From cfd0b0ea83fca939fa267f0c4ac45b4ee3157d22 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 31 May 2022 14:30:27 +0800 Subject: [PATCH 001/676] server: fix race during raft data migration (#12701) close tikv/tikv#12698 Close engine before cleaning up its data during raft engine migration. Signed-off-by: tabokie --- components/raft_log_engine/src/engine.rs | 2 -- components/server/src/server.rs | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 145a122802d..9707bdb28b7 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -444,8 +444,6 @@ impl RaftEngine for RaftLogEngine { None } - fn stop(&self) {} - fn dump_stats(&self) -> Result { // Raft engine won't dump anything. Ok("".to_owned()) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index b9f3c7bd6f2..4344a706fde 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1414,6 +1414,8 @@ impl ConfiguredRaftEngine for RocksEngine { RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) .expect("failed to open raft engine for migration"); dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /*threads*/); + raft_engine.stop(); + drop(raft_engine); raft_data_state_machine.after_dump_data(); } raftdb @@ -1463,6 +1465,8 @@ impl ConfiguredRaftEngine for RaftLogEngine { .expect("failed to open raftdb for migration"); let raftdb = RocksEngine::from_db(Arc::new(raftdb)); dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /*threads*/); + raftdb.stop(); + drop(raftdb); raft_data_state_machine.after_dump_data(); } raft_engine From 6a67b08d7fc6fa0623009e5bcb756333c2a655af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 1 Jun 2022 00:42:27 +0800 Subject: [PATCH 002/676] log backup: disable test log by default (#12710) 
close tikv/tikv#12709 Signed-off-by: Yu Juncen --- components/backup-stream/src/router.rs | 5 ----- components/backup-stream/tests/mod.rs | 6 ------ 2 files changed, 11 deletions(-) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 294ec2c0c98..8db9244d916 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -1328,7 +1328,6 @@ mod tests { #[tokio::test] async fn test_basic_file() -> Result<()> { - test_util::init_log_for_test(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); tokio::fs::create_dir_all(&tmp).await?; let (tx, rx) = dummy_scheduler(); @@ -1485,7 +1484,6 @@ mod tests { #[tokio::test] async fn test_flush_with_error() -> Result<()> { - test_util::init_log_for_test(); let (tx, _rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); let router = Arc::new(RouterInner::new( @@ -1517,7 +1515,6 @@ mod tests { #[tokio::test] async fn test_empty_resolved_ts() { - test_util::init_log_for_test(); let (tx, _rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); @@ -1544,7 +1541,6 @@ mod tests { #[tokio::test] async fn test_flush_with_pausing_self() -> Result<()> { - test_util::init_log_for_test(); let (tx, rx) = dummy_scheduler(); let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); let router = Arc::new(RouterInner::new( @@ -1585,7 +1581,6 @@ mod tests { #[test] fn test_format_datetime() { - test_util::init_log_for_test(); let s = TempFileKey::format_date_time(431656320867237891); let s = s.to_string(); assert_eq!(s, "20220307"); diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 064b954d7bf..85bb633955b 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -507,7 +507,6 
@@ mod test { #[test] fn basic() { - // test_util::init_log_for_test(); let mut suite = super::Suite::new("basic", 4); run_async_test(async { @@ -528,7 +527,6 @@ mod test { #[test] fn with_split() { - // test_util::init_log_for_test(); let mut suite = super::Suite::new("with_split", 4); run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; @@ -548,7 +546,6 @@ mod test { #[test] /// This case tests whether the backup can continue when the leader failes. fn leader_down() { - // test_util::init_log_for_test(); let mut suite = super::Suite::new("leader_down", 4); suite.must_register_task(1, "test_leader_down"); suite.sync(); @@ -569,7 +566,6 @@ mod test { /// This case tests whehter the checkpoint ts (next backup ts) can be advanced correctly /// when async commit is enabled. fn async_commit() { - // test_util::init_log_for_test(); let mut suite = super::Suite::new("async_commit", 3); run_async_test(async { suite.must_register_task(1, "test_async_commit"); @@ -600,7 +596,6 @@ mod test { #[test] fn fatal_error() { - // test_util::init_log_for_test(); let mut suite = super::Suite::new("fatal_error", 3); suite.must_register_task(1, "test_fatal_error"); suite.sync(); @@ -657,7 +652,6 @@ mod test { #[test] fn inflight_messages() { - test_util::init_log_for_test(); // We should remove the failpoints when paniked or we may get stucked. defer! 
{{ fail::remove("delay_on_start_observe"); From 6d883b37a9a3299a62b649c97de27f3cd10e46c5 Mon Sep 17 00:00:00 2001 From: Xiaoguang Sun Date: Wed, 1 Jun 2022 13:26:27 +0800 Subject: [PATCH 003/676] Add link to website of TiKV's creator (#12703) close tikv/tikv#12702 Signed-off-by: Xiaoguang Sun --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index b9a2d9d9519..65bad6835ee 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Coverage Status](https://codecov.io/gh/tikv/tikv/branch/master/graph/badge.svg)](https://codecov.io/gh/tikv/tikv) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/2574/badge)](https://bestpractices.coreinfrastructure.org/projects/2574) -TiKV is an open-source, distributed, and transactional key-value database. Unlike other traditional NoSQL systems, TiKV not only provides classical key-value APIs, but also transactional APIs with ACID compliance. Built in Rust and powered by Raft, TiKV was originally created to complement [TiDB](https://github.com/pingcap/tidb), a distributed HTAP database compatible with the MySQL protocol. +TiKV is an open-source, distributed, and transactional key-value database. Unlike other traditional NoSQL systems, TiKV not only provides classical key-value APIs, but also transactional APIs with ACID compliance. Built in Rust and powered by Raft, TiKV was originally created by [PingCAP](https://en.pingcap.com) to complement [TiDB](https://github.com/pingcap/tidb), a distributed HTAP database compatible with the MySQL protocol. The design of TiKV ('Ti' stands for titanium) is inspired by some great distributed systems from Google, such as BigTable, Spanner, and Percolator, and some of the latest achievements in academia in recent years, such as the Raft consensus algorithm. @@ -134,10 +134,6 @@ See [CONTRIBUTING.md](./CONTRIBUTING.md). 
## Client drivers -Currently, the interfaces to TiKV are the [TiDB Go client](https://github.com/pingcap/tidb/tree/master/store/tikv) and the [TiSpark Java client](https://github.com/pingcap/tispark/tree/master/tikv-client/src/main/java/com/pingcap/tikv). - -These are the clients for TiKV: - - [Go](https://github.com/tikv/client-go) (The most stable and widely used) - [Java](https://github.com/tikv/client-java) - [Rust](https://github.com/tikv/client-rust) From f7edbcf610cd6bb5c9040317edd3260188a6e87d Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 31 May 2022 23:10:27 -0700 Subject: [PATCH 004/676] raftstore: use approximate size to generate bucket unless split region check is needed and scan is used for it. (#12678) ref tikv/tikv#12597 When bucket is enabled, using CheckPolicy::Scan would lead to large amount of read IO after tikv restart. Before this PR, the Scan is used unless the region size reaches 1.5G, which is very rare for a 96 ~ 256MB's region-split-size. After this change, generating bucket won't introduce new scan unless the scan is necessary for splitting region. This can significantly reduce the read IO. Also refine the logic for the fix of 12597. 
Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/config.rs | 3 + .../src/coprocessor/split_check/half.rs | 13 ++- .../src/coprocessor/split_check/keys.rs | 8 +- .../src/coprocessor/split_check/size.rs | 102 ++++++++++++++---- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 6 files changed, 101 insertions(+), 27 deletions(-) diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 0f553c879a2..1609cc3001a 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -50,6 +50,8 @@ pub struct Config { pub region_bucket_size: ReadableSize, // region size threshold for using approximate size instead of scan pub region_size_threshold_for_approximate: ReadableSize, + #[online_config(skip)] + pub prefer_approximate_bucket: bool, // ratio of region_bucket_size. (0, 0.5) // The region_bucket_merge_size_ratio * region_bucket_size is threshold to merge with its left neighbor bucket pub region_bucket_merge_size_ratio: f64, @@ -91,6 +93,7 @@ impl Default for Config { region_bucket_size: DEFAULT_BUCKET_SIZE, region_size_threshold_for_approximate: DEFAULT_BUCKET_SIZE * BATCH_SPLIT_LIMIT / 2 * 3, region_bucket_merge_size_ratio: DEFAULT_REGION_BUCKET_MERGE_SIZE_RATIO, + prefer_approximate_bucket: true, } } } diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 87ee861c95c..a52b7a59d60 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -218,13 +218,13 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { - region_max_size: Some(ReadableSize(BUCKET_NUMBER_LIMIT as u64)), + region_split_size: ReadableSize(130_u64), enable_region_bucket: true, region_bucket_size: ReadableSize(20_u64), // so that 
each key below will form a bucket ..Default::default() }; - let mut runnable = - SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); + let cop_host = CoprocessorHost::new(tx.clone(), cfg); + let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); let key_gen = |k: &[u8], i: u64, mvcc: bool| { if !mvcc { @@ -276,6 +276,9 @@ mod tests { Some(vec![bucket_range]), )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); + must_generate_buckets(&rx, &exp_bucket_keys); // testing split bucket with end key "" @@ -299,6 +302,8 @@ mod tests { CheckPolicy::Scan, Some(vec![bucket_range]), )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); must_generate_buckets(&rx, &exp_bucket_keys); @@ -345,7 +350,7 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { - region_max_size: Some(ReadableSize(BUCKET_NUMBER_LIMIT as u64)), + region_split_size: ReadableSize(130_u64), enable_region_bucket: true, region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index bc9c847225a..22a81e54f31 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -182,12 +182,16 @@ where } REGION_KEYS_HISTOGRAM.observe(region_keys as f64); - if region_keys >= host.cfg.region_max_keys() { + // if bucket checker using scan is added, to utilize the scan, + // add keys checker as well for free + // It has the assumption that the size's checker is before the keys's check in the host + let need_split_region = region_keys >= host.cfg.region_max_keys(); + if need_split_region { info!( "approximate keys over threshold, need 
to do split check"; "region_id" => region.get_id(), "keys" => region_keys, - "threshold" => host.cfg.region_max_keys, + "threshold" => host.cfg.region_max_keys(), ); // Need to check keys. host.add_checker(Box::new(Checker::new( diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 59603782f5c..30198cd2337 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -51,13 +51,6 @@ where E: KvEngine, { fn on_kv(&mut self, _: &mut ObserverContext<'_>, entry: &KeyEntry) -> bool { - // If there's no need to check region split, skip it. - // Otherwise, the region whose keys > max region keys will not be splitted when batch_split_limit is zero, - // because eventually "over_limit && self.current_size + self.split_size >= self.max_size" - // will return true. - if self.batch_split_limit == 0 { - return false; - } let size = entry.entry_size() as u64; self.current_size += size; @@ -189,18 +182,19 @@ where ); } + let need_bucket_checker = + host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); - if region_size >= host.cfg.region_max_size().0 - || host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0 - { - let batch_split_limit = if region_size >= host.cfg.region_max_size().0 { - host.cfg.batch_split_limit - } else { - // no region split check needed - 0 - }; + + let need_split_region = region_size >= host.cfg.region_max_size().0; + if need_split_region || need_bucket_checker { // when it's a large region use approximate way to produce split keys - if region_size >= host.cfg.region_size_threshold_for_approximate.0 { + if need_split_region { + if region_size >= host.cfg.region_size_threshold_for_approximate.0 { + policy = CheckPolicy::Approximate; + } + } else if host.cfg.prefer_approximate_bucket { + // when the 
check is only for bucket, use approximate anyway policy = CheckPolicy::Approximate; } @@ -210,13 +204,12 @@ where "size" => region_size, "threshold" => host.cfg.region_max_size().0, "policy" => ?policy, - "split_check" => batch_split_limit > 0, ); // Need to check size. host.add_checker(Box::new(Checker::new( host.cfg.region_max_size().0, host.cfg.region_split_size.0, - batch_split_limit, + host.cfg.batch_split_limit, policy, ))); } else { @@ -619,8 +612,8 @@ pub mod tests { keys::data_key(Key::from_raw(bytes).append_ts(ts).as_encoded()) } }; - let mut runnable = - SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); + let cop_host = CoprocessorHost::new(tx.clone(), cfg); + let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); for i in 0..2000 { // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has about 210 keys // if mvcc, kv size is about 18*2 = 36, expect each bucket has about 80 keys @@ -638,6 +631,9 @@ pub mod tests { None, )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); + if !mvcc { must_generate_buckets_approximate(&rx, None, 15000, 45000, mvcc); } else { @@ -664,6 +660,8 @@ pub mod tests { CheckPolicy::Approximate, Some(vec![BucketRange(start.clone(), end.clone())]), )); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); if !mvcc { must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 150, 450, mvcc); @@ -696,6 +694,68 @@ pub mod tests { } } + #[test] + fn test_check_policy_for_bucket_generation() { + let path = Builder::new() + .prefix("test_check_policy_for_bucket_generation") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + let db_opts = DBOptions::default(); + let cfs_with_range_prop: HashSet<_> = LARGE_CFS.iter().cloned().collect(); + 
let mut cf_opt = ColumnFamilyOptions::new(); + cf_opt.set_no_range_properties(true); + cf_opt.set_disable_auto_compactions(true); + + let cfs_opts = ALL_CFS + .iter() + .map(|cf| { + if cfs_with_range_prop.contains(cf) { + let mut opt = ColumnFamilyOptions::new(); + opt.set_disable_auto_compactions(true); + CFOptions::new(cf, opt) + } else { + CFOptions::new(cf, cf_opt.clone()) + } + }) + .collect(); + let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let (tx, _rx) = mpsc::sync_channel(100); + let mut cfg = Config { + region_max_size: Some(ReadableSize(50000)), + region_split_size: ReadableSize(50000), + region_max_keys: Some(1000000), + region_split_keys: Some(1000000), + batch_split_limit: 5, + enable_region_bucket: true, + region_bucket_size: ReadableSize(1), // minimal bucket size + region_size_threshold_for_approximate: ReadableSize(500000000), + // follow split region's check policy, not force to use approximate + prefer_approximate_bucket: false, + ..Default::default() + }; + let mut region = Region::default(); + region.set_id(1); + region.set_start_key(vec![]); + region.set_end_key(vec![]); + region.mut_peers().push(Peer::default()); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(5); + for i in 0..20 { + let s = keys::data_key(format!("{:04}00", i).as_bytes()); + engine.put_cf(CF_WRITE, &s, &s).unwrap(); + } + + let cop_host = CoprocessorHost::new(tx.clone(), cfg.clone()); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Scan); + + cfg.prefer_approximate_bucket = true; + let cop_host = CoprocessorHost::new(tx, cfg); + let host = cop_host.new_split_checker_host(®ion, &engine, true, CheckPolicy::Scan); + assert_eq!(host.policy(), CheckPolicy::Approximate); + } + #[test] fn test_cf_lock_without_range_prop() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); diff --git 
a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 21ca6747378..aa0559cbeb2 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -693,6 +693,7 @@ fn test_serde_custom_tikv_config() { enable_region_bucket: true, region_bucket_size: ReadableSize::mb(1), region_size_threshold_for_approximate: ReadableSize::mb(3), + prefer_approximate_bucket: false, region_bucket_merge_size_ratio: 0.4, }; let mut cert_allowed_cn = HashSet::default(); diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 61f0cb87e20..17c7635e846 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -232,6 +232,7 @@ enable-region-bucket = true region-bucket-size = "1MB" region-size-threshold-for-approximate = "3MB" region-bucket-merge-size-ratio = 0.4 +prefer-approximate-bucket = false [rocksdb] wal-recovery-mode = "absolute-consistency" From 8a2245455d4b6d117d203c599444f5455762cff6 Mon Sep 17 00:00:00 2001 From: qupeng Date: Wed, 1 Jun 2022 17:02:28 +0800 Subject: [PATCH 005/676] cdc: skip prewrite without value (#12612) ref tikv/tikv#12717 cdc: skip prewrite without value Signed-off-by: qupeng --- components/cdc/src/delegate.rs | 202 +++++++++--------- components/cdc/tests/integrations/test_cdc.rs | 38 +++- 2 files changed, 135 insertions(+), 105 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index dc9f36e92ec..2fb971a4024 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -10,7 +10,7 @@ use std::{ }; use api_version::{ApiV2, KeyMode, KvFormat}; -use collections::HashMap; +use collections::{HashMap, HashMapEntry}; use crossbeam::atomic::AtomicCell; use kvproto::{ cdcpb::{ @@ -481,83 +481,63 @@ impl Delegate { let mut rows = vec![Vec::with_capacity(entries_len)]; let mut current_rows_size: usize = 0; for entry in entries { + let (mut row, mut _has_value) = 
(EventRow::default(), false); + let row_size: usize; match entry { Some(KvEntry::RawKvEntry(kv_pair)) => { - let mut row = EventRow::default(); decode_rawkv(kv_pair.0, kv_pair.1, &mut row)?; - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; + row_size = row.key.len() + row.value.len(); + } + Some(KvEntry::TxnEntry(TxnEntry::Prewrite { + default, + lock, + old_value, + })) => { + let l = Lock::parse(&lock.1).unwrap(); + if decode_lock(lock.0, l, &mut row, &mut _has_value) { + continue; } - current_rows_size += row_size; - rows.last_mut().unwrap().push(row); + decode_default(default.1, &mut row, &mut _has_value); + row.old_value = old_value.finalized().unwrap_or_default(); + row_size = row.key.len() + row.value.len(); } - Some(KvEntry::TxnEntry(txn_entry)) => { - match txn_entry { - TxnEntry::Prewrite { - default, - lock, - old_value, - } => { - let mut row = EventRow::default(); - let skip = decode_lock(lock.0, Lock::parse(&lock.1).unwrap(), &mut row); - if skip { - continue; - } - decode_default(default.1, &mut row); - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; - } - current_rows_size += row_size; - row.old_value = old_value.finalized().unwrap_or_default(); - rows.last_mut().unwrap().push(row); - } - TxnEntry::Commit { - default, - write, - old_value, - } => { - let mut row = EventRow::default(); - let skip = decode_write(write.0, &write.1, &mut row, false); - if skip { - continue; - } - decode_default(default.1, &mut row); - - // This type means the row is self-contained, it has, - // 1. start_ts - // 2. commit_ts - // 3. key - // 4. 
value - if row.get_type() == EventLogType::Rollback { - // We dont need to send rollbacks to downstream, - // because downstream does not needs rollback to clean - // prewrite as it drops all previous stashed data. - continue; - } - set_event_row_type(&mut row, EventLogType::Committed); - row.old_value = old_value.finalized().unwrap_or_default(); - let row_size = row.key.len() + row.value.len(); - if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { - rows.push(Vec::with_capacity(entries_len)); - current_rows_size = 0; - } - current_rows_size += row_size; - rows.last_mut().unwrap().push(row); - } + Some(KvEntry::TxnEntry(TxnEntry::Commit { + default, + write, + old_value, + })) => { + if decode_write(write.0, &write.1, &mut row, &mut _has_value, false) { + continue; + } + decode_default(default.1, &mut row, &mut _has_value); + + // This type means the row is self-contained, it has, + // 1. start_ts + // 2. commit_ts + // 3. key + // 4. value + if row.get_type() == EventLogType::Rollback { + // We dont need to send rollbacks to downstream, + // because downstream does not needs rollback to clean + // prewrite as it drops all previous stashed data. + continue; } + set_event_row_type(&mut row, EventLogType::Committed); + row.old_value = old_value.finalized().unwrap_or_default(); + row_size = row.key.len() + row.value.len(); } None => { - let mut row = EventRow::default(); - // This type means scan has finished. set_event_row_type(&mut row, EventLogType::Initialized); - rows.last_mut().unwrap().push(row); + row_size = 0; } } + if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { + rows.push(Vec::with_capacity(entries_len)); + current_rows_size = 0; + } + current_rows_size += row_size; + rows.last_mut().unwrap().push(row); } let rows = rows @@ -596,7 +576,8 @@ impl Delegate { Ok(()) }; - let mut txn_rows: HashMap, EventRow> = HashMap::default(); + // map[key] -> (event, has_value). 
+ let mut txn_rows: HashMap, (EventRow, bool)> = HashMap::default(); let mut raw_rows: Vec = Vec::new(); for mut req in requests { match req.get_cmd_type() { @@ -620,17 +601,20 @@ impl Delegate { } } - if !txn_rows.is_empty() { - let mut rows = Vec::with_capacity(txn_rows.len()); - for (_, v) in txn_rows { - rows.push(v); + let mut rows = Vec::with_capacity(txn_rows.len()); + for (_, (v, has_value)) in txn_rows { + if v.r_type == EventLogType::Prewrite && v.op_type == EventRowOpType::Put && !has_value + { + // It's possible that a prewrite command only contains lock but without + // default. It's not documented by classic Percolator but introduced with + // Large-Transaction. Those prewrites are not complete, we must skip them. + continue; } - self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; + rows.push(v); } + self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - if !raw_rows.is_empty() { - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv)?; - } + self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv)?; Ok(()) } @@ -641,6 +625,9 @@ impl Delegate { index: u64, kv_api: ChangeDataRequestKvApi, ) -> Result<()> { + if entries.is_empty() { + return Ok(()); + } let event_entries = EventEntries { entries: entries.into(), ..Default::default() @@ -676,7 +663,7 @@ impl Delegate { &mut self, put: PutRequest, is_one_pc: bool, - txn_rows: &mut HashMap, EventRow>, + txn_rows: &mut HashMap, (EventRow, bool)>, raw_rows: &mut Vec, read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { @@ -699,13 +686,13 @@ impl Delegate { &mut self, mut put: PutRequest, is_one_pc: bool, - rows: &mut HashMap, EventRow>, + rows: &mut HashMap, (EventRow, bool)>, mut read_old_value: impl FnMut(&mut EventRow, TimeStamp) -> Result<()>, ) -> Result<()> { match put.cf.as_str() { "write" => { - let mut row = EventRow::default(); - if decode_write(put.take_key(), put.get_value(), &mut row, true) { + let (mut row, 
mut has_value) = (EventRow::default(), false); + if decode_write(put.take_key(), &put.value, &mut row, &mut has_value, true) { return Ok(()); } @@ -734,36 +721,29 @@ impl Delegate { ); } - match rows.get_mut(&row.key) { - Some(row_with_value) => { - row.value = mem::take(&mut row_with_value.value); - *row_with_value = row; + match rows.entry(row.key.clone()) { + HashMapEntry::Occupied(o) => { + let o = o.into_mut(); + mem::swap(&mut o.0.value, &mut row.value); + o.0 = row; } - None => { - rows.insert(row.key.clone(), row); + HashMapEntry::Vacant(v) => { + v.insert((row, has_value)); } } } "lock" => { - let mut row = EventRow::default(); + let (mut row, mut has_value) = (EventRow::default(), false); let lock = Lock::parse(put.get_value()).unwrap(); let for_update_ts = lock.for_update_ts; - if decode_lock(put.take_key(), lock, &mut row) { + if decode_lock(put.take_key(), lock, &mut row, &mut has_value) { return Ok(()); } let read_old_ts = std::cmp::max(for_update_ts, row.start_ts.into()); read_old_value(&mut row, read_old_ts)?; - let occupied = rows.entry(row.key.clone()).or_default(); - if !occupied.value.is_empty() { - assert!(row.value.is_empty()); - let mut value = vec![]; - mem::swap(&mut occupied.value, &mut value); - row.value = value; - } - // In order to compute resolved ts, - // we must track inflight txns. + // In order to compute resolved ts, we must track inflight txns. 
match self.resolver { Some(ref mut resolver) => { resolver.track_lock(row.start_ts.into(), row.key.clone(), None) @@ -780,16 +760,20 @@ impl Delegate { } } - *occupied = row; + let occupied = rows.entry(row.key.clone()).or_default(); + if occupied.1 { + assert!(!has_value); + has_value = true; + mem::swap(&mut occupied.0.value, &mut row.value); + } + *occupied = (row, has_value); } "" | "default" => { let key = Key::from_encoded(put.take_key()).truncate_ts().unwrap(); let row = rows.entry(key.into_raw().unwrap()).or_default(); - decode_default(put.take_value(), row); - } - other => { - panic!("invalid cf {}", other); + decode_default(put.take_value(), &mut row.0, &mut row.1); } + other => panic!("invalid cf {}", other), } Ok(()) } @@ -909,7 +893,13 @@ fn make_overlapped_rollback(key: Key, row: &mut EventRow) { /// Decodes the write record and store its information in `row`. This may be called both when /// doing incremental scan of observing apply events. There's different behavior for the two /// case, distinguished by the `is_apply` parameter. 
-fn decode_write(key: Vec, value: &[u8], row: &mut EventRow, is_apply: bool) -> bool { +fn decode_write( + key: Vec, + value: &[u8], + row: &mut EventRow, + has_value: &mut bool, + is_apply: bool, +) -> bool { let key = Key::from_encoded(key); let write = WriteRef::parse(value).unwrap().to_owned(); @@ -946,12 +936,13 @@ fn decode_write(key: Vec, value: &[u8], row: &mut EventRow, is_apply: bool) set_event_row_type(row, r_type); if let Some(value) = write.short_value { row.value = value; + *has_value = true; } false } -fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow) -> bool { +fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow, has_value: &mut bool) -> bool { let op_type = match lock.lock_type { LockType::Put => EventRowOpType::Put, LockType::Delete => EventRowOpType::Delete, @@ -971,6 +962,7 @@ fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow) -> bool { set_event_row_type(row, EventLogType::Prewrite); if let Some(value) = lock.short_value { row.value = value; + *has_value = true; } false @@ -998,10 +990,12 @@ fn decode_rawkv(key: Vec, value: Vec, row: &mut EventRow) -> Result<()> Ok(()) } -fn decode_default(value: Vec, row: &mut EventRow) { +fn decode_default(value: Vec, row: &mut EventRow, has_value: &mut bool) { if !value.is_empty() { row.value = value.to_vec(); } + // If default CF is given in a command it means the command always has a value. 
+ *has_value = true; } #[cfg(test)] diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 06b16de1f20..5f9f9bf7209 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -935,7 +935,7 @@ fn test_cdc_batch_size_limit_impl() { assert_eq!(events.len(), 1, "{:?}", events); match events.pop().unwrap().event.unwrap() { Event_oneof_event::Entries(es) => { - assert!(es.entries.len() == 2); + assert_eq!(es.entries.len(), 2); let e = &es.entries[0]; assert_eq!(e.get_type(), EventLogType::Prewrite, "{:?}", e.get_type()); assert_eq!(e.key, b"xk3", "{:?}", e.key); @@ -2318,3 +2318,39 @@ fn test_resolved_ts_with_learners() { } panic!("resolved timestamp should be advanced correctly"); } + +#[test] +fn test_prewrite_without_value() { + let cluster = new_server_cluster(0, 2); + cluster.pd_client.disable_default_operator(); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + let rid = suite.cluster.get_region(&[]).id; + let ctx = suite.get_context(rid); + let client = suite.get_tikv_client(rid).clone(); + let large_value = vec![b'x'; 2 * txn_types::SHORT_VALUE_MAX_LEN]; + + // Perform a pessimistic prewrite with a large value. + let mut muts = vec![Mutation::default()]; + muts[0].set_op(Op::Put); + muts[0].key = b"key".to_vec(); + muts[0].value = large_value.clone(); + try_kv_prewrite_pessimistic(&client, ctx.clone(), muts, b"key".to_vec(), 10); + + let req = suite.new_changedata_request(rid); + let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(rid)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + + // The prewrite can be retrieved from incremental scan. + let event = receive_event(false); + assert_eq!( + event.get_events()[0].get_entries().entries[0].value, + large_value + ); + + // check_txn_status will put the lock again, but without value. 
+ must_check_txn_status(&client, ctx.clone(), b"key", 10, 12, 12); + must_kv_commit(&client, ctx, vec![b"key".to_vec()], 10, 14, 14); + // The lock without value shouldn't be retrieved. + let event = receive_event(false); + assert_eq!(event.get_events()[0].get_entries().entries[0].commit_ts, 14); +} From 4fca4e86f37f630a31b04f33ca21a4f8a42872f2 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 1 Jun 2022 02:28:27 -0700 Subject: [PATCH 006/676] *: optimize debug build (#12708) close tikv/tikv#12707 This PR optimize debug build by disabling all debuginfo excepts tests itself. So that the generated artifacts will be smaller and also speed up compile time a little. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- CONTRIBUTING.md | 2 +- Cargo.toml | 20 ++++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 85fcea3193e..faccf2818c1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -92,7 +92,7 @@ Please follow this style to make TiKV easy to review, maintain, and develop. ### Build issues -To reduce compilation time, TiKV builds do not include full debugging information by default — `release` and `bench` builds include no debuginfo; `dev` and `test` builds include full debug. To decrease compilation time with another ~5% (around 10 seconds for a 4 min build time), change the `debug = true` to `debug = 1` in the Cargo.toml file to only include line numbers for `dev` and `test`. Another way to change debuginfo is to precede build commands with `RUSTFLAGS=-Cdebuginfo=1` (for line numbers), or `RUSTFLAGS=-Cdebuginfo=2` (for full debuginfo). For example, +To reduce compilation time and disk usage, TiKV builds do not include full debugging information by default — only tests package will have line debug info enabled. To change debuginfo, just precede build commands with `RUSTFLAGS=-Cdebuginfo=1` (for line numbers), or `RUSTFLAGS=-Cdebuginfo=2` (for full debuginfo). 
For example, ```bash RUSTFLAGS=-Cdebuginfo=1 make dev diff --git a/Cargo.toml b/Cargo.toml index 61759a4b68a..477716d8893 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -277,9 +277,25 @@ members = [ ] default-members = ["cmd/tikv-server", "cmd/tikv-ctl"] +[profile.dev.package.grpcio-sys] +debug = false +opt-level = 1 + +[profile.dev.package.librocksdb_sys] +debug = false +opt-level = 1 + +[profile.dev.package.libtitan_sys] +debug = false +opt-level = 1 + +[profile.dev.package.tests] +debug = 1 +opt-level = 1 + [profile.dev] opt-level = 0 -debug = true +debug = 0 codegen-units = 4 lto = false incremental = true @@ -305,7 +321,7 @@ codegen-units = 4 [profile.test] opt-level = 0 -debug = true +debug = 0 codegen-units = 16 lto = false incremental = true From 4a8a3c5d5ec3be7176b4d5708c6fbec7c60b6108 Mon Sep 17 00:00:00 2001 From: qupeng Date: Wed, 1 Jun 2022 17:56:29 +0800 Subject: [PATCH 007/676] cdc: make tso worker threads configuable (#12576) ref tikv/tikv#12592 cdc: make tso worker threads configuable Signed-off-by: qupeng --- components/cdc/src/endpoint.rs | 2 +- components/cdc/src/initializer.rs | 2 +- components/resolved_ts/src/advance.rs | 39 +++++++++++++--------- src/config.rs | 8 +++++ src/server/service/kv.rs | 9 ++++- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 7 files changed, 43 insertions(+), 19 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 0a0a7d9fcd5..3adaa8aca65 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -377,7 +377,7 @@ impl, E: KvEngine> Endpoint { .unwrap(); let tso_worker = Builder::new_multi_thread() .thread_name("tso") - .worker_threads(1) + .worker_threads(config.tso_worker_threads) .enable_time() .build() .unwrap(); diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 9a06448afba..6b80a8c21a0 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs 
@@ -522,7 +522,7 @@ impl Initializer { }); let valid_count = total_count - filtered_count; - let use_ts_filter = valid_count as f64 / total_count as f64 <= self.ts_filter_ratio; + let use_ts_filter = valid_count as f64 <= total_count as f64 * self.ts_filter_ratio; info!("cdc incremental scan uses ts filter: {}", use_ts_filter; "region_id" => self.region_id, "hint_min_ts" => hint_min_ts, diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index ddc52443cec..c438c4c53fa 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -14,7 +14,7 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use fail::fail_point; use futures::{compat::Future01CompatExt, future::select_all, FutureExt, TryFutureExt}; -use grpcio::{ChannelBuilder, Environment}; +use grpcio::{ChannelBuilder, Environment, Error as GrpcError, RpcStatusCode}; use kvproto::{ kvrpcpb::{CheckLeaderRequest, LeaderInfo}, metapb::{Peer, PeerRole}, @@ -254,17 +254,26 @@ pub async fn region_resolved_ts_store( .observe(elapsed.as_secs_f64()); }); - let rpc = client - .check_leader_async(&req) - .map_err(|e| (to_store, true, format!("[rpc create failed]{}", e)))?; + let rpc = match client.check_leader_async(&req) { + Ok(rpc) => rpc, + Err(GrpcError::RpcFailure(status)) + if status.code() == RpcStatusCode::UNIMPLEMENTED => + { + // Some stores like TiFlash don't implement it. + return Ok((to_store, vec![])); + } + Err(e) => return Err((to_store, true, format!("[rpc create failed]{}", e))), + }; + PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); - let resp = tokio::time::timeout(timeout, rpc) + let regions = tokio::time::timeout(timeout, rpc) .map_err(|e| (to_store, true, format!("[timeout] {}", e))) .await? 
- .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; - Ok((to_store, resp)) + .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))? + .take_regions(); + Ok((to_store, regions)) } .boxed() }) @@ -281,17 +290,15 @@ pub async fn region_resolved_ts_store( let (res, _, remains) = select_all(stores).await; stores = remains; match res { - Ok((to_store, resp)) => { - for region_id in resp.regions { - if let Some(r) = region_map.get(®ion_id) { - let resps = resp_map.entry(region_id).or_default(); - resps.push(to_store); - if region_has_quorum(r, resps) { - valid_regions.insert(region_id); - } + Ok((to_store, regions)) => regions.into_iter().for_each(|region_id| { + if let Some(r) = region_map.get(®ion_id) { + let resps = resp_map.entry(region_id).or_default(); + resps.push(to_store); + if region_has_quorum(r, resps) { + valid_regions.insert(region_id); } } - } + }), Err((to_store, reconnect, err)) => { info!("check leader failed"; "error" => ?err, "to_store" => to_store); if reconnect { diff --git a/src/config.rs b/src/config.rs index b36c14e5ee4..3908cdc9eac 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2394,6 +2394,13 @@ pub struct CdcConfig { /// `TsFilter` will be enabled if `incremental/total <= incremental_scan_ts_filter_ratio`. /// Set `incremental_scan_ts_filter_ratio` to 0 will disable it. pub incremental_scan_ts_filter_ratio: f64, + + /// Count of threads to confirm Region leadership in TiKV instances, 1 by default. + /// Please consider to increase it if count of regions on one TiKV instance is + /// greater than 20k. + #[online_config(skip)] + pub tso_worker_threads: usize, + pub sink_memory_quota: ReadableSize, pub old_value_cache_memory_quota: ReadableSize, // Deprecated! preserved for compatibility check. @@ -2416,6 +2423,7 @@ impl Default for CdcConfig { // is more than 500MB/s, so 128MB/s is enough. 
incremental_scan_speed_limit: ReadableSize::mb(128), incremental_scan_ts_filter_ratio: 0.2, + tso_worker_threads: 1, // 512MB memory for CDC sink. sink_memory_quota: ReadableSize::mb(512), // 512MB memory for old value cache. diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6d578334dff..c4960b0629a 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1108,7 +1108,14 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let mut resp = CheckLeaderResponse::default(); resp.set_ts(ts); resp.set_regions(regions); - sink.success(resp).await?; + if let Err(e) = sink.success(resp).await { + // CheckLeader has a built-in fast-success mechanism, so `RemoteStopped` + // can be treated as a general situation. + if let GrpcError::RemoteStopped = e { + return ServerResult::Ok(()); + } + return Err(Error::from(e)); + } ServerResult::Ok(()) } .map_err(move |e| { diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index aa0559cbeb2..cbd695191d8 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -760,6 +760,7 @@ fn test_serde_custom_tikv_config() { incremental_scan_concurrency: 4, incremental_scan_speed_limit: ReadableSize(7), incremental_scan_ts_filter_ratio: 0.7, + tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), }; diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 17c7635e846..d02aebc4df3 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -639,6 +639,7 @@ incremental-scan-threads = 3 incremental-scan-concurrency = 4 incremental-scan-speed-limit = 7 incremental-scan-ts-filter-ratio = 0.7 +tso-worker-threads = 2 old-value-cache-memory-quota = "14MB" sink-memory-quota = "7MB" From 761d591826c75eddc05170780e0afe59718f6b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= 
<36239017+YuJuncen@users.noreply.github.com> Date: Wed, 1 Jun 2022 18:36:28 +0800 Subject: [PATCH 008/676] security, server: fix running local test (#12712) close tikv/tikv#12711 Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- components/backup-stream/tests/mod.rs | 22 ++++++---------------- components/security/Cargo.toml | 5 ++++- components/security/src/lib.rs | 2 ++ components/server/Cargo.toml | 2 +- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 85bb633955b..339dd07f773 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -628,26 +628,16 @@ mod test { .global_progress_of_task("test_fatal_error"), ) .unwrap(); - assert_eq!(safepoints.len(), 4, "{:?}", safepoints); + assert!( - safepoints - .iter() - .take(3) - // They are choosing the lock safepoint, it must greater than the global checkpoint. - .all(|sp| { sp.safepoint.into_inner() >= checkpoint }), + safepoints.iter().any(|sp| { + sp.serivce.contains(&format!("{}", victim)) + && sp.ttl >= Duration::from_secs(60 * 60 * 24) + && sp.safepoint.into_inner() == checkpoint + }), "{:?}", safepoints ); - - let sp = &safepoints[3]; - assert!(sp.serivce.contains(&format!("{}", victim)), "{:?}", sp); - assert!(sp.ttl >= Duration::from_secs(60 * 60 * 24), "{:?}", sp); - assert!( - sp.safepoint.into_inner() == checkpoint, - "{:?} vs {}", - sp, - checkpoint - ); } #[test] diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 2b498bc0965..8257d04f51f 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" edition = "2018" publish = false +[features] +tonic = ["dep:tonic"] + [dependencies] collections = { path = "../collections" } encryption = { path = "../encryption", default-features = false } @@ -12,7 +15,7 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tikv_util = { path = 
"../tikv_util", default-features = false } -tonic = "0.5" +tonic = { version = "0.5", features = ["tls"], optional = true } [dev-dependencies] tempfile = "3.0" diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index ec6cf0e6df2..ed5ff0d1fa4 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -18,6 +18,7 @@ use grpcio::{ RpcContext, RpcStatus, RpcStatusCode, ServerBuilder, ServerChecker, ServerCredentialsBuilder, ServerCredentialsFetcher, }; +#[cfg(feature = "tonic")] use tonic::transport::{channel::ClientTlsConfig, Certificate, Identity}; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)] @@ -122,6 +123,7 @@ impl SecurityManager { }) } + #[cfg(feature = "tonic")] /// Make a tonic tls config via the config. pub fn tonic_tls_config(&self) -> Option { let (ca, cert, key) = self.cfg.load_certs().unwrap_or_default(); diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index f5a35c9bb2c..b53fde02cef 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -70,7 +70,7 @@ raftstore = { path = "../raftstore", default-features = false } rand = "0.8" resolved_ts = { path = "../../components/resolved_ts", default-features = false } resource_metering = { path = "../resource_metering" } -security = { path = "../security", default-features = false } +security = { path = "../security", default-features = false, features = ["tonic"] } serde_json = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } From a5987f34ade71caa34cc340c953bd67de5901ace Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Wed, 1 Jun 2022 19:56:27 +0800 Subject: [PATCH 009/676] util: record schedule wait duration of yatp pool (#12441) close tikv/tikv#12359 This commit makes use of the `schedule_time` in yatp to calculate 
the wait duration of each wake. The wait duration panel is added to the grafana for the unified read pool and the txn scheduler pool. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- .../tikv_util/src/yatp_pool/future_pool.rs | 12 +- components/tikv_util/src/yatp_pool/metrics.rs | 8 +- components/tikv_util/src/yatp_pool/mod.rs | 59 ++- metrics/grafana/tikv_details.json | 342 +++++++++++++++++- 5 files changed, 401 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 96b637fdc43..080a1ccc35f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7047,7 +7047,7 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#5f3d58002b383bfd0014e271ae58261ecc072de3" +source = "git+https://github.com/tikv/yatp.git?branch=master#2f5f6e47ba6fce8d55e7a57b7ee39a93bc0e8194" dependencies = [ "crossbeam-deque", "dashmap", diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index e2ee39e2616..0beca9a5dee 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -13,19 +13,17 @@ use std::{ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; -use prometheus::{Histogram, IntCounter, IntGauge}; +use prometheus::{IntCounter, IntGauge}; use yatp::task::future; pub type ThreadPool = yatp::ThreadPool; use super::metrics; -use crate::time::Instant; #[derive(Clone)] struct Env { metrics_running_task_count: IntGauge, metrics_handled_task_count: IntCounter, - metrics_pool_schedule_duration: Histogram, } #[derive(Clone)] @@ -49,8 +47,6 @@ impl FuturePool { .with_label_values(&[name]), metrics_handled_task_count: metrics::FUTUREPOOL_HANDLED_TASK_VEC .with_label_values(&[name]), - metrics_pool_schedule_duration: metrics::FUTUREPOOL_SCHEDULE_DURATION_VEC - .with_label_values(&[name]), }; 
FuturePool { inner: Arc::new(PoolInner { @@ -149,8 +145,6 @@ impl PoolInner { where F: Future + Send + 'static, { - let timer = Instant::now_coarse(); - let h_schedule = self.env.metrics_pool_schedule_duration.clone(); let metrics_handled_task_count = self.env.metrics_handled_task_count.clone(); let metrics_running_task_count = self.env.metrics_running_task_count.clone(); @@ -159,7 +153,6 @@ impl PoolInner { metrics_running_task_count.inc(); self.pool.spawn(async move { - h_schedule.observe(timer.saturating_elapsed_secs()); let _ = future.await; metrics_handled_task_count.inc(); metrics_running_task_count.dec(); @@ -175,8 +168,6 @@ impl PoolInner { F: Future + Send + 'static, F::Output: Send, { - let timer = Instant::now_coarse(); - let h_schedule = self.env.metrics_pool_schedule_duration.clone(); let metrics_handled_task_count = self.env.metrics_handled_task_count.clone(); let metrics_running_task_count = self.env.metrics_running_task_count.clone(); @@ -185,7 +176,6 @@ impl PoolInner { let (tx, rx) = oneshot::channel(); metrics_running_task_count.inc(); self.pool.spawn(async move { - h_schedule.observe(timer.saturating_elapsed_secs()); let res = future.await; metrics_handled_task_count.inc(); metrics_running_task_count.dec(); diff --git a/components/tikv_util/src/yatp_pool/metrics.rs b/components/tikv_util/src/yatp_pool/metrics.rs index a472a6e000b..8ae1aa8910e 100644 --- a/components/tikv_util/src/yatp_pool/metrics.rs +++ b/components/tikv_util/src/yatp_pool/metrics.rs @@ -16,11 +16,11 @@ lazy_static! 
{ &["name"] ) .unwrap(); - pub static ref FUTUREPOOL_SCHEDULE_DURATION_VEC: HistogramVec = register_histogram_vec!( - "tikv_futurepool_schedule_duration", - "Histogram of future_pool handle duration.", + pub static ref YATP_POOL_SCHEDULE_WAIT_DURATION_VEC: HistogramVec = register_histogram_vec!( + "tikv_yatp_pool_schedule_wait_duration", + "Histogram of yatp pool schedule wait duration.", &["name"], - exponential_buckets(0.0005, 2.0, 15).unwrap() + exponential_buckets(1e-5, 4.0, 12).unwrap() // 10us ~ 41s ) .unwrap(); } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 023e39b1e67..93cd46cc6ac 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -2,13 +2,15 @@ mod future_pool; mod metrics; + use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; +use prometheus::Histogram; use yatp::{ pool::{CloneRunnerBuilder, Local, Runner}, - queue::{multilevel, QueueType}, + queue::{multilevel, QueueType, TaskCell as _}, task::future::{Runner as FutureRunner, TaskCell}, ThreadPool, }; @@ -89,6 +91,9 @@ pub struct YatpPoolRunner { after_start: Option>, before_stop: Option>, before_pause: Option>, + + // Statistics about the schedule wait duration. 
+ schedule_wait_duration: Histogram, } impl Runner for YatpPoolRunner { @@ -105,7 +110,12 @@ impl Runner for YatpPoolRunner { tikv_alloc::add_thread_memory_accessor() } - fn handle(&mut self, local: &mut Local, task_cell: Self::TaskCell) -> bool { + fn handle(&mut self, local: &mut Local, mut task_cell: Self::TaskCell) -> bool { + let extras = task_cell.mut_extras(); + if let Some(schedule_time) = extras.schedule_time() { + self.schedule_wait_duration + .observe(schedule_time.elapsed().as_secs_f64()); + } let finished = self.inner.handle(local, task_cell); self.ticker.try_tick(); finished @@ -139,6 +149,7 @@ impl YatpPoolRunner { after_start: Option>, before_stop: Option>, before_pause: Option>, + schedule_wait_duration: Histogram, ) -> Self { YatpPoolRunner { inner, @@ -147,6 +158,7 @@ impl YatpPoolRunner { after_start, before_stop, before_pause, + schedule_wait_duration, } } } @@ -265,9 +277,8 @@ impl YatpPoolBuilder { } fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { - let mut builder = yatp::Builder::new(thd_name!( - self.name_prefix.clone().unwrap_or_else(|| "".to_string()) - )); + let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + let mut builder = yatp::Builder::new(thd_name!(name)); builder .stack_size(self.stack_size) .min_thread_count(self.min_thread_count) @@ -277,13 +288,51 @@ impl YatpPoolBuilder { let after_start = self.after_start.take(); let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); + let schedule_wait_duration = + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), after_start, before_stop, before_pause, + schedule_wait_duration, ); (builder, read_pool_runner) } } + +#[cfg(test)] +mod tests { + use std::sync::mpsc; + + use futures::compat::Future01CompatExt; + + use super::*; + use crate::timer::GLOBAL_TIMER_HANDLE; + + #[test] + fn test_record_schedule_wait_duration() { 
+ let name = "test_record_schedule_wait_duration"; + let pool = YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix(name) + .build_single_level_pool(); + let (tx, rx) = mpsc::channel(); + for _ in 0..3 { + let tx = tx.clone(); + pool.spawn(async move { + GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + Duration::from_millis(100)) + .compat() + .await + .unwrap(); + tx.send(()).unwrap(); + }); + } + for _ in 0..3 { + rx.recv().unwrap(); + } + let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); + } +} diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 97619803256..050a6727622 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -4454,6 +4454,206 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 23763572581, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, name))", + "hide": false, + "interval": "", + 
"legendFormat": "{{name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "99% Thread Pool Schedule Wait Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:150", + "format": "s", + "label": null, + "logBase": 2, + "max": "30", + "min": null, + "show": true + }, + { + "$$hashKey": "object:151", + "format": "short", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "hiddenSeries": false, + "id": 23763572692, + "legend": { + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_yatp_pool_schedule_wait_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name) / sum(rate(tikv_yatp_pool_schedule_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "hide": false, + "interval": "", + "legendFormat": 
"{{name}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Thread Pool Schedule Wait Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:150", + "format": "s", + "label": null, + "logBase": 2, + "max": "30", + "min": null, + "show": true + }, + { + "$$hashKey": "object:151", + "format": "short", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, @@ -16537,6 +16737,76 @@ "align": false, "alignLevel": null } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572469, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified-read.*\"}[1m])", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Unified Read Pool Wait Duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + 
"xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null } ], "title": "Unified Read Pool", @@ -19666,6 +19936,76 @@ "align": false, "alignLevel": null } + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 45 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572468, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker.*\"}[1m])", + "format": "heatmap", + "interval": "", + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Txn Scheduler Pool Wait Duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null } ], "repeat": null, @@ -42014,4 +42354,4 @@ "title": "Test-Cluster-TiKV-Details", "uid": "RDVQiEzZz", "version": 1 -} \ No newline at end of file +} From 2a508a583c52bdd40fd84630e094debd5b04e623 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 2 Jun 2022 10:32:27 +0800 
Subject: [PATCH 010/676] engine: only override write stall configurations if unspecified (#12127) ref tikv/tikv#11424, ref tikv/tikv#11840 Signed-off-by: tabokie --- etc/config-template.toml | 20 +- src/config.rs | 311 +++++++++++++++++++++++-------- tests/integrations/config/mod.rs | 40 ++-- 3 files changed, 268 insertions(+), 103 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index f301c553167..1e673fbc3fa 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -693,7 +693,7 @@ ## Maximum number of level-0 files. ## When the number of SST files of level-0 reaches the limit of `level0-stop-writes-trigger`, ## RocksDB stalls the new write operation. -# level0-stop-writes-trigger = 36 +# level0-stop-writes-trigger = 20 ## Amount of data to build up in memory (backed by an unsorted log on disk) before converting to a ## sorted on-disk file. It is the RocksDB MemTable size. @@ -745,6 +745,12 @@ ## "min-overlapping-ratio" # compaction-pri = "min-overlapping-ratio" +## Refer to storage.flow-control.soft-pending-compaction-bytes-limit. +# soft-pending-compaction-bytes-limit = "192GB" + +## Refer to storage.flow-control.hard-pending-compaction-bytes-limit. +# hard-pending-compaction-bytes-limit = "1000GB" + ## Indicating if we'd put index/filter blocks to the block cache. ## If not specified, each "table reader" object will pre-load index/filter block during table ## initialization. 
@@ -859,10 +865,12 @@ # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 -# level0-stop-writes-trigger = 36 +# level0-stop-writes-trigger = 20 # cache-index-and-filter-blocks = true # pin-l0-filter-and-index-blocks = true # compaction-pri = "min-overlapping-ratio" +# soft-pending-compaction-bytes-limit = "192GB" +# hard-pending-compaction-bytes-limit = "1000GB" # read-amp-bytes-per-bit = 0 # dynamic-level-bytes = true # optimize-filters-for-hits = false @@ -880,10 +888,12 @@ # target-file-size-base = "8MB" # level0-file-num-compaction-trigger = 1 # level0-slowdown-writes-trigger = 20 -# level0-stop-writes-trigger = 36 +# level0-stop-writes-trigger = 20 # cache-index-and-filter-blocks = true # pin-l0-filter-and-index-blocks = true # compaction-pri = "by-compensated-size" +# soft-pending-compaction-bytes-limit = "192GB" +# hard-pending-compaction-bytes-limit = "1000GB" # read-amp-bytes-per-bit = 0 # dynamic-level-bytes = true # optimize-filters-for-hits = false @@ -937,10 +947,12 @@ # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 -# level0-stop-writes-trigger = 36 +# level0-stop-writes-trigger = 20 # cache-index-and-filter-blocks = true # pin-l0-filter-and-index-blocks = true # compaction-pri = "by-compensated-size" +# soft-pending-compaction-bytes-limit = "192GB" +# hard-pending-compaction-bytes-limit = "1000GB" # read-amp-bytes-per-bit = 0 # dynamic-level-bytes = true # optimize-filters-for-hits = true diff --git a/src/config.rs b/src/config.rs index 3908cdc9eac..627901481d1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -278,8 +278,8 @@ macro_rules! 
cf_config { pub max_bytes_for_level_base: ReadableSize, pub target_file_size_base: ReadableSize, pub level0_file_num_compaction_trigger: i32, - pub level0_slowdown_writes_trigger: i32, - pub level0_stop_writes_trigger: i32, + pub level0_slowdown_writes_trigger: Option, + pub level0_stop_writes_trigger: Option, pub max_compaction_bytes: ReadableSize, #[serde(with = "rocks_config::compaction_pri_serde")] #[online_config(skip)] @@ -294,8 +294,8 @@ macro_rules! cf_config { pub compaction_style: DBCompactionStyle, pub disable_auto_compactions: bool, pub disable_write_stall: bool, - pub soft_pending_compaction_bytes_limit: ReadableSize, - pub hard_pending_compaction_bytes_limit: ReadableSize, + pub soft_pending_compaction_bytes_limit: Option, + pub hard_pending_compaction_bytes_limit: Option, #[online_config(skip)] pub force_consistency_checks: bool, #[online_config(skip)] @@ -396,10 +396,14 @@ macro_rules! write_into_metrics { .set($cf.level0_file_num_compaction_trigger.into()); $metrics .with_label_values(&[$tag, "level0_slowdown_writes_trigger"]) - .set($cf.level0_slowdown_writes_trigger.into()); + .set( + $cf.level0_slowdown_writes_trigger + .unwrap_or_default() + .into(), + ); $metrics .with_label_values(&[$tag, "level0_stop_writes_trigger"]) - .set($cf.level0_stop_writes_trigger.into()); + .set($cf.level0_stop_writes_trigger.unwrap_or_default().into()); $metrics .with_label_values(&[$tag, "max_compaction_bytes"]) .set($cf.max_compaction_bytes.0 as f64); @@ -421,10 +425,18 @@ macro_rules! 
write_into_metrics { .set(($cf.disable_write_stall as i32).into()); $metrics .with_label_values(&[$tag, "soft_pending_compaction_bytes_limit"]) - .set($cf.soft_pending_compaction_bytes_limit.0 as f64); + .set( + $cf.soft_pending_compaction_bytes_limit + .unwrap_or_default() + .0 as f64, + ); $metrics .with_label_values(&[$tag, "hard_pending_compaction_bytes_limit"]) - .set($cf.hard_pending_compaction_bytes_limit.0 as f64); + .set( + $cf.hard_pending_compaction_bytes_limit + .unwrap_or_default() + .0 as f64, + ); $metrics .with_label_values(&[$tag, "force_consistency_checks"]) .set(($cf.force_consistency_checks as i32).into()); @@ -500,8 +512,12 @@ macro_rules! build_cf_opt { cf_opts.set_max_bytes_for_level_base($opt.max_bytes_for_level_base.0); cf_opts.set_target_file_size_base($opt.target_file_size_base.0); cf_opts.set_level_zero_file_num_compaction_trigger($opt.level0_file_num_compaction_trigger); - cf_opts.set_level_zero_slowdown_writes_trigger($opt.level0_slowdown_writes_trigger); - cf_opts.set_level_zero_stop_writes_trigger($opt.level0_stop_writes_trigger); + cf_opts.set_level_zero_slowdown_writes_trigger( + $opt.level0_slowdown_writes_trigger.unwrap_or_default(), + ); + cf_opts.set_level_zero_stop_writes_trigger( + $opt.level0_stop_writes_trigger.unwrap_or_default(), + ); cf_opts.set_max_compaction_bytes($opt.max_compaction_bytes.0); cf_opts.compaction_priority($opt.compaction_pri); cf_opts.set_level_compaction_dynamic_level_bytes($opt.dynamic_level_bytes); @@ -509,8 +525,16 @@ macro_rules! 
build_cf_opt { cf_opts.set_compaction_style($opt.compaction_style); cf_opts.set_disable_auto_compactions($opt.disable_auto_compactions); cf_opts.set_disable_write_stall($opt.disable_write_stall); - cf_opts.set_soft_pending_compaction_bytes_limit($opt.soft_pending_compaction_bytes_limit.0); - cf_opts.set_hard_pending_compaction_bytes_limit($opt.hard_pending_compaction_bytes_limit.0); + cf_opts.set_soft_pending_compaction_bytes_limit( + $opt.soft_pending_compaction_bytes_limit + .unwrap_or_default() + .0, + ); + cf_opts.set_hard_pending_compaction_bytes_limit( + $opt.hard_pending_compaction_bytes_limit + .unwrap_or_default() + .0, + ); cf_opts.set_optimize_filters_for_hits($opt.optimize_filters_for_hits); cf_opts.set_force_consistency_checks($opt.force_consistency_checks); if $opt.enable_doubly_skiplist { @@ -567,8 +591,8 @@ impl Default for DefaultCfConfig { max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), level0_file_num_compaction_trigger: 4, - level0_slowdown_writes_trigger: 20, - level0_stop_writes_trigger: 36, + level0_slowdown_writes_trigger: None, + level0_stop_writes_trigger: None, max_compaction_bytes: ReadableSize::gb(2), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -577,8 +601,8 @@ impl Default for DefaultCfConfig { compaction_style: DBCompactionStyle::Level, disable_auto_compactions: false, disable_write_stall: false, - soft_pending_compaction_bytes_limit: ReadableSize::gb(192), - hard_pending_compaction_bytes_limit: ReadableSize::gb(256), + soft_pending_compaction_bytes_limit: None, + hard_pending_compaction_bytes_limit: None, force_consistency_checks: false, prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, @@ -676,8 +700,8 @@ impl Default for WriteCfConfig { max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), level0_file_num_compaction_trigger: 4, - 
level0_slowdown_writes_trigger: 20, - level0_stop_writes_trigger: 36, + level0_slowdown_writes_trigger: None, + level0_stop_writes_trigger: None, max_compaction_bytes: ReadableSize::gb(2), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -686,8 +710,8 @@ impl Default for WriteCfConfig { compaction_style: DBCompactionStyle::Level, disable_auto_compactions: false, disable_write_stall: false, - soft_pending_compaction_bytes_limit: ReadableSize::gb(192), - hard_pending_compaction_bytes_limit: ReadableSize::gb(256), + soft_pending_compaction_bytes_limit: None, + hard_pending_compaction_bytes_limit: None, force_consistency_checks: false, prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, @@ -771,8 +795,8 @@ impl Default for LockCfConfig { max_bytes_for_level_base: ReadableSize::mb(128), target_file_size_base: ReadableSize::mb(8), level0_file_num_compaction_trigger: 1, - level0_slowdown_writes_trigger: 20, - level0_stop_writes_trigger: 36, + level0_slowdown_writes_trigger: None, + level0_stop_writes_trigger: None, max_compaction_bytes: ReadableSize::gb(2), compaction_pri: CompactionPriority::ByCompensatedSize, dynamic_level_bytes: true, @@ -781,8 +805,8 @@ impl Default for LockCfConfig { compaction_style: DBCompactionStyle::Level, disable_auto_compactions: false, disable_write_stall: false, - soft_pending_compaction_bytes_limit: ReadableSize::gb(192), - hard_pending_compaction_bytes_limit: ReadableSize::gb(256), + soft_pending_compaction_bytes_limit: None, + hard_pending_compaction_bytes_limit: None, force_consistency_checks: false, prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, @@ -844,8 +868,8 @@ impl Default for RaftCfConfig { max_bytes_for_level_base: ReadableSize::mb(128), target_file_size_base: ReadableSize::mb(8), level0_file_num_compaction_trigger: 1, - level0_slowdown_writes_trigger: 
20, - level0_stop_writes_trigger: 36, + level0_slowdown_writes_trigger: None, + level0_stop_writes_trigger: None, max_compaction_bytes: ReadableSize::gb(2), compaction_pri: CompactionPriority::ByCompensatedSize, dynamic_level_bytes: true, @@ -854,8 +878,8 @@ impl Default for RaftCfConfig { compaction_style: DBCompactionStyle::Level, disable_auto_compactions: false, disable_write_stall: false, - soft_pending_compaction_bytes_limit: ReadableSize::gb(192), - hard_pending_compaction_bytes_limit: ReadableSize::gb(256), + soft_pending_compaction_bytes_limit: None, + hard_pending_compaction_bytes_limit: None, force_consistency_checks: false, prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, @@ -1207,8 +1231,8 @@ impl Default for RaftDefaultCfConfig { max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), level0_file_num_compaction_trigger: 4, - level0_slowdown_writes_trigger: 20, - level0_stop_writes_trigger: 36, + level0_slowdown_writes_trigger: None, + level0_stop_writes_trigger: None, max_compaction_bytes: ReadableSize::gb(2), compaction_pri: CompactionPriority::ByCompensatedSize, dynamic_level_bytes: true, @@ -1217,8 +1241,8 @@ impl Default for RaftDefaultCfConfig { compaction_style: DBCompactionStyle::Level, disable_auto_compactions: false, disable_write_stall: false, - soft_pending_compaction_bytes_limit: ReadableSize::gb(192), - hard_pending_compaction_bytes_limit: ReadableSize::gb(256), + soft_pending_compaction_bytes_limit: None, + hard_pending_compaction_bytes_limit: None, force_consistency_checks: false, prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, @@ -2890,58 +2914,81 @@ impl TiKvConfig { self.causal_ts.validate()?; if self.storage.flow_control.enable { - // using raftdb write stall to control memtables as a safety net - self.raftdb.defaultcf.level0_slowdown_writes_trigger = 
10000; - self.raftdb.defaultcf.level0_stop_writes_trigger = 10000; - self.raftdb.defaultcf.soft_pending_compaction_bytes_limit = ReadableSize(0); - self.raftdb.defaultcf.hard_pending_compaction_bytes_limit = ReadableSize(0); - - // disable kvdb write stall, and override related configs self.rocksdb.defaultcf.disable_write_stall = true; - self.rocksdb.defaultcf.level0_slowdown_writes_trigger = - self.storage.flow_control.l0_files_threshold as i32; - self.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = self - .storage - .flow_control - .soft_pending_compaction_bytes_limit; - self.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = self - .storage - .flow_control - .hard_pending_compaction_bytes_limit; self.rocksdb.writecf.disable_write_stall = true; - self.rocksdb.writecf.level0_slowdown_writes_trigger = - self.storage.flow_control.l0_files_threshold as i32; - self.rocksdb.writecf.soft_pending_compaction_bytes_limit = self - .storage - .flow_control - .soft_pending_compaction_bytes_limit; - self.rocksdb.writecf.hard_pending_compaction_bytes_limit = self - .storage - .flow_control - .hard_pending_compaction_bytes_limit; self.rocksdb.lockcf.disable_write_stall = true; - self.rocksdb.lockcf.level0_slowdown_writes_trigger = - self.storage.flow_control.l0_files_threshold as i32; - self.rocksdb.lockcf.soft_pending_compaction_bytes_limit = self - .storage - .flow_control - .soft_pending_compaction_bytes_limit; - self.rocksdb.lockcf.hard_pending_compaction_bytes_limit = self - .storage - .flow_control - .hard_pending_compaction_bytes_limit; self.rocksdb.raftcf.disable_write_stall = true; - self.rocksdb.raftcf.level0_slowdown_writes_trigger = - self.storage.flow_control.l0_files_threshold as i32; - self.rocksdb.raftcf.soft_pending_compaction_bytes_limit = self - .storage - .flow_control - .soft_pending_compaction_bytes_limit; - self.rocksdb.raftcf.hard_pending_compaction_bytes_limit = self - .storage - .flow_control - .hard_pending_compaction_bytes_limit; } + // 
Fill in values for unspecified write stall configurations. + macro_rules! fill_cf_opts { + ($cf_opts:expr, $cfg:expr) => { + if let Some(v) = &mut $cf_opts.level0_slowdown_writes_trigger { + if $cfg.enable && *v > $cfg.l0_files_threshold as i32 { + warn!( + "{}.level0-slowdown-writes-trigger is too large. Setting it to \ + storage.flow-control.l0-files-threshold ({})", + stringify!($cf_opts), $cfg.l0_files_threshold + ); + *v = $cfg.l0_files_threshold as i32; + } + } else { + $cf_opts.level0_slowdown_writes_trigger = + Some($cfg.l0_files_threshold as i32); + } + if let Some(v) = &mut $cf_opts.level0_stop_writes_trigger { + if $cfg.enable && *v > $cfg.l0_files_threshold as i32 { + warn!( + "{}.level0-stop-writes-trigger is too large. Setting it to \ + storage.flow-control.l0-files-threshold ({})", + stringify!($cf_opts), $cfg.l0_files_threshold + ); + *v = $cfg.l0_files_threshold as i32; + } + } else { + $cf_opts.level0_stop_writes_trigger = + Some($cfg.l0_files_threshold as i32); + } + if let Some(v) = &mut $cf_opts.soft_pending_compaction_bytes_limit { + if $cfg.enable && v.0 > $cfg.soft_pending_compaction_bytes_limit.0 { + warn!( + "{}.soft-pending-compaction-bytes-limit is too large. Setting it to \ + storage.flow-control.soft-pending-compaction-bytes-limit ({})", + stringify!($cf_opts), $cfg.soft_pending_compaction_bytes_limit.0 + ); + *v = $cfg.soft_pending_compaction_bytes_limit; + } + } else { + $cf_opts.soft_pending_compaction_bytes_limit = + Some($cfg.soft_pending_compaction_bytes_limit); + } + if let Some(v) = &mut $cf_opts.hard_pending_compaction_bytes_limit { + if $cfg.enable && v.0 > $cfg.hard_pending_compaction_bytes_limit.0 { + warn!( + "{}.hard-pending-compaction-bytes-limit is too large. 
Setting it to \ + storage.flow-control.hard-pending-compaction-bytes-limit ({})", + stringify!($cf_opts), $cfg.hard_pending_compaction_bytes_limit.0 + ); + *v = $cfg.hard_pending_compaction_bytes_limit; + } + } else { + $cf_opts.hard_pending_compaction_bytes_limit = + Some($cfg.hard_pending_compaction_bytes_limit); + } + }; + } + let flow_control_cfg = if self.storage.flow_control.enable { + self.storage.flow_control.clone() + } else { + crate::storage::config::FlowControlConfig { + enable: false, + ..Default::default() + } + }; + fill_cf_opts!(self.raftdb.defaultcf, flow_control_cfg); + fill_cf_opts!(self.rocksdb.defaultcf, flow_control_cfg); + fill_cf_opts!(self.rocksdb.writecf, flow_control_cfg); + fill_cf_opts!(self.rocksdb.lockcf, flow_control_cfg); + fill_cf_opts!(self.rocksdb.raftcf, flow_control_cfg); if let Some(memory_usage_limit) = self.memory_usage_limit { let total = SysQuota::memory_limit_in_bytes(); @@ -5007,6 +5054,26 @@ mod tests { cfg.memory_usage_limit = None; cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. 
+ cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.writecf.level0_stop_writes_trigger = None; + cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; + cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; + cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; + cfg.raftdb.defaultcf.level0_slowdown_writes_trigger = None; + cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; + cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; + cfg.raftdb.defaultcf.hard_pending_compaction_bytes_limit = None; assert_eq!(cfg, default_cfg); } @@ -5224,4 +5291,90 @@ mod tests { assert_eq!(serde_to_online_config(name.into()).as_str(), res); } } + + #[test] + fn test_flow_control_override() { + let content = r#" + [storage.flow-control] + enable = true + l0-files-threshold = 77 + soft-pending-compaction-bytes-limit = "777GB" + "#; + let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!( + cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, + Some(77) + ); + assert_eq!( + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit, + Some(ReadableSize::gb(777)) + ); + + // Override with default values if flow control is disabled. 
+ let content = r#" + [storage.flow-control] + enable = false + l0-files-threshold = 77 + soft-pending-compaction-bytes-limit = "777GB" + [rocksdb.defaultcf] + level0-slowdown-writes-trigger = 888 + soft-pending-compaction-bytes-limit = "888GB" + [rocksdb.writecf] + "#; + let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!( + cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, + Some(888) + ); + assert_eq!( + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit, + Some(ReadableSize::gb(888)) + ); + matches!(cfg.rocksdb.writecf.level0_slowdown_writes_trigger, Some(v) if v != 77); + matches!(cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit, Some(v) if v != ReadableSize::gb(777)); + + // Do not override when RocksDB configurations are specified. + let content = r#" + [storage.flow-control] + enable = true + l0-files-threshold = 77 + soft-pending-compaction-bytes-limit = "777GB" + [rocksdb.defaultcf] + level0-slowdown-writes-trigger = 66 + soft-pending-compaction-bytes-limit = "666GB" + "#; + let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!( + cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, + Some(66) + ); + assert_eq!( + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit, + Some(ReadableSize::gb(666)) + ); + + // Cannot specify larger configurations for RocksDB. 
+ let content = r#" + [storage.flow-control] + enable = true + l0-files-threshold = 1 + soft-pending-compaction-bytes-limit = "1GB" + [rocksdb.defaultcf] + level0-slowdown-writes-trigger = 88 + soft-pending-compaction-bytes-limit = "888GB" + "#; + let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert_eq!( + cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, + Some(1) + ); + assert_eq!( + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit, + Some(ReadableSize::gb(1)) + ); + } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index cbd695191d8..3bd932262e5 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -324,8 +324,8 @@ fn test_serde_custom_tikv_config() { max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), level0_file_num_compaction_trigger: 123, - level0_slowdown_writes_trigger: 123, - level0_stop_writes_trigger: 123, + level0_slowdown_writes_trigger: Some(123), + level0_stop_writes_trigger: Some(123), max_compaction_bytes: ReadableSize::gb(1), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -334,8 +334,8 @@ fn test_serde_custom_tikv_config() { compaction_style: DBCompactionStyle::Universal, disable_auto_compactions: true, disable_write_stall: true, - soft_pending_compaction_bytes_limit: ReadableSize::gb(12), - hard_pending_compaction_bytes_limit: ReadableSize::gb(12), + soft_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), + hard_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), force_consistency_checks: true, titan: titan_cf_config.clone(), prop_size_index_distance: 4000000, @@ -375,8 +375,8 @@ fn test_serde_custom_tikv_config() { max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), level0_file_num_compaction_trigger: 123, - level0_slowdown_writes_trigger: 123, - level0_stop_writes_trigger: 123, + 
level0_slowdown_writes_trigger: Some(123), + level0_stop_writes_trigger: Some(123), max_compaction_bytes: ReadableSize::gb(1), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -385,8 +385,8 @@ fn test_serde_custom_tikv_config() { compaction_style: DBCompactionStyle::Universal, disable_auto_compactions: true, disable_write_stall: true, - soft_pending_compaction_bytes_limit: ReadableSize::gb(12), - hard_pending_compaction_bytes_limit: ReadableSize::gb(12), + soft_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), + hard_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value @@ -440,8 +440,8 @@ fn test_serde_custom_tikv_config() { max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), level0_file_num_compaction_trigger: 123, - level0_slowdown_writes_trigger: 123, - level0_stop_writes_trigger: 123, + level0_slowdown_writes_trigger: Some(123), + level0_stop_writes_trigger: Some(123), max_compaction_bytes: ReadableSize::gb(1), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -450,8 +450,8 @@ fn test_serde_custom_tikv_config() { compaction_style: DBCompactionStyle::Universal, disable_auto_compactions: true, disable_write_stall: true, - soft_pending_compaction_bytes_limit: ReadableSize::gb(12), - hard_pending_compaction_bytes_limit: ReadableSize::gb(12), + soft_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), + hard_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value @@ -505,8 +505,8 @@ fn test_serde_custom_tikv_config() { max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), level0_file_num_compaction_trigger: 123, - level0_slowdown_writes_trigger: 123, - 
level0_stop_writes_trigger: 123, + level0_slowdown_writes_trigger: Some(123), + level0_stop_writes_trigger: Some(123), max_compaction_bytes: ReadableSize::gb(1), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -515,8 +515,8 @@ fn test_serde_custom_tikv_config() { compaction_style: DBCompactionStyle::Universal, disable_auto_compactions: true, disable_write_stall: true, - soft_pending_compaction_bytes_limit: ReadableSize::gb(12), - hard_pending_compaction_bytes_limit: ReadableSize::gb(12), + soft_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), + hard_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), force_consistency_checks: true, titan: TitanCfConfig { min_blob_size: ReadableSize(1024), // default value @@ -599,8 +599,8 @@ fn test_serde_custom_tikv_config() { max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), level0_file_num_compaction_trigger: 123, - level0_slowdown_writes_trigger: 123, - level0_stop_writes_trigger: 123, + level0_slowdown_writes_trigger: Some(123), + level0_stop_writes_trigger: Some(123), max_compaction_bytes: ReadableSize::gb(1), compaction_pri: CompactionPriority::MinOverlappingRatio, dynamic_level_bytes: true, @@ -609,8 +609,8 @@ fn test_serde_custom_tikv_config() { compaction_style: DBCompactionStyle::Universal, disable_auto_compactions: true, disable_write_stall: true, - soft_pending_compaction_bytes_limit: ReadableSize::gb(12), - hard_pending_compaction_bytes_limit: ReadableSize::gb(12), + soft_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), + hard_pending_compaction_bytes_limit: Some(ReadableSize::gb(12)), force_consistency_checks: true, titan: titan_cf_config, prop_size_index_distance: 4000000, From da16e5eca3e44f8a987acdd6aad31acad3f5c05e Mon Sep 17 00:00:00 2001 From: Liqi Geng Date: Thu, 2 Jun 2022 14:18:28 +0800 Subject: [PATCH 011/676] copr: fix a wrong check in time parsing (#12740) ref tikv/tikv#12739, close 
tikv/tikv#12739 See #12739 Signed-off-by: gengliqi --- .../src/codec/mysql/time/mod.rs | 58 ++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 5d3222d0f3b..29b66725e2a 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -489,7 +489,7 @@ mod parser { } // the following statement checks fsp ((components.len() != 7 && components.len() != 2) - || input.as_bytes()[input.len() - components.last().unwrap().len() - 1] == b'.') + || (separators.len() >= components.len() - 1 /* should always true */ && separators[components.len() - 2] == b".")) .as_option()?; Some((components, if has_tz { Some(tz_offset) } else { None })) @@ -2237,6 +2237,25 @@ mod tests { ("2020-12-23 07:59:23", "2020-12-23 15:59:23+0800", 0, false), ("2020-12-23 23:59:23", "2020-12-23 15:59:23-08", 0, false), ("2020-12-23 07:59:23", "2020-12-23 15:59:23+08:00", 0, false), + ("2022-06-02 11:59:30", "2022-06-02 11:59:30.123Z", 0, false), + ( + "2022-06-02 03:59:30", + "2022-06-02 11:59:30.123+0800", + 0, + false, + ), + ( + "2022-06-02 19:59:30", + "2022-06-02 11:59:30.123-08", + 0, + false, + ), + ( + "2022-06-02 03:29:30", + "2022-06-02 11:59:30.123+08:30", + 0, + false, + ), ]; for (expected, actual, fsp, round) in cases { assert_eq!( @@ -2400,6 +2419,43 @@ mod tests { r: Some("2020-10-10 10:10:10.000000"), tp: TimeType::Timestamp, }, + Case { + tz: "+08:00", + t: "2022-06-02T10:10:10Z", + r: Some("2022-06-02 18:10:10.000000"), + tp: TimeType::DateTime, + }, + Case { + tz: "-08:00", + t: "2022-06-02T10:10:10Z", + r: Some("2022-06-02 02:10:10.000000"), + tp: TimeType::DateTime, + }, + Case { + tz: "+06:30", + t: "2022-06-02T10:10:10-05:00", + r: Some("2022-06-02 21:40:10.000000"), + tp: TimeType::DateTime, + }, + // Time with fraction + Case { + tz: "+08:00", + t: 
"2022-06-02T10:10:10.123Z", + r: Some("2022-06-02 18:10:10.123000"), + tp: TimeType::DateTime, + }, + Case { + tz: "-08:00", + t: "2022-06-02T10:10:10.123Z", + r: Some("2022-06-02 02:10:10.123000"), + tp: TimeType::DateTime, + }, + Case { + tz: "+06:30", + t: "2022-06-02T10:10:10.654321-05:00", + r: Some("2022-06-02 21:40:10.654321"), + tp: TimeType::DateTime, + }, ]; let mut result: Vec> = vec![]; for Case { tz, t, r: _, tp } in &cases { From 033d62d7f7b65d1edcd6da8cd70acee7041eefaa Mon Sep 17 00:00:00 2001 From: ekexium Date: Mon, 6 Jun 2022 13:16:29 +0800 Subject: [PATCH 012/676] log details for PessimisitcLockNotFound in check_for_newer_version (#12713) ref tikv/tikv#11612 Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- src/storage/txn/actions/prewrite.rs | 8 ++++++++ src/storage/txn/commands/prewrite.rs | 6 +++--- src/storage/txn/scheduler.rs | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index de5270a6b10..a96c5eabc8d 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -352,8 +352,16 @@ impl<'a> PrewriteMutation<'a> { self.write_conflict_error(&write, commit_ts)?; } } + // Note: PessimisticLockNotFound can happen on a non-pessimistically locked key, + // if it is a retrying prewrite request. 
TransactionKind::Pessimistic(for_update_ts) => { if commit_ts > for_update_ts { + warn!("conflicting write was found, pessimistic lock must be lost for the corresponding row key"; + "key" => %self.key, + "start_ts" => self.txn_props.start_ts, + "for_update_ts" => for_update_ts, + "conflicting start_ts" => write.start_ts, + "conflicting commit_ts" => commit_ts); return Err(ErrorInner::PessimisticLockNotFound { start_ts: self.txn_props.start_ts, key: self.key.clone().into_raw()?, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index bb64c7641b8..4c2caec12b2 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -462,7 +462,7 @@ impl Prewriter { let mut final_min_commit_ts = TimeStamp::zero(); let mut locks = Vec::new(); - // Further check whether the prewrited transaction has been committed + // Further check whether the prewritten transaction has been committed // when encountering a WriteConflict or PessimisticLockNotFound error. // This extra check manages to make prewrite idempotent after the transaction // was committed. @@ -479,7 +479,7 @@ impl Prewriter { TxnCommitRecord::SingleRecord { commit_ts, write } if write.write_type != WriteType::Rollback => { - info!("prewrited transaction has been committed"; + info!("prewritten transaction has been committed"; "start_ts" => reader.start_ts, "commit_ts" => commit_ts, "key" => ?key, "write_type" => ?write.write_type); txn.clear(); @@ -943,7 +943,7 @@ mod tests { None, ) .unwrap(); - // All keys are prewrited successful with only one seek operations. + // All keys are prewritten successful with only one seek operations. 
assert_eq!(1, statistic.write.seek); let keys: Vec = mutations.iter().map(|m| m.key().clone()).collect(); commit(&engine, &mut statistic, keys.clone(), 104, 105).unwrap(); diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 60972dcfaec..3460a1de5fd 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -820,7 +820,7 @@ impl Scheduler { // error to the callback, and releases the latches. Err(err) => { SCHED_STAGE_COUNTER_VEC.get(tag).prepare_write_err.inc(); - debug!("write command failed at prewrite"; "cid" => cid, "err" => ?err); + debug!("write command failed"; "cid" => cid, "err" => ?err); scheduler.finish_with_err(cid, err); return; } From ba391ff506c8b7f4b0cd7c9ef0b9f04ce87c3d7e Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Mon, 6 Jun 2022 15:32:29 +0800 Subject: [PATCH 013/676] storage: measure the read duration for scheduler commands (#12716) ref tikv/tikv#12362 This commit measures the read duration spent on reading (e.g. write conflict checks). This fixes the missing part of scheduler commands and may help diagnosis when scheduler reading takes a long time. This commit also changes some now_coarse to now. now_coarse has a precision of 10ms on many systems, so it may not meet our precision requirement. Instant::now is fast enough to be called in these cases. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 143 +++++++++++++++++++++++++++++- src/storage/mod.rs | 56 ++++++------ src/storage/txn/sched_pool.rs | 16 ---- src/storage/txn/scheduler.rs | 21 +++-- 4 files changed, 182 insertions(+), 54 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 050a6727622..6192b4f3a5e 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -19400,6 +19400,145 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The time consumed on reading when executing commit command", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 63 + }, + "hiddenSeries": false, + "id": 23763572710, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99%", + 
"metric": "", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "95%", + "metric": "", + "refId": "B", + "step": 10 + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_scheduler_processing_read_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_processing_read_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"$command\"}[1m])) ", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "avg", + "metric": "", + "refId": "C", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Scheduler command read duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:95", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:96", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "cards": { "cardPadding": null, @@ -19422,8 +19561,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 60 + "x": 12, + "y": 63 }, "heatmap": {}, "hideZeroBuckets": true, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 802c35af020..9e778afe064 100644 --- a/src/storage/mod.rs +++ 
b/src/storage/mod.rs @@ -547,7 +547,7 @@ impl Storage { key: Key, start_ts: TimeStamp, ) -> impl Future, KvGetStatistics)>> { - let stage_begin_ts = Instant::now_coarse(); + let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::get; let priority = ctx.get_priority(); let priority_tag = get_priority_tag(priority); @@ -563,7 +563,7 @@ impl Storage { let res = self.read_pool.spawn_handle( async move { - let stage_scheduled_ts = Instant::now_coarse(); + let stage_scheduled_ts = Instant::now(); tls_collect_query( ctx.get_region_id(), ctx.get_peer(), @@ -580,7 +580,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [key.as_encoded()])?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); // The bypass_locks and access_locks set will be checked at most once. // `TsSet::vec` is more efficient here. @@ -598,7 +598,7 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let stage_snap_recv_ts = begin_instant; let buckets = snapshot.ext().get_buckets(); let mut statistics = Statistics::default(); @@ -656,7 +656,7 @@ impl Storage { .inc_by(quota_delay.as_micros() as u64); } - let stage_finished_ts = Instant::now_coarse(); + let stage_finished_ts = Instant::now(); let schedule_wait_time = stage_scheduled_ts.saturating_duration_since(stage_begin_ts); let snapshot_wait_time = @@ -724,7 +724,7 @@ impl Storage { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC .get(CMD) .observe(requests.len() as f64); - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let read_id = Some(ThreadReadId::new()); let mut statistics = Statistics::default(); let mut req_snaps = vec![]; @@ -871,7 +871,7 @@ impl Storage { keys: Vec, start_ts: TimeStamp, ) -> impl Future>, KvGetStatistics)>> { - 
let stage_begin_ts = Instant::now_coarse(); + let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::batch_get; let priority = ctx.get_priority(); let priority_tag = get_priority_tag(priority); @@ -888,7 +888,7 @@ impl Storage { let mut sample = quota_limiter.new_sample(); let res = self.read_pool.spawn_handle( async move { - let stage_scheduled_ts = Instant::now_coarse(); + let stage_scheduled_ts = Instant::now(); let mut key_ranges = vec![]; for key in &keys { key_ranges.push(build_key_range(key.as_encoded(), key.as_encoded(), false)); @@ -912,7 +912,7 @@ impl Storage { keys.iter().map(Key::as_encoded), )?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -928,7 +928,7 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let stage_snap_recv_ts = begin_instant; let mut statistics = Vec::with_capacity(keys.len()); @@ -999,7 +999,7 @@ impl Storage { .inc_by(quota_delay.as_micros() as u64); } - let stage_finished_ts = Instant::now_coarse(); + let stage_finished_ts = Instant::now(); let schedule_wait_time = stage_scheduled_ts.saturating_duration_since(stage_begin_ts); let snapshot_wait_time = @@ -1102,7 +1102,7 @@ impl Storage { if reverse_scan { std::mem::swap(&mut start_key, &mut end_key); } - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let bypass_locks = TsSet::from_u64s(ctx.take_resolved_locks()); let access_locks = TsSet::from_u64s(ctx.take_committed_locks()); @@ -1155,7 +1155,7 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; { - let begin_instant = Instant::now_coarse(); + 
let begin_instant = Instant::now(); let perf_statistics = ReadPerfInstant::new(); let buckets = snapshot.ext().get_buckets(); @@ -1266,7 +1266,7 @@ impl Storage { // Do not check_api_version in scan_lock, to be compatible with TiDB gc-worker, // which resolves locks on regions, and boundary of regions will be out of range of TiDB keys. - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); concurrency_manager.update_max_ts(max_ts); let begin_instant = Instant::now(); @@ -1305,7 +1305,7 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut statistics = Statistics::default(); let perf_statistics = ReadPerfInstant::new(); let buckets = snapshot.ext().get_buckets(); @@ -1481,7 +1481,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1492,7 +1492,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf(&cf, api_version)?; { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut stats = Statistics::default(); let key = F::encode_raw_key_owned(key, None); // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. 
@@ -1577,7 +1577,7 @@ impl Storage { .map_err(Error::from)?; } - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let read_id = Some(ThreadReadId::new()); let mut snaps = vec![]; for (mut req, id) in gets.into_iter().zip(ids) { @@ -1604,7 +1604,7 @@ impl Storage { snaps.push((id, key, ctx, req, snap)); } Self::with_tls_engine(|engine| engine.release_snapshot()); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); for (id, key, ctx, mut req, snap) in snaps { let cf = req.take_cf(); match snap.await { @@ -1684,7 +1684,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, &keys)?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -1694,7 +1694,7 @@ impl Storage { let buckets = snapshot.ext().get_buckets(); let store = RawStore::new(snapshot, api_version); { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let cf = Self::rawkv_cf(&cf, api_version)?; // no scan_count for this kind of op. 
@@ -2020,7 +2020,7 @@ impl Storage { [(Some(&start_key), end_key.as_ref())], )?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2031,7 +2031,7 @@ impl Storage { let cf = Self::rawkv_cf(&cf, api_version)?; { let store = RawStore::new(snapshot, api_version); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let start_key = F::encode_raw_key_owned(start_key, None); let end_key = end_key.map(|k| F::encode_raw_key_owned(k, None)); @@ -2155,7 +2155,7 @@ impl Storage { .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2297,7 +2297,7 @@ impl Storage { Self::check_api_version(api_version, ctx.api_version, CMD, [&key])?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2308,7 +2308,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf(&cf, api_version)?; { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut stats = Statistics::default(); let key = F::encode_raw_key_owned(key, None); // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. 
@@ -2462,7 +2462,7 @@ impl Storage { .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; - let command_duration = tikv_util::time::Instant::now_coarse(); + let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -2473,7 +2473,7 @@ impl Storage { let store = RawStore::new(snapshot, api_version); let cf = Self::rawkv_cf("", api_version)?; - let begin_instant = tikv_util::time::Instant::now_coarse(); + let begin_instant = tikv_util::time::Instant::now(); let mut stats = Vec::with_capacity(ranges.len()); let ret = store .raw_checksum_ranges(cf, &ranges, &mut stats) diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index d83d8fe6f46..12ff44bbd61 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -13,7 +13,6 @@ use prometheus::local::*; use raftstore::store::WriteStats; use tikv_util::{ sys::SysQuota, - time::Duration, yatp_pool::{FuturePool, PoolTicker, YatpPoolBuilder}, }; @@ -24,8 +23,6 @@ use crate::storage::{ pub struct SchedLocalMetrics { local_scan_details: HashMap<&'static str, Statistics>, - processing_read_duration: LocalHistogramVec, - processing_write_duration: LocalHistogramVec, command_keyread_histogram_vec: LocalHistogramVec, local_write_stats: WriteStats, } @@ -34,8 +31,6 @@ thread_local! 
{ static TLS_SCHED_METRICS: RefCell = RefCell::new( SchedLocalMetrics { local_scan_details: HashMap::default(), - processing_read_duration: SCHED_PROCESSING_READ_HISTOGRAM_VEC.local(), - processing_write_duration: SCHED_PROCESSING_WRITE_HISTOGRAM_VEC.local(), command_keyread_histogram_vec: KV_COMMAND_KEYREAD_HISTOGRAM_VEC.local(), local_write_stats:WriteStats::default(), } @@ -112,8 +107,6 @@ pub fn tls_flush(reporter: &R) { } } } - m.processing_read_duration.flush(); - m.processing_write_duration.flush(); m.command_keyread_histogram_vec.flush(); // Report PD metrics @@ -132,15 +125,6 @@ pub fn tls_collect_query(region_id: u64, kind: QueryKind) { }); } -pub fn tls_collect_read_duration(cmd: &str, duration: Duration) { - TLS_SCHED_METRICS.with(|m| { - m.borrow_mut() - .processing_read_duration - .with_label_values(&[cmd]) - .observe(tikv_util::time::duration_to_sec(duration)) - }); -} - pub fn tls_collect_keyread_histogram_vec(cmd: &str, count: f64) { TLS_SCHED_METRICS.with(|m| { m.borrow_mut() diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 3460a1de5fd..283787e9ba1 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -64,9 +64,7 @@ use crate::{ commands::{Command, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo}, flow_controller::FlowController, latch::{Latches, Lock}, - sched_pool::{ - tls_collect_query, tls_collect_read_duration, tls_collect_scan_details, SchedPool, - }, + sched_pool::{tls_collect_query, tls_collect_scan_details, SchedPool}, Error, ProcessResult, }, types::StorageCallback, @@ -733,8 +731,6 @@ impl Scheduler { tag, ts ); - - tls_collect_read_duration(tag.get_str(), elapsed); } .in_resource_metering_tag(resource_tag) .await; @@ -748,10 +744,14 @@ impl Scheduler { let tag = task.cmd.tag(); + let begin_instant = Instant::now(); let pr = task .cmd .process_read(snapshot, statistics) .unwrap_or_else(|e| ProcessResult::Failed { err: e.into() }); + 
SCHED_PROCESSING_READ_HISTOGRAM_STATIC + .get(tag) + .observe(begin_instant.saturating_elapsed_secs()); self.on_read_finished(task.cid, pr, tag); } @@ -782,10 +782,15 @@ impl Scheduler { statistics, async_apply_prewrite: self.inner.enable_async_apply_prewrite, }; - - task.cmd + let begin_instant = Instant::now(); + let res = task + .cmd .process_write(snapshot, context) - .map_err(StorageError::from) + .map_err(StorageError::from); + SCHED_PROCESSING_READ_HISTOGRAM_STATIC + .get(tag) + .observe(begin_instant.saturating_elapsed_secs()); + res }; if write_result.is_ok() { From 62545b0c5c854b4e42bf37d03dddfab2099ce20c Mon Sep 17 00:00:00 2001 From: haojinming Date: Tue, 7 Jun 2022 14:34:30 +0800 Subject: [PATCH 014/676] Reserve key space id encoding in backup convert (#12759) close tikv/tikv#12758 Signed-off-by: haojinming --- components/api_version/src/api_v1.rs | 24 ++++++----------- components/api_version/src/api_v1ttl.rs | 31 +++++++++------------ components/api_version/src/api_v2.rs | 27 ++++++++++++------- components/api_version/src/lib.rs | 36 ++++++++----------------- components/backup/src/endpoint.rs | 32 ++++++++++++++++------ components/backup/src/utils.rs | 14 +++++----- tests/integrations/backup/mod.rs | 16 ++++++++--- 7 files changed, 94 insertions(+), 86 deletions(-) diff --git a/components/api_version/src/api_v1.rs b/components/api_version/src/api_v1.rs index 9267d1397c7..5b980ea75f1 100644 --- a/components/api_version/src/api_v1.rs +++ b/components/api_version/src/api_v1.rs @@ -1,5 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
+use tikv_util::box_err; + use super::*; impl KvFormat for ApiV1 { @@ -43,28 +45,18 @@ impl KvFormat for ApiV1 { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => { - debug_assert_eq!(ApiV2::parse_key_mode(key), KeyMode::Raw); - let (mut user_key, _) = ApiV2::decode_raw_key(&Key::from_encoded_slice(key), true)?; - user_key.remove(0); // remove first byte `RAW_KEY_PREFIX` - Ok(Self::encode_raw_key_owned(user_key, None)) - } + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), // reject apiv2 -> apiv1 conversion } } fn convert_raw_user_key_range_version_from( src_api: ApiVersion, - mut start_key: Vec, - mut end_key: Vec, - ) -> (Vec, Vec) { + start_key: Vec, + end_key: Vec, + ) -> Result<(Vec, Vec)> { match src_api { - ApiVersion::V1 | ApiVersion::V1ttl => (start_key, end_key), - ApiVersion::V2 => { - // TODO: check raw key range after check_api_version_range is refactored. - start_key.remove(0); - end_key.remove(0); - (start_key, end_key) - } + ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), // reject apiv2 -> apiv1 conversion } } } diff --git a/components/api_version/src/api_v1ttl.rs b/components/api_version/src/api_v1ttl.rs index ce42a023273..65c7f569aa6 100644 --- a/components/api_version/src/api_v1ttl.rs +++ b/components/api_version/src/api_v1ttl.rs @@ -1,9 +1,12 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::Result; -use tikv_util::codec::{ - number::{self, NumberEncoder}, - Error, +use tikv_util::{ + box_err, + codec::{ + number::{self, NumberEncoder}, + Error, + }, }; use super::*; @@ -67,28 +70,18 @@ impl KvFormat for ApiV1Ttl { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => { - debug_assert_eq!(ApiV2::parse_key_mode(key), KeyMode::Raw); - let (mut user_key, _) = ApiV2::decode_raw_key(&Key::from_encoded_slice(key), true)?; - user_key.remove(0); // remove first byte `RAW_KEY_PREFIX` - Ok(Self::encode_raw_key_owned(user_key, None)) - } + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), // reject apiv2 -> apiv1ttl conversion } } fn convert_raw_user_key_range_version_from( src_api: ApiVersion, - mut start_key: Vec, - mut end_key: Vec, - ) -> (Vec, Vec) { + start_key: Vec, + end_key: Vec, + ) -> Result<(Vec, Vec)> { match src_api { - ApiVersion::V1 | ApiVersion::V1ttl => (start_key, end_key), - ApiVersion::V2 => { - // TODO: check raw key range after check_api_version_range is refactored. - start_key.remove(0); - end_key.remove(0); - (start_key, end_key) - } + ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), // reject apiv2 -> apiv1ttl conversion } } } diff --git a/components/api_version/src/api_v2.rs b/components/api_version/src/api_v2.rs index d12926cb39b..a8a177596ad 100644 --- a/components/api_version/src/api_v2.rs +++ b/components/api_version/src/api_v2.rs @@ -16,6 +16,8 @@ pub const RAW_KEY_PREFIX_END: u8 = RAW_KEY_PREFIX + 1; pub const TXN_KEY_PREFIX: u8 = b'x'; pub const TIDB_META_KEY_PREFIX: u8 = b'm'; pub const TIDB_TABLE_KEY_PREFIX: u8 = b't'; +pub const DEFAULT_KEY_SPACE_ID: [u8; 3] = [0, 0, 0]; // reserve 3 bytes for key space id. 
+pub const DEFAULT_KEY_SPACE_ID_END: [u8; 3] = [0, 0, 1]; pub const TIDB_RANGES: &[(&[u8], &[u8])] = &[ (&[TIDB_META_KEY_PREFIX], &[TIDB_META_KEY_PREFIX + 1]), @@ -182,9 +184,7 @@ impl KvFormat for ApiV2 { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => { - let mut apiv2_key = Vec::with_capacity(ApiV2::get_encode_len(key.len() + 1)); - apiv2_key.push(RAW_KEY_PREFIX); - apiv2_key.extend(key); + let apiv2_key = ApiV2::add_prefix(key, &DEFAULT_KEY_SPACE_ID); Ok(Self::encode_raw_key_owned(apiv2_key, ts)) } ApiVersion::V2 => Ok(Key::from_encoded_slice(key)), @@ -195,18 +195,18 @@ impl KvFormat for ApiV2 { src_api: ApiVersion, mut start_key: Vec, mut end_key: Vec, - ) -> (Vec, Vec) { + ) -> Result<(Vec, Vec)> { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => { - start_key.insert(0, RAW_KEY_PREFIX); + start_key = ApiV2::add_prefix(&start_key, &DEFAULT_KEY_SPACE_ID); if end_key.is_empty() { - end_key.insert(0, RAW_KEY_PREFIX_END); + end_key = ApiV2::add_prefix(&end_key, &DEFAULT_KEY_SPACE_ID_END); } else { - end_key.insert(0, RAW_KEY_PREFIX); + end_key = ApiV2::add_prefix(&end_key, &DEFAULT_KEY_SPACE_ID); } - (start_key, end_key) + Ok((start_key, end_key)) } - ApiVersion::V2 => (start_key, end_key), + ApiVersion::V2 => Ok((start_key, end_key)), } } } @@ -235,6 +235,15 @@ impl ApiV2 { Ok(Key::split_on_ts_for(key)?) } + pub fn add_prefix(key: &[u8], key_space: &[u8]) -> Vec { + let mut apiv2_key = + Vec::with_capacity(ApiV2::get_encode_len(key.len() + key_space.len() + 1)); + apiv2_key.push(RAW_KEY_PREFIX); + apiv2_key.extend(key_space); // Reserved 3 bytes for key space id. 
+ apiv2_key.extend(key); + apiv2_key + } + pub const ENCODED_LOGICAL_DELETE: [u8; 1] = [ValueMeta::DELETE_FLAG.bits]; } diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index b57b1dfae45..0dbdc833b86 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -80,7 +80,7 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { src_api: ApiVersion, start_key: Vec, end_key: Vec, - ) -> (Vec, Vec); + ) -> Result<(Vec, Vec)>; /// Convert the encoded value from src_api version to Self::TAG version fn convert_raw_encoded_value_version_from( @@ -633,8 +633,8 @@ mod tests { .clone() .into_iter() .map(|key| { - let mut v2_key = key; - v2_key.insert(0, RAW_KEY_PREFIX); + let mut v2_key = vec![RAW_KEY_PREFIX, 0, 0, 0]; + v2_key.extend(key); ApiV2::encode_raw_key_owned(v2_key, Some(TimeStamp::from(timestamp))).into_encoded() }) .collect(); @@ -642,8 +642,6 @@ mod tests { let test_cases = vec![ (ApiVersion::V1, ApiVersion::V2, &apiv1_keys, &apiv2_keys), (ApiVersion::V1ttl, ApiVersion::V2, &apiv1_keys, &apiv2_keys), - (ApiVersion::V2, ApiVersion::V1, &apiv2_keys, &apiv1_keys), - (ApiVersion::V2, ApiVersion::V1ttl, &apiv2_keys, &apiv1_keys), ]; for i in 0..apiv1_keys.len() { for (src_api_ver, dst_api_ver, src_data, dst_data) in test_cases.clone() { @@ -731,14 +729,14 @@ mod tests { .clone() .into_iter() .map(|(start_key, end_key)| { - let mut v2_start_key = start_key; - let mut v2_end_key = end_key; - v2_start_key.insert(0, RAW_KEY_PREFIX); - if v2_end_key.is_empty() { - v2_end_key.insert(0, RAW_KEY_PREFIX_END); + let mut v2_start_key = vec![RAW_KEY_PREFIX, 0, 0, 0]; // key space takes 3 bytes. + let mut v2_end_key = if end_key.is_empty() { + vec![RAW_KEY_PREFIX, 0, 0, 1] } else { - v2_end_key.insert(0, RAW_KEY_PREFIX); - } + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space takes 3 bytes. 
+ }; + v2_start_key.extend(start_key); + v2_end_key.extend(end_key); (v2_start_key, v2_end_key) }) .collect(); @@ -756,18 +754,6 @@ mod tests { &apiv1_key_ranges, &apiv2_key_ranges, ), - ( - ApiVersion::V2, - ApiVersion::V1, - &apiv2_key_ranges, - &apiv1_key_ranges, - ), - ( - ApiVersion::V2, - ApiVersion::V1ttl, - &apiv2_key_ranges, - &apiv1_key_ranges, - ), ]; for (src_api_ver, dst_api_ver, src_data, dst_data) in test_cases { for i in 0..apiv1_key_ranges.len() { @@ -775,7 +761,7 @@ mod tests { let (src_start, src_end) = src_data[i].clone(); API::convert_raw_user_key_range_version_from(src_api_ver, src_start, src_end) }); - assert_eq!(dst_key_range, dst_data[i]); + assert_eq!(dst_key_range.unwrap(), dst_data[i]); } } } diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 3a737ba52d2..37e6855302a 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -206,20 +206,30 @@ async fn save_backup_file_worker( let files = if msg.files.need_flush_keys() { match msg.files.save(&storage).await { Ok(mut split_files) => { + let mut has_err = false; for file in split_files.iter_mut() { // In the case that backup from v1 and restore to v2, // the file range need be encoded as v2 format. // And range in response keep in v1 format. 
- let (start, end) = codec.convert_key_range_to_dst_version( + let ret = codec.convert_key_range_to_dst_version( msg.start_key.clone(), msg.end_key.clone(), ); + if ret.is_err() { + has_err = true; + break; + } + let (start, end) = ret.unwrap(); file.set_start_key(start); file.set_end_key(end); file.set_start_version(msg.start_version.into_inner()); file.set_end_version(msg.end_version.into_inner()); } - Ok(split_files) + if has_err { + Err(box_err!("backup convert key range failed")) + } else { + Ok(split_files) + } } Err(e) => { error_unknown!(?e; "backup save file failed"); @@ -1524,7 +1534,10 @@ pub mod tests { format!("k{:0>10}", idx) }; if api_ver == ApiVersion::V2 { - key.insert(0, RAW_KEY_PREFIX as char); + // [0, 0, 0] is the default key space id. + let mut apiv2_key = [RAW_KEY_PREFIX, 0, 0, 0].to_vec(); + apiv2_key.extend(key.as_bytes()); + key = String::from_utf8(apiv2_key).unwrap(); } key } @@ -1561,7 +1574,10 @@ pub mod tests { ) -> Key { if (cur_ver == ApiVersion::V1 || cur_ver == ApiVersion::V1ttl) && dst_ver == ApiVersion::V2 { - raw_key.insert(0, RAW_KEY_PREFIX as char); + // [0, 0, 0] is the default key space id. + let mut apiv2_key = [RAW_KEY_PREFIX, 0, 0, 0].to_vec(); + apiv2_key.extend(raw_key.as_bytes()); + raw_key = String::from_utf8(apiv2_key).unwrap(); } Key::from_encoded(raw_key.into_bytes()) } @@ -1610,22 +1626,22 @@ pub mod tests { stats.reset(); let mut req = BackupRequest::default(); let backup_start = if cur_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX] + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space id takes 3 bytes. } else { vec![] }; let backup_end = if cur_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX + 1] + vec![RAW_KEY_PREFIX, 0, 0, 1] // [0, 0, 1] is the end of the file } else { vec![] }; let file_start = if dst_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX] + vec![RAW_KEY_PREFIX, 0, 0, 0] // key space id takes 3 bytes. 
} else { vec![] }; let file_end = if dst_api_ver == ApiVersion::V2 { - vec![RAW_KEY_PREFIX + 1] + vec![RAW_KEY_PREFIX, 0, 0, 1] // [0, 0, 1] is the end of the file } else { vec![] }; diff --git a/components/backup/src/utils.rs b/components/backup/src/utils.rs index 4d01631817c..1ced24f4abc 100644 --- a/components/backup/src/utils.rs +++ b/components/backup/src/utils.rs @@ -240,12 +240,14 @@ impl KeyValueCodec { &self, start_key: Vec, end_key: Vec, - ) -> (Vec, Vec) { + ) -> Result<(Vec, Vec)> { if !self.is_raw_kv { - return (start_key, end_key); + return Ok((start_key, end_key)); } dispatch_api_version!(self.dst_api_ver, { - API::convert_raw_user_key_range_version_from(self.cur_api_ver, start_key, end_key) + let (start, end) = + API::convert_raw_user_key_range_version_from(self.cur_api_ver, start_key, end_key)?; + Ok((start, end)) }) } } @@ -500,14 +502,14 @@ pub mod tests { ( ApiVersion::V1, ApiVersion::V2, - b"abc".to_vec(), - ApiV2::encode_raw_key_owned(b"rabc".to_vec(), ts), + [61, 62, 63].to_vec(), + ApiV2::encode_raw_key_owned([114, 0, 0, 0, 61, 62, 63].to_vec(), ts), ), ( ApiVersion::V1ttl, ApiVersion::V2, b"".to_vec(), - ApiV2::encode_raw_key_owned(b"r".to_vec(), ts), + ApiV2::encode_raw_key_owned([114, 0, 0, 0].to_vec(), ts), ), ]; diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 1752c529cb0..6d171bcae28 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -354,7 +354,9 @@ fn test_backup_rawkv_cross_version_impl(cur_api_ver: ApiVersion, dst_api_ver: Ap let key = { let mut key = k.into_bytes(); if cur_api_ver != ApiVersion::V2 && dst_api_ver == ApiVersion::V2 { - key.insert(0, b'r') + let mut apiv2_key = [b'r', 0, 0, 0].to_vec(); + apiv2_key.extend(key); + key = apiv2_key; } key }; @@ -364,9 +366,17 @@ fn test_backup_rawkv_cross_version_impl(cur_api_ver: ApiVersion, dst_api_ver: Ap // Backup file should have same contents. // Set non-empty range to check if it's incorrectly encoded. 
+ let (backup_start, backup_end) = if cur_api_ver != dst_api_ver { + ( + vec![b'r', 0, 0, 0, b'r', b'a'], + vec![b'r', 0, 0, 0, b'r', b'z'], + ) + } else { + (vec![b'r', b'a'], vec![b'r', b'z']) + }; let rx = target_suite.backup_raw( - vec![b'r', b'a'], // start - vec![b'r', b'z'], // end + backup_start, // start + backup_end, // end cf, &make_unique_dir(tmp.path()), dst_api_ver, From dee0e1eaac70f9f003755bbef443a5d36b59d5ff Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:16:30 +0800 Subject: [PATCH 015/676] log-backup: fix bug about restoring point at TiCloud with KMS (#12757) close tikv/tikv#12750, close tikv/tikv#12751 Signed-off-by: joccau --- components/external_storage/src/lib.rs | 11 -- components/sst_importer/src/sst_importer.rs | 152 +++++++++++++++++++- 2 files changed, 146 insertions(+), 17 deletions(-) diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 477b0a39a64..0bad03cbcca 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -9,7 +9,6 @@ extern crate slog_global; extern crate tikv_alloc; use std::{ - fs, io::{self, Write}, marker::Unpin, sync::Arc, @@ -87,16 +86,6 @@ pub trait ExternalStorage: 'static + Send + Sync { file_crypter: Option, ) -> io::Result<()> { let reader = self.read(storage_name); - if let Some(p) = restore_name.parent() { - // try create all parent dirs from the path (optional). - fs::create_dir_all(p).or_else(|e| { - if e.kind() == io::ErrorKind::AlreadyExists { - Ok(()) - } else { - Err(e) - } - })?; - } let output: &mut dyn Write = &mut File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. 
// if reading speed is slower than this rate, we will stop with diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index dc92c405480..d1ef399d6d0 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,7 +4,7 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{prelude::*, BufReader}, + io::{self, prelude::*, BufReader}, ops::Bound, path::{Path, PathBuf}, sync::Arc, @@ -230,16 +230,26 @@ impl SstImporter { dst_file: std::path::PathBuf, backend: &StorageBackend, expect_sha256: Option>, + support_kms: bool, file_crypter: Option, speed_limiter: &Limiter, ) -> Result<()> { let start_read = Instant::now(); + if let Some(p) = dst_file.parent() { + file_system::create_dir_all(p).or_else(|e| { + if e.kind() == io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(e) + } + })?; + } // prepare to download the file from the external_storage // TODO: pass a config to support hdfs let ext_storage = external_storage_export::create_storage(backend, Default::default())?; let url = ext_storage.url()?.to_string(); - let ext_storage: Box = + let ext_storage: Box = if support_kms { if let Some(key_manager) = &self.key_manager { Box::new(external_storage_export::EncryptedExternalStorage { key_manager: (*key_manager).clone(), @@ -247,7 +257,10 @@ impl SstImporter { }) as _ } else { ext_storage as _ - }; + } + } else { + ext_storage as _ + }; let result = ext_storage.restore( src_file_name, @@ -313,6 +326,10 @@ impl SstImporter { path.temp.clone(), backend, expected_sha256, + // kv-files needn't are decrypted with KMS when download currently because these files are not encrypted when log-backup. + // It is different from sst-files because sst-files is encrypted when saved with rocksdb env with KMS. + // to do: support KMS when log-backup and restore point. + false, // don't support encrypt for now. 
None, speed_limiter, @@ -321,7 +338,13 @@ impl SstImporter { if let Some(p) = path.save.parent() { // we have v1 prefix in file name. - file_system::create_dir_all(p)?; + file_system::create_dir_all(p).or_else(|e| { + if e.kind() == io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(e) + } + })?; } file_system::rename(path.temp, path.save.clone())?; @@ -474,6 +497,7 @@ impl SstImporter { path.temp.clone(), backend, None, + true, file_crypter, speed_limiter, )?; @@ -761,7 +785,7 @@ fn is_after_end_bound>(value: &[u8], bound: &Bound) -> bool { #[cfg(test)] mod tests { - use std::io; + use std::io::{self, BufWriter}; use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, @@ -772,7 +796,7 @@ mod tests { use tempfile::Builder; use test_sst_importer::*; use test_util::new_test_key_manager; - use tikv_util::stream::block_on_external_io; + use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io}; use txn_types::{Value, WriteType}; use uuid::Uuid; @@ -926,6 +950,15 @@ mod tests { } } + fn check_file_is_same(path_a: &Path, path_b: &Path) -> bool { + assert!(path_a.exists()); + assert!(path_b.exists()); + + let content_a = file_system::read(path_a).unwrap(); + let content_b = file_system::read(path_b).unwrap(); + content_a == content_b + } + fn new_key_manager_for_test() -> (tempfile::TempDir, Arc) { // test with tde let tmp_dir = tempfile::TempDir::new().unwrap(); @@ -981,6 +1014,41 @@ mod tests { }) } + fn create_sample_external_kv_file() -> Result<(tempfile::TempDir, StorageBackend, KvMeta)> { + let ext_dir = tempfile::tempdir()?; + let file_name = "v1/t000001/abc.log"; + let file_path = ext_dir.path().join(file_name); + std::fs::create_dir_all(file_path.parent().unwrap())?; + let file = File::create(file_path).unwrap(); + let mut buff = BufWriter::new(file); + + let kvs = vec![ + (b"t1_r01".to_vec(), b"tidb".to_vec()), + (b"t1_r02".to_vec(), b"tikv".to_vec()), + (b"t1_r03".to_vec(), 
b"pingcap".to_vec()), + ]; + + let mut sha256 = Hasher::new(MessageDigest::sha256()).unwrap(); + let mut len = 0; + for kv in kvs { + let encoded = EventEncoder::encode_event(&kv.0, &kv.1); + for slice in encoded { + len += buff.write(slice.as_ref()).unwrap(); + sha256.update(slice.as_ref()).unwrap(); + } + } + + let mut kv_meta = KvMeta::default(); + kv_meta.set_name(file_name.to_string()); + kv_meta.set_cf(String::from("default")); + kv_meta.set_is_delete(false); + kv_meta.set_length(len as _); + kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); + + let backend = external_storage_export::make_local_backend(ext_dir.path()); + Ok((ext_dir, backend, kv_meta)) + } + fn create_sample_external_rawkv_sst_file( start_key: &[u8], end_key: &[u8], @@ -1156,6 +1224,78 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::TimedOut); } + #[test] + fn test_download_file_from_external_storage_for_sst() { + // creates a sample SST file. + let (_ext_sst_dir, backend, meta) = create_sample_external_sst_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager.clone()), + ApiVersion::V1, + ) + .unwrap(); + + // perform download file into .temp dir. 
+ let file_name = "sample.sst"; + let path = importer.dir.get_import_path(file_name).unwrap(); + importer + .download_file_from_external_storage( + meta.get_length(), + file_name, + path.temp.clone(), + &backend, + None, + true, + None, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + check_file_exists(&path.temp, Some(&key_manager)); + assert!(!check_file_is_same( + &_ext_sst_dir.path().join(file_name), + &path.temp, + )); + } + + #[test] + fn test_download_file_from_external_storage_for_kv() { + let (_temp_dir, backend, kv_meta) = create_sample_external_kv_file().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + + let path = importer.dir.get_import_path(kv_meta.get_name()).unwrap(); + importer + .download_file_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + path.temp.clone(), + &backend, + Some(kv_meta.get_sha256().to_vec()), + false, + None, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + + assert!(check_file_is_same( + &_temp_dir.path().join(kv_meta.get_name()), + &path.temp, + )); + } + #[test] fn test_download_sst_no_key_rewrite() { // creates a sample SST file. 
From ae46f9b35d77409ad3ef946e842bce17d44571fe Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 7 Jun 2022 11:06:30 -0700 Subject: [PATCH 016/676] *: update jemalloc to 5.3.0 (#12661) close tikv/tikv#12660 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 18 +++++++++--------- components/tikv_alloc/Cargo.toml | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 080a1ccc35f..75458e3d917 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2745,7 +2745,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = "git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2764,7 +2764,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = "git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" dependencies = [ "bzip2-sys", "cc", @@ -4573,7 +4573,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#de8310c3983a30236ea03f802ed0c2401a4908ae" +source = "git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" dependencies = [ "libc 0.2.125", "librocksdb_sys", @@ -6144,9 +6144,9 @@ dependencies = [ [[package]] name = "tikv-jemalloc-ctl" -version = "0.4.2" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb833c46ecbf8b6daeccb347cefcabf9c1beb5c9b0f853e1cec45632d9963e69" +checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ "libc 0.2.125", "paste", @@ -6155,9 +6155,9 @@ dependencies = [ [[package]] name = "tikv-jemalloc-sys" -version = "0.4.3+5.2.1-patched.2" +version = 
"0.5.0+5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1792ccb507d955b46af42c123ea8863668fae24d03721e40cad6a41773dbb49" +checksum = "aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", @@ -6166,9 +6166,9 @@ dependencies = [ [[package]] name = "tikv-jemallocator" -version = "0.4.3" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5b7bcecfafe4998587d636f9ae9d55eb9d0499877b88757767c346875067098" +checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" dependencies = [ "libc 0.2.125", "tikv-jemalloc-sys", diff --git a/components/tikv_alloc/Cargo.toml b/components/tikv_alloc/Cargo.toml index 2ebbd4da1bc..086744cab8f 100644 --- a/components/tikv_alloc/Cargo.toml +++ b/components/tikv_alloc/Cargo.toml @@ -35,15 +35,15 @@ optional = true features = ["bundled"] [dependencies.tikv-jemalloc-ctl] -version = "0.4.0" +version = "0.5.0" optional = true [dependencies.tikv-jemalloc-sys] -version = "0.4.0" +version = "0.5.0" optional = true features = ["stats"] [dependencies.tikv-jemallocator] -version = "0.4.0" +version = "0.5.0" optional = true features = ["unprefixed_malloc_on_supported_platforms", "stats"] From ffdff6b87606c0b1087d7507c64de9dac48a3d36 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 8 Jun 2022 16:46:31 +0800 Subject: [PATCH 017/676] store: add test for verifying bucket version change in try_batch (#12777) close tikv/tikv#12578 As the issue shows, try_batch may use out of date bucket meta for further operation which has not been detected by any test. This PR adds a test for it. 
Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/apply.rs | 94 +++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a7c534ff823..b74a49c4273 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -5394,6 +5394,100 @@ mod tests { } } + #[test] + fn test_bucket_version_change_in_try_batch() { + let (_path, engine) = create_tmp_engine("test-bucket"); + let (_, importer) = create_tmp_importer("test-bucket"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs)); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = { + let mut cfg = Config::default(); + cfg.apply_batch_system.pool_size = 1; + cfg.apply_batch_system.low_priority_pool_size = 0; + Arc::new(VersionTrack::new(cfg)) + }; + let (router, mut system) = create_apply_batch_system(&cfg.value()); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-bucket".to_owned(), builder); + + let mut reg = Registration { + id: 1, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(1, 1)); + reg.region.set_start_key(b"k1".to_vec()); + reg.region.set_end_key(b"k2".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let entry1 = { + let mut entry = EntryBuilder::new(1, 1); + entry = entry.put(b"key1", b"value1"); + 
entry.epoch(1, 3).build() + }; + + let entry2 = { + let mut entry = EntryBuilder::new(2, 1); + entry = entry.put(b"key2", b"value2"); + entry.epoch(1, 3).build() + }; + + let (capture_tx, _capture_rx) = mpsc::channel(); + let mut apply1 = apply(1, 1, 1, vec![entry1], vec![cb(1, 1, capture_tx.clone())]); + let bucket_meta = BucketMeta { + region_id: 1, + region_epoch: RegionEpoch::default(), + version: 1, + keys: vec![b"".to_vec(), b"".to_vec()], + sizes: vec![0, 0], + }; + apply1.bucket_meta = Some(Arc::new(bucket_meta)); + + let mut apply2 = apply(1, 1, 1, vec![entry2], vec![cb(2, 1, capture_tx)]); + let mut bucket_meta2 = BucketMeta { + region_id: 1, + region_epoch: RegionEpoch::default(), + version: 2, + keys: vec![b"".to_vec(), b"".to_vec()], + sizes: vec![0, 0], + }; + bucket_meta2.version = 2; + apply2.bucket_meta = Some(Arc::new(bucket_meta2)); + + router.schedule_task(1, Msg::apply(apply1)); + router.schedule_task(1, Msg::apply(apply2)); + + let res = fetch_apply_res(&rx); + let bucket_version = res.bucket_stat.unwrap().as_ref().meta.version; + + assert_eq!(bucket_version, 2); + + validate(&router, 1, |delegate| { + let bucket_version = delegate.buckets.as_ref().unwrap().meta.version; + assert_eq!(bucket_version, 2); + }); + } + #[test] fn test_cmd_observer() { let (_path, engine) = create_tmp_engine("test-delegate"); From fd7b4ad2e6662ea6b199eb2355b11fbe9c201204 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 9 Jun 2022 14:18:30 +0800 Subject: [PATCH 018/676] config: output warn log when some components config invalid (#12767) close tikv/tikv#12771 Signed-off-by: 3pointer Co-authored-by: zhangjinpeng1987 Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 10 ++-- components/sst_importer/src/config.rs | 15 ++++-- src/config.rs | 70 +++++++++++++++++++-------- 3 files changed, 68 insertions(+), 27 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 3adaa8aca65..9b1b663b207 100644 --- 
a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -442,12 +442,12 @@ impl, E: KvEngine> Endpoint { fn on_change_cfg(&mut self, change: ConfigChange) { // Validate first. let mut validate_cfg = self.config.clone(); - validate_cfg.update(change.clone()); + validate_cfg.update(change); if let Err(e) = validate_cfg.validate() { warn!("cdc config update failed"; "error" => ?e); return; } - + let change = self.config.diff(&validate_cfg); info!( "cdc config updated"; "current config" => ?self.config, @@ -1542,13 +1542,13 @@ mod tests { let mut updated_cfg = cfg.clone(); { // Update it to be smaller than incremental_scan_threads, - // which will be an invalid change and will be lost. + // which will be an invalid change and will modified to incremental_scan_threads. updated_cfg.incremental_scan_concurrency = 2; } let diff = cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.incremental_scan_concurrency, 6); - assert_eq!(ep.scan_concurrency_semaphore.available_permits(), 6); + assert_eq!(ep.config.incremental_scan_concurrency, 4); + assert_eq!(ep.scan_concurrency_semaphore.available_permits(), 4); { // Correct update. 
diff --git a/components/sst_importer/src/config.rs b/components/sst_importer/src/config.rs index a25d34ea24b..ef74a40fd01 100644 --- a/components/sst_importer/src/config.rs +++ b/components/sst_importer/src/config.rs @@ -27,12 +27,21 @@ impl Default for Config { } impl Config { - pub fn validate(&self) -> Result<(), Box> { + pub fn validate(&mut self) -> Result<(), Box> { + let default_cfg = Config::default(); if self.num_threads == 0 { - return Err("import.num_threads can not be 0".into()); + warn!( + "import.num_threads can not be 0, change it to {}", + default_cfg.num_threads + ); + self.num_threads = default_cfg.num_threads; } if self.stream_channel_window == 0 { - return Err("import.stream_channel_window can not be 0".into()); + warn!( + "import.stream_channel_window can not be 0, change it to {}", + default_cfg.stream_channel_window + ); + self.stream_channel_window = default_cfg.stream_channel_window; } Ok(()) } diff --git a/src/config.rs b/src/config.rs index 627901481d1..37278fd09e2 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2316,16 +2316,29 @@ pub struct BackupConfig { } impl BackupConfig { - pub fn validate(&self) -> Result<(), Box> { + pub fn validate(&mut self) -> Result<(), Box> { let limit = SysQuota::cpu_cores_quota() as usize; + let default_cfg = BackupConfig::default(); if self.num_threads == 0 || self.num_threads > limit { - return Err(format!("backup.num_threads cannot be 0 or larger than {}", limit).into()); + warn!( + "backup.num_threads cannot be 0 or larger than {}, change it to {}", + limit, default_cfg.num_threads + ); + self.num_threads = default_cfg.num_threads; } if self.batch_size == 0 { - return Err("backup.batch_size cannot be 0".into()); + warn!( + "backup.batch_size cannot be 0, change it to {}", + default_cfg.batch_size + ); + self.batch_size = default_cfg.batch_size; } if self.s3_multi_part_size.0 > ReadableSize::gb(5).0 { - return Err("backup.s3_multi_part_size cannot larger than 5GB".into()); + warn!( + 
"backup.s3_multi_part_size cannot larger than 5GB, change it to {:?}", + default_cfg.s3_multi_part_size + ); + self.s3_multi_part_size = default_cfg.s3_multi_part_size; } Ok(()) @@ -2373,9 +2386,15 @@ pub struct BackupStreamConfig { } impl BackupStreamConfig { - pub fn validate(&self) -> Result<(), Box> { - if self.num_threads == 0 { - return Err("backup.num_threads cannot be 0".into()); + pub fn validate(&mut self) -> Result<(), Box> { + let limit = SysQuota::cpu_cores_quota() as usize; + let default_cfg = BackupStreamConfig::default(); + if self.num_threads == 0 || self.num_threads > limit { + warn!( + "log_backup.num_threads cannot be 0 or larger than {}, change it to {}", + limit, default_cfg.num_threads + ); + self.num_threads = default_cfg.num_threads; } Ok(()) } @@ -2460,25 +2479,38 @@ impl Default for CdcConfig { impl CdcConfig { pub fn validate(&mut self) -> Result<(), Box> { + let default_cfg = CdcConfig::default(); if self.min_ts_interval.is_zero() { - return Err("cdc.min-ts-interval can't be 0".into()); + warn!( + "cdc.min-ts-interval can't be 0, change it to {}", + default_cfg.min_ts_interval + ); + self.min_ts_interval = default_cfg.min_ts_interval; } if self.incremental_scan_threads == 0 { - return Err("cdc.incremental-scan-threads can't be 0".into()); + warn!( + "cdc.incremental-scan-threads can't be 0, change it to {}", + default_cfg.incremental_scan_threads + ); + self.incremental_scan_threads = default_cfg.incremental_scan_threads; } if self.incremental_scan_concurrency < self.incremental_scan_threads { - return Err( - "cdc.incremental-scan-concurrency must be larger than cdc.incremental-scan-threads" - .into(), + warn!( + "cdc.incremental-scan-concurrency must be larger than cdc.incremental-scan-threads, + change it to {}", + self.incremental_scan_threads ); + self.incremental_scan_concurrency = self.incremental_scan_threads } if self.incremental_scan_ts_filter_ratio < 0.0 || self.incremental_scan_ts_filter_ratio > 1.0 { - return Err( - 
"cdc.incremental-scan-ts-filter-ratio should be larger than 0 and less than 1" - .into(), + warn!( + "cdc.incremental-scan-ts-filter-ratio should be larger than 0 and less than 1, + change it to {}", + default_cfg.incremental_scan_ts_filter_ratio ); + self.incremental_scan_ts_filter_ratio = default_cfg.incremental_scan_ts_filter_ratio; } Ok(()) } @@ -5125,21 +5157,21 @@ mod tests { min-ts-interval = "0s" "#; let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); - cfg.validate().unwrap_err(); + cfg.validate().unwrap(); let content = r#" [cdc] incremental-scan-threads = 0 "#; let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); - cfg.validate().unwrap_err(); + cfg.validate().unwrap(); let content = r#" [cdc] incremental-scan-concurrency = 0 "#; let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); - cfg.validate().unwrap_err(); + cfg.validate().unwrap(); let content = r#" [cdc] @@ -5147,7 +5179,7 @@ mod tests { incremental-scan-threads = 2 "#; let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); - cfg.validate().unwrap_err(); + cfg.validate().unwrap(); } #[test] From ab968ffb5e496ea1fce4ff40ee0e562247de98dd Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 10 Jun 2022 15:04:31 +0800 Subject: [PATCH 019/676] *: introduce general request tracker (#12460) ref tikv/tikv#12362 This commit is a large refactoring that moves collecting engine PerfContext from storage and coprocessor to engine_rocks and the tracker. Now, the storage and coprocessor are mostly decoupled with a specific engine (engine_rocks). And it introduces a general trakcer mechanism to collect the metrics of a request during its whole lifetime. It will help us collect more performance critical data of a single request more easily. 
Signed-off-by: Yilin Chen --- Cargo.lock | 20 ++ Cargo.toml | 2 + components/engine_panic/Cargo.toml | 1 + components/engine_panic/src/perf_context.rs | 3 +- components/engine_rocks/Cargo.toml | 1 + components/engine_rocks/src/perf_context.rs | 5 +- .../engine_rocks/src/perf_context_impl.rs | 228 +++++++++++++-- .../engine_rocks/src/perf_context_metrics.rs | 12 + components/engine_traits/Cargo.toml | 1 + components/engine_traits/src/perf_context.rs | 10 +- .../raftstore/src/store/async_io/write.rs | 2 +- components/raftstore/src/store/fsm/apply.rs | 2 +- components/raftstore/src/store/peer.rs | 2 +- components/test_storage/Cargo.toml | 1 + components/test_storage/src/sync_storage.rs | 4 +- components/tikv_kv/Cargo.toml | 1 + components/tikv_kv/src/lib.rs | 14 +- components/tikv_util/Cargo.toml | 1 + .../tikv_util/src/yatp_pool/future_pool.rs | 5 +- components/tracker/Cargo.toml | 14 + components/tracker/src/lib.rs | 86 ++++++ components/tracker/src/metrics.rs | 12 + components/tracker/src/slab.rs | 269 ++++++++++++++++++ components/tracker/src/tls.rs | 69 +++++ scripts/check-bins.py | 1 + src/coprocessor/endpoint.rs | 33 ++- src/coprocessor/interceptors/mod.rs | 4 +- src/coprocessor/interceptors/tracker.rs | 18 +- src/coprocessor/metrics.rs | 136 --------- src/coprocessor/tracker.rs | 138 +++++---- src/read_pool.rs | 5 +- src/server/service/batch.rs | 34 ++- src/server/service/kv.rs | 37 ++- src/storage/metrics.rs | 138 --------- src/storage/mod.rs | 134 ++++++--- 35 files changed, 996 insertions(+), 447 deletions(-) create mode 100644 components/tracker/Cargo.toml create mode 100644 components/tracker/src/lib.rs create mode 100644 components/tracker/src/metrics.rs create mode 100644 components/tracker/src/slab.rs create mode 100644 components/tracker/src/tls.rs diff --git a/Cargo.lock b/Cargo.lock index 75458e3d917..6691467f359 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1444,6 +1444,7 @@ dependencies = [ "raft", "tikv_alloc", "tikv_util", + "tracker", 
"txn_types", ] @@ -1482,6 +1483,7 @@ dependencies = [ "tikv_util", "time", "toml", + "tracker", "txn_types", ] @@ -1541,6 +1543,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "toml", + "tracker", "txn_types", ] @@ -5666,6 +5669,7 @@ dependencies = [ "test_raftstore", "tikv", "tikv_util", + "tracker", "txn_types", ] @@ -6081,6 +6085,7 @@ dependencies = [ "tokio-openssl", "tokio-timer", "toml", + "tracker", "txn_types", "url", "uuid", @@ -6229,6 +6234,7 @@ dependencies = [ "tempfile", "thiserror", "tikv_util", + "tracker", "txn_types", ] @@ -6290,6 +6296,7 @@ dependencies = [ "tokio-executor", "tokio-timer", "toml", + "tracker", "url", "utime", "yatp", @@ -6598,6 +6605,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "tracker" +version = "0.0.1" +dependencies = [ + "collections", + "kvproto", + "lazy_static", + "parking_lot 0.12.0", + "pin-project", + "prometheus", + "slab", +] + [[package]] name = "try-lock" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index 477716d8893..a1c1f315de3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -164,6 +164,7 @@ tokio = { version = "1.17", features = ["full"] } tokio-openssl = "0.6" tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } toml = "0.5" +tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types", default-features = false } url = "2" uuid = { version = "0.8.1", features = ["serde", "v4"] } @@ -268,6 +269,7 @@ members = [ "components/tikv_alloc", "components/tikv_util", "components/tipb_helper", + "components/tracker", "components/txn_types", "fuzz", "fuzz/fuzzer-afl", diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index 36f9b92ec24..b00180c98d2 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -12,4 +12,5 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code tikv_alloc = { path = "../tikv_alloc" } # FIXME: Remove this dep from the 
engine_traits interface tikv_util = { path = "../tikv_util", default-features = false } +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } diff --git a/components/engine_panic/src/perf_context.rs b/components/engine_panic/src/perf_context.rs index 654ac01a629..46d18c00e77 100644 --- a/components/engine_panic/src/perf_context.rs +++ b/components/engine_panic/src/perf_context.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{PerfContext, PerfContextExt, PerfContextKind, PerfLevel}; +use tracker::TrackerToken; use crate::engine::PanicEngine; @@ -19,7 +20,7 @@ impl PerfContext for PanicPerfContext { panic!() } - fn report_metrics(&mut self) { + fn report_metrics(&mut self, _: &[TrackerToken]) { panic!() } } diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 7d1a90d7afe..e35438c4fe1 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -51,6 +51,7 @@ tempfile = "3.0" tikv_alloc = { path = "../tikv_alloc" } tikv_util = { path = "../tikv_util", default-features = false } time = "0.1" +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } [dependencies.rocksdb] diff --git a/components/engine_rocks/src/perf_context.rs b/components/engine_rocks/src/perf_context.rs index 83ff4bca6bd..a731a9461dc 100644 --- a/components/engine_rocks/src/perf_context.rs +++ b/components/engine_rocks/src/perf_context.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{PerfContext, PerfContextExt, PerfContextKind, PerfLevel}; +use tracker::TrackerToken; use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; @@ -30,7 +31,7 @@ impl PerfContext for RocksPerfContext { self.stats.start() } - fn report_metrics(&mut self) { - self.stats.report() + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + self.stats.report(trackers) } } diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index 617abe506d8..c1c299def66 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ b/components/engine_rocks/src/perf_context_impl.rs @@ -1,39 +1,39 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt::Debug, marker::PhantomData, ops::Sub}; +use std::{fmt::Debug, marker::PhantomData, mem, ops::Sub, time::Duration}; use derive_more::{Add, AddAssign, Sub, SubAssign}; use engine_traits::{PerfContextKind, PerfLevel}; use kvproto::kvrpcpb::ScanDetailV2; use lazy_static::lazy_static; use slog_derive::KV; +use tikv_util::time::Instant; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; use crate::{ - perf_context_metrics::{ - APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC, - }, - raw_util, set_perf_flags, set_perf_level, PerfContext as RawPerfContext, PerfFlag, PerfFlags, + perf_context_metrics::*, raw_util, set_perf_flags, set_perf_level, + PerfContext as RawPerfContext, PerfFlag, PerfFlags, }; macro_rules! 
report_write_perf_context { ($ctx: expr, $metric: ident) => { if $ctx.perf_level != PerfLevel::Disable { $ctx.write = WritePerfContext::capture(); - observe_perf_context_type!($ctx, $metric, write_wal_time); - observe_perf_context_type!($ctx, $metric, write_memtable_time); - observe_perf_context_type!($ctx, $metric, db_mutex_lock_nanos); - observe_perf_context_type!($ctx, $metric, pre_and_post_process); - observe_perf_context_type!($ctx, $metric, write_thread_wait); - observe_perf_context_type!($ctx, $metric, write_scheduling_flushes_compactions_time); - observe_perf_context_type!($ctx, $metric, db_condition_wait_nanos); - observe_perf_context_type!($ctx, $metric, write_delay_time); + observe_write_time!($ctx, $metric, write_wal_time); + observe_write_time!($ctx, $metric, write_memtable_time); + observe_write_time!($ctx, $metric, db_mutex_lock_nanos); + observe_write_time!($ctx, $metric, pre_and_post_process); + observe_write_time!($ctx, $metric, write_thread_wait); + observe_write_time!($ctx, $metric, write_scheduling_flushes_compactions_time); + observe_write_time!($ctx, $metric, db_condition_wait_nanos); + observe_write_time!($ctx, $metric, write_delay_time); } }; } -macro_rules! observe_perf_context_type { - ($s:expr, $metric: expr, $v:ident) => { - $metric.$v.observe(($s.write.$v) as f64 / 1e9); +macro_rules! 
observe_write_time { + ($ctx:expr, $metric: expr, $v:ident) => { + $metric.$v.observe(($ctx.write.$v) as f64 / 1e9); }; } @@ -143,6 +143,14 @@ impl ReadPerfContext { detail_v2.set_rocksdb_block_read_count(self.block_read_count); detail_v2.set_rocksdb_block_read_byte(self.block_read_byte); } + + fn report_to_tracker(&self, tracker: &mut Tracker) { + tracker.metrics.block_cache_hit_count += self.block_cache_hit_count; + tracker.metrics.block_read_byte += self.block_read_byte; + tracker.metrics.block_read_count += self.block_read_count; + tracker.metrics.deleted_key_skipped_count += self.internal_delete_skipped_count; + tracker.metrics.internal_key_skipped_count += self.internal_key_skipped_count; + } } #[derive(Default, Debug, Clone, Copy, Add, AddAssign, Sub, SubAssign, KV)] @@ -159,12 +167,15 @@ pub struct WritePerfContext { #[derive(Debug)] pub struct PerfContextStatistics { - pub perf_level: PerfLevel, - pub kind: PerfContextKind, - pub read: ReadPerfContext, - pub write: WritePerfContext, + perf_level: PerfLevel, + kind: PerfContextKind, + read: ReadPerfContext, + write: WritePerfContext, + last_flush_time: Instant, } +const FLUSH_METRICS_INTERVAL: Duration = Duration::from_secs(2); + impl PerfContextStatistics { /// Create an instance which stores instant statistics values, retrieved at creation. 
pub fn new(perf_level: PerfLevel, kind: PerfContextKind) -> Self { @@ -173,13 +184,16 @@ impl PerfContextStatistics { kind, read: Default::default(), write: Default::default(), + last_flush_time: Instant::now_coarse(), } } fn apply_perf_settings(&self) { if self.perf_level == PerfLevel::Uninitialized { match self.kind { - PerfContextKind::GenericRead => set_perf_flags(&*DEFAULT_READ_PERF_FLAGS), + PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { + set_perf_flags(&*DEFAULT_READ_PERF_FLAGS) + } PerfContextKind::RaftstoreStore | PerfContextKind::RaftstoreApply => { set_perf_flags(&*DEFAULT_WRITE_PERF_FLAGS) } @@ -198,7 +212,7 @@ impl PerfContextStatistics { self.apply_perf_settings(); } - pub fn report(&mut self) { + pub fn report(&mut self, trackers: &[TrackerToken]) { match self.kind { PerfContextKind::RaftstoreApply => { report_write_perf_context!(self, APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); @@ -206,15 +220,173 @@ impl PerfContextStatistics { PerfContextKind::RaftstoreStore => { report_write_perf_context!(self, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); } - PerfContextKind::GenericRead => { - // TODO: Currently, metrics about reading is reported in other ways. - // It is better to unify how to report the perf metrics. - // - // Here we only record the PerfContext data into the fields. 
- self.read = ReadPerfContext::capture(); + PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { + let perf_context = ReadPerfContext::capture(); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| perf_context.report_to_tracker(t)); + } + self.read += perf_context; + self.flush_read_metrics(); } } } + + fn flush_read_metrics(&mut self) { + if self.last_flush_time.saturating_elapsed() < FLUSH_METRICS_INTERVAL { + return; + } + self.last_flush_time = Instant::now_coarse(); + let ctx = mem::take(&mut self.read); + let (v, tag) = match self.kind { + PerfContextKind::Storage(tag) => (&*STORAGE_ROCKSDB_PERF_COUNTER, tag), + PerfContextKind::Coprocessor(tag) => (&*COPR_ROCKSDB_PERF_COUNTER, tag), + _ => unreachable!(), + }; + v.get_metric_with_label_values(&[tag, "user_key_comparison_count"]) + .unwrap() + .inc_by(ctx.user_key_comparison_count); + v.get_metric_with_label_values(&[tag, "block_cache_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_hit_count); + v.get_metric_with_label_values(&[tag, "block_read_count"]) + .unwrap() + .inc_by(ctx.block_read_count); + v.get_metric_with_label_values(&[tag, "block_read_byte"]) + .unwrap() + .inc_by(ctx.block_read_byte); + v.get_metric_with_label_values(&[tag, "block_read_time"]) + .unwrap() + .inc_by(ctx.block_read_time); + v.get_metric_with_label_values(&[tag, "block_cache_index_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_index_hit_count); + v.get_metric_with_label_values(&[tag, "index_block_read_count"]) + .unwrap() + .inc_by(ctx.index_block_read_count); + v.get_metric_with_label_values(&[tag, "block_cache_filter_hit_count"]) + .unwrap() + .inc_by(ctx.block_cache_filter_hit_count); + v.get_metric_with_label_values(&[tag, "filter_block_read_count"]) + .unwrap() + .inc_by(ctx.filter_block_read_count); + v.get_metric_with_label_values(&[tag, "block_checksum_time"]) + .unwrap() + .inc_by(ctx.block_checksum_time); + v.get_metric_with_label_values(&[tag, "block_decompress_time"]) + .unwrap() 
+ .inc_by(ctx.block_decompress_time); + v.get_metric_with_label_values(&[tag, "get_read_bytes"]) + .unwrap() + .inc_by(ctx.get_read_bytes); + v.get_metric_with_label_values(&[tag, "iter_read_bytes"]) + .unwrap() + .inc_by(ctx.iter_read_bytes); + v.get_metric_with_label_values(&[tag, "internal_key_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_key_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_delete_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_delete_skipped_count); + v.get_metric_with_label_values(&[tag, "internal_recent_skipped_count"]) + .unwrap() + .inc_by(ctx.internal_recent_skipped_count); + v.get_metric_with_label_values(&[tag, "get_snapshot_time"]) + .unwrap() + .inc_by(ctx.get_snapshot_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_time"]) + .unwrap() + .inc_by(ctx.get_from_memtable_time); + v.get_metric_with_label_values(&[tag, "get_from_memtable_count"]) + .unwrap() + .inc_by(ctx.get_from_memtable_count); + v.get_metric_with_label_values(&[tag, "get_post_process_time"]) + .unwrap() + .inc_by(ctx.get_post_process_time); + v.get_metric_with_label_values(&[tag, "get_from_output_files_time"]) + .unwrap() + .inc_by(ctx.get_from_output_files_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_time"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_time); + v.get_metric_with_label_values(&[tag, "seek_on_memtable_count"]) + .unwrap() + .inc_by(ctx.seek_on_memtable_count); + v.get_metric_with_label_values(&[tag, "next_on_memtable_count"]) + .unwrap() + .inc_by(ctx.next_on_memtable_count); + v.get_metric_with_label_values(&[tag, "prev_on_memtable_count"]) + .unwrap() + .inc_by(ctx.prev_on_memtable_count); + v.get_metric_with_label_values(&[tag, "seek_child_seek_time"]) + .unwrap() + .inc_by(ctx.seek_child_seek_time); + v.get_metric_with_label_values(&[tag, "seek_child_seek_count"]) + .unwrap() + .inc_by(ctx.seek_child_seek_count); + v.get_metric_with_label_values(&[tag, "seek_min_heap_time"]) + .unwrap() + 
.inc_by(ctx.seek_min_heap_time); + v.get_metric_with_label_values(&[tag, "seek_max_heap_time"]) + .unwrap() + .inc_by(ctx.seek_max_heap_time); + v.get_metric_with_label_values(&[tag, "seek_internal_seek_time"]) + .unwrap() + .inc_by(ctx.seek_internal_seek_time); + v.get_metric_with_label_values(&[tag, "db_mutex_lock_nanos"]) + .unwrap() + .inc_by(ctx.db_mutex_lock_nanos); + v.get_metric_with_label_values(&[tag, "db_condition_wait_nanos"]) + .unwrap() + .inc_by(ctx.db_condition_wait_nanos); + v.get_metric_with_label_values(&[tag, "read_index_block_nanos"]) + .unwrap() + .inc_by(ctx.read_index_block_nanos); + v.get_metric_with_label_values(&[tag, "read_filter_block_nanos"]) + .unwrap() + .inc_by(ctx.read_filter_block_nanos); + v.get_metric_with_label_values(&[tag, "new_table_block_iter_nanos"]) + .unwrap() + .inc_by(ctx.new_table_block_iter_nanos); + v.get_metric_with_label_values(&[tag, "new_table_iterator_nanos"]) + .unwrap() + .inc_by(ctx.new_table_iterator_nanos); + v.get_metric_with_label_values(&[tag, "block_seek_nanos"]) + .unwrap() + .inc_by(ctx.block_seek_nanos); + v.get_metric_with_label_values(&[tag, "find_table_nanos"]) + .unwrap() + .inc_by(ctx.find_table_nanos); + v.get_metric_with_label_values(&[tag, "bloom_memtable_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_memtable_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_memtable_miss_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_hit_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_hit_count); + v.get_metric_with_label_values(&[tag, "bloom_sst_miss_count"]) + .unwrap() + .inc_by(ctx.bloom_sst_miss_count); + v.get_metric_with_label_values(&[tag, "get_cpu_nanos"]) + .unwrap() + .inc_by(ctx.get_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_next_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_next_cpu_nanos); + v.get_metric_with_label_values(&[tag, "iter_prev_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_prev_cpu_nanos); + 
v.get_metric_with_label_values(&[tag, "iter_seek_cpu_nanos"]) + .unwrap() + .inc_by(ctx.iter_seek_cpu_nanos); + v.get_metric_with_label_values(&[tag, "encrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.encrypt_data_nanos); + v.get_metric_with_label_values(&[tag, "decrypt_data_nanos"]) + .unwrap() + .inc_by(ctx.decrypt_data_nanos); + } } pub trait PerfContextFields: Debug + Clone + Copy + Sub + slog::KV { diff --git a/components/engine_rocks/src/perf_context_metrics.rs b/components/engine_rocks/src/perf_context_metrics.rs index 5d58066500f..cca9f551bc1 100644 --- a/components/engine_rocks/src/perf_context_metrics.rs +++ b/components/engine_rocks/src/perf_context_metrics.rs @@ -36,6 +36,18 @@ lazy_static! { exponential_buckets(0.0005, 2.0, 20).unwrap() ) .unwrap(); + pub static ref STORAGE_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_storage_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] + ) + .unwrap(); + pub static ref COPR_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( + "tikv_coprocessor_rocksdb_perf", + "Total number of RocksDB internal operations from PerfContext", + &["req", "metric"] + ) + .unwrap(); pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration = auto_flush_from!(APPLY_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration = diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index 3b8c3efa33b..fb4bb69e5bc 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -22,6 +22,7 @@ slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global thiserror = "1.0" tikv_alloc = { path = "../tikv_alloc" } tikv_util = { path = "../tikv_util", default-features = false } +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } 
[dev-dependencies] diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index f213925ddbd..c46ec4a95c8 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -1,5 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use tikv_util::numeric_enum_serializing_mod; +use tracker::TrackerToken; #[derive(Copy, Clone, Debug, PartialEq)] pub enum PerfLevel { @@ -47,7 +48,10 @@ pub trait PerfContextExt { pub enum PerfContextKind { RaftstoreApply, RaftstoreStore, - GenericRead, + /// Commands in tikv::storage, the inner str is the command tag. + Storage(&'static str), + /// Coprocessor requests in tikv::coprocessor, the inner str is the request type. + Coprocessor(&'static str), } /// Reports metrics to prometheus @@ -58,6 +62,6 @@ pub trait PerfContext: Send { /// Reinitializes statistics and the perf level fn start_observe(&mut self); - /// Reports the current collected metrics to prometheus - fn report_metrics(&mut self); + /// Reports the current collected metrics to prometheus and trackers + fn report_metrics(&mut self, trackers: &[TrackerToken]); } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index c9490738da4..373b64134d3 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -533,7 +533,7 @@ where self.store_id, self.tag, e ); }); - self.perf_context.report_metrics(); + self.perf_context.report_metrics(&[]); // TODO: pass in request trackers write_raft_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_RAFTDB_DURATION_HISTOGRAM.observe(write_raft_time); } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index b74a49c4273..ca6cabb7a95 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ 
-525,7 +525,7 @@ where self.kv_wb().write_opt(&write_opts).unwrap_or_else(|e| { panic!("failed to write to engine: {:?}", e); }); - self.perf_context.report_metrics(); + self.perf_context.report_metrics(&[]); // TODO: pass in request trackers self.sync_log_hint = false; let data_size = self.kv_wb().data_size(); if data_size > APPLY_WB_SHRINK_SIZE { diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9c480182943..374df821b9b 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1306,7 +1306,7 @@ where perf_context.start_observe(); engines.raft.consume(&mut raft_wb, true)?; - perf_context.report_metrics(); + perf_context.report_metrics(&[]); if self.get_store().is_initialized() && !keep_data { // If we meet panic when deleting data and raft log, the dirty data diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 9a2c26aad22..65aa08cd101 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -30,4 +30,5 @@ raftstore = { path = "../raftstore", default-features = false } test_raftstore = { path = "../test_raftstore", default-features = false } tikv = { path = "../../", default-features = false } tikv_util = { path = "../tikv_util", default-features = false } +tracker = { path = "../tracker", default-features = false } txn_types = { path = "../txn_types", default-features = false } diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index af8a079a4de..b32dbe08fd5 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -19,6 +19,7 @@ use tikv::{ }, }; use tikv_util::time::Instant; +use tracker::INVALID_TRACKER_TOKEN; use txn_types::{Key, KvPair, Mutation, TimeStamp, Value}; /// A builder to build a `SyncTestStorage`. 
@@ -179,10 +180,11 @@ impl SyncTestStorage { req }) .collect(); + let trackers = keys.iter().map(|_| INVALID_TRACKER_TOKEN).collect(); let p = GetConsumer::new(); block_on( self.store - .batch_get_command(requests, ids, p.clone(), Instant::now()), + .batch_get_command(requests, ids, trackers, p.clone(), Instant::now()), )?; let mut values = vec![]; for value in p.take_data().into_iter() { diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 50a92878404..5b640d3b0b7 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -46,6 +46,7 @@ slog_derive = "0.2" tempfile = "3.0" thiserror = "1.0" tikv_util = { path = "../tikv_util", default-features = false } +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 009f8fbc93e..adb04fc25cd 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -20,7 +20,14 @@ mod raftstore_impls; mod rocksdb_engine; mod stats; -use std::{cell::UnsafeCell, error, num::NonZeroU64, ptr, result, sync::Arc, time::Duration}; +use std::{ + cell::UnsafeCell, + error, + num::NonZeroU64, + ptr, result, + sync::Arc, + time::{Duration, Instant}, +}; use engine_traits::{ CfName, IterOptions, KvEngine as LocalEngine, Mutable, MvccProperties, ReadOptions, WriteBatch, @@ -38,6 +45,7 @@ use pd_client::BucketMeta; use raftstore::store::{PessimisticLockPair, TxnExt}; use thiserror::Error; use tikv_util::{deadline::Deadline, escape, time::ThreadReadId}; +use tracker::with_tls_tracker; use txn_types::{Key, PessimisticLock, TimeStamp, TxnExtra, Value}; pub use self::{ @@ -561,6 +569,7 @@ pub fn snapshot( engine: &E, ctx: SnapContext<'_>, ) -> impl std::future::Future> { + let begin = Instant::now(); let (callback, future) = tikv_util::future::paired_must_called_future_callback(drop_snapshot_callback::); let val = engine.async_snapshot(ctx, 
callback); @@ -570,6 +579,9 @@ pub fn snapshot( let result = future .map_err(|cancel| Error::from(ErrorInner::Other(box_err!(cancel)))) .await?; + with_tls_tracker(|tracker| { + tracker.metrics.get_snapshot_nanos += begin.elapsed().as_nanos() as u64; + }); fail_point!("after-snapshot"); result } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 45425f83cec..9bbea72d8d5 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -55,6 +55,7 @@ time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-executor = "0.1" tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tracker = { path = "../tracker" } url = "2" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 0beca9a5dee..a40221e3b6d 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -14,6 +14,7 @@ use std::{ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; use prometheus::{IntCounter, IntGauge}; +use tracker::TrackedFuture; use yatp::task::future; pub type ThreadPool = yatp::ThreadPool; @@ -81,7 +82,7 @@ impl FuturePool { where F: Future + Send + 'static, { - self.inner.spawn(future) + self.inner.spawn(TrackedFuture::new(future)) } /// Spawns a future in the pool and returns a handle to the result of the future. 
@@ -95,7 +96,7 @@ impl FuturePool { F: Future + Send + 'static, F::Output: Send, { - self.inner.spawn_handle(future) + self.inner.spawn_handle(TrackedFuture::new(future)) } } diff --git a/components/tracker/Cargo.toml b/components/tracker/Cargo.toml new file mode 100644 index 00000000000..fcaf546cf5b --- /dev/null +++ b/components/tracker/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "tracker" +version = "0.0.1" +edition = "2018" +publish = false + +[dependencies] +collections = { path = "../../components/collections" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1" +parking_lot = "0.12" +pin-project = "1" +prometheus = "0.13" +slab = "0.4" diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs new file mode 100644 index 00000000000..909e093ed3f --- /dev/null +++ b/components/tracker/src/lib.rs @@ -0,0 +1,86 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(derive_default_enum)] +#![feature(array_from_fn)] + +mod metrics; +mod slab; +mod tls; + +use kvproto::kvrpcpb as pb; + +pub use self::{ + slab::{TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}, + tls::*, +}; + +#[derive(Debug)] +pub struct Tracker { + pub req_info: RequestInfo, + pub metrics: RequestMetrics, + // TODO: Add request stage info + // pub current_stage: RequestStage, +} + +impl Tracker { + pub fn new(req_info: RequestInfo) -> Self { + Self { + req_info, + metrics: Default::default(), + } + } + + pub fn write_scan_detail(&self, detail_v2: &mut pb::ScanDetailV2) { + detail_v2.set_rocksdb_block_read_byte(self.metrics.block_read_byte); + detail_v2.set_rocksdb_block_read_count(self.metrics.block_read_count); + detail_v2.set_rocksdb_block_cache_hit_count(self.metrics.block_cache_hit_count); + detail_v2.set_rocksdb_key_skipped_count(self.metrics.internal_key_skipped_count); + detail_v2.set_rocksdb_delete_skipped_count(self.metrics.deleted_key_skipped_count); + } +} + +#[derive(Debug, Default)] +pub struct 
RequestInfo { + pub region_id: u64, + pub start_ts: u64, + pub task_id: u64, + pub resource_group_tag: Vec, + pub request_type: RequestType, +} + +impl RequestInfo { + pub fn new(ctx: &pb::Context, request_type: RequestType, start_ts: u64) -> RequestInfo { + RequestInfo { + region_id: ctx.get_region_id(), + start_ts, + task_id: ctx.get_task_id(), + resource_group_tag: ctx.get_resource_group_tag().to_vec(), + request_type, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum RequestType { + #[default] + Unknown, + KvGet, + KvBatchGet, + KvBatchGetCommand, + KvScan, + KvScanLock, + CoprocessorDag, + CoprocessorAnalyze, + CoprocessorChecksum, +} + +#[derive(Debug, Default, Clone)] +pub struct RequestMetrics { + pub get_snapshot_nanos: u64, + pub block_cache_hit_count: u64, + pub block_read_count: u64, + pub block_read_byte: u64, + pub block_read_nanos: u64, + pub internal_key_skipped_count: u64, + pub deleted_key_skipped_count: u64, +} diff --git a/components/tracker/src/metrics.rs b/components/tracker/src/metrics.rs new file mode 100644 index 00000000000..90cce44cd52 --- /dev/null +++ b/components/tracker/src/metrics.rs @@ -0,0 +1,12 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use lazy_static::lazy_static; +use prometheus::*; + +lazy_static! { + pub static ref SLAB_FULL_COUNTER: IntCounter = register_int_counter!( + "tikv_tracker_slab_full_counter", + "Number of tracker slab insert failures because of fullness" + ) + .unwrap(); +} diff --git a/components/tracker/src/slab.rs b/components/tracker/src/slab.rs new file mode 100644 index 00000000000..9d2803e7585 --- /dev/null +++ b/components/tracker/src/slab.rs @@ -0,0 +1,269 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{array, cell::Cell, fmt}; + +use lazy_static::lazy_static; +use parking_lot::Mutex; +use slab::Slab; + +use crate::{metrics::*, Tracker}; + +const SLAB_SHARD_BITS: u32 = 6; +const SLAB_SHARD_COUNT: usize = 1 << SLAB_SHARD_BITS; // 64 +const SLAB_SHARD_INIT_CAPACITY: usize = 256; +const SLAB_SHARD_MAX_CAPACITY: usize = 4096; + +lazy_static! { + pub static ref GLOBAL_TRACKERS: ShardedSlab = ShardedSlab::new(SLAB_SHARD_INIT_CAPACITY); +} + +fn next_shard_id() -> usize { + thread_local! { + static CURRENT_SHARD_ID: Cell = Cell::new(0); + } + CURRENT_SHARD_ID.with(|c| { + let shard_id = c.get(); + c.set((shard_id + 1) % SLAB_SHARD_COUNT); + shard_id + }) +} + +pub struct ShardedSlab { + shards: [Mutex; SLAB_SHARD_COUNT], +} + +impl ShardedSlab { + pub fn new(capacity_per_shard: usize) -> ShardedSlab { + let shards = array::from_fn(|shard_id| { + Mutex::new(TrackerSlab::with_capacity( + shard_id as u32, + capacity_per_shard, + )) + }); + ShardedSlab { shards } + } + + pub fn insert(&self, tracker: Tracker) -> TrackerToken { + let shard_id = next_shard_id(); + self.shards[shard_id].lock().insert(tracker) + } + + pub fn remove(&self, token: TrackerToken) -> Option { + if token != INVALID_TRACKER_TOKEN { + let shard_id = token.shard_id(); + self.shards[shard_id as usize].lock().remove(token) + } else { + None + } + } + + pub fn with_tracker(&self, token: TrackerToken, f: F) -> Option + where + F: FnOnce(&mut Tracker) -> T, + { + if token != INVALID_TRACKER_TOKEN { + let shard_id = token.shard_id(); + self.shards[shard_id as usize].lock().get_mut(token).map(f) + } else { + None + } + } + + pub fn for_each(&self, mut f: F) + where + F: FnMut(&mut Tracker), + { + for shard in &self.shards { + for (_, tracker) in shard.lock().slab.iter_mut() { + f(&mut tracker.tracker) + } + } + } +} + +const SLAB_KEY_BITS: u32 = 32; +const SHARD_ID_BITS_SHIFT: u32 = 64 - SLAB_SHARD_BITS; +const SEQ_BITS_MASK: u32 = (1 << (SHARD_ID_BITS_SHIFT - SLAB_KEY_BITS)) - 1; + +struct 
TrackerSlab { + slab: Slab, + shard_id: u32, + seq: u32, +} + +impl TrackerSlab { + fn with_capacity(shard_id: u32, capacity: usize) -> Self { + assert!(capacity < SLAB_SHARD_MAX_CAPACITY); + TrackerSlab { + slab: Slab::with_capacity(capacity), + shard_id, + seq: 0, + } + } + + // Returns the seq and key of the inserted tracker. + // If the slab reaches the max capacity, the tracker will be dropped silently + // and INVALID_TRACKER_TOKEN will be returned. + fn insert(&mut self, tracker: Tracker) -> TrackerToken { + if self.slab.len() < SLAB_SHARD_MAX_CAPACITY { + self.seq = (self.seq + 1) & SEQ_BITS_MASK; + let key = self.slab.insert(SlabEntry { + tracker, + seq: self.seq, + }); + TrackerToken::new(self.shard_id, self.seq, key) + } else { + SLAB_FULL_COUNTER.inc(); + INVALID_TRACKER_TOKEN + } + } + + pub fn get_mut(&mut self, token: TrackerToken) -> Option<&mut Tracker> { + if let Some(entry) = self.slab.get_mut(token.key()) { + if entry.seq == token.seq() { + return Some(&mut entry.tracker); + } + } + None + } + + pub fn remove(&mut self, token: TrackerToken) -> Option { + if self.get_mut(token).is_some() { + Some(self.slab.remove(token.key()).tracker) + } else { + None + } + } +} + +struct SlabEntry { + tracker: Tracker, + seq: u32, +} + +pub const INVALID_TRACKER_TOKEN: TrackerToken = TrackerToken(u64::MAX); + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct TrackerToken(u64); + +impl TrackerToken { + fn new(shard_id: u32, seq: u32, key: usize) -> TrackerToken { + debug_assert!(shard_id < SLAB_SHARD_COUNT as u32); + debug_assert!(seq <= SEQ_BITS_MASK); + debug_assert!(key < (1 << SLAB_KEY_BITS)); + TrackerToken( + ((shard_id as u64) << SHARD_ID_BITS_SHIFT) + | ((seq as u64) << SLAB_KEY_BITS) + | (key as u64), + ) + } + + fn shard_id(&self) -> u32 { + (self.0 >> SHARD_ID_BITS_SHIFT) as u32 + } + + fn seq(&self) -> u32 { + (self.0 >> SLAB_KEY_BITS) as u32 & SEQ_BITS_MASK + } + + fn key(&self) -> usize { + (self.0 & ((1 << SLAB_KEY_BITS) - 1)) as usize + } +} + 
+impl fmt::Debug for TrackerToken { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TrackerToken") + .field("shard_id", &self.shard_id()) + .field("seq", &self.seq()) + .field("key", &self.key()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, thread}; + + use super::*; + use crate::RequestInfo; + + #[test] + fn test_tracker_token() { + let shard_id = 47; + let seq = SEQ_BITS_MASK - 3; + let key = 65535; + let token = TrackerToken::new(shard_id, seq, key); + assert_eq!(token.shard_id(), shard_id); + assert_eq!(token.seq(), seq); + assert_eq!(token.key(), key); + } + + #[test] + fn test_basic() { + let slab = ShardedSlab::new(2); + // Insert 192 trackers + let tokens: Vec = (0..192) + .map(|i| { + let tracker = Tracker::new(RequestInfo { + task_id: i, + ..Default::default() + }); + slab.insert(tracker) + }) + .collect(); + // Get the tracker with the token and check the content + for (i, token) in tokens.iter().enumerate() { + slab.with_tracker(*token, |tracker| { + assert_eq!(i as u64, tracker.req_info.task_id); + }); + } + // Remove 0 ~ 128 trackers + for (i, token) in tokens[..128].iter().enumerate() { + let tracker = slab.remove(*token).unwrap(); + assert_eq!(i as u64, tracker.req_info.task_id); + } + // Insert another 192 trackers + for i in 192..384 { + let tracker = Tracker::new(RequestInfo { + task_id: i, + ..Default::default() + }); + slab.insert(tracker); + } + // Iterate over all trackers in the slab + let mut tracker_ids = Vec::new(); + slab.for_each(|tracker| tracker_ids.push(tracker.req_info.task_id)); + tracker_ids.sort_unstable(); + assert_eq!(tracker_ids, (128..384).collect::>()); + } + + #[test] + fn test_shard() { + let slab = Arc::new(ShardedSlab::new(4)); + let threads = [1, 2].map(|i| { + let slab = slab.clone(); + thread::spawn(move || { + for _ in 0..SLAB_SHARD_COUNT { + slab.insert(Tracker::new(RequestInfo { + task_id: i, + ..Default::default() + })); + } + }) + }); + for th in threads { 
+ th.join().unwrap(); + } + for shard in &slab.shards { + let mut v: Vec<_> = shard + .lock() + .slab + .iter() + .map(|(_, entry)| entry.tracker.req_info.task_id) + .collect(); + v.sort_unstable(); + assert_eq!(v, [1, 2]); + } + } +} diff --git a/components/tracker/src/tls.rs b/components/tracker/src/tls.rs new file mode 100644 index 00000000000..982f483c8bc --- /dev/null +++ b/components/tracker/src/tls.rs @@ -0,0 +1,69 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + future::Future, + pin::Pin, + task::{Context, Poll}, +}; + +use pin_project::pin_project; + +use crate::{slab::TrackerToken, Tracker, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; + +thread_local! { + static TLS_TRACKER_TOKEN: Cell = Cell::new(INVALID_TRACKER_TOKEN); +} + +pub fn set_tls_tracker_token(token: TrackerToken) { + TLS_TRACKER_TOKEN.with(|c| { + c.set(token); + }) +} + +pub fn clear_tls_tracker_token() { + set_tls_tracker_token(INVALID_TRACKER_TOKEN); +} + +pub fn get_tls_tracker_token() -> TrackerToken { + TLS_TRACKER_TOKEN.with(|c| c.get()) +} + +pub fn with_tls_tracker(mut f: F) +where + F: FnMut(&mut Tracker), +{ + TLS_TRACKER_TOKEN.with(|c| { + GLOBAL_TRACKERS.with_tracker(c.get(), &mut f); + }); +} + +#[pin_project] +pub struct TrackedFuture { + #[pin] + future: F, + tracker: TrackerToken, +} + +impl TrackedFuture { + pub fn new(future: F) -> TrackedFuture { + TrackedFuture { + future, + tracker: get_tls_tracker_token(), + } + } +} + +impl Future for TrackedFuture { + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + TLS_TRACKER_TOKEN.with(|c| { + c.set(*this.tracker); + let res = this.future.poll(cx); + c.set(INVALID_TRACKER_TOKEN); + res + }) + } +} diff --git a/scripts/check-bins.py b/scripts/check-bins.py index 41d9d57c866..04a3b77c01d 100644 --- a/scripts/check-bins.py +++ b/scripts/check-bins.py @@ -14,6 +14,7 @@ "online_config", "online_config_derive", 
"match_template", "tidb_query_codegen", "panic_hook", "fuzz", "fuzzer_afl", "fuzzer_honggfuzz", "fuzzer_libfuzzer", "coprocessor_plugin_api", "example_plugin", "memory_trace_macros", "case_macros", + "tracker" } JEMALLOC_SYMBOL = ["je_arena_boot", " malloc"] diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 070fd25557b..fa1dce909a2 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -2,6 +2,9 @@ use std::{borrow::Cow, future::Future, marker::PhantomData, sync::Arc, time::Duration}; +use ::tracker::{ + set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, +}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; @@ -164,6 +167,7 @@ impl Endpoint { let mut input = CodedInputStream::from_bytes(&data); input.set_recursion_limit(self.recursion_limit); + let req_ctx: ReqContext; let builder: RequestHandlerBuilder; @@ -201,6 +205,10 @@ impl Endpoint { cache_match_version, self.perf_level, ); + with_tls_tracker(|tracker| { + tracker.req_info.request_type = RequestType::CoprocessorDag; + tracker.req_info.start_ts = start_ts; + }); self.check_memory_locks(&req_ctx)?; @@ -260,6 +268,10 @@ impl Endpoint { cache_match_version, self.perf_level, ); + with_tls_tracker(|tracker| { + tracker.req_info.request_type = RequestType::CoprocessorAnalyze; + tracker.req_info.start_ts = start_ts; + }); self.check_memory_locks(&req_ctx)?; let quota_limiter = self.quota_limiter.clone(); @@ -300,6 +312,10 @@ impl Endpoint { cache_match_version, self.perf_level, ); + with_tls_tracker(|tracker| { + tracker.req_info.request_type = RequestType::CoprocessorChecksum; + tracker.req_info.start_ts = start_ts; + }); self.check_memory_locks(&req_ctx)?; @@ -316,6 +332,7 @@ impl Endpoint { } tp => return Err(box_err!("unsupported tp {}", tp)), }; + Ok((builder, req_ctx)) } @@ -360,7 +377,7 @@ impl Endpoint { /// `RequestHandler` to process the request and produce a result. 
async fn handle_unary_request_impl( semaphore: Option>, - mut tracker: Box, + mut tracker: Box>, handler_builder: RequestHandlerBuilder, ) -> Result> { // When this function is being executed, it may be queued for a long time, so that @@ -468,17 +485,25 @@ impl Endpoint { req: coppb::Request, peer: Option, ) -> impl Future> { + let tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::Unknown, + req.start_ts, + ))); + set_tls_tracker_token(tracker); let result_of_future = self .parse_request_and_check_memory_locks(req, peer, false) .map(|(handler_builder, req_ctx)| self.handle_unary_request(req_ctx, handler_builder)); async move { - match result_of_future { + let res = match result_of_future { Err(e) => make_error_response(e).into(), Ok(handle_fut) => handle_fut .await .unwrap_or_else(|e| make_error_response(e).into()), - } + }; + GLOBAL_TRACKERS.remove(tracker); + res } } @@ -489,7 +514,7 @@ impl Endpoint { /// `RequestHandler` multiple times to process the request and produce multiple results. fn handle_stream_request_impl( semaphore: Option>, - mut tracker: Box, + mut tracker: Box>, handler_builder: RequestHandlerBuilder, ) -> impl futures::stream::Stream> { try_stream! 
{ diff --git a/src/coprocessor/interceptors/mod.rs b/src/coprocessor/interceptors/mod.rs index f7e280fb137..95496b234df 100644 --- a/src/coprocessor/interceptors/mod.rs +++ b/src/coprocessor/interceptors/mod.rs @@ -4,6 +4,4 @@ mod concurrency_limiter; mod deadline; mod tracker; -pub use concurrency_limiter::limit_concurrency; -pub use deadline::check_deadline; -pub use tracker::track; +pub use self::{concurrency_limiter::limit_concurrency, deadline::check_deadline, tracker::track}; diff --git a/src/coprocessor/interceptors/tracker.rs b/src/coprocessor/interceptors/tracker.rs index 4224a27e2be..ec8654887ef 100644 --- a/src/coprocessor/interceptors/tracker.rs +++ b/src/coprocessor/interceptors/tracker.rs @@ -7,38 +7,42 @@ use std::{ }; use pin_project::pin_project; +use tikv_kv::Engine; use crate::coprocessor::tracker::Tracker as CopTracker; -pub fn track<'a, F: Future + 'a>( +pub fn track<'a, F: Future + 'a, E: Engine>( fut: F, - cop_tracker: &'a mut CopTracker, + cop_tracker: &'a mut CopTracker, ) -> impl Future + 'a { Tracker::new(fut, cop_tracker) } #[pin_project] -struct Tracker<'a, F> +struct Tracker<'a, F, E> where F: Future, + E: Engine, { #[pin] fut: F, - cop_tracker: &'a mut CopTracker, + cop_tracker: &'a mut CopTracker, } -impl<'a, F> Tracker<'a, F> +impl<'a, F, E> Tracker<'a, F, E> where F: Future, + E: Engine, { - fn new(fut: F, cop_tracker: &'a mut CopTracker) -> Self { + fn new(fut: F, cop_tracker: &'a mut CopTracker) -> Self { Tracker { fut, cop_tracker } } } -impl<'a, F: Future> Future for Tracker<'a, F> +impl<'a, F, E> Future for Tracker<'a, F, E> where F: Future, + E: Engine, { type Output = F::Output; diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index f54064dcca3..f95ff6ee4db 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -3,7 +3,6 @@ use std::{cell::RefCell, mem, sync::Arc}; use collections::HashMap; -use engine_rocks::ReadPerfContext; use kvproto::{metapb, pdpb::QueryKind}; use 
pd_client::BucketMeta; use prometheus::*; @@ -62,57 +61,6 @@ make_auto_flush_static_metric! { snapshot, } - pub label_enum PerfMetric { - user_key_comparison_count, - block_cache_hit_count, - block_read_count, - block_read_byte, - block_read_time, - block_cache_index_hit_count, - index_block_read_count, - block_cache_filter_hit_count, - filter_block_read_count, - block_checksum_time, - block_decompress_time, - get_read_bytes, - iter_read_bytes, - internal_key_skipped_count, - internal_delete_skipped_count, - internal_recent_skipped_count, - get_snapshot_time, - get_from_memtable_time, - get_from_memtable_count, - get_post_process_time, - get_from_output_files_time, - seek_on_memtable_time, - seek_on_memtable_count, - next_on_memtable_count, - prev_on_memtable_count, - seek_child_seek_time, - seek_child_seek_count, - seek_min_heap_time, - seek_max_heap_time, - seek_internal_seek_time, - db_mutex_lock_nanos, - db_condition_wait_nanos, - read_index_block_nanos, - read_filter_block_nanos, - new_table_block_iter_nanos, - new_table_iterator_nanos, - block_seek_nanos, - find_table_nanos, - bloom_memtable_hit_count, - bloom_memtable_miss_count, - bloom_sst_hit_count, - bloom_sst_miss_count, - get_cpu_nanos, - iter_next_cpu_nanos, - iter_prev_cpu_nanos, - iter_seek_cpu_nanos, - encrypt_data_nanos, - decrypt_data_nanos, - } - pub label_enum MemLockCheckResult { unlocked, locked, @@ -127,11 +75,6 @@ make_auto_flush_static_metric! { "type" => WaitType, } - pub struct PerfCounter: LocalIntCounter { - "req" => ReqTag, - "metric" => PerfMetric, - } - pub struct CoprScanKeysHistogram: LocalHistogram { "req" => ReqTag, "kind" => ScanKeysKind, @@ -208,14 +151,6 @@ lazy_static! 
{ .unwrap(); pub static ref COPR_SCAN_DETAILS_STATIC: CoprScanDetails = auto_flush_from!(COPR_SCAN_DETAILS, CoprScanDetails); - pub static ref COPR_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( - "tikv_coprocessor_rocksdb_perf", - "Total number of RocksDB internal operations from PerfContext", - &["req", "metric"] - ) - .unwrap(); - pub static ref COPR_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = - auto_flush_from!(COPR_ROCKSDB_PERF_COUNTER, PerfCounter); pub static ref COPR_DAG_REQ_COUNT: IntCounterVec = register_int_counter_vec!( "tikv_coprocessor_dag_request_count", "Total number of DAG requests", @@ -266,7 +201,6 @@ make_static_metric! { pub struct CopLocalMetrics { local_scan_details: HashMap, local_read_stats: ReadStats, - local_perf_stats: HashMap, } thread_local! { @@ -274,20 +208,10 @@ thread_local! { CopLocalMetrics { local_scan_details: HashMap::default(), local_read_stats: ReadStats::default(), - local_perf_stats: HashMap::default(), } ); } -macro_rules! tls_flush_perf_stats { - ($tag:ident, $local_stats:ident, $stat:ident) => { - COPR_ROCKSDB_PERF_COUNTER_STATIC - .get($tag) - .$stat - .inc_by($local_stats.$stat as u64); - }; -} - impl From for CF { fn from(cf: GcKeysCF) -> CF { match cf { @@ -340,57 +264,6 @@ pub fn tls_flush(reporter: &R) { mem::swap(&mut read_stats, &mut m.local_read_stats); reporter.report_read_stats(read_stats); } - - for (req_tag, perf_stats) in m.local_perf_stats.drain() { - tls_flush_perf_stats!(req_tag, perf_stats, user_key_comparison_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_cache_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_byte); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_time); - tls_flush_perf_stats!(req_tag, perf_stats, block_cache_index_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, index_block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, 
block_cache_filter_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, filter_block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_checksum_time); - tls_flush_perf_stats!(req_tag, perf_stats, block_decompress_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_read_bytes); - tls_flush_perf_stats!(req_tag, perf_stats, iter_read_bytes); - tls_flush_perf_stats!(req_tag, perf_stats, internal_key_skipped_count); - tls_flush_perf_stats!(req_tag, perf_stats, internal_delete_skipped_count); - tls_flush_perf_stats!(req_tag, perf_stats, internal_recent_skipped_count); - tls_flush_perf_stats!(req_tag, perf_stats, get_snapshot_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_memtable_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, get_post_process_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_output_files_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_on_memtable_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, next_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, prev_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, seek_child_seek_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_child_seek_count); - tls_flush_perf_stats!(req_tag, perf_stats, seek_min_heap_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_max_heap_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_internal_seek_time); - tls_flush_perf_stats!(req_tag, perf_stats, db_mutex_lock_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, db_condition_wait_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, read_index_block_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, read_filter_block_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, new_table_block_iter_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, new_table_iterator_nanos); - 
tls_flush_perf_stats!(req_tag, perf_stats, block_seek_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, find_table_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_memtable_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_memtable_miss_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_sst_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_sst_miss_count); - tls_flush_perf_stats!(req_tag, perf_stats, get_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_next_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_prev_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_seek_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, encrypt_data_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, decrypt_data_nanos); - } }); } @@ -438,12 +311,3 @@ pub fn tls_collect_query( .add_query_num(region_id, peer, key_range, QueryKind::Coprocessor); }); } - -pub fn tls_collect_perf_stats(cmd: ReqTag, perf_stats: &ReadPerfContext) { - TLS_COP_METRICS.with(|m| { - *(m.borrow_mut() - .local_perf_stats - .entry(cmd) - .or_insert_with(Default::default)) += *perf_stats; - }); -} diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index a12f4ee0c71..df43ad39a69 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -1,9 +1,12 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::{ReadPerfContext, RocksPerfContext}; -use engine_traits::{PerfContext, PerfContextKind}; +use std::{cell::RefCell, marker::PhantomData}; + +use ::tracker::{get_tls_tracker_token, with_tls_tracker}; +use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; +use tikv_kv::{with_tls_engine, Engine}; use tikv_util::time::{self, Duration, Instant}; use txn_types::Key; @@ -39,7 +42,7 @@ enum TrackerState { /// Track coprocessor requests to update statistics and provide slow logs. 
#[derive(Debug)] -pub struct Tracker { +pub struct Tracker { request_begin_at: Instant, // Intermediate results @@ -60,10 +63,6 @@ pub struct Tracker { item_process_time: Duration, total_process_time: Duration, total_storage_stats: Statistics, - // TODO: This leaks the RocksDB engine from abstraction, try to use the PerfContext - // in engine_trait instead. - perf_context: RocksPerfContext, - total_perf_stats: ReadPerfContext, // Accumulated perf statistics slow_log_threshold: Duration, scan_process_time_ms: u64, @@ -71,13 +70,15 @@ pub struct Tracker { // Request info, used to print slow log. pub req_ctx: ReqContext, + + _phantom: PhantomData E>, } -impl Tracker { +impl Tracker { /// Initialize the tracker. Normally it is called outside future pool's factory context, /// because the future pool might be full and we need to wait it. This kind of wait time /// has to be recorded. - pub fn new(req_ctx: ReqContext, slow_log_threshold: Duration) -> Tracker { + pub fn new(req_ctx: ReqContext, slow_log_threshold: Duration) -> Self { let now = Instant::now_coarse(); Tracker { request_begin_at: now, @@ -92,12 +93,11 @@ impl Tracker { total_suspend_time: Duration::default(), total_process_time: Duration::default(), total_storage_stats: Statistics::default(), - perf_context: RocksPerfContext::new(req_ctx.perf_level, PerfContextKind::GenericRead), - total_perf_stats: ReadPerfContext::default(), scan_process_time_ms: 0, slow_log_threshold, req_ctx, buckets: None, + _phantom: PhantomData, } } @@ -140,7 +140,7 @@ impl Tracker { _ => unreachable!(), } - self.perf_context.start_observe(); + self.with_perf_context(|perf_context| perf_context.start_observe()); self.current_stage = TrackerState::ItemBegan(now); } @@ -152,10 +152,9 @@ impl Tracker { if let Some(storage_stats) = some_storage_stats { self.total_storage_stats.add(&storage_stats); } - // Record delta perf statistics - self.perf_context.report_metrics(); - let perf_statistics = self.perf_context.stats.read; - 
self.total_perf_stats += perf_statistics; + self.with_perf_context(|perf_context| { + perf_context.report_metrics(&[get_tls_tracker_token()]) + }); self.current_stage = TrackerState::ItemFinished(now); } else { unreachable!() @@ -212,15 +211,7 @@ impl Tracker { detail_v2.set_processed_versions(self.total_storage_stats.write.processed_keys as u64); detail_v2.set_processed_versions_size(self.total_storage_stats.processed_size as u64); detail_v2.set_total_versions(self.total_storage_stats.write.total_op_count() as u64); - detail_v2.set_rocksdb_delete_skipped_count( - self.total_perf_stats.internal_delete_skipped_count as u64, - ); - detail_v2 - .set_rocksdb_key_skipped_count(self.total_perf_stats.internal_key_skipped_count as u64); - detail_v2 - .set_rocksdb_block_cache_hit_count(self.total_perf_stats.block_cache_hit_count as u64); - detail_v2.set_rocksdb_block_read_count(self.total_perf_stats.block_read_count as u64); - detail_v2.set_rocksdb_block_read_byte(self.total_perf_stats.block_read_byte as u64); + with_tls_tracker(|tracker| tracker.write_scan_detail(&mut detail_v2)); exec_details_v2.set_scan_detail_v2(detail_v2); (exec_details, exec_details_v2) @@ -252,33 +243,35 @@ impl Tracker { .unwrap_or_default() }); - info!(#"slow_log", "slow-query"; - "region_id" => &self.req_ctx.context.get_region_id(), - "remote_host" => &self.req_ctx.peer, - "total_lifetime" => ?self.req_lifetime, - "wait_time" => ?self.wait_time, - "wait_time.schedule" => ?self.schedule_wait_time, - "wait_time.snapshot" => ?self.snapshot_wait_time, - "handler_build_time" => ?self.handler_build_time, - "total_process_time" => ?self.total_process_time, - "total_suspend_time" => ?self.total_suspend_time, - "txn_start_ts" => self.req_ctx.txn_start_ts, - "table_id" => some_table_id, - "tag" => self.req_ctx.tag.get_str(), - "scan.is_desc" => self.req_ctx.is_desc_scan, - "scan.processed" => total_storage_stats.write.processed_keys, - "scan.processed_size" => total_storage_stats.processed_size, - 
"scan.total" => total_storage_stats.write.total_op_count(), - "scan.ranges" => self.req_ctx.ranges.len(), - "scan.range.first" => ?first_range, - "perf_stats.block_cache_hit_count" => self.total_perf_stats.block_cache_hit_count, - "perf_stats.block_read_count" => self.total_perf_stats.block_read_count, - "perf_stats.block_read_byte" => self.total_perf_stats.block_read_byte, - "perf_stats.internal_key_skipped_count" - => self.total_perf_stats.internal_key_skipped_count, - "perf_stats.internal_delete_skipped_count" - => self.total_perf_stats.internal_delete_skipped_count, - ); + with_tls_tracker(|tracker| { + info!(#"slow_log", "slow-query"; + "region_id" => &self.req_ctx.context.get_region_id(), + "remote_host" => &self.req_ctx.peer, + "total_lifetime" => ?self.req_lifetime, + "wait_time" => ?self.wait_time, + "wait_time.schedule" => ?self.schedule_wait_time, + "wait_time.snapshot" => ?self.snapshot_wait_time, + "handler_build_time" => ?self.handler_build_time, + "total_process_time" => ?self.total_process_time, + "total_suspend_time" => ?self.total_suspend_time, + "txn_start_ts" => self.req_ctx.txn_start_ts, + "table_id" => some_table_id, + "tag" => self.req_ctx.tag.get_str(), + "scan.is_desc" => self.req_ctx.is_desc_scan, + "scan.processed" => total_storage_stats.write.processed_keys, + "scan.processed_size" => total_storage_stats.processed_size, + "scan.total" => total_storage_stats.write.total_op_count(), + "scan.ranges" => self.req_ctx.ranges.len(), + "scan.range.first" => ?first_range, + "perf_stats.block_cache_hit_count" => tracker.metrics.block_cache_hit_count, + "perf_stats.block_read_count" => tracker.metrics.block_read_count, + "perf_stats.block_read_byte" => tracker.metrics.block_read_byte, + "perf_stats.internal_key_skipped_count" + => tracker.metrics.internal_key_skipped_count, + "perf_stats.internal_delete_skipped_count" + => tracker.metrics.deleted_key_skipped_count, + ) + }); } // req time @@ -325,7 +318,6 @@ impl Tracker { 
.observe(total_storage_stats.write.processed_keys as f64); tls_collect_scan_details(self.req_ctx.tag, &total_storage_stats); - tls_collect_perf_stats(self.req_ctx.tag, &self.total_perf_stats); let peer = self.req_ctx.context.get_peer(); let region_id = self.req_ctx.context.get_region_id(); @@ -353,9 +345,47 @@ impl Tracker { ); self.current_stage = TrackerState::Tracked; } + + fn with_perf_context(&self, f: F) -> T + where + F: FnOnce(&mut Box) -> T, + { + thread_local! { + static SELECT: RefCell>> = RefCell::new(None); + static INDEX: RefCell>> = RefCell::new(None); + static ANALYZE_TABLE: RefCell>> = RefCell::new(None); + static ANALYZE_INDEX: RefCell>> = RefCell::new(None); + static ANALYZE_FULL_SAMPLING: RefCell>> = RefCell::new(None); + static CHECKSUM_TABLE: RefCell>> = RefCell::new(None); + static CHECKSUM_INDEX: RefCell>> = RefCell::new(None); + static TEST: RefCell>> = RefCell::new(None); + } + let tls_cell = match self.req_ctx.tag { + ReqTag::select => &SELECT, + ReqTag::index => &INDEX, + ReqTag::analyze_table => &ANALYZE_TABLE, + ReqTag::analyze_index => &ANALYZE_INDEX, + ReqTag::analyze_full_sampling => &ANALYZE_FULL_SAMPLING, + ReqTag::checksum_table => &CHECKSUM_TABLE, + ReqTag::checksum_index => &CHECKSUM_INDEX, + ReqTag::test => &TEST, + }; + tls_cell.with(|c| { + let mut c = c.borrow_mut(); + let perf_context = c.get_or_insert_with(|| unsafe { + with_tls_engine::(|engine| { + Box::new(engine.kv_engine().get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) + }) + }); + f(perf_context) + }) + } } -impl Drop for Tracker { +impl Drop for Tracker { /// `Tracker` may be dropped without even calling `on_begin_all_items`. For example, if /// get snapshot failed. So we fast-forward if some steps are missing. 
fn drop(&mut self) { diff --git a/src/read_pool.rs b/src/read_pool.rs index 239e0fc61e5..cebd1965153 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -15,6 +15,7 @@ use tikv_util::{ sys::SysQuota, yatp_pool::{self, FuturePool, PoolTicker, YatpPoolBuilder}, }; +use tracker::TrackedFuture; use yatp::{pool::Remote, queue::Extras, task::future::TaskCell}; use self::metrics::*; @@ -121,10 +122,10 @@ impl ReadPoolHandle { }; let extras = Extras::new_multilevel(task_id, fixed_level); let task_cell = TaskCell::new( - async move { + TrackedFuture::new(async move { f.await; running_tasks.dec(); - }, + }), extras, ); remote.spawn(task_cell); diff --git a/src/server/service/batch.rs b/src/server/service/batch.rs index e1f20439471..1a7fcb59c3a 100644 --- a/src/server/service/batch.rs +++ b/src/server/service/batch.rs @@ -2,9 +2,9 @@ // #[PerformanceCriticalPath] use api_version::KvFormat; -use engine_rocks::ReadPerfContext; use kvproto::kvrpcpb::*; use tikv_util::{future::poll_future_notify, mpsc::batch::Sender, time::Instant}; +use tracker::{with_tls_tracker, RequestInfo, RequestType, Tracker, TrackerToken, GLOBAL_TRACKERS}; use crate::{ server::{ @@ -27,6 +27,7 @@ pub struct ReqBatcher { gets: Vec, raw_gets: Vec, get_ids: Vec, + get_trackers: Vec, raw_get_ids: Vec, begin_instant: Instant, batch_size: usize, @@ -39,6 +40,7 @@ impl ReqBatcher { gets: vec![], raw_gets: vec![], get_ids: vec![], + get_trackers: vec![], raw_get_ids: vec![], begin_instant, batch_size: std::cmp::min(batch_size, MAX_BATCH_GET_REQUEST_COUNT), @@ -54,8 +56,14 @@ impl ReqBatcher { } pub fn add_get_request(&mut self, req: GetRequest, id: u64) { + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::KvBatchGetCommand, + req.get_version(), + ))); self.gets.push(req); self.get_ids.push(id); + self.get_trackers.push(tracker); } pub fn add_raw_get_request(&mut self, req: RawGetRequest, id: u64) { @@ -71,7 +79,8 @@ impl ReqBatcher { if self.gets.len() >= 
self.batch_size { let gets = std::mem::take(&mut self.gets); let ids = std::mem::take(&mut self.get_ids); - future_batch_get_command(storage, ids, gets, tx.clone(), self.begin_instant); + let trackers = std::mem::take(&mut self.get_trackers); + future_batch_get_command(storage, ids, gets, trackers, tx.clone(), self.begin_instant); } if self.raw_gets.len() >= self.batch_size { @@ -91,6 +100,7 @@ impl ReqBatcher { storage, self.get_ids, self.gets, + self.get_trackers, tx.clone(), self.begin_instant, ); @@ -141,24 +151,17 @@ pub struct GetCommandResponseConsumer { tx: Sender, } -impl ResponseBatchConsumer<(Option>, Statistics, ReadPerfContext)> - for GetCommandResponseConsumer -{ - fn consume( - &self, - id: u64, - res: Result<(Option>, Statistics, ReadPerfContext)>, - begin: Instant, - ) { +impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponseConsumer { + fn consume(&self, id: u64, res: Result<(Option>, Statistics)>, begin: Instant) { let mut resp = GetResponse::default(); if let Some(err) = extract_region_error(&res) { resp.set_region_error(err); } else { match res { - Ok((val, statistics, perf_statistics)) => { + Ok((val, statistics)) => { let scan_detail_v2 = resp.mut_exec_details_v2().mut_scan_detail_v2(); statistics.write_scan_detail(scan_detail_v2); - perf_statistics.write_scan_detail(scan_detail_v2); + with_tls_tracker(|tracker| tracker.write_scan_detail(scan_detail_v2)); match val { Some(val) => resp.set_value(val), None => resp.set_not_found(true), @@ -208,6 +211,7 @@ fn future_batch_get_command( storage: &Storage, requests: Vec, gets: Vec, + trackers: Vec, tx: Sender, begin_instant: tikv_util::time::Instant, ) { @@ -218,12 +222,16 @@ fn future_batch_get_command( let res = storage.batch_get_command( gets, requests, + trackers.clone(), GetCommandResponseConsumer { tx: tx.clone() }, begin_instant, ); let f = async move { // This error can only cause by readpool busy. 
let res = res.await; + for tracker in trackers { + GLOBAL_TRACKERS.remove(tracker); + } if let Some(e) = extract_region_error(&res) { let mut resp = GetResponse::default(); resp.set_region_error(e); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index c4960b0629a..73215f6922c 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -43,6 +43,7 @@ use tikv_util::{ time::{duration_to_ms, duration_to_sec, Instant}, worker::Scheduler, }; +use tracker::{set_tls_tracker_token, RequestInfo, RequestType, Tracker, GLOBAL_TRACKERS}; use txn_types::{self, Key}; use super::batch::{BatcherBuilder, ReqBatcher}; @@ -1327,6 +1328,12 @@ fn future_get( storage: &Storage, mut req: GetRequest, ) -> impl Future> { + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::KvGet, + req.get_version(), + ))); + set_tls_tracker_token(tracker); let start = Instant::now(); let v = storage.get( req.take_context(), @@ -1346,7 +1353,9 @@ fn future_get( let exec_detail_v2 = resp.mut_exec_details_v2(); let scan_detail_v2 = exec_detail_v2.mut_scan_detail_v2(); stats.stats.write_scan_detail(scan_detail_v2); - stats.perf_stats.write_scan_detail(scan_detail_v2); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(scan_detail_v2); + }); let time_detail = exec_detail_v2.mut_time_detail(); time_detail.set_kv_read_wall_time_ms(duration_ms as i64); time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms as i64); @@ -1360,6 +1369,7 @@ fn future_get( Err(e) => resp.set_error(extract_key_error(&e)), } } + GLOBAL_TRACKERS.remove(tracker); Ok(resp) } } @@ -1368,6 +1378,12 @@ fn future_scan( storage: &Storage, mut req: ScanRequest, ) -> impl Future> { + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::KvScan, + req.get_version(), + ))); + set_tls_tracker_token(tracker); let end_key = Key::from_raw_maybe_unbounded(req.get_end_key()); 
let v = storage.scan( @@ -1401,6 +1417,7 @@ fn future_scan( } } } + GLOBAL_TRACKERS.remove(tracker); Ok(resp) } } @@ -1409,6 +1426,12 @@ fn future_batch_get( storage: &Storage, mut req: BatchGetRequest, ) -> impl Future> { + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::KvBatchGet, + req.get_version(), + ))); + set_tls_tracker_token(tracker); let start = Instant::now(); let keys = req.get_keys().iter().map(|x| Key::from_raw(x)).collect(); let v = storage.batch_get(req.take_context(), keys, req.get_version().into()); @@ -1426,7 +1449,9 @@ fn future_batch_get( let exec_detail_v2 = resp.mut_exec_details_v2(); let scan_detail_v2 = exec_detail_v2.mut_scan_detail_v2(); stats.stats.write_scan_detail(scan_detail_v2); - stats.perf_stats.write_scan_detail(scan_detail_v2); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(scan_detail_v2); + }); let time_detail = exec_detail_v2.mut_time_detail(); time_detail.set_kv_read_wall_time_ms(duration_ms as i64); time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms as i64); @@ -1444,6 +1469,7 @@ fn future_batch_get( } } } + GLOBAL_TRACKERS.remove(tracker); Ok(resp) } } @@ -1452,6 +1478,12 @@ fn future_scan_lock( storage: &Storage, mut req: ScanLockRequest, ) -> impl Future> { + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + req.get_context(), + RequestType::KvScanLock, + req.get_max_version(), + ))); + set_tls_tracker_token(tracker); let start_key = Key::from_raw_maybe_unbounded(req.get_start_key()); let end_key = Key::from_raw_maybe_unbounded(req.get_end_key()); @@ -1474,6 +1506,7 @@ fn future_scan_lock( Err(e) => resp.set_error(extract_key_error(&e)), } } + GLOBAL_TRACKERS.remove(tracker); Ok(resp) } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 408ad13ac20..fd4df727e54 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -5,7 +5,6 @@ use std::{cell::RefCell, mem, 
sync::Arc}; use collections::HashMap; -use engine_rocks::ReadPerfContext; use kvproto::{kvrpcpb::KeyRange, metapb, pdpb::QueryKind}; use pd_client::BucketMeta; use prometheus::*; @@ -20,7 +19,6 @@ use crate::{ struct StorageLocalMetrics { local_scan_details: HashMap, local_read_stats: ReadStats, - local_perf_stats: HashMap, } thread_local! { @@ -28,20 +26,10 @@ thread_local! { StorageLocalMetrics { local_scan_details: HashMap::default(), local_read_stats:ReadStats::default(), - local_perf_stats: HashMap::default(), } ); } -macro_rules! tls_flush_perf_stats { - ($tag:ident, $local_stats:ident, $stat:ident) => { - STORAGE_ROCKSDB_PERF_COUNTER_STATIC - .get($tag) - .$stat - .inc_by($local_stats.$stat as u64); - }; -} - pub fn tls_flush(reporter: &R) { TLS_STORAGE_METRICS.with(|m| { let mut m = m.borrow_mut(); @@ -64,57 +52,6 @@ pub fn tls_flush(reporter: &R) { mem::swap(&mut read_stats, &mut m.local_read_stats); reporter.report_read_stats(read_stats); } - - for (req_tag, perf_stats) in m.local_perf_stats.drain() { - tls_flush_perf_stats!(req_tag, perf_stats, user_key_comparison_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_cache_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_byte); - tls_flush_perf_stats!(req_tag, perf_stats, block_read_time); - tls_flush_perf_stats!(req_tag, perf_stats, block_cache_index_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, index_block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_cache_filter_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, filter_block_read_count); - tls_flush_perf_stats!(req_tag, perf_stats, block_checksum_time); - tls_flush_perf_stats!(req_tag, perf_stats, block_decompress_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_read_bytes); - tls_flush_perf_stats!(req_tag, perf_stats, iter_read_bytes); - tls_flush_perf_stats!(req_tag, perf_stats, internal_key_skipped_count); - 
tls_flush_perf_stats!(req_tag, perf_stats, internal_delete_skipped_count); - tls_flush_perf_stats!(req_tag, perf_stats, internal_recent_skipped_count); - tls_flush_perf_stats!(req_tag, perf_stats, get_snapshot_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_memtable_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, get_post_process_time); - tls_flush_perf_stats!(req_tag, perf_stats, get_from_output_files_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_on_memtable_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, next_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, prev_on_memtable_count); - tls_flush_perf_stats!(req_tag, perf_stats, seek_child_seek_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_child_seek_count); - tls_flush_perf_stats!(req_tag, perf_stats, seek_min_heap_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_max_heap_time); - tls_flush_perf_stats!(req_tag, perf_stats, seek_internal_seek_time); - tls_flush_perf_stats!(req_tag, perf_stats, db_mutex_lock_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, db_condition_wait_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, read_index_block_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, read_filter_block_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, new_table_block_iter_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, new_table_iterator_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, block_seek_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, find_table_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_memtable_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_memtable_miss_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_sst_hit_count); - tls_flush_perf_stats!(req_tag, perf_stats, bloom_sst_miss_count); - tls_flush_perf_stats!(req_tag, perf_stats, 
get_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_next_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_prev_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, iter_seek_cpu_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, encrypt_data_nanos); - tls_flush_perf_stats!(req_tag, perf_stats, decrypt_data_nanos); - } }); } @@ -177,15 +114,6 @@ pub fn tls_collect_query_batch( }); } -pub fn tls_collect_perf_stats(cmd: CommandKind, perf_stats: &ReadPerfContext) { - TLS_STORAGE_METRICS.with(|m| { - *(m.borrow_mut() - .local_perf_stats - .entry(cmd) - .or_insert_with(Default::default)) += *perf_stats; - }) -} - make_auto_flush_static_metric! { pub label_enum CommandKind { get, @@ -277,57 +205,6 @@ make_auto_flush_static_metric! { unlocked, } - pub label_enum PerfMetric { - user_key_comparison_count, - block_cache_hit_count, - block_read_count, - block_read_byte, - block_read_time, - block_cache_index_hit_count, - index_block_read_count, - block_cache_filter_hit_count, - filter_block_read_count, - block_checksum_time, - block_decompress_time, - get_read_bytes, - iter_read_bytes, - internal_key_skipped_count, - internal_delete_skipped_count, - internal_recent_skipped_count, - get_snapshot_time, - get_from_memtable_time, - get_from_memtable_count, - get_post_process_time, - get_from_output_files_time, - seek_on_memtable_time, - seek_on_memtable_count, - next_on_memtable_count, - prev_on_memtable_count, - seek_child_seek_time, - seek_child_seek_count, - seek_min_heap_time, - seek_max_heap_time, - seek_internal_seek_time, - db_mutex_lock_nanos, - db_condition_wait_nanos, - read_index_block_nanos, - read_filter_block_nanos, - new_table_block_iter_nanos, - new_table_iterator_nanos, - block_seek_nanos, - find_table_nanos, - bloom_memtable_hit_count, - bloom_memtable_miss_count, - bloom_sst_hit_count, - bloom_sst_miss_count, - get_cpu_nanos, - iter_next_cpu_nanos, - iter_prev_cpu_nanos, - iter_seek_cpu_nanos, - encrypt_data_nanos, - 
decrypt_data_nanos, - } - pub label_enum InMemoryPessimisticLockingResult { success, full, @@ -381,11 +258,6 @@ make_auto_flush_static_metric! { "result" => CheckMemLockResult, } - pub struct PerfCounter: LocalIntCounter { - "req" => CommandKind, - "metric" => PerfMetric, - } - pub struct TxnCommandThrottleTimeCounterVec: LocalIntCounter { "type" => CommandKind, } @@ -620,16 +492,6 @@ lazy_static! { pub static ref CHECK_MEM_LOCK_DURATION_HISTOGRAM_VEC: CheckMemLockHistogramVec = auto_flush_from!(CHECK_MEM_LOCK_DURATION_HISTOGRAM, CheckMemLockHistogramVec); - pub static ref STORAGE_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( - "tikv_storage_rocksdb_perf", - "Total number of RocksDB internal operations from PerfContext", - &["req", "metric"] - ) - .unwrap(); - - pub static ref STORAGE_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = - auto_flush_from!(STORAGE_ROCKSDB_PERF_COUNTER, PerfCounter); - pub static ref TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_txn_command_throttle_time_total", "Total throttle time (microsecond) of txn commands.", diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 9e778afe064..692adec1ad1 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -52,6 +52,7 @@ mod types; use std::{ borrow::Cow, + cell::RefCell, iter, marker::PhantomData, sync::{ @@ -62,8 +63,10 @@ use std::{ use api_version::{ApiV1, ApiV2, KeyMode, KvFormat, RawValue}; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{ReadPerfContext, ReadPerfInstant}; -use engine_traits::{raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; +use engine_traits::{ + raw_ttl::ttl_to_expire_ts, CfName, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, + CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, +}; use futures::prelude::*; use kvproto::{ kvrpcpb::{ @@ -81,6 +84,10 @@ use tikv_util::{ quota_limiter::QuotaLimiter, time::{duration_to_ms, Instant, ThreadReadId}, }; +use tracker::{ + 
clear_tls_tracker_token, get_tls_tracker_token, set_tls_tracker_token, TrackedFuture, + TrackerToken, +}; use txn_types::{Key, KvPair, Lock, OldValues, TimeStamp, TsSet, Value}; pub use self::{ @@ -272,6 +279,42 @@ impl Storage { }) } + fn with_perf_context(cmd: CommandKind, f: Fn) -> T + where + Fn: FnOnce() -> T, + { + thread_local! { + static GET: RefCell>> = RefCell::new(None); + static BATCH_GET: RefCell>> = RefCell::new(None); + static BATCH_GET_COMMAND: RefCell>> = RefCell::new(None); + static SCAN: RefCell>> = RefCell::new(None); + static SCAN_LOCK: RefCell>> = RefCell::new(None); + } + let tls_cell = match cmd { + CommandKind::get => &GET, + CommandKind::batch_get => &BATCH_GET, + CommandKind::batch_get_command => &BATCH_GET_COMMAND, + CommandKind::scan => &SCAN, + CommandKind::scan_lock => &SCAN_LOCK, + _ => return f(), + }; + tls_cell.with(|c| { + let mut c = c.borrow_mut(); + let perf_context = c.get_or_insert_with(|| { + Self::with_tls_engine(|engine| { + Box::new(engine.kv_engine().get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) + }) + }); + perf_context.start_observe(); + let res = f(); + perf_context.report_metrics(&[get_tls_tracker_token()]); + res + }) + } + /// Get the underlying `Engine` of the `Storage`. 
pub fn get_engine(&self) -> E { self.engine.clone() @@ -597,14 +640,14 @@ impl Storage { )?; let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; + { let begin_instant = Instant::now(); let stage_snap_recv_ts = begin_instant; let buckets = snapshot.ext().get_buckets(); let mut statistics = Statistics::default(); - let (result, delta) = { + let result = Self::with_perf_context(CMD, || { let _guard = sample.observe_cpu(); - let perf_statistics = ReadPerfInstant::new(); let snap_store = SnapshotStore::new( snapshot, start_ts, @@ -614,18 +657,15 @@ impl Storage { access_locks, false, ); - let result = snap_store + snap_store .get(&key, &mut statistics) // map storage::txn::Error -> storage::Error .map_err(Error::from) .map(|r| { KV_COMMAND_KEYREAD_HISTOGRAM_STATIC.get(CMD).observe(1_f64); r - }); - - let delta = perf_statistics.delta(); - (result, delta) - }; + }) + }); metrics::tls_collect_scan_details(CMD, &statistics); metrics::tls_collect_read_flow( ctx.get_region_id(), @@ -634,7 +674,6 @@ impl Storage { &statistics, buckets.as_ref(), ); - metrics::tls_collect_perf_stats(CMD, &delta); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) .observe(begin_instant.saturating_elapsed_secs()); @@ -675,7 +714,6 @@ impl Storage { result?, KvGetStatistics { stats: statistics, - perf_stats: delta, latency_stats, }, )) @@ -694,12 +732,11 @@ impl Storage { /// Get values of a set of keys with separate context from a snapshot, return a list of `Result`s. /// /// Only writes that are committed before their respective `start_ts` are visible. 
- pub fn batch_get_command< - P: 'static + ResponseBatchConsumer<(Option>, Statistics, ReadPerfContext)>, - >( + pub fn batch_get_command>, Statistics)>>( &self, requests: Vec, ids: Vec, + trackers: Vec, consumer: P, begin_instant: tikv_util::time::Instant, ) -> impl Future> { @@ -717,7 +754,8 @@ impl Storage { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(rand_ctx, vec![(rand_key.clone(), rand_key)]); - + // Unset the TLS tracker because the future below does not belong to any specific request + clear_tls_tracker_token(); let res = self.read_pool.spawn_handle( async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); @@ -729,7 +767,8 @@ impl Storage { let mut statistics = Statistics::default(); let mut req_snaps = vec![]; - for (mut req, id) in requests.into_iter().zip(ids) { + for ((mut req, id), tracker) in requests.into_iter().zip(ids).zip(trackers) { + set_tls_tracker_token(tracker); let mut ctx = req.take_context(); let region_id = ctx.get_region_id(); let peer = ctx.get_peer(); @@ -776,7 +815,7 @@ impl Storage { let snap = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)); req_snaps.push(( - snap, + TrackedFuture::new(snap), key, start_ts, isolation_level, @@ -785,6 +824,7 @@ impl Storage { access_locks, region_id, id, + tracker, )); } Self::with_tls_engine(|engine| engine.release_snapshot()); @@ -799,9 +839,12 @@ impl Storage { access_locks, region_id, id, + tracker, ) = req_snap; - match snap.await { - Ok(snapshot) => { + let snap_res = snap.await; + set_tls_tracker_token(tracker); + match snap_res { + Ok(snapshot) => Self::with_perf_context(CMD, || { let buckets = snapshot.ext().get_buckets(); match PointGetterBuilder::new(snapshot, start_ts) .fill_cache(fill_cache) @@ -811,10 +854,8 @@ impl Storage { .build() { Ok(mut point_getter) => { - let perf_statistics = ReadPerfInstant::new(); let v = point_getter.get(&key); let stat = point_getter.take_statistics(); - let delta = perf_statistics.delta(); 
metrics::tls_collect_read_flow( region_id, Some(key.as_encoded()), @@ -822,12 +863,11 @@ impl Storage { &stat, buckets.as_ref(), ); - metrics::tls_collect_perf_stats(CMD, &delta); statistics.add(&stat); consumer.consume( id, v.map_err(|e| Error::from(txn::Error::from(e))) - .map(|v| (v, stat, delta)), + .map(|v| (v, stat)), begin_instant, ); } @@ -839,7 +879,7 @@ impl Storage { ); } } - } + }), Err(e) => { consumer.consume(id, Err(e), begin_instant); } @@ -933,9 +973,8 @@ impl Storage { let stage_snap_recv_ts = begin_instant; let mut statistics = Vec::with_capacity(keys.len()); let buckets = snapshot.ext().get_buckets(); - let (result, delta, stats) = { + let (result, stats) = Self::with_perf_context(CMD, || { let _guard = sample.observe_cpu(); - let perf_statistics = ReadPerfInstant::new(); let snap_store = SnapshotStore::new( snapshot, start_ts, @@ -976,11 +1015,9 @@ impl Storage { .observe(kv_pairs.len() as f64); kv_pairs }); - let delta = perf_statistics.delta(); - (result, delta, stats) - }; + (result, stats) + }); metrics::tls_collect_scan_details(CMD, &stats); - metrics::tls_collect_perf_stats(CMD, &delta); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) .observe(begin_instant.saturating_elapsed_secs()); @@ -1018,7 +1055,6 @@ impl Storage { result?, KvGetStatistics { stats, - perf_stats: delta, latency_stats, }, )) @@ -1154,9 +1190,8 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; - { + Self::with_perf_context(CMD, || { let begin_instant = Instant::now(); - let perf_statistics = ReadPerfInstant::new(); let buckets = snapshot.ext().get_buckets(); let snap_store = SnapshotStore::new( @@ -1174,7 +1209,6 @@ impl Storage { let res = scanner.scan(limit, sample_step); let statistics = scanner.take_statistics(); - let delta = perf_statistics.delta(); metrics::tls_collect_scan_details(CMD, &statistics); metrics::tls_collect_read_flow( ctx.get_region_id(), @@ -1183,7 +1217,6 @@ impl Storage { &statistics, 
buckets.as_ref(), ); - metrics::tls_collect_perf_stats(CMD, &delta); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) .observe(begin_instant.saturating_elapsed_secs()); @@ -1200,7 +1233,7 @@ impl Storage { .map(|x| x.map_err(Error::from)) .collect() }) - } + }) } .in_resource_metering_tag(resource_tag), priority, @@ -1304,10 +1337,9 @@ impl Storage { let snapshot = Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await?; - { + Self::with_perf_context(CMD, || { let begin_instant = Instant::now(); let mut statistics = Statistics::default(); - let perf_statistics = ReadPerfInstant::new(); let buckets = snapshot.ext().get_buckets(); let mut reader = MvccReader::new( snapshot, @@ -1331,7 +1363,6 @@ impl Storage { locks.push(lock_info); } - let delta = perf_statistics.delta(); metrics::tls_collect_scan_details(CMD, &statistics); metrics::tls_collect_read_flow( ctx.get_region_id(), @@ -1340,7 +1371,6 @@ impl Storage { &statistics, buckets.as_ref(), ); - metrics::tls_collect_perf_stats(CMD, &delta); SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(CMD) .observe(begin_instant.saturating_elapsed_secs()); @@ -1349,7 +1379,7 @@ impl Storage { .observe(command_duration.saturating_elapsed_secs()); Ok(locks) - } + }) } .in_resource_metering_tag(resource_tag), priority, @@ -3029,11 +3059,11 @@ pub mod test_util { } } - impl ResponseBatchConsumer<(Option>, Statistics, ReadPerfContext)> for GetConsumer { + impl ResponseBatchConsumer<(Option>, Statistics)> for GetConsumer { fn consume( &self, id: u64, - res: Result<(Option>, Statistics, ReadPerfContext)>, + res: Result<(Option>, Statistics)>, _: tikv_util::time::Instant, ) { self.data.lock().unwrap().push(GetResult { @@ -3060,7 +3090,6 @@ pub mod test_util { #[derive(Debug, Default, Clone)] pub struct KvGetStatistics { pub stats: Statistics, - pub perf_stats: ReadPerfContext, pub latency_stats: StageLatencyStats, } @@ -3085,6 +3114,7 @@ mod tests { use futures::executor::block_on; use kvproto::kvrpcpb::{AssertionLevel, 
CommandPri, Op}; use tikv_util::config::ReadableSize; + use tracker::INVALID_TRACKER_TOKEN; use txn_types::{Mutation, PessimisticLock, WriteType}; use super::{ @@ -3277,6 +3307,7 @@ mod tests { block_on(storage.batch_get_command( vec![create_get_request(b"c", 1), create_get_request(b"d", 1)], vec![1, 2], + vec![INVALID_TRACKER_TOKEN; 2], consumer.clone(), Instant::now(), )) @@ -3964,6 +3995,7 @@ mod tests { block_on(storage.batch_get_command( vec![create_get_request(b"c", 2), create_get_request(b"d", 2)], vec![1, 2], + vec![INVALID_TRACKER_TOKEN; 2], consumer.clone(), Instant::now(), )) @@ -4004,6 +4036,7 @@ mod tests { create_get_request(b"b", 5), ], vec![1, 2, 3, 4], + vec![INVALID_TRACKER_TOKEN; 4], consumer.clone(), Instant::now(), )) @@ -7736,6 +7769,7 @@ mod tests { block_on(storage.batch_get_command( vec![req1.clone(), req2], vec![1, 2], + vec![INVALID_TRACKER_TOKEN; 2], consumer.clone(), Instant::now(), )) @@ -7809,8 +7843,14 @@ mod tests { req.set_key(k1.clone()); req.set_version(110); let consumer = GetConsumer::new(); - block_on(storage.batch_get_command(vec![req], vec![1], consumer.clone(), Instant::now())) - .unwrap(); + block_on(storage.batch_get_command( + vec![req], + vec![1], + vec![INVALID_TRACKER_TOKEN], + consumer.clone(), + Instant::now(), + )) + .unwrap(); let res = consumer.take_data(); assert_eq!(res.len(), 1); assert_eq!(res[0].as_ref().unwrap(), &Some(v1.clone())); From c17e29b82b790d7e8379480384c4a3625d722297 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 10 Jun 2022 18:14:30 +0800 Subject: [PATCH 020/676] engine: deprecate gc-merge-rewrite option (#12798) close tikv/tikv#12797 Deprecate gc-merge-rewrite option. 
Signed-off-by: tabokie --- etc/config-template.toml | 17 +++++++++-------- src/config.rs | 20 +++++++++++++++++++- tests/integrations/config/mod.rs | 2 +- tests/integrations/config/test-custom.toml | 2 -- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 1e673fbc3fa..ab2ffa28acf 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -822,10 +822,15 @@ ## default: 0.5 # discardable-ratio = 0.5 -## The mode used to process blob files. In read-only mode Titan -## stops writing value into blob log. In fallback mode Titan -## converts blob index into real value on flush and compaction. -## This option is especially useful for downgrading Titan. +## The mode used to process blob files. In read-only mode Titan stops writing +## value into blob log. In fallback mode Titan converts blob index into real +## value on flush and compaction. +## +## This option can be used to disable Titan. More specifically, to disable +## Titan, set this option to fallback and perform a full compaction using +## tikv-ctl. Then, monitor the blob file size metrics. After the blob file size +## decreases to 0, you can set rocksdb.titan.enabled to false and restart TiKV. +## ## default: kNormal ## read-only: kReadOnly ## fallback: kFallback @@ -844,10 +849,6 @@ ## default: false # level-merge = false -## Use merge operator to rewrite GC blob index. -## default: false -# gc-merge-rewrite = false - ## Options for "Write" Column Family, which stores MVCC commit information [rocksdb.writecf] ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. diff --git a/src/config.rs b/src/config.rs index 37278fd09e2..d37e0892082 100644 --- a/src/config.rs +++ b/src/config.rs @@ -139,7 +139,10 @@ pub struct TitanCfConfig { pub range_merge: bool, #[online_config(skip)] pub max_sorted_runs: i32, + // deprecated. 
#[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] pub gc_merge_rewrite: bool, } @@ -178,9 +181,23 @@ impl TitanCfConfig { opts.set_level_merge(self.level_merge); opts.set_range_merge(self.range_merge); opts.set_max_sorted_runs(self.max_sorted_runs); - opts.set_gc_merge_rewrite(self.gc_merge_rewrite); opts } + + fn validate(&self) -> Result<(), Box> { + if self.gc_merge_rewrite { + return Err( + "gc-merge-rewrite is deprecated. The data produced when this \ + option is enabled cannot be read by this version. Therefore, if \ + this option has been applied to an existing node, you must downgrade \ + it to the previous version and fully clean up the old data. See more \ + details of how to do that in the documentation for the blob-run-mode \ + confuguration." + .into(), + ); + } + Ok(()) + } } #[derive(Clone, Copy, Debug, Eq, PartialEq)] @@ -332,6 +349,7 @@ macro_rules! cf_config { ) .into()); } + self.titan.validate()?; Ok(()) } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 3bd932262e5..589b0ff7a56 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -256,7 +256,7 @@ fn test_serde_custom_tikv_config() { level_merge: true, range_merge: true, max_sorted_runs: 100, - gc_merge_rewrite: true, + gc_merge_rewrite: false, }; let titan_db_config = TitanDBConfig { enabled: true, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index d02aebc4df3..36b82b056f1 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -334,7 +334,6 @@ blob-run-mode = "fallback" level-merge = true range-merge = true max-sorted-runs = 100 -gc-merge-rewrite = true [rocksdb.writecf] block-size = "12KB" @@ -566,7 +565,6 @@ blob-run-mode = "fallback" level-merge = true range-merge = true max-sorted-runs = 100 -gc-merge-rewrite = true [raft-engine] enable = false From 265dbd2b1a3ec9c2c16224e84f2584f6534065fe 
Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 10 Jun 2022 07:48:31 -0700 Subject: [PATCH 021/676] update several deps (#12775) close tikv/tikv#12773 replace signal with signal-hook and update other deps by depbot. Signed-off-by: Jay Lee Co-authored-by: zhangjinpeng1987 --- Cargo.lock | 170 ++++------- cmd/tikv-ctl/Cargo.toml | 3 +- components/external_storage/export/Cargo.toml | 7 +- .../export/src/bin/tikv-cloud-storage.rs | 17 +- components/file_system/Cargo.toml | 1 - components/keys/Cargo.toml | 1 - components/resource_metering/Cargo.toml | 3 - components/server/Cargo.toml | 3 +- components/server/src/signal_handler.rs | 14 +- components/tidb_query_datatype/Cargo.toml | 2 +- .../src/simple_aggr_executor.rs | 2 +- components/tidb_query_expr/src/impl_cast.rs | 11 +- components/tidb_query_expr/src/impl_math.rs | 287 ++++++++++-------- .../tidb_query_expr/src/impl_miscellaneous.rs | 17 +- components/tidb_query_expr/src/impl_op.rs | 19 +- components/tikv_util/Cargo.toml | 2 +- 16 files changed, 284 insertions(+), 275 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6691467f359..19ccbcc72c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -633,6 +633,12 @@ version = "3.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12ae9db68ad7fac5fe51304d20f016c911539251075a214f8e663babefa35187" +[[package]] +name = "bytemuck" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdead85bdec19c194affaeeb670c0e41fe23de31459efd1c174d049269cf02cc" + [[package]] name = "byteorder" version = "1.3.4" @@ -1091,16 +1097,16 @@ dependencies = [ [[package]] name = "crossbeam" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd01a6eb3daaafa260f6fc94c3a6c36390abc2080e38e3e34ced87393fb77d80" +checksum = "4ae5588f6b3c3cb05239e90bd110f257254aecd01e4635400391aeae07497845" dependencies = [ "cfg-if 1.0.0", "crossbeam-channel", "crossbeam-deque", - 
"crossbeam-epoch 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch 0.9.8", "crossbeam-queue", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] @@ -1110,7 +1116,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] @@ -1120,18 +1126,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-epoch 0.9.8", + "crossbeam-utils 0.8.8", ] [[package]] name = "crossbeam-epoch" version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" +source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.3", "lazy_static", "memoffset", "scopeguard", @@ -1139,11 +1144,13 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1145cf131a2c6ba0615079ab6a638f7e1973ac9c2634fcbeaaad6114246efe8c" dependencies = [ + "autocfg", "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", + 
"crossbeam-utils 0.8.8", "lazy_static", "memoffset", "scopeguard", @@ -1151,12 +1158,12 @@ dependencies = [ [[package]] name = "crossbeam-queue" -version = "0.3.1" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f6cb3c7f5b8e51bc3ebb73a2327ad4abdbd119dc13223f14f961d2f38486756" +checksum = "1f25d8400f4a7a5778f0e4e52384a48cbd9b5c495d110786187fc750075277a2" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", ] [[package]] @@ -1165,8 +1172,8 @@ version = "0.0.0" source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", - "crossbeam-utils 0.8.3 (git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0)", + "crossbeam-epoch 0.9.3", + "crossbeam-utils 0.8.3", "scopeguard", ] @@ -1184,8 +1191,7 @@ dependencies = [ [[package]] name = "crossbeam-utils" version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" +source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" dependencies = [ "autocfg", "cfg-if 1.0.0", @@ -1194,10 +1200,10 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf124c720b7686e3c2663cf54062ab0f68a88af2fb6a030e87e30bf721fcb38" dependencies = [ - "autocfg", "cfg-if 1.0.0", "lazy_static", ] @@ -1693,11 +1699,11 @@ dependencies = [ "libc 0.2.125", "libloading", "matches", - "nix 0.23.0", + "nix", "once_cell", "protobuf", "rust-ini", - "signal", + "signal-hook", "slog", 
"slog-global", "slog-term", @@ -1743,12 +1749,11 @@ dependencies = [ "bcc", "collections", "crc32fast", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", "fs2", "lazy_static", "libc 0.2.125", "maligned", - "nix 0.23.0", "online_config", "openssl", "parking_lot 0.12.0", @@ -2613,7 +2618,6 @@ dependencies = [ "panic_hook", "thiserror", "tikv_alloc", - "tikv_util", ] [[package]] @@ -3113,32 +3117,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "nix" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "becb657d662f1cd2ef38c7ad480ec6b8cf9e96b27adb543e594f9cf0f2e6065c" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc 0.2.125", - "void", -] - -[[package]] -name = "nix" -version = "0.23.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f305c2c2e4c39a82f7bf0bf65fb557f9070ce06781d4f2454295cc34b1c43188" -dependencies = [ - "bitflags", - "cc", - "cfg-if 1.0.0", - "libc 0.2.125", - "memoffset", -] - [[package]] name = "nix" version = "0.24.1" @@ -3196,9 +3174,9 @@ dependencies = [ [[package]] name = "notify" -version = "4.0.16" +version = "4.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2599080e87c9bd051ddb11b10074f4da7b1223298df65d4c2ec5bcf309af1533" +checksum = "ae03c8c853dba7bfd23e571ff0cff7bc9dceb40a4cd684cd1681824183f45257" dependencies = [ "bitflags", "filetime", @@ -3449,9 +3427,9 @@ checksum = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de" [[package]] name = "openssl-src" -version = "111.17.0+1.1.1m" +version = "111.20.0+1.1.1o" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d6a336abd10814198f66e2a91ccd7336611f30334119ca8ce300536666fcf4" +checksum = "92892c4f87d56e376e469ace79f1128fdaded07646ddf73aa0be4706ff712dec" dependencies = [ "cc", ] @@ -3472,18 +3450,9 @@ dependencies = [ [[package]] name = "ordered-float" 
-version = "1.1.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" -dependencies = [ - "num-traits", -] - -[[package]] -name = "ordered-float" -version = "2.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "039f02eb0f69271f26abe3202189275d7aa2258b903cb0281b5de710a2570ff3" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" dependencies = [ "num-traits", ] @@ -3800,7 +3769,7 @@ dependencies = [ "inferno", "libc 0.2.125", "log", - "nix 0.24.1", + "nix", "once_cell", "parking_lot 0.12.0", "protobuf", @@ -4069,7 +4038,7 @@ dependencies = [ "log", "lz4-sys", "memmap2", - "nix 0.24.1", + "nix", "num-derive", "num-traits", "parking_lot 0.12.0", @@ -4162,7 +4131,7 @@ dependencies = [ "memory_trace_macros", "online_config", "openssl", - "ordered-float 2.7.0", + "ordered-float", "panic_hook", "parking_lot 0.12.0", "pd_client", @@ -4349,7 +4318,7 @@ checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.3 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.8.8", "lazy_static", "num_cpus", ] @@ -4390,9 +4359,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.5.4" +version = "1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "d83f127d94bdbcda4c8cc2e50f6f84f4b611f69c902699ca385a39c3a75f9ff1" dependencies = [ "aho-corasick", "memchr", @@ -4410,9 +4379,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = 
"49b3de9ec5dc0a3417da371aab17d729997c15010e7fd24ff707773a33bddb64" [[package]] name = "remove_dir_all" @@ -4517,7 +4486,6 @@ dependencies = [ "serde_derive", "slog", "slog-global", - "thread-id", "tikv_util", ] @@ -4529,9 +4497,12 @@ checksum = "18eb52b6664d331053136fcac7e4883bdc6f5fc04a6aab3b0f75eafb80ab88b3" [[package]] name = "rgb" -version = "0.8.14" +version = "0.8.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089e4031214d129e201f8c3c8c2fe97cd7322478a0d1cdf78e7029b0042efdb" +checksum = "e74fdc210d8f24a7dbfedc13b04ba5764f5232754ccebfdf5fff1bad791ccbc6" +dependencies = [ + "bytemuck", +] [[package]] name = "rhai" @@ -5049,7 +5020,6 @@ dependencies = [ "libc 0.2.125", "log", "log_wrappers", - "nix 0.23.0", "pd_client", "prometheus", "protobuf", @@ -5061,7 +5031,7 @@ dependencies = [ "resource_metering", "security", "serde_json", - "signal", + "signal-hook", "slog", "slog-global", "tempfile", @@ -5100,20 +5070,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] -name = "signal" -version = "0.6.0" +name = "signal-hook" +version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106428d9d96840ecdec5208c13ab8a4e28c38da1e0ccf2909fb44e41b992f897" +checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ "libc 0.2.125", - "nix 0.11.1", + "signal-hook-registry", ] [[package]] name = "signal-hook-registry" -version = "1.2.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce32ea0c6c56d5eacaeb814fbed9960547021d3edd010ded1425f180536b20ab" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc 0.2.125", ] @@ -5805,17 +5775,6 @@ dependencies = [ "syn", ] -[[package]] -name = "thread-id" -version = "4.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fdfe0627923f7411a43ec9ec9c39c3a9b4151be313e0922042581fb6c9b717f" -dependencies = [ - "libc 0.2.125", - "redox_syscall 0.2.11", - "winapi 0.3.9", -] - [[package]] name = "thread_local" version = "1.1.4" @@ -5893,7 +5852,7 @@ dependencies = [ "num 0.3.0", "num-derive", "num-traits", - "ordered-float 1.1.1", + "ordered-float", "protobuf", "regex", "serde", @@ -6120,7 +6079,6 @@ dependencies = [ "libc 0.2.125", "log", "log_wrappers", - "nix 0.23.0", "pd_client", "prometheus", "protobuf", @@ -6133,7 +6091,7 @@ dependencies = [ "security", "serde_json", "server", - "signal", + "signal-hook", "slog", "slog-global", "structopt", @@ -6265,7 +6223,7 @@ dependencies = [ "libc 0.2.125", "log", "log_wrappers", - "nix 0.23.0", + "nix", "num-traits", "num_cpus", "online_config", @@ -6794,12 +6752,6 @@ dependencies = [ "syn", ] -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" - [[package]] name = "walkdir" version = "2.3.1" diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 9292df06fca..13d8b351e21 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -66,7 +66,6 @@ kvproto = { git = "https://github.com/pingcap/kvproto.git" } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { path = "../../components/log_wrappers" } -nix = "0.23" pd_client = { path = "../../components/pd_client", default-features = false } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } @@ -95,4 +94,4 @@ cc = "1.0" time = "0.1" [target.'cfg(unix)'.dependencies] -signal = "0.6" +signal-hook = "0.3" diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index d67e2b7a15f..1f75af2734a 100644 --- 
a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -40,13 +40,12 @@ cloud-storage-grpc = [ "futures", "futures-executor", "libc", - "signal", + "signal-hook", "slog", "slog-global", "slog-term", "tokio", "tokio-util", - "nix", ] [dependencies] @@ -89,8 +88,8 @@ name = "scli" path = "examples/scli.rs" [target.'cfg(unix)'.dependencies] -nix = { optional = true, version = "0.23" } -signal = { optional = true, version = "0.6" } +nix = { optional = true, version = "0.24" } +signal-hook = { optional = true, version = "0.3" } libc = { optional = true, version = "0.2" } slog = { optional = true, version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-term = { optional = true, version = "2.4" } diff --git a/components/external_storage/export/src/bin/tikv-cloud-storage.rs b/components/external_storage/export/src/bin/tikv-cloud-storage.rs index 3011a5079d1..07cd8507948 100644 --- a/components/external_storage/export/src/bin/tikv-cloud-storage.rs +++ b/components/external_storage/export/src/bin/tikv-cloud-storage.rs @@ -33,16 +33,19 @@ fn main() { #[cfg(unix)] mod wait { use libc::c_int; - use nix::sys::signal::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}; - use signal::trap::Trap; + use signal_hook::{ + consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, + iterator::Signals, + Signals, + }; use slog_global::info; pub fn for_signal() { - let trap = Trap::trap(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]); - for sig in trap { - match sig { - SIGUSR1 | SIGTERM | SIGINT | SIGHUP => { - info!("receive signal {}, stopping server...", sig as c_int); + let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP]).unwrap(); + for signal in &mut signals { + match signal { + SIGTERM | SIGINT | SIGHUP => { + info!("receive signal {}, stopping server...", signal); break; } // TODO: handle more signals diff --git a/components/file_system/Cargo.toml b/components/file_system/Cargo.toml index aa1cb56a991..e3924c0fc25 
100644 --- a/components/file_system/Cargo.toml +++ b/components/file_system/Cargo.toml @@ -14,7 +14,6 @@ crossbeam-utils = "0.8.0" fs2 = "0.4" lazy_static = "1.3" libc = "0.2" -nix = "0.23" online_config = { path = "../online_config" } openssl = "0.10" parking_lot = "0.12" diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index de1a7089ce4..a9bd4ddbf18 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -10,7 +10,6 @@ kvproto = { git = "https://github.com/pingcap/kvproto.git" } log_wrappers = { path = "../log_wrappers" } thiserror = "1.0" tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } [dev-dependencies] panic_hook = { path = "../panic_hook" } diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index cecaa3c911b..72a0c0dc339 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -25,9 +25,6 @@ tikv_util = { path = "../tikv_util" } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } -[target.'cfg(not(target_os = "linux"))'.dependencies] -thread-id = "4" - [dev-dependencies] rand = "0.8" diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index b53fde02cef..650f9f6932b 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -60,7 +60,6 @@ kvproto = { git = "https://github.com/pingcap/kvproto.git" } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { path = "../log_wrappers" } -nix = "0.23" pd_client = { path = "../pd_client", default-features = false } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } @@ -84,4 +83,4 @@ txn_types = { path = "../txn_types", default-features = false } yatp = { git = 
"https://github.com/tikv/yatp.git", branch = "master" } [target.'cfg(unix)'.dependencies] -signal = "0.6" +signal-hook = "0.3" diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index 5b73154241b..88c2ddac9f4 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -5,17 +5,19 @@ pub use self::imp::wait_for_signal; #[cfg(unix)] mod imp { use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; - use libc::c_int; - use signal::{trap::Trap, Signal::*}; + use signal_hook::{ + consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, + iterator::Signals, + }; use tikv_util::metrics; #[allow(dead_code)] pub fn wait_for_signal(engines: Option>) { - let trap = Trap::trap(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]); - for sig in trap { - match sig { + let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); + for signal in &mut signals { + match signal { SIGTERM | SIGINT | SIGHUP => { - info!("receive signal {}, stopping server...", sig as c_int); + info!("receive signal {}, stopping server...", signal); break; } SIGUSR1 => { diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index 698ebc8049c..56acb353302 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -25,7 +25,7 @@ nom = { version = "5.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } num-derive = "0.3" num-traits = "0.2" -ordered-float = "1.0" +ordered-float = "2.0" protobuf = "2" regex = "1.1" serde = "1.0" diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index 325082f42d6..1e1dd48929b 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -309,7 +309,7 @@ mod tests { Self 
{ rows_with_null: 0, rows_without_null: 0, - sum: Real::from(0.0), + sum: Real::new(0.0).unwrap(), } } } diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 718bb9b3fa5..f6d6af4eb02 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1833,10 +1833,13 @@ mod tests { let cs = vec![ // (input, expect) - (EnumRef::new("enum".as_bytes(), &0), Real::from(0.)), - (EnumRef::new("int".as_bytes(), &1), Real::from(1.)), - (EnumRef::new("real".as_bytes(), &2), Real::from(2.)), - (EnumRef::new("string".as_bytes(), &3), Real::from(3.)), + (EnumRef::new("enum".as_bytes(), &0), Real::new(0.).unwrap()), + (EnumRef::new("int".as_bytes(), &1), Real::new(1.).unwrap()), + (EnumRef::new("real".as_bytes(), &2), Real::new(2.).unwrap()), + ( + EnumRef::new("string".as_bytes(), &3), + Real::new(3.).unwrap(), + ), ]; for (input, expect) in cs { diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index 5ed1973c5d6..798ca2b9c6a 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -22,7 +22,7 @@ const MAX_RAND_VALUE: u32 = 0x3FFFFFFF; #[rpn_fn] #[inline] pub fn pi() -> Result> { - Ok(Some(Real::from(std::f64::consts::PI))) + Ok(Some(Real::new(std::f64::consts::PI).unwrap())) } #[rpn_fn] @@ -65,7 +65,7 @@ pub fn log10(arg: &Real) -> Result> { // If the given f64 is finite, returns `Some(Real)`. Otherwise returns None. 
fn f64_to_real(n: f64) -> Option { if n.is_finite() { - Some(Real::from(n)) + Some(Real::new(n).unwrap()) } else { None } @@ -92,7 +92,7 @@ impl Ceil for CeilReal { #[inline] fn ceil(_ctx: &mut EvalContext, arg: &Self::Input) -> Result> { - Ok(Some(Real::from(arg.ceil()))) + Ok(Some(Real::new(arg.ceil()).unwrap())) } } @@ -167,7 +167,7 @@ impl Floor for FloorReal { #[inline] fn floor(_ctx: &mut EvalContext, arg: &Self::Input) -> Result> { - Ok(Some(Real::from(arg.floor()))) + Ok(Some(Real::new(arg.floor()).unwrap())) } } @@ -272,11 +272,7 @@ fn sqrt(arg: &Real) -> Result> { None } else { let res = arg.sqrt(); - if res.is_nan() { - None - } else { - Some(Real::from(res)) - } + Real::new(res).ok() } }) } @@ -485,11 +481,11 @@ fn truncate_real(x: Real, d: i32) -> Real { let shift = 10_f64.powi(d); let tmp = x * shift; if *tmp == 0_f64 { - Real::from(0_f64) + Real::new(0_f64).unwrap() } else if tmp.is_infinite() { x } else { - Real::from(tmp.trunc() / shift) + Real::new(tmp.trunc() / shift).unwrap() } } @@ -548,7 +544,7 @@ pub fn round_with_frac_real(arg0: &Real, arg1: &Int) -> Result> { let digits = arg1; let power = 10.0_f64.powi(-digits as i32); let frac = *number / power; - Ok(Some(Real::from(frac.round() * power))) + Ok(Some(Real::new(frac.round() * power).unwrap())) } thread_local! 
{ @@ -715,7 +711,7 @@ mod tests { let output = RpnFnScalarEvaluator::new() .evaluate(ScalarFuncSig::Pi) .unwrap(); - assert_eq!(output, Some(Real::from(std::f64::consts::PI))); + assert_eq!(output, Some(Real::new(std::f64::consts::PI).unwrap())); } #[test] @@ -743,8 +739,8 @@ mod tests { #[test] fn test_log_1_arg() { let test_cases = vec![ - (Some(std::f64::consts::E), Some(Real::from(1.0_f64))), - (Some(100.0), Some(Real::from(4.605170185988092_f64))), + (Some(std::f64::consts::E), Some(Real::new(1.0_f64).unwrap())), + (Some(100.0), Some(Real::new(4.605170185988092_f64).unwrap())), (Some(-1.0), None), (Some(0.0), None), (None, None), @@ -761,9 +757,21 @@ mod tests { #[test] fn test_log_2_arg() { let test_cases = vec![ - (Some(10.0_f64), Some(100.0_f64), Some(Real::from(2.0_f64))), - (Some(2.0_f64), Some(1.0_f64), Some(Real::from(0.0_f64))), - (Some(0.5_f64), Some(0.25_f64), Some(Real::from(2.0_f64))), + ( + Some(10.0_f64), + Some(100.0_f64), + Some(Real::new(2.0_f64).unwrap()), + ), + ( + Some(2.0_f64), + Some(1.0_f64), + Some(Real::new(0.0_f64).unwrap()), + ), + ( + Some(0.5_f64), + Some(0.25_f64), + Some(Real::new(2.0_f64).unwrap()), + ), (Some(-0.23323_f64), Some(2.0_f64), None), (Some(0_f64), Some(123_f64), None), (Some(1_f64), Some(123_f64), None), @@ -785,8 +793,8 @@ mod tests { #[test] fn test_log2() { let test_cases = vec![ - (Some(16_f64), Some(Real::from(4_f64))), - (Some(5_f64), Some(Real::from(2.321928094887362_f64))), + (Some(16_f64), Some(Real::new(4_f64).unwrap())), + (Some(5_f64), Some(Real::new(2.321928094887362_f64).unwrap())), (Some(-1.234_f64), None), (Some(0_f64), None), (None, None), @@ -803,8 +811,11 @@ mod tests { #[test] fn test_log10() { let test_cases = vec![ - (Some(100_f64), Some(Real::from(2_f64))), - (Some(101_f64), Some(Real::from(2.0043213737826426_f64))), + (Some(100_f64), Some(Real::new(2_f64).unwrap())), + ( + Some(101_f64), + Some(Real::new(2.0043213737826426_f64).unwrap()), + ), (Some(-1.234_f64), None), (Some(0_f64), None), 
(None, None), @@ -887,7 +898,7 @@ mod tests { (f64::MIN, f64::MIN), ]; for (expected, input) in cases { - let arg = Real::from(input); + let arg = Real::new(input).unwrap(); let expected = Real::new(expected).ok(); let output = RpnFnScalarEvaluator::new() .push_param(arg) @@ -1004,7 +1015,7 @@ mod tests { (f64::MIN, f64::MIN), ]; for (input, expected) in cases { - let arg = Real::from(input); + let arg = Real::new(input).unwrap(); let expected = Real::new(expected).ok(); let output = RpnFnScalarEvaluator::new() .push_param(arg) @@ -1122,8 +1133,11 @@ mod tests { fn test_sqrt() { let test_cases = vec![ (None, None), - (Some(64f64), Some(Real::from(8f64))), - (Some(2f64), Some(Real::from(std::f64::consts::SQRT_2))), + (Some(64f64), Some(Real::new(8f64).unwrap())), + ( + Some(2f64), + Some(Real::new(std::f64::consts::SQRT_2).unwrap()), + ), (Some(-16f64), None), (Some(f64::NAN), None), ]; @@ -1140,14 +1154,17 @@ mod tests { fn test_radians() { let test_cases = vec![ (None, None), - (Some(0_f64), Some(Real::from(0_f64))), - (Some(180_f64), Some(Real::from(std::f64::consts::PI))), + (Some(0_f64), Some(Real::new(0_f64).unwrap())), + ( + Some(180_f64), + Some(Real::new(std::f64::consts::PI).unwrap()), + ), ( Some(-360_f64), - Some(Real::from(-2_f64 * std::f64::consts::PI)), + Some(Real::new(-2_f64 * std::f64::consts::PI).unwrap()), ), (Some(f64::NAN), None), - (Some(f64::INFINITY), Some(Real::from(f64::INFINITY))), + (Some(f64::INFINITY), Some(Real::new(f64::INFINITY).unwrap())), ]; for (input, expect) in test_cases { let output = RpnFnScalarEvaluator::new() @@ -1168,17 +1185,17 @@ mod tests { ]; for (x, expected) in tests { let output = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(x))) + .push_param(Some(Real::new(x).unwrap())) .evaluate(ScalarFuncSig::Exp) .unwrap(); - assert_eq!(output, Some(Real::from(expected))); + assert_eq!(output, Some(Real::new(expected).unwrap())); } test_unary_func_ok_none::(ScalarFuncSig::Exp); let overflow_tests = 
vec![100000_f64]; for x in overflow_tests { let output: Result> = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(x))) + .push_param(Some(Real::new(x).unwrap())) .evaluate(ScalarFuncSig::Exp); assert!(output.is_err()); } @@ -1189,12 +1206,15 @@ mod tests { let tests_cases = vec![ (None, None), (Some(f64::NAN), None), - (Some(0f64), Some(Real::from(0f64))), - (Some(1f64), Some(Real::from(57.29577951308232_f64))), - (Some(std::f64::consts::PI), Some(Real::from(180.0_f64))), + (Some(0f64), Some(Real::new(0f64).unwrap())), + (Some(1f64), Some(Real::new(57.29577951308232_f64).unwrap())), + ( + Some(std::f64::consts::PI), + Some(Real::new(180.0_f64).unwrap()), + ), ( Some(-std::f64::consts::PI / 2.0_f64), - Some(Real::from(-90.0_f64)), + Some(Real::new(-90.0_f64).unwrap()), ), ]; for (input, expect) in tests_cases { @@ -1219,7 +1239,7 @@ mod tests { ]; for (input, expect) in valid_test_cases { let output: Option = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(input))) + .push_param(Some(Real::new(input).unwrap())) .evaluate(ScalarFuncSig::Sin) .unwrap(); assert!((output.unwrap().into_inner() - expect).abs() < f64::EPSILON); @@ -1236,7 +1256,7 @@ mod tests { ]; for (input, expect) in test_cases { let output: Option = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(input))) + .push_param(Some(Real::new(input).unwrap())) .evaluate(ScalarFuncSig::Cos) .unwrap(); assert!((output.unwrap().into_inner() - expect).abs() < f64::EPSILON); @@ -1257,7 +1277,7 @@ mod tests { ]; for (input, expect) in test_cases { let output: Option = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(input))) + .push_param(Some(Real::new(input).unwrap())) .evaluate(ScalarFuncSig::Tan) .unwrap(); assert!((output.unwrap().into_inner() - expect).abs() < f64::EPSILON); @@ -1284,14 +1304,14 @@ mod tests { ]; for (input, expect) in test_cases { let output: Option = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(input))) + 
.push_param(Some(Real::new(input).unwrap())) .evaluate(ScalarFuncSig::Cot) .unwrap(); assert!((output.unwrap().into_inner() - expect).abs() < f64::EPSILON); } assert!( RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(0.0_f64))) + .push_param(Some(Real::new(0.0_f64).unwrap())) .evaluate::(ScalarFuncSig::Cot) .is_err() ); @@ -1301,27 +1321,27 @@ mod tests { fn test_pow() { let cases = vec![ ( - Some(Real::from(1.0f64)), - Some(Real::from(3.0f64)), - Some(Real::from(1.0f64)), + Some(Real::new(1.0f64).unwrap()), + Some(Real::new(3.0f64).unwrap()), + Some(Real::new(1.0f64).unwrap()), ), ( - Some(Real::from(3.0f64)), - Some(Real::from(0.0f64)), - Some(Real::from(1.0f64)), + Some(Real::new(3.0f64).unwrap()), + Some(Real::new(0.0f64).unwrap()), + Some(Real::new(1.0f64).unwrap()), ), ( - Some(Real::from(2.0f64)), - Some(Real::from(4.0f64)), - Some(Real::from(16.0f64)), + Some(Real::new(2.0f64).unwrap()), + Some(Real::new(4.0f64).unwrap()), + Some(Real::new(16.0f64).unwrap()), ), ( - Some(Real::from(f64::INFINITY)), - Some(Real::from(0.0f64)), - Some(Real::from(1.0f64)), + Some(Real::new(f64::INFINITY).unwrap()), + Some(Real::new(0.0f64).unwrap()), + Some(Real::new(1.0f64).unwrap()), ), - (Some(Real::from(4.0f64)), None, None), - (None, Some(Real::from(4.0f64)), None), + (Some(Real::new(4.0f64).unwrap()), None, None), + (None, Some(Real::new(4.0f64).unwrap()), None), (None, None, None), ]; @@ -1336,10 +1356,13 @@ mod tests { let invalid_cases = vec![ ( - Some(Real::from(f64::INFINITY)), - Some(Real::from(f64::INFINITY)), + Some(Real::new(f64::INFINITY).unwrap()), + Some(Real::new(f64::INFINITY).unwrap()), + ), + ( + Some(Real::new(0.0f64).unwrap()), + Some(Real::new(-9999999.0f64).unwrap()), ), - (Some(Real::from(0.0f64)), Some(Real::from(-9999999.0f64))), ]; for (lhs, rhs) in invalid_cases { @@ -1364,10 +1387,10 @@ mod tests { .unwrap() .unwrap(); - assert!(got1 < Real::from(1.0)); - assert!(got1 >= Real::from(0.0)); - assert!(got2 < Real::from(1.0)); - 
assert!(got2 >= Real::from(0.0)); + assert!(got1 < Real::new(1.0).unwrap()); + assert!(got1 >= Real::new(0.0).unwrap()); + assert!(got2 < Real::new(1.0).unwrap()); + assert!(got2 >= Real::new(0.0).unwrap()); assert_ne!(got1, got2); } @@ -1392,7 +1415,7 @@ mod tests { .evaluate::(ScalarFuncSig::RandWithSeedFirstGen) .unwrap() .unwrap(); - assert_eq!(got, Real::from(exp)); + assert_eq!(got, Real::new(exp).unwrap()); } let none_case_got = RpnFnScalarEvaluator::new() @@ -1400,24 +1423,27 @@ mod tests { .evaluate::(ScalarFuncSig::RandWithSeedFirstGen) .unwrap() .unwrap(); - assert_eq!(none_case_got, Real::from(0.15522042769493574)); + assert_eq!(none_case_got, Real::new(0.15522042769493574).unwrap()); } #[test] fn test_asin() { let test_cases = vec![ - (Some(Real::from(0.0_f64)), Some(Real::from(0.0_f64))), ( - Some(Real::from(1.0_f64)), - Some(Real::from(std::f64::consts::PI / 2.0_f64)), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), + ), + ( + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 2.0_f64).unwrap()), ), ( - Some(Real::from(-1.0_f64)), - Some(Real::from(-std::f64::consts::PI / 2.0_f64)), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(-std::f64::consts::PI / 2.0_f64).unwrap()), ), ( - Some(Real::from(std::f64::consts::SQRT_2 / 2.0_f64)), - Some(Real::from(std::f64::consts::PI / 4.0_f64)), + Some(Real::new(std::f64::consts::SQRT_2 / 2.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 4.0_f64).unwrap()), ), ]; for (input, expect) in test_cases { @@ -1428,9 +1454,9 @@ mod tests { assert!((output.unwrap() - expect.unwrap()).abs() < f64::EPSILON); } let invalid_test_cases = vec![ - (Some(Real::from(f64::INFINITY)), None), - (Some(Real::from(2.0_f64)), None), - (Some(Real::from(-2.0_f64)), None), + (Some(Real::new(f64::INFINITY).unwrap()), None), + (Some(Real::new(2.0_f64).unwrap()), None), + (Some(Real::new(-2.0_f64).unwrap()), None), ]; for (input, expect) in invalid_test_cases { let output: 
Option = RpnFnScalarEvaluator::new() @@ -1445,17 +1471,20 @@ mod tests { fn test_acos() { let test_cases = vec![ ( - Some(Real::from(0.0_f64)), - Some(Real::from(std::f64::consts::PI / 2.0_f64)), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 2.0_f64).unwrap()), + ), + ( + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), ), - (Some(Real::from(1.0_f64)), Some(Real::from(0.0_f64))), ( - Some(Real::from(-1.0_f64)), - Some(Real::from(std::f64::consts::PI)), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI).unwrap()), ), ( - Some(Real::from(std::f64::consts::SQRT_2 / 2.0_f64)), - Some(Real::from(std::f64::consts::PI / 4.0_f64)), + Some(Real::new(std::f64::consts::SQRT_2 / 2.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 4.0_f64).unwrap()), ), ]; for (input, expect) in test_cases { @@ -1466,9 +1495,9 @@ mod tests { assert!((output.unwrap() - expect.unwrap()).abs() < f64::EPSILON); } let invalid_test_cases = vec![ - (Some(Real::from(f64::INFINITY)), None), - (Some(Real::from(2.0_f64)), None), - (Some(Real::from(-2.0_f64)), None), + (Some(Real::new(f64::INFINITY).unwrap()), None), + (Some(Real::new(2.0_f64).unwrap()), None), + (Some(Real::new(-2.0_f64).unwrap()), None), ]; for (input, expect) in invalid_test_cases { let output: Option = RpnFnScalarEvaluator::new() @@ -1483,22 +1512,25 @@ mod tests { fn test_atan_1_arg() { let test_cases = vec![ ( - Some(Real::from(1.0_f64)), - Some(Real::from(std::f64::consts::PI / 4.0_f64)), + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 4.0_f64).unwrap()), ), ( - Some(Real::from(-1.0_f64)), - Some(Real::from(-std::f64::consts::PI / 4.0_f64)), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(-std::f64::consts::PI / 4.0_f64).unwrap()), ), ( - Some(Real::from(f64::MAX)), - Some(Real::from(std::f64::consts::PI / 2.0_f64)), + Some(Real::new(f64::MAX).unwrap()), + Some(Real::new(std::f64::consts::PI / 
2.0_f64).unwrap()), ), ( - Some(Real::from(f64::MIN)), - Some(Real::from(-std::f64::consts::PI / 2.0_f64)), + Some(Real::new(f64::MIN).unwrap()), + Some(Real::new(-std::f64::consts::PI / 2.0_f64).unwrap()), + ), + ( + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), ), - (Some(Real::from(0.0_f64)), Some(Real::from(0.0_f64))), ]; for (input, expect) in test_cases { let output: Option = RpnFnScalarEvaluator::new() @@ -1513,29 +1545,29 @@ mod tests { fn test_atan_2_args() { let test_cases = vec![ ( - Some(Real::from(0.0_f64)), - Some(Real::from(0.0_f64)), - Some(Real::from(0.0_f64)), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), ), ( - Some(Real::from(0.0_f64)), - Some(Real::from(-1.0_f64)), - Some(Real::from(std::f64::consts::PI)), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI).unwrap()), ), ( - Some(Real::from(1.0_f64)), - Some(Real::from(-1.0_f64)), - Some(Real::from(3.0_f64 * std::f64::consts::PI / 4.0_f64)), + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(3.0_f64 * std::f64::consts::PI / 4.0_f64).unwrap()), ), ( - Some(Real::from(-1.0_f64)), - Some(Real::from(1.0_f64)), - Some(Real::from(-std::f64::consts::PI / 4.0_f64)), + Some(Real::new(-1.0_f64).unwrap()), + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(-std::f64::consts::PI / 4.0_f64).unwrap()), ), ( - Some(Real::from(1.0_f64)), - Some(Real::from(0.0_f64)), - Some(Real::from(std::f64::consts::PI / 2.0_f64)), + Some(Real::new(1.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(std::f64::consts::PI / 2.0_f64).unwrap()), ), ]; for (arg0, arg1, expect) in test_cases { @@ -1602,9 +1634,18 @@ mod tests { #[test] fn test_round_real() { let test_cases = vec![ - (Some(Real::from(-3.12_f64)), Some(Real::from(-3f64))), - (Some(Real::from(f64::MAX)), Some(Real::from(f64::MAX))), - 
(Some(Real::from(f64::MIN)), Some(Real::from(f64::MIN))), + ( + Some(Real::new(-3.12_f64).unwrap()), + Some(Real::new(-3f64).unwrap()), + ), + ( + Some(Real::new(f64::MAX).unwrap()), + Some(Real::new(f64::MAX).unwrap()), + ), + ( + Some(Real::new(f64::MIN).unwrap()), + Some(Real::new(f64::MIN).unwrap()), + ), (None, None), ]; @@ -1758,12 +1799,12 @@ mod tests { .build(); let output = RpnFnScalarEvaluator::new() - .push_param(Some(Real::from(lhs))) + .push_param(Some(Real::new(lhs).unwrap())) .push_param_with_field_type(Some(rhs), rhs_field_type) .evaluate::(ScalarFuncSig::TruncateReal) .unwrap(); - assert_eq!(output, Some(Real::from(expected))); + assert_eq!(output, Some(Real::new(expected).unwrap())); } } @@ -1948,26 +1989,26 @@ mod tests { let real_cases = vec![ ( - Some(Real::from(-1.298_f64)), + Some(Real::new(-1.298_f64).unwrap()), Some(1), - Some(Real::from(-1.3_f64)), + Some(Real::new(-1.3_f64).unwrap()), ), ( - Some(Real::from(-1.298_f64)), + Some(Real::new(-1.298_f64).unwrap()), Some(0), - Some(Real::from(-1.0_f64)), + Some(Real::new(-1.0_f64).unwrap()), ), ( - Some(Real::from(23.298_f64)), + Some(Real::new(23.298_f64).unwrap()), Some(2), - Some(Real::from(23.30_f64)), + Some(Real::new(23.30_f64).unwrap()), ), ( - Some(Real::from(23.298_f64)), + Some(Real::new(23.298_f64).unwrap()), Some(-1), - Some(Real::from(20.0_f64)), + Some(Real::new(20.0_f64).unwrap()), ), - (Some(Real::from(23.298_f64)), None, None), + (Some(Real::new(23.298_f64).unwrap()), None, None), (None, Some(2), None), (None, None, None), ]; diff --git a/components/tidb_query_expr/src/impl_miscellaneous.rs b/components/tidb_query_expr/src/impl_miscellaneous.rs index 9a7492b6813..5d2daed7f9a 100644 --- a/components/tidb_query_expr/src/impl_miscellaneous.rs +++ b/components/tidb_query_expr/src/impl_miscellaneous.rs @@ -318,14 +318,21 @@ mod tests { fn test_real_any_value() { let test_cases = vec![ (vec![], None), - (vec![Real::from(1.2_f64)], Some(Real::from(1.2_f64))), ( - 
vec![Real::from(1.2_f64), Real::from(2.3_f64)], - Some(Real::from(1.2_f64)), + vec![Real::new(1.2_f64).unwrap()], + Some(Real::new(1.2_f64).unwrap()), ), ( - vec![Real::from(1.2_f64), Real::from(2.3_f64), Real::from(3_f64)], - Some(Real::from(1.2_f64)), + vec![Real::new(1.2_f64).unwrap(), Real::new(2.3_f64).unwrap()], + Some(Real::new(1.2_f64).unwrap()), + ), + ( + vec![ + Real::new(1.2_f64).unwrap(), + Real::new(2.3_f64).unwrap(), + Real::new(3_f64).unwrap(), + ], + Some(Real::new(1.2_f64).unwrap()), ), ]; diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 1b0ee419ef5..dce8920a545 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -440,12 +440,21 @@ mod tests { fn test_unary_minus_real() { let test_cases = vec![ (None, None), - (Some(Real::from(0.123_f64)), Some(Real::from(-0.123_f64))), - (Some(Real::from(-0.123_f64)), Some(Real::from(0.123_f64))), - (Some(Real::from(0.0_f64)), Some(Real::from(0.0_f64))), ( - Some(Real::from(f64::INFINITY)), - Some(Real::from(f64::NEG_INFINITY)), + Some(Real::new(0.123_f64).unwrap()), + Some(Real::new(-0.123_f64).unwrap()), + ), + ( + Some(Real::new(-0.123_f64).unwrap()), + Some(Real::new(0.123_f64).unwrap()), + ), + ( + Some(Real::new(0.0_f64).unwrap()), + Some(Real::new(0.0_f64).unwrap()), + ), + ( + Some(Real::new(f64::INFINITY).unwrap()), + Some(Real::new(f64::NEG_INFINITY).unwrap()), ), ]; for (arg, expect_output) in test_cases { diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 9bbea72d8d5..52d73429f4c 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -31,7 +31,7 @@ lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { path = "../log_wrappers" } -nix = "0.23" +nix = "0.24" num-traits = "0.2" num_cpus = "1" online_config = { path = "../online_config" } From 
050b6c077d130ee46616b018c30e3cb42890aee5 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 10 Jun 2022 23:08:31 +0800 Subject: [PATCH 022/676] server: check raft_client config change after flush (#12781) close tikv/tikv#12780 Signed-off-by: glorv Co-authored-by: zhangjinpeng1987 Co-authored-by: Ti Chi Robot --- src/server/raft_client.rs | 78 +++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index e0b30061f0b..bc691bcc05f 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -216,11 +216,20 @@ impl BatchMessageBuffer { msg_size } + #[inline] + fn maybe_refresh_config(&mut self) { + if let Some(new_cfg) = self.cfg_tracker.any_new() { + self.cfg = new_cfg.clone(); + } + } + #[cfg(test)] fn clear(&mut self) { self.batch = BatchRaftMessage::default(); self.size = 0; self.overflowing = None; + // try refresh config + self.maybe_refresh_config(); } } @@ -235,10 +244,6 @@ impl Buffer for BatchMessageBuffer { #[inline] fn push(&mut self, msg: RaftMessage) { let msg_size = Self::message_size(&msg); - // try refresh config before check - if let Some(new_cfg) = self.cfg_tracker.any_new() { - self.cfg = new_cfg.clone(); - } // To avoid building too large batch, we limit each batch's size. Since `msg_size` // is estimated, `GRPC_SEND_MSG_BUF` is reserved for errors. if self.size > 0 @@ -270,6 +275,12 @@ impl Buffer for BatchMessageBuffer { if let Some(more) = self.overflowing.take() { self.push(more); } + + // try refresh config after flush. `max_grpc_send_msg_len` and `raft_msg_max_batch_size` + // can impact the buffer push logic, but since they are soft restriction, we check config change + // at here to avoid affact performance since `push` is a hot path. 
+ self.maybe_refresh_config(); + res } @@ -1190,6 +1201,21 @@ mod tests { assert!(msg_buf.full()); } + fn new_test_msg(size: usize) -> RaftMessage { + let mut msg = RaftMessage::default(); + msg.set_region_id(1); + let mut region_epoch = RegionEpoch::default(); + region_epoch.conf_ver = 1; + region_epoch.version = 0x123456; + msg.set_region_epoch(region_epoch); + msg.set_start_key(vec![0; size]); + msg.set_end_key(vec![]); + msg.mut_message().set_snapshot(Snapshot::default()); + msg.mut_message().set_commit(0); + assert_eq!(BatchMessageBuffer::message_size(&msg), size); + msg + } + #[test] fn test_push_raft_message_cfg_change() { let version_track = Arc::new(VersionTrack::new(Config::default())); @@ -1199,38 +1225,42 @@ mod tests { ); let default_grpc_msg_len = msg_buf.cfg.max_grpc_send_msg_len as usize; - let make_msg = |size: usize| { - let mut msg = RaftMessage::default(); - msg.set_region_id(1); - let mut region_epoch = RegionEpoch::default(); - region_epoch.conf_ver = 1; - region_epoch.version = 0x123456; - msg.set_region_epoch(region_epoch); - msg.set_start_key(vec![0; size]); - msg.set_end_key(vec![]); - msg.mut_message().set_snapshot(Snapshot::default()); - msg.mut_message().set_commit(0); - assert_eq!(BatchMessageBuffer::message_size(&msg), size); - msg - }; - let max_msg_len = default_grpc_msg_len - msg_buf.cfg.raft_client_grpc_send_msg_buffer; - msg_buf.push(make_msg(max_msg_len)); + msg_buf.push(new_test_msg(max_msg_len)); assert!(!msg_buf.full()); - msg_buf.push(make_msg(1)); + msg_buf.push(new_test_msg(1)); assert!(msg_buf.full()); - msg_buf.clear(); // update config version_track.update(|cfg| cfg.max_grpc_send_msg_len *= 2); + msg_buf.clear(); let new_max_msg_len = default_grpc_msg_len * 2 - msg_buf.cfg.raft_client_grpc_send_msg_buffer; for _i in 0..2 { - msg_buf.push(make_msg(new_max_msg_len / 2 - 1)); + msg_buf.push(new_test_msg(new_max_msg_len / 2 - 1)); assert!(!msg_buf.full()); } - msg_buf.push(make_msg(2)); + msg_buf.push(new_test_msg(2)); 
assert!(msg_buf.full()); } + + #[bench] + fn bench_client_buffer_push(b: &mut test::Bencher) { + let version_track = Arc::new(VersionTrack::new(Config::default())); + let mut msg_buf = BatchMessageBuffer::new( + &version_track, + Arc::new(ThreadLoadPool::with_threshold(100)), + ); + + b.iter(|| { + for _i in 0..10 { + msg_buf.push(test::black_box(new_test_msg(1024))); + } + // run clear to mock flush. + msg_buf.clear(); + + test::black_box(&mut msg_buf); + }); + } } From d6d6d6ee725639d05ce81577a71e4a76c14152ce Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Fri, 10 Jun 2022 14:48:31 -0700 Subject: [PATCH 023/676] engine_traits: refactor Engines to support both tablet and global kvdb (#12756) ref tikv/tikv#12772 add tablet related function into Engines API. The idea is that for region's KV data read/write, using tablet() instead of engines.kv regardless multirocks is enabled or not. Signed-off-by: tonyxuqqi Signed-off-by: qi.xu Co-authored-by: qi.xu --- Cargo.lock | 6 +- components/engine_traits/src/engine.rs | 144 ++++++++++++- components/engine_traits/src/engines.rs | 1 + components/raftstore/src/store/snap.rs | 2 +- components/server/src/server.rs | 2 +- components/test_raftstore/src/util.rs | 2 +- src/server/engine_factory.rs | 78 ++++++- src/server/engine_factory_v2.rs | 271 ++++++++++++++++++++++++ src/server/mod.rs | 1 + 9 files changed, 495 insertions(+), 12 deletions(-) create mode 100644 src/server/engine_factory_v2.rs diff --git a/Cargo.lock b/Cargo.lock index 19ccbcc72c8..cd8a55146af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2752,7 +2752,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" +source = "git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2771,7 +2771,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = 
"git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" +source = "git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" dependencies = [ "bzip2-sys", "cc", @@ -4547,7 +4547,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c1f668d0c85612f5fe6ec8e4351df0fc0bef1286" +source = "git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" dependencies = [ "libc 0.2.125", "librocksdb_sys", diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index c4dad67e3c5..e97a15c75ae 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -1,6 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::fmt::Debug; +use std::{ + fmt::Debug, + path::{Path, PathBuf}, +}; use crate::*; @@ -63,5 +66,142 @@ pub trait KvEngine: // It should be named as `EngineFactory` for consistency, but we are about to rename // engine to tablet, so always use tablet for new traits/types. pub trait TabletFactory { - fn create_tablet(&self) -> Result; + /// Create an tablet by id and suffix. If the tablet exists, it will fail. + /// The id is likely the region Id, the suffix could be the current raft log index. + /// They together could specify a unique path for a region's tablet. + /// The reason to have suffix is that we can keep more than one tablet for a region. + fn create_tablet(&self, id: u64, suffix: u64) -> Result; + + /// Open a tablet by id and suffix. If the tablet exists, it will open it. + /// If the tablet does not exist, it will create it. + fn open_tablet(&self, id: u64, suffix: u64) -> Result { + self.open_tablet_raw(&self.tablet_path(id, suffix), false) + } + + /// Open a tablet by id and suffix from cache---that means it should already be opened. 
+ fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { + if let Ok(engine) = self.open_tablet_raw(&self.tablet_path(id, suffix), false) { + return Some(engine); + } + None + } + + /// Open a tablet by id and any suffix from cache + fn open_tablet_cache_any(&self, id: u64) -> Option { + self.open_tablet_cache(id, 0) + } + + /// Open tablet by path and readonly flag + fn open_tablet_raw(&self, path: &Path, readonly: bool) -> Result; + + /// Create the shared db for v1 + fn create_shared_db(&self) -> Result; + + /// Destroy the tablet and its data + fn destroy_tablet(&self, id: u64, suffix: u64) -> crate::Result<()>; + + /// Check if the tablet with specified id/suffix exists + #[inline] + fn exists(&self, id: u64, suffix: u64) -> bool { + self.exists_raw(&self.tablet_path(id, suffix)) + } + + /// Check if the tablet with specified path exists + fn exists_raw(&self, path: &Path) -> bool; + + /// Get the tablet path by id and suffix + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf; + + /// Tablets root path + fn tablets_path(&self) -> PathBuf; + + /// Clone the tablet factory instance + /// Here we don't use Clone traint because it will break the trait's object safty + fn clone(&self) -> Box + Send>; + + /// Loop visit all opened tablets cached by the specified function. 
+ /// Once the tablet is opened/created, it will be cached in a hashmap + fn loop_tablet_cache(&self, _f: Box); + + /// Load the tablet from path for id and suffix--for scenarios such as applying snapshot + fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { + unimplemented!(); + } + + /// Mark the tablet with specified id and suffix tombostone + fn mark_tombstone(&self, _id: u64, _suffix: u64) { + unimplemented!(); + } + + /// Check if the tablet with specified id and suffix tombostone + fn is_tombstoned(&self, _region_id: u64, _suffix: u64) -> bool { + unimplemented!(); + } +} + +pub struct DummyFactory +where + EK: KvEngine, +{ + pub engine: Option, + pub root_path: String, +} + +impl TabletFactory for DummyFactory +where + EK: KvEngine, +{ + fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { + Ok(self.engine.as_ref().unwrap().clone()) + } + fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { + Ok(self.engine.as_ref().unwrap().clone()) + } + fn create_shared_db(&self) -> Result { + Ok(self.engine.as_ref().unwrap().clone()) + } + fn destroy_tablet(&self, _id: u64, _suffix: u64) -> crate::Result<()> { + Ok(()) + } + fn exists_raw(&self, _path: &Path) -> bool { + true + } + fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + PathBuf::from(&self.root_path) + } + fn tablets_path(&self) -> PathBuf { + PathBuf::from(&self.root_path) + } + + fn clone(&self) -> Box + Send> { + if self.engine.is_none() { + return Box::>::new(DummyFactory { + engine: None, + root_path: self.root_path.clone(), + }); + } + Box::>::new(DummyFactory { + engine: Some(self.engine.as_ref().unwrap().clone()), + root_path: self.root_path.clone(), + }) + } + fn loop_tablet_cache(&self, _f: Box) {} +} + +impl DummyFactory +where + EK: KvEngine, +{ + pub fn new() -> DummyFactory { + DummyFactory { + engine: None, + root_path: "/dummy_root".to_string(), + } + } +} + +impl Default for DummyFactory { + fn default() -> Self { + Self::new() + } } diff 
--git a/components/engine_traits/src/engines.rs b/components/engine_traits/src/engines.rs index fd0fa961c06..4e4089d52dc 100644 --- a/components/engine_traits/src/engines.rs +++ b/components/engine_traits/src/engines.rs @@ -7,6 +7,7 @@ use crate::{ #[derive(Clone, Debug)] pub struct Engines { + // kv can be either global kv store, or the tablet in multirocks version. pub kv: K, pub raft: R, } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a39cda850fa..bb308efd054 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2005,7 +2005,7 @@ pub mod tests { region_state.set_region(region); kv.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)?; } - Ok(Engines { kv, raft }) + Ok(Engines::new(kv, raft)) } pub fn get_kv_count(snap: &impl EngineSnapshot) -> usize { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 4344a706fde..f1fd2167f9d 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1503,7 +1503,7 @@ impl TiKvServer { } let factory = builder.build(); let kv_engine = factory - .create_tablet() + .create_shared_db() .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); let engines = Engines::new(kv_engine, raft_engine); diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 96082bc6fbb..288e99a3837 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -661,7 +661,7 @@ pub fn create_test_engine( builder = builder.compaction_filter_router(router); } let factory = builder.build(); - let engine = factory.create_tablet().unwrap(); + let engine = factory.create_shared_db().unwrap(); let engines = Engines::new(engine, raft_engine); (engines, key_manager, dir, sst_worker) } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index e9c508a9985..0c02cde0aef 100644 --- 
a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -29,6 +29,7 @@ struct FactoryInner { api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, + root_db: Mutex>, } pub struct KvEngineFactoryBuilder { @@ -48,6 +49,7 @@ impl KvEngineFactoryBuilder { api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, + root_db: Mutex::default(), }, router: None, } @@ -93,7 +95,7 @@ pub struct KvEngineFactory { } impl KvEngineFactory { - fn create_raftstore_compaction_listener(&self) -> Option { + pub fn create_raftstore_compaction_listener(&self) -> Option { let ch = match &self.router { Some(r) => Mutex::new(r.clone()), None => return None, @@ -126,7 +128,7 @@ impl KvEngineFactory { )) } - fn create_tablet(&self, tablet_path: &Path) -> Result { + pub fn create_tablet(&self, tablet_path: &Path) -> Result { // Create kv engine. let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); kv_db_opts.set_env(self.inner.env.clone()); @@ -163,6 +165,34 @@ impl KvEngineFactory { Ok(kv_engine) } + pub fn destroy_tablet(&self, tablet_path: &Path) -> engine_traits::Result<()> { + info!("destroy tablet"; "path" => %tablet_path.display()); + // Create kv engine. 
+ let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); + kv_db_opts.set_env(self.inner.env.clone()); + if let Some(filter) = self.create_raftstore_compaction_listener() { + kv_db_opts.add_event_listener(filter); + } + let _kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( + &self.inner.block_cache, + self.inner.region_info_accessor.as_ref(), + self.inner.api_version, + ); + // TODOTODO: call rust-rocks or tirocks to destroy_engine; + /* + engine_rocks::raw_util::destroy_engine( + tablet_path.to_str().unwrap(), + kv_db_opts, + kv_cfs_opts, + )?;*/ + let _ = std::fs::remove_dir_all(tablet_path); + Ok(()) + } + + pub fn store_path(&self) -> PathBuf { + self.inner.store_path.clone() + } + #[inline] fn kv_engine_path(&self) -> PathBuf { self.inner.store_path.join(DEFAULT_ROCKSDB_SUB_DIR) @@ -171,8 +201,48 @@ impl KvEngineFactory { impl TabletFactory for KvEngineFactory { #[inline] - fn create_tablet(&self) -> Result { + fn create_shared_db(&self) -> Result { let root_path = self.kv_engine_path(); - self.create_tablet(&root_path) + let tablet = self.create_tablet(&root_path)?; + let mut root_db = self.inner.root_db.lock().unwrap(); + root_db.replace(tablet.clone()); + Ok(tablet) + } + + fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { + if let Ok(db) = self.inner.root_db.lock() { + let cp = db.as_ref().unwrap().clone(); + return Ok(cp); + } + self.create_shared_db() + } + + fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { + TabletFactory::create_tablet(self, 0, 0) + } + + fn exists_raw(&self, _path: &Path) -> bool { + false + } + fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + self.kv_engine_path() + } + fn tablets_path(&self) -> PathBuf { + self.kv_engine_path() + } + + #[inline] + fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { + Ok(()) + } + fn clone(&self) -> Box + Send> { + Box::new(std::clone::Clone::clone(self)) + } + + fn loop_tablet_cache(&self, mut f: Box) { + if let Ok(db) 
= self.inner.root_db.lock() { + let db = db.as_ref().unwrap(); + f(0, 0, db); + } } } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs new file mode 100644 index 00000000000..2dca2ff14f3 --- /dev/null +++ b/src/server/engine_factory_v2.rs @@ -0,0 +1,271 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + path::{Path, PathBuf}, + sync::{Arc, Mutex}, +}; + +use collections::HashMap; +use engine_rocks::RocksEngine; +use engine_traits::{RaftEngine, Result, TabletFactory}; + +use crate::server::engine_factory::KvEngineFactory; + +const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; + +#[derive(Clone)] +pub struct KvEngineFactoryV2 { + inner: KvEngineFactory, + registry: Arc>>, +} + +impl TabletFactory for KvEngineFactoryV2 { + fn create_tablet(&self, id: u64, suffix: u64) -> Result { + let mut reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Err(box_err!( + "region {} {} already exists", + id, + db.as_inner().path() + )); + } + let tablet_path = self.tablet_path(id, suffix); + let kv_engine = self.inner.create_tablet(&tablet_path)?; + debug!("inserting tablet"; "key" => ?(id, suffix)); + reg.insert((id, suffix), kv_engine.clone()); + Ok(kv_engine) + } + + fn open_tablet(&self, id: u64, suffix: u64) -> Result { + let mut reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Ok(db.clone()); + } + + let db_path = self.tablet_path(id, suffix); + let db = self.open_tablet_raw(db_path.as_path(), false)?; + debug!("open tablet"; "key" => ?(id, suffix)); + reg.insert((id, suffix), db.clone()); + Ok(db) + } + + fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { + let reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Some(db.clone()); + } + None + } + + fn open_tablet_cache_any(&self, id: u64) -> Option { + let reg = self.registry.lock().unwrap(); + if let Some(k) = reg.keys().find(|k| k.0 
== id) { + debug!("choose a random tablet"; "key" => ?k); + return Some(reg.get(k).unwrap().clone()); + } + None + } + + fn open_tablet_raw(&self, path: &Path, _readonly: bool) -> Result { + if !RocksEngine::exists(path.to_str().unwrap_or_default()) { + return Err(box_err!( + "path {} does not have db", + path.to_str().unwrap_or_default() + )); + } + let (mut tablet_id, mut tablet_suffix) = (0, 1); + if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { + let mut split = s.split('_'); + tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); + tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); + } + self.create_tablet(tablet_id, tablet_suffix) + } + + #[inline] + fn create_shared_db(&self) -> Result { + self.create_tablet(0, 0) + } + + #[inline] + fn exists_raw(&self, path: &Path) -> bool { + RocksEngine::exists(path.to_str().unwrap_or_default()) + } + + #[inline] + fn tablets_path(&self) -> PathBuf { + self.inner.store_path().join("tablets") + } + + #[inline] + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + self.inner + .store_path() + .join(format!("tablets/{}_{}", id, suffix)) + } + + #[inline] + fn mark_tombstone(&self, region_id: u64, suffix: u64) { + let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); + std::fs::File::create(&path).unwrap(); + debug!("tombstone tablet"; "region_id" => region_id, "suffix" => suffix); + self.registry.lock().unwrap().remove(&(region_id, suffix)); + } + + #[inline] + fn is_tombstoned(&self, region_id: u64, suffix: u64) -> bool { + self.tablet_path(region_id, suffix) + .join(TOMBSTONE_MARK) + .exists() + } + + #[inline] + fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(id, suffix); + self.registry.lock().unwrap().remove(&(id, suffix)); + self.inner.destroy_tablet(&path) + } + + #[inline] + fn loop_tablet_cache(&self, mut f: Box) { + let reg = self.registry.lock().unwrap(); + for ((id, suffix), 
tablet) in &*reg { + f(*id, *suffix, tablet) + } + } + + #[inline] + fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + { + let reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Err(box_err!( + "region {} {} already exists", + id, + db.as_inner().path() + )); + } + } + + let db_path = self.tablet_path(id, suffix); + std::fs::rename(path, &db_path)?; + self.open_tablet_raw(db_path.as_path(), false) + } + + fn clone(&self) -> Box + Send> { + Box::new(std::clone::Clone::clone(self)) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::TabletFactory; + + use super::*; + use crate::{config::TiKvConfig, server::KvEngineFactoryBuilder}; + + lazy_static! { + static ref TEST_CONFIG: TiKvConfig = { + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let common_test_cfg = + manifest_dir.join("components/test_raftstore/src/common-test.toml"); + TiKvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + panic!( + "invalid auto generated configuration file {}, err {}", + manifest_dir.display(), + e + ); + }) + }; + } + + impl KvEngineFactoryV2 { + pub fn new(inner: KvEngineFactory) -> Self { + KvEngineFactoryV2 { + inner, + registry: Arc::new(Mutex::new(HashMap::default())), + } + } + } + + #[test] + fn test_kvengine_factory() { + let cfg = TEST_CONFIG.clone(); + let dir = test_util::temp_dir("test_kvengine_factory", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let factory = builder.build(); + let shared_db = factory.create_shared_db().unwrap(); + let tablet = TabletFactory::create_tablet(&factory, 1, 10); + assert!(tablet.is_ok()); + let tablet = tablet.unwrap(); + let tablet2 = factory.open_tablet(1, 10).unwrap(); + assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet2 = factory.open_tablet_cache(1, 
10).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet2 = factory.open_tablet_cache_any(1).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet_path = factory.tablet_path(1, 10); + let tablet2 = factory.open_tablet_raw(&tablet_path, false).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + } + + #[test] + fn test_kvengine_factory_v2() { + let cfg = TEST_CONFIG.clone(); + let dir = test_util::temp_dir("test_kvengine_factory_v2", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let inner_factory = builder.build(); + let factory = KvEngineFactoryV2::new(inner_factory); + let tablet = factory.create_tablet(1, 10); + assert!(tablet.is_ok()); + let tablet = tablet.unwrap(); + let tablet2 = factory.open_tablet(1, 10).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet2 = factory.open_tablet_cache(1, 10).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet2 = factory.open_tablet_cache_any(1).unwrap(); + assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let tablet_path = factory.tablet_path(1, 10); + let result = factory.open_tablet_raw(&tablet_path, false); + assert!(result.is_err()); + + assert!(factory.exists(1, 10)); + assert!(!factory.exists(1, 11)); + assert!(!factory.exists(2, 10)); + assert!(!factory.exists(2, 11)); + assert!(factory.exists_raw(&tablet_path)); + assert!(!factory.is_tombstoned(1, 10)); + assert!(factory.load_tablet(&tablet_path, 1, 10).is_err()); + assert!(factory.load_tablet(&tablet_path, 1, 20).is_ok()); + factory.mark_tombstone(1, 20); + assert!(factory.is_tombstoned(1, 20)); + factory.destroy_tablet(1, 20).unwrap(); + let result = factory.open_tablet(1, 20); + assert!(result.is_err()); + } + + #[test] + fn test_get_live_tablets() { + let cfg = 
TEST_CONFIG.clone(); + let dir = test_util::temp_dir("test_get_live_tablets", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let inner_factory = builder.build(); + let factory = KvEngineFactoryV2::new(inner_factory); + factory.create_tablet(1, 10).unwrap(); + factory.create_tablet(2, 10).unwrap(); + let mut count = 0; + factory.loop_tablet_cache(Box::new(|id, suffix, _tablet| { + assert!(id == 1 || id == 2); + assert!(suffix == 10); + count += 1; + })); + assert_eq!(count, 2); + } +} diff --git a/src/server/mod.rs b/src/server/mod.rs index 69a8f87d58f..af1aa289de7 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -6,6 +6,7 @@ mod raft_client; pub mod config; pub mod debug; mod engine_factory; +mod engine_factory_v2; pub mod errors; pub mod gc_worker; pub mod load_statistics; From 17b8468e9411e7218befbb1372d7ced09a00f720 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Mon, 13 Jun 2022 13:54:32 +0800 Subject: [PATCH 024/676] txn: add more debug information for the txn commands (#12779) close tikv/tikv#12803 In the investigation process, the debug log information helps a lot, but there's still some important information missing such as retrying or 1pc flags. Changed: 1. Add more useful information displaying commands. 2. Redact necessary parts. 
Signed-off-by: cfzjywxk Co-authored-by: Ti Chi Robot --- components/txn_types/src/types.rs | 38 ++++++++++++- .../txn/commands/acquire_pessimistic_lock.rs | 3 +- .../txn/commands/check_secondary_locks.rs | 2 +- src/storage/txn/commands/check_txn_status.rs | 4 +- src/storage/txn/commands/commit.rs | 2 +- src/storage/txn/commands/macros.rs | 35 ++++++++++++ .../txn/commands/pessimistic_rollback.rs | 2 +- src/storage/txn/commands/prewrite.rs | 54 ++++++++++++++++++- src/storage/txn/commands/resolve_lock.rs | 2 +- src/storage/txn/commands/resolve_lock_lite.rs | 2 +- src/storage/txn/commands/rollback.rs | 2 +- 11 files changed, 135 insertions(+), 11 deletions(-) diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 53d6c9e3e00..432f1eafc34 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -266,7 +266,7 @@ pub enum MutationType { /// (the key already exist or not exist). The assertion should pass if the mutation (in a prewrite /// request) is going to be finished successfully, otherwise it indicates there should be some bug /// causing the attempt to write wrong data. -#[derive(Debug, Clone)] +#[derive(Clone)] pub enum Mutation { /// Put `Value` into `Key`, overwriting any existing value. 
Put((Key, Value), Assertion), @@ -284,6 +284,42 @@ pub enum Mutation { CheckNotExists(Key, Assertion), } +impl Debug for Mutation { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}", self) + } +} + +impl Display for Mutation { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Mutation::Put((key, value), assertion) => write!( + f, + "Put key:{:?} value:{:?} assertion:{:?}", + key, + &log_wrappers::Value::value(value), + assertion + ), + Mutation::Delete(key, assertion) => { + write!(f, "Delete key:{:?} assertion:{:?}", key, assertion) + } + Mutation::Lock(key, assertion) => { + write!(f, "Lock key:{:?} assertion:{:?}", key, assertion) + } + Mutation::Insert((key, value), assertion) => write!( + f, + "Put key:{:?} value:{:?} assertion:{:?}", + key, + &log_wrappers::Value::value(value), + assertion + ), + Mutation::CheckNotExists(key, assertion) => { + write!(f, "CheckNotExists key:{:?} assertion:{:?}", key, assertion) + } + } + } +} + impl Mutation { pub fn key(&self) -> &Key { match self { diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index e1785a7409d..ca94382491c 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -26,7 +26,8 @@ command! { /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. AcquirePessimisticLock: cmd_ty => StorageResult, - display => "kv::command::acquirepessimisticlock keys({}) @ {} {} | {:?}", (keys.len, start_ts, for_update_ts, ctx), + display => "kv::command::acquirepessimisticlock keys({:?}) @ {} {} {} {:?} {} {} | {:?}", + (keys, start_ts, lock_ttl, for_update_ts, wait_timeout, min_commit_ts, check_existence, ctx), content => { /// The set of keys to lock. 
keys: Vec<(Key, bool)>, diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 9a8f681311c..65abc2ffd1b 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -29,7 +29,7 @@ command! { /// status being changed, a rollback may be written. CheckSecondaryLocks: cmd_ty => SecondaryLocksStatus, - display => "kv::command::CheckSecondaryLocks {} keys@{} | {:?}", (keys.len, start_ts, ctx), + display => "kv::command::CheckSecondaryLocks {:?} keys@{} | {:?}", (keys, start_ts, ctx), content => { /// The keys of secondary locks. keys: Vec, diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 844ba5792a7..7ce843594a9 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -29,7 +29,9 @@ command! { /// [`Prewrite`](Command::Prewrite). CheckTxnStatus: cmd_ty => TxnStatus, - display => "kv::command::check_txn_status {} @ {} curr({}, {}) | {:?}", (primary_key, lock_ts, caller_start_ts, current_ts, ctx), + display => "kv::command::check_txn_status {} @ {} curr({}, {}, {}, {}, {}) | {:?}", + (primary_key, lock_ts, caller_start_ts, current_ts, rollback_if_not_exist, + force_sync_commit, resolving_pessimistic_lock, ctx), content => { /// The primary key of the transaction. primary_key: Key, diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index d73dc23ee06..8241b1b9c9c 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -23,7 +23,7 @@ command! { /// This should be following a [`Prewrite`](Command::Prewrite). Commit: cmd_ty => TxnStatus, - display => "kv::command::commit {} {} -> {} | {:?}", (keys.len, lock_ts, commit_ts, ctx), + display => "kv::command::commit {:?} {} -> {} | {:?}", (keys, lock_ts, commit_ts, ctx), content => { /// The keys affected. 
keys: Vec, diff --git a/src/storage/txn/commands/macros.rs b/src/storage/txn/commands/macros.rs index 29ec846b864..ea19f599d6d 100644 --- a/src/storage/txn/commands/macros.rs +++ b/src/storage/txn/commands/macros.rs @@ -79,6 +79,41 @@ macro_rules! command { write!(f, "{}", self) } } + }; + ( + $(#[$outer_doc: meta])* + $cmd: ident: + cmd_ty => $cmd_ty: ty, + content => { + $($(#[$inner_doc:meta])* $arg: ident : $arg_ty: ty,)* + } + ) => { + $(#[$outer_doc])* + pub struct $cmd { + pub ctx: crate::storage::Context, + pub deadline: ::tikv_util::deadline::Deadline, + $($(#[$inner_doc])* pub $arg: $arg_ty,)* + } + + impl $cmd { + /// Return a `TypedCommand` that encapsulates the result of executing this command. + pub fn new( + $($arg: $arg_ty,)* + ctx: crate::storage::Context, + ) -> TypedCommand<$cmd_ty> { + let execution_duration_limit = if ctx.max_execution_duration_ms == 0 { + crate::storage::txn::scheduler::DEFAULT_EXECUTION_DURATION_LIMIT + } else { + ::std::time::Duration::from_millis(ctx.max_execution_duration_ms) + }; + let deadline = ::tikv_util::deadline::Deadline::from_now(execution_duration_limit); + Command::$cmd($cmd { + ctx, + deadline, + $($arg,)* + }).into() + } + } } } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index e583a88d2f0..17a72610065 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -25,7 +25,7 @@ command! { /// This can roll back an [`AcquirePessimisticLock`](Command::AcquirePessimisticLock) command. PessimisticRollback: cmd_ty => Vec>, - display => "kv::command::pessimistic_rollback keys({}) @ {} {} | {:?}", (keys.len, start_ts, for_update_ts, ctx), + display => "kv::command::pessimistic_rollback keys({:?}) @ {} {} | {:?}", (keys, start_ts, for_update_ts, ctx), content => { /// The keys to be rolled back. 
keys: Vec, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 4c2caec12b2..1c0cbabd193 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -42,7 +42,6 @@ command! { /// or a [`Rollback`](Command::Rollback) should follow. Prewrite: cmd_ty => PrewriteResult, - display => "kv::command::prewrite mutations({}) @ {} | {:?}", (mutations.len, start_ts, ctx), content => { /// The set of mutations to apply. mutations: Vec, @@ -71,6 +70,33 @@ command! { } } +impl std::fmt::Display for Prewrite { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "kv::command::prewrite mutations({:?}) primary({:?}) secondary_len({:?})@ {} {} {} {} {} {} {} {:?} | {:?}", + self.mutations, + log_wrappers::Value::key(self.primary.as_slice()), + self.secondary_keys.as_ref().map(|sk| sk.len()), + self.start_ts, + self.lock_ttl, + self.skip_constraint_check, + self.txn_size, + self.min_commit_ts, + self.max_commit_ts, + self.try_one_pc, + self.assertion_level, + self.ctx, + ) + } +} + +impl std::fmt::Debug for Prewrite { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl Prewrite { #[cfg(test)] pub fn with_defaults( @@ -225,7 +251,6 @@ command! { /// or a [`Rollback`](Command::Rollback) should follow. PrewritePessimistic: cmd_ty => PrewriteResult, - display => "kv::command::prewrite_pessimistic mutations({}) @ {} | {:?}", (mutations.len, start_ts, ctx), content => { /// The set of mutations to apply; the bool = is pessimistic lock. mutations: Vec<(Mutation, bool)>, @@ -254,6 +279,31 @@ command! 
{ } } +impl std::fmt::Display for PrewritePessimistic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "kv::command::pessimistic_prewrite mutations({:?}) primary({:?}) secondary_len({:?})@ {} {} {} {} {} {} {:?}| {:?}", + self.mutations, + log_wrappers::Value::key(self.primary.as_slice()), + self.secondary_keys.as_ref().map(|sk| sk.len()), + self.start_ts, + self.lock_ttl, + self.txn_size, + self.min_commit_ts, + self.max_commit_ts, + self.try_one_pc, + self.assertion_level, + self.ctx, + ) + } +} +impl std::fmt::Debug for PrewritePessimistic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self) + } +} + impl PrewritePessimistic { #[cfg(test)] pub fn with_defaults( diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index e369266fa6d..9db90f450d8 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -30,7 +30,7 @@ command! { /// This should follow after a `ResolveLockReadPhase`. ResolveLock: cmd_ty => (), - display => "kv::resolve_lock", (), + display => "kv::resolve_lock {:?} scan_key({:?}) key_locks({:?})", (txn_status, scan_key, key_locks), content => { /// Maps lock_ts to commit_ts. If a transaction was rolled back, it is mapped to 0. /// diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index 7879145369c..e797ea62bf9 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -22,7 +22,7 @@ command! { /// Resolve locks on `resolve_keys` according to `start_ts` and `commit_ts`. ResolveLockLite: cmd_ty => (), - display => "kv::resolve_lock_lite", (), + display => "kv::resolve_lock_lite resolve_keys({:?}) {} {} | {:?}", (resolve_keys, start_ts, commit_ts, ctx), content => { /// The transaction timestamp. 
start_ts: TimeStamp, diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 6d686092f18..e6641147f04 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -24,7 +24,7 @@ command! { /// This should be following a [`Prewrite`](Command::Prewrite) on the given key. Rollback: cmd_ty => (), - display => "kv::command::rollback keys({}) @ {} | {:?}", (keys.len, start_ts, ctx), + display => "kv::command::rollback keys({:?}) @ {} | {:?}", (keys, start_ts, ctx), content => { keys: Vec, /// The transaction timestamp. From 7bfcf60730ad30ed83a2125dd13b576fe393c853 Mon Sep 17 00:00:00 2001 From: Connor Date: Mon, 13 Jun 2022 14:56:33 +0800 Subject: [PATCH 025/676] metrics: Add missing metrics for async io code path (#12788) close tikv/tikv#12787 Add missing metrics for async io code path Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 149 +++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 6192b4f3a5e..009868d3a5c 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -12092,7 +12092,7 @@ }, { "exemplar": true, - "expr": "sum(tikv_raftstore_io_reschedule_pending_task_total{instance=~\"$instance\"}) by (instance)", + "expr": "sum(tikv_raftstore_io_reschedule_pending_tasks_total{instance=~\"$instance\"}) by (instance)", "hide": false, "interval": "", "legendFormat": "pending-task-{{instance}}", @@ -13215,6 +13215,153 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dashes": false, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The handle duration of each store write task msg", + 
"fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 46 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572700, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_store_write_handle_msg_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Store write handle msg duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dashes": false, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The distribution of write trigger size", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 46 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572701, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_store_write_trigger_wb_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Store write trigger size", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "aliasColors": {}, "bars": false, From 0d9b7b33b7928be06016075329fe0b2c4fba25f5 Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 14 Jun 2022 15:10:33 +0800 Subject: [PATCH 026/676] raftstore: reset perf context before each apply write (#12808) ref tikv/tikv#11044, close tikv/tikv#11044 Signed-off-by: Lucasliang --- components/raftstore/src/store/fsm/apply.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index ca6cabb7a95..e3c1172ef5b 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -520,6 +520,7 @@ where self.pending_ssts = vec![]; } if !self.kv_wb_mut().is_empty() { + self.perf_context.start_observe(); let mut write_opts = engine_traits::WriteOptions::new(); write_opts.set_sync(need_sync); self.kv_wb().write_opt(&write_opts).unwrap_or_else(|e| { @@ -3819,7 +3820,6 @@ where } update_cfg(&incoming.apply_batch_system); } - self.apply_ctx.perf_context.start_observe(); } fn handle_control(&mut self, control: &mut ControlFsm) -> Option { From 11b5d4c3d6ca740cc0bf272691fd15ec94345cd9 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 15 Jun 2022 18:22:34 +0800 Subject: [PATCH 027/676] pd_client: Do not reconnect for pd unknown error 
(#12827) close tikv/tikv#12345 do not reconnect for pd unknown error Signed-off-by: Connor1996 --- components/pd_client/src/errors.rs | 5 +++-- components/test_pd/src/mocker/retry.rs | 12 ++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index b86edfc6e98..a9e4ffe6266 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -30,8 +30,9 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::Other(_) | Error::ClusterNotBootstrapped(_) => true, - Error::RegionNotFound(_) + Error::Grpc(_) | Error::ClusterNotBootstrapped(_) => true, + Error::Other(_) + | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) | Error::ClusterBootstrapped(_) diff --git a/components/test_pd/src/mocker/retry.rs b/components/test_pd/src/mocker/retry.rs index ef49aee3f66..be9c90633c0 100644 --- a/components/test_pd/src/mocker/retry.rs +++ b/components/test_pd/src/mocker/retry.rs @@ -87,11 +87,9 @@ impl Default for NotRetry { impl PdMocker for NotRetry { fn get_region_by_id(&self, _: &GetRegionByIdRequest) -> Option> { if !self.is_visited.swap(true, Ordering::Relaxed) { - info!( - "[NotRetry] get_region_by_id returns Ok(_) with header has IncompatibleVersion error" - ); + info!("[NotRetry] get_region_by_id returns Ok(_) with header has RegionNotFound error"); let mut err = Error::default(); - err.set_type(ErrorType::IncompatibleVersion); + err.set_type(ErrorType::RegionNotFound); let mut resp = GetRegionResponse::default(); resp.mut_header().set_error(err); Some(Ok(resp)) @@ -103,11 +101,9 @@ impl PdMocker for NotRetry { fn get_store(&self, _: &GetStoreRequest) -> Option> { if !self.is_visited.swap(true, Ordering::Relaxed) { - info!( - "[NotRetry] get_region_by_id returns Ok(_) with header has IncompatibleVersion error" - ); + info!("[NotRetry] get_region_by_id 
returns Ok(_) with header has Unknown error"); let mut err = Error::default(); - err.set_type(ErrorType::IncompatibleVersion); + err.set_type(ErrorType::Unknown); let mut resp = GetStoreResponse::default(); resp.mut_header().set_error(err); Some(Ok(resp)) From 2fbf7ee5a348df5f1839ff6fc47753b50ff7c76f Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 15 Jun 2022 20:20:33 +0800 Subject: [PATCH 028/676] tests: ignore env var dependent tests at runtime (#12805) ref rust-lang/cargo#10250, close tikv/tikv#12804, ref rust-lang/rust#68007, ref rust-lang/rust#96132 Signed-off-by: tabokie --- Cargo.lock | 41 ++++++++++--------- cmd/build.rs | 2 +- components/coprocessor_plugin_api/src/lib.rs | 1 - .../encryption/src/encrypted_file/mod.rs | 1 - components/raftstore/src/lib.rs | 1 - components/tidb_query_executors/src/lib.rs | 2 - components/tidb_query_expr/src/lib.rs | 2 - components/tikv_alloc/src/jemalloc.rs | 6 +-- components/tikv_alloc/src/lib.rs | 39 ++++++++++++++++++ components/tracker/src/lib.rs | 1 - rust-toolchain | 2 +- scripts/test-all | 12 ++---- src/import/sst_service.rs | 2 +- src/server/service/mod.rs | 2 +- src/storage/mod.rs | 5 +-- src/storage/raw/encoded.rs | 5 +-- src/storage/raw/raw_mvcc.rs | 5 +-- 17 files changed, 75 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cd8a55146af..3350e0ef252 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -49,7 +49,7 @@ checksum = "43bb833f0bf979d8475d38fbf09ed3b8a55e1885fe93ad3f93239fc6a4f17b98" dependencies = [ "getrandom 0.2.3", "once_cell", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -2122,7 +2122,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" dependencies = [ "typenum", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3169,7 +3169,7 @@ checksum = "1b1d11e1ef389c76fe5b81bcaf2ea32cf88b62bc494e19f493d0b30e7a930109" dependencies = [ "memchr", 
"minimal-lexical", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3796,7 +3796,7 @@ dependencies = [ "proc-macro2", "quote", "syn", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3807,7 +3807,7 @@ checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", - "version_check 0.9.2", + "version_check 0.9.4", ] [[package]] @@ -3999,9 +3999,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.9" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" dependencies = [ "proc-macro2", ] @@ -4023,8 +4023,8 @@ dependencies = [ [[package]] name = "raft-engine" -version = "0.1.0" -source = "git+https://github.com/tikv/raft-engine.git#0e066f8626b43b2a8a0a6bc9c7f0502b6fdc3d05" +version = "0.2.0" +source = "git+https://github.com/tikv/raft-engine.git#4e89901a3eff850a47ea0e6b44bc74d9fed84769" dependencies = [ "byteorder", "crc32fast", @@ -4054,8 +4054,8 @@ dependencies = [ [[package]] name = "raft-engine-ctl" -version = "0.1.0" -source = "git+https://github.com/tikv/raft-engine.git#0e066f8626b43b2a8a0a6bc9c7f0502b6fdc3d05" +version = "0.2.0" +source = "git+https://github.com/tikv/raft-engine.git#4e89901a3eff850a47ea0e6b44bc74d9fed84769" dependencies = [ "clap 3.1.6", "env_logger", @@ -4506,11 +4506,12 @@ dependencies = [ [[package]] name = "rhai" -version = "1.4.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "898b114d6cfa18af4593393fdc6c7437118e7e624d97f635fba8c75fd5c06f56" +checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" dependencies = [ "ahash", + "bitflags", "instant", "num-traits", "rhai_codegen", @@ -4520,9 +4521,9 @@ dependencies = [ [[package]] name = "rhai_codegen" -version = "1.3.0" +version = 
"1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e02d33d76a7aa8ec72ac8298d5b52134fd2dff77445ada0c65f6f8c40d8f2931" +checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" dependencies = [ "proc-macro2", "quote", @@ -5173,11 +5174,13 @@ checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "smartstring" -version = "0.2.10" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e714dff2b33f2321fdcd475b71cec79781a692d846f37f415fb395a1d2bcd48e" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" dependencies = [ + "autocfg", "static_assertions", + "version_check 0.9.4", ] [[package]] @@ -6738,9 +6741,9 @@ checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" [[package]] name = "version_check" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] name = "visible" diff --git a/cmd/build.rs b/cmd/build.rs index ef751a71feb..6d11a38f705 100644 --- a/cmd/build.rs +++ b/cmd/build.rs @@ -32,7 +32,7 @@ fn link_sys_lib(lib: &str, tool: &cc::Tool) { } // remove lib prefix and .a postfix. let libname = &lib[3..lib.len() - 2]; - println!("cargo:rustc-link-lib=static={}", &libname); + println!("cargo:rustc-link-lib=static:+whole-archive={}", &libname); println!( "cargo:rustc-link-search=native={}", path.parent().unwrap().display() diff --git a/components/coprocessor_plugin_api/src/lib.rs b/components/coprocessor_plugin_api/src/lib.rs index 6e90ef83d2a..ca61b54c724 100644 --- a/components/coprocessor_plugin_api/src/lib.rs +++ b/components/coprocessor_plugin_api/src/lib.rs @@ -1,5 +1,4 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-#![feature(const_fn_fn_ptr_basics)] //! This crate contains some necessary types and traits for implementing a custom coprocessor plugin //! for TiKV. diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index e52cba85afc..7bf31225db8 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -127,7 +127,6 @@ mod tests { let content = b"test content"; file.write(content, &PlaintextBackend::default()).unwrap(); - drop(file); let file = EncryptedFile::new(tmp.path(), "encrypted"); assert_eq!(file.read(&PlaintextBackend::default()).unwrap(), content); diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index cd50b74dc48..b212001657a 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,7 +6,6 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] -#![feature(vec_retain_mut)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/tidb_query_executors/src/lib.rs b/components/tidb_query_executors/src/lib.rs index 0aa69c3b8f5..b32518c600b 100644 --- a/components/tidb_query_executors/src/lib.rs +++ b/components/tidb_query_executors/src/lib.rs @@ -10,8 +10,6 @@ #![allow(incomplete_features)] #![feature(proc_macro_hygiene)] #![feature(specialization)] -#![feature(const_fn_fn_ptr_basics)] -#![feature(const_fn_trait_bound)] #![feature(const_mut_refs)] #[macro_use(box_try, warn)] diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index f11c0f89bbf..eec5bdad844 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -12,8 +12,6 @@ #![feature(proc_macro_hygiene)] #![feature(specialization)] #![feature(test)] -#![feature(const_fn_fn_ptr_basics)] -#![feature(const_fn_trait_bound)] #![feature(const_mut_refs)] #[macro_use(box_err, box_try, try_opt)] diff --git 
a/components/tikv_alloc/src/jemalloc.rs b/components/tikv_alloc/src/jemalloc.rs index 959f526bfaf..e8a21115142 100644 --- a/components/tikv_alloc/src/jemalloc.rs +++ b/components/tikv_alloc/src/jemalloc.rs @@ -192,10 +192,10 @@ mod profiling { // TODO: need a test for the dump_prof(None) case, but // the cleanup afterward is not simple. #[test] - #[ignore] - fn test_profiling_memory() { + #[ignore = "#ifdef MALLOC_CONF"] + fn test_profiling_memory_ifdef_malloc_conf() { // Make sure somebody has turned on profiling - assert!(is_profiling_on(), r#"Set MALLOC_CONF="prof:true""#); + assert!(is_profiling_on(), "set MALLOC_CONF=prof:true"); let dir = Builder::new() .prefix("test_profiling_memory") diff --git a/components/tikv_alloc/src/lib.rs b/components/tikv_alloc/src/lib.rs index df7efcd80bc..1435ca2bbd0 100644 --- a/components/tikv_alloc/src/lib.rs +++ b/components/tikv_alloc/src/lib.rs @@ -82,6 +82,10 @@ //! `--features=mem-profiling` to cargo for eather `tikv_alloc` or //! `tikv`. +#![cfg_attr(test, feature(test))] +#![cfg_attr(test, feature(custom_test_frameworks))] +#![cfg_attr(test, test_runner(runner::run_env_conditional_tests))] + #[cfg(feature = "jemalloc")] #[macro_use] extern crate lazy_static; @@ -124,3 +128,38 @@ pub use crate::{imp::*, trace::*}; #[global_allocator] static ALLOC: imp::Allocator = imp::allocator(); + +#[cfg(test)] +mod runner { + extern crate test; + use test::*; + + /// Check for ignored test cases with ignore message "#ifdef ". The test + /// case will be enabled if the specific environment variable is set. 
+ pub fn run_env_conditional_tests(cases: &[&TestDescAndFn]) { + let cases: Vec<_> = cases + .iter() + .map(|case| { + let mut desc = case.desc.clone(); + let testfn = match case.testfn { + TestFn::StaticTestFn(f) => TestFn::StaticTestFn(f), + TestFn::StaticBenchFn(f) => TestFn::StaticBenchFn(f), + ref f => panic!("unexpected testfn {:?}", f), + }; + if let Some(msg) = desc.ignore_message { + let keyword = "#ifdef"; + if let Some(s) = msg.strip_prefix(keyword) { + let var_name = s.trim(); + if var_name.is_empty() || std::env::var(var_name).is_ok() { + desc.ignore = false; + desc.ignore_message = None; + } + } + } + TestDescAndFn { desc, testfn } + }) + .collect(); + let args = std::env::args().collect::>(); + test_main(&args, cases, None) + } +} diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 909e093ed3f..ec3b6d37017 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -#![feature(derive_default_enum)] #![feature(array_from_fn)] mod metrics; diff --git a/rust-toolchain b/rust-toolchain index f24eb00edaf..b91c1b17580 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-02-14 +nightly-2022-05-01 diff --git a/scripts/test-all b/scripts/test-all index daf7cf3f503..246a8f22176 100755 --- a/scripts/test-all +++ b/scripts/test-all @@ -13,17 +13,13 @@ if [[ -z $MAKEFILE_RUN ]] ; then fi ./scripts/test "$@" -- --nocapture -# The special Linux case below is testing the mem-profiling -# features in tikv_alloc, which are marked #[ignore] since -# they require special compile-time and run-time setup -# Fortunately rebuilding with the mem-profiling feature will only -# rebuild starting at jemalloc-sys. +# Re-run tests that requires specific environment variables. 
if [[ "$(uname)" == "Linux" ]]; then - export MALLOC_CONF=prof:true,prof_active:false - ./scripts/test -p tikv -p tikv_alloc --lib "$@" -- --nocapture --ignored + export MALLOC_CONF=prof:true + ./scripts/test ifdef_malloc_conf "$@" -- --nocapture fi if [[ "$(uname)" = "Linux" ]]; then EXTRA_CARGO_ARGS="" ./scripts/test --message-format=json-render-diagnostics -q --no-run -- --nocapture | - python scripts/check-bins.py --features "${TIKV_ENABLE_FEATURES}" --check-tests + python scripts/check-bins.py --features "${TIKV_ENABLE_FEATURES}" --check-tests fi \ No newline at end of file diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index fc41a504f42..ac892884e37 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -295,7 +295,7 @@ macro_rules! impl_write { Ok(resp) } .await; - crate::send_rpc_response!(res, sink, label, timer); + $crate::send_rpc_response!(res, sink, label, timer); }; self.threads.spawn_ok(buf_driver); diff --git a/src/server/service/mod.rs b/src/server/service/mod.rs index 36ea4c78a85..d80c2f6806c 100644 --- a/src/server/service/mod.rs +++ b/src/server/service/mod.rs @@ -18,7 +18,7 @@ pub use self::{ macro_rules! 
log_net_error { ($err:expr, $($args:tt)*) => {{ let e = $err; - if let crate::server::Error::Grpc(e) = e { + if let $crate::server::Error::Grpc(e) = e { info!($($args)*, "err" => %e); } else { debug!($($args)*, "err" => %e); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 692adec1ad1..63279780cfc 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2721,10 +2721,7 @@ pub struct TxnTestSnapshot { impl Snapshot for TxnTestSnapshot { type Iter = S::Iter; - type Ext<'a> - where - S: 'a, - = TxnTestSnapshotExt<'a>; + type Ext<'a> = TxnTestSnapshotExt<'a> where S: 'a; fn get(&self, key: &Key) -> tikv_kv::Result> { self.snapshot.get(key) diff --git a/src/storage/raw/encoded.rs b/src/storage/raw/encoded.rs index 4c3629e14ef..b9b25015891 100644 --- a/src/storage/raw/encoded.rs +++ b/src/storage/raw/encoded.rs @@ -61,10 +61,7 @@ impl RawEncodeSnapshot { impl Snapshot for RawEncodeSnapshot { type Iter = RawEncodeIterator; - type Ext<'a> - where - S: 'a, - = S::Ext<'a>; + type Ext<'a> = S::Ext<'a> where S: 'a; fn get(&self, key: &Key) -> Result> { self.map_value(self.snap.get(key)) diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 1f0bed9f945..4212b1c56ef 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -43,10 +43,7 @@ impl RawMvccSnapshot { impl Snapshot for RawMvccSnapshot { type Iter = RawMvccIterator; - type Ext<'a> - where - S: 'a, - = S::Ext<'a>; + type Ext<'a> = S::Ext<'a> where S: 'a; fn get(&self, key: &Key) -> Result> { self.seek_first_key_value_cf(None, None, key) From a80152ce04e7b18579f99d6407601599712aba2a Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 16 Jun 2022 11:56:34 +0800 Subject: [PATCH 029/676] storage: record perf statistics for scheduler commands (#12500) ref tikv/tikv#12362 This commit records perf contexts around executing txn scheduler commands. This helps us know the detail performance data of the underlying engine. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/tracker/Cargo.toml | 1 + components/tracker/src/lib.rs | 10 + components/tracker/src/slab.rs | 7 +- metrics/grafana/tikv_details.json | 230 +++++++++++++++++- src/server/service/kv.rs | 9 + src/storage/metrics.rs | 62 +++++ src/storage/mod.rs | 57 ++--- .../txn/commands/acquire_pessimistic_lock.rs | 1 + .../txn/commands/check_secondary_locks.rs | 1 + src/storage/txn/commands/check_txn_status.rs | 1 + src/storage/txn/commands/cleanup.rs | 1 + src/storage/txn/commands/commit.rs | 1 + src/storage/txn/commands/macros.rs | 8 + src/storage/txn/commands/mod.rs | 9 + .../txn/commands/pessimistic_rollback.rs | 1 + src/storage/txn/commands/prewrite.rs | 2 + src/storage/txn/commands/resolve_lock.rs | 1 + src/storage/txn/commands/resolve_lock_lite.rs | 1 + .../txn/commands/resolve_lock_readphase.rs | 1 + src/storage/txn/commands/rollback.rs | 1 + src/storage/txn/commands/txn_heart_beat.rs | 1 + src/storage/txn/scheduler.rs | 58 +++-- 23 files changed, 394 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3350e0ef252..f94f088e563 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6571,6 +6571,7 @@ name = "tracker" version = "0.0.1" dependencies = [ "collections", + "crossbeam-utils 0.8.8", "kvproto", "lazy_static", "parking_lot 0.12.0", diff --git a/components/tracker/Cargo.toml b/components/tracker/Cargo.toml index fcaf546cf5b..f9b97010bd8 100644 --- a/components/tracker/Cargo.toml +++ b/components/tracker/Cargo.toml @@ -6,6 +6,7 @@ publish = false [dependencies] collections = { path = "../../components/collections" } +crossbeam-utils = "0.8" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1" parking_lot = "0.12" diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index ec3b6d37017..0e932658aba 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -68,6 +68,16 @@ pub enum RequestType { 
KvBatchGetCommand, KvScan, KvScanLock, + KvPrewrite, + KvCommit, + KvPessimisticLock, + KvCheckTxnStatus, + KvCheckSecondaryLocks, + KvCleanup, + KvResolveLock, + KvTxnHeartBeat, + KvRollback, + KvPessimisticRollback, CoprocessorDag, CoprocessorAnalyze, CoprocessorChecksum, diff --git a/components/tracker/src/slab.rs b/components/tracker/src/slab.rs index 9d2803e7585..f737ee1ed1e 100644 --- a/components/tracker/src/slab.rs +++ b/components/tracker/src/slab.rs @@ -2,6 +2,7 @@ use std::{array, cell::Cell, fmt}; +use crossbeam_utils::CachePadded; use lazy_static::lazy_static; use parking_lot::Mutex; use slab::Slab; @@ -29,16 +30,16 @@ fn next_shard_id() -> usize { } pub struct ShardedSlab { - shards: [Mutex; SLAB_SHARD_COUNT], + shards: [CachePadded>; SLAB_SHARD_COUNT], } impl ShardedSlab { pub fn new(capacity_per_shard: usize) -> ShardedSlab { let shards = array::from_fn(|shard_id| { - Mutex::new(TrackerSlab::with_capacity( + CachePadded::new(Mutex::new(TrackerSlab::with_capacity( shard_id as u32, capacity_per_shard, - )) + ))) }); ShardedSlab { shards } } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 009868d3a5c..15dfa8c684b 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -4654,6 +4654,232 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 68 + }, + "hiddenSeries": false, + "id": 23763572784, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:80", + "alias": "/.*/", + "stack": "A" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_storage_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_time\"}[1m])) by (req)", + "hide": false, + "interval": "", + "legendFormat": "{{req}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_time\"}[1m])) by (req)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "copr-{{req}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO time per second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:264", + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:265", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 68 + }, + "hiddenSeries": false, + "id": 23763572785, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": 
false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:62", + "alias": "/.*/", + "stack": "A" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_storage_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_byte\"}[1m])) by (req)", + "interval": "", + "legendFormat": "{{req}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", metric=\"block_read_byte\"}[1m])) by (req)", + "hide": false, + "interval": "", + "legendFormat": "copr-{{req}}", + "queryType": "randomWalk", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IO bytes per second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:264", + "format": "binBps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:265", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, @@ -16921,7 +17147,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified-read.*\"}[1m])", + "expr": 
"sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified-read.*\"}[1m])) by (le)", "format": "heatmap", "interval": "", "legendFormat": "{{le}}", @@ -20259,7 +20485,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker.*\"}[1m])", + "expr": "sum(rate(tikv_yatp_pool_schedule_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker.*\"}[1m])) by (le)", "format": "heatmap", "interval": "", "legendFormat": "{{le}}", diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 73215f6922c..988e0624686 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1932,10 +1932,19 @@ macro_rules! txn_command_future { $req: $req_ty, ) -> impl Future> { $prelude + let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + $req.get_context(), + RequestType::Unknown, + 0, + ))); + set_tls_tracker_token(tracker); let (cb, f) = paired_future_callback(); let res = storage.sched_txn_command($req.into(), cb); async move { + defer!{{ + GLOBAL_TRACKERS.remove(tracker); + }}; let $v = match res { Err(e) => Err(e), Ok(_) => f.await?, diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index fd4df727e54..95f5809ec9e 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -5,11 +5,14 @@ use std::{cell::RefCell, mem, sync::Arc}; use collections::HashMap; +use engine_traits::{PerfContext, PerfContextExt, PerfContextKind, PerfLevel}; use kvproto::{kvrpcpb::KeyRange, metapb, pdpb::QueryKind}; use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; +use tikv_kv::{with_tls_engine, Engine}; +use tracker::get_tls_tracker_token; use crate::{ 
server::metrics::{GcKeysCF as ServerGcKeysCF, GcKeysDetail as ServerGcKeysDetail}, @@ -296,6 +299,65 @@ impl From for GcKeysDetail { } } +// Safety: It should be only called when the thread-local engine exists. +pub unsafe fn with_perf_context(cmd: CommandKind, f: Fn) -> T +where + Fn: FnOnce() -> T, +{ + thread_local! { + static GET: RefCell>> = RefCell::new(None); + static BATCH_GET: RefCell>> = RefCell::new(None); + static BATCH_GET_COMMAND: RefCell>> = RefCell::new(None); + static SCAN: RefCell>> = RefCell::new(None); + static PREWRITE: RefCell>> = RefCell::new(None); + static ACQUIRE_PESSIMISTIC_LOCK: RefCell>> = RefCell::new(None); + static COMMIT: RefCell>> = RefCell::new(None); + static CLEANUP: RefCell>> = RefCell::new(None); + static ROLLBACK: RefCell>> = RefCell::new(None); + static PESSIMISTIC_ROLLBACK: RefCell>> = RefCell::new(None); + static TXN_HEART_BEAT: RefCell>> = RefCell::new(None); + static CHECK_TXN_STATUS: RefCell>> = RefCell::new(None); + static CHECK_SECONDARY_LOCKS: RefCell>> = RefCell::new(None); + static SCAN_LOCK: RefCell>> = RefCell::new(None); + static RESOLVE_LOCK: RefCell>> = RefCell::new(None); + static RESOLVE_LOCK_LITE: RefCell>> = RefCell::new(None); + } + let tls_cell = match cmd { + CommandKind::get => &GET, + CommandKind::batch_get => &BATCH_GET, + CommandKind::batch_get_command => &BATCH_GET_COMMAND, + CommandKind::scan => &SCAN, + CommandKind::prewrite => &PREWRITE, + CommandKind::acquire_pessimistic_lock => &ACQUIRE_PESSIMISTIC_LOCK, + CommandKind::commit => &COMMIT, + CommandKind::cleanup => &CLEANUP, + CommandKind::rollback => &ROLLBACK, + CommandKind::pessimistic_rollback => &PESSIMISTIC_ROLLBACK, + CommandKind::txn_heart_beat => &TXN_HEART_BEAT, + CommandKind::check_txn_status => &CHECK_TXN_STATUS, + CommandKind::check_secondary_locks => &CHECK_SECONDARY_LOCKS, + CommandKind::scan_lock => &SCAN_LOCK, + CommandKind::resolve_lock => &RESOLVE_LOCK, + CommandKind::resolve_lock_lite => &RESOLVE_LOCK_LITE, + _ => return f(), 
+ }; + tls_cell.with(|c| { + let mut c = c.borrow_mut(); + let perf_context = c.get_or_insert_with(|| { + with_tls_engine(|engine: &E| { + Box::new(engine.kv_engine().get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) + }) + }); + perf_context.start_observe(); + let res = f(); + perf_context.report_metrics(&[get_tls_tracker_token()]); + res + }) +} + lazy_static! { pub static ref KV_COMMAND_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_storage_command_total", diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 63279780cfc..f12f918b8aa 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -52,7 +52,6 @@ mod types; use std::{ borrow::Cow, - cell::RefCell, iter, marker::PhantomData, sync::{ @@ -63,10 +62,7 @@ use std::{ use api_version::{ApiV1, ApiV2, KeyMode, KvFormat, RawValue}; use concurrency_manager::ConcurrencyManager; -use engine_traits::{ - raw_ttl::ttl_to_expire_ts, CfName, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, - CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, -}; +use engine_traits::{raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; use futures::prelude::*; use kvproto::{ kvrpcpb::{ @@ -85,8 +81,7 @@ use tikv_util::{ time::{duration_to_ms, Instant, ThreadReadId}, }; use tracker::{ - clear_tls_tracker_token, get_tls_tracker_token, set_tls_tracker_token, TrackedFuture, - TrackerToken, + clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, }; use txn_types::{Key, KvPair, Lock, OldValues, TimeStamp, TsSet, Value}; @@ -279,42 +274,6 @@ impl Storage { }) } - fn with_perf_context(cmd: CommandKind, f: Fn) -> T - where - Fn: FnOnce() -> T, - { - thread_local! 
{ - static GET: RefCell>> = RefCell::new(None); - static BATCH_GET: RefCell>> = RefCell::new(None); - static BATCH_GET_COMMAND: RefCell>> = RefCell::new(None); - static SCAN: RefCell>> = RefCell::new(None); - static SCAN_LOCK: RefCell>> = RefCell::new(None); - } - let tls_cell = match cmd { - CommandKind::get => &GET, - CommandKind::batch_get => &BATCH_GET, - CommandKind::batch_get_command => &BATCH_GET_COMMAND, - CommandKind::scan => &SCAN, - CommandKind::scan_lock => &SCAN_LOCK, - _ => return f(), - }; - tls_cell.with(|c| { - let mut c = c.borrow_mut(); - let perf_context = c.get_or_insert_with(|| { - Self::with_tls_engine(|engine| { - Box::new(engine.kv_engine().get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) - }) - }); - perf_context.start_observe(); - let res = f(); - perf_context.report_metrics(&[get_tls_tracker_token()]); - res - }) - } - /// Get the underlying `Engine` of the `Storage`. pub fn get_engine(&self) -> E { self.engine.clone() @@ -359,6 +318,14 @@ impl Storage { self.read_pool.get_normal_pool_size() } + fn with_perf_context(cmd: CommandKind, f: Fn) -> T + where + Fn: FnOnce() -> T, + { + // Safety: the read pools ensure that a TLS engine exists. + unsafe { with_perf_context::(cmd, f) } + } + #[inline] fn with_tls_engine(f: impl FnOnce(&E) -> R) -> R { // Safety: the read pools ensure that a TLS engine exists. @@ -1436,6 +1403,10 @@ impl Storage { } _ => {} } + with_tls_tracker(|tracker| { + tracker.req_info.start_ts = cmd.ts().into_inner(); + tracker.req_info.request_type = cmd.request_type(); + }); fail_point!("storage_drop_message", |_| Ok(())); cmd.incr_cmd_metric(); diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index ca94382491c..d49d759f3a5 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -53,6 +53,7 @@ command! 
{ impl CommandExt for AcquirePessimisticLock { ctx!(); tag!(acquire_pessimistic_lock); + request_type!(KvPessimisticLock); ts!(start_ts); property!(can_be_pipelined); diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 65abc2ffd1b..c27e8dc1bc0 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -41,6 +41,7 @@ command! { impl CommandExt for CheckSecondaryLocks { ctx!(); tag!(check_secondary_locks); + request_type!(KvCheckSecondaryLocks); ts!(start_ts); write_bytes!(keys: multiple); gen_lock!(keys: multiple); diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 7ce843594a9..5ec0ae5c503 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -57,6 +57,7 @@ command! { impl CommandExt for CheckTxnStatus { ctx!(); tag!(check_txn_status); + request_type!(KvCheckTxnStatus); ts!(lock_ts); write_bytes!(primary_key); gen_lock!(primary_key); diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index aefcf128740..62c0aaa98c1 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -38,6 +38,7 @@ command! { impl CommandExt for Cleanup { ctx!(); tag!(cleanup); + request_type!(KvCleanup); ts!(start_ts); write_bytes!(key); gen_lock!(key); diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 8241b1b9c9c..f89d4fc09af 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -37,6 +37,7 @@ command! 
{ impl CommandExt for Commit { ctx!(); tag!(commit); + request_type!(KvCommit); ts!(commit_ts); write_bytes!(keys: multiple); gen_lock!(keys: multiple); diff --git a/src/storage/txn/commands/macros.rs b/src/storage/txn/commands/macros.rs index ea19f599d6d..c505714f2a4 100644 --- a/src/storage/txn/commands/macros.rs +++ b/src/storage/txn/commands/macros.rs @@ -139,6 +139,14 @@ macro_rules! tag { }; } +macro_rules! request_type { + ($req_type:ident) => { + fn request_type(&self) -> ::tracker::RequestType { + ::tracker::RequestType::$req_type + } + }; +} + macro_rules! write_bytes { ($field: ident) => { fn write_bytes(&self) -> usize { diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 1168dd15048..5cd94b172ff 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -48,6 +48,7 @@ pub use resolve_lock_lite::ResolveLockLite; pub use resolve_lock_readphase::ResolveLockReadPhase; pub use rollback::Rollback; use tikv_util::deadline::Deadline; +use tracker::RequestType; pub use txn_heart_beat::TxnHeartBeat; use txn_types::{Key, OldValues, TimeStamp, Value, Write}; @@ -467,6 +468,10 @@ fn find_mvcc_infos_by_key( pub trait CommandExt: Display { fn tag(&self) -> metrics::CommandKind; + fn request_type(&self) -> RequestType { + RequestType::Unknown + } + fn get_ctx(&self) -> &Context; fn get_ctx_mut(&mut self) -> &mut Context; @@ -645,6 +650,10 @@ impl Command { self.command_ext().tag() } + pub fn request_type(&self) -> RequestType { + self.command_ext().request_type() + } + pub fn ts(&self) -> TimeStamp { self.command_ext().ts() } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index 17a72610065..bcafed8b0e6 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -38,6 +38,7 @@ command! 
{ impl CommandExt for PessimisticRollback { ctx!(); tag!(pessimistic_rollback); + request_type!(KvPessimisticRollback); ts!(start_ts); write_bytes!(keys: multiple); gen_lock!(keys: multiple); diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 1c0cbabd193..cfe8f68c512 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -215,6 +215,7 @@ impl Prewrite { impl CommandExt for Prewrite { ctx!(); tag!(prewrite); + request_type!(KvPrewrite); ts!(start_ts); fn write_bytes(&self) -> usize { @@ -379,6 +380,7 @@ impl PrewritePessimistic { impl CommandExt for PrewritePessimistic { ctx!(); tag!(prewrite); + request_type!(KvPrewrite); ts!(start_ts); fn write_bytes(&self) -> usize { diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index 9db90f450d8..6638fe5cffd 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -57,6 +57,7 @@ command! { impl CommandExt for ResolveLock { ctx!(); tag!(resolve_lock); + request_type!(KvResolveLock); property!(is_sys_cmd); fn write_bytes(&self) -> usize { diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index e797ea62bf9..f69d4a107fc 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -36,6 +36,7 @@ command! { impl CommandExt for ResolveLockLite { ctx!(); tag!(resolve_lock_lite); + request_type!(KvResolveLock); ts!(start_ts); property!(is_sys_cmd); write_bytes!(resolve_keys: multiple); diff --git a/src/storage/txn/commands/resolve_lock_readphase.rs b/src/storage/txn/commands/resolve_lock_readphase.rs index 7c34cc71f4f..588303e0a3d 100644 --- a/src/storage/txn/commands/resolve_lock_readphase.rs +++ b/src/storage/txn/commands/resolve_lock_readphase.rs @@ -33,6 +33,7 @@ command! 
{ impl CommandExt for ResolveLockReadPhase { ctx!(); tag!(resolve_lock); + request_type!(KvResolveLock); property!(readonly); fn write_bytes(&self) -> usize { diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index e6641147f04..70e7fc4a49d 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -35,6 +35,7 @@ command! { impl CommandExt for Rollback { ctx!(); tag!(rollback); + request_type!(KvRollback); ts!(start_ts); write_bytes!(keys: multiple); gen_lock!(keys: multiple); diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index d2af61d4506..e894cc6835e 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -40,6 +40,7 @@ command! { impl CommandExt for TxnHeartBeat { ctx!(); tag!(txn_heart_beat); + request_type!(KvTxnHeartBeat); ts!(start_ts); write_bytes!(primary_key); gen_lock!(primary_key); diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 283787e9ba1..f0e1529fab7 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -47,6 +47,7 @@ use raftstore::store::TxnExt; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData}; use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; +use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; use txn_types::TimeStamp; use crate::{ @@ -59,7 +60,7 @@ use crate::{ SnapContext, Statistics, }, lock_manager::{self, DiagnosticContext, LockManager, WaitTimeout}, - metrics::{self, *}, + metrics::*, txn::{ commands::{Command, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo}, flow_controller::FlowController, @@ -83,15 +84,17 @@ const IN_MEMORY_PESSIMISTIC_LOCK: Feature = Feature::require(6, 0, 0); /// Task is a running command. 
pub(super) struct Task { pub(super) cid: u64, + pub(super) tracker: TrackerToken, pub(super) cmd: Command, pub(super) extra_op: ExtraOp, } impl Task { /// Creates a task for a running command. - pub(super) fn new(cid: u64, cmd: Command) -> Task { + pub(super) fn new(cid: u64, tracker: TrackerToken, cmd: Command) -> Task { Task { cid, + tracker, cmd, extra_op: ExtraOp::Noop, } @@ -99,7 +102,7 @@ impl Task { } struct CmdTimer { - tag: metrics::CommandKind, + tag: CommandKind, begin: Instant, } @@ -122,7 +125,7 @@ struct TaskContext { // `cb` and `pr` safely. owned: AtomicBool, write_bytes: usize, - tag: metrics::CommandKind, + tag: CommandKind, // How long it waits on latches. // latch_timer: Option, latch_timer: Instant, @@ -411,8 +414,8 @@ impl Scheduler { fn schedule_command(&self, cmd: Command, callback: StorageCallback) { let cid = self.inner.gen_id(); - debug!("received new command"; "cid" => cid, "cmd" => ?cmd); - + let tracker = get_tls_tracker_token(); + debug!("received new command"; "cid" => cid, "cmd" => ?cmd, "tracker" => ?tracker); let tag = cmd.tag(); let priority_tag = get_priority_tag(cmd.priority()); SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); @@ -421,9 +424,10 @@ impl Scheduler { .inc(); let mut task_slot = self.inner.get_task_slot(cid); - let tctx = task_slot - .entry(cid) - .or_insert_with(|| self.inner.new_task_context(Task::new(cid, cmd), callback)); + let tctx = task_slot.entry(cid).or_insert_with(|| { + self.inner + .new_task_context(Task::new(cid, tracker, cmd), callback) + }); let deadline = tctx.task.as_ref().unwrap().cmd.deadline(); if self.inner.latches.acquire(&mut tctx.lock, cid) { fail_point!("txn_scheduler_acquire_success"); @@ -494,6 +498,7 @@ impl Scheduler { /// Executes the task in the sched pool. 
fn execute(&self, mut task: Task) { + set_tls_tracker_token(task.tracker); let sched = self.clone(); self.get_sched_pool(task.cmd.priority()) .pool @@ -537,6 +542,7 @@ impl Scheduler { debug!( "process cmd with snapshot"; "cid" => task.cid, "term" => ?term, "extra_op" => ?extra_op, + "trakcer" => ?task.tracker ); sched.process(snapshot, task).await; } @@ -575,7 +581,7 @@ impl Scheduler { /// /// If a next command is present, continues to execute; otherwise, delivers the result to the /// callback. - fn on_read_finished(&self, cid: u64, pr: ProcessResult, tag: metrics::CommandKind) { + fn on_read_finished(&self, cid: u64, pr: ProcessResult, tag: CommandKind) { SCHED_STAGE_COUNTER_VEC.get(tag).read_finish.inc(); debug!("read command finished"; "cid" => cid); @@ -599,7 +605,7 @@ impl Scheduler { lock_guards: Vec, pipelined: bool, async_apply_prewrite: bool, - tag: metrics::CommandKind, + tag: CommandKind, ) { // TODO: Does async apply prewrite worth a special metric here? if pipelined { @@ -674,8 +680,8 @@ impl Scheduler { cid: u64, cb: StorageCallback, pr: ProcessResult, - tag: metrics::CommandKind, - stage: metrics::CommandStageKind, + tag: CommandKind, + stage: CommandStageKind, ) { debug!("early return response"; "cid" => cid); SCHED_STAGE_COUNTER_VEC.get(tag).get(stage).inc(); @@ -745,10 +751,13 @@ impl Scheduler { let tag = task.cmd.tag(); let begin_instant = Instant::now(); - let pr = task - .cmd - .process_read(snapshot, statistics) - .unwrap_or_else(|e| ProcessResult::Failed { err: e.into() }); + let cmd = task.cmd; + let pr = unsafe { + with_perf_context::(tag, || { + cmd.process_read(snapshot, statistics) + .unwrap_or_else(|e| ProcessResult::Failed { err: e.into() }) + }) + }; SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(tag) .observe(begin_instant.saturating_elapsed_secs()); @@ -783,10 +792,13 @@ impl Scheduler { async_apply_prewrite: self.inner.enable_async_apply_prewrite, }; let begin_instant = Instant::now(); - let res = task - .cmd - 
.process_write(snapshot, context) - .map_err(StorageError::from); + let res = unsafe { + with_perf_context::(tag, || { + task.cmd + .process_write(snapshot, context) + .map_err(StorageError::from) + }) + }; SCHED_PROCESSING_READ_HISTOGRAM_STATIC .get(tag) .observe(begin_instant.saturating_elapsed_secs()); @@ -904,7 +916,7 @@ impl Scheduler { cb.unwrap(), pr.unwrap(), tag, - metrics::CommandStageKind::async_apply_prewrite, + CommandStageKind::async_apply_prewrite, ); }); is_async_apply_prewrite = true; @@ -934,7 +946,7 @@ impl Scheduler { cb.unwrap(), pr.unwrap(), tag, - metrics::CommandStageKind::pipelined_write, + CommandStageKind::pipelined_write, ); }); (Some(proposed_cb), None) From c1a09b83f6da437f49758dc713150ebc0da5fcb9 Mon Sep 17 00:00:00 2001 From: Jay Date: Sat, 18 Jun 2022 01:28:35 -0700 Subject: [PATCH 030/676] raftstorev2: add basic layout (#12843) ref tikv/tikv#12842 This is an attempt to reimplement raftstore using the new assumptions that peer's range can be overlapped. Currently, compatability is not considered, though we may think about how to migrate from old version by the end of this year. No concrete implementations is added yet, we may choose reuse implementation from v1 or implementing new logic base on actual requirement. The principle is 1. do not introduce history debt while reusing code as much as possible. 2. do not change the current implementations. 
Signed-off-by: Jay Lee --- Cargo.lock | 18 ++ Cargo.toml | 1 + components/raftstore-v2/Cargo.toml | 39 +++ components/raftstore-v2/src/fsm/apply.rs | 1 + components/raftstore-v2/src/fsm/mod.rs | 5 + components/raftstore-v2/src/fsm/peer.rs | 22 ++ components/raftstore-v2/src/fsm/store.rs | 1 + components/raftstore-v2/src/lib.rs | 19 ++ components/raftstore-v2/src/operation/mod.rs | 1 + components/raftstore-v2/src/raft/mod.rs | 7 + components/raftstore-v2/src/raft/peer.rs | 70 +++++ components/raftstore-v2/src/raft/storage.rs | 56 ++++ components/raftstore-v2/src/router/message.rs | 293 ++++++++++++++++++ components/raftstore-v2/src/router/mod.rs | 5 + components/raftstore/src/store/peer.rs | 3 +- 15 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 components/raftstore-v2/Cargo.toml create mode 100644 components/raftstore-v2/src/fsm/apply.rs create mode 100644 components/raftstore-v2/src/fsm/mod.rs create mode 100644 components/raftstore-v2/src/fsm/peer.rs create mode 100644 components/raftstore-v2/src/fsm/store.rs create mode 100644 components/raftstore-v2/src/lib.rs create mode 100644 components/raftstore-v2/src/operation/mod.rs create mode 100644 components/raftstore-v2/src/raft/mod.rs create mode 100644 components/raftstore-v2/src/raft/peer.rs create mode 100644 components/raftstore-v2/src/raft/storage.rs create mode 100644 components/raftstore-v2/src/router/message.rs create mode 100644 components/raftstore-v2/src/router/mod.rs diff --git a/Cargo.lock b/Cargo.lock index f94f088e563..82978c6cbf8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4162,6 +4162,24 @@ dependencies = [ "yatp", ] +[[package]] +name = "raftstore-v2" +version = "0.1.0" +dependencies = [ + "collections", + "crossbeam", + "engine_traits", + "error_code", + "kvproto", + "pd_client", + "raft", + "raft-proto", + "raftstore", + "slog", + "smallvec", + "tikv_util", +] + [[package]] name = "rand" version = "0.4.6" diff --git a/Cargo.toml b/Cargo.toml index a1c1f315de3..e58963c694d 
100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -247,6 +247,7 @@ members = [ "components/panic_hook", "components/pd_client", "components/raftstore", + "components/raftstore-v2", "components/resolved_ts", "components/resource_metering", "components/server", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml new file mode 100644 index 00000000000..56d08c6a6b6 --- /dev/null +++ b/components/raftstore-v2/Cargo.toml @@ -0,0 +1,39 @@ +[package] +name = "raftstore-v2" +version = "0.1.0" +edition = "2021" + +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +failpoints = ["raftstore/failpoints"] +testexport = ["raftstore/testexport"] +test-engine-kv-rocksdb = [ + "raftstore/test-engine-kv-rocksdb" +] +test-engine-raft-raft-engine = [ + "raftstore/test-engine-raft-raft-engine" +] +test-engines-rocksdb = [ + "raftstore/test-engines-rocksdb", +] +test-engines-panic = [ + "raftstore/test-engines-panic", +] + +cloud-aws = ["raftstore/cloud-aws"] +cloud-gcp = ["raftstore/cloud-gcp"] +cloud-azure = ["raftstore/cloud-azure"] + +[dependencies] +collections = { path = "../collections" } +crossbeam = "0.8" +engine_traits = { path = "../engine_traits" } +error_code = { path = "../error_code" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +pd_client = { path = "../pd_client" } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +raft-proto = { version = "0.7.0" } +raftstore = { path = "../raftstore" } +slog = "2.3" +smallvec = "1.4" +tikv_util = { path = "../tikv_util", default-features = false } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs new file mode 100644 index 00000000000..bb3db8c75d3 --- /dev/null +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -0,0 +1 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs new file mode 100644 index 00000000000..275313cbfb3 --- /dev/null +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod apply; +mod peer; +mod store; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs new file mode 100644 index 00000000000..5eaacf3e200 --- /dev/null +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -0,0 +1,22 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::RaftEngine; +use kvproto::metapb; +use raftstore::store::Config; +use slog::Logger; + +use crate::{raft::Peer, Result}; + +pub struct PeerFsm { + peer: Peer, +} + +impl PeerFsm { + pub fn new(peer: Peer) -> Result { + Ok(PeerFsm { peer }) + } + + pub fn logger(&self) -> &Logger { + self.peer.logger() + } +} diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs new file mode 100644 index 00000000000..bb3db8c75d3 --- /dev/null +++ b/components/raftstore-v2/src/fsm/store.rs @@ -0,0 +1 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs new file mode 100644 index 00000000000..98c72ca7632 --- /dev/null +++ b/components/raftstore-v2/src/lib.rs @@ -0,0 +1,19 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! Raftstore is the place where we implement multi-raft. +//! +//! The thread module of raftstore is batch-system, more check components/batch-system. +//! All state machines are defined in [`fsm`] module. Everything that wrapping raft is +//! implemented in [`raft`] module. And the commands are implemented in [`operation`] module. +//! All state machines are expected to communicate with messages. They are defined in +//! [`router`] module. 
+ +#![allow(unused)] + +mod fsm; +mod operation; +mod raft; +mod router; + +pub use raftstore::{Error, Result}; +pub use router::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs new file mode 100644 index 00000000000..bb3db8c75d3 --- /dev/null +++ b/components/raftstore-v2/src/operation/mod.rs @@ -0,0 +1 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs new file mode 100644 index 00000000000..7fd128d6788 --- /dev/null +++ b/components/raftstore-v2/src/raft/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod peer; +mod storage; + +pub use peer::Peer; +pub use storage::Storage; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs new file mode 100644 index 00000000000..4af2c1ccddb --- /dev/null +++ b/components/raftstore-v2/src/raft/peer.rs @@ -0,0 +1,70 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::RaftEngine; +use kvproto::metapb; +use raft::RawNode; +use raftstore::store::Config; +use slog::{o, Logger}; +use tikv_util::{box_err, config::ReadableSize}; + +use super::storage::Storage; +use crate::Result; + +/// A peer that delegates commands between state machine and raft. 
+pub struct Peer { + region_id: u64, + peer: metapb::Peer, + raft_group: RawNode>, + logger: Logger, +} + +impl Peer { + pub fn new( + cfg: &Config, + store_id: u64, + region: metapb::Region, + engine: ER, + logger: Logger, + ) -> Result { + let peer = region + .get_peers() + .iter() + .find(|p| p.get_store_id() == store_id && p.get_id() != raft::INVALID_ID); + let peer = match peer { + Some(p) => p, + None => return Err(box_err!("no valid peer found in {:?}", region.get_peers())), + }; + let l = logger.new(o!("peer_id" => peer.id)); + + let ps = Storage::new(engine, l.clone()); + + let applied_index = ps.applied_index(); + + let raft_cfg = raft::Config { + id: peer.get_id(), + election_tick: cfg.raft_election_timeout_ticks, + heartbeat_tick: cfg.raft_heartbeat_ticks, + min_election_tick: cfg.raft_min_election_timeout_ticks, + max_election_tick: cfg.raft_max_election_timeout_ticks, + max_size_per_msg: cfg.raft_max_size_per_msg.0, + max_inflight_msgs: cfg.raft_max_inflight_msgs, + applied: applied_index, + check_quorum: true, + skip_bcast_commit: true, + pre_vote: cfg.prevote, + max_committed_size_per_ready: ReadableSize::mb(16).0, + ..Default::default() + }; + + Ok(Peer { + region_id: region.get_id(), + peer: peer.clone(), + raft_group: RawNode::new(&raft_cfg, ps, &logger)?, + logger: l, + }) + } + + pub fn logger(&self) -> &Logger { + &self.logger + } +} diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs new file mode 100644 index 00000000000..f999c6890d8 --- /dev/null +++ b/components/raftstore-v2/src/raft/storage.rs @@ -0,0 +1,56 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::RaftEngine; +use raft::{ + eraftpb::{Entry, Snapshot}, + GetEntriesContext, RaftState, +}; +use slog::Logger; + +/// A storage for raft. 
+pub struct Storage { + engine: ER, + logger: Logger, +} + +impl Storage { + pub fn new(engine: ER, logger: Logger) -> Storage { + Storage { engine, logger } + } + + pub fn applied_index(&self) -> u64 { + unimplemented!() + } +} + +impl raft::Storage for Storage { + fn initial_state(&self) -> raft::Result { + unimplemented!() + } + + fn entries( + &self, + low: u64, + high: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> raft::Result> { + unimplemented!() + } + + fn term(&self, idx: u64) -> raft::Result { + unimplemented!() + } + + fn first_index(&self) -> raft::Result { + unimplemented!() + } + + fn last_index(&self) -> raft::Result { + unimplemented!() + } + + fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + unimplemented!() + } +} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs new file mode 100644 index 00000000000..1ab85608034 --- /dev/null +++ b/components/raftstore-v2/src/router/message.rs @@ -0,0 +1,293 @@ +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +// #[PerformanceCriticalPath] +use std::{fmt, marker::PhantomData}; + +use engine_traits::{KvEngine, Snapshot}; +use kvproto::{ + kvrpcpb::ExtraOp as TxnExtraOp, + metapb, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, +}; +use raftstore::store::{ + fsm::ApplyTaskRes, metrics::RaftEventDurationType, InspectedRaftMessage, RegionSnapshot, +}; +use tikv_util::{memory::HeapSize, time::Instant}; + +pub struct WriteResponseChannel; + +impl WriteResponseChannel { + /// Called after a request is proposed to the raft group successfully. It's + /// used to notify the caller to move on early because it's very likely the + /// request will be applied to the raftstore. + pub fn notify_proposed(&self) {} + + /// Called after a request is committed and before it's being applied, and + /// it's guaranteed that the request will be successfully applied soon. 
+ pub fn notify_committed(&self) {} + + pub fn notify_applied(&self, _res: Result<(), RaftCmdResponse>) {} +} + +pub struct ReadResponseChannel { + _snap: PhantomData, +} + +pub struct ReadResponse { + pub snapshot: RegionSnapshot, + // What is this? + pub txn_extra_op: TxnExtraOp, +} + +impl ReadResponseChannel { + pub fn notify_read(&self, _res: Result, RaftCmdResponse>) {} +} + +// This is only necessary because of seeming limitations in derive(Clone) w/r/t +// generics. If it can be deleted in the future in favor of derive, it should +// be. +impl Clone for ReadResponse +where + S: Snapshot, +{ + fn clone(&self) -> ReadResponse { + ReadResponse { + snapshot: self.snapshot.clone(), + txn_extra_op: self.txn_extra_op, + } + } +} + +/// Variants of channels for `Msg`. +/// - `Read`: a channel for read only requests including `StatusRequest`, +/// `GetRequest` and `SnapRequest` +/// - `Write`: a channel for write only requests including `AdminRequest` +/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +/// Prefer channel rather than callback because: +/// 1. channel can be reused, hence reduce allocations. +/// 2. channel may not need dynamic dispatch. +/// 3. caller can use async fashion. +/// 4. there will be no callback leak. +pub enum ResponseChannel { + /// No callback. + None, + /// Read callback. + Read(ReadResponseChannel), + /// Write callback. 
+ Write(WriteResponseChannel), +} + +impl HeapSize for ResponseChannel {} + +impl ResponseChannel +where + S: Snapshot, +{ + pub fn notify_applied(self, resp: RaftCmdResponse) { + match self { + ResponseChannel::None => (), + ResponseChannel::Read(read) => { + read.notify_read(Err(resp)); + } + ResponseChannel::Write(write) => { + write.notify_applied(Err(resp)); + } + } + } + + pub fn notify_proposed(&mut self) { + if let ResponseChannel::Write(write) = self { + write.notify_proposed(); + } + } + + pub fn notify_committed(&mut self) { + if let ResponseChannel::Write(write) = self { + write.notify_committed(); + } + } + + pub fn invoke_read(self, args: ReadResponse) { + match self { + ResponseChannel::Read(read) => read.notify_read(Ok(args)), + other => panic!("expect Callback::Read(..), got {:?}", other), + } + } + + pub fn is_none(&self) -> bool { + matches!(self, ResponseChannel::None) + } +} + +impl fmt::Debug for ResponseChannel +where + S: Snapshot, +{ + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ResponseChannel::None => write!(fmt, "Callback::None"), + ResponseChannel::Read(_) => write!(fmt, "Callback::Read(..)"), + ResponseChannel::Write { .. 
} => write!(fmt, "Callback::Write(..)"), + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum PeerTick { + Raft = 0, + RaftLogGc = 1, + SplitRegionCheck = 2, + PdHeartbeat = 3, + CheckMerge = 4, + CheckPeerStaleState = 5, + EntryCacheEvict = 6, + CheckLeaderLease = 7, + ReactivateMemoryLock = 8, + ReportBuckets = 9, +} + +impl PeerTick { + pub const VARIANT_COUNT: usize = Self::get_all_ticks().len(); + + #[inline] + pub fn tag(self) -> &'static str { + match self { + PeerTick::Raft => "raft", + PeerTick::RaftLogGc => "raft_log_gc", + PeerTick::SplitRegionCheck => "split_region_check", + PeerTick::PdHeartbeat => "pd_heartbeat", + PeerTick::CheckMerge => "check_merge", + PeerTick::CheckPeerStaleState => "check_peer_stale_state", + PeerTick::EntryCacheEvict => "entry_cache_evict", + PeerTick::CheckLeaderLease => "check_leader_lease", + PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", + PeerTick::ReportBuckets => "report_buckets", + } + } + + pub const fn get_all_ticks() -> &'static [PeerTick] { + const TICKS: &[PeerTick] = &[ + PeerTick::Raft, + PeerTick::RaftLogGc, + PeerTick::SplitRegionCheck, + PeerTick::PdHeartbeat, + PeerTick::CheckMerge, + PeerTick::CheckPeerStaleState, + PeerTick::EntryCacheEvict, + PeerTick::CheckLeaderLease, + PeerTick::ReactivateMemoryLock, + PeerTick::ReportBuckets, + ]; + TICKS + } +} + +#[derive(Debug, Clone, Copy)] +pub enum StoreTick { + // No CompactLock and CompactCheck as they should be implemented by peer itself. 
+ PdStoreHeartbeat, + SnapGc, + ConsistencyCheck, + CleanupImportSst, +} + +impl StoreTick { + #[inline] + pub fn tag(self) -> RaftEventDurationType { + match self { + StoreTick::PdStoreHeartbeat => RaftEventDurationType::pd_store_heartbeat, + StoreTick::SnapGc => RaftEventDurationType::snap_gc, + StoreTick::ConsistencyCheck => RaftEventDurationType::consistency_check, + StoreTick::CleanupImportSst => RaftEventDurationType::cleanup_import_sst, + } + } +} + +/// Raft command is the command that is expected to be proposed by the +/// leader of the target raft group. +#[derive(Debug)] +pub struct RaftCommand { + pub send_time: Instant, + pub request: RaftCmdRequest, + pub ch: ResponseChannel, +} + +impl RaftCommand { + #[inline] + pub fn new(request: RaftCmdRequest, ch: ResponseChannel) -> RaftCommand { + RaftCommand { + request, + ch, + send_time: Instant::now(), + } + } +} + +/// Message that can be sent to a peer. +pub enum PeerMsg { + /// Raft message is the message sent between raft nodes in the same + /// raft group. Messages need to be redirected to raftstore if target + /// peer doesn't exist. + RaftMessage(InspectedRaftMessage), + /// Raft command is the command that is expected to be proposed by the + /// leader of the target raft group. If it's failed to be sent, callback + /// usually needs to be called before dropping in case of resource leak. + RaftCommand(RaftCommand), + /// Tick is periodical task. If target peer doesn't exist there is a potential + /// that the raft node will not work anymore. + Tick(PeerTick), + /// Result of applying committed entries. The message can't be lost. + ApplyRes { + res: ApplyTaskRes, + }, + /// Start the FSM. + Start, + /// A message only used to notify a peer. 
+ Noop, + Persisted { + peer_id: u64, + ready_number: u64, + }, +} + +impl fmt::Debug for PeerMsg { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), + PeerMsg::Tick(tick) => write! { + fmt, + "{:?}", + tick + }, + PeerMsg::ApplyRes { res } => write!(fmt, "ApplyRes {:?}", res), + PeerMsg::Start => write!(fmt, "Startup"), + PeerMsg::Noop => write!(fmt, "Noop"), + PeerMsg::Persisted { + peer_id, + ready_number, + } => write!( + fmt, + "Persisted peer_id {}, ready_number {}", + peer_id, ready_number + ), + } + } +} + +pub enum StoreMsg { + RaftMessage(InspectedRaftMessage), + Tick(StoreTick), + Start { store: metapb::Store }, +} + +impl fmt::Debug for StoreMsg { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), + StoreMsg::Start { ref store } => write!(fmt, "Start store {:?}", store), + } + } +} diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs new file mode 100644 index 00000000000..fd27349ef43 --- /dev/null +++ b/components/raftstore-v2/src/router/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod message; + +pub use message::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 374df821b9b..73e1a6ecb50 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -109,7 +109,8 @@ use crate::{ const SHRINK_CACHE_CAPACITY: usize = 64; const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; // 1s const REGION_READ_PROGRESS_CAP: usize = 128; -const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; +#[doc(hidden)] +pub const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; /// The returned states of the peer after checking whether it is stale #[derive(Debug, PartialEq, Eq)] From cead3f5685f58d6dbf3db5a1a47493f5c8faa19e Mon Sep 17 00:00:00 2001 From: Ziheng Gan Date: Mon, 20 Jun 2022 12:48:36 +0800 Subject: [PATCH 031/676] *: fix thread name truncating issue (#12442) ref tikv/tikv#5593, close tikv/tikv#12451 Based on #5593, add a global hashmap recording relationship between thread id and thread name; add some wrappers to spawn threads and update the hashmap. It's necessary to use `after_start_wrapper` and `before_stop_wrapper` together. Otherwise it may cause reporting a wrong thread name if a thread inserts its name to hashmap and doesn't remove it, while another thread reuses the same tid and doesn't update the hashmap. 
Signed-off-by: GanZiheng Co-authored-by: Ti Chi Robot --- clippy.toml | 9 + cmd/tikv-ctl/src/main.rs | 4 +- components/backup-stream/src/endpoint.rs | 5 +- components/backup/src/utils.rs | 6 +- components/batch-system/src/batch.rs | 7 +- components/cdc/src/endpoint.rs | 5 + components/cdc/src/initializer.rs | 7 +- components/encryption/src/manager/mod.rs | 4 +- components/encryption/src/master_key/kms.rs | 3 + components/file_system/src/io_stats/mod.rs | 6 +- components/pd_client/src/tso.rs | 4 +- .../raftstore/src/store/async_io/write.rs | 12 +- components/raftstore/src/store/worker/pd.rs | 3 +- .../src/store/worker/refresh_config.rs | 6 +- components/resolved_ts/src/advance.rs | 6 +- components/resolved_ts/src/scanner.rs | 4 +- components/server/src/server.rs | 9 +- components/test_raftstore/src/server.rs | 3 + components/test_util/src/lib.rs | 3 +- components/tikv_util/src/lib.rs | 4 +- .../tikv_util/src/metrics/threads_linux.rs | 16 +- components/tikv_util/src/sys/thread.rs | 170 +++++++++++++++++- components/tikv_util/src/time.rs | 3 +- components/tikv_util/src/timer.rs | 9 +- components/tikv_util/src/worker/future.rs | 3 +- components/tikv_util/src/yatp_pool/mod.rs | 4 +- scripts/clippy | 1 + src/import/sst_service.rs | 5 +- src/server/debug.rs | 7 +- src/server/gc_worker/gc_manager.rs | 11 +- src/server/load_statistics/linux.rs | 4 +- src/server/reset_to_version.rs | 3 +- src/server/server.rs | 5 + src/server/service/kv.rs | 5 +- src/server/snap.rs | 5 +- src/server/status_server/mod.rs | 5 +- src/storage/txn/flow_controller.rs | 7 +- 37 files changed, 313 insertions(+), 60 deletions(-) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 00000000000..2a4bb3e82b2 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,9 @@ +disallowed-methods = [ + { path = "std::thread::Builder::spawn", reason = "Wrapper function `::spawn_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." 
}, + + { path = "tokio::runtime::builder::Builder::on_thread_start", reason = "Wrapper function `::after_start_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + { path = "tokio::runtime::builder::Builder::on_thread_stop", reason = "Wrapper function `::before_stop_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + + { path = "futures_executor::thread_pool::ThreadPoolBuilder::after_start", reason = "Wrapper function `::after_start_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, + { path = "futures_executor::thread_pool::ThreadPoolBuilder::before_stop", reason = "Wrapper function `::before_stop_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, +] \ No newline at end of file diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 8ada0c7a426..3ad066df491 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -45,7 +45,7 @@ use regex::Regex; use security::{SecurityConfig, SecurityManager}; use structopt::{clap::ErrorKind, StructOpt}; use tikv::{config::TiKvConfig, server::debug::BottommostLevelCompaction}; -use tikv_util::{escape, run_and_wait_child_process, unescape}; +use tikv_util::{escape, run_and_wait_child_process, sys::thread::StdThreadBuildWrapper, unescape}; use txn_types::Key; use crate::{cmd::*, executor::*, util::*}; @@ -604,7 +604,7 @@ fn compact_whole_cluster( let cfs: Vec = cfs.iter().map(|cf| cf.to_string()).collect(); let h = thread::Builder::new() .name(format!("compact-{}", addr)) - .spawn(move || { + .spawn_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); let debug_executor = new_debug_executor(&cfg, None, false, Some(&addr), mgr); for cf in cfs { diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 470ee53bb87..1c1efdcb546 100644 --- 
a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -29,6 +29,7 @@ use tikv_util::{ box_err, config::ReadableDuration, debug, defer, info, + sys::thread::ThreadBuildWrapper, time::Instant, warn, worker::{Runnable, Scheduler}, @@ -1016,10 +1017,10 @@ fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult TokioResu .thread_name(thread_name) .enable_io() .enable_time() - .on_thread_start(|| { + .after_start_wrapper(|| { tikv_alloc::add_thread_memory_accessor(); file_system::set_io_type(IOType::Export); }) - .on_thread_stop(|| { + .before_stop_wrapper(|| { tikv_alloc::remove_thread_memory_accessor(); }) .worker_threads(thread_count) diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 3f8d433aefd..108058ee5f2 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -17,7 +17,10 @@ use std::{ use crossbeam::channel::{self, SendError}; use fail::fail_point; use file_system::{set_io_type, IOType}; -use tikv_util::{debug, error, info, mpsc, safe_panic, thd_name, time::Instant, warn}; +use tikv_util::{ + debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, + time::Instant, warn, +}; use crate::{ config::Config, @@ -581,7 +584,7 @@ where let props = tikv_util::thread_group::current_properties(); let t = thread::Builder::new() .name(name) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); set_io_type(IOType::ForegroundWrite); poller.poll(); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 9b1b663b207..7ca640ac8b3 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -41,6 +41,7 @@ use security::SecurityManager; use tikv::{config::CdcConfig, storage::Statistics}; use tikv_util::{ debug, error, impl_display_as_debug, info, + sys::thread::ThreadBuildWrapper, time::Limiter, timer::SteadyTimer, warn, 
@@ -373,12 +374,16 @@ impl, E: KvEngine> Endpoint { let workers = Builder::new_multi_thread() .thread_name("cdcwkr") .worker_threads(config.incremental_scan_threads) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); let tso_worker = Builder::new_multi_thread() .thread_name("tso") .worker_threads(config.tso_worker_threads) .enable_time() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 6b80a8c21a0..a5dcf094acf 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -564,7 +564,10 @@ mod tests { }, TestEngineBuilder, }; - use tikv_util::worker::{LazyWorker, Runnable}; + use tikv_util::{ + sys::thread::ThreadBuildWrapper, + worker::{LazyWorker, Runnable}, + }; use tokio::runtime::{Builder, Runtime}; use super::*; @@ -608,6 +611,8 @@ mod tests { let pool = Builder::new_multi_thread() .thread_name("test-initializer-worker") .worker_threads(4) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); let downstream_state = Arc::new(AtomicCell::new(DownstreamState::Initializing)); diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0535cae16f1..bc4b97de7a2 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -17,7 +17,7 @@ use fail::fail_point; use file_system::File; use kvproto::encryptionpb::{DataKey, EncryptionMethod, FileDictionary, FileInfo, KeyDictionary}; use protobuf::Message; -use tikv_util::{box_err, debug, error, info, thd_name, warn}; +use tikv_util::{box_err, debug, error, info, sys::thread::StdThreadBuildWrapper, thd_name, warn}; use crate::{ config::EncryptionConfig, @@ -557,7 +557,7 @@ impl DataKeyManager { let (rotate_terminal, rx) = channel::bounded(1); let background_worker = std::thread::Builder::new() .name(thd_name!("enc:key")) - .spawn(move || { + 
.spawn_wrapper(move || { run_background_rotate_work(dict_clone, method, &*master_key, rx); })?; diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index 601c982a961..da1b6d80e0a 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -8,6 +8,7 @@ use kvproto::encryptionpb::EncryptedContent; use tikv_util::{ box_err, error, stream::{retry, with_timeout}, + sys::thread::ThreadBuildWrapper, }; use tokio::runtime::{Builder, Runtime}; @@ -81,6 +82,8 @@ impl KmsBackend { Builder::new_current_thread() .thread_name("kms-runtime") .enable_all() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build()?, ); diff --git a/components/file_system/src/io_stats/mod.rs b/components/file_system/src/io_stats/mod.rs index f0e644ad4a4..d9c7ae9d519 100644 --- a/components/file_system/src/io_stats/mod.rs +++ b/components/file_system/src/io_stats/mod.rs @@ -45,6 +45,8 @@ pub use proc::*; #[cfg(test)] mod tests { + use tikv_util::sys::thread::StdThreadBuildWrapper; + use super::*; use crate::IOType; @@ -54,7 +56,7 @@ mod tests { let _ths = (0..8) .map(|_| { let tx_clone = tx.clone(); - std::thread::Builder::new().spawn(move || { + std::thread::Builder::new().spawn_wrapper(move || { set_io_type(IOType::ForegroundWrite); tx_clone.send(()).unwrap(); }) @@ -72,7 +74,7 @@ mod tests { let _ths = (0..8) .map(|_| { let tx_clone = tx.clone(); - std::thread::Builder::new().spawn(move || { + std::thread::Builder::new().spawn_wrapper(move || { set_io_type(IOType::ForegroundWrite); tx_clone.send(()).unwrap(); }) diff --git a/components/pd_client/src/tso.rs b/components/pd_client/src/tso.rs index ff951a3c77c..6c99e87e4e7 100644 --- a/components/pd_client/src/tso.rs +++ b/components/pd_client/src/tso.rs @@ -21,7 +21,7 @@ use futures::{ }; use grpcio::{CallOption, WriteFlags}; use kvproto::pdpb::{PdClient, TsoRequest, TsoResponse}; -use tikv_util::{box_err, info}; +use tikv_util::{box_err, 
info, sys::thread::StdThreadBuildWrapper}; use tokio::sync::{mpsc, oneshot, watch}; use txn_types::TimeStamp; @@ -61,7 +61,7 @@ impl TimestampOracle { // Start a background thread to handle TSO requests and responses thread::Builder::new() .name("tso-worker".into()) - .spawn(move || { + .spawn_wrapper(move || { block_on(run_tso( cluster_id, rpc_sender.sink_err_into(), diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 373b64134d3..f81160d689d 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -27,7 +27,9 @@ use raft::eraftpb::Entry; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, - debug, info, slow_log, thd_name, + debug, info, slow_log, + sys::thread::StdThreadBuildWrapper, + thd_name, time::{duration_to_sec, Instant}, warn, }; @@ -692,9 +694,11 @@ where cfg, ); info!("starting store writer {}", i); - let t = thread::Builder::new().name(thd_name!(tag)).spawn(move || { - worker.run(); - })?; + let t = thread::Builder::new() + .name(thd_name!(tag)) + .spawn_wrapper(move || { + worker.run(); + })?; self.writers.push(tx); self.handlers.push(t); } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 44954ba5e01..648e8e9344e 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -39,6 +39,7 @@ use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecord use tikv_util::{ box_err, debug, error, info, metrics::ThreadInfoStatistics, + sys::thread::StdThreadBuildWrapper, thd_name, time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, @@ -537,7 +538,7 @@ where } let h = Builder::new() .name(thd_name!("stats-monitor")) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); let mut thread_stats = 
ThreadInfoStatistics::new(); diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 4ad92d5db68..d3681654975 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -7,7 +7,9 @@ use std::{ use batch_system::{BatchRouter, Fsm, FsmTypes, HandlerBuilder, Poller, PoolState, Priority}; use file_system::{set_io_type, IOType}; -use tikv_util::{debug, error, info, safe_panic, thd_name, worker::Runnable}; +use tikv_util::{ + debug, error, info, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, worker::Runnable, +}; use crate::store::fsm::{ apply::{ApplyFsm, ControlFsm}, @@ -70,7 +72,7 @@ where name_prefix, i + self.state.id_base, ))) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); set_io_type(IOType::ForegroundWrite); poller.poll(); diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index c438c4c53fa..ef683724429 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -24,7 +24,9 @@ use pd_client::PdClient; use protobuf::Message; use raftstore::store::{fsm::StoreMeta, util::RegionReadProgressRegistry}; use security::SecurityManager; -use tikv_util::{info, time::Instant, timer::SteadyTimer, worker::Scheduler}; +use tikv_util::{ + info, sys::thread::ThreadBuildWrapper, time::Instant, timer::SteadyTimer, worker::Scheduler, +}; use tokio::{ runtime::{Builder, Runtime}, sync::Mutex, @@ -65,6 +67,8 @@ impl AdvanceTsWorker { .thread_name("advance-ts") .worker_threads(1) .enable_time() + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(); Self { diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index c52bf3bf166..835de79c161 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -19,7 
+19,7 @@ use tikv::storage::{ mvcc::{DeltaScanner, MvccReader, ScannerBuilder}, txn::{TxnEntry, TxnEntryScanner}, }; -use tikv_util::{time::Instant, timer::GLOBAL_TIMER_HANDLE}; +use tikv_util::{sys::thread::ThreadBuildWrapper, time::Instant, timer::GLOBAL_TIMER_HANDLE}; use tokio::runtime::{Builder, Runtime}; use txn_types::{Key, Lock, LockType, TimeStamp}; @@ -74,6 +74,8 @@ impl, E: KvEngine> ScannerPool { Builder::new_multi_thread() .thread_name("inc-scan") .worker_threads(count) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index f1fd2167f9d..6bf1de8e7a7 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -113,7 +113,10 @@ use tikv_util::{ }; use tokio::runtime::Builder; -use crate::{memory::*, raft_engine_switch::*, setup::*, signal_handler}; +use crate::{ + memory::*, raft_engine_switch::*, setup::*, signal_handler, + tikv_util::sys::thread::ThreadBuildWrapper, +}; #[inline] fn run_impl(config: TiKvConfig) { @@ -622,11 +625,11 @@ impl TiKvServer { Builder::new_multi_thread() .thread_name(thd_name!("debugger")) .worker_threads(1) - .on_thread_start(move || { + .after_start_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); tikv_util::thread_group::set_properties(props.clone()); }) - .on_thread_stop(tikv_alloc::remove_thread_memory_accessor) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) .build() .unwrap(), ); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index d156ab77adb..981843ddfc6 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -66,6 +66,7 @@ use tikv::{ use tikv_util::{ config::VersionTrack, quota_limiter::QuotaLimiter, + sys::thread::ThreadBuildWrapper, time::ThreadReadId, worker::{Builder as WorkerBuilder, LazyWorker}, HandyRwLock, @@ -448,6 +449,8 @@ impl ServerCluster { 
TokioBuilder::new_multi_thread() .thread_name(thd_name!("debugger")) .worker_threads(1) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ); diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index 9dca2ee2111..dc053bd6d20 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -20,6 +20,7 @@ use std::{ }; use rand::Rng; +use tikv_util::sys::thread::StdThreadBuildWrapper; pub use crate::{ encryption::*, @@ -36,7 +37,7 @@ pub fn setup_for_ci() { // of time to avoid causing timeout. thread::Builder::new() .name(tikv_util::thd_name!("backtrace-loader")) - .spawn(::backtrace::Backtrace::new) + .spawn_wrapper(::backtrace::Backtrace::new) .unwrap(); if env::var("CI").is_ok() { diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index b3cc43c550a..8445a0a97aa 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -32,6 +32,8 @@ use nix::{ }; use rand::rngs::ThreadRng; +use crate::sys::thread::StdThreadBuildWrapper; + #[macro_use] pub mod log; pub mod buffer_vec; @@ -468,7 +470,7 @@ pub fn set_panic_hook(panic_abort: bool, data_dir: &str) { // Caching is slow, spawn it in another thread to speed up. thread::Builder::new() .name(thd_name!("backtrace-loader")) - .spawn(::backtrace::Backtrace::new) + .spawn_wrapper(::backtrace::Backtrace::new) .unwrap(); let data_dir = data_dir.to_string(); diff --git a/components/tikv_util/src/metrics/threads_linux.rs b/components/tikv_util/src/metrics/threads_linux.rs index 8ee9aed05f5..4eae41b0f06 100644 --- a/components/tikv_util/src/metrics/threads_linux.rs +++ b/components/tikv_util/src/metrics/threads_linux.rs @@ -15,7 +15,7 @@ use prometheus::{ }; use crate::{ - sys::thread::{self, Pid}, + sys::thread::{self, Pid, THREAD_NAME_HASHMAP}, time::Instant, }; @@ -150,7 +150,12 @@ impl Collector for ThreadsCollector { // Threads CPU time. 
let total = thread::linux::cpu_total(&stat); // sanitize thread name before push metrics. - let name = sanitize_thread_name(tid, &stat.command); + let name = if let Some(thread_name) = THREAD_NAME_HASHMAP.lock().unwrap().get(&tid) + { + sanitize_thread_name(tid, thread_name) + } else { + sanitize_thread_name(tid, &stat.command) + }; let cpu_total = metrics .cpu_totals .get_metric_with_label_values(&[&name, &format!("{}", tid)]) @@ -471,6 +476,7 @@ mod tests { use std::{env::temp_dir, fs, io::Write, sync, time::Duration}; use super::*; + use crate::sys::thread::StdThreadBuildWrapper; #[test] fn test_thread_stat_io() { @@ -479,7 +485,7 @@ mod tests { let (tx1, rx1) = sync::mpsc::channel(); let h = std::thread::Builder::new() .name(name.to_owned()) - .spawn(move || { + .spawn_wrapper(move || { // Make `io::write_bytes` > 0 let mut tmp = temp_dir(); tmp.push(name); @@ -528,7 +534,7 @@ mod tests { let (tx1, rx1) = sync::mpsc::channel(); std::thread::Builder::new() .name(str1.to_owned()) - .spawn(move || { + .spawn_wrapper(move || { tx1.send(()).unwrap(); // Make `io::write_bytes` > 0 @@ -614,7 +620,7 @@ mod tests { let (tx1, rx1) = sync::mpsc::channel(); std::thread::Builder::new() .name(name) - .spawn(move || { + .spawn_wrapper(move || { tx1.send(()).unwrap(); let start = Instant::now(); diff --git a/components/tikv_util/src/sys/thread.rs b/components/tikv_util/src/sys/thread.rs index cc38cc8228f..445fc93974e 100644 --- a/components/tikv_util/src/sys/thread.rs +++ b/components/tikv_util/src/sys/thread.rs @@ -4,7 +4,9 @@ //! Only Linux platform is implemented correctly, for other platform, it only guarantees //! successful compilation. -use std::io; +use std::{io, io::Result, sync::Mutex, thread}; + +use collections::HashMap; /// A cross-platform CPU statistics data structure. 
#[derive(Debug, Copy, Clone, Default, PartialEq)] @@ -361,14 +363,121 @@ pub fn current_thread_stat() -> io::Result { thread_stat(process_id(), thread_id()) } +pub trait StdThreadBuildWrapper { + fn spawn_wrapper(self, f: F) -> io::Result> + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send + 'static; +} + +pub trait ThreadBuildWrapper { + fn after_start_wrapper(&mut self, f: F) -> &mut Self + where + F: Fn() + Send + Sync + 'static; + + fn before_stop_wrapper(&mut self, f: F) -> &mut Self + where + F: Fn() + Send + Sync + 'static; +} + +lazy_static::lazy_static! { + pub static ref THREAD_NAME_HASHMAP: Mutex> = Mutex::new(HashMap::default()); +} + +pub(crate) fn add_thread_name_to_map() { + if let Some(name) = std::thread::current().name() { + let tid = thread_id(); + THREAD_NAME_HASHMAP + .lock() + .unwrap() + .insert(tid, name.to_string()); + debug!("tid {} thread name is {}", tid, name); + } +} + +pub(crate) fn remove_thread_name_from_map() { + let tid = thread_id(); + THREAD_NAME_HASHMAP.lock().unwrap().remove(&tid); +} + +impl StdThreadBuildWrapper for std::thread::Builder { + fn spawn_wrapper(self, f: F) -> Result> + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send + 'static, + { + #[allow(clippy::disallowed_methods)] + self.spawn(|| { + add_thread_name_to_map(); + let res = f(); + remove_thread_name_from_map(); + res + }) + } +} + +impl ThreadBuildWrapper for tokio::runtime::Builder { + fn after_start_wrapper(&mut self, f: F) -> &mut Self + where + F: Fn() + Send + Sync + 'static, + { + #[allow(clippy::disallowed_methods)] + self.on_thread_start(move || { + add_thread_name_to_map(); + f(); + }) + } + + fn before_stop_wrapper(&mut self, f: F) -> &mut Self + where + F: Fn() + Send + Sync + 'static, + { + #[allow(clippy::disallowed_methods)] + self.on_thread_stop(move || { + f(); + remove_thread_name_from_map(); + }) + } +} + +impl ThreadBuildWrapper for futures::executor::ThreadPoolBuilder { + fn after_start_wrapper(&mut self, f: F) -> &mut Self 
+ where + F: Fn() + Send + Sync + 'static, + { + #[allow(clippy::disallowed_methods)] + self.after_start(move |_| { + add_thread_name_to_map(); + f(); + }) + } + + fn before_stop_wrapper(&mut self, f: F) -> &mut Self + where + F: Fn() + Send + Sync + 'static, + { + #[allow(clippy::disallowed_methods)] + self.before_stop(move |_| { + f(); + remove_thread_name_from_map(); + }) + } +} + #[cfg(test)] mod tests { use std::{ collections::HashSet, + sync, sync::{Arc, Condvar, Mutex}, }; + use futures::executor::block_on; + use super::*; + use crate::yatp_pool::{DefaultTicker, YatpPoolBuilder}; #[test] fn test_thread_id() { @@ -427,4 +536,63 @@ mod tests { assert!(!ids.contains(tid)); } } + + #[test] + fn test_thread_name_wrapper() { + let thread_name = "thread_for_test"; + + let (tx, rx) = sync::mpsc::sync_channel(10); + + let get_name = move || { + let tid = thread_id(); + if let Some(name) = THREAD_NAME_HASHMAP.lock().unwrap().get(&tid) { + tx.clone().send(name.to_string()).unwrap(); + } else { + panic!("thread not found"); + } + }; + + // test std thread builder + std::thread::Builder::new() + .name(thread_name.to_string()) + .spawn_wrapper(get_name.clone()) + .unwrap() + .join() + .unwrap(); + + let name = rx.recv().unwrap(); + assert_eq!(name, thread_name); + + // test Yatp + let get_name_fn = get_name.clone(); + block_on( + YatpPoolBuilder::new(DefaultTicker {}) + .name_prefix(thread_name) + .after_start(|| {}) + .before_stop(|| {}) + .build_future_pool() + .spawn_handle(async move { get_name_fn() }) + .unwrap(), + ) + .unwrap(); + + let name = rx.recv().unwrap(); + assert!(name.contains(thread_name)); + + // test tokio thread builder + let get_name_fn = get_name; + block_on( + tokio::runtime::Builder::new_multi_thread() + .thread_name(thread_name) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) + .build() + .unwrap() + .spawn(async move { get_name_fn() }), + ) + .unwrap(); + + let name = rx.recv().unwrap(); + assert_eq!(name, thread_name); + } } diff 
--git a/components/tikv_util/src/time.rs b/components/tikv_util/src/time.rs index c8f210db7a7..57e9e261444 100644 --- a/components/tikv_util/src/time.rs +++ b/components/tikv_util/src/time.rs @@ -148,7 +148,7 @@ impl Monitor { let (tx, rx) = mpsc::channel(); let h = Builder::new() .name(thd_name!("time-monitor")) - .spawn(move || { + .spawn_wrapper(move || { crate::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); while rx.try_recv().is_err() { @@ -205,6 +205,7 @@ use self::inner::monotonic_coarse_now; pub use self::inner::monotonic_now; /// Returns the monotonic raw time since some unspecified starting point. pub use self::inner::monotonic_raw_now; +use crate::sys::thread::StdThreadBuildWrapper; const NANOSECONDS_PER_SECOND: u64 = 1_000_000_000; const MILLISECOND_PER_SECOND: i64 = 1_000; diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index dc74dbb3b43..50cfa48f9aa 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -18,7 +18,10 @@ use tokio_timer::{ Delay, }; -use crate::time::{monotonic_raw_now, Instant}; +use crate::{ + sys::thread::StdThreadBuildWrapper, + time::{monotonic_raw_now, Instant}, +}; pub struct Timer { pending: BinaryHeap>>, @@ -98,7 +101,7 @@ fn start_global_timer() -> Handle { let props = crate::thread_group::current_properties(); Builder::new() .name(thd_name!("timer")) - .spawn(move || { + .spawn_wrapper(move || { crate::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); let mut timer = tokio_timer::Timer::default(); @@ -197,7 +200,7 @@ fn start_global_steady_timer() -> SteadyTimer { let clock_ = clock.clone(); Builder::new() .name(thd_name!("steady-timer")) - .spawn(move || { + .spawn_wrapper(move || { let c = Clock::new_with_now(clock_); let mut timer = tokio_timer::Timer::new_with_now(ParkThread::new(), c); tx.send(timer.handle()).unwrap(); diff --git a/components/tikv_util/src/worker/future.rs 
b/components/tikv_util/src/worker/future.rs index 83b4d95bc58..be7c05589cb 100644 --- a/components/tikv_util/src/worker/future.rs +++ b/components/tikv_util/src/worker/future.rs @@ -16,6 +16,7 @@ use prometheus::IntGauge; use tokio::task::LocalSet; use super::metrics::*; +use crate::sys::thread::StdThreadBuildWrapper; pub struct Stopped(pub T); @@ -156,7 +157,7 @@ impl Worker { let props = crate::thread_group::current_properties(); let h = Builder::new() .name(thd_name!(self.scheduler.name.as_ref())) - .spawn(move || { + .spawn_wrapper(move || { crate::thread_group::set_properties(props); poll(runner, rx) })?; diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 93cd46cc6ac..e2e57c9fbce 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -100,6 +100,7 @@ impl Runner for YatpPoolRunner { type TaskCell = TaskCell; fn start(&mut self, local: &mut Local) { + crate::sys::thread::add_thread_name_to_map(); if let Some(props) = self.props.take() { crate::thread_group::set_properties(Some(props)); } @@ -138,7 +139,8 @@ impl Runner for YatpPoolRunner { } self.ticker.on_tick(); self.inner.end(local); - tikv_alloc::remove_thread_memory_accessor() + tikv_alloc::remove_thread_memory_accessor(); + crate::sys::thread::remove_thread_name_from_map() } } diff --git a/scripts/clippy b/scripts/clippy index f0f46fccfa6..58bdafb817b 100755 --- a/scripts/clippy +++ b/scripts/clippy @@ -33,6 +33,7 @@ CLIPPY_LINTS=(-A clippy::module_inception \ -A clippy::enum_variant_names \ -W clippy::dbg_macro \ -W clippy::todo \ + -D clippy::disallowed-methods \ -D rust-2018-idioms) cargo clippy --workspace \ diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index ac892884e37..24e52a8057e 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -36,6 +36,7 @@ use sst_importer::{error_inc, metrics::*, sst_meta_to_path, Config, Error, Resul use tikv_util::{ 
config::ReadableSize, future::{create_stream_with_buffer, paired_future_callback}, + sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; use txn_types::{Key, WriteRef, WriteType}; @@ -83,12 +84,12 @@ where let threads = ThreadPoolBuilder::new() .pool_size(cfg.num_threads) .name_prefix("sst-importer") - .after_start(move |_| { + .after_start_wrapper(move || { tikv_util::thread_group::set_properties(props.clone()); tikv_alloc::add_thread_memory_accessor(); set_io_type(IOType::Import); }) - .before_stop(move |_| tikv_alloc::remove_thread_memory_accessor()) + .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .create() .unwrap(); importer.start_switch_mode_check(&threads, engine.clone()); diff --git a/src/server/debug.rs b/src/server/debug.rs index f53f11eeec5..e5d6eba617f 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -35,7 +35,10 @@ use raftstore::{ }, }; use thiserror::Error; -use tikv_util::{config::ReadableSize, keybuilder::KeyBuilder, worker::Worker}; +use tikv_util::{ + config::ReadableSize, keybuilder::KeyBuilder, sys::thread::StdThreadBuildWrapper, + worker::Worker, +}; use txn_types::Key; pub use crate::storage::mvcc::MvccInfoIterator; @@ -441,7 +444,7 @@ impl Debugger { let props = tikv_util::thread_group::current_properties(); let thread = ThreadBuilder::new() .name(format!("mvcc-recover-thread-{}", thread_index)) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); info!( diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index 186a4694167..b009c80b728 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -22,7 +22,7 @@ use super::{ gc_worker::{sync_gc, GcSafePointProvider, GcTask}, Result, }; -use crate::server::metrics::*; +use crate::{server::metrics::*, tikv_util::sys::thread::StdThreadBuildWrapper}; const POLL_SAFE_POINT_INTERVAL_SECS: u64 = 10; @@ 
-279,7 +279,7 @@ impl GcMan let props = tikv_util::thread_group::current_properties(); let res: Result<_> = ThreadBuilder::new() .name(thd_name!("gc-manager")) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); self.run(); @@ -632,7 +632,10 @@ mod tests { coprocessor::{RegionInfo, Result as CopResult, SeekRegionCallback}, store::util::new_peer, }; - use tikv_util::worker::{Builder as WorkerBuilder, LazyWorker, Runnable}; + use tikv_util::{ + sys::thread::StdThreadBuildWrapper, + worker::{Builder as WorkerBuilder, LazyWorker, Runnable}, + }; use super::*; use crate::storage::Callback; @@ -821,7 +824,7 @@ mod tests { let (tx, rx) = channel(); ThreadBuilder::new() - .spawn(move || { + .spawn_wrapper(move || { let safe_point = gc_manager.wait_for_next_safe_point().unwrap(); tx.send(safe_point).unwrap(); }) diff --git a/src/server/load_statistics/linux.rs b/src/server/load_statistics/linux.rs index ff9d30a2997..f3a12593a51 100644 --- a/src/server/load_statistics/linux.rs +++ b/src/server/load_statistics/linux.rs @@ -115,6 +115,8 @@ fn calc_cpu_load(elapsed_millis: usize, start_usage: f64, end_usage: f64) -> usi mod tests { use std::{thread, time::Duration}; + use tikv_util::sys::thread::StdThreadBuildWrapper; + use super::*; #[test] @@ -124,7 +126,7 @@ mod tests { let l = loads.clone(); thread::Builder::new() .name(THREAD_NAME.to_string()) - .spawn(move || { + .spawn_wrapper(move || { let mut stats = ThreadLoadStatistics::new(2, THREAD_NAME, Arc::clone(&l)); let start = Instant::now(); loop { diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index dadb13f6692..7b99f48371d 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -11,6 +11,7 @@ use engine_traits::{ IterOptions, Iterable, Iterator, Mutable, SeekKey, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_WRITE, }; +use tikv_util::sys::thread::StdThreadBuildWrapper; use 
txn_types::{Key, TimeStamp, Write, WriteRef}; use super::Result; @@ -218,7 +219,7 @@ impl ResetToVersionManager { } *self.worker_handle.borrow_mut() = Some(std::thread::Builder::new() .name("reset_to_version".to_string()) - .spawn(move || { + .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); diff --git a/src/server/server.rs b/src/server/server.rs index 9a648c096c3..196a6584be7 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -44,6 +44,7 @@ use crate::{ read_pool::ReadPool, server::{gc_worker::GcWorker, Proxy}, storage::{lock_manager::LockManager, Engine, Storage}, + tikv_util::sys::thread::ThreadBuildWrapper, }; const LOAD_STATISTICS_SLOTS: usize = 4; @@ -109,6 +110,8 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En RuntimeBuilder::new_multi_thread() .thread_name(STATS_THREAD_PREFIX) .worker_threads(cfg.value().stats_concurrency) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ) @@ -526,6 +529,8 @@ mod tests { TokioBuilder::new_multi_thread() .thread_name(thd_name!("debugger")) .worker_threads(1) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) .build() .unwrap(), ); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 988e0624686..64ce2abb0e6 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -2197,6 +2197,7 @@ mod tests { use std::thread; use futures::{channel::oneshot, executor::block_on}; + use tikv_util::sys::thread::StdThreadBuildWrapper; use super::*; @@ -2207,7 +2208,7 @@ mod tests { thread::Builder::new() .name("source".to_owned()) - .spawn(move || { + .spawn_wrapper(move || { block_on(signal_rx).unwrap(); tx.send(100).unwrap(); }) @@ -2230,7 +2231,7 @@ mod tests { let (signal_tx, signal_rx) = oneshot::channel(); thread::Builder::new() .name("source".to_owned()) - .spawn(move || { + .spawn_wrapper(move || { tx.send(100).unwrap(); signal_tx.send(()).unwrap(); }) diff --git 
a/src/server/snap.rs b/src/server/snap.rs index d367fa65047..9b86b4778b4 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -43,6 +43,7 @@ use tikv_util::{ use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; use super::{metrics::*, Config, Error, Result}; +use crate::tikv_util::sys::thread::ThreadBuildWrapper; pub type Callback = Box) + Send>; @@ -354,8 +355,8 @@ where pool: RuntimeBuilder::new_multi_thread() .thread_name(thd_name!("snap-sender")) .worker_threads(DEFAULT_POOL_SIZE) - .on_thread_start(tikv_alloc::add_thread_memory_accessor) - .on_thread_stop(tikv_alloc::remove_thread_memory_accessor) + .after_start_wrapper(tikv_alloc::add_thread_memory_accessor) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) .build() .unwrap(), raft_router: r, diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 792d83f13de..1bb066d1a2c 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -64,6 +64,7 @@ use self::profile::{ use crate::{ config::{log_level_serde, ConfigController}, server::Result, + tikv_util::sys::thread::ThreadBuildWrapper, }; static TIMER_CANCELED: &str = "tokio timer canceled"; @@ -110,8 +111,8 @@ where .enable_all() .worker_threads(status_thread_pool_size) .thread_name("status-server") - .on_thread_start(|| debug!("Status server started")) - .on_thread_stop(|| debug!("stopping status server")) + .after_start_wrapper(|| debug!("Status server started")) + .before_stop_wrapper(|| debug!("stopping status server")) .build()?; let (tx, rx) = oneshot::channel::<()>(); diff --git a/src/storage/txn/flow_controller.rs b/src/storage/txn/flow_controller.rs index 378b4fd2aad..e29472594c6 100644 --- a/src/storage/txn/flow_controller.rs +++ b/src/storage/txn/flow_controller.rs @@ -20,7 +20,10 @@ use engine_rocks::FlowInfo; use engine_traits::{CFNamesExt, FlowControlFactorsExt}; use num_traits::cast::{AsPrimitive, FromPrimitive}; use rand::Rng; -use tikv_util::time::{Instant, 
Limiter}; +use tikv_util::{ + sys::thread::StdThreadBuildWrapper, + time::{Instant, Limiter}, +}; use crate::storage::{config::FlowControlConfig, metrics::*}; @@ -494,7 +497,7 @@ impl FlowChecker { fn start(self, rx: Receiver, flow_info_receiver: Receiver) -> JoinHandle<()> { Builder::new() .name(thd_name!("flow-checker")) - .spawn(move || { + .spawn_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); let mut checker = self; let mut deadline = std::time::Instant::now(); From 6bc24929670b0e53cdc365b37c0c69afe63f19e9 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Mon, 20 Jun 2022 16:30:37 +0800 Subject: [PATCH 032/676] raftstore: record metrics of proposal send wait duration (#12837) ref tikv/tikv#12362 Our raft implementation has its flow control mechanism to limit the inflight message number. But we're not able to know whether and when we are throttled. Then, it's hard for us to know whether we should adjust the max-inflight-msgs config. This commit tries to improve this case. It's complex to add hooks to raft-rs to know how long a message is throttled, but we can do it in the raftstore. We record the propose time, and consume it and record it in the histogram when the message is sent through the transport. If flow control takes effect, the ready will be smaller than all under-replicated logs. So, we can know if flow control takes effect. 
Signed-off-by: Yilin Chen --- .../raftstore/src/store/local_metrics.rs | 3 + components/raftstore/src/store/metrics.rs | 6 + components/raftstore/src/store/peer.rs | 29 +++ metrics/grafana/tikv_details.json | 184 +++++++++++++++++- 4 files changed, 220 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index d6e6dc265bc..aa23f22bc2c 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -415,6 +415,7 @@ pub struct RaftMetrics { pub wf_persist_log: LocalHistogram, pub wf_commit_log: LocalHistogram, pub wf_commit_not_persist_log: LocalHistogram, + pub proposal_send_wait: LocalHistogram, pub raft_log_gc_skipped: RaftLogGcSkippedMetrics, } @@ -439,6 +440,7 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), + proposal_send_wait: PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM.local(), raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(), } } @@ -461,6 +463,7 @@ impl RaftMetrics { self.wf_persist_log.flush(); self.wf_commit_log.flush(); self.wf_commit_not_persist_log.flush(); + self.proposal_send_wait.flush(); } let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 3a4426fcbcb..e3d3a23e389 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -360,6 +360,12 @@ lazy_static! 
{ "Bucketed histogram of proposals' commit but not persist duration", exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_proposal_send_wait_duration_seconds", + "Bucketed histogram of proposals' send wait duration", + exponential_buckets(1e-6, 2.0, 26).unwrap() + ).unwrap(); pub static ref PEER_PROPOSAL_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 73e1a6ecb50..ad63e3b1b34 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -725,6 +725,9 @@ where #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, + /// Record the propose instants to calculate the wait duration before + /// the proposal is sent through the Raft client. + pending_propose_instants: VecDeque<(u64, Instant)>, /// If it fails to send messages to leader. 
pub leader_unreachable: bool, @@ -925,6 +928,7 @@ where raft_max_inflight_msgs: cfg.raft_max_inflight_msgs, proposals: ProposalQueue::new(tag.clone()), pending_reads: Default::default(), + pending_propose_instants: Default::default(), peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), peers_start_pending_time: vec![], @@ -1571,6 +1575,7 @@ where ctx: &mut PollContext, msgs: Vec, ) { + let now = Instant::now(); for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgTimeoutNow && self.is_leader() { @@ -1596,6 +1601,26 @@ where "disk_usage" => ?msg.get_disk_usage(), ); + for index in msg + .get_message() + .get_entries() + .iter() + .map(|e| e.get_index()) + { + while let Some((propose_idx, instant)) = self.pending_propose_instants.front() { + if index == *propose_idx { + ctx.raft_metrics + .proposal_send_wait + .observe(now.saturating_duration_since(*instant).as_secs_f64()); + } + if index >= *propose_idx { + self.pending_propose_instants.pop_front(); + } else { + break; + } + } + } + if let Err(e) = ctx.trans.send(msg) { // We use metrics to observe failure on production. 
debug!( @@ -2048,6 +2073,7 @@ where self.mut_store().cancel_generating_snap(None); self.clear_disk_full_peers(ctx); self.clear_in_memory_pessimistic_locks(); + self.pending_propose_instants.clear(); } _ => {} } @@ -4270,6 +4296,9 @@ where } } + self.pending_propose_instants + .push_back((propose_index, Instant::now())); + Ok(Either::Left(propose_index)) } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 15dfa8c684b..46d72775cb6 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -13588,6 +13588,186 @@ "yBucketNumber": null, "yBucketSize": null }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 47 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572784, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", + "refId": "A", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Proposal send wait duration", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + 
}, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 47 + }, + "hiddenSeries": false, + "id": 23763572783, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "99% Proposal send wait duration per server", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:106", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:107", + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -13605,7 +13785,7 @@ "h": 7, "w": 12, "x": 0, - "y": 46 + "y": 54 }, "hiddenSeries": false, "id": 1975, @@ -13708,7 +13888,7 @@ "h": 7, "w": 12, "x": 12, - "y": 46 + "y": 54 }, "hiddenSeries": false, "id": 1976, From 90a1aa11e636b4a6735b155670f2e150f5dedfc4 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 20 Jun 2022 10:08:36 -0700 Subject: [PATCH 033/676] raftstore: skip flushing raft logs for uninitialized peer (#12847) close tikv/tikv#12825 Uninitialized peer has not received any logs, so it doesn't need to clean up any logs. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/peer.rs | 8 +++ .../raftstore/src/store/worker/raftlog_gc.rs | 1 + tests/failpoints/cases/test_split_region.rs | 70 +++++++++++++++++++ 3 files changed, 79 insertions(+) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index c61e3c3ba55..6abfc24c486 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3126,6 +3126,14 @@ where return Some(DelayReason::UnPersistedReady); } + let is_initialized = self.fsm.peer.is_initialized(); + if !is_initialized { + // If the peer is uninitialized, then it can't receive any logs from leader. So + // no need to gc. If there was a peer with same region id on the store, and it had + // logs written, then it must be initialized, hence its log should be gc either + // before it's destroyed or during node restarts. 
+ self.fsm.logs_gc_flushed = true; + } if !self.fsm.logs_gc_flushed { let start_index = self.fsm.peer.last_compacted_idx; let mut end_index = start_index; diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index bf892743300..71584a5e678 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -107,6 +107,7 @@ impl Runner { if self.tasks.is_empty() { return; } + fail::fail_point!("worker_gc_raft_log_flush"); // Sync wal of kv_db to make sure the data before apply_index has been persisted to disk. let start = Instant::now(); self.engines.kv.sync().unwrap_or_else(|e| { diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 68fed70ca25..8b42959fc01 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -984,3 +984,73 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { let resp = resp.join().unwrap(); assert!(resp.get_region_error().has_epoch_not_match(), "{:?}", resp); } + +/// Logs are gced asynchronously. If an uninitialized peer is destroyed before being replaced by +/// split, then the asynchronous log gc response may arrive after the peer is replaced, hence +/// it will lead to incorrect memory state. Actually, there is nothing to be gc for uninitialized +/// peer. The case is to guarantee such incorrect state will not happen. +#[test] +fn test_split_replace_skip_log_gc() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(15); + cluster.cfg.raft_store.raft_log_gc_threshold = 15; + cluster.cfg.raft_store.right_derive_when_split = true; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.store_batch_system.pool_size = 2; + let pd_client = cluster.pd_client.clone(); + + // Disable default max peer number check. 
+ pd_client.disable_default_operator(); + let r = cluster.run_conf_change(); + pd_client.must_add_peer(r, new_peer(3, 3)); + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + let before_check_snapshot_1_2_fp = "before_check_snapshot_1_2"; + fail::cfg(before_check_snapshot_1_2_fp, "pause").unwrap(); + + // So the split peer on store 2 always uninitialized. + let filter = RegionPacketFilter::new(1000, 2).msg_type(MessageType::MsgSnapshot); + cluster.add_send_filter(CloneFilterFactory(filter)); + + pd_client.must_add_peer(r, new_peer(2, 2)); + let region = pd_client.get_region(b"k1").unwrap(); + // [-∞, k2), [k2, +∞) + // b a + cluster.must_split(®ion, b"k2"); + + cluster.must_put(b"k3", b"v3"); + + // Because a is not initialized, so b must be created using heartbeat on store 3. + + // Simulate raft log gc stall. + let gc_fp = "worker_gc_raft_log_flush"; + let destroy_fp = "destroy_peer_after_pending_move"; + + fail::cfg(gc_fp, "pause").unwrap(); + let (tx, rx) = crossbeam::channel::bounded(0); + fail::cfg_callback(destroy_fp, move || { + let _ = tx.send(()); + let _ = tx.send(()); + }) + .unwrap(); + + let left = pd_client.get_region(b"k1").unwrap(); + let left_peer_on_store_2 = find_peer(&left, 2).unwrap(); + pd_client.must_remove_peer(left.get_id(), left_peer_on_store_2.clone()); + // Wait till destroy is triggered. + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + // Make it split. + fail::remove(before_check_snapshot_1_2_fp); + // Wait till split is finished. + must_get_equal(&cluster.get_engine(2), b"k3", b"v3"); + // Wait a little bit so the uninitialized peer is replaced. + thread::sleep(Duration::from_millis(10)); + // Resume destroy. + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + // Resume gc. + fail::remove(gc_fp); + // Check store 3 is still working correctly. 
+ cluster.must_put(b"k4", b"v4"); + must_get_equal(&cluster.get_engine(2), b"k4", b"v4"); +} From 53dc82927417ad5fdea10e0e5a24586a1bce61eb Mon Sep 17 00:00:00 2001 From: kevin-xianliu <105765349+kevin-xianliu@users.noreply.github.com> Date: Mon, 20 Jun 2022 15:16:36 -0700 Subject: [PATCH 034/676] grafana/dashboard: adjusted for better readability (#12792) close tikv/tikv#12007, ref tikv/tikv#12007 - add 99%, 95% and avg graph for aysnc snapshot & write in storage panel - adjust heatmap in coprocessor panel for better readability - change "Raft log speed" to "Raft propose speed" in Raft Propose panel Signed-off-by: kevin-xianliu Co-authored-by: Jay --- metrics/grafana/tikv_details.json | 255 ++++++++++++++++++++++++++++-- 1 file changed, 246 insertions(+), 9 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 46d72775cb6..b8204654185 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -13831,7 +13831,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Raft log speed", + "title": "Raft propose speed", "tooltip": { "shared": true, "sort": 0, @@ -17706,6 +17706,239 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null + }, + { + "type": "graph", + "title": "Storage async snapshot duration", + "gridPos": { + "x": 0, + "y": 35, + "w": 12, + "h": 8 + }, + "id": 20000, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", + "legendFormat": "99%", + "interval": "", + "exemplar": true, + "refId": "A", + "queryType": "randomWalk", + "intervalFactor": 2 + }, + { + "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", + "legendFormat": "95%", + "interval": "", 
+ "exemplar": true, + "refId": "B", + "hide": false, + "intervalFactor": 2 + }, + { + "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"snapshot\"}[1m]))", + "legendFormat": "avg", + "interval": "", + "exemplar": true, + "refId": "C", + "hide": false, + "intervalFactor": 2 + } + ], + "options": { + "alertThreshold": true + }, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "pluginVersion": "7.5.10", + "renderer": "flot", + "yaxes": [ + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "s", + "$$hashKey": "object:295" + }, + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "$$hashKey": "object:296" + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [], + "buckets": null + }, + "yaxis": { + "align": false, + "alignLevel": null + }, + "lines": true, + "fill": 1, + "linewidth": 1, + "dashLength": 10, + "spaceLength": 10, + "pointradius": 2, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "nullPointMode": "null", + "tooltip": { + "value_type": "individual", + "shared": true, + "sort": 0 + }, + "aliasColors": {}, + "seriesOverrides": [], + "thresholds": [], + "timeRegions": [], + "description": "The storage async snapshot duration", + "datasource": "${DS_TEST-CLUSTER}", + "fillGradient": 0, + "dashes": false, + "hiddenSeries": false, + "points": false, + "bars": false, + "stack": false, + "percentage": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null + }, + { + "type": "graph", + "title": "Storage async write duration", + "gridPos": { + "x": 12, + "y": 35, + "w": 12, + "h": 8 + }, + 
"id": 20001, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) by (le))", + "legendFormat": "99%", + "interval": "", + "exemplar": true, + "refId": "A", + "intervalFactor": 1 + }, + { + "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) by (le))", + "legendFormat": "95%", + "interval": "", + "exemplar": true, + "refId": "B", + "hide": false, + "intervalFactor": 1 + }, + { + "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"write\"}[1m]))", + "legendFormat": "avg", + "interval": "", + "exemplar": true, + "refId": "C", + "hide": false, + "intervalFactor": 1 + } + ], + "options": { + "alertThreshold": true + }, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "pluginVersion": "7.5.10", + "renderer": "flot", + "yaxes": [ + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "s", + "$$hashKey": "object:494" + }, + { + "label": null, + "show": true, + "logBase": 1, + "min": null, + "max": null, + "format": "short", + "$$hashKey": "object:495" + } + ], + "xaxis": { + "show": true, + "mode": "time", + "name": null, + "values": [], + "buckets": null + }, + "yaxis": { + "align": false, + "alignLevel": null + }, + "lines": true, + "fill": 2, + "linewidth": 1, + "dashLength": 10, + "spaceLength": 10, + "pointradius": 2, + "legend": { + "show": true, + "values": false, + "min": false, + "max": false, + "current": false, + "total": false, + "avg": false + }, + "nullPointMode": "null", 
+ "tooltip": { + "value_type": "individual", + "shared": true, + "sort": 0 + }, + "aliasColors": {}, + "seriesOverrides": [], + "thresholds": [], + "timeRegions": [], + "description": "The storage async write duration", + "fillGradient": 0, + "dashes": false, + "hiddenSeries": false, + "points": false, + "bars": false, + "stack": false, + "percentage": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null } ], "repeat": null, @@ -23092,9 +23325,9 @@ "color": { "cardColor": "#5195ce", "colorScale": "linear", - "colorScheme": "interpolateBlues", + "colorScheme": "interpolateSpectral", "exponent": 0.5, - "min": 0, + "min": null, "mode": "spectrum" }, "dataFormat": "tsbuckets", @@ -23107,7 +23340,7 @@ "y": 22 }, "heatmap": {}, - "hideZeroBuckets": false, + "hideZeroBuckets": true, "highlightCards": true, "id": 3062, "legend": { @@ -23117,14 +23350,13 @@ "max": true, "min": false, "rightSide": true, - "show": true, + "show": false, "sort": "current", "sortDesc": true, "total": false, "values": true }, "links": [], - "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", @@ -23138,15 +23370,13 @@ "title": "Request duration", "tooltip": { "show": true, - "showHistogram": true + "showHistogram": false }, "tooltipDecimals": 1, "type": "heatmap", "xAxis": { "show": true }, - "xBucketNumber": null, - "xBucketSize": null, "yAxis": { "decimals": 1, "format": "s", @@ -23157,6 +23387,13 @@ "splitFactor": null }, "yBucketBound": "upper", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "reverseYBuckets": false, + "xBucketNumber": null, + "xBucketSize": null, "yBucketNumber": null, "yBucketSize": null }, From 9f9333180fb7ab41095f9c579e3805d17bf431b4 Mon Sep 17 00:00:00 2001 From: Yujie Xia Date: Tue, 21 Jun 2022 15:34:37 +0800 Subject: [PATCH 035/676] bump master version to 6.2-alpha (#12858) close 
tikv/tikv#12859 Signed-off-by: Yujie Xia --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 82978c6cbf8..6a6cf62a6ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5955,7 +5955,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.1.0-alpha" +version = "6.2.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index e58963c694d..e0b8b195b0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.1.0-alpha" +version = "6.2.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 0ddac9965339edfcb71ad59373507b828f790b41 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 21 Jun 2022 00:58:37 -0700 Subject: [PATCH 036/676] *: check last sent snapshot for prepare merge (#12682) close tikv/tikv#12663 Guarantee min index of prepare merge larger than the index of last sent snapshot by recording an approximate last sent snapshot index. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/peer.rs | 16 +++- tests/failpoints/cases/test_merge.rs | 58 ------------- tests/integrations/raftstore/test_merge.rs | 96 ++++++++++++++++++++++ 3 files changed, 110 insertions(+), 60 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index ad63e3b1b34..eb1fc93e1ee 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -785,6 +785,8 @@ where last_urgent_proposal_idx: u64, /// The index of the latest committed split command. last_committed_split_idx: u64, + /// The index of last sent snapshot + last_sent_snapshot_idx: u64, /// Approximate size of logs that is applied but not compacted yet. 
pub raft_log_size_hint: u64, @@ -955,6 +957,7 @@ where last_compacted_idx: 0, last_urgent_proposal_idx: u64::MAX, last_committed_split_idx: 0, + last_sent_snapshot_idx: 0, consistency_state: ConsistencyState { last_check_time: Instant::now(), index: INVALID_INDEX, @@ -1578,6 +1581,12 @@ where let now = Instant::now(); for msg in msgs { let msg_type = msg.get_message().get_msg_type(); + if msg_type == MessageType::MsgSnapshot { + let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); + if snap_index > self.last_sent_snapshot_idx { + self.last_sent_snapshot_idx = snap_index; + } + } if msg_type == MessageType::MsgTimeoutNow && self.is_leader() { // After a leader transfer procedure is triggered, the lease for // the old leader may be expired earlier than usual, since a new leader @@ -2052,6 +2061,7 @@ where // prewrites or commits will be just a waste. self.last_urgent_proposal_idx = self.raft_group.raft.raft_log.last_index(); self.raft_group.skip_bcast_commit(false); + self.last_sent_snapshot_idx = self.raft_group.raft.raft_log.last_index(); // A more recent read may happen on the old leader. So max ts should // be updated after a peer becomes leader. 
@@ -3995,12 +4005,14 @@ where || min_committed == 0 || last_index - min_matched > ctx.cfg.merge_max_log_gap || last_index - min_committed > ctx.cfg.merge_max_log_gap * 2 + || min_matched < self.last_sent_snapshot_idx { return Err(box_err!( - "log gap from matched: {} or committed: {} to last index: {} is too large, skip merge", + "log gap too large, skip merge: matched: {}, committed: {}, last index: {}, last_snapshot: {}", min_matched, min_committed, - last_index + last_index, + self.last_sent_snapshot_idx )); } let mut entry_size = 0; diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index af3f9cca499..c341d801c9b 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1250,64 +1250,6 @@ fn test_prewrite_before_max_ts_is_synced() { assert!(!resp.get_region_error().has_max_timestamp_not_synced()); } -/// If term is changed in catching up logs, follower needs to update the term -/// correctly, otherwise will leave corrupted states. 
-#[test] -fn test_merge_election_and_restart() { - let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); - - let pd_client = Arc::clone(&cluster.pd_client); - pd_client.disable_default_operator(); - - let on_raft_gc_log_tick_fp = "on_raft_gc_log_tick"; - fail::cfg(on_raft_gc_log_tick_fp, "return()").unwrap(); - - cluster.run(); - - let region = pd_client.get_region(b"k1").unwrap(); - cluster.must_split(®ion, b"k2"); - - let r1 = pd_client.get_region(b"k1").unwrap(); - let r1_on_store1 = find_peer(&r1, 1).unwrap().to_owned(); - cluster.must_transfer_leader(r1.get_id(), r1_on_store1.clone()); - cluster.must_put(b"k11", b"v11"); - must_get_equal(&cluster.get_engine(2), b"k11", b"v11"); - - let r1_on_store2 = find_peer(&r1, 2).unwrap().to_owned(); - cluster.must_transfer_leader(r1.get_id(), r1_on_store2); - cluster.must_put(b"k12", b"v12"); - must_get_equal(&cluster.get_engine(1), b"k12", b"v12"); - - cluster.add_send_filter(CloneFilterFactory(RegionPacketFilter::new(r1.get_id(), 2))); - - // Wait new leader elected. - cluster.must_transfer_leader(r1.get_id(), r1_on_store1); - cluster.must_put(b"k13", b"v13"); - must_get_equal(&cluster.get_engine(1), b"k13", b"v13"); - must_get_none(&cluster.get_engine(2), b"k13"); - - // Don't actually execute commit merge - fail::cfg("after_handle_catch_up_logs_for_merge", "return()").unwrap(); - // Now region 1 can still be merged into region 2 because leader has committed index cache. - let r2 = pd_client.get_region(b"k3").unwrap(); - cluster.must_try_merge(r1.get_id(), r2.get_id()); - // r1 on store 2 should be able to apply all committed logs. - must_get_equal(&cluster.get_engine(2), b"k13", b"v13"); - - cluster.shutdown(); - cluster.clear_send_filters(); - fail::remove("after_handle_catch_up_logs_for_merge"); - cluster.start().unwrap(); - - // Wait for region elected to avoid timeout and backoff. - cluster.leader_of_region(r2.get_id()); - // If merge can be resumed correctly, the put should succeed. 
- cluster.must_put(b"k14", b"v14"); - // If logs from different term are process correctly, store 2 should have latest updates. - must_get_equal(&cluster.get_engine(2), b"k14", b"v14"); -} - /// Testing that the source peer's read delegate should not be removed by the target peer /// and only removed when the peer is destroyed #[test] diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 50a427b5ecd..df739d825bc 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1547,3 +1547,99 @@ fn test_stale_message_after_merge() { cluster.must_put(b"k4", b"v4"); must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } + +/// Check whether merge should be prevented if follower may not have enough logs. +#[test] +fn test_prepare_merge_with_reset_matched() { + let mut cluster = new_server_cluster(0, 3); + configure_for_merge(&mut cluster); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let r = cluster.run_conf_change(); + pd_client.must_add_peer(r, new_peer(2, 2)); + cluster.add_send_filter(IsolationFilterFactory::new(3)); + pd_client.add_peer(r, new_peer(3, 3)); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + thread::sleep(Duration::from_millis(10)); + // So leader will replicate next command but can't know whether follower (2, 2) + // also commits the command. Supposing the index is i0. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppendResponse) + .allow(1), + )); + cluster.must_put(b"k11", b"v11"); + cluster.clear_send_filters(); + cluster.add_send_filter(IsolationFilterFactory::new(2)); + // So peer (3, 3) only have logs after i0. 
+ must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); + // Clear match information. + let left_on_store3 = find_peer(&left, 3).unwrap().to_owned(); + cluster.must_transfer_leader(left.get_id(), left_on_store3); + let left_on_store1 = find_peer(&left, 1).unwrap().to_owned(); + cluster.must_transfer_leader(left.get_id(), left_on_store1); + let res = cluster.try_merge(left.get_id(), right.get_id()); + // Now leader still knows peer(2, 2) has committed i0 - 1, so the min_match will + // become i0 - 1. But i0 - 1 is not a safe index as peer(3, 3) starts from i0 + 1. + assert!(res.get_header().has_error(), "{:?}", res); + cluster.clear_send_filters(); + // Now leader should replicate more logs and figure out a safe index. + pd_client.must_merge(left.get_id(), right.get_id()); +} + +/// Check if prepare merge min index is chosen correctly even if all match indexes are +/// correct. +#[test] +fn test_prepare_merge_with_5_nodes_snapshot() { + let mut cluster = new_server_cluster(0, 5); + configure_for_merge(&mut cluster); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); + + let peer_on_store1 = find_peer(&left, 1).unwrap().clone(); + cluster.must_transfer_leader(left.get_id(), peer_on_store1); + must_get_equal(&cluster.get_engine(5), b"k1", b"v1"); + let peer_on_store5 = find_peer(&left, 5).unwrap().clone(); + pd_client.must_remove_peer(left.get_id(), peer_on_store5); + must_get_none(&cluster.get_engine(5), b"k1"); + cluster.add_send_filter(IsolationFilterFactory::new(5)); + pd_client.add_peer(left.get_id(), new_peer(5, 16)); + + // Make sure there will be no admin entries after min_matched. 
+ for (k, v) in &[(b"k11", b"v11"), (b"k12", b"v12")] { + cluster.must_put(*k, *v); + must_get_equal(&cluster.get_engine(4), *k, *v); + } + cluster.add_send_filter(IsolationFilterFactory::new(4)); + // So index of peer 4 becomes min_matched. + cluster.must_put(b"k13", b"v13"); + must_get_equal(&cluster.get_engine(1), b"k13", b"v13"); + + // Only remove send filter on store 5. + cluster.clear_send_filters(); + cluster.add_send_filter(IsolationFilterFactory::new(4)); + must_get_equal(&cluster.get_engine(5), b"k13", b"v13"); + let res = cluster.try_merge(left.get_id(), right.get_id()); + // min_matched from peer 4 is beyond the first index of peer 5, it should not be chosen + // for prepare merge. + assert!(res.get_header().has_error(), "{:?}", res); + cluster.clear_send_filters(); + // Now leader should replicate more logs and figure out a safe index. + pd_client.must_merge(left.get_id(), right.get_id()); +} From 4886024bc067437fec9eead7a56d3dbcbef59078 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 21 Jun 2022 16:38:37 +0800 Subject: [PATCH 037/676] tests: support running benches via custom test runner (#12821) close tikv/tikv#12820 Signed-off-by: tabokie Co-authored-by: zhangjinpeng1987 Co-authored-by: Ti Chi Robot --- components/test_util/src/runner.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/test_util/src/runner.rs b/components/test_util/src/runner.rs index e3d6cad5979..e7ef1ba0cb5 100644 --- a/components/test_util/src/runner.rs +++ b/components/test_util/src/runner.rs @@ -57,15 +57,15 @@ pub fn run_test_with_hook(cases: &[&TestDescAndFn], hook: impl TestHook + Send + .iter() .map(|case| { let name = case.desc.name.as_slice().to_owned(); - let h = hook.clone(); + let hook = hook.clone(); let f = match case.testfn { TestFn::StaticTestFn(f) => TestFn::DynTestFn(Box::new(move || { - let _watcher = CaseLifeWatcher::new(name, h); + let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); f(); })), - 
TestFn::StaticBenchFn(f) => TestFn::DynTestFn(Box::new(move || { - let _watcher = CaseLifeWatcher::new(name, h); - bench::run_once(move |b| f(b)); + TestFn::StaticBenchFn(f) => TestFn::DynBenchFn(Box::new(move |b| { + let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); + f(b); })), ref f => panic!("unexpected testfn {:?}", f), }; From 0a7d8601d81583a56149c7912e49016595968415 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 21 Jun 2022 15:36:36 -0700 Subject: [PATCH 038/676] Storage: Support regional flow controller (#12845) close tikv/tikv#12844 Make FlowController as trait and implement two versions for single rocksdb version and multi-rocksdb version Signed-off-by: qi.xu Co-authored-by: qi.xu --- Cargo.lock | 1 + Cargo.toml | 1 + components/engine_rocks/src/flow_listener.rs | 95 ++- components/engine_traits/src/engine.rs | 17 +- components/server/src/server.rs | 11 +- components/test_raftstore/src/server.rs | 9 +- src/config.rs | 11 +- src/server/engine_factory.rs | 25 +- src/server/engine_factory_v2.rs | 7 +- src/server/gc_worker/gc_worker.rs | 6 +- src/storage/mod.rs | 6 +- src/storage/txn/flow_controller/mod.rs | 76 ++ .../singleton_flow_controller.rs} | 681 ++++++++++++------ .../flow_controller/tablet_flow_controller.rs | 395 ++++++++++ src/storage/txn/scheduler.rs | 42 +- tests/failpoints/cases/test_storage.rs | 9 +- 16 files changed, 1095 insertions(+), 297 deletions(-) create mode 100644 src/storage/txn/flow_controller/mod.rs rename src/storage/txn/{flow_controller.rs => flow_controller/singleton_flow_controller.rs} (71%) create mode 100644 src/storage/txn/flow_controller/tablet_flow_controller.rs diff --git a/Cargo.lock b/Cargo.lock index 6a6cf62a6ec..cedc1229d0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5989,6 +5989,7 @@ dependencies = [ "futures-executor", "futures-timer", "futures-util", + "getset", "grpcio", "grpcio-health", "hex 0.4.2", diff --git a/Cargo.toml b/Cargo.toml index e0b8b195b0c..da68c7aa75c 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -94,6 +94,7 @@ futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } +getset = "0.1" grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" diff --git a/components/engine_rocks/src/flow_listener.rs b/components/engine_rocks/src/flow_listener.rs index 5d36c2b66e9..f36b5393f7a 100644 --- a/components/engine_rocks/src/flow_listener.rs +++ b/components/engine_rocks/src/flow_listener.rs @@ -5,26 +5,57 @@ use std::sync::{mpsc::Sender, Arc, Mutex}; use collections::hash_set_with_capacity; use rocksdb::{CompactionJobInfo, EventListener, FlushJobInfo, IngestionInfo}; +#[derive(Clone)] pub enum FlowInfo { - L0(String, u64), - L0Intra(String, u64), - Flush(String, u64), - Compaction(String), - BeforeUnsafeDestroyRange, - AfterUnsafeDestroyRange, + L0(String, u64, u64, u64), + L0Intra(String, u64, u64, u64), + Flush(String, u64, u64, u64), + Compaction(String, u64, u64), + BeforeUnsafeDestroyRange(u64), + AfterUnsafeDestroyRange(u64), + Created(u64, u64), + Destroyed(u64, u64), } #[derive(Clone)] pub struct FlowListener { flow_info_sender: Arc>>, + region_id: u64, + suffix_id: u64, } impl FlowListener { pub fn new(flow_info_sender: Sender) -> Self { Self { flow_info_sender: Arc::new(Mutex::new(flow_info_sender)), + region_id: 0, + suffix_id: 0, + } + } + + pub fn clone_with(&self, region_id: u64, suffix_id: u64) -> Self { + Self { + flow_info_sender: self.flow_info_sender.clone(), + region_id, + suffix_id, } } + + pub fn on_created(&self) { + let _ = self + .flow_info_sender + .lock() + .unwrap() + .send(FlowInfo::Created(self.region_id, self.suffix_id)); + } + + pub fn on_destroyed(&self) { + let _ = self + .flow_info_sender + .lock() + .unwrap() + 
.send(FlowInfo::Destroyed(self.region_id, self.suffix_id)); + } } impl EventListener for FlowListener { @@ -32,11 +63,12 @@ impl EventListener for FlowListener { let mut total = 0; let p = info.table_properties(); total += p.data_size() + p.index_size() + p.filter_size(); - let _ = self - .flow_info_sender - .lock() - .unwrap() - .send(FlowInfo::Flush(info.cf_name().to_owned(), total)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::Flush( + info.cf_name().to_owned(), + total, + self.region_id, + self.suffix_id, + )); } fn on_external_file_ingested(&self, info: &IngestionInfo) { @@ -45,18 +77,23 @@ impl EventListener for FlowListener { let mut total = 0; let p = info.table_properties(); total += p.data_size() + p.index_size() + p.filter_size(); - let _ = self - .flow_info_sender - .lock() - .unwrap() - .send(FlowInfo::Flush(info.cf_name().to_owned(), total)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::Flush( + info.cf_name().to_owned(), + total, + self.region_id, + self.suffix_id, + )); } else { // ingestion may change the pending bytes. 
let _ = self .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Compaction(info.cf_name().to_owned())); + .send(FlowInfo::Compaction( + info.cf_name().to_owned(), + self.region_id, + self.suffix_id, + )); } } @@ -97,7 +134,12 @@ impl EventListener for FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::L0Intra(info.cf_name().to_owned(), diff)); + .send(FlowInfo::L0Intra( + info.cf_name().to_owned(), + diff, + self.region_id, + self.suffix_id, + )); } else { let l0_input_file_at_input_level = info.input_file_count() - info.num_input_files_at_output_level(); @@ -116,11 +158,12 @@ impl EventListener for FlowListener { } } - let _ = self - .flow_info_sender - .lock() - .unwrap() - .send(FlowInfo::L0(info.cf_name().to_owned(), read_bytes)); + let _ = self.flow_info_sender.lock().unwrap().send(FlowInfo::L0( + info.cf_name().to_owned(), + read_bytes, + self.region_id, + self.suffix_id, + )); } } @@ -128,6 +171,10 @@ impl EventListener for FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Compaction(info.cf_name().to_owned())); + .send(FlowInfo::Compaction( + info.cf_name().to_owned(), + self.region_id, + self.suffix_id, + )); } } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index e97a15c75ae..a2aa5e5d908 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -141,7 +141,7 @@ pub trait TabletFactory { pub struct DummyFactory where - EK: KvEngine, + EK: Clone + Send + 'static, { pub engine: Option, pub root_path: String, @@ -149,7 +149,7 @@ where impl TabletFactory for DummyFactory where - EK: KvEngine, + EK: Clone + Send + 'static, { fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { Ok(self.engine.as_ref().unwrap().clone()) @@ -190,18 +190,15 @@ where impl DummyFactory where - EK: KvEngine, + EK: Clone + Send + 'static, { - pub fn new() -> DummyFactory { - DummyFactory { - engine: None, - root_path: "/dummy_root".to_string(), - } + 
pub fn new(engine: Option, root_path: String) -> DummyFactory { + DummyFactory { engine, root_path } } } -impl Default for DummyFactory { +impl Default for DummyFactory { fn default() -> Self { - Self::new() + Self::new(None, "/tmp".to_string()) } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 6bf1de8e7a7..11f6071dbc6 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -97,8 +97,11 @@ use tikv::{ GRPC_THREAD_PREFIX, }, storage::{ - self, config_manager::StorageConfigManger, mvcc::MvccConsistencyCheckObserver, - txn::flow_controller::FlowController, Engine, + self, + config_manager::StorageConfigManger, + mvcc::MvccConsistencyCheckObserver, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, }, }; use tikv_util::{ @@ -558,11 +561,11 @@ impl TiKvServer { } fn init_servers(&mut self) -> Arc> { - let flow_controller = Arc::new(FlowController::new( + let flow_controller = Arc::new(FlowController::Singleton(EngineFlowController::new( &self.config.storage.flow_control, self.engines.as_ref().unwrap().engine.kv_engine(), self.flow_info_receiver.take().unwrap(), - )); + ))); let gc_worker = self.init_gc_worker(); let mut ttl_checker = Box::new(LazyWorker::new("ttl-checker")); let ttl_scheduler = ttl_checker.scheduler(); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 981843ddfc6..88e0b079a4d 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -61,7 +61,12 @@ use tikv::{ ConnectionBuilder, Error, Node, PdStoreAddrResolver, RaftClient, RaftKv, Result as ServerResult, Server, ServerTransport, }, - storage::{self, kv::SnapContext, txn::flow_controller::FlowController, Engine}, + storage::{ + self, + kv::SnapContext, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, + }, }; use tikv_util::{ config::VersionTrack, @@ -385,7 +390,7 @@ impl ServerCluster { 
lock_mgr.clone(), concurrency_manager.clone(), lock_mgr.get_storage_dynamic_configs(), - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), pd_sender, res_tag_factory.clone(), quota_limiter.clone(), diff --git a/src/config.rs b/src/config.rs index d37e0892082..3ff087f129c 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3829,8 +3829,10 @@ mod tests { use crate::{ server::{config::ServerConfigManager, ttl::TtlCheckerTask}, storage::{ - config_manager::StorageConfigManger, lock_manager::DummyLockManager, - txn::flow_controller::FlowController, Storage, TestStorageBuilder, + config_manager::StorageConfigManger, + lock_manager::DummyLockManager, + txn::flow_controller::{EngineFlowController, FlowController}, + Storage, TestStorageBuilder, }, }; @@ -4186,6 +4188,7 @@ mod tests { assert_eq!(res.get("raftstore.store-pool-size"), Some(&"17".to_owned())); } + #[allow(clippy::type_complexity)] fn new_engines( cfg: TiKvConfig, ) -> ( @@ -4215,11 +4218,11 @@ mod tests { .unwrap(); let engine = storage.get_engine().get_rocksdb(); let (_tx, rx) = std::sync::mpsc::channel(); - let flow_controller = Arc::new(FlowController::new( + let flow_controller = Arc::new(FlowController::Singleton(EngineFlowController::new( &cfg.storage.flow_control, engine.clone(), rx, - )); + ))); let (shared, cfg_controller) = (cfg.storage.block_cache.shared, ConfigController::new(cfg)); cfg_controller.register( diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 0c02cde0aef..5212a211e69 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -128,7 +128,12 @@ impl KvEngineFactory { )) } - pub fn create_tablet(&self, tablet_path: &Path) -> Result { + pub fn create_tablet( + &self, + tablet_path: &Path, + region_id: u64, + suffix: u64, + ) -> Result { // Create kv engine. 
let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); kv_db_opts.set_env(self.inner.env.clone()); @@ -140,7 +145,7 @@ impl KvEngineFactory { kv_db_opts.add_event_listener(filter); } if let Some(listener) = &self.inner.flow_listener { - kv_db_opts.add_event_listener(listener.clone()); + kv_db_opts.add_event_listener(listener.clone_with(region_id, suffix)); } let kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( &self.inner.block_cache, @@ -165,6 +170,13 @@ impl KvEngineFactory { Ok(kv_engine) } + pub fn on_tablet_created(&self, region_id: u64, suffix: u64) { + if let Some(listener) = &self.inner.flow_listener { + let listener = listener.clone_with(region_id, suffix); + listener.on_created(); + } + } + pub fn destroy_tablet(&self, tablet_path: &Path) -> engine_traits::Result<()> { info!("destroy tablet"; "path" => %tablet_path.display()); // Create kv engine. @@ -189,6 +201,13 @@ impl KvEngineFactory { Ok(()) } + pub fn on_tablet_destroy(&self, region_id: u64, suffix: u64) { + if let Some(listener) = &self.inner.flow_listener { + let listener = listener.clone_with(region_id, suffix); + listener.on_destroyed(); + } + } + pub fn store_path(&self) -> PathBuf { self.inner.store_path.clone() } @@ -203,7 +222,7 @@ impl TabletFactory for KvEngineFactory { #[inline] fn create_shared_db(&self) -> Result { let root_path = self.kv_engine_path(); - let tablet = self.create_tablet(&root_path)?; + let tablet = self.create_tablet(&root_path, 0, 0)?; let mut root_db = self.inner.root_db.lock().unwrap(); root_db.replace(tablet.clone()); Ok(tablet) diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 2dca2ff14f3..676272334ac 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -30,9 +30,10 @@ impl TabletFactory for KvEngineFactoryV2 { )); } let tablet_path = self.tablet_path(id, suffix); - let kv_engine = self.inner.create_tablet(&tablet_path)?; + let kv_engine = self.inner.create_tablet(&tablet_path, id, 
suffix)?; debug!("inserting tablet"; "key" => ?(id, suffix)); reg.insert((id, suffix), kv_engine.clone()); + self.inner.on_tablet_created(id, suffix); Ok(kv_engine) } @@ -123,7 +124,9 @@ impl TabletFactory for KvEngineFactoryV2 { fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { let path = self.tablet_path(id, suffix); self.registry.lock().unwrap().remove(&(id, suffix)); - self.inner.destroy_tablet(&path) + self.inner.destroy_tablet(&path)?; + self.inner.on_tablet_destroy(id, suffix); + Ok(()) } #[inline] diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index ca59416d495..7242a984d0d 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -615,7 +615,7 @@ where Ok(()) } - fn unsafe_destroy_range(&self, _: &Context, start_key: &Key, end_key: &Key) -> Result<()> { + fn unsafe_destroy_range(&self, ctx: &Context, start_key: &Key, end_key: &Key) -> Result<()> { info!( "unsafe destroy range started"; "start_key" => %start_key, "end_key" => %end_key @@ -624,7 +624,7 @@ where fail_point!("unsafe_destroy_range"); self.flow_info_sender - .send(FlowInfo::BeforeUnsafeDestroyRange) + .send(FlowInfo::BeforeUnsafeDestroyRange(ctx.region_id)) .unwrap(); let local_storage = self.engine.kv_engine(); @@ -691,7 +691,7 @@ where "start_key" => %start_key, "end_key" => %end_key, "cost_time" => ?cleanup_all_start_time.saturating_elapsed(), ); self.flow_info_sender - .send(FlowInfo::AfterUnsafeDestroyRange) + .send(FlowInfo::AfterUnsafeDestroyRange(ctx.region_id)) .unwrap(); self.raft_store_router diff --git a/src/storage/mod.rs b/src/storage/mod.rs index f12f918b8aa..7026ebab77d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -108,7 +108,7 @@ use crate::{ mvcc::{MvccReader, PointGetterBuilder}, txn::{ commands::{RawAtomicStore, RawCompareAndSwap, TypedCommand}, - flow_controller::FlowController, + flow_controller::{EngineFlowController, FlowController}, scheduler::Scheduler as 
TxnScheduler, Command, }, @@ -2811,7 +2811,7 @@ impl TestStorageBuilder { pipelined_pessimistic_lock: self.pipelined_pessimistic_lock, in_memory_pessimistic_lock: self.in_memory_pessimistic_lock, }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, self.resource_tag_factory, Arc::new(QuotaLimiter::default()), @@ -2839,7 +2839,7 @@ impl TestStorageBuilder { pipelined_pessimistic_lock: self.pipelined_pessimistic_lock, in_memory_pessimistic_lock: self.in_memory_pessimistic_lock, }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), diff --git a/src/storage/txn/flow_controller/mod.rs b/src/storage/txn/flow_controller/mod.rs new file mode 100644 index 00000000000..f109b9896a3 --- /dev/null +++ b/src/storage/txn/flow_controller/mod.rs @@ -0,0 +1,76 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +pub mod singleton_flow_controller; +pub mod tablet_flow_controller; + +use std::time::Duration; + +pub use singleton_flow_controller::EngineFlowController; +pub use tablet_flow_controller::TabletFlowController; + +pub enum FlowController { + Singleton(EngineFlowController), + Tablet(TabletFlowController), +} + +macro_rules! 
flow_controller_fn { + ($fn_name: ident, $region_id: ident, $type: ident) => { + pub fn $fn_name(&self, $region_id: u64) -> $type { + match self { + FlowController::Singleton(ref controller) => controller.$fn_name($region_id), + FlowController::Tablet(ref controller) => controller.$fn_name($region_id), + } + } + }; + ($fn_name: ident, $region_id: ident, $bytes: ident, $type: ident) => { + pub fn $fn_name(&self, $region_id: u64, $bytes: usize) -> $type { + match self { + FlowController::Singleton(ref controller) => { + controller.$fn_name($region_id, $bytes) + } + FlowController::Tablet(ref controller) => controller.$fn_name($region_id, $bytes), + } + } + }; +} + +impl FlowController { + flow_controller_fn!(should_drop, region_id, bool); + #[cfg(test)] + flow_controller_fn!(discard_ratio, region_id, f64); + flow_controller_fn!(consume, region_id, bytes, Duration); + #[cfg(test)] + flow_controller_fn!(total_bytes_consumed, region_id, usize); + flow_controller_fn!(is_unlimited, region_id, bool); + + pub fn unconsume(&self, region_id: u64, bytes: usize) { + match self { + FlowController::Singleton(ref controller) => controller.unconsume(region_id, bytes), + FlowController::Tablet(ref controller) => controller.unconsume(region_id, bytes), + } + } + pub fn enable(&self, enable: bool) { + match self { + FlowController::Singleton(ref controller) => controller.enable(enable), + FlowController::Tablet(ref controller) => controller.enable(enable), + } + } + + pub fn enabled(&self) -> bool { + match self { + FlowController::Singleton(ref controller) => controller.enabled(), + FlowController::Tablet(ref controller) => controller.enabled(), + } + } + + #[cfg(test)] + pub fn set_speed_limit(&self, region_id: u64, speed_limit: f64) { + match self { + FlowController::Singleton(ref controller) => { + controller.set_speed_limit(region_id, speed_limit) + } + FlowController::Tablet(ref controller) => { + controller.set_speed_limit(region_id, speed_limit) + } + } + } +} diff --git 
a/src/storage/txn/flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs similarity index 71% rename from src/storage/txn/flow_controller.rs rename to src/storage/txn/flow_controller/singleton_flow_controller.rs index e29472594c6..76671412abc 100644 --- a/src/storage/txn/flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -18,6 +18,7 @@ use std::{ use collections::HashMap; use engine_rocks::FlowInfo; use engine_traits::{CFNamesExt, FlowControlFactorsExt}; +use getset::{CopyGetters, Setters}; use num_traits::cast::{AsPrimitive, FromPrimitive}; use rand::Rng; use tikv_util::{ @@ -27,9 +28,9 @@ use tikv_util::{ use crate::storage::{config::FlowControlConfig, metrics::*}; -const TICK_DURATION: Duration = Duration::from_millis(1000); +pub(super) const TICK_DURATION: Duration = Duration::from_millis(1000); -const RATIO_SCALE_FACTOR: u32 = 10_000_000; +pub(super) const RATIO_SCALE_FACTOR: u32 = 10_000_000; const K_INC_SLOWDOWN_RATIO: f64 = 0.8; const K_DEC_SLOWDOWN_RATIO: f64 = 1.0 / K_INC_SLOWDOWN_RATIO; const MIN_THROTTLE_SPEED: f64 = 16.0 * 1024.0; // 16KB @@ -69,7 +70,7 @@ enum Trend { /// /// Here is a brief flow showing where the mechanism works: /// grpc -> check should drop(discardable ratio) -> limiter -> async write to raftstore -pub struct FlowController { +pub struct EngineFlowController { discard_ratio: Arc, limiter: Arc, enabled: Arc, @@ -77,13 +78,13 @@ pub struct FlowController { handle: Option>, } -enum Msg { +pub(super) enum Msg { Close, Enable, Disable, } -impl Drop for FlowController { +impl Drop for EngineFlowController { fn drop(&mut self) { let h = self.handle.take(); if h.is_none() { @@ -101,7 +102,7 @@ impl Drop for FlowController { } } -impl FlowController { +impl EngineFlowController { // only for test pub fn empty() -> Self { Self { @@ -142,28 +143,30 @@ impl FlowController { handle: Some(checker.start(rx, flow_info_receiver)), } } +} - pub fn should_drop(&self) -> bool { +impl 
EngineFlowController { + pub fn should_drop(&self, _region_id: u64) -> bool { let ratio = self.discard_ratio.load(Ordering::Relaxed); let mut rng = rand::thread_rng(); rng.gen_ratio(ratio, RATIO_SCALE_FACTOR) } #[cfg(test)] - pub fn discard_ratio(&self) -> f64 { + pub fn discard_ratio(&self, _region_id: u64) -> f64 { self.discard_ratio.load(Ordering::Relaxed) as f64 / RATIO_SCALE_FACTOR as f64 } - pub fn consume(&self, bytes: usize) -> Duration { + pub fn consume(&self, _region_id: u64, bytes: usize) -> Duration { self.limiter.consume_duration(bytes) } - pub fn unconsume(&self, bytes: usize) { + pub fn unconsume(&self, _region_id: u64, bytes: usize) { self.limiter.unconsume(bytes); } #[cfg(test)] - pub fn total_bytes_consumed(&self) -> usize { + pub fn total_bytes_consumed(&self, _region_id: u64) -> usize { self.limiter.total_bytes_consumed() } @@ -183,11 +186,11 @@ impl FlowController { } #[cfg(test)] - pub fn set_speed_limit(&self, speed_limit: f64) { + pub fn set_speed_limit(&self, _region_id: u64, speed_limit: f64) { self.limiter.set_speed_limit(speed_limit); } - pub fn is_unlimited(&self) -> bool { + pub fn is_unlimited(&self, _region_id: u64) -> bool { self.limiter.speed_limit() == f64::INFINITY } } @@ -365,7 +368,7 @@ where } } -// CFFlowChecker records some statistics and states related to one CF. +// CfFlowChecker records some statistics and states related to one CF. // These statistics fall into five categories: // * memtable // * L0 files @@ -373,7 +376,7 @@ where // * L0 consumption flow (compaction read flow of L0) // * pending compaction bytes // And all of them are collected from the hook of RocksDB's event listener. 
-struct CFFlowChecker { +struct CfFlowChecker { // Memtable related last_num_memtables: Smoother, memtable_debt: f64, @@ -416,7 +419,7 @@ struct CFFlowChecker { on_start_pending_bytes: bool, } -impl Default for CFFlowChecker { +impl Default for CfFlowChecker { fn default() -> Self { Self { last_num_memtables: Smoother::default(), @@ -438,14 +441,15 @@ impl Default for CFFlowChecker { } } -struct FlowChecker { - soft_pending_compaction_bytes_limit: u64, +#[derive(CopyGetters, Setters)] +pub(super) struct FlowChecker { + pub soft_pending_compaction_bytes_limit: u64, hard_pending_compaction_bytes_limit: u64, memtables_threshold: u64, l0_files_threshold: u64, - // CFFlowChecker for each CF. - cf_checkers: HashMap, + // CfFlowChecker for each CF. + cf_checkers: HashMap, // Record which CF is taking control of throttling, the throttle speed is // decided based on the statistics of the throttle CF. If the multiple CFs // exceed the threshold, choose the larger one. @@ -454,6 +458,7 @@ struct FlowChecker { // drop write requests(return ServerIsBusy to TiDB) randomly. discard_ratio: Arc, + #[getset(set = "pub")] engine: E, limiter: Arc, // Records the foreground write flow at scheduler level of last few seconds. 
@@ -462,6 +467,9 @@ struct FlowChecker { last_record_time: Instant, last_speed: f64, wait_for_destroy_range_finish: bool, + + #[getset(get_copy = "pub", set = "pub")] + tablet_suffix: u64, } impl FlowChecker { @@ -470,11 +478,21 @@ impl FlowChecker { engine: E, discard_ratio: Arc, limiter: Arc, + ) -> Self { + Self::new_with_tablet_suffix(config, engine, discard_ratio, limiter, 0) + } + + pub fn new_with_tablet_suffix( + config: &FlowControlConfig, + engine: E, + discard_ratio: Arc, + limiter: Arc, + tablet_suffix: u64, ) -> Self { let cf_checkers = engine .cf_names() .into_iter() - .map(|cf| (cf.to_owned(), CFFlowChecker::default())) + .map(|cf| (cf.to_owned(), CfFlowChecker::default())) .collect(); Self { @@ -491,6 +509,88 @@ impl FlowChecker { last_record_time: Instant::now_coarse(), last_speed: 0.0, wait_for_destroy_range_finish: false, + tablet_suffix, + } + } + + pub fn on_flow_info_msg( + &mut self, + enabled: bool, + flow_info: Result, + ) { + match flow_info { + Ok(FlowInfo::L0(cf, l0_bytes, ..)) => { + self.collect_l0_consumption_stats(&cf, l0_bytes); + if enabled { + self.on_l0_change(cf) + } + } + Ok(FlowInfo::L0Intra(cf, diff_bytes, ..)) => { + if diff_bytes > 0 { + // Intra L0 merges some deletion records, so regard it as a L0 compaction. 
+ self.collect_l0_consumption_stats(&cf, diff_bytes); + if enabled { + self.on_l0_change(cf); + } + } + } + Ok(FlowInfo::Flush(cf, flush_bytes, ..)) => { + self.collect_l0_production_stats(&cf, flush_bytes); + if enabled { + self.on_memtable_change(&cf); + self.on_l0_change(cf) + } + } + Ok(FlowInfo::Compaction(cf, ..)) => { + if enabled { + self.on_pending_compaction_bytes_change(cf); + } + } + Ok(FlowInfo::BeforeUnsafeDestroyRange(..)) => { + if !enabled { + return; + } + self.wait_for_destroy_range_finish = true; + let soft = (self.soft_pending_compaction_bytes_limit as f64).log2(); + for cf_checker in self.cf_checkers.values_mut() { + let v = cf_checker.long_term_pending_bytes.get_avg(); + if v <= soft { + cf_checker.pending_bytes_before_unsafe_destroy_range = Some(v); + } + } + } + Ok(FlowInfo::AfterUnsafeDestroyRange(..)) => { + if !enabled { + return; + } + self.wait_for_destroy_range_finish = false; + for (cf, cf_checker) in &mut self.cf_checkers { + if let Some(before) = cf_checker.pending_bytes_before_unsafe_destroy_range { + let soft = (self.soft_pending_compaction_bytes_limit as f64).log2(); + let after = (self + .engine + .get_cf_pending_compaction_bytes(cf) + .unwrap_or(None) + .unwrap_or(0) as f64) + .log2(); + + assert!(before < soft); + if after >= soft { + // there is a pending bytes jump + SCHED_THROTTLE_ACTION_COUNTER + .with_label_values(&[cf, "pending_bytes_jump"]) + .inc(); + } else { + cf_checker.pending_bytes_before_unsafe_destroy_range = None; + } + } + } + } + Ok(FlowInfo::Created(..)) => {} + Ok(FlowInfo::Destroyed(..)) => {} + Err(e) => { + error!("failed to receive compaction info {:?}", e); + } } } @@ -515,85 +615,12 @@ impl FlowChecker { Err(_) => {} } - match flow_info_receiver.recv_deadline(deadline) { - Ok(FlowInfo::L0(cf, l0_bytes)) => { - checker.collect_l0_consumption_stats(&cf, l0_bytes); - if enabled { - checker.on_l0_change(cf) - } - } - Ok(FlowInfo::L0Intra(cf, diff_bytes)) => { - if diff_bytes > 0 { - // Intra L0 merges 
some deletion records, so regard it as a L0 compaction. - checker.collect_l0_consumption_stats(&cf, diff_bytes); - if enabled { - checker.on_l0_change(cf); - } - } - } - Ok(FlowInfo::Flush(cf, flush_bytes)) => { - checker.collect_l0_production_stats(&cf, flush_bytes); - if enabled { - checker.on_memtable_change(&cf); - checker.on_l0_change(cf) - } - } - Ok(FlowInfo::Compaction(cf)) => { - if enabled { - checker.on_pending_compaction_bytes_change(cf); - } - } - Ok(FlowInfo::BeforeUnsafeDestroyRange) => { - if !enabled { - continue; - } - checker.wait_for_destroy_range_finish = true; - let soft = (checker.soft_pending_compaction_bytes_limit as f64).log2(); - for cf_checker in checker.cf_checkers.values_mut() { - let v = cf_checker.long_term_pending_bytes.get_avg(); - if v <= soft { - cf_checker.pending_bytes_before_unsafe_destroy_range = Some(v); - } - } - } - Ok(FlowInfo::AfterUnsafeDestroyRange) => { - if !enabled { - continue; - } - checker.wait_for_destroy_range_finish = false; - for (cf, cf_checker) in &mut checker.cf_checkers { - if let Some(before) = - cf_checker.pending_bytes_before_unsafe_destroy_range - { - let soft = - (checker.soft_pending_compaction_bytes_limit as f64).log2(); - let after = (checker - .engine - .get_cf_pending_compaction_bytes(cf) - .unwrap_or(None) - .unwrap_or(0) - as f64) - .log2(); - - assert!(before < soft); - if after >= soft { - // there is a pending bytes jump - SCHED_THROTTLE_ACTION_COUNTER - .with_label_values(&[cf, "pending_bytes_jump"]) - .inc(); - } else { - cf_checker.pending_bytes_before_unsafe_destroy_range = None; - } - } - } - } - Err(RecvTimeoutError::Timeout) => { - checker.update_statistics(); - deadline = std::time::Instant::now() + TICK_DURATION; - } - Err(e) => { - error!("failed to receive compaction info {:?}", e); - } + let msg = flow_info_receiver.recv_deadline(deadline); + if let Err(RecvTimeoutError::Timeout) = msg { + checker.update_statistics(); + deadline = std::time::Instant::now() + TICK_DURATION; + } 
else { + checker.on_flow_info_msg(enabled, msg); } } tikv_alloc::remove_thread_memory_accessor(); @@ -601,7 +628,7 @@ impl FlowChecker { .unwrap() } - fn reset_statistics(&mut self) { + pub fn reset_statistics(&mut self) { SCHED_L0_TARGET_FLOW_GAUGE.set(0); for cf in self.cf_checkers.keys() { SCHED_THROTTLE_CF_GAUGE.with_label_values(&[cf]).set(0); @@ -621,7 +648,7 @@ impl FlowChecker { self.discard_ratio.store(0, Ordering::Relaxed); } - fn update_statistics(&mut self) { + pub fn update_statistics(&mut self) { if let Some(throttle_cf) = self.throttle_cf.as_ref() { SCHED_THROTTLE_CF_GAUGE .with_label_values(&[throttle_cf]) @@ -959,28 +986,28 @@ impl FlowChecker { } #[cfg(test)] -mod tests { +pub(super) mod tests { use std::sync::atomic::AtomicU64; use engine_traits::Result; - use super::*; + use super::{super::FlowController, *}; #[derive(Clone)] - struct EngineStub(Arc); + pub struct EngineStub(pub Arc); - struct EngineStubInner { + pub struct EngineStubInner { pub pending_compaction_bytes: AtomicU64, pub num_l0_files: AtomicU64, - pub num_memtable_files: AtomicU64, + pub num_memtables: AtomicU64, } impl EngineStub { - fn new() -> Self { + pub fn new() -> Self { Self(Arc::new(EngineStubInner { pending_compaction_bytes: AtomicU64::new(0), num_l0_files: AtomicU64::new(0), - num_memtable_files: AtomicU64::new(0), + num_memtables: AtomicU64::new(0), })) } } @@ -997,7 +1024,7 @@ mod tests { } fn get_cf_num_immutable_mem_table(&self, _cf: &str) -> Result> { - Ok(Some(self.0.num_memtable_files.load(Ordering::Relaxed))) + Ok(Some(self.0.num_memtables.load(Ordering::Relaxed))) } fn get_cf_pending_compaction_bytes(&self, _cf: &str) -> Result> { @@ -1007,18 +1034,13 @@ mod tests { } } - #[test] - fn test_flow_controller_basic() { - let stub = EngineStub::new(); - let (_tx, rx) = mpsc::channel(); - let flow_controller = FlowController::new(&FlowControlConfig::default(), stub, rx); - + pub fn test_flow_controller_basic_impl(flow_controller: &FlowController, region_id: u64) { // 
enable flow controller assert_eq!(flow_controller.enabled(), true); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), true); - assert_eq!(flow_controller.consume(0), Duration::ZERO); - assert_eq!(flow_controller.consume(1000), Duration::ZERO); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), true); + assert_eq!(flow_controller.consume(region_id, 0), Duration::ZERO); + assert_eq!(flow_controller.consume(region_id, 1000), Duration::ZERO); // disable flow controller flow_controller.enable(false); @@ -1026,73 +1048,156 @@ mod tests { // re-enable flow controller flow_controller.enable(true); assert_eq!(flow_controller.enabled(), true); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), true); - assert_eq!(flow_controller.consume(1), Duration::ZERO); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), true); + assert_eq!(flow_controller.consume(region_id, 1), Duration::ZERO); } #[test] - fn test_flow_controller_memtable() { + fn test_flow_controller_basic() { let stub = EngineStub::new(); - let (tx, rx) = mpsc::sync_channel(0); - let flow_controller = FlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let (_tx, rx) = mpsc::channel(); + let flow_controller = EngineFlowController::new(&FlowControlConfig::default(), stub, rx); + let flow_controller = FlowController::Singleton(flow_controller); + test_flow_controller_basic_impl(&flow_controller, 0); + } + + pub fn test_flow_controller_memtable_impl( + flow_controller: &FlowController, + stub: &EngineStub, + tx: &mpsc::SyncSender, + region_id: u64, + tablet_suffix: u64, + ) { + assert_eq!(flow_controller.consume(0, 2000), Duration::ZERO); + loop { + if flow_controller.total_bytes_consumed(0) == 0 { + break; + } + std::thread::sleep(TICK_DURATION); + } - 
assert_eq!(flow_controller.consume(2000), Duration::ZERO); + assert_eq!(flow_controller.consume(region_id, 2000), Duration::ZERO); loop { - if flow_controller.total_bytes_consumed() == 0 { + if flow_controller.total_bytes_consumed(region_id) == 0 { break; } std::thread::sleep(TICK_DURATION); } // exceeds the threshold on start - stub.0.num_memtable_files.store(8, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); + stub.0.num_memtables.store(8, Ordering::Relaxed); + tx.send(FlowInfo::Flush( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); // on start check forbids flow control - assert_eq!(flow_controller.is_unlimited(), true); + assert_eq!(flow_controller.is_unlimited(region_id), true); // once falls below the threshold, pass the on start check - stub.0.num_memtable_files.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); + stub.0.num_memtables.store(1, Ordering::Relaxed); + tx.send(FlowInfo::Flush( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); // not throttle when the average of the sliding window doesn't exceeds the threshold - stub.0.num_memtable_files.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), true); + stub.0.num_memtables.store(6, Ordering::Relaxed); + 
tx.send(FlowInfo::Flush( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), true); // the average of sliding window exceeds the threshold - stub.0.num_memtable_files.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), false); - assert_ne!(flow_controller.consume(2000), Duration::ZERO); + stub.0.num_memtables.store(6, Ordering::Relaxed); + tx.send(FlowInfo::Flush( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), false); + assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); // not throttle once the number of memtables falls below the threshold - stub.0.num_memtable_files.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), true); + stub.0.num_memtables.store(1, Ordering::Relaxed); + tx.send(FlowInfo::Flush( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), true); } - #[test] - fn test_flow_controller_l0() { + fn 
test_flow_controller_memtable() { let stub = EngineStub::new(); let (tx, rx) = mpsc::sync_channel(0); - let flow_controller = FlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = + EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = FlowController::Singleton(flow_controller); + test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, 0, 0); + } - assert_eq!(flow_controller.consume(2000), Duration::ZERO); + pub fn test_flow_controller_l0_impl( + flow_controller: &FlowController, + stub: &EngineStub, + tx: &mpsc::SyncSender, + region_id: u64, + tablet_suffix: u64, + ) { + assert_eq!(flow_controller.consume(region_id, 2000), Duration::ZERO); loop { - if flow_controller.total_bytes_consumed() == 0 { + if flow_controller.total_bytes_consumed(region_id) == 0 { break; } std::thread::sleep(TICK_DURATION); @@ -1100,115 +1205,251 @@ mod tests { // exceeds the threshold stub.0.num_l0_files.store(30, Ordering::Relaxed); - tx.send(FlowInfo::L0("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); + tx.send(FlowInfo::L0( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); // on start check forbids flow control - assert_eq!(flow_controller.is_unlimited(), true); + assert_eq!(flow_controller.is_unlimited(region_id), true); // once fall below the threshold, pass the on start check stub.0.num_l0_files.store(10, Ordering::Relaxed); - tx.send(FlowInfo::L0("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); + tx.send(FlowInfo::L0( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + 
"default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); // exceeds the threshold, throttle now stub.0.num_l0_files.store(30, Ordering::Relaxed); - tx.send(FlowInfo::L0("default".to_string(), 0)).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert_eq!(flow_controller.should_drop(), false); - assert_eq!(flow_controller.is_unlimited(), false); - assert_ne!(flow_controller.consume(2000), Duration::ZERO); + tx.send(FlowInfo::L0( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert_eq!(flow_controller.should_drop(region_id), false); + assert_eq!(flow_controller.is_unlimited(region_id), false); + assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); } #[test] - fn test_flow_controller_pending_compaction_bytes() { + fn test_flow_controller_l0() { let stub = EngineStub::new(); let (tx, rx) = mpsc::sync_channel(0); - let flow_controller = FlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = + EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = FlowController::Singleton(flow_controller); + test_flow_controller_l0_impl(&flow_controller, &stub, &tx, 0, 0); + } + pub fn test_flow_controller_pending_compaction_bytes_impl( + flow_controller: &FlowController, + stub: &EngineStub, + tx: &mpsc::SyncSender, + region_id: u64, + tablet_suffix: u64, + ) { // exceeds the threshold stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + 
)) + .unwrap(); // on start check forbids flow control - assert!(flow_controller.discard_ratio() < f64::EPSILON); + assert!( + flow_controller.discard_ratio(region_id) < f64::EPSILON, + "discard_ratio {}", + flow_controller.discard_ratio(region_id) + ); // once fall below the threshold, pass the on start check stub.0 .pending_compaction_bytes .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert!(flow_controller.discard_ratio() > f64::EPSILON); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert!(flow_controller.discard_ratio() < f64::EPSILON); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); // pending compaction bytes jump after unsafe destroy range - tx.send(FlowInfo::BeforeUnsafeDestroyRange).unwrap(); - 
tx.send(FlowInfo::L0Intra("default".to_string(), 0)) + tx.send(FlowInfo::BeforeUnsafeDestroyRange(region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id, 0)) .unwrap(); - assert!(flow_controller.discard_ratio() < f64::EPSILON); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); // during unsafe destroy range, pending compaction bytes may change stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert!(flow_controller.discard_ratio() < f64::EPSILON); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 .pending_compaction_bytes .store(10000000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::AfterUnsafeDestroyRange).unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::AfterUnsafeDestroyRange(region_id)) .unwrap(); - assert!(flow_controller.discard_ratio() < f64::EPSILON); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!( + flow_controller.discard_ratio(region_id) < f64::EPSILON, + "discard_ratio {}", + flow_controller.discard_ratio(region_id) + ); // unfreeze the control stub.0 .pending_compaction_bytes .store(1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert!(flow_controller.discard_ratio() < f64::EPSILON); + 
tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 .pending_compaction_bytes .store(1000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string())) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0)) - .unwrap(); - assert!(flow_controller.discard_ratio() > f64::EPSILON); + tx.send(FlowInfo::Compaction( + "default".to_string(), + region_id, + tablet_suffix, + )) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); + } + + #[test] + fn test_flow_controller_pending_compaction_bytes() { + let stub = EngineStub::new(); + let (tx, rx) = mpsc::sync_channel(0); + let flow_controller = + EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = FlowController::Singleton(flow_controller); + test_flow_controller_pending_compaction_bytes_impl(&flow_controller, &stub, &tx, 0, 0); } #[test] diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs new file mode 100644 index 00000000000..d177c203ba1 --- /dev/null +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -0,0 +1,395 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// #[PerformanceCriticalPath] +use std::{ + sync::{ + atomic::{AtomicBool, AtomicU32, Ordering}, + mpsc::{self, Receiver, RecvTimeoutError, SyncSender}, + Arc, RwLock, + }, + thread::{Builder, JoinHandle}, + time::Duration, +}; + +use collections::HashMap; +use engine_rocks::FlowInfo; +use engine_traits::{CFNamesExt, FlowControlFactorsExt, TabletFactory}; +use rand::Rng; +use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; + +use super::singleton_flow_controller::{FlowChecker, Msg, RATIO_SCALE_FACTOR, TICK_DURATION}; +use crate::storage::config::FlowControlConfig; + +type Limiters = Arc, Arc)>>>; +pub struct TabletFlowController { + enabled: Arc, + tx: Option>, + handle: Option>, + limiters: Limiters, +} + +impl Drop for TabletFlowController { + fn drop(&mut self) { + let h = self.handle.take(); + if h.is_none() { + return; + } + + if let Some(Err(e)) = self.tx.as_ref().map(|tx| tx.send(Msg::Close)) { + error!("send quit message for flow controller failed"; "err" => ?e); + return; + } + + if let Err(e) = h.unwrap().join() { + error!("join flow controller failed"; "err" => ?e); + } + } +} + +impl TabletFlowController { + pub fn new( + config: &FlowControlConfig, + tablet_factory: Arc + Send + Sync>, + flow_info_receiver: Receiver, + ) -> Self { + let (tx, rx) = mpsc::sync_channel(5); + tx.send(if config.enable { + Msg::Enable + } else { + Msg::Disable + }) + .unwrap(); + let flow_checkers: Arc>>> = + Arc::new(RwLock::new(HashMap::default())); + let limiters: Limiters = Arc::new(RwLock::new(HashMap::default())); + Self { + enabled: Arc::new(AtomicBool::new(config.enable)), + tx: Some(tx), + limiters: limiters.clone(), + handle: Some(FlowInfoDispatcher::start( + rx, + flow_info_receiver, + tablet_factory, + flow_checkers, + limiters, + config.clone(), + )), + } + } + + pub fn tablet_exist(&self, region_id: u64) -> bool { + let limiters = self.limiters.as_ref().read().unwrap(); + limiters.get(®ion_id).is_some() + } +} + +struct FlowInfoDispatcher; + 
+impl FlowInfoDispatcher { + fn start( + rx: Receiver, + flow_info_receiver: Receiver, + tablet_factory: Arc + Send + Sync>, + flow_checkers: Arc>>>, + limiters: Limiters, + config: FlowControlConfig, + ) -> JoinHandle<()> { + Builder::new() + .name(thd_name!("flow-checker")) + .spawn_wrapper(move || { + tikv_alloc::add_thread_memory_accessor(); + let mut deadline = std::time::Instant::now(); + let mut enabled = true; + loop { + match rx.try_recv() { + Ok(Msg::Close) => break, + Ok(Msg::Disable) => { + enabled = false; + let mut checkers = flow_checkers.as_ref().write().unwrap(); + for checker in (*checkers).values_mut() { + checker.reset_statistics(); + } + } + Ok(Msg::Enable) => { + enabled = true; + } + Err(_) => {} + } + + let insert_limiter_and_checker = |region_id, suffix| -> FlowChecker { + let engine = tablet_factory.open_tablet_cache(region_id, suffix).unwrap(); + let mut v = limiters.as_ref().write().unwrap(); + let discard_ratio = Arc::new(AtomicU32::new(0)); + let limiter = v.entry(region_id).or_insert(( + Arc::new( + ::builder(f64::INFINITY) + .refill(Duration::from_millis(1)) + .build(), + ), + discard_ratio, + )); + FlowChecker::new_with_tablet_suffix( + &config, + engine, + limiter.1.clone(), + limiter.0.clone(), + suffix, + ) + }; + let msg = flow_info_receiver.recv_deadline(deadline); + match msg.clone() { + Ok(FlowInfo::L0(_cf, _, region_id, suffix)) + | Ok(FlowInfo::L0Intra(_cf, _, region_id, suffix)) + | Ok(FlowInfo::Flush(_cf, _, region_id, suffix)) + | Ok(FlowInfo::Compaction(_cf, region_id, suffix)) => { + let mut checkers = flow_checkers.as_ref().write().unwrap(); + if let Some(checker) = checkers.get_mut(®ion_id) { + if checker.tablet_suffix() != suffix { + continue; + } + checker.on_flow_info_msg(enabled, msg); + } + } + Ok(FlowInfo::BeforeUnsafeDestroyRange(region_id)) + | Ok(FlowInfo::AfterUnsafeDestroyRange(region_id)) => { + let mut checkers = flow_checkers.as_ref().write().unwrap(); + if let Some(checker) = checkers.get_mut(®ion_id) 
{ + checker.on_flow_info_msg(enabled, msg); + } + } + Ok(FlowInfo::Created(region_id, suffix)) => { + let mut checkers = flow_checkers.as_ref().write().unwrap(); + let checker = checkers + .entry(region_id) + .or_insert_with(|| insert_limiter_and_checker(region_id, suffix)); + // check if the checker's engine is exactly (region_id, suffix) + // if checker.suffix < suffix, it means its tablet is old and needs the refresh + if checker.tablet_suffix() < suffix { + let engine = + tablet_factory.open_tablet_cache(region_id, suffix).unwrap(); + checker.set_engine(engine); + checker.set_tablet_suffix(suffix); + } + } + Ok(FlowInfo::Destroyed(region_id, suffix)) => { + let mut remove_limiter = false; + { + let mut checkers = flow_checkers.as_ref().write().unwrap(); + if let Some(checker) = checkers.get_mut(®ion_id) { + if checker.tablet_suffix() == suffix { + checkers.remove(®ion_id); + remove_limiter = true; + } + } + } + if remove_limiter { + limiters.as_ref().write().unwrap().remove(®ion_id); + } + } + Err(RecvTimeoutError::Timeout) => { + let mut checkers = flow_checkers.as_ref().write().unwrap(); + for checker in (*checkers).values_mut() { + checker.update_statistics(); + } + deadline = std::time::Instant::now() + TICK_DURATION; + } + Err(e) => { + error!("failed to receive compaction info {:?}", e); + } + } + } + tikv_alloc::remove_thread_memory_accessor(); + }) + .unwrap() + } +} + +impl TabletFlowController { + pub fn should_drop(&self, region_id: u64) -> bool { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + let ratio = limiter.1.load(Ordering::Relaxed); + let mut rng = rand::thread_rng(); + return rng.gen_ratio(ratio, RATIO_SCALE_FACTOR); + } + false + } + + #[cfg(test)] + pub fn discard_ratio(&self, region_id: u64) -> f64 { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + let ratio = limiter.1.load(Ordering::Relaxed); + return ratio as f64 / 
RATIO_SCALE_FACTOR as f64; + } + 0.0 + } + + pub fn consume(&self, region_id: u64, bytes: usize) -> Duration { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + return limiter.0.consume_duration(bytes); + } + Duration::ZERO + } + + pub fn unconsume(&self, region_id: u64, bytes: usize) { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + limiter.0.unconsume(bytes); + } + } + + #[cfg(test)] + pub fn total_bytes_consumed(&self, region_id: u64) -> usize { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + return limiter.0.total_bytes_consumed(); + } + 0 + } + + pub fn enable(&self, enable: bool) { + self.enabled.store(enable, Ordering::Relaxed); + if let Some(tx) = &self.tx { + if enable { + tx.send(Msg::Enable).unwrap(); + } else { + tx.send(Msg::Disable).unwrap(); + } + } + } + + pub fn enabled(&self) -> bool { + self.enabled.load(Ordering::Relaxed) + } + + #[cfg(test)] + pub fn set_speed_limit(&self, region_id: u64, speed_limit: f64) { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + limiter.0.set_speed_limit(speed_limit); + } + } + + pub fn is_unlimited(&self, region_id: u64) -> bool { + let limiters = self.limiters.as_ref().read().unwrap(); + if let Some(limiter) = limiters.get(®ion_id) { + return limiter.0.speed_limit() == f64::INFINITY; + } + true + } +} + +#[cfg(test)] +mod tests { + use engine_rocks::FlowInfo; + use engine_traits::DummyFactory; + + use super::{ + super::{singleton_flow_controller::tests::*, FlowController}, + *, + }; + + fn create_tablet_flow_controller() -> (FlowController, mpsc::SyncSender, EngineStub) { + let (tx, rx) = mpsc::sync_channel(0); + let root_path = "/tmp"; + let stub = EngineStub::new(); + let factory = DummyFactory::::new(Some(stub.clone()), root_path.to_string()); + let tablet_factory = 
Arc::new(factory); + ( + FlowController::Tablet(TabletFlowController::new( + &FlowControlConfig::default(), + tablet_factory, + rx, + )), + tx, + stub, + ) + } + + #[test] + fn test_tablet_flow_controller_basic() { + let (flow_controller, tx, _) = create_tablet_flow_controller(); + let region_id = 5_u64; + let tablet_suffix = 5_u64; + tx.send(FlowInfo::Created(region_id, tablet_suffix)) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + test_flow_controller_basic_impl(&flow_controller, region_id); + tx.send(FlowInfo::Destroyed(region_id, tablet_suffix)) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + //assert!(!flow_controller.tablet_exist(region_id)); + } + + #[test] + fn test_tablet_flow_controller_memtable() { + let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let region_id = 5_u64; + let tablet_suffix = 5_u64; + tx.send(FlowInfo::Created(region_id, tablet_suffix)) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, region_id, tablet_suffix); + } + + #[test] + fn test_tablet_flow_controller_l0() { + let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let region_id = 5_u64; + let tablet_suffix = 5_u64; + tx.send(FlowInfo::Created(region_id, tablet_suffix)) + .unwrap(); + tx.send(FlowInfo::L0Intra( + "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + test_flow_controller_l0_impl(&flow_controller, &stub, &tx, region_id, tablet_suffix); + } + + #[test] + fn test_tablet_flow_controller_pending_compaction_bytes() { + let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let region_id = 5_u64; + let tablet_suffix = 5_u64; + tx.send(FlowInfo::Created(region_id, tablet_suffix)) + .unwrap(); + tx.send(FlowInfo::L0Intra( 
+ "default".to_string(), + 0, + region_id, + tablet_suffix, + )) + .unwrap(); + + test_flow_controller_pending_compaction_bytes_impl( + &flow_controller, + &stub, + &tx, + region_id, + tablet_suffix, + ); + } +} diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index f0e1529fab7..ab866fe18bf 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -270,10 +270,10 @@ impl SchedulerInner { self.get_task_slot(cid).get_mut(&cid).unwrap().pr = Some(pr); } - fn too_busy(&self) -> bool { + fn too_busy(&self, region_id: u64) -> bool { fail_point!("txn_scheduler_busy", |_| true); self.running_write_bytes.load(Ordering::Acquire) >= self.sched_pending_write_threshold - || self.flow_controller.should_drop() + || self.flow_controller.should_drop(region_id) } /// Tries to acquire all the required latches for a command when waken up by @@ -394,7 +394,7 @@ impl Scheduler { pub(in crate::storage) fn run_cmd(&self, cmd: Command, callback: StorageCallback) { // write flow control - if cmd.need_flow_control() && self.inner.too_busy() { + if cmd.need_flow_control() && self.inner.too_busy(cmd.ctx().region_id) { SCHED_TOO_BUSY_COUNTER_VEC.get(cmd.tag()).inc(); callback.execute(ProcessResult::Failed { err: StorageError::from(StorageErrorInner::SchedTooBusy), @@ -845,6 +845,7 @@ impl Scheduler { // message when it finishes. 
Ok(res) => res, }; + let region_id = ctx.get_region_id(); SCHED_STAGE_COUNTER_VEC.get(tag).write.inc(); if let Some(lock_info) = lock_info { @@ -957,9 +958,9 @@ impl Scheduler { }; if self.inner.flow_controller.enabled() { - if self.inner.flow_controller.is_unlimited() { + if self.inner.flow_controller.is_unlimited(region_id) { // no need to delay if unthrottled, just call consume to record write flow - let _ = self.inner.flow_controller.consume(write_size); + let _ = self.inner.flow_controller.consume(region_id, write_size); } else { let start = Instant::now_coarse(); // Control mutex is used to ensure there is only one request consuming the quota. @@ -968,16 +969,16 @@ impl Scheduler { // without the mutex, the write flow can't throttled strictly. let control_mutex = self.inner.control_mutex.clone(); let _guard = control_mutex.lock().await; - let delay = self.inner.flow_controller.consume(write_size); + let delay = self.inner.flow_controller.consume(region_id, write_size); let delay_end = Instant::now_coarse() + delay; - while !self.inner.flow_controller.is_unlimited() { + while !self.inner.flow_controller.is_unlimited(region_id) { let now = Instant::now_coarse(); if now >= delay_end { break; } if now >= deadline.inner() { scheduler.finish_with_err(cid, StorageErrorInner::DeadlineExceeded); - self.inner.flow_controller.unconsume(write_size); + self.inner.flow_controller.unconsume(region_id, write_size); SCHED_THROTTLE_TIME.observe(start.saturating_elapsed_secs()); return; } @@ -1072,7 +1073,7 @@ impl Scheduler { // Only consume the quota when write succeeds, otherwise failed write requests may exhaust // the quota and other write requests would be in long delay. 
if sched.inner.flow_controller.enabled() { - sched.inner.flow_controller.unconsume(write_size); + sched.inner.flow_controller.unconsume(region_id, write_size); } } }) @@ -1189,7 +1190,12 @@ mod tests { lock_manager::DummyLockManager, mvcc::{self, Mutation}, test_util::latest_feature_gate, - txn::{commands, commands::TypedCommand, latch::*}, + txn::{ + commands, + commands::TypedCommand, + flow_controller::{EngineFlowController, FlowController}, + latch::*, + }, TestEngineBuilder, TxnStatus, }; @@ -1336,7 +1342,7 @@ mod tests { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1394,7 +1400,7 @@ mod tests { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1452,7 +1458,7 @@ mod tests { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1469,7 +1475,7 @@ mod tests { let (cb, f) = paired_future_callback(); scheduler.inner.flow_controller.enable(true); - scheduler.inner.flow_controller.set_speed_limit(1.0); + scheduler.inner.flow_controller.set_speed_limit(0, 1.0); scheduler.run_cmd(cmd.cmd, StorageCallback::TxnStatus(cb)); // The task waits for 200ms until it locks the control_mutex, but the execution // time limit is 100ms. 
Before the mutex is locked, it should return @@ -1480,13 +1486,13 @@ mod tests { Err(StorageError(box StorageErrorInner::DeadlineExceeded)) )); // should unconsume if the request fails - assert_eq!(scheduler.inner.flow_controller.total_bytes_consumed(), 0); + assert_eq!(scheduler.inner.flow_controller.total_bytes_consumed(0), 0); // A new request should not be blocked without flow control. scheduler .inner .flow_controller - .set_speed_limit(f64::INFINITY); + .set_speed_limit(0, f64::INFINITY); let mut req = CheckTxnStatusRequest::default(); req.mut_context().max_execution_duration_ms = 100; req.set_primary_key(b"a".to_vec()); @@ -1518,7 +1524,7 @@ mod tests { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1578,7 +1584,7 @@ mod tests { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(false)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, - Arc::new(FlowController::empty()), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 0b43e11c468..7d0bb8c0b74 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -33,8 +33,9 @@ use tikv::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, test_util::*, txn::{ - commands, flow_controller::FlowController, Error as TxnError, - ErrorInner as TxnErrorInner, + commands, + flow_controller::{EngineFlowController, FlowController}, + Error as TxnError, ErrorInner as TxnErrorInner, }, Error as StorageError, ErrorInner as StorageErrorInner, *, }, @@ -253,11 +254,11 @@ fn 
test_scale_scheduler_pool() { let cfg = new_tikv_config(1); let kv_engine = storage.get_engine().kv_engine(); let (_tx, rx) = std::sync::mpsc::channel(); - let flow_controller = Arc::new(FlowController::new( + let flow_controller = Arc::new(FlowController::Singleton(EngineFlowController::new( &cfg.storage.flow_control, kv_engine.clone(), rx, - )); + ))); let cfg_controller = ConfigController::new(cfg.clone()); let (scheduler, _receiver) = dummy_scheduler(); From abc4e2750e8f83a39ae4f829dccd68a192618669 Mon Sep 17 00:00:00 2001 From: Shenghui Wu <793703860@qq.com> Date: Wed, 22 Jun 2022 13:22:36 +0800 Subject: [PATCH 039/676] copr: Support paging for executors (#12841) ref tikv/tikv#12848 Support paging for Aggregate/Limit/TopN executors. Signed-off-by: wshwsh12 <793703860@qq.com> Co-authored-by: Yilin Chen Co-authored-by: Ti Chi Robot --- .../tidb_query_datatype/src/expr/ctx.rs | 3 + .../src/fast_hash_aggr_executor.rs | 11 + components/tidb_query_executors/src/runner.rs | 4 +- .../src/slow_hash_aggr_executor.rs | 11 + .../src/stream_aggr_executor.rs | 11 + .../src/top_n_executor.rs | 194 ++++++++++++++++- .../src/util/aggr_executor.rs | 203 +++++++++++++++++- 7 files changed, 434 insertions(+), 3 deletions(-) diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index 748b47e4fe7..f92c561b013 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -70,6 +70,8 @@ pub struct EvalConfig { // warning is a executor stuff instead of a evaluation stuff. 
pub max_warning_cnt: usize, pub sql_mode: SqlMode, + + pub paging_size: Option, } impl Default for EvalConfig { @@ -105,6 +107,7 @@ impl EvalConfig { flag: Flag::empty(), max_warning_cnt: DEFAULT_MAX_WARNING_CNT, sql_mode: SqlMode::empty(), + paging_size: None, } } diff --git a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs index 038ce448eef..c5859e48338 100644 --- a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs @@ -126,6 +126,17 @@ impl BatchFastHashAggregationExecutor { .unwrap() } + #[cfg(test)] + pub fn new_for_test_with_config( + config: Arc, + src: Src, + group_by_exp: RpnExpression, + aggr_defs: Vec, + aggr_def_parser: impl AggrDefinitionParser, + ) -> Self { + Self::new_impl(config, src, group_by_exp, aggr_defs, aggr_def_parser).unwrap() + } + pub fn new( config: Arc, src: Src, diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 2b918186e3d..9e118f676b9 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -378,7 +378,9 @@ impl BatchExecutorsRunner { ) -> Result { let executors_len = req.get_executors().len(); let collect_exec_summary = req.get_collect_execution_summaries(); - let config = Arc::new(EvalConfig::from_request(&req)?); + let mut config = EvalConfig::from_request(&req)?; + config.paging_size = paging_size; + let config = Arc::new(config); let out_most_executor = build_executors( req.take_executors().into(), diff --git a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs index 5960caa9478..bd1e5cf8a80 100644 --- a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs @@ -104,6 +104,17 @@ impl 
BatchSlowHashAggregationExecutor { .unwrap() } + #[cfg(test)] + pub fn new_for_test_with_config( + config: Arc, + src: Src, + group_by_exps: Vec, + aggr_defs: Vec, + aggr_def_parser: impl AggrDefinitionParser, + ) -> Self { + Self::new_impl(config, src, group_by_exps, aggr_defs, aggr_def_parser).unwrap() + } + pub fn new( config: Arc, src: Src, diff --git a/components/tidb_query_executors/src/stream_aggr_executor.rs b/components/tidb_query_executors/src/stream_aggr_executor.rs index 705a2d8972a..52f55751442 100644 --- a/components/tidb_query_executors/src/stream_aggr_executor.rs +++ b/components/tidb_query_executors/src/stream_aggr_executor.rs @@ -123,6 +123,17 @@ impl BatchStreamAggregationExecutor { .unwrap() } + #[cfg(test)] + pub fn new_for_test_with_config( + config: Arc, + src: Src, + group_by_exps: Vec, + aggr_defs: Vec, + aggr_def_parser: impl AggrDefinitionParser, + ) -> Self { + Self::new_impl(config, src, group_by_exps, aggr_defs, aggr_def_parser).unwrap() + } + pub fn new( config: Arc, src: Src, diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 20adbbad12c..112a3f3c33b 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -106,6 +106,35 @@ impl BatchTopNExecutor { } } + #[cfg(test)] + pub fn new_for_test_with_config( + config: Arc, + src: Src, + order_exprs: Vec, + order_is_desc: Vec, + n: usize, + ) -> Self { + assert_eq!(order_exprs.len(), order_is_desc.len()); + + let order_exprs_field_type: Vec = order_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + Self { + heap: BinaryHeap::new(), + eval_columns_buffer_unsafe: Box::new(Vec::new()), + order_exprs: order_exprs.into_boxed_slice(), + order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), + order_is_desc: order_is_desc.into_boxed_slice(), + n, + + context: EvalContext::new(config), + src, + 
is_ended: false, + } + } + pub fn new( config: std::sync::Arc, src: Src, @@ -296,7 +325,7 @@ impl BatchExecutor for BatchTopNExecutor { } #[inline] - fn next_batch(&mut self, _scan_rows: usize) -> BatchExecuteResult { + fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { assert!(!self.is_ended); if self.n == 0 { @@ -309,6 +338,12 @@ impl BatchExecutor for BatchTopNExecutor { }; } + if let Some(paging_size) = self.context.cfg.paging_size { + if self.n > paging_size as usize { + return self.src.next_batch(scan_rows); + } + } + let result = self.handle_next_batch(); match result { @@ -1307,4 +1342,161 @@ mod tests { ], ); } + + #[test] + fn test_top_paging() { + // Top N = 5 and PagingSize = 6, same with no-paging. + let test_top5_paging6 = |col_index: usize, is_desc: bool, expected: &[Option]| { + let mut config = EvalConfig::default(); + config.paging_size = Some(6); + let config = Arc::new(config); + let src_exec = make_src_executor_unsigned(); + let mut exec = BatchTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(col_index) + .build_for_test(), + ], + vec![is_desc], + 5, + ); + + let r = exec.next_batch(1); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = exec.next_batch(1); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = exec.next_batch(1); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[col_index].decoded().to_int_vec(), + expected + ); + assert!(r.is_drained.unwrap()); + }; + + test_top5_paging6( + 0, + false, + &[ + None, + Some(300_u64 as i64), + Some(2000_u64 as i64), + Some(9_223_372_036_854_775_807_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), 
+ ], + ); + + test_top5_paging6( + 0, + true, + &[ + Some(18_446_744_073_709_551_615_u64 as i64), + Some(18_446_744_073_709_551_613_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + Some(9_223_372_036_854_775_807_u64 as i64), + Some(2000_u64 as i64), + ], + ); + + test_top5_paging6( + 1, + false, + &[ + None, + Some(-9_223_372_036_854_775_808), + Some(-3), + Some(-1), + Some(300), + ], + ); + + test_top5_paging6( + 1, + true, + &[ + Some(9_223_372_036_854_775_807), + Some(2000), + Some(300), + Some(-1), + Some(-3), + ], + ); + + test_top5_paging6( + 2, + false, + &[ + None, + Some(300_u32 as i64), + Some(2000_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2_147_483_648_u32 as i64), + ], + ); + + test_top5_paging6( + 2, + true, + &[ + Some(4_294_967_295_u32 as i64), + Some(4_294_967_295_u32 as i64), + Some(2_147_483_648_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2000_u32 as i64), + ], + ); + + // Top N = 5 and PagingSize = 4, return all data and do nothing. + let test_top5_paging4 = |build_src_executor: fn() -> MockExecutor| { + let mut config = EvalConfig::default(); + config.paging_size = Some(4); + let config = Arc::new(config); + let src_exec = build_src_executor(); + let mut exec = BatchTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + vec![false], + 5, + ); + let mut exec2 = build_src_executor(); + + loop { + let r1 = exec.next_batch(1); + let r2 = exec2.next_batch(1); + assert_eq!(r1.logical_rows, r2.logical_rows); + assert_eq!( + r1.physical_columns.rows_len(), + r2.physical_columns.rows_len() + ); + assert_eq!( + r1.physical_columns.columns_len(), + r2.physical_columns.columns_len() + ); + let r1_is_drained = r1.is_drained.unwrap(); + assert_eq!(r1_is_drained, r2.is_drained.unwrap()); + if r1_is_drained { + break; + } + } + }; + + test_top5_paging4(make_src_executor_unsigned); + 
test_top5_paging4(make_src_executor); + test_top5_paging4(make_bytes_src_executor); + } } diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs b/components/tidb_query_executors/src/util/aggr_executor.rs index 96c67e1b4d8..74a9429b390 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -121,6 +121,7 @@ pub struct AggregationExecutor, + required_row: Option, } impl> AggregationExecutor { @@ -185,6 +186,7 @@ impl> AggregationExecutor> AggregationExecutor> AggregationExecutor= required_row as usize { + src_is_drained = true + } + // StreamAgg will return groups_len - 1 rows immediately + if !src_is_drained && self.imp.is_partial_results_ready() { + self.required_row = Some(required_row + 1 - self.imp.groups_len() as u64) + } + } + // aggregate result is always available when source is drained let result = if src_is_drained || self.imp.is_partial_results_ready() { Some(self.aggregate_partial_results(src_is_drained)?) 
@@ -468,4 +480,193 @@ pub mod tests { ], ) } + + /// Builds an executor that will return these logical data: + /// + /// == Schema == + /// Col0(Real) Col1(Real) + /// == Call #1 == + /// NULL 1.0 + /// 7.0 2.0 + /// NULL NULL + /// NULL 4.5 + /// == Call #2 == + /// == Call #3 == + /// 1.5 4.5 + /// 6.0 6.0 + /// == Call #4 == + /// 6.0 6.0 + /// 7.0 7.0 + /// (drained) + pub fn make_src_executor_2() -> MockExecutor { + MockExecutor::new( + vec![FieldTypeTp::Double.into(), FieldTypeTp::Double.into()], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real( + vec![None, None, None, Real::new(-5.0).ok(), Real::new(7.0).ok()] + .into(), + ), + VectorValue::Real( + vec![ + None, + Real::new(4.5).ok(), + Real::new(1.0).ok(), + None, + Real::new(2.0).ok(), + ] + .into(), + ), + ]), + logical_rows: vec![2, 4, 0, 1], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real(vec![None].into()), + VectorValue::Real(vec![Real::new(-10.0).ok()].into()), + ]), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real( + vec![ + Real::new(5.5).ok(), + Real::new(1.5).ok(), + Real::new(6.0).ok(), + ] + .into(), + ), + VectorValue::Real( + vec![None, Real::new(4.5).ok(), Real::new(6.0).ok()].into(), + ), + ]), + logical_rows: vec![1, 2], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real(vec![Real::new(7.0).ok(), Real::new(6.0).ok()].into()), + VectorValue::Real(vec![Real::new(7.0).ok(), Real::new(6.0).ok()].into()), + ]), + logical_rows: vec![1, 0], + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }, + ], + ) + } + + #[test] + #[allow(clippy::type_complexity)] + fn test_agg_paging() 
{ + use std::sync::Arc; + + use tidb_query_datatype::expr::EvalConfig; + use tidb_query_expr::RpnExpressionBuilder; + use tipb::ExprType; + use tipb_helper::ExprDefBuilder; + + use crate::{ + BatchFastHashAggregationExecutor, BatchSlowHashAggregationExecutor, + BatchStreamAggregationExecutor, + }; + + let group_by_exp = || { + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test() + }; + + let aggr_definitions = vec![ + ExprDefBuilder::aggr_func(ExprType::Count, FieldTypeTp::LongLong) + .push_child(ExprDefBuilder::constant_int(1)) + .build(), + ]; + + let exec_fast = |src_exec, paging_size| { + let mut config = EvalConfig::default(); + config.paging_size = paging_size; + let config = Arc::new(config); + Box::new(BatchFastHashAggregationExecutor::new_for_test_with_config( + config, + src_exec, + group_by_exp(), + aggr_definitions.clone(), + AllAggrDefinitionParser, + )) as Box> + }; + + let exec_slow = |src_exec, paging_size| { + let mut config = EvalConfig::default(); + config.paging_size = paging_size; + let config = Arc::new(config); + Box::new(BatchSlowHashAggregationExecutor::new_for_test_with_config( + config, + src_exec, + vec![group_by_exp()], + aggr_definitions.clone(), + AllAggrDefinitionParser, + )) as Box> + }; + + let test_paging_size = vec![2, 5, 7]; + let expect_call_num = vec![1, 3, 4]; + let expect_row_num = vec![vec![4], vec![0, 0, 5], vec![0, 0, 0, 6]]; + let executor_builders: Vec) -> _>> = + vec![Box::new(exec_fast), Box::new(exec_slow)]; + for test_case in 0..test_paging_size.len() { + let paging_size = test_paging_size[test_case]; + let call_num = expect_call_num[test_case]; + let row_num = &expect_row_num[test_case]; + for exec_builder in &executor_builders { + let src_exec = make_src_executor_2(); + let mut exec = exec_builder(src_exec, Some(paging_size)); + for nth_call in 0..call_num { + let r = exec.next_batch(1); + if nth_call == call_num - 1 { + assert!(r.is_drained.unwrap()); + } else { + 
assert!(!r.is_drained.unwrap()); + } + assert_eq!(r.physical_columns.rows_len(), row_num[nth_call]); + } + } + } + + let expect_row_num2 = vec![vec![4], vec![3, 0, 2], vec![3, 0, 1, 2]]; + let exec_stream = |src_exec, paging_size| { + let mut config = EvalConfig::default(); + config.paging_size = paging_size; + let config = Arc::new(config); + Box::new(BatchStreamAggregationExecutor::new_for_test_with_config( + config, + src_exec, + vec![group_by_exp()], + aggr_definitions.clone(), + AllAggrDefinitionParser, + )) as Box> + }; + for test_case in 0..test_paging_size.len() { + let paging_size = test_paging_size[test_case]; + let call_num = expect_call_num[test_case]; + let row_num = &expect_row_num2[test_case]; + let mut exec = exec_stream(make_src_executor_2(), Some(paging_size)); + for nth_call in 0..call_num { + let r = exec.next_batch(1); + if nth_call == call_num - 1 { + assert!(r.is_drained.unwrap()); + } else { + assert!(!r.is_drained.unwrap()); + } + assert_eq!(r.physical_columns.rows_len(), row_num[nth_call]); + } + } + } } From 931c9fb2f14c0624583421a56740179fc4f158ab Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 22 Jun 2022 14:22:37 +0800 Subject: [PATCH 040/676] raftstore: Implement coprocessor observer on_empty_cmd (#12851) ref tikv/tikv#12849 Support new observers on_empty_cmd. 
Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 20 +++++++++++++++++++ components/raftstore/src/coprocessor/mod.rs | 3 +++ components/raftstore/src/store/fsm/apply.rs | 5 ++++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 8c8b857a47b..3f51dd918c6 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -342,6 +342,16 @@ impl CoprocessorHost { CoprocessorHost { registry, cfg } } + pub fn on_empty_cmd(&self, region: &Region, index: u64, term: u64) { + loop_ob!( + region, + &self.registry.query_observers, + on_empty_cmd, + index, + term, + ); + } + /// Call all propose hooks until bypass is set to true. pub fn pre_propose(&self, region: &Region, req: &mut RaftCmdRequest) -> Result<()> { if !req.has_admin_request() { @@ -623,6 +633,11 @@ mod tests { self.called.fetch_add(6, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn on_empty_cmd(&self, ctx: &mut ObserverContext<'_>, _index: u64, _term: u64) { + self.called.fetch_add(14, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } } impl RoleObserver for TestCoprocessor { @@ -748,6 +763,11 @@ mod tests { host.on_flush_applied_cmd_batch(cb.level, vec![cb], &PanicEngine); // `post_apply` + `on_flush_applied_cmd_batch` => 13 + 6 = 19 assert_all!([&ob.called], &[74]); + + let mut empty_req = RaftCmdRequest::default(); + empty_req.set_requests(vec![Request::default()].into()); + host.on_empty_cmd(®ion, 0, 0); + assert_all!([&ob.called], &[88]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index a9772d948ed..39b412ce950 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -89,6 +89,9 @@ pub trait AdminObserver: 
Coprocessor { } pub trait QueryObserver: Coprocessor { + /// Hook when observe applying empty cmd, probably caused by leadership change. + fn on_empty_cmd(&self, _: &mut ObserverContext<'_>, _index: u64, _term: u64) {} + /// Hook to call before proposing write request. /// /// We don't propose read request, hence there is no hook for it yet. diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e3c1172ef5b..e28c8cf2424 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1077,7 +1077,10 @@ where return self.process_raft_cmd(apply_ctx, index, term, cmd); } - // TOOD(cdc): should we observe empty cmd, aka leader change? + + // we should observe empty cmd, aka leader change, + // read index during confchange, or other situations. + apply_ctx.host.on_empty_cmd(&self.region, index, term); self.apply_state.set_applied_index(index); self.applied_index_term = term; From 595ae3fcc27183ce78e38211a2bab172c0d21a88 Mon Sep 17 00:00:00 2001 From: BornChanger <97348524+BornChanger@users.noreply.github.com> Date: Thu, 23 Jun 2022 20:20:37 +0800 Subject: [PATCH 041/676] *: support tune quota limiter for auto analyze at execution time (#12679) close tikv/tikv#12503 Signed-off-by: BornChanger --- components/server/src/server.rs | 99 +++++- components/test_raftstore/src/server.rs | 4 + components/tidb_query_executors/src/runner.rs | 2 +- components/tikv_util/src/metrics/mod.rs | 3 + components/tikv_util/src/quota_limiter.rs | 321 ++++++++++++++---- components/tikv_util/src/sys/cpu_time.rs | 238 ++++++++++++- etc/config-template.toml | 11 +- src/config.rs | 62 +++- src/coprocessor/endpoint.rs | 1 + src/coprocessor/mod.rs | 2 + src/coprocessor/statistics/analyze.rs | 27 +- src/server/service/diagnostics/sys.rs | 8 +- src/storage/mod.rs | 4 +- src/storage/txn/scheduler.rs | 2 +- 14 files changed, 693 insertions(+), 91 deletions(-) diff --git a/components/server/src/server.rs 
b/components/server/src/server.rs index 11f6071dbc6..cded99edfe3 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -108,8 +108,9 @@ use tikv_util::{ check_environment_variables, config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, math::MovingAvgU32, + metrics::INSTANCE_BACKEND_CPU_QUOTA, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{disk, register_memory_usage_high_water, SysQuota}, + sys::{cpu_time::ProcessStat, disk, register_memory_usage_high_water, SysQuota}, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, @@ -121,6 +122,19 @@ use crate::{ tikv_util::sys::thread::ThreadBuildWrapper, }; +// minimum number of core kept for background requests +const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; +// max ratio of core quota for background requests +const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; +// default ratio of core quota for background requests = core_number * 0.5 +const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; +// indication of TiKV instance is short of cpu +const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; +// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) +const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; +// pace of cpu quota adjustment +const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu + #[inline] fn run_impl(config: TiKvConfig) { let mut tikv = TiKvServer::::init(config); @@ -144,6 +158,7 @@ fn run_impl(config: TiKvConfig) { tikv.init_storage_stats_task(engines); tikv.run_server(server_config); tikv.run_status_server(); + tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); signal_handler::wait_for_signal(Some(tikv.engines.take().unwrap().engines)); tikv.stop(); @@ -185,6 +200,7 @@ const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); const 
DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); +const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. struct TiKvServer { @@ -279,11 +295,16 @@ impl TiKvServer { let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); let concurrency_manager = ConcurrencyManager::new(latest_ts); + // use different quota for front-end and back-end requests let quota_limiter = Arc::new(QuotaLimiter::new( config.quota.foreground_cpu_time, config.quota.foreground_write_bandwidth, config.quota.foreground_read_bandwidth, + config.quota.background_cpu_time, + config.quota.background_write_bandwidth, + config.quota.background_read_bandwidth, config.quota.max_delay_duration, + config.quota.enable_auto_tune, )); TiKvServer { @@ -1222,6 +1243,82 @@ impl TiKvServer { }); } + // Only background cpu quota tuning is implemented at present. 
iops and frontend quota tuning is on the way + fn init_quota_tuning_task(&self, quota_limiter: Arc) { + // No need to do auto tune when capacity is really low + if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO + < BACKGROUND_REQUEST_CORE_LOWER_BOUND + { + return; + }; + + // Determine the base cpu quota + let base_cpu_quota = { + // if cpu quota is not specified, start from optimistic case + if quota_limiter.cputime_limiter(false).is_infinite() { + let quota = 1000_f64 + * f64::max( + BACKGROUND_REQUEST_CORE_LOWER_BOUND, + SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, + ); + quota_limiter.set_cpu_time_limit(quota as usize, false); + quota + } else { + quota_limiter.cputime_limiter(false) / 1000_f64 + } + }; + + // Calculate the celling and floor quota + let celling_quota = f64::min( + base_cpu_quota * 2.0, + 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, + ); + let floor_quota = f64::max( + base_cpu_quota * 0.5, + 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, + ); + + let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); + self.background_worker.spawn_interval_task( + DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, + move || { + if quota_limiter.auto_tune_enabled() { + let old_quota = quota_limiter.cputime_limiter(false) / 1000_f64; + let cpu_usage = match proc_stats.cpu_usage() { + Ok(r) => r, + Err(_e) => 0.0, + }; + // Try tuning quota when cpu_usage is correctly collected. + // rule based tuning: + // 1) if instance is busy, shrink cpu quota for analyze by one quota pace until lower bound is hit; + // 2) if instance cpu usage is healthy, no op; + // 3) if instance is idle, increase cpu quota by one quota pace until upper bound is hit. 
+ if cpu_usage > 0.0f64 { + let mut target_quota = old_quota; + + let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); + if cpu_util >= SYSTEM_BUSY_THRESHOLD { + target_quota = + f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); + } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { + target_quota = + f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); + } + + if old_quota != target_quota { + quota_limiter.set_cpu_time_limit(target_quota as usize, false); + debug!( + "cpu_time_limiter tuned for backend request"; + "cpu_util" => ?cpu_util, + "new quota" => ?target_quota); + INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); + } + } + } + }, + ); + } + fn init_storage_stats_task(&self, engines: Engines) { let config_disk_capacity: u64 = self.config.raft_store.capacity.0; let data_dir = self.config.storage.data_dir.clone(); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 88e0b079a4d..ac6a72e3a06 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -381,7 +381,11 @@ impl ServerCluster { cfg.quota.foreground_cpu_time, cfg.quota.foreground_write_bandwidth, cfg.quota.foreground_read_bandwidth, + cfg.quota.background_cpu_time, + cfg.quota.background_write_bandwidth, + cfg.quota.background_read_bandwidth, cfg.quota.max_delay_duration, + cfg.quota.enable_auto_tune, )); let store = create_raft_storage::<_, _, _, F>( engine, diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 9e118f676b9..dc88c1f6993 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -465,7 +465,7 @@ impl BatchExecutorsRunner { )? 
}; - let quota_delay = self.quota_limiter.async_consume(sample).await; + let quota_delay = self.quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { NON_TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(ThrottleType::dag) diff --git a/components/tikv_util/src/metrics/mod.rs b/components/tikv_util/src/metrics/mod.rs index 46cc9931048..4b5a9abc2f7 100644 --- a/components/tikv_util/src/metrics/mod.rs +++ b/components/tikv_util/src/metrics/mod.rs @@ -77,6 +77,7 @@ make_auto_flush_static_metric! { pub label_enum ThrottleType { dag, analyze_full_sampling, + quota_limiter_auto_tuned, } pub struct NonTxnCommandThrottleTimeCounterVec: LocalIntCounter { @@ -102,6 +103,8 @@ lazy_static! { NON_TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC, NonTxnCommandThrottleTimeCounterVec ); + pub static ref INSTANCE_BACKEND_CPU_QUOTA: IntGauge = + register_int_gauge!("tikv_backend_cpu_quota", "cpu quota for backend request").unwrap(); } pub fn convert_record_pairs(m: HashMap) -> RecordPairVec { diff --git a/components/tikv_util/src/quota_limiter.rs b/components/tikv_util/src/quota_limiter.rs index 6179ab75da6..c9a761f49de 100644 --- a/components/tikv_util/src/quota_limiter.rs +++ b/components/tikv_util/src/quota_limiter.rs @@ -2,7 +2,7 @@ use std::{ sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicBool, AtomicU64, Ordering}, Arc, }, time::Duration, @@ -22,15 +22,59 @@ use super::{ // It's better to use a universal approach. const CPU_LIMITER_REFILL_DURATION: Duration = Duration::from_millis(100); -// Quota limiter allows users to obtain stable performance by increasing the -// completion time of tasks through restrictions of different metrics. 
+// Limter can be issued to cpu, write and read bandwidth #[derive(Debug)] -pub struct QuotaLimiter { +pub struct LimiterItems { cputime_limiter: Limiter, write_bandwidth_limiter: Limiter, read_bandwidth_limiter: Limiter, +} + +impl LimiterItems { + pub fn new( + cpu_quota: usize, + write_bandwidth: ReadableSize, + read_bandwidth: ReadableSize, + ) -> Self { + let cputime_limiter = + Limiter::builder(QuotaLimiter::speed_limit(cpu_quota as f64 * 1000_f64)) + .refill(CPU_LIMITER_REFILL_DURATION) + .build(); + + let write_bandwidth_limiter = + Limiter::new(QuotaLimiter::speed_limit(write_bandwidth.0 as f64)); + + let read_bandwidth_limiter = + Limiter::new(QuotaLimiter::speed_limit(read_bandwidth.0 as f64)); + + Self { + cputime_limiter, + write_bandwidth_limiter, + read_bandwidth_limiter, + } + } +} + +impl Default for LimiterItems { + fn default() -> Self { + Self { + cputime_limiter: Limiter::new(f64::INFINITY), + write_bandwidth_limiter: Limiter::new(f64::INFINITY), + read_bandwidth_limiter: Limiter::new(f64::INFINITY), + } + } +} + +// Quota limiter allows users to obtain stable performance by increasing the +// completion time of tasks through restrictions of different metrics. +#[derive(Debug)] +pub struct QuotaLimiter { + foreground_limiters: LimiterItems, + background_limiters: LimiterItems, // max delay nano seconds max_delay_duration: AtomicU64, + // if auto tune is enabled + enable_auto_tune: AtomicBool, } // Throttle must be consumed in quota limiter. 
@@ -86,11 +130,13 @@ impl<'a> Drop for CpuObserveGuard<'a> { impl Default for QuotaLimiter { fn default() -> Self { + let foreground_limiters = LimiterItems::default(); + let background_limiters = LimiterItems::default(); Self { - cputime_limiter: Limiter::new(f64::INFINITY), - write_bandwidth_limiter: Limiter::new(f64::INFINITY), - read_bandwidth_limiter: Limiter::new(f64::INFINITY), + foreground_limiters, + background_limiters, max_delay_duration: AtomicU64::new(0), + enable_auto_tune: AtomicBool::new(false), } } } @@ -98,26 +144,33 @@ impl Default for QuotaLimiter { impl QuotaLimiter { // 1000 millicpu equals to 1vCPU, 0 means unlimited pub fn new( - cpu_quota: usize, - write_bandwidth: ReadableSize, - read_bandwidth: ReadableSize, + foreground_cpu_quota: usize, + foreground_write_bandwidth: ReadableSize, + foreground_read_bandwidth: ReadableSize, + background_cpu_quota: usize, + background_write_bandwidth: ReadableSize, + background_read_bandwidth: ReadableSize, max_delay_duration: ReadableDuration, + enable_auto_tune: bool, ) -> Self { - let cputime_limiter = Limiter::builder(Self::speed_limit(cpu_quota as f64 * 1000_f64)) - .refill(CPU_LIMITER_REFILL_DURATION) - .build(); - - let write_bandwidth_limiter = Limiter::new(Self::speed_limit(write_bandwidth.0 as f64)); - - let read_bandwidth_limiter = Limiter::new(Self::speed_limit(read_bandwidth.0 as f64)); - + let foreground_limiters = LimiterItems::new( + foreground_cpu_quota, + foreground_write_bandwidth, + foreground_read_bandwidth, + ); + let background_limiters = LimiterItems::new( + background_cpu_quota, + background_write_bandwidth, + background_read_bandwidth, + ); let max_delay_duration = AtomicU64::new(max_delay_duration.0.as_nanos() as u64); + let enable_auto_tune = AtomicBool::new(enable_auto_tune); Self { - cputime_limiter, - write_bandwidth_limiter, - read_bandwidth_limiter, + foreground_limiters, + background_limiters, max_delay_duration, + enable_auto_tune, } } @@ -129,18 +182,30 @@ impl 
QuotaLimiter { } } - pub fn set_cpu_time_limit(&self, quota_limit: usize) { - self.cputime_limiter + #[inline] + fn get_limiters(&self, is_foreground: bool) -> &LimiterItems { + if is_foreground { + &self.foreground_limiters + } else { + &self.background_limiters + } + } + + pub fn set_cpu_time_limit(&self, quota_limit: usize, is_foreground: bool) { + self.get_limiters(is_foreground) + .cputime_limiter .set_speed_limit(Self::speed_limit(quota_limit as f64 * 1000_f64)); } - pub fn set_write_bandwidth_limit(&self, write_bandwidth: ReadableSize) { - self.write_bandwidth_limiter + pub fn set_write_bandwidth_limit(&self, write_bandwidth: ReadableSize, is_foreground: bool) { + self.get_limiters(is_foreground) + .write_bandwidth_limiter .set_speed_limit(Self::speed_limit(write_bandwidth.0 as f64)); } - pub fn set_read_bandwidth_limit(&self, read_bandwidth: ReadableSize) { - self.read_bandwidth_limiter + pub fn set_read_bandwidth_limit(&self, read_bandwidth: ReadableSize, is_foreground: bool) { + self.get_limiters(is_foreground) + .read_bandwidth_limiter .set_speed_limit(Self::speed_limit(read_bandwidth.0 as f64)); } @@ -149,39 +214,68 @@ impl QuotaLimiter { .store(duration.0.as_nanos() as u64, Ordering::Relaxed); } + pub fn set_enable_auto_tune(&self, enable_auto_tune: bool) { + self.enable_auto_tune + .store(enable_auto_tune, Ordering::Relaxed); + } + + pub fn cputime_limiter(&self, is_foreground: bool) -> f64 { + self.get_limiters(is_foreground) + .cputime_limiter + .speed_limit() + } + fn max_delay_duration(&self) -> Duration { Duration::from_nanos(self.max_delay_duration.load(Ordering::Relaxed)) } + pub fn auto_tune_enabled(&self) -> bool { + self.enable_auto_tune.load(Ordering::Relaxed) + } + // To generate a sampler. 
pub fn new_sample(&self) -> Sample { Sample { read_bytes: 0, write_bytes: 0, cpu_time: Duration::ZERO, - enable_cpu_limit: !self.cputime_limiter.speed_limit().is_infinite(), + enable_cpu_limit: !self + .foreground_limiters + .cputime_limiter + .speed_limit() + .is_infinite() + || !self + .background_limiters + .cputime_limiter + .speed_limit() + .is_infinite(), } } // To consume a sampler and return delayed duration. // If the sampler is null, the speed limiter will just return ZERO. - pub async fn async_consume(&self, sample: Sample) -> Duration { + pub async fn consume_sample(&self, sample: Sample, is_foreground: bool) -> Duration { + let limiters = self.get_limiters(is_foreground); + let cpu_dur = if sample.cpu_time > Duration::ZERO { - self.cputime_limiter + limiters + .cputime_limiter .consume_duration(sample.cpu_time.as_micros() as usize) } else { Duration::ZERO }; let w_bw_dur = if sample.write_bytes > 0 { - self.write_bandwidth_limiter + limiters + .write_bandwidth_limiter .consume_duration(sample.write_bytes) } else { Duration::ZERO }; let r_bw_dur = if sample.read_bytes > 0 { - self.read_bandwidth_limiter + limiters + .read_bandwidth_limiter .consume_duration(sample.read_bytes) } else { Duration::ZERO @@ -206,12 +300,12 @@ impl QuotaLimiter { } pub struct QuotaLimitConfigManager { - limiter: Arc, + quota_limiter: Arc, } impl QuotaLimitConfigManager { - pub fn new(limiter: Arc) -> Self { - Self { limiter } + pub fn new(quota_limiter: Arc) -> Self { + Self { quota_limiter } } } @@ -221,22 +315,46 @@ impl ConfigManager for QuotaLimitConfigManager { change: ConfigChange, ) -> std::result::Result<(), Box> { if let Some(cpu_limit) = change.get("foreground_cpu_time") { - self.limiter.set_cpu_time_limit(cpu_limit.into()); + self.quota_limiter + .set_cpu_time_limit(cpu_limit.into(), true); } + if let Some(write_bandwidth) = change.get("foreground_write_bandwidth") { - self.limiter - .set_write_bandwidth_limit(write_bandwidth.clone().into()) + self.quota_limiter + 
.set_write_bandwidth_limit(write_bandwidth.clone().into(), true); } + if let Some(read_bandwidth) = change.get("foreground_read_bandwidth") { - self.limiter - .set_write_bandwidth_limit(read_bandwidth.clone().into()); + self.quota_limiter + .set_read_bandwidth_limit(read_bandwidth.clone().into(), true); + } + + if let Some(cpu_limit) = change.get("background_cpu_time") { + self.quota_limiter + .set_cpu_time_limit(cpu_limit.into(), false); + } + + if let Some(write_bandwidth) = change.get("background_write_bandwidth") { + self.quota_limiter + .set_write_bandwidth_limit(write_bandwidth.clone().into(), false); + } + + if let Some(read_bandwidth) = change.get("background_read_bandwidth") { + self.quota_limiter + .set_read_bandwidth_limit(read_bandwidth.clone().into(), false); } + if let Some(duration) = change.get("max_delay_duration") { let delay_dur: ReadableDuration = duration.clone().into(); - self.limiter + self.quota_limiter .max_delay_duration .store(delay_dur.0.as_nanos() as u64, Ordering::Relaxed); } + + if let Some(enable_auto_tune) = change.get("enable_auto_tune") { + self.quota_limiter + .set_enable_auto_tune(enable_auto_tune.clone().into()); + } Ok(()) } } @@ -252,10 +370,14 @@ mod tests { // refill duration = 100ms // bucket capacity = 100 let quota_limiter = QuotaLimiter::new( + 1000, + ReadableSize::kb(1), + ReadableSize::kb(1), 1000, ReadableSize::kb(1), ReadableSize::kb(1), ReadableDuration::millis(0), + false, ); let thread_start_time = ThreadTime::now(); @@ -269,81 +391,160 @@ mod tests { let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(60)); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::ZERO); + + let mut sample = quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(50)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + 
check_duration(should_delay, Duration::from_millis(110)); + + std::thread::sleep(Duration::from_millis(10)); + + let mut sample = quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(20)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + // should less 60+50+20 + assert!(should_delay < Duration::from_millis(130)); + + let mut sample = quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(200)); + sample.add_write_bytes(256); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(250)); + + // ThreadTime elapsed time is not long. + assert!(thread_start_time.elapsed() < Duration::from_millis(50)); + + quota_limiter.set_cpu_time_limit(2000, true); + let mut sample = quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(200)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(100)); + + quota_limiter.set_read_bandwidth_limit(ReadableSize(512), true); + let mut sample = quota_limiter.new_sample(); + sample.add_read_bytes(128); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(250)); + + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(2), true); + let mut sample = quota_limiter.new_sample(); + sample.add_write_bytes(256); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(125)); + + quota_limiter.set_max_delay_duration(ReadableDuration::millis(40)); + let mut sample = quota_limiter.new_sample(); + sample.add_read_bytes(256); + sample.add_write_bytes(512); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(40)); + + // test change limiter to 0 + quota_limiter.set_cpu_time_limit(0, true); + let mut sample = 
quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(100)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::ZERO); + + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(0), true); + let mut sample = quota_limiter.new_sample(); + sample.add_write_bytes(256); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::ZERO); + + quota_limiter.set_read_bandwidth_limit(ReadableSize::kb(0), true); + let mut sample = quota_limiter.new_sample(); + sample.add_read_bytes(256); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::ZERO); + + // set bandwidth back + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(1), true); + quota_limiter.set_max_delay_duration(ReadableDuration::millis(0)); + let mut sample = quota_limiter.new_sample(); + sample.add_write_bytes(128); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + check_duration(should_delay, Duration::from_millis(125)); + + let mut sample = quota_limiter.new_sample(); + sample.add_cpu_time(Duration::from_millis(60)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(50)); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(110)); std::thread::sleep(Duration::from_millis(10)); let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(20)); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); // should less 60+50+20 assert!(should_delay < Duration::from_millis(130)); 
let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(200)); sample.add_write_bytes(256); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(250)); // ThreadTime elapsed time is not long. assert!(thread_start_time.elapsed() < Duration::from_millis(50)); - quota_limiter.set_cpu_time_limit(2000); + quota_limiter.set_cpu_time_limit(2000, false); let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(200)); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(100)); - quota_limiter.set_read_bandwidth_limit(ReadableSize(512)); + quota_limiter.set_read_bandwidth_limit(ReadableSize(512), false); let mut sample = quota_limiter.new_sample(); sample.add_read_bytes(128); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(250)); - quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(2)); + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(2), false); let mut sample = quota_limiter.new_sample(); sample.add_write_bytes(256); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(125)); quota_limiter.set_max_delay_duration(ReadableDuration::millis(40)); let mut sample = quota_limiter.new_sample(); sample.add_read_bytes(256); sample.add_write_bytes(512); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, 
Duration::from_millis(40)); // test change limiter to 0 - quota_limiter.set_cpu_time_limit(0); + quota_limiter.set_cpu_time_limit(0, false); let mut sample = quota_limiter.new_sample(); sample.add_cpu_time(Duration::from_millis(100)); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); - quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(0)); + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(0), false); let mut sample = quota_limiter.new_sample(); sample.add_write_bytes(256); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); - quota_limiter.set_read_bandwidth_limit(ReadableSize::kb(0)); + quota_limiter.set_read_bandwidth_limit(ReadableSize::kb(0), false); let mut sample = quota_limiter.new_sample(); sample.add_read_bytes(256); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); // set bandwidth back - quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(1)); + quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(1), false); quota_limiter.set_max_delay_duration(ReadableDuration::millis(0)); let mut sample = quota_limiter.new_sample(); sample.add_write_bytes(128); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(125)); } } diff --git a/components/tikv_util/src/sys/cpu_time.rs b/components/tikv_util/src/sys/cpu_time.rs index ff9515168c7..69fbb2fb251 100644 --- a/components/tikv_util/src/sys/cpu_time.rs +++ b/components/tikv_util/src/sys/cpu_time.rs @@ -2,12 +2,15 @@ // Modified from 
https://github.com/rust-lang/cargo/blob/426fae51f39ebf6c545a2c12f78bc09fbfdb7aa9/src/cargo/util/cpu.rs // TODO: Maybe use https://github.com/heim-rs/heim is better after https://github.com/heim-rs/heim/issues/233 is fixed. -use std::io; +use std::{ + io, mem, + time::{Duration, Instant}, +}; use derive_more::{Add, Sub}; -#[derive(Debug, Clone, Copy, Add, Sub)] -pub struct LiunxStyleCpuTime { +#[derive(Add, Sub)] +pub struct LinuxStyleCpuTime { pub user: u64, pub nice: u64, pub system: u64, @@ -20,7 +23,7 @@ pub struct LiunxStyleCpuTime { pub guest_nice: u64, } -impl LiunxStyleCpuTime { +impl LinuxStyleCpuTime { pub fn total(&self) -> u64 { // Note: guest(_nice) is not counted, since it is already in user. // See https://unix.stackexchange.com/questions/178045/proc-stat-is-guest-counted-into-user-time @@ -34,19 +37,57 @@ impl LiunxStyleCpuTime { + self.steal } - pub fn current() -> io::Result { + pub fn current() -> io::Result { imp::current() } } +pub use std::io::Result; + +pub use imp::cpu_time; + +/// A struct to monitor process cpu usage +#[derive(Clone, Copy)] +pub struct ProcessStat { + current_time: Instant, + cpu_time: Duration, +} + +impl ProcessStat { + pub fn cur_proc_stat() -> io::Result { + Ok(ProcessStat { + current_time: Instant::now(), + cpu_time: imp::cpu_time()?, + }) + } + + /// return the cpu usage from last invoke, + /// or when this struct created if it is the first invoke. 
+ pub fn cpu_usage(&mut self) -> io::Result { + let new_time = imp::cpu_time()?; + let old_time = mem::replace(&mut self.cpu_time, new_time); + + let old_now = mem::replace(&mut self.current_time, Instant::now()); + let real_time = self.current_time.duration_since(old_now).as_secs_f64(); + + if real_time > 0.0 { + let cpu_time = new_time + .checked_sub(old_time) + .map(|dur| dur.as_secs_f64()) + .unwrap_or(0.0); + + Ok(cpu_time / real_time) + } else { + Ok(0.0) + } + } +} + #[cfg(target_os = "linux")] mod imp { - use std::{ - fs::File, - io::{self, Read}, - }; + use std::{fs::File, io, io::Read, time::Duration}; - pub fn current() -> io::Result { + pub fn current() -> io::Result { let mut state = String::new(); File::open("/proc/stat")?.read_to_string(&mut state)?; @@ -55,7 +96,7 @@ mod imp { if parts.next()? != "cpu" { return None; } - Some(super::LiunxStyleCpuTime { + Some(super::LinuxStyleCpuTime { user: parts.next()?.parse::().ok()?, nice: parts.next()?.parse::().ok()?, system: parts.next()?.parse::().ok()?, @@ -70,6 +111,19 @@ mod imp { })() .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "first line of /proc/stat malformed")) } + + pub fn cpu_time() -> io::Result { + let mut time = libc::timespec { + tv_sec: 0, + tv_nsec: 0, + }; + + if unsafe { libc::clock_gettime(libc::CLOCK_PROCESS_CPUTIME_ID, &mut time) } == 0 { + Ok(Duration::new(time.tv_sec as u64, time.tv_nsec as u32)) + } else { + Err(io::Error::last_os_error()) + } + } } #[cfg(target_os = "macos")] @@ -78,7 +132,7 @@ mod imp { use libc::*; - pub fn current() -> io::Result { + pub fn current() -> io::Result { // There's scant little documentation on `host_processor_info` // throughout the internet, so this is just modeled after what everyone // else is doing. For now this is modeled largely after libuv. 
@@ -98,7 +152,7 @@ mod imp { return Err(io::Error::from_raw_os_error(ret)); } - let mut ret = super::LiunxStyleCpuTime { + let mut ret = super::LinuxStyleCpuTime { user: 0, system: 0, idle: 0, @@ -122,16 +176,172 @@ mod imp { Ok(ret) } } + + pub fn cpu_time() -> io::Result { + let mut time = unsafe { std::mem::zeroed() }; + + if unsafe { libc::getrusage(libc::RUSAGE_SELF, &mut time) } == 0 { + let sec = time.ru_utime.tv_sec as u64 + time.ru_stime.tv_sec as u64; + let nsec = (time.ru_utime.tv_usec as u32 + time.ru_stime.tv_usec as u32) * 1000; + + Ok(std::time::Duration::new(sec, nsec)) + } else { + Err(io::Error::last_os_error()) + } + } } #[cfg(not(any(target_os = "linux", target_os = "macos")))] mod imp { use std::io; - pub fn current() -> io::Result { + pub fn current() -> io::Result { Err(io::Error::new( io::ErrorKind::Other, "unsupported platform to learn CPU state", )) } + + use std::{io, mem, time::Duration}; + + use scopeguard::defer; + use winapi::{ + shared::{ + minwindef::FILETIME, + ntdef::{FALSE, NULL}, + }, + um::{ + handleapi::CloseHandle, + processthreadsapi::{ + GetCurrentProcess, GetCurrentThreadId, GetProcessTimes, GetSystemTimes, + GetThreadTimes, OpenThread, + }, + sysinfoapi::{GetSystemInfo, SYSTEM_INFO}, + winnt::THREAD_QUERY_INFORMATION, + }, + }; + + /// convert to u64, unit 100 ns + fn filetime_to_ns100(ft: FILETIME) -> u64 { + ((ft.dwHighDateTime as u64) << 32) + ft.dwLowDateTime as u64 + } + + fn get_sys_times() -> io::Result<(u64, u64, u64)> { + let mut idle = FILETIME::default(); + let mut kernel = FILETIME::default(); + let mut user = FILETIME::default(); + + let ret = unsafe { GetSystemTimes(&mut idle, &mut kernel, &mut user) }; + if ret == 0 { + return Err(io::Error::last_os_error()); + } + + let idle = filetime_to_ns100(idle); + let kernel = filetime_to_ns100(kernel); + let user = filetime_to_ns100(user); + Ok((idle, kernel, user)) + } + + fn get_thread_times(tid: u32) -> io::Result<(u64, u64)> { + let handler = unsafe { 
OpenThread(THREAD_QUERY_INFORMATION, FALSE as i32, tid) }; + if handler == NULL { + return Err(io::Error::last_os_error()); + } + defer! {{ + unsafe { CloseHandle(handler) }; + }} + + let mut create_time = FILETIME::default(); + let mut exit_time = FILETIME::default(); + let mut kernel_time = FILETIME::default(); + let mut user_time = FILETIME::default(); + + let ret = unsafe { + GetThreadTimes( + handler, + &mut create_time, + &mut exit_time, + &mut kernel_time, + &mut user_time, + ) + }; + if ret == 0 { + return Err(io::Error::last_os_error()); + } + + let kernel_time = filetime_to_ns100(kernel_time); + let user_time = filetime_to_ns100(user_time); + Ok((kernel_time, user_time)) + } + + #[inline] + pub fn cpu_time() -> io::Result { + let (kernel_time, user_time) = unsafe { + let process = GetCurrentProcess(); + let mut create_time = mem::zeroed(); + let mut exit_time = mem::zeroed(); + let mut kernel_time = mem::zeroed(); + let mut user_time = mem::zeroed(); + + let ret = GetProcessTimes( + process, + &mut create_time, + &mut exit_time, + &mut kernel_time, + &mut user_time, + ); + + if ret != 0 { + (kernel_time, user_time) + } else { + return Err(io::Error::last_os_error()); + } + }; + + let kt = filetime_to_ns100(kernel_time); + let ut = filetime_to_ns100(user_time); + + // convert ns + // + // Note: make it ns unit may overflow in some cases. + // For example, a machine with 128 cores runs for one year. + let cpu = (kt + ut) * 100; + + // make it un-normalized + let cpu = cpu * processor_numbers()? as u64; + + Ok(Duration::from_nanos(cpu)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // this test should be executed alone. 
+ #[test] + fn test_process_usage() { + let mut stat = ProcessStat::cur_proc_stat().unwrap(); + + std::thread::sleep(std::time::Duration::from_secs(1)); + + let usage = stat.cpu_usage().unwrap(); + + assert!(usage < 0.01); + + let num = 1; + for _ in 0..num * 10 { + std::thread::spawn(move || { + loop { + let _ = (0..10_000_000).into_iter().sum::(); + } + }); + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + + let usage = stat.cpu_usage().unwrap(); + + assert!(usage > 0.9_f64) + } } diff --git a/etc/config-template.toml b/etc/config-template.toml index ab2ffa28acf..2195e681f62 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -52,8 +52,17 @@ # foreground-write-bandwidth = "0B" ## Read bandwidth limitation for this TiKV instance, default value is 0 which means unlimited. # foreground-read-bandwidth = "0B" -## Limitation of max delay duration for each request, default value is 0 which means unlimited. +## CPU quota for these background requests can use, default value is 0, it means unlimited. +## The unit is millicpu but for now this config is approximate and soft limit. +# background-cpu-time = 0 +## Write bandwidth limitation for backgroud request for this TiKV instance, default value is 0 which means unlimited. +# background-write-bandwidth = "0B" +## Read bandwidth limitation for background request for this TiKV instance, default value is 0 which means unlimited. +# background-read-bandwidth = "0B" +## Limitation of max delay duration, default value is 0 which means unlimited. # max-delay-duration = "500ms" +## Whether to enable quota auto tune +# enable-auto-tune = false [log] ## Log levels: debug, info, warn, error, fatal. 
diff --git a/src/config.rs b/src/config.rs index 3ff087f129c..d3ec96f6ba4 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2629,6 +2629,10 @@ pub struct QuotaConfig { pub foreground_write_bandwidth: ReadableSize, pub foreground_read_bandwidth: ReadableSize, pub max_delay_duration: ReadableDuration, + pub background_cpu_time: usize, + pub background_write_bandwidth: ReadableSize, + pub background_read_bandwidth: ReadableSize, + pub enable_auto_tune: bool, } impl Default for QuotaConfig { @@ -2638,6 +2642,10 @@ impl Default for QuotaConfig { foreground_write_bandwidth: ReadableSize(0), foreground_read_bandwidth: ReadableSize(0), max_delay_duration: ReadableDuration::millis(500), + background_cpu_time: 0, + background_write_bandwidth: ReadableSize(0), + background_read_bandwidth: ReadableSize(0), + enable_auto_tune: false, } } } @@ -4588,6 +4596,9 @@ mod tests { cfg.quota.foreground_cpu_time = 1000; cfg.quota.foreground_write_bandwidth = ReadableSize::mb(128); cfg.quota.foreground_read_bandwidth = ReadableSize::mb(256); + cfg.quota.background_cpu_time = 1000; + cfg.quota.background_write_bandwidth = ReadableSize::mb(128); + cfg.quota.background_read_bandwidth = ReadableSize::mb(256); cfg.quota.max_delay_duration = ReadableDuration::secs(1); cfg.validate().unwrap(); @@ -4595,7 +4606,11 @@ mod tests { cfg.quota.foreground_cpu_time, cfg.quota.foreground_write_bandwidth, cfg.quota.foreground_read_bandwidth, + cfg.quota.background_cpu_time, + cfg.quota.background_write_bandwidth, + cfg.quota.background_read_bandwidth, cfg.quota.max_delay_duration, + false, )); let cfg_controller = ConfigController::new(cfg.clone()); @@ -4627,7 +4642,7 @@ mod tests { let mut sample = quota_limiter.new_sample(); sample.add_read_bytes(ReadableSize::mb(32).0 as usize); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); assert_eq!(should_delay, Duration::from_millis(125)); cfg_controller @@ -4637,8 
+4652,35 @@ mod tests { assert_eq!(cfg_controller.get_current(), cfg); let mut sample = quota_limiter.new_sample(); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); - let should_delay = block_on(quota_limiter.async_consume(sample)); - assert_eq!(should_delay, Duration::from_millis(250)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); + assert_eq!(should_delay, Duration::from_millis(500)); + + cfg_controller + .update_config("quota.background-cpu-time", "2000") + .unwrap(); + cfg.quota.background_cpu_time = 2000; + assert_eq!(cfg_controller.get_current(), cfg); + + cfg_controller + .update_config("quota.background-write-bandwidth", "256MB") + .unwrap(); + cfg.quota.background_write_bandwidth = ReadableSize::mb(256); + assert_eq!(cfg_controller.get_current(), cfg); + + let mut sample = quota_limiter.new_sample(); + sample.add_read_bytes(ReadableSize::mb(32).0 as usize); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); + assert_eq!(should_delay, Duration::from_millis(125)); + + cfg_controller + .update_config("quota.background-read-bandwidth", "512MB") + .unwrap(); + cfg.quota.background_read_bandwidth = ReadableSize::mb(512); + assert_eq!(cfg_controller.get_current(), cfg); + let mut sample = quota_limiter.new_sample(); + sample.add_write_bytes(ReadableSize::mb(128).0 as usize); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); + assert_eq!(should_delay, Duration::from_millis(500)); cfg_controller .update_config("quota.max-delay-duration", "50ms") @@ -4647,8 +4689,20 @@ mod tests { assert_eq!(cfg_controller.get_current(), cfg); let mut sample = quota_limiter.new_sample(); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); - let should_delay = block_on(quota_limiter.async_consume(sample)); + let should_delay = block_on(quota_limiter.consume_sample(sample, true)); assert_eq!(should_delay, Duration::from_millis(50)); + + let mut sample = quota_limiter.new_sample(); + 
sample.add_write_bytes(ReadableSize::mb(128).0 as usize); + let should_delay = block_on(quota_limiter.consume_sample(sample, false)); + assert_eq!(should_delay, Duration::from_millis(50)); + + assert_eq!(cfg.quota.enable_auto_tune, false); + cfg_controller + .update_config("quota.enable-auto-tune", "true") + .unwrap(); + cfg.quota.enable_auto_tune = true; + assert_eq!(cfg_controller.get_current(), cfg); } #[test] diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index fa1dce909a2..9f2507562e6 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -274,6 +274,7 @@ impl Endpoint { }); self.check_memory_locks(&req_ctx)?; + let quota_limiter = self.quota_limiter.clone(); builder = Box::new(move |snap, req_ctx| { diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 38d6dccc441..834033a60e1 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -54,6 +54,8 @@ pub const REQ_TYPE_DAG: i64 = 103; pub const REQ_TYPE_ANALYZE: i64 = 104; pub const REQ_TYPE_CHECKSUM: i64 = 105; +pub const REQ_FLAG_TIDB_SYSSESSION: u64 = 2048; + type HandlerStreamStepResult = Result<(Option, bool)>; /// An interface for all kind of Coprocessor request handlers. 
diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 2a8fc6ee81c..7b826487cc1 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -51,6 +51,7 @@ pub struct AnalyzeContext { ranges: Vec, storage_stats: Statistics, quota_limiter: Arc, + is_auto_analyze: bool, } impl AnalyzeContext { @@ -71,12 +72,15 @@ impl AnalyzeContext { req_ctx.access_locks.clone(), false, ); + let is_auto_analyze = req.get_flags() & REQ_FLAG_TIDB_SYSSESSION > 0; + Ok(Self { req, storage: Some(TiKvStorage::new(store, false)), ranges, storage_stats: Statistics::default(), quota_limiter, + is_auto_analyze, }) } @@ -272,8 +276,15 @@ impl RequestHandler for AnalyzeContext { let col_req = self.req.take_col_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = - RowSampleBuilder::new(col_req, storage, ranges, self.quota_limiter.clone())?; + + let mut builder = RowSampleBuilder::new( + col_req, + storage, + ranges, + self.quota_limiter.clone(), + self.is_auto_analyze, + )?; + let res = AnalyzeContext::handle_full_sampling(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -314,6 +325,7 @@ struct RowSampleBuilder { columns_info: Vec, column_groups: Vec, quota_limiter: Arc, + is_quota_auto_tune: bool, } impl RowSampleBuilder { @@ -322,6 +334,7 @@ impl RowSampleBuilder { storage: TiKvStorage>, ranges: Vec, quota_limiter: Arc, + is_quota_auto_tune: bool, ) -> Result { let columns_info: Vec<_> = req.take_columns_info().into(); if columns_info.is_empty() { @@ -346,6 +359,7 @@ impl RowSampleBuilder { columns_info, column_groups: req.take_column_groups().into(), quota_limiter, + is_quota_auto_tune, }) } @@ -431,7 +445,14 @@ impl RowSampleBuilder { } // Don't let analyze bandwidth limit the quota limiter, this is already limited in rate limiter. 
- let quota_delay = self.quota_limiter.async_consume(sample).await; + let quota_delay = { + if !self.is_quota_auto_tune { + self.quota_limiter.consume_sample(sample, true).await + } else { + self.quota_limiter.consume_sample(sample, false).await + } + }; + if !quota_delay.is_zero() { NON_TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(ThrottleType::analyze_full_sampling) diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 742b8a8cb55..c0cc3eb1c6a 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -5,13 +5,13 @@ use std::{collections::HashMap, string::ToString}; use kvproto::diagnosticspb::{ServerInfoItem, ServerInfoPair}; use tikv_util::{ config::KIB, - sys::{cpu_time::LiunxStyleCpuTime, SysQuota, *}, + sys::{cpu_time::LinuxStyleCpuTime, SysQuota, *}, }; use walkdir::WalkDir; use crate::server::service::diagnostics::{ioload, SYS_INFO}; -type CpuTimeSnapshot = Option; +type CpuTimeSnapshot = Option; #[derive(Clone, Debug)] pub struct NicSnapshot { @@ -87,7 +87,7 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) return; } - let t2 = LiunxStyleCpuTime::current(); + let t2 = LinuxStyleCpuTime::current(); if t2.is_err() { return; } @@ -265,7 +265,7 @@ fn io_load_info(prev_io: HashMap, collector: &mut Vec CpuTimeSnapshot { - let t1 = LiunxStyleCpuTime::current(); + let t1 = LinuxStyleCpuTime::current(); if t1.is_err() { return None; } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 7026ebab77d..768579f0b15 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -655,7 +655,7 @@ impl Storage { .as_ref() .map_or(0, |v| v.len()); sample.add_read_bytes(read_bytes); - let quota_delay = quota_limiter.async_consume(sample).await; + let quota_delay = quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(CMD) @@ -996,7 +996,7 @@ impl Storage { + 
stats.cf_statistics(CF_LOCK).flow_stats.read_bytes + stats.cf_statistics(CF_WRITE).flow_stats.read_bytes; sample.add_read_bytes(read_bytes); - let quota_delay = quota_limiter.async_consume(sample).await; + let quota_delay = quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(CMD) diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index ab866fe18bf..a9b34b9b189 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -813,7 +813,7 @@ impl Scheduler { + statistics.cf_statistics(CF_LOCK).flow_stats.read_bytes + statistics.cf_statistics(CF_WRITE).flow_stats.read_bytes; sample.add_read_bytes(read_bytes); - let quota_delay = quota_limiter.async_consume(sample).await; + let quota_delay = quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(tag) From 1d66eddeb904de6222ba9d3dd94a7bef04af0725 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 23 Jun 2022 11:26:37 -0700 Subject: [PATCH 042/676] raftstorev2: add bootstrapping (#12877) ref tikv/tikv#12842 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 6 + components/engine_panic/src/raft_engine.rs | 49 +++- components/engine_rocks/src/raft_engine.rs | 57 ++++- components/engine_traits/src/raft_engine.rs | 21 +- components/raft_log_engine/src/engine.rs | 84 +++++- components/raftstore-v2/Cargo.toml | 21 +- components/raftstore-v2/src/bootstrap.rs | 241 ++++++++++++++++++ components/raftstore-v2/src/fsm/mod.rs | 5 + components/raftstore-v2/src/fsm/store.rs | 2 + components/raftstore-v2/src/lib.rs | 8 +- components/raftstore-v2/src/raft/mod.rs | 2 +- components/raftstore-v2/src/raft/storage.rs | 82 +++++- components/raftstore-v2/src/router/message.rs | 2 +- .../raftstore-v2/tests/failpoints/mod.rs | 8 + .../tests/failpoints/test_bootstrap.rs | 61 +++++ components/test_pd/src/mocker/service.rs | 2 +- src/server/node.rs | 
10 +- 17 files changed, 641 insertions(+), 20 deletions(-) create mode 100644 components/raftstore-v2/src/bootstrap.rs create mode 100644 components/raftstore-v2/tests/failpoints/mod.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_bootstrap.rs diff --git a/Cargo.lock b/Cargo.lock index cedc1229d0b..489ef39eaec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4168,15 +4168,21 @@ version = "0.1.0" dependencies = [ "collections", "crossbeam", + "engine_test", "engine_traits", "error_code", + "fail", "kvproto", "pd_client", "raft", "raft-proto", "raftstore", "slog", + "slog-global", "smallvec", + "tempfile", + "test_pd", + "test_util", "tikv_util", ] diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 9842e1100ed..384bc60ffa6 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -1,7 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{Error, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, Result}; -use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, +}; use raft::eraftpb::Entry; use crate::{engine::PanicEngine, write_batch::PanicWriteBatch}; @@ -29,6 +32,26 @@ impl RaftEngineReadOnly for PanicEngine { fn get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { panic!() } + + fn is_empty(&self) -> Result { + panic!() + } + + fn get_store_ident(&self) -> Result> { + panic!() + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + panic!() + } + + fn get_region_state(&self, raft_group_id: u64) -> Result> { + panic!() + } + + fn get_apply_state(&self, raft_group_id: u64) -> Result> { + panic!() + } } impl RaftEngineDebug for PanicEngine { @@ -114,6 +137,10 @@ impl RaftEngine for PanicEngine { fn get_engine_size(&self) -> Result { panic!() } + + fn 
put_store_ident(&self, ident: &StoreIdent) -> Result<()> { + panic!() + } } impl RaftLogBatch for PanicWriteBatch { @@ -140,4 +167,24 @@ impl RaftLogBatch for PanicWriteBatch { fn merge(&mut self, _: Self) -> Result<()> { panic!() } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + panic!() + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + panic!() + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + panic!() + } + + fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + panic!() + } + + fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + panic!() + } } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index e081d057191..57a65ba661f 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -6,7 +6,10 @@ use engine_traits::{ RaftEngineReadOnly, RaftLogBatch, RaftLogGCTask, Result, SyncMutable, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, }; -use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, +}; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{box_err, box_try}; @@ -117,6 +120,34 @@ impl RaftEngineReadOnly for RocksEngine { )?; Ok(()) } + + fn is_empty(&self) -> Result { + let mut is_empty = true; + self.scan_cf(CF_DEFAULT, b"", b"", false, |_, _| { + is_empty = false; + Ok(false) + })?; + + Ok(is_empty) + } + + fn get_store_ident(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::STORE_IDENT_KEY) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::PREPARE_BOOTSTRAP_KEY) + } + + fn get_region_state(&self, raft_group_id: u64) -> Result> { + let key = keys::region_state_key(raft_group_id); + 
self.get_msg_cf(CF_DEFAULT, &key) + } + + fn get_apply_state(&self, raft_group_id: u64) -> Result> { + let key = keys::apply_state_key(raft_group_id); + self.get_msg_cf(CF_DEFAULT, &key) + } } impl RaftEngineDebug for RocksEngine { @@ -303,6 +334,10 @@ impl RaftEngine for RocksEngine { Ok(used_size) } + + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { + self.put_msg(keys::STORE_IDENT_KEY, ident) + } } impl RaftLogBatch for RocksWriteBatch { @@ -336,6 +371,26 @@ impl RaftLogBatch for RocksWriteBatch { fn merge(&mut self, src: Self) -> Result<()> { WriteBatch::merge(self, src) } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.put_msg(keys::STORE_IDENT_KEY, ident) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.delete(keys::PREPARE_BOOTSTRAP_KEY) + } + + fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + self.put_msg(&keys::region_state_key(raft_group_id), state) + } + + fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + self.put_msg(&keys::apply_state_key(raft_group_id), state) + } } impl RocksWriteBatch { diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index a0697218cf7..03cb2a41a41 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -1,6 +1,9 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, +}; use raft::eraftpb::Entry; use crate::*; @@ -8,7 +11,14 @@ use crate::*; pub const RAFT_LOG_MULTI_GET_CNT: u64 = 8; pub trait RaftEngineReadOnly: Sync + Send + 'static { + fn is_empty(&self) -> Result; + + fn get_store_ident(&self) -> Result>; + fn get_prepare_bootstrap_region(&self) -> Result>; + fn get_raft_state(&self, raft_group_id: u64) -> Result>; + fn get_region_state(&self, raft_group_id: u64) -> Result>; + fn get_apply_state(&self, raft_group_id: u64) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -89,6 +99,8 @@ pub trait RaftEngine: RaftEngineReadOnly + Clone + Sync + Send + 'static { /// Note: `RaftLocalState` won't be updated in this call. fn append(&self, raft_group_id: u64, entries: Vec) -> Result; + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()>; + fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; /// Like `cut_logs` but the range could be very large. Return the deleted count. @@ -135,7 +147,14 @@ pub trait RaftLogBatch: Send { /// Remove Raft logs in [`from`, `to`) which will be overwritten later. fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64); + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()>; + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()>; + fn remove_prepare_bootstrap_region(&mut self) -> Result<()>; + fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; + fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()>; + fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()>; /// The data size of this RaftLogBatch. 
fn persist_size(&self) -> usize; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 9707bdb28b7..ae895f1ac36 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -13,7 +13,10 @@ use engine_traits::{ RaftLogBatch as RaftLogBatchTrait, RaftLogGCTask, Result, }; use file_system::{IOOp, IORateLimiter, IOType}; -use kvproto::raft_serverpb::RaftLocalState; +use kvproto::{ + metapb::Region, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, +}; use raft::eraftpb::Entry; use raft_engine::{ env::{DefaultFileSystem, FileSystem, Handle, WriteExt}, @@ -22,6 +25,9 @@ use raft_engine::{ pub use raft_engine::{Config as RaftEngineConfig, ReadableSize, RecoveryMode}; use tikv_util::Either; +// A special region ID representing global state. +const STORE_REGION_ID: u64 = 0; + #[derive(Clone)] pub struct MessageExtTyped; @@ -259,6 +265,10 @@ impl RaftLogEngine { pub struct RaftLogBatch(LogBatch); const RAFT_LOG_STATE_KEY: &[u8] = b"R"; +const STORE_IDENT_KEY: &[u8] = &[0x01]; +const PREPARE_BOOTSTRAP_REGION_KEY: &[u8] = &[0x02]; +const REGION_STATE_KEY: &[u8] = &[0x03]; +const APPLY_STATE_KEY: &[u8] = &[0x04]; impl RaftLogBatchTrait for RaftLogBatch { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { @@ -288,6 +298,40 @@ impl RaftLogBatchTrait for RaftLogBatch { fn merge(&mut self, mut src: Self) -> Result<()> { self.0.merge(&mut src.0).map_err(transfer_error) } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.0 + .put_message(STORE_REGION_ID, STORE_IDENT_KEY.to_vec(), ident) + .map_err(transfer_error) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.0 + .put_message( + STORE_REGION_ID, + PREPARE_BOOTSTRAP_REGION_KEY.to_vec(), + region, + ) + .map_err(transfer_error) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.0 + 
.delete(STORE_REGION_ID, PREPARE_BOOTSTRAP_REGION_KEY.to_vec()); + Ok(()) + } + + fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + self.0 + .put_message(raft_group_id, REGION_STATE_KEY.to_vec(), state) + .map_err(transfer_error) + } + + fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + self.0 + .put_message(raft_group_id, APPLY_STATE_KEY.to_vec(), state) + .map_err(transfer_error) + } } impl RaftEngineReadOnly for RaftLogEngine { @@ -324,6 +368,34 @@ impl RaftEngineReadOnly for RaftLogEngine { } Ok(()) } + + fn is_empty(&self) -> Result { + self.get_store_ident().map(|i| i.is_none()) + } + + fn get_store_ident(&self) -> Result> { + self.0 + .get_message(STORE_REGION_ID, STORE_IDENT_KEY) + .map_err(transfer_error) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.0 + .get_message(STORE_REGION_ID, PREPARE_BOOTSTRAP_REGION_KEY) + .map_err(transfer_error) + } + + fn get_region_state(&self, raft_group_id: u64) -> Result> { + self.0 + .get_message(raft_group_id, REGION_STATE_KEY) + .map_err(transfer_error) + } + + fn get_apply_state(&self, raft_group_id: u64) -> Result> { + self.0 + .get_message(raft_group_id, APPLY_STATE_KEY) + .map_err(transfer_error) + } } impl RaftEngineDebug for RaftLogEngine { @@ -389,6 +461,16 @@ impl RaftEngine for RaftLogEngine { self.0.write(&mut batch.0, false).map_err(transfer_error) } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { + let mut batch = Self::LogBatch::default(); + batch + .0 + .put_message(STORE_REGION_ID, STORE_IDENT_KEY.to_vec(), ident) + .map_err(transfer_error)?; + self.0.write(&mut batch.0, true).map_err(transfer_error)?; + Ok(()) + } + fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { let mut batch = Self::LogBatch::default(); batch diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 56d08c6a6b6..100a2be409d 100644 --- 
a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -8,16 +8,20 @@ default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] failpoints = ["raftstore/failpoints"] testexport = ["raftstore/testexport"] test-engine-kv-rocksdb = [ - "raftstore/test-engine-kv-rocksdb" + "raftstore/test-engine-kv-rocksdb", + "engine_test/test-engine-kv-rocksdb", ] test-engine-raft-raft-engine = [ - "raftstore/test-engine-raft-raft-engine" + "raftstore/test-engine-raft-raft-engine", + "engine_test/test-engine-raft-raft-engine", ] test-engines-rocksdb = [ "raftstore/test-engines-rocksdb", + "engine_test/test-engines-rocksdb", ] test-engines-panic = [ "raftstore/test-engines-panic", + "engine_test/test-engines-panic", ] cloud-aws = ["raftstore/cloud-aws"] @@ -29,6 +33,7 @@ collections = { path = "../collections" } crossbeam = "0.8" engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } +fail = "0.5" kvproto = { git = "https://github.com/pingcap/kvproto.git" } pd_client = { path = "../pd_client" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } @@ -37,3 +42,15 @@ raftstore = { path = "../raftstore" } slog = "2.3" smallvec = "1.4" tikv_util = { path = "../tikv_util", default-features = false } + +[dev-dependencies] +engine_test = { path = "../engine_test", default-features = false } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tempfile = "3.0" +test_pd = { path = "../test_pd" } +test_util = { path = "../test_util" } + +[[test]] +name = "raftstore-v2-failpoints" +path = "tests/failpoints/mod.rs" +required-features = ["failpoints"] diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs new file mode 100644 index 00000000000..55e1f6814c5 --- /dev/null +++ b/components/raftstore-v2/src/bootstrap.rs @@ -0,0 +1,241 @@ +// Copyright 2022 TiKV Project 
Authors. Licensed under Apache-2.0. + +use std::{thread, time::Duration}; + +use engine_traits::{RaftEngine, RaftLogBatch}; +use error_code::ErrorCodeExt; +use fail::fail_point; +use kvproto::{ + metapb::{Region, Store}, + raft_serverpb::{RaftLocalState, RegionLocalState, StoreIdent}, +}; +use pd_client::PdClient; +use raft::INVALID_ID; +use raftstore::store::initial_region; +use slog::{debug, error, info, warn, Logger}; +use tikv_util::{box_err, box_try}; + +use crate::{raft::write_initial_states, Result}; + +const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; +const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); + +/// A struct for bootstrapping the store. +/// +/// A typical bootstrap process should follow following order: +/// 1. bootstrap the store to get a store ID. +/// 2. bootstrap the first region using the last store ID. +pub struct Bootstrap<'a, ER: RaftEngine> { + engine: &'a ER, + cluster_id: u64, + // It's not performance critical. + pd_client: &'a dyn PdClient, + logger: Logger, +} + +// Although all methods won't change internal state, but they still receive `&mut self` as it's +// not thread safe to bootstrap concurrently. +impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { + pub fn new( + engine: &'a ER, + cluster_id: u64, + pd_client: &'a impl PdClient, + logger: Logger, + ) -> Self { + Self { + engine, + cluster_id, + pd_client, + logger, + } + } + + /// check store, return store id for the engine. + /// If the store is not bootstrapped, use None. + fn check_store(&mut self) -> Result> { + let ident = match self.engine.get_store_ident()? 
{ + Some(ident) => ident, + None => return Ok(None), + }; + if ident.get_cluster_id() != self.cluster_id { + return Err(box_err!( + "cluster ID mismatch, local {} != remote {}, \ + you are trying to connect to another cluster, please reconnect to the correct PD", + ident.get_cluster_id(), + self.cluster_id + )); + } + if ident.get_store_id() == INVALID_ID { + return Err(box_err!("invalid store ident {:?}", ident)); + } + Ok(Some(ident.get_store_id())) + } + + fn inner_bootstrap_store(&mut self) -> Result { + let id = self.pd_client.alloc_id()?; + debug!(self.logger, "alloc store id"; "store_id" => id); + let mut ident = StoreIdent::default(); + if !self.engine.is_empty()? { + return Err(box_err!("store is not empty and has already had data.")); + } + ident.set_cluster_id(self.cluster_id); + ident.set_store_id(id); + self.engine.put_store_ident(&ident)?; + self.engine.sync()?; + fail_point!("node_after_bootstrap_store", |_| Err(box_err!( + "injected error: node_after_bootstrap_store" + ))); + Ok(id) + } + + /// Bootstrap the store and return the store ID. + /// + /// If store is bootstrapped already, return the store ID directly. + pub fn bootstrap_store(&mut self) -> Result { + let store_id = match self.check_store()? 
{ + Some(id) => id, + None => self.inner_bootstrap_store()?, + }; + + Ok(store_id) + } + + fn prepare_bootstrap_first_region(&mut self, store_id: u64) -> Result { + let region_id = self.pd_client.alloc_id()?; + debug!( + self.logger, + "alloc first region id"; + "region_id" => region_id, + "cluster_id" => self.cluster_id, + "store_id" => store_id + ); + let peer_id = self.pd_client.alloc_id()?; + debug!( + self.logger, + "alloc first peer id for first region"; + "peer_id" => peer_id, + "region_id" => region_id, + ); + + let region = initial_region(store_id, region_id, peer_id); + + let mut wb = self.engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion)?; + write_initial_states(&mut wb, region.clone())?; + box_try!(self.engine.consume(&mut wb, true)); + + Ok(region) + } + + fn check_first_region_bootstrapped(&mut self) -> Result { + for _ in 0..MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { + match self.pd_client.is_cluster_bootstrapped() { + Ok(b) => return Ok(b), + Err(e) => { + warn!(self.logger, "check cluster bootstrapped failed"; "err" => ?e); + } + } + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); + } + Err(box_err!("check cluster bootstrapped failed")) + } + + fn check_or_prepare_bootstrap_first_region(&mut self, store_id: u64) -> Result> { + if let Some(first_region) = self.engine.get_prepare_bootstrap_region()? { + // Bootstrap is aborted last time, resume. It may succeed or fail last time, no matter + // what, at least we need a way to clean up. + Ok(Some(first_region)) + } else if self.check_first_region_bootstrapped()? { + // If other node has bootstrap the cluster, skip to avoid useless ID allocating and + // disk writes. + Ok(None) + } else { + // We are probably the first one triggering bootstrap. 
+ self.prepare_bootstrap_first_region(store_id).map(Some) + } + } + + fn clear_prepare_bootstrap(&mut self, first_region_id: Option) -> Result<()> { + let mut wb = self.engine.log_batch(10); + wb.remove_prepare_bootstrap_region()?; + if let Some(id) = first_region_id { + box_try!( + self.engine + .clean(id, 0, &RaftLocalState::default(), &mut wb) + ); + } + box_try!(self.engine.consume(&mut wb, true)); + Ok(()) + } + + fn inner_bootstrap_first_region( + &mut self, + store: &Store, + first_region: &Region, + ) -> Result { + let region_id = first_region.get_id(); + let mut retry = 0; + while retry < MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { + match self + .pd_client + .bootstrap_cluster(store.clone(), first_region.clone()) + { + Ok(_) => { + info!(self.logger, "bootstrap cluster ok"; "cluster_id" => self.cluster_id); + fail_point!("node_after_bootstrap_cluster", |_| Err(box_err!( + "injected error: node_after_bootstrap_cluster" + ))); + self.clear_prepare_bootstrap(None)?; + return Ok(true); + } + Err(pd_client::Error::ClusterBootstrapped(_)) => { + match self.pd_client.get_region(b"") { + Ok(region) => { + if region == *first_region { + self.clear_prepare_bootstrap(None)?; + return Ok(true); + } else { + info!(self.logger, "cluster is already bootstrapped"; "cluster_id" => self.cluster_id); + self.clear_prepare_bootstrap(Some(region_id))?; + return Ok(false); + } + } + Err(e) => { + warn!(self.logger, "get the first region failed"; "err" => ?e); + } + } + } + Err(e) => { + error!(self.logger, "bootstrap cluster"; "cluster_id" => self.cluster_id, "err" => ?e, "err_code" => %e.error_code()) + } + } + retry += 1; + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); + } + Err(box_err!("bootstrapped cluster failed")) + } + + /// Bootstrap the first region. + /// + /// If the cluster is already bootstrapped, `None` is returned. 
+ pub fn bootstrap_first_region( + &mut self, + store: &Store, + store_id: u64, + ) -> Result> { + let first_region = match self.check_or_prepare_bootstrap_first_region(store_id)? { + Some(r) => r, + None => return Ok(None), + }; + info!(self.logger, "trying to bootstrap first region"; "store_id" => store_id, "region" => ?first_region); + // cluster is not bootstrapped, and we choose first store to bootstrap + fail_point!("node_after_prepare_bootstrap_cluster", |_| Err(box_err!( + "injected error: node_after_prepare_bootstrap_cluster" + ))); + if self.inner_bootstrap_first_region(store, &first_region)? { + Ok(Some(first_region)) + } else { + Ok(None) + } + } +} diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 275313cbfb3..60c84984793 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -1,5 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! FSM is short for finite state machine. There are three types of FSMs, +//! - StoreFsm, used for handling control messages and global initialization. +//! - PeerFsm, used for handling messages specific for one raft peer. +//! - ApplyFsm, used for handling apply task for one raft peer. + mod apply; mod peer; mod store; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index bb3db8c75d3..b568454e2c9 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1 +1,3 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub struct StoreFsm {} diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 98c72ca7632..fac4511cfd4 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -4,16 +4,18 @@ //! //! The thread module of raftstore is batch-system, more check components/batch-system. //! 
All state machines are defined in [`fsm`] module. Everything that wrapping raft is -//! implemented in [`raft`] module. And the commands are implemented in [`operation`] module. -//! All state machines are expected to communicate with messages. They are defined in -//! [`router`] module. +//! implemented in [`raft`] module. And the commands, including split/merge/confchange/read/write, +//! are implemented in [`operation`] module. All state machines are expected to communicate with +//! messages. They are defined in [`router`] module. #![allow(unused)] +mod bootstrap; mod fsm; mod operation; mod raft; mod router; +pub use bootstrap::Bootstrap; pub use raftstore::{Error, Result}; pub use router::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs index 7fd128d6788..045e9ff89b3 100644 --- a/components/raftstore-v2/src/raft/mod.rs +++ b/components/raftstore-v2/src/raft/mod.rs @@ -4,4 +4,4 @@ mod peer; mod storage; pub use peer::Peer; -pub use storage::Storage; +pub use storage::{write_initial_states, Storage}; diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index f999c6890d8..f6dcad9578c 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -1,12 +1,45 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::RaftEngine; +use engine_traits::{RaftEngine, RaftLogBatch}; +use kvproto::{ + metapb::Region, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, +}; use raft::{ eraftpb::{Entry, Snapshot}, GetEntriesContext, RaftState, }; +use raftstore::store::{RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; use slog::Logger; +use crate::Result; + +pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { + let region_id = region.get_id(); + + let mut state = RegionLocalState::default(); + state.set_region(region); + wb.put_region_state(region_id, &state)?; + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + wb.put_apply_state(region_id, &apply_state)?; + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + wb.put_raft_state(region_id, &raft_state)?; + + Ok(()) +} + /// A storage for raft. 
pub struct Storage { engine: ER, @@ -54,3 +87,50 @@ impl raft::Storage for Storage { unimplemented!() } } + +#[cfg(test)] +mod tests { + use engine_traits::{RaftEngine, RaftEngineReadOnly, RaftLogBatch}; + use kvproto::{ + metapb::{Peer, Region}, + raft_serverpb::PeerState, + }; + use raftstore::store::{RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; + use tempfile::TempDir; + + #[test] + fn test_write_initial_states() { + let mut region = Region::default(); + region.set_id(4); + let mut p = Peer::default(); + p.set_id(5); + p.set_store_id(6); + region.mut_peers().push(p); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(4); + + let path = TempDir::new().unwrap(); + let engine = engine_test::new_temp_engine(&path); + let raft_engine = &engine.raft; + let mut wb = raft_engine.log_batch(10); + super::write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + + let local_state = raft_engine.get_region_state(4).unwrap().unwrap(); + assert_eq!(local_state.get_state(), PeerState::Normal); + assert_eq!(*local_state.get_region(), region); + + let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); + assert_eq!(raft_state.get_last_index(), RAFT_INIT_LOG_INDEX); + let hs = raft_state.get_hard_state(); + assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); + + let apply_state = raft_engine.get_apply_state(4).unwrap().unwrap(); + assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); + let ts = apply_state.get_truncated_state(); + assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); + assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); + } +} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 1ab85608034..75011163e83 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -1,4 +1,4 @@ -// Copyright 
2016 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] use std::{fmt, marker::PhantomData}; diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs new file mode 100644 index 00000000000..88dfd0a81aa --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(test)] +#![feature(assert_matches)] +#![feature(custom_test_frameworks)] +#![test_runner(test_util::run_failpoint_tests)] + +mod test_bootstrap; diff --git a/components/raftstore-v2/tests/failpoints/test_bootstrap.rs b/components/raftstore-v2/tests/failpoints/test_bootstrap.rs new file mode 100644 index 00000000000..f56078a59f5 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_bootstrap.rs @@ -0,0 +1,61 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::assert_matches::assert_matches; + +use engine_traits::RaftEngineReadOnly; +use kvproto::metapb::Store; +use raftstore_v2::Bootstrap; +use slog::o; +use tempfile::TempDir; + +#[test] +fn test_bootstrap_half_way_failure() { + let server = test_pd::Server::new(1); + let eps = server.bind_addrs(); + let pd_client = test_pd::util::new_client(eps, None); + let path = TempDir::new().unwrap(); + let engines = engine_test::new_temp_engine(&path); + let bootstrap = || { + let logger = slog_global::borrow_global().new(o!()); + let mut bootstrap = Bootstrap::new(&engines.raft, 0, &pd_client, logger); + match bootstrap.bootstrap_store() { + Ok(store_id) => { + let mut store = Store::default(); + store.set_id(store_id); + bootstrap.bootstrap_first_region(&store, store_id) + } + Err(e) => Err(e), + } + }; + + // Try to start this node, return after persisted some keys. 
+ fail::cfg("node_after_bootstrap_store", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_bootstrap_store"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); + + let ident = engines.raft.get_store_ident().unwrap().unwrap(); + assert_ne!(ident.get_store_id(), 0); + + // Check whether it can bootstrap cluster successfully. + fail::remove("node_after_bootstrap_store"); + fail::cfg("node_after_prepare_bootstrap_cluster", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_prepare_bootstrap_cluster"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(Some(_))); + + fail::remove("node_after_prepare_bootstrap_cluster"); + fail::cfg("node_after_bootstrap_cluster", "return").unwrap(); + let s = format!("{}", bootstrap().unwrap_err()); + assert!(s.contains("node_after_bootstrap_cluster"), "{}", s); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(Some(_))); + + // Although aborted by error, rebootstrap should continue. + bootstrap().unwrap().unwrap(); + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); + + // Second bootstrap should be noop. 
+ assert_eq!(bootstrap().unwrap(), None); + + assert_matches!(engines.raft.get_prepare_bootstrap_region(), Ok(None)); +} diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 95ffde14b7c..572eb9534f9 100644 --- a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -96,7 +96,7 @@ impl PdMocker for Service { if self.is_bootstrapped.load(Ordering::SeqCst) { let mut err = Error::default(); - err.set_type(ErrorType::Unknown); + err.set_type(ErrorType::AlreadyBootstrapped); err.set_message("cluster is already bootstrapped".to_owned()); header.set_error(err); resp.set_header(header); diff --git a/src/server/node.rs b/src/server/node.rs index 559055cbbb9..dfed9459b1c 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -43,7 +43,7 @@ use crate::{ }; const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; -const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_SECONDS: u64 = 3; +const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); /// Creates a new storage engine which is backed by the Raft consensus /// protocol. 
@@ -436,9 +436,7 @@ where Err(e) => error!(?e; "bootstrap cluster"; "cluster_id" => self.cluster_id,), } retry += 1; - thread::sleep(Duration::from_secs( - CHECK_CLUSTER_BOOTSTRAPPED_RETRY_SECONDS, - )); + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); } Err(box_err!("bootstrapped cluster failed")) } @@ -451,9 +449,7 @@ where warn!("check cluster bootstrapped failed"; "err" => ?e); } } - thread::sleep(Duration::from_secs( - CHECK_CLUSTER_BOOTSTRAPPED_RETRY_SECONDS, - )); + thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); } Err(box_err!("check cluster bootstrapped failed")) } From 89694308cf98822a232c166cc54dc7b8a02ed3dc Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 24 Jun 2022 13:44:38 +0800 Subject: [PATCH 043/676] coprocessor: fix panic on `analyze` when `max_sample_size == 0` (#12696) close tikv/tikv#11192, ref tikv/tikv#11425 Signed-off-by: Lucasliang Co-authored-by: Ti Chi Robot --- src/coprocessor/statistics/analyze.rs | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 7b826487cc1..bb0348be98f 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -748,6 +748,10 @@ impl RowSampleCollector for ReservoirRowSampleCollector { } fn sampling(&mut self, data: Vec>) { + // We should tolerate the abnormal case => `self.max_sample_size == 0`. + if self.max_sample_size == 0 { + return; + } let mut need_push = false; let cur_rng = self.base.rng.gen_range(0, i64::MAX); if self.samples.len() < self.max_sample_size { @@ -1330,4 +1334,33 @@ mod tests { ); } } + + #[test] + fn test_abnormal_sampling() { + let sample_num = 0; // abnormal. 
+ let row_num = 100; + let mut nums: Vec> = Vec::with_capacity(row_num); + for i in 0..row_num { + nums.push( + datum::encode_value(&mut EvalContext::default(), &[Datum::I64(i as i64)]).unwrap(), + ); + } + { + // Test for ReservoirRowSampleCollector + let mut collector = ReservoirRowSampleCollector::new(sample_num, 1000, 1); + for row in &nums { + collector.sampling([row.clone()].to_vec()); + } + assert_eq!(collector.samples.len(), 0); + } + { + // Test for BernoulliRowSampleCollector + let mut collector = + BernoulliRowSampleCollector::new(sample_num as f64 / row_num as f64, 1000, 1); + for row in &nums { + collector.sampling([row.clone()].to_vec()); + } + assert_eq!(collector.samples.len(), 0); + } + } } From 54b5cca4810c5687d718bebd6181bbd2948b4264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BE=99=E6=96=B9=E6=B7=9E?= Date: Fri, 24 Jun 2022 14:20:38 +0800 Subject: [PATCH 044/676] add reset_to_version command back (#12823) close tikv/tikv#12824 Signed-off-by: longfangsong Co-authored-by: Ti Chi Robot --- cmd/tikv-ctl/src/cmd.rs | 6 ++++++ cmd/tikv-ctl/src/main.rs | 1 + src/server/reset_to_version.rs | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index a1934c1acb8..4c49ccfa5ef 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -539,6 +539,12 @@ pub enum Cmd { /// PD endpoints pd: String, }, + /// Reset data in a TiKV to a certain version + ResetToVersion { + #[structopt(short = "v")] + /// The version to reset TiKV to + version: u64, + }, #[structopt(external_subcommand)] External(Vec), } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 3ad066df491..e2ed740e779 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -479,6 +479,7 @@ fn main() { Cmd::Cluster {} => { debug_executor.dump_cluster_info(); } + Cmd::ResetToVersion { version } => debug_executor.reset_to_version(version), _ => { unreachable!() } diff --git 
a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index 7b99f48371d..263a8d2565a 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -233,7 +233,7 @@ impl ResetToVersionManager { *worker.state.lock() .expect("failed to lock `ResetToVersionWorker::state` in `ResetToVersionWorker::process_next_batch_lock`") = ResetToVersionState::Done; - + info!("Reset to version done!"); tikv_alloc::remove_thread_memory_accessor(); }) .expect("failed to spawn reset_to_version thread")); From a9c3e56552c803642f640e9b3fa8725aa3072400 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 24 Jun 2022 17:20:38 +0800 Subject: [PATCH 045/676] raftstore: record write durations into tracker (#12783) ref tikv/tikv#12362 This commit replaces the request_times in the raftstore callback with a tracker token. Then, the waterfall metrics of a raft command will be recorded into the tracker. Signed-off-by: Yilin Chen --- Cargo.lock | 1 + components/raftstore/Cargo.toml | 1 + components/raftstore/src/lib.rs | 1 + .../raftstore/src/store/async_io/write.rs | 45 ++++---- components/raftstore/src/store/fsm/apply.rs | 41 ++++--- components/raftstore/src/store/fsm/peer.rs | 25 ++-- .../raftstore/src/store/local_metrics.rs | 49 ++++++++ components/raftstore/src/store/msg.rs | 22 +++- components/raftstore/src/store/peer.rs | 107 +++++++++--------- components/tracker/src/lib.rs | 16 +++ 10 files changed, 197 insertions(+), 111 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 489ef39eaec..7c9902b7534 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4157,6 +4157,7 @@ dependencies = [ "tikv_util", "time", "tokio", + "tracker", "txn_types", "uuid", "yatp", diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 01519444b92..9d8c39d5746 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -85,6 +85,7 @@ tikv_alloc = { path = "../tikv_alloc" } tikv_util = { path = "../tikv_util", default-features = false 
} time = "0.1" tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } uuid = { version = "0.8.1", features = ["serde", "v4"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index b212001657a..ed70dacb37b 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,6 +6,7 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] +#![feature(let_chains)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index f81160d689d..99c4f56b7e4 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -38,7 +38,7 @@ use crate::{ store::{ config::Config, fsm::RaftRouter, - local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics}, + local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, transport::Transport, util::LatencyInspector, @@ -97,7 +97,7 @@ where pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, pub messages: Vec, - pub request_times: Vec, + pub trackers: Vec, } impl WriteTask @@ -117,7 +117,7 @@ where cut_logs: None, raft_state: None, messages: vec![], - request_times: vec![], + trackers: vec![], } } @@ -298,12 +298,12 @@ where } self.state_size = 0; if metrics.waterfall_metrics { - let now = Instant::now(); + let now = std::time::Instant::now(); for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_before_write - .observe(duration_to_sec(now.saturating_duration_since(*t))); + for tracker in &task.trackers { + tracker.observe(now, &metrics.wf_before_write, |t| { + &mut t.metrics.wf_before_write_nanos + }); } } } @@ -311,12 +311,12 @@ where fn after_write_to_kv_db(&mut self, 
metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { - let now = Instant::now(); + let now = std::time::Instant::now(); for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_kvdb_end - .observe(duration_to_sec(now.saturating_duration_since(*t))); + for tracker in &task.trackers { + tracker.observe(now, &metrics.wf_kvdb_end, |t| { + &mut t.metrics.wf_kvdb_end_nanos + }); } } } @@ -324,12 +324,12 @@ where fn after_write_to_raft_db(&mut self, metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { - let now = Instant::now(); + let now = std::time::Instant::now(); for task in &self.tasks { - for t in &task.request_times { - metrics - .wf_write_end - .observe(duration_to_sec(now.saturating_duration_since(*t))) + for tracker in &task.trackers { + tracker.observe(now, &metrics.wf_write_end, |t| { + &mut t.metrics.wf_write_end_nanos + }); } } } @@ -535,7 +535,14 @@ where self.store_id, self.tag, e ); }); - self.perf_context.report_metrics(&[]); // TODO: pass in request trackers + let trackers: Vec<_> = self + .batch + .tasks + .iter() + .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) + .collect(); + // TODO: Add a different perf context for raft engine. 
+ self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_RAFTDB_DURATION_HISTOGRAM.observe(write_raft_time); } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e28c8cf2424..03034b76245 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -75,7 +75,7 @@ use crate::{ store::{ cmd_resp, fsm::RaftPollerBuilder, - local_metrics::RaftMetrics, + local_metrics::{RaftMetrics, TimeTracker}, memory::*, metrics::*, msg::{Callback, PeerMsg, ReadResponse, SignificantMsg}, @@ -526,7 +526,15 @@ where self.kv_wb().write_opt(&write_opts).unwrap_or_else(|e| { panic!("failed to write to engine: {:?}", e); }); - self.perf_context.report_metrics(&[]); // TODO: pass in request trackers + let trackers: Vec<_> = self + .applied_batch + .cb_batch + .iter() + .flat_map(|(cb, _)| cb.get_trackers()) + .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) + .flatten() + .collect(); + self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; let data_size = self.kv_wb().data_size(); if data_size > APPLY_WB_SHRINK_SIZE { @@ -557,13 +565,10 @@ where self.host .on_flush_applied_cmd_batch(batch_max_level, cmd_batch, &self.engine); // Invoke callbacks - let now = Instant::now(); + let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) 
{ - if let Some(times) = cb.get_request_times() { - for t in times { - self.apply_time - .observe(duration_to_sec(now.saturating_duration_since(*t))); - } + for tracker in cb.get_trackers().iter().flat_map(|v| *v) { + tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); } @@ -2912,17 +2917,17 @@ impl Apply { } pub fn on_schedule(&mut self, metrics: &RaftMetrics) { - let mut now = None; + let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Callback::Write { request_times, .. } = &mut cb.cb { - if now.is_none() { - now = Some(Instant::now()); - } - for t in request_times { - metrics - .store_time - .observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); - *t = now.unwrap(); + if let Callback::Write { trackers, .. } = &mut cb.cb { + for tracker in trackers { + tracker.observe(now, &metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + if let TimeTracker::Instant(t) = tracker { + *t = now; + } } } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6abfc24c486..e08c440d6a1 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -73,7 +73,7 @@ use crate::{ ExecResult, }, hibernate_state::{GroupState, HibernateState}, - local_metrics::RaftMetrics, + local_metrics::{RaftMetrics, TimeTracker}, memory::*, metrics::*, msg::{Callback, ExtCallback, InspectedRaftMessage}, @@ -523,11 +523,11 @@ where })) }; - let times: SmallVec<[TiInstant; 4]> = cbs + let tokens: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() .filter_map(|cb| { - if let Callback::Write { request_times, .. } = cb { - Some(request_times[0]) + if let Callback::Write { trackers, .. } = cb { + Some(trackers[0]) } else { None } @@ -546,8 +546,8 @@ where committed_cb, ); - if let Callback::Write { request_times, .. 
} = &mut cb { - *request_times = times; + if let Callback::Write { trackers, .. } = &mut cb { + *trackers = tokens; } return Some((req, cb)); @@ -4774,14 +4774,11 @@ where } if self.ctx.raft_metrics.waterfall_metrics { - if let Some(request_times) = cb.get_request_times() { - let now = TiInstant::now(); - for t in request_times { - self.ctx - .raft_metrics - .wf_batch_wait - .observe(duration_to_sec(now.saturating_duration_since(*t))); - } + let now = Instant::now(); + for tracker in cb.get_trackers().iter().flat_map(|v| *v) { + tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); } } diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index aa23f22bc2c..304259c4571 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -6,6 +6,7 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; use super::metrics::*; @@ -499,3 +500,51 @@ impl StoreWriteMetrics { } } } + +/// Tracker for the durations of a raftstore request. +/// If a global tracker is not available, it will fallback to an Instant. 
+#[derive(Debug, Clone, Copy)] +pub enum TimeTracker { + Tracker(TrackerToken), + Instant(std::time::Instant), +} + +impl TimeTracker { + pub fn as_tracker_token(&self) -> Option { + match self { + TimeTracker::Tracker(tt) => Some(*tt), + TimeTracker::Instant(_) => None, + } + } + + pub fn observe( + &self, + now: std::time::Instant, + local_metric: &LocalHistogram, + tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, + ) { + match self { + TimeTracker::Tracker(t) => { + if let Some(dur) = GLOBAL_TRACKERS + .with_tracker(*t, |tracker| { + tracker.metrics.write_instant.map(|write_instant| { + let dur = now.saturating_duration_since(write_instant); + let metric = tracker_metric(tracker); + if *metric == 0 { + *metric = dur.as_nanos() as u64; + } + dur + }) + }) + .flatten() + { + local_metric.observe(dur.as_secs_f64()); + } + } + TimeTracker::Instant(t) => { + let dur = now.saturating_duration_since(*t); + local_metric.observe(dur.as_secs_f64()); + } + } + } +} diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 46903771344..46900878178 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -22,8 +22,9 @@ use pd_client::BucketMeta; use raft::{GetEntriesContext, SnapshotStatus}; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; +use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; -use super::{AbstractPeer, RegionSnapshot}; +use super::{local_metrics::TimeTracker, AbstractPeer, RegionSnapshot}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, @@ -98,7 +99,7 @@ pub enum Callback { /// `committed_cb` is called after a request is committed and before it's being applied, and /// it's guaranteed that the request will be successfully applied soon. 
committed_cb: Option, - request_times: SmallVec<[Instant; 4]>, + trackers: SmallVec<[TimeTracker; 4]>, }, #[cfg(any(test, feature = "testexport"))] /// Test purpose callback @@ -120,17 +121,28 @@ where proposed_cb: Option, committed_cb: Option, ) -> Self { + let tracker_token = get_tls_tracker_token(); + let now = std::time::Instant::now(); + let tracker = if tracker_token == INVALID_TRACKER_TOKEN { + TimeTracker::Instant(now) + } else { + GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { + tracker.metrics.write_instant = Some(now); + }); + TimeTracker::Tracker(tracker_token) + }; + Callback::Write { cb, proposed_cb, committed_cb, - request_times: smallvec![Instant::now()], + trackers: smallvec![tracker], } } - pub fn get_request_times(&self) -> Option<&SmallVec<[Instant; 4]>> { + pub fn get_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { match self { - Callback::Write { request_times, .. } => Some(request_times), + Callback::Write { trackers, .. } => Some(trackers), _ => None, } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index eb1fc93e1ee..2853fcd4169 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -65,12 +65,13 @@ use tikv_util::{ Either, }; use time::Timespec; +use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, RaftReadyMetrics}, + local_metrics::{RaftMetrics, RaftReadyMetrics, TimeTracker}, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -137,16 +138,16 @@ impl ProposalQueue { } } - /// Find the request times of given index. - /// Caller should check if term is matched before using request times. - fn find_request_times(&self, index: u64) -> Option<(u64, &SmallVec<[TiInstant; 4]>)> { + /// Find the trackers of given index. 
+ /// Caller should check if term is matched before using trackers. + fn find_trackers(&self, index: u64) -> Option<(u64, &SmallVec<[TimeTracker; 4]>)> { self.queue .binary_search_by_key(&index, |p: &Proposal<_>| p.index) .ok() .and_then(|i| { self.queue[i] .cb - .get_request_times() + .get_trackers() .map(|ts| (self.queue[i].term, ts)) }) } @@ -725,9 +726,6 @@ where #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, - /// Record the propose instants to calculate the wait duration before - /// the proposal is sent through the Raft client. - pending_propose_instants: VecDeque<(u64, Instant)>, /// If it fails to send messages to leader. pub leader_unreachable: bool, @@ -930,7 +928,6 @@ where raft_max_inflight_msgs: cfg.raft_max_inflight_msgs, proposals: ProposalQueue::new(tag.clone()), pending_reads: Default::default(), - pending_propose_instants: Default::default(), peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), peers_start_pending_time: vec![], @@ -1578,7 +1575,7 @@ where ctx: &mut PollContext, msgs: Vec, ) { - let now = Instant::now(); + let mut now = None; for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgSnapshot { @@ -1594,7 +1591,7 @@ where // network partition from the new leader. // For lease safety during leader transfer, transit `leader_lease` // to suspect. 
- self.leader_lease.suspect(monotonic_raw_now()); + self.leader_lease.suspect(*now.insert(monotonic_raw_now())); } let to_peer_id = msg.get_to_peer().get_id(); @@ -1610,22 +1607,31 @@ where "disk_usage" => ?msg.get_disk_usage(), ); - for index in msg + for (term, index) in msg .get_message() .get_entries() .iter() - .map(|e| e.get_index()) + .map(|e| (e.get_term(), e.get_index())) { - while let Some((propose_idx, instant)) = self.pending_propose_instants.front() { - if index == *propose_idx { + if let Ok(idx) = self + .proposals + .queue + .binary_search_by_key(&index, |p: &Proposal<_>| p.index) + { + let proposal = &self.proposals.queue[idx]; + if term == proposal.term + && let Some(propose_time) = proposal.propose_time + && let Ok(dur) = ((*now.get_or_insert(monotonic_raw_now())) - propose_time).to_std() { ctx.raft_metrics .proposal_send_wait - .observe(now.saturating_duration_since(*instant).as_secs_f64()); - } - if index >= *propose_idx { - self.pending_propose_instants.pop_front(); - } else { - break; + .observe(dur.as_secs_f64()); + for t in proposal.cb.get_trackers().iter().flat_map(|v| v.iter().flat_map(|t| t.as_tracker_token())) { + GLOBAL_TRACKERS.with_tracker(t, |trakcer| { + if trakcer.metrics.propose_send_wait_nanos == 0{ + trakcer.metrics.propose_send_wait_nanos = dur.as_nanos() as u64; + } + }); + } } } } @@ -1753,22 +1759,19 @@ where if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } - let mut now = None; + let now = Instant::now(); for index in pre_persist_index + 1..=self.raft_group.raft.raft_log.persisted { - if let Some((term, times)) = self.proposals.find_request_times(index) { + if let Some((term, trackers)) = self.proposals.find_trackers(index) { if self .get_store() .term(index) .map(|t| t == term) .unwrap_or(false) { - if now.is_none() { - now = Some(TiInstant::now()); - } - for t in times { - metrics - .wf_persist_log - .observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); + for tracker in trackers { + 
tracker.observe(now, &metrics.wf_persist_log, |t| { + &mut t.metrics.wf_persist_log_nanos + }); } } } @@ -1779,25 +1782,26 @@ where if !metrics.waterfall_metrics || self.proposals.is_empty() { return; } - let mut now = None; + let now = Instant::now(); for index in pre_commit_index + 1..=self.raft_group.raft.raft_log.committed { - if let Some((term, times)) = self.proposals.find_request_times(index) { + if let Some((term, trackers)) = self.proposals.find_trackers(index) { if self .get_store() .term(index) .map(|t| t == term) .unwrap_or(false) { - if now.is_none() { - now = Some(TiInstant::now()); - } - let hist = if index <= self.raft_group.raft.raft_log.persisted { + let commit_persisted = index <= self.raft_group.raft.raft_log.persisted; + let hist = if commit_persisted { &metrics.wf_commit_log } else { &metrics.wf_commit_not_persist_log }; - for t in times { - hist.observe(duration_to_sec(now.unwrap().saturating_duration_since(*t))); + for tracker in trackers { + tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); } } } @@ -2083,7 +2087,6 @@ where self.mut_store().cancel_generating_snap(None); self.clear_disk_full_peers(ctx); self.clear_in_memory_pessimistic_locks(); - self.pending_propose_instants.clear(); } _ => {} } @@ -2520,20 +2523,17 @@ where let state_role = ready.ss().map(|ss| ss.raft_state); let has_new_entries = !ready.entries().is_empty(); - let mut request_times = vec![]; + let mut trackers = vec![]; if ctx.raft_metrics.waterfall_metrics { - let mut now = None; + let now = Instant::now(); for entry in ready.entries() { - if let Some((term, times)) = self.proposals.find_request_times(entry.get_index()) { + if let Some((term, times)) = self.proposals.find_trackers(entry.get_index()) { if entry.term == term { - request_times.extend_from_slice(times); - if now.is_none() { - now = Some(TiInstant::now()); - } - for t in times { - 
ctx.raft_metrics.wf_send_to_queue.observe(duration_to_sec( - now.unwrap().saturating_duration_since(*t), - )); + trackers.extend_from_slice(times); + for tracker in times { + tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { + &mut t.metrics.wf_send_to_queue_nanos + }); } } } @@ -2560,8 +2560,8 @@ where task.messages = self.build_raft_messages(ctx, persisted_msgs); } - if !request_times.is_empty() { - task.request_times = request_times; + if !trackers.is_empty() { + task.trackers = trackers; } if let Some(write_worker) = &mut ctx.sync_write_worker { @@ -4308,9 +4308,6 @@ where } } - self.pending_propose_instants - .push_back((propose_index, Instant::now())); - Ok(Either::Left(propose_index)) } diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 0e932658aba..25a5610d034 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -6,6 +6,8 @@ mod metrics; mod slab; mod tls; +use std::time::Instant; + use kvproto::kvrpcpb as pb; pub use self::{ @@ -92,4 +94,18 @@ pub struct RequestMetrics { pub block_read_nanos: u64, pub internal_key_skipped_count: u64, pub deleted_key_skipped_count: u64, + // temp instant used in raftstore metrics, first be the instant when creating the write callback, + // then reset when it is ready to apply + pub write_instant: Option, + pub wf_batch_wait_nanos: u64, + pub wf_send_to_queue_nanos: u64, + pub wf_persist_log_nanos: u64, + pub wf_before_write_nanos: u64, + pub wf_write_end_nanos: u64, + pub wf_kvdb_end_nanos: u64, + pub wf_commit_log_nanos: u64, + pub propose_send_wait_nanos: u64, + pub commit_not_persisted: bool, + pub store_time_nanos: u64, + pub apply_time_nanos: u64, } From 9fe0d5772399deffb4fcf199ccc9969792f1bcb5 Mon Sep 17 00:00:00 2001 From: Shenghui Wu <793703860@qq.com> Date: Mon, 27 Jun 2022 21:10:39 +0800 Subject: [PATCH 046/676] copr: update scanned range for each batch when paging is enable (#12886) ref tikv/tikv#12848 Update scanned range for each batch when 
paging is enable Signed-off-by: wshwsh12 <793703860@qq.com> Co-authored-by: Ti Chi Robot --- .../src/storage/ranges_iter.rs | 6 + .../tidb_query_common/src/storage/scanner.rs | 223 ++++++++++++++++-- .../src/util/scan_executor.rs | 4 +- 3 files changed, 211 insertions(+), 22 deletions(-) diff --git a/components/tidb_query_common/src/storage/ranges_iter.rs b/components/tidb_query_common/src/storage/ranges_iter.rs index 88d103a763f..061cd339129 100644 --- a/components/tidb_query_common/src/storage/ranges_iter.rs +++ b/components/tidb_query_common/src/storage/ranges_iter.rs @@ -64,6 +64,12 @@ impl RangesIterator { pub fn notify_drained(&mut self) { self.in_range = false; } + + /// Check drained. + #[inline] + pub fn is_drained(&mut self) -> bool { + self.iter.len() == 0 + } } #[cfg(test)] diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index 6e72ba13fca..1c1a1cea111 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -65,6 +65,15 @@ impl RangesScanner { // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. pub fn next(&mut self) -> Result, StorageError> { + self.next_opt(true) + } + + /// Fetches next row. + /// Note: `update_scanned_range` can control whether update the scanned range when `is_scanned_range_aware` is true. 
+ pub fn next_opt( + &mut self, + update_scanned_range: bool, + ) -> Result, StorageError> { loop { let range = self.ranges_iter.next(); let some_row = match range { @@ -93,7 +102,7 @@ impl RangesScanner { return Ok(None); // drained } }; - if self.is_scanned_range_aware { + if self.is_scanned_range_aware && update_scanned_range { self.update_scanned_range_from_scanned_row(&some_row); } if some_row.is_some() { @@ -159,31 +168,35 @@ impl RangesScanner { fn update_scanned_range_from_new_point(&mut self, point: &PointRange) { assert!(self.is_scanned_range_aware); - self.update_working_range_end_key(); - self.current_range.lower_inclusive.clear(); - self.current_range.upper_exclusive.clear(); - self.current_range - .lower_inclusive - .extend_from_slice(&point.0); - self.current_range - .upper_exclusive - .extend_from_slice(&point.0); - self.current_range.upper_exclusive.push(0); + // Only update current_range for the first and the last range. + if self.current_range.lower_inclusive.is_empty() || self.ranges_iter.is_drained() { + self.current_range.lower_inclusive.clear(); + self.current_range.upper_exclusive.clear(); + self.current_range + .lower_inclusive + .extend_from_slice(&point.0); + self.current_range + .upper_exclusive + .extend_from_slice(&point.0); + self.current_range.upper_exclusive.push(0); + } self.update_working_range_begin_key(); } fn update_scanned_range_from_new_range(&mut self, range: &IntervalRange) { assert!(self.is_scanned_range_aware); - self.update_working_range_end_key(); - self.current_range.lower_inclusive.clear(); - self.current_range.upper_exclusive.clear(); - self.current_range - .lower_inclusive - .extend_from_slice(&range.lower_inclusive); - self.current_range - .upper_exclusive - .extend_from_slice(&range.upper_exclusive); + // Only update current_range for the first and the last range. 
+ if self.current_range.lower_inclusive.is_empty() || self.ranges_iter.is_drained() { + self.current_range.lower_inclusive.clear(); + self.current_range.upper_exclusive.clear(); + self.current_range + .lower_inclusive + .extend_from_slice(&range.lower_inclusive); + self.current_range + .upper_exclusive + .extend_from_slice(&range.upper_exclusive); + } self.update_working_range_begin_key(); } @@ -666,4 +679,174 @@ mod tests { assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo"); } + + #[test] + fn test_scanned_range_forward2() { + let storage = create_storage(); + // Filled interval range + let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; + let mut scanner = RangesScanner::new(RangesScannerOptions { + storage: storage.clone(), + ranges, + scan_backward_in_range: false, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. + assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Drained. + assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_8"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"foo_8"); + + // Multiple ranges + // TODO: caller should not pass in unordered ranges otherwise scanned ranges would be + // unsound. 
+ let ranges = vec![ + IntervalRange::from(("foo", "foo_3")).into(), + IntervalRange::from(("foo_5", "foo_50")).into(), + IntervalRange::from(("bar", "bar_")).into(), + PointRange::from("bar_2").into(), + PointRange::from("bar_3").into(), + IntervalRange::from(("bar_4", "box")).into(), + ]; + let mut scanner = RangesScanner::new(RangesScannerOptions { + storage, + ranges, + scan_backward_in_range: false, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. Updated by scanned row. + assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Upper_exclusive is not updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); + + // Drain. 
+ assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo"); + assert_eq!(&scanner.working_range_end_key, b"box"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"box"); + } + + #[test] + fn test_scanned_range_backward2() { + let storage = create_storage(); + // Filled interval range + let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; + let mut scanner = RangesScanner::new(RangesScannerOptions { + storage: storage.clone(), + ranges, + scan_backward_in_range: true, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Only lower_inclusive is updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. + assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo_2"); + + // Upper_exclusive is not updated. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo_2"); + + // Drained. 
+ assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"foo_8"); + assert_eq!(&scanner.working_range_end_key, b"foo"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"foo_8"); + + // Multiple ranges + let ranges = vec![ + IntervalRange::from(("bar_4", "box")).into(), + PointRange::from("bar_3").into(), + PointRange::from("bar_2").into(), + IntervalRange::from(("bar", "bar_")).into(), + IntervalRange::from(("foo_5", "foo_50")).into(), + IntervalRange::from(("foo", "foo_3")).into(), + ]; + let mut scanner = RangesScanner::new(RangesScannerOptions { + storage, + ranges, + scan_backward_in_range: true, + is_key_only: false, + is_scanned_range_aware: true, + }); + + // Lower_inclusive is updated. Upper_exclusive is not update. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b""); + + // Upper_exclusive is updated. Updated by scanned row. + assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"bar"); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Upper_exclusive is not update. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Upper_exclusive is not update. + assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"bar"); + + // Drain. 
+ assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(&scanner.working_range_begin_key, b"box"); + assert_eq!(&scanner.working_range_end_key, b"foo"); + + let r = scanner.take_scanned_range(); + assert_eq!(&r.lower_inclusive, b"foo"); + assert_eq!(&r.upper_exclusive, b"box"); + } } diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 14cf9abb1b6..114bc77ee1a 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -102,8 +102,8 @@ impl ScanExecutor { ) -> Result { assert!(scan_rows > 0); - for _ in 0..scan_rows { - let some_row = self.scanner.next()?; + for i in 0..scan_rows { + let some_row = self.scanner.next_opt(i == scan_rows - 1)?; if let Some((key, value)) = some_row { // Retrieved one row from point range or non-point range. From 43ebcba1b7d89b7cd17cf2d5f20f3ba9689468ee Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 27 Jun 2022 16:10:39 -0700 Subject: [PATCH 047/676] TiKV: support tablet in DBConfigManager (#12884) close tikv/tikv#12883 DBConfigManager should work with tablets, meanwhile compatible with current single kv engine. 
Signed-off-by: qi.xu Co-authored-by: qi.xu --- components/engine_panic/src/engine.rs | 12 ++- components/engine_rocks/src/engine.rs | 11 ++ components/engine_traits/src/engine.rs | 135 +++++++++++++++++++++++-- components/server/src/server.rs | 8 +- src/config.rs | 108 ++++++++++++++------ src/server/engine_factory.rs | 29 ++++-- src/server/engine_factory_v2.rs | 39 +++++-- 7 files changed, 287 insertions(+), 55 deletions(-) diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 33c7bc01541..5608b55ea00 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -2,7 +2,7 @@ use engine_traits::{ IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SeekKey, SyncMutable, - WriteOptions, + TabletAccessor, WriteOptions, }; use crate::{db_vector::PanicDBVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; @@ -24,6 +24,16 @@ impl KvEngine for PanicEngine { } } +impl TabletAccessor for PanicEngine { + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &PanicEngine)) { + panic!() + } + + fn is_single_engine(&self) -> bool { + panic!() + } +} + impl Peekable for PanicEngine { type DBVector = PanicDBVector; diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 32bd259f160..60be2007367 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -4,6 +4,7 @@ use std::{any::Any, fs, path::Path, sync::Arc}; use engine_traits::{ Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, + TabletAccessor, }; use rocksdb::{DBIterator, Writable, DB}; @@ -110,6 +111,16 @@ impl KvEngine for RocksEngine { } } +impl TabletAccessor for RocksEngine { + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { + f(0, 0, self); + } + + fn is_single_engine(&self) -> bool { + true + } +} + impl Iterable for RocksEngine { type Iterator = 
RocksEngineIterator; diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index a2aa5e5d908..de99f924038 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -2,9 +2,14 @@ use std::{ fmt::Debug, + io::Write, path::{Path, PathBuf}, + str, + vec::Vec, }; +use tikv_util::error; + use crate::*; // FIXME: Revisit the remaining types and methods on KvEngine. Some of these are @@ -61,11 +66,95 @@ pub trait KvEngine: fn bad_downcast(&self) -> &T; } +/// TabletAccessor is the trait to access all the tablets with provided accessor +/// +/// For single rocksdb instance, it essentially accesses the global kvdb with the accessor +/// For multi rocksdb instances, it accesses all the tablets with the accessor +pub trait TabletAccessor { + /// Loop visit all opened tablets by the specified function. + fn for_each_opened_tablet(&self, _f: &mut (dyn FnMut(u64, u64, &EK))); + + /// return true if it's single engine; + /// return false if it's a multi-tablet factory; + fn is_single_engine(&self) -> bool; +} + +/// max error count to log +const MAX_ERROR_COUNT: u32 = 5; + +/// TabletErrorCollector is the facility struct to handle errors when using TabletAccessor::for_each_opened_tablet +/// +/// It will choose the last failed result as the final result, meanwhile logging errors up to MAX_ERROR_COUNT. 
+pub struct TabletErrorCollector { + errors: Vec, + max_error_count: u32, + error_count: u32, + result: std::result::Result<(), Box>, +} + +impl TabletErrorCollector { + pub fn new() -> Self { + Self { + errors: vec![], + max_error_count: MAX_ERROR_COUNT, + error_count: 0, + result: Ok(()), + } + } + + pub fn add_result(&mut self, region_id: u64, suffix: u64, result: Result<()>) { + if result.is_ok() { + return; + } + self.result = Err(Box::from(result.err().unwrap())); + self.error_count += 1; + if self.error_count > self.max_error_count { + return; + } + writeln!( + &mut self.errors, + "Tablet {}_{} encountered error: {:?}.", + region_id, suffix, self.result + ) + .unwrap(); + } + + fn flush_error(&self) { + if self.error_count > 0 { + error!( + "Total count {}. Sample errors: {}", + self.error_count, + str::from_utf8(&self.errors).unwrap() + ); + } + } + + pub fn take_result(&mut self) -> std::result::Result<(), Box> { + std::mem::replace(&mut self.result, Ok(())) + } + + pub fn get_error_count(&self) -> u32 { + self.error_count + } +} + +impl Default for TabletErrorCollector { + fn default() -> Self { + Self::new() + } +} + +impl Drop for TabletErrorCollector { + fn drop(&mut self) { + self.flush_error() + } +} + /// A factory trait to create new engine. /// // It should be named as `EngineFactory` for consistency, but we are about to rename // engine to tablet, so always use tablet for new traits/types. -pub trait TabletFactory { +pub trait TabletFactory: TabletAccessor { /// Create an tablet by id and suffix. If the tablet exists, it will fail. /// The id is likely the region Id, the suffix could be the current raft log index. /// They together could specify a unique path for a region's tablet. @@ -119,10 +208,6 @@ pub trait TabletFactory { /// Here we don't use Clone traint because it will break the trait's object safty fn clone(&self) -> Box + Send>; - /// Loop visit all opened tablets cached by the specified function. 
- /// Once the tablet is opened/created, it will be cached in a hashmap - fn loop_tablet_cache(&self, _f: Box); - /// Load the tablet from path for id and suffix--for scenarios such as applying snapshot fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { unimplemented!(); @@ -185,7 +270,20 @@ where root_path: self.root_path.clone(), }) } - fn loop_tablet_cache(&self, _f: Box) {} +} +impl TabletAccessor for DummyFactory +where + EK: Clone + Send + 'static, +{ + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &EK)) { + if let Some(engine) = &self.engine { + f(0, 0, engine); + } + } + + fn is_single_engine(&self) -> bool { + true + } } impl DummyFactory @@ -202,3 +300,28 @@ impl Default for DummyFactory { Self::new(None, "/tmp".to_string()) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tablet_error_collector_ok() { + let mut err = TabletErrorCollector::new(); + err.add_result(1, 1, Ok(())); + assert!(err.take_result().is_ok()); + assert_eq!(err.get_error_count(), 0); + } + + #[test] + fn test_tablet_error_collector_err() { + let mut err = TabletErrorCollector::new(); + err.add_result(1, 1, Ok(())); + err.add_result(1, 1, Err("this is an error1".to_string().into())); + err.add_result(1, 1, Err("this is an error2".to_string().into())); + err.add_result(1, 1, Ok(())); + let r = err.take_result(); + assert!(r.is_err()); + assert_eq!(err.get_error_count(), 2); + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index cded99edfe3..e09eec7d5d8 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1531,7 +1531,11 @@ impl ConfiguredRaftEngine for RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DBConfigManger::new(self.clone(), DBType::Raft, share_cache)), + Box::new(DBConfigManger::new( + Arc::new(self.clone()), + DBType::Raft, + 
share_cache, + )), ); } } @@ -1614,7 +1618,7 @@ impl TiKvServer { cfg_controller.register( tikv::config::Module::Rocksdb, Box::new(DBConfigManger::new( - engines.kv.clone(), + Arc::new(factory), DBType::Kv, self.config.storage.block_cache.shared, )), diff --git a/src/config.rs b/src/config.rs index d3ec96f6ba4..fd6ec16253b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -12,6 +12,7 @@ use std::{ fs, i32, io::{Error as IoError, ErrorKind, Write}, path::Path, + str, sync::{Arc, RwLock}, usize, }; @@ -35,8 +36,8 @@ use engine_rocks::{ DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptionsExt, CF_DEFAULT, - CF_LOCK, CF_RAFT, CF_WRITE, + CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptionsExt, TabletAccessor, + TabletErrorCollector, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::{IOPriority, IORateLimiter}; use keys::region_raft_prefix_len; @@ -1494,31 +1495,39 @@ pub enum DBType { Raft, } -pub struct DBConfigManger { - db: RocksEngine, +pub struct DBConfigManger> { + tablet_accessor: Arc, db_type: DBType, shared_block_cache: bool, } -impl DBConfigManger { - pub fn new(db: RocksEngine, db_type: DBType, shared_block_cache: bool) -> Self { +impl> DBConfigManger { + pub fn new(tablet_accessor: Arc, db_type: DBType, shared_block_cache: bool) -> Self { DBConfigManger { - db, + tablet_accessor, db_type, shared_block_cache, } } -} -impl DBConfigManger { fn set_db_config(&self, opts: &[(&str, &str)]) -> Result<(), Box> { - self.db.set_db_options(opts)?; - Ok(()) + let mut error_collector = TabletErrorCollector::new(); + self.tablet_accessor + .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { + error_collector.add_result(region_id, suffix, db.set_db_options(opts)); + }); + error_collector.take_result() } fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> Result<(), Box> { + let mut error_collector = 
TabletErrorCollector::new(); self.validate_cf(cf)?; - self.db.set_options_cf(cf, opts)?; + self.tablet_accessor + .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { + error_collector.add_result(region_id, suffix, db.set_options_cf(cf, opts)); + }); + error_collector.take_result()?; + // Write config to metric for (cfg_name, cfg_value) in opts { let cfg_value = match cfg_value { @@ -1542,33 +1551,68 @@ impl DBConfigManger { block-cache.capacity in storage module instead" .into()); } - let opt = self.db.get_options_cf(cf)?; - opt.set_block_cache_capacity(size.0)?; + // for multi-rocks, shared block cache has to be enabled and thus should shortcut in the above if statement. + assert!(self.tablet_accessor.is_single_engine()); + let mut error_collector = TabletErrorCollector::new(); + self.tablet_accessor + .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { + let r = db.get_options_cf(cf); + if let Ok(opt) = r { + let r = opt.set_block_cache_capacity(size.0); + if let Err(r) = r { + error_collector.add_result(region_id, suffix, Err(r.into())); + } + } else if let Err(r) = r { + error_collector.add_result(region_id, suffix, Err(r)); + } + }); // Write config to metric CONFIG_ROCKSDB_GAUGE .with_label_values(&[cf, "block_cache_size"]) .set(size.0 as f64); - Ok(()) + error_collector.take_result() } fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> Result<(), Box> { - let mut opt = self.db.as_inner().get_db_options(); - opt.set_rate_bytes_per_sec(rate_bytes_per_sec)?; - Ok(()) + let mut error_collector = TabletErrorCollector::new(); + self.tablet_accessor + .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { + let mut opt = db.as_inner().get_db_options(); + let r = opt.set_rate_bytes_per_sec(rate_bytes_per_sec); + if let Err(r) = r { + error_collector.add_result(region_id, suffix, Err(r.into())); + } + }); + error_collector.take_result() } fn set_rate_limiter_auto_tuned( &self, rate_limiter_auto_tuned: bool, ) 
-> Result<(), Box> { - let mut opt = self.db.as_inner().get_db_options(); - opt.set_auto_tuned(rate_limiter_auto_tuned)?; - // double check the new state - let new_auto_tuned = opt.get_auto_tuned(); - if new_auto_tuned.is_none() || new_auto_tuned.unwrap() != rate_limiter_auto_tuned { - return Err("fail to set rate_limiter_auto_tuned".into()); - } - Ok(()) + let mut error_collector = TabletErrorCollector::new(); + self.tablet_accessor + .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { + let mut opt = db.as_inner().get_db_options(); + let r = opt.set_auto_tuned(rate_limiter_auto_tuned); + if let Err(r) = r { + error_collector.add_result(region_id, suffix, Err(r.into())); + } else { + // double check the new state + let new_auto_tuned = opt.get_auto_tuned(); + if new_auto_tuned.is_none() + || new_auto_tuned.unwrap() != rate_limiter_auto_tuned + { + error_collector.add_result( + region_id, + suffix, + Err("fail to set rate_limiter_auto_tuned".to_string().into()), + ); + } + } + }); + + error_collector.take_result() } fn set_max_background_jobs(&self, max_background_jobs: i32) -> Result<(), Box> { @@ -1599,7 +1643,7 @@ impl DBConfigManger { } } -impl ConfigManager for DBConfigManger { +impl + Send + Sync> ConfigManager for DBConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); @@ -3817,7 +3861,9 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; - use engine_traits::{DBOptions as DBOptionsTrait, ALL_CFS}; + use engine_traits::{ + ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as DBOptionsTrait, ALL_CFS, + }; use futures::executor::block_on; use grpcio::ResourceQuota; use itertools::Itertools; @@ -4235,7 +4281,11 @@ mod tests { let (shared, cfg_controller) = (cfg.storage.block_cache.shared, ConfigController::new(cfg)); cfg_controller.register( Module::Rocksdb, - 
Box::new(DBConfigManger::new(engine.clone(), DBType::Kv, shared)), + Box::new(DBConfigManger::new( + Arc::new(engine.clone()), + DBType::Kv, + shared, + )), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 5212a211e69..fde3bc5a40f 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -10,7 +10,9 @@ use engine_rocks::{ CompactionListener, FlowListener, RocksCompactedEvent, RocksCompactionJobInfo, RocksEngine, RocksEventListener, }; -use engine_traits::{CompactionJobInfo, RaftEngine, Result, TabletFactory, CF_DEFAULT, CF_WRITE}; +use engine_traits::{ + CompactionJobInfo, RaftEngine, Result, TabletAccessor, TabletFactory, CF_DEFAULT, CF_WRITE, +}; use kvproto::kvrpcpb::ApiVersion; use raftstore::{ store::{RaftRouter, StoreMsg}, @@ -83,20 +85,29 @@ impl KvEngineFactoryBuilder { pub fn build(self) -> KvEngineFactory { KvEngineFactory { inner: Arc::new(self.inner), - router: self.router, + router: Mutex::new(self.router), } } } -#[derive(Clone)] pub struct KvEngineFactory { inner: Arc, - router: Option>, + router: Mutex>>, +} + +impl Clone for KvEngineFactory { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + router: Mutex::new(self.router.lock().unwrap().clone()), + } + } } impl KvEngineFactory { pub fn create_raftstore_compaction_listener(&self) -> Option { - let ch = match &self.router { + let router = self.router.lock().unwrap(); + let ch = match &*router { Some(r) => Mutex::new(r.clone()), None => return None, }; @@ -257,11 +268,17 @@ impl TabletFactory for KvEngineFactory { fn clone(&self) -> Box + Send> { Box::new(std::clone::Clone::clone(self)) } +} - fn loop_tablet_cache(&self, mut f: Box) { +impl TabletAccessor for KvEngineFactory { + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { if let Ok(db) = self.inner.root_db.lock() { let db = db.as_ref().unwrap(); f(0, 0, db); } } + + fn 
is_single_engine(&self) -> bool { + true + } } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 676272334ac..4027823f23c 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -7,7 +7,7 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; -use engine_traits::{RaftEngine, Result, TabletFactory}; +use engine_traits::{RaftEngine, Result, TabletAccessor, TabletFactory}; use crate::server::engine_factory::KvEngineFactory; @@ -129,14 +129,6 @@ impl TabletFactory for KvEngineFactoryV2 { Ok(()) } - #[inline] - fn loop_tablet_cache(&self, mut f: Box) { - let reg = self.registry.lock().unwrap(); - for ((id, suffix), tablet) in &*reg { - f(*id, *suffix, tablet) - } - } - #[inline] fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { { @@ -160,6 +152,21 @@ impl TabletFactory for KvEngineFactoryV2 { } } +impl TabletAccessor for KvEngineFactoryV2 { + #[inline] + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { + let reg = self.registry.lock().unwrap(); + for ((id, suffix), tablet) in &*reg { + f(*id, *suffix, tablet) + } + } + + // it have multi tablets. 
+ fn is_single_engine(&self) -> bool { + false + } +} + #[cfg(test)] mod tests { use engine_traits::TabletFactory; @@ -213,6 +220,15 @@ mod tests { let tablet_path = factory.tablet_path(1, 10); let tablet2 = factory.open_tablet_raw(&tablet_path, false).unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + let mut count = 0; + factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { + assert!(id == 0); + assert!(suffix == 0); + count += 1; + }); + assert_eq!(count, 1); + assert!(factory.is_single_engine()); + assert!(shared_db.is_single_engine()); } #[test] @@ -250,6 +266,7 @@ mod tests { factory.destroy_tablet(1, 20).unwrap(); let result = factory.open_tablet(1, 20); assert!(result.is_err()); + assert!(!factory.is_single_engine()); } #[test] @@ -264,11 +281,11 @@ mod tests { factory.create_tablet(1, 10).unwrap(); factory.create_tablet(2, 10).unwrap(); let mut count = 0; - factory.loop_tablet_cache(Box::new(|id, suffix, _tablet| { + factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { assert!(id == 1 || id == 2); assert!(suffix == 10); count += 1; - })); + }); assert_eq!(count, 2); } } From 30d2c3d89b551532d2c6aedc67fdaf8f5b2847cc Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Mon, 27 Jun 2022 23:22:39 -0700 Subject: [PATCH 048/676] Fix flaky test by extending raft gc wait timeout (#12810) close tikv/tikv#12809 Fix flaky test by extending raft gc wait timeout Signed-off-by: v01dstar Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/peer.rs | 15 +- .../failpoints/cases/test_unsafe_recovery.rs | 5 +- .../raftstore/test_unsafe_recovery.rs | 183 +++++++++--------- 3 files changed, 109 insertions(+), 94 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e08c440d6a1..ba819bda155 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -12,7 +12,7 @@ use std::{ iter::{FromIterator, Iterator}, mem, 
sync::{Arc, Mutex}, - time::Instant, + time::{Duration, Instant}, u64, }; @@ -5573,6 +5573,19 @@ where return; } + if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { + // Clean up the force leader state after a timeout, since the PD recovery process may + // have been aborted for some reasons. + if time.saturating_elapsed() + > cmp::max( + self.ctx.cfg.peer_stale_state_check_interval.0, + Duration::from_secs(60), + ) + { + self.on_exit_force_leader(); + } + } + if self.ctx.cfg.hibernate_regions { let group_state = self.fsm.hibernate_state.group_state(); if group_state == GroupState::Idle { diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 292cba849df..f791b40c065 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -192,7 +192,7 @@ fn test_unsafe_recovery_execution_result_report() { } #[test] -fn test_unsafe_recover_wait_for_snapshot_apply() { +fn test_unsafe_recovery_wait_for_snapshot_apply() { let mut cluster = new_server_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(8); cluster.cfg.raft_store.merge_max_log_gap = 3; @@ -219,13 +219,12 @@ fn test_unsafe_recover_wait_for_snapshot_apply() { let _ = raft_gc_finished_tx.send(()); }) .unwrap(); - // Add at least 4m data (0..10).for_each(|_| cluster.must_put(b"random_k", b"random_v")); // Unblock raft log GC. drop(raft_gc_triggered_tx); // Wait until logs are GCed. raft_gc_finished_rx - .recv_timeout(Duration::from_secs(1)) + .recv_timeout(Duration::from_secs(3)) .unwrap(); // Makes the group lose its quorum. 
cluster.stop_node(nodes[2]); diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index 7902c0a4c71..ebeb99ddfe7 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -725,96 +725,99 @@ fn test_force_leader_on_hibernated_follower() { // Test the case that three of five nodes fail and force leader on the rest node // with triggering snapshot. -// #[test] -// fn test_force_leader_trigger_snapshot() { -// let mut cluster = new_node_cluster(0, 5); -// cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); -// cluster.cfg.raft_store.raft_election_timeout_ticks = 10; -// cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90); -// cluster.pd_client.disable_default_operator(); -// -// cluster.run(); -// cluster.must_put(b"k1", b"v1"); -// -// let region = cluster.get_region(b"k1"); -// cluster.must_split(®ion, b"k9"); -// let region = cluster.get_region(b"k2"); -// let peer_on_store1 = find_peer(®ion, 1).unwrap(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); -// -// // Isolate node 2 -// cluster.add_send_filter(IsolationFilterFactory::new(2)); -// -// // Compact logs to force requesting snapshot after clearing send filters. -// let state = cluster.truncated_state(region.get_id(), 1); -// // Write some data to trigger snapshot. 
-// for i in 100..150 { -// let key = format!("k{}", i); -// let value = format!("v{}", i); -// cluster.must_put(key.as_bytes(), value.as_bytes()); -// } -// cluster.wait_log_truncated(region.get_id(), 1, state.get_index() + 40); -// -// cluster.stop_node(3); -// cluster.stop_node(4); -// cluster.stop_node(5); -// -// // Recover the isolation of 2, but still don't permit snapshot -// let recv_filter = Box::new( -// RegionPacketFilter::new(region.get_id(), 2) -// .direction(Direction::Recv) -// .msg_type(MessageType::MsgSnapshot), -// ); -// cluster.sim.wl().add_recv_filter(2, recv_filter); -// cluster.clear_send_filters(); -// -// // wait election timeout -// sleep_ms( -// cluster.cfg.raft_store.raft_election_timeout_ticks as u64 -// * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() -// * 5, -// ); -// cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]); -// -// sleep_ms( -// cluster.cfg.raft_store.raft_election_timeout_ticks as u64 -// * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() -// * 3, -// ); -// let cmd = new_change_peer_request( -// ConfChangeType::RemoveNode, -// find_peer(®ion, 3).unwrap().clone(), -// ); -// let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); -// // Though it has a force leader now, but the command can't committed because the log is not replicated to all the alive peers. -// assert!( -// cluster -// .call_command_on_leader(req, Duration::from_millis(1000)) -// .unwrap() -// .get_header() -// .has_error() // error "there is a pending conf change" indicating no committed log after being the leader -// ); -// -// // Permit snapshot message, snapshot should be applied and advance commit index now. 
-// cluster.sim.wl().clear_recv_filters(2); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); -// cluster.exit_force_leader(region.get_id(), 1); -// -// // quorum is formed, can propose command successfully now -// cluster.must_put(b"k4", b"v4"); -// assert_eq!(cluster.must_get(b"k2"), None); -// assert_eq!(cluster.must_get(b"k3"), None); -// assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); -// cluster.must_transfer_leader(region.get_id(), find_peer(®ion, 1).unwrap().clone()); -// } +#[test] +fn test_force_leader_trigger_snapshot() { + let mut cluster = new_node_cluster(0, 5); + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(90); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(8); + cluster.cfg.raft_store.merge_max_log_gap = 3; + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + cluster.pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k9"); + let region = cluster.get_region(b"k2"); + let peer_on_store1 = find_peer(®ion, 1).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // Isolate node 2 + cluster.add_send_filter(IsolationFilterFactory::new(2)); + + // Compact logs to force requesting snapshot after clearing send filters. + let state = cluster.truncated_state(region.get_id(), 1); + // Write some data to trigger snapshot. 
+ for i in 100..150 { + let key = format!("k{}", i); + let value = format!("v{}", i); + cluster.must_put(key.as_bytes(), value.as_bytes()); + } + cluster.wait_log_truncated(region.get_id(), 1, state.get_index() + 40); + + cluster.stop_node(3); + cluster.stop_node(4); + cluster.stop_node(5); + + // Recover the isolation of 2, but still don't permit snapshot + let recv_filter = Box::new( + RegionPacketFilter::new(region.get_id(), 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgSnapshot), + ); + cluster.sim.wl().add_recv_filter(2, recv_filter); + cluster.clear_send_filters(); + + // wait election timeout + sleep_ms( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 5, + ); + cluster.enter_force_leader(region.get_id(), 1, vec![3, 4, 5]); + + sleep_ms( + cluster.cfg.raft_store.raft_election_timeout_ticks as u64 + * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() + * 3, + ); + let cmd = new_change_peer_request( + ConfChangeType::RemoveNode, + find_peer(®ion, 3).unwrap().clone(), + ); + let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); + // Though it has a force leader now, but the command can't committed because the log is not replicated to all the alive peers. + assert!( + cluster + .call_command_on_leader(req, Duration::from_millis(1000)) + .unwrap() + .get_header() + .has_error() // error "there is a pending conf change" indicating no committed log after being the leader + ); + + // Permit snapshot message, snapshot should be applied and advance commit index now. 
+ cluster.sim.wl().clear_recv_filters(2); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 3).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 4).unwrap().clone()); + cluster + .pd_client + .must_remove_peer(region.get_id(), find_peer(®ion, 5).unwrap().clone()); + cluster.exit_force_leader(region.get_id(), 1); + + // quorum is formed, can propose command successfully now + cluster.must_put(b"k4", b"v4"); + assert_eq!(cluster.must_get(b"k2"), None); + assert_eq!(cluster.must_get(b"k3"), None); + assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); + cluster.must_transfer_leader(region.get_id(), find_peer(®ion, 1).unwrap().clone()); +} // Test the case that three of five nodes fail and force leader on the rest node // with uncommitted conf change. From 24ad73866fcc493b6d424ec495769e05c286438e Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 28 Jun 2022 15:44:39 +0800 Subject: [PATCH 049/676] online_config: allow return error when update config (#12910) close tikv/tikv#12909 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- components/backup/src/endpoint.rs | 3 +- components/cdc/src/endpoint.rs | 7 +- components/engine_rocks/src/config.rs | 25 ++--- components/file_system/src/lib.rs | 27 +++--- .../online_config_derive/src/lib.rs | 9 +- components/online_config/src/lib.rs | 91 +++++++++++++++++-- components/raftstore/src/store/config.rs | 2 +- .../raftstore/src/store/worker/split_check.rs | 5 +- .../src/store/worker/split_config.rs | 2 +- components/resolved_ts/src/endpoint.rs | 5 +- components/resource_metering/src/config.rs | 2 +- components/tikv_util/src/config.rs | 15 +-- src/config.rs | 22 ++--- src/server/config.rs | 2 +- src/server/gc_worker/config.rs | 3 +- src/server/raft_client.rs | 5 +- src/storage/config_manager.rs | 4 +- .../integrations/config/dynamic/gc_worker.rs | 15 ++- .../integrations/config/test_config_client.rs | 6 +- 19 files changed, 172 insertions(+), 78 
deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 37e6855302a..2a68cbb6bd8 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -575,8 +575,7 @@ pub struct ConfigManager(Arc>); impl online_config::ConfigManager for ConfigManager { fn dispatch(&mut self, change: online_config::ConfigChange) -> online_config::Result<()> { - self.0.write().unwrap().update(change); - Ok(()) + self.0.write().unwrap().update(change) } } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 7ca640ac8b3..c78636b8e11 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -447,7 +447,10 @@ impl, E: KvEngine> Endpoint { fn on_change_cfg(&mut self, change: ConfigChange) { // Validate first. let mut validate_cfg = self.config.clone(); - validate_cfg.update(change); + if let Err(e) = validate_cfg.update(change) { + warn!("cdc config update failed"; "error" => ?e); + return; + } if let Err(e) = validate_cfg.validate() { warn!("cdc config update failed"; "error" => ?e); return; @@ -459,7 +462,7 @@ impl, E: KvEngine> Endpoint { "change" => ?change ); // Update the config here. The following adjustments will all use the new values. - self.config.update(change.clone()); + self.config.update(change.clone()).unwrap(); // Maybe the cache will be lost due to smaller capacity, // but it is acceptable. diff --git a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index 6442a5dab64..9c015b7e7d1 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::str::FromStr; +use std::{convert::TryFrom, str::FromStr}; use online_config::ConfigValue; use rocksdb::{ @@ -225,21 +225,22 @@ pub enum BlobRunMode { impl From for ConfigValue { fn from(mode: BlobRunMode) -> ConfigValue { - ConfigValue::BlobRunMode(format!("k{:?}", mode)) + let str_value = match mode { + BlobRunMode::Normal => "normal", + BlobRunMode::ReadOnly => "read-only", + BlobRunMode::Fallback => "fallback", + }; + ConfigValue::String(str_value.into()) } } -impl From for BlobRunMode { - fn from(c: ConfigValue) -> BlobRunMode { - if let ConfigValue::BlobRunMode(s) = c { - match s.as_str() { - "kNormal" => BlobRunMode::Normal, - "kReadOnly" => BlobRunMode::ReadOnly, - "kFallback" => BlobRunMode::Fallback, - m => panic!("expect: kNormal, kReadOnly or kFallback, got: {:?}", m), - } +impl TryFrom for BlobRunMode { + type Error = String; + fn try_from(c: ConfigValue) -> Result { + if let ConfigValue::String(s) = c { + Self::from_str(&s) } else { - panic!("expect: ConfigValue::BlobRunMode, got: {:?}", c); + panic!("expect: ConfigValue::String, got: {:?}", c); } } } diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index dd99b810e28..d5f8345cae3 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -18,10 +18,13 @@ mod metrics; mod metrics_manager; mod rate_limiter; -pub use std::fs::{ - canonicalize, create_dir, create_dir_all, hard_link, metadata, read_dir, read_link, remove_dir, - remove_dir_all, remove_file, rename, set_permissions, symlink_metadata, DirBuilder, DirEntry, - FileType, Metadata, Permissions, ReadDir, +pub use std::{ + convert::TryFrom, + fs::{ + canonicalize, create_dir, create_dir_all, hard_link, metadata, read_dir, read_link, + remove_dir, remove_dir_all, remove_file, rename, set_permissions, symlink_metadata, + DirBuilder, DirEntry, FileType, Metadata, Permissions, ReadDir, + }, }; use std::{ io::{self, ErrorKind, Read, Write}, @@ -205,19 +208,17 @@ impl<'de> 
Deserialize<'de> for IOPriority { impl From for ConfigValue { fn from(mode: IOPriority) -> ConfigValue { - ConfigValue::IOPriority(mode.as_str().to_owned()) + ConfigValue::String(mode.as_str().to_owned()) } } -impl From for IOPriority { - fn from(c: ConfigValue) -> IOPriority { - if let ConfigValue::IOPriority(s) = c { - match IOPriority::from_str(s.as_str()) { - Ok(p) => p, - _ => panic!("expect: low, medium, high, got: {:?}", s), - } +impl TryFrom for IOPriority { + type Error = String; + fn try_from(c: ConfigValue) -> Result { + if let ConfigValue::String(s) = c { + Self::from_str(s.as_str()) } else { - panic!("expect: ConfigValue::IOPriority, got: {:?}", c); + panic!("expect: ConfigValue::String, got: {:?}", c); } } } diff --git a/components/online_config/online_config_derive/src/lib.rs b/components/online_config/online_config_derive/src/lib.rs index 0981668d817..ed37aeac40c 100644 --- a/components/online_config/online_config_derive/src/lib.rs +++ b/components/online_config/online_config_derive/src/lib.rs @@ -172,7 +172,7 @@ fn update(fields: &Punctuated, crate_name: &Ident) -> Result, crate_name: &Ident) -> Result std::result::Result<(), Box> { #(#update_fields)* + Ok(()) } }) } diff --git a/components/online_config/src/lib.rs b/components/online_config/src/lib.rs index 51f1580cafd..fae347fee40 100644 --- a/components/online_config/src/lib.rs +++ b/components/online_config/src/lib.rs @@ -20,8 +20,6 @@ pub enum ConfigValue { Usize(usize), Bool(bool), String(String), - BlobRunMode(String), - IOPriority(String), Module(ConfigChange), Skip, None, @@ -39,8 +37,6 @@ impl Display for ConfigValue { ConfigValue::Usize(v) => write!(f, "{}", v), ConfigValue::Bool(v) => write!(f, "{}", v), ConfigValue::String(v) => write!(f, "{}", v), - ConfigValue::BlobRunMode(v) => write!(f, "{}", v), - ConfigValue::IOPriority(v) => write!(f, "{}", v), ConfigValue::Module(v) => write!(f, "{:?}", v), ConfigValue::Skip => write!(f, "ConfigValue::Skip"), ConfigValue::None => write!(f, ""), 
@@ -115,13 +111,13 @@ impl_into!(ConfigChange, Module); /// 3. `#[online_config(submodule)]` field, these fields represent the /// submodule, and should also derive `OnlineConfig` /// 4. normal fields, the type of these fields should be implment -/// `Into` and `From` for `ConfigValue` +/// `Into` and `From`/`TryFrom` for `ConfigValue` pub trait OnlineConfig<'a> { type Encoder: serde::Serialize; /// Compare to other config, return the difference fn diff(&self, _: &Self) -> ConfigChange; /// Update config with difference returned by `diff` - fn update(&mut self, _: ConfigChange); + fn update(&mut self, _: ConfigChange) -> Result<()>; /// Get encoder that can be serialize with `serde::Serializer` /// with the disappear of `#[online_config(hidden)]` field fn get_encoder(&'a self) -> Self::Encoder; @@ -137,6 +133,10 @@ pub trait ConfigManager: Send + Sync { #[cfg(test)] mod tests { + use std::convert::TryFrom; + + use serde::Serialize; + use super::*; use crate as online_config; @@ -194,7 +194,7 @@ mod tests { assert_eq!(sub_diff.remove("field1").map(Into::into), Some(1000u64)); assert_eq!(sub_diff.remove("field2").map(Into::into), Some(true)); } - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, updated_cfg, "cfg should be updated"); } @@ -204,7 +204,7 @@ mod tests { let diff = cfg.diff(&cfg.clone()); assert!(diff.is_empty(), "diff should be empty"); - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, TestConfig::default(), "cfg should not be updated"); } @@ -218,7 +218,7 @@ mod tests { let mut diff = HashMap::new(); diff.insert("skip_field".to_owned(), ConfigValue::U64(123)); - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!(cfg, TestConfig::default(), "cfg should not be updated"); } @@ -241,7 +241,7 @@ mod tests { assert_eq!(sub_diff.remove("field2").map(Into::into), Some(true)); } - cfg.update(diff); + cfg.update(diff).unwrap(); assert_eq!( cfg.submodule_field, updated_cfg.submodule_field, "submodule should be updated" @@ -295,4 
+295,75 @@ mod tests { "skip-field = \"\"\n\n[submodule-field]\nrename_field = false\n" ); } + + #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)] + pub enum TestEnum { + First, + Second, + } + + impl std::fmt::Display for TestEnum { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::First => f.write_str("first"), + Self::Second => f.write_str("second"), + } + } + } + + impl From for ConfigValue { + fn from(v: TestEnum) -> ConfigValue { + ConfigValue::String(format!("{}", v)) + } + } + + impl TryFrom for TestEnum { + type Error = String; + fn try_from(v: ConfigValue) -> std::result::Result { + if let ConfigValue::String(s) = v { + match s.as_str() { + "first" => Ok(Self::First), + "second" => Ok(Self::Second), + s => Err(format!("invalid config value: {}", s)), + } + } else { + panic!("expect ConfigValue::String, got: {:?}", v); + } + } + } + + #[derive(Clone, OnlineConfig, Debug, PartialEq)] + pub struct TestEnumConfig { + f1: u64, + e: TestEnum, + } + + impl Default for TestEnumConfig { + fn default() -> Self { + Self { + f1: 0, + e: TestEnum::First, + } + } + } + + #[test] + fn test_update_enum_config() { + let mut config = TestEnumConfig::default(); + + let mut diff = HashMap::new(); + diff.insert("f1".to_owned(), ConfigValue::U64(1)); + diff.insert("e".to_owned(), ConfigValue::String("second".into())); + config.update(diff).unwrap(); + + let updated = TestEnumConfig { + f1: 1, + e: TestEnum::Second, + }; + assert_eq!(config, updated); + + let mut diff = HashMap::new(); + diff.insert("e".to_owned(), ConfigValue::String("invalid".into())); + assert!(config.update(diff).is_err()); + } } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 87b299d4cbb..fdd47d6c2ae 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -947,7 +947,7 @@ impl ConfigManager for RaftstoreConfigManager { { let change = change.clone(); 
self.config - .update(move |cfg: &mut Config| cfg.update(change)); + .update(move |cfg: &mut Config| cfg.update(change))?; } if let Some(ConfigValue::Module(raft_batch_system_change)) = change.get("store_batch_system") diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index ecb2d43f566..922f927ddb3 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -543,11 +543,14 @@ where } fn change_cfg(&mut self, change: ConfigChange) { + if let Err(e) = self.coprocessor.cfg.update(change.clone()) { + error!("update split check config failed"; "err" => ?e); + return; + }; info!( "split check config updated"; "change" => ?change ); - self.coprocessor.cfg.update(change); } } diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index da7f137765a..4d2634514be 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -117,7 +117,7 @@ impl ConfigManager for SplitConfigManager { { let change = change.clone(); self.0 - .update(move |cfg: &mut SplitConfig| cfg.update(change)); + .update(move |cfg: &mut SplitConfig| cfg.update(change))?; } info!( "load base split config changed"; diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 06fcb8c6860..bf4f9ba881e 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -579,7 +579,10 @@ where fn handle_change_config(&mut self, change: ConfigChange) { let prev = format!("{:?}", self.cfg); let prev_advance_ts_interval = self.cfg.advance_ts_interval; - self.cfg.update(change); + if let Err(e) = self.cfg.update(change) { + error!("update resolved-ts config unexpectly failed"; "err" => ?e); + return; + } if self.cfg.advance_ts_interval != prev_advance_ts_interval { // Increase 
the `cfg_version` to reject advance event that registered before self.cfg_version += 1; diff --git a/components/resource_metering/src/config.rs b/components/resource_metering/src/config.rs index ae28536f10e..90b09588e3a 100644 --- a/components/resource_metering/src/config.rs +++ b/components/resource_metering/src/config.rs @@ -110,7 +110,7 @@ impl ConfigManager { impl online_config::ConfigManager for ConfigManager { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let mut new_config = self.current_config.clone(); - new_config.update(change); + new_config.update(change)?; new_config.validate()?; if self.current_config.receiver_address != new_config.receiver_address { self.address_notifier diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 5a8206e234e..aa981603d17 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -1108,13 +1108,15 @@ impl VersionTrack { } } - /// Update the value - pub fn update(&self, f: F) + pub fn update(&self, f: F) -> Result where - F: FnOnce(&mut T), + F: FnOnce(&mut T) -> Result, { - f(&mut self.value.write().unwrap()); - self.version.fetch_add(1, Ordering::Release); + let res = f(&mut self.value.write().unwrap()); + if res.is_ok() { + self.version.fetch_add(1, Ordering::Release); + } + res } pub fn value(&self) -> RwLockReadGuard<'_, T> { @@ -1966,9 +1968,10 @@ mod tests { assert!(trackers.iter_mut().all(|tr| tr.any_new().is_none())); - vc.update(|v| { + let _ = vc.update(|v| -> Result<(), ()> { v.v1 = 1000; v.v2 = true; + Ok(()) }); for tr in trackers.iter_mut() { let incoming = tr.any_new(); diff --git a/src/config.rs b/src/config.rs index fd6ec16253b..fc6cde09e1c 100644 --- a/src/config.rs +++ b/src/config.rs @@ -39,7 +39,7 @@ use engine_traits::{ CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptionsExt, TabletAccessor, TabletErrorCollector, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; -use file_system::{IOPriority, 
IORateLimiter}; +use file_system::IORateLimiter; use keys::region_raft_prefix_len; use kvproto::kvrpcpb::ApiVersion; use online_config::{ConfigChange, ConfigManager, ConfigValue, OnlineConfig, Result as CfgResult}; @@ -3625,8 +3625,6 @@ fn to_change_value(v: &str, typed: &ConfigValue) -> CfgResult { ConfigValue::I32(_) => ConfigValue::from(v.parse::()?), ConfigValue::Usize(_) => ConfigValue::from(v.parse::()?), ConfigValue::Bool(_) => ConfigValue::from(v.parse::()?), - ConfigValue::BlobRunMode(_) => ConfigValue::from(v.parse::()?), - ConfigValue::IOPriority(_) => ConfigValue::from(v.parse::()?), ConfigValue::String(_) => ConfigValue::String(v.to_owned()), _ => unreachable!(), }; @@ -3652,9 +3650,7 @@ fn to_toml_encode(change: HashMap) -> CfgResult Ok(true), + | ConfigValue::String(_) => Ok(true), ConfigValue::None => Err(Box::new(IoError::new( ErrorKind::Other, format!("unexpect none field: {:?}", c), @@ -3792,7 +3788,7 @@ impl ConfigController { diff = { let incoming = self.get_current(); let mut updated = incoming.clone(); - updated.update(diff); + updated.update(diff)?; // Config might be adjusted in `validate`. updated.validate()?; incoming.diff(&updated) @@ -3806,7 +3802,8 @@ impl ConfigController { // dispatched to corresponding config manager, to avoid dispatch change twice if let Some(mgr) = inner.config_mgrs.get_mut(&Module::from(name.as_str())) { if let Err(e) = mgr.dispatch(change.clone()) { - inner.current.update(to_update); + // we already verified the correctness at the beginning of this function. + inner.current.update(to_update).unwrap(); return Err(e); } } @@ -3818,7 +3815,8 @@ impl ConfigController { } } debug!("all config change had been dispatched"; "change" => ?to_update); - inner.current.update(to_update); + // we already verified the correctness at the beginning of this function. 
+ inner.current.update(to_update).unwrap(); // Write change to the config file if let Some(change) = change { let content = { @@ -4395,7 +4393,7 @@ mod tests { cfg_controller .update_config("resolved-ts.advance-ts-interval", "100ms") .unwrap(); - resolved_ts_cfg.update(rx.recv().unwrap()); + resolved_ts_cfg.update(rx.recv().unwrap()).unwrap(); assert_eq!( resolved_ts_cfg.advance_ts_interval, ReadableDuration::millis(100) @@ -4416,7 +4414,7 @@ mod tests { cfg_controller .update_config("resolved-ts.advance-ts-interval", "3s") .unwrap(); - resolved_ts_cfg.update(rx.recv().unwrap()); + resolved_ts_cfg.update(rx.recv().unwrap()).unwrap(); assert_eq!( resolved_ts_cfg.advance_ts_interval, ReadableDuration::secs(3) @@ -4570,7 +4568,7 @@ mod tests { let diff = config_value_to_string(diff.into_iter().collect()); assert_eq!(diff.len(), 1); assert_eq!(diff[0].0.as_str(), "blob_run_mode"); - assert_eq!(diff[0].1.as_str(), "kFallback"); + assert_eq!(diff[0].1.as_str(), "fallback"); } #[test] diff --git a/src/server/config.rs b/src/server/config.rs index 648c0c0853d..e88ee55b8c9 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -421,7 +421,7 @@ impl ConfigManager for ServerConfigManager { fn dispatch(&mut self, c: ConfigChange) -> std::result::Result<(), Box> { { let change = c.clone(); - self.config.update(move |cfg| cfg.update(change)); + self.config.update(move |cfg| cfg.update(change))?; if let Some(value) = c.get("grpc_memory_pool_quota") { let mem_quota: ReadableSize = value.clone().into(); // the resize is done inplace indeed, but grpc-rs's api need self, so we just diff --git a/src/server/gc_worker/config.rs b/src/server/gc_worker/config.rs index 3b2699f5a11..9406e39d993 100644 --- a/src/server/gc_worker/config.rs +++ b/src/server/gc_worker/config.rs @@ -54,7 +54,8 @@ impl ConfigManager for GcWorkerConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); - self.0.update(move |cfg: &mut GcConfig| cfg.update(change)); + self.0 + 
.update(move |cfg: &mut GcConfig| cfg.update(change))?; } info!( "GC worker config changed"; diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index bc691bcc05f..214c5cb6b66 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1232,7 +1232,10 @@ mod tests { assert!(msg_buf.full()); // update config - version_track.update(|cfg| cfg.max_grpc_send_msg_len *= 2); + let _ = version_track.update(|cfg| -> Result<(), ()> { + cfg.max_grpc_send_msg_len *= 2; + Ok(()) + }); msg_buf.clear(); let new_max_msg_len = diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index b72c0cbf16a..217ebbb25c8 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -2,7 +2,7 @@ //! Storage online config manager. -use std::sync::Arc; +use std::{convert::TryInto, sync::Arc}; use engine_traits::{CFNamesExt, CFOptionsExt, ColumnFamilyOptions, CF_DEFAULT}; use file_system::{get_io_rate_limiter, IOPriority, IOType}; @@ -110,7 +110,7 @@ impl ConfigManager for StorageConfigManger { for t in IOType::iter() { if let Some(priority) = io_rate_limit.remove(&(t.as_str().to_owned() + "_priority")) { - let priority: IOPriority = priority.into(); + let priority: IOPriority = priority.try_into()?; limiter.set_io_priority(t, priority); } } diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index fbc02b9266b..19e97058616 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -145,19 +145,28 @@ fn test_change_io_limit_by_debugger() { }); // Enable io iolimit - config_manager.update(|cfg: &mut GcConfig| cfg.max_write_bytes_per_sec = ReadableSize(1024)); + let _ = config_manager.update(|cfg: &mut GcConfig| -> Result<(), ()> { + cfg.max_write_bytes_per_sec = ReadableSize(1024); + Ok(()) + }); validate(&scheduler, move |_, limiter: &Limiter| { assert_eq!(limiter.speed_limit(), 1024.0); }); // Change io iolimit - 
config_manager.update(|cfg: &mut GcConfig| cfg.max_write_bytes_per_sec = ReadableSize(2048)); + let _ = config_manager.update(|cfg: &mut GcConfig| -> Result<(), ()> { + cfg.max_write_bytes_per_sec = ReadableSize(2048); + Ok(()) + }); validate(&scheduler, move |_, limiter: &Limiter| { assert_eq!(limiter.speed_limit(), 2048.0); }); // Disable io iolimit - config_manager.update(|cfg: &mut GcConfig| cfg.max_write_bytes_per_sec = ReadableSize(0)); + let _ = config_manager.update(|cfg: &mut GcConfig| -> Result<(), ()> { + cfg.max_write_bytes_per_sec = ReadableSize(0); + Ok(()) + }); validate(&scheduler, move |_, limiter: &Limiter| { assert_eq!(limiter.speed_limit(), f64::INFINITY); }); diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index b911dcb7b99..52cdc9cb012 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -64,8 +64,7 @@ fn test_dispatch_change() { impl ConfigManager for CfgManager { fn dispatch(&mut self, c: ConfigChange) -> Result<(), Box> { - self.0.lock().unwrap().update(c); - Ok(()) + self.0.lock().unwrap().update(c) } } @@ -198,8 +197,7 @@ fn test_update_from_toml_file() { impl ConfigManager for CfgManager { fn dispatch(&mut self, c: ConfigChange) -> Result<(), Box> { - self.0.lock().unwrap().update(c); - Ok(()) + self.0.lock().unwrap().update(c) } } From 6bd1d4510652279e4d260317b628aafa6c5fbd27 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 28 Jun 2022 15:58:39 +0800 Subject: [PATCH 050/676] raftstore: separate raft and kv perf contexts (#12915) ref tikv/raft-engine#227, ref tikv/tikv#12362 We used to record perf contexts for Raft RocksDB and KV RocksDB with the same PerfContext. But we also have raft-engine now. So, we will miss perf contexts if we still use RocksDB perf contexts. This commit adds PerfContext support to RaftEngine and distinguish it from the perf context used for applying. 
Then, we'll record correct perf statistics for both raft engine and KV DB. Updated raft-engine to include tikv/raft-engine#227 Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- Cargo.lock | 5 ++-- .../engine_rocks/src/perf_context_impl.rs | 16 ++++++++++ components/engine_traits/src/raft_engine.rs | 2 +- components/raft_log_engine/Cargo.toml | 1 + components/raft_log_engine/src/engine.rs | 18 ++++++++++-- components/raft_log_engine/src/lib.rs | 2 ++ .../raft_log_engine/src/perf_context.rs | 29 +++++++++++++++++++ .../raftstore/src/store/async_io/write.rs | 5 ++-- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 9 ++++-- components/raftstore/src/store/peer.rs | 2 +- components/tracker/src/lib.rs | 9 ++++++ 12 files changed, 88 insertions(+), 12 deletions(-) create mode 100644 components/raft_log_engine/src/perf_context.rs diff --git a/Cargo.lock b/Cargo.lock index 7c9902b7534..dbc37bf0407 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4024,7 +4024,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#4e89901a3eff850a47ea0e6b44bc74d9fed84769" +source = "git+https://github.com/tikv/raft-engine.git#07dcadbf51b43fed70346e33b5db07723e655828" dependencies = [ "byteorder", "crc32fast", @@ -4055,7 +4055,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#4e89901a3eff850a47ea0e6b44bc74d9fed84769" +source = "git+https://github.com/tikv/raft-engine.git#07dcadbf51b43fed70346e33b5db07723e655828" dependencies = [ "clap 3.1.6", "env_logger", @@ -4092,6 +4092,7 @@ dependencies = [ "slog-global", "tikv_util", "time", + "tracker", ] [[package]] diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index c1c299def66..c6eb187b392 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ 
b/components/engine_rocks/src/perf_context_impl.rs @@ -216,9 +216,25 @@ impl PerfContextStatistics { match self.kind { PerfContextKind::RaftstoreApply => { report_write_perf_context!(self, APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.apply_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.apply_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.apply_write_wal_nanos = self.write.write_wal_time; + t.metrics.apply_write_memtable_nanos = self.write.write_memtable_time; + }); + } } PerfContextKind::RaftstoreStore => { report_write_perf_context!(self, STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.store_mutex_lock_nanos = self.write.db_mutex_lock_nanos; + t.metrics.store_thread_wait_nanos = self.write.write_thread_wait; + t.metrics.store_write_wal_nanos = self.write.write_wal_time; + t.metrics.store_write_memtable_nanos = self.write.write_memtable_time; + }); + } } PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { let perf_context = ReadPerfContext::capture(); diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 03cb2a41a41..e119184c556 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -65,7 +65,7 @@ pub struct RaftLogGCTask { pub to: u64, } -pub trait RaftEngine: RaftEngineReadOnly + Clone + Sync + Send + 'static { +pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; fn log_batch(&self, capacity: usize) -> Self::LogBatch; diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 5df8d5f3852..d13e9ea4a0b 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -21,3 +21,4 @@ slog = { version = "2.3", features = 
["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { path = "../tikv_util", default-features = false } time = "0.1" +tracker = { path = "../tracker" } diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index ae895f1ac36..8c9a7fd2b88 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -9,8 +9,8 @@ use std::{ use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ - CacheStats, EncryptionKeyManager, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, - RaftLogBatch as RaftLogBatchTrait, RaftLogGCTask, Result, + CacheStats, EncryptionKeyManager, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, + RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, RaftLogGCTask, Result, }; use file_system::{IOOp, IORateLimiter, IOType}; use kvproto::{ @@ -25,6 +25,8 @@ use raft_engine::{ pub use raft_engine::{Config as RaftEngineConfig, ReadableSize, RecoveryMode}; use tikv_util::Either; +use crate::perf_context::RaftEnginePerfContext; + // A special region ID representing global state. 
const STORE_REGION_ID: u64 = 0; @@ -222,6 +224,10 @@ impl FileSystem for ManagedFileSystem { }) } } + + fn delete>(&self, path: P) -> IoResult<()> { + self.base_level_file_system.delete(path) + } } #[derive(Clone)] @@ -261,6 +267,14 @@ impl RaftLogEngine { } } +impl PerfContextExt for RaftLogEngine { + type PerfContext = RaftEnginePerfContext; + + fn get_perf_context(&self, _level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { + RaftEnginePerfContext + } +} + #[derive(Default)] pub struct RaftLogBatch(LogBatch); diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 8b83acfe6be..7b8757d6531 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -21,4 +21,6 @@ extern crate tikv_util; mod engine; +mod perf_context; + pub use engine::{RaftEngineConfig, RaftLogBatch, RaftLogEngine, ReadableSize, RecoveryMode}; diff --git a/components/raft_log_engine/src/perf_context.rs b/components/raft_log_engine/src/perf_context.rs new file mode 100644 index 00000000000..87946e2f48e --- /dev/null +++ b/components/raft_log_engine/src/perf_context.rs @@ -0,0 +1,29 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use raft_engine::get_perf_context; +use tracker::{TrackerToken, GLOBAL_TRACKERS}; + +#[derive(Debug)] +pub struct RaftEnginePerfContext; + +impl engine_traits::PerfContext for RaftEnginePerfContext { + fn start_observe(&mut self) { + raft_engine::set_perf_context(Default::default()); + } + + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + let perf_context = get_perf_context(); + for token in trackers { + GLOBAL_TRACKERS.with_tracker(*token, |t| { + t.metrics.store_thread_wait_nanos = + perf_context.write_wait_duration.as_nanos() as u64; + t.metrics.store_write_wal_nanos = (perf_context.log_write_duration + + perf_context.log_sync_duration + + perf_context.log_rotate_duration) + .as_nanos() as u64; + t.metrics.store_write_memtable_nanos = + perf_context.apply_duration.as_nanos() as u64; + }); + } + } +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 99c4f56b7e4..c788f7c2d1e 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -353,7 +353,7 @@ where raft_write_size_limit: usize, metrics: StoreWriteMetrics, message_metrics: RaftSendMessageMetrics, - perf_context: EK::PerfContext, + perf_context: ER::PerfContext, pending_latency_inspect: Vec<(Instant, Vec)>, } @@ -378,7 +378,7 @@ where engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), ); let perf_context = engines - .kv + .raft .get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { @@ -541,7 +541,6 @@ where .iter() .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) .collect(); - // TODO: Add a different perf context for raft engine. 
self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_RAFTDB_DURATION_HISTOGRAM.observe(write_raft_time); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index ba819bda155..bed2b02a78f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3294,7 +3294,7 @@ where let is_initialized = self.fsm.peer.is_initialized(); if let Err(e) = self.fsm.peer.destroy( &self.ctx.engines, - &mut self.ctx.perf_context, + &mut self.ctx.raft_perf_context, merged_by_target, &self.ctx.pending_create_peers, ) { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 54f4f45f9ab..63b0a583030 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -481,7 +481,8 @@ where pub ready_count: usize, pub has_ready: bool, pub current_time: Option, - pub perf_context: EK::PerfContext, + pub raft_perf_context: ER::PerfContext, + pub kv_perf_context: EK::PerfContext, pub tick_batch: Vec, pub node_start_time: Option, /// Disk usage for the store itself. 
@@ -1280,7 +1281,11 @@ where ready_count: 0, has_ready: false, current_time: None, - perf_context: self + raft_perf_context: self + .engines + .raft + .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), + kv_perf_context: self .engines .kv .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 2853fcd4169..cf54d962075 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1221,7 +1221,7 @@ where pub fn destroy( &mut self, engines: &Engines, - perf_context: &mut EK::PerfContext, + perf_context: &mut ER::PerfContext, keep_data: bool, pending_create_peers: &Mutex>, ) -> Result<()> { diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 25a5610d034..3729fb1ec9d 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -106,6 +106,15 @@ pub struct RequestMetrics { pub wf_commit_log_nanos: u64, pub propose_send_wait_nanos: u64, pub commit_not_persisted: bool, + pub store_mutex_lock_nanos: u64, + pub store_thread_wait_nanos: u64, + pub store_write_wal_nanos: u64, + pub store_write_memtable_nanos: u64, pub store_time_nanos: u64, + pub apply_wait_nanos: u64, pub apply_time_nanos: u64, + pub apply_mutex_lock_nanos: u64, + pub apply_thread_wait_nanos: u64, + pub apply_write_wal_nanos: u64, + pub apply_write_memtable_nanos: u64, } From d356be1d051f38b9cd9c9239468073c05ccc6c03 Mon Sep 17 00:00:00 2001 From: zkkxu <76540804+zkkxu@users.noreply.github.com> Date: Wed, 29 Jun 2022 14:48:39 +0800 Subject: [PATCH 051/676] gRPC: use gzip level-2 compression by default (#12791) ref tikv/tikv#12929 add initial arguments for gzip compression: gzip_compression_level: represent gzip compression level, the origin gzip compression level is 6 and hardcoding; compression_lower_bound: this represent gzip will compress the data only larger than this 
Signed-off-by: zkkxu Signed-off-by: xufei --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- src/server/config.rs | 8 ++++++++ src/server/raft_client.rs | 2 ++ src/server/snap.rs | 4 +++- tests/integrations/config/mod.rs | 2 ++ 6 files changed, 20 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbc37bf0407..893b5d909f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2186,9 +2186,9 @@ dependencies = [ [[package]] name = "grpcio" -version = "0.10.2" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ef249d9cb1b1843767501ae7463b500542e7f9e72d9c2d61ed320fbefa6c79" +checksum = "f9bcdd3694fa08158334501af37bdf5b4f00b1865b602d917e3cd74ecf80cd0a" dependencies = [ "futures-executor", "futures-util", @@ -2223,9 +2223,9 @@ dependencies = [ [[package]] name = "grpcio-sys" -version = "0.10.1+1.44.0" +version = "0.10.3+1.44.0-patched" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925586932dbbea927e913783da0be160ee74e0b0519d7b20cec35547a0a84631" +checksum = "f23adc509a3c4dea990e0ab8d2add4a65389ee69c288b7851d75dd1df7a6d6c6" dependencies = [ "bindgen 0.59.2", "cc", diff --git a/Cargo.toml b/Cargo.toml index da68c7aa75c..622547b2294 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,7 +95,7 @@ futures-executor = "0.3.1" futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } getset = "0.1" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { version = "0.10.3", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" http = "0" diff --git a/src/server/config.rs b/src/server/config.rs index e88ee55b8c9..8a581d5eeba 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -26,6 +26,8 @@ const 
DEFAULT_GRPC_CONCURRENT_STREAM: i32 = 1024; const DEFAULT_GRPC_RAFT_CONN_NUM: usize = 1; const DEFAULT_GRPC_MEMORY_POOL_QUOTA: u64 = isize::MAX as u64; const DEFAULT_GRPC_STREAM_INITIAL_WINDOW_SIZE: u64 = 2 * 1024 * 1024; +const DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL: usize = 2; +const DEFAULT_GRPC_MIN_MESSAGE_SIZE_TO_COMPRESS: usize = 4096; // Number of rows in each chunk. const DEFAULT_ENDPOINT_BATCH_ROW_LIMIT: usize = 64; @@ -98,6 +100,10 @@ pub struct Config { #[online_config(skip)] pub grpc_compression_type: GrpcCompressionType, #[online_config(skip)] + pub grpc_gzip_compression_level: usize, + #[online_config(skip)] + pub grpc_min_message_size_to_compress: usize, + #[online_config(skip)] pub grpc_concurrency: usize, #[online_config(skip)] pub grpc_concurrent_stream: i32, @@ -213,6 +219,8 @@ impl Default for Config { raft_client_queue_size: 8192, raft_msg_max_batch_size: 128, grpc_compression_type: GrpcCompressionType::None, + grpc_gzip_compression_level: DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL, + grpc_min_message_size_to_compress: DEFAULT_GRPC_MIN_MESSAGE_SIZE_TO_COMPRESS, grpc_concurrency: DEFAULT_GRPC_CONCURRENCY, grpc_concurrent_stream: DEFAULT_GRPC_CONCURRENT_STREAM, grpc_raft_conn_num: DEFAULT_GRPC_RAFT_CONN_NUM, diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 214c5cb6b66..4b2815f5d73 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -692,6 +692,8 @@ where .keepalive_time(cfg.grpc_keepalive_time.0) .keepalive_timeout(cfg.grpc_keepalive_timeout.0) .default_compression_algorithm(cfg.grpc_compression_algorithm()) + .default_gzip_compression_level(cfg.grpc_gzip_compression_level) + .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress) // hack: so it's different args, grpc will always create a new connection. 
.raw_cfg_int( CString::new("random id").unwrap(), diff --git a/src/server/snap.rs b/src/server/snap.rs index 9b86b4778b4..15304c51cdd 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -166,7 +166,9 @@ pub fn send_snap( .stream_initial_window_size(cfg.grpc_stream_initial_window_size.0 as i32) .keepalive_time(cfg.grpc_keepalive_time.0) .keepalive_timeout(cfg.grpc_keepalive_timeout.0) - .default_compression_algorithm(cfg.grpc_compression_algorithm()); + .default_compression_algorithm(cfg.grpc_compression_algorithm()) + .default_gzip_compression_level(cfg.grpc_gzip_compression_level) + .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress); let channel = security_mgr.connect(cb, addr); let client = TikvClient::new(channel); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 589b0ff7a56..2428d265391 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -80,6 +80,8 @@ fn test_serde_custom_tikv_config() { labels: HashMap::from_iter([("a".to_owned(), "b".to_owned())]), advertise_addr: "example.com:443".to_owned(), status_addr: "example.com:443".to_owned(), + grpc_gzip_compression_level: 2, + grpc_min_message_size_to_compress: 4096, advertise_status_addr: "example.com:443".to_owned(), status_thread_pool_size: 1, max_grpc_send_msg_len: 6 * (1 << 20), From ed1c6a0affacfbfac7124c54e3a7b6931566e0e4 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 30 Jun 2022 10:44:38 +0800 Subject: [PATCH 052/676] tests: natively support nextest (#12799) close tikv/tikv#12769 Signed-off-by: tabokie Signed-off-by: Xinye Tao --- .config/nextest.toml | 7 +++++++ CONTRIBUTING.md | 6 ++++++ Makefile | 8 ++++++++ scripts/test | 10 ++++++---- scripts/test-all | 12 ++++++------ tests/failpoints/cases/test_split_region.rs | 2 ++ tests/failpoints/cases/test_unsafe_recovery.rs | 3 ++- 7 files changed, 37 insertions(+), 11 deletions(-) create mode 100644 .config/nextest.toml diff --git 
a/.config/nextest.toml b/.config/nextest.toml new file mode 100644 index 00000000000..247389fcd17 --- /dev/null +++ b/.config/nextest.toml @@ -0,0 +1,7 @@ +[profile.ci] +retries = 1 # Run at most 2 times +fail-fast = false +slow-timeout = { period = "60s", terminate-after = 2 } # Timeout=120s + +[profile.ci.junit] +path = "junit.xml" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index faccf2818c1..711b2bdb192 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -77,6 +77,12 @@ make test env EXTRA_CARGO_ARGS=$TESTNAME make test ``` +Alternatively, you can use [nextest](https://github.com/nextest-rs/nextest) to run tests: + +```bash +env EXTRA_CARGO_ARGS=$TESTNAME make test_with_nextest +``` + TiKV follows the Rust community coding style. We use Rustfmt and [Clippy](https://github.com/Manishearth/rust-clippy) to automatically format and lint our code. Using these tools is checked in our CI. These are as part of `make dev`, you can also run them alone: ```bash diff --git a/Makefile b/Makefile index a41055f7430..22c575abb8f 100644 --- a/Makefile +++ b/Makefile @@ -311,6 +311,14 @@ run: # Run tests under a variety of conditions. This should pass before # submitting pull requests. test: + ./scripts/test-all -- --nocapture + +# Run tests with nextest. 
+ifndef CUSTOM_TEST_COMMAND +test_with_nextest: export CUSTOM_TEST_COMMAND=nextest run +endif +test_with_nextest: export RUSTDOCFLAGS="-Z unstable-options --persist-doctests" +test_with_nextest: ./scripts/test-all ## Static analysis diff --git a/scripts/test b/scripts/test index 547cd20d25d..e4c46c6a620 100755 --- a/scripts/test +++ b/scripts/test @@ -8,16 +8,17 @@ set -euo pipefail # Run from the Makefile environment MAKEFILE_RUN=${MAKEFILE_RUN:-""} if [[ -z $MAKEFILE_RUN ]] ; then - COMMAND="$0 $*" exec make run + COMMAND="$0 $*" exec make run fi SHELL_DEBUG=${SHELL_DEBUG:-""} if [[ -n "$SHELL_DEBUG" ]] ; then - set -x + set -x fi DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH:-""} LOCAL_DIR=${LOCAL_DIR:-""} TIKV_ENABLE_FEATURES=${TIKV_ENABLE_FEATURES:-""} +CUSTOM_TEST_COMMAND=${CUSTOM_TEST_COMMAND:-"test"} # EXTRA_CARGO_ARGS is unecessary now: this can just be given as arguments to ./scripts/test-all or ./scripts/test EXTRA_CARGO_ARGS=${EXTRA_CARGO_ARGS:-""} @@ -27,6 +28,7 @@ export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:${LOCAL_DIR}/lib" export LOG_LEVEL=DEBUG export RUST_BACKTRACE=full -cargo test --workspace \ - --exclude fuzzer-honggfuzz --exclude fuzzer-afl --exclude fuzzer-libfuzzer \ +cargo $CUSTOM_TEST_COMMAND --workspace \ + --exclude fuzz --exclude fuzzer-afl --exclude fuzzer-honggfuzz \ + --exclude fuzzer-libfuzzer --exclude fuzz-targets \ --features "${TIKV_ENABLE_FEATURES}" ${EXTRA_CARGO_ARGS} "$@" diff --git a/scripts/test-all b/scripts/test-all index 246a8f22176..2d37ccde992 100755 --- a/scripts/test-all +++ b/scripts/test-all @@ -9,17 +9,17 @@ set -euo pipefail # Run from the Makefile environment MAKEFILE_RUN=${MAKEFILE_RUN:-""} if [[ -z $MAKEFILE_RUN ]] ; then - COMMAND="$0 $*" exec make run + COMMAND="$0 $*" exec make run fi -./scripts/test "$@" -- --nocapture +./scripts/test "$@" && echo # Re-run tests that requires specific environment variables. 
if [[ "$(uname)" == "Linux" ]]; then export MALLOC_CONF=prof:true - ./scripts/test ifdef_malloc_conf "$@" -- --nocapture + ./scripts/test ifdef_malloc_conf "$@" && echo fi if [[ "$(uname)" = "Linux" ]]; then - EXTRA_CARGO_ARGS="" ./scripts/test --message-format=json-render-diagnostics -q --no-run -- --nocapture | - python scripts/check-bins.py --features "${TIKV_ENABLE_FEATURES}" --check-tests -fi \ No newline at end of file + CUSTOM_TEST_COMMAND="" EXTRA_CARGO_ARGS="" ./scripts/test --message-format=json-render-diagnostics -q --no-run | + python scripts/check-bins.py --features "${TIKV_ENABLE_FEATURES}" --check-tests +fi diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 8b42959fc01..09eb603ff8e 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -983,6 +983,8 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { fail::remove("txn_before_process_write"); let resp = resp.join().unwrap(); assert!(resp.get_region_error().has_epoch_not_match(), "{:?}", resp); + + fail::remove("on_split_invalidate_locks"); } /// Logs are gced asynchronously. 
If an uninitialized peer is destroyed before being replaced by diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index f791b40c065..290a3561be9 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -268,9 +268,10 @@ fn test_unsafe_recovery_wait_for_snapshot_apply() { sleep_ms(100); } assert_ne!(store_report, None); + fail::remove("worker_gc_raft_log"); fail::remove("worker_gc_raft_log_finished"); - fail::remove("raft_before_apply_snap_callback"); + fail::remove("region_apply_snap"); } #[test] From 2e1513c83ffa62fc2edc3d6d28c14cb92e82ddbe Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 30 Jun 2022 14:48:39 +0800 Subject: [PATCH 053/676] encryption: fix issue with opening plaintext files (#12272) close tikv/tikv#12162 Signed-off-by: tabokie --- components/encryption/src/crypter.rs | 3 ++ components/encryption/src/manager/mod.rs | 53 ++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index c17560d4a38..9c148e62247 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -53,6 +53,7 @@ const CTR_IV_16: usize = 16; pub enum Iv { Gcm([u8; GCM_IV_12]), Ctr([u8; CTR_IV_16]), + Empty, } impl Iv { @@ -91,6 +92,7 @@ impl Iv { match self { Iv::Ctr(iv) => iv, Iv::Gcm(iv) => iv, + Iv::Empty => &[], } } @@ -102,6 +104,7 @@ impl Iv { Ok(()) } Iv::Gcm(_) => Err(box_err!("offset addition is not supported for GCM mode")), + Iv::Empty => Err(box_err!("empty Iv")), } } } diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index bc4b97de7a2..cd9be1b554d 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -12,7 +12,9 @@ use std::{ }; use crossbeam::channel::{self, select, tick}; -use engine_traits::{EncryptionKeyManager, 
FileEncryptionInfo}; +use engine_traits::{ + EncryptionKeyManager, EncryptionMethod as DBEncryptionMethod, FileEncryptionInfo, +}; use fail::fail_point; use file_system::File; use kvproto::encryptionpb::{DataKey, EncryptionMethod, FileDictionary, FileInfo, KeyDictionary}; @@ -597,7 +599,12 @@ impl DataKeyManager { writer, crypter::encryption_method_from_db_encryption_method(file.method), &file.key, - Iv::from_slice(&file.iv)?, + if file.method == DBEncryptionMethod::Plaintext { + debug_assert!(file.iv.is_empty()); + Iv::Empty + } else { + Iv::from_slice(&file.iv)? + }, ) } @@ -622,7 +629,12 @@ impl DataKeyManager { reader, crypter::encryption_method_from_db_encryption_method(file.method), &file.key, - Iv::from_slice(&file.iv)?, + if file.method == DBEncryptionMethod::Plaintext { + debug_assert!(file.iv.is_empty()); + Iv::Empty + } else { + Iv::from_slice(&file.iv)? + }, ) } @@ -1271,4 +1283,39 @@ mod tests { let result = new_key_manager(&tmp_dir, None, right_key, previous); assert!(result.is_ok()); } + + #[test] + fn test_plaintext_encrypter_writer() { + use std::io::{Read, Write}; + + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let (key_path, _tmp_key_dir) = create_key_file("key"); + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let tmp_dir = tempfile::TempDir::new().unwrap(); + let previous = new_mock_backend() as Box; + let manager = new_key_manager(&tmp_dir, None, master_key_backend, previous).unwrap(); + let path = tmp_dir.path().join("nonencyrpted"); + let content = "I'm exposed.".to_string(); + { + let raw = File::create(&path).unwrap(); + let mut f = manager + .open_file_with_writer(&path, raw, false /*create*/) + .unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + { + let mut buffer = String::new(); + let mut f = File::open(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); + assert_eq!(buffer, content); + } + { + let mut buffer = 
String::new(); + let mut f = manager.open_file_for_read(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); + assert_eq!(buffer, content); + } + } } From 50e0cf4ee720a36ce62a2d80c341cf48533e2977 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 1 Jul 2022 15:30:39 +0800 Subject: [PATCH 054/676] raftstore: support to split the region on half with the given key range (#12944) close tikv/tikv#12943 Support to split the region on half with the given key range. Signed-off-by: JmPotato --- components/keys/src/lib.rs | 6 +- .../src/coprocessor/split_check/half.rs | 73 ++++++++++++ components/raftstore/src/store/fsm/peer.rs | 36 +++++- components/raftstore/src/store/msg.rs | 6 +- components/raftstore/src/store/worker/pd.rs | 2 + .../raftstore/src/store/worker/split_check.rs | 104 ++++++++++++++++-- components/test_raftstore/src/cluster.rs | 2 + 7 files changed, 211 insertions(+), 18 deletions(-) diff --git a/components/keys/src/lib.rs b/components/keys/src/lib.rs index a403b939727..fa855bbe353 100644 --- a/components/keys/src/lib.rs +++ b/components/keys/src/lib.rs @@ -241,11 +241,11 @@ pub fn enc_end_key(region: &Region) -> Vec { } #[inline] -pub fn data_end_key(region_end_key: &[u8]) -> Vec { - if region_end_key.is_empty() { +pub fn data_end_key(key: &[u8]) -> Vec { + if key.is_empty() { DATA_MAX_KEY.to_vec() } else { - data_key(region_end_key) + data_key(key) } } diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index a52b7a59d60..f6d207df875 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -197,6 +197,79 @@ mod tests { must_split_at(&rx, ®ion, vec![split_key.into_encoded()]); } + #[test] + fn test_split_check_with_key_range() { + let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); + let path_str = path.path().to_str().unwrap(); + let db_opts = 
DBOptions::default(); + let cfs_opts = ALL_CFS + .iter() + .map(|cf| { + let cf_opts = ColumnFamilyOptions::new(); + CFOptions::new(cf, cf_opts) + }) + .collect(); + let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + + let mut region = Region::default(); + region.set_id(1); + region.mut_peers().push(Peer::default()); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(5); + + let (tx, rx) = mpsc::sync_channel(100); + let cfg = Config { + region_max_size: Some(ReadableSize(BUCKET_NUMBER_LIMIT as u64)), + ..Default::default() + }; + let mut runnable = + SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); + + for i in 0..11 { + let k = format!("{:04}", i).into_bytes(); + let k = keys::data_key(Key::from_raw(&k).as_encoded()); + engine.put_cf(CF_DEFAULT, &k, &k).unwrap(); + // Flush for every key so that we can know the exact middle key. + engine.flush_cf(CF_DEFAULT, true).unwrap(); + } + let start_key = Key::from_raw(b"0000").into_encoded(); + let end_key = Key::from_raw(b"0005").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let split_key = Key::from_raw(b"0003"); + must_split_at(&rx, ®ion, vec![split_key.into_encoded()]); + let start_key = Key::from_raw(b"0005").into_encoded(); + let end_key = Key::from_raw(b"0010").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let split_key = Key::from_raw(b"0008"); + must_split_at(&rx, ®ion, vec![split_key.into_encoded()]); + let start_key = Key::from_raw(b"0003").into_encoded(); + let end_key = Key::from_raw(b"0008").into_encoded(); + runnable.run(SplitCheckTask::split_check_key_range( + region.clone(), + Some(start_key), + Some(end_key), + false, + CheckPolicy::Scan, + None, + )); + let 
split_key = Key::from_raw(b"0006"); + must_split_at(&rx, ®ion, vec![split_key.into_encoded()]); + } + fn test_generate_region_bucket_impl(mvcc: bool) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index bed2b02a78f..02c8d4fe650 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -960,11 +960,20 @@ where } CasualMessage::HalfSplitRegion { region_epoch, + start_key, + end_key, policy, source, cb, } => { - self.on_schedule_half_split_region(®ion_epoch, policy, source, cb); + self.on_schedule_half_split_region( + ®ion_epoch, + start_key, + end_key, + policy, + source, + cb, + ); } CasualMessage::GcSnap { snaps } => { self.on_gc_snap(snaps); @@ -5489,14 +5498,18 @@ where fn on_schedule_half_split_region( &mut self, region_epoch: &metapb::RegionEpoch, + start_key: Option>, + end_key: Option>, policy: CheckPolicy, source: &str, _cb: Callback, ) { + let is_key_range = start_key.is_some() && end_key.is_some(); info!( "on half split"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, "policy" => ?policy, "source" => source, ); @@ -5506,6 +5519,7 @@ where "not leader, skip"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, ); return; } @@ -5516,11 +5530,18 @@ where "receive a stale halfsplit message"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, ); return; } - let split_check_bucket_ranges = self.gen_bucket_range_for_update(); + // Do not check the bucket ranges if we want to split the region with a given key range, + // this is to avoid compatibility issues. 
+ let split_check_bucket_ranges = if !is_key_range { + self.gen_bucket_range_for_update() + } else { + None + }; #[cfg(any(test, feature = "testexport"))] { if let Callback::Test { cb } = _cb { @@ -5531,13 +5552,20 @@ where cb(peer_stat); } } - let task = - SplitCheckTask::split_check(region.clone(), false, policy, split_check_bucket_ranges); + let task = SplitCheckTask::split_check_key_range( + region.clone(), + start_key, + end_key, + false, + policy, + split_check_bucket_ranges, + ); if let Err(e) = self.ctx.split_check_scheduler.schedule(task) { error!( "failed to schedule split check"; "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), + "is_key_range" => is_key_range, "err" => %e, ); } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 46900878178..4f1ea017764 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -401,9 +401,13 @@ pub enum CasualMessage { CompactionDeclinedBytes { bytes: u64, }, - /// Half split the target region. + /// Half split the target region with the given key range. + /// If the key range is not provided, the region's start key + /// and end key will be used by default. 
HalfSplitRegion { region_epoch: RegionEpoch, + start_key: Option>, + end_key: Option>, policy: CheckPolicy, source: &'static str, cb: Callback, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 648e8e9344e..a16ec50a7a0 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1449,6 +1449,8 @@ where } else { CasualMessage::HalfSplitRegion { region_epoch: epoch, + start_key: None, + end_key: None, policy: split_region.get_policy(), source: "pd", cb: Callback::None, diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 922f927ddb3..3822575fb8e 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -16,6 +16,7 @@ use kvproto::{ }; use online_config::{ConfigChange, OnlineConfig}; use tikv_util::{box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable}; +use txn_types::Key; use super::metrics::*; #[cfg(any(test, feature = "testexport"))] @@ -145,6 +146,8 @@ pub struct Bucket { pub enum Task { SplitCheckTask { region: Region, + start_key: Option>, + end_key: Option>, auto_split: bool, policy: CheckPolicy, bucket_ranges: Option>, @@ -164,6 +167,26 @@ impl Task { ) -> Task { Task::SplitCheckTask { region, + start_key: None, + end_key: None, + auto_split, + policy, + bucket_ranges, + } + } + + pub fn split_check_key_range( + region: Region, + start_key: Option>, + end_key: Option>, + auto_split: bool, + policy: CheckPolicy, + bucket_ranges: Option>, + ) -> Task { + Task::SplitCheckTask { + region, + start_key, + end_key, auto_split, policy, bucket_ranges, @@ -175,11 +198,17 @@ impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Task::SplitCheckTask { - region, auto_split, .. + region, + start_key, + end_key, + auto_split, + .. 
} => write!( f, - "[split check worker] Split Check Task for {}, auto_split: {:?}", + "[split check worker] Split Check Task for {}, start_key: {:?}, end_key: {:?}, auto_split: {:?}", region.get_id(), + start_key, + end_key, auto_split ), Task::ChangeConfig(_) => write!(f, "[split check worker] Change Config Task"), @@ -314,16 +343,29 @@ where fn check_split_and_bucket( &mut self, region: &Region, + start_key: Option>, + end_key: Option>, auto_split: bool, policy: CheckPolicy, bucket_ranges: Option>, ) { let region_id = region.get_id(); - let start_key = keys::enc_start_key(region); - let end_key = keys::enc_end_key(region); + let is_key_range = start_key.is_some() && end_key.is_some(); + let start_key = if is_key_range { + // This key is usually from a request, which should be encoded first. + keys::data_key(Key::from_raw(&start_key.unwrap()).as_encoded().as_slice()) + } else { + keys::enc_start_key(region) + }; + let end_key = if is_key_range { + keys::data_end_key(Key::from_raw(&end_key.unwrap()).as_encoded().as_slice()) + } else { + keys::enc_end_key(region) + }; debug!( "executing task"; "region_id" => region_id, + "is_key_range" => is_key_range, "start_key" => log_wrappers::Value::key(&start_key), "end_key" => log_wrappers::Value::key(&end_key), "policy" => ?policy, @@ -334,16 +376,33 @@ where .new_split_checker_host(region, &self.engine, auto_split, policy); if host.skip() { - debug!("skip split check"; "region_id" => region.get_id()); + debug!("skip split check"; + "region_id" => region.get_id(), + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } let split_keys = match host.policy() { CheckPolicy::Scan => { - match self.scan_split_keys(&mut host, region, &start_key, &end_key, bucket_ranges) { + match self.scan_split_keys( + &mut host, + region, + is_key_range, + &start_key, + &end_key, + bucket_ranges, + ) { Ok(keys) => keys, Err(e) => { - error!(%e; 
"failed to scan split key"; "region_id" => region_id,); + error!(%e; "failed to scan split key"; + "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } } @@ -357,6 +416,9 @@ where error!(%e; "approximate_check_bucket failed"; "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); } } @@ -368,17 +430,26 @@ where error!(%e; "failed to get approximate split key, try scan way"; "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); match self.scan_split_keys( &mut host, region, + is_key_range, &start_key, &end_key, bucket_ranges, ) { Ok(keys) => keys, Err(e) => { - error!(%e; "failed to scan split key"; "region_id" => region_id,); + error!(%e; "failed to scan split key"; + "region_id" => region_id, + "is_key_range" => is_key_range, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + ); return; } } @@ -408,12 +479,13 @@ where /// Gets the split keys by scanning the range. /// bucket_ranges: specify the ranges to generate buckets. - /// If none, gengerate buckets for the whole region. + /// If none, generate buckets for the whole region. /// If it's Some(vec![]), skip generating buckets. fn scan_split_keys( &self, host: &mut SplitCheckerHost<'_, E>, region: &Region, + is_key_range: bool, start_key: &[u8], end_key: &[u8], bucket_ranges: Option>, @@ -509,6 +581,9 @@ where } // if we scan the whole range, we can update approximate size and keys with accurate value. 
+ if is_key_range { + return; + } info!( "update approximate size and keys with accurate value"; "region_id" => region.get_id(), @@ -565,10 +640,19 @@ where match task { Task::SplitCheckTask { region, + start_key, + end_key, auto_split, policy, bucket_ranges, - } => self.check_split_and_bucket(®ion, auto_split, policy, bucket_ranges), + } => self.check_split_and_bucket( + ®ion, + start_key, + end_key, + auto_split, + policy, + bucket_ranges, + ), Task::ChangeConfig(c) => self.change_cfg(c), Task::ApproximateBuckets(region) => { if self.coprocessor.cfg.enable_region_bucket { diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 63c7e3023c3..046d2396382 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1761,6 +1761,8 @@ impl Cluster { region.get_id(), CasualMessage::HalfSplitRegion { region_epoch: region.get_region_epoch().clone(), + start_key: None, + end_key: None, policy: CheckPolicy::Scan, source: "test", cb, From 05fd6298d1ee07e8dc66c0b76477da9d17cc22d3 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 4 Jul 2022 15:25:01 +0800 Subject: [PATCH 055/676] metrics, pd_worker: add time duration metrics for the Load Base Split (#12941) close tikv/tikv#12937 Add time duration metrics for the Load Base Split. Signed-off-by: JmPotato --- components/raftstore/src/store/metrics.rs | 5 + components/raftstore/src/store/worker/pd.rs | 2 + metrics/grafana/tikv_details.json | 126 ++++++++++++++++++++ 3 files changed, 133 insertions(+) diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index e3d3a23e389..c60152784a5 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -662,6 +662,11 @@ lazy_static! 
{ linear_buckets(0.0, 0.05, 20).unwrap() ).unwrap(); + pub static ref LOAD_BASE_SPLIT_DURATION_HISTOGRAM : Histogram = register_histogram!( + "tikv_load_base_split_duration_seconds", + "Histogram of the time load base split costs in seconds" + ).unwrap(); + pub static ref QUERY_REGION_VEC: HistogramVec = register_histogram_vec!( "tikv_query_region", "Histogram of query", diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index a16ec50a7a0..5e4cf6e8399 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -598,6 +598,7 @@ where receiver: &Receiver, scheduler: &Scheduler>, ) { + let start_time = TiInstant::now(); auto_split_controller.refresh_cfg(); let mut others = vec![]; while let Ok(other) = receiver.try_recv() { @@ -621,6 +622,7 @@ where READ_QPS_TOPN.with_label_values(&[&i.to_string()]).set(0.0); } } + LOAD_BASE_SPLIT_DURATION_HISTOGRAM.observe(start_time.saturating_elapsed_secs()); } pub fn report_min_resolved_ts( diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index b8204654185..686c3a39a97 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -15476,6 +15476,132 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "tidb-cluster", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 118 + }, + "hiddenSeries": false, + "id": 23763572060, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true 
+ }, + "percentage": false, + "pluginVersion": "7.5.7", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.80, sum(rate(tikv_load_base_split_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (le, instance))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "80%-{{instance}}", + "refId": "A", + "step": 4 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(tikv_load_base_split_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (le, instance))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99%-{{instance}}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_load_base_split_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\"}[1m])) by (instance)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "avg-{{instance}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Load base split duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:270", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:271", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, From 
16589c02a8c603423f33be8178b983f7e9577c04 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 5 Jul 2022 12:33:02 +0800 Subject: [PATCH 056/676] pd_worker, split_controller: introduce the new config and CPU collector registration mechanism (#12942) ref tikv/tikv#12063, ref tikv/tikv#12593 Introduce the new split config and CPU collector registration mechanism. Signed-off-by: JmPotato --- components/raftstore/src/store/worker/mod.rs | 2 +- components/raftstore/src/store/worker/pd.rs | 65 +++++++++++-- .../src/store/worker/split_config.rs | 32 ++++++ .../src/store/worker/split_controller.rs | 97 ++++++++++++++++++- 4 files changed, 182 insertions(+), 14 deletions(-) diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index a2ac27eed38..583e9341f0d 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -42,5 +42,5 @@ pub use self::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, split_config::{SplitConfig, SplitConfigManager}, - split_controller::{AutoSplitController, ReadStats, WriteStats}, + split_controller::{AutoSplitController, ReadStats, SplitConfigChange, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 5e4cf6e8399..280c15b083f 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -58,7 +58,7 @@ use crate::store::{ worker::{ query_stats::QueryStats, split_controller::{SplitInfo, TOP_N}, - AutoSplitController, ReadStats, WriteStats, + AutoSplitController, ReadStats, SplitConfigChange, WriteStats, }, Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, @@ -183,6 +183,7 @@ where id: u64, duration: RaftstoreDuration, }, + UpdateRegionCPUCollector(bool), RegionCPURecords(Arc), 
ReportMinResolvedTS { store_id: u64, @@ -349,7 +350,7 @@ where log_wrappers::Value::key(split_key), ), Task::AutoSplit { ref split_infos } => { - write!(f, "auto split split regions, num is {}", split_infos.len(),) + write!(f, "auto split split regions, num is {}", split_infos.len()) } Task::AskBatchSplit { ref region, @@ -405,6 +406,12 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } + Task::UpdateRegionCPUCollector(is_register) => { + if is_register { + return write!(f, "register region cpu collector"); + } + write!(f, "deregister region cpu collector") + } Task::RegionCPURecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } @@ -599,7 +606,18 @@ where scheduler: &Scheduler>, ) { let start_time = TiInstant::now(); - auto_split_controller.refresh_cfg(); + match auto_split_controller.refresh_and_check_cfg() { + SplitConfigChange::UpdateRegionCPUCollector(is_register) => { + if let Err(e) = scheduler.schedule(Task::UpdateRegionCPUCollector(is_register)) { + error!( + "failed to register or deregister the region cpu collector"; + "is_register" => is_register, + "err" => ?e, + ); + } + } + SplitConfigChange::Noop => {} + } let mut others = vec![]; while let Ok(other) = receiver.try_recv() { others.push(other); @@ -842,7 +860,8 @@ where scheduler: Scheduler>, stats_monitor: StatsMonitor, - _region_cpu_records_collector: CollectorGuard, + collector_reg_handle: CollectorRegHandle, + region_cpu_records_collector: Option, // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, @@ -879,6 +898,18 @@ where region_read_progress: RegionReadProgressRegistry, health_service: Option, ) -> Runner { + // Register the region CPU records collector. 
+ let mut region_cpu_records_collector = None; + if auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio + > 0.0 + { + region_cpu_records_collector = Some(collector_reg_handle.register( + Box::new(RegionCPUMeteringCollector::new(scheduler.clone())), + false, + )); + } let interval = store_heartbeat_interval / Self::INTERVAL_DIVISOR; let mut stats_monitor = StatsMonitor::new( interval, @@ -889,11 +920,6 @@ where error!("failed to start stats collector, error = {:?}", e); } - let _region_cpu_records_collector = collector_reg_handle.register( - Box::new(RegionCPUMeteringCollector::new(scheduler.clone())), - true, - ); - Runner { store_id, pd_client, @@ -905,7 +931,8 @@ where start_ts: UnixSecs::now(), scheduler, stats_monitor, - _region_cpu_records_collector, + collector_reg_handle, + region_cpu_records_collector, region_cpu_records: HashMap::default(), concurrency_manager, snap_mgr, @@ -968,6 +995,21 @@ where self.remote.spawn(f); } + fn handle_update_region_cpu_collector(&mut self, is_register: bool) { + // If it's a deregister task, just take and drop the original collector. + if !is_register { + self.region_cpu_records_collector.take(); + return; + } + if self.region_cpu_records_collector.is_some() { + return; + } + self.region_cpu_records_collector = Some(self.collector_reg_handle.register( + Box::new(RegionCPUMeteringCollector::new(self.scheduler.clone())), + false, + )); + } + // Note: The parameter doesn't contain `self` because this function may // be called in an asynchronous context. 
fn handle_ask_batch_split( @@ -1928,6 +1970,9 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), + Task::UpdateRegionCPUCollector(is_register) => { + self.handle_update_region_cpu_collector(is_register) + } Task::RegionCPURecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTS { store_id, diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 4d2634514be..58df082c3e6 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -19,6 +19,19 @@ const DEFAULT_SPLIT_BALANCE_SCORE: f64 = 0.25; // We get contained score by sample.contained/(sample.right+sample.left+sample.contained). It will be used to avoid to split regions requested by range. const DEFAULT_SPLIT_CONTAINED_SCORE: f64 = 0.5; +// If the `split_balance_score` and `split_contained_score` above could not be satisfied, we will try to split the region according to its CPU load, +// then these parameters below will start to work. +// When the gRPC poll thread CPU usage is higher than gRPC poll thread count * `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, +// the CPU-based split won't be triggered no matter if the `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO` and `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` are exceeded +// to prevent from increasing the gRPC poll CPU usage. +const DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.5; +// When the Unified Read Poll thread CPU usage is higher than Unified Read Poll thread count * `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, +// the CPU-based split will try to check and record the top hot CPU region. 
+const DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.8; +// When the Unified Read Poll is hot and the region's CPU usage reaches `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read Poll, +// it will be added into the hot region list and may be split later as the top hot CPU region. +pub(crate) const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; + lazy_static! { static ref SPLIT_CONFIG: Mutex>>> = Mutex::new(None); } @@ -43,6 +56,11 @@ pub struct SplitConfig { pub sample_num: usize, pub sample_threshold: u64, pub byte_threshold: usize, + #[doc(hidden)] + pub grpc_thread_cpu_overload_threshold_ratio: f64, + #[doc(hidden)] + pub unified_read_pool_thread_cpu_overload_threshold_ratio: f64, + pub region_cpu_overload_threshold_ratio: f64, // deprecated. #[online_config(skip)] #[doc(hidden)] @@ -65,6 +83,11 @@ impl Default for SplitConfig { sample_num: DEFAULT_SAMPLE_NUM, sample_threshold: DEFAULT_SAMPLE_THRESHOLD, byte_threshold: DEFAULT_BYTE_THRESHOLD, + grpc_thread_cpu_overload_threshold_ratio: + DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, + unified_read_pool_thread_cpu_overload_threshold_ratio: + DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO, + region_cpu_overload_threshold_ratio: REGION_CPU_OVERLOAD_THRESHOLD_RATIO, size_threshold: None, // deprecated. key_threshold: None, // deprecated. 
} @@ -87,6 +110,15 @@ impl SplitConfig { ("sample_num should be less than qps_threshold for load-base-split.").into(), ); } + if self.grpc_thread_cpu_overload_threshold_ratio > 1.0 + || self.grpc_thread_cpu_overload_threshold_ratio < 0.0 + || self.unified_read_pool_thread_cpu_overload_threshold_ratio > 1.0 + || self.unified_read_pool_thread_cpu_overload_threshold_ratio < 0.0 + || self.region_cpu_overload_threshold_ratio > 1.0 + || self.region_cpu_overload_threshold_ratio < 0.0 + { + return Err(("threshold ratio should be between 0 and 1.").into()); + } Ok(()) } } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index d21c97285d0..b644ac88d85 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -519,10 +519,16 @@ pub struct SplitInfo { pub peer: Peer, } +#[derive(PartialEq, Debug)] +pub enum SplitConfigChange { + Noop, + UpdateRegionCPUCollector(bool), +} + pub struct AutoSplitController { // RegionID -> Recorder pub recorders: HashMap, - cfg: SplitConfig, + pub cfg: SplitConfig, cfg_tracker: Tracker, } @@ -645,19 +651,36 @@ impl AutoSplitController { }); } - pub fn refresh_cfg(&mut self) { + pub fn refresh_and_check_cfg(&mut self) -> SplitConfigChange { + let mut cfg_change = SplitConfigChange::Noop; if let Some(incoming) = self.cfg_tracker.any_new() { + if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 + && incoming.region_cpu_overload_threshold_ratio > 0.0 + { + cfg_change = SplitConfigChange::UpdateRegionCPUCollector(true); + } + if self.cfg.region_cpu_overload_threshold_ratio > 0.0 + && incoming.region_cpu_overload_threshold_ratio <= 0.0 + { + cfg_change = SplitConfigChange::UpdateRegionCPUCollector(false); + } self.cfg = incoming.clone(); } + cfg_change } } #[cfg(test)] mod tests { + use online_config::{ConfigChange, ConfigManager, ConfigValue}; + use tikv_util::config::VersionTrack; use 
txn_types::Key; use super::*; - use crate::store::{util::build_key_range, worker::split_config::DEFAULT_SAMPLE_NUM}; + use crate::store::{ + util::build_key_range, + worker::split_config::{DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO}, + }; enum Position { Left, @@ -1201,6 +1224,74 @@ mod tests { qps_stats } + #[test] + fn test_refresh_and_check_cfg() { + let split_config = SplitConfig::default(); + let mut split_cfg_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); + let mut auto_split_controller = AutoSplitController::new(split_cfg_manager.clone()); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + // Set to zero. + dispatch_split_cfg_change( + &mut split_cfg_manager, + "region_cpu_overload_threshold_ratio", + ConfigValue::F64(0.0), + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::UpdateRegionCPUCollector(false), + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + 0.0 + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + // Set to non-zero. 
+ dispatch_split_cfg_change( + &mut split_cfg_manager, + "region_cpu_overload_threshold_ratio", + ConfigValue::F64(REGION_CPU_OVERLOAD_THRESHOLD_RATIO), + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::UpdateRegionCPUCollector(true), + ); + assert_eq!( + auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!( + auto_split_controller.refresh_and_check_cfg(), + SplitConfigChange::Noop, + ); + } + + fn dispatch_split_cfg_change( + split_cfg_manager: &mut SplitConfigManager, + cfg_name: &str, + cfg_value: ConfigValue, + ) { + let mut config_change = ConfigChange::new(); + config_change.insert(String::from(cfg_name), cfg_value); + split_cfg_manager.dispatch(config_change).unwrap(); + } + #[bench] fn samples_evaluate(b: &mut test::Bencher) { let mut samples = Samples(vec![Sample::new(b"c")]); From ed8257cabceb6eb5eddd4753dccedb076fb6dcb9 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 5 Jul 2022 15:15:01 +0800 Subject: [PATCH 057/676] server: collect count and duration by request source (#12954) ref tikv/tikv#12362 TiKV client can pass request_source through Context. It is useful for us to know how many requests there are from each source. So, this commit collects the count and the total duration by request source. The source label is not added to the command type in order to avoid creating too many label combinations. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- metrics/grafana/tikv_details.json | 216 ++++++++++++++++++++++++++++++ src/server/metrics.rs | 69 +++++++++- src/server/service/batch.rs | 52 +++++-- src/server/service/kv.rs | 92 ++++++++++--- src/storage/mod.rs | 39 +++++- 6 files changed, 427 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 893b5d909f5..1dfb74e3b13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2623,7 +2623,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#0e2f26c0a46ae7d666d6ca4410046a39e0c96f36" +source = "git+https://github.com/pingcap/kvproto.git#acfe326c7cb2bdcdbfc991cada1973a68f34836f" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 686c3a39a97..6ef292f95e5 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -5567,6 +5567,222 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The QPS of different sources of gRPC request", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 23763572858, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_grpc_request_source_counter_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (source)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC request sources QPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:69", + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:70", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The duration of different sources of gRPC request", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 23763572859, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + 
"spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_grpc_request_source_duration_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (source)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC request sources duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:69", + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:70", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 8eda17034e1..caf6e1e86c4 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -1,7 +1,14 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. -use prometheus::{exponential_buckets, *}; +use std::{ + cell::{Cell, RefCell}, + time::Duration, +}; + +use collections::HashMap; +use prometheus::{exponential_buckets, local::LocalIntCounter, *}; use prometheus_static_metric::*; +use tikv_util::time::Instant; pub use crate::storage::kv::metrics::{ GcKeysCF, GcKeysCounterVec, GcKeysCounterVecInner, GcKeysDetail, @@ -240,6 +247,18 @@ lazy_static! 
{ exponential_buckets(0.0001, 2.0, 20).unwrap() ) .unwrap(); + pub static ref GRPC_REQUEST_SOURCE_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_grpc_request_source_counter_vec", + "Counter of different sources of RPC requests", + &["source"] + ) + .unwrap(); + pub static ref GRPC_REQUEST_SOURCE_DURATION_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_grpc_request_source_duration_vec", + "Total duration of different sources of RPC requests (in microseconds)", + &["source"] + ) + .unwrap(); } lazy_static! { @@ -484,3 +503,51 @@ lazy_static! { pub static ref ASYNC_REQUESTS_DURATIONS_VEC: AsyncRequestsDurationVec = auto_flush_from!(ASYNC_REQUESTS_DURATIONS, AsyncRequestsDurationVec); } + +struct LocalRequestSourceMetrics { + pub count: LocalIntCounter, + pub duration_us: LocalIntCounter, +} + +impl LocalRequestSourceMetrics { + fn new(source: &str) -> Self { + LocalRequestSourceMetrics { + count: GRPC_REQUEST_SOURCE_COUNTER_VEC + .with_label_values(&[source]) + .local(), + duration_us: GRPC_REQUEST_SOURCE_DURATION_VEC + .with_label_values(&[source]) + .local(), + } + } +} + +thread_local! 
{ + static REQUEST_SOURCE_METRICS_MAP: RefCell> = RefCell::new(HashMap::default()); + + static LAST_LOCAL_FLUSH_TIME: Cell = Cell::new(Instant::now_coarse()); +} + +pub fn record_request_source_metrics(source: String, duration: Duration) { + let need_flush = LAST_LOCAL_FLUSH_TIME.with(|last_local_flush_time| { + let now = Instant::now_coarse(); + if now - last_local_flush_time.get() > Duration::from_secs(1) { + last_local_flush_time.set(now); + true + } else { + false + } + }); + REQUEST_SOURCE_METRICS_MAP.with(|map| { + let mut map = map.borrow_mut(); + let metrics = map + .entry(source) + .or_insert_with_key(|k| LocalRequestSourceMetrics::new(k)); + metrics.count.inc(); + metrics.duration_us.inc_by(duration.as_micros() as u64); + if need_flush { + metrics.count.flush(); + metrics.duration_us.flush(); + } + }); +} diff --git a/src/server/service/batch.rs b/src/server/service/batch.rs index 1a7fcb59c3a..931017549c1 100644 --- a/src/server/service/batch.rs +++ b/src/server/service/batch.rs @@ -152,7 +152,13 @@ pub struct GetCommandResponseConsumer { } impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponseConsumer { - fn consume(&self, id: u64, res: Result<(Option>, Statistics)>, begin: Instant) { + fn consume( + &self, + id: u64, + res: Result<(Option>, Statistics)>, + begin: Instant, + request_source: String, + ) { let mut resp = GetResponse::default(); if let Some(err) = extract_region_error(&res) { resp.set_region_error(err); @@ -175,7 +181,8 @@ impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponse cmd: Some(batch_commands_response::response::Cmd::Get(resp)), ..Default::default() }; - let mesure = GrpcRequestDuration::new(begin, GrpcTypeKind::kv_batch_get_command); + let mesure = + GrpcRequestDuration::new(begin, GrpcTypeKind::kv_batch_get_command, request_source); let task = MeasuredSingleResponse::new(id, res, mesure); if self.tx.send_and_notify(task).is_err() { error!("KvService response batch commands fail"); @@ -184,7 
+191,13 @@ impl ResponseBatchConsumer<(Option>, Statistics)> for GetCommandResponse } impl ResponseBatchConsumer>> for GetCommandResponseConsumer { - fn consume(&self, id: u64, res: Result>>, begin: Instant) { + fn consume( + &self, + id: u64, + res: Result>>, + begin: Instant, + request_source: String, + ) { let mut resp = RawGetResponse::default(); if let Some(err) = extract_region_error(&res) { resp.set_region_error(err); @@ -199,7 +212,8 @@ impl ResponseBatchConsumer>> for GetCommandResponseConsumer { cmd: Some(batch_commands_response::response::Cmd::RawGet(resp)), ..Default::default() }; - let mesure = GrpcRequestDuration::new(begin, GrpcTypeKind::raw_batch_get_command); + let mesure = + GrpcRequestDuration::new(begin, GrpcTypeKind::raw_batch_get_command, request_source); let task = MeasuredSingleResponse::new(id, res, mesure); if self.tx.send_and_notify(task).is_err() { error!("KvService response batch commands fail"); @@ -218,7 +232,11 @@ fn future_batch_get_command( REQUEST_BATCH_SIZE_HISTOGRAM_VEC .kv_get .observe(gets.len() as f64); - let ids = requests.clone(); + let id_sources: Vec<_> = requests + .iter() + .zip(gets.iter()) + .map(|(id, req)| (*id, req.get_context().get_request_source().to_string())) + .collect(); let res = storage.batch_get_command( gets, requests, @@ -235,13 +253,16 @@ fn future_batch_get_command( if let Some(e) = extract_region_error(&res) { let mut resp = GetResponse::default(); resp.set_region_error(e); - for id in ids { + for (id, source) in id_sources { let res = batch_commands_response::Response { cmd: Some(batch_commands_response::response::Cmd::Get(resp.clone())), ..Default::default() }; - let measure = - GrpcRequestDuration::new(begin_instant, GrpcTypeKind::kv_batch_get_command); + let measure = GrpcRequestDuration::new( + begin_instant, + GrpcTypeKind::kv_batch_get_command, + source, + ); let task = MeasuredSingleResponse::new(id, res, measure); if tx.send_and_notify(task).is_err() { error!("KvService response batch 
commands fail"); @@ -262,7 +283,11 @@ fn future_batch_raw_get_command( REQUEST_BATCH_SIZE_HISTOGRAM_VEC .raw_get .observe(gets.len() as f64); - let ids = requests.clone(); + let id_sources: Vec<_> = requests + .iter() + .zip(gets.iter()) + .map(|(id, req)| (*id, req.get_context().get_request_source().to_string())) + .collect(); let res = storage.raw_batch_get_command( gets, requests, @@ -274,13 +299,16 @@ fn future_batch_raw_get_command( if let Some(e) = extract_region_error(&res) { let mut resp = RawGetResponse::default(); resp.set_region_error(e); - for id in ids { + for (id, source) in id_sources { let res = batch_commands_response::Response { cmd: Some(batch_commands_response::response::Cmd::RawGet(resp.clone())), ..Default::default() }; - let measure = - GrpcRequestDuration::new(begin_instant, GrpcTypeKind::raw_batch_get_command); + let measure = GrpcRequestDuration::new( + begin_instant, + GrpcTypeKind::raw_batch_get_command, + source, + ); let task = MeasuredSingleResponse::new(id, res, measure); if tx.send_and_notify(task).is_err() { error!("KvService response batch commands fail"); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 64ce2abb0e6..5b084826861 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -184,17 +184,20 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor macro_rules! 
handle_request { ($fn_name: ident, $future_name: ident, $req_ty: ident, $resp_ty: ident) => { - fn $fn_name(&mut self, ctx: RpcContext<'_>, req: $req_ty, sink: UnarySink<$resp_ty>) { + fn $fn_name(&mut self, ctx: RpcContext<'_>, mut req: $req_ty, sink: UnarySink<$resp_ty>) { forward_unary!(self.proxy, $fn_name, ctx, req, sink); let begin_instant = Instant::now_coarse(); + let source = req.mut_context().take_request_source(); let resp = $future_name(&self.storage, req); let task = async move { let resp = resp.await?; sink.success(resp).await?; + let elapsed = begin_instant.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .$fn_name - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); ServerResult::Ok(()) } .map_err(|e| { @@ -367,16 +370,19 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ); } - fn coprocessor(&mut self, ctx: RpcContext<'_>, req: Request, sink: UnarySink) { + fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let begin_instant = Instant::now_coarse(); + let source = req.mut_context().take_request_source(); let future = future_copr(&self.copr, Some(ctx.peer()), req); let task = async move { let resp = future.await?.consume(); sink.success(resp).await?; + let elapsed = begin_instant.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .coprocessor - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); ServerResult::Ok(()) } .map_err(|e| { @@ -393,17 +399,20 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor fn raw_coprocessor( &mut self, ctx: RpcContext<'_>, - req: RawCoprocessorRequest, + mut req: RawCoprocessorRequest, sink: UnarySink, ) { let begin_instant = Instant::now_coarse(); + let source = req.mut_context().take_request_source(); let future = 
future_raw_coprocessor(&self.copr_v2, &self.storage, req); let task = async move { let resp = future.await?; sink.success(resp).await?; + let elapsed = begin_instant.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .raw_coprocessor - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); ServerResult::Ok(()) } .map_err(|e| { @@ -593,6 +602,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor assert!(!req.get_start_key().is_empty()); assert!(!req.get_end_key().is_empty()); + let source = req.mut_context().take_request_source(); let (cb, f) = paired_future_callback(); let res = self.gc_worker.unsafe_destroy_range( req.take_context(), @@ -612,9 +622,11 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor resp.set_error(format!("{}", e)); } sink.success(resp).await?; + let elapsed = begin_instant.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .unsafe_destroy_range - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); ServerResult::Ok(()) } .map_err(|e| { @@ -1022,10 +1034,16 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let mut response_retriever = response_retriever.map(move |item| { for measure in item.measures { - let GrpcRequestDuration { label, begin } = measure; + let GrpcRequestDuration { + label, + begin, + source, + } = measure; + let elapsed = begin.saturating_elapsed(); GRPC_MSG_HISTOGRAM_STATIC .get(label) - .observe(begin.saturating_elapsed_secs()); + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); } let mut r = item.batch_resp; @@ -1185,13 +1203,18 @@ fn response_batch_commands_request( tx: Sender, begin: Instant, label: GrpcTypeKind, + source: String, ) where MemoryTraceGuard: From, F: Future> + Send + 'static, { let task = async move { if let Ok(resp) = resp.await { - let measure = GrpcRequestDuration { begin, label }; 
+ let measure = GrpcRequestDuration { + begin, + label, + source, + }; let task = MeasuredSingleResponse::new(id, resp, measure); if let Err(e) = tx.send_and_notify(task) { error!("KvService response batch commands fail"; "err" => ?e); @@ -1228,49 +1251,70 @@ fn handle_batch_commands_request( // For some invalid requests. let begin_instant = Instant::now(); let resp = future::ok(batch_commands_response::Response::default()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, - Some(batch_commands_request::request::Cmd::Get(req)) => { + Some(batch_commands_request::request::Cmd::Get(mut req)) => { if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) }) { batcher.as_mut().unwrap().add_get_request(req, id); } else { let begin_instant = Instant::now(); + let source = req.mut_context().take_request_source(); let resp = future_get(storage, req) .map_ok(oneof!(batch_commands_response::response::Cmd::Get)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.kv_get.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::kv_get, source); } }, - Some(batch_commands_request::request::Cmd::RawGet(req)) => { + Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { batcher.as_mut().unwrap().add_raw_get_request(req, id); } else { let begin_instant = Instant::now(); + let source = req.mut_context().take_request_source(); let resp = future_raw_get(storage, req) .map_ok(oneof!(batch_commands_response::response::Cmd::RawGet)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.raw_get.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get); + 
response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::raw_get, source); } }, - Some(batch_commands_request::request::Cmd::Coprocessor(req)) => { + Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { let begin_instant = Instant::now(); + let source = req.mut_context().take_request_source(); let resp = future_copr(copr, Some(peer.to_string()), req) .map_ok(|resp| { resp.map(oneof!(batch_commands_response::response::Cmd::Coprocessor)) }) .map_err(|_| GRPC_MSG_FAIL_COUNTER.coprocessor.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::coprocessor); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::coprocessor, source); }, - $(Some(batch_commands_request::request::Cmd::$cmd(req)) => { + Some(batch_commands_request::request::Cmd::Empty(req)) => { let begin_instant = Instant::now(); + let resp = future_handle_empty(req) + .map_ok(|resp| batch_commands_response::Response { + cmd: Some(batch_commands_response::response::Cmd::Empty(resp)), + ..Default::default() + }) + .map_err(|_| GRPC_MSG_FAIL_COUNTER.invalid.inc()); + response_batch_commands_request( + id, + resp, + tx.clone(), + begin_instant, + GrpcTypeKind::invalid, + String::default(), + ); + } + $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + let begin_instant = Instant::now(); + let source = req.mut_context().take_request_source(); let resp = $future_fn($($arg,)* req) .map_ok(oneof!(batch_commands_response::response::Cmd::$cmd)) .map_err(|_| GRPC_MSG_FAIL_COUNTER.$metric_name.inc()); - response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name); + response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name, source); })* Some(batch_commands_request::request::Cmd::Import(_)) => unimplemented!(), } @@ -1302,7 +1346,6 @@ fn handle_batch_commands_request( RawCoprocessor, future_raw_coprocessor(copr_v2, 
storage), coprocessor; PessimisticLock, future_acquire_pessimistic_lock(storage), kv_pessimistic_lock; PessimisticRollback, future_pessimistic_rollback(storage), kv_pessimistic_rollback; - Empty, future_handle_empty(), invalid; } } @@ -2100,10 +2143,15 @@ pub mod batch_commands_request { pub struct GrpcRequestDuration { pub begin: Instant, pub label: GrpcTypeKind, + pub source: String, } impl GrpcRequestDuration { - pub fn new(begin: Instant, label: GrpcTypeKind) -> Self { - GrpcRequestDuration { begin, label } + pub fn new(begin: Instant, label: GrpcTypeKind, source: String) -> Self { + GrpcRequestDuration { + begin, + label, + source, + } } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 768579f0b15..4e44bc0b37a 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -737,8 +737,10 @@ impl Storage { for ((mut req, id), tracker) in requests.into_iter().zip(ids).zip(trackers) { set_tls_tracker_token(tracker); let mut ctx = req.take_context(); + let source = ctx.take_request_source(); let region_id = ctx.get_region_id(); let peer = ctx.get_peer(); + let key = Key::from_raw(req.get_key()); tls_collect_query( region_id, @@ -775,7 +777,7 @@ impl Storage { snap_ctx } Err(e) => { - consumer.consume(id, Err(e), begin_instant); + consumer.consume(id, Err(e), begin_instant, source); continue; } }; @@ -791,6 +793,7 @@ impl Storage { access_locks, region_id, id, + source, tracker, )); } @@ -806,6 +809,7 @@ impl Storage { access_locks, region_id, id, + source, tracker, ) = req_snap; let snap_res = snap.await; @@ -836,6 +840,7 @@ impl Storage { v.map_err(|e| Error::from(txn::Error::from(e))) .map(|v| (v, stat)), begin_instant, + source, ); } Err(e) => { @@ -843,12 +848,13 @@ impl Storage { id, Err(Error::from(txn::Error::from(e))), begin_instant, + source, ); } } }), Err(e) => { - consumer.consume(id, Err(e), begin_instant); + consumer.consume(id, Err(e), begin_instant, source); } } } @@ -1606,7 +1612,7 @@ impl Storage { } Self::with_tls_engine(|engine| 
engine.release_snapshot()); let begin_instant = Instant::now(); - for (id, key, ctx, mut req, snap) in snaps { + for (id, key, mut ctx, mut req, snap) in snaps { let cf = req.take_cf(); match snap.await { Ok(snapshot) => { @@ -1621,6 +1627,7 @@ impl Storage { .raw_get_key_value(cf, &key, &mut stats) .map_err(Error::from), begin_instant, + ctx.take_request_source(), ); tls_collect_read_flow( ctx.get_region_id(), @@ -1631,12 +1638,17 @@ impl Storage { ); } Err(e) => { - consumer.consume(id, Err(e), begin_instant); + consumer.consume( + id, + Err(e), + begin_instant, + ctx.take_request_source(), + ); } } } Err(e) => { - consumer.consume(id, Err(e), begin_instant); + consumer.consume(id, Err(e), begin_instant, ctx.take_request_source()); } } } @@ -2849,7 +2861,13 @@ impl TestStorageBuilder { } pub trait ResponseBatchConsumer: Send { - fn consume(&self, id: u64, res: Result, begin: Instant); + fn consume( + &self, + id: u64, + res: Result, + begin: Instant, + request_source: String, + ); } pub mod test_util { @@ -3033,6 +3051,7 @@ pub mod test_util { id: u64, res: Result<(Option>, Statistics)>, _: tikv_util::time::Instant, + _source: String, ) { self.data.lock().unwrap().push(GetResult { id, @@ -3042,7 +3061,13 @@ pub mod test_util { } impl ResponseBatchConsumer>> for GetConsumer { - fn consume(&self, id: u64, res: Result>>, _: tikv_util::time::Instant) { + fn consume( + &self, + id: u64, + res: Result>>, + _: tikv_util::time::Instant, + _source: String, + ) { self.data.lock().unwrap().push(GetResult { id, res }); } } From f5993c19abcecdec92bf58868d0f757061196791 Mon Sep 17 00:00:00 2001 From: haojinming Date: Tue, 5 Jul 2022 18:07:02 +0800 Subject: [PATCH 058/676] [apiv2] encode key range in raw_checksum interface (#12951) close tikv/tikv#12950 Signed-off-by: haojinming --- src/storage/mod.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 4e44bc0b37a..cb792d7aec2 100644 --- 
a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2440,9 +2440,8 @@ impl Storage { &self, ctx: Context, algorithm: ChecksumAlgorithm, - ranges: Vec, + mut ranges: Vec, ) -> impl Future> { - // TODO: Modify this method in another PR for backup & restore feature of Api V2. const CMD: CommandKind = CommandKind::raw_checksum; let priority = ctx.get_priority(); let priority_tag = get_priority_tag(priority); @@ -2474,6 +2473,12 @@ impl Storage { .iter() .map(|range| (Some(range.get_start_key()), Some(range.get_end_key()))), )?; + for range in ranges.iter_mut() { + let start_key = F::encode_raw_key_owned(range.take_start_key(), None); + let end_key = F::encode_raw_key_owned(range.take_end_key(), None); + range.set_start_key(start_key.into_encoded()); + range.set_end_key(end_key.into_encoded()); + } let command_duration = tikv_util::time::Instant::now(); let snap_ctx = SnapContext { @@ -4564,6 +4569,7 @@ mod tests { let mut checksum: u64 = 0; let mut total_kvs: u64 = 0; let mut total_bytes: u64 = 0; + let mut is_first = true; // Write key-value pairs one by one for &(ref key, ref value) in &test_data { storage @@ -4576,13 +4582,18 @@ mod tests { expect_ok_callback(tx.clone(), 0), ) .unwrap(); - total_kvs += 1; - total_bytes += (key.len() + value.len()) as u64; - checksum = checksum_crc64_xor(checksum, digest.clone(), key, value); + // start key is set to b"r\0a\0", if raw_checksum does not encode the key, + // first key will be included in checksum. This is for testing issue #12950. 
+ if !is_first { + total_kvs += 1; + total_bytes += (key.len() + value.len()) as u64; + checksum = checksum_crc64_xor(checksum, digest.clone(), key, value); + } + is_first = false; rx.recv().unwrap(); } let mut range = KeyRange::default(); - range.set_start_key(b"r\0a".to_vec()); + range.set_start_key(b"r\0a\0".to_vec()); range.set_end_key(b"r\0z".to_vec()); assert_eq!( (checksum, total_kvs, total_bytes), From c762224a57fddc5f3ec9c773416d6615e505f415 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 6 Jul 2022 15:57:02 +0800 Subject: [PATCH 059/676] cdc: add min_resolved_ts_lag metrics (#12968) close tikv/tikv#12967 cdc: add min_resolved_ts_lag metrics Signed-off-by: Neil Shen --- components/backup-stream/src/endpoint.rs | 2 +- components/cdc/src/endpoint.rs | 43 ++++++++++++++++++++---- components/cdc/src/metrics.rs | 4 +++ 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 1c1efdcb546..a89d5a66da4 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -454,7 +454,7 @@ where let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); match range_init_result { Ok(()) => { - info!("backup stream success to initialize"; + info!("backup stream success to initialize"; "start_key" => utils::redact(&start_key), "end_key" => utils::redact(&end_key), "take" => ?start.saturating_elapsed(),) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index c78636b8e11..54686424461 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -147,6 +147,7 @@ pub enum Task { MinTS { regions: Vec, min_ts: TimeStamp, + current_ts: TimeStamp, }, ResolverReady { observe_id: ObserveID, @@ -203,9 +204,15 @@ impl fmt::Debug for Task { .field("type", &"multi_batch") .field("multi_batch", &multi.len()) .finish(), - Task::MinTS { ref min_ts, .. 
} => { - de.field("type", &"mit_ts").field("min_ts", min_ts).finish() - } + Task::MinTS { + ref min_ts, + ref current_ts, + .. + } => de + .field("type", &"mit_ts") + .field("current_ts", current_ts) + .field("min_ts", min_ts) + .finish(), Task::ResolverReady { ref observe_id, ref region, @@ -348,6 +355,7 @@ pub struct Endpoint { region_read_progress: RegionReadProgressRegistry, // Metrics and logging. + current_ts: TimeStamp, min_resolved_ts: TimeStamp, min_ts_region_id: u64, resolved_region_count: usize, @@ -439,6 +447,7 @@ impl, E: KvEngine> Endpoint { region_read_progress, // Log the first resolved ts warning. warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, + current_ts: TimeStamp::zero(), }; ep.register_min_ts_event(); ep @@ -819,7 +828,7 @@ impl, E: KvEngine> Endpoint { } } - fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp) { + fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp) { // Reset resolved_regions to empty. let resolved_regions = &mut self.resolved_region_heap; resolved_regions.clear(); @@ -856,6 +865,7 @@ impl, E: KvEngine> Endpoint { } } } + self.current_ts = current_ts; let lag_millis = min_ts .physical() .saturating_sub(self.min_resolved_ts.physical()); @@ -1056,7 +1066,11 @@ impl, E: KvEngine> Endpoint { }; if !regions.is_empty() { - match scheduler.schedule(Task::MinTS { regions, min_ts }) { + match scheduler.schedule(Task::MinTS { + regions, + min_ts, + current_ts: min_ts_pd, + }) { Ok(_) | Err(ScheduleError::Stopped(_)) => (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. 
@@ -1134,7 +1148,11 @@ impl, E: KvEngine> Runnable for Endpoint { debug!("cdc run task"; "task" => %task); match task { - Task::MinTS { regions, min_ts } => self.on_min_ts(regions, min_ts), + Task::MinTS { + regions, + min_ts, + current_ts, + } => self.on_min_ts(regions, min_ts, current_ts), Task::Register { request, downstream, @@ -1214,8 +1232,14 @@ impl, E: KvEngine> RunnableWithTimer for Endpoin if self.min_resolved_ts != TimeStamp::max() { CDC_MIN_RESOLVED_TS_REGION.set(self.min_ts_region_id as i64); CDC_MIN_RESOLVED_TS.set(self.min_resolved_ts.physical() as i64); + CDC_MIN_RESOLVED_TS_LAG.set( + self.current_ts + .physical() + .saturating_sub(self.min_resolved_ts.physical()) as i64, + ); } self.min_resolved_ts = TimeStamp::max(); + self.current_ts = TimeStamp::max(); self.min_ts_region_id = 0; self.old_value_cache.flush_metrics(); @@ -1881,6 +1905,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1], min_ts: TimeStamp::from(1), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1916,6 +1941,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1, 2], min_ts: TimeStamp::from(2), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -1960,6 +1986,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), + current_ts: TimeStamp::zero(), }); let cdc_event = channel::recv_timeout(&mut rx, Duration::from_millis(500)) .unwrap() @@ -2193,6 +2220,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1], min_ts: TimeStamp::from(1), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that only contains region 1. 
assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1], 1); @@ -2206,6 +2234,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1, 2], min_ts: TimeStamp::from(2), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that contains region 1 and region 2. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1, 2], 2); @@ -2219,6 +2248,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that contains region 1 and region 2. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1, 2], 3); @@ -2228,6 +2258,7 @@ mod tests { suite.run(Task::MinTS { regions: vec![1, 3], min_ts: TimeStamp::from(4), + current_ts: TimeStamp::zero(), }); // conn a must receive a resolved ts that only contains region 1. assert_batch_resolved_ts(conn_rxs.get_mut(0).unwrap(), vec![1], 4); diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 55a0124e567..0118b4d7916 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -108,6 +108,10 @@ lazy_static! { "The region which has minimal resolved ts" ) .unwrap(); + pub static ref CDC_MIN_RESOLVED_TS_LAG: IntGauge = register_int_gauge!( + "tikv_cdc_min_resolved_ts_lag", + "The lag between the minimal resolved ts and the current ts" + ).unwrap(); pub static ref CDC_MIN_RESOLVED_TS: IntGauge = register_int_gauge!( "tikv_cdc_min_resolved_ts", "The minimal resolved ts for current regions" From 07e7cd40dffcd5d7ce1c0f2693bb7ba59a3cd465 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 6 Jul 2022 16:27:03 +0800 Subject: [PATCH 060/676] raftstore: introduce the CPU-based Load Base Split strategy (#12955) ref tikv/tikv#12063, ref tikv/tikv#12593, ref tikv/tikv#12942 Introduce the CPU-based Load Base Split strategy. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/worker/pd.rs | 106 ++++-- .../src/store/worker/split_controller.rs | 325 ++++++++++++++++-- components/server/src/server.rs | 11 +- components/test_raftstore/src/server.rs | 9 +- src/read_pool.rs | 5 +- 5 files changed, 405 insertions(+), 51 deletions(-) diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 280c15b083f..afd84ad16dd 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -472,6 +472,7 @@ where handle: Option>, timer: Option>, read_stats_sender: Option>, + cpu_stats_sender: Option>>, collect_store_infos_interval: Duration, load_base_split_check_interval: Duration, collect_tick_interval: Duration, @@ -493,6 +494,7 @@ where handle: None, timer: None, read_stats_sender: None, + cpu_stats_sender: None, collect_store_infos_interval: interval, load_base_split_check_interval: cmp::min( DEFAULT_LOAD_BASE_SPLIT_CHECK_INTERVAL, @@ -537,6 +539,9 @@ where let (read_stats_sender, read_stats_receiver) = mpsc::channel(); self.read_stats_sender = Some(read_stats_sender); + let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); + self.cpu_stats_sender = Some(cpu_stats_sender); + let scheduler = self.scheduler.clone(); let props = tikv_util::thread_group::current_properties(); @@ -548,17 +553,25 @@ where .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); - let mut thread_stats = ThreadInfoStatistics::new(); + // Create different `ThreadInfoStatistics` for different purposes to + // make sure the record won't be disturbed. 
+ let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); + let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); while let Err(mpsc::RecvTimeoutError::Timeout) = timer_rx.recv_timeout(tick_interval) { if is_enable_tick(timer_cnt, collect_store_infos_interval) { - StatsMonitor::collect_store_infos(&mut thread_stats, &scheduler); + StatsMonitor::collect_store_infos( + &mut collect_store_infos_thread_stats, + &scheduler, + ); } if is_enable_tick(timer_cnt, load_base_split_check_interval) { StatsMonitor::load_base_split( &mut auto_split_controller, &read_stats_receiver, + &cpu_stats_receiver, + &mut load_base_split_thread_stats, &scheduler, ); } @@ -602,7 +615,9 @@ where pub fn load_base_split( auto_split_controller: &mut AutoSplitController, - receiver: &Receiver, + read_stats_receiver: &Receiver, + cpu_stats_receiver: &Receiver>, + thread_stats: &mut ThreadInfoStatistics, scheduler: &Scheduler>, ) { let start_time = TiInstant::now(); @@ -618,11 +633,17 @@ where } SplitConfigChange::Noop => {} } - let mut others = vec![]; - while let Ok(other) = receiver.try_recv() { - others.push(other); + let mut read_stats_vec = vec![]; + while let Ok(read_stats) = read_stats_receiver.try_recv() { + read_stats_vec.push(read_stats); } - let (top, split_infos) = auto_split_controller.flush(others); + let mut cpu_stats_vec = vec![]; + while let Ok(cpu_stats) = cpu_stats_receiver.try_recv() { + cpu_stats_vec.push(cpu_stats); + } + thread_stats.record(); + let (top_qps, split_infos) = + auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); auto_split_controller.clear(); let task = Task::AutoSplit { split_infos }; if let Err(e) = scheduler.schedule(task) { @@ -632,10 +653,10 @@ where ); } for i in 0..TOP_N { - if i < top.len() { + if i < top_qps.len() { READ_QPS_TOPN .with_label_values(&[&i.to_string()]) - .set(top[i] as f64); + .set(top_qps[i] as f64); } else { READ_QPS_TOPN.with_label_values(&[&i.to_string()]).set(0.0); } @@ -672,15 
+693,22 @@ where if let Some(h) = self.handle.take() { drop(self.timer.take()); drop(self.read_stats_sender.take()); + drop(self.cpu_stats_sender.take()); if let Err(e) = h.join() { error!("join stats collector failed"; "err" => ?e); } } } - pub fn get_read_stats_sender(&self) -> &Option> { + #[inline(always)] + fn get_read_stats_sender(&self) -> &Option> { &self.read_stats_sender } + + #[inline(always)] + fn get_cpu_stats_sender(&self) -> &Option>> { + &self.cpu_stats_sender + } } const HOTSPOT_KEY_RATE_THRESHOLD: u64 = 128; @@ -1684,6 +1712,12 @@ where // which is the read load portion of the write path. // TODO: more accurate CPU consumption of a specified region. fn handle_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. + if let Some(cpu_stats_sender) = self.stats_monitor.get_cpu_stats_sender() { + if cpu_stats_sender.send(records.clone()).is_err() { + warn!("send region cpu info failed, are we shutting down?") + } + } calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); } @@ -1831,18 +1865,46 @@ where if let Ok(Some(region)) = pd_client.get_region_by_id(split_info.region_id).await { - Self::handle_ask_batch_split( - router.clone(), - scheduler.clone(), - pd_client.clone(), - region, - vec![split_info.split_key], - split_info.peer, - true, - Callback::None, - String::from("auto_split"), - remote.clone(), - ); + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::handle_ask_batch_split( + router.clone(), + scheduler.clone(), + pd_client.clone(), + region, + vec![split_key], + split_info.peer, + true, + Callback::None, + String::from("auto_split"), + remote.clone(), + ); + return; + } + // Try to split the region on half within the given key range + // if there is no `split_key` been given. 
+ if split_info.start_key.is_some() && split_info.end_key.is_some() { + let start_key = split_info.start_key.unwrap(); + let end_key = split_info.end_key.unwrap(); + let region_id = region.get_id(); + let msg = CasualMessage::HalfSplitRegion { + region_epoch: region.get_region_epoch().clone(), + start_key: Some(start_key.clone()), + end_key: Some(end_key.clone()), + policy: pdpb::CheckPolicy::Scan, + source: "auto_split", + cb: Callback::None, + }; + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) + { + error!("send auto half split request failed"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => ?e, + ); + } + } } } }; diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index b644ac88d85..dd3fcbf95be 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -4,7 +4,7 @@ use std::{ cmp::{min, Ordering}, collections::{BinaryHeap, HashMap, HashSet}, slice::{Iter, IterMut}, - sync::Arc, + sync::{mpsc::Receiver, Arc}, time::{Duration, SystemTime}, }; @@ -15,10 +15,12 @@ use kvproto::{ }; use pd_client::{merge_bucket_stats, new_bucket_stats, BucketMeta, BucketStat}; use rand::Rng; -use tikv_util::{config::Tracker, debug, info, warn}; +use resource_metering::RawRecords; +use tikv_util::{config::Tracker, debug, info, metrics::ThreadInfoStatistics, warn}; use crate::store::{ metrics::*, + util::build_key_range, worker::{ query_stats::{is_read_query, QueryStats}, split_config::get_sample_num, @@ -32,6 +34,8 @@ pub const TOP_N: usize = 10; // LOAD_BASE_SPLIT_EVENT metrics label definitions. // Workload fits the QPS threshold or byte threshold. const LOAD_FIT: &str = "load_fit"; +// Workload fits the CPU threshold. +const CPU_LOAD_FIT: &str = "cpu_load_fit"; // The statistical key is empty. 
const EMPTY_STATISTICAL_KEY: &str = "empty_statistical_key"; // Split info has been collected, ready to split. @@ -46,6 +50,10 @@ const NO_ENOUGH_LR_KEY: &str = "no_enough_lr_key"; const NO_BALANCE_KEY: &str = "no_balance_key"; // The number of contained keys does not meet the score. const NO_UNCROSS_KEY: &str = "no_uncross_key"; +// Split info for the top hot CPU region has been collected, ready to split. +const READY_TO_SPLIT_CPU_TOP: &str = "ready_to_split_cpu_top"; +// The top hot CPU region is not ready to split. +const UNABLE_TO_SPLIT_CPU_TOP: &str = "unable_to_split_cpu_top"; // It will return prefix sum of the given iter, // `read` is a function to process the item from the iter. @@ -277,6 +285,8 @@ pub struct Recorder { pub peer: Peer, pub key_ranges: Vec>, pub create_time: SystemTime, + pub cpu_usage: f64, + pub hottest_key_range: Option, } impl Recorder { @@ -286,6 +296,8 @@ impl Recorder { peer: Peer::default(), key_ranges: vec![], create_time: SystemTime::now(), + cpu_usage: 0.0, + hottest_key_range: None, } } @@ -299,6 +311,14 @@ impl Recorder { } } + fn update_cpu_usage(&mut self, cpu_usage: f64) { + self.cpu_usage = cpu_usage; + } + + fn update_hottest_key_range(&mut self, key_range: KeyRange) { + self.hottest_key_range = Some(key_range); + } + fn is_ready(&self) -> bool { self.key_ranges.len() >= self.detect_times } @@ -515,8 +535,41 @@ impl WriteStats { pub struct SplitInfo { pub region_id: u64, - pub split_key: Vec, pub peer: Peer, + pub split_key: Option>, + pub start_key: Option>, + pub end_key: Option>, +} + +impl SplitInfo { + // Create a SplitInfo with the given region_id, peer and split_key. + // This is used to split the region with this specified split key later. + fn with_split_key(region_id: u64, peer: Peer, split_key: Vec) -> Self { + SplitInfo { + region_id, + peer, + split_key: Some(split_key), + start_key: None, + end_key: None, + } + } + + // Create a SplitInfo with the given region_id, peer, start_key and end_key. 
+ // This is used to split the region on half within the specified start and end keys later. + fn with_start_end_key( + region_id: u64, + peer: Peer, + start_key: Vec, + end_key: Vec, + ) -> Self { + SplitInfo { + region_id, + peer, + split_key: None, + start_key: Some(start_key), + end_key: Some(end_key), + } + } } #[derive(PartialEq, Debug)] @@ -530,25 +583,71 @@ pub struct AutoSplitController { pub recorders: HashMap, pub cfg: SplitConfig, cfg_tracker: Tracker, + // Thread-related info + max_grpc_thread_count: usize, + max_unified_read_pool_thread_count: usize, + unified_read_pool_scale_receiver: Option>, } impl AutoSplitController { - pub fn new(config_manager: SplitConfigManager) -> AutoSplitController { + pub fn new( + config_manager: SplitConfigManager, + max_grpc_thread_count: usize, + max_unified_read_pool_thread_count: usize, + unified_read_pool_scale_receiver: Option>, + ) -> AutoSplitController { AutoSplitController { recorders: HashMap::default(), cfg: config_manager.value().clone(), cfg_tracker: config_manager.0.clone().tracker("split_hub".to_owned()), + max_grpc_thread_count, + max_unified_read_pool_thread_count, + unified_read_pool_scale_receiver, } } pub fn default() -> AutoSplitController { - AutoSplitController::new(SplitConfigManager::default()) + AutoSplitController::new(SplitConfigManager::default(), 0, 0, None) + } + + fn should_check_region_cpu(&self) -> bool { + self.cfg.region_cpu_overload_threshold_ratio > 0.0 + } + + fn is_grpc_poll_busy(&self, grpc_thread_usage: f64) -> bool { + if self.max_grpc_thread_count == 0 { + return false; + } + let grpc_thread_cpu_overload_threshold = + self.max_grpc_thread_count as f64 * self.cfg.grpc_thread_cpu_overload_threshold_ratio; + grpc_thread_usage > 0.0 && grpc_thread_usage >= grpc_thread_cpu_overload_threshold + } + + fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { + if self.max_unified_read_pool_thread_count == 0 { + return false; + } + let 
unified_read_pool_cpu_overload_threshold = self.max_unified_read_pool_thread_count + as f64 + * self + .cfg + .unified_read_pool_thread_cpu_overload_threshold_ratio; + unified_read_pool_thread_usage > 0.0 + && unified_read_pool_thread_usage >= unified_read_pool_cpu_overload_threshold + } + + fn is_region_busy(&self, unified_read_pool_thread_usage: f64, region_cpu_usage: f64) -> bool { + if unified_read_pool_thread_usage <= 0.0 || !self.should_check_region_cpu() { + return false; + } + region_cpu_usage / unified_read_pool_thread_usage + >= self.cfg.region_cpu_overload_threshold_ratio } - // collect the read stats from read_stats_vec and dispatch them to a region hashmap. + // collect the read stats from read_stats_vec and dispatch them to a Region HashMap. fn collect_read_stats(read_stats_vec: Vec) -> HashMap> { - // collect from different thread - let mut region_infos_map = HashMap::default(); // regionID-regionInfos + // RegionID -> Vec, collect the RegionInfo from different threads. + let mut region_infos_map = HashMap::default(); let capacity = read_stats_vec.len(); for read_stats in read_stats_vec { for (region_id, region_info) in read_stats.region_infos { @@ -561,13 +660,109 @@ impl AutoSplitController { region_infos_map } + // collect the CPU stats from cpu_stats_vec and dispatch them to a Region HashMap. + fn collect_cpu_stats( + &self, + cpu_stats_vec: Vec>, + ) -> HashMap)> { + // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its hottest key range. + let mut region_cpu_map = HashMap::default(); + if !self.should_check_region_cpu() { + return region_cpu_map; + } + // Calculate the Region CPU usage. + let mut collect_interval_ms = 0; + let mut region_key_range_cpu_time_map = HashMap::new(); + cpu_stats_vec.iter().for_each(|cpu_stats| { + cpu_stats.records.iter().for_each(|(tag, record)| { + // Calculate the Region ID -> CPU Time. 
+ region_cpu_map + .entry(tag.region_id) + .and_modify(|(cpu_time, _)| *cpu_time += record.cpu_time as f64) + .or_insert_with(|| (record.cpu_time as f64, None)); + // Calculate the (Region ID, Key Range) -> CPU Time. + tag.key_ranges.iter().for_each(|key_range| { + region_key_range_cpu_time_map + .entry((tag.region_id, key_range)) + .and_modify(|cpu_time| *cpu_time += record.cpu_time) + .or_insert_with(|| record.cpu_time); + }) + }); + collect_interval_ms += cpu_stats.duration.as_millis(); + }); + // Calculate the Region CPU usage. + region_cpu_map.iter_mut().for_each(|(_, (cpu_time, _))| { + if collect_interval_ms == 0 { + *cpu_time = 0.0; + } else { + *cpu_time /= collect_interval_ms as f64; + } + }); + // Choose the hottest key range for each Region. + let mut hottest_key_range_cpu_time_map = HashMap::with_capacity(region_cpu_map.len()); + region_key_range_cpu_time_map + .iter() + .for_each(|((region_id, key_range), cpu_time)| { + let hottest_key_range_cpu_time = hottest_key_range_cpu_time_map + .entry(*region_id) + .or_insert_with(|| 0); + if cpu_time > hottest_key_range_cpu_time { + region_cpu_map + .entry(*region_id) + .and_modify(|(_, old_key_range)| { + *old_key_range = + Some(build_key_range(&key_range.0, &key_range.1, false)); + }); + *hottest_key_range_cpu_time = *cpu_time; + } + }); + region_cpu_map + } + + fn collect_thread_usage(thread_stats: &ThreadInfoStatistics, name: &str) -> f64 { + thread_stats + .get_cpu_usages() + .iter() + .filter(|(thread_name, _)| thread_name.contains(name)) + .fold(0, |cpu_usage_sum, (_, cpu_usage)| { + // `cpu_usage` is in [0, 100]. + cpu_usage_sum + cpu_usage + }) as f64 + / 100.0 + } + // flush the read stats info into the recorder and check if the region needs to be split // according to all the stats info the recorder has collected before. 
- pub fn flush(&mut self, read_stats_vec: Vec) -> (Vec, Vec) { - let mut split_infos = vec![]; + pub fn flush( + &mut self, + read_stats_vec: Vec, + cpu_stats_vec: Vec>, + thread_stats: &ThreadInfoStatistics, + ) -> (Vec, Vec) { + let mut top_cpu_usage = vec![]; let mut top_qps = BinaryHeap::with_capacity(TOP_N); let region_infos_map = Self::collect_read_stats(read_stats_vec); + let region_cpu_map = self.collect_cpu_stats(cpu_stats_vec); + // Prepare some diagnostic info. + let (grpc_thread_usage, unified_read_pool_thread_usage) = ( + Self::collect_thread_usage(thread_stats, "grpc-server"), + Self::collect_thread_usage(thread_stats, "unified-read-po"), + ); + let (is_grpc_poll_busy, is_unified_read_pool_busy) = ( + self.is_grpc_poll_busy(grpc_thread_usage), + self.is_unified_read_pool_busy(unified_read_pool_thread_usage), + ); + debug!("flush to load base split"; + "max_grpc_thread_count" => self.max_grpc_thread_count, + "grpc_thread_usage" => grpc_thread_usage, + "max_unified_read_pool_thread_count" => self.max_unified_read_pool_thread_count, + "unified_read_pool_thread_usage" => unified_read_pool_thread_usage, + "is_grpc_poll_busy" => is_grpc_poll_busy, + "is_unified_read_pool_busy" => is_unified_read_pool_busy, + ); + // Start to record the read stats info. + let mut split_infos = vec![]; for (region_id, region_infos) in region_infos_map { let qps_prefix_sum = prefix_sum(region_infos.iter(), RegionInfo::get_read_qps); // region_infos is not empty, so it's safe to unwrap here. 
@@ -575,19 +770,32 @@ impl AutoSplitController { let byte = region_infos .iter() .fold(0, |flow, region_info| flow + region_info.flow.read_bytes); + let (cpu_usage, hottest_key_range) = region_cpu_map + .get(®ion_id) + .map(|(cpu_usage, key_range)| (*cpu_usage, key_range.clone())) + .unwrap_or((0.0, None)); + let is_region_busy = self.is_region_busy(unified_read_pool_thread_usage, cpu_usage); debug!("load base split params"; "region_id" => region_id, "qps" => qps, "qps_threshold" => self.cfg.qps_threshold, "byte" => byte, "byte_threshold" => self.cfg.byte_threshold, + "cpu_usage" => cpu_usage, + "is_region_busy" => is_region_busy, ); QUERY_REGION_VEC .with_label_values(&["read"]) .observe(qps as f64); - if qps < self.cfg.qps_threshold && byte < self.cfg.byte_threshold { + // 1. If the QPS and Byte do not meet the threshold, skip. + // 2. If the Unified Read Pool is not busy or + // the Region is not hot enough (takes up 50% of the Unified Read Pool CPU times), skip. + if qps < self.cfg.qps_threshold + && byte < self.cfg.byte_threshold + && (!is_unified_read_pool_busy || !is_region_busy) + { self.recorders.remove_entry(®ion_id); continue; } @@ -600,6 +808,10 @@ impl AutoSplitController { .entry(region_id) .or_insert_with(|| Recorder::new(detect_times)); recorder.update_peer(®ion_infos[0].peer); + recorder.update_cpu_usage(cpu_usage); + if let Some(hottest_key_range) = hottest_key_range { + recorder.update_hottest_key_range(hottest_key_range); + } let key_ranges = sample( self.cfg.sample_num, @@ -616,20 +828,27 @@ impl AutoSplitController { if recorder.is_ready() { let key = recorder.collect(&self.cfg); if !key.is_empty() { - split_infos.push(SplitInfo { + split_infos.push(SplitInfo::with_split_key( region_id, - split_key: key, - peer: recorder.peer.clone(), - }); + recorder.peer.clone(), + key, + )); LOAD_BASE_SPLIT_EVENT .with_label_values(&[READY_TO_SPLIT]) .inc(); info!("load base split region"; "region_id" => region_id, "qps" => qps, + "byte" => byte, + 
"cpu_usage" => cpu_usage, ); + self.recorders.remove(®ion_id); + } else if is_unified_read_pool_busy && is_region_busy { + LOAD_BASE_SPLIT_EVENT + .with_label_values(&[CPU_LOAD_FIT]) + .inc(); + top_cpu_usage.push(region_id); } - self.recorders.remove(®ion_id); } else { LOAD_BASE_SPLIT_EVENT .with_label_values(&[NOT_READY_TO_SPLIT]) @@ -639,6 +858,49 @@ impl AutoSplitController { top_qps.push(qps); } + // Check if the top CPU usage region could be split. + // TODO: avoid unnecessary split by introducing the feedback mechanism from PD. + if !top_cpu_usage.is_empty() && !is_grpc_poll_busy { + // Calculate by using the latest CPU usage. + top_cpu_usage.sort_unstable_by(|a, b| { + let cpu_usage_a = self.recorders.get(a).unwrap().cpu_usage; + let cpu_usage_b = self.recorders.get(b).unwrap().cpu_usage; + cpu_usage_b.partial_cmp(&cpu_usage_a).unwrap() + }); + let region_id = top_cpu_usage[0]; + let recorder = self.recorders.get_mut(®ion_id).unwrap(); + if recorder.hottest_key_range.is_some() { + split_infos.push(SplitInfo::with_start_end_key( + region_id, + recorder.peer.clone(), + recorder + .hottest_key_range + .as_ref() + .unwrap() + .start_key + .clone(), + recorder.hottest_key_range.as_ref().unwrap().end_key.clone(), + )); + LOAD_BASE_SPLIT_EVENT + .with_label_values(&[READY_TO_SPLIT_CPU_TOP]) + .inc(); + info!("load base split region"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().start_key), + "end_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().end_key), + "cpu_usage" => recorder.cpu_usage, + ); + } else { + LOAD_BASE_SPLIT_EVENT + .with_label_values(&[UNABLE_TO_SPLIT_CPU_TOP]) + .inc(); + } + } + // Clean up the rest top CPU usage recorders. 
+ for region_id in top_cpu_usage { + self.recorders.remove(®ion_id); + } + (top_qps.into_vec(), split_infos) } @@ -666,6 +928,12 @@ impl AutoSplitController { } self.cfg = incoming.clone(); } + // Adjust with the size change of the Unified Read Pool. + if let Some(rx) = &self.unified_read_pool_scale_receiver { + if let Ok(max_thread_count) = rx.try_recv() { + self.max_unified_read_pool_thread_count = max_thread_count; + } + } cfg_change } } @@ -677,9 +945,8 @@ mod tests { use txn_types::Key; use super::*; - use crate::store::{ - util::build_key_range, - worker::split_config::{DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO}, + use crate::store::worker::split_config::{ + DEFAULT_SAMPLE_NUM, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }; enum Position { @@ -880,7 +1147,8 @@ mod tests { hub.cfg.sample_threshold = 0; for i in 0..10 { - let (_, split_infos) = hub.flush(qps_stats.clone()); + let (_, split_infos) = + hub.flush(qps_stats.clone(), vec![], &ThreadInfoStatistics::default()); if (i + 1) % hub.cfg.detect_times == 0 { assert_eq!( split_infos.len(), @@ -891,7 +1159,9 @@ mod tests { for obtain in &split_infos { let mut equal = false; for expect in &split_keys { - if obtain.split_key.cmp(&expect.to_vec()) == Ordering::Equal { + if obtain.split_key.as_ref().unwrap().cmp(&expect.to_vec()) + == Ordering::Equal + { equal = true; break; } @@ -936,7 +1206,7 @@ mod tests { ); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec); + hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); } // Test the empty key ranges. 
@@ -949,7 +1219,7 @@ mod tests { qps_stats.add_query_num(1, &Peer::default(), KeyRange::default(), QueryKind::Get); } qps_stats_vec.push(qps_stats); - hub.flush(qps_stats_vec); + hub.flush(qps_stats_vec, vec![], &ThreadInfoStatistics::default()); } fn check_sample_length(key_ranges: Vec>) { @@ -1229,7 +1499,8 @@ mod tests { let split_config = SplitConfig::default(); let mut split_cfg_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(split_config))); - let mut auto_split_controller = AutoSplitController::new(split_cfg_manager.clone()); + let mut auto_split_controller = + AutoSplitController::new(split_cfg_manager.clone(), 0, 0, None); assert_eq!( auto_split_controller.refresh_and_check_cfg(), SplitConfigChange::Noop, @@ -1309,7 +1580,11 @@ mod tests { } b.iter(|| { let mut hub = AutoSplitController::default(); - hub.flush(other_qps_stats.clone()); + hub.flush( + other_qps_stats.clone(), + vec![], + &ThreadInfoStatistics::default(), + ); }); } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e09eec7d5d8..351015fdd9a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -761,13 +761,17 @@ impl TiKvServer { cop_read_pools.handle() }; + let mut unified_read_pool_scale_receiver = None; if self.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); cfg_controller.register( tikv::config::Module::Readpool, Box::new(ReadPoolConfigManager( unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, )), ); + unified_read_pool_scale_receiver = Some(rx); } // Register causal observer for RawKV API V2 @@ -959,7 +963,12 @@ impl TiKvServer { Box::new(split_config_manager.clone()), ); - let auto_split_controller = AutoSplitController::new(split_config_manager); + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.config.server.grpc_concurrency, + 
self.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); // `ConsistencyCheckObserver` must be registered before `Node::start`. let safe_point = Arc::new(AtomicU64::new(0)); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index ac6a72e3a06..b87cc5257a5 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -541,11 +541,13 @@ impl ServerCluster { cfg.server.addr = format!("{}", addr); let trans = server.transport(); let simulate_trans = SimulateTransport::new(trans); + let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); // Register the role change observer of the lock manager. lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); + let max_unified_read_pool_thread_count = cfg.readpool.unified.max_thread_count; let pessimistic_txn_cfg = cfg.tikv.pessimistic_txn; let split_check_runner = @@ -553,7 +555,12 @@ impl ServerCluster { let split_check_scheduler = bg_worker.start("split-check", split_check_runner); let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(cfg.tikv.split))); - let auto_split_controller = AutoSplitController::new(split_config_manager); + let auto_split_controller = AutoSplitController::new( + split_config_manager, + max_grpc_thread_count, + max_unified_read_pool_thread_count, + None, + ); node.start( engines, simulate_trans.clone(), diff --git a/src/read_pool.rs b/src/read_pool.rs index cebd1965153..7409c9a4b6e 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -2,7 +2,7 @@ use std::{ future::Future, - sync::{Arc, Mutex}, + sync::{mpsc::SyncSender, Arc, Mutex}, }; use file_system::{set_io_type, IOType}; @@ -292,13 +292,14 @@ impl From> for ReadPool { } } -pub struct ReadPoolConfigManager(pub ReadPoolHandle); +pub struct ReadPoolConfigManager(pub ReadPoolHandle, pub SyncSender); impl ConfigManager for 
ReadPoolConfigManager { fn dispatch(&mut self, change: ConfigChange) -> CfgResult<()> { if let Some(ConfigValue::Module(unified)) = change.get("unified") { if let Some(ConfigValue::Usize(max_thread_count)) = unified.get("max_thread_count") { self.0.scale_pool_size(*max_thread_count); + self.1.send(*max_thread_count)?; } } info!( From 8c39b6014e42a863e66cec1d3a360bb69ee869c6 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 7 Jul 2022 14:07:02 +0800 Subject: [PATCH 061/676] test: update nextest profile (#12975) ref tikv/tikv#12769 Signed-off-by: tabokie --- .config/nextest.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index 247389fcd17..6f67aa5ecdb 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -1,7 +1,8 @@ [profile.ci] -retries = 1 # Run at most 2 times +retries = 2 # Run at most 3 times fail-fast = false -slow-timeout = { period = "60s", terminate-after = 2 } # Timeout=120s +slow-timeout = { period = "60s", terminate-after = 2 } # Timeout 2m +failure-output = "final" [profile.ci.junit] path = "junit.xml" From 2ca69a52f0d8d375e4c52f7f32504ffb0af129a5 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 7 Jul 2022 16:03:03 +0800 Subject: [PATCH 062/676] raftstore: add some test cases for the CPU-based Load Base Split strategy (#12969) ref tikv/tikv#12063 Add some test cases for the CPU-based Load Base Split strategy. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- .../src/store/worker/split_controller.rs | 292 ++++++++++++++++-- 1 file changed, 262 insertions(+), 30 deletions(-) diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index dd3fcbf95be..1a3fb15af45 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -8,6 +8,7 @@ use std::{ time::{Duration, SystemTime}, }; +use fail::fail_point; use kvproto::{ kvrpcpb::KeyRange, metapb::{self, Peer}, @@ -615,6 +616,8 @@ impl AutoSplitController { } fn is_grpc_poll_busy(&self, grpc_thread_usage: f64) -> bool { + #[cfg(feature = "failpoints")] + fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); if self.max_grpc_thread_count == 0 { return false; } @@ -624,6 +627,8 @@ impl AutoSplitController { } fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { + #[cfg(feature = "failpoints")] + fail_point!("mock_unified_read_pool_is_busy", |_| { true }); if self.max_unified_read_pool_thread_count == 0 { return false; } @@ -637,6 +642,8 @@ impl AutoSplitController { } fn is_region_busy(&self, unified_read_pool_thread_usage: f64, region_cpu_usage: f64) -> bool { + #[cfg(feature = "failpoints")] + fail_point!("mock_region_is_busy", |_| { true }); if unified_read_pool_thread_usage <= 0.0 || !self.should_check_region_cpu() { return false; } @@ -941,6 +948,7 @@ impl AutoSplitController { #[cfg(test)] mod tests { use online_config::{ConfigChange, ConfigManager, ConfigValue}; + use resource_metering::{RawRecord, TagInfos}; use tikv_util::config::VersionTrack; use txn_types::Key; @@ -1059,7 +1067,7 @@ mod tests { build_key_range(b"a", b"b", false), build_key_range(b"b", b"c", false), ]; - check_split( + check_split_key( b"raw key", vec![gen_read_stats(1, raw_key_ranges.clone())], vec![b"b"], @@ -1073,14 +1081,14 @@ mod tests { 
build_key_range(key_a.as_encoded(), key_b.as_encoded(), false), build_key_range(key_b.as_encoded(), key_c.as_encoded(), false), ]; - check_split( + check_split_key( b"encoded key", vec![gen_read_stats(1, encoded_key_ranges.clone())], vec![key_b.as_encoded()], ); // mix mode - check_split( + check_split_key( b"mix key", vec![ gen_read_stats(1, raw_key_ranges), @@ -1090,7 +1098,7 @@ mod tests { ); // test distribution with contained key - for _i in 0..100 { + for _ in 0..100 { let key_ranges = vec![ build_key_range(b"a", b"k", false), build_key_range(b"b", b"j", false), @@ -1099,7 +1107,7 @@ mod tests { build_key_range(b"e", b"g", false), build_key_range(b"f", b"f", false), ]; - check_split( + check_split_key( b"isosceles triangle", vec![gen_read_stats(1, key_ranges)], vec![], @@ -1113,7 +1121,7 @@ mod tests { build_key_range(b"e", b"j", false), build_key_range(b"f", b"k", false), ]; - check_split( + check_split_key( b"parallelogram", vec![gen_read_stats(1, key_ranges)], vec![], @@ -1123,7 +1131,7 @@ mod tests { build_key_range(b"a", b"l", false), build_key_range(b"a", b"m", false), ]; - check_split( + check_split_key( b"right-angle trapezoid", vec![gen_read_stats(1, key_ranges)], vec![], @@ -1133,15 +1141,63 @@ mod tests { build_key_range(b"a", b"l", false), build_key_range(b"b", b"l", false), ]; - check_split( + check_split_key( b"right-angle trapezoid", vec![gen_read_stats(1, key_ranges)], vec![], ); } + + // test high CPU usage + fail::cfg("mock_grpc_poll_is_not_busy", "return(0)").unwrap(); + fail::cfg("mock_unified_read_pool_is_busy", "return(0)").unwrap(); + fail::cfg("mock_region_is_busy", "return(0)").unwrap(); + for _ in 0..100 { + let key_ranges = vec![ + build_key_range(b"a", b"l", false), + build_key_range(b"a", b"m", false), + ]; + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges.clone(), vec![100, 200])], + b"a", + b"m", + ); + check_split_key_range( 
+ b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges, vec![200, 100])], + b"a", + b"l", + ); + + let key_ranges = vec![ + build_key_range(b"a", b"l", false), + build_key_range(b"b", b"l", false), + ]; + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges.clone(), vec![100, 200])], + b"b", + b"l", + ); + check_split_key_range( + b"right-angle trapezoid with high CPU usage", + vec![gen_read_stats(1, key_ranges.clone())], + vec![gen_cpu_stats(1, key_ranges, vec![200, 100])], + b"a", + b"l", + ); + } + fail::remove("mock_grpc_poll_is_not_busy"); + fail::remove("mock_unified_read_pool_is_busy"); + fail::remove("mock_region_is_busy"); } - fn check_split(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { + fn check_split_key(mode: &[u8], qps_stats: Vec, split_keys: Vec<&[u8]>) { + let mode = String::from_utf8(Vec::from(mode)).unwrap(); let mut hub = AutoSplitController::default(); hub.cfg.qps_threshold = 1; hub.cfg.sample_threshold = 0; @@ -1149,33 +1205,95 @@ mod tests { for i in 0..10 { let (_, split_infos) = hub.flush(qps_stats.clone(), vec![], &ThreadInfoStatistics::default()); - if (i + 1) % hub.cfg.detect_times == 0 { - assert_eq!( - split_infos.len(), - split_keys.len(), - "mode: {:?}", - String::from_utf8(Vec::from(mode)).unwrap() - ); - for obtain in &split_infos { - let mut equal = false; - for expect in &split_keys { - if obtain.split_key.as_ref().unwrap().cmp(&expect.to_vec()) - == Ordering::Equal - { - equal = true; - break; - } + if (i + 1) % hub.cfg.detect_times != 0 { + continue; + } + // Check the split key. 
+ assert_eq!(split_infos.len(), split_keys.len(), "mode: {:?}", mode); + for obtain in &split_infos { + let mut equal = false; + for expect in &split_keys { + if obtain.split_key.as_ref().unwrap().cmp(&expect.to_vec()) == Ordering::Equal { + equal = true; + break; } - assert!( - equal, - "mode: {:?}", - String::from_utf8(Vec::from(mode)).unwrap() - ); } + assert!(equal, "mode: {:?}", mode); } } } + fn check_split_key_range( + mode: &[u8], + qps_stats: Vec, + cpu_stats: Vec>, + start_key: &[u8], + end_key: &[u8], + ) { + let mode = String::from_utf8(Vec::from(mode)).unwrap(); + let mut hub = AutoSplitController::default(); + hub.cfg.qps_threshold = 1; + hub.cfg.sample_threshold = 0; + + for i in 0..10 { + let (_, split_infos) = hub.flush( + qps_stats.clone(), + cpu_stats.clone(), + &ThreadInfoStatistics::default(), + ); + if (i + 1) % hub.cfg.detect_times != 0 { + continue; + } + assert_eq!(split_infos.len(), 1, "mode: {:?}", mode); + // Check the split key range. + let split_info = &split_infos[0]; + assert!(split_info.split_key.is_none(), "mode: {:?}", mode); + assert_eq!( + split_info + .start_key + .as_ref() + .unwrap() + .cmp(&start_key.to_vec()), + Ordering::Equal, + "mode: {:?}", + mode + ); + assert_eq!( + split_info.end_key.as_ref().unwrap().cmp(&end_key.to_vec()), + Ordering::Equal, + "mode: {:?}", + mode + ); + } + } + + fn gen_cpu_stats( + region_id: u64, + key_ranges: Vec, + cpu_times: Vec, + ) -> Arc { + let mut raw_records = RawRecords::default(); + raw_records.duration = Duration::from_millis(100); + for (idx, key_range) in key_ranges.iter().enumerate() { + let key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id, + peer_id: 0, + key_ranges: vec![(key_range.start_key.clone(), key_range.end_key.clone())], + extra_attachment: vec![], + }); + raw_records.records.insert( + key_range_tag.clone(), + RawRecord { + cpu_time: cpu_times[idx], + read_keys: 0, + write_keys: 0, + }, + ); + } + Arc::new(raw_records) + } + #[test] fn test_sample_key_num() 
{ let mut hub = AutoSplitController::default(); @@ -1563,6 +1681,120 @@ mod tests { split_cfg_manager.dispatch(config_change).unwrap(); } + #[test] + fn test_collect_cpu_stats() { + let auto_split_controller = AutoSplitController::default(); + let region_cpu_map = auto_split_controller.collect_cpu_stats(vec![]); + assert!(region_cpu_map.is_empty()); + + let ab_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![(b"a".to_vec(), b"b".to_vec())], + extra_attachment: vec![], + }); + let cd_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![(b"c".to_vec(), b"d".to_vec())], + extra_attachment: vec![], + }); + let multiple_key_ranges_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![ + (b"a".to_vec(), b"b".to_vec()), + (b"c".to_vec(), b"d".to_vec()), + ], + extra_attachment: vec![], + }); + let empty_key_range_tag = Arc::new(TagInfos { + store_id: 0, + region_id: 1, + peer_id: 0, + key_ranges: vec![], + extra_attachment: vec![], + }); + + let test_cases = vec![ + (300, 150, 50, 50, Some(build_key_range(b"a", b"b", false))), + (150, 300, 50, 50, Some(build_key_range(b"c", b"d", false))), + (150, 50, 300, 50, Some(build_key_range(b"a", b"b", false))), + (50, 150, 300, 50, Some(build_key_range(b"c", b"d", false))), + (150, 50, 50, 300, Some(build_key_range(b"a", b"b", false))), + (100, 0, 0, 0, Some(build_key_range(b"a", b"b", false))), + (50, 0, 0, 50, Some(build_key_range(b"a", b"b", false))), + (50, 0, 0, 100, Some(build_key_range(b"a", b"b", false))), + (50, 0, 50, 0, Some(build_key_range(b"a", b"b", false))), + (0, 50, 50, 0, Some(build_key_range(b"c", b"d", false))), + (0, 0, 0, 100, None), + (0, 0, 0, 0, None), + ]; + for (i, test_case) in test_cases.iter().enumerate() { + let mut raw_records = RawRecords::default(); + raw_records.duration = Duration::from_millis(100); + // ["a", "b"] with (test_case.0)ms CPU time. 
+ raw_records.records.insert( + ab_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.0, + read_keys: 0, + write_keys: 0, + }, + ); + // ["c", "d"] with (test_case.1)ms CPU time. + raw_records.records.insert( + cd_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.1, + read_keys: 0, + write_keys: 0, + }, + ); + // Multiple key ranges with (test_case.2)ms CPU time. + raw_records.records.insert( + multiple_key_ranges_tag.clone(), + RawRecord { + cpu_time: test_case.2, + read_keys: 0, + write_keys: 0, + }, + ); + // Empty key range with (test_case.3)ms CPU time. + raw_records.records.insert( + empty_key_range_tag.clone(), + RawRecord { + cpu_time: test_case.3, + read_keys: 0, + write_keys: 0, + }, + ); + let region_cpu_map = + auto_split_controller.collect_cpu_stats(vec![Arc::new(raw_records)]); + assert_eq!( + region_cpu_map.len(), + 1, + "test_collect_cpu_stats case: {}", + i + ); + assert_eq!( + region_cpu_map.get(&1).unwrap().0, + (test_case.0 + test_case.1 + test_case.2 + test_case.3) as f64 / 100.0, + "test_collect_cpu_stats case: {}", + i + ); + assert_eq!( + region_cpu_map.get(&1).unwrap().1, + test_case.4, + "test_collect_cpu_stats case: {}", + i + ); + } + } + #[bench] fn samples_evaluate(b: &mut test::Bencher) { let mut samples = Samples(vec![Sample::new(b"c")]); From fc49bdf8694c629184c2b512ced9390a56641b1a Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 8 Jul 2022 13:07:03 +0800 Subject: [PATCH 063/676] debug: Parameterize debug service (#12960) ref tikv/tikv#12849 Parameterize debug service Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- src/server/service/debug.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index 160daa1178b..740e597e5e2 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,7 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_rocks::RocksEngine; -use engine_traits::{Engines, MiscExt, RaftEngine}; +use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; use futures::{ channel::oneshot, future::{Future, FutureExt, TryFutureExt}, @@ -53,25 +53,27 @@ fn error_to_grpc_error(tag: &'static str, e: Error) -> GrpcError { /// Service handles the RPC messages for the `Debug` service. #[derive(Clone)] -pub struct Service> { +pub struct Service> { pool: Handle, debugger: Debugger, raft_router: T, + _phantom: std::marker::PhantomData, } -impl> Service { +impl> Service { /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a `GcWorker`. pub fn new( engines: Engines, pool: Handle, raft_router: T, cfg_controller: ConfigController, - ) -> Service { + ) -> Service { let debugger = Debugger::new(engines, cfg_controller); Service { pool, debugger, raft_router, + _phantom: Default::default(), } } @@ -96,7 +98,9 @@ impl> Service { } } -impl + 'static> debugpb::Debug for Service { +impl + 'static> debugpb::Debug + for Service +{ fn get(&mut self, ctx: RpcContext<'_>, mut req: GetRequest, sink: UnarySink) { const TAG: &str = "debug_get"; @@ -532,7 +536,7 @@ impl + 'static> debugpb::Debug f } } -fn region_detail>( +fn region_detail>( raft_router: T, region_id: u64, store_id: u64, @@ -573,7 +577,7 @@ fn region_detail>( } } -fn consistency_check>( +fn consistency_check>( raft_router: T, mut detail: RegionDetailResponse, ) -> impl Future> { From b4bccd7a58faa775d1d7ec7e6b60201bb5ebc6f7 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 8 Jul 2022 05:45:04 -0700 Subject: [PATCH 064/676] raftstorev2: support building store batch system (#12921) ref tikv/tikv#12842 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 4 +- components/engine_panic/src/raft_engine.rs | 8 + components/engine_rocks/src/raft_engine.rs | 28 ++ components/engine_traits/src/raft_engine.rs | 8 + components/raft_log_engine/src/engine.rs | 8 + components/raftstore-v2/Cargo.toml | 2 + 
components/raftstore-v2/src/batch/apply.rs | 4 + components/raftstore-v2/src/batch/mod.rs | 11 + components/raftstore-v2/src/batch/store.rs | 337 ++++++++++++++++++++ components/raftstore-v2/src/fsm/mod.rs | 3 + components/raftstore-v2/src/fsm/peer.rs | 105 +++++- components/raftstore-v2/src/fsm/store.rs | 64 +++- components/raftstore-v2/src/lib.rs | 3 + components/raftstore-v2/src/raft/peer.rs | 93 ++++-- components/raftstore-v2/src/raft/storage.rs | 108 ++++++- components/raftstore/src/store/fsm/peer.rs | 3 + components/raftstore/src/store/fsm/store.rs | 32 +- components/tikv_util/src/lib.rs | 10 + src/coprocessor/endpoint.rs | 24 +- src/coprocessor/tracker.rs | 6 +- src/server/service/kv.rs | 14 +- 21 files changed, 788 insertions(+), 87 deletions(-) create mode 100644 components/raftstore-v2/src/batch/apply.rs create mode 100644 components/raftstore-v2/src/batch/mod.rs create mode 100644 components/raftstore-v2/src/batch/store.rs diff --git a/Cargo.lock b/Cargo.lock index 1dfb74e3b13..bdb55d28de2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2623,7 +2623,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#acfe326c7cb2bdcdbfc991cada1973a68f34836f" +source = "git+https://github.com/pingcap/kvproto.git#a5d4ffd2ba337dad0bc99e9fb53bf665864a3f3b" dependencies = [ "futures 0.3.15", "grpcio", @@ -4168,12 +4168,14 @@ dependencies = [ name = "raftstore-v2" version = "0.1.0" dependencies = [ + "batch-system", "collections", "crossbeam", "engine_test", "engine_traits", "error_code", "fail", + "futures-util", "kvproto", "pd_client", "raft", diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 384bc60ffa6..d6f82c7f646 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -141,6 +141,14 @@ impl RaftEngine for PanicEngine { fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { panic!() } + + fn 
for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + panic!() + } } impl RaftLogBatch for PanicWriteBatch { diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 57a65ba661f..19ceea3062c 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -338,6 +338,34 @@ impl RaftEngine for RocksEngine { fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { self.put_msg(keys::STORE_IDENT_KEY, ident) } + + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut err = None; + self.scan(start_key, end_key, false, |key, _| { + let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + + match f(region_id) { + Ok(()) => Ok(true), + Err(e) => { + err = Some(e); + Ok(false) + } + } + })?; + match err { + None => Ok(()), + Some(e) => Err(e), + } + } } impl RaftLogBatch for RocksWriteBatch { diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index e119184c556..7773ee3245f 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -138,6 +138,14 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn dump_stats(&self) -> Result; fn get_engine_size(&self) -> Result; + + /// Visit all available raft groups. + /// + /// If any error is returned, the iteration will stop. 
+ fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From; } pub trait RaftLogBatch: Send { diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 8c9a7fd2b88..9236e7947db 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -548,6 +548,14 @@ impl RaftEngine for RaftLogEngine { fn get_engine_size(&self) -> Result { Ok(self.0.get_used_size() as u64) } + + fn for_each_raft_group(&self, _f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + unimplemented!() + } } fn transfer_error(e: RaftEngineError) -> engine_traits::Error { diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 100a2be409d..5cdd2ee747f 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -29,11 +29,13 @@ cloud-gcp = ["raftstore/cloud-gcp"] cloud-azure = ["raftstore/cloud-azure"] [dependencies] +batch-system = { path = "../batch-system", default-features = false } collections = { path = "../collections" } crossbeam = "0.8" engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } fail = "0.5" +futures-util = { version = "0.3", features = ["compat"] } kvproto = { git = "https://github.com/pingcap/kvproto.git" } pd_client = { path = "../pd_client" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/raftstore-v2/src/batch/apply.rs b/components/raftstore-v2/src/batch/apply.rs new file mode 100644 index 00000000000..a7e392127d5 --- /dev/null +++ b/components/raftstore-v2/src/batch/apply.rs @@ -0,0 +1,4 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +/// Batch system for applying logs pipeline. 
+pub struct ApplySystem; diff --git a/components/raftstore-v2/src/batch/mod.rs b/components/raftstore-v2/src/batch/mod.rs new file mode 100644 index 00000000000..e856147220d --- /dev/null +++ b/components/raftstore-v2/src/batch/mod.rs @@ -0,0 +1,11 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the specialized implementation of batch systems. +//! +//! StoreSystem is used for polling raft state machines, ApplySystem is used for +//! applying logs. + +mod apply; +mod store; + +pub use store::{create_store_batch_system, StoreContext, StoreSystem}; diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs new file mode 100644 index 00000000000..6a8974259ff --- /dev/null +++ b/components/raftstore-v2/src/batch/store.rs @@ -0,0 +1,337 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{mem, ops::DerefMut, sync::Arc, time::Duration}; + +use batch_system::{ + BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, +}; +use collections::HashMap; +use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; +use futures_util::{compat::Future01CompatExt, FutureExt}; +use kvproto::{metapb::Store, raft_serverpb::PeerState}; +use raftstore::store::{fsm::store::PeerTickBatch, Config, Transport}; +use slog::Logger; +use tikv_util::{ + box_err, + config::{Tracker, VersionTrack}, + future::poll_future_notify, + time::Instant as TiInstant, + timer::SteadyTimer, +}; + +use crate::{ + fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate}, + raft::Peer, + Error, PeerMsg, PeerTick, Result, StoreMsg, +}; + +/// A per thread context used for handling raft messages. +pub struct StoreContext { + /// A logger without any KV. It's clean for creating new PeerFSM. + pub logger: Logger, + /// The transport for sending messages to peers on other stores. + pub trans: T, + /// The latest configuration. 
+ pub cfg: Config, + /// The tick batch for delay ticking. It will be flushed at the end of every round. + pub tick_batch: Vec, + /// The precise timer for scheduling tick. + pub timer: SteadyTimer, +} + +impl StoreContext { + fn new(cfg: Config, trans: T, logger: Logger) -> Self { + Self { + logger, + trans, + cfg, + tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], + timer: SteadyTimer::default(), + } + } +} + +/// Poller for polling raft state machines. +struct StorePoller { + store_msg_buf: Vec, + peer_msg_buf: Vec>, + poll_ctx: StoreContext, + cfg_tracker: Tracker, + last_flush_time: TiInstant, + need_flush_events: bool, +} + +impl StorePoller { + pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { + let mut poller = Self { + store_msg_buf: Vec::new(), + peer_msg_buf: Vec::new(), + poll_ctx, + cfg_tracker, + last_flush_time: TiInstant::now(), + need_flush_events: false, + }; + poller.apply_buf_capacity(); + poller + } + + /// Updates the internal buffer to latest capacity. 
+ fn apply_buf_capacity(&mut self) { + let new_cap = self.messages_per_tick(); + tikv_util::set_vec_capacity(&mut self.store_msg_buf, new_cap); + tikv_util::set_vec_capacity(&mut self.peer_msg_buf, new_cap); + } + + #[inline] + fn messages_per_tick(&self) -> usize { + self.poll_ctx.cfg.messages_per_tick + } + + fn flush_events(&mut self) { + self.schedule_ticks(); + } + + fn schedule_ticks(&mut self) { + assert_eq!( + PeerTick::get_all_ticks().len(), + self.poll_ctx.tick_batch.len() + ); + for batch in &mut self.poll_ctx.tick_batch { + batch.schedule(&self.poll_ctx.timer); + } + } +} + +impl PollHandler, StoreFsm> + for StorePoller +{ + fn begin(&mut self, _batch_size: usize, update_cfg: F) + where + for<'a> F: FnOnce(&'a batch_system::Config), + { + let cfg = self.cfg_tracker.any_new().map(|c| c.clone()); + if let Some(cfg) = cfg { + if cfg.messages_per_tick != self.messages_per_tick() { + self.apply_buf_capacity(); + } + self.poll_ctx.cfg = cfg; + } + } + + fn handle_control(&mut self, store: &mut StoreFsm) -> Option { + let received_cnt = store.recv(&mut self.store_msg_buf); + let expected_msg_count = if received_cnt == self.messages_per_tick() { + None + } else { + Some(0) + }; + let mut delegate = StoreFsmDelegate::new(store, &mut self.poll_ctx); + delegate.handle_msgs(&mut self.store_msg_buf); + expected_msg_count + } + + fn handle_normal( + &mut self, + peer: &mut impl DerefMut>, + ) -> HandleResult { + let received_cnt = peer.recv(&mut self.peer_msg_buf); + let handle_result = if received_cnt == self.messages_per_tick() { + HandleResult::KeepProcessing + } else { + HandleResult::stop_at(0, false) + }; + let mut delegate = PeerFsmDelegate::new(peer, &mut self.poll_ctx); + delegate.handle_msgs(&mut self.peer_msg_buf); + handle_result + } + + fn light_end(&mut self, _batch: &mut [Option>>]) { + if self.poll_ctx.trans.need_flush() { + self.poll_ctx.trans.flush(); + } + + let now = TiInstant::now(); + if now.saturating_duration_since(self.last_flush_time) >= 
Duration::from_millis(1) { + self.last_flush_time = now; + self.need_flush_events = false; + self.flush_events(); + } else { + self.need_flush_events = true; + } + } + + fn end(&mut self, batch: &mut [Option>>]) {} + + fn pause(&mut self) { + if self.poll_ctx.trans.need_flush() { + self.poll_ctx.trans.flush(); + } + + if self.need_flush_events { + self.last_flush_time = TiInstant::now(); + self.need_flush_events = false; + self.flush_events(); + } + } +} + +struct StorePollerBuilder { + cfg: Arc>, + store_id: u64, + engine: ER, + tablet_factory: Arc>, + trans: T, + logger: Logger, +} + +impl StorePollerBuilder { + pub fn new( + cfg: Arc>, + store_id: u64, + engine: ER, + tablet_factory: Arc>, + trans: T, + logger: Logger, + ) -> Self { + StorePollerBuilder { + cfg, + store_id, + engine, + tablet_factory, + trans, + logger, + } + } + + /// Init all the existing raft machine and cleanup stale tablets. + fn init(&self) -> Result>> { + let mut regions = HashMap::default(); + let cfg = self.cfg.value(); + self.engine + .for_each_raft_group::(&mut |region_id| { + let peer = match Peer::new( + &cfg, + region_id, + self.store_id, + self.tablet_factory.as_ref(), + self.engine.clone(), + &self.logger, + )? { + Some(peer) => peer, + None => return Ok(()), + }; + let pair = PeerFsm::new(&cfg, peer)?; + let prev = regions.insert(region_id, pair); + if let Some((_, p)) = prev { + return Err(box_err!( + "duplicate region {:?} vs {:?}", + p.logger().list(), + regions[®ion_id].1.logger().list() + )); + } + Ok(()) + })?; + self.clean_up_tablets(®ions)?; + Ok(regions) + } + + fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { + // TODO: list all available tablets and destroy those which are not in the peers. 
+ Ok(()) + } +} + +impl HandlerBuilder, StoreFsm> for StorePollerBuilder +where + ER: RaftEngine, + EK: KvEngine, + T: Transport + 'static, +{ + type Handler = StorePoller; + + fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { + let poll_ctx = StoreContext::new( + self.cfg.value().clone(), + self.trans.clone(), + self.logger.clone(), + ); + let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); + StorePoller::new(poll_ctx, cfg_tracker) + } +} + +/// The system used for poll raft activities. +pub struct StoreSystem { + system: BatchSystem, StoreFsm>, + logger: Logger, +} + +impl StoreSystem { + pub fn start( + &mut self, + store: Store, + cfg: Arc>, + raft_engine: ER, + tablet_factory: Arc>, + trans: T, + router: &StoreRouter, + ) -> Result<()> + where + T: Transport + 'static, + { + let mut builder = StorePollerBuilder::new( + cfg, + store.get_id(), + raft_engine, + tablet_factory, + trans, + self.logger.clone(), + ); + let peers = builder.init()?; + // Choose a different name so we know what version is actually used. rs stands + // for raft store. + let tag = format!("rs-{}", store.get_id()); + self.system.spawn(tag, builder); + + let mut mailboxes = Vec::with_capacity(peers.len()); + let mut address = Vec::with_capacity(peers.len()); + for (region_id, (tx, fsm)) in peers { + address.push(region_id); + mailboxes.push(( + region_id, + BasicMailbox::new(tx, fsm, router.state_cnt().clone()), + )); + } + router.register_all(mailboxes); + + // Make sure Msg::Start is the first message each FSM received. + for addr in address { + router.force_send(addr, PeerMsg::Start).unwrap(); + } + router.send_control(StoreMsg::Start { store }).unwrap(); + Ok(()) + } + + pub fn shutdown(&mut self) { + self.system.shutdown(); + } +} + +pub type StoreRouter = BatchRouter, StoreFsm>; + +/// Create the batch system for polling raft activities. 
+pub fn create_store_batch_system( + cfg: &Config, + store: Store, + logger: Logger, +) -> (StoreRouter, StoreSystem) +where + EK: KvEngine, + ER: RaftEngine, +{ + let (store_tx, store_fsm) = StoreFsm::new(cfg, store); + let (router, system) = + batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + let system = StoreSystem { system, logger }; + (router, system) +} diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 60c84984793..9f3bcefac46 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -8,3 +8,6 @@ mod apply; mod peer; mod store; + +pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; +pub use store::{StoreFsm, StoreFsmDelegate}; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 5eaacf3e200..8187575d658 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -1,22 +1,111 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::RaftEngine; +use std::borrow::Cow; + +use batch_system::{BasicMailbox, Fsm}; +use crossbeam::channel::TryRecvError; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::metapb; use raftstore::store::Config; -use slog::Logger; +use slog::{info, Logger}; +use tikv_util::mpsc::{self, LooseBoundedSender, Receiver, Sender}; + +use crate::{batch::StoreContext, raft::Peer, PeerMsg, Result}; -use crate::{raft::Peer, Result}; +pub type SenderFsmPair = (LooseBoundedSender>, Box>); -pub struct PeerFsm { - peer: Peer, +pub struct PeerFsm { + peer: Peer, + logger: Logger, + mailbox: Option>>, + receiver: Receiver>, + is_stopped: bool, } -impl PeerFsm { - pub fn new(peer: Peer) -> Result { - Ok(PeerFsm { peer }) +impl PeerFsm { + pub fn new(cfg: &Config, peer: Peer) -> Result> { + let logger = peer.logger().clone(); + info!(logger, "create peer"); + let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); + let fsm = Box::new(PeerFsm { + logger, + peer, + mailbox: None, + receiver: rx, + is_stopped: false, + }); + Ok((tx, fsm)) + } + + #[inline] + pub fn peer(&self) -> &Peer { + &self.peer } + #[inline] pub fn logger(&self) -> &Logger { self.peer.logger() } + + /// Fetches messages to `peer_msg_buf`. It will stop when the buffer is full. + /// + /// Returns how many messages are fetched. + pub fn recv(&mut self, peer_msg_buf: &mut Vec>) -> usize { + let l = peer_msg_buf.len(); + for i in l..peer_msg_buf.capacity() { + match self.receiver.try_recv() { + Ok(msg) => peer_msg_buf.push(msg), + Err(e) => { + if let TryRecvError::Disconnected = e { + self.is_stopped = true; + } + return i - l; + } + } + } + peer_msg_buf.capacity() - l + } +} + +impl Fsm for PeerFsm { + type Message = PeerMsg; + + #[inline] + fn is_stopped(&self) -> bool { + self.is_stopped + } + + /// Set a mailbox to Fsm, which should be used to send message to itself. 
+ fn set_mailbox(&mut self, mailbox: Cow<'_, BasicMailbox>) + where + Self: Sized, + { + self.mailbox = Some(mailbox.into_owned()); + } + + /// Take the mailbox from Fsm. Implementation should ensure there will be + /// no reference to mailbox after calling this method. + fn take_mailbox(&mut self) -> Option> + where + Self: Sized, + { + self.mailbox.take() + } +} + +pub struct PeerFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { + fsm: &'a mut PeerFsm, + store_ctx: &'a mut StoreContext, +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { + pub fn new(fsm: &'a mut PeerFsm, store_ctx: &'a mut StoreContext) -> Self { + Self { fsm, store_ctx } + } + + pub fn handle_msgs(&self, peer_msgs_buf: &mut Vec>) { + for msg in peer_msgs_buf.drain(..) { + // TODO: handle the messages. + } + } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index b568454e2c9..091b3fe11e9 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,3 +1,65 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -pub struct StoreFsm {} +use batch_system::Fsm; +use crossbeam::channel::TryRecvError; +use kvproto::metapb::Store; +use raftstore::store::Config; +use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; + +use crate::{batch::StoreContext, StoreMsg}; + +pub struct StoreFsm { + store: Store, + receiver: Receiver, +} + +impl StoreFsm { + pub fn new(cfg: &Config, store: Store) -> (LooseBoundedSender, Box) { + let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); + let fsm = Box::new(StoreFsm { + store, + receiver: rx, + }); + (tx, fsm) + } + + /// Fetches messages to `store_msg_buf`. It will stop when the buffer is full. + /// + /// Returns how many messages are fetched. 
+ pub fn recv(&self, store_msg_buf: &mut Vec) -> usize { + let l = store_msg_buf.len(); + for i in l..store_msg_buf.capacity() { + match self.receiver.try_recv() { + Ok(msg) => store_msg_buf.push(msg), + Err(_) => return i - l, + } + } + store_msg_buf.capacity() - l + } +} + +impl Fsm for StoreFsm { + type Message = StoreMsg; + + #[inline] + fn is_stopped(&self) -> bool { + false + } +} + +pub struct StoreFsmDelegate<'a, T> { + fsm: &'a mut StoreFsm, + store_ctx: &'a mut StoreContext, +} + +impl<'a, T> StoreFsmDelegate<'a, T> { + pub fn new(fsm: &'a mut StoreFsm, store_ctx: &'a mut StoreContext) -> Self { + Self { fsm, store_ctx } + } + + pub fn handle_msgs(&self, store_msg_buf: &mut Vec) { + for msg in store_msg_buf.drain(..) { + // TODO: handle the messages. + } + } +} diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index fac4511cfd4..220fa0b2d33 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -10,12 +10,15 @@ #![allow(unused)] +mod batch; mod bootstrap; mod fsm; mod operation; mod raft; mod router; +pub(crate) use batch::StoreContext; +pub use batch::{create_store_batch_system, StoreSystem}; pub use bootstrap::Bootstrap; pub use raftstore::{Error, Result}; pub use router::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 4af2c1ccddb..e2ccb068cbc 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,9 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::RaftEngine; -use kvproto::metapb; -use raft::RawNode; -use raftstore::store::Config; +use std::sync::Arc; + +use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use kvproto::{metapb, raft_serverpb::RegionLocalState}; +use raft::{RawNode, INVALID_ID}; +use raftstore::store::{util::find_peer, Config}; use slog::{o, Logger}; use tikv_util::{box_err, config::ReadableSize}; @@ -11,37 +13,35 @@ use super::storage::Storage; use crate::Result; /// A peer that delegates commands between state machine and raft. -pub struct Peer { - region_id: u64, - peer: metapb::Peer, +pub struct Peer { raft_group: RawNode>, + tablet: Option, logger: Logger, } -impl Peer { +impl Peer { + /// Creates a new peer. + /// + /// If peer is destroyed, None is returned. pub fn new( cfg: &Config, + region_id: u64, store_id: u64, - region: metapb::Region, + tablet_factory: &dyn TabletFactory, engine: ER, - logger: Logger, - ) -> Result { - let peer = region - .get_peers() - .iter() - .find(|p| p.get_store_id() == store_id && p.get_id() != raft::INVALID_ID); - let peer = match peer { - Some(p) => p, - None => return Err(box_err!("no valid peer found in {:?}", region.get_peers())), + logger: &Logger, + ) -> Result> { + let s = match Storage::new(region_id, store_id, engine, logger)? 
{ + Some(s) => s, + None => return Ok(None), }; - let l = logger.new(o!("peer_id" => peer.id)); - - let ps = Storage::new(engine, l.clone()); + let logger = s.logger().clone(); - let applied_index = ps.applied_index(); + let applied_index = s.apply_state().get_applied_index(); + let peer_id = s.peer().get_id(); let raft_cfg = raft::Config { - id: peer.get_id(), + id: peer_id, election_tick: cfg.raft_election_timeout_ticks, heartbeat_tick: cfg.raft_heartbeat_ticks, min_election_tick: cfg.raft_min_election_timeout_ticks, @@ -56,14 +56,49 @@ impl Peer { ..Default::default() }; - Ok(Peer { - region_id: region.get_id(), - peer: peer.clone(), - raft_group: RawNode::new(&raft_cfg, ps, &logger)?, - logger: l, - }) + let tablet_index = s.region_state().get_tablet_index(); + let tablet = if tablet_index != 0 { + if !tablet_factory.exists(region_id, tablet_index) { + return Err(box_err!( + "missing tablet {} for region {}", + tablet_index, + region_id + )); + } + // TODO: Perhaps we should stop create the tablet automatically. + Some(tablet_factory.open_tablet(region_id, tablet_index)?) 
+ } else { + None + }; + + Ok(Some(Peer { + raft_group: RawNode::new(&raft_cfg, s, &logger)?, + tablet, + logger, + })) + } + + #[inline] + pub fn region_id(&self) -> u64 { + self.raft_group.store().region_state().get_region().get_id() + } + + #[inline] + pub fn peer_id(&self) -> u64 { + self.raft_group.store().peer().get_id() + } + + #[inline] + pub fn storage(&self) -> &Storage { + self.raft_group.store() + } + + #[inline] + pub fn tablet(&self) -> &Option { + &self.tablet } + #[inline] pub fn logger(&self) -> &Logger { &self.logger } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index f6dcad9578c..fc25e12bad3 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -2,23 +2,25 @@ use engine_traits::{RaftEngine, RaftLogBatch}; use kvproto::{ - metapb::Region, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, + metapb::{self, Region}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{Entry, Snapshot}, - GetEntriesContext, RaftState, + GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; -use slog::Logger; +use raftstore::store::{util::find_peer, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use slog::{o, Logger}; +use tikv_util::box_err; -use crate::Result; +use crate::{Error, Result}; pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { let region_id = region.get_id(); let mut state = RegionLocalState::default(); state.set_region(region); + state.set_tablet_index(RAFT_INIT_LOG_INDEX); wb.put_region_state(region_id, &state)?; let mut apply_state = RaftApplyState::default(); @@ -41,19 +43,104 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul } /// A storage for raft. +/// +/// It's similar to `PeerStorage` in v1. 
+#[derive(Debug)] pub struct Storage { engine: ER, + peer: metapb::Peer, + region_state: RegionLocalState, + raft_state: RaftLocalState, + apply_state: RaftApplyState, logger: Logger, } -impl Storage { - pub fn new(engine: ER, logger: Logger) -> Storage { - Storage { engine, logger } +impl Storage { + /// Creates a new storage. + /// + /// All metadata should be initialized before calling this method. If the region is destroyed + /// `None` will be returned. + pub fn new( + region_id: u64, + store_id: u64, + engine: ER, + logger: &Logger, + ) -> Result>> { + let region_state = match engine.get_region_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get region state: {:?}", res)); + } + }; + + if region_state.get_state() == PeerState::Tombstone { + return Ok(None); + } + + let peer = find_peer(region_state.get_region(), store_id); + let peer = match peer { + Some(p) if p.get_id() != INVALID_ID => p, + _ => { + return Err(box_err!("no valid peer found in {:?}", region_state)); + } + }; + + let logger = logger.new(o!("region_id" => region_id, "peer_id" => peer.get_id())); + + let raft_state = match engine.get_raft_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get raft state: {:?}", res)); + } + }; + + let apply_state = match engine.get_apply_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get apply state: {:?}", res)); + } + }; + + let mut s = Storage { + engine, + peer: peer.clone(), + region_state, + raft_state, + apply_state, + logger, + }; + s.validate_state()?; + Ok(Some(s)) } - pub fn applied_index(&self) -> u64 { + fn validate_state(&mut self) -> Result<()> { unimplemented!() } + + #[inline] + pub fn region_state(&self) -> &RegionLocalState { + &self.region_state + } + + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + + #[inline] + pub fn apply_state(&self) -> &RaftApplyState { + &self.apply_state + } + + #[inline] + pub 
fn peer(&self) -> &metapb::Peer { + &self.peer + } + + #[inline] + pub fn logger(&self) -> &Logger { + &self.logger + } } impl raft::Storage for Storage { @@ -120,6 +207,7 @@ mod tests { let local_state = raft_engine.get_region_state(4).unwrap().unwrap(); assert_eq!(local_state.get_state(), PeerState::Normal); assert_eq!(*local_state.get_region(), region); + assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); assert_eq!(raft_state.get_last_index(), RAFT_INIT_LOG_INDEX); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 02c8d4fe650..fad93ac54d8 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2488,6 +2488,9 @@ where ExtraMessageType::MsgHibernateResponse => { self.on_hibernate_response(msg.get_from_peer()); } + ExtraMessageType::MsgRejectRaftLogCausedByMemoryUsage => { + unimplemented!() + } } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 63b0a583030..c46cafb7e48 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -424,6 +424,22 @@ pub struct PeerTickBatch { pub wait_duration: Duration, } +impl PeerTickBatch { + #[inline] + pub fn schedule(&mut self, timer: &SteadyTimer) { + if self.ticks.is_empty() { + return; + } + let peer_ticks = mem::take(&mut self.ticks); + let f = timer.delay(self.wait_duration).compat().map(move |_| { + for tick in peer_ticks { + tick(); + } + }); + poll_future_notify(f); + } +} + impl Clone for PeerTickBatch { fn clone(&self) -> PeerTickBatch { PeerTickBatch { @@ -760,21 +776,7 @@ impl RaftPoller { fn flush_ticks(&mut self) { for t in PeerTick::get_all_ticks() { let idx = *t as usize; - if self.poll_ctx.tick_batch[idx].ticks.is_empty() { - continue; - } - let peer_ticks = mem::take(&mut self.poll_ctx.tick_batch[idx].ticks); - 
let f = self - .poll_ctx - .timer - .delay(self.poll_ctx.tick_batch[idx].wait_duration) - .compat() - .map(move |_| { - for tick in peer_ticks { - tick(); - } - }); - poll_future_notify(f); + self.poll_ctx.tick_batch[idx].schedule(&self.poll_ctx.timer); } } } diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 8445a0a97aa..9b3e38aa9cc 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -9,6 +9,7 @@ extern crate test; use std::{ + cmp, collections::{ hash_map::Entry, vec_deque::{Iter, VecDeque}, @@ -584,6 +585,15 @@ pub fn build_on_master_branch() -> bool { option_env!("TIKV_BUILD_GIT_BRANCH").map_or(false, |b| "master" == b) } +/// Set the capacity of a vector to the given capacity. +pub fn set_vec_capacity(v: &mut Vec, cap: usize) { + match cap.cmp(&v.capacity()) { + cmp::Ordering::Less => v.shrink_to(cap), + cmp::Ordering::Greater => v.reserve_exact(cap - v.len()), + cmp::Ordering::Equal => {} + } +} + #[cfg(test)] mod tests { use std::{ diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 9f2507562e6..918d348f898 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1290,19 +1290,19 @@ mod tests { use tikv_util::config::ReadableDuration; /// Asserted that the snapshot can be retrieved in 500ms. - const SNAPSHOT_DURATION_MS: i64 = 500; + const SNAPSHOT_DURATION_MS: u64 = 500; /// Asserted that the delay caused by OS scheduling other tasks is smaller than 200ms. /// This is mostly for CI. - const HANDLE_ERROR_MS: i64 = 200; + const HANDLE_ERROR_MS: u64 = 200; /// The acceptable error range for a coarse timer. Note that we use CLOCK_MONOTONIC_COARSE /// which can be slewed by time adjustment code (e.g., NTP, PTP). - const COARSE_ERROR_MS: i64 = 50; + const COARSE_ERROR_MS: u64 = 50; /// The duration that payload executes. 
- const PAYLOAD_SMALL: i64 = 3000; - const PAYLOAD_LARGE: i64 = 6000; + const PAYLOAD_SMALL: u64 = 3000; + const PAYLOAD_LARGE: u64 = 6000; let engine = TestEngineBuilder::new().build().unwrap(); @@ -1339,7 +1339,7 @@ mod tests { req_with_exec_detail.context.set_record_time_stat(true); { - let mut wait_time: i64 = 0; + let mut wait_time: u64 = 0; // Request 1: Unary, success response. let handler_builder = Box::new(|_, _: &_| { @@ -1388,7 +1388,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_wait_wall_time_ms(), - wait_time - HANDLE_ERROR_MS - COARSE_ERROR_MS + wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1417,7 +1417,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_wait_wall_time_ms(), - wait_time - HANDLE_ERROR_MS - COARSE_ERROR_MS + wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1504,7 +1504,7 @@ mod tests { } { - let mut wait_time: i64 = 0; + let mut wait_time: u64 = 0; // Request 1: Unary, success response. 
let handler_builder = Box::new(|_, _: &_| { @@ -1569,7 +1569,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_wait_wall_time_ms(), - wait_time - HANDLE_ERROR_MS - COARSE_ERROR_MS + wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1602,7 +1602,7 @@ mod tests { .get_exec_details() .get_time_detail() .get_wait_wall_time_ms(), - wait_time - HANDLE_ERROR_MS - COARSE_ERROR_MS + wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) ); assert_lt!( resp[0] @@ -1632,7 +1632,7 @@ mod tests { .get_exec_details() .get_time_detail() .get_wait_wall_time_ms(), - wait_time - HANDLE_ERROR_MS - COARSE_ERROR_MS + wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) ); assert_lt!( resp[1] diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index df43ad39a69..064073825f4 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -196,9 +196,9 @@ impl Tracker { let mut exec_details = kvrpcpb::ExecDetails::default(); let mut td = kvrpcpb::TimeDetail::default(); - td.set_process_wall_time_ms(time::duration_to_ms(measure) as i64); - td.set_wait_wall_time_ms(time::duration_to_ms(self.wait_time) as i64); - td.set_kv_read_wall_time_ms(self.scan_process_time_ms as i64); + td.set_process_wall_time_ms(time::duration_to_ms(measure)); + td.set_wait_wall_time_ms(time::duration_to_ms(self.wait_time)); + td.set_kv_read_wall_time_ms(self.scan_process_time_ms); exec_details.set_time_detail(td.clone()); let detail = self.total_storage_stats.scan_detail(); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 5b084826861..336580dda58 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1400,10 +1400,9 @@ fn future_get( tracker.write_scan_detail(scan_detail_v2); }); let time_detail = exec_detail_v2.mut_time_detail(); - time_detail.set_kv_read_wall_time_ms(duration_ms as i64); - time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms as 
i64); - time_detail - .set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms as i64); + time_detail.set_kv_read_wall_time_ms(duration_ms); + time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms); + time_detail.set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms); match val { Some(val) => resp.set_value(val), None => resp.set_not_found(true), @@ -1496,10 +1495,9 @@ fn future_batch_get( tracker.write_scan_detail(scan_detail_v2); }); let time_detail = exec_detail_v2.mut_time_detail(); - time_detail.set_kv_read_wall_time_ms(duration_ms as i64); - time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms as i64); - time_detail - .set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms as i64); + time_detail.set_kv_read_wall_time_ms(duration_ms); + time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms); + time_detail.set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms); resp.set_pairs(pairs.into()); } Err(e) => { From 110059e68ae87880b348571ad359a51686396dda Mon Sep 17 00:00:00 2001 From: haojinming Date: Mon, 11 Jul 2022 13:21:05 +0800 Subject: [PATCH 065/676] cdc: Resolved-ts for RawKV (#12866) ref tikv/tikv#11965 Signed-off-by: haojinming Co-authored-by: Ping Yu --- Cargo.lock | 1 + .../backup-stream/src/subscription_track.rs | 2 +- components/causal_ts/src/lib.rs | 15 +- components/causal_ts/src/observer.rs | 39 +- components/cdc/Cargo.toml | 1 + components/cdc/src/delegate.rs | 57 ++- components/cdc/src/endpoint.rs | 449 +++++++++++++++++- components/cdc/src/initializer.rs | 2 +- components/cdc/src/metrics.rs | 7 + components/cdc/src/observer.rs | 48 +- components/resolved_ts/src/endpoint.rs | 2 +- components/resolved_ts/src/resolver.rs | 122 ++++- components/server/src/server.rs | 36 +- components/test_raftstore/src/server.rs | 5 +- src/config.rs | 15 + src/storage/kv/test_engine_builder.rs | 4 +- tests/integrations/config/mod.rs | 1 + 17 files changed, 726 
insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bdb55d28de2..54b315afd36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -758,6 +758,7 @@ version = "0.0.1" dependencies = [ "api_version", "bitflags", + "causal_ts", "collections", "concurrency_manager", "criterion", diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index f3852fe9782..9199f508d62 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -304,7 +304,7 @@ impl TwoPhaseResolver { return min_ts.min(stable_ts); } - self.resolver.resolve(min_ts) + self.resolver.resolve(min_ts).min() } pub fn resolved_ts(&self) -> TimeStamp { diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 3507dc17926..ea5fe3bdcc3 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -15,7 +15,7 @@ mod observer; pub use observer::*; use txn_types::TimeStamp; -use crate::errors::Result; +pub use crate::errors::Result; /// Trait of causal timestamp provider. 
pub trait CausalTsProvider: Send + Sync { @@ -28,6 +28,10 @@ pub trait CausalTsProvider: Send + Sync { } } +pub trait RawTsTracker: Send + Sync + Clone { + fn track_ts(&self, region_id: u64, ts: TimeStamp) -> Result<()>; +} + pub mod tests { use std::sync::{ atomic::{AtomicU64, Ordering}, @@ -55,4 +59,13 @@ pub mod tests { Ok(self.ts.fetch_add(1, Ordering::Relaxed).into()) } } + + #[derive(Clone, Default)] + pub struct DummyRawTsTracker {} + + impl RawTsTracker for DummyRawTsTracker { + fn track_ts(&self, _region_id: u64, _ts: TimeStamp) -> Result<()> { + Ok(()) + } + } } diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs index c89d480eddd..8d2c5abc95c 100644 --- a/components/causal_ts/src/observer.rs +++ b/components/causal_ts/src/observer.rs @@ -18,19 +18,21 @@ use raftstore::{ }, }; -use crate::CausalTsProvider; +use crate::{CausalTsProvider, RawTsTracker}; /// CausalObserver appends timestamp for RawKV V2 data, /// and invoke causal_ts_provider.flush() on specified event, e.g. leader transfer, snapshot apply. /// Should be used ONLY when API v2 is enabled. -pub struct CausalObserver { +pub struct CausalObserver { causal_ts_provider: Arc, + ts_tracker: Tk, } -impl Clone for CausalObserver { +impl Clone for CausalObserver { fn clone(&self) -> Self { Self { causal_ts_provider: self.causal_ts_provider.clone(), + ts_tracker: self.ts_tracker.clone(), } } } @@ -38,9 +40,12 @@ impl Clone for CausalObserver { // Causal observer's priority should be higher than all other observers, to avoid being bypassed. 
const CAUSAL_OBSERVER_PRIORITY: u32 = 0; -impl CausalObserver { - pub fn new(causal_ts_provider: Arc) -> Self { - Self { causal_ts_provider } +impl CausalObserver { + pub fn new(causal_ts_provider: Arc, ts_tracker: Tk) -> Self { + Self { + causal_ts_provider, + ts_tracker, + } } pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { @@ -61,7 +66,7 @@ impl CausalObserver { const REASON_LEADER_TRANSFER: &str = "leader_transfer"; const REASON_REGION_MERGE: &str = "region_merge"; -impl CausalObserver { +impl CausalObserver { fn flush_timestamp(&self, region: &Region, reason: &'static str) { fail::fail_point!("causal_observer_flush_timestamp", |_| ()); @@ -73,9 +78,9 @@ impl CausalObserver { } } -impl Coprocessor for CausalObserver {} +impl Coprocessor for CausalObserver {} -impl QueryObserver for CausalObserver { +impl QueryObserver for CausalObserver { fn pre_propose_query( &self, ctx: &mut ObserverContext<'_>, @@ -92,6 +97,12 @@ impl QueryObserver for CausalObserver { ts = Some(self.causal_ts_provider.get_ts().map_err(|err| { coprocessor::Error::Other(box_err!("Get causal timestamp error: {:?}", err)) })?); + // use prev ts as `resolved_ts` means the data with smaller or equal ts has already sink to cdc. + self.ts_tracker + .track_ts(region_id, ts.unwrap().prev()) + .map_err(|err| { + coprocessor::Error::Other(box_err!("track ts err: {:?}", err)) + })?; } ApiV2::append_ts_on_encoded_bytes(req.mut_put().mut_key(), ts.unwrap()); @@ -102,7 +113,7 @@ impl QueryObserver for CausalObserver { } } -impl RoleObserver for CausalObserver { +impl RoleObserver for CausalObserver { /// Observe becoming leader, to flush CausalTsProvider. 
fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { // In scenario of frequent leader transfer, the observing of change from @@ -119,7 +130,7 @@ impl RoleObserver for CausalObserver { } } -impl RegionChangeObserver for CausalObserver { +impl RegionChangeObserver for CausalObserver { fn on_region_changed( &self, ctx: &mut ObserverContext<'_>, @@ -155,14 +166,14 @@ pub mod tests { use txn_types::{Key, TimeStamp}; use super::*; - use crate::BatchTsoProvider; + use crate::{tests::DummyRawTsTracker, BatchTsoProvider}; - fn init() -> CausalObserver> { + fn init() -> CausalObserver, DummyRawTsTracker> { let pd_cli = Arc::new(TestPdClient::new(0, true)); pd_cli.set_tso(100.into()); let causal_ts_provider = Arc::new(block_on(BatchTsoProvider::new_opt(pd_cli, Duration::ZERO, 100)).unwrap()); - CausalObserver::new(causal_ts_provider) + CausalObserver::new(causal_ts_provider, DummyRawTsTracker::default()) } #[test] diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index f2e2dfd57ce..255ef552c73 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -30,6 +30,7 @@ failpoints = ["tikv/failpoints"] [dependencies] api_version = { path = "../api_version" } bitflags = "1.0" +causal_ts = { path = "../causal_ts" } collections = { path = "../collections" } concurrency_manager = { path = "../concurrency_manager", default-features = false } crossbeam = "0.8" diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 2fb971a4024..55a551490ac 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -28,7 +28,7 @@ use raftstore::{ store::util::compare_region_epoch, Error as RaftStoreError, }; -use resolved_ts::Resolver; +use resolved_ts::{ResolvedTs, Resolver}; use tikv::storage::{txn::TxnEntry, Statistics}; use tikv_util::{debug, info, warn}; use txn_types::{Key, Lock, LockType, TimeStamp, WriteBatchFlags, WriteRef, WriteType}; @@ -225,6 +225,8 @@ impl Drop for Pending { 
enum PendingLock { Track { key: Vec, start_ts: TimeStamp }, Untrack { key: Vec }, + RawTrack { ts: TimeStamp }, + RawUntrack { ts: TimeStamp }, } /// A CDC delegate of a raftstore region peer. @@ -244,7 +246,6 @@ pub struct Delegate { pending: Option, txn_extra_op: Arc>, failed: bool, - has_resolver: bool, } impl Delegate { @@ -259,14 +260,9 @@ impl Delegate { pending: Some(Pending::default()), txn_extra_op, failed: false, - has_resolver: false, } } - pub fn has_resolver(&self) -> bool { - self.has_resolver - } - /// Let downstream subscribe the delegate. /// Return error if subscribe fails and the `Delegate` won't be changed. pub fn subscribe(&mut self, downstream: Downstream) -> Result<()> { @@ -274,9 +270,6 @@ impl Delegate { // Check if the downstream is out dated. self.check_epoch_on_ready(&downstream)?; } - if downstream.kv_api == ChangeDataRequestKvApi::TiDb { - self.has_resolver = true; - } self.add_downstream(downstream); Ok(()) } @@ -401,6 +394,8 @@ impl Delegate { match lock { PendingLock::Track { key, start_ts } => resolver.track_lock(start_ts, key, None), PendingLock::Untrack { key } => resolver.untrack_lock(&key, None), + PendingLock::RawTrack { ts } => resolver.raw_track_lock(ts), + PendingLock::RawUntrack { ts } => resolver.raw_untrack_lock(ts), } } self.resolver = Some(resolver); @@ -416,7 +411,7 @@ impl Delegate { } /// Try advance and broadcast resolved ts. 
- pub fn on_min_ts(&mut self, min_ts: TimeStamp) -> Option { + pub fn on_min_ts(&mut self, min_ts: TimeStamp) -> Option { if self.resolver.is_none() { debug!("cdc region resolver not ready"; "region_id" => self.region_id, "min_ts" => min_ts); @@ -426,9 +421,9 @@ impl Delegate { let resolver = self.resolver.as_mut().unwrap(); let resolved_ts = resolver.resolve(min_ts); debug!("cdc resolved ts updated"; - "region_id" => self.region_id, "resolved_ts" => resolved_ts); + "region_id" => self.region_id, "resolved_ts" => ?resolved_ts); CDC_RESOLVED_TS_GAP_HISTOGRAM - .observe((min_ts.physical() - resolved_ts.physical()) as f64 / 1000f64); + .observe((min_ts.physical() - resolved_ts.min().physical()) as f64 / 1000f64); Some(resolved_ts) } @@ -613,10 +608,42 @@ impl Delegate { rows.push(v); } self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; + self.sink_raw_downstream(raw_rows, index) + } - self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv)?; + fn sink_raw_downstream(&mut self, entries: Vec, index: u64) -> Result<()> { + if entries.is_empty() { + return Ok(()); + } + // the entry's timestamp is non-decreasing, the last has the max ts. 
+ let max_raw_ts = TimeStamp::from(entries.last().unwrap().commit_ts); + match self.resolver { + Some(ref mut resolver) => { + // use prev ts, see reason at CausalObserver::pre_propose_query + resolver.raw_untrack_lock(max_raw_ts.prev()); + } + None => { + assert!(self.pending.is_some(), "region resolver not ready"); + let pending = self.pending.as_mut().unwrap(); + pending + .locks + .push(PendingLock::RawUntrack { ts: max_raw_ts }); + } + } + self.sink_downstream(entries, index, ChangeDataRequestKvApi::RawKv) + } - Ok(()) + pub fn raw_track_ts(&mut self, ts: TimeStamp) { + match self.resolver { + Some(ref mut resolver) => { + resolver.raw_track_lock(ts); + } + None => { + assert!(self.pending.is_some(), "region resolver not ready"); + let pending = self.pending.as_mut().unwrap(); + pending.locks.push(PendingLock::RawTrack { ts }); + } + } } fn sink_downstream( diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 54686424461..7a67c2f9d85 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -17,7 +17,7 @@ use futures::compat::Future01CompatExt; use grpcio::Environment; use kvproto::{ cdcpb::{ - ChangeDataRequest, ChangeDataRequestKvApi, ClusterIdMismatch as ErrorClusterIdMismatch, + ChangeDataRequest, ClusterIdMismatch as ErrorClusterIdMismatch, Compatibility as ErrorCompatibility, DuplicateRequest as ErrorDuplicateRequest, Error as EventError, Event, Event_oneof_event, ResolvedTs, }, @@ -40,7 +40,7 @@ use resolved_ts::Resolver; use security::SecurityManager; use tikv::{config::CdcConfig, storage::Statistics}; use tikv_util::{ - debug, error, impl_display_as_debug, info, + box_err, debug, error, impl_display_as_debug, info, sys::thread::ThreadBuildWrapper, time::Limiter, timer::SteadyTimer, @@ -70,6 +70,8 @@ const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s const WARN_RESOLVED_TS_LAG_THRESHOLD: Duration = Duration::from_secs(600); // Suppress repeat resolved ts lag warning. 
const WARN_RESOLVED_TS_COUNT_THRESHOLD: usize = 10; +// if raw region's count is more than 10, begin detect outlier. +const RAW_RESOLVED_TS_OUTLIER_COUNT_THRESHOLD: usize = 10; pub enum Deregister { Downstream { @@ -170,6 +172,10 @@ pub enum Task { TxnExtra(TxnExtra), Validate(Validate), ChangeConfig(ConfigChange), + RawTrackTs { + region_id: u64, + ts: TimeStamp, + }, } impl_display_as_debug!(Task); @@ -241,11 +247,19 @@ impl fmt::Debug for Task { .field("type", &"change_config") .field("change", change) .finish(), + Task::RawTrackTs { + ref region_id, + ref ts, + } => de + .field("type", &"track_ts") + .field("region_id", ®ion_id) + .field("ts", &ts) + .finish(), } } } -#[derive(PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] struct ResolvedRegion { region_id: u64, resolved_ts: TimeStamp, @@ -315,6 +329,52 @@ impl ResolvedRegionHeap { } } +// need to sort all timestamps, vec.sort() is more efficient. +struct ResolvedRegionVec { + vec: Vec, +} + +impl ResolvedRegionVec { + fn push(&mut self, region_id: u64, resolved_ts: TimeStamp) { + self.vec.push(ResolvedRegion { + region_id, + resolved_ts, + }) + } + // extreme outier match the following two conditions: + // 1. https://en.wikipedia.org/wiki/Box_plot + // 2. the gap with min_ts is larger than raw_min_ts_outlier_threshold. + // return one region at maximum. + fn get_extreme_outlier( + &mut self, + min_ts: TimeStamp, + threshold: Duration, + ) -> Option { + // When the number is small, the confidence of outlier detection is low. 
+ if self.vec.len() > RAW_RESOLVED_TS_OUTLIER_COUNT_THRESHOLD { + self.vec.sort(); + let size = self.vec.len(); + let q1_ts = self.vec[(size + 1) / 4].resolved_ts; + let q3_ts = self.vec[3 * (size + 1) / 4].resolved_ts; + let delta = q3_ts.physical().saturating_sub(q1_ts.physical()); + let first_resolved_region = &self.vec[0]; + if q1_ts + .physical() + .saturating_sub(first_resolved_region.resolved_ts.physical()) + > 3 * delta + && Duration::from_millis( + min_ts + .physical() + .saturating_sub(first_resolved_region.resolved_ts.physical()), + ) > threshold + { + return Some(first_resolved_region.to_owned()); + } + } + None + } +} + pub struct Endpoint { cluster_id: u64, @@ -694,10 +754,6 @@ impl, E: KvEngine> Endpoint { let checkpoint_ts = request.checkpoint_ts; let sched = self.scheduler.clone(); - // Now resolver is only used by tidb downstream. - // Resolver is created when the first tidb cdc request arrive. - let is_build_resolver = kv_api == ChangeDataRequestKvApi::TiDb && !delegate.has_resolver(); - let downstream_ = downstream.clone(); if let Err(err) = delegate.subscribe(downstream) { let error_event = err.into_error_event(region_id); @@ -739,7 +795,7 @@ impl, E: KvEngine> Endpoint { max_scan_batch_size: self.max_scan_batch_size, observe_id, checkpoint_ts: checkpoint_ts.into(), - build_resolver: is_build_resolver, + build_resolver: is_new_delegate, ts_filter_ratio: self.config.incremental_scan_ts_filter_ratio, kv_api, }; @@ -828,10 +884,44 @@ impl, E: KvEngine> Endpoint { } } + // detect outlier raw regions, schedule deregister for outlier raw regions. 
+ fn handle_raw_outlier_regions( + &self, + raw_resolved_regions: &mut ResolvedRegionVec, + min_ts: TimeStamp, + ) { + if let Some(region) = raw_resolved_regions + .get_extreme_outlier(min_ts, self.config.raw_min_ts_outlier_threshold.into()) + { + if let Some(delegate) = self.capture_regions.get(®ion.region_id) { + let observe_id = delegate.handle.id; + let deregister = Deregister::Delegate { + region_id: region.region_id, + observe_id, + err: Error::Other(box_err!("raw region dead lock")), + }; + warn!( + "cdc deregister raw region as resolved_ts has much lag, dead lock may occurs."; + "region_id" => region.region_id, + "resolved_ts" => region.resolved_ts, + ); + if let Err(e) = self.scheduler.schedule(Task::Deregister(deregister)) { + error!("cdc schedule cdc task failed"; "error" => ?e); + } + CDC_RAW_OUTLIER_RESOLVED_TS_GAP.observe( + Duration::from_millis(min_ts.physical() - region.resolved_ts.physical()) + .as_secs_f64(), + ); + } + } + } + fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp) { // Reset resolved_regions to empty. let resolved_regions = &mut self.resolved_region_heap; resolved_regions.clear(); + // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be empty. + let mut raw_resolved_regions = ResolvedRegionVec { vec: vec![] }; let total_region_count = regions.len(); self.min_resolved_ts = TimeStamp::max(); @@ -849,13 +939,18 @@ impl, E: KvEngine> Endpoint { advance_failed_stale += 1; } if let Some(resolved_ts) = delegate.on_min_ts(min_ts) { - if resolved_ts < self.min_resolved_ts { - self.min_resolved_ts = resolved_ts; + if resolved_ts.min() < self.min_resolved_ts { + self.min_resolved_ts = resolved_ts.min(); self.min_ts_region_id = region_id; } - resolved_regions.push(region_id, resolved_ts); + resolved_regions.push(region_id, resolved_ts.min()); + // The judge of raw region is not accuracy here, and we may miss at most one + // "normal" raw region. 
But this will not break the correctness of outlier detection. + if resolved_ts.is_min_ts_from_raw() { + raw_resolved_regions.push(region_id, resolved_ts.raw_ts) + } - if resolved_ts == old_resolved_ts { + if resolved_ts.min() == old_resolved_ts { advance_failed_same += 1; } else { advance_ok += 1; @@ -897,6 +992,9 @@ impl, E: KvEngine> Endpoint { let (normal_min_resolved_ts, normal_regions) = resolved_regions.to_hash_set(); self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); self.broadcast_resolved_ts(normal_min_resolved_ts, normal_regions); + + // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be empty. + self.handle_raw_outlier_regions(&mut raw_resolved_regions, min_ts); } fn broadcast_resolved_ts(&self, min_resolved_ts: TimeStamp, regions: HashSet) { @@ -1013,6 +1111,7 @@ impl, E: KvEngine> Endpoint { let tikv_clients = self.tikv_clients.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; let region_read_progress = self.region_read_progress.clone(); + let observer = self.observer.clone(); let fut = async move { let _ = timeout.compat().await; @@ -1040,6 +1139,13 @@ impl, E: KvEngine> Endpoint { Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), } + // If flush_causal_timestamp fails, cannot schedule MinTS task + // as new coming raw data may use timestamp smaller than min_ts + if let Err(e) = observer.flush_causal_timestamp() { + error!("cdc flush causal timestamp failed"; "err" => ?e); + return; + } + let gate = pd_client.feature_gate(); let regions = @@ -1139,6 +1245,15 @@ impl, E: KvEngine> Endpoint { fn on_open_conn(&mut self, conn: Conn) { self.connections.insert(conn.get_id(), conn); } + + fn on_raw_track_ts(&mut self, region_id: u64, ts: TimeStamp) { + if let Some(ref mut delegate) = self.capture_regions.get_mut(®ion_id) { + delegate.raw_track_ts(ts); + } else { + // delegate should not be none, as region is checked in CdcObserver::track_ts. 
+ warn!("no delegate is found."; "region_id" => region_id); + } + } } impl, E: KvEngine> Runnable for Endpoint { @@ -1210,6 +1325,7 @@ impl, E: KvEngine> Runnable for Endpoint { } }, Task::ChangeConfig(change) => self.on_change_cfg(change), + Task::RawTrackTs { region_id, ts } => self.on_raw_track_ts(region_id, ts), } } } @@ -1272,7 +1388,10 @@ impl TxnExtraScheduler for CdcTxnExtraScheduler { #[cfg(test)] mod tests { - use std::ops::{Deref, DerefMut}; + use std::{ + assert_matches::assert_matches, + ops::{Deref, DerefMut}, + }; use engine_rocks::RocksEngine; use kvproto::{ @@ -1861,6 +1980,252 @@ mod tests { } } + #[test] + fn test_raw_track_ts() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V2); + suite.add_region(1, 100); + let quota = crate::channel::MemoryQuota::new(usize::MAX); + let (tx, _) = channel::channel(1, quota); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + let mut req = ChangeDataRequest::default(); + let region_id = 1; + req.set_region_id(region_id); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::RawKv, + ); + // Enable batch resolved ts in the test. + let version = FeatureGate::batch_resolved_ts(); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + version, + }); + assert_eq!(suite.endpoint.capture_regions.len(), 1); + let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + // Schedule resolver ready (resolver is built by conn a). 
+ let mut region = Region::default(); + region.id = region_id; + region.set_region_epoch(region_epoch); + let resolver = Resolver::new(region_id); + suite.run(Task::ResolverReady { + observe_id, + region, + resolver, + }); + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + let ts = TimeStamp::compose(10, 0); + suite.run(Task::RawTrackTs { region_id, ts }); + let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); + let resolver = delegate.resolver.as_mut().unwrap(); + let raw_resolved_ts = resolver.resolve(TimeStamp::compose(20, 0)).min(); + assert_eq!(raw_resolved_ts, ts); + } + + #[test] + fn test_raw_pending_lock() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V2); + suite.add_region(1, 100); + let quota = crate::channel::MemoryQuota::new(usize::MAX); + let (tx, _) = channel::channel(1, quota); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + let mut req = ChangeDataRequest::default(); + let region_id = 1; + req.set_region_id(region_id); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + 1, + conn_id, + ChangeDataRequestKvApi::RawKv, + ); + // Enable batch resolved ts in the test. 
+ let version = FeatureGate::batch_resolved_ts(); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + version, + }); + assert_eq!(suite.endpoint.capture_regions.len(), 1); + let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + for i in 100..150 { + let ts = TimeStamp::compose(i, 0); + suite.run(Task::RawTrackTs { region_id, ts }); + } + let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); + // region is not ready, so raw lock in resolver, raw ts is added to delegate.pending. + assert_eq!(delegate.resolver.is_none(), true); + // Schedule resolver ready (resolver is built by conn a). + let mut region = Region::default(); + region.id = region_id; + region.set_region_epoch(region_epoch); + let resolver = Resolver::new(region_id); + suite.run(Task::ResolverReady { + observe_id, + region, + resolver, + }); + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + // after region ready, pending locks will be added back to resolver. 
+ let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); + let resolver = delegate.resolver.as_mut().unwrap(); + let raw_resolved_ts = resolver.resolve(TimeStamp::compose(200, 0)).min(); + assert_eq!(raw_resolved_ts, TimeStamp::compose(100, 0)); + } + + #[test] + fn test_raw_dead_lock() { + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(Duration::from_secs(60)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V2); + let quota = crate::channel::MemoryQuota::new(usize::MAX); + let (tx, _) = channel::channel(1, quota); + let mut region_cnt = 0; + let mut start_ts: u64 = 200; + let region_ids: Vec = (1..50).collect(); + let dead_lock_region = 1; + let dead_lock_ts = TimeStamp::compose(1, 0); + let cur_tso = TimeStamp::compose(1000000, 0); + for region_id in region_ids.clone() { + suite.add_region(region_id, 100); + let conn = Conn::new(tx.clone(), String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + let mut req = ChangeDataRequest::default(); + req.set_region_id(region_id); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + region_id, + conn_id, + ChangeDataRequestKvApi::RawKv, + ); + // Enable batch resolved ts in the test. + let version = FeatureGate::batch_resolved_ts(); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + version, + }); + region_cnt += 1; + assert_eq!(suite.endpoint.capture_regions.len(), region_cnt); + let observe_id = suite.endpoint.capture_regions[®ion_id].handle.id; + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + // Schedule resolver ready (resolver is built by conn a). 
+ let mut region = Region::default(); + region.id = region_id; + region.set_region_epoch(region_epoch); + let resolver = Resolver::new(region_id); + suite.run(Task::ResolverReady { + observe_id, + region, + resolver, + }); + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + // let ts is same with region for testing convenience + // let first region has dead lock. + let ts = if region_id == dead_lock_region { + dead_lock_ts + } else { + TimeStamp::compose(start_ts, 0) + }; + start_ts += 1; + suite.run(Task::RawTrackTs { region_id, ts }); + let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); + let resolver = delegate.resolver.as_mut().unwrap(); + let raw_resolved_ts = resolver.resolve(cur_tso).min(); + assert_eq!(raw_resolved_ts, ts); + } + let ob_id = suite + .endpoint + .capture_regions + .get(&dead_lock_region) + .unwrap() + .handle + .id; + suite.run(Task::MinTS { + regions: region_ids, + min_ts: cur_tso, + current_ts: cur_tso, + }); + let task_recv = suite + .task_rx + .recv_timeout(Duration::from_millis(500)) + .unwrap() + .unwrap(); + assert_matches!(task_recv, + Task::Deregister(Deregister::Delegate {region_id, observe_id, ..}) if + region_id == dead_lock_region && observe_id == ob_id); + let gap = Duration::from_millis(cur_tso.physical() - dead_lock_ts.physical()).as_secs_f64(); + assert_eq!(CDC_RAW_OUTLIER_RESOLVED_TS_GAP.get_sample_count(), 1); + assert_eq!(CDC_RAW_OUTLIER_RESOLVED_TS_GAP.get_sample_sum(), gap); + suite.run(task_recv); + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + assert_eq!( + suite + .endpoint + .capture_regions + .get(&dead_lock_region) + .is_none(), + true + ); + } + #[test] fn test_feature_gate() { let cfg = CdcConfig { @@ -2434,4 +2799,62 @@ mod tests { heap1.clear(); assert!(heap1.heap.is_empty()); } + + #[test] + fn test_resolved_region_vec() { + let mut region_vec = ResolvedRegionVec { + vec: Vec::with_capacity(9), + }; + let threshold = 
Duration::from_secs(60); + for i in 0..9 { + region_vec.push(i, TimeStamp::compose(i, 0)); + } + // count is not enough, no outlier. + assert_eq!( + region_vec + .get_extreme_outlier(1.into(), threshold) + .is_none(), + true + ); + let mut region_vec2 = ResolvedRegionVec { + vec: Vec::with_capacity(1002), + }; + for i in 2000..3000 { + region_vec2.push(i, TimeStamp::compose(i, 0)); + } + // count is enough, but no one satisfy the outlier algorithm + // outlier boundary is: 2250 - 3 * 500 = 750 + assert_eq!( + region_vec2 + .get_extreme_outlier(TimeStamp::compose(60_010, 0), threshold) + .is_none(), + true + ); + // count become 1001, boundary: 2249 - 3 * 501 = 746, no outlier + region_vec2.push(747, TimeStamp::compose(747, 0)); + assert_eq!( + region_vec2 + .get_extreme_outlier(TimeStamp::compose(61_000, 0), threshold) + .is_none(), + true + ); + // count become 1002, boundary: 2248 - 3 * 502 = 742, but ts gap is not larger than 60s. + region_vec2.push(741, TimeStamp::compose(741, 0)); + assert_eq!( + region_vec2 + .get_extreme_outlier(TimeStamp::compose(60_741, 0), threshold) + .is_none(), + true + ); + // all conditions are satisfied, return one outlier. 
+ assert_eq!( + region_vec2 + .get_extreme_outlier(TimeStamp::compose(60_742, 0), threshold) + .unwrap(), + ResolvedRegion { + region_id: 741, + resolved_ts: TimeStamp::compose(741, 0) + } + ); + } } diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index a5dcf094acf..e1feb0c9795 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -445,7 +445,7 @@ impl Initializer { fn finish_building_resolver(&self, mut resolver: Resolver, region: Region) { let observe_id = self.observe_id; - let rts = resolver.resolve(TimeStamp::zero()); + let rts = resolver.resolve(TimeStamp::zero()).min(); info!( "cdc resolver initialized and schedule resolver ready"; "region_id" => region.get_id(), diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 0118b4d7916..969e3b371a4 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -205,6 +205,13 @@ lazy_static! { ) .unwrap(); + pub static ref CDC_RAW_OUTLIER_RESOLVED_TS_GAP: Histogram = register_histogram!( + "tikv_cdc_raw_outlier_resolved_ts_gap_seconds", + "Bucketed histogram of the gap between cdc raw outlier resolver_ts and current tso", + exponential_buckets(1.0, 2.0, 15).unwrap() // outlier threshold is 60s by default. 
+ ) + .unwrap(); + pub static ref CDC_ROCKSDB_PERF_COUNTER_STATIC: PerfCounter = auto_flush_from!(CDC_ROCKSDB_PERF_COUNTER, PerfCounter); } diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index cf8503450c5..5779d5f7e06 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -2,6 +2,7 @@ use std::sync::{Arc, RwLock}; +use causal_ts::{CausalTsProvider, Error as CausalTsError, RawTsTracker, Result as CausalTsResult}; use collections::HashMap; use engine_traits::KvEngine; use fail::fail_point; @@ -9,7 +10,8 @@ use kvproto::metapb::{Peer, Region}; use raft::StateRole; use raftstore::{coprocessor::*, store::RegionSnapshot, Error as RaftStoreError}; use tikv::storage::Statistics; -use tikv_util::{error, warn, worker::Scheduler}; +use tikv_util::{box_err, error, warn, worker::Scheduler}; +use txn_types::TimeStamp; use crate::{ endpoint::{Deregister, Task}, @@ -28,6 +30,8 @@ pub struct CdcObserver { // A shared registry for managing observed regions. // TODO: it may become a bottleneck, find a better way to manage the registry. observe_regions: Arc>>, + + pub causal_ts_provider: Option>, } impl CdcObserver { @@ -39,9 +43,14 @@ impl CdcObserver { CdcObserver { sched, observe_regions: Arc::default(), + causal_ts_provider: None, } } + pub fn set_causal_ts_provider(&mut self, provider: Arc) { + self.causal_ts_provider = Some(provider); + } + pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { // use 0 as the priority of the cmd observer. 
CDC should have a higher priority than // the `resolved-ts`'s cmd observer @@ -89,6 +98,12 @@ impl CdcObserver { .get(®ion_id) .cloned() } + + pub fn flush_causal_timestamp(&self) -> CausalTsResult<()> { + self.causal_ts_provider + .as_ref() + .map_or(Ok(()), |provider| provider.flush()) + } } impl Coprocessor for CdcObserver {} @@ -192,6 +207,24 @@ impl RegionChangeObserver for CdcObserver { } } +impl RawTsTracker for CdcObserver { + fn track_ts(&self, region_id: u64, ts: TimeStamp) -> CausalTsResult<()> { + if self.is_subscribed(region_id).is_some() { + self.sched + .schedule(Task::RawTrackTs { region_id, ts }) + .map_err(|err| { + CausalTsError::Other(box_err!( + "sched raw track ts err: {:?}, region: {:?}, ts: {:?}", + err, + region_id, + ts + )) + })?; + } + Ok(()) + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -318,6 +351,19 @@ mod tests { observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); + // track for unregistered region id. + observer.track_ts(2, 10.into()).unwrap(); + // no event for unregistered region id. + rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); + observer.track_ts(1, 10.into()).unwrap(); + match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { + Task::RawTrackTs { region_id, ts } => { + assert_eq!(region_id, 1); + assert_eq!(ts, 10.into()); + } + _ => panic!("unexpected task"), + }; + // unsubscribed fail if observer id is different. 
assert_eq!(observer.unsubscribe_region(1, ObserveID::new()), None); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index bf4f9ba881e..90e3a3b7912 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -491,7 +491,7 @@ where for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let resolved_ts = observe_region.resolver.resolve(ts); + let resolved_ts = observe_region.resolver.resolve(ts).min(); if resolved_ts < min_ts { min_ts = resolved_ts; } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 1669a0e8b65..12c7cbe0c56 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -1,6 +1,11 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp, collections::BTreeMap, sync::Arc}; +use std::{ + cmp, + cmp::Reverse, + collections::{BTreeMap, BinaryHeap}, + sync::Arc, +}; use collections::{HashMap, HashSet}; use raftstore::store::RegionReadProgress; @@ -8,6 +13,28 @@ use txn_types::TimeStamp; use crate::metrics::RTS_RESOLVED_FAIL_ADVANCE_VEC; +#[derive(Debug, Clone, Copy)] +pub struct ResolvedTs { + pub raw_ts: TimeStamp, + pub txn_ts: TimeStamp, +} + +impl ResolvedTs { + pub fn default() -> ResolvedTs { + ResolvedTs { + raw_ts: TimeStamp::zero(), + txn_ts: TimeStamp::zero(), + } + } + pub fn min(&self) -> TimeStamp { + cmp::min(self.raw_ts, self.txn_ts) + } + + pub fn is_min_ts_from_raw(&self) -> bool { + self.raw_ts < self.txn_ts + } +} + // Resolver resolves timestamps that guarantee no more commit will happen before // the timestamp. pub struct Resolver { @@ -16,8 +43,11 @@ pub struct Resolver { locks_by_key: HashMap, TimeStamp>, // start_ts -> locked keys. 
lock_ts_heap: BTreeMap>>, + // raw ts, depend on "non-decreasing" of entries' timestamp in the same region. + // BinaryHeap is max heap, so reverse order to get a min heap. Only used in rawkv. + raw_lock_ts_heap: BinaryHeap>, // The timestamps that guarantees no more commit will happen before. - resolved_ts: TimeStamp, + resolved_ts: ResolvedTs, // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request @@ -60,9 +90,10 @@ impl Resolver { ) -> Resolver { Resolver { region_id, - resolved_ts: TimeStamp::zero(), + resolved_ts: ResolvedTs::default(), locks_by_key: HashMap::default(), lock_ts_heap: BTreeMap::new(), + raw_lock_ts_heap: BinaryHeap::new(), read_progress, tracked_index: 0, min_ts: TimeStamp::zero(), @@ -71,7 +102,7 @@ impl Resolver { } pub fn resolved_ts(&self) -> TimeStamp { - self.resolved_ts + self.resolved_ts.min() } pub fn size(&self) -> usize { @@ -145,11 +176,27 @@ impl Resolver { } } + pub fn raw_track_lock(&mut self, ts: TimeStamp) { + debug!("raw track ts {}, region {}", ts, self.region_id); + self.raw_lock_ts_heap.push(Reverse(ts)); + } + + // untrack all timestamps smaller than input ts, depend on the raw ts in one region is non-decreasing + pub fn raw_untrack_lock(&mut self, ts: TimeStamp) { + debug!("raw untrack ts before {}, region {}", ts, self.region_id); + while let Some(&Reverse(min_ts)) = self.raw_lock_ts_heap.peek() { + if min_ts > ts { + break; + } + self.raw_lock_ts_heap.pop(); + } + } + /// Try to advance resolved ts. /// /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. 
- pub fn resolve(&mut self, min_ts: TimeStamp) -> TimeStamp { + pub fn resolve(&mut self, min_ts: TimeStamp) -> ResolvedTs { // The `Resolver` is stopped, not need to advance, just return the current `resolved_ts` if self.stopped { return self.resolved_ts; @@ -160,9 +207,8 @@ impl Resolver { let min_start_ts = min_lock.unwrap_or(min_ts); // No more commit happens before the ts. - let new_resolved_ts = cmp::min(min_start_ts, min_ts); - - if self.resolved_ts >= new_resolved_ts { + let new_txn_resolved_ts = cmp::min(min_start_ts, min_ts); + if self.resolved_ts.txn_ts >= new_txn_resolved_ts { let label = if has_lock { "has_lock" } else { "stale_ts" }; RTS_RESOLVED_FAIL_ADVANCE_VEC .with_label_values(&[label]) @@ -170,18 +216,25 @@ impl Resolver { } // Resolved ts never decrease. - self.resolved_ts = cmp::max(self.resolved_ts, new_resolved_ts); + self.resolved_ts.txn_ts = cmp::max(self.resolved_ts.txn_ts, new_txn_resolved_ts); // Publish an `(apply index, safe ts)` item into the region read progress if let Some(rrp) = &self.read_progress { - rrp.update_safe_ts(self.tracked_index, self.resolved_ts.into_inner()); + rrp.update_safe_ts(self.tracked_index, self.resolved_ts.txn_ts.into_inner()); } + let min_raw_ts = self + .raw_lock_ts_heap + .peek() + .map_or(min_ts, |ts| ts.to_owned().0); + // Resolved ts never decrease. + self.resolved_ts.raw_ts = cmp::max(self.resolved_ts.raw_ts, min_raw_ts); + let new_min_ts = if has_lock { // If there are some lock, the min_ts must be smaller than // the min start ts, so it guarantees to be smaller than // any late arriving commit ts. 
- new_resolved_ts // cmp::min(min_start_ts, min_ts) + new_txn_resolved_ts // cmp::min(min_start_ts, min_ts) } else { min_ts }; @@ -204,6 +257,10 @@ mod tests { Lock(u64, Key), // key Unlock(Key), + // raw ts + RawLock(u64), + // raw ts + RawUnlock(u64), // min_ts, expect Resolve(u64, u64), } @@ -257,6 +314,40 @@ mod tests { Event::Unlock(Key::from_raw(b"b")), Event::Unlock(Key::from_raw(b"a")), ], + // raw track lock + vec![Event::RawLock(1), Event::Resolve(2, 1)], + vec![Event::RawLock(1), Event::RawUnlock(1), Event::Resolve(2, 2)], + vec![Event::RawLock(1), Event::RawUnlock(2), Event::Resolve(5, 5)], + vec![ + Event::RawLock(1), + Event::RawUnlock(2), + Event::RawLock(3), + Event::Resolve(5, 3), + ], + vec![ + Event::RawLock(1), + Event::RawUnlock(2), + Event::RawLock(3), + Event::RawLock(4), + Event::Resolve(5, 3), + ], + // raw and txn mixed + vec![ + Event::Lock(1, Key::from_raw(b"a")), + Event::RawLock(2), + Event::RawUnlock(3), + Event::Resolve(5, 1), + Event::Unlock(Key::from_raw(b"a")), + Event::Resolve(6, 6), + ], + vec![ + Event::Lock(1, Key::from_raw(b"a")), + Event::RawLock(2), + Event::RawLock(3), + Event::Resolve(5, 1), + Event::Unlock(Key::from_raw(b"a")), + Event::Resolve(6, 2), + ], ]; for (i, case) in cases.into_iter().enumerate() { @@ -267,8 +358,15 @@ mod tests { resolver.track_lock(start_ts.into(), key.into_raw().unwrap(), None) } Event::Unlock(key) => resolver.untrack_lock(&key.into_raw().unwrap(), None), + Event::RawLock(ts) => resolver.raw_track_lock(ts.into()), + Event::RawUnlock(ts) => resolver.raw_untrack_lock(ts.into()), Event::Resolve(min_ts, expect) => { - assert_eq!(resolver.resolve(min_ts.into()), expect.into(), "case {}", i) + assert_eq!( + resolver.resolve(min_ts.into()).min(), + expect.into(), + "case {}", + i + ) } } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 351015fdd9a..51a21b91628 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -774,25 +774,8 @@ 
impl TiKvServer { unified_read_pool_scale_receiver = Some(rx); } - // Register causal observer for RawKV API V2 - if let ApiVersion::V2 = F::TAG { - let tso = block_on(causal_ts::BatchTsoProvider::new_opt( - self.pd_client.clone(), - self.config.causal_ts.renew_interval.0, - self.config.causal_ts.renew_batch_min_size, - )); - if let Err(e) = tso { - panic!("Causal timestamp provider initialize failed: {:?}", e); - } - let causal_ts_provider = Arc::new(tso.unwrap()); - info!("Causal timestamp provider startup."); - - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); - causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - } - // Register cdc. - let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); + let mut cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. cfg_controller.register( @@ -818,6 +801,23 @@ impl TiKvServer { None }; + // Register causal observer for RawKV API V2 + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + self.pd_client.clone(), + self.config.causal_ts.renew_interval.0, + self.config.causal_ts.renew_batch_min_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + let causal_ts_provider = Arc::new(tso.unwrap()); + info!("Causal timestamp provider startup."); + cdc_ob.set_causal_ts_provider(causal_ts_provider.clone()); + let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider, cdc_ob.clone()); + causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + } + let check_leader_runner = CheckLeaderRunner::new(engines.store_meta.clone()); let check_leader_scheduler = self .background_worker diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index b87cc5257a5..5d85fff86bc 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs 
@@ -9,7 +9,7 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; -use causal_ts::CausalTsProvider; +use causal_ts::{tests::DummyRawTsTracker, CausalTsProvider}; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; @@ -365,7 +365,8 @@ impl ServerCluster { ); self.causal_ts_providers .insert(node_id, causal_ts_provider.clone()); - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); + let causal_ob = + causal_ts::CausalObserver::new(causal_ts_provider, DummyRawTsTracker::default()); causal_ob.register_to(&mut coprocessor_host); } diff --git a/src/config.rs b/src/config.rs index fc6cde09e1c..7dfbe1b0933 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2508,6 +2508,11 @@ pub struct CdcConfig { pub sink_memory_quota: ReadableSize, pub old_value_cache_memory_quota: ReadableSize, + + /// Threshold of raw regions' resolved_ts outlier detection. 60s by default. + #[online_config(skip)] + #[doc(hidden)] + pub raw_min_ts_outlier_threshold: ReadableDuration, // Deprecated! preserved for compatibility check. #[online_config(skip)] #[doc(hidden)] @@ -2533,6 +2538,8 @@ impl Default for CdcConfig { sink_memory_quota: ReadableSize::mb(512), // 512MB memory for old value cache. old_value_cache_memory_quota: ReadableSize::mb(512), + // Trigger raw region outlier judgement if resolved_ts's lag is over 60s. + raw_min_ts_outlier_threshold: ReadableDuration::secs(60), // Deprecated! preserved for compatibility check. 
old_value_cache_size: 0, } @@ -2574,6 +2581,14 @@ impl CdcConfig { ); self.incremental_scan_ts_filter_ratio = default_cfg.incremental_scan_ts_filter_ratio; } + if self.raw_min_ts_outlier_threshold.is_zero() { + warn!( + "cdc.raw_min_ts_outlier_threshold should be larger than 0, + change it to {}", + default_cfg.raw_min_ts_outlier_threshold + ); + self.raw_min_ts_outlier_threshold = default_cfg.raw_min_ts_outlier_threshold; + } Ok(()) } } diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index bb6f38d9d6b..94d750a20f7 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -5,6 +5,7 @@ use std::{ sync::Arc, }; +use causal_ts::tests::DummyRawTsTracker; use engine_rocks::{raw::ColumnFamilyOptions, raw_util::CFOptions}; use engine_traits::{CfName, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use file_system::IORateLimiter; @@ -71,7 +72,8 @@ impl TestEngineBuilder { // Consider decoupling them. fn register_causal_observer(engine: &mut RocksEngine) { let causal_ts_provider = Arc::new(causal_ts::tests::TestProvider::default()); - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); + let causal_ob = + causal_ts::CausalObserver::new(causal_ts_provider, DummyRawTsTracker::default()); engine.register_observer(|host| { causal_ob.register_to(host); }); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 2428d265391..54a596a50a2 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -765,6 +765,7 @@ fn test_serde_custom_tikv_config() { tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), + raw_min_ts_outlier_threshold: ReadableDuration::secs(60), }; value.resolved_ts = ResolvedTsConfig { enable: true, From 88b659775dcbc3dd2d3a12fd836af75b3b423e84 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Mon, 11 Jul 
2022 14:21:05 +0800 Subject: [PATCH 066/676] log-backup: support the new feature PiTR- backup/restore log at the tikv endpoint (#12976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#12895 Signed-off-by: Yu Juncen Signed-off-by: joccau Signed-off-by: 3pointer Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: 3pointer Co-authored-by: Ti Chi Robot Co-authored-by: kennytm --- Cargo.lock | 1 + components/backup-stream/Cargo.toml | 4 +- .../backup-stream/src/checkpoint_manager.rs | 439 ++++++++++ components/backup-stream/src/endpoint.rs | 796 ++++++++---------- components/backup-stream/src/errors.rs | 5 +- components/backup-stream/src/event_loader.rs | 84 +- components/backup-stream/src/lib.rs | 10 +- .../backup-stream/src/metadata/client.rs | 381 ++++++++- components/backup-stream/src/metadata/keys.rs | 38 +- .../backup-stream/src/metadata/metrics.rs | 6 + components/backup-stream/src/metadata/mod.rs | 2 +- .../backup-stream/src/metadata/store/etcd.rs | 161 +++- .../src/metadata/store/lazy_etcd.rs | 4 + .../backup-stream/src/metadata/store/mod.rs | 68 +- .../src/metadata/store/slash_etc.rs | 172 +++- components/backup-stream/src/metadata/test.rs | 114 ++- components/backup-stream/src/metrics.rs | 17 +- components/backup-stream/src/observer.rs | 4 +- components/backup-stream/src/router.rs | 250 +++++- components/backup-stream/src/service.rs | 92 ++ .../backup-stream/src/subscription_manager.rs | 650 ++++++++++++++ .../backup-stream/src/subscription_track.rs | 257 ++++-- components/backup-stream/src/utils.rs | 269 +++++- components/backup-stream/tests/mod.rs | 254 +++++- components/server/src/server.rs | 25 +- src/config.rs | 6 + src/import/sst_service.rs | 4 +- 27 files changed, 3383 insertions(+), 730 deletions(-) create mode 100644 components/backup-stream/src/checkpoint_manager.rs create mode 100644 components/backup-stream/src/service.rs create mode 100644 
components/backup-stream/src/subscription_manager.rs diff --git a/Cargo.lock b/Cargo.lock index 54b315afd36..2cab9eb4b2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -475,6 +475,7 @@ dependencies = [ "slog", "slog-global", "tempdir", + "tempfile", "test_raftstore", "test_util", "thiserror", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index f14c0aa3c39..9e8049e0ec0 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -33,9 +33,10 @@ etcd-client = { version = "0.7", features = ["pub-response-field", "tls"] } external_storage = { path = "../external_storage", default-features = false } external_storage_export = { path = "../external_storage/export", default-features = false } fail = { version = "0.5", optional = true } - file_system = { path = "../file_system" } futures = "0.3" + +grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } hex = "0.4" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.4" @@ -71,6 +72,7 @@ grpcio = { version = "0.10", default-features = false, features = ["openssl-vend hex = "0.4" rand = "0.8.0" tempdir = "0.3" +tempfile = "3.0" test_raftstore = { path = "../test_raftstore", default-features = false } test_util = { path = "../test_util", default-features = false } url = "2" diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs new file mode 100644 index 00000000000..96e330f956d --- /dev/null +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -0,0 +1,439 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::HashMap, sync::Arc, time::Duration}; + +use kvproto::{ + errorpb::{Error as PbError, *}, + metapb::Region, +}; +use pd_client::PdClient; +use tikv_util::{info, worker::Scheduler}; +use txn_types::TimeStamp; + +use crate::{ + errors::{ContextualResultExt, Error, Result}, + metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, + metrics, + subscription_track::SubscriptionTracer, + try_send, RegionCheckpointOperation, Task, +}; + +/// A manager for maintaining the last flush ts. +/// This information is provided for the `advancer` in checkpoint V3, +/// which involved a central node (typically TiDB) for collecting all regions' checkpoint +/// then advancing the global checkpoint. +#[derive(Debug, Default)] +pub struct CheckpointManager { + items: HashMap, +} + +/// The result of getting a checkpoint. +/// The possibility of failed to getting checkpoint is pretty high: +/// because there is a gap between region leader change and flushing. +#[derive(Debug)] +pub enum GetCheckpointResult { + Ok { + region: Region, + checkpoint: TimeStamp, + }, + NotFound { + id: RegionIdWithVersion, + err: PbError, + }, + EpochNotMatch { + region: Region, + err: PbError, + }, +} + +impl GetCheckpointResult { + /// create an "ok" variant with region. + pub fn ok(region: Region, checkpoint: TimeStamp) -> Self { + Self::Ok { region, checkpoint } + } + + fn not_found(id: RegionIdWithVersion) -> Self { + Self::NotFound { + id, + err: not_leader(id.region_id), + } + } + + /// create a epoch not match variant with region + fn epoch_not_match(provided: RegionIdWithVersion, real: &Region) -> Self { + Self::EpochNotMatch { + region: real.clone(), + err: epoch_not_match( + provided.region_id, + provided.region_epoch_version, + real.get_region_epoch().get_version(), + ), + } + } +} + +impl CheckpointManager { + /// clear the manager. + pub fn clear(&mut self) { + self.items.clear(); + } + + /// update a region checkpoint in need. 
+ pub fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { + let e = self.items.entry(region.get_id()); + e.and_modify(|old_cp| { + if old_cp.checkpoint < checkpoint + && old_cp.region.get_region_epoch().get_version() + <= region.get_region_epoch().get_version() + { + *old_cp = LastFlushTsOfRegion { + checkpoint, + region: region.clone(), + }; + } + }) + .or_insert_with(|| LastFlushTsOfRegion { + checkpoint, + region: region.clone(), + }); + } + + /// get checkpoint from a region. + pub fn get_from_region(&self, region: RegionIdWithVersion) -> GetCheckpointResult { + let checkpoint = self.items.get(®ion.region_id); + if checkpoint.is_none() { + return GetCheckpointResult::not_found(region); + } + let checkpoint = checkpoint.unwrap(); + if checkpoint.region.get_region_epoch().get_version() != region.region_epoch_version { + return GetCheckpointResult::epoch_not_match(region, &checkpoint.region); + } + GetCheckpointResult::ok(checkpoint.region.clone(), checkpoint.checkpoint) + } + + /// get all checkpoints stored. + pub fn get_all(&self) -> Vec { + self.items.values().cloned().collect() + } +} + +fn not_leader(r: u64) -> PbError { + let mut err = PbError::new(); + let mut nl = NotLeader::new(); + nl.set_region_id(r); + err.set_not_leader(nl); + err.set_message( + format!("the region {} isn't in the region_manager of log backup, maybe not leader or not flushed yet.", r)); + err +} + +fn epoch_not_match(id: u64, sent: u64, real: u64) -> PbError { + let mut err = PbError::new(); + let en = EpochNotMatch::new(); + err.set_epoch_not_match(en); + err.set_message(format!( + "the region {} has recorded version {}, but you sent {}", + id, real, sent, + )); + err +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)] +/// A simple region id, but versioned. 
+pub struct RegionIdWithVersion { + pub region_id: u64, + pub region_epoch_version: u64, +} + +impl RegionIdWithVersion { + pub fn new(id: u64, version: u64) -> Self { + Self { + region_id: id, + region_epoch_version: version, + } + } +} + +#[derive(Debug, Clone)] +pub struct LastFlushTsOfRegion { + pub region: Region, + pub checkpoint: TimeStamp, +} + +// Allow some type to +#[async_trait::async_trait] +pub trait FlushObserver: Send + 'static { + /// The callback when the flush has advanced the resolver. + async fn before(&mut self, checkpoints: Vec<(Region, TimeStamp)>); + /// The callback when the flush is done. (Files are fully written to external storage.) + async fn after(&mut self, task: &str, rts: u64) -> Result<()>; + /// The optional callback to rewrite the resolved ts of this flush. + /// Because the default method (collect all leader resolved ts in the store, and use the minimal TS.) + /// may lead to resolved ts rolling back, if we desire a stronger consistency, we can rewrite a safer resolved ts here. + /// Note the new resolved ts cannot be greater than the old resolved ts. + async fn rewrite_resolved_ts( + &mut self, + #[allow(unused_variables)] task: &str, + ) -> Option { + None + } +} + +pub struct BasicFlushObserver { + pd_cli: Arc, + store_id: u64, +} + +impl BasicFlushObserver { + pub fn new(pd_cli: Arc, store_id: u64) -> Self { + Self { pd_cli, store_id } + } +} + +#[async_trait::async_trait] +impl FlushObserver for BasicFlushObserver { + async fn before(&mut self, _checkpoints: Vec<(Region, TimeStamp)>) {} + + async fn after(&mut self, task: &str, rts: u64) -> Result<()> { + if let Err(err) = self + .pd_cli + .update_service_safe_point( + format!("backup-stream-{}-{}", task, self.store_id), + TimeStamp::new(rts), + // Add a service safe point for 30 mins (6x the default flush interval). + // It would probably be safe. 
+ Duration::from_secs(1800), + ) + .await + { + Error::from(err).report("failed to update service safe point!"); + // don't give up? + } + + // Currently, we only support one task at the same time, + // so use the task as label would be ok. + metrics::STORE_CHECKPOINT_TS + .with_label_values(&[task]) + .set(rts as _); + Ok(()) + } +} + +pub struct CheckpointV2FlushObserver { + resolvers: SubscriptionTracer, + meta_cli: MetadataClient, + + fresh_regions: Vec, + checkpoints: Vec<(Region, TimeStamp)>, + can_advance: Option, + base: O, +} + +impl CheckpointV2FlushObserver { + pub fn new( + meta_cli: MetadataClient, + can_advance: F, + resolvers: SubscriptionTracer, + base: O, + ) -> Self { + Self { + resolvers, + meta_cli, + fresh_regions: vec![], + checkpoints: vec![], + can_advance: Some(can_advance), + base, + } + } +} + +#[async_trait::async_trait] +impl FlushObserver for CheckpointV2FlushObserver +where + S: MetaStore + 'static, + F: FnOnce() -> bool + Send + 'static, + O: FlushObserver, +{ + async fn before(&mut self, _checkpoints: Vec<(Region, TimeStamp)>) { + let fresh_regions = self.resolvers.collect_fresh_subs(); + let removal = self.resolvers.collect_removal_subs(); + let checkpoints = removal + .into_iter() + .map(|sub| (sub.meta, sub.resolver.resolved_ts())) + .collect::>(); + self.checkpoints = checkpoints; + self.fresh_regions = fresh_regions; + } + + async fn after(&mut self, task: &str, rts: u64) -> Result<()> { + if !self.can_advance.take().map(|f| f()).unwrap_or(true) { + let cp_now = self + .meta_cli + .get_local_task_checkpoint(task) + .await + .context(format_args!( + "during checking whether we should skip advancing ts to {}.", + rts + ))?; + // if we need to roll back checkpoint ts, don't prevent it. + if rts >= cp_now.into_inner() { + info!("skipping advance checkpoint."; "rts" => %rts, "old_rts" => %cp_now); + return Ok(()); + } + } + // Optionally upload the region checkpoint. 
+ // Unless in some extreme condition, skipping upload the region checkpoint won't lead to data loss. + if let Err(err) = self + .meta_cli + .upload_region_checkpoint(task, &self.checkpoints) + .await + { + err.report("failed to upload region checkpoint"); + } + // we can advance the progress at next time. + // return early so we won't be mislead by the metrics. + self.meta_cli + .set_local_task_checkpoint(task, rts) + .await + .context(format_args!("on flushing task {}", task))?; + self.base.after(task, rts).await?; + self.meta_cli + .clear_region_checkpoint(task, &self.fresh_regions) + .await + .context(format_args!("on clearing the checkpoint for task {}", task))?; + Ok(()) + } +} + +pub struct CheckpointV3FlushObserver { + /// We should modify the rts (the local rts isn't right.) + /// This should be a BasicFlushObserver or something likewise. + baseline: O, + sched: Scheduler, + meta_cli: MetadataClient, + subs: SubscriptionTracer, + + checkpoints: Vec<(Region, TimeStamp)>, + global_checkpoint_cache: HashMap, +} + +impl CheckpointV3FlushObserver { + pub fn new( + sched: Scheduler, + meta_cli: MetadataClient, + subs: SubscriptionTracer, + baseline: O, + ) -> Self { + Self { + sched, + meta_cli, + checkpoints: vec![], + // We almost always have only one entry. 
+ global_checkpoint_cache: HashMap::with_capacity(1), + subs, + baseline, + } + } +} + +impl CheckpointV3FlushObserver +where + S: MetaStore + 'static, + O: FlushObserver + Send, +{ + async fn get_checkpoint(&mut self, task: &str) -> Result { + let cp = match self.global_checkpoint_cache.get(task) { + Some(cp) => *cp, + None => { + let global_checkpoint = self.meta_cli.global_checkpoint_of_task(task).await?; + self.global_checkpoint_cache + .insert(task.to_owned(), global_checkpoint); + global_checkpoint + } + }; + Ok(cp) + } +} + +#[async_trait::async_trait] +impl FlushObserver for CheckpointV3FlushObserver +where + S: MetaStore + 'static, + O: FlushObserver + Send, +{ + async fn before(&mut self, checkpoints: Vec<(Region, TimeStamp)>) { + self.checkpoints = checkpoints; + } + + async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { + self.subs.update_status_for_v3(); + let t = Task::RegionCheckpointsOp(RegionCheckpointOperation::Update(std::mem::take( + &mut self.checkpoints, + ))); + try_send!(self.sched, t); + let global_checkpoint = self.get_checkpoint(task).await?; + info!("getting global checkpoint from cache for updating."; "checkpoint" => ?global_checkpoint); + self.baseline + .after(task, global_checkpoint.ts.into_inner()) + .await?; + Ok(()) + } + + async fn rewrite_resolved_ts(&mut self, task: &str) -> Option { + let global_checkpoint = self + .get_checkpoint(task) + .await + .map_err(|err| err.report("failed to get resolved ts for rewriting")) + .ok()?; + info!("getting global checkpoint for updating."; "checkpoint" => ?global_checkpoint); + matches!(global_checkpoint.provider, CheckpointProvider::Global) + .then(|| global_checkpoint.ts) + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches; + + use kvproto::metapb::*; + use txn_types::TimeStamp; + + use super::RegionIdWithVersion; + use crate::GetCheckpointResult; + + fn region(id: u64, version: u64, conf_version: u64) -> Region { + let mut r = Region::new(); + let mut e = 
RegionEpoch::new(); + e.set_version(version); + e.set_conf_ver(conf_version); + r.set_id(id); + r.set_region_epoch(e); + r + } + + #[test] + fn test_mgr() { + let mut mgr = super::CheckpointManager::default(); + mgr.update_region_checkpoint(®ion(1, 32, 8), TimeStamp::new(8)); + mgr.update_region_checkpoint(®ion(2, 34, 8), TimeStamp::new(15)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 33)); + assert_matches::assert_matches!(r, GetCheckpointResult::EpochNotMatch { .. }); + let r = mgr.get_from_region(RegionIdWithVersion::new(3, 44)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + mgr.update_region_checkpoint(®ion(1, 30, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + + mgr.update_region_checkpoint(®ion(1, 30, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); + mgr.update_region_checkpoint(®ion(1, 32, 8), TimeStamp::new(16)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 16); + mgr.update_region_checkpoint(®ion(1, 33, 8), TimeStamp::new(24)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 33)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); + } +} diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index a89d5a66da4..490e0b48e8d 100644 --- a/components/backup-stream/src/endpoint.rs +++ 
b/components/backup-stream/src/endpoint.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + collections::HashSet, fmt, marker::PhantomData, path::PathBuf, @@ -18,11 +19,9 @@ use kvproto::{ }; use online_config::ConfigChange; use pd_client::PdClient; -use raft::StateRole; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, router::RaftStoreRouter, - store::fsm::ChangeObserver, }; use tikv::config::BackupStreamConfig; use tikv_util::{ @@ -30,7 +29,7 @@ use tikv_util::{ config::ReadableDuration, debug, defer, info, sys::thread::ThreadBuildWrapper, - time::Instant, + time::{Instant, Limiter}, warn, worker::{Runnable, Scheduler}, HandyRwLock, @@ -38,42 +37,58 @@ use tikv_util::{ use tokio::{ io::Result as TokioResult, runtime::{Handle, Runtime}, + sync::oneshot, }; use tokio_stream::StreamExt; use txn_types::TimeStamp; -use yatp::task::callback::Handle as YatpHandle; use super::metrics::HANDLE_EVENT_DURATION_HISTOGRAM; use crate::{ annotate, + checkpoint_manager::{ + BasicFlushObserver, CheckpointManager, CheckpointV2FlushObserver, + CheckpointV3FlushObserver, FlushObserver, GetCheckpointResult, RegionIdWithVersion, + }, errors::{Error, Result}, event_loader::{InitialDataLoader, PendingMemoryQuota}, + future, metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, observer::BackupStreamObserver, - router::{ApplyEvents, Router, FLUSH_STORAGE_INTERVAL}, + router::{ApplyEvents, Router}, + subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, subscription_track::SubscriptionTracer, try_send, - utils::{self, StopWatch}, + utils::{self, CallbackWaitGroup, StopWatch, Work}, }; const SLOW_EVENT_THRESHOLD: f64 = 120.0; +/// CHECKPOINT_SAFEPOINT_TTL_IF_ERROR specifies the safe point TTL(24 hour) if task has fatal error. 
+const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; pub struct Endpoint { - meta_client: MetadataClient, + // Note: those fields are more like a shared context between components. + // For now, we copied them everywhere, maybe we'd better extract them into a + // context type. + pub(crate) meta_client: MetadataClient, + pub(crate) scheduler: Scheduler, + pub(crate) store_id: u64, + pub(crate) regions: R, + pub(crate) engine: PhantomData, + pub(crate) router: RT, + pub(crate) pd_client: Arc, + pub(crate) subs: SubscriptionTracer, + pub(crate) concurrency_manager: ConcurrencyManager, + range_router: Router, - scheduler: Scheduler, observer: BackupStreamObserver, pool: Runtime, - store_id: u64, - regions: R, - engine: PhantomData, - router: RT, - pd_client: Arc, - subs: SubscriptionTracer, - concurrency_manager: ConcurrencyManager, initial_scan_memory_quota: PendingMemoryQuota, - scan_pool: ScanPool, + initial_scan_throughput_quota: Limiter, + region_operator: RegionSubscriptionManager, + failover_time: Option, + config: BackupStreamConfig, + checkpoint_mgr: CheckpointManager, } impl Endpoint @@ -98,7 +113,6 @@ where crate::metrics::STREAM_ENABLED.inc(); let pool = create_tokio_runtime(config.io_threads, "backup-stream") .expect("failed to create tokio runtime for backup stream worker."); - let scan_pool = create_scan_pool(config.num_threads); let meta_client = MetadataClient::new(store, store_id); let range_router = Router::new( @@ -123,7 +137,31 @@ where let initial_scan_memory_quota = PendingMemoryQuota::new(config.initial_scan_pending_memory_quota.0 as _); + let limit = if config.initial_scan_rate_limit.0 > 0 { + config.initial_scan_rate_limit.0 as f64 + } else { + f64::INFINITY + }; + let initial_scan_throughput_quota = Limiter::new(limit); info!("the endpoint of stream backup started"; "path" => %config.temp_path); + let subs = SubscriptionTracer::default(); + let (region_operator, op_loop) = RegionSubscriptionManager::start( + InitialDataLoader::new( + 
router.clone(), + accessor.clone(), + range_router.clone(), + subs.clone(), + scheduler.clone(), + initial_scan_memory_quota.clone(), + pool.handle().clone(), + initial_scan_throughput_quota.clone(), + ), + observer.clone(), + meta_client.clone(), + pd_client.clone(), + config.num_threads, + ); + pool.spawn(op_loop); Endpoint { meta_client, range_router, @@ -135,10 +173,14 @@ where engine: PhantomData, router, pd_client, - subs: Default::default(), + subs, concurrency_manager, initial_scan_memory_quota, - scan_pool, + initial_scan_throughput_quota, + region_operator, + failover_time: None, + config, + checkpoint_mgr: Default::default(), } } } @@ -172,7 +214,7 @@ where let safepoint = meta_cli.global_progress_of_task(&task).await?; pdc.update_service_safe_point( safepoint_name, - TimeStamp::new(safepoint), + TimeStamp::new(safepoint - 1), safepoint_ttl, ) .await?; @@ -198,9 +240,9 @@ where async fn starts_flush_ticks(router: Router) { loop { - // check every 15s. + // check every 5s. // TODO: maybe use global timer handle in the `tikv_utils::timer` (instead of enabling timing in the current runtime)? - tokio::time::sleep(Duration::from_secs(FLUSH_STORAGE_INTERVAL / 20)).await; + tokio::time::sleep(Duration::from_secs(5)).await; debug!("backup stream trigger flush tick"); router.tick().await; } @@ -217,6 +259,8 @@ where if task.is_paused { continue; } + // We have meet task upon store start, we must in a failover. 
+ scheduler.schedule(Task::MarkFailover(Instant::now()))?; // move task to schedule scheduler.schedule(Task::WatchTask(TaskOp::AddTask(task)))?; } @@ -338,6 +382,25 @@ where } } + fn flush_observer(&self) -> Box { + let basic = BasicFlushObserver::new(self.pd_client.clone(), self.store_id); + if self.config.use_checkpoint_v3 { + Box::new(CheckpointV3FlushObserver::new( + self.scheduler.clone(), + self.meta_client.clone(), + self.subs.clone(), + basic, + )) + } else { + Box::new(CheckpointV2FlushObserver::new( + self.meta_client.clone(), + self.make_flush_guard(), + self.subs.clone(), + basic, + )) + } + } + /// Convert a batch of events to the cmd batch, and update the resolver status. fn record_batch(subs: SubscriptionTracer, batch: CmdBatch) -> Option { let region_id = batch.region_id; @@ -366,7 +429,7 @@ where Some(kvs) } - fn backup_batch(&self, batch: CmdBatch) { + fn backup_batch(&self, batch: CmdBatch, work: Work) { let mut sw = StopWatch::new(); let router = self.range_router.clone(); @@ -396,7 +459,8 @@ where } HANDLE_EVENT_DURATION_HISTOGRAM .with_label_values(&["save_to_temp_file"]) - .observe(time_cost) + .observe(time_cost); + drop(work) }); } @@ -410,6 +474,7 @@ where self.scheduler.clone(), self.initial_scan_memory_quota.clone(), self.pool.handle().clone(), + self.initial_scan_throughput_quota.clone(), ) } @@ -450,20 +515,20 @@ where "end_key" => utils::redact(&end_key), ); } - self.spawn_at_scan_pool(move || { - let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); - match range_init_result { - Ok(()) => { - info!("backup stream success to initialize"; + // Assuming the `region info provider` would read region info form `StoreMeta` directly and this would be fast. + // If this gets slow, maybe make it async again. (Will that bring race conditions? say `Start` handled after `ResfreshResolver` of some region.) 
+ let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); + match range_init_result { + Ok(()) => { + info!("backup stream success to initialize"; "start_key" => utils::redact(&start_key), "end_key" => utils::redact(&end_key), "take" => ?start.saturating_elapsed(),) - } - Err(e) => { - e.report("backup stream initialize failed"); - } } - }); + Err(e) => { + e.report("backup stream initialize failed"); + } + } Ok(()) } @@ -483,6 +548,7 @@ where let cli = self.meta_client.clone(); let init = self.make_initial_loader(); let range_router = self.range_router.clone(); + let use_v3 = self.config.use_checkpoint_v3; info!( "register backup stream task"; @@ -503,50 +569,46 @@ where }), ); self.pool.block_on(async move { - let task_name = task.info.get_name(); - match cli.ranges_of_task(task_name).await { - Ok(ranges) => { - info!( - "register backup stream ranges"; - "task" => ?task, - "ranges-count" => ranges.inner.len(), - ); - let ranges = ranges - .inner - .into_iter() - .map(|(start_key, end_key)| { - (utils::wrap_key(start_key), utils::wrap_key(end_key)) - }) - .collect::>(); - if let Err(err) = range_router - .register_task(task.clone(), ranges.clone()) - .await - { - err.report(format!( - "failed to register backup stream task {}", - task.info.name - )); - return; - } - - for (start_key, end_key) in ranges { - let init = init.clone(); - - self.observe_and_scan_region(init, &task, start_key, end_key) - .await - .unwrap(); - } - info!( - "finish register backup stream ranges"; - "task" => ?task, - ); + let task_clone = task.clone(); + let run = async move { + let task_name = task.info.get_name(); + if !use_v3 { + cli.init_task(&task.info).await?; } - Err(e) => { - e.report(format!( - "failed to register backup stream task {} to router: ranges not found", - task.info.get_name() - )); + let ranges = cli.ranges_of_task(task_name).await?; + info!( + "register backup stream ranges"; + "task" => ?task, + "ranges-count" => ranges.inner.len(), + ); + 
let ranges = ranges + .inner + .into_iter() + .map(|(start_key, end_key)| { + (utils::wrap_key(start_key), utils::wrap_key(end_key)) + }) + .collect::>(); + range_router + .register_task(task.clone(), ranges.clone()) + .await?; + + for (start_key, end_key) in ranges { + let init = init.clone(); + + self.observe_and_scan_region(init, &task, start_key, end_key) + .await? } + info!( + "finish register backup stream ranges"; + "task" => ?task, + ); + Result::Ok(()) + }; + if let Err(e) = run.await { + e.report(format!( + "failed to register backup stream task {} to router: ranges not found", + task_clone.info.get_name() + )); } }); metrics::update_task_status(TaskStatus::Running, &task_name); @@ -557,7 +619,7 @@ where } fn pause_guard_duration(&self) -> Duration { - ReadableDuration::hours(24).0 + ReadableDuration::hours(CHECKPOINT_SAFEPOINT_TTL_IF_ERROR).0 } pub fn on_pause(&self, task: &str) { @@ -588,14 +650,22 @@ where pub fn on_unregister(&self, task: &str) -> Option { let info = self.unload_task(task); - - // reset the checkpoint ts of the task so it won't mislead the metrics. - metrics::STORE_CHECKPOINT_TS - .with_label_values(&[task]) - .set(0); + self.remove_metrics_after_unregister(task); info } + fn remove_metrics_after_unregister(&self, task: &str) { + // remove metrics of the task so it won't mislead the metrics. + let _ = metrics::STORE_CHECKPOINT_TS + .remove_label_values(&[task]) + .map_err( + |err| info!("failed to remove checkpoint ts metric"; "task" => task, "err" => %err), + ); + let _ = metrics::remove_task_status_metric(task).map_err( + |err| info!("failed to remove checkpoint ts metric"; "task" => task, "err" => %err), + ); + } + /// unload a task from memory: this would stop observe the changes required by the task temporarily. fn unload_task(&self, task: &str) -> Option { let router = self.range_router.clone(); @@ -607,357 +677,123 @@ where self.pool.block_on(router.unregister_task(task)) } - /// try advance the resolved ts by the pd tso. 
- async fn try_resolve( - cm: &ConcurrencyManager, - pd_client: Arc, - resolvers: SubscriptionTracer, - ) -> TimeStamp { - let pd_tso = pd_client - .get_tso() - .await - .map_err(|err| Error::from(err).report("failed to get tso from pd")) - .unwrap_or_default(); - cm.update_max_ts(pd_tso); - let min_ts = cm.global_min_lock_ts().unwrap_or(TimeStamp::max()); - let tso = Ord::min(pd_tso, min_ts); - let ts = resolvers.resolve_with(tso); - resolvers.warn_if_gap_too_huge(ts); - ts - } - - async fn flush_for_task( - task: String, - store_id: u64, - router: Router, - pd_cli: Arc, - resolvers: SubscriptionTracer, - meta_cli: MetadataClient, - concurrency_manager: ConcurrencyManager, - ) { - let start = Instant::now_coarse(); - // NOTE: Maybe push down the resolve step to the router? - // Or if there are too many duplicated `Flush` command, we may do some useless works. - let new_rts = Self::try_resolve(&concurrency_manager, pd_cli.clone(), resolvers).await; - #[cfg(feature = "failpoints")] - fail::fail_point!("delay_on_flush"); - metrics::FLUSH_DURATION - .with_label_values(&["resolve_by_now"]) - .observe(start.saturating_elapsed_secs()); - if let Some(rts) = router.do_flush(&task, store_id, new_rts).await { - info!("flushing and refreshing checkpoint ts."; - "checkpoint_ts" => %rts, - "task" => %task, - ); - if rts == 0 { - // We cannot advance the resolved ts for now. - return; + /// Make a guard for checking whether we can flush the checkpoint ts. 
+ fn make_flush_guard(&self) -> impl FnOnce() -> bool + Send { + let failover = self.failover_time; + let flush_duration = self.config.max_flush_interval; + move || { + if failover + .as_ref() + .map(|failover_t| failover_t.saturating_elapsed() < flush_duration.0 * 2) + .unwrap_or(false) + { + warn!("during failover, skipping advancing resolved ts"; + "failover_time_ago" => ?failover.map(|failover_t| failover_t.saturating_elapsed())); + return false; } let in_flight = crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.load(Ordering::SeqCst); if in_flight > 0 { warn!("inflight leader detected, skipping advancing resolved ts"; "in_flight" => %in_flight); - return; - } - if let Err(err) = pd_cli - .update_service_safe_point( - format!("backup-stream-{}-{}", task, store_id), - TimeStamp::new(rts), - // Add a service safe point for 30 mins (6x the default flush interval). - // It would probably be safe. - Duration::from_secs(1800), - ) - .await - { - Error::from(err).report("failed to update service safe point!"); - // don't give up? + return false; } - if let Err(err) = meta_cli.step_task(&task, rts).await { - err.report(format!("on flushing task {}", task)); - // we can advance the progress at next time. - // return early so we won't be mislead by the metrics. - return; - } - metrics::STORE_CHECKPOINT_TS - // Currently, we only support one task at the same time, - // so use the task as label would be ok. - .with_label_values(&[task.as_str()]) - .set(rts as _) + true } } - pub fn on_force_flush(&self, task: String, store_id: u64) { - let router = self.range_router.clone(); - let cli = self.meta_client.clone(); + fn prepare_min_ts(&self) -> future![TimeStamp] { let pd_cli = self.pd_client.clone(); - let resolvers = self.subs.clone(); let cm = self.concurrency_manager.clone(); - self.pool.spawn(async move { - let info = router.get_task_info(&task).await; - // This should only happen in testing, it would be to unwrap... 
- let _ = info.unwrap().set_flushing_status_cas(false, true); - Self::flush_for_task(task, store_id, router, pd_cli, resolvers, cli, cm).await; - }); - } - - pub fn on_flush(&self, task: String, store_id: u64) { - let router = self.range_router.clone(); - let cli = self.meta_client.clone(); - let pd_cli = self.pd_client.clone(); - let resolvers = self.subs.clone(); - let cm = self.concurrency_manager.clone(); - self.pool.spawn(Self::flush_for_task( - task, store_id, router, pd_cli, resolvers, cli, cm, - )); + async move { + let pd_tso = pd_cli + .get_tso() + .await + .map_err(|err| Error::from(err).report("failed to get tso from pd")) + .unwrap_or_default(); + cm.update_max_ts(pd_tso); + let min_ts = cm.global_min_lock_ts().unwrap_or(TimeStamp::max()); + Ord::min(pd_tso, min_ts) + } } - /// Start observe over some region. - /// This would modify some internal state, and delegate the task to InitialLoader::observe_over. - fn observe_over(&self, region: &Region, handle: ObserveHandle) -> Result<()> { - let init = self.make_initial_loader(); - let region_id = region.get_id(); - self.subs.register_region(region, handle.clone(), None); - init.observe_over_with_retry(region, || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; - Ok(()) + fn get_resolved_regions(&self, min_ts: TimeStamp) -> future![Result] { + let (tx, rx) = oneshot::channel(); + let op = self.region_operator.clone(); + async move { + let req = ObserveOp::ResolveRegions { + callback: Box::new(move |rs| { + let _ = tx.send(rs); + }), + min_ts, + }; + op.request(req).await; + rx.await + .map_err(|err| annotate!(err, "failed to send request for resolve regions")) + } } - fn observe_over_with_initial_data_from_checkpoint( - &self, - region: &Region, - task: String, - handle: ObserveHandle, - ) -> Result<()> { - let init = self.make_initial_loader(); - - let meta_cli = self.meta_client.clone(); - let last_checkpoint = TimeStamp::new( - self.pool - 
.block_on(meta_cli.global_progress_of_task(&task))?, - ); - self.subs - .register_region(region, handle.clone(), Some(last_checkpoint)); - - let region_id = region.get_id(); - let snap = init.observe_over_with_retry(region, move || { - ChangeObserver::from_pitr(region_id, handle.clone()) - })?; - let region = region.clone(); - - // we should not spawn initial scanning tasks to the tokio blocking pool - // beacuse it is also used for converting sync File I/O to async. (for now!) - // In that condition, if we blocking for some resouces(for example, the `MemoryQuota`) - // at the block threads, we may meet some ghosty deadlock. - self.spawn_at_scan_pool(move || { - let begin = Instant::now_coarse(); - match init.do_initial_scan(®ion, last_checkpoint, snap) { - Ok(stat) => { - info!("initial scanning of leader transforming finished!"; "takes" => ?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); - utils::record_cf_stat("lock", &stat.lock); - utils::record_cf_stat("write", &stat.write); - utils::record_cf_stat("default", &stat.data); + fn do_flush(&self, task: String, min_ts: TimeStamp) -> future![Result<()>] { + let get_rts = self.get_resolved_regions(min_ts); + let router = self.range_router.clone(); + let store_id = self.store_id; + let mut flush_ob = self.flush_observer(); + async move { + let mut resolved = get_rts.await?; + let mut new_rts = resolved.global_checkpoint(); + #[cfg(feature = "failpoints")] + fail::fail_point!("delay_on_flush"); + flush_ob.before(resolved.take_region_checkpoints()).await; + if let Some(rewritten_rts) = flush_ob.rewrite_resolved_ts(&task).await { + info!("rewriting resolved ts"; "old" => %new_rts, "new" => %rewritten_rts); + new_rts = rewritten_rts.min(new_rts); + } + if let Some(rts) = router.do_flush(&task, store_id, new_rts).await { + info!("flushing and refreshing checkpoint ts."; + "checkpoint_ts" => %rts, + "task" => %task, + ); + if rts == 0 { + // We cannot advance the resolved ts for 
now. + return Ok(()); } - Err(err) => err.report(format!("during initial scanning of region {:?}", region)), + flush_ob.after(&task, rts).await? } + Ok(()) + } + } + + pub fn on_force_flush(&self, task: String) { + self.pool.block_on(async move { + let info = self.range_router.get_task_info(&task).await; + // This should only happen in testing, it would be to unwrap... + let _ = info.unwrap().set_flushing_status_cas(false, true); + let mts = self.prepare_min_ts().await; + try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); }); - Ok(()) } - // spawn a task at the scan pool. - fn spawn_at_scan_pool(&self, task: impl FnOnce() + Send + 'static) { - self.scan_pool.spawn(move |_: &mut YatpHandle<'_>| { - tikv_alloc::add_thread_memory_accessor(); - let _io_guard = file_system::WithIOType::new(file_system::IOType::Replication); - task(); - tikv_alloc::remove_thread_memory_accessor(); + pub fn on_flush(&self, task: String) { + self.pool.block_on(async move { + let mts = self.prepare_min_ts().await; + info!("min_ts prepared for flushing"; "min_ts" => %mts); + try_send!(self.scheduler, Task::FlushWithMinTs(task, mts)); }) } - fn find_task_by_region(&self, r: &Region) -> Option { - self.range_router - .find_task_by_range(&r.start_key, &r.end_key) + fn on_flush_with_min_ts(&self, task: String, min_ts: TimeStamp) { + self.pool.spawn(self.do_flush(task, min_ts).map(|r| { + if let Err(err) = r { + err.report("during updating flush status") + } + })); } /// Modify observe over some region. /// This would register the region to the RaftStore. 
pub fn on_modify_observe(&self, op: ObserveOp) { - info!("backup stream: on_modify_observe"; "op" => ?op); - match op { - ObserveOp::Start { - region, - needs_initial_scanning, - } => { - #[cfg(feature = "failpoints")] - fail::fail_point!("delay_on_start_observe"); - self.start_observe(region, needs_initial_scanning); - metrics::INITIAL_SCAN_REASON - .with_label_values(&["leader-changed"]) - .inc(); - crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_sub(1, Ordering::SeqCst); - } - ObserveOp::Stop { ref region } => { - self.subs.deregister_region(region, |_, _| true); - } - ObserveOp::CheckEpochAndStop { ref region } => { - self.subs.deregister_region(region, |old, new| { - raftstore::store::util::compare_region_epoch( - old.meta.get_region_epoch(), - new, - true, - true, - false, - ) - .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) - .is_ok() - }); - } - ObserveOp::RefreshResolver { ref region } => { - let need_refresh_all = !self.subs.try_update_region(region); - - if need_refresh_all { - let canceled = self.subs.deregister_region(region, |_, _| true); - let handle = ObserveHandle::new(); - if canceled { - let for_task = self.find_task_by_region(region).unwrap_or_else(|| { - panic!( - "BUG: the region {:?} is register to no task but being observed", - region - ) - }); - metrics::INITIAL_SCAN_REASON - .with_label_values(&["region-changed"]) - .inc(); - if let Err(e) = self.observe_over_with_initial_data_from_checkpoint( - region, - for_task, - handle.clone(), - ) { - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region: region.clone(), - handle, - err: Box::new(e) - }) - ); - } - } - } - } - ObserveOp::NotifyFailToStartObserve { - region, - handle, - err, - } => { - info!("retry observe region"; "region" => %region.get_id(), "err" => %err); - // No need for retrying observe canceled. 
- if err.error_code() == error_code::backup_stream::OBSERVE_CANCELED { - return; - } - match self.retry_observe(region, handle) { - Ok(()) => {} - Err(e) => { - try_send!( - self.scheduler, - Task::FatalError( - format!("While retring to observe region, origin error is {}", err), - Box::new(e) - ) - ); - } - } - } - } - } - - fn start_observe(&self, region: Region, needs_initial_scanning: bool) { - let handle = ObserveHandle::new(); - let result = if needs_initial_scanning { - match self.find_task_by_region(®ion) { - None => { - warn!( - "the region {:?} is register to no task but being observed (start_key = {}; end_key = {}; task_stat = {:?}): maybe stale, aborting", - region, - utils::redact(®ion.get_start_key()), - utils::redact(®ion.get_end_key()), - self.range_router - ); - return; - } - - Some(for_task) => self.observe_over_with_initial_data_from_checkpoint( - ®ion, - for_task, - handle.clone(), - ), - } - } else { - self.observe_over(®ion, handle.clone()) - }; - if let Err(err) = result { - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region, - handle, - err: Box::new(err) - }) - ); - } + self.pool.block_on(self.region_operator.request(op)); } - fn retry_observe(&self, region: Region, handle: ObserveHandle) -> Result<()> { - let (tx, rx) = crossbeam::channel::bounded(1); - self.regions - .find_region_by_id( - region.get_id(), - Box::new(move |item| { - tx.send(item) - .expect("BUG: failed to send to newly created channel."); - }), - ) - .map_err(|err| { - annotate!( - err, - "failed to send request to region info accessor, server maybe too too too busy. 
(region id = {})", - region.get_id() - ) - })?; - let new_region_info = rx - .recv() - .map_err(|err| annotate!(err, "BUG?: unexpected channel message dropped."))?; - if new_region_info.is_none() { - metrics::SKIP_RETRY - .with_label_values(&["region-absent"]) - .inc(); - return Ok(()); - } - let new_region_info = new_region_info.unwrap(); - if new_region_info.role != StateRole::Leader { - metrics::SKIP_RETRY.with_label_values(&["not-leader"]).inc(); - return Ok(()); - } - let removed = self.subs.deregister_region(®ion, |old, _| { - let should_remove = old.handle().id == handle.id; - if !should_remove { - warn!("stale retry command"; "region" => ?region, "handle" => ?handle, "old_handle" => ?old.handle()); - } - should_remove - }); - if !removed { - metrics::SKIP_RETRY - .with_label_values(&["stale-command"]) - .inc(); - return Ok(()); - } - metrics::INITIAL_SCAN_REASON - .with_label_values(&["retry"]) - .inc(); - self.start_observe(region, true); - Ok(()) - } - - pub fn run_task(&self, task: Task) { + pub fn run_task(&mut self, task: Task) { debug!("run backup stream task"; "task" => ?task, "store_id" => %self.store_id); let now = Instant::now_coarse(); let label = task.label(); @@ -968,9 +804,9 @@ where match task { Task::WatchTask(op) => self.handle_watch_task(op), Task::BatchEvent(events) => self.do_backup(events), - Task::Flush(task) => self.on_flush(task, self.store_id), + Task::Flush(task) => self.on_flush(task), Task::ModifyObserve(op) => self.on_modify_observe(op), - Task::ForceFlush(task) => self.on_force_flush(task, self.store_id), + Task::ForceFlush(task) => self.on_force_flush(task), Task::FatalError(task, err) => self.on_fatal_error(task, err), Task::ChangeConfig(_) => { warn!("change config online isn't supported for now.") @@ -986,25 +822,54 @@ where }); } } + Task::MarkFailover(t) => self.failover_time = Some(t), + Task::FlushWithMinTs(task, min_ts) => self.on_flush_with_min_ts(task, min_ts), + Task::RegionCheckpointsOp(s) => 
self.handle_region_checkpoints_op(s), + } + } + + pub fn handle_region_checkpoints_op(&mut self, op: RegionCheckpointOperation) { + match op { + RegionCheckpointOperation::Update(u) => { + // Let's clear all stale checkpoints first. + // Or they may slow down the global checkpoint. + self.checkpoint_mgr.clear(); + for (region, checkpoint) in u { + debug!("setting region checkpoint"; "region" => %region.get_id(), "ts" => %checkpoint); + self.checkpoint_mgr + .update_region_checkpoint(®ion, checkpoint) + } + } + RegionCheckpointOperation::Get(g, cb) => { + let _guard = self.pool.handle().enter(); + match g { + RegionSet::Universal => cb(self + .checkpoint_mgr + .get_all() + .into_iter() + .map(|c| GetCheckpointResult::ok(c.region.clone(), c.checkpoint)) + .collect()), + RegionSet::Regions(rs) => cb(rs + .iter() + .map(|(id, version)| { + self.checkpoint_mgr + .get_from_region(RegionIdWithVersion::new(*id, *version)) + }) + .collect()), + } + } } } pub fn do_backup(&self, events: Vec) { + let wg = CallbackWaitGroup::new(); for batch in events { - self.backup_batch(batch) + self.backup_batch(batch, wg.clone().work()); } + self.pool.block_on(wg.wait()) } } -type ScanPool = yatp::ThreadPool; - -/// Create a yatp pool for doing initial scanning. 
-fn create_scan_pool(num_threads: usize) -> ScanPool { - yatp::Builder::new("log-backup-scan") - .max_thread_count(num_threads) - .build_callback_pool() -} - /// Create a standard tokio runtime /// (which allows io and time reactor, involve thread memory accessor), fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult { @@ -1026,12 +891,32 @@ fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult), +} + +pub enum RegionCheckpointOperation { + Update(Vec<(Region, TimeStamp)>), + Get(RegionSet, Box) + Send>), +} + +impl fmt::Debug for RegionCheckpointOperation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Update(arg0) => f.debug_tuple("Update").field(arg0).finish(), + Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), + } + } +} + pub enum Task { WatchTask(TaskOp), BatchEvent(Vec), ChangeConfig(ConfigChange), - /// Flush the task with name. - Flush(String), /// Change the observe status of some region. ModifyObserve(ObserveOp), /// Convert status of some task into `flushing` and do flush then. @@ -1047,6 +932,18 @@ pub enum Task { // This returns `true`. Box bool + Send>, ), + /// Mark the store as a failover store. + /// This would prevent store from updating its checkpoint ts for a while. + /// Because we are not sure whether the regions in the store have new leader -- + /// we keep a safe checkpoint so they can choose a safe `from_ts` for initial scanning. + MarkFailover(Instant), + /// Flush the task with name. + Flush(String), + /// Execute the flush with the calculated `min_ts`. + /// This is an internal command only issued by the `Flush` task. + FlushWithMinTs(String, TimeStamp), + /// The command for getting region checkpoints. + RegionCheckpointsOp(RegionCheckpointOperation), } #[derive(Debug)] @@ -1057,19 +954,21 @@ pub enum TaskOp { ResumeTask(String), } -#[derive(Debug)] +/// The callback for resolving region. 
+type ResolveRegionsCallback = Box; + pub enum ObserveOp { Start { region: Region, - // if `true`, would scan and sink change from the global checkpoint ts. - // Note: maybe we'd better make it Option to make it more generic, - // but that needs the `observer` know where the checkpoint is, which is a little dirty... - needs_initial_scanning: bool, }, Stop { region: Region, }, - CheckEpochAndStop { + /// Destroy the region subscription. + /// Unlike `Stop`, this will assume the region would never go back. + /// For now, the effect of "never go back" is that we won't try to hint other store + /// the checkpoint ts of this region. + Destroy { region: Region, }, RefreshResolver { @@ -1080,6 +979,39 @@ pub enum ObserveOp { handle: ObserveHandle, err: Box, }, + ResolveRegions { + callback: ResolveRegionsCallback, + min_ts: TimeStamp, + }, +} + +impl std::fmt::Debug for ObserveOp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Start { region } => f.debug_struct("Start").field("region", region).finish(), + Self::Stop { region } => f.debug_struct("Stop").field("region", region).finish(), + Self::Destroy { region } => f.debug_struct("Destroy").field("region", region).finish(), + Self::RefreshResolver { region } => f + .debug_struct("RefreshResolver") + .field("region", region) + .finish(), + Self::NotifyFailToStartObserve { + region, + handle, + err, + } => f + .debug_struct("NotifyFailToStartObserve") + .field("region", region) + .field("handle", handle) + .field("err", err) + .finish(), + Self::ResolveRegions { min_ts, .. } => f + .debug_struct("ResolveRegions") + .field("min_ts", min_ts) + .field("callback", &format_args!("fn {{ .. }}")) + .finish(), + } + } } impl fmt::Debug for Task { @@ -1098,6 +1030,16 @@ impl fmt::Debug for Task { f.debug_tuple("FatalError").field(task).field(err).finish() } Self::Sync(..) 
=> f.debug_tuple("Sync").finish(), + Self::MarkFailover(t) => f + .debug_tuple("MarkFailover") + .field(&format_args!("{:?} ago", t.saturating_elapsed())) + .finish(), + Self::FlushWithMinTs(arg0, arg1) => f + .debug_tuple("FlushWithMinTs") + .field(arg0) + .field(arg1) + .finish(), + Self::RegionCheckpointsOp(s) => f.debug_tuple("GetRegionCheckpoints").field(s).finish(), } } } @@ -1123,13 +1065,17 @@ impl Task { Task::ModifyObserve(o) => match o { ObserveOp::Start { .. } => "modify_observe.start", ObserveOp::Stop { .. } => "modify_observe.stop", - ObserveOp::CheckEpochAndStop { .. } => "modify_observe.check_epoch_and_stop", + ObserveOp::Destroy { .. } => "modify_observe.destroy", ObserveOp::RefreshResolver { .. } => "modify_observe.refresh_resolver", ObserveOp::NotifyFailToStartObserve { .. } => "modify_observe.retry", + ObserveOp::ResolveRegions { .. } => "modify_observe.resolve", }, Task::ForceFlush(_) => "force_flush", Task::FatalError(..) => "fatal_error", Task::Sync(..) => "sync", + Task::MarkFailover(_) => "mark_failover", + Task::FlushWithMinTs(..) => "flush_with_min_ts", + Task::RegionCheckpointsOp(..) => "get_checkpoints", } } } diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index a4d4515c213..b049b0a29be 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -120,7 +120,10 @@ where #[macro_export(crate)] macro_rules! 
annotate { ($inner: expr, $message: expr) => { - Error::Other(tikv_util::box_err!("{}: {}", $message, $inner)) + { + use tikv_util::box_err; + $crate::errors::Error::Other(box_err!("{}: {}", $message, $inner)) + } }; ($inner: expr, $format: literal, $($args: expr),+) => { annotate!($inner, format_args!($format, $($args),+)) diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index d791ce6a825..fdba0194000 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -20,7 +20,12 @@ use tikv::storage::{ txn::{EntryBatch, TxnEntry, TxnEntryScanner}, Snapshot, Statistics, }; -use tikv_util::{box_err, time::Instant, warn, worker::Scheduler}; +use tikv_util::{ + box_err, + time::{Instant, Limiter}, + warn, + worker::Scheduler, +}; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use txn_types::{Key, Lock, TimeStamp}; @@ -69,8 +74,12 @@ impl PendingMemoryQuota { /// EventLoader transforms data from the snapshot into ApplyEvent. pub struct EventLoader { scanner: DeltaScanner, + // pooling the memory. + entry_batch: EntryBatch, } +const ENTRY_BATCH_SIZE: usize = 1024; + impl EventLoader { pub fn load_from( snapshot: S, @@ -93,20 +102,31 @@ impl EventLoader { from_ts, to_ts, region_id ))?; - Ok(Self { scanner }) + Ok(Self { + scanner, + entry_batch: EntryBatch::with_capacity(ENTRY_BATCH_SIZE), + }) + } + + /// Scan a batch of events from the snapshot, and save them into the internal buffer. + fn fill_entries(&mut self) -> Result { + assert!( + self.entry_batch.is_empty(), + "EventLoader: the entry batch isn't empty when filling entries, which is error-prone, please call `omit_entries` first. (len = {})", + self.entry_batch.len() + ); + self.scanner.scan_entries(&mut self.entry_batch)?; + Ok(self.scanner.take_statistics()) } - /// scan a batch of events from the snapshot. Tracking the locks at the same time. 
- /// note: maybe make something like [`EntryBatch`] for reducing allocation. - fn scan_batch( + /// Drain the internal buffer, converting them to the [`ApplyEvents`], + /// and tracking the locks at the same time. + fn omit_entries_to( &mut self, - batch_size: usize, result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, - ) -> Result { - let mut b = EntryBatch::with_capacity(batch_size); - self.scanner.scan_entries(&mut b)?; - for entry in b.drain() { + ) -> Result<()> { + for entry in self.entry_batch.drain() { match entry { TxnEntry::Prewrite { default: (key, value), @@ -149,7 +169,7 @@ impl EventLoader { } } } - Ok(self.scanner.take_statistics()) + Ok(()) } } @@ -158,15 +178,16 @@ impl EventLoader { /// Note: maybe we can merge those two structures? #[derive(Clone)] pub struct InitialDataLoader { - router: RT, - regions: R, + pub(crate) router: RT, + pub(crate) regions: R, // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? 
- sink: Router, - tracing: SubscriptionTracer, - scheduler: Scheduler, - quota: PendingMemoryQuota, - handle: tokio::runtime::Handle, + pub(crate) sink: Router, + pub(crate) tracing: SubscriptionTracer, + pub(crate) scheduler: Scheduler, + pub(crate) quota: PendingMemoryQuota, + pub(crate) handle: tokio::runtime::Handle, + pub(crate) limit: Limiter, _engine: PhantomData, } @@ -185,6 +206,7 @@ where sched: Scheduler, quota: PendingMemoryQuota, handle: tokio::runtime::Handle, + limiter: Limiter, ) -> Self { Self { router, @@ -195,6 +217,7 @@ where _engine: PhantomData, quota, handle, + limit: limiter, } } @@ -215,12 +238,17 @@ where Error::RaftRequest(pbe) => { !(pbe.has_epoch_not_match() || pbe.has_not_leader() - || pbe.get_message().contains("stale observe id")) + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) } Error::RaftStore(raftstore::Error::RegionNotFound(_)) | Error::RaftStore(raftstore::Error::NotLeader(..)) => false, _ => true, }; + e.report(format_args!( + "during getting initial snapshot for region {:?}; can retry = {}", + region, can_retry + )); last_err = match last_err { None => Some(e), Some(err) => Some(Error::Contextual { @@ -347,8 +375,14 @@ where let start = Instant::now(); loop { let mut events = ApplyEvents::with_capacity(1024, region.id); - let stat = - self.with_resolver(region, |r| event_loader.scan_batch(1024, &mut events, r))?; + let stat = event_loader.fill_entries()?; + let disk_read = self.with_resolver(region, |r| { + let (result, byte_size) = utils::with_record_read_throughput(|| { + event_loader.omit_entries_to(&mut events, r) + }); + result?; + Result::Ok(byte_size) + })?; if events.is_empty() { metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); return Ok(stats.stat); @@ -359,6 +393,7 @@ where let event_size = events.size(); let sched = self.scheduler.clone(); let permit = self.quota.pending(event_size); + self.limit.blocking_consume(disk_read as _); debug!("sending events to 
router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); metrics::HEAP_MEMORY.add(event_size as _); @@ -376,6 +411,7 @@ where region: &Region, start_ts: TimeStamp, snap: impl Snapshot, + on_finish: impl FnOnce() + Send + 'static, ) -> Result { let _guard = self.handle.enter(); // It is ok to sink more data than needed. So scan to +inf TS for convenance. @@ -405,6 +441,7 @@ where region_id )); } + on_finish() }); stats } @@ -425,10 +462,7 @@ where // At that time, we have nowhere to record the lock status of this region. let success = try_send!( self.scheduler, - Task::ModifyObserve(ObserveOp::Start { - region: r.region, - needs_initial_scanning: true - }) + Task::ModifyObserve(ObserveOp::Start { region: r.region }) ); if success { crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_add(1, Ordering::SeqCst); diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index a19b4b4fc2f..34dbfa33e4c 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -1,17 +1,23 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+#![feature(slice_group_by)] #![feature(result_flattening)] #![feature(assert_matches)] #![feature(test)] +mod checkpoint_manager; pub mod config; mod endpoint; pub mod errors; mod event_loader; pub mod metadata; -mod metrics; +pub(crate) mod metrics; pub mod observer; pub mod router; +mod service; +mod subscription_manager; mod subscription_track; mod utils; -pub use endpoint::{Endpoint, ObserveOp, Task}; +pub use checkpoint_manager::GetCheckpointResult; +pub use endpoint::{Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task}; +pub use service::Service; diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 5f0e8b85bed..07d93162e00 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -1,18 +1,26 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{collections::HashMap, fmt::Debug}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path, time::Duration}; -use kvproto::brpb::{StreamBackupError, StreamBackupTaskInfo}; +use kvproto::{ + brpb::{StreamBackupError, StreamBackupTaskInfo}, + metapb::Region, +}; use tikv_util::{defer, time::Instant, warn}; use tokio_stream::StreamExt; +use txn_types::TimeStamp; use super::{ keys::{self, KeyValue, MetaKey}, store::{ - GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, Subscription, WithRevision, + CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, PutOption, + Snapshot, Subscription, Transaction, WithRevision, }, }; -use crate::errors::{Error, Result}; +use crate::{ + debug, + errors::{ContextualResultExt, Error, Result}, +}; /// Some operations over stream backup metadata key space. 
#[derive(Clone)] @@ -64,6 +72,115 @@ impl PartialEq for MetadataEvent { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CheckpointProvider { + Store(u64), + Region { id: u64, version: u64 }, + Task, + Global, +} + +/// The polymorphic checkpoint. +/// The global checkpoint should be the minimal checkpoint of all checkpoints. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Checkpoint { + pub provider: CheckpointProvider, + pub ts: TimeStamp, +} + +impl Checkpoint { + pub fn from_kv(kv: &KeyValue) -> Result { + match std::str::from_utf8(kv.0.0.as_slice()) { + Ok(key) => Checkpoint::parse_from(Path::new(key), kv.1.as_slice()), + Err(_) => { + Ok(Checkpoint { + // The V1 checkpoint, maybe fill the store id? + provider: CheckpointProvider::Store(0), + ts: TimeStamp::new(parse_ts_from_bytes(kv.1.as_slice())?), + }) + } + } + } + + pub fn parse_from(path: &Path, checkpoint_ts: &[u8]) -> Result { + let segs = path.iter().map(|os| os.to_str()).collect::>(); + match segs.as_slice() { + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? + Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("region"), + Some(id), + Some(epoch), + .., + ] => Self::from_region_parse_result(id, epoch, checkpoint_ts) + .context(format_args!("during parsing key {}", path.display())), + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? + Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("store"), + Some(id), + .., + ] => Self::from_store_parse_result(id, checkpoint_ts) + .context(format_args!("during parsing key {}", path.display())), + [ + // We always use '/' as the path. + // NOTE: Maybe just `split` and don't use `path`? 
+ Some("/"), + Some("tidb"), + Some("br-stream"), + Some("checkpoint"), + Some(_task_name), + Some("central_global"), + ] => Ok(Self { + provider: CheckpointProvider::Global, + ts: TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?), + }), + _ => Err(Error::MalformedMetadata(format!( + "cannot parse path {}(segs = {:?}) as checkpoint", + path.display(), + segs + ))), + } + } + + fn from_store_parse_result(id: &str, checkpoint_ts: &[u8]) -> Result { + let provider_id = id + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let provider = CheckpointProvider::Store(provider_id); + let checkpoint = TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?); + Ok(Self { + provider, + ts: checkpoint, + }) + } + + fn from_region_parse_result(id: &str, version: &str, checkpoint_ts: &[u8]) -> Result { + let id = id + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let version = version + .parse::() + .map_err(|err| Error::MalformedMetadata(err.to_string()))?; + let checkpoint = TimeStamp::new(parse_ts_from_bytes(checkpoint_ts)?); + let provider = CheckpointProvider::Region { id, version }; + Ok(Self { + provider, + ts: checkpoint, + }) + } +} + impl MetadataEvent { fn from_watch_event(event: &KvEvent) -> Option { // Maybe report an error when the kv isn't present? @@ -126,6 +243,27 @@ impl MetadataClient { } } + /// Initialize a task: execute some general operations over the keys. + /// For now, it sets the checkpoint ts if there isn't one for the current store. 
+ pub async fn init_task(&self, task: &StreamBackupTaskInfo) -> Result<()> { + let if_present = Condition::new( + MetaKey::next_backup_ts_of(&task.name, self.store_id), + Ordering::Greater, + vec![], + ); + let txn = CondTransaction::new( + if_present, + Transaction::default(), + Transaction::default().put(KeyValue( + MetaKey::next_backup_ts_of(&task.name, self.store_id), + task.get_start_ts().to_be_bytes().to_vec(), + )), + ); + self.meta_store.txn_cond(txn).await + } + + /// Upload the last error information to the etcd. + /// This won't pause the task. Even this method would usually be paired with `pause`. pub async fn report_last_error(&self, name: &str, last_error: StreamBackupError) -> Result<()> { use protobuf::Message; let now = Instant::now(); @@ -284,7 +422,7 @@ impl MetadataClient { } /// forward the progress of some task. - pub async fn step_task(&self, task_name: &str, ts: u64) -> Result<()> { + pub async fn set_local_task_checkpoint(&self, task_name: &str, ts: u64) -> Result<()> { let now = Instant::now(); defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) @@ -298,6 +436,25 @@ impl MetadataClient { Ok(()) } + pub async fn get_local_task_checkpoint(&self, task_name: &str) -> Result { + let now = Instant::now(); + defer! { + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) + } + let snap = self.meta_store.snapshot().await?; + let ts = snap + .get(Keys::Key(MetaKey::next_backup_ts_of( + task_name, + self.store_id, + ))) + .await?; + + match ts.as_slice() { + [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), + [] => Ok(self.get_task_start_ts_checkpoint(task_name).await?.ts), + } + } + /// get all target ranges of some task. 
pub async fn ranges_of_task( &self, @@ -391,47 +548,58 @@ impl MetadataClient { Ok(task.unwrap().info.start_ts) } else { assert_eq!(items.len(), 1); - Self::parse_ts_from_bytes(items[0].1.as_slice()) + parse_ts_from_bytes(items[0].1.as_slice()) } } - /// get the global progress (the min next_backup_ts among all stores). - pub async fn global_progress_of_task(&self, task_name: &str) -> Result { + pub async fn checkpoints_of(&self, task_name: &str) -> Result> { let now = Instant::now(); defer! { - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_progress_get_global"]).observe(now.saturating_elapsed().as_secs_f64()) - } - let task = self.get_task(task_name).await?; - if task.is_none() { - return Err(Error::NoSuchTask { - task_name: task_name.to_owned(), - }); + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["checkpoints_of"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - let global_ts = snap.get(Keys::Prefix(MetaKey::next_backup_ts(task_name))) + let checkpoints = snap + .get(Keys::Prefix(MetaKey::next_backup_ts(task_name))) .await? .iter() .filter_map(|kv| { - Self::parse_ts_from_bytes(kv.1.as_slice()) + Checkpoint::from_kv(kv) .map_err(|err| warn!("br-stream: failed to parse next_backup_ts."; "key" => ?kv.0, "err" => %err)) .ok() }) - .min() - .unwrap_or(task.unwrap().info.start_ts); - Ok(global_ts) + .collect(); + Ok(checkpoints) } - fn parse_ts_from_bytes(next_backup_ts: &[u8]) -> Result { - if next_backup_ts.len() != 8 { - return Err(Error::MalformedMetadata(format!( - "the length of next_backup_ts is {} bytes, require 8 bytes", - next_backup_ts.len() - ))); - } - let mut buf = [0u8; 8]; - buf.copy_from_slice(next_backup_ts); - Ok(u64::from_be_bytes(buf)) + async fn get_task_start_ts_checkpoint(&self, task_name: &str) -> Result { + let task = self + .get_task(task_name) + .await? 
+ .ok_or_else(|| Error::NoSuchTask { + task_name: task_name.to_owned(), + })?; + Ok(Checkpoint { + ts: TimeStamp::new(task.info.start_ts), + provider: CheckpointProvider::Task, + }) + } + + /// Get the global checkpoint of a task. + /// It is the smallest checkpoint of all types of checkpoint. + pub async fn global_checkpoint_of_task(&self, task_name: &str) -> Result { + let cp = match self.global_checkpoint_of(task_name).await? { + Some(cp) => cp, + None => self.get_task_start_ts_checkpoint(task_name).await?, + }; + Ok(cp) + } + + /// get the global progress (the min next_backup_ts among all stores). + pub async fn global_progress_of_task(&self, task_name: &str) -> Result { + let cp = self.global_checkpoint_of_task(task_name).await?; + debug!("getting global progress of task"; "checkpoint" => ?cp); + let ts = cp.ts.into_inner(); + Ok(ts) } /// insert a task with ranges into the metadata store. @@ -464,4 +632,155 @@ impl MetadataClient { .delete(Keys::Key(MetaKey::task_of(name))) .await } + + /// upload a region-level checkpoint. 
+ pub async fn upload_region_checkpoint( + &self, + task_name: &str, + checkpoints: &[(Region, TimeStamp)], + ) -> Result<()> { + let txn = checkpoints + .iter() + .fold(Transaction::default(), |txn, (region, cp)| { + txn.put_opt( + KeyValue( + MetaKey::next_bakcup_ts_of_region(task_name, region), + (*cp).into_inner().to_be_bytes().to_vec(), + ), + PutOption { + ttl: Duration::from_secs(600), + }, + ) + }); + self.meta_store.txn(txn).await + } + + pub async fn clear_region_checkpoint(&self, task_name: &str, regions: &[Region]) -> Result<()> { + let txn = regions.iter().fold(Transaction::default(), |txn, region| { + txn.delete(Keys::Key(MetaKey::next_bakcup_ts_of_region( + task_name, region, + ))) + }); + self.meta_store.txn(txn).await + } + + pub async fn global_checkpoint_of(&self, task: &str) -> Result> { + let cps = self.checkpoints_of(task).await?; + let mut min_checkpoint = None; + for cp in cps { + match cp.provider { + CheckpointProvider::Store(..) => { + if min_checkpoint + .as_ref() + .map(|c: &Checkpoint| c.ts > cp.ts) + .unwrap_or(true) + { + min_checkpoint = Some(cp); + } + } + // The global checkpoint has higher priority than store checkpoint. + CheckpointProvider::Task | CheckpointProvider::Global => return Ok(Some(cp)), + CheckpointProvider::Region { .. 
} => continue, + } + } + Ok(min_checkpoint) + } + + pub async fn get_region_checkpoint(&self, task: &str, region: &Region) -> Result { + let key = MetaKey::next_bakcup_ts_of_region(task, region); + let s = self.meta_store.snapshot().await?; + let r = s.get(Keys::Key(key.clone())).await?; + match r.len() { + 0 => { + let global_cp = self.global_checkpoint_of(task).await?; + let cp = match global_cp { + None => self.get_task_start_ts_checkpoint(task).await?, + Some(cp) => cp, + }; + Ok(cp) + } + _ => Ok(Checkpoint::from_kv(&r[0])?), + } + } +} + +fn parse_ts_from_bytes(next_backup_ts: &[u8]) -> Result { + if next_backup_ts.len() != 8 { + return Err(Error::MalformedMetadata(format!( + "the length of next_backup_ts is {} bytes, require 8 bytes", + next_backup_ts.len() + ))); + } + let mut buf = [0u8; 8]; + buf.copy_from_slice(next_backup_ts); + Ok(u64::from_be_bytes(buf)) +} + +#[cfg(test)] +mod test { + use kvproto::metapb::{Region as RegionInfo, RegionEpoch}; + use txn_types::TimeStamp; + + use super::Checkpoint; + use crate::metadata::{ + client::CheckpointProvider, + keys::{KeyValue, MetaKey}, + }; + + #[test] + fn test_parse() { + struct Case { + provider: CheckpointProvider, + checkpoint: u64, + } + + fn run_case(c: Case) { + let key = match c.provider { + CheckpointProvider::Region { id, version } => { + let mut r = RegionInfo::new(); + let mut v = RegionEpoch::new(); + v.set_version(version); + r.set_region_epoch(v); + r.set_id(id); + MetaKey::next_bakcup_ts_of_region("test", &r) + } + CheckpointProvider::Store(id) => MetaKey::next_backup_ts_of("test", id), + _ => unreachable!(), + }; + let checkpoint = c.checkpoint; + let cp_bytes = checkpoint.to_be_bytes(); + let kv = KeyValue(key, cp_bytes.to_vec()); + let parsed = Checkpoint::from_kv(&kv).unwrap(); + assert_eq!( + parsed, + Checkpoint { + provider: c.provider, + ts: TimeStamp::new(c.checkpoint), + } + ); + } + use CheckpointProvider::*; + + let cases = vec![ + Case { + checkpoint: 
TimeStamp::compose(TimeStamp::physical_now(), 10).into_inner(), + provider: Region { id: 42, version: 8 }, + }, + Case { + checkpoint: u64::from_be_bytes(*b"let i=0;"), + provider: Store(3), + }, + Case { + checkpoint: u64::from_be_bytes(*b"(callcc)"), + provider: Region { + id: 16961, + version: 16, + }, + }, + ]; + + for case in cases { + run_case(case) + } + } } diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index be92da123ae..6920ba14a33 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use bytes::BufMut; +use kvproto::metapb::Region; const PREFIX: &str = "/tidb/br-stream"; const PATH_INFO: &str = "/info"; @@ -99,19 +99,36 @@ impl MetaKey { ranges } - /// The key of next backup ts of some region in some store. - pub fn next_backup_ts_of(name: &str, store_id: u64) -> Self { - let base = Self::next_backup_ts(name); - let mut buf = bytes::BytesMut::from(base.0.as_slice()); - buf.put_u64(store_id); - Self(buf.to_vec()) - } - // The prefix for next backup ts. pub fn next_backup_ts(name: &str) -> Self { Self(format!("{}{}/{}/", PREFIX, PATH_NEXT_BACKUP_TS, name).into_bytes()) } + /// The key of next backup ts of some region in some store. 
+ pub fn next_backup_ts_of(name: &str, store_id: u64) -> Self { + Self( + format!( + "{}{}/{}/store/{}", + PREFIX, PATH_NEXT_BACKUP_TS, name, store_id + ) + .into_bytes(), + ) + } + + pub fn next_bakcup_ts_of_region(name: &str, region: &Region) -> Self { + Self( + format!( + "{}{}/{}/region/{}/{}", + PREFIX, + PATH_NEXT_BACKUP_TS, + name, + region.id, + region.get_region_epoch().get_version() + ) + .into_bytes(), + ) + } + pub fn pause_prefix_len() -> usize { Self::pause_prefix().0.len() } @@ -129,8 +146,7 @@ impl MetaKey { Self(format!("{}{}/{}/{}", PREFIX, PATH_LAST_ERROR, name, store).into_bytes()) } - /// return the key that keeps the range [self, self.next()) contains only - /// `self`. + /// return the key that keeps the range [self, self.next()) contains only `self`. pub fn next(&self) -> Self { let mut next = self.clone(); next.0.push(0); diff --git a/components/backup-stream/src/metadata/metrics.rs b/components/backup-stream/src/metadata/metrics.rs index f4ea1258ab7..1dea498834e 100644 --- a/components/backup-stream/src/metadata/metrics.rs +++ b/components/backup-stream/src/metadata/metrics.rs @@ -16,4 +16,10 @@ lazy_static! { "metadata event(task_add, task_removed, error) count.", &["type"], }.unwrap(); + + pub static ref METADATA_KEY_OPERATION: IntCounterVec = register_int_counter_vec! 
{ + "tikv_log_backup_metadata_key_operation", + "the operation over keys", + &["type"], + }.unwrap(); } diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index a49eb305fa1..4c387533e49 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -6,5 +6,5 @@ mod metrics; pub mod store; mod test; -pub use client::{MetadataClient, MetadataEvent, StreamTask}; +pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs index 7da46ea5dbf..2b940c905cd 100644 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ b/components/backup-stream/src/metadata/store/etcd.rs @@ -1,21 +1,32 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{pin::Pin, sync::Arc}; +use std::{ + cmp::Ordering, + collections::{HashMap, HashSet}, + pin::Pin, + sync::Arc, + time::Duration, +}; use async_trait::async_trait; use etcd_client::{ - DeleteOptions, EventType, GetOptions, SortOrder, SortTarget, Txn, TxnOp, WatchOptions, + Client, Compare, CompareOp, DeleteOptions, EventType, GetOptions, PutOptions, SortOrder, + SortTarget, Txn, TxnOp, WatchOptions, }; use futures::StreamExt; use tikv_util::warn; use tokio::sync::Mutex; use tokio_stream::Stream; -use super::{GetExtra, GetResponse, Keys, KvChangeSubscription, KvEventType, MetaStore, Snapshot}; +use super::{ + GetExtra, GetResponse, Keys, KvChangeSubscription, KvEventType, MetaStore, Snapshot, + TransactionOp, +}; use crate::{ errors::Result, metadata::{ keys::{KeyValue, MetaKey}, + metrics::METADATA_KEY_OPERATION, store::{KvEvent, Subscription}, }, }; @@ -91,11 +102,6 @@ impl MetaStore for EtcdStore { }) } - async fn set(&self, pair: KeyValue) -> Result<()> { - self.0.lock().await.put(pair.0, 
pair.1, None).await?; - Ok(()) - } - async fn watch(&self, keys: Keys, start_rev: i64) -> Result { let mut opt = WatchOptions::new(); let key = prepare_opt!(opt, keys); @@ -128,6 +134,20 @@ impl MetaStore for EtcdStore { }) } + async fn txn(&self, t: super::Transaction) -> Result<()> { + let mut cli = self.0.lock().await; + let txns = Self::make_txn(&mut cli, t).await?; + for txn in txns { + cli.txn(txn).await?; + } + Ok(()) + } + + async fn set(&self, pair: KeyValue) -> Result<()> { + self.0.lock().await.put(pair.0, pair.1, None).await?; + Ok(()) + } + async fn delete(&self, keys: Keys) -> Result<()> { let mut opt = DeleteOptions::new(); let key = prepare_opt!(opt, keys); @@ -136,31 +156,114 @@ impl MetaStore for EtcdStore { Ok(()) } - async fn txn(&self, t: super::Transaction) -> Result<()> { - self.0.lock().await.txn(t.into()).await?; + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + let mut cli = self.0.lock().await; + let txn = Self::make_conditional_txn(&mut cli, txn).await?; + cli.txn(txn).await?; Ok(()) } } -impl From for Txn { - fn from(etcd_txn: super::Transaction) -> Txn { - let txn = Txn::default(); - txn.and_then( - etcd_txn - .into_ops() - .into_iter() - .map(|op| match op { - super::TransactionOp::Put(mut pair) => { - TxnOp::put(pair.take_key(), pair.take_value(), None) - } - super::TransactionOp::Delete(rng) => { - let mut opt = DeleteOptions::new(); - let key = prepare_opt!(opt, rng); - TxnOp::delete(key, Some(opt)) - } - }) - .collect::>(), - ) +impl EtcdStore { + fn collect_leases_needed(txn: &super::Transaction) -> HashSet { + txn.ops + .iter() + .filter_map(|op| match op { + TransactionOp::Put(_, opt) if opt.ttl.as_secs() > 0 => Some(opt.ttl), + _ => None, + }) + .collect() + } + + async fn make_leases( + cli: &mut Client, + needed: HashSet, + ) -> Result> { + let mut map = HashMap::with_capacity(needed.len()); + for lease_time in needed { + let lease_id = cli.lease_grant(lease_time.as_secs() as _, None).await?.id(); + 
map.insert(lease_time, lease_id);
+        }
+        Ok(map)
+    }
+
+    fn partition_txns(mut txn: super::Transaction, leases: HashMap) -> Vec {
+        txn.ops
+            .chunks_mut(128)
+            .map(|txn| Txn::default().and_then(Self::to_txn(txn, &leases)))
+            .collect()
+    }
+
+    fn to_compare(cond: super::Condition) -> Compare {
+        let op = match cond.result {
+            Ordering::Less => CompareOp::Less,
+            Ordering::Equal => CompareOp::Equal,
+            Ordering::Greater => CompareOp::Greater,
+        };
+        Compare::value(cond.over_key, op, cond.arg)
+    }
+
+    /// Convert the transaction operations to etcd transaction ops.
+    fn to_txn(ops: &mut [super::TransactionOp], leases: &HashMap) -> Vec {
+        ops.iter_mut().map(|op| match op {
+            TransactionOp::Put(key, opt) => {
+                let opts = if opt.ttl.as_secs() > 0 {
+                    let lease = leases.get(&opt.ttl);
+                    match lease {
+                        None => {
+                            warn!("lease not found, the request key may not have a ttl"; "dur" => ?opt.ttl);
+                            None
+                        }
+                        Some(lease_id) => {
+                            Some(PutOptions::new().with_lease(*lease_id))
+                        }
+                    }
+                } else {
+                    None
+                };
+                TxnOp::put(key.take_key(), key.take_value(), opts)
+            },
+            TransactionOp::Delete(rng) => {
+                let rng = std::mem::replace(rng, Keys::Key(MetaKey(vec![])));
+                let mut opt = DeleteOptions::new();
+                let key = prepare_opt!(opt, rng);
+                TxnOp::delete(key, Some(opt))
+            },
+        }).collect::>()
+    }
+
+    /// Make a conditional txn.
+    /// For now, this wouldn't split a huge transaction into smaller ones,
+    /// so when playing with etcd in PD, conditional transactions should be small.
+ async fn make_conditional_txn( + cli: &mut Client, + mut txn: super::CondTransaction, + ) -> Result { + let cond = Self::to_compare(txn.cond); + + let mut leases_needed = Self::collect_leases_needed(&txn.success); + leases_needed.extend(Self::collect_leases_needed(&txn.failure).into_iter()); + let leases = Self::make_leases(cli, leases_needed).await?; + let success = Self::to_txn(&mut txn.success.ops, &leases); + let failure = Self::to_txn(&mut txn.failure.ops, &leases); + Ok(Txn::new().when([cond]).and_then(success).or_else(failure)) + } + + async fn make_txn(cli: &mut Client, etcd_txn: super::Transaction) -> Result> { + let (put_cnt, delete_cnt) = etcd_txn.ops.iter().fold((0, 0), |(p, d), item| match item { + TransactionOp::Put(..) => (p + 1, d), + TransactionOp::Delete(_) => (p, d + 1), + }); + METADATA_KEY_OPERATION + .with_label_values(&["put"]) + .inc_by(put_cnt); + METADATA_KEY_OPERATION + .with_label_values(&["del"]) + .inc_by(delete_cnt); + let needed_leases = Self::collect_leases_needed(&etcd_txn); + let leases = Self::make_leases(cli, needed_leases).await?; + let txns = Self::partition_txns(etcd_txn, leases); + Ok(txns) } } diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 61145455419..7e1858b913e 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -140,4 +140,8 @@ impl MetaStore for LazyEtcdClient { async fn txn(&self, txn: super::Transaction) -> Result<()> { self.0.get_cli().await?.txn(txn).await } + + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + self.0.get_cli().await?.txn_cond(txn).await + } } diff --git a/components/backup-stream/src/metadata/store/mod.rs b/components/backup-stream/src/metadata/store/mod.rs index 58441d7ba72..0855582da59 100644 --- a/components/backup-stream/src/metadata/store/mod.rs +++ 
b/components/backup-stream/src/metadata/store/mod.rs
@@ -1,11 +1,16 @@
 // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+pub mod lazy_etcd;
+
+// Note: these mods are also used for integration tests,
+// so we cannot compile them only when `#[cfg(test)]`.
+// (See https://github.com/rust-lang/rust/issues/84629)
+// Maybe we'd better make a feature like `integration-test`?
 pub mod slash_etc;
 pub use slash_etc::SlashEtcStore;
 pub mod etcd;
-pub mod lazy_etcd;
-use std::{future::Future, pin::Pin};
+use std::{cmp::Ordering, future::Future, pin::Pin, time::Duration};
 use async_trait::async_trait;
 pub use etcd::EtcdStore;
@@ -23,25 +28,73 @@ pub struct Transaction {
     ops: Vec,
 }
 
+/// A condition for executing a transaction.
+/// Compares the value of a key with `arg`.
+#[derive(Debug)]
+pub struct Condition {
+    over_key: Vec,
+    result: Ordering,
+    arg: Vec,
+}
+
+impl Condition {
+    pub fn new(over_key: MetaKey, result: Ordering, arg: Vec) -> Self {
+        Self {
+            over_key: over_key.0,
+            result,
+            arg,
+        }
+    }
+}
+
+/// A conditional transaction.
+/// This would atomically evaluate the condition, and execute the corresponding transaction.
+#[derive(Debug)]
+pub struct CondTransaction {
+    cond: Condition,
+    success: Transaction,
+    failure: Transaction,
+}
+
+impl CondTransaction {
+    pub fn new(cond: Condition, success: Transaction, failure: Transaction) -> Self {
+        Self {
+            cond,
+            success,
+            failure,
+        }
+    }
+}
+
 impl Transaction {
     fn into_ops(self) -> Vec {
         self.ops
     }
 
-    fn put(mut self, kv: KeyValue) -> Self {
-        self.ops.push(TransactionOp::Put(kv));
+    pub fn put(mut self, kv: KeyValue) -> Self {
+        self.ops.push(TransactionOp::Put(kv, PutOption::default()));
         self
     }
 
-    fn delete(mut self, keys: Keys) -> Self {
+    pub fn put_opt(mut self, kv: KeyValue, opt: PutOption) -> Self {
+        self.ops.push(TransactionOp::Put(kv, opt));
+        self
+    }
+
+    pub fn delete(mut self, keys: Keys) -> Self {
         self.ops.push(TransactionOp::Delete(keys));
         self
     }
 }
 
+#[derive(Default, Debug)]
+pub struct PutOption {
+    pub ttl: Duration,
+}
+
 #[derive(Debug)]
 pub enum TransactionOp {
-    Put(KeyValue),
+    Put(KeyValue, PutOption),
     Delete(Keys),
 }
 
@@ -140,8 +193,9 @@ pub trait MetaStore: Clone + Send + Sync {
     /// Can be canceled then by polling the `cancel` future in the Subscription.
     async fn watch(&self, keys: Keys, start_rev: i64) -> Result;
     /// Execute an atomic write (write batch) over the store.
-    /// Maybe support etcd-like compare operations?
     async fn txn(&self, txn: Transaction) -> Result<()>;
+    /// Execute a conditional transaction over the store.
+    async fn txn_cond(&self, txn: CondTransaction) -> Result<()>;
     /// Set a key in the store.
     /// Maybe rename it to `put` to keeping consistency with etcd?
diff --git a/components/backup-stream/src/metadata/store/slash_etc.rs b/components/backup-stream/src/metadata/store/slash_etc.rs index 48df7dbaaca..1a2f127501c 100644 --- a/components/backup-stream/src/metadata/store/slash_etc.rs +++ b/components/backup-stream/src/metadata/store/slash_etc.rs @@ -8,14 +8,13 @@ use std::{ }; use async_trait::async_trait; -use slog_global::error; -use tikv_util::warn; use tokio::sync::{ mpsc::{self, Sender}, Mutex, }; use tokio_stream::StreamExt; +use super::{Condition, Keys}; use crate::{ errors::Result, metadata::{ @@ -33,11 +32,34 @@ struct Subscriber { tx: Sender, } +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!( + "{}@{}", + log_wrappers::Value::key(&self.0), + self.1 + )) + .finish() + } +} + +/// A value (maybe tombstone.) +#[derive(Debug, Eq, PartialEq, Clone)] +enum Value { + Val(Vec), + Del, +} + /// An in-memory, single versioned storage. /// Emulating some interfaces of etcd for testing. #[derive(Default)] pub struct SlashEtc { - items: BTreeMap, Vec>, + items: BTreeMap, // Maybe a range tree here if the test gets too slow. 
subs: HashMap, revision: i64, @@ -54,26 +76,14 @@ impl Snapshot for WithRevision { extra: crate::metadata::store::GetExtra, ) -> Result { let data = self.inner.lock().await; - if data.revision != self.revision { - warn!( - "snapshot expired (multi version isn't supported yet, you may read steal data): {} vs {}", - data.revision, self.revision - ); - } - let (start_key, end_key) = keys.into_bound(); - let mut kvs = data - .items - .range::<[u8], _>(( - Bound::Included(start_key.as_slice()), - Bound::Excluded(end_key.as_slice()), - )) - .map(|(k, v)| KeyValue(MetaKey(k.clone()), v.clone())) - .collect::>(); - // use iterator operations (instead of collect all kv pairs in the range) - // if the test case get too slow. (How can we figure out whether there are more?) + let mut kvs = data.get_key(keys); + if extra.desc_order { kvs.reverse(); } + + // use iterator operations (instead of collect all kv pairs in the range) + // if the test case get too slow. (How can we figure out whether there are more?) 
let more = if extra.limit > 0 { let more = kvs.len() > extra.limit; kvs.truncate(extra.limit); @@ -90,9 +100,37 @@ impl Snapshot for WithRevision { } impl SlashEtc { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + fn get_key(&self, keys: super::Keys) -> Vec { + let (start_key, end_key) = keys.into_bound(); + let mvccs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, 0)), + )) + .collect::>(); + let kvs = mvccs + .as_slice() + .group_by(|k1, k2| k1.0.0 == k2.0.0) + .filter_map(|k| { + let (k, v) = k.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del => None, + } + }) + .collect::>(); + kvs + } + async fn set(&mut self, mut pair: crate::metadata::keys::KeyValue) -> Result<()> { let data = self; - data.revision += 1; + let rev = data.alloc_rev(); for sub in data.subs.values() { if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { sub.tx @@ -104,33 +142,37 @@ impl SlashEtc { .unwrap(); } } - data.items.insert(pair.take_key(), pair.take_value()); + data.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); Ok(()) } async fn delete(&mut self, keys: crate::metadata::store::Keys) -> Result<()> { - let mut data = self; + let data = self; let (start_key, end_key) = keys.into_bound(); - data.revision += 1; - for mut victim in data + let rev = data.alloc_rev(); + let mut v = data .items - .range::<[u8], _>(( - Bound::Included(start_key.as_slice()), - Bound::Excluded(end_key.as_slice()), + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, data.revision)), )) - .map(|(k, _)| k.clone()) - .collect::>() - { - data.items.remove(&victim); + .map(|(k, _)| Key::clone(k)) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for mut victim in v { + let k = Key(victim.0.clone(), rev); + data.items.insert(k, Value::Del); for sub in data.subs.values() { - if 
victim.as_slice() < sub.end_key.as_slice()
-                    && victim.as_slice() >= sub.start_key.as_slice()
+                if victim.0.as_slice() < sub.end_key.as_slice()
+                    && victim.0.as_slice() >= sub.start_key.as_slice()
                 {
                     sub.tx
                         .send(KvEvent {
                             kind: KvEventType::Delete,
-                            pair: KeyValue(MetaKey(std::mem::take(&mut victim)), vec![]),
+                            pair: KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]),
                         })
                         .await
                         .unwrap();
@@ -139,6 +181,16 @@ impl SlashEtc {
         }
         Ok(())
     }
+
+    /// A tool for dumping the whole storage when a test failed.
+    /// Add this to test code temporarily for debugging.
+    #[allow(dead_code)]
+    pub fn dump(&self) {
+        println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision);
+        for (k, v) in self.items.iter() {
+            println!("{:?} => {:?}", k, v);
+        }
+    }
 }
 
 #[async_trait]
 impl MetaStore for SlashEtcStore {
@@ -158,17 +210,34 @@ impl MetaStore for SlashEtcStore {
         start_rev: i64,
     ) -> Result {
         let mut data = self.lock().await;
-        if start_rev != data.revision + 1 {
-            error!(
-                "start from arbitrary revision is not supported yet; only watch (current_rev + 1) supported. (self.revision = {}; start_rev = {})",
-                data.revision, start_rev
-            );
-        }
         let id = data.sub_id_alloc.get();
         data.sub_id_alloc.set(id + 1);
         let this = self.clone();
-        let (tx, rx) = mpsc::channel(64);
+        let (tx, rx) = mpsc::channel(1024);
         let (start_key, end_key) = keys.into_bound();
+
+        // Sending events from [start_rev, now) to the client.
+        let mut pending = data
+            .items
+            .iter()
+            .filter(|(k, _)| k.1 >= start_rev)
+            .collect::>();
+        pending.sort_by_key(|(k, _)| k.1);
+        for (k, v) in pending {
+            let event = match v {
+                Value::Val(val) => KvEvent {
+                    kind: KvEventType::Put,
+                    pair: KeyValue(MetaKey(k.0.clone()), val.clone()),
+                },
+                Value::Del => KvEvent {
+                    kind: KvEventType::Delete,
+                    pair: KeyValue(MetaKey(k.0.clone()), vec![]),
+                },
+            };
+            // Note: may panic if too many pending here?
+ tx.send(event).await.expect("too many pending events"); + } + data.subs.insert( id, Subscriber { @@ -190,10 +259,27 @@ impl MetaStore for SlashEtcStore { let mut data = self.lock().await; for op in txn.into_ops() { match op { - super::TransactionOp::Put(kv) => data.set(kv).await?, + super::TransactionOp::Put(kv, _) => data.set(kv).await?, super::TransactionOp::Delete(range) => data.delete(range).await?, } } Ok(()) } + + async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { + let l = self.lock().await; + let Condition { + over_key, + result, + arg, + } = txn.cond; + let success = l + .get_key(Keys::Key(MetaKey(over_key))) + .last() + .map(|k| k.0.0.cmp(&arg) == result) + .unwrap_or(false); + drop(l); + let do_txn = if success { txn.success } else { txn.failure }; + self.txn(do_txn).await + } } diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index bb5addd24a8..e70ed78b32c 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -7,13 +7,21 @@ use std::{ iter::FromIterator, }; -use kvproto::brpb::{Noop, StorageBackend}; +use kvproto::{ + brpb::{Noop, StorageBackend}, + metapb::Region, +}; use tokio_stream::StreamExt; +use txn_types::TimeStamp; use super::{MetadataClient, StreamTask}; use crate::{ errors::Result, - metadata::{store::SlashEtcStore, MetadataEvent}, + metadata::{ + client::{Checkpoint, CheckpointProvider}, + store::SlashEtcStore, + MetadataEvent, + }, }; fn test_meta_cli() -> MetadataClient { @@ -91,6 +99,12 @@ fn task_matches(expected: &[StreamTask], real: &[StreamTask]) { ); } +fn fake_region(id: u64) -> Region { + let mut r = Region::new(); + r.set_id(id); + r +} + #[tokio::test] async fn test_watch() -> Result<()> { let cli = test_meta_cli(); @@ -98,7 +112,7 @@ async fn test_watch() -> Result<()> { cli.insert_task_with_range(&task, &[]).await?; let initial_task_set = cli.get_tasks().await?; 
task_matches(initial_task_set.inner.as_slice(), &[task]); - let watcher = cli.events_from(initial_task_set.revision).await?; + let watcher = cli.events_from(initial_task_set.revision + 1).await?; let task2 = simple_task("simple_2"); cli.insert_task_with_range(&task2, &[]).await?; cli.remove_task("simple_1").await?; @@ -121,17 +135,97 @@ async fn test_progress() -> Result<()> { let cli = test_meta_cli(); let task = simple_task("simple_1"); cli.insert_task_with_range(&task, &[]).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, task.info.start_ts); - cli.step_task(&task.info.name, 42).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + cli.set_local_task_checkpoint(&task.info.name, 42).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, 42); - cli.step_task(&task.info.name, 43).await?; - let progress = cli.progress_of_task(&task.info.name).await?; + cli.set_local_task_checkpoint(&task.info.name, 43).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; assert_eq!(progress, 43); let other_store = MetadataClient::new(cli.meta_store.clone(), 43); - let progress = other_store.progress_of_task(&task.info.name).await?; - assert_eq!(progress, task.info.start_ts); + let progress = other_store + .get_local_task_checkpoint(&task.info.name) + .await?; + assert_eq!(progress.into_inner(), task.info.start_ts); + + Ok(()) +} + +#[tokio::test] +async fn test_init() -> Result<()> { + let cli = test_meta_cli(); + let mut task = simple_task("simple_2"); + cli.insert_task_with_range(&task, &[]).await?; + task.info.set_start_ts(42); + // Init task should set the checkpoint. 
+ cli.init_task(&task.info).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; + assert_eq!(progress, 42); + cli.set_local_task_checkpoint(&task.info.name, 43).await?; + + // Init task again shouldn't roll back checkpoint. + cli.init_task(&task.info).await?; + let progress = cli.global_progress_of_task(&task.info.name).await?; + assert_eq!(progress, 43); + + Ok(()) +} + +#[tokio::test] +async fn test_region_checkpoint() -> Result<()> { + let cli = test_meta_cli(); + let task = simple_task("simple_2"); + cli.insert_task_with_range(&task, &[]).await?; + let cps = [ + (fake_region(1), TimeStamp::new(42)), + (fake_region(2), TimeStamp::new(64)), + ]; + cli.upload_region_checkpoint("simple_2", &cps).await?; + cli.set_local_task_checkpoint("simple_2", 50).await?; + + let rcp = cli + .get_region_checkpoint("simple_2", &fake_region(1)) + .await?; + assert_eq!( + rcp, + Checkpoint { + provider: CheckpointProvider::Region { id: 1, version: 0 }, + ts: TimeStamp::new(42) + } + ); + let gcp = cli + .get_region_checkpoint("simple_2", &fake_region(3)) + .await?; + assert_eq!( + gcp, + Checkpoint { + provider: CheckpointProvider::Store(42), + ts: TimeStamp::new(50) + } + ); + cli.clear_region_checkpoint("simple_2", &[fake_region(1)]) + .await?; + let rcp = cli + .get_region_checkpoint("simple_2", &fake_region(2)) + .await?; + assert_eq!( + rcp, + Checkpoint { + provider: CheckpointProvider::Region { id: 2, version: 0 }, + ts: TimeStamp::new(64) + } + ); + let gcp = cli + .get_region_checkpoint("simple_2", &fake_region(1)) + .await?; + assert_eq!( + gcp, + Checkpoint { + provider: CheckpointProvider::Store(42), + ts: TimeStamp::new(50) + } + ); Ok(()) } diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index 8ac5b30b000..a27dd1ea33b 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -25,6 +25,10 @@ pub fn update_task_status(status: TaskStatus, task: &str) { } } 
+pub fn remove_task_status_metric(task: &str) -> Result<()> { + TASK_STATUS.remove_label_values(&[task]) +} + lazy_static! { pub static ref INTERNAL_ACTOR_MESSAGE_HANDLE_DURATION: HistogramVec = register_histogram_vec!( "tikv_log_backup_interal_actor_acting_duration_sec", @@ -63,13 +67,13 @@ lazy_static! { "The total kv size skipped by the streaming", ) .unwrap(); - pub static ref STREAM_ERROR: CounterVec = register_counter_vec!( + pub static ref STREAM_ERROR: IntCounterVec = register_int_counter_vec!( "tikv_stream_errors", "The errors during stream backup.", &["type"] ) .unwrap(); - pub static ref STREAM_FATAL_ERROR: CounterVec = register_counter_vec!( + pub static ref STREAM_FATAL_ERROR: IntCounterVec = register_int_counter_vec!( "tikv_log_backup_fatal_errors", "The errors during stream backup.", &["type"] @@ -129,10 +133,9 @@ lazy_static! { "When gt 0, this node enabled streaming." ) .unwrap(); - pub static ref TRACK_REGION: IntCounterVec = register_int_counter_vec!( + pub static ref TRACK_REGION: IntGauge = register_int_gauge!( "tikv_stream_observed_region", "the region being observed by the current store.", - &["type"], ) .unwrap(); static ref TASK_STATUS: IntGaugeVec = register_int_gauge_vec!( @@ -141,4 +144,10 @@ lazy_static! 
{ &["task"] ) .unwrap(); + pub static ref PENDING_INITIAL_SCAN_LEN: IntGaugeVec = register_int_gauge_vec!( + "pending_initial_scan", + "The pending initial scan", + &["stage"] + ) + .unwrap(); } diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 02c63f62a60..ad22b67e145 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -71,7 +71,6 @@ impl BackupStreamObserver { .scheduler .schedule(Task::ModifyObserve(ObserveOp::Start { region: region.clone(), - needs_initial_scanning: true, })) { use crate::errors::Error; @@ -137,7 +136,6 @@ impl CmdObserver for BackupStreamObserver { self.scheduler, Task::ModifyObserve(ObserveOp::Start { region: region.clone(), - needs_initial_scanning: true, }) ); if success { @@ -174,7 +172,7 @@ impl RegionChangeObserver for BackupStreamObserver { RegionChangeEvent::Destroy => { try_send!( self.scheduler, - Task::ModifyObserve(ObserveOp::CheckEpochAndStop { + Task::ModifyObserve(ObserveOp::Destroy { region: ctx.region().clone(), }) ); diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 8db9244d916..dec4baeae89 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -42,7 +42,7 @@ use tokio::{ sync::{Mutex, RwLock}, }; use tokio_util::compat::TokioAsyncReadCompatExt; -use txn_types::{Key, Lock, TimeStamp}; +use txn_types::{Key, Lock, TimeStamp, WriteRef}; use super::errors::Result; use crate::{ @@ -56,8 +56,12 @@ use crate::{ utils::{self, SegmentMap, Slot, SlotMap, StopWatch}, }; -pub const FLUSH_STORAGE_INTERVAL: u64 = 300; -pub const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 16; +const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 30; + +/// FLUSH_LOG_CONCURRENT_BATCH_COUNT specifies the concurrent count to write to storage. 
+/// 'Log backup' will produce a large mount of small files during flush interval, +/// and storage could take mistaken if writing all of these files to storage concurrently. +const FLUSH_LOG_CONCURRENT_BATCH_COUNT: usize = 128; #[derive(Debug)] pub struct ApplyEvent { @@ -476,7 +480,6 @@ impl RouterInner { let result = task_info.do_flush(store_id, resolve_to).await; // set false to flushing whether success or fail task_info.set_flushing_status(false); - task_info.update_flush_time(); if let Err(e) = result { e.report("failed to flush task."); @@ -490,6 +493,8 @@ impl RouterInner { } return None; } + // if succeed in flushing, update flush_time. Or retry do_flush immediately. + task_info.update_flush_time(); result.ok().flatten() } _ => None, @@ -601,20 +606,24 @@ impl TempFileKey { return dt.format("%Y%m%d"); } + /// path_to_log_file specifies the path of record log. + /// eg. "v1/20220625/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" fn path_to_log_file(&self, min_ts: u64, max_ts: u64) -> String { format!( - "v1/t{:08}/{}-{:012}-{}.log", - self.table_id, + "v1/{}/t{:08}/{:012}-{}.log", // We may delete a range of files, so using the max_ts for preventing remove some records wrong. Self::format_date_time(max_ts), + self.table_id, min_ts, uuid::Uuid::new_v4() ) } + /// path_to_schema_file specifies the path of schema log. + /// eg. "v1/20220625/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" fn path_to_schema_file(min_ts: u64, max_ts: u64) -> String { format!( - "v1/schema-meta/{}-{:012}-{}.log", + "v1/{}/schema-meta/{:012}-{}.log", Self::format_date_time(max_ts), min_ts, uuid::Uuid::new_v4(), @@ -658,6 +667,20 @@ pub struct StreamTaskInfo { flush_fail_count: AtomicUsize, } +impl Drop for StreamTaskInfo { + fn drop(&mut self) { + let (success, failed): (Vec<_>, Vec<_>) = self + .flushing_files + .get_mut() + .drain(..) 
+ .chain(self.files.get_mut().drain()) + .map(|(_, f)| f.into_inner().local_path) + .map(std::fs::remove_file) + .partition(|r| r.is_ok()); + info!("stream task info dropped, removing temp files"; "success" => %success.len(), "failure" => %failed.len()) + } +} + impl std::fmt::Debug for StreamTaskInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("StreamTaskInfo") @@ -803,6 +826,11 @@ impl StreamTaskInfo { /// move need-flushing files to flushing_files. pub async fn move_to_flushing_files(&self) -> &Self { + // if flushing_files is not empty, which represents this flush is a retry operation. + if !self.flushing_files.read().await.is_empty() { + return self; + } + let mut w = self.files.write().await; let mut fw = self.flushing_files.write().await; for (k, v) in w.drain() { @@ -863,10 +891,14 @@ impl StreamTaskInfo { // if failed to write storage, we should retry write flushing_files. let storage = self.storage.clone(); let files = self.flushing_files.write().await; - let futs = files - .iter() - .map(|(_, v)| Self::flush_log_file_to(storage.clone(), v)); - futures::future::try_join_all(futs).await?; + + for batch_files in files.chunks(FLUSH_LOG_CONCURRENT_BATCH_COUNT) { + let futs = batch_files + .iter() + .map(|(_, v)| Self::flush_log_file_to(storage.clone(), v)); + futures::future::try_join_all(futs).await?; + } + Ok(()) } @@ -967,6 +999,7 @@ struct DataFile { min_ts: TimeStamp, max_ts: TimeStamp, resolved_ts: TimeStamp, + min_begin_ts: Option, sha256: Hasher, inner: BufWriter, start_key: Vec, @@ -981,6 +1014,8 @@ struct DataFile { pub struct MetadataInfo { pub files: Vec, pub min_resolved_ts: Option, + pub min_ts: Option, + pub max_ts: Option, pub store_id: u64, } @@ -989,6 +1024,8 @@ impl MetadataInfo { Self { files: Vec::with_capacity(cap), min_resolved_ts: None, + min_ts: None, + max_ts: None, store_id: 0, } } @@ -1000,6 +1037,12 @@ impl MetadataInfo { fn push(&mut self, file: DataFileInfo) { let rts = file.resolved_ts; 
self.min_resolved_ts = self.min_resolved_ts.map_or(Some(rts), |r| Some(r.min(rts))); + self.min_ts = self + .min_ts + .map_or(Some(file.min_ts), |ts| Some(ts.min(file.min_ts))); + self.max_ts = self + .max_ts + .map_or(Some(file.max_ts), |ts| Some(ts.max(file.max_ts))); self.files.push(file); } @@ -1007,7 +1050,9 @@ impl MetadataInfo { let mut metadata = Metadata::new(); metadata.set_files(self.files.into()); metadata.set_store_id(self.store_id as _); - metadata.set_resolved_ts(self.min_resolved_ts.unwrap_or_default() as _); + metadata.set_resolved_ts(self.min_resolved_ts.unwrap_or_default()); + metadata.set_min_ts(self.min_ts.unwrap_or(0)); + metadata.set_max_ts(self.max_ts.unwrap_or(0)); metadata .write_to_bytes() @@ -1033,6 +1078,7 @@ impl DataFile { min_ts: TimeStamp::max(), max_ts: TimeStamp::zero(), resolved_ts: TimeStamp::zero(), + min_begin_ts: None, inner: BufWriter::with_capacity(128 * 1024, File::create(local_path.as_ref()).await?), sha256, number_of_entries: 0, @@ -1048,10 +1094,23 @@ impl DataFile { remove_file(&self.local_path).await } + fn decode_begin_ts(value: Vec) -> Result { + WriteRef::parse(&value).map_or_else( + |e| { + Err(Error::Other(box_err!( + "failed to parse write cf value: {}", + e + ))) + }, + |w| Ok(w.start_ts), + ) + } + /// Add a new KV pair to the file, returning its size. async fn on_events(&mut self, events: ApplyEvents) -> Result { let now = Instant::now_coarse(); let mut total_size = 0; + for mut event in events.events { let encoded = EventEncoder::encode_event(&event.key, &event.value); let mut size = 0; @@ -1069,6 +1128,13 @@ impl DataFile { self.min_ts = self.min_ts.min(ts); self.max_ts = self.max_ts.max(ts); self.resolved_ts = self.resolved_ts.max(events.region_resolved_ts.into()); + + // decode_begin_ts is used to maintain the txn when restore log. + // if value is empty, no need to decode begin_ts. 
+ if event.cf == CF_WRITE && !event.value.is_empty() { + let begin_ts = Self::decode_begin_ts(event.value)?; + self.min_begin_ts = Some(self.min_begin_ts.map_or(begin_ts, |ts| ts.min(begin_ts))); + } self.number_of_entries += 1; self.file_size += size; self.update_key_bound(key.into_encoded()); @@ -1117,6 +1183,10 @@ impl DataFile { meta.set_max_ts(self.max_ts.into_inner() as _); meta.set_min_ts(self.min_ts.into_inner() as _); meta.set_resolved_ts(self.resolved_ts.into_inner() as _); + meta.set_min_begin_ts_in_default_cf( + self.min_begin_ts + .map_or(self.min_ts.into_inner(), |ts| ts.into_inner()), + ); meta.set_start_key(std::mem::take(&mut self.start_key)); meta.set_end_key(std::mem::take(&mut self.end_key)); meta.set_length(self.file_size as _); @@ -1161,6 +1231,7 @@ mod tests { codec::number::NumberEncoder, worker::{dummy_scheduler, ReceiverWrapper}, }; + use txn_types::{Write, WriteType}; use super::*; use crate::utils; @@ -1181,6 +1252,12 @@ mod tests { table_key } + fn make_value(t: WriteType, value: &[u8], start_ts: u64) -> Vec { + let start_ts = TimeStamp::new(start_ts); + let w = Write::new(t, start_ts, Some(value.to_vec())); + w.as_ref().to_bytes() + } + impl KvEventsBuilder { fn new(region_id: u64, region_resolved_ts: u64) -> Self { Self { @@ -1219,9 +1296,14 @@ mod tests { }) } - fn put_table(&mut self, cf: &'static str, table: i64, key: &[u8], value: &[u8]) { + fn put_table(&mut self, cf: CfName, table: i64, key: &[u8], value: &[u8]) { let table_key = make_table_key(table, key); - self.put_event(cf, table_key, value.to_vec()); + let value = if cf == CF_WRITE { + make_value(WriteType::Put, value, 12345) + } else { + value.to_vec() + }; + self.put_event(cf, table_key, value); } fn delete_table(&mut self, cf: &'static str, table: i64, key: &[u8]) { @@ -1229,7 +1311,7 @@ mod tests { self.delete_event(cf, table_key); } - fn flush_events(&mut self) -> ApplyEvents { + fn finish(&mut self) -> ApplyEvents { let region_id = self.events.region_id; let 
region_resolved_ts = self.events.region_resolved_ts; std::mem::replace( @@ -1326,15 +1408,7 @@ mod tests { } } - #[tokio::test] - async fn test_basic_file() -> Result<()> { - let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); - tokio::fs::create_dir_all(&tmp).await?; - let (tx, rx) = dummy_scheduler(); - let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); - let (stream_task, storage_path) = task("dummy".to_owned()).await?; - must_register_table(&router, stream_task, 1).await; - + async fn write_simple_data(router: &RouterInner) -> u64 { let now = TimeStamp::physical_now(); let mut region1 = KvEventsBuilder::new(1, now); let start_ts = TimeStamp::physical_now(); @@ -1345,8 +1419,21 @@ mod tests { region1.put_table(CF_WRITE, 2, b"hello", b"this isn't a write record :3"); region1.put_table(CF_WRITE, 1, b"hello", b"still isn't a write record :3"); region1.delete_table(CF_DEFAULT, 1, b"hello"); - let events = region1.flush_events(); + let events = region1.finish(); check_on_events_result(&router.on_events(events).await); + start_ts + } + + #[tokio::test] + async fn test_basic_file() -> Result<()> { + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + tokio::fs::create_dir_all(&tmp).await?; + let (tx, rx) = dummy_scheduler(); + let router = RouterInner::new(tmp.clone(), tx, 32, Duration::from_secs(300)); + let (stream_task, storage_path) = task("dummy".to_owned()).await?; + must_register_table(&router, stream_task, 1).await; + + let start_ts = write_simple_data(&router).await; tokio::time::sleep(Duration::from_millis(200)).await; let end_ts = TimeStamp::physical_now(); @@ -1405,6 +1492,58 @@ mod tests { Ok(()) } + fn mock_build_kv_events(table_id: i64, region_id: u64, resolved_ts: u64) -> ApplyEvents { + let mut events_builder = KvEventsBuilder::new(region_id, resolved_ts); + events_builder.put_table("default", table_id, b"hello", b"world"); + events_builder.finish() + } + + #[tokio::test] + async 
fn test_do_flush() { + let tmp_dir = tempfile::tempdir().unwrap(); + let backend = external_storage_export::make_local_backend(tmp_dir.path()); + let mut task_info = StreamBackupTaskInfo::default(); + task_info.set_storage(backend); + let stream_task = StreamTask { + info: task_info, + is_paused: false, + }; + let task = StreamTaskInfo::new( + tmp_dir.path().to_path_buf(), + stream_task, + Duration::from_secs(300), + ) + .await + .unwrap(); + + // on_event + let region_count = FLUSH_LOG_CONCURRENT_BATCH_COUNT + 5; + for i in 1..=region_count { + let kv_events = mock_build_kv_events(i as _, i as _, i as _); + task.on_events(kv_events).await.unwrap(); + } + // do_flush + task.set_flushing_status(true); + task.do_flush(1, TimeStamp::new(1)).await.unwrap(); + assert_eq!(task.flush_failure_count(), 0); + assert_eq!(task.files.read().await.is_empty(), true); + assert_eq!(task.flushing_files.read().await.is_empty(), true); + + // assert backup log files + let mut meta_count = 0; + let mut log_count = 0; + for entry in walkdir::WalkDir::new(tmp_dir.path()) { + let entry = entry.unwrap(); + if entry.path().extension() == Some(OsStr::new("meta")) { + meta_count += 1; + } else if entry.path().extension() == Some(OsStr::new("log")) { + log_count += 1; + } + } + assert_eq!(meta_count, 1); + assert_eq!(log_count, region_count); + } + struct ErrorStorage { inner: Inner, error_on_write: Box io::Result<()> + Send + Sync>, @@ -1507,8 +1646,12 @@ mod tests { .is_none() ); check_on_events_result(&router.on_events(build_kv_event(10, 10)).await); - let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; let t = router.get_task_info("error_prone").await.unwrap(); + let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; + assert_eq!(t.total_size() > 0, true); + + t.set_flushing_status(true); + let _ = router.do_flush("error_prone", 42, TimeStamp::max()).await; assert_eq!(t.total_size(), 0); Ok(()) } @@ -1539,6 +1682,47 @@ mod tests { assert_eq!(ts.into_inner(), 
rts); } + #[tokio::test] + async fn test_cleanup_when_stop() -> Result<()> { + let (tx, _rx) = dummy_scheduler(); + let tmp = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let router = Arc::new(RouterInner::new( + tmp.clone(), + tx, + 1, + Duration::from_secs(300), + )); + let (task, _path) = task("cleanup_test".to_owned()).await?; + must_register_table(&router, task, 1).await; + write_simple_data(&router).await; + router + .get_task_info("cleanup_test") + .await? + .move_to_flushing_files() + .await; + write_simple_data(&router).await; + let mut w = walkdir::WalkDir::new(&tmp).into_iter(); + assert!(w.next().is_some(), "the temp files doesn't created"); + drop(router); + let w = walkdir::WalkDir::new(&tmp) + .into_iter() + .filter_map(|entry| { + let e = entry.unwrap(); + e.path() + .extension() + .filter(|x| x.to_string_lossy() == "log") + .map(|_| e.clone()) + }) + .collect::>(); + + assert!( + w.is_empty(), + "the temp files should be removed, but it is {:?}", + w + ); + Ok(()) + } + #[tokio::test] async fn test_flush_with_pausing_self() -> Result<()> { let (tx, rx) = dummy_scheduler(); @@ -1556,8 +1740,8 @@ mod tests { i.storage = Arc::new(ErrorStorage::with_always_error(i.storage.clone())) }) .await; - for i in 0..=16 { - check_on_events_result(&router.on_events(build_kv_event(i * 10, 10)).await); + for i in 0..=FLUSH_FAILURE_BECOME_FATAL_THRESHOLD { + check_on_events_result(&router.on_events(build_kv_event((i * 10) as _, 10)).await); assert_eq!( router .do_flush("flush_failure", 42, TimeStamp::zero()) @@ -1585,4 +1769,14 @@ mod tests { let s = s.to_string(); assert_eq!(s, "20220307"); } + + #[test] + fn test_decode_begin_ts() { + let start_ts = TimeStamp::new(12345678); + let w = Write::new(WriteType::Put, start_ts, Some(b"short_value".to_vec())); + let value = w.as_ref().to_bytes(); + + let begin_ts = DataFile::decode_begin_ts(value).unwrap(); + assert_eq!(begin_ts, start_ts); + } } diff --git a/components/backup-stream/src/service.rs 
b/components/backup-stream/src/service.rs new file mode 100644 index 00000000000..47a149973b2 --- /dev/null +++ b/components/backup-stream/src/service.rs @@ -0,0 +1,92 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::collections::HashSet; + +use grpcio::RpcContext; +use kvproto::{logbackuppb::*, metapb::Region}; +use tikv_util::{warn, worker::Scheduler}; + +use crate::{ + checkpoint_manager::{GetCheckpointResult, RegionIdWithVersion}, + endpoint::{RegionCheckpointOperation, RegionSet}, + try_send, Task, +}; + +#[derive(Clone)] +pub struct Service { + endpoint: Scheduler, +} + +impl Service { + pub fn new(endpoint: Scheduler) -> Self { + Self { endpoint } + } +} + +fn id_of(region: &Region) -> RegionIdentity { + let mut id = RegionIdentity::new(); + id.set_id(region.get_id()); + id.set_epoch_version(region.get_region_epoch().get_version()); + id +} + +impl From for RegionIdentity { + fn from(val: RegionIdWithVersion) -> Self { + let mut id = RegionIdentity::new(); + id.set_id(val.region_id); + id.set_epoch_version(val.region_epoch_version); + id + } +} + +impl LogBackup for Service { + fn get_last_flush_ts_of_region( + &mut self, + _ctx: RpcContext<'_>, + mut req: GetLastFlushTsOfRegionRequest, + sink: grpcio::UnarySink, + ) { + let regions = req + .take_regions() + .into_iter() + .map(|id| (id.id, id.epoch_version)) + .collect::>(); + let t = Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Regions(regions), + Box::new(move |rs| { + let mut resp = GetLastFlushTsOfRegionResponse::new(); + resp.set_checkpoints( + rs.into_iter() + .map(|r| match r { + GetCheckpointResult::Ok { region, checkpoint } => { + let mut r = RegionCheckpoint::new(); + let id = id_of(®ion); + r.set_region(id); + r.set_checkpoint(checkpoint.into_inner()); + r + } + GetCheckpointResult::NotFound { id, err } => { + let mut r = RegionCheckpoint::new(); + r.set_region(id.into()); + r.set_err(err); + r + } + GetCheckpointResult::EpochNotMatch { 
region, err } => { + let mut r = RegionCheckpoint::new(); + r.set_region(id_of(®ion)); + r.set_err(err); + r + } + }) + .collect(), + ); + tokio::spawn(async { + if let Err(e) = sink.success(resp).await { + warn!("failed to reply grpc resonse."; "err" => %e) + } + }); + }), + )); + try_send!(self.endpoint, t); + } +} diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs new file mode 100644 index 00000000000..fc4f0e2d4a7 --- /dev/null +++ b/components/backup-stream/src/subscription_manager.rs @@ -0,0 +1,650 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + time::Duration, +}; + +use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; +use crossbeam_channel::SendError; +use engine_traits::KvEngine; +use error_code::{backup_stream::OBSERVE_CANCELED, ErrorCodeExt}; +use futures::FutureExt; +use kvproto::metapb::Region; +use pd_client::PdClient; +use raft::StateRole; +use raftstore::{ + coprocessor::{ObserveHandle, RegionInfoProvider}, + router::RaftStoreRouter, + store::fsm::ChangeObserver, +}; +use tikv::storage::Statistics; +use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; +use tokio::sync::mpsc::{channel, Receiver, Sender}; +use txn_types::TimeStamp; +use yatp::task::callback::Handle as YatpHandle; + +use crate::{ + annotate, + endpoint::ObserveOp, + errors::{Error, Result}, + event_loader::InitialDataLoader, + future, + metadata::{store::MetaStore, CheckpointProvider, MetadataClient}, + metrics, + observer::BackupStreamObserver, + router::Router, + subscription_track::SubscriptionTracer, + try_send, + utils::{self, CallbackWaitGroup, Work}, + Task, +}; + +type ScanPool = yatp::ThreadPool; + +/// a request for doing initial scanning. 
+struct ScanCmd { + region: Region, + handle: ObserveHandle, + last_checkpoint: TimeStamp, + work: Work, +} + +/// The response of requesting resolve the new checkpoint of regions. +pub struct ResolvedRegions { + items: Vec<(Region, TimeStamp)>, + checkpoint: TimeStamp, +} + +impl ResolvedRegions { + /// compose the calculated global checkpoint and region checkpoints. + /// note: maybe we can compute the global checkpoint internal and getting the interface clear. + /// however we must take the `min_ts` or we cannot provide valid global checkpoint if there + /// isn't any region checkpoint. + pub fn new(checkpoint: TimeStamp, checkpoints: Vec<(Region, TimeStamp)>) -> Self { + Self { + items: checkpoints, + checkpoint, + } + } + + /// take the region checkpoints from the structure. + pub fn take_region_checkpoints(&mut self) -> Vec<(Region, TimeStamp)> { + std::mem::take(&mut self.items) + } + + /// get the global checkpoint. + pub fn global_checkpoint(&self) -> TimeStamp { + self.checkpoint + } +} + +/// the abstraction over a "DB" which provides the initial scanning. +trait InitialScan: Clone { + fn do_initial_scan( + &self, + region: &Region, + start_ts: TimeStamp, + handle: ObserveHandle, + on_finish: impl FnOnce() + Send + 'static, + ) -> Result; +} + +impl InitialScan for InitialDataLoader +where + E: KvEngine, + R: RegionInfoProvider + Clone + 'static, + RT: RaftStoreRouter, +{ + fn do_initial_scan( + &self, + region: &Region, + start_ts: TimeStamp, + handle: ObserveHandle, + on_finish: impl FnOnce() + Send + 'static, + ) -> Result { + let region_id = region.get_id(); + let snap = self.observe_over_with_retry(region, move || { + ChangeObserver::from_pitr(region_id, handle.clone()) + })?; + let stat = self.do_initial_scan(region, start_ts, snap, on_finish)?; + Ok(stat) + } +} + +impl ScanCmd { + /// execute the initial scanning via the specificated [`InitialDataLoader`]. 
+ fn exec_by(self, initial_scan: impl InitialScan) -> Result<()> { + let Self { + region, + handle, + last_checkpoint, + work, + } = self; + let begin = Instant::now_coarse(); + let stat = + initial_scan.do_initial_scan(®ion, last_checkpoint, handle, move || drop(work))?; + info!("initial scanning of leader transforming finished!"; "takes" => ?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); + utils::record_cf_stat("lock", &stat.lock); + utils::record_cf_stat("write", &stat.write); + utils::record_cf_stat("default", &stat.data); + Ok(()) + } +} + +fn scan_executor_loop( + init: impl InitialScan, + cmds: SyncReceiver, + canceled: Arc, +) { + while let Ok(cmd) = cmds.recv() { + #[cfg(feature = "failpoints")] + fail::fail_point!("execute_scan_command"); + debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["queuing"]) + .dec(); + if canceled.load(Ordering::Acquire) { + return; + } + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .inc(); + let region_id = cmd.region.get_id(); + if let Err(err) = cmd.exec_by(init.clone()) { + if err.error_code() != OBSERVE_CANCELED { + err.report(format!("during initial scanning of region {}", region_id)); + } + } + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["executing"]) + .dec(); + } +} + +/// spawn the executors in the scan pool. +/// we make workers thread instead of spawn scan task directly into the pool because the [`InitialDataLoader`] isn't `Sync` hence +/// we must use it very carefully or rustc (along with tokio) would complain that we made a `!Send` future. +/// so we have moved the data loader to the synchronous context so its reference won't be shared between threads any more. 
+fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { + let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); + let pool = create_scan_pool(number); + let stopped = Arc::new(AtomicBool::new(false)); + for _ in 0..number { + let init = init.clone(); + let rx = rx.clone(); + let stopped = stopped.clone(); + pool.spawn(move |_: &mut YatpHandle<'_>| { + tikv_alloc::add_thread_memory_accessor(); + let _io_guard = file_system::WithIOType::new(file_system::IOType::Replication); + scan_executor_loop(init, rx, stopped); + tikv_alloc::remove_thread_memory_accessor(); + }) + } + ScanPoolHandle { + tx, + _pool: pool, + stopped, + } +} + +struct ScanPoolHandle { + tx: SyncSender, + stopped: Arc, + + // in fact, we won't use the pool any more. + // but we should hold the reference to the pool so it won't try to join the threads running. + _pool: ScanPool, +} + +impl Drop for ScanPoolHandle { + fn drop(&mut self) { + self.stopped.store(true, Ordering::Release); + } +} + +impl ScanPoolHandle { + fn request(&self, cmd: ScanCmd) -> std::result::Result<(), SendError> { + if self.stopped.load(Ordering::Acquire) { + warn!("scan pool is stopped, ignore the scan command"; "region" => %cmd.region.get_id()); + return Ok(()); + } + metrics::PENDING_INITIAL_SCAN_LEN + .with_label_values(&["queuing"]) + .inc(); + self.tx.send(cmd) + } +} + +/// The default channel size. +const MESSAGE_BUFFER_SIZE: usize = 4096; + +/// The operator for region subscription. +/// It make a queue for operations over the `SubscriptionTracer`, generally, +/// we should only modify the `SubscriptionTracer` itself (i.e. insert records, remove records) at here. +/// So the order subscription / desubscription won't be broken. +pub struct RegionSubscriptionManager { + // Note: these fields appear everywhere, maybe make them a `context` type? 
+ regions: R, + meta_cli: MetadataClient, + pd_client: Arc, + range_router: Router, + scheduler: Scheduler, + observer: BackupStreamObserver, + subs: SubscriptionTracer, + + messenger: Sender, + scan_pool_handle: Arc, + scans: Arc, +} + +impl Clone for RegionSubscriptionManager +where + S: MetaStore + 'static, + R: RegionInfoProvider + Clone + 'static, + PDC: PdClient + 'static, +{ + fn clone(&self) -> Self { + Self { + regions: self.regions.clone(), + meta_cli: self.meta_cli.clone(), + // We should manually call Arc::clone here or rustc complains that `PDC` isn't `Clone`. + pd_client: Arc::clone(&self.pd_client), + range_router: self.range_router.clone(), + scheduler: self.scheduler.clone(), + observer: self.observer.clone(), + subs: self.subs.clone(), + messenger: self.messenger.clone(), + scan_pool_handle: self.scan_pool_handle.clone(), + scans: CallbackWaitGroup::new(), + } + } +} + +/// Create a yatp pool for doing initial scanning. +fn create_scan_pool(num_threads: usize) -> ScanPool { + yatp::Builder::new("log-backup-scan") + .max_thread_count(num_threads) + .build_callback_pool() +} + +impl RegionSubscriptionManager +where + S: MetaStore + 'static, + R: RegionInfoProvider + Clone + 'static, + PDC: PdClient + 'static, +{ + /// create a [`RegionSubscriptionManager`]. + /// + /// # returns + /// + /// a two-tuple, the first is the handle to the manager, the second is the operator loop future. 
+ pub fn start( + initial_loader: InitialDataLoader, + observer: BackupStreamObserver, + meta_cli: MetadataClient, + pd_client: Arc, + scan_pool_size: usize, + ) -> (Self, future![()]) + where + E: KvEngine, + RT: RaftStoreRouter + 'static, + { + let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); + let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); + let op = Self { + regions: initial_loader.regions.clone(), + meta_cli, + pd_client, + range_router: initial_loader.sink.clone(), + scheduler: initial_loader.scheduler.clone(), + observer, + subs: initial_loader.tracing, + messenger: tx, + scan_pool_handle: Arc::new(scan_pool_handle), + scans: CallbackWaitGroup::new(), + }; + let fut = op.clone().region_operator_loop(rx); + (op, fut) + } + + /// send an operation request to the manager. + /// the returned future would be resolved after send is success. + /// the opeartion would be executed asynchronously. + pub async fn request(&self, op: ObserveOp) { + if let Err(err) = self.messenger.send(op).await { + annotate!(err, "BUG: region operator channel closed.") + .report("when executing region op"); + } + } + + /// wait initial scanning get finished. + pub fn wait(&self, timeout: Duration) -> future![bool] { + tokio::time::timeout(timeout, self.scans.wait()).map(|result| result.is_err()) + } + + /// the handler loop. 
+ async fn region_operator_loop(self, mut message_box: Receiver) { + while let Some(op) = message_box.recv().await { + info!("backup stream: on_modify_observe"; "op" => ?op); + match op { + ObserveOp::Start { region } => { + #[cfg(feature = "failpoints")] + fail::fail_point!("delay_on_start_observe"); + self.start_observe(region).await; + metrics::INITIAL_SCAN_REASON + .with_label_values(&["leader-changed"]) + .inc(); + crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_sub(1, Ordering::SeqCst); + } + ObserveOp::Stop { ref region } => { + self.subs.deregister_region_if(region, |_, _| true); + } + ObserveOp::Destroy { ref region } => { + let stopped = self.subs.deregister_region_if(region, |old, new| { + raftstore::store::util::compare_region_epoch( + old.meta.get_region_epoch(), + new, + true, + true, + false, + ) + .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) + .is_ok() + }); + if stopped { + self.subs.destroy_stopped_region(region.get_id()); + } + } + ObserveOp::RefreshResolver { ref region } => self.refresh_resolver(region).await, + ObserveOp::NotifyFailToStartObserve { + region, + handle, + err, + } => { + info!("retry observe region"; "region" => %region.get_id(), "err" => %err); + // No need for retrying observe canceled. 
+ if err.error_code() == error_code::backup_stream::OBSERVE_CANCELED { + return; + } + match self.retry_observe(region, handle).await { + Ok(()) => {} + Err(e) => { + self.fatal( + e, + format!("While retring to observe region, origin error is {}", err), + ); + } + } + } + ObserveOp::ResolveRegions { callback, min_ts } => { + let now = Instant::now(); + let timedout = self.wait(Duration::from_secs(30)).await; + if timedout { + warn!("waiting for initial scanning done timed out, forcing progress(with risk of data loss)!"; + "take" => ?now.saturating_elapsed(), "timedout" => %timedout); + } + let cps = self.subs.resolve_with(min_ts); + let min_region = cps.iter().min_by_key(|(_, rts)| rts); + // If there isn't any region observed, the `min_ts` can be used as resolved ts safely. + let rts = min_region.map(|(_, rts)| *rts).unwrap_or(min_ts); + info!("getting checkpoint"; "defined_by_region" => ?min_region.map(|r| r.0.get_id()), "checkpoint" => %rts); + self.subs.warn_if_gap_too_huge(rts); + callback(ResolvedRegions::new(rts, cps)); + } + } + } + } + + fn fatal(&self, err: Error, message: String) { + try_send!(self.scheduler, Task::FatalError(message, Box::new(err))); + } + + async fn refresh_resolver(&self, region: &Region) { + let need_refresh_all = !self.subs.try_update_region(region); + + if need_refresh_all { + let canceled = self.subs.deregister_region_if(region, |_, _| true); + let handle = ObserveHandle::new(); + if canceled { + let for_task = self.find_task_by_region(region).unwrap_or_else(|| { + panic!( + "BUG: the region {:?} is register to no task but being observed", + region + ) + }); + metrics::INITIAL_SCAN_REASON + .with_label_values(&["region-changed"]) + .inc(); + let r = async { + self.observe_over_with_initial_data_from_checkpoint( + region, + self.get_last_checkpoint_of(&for_task, region).await?, + handle.clone(), + ); + Result::Ok(()) + } + .await; + if let Err(e) = r { + try_send!( + self.scheduler, + 
Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region: region.clone(), + handle, + err: Box::new(e) + }) + ); + } + } + } + } + + async fn try_start_observe(&self, region: &Region, handle: ObserveHandle) -> Result<()> { + match self.find_task_by_region(region) { + None => { + warn!( + "the region {:?} is register to no task but being observed (start_key = {}; end_key = {}; task_stat = {:?}): maybe stale, aborting", + region, + utils::redact(®ion.get_start_key()), + utils::redact(®ion.get_end_key()), + self.range_router + ); + } + + Some(for_task) => { + #[cfg(feature = "failpoints")] + fail::fail_point!("try_start_observe", |_| { + Err(Error::Other(box_err!("Nature is boring"))) + }); + let tso = self.get_last_checkpoint_of(&for_task, region).await?; + self.observe_over_with_initial_data_from_checkpoint(region, tso, handle.clone()); + } + } + Ok(()) + } + + async fn start_observe(&self, region: Region) { + let handle = ObserveHandle::new(); + if let Err(err) = self.try_start_observe(®ion, handle.clone()).await { + warn!("failed to start observe, retrying"; "err" => %err); + try_send!( + self.scheduler, + Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region, + handle, + err: Box::new(err) + }) + ); + } + } + + async fn retry_observe(&self, region: Region, handle: ObserveHandle) -> Result<()> { + let (tx, rx) = crossbeam::channel::bounded(1); + self.regions + .find_region_by_id( + region.get_id(), + Box::new(move |item| { + tx.send(item) + .expect("BUG: failed to send to newly created channel."); + }), + ) + .map_err(|err| { + annotate!( + err, + "failed to send request to region info accessor, server maybe too too too busy. 
(region id = {})", + region.get_id() + ) + })?; + let new_region_info = rx + .recv() + .map_err(|err| annotate!(err, "BUG?: unexpected channel message dropped."))?; + if new_region_info.is_none() { + metrics::SKIP_RETRY + .with_label_values(&["region-absent"]) + .inc(); + return Ok(()); + } + let new_region_info = new_region_info.unwrap(); + if new_region_info.role != StateRole::Leader { + metrics::SKIP_RETRY.with_label_values(&["not-leader"]).inc(); + return Ok(()); + } + // Note: we may fail before we insert the region info to the subscription map. + // At that time, the command isn't steal and we should retry it. + let mut exists = false; + let removed = self.subs.deregister_region_if(®ion, |old, _| { + exists = true; + let should_remove = old.handle().id == handle.id; + if !should_remove { + warn!("stale retry command"; "region" => ?region, "handle" => ?handle, "old_handle" => ?old.handle()); + } + should_remove + }); + if !removed && exists { + metrics::SKIP_RETRY + .with_label_values(&["stale-command"]) + .inc(); + return Ok(()); + } + metrics::INITIAL_SCAN_REASON + .with_label_values(&["retry"]) + .inc(); + self.start_observe(region).await; + Ok(()) + } + + async fn get_last_checkpoint_of(&self, task: &str, region: &Region) -> Result { + let meta_cli = self.meta_cli.clone(); + let cp = meta_cli.get_region_checkpoint(task, region).await?; + info!("got region checkpoint"; "region_id" => %region.get_id(), "checkpoint" => ?cp); + if matches!(cp.provider, CheckpointProvider::Global) { + metrics::STORE_CHECKPOINT_TS + .with_label_values(&[task]) + .set(cp.ts.into_inner() as _); + } + Ok(cp.ts) + } + + fn spawn_scan(&self, cmd: ScanCmd) { + // we should not spawn initial scanning tasks to the tokio blocking pool + // because it is also used for converting sync File I/O to async. (for now!) + // In that condition, if we blocking for some resources(for example, the `MemoryQuota`) + // at the block threads, we may meet some ghosty deadlock. 
+ let s = self.scan_pool_handle.request(cmd); + if let Err(err) = s { + let region_id = err.0.region.get_id(); + annotate!(err, "BUG: scan_pool closed") + .report(format!("during initial scanning for region {}", region_id)); + } + } + + fn observe_over_with_initial_data_from_checkpoint( + &self, + region: &Region, + last_checkpoint: TimeStamp, + handle: ObserveHandle, + ) { + self.subs + .register_region(region, handle.clone(), Some(last_checkpoint)); + self.spawn_scan(ScanCmd { + region: region.clone(), + handle, + last_checkpoint, + work: self.scans.clone().work(), + }) + } + + fn find_task_by_region(&self, r: &Region) -> Option { + self.range_router + .find_task_by_range(&r.start_key, &r.end_key) + } +} + +#[cfg(test)] +mod test { + use kvproto::metapb::Region; + use tikv::storage::Statistics; + + use super::InitialScan; + #[cfg(feature = "failpoints")] + use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; + + #[derive(Clone, Copy)] + struct NoopInitialScan; + + impl InitialScan for NoopInitialScan { + fn do_initial_scan( + &self, + _region: &Region, + _start_ts: txn_types::TimeStamp, + _handle: raftstore::coprocessor::ObserveHandle, + on_finish: impl FnOnce() + Send + 'static, + ) -> crate::errors::Result { + on_finish(); + Ok(Statistics::default()) + } + } + + #[cfg(feature = "failpoints")] + fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { + let (tx, rx) = futures::channel::oneshot::channel(); + std::thread::spawn(move || { + f(); + tx.send(()).unwrap(); + }); + let pool = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let _e = pool.handle().enter(); + pool.block_on(tokio::time::timeout(d, rx)).unwrap().unwrap(); + } + + #[test] + #[cfg(feature = "failpoints")] + fn test_message_delay_and_exit() { + use std::time::Duration; + + use super::ScanCmd; + + let pool = spawn_executors(NoopInitialScan, 1); + let wg = CallbackWaitGroup::new(); + 
fail::cfg("execute_scan_command", "sleep(100)").unwrap(); + for _ in 0..100 { + let wg = wg.clone(); + pool.request(ScanCmd { + region: Default::default(), + handle: Default::default(), + last_checkpoint: Default::default(), + // Note: Maybe make here a Box or some other trait? + work: wg.work(), + }) + .unwrap() + } + + should_finish_in(move || drop(pool), Duration::from_secs(5)); + } +} diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 9199f508d62..e8a22f9840e 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -15,10 +15,21 @@ use crate::{debug, metrics::TRACK_REGION, utils}; #[derive(Clone, Default, Debug)] pub struct SubscriptionTracer(Arc>); +#[derive(Debug, Eq, PartialEq, Clone, Copy)] +pub enum SubscriptionState { + /// When it is newly added (maybe after split or leader transfered from other store), without any flush. + Fresh, + /// It has been flushed, and running normally. + Normal, + /// It has been moved to other store. + Removal, +} + pub struct RegionSubscription { pub meta: Region, pub(crate) handle: ObserveHandle, - resolver: TwoPhaseResolver, + pub(crate) resolver: TwoPhaseResolver, + state: SubscriptionState, } impl std::fmt::Debug for RegionSubscription { @@ -31,17 +42,32 @@ impl std::fmt::Debug for RegionSubscription { } impl RegionSubscription { + /// move self out. 
+ fn take(&mut self) -> Self { + Self { + meta: self.meta.clone(), + handle: self.handle.clone(), + resolver: std::mem::replace(&mut self.resolver, TwoPhaseResolver::new(0, None)), + state: self.state, + } + } + pub fn new(region: Region, handle: ObserveHandle, start_ts: Option) -> Self { let resolver = TwoPhaseResolver::new(region.get_id(), start_ts); Self { handle, meta: region, resolver, + state: SubscriptionState::Fresh, } } - pub fn stop_observing(&self) { - self.handle.stop_observing() + pub fn stop(&mut self) { + if self.state == SubscriptionState::Removal { + return; + } + self.handle.stop_observing(); + self.state = SubscriptionState::Removal; } pub fn is_observing(&self) -> bool { @@ -58,22 +84,11 @@ impl RegionSubscription { } impl SubscriptionTracer { - /// get the current safe point: data before this ts have already be flushed and be able to be GCed. - pub fn safepoint(&self) -> TimeStamp { - // use the current resolved_ts is safe because it is only advanced when flushing. - self.0 - .iter() - .map(|r| r.resolver.resolved_ts()) - .min() - // NOTE: Maybe use the current timestamp? - .unwrap_or(TimeStamp::zero()) - } - /// clear the current `SubscriptionTracer`. 
pub fn clear(&self) { self.0.retain(|_, v| { - v.stop_observing(); - TRACK_REGION.with_label_values(&["dec"]).inc(); + v.stop(); + TRACK_REGION.dec(); false }); } @@ -89,25 +104,28 @@ impl SubscriptionTracer { start_ts: Option, ) { info!("start listen stream from store"; "observer" => ?handle, "region_id" => %region.get_id()); - TRACK_REGION.with_label_values(&["inc"]).inc(); - if let Some(o) = self.0.insert( + TRACK_REGION.inc(); + if let Some(mut o) = self.0.insert( region.get_id(), RegionSubscription::new(region.clone(), handle, start_ts), ) { - TRACK_REGION.with_label_values(&["dec"]).inc(); - warn!("register region which is already registered"; "region_id" => %region.get_id()); - o.stop_observing(); + if o.state != SubscriptionState::Removal { + TRACK_REGION.dec(); + warn!("register region which is already registered"; "region_id" => %region.get_id()); + } + o.stop(); } } /// try advance the resolved ts with the min ts of in-memory locks. - pub fn resolve_with(&self, min_ts: TimeStamp) -> TimeStamp { + /// returns the regions and theirs resolved ts. + pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec<(Region, TimeStamp)> { self.0 .iter_mut() - .map(|mut s| s.resolver.resolve(min_ts)) - .min() - // If there isn't any region observed, the `min_ts` can be used as resolved ts safely. - .unwrap_or(min_ts) + // Don't advance the checkpoint ts of removed region. + .filter(|s| s.state != SubscriptionState::Removal) + .map(|mut s| (s.meta.clone(), s.resolver.resolve(min_ts))) + .collect() } #[inline(always)] @@ -130,24 +148,31 @@ impl SubscriptionTracer { } } + /// destroy subscription if the subscription is stopped. + pub fn destroy_stopped_region(&self, region_id: u64) { + self.0 + .remove_if(®ion_id, |_, sub| sub.state == SubscriptionState::Removal); + } + /// try to mark a region no longer be tracked by this observer. /// returns whether success (it failed if the region hasn't been observed when calling this.) 
- pub fn deregister_region( + pub fn deregister_region_if( &self, region: &Region, if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool, ) -> bool { let region_id = region.get_id(); - let remove_result = self - .0 - .remove_if(®ion_id, |_, old_region| if_cond(old_region, region)); + let remove_result = self.0.get_mut(®ion_id); match remove_result { - Some(o) => { - TRACK_REGION.with_label_values(&["dec"]).inc(); - o.1.stop_observing(); - info!("stop listen stream from store"; "observer" => ?o.1, "region_id"=> %region_id); + Some(mut o) if if_cond(o.value(), region) => { + if o.state != SubscriptionState::Removal { + TRACK_REGION.dec(); + } + o.value_mut().stop(); + info!("stop listen stream from store"; "observer" => ?o.value(), "region_id"=> %region_id); true } + Some(_) => false, None => { warn!("trying to deregister region not registered"; "region_id" => %region_id); false @@ -181,22 +206,60 @@ impl SubscriptionTracer { false } + /// Remove and collect the subscriptions have been marked as removed. + pub fn collect_removal_subs(&self) -> Vec { + let mut result = vec![]; + self.0.retain(|_k, v| { + if v.state == SubscriptionState::Removal { + result.push(v.take()); + false + } else { + true + } + }); + result + } + + /// Collect the fresh subscriptions, and mark them as Normal. + pub fn collect_fresh_subs(&self) -> Vec { + self.0 + .iter_mut() + .filter_map(|mut s| { + let v = s.value_mut(); + if v.state == SubscriptionState::Fresh { + v.state = SubscriptionState::Normal; + Some(v.meta.clone()) + } else { + None + } + }) + .collect() + } + + /// Remove all "Removal" entries. + /// Set all "Fresh" entries to "Normal". + pub fn update_status_for_v3(&self) { + self.0.retain(|_k, v| match v.state { + SubscriptionState::Fresh => { + v.state = SubscriptionState::Normal; + true + } + SubscriptionState::Normal => true, + SubscriptionState::Removal => false, + }) + } + /// check whether the region_id should be observed by this observer. 
pub fn is_observing(&self, region_id: u64) -> bool { - let mut exists = false; - - // The region traced, check it whether is still be observing, - // if not, remove it. - let still_observing = self - .0 - // Assuming this closure would be called iff the key exists. - // So we can elide a `contains` check. - .remove_if(®ion_id, |_, o| { - exists = true; - !o.is_observing() - }) - .is_none(); - exists && still_observing + let sub = self.0.get_mut(®ion_id); + match sub { + Some(mut sub) if !sub.is_observing() || sub.state == SubscriptionState::Removal => { + sub.value_mut().stop(); + false + } + Some(_) => true, + None => false, + } } pub fn get_subscription_of( @@ -207,7 +270,7 @@ impl SubscriptionTracer { } } -/// This enhanced version of `Resolver` allow some unorder of lock events. +/// This enhanced version of `Resolver` allow some unordered lock events. /// The name "2-phase" means this is used for 2 *concurrency* phases of observing a region: /// 1. Doing the initial scanning. /// 2. Listening at the incremental data. @@ -216,24 +279,24 @@ impl SubscriptionTracer { /// +->(Start TS Of Task) +->(Task registered to KV) /// +--------------------------------+------------------------> /// ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ ^~~~~~~~~~~~~~~~~~~~~~~~~ -/// | +-> Phase 2: Listening incremtnal data. +/// | +-> Phase 2: Listening incremental data. /// +-> Phase 1: Initial scanning scans writes between start ts and now. /// ``` /// -/// In backup-stream, we execute these two tasks parallelly. Which may make some race conditions: -/// - When doing initial scanning, there may be a flush triggered, but the defult resolver +/// In backup-stream, we execute these two tasks parallel. Which may make some race conditions: +/// - When doing initial scanning, there may be a flush triggered, but the default resolver /// would probably resolved to the tip of incremental events. 
/// - When doing initial scanning, we meet and track a lock already meet by the incremental events, /// then the default resolver cannot untrack this lock any more. /// -/// This version of resolver did some change for solve these problmes: +/// This version of resolver did some change for solve these problems: /// - The resolver won't advance the resolved ts to greater than `stable_ts` if there is some. This /// can help us prevent resolved ts from advancing when initial scanning hasn't finished yet. /// - When we `untrack` a lock haven't been tracked, this would record it, and skip this lock if we want to track it then. /// This would be safe because: /// - untracking a lock not be tracked is no-op for now. /// - tracking a lock have already being untracked (unordered call of `track` and `untrack`) wouldn't happen at phase 2 for same region. -/// but only when phase 1 and phase 2 happend concurrently, at that time, we wouldn't and cannot advance the resolved ts. +/// but only when phase 1 and phase 2 happened concurrently, at that time, we wouldn't and cannot advance the resolved ts. pub struct TwoPhaseResolver { resolver: Resolver, future_locks: Vec, @@ -328,7 +391,18 @@ impl TwoPhaseResolver { for lock in std::mem::take(&mut self.future_locks).into_iter() { self.handle_future_lock(lock); } - self.stable_ts = None + let ts = self.stable_ts.take(); + match ts { + Some(ts) => { + // advance the internal resolver. + // the start ts of initial scanning would be a safe ts for min ts + // -- because is used to be a resolved ts. 
+ self.resolver.resolve(ts); + } + None => { + warn!("BUG: a two-phase resolver is executing phase_one_done when not in phase one"; "resolver" => ?self) + } + } } } @@ -343,9 +417,11 @@ impl std::fmt::Debug for TwoPhaseResolver { #[cfg(test)] mod test { + use kvproto::metapb::{Region, RegionEpoch}; + use raftstore::coprocessor::ObserveHandle; use txn_types::TimeStamp; - use super::TwoPhaseResolver; + use super::{SubscriptionTracer, TwoPhaseResolver}; #[test] fn test_two_phase_resolver() { @@ -372,4 +448,73 @@ mod test { r.untrack_lock(&key[..]); assert_eq!(r.resolve(ts(57)), ts(57)); } + + fn region(id: u64, version: u64, conf_version: u64) -> Region { + let mut r = Region::new(); + let mut e = RegionEpoch::new(); + e.set_version(version); + e.set_conf_ver(conf_version); + r.set_id(id); + r.set_region_epoch(e); + r + } + + #[test] + fn test_delay_remove() { + let subs = SubscriptionTracer::default(); + let handle = ObserveHandle::new(); + subs.register_region(®ion(1, 1, 1), handle, Some(TimeStamp::new(42))); + assert!(subs.get_subscription_of(1).is_some()); + assert!(subs.is_observing(1)); + subs.deregister_region_if(®ion(1, 1, 1), |_, _| true); + assert!(!subs.is_observing(1)); + } + + #[test] + fn test_cal_checkpoint() { + let subs = SubscriptionTracer::default(); + subs.register_region( + ®ion(1, 1, 1), + ObserveHandle::new(), + Some(TimeStamp::new(42)), + ); + subs.register_region(®ion(2, 2, 1), ObserveHandle::new(), None); + subs.register_region( + ®ion(3, 4, 1), + ObserveHandle::new(), + Some(TimeStamp::new(88)), + ); + subs.get_subscription_of(3) + .unwrap() + .resolver + .phase_one_done(); + subs.register_region( + ®ion(4, 8, 1), + ObserveHandle::new(), + Some(TimeStamp::new(92)), + ); + let mut region4_sub = subs.get_subscription_of(4).unwrap(); + region4_sub.resolver.phase_one_done(); + region4_sub + .resolver + .track_lock(TimeStamp::new(128), b"Alpi".to_vec()); + subs.register_region(®ion(5, 8, 1), ObserveHandle::new(), None); + 
subs.deregister_region_if(®ion(5, 8, 1), |_, _| true); + drop(region4_sub); + + let mut rs = subs.resolve_with(TimeStamp::new(1000)); + rs.sort_by_key(|k| k.0.get_id()); + assert_eq!( + rs, + vec![ + (region(1, 1, 1), TimeStamp::new(42)), + (region(2, 2, 1), TimeStamp::new(1000)), + (region(3, 4, 1), TimeStamp::new(1000)), + (region(4, 8, 1), TimeStamp::new(128)), + ] + ); + let removal = subs.collect_removal_subs(); + assert_eq!(removal.len(), 1); + assert_eq!(removal[0].meta.get_id(), 5); + } } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index c104a100b56..725a1c17f51 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -4,21 +4,36 @@ use std::{ borrow::Borrow, collections::{hash_map::RandomState, BTreeMap, HashMap}, ops::{Bound, RangeBounds}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, time::Duration, }; +use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, StreamExt}; +use futures::{channel::mpsc, executor::block_on, FutureExt, StreamExt}; use kvproto::raft_cmdpb::{CmdType, Request}; use raft::StateRole; use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; use tikv::storage::CfStatistics; -use tikv_util::{box_err, time::Instant, warn, worker::Scheduler, Either}; -use tokio::sync::{Mutex, RwLock}; +use tikv_util::{ + box_err, + sys::inspector::{ + self_thread_inspector, IoStat, ThreadInspector, ThreadInspectorImpl as OsInspector, + }, + time::Instant, + warn, + worker::Scheduler, + Either, +}; +use tokio::sync::{oneshot, Mutex, RwLock}; use txn_types::{Key, Lock, LockType}; use crate::{ errors::{Error, Result}, + metadata::store::BoxFuture, Task, }; @@ -401,9 +416,139 @@ pub fn should_track_lock(l: &Lock) -> bool { } } +pub struct CallbackWaitGroup { + running: AtomicUsize, + on_finish_all: std::sync::Mutex>>, +} + +/// A shortcut for making an opaque 
future type for return type or argument type, +/// which is sendable and not borrowing any variables. +/// +/// `fut![T]` == `impl Future + Send + 'static` +#[macro_export(crate)] +macro_rules! future { + ($t:ty) => { impl core::future::Future + Send + 'static }; +} + +impl CallbackWaitGroup { + pub fn new() -> Arc { + Arc::new(Self { + running: AtomicUsize::new(0), + on_finish_all: std::sync::Mutex::default(), + }) + } + + fn work_done(&self) { + let last = self.running.fetch_sub(1, Ordering::SeqCst); + if last == 1 { + self.on_finish_all + .lock() + .unwrap() + .drain(..) + .for_each(|x| x()) + } + } + + /// wait until all running tasks done. + pub fn wait(&self) -> BoxFuture<()> { + // Fast path: no uploading. + if self.running.load(Ordering::SeqCst) == 0 { + return Box::pin(futures::future::ready(())); + } + + let (tx, rx) = oneshot::channel(); + self.on_finish_all.lock().unwrap().push(Box::new(move || { + // The waiter may timed out. + let _ = tx.send(()); + })); + // try to acquire the lock again. + if self.running.load(Ordering::SeqCst) == 0 { + return Box::pin(futures::future::ready(())); + } + Box::pin(rx.map(|_| ())) + } + + /// make a work, as long as the return value held, mark a work in the group is running. + pub fn work(self: Arc) -> Work { + self.running.fetch_add(1, Ordering::SeqCst); + Work(self) + } +} + +pub struct Work(Arc); + +impl Drop for Work { + fn drop(&mut self) { + self.0.work_done(); + } +} + +struct ReadThroughputRecorder { + // The system tool set. + ins: Option, + begin: Option, + // Once the system tool set get unavailable, + // we would use the "ejector" -- RocksDB perf context. + // NOTE: In fact I'm not sure whether we need the result of system level tool set -- + // but this is the current implement of cdc. We'd better keep consistent with them. 
+ ejector: ReadPerfInstant, +} + +impl ReadThroughputRecorder { + fn start() -> Self { + let r = self_thread_inspector().ok().and_then(|insp| { + let stat = insp.io_stat().ok()??; + Some((insp, stat)) + }); + match r { + Some((ins, begin)) => Self { + ins: Some(ins), + begin: Some(begin), + ejector: ReadPerfInstant::new(), + }, + _ => Self { + ins: None, + begin: None, + ejector: ReadPerfInstant::new(), + }, + } + } + + fn try_get_delta_from_unix(&self) -> Option { + let ins = self.ins.as_ref()?; + let begin = self.begin.as_ref()?; + let end = ins.io_stat().ok()??; + Some(end.read - begin.read) + } + + fn end(self) -> u64 { + self.try_get_delta_from_unix() + .unwrap_or_else(|| self.ejector.delta().block_read_byte) + } +} + +/// try to record read throughput. +/// this uses the `proc` fs in the linux for recording the throughput. +/// if that failed, we would use the RocksDB perf context. +pub fn with_record_read_throughput(f: impl FnOnce() -> T) -> (T, u64) { + let recorder = ReadThroughputRecorder::start(); + let r = f(); + (r, recorder.end()) +} + #[cfg(test)] mod test { - use crate::utils::SegmentMap; + use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, + }; + + use futures::executor::block_on; + + use crate::utils::{CallbackWaitGroup, SegmentMap}; #[test] fn test_segment_tree() { @@ -427,4 +572,120 @@ mod test { assert!(tree.is_overlapping((&2, &10))); assert!(tree.is_overlapping((&0, &9999999))); } + + #[test] + fn test_wait_group() { + #[derive(Debug)] + struct Case { + bg_task: usize, + repeat: usize, + } + + fn run_case(c: Case) { + for i in 0..c.repeat { + let wg = CallbackWaitGroup::new(); + let cnt = Arc::new(AtomicUsize::new(c.bg_task)); + for _ in 0..c.bg_task { + let cnt = cnt.clone(); + let work = wg.clone().work(); + tokio::spawn(async move { + cnt.fetch_sub(1, Ordering::SeqCst); + drop(work); + }); + } + let _ = block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); + 
assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i); + } + } + + let cases = [ + Case { + bg_task: 200000, + repeat: 1, + }, + Case { + bg_task: 65535, + repeat: 1, + }, + Case { + bg_task: 512, + repeat: 1, + }, + Case { + bg_task: 2, + repeat: 100000, + }, + Case { + bg_task: 1, + repeat: 100000, + }, + Case { + bg_task: 0, + repeat: 1, + }, + ]; + + let pool = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) + .enable_time() + .build() + .unwrap(); + let _guard = pool.handle().enter(); + for case in cases { + run_case(case) + } + } + + /// skip it currently. Test it at local env successfully but failed at pod. + #[cfg(FALSE)] + #[test] + fn test_recorder() { + use engine_rocks::{raw::DB, RocksEngine}; + use engine_traits::{Iterable, KvEngine, Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT}; + use tempdir::TempDir; + + let p = TempDir::new("test_db").unwrap(); + let mut opt = DBOptions::default(); + opt.create_if_missing(true); + let db = DB::open(opt.clone(), p.path().as_os_str().to_str().unwrap()).unwrap(); + let engine = RocksEngine::from_db(Arc::new(db)); + let mut wb = engine.write_batch(); + for i in 0..100 { + wb.put_cf(CF_DEFAULT, format!("hello{}", i).as_bytes(), b"world") + .unwrap(); + } + let mut wopt = WriteOptions::new(); + wopt.set_sync(true); + wb.write_opt(&wopt).unwrap(); + // force memtable to disk. + engine.get_sync_db().compact_range(None, None); + + let (items, size) = super::with_record_read_throughput(|| { + let mut items = vec![]; + let snap = engine.snapshot(); + snap.scan(b"", b"", false, |k, v| { + items.push((k.to_owned(), v.to_owned())); + Ok(true) + }) + .unwrap(); + items + }); + + let items_size = items.iter().map(|(k, v)| k.len() + v.len()).sum::() as u64; + + // considering the compression, we may get at least 1/2 of the real size. 
+ assert!( + size > items_size / 2, + "the size recorded is too small: {} vs {}", + size, + items_size + ); + // considering the read amplification, we may get at most 2x of the real size. + assert!( + size < items_size * 2, + "the size recorded is too big: {} vs {}", + size, + items_size + ); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 339dd07f773..fccd8a0626a 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -10,7 +10,11 @@ use std::{ }; use backup_stream::{ - metadata::{store::SlashEtcStore, MetadataClient, StreamTask}, + errors::Result, + metadata::{ + store::{MetaStore, SlashEtcStore}, + MetadataClient, StreamTask, + }, observer::BackupStreamObserver, router::Router, Endpoint, Task, @@ -78,9 +82,129 @@ fn make_encoded_record_key(table_id: i64, handle: u64, ts: u64) -> Vec { key.append_ts(TimeStamp::new(ts)).into_encoded() } +#[derive(Clone)] +struct ErrorStore { + inner: S, + + error_provider: Arc Result<()> + Send + Sync>, +} + +pub struct SuiteBuilder { + name: String, + nodes: usize, + use_v3: bool, + metastore_error: Box Result<()> + Send + Sync>, +} + +impl SuiteBuilder { + pub fn new_named(s: &str) -> Self { + Self { + name: s.to_owned(), + nodes: 4, + use_v3: false, + metastore_error: Box::new(|_| Ok(())), + } + } + + pub fn use_v3(mut self) -> Self { + self.use_v3 = true; + self + } + + pub fn nodes(mut self, n: usize) -> Self { + self.nodes = n; + self + } + + pub fn inject_meta_store_error(mut self, f: F) -> Self + where + F: Fn(&str) -> Result<()> + Send + Sync + 'static, + { + self.metastore_error = Box::new(f); + self + } + + pub fn build(self) -> Suite { + let Self { + name: case, + nodes: n, + use_v3, + metastore_error, + } = self; + + info!("start test"; "case" => %case, "nodes" => %n); + let cluster = new_server_cluster(42, n); + let mut suite = Suite { + endpoints: Default::default(), + meta_store: ErrorStore { + inner: 
Default::default(), + + error_provider: Arc::from(metastore_error), + }, + obs: Default::default(), + tikv_cli: Default::default(), + env: Arc::new(grpcio::Environment::new(1)), + cluster, + + temp_files: TempDir::new("temp").unwrap(), + flushed_files: TempDir::new("flush").unwrap(), + case_name: case, + }; + for id in 1..=(n as u64) { + let worker = suite.start_br_stream_on(id); + suite.endpoints.insert(id, worker); + } + suite.cluster.run(); + for id in 1..=(n as u64) { + suite.start_endpoint(id, use_v3); + } + // TODO: The current mock metastore (slash_etc) doesn't supports multi-version. + // We must wait until the endpoints get ready to watching the metastore, or some modifies may be lost. + // Either make Endpoint::with_client wait until watch did start or make slash_etc support multi-version, + // then we can get rid of this sleep. + std::thread::sleep(Duration::from_secs(1)); + suite + } +} + +#[async_trait::async_trait] +impl MetaStore for ErrorStore { + type Snap = S::Snap; + + async fn snapshot(&self) -> backup_stream::errors::Result { + (self.error_provider)("snapshot")?; + self.inner.snapshot().await + } + + async fn watch( + &self, + keys: backup_stream::metadata::store::Keys, + start_rev: i64, + ) -> backup_stream::errors::Result { + (self.error_provider)("watch")?; + self.inner.watch(keys, start_rev).await + } + + async fn txn( + &self, + txn: backup_stream::metadata::store::Transaction, + ) -> backup_stream::errors::Result<()> { + (self.error_provider)("txn")?; + self.inner.txn(txn).await + } + + async fn txn_cond( + &self, + txn: backup_stream::metadata::store::CondTransaction, + ) -> backup_stream::errors::Result<()> { + (self.error_provider)("txn_cond")?; + self.inner.txn_cond(txn).await + } +} + pub struct Suite { endpoints: HashMap>, - meta_store: SlashEtcStore, + meta_store: ErrorStore, cluster: Cluster, tikv_cli: HashMap, obs: HashMap, @@ -123,7 +247,7 @@ impl Suite { worker } - fn start_endpoint(&mut self, id: u64) { + fn 
start_endpoint(&mut self, id: u64, use_v3: bool) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); let sim = cluster.sim.wl(); @@ -132,6 +256,7 @@ impl Suite { let regions = sim.region_info_accessors.get(&id).unwrap().clone(); let mut cfg = BackupStreamConfig::default(); cfg.enable = true; + cfg.use_checkpoint_v3 = use_v3; cfg.temp_path = format!("/{}/{}", self.temp_files.path().display(), id); let ob = self.obs.get(&id).unwrap().clone(); let endpoint = Endpoint::new( @@ -148,37 +273,7 @@ impl Suite { worker.start(endpoint); } - pub fn new(case: &str, n: usize) -> Self { - let cluster = new_server_cluster(42, n); - let mut suite = Self { - endpoints: Default::default(), - meta_store: Default::default(), - obs: Default::default(), - tikv_cli: Default::default(), - env: Arc::new(grpcio::Environment::new(1)), - cluster, - - temp_files: TempDir::new("temp").unwrap(), - flushed_files: TempDir::new("flush").unwrap(), - case_name: case.to_owned(), - }; - for id in 1..=(n as u64) { - let worker = suite.start_br_stream_on(id); - suite.endpoints.insert(id, worker); - } - suite.cluster.run(); - for id in 1..=(n as u64) { - suite.start_endpoint(id); - } - // TODO: The current mock metastore (slash_etc) doesn't supports multi-version. - // We must wait until the endpoints get ready to watching the metastore, or some modifies may be lost. - // Either make Endpoint::with_client wait until watch did start or make slash_etc support multi-version, - // then we can get rid of this sleep. 
- std::thread::sleep(Duration::from_secs(1)); - suite - } - - fn get_meta_cli(&self) -> MetadataClient { + fn get_meta_cli(&self) -> MetadataClient> { MetadataClient::new(self.meta_store.clone(), 0) } @@ -239,7 +334,8 @@ impl Suite { } fn force_flush_files(&self, task: &str) { - self.run(|| Task::ForceFlush(task.to_owned())) + self.run(|| Task::ForceFlush(task.to_owned())); + self.sync(); } fn run(&self, mut t: impl FnMut() -> Task) { @@ -452,6 +548,10 @@ impl Suite { pub fn wait_for_flush(&self) { use std::ffi::OsString; + std::fs::File::open(&self.temp_files) + .unwrap() + .sync_all() + .unwrap(); for _ in 0..100 { if !walkdir::WalkDir::new(&self.temp_files) .into_iter() @@ -499,15 +599,19 @@ fn run_async_test(test: impl Future) -> T { mod test { use std::time::Duration; - use backup_stream::{errors::Error, metadata::MetadataClient, Task}; + use backup_stream::{ + errors::Error, metadata::MetadataClient, GetCheckpointResult, RegionCheckpointOperation, + RegionSet, Task, + }; use tikv_util::{box_err, defer, info, HandyRwLock}; use txn_types::TimeStamp; - use crate::{make_record_key, make_split_key_at_record, run_async_test}; + use crate::{make_record_key, make_split_key_at_record, run_async_test, SuiteBuilder}; #[test] fn basic() { - let mut suite = super::Suite::new("basic", 4); + let mut suite = super::SuiteBuilder::new_named("basic").use_v3().build(); + fail::cfg("try_start_observe", "1*return").unwrap(); run_async_test(async { // write data before the task starting, for testing incremental scanning. @@ -527,7 +631,9 @@ mod test { #[test] fn with_split() { - let mut suite = super::Suite::new("with_split", 4); + let mut suite = super::SuiteBuilder::new_named("with_split") + .use_v3() + .build(); run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); @@ -546,7 +652,9 @@ mod test { #[test] /// This case tests whether the backup can continue when the leader failes. 
fn leader_down() { - let mut suite = super::Suite::new("leader_down", 4); + let mut suite = super::SuiteBuilder::new_named("leader_down") + .use_v3() + .build(); suite.must_register_task(1, "test_leader_down"); suite.sync(); let round1 = run_async_test(suite.write_records(0, 128, 1)); @@ -566,7 +674,9 @@ mod test { /// This case tests whehter the checkpoint ts (next backup ts) can be advanced correctly /// when async commit is enabled. fn async_commit() { - let mut suite = super::Suite::new("async_commit", 3); + let mut suite = super::SuiteBuilder::new_named("async_commit") + .nodes(3) + .build(); run_async_test(async { suite.must_register_task(1, "test_async_commit"); suite.sync(); @@ -596,7 +706,9 @@ mod test { #[test] fn fatal_error() { - let mut suite = super::Suite::new("fatal_error", 3); + let mut suite = super::SuiteBuilder::new_named("fatal_error") + .nodes(3) + .build(); suite.must_register_task(1, "test_fatal_error"); suite.sync(); run_async_test(suite.write_records(0, 1, 1)); @@ -633,7 +745,7 @@ mod test { safepoints.iter().any(|sp| { sp.serivce.contains(&format!("{}", victim)) && sp.ttl >= Duration::from_secs(60 * 60 * 24) - && sp.safepoint.into_inner() == checkpoint + && sp.safepoint.into_inner() == checkpoint - 1 }), "{:?}", safepoints @@ -647,7 +759,9 @@ mod test { fail::remove("delay_on_start_observe"); fail::remove("delay_on_flush"); }} - let mut suite = super::Suite::new("inflight_message", 3); + let mut suite = super::SuiteBuilder::new_named("inflight_message") + .nodes(3) + .build(); suite.must_register_task(1, "inflight_message"); run_async_test(suite.write_records(0, 128, 1)); fail::cfg("delay_on_flush", "pause").unwrap(); @@ -679,4 +793,56 @@ mod test { // The checkpoint should be advanced as expection when the inflight message has been consumed. 
assert!(checkpoint > 512, "checkpoint = {}", checkpoint); } + + #[test] + fn region_checkpoint_info() { + let mut suite = super::SuiteBuilder::new_named("checkpoint_info") + .nodes(1) + .use_v3() + .build(); + suite.must_register_task(1, "checkpoint_info"); + suite.must_split(&make_split_key_at_record(1, 42)); + run_async_test(suite.write_records(0, 128, 1)); + suite.force_flush_files("checkpoint_info"); + suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + let (tx, rx) = std::sync::mpsc::channel(); + suite.run(|| { + let tx = tx.clone(); + Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| { + tx.send(rs).unwrap(); + }), + )) + }); + let checkpoints = rx.recv().unwrap(); + assert!(!checkpoints.is_empty(), "{:?}", checkpoints); + assert!( + checkpoints + .iter() + .all(|cp| matches!(cp, GetCheckpointResult::Ok { checkpoint, .. } if checkpoint.into_inner() > 256)), + "{:?}", + checkpoints + ); + } + + #[test] + fn region_failure() { + defer! 
{{ + fail::remove("try_start_observe"); + }} + let mut suite = SuiteBuilder::new_named("region_failure").build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg("try_start_observe", "1*return").unwrap(); + suite.must_register_task(1, "region_failure"); + suite.must_shuffle_leader(1); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("region_failure"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + ); + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 51a21b91628..37d031753ce 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -55,7 +55,8 @@ use grpcio_health::HealthService; use kvproto::{ brpb::create_backup, cdcpb::create_change_data, deadlock::create_deadlock, debugpb::create_debug, diagnosticspb::create_diagnostics, import_sstpb::create_import_sst, - kvrpcpb::ApiVersion, resource_usage_agent::create_resource_metering_pub_sub, + kvrpcpb::ApiVersion, logbackuppb::create_log_backup, + resource_usage_agent::create_resource_metering_pub_sub, }; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; @@ -244,6 +245,7 @@ struct Servers { cdc_scheduler: tikv_util::worker::Scheduler, cdc_memory_quota: MemoryQuota, rsmeter_pubsub_service: resource_metering::PubSubService, + backup_stream_scheduler: Option>, } type LocalServer = @@ -884,7 +886,7 @@ impl TiKvServer { ); // Start backup stream - if self.config.backup_stream.enable { + let backup_stream_scheduler = if self.config.backup_stream.enable { // Create backup stream. 
let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); let backup_stream_scheduler = backup_stream_worker.scheduler(); @@ -910,7 +912,7 @@ impl TiKvServer { node.id(), etcd_cli, self.config.backup_stream.clone(), - backup_stream_scheduler, + backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), self.router.clone(), @@ -919,7 +921,10 @@ impl TiKvServer { ); backup_stream_worker.start(backup_stream_endpoint); self.to_stop.push(backup_stream_worker); - } + Some(backup_stream_scheduler) + } else { + None + }; let import_path = self.store_path.join("import"); let mut importer = SstImporter::new( @@ -1076,6 +1081,7 @@ impl TiKvServer { cdc_scheduler, cdc_memory_quota, rsmeter_pubsub_service, + backup_stream_scheduler, }); server_config @@ -1197,6 +1203,17 @@ impl TiKvServer { { warn!("failed to register resource metering pubsub service"); } + + if let Some(sched) = servers.backup_stream_scheduler.take() { + let pitr_service = backup_stream::Service::new(sched); + if servers + .server + .register_service(create_log_backup(pitr_service)) + .is_some() + { + fatal!("failed to register log backup service"); + } + } } fn init_io_utility(&mut self) -> BytesFetcher { diff --git a/src/config.rs b/src/config.rs index 7dfbe1b0933..9e0abe37c94 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2445,6 +2445,10 @@ pub struct BackupStreamConfig { pub temp_file_size_limit_per_task: ReadableSize, #[online_config(skip)] pub initial_scan_pending_memory_quota: ReadableSize, + #[online_config(skip)] + pub initial_scan_rate_limit: ReadableSize, + #[online_config(skip)] + pub use_checkpoint_v3: bool, } impl BackupStreamConfig { @@ -2477,6 +2481,8 @@ impl Default for BackupStreamConfig { temp_path: String::new(), temp_file_size_limit_per_task: ReadableSize::mb(128), initial_scan_pending_memory_quota: ReadableSize(quota_size as _), + initial_scan_rate_limit: ReadableSize::mb(60), + use_checkpoint_v3: true, } } } diff --git 
a/src/import/sst_service.rs b/src/import/sst_service.rs index 24e52a8057e..a81a34b1e71 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1017,7 +1017,9 @@ fn write_needs_restore(write: &[u8]) -> bool { Ok(w) if matches!( w.write_type, - WriteType::Put | WriteType::Delete | WriteType::Rollback + // We only keep the last put / delete write CF, + // other write type may shadow the real data and cause data loss. + WriteType::Put | WriteType::Delete ) => { true From a49945bb134bd70211a1e6733f82518d9e02fba0 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Mon, 11 Jul 2022 14:45:05 +0800 Subject: [PATCH 067/676] *: set write and time details in RPC responses (#12900) ref tikv/tikv#931, ref tikv/tikv#12362 kvproto#931 adds a few more details about the time used by an RPC request. We are filling these additional information to the responses in this commit. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- .../engine_rocks/src/perf_context_impl.rs | 10 +- components/raftstore/src/store/fsm/apply.rs | 16 +- components/raftstore/src/store/peer.rs | 2 +- components/tikv_util/Cargo.toml | 2 +- components/tracker/src/lib.rs | 31 +++- src/coprocessor/endpoint.rs | 14 +- src/server/metrics.rs | 2 +- src/server/service/batch.rs | 2 +- src/server/service/kv.rs | 167 +++++++++++++----- tests/integrations/server/kv_service.rs | 128 ++++++++++++++ 10 files changed, 303 insertions(+), 71 deletions(-) diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index c6eb187b392..152a0a12785 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ b/components/engine_rocks/src/perf_context_impl.rs @@ -4,7 +4,6 @@ use std::{fmt::Debug, marker::PhantomData, mem, ops::Sub, time::Duration}; use derive_more::{Add, AddAssign, Sub, SubAssign}; use engine_traits::{PerfContextKind, PerfLevel}; -use kvproto::kvrpcpb::ScanDetailV2; use lazy_static::lazy_static; use slog_derive::KV; use 
tikv_util::time::Instant; @@ -136,18 +135,11 @@ pub struct ReadPerfContext { } impl ReadPerfContext { - pub fn write_scan_detail(&self, detail_v2: &mut ScanDetailV2) { - detail_v2.set_rocksdb_delete_skipped_count(self.internal_delete_skipped_count); - detail_v2.set_rocksdb_key_skipped_count(self.internal_key_skipped_count); - detail_v2.set_rocksdb_block_cache_hit_count(self.block_cache_hit_count); - detail_v2.set_rocksdb_block_read_count(self.block_read_count); - detail_v2.set_rocksdb_block_read_byte(self.block_read_byte); - } - fn report_to_tracker(&self, tracker: &mut Tracker) { tracker.metrics.block_cache_hit_count += self.block_cache_hit_count; tracker.metrics.block_read_byte += self.block_read_byte; tracker.metrics.block_read_count += self.block_read_count; + tracker.metrics.block_read_nanos += self.block_read_time; tracker.metrics.deleted_key_skipped_count += self.internal_delete_skipped_count; tracker.metrics.internal_key_skipped_count += self.internal_key_skipped_count; } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 03034b76245..dfafcac338f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -65,6 +65,7 @@ use tikv_util::{ Either, MustConsumeVec, }; use time::Timespec; +use tracker::GLOBAL_TRACKERS; use uuid::Builder as UuidBuilder; use self::memtrace::*; @@ -3675,9 +3676,18 @@ where match msg { Msg::Apply { start, mut apply } => { - apply_ctx - .apply_wait - .observe(start.saturating_elapsed_secs()); + let apply_wait = start.saturating_elapsed(); + apply_ctx.apply_wait.observe(apply_wait.as_secs_f64()); + for tracker in apply + .cbs + .iter() + .flat_map(|p| p.cb.get_trackers()) + .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) + { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { + t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; + }); + } if let Some(batch) = batch_apply.as_mut() { if batch.try_batch(&mut apply) { 
diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index cf54d962075..5897309f0b2 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1627,7 +1627,7 @@ where .observe(dur.as_secs_f64()); for t in proposal.cb.get_trackers().iter().flat_map(|v| v.iter().flat_map(|t| t.as_tracker_token())) { GLOBAL_TRACKERS.with_tracker(t, |trakcer| { - if trakcer.metrics.propose_send_wait_nanos == 0{ + if trakcer.metrics.propose_send_wait_nanos == 0 { trakcer.metrics.propose_send_wait_nanos = dur.as_nanos() as u64; } }); diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 52d73429f4c..befe6559e32 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -22,7 +22,7 @@ crossbeam = "0.8" derive_more = "0.99.3" error_code = { path = "../error_code", default-features = false } fail = "0.5" -futures = { version = "0.3", features = ["compat"] } +futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } http = "0.2.0" diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 3729fb1ec9d..7e1aab80882 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -34,9 +34,38 @@ impl Tracker { pub fn write_scan_detail(&self, detail_v2: &mut pb::ScanDetailV2) { detail_v2.set_rocksdb_block_read_byte(self.metrics.block_read_byte); detail_v2.set_rocksdb_block_read_count(self.metrics.block_read_count); + detail_v2.set_rocksdb_block_read_nanos(self.metrics.block_read_nanos); detail_v2.set_rocksdb_block_cache_hit_count(self.metrics.block_cache_hit_count); detail_v2.set_rocksdb_key_skipped_count(self.metrics.internal_key_skipped_count); 
detail_v2.set_rocksdb_delete_skipped_count(self.metrics.deleted_key_skipped_count); + detail_v2.set_get_snapshot_nanos(self.metrics.get_snapshot_nanos); + } + + pub fn write_write_detail(&self, detail: &mut pb::WriteDetail) { + detail.set_store_batch_wait_nanos(self.metrics.wf_batch_wait_nanos); + detail.set_propose_send_wait_nanos(self.metrics.propose_send_wait_nanos); + detail.set_persist_log_nanos( + self.metrics.wf_persist_log_nanos - self.metrics.wf_send_to_queue_nanos, + ); + detail.set_raft_db_write_leader_wait_nanos( + self.metrics.store_mutex_lock_nanos + self.metrics.store_thread_wait_nanos, + ); + detail.set_raft_db_sync_log_nanos(self.metrics.store_write_wal_nanos); + detail.set_raft_db_write_memtable_nanos(self.metrics.store_write_memtable_nanos); + // It's an approximation considering generating proposal is fast CPU operation. + // And note that the time before flushing the raft message to the RPC channel is + // also counted in this value (to be improved in the future). + detail.set_commit_log_nanos( + self.metrics.wf_commit_log_nanos + - self.metrics.wf_batch_wait_nanos + - self.metrics.propose_send_wait_nanos, + ); + detail.set_apply_batch_wait_nanos(self.metrics.apply_wait_nanos); + detail.set_apply_log_nanos(self.metrics.apply_time_nanos - self.metrics.apply_wait_nanos); + detail.set_apply_mutex_lock_nanos(self.metrics.apply_mutex_lock_nanos); + detail.set_apply_write_leader_wait_nanos(self.metrics.apply_thread_wait_nanos); + detail.set_apply_write_wal_nanos(self.metrics.apply_wait_nanos); + detail.set_apply_write_memtable_nanos(self.metrics.apply_write_memtable_nanos); } } @@ -106,7 +135,7 @@ pub struct RequestMetrics { pub wf_commit_log_nanos: u64, pub propose_send_wait_nanos: u64, pub commit_not_persisted: bool, - pub store_mutex_lock_nanos: u64, + pub store_mutex_lock_nanos: u64, // should be 0 if using raft-engine pub store_thread_wait_nanos: u64, pub store_write_wal_nanos: u64, pub store_write_memtable_nanos: u64, diff --git 
a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 918d348f898..2b2ae03caa2 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1376,7 +1376,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_SMALL - COARSE_ERROR_MS + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1405,7 +1405,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_LARGE - COARSE_ERROR_MS + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1471,7 +1471,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_SMALL - COARSE_ERROR_MS + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1493,7 +1493,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_LARGE - COARSE_ERROR_MS + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1557,7 +1557,7 @@ mod tests { resp.get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_LARGE - COARSE_ERROR_MS + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp.get_exec_details() @@ -1588,7 +1588,7 @@ mod tests { .get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_SMALL - COARSE_ERROR_MS + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp[0] @@ -1618,7 +1618,7 @@ mod tests { .get_exec_details() .get_time_detail() .get_process_wall_time_ms(), - PAYLOAD_LARGE - COARSE_ERROR_MS + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) ); assert_lt!( resp[1] diff --git a/src/server/metrics.rs b/src/server/metrics.rs index caf6e1e86c4..9cd8631b275 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -223,7 +223,7 @@ lazy_static! 
{ "tikv_grpc_msg_duration_seconds", "Bucketed histogram of grpc server messages", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(5e-5, 2.0, 22).unwrap() // 50us ~ 104s ) .unwrap(); pub static ref SERVER_INFO_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( diff --git a/src/server/service/batch.rs b/src/server/service/batch.rs index 931017549c1..15a755c3468 100644 --- a/src/server/service/batch.rs +++ b/src/server/service/batch.rs @@ -35,7 +35,7 @@ pub struct ReqBatcher { impl ReqBatcher { pub fn new(batch_size: usize) -> ReqBatcher { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); ReqBatcher { gets: vec![], raw_gets: vec![], diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 336580dda58..878a138aafe 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1,7 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath]: Tikv gRPC APIs implementation -use std::sync::Arc; +use std::{mem, sync::Arc}; use api_version::KvFormat; use fail::fail_point; @@ -184,16 +184,20 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor macro_rules! 
handle_request { ($fn_name: ident, $future_name: ident, $req_ty: ident, $resp_ty: ident) => { + handle_request!($fn_name, $future_name, $req_ty, $resp_ty, no_time_detail); + }; + ($fn_name: ident, $future_name: ident, $req_ty: ident, $resp_ty: ident, $time_detail: tt) => { fn $fn_name(&mut self, ctx: RpcContext<'_>, mut req: $req_ty, sink: UnarySink<$resp_ty>) { forward_unary!(self.proxy, $fn_name, ctx, req, sink); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = $future_name(&self.storage, req); let task = async move { let resp = resp.await?; - sink.success(resp).await?; let elapsed = begin_instant.saturating_elapsed(); + set_total_time!(resp, elapsed, $time_detail); + sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .$fn_name .observe(elapsed.as_secs_f64()); @@ -213,30 +217,50 @@ macro_rules! handle_request { } } +macro_rules! set_total_time { + ($resp: ident, $duration: expr, no_time_detail) => {}; + ($resp: ident, $duration: expr, has_time_detail) => { + let mut $resp = $resp; + $resp + .mut_exec_details_v2() + .mut_time_detail() + .set_total_rpc_wall_time_ns($duration.as_nanos() as u64); + }; +} + impl + 'static, E: Engine, L: LockManager, F: KvFormat> Tikv for Service { - handle_request!(kv_get, future_get, GetRequest, GetResponse); + handle_request!(kv_get, future_get, GetRequest, GetResponse, has_time_detail); handle_request!(kv_scan, future_scan, ScanRequest, ScanResponse); handle_request!( kv_prewrite, future_prewrite, PrewriteRequest, - PrewriteResponse + PrewriteResponse, + has_time_detail ); handle_request!( kv_pessimistic_lock, future_acquire_pessimistic_lock, PessimisticLockRequest, - PessimisticLockResponse + PessimisticLockResponse, + has_time_detail ); handle_request!( kv_pessimistic_rollback, future_pessimistic_rollback, PessimisticRollbackRequest, - PessimisticRollbackResponse + PessimisticRollbackResponse, + has_time_detail + ); + 
handle_request!( + kv_commit, + future_commit, + CommitRequest, + CommitResponse, + has_time_detail ); - handle_request!(kv_commit, future_commit, CommitRequest, CommitResponse); handle_request!(kv_cleanup, future_cleanup, CleanupRequest, CleanupResponse); handle_request!( kv_batch_get, @@ -248,37 +272,43 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor kv_batch_rollback, future_batch_rollback, BatchRollbackRequest, - BatchRollbackResponse + BatchRollbackResponse, + has_time_detail ); handle_request!( kv_txn_heart_beat, future_txn_heart_beat, TxnHeartBeatRequest, - TxnHeartBeatResponse + TxnHeartBeatResponse, + has_time_detail ); handle_request!( kv_check_txn_status, future_check_txn_status, CheckTxnStatusRequest, - CheckTxnStatusResponse + CheckTxnStatusResponse, + has_time_detail ); handle_request!( kv_check_secondary_locks, future_check_secondary_locks, CheckSecondaryLocksRequest, - CheckSecondaryLocksResponse + CheckSecondaryLocksResponse, + has_time_detail ); handle_request!( kv_scan_lock, future_scan_lock, ScanLockRequest, - ScanLockResponse + ScanLockResponse, + has_time_detail ); handle_request!( kv_resolve_lock, future_resolve_lock, ResolveLockRequest, - ResolveLockResponse + ResolveLockResponse, + has_time_detail ); handle_request!( kv_delete_range, @@ -372,8 +402,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); - let begin_instant = Instant::now_coarse(); let source = req.mut_context().take_request_source(); + let begin_instant = Instant::now(); let future = future_copr(&self.copr, Some(ctx.peer()), req); let task = async move { let resp = future.await?.consume(); @@ -402,8 +432,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor mut req: RawCoprocessorRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); let source = req.mut_context().take_request_source(); + let 
begin_instant = Instant::now(); let future = future_raw_coprocessor(&self.copr_v2, &self.storage, req); let task = async move { let resp = future.await?; @@ -432,7 +462,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor req: RegisterLockObserverRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let (cb, f) = paired_future_callback(); let res = self.gc_worker.start_collecting(req.get_max_ts().into(), cb); @@ -471,7 +501,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor req: CheckLockObserverRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let (cb, f) = paired_future_callback(); let res = self @@ -514,7 +544,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor req: RemoveLockObserverRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let (cb, f) = paired_future_callback(); let res = self.gc_worker.stop_collecting(req.get_max_ts().into(), cb); @@ -551,7 +581,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor mut req: PhysicalScanLockRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let (cb, f) = paired_future_callback(); let res = self.gc_worker.physical_scan_lock( @@ -595,7 +625,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor mut req: UnsafeDestroyRangeRequest, sink: UnarySink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); // DestroyRange is a very dangerous operation. We don't allow passing MIN_KEY as start, or // MAX_KEY as end here. 
@@ -646,7 +676,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor req: Request, mut sink: ServerStreamingSink, ) { - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let mut stream = self .copr @@ -794,7 +824,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor sink: UnarySink, ) { forward_unary!(self.proxy, split_region, ctx, req, sink); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let region_id = req.get_context().get_region_id(); let (cb, f) = paired_future_callback(); @@ -890,7 +920,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor sink: UnarySink, ) { forward_unary!(self.proxy, read_index, ctx, req, sink); - let begin_instant = Instant::now_coarse(); + let begin_instant = Instant::now(); let region_id = req.get_context().get_region_id(); let mut cmd = RaftCmdRequest::default(); @@ -1032,20 +1062,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor BatchRespCollector, ); - let mut response_retriever = response_retriever.map(move |item| { - for measure in item.measures { - let GrpcRequestDuration { - label, - begin, - source, - } = measure; - let elapsed = begin.saturating_elapsed(); - GRPC_MSG_HISTOGRAM_STATIC - .get(label) - .observe(elapsed.as_secs_f64()); - record_request_source_metrics(source, elapsed); - } - + let mut response_retriever = response_retriever.map(move |mut item| { + handle_measures_for_batch_commands(&mut item); let mut r = item.batch_resp; GRPC_RESP_BATCH_COMMANDS_SIZE.observe(r.request_ids.len() as f64); // TODO: per thread load is more reasonable for batching. 
@@ -1349,6 +1367,46 @@ fn handle_batch_commands_request( } } +fn handle_measures_for_batch_commands(measures: &mut MeasuredBatchResponse) { + use BatchCommandsResponse_Response_oneof_cmd::*; + let now = Instant::now(); + for (resp, measure) in measures + .batch_resp + .mut_responses() + .iter_mut() + .zip(mem::take(&mut measures.measures)) + { + let GrpcRequestDuration { + label, + begin, + source, + } = measure; + let elapsed = now.saturating_duration_since(begin); + GRPC_MSG_HISTOGRAM_STATIC + .get(label) + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); + let exec_details = resp.cmd.as_mut().and_then(|cmd| match cmd { + Get(resp) => Some(resp.mut_exec_details_v2()), + Prewrite(resp) => Some(resp.mut_exec_details_v2()), + Commit(resp) => Some(resp.mut_exec_details_v2()), + BatchGet(resp) => Some(resp.mut_exec_details_v2()), + ResolveLock(resp) => Some(resp.mut_exec_details_v2()), + Coprocessor(resp) => Some(resp.mut_exec_details_v2()), + PessimisticLock(resp) => Some(resp.mut_exec_details_v2()), + CheckTxnStatus(resp) => Some(resp.mut_exec_details_v2()), + TxnHeartBeat(resp) => Some(resp.mut_exec_details_v2()), + CheckSecondaryLocks(resp) => Some(resp.mut_exec_details_v2()), + _ => None, + }); + if let Some(exec_details) = exec_details { + exec_details + .mut_time_detail() + .set_total_rpc_wall_time_ns(elapsed.as_nanos() as u64); + } + } +} + async fn future_handle_empty( req: BatchCommandsEmptyRequest, ) -> ServerResult { @@ -1967,24 +2025,24 @@ fn future_raw_coprocessor( } macro_rules! 
txn_command_future { - ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) $prelude: stmt; ($v: ident, $resp: ident) { $else_branch: expr }) => { + ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) $prelude: stmt; ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { fn $fn_name( storage: &Storage, $req: $req_ty, ) -> impl Future> { $prelude - let tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( + let $tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( $req.get_context(), RequestType::Unknown, 0, ))); - set_tls_tracker_token(tracker); + set_tls_tracker_token($tracker); let (cb, f) = paired_future_callback(); let res = storage.sched_txn_command($req.into(), cb); async move { defer!{{ - GLOBAL_TRACKERS.remove(tracker); + GLOBAL_TRACKERS.remove($tracker); }}; let $v = match res { Err(e) => Err(e), @@ -2000,24 +2058,35 @@ macro_rules! txn_command_future { } } }; + ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { + txn_command_future!($fn_name, $req_ty, $resp_ty, (req) {}; ($v, $resp, $tracker) { $else_branch }); + }; ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($v: ident, $resp: ident) { $else_branch: expr }) => { - txn_command_future!($fn_name, $req_ty, $resp_ty, (req) {}; ($v, $resp) { $else_branch }); + txn_command_future!($fn_name, $req_ty, $resp_ty, (req) {}; ($v, $resp, tracker) { $else_branch }); }; } -txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp) {{ +txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp, tracker) {{ if let Ok(v) = &v { resp.set_min_commit_ts(v.min_commit_ts.into_inner()); resp.set_one_pc_commit_ts(v.one_pc_commit_ts.into_inner()); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + 
tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); } resp.set_errors(extract_key_errors(v.map(|v| v.locks)).into()); }}); -txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp) { +txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) { match v { Ok(Ok(res)) => { let (values, not_founds) = res.into_values_and_not_founds(); resp.set_values(values.into()); resp.set_not_founds(not_founds); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); }, Err(e) | Ok(Err(e)) => resp.set_errors(vec![extract_key_error(&e)].into()), } @@ -2035,10 +2104,14 @@ txn_command_future!(future_resolve_lock, ResolveLockRequest, ResolveLockResponse resp.set_error(extract_key_error(&e)); } }); -txn_command_future!(future_commit, CommitRequest, CommitResponse, (v, resp) { +txn_command_future!(future_commit, CommitRequest, CommitResponse, (v, resp, tracker) { match v { Ok(TxnStatus::Committed { commit_ts }) => { - resp.set_commit_version(commit_ts.into_inner()) + resp.set_commit_version(commit_ts.into_inner()); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); } Ok(_) => unreachable!(), Err(e) => resp.set_error(extract_key_error(&e)), diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 935b657fa3f..18f3f7278d5 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -2025,3 +2025,131 @@ fn test_storage_with_quota_limiter_disable() { assert!(begin.elapsed() < Duration::from_millis(500)); } + +#[test] +fn test_commands_write_detail() { + 
let (_cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + cluster.cfg.pessimistic_txn.pipelined = false; + cluster.cfg.pessimistic_txn.in_memory = false; + }); + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + + let check_scan_detail = |sc: &ScanDetailV2| { + assert!(sc.get_get_snapshot_nanos() > 0); + }; + let check_write_detail = |wd: &WriteDetail| { + assert!(wd.get_store_batch_wait_nanos() > 0); + assert!(wd.get_persist_log_nanos() > 0); + assert!(wd.get_raft_db_write_leader_wait_nanos() > 0); + assert!(wd.get_raft_db_sync_log_nanos() > 0); + assert!(wd.get_raft_db_write_memtable_nanos() > 0); + assert!(wd.get_commit_log_nanos() > 0); + assert!(wd.get_apply_batch_wait_nanos() > 0); + assert!(wd.get_apply_log_nanos() > 0); + assert!(wd.get_apply_mutex_lock_nanos() > 0); + assert!(wd.get_apply_write_wal_nanos() > 0); + assert!(wd.get_apply_write_memtable_nanos() > 0); + }; + + let mut mutation = Mutation::default(); + mutation.set_op(Op::PessimisticLock); + mutation.set_key(k.clone()); + + let mut pessimistic_lock_req = PessimisticLockRequest::default(); + pessimistic_lock_req.set_context(ctx.clone()); + pessimistic_lock_req.set_mutations(vec![mutation.clone()].into()); + pessimistic_lock_req.set_start_version(20); + pessimistic_lock_req.set_for_update_ts(20); + pessimistic_lock_req.set_primary_lock(k.clone()); + pessimistic_lock_req.set_lock_ttl(3000); + let pessimistic_lock_resp = client.kv_pessimistic_lock(&pessimistic_lock_req).unwrap(); + check_scan_detail( + pessimistic_lock_resp + .get_exec_details_v2() + .get_scan_detail_v2(), + ); + check_write_detail( + pessimistic_lock_resp + .get_exec_details_v2() + .get_write_detail(), + ); + + let mut prewrite_req = PrewriteRequest::default(); + mutation.set_op(Op::Put); + mutation.set_value(v); + prewrite_req.set_mutations(vec![mutation].into()); + prewrite_req.set_is_pessimistic_lock(vec![true]); + prewrite_req.set_context(ctx.clone()); + 
prewrite_req.set_primary_lock(k.clone()); + prewrite_req.set_start_version(20); + prewrite_req.set_for_update_ts(20); + prewrite_req.set_lock_ttl(3000); + let prewrite_resp = client.kv_prewrite(&prewrite_req).unwrap(); + check_scan_detail(prewrite_resp.get_exec_details_v2().get_scan_detail_v2()); + check_write_detail(prewrite_resp.get_exec_details_v2().get_write_detail()); + + let mut commit_req = CommitRequest::default(); + commit_req.set_context(ctx); + commit_req.set_keys(vec![k].into()); + commit_req.set_start_version(20); + commit_req.set_commit_version(30); + let commit_resp = client.kv_commit(&commit_req).unwrap(); + check_scan_detail(commit_resp.get_exec_details_v2().get_scan_detail_v2()); + check_write_detail(commit_resp.get_exec_details_v2().get_write_detail()); +} + +#[test] +fn test_rpc_wall_time() { + let mut cluster = new_server_cluster(0, 1); + cluster.run(); + + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let k = b"key".to_vec(); + let mut get_req = GetRequest::default(); + get_req.set_context(ctx); + get_req.key = k; + get_req.version = 10; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!( + get_resp + .get_exec_details_v2() + .get_time_detail() + .get_total_rpc_wall_time_ns() + > 0 + ); + + let (mut sender, receiver) = client.batch_commands().unwrap(); + let mut batch_req = BatchCommandsRequest::default(); + for i in 0..3 { + let mut req = batch_commands_request::Request::default(); + req.cmd = Some(batch_commands_request::request::Cmd::Get(get_req.clone())); + batch_req.mut_requests().push(req); + batch_req.mut_request_ids().push(i); + } + block_on(sender.send((batch_req, WriteFlags::default()))).unwrap(); + block_on(sender.close()).unwrap(); + + let (tx, rx) = mpsc::sync_channel(1); + thread::spawn(move || { + let mut responses = Vec::new(); + for r in block_on( + receiver + .map(move |b| b.unwrap().take_responses()) + .collect::>(), + ) { + responses.extend(r.into_vec()); + } + tx.send(responses).unwrap(); + 
}); + let responses = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(responses.len(), 3); + for resp in responses { + assert!( + resp.get_get() + .get_exec_details_v2() + .get_time_detail() + .get_total_rpc_wall_time_ns() + > 0 + ); + } +} From 5c941586f5163ca9dbb82a0d14e7f02b09732181 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 11 Jul 2022 18:11:05 +0800 Subject: [PATCH 068/676] raftstore: update the Load Base Split metrics event (#12992) ref tikv/tikv#12063 Update the Load Base Split metrics event to distinguish more cases. Signed-off-by: JmPotato --- .../raftstore/src/store/worker/split_controller.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 1a3fb15af45..338158c7505 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -53,7 +53,9 @@ const NO_BALANCE_KEY: &str = "no_balance_key"; const NO_UNCROSS_KEY: &str = "no_uncross_key"; // Split info for the top hot CPU region has been collected, ready to split. const READY_TO_SPLIT_CPU_TOP: &str = "ready_to_split_cpu_top"; -// The top hot CPU region is not ready to split. +// Hottest key range for the top hot CPU region could not be found. +const EMPTY_HOTTEST_KEY_RANGE: &str = "empty_hottest_key_range"; +// The top hot CPU region could not be split. const UNABLE_TO_SPLIT_CPU_TOP: &str = "unable_to_split_cpu_top"; // It will return prefix sum of the given iter, @@ -899,9 +901,13 @@ impl AutoSplitController { ); } else { LOAD_BASE_SPLIT_EVENT - .with_label_values(&[UNABLE_TO_SPLIT_CPU_TOP]) + .with_label_values(&[EMPTY_HOTTEST_KEY_RANGE]) .inc(); } + } else { + LOAD_BASE_SPLIT_EVENT + .with_label_values(&[UNABLE_TO_SPLIT_CPU_TOP]) + .inc(); } // Clean up the rest top CPU usage recorders. 
for region_id in top_cpu_usage { From 126da29086da78bad3171975d4269ba283d133bd Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 12 Jul 2022 14:25:05 +0800 Subject: [PATCH 069/676] *: mitigations for Raft Engine encryption key issue (#12892) close tikv/tikv#12890 Signed-off-by: tabokie --- Cargo.lock | 4 +- cmd/tikv-ctl/src/cmd.rs | 4 ++ cmd/tikv-ctl/src/main.rs | 29 +++++++++-- components/encryption/src/manager/mod.rs | 17 +++++++ components/raft_log_engine/src/engine.rs | 61 ++++++++++++++++-------- components/raft_log_engine/src/lib.rs | 4 +- 6 files changed, 90 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2cab9eb4b2d..9b7e72c2632 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4026,7 +4026,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#07dcadbf51b43fed70346e33b5db07723e655828" +source = "git+https://github.com/tikv/raft-engine.git#7a436eae40a6b62371123c96941e058b7fe52b63" dependencies = [ "byteorder", "crc32fast", @@ -4057,7 +4057,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#07dcadbf51b43fed70346e33b5db07723e655828" +source = "git+https://github.com/tikv/raft-engine.git#7a436eae40a6b62371123c96941e058b7fe52b63" dependencies = [ "clap 3.1.6", "env_logger", diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 4c49ccfa5ef..74cc69034fc 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -529,6 +529,8 @@ pub enum Cmd { #[structopt(subcommand)] cmd: EncryptionMetaCmd, }, + /// Delete encryption keys that are no longer associated with physical files. 
+ CleanupEncryptionMeta {}, /// Print bad ssts related infos BadSsts { #[structopt(long)] @@ -545,6 +547,8 @@ pub enum Cmd { /// The version to reset TiKV to version: u64, }, + /// Control for Raft Engine + RaftEngineCtl { args: Vec }, #[structopt(external_subcommand)] External(Vec), } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index e2ed740e779..9609fffb9a5 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -41,6 +41,7 @@ use kvproto::{ }; use pd_client::{Config as PdConfig, PdClient, RpcClient}; use protobuf::Message; +use raft_log_engine::ManagedFileSystem; use regex::Regex; use security::{SecurityConfig, SecurityManager}; use structopt::{clap::ErrorKind, StructOpt}; @@ -99,10 +100,19 @@ fn main() { match args[0].as_str() { "ldb" => run_ldb_command(args, &cfg), "sst_dump" => run_sst_dump_command(args, &cfg), - "raft-engine-ctl" => run_raft_engine_ctl_command(args), _ => Opt::clap().print_help().unwrap(), } } + Cmd::RaftEngineCtl { args } => { + let key_manager = + data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) + .expect("data_key_manager_from_config should success"); + let file_system = Arc::new(ManagedFileSystem::new( + key_manager.map(|m| Arc::new(m)), + None, + )); + raft_engine_ctl::run_command(args, file_system); + } Cmd::BadSsts { manifest, pd } => { let data_dir = opt.data_dir.as_deref(); assert!(data_dir.is_some(), "--data-dir must be specified"); @@ -184,6 +194,19 @@ fn main() { DataKeyManager::dump_file_dict(&cfg.storage.data_dir, path.as_deref()).unwrap(); } }, + Cmd::CleanupEncryptionMeta {} => { + let key_manager = + match data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) + .expect("data_key_manager_from_config should success") + { + Some(mgr) => mgr, + None => { + println!("Encryption is disabled"); + return; + } + }; + key_manager.retain_encrypted_files(|fname| Path::new(fname).exists()) + } Cmd::CompactCluster { db, cf, @@ -662,10 +685,6 @@ fn 
run_sst_dump_command(args: Vec, cfg: &TiKvConfig) { engine_rocks::raw::run_sst_dump_tool(&args, &opts); } -fn run_raft_engine_ctl_command(args: Vec) { - raft_engine_ctl::run_command(args); -} - fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TiKvConfig) { let db = &cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); println!( diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index cd9be1b554d..2240e212b84 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -466,6 +466,23 @@ impl DataKeyManager { Ok(Some(Self::from_dicts(dicts, args.method, master_key)?)) } + /// Will block file operation for a considerable amount of time. Only used for debugging purpose. + pub fn retain_encrypted_files(&self, f: impl Fn(&str) -> bool) { + let mut dict = self.dicts.file_dict.lock().unwrap(); + let mut file_dict_file = self.dicts.file_dict_file.lock().unwrap(); + dict.files.retain(|fname, info| { + if info.method != EncryptionMethod::Plaintext { + let retain = f(fname); + if !retain { + file_dict_file.remove(fname).unwrap(); + } + retain + } else { + false + } + }); + } + fn load_dicts(master_key: &dyn Backend, args: &DataKeyManagerArgs) -> Result { if args.method != EncryptionMethod::Plaintext && !master_key.is_secure() { return Err(box_err!( diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 9236e7947db..d2f8b7cb4e1 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -9,8 +9,9 @@ use std::{ use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ - CacheStats, EncryptionKeyManager, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, - RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, RaftLogGCTask, Result, + CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, 
PerfContextKind, PerfLevel, + RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, + RaftLogGCTask, Result, }; use file_system::{IOOp, IORateLimiter, IOType}; use kvproto::{ @@ -41,7 +42,7 @@ impl MessageExt for MessageExtTyped { } } -struct ManagedReader { +pub struct ManagedReader { inner: Either< ::Reader, DecrypterReader<::Reader>, @@ -71,7 +72,7 @@ impl Read for ManagedReader { } } -struct ManagedWriter { +pub struct ManagedWriter { inner: Either< ::Writer, EncrypterWriter<::Writer>, @@ -129,26 +130,26 @@ impl WriteExt for ManagedWriter { } } -struct ManagedFileSystem { - base_level_file_system: DefaultFileSystem, +pub struct ManagedFileSystem { + base_file_system: DefaultFileSystem, key_manager: Option>, rate_limiter: Option>, } impl ManagedFileSystem { - fn new( + pub fn new( key_manager: Option>, rate_limiter: Option>, ) -> Self { Self { - base_level_file_system: DefaultFileSystem, + base_file_system: DefaultFileSystem, key_manager, rate_limiter, } } } -struct ManagedHandle { +pub struct ManagedHandle { path: PathBuf, base: Arc<::Handle>, } @@ -169,7 +170,7 @@ impl FileSystem for ManagedFileSystem { type Writer = ManagedWriter; fn create>(&self, path: P) -> IoResult { - let base = Arc::new(self.base_level_file_system.create(path.as_ref())?); + let base = Arc::new(self.base_file_system.create(path.as_ref())?); if let Some(ref manager) = self.key_manager { manager.new_file(path.as_ref().to_str().unwrap())?; } @@ -182,14 +183,38 @@ impl FileSystem for ManagedFileSystem { fn open>(&self, path: P) -> IoResult { Ok(ManagedHandle { path: path.as_ref().to_path_buf(), - base: Arc::new(self.base_level_file_system.open(path.as_ref())?), + base: Arc::new(self.base_file_system.open(path.as_ref())?), }) } + fn delete>(&self, path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + manager.delete_file(path.as_ref().to_str().unwrap())?; + } + self.base_file_system.delete(path) + } + + fn exists_metadata>(&self, path: P) -> 
bool { + if let Some(ref manager) = self.key_manager { + if let Ok(info) = manager.get_file(path.as_ref().to_str().unwrap()) { + if info.method != EncryptionMethod::Plaintext { + return true; + } + } + } + self.base_file_system.exists_metadata(path) + } + + fn delete_metadata>(&self, path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: no error if the file doesn't exist. + manager.delete_file(path.as_ref().to_str().unwrap())?; + } + self.base_file_system.delete_metadata(path) + } + fn new_reader(&self, handle: Arc) -> IoResult { - let base_reader = self - .base_level_file_system - .new_reader(handle.base.clone())?; + let base_reader = self.base_file_system.new_reader(handle.base.clone())?; if let Some(ref key_manager) = self.key_manager { Ok(ManagedReader { inner: Either::Right(key_manager.open_file_with_reader(&handle.path, base_reader)?), @@ -204,9 +229,7 @@ impl FileSystem for ManagedFileSystem { } fn new_writer(&self, handle: Arc) -> IoResult { - let base_writer = self - .base_level_file_system - .new_writer(handle.base.clone())?; + let base_writer = self.base_file_system.new_writer(handle.base.clone())?; if let Some(ref key_manager) = self.key_manager { Ok(ManagedWriter { @@ -224,10 +247,6 @@ impl FileSystem for ManagedFileSystem { }) } } - - fn delete>(&self, path: P) -> IoResult<()> { - self.base_level_file_system.delete(path) - } } #[derive(Clone)] diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 7b8757d6531..41ba961c48a 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -23,4 +23,6 @@ extern crate tikv_util; mod engine; mod perf_context; -pub use engine::{RaftEngineConfig, RaftLogBatch, RaftLogEngine, ReadableSize, RecoveryMode}; +pub use engine::{ + ManagedFileSystem, RaftEngineConfig, RaftLogBatch, RaftLogEngine, ReadableSize, RecoveryMode, +}; From b558d0bffc554c4c0094a483c65d95a50a4141cf Mon Sep 17 00:00:00 2001 From: Xinye 
Tao Date: Tue, 12 Jul 2022 19:22:26 +0800 Subject: [PATCH 070/676] engine: use 6.29 rocksdb (#12533) Ref https://github.com/tikv/rocksdb/issues/277 Added some configurations, they will be hidden from doc for now. - rocksdb.xxcf.prepopulate-block-cache-mode = "disabled" - rocksdb.xxcf.format-version = 2 - rocksdb.xxcf.checksum = "crc32c" - WriteOptions::memtable_insert_hint_per_batch = false - ReadOptions::auto_prefix_mode = false - ReadOptions::adaptive_readahead = false A few notes: - `test_need_gc::test_without_properties` is removed, because in the new version of RocksDB, some portion of flushed data is replayed to memtable, and breaks the assumption of file layout. I haven't pinpointed the root cause, but I suppose this test case is not that important. - `test_compact_files_in_range` is partially removed, because it raises error: `Invalid argument: Cannot compact file to up level, input file: /000032.sst level 6 > output level 3`. Signed-off-by: tabokie Co-authored-by: 5kbpers --- Cargo.lock | 35 +++---- components/engine_rocks/src/compact.rs | 21 ---- components/engine_rocks/src/config.rs | 114 +++++++++++++++++++++ components/engine_rocks/src/file_system.rs | 19 +++- components/engine_rocks/src/options.rs | 10 +- components/engine_rocks/src/raw.rs | 8 +- components/engine_rocks/src/sst.rs | 12 ++- components/engine_rocks/src/write_batch.rs | 1 - components/raftstore/src/store/snap.rs | 5 +- etc/config-template.toml | 59 +++++++++++ src/config.rs | 74 ++++++++++--- src/server/gc_worker/mod.rs | 53 ++-------- tests/integrations/backup/mod.rs | 4 + tests/integrations/config/mod.rs | 26 ++++- tests/integrations/config/test-custom.toml | 17 ++- tests/integrations/storage/test_titan.rs | 1 - 16 files changed, 330 insertions(+), 129 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9b7e72c2632..7e562246adc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1052,7 +1052,7 @@ dependencies = [ "clap 2.33.0", "criterion-plot", "csv", - "itertools 0.10.0", + "itertools", 
"lazy_static", "num-traits", "oorandom", @@ -1094,7 +1094,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ "cast", - "itertools 0.10.0", + "itertools", ] [[package]] @@ -2550,15 +2550,6 @@ dependencies = [ "serde", ] -[[package]] -name = "itertools" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.10.0" @@ -2754,7 +2745,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" +source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2773,7 +2764,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" +source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" dependencies = [ "bzip2-sys", "cc", @@ -3915,7 +3906,7 @@ checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" dependencies = [ "bytes", "heck 0.3.1", - "itertools 0.10.0", + "itertools", "log", "multimap", "petgraph", @@ -3932,7 +3923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" dependencies = [ "anyhow", - "itertools 0.10.0", + "itertools", "proc-macro2", "quote", "syn", @@ -4125,7 +4116,7 @@ dependencies = [ "getset", "grpcio-health", "into_other", - "itertools 0.10.0", + "itertools", "keys", "kvproto", "lazy_static", @@ -4578,7 +4569,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = 
"git+https://github.com/tikv/rust-rocksdb.git#773784178a0e8e5fdad81f4fd85448a3014a3700" +source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" dependencies = [ "libc 0.2.125", "librocksdb_sys", @@ -5909,7 +5900,7 @@ dependencies = [ "collections", "fail", "futures 0.3.15", - "itertools 0.10.0", + "itertools", "kvproto", "log_wrappers", "match_template", @@ -6010,7 +6001,7 @@ dependencies = [ "hyper-openssl", "hyper-tls", "into_other", - "itertools 0.10.0", + "itertools", "keys", "kvproto", "lazy_static", @@ -7083,12 +7074,10 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.4.19+zstd.1.4.8" +version = "2.0.1+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec24a9273d24437afb8e71b16f3d9a5d569193cccdb7896213b59f552f387674" +checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "glob", - "itertools 0.9.0", "libc 0.2.125", ] diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 05369015a1e..fef3af46f5c 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -229,26 +229,5 @@ mod tests { assert_eq!(level_n[0].get_smallestkey(), &[0]); assert_eq!(level_n[0].get_largestkey(), &[4]); } - - for cf_name in db.cf_names() { - let mut files = vec![]; - let cf = db.cf_handle(cf_name).unwrap(); - let cf_meta = db.get_column_family_meta_data(cf); - let cf_levels = cf_meta.get_levels(); - - for level in cf_levels.into_iter().rev() { - files.extend(level.get_files().iter().map(|f| f.get_name())); - } - - assert_eq!(files.len(), 2); - db.c() - .compact_files_cf(cf_name, files.clone(), Some(3), 0, true) - .unwrap(); - - let cf_meta = db.get_column_family_meta_data(cf); - let cf_levels = cf_meta.get_levels(); - assert_eq!(cf_levels[0].get_files().len(), 1); - assert_eq!(cf_levels[3].get_files().len(), 1); - } } } diff --git 
a/components/engine_rocks/src/config.rs b/components/engine_rocks/src/config.rs index 9c015b7e7d1..e121a1cea18 100644 --- a/components/engine_rocks/src/config.rs +++ b/components/engine_rocks/src/config.rs @@ -215,6 +215,120 @@ pub mod compression_type_serde { } } +pub mod checksum_serde { + use std::fmt; + + use rocksdb::ChecksumType; + use serde::{ + de::{Error, Unexpected, Visitor}, + Deserializer, Serializer, + }; + + pub fn serialize(t: &ChecksumType, serializer: S) -> Result + where + S: Serializer, + { + let name = match *t { + ChecksumType::NoChecksum => "no", + ChecksumType::CRC32c => "crc32c", + ChecksumType::XxHash => "xxhash", + ChecksumType::XxHash64 => "xxhash64", + ChecksumType::XXH3 => "xxh3", + }; + serializer.serialize_str(name) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct StrVistor; + impl<'de> Visitor<'de> for StrVistor { + type Value = ChecksumType; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(formatter, "a checksum type") + } + + fn visit_str(self, value: &str) -> Result + where + E: Error, + { + let str = match &*value.trim().to_lowercase() { + "no" => ChecksumType::NoChecksum, + "crc32c" => ChecksumType::CRC32c, + "xxhash" => ChecksumType::XxHash, + "xxhash64" => ChecksumType::XxHash64, + "xxh3" => ChecksumType::XXH3, + _ => { + return Err(E::invalid_value( + Unexpected::Other("invalid checksum type"), + &self, + )); + } + }; + Ok(str) + } + } + + deserializer.deserialize_str(StrVistor) + } +} + +pub mod prepopulate_block_cache_serde { + use std::fmt; + + use rocksdb::PrepopulateBlockCache; + use serde::{ + de::{Error, Unexpected, Visitor}, + Deserializer, Serializer, + }; + + pub fn serialize(t: &PrepopulateBlockCache, serializer: S) -> Result + where + S: Serializer, + { + let name = match *t { + PrepopulateBlockCache::Disabled => "disabled", + PrepopulateBlockCache::FlushOnly => "flush-only", + }; + serializer.serialize_str(name) + } + + 
pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct StrVistor; + impl<'de> Visitor<'de> for StrVistor { + type Value = PrepopulateBlockCache; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(formatter, "a prepopulate block cache mode") + } + + fn visit_str(self, value: &str) -> Result + where + E: Error, + { + let str = match &*value.trim().to_lowercase() { + "disabled" => PrepopulateBlockCache::Disabled, + "flush-only" => PrepopulateBlockCache::FlushOnly, + _ => { + return Err(E::invalid_value( + Unexpected::Other("invalid prepopulate block cache mode"), + &self, + )); + } + }; + Ok(str) + } + } + + deserializer.deserialize_str(StrVistor) + } +} + #[derive(Copy, Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] pub enum BlobRunMode { diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index a9eebc161af..397eaead488 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -70,7 +70,9 @@ mod tests { #[test] fn test_inspected_compact() { - let value_size = 1024; + // NOTICE: Specific to RocksDB version. 
+ let amplification_bytes = 2560; + let value_size = amplification_bytes * 2; let temp_dir = Builder::new() .prefix("test_inspected_compact") .tempdir() @@ -81,15 +83,16 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); + assert_eq!(stats.fetch(IOType::Flush, IOOp::Write), 0); db.flush(true /*sync*/).unwrap(); assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 3); + assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); db.flush(true /*sync*/).unwrap(); assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 3); + assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.c() .compact_range( @@ -100,8 +103,14 @@ mod tests { ) .unwrap(); assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) > value_size * 4); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) < value_size * 5); + assert!( + stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) + < value_size * 4 + amplification_bytes + ); assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) > value_size * 3); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) < value_size * 4); + assert!( + stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) + < value_size * 3 + amplification_bytes + ); } } diff --git a/components/engine_rocks/src/options.rs b/components/engine_rocks/src/options.rs index c1610f64224..c50c7734f79 100644 --- a/components/engine_rocks/src/options.rs +++ b/components/engine_rocks/src/options.rs @@ -16,7 +16,7 @@ impl RocksReadOptions { impl From for RocksReadOptions { fn from(opts: engine_traits::ReadOptions) -> Self { let mut r = RawReadOptions::default(); - 
r.fill_cache(opts.fill_cache()); + r.set_fill_cache(opts.fill_cache()); RocksReadOptions(r) } } @@ -40,6 +40,8 @@ impl From for RocksWriteOptions { let mut r = RawWriteOptions::default(); r.set_sync(opts.sync()); r.set_no_slowdown(opts.no_slowdown()); + // TODO: enable it. + r.set_memtable_insert_hint_per_batch(false); RocksWriteOptions(r) } } @@ -59,16 +61,20 @@ impl From for RocksReadOptions { fn build_read_opts(iter_opts: engine_traits::IterOptions) -> RawReadOptions { let mut opts = RawReadOptions::new(); - opts.fill_cache(iter_opts.fill_cache()); + opts.set_fill_cache(iter_opts.fill_cache()); opts.set_max_skippable_internal_keys(iter_opts.max_skippable_internal_keys()); if iter_opts.key_only() { opts.set_titan_key_only(true); } if iter_opts.total_order_seek_used() { opts.set_total_order_seek(true); + // TODO: enable it. + opts.set_auto_prefix_mode(false); } else if iter_opts.prefix_same_as_start() { opts.set_prefix_same_as_start(true); } + // TODO: enable it. + opts.set_adaptive_readahead(false); if iter_opts.hint_min_ts().is_some() || iter_opts.hint_max_ts().is_some() { opts.set_table_filter(TsFilter::new( diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index 145931743dd..c7d2e3a0d31 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -8,13 +8,13 @@ pub use rocksdb::{ new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, CFHandle, Cache, - ColumnFamilyOptions, CompactOptions, CompactionFilter, CompactionFilterContext, + ChecksumType, ColumnFamilyOptions, CompactOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, CompactionJobInfo, CompactionOptions, CompactionPriority, DBBottommostLevelCompaction, DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBInfoLogLevel, DBIterator, DBOptions, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, 
DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, - MemoryAllocator, PerfContext, Range, ReadOptions, SeekKey, SliceTransform, TableFilter, - TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, TitanDBOptions, - Writable, WriteOptions, DB, + MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, ReadOptions, SeekKey, + SliceTransform, TableFilter, TablePropertiesCollector, TablePropertiesCollectorFactory, + TitanBlobIndex, TitanDBOptions, Writable, WriteOptions, DB, }; diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 58f300a8ec2..c7eb52e0527 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -222,9 +222,15 @@ impl SstWriterBuilder for RocksSstWriterBuilder { }; // TODO: 0 is a valid value for compression_level if self.compression_level != 0 { - // other three fields are default value. - // see: https://github.com/facebook/rocksdb/blob/8cb278d11a43773a3ac22e523f4d183b06d37d88/include/rocksdb/advanced_options.h#L146-L153 - io_options.set_compression_options(-14, self.compression_level, 0, 0, 0); + // other 4 fields are default value. 
+ io_options.set_compression_options( + -14, + self.compression_level, + 0, /*strategy*/ + 0, /*max_dict_bytes*/ + 0, /*zstd_max_train_bytes*/ + 1, /*parallel_threads*/ + ); } io_options.compression(compress_type); // in rocksdb 5.5.1, SstFileWriter will try to use bottommost_compression and diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 824882cc1e9..e9428b2c291 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -152,7 +152,6 @@ mod tests { let opt = RawDBOptions::default(); opt.enable_unordered_write(false); opt.enable_pipelined_write(false); - opt.enable_pipelined_commit(true); let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), RocksDBOptions::from_raw(opt), diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index bb308efd054..eaf99506f4b 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2862,6 +2862,8 @@ pub mod tests { s.write_all(&recv_remain).unwrap(); s.save().unwrap(); + let snap_size = snap_mgr.get_total_snap_size().unwrap(); + let max_snap_count = (max_total_size + snap_size - 1) / snap_size; for (i, region_id) in regions.into_iter().enumerate() { let key = SnapKey::new(region_id, 1, 1); let region = gen_test_region(region_id, 1, 1); @@ -2878,9 +2880,6 @@ pub mod tests { ) .unwrap(); - // TODO: this size may change in different RocksDB version. - let snap_size = 1660; - let max_snap_count = (max_total_size + snap_size - 1) / snap_size; // The first snap_size is for region 100. // That snapshot won't be deleted because it's not for generating. assert_eq!( diff --git a/etc/config-template.toml b/etc/config-template.toml index 2195e681f62..b63fe2ce235 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -803,6 +803,56 @@ ## for the same CF. 
# compaction-guard-max-output-file-size = "128M" +## Available versions: +## +## 0 -- This version can be read by all TiKV releases. Doesn't support changing +## checksum type (default is CRC32). +## +## 1 -- Can be read by all TiKV releases. Supports non-default checksum, like +## xxHash. It is written by RocksDB when BlockBasedTableOptions::checksum is +## something other than kCRC32c. (version 0 is silently upconverted) +## +## 2 -- Can be read by all TiKV releases. Changes the way we encode compressed +## blocks with LZ4, BZip2 and Zlib compression. +## +## 3 -- Can be read by TiKV's versions since 2.1. Changes the way we encode the +## keys in index blocks. +## This option only affects newly written tables. When reading existing tables, +## the information about version is read from the footer. +## +## 4 -- Can be read by TiKV's versions since 3.0. Changes the way we encode the +## values in index blocks. +## This option only affects newly written tables. When reading existing tables, +## the information about version is read from the footer. +## +# format-version = 2 + +## If enabled, prepopulate warm/hot blocks (data, uncompressed dict, index and +## filter blocks) which are already in memory into block cache at the time of +## flush. On a flush, the block that is in memory (in memtables) get flushed +## to the device. If using Direct IO, additional IO is incurred to read this +## data back into memory again, which is avoided by enabling this option. This +## further helps if the workload exhibits high temporal locality, where most +## of the reads go to recently written data. This also helps in case of +## Distributed FileSystem. +## +## disabled: kDisabled +## flush-only: kFlushOnly +## +# prepopulate-block-cache = "disabled" + +## Use the specified checksum type. Newly created table files will be +## protected with this checksum type. Old table files will still be readable, +## even though they have different checksum type. 
+## +## no: kNoChecksum +## crc32c: kCRC32c +## xxhash: kxxHash +## xxhash64: kxxHash64 +## xxh3: kXXH3 (supported since TiKV 6.2) +## +# checksum = "crc32c" + ## Options for "Default" Column Family for `Titan`. [rocksdb.defaultcf.titan] ## The smallest value to store in blob files. Value smaller than @@ -887,6 +937,9 @@ # enable-compaction-guard = true # compaction-guard-min-output-file-size = "8M" # compaction-guard-max-output-file-size = "128M" +# format-version = 2 +# prepopulate-block-cache = "disabled" +# checksum = "crc32c" [rocksdb.lockcf] # compression-per-level = ["no", "no", "no", "no", "no", "no", "no"] @@ -908,6 +961,9 @@ # dynamic-level-bytes = true # optimize-filters-for-hits = false # enable-compaction-guard = false +# format-version = 2 +# prepopulate-block-cache = "disabled" +# checksum = "crc32c" [raftdb] # max-background-jobs = 4 @@ -967,6 +1023,9 @@ # dynamic-level-bytes = true # optimize-filters-for-hits = true # enable-compaction-guard = false +# format-version = 2 +# prepopulate-block-cache = "disabled" +# checksum = "crc32c" [raft-engine] ## Determines whether to use Raft Engine to store raft logs. 
When it is diff --git a/src/config.rs b/src/config.rs index 9e0abe37c94..239c80a62ab 100644 --- a/src/config.rs +++ b/src/config.rs @@ -25,9 +25,9 @@ use engine_rocks::{ get_env, properties::MvccPropertiesCollectorFactory, raw::{ - BlockBasedOptions, Cache, ColumnFamilyOptions, CompactionPriority, DBCompactionStyle, - DBCompressionType, DBOptions, DBRateLimiterMode, DBRecoveryMode, Env, LRUCacheOptions, - TitanDBOptions, + BlockBasedOptions, Cache, ChecksumType, ColumnFamilyOptions, CompactionPriority, + DBCompactionStyle, DBCompressionType, DBOptions, DBRateLimiterMode, DBRecoveryMode, Env, + LRUCacheOptions, PrepopulateBlockCache, TitanDBOptions, }, raw_util::CFOptions, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, @@ -129,8 +129,11 @@ pub struct TitanCfConfig { pub max_gc_batch_size: ReadableSize, #[online_config(skip)] pub discardable_ratio: f64, + // deprecated. #[online_config(skip)] - pub sample_ratio: f64, + #[doc(hidden)] + #[serde(skip_serializing)] + pub sample_ratio: Option, #[online_config(skip)] pub merge_small_file_threshold: ReadableSize, pub blob_run_mode: BlobRunMode, @@ -156,7 +159,7 @@ impl Default for TitanCfConfig { min_gc_batch_size: ReadableSize::mb(16), max_gc_batch_size: ReadableSize::mb(64), discardable_ratio: 0.5, - sample_ratio: 0.1, + sample_ratio: None, merge_small_file_threshold: ReadableSize::mb(8), blob_run_mode: BlobRunMode::Normal, level_merge: false, @@ -176,7 +179,6 @@ impl TitanCfConfig { opts.set_min_gc_batch_size(self.min_gc_batch_size.0 as u64); opts.set_max_gc_batch_size(self.max_gc_batch_size.0 as u64); opts.set_discardable_ratio(self.discardable_ratio); - opts.set_sample_ratio(self.sample_ratio); opts.set_merge_small_file_threshold(self.merge_small_file_threshold.0 as u64); opts.set_blob_run_mode(self.blob_run_mode.into()); opts.set_level_merge(self.level_merge); @@ -197,6 +199,9 @@ impl TitanCfConfig { .into(), ); } + if self.sample_ratio.is_some() { + warn!("sample-ratio is 
deprecated. Ignoring the value."); + } Ok(()) } } @@ -335,6 +340,14 @@ macro_rules! cf_config { pub bottommost_zstd_compression_dict_size: i32, #[online_config(skip)] pub bottommost_zstd_compression_sample_size: i32, + #[serde(with = "rocks_config::prepopulate_block_cache_serde")] + #[online_config(skip)] + pub prepopulate_block_cache: PrepopulateBlockCache, + #[online_config(skip)] + pub format_version: u32, + #[serde(with = "rocks_config::checksum_serde")] + #[online_config(skip)] + pub checksum: ChecksumType, #[online_config(submodule)] pub titan: TitanCfConfig, } @@ -350,6 +363,10 @@ macro_rules! cf_config { ) .into()); } + if self.format_version > 5 { + // TODO: allow version 5 if we have another LTS capable of reading it? + return Err("format-version larger than 5 is unsupported".into()); + } self.titan.validate()?; Ok(()) } @@ -477,9 +494,6 @@ macro_rules! write_into_metrics { $metrics .with_label_values(&[$tag, "titan_discardable_ratio"]) .set($cf.titan.discardable_ratio); - $metrics - .with_label_values(&[$tag, "titan_sample_ratio"]) - .set($cf.titan.sample_ratio); $metrics .with_label_values(&[$tag, "titan_merge_small_file_threshold"]) .set($cf.titan.merge_small_file_threshold.0 as f64); @@ -503,12 +517,15 @@ macro_rules! 
build_cf_opt { .set_pin_l0_filter_and_index_blocks_in_cache($opt.pin_l0_filter_and_index_blocks); if $opt.use_bloom_filter { block_base_opts.set_bloom_filter( - $opt.bloom_filter_bits_per_key, + $opt.bloom_filter_bits_per_key as f64, $opt.block_based_bloom_filter, ); block_base_opts.set_whole_key_filtering($opt.whole_key_filtering); } block_base_opts.set_read_amp_bytes_per_bit($opt.read_amp_bytes_per_bit); + block_base_opts.set_prepopulate_block_cache($opt.prepopulate_block_cache); + block_base_opts.set_format_version($opt.format_version); + block_base_opts.set_checksum($opt.checksum); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_block_based_table_factory(&block_base_opts); cf_opts.set_num_levels($opt.num_levels); @@ -524,6 +541,7 @@ macro_rules! build_cf_opt { 0, /* strategy */ $opt.bottommost_zstd_compression_dict_size, $opt.bottommost_zstd_compression_sample_size, + 1, /* parallel_threads */ ); cf_opts.set_write_buffer_size($opt.write_buffer_size.0); cf_opts.set_max_write_buffer_number($opt.max_write_buffer_number); @@ -629,10 +647,13 @@ impl Default for DefaultCfConfig { enable_compaction_guard: true, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), - titan: TitanCfConfig::default(), bottommost_level_compression: DBCompressionType::Zstd, bottommost_zstd_compression_dict_size: 0, bottommost_zstd_compression_sample_size: 0, + prepopulate_block_cache: PrepopulateBlockCache::Disabled, + format_version: 2, + checksum: ChecksumType::CRC32c, + titan: TitanCfConfig::default(), } } } @@ -738,10 +759,13 @@ impl Default for WriteCfConfig { enable_compaction_guard: true, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), - titan, bottommost_level_compression: DBCompressionType::Zstd, bottommost_zstd_compression_dict_size: 0, bottommost_zstd_compression_sample_size: 0, + prepopulate_block_cache: 
PrepopulateBlockCache::Disabled, + format_version: 2, + checksum: ChecksumType::CRC32c, + titan, } } } @@ -833,10 +857,13 @@ impl Default for LockCfConfig { enable_compaction_guard: false, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), - titan, bottommost_level_compression: DBCompressionType::Disable, bottommost_zstd_compression_dict_size: 0, bottommost_zstd_compression_sample_size: 0, + prepopulate_block_cache: PrepopulateBlockCache::Disabled, + format_version: 2, + checksum: ChecksumType::CRC32c, + titan, } } } @@ -906,10 +933,13 @@ impl Default for RaftCfConfig { enable_compaction_guard: false, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), - titan, bottommost_level_compression: DBCompressionType::Disable, bottommost_zstd_compression_dict_size: 0, bottommost_zstd_compression_sample_size: 0, + prepopulate_block_cache: PrepopulateBlockCache::Disabled, + format_version: 2, + checksum: ChecksumType::CRC32c, + titan, } } } @@ -1141,8 +1171,6 @@ impl DbConfig { self.use_direct_io_for_flush_and_compaction, ); opts.enable_pipelined_write(self.enable_pipelined_write); - let enable_pipelined_commit = !self.enable_pipelined_write && !self.enable_unordered_write; - opts.enable_pipelined_commit(enable_pipelined_commit); opts.enable_unordered_write(self.enable_unordered_write); opts.set_info_log(RocksdbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); @@ -1269,10 +1297,13 @@ impl Default for RaftDefaultCfConfig { enable_compaction_guard: false, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), - titan: TitanCfConfig::default(), bottommost_level_compression: DBCompressionType::Disable, bottommost_zstd_compression_dict_size: 0, bottommost_zstd_compression_sample_size: 0, + prepopulate_block_cache: PrepopulateBlockCache::Disabled, + 
format_version: 2, + checksum: ChecksumType::CRC32c, + titan: TitanCfConfig::default(), } } } @@ -3398,6 +3429,15 @@ impl TiKvConfig { } } + if last_cfg.raftdb.defaultcf.format_version > 5 + || last_cfg.rocksdb.defaultcf.format_version > 5 + || last_cfg.rocksdb.writecf.format_version > 5 + || last_cfg.rocksdb.lockcf.format_version > 5 + || last_cfg.rocksdb.raftcf.format_version > 5 + { + return Err("format_version larger than 5 is unsupported".into()); + } + Ok(()) } diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index 0eea3b77131..4e2bc6e76de 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -64,17 +64,16 @@ mod tests { region: Region, safe_point: impl Into, need_gc: bool, - ) -> Option { + ) -> MvccProperties { let safe_point = safe_point.into(); let start = keys::data_key(region.get_start_key()); let end = keys::data_end_key(region.get_end_key()); let props = db .c() - .get_mvcc_properties_cf(CF_WRITE, safe_point, &start, &end); - if let Some(props) = props.as_ref() { - assert_eq!(check_need_gc(safe_point, 1.0, props), need_gc); - } + .get_mvcc_properties_cf(CF_WRITE, safe_point, &start, &end) + .unwrap(); + assert_eq!(check_need_gc(safe_point, 1.0, &props), need_gc); props } @@ -86,48 +85,22 @@ mod tests { .unwrap(); let path = path.path().to_str().unwrap(); let region = make_region(1, vec![0], vec![10]); - test_without_properties(path, ®ion); test_with_properties(path, ®ion); } - fn test_without_properties(path: &str, region: &Region) { - let db = open_db(path, false); + fn test_with_properties(path: &str, region: &Region) { + let db = open_db(path, true); let mut engine = RegionEngine::new(&db, region); // Put 2 keys. engine.put(&[1], 1, 1); engine.put(&[4], 2, 2); - assert!( - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true).is_none() - ); - engine.flush(); - // After this flush, we have a SST file without properties. - // Without properties, we always need GC. 
- assert!( - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true).is_none() - ); - } - - fn test_with_properties(path: &str, region: &Region) { - let db = open_db(path, true); - let mut engine = RegionEngine::new(&db, region); - // Put 2 keys. engine.put(&[2], 3, 3); engine.put(&[3], 4, 4); engine.flush(); - // After this flush, we have a SST file w/ properties, plus the SST - // file w/o properties from previous flush. We always need GC as - // long as we can't get properties from any SST files. - assert!( - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true).is_none() - ); engine.compact(); - // After this compact, the two SST files are compacted into a new - // SST file with properties. Now all SST files have properties and - // all keys have only one version, so we don't need gc. - let props = - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false).unwrap(); + let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 4.into()); assert_eq!(props.num_rows, 4); @@ -143,8 +116,7 @@ mod tests { engine.flush(); // After this flush, keys 5,6 in the new SST file have more than one // versions, so we need gc. - let props = - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true).unwrap(); + let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 8.into()); assert_eq!(props.num_rows, 6); @@ -152,8 +124,7 @@ mod tests { assert_eq!(props.num_versions, 8); assert_eq!(props.max_row_versions, 2); // But if the `safe_point` is older than all versions, we don't need gc too. 
- let props = - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 0, false).unwrap(); + let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 0, false); assert_eq!(props.min_ts, TimeStamp::max()); assert_eq!(props.max_ts, TimeStamp::zero()); assert_eq!(props.num_rows, 0); @@ -167,8 +138,7 @@ mod tests { engine.compact(); // After this compact, all versions of keys 5,6 are deleted, // no keys have more than one versions, so we don't need gc. - let props = - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false).unwrap(); + let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 4.into()); assert_eq!(props.num_rows, 4); @@ -179,8 +149,7 @@ mod tests { // A single lock version need gc. engine.lock(&[7], 9, 9); engine.flush(); - let props = - get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true).unwrap(); + let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 9.into()); assert_eq!(props.num_rows, 5); diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 6d171bcae28..ccadcca674f 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -52,6 +52,10 @@ fn assert_same_files(mut files1: Vec, mut files2: Vec Date: Wed, 13 Jul 2022 10:59:05 +0800 Subject: [PATCH 071/676] raftstore: fix the building warning caused by the feature cfg (#13004) ref tikv/tikv#12063 Fix the building warning caused by the feature cfg. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- .../raftstore/src/store/worker/split_controller.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 338158c7505..2964796e4b2 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -8,7 +8,6 @@ use std::{ time::{Duration, SystemTime}, }; -use fail::fail_point; use kvproto::{ kvrpcpb::KeyRange, metapb::{self, Peer}, @@ -619,7 +618,7 @@ impl AutoSplitController { fn is_grpc_poll_busy(&self, grpc_thread_usage: f64) -> bool { #[cfg(feature = "failpoints")] - fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); + fail::fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); if self.max_grpc_thread_count == 0 { return false; } @@ -630,7 +629,7 @@ impl AutoSplitController { fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { #[cfg(feature = "failpoints")] - fail_point!("mock_unified_read_pool_is_busy", |_| { true }); + fail::fail_point!("mock_unified_read_pool_is_busy", |_| { true }); if self.max_unified_read_pool_thread_count == 0 { return false; } @@ -645,7 +644,7 @@ impl AutoSplitController { fn is_region_busy(&self, unified_read_pool_thread_usage: f64, region_cpu_usage: f64) -> bool { #[cfg(feature = "failpoints")] - fail_point!("mock_region_is_busy", |_| { true }); + fail::fail_point!("mock_region_is_busy", |_| { true }); if unified_read_pool_thread_usage <= 0.0 || !self.should_check_region_cpu() { return false; } @@ -798,9 +797,8 @@ impl AutoSplitController { .with_label_values(&["read"]) .observe(qps as f64); - // 1. If the QPS and Byte do not meet the threshold, skip. - // 2. If the Unified Read Pool is not busy or - // the Region is not hot enough (takes up 50% of the Unified Read Pool CPU times), skip. + // 1. 
If the QPS or the byte does not meet the threshold, skip. + // 2. If the Unified Read Pool or the region is not hot enough, skip. if qps < self.cfg.qps_threshold && byte < self.cfg.byte_threshold && (!is_unified_read_pool_busy || !is_region_busy) From 46d999db06ccef233d8cc38c4a5931d9829b0dd5 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 13 Jul 2022 11:27:05 +0800 Subject: [PATCH 072/676] raftstore: Implement coprocessor observer pre_exec_admin(query) (#12868) ref tikv/tikv#12849 Support new observers pre_exec_admin(query). Signed-off-by: CalvinNeo --- .../raftstore/src/coprocessor/dispatcher.rs | 47 +++- components/raftstore/src/coprocessor/mod.rs | 10 + components/raftstore/src/store/fsm/apply.rs | 212 +++++++++++++++--- 3 files changed, 240 insertions(+), 29 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 3f51dd918c6..24b79bf4877 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -416,6 +416,29 @@ impl CoprocessorHost { } } + pub fn pre_exec(&self, region: &Region, cmd: &RaftCmdRequest) -> bool { + let mut ctx = ObserverContext::new(region); + if !cmd.has_admin_request() { + let query = cmd.get_requests(); + for observer in &self.registry.query_observers { + let observer = observer.observer.inner(); + if observer.pre_exec_query(&mut ctx, query) { + return true; + } + } + false + } else { + let admin = cmd.get_admin_request(); + for observer in &self.registry.admin_observers { + let observer = observer.observer.inner(); + if observer.pre_exec_admin(&mut ctx, admin) { + return true; + } + } + false + } + } + pub fn post_apply_plain_kvs_from_snapshot( &self, region: &Region, @@ -608,6 +631,12 @@ mod tests { self.called.fetch_add(3, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_exec_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { + 
self.called.fetch_add(16, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } } impl QueryObserver for TestCoprocessor { @@ -634,6 +663,12 @@ mod tests { ctx.bypass = self.bypass.load(Ordering::SeqCst); } + fn pre_exec_query(&self, ctx: &mut ObserverContext<'_>, _: &[Request]) -> bool { + self.called.fetch_add(15, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } + fn on_empty_cmd(&self, ctx: &mut ObserverContext<'_>, _index: u64, _term: u64) { self.called.fetch_add(14, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); @@ -767,7 +802,17 @@ mod tests { let mut empty_req = RaftCmdRequest::default(); empty_req.set_requests(vec![Request::default()].into()); host.on_empty_cmd(®ion, 0, 0); - assert_all!([&ob.called], &[88]); + assert_all!([&ob.called], &[88]); // 14 + + let mut query_req = RaftCmdRequest::default(); + query_req.set_requests(vec![Request::default()].into()); + host.pre_exec(®ion, &query_req); + assert_all!([&ob.called], &[103]); // 15 + + let mut admin_req = RaftCmdRequest::default(); + admin_req.set_admin_request(AdminRequest::default()); + host.pre_exec(®ion, &admin_req); + assert_all!([&ob.called], &[119]); // 16 } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 39b412ce950..2dc83c8d7af 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -86,6 +86,11 @@ pub trait AdminObserver: Coprocessor { /// Hook to call after applying admin request. /// For now, the `region` in `ObserverContext` is an empty region. fn post_apply_admin(&self, _: &mut ObserverContext<'_>, _: &AdminResponse) {} + + /// Hook before exec admin request, returns whether we should skip this admin. 
+ fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { + false + } } pub trait QueryObserver: Coprocessor { @@ -105,6 +110,11 @@ pub trait QueryObserver: Coprocessor { /// Hook to call after applying write request. /// For now, the `region` in `ObserverContext` is an empty region. fn post_apply_query(&self, _: &mut ObserverContext<'_>, _: &Cmd) {} + + /// Hook before exec write request, returns whether we should skip this write. + fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request]) -> bool { + false + } } pub trait ApplySnapshotObserver: Coprocessor { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index dfafcac338f..9c2e548f10e 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1242,38 +1242,50 @@ where // if pending remove, apply should be aborted already. assert!(!self.pending_remove); - ctx.exec_log_index = index; - ctx.exec_log_term = term; - ctx.kv_wb_mut().set_save_point(); - let mut origin_epoch = None; // Remember if the raft cmd fails to be applied, it must have no side effects. // E.g. `RaftApplyState` must not be changed. - let (resp, exec_result) = match self.exec_raft_cmd(ctx, req) { - Ok(a) => { - ctx.kv_wb_mut().pop_save_point().unwrap(); - if req.has_admin_request() { - origin_epoch = Some(self.region.get_region_epoch().clone()); - } - a + + let mut origin_epoch = None; + let (resp, exec_result) = if ctx.host.pre_exec(&self.region, req) { + // One of the observers want to filter execution of the command. + let mut resp = RaftCmdResponse::default(); + if !req.get_header().get_uuid().is_empty() { + let uuid = req.get_header().get_uuid().to_vec(); + resp.mut_header().set_uuid(uuid); } - Err(e) => { - // clear dirty values. - ctx.kv_wb_mut().rollback_to_save_point().unwrap(); - match e { - Error::EpochNotMatch(..) 
=> debug!( - "epoch not match"; - "region_id" => self.region_id(), - "peer_id" => self.id(), - "err" => ?e - ), - _ => error!(?e; - "execute raft command"; - "region_id" => self.region_id(), - "peer_id" => self.id(), - ), + (resp, ApplyResult::None) + } else { + ctx.exec_log_index = index; + ctx.exec_log_term = term; + ctx.kv_wb_mut().set_save_point(); + let (resp, exec_result) = match self.exec_raft_cmd(ctx, req) { + Ok(a) => { + ctx.kv_wb_mut().pop_save_point().unwrap(); + if req.has_admin_request() { + origin_epoch = Some(self.region.get_region_epoch().clone()); + } + a } - (cmd_resp::new_error(e), ApplyResult::None) - } + Err(e) => { + // clear dirty values. + ctx.kv_wb_mut().rollback_to_save_point().unwrap(); + match e { + Error::EpochNotMatch(..) => debug!( + "epoch not match"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), + _ => error!(?e; + "execute raft command"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + ), + } + (cmd_resp::new_error(e), ApplyResult::None) + } + }; + (resp, exec_result) }; if let ApplyResult::WaitMergeSource(_) = exec_result { return (resp, exec_result); @@ -4846,6 +4858,23 @@ mod tests { self } + fn compact_log(mut self, index: u64, term: u64) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::CompactLog); + req.mut_compact_log().set_compact_index(index); + req.mut_compact_log().set_compact_term(term); + self.req.set_admin_request(req); + self + } + + fn compute_hash(mut self, context: Vec) -> EntryBuilder { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ComputeHash); + req.mut_compute_hash().set_context(context); + self.req.set_admin_request(req); + self + } + fn build(mut self) -> Entry { self.entry .set_data(self.req.write_to_bytes().unwrap().into()); @@ -4858,6 +4887,8 @@ mod tests { pre_query_count: Arc, post_query_count: Arc, cmd_sink: Option>>>, + filter_compact_log: Arc, + filter_consistency_check: Arc, } impl 
Coprocessor for ApplyObserver {} @@ -4872,6 +4903,23 @@ mod tests { } } + impl AdminObserver for ApplyObserver { + fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, req: &AdminRequest) -> bool { + let cmd_type = req.get_cmd_type(); + if cmd_type == AdminCmdType::CompactLog + && self.filter_compact_log.deref().load(Ordering::SeqCst) + { + return true; + }; + if (cmd_type == AdminCmdType::ComputeHash || cmd_type == AdminCmdType::VerifyHash) + && self.filter_consistency_check.deref().load(Ordering::SeqCst) + { + return true; + }; + false + } + } + impl CmdObserver for ApplyObserver where E: KvEngine, @@ -5506,6 +5554,114 @@ mod tests { }); } + #[test] + fn test_exec_observer() { + let (_path, engine) = create_tmp_engine("test-exec-observer"); + let (_import_dir, importer) = create_tmp_importer("test-exec-observer"); + let mut host = CoprocessorHost::::default(); + let obs = ApplyObserver::default(); + host.registry + .register_admin_observer(1, BoxAdminObserver::new(obs.clone())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Config::default(); + let (router, mut system) = create_apply_batch_system(&cfg); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-exec-observer".to_owned(), + cfg: Arc::new(VersionTrack::new(cfg)), + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-exec-observer".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(1, peer_id)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let mut index_id = 
1; + let put_entry = EntryBuilder::new(1, 1) + .put(b"k1", b"v1") + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![put_entry], vec![]))); + fetch_apply_res(&rx); + + index_id += 1; + let compact_entry = EntryBuilder::new(index_id, 1) + .compact_log(index_id - 1, 2) + .epoch(1, 3) + .build(); + // Filter CompactLog + obs.filter_compact_log.store(true, Ordering::SeqCst); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compact_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_index_term, 1); + // Executing CompactLog is filtered and takes no effect. + assert_eq!(apply_res.exec_res.len(), 0); + assert_eq!(apply_res.apply_state.get_truncated_state().get_index(), 0); + + index_id += 1; + // Don't filter CompactLog + obs.filter_compact_log.store(false, Ordering::SeqCst); + let compact_entry = EntryBuilder::new(index_id, 1) + .compact_log(index_id - 1, 2) + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compact_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_index_term, 1); + // We can get exec result of CompactLog. + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!( + apply_res.apply_state.get_truncated_state().get_index(), + index_id - 1 + ); + + index_id += 1; + obs.filter_consistency_check.store(true, Ordering::SeqCst); + let compute_hash_entry = EntryBuilder::new(index_id, 1).compute_hash(vec![]).build(); + router.schedule_task( + 1, + Msg::apply(apply(peer_id, 1, 1, vec![compute_hash_entry], vec![])), + ); + let apply_res = fetch_apply_res(&rx); + // applied_index can still be advanced. 
+ assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_index_term, 1); + // We can't get exec result of ComputeHash. + assert_eq!(apply_res.exec_res.len(), 0); + obs.filter_consistency_check.store(false, Ordering::SeqCst); + + system.shutdown(); + } + #[test] fn test_cmd_observer() { let (_path, engine) = create_tmp_engine("test-delegate"); From 4f9a52872e57b210b389dda876d633e9f522aa47 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 13 Jul 2022 12:35:06 +0800 Subject: [PATCH 073/676] raftstore: make the gRPC poll busy check to consider the average value (#13005) ref tikv/tikv#12063 Make the gRPC poll busy check to consider the average value to make sure the check is accurate. Signed-off-by: JmPotato --- .../src/store/worker/split_config.rs | 2 +- .../src/store/worker/split_controller.rs | 101 +++++++++++++++++- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 58df082c3e6..4fe00fff448 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -21,7 +21,7 @@ const DEFAULT_SPLIT_CONTAINED_SCORE: f64 = 0.5; // If the `split_balance_score` and `split_contained_score` above could not be satisfied, we will try to split the region according to its CPU load, // then these parameters below will start to work. 
-// When the gRPC poll thread CPU usage is higher than gRPC poll thread count * `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, +// When the gRPC poll thread CPU usage (over the past `detect_times` seconds by default) is higher than gRPC poll thread count * `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, // the CPU-based split won't be triggered no matter if the `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO` and `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` are exceeded // to prevent from increasing the gRPC poll CPU usage. const DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.5; diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 2964796e4b2..3724e21c515 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -589,6 +589,7 @@ pub struct AutoSplitController { max_grpc_thread_count: usize, max_unified_read_pool_thread_count: usize, unified_read_pool_scale_receiver: Option>, + grpc_thread_usage_vec: Vec, } impl AutoSplitController { @@ -605,6 +606,7 @@ impl AutoSplitController { max_grpc_thread_count, max_unified_read_pool_thread_count, unified_read_pool_scale_receiver, + grpc_thread_usage_vec: vec![], } } @@ -612,19 +614,40 @@ impl AutoSplitController { AutoSplitController::new(SplitConfigManager::default(), 0, 0, None) } + fn update_grpc_thread_usage(&mut self, grpc_thread_usage: f64) { + self.grpc_thread_usage_vec.push(grpc_thread_usage); + let length = self.grpc_thread_usage_vec.len(); + let detect_times = self.cfg.detect_times as usize; + // Only keep the last `self.cfg.detect_times` elements. 
+ if length > detect_times { + self.grpc_thread_usage_vec.drain(..length - detect_times); + } + } + + fn get_avg_grpc_thread_usage(&self) -> f64 { + let length = self.grpc_thread_usage_vec.len(); + if length == 0 { + return 0.0; + } + let sum = self.grpc_thread_usage_vec.iter().sum::(); + sum / length as f64 + } + fn should_check_region_cpu(&self) -> bool { self.cfg.region_cpu_overload_threshold_ratio > 0.0 } - fn is_grpc_poll_busy(&self, grpc_thread_usage: f64) -> bool { + fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { #[cfg(feature = "failpoints")] fail::fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); if self.max_grpc_thread_count == 0 { return false; } - let grpc_thread_cpu_overload_threshold = - self.max_grpc_thread_count as f64 * self.cfg.grpc_thread_cpu_overload_threshold_ratio; - grpc_thread_usage > 0.0 && grpc_thread_usage >= grpc_thread_cpu_overload_threshold + if self.cfg.grpc_thread_cpu_overload_threshold_ratio <= 0.0 { + return true; + } + avg_grpc_thread_usage + >= self.max_grpc_thread_count as f64 * self.cfg.grpc_thread_cpu_overload_threshold_ratio } fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { @@ -756,13 +779,17 @@ impl AutoSplitController { Self::collect_thread_usage(thread_stats, "grpc-server"), Self::collect_thread_usage(thread_stats, "unified-read-po"), ); + // Update first before calculating the latest average gRPC poll CPU usage. 
+ self.update_grpc_thread_usage(grpc_thread_usage); + let avg_grpc_thread_usage = self.get_avg_grpc_thread_usage(); let (is_grpc_poll_busy, is_unified_read_pool_busy) = ( - self.is_grpc_poll_busy(grpc_thread_usage), + self.is_grpc_poll_busy(avg_grpc_thread_usage), self.is_unified_read_pool_busy(unified_read_pool_thread_usage), ); debug!("flush to load base split"; "max_grpc_thread_count" => self.max_grpc_thread_count, "grpc_thread_usage" => grpc_thread_usage, + "avg_grpc_thread_usage" => avg_grpc_thread_usage, "max_unified_read_pool_thread_count" => self.max_unified_read_pool_thread_count, "unified_read_pool_thread_usage" => unified_read_pool_thread_usage, "is_grpc_poll_busy" => is_grpc_poll_busy, @@ -1799,6 +1826,70 @@ mod tests { } } + #[test] + fn test_avg_grpc_thread_cpu_usage_calculation() { + let mut auto_split_controller = AutoSplitController::default(); + let detect_times = auto_split_controller.cfg.detect_times as f64; + for grpc_thread_usage in 1..=5 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0].iter().sum::() / 5.0, + ); + for grpc_thread_usage in 6..=10 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + .iter() + .sum::() + / detect_times, + ); + for grpc_thread_usage in 11..=15 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0] + .iter() + .sum::() + / detect_times, + ); + for grpc_thread_usage in 1..=10 { + auto_split_controller.update_grpc_thread_usage(grpc_thread_usage as f64); + } + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + .iter() + 
.sum::() + / detect_times, + ); + // Change the `detect_times` to a smaller value. + auto_split_controller.cfg.detect_times = 5; + let detect_times = auto_split_controller.cfg.detect_times as f64; + auto_split_controller.update_grpc_thread_usage(11.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [7.0, 8.0, 9.0, 10.0, 11.0].iter().sum::() / detect_times, + ); + // Change the `detect_times` to a bigger value. + auto_split_controller.cfg.detect_times = 6; + let detect_times = auto_split_controller.cfg.detect_times as f64; + auto_split_controller.update_grpc_thread_usage(12.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [7.0, 8.0, 9.0, 10.0, 11.0, 12.0].iter().sum::() / detect_times, + ); + auto_split_controller.update_grpc_thread_usage(13.0); + assert_eq!( + auto_split_controller.get_avg_grpc_thread_usage(), + [8.0, 9.0, 10.0, 11.0, 12.0, 13.0].iter().sum::() / detect_times, + ); + } + #[bench] fn samples_evaluate(b: &mut test::Bencher) { let mut samples = Samples(vec![Sample::new(b"c")]); From 08d2407efd80278b06bbeca7442738383378b8db Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Wed, 13 Jul 2022 12:49:05 +0800 Subject: [PATCH 074/676] raftstore: change send proposal time to waterfall metrics (#12993) ref tikv/tikv#12362 Durations related to a single query are recorded as waterfall metrics, which means it records the duration from the very beginning to the instant when the event happens. Previously, proposal_send_wait_nanos was an exception. So, this commit makes this consistent with other metrics. This commit also adds a Grafana Raft waterfall panel for it. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- .../raftstore/src/store/local_metrics.rs | 6 +- components/raftstore/src/store/metrics.rs | 12 +- components/raftstore/src/store/peer.rs | 17 +- components/tracker/src/lib.rs | 12 +- metrics/grafana/tikv_details.json | 436 +++++++++--------- 5 files changed, 231 insertions(+), 252 deletions(-) diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 304259c4571..923fb8ffc26 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -413,10 +413,10 @@ pub struct RaftMetrics { pub waterfall_metrics: bool, pub wf_batch_wait: LocalHistogram, pub wf_send_to_queue: LocalHistogram, + pub wf_send_proposal: LocalHistogram, pub wf_persist_log: LocalHistogram, pub wf_commit_log: LocalHistogram, pub wf_commit_not_persist_log: LocalHistogram, - pub proposal_send_wait: LocalHistogram, pub raft_log_gc_skipped: RaftLogGcSkippedMetrics, } @@ -438,10 +438,10 @@ impl RaftMetrics { waterfall_metrics, wf_batch_wait: STORE_WF_BATCH_WAIT_DURATION_HISTOGRAM.local(), wf_send_to_queue: STORE_WF_SEND_TO_QUEUE_DURATION_HISTOGRAM.local(), + wf_send_proposal: STORE_WF_SEND_PROPOSAL_DURATION_HISTOGRAM.local(), wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - proposal_send_wait: PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM.local(), raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(), } } @@ -461,10 +461,10 @@ impl RaftMetrics { if self.waterfall_metrics { self.wf_batch_wait.flush(); self.wf_send_to_queue.flush(); + self.wf_send_proposal.flush(); self.wf_persist_log.flush(); self.wf_commit_log.flush(); self.wf_commit_not_persist_log.flush(); - self.proposal_send_wait.flush(); } let mut missing = self.leader_missing.lock().unwrap(); 
LEADER_MISSING.set(missing.len() as i64); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index c60152784a5..c4a1c22d800 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -324,6 +324,12 @@ lazy_static! { "Bucketed histogram of proposals' send to write queue duration.", exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); + pub static ref STORE_WF_SEND_PROPOSAL_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_store_wf_send_proposal_duration_seconds", + "Bucketed histogram of proposals' waterfall send duration", + exponential_buckets(1e-6, 2.0, 26).unwrap() + ).unwrap(); pub static ref STORE_WF_BEFORE_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_store_wf_before_write_duration_seconds", @@ -360,12 +366,6 @@ lazy_static! { "Bucketed histogram of proposals' commit but not persist duration", exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); - pub static ref PROPOSAL_SEND_WAIT_DURATION_HISTOGRAM: Histogram = - register_histogram!( - "tikv_raftstore_proposal_send_wait_duration_seconds", - "Bucketed histogram of proposals' send wait duration", - exponential_buckets(1e-6, 2.0, 26).unwrap() - ).unwrap(); pub static ref PEER_PROPOSAL_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 5897309f0b2..489db8b9600 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -65,7 +65,6 @@ use tikv_util::{ Either, }; use time::Timespec; -use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use uuid::Uuid; @@ -1576,6 +1575,7 @@ where msgs: Vec, ) { let mut now = None; + let std_now = Instant::now(); for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgSnapshot { @@ -1619,17 +1619,10 @@ where 
.binary_search_by_key(&index, |p: &Proposal<_>| p.index) { let proposal = &self.proposals.queue[idx]; - if term == proposal.term - && let Some(propose_time) = proposal.propose_time - && let Ok(dur) = ((*now.get_or_insert(monotonic_raw_now())) - propose_time).to_std() { - ctx.raft_metrics - .proposal_send_wait - .observe(dur.as_secs_f64()); - for t in proposal.cb.get_trackers().iter().flat_map(|v| v.iter().flat_map(|t| t.as_tracker_token())) { - GLOBAL_TRACKERS.with_tracker(t, |trakcer| { - if trakcer.metrics.propose_send_wait_nanos == 0 { - trakcer.metrics.propose_send_wait_nanos = dur.as_nanos() as u64; - } + if term == proposal.term { + for tracker in proposal.cb.get_trackers().iter().flat_map(|v| v.iter()) { + tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos }); } } diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 7e1aab80882..dbefbbe770c 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -43,7 +43,11 @@ impl Tracker { pub fn write_write_detail(&self, detail: &mut pb::WriteDetail) { detail.set_store_batch_wait_nanos(self.metrics.wf_batch_wait_nanos); - detail.set_propose_send_wait_nanos(self.metrics.propose_send_wait_nanos); + detail.set_propose_send_wait_nanos( + self.metrics + .wf_send_proposal_nanos + .saturating_sub(self.metrics.wf_send_to_queue_nanos), + ); detail.set_persist_log_nanos( self.metrics.wf_persist_log_nanos - self.metrics.wf_send_to_queue_nanos, ); @@ -56,9 +60,7 @@ impl Tracker { // And note that the time before flushing the raft message to the RPC channel is // also counted in this value (to be improved in the future). 
detail.set_commit_log_nanos( - self.metrics.wf_commit_log_nanos - - self.metrics.wf_batch_wait_nanos - - self.metrics.propose_send_wait_nanos, + self.metrics.wf_commit_log_nanos - self.metrics.wf_batch_wait_nanos, ); detail.set_apply_batch_wait_nanos(self.metrics.apply_wait_nanos); detail.set_apply_log_nanos(self.metrics.apply_time_nanos - self.metrics.apply_wait_nanos); @@ -128,12 +130,12 @@ pub struct RequestMetrics { pub write_instant: Option, pub wf_batch_wait_nanos: u64, pub wf_send_to_queue_nanos: u64, + pub wf_send_proposal_nanos: u64, pub wf_persist_log_nanos: u64, pub wf_before_write_nanos: u64, pub wf_write_end_nanos: u64, pub wf_kvdb_end_nanos: u64, pub wf_commit_log_nanos: u64, - pub propose_send_wait_nanos: u64, pub commit_not_persisted: bool, pub store_mutex_lock_nanos: u64, // should be 0 if using raft-engine pub store_thread_wait_nanos: u64, diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 6ef292f95e5..b07aff345a7 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -9877,7 +9877,7 @@ "h": 9, "w": 24, "x": 0, - "y": 9 + "y": 10 }, "hiddenSeries": false, "id": 13132, @@ -9897,7 +9897,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -10017,7 +10017,7 @@ "h": 8, "w": 12, "x": 0, - "y": 18 + "y": 19 }, "hiddenSeries": false, "id": 13257, @@ -10037,7 +10037,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -10157,7 +10157,7 @@ "h": 8, "w": 12, "x": 12, - "y": 18 + "y": 19 }, "hiddenSeries": false, "id": 13259, @@ -10177,7 +10177,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -10297,7 +10297,7 @@ "h": 8, "w": 12, "x": 0, - "y": 26 
+ "y": 27 }, "hiddenSeries": false, "id": 13261, @@ -10317,12 +10317,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:113", "alias": "count", "dashLength": 1, "dashes": true, @@ -10333,6 +10334,7 @@ "zindex": -3 }, { + "$$hashKey": "object:114", "alias": "avg", "fill": 7 } @@ -10395,6 +10397,7 @@ }, "yaxes": [ { + "$$hashKey": "object:139", "format": "s", "label": null, "logBase": 1, @@ -10403,6 +10406,7 @@ "show": true }, { + "$$hashKey": "object:140", "format": "short", "label": null, "logBase": 1, @@ -10437,7 +10441,7 @@ "h": 8, "w": 12, "x": 12, - "y": 26 + "y": 27 }, "hiddenSeries": false, "id": 13263, @@ -10457,12 +10461,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:217", "alias": "count", "dashLength": 1, "dashes": true, @@ -10473,6 +10478,7 @@ "zindex": -3 }, { + "$$hashKey": "object:218", "alias": "avg", "fill": 7 } @@ -10535,6 +10541,7 @@ }, "yaxes": [ { + "$$hashKey": "object:243", "format": "s", "label": null, "logBase": 1, @@ -10543,6 +10550,7 @@ "show": true }, { + "$$hashKey": "object:244", "format": "short", "label": null, "logBase": 1, @@ -10577,7 +10585,7 @@ "h": 8, "w": 12, "x": 0, - "y": 34 + "y": 35 }, "hiddenSeries": false, "id": 13265, @@ -10597,12 +10605,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:529", "alias": "count", "dashLength": 1, "dashes": true, @@ -10613,6 +10622,7 @@ "zindex": -3 }, { + "$$hashKey": "object:530", "alias": "avg", "fill": 7 } @@ -10675,6 +10685,7 @@ }, "yaxes": [ { + "$$hashKey": "object:555", "format": "s", "label": null, 
"logBase": 1, @@ -10683,6 +10694,7 @@ "show": true }, { + "$$hashKey": "object:556", "format": "short", "label": null, "logBase": 1, @@ -10706,7 +10718,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The before write time duration of each request", + "description": "The send raft message of the proposal duration of each request", "fieldConfig": { "defaults": {}, "overrides": [] @@ -10717,10 +10729,10 @@ "h": 8, "w": 12, "x": 12, - "y": 34 + "y": 35 }, "hiddenSeries": false, - "id": 13267, + "id": 23763572857, "legend": { "avg": false, "current": false, @@ -10737,12 +10749,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:529", "alias": "count", "dashLength": 1, "dashes": true, @@ -10753,6 +10766,7 @@ "zindex": -3 }, { + "$$hashKey": "object:530", "alias": "avg", "fill": 7 } @@ -10763,7 +10777,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "999%", @@ -10771,7 +10785,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "99%", @@ -10779,7 +10793,7 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_sum{instance=~\"$instance\"}[30s])) / 
sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "expr": "sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_count{instance=~\"$instance\"}[30s]))", "hide": false, "interval": "", "legendFormat": "avg", @@ -10787,7 +10801,7 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "expr": "sum(rate(tikv_raftstore_store_wf_send_proposal_duration_seconds_count{instance=~\"$instance\"}[30s]))", "hide": true, "instant": false, "interval": "", @@ -10799,7 +10813,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Store before write duration", + "title": "Store send proposal duration", "tooltip": { "shared": true, "sort": 0, @@ -10815,6 +10829,7 @@ }, "yaxes": [ { + "$$hashKey": "object:555", "format": "s", "label": null, "logBase": 1, @@ -10823,6 +10838,7 @@ "show": true }, { + "$$hashKey": "object:556", "format": "short", "label": null, "logBase": 1, @@ -10857,7 +10873,7 @@ "h": 8, "w": 12, "x": 0, - "y": 42 + "y": 43 }, "hiddenSeries": false, "id": 13269, @@ -10877,7 +10893,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -10986,7 +11002,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The write end duration of each request", + "description": "The before write time duration of each request", "fieldConfig": { "defaults": {}, "overrides": [] @@ -10997,10 +11013,10 @@ "h": 8, "w": 12, "x": 12, - "y": 42 + "y": 43 }, "hiddenSeries": false, - "id": 13271, + "id": 13267, "legend": { "avg": false, "current": false, @@ -11017,12 +11033,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", 
"pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:733", "alias": "count", "dashLength": 1, "dashes": true, @@ -11033,6 +11050,7 @@ "zindex": -3 }, { + "$$hashKey": "object:734", "alias": "avg", "fill": 7 } @@ -11043,7 +11061,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "999%", @@ -11051,7 +11069,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "99%", @@ -11059,7 +11077,7 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", "hide": false, "interval": "", "legendFormat": "avg", @@ -11067,7 +11085,7 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "expr": "sum(rate(tikv_raftstore_store_wf_before_write_duration_seconds_count{instance=~\"$instance\"}[30s]))", "hide": true, "instant": false, "interval": "", @@ -11079,7 +11097,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": 
"Store write end duration", + "title": "Store before write duration", "tooltip": { "shared": true, "sort": 0, @@ -11095,6 +11113,7 @@ }, "yaxes": [ { + "$$hashKey": "object:759", "format": "s", "label": null, "logBase": 1, @@ -11103,6 +11122,7 @@ "show": true }, { + "$$hashKey": "object:760", "format": "short", "label": null, "logBase": 1, @@ -11137,7 +11157,7 @@ "h": 8, "w": 12, "x": 0, - "y": 50 + "y": 51 }, "hiddenSeries": false, "id": 13273, @@ -11157,7 +11177,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -11266,7 +11286,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The commit and persist duration of each request", + "description": "The write end duration of each request", "fieldConfig": { "defaults": {}, "overrides": [] @@ -11277,10 +11297,10 @@ "h": 8, "w": 12, "x": 12, - "y": 50 + "y": 51 }, "hiddenSeries": false, - "id": 13275, + "id": 13271, "legend": { "avg": false, "current": false, @@ -11297,12 +11317,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:841", "alias": "count", "dashLength": 1, "dashes": true, @@ -11313,6 +11334,7 @@ "zindex": -3 }, { + "$$hashKey": "object:842", "alias": "avg", "fill": 7 } @@ -11323,7 +11345,7 @@ "targets": [ { "exemplar": true, - "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "999%", @@ -11331,7 +11353,7 @@ }, { "exemplar": true, - "expr": "histogram_quantile(0.99, 
sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", "hide": false, "interval": "", "legendFormat": "99%", @@ -11339,7 +11361,7 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", "hide": false, "interval": "", "legendFormat": "avg", @@ -11347,8 +11369,8 @@ }, { "exemplar": true, - "expr": "sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", - "hide": false, + "expr": "sum(rate(tikv_raftstore_store_wf_write_end_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "hide": true, "instant": false, "interval": "", "legendFormat": "count", @@ -11359,7 +11381,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Store commit and persist duration", + "title": "Store write end duration", "tooltip": { "shared": true, "sort": 0, @@ -11375,6 +11397,7 @@ }, "yaxes": [ { + "$$hashKey": "object:867", "format": "s", "label": null, "logBase": 1, @@ -11383,6 +11406,7 @@ "show": true }, { + "$$hashKey": "object:868", "format": "short", "label": null, "logBase": 1, @@ -11417,7 +11441,7 @@ "h": 8, "w": 12, "x": 0, - "y": 58 + "y": 59 }, "hiddenSeries": false, "id": 13277, @@ -11437,7 +11461,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -11535,6 +11559,146 @@ "align": false, "alignLevel": null } + }, + { + 
"aliasColors": { + "99%": "#eab839", + "999%": "dark-red", + "count": "rgb(33, 250, 2)" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The commit and persist duration of each request", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "hiddenSeries": false, + "id": 13275, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "count", + "dashLength": 1, + "dashes": true, + "fill": 2, + "spaceLength": 1, + "transform": "negative-Y", + "yaxis": 2, + "zindex": -3 + }, + { + "alias": "avg", + "fill": 7 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "999%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[30s])) by (le))", + "hide": false, + "interval": "", + "legendFormat": "99%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_sum{instance=~\"$instance\"}[30s])) / sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "hide": false, + "interval": "", + "legendFormat": "avg", + "refId": "C" + }, + { + "exemplar": true, + "expr": 
"sum(rate(tikv_raftstore_store_wf_commit_log_duration_seconds_count{instance=~\"$instance\"}[30s]))", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "count", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Store commit and persist duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Raft Waterfall", @@ -13804,186 +13968,6 @@ "yBucketNumber": null, "yBucketSize": null }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateSpectral", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "${DS_TEST-CLUSTER}", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 47 - }, - "heatmap": {}, - "hideZeroBuckets": true, - "highlightCards": true, - "id": 23763572784, - "legend": { - "show": false - }, - "links": [], - "reverseYBuckets": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(delta(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", - "format": "heatmap", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{le}}", - "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Proposal send wait duration", 
- "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 0, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "upper", - "yBucketNumber": null, - "yBucketSize": null - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 47 - }, - "hiddenSeries": false, - "id": 23763572783, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.11", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_proposal_send_wait_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "99% Proposal send wait duration per server", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": 
"time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:106", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:107", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, From 03b44b5e219fb795b20ebb9367b66b2adf3800ac Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 12 Jul 2022 23:19:06 -0700 Subject: [PATCH 075/676] add TestTabletFactory and refine TabletFactory trait (#12940) close tikv/tikv#12939 Add TestTabletFactory for testing raftstorev2 with real multi-rocksdb. Before this change there's no way to create a tablet factory in raftstore's test code. And also this PR refines TabletFactory trait so that its implementation code will not hard wire with raftstore v1's code. This will make Tabletfactory's implementation work both with raftstore v1 and raftstore v2. 
Signed-off-by: qi.xu Co-authored-by: qi.xu --- Cargo.lock | 1 + Cargo.toml | 2 +- .../engine_rocks/src/compact_listener.rs | 18 +- components/engine_test/Cargo.toml | 1 + components/engine_test/src/lib.rs | 221 ++++++++++++++++-- components/raftstore/Cargo.toml | 3 +- .../raftstore/src/compacted_event_sender.rs | 23 ++ components/raftstore/src/lib.rs | 4 + components/raftstore/src/store/snap.rs | 10 +- components/server/Cargo.toml | 2 +- components/server/src/server.rs | 5 +- components/test_raftstore/src/util.rs | 8 +- src/server/engine_factory.rs | 64 ++--- src/server/engine_factory_v2.rs | 20 +- 14 files changed, 297 insertions(+), 85 deletions(-) create mode 100644 components/raftstore/src/compacted_event_sender.rs diff --git a/Cargo.lock b/Cargo.lock index 7e562246adc..15da9f000b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1520,6 +1520,7 @@ dependencies = [ name = "engine_test" version = "0.0.1" dependencies = [ + "collections", "encryption", "engine_panic", "engine_rocks", diff --git a/Cargo.toml b/Cargo.toml index 622547b2294..dd071c9809e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -132,7 +132,7 @@ prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { path = "components/raft_log_engine", default-features = false } -raftstore = { path = "components/raftstore", default-features = false } +raftstore = { path = "components/raftstore", default-features = false, features = ["engine_rocks"] } rand = "0.7.3" regex = "1.3" resource_metering = { path = "components/resource_metering" } diff --git a/components/engine_rocks/src/compact_listener.rs b/components/engine_rocks/src/compact_listener.rs index 0affe70dd4b..2cfdb253eb0 100644 --- a/components/engine_rocks/src/compact_listener.rs +++ b/components/engine_rocks/src/compact_listener.rs @@ -7,6 +7,7 @@ use std::{ Bound::{Excluded, Included, Unbounded}, }, path::Path, + sync::Arc, 
}; use collections::hash_set_with_capacity; @@ -205,17 +206,26 @@ impl CompactedEvent for RocksCompactedEvent { pub type Filter = fn(&RocksCompactionJobInfo<'_>) -> bool; +/// The trait for sending RocksCompactedEvent event +/// This is to workaround Box cannot be cloned +pub trait CompactedEventSender { + fn send(&self, event: RocksCompactedEvent); +} + pub struct CompactionListener { - ch: Box, + event_sender: Arc, filter: Option, } impl CompactionListener { pub fn new( - ch: Box, + event_sender: Arc, filter: Option, ) -> CompactionListener { - CompactionListener { ch, filter } + CompactionListener { + event_sender, + filter, + } } } @@ -288,7 +298,7 @@ impl EventListener for CompactionListener { return; } - (self.ch)(RocksCompactedEvent::new( + self.event_sender.send(RocksCompactedEvent::new( info, smallest_key.unwrap(), largest_key.unwrap(), diff --git a/components/engine_test/Cargo.toml b/components/engine_test/Cargo.toml index 61061957563..a9bfbfd41d3 100644 --- a/components/engine_test/Cargo.toml +++ b/components/engine_test/Cargo.toml @@ -24,6 +24,7 @@ test-engines-panic = [ ] [dependencies] +collections = { path = "../collections", default-features = false } encryption = { path = "../encryption", default-features = false } engine_panic = { path = "../engine_panic", default-features = false } engine_rocks = { path = "../engine_rocks", default-features = false } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 4d804a17a9f..e5dddfdcee2 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -74,6 +74,12 @@ pub mod raft { /// Types and constructors for the "kv" engine pub mod kv { + use std::{ + path::{Path, PathBuf}, + sync::{Arc, Mutex}, + }; + + use collections::HashMap; #[cfg(feature = "test-engine-kv-panic")] pub use engine_panic::{ PanicEngine as KvTestEngine, PanicEngineIterator as KvTestEngineIterator, @@ -84,7 +90,8 @@ pub mod kv { RocksEngine as KvTestEngine, 
RocksEngineIterator as KvTestEngineIterator, RocksSnapshot as KvTestSnapshot, RocksWriteBatch as KvTestWriteBatch, }; - use engine_traits::Result; + use engine_traits::{Result, TabletAccessor, TabletFactory}; + use tikv_util::box_err; use crate::ctor::{CFOptions, DBOptions, KvEngineConstructorExt}; @@ -92,7 +99,7 @@ pub mod kv { path: &str, db_opt: Option, cfs: &[&str], - opts: Option>>, + opts: Option>, ) -> Result { KvTestEngine::new_kv_engine(path, db_opt, cfs, opts) } @@ -100,10 +107,186 @@ pub mod kv { pub fn new_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec>, + cfs_opts: Vec, ) -> Result { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } + + const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; + + #[derive(Clone)] + pub struct TestTabletFactory { + root_path: String, + db_opt: Option, + cfs: Vec, + opts: Option>, + registry: Arc>>, + } + + impl TestTabletFactory { + pub fn new( + root_path: &str, + db_opt: Option, + cfs: &[&str], + opts: Option>, + ) -> Self { + Self { + root_path: root_path.to_string(), + db_opt, + cfs: cfs.iter().map(|s| s.to_string()).collect(), + opts, + registry: Arc::new(Mutex::new(HashMap::default())), + } + } + } + + impl TabletFactory for TestTabletFactory { + fn create_tablet(&self, id: u64, suffix: u64) -> Result { + let mut reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Err(box_err!( + "region {} {} already exists", + id, + db.as_inner().path() + )); + } + let tablet_path = self.tablet_path(id, suffix); + let tablet_path = tablet_path.to_str().unwrap(); + let mut cfs = vec![]; + self.cfs.iter().for_each(|s| cfs.push(s.as_str())); + let kv_engine = KvTestEngine::new_kv_engine( + tablet_path, + self.db_opt.clone(), + cfs.as_slice(), + self.opts.clone(), + )?; + reg.insert((id, suffix), kv_engine.clone()); + Ok(kv_engine) + } + + fn open_tablet(&self, id: u64, suffix: u64) -> Result { + let mut reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) 
{ + return Ok(db.clone()); + } + + let db_path = self.tablet_path(id, suffix); + let db = self.open_tablet_raw(db_path.as_path(), false)?; + reg.insert((id, suffix), db.clone()); + Ok(db) + } + + fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { + let reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Some(db.clone()); + } + None + } + + fn open_tablet_cache_any(&self, id: u64) -> Option { + let reg = self.registry.lock().unwrap(); + if let Some(k) = reg.keys().find(|k| k.0 == id) { + return Some(reg.get(k).unwrap().clone()); + } + None + } + + fn open_tablet_raw(&self, path: &Path, _readonly: bool) -> Result { + if !KvTestEngine::exists(path.to_str().unwrap_or_default()) { + return Err(box_err!( + "path {} does not have db", + path.to_str().unwrap_or_default() + )); + } + let (mut tablet_id, mut tablet_suffix) = (0, 1); + if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { + let mut split = s.split('_'); + tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); + tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); + } + self.create_tablet(tablet_id, tablet_suffix) + } + + #[inline] + fn create_shared_db(&self) -> Result { + self.create_tablet(0, 0) + } + + #[inline] + fn exists_raw(&self, path: &Path) -> bool { + KvTestEngine::exists(path.to_str().unwrap_or_default()) + } + + #[inline] + fn tablets_path(&self) -> PathBuf { + Path::new(&self.root_path).join("tablets") + } + + #[inline] + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + Path::new(&self.root_path).join(format!("tablets/{}_{}", id, suffix)) + } + + #[inline] + fn mark_tombstone(&self, region_id: u64, suffix: u64) { + let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); + std::fs::File::create(&path).unwrap(); + self.registry.lock().unwrap().remove(&(region_id, suffix)); + } + + #[inline] + fn is_tombstoned(&self, region_id: u64, suffix: u64) -> bool { + self.tablet_path(region_id, 
suffix) + .join(TOMBSTONE_MARK) + .exists() + } + + #[inline] + fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(id, suffix); + self.registry.lock().unwrap().remove(&(id, suffix)); + let _ = std::fs::remove_dir_all(path); + Ok(()) + } + + #[inline] + fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + { + let reg = self.registry.lock().unwrap(); + if let Some(db) = reg.get(&(id, suffix)) { + return Err(box_err!( + "region {} {} already exists", + id, + db.as_inner().path() + )); + } + } + + let db_path = self.tablet_path(id, suffix); + std::fs::rename(path, &db_path)?; + self.open_tablet_raw(db_path.as_path(), false) + } + + fn clone(&self) -> Box + Send> { + Box::new(std::clone::Clone::clone(self)) + } + } + + impl TabletAccessor for TestTabletFactory { + #[inline] + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { + let reg = self.registry.lock().unwrap(); + for ((id, suffix), tablet) in &*reg { + f(*id, *suffix, tablet) + } + } + + // it have multi tablets. + fn is_single_engine(&self) -> bool { + false + } + } } /// Create a storage engine with a concrete type. 
This should ultimately be the @@ -145,7 +328,7 @@ pub mod ctor { path: &str, db_opt: Option, cfs: &[&str], - opts: Option>>, + opts: Option>, ) -> Result; /// Create a new engine with specified column families and options @@ -155,7 +338,7 @@ pub mod ctor { fn new_kv_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec>, + cfs_opts: Vec, ) -> Result; } @@ -183,14 +366,18 @@ pub mod ctor { pub type RaftDBOptions = DBOptions; - pub struct CFOptions<'a> { - pub cf: &'a str, + #[derive(Clone)] + pub struct CFOptions { + pub cf: String, pub options: ColumnFamilyOptions, } - impl<'a> CFOptions<'a> { - pub fn new(cf: &'a str, options: ColumnFamilyOptions) -> CFOptions<'a> { - CFOptions { cf, options } + impl CFOptions { + pub fn new(cf: &str, options: ColumnFamilyOptions) -> CFOptions { + CFOptions { + cf: cf.to_string(), + options, + } } } @@ -297,7 +484,7 @@ pub mod ctor { _path: &str, _db_opt: Option, _cfs: &[&str], - _opts: Option>>, + _opts: Option>, ) -> Result { Ok(PanicEngine) } @@ -305,7 +492,7 @@ pub mod ctor { fn new_kv_engine_opt( _path: &str, _db_opt: DBOptions, - _cfs_opts: Vec>, + _cfs_opts: Vec, ) -> Result { Ok(PanicEngine) } @@ -345,7 +532,7 @@ pub mod ctor { path: &str, db_opt: Option, cfs: &[&str], - opts: Option>>, + opts: Option>, ) -> Result { let rocks_db_opts = match db_opt { Some(db_opt) => Some(get_rocks_db_opts(db_opt)?), @@ -367,7 +554,7 @@ pub mod ctor { let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(cf_opts.cf, rocks_cf_opts) + RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts) }) .collect(); rocks_new_engine(path, rocks_db_opts, &[], Some(rocks_cfs_opts)) @@ -376,7 +563,7 @@ pub mod ctor { fn new_kv_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec>, + cfs_opts: Vec, ) -> Result { let rocks_db_opts = get_rocks_db_opts(db_opt)?; let rocks_cfs_opts = cfs_opts @@ -385,7 +572,7 @@ pub 
mod ctor { let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(cf_opts.cf, rocks_cf_opts) + RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts) }) .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) @@ -402,7 +589,7 @@ pub mod ctor { let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - let default_cfs_opts = vec![RocksCFOptions::new(cf_opts.cf, rocks_cf_opts)]; + let default_cfs_opts = vec![RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts)]; rocks_new_engine(path, rocks_db_opts, &[], Some(default_cfs_opts)) } } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 9d8c39d5746..3b47ca08ec5 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" publish = false [features] -default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "engine_rocks"] failpoints = ["fail/failpoints"] testexport = [] test-engine-kv-rocksdb = [ @@ -38,6 +38,7 @@ crc32fast = "1.2" crossbeam = "0.8" derivative = "2" encryption = { path = "../encryption", default-features = false } +engine_rocks = { path = "../engine_rocks", default-features = false, optional = true } # Should be [dev-dependencies] but we need to control the features # https://github.com/rust-lang/cargo/issues/6915 diff --git a/components/raftstore/src/compacted_event_sender.rs b/components/raftstore/src/compacted_event_sender.rs new file mode 100644 index 00000000000..99ba70a0512 --- /dev/null +++ b/components/raftstore/src/compacted_event_sender.rs @@ -0,0 +1,23 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::sync::Mutex; + +use engine_rocks::{CompactedEventSender, RocksCompactedEvent, RocksEngine}; +use engine_traits::RaftEngine; +use tikv_util::error_unknown; + +use crate::store::{fsm::store::RaftRouter, StoreMsg}; + +// raftstore v1's implementation +pub struct RaftRouterCompactedEventSender { + pub router: Mutex>, +} + +impl CompactedEventSender for RaftRouterCompactedEventSender { + fn send(&self, event: RocksCompactedEvent) { + let router = self.router.lock().unwrap(); + let event = StoreMsg::CompactedEvent(event); + if let Err(e) = router.send_control(event) { + error_unknown!(?e; "send compaction finished event to raftstore failed"); + } + } +} diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index ed70dacb37b..f26022efe64 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -13,11 +13,15 @@ extern crate test; #[macro_use] extern crate derivative; +#[cfg(feature = "engine_rocks")] +pub mod compacted_event_sender; pub mod coprocessor; pub mod errors; pub mod router; pub mod store; +#[cfg(feature = "engine_rocks")] +pub use self::compacted_event_sender::RaftRouterCompactedEventSender; pub use self::{ coprocessor::{RegionInfo, RegionInfoAccessor, SeekRegionCallback}, errors::{DiscardReason, Error, Result}, diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index eaf99506f4b..7bcaeb5529b 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1918,12 +1918,12 @@ pub mod tests { const BYTE_SIZE: usize = 1; type DBBuilder = - fn(p: &Path, db_opt: Option, cf_opts: Option>>) -> Result; + fn(p: &Path, db_opt: Option, cf_opts: Option>) -> Result; pub fn open_test_empty_db( path: &Path, db_opt: Option, - cf_opts: Option>>, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1936,7 +1936,7 @@ pub mod tests { pub fn open_test_db( path: &Path, db_opt: Option, - cf_opts: Option>>, + cf_opts: 
Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1957,7 +1957,7 @@ pub mod tests { pub fn open_test_db_with_100keys( path: &Path, db_opt: Option, - cf_opts: Option>>, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1981,7 +1981,7 @@ pub mod tests { path: &TempDir, raft_db_opt: Option, kv_db_opt: Option, - kv_cf_opts: Option>>, + kv_cf_opts: Option>, regions: &[u64], ) -> Result> { let p = path.path(); diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 650f9f6932b..c2617d4896c 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -65,7 +65,7 @@ prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { path = "../raft_log_engine", default-features = false } -raftstore = { path = "../raftstore", default-features = false } +raftstore = { path = "../raftstore", default-features = false, features = ["engine_rocks"] } rand = "0.8" resolved_ts = { path = "../../components/resolved_ts", default-features = false } resource_metering = { path = "../resource_metering" } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 37d031753ce..4bd95b1de60 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -76,6 +76,7 @@ use raftstore::{ AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, }, + RaftRouterCompactedEventSender, }; use security::SecurityManager; use tikv::{ @@ -1627,7 +1628,9 @@ impl TiKvServer { // Create kv engine. 
let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) - .compaction_filter_router(self.router.clone()) + .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { + router: Mutex::new(self.router.clone()), + })) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 288e99a3837..12ca8f9a867 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -4,7 +4,7 @@ use std::{ fmt::Write, path::Path, str::FromStr, - sync::{mpsc, Arc}, + sync::{mpsc, Arc, Mutex}, thread, time::Duration, }; @@ -44,7 +44,7 @@ use raft::eraftpb::ConfChangeType; pub use raftstore::store::util::{find_peer, new_learner_peer, new_peer}; use raftstore::{ store::{fsm::RaftRouter, *}, - Result, + RaftRouterCompactedEventSender, Result, }; use rand::RngCore; use server::server::ConfiguredRaftEngine; @@ -658,7 +658,9 @@ pub fn create_test_engine( builder = builder.block_cache(cache); } if let Some(router) = router { - builder = builder.compaction_filter_router(router); + builder = builder.compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { + router: Mutex::new(router), + })); } let factory = builder.build(); let engine = factory.create_shared_db().unwrap(); diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index fde3bc5a40f..04e1f72f05a 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -7,17 +7,14 @@ use std::{ use engine_rocks::{ raw::{Cache, Env}, - CompactionListener, FlowListener, RocksCompactedEvent, RocksCompactionJobInfo, RocksEngine, + CompactedEventSender, CompactionListener, FlowListener, RocksCompactionJobInfo, RocksEngine, RocksEventListener, }; use engine_traits::{ - CompactionJobInfo, RaftEngine, Result, TabletAccessor, TabletFactory, CF_DEFAULT, CF_WRITE, + 
CompactionJobInfo, Result, TabletAccessor, TabletFactory, CF_DEFAULT, CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; -use raftstore::{ - store::{RaftRouter, StoreMsg}, - RegionInfoAccessor, -}; +use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; use crate::config::{DbConfig, TiKvConfig, DEFAULT_ROCKSDB_SUB_DIR}; @@ -34,12 +31,12 @@ struct FactoryInner { root_db: Mutex>, } -pub struct KvEngineFactoryBuilder { +pub struct KvEngineFactoryBuilder { inner: FactoryInner, - router: Option>, + compact_event_sender: Option>, } -impl KvEngineFactoryBuilder { +impl KvEngineFactoryBuilder { pub fn new(env: Arc, config: &TiKvConfig, store_path: impl Into) -> Self { Self { inner: FactoryInner { @@ -53,7 +50,7 @@ impl KvEngineFactoryBuilder { sst_recovery_sender: None, root_db: Mutex::default(), }, - router: None, + compact_event_sender: None, } } @@ -77,40 +74,31 @@ impl KvEngineFactoryBuilder { self } - pub fn compaction_filter_router(mut self, router: RaftRouter) -> Self { - self.router = Some(router); + pub fn compaction_event_sender( + mut self, + sender: Arc, + ) -> Self { + self.compact_event_sender = Some(sender); self } - pub fn build(self) -> KvEngineFactory { + pub fn build(self) -> KvEngineFactory { KvEngineFactory { inner: Arc::new(self.inner), - router: Mutex::new(self.router), + compact_event_sender: self.compact_event_sender.clone(), } } } -pub struct KvEngineFactory { +#[derive(Clone)] +pub struct KvEngineFactory { inner: Arc, - router: Mutex>>, + compact_event_sender: Option>, } -impl Clone for KvEngineFactory { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone(), - router: Mutex::new(self.router.lock().unwrap().clone()), - } - } -} - -impl KvEngineFactory { +impl KvEngineFactory { pub fn create_raftstore_compaction_listener(&self) -> Option { - let router = self.router.lock().unwrap(); - let ch = match &*router { - Some(r) => Mutex::new(r.clone()), - None => return None, - }; + self.compact_event_sender.as_ref()?; fn 
size_change_filter(info: &RocksCompactionJobInfo<'_>) -> bool { // When calculating region size, we only consider write and default // column families. @@ -125,16 +113,8 @@ impl KvEngineFactory { true } - - let compacted_handler = Box::new(move |compacted_event: RocksCompactedEvent| { - let ch = ch.lock().unwrap(); - let event = StoreMsg::CompactedEvent(compacted_event); - if let Err(e) = ch.send_control(event) { - error_unknown!(?e; "send compaction finished event to raftstore failed"); - } - }); Some(CompactionListener::new( - compacted_handler, + self.compact_event_sender.as_ref().unwrap().clone(), Some(size_change_filter), )) } @@ -229,7 +209,7 @@ impl KvEngineFactory { } } -impl TabletFactory for KvEngineFactory { +impl TabletFactory for KvEngineFactory { #[inline] fn create_shared_db(&self) -> Result { let root_path = self.kv_engine_path(); @@ -270,7 +250,7 @@ impl TabletFactory for KvEngineFactory { } } -impl TabletAccessor for KvEngineFactory { +impl TabletAccessor for KvEngineFactory { fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { if let Ok(db) = self.inner.root_db.lock() { let db = db.as_ref().unwrap(); diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 4027823f23c..ccd2f1d7b02 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -7,19 +7,19 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; -use engine_traits::{RaftEngine, Result, TabletAccessor, TabletFactory}; +use engine_traits::{Result, TabletAccessor, TabletFactory}; use crate::server::engine_factory::KvEngineFactory; const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; #[derive(Clone)] -pub struct KvEngineFactoryV2 { - inner: KvEngineFactory, +pub struct KvEngineFactoryV2 { + inner: KvEngineFactory, registry: Arc>>, } -impl TabletFactory for KvEngineFactoryV2 { +impl TabletFactory for KvEngineFactoryV2 { fn create_tablet(&self, id: u64, suffix: u64) -> Result { let mut reg = 
self.registry.lock().unwrap(); if let Some(db) = reg.get(&(id, suffix)) { @@ -152,7 +152,7 @@ impl TabletFactory for KvEngineFactoryV2 { } } -impl TabletAccessor for KvEngineFactoryV2 { +impl TabletAccessor for KvEngineFactoryV2 { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { let reg = self.registry.lock().unwrap(); @@ -189,8 +189,8 @@ mod tests { }; } - impl KvEngineFactoryV2 { - pub fn new(inner: KvEngineFactory) -> Self { + impl KvEngineFactoryV2 { + pub fn new(inner: KvEngineFactory) -> Self { KvEngineFactoryV2 { inner, registry: Arc::new(Mutex::new(HashMap::default())), @@ -204,7 +204,7 @@ mod tests { let dir = test_util::temp_dir("test_kvengine_factory", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); let factory = builder.build(); let shared_db = factory.create_shared_db().unwrap(); let tablet = TabletFactory::create_tablet(&factory, 1, 10); @@ -237,7 +237,7 @@ mod tests { let dir = test_util::temp_dir("test_kvengine_factory_v2", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); let inner_factory = builder.build(); let factory = KvEngineFactoryV2::new(inner_factory); let tablet = factory.create_tablet(1, 10); @@ -275,7 +275,7 @@ mod tests { let dir = test_util::temp_dir("test_get_live_tablets", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let builder = KvEngineFactoryBuilder::::new(env, &cfg, dir.path()); + let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); let inner_factory = builder.build(); let factory = KvEngineFactoryV2::new(inner_factory); factory.create_tablet(1, 10).unwrap(); From b33d3df696c74271d3674b42c9e3446b8d79e8c1 Mon Sep 17 00:00:00 2001 From: Spade A 
<71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 13 Jul 2022 17:01:06 +0800 Subject: [PATCH 076/676] tablet: load_tablet should remove the old tablet in the cache. (#12984) close tikv/tikv#12985 Signed-off-by: SpadeA-Tang --- components/engine_test/src/lib.rs | 25 ++++++++++++++++++------- src/server/engine_factory_v2.rs | 30 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index e5dddfdcee2..f7fd904fd1c 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -140,6 +140,17 @@ pub mod kv { } } + // Extract tablet id and tablet suffix from the path. + fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { + let (mut tablet_id, mut tablet_suffix) = (0, 1); + if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { + let mut split = s.split('_'); + tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); + tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); + } + (tablet_id, tablet_suffix) + } + impl TabletFactory for TestTabletFactory { fn create_tablet(&self, id: u64, suffix: u64) -> Result { let mut reg = self.registry.lock().unwrap(); @@ -199,12 +210,7 @@ pub mod kv { path.to_str().unwrap_or_default() )); } - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } + let (tablet_id, tablet_suffix) = get_id_and_suffix_from_path(path); self.create_tablet(tablet_id, tablet_suffix) } @@ -265,7 +271,12 @@ pub mod kv { let db_path = self.tablet_path(id, suffix); std::fs::rename(path, &db_path)?; - self.open_tablet_raw(db_path.as_path(), false) + let new_engine = self.open_tablet_raw(db_path.as_path(), false); + if new_engine.is_ok() { 
+ let (old_id, old_suffix) = get_id_and_suffix_from_path(path); + self.registry.lock().unwrap().remove(&(old_id, old_suffix)); + } + new_engine } fn clone(&self) -> Box + Send> { diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index ccd2f1d7b02..d1cc29bc88f 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -19,6 +19,17 @@ pub struct KvEngineFactoryV2 { registry: Arc>>, } +// Extract tablet id and tablet suffix from the path. +fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { + let (mut tablet_id, mut tablet_suffix) = (0, 1); + if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { + let mut split = s.split('_'); + tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); + tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); + } + (tablet_id, tablet_suffix) +} + impl TabletFactory for KvEngineFactoryV2 { fn create_tablet(&self, id: u64, suffix: u64) -> Result { let mut reg = self.registry.lock().unwrap(); @@ -74,12 +85,7 @@ impl TabletFactory for KvEngineFactoryV2 { path.to_str().unwrap_or_default() )); } - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } + let (tablet_id, tablet_suffix) = get_id_and_suffix_from_path(path); self.create_tablet(tablet_id, tablet_suffix) } @@ -144,7 +150,12 @@ impl TabletFactory for KvEngineFactoryV2 { let db_path = self.tablet_path(id, suffix); std::fs::rename(path, &db_path)?; - self.open_tablet_raw(db_path.as_path(), false) + let new_engine = self.open_tablet_raw(db_path.as_path(), false); + if new_engine.is_ok() { + let (old_id, old_suffix) = get_id_and_suffix_from_path(path); + self.registry.lock().unwrap().remove(&(old_id, old_suffix)); + } + new_engine } fn clone(&self) 
-> Box + Send> { @@ -261,6 +272,11 @@ mod tests { assert!(!factory.is_tombstoned(1, 10)); assert!(factory.load_tablet(&tablet_path, 1, 10).is_err()); assert!(factory.load_tablet(&tablet_path, 1, 20).is_ok()); + // After we load it as with the new id or suffix, we should be unable to get it with + // the old id and suffix in the cache. + assert!(factory.open_tablet_cache(1, 10).is_none()); + assert!(factory.open_tablet_cache(1, 20).is_some()); + factory.mark_tombstone(1, 20); assert!(factory.is_tombstoned(1, 20)); factory.destroy_tablet(1, 20).unwrap(); From ab3d866ee3b163560ba35d5ba5e1863b9de7f47c Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 13 Jul 2022 19:01:05 +0800 Subject: [PATCH 077/676] log-backup: store log files by date and hour in sub directory (#13006) close tikv/tikv#12902 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- components/backup-stream/src/router.rs | 36 +++++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index dec4baeae89..debb4b417c8 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -530,6 +530,11 @@ struct TempFileKey { is_meta: bool, } +pub enum FormatType { + Date, + Hour, +} + impl TempFileKey { /// Create the key for an event. The key can be used to find which temporary file the event should be stored. 
fn of(kv: &ApplyEvent, region_id: u64) -> Self { @@ -588,7 +593,7 @@ impl TempFileKey { } } - fn format_date_time(ts: u64) -> impl Display { + fn format_date_time(ts: u64, t: FormatType) -> impl Display { use chrono::prelude::*; let millis = TimeStamp::physical(ts.into()); let dt = Utc.timestamp_millis(millis as _); @@ -600,19 +605,26 @@ impl TempFileKey { .format(&s.unwrap_or_else(|| "%Y%m".to_owned())) .to_string(); }); - return dt.format("%Y%m%d").to_string(); + match t { + FormatType::Date => dt.format("%Y%m%d").to_string(), + FormatType::Hour => dt.format("%H").to_string(), + } } #[cfg(not(feature = "failpoints"))] - return dt.format("%Y%m%d"); + match t { + FormatType::Date => dt.format("%Y%m%d"), + FormatType::Hour => dt.format("%H"), + } } /// path_to_log_file specifies the path of record log. - /// eg. "v1/20220625/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" + /// eg. "v1/20220625/03/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" fn path_to_log_file(&self, min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/t{:08}/{:012}-{}.log", + "v1/{}/{}/t{:08}/{:012}-{}.log", // We may delete a range of files, so using the max_ts for preventing remove some records wrong. - Self::format_date_time(max_ts), + Self::format_date_time(max_ts, FormatType::Date), + Self::format_date_time(max_ts, FormatType::Hour), self.table_id, min_ts, uuid::Uuid::new_v4() @@ -620,11 +632,12 @@ impl TempFileKey { } /// path_to_schema_file specifies the path of schema log. - /// eg. "v1/20220625/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" + /// eg. 
"v1/20220625/03/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" fn path_to_schema_file(min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/schema-meta/{:012}-{}.log", - Self::format_date_time(max_ts), + "v1/{}/{}/schema-meta/{:012}-{}.log", + Self::format_date_time(max_ts, FormatType::Date), + Self::format_date_time(max_ts, FormatType::Hour), min_ts, uuid::Uuid::new_v4(), ) @@ -1765,9 +1778,12 @@ mod tests { #[test] fn test_format_datetime() { - let s = TempFileKey::format_date_time(431656320867237891); + let s = TempFileKey::format_date_time(431656320867237891, FormatType::Date); let s = s.to_string(); assert_eq!(s, "20220307"); + + let s = TempFileKey::format_date_time(431656320867237891, FormatType::Hour); + assert_eq!(s.to_string(), "07"); } #[test] From 5b8deaaf81c350a3ad44f842b04e9e107fbab3c0 Mon Sep 17 00:00:00 2001 From: haojinming Date: Wed, 13 Jul 2022 19:21:05 +0800 Subject: [PATCH 078/676] BR: flush causal timestamp before backup start for rawkv apiv2 (#12991) ref tikv/migration#138, close tikv/tikv#12989 Signed-off-by: haojinming Co-authored-by: Ping Yu Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/backup/Cargo.toml | 1 + components/backup/src/endpoint.rs | 58 +++++++++++++++++++++++++++++-- components/causal_ts/src/lib.rs | 7 ++++ components/cdc/src/endpoint.rs | 39 +++++++++++++++++++-- components/cdc/src/observer.rs | 15 +------- components/cdc/tests/mod.rs | 1 + components/server/src/server.rs | 46 +++++++++++++++--------- components/test_backup/src/lib.rs | 1 + 9 files changed, 133 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 15da9f000b0..f1d08413c9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -395,6 +395,7 @@ version = "0.0.1" dependencies = [ "api_version", "async-channel", + "causal_ts", "collections", "concurrency_manager", "crc64fast", diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index effe13c4e08..85131c8e68f 100644 --- 
a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -35,6 +35,7 @@ failpoints = ["tikv/failpoints"] [dependencies] api_version = { path = "../api_version", default-features = false } async-channel = "1.4" +causal_ts = { path = "../causal_ts" } collections = { path = "../collections" } concurrency_manager = { path = "../concurrency_manager", default-features = false } crc64fast = "0.1" diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 2a68cbb6bd8..9402879fb5c 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -9,6 +9,7 @@ use std::{ }; use async_channel::SendError; +use causal_ts::CausalTsProvider; use concurrency_manager::ConcurrencyManager; use engine_rocks::raw::DB; use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, SstCompressionType}; @@ -661,6 +662,7 @@ pub struct Endpoint { concurrency_manager: ConcurrencyManager, softlimit: SoftLimitKeeper, api_version: ApiVersion, + causal_ts_provider: Option>, // used in rawkv apiv2 only pub(crate) engine: E, pub(crate) region_info: R, @@ -782,6 +784,7 @@ impl Endpoint { config: BackupConfig, concurrency_manager: ConcurrencyManager, api_version: ApiVersion, + causal_ts_provider: Option>, ) -> Endpoint { let pool = ControlThreadPool::new(); let rt = utils::create_tokio_runtime(config.io_thread_size, "backup-io").unwrap(); @@ -799,6 +802,7 @@ impl Endpoint { config_manager, concurrency_manager, api_version, + causal_ts_provider, } } @@ -962,6 +966,26 @@ impl Endpoint { } return; } + // Flush causal timestamp to make sure that future writes will have larger timestamps. + // And help TiKV-BR acquire a backup-ts with intact data smaller than it. + // (Note that intactness is not fully ensured now, until the safe-ts of RawKV is implemented. + // TiKV-BR need a workaround by rewinding backup-ts to a small "safe interval"). 
+ if request.is_raw_kv { + if let Err(e) = self + .causal_ts_provider + .as_ref() + .map_or(Ok(()), |provider| provider.flush()) + { + error!("backup flush causal timestamp failed"; "err" => ?e); + let mut response = BackupResponse::default(); + let err_msg = format!("fail to flush causal ts, {:?}", e); + response.set_error(crate::Error::Other(box_err!(err_msg)).into()); + if let Err(err) = resp.unbounded_send(response) { + error_unknown!(?err; "backup failed to send response"); + } + return; + } + } let start_key = codec.encode_backup_key(request.start_key.clone()); let end_key = codec.encode_backup_key(request.end_key.clone()); @@ -1198,13 +1222,14 @@ pub mod tests { } pub fn new_endpoint() -> (TempDir, Endpoint) { - new_endpoint_with_limiter(None, ApiVersion::V1, false) + new_endpoint_with_limiter(None, ApiVersion::V1, false, None) } pub fn new_endpoint_with_limiter( limiter: Option>, api_version: ApiVersion, is_raw_kv: bool, + causal_ts_provider: Option>, ) -> (TempDir, Endpoint) { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() @@ -1236,6 +1261,7 @@ pub mod tests { }, concurrency_manager, api_version, + causal_ts_provider, ), ) } @@ -1445,7 +1471,7 @@ pub mod tests { fn test_handle_backup_task() { let limiter = Arc::new(IORateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); - let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false); + let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false, None); let engine = endpoint.engine.clone(); endpoint @@ -1584,7 +1610,7 @@ pub mod tests { fn test_handle_backup_raw_task_impl(cur_api_ver: ApiVersion, dst_api_ver: ApiVersion) -> bool { let limiter = Arc::new(IORateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); - let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), cur_api_ver, true); + let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), cur_api_ver, true, None); let engine = 
endpoint.engine.clone(); let start_key_idx: u64 = 100; @@ -1721,6 +1747,32 @@ pub mod tests { } } + #[test] + fn test_backup_raw_apiv2_causal_ts() { + let limiter = Arc::new(IORateLimiter::new_for_test()); + let ts_provider = Arc::new(causal_ts::tests::TestProvider::default()); + let start_ts = ts_provider.get_ts().unwrap(); + let (tmp, endpoint) = new_endpoint_with_limiter( + Some(limiter), + ApiVersion::V2, + true, + Some(ts_provider.clone()), + ); + + let mut req = BackupRequest::default(); + let (tx, _) = unbounded(); + let tmp1 = make_unique_dir(tmp.path()); + req.set_storage_backend(make_local_backend(&tmp1)); + req.set_start_key(b"r".to_vec()); + req.set_end_key(b"s".to_vec()); + req.set_is_raw_kv(true); + req.set_dst_api_version(ApiVersion::V2); + let (task, _) = Task::new(req, tx).unwrap(); + endpoint.handle_backup_task(task); + let end_ts = ts_provider.get_ts().unwrap(); + assert_eq!(end_ts.into_inner(), start_ts.next().into_inner() + 100); + } + #[test] fn test_scan_error() { let (tmp, endpoint) = new_endpoint(); diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index ea5fe3bdcc3..615f01365cd 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -58,6 +58,13 @@ pub mod tests { fn get_ts(&self) -> Result { Ok(self.ts.fetch_add(1, Ordering::Relaxed).into()) } + + // This is used for unit test. Add 100 from current. + // Do not modify this value as several test cases depend on it. 
+ fn flush(&self) -> Result<()> { + self.ts.fetch_add(100, Ordering::Relaxed); + Ok(()) + } } #[derive(Clone, Default)] diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 7a67c2f9d85..fa6dcb97651 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -8,6 +8,7 @@ use std::{ time::Duration, }; +use causal_ts::CausalTsProvider; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; use crossbeam::atomic::AtomicCell; @@ -413,6 +414,7 @@ pub struct Endpoint { env: Arc, security_mgr: Arc, region_read_progress: RegionReadProgressRegistry, + causal_ts_provider: Option>, // Metrics and logging. current_ts: TimeStamp, @@ -438,6 +440,7 @@ impl, E: KvEngine> Endpoint { env: Arc, security_mgr: Arc, sink_memory_quota: MemoryQuota, + causal_ts_provider: Option>, ) -> Endpoint { let workers = Builder::new_multi_thread() .thread_name("cdcwkr") @@ -508,6 +511,7 @@ impl, E: KvEngine> Endpoint { // Log the first resolved ts warning. 
warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, current_ts: TimeStamp::zero(), + causal_ts_provider, }; ep.register_min_ts_event(); ep @@ -1111,7 +1115,7 @@ impl, E: KvEngine> Endpoint { let tikv_clients = self.tikv_clients.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; let region_read_progress = self.region_read_progress.clone(); - let observer = self.observer.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); let fut = async move { let _ = timeout.compat().await; @@ -1141,7 +1145,7 @@ impl, E: KvEngine> Endpoint { // If flush_causal_timestamp fails, cannot schedule MinTS task // as new coming raw data may use timestamp smaller than min_ts - if let Err(e) = observer.flush_causal_timestamp() { + if let Err(e) = causal_ts_provider.map_or(Ok(()), |provider| provider.flush()) { error!("cdc flush causal timestamp failed"; "err" => ?e); return; } @@ -1473,6 +1477,15 @@ mod tests { cfg: &CdcConfig, engine: Option, api_version: ApiVersion, + ) -> TestEndpointSuite { + mock_endpoint_with_ts_provider(cfg, engine, api_version, None) + } + + fn mock_endpoint_with_ts_provider( + cfg: &CdcConfig, + engine: Option, + api_version: ApiVersion, + causal_ts_provider: Option>, ) -> TestEndpointSuite { let (task_sched, task_rx) = dummy_scheduler(); let raft_router = MockRaftStoreRouter::new(); @@ -1495,6 +1508,7 @@ mod tests { Arc::new(Environment::new(1)), Arc::new(SecurityManager::default()), MemoryQuota::new(usize::MAX), + causal_ts_provider, ); TestEndpointSuite { @@ -2226,6 +2240,27 @@ mod tests { ); } + #[test] + fn test_raw_causal_ts_flush() { + let sleep_interval = Duration::from_secs(1); + let cfg = CdcConfig { + min_ts_interval: ReadableDuration(sleep_interval), + ..Default::default() + }; + let ts_provider = Arc::new(causal_ts::tests::TestProvider::default()); + let start_ts = ts_provider.get_ts().unwrap(); + let mut suite = + mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, 
Some(ts_provider.clone())); + suite.run(Task::RegisterMinTsEvent); + suite + .task_rx + .recv_timeout(Duration::from_millis(1500)) + .unwrap() + .unwrap(); + let end_ts = ts_provider.get_ts().unwrap(); + assert!(end_ts.into_inner() >= start_ts.next().into_inner() + 100); // may trigger more than once. + } + #[test] fn test_feature_gate() { let cfg = CdcConfig { diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 5779d5f7e06..6c0771cbc64 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, RwLock}; -use causal_ts::{CausalTsProvider, Error as CausalTsError, RawTsTracker, Result as CausalTsResult}; +use causal_ts::{Error as CausalTsError, RawTsTracker, Result as CausalTsResult}; use collections::HashMap; use engine_traits::KvEngine; use fail::fail_point; @@ -30,8 +30,6 @@ pub struct CdcObserver { // A shared registry for managing observed regions. // TODO: it may become a bottleneck, find a better way to manage the registry. observe_regions: Arc>>, - - pub causal_ts_provider: Option>, } impl CdcObserver { @@ -43,14 +41,9 @@ impl CdcObserver { CdcObserver { sched, observe_regions: Arc::default(), - causal_ts_provider: None, } } - pub fn set_causal_ts_provider(&mut self, provider: Arc) { - self.causal_ts_provider = Some(provider); - } - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { // use 0 as the priority of the cmd observer. 
CDC should have a higher priority than // the `resolved-ts`'s cmd observer @@ -98,12 +91,6 @@ impl CdcObserver { .get(®ion_id) .cloned() } - - pub fn flush_causal_timestamp(&self) -> CausalTsResult<()> { - self.causal_ts_provider - .as_ref() - .map_or(Ok(()), |provider| provider.flush()) - } } impl Coprocessor for CdcObserver {} diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 6443ffea158..25283951450 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -188,6 +188,7 @@ impl TestSuiteBuilder { env, sim.security_mgr.clone(), MemoryQuota::new(usize::MAX), + None, ); let mut updated_cfg = cfg.clone(); updated_cfg.min_ts_interval = ReadableDuration::millis(100); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 4bd95b1de60..c0ed12bf73c 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -31,6 +31,7 @@ use backup_stream::{ metadata::{ConnectionConfig, LazyEtcdClient}, observer::BackupStreamObserver, }; +use causal_ts::{BatchTsoProvider, CausalTsProvider}; use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; use encryption_export::{data_key_manager_from_config, DataKeyManager}; @@ -139,7 +140,7 @@ const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu #[inline] fn run_impl(config: TiKvConfig) { - let mut tikv = TiKvServer::::init(config); + let mut tikv = TiKvServer::::init::(config); // Must be called after `TiKvServer::init`. 
let memory_limit = tikv.config.memory_usage_limit.unwrap().0; @@ -230,6 +231,7 @@ struct TiKvServer { background_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + causal_ts_provider: Option>>, // used for rawkv apiv2 } struct TiKvEngines { @@ -254,7 +256,7 @@ type LocalServer = type LocalRaftKv = RaftKv>; impl TiKvServer { - fn init(mut config: TiKvConfig) -> TiKvServer { + fn init(mut config: TiKvConfig) -> TiKvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, // because these configs must be provided by command line, and only @@ -310,6 +312,20 @@ impl TiKvServer { config.quota.enable_auto_tune, )); + let mut causal_ts_provider = None; + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + pd_client.clone(), + config.causal_ts.renew_interval.0, + config.causal_ts.renew_batch_min_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + causal_ts_provider = Some(Arc::new(tso.unwrap())); + info!("Causal timestamp provider startup."); + } + TiKvServer { config, cfg_controller: Some(cfg_controller), @@ -335,6 +351,7 @@ impl TiKvServer { flow_info_receiver: None, sst_worker: None, quota_limiter, + causal_ts_provider, } } @@ -778,7 +795,7 @@ impl TiKvServer { } // Register cdc. - let mut cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); + let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. 
cfg_controller.register( @@ -805,21 +822,10 @@ impl TiKvServer { }; // Register causal observer for RawKV API V2 - if let ApiVersion::V2 = F::TAG { - let tso = block_on(causal_ts::BatchTsoProvider::new_opt( - self.pd_client.clone(), - self.config.causal_ts.renew_interval.0, - self.config.causal_ts.renew_batch_min_size, - )); - if let Err(e) = tso { - fatal!("Causal timestamp provider initialize failed: {:?}", e); - } - let causal_ts_provider = Arc::new(tso.unwrap()); - info!("Causal timestamp provider startup."); - cdc_ob.set_causal_ts_provider(causal_ts_provider.clone()); - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider, cdc_ob.clone()); + if let Some(provider) = self.causal_ts_provider.clone() { + let causal_ob = causal_ts::CausalObserver::new(provider, cdc_ob.clone()); causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - } + }; let check_leader_runner = CheckLeaderRunner::new(engines.store_meta.clone()); let check_leader_scheduler = self @@ -1044,6 +1050,9 @@ impl TiKvServer { server.env(), self.security_mgr.clone(), cdc_memory_quota.clone(), + self.causal_ts_provider + .clone() + .map(|provider| provider as Arc), ); cdc_worker.start_with_timer(cdc_endpoint); self.to_stop.push(cdc_worker); @@ -1177,6 +1186,9 @@ impl TiKvServer { self.config.backup.clone(), self.concurrency_manager.clone(), self.config.storage.api_version(), + self.causal_ts_provider + .clone() + .map(|provider| provider as Arc), ); self.cfg_controller.as_mut().unwrap().register( tikv::config::Module::Backup, diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index f8f96b34921..bf14b86dfc8 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -94,6 +94,7 @@ impl TestSuite { }, sim.get_concurrency_manager(*id), api_version, + None, ); let mut worker = bg_worker.lazy_build(format!("backup-{}", id)); worker.start(backup_endpoint); From 23588581613251bdec283764a604197dd00ca377 Mon Sep 17 00:00:00 2001 
From: haojinming Date: Thu, 14 Jul 2022 18:25:06 +0800 Subject: [PATCH 079/676] Backup: Do not fill cache when backup rawkv (#13022) close tikv/tikv#13020 Signed-off-by: haojinming --- components/backup/src/endpoint.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 9402879fb5c..0734af017d2 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -454,6 +454,7 @@ impl BackupRange { let mut cursor = CursorBuilder::new(snapshot, self.cf) .range(None, self.end_key.clone()) .scan_mode(ScanMode::Forward) + .fill_cache(false) .build()?; if let Some(begin) = self.start_key.clone() { if !cursor.seek(&begin, cfstatistics)? { From 1e98feecc6b4550cf0809c1583fb7a07479ecae9 Mon Sep 17 00:00:00 2001 From: Zwb Date: Thu, 14 Jul 2022 19:55:06 +0800 Subject: [PATCH 080/676] Optimize Commit pipeline performance (#12899) close tikv/tikv#12898 Optimize Commit pipeline performance Signed-off-by: Wenbo Zhang Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- Cargo.lock | 6 +- components/backup-stream/src/utils.rs | 1 + components/engine_rocks/src/engine.rs | 11 +- components/engine_rocks/src/misc.rs | 1 + components/engine_rocks/src/raft_engine.rs | 12 +- components/engine_rocks/src/write_batch.rs | 226 +++- components/engine_test/src/lib.rs | 12 +- components/engine_traits_tests/src/lib.rs | 19 + .../engine_traits_tests/src/write_batch.rs | 1133 ++++++++++++++++- components/raftstore/src/store/fsm/apply.rs | 1 + src/config.rs | 2 + src/server/debug.rs | 53 +- src/server/gc_worker/compaction_filter.rs | 6 +- src/server/reset_to_version.rs | 6 +- .../misc/writebatch/bench_writebatch.rs | 37 +- tests/failpoints/cases/test_gc_worker.rs | 4 +- 16 files changed, 1430 insertions(+), 100 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1d08413c9c..d08e8fc3b25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2747,7 +2747,7 @@ dependencies = [ [[package]] name = 
"librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" +source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2766,7 +2766,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" +source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" dependencies = [ "bzip2-sys", "cc", @@ -4571,7 +4571,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#d8b7ff8aee62aa9a406b64f7093049d62eeb9a1a" +source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" dependencies = [ "libc 0.2.125", "librocksdb_sys", diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 725a1c17f51..678b571f3b5 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -647,6 +647,7 @@ mod test { let p = TempDir::new("test_db").unwrap(); let mut opt = DBOptions::default(); opt.create_if_missing(true); + opt.enable_multi_write_batch(true); let db = DB::open(opt.clone(), p.path().as_os_str().to_str().unwrap()).unwrap(); let engine = RocksEngine::from_db(Arc::new(db)); let mut wb = engine.write_batch(); diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 60be2007367..33af3b78036 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -26,16 +26,21 @@ use crate::{ pub struct RocksEngine { db: Arc, shared_block_cache: bool, + support_multi_batch_write: bool, } impl RocksEngine { pub fn from_db(db: Arc) -> Self { RocksEngine { - db, + db: db.clone(), shared_block_cache: false, + support_multi_batch_write: 
db.get_db_options().is_enable_multi_batch_write(), } } + // Notice: After obtaining RocksEngine through this method, please make sure + // it has been initialized with db, otherwise do not call its member methods, + // as it'll contain garbage members. pub fn from_ref(db: &Arc) -> &Self { unsafe { &*(db as *const Arc as *const RocksEngine) } } @@ -63,6 +68,10 @@ impl RocksEngine { pub fn set_shared_block_cache(&mut self, enable: bool) { self.shared_block_cache = enable; } + + pub fn support_multi_batch_write(&self) -> bool { + self.support_multi_batch_write + } } impl KvEngine for RocksEngine { diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 0ae93fe34df..d7741e98c26 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -567,6 +567,7 @@ mod tests { let mut opts = DBOptions::new(); opts.create_if_missing(true); + opts.enable_multi_batch_write(true); let mut cf_opts = ColumnFamilyOptions::new(); // Prefix extractor(trim the timestamp at tail) for write cf. diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 19ceea3062c..2f67904486f 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -14,7 +14,7 @@ use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{box_err, box_try}; -use crate::{util, RocksEngine, RocksWriteBatch}; +use crate::{util, RocksEngine, RocksWriteBatchVec}; impl RaftEngineReadOnly for RocksEngine { fn get_raft_state(&self, raft_group_id: u64) -> Result> { @@ -176,7 +176,7 @@ impl RocksEngine { raft_group_id: u64, mut from: u64, to: u64, - raft_wb: &mut RocksWriteBatch, + raft_wb: &mut RocksWriteBatchVec, ) -> Result { if from == 0 { let start_key = keys::raft_log_key(raft_group_id, 0); @@ -207,10 +207,10 @@ impl RocksEngine { // for all KvEngines, but is currently implemented separately for // every engine. 
impl RaftEngine for RocksEngine { - type LogBatch = RocksWriteBatch; + type LogBatch = RocksWriteBatchVec; fn log_batch(&self, capacity: usize) -> Self::LogBatch { - RocksWriteBatch::with_capacity(self, capacity) + RocksWriteBatchVec::with_unit_capacity(self, capacity) } fn sync(&self) -> Result<()> { @@ -368,7 +368,7 @@ impl RaftEngine for RocksEngine { } } -impl RaftLogBatch for RocksWriteBatch { +impl RaftLogBatch for RocksWriteBatchVec { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); @@ -421,7 +421,7 @@ impl RaftLogBatch for RocksWriteBatch { } } -impl RocksWriteBatch { +impl RocksWriteBatchVec { fn append_impl( &mut self, raft_group_id: u64, diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index e9428b2c291..1aa5c424521 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -7,126 +7,215 @@ use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; use crate::{engine::RocksEngine, options::RocksWriteOptions, util::get_cf_handle}; +const WRITE_BATCH_MAX_BATCH: usize = 16; +const WRITE_BATCH_LIMIT: usize = 16; + impl WriteBatchExt for RocksEngine { - type WriteBatch = RocksWriteBatch; + type WriteBatch = RocksWriteBatchVec; const WRITE_BATCH_MAX_KEYS: usize = 256; - fn write_batch(&self) -> RocksWriteBatch { - RocksWriteBatch::new(self.as_inner().clone()) + fn write_batch(&self) -> RocksWriteBatchVec { + RocksWriteBatchVec::new( + Arc::clone(self.as_inner()), + WRITE_BATCH_LIMIT, + 1, + self.support_multi_batch_write(), + ) } - fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatch { - RocksWriteBatch::with_capacity(self, cap) + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity(self, cap) } } -pub struct RocksWriteBatch { +/// `RocksWriteBatchVec` 
is for method `MultiBatchWrite` of RocksDB, which splits a large WriteBatch +/// into many smaller ones and then any thread could help to deal with these small WriteBatch when it +/// is calling `MultiBatchCommit` and wait the front writer to finish writing. `MultiBatchWrite` will +/// perform much better than traditional `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more stable and becomes compatible +/// with Titan. +pub struct RocksWriteBatchVec { db: Arc, - wb: RawWriteBatch, + wbs: Vec, + save_points: Vec, + index: usize, + batch_size_limit: usize, + support_write_batch_vec: bool, } -impl RocksWriteBatch { - pub fn new(db: Arc) -> RocksWriteBatch { - let wb = RawWriteBatch::new(); - RocksWriteBatch { db, wb } - } - - pub fn with_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatch { +impl RocksWriteBatchVec { + pub fn new( + db: Arc, + batch_size_limit: usize, + cap: usize, + support_write_batch_vec: bool, + ) -> RocksWriteBatchVec { let wb = RawWriteBatch::with_capacity(cap); - RocksWriteBatch { - db: engine.as_inner().clone(), - wb, + RocksWriteBatchVec { + db, + wbs: vec![wb], + save_points: vec![], + index: 0, + batch_size_limit, + support_write_batch_vec, } } - pub fn as_inner(&self) -> &RawWriteBatch { - &self.wb + pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + Self::new( + engine.as_inner().clone(), + WRITE_BATCH_LIMIT, + cap, + engine.support_multi_batch_write(), + ) } - pub fn as_raw(&self) -> &RawWriteBatch { - &self.wb + pub fn as_inner(&self) -> &[RawWriteBatch] { + &self.wbs[0..=self.index] } pub fn get_db(&self) -> &DB { self.db.as_ref() } + + /// `check_switch_batch` will split a large WriteBatch into many smaller ones. This is to avoid + /// a large WriteBatch blocking write_thread too long. 
+ #[inline(always)] + fn check_switch_batch(&mut self) { + if self.support_write_batch_vec + && self.batch_size_limit > 0 + && self.wbs[self.index].count() >= self.batch_size_limit + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(RawWriteBatch::default()); + } + } + } } -impl engine_traits::WriteBatch for RocksWriteBatch { +impl engine_traits::WriteBatch for RocksWriteBatchVec { fn write_opt(&self, opts: &WriteOptions) -> Result<()> { let opt: RocksWriteOptions = opts.into(); - self.get_db() - .write_opt(&self.wb, &opt.into_raw()) - .map_err(Error::Engine) + if self.index > 0 { + self.get_db() + .multi_batch_write(self.as_inner(), &opt.into_raw()) + .map_err(Error::Engine) + } else { + self.get_db() + .write_opt(&self.wbs[0], &opt.into_raw()) + .map_err(Error::Engine) + } } fn data_size(&self) -> usize { - self.wb.data_size() + let mut size: usize = 0; + for i in 0..=self.index { + size += self.wbs[i].data_size(); + } + size } fn count(&self) -> usize { - self.wb.count() + self.wbs[self.index].count() + self.index * self.batch_size_limit } fn is_empty(&self) -> bool { - self.wb.is_empty() + self.wbs[0].is_empty() } fn should_write_to_engine(&self) -> bool { - self.count() > RocksEngine::WRITE_BATCH_MAX_KEYS + if self.support_write_batch_vec { + self.index >= WRITE_BATCH_MAX_BATCH + } else { + self.wbs[0].count() > RocksEngine::WRITE_BATCH_MAX_KEYS + } } fn clear(&mut self) { - self.wb.clear(); + for i in 0..=self.index { + self.wbs[i].clear(); + } + self.save_points.clear(); + // Avoid making the wbs too big at one time, then the memory will be kept + // after reusing + if self.index > WRITE_BATCH_MAX_BATCH + 1 { + self.wbs.shrink_to(WRITE_BATCH_MAX_BATCH + 1); + } + self.index = 0; } fn set_save_point(&mut self) { - self.wb.set_save_point(); + self.wbs[self.index].set_save_point(); + self.save_points.push(self.index); } fn pop_save_point(&mut self) -> Result<()> { - self.wb.pop_save_point().map_err(Error::Engine) + if let Some(x) = 
self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(Error::Engine); + } + Err(Error::Engine("no save point".into())) } fn rollback_to_save_point(&mut self) -> Result<()> { - self.wb.rollback_to_save_point().map_err(Error::Engine) + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(Error::Engine); + } + Err(Error::Engine("no save point".into())) } fn merge(&mut self, other: Self) -> Result<()> { - self.wb.append(other.wb.data()); + for wb in other.as_inner() { + self.check_switch_batch(); + self.wbs[self.index].append(wb.data()); + } Ok(()) } } -impl Mutable for RocksWriteBatch { +impl Mutable for RocksWriteBatchVec { fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { - self.wb.put(key, value).map_err(Error::Engine) + self.check_switch_batch(); + self.wbs[self.index].put(key, value).map_err(Error::Engine) } fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb.put_cf(handle, key, value).map_err(Error::Engine) + self.wbs[self.index] + .put_cf(handle, key, value) + .map_err(Error::Engine) } fn delete(&mut self, key: &[u8]) -> Result<()> { - self.wb.delete(key).map_err(Error::Engine) + self.check_switch_batch(); + self.wbs[self.index].delete(key).map_err(Error::Engine) } fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb.delete_cf(handle, key).map_err(Error::Engine) + self.wbs[self.index] + .delete_cf(handle, key) + .map_err(Error::Engine) } fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - self.wb + self.check_switch_batch(); + self.wbs[self.index] .delete_range(begin_key, end_key) .map_err(Error::Engine) } fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> 
Result<()> { + self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wb + self.wbs[self.index] .delete_range_cf(handle, begin_key, end_key) .map_err(Error::Engine) } @@ -144,20 +233,27 @@ mod tests { }; #[test] - fn test_should_write_to_engine() { + fn test_should_write_to_engine_with_pipeline_write_mode() { let path = Builder::new() .prefix("test-should-write-to-engine") .tempdir() .unwrap(); let opt = RawDBOptions::default(); opt.enable_unordered_write(false); - opt.enable_pipelined_write(false); + opt.enable_pipelined_write(true); + opt.enable_multi_batch_write(false); let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), RocksDBOptions::from_raw(opt), vec![], ) .unwrap(); + assert!( + !engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); let mut wb = engine.write_batch(); for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); @@ -166,16 +262,58 @@ mod tests { wb.put(b"aaa", b"bbb").unwrap(); assert!(wb.should_write_to_engine()); wb.write().unwrap(); + let v = engine.get_value(b"aaa").unwrap(); + assert!(v.is_some()); assert_eq!(v.unwrap(), b"bbb"); - let mut wb = RocksWriteBatch::with_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(false); + opt.enable_multi_batch_write(true); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDBOptions::from_raw(opt), + 
vec![], + ) + .unwrap(); + assert!( + engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = engine.write_batch(); for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); } assert!(!wb.should_write_to_engine()); wb.put(b"aaa", b"bbb").unwrap(); assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..WRITE_BATCH_MAX_BATCH * WRITE_BATCH_LIMIT { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); wb.clear(); assert!(!wb.should_write_to_engine()); } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index f7fd904fd1c..b670ef34500 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -88,7 +88,7 @@ pub mod kv { #[cfg(feature = "test-engine-kv-rocksdb")] pub use engine_rocks::{ RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, - RocksSnapshot as KvTestSnapshot, RocksWriteBatch as KvTestWriteBatch, + RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; use engine_traits::{Result, TabletAccessor, TabletFactory}; use tikv_util::box_err; @@ -363,6 +363,7 @@ pub mod ctor { pub struct DBOptions { key_manager: Option>, rate_limiter: Option>, + enable_multi_batch_write: bool, } impl DBOptions { @@ -373,6 +374,10 @@ pub mod ctor { pub fn set_rate_limiter(&mut self, rate_limiter: Option>) { self.rate_limiter = rate_limiter; } + + pub fn set_enable_multi_batch_write(&mut self, enable: bool) { + self.enable_multi_batch_write = enable; + } } pub type RaftDBOptions = DBOptions; @@ -644,6 +649,11 @@ pub mod ctor { let mut rocks_db_opts = RawRocksDBOptions::new(); let env = get_env(db_opts.key_manager.clone(), db_opts.rate_limiter)?; rocks_db_opts.set_env(env); + if db_opts.enable_multi_batch_write { + 
rocks_db_opts.enable_unordered_write(false); + rocks_db_opts.enable_pipelined_write(false); + rocks_db_opts.enable_multi_batch_write(true); + } let rocks_db_opts = RocksDBOptions::from_raw(rocks_db_opts); Ok(rocks_db_opts) } diff --git a/components/engine_traits_tests/src/lib.rs b/components/engine_traits_tests/src/lib.rs index 49fe26b4f4d..0ddb39c61ac 100644 --- a/components/engine_traits_tests/src/lib.rs +++ b/components/engine_traits_tests/src/lib.rs @@ -71,6 +71,25 @@ fn default_engine() -> TempDirEnginePair { } } +/// Create a multi batch write engine with only CF_DEFAULT +fn multi_batch_write_engine() -> TempDirEnginePair { + use engine_test::{ + ctor::{DBOptions as KvTestDBOptions, KvEngineConstructorExt}, + kv::KvTestEngine, + }; + use engine_traits::CF_DEFAULT; + + let dir = tempdir(); + let path = dir.path().to_str().unwrap(); + let mut opt = KvTestDBOptions::default(); + opt.set_enable_multi_batch_write(true); + let engine = KvTestEngine::new_kv_engine(path, Some(opt), &[CF_DEFAULT], None).unwrap(); + TempDirEnginePair { + engine, + tempdir: dir, + } +} + /// Create an engine with the specified column families fn engine_cfs(cfs: &[&str]) -> TempDirEnginePair { use engine_test::{ctor::KvEngineConstructorExt, kv::KvTestEngine}; diff --git a/components/engine_traits_tests/src/write_batch.rs b/components/engine_traits_tests/src/write_batch.rs index 0210dee3806..dc966cf03b6 100644 --- a/components/engine_traits_tests/src/write_batch.rs +++ b/components/engine_traits_tests/src/write_batch.rs @@ -4,13 +4,17 @@ use engine_test::kv::KvTestEngine; use engine_traits::{Mutable, Peekable, SyncMutable, WriteBatch, WriteBatchExt}; use panic_hook::recover_safe; -use super::{assert_engine_error, default_engine}; +use super::{assert_engine_error, default_engine, multi_batch_write_engine}; #[test] fn write_batch_none_no_commit() { let db = default_engine(); let wb = db.engine.write_batch(); drop(wb); + + let db = multi_batch_write_engine(); + let wb = 
db.engine.write_batch_with_cap(1024); + drop(wb); } #[test] @@ -18,6 +22,10 @@ fn write_batch_none() { let db = default_engine(); let wb = db.engine.write_batch(); wb.write().unwrap(); + + let db = multi_batch_write_engine(); + let wb = db.engine.write_batch_with_cap(1024); + wb.write().unwrap(); } #[test] @@ -31,6 +39,28 @@ fn write_batch_put() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + for i in 128..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..256_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -46,6 +76,33 @@ fn write_batch_delete() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_none()); + + let db = multi_batch_write_engine(); + + for i in 0..127_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + db.engine.put(b"a", b"aa").unwrap(); + for i in 127..255_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..255_usize { + let k = i.to_be_bytes(); + wb.delete(&k).unwrap(); + } + wb.delete(b"a").unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..255_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -60,6 +117,25 @@ fn write_batch_write_twice_1() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..123_usize { + let x = i.to_be_bytes(); + 
wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..123_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -78,6 +154,40 @@ fn write_batch_write_twice_2() { wb.write().unwrap(); assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + + db.engine.put(b"a", b"b").unwrap(); + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"b"); + + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + db.engine.put(&k, &v).unwrap(); + } + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + assert_eq!(db.engine.get_value(&k).unwrap().unwrap(), &v); + } + + wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + for i in 0..128_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -95,6 +205,37 @@ fn write_batch_write_twice_3() { assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); assert_eq!(db.engine.get_value(b"b").unwrap().unwrap(), b"bb"); + + let db = multi_batch_write_engine(); + + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..128_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"aa").unwrap(); + + wb.write().unwrap(); + for i in 0..128_usize { + let k = i.to_be_bytes(); + let v = (2 * i + 1).to_be_bytes(); + db.engine.put(&k, &v).unwrap(); + } + db.engine.put(b"a", b"b").unwrap(); + for i in 128..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"b", b"bb").unwrap(); + 
wb.write().unwrap(); + + assert_eq!(db.engine.get_value(b"a").unwrap().unwrap(), b"aa"); + assert_eq!(db.engine.get_value(b"b").unwrap().unwrap(), b"bb"); + for i in 0..256_usize { + let x = i.to_be_bytes(); + assert_eq!(db.engine.get_value(&x).unwrap().unwrap(), &x); + } } #[test] @@ -117,6 +258,43 @@ fn write_batch_delete_range_basic() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&32_usize.to_be_bytes(), &128_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + for i in 0..32_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_some()); + } + for i in 32..128_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_none()); + } + for i in 128..256_usize { + let x = i.to_be_bytes(); + assert!(db.engine.get_value(&x).unwrap().is_some()); + } } #[test] @@ -141,6 +319,54 @@ fn write_batch_delete_range_inexact() { assert!(db.engine.get_value(b"e").unwrap().is_none()); assert!(db.engine.get_value(b"f").unwrap().is_none()); assert!(db.engine.get_value(b"g").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); 
+ db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + db.engine.put(b"g", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in (0..256_usize).step_by(2) { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"f").unwrap(); + wb.delete_range(&0_usize.to_be_bytes(), &252_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_none()); + assert!(db.engine.get_value(b"f").unwrap().is_none()); + assert!(db.engine.get_value(b"g").unwrap().is_some()); + for i in 0..252_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + assert!( + db.engine + .get_value(&252_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&253_usize.to_be_bytes()) + .unwrap() + .is_none() + ); + assert!( + db.engine + .get_value(&254_usize.to_be_bytes()) + .unwrap() + .is_some() + ); } #[test] @@ -161,6 +387,43 @@ fn write_batch_delete_range_after_put() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &255_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.write().unwrap(); + + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 
1..255_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); } #[test] @@ -180,6 +443,37 @@ fn write_batch_delete_range_none() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -203,6 +497,43 @@ fn write_batch_delete_range_twice() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", 
b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + + let mut wb = db.engine.write_batch_with_cap(1024); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -226,6 +557,43 @@ fn write_batch_delete_range_twice_1() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + 
assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -251,6 +619,49 @@ fn write_batch_delete_range_twice_2() { assert!(db.engine.get_value(b"c").unwrap().is_none()); assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + db.engine.put(b"e", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + db.engine.put(b"c", b"").unwrap(); + for i in 64..128_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + wb.delete_range(b"b", b"e").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &256_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_none()); + assert!(db.engine.get_value(b"c").unwrap().is_none()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + for i in 1..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -269,6 +680,30 @@ fn write_batch_delete_range_empty_range() { assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); 
assert!(db.engine.get_value(b"c").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"b", b"b").unwrap(); + wb.delete_range(&1_usize.to_be_bytes(), &1_usize.to_be_bytes()) + .unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + assert!(db.engine.get_value(b"c").unwrap().is_some()); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -292,6 +727,37 @@ fn write_batch_delete_range_backward_range() { assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); assert!(db.engine.get_value(b"c").unwrap().is_some()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.delete_range(b"c", b"a").unwrap(); + wb.delete_range(&256_usize.to_be_bytes(), &0_usize.to_be_bytes()) + .unwrap(); + + assert!( + recover_safe(|| { + wb.write().unwrap(); + }) + .is_err() + ); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + assert!(db.engine.get_value(b"c").unwrap().is_some()); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -334,6 +800,54 @@ fn write_batch_delete_range_backward_range_partial_commit() { assert!(db.engine.get_value(b"d").unwrap().is_none()); assert!(db.engine.get_value(b"e").unwrap().is_some()); 
assert!(db.engine.get_value(b"f").unwrap().is_none()); + + let db = multi_batch_write_engine(); + + db.engine.put(b"a", b"").unwrap(); + db.engine.put(b"b", b"").unwrap(); + db.engine.put(b"c", b"").unwrap(); + db.engine.put(b"d", b"").unwrap(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + db.engine.put(&x, &x).unwrap(); + } + + let mut wb = db.engine.write_batch_with_cap(1024); + + // Everything in the write batch before the panic + // due to bad range is going to end up committed. + // + // NB: This behavior seems pretty questionable and + // should probably be re-evaluated before other engines + // try to emulate it. + // + // A more reasonable solution might be to have a bogus + // delete_range request immediately panic. + wb.put(b"e", b"").unwrap(); + wb.delete(b"d").unwrap(); + wb.delete_range(b"c", b"a").unwrap(); + wb.put(b"f", b"").unwrap(); + wb.delete(b"a").unwrap(); + wb.delete_range(&128_usize.to_be_bytes(), &64_usize.to_be_bytes()) + .unwrap(); + wb.put(&256_usize.to_be_bytes(), b"").unwrap(); + for i in 0..64_usize { + wb.delete(&i.to_be_bytes()).unwrap(); + } + + assert!( + recover_safe(|| { + wb.write().unwrap(); + }) + .is_err() + ); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"b").unwrap().is_some()); + assert!(db.engine.get_value(b"c").unwrap().is_some()); + assert!(db.engine.get_value(b"d").unwrap().is_none()); + assert!(db.engine.get_value(b"e").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_none()); } #[test] @@ -346,6 +860,18 @@ fn write_batch_is_empty() { assert!(!wb.is_empty()); wb.write().unwrap(); assert!(!wb.is_empty()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + assert!(wb.is_empty()); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + assert!(!wb.is_empty()); + wb.write().unwrap(); + assert!(!wb.is_empty()); } #[test] @@ -358,6 +884,17 @@ fn write_batch_count() { 
assert_eq!(wb.count(), 1); wb.write().unwrap(); assert_eq!(wb.count(), 1); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + assert_eq!(wb.count(), 0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + assert_eq!(wb.count(), 256); + wb.write().unwrap(); + assert_eq!(wb.count(), 256); } #[test] @@ -374,6 +911,23 @@ fn write_batch_count_2() { assert_eq!(wb.count(), 3); wb.write().unwrap(); assert_eq!(wb.count(), 3); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + assert_eq!(wb.count(), 0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + assert_eq!(wb.count(), 257); + wb.delete(b"a").unwrap(); + assert_eq!(wb.count(), 258); + wb.delete_range(b"a", b"b").unwrap(); + assert_eq!(wb.count(), 259); + wb.write().unwrap(); + assert_eq!(wb.count(), 259); } #[test] @@ -388,6 +942,21 @@ fn write_batch_clear() { assert_eq!(wb.count(), 0); wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.clear(); + assert!(wb.is_empty()); + assert_eq!(wb.count(), 0); + wb.write().unwrap(); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -403,6 +972,40 @@ fn cap_zero() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"f").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(0); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + 
wb.put(b"f", b"").unwrap(); + wb.write().unwrap(); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&123_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_some()); } /// Write batch capacity seems to just be a suggestions @@ -419,6 +1022,41 @@ fn cap_two() { wb.write().unwrap(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"f").unwrap().is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(2); + + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.put(b"c", b"").unwrap(); + wb.put(b"d", b"").unwrap(); + wb.put(b"e", b"").unwrap(); + wb.put(b"f", b"").unwrap(); + wb.write().unwrap(); + assert!( + db.engine + .get_value(&0_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&123_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!( + db.engine + .get_value(&255_usize.to_be_bytes()) + .unwrap() + .is_some() + ); + assert!(db.engine.get_value(b"a").unwrap().is_some()); + assert!(db.engine.get_value(b"f").unwrap().is_some()); } // We should write when count is greater than WRITE_BATCH_MAX_KEYS @@ -441,6 +1079,24 @@ fn should_write_to_engine() { break; } } + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = KvTestEngine::WRITE_BATCH_MAX_KEYS; + + let mut key = vec![]; + loop { + key.push(b'a'); + wb.put(&key, b"").unwrap(); + if key.len() <= max_keys { + assert!(!wb.should_write_to_engine()); + } + if key.len() == max_keys + 1 { + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + break; + } + } } // But there 
kind of aren't consequences for making huge write batches @@ -475,6 +1131,37 @@ fn should_write_to_engine_but_whatever() { break; } } + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = KvTestEngine::WRITE_BATCH_MAX_KEYS; + + let mut key = vec![]; + + loop { + key.push(b'a'); + wb.put(&key, b"").unwrap(); + if key.len() <= max_keys { + assert!(!wb.should_write_to_engine()); + } + if key.len() > max_keys { + assert!(wb.should_write_to_engine()); + } + if key.len() == max_keys * 2 { + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + break; + } + } + + let mut key = vec![]; + loop { + key.push(b'a'); + assert!(db.engine.get_value(&key).unwrap().is_some()); + if key.len() == max_keys * 2 { + break; + } + } } #[test] @@ -504,6 +1191,43 @@ fn data_size() { wb.clear(); let size8 = wb.data_size(); assert_eq!(size8, size1); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + let size1 = wb.data_size(); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + let size2 = wb.data_size(); + assert!(size1 < size2); + wb.write().unwrap(); + let size3 = wb.data_size(); + assert_eq!(size2, size3); + wb.clear(); + let size4 = wb.data_size(); + assert_eq!(size4, size1); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + let size5 = wb.data_size(); + assert!(size4 < size5); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.delete(&x).unwrap(); + } + let size6 = wb.data_size(); + assert!(size5 < size6); + wb.delete_range(&0_usize.to_be_bytes(), &(max_keys * 2).to_be_bytes()) + .unwrap(); + let size7 = wb.data_size(); + assert!(size6 < size7); + wb.clear(); + let size8 = wb.data_size(); + assert_eq!(size8, size1); } #[test] @@ -513,6 +1237,12 @@ fn save_point_rollback_none() { let err = wb.rollback_to_save_point(); assert_engine_error(err); + + let db = 
multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); } #[test] @@ -522,14 +1252,40 @@ fn save_point_pop_none() { let err = wb.rollback_to_save_point(); assert_engine_error(err); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); } -#[test] -fn save_point_rollback_one() { - let db = default_engine(); - let mut wb = db.engine.write_batch(); +#[test] +fn save_point_rollback_one() { + let db = default_engine(); + let mut wb = db.engine.write_batch(); + + wb.set_save_point(); + wb.put(b"a", b"").unwrap(); + + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let val = db.engine.get_value(b"a").unwrap(); + assert!(val.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); wb.set_save_point(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } wb.put(b"a", b"").unwrap(); wb.rollback_to_save_point().unwrap(); @@ -539,6 +1295,9 @@ fn save_point_rollback_one() { let err = wb.pop_save_point(); assert_engine_error(err); wb.write().unwrap(); + for i in 0..256_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_none()); } @@ -565,6 +1324,39 @@ fn save_point_rollback_two() { assert!(a.is_none()); let b = db.engine.get_value(b"b").unwrap(); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + for i in 
max_keys..2 * max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"b", b"").unwrap(); + + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let a = db.engine.get_value(b"a").unwrap(); + assert!(a.is_none()); + let b = db.engine.get_value(b"b").unwrap(); + assert!(b.is_none()); + for i in 0..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -582,6 +1374,35 @@ fn save_point_rollback_partial() { assert!(a.is_some()); let b = db.engine.get_value(b"b").unwrap(); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + wb.put(b"b", b"").unwrap(); + for i in max_keys..2 * max_keys { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + let a = db.engine.get_value(b"a").unwrap(); + assert!(a.is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + let b = db.engine.get_value(b"b").unwrap(); + assert!(b.is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -606,6 +1427,38 @@ fn save_point_pop_rollback() { assert!(val.is_none()); let val = db.engine.get_value(b"b").unwrap(); assert!(val.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + + wb.set_save_point(); + for i in 0..256_usize { + let x = i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + wb.set_save_point(); + for i in 0..256_usize { + let x = 
i.to_be_bytes(); + wb.put(&x, &x).unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.pop_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + let err = wb.rollback_to_save_point(); + assert_engine_error(err); + let err = wb.pop_save_point(); + assert_engine_error(err); + wb.write().unwrap(); + let val = db.engine.get_value(b"a").unwrap(); + assert!(val.is_none()); + let val = db.engine.get_value(b"b").unwrap(); + assert!(val.is_none()); + for i in 0..512_usize { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -631,6 +1484,41 @@ fn save_point_rollback_after_write() { let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + db.engine.delete(b"a").unwrap(); + for i in 0..max_keys { + db.engine.delete(&i.to_be_bytes()).unwrap(); + } + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -655,6 +1543,38 @@ fn save_point_same_rollback_one() { assert!(a.is_some()); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.set_save_point(); + 
wb.set_save_point(); + wb.set_save_point(); + + wb.put(b"b", b"").unwrap(); + for i in max_keys..2 * max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + assert!(db.engine.get_value(b"b").unwrap().is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -684,6 +1604,43 @@ fn save_point_same_rollback_all() { assert!(a.is_some()); assert!(b.is_none()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + wb.put(b"a", b"").unwrap(); + + wb.set_save_point(); + wb.set_save_point(); + wb.set_save_point(); + + wb.put(b"b", b"").unwrap(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + wb.rollback_to_save_point().unwrap(); + + assert_engine_error(wb.pop_save_point()); + assert_engine_error(wb.rollback_to_save_point()); + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + assert!(db.engine.get_value(b"b").unwrap().is_none()); + for i in max_keys..2 * max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } } #[test] @@ -709,6 +1666,41 @@ fn save_point_pop_after_write() { let val = db.engine.get_value(b"a").unwrap(); assert!(val.is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + wb.set_save_point(); + wb.put(b"a", b"").unwrap(); + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), 
b"").unwrap(); + } + + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + + db.engine.delete(b"a").unwrap(); + for i in 0..max_keys { + db.engine.delete(&i.to_be_bytes()).unwrap(); + } + + assert!(db.engine.get_value(b"a").unwrap().is_none()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_none()); + } + + wb.pop_save_point().unwrap(); + wb.write().unwrap(); + + assert!(db.engine.get_value(b"a").unwrap().is_some()); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } } #[test] @@ -733,6 +1725,42 @@ fn save_point_all_commands() { assert!(a.is_some()); assert!(b.is_none()); assert!(d.is_some()); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + for i in 0..max_keys / 2 { + db.engine.put(&i.to_be_bytes(), b"").unwrap(); + } + db.engine.put(b"a", b"").unwrap(); + for i in max_keys / 2..max_keys { + db.engine.put(&i.to_be_bytes(), b"").unwrap(); + } + db.engine.put(b"d", b"").unwrap(); + + wb.set_save_point(); + for i in 0..max_keys / 2 { + wb.delete(&i.to_be_bytes()).unwrap(); + } + wb.delete(b"a").unwrap(); + wb.put(b"b", b"").unwrap(); + wb.delete_range(b"c", b"e").unwrap(); + wb.delete_range(&(max_keys / 3).to_be_bytes(), &(2 * max_keys).to_be_bytes()) + .unwrap(); + + wb.rollback_to_save_point().unwrap(); + wb.write().unwrap(); + + let a = db.engine.get_value(b"a").unwrap(); + let b = db.engine.get_value(b"b").unwrap(); + let d = db.engine.get_value(b"d").unwrap(); + for i in 0..max_keys { + assert!(db.engine.get_value(&i.to_be_bytes()).unwrap().is_some()); + } + assert!(a.is_some()); + assert!(b.is_none()); + assert!(d.is_some()); } // What happens to the count() and is_empty() methods @@ -824,4 +1852,99 @@ fn save_points_and_counts() { assert_eq!(wb.is_empty(), true); 
assert_eq!(wb.count(), 0); + + let db = multi_batch_write_engine(); + let mut wb = db.engine.write_batch_with_cap(1024); + let max_keys = 256_usize; + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.rollback_to_save_point().unwrap(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.pop_save_point().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.clear(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.write().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.rollback_to_save_point().unwrap(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + wb.set_save_point(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); + + for i in 0..max_keys { + wb.put(&i.to_be_bytes(), b"").unwrap(); + } + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.write().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.pop_save_point().unwrap(); + + assert_eq!(wb.is_empty(), false); + assert_eq!(wb.count(), max_keys); + + wb.clear(); + + assert_eq!(wb.is_empty(), true); + assert_eq!(wb.count(), 0); } diff --git 
a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 9c2e548f10e..aa57676925c 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -435,6 +435,7 @@ where priority: Priority, ) -> ApplyContext { let kv_wb = engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); + ApplyContext { tag, timer: None, diff --git a/src/config.rs b/src/config.rs index 239c80a62ab..580e91712de 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1171,6 +1171,8 @@ impl DbConfig { self.use_direct_io_for_flush_and_compaction, ); opts.enable_pipelined_write(self.enable_pipelined_write); + let enable_multi_batch_write = !self.enable_pipelined_write && !self.enable_unordered_write; + opts.enable_multi_batch_write(enable_multi_batch_write); opts.enable_unordered_write(self.enable_unordered_write); opts.set_info_log(RocksdbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); diff --git a/src/server/debug.rs b/src/server/debug.rs index e5d6eba617f..d10f58cc2ad 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -13,7 +13,7 @@ use collections::HashSet; use engine_rocks::{ raw::{CompactOptions, DBBottommostLevelCompaction, DB}, util::get_cf_handle, - Compat, RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatch, + Compat, RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatchVec, }; use engine_traits::{ Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, @@ -577,11 +577,11 @@ impl Debugger { let msg = format!("Store {} in the failed list", store_id); return Err(Error::Other(msg.into())); } - let mut wb = RocksWriteBatch::new(self.engines.kv.as_inner().clone()); + let mut wb = self.engines.kv.write_batch(); let store_ids = HashSet::::from_iter(store_ids); { - let remove_stores = |key: &[u8], value: &[u8], kv_wb: &mut RocksWriteBatch| { + let remove_stores = |key: &[u8], value: &[u8], kv_wb: &mut 
RocksWriteBatchVec| { let (_, suffix_type) = box_try!(keys::decode_region_meta_key(key)); if suffix_type != keys::REGION_STATE_SUFFIX { return Ok(()); @@ -1010,7 +1010,7 @@ fn recover_mvcc_for_range( let wb_limit: usize = 10240; loop { - let mut wb = RocksWriteBatch::new(db.clone()); + let mut wb = db.c().write_batch(); mvcc_checker.check_mvcc(&mut wb, Some(wb_limit))?; let batch_size = wb.count(); @@ -1102,7 +1102,7 @@ impl MvccChecker { } } - pub fn check_mvcc(&mut self, wb: &mut RocksWriteBatch, limit: Option) -> Result<()> { + pub fn check_mvcc(&mut self, wb: &mut RocksWriteBatchVec, limit: Option) -> Result<()> { loop { // Find min key in the 3 CFs. let mut key = MvccChecker::min_key(None, &self.default_iter, |k| { @@ -1124,7 +1124,7 @@ impl MvccChecker { } } - fn check_mvcc_key(&mut self, wb: &mut RocksWriteBatch, key: &[u8]) -> Result<()> { + fn check_mvcc_key(&mut self, wb: &mut RocksWriteBatchVec, key: &[u8]) -> Result<()> { self.scan_count += 1; if self.scan_count % 1_000_000 == 0 { info!( @@ -1292,7 +1292,7 @@ impl MvccChecker { fn delete( &mut self, - wb: &mut RocksWriteBatch, + wb: &mut RocksWriteBatchVec, cf: &str, key: &[u8], ts: Option, @@ -1333,7 +1333,7 @@ fn set_region_tombstone( db: &Arc, store_id: u64, region: Region, - wb: &mut RocksWriteBatch, + wb: &mut RocksWriteBatchVec, ) -> Result<()> { let id = region.get_id(); let key = keys::region_state_key(id); @@ -1924,7 +1924,7 @@ mod tests { let cf2 = CF_RAFT; { - let mock_region_state = |wb: &mut RocksWriteBatch, region_id: u64, peers: &[u64]| { + let mock_region_state = |wb: &mut RocksWriteBatchVec, region_id: u64, peers: &[u64]| { let region_state_key = keys::region_state_key(region_id); let mut region_state = RegionLocalState::default(); region_state.set_state(PeerState::Normal); @@ -1945,20 +1945,23 @@ mod tests { wb.put_msg_cf(cf2, ®ion_state_key, ®ion_state) .unwrap(); }; - let mock_raft_state = - |wb: &mut RocksWriteBatch, region_id: u64, last_index: u64, commit_index: u64| { - let 
raft_state_key = keys::raft_state_key(region_id); - let mut raft_state = RaftLocalState::default(); - raft_state.set_last_index(last_index); - raft_state.mut_hard_state().set_commit(commit_index); - wb.put_msg_cf(cf1, &raft_state_key, &raft_state).unwrap(); - }; - let mock_apply_state = |wb: &mut RocksWriteBatch, region_id: u64, apply_index: u64| { - let raft_apply_key = keys::apply_state_key(region_id); - let mut apply_state = RaftApplyState::default(); - apply_state.set_applied_index(apply_index); - wb.put_msg_cf(cf2, &raft_apply_key, &apply_state).unwrap(); + let mock_raft_state = |wb: &mut RocksWriteBatchVec, + region_id: u64, + last_index: u64, + commit_index: u64| { + let raft_state_key = keys::raft_state_key(region_id); + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(last_index); + raft_state.mut_hard_state().set_commit(commit_index); + wb.put_msg_cf(cf1, &raft_state_key, &raft_state).unwrap(); }; + let mock_apply_state = + |wb: &mut RocksWriteBatchVec, region_id: u64, apply_index: u64| { + let raft_apply_key = keys::apply_state_key(region_id); + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(apply_index); + wb.put_msg_cf(cf2, &raft_apply_key, &apply_state).unwrap(); + }; for ®ion_id in &[10, 11, 12] { mock_region_state(&mut wb2, region_id, &[store_id]); @@ -2176,9 +2179,11 @@ mod tests { .iter() .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) .collect(); - let db = Arc::new(new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap()); + let db_opt = DBOptions::new(); + db_opt.enable_multi_batch_write(true); + let db = Arc::new(new_engine_opt(path_str, db_opt, cfs_opts).unwrap()); // Write initial KVs. 
- let mut wb = db.c().write_batch(); + let mut wb = RocksEngine::from_db(db.clone()).write_batch(); for &(cf, ref k, ref v, _) in &kv { wb.put_cf(cf, &keys::data_key(k.as_encoded()), v).unwrap(); } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 5dda55751e7..7d233430f70 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -18,7 +18,7 @@ use engine_rocks::{ CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, DBCompactionFilter, }, - RocksEngine, RocksMvccProperties, RocksWriteBatch, + RocksEngine, RocksMvccProperties, RocksWriteBatchVec, }; use engine_traits::{ KvEngine, MiscExt, Mutable, MvccProperties, WriteBatch, WriteBatchExt, WriteOptions, @@ -267,7 +267,7 @@ struct WriteCompactionFilter { is_bottommost_level: bool, encountered_errors: bool, - write_batch: RocksWriteBatch, + write_batch: RocksWriteBatchVec, gc_scheduler: Scheduler>, // A key batch which is going to be sent to the GC worker. 
mvcc_deletions: Vec, @@ -461,7 +461,7 @@ impl WriteCompactionFilter { } fn do_flush( - wb: &RocksWriteBatch, + wb: &RocksWriteBatchVec, wopts: &WriteOptions, ) -> Result<(), engine_traits::Error> { let _io_type_guard = WithIOType::new(IOType::Gc); diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index 263a8d2565a..1a7443f6d08 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -6,7 +6,7 @@ use std::{ thread::JoinHandle, }; -use engine_rocks::{RocksEngine, RocksEngineIterator, RocksWriteBatch}; +use engine_rocks::{RocksEngine, RocksEngineIterator, RocksWriteBatchVec}; use engine_traits::{ IterOptions, Iterable, Iterator, Mutable, SeekKey, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, CF_WRITE, @@ -121,7 +121,7 @@ impl ResetToVersionWorker { pub fn process_next_batch( &mut self, batch_size: usize, - wb: &mut RocksWriteBatch, + wb: &mut RocksWriteBatchVec, ) -> Result { let Batch { writes, has_more } = self.scan_next_batch(batch_size)?; for (key, write) in writes { @@ -140,7 +140,7 @@ impl ResetToVersionWorker { pub fn process_next_batch_lock( &mut self, batch_size: usize, - wb: &mut RocksWriteBatch, + wb: &mut RocksWriteBatchVec, ) -> Result { let mut has_more = true; for _ in 0..batch_size { diff --git a/tests/benches/misc/writebatch/bench_writebatch.rs b/tests/benches/misc/writebatch/bench_writebatch.rs index 3c96d79ee82..0c6e81a35ca 100644 --- a/tests/benches/misc/writebatch/bench_writebatch.rs +++ b/tests/benches/misc/writebatch/bench_writebatch.rs @@ -2,15 +2,19 @@ use std::sync::Arc; -use engine_rocks::{raw::DB, Compat, RocksWriteBatch}; +use engine_rocks::{ + raw::{DBOptions, DB}, + RocksEngine, RocksWriteBatchVec, +}; use engine_traits::{Mutable, WriteBatch, WriteBatchExt}; use tempfile::Builder; use test::Bencher; fn writebatch(db: &Arc, round: usize, batch_keys: usize) { let v = b"operators are syntactic sugar for calls to methods of built-in traits"; + let engine = 
RocksEngine::from_db(db.clone()); for r in 0..round { - let mut batch = db.c().write_batch(); + let mut batch = engine.write_batch(); for i in 0..batch_keys { let k = format!("key_round{}_key{}", r, i); batch.put(k.as_bytes(), v).unwrap(); @@ -24,7 +28,12 @@ fn bench_writebatch_impl(b: &mut Bencher, batch_keys: usize) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let db = Arc::new(DB::open_default(path.path().to_str().unwrap()).unwrap()); + let mut opts = DBOptions::new(); + opts.create_if_missing(true); + opts.enable_unordered_write(false); + opts.enable_pipelined_write(false); + opts.enable_multi_batch_write(true); + let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); let key_count = 1 << 13; let round = key_count / batch_keys; b.iter(|| { @@ -87,7 +96,7 @@ fn bench_writebatch_1024(b: &mut Bencher) { bench_writebatch_impl(b, 1024); } -fn fill_writebatch(wb: &mut RocksWriteBatch, target_size: usize) { +fn fill_writebatch(wb: &mut RocksWriteBatchVec, target_size: usize) { let (k, v) = (b"this is the key", b"this is the value"); loop { wb.put(k, v).unwrap(); @@ -103,9 +112,15 @@ fn bench_writebatch_without_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let db = Arc::new(DB::open_default(path.path().to_str().unwrap()).unwrap()); + let mut opts = DBOptions::new(); + opts.create_if_missing(true); + opts.enable_unordered_write(false); + opts.enable_pipelined_write(false); + opts.enable_multi_batch_write(true); + let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); + let engine = RocksEngine::from_db(db); b.iter(|| { - let mut wb = db.c().write_batch(); + let mut wb = engine.write_batch(); fill_writebatch(&mut wb, 4096); }); } @@ -116,9 +131,15 @@ fn bench_writebatch_with_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let db = Arc::new(DB::open_default(path.path().to_str().unwrap()).unwrap()); + let mut opts = 
DBOptions::new(); + opts.create_if_missing(true); + opts.enable_unordered_write(false); + opts.enable_pipelined_write(false); + opts.enable_multi_batch_write(true); + let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); + let engine = RocksEngine::from_db(db); b.iter(|| { - let mut wb = db.c().write_batch_with_cap(4096); + let mut wb = engine.write_batch_with_cap(4096); fill_writebatch(&mut wb, 4096); }); } diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 9ceaa16e3c7..09308646421 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -7,7 +7,7 @@ use std::{ }; use collections::HashMap; -use engine_traits::Peekable; +use engine_traits::{Peekable, WriteBatch}; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, metapb::Region, tikvpb::TikvClient}; @@ -321,7 +321,7 @@ fn test_error_in_compaction_filter() { gc_runner.gc(&raw_engine); match gc_runner.gc_receiver.recv().unwrap() { - GcTask::OrphanVersions { wb, .. } => assert_eq!(wb.as_inner().count(), 2), + GcTask::OrphanVersions { wb, .. } => assert_eq!(wb.count(), 2), GcTask::GcKeys { .. 
} => {} _ => unreachable!(), } From 57c4a43cb81f1196a48325913d76fd1617cada4d Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Fri, 15 Jul 2022 09:11:05 +0800 Subject: [PATCH 081/676] log-backup: store log files by date/hour/store_id (#13018) ref tikv/tikv#12902 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 4 +- components/backup-stream/src/router.rs | 30 ++++++----- .../backup-stream/src/subscription_manager.rs | 51 ++++++++++--------- 3 files changed, 44 insertions(+), 41 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 490e0b48e8d..c779afebe45 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -324,7 +324,7 @@ where tokio::time::sleep(Duration::from_secs(2)).await; break; } - _ => panic!("BUG: invalid event {:?}", event), + _ => warn!("BUG: invalid event"; "event" => ?event), } } else { tokio::time::sleep(Duration::from_secs(1)).await; @@ -372,7 +372,7 @@ where tokio::time::sleep(Duration::from_secs(2)).await; break; } - _ => panic!("BUG: invalid event {:?}", event), + _ => warn!("BUG: invalid event"; "event" => ?event), } } else { tokio::time::sleep(Duration::from_secs(1)).await; diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index debb4b417c8..8311c08c7de 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -618,13 +618,14 @@ impl TempFileKey { } /// path_to_log_file specifies the path of record log. - /// eg. "v1/20220625/03/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" - fn path_to_log_file(&self, min_ts: u64, max_ts: u64) -> String { + /// eg. 
"v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" + fn path_to_log_file(&self, store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/{}/t{:08}/{:012}-{}.log", + "v1/{}/{}/{}/t{:08}/{:012}-{}.log", // We may delete a range of files, so using the max_ts for preventing remove some records wrong. Self::format_date_time(max_ts, FormatType::Date), Self::format_date_time(max_ts, FormatType::Hour), + store_id, self.table_id, min_ts, uuid::Uuid::new_v4() @@ -632,22 +633,23 @@ impl TempFileKey { } /// path_to_schema_file specifies the path of schema log. - /// eg. "v1/20220625/03/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" - fn path_to_schema_file(min_ts: u64, max_ts: u64) -> String { + /// eg. "v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" + fn path_to_schema_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/{}/schema-meta/{:012}-{}.log", + "v1/{}/{}/{}/schema-meta/{:012}-{}.log", Self::format_date_time(max_ts, FormatType::Date), Self::format_date_time(max_ts, FormatType::Hour), + store_id, min_ts, uuid::Uuid::new_v4(), ) } - fn file_name(&self, min_ts: TimeStamp, max_ts: TimeStamp) -> String { + fn file_name(&self, store_id: u64, min_ts: TimeStamp, max_ts: TimeStamp) -> String { if self.is_meta { - Self::path_to_schema_file(min_ts.into_inner(), max_ts.into_inner()) + Self::path_to_schema_file(store_id, min_ts.into_inner(), max_ts.into_inner()) } else { - self.path_to_log_file(min_ts.into_inner(), max_ts.into_inner()) + self.path_to_log_file(store_id, min_ts.into_inner(), max_ts.into_inner()) } } } @@ -803,7 +805,7 @@ impl StreamTaskInfo { metadata.set_store_id(store_id); for (file_key, data_file) in w.iter() { let mut data_file = data_file.lock().await; - let file_meta = data_file.generate_metadata(file_key)?; + let file_meta = data_file.generate_metadata(file_key, store_id)?; 
metadata.push(file_meta) } Ok(metadata) @@ -1181,8 +1183,8 @@ impl DataFile { } /// generate the metadata in protocol buffer of the file. - fn generate_metadata(&mut self, file_key: &TempFileKey) -> Result { - self.set_storage_path(file_key.file_name(self.min_ts, self.max_ts)); + fn generate_metadata(&mut self, file_key: &TempFileKey, store_id: u64) -> Result { + self.set_storage_path(file_key.file_name(store_id, self.min_ts, self.max_ts)); let mut meta = DataFileInfo::new(); meta.set_sha256( @@ -1416,7 +1418,7 @@ mod tests { fn check_on_events_result(item: &Vec<(String, Result<()>)>) { for (task, r) in item { if let Err(err) = r { - panic!("task {} failed: {}", task, err); + warn!("task {} failed: {}", task, err); } } } @@ -1477,7 +1479,7 @@ mod tests { assert_eq!(cmds.len(), 1, "test cmds len = {}", cmds.len()); match &cmds[0] { Task::Flush(task) => assert_eq!(task, "dummy", "task = {}", task), - _ => panic!("the cmd isn't flush!"), + _ => warn!("the cmd isn't flush!"), } let mut meta_count = 0; diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index fc4f0e2d4a7..68c025b16c2 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -410,32 +410,33 @@ where let canceled = self.subs.deregister_region_if(region, |_, _| true); let handle = ObserveHandle::new(); if canceled { - let for_task = self.find_task_by_region(region).unwrap_or_else(|| { - panic!( + if let Some(for_task) = self.find_task_by_region(region) { + metrics::INITIAL_SCAN_REASON + .with_label_values(&["region-changed"]) + .inc(); + let r = async { + self.observe_over_with_initial_data_from_checkpoint( + region, + self.get_last_checkpoint_of(&for_task, region).await?, + handle.clone(), + ); + Result::Ok(()) + } + .await; + if let Err(e) = r { + try_send!( + self.scheduler, + Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region: region.clone(), + 
handle, + err: Box::new(e) + }) + ); + } + } else { + warn!( "BUG: the region {:?} is register to no task but being observed", - region - ) - }); - metrics::INITIAL_SCAN_REASON - .with_label_values(&["region-changed"]) - .inc(); - let r = async { - self.observe_over_with_initial_data_from_checkpoint( - region, - self.get_last_checkpoint_of(&for_task, region).await?, - handle.clone(), - ); - Result::Ok(()) - } - .await; - if let Err(e) = r { - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region: region.clone(), - handle, - err: Box::new(e) - }) + ®ion ); } } From 956c2192a020b2852a615ed1716ebdc0b1bd316d Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 15 Jul 2022 11:09:06 +0800 Subject: [PATCH 082/676] server: support command line output config info (#12956) ref tikv/tikv#12492, ref tikv/tikv#12517 Signed-off-by: glorv Co-authored-by: Xinye Tao --- Cargo.lock | 1 + Cargo.toml | 2 +- cmd/tikv-server/Cargo.toml | 1 + cmd/tikv-server/src/main.rs | 23 ++++++++- src/config.rs | 96 ++++++++++++++++++++++++++++++++++++- src/lib.rs | 7 ++- 6 files changed, 126 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d08e8fc3b25..dcf1eb84937 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6170,6 +6170,7 @@ version = "0.0.1" dependencies = [ "cc", "clap 2.33.0", + "serde_json", "server", "tikv", "time", diff --git a/Cargo.toml b/Cargo.toml index dd071c9809e..b094c857d5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,7 +143,7 @@ semver = "0.11" serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" serde_ignored = "0.1" -serde_json = "1.0" +serde_json = { version = "1.0", features = ["preserve_order"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } sst_importer = { path = "components/sst_importer", default-features = false } 
diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index e2f594cd8ad..9b1aa869037 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -32,6 +32,7 @@ pprof-fp = ["tikv/pprof-fp"] [dependencies] clap = "2.32" +serde_json = { version = "1.0", features = ["preserve_order"] } server = { path = "../../components/server", default-features = false } tikv = { path = "../../", default-features = false } toml = "0.5" diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 4cb68c6e020..0d6e472a602 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -5,8 +5,9 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; +use serde_json::{Map, Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; -use tikv::config::TiKvConfig; +use tikv::config::{to_flatten_config_info, TiKvConfig}; fn main() { let build_timestamp = option_env!("TIKV_BUILD_TIME"); @@ -32,6 +33,15 @@ fn main() { .takes_value(false) .help("Check config file validity and exit"), ) + .arg( + Arg::with_name("config-info") + .required(false) + .long("config-info") + .takes_value(true) + .value_name("FORMAT") + .possible_values(&["json"]) + .help("print configuration information with specified format") + ) .arg( Arg::with_name("log-level") .short("L") @@ -186,5 +196,16 @@ fn main() { process::exit(0) } + let is_config_info = matches.is_present("config-info"); + if is_config_info { + let config_infos = to_flatten_config_info(&config); + let mut result = Map::new(); + result.insert("Component".into(), "TiKV Server".into()); + result.insert("Version".into(), tikv::tikv_build_version().into()); + result.insert("Parameters".into(), Value::Array(config_infos)); + println!("{}", serde_json::to_string_pretty(&result).unwrap()); + process::exit(0); + } + server::server::run_tikv(config); } diff --git a/src/config.rs b/src/config.rs index 580e91712de..9b06da58926 100644 --- a/src/config.rs +++ 
b/src/config.rs @@ -7,7 +7,7 @@ use std::{ cmp, - collections::HashMap, + collections::{HashMap, HashSet}, error::Error, fs, i32, io::{Error as IoError, ErrorKind, Write}, @@ -53,6 +53,7 @@ use raftstore::{ }; use resource_metering::Config as ResourceMeteringConfig; use security::SecurityConfig; +use serde_json::{to_value, Map, Value}; use tikv_util::{ config::{ self, LogFormat, RaftDataStateMachine, ReadableDuration, ReadableSize, TomlWriter, GIB, MIB, @@ -1059,6 +1060,7 @@ pub struct DbConfig { // back to write mode in 3.0 when set `enable_pipelined_write` true. The code of multi-batch-write // in RocksDB has been removed. #[online_config(skip)] + #[serde(skip_serializing)] pub enable_multi_batch_write: bool, #[online_config(skip)] pub enable_unordered_write: bool, @@ -3607,6 +3609,71 @@ pub fn write_config>(path: P, content: &[u8]) -> CfgResult<()> { Ok(()) } +// convert tikv config to a flatten array. +pub fn to_flatten_config_info(cfg: &TiKvConfig) -> Vec { + fn to_cfg_value(default_value: &Value, cfg_value: Option<&Value>, key: &str) -> Value { + let mut res = Map::with_capacity(2); + res.insert("Name".into(), Value::String(key.into())); + res.insert("DefaultValue".into(), default_value.clone()); + if let Some(cfg_val) = cfg_value { + if default_value != cfg_val { + res.insert("ValueInFile".into(), cfg_val.clone()); + } + } + + Value::Object(res) + } + + // configs that should not be flatten because the config type is HashMap instead of submodule. + lazy_static! 
{ + static ref NO_FLATTEN_CFGS: HashSet<&'static str> = { + let mut set = HashSet::new(); + set.insert("server.labels"); + set + }; + } + + fn flatten_value( + default_obj: &Map, + value_obj: &Map, + key_buf: &mut String, + res: &mut Vec, + ) { + for (k, v) in default_obj.iter() { + let cfg_val = value_obj.get(k); + let prev_len = key_buf.len(); + if !key_buf.is_empty() { + key_buf.push('.'); + } + key_buf.push_str(k); + if v.is_object() && !NO_FLATTEN_CFGS.contains(key_buf.as_str()) { + flatten_value( + v.as_object().unwrap(), + cfg_val.unwrap().as_object().unwrap(), + key_buf, + res, + ); + } else { + res.push(to_cfg_value(v, cfg_val, key_buf)); + } + key_buf.truncate(prev_len); + } + } + + let cfg_value = to_value(cfg).unwrap(); + let default_value = to_value(TiKvConfig::default()).unwrap(); + + let mut key_buf = String::new(); + let mut res = Vec::new(); + flatten_value( + default_value.as_object().unwrap(), + cfg_value.as_object().unwrap(), + &mut key_buf, + &mut res, + ); + res +} + lazy_static! 
{ pub static ref TIKVCONFIG_TYPED: ConfigChange = TiKvConfig::default().typed(); } @@ -4114,6 +4181,33 @@ mod tests { assert_eq!(cfg_from_file.raftdb.wal_dir, s1); } + #[test] + fn test_flatten_cfg() { + let mut cfg = TiKvConfig::default(); + cfg.server.labels.insert("zone".into(), "test".into()); + cfg.raft_store.raft_log_gc_count_limit = Some(123); + + let flattened = to_flatten_config_info(&cfg); + + let mut expected = HashMap::new(); + let mut labels = Map::new(); + labels.insert("zone".into(), Value::String("test".into())); + expected.insert("server.labels", Value::Object(labels)); + expected.insert( + "raftstore.raft-log-gc-count-limit", + Value::Number(123.into()), + ); + + for v in &flattened { + let obj = v.as_object().unwrap(); + if let Some(v) = expected.get(&obj["Name"].as_str().unwrap()) { + assert_eq!(v, &obj["ValueInFile"]); + } else { + assert!(!obj.contains_key("ValueInFile")); + } + } + } + #[test] fn test_create_parent_dir_if_missing() { let root_path = Builder::new() diff --git a/src/lib.rs b/src/lib.rs index d51457b1603..5b7bf6e2ac1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,7 +61,7 @@ pub fn tikv_version_info(build_time: Option<&str>) -> String { \nRust Version: {}\ \nEnable Features: {}\ \nProfile: {}", - env!("CARGO_PKG_VERSION"), + tikv_build_version(), option_env!("TIKV_EDITION").unwrap_or("Community"), option_env!("TIKV_BUILD_GIT_HASH").unwrap_or(fallback), option_env!("TIKV_BUILD_GIT_BRANCH").unwrap_or(fallback), @@ -74,6 +74,11 @@ pub fn tikv_version_info(build_time: Option<&str>) -> String { ) } +/// return the build version of tikv-server +pub fn tikv_build_version() -> &'static str { + env!("CARGO_PKG_VERSION") +} + /// Prints the tikv version information to the standard output. 
pub fn log_tikv_info(build_time: Option<&str>) { info!("Welcome to TiKV"); From 5ae20e21af29e6f5c63abbd4db8a481dc04539b6 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 15 Jul 2022 13:57:05 +0800 Subject: [PATCH 083/676] raftstore: only record UNABLE_TO_SPLIT_CPU_TOP when the top_cpu_usage is not empty (#13016) ref tikv/tikv#12063 Only record `UNABLE_TO_SPLIT_CPU_TOP` when the `top_cpu_usage` is not empty. Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/metrics.rs | 36 ++++- .../src/store/worker/split_controller.rs | 137 ++++++------------ 2 files changed, 81 insertions(+), 92 deletions(-) diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index c4a1c22d800..69d84f45056 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -234,6 +234,37 @@ make_static_metric! { hibernated, }, } + + pub label_enum LoadBaseSplitEventType { + // Workload fits the QPS threshold or byte threshold. + load_fit, + // Workload fits the CPU threshold. + cpu_load_fit, + // The statistical key is empty. + empty_statistical_key, + // Split info has been collected, ready to split. + ready_to_split, + // Split info has not been collected yet, not ready to split. + not_ready_to_split, + // The number of sampled keys does not meet the threshold. + no_enough_sampled_key, + // The number of sampled keys located on left and right does not meet the threshold. + no_enough_lr_key, + // The number of balanced keys does not meet the score. + no_balance_key, + // The number of contained keys does not meet the score. + no_uncross_key, + // Split info for the top hot CPU region has been collected, ready to split. + ready_to_split_cpu_top, + // Hottest key range for the top hot CPU region could not be found. + empty_hottest_key_range, + // The top hot CPU region could not be split. 
+ unable_to_split_cpu_top, + } + + pub struct LoadBaseSplitEventCounterVec: IntCounter { + "type" => LoadBaseSplitEventType, + } } lazy_static! { @@ -648,8 +679,9 @@ lazy_static! { &["order"] ).unwrap(); - pub static ref LOAD_BASE_SPLIT_EVENT: IntCounterVec = - register_int_counter_vec!( + pub static ref LOAD_BASE_SPLIT_EVENT: LoadBaseSplitEventCounterVec = + register_static_int_counter_vec!( + LoadBaseSplitEventCounterVec, "tikv_load_base_split_event", "Load base split event.", &["type"] diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 3724e21c515..013ac705be9 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -31,32 +31,6 @@ use crate::store::{ const DEFAULT_MAX_SAMPLE_LOOP_COUNT: usize = 10000; pub const TOP_N: usize = 10; -// LOAD_BASE_SPLIT_EVENT metrics label definitions. -// Workload fits the QPS threshold or byte threshold. -const LOAD_FIT: &str = "load_fit"; -// Workload fits the CPU threshold. -const CPU_LOAD_FIT: &str = "cpu_load_fit"; -// The statistical key is empty. -const EMPTY_STATISTICAL_KEY: &str = "empty_statistical_key"; -// Split info has been collected, ready to split. -const READY_TO_SPLIT: &str = "ready_to_split"; -// Split info has not been collected yet, not ready to split. -const NOT_READY_TO_SPLIT: &str = "not_ready_to_split"; -// The number of sampled keys does not meet the threshold. -const NO_ENOUGH_SAMPLED_KEY: &str = "no_enough_sampled_key"; -// The number of sampled keys located on left and right does not meet the threshold. -const NO_ENOUGH_LR_KEY: &str = "no_enough_lr_key"; -// The number of balanced keys does not meet the score. -const NO_BALANCE_KEY: &str = "no_balance_key"; -// The number of contained keys does not meet the score. -const NO_UNCROSS_KEY: &str = "no_uncross_key"; -// Split info for the top hot CPU region has been collected, ready to split. 
-const READY_TO_SPLIT_CPU_TOP: &str = "ready_to_split_cpu_top"; -// Hottest key range for the top hot CPU region could not be found. -const EMPTY_HOTTEST_KEY_RANGE: &str = "empty_hottest_key_range"; -// The top hot CPU region could not be split. -const UNABLE_TO_SPLIT_CPU_TOP: &str = "unable_to_split_cpu_top"; - // It will return prefix sum of the given iter, // `read` is a function to process the item from the iter. #[inline(always)] @@ -231,9 +205,7 @@ impl Samples { } let evaluated_key_num_lr = sample.left + sample.right; if evaluated_key_num_lr == 0 { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_ENOUGH_LR_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_enough_lr_key.inc(); continue; } let evaluated_key_num = (sample.contained + evaluated_key_num_lr) as f64; @@ -246,9 +218,7 @@ impl Samples { .with_label_values(&["balance_score"]) .observe(balance_score); if balance_score >= split_balance_score { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_BALANCE_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_balance_key.inc(); continue; } @@ -259,9 +229,7 @@ impl Samples { .with_label_values(&["contained_score"]) .observe(contained_score); if contained_score >= split_contained_score { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_UNCROSS_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.no_uncross_key.inc(); continue; } @@ -336,7 +304,7 @@ impl Recorder { // so we do this check after the samples are calculated. 
if (recorded_key_ranges.len() as u64) < config.sample_threshold { LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NO_ENOUGH_SAMPLED_KEY]) + .no_enough_sampled_key .inc_by(samples.0.len() as u64); return vec![]; } @@ -834,7 +802,7 @@ impl AutoSplitController { continue; } - LOAD_BASE_SPLIT_EVENT.with_label_values(&[LOAD_FIT]).inc(); + LOAD_BASE_SPLIT_EVENT.load_fit.inc(); let detect_times = self.cfg.detect_times; let recorder = self @@ -853,9 +821,7 @@ impl AutoSplitController { RegionInfo::get_key_ranges_mut, ); if key_ranges.is_empty() { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[EMPTY_STATISTICAL_KEY]) - .inc(); + LOAD_BASE_SPLIT_EVENT.empty_statistical_key.inc(); continue; } recorder.record(key_ranges); @@ -867,9 +833,7 @@ impl AutoSplitController { recorder.peer.clone(), key, )); - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[READY_TO_SPLIT]) - .inc(); + LOAD_BASE_SPLIT_EVENT.ready_to_split.inc(); info!("load base split region"; "region_id" => region_id, "qps" => qps, @@ -878,15 +842,11 @@ impl AutoSplitController { ); self.recorders.remove(®ion_id); } else if is_unified_read_pool_busy && is_region_busy { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[CPU_LOAD_FIT]) - .inc(); + LOAD_BASE_SPLIT_EVENT.cpu_load_fit.inc(); top_cpu_usage.push(region_id); } } else { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[NOT_READY_TO_SPLIT]) - .inc(); + LOAD_BASE_SPLIT_EVENT.not_ready_to_split.inc(); } top_qps.push(qps); @@ -894,49 +854,46 @@ impl AutoSplitController { // Check if the top CPU usage region could be split. // TODO: avoid unnecessary split by introducing the feedback mechanism from PD. - if !top_cpu_usage.is_empty() && !is_grpc_poll_busy { - // Calculate by using the latest CPU usage. 
- top_cpu_usage.sort_unstable_by(|a, b| { - let cpu_usage_a = self.recorders.get(a).unwrap().cpu_usage; - let cpu_usage_b = self.recorders.get(b).unwrap().cpu_usage; - cpu_usage_b.partial_cmp(&cpu_usage_a).unwrap() - }); - let region_id = top_cpu_usage[0]; - let recorder = self.recorders.get_mut(®ion_id).unwrap(); - if recorder.hottest_key_range.is_some() { - split_infos.push(SplitInfo::with_start_end_key( - region_id, - recorder.peer.clone(), - recorder - .hottest_key_range - .as_ref() - .unwrap() - .start_key - .clone(), - recorder.hottest_key_range.as_ref().unwrap().end_key.clone(), - )); - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[READY_TO_SPLIT_CPU_TOP]) - .inc(); - info!("load base split region"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().start_key), - "end_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().end_key), - "cpu_usage" => recorder.cpu_usage, - ); + if !top_cpu_usage.is_empty() { + // Only split the top CPU region when the gRPC poll is not busy. + if !is_grpc_poll_busy { + // Calculate by using the latest CPU usage. 
+ top_cpu_usage.sort_unstable_by(|a, b| { + let cpu_usage_a = self.recorders.get(a).unwrap().cpu_usage; + let cpu_usage_b = self.recorders.get(b).unwrap().cpu_usage; + cpu_usage_b.partial_cmp(&cpu_usage_a).unwrap() + }); + let region_id = top_cpu_usage[0]; + let recorder = self.recorders.get_mut(®ion_id).unwrap(); + if recorder.hottest_key_range.is_some() { + split_infos.push(SplitInfo::with_start_end_key( + region_id, + recorder.peer.clone(), + recorder + .hottest_key_range + .as_ref() + .unwrap() + .start_key + .clone(), + recorder.hottest_key_range.as_ref().unwrap().end_key.clone(), + )); + LOAD_BASE_SPLIT_EVENT.ready_to_split_cpu_top.inc(); + info!("load base split region"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().start_key), + "end_key" => log_wrappers::Value::key(&recorder.hottest_key_range.as_ref().unwrap().end_key), + "cpu_usage" => recorder.cpu_usage, + ); + } else { + LOAD_BASE_SPLIT_EVENT.empty_hottest_key_range.inc(); + } } else { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[EMPTY_HOTTEST_KEY_RANGE]) - .inc(); + LOAD_BASE_SPLIT_EVENT.unable_to_split_cpu_top.inc(); + } + // Clean up the rest top CPU usage recorders. + for region_id in top_cpu_usage { + self.recorders.remove(®ion_id); } - } else { - LOAD_BASE_SPLIT_EVENT - .with_label_values(&[UNABLE_TO_SPLIT_CPU_TOP]) - .inc(); - } - // Clean up the rest top CPU usage recorders. - for region_id in top_cpu_usage { - self.recorders.remove(®ion_id); } (top_qps.into_vec(), split_infos) From 59c9676795d08a5ff2e35c899e3c3d30611bd2b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Fri, 15 Jul 2022 22:51:06 +0800 Subject: [PATCH 084/676] log-backup: fixed initial scanning data loss (#13024) ref tikv/tikv#12538 Now, initial scanning failure won't just report as a retryable error, but would retry internally and fire a fatal error if retry failed too many times. 
This PR also make the report of fatal error can provide a `TaskSelector`, which allows reporting errors in some contexts which cannot access the task name. Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 94 +++++++----- components/backup-stream/src/event_loader.rs | 66 ++++----- components/backup-stream/src/metrics.rs | 2 +- components/backup-stream/src/router.rs | 139 +++++++++++++++++- .../backup-stream/src/subscription_manager.rs | 120 +++++++++++---- components/backup-stream/src/utils.rs | 88 ++++++++++- components/backup-stream/tests/mod.rs | 31 +++- 7 files changed, 428 insertions(+), 112 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index c779afebe45..958df7286a7 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -55,7 +55,7 @@ use crate::{ metadata::{store::MetaStore, MetadataClient, MetadataEvent, StreamTask}, metrics::{self, TaskStatus}, observer::BackupStreamObserver, - router::{ApplyEvents, Router}, + router::{ApplyEvents, Router, TaskSelector}, subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, subscription_track::SubscriptionTracer, try_send, @@ -197,45 +197,59 @@ where self.meta_client.clone() } - fn on_fatal_error(&self, task: String, err: Box) { - // Let's pause the task first. 
- self.unload_task(&task); + fn on_fatal_error(&self, select: TaskSelector, err: Box) { err.report_fatal(); - metrics::update_task_status(TaskStatus::Error, &task); - - let meta_cli = self.get_meta_client(); - let pdc = self.pd_client.clone(); - let store_id = self.store_id; - let sched = self.scheduler.clone(); - let safepoint_name = self.pause_guard_id_for_task(&task); - let safepoint_ttl = self.pause_guard_duration(); - self.pool.block_on(async move { - let err_fut = async { - let safepoint = meta_cli.global_progress_of_task(&task).await?; - pdc.update_service_safe_point( - safepoint_name, - TimeStamp::new(safepoint - 1), - safepoint_ttl, - ) - .await?; - meta_cli.pause(&task).await?; - let mut last_error = StreamBackupError::new(); - last_error.set_error_code(err.error_code().code.to_owned()); - last_error.set_error_message(err.to_string()); - last_error.set_store_id(store_id); - last_error.set_happen_at(TimeStamp::physical_now()); - meta_cli.report_last_error(&task, last_error).await?; - Result::Ok(()) - }; - if let Err(err_report) = err_fut.await { - err_report.report(format_args!("failed to upload error {}", err_report)); - // Let's retry reporting after 5s. - tokio::task::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - try_send!(sched, Task::FatalError(task, err)); - }); - } - }) + let tasks = self + .pool + .block_on(self.range_router.select_task(select.reference())); + warn!("fatal error reporting"; "selector" => ?select, "selected" => ?tasks, "err" => %err); + for task in tasks { + // Let's pause the task first. 
+ self.unload_task(&task); + metrics::update_task_status(TaskStatus::Error, &task); + + let meta_cli = self.get_meta_client(); + let pdc = self.pd_client.clone(); + let store_id = self.store_id; + let sched = self.scheduler.clone(); + let safepoint_name = self.pause_guard_id_for_task(&task); + let safepoint_ttl = self.pause_guard_duration(); + let code = err.error_code().code.to_owned(); + let msg = err.to_string(); + self.pool.block_on(async move { + let err_fut = async { + let safepoint = meta_cli.global_progress_of_task(&task).await?; + pdc.update_service_safe_point( + safepoint_name, + TimeStamp::new(safepoint - 1), + safepoint_ttl, + ) + .await?; + meta_cli.pause(&task).await?; + let mut last_error = StreamBackupError::new(); + last_error.set_error_code(code); + last_error.set_error_message(msg.clone()); + last_error.set_store_id(store_id); + last_error.set_happen_at(TimeStamp::physical_now()); + meta_cli.report_last_error(&task, last_error).await?; + Result::Ok(()) + }; + if let Err(err_report) = err_fut.await { + err_report.report(format_args!("failed to upload error {}", err_report)); + // Let's retry reporting after 5s. + tokio::task::spawn(async move { + tokio::time::sleep(Duration::from_secs(5)).await; + try_send!( + sched, + Task::FatalError( + TaskSelector::ByName(task.to_owned()), + Box::new(annotate!(err_report, "origin error: {}", msg)) + ) + ); + }); + } + }); + } } async fn starts_flush_ticks(router: Router) { @@ -922,7 +936,7 @@ pub enum Task { /// Convert status of some task into `flushing` and do flush then. ForceFlush(String), /// FatalError pauses the task and set the error. - FatalError(String, Box), + FatalError(TaskSelector, Box), /// Run the callback when see this message. Only for test usage. /// NOTE: Those messages for testing are not guared by `#[cfg(test)]` for now, because /// the integration test would not enable test config when compiling (why?) 
diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index fdba0194000..841f6ac75b6 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -23,10 +23,12 @@ use tikv::storage::{ use tikv_util::{ box_err, time::{Instant, Limiter}, - warn, worker::Scheduler, }; -use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tokio::{ + runtime::Handle, + sync::{OwnedSemaphorePermit, Semaphore}, +}; use txn_types::{Key, Lock, TimeStamp}; use crate::{ @@ -64,7 +66,7 @@ impl PendingMemoryQuota { pub fn pending(&self, size: usize) -> PendingMemory { PendingMemory( - tokio::runtime::Handle::current() + Handle::current() .block_on(self.0.clone().acquire_many_owned(size as _)) .expect("BUG: the semaphore is closed unexpectedly."), ) @@ -186,7 +188,7 @@ pub struct InitialDataLoader { pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, pub(crate) quota: PendingMemoryQuota, - pub(crate) handle: tokio::runtime::Handle, + pub(crate) handle: Handle, pub(crate) limit: Limiter, _engine: PhantomData, @@ -205,7 +207,7 @@ where tracing: SubscriptionTracer, sched: Scheduler, quota: PendingMemoryQuota, - handle: tokio::runtime::Handle, + handle: Handle, limiter: Limiter, ) -> Self { Self { @@ -252,8 +254,8 @@ where last_err = match last_err { None => Some(e), Some(err) => Some(Error::Contextual { - context: format!("and error {}", e), - inner_error: Box::new(err), + context: format!("and error {}", err), + inner_error: Box::new(e), }), }; @@ -374,6 +376,10 @@ where let mut stats = StatisticsSummary::default(); let start = Instant::now(); loop { + #[cfg(feature = "failpoints")] + fail::fail_point!("scan_and_async_send", |msg| Err(Error::Other(box_err!( + "{:?}", msg + )))); let mut events = ApplyEvents::with_capacity(1024, region.id); let stat = event_loader.fill_entries()?; let disk_read = self.with_resolver(region, |r| { @@ -411,39 +417,31 @@ where region: &Region, 
start_ts: TimeStamp, snap: impl Snapshot, - on_finish: impl FnOnce() + Send + 'static, ) -> Result { let _guard = self.handle.enter(); - // It is ok to sink more data than needed. So scan to +inf TS for convenance. - let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; let tr = self.tracing.clone(); let region_id = region.get_id(); let mut join_handles = Vec::with_capacity(8); - let stats = self.scan_and_async_send(region, event_loader, &mut join_handles); - - // we should mark phase one as finished whether scan successed. - // TODO: use an `WaitGroup` with asynchronous support. - let r = region.clone(); - tokio::spawn(async move { - for h in join_handles { - if let Err(err) = h.await { - warn!("failed to join task."; "err" => %err); - } - } - let result = Self::with_resolver_by(&tr, &r, |r| { - r.phase_one_done(); - Ok(()) - }); - if let Err(err) = result { - err.report(format_args!( - "failed to finish phase 1 for region {:?}", - region_id - )); - } - on_finish() - }); - stats + + // It is ok to sink more data than needed. So scan to +inf TS for convenance. + let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; + let stats = self.scan_and_async_send(region, event_loader, &mut join_handles)?; + + Handle::current() + .block_on(futures::future::try_join_all(join_handles)) + .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; + + Self::with_resolver_by(&tr, region, |r| { + r.phase_one_done(); + Ok(()) + }) + .context(format_args!( + "failed to finish phase 1 for region {:?}", + region_id + ))?; + + Ok(stats) } /// initialize a range: it simply scan the regions with leader role and send them to [`initialize_region`]. diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index a27dd1ea33b..24a044bb4fb 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -145,7 +145,7 @@ lazy_static! 
{ ) .unwrap(); pub static ref PENDING_INITIAL_SCAN_LEN: IntGaugeVec = register_int_gauge_vec!( - "pending_initial_scan", + "tikv_pending_initial_scan", "The pending initial scan", &["stage"] ) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 8311c08c7de..7a2c895edb2 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -63,6 +63,50 @@ const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 30; /// and storage could take mistaken if writing all of these files to storage concurrently. const FLUSH_LOG_CONCURRENT_BATCH_COUNT: usize = 128; +#[derive(Clone, Debug)] +pub enum TaskSelector { + ByName(String), + ByKey(Vec), + ByRange(Vec, Vec), + All, +} + +impl TaskSelector { + pub fn reference(&self) -> TaskSelectorRef<'_> { + match self { + TaskSelector::ByName(s) => TaskSelectorRef::ByName(s), + TaskSelector::ByKey(k) => TaskSelectorRef::ByKey(&*k), + TaskSelector::ByRange(s, e) => TaskSelectorRef::ByRange(&*s, &*e), + TaskSelector::All => TaskSelectorRef::All, + } + } +} + +#[derive(Clone, Copy, Debug)] +pub enum TaskSelectorRef<'a> { + ByName(&'a str), + ByKey(&'a [u8]), + ByRange(&'a [u8], &'a [u8]), + All, +} + +impl<'a> TaskSelectorRef<'a> { + fn matches<'c, 'd>( + self, + task_name: &str, + mut task_range: impl Iterator, + ) -> bool { + match self { + TaskSelectorRef::ByName(name) => task_name == name, + TaskSelectorRef::ByKey(k) => task_range.any(|(s, e)| utils::is_in_range(k, (&*s, &*e))), + TaskSelectorRef::ByRange(x1, y1) => { + task_range.any(|(x2, y2)| utils::is_overlapping((x1, y1), (&*x2, &*y2))) + } + TaskSelectorRef::All => true, + } + } +} + #[derive(Debug)] pub struct ApplyEvent { pub key: Vec, @@ -376,7 +420,8 @@ impl RouterInner { // register task info let prefix_path = self.prefix.join(&task_name); - let stream_task = StreamTaskInfo::new(prefix_path, task, self.max_flush_interval).await?; + let stream_task = + StreamTaskInfo::new(prefix_path, task, 
self.max_flush_interval, ranges.clone()).await?; self.tasks .lock() .await @@ -405,6 +450,21 @@ impl RouterInner { r.get_value_by_point(key).cloned() } + pub async fn select_task(&self, selector: TaskSelectorRef<'_>) -> Vec { + let s = self.tasks.lock().await; + s.iter() + .filter(|(name, info)| { + selector.matches( + name.as_str(), + info.ranges + .iter() + .map(|(s, e)| (s.as_slice(), e.as_slice())), + ) + }) + .map(|(name, _)| name.to_owned()) + .collect() + } + #[cfg(test)] pub(crate) async fn must_mut_task_info(&self, task_name: &str, mutator: F) where @@ -488,7 +548,10 @@ impl RouterInner { // NOTE: Maybe we'd better record all errors and send them to the client? try_send!( self.scheduler, - Task::FatalError(task_name.to_owned(), Box::new(e)) + Task::FatalError( + TaskSelector::ByName(task_name.to_owned()), + Box::new(e) + ) ); } return None; @@ -658,6 +721,8 @@ pub struct StreamTaskInfo { pub(crate) task: StreamTask, /// support external storage. eg local/s3. pub(crate) storage: Arc, + /// The listening range of the task. + ranges: Vec<(Vec, Vec)>, /// The parent directory of temporary files. temp_dir: PathBuf, /// The temporary file index. Both meta (m prefixed keys) and data (t prefixed keys). 
@@ -714,6 +779,7 @@ impl StreamTaskInfo { temp_dir: PathBuf, task: StreamTask, flush_interval: Duration, + ranges: Vec<(Vec, Vec)>, ) -> Result { tokio::fs::create_dir_all(&temp_dir).await?; let storage = Arc::from(create_storage( @@ -724,6 +790,7 @@ impl StreamTaskInfo { task, storage, temp_dir, + ranges, min_resolved_ts: TimeStamp::max(), files: SlotMap::default(), flushing_files: RwLock::default(), @@ -1527,6 +1594,7 @@ mod tests { tmp_dir.path().to_path_buf(), stream_task, Duration::from_secs(300), + vec![(vec![], vec![])], ) .await .unwrap(); @@ -1768,7 +1836,7 @@ mod tests { assert!( messages.iter().any(|task| { if let Task::FatalError(name, _err) = task { - return name == "flush_failure"; + return matches!(name.reference(), TaskSelectorRef::ByName("flush_failure")); } false }), @@ -1797,4 +1865,69 @@ mod tests { let begin_ts = DataFile::decode_begin_ts(value).unwrap(); assert_eq!(begin_ts, start_ts); } + + #[test] + fn test_selector() { + type DummyTask<'a> = (&'a str, &'a [(&'a [u8], &'a [u8])]); + + #[derive(Debug, Clone, Copy)] + struct Case<'a /* 'static */> { + tasks: &'a [DummyTask<'a>], + selector: TaskSelectorRef<'a>, + selected: &'a [&'a str], + } + + let cases = [ + Case { + tasks: &[("Zhao", &[(b"", b"")]), ("Qian", &[(b"", b"")])], + selector: TaskSelectorRef::ByName("Zhao"), + selected: &["Zhao"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ], + selector: TaskSelectorRef::ByKey(b"0001"), + selected: &["Zhao"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ("Sun", &[(b"0004", b"1024")]), + ("Li", &[(b"1001", b"2048")]), + ], + selector: TaskSelectorRef::ByRange(b"1001", b"2000"), + selected: &["Sun", "Li"], + }, + Case { + tasks: &[ + ("Zhao", &[(b"0001", b"1000"), (b"2000", b"")]), + ("Qian", &[(b"0002", b"1000")]), + ("Sun", &[(b"0004", b"1024")]), + ("Li", &[(b"1001", b"2048")]), + ], + selector: 
TaskSelectorRef::All, + selected: &["Zhao", "Qian", "Sun", "Li"], + }, + ]; + + fn run(c: Case<'static>) { + assert!( + c.tasks + .iter() + .filter(|(name, range)| c.selector.matches(name, range.iter().copied())) + .map(|(name, _)| name) + .collect::>() + == c.selected.iter().collect::>(), + "case = {:?}", + c + ) + } + + for case in cases { + run(case) + } + } } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 68c025b16c2..28c1ed6dd78 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -11,7 +11,7 @@ use std::{ use crossbeam::channel::{Receiver as SyncReceiver, Sender as SyncSender}; use crossbeam_channel::SendError; use engine_traits::KvEngine; -use error_code::{backup_stream::OBSERVE_CANCELED, ErrorCodeExt}; +use error_code::ErrorCodeExt; use futures::FutureExt; use kvproto::metapb::Region; use pd_client::PdClient; @@ -36,7 +36,7 @@ use crate::{ metadata::{store::MetaStore, CheckpointProvider, MetadataClient}, metrics, observer::BackupStreamObserver, - router::Router, + router::{Router, TaskSelector}, subscription_track::SubscriptionTracer, try_send, utils::{self, CallbackWaitGroup, Work}, @@ -45,12 +45,14 @@ use crate::{ type ScanPool = yatp::ThreadPool; +const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; + /// a request for doing initial scanning. struct ScanCmd { region: Region, handle: ObserveHandle, last_checkpoint: TimeStamp, - work: Work, + _work: Work, } /// The response of requesting resolve the new checkpoint of regions. @@ -82,6 +84,25 @@ impl ResolvedRegions { } } +/// returns whether the error should be retried. +/// for some errors, like `epoch not match` or `not leader`, +/// implies that the region is drifting, and no more need to be observed by us. 
+fn should_retry(err: &Error) -> bool { + match err.without_context() { + Error::RaftRequest(pbe) => { + !(pbe.has_epoch_not_match() + || pbe.has_not_leader() + || pbe.get_message().contains("stale observe id") + || pbe.has_region_not_found()) + } + Error::RaftStore(raftstore::Error::RegionNotFound(_)) + | Error::RaftStore(raftstore::Error::NotLeader(..)) + | Error::ObserveCanceled(..) + | Error::RaftStore(raftstore::Error::EpochNotMatch(..)) => false, + _ => true, + } +} + /// the abstraction over a "DB" which provides the initial scanning. trait InitialScan: Clone { fn do_initial_scan( @@ -89,8 +110,9 @@ trait InitialScan: Clone { region: &Region, start_ts: TimeStamp, handle: ObserveHandle, - on_finish: impl FnOnce() + Send + 'static, ) -> Result; + + fn handle_fatal_error(&self, region: &Region, err: Error); } impl InitialScan for InitialDataLoader @@ -104,35 +126,73 @@ where region: &Region, start_ts: TimeStamp, handle: ObserveHandle, - on_finish: impl FnOnce() + Send + 'static, ) -> Result { let region_id = region.get_id(); + // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep retrying here? let snap = self.observe_over_with_retry(region, move || { ChangeObserver::from_pitr(region_id, handle.clone()) })?; - let stat = self.do_initial_scan(region, start_ts, snap, on_finish)?; + let stat = self.do_initial_scan(region, start_ts, snap)?; Ok(stat) } + + fn handle_fatal_error(&self, region: &Region, err: Error) { + try_send!( + self.scheduler, + Task::FatalError( + TaskSelector::ByRange( + region.get_start_key().to_owned(), + region.get_end_key().to_owned() + ), + Box::new(err), + ) + ); + } } impl ScanCmd { /// execute the initial scanning via the specificated [`InitialDataLoader`]. - fn exec_by(self, initial_scan: impl InitialScan) -> Result<()> { + fn exec_by(&self, initial_scan: impl InitialScan) -> Result<()> { let Self { region, handle, last_checkpoint, - work, + .. 
} = self; let begin = Instant::now_coarse(); - let stat = - initial_scan.do_initial_scan(®ion, last_checkpoint, handle, move || drop(work))?; + let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; info!("initial scanning of leader transforming finished!"; "takes" => ?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); utils::record_cf_stat("default", &stat.data); Ok(()) } + + /// execute the command, when meeting error, retrying. + fn exec_by_with_retry(self, init: impl InitialScan, cancel: &AtomicBool) { + let mut retry_time = INITIAL_SCAN_FAILURE_MAX_RETRY_TIME; + loop { + if cancel.load(Ordering::SeqCst) { + return; + } + match self.exec_by(init.clone()) { + Err(err) if should_retry(&err) && retry_time > 0 => { + // NOTE: blocking this thread may stick the process. + // Maybe spawn a task to tokio and reschedule the task then? + std::thread::sleep(Duration::from_millis(500)); + warn!("meet retryable error"; "err" => %err, "retry_time" => retry_time); + retry_time -= 1; + continue; + } + Err(err) if retry_time == 0 => { + init.handle_fatal_error(&self.region, err.context("retry time exceeds")); + break; + } + // Errors which `should_retry` returns false means they can be ignored. 
+ Err(_) | Ok(_) => break, + } + } + } } fn scan_executor_loop( @@ -150,15 +210,11 @@ fn scan_executor_loop( if canceled.load(Ordering::Acquire) { return; } + metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["executing"]) .inc(); - let region_id = cmd.region.get_id(); - if let Err(err) = cmd.exec_by(init.clone()) { - if err.error_code() != OBSERVE_CANCELED { - err.report(format!("during initial scanning of region {}", region_id)); - } - } + cmd.exec_by_with_retry(init.clone(), &canceled); metrics::PENDING_INITIAL_SCAN_LEN .with_label_values(&["executing"]) .dec(); @@ -370,13 +426,21 @@ where if err.error_code() == error_code::backup_stream::OBSERVE_CANCELED { return; } + let (start, end) = ( + region.get_start_key().to_owned(), + region.get_end_key().to_owned(), + ); match self.retry_observe(region, handle).await { Ok(()) => {} Err(e) => { - self.fatal( - e, - format!("While retring to observe region, origin error is {}", err), + let msg = Task::FatalError( + TaskSelector::ByRange(start, end), + Box::new(Error::Contextual { + context: format!("retry meet error, origin error is {}", err), + inner_error: Box::new(e), + }), ); + try_send!(self.scheduler, msg); } } } @@ -384,7 +448,7 @@ where let now = Instant::now(); let timedout = self.wait(Duration::from_secs(30)).await; if timedout { - warn!("waiting for initial scanning done timed out, forcing progress(with risk of data loss)!"; + warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } let cps = self.subs.resolve_with(min_ts); @@ -399,10 +463,6 @@ where } } - fn fatal(&self, err: Error, message: String) { - try_send!(self.scheduler, Task::FatalError(message, Box::new(err))); - } - async fn refresh_resolver(&self, region: &Region) { let need_refresh_all = !self.subs.try_update_region(region); @@ -540,7 +600,7 @@ where async fn get_last_checkpoint_of(&self, task: &str, region: &Region) -> Result { let meta_cli = 
self.meta_cli.clone(); let cp = meta_cli.get_region_checkpoint(task, region).await?; - info!("got region checkpoint"; "region_id" => %region.get_id(), "checkpoint" => ?cp); + debug!("got region checkpoint"; "region_id" => %region.get_id(), "checkpoint" => ?cp); if matches!(cp.provider, CheckpointProvider::Global) { metrics::STORE_CHECKPOINT_TS .with_label_values(&[task]) @@ -574,7 +634,7 @@ where region: region.clone(), handle, last_checkpoint, - work: self.scans.clone().work(), + _work: self.scans.clone().work(), }) } @@ -602,11 +662,13 @@ mod test { _region: &Region, _start_ts: txn_types::TimeStamp, _handle: raftstore::coprocessor::ObserveHandle, - on_finish: impl FnOnce() + Send + 'static, ) -> crate::errors::Result { - on_finish(); Ok(Statistics::default()) } + + fn handle_fatal_error(&self, region: &Region, err: crate::errors::Error) { + panic!("fatal {:?} {}", region, err) + } } #[cfg(feature = "failpoints")] @@ -641,7 +703,7 @@ mod test { handle: Default::default(), last_checkpoint: Default::default(), // Note: Maybe make here a Box or some other trait? - work: wg.work(), + _work: wg.work(), }) .unwrap() } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 678b571f3b5..486ce6ae0f8 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -34,6 +34,7 @@ use txn_types::{Key, Lock, LockType}; use crate::{ errors::{Error, Result}, metadata::store::BoxFuture, + router::TaskSelector, Task, }; @@ -397,7 +398,7 @@ pub fn handle_on_event_result(doom_messenger: &Scheduler, result: Vec<(Str try_send!( doom_messenger, Task::FatalError( - task, + TaskSelector::ByName(task), Box::new(err.context("failed to record event to local temporary files")) ) ); @@ -536,6 +537,38 @@ pub fn with_record_read_throughput(f: impl FnOnce() -> T) -> (T, u64) { (r, recorder.end()) } +/// test whether a key is in the range. +/// end key is exclusive. +/// empty end key means infinity. 
+pub fn is_in_range(key: &[u8], range: (&[u8], &[u8])) -> bool { + match range { + (start, b"") => key >= start, + (start, end) => key >= start && key < end, + } +} + +/// test whether two ranges overlapping. +/// end key is exclusive. +/// empty end key means infinity. +pub fn is_overlapping(range: (&[u8], &[u8]), range2: (&[u8], &[u8])) -> bool { + let (x1, y1) = range; + let (x2, y2) = range2; + match (x1, y1, x2, y2) { + // 1: |__________________| + // 2: |______________________| + (_, b"", _, b"") => true, + // 1: (x1)|__________________| + // 2: |_________________|(y2) + (x1, b"", _, y2) => x1 < y2, + // 1: |________________|(y1) + // 2: (x2)|_________________| + (_, y1, x2, b"") => x2 < y1, + // 1: (x1)|________|(y1) + // 2: (x2)|__________|(y2) + (x1, y1, x2, y2) => x2 < y1 && x1 < y2, + } +} + #[cfg(test)] mod test { use std::{ @@ -548,7 +581,58 @@ mod test { use futures::executor::block_on; - use crate::utils::{CallbackWaitGroup, SegmentMap}; + use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; + + #[test] + fn test_range_functions() { + #[derive(Debug)] + struct InRangeCase<'a> { + key: &'a [u8], + range: (&'a [u8], &'a [u8]), + expected: bool, + } + + let cases = [ + InRangeCase { + key: b"0001", + range: (b"0000", b"0002"), + expected: true, + }, + InRangeCase { + key: b"0003", + range: (b"0000", b"0002"), + expected: false, + }, + InRangeCase { + key: b"0002", + range: (b"0000", b"0002"), + expected: false, + }, + InRangeCase { + key: b"0000", + range: (b"0000", b"0002"), + expected: true, + }, + InRangeCase { + key: b"0018", + range: (b"0000", b""), + expected: true, + }, + InRangeCase { + key: b"0018", + range: (b"0019", b""), + expected: false, + }, + ]; + + for case in cases { + assert!( + is_in_range(case.key, case.range) == case.expected, + "case = {:?}", + case + ); + } + } #[test] fn test_segment_tree() { diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index fccd8a0626a..9ba59a181b2 
100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -600,8 +600,8 @@ mod test { use std::time::Duration; use backup_stream::{ - errors::Error, metadata::MetadataClient, GetCheckpointResult, RegionCheckpointOperation, - RegionSet, Task, + errors::Error, metadata::MetadataClient, router::TaskSelector, GetCheckpointResult, + RegionCheckpointOperation, RegionSet, Task, }; use tikv_util::{box_err, defer, info, HandyRwLock}; use txn_types::TimeStamp; @@ -718,7 +718,7 @@ mod test { endpoint .scheduler() .schedule(Task::FatalError( - "test_fatal_error".to_owned(), + TaskSelector::ByName("test_fatal_error".to_owned()), Box::new(Error::Other(box_err!("everything is alright"))), )) .unwrap(); @@ -845,4 +845,29 @@ mod test { keys.union(&keys2).map(|s| s.as_slice()), ); } + + #[test] + fn initial_scan_failure() { + defer! {{ + fail::remove("scan_and_async_send"); + }} + + let mut suite = SuiteBuilder::new_named("initial_scan_failure") + .nodes(1) + .build(); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg( + "scan_and_async_send", + "1*return(dive into the temporary dream, where the SLA never bothers)", + ) + .unwrap(); + suite.must_register_task(1, "initial_scan_failure"); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("initial_scan_failure"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + ); + } } From 21f00d29c0ae5b0eca8562ce50a9b3bc0d8b9583 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Mon, 18 Jul 2022 10:37:07 +0800 Subject: [PATCH 085/676] raftstore: Implement coprocessor observer post_exec_admin(query) (#12850) ref tikv/tikv#12849 Support new observers post_exec_admin(query). 
Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/cdc/src/delegate.rs | 1 + .../raftstore/src/coprocessor/dispatcher.rs | 37 ++++- components/raftstore/src/coprocessor/mod.rs | 35 ++++- components/raftstore/src/store/fsm/apply.rs | 127 ++++++++++++++++-- components/resolved_ts/src/cmd.rs | 1 + components/resolved_ts/src/observer.rs | 2 +- .../gc_worker/applied_lock_collector.rs | 2 +- 7 files changed, 189 insertions(+), 16 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 55a551490ac..752c068e72a 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -441,6 +441,7 @@ impl Delegate { for cmd in batch.into_iter(self.region_id) { let Cmd { index, + term: _, mut request, mut response, } = cmd; diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 24b79bf4877..cd370e332e3 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -439,6 +439,37 @@ impl CoprocessorHost { } } + /// `post_exec` should be called immediately after we executed one raft command. + /// It notifies observers side effects of this command before execution of the next command, + /// including req/resp, apply state, modified region state, etc. + /// Return true observers think a persistence is necessary. 
+ pub fn post_exec( + &self, + region: &Region, + cmd: &Cmd, + apply_state: &RaftApplyState, + region_state: &RegionState, + ) -> bool { + let mut ctx = ObserverContext::new(region); + if !cmd.response.has_admin_response() { + for observer in &self.registry.query_observers { + let observer = observer.observer.inner(); + if observer.post_exec_query(&mut ctx, cmd, apply_state, region_state) { + return true; + } + } + false + } else { + for observer in &self.registry.admin_observers { + let observer = observer.observer.inner(); + if observer.post_exec_admin(&mut ctx, cmd, apply_state, region_state) { + return true; + } + } + false + } + } + pub fn post_apply_plain_kvs_from_snapshot( &self, region: &Region, @@ -764,7 +795,7 @@ mod tests { assert_all!([&ob.called], &[3]); let mut admin_resp = RaftCmdResponse::default(); admin_resp.set_admin_response(AdminResponse::default()); - host.post_apply(®ion, &Cmd::new(0, admin_req, admin_resp)); + host.post_apply(®ion, &Cmd::new(0, 0, admin_req, admin_resp)); assert_all!([&ob.called], &[6]); let mut query_req = RaftCmdRequest::default(); @@ -774,7 +805,7 @@ mod tests { host.pre_apply(®ion, &query_req); assert_all!([&ob.called], &[15]); let query_resp = RaftCmdResponse::default(); - host.post_apply(®ion, &Cmd::new(0, query_req, query_resp)); + host.post_apply(®ion, &Cmd::new(0, 0, query_req, query_resp)); assert_all!([&ob.called], &[21]); host.on_role_change(®ion, RoleChange::new(StateRole::Leader)); @@ -853,7 +884,7 @@ mod tests { host.pre_apply(®ion, &req); assert_all!([&ob1.called, &ob2.called], &[0, base_score * 2 + 3]); - host.post_apply(®ion, &Cmd::new(0, req.clone(), resp.clone())); + host.post_apply(®ion, &Cmd::new(0, 0, req.clone(), resp.clone())); assert_all!([&ob1.called, &ob2.called], &[0, base_score * 3 + 6]); set_all!(&[&ob2.bypass], false); diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 2dc83c8d7af..b798c7577af 100644 --- 
a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -14,6 +14,7 @@ use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, + raft_serverpb::RaftApplyState, }; use raft::{eraftpb, StateRole}; @@ -74,6 +75,12 @@ impl<'a> ObserverContext<'a> { } } +pub struct RegionState { + pub peer_id: u64, + pub pending_remove: bool, + pub modified_region: Option, +} + pub trait AdminObserver: Coprocessor { /// Hook to call before proposing admin request. fn pre_propose_admin(&self, _: &mut ObserverContext<'_>, _: &mut AdminRequest) -> Result<()> { @@ -91,6 +98,18 @@ pub trait AdminObserver: Coprocessor { fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { false } + + /// Hook to call immediately after exec command + /// Will be a special persistence after this exec if a observer returns true. + fn post_exec_admin( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + ) -> bool { + false + } } pub trait QueryObserver: Coprocessor { @@ -115,6 +134,18 @@ pub trait QueryObserver: Coprocessor { fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request]) -> bool { false } + + /// Hook to call immediately after exec command. + /// Will be a special persistence after this exec if a observer returns true. 
+ fn post_exec_query( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + ) -> bool { + false + } } pub trait ApplySnapshotObserver: Coprocessor { @@ -215,14 +246,16 @@ pub trait RegionChangeObserver: Coprocessor { #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, + pub term: u64, pub request: RaftCmdRequest, pub response: RaftCmdResponse, } impl Cmd { - pub fn new(index: u64, request: RaftCmdRequest, response: RaftCmdResponse) -> Cmd { + pub fn new(index: u64, term: u64, request: RaftCmdRequest, response: RaftCmdResponse) -> Cmd { Cmd { index, + term, request, response, } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index aa57676925c..7ce35f827c5 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -72,7 +72,9 @@ use self::memtrace::*; use super::metrics::*; use crate::{ bytes_capacity, - coprocessor::{Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel}, + coprocessor::{ + Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel, RegionState, + }, store::{ cmd_resp, fsm::RaftPollerBuilder, @@ -292,6 +294,7 @@ pub enum ExecResult { } /// The possible returned value when applying logs. 
+#[derive(Debug)] pub enum ApplyResult { None, Yield, @@ -974,10 +977,11 @@ where let expect_index = self.apply_state.get_applied_index() + 1; if expect_index != entry.get_index() { panic!( - "{} expect index {}, but got {}", + "{} expect index {}, but got {}, ctx {}", self.tag, expect_index, - entry.get_index() + entry.get_index(), + apply_ctx.tag, ); } @@ -1202,7 +1206,8 @@ where apply_ctx.sync_log_hint |= should_sync_log(&cmd); apply_ctx.host.pre_apply(&self.region, &cmd); - let (mut resp, exec_result) = self.apply_raft_cmd(apply_ctx, index, term, &cmd); + let (mut resp, exec_result, should_write) = + self.apply_raft_cmd(apply_ctx, index, term, &cmd); if let ApplyResult::WaitMergeSource(_) = exec_result { return exec_result; } @@ -1218,10 +1223,14 @@ where // store will call it after handing exec result. cmd_resp::bind_term(&mut resp, self.term); let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd)); - let cmd = Cmd::new(index, cmd, resp); + let cmd = Cmd::new(index, term, cmd, resp); apply_ctx .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); + if should_write { + debug!("persist data and apply state"; "region_id" => self.region_id(), "peer_id" => self.id(), "state" => ?self.apply_state); + apply_ctx.commit(self); + } exec_result } @@ -1239,7 +1248,7 @@ where index: u64, term: u64, req: &RaftCmdRequest, - ) -> (RaftCmdResponse, ApplyResult) { + ) -> (RaftCmdResponse, ApplyResult, bool) { // if pending remove, apply should be aborted already. 
assert!(!self.pending_remove); @@ -1289,12 +1298,33 @@ where (resp, exec_result) }; if let ApplyResult::WaitMergeSource(_) = exec_result { - return (resp, exec_result); + return (resp, exec_result, false); } self.apply_state.set_applied_index(index); self.applied_index_term = term; + let cmd = Cmd::new(index, term, req.clone(), resp.clone()); + let should_write = ctx.host.post_exec( + &self.region, + &cmd, + &self.apply_state, + &RegionState { + peer_id: self.id(), + pending_remove: self.pending_remove, + modified_region: match exec_result { + ApplyResult::Res(ref e) => match e { + ExecResult::SplitRegion { ref derived, .. } => Some(derived.clone()), + ExecResult::PrepareMerge { ref region, .. } => Some(region.clone()), + ExecResult::CommitMerge { ref region, .. } => Some(region.clone()), + ExecResult::RollbackMerge { ref region, .. } => Some(region.clone()), + _ => None, + }, + _ => None, + }, + }, + ); + if let ApplyResult::Res(ref exec_result) = exec_result { match *exec_result { ExecResult::ChangePeer(ref cp) => { @@ -1345,7 +1375,7 @@ where } } - (resp, exec_result) + (resp, exec_result, should_write) } fn destroy(&mut self, apply_ctx: &mut ApplyContext) { @@ -4859,6 +4889,14 @@ mod tests { self } + fn prepare_merge(mut self, target: metapb::Region) -> EntryBuilder { + let mut request = AdminRequest::default(); + request.set_cmd_type(AdminCmdType::PrepareMerge); + request.mut_prepare_merge().set_target(target); + self.req.set_admin_request(request); + self + } + fn compact_log(mut self, index: u64, term: u64) -> EntryBuilder { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::CompactLog); @@ -4905,6 +4943,27 @@ mod tests { } impl AdminObserver for ApplyObserver { + fn post_exec_admin( + &self, + _: &mut ObserverContext<'_>, + cmd: &Cmd, + _: &RaftApplyState, + region_state: &RegionState, + ) -> bool { + let request = cmd.request.get_admin_request(); + match request.get_cmd_type() { + AdminCmdType::CompactLog => true, + 
AdminCmdType::CommitMerge + | AdminCmdType::PrepareMerge + | AdminCmdType::RollbackMerge => { + assert!(region_state.modified_region.is_some()); + true + } + AdminCmdType::BatchSplit => true, + _ => false, + } + } + fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, req: &AdminRequest) -> bool { let cmd_type = req.get_cmd_type(); if cmd_type == AdminCmdType::CompactLog @@ -5577,7 +5636,7 @@ mod tests { region_scheduler, coprocessor_host: host, importer, - engine, + engine: engine.clone(), router: router.clone(), store_id: 1, pending_create_peers, @@ -5597,13 +5656,16 @@ mod tests { router.schedule_task(1, Msg::Registration(reg)); let mut index_id = 1; - let put_entry = EntryBuilder::new(1, 1) + let put_entry = EntryBuilder::new(index_id, 1) .put(b"k1", b"v1") + .put(b"k2", b"v2") + .put(b"k3", b"v3") .epoch(1, 3) .build(); router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![put_entry], vec![]))); fetch_apply_res(&rx); + // Phase 1: we test if pre_exec will filter execution of commands correctly. index_id += 1; let compact_entry = EntryBuilder::new(index_id, 1) .compact_log(index_id - 1, 2) @@ -5660,6 +5722,51 @@ mod tests { assert_eq!(apply_res.exec_res.len(), 0); obs.filter_consistency_check.store(false, Ordering::SeqCst); + // Phase 2: we test if post_exec will persist when need. + // We choose BatchSplit in order to make sure `modified_region` is filled. 
+ index_id += 1; + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(true); + splits.mut_requests().push(new_split_req(b"k2", 8, vec![7])); + let split = EntryBuilder::new(index_id, 1) + .split(splits) + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![split], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_index_term, 1); + let (_, r8) = if let ExecResult::SplitRegion { + regions, + derived: _, + new_split_regions: _, + } = apply_res.exec_res.front().unwrap() + { + let r8 = regions.get(0).unwrap(); + let r1 = regions.get(1).unwrap(); + assert_eq!(r8.get_id(), 8); + assert_eq!(r1.get_id(), 1); + (r1, r8) + } else { + panic!("error split exec_res"); + }; + + index_id += 1; + let merge = EntryBuilder::new(index_id, 1) + .prepare_merge(r8.clone()) + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![merge], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index_id); + assert_eq!(apply_res.applied_index_term, 1); + // PrepareMerge will trigger commit. 
+ let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!(apply_res.apply_state, state); + system.shutdown(); } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 8d1cd6e2a90..f561aa07e28 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -49,6 +49,7 @@ impl ChangeLog { .map(|cmd| { let Cmd { index, + term: _, mut request, mut response, } = cmd; diff --git a/components/resolved_ts/src/observer.rs b/components/resolved_ts/src/observer.rs index 483649c36e7..c9decaadc77 100644 --- a/components/resolved_ts/src/observer.rs +++ b/components/resolved_ts/src/observer.rs @@ -185,7 +185,7 @@ mod test { put_cf(CF_WRITE, b"k7", b"v"), put_cf(CF_WRITE, b"k8", b"v"), ]; - let mut cmd = Cmd::new(0, RaftCmdRequest::default(), RaftCmdResponse::default()); + let mut cmd = Cmd::new(0, 0, RaftCmdRequest::default(), RaftCmdResponse::default()); cmd.request.mut_requests().clear(); for put in &data { cmd.request.mut_requests().push(put.clone()); diff --git a/src/server/gc_worker/applied_lock_collector.rs b/src/server/gc_worker/applied_lock_collector.rs index 009b7fbf76c..9c30afc350b 100644 --- a/src/server/gc_worker/applied_lock_collector.rs +++ b/src/server/gc_worker/applied_lock_collector.rs @@ -541,7 +541,7 @@ mod tests { fn make_raft_cmd(requests: Vec) -> Cmd { let mut req = RaftCmdRequest::default(); req.set_requests(requests.into()); - Cmd::new(0, req, RaftCmdResponse::default()) + Cmd::new(0, 0, req, RaftCmdResponse::default()) } fn new_test_collector() -> (AppliedLockCollector, CoprocessorHost) { From 48c7c8fa9e222295d79caa926f8d3e9eb89e9310 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Mon, 18 Jul 2022 10:51:06 +0800 Subject: [PATCH 086/676] raftstore: Avoid printing error log in case sending CaptureChange message failed (#12995) close tikv/tikv#12996 Signed-off-by: MyonKeminta 
Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/msg.rs | 11 +++++++++++ components/raftstore/src/store/transport.rs | 10 ++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 4f1ea017764..e3820a6d3ee 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -635,6 +635,17 @@ impl fmt::Debug for PeerMsg { } } +impl PeerMsg { + /// For some specific kinds of messages, it's actually acceptable if we fail to send them by + /// `significant_send`. This function determines if the current message is acceptable to fail. + pub fn is_send_failure_ignorable(&self) -> bool { + matches!( + self, + PeerMsg::SignificantMsg(SignificantMsg::CaptureChange { .. }) + ) + } +} + pub enum StoreMsg where EK: KvEngine, diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 586b80ed6e5..f64fbae037e 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -6,7 +6,7 @@ use std::sync::mpsc; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; -use tikv_util::error; +use tikv_util::{error, warn}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, @@ -90,7 +90,13 @@ where .force_send(region_id, PeerMsg::SignificantMsg(msg)) { // TODO: panic here once we can detect system is shutting down reliably. - error!("failed to send significant msg"; "msg" => ?msg); + + // Avoid printing an error log if failing to send the message is not a severe problem.
+ if msg.is_send_failure_ignorable() { + warn!("failed to send significant msg"; "msg" => ?msg); + } else { + error!("failed to send significant msg"; "msg" => ?msg); + } return Err(Error::RegionNotFound(region_id)); } From 08f4674a798a23815798c3effe948bca006314ab Mon Sep 17 00:00:00 2001 From: Jarvis Date: Mon, 18 Jul 2022 14:27:07 +0800 Subject: [PATCH 087/676] Sm4 support (#12927) ref tikv/tikv#299, ref tikv/tikv#302, ref tikv/tikv#706, ref tikv/tikv#930, ref tikv/tikv#962, ref tikv/tikv#1656, close tikv/tikv#13041 Add SM4 encryption algorithm. Signed-off-by: Jarvis Zheng Co-authored-by: Xinye Tao --- Cargo.lock | 28 +++++++++++++++------- components/encryption/src/config.rs | 3 +++ components/encryption/src/crypter.rs | 3 +++ components/encryption/src/io.rs | 5 ++++ components/engine_rocks/src/encryption.rs | 1 + components/engine_traits/src/encryption.rs | 1 + etc/config-template.toml | 5 ++-- 7 files changed, 36 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dcf1eb84937..5ddd904e637 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2618,7 +2618,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a5d4ffd2ba337dad0bc99e9fb53bf665864a3f3b" +source = "git+https://github.com/pingcap/kvproto.git#d88fa382391ec305e879be7635e39beae6a19890" dependencies = [ "futures 0.3.15", "grpcio", @@ -2747,7 +2747,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" +source = "git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2766,7 +2766,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" +source = 
"git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" dependencies = [ "bzip2-sys", "cc", @@ -3402,18 +3402,30 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.38" +version = "0.10.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7ae222234c30df141154f159066c5093ff73b63204dcda7121eb082fc56a95" +checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0" dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", "libc 0.2.125", "once_cell", + "openssl-macros", "openssl-sys", ] +[[package]] +name = "openssl-macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "openssl-probe" version = "0.1.2" @@ -3431,9 +3443,9 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.72" +version = "0.9.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e46109c383602735fa0a2e48dd2b7c892b048e1bf69e5c3b1d804b7d9c203cb" +checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", @@ -4571,7 +4583,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#c8878e2df0c7c23d553d345d337d9dda332e2d5a" +source = "git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" dependencies = [ "libc 0.2.125", "librocksdb_sys", diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 8cb779f1cdc..4f83a72855f 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -111,6 +111,7 @@ mod encryption_method_serde { const AES128_CTR: &str = "aes128-ctr"; const AES192_CTR: &str = 
"aes192-ctr"; const AES256_CTR: &str = "aes256-ctr"; + const SM4_CTR: &str = "sm4-ctr"; #[allow(clippy::trivially_copy_pass_by_ref)] pub fn serialize(method: &EncryptionMethod, serializer: S) -> Result @@ -123,6 +124,7 @@ mod encryption_method_serde { EncryptionMethod::Aes128Ctr => serializer.serialize_str(AES128_CTR), EncryptionMethod::Aes192Ctr => serializer.serialize_str(AES192_CTR), EncryptionMethod::Aes256Ctr => serializer.serialize_str(AES256_CTR), + EncryptionMethod::Sm4Ctr => serializer.serialize_str(SM4_CTR), } } @@ -149,6 +151,7 @@ mod encryption_method_serde { AES128_CTR => Ok(EncryptionMethod::Aes128Ctr), AES192_CTR => Ok(EncryptionMethod::Aes192Ctr), AES256_CTR => Ok(EncryptionMethod::Aes256Ctr), + SM4_CTR => Ok(EncryptionMethod::Sm4Ctr), _ => Err(E::invalid_value(Unexpected::Str(value), &self)), } } diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index 9c148e62247..f869817de2b 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -16,6 +16,7 @@ pub fn encryption_method_to_db_encryption_method(method: EncryptionMethod) -> DB EncryptionMethod::Aes128Ctr => DBEncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr => DBEncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr => DBEncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr => DBEncryptionMethod::Sm4Ctr, EncryptionMethod::Unknown => DBEncryptionMethod::Unknown, } } @@ -26,6 +27,7 @@ pub fn encryption_method_from_db_encryption_method(method: DBEncryptionMethod) - DBEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, DBEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, DBEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, + DBEncryptionMethod::Sm4Ctr => EncryptionMethod::Sm4Ctr, DBEncryptionMethod::Unknown => EncryptionMethod::Unknown, } } @@ -40,6 +42,7 @@ pub fn get_method_key_length(method: EncryptionMethod) -> usize { EncryptionMethod::Aes128Ctr => 16, EncryptionMethod::Aes192Ctr => 
24, EncryptionMethod::Aes256Ctr => 32, + EncryptionMethod::Sm4Ctr => 16, unknown => panic!("bad EncryptionMethod {:?}", unknown), } } diff --git a/components/encryption/src/io.rs b/components/encryption/src/io.rs index 6f7d28f61b8..d62542cb16a 100644 --- a/components/encryption/src/io.rs +++ b/components/encryption/src/io.rs @@ -377,6 +377,7 @@ pub fn create_aes_ctr_crypter( EncryptionMethod::Aes128Ctr => OCipher::aes_128_ctr(), EncryptionMethod::Aes192Ctr => OCipher::aes_192_ctr(), EncryptionMethod::Aes256Ctr => OCipher::aes_256_ctr(), + EncryptionMethod::Sm4Ctr => OCipher::sm4_ctr(), }; let crypter = OCrypter::new(cipher, mode, key, Some(iv.as_slice()))?; Ok((cipher, crypter)) @@ -525,6 +526,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let ivs = [ Iv::new_ctr(), @@ -593,6 +595,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; OsRng.fill_bytes(&mut plaintext); @@ -628,6 +631,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let mut plaintext = vec![0; 10240]; OsRng.fill_bytes(&mut plaintext); @@ -700,6 +704,7 @@ mod tests { EncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, ]; let iv = Iv::new_ctr(); let mut plain_text = vec![0; 10240]; diff --git a/components/engine_rocks/src/encryption.rs b/components/engine_rocks/src/encryption.rs index a8ec54673b3..94c13e811a9 100644 --- a/components/engine_rocks/src/encryption.rs +++ b/components/engine_rocks/src/encryption.rs @@ -64,6 +64,7 @@ fn convert_encryption_method(input: EncryptionMethod) -> DBEncryptionMethod { EncryptionMethod::Aes128Ctr => DBEncryptionMethod::Aes128Ctr, EncryptionMethod::Aes192Ctr => DBEncryptionMethod::Aes192Ctr, EncryptionMethod::Aes256Ctr => 
DBEncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr => DBEncryptionMethod::Sm4Ctr, EncryptionMethod::Unknown => DBEncryptionMethod::Unknown, } } diff --git a/components/engine_traits/src/encryption.rs b/components/engine_traits/src/encryption.rs index 51b19c05907..41a0f97fb36 100644 --- a/components/engine_traits/src/encryption.rs +++ b/components/engine_traits/src/encryption.rs @@ -53,4 +53,5 @@ pub enum EncryptionMethod { Aes128Ctr = 2, Aes192Ctr = 3, Aes256Ctr = 4, + Sm4Ctr = 5, } diff --git a/etc/config-template.toml b/etc/config-template.toml index b63fe2ce235..a19533b7847 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1101,8 +1101,9 @@ ## Configurations for encryption at rest. Experimental. [security.encryption] ## Encryption method to use for data files. -## Possible values are "plaintext", "aes128-ctr", "aes192-ctr" and "aes256-ctr". Value other than -## "plaintext" means encryption is enabled, in which case master key must be specified. +## Possible values are "plaintext", "aes128-ctr", "aes192-ctr", "aes256-ctr" and "sm4-ctr". +## Value other than "plaintext" means encryption is enabled, in which case +## master key must be specified. # data-encryption-method = "plaintext" ## Specifies how often TiKV rotates data encryption key. 
From 7dc2e017b407538b1a3ce19f0345c42d712c51dc Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Mon, 18 Jul 2022 15:43:07 +0800 Subject: [PATCH 088/676] log-backup: update global-checkpoint to storage periodically (#13035) ref tikv/tikv#1, ref tikv/tikv#12895 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 33 +++++++ components/backup-stream/src/router.rs | 117 ++++++++++++++++++++++- 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 958df7286a7..51e04023d60 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -801,6 +801,32 @@ where })); } + fn on_update_global_checkpoint(&self, task: String) { + self.pool.block_on(async move { + let ts = self.meta_client.global_progress_of_task(&task).await; + match ts { + Ok(global_checkpoint) => { + if let Err(e) = self + .range_router + .update_global_checkpoint(&task, global_checkpoint, self.store_id) + .await + { + warn!("backup stream failed to update global checkpoint."; + "task" => ?task, + "err" => ?e + ); + } + } + Err(e) => { + warn!("backup stream failed to get global checkpoint."; + "task" => ?task, + "err" => ?e + ); + } + } + }); + } + /// Modify observe over some region. /// This would register the region to the RaftStore. pub fn on_modify_observe(&self, op: ObserveOp) { @@ -839,6 +865,7 @@ where Task::MarkFailover(t) => self.failover_time = Some(t), Task::FlushWithMinTs(task, min_ts) => self.on_flush_with_min_ts(task, min_ts), Task::RegionCheckpointsOp(s) => self.handle_region_checkpoints_op(s), + Task::UpdateGlobalCheckpoint(task) => self.on_update_global_checkpoint(task), } } @@ -958,6 +985,8 @@ pub enum Task { FlushWithMinTs(String, TimeStamp), /// The command for getting region checkpoints. 
RegionCheckpointsOp(RegionCheckpointOperation), + /// update global-checkpoint-ts to storage. + UpdateGlobalCheckpoint(String), } #[derive(Debug)] @@ -1054,6 +1083,9 @@ impl fmt::Debug for Task { .field(arg1) .finish(), Self::RegionCheckpointsOp(s) => f.debug_tuple("GetRegionCheckpoints").field(s).finish(), + Self::UpdateGlobalCheckpoint(task) => { + f.debug_tuple("UpdateGlobalCheckpoint").field(task).finish() + } } } } @@ -1090,6 +1122,7 @@ impl Task { Task::MarkFailover(_) => "mark_failover", Task::FlushWithMinTs(..) => "flush_with_min_ts", Task::RegionCheckpointsOp(..) => "get_checkpoints", + Task::UpdateGlobalCheckpoint(..) => "update_global_checkpoint", } } } diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 7a2c895edb2..9812d4ed95f 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -8,7 +8,7 @@ use std::{ path::{Path, PathBuf}, result, sync::{ - atomic::{AtomicBool, AtomicPtr, AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicPtr, AtomicU64, AtomicUsize, Ordering}, Arc, RwLock as SyncRwLock, }, time::Duration, @@ -564,9 +564,28 @@ impl RouterInner { } } + pub async fn update_global_checkpoint( + &self, + task_name: &str, + global_checkpoint: u64, + store_id: u64, + ) -> Result<()> { + let t = self.get_task_info(task_name).await?; + t.update_global_checkpoint(global_checkpoint, store_id) + .await?; + Ok(()) + } + /// tick aims to flush log/meta to extern storage periodically. pub async fn tick(&self) { for (name, task_info) in self.tasks.lock().await.iter() { + if let Err(e) = self + .scheduler + .schedule(Task::UpdateGlobalCheckpoint(name.to_string())) + { + error!("backup stream schedule task failed"; "error" => ?e); + } + // if stream task need flush this time, schedule Task::Flush, or update time justly. 
if task_info.should_flush() && task_info.set_flushing_status_cas(false, true).is_ok() { info!( @@ -745,6 +764,8 @@ pub struct StreamTaskInfo { flushing: AtomicBool, /// This counts how many times this task has failed to flush. flush_fail_count: AtomicUsize, + /// global checkpoint ts for this task. + global_checkpoint_ts: AtomicU64, } impl Drop for StreamTaskInfo { @@ -786,6 +807,7 @@ impl StreamTaskInfo { task.info.get_storage(), BackendConfig::default(), )?); + let start_ts = task.info.get_start_ts(); Ok(Self { task, storage, @@ -799,6 +821,7 @@ impl StreamTaskInfo { total_size: AtomicUsize::new(0), flushing: AtomicBool::new(false), flush_fail_count: AtomicUsize::new(0), + global_checkpoint_ts: AtomicU64::new(start_ts), }) } @@ -1074,6 +1097,42 @@ impl StreamTaskInfo { result } + + pub async fn flush_global_checkpoint(&self, store_id: u64) -> Result<()> { + let filename = format!("v1/global_checkpoint/{}.ts", store_id); + let buff = self + .global_checkpoint_ts + .load(Ordering::SeqCst) + .to_le_bytes(); + self.storage + .write( + &filename, + UnpinReader(Box::new(Cursor::new(buff))), + buff.len() as _, + ) + .await?; + Ok(()) + } + + pub async fn update_global_checkpoint( + &self, + global_checkpoint: u64, + store_id: u64, + ) -> Result<()> { + let last_global_checkpoint = self.global_checkpoint_ts.load(Ordering::SeqCst); + if last_global_checkpoint < global_checkpoint { + let r = self.global_checkpoint_ts.compare_exchange( + last_global_checkpoint, + global_checkpoint, + Ordering::SeqCst, + Ordering::SeqCst, + ); + if r.is_ok() { + self.flush_global_checkpoint(store_id).await?; + } + } + Ok(()) + } } /// A opened log file with some metadata. 
@@ -1930,4 +1989,60 @@ mod tests { run(case) } } + + #[tokio::test] + async fn test_update_global_checkpoint() { + // create local storage + let tmp_dir = tempfile::tempdir().unwrap(); + let backend = external_storage_export::make_local_backend(tmp_dir.path()); + + // build a StreamTaskInfo + let mut task_info = StreamBackupTaskInfo::default(); + task_info.set_storage(backend); + let stream_task = StreamTask { + info: task_info, + is_paused: false, + }; + let task = StreamTaskInfo::new( + tmp_dir.path().to_path_buf(), + stream_task, + Duration::from_secs(300), + vec![(vec![], vec![])], + ) + .await + .unwrap(); + task.global_checkpoint_ts.store(10001, Ordering::SeqCst); + + // test no need to update global checkpoint + let store_id = 3; + let mut global_checkpoint = 10000; + let r = task + .update_global_checkpoint(global_checkpoint, store_id) + .await; + assert_eq!(r.is_ok(), true); + assert_eq!(task.global_checkpoint_ts.load(Ordering::SeqCst), 10001); + + // test update global checkpoint + global_checkpoint = 10002; + let r = task + .update_global_checkpoint(global_checkpoint, store_id) + .await; + assert_eq!(r.is_ok(), true); + assert_eq!( + task.global_checkpoint_ts.load(Ordering::SeqCst), + global_checkpoint + ); + + let filename = format!("v1/global_checkpoint/{}.ts", store_id); + let filepath = tmp_dir.as_ref().join(filename); + let exist = file_system::file_exists(filepath.clone()); + assert_eq!(exist, true); + + let buff = file_system::read(filepath).unwrap(); + assert_eq!(buff.len(), 8); + let mut ts = [b'0'; 8]; + ts.copy_from_slice(&buff); + let ts = u64::from_le_bytes(ts); + assert_eq!(ts, global_checkpoint); + } } From 9b1f195af8a14740ab611d630da2ab66ce105089 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Mon, 18 Jul 2022 17:39:08 +0800 Subject: [PATCH 089/676] raftstore: pub `check_sst_for_ingestion` (#13040) ref tikv/tikv#12849 pub `check_sst_for_ingestion` Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- 
components/raftstore/src/store/fsm/apply.rs | 5 ++++- components/raftstore/src/store/fsm/mod.rs | 8 ++++---- components/raftstore/src/store/mod.rs | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 7ce35f827c5..16e039dd640 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -2855,7 +2855,10 @@ pub fn is_conf_change_cmd(msg: &RaftCmdRequest) -> bool { req.has_change_peer() || req.has_change_peer_v2() } -fn check_sst_for_ingestion(sst: &SstMeta, region: &Region) -> Result<()> { +/// This function is used to check whether an sst is valid for ingestion. +/// +/// The `sst` must have epoch and range matched with `region`. +pub fn check_sst_for_ingestion(sst: &SstMeta, region: &Region) -> Result<()> { let uuid = sst.get_uuid(); if let Err(e) = UuidBuilder::from_slice(uuid) { return Err(box_err!("invalid uuid {:?}: {:?}", uuid, e)); diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 731ad5209b4..7aa93867158 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -11,10 +11,10 @@ pub mod store; pub use self::{ apply::{ - create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, ApplyRes, ApplyRouter, - Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, ChangePeer, ExecResult, - GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, Registration, - TaskRes as ApplyTaskRes, + check_sst_for_ingestion, create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, + ApplyRes, ApplyRouter, Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, + ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, + Registration, TaskRes as ApplyTaskRes, }, peer::{DestroyPeerJob, PeerFsm}, store::{ diff --git a/components/raftstore/src/store/mod.rs 
b/components/raftstore/src/store/mod.rs index b1b8da54e2b..64c70bbc2e7 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -33,7 +33,7 @@ pub use self::{ }, compaction_guard::CompactionGuardGeneratorFactory, config::Config, - fsm::{DestroyPeerJob, RaftRouter, StoreInfo}, + fsm::{check_sst_for_ingestion, DestroyPeerJob, RaftRouter, StoreInfo}, hibernate_state::{GroupState, HibernateState}, memory::*, metrics::RAFT_ENTRY_FETCHES_VEC, From 190f4634872ae4f78d7f8a51a7450176f20d41f8 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Mon, 18 Jul 2022 21:31:07 +0800 Subject: [PATCH 090/676] log-backup: fix the missing sha256 calculation in a flush retry (#13033) ref tikv/tikv#208, close tikv/tikv#13034 Signed-off-by: 3pointer --- components/backup-stream/src/router.rs | 59 ++++++++++++++++++-------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 9812d4ed95f..1ad4c4ad4ca 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -747,7 +747,7 @@ pub struct StreamTaskInfo { /// The temporary file index. Both meta (m prefixed keys) and data (t prefixed keys). files: SlotMap, /// flushing_files contains files pending flush. - flushing_files: RwLock)>>, + flushing_files: RwLock, DataFileInfo)>>, /// last_flush_ts represents last time this task flushed to storage. last_flush_time: AtomicPtr, /// flush_interval represents the tick interval of flush, setting by users. @@ -774,6 +774,7 @@ impl Drop for StreamTaskInfo { .flushing_files .get_mut() .drain(..) + .map(|(a, b, _)| (a, b)) .chain(self.files.get_mut().drain()) .map(|(_, f)| f.into_inner().local_path) .map(std::fs::remove_file) @@ -880,7 +881,7 @@ impl StreamTaskInfo { pub async fn generate_metadata(&self, store_id: u64) -> Result { let w = self.flushing_files.read().await; // Let's flush all files first... 
- futures::future::join_all(w.iter().map(|(_, f)| async move { + futures::future::join_all(w.iter().map(|(_, f, _)| async move { let file = &mut f.lock().await.inner; file.flush().await?; file.get_ref().sync_all().await?; @@ -893,10 +894,8 @@ impl StreamTaskInfo { let mut metadata = MetadataInfo::with_capacity(w.len()); metadata.set_store_id(store_id); - for (file_key, data_file) in w.iter() { - let mut data_file = data_file.lock().await; - let file_meta = data_file.generate_metadata(file_key, store_id)?; - metadata.push(file_meta) + for (_, _, file_meta) in w.iter() { + metadata.push(file_meta.to_owned()) } Ok(metadata) } @@ -930,22 +929,27 @@ impl StreamTaskInfo { } /// move need-flushing files to flushing_files. - pub async fn move_to_flushing_files(&self) -> &Self { + pub async fn move_to_flushing_files(&self, store_id: u64) -> Result<&Self> { // if flushing_files is not empty, which represents this flush is a retry operation. if !self.flushing_files.read().await.is_empty() { - return self; + return Ok(self); } let mut w = self.files.write().await; let mut fw = self.flushing_files.write().await; for (k, v) in w.drain() { - fw.push((k, v)); + // we should generate file metadata(calculate sha256) when moving file. + // because sha256 calculation is a unsafe move operation. + // we cannot re-calculate it in retry. + // TODO refactor move_to_flushing_files and generate_metadata + let file_meta = v.lock().await.generate_metadata(&k, store_id)?; + fw.push((k, v, file_meta)); } - self + Ok(self) } pub async fn clear_flushing_files(&self) { - for (_, v) in self.flushing_files.write().await.drain(..) { + for (_, v, _) in self.flushing_files.write().await.drain(..) 
{ let data_file = v.lock().await; debug!("removing data file"; "size" => %data_file.file_size, "name" => %data_file.local_path.display()); self.total_size @@ -1000,7 +1004,7 @@ impl StreamTaskInfo { for batch_files in files.chunks(FLUSH_LOG_CONCURRENT_BATCH_COUNT) { let futs = batch_files .iter() - .map(|(_, v)| Self::flush_log_file_to(storage.clone(), v)); + .map(|(_, v, _)| Self::flush_log_file_to(storage.clone(), v)); futures::future::try_join_all(futs).await?; } @@ -1046,8 +1050,8 @@ impl StreamTaskInfo { // generate meta data and prepare to flush to storage let mut metadata_info = self - .move_to_flushing_files() - .await + .move_to_flushing_files(store_id) + .await? .generate_metadata(store_id) .await?; metadata_info.min_resolved_ts = metadata_info @@ -1580,8 +1584,8 @@ mod tests { let end_ts = TimeStamp::physical_now(); let files = router.tasks.lock().await.get("dummy").unwrap().clone(); let meta = files - .move_to_flushing_files() - .await + .move_to_flushing_files(1) + .await? .generate_metadata(1) .await?; assert_eq!(meta.files.len(), 3, "test file len = {}", meta.files.len()); @@ -1596,6 +1600,25 @@ mod tests { start_ts, end_ts ); + + // in some case when flush failed to write files to storage. + // we may run `generate_metadata` again with same files. + let another_meta = files + .move_to_flushing_files(1) + .await? + .generate_metadata(1) + .await?; + + assert_eq!(meta.files.len(), another_meta.files.len()); + for i in 0..meta.files.len() { + let file1 = meta.files.get(i).unwrap(); + let file2 = another_meta.files.get(i).unwrap(); + // we have to make sure two times sha256 of file must be the same. + assert_eq!(file1.sha256, file2.sha256); + assert_eq!(file1.start_key, file2.start_key); + assert_eq!(file1.end_key, file2.end_key); + } + files.flush_log().await?; files.flush_meta(meta).await?; files.clear_flushing_files().await; @@ -1840,8 +1863,8 @@ mod tests { router .get_task_info("cleanup_test") .await? 
- .move_to_flushing_files() - .await; + .move_to_flushing_files(1) + .await?; write_simple_data(&router).await; let mut w = walkdir::WalkDir::new(&tmp).into_iter(); assert!(w.next().is_some(), "the temp files doesn't created"); From dfb8559444e85c6eafe22435e22c98f28bb40436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 19 Jul 2022 10:33:07 +0800 Subject: [PATCH 091/676] log-backup: added some new metrics for log backup (#13048) ref tikv/tikv#12534 Added the advancer metrics. Signed-off-by: Yu Juncen Co-authored-by: zhangjinpeng1987 --- metrics/grafana/tikv_details.json | 1373 ++++++++++++++++++++++++++--- 1 file changed, 1249 insertions(+), 124 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index b07aff345a7..adb398824ca 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -40123,7 +40123,7 @@ "h": 1, "w": 24, "x": 0, - "y": 49 + "y": 54 }, "id": 13016, "panels": [ @@ -40177,7 +40177,7 @@ "h": 4, "w": 5, "x": 0, - "y": 50 + "y": 55 }, "id": 14361, "options": { @@ -40195,7 +40195,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, @@ -40239,7 +40239,7 @@ "h": 8, "w": 8, "x": 5, - "y": 50 + "y": 55 }, "id": 14507, "options": { @@ -40257,7 +40257,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, @@ -40300,7 +40300,7 @@ "h": 8, "w": 8, "x": 13, - "y": 50 + "y": 55 }, "id": 14363, "options": { @@ -40318,11 +40318,11 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, - "expr": "increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m])", + "expr": "round(increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m]))", "instant": true, "interval": "", 
"legendFormat": "{{ instance }}", @@ -40361,7 +40361,7 @@ "h": 2, "w": 3, "x": 21, - "y": 50 + "y": 55 }, "id": 14508, "options": { @@ -40379,7 +40379,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, @@ -40422,7 +40422,7 @@ "h": 3, "w": 3, "x": 21, - "y": 52 + "y": 57 }, "id": 14362, "options": { @@ -40440,7 +40440,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, @@ -40517,9 +40517,9 @@ }, "gridPos": { "h": 4, - "w": 5, + "w": 2, "x": 0, - "y": 54 + "y": 59 }, "id": 14907, "options": { @@ -40537,7 +40537,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, @@ -40551,10 +40551,70 @@ ], "timeFrom": null, "timeShift": null, - "title": "Log Backup Task Status", + "title": "Task Status", "transformations": [], "type": "stat" }, + { + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "dark-blue", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 2, + "y": 59 + }, + "id": 15361, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "name" + }, + "pluginVersion": "7.5.11", + "targets": [ + { + "exemplar": true, + "expr": "tidb_log_backup_advancer_owner > 0", + "instant": true, + "interval": "", + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Advancer Owner", + "type": "stat" + }, { "datasource": "${DS_TEST-CLUSTER}", "description": "This is the summary of the file count has been 
flushed, summered by the data each TiKV has flushed since last boot. \n**NOTE: The size may get reduced if some of TiKVs reboot.**", @@ -40581,7 +40641,7 @@ "h": 3, "w": 3, "x": 21, - "y": 55 + "y": 60 }, "id": 14911, "options": { @@ -40599,11 +40659,11 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m]))", + "expr": "round(sum(increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m])))", "hide": false, "instant": true, "interval": "", @@ -40637,7 +40697,7 @@ "h": 10, "w": 6, "x": 0, - "y": 58 + "y": 63 }, "hiddenSeries": false, "id": 13262, @@ -40664,7 +40724,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -40675,7 +40735,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"backup_stream|log_backup_scan(_[0-9]+)?\"}[2m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"backup_stream|log-backup-scan(-[0-9]+)?\"}[2m])) by (instance)", "format": "time_series", "hide": false, "interval": "", @@ -40747,7 +40807,7 @@ "h": 10, "w": 6, "x": 6, - "y": 58 + "y": 63 }, "hiddenSeries": false, "id": 12843, @@ -40769,7 +40829,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -40849,7 +40909,7 @@ "h": 10, "w": 6, "x": 12, - "y": 58 + "y": 63 }, "hiddenSeries": false, "id": 14135, @@ -40870,7 +40930,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -40933,65 +40993,93 @@ } }, { 
+ "alert": { + "alertRuleTags": {}, + "conditions": [ + { + "evaluator": { + "params": [ + 600000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "for": "5m", + "frequency": "1m", + "handler": 1, + "name": "Checkpoint Lag Too Huge", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fieldConfig": { "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "min": 1, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 300000 - }, - { - "color": "red", - "value": 900000 - } - ] - }, "unit": "ms" }, "overrides": [] }, + "fill": 1, + "fillGradient": 0, "gridPos": { "h": 10, "w": 6, "x": 18, - "y": 58 + "y": 63 }, + "hiddenSeries": false, "id": 14774, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "center", - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "last" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value_and_name" + "alertThreshold": true }, - "pluginVersion": "7.5.7", + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { "exemplar": true, - "expr": "time() * 1000 - tikv_stream_store_checkpoint_ts{instance=~\"$instance\"} / 262144 > 300000", - "instant": true, + "expr": "time() * 1000 - max(tidb_log_backup_last_checkpoint / 262144 > 0) 
by (task)", + "instant": false, "interval": "", - "legendFormat": "{{ task }}@{{ instance }}", + "legendFormat": "{{ task }}", "refId": "A" }, { @@ -41003,10 +41091,57 @@ "refId": "B" } ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 600000, + "visible": true + } + ], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Abnormal Checkpoint TS Lag", - "type": "stat" + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:228", + "format": "ms", + "label": null, + "logBase": 1, + "max": "3000000", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:229", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, @@ -41025,7 +41160,7 @@ "h": 10, "w": 6, "x": 0, - "y": 68 + "y": 73 }, "hiddenSeries": false, "id": 13100, @@ -41047,7 +41182,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -41127,7 +41262,7 @@ "h": 10, "w": 6, "x": 6, - "y": 68 + "y": 73 }, "hiddenSeries": false, "id": 14630, @@ -41147,7 +41282,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -41163,7 +41298,7 @@ "targets": [ { "exemplar": true, - "expr": "(tikv_stream_observed_region{instance=~\"$instance\", type=\"inc\"} - on(instance) tikv_stream_observed_region{instance=~\"$instance\", type=\"dec\"}) > 0", + "expr": "tikv_stream_observed_region{instance=~\"$instance\"}", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -41172,7 +41307,7 @@ }, { "exemplar": 
true, - "expr": "sum(tikv_stream_observed_region{instance=~\"$instance\", type=\"inc\"} - on(instance) tikv_stream_observed_region{instance=~\"$instance\", type=\"dec\"}) > 0", + "expr": "sum(tikv_stream_observed_region{instance=~\"$instance\"})", "hide": false, "interval": "", "legendFormat": "total", @@ -41228,7 +41363,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The errors met when backing up.", + "description": "The errors met when backing up.\n**They are retryable, don't worry.**", "fieldConfig": { "defaults": {}, "overrides": [] @@ -41239,7 +41374,7 @@ "h": 5, "w": 6, "x": 12, - "y": 68 + "y": 73 }, "hiddenSeries": false, "id": 13101, @@ -41261,7 +41396,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -41350,7 +41485,7 @@ "h": 10, "w": 6, "x": 18, - "y": 68 + "y": 73 }, "hiddenSeries": false, "id": 14910, @@ -41370,7 +41505,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -41388,7 +41523,7 @@ "targets": [ { "exemplar": true, - "expr": "min(tikv_stream_store_checkpoint_ts{instance=~\"$instance\"} / 262144) by (task)", + "expr": "max(tidb_log_backup_last_checkpoint{instance=~\"$instance\"} / 262144 > 0) by (task)", "instant": false, "interval": "", "legendFormat": "{{ task }}", @@ -41463,7 +41598,7 @@ "h": 5, "w": 6, "x": 12, - "y": 73 + "y": 78 }, "hiddenSeries": false, "id": 14908, @@ -41485,7 +41620,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -41593,7 +41728,7 @@ "h": 7, "w": 6, "x": 0, - "y": 78 + "y": 83 }, "heatmap": {}, "hideZeroBuckets": true, @@ -41676,7 +41811,7 @@ "h": 7, "w": 6, "x": 6, - "y": 78 + "y": 83 }, "heatmap": {}, "hideZeroBuckets": true, 
@@ -41759,7 +41894,7 @@ "h": 7, "w": 6, "x": 12, - "y": 78 + "y": 83 }, "heatmap": {}, "hideZeroBuckets": true, @@ -41842,7 +41977,7 @@ "h": 7, "w": 6, "x": 18, - "y": 78 + "y": 83 }, "heatmap": {}, "hideZeroBuckets": true, @@ -41925,7 +42060,7 @@ "h": 7, "w": 6, "x": 0, - "y": 85 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -42008,7 +42143,7 @@ "h": 7, "w": 6, "x": 6, - "y": 85 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -42091,7 +42226,7 @@ "h": 7, "w": 6, "x": 12, - "y": 85 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -42174,7 +42309,7 @@ "h": 7, "w": 6, "x": 18, - "y": 85 + "y": 90 }, "heatmap": {}, "hideZeroBuckets": true, @@ -42238,7 +42373,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "Misc statistics of RocksDB during initial scanning.", + "description": "The internal message type count.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -42249,17 +42384,16 @@ "h": 6, "w": 12, "x": 0, - "y": 92 + "y": 97 }, "hiddenSeries": false, - "id": 14270, + "id": 14914, "legend": { "avg": false, "current": false, "max": false, "min": false, - "rightSide": true, - "show": true, + "show": false, "total": false, "values": false }, @@ -42270,7 +42404,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42281,9 +42415,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_stream_initial_scan_operations{instance=~\"$instance\", op!~\"read_bytes\"}[$__rate_interval])) BY (op, cf) > 0", + "expr": "sum(rate(tikv_log_backup_interal_actor_acting_duration_sec_count{instance=~\"$instance\"}[$__rate_interval])) by (message)", "interval": "", - "legendFormat": "{{ cf }}/{{ op }}", + "legendFormat": "{{ message }}", "queryType": "randomWalk", "refId": "A" } @@ -42292,7 +42426,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Initial Scan RocksDB Operation ", 
+ "title": "Internal Message Type", "tooltip": { "shared": true, "sort": 0, @@ -42352,7 +42486,7 @@ "h": 6, "w": 6, "x": 12, - "y": 92 + "y": 97 }, "hiddenSeries": false, "id": 14912, @@ -42374,7 +42508,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42385,7 +42519,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(histogram_quantile(0.99, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket[10m]))) by (message)", + "expr": "sum(histogram_quantile(0.99, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket{instance=~\"$instance\"}[10m]))) by (message)", "interval": "", "legendFormat": "{{ message }}", "queryType": "randomWalk", @@ -42452,7 +42586,7 @@ "h": 6, "w": 6, "x": 18, - "y": 92 + "y": 97 }, "hiddenSeries": false, "id": 14913, @@ -42472,7 +42606,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42483,7 +42617,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(histogram_quantile(0.9, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket[10m]))) by (message)", + "expr": "sum(histogram_quantile(0.9, rate(tikv_log_backup_interal_actor_acting_duration_sec_bucket{instance=~\"$instance\"}[10m]))) by (message)", "interval": "", "legendFormat": "{{ message }}", "queryType": "randomWalk", @@ -42548,9 +42682,9 @@ "fillGradient": 0, "gridPos": { "h": 6, - "w": 12, + "w": 6, "x": 0, - "y": 98 + "y": 103 }, "hiddenSeries": false, "id": 14271, @@ -42571,7 +42705,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42638,7 +42772,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The internal message type count.", + "description": "Misc statistics of 
RocksDB during initial scanning.", "fieldConfig": { "defaults": {}, "overrides": [] @@ -42648,17 +42782,18 @@ "gridPos": { "h": 6, "w": 6, - "x": 12, - "y": 98 + "x": 6, + "y": 103 }, "hiddenSeries": false, - "id": 14914, + "id": 14270, "legend": { "avg": false, "current": false, "max": false, "min": false, - "show": false, + "rightSide": true, + "show": true, "total": false, "values": false }, @@ -42669,7 +42804,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42680,9 +42815,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_log_backup_interal_actor_acting_duration_sec_count[$__rate_interval])) by (message)", + "expr": "sum(rate(tikv_stream_initial_scan_operations{instance=~\"$instance\", op!~\"read_bytes\"}[$__rate_interval])) BY (op, cf) > 0", "interval": "", - "legendFormat": "{{ message }}", + "legendFormat": "{{ cf }}/{{ op }}", "queryType": "randomWalk", "refId": "A" } @@ -42691,7 +42826,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Internal Message Type", + "title": "Initial Scan RocksDB Operation ", "tooltip": { "shared": true, "sort": 0, @@ -42731,8 +42866,11 @@ } }, { - "aliasColors": {}, - "bars": false, + "aliasColors": { + "leader-changed": "blue", + "region-changed": "purple" + }, + "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", @@ -42746,8 +42884,8 @@ "gridPos": { "h": 6, "w": 6, - "x": 18, - "y": 98 + "x": 12, + "y": 103 }, "hiddenSeries": false, "id": 14915, @@ -42760,6 +42898,107 @@ "total": false, "values": false }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": false + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": 
true, + "expr": "sum(increase(tikv_log_backup_initial_scan_reason{instance=~\"$instance\"}[$__rate_interval])) by (reason)", + "interval": "", + "legendFormat": "{{ message }}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Initial Scanning Trigger Reason", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2608", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2609", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "del": "dark-red", + "put": "green" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 103 + }, + "hiddenSeries": false, + "id": 15176, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, "lines": true, "linewidth": 1, "nullPointMode": "null", @@ -42767,7 +43006,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 2, "points": false, "renderer": "flot", @@ -42778,9 +43017,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_log_backup_initial_scan_reason[$__rate_interval])) by (reason)", + "expr": "sum(rate(tikv_log_backup_metadata_key_operation{instance=~\"$instance\"}[$__rate_interval])) by (type)", "interval": "", - "legendFormat": "{{ message }}", + "legendFormat": 
"{{ type }}", "queryType": "randomWalk", "refId": "A" } @@ -42789,7 +43028,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Initial Scanning Trigger Reason Rate", + "title": "Region Checkpoint Key Putting", "tooltip": { "shared": true, "sort": 0, @@ -42827,6 +43066,892 @@ "align": false, "alignLevel": null } + }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 109 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 15544, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tidb_log_backup_advancer_batch_size_bucket{type=\"checkpoint\"}[$__interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Request Checkpoint Batch Size", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": 0, + "cardRound": 0 + }, + "color": { + "cardColor": "#FF9830", + "colorScale": "linear", + 
"colorScheme": "interpolateSpectral", + "exponent": 0.5, + "max": null, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 109 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 15716, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tidb_log_backup_advancer_tick_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", step=~\"tick\"}[$__interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Tick Duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": 1, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 1, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": { + "epoch-not-match": "purple", + "not-leader": "blue", + "watch_task": "orange" + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The reason of advancer failed to be advanced.", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 109 + }, + "hiddenSeries": false, + "id": 23763572666, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": 
false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tidb_log_backup_region_request_failure{reason!=\"retryable-scan-region\"}[$__interval])) by (reason)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ reason }}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Region Checkpoint Failure Reason", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "fail": "red", + "success": "green", + "watch_task": "orange" + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The result of getting region checkpoints.", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 109 + }, + "hiddenSeries": false, + "id": 23763572665, + "legend": { + 
"alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:834", + "alias": "fail", + "transform": "negative-Y", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tidb_log_backup_region_request[$__interval])) by (result)", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ result }}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "exemplar": true, + "expr": "", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Request Result", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "watch_task": "orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The internal handling message duration.", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 116 + 
}, + "hiddenSeries": false, + "id": 15359, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1017", + "alias": "consistency-check", + "yaxis": 1 + }, + { + "$$hashKey": "object:1018", + "alias": "get-checkpoints-of-store", + "yaxis": 2 + }, + { + "$$hashKey": "object:1019", + "alias": "get-checkpoints-in-range", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(histogram_quantile(0.99, rate(tidb_log_backup_advancer_tick_duration_sec_bucket[10m]))) by (step)", + "interval": "", + "legendFormat": "{{ step }}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Tick Duration (P99)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "watch_task": "orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The internal handling message duration.", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": 
[] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 116 + }, + "hiddenSeries": false, + "id": 15360, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1091", + "alias": "get-checkpoints-of-store", + "yaxis": 2 + }, + { + "$$hashKey": "object:1092", + "alias": "get-checkpoints-in-range", + "yaxis": 2 + }, + { + "$$hashKey": "object:1093", + "alias": "consistency-check", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(histogram_quantile(0.9, rate(tidb_log_backup_advancer_tick_duration_sec_bucket[10m]))) by (step)", + "interval": "", + "legendFormat": "{{ step }}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Tick Duration (P90)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "watch_task": "orange" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The frequent 
of getting region level checkpoint.", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 116 + }, + "hiddenSeries": false, + "id": 23763572733, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1091", + "alias": "get-checkpoints-of-store", + "yaxis": 2 + }, + { + "$$hashKey": "object:1092", + "alias": "get-checkpoints-in-range", + "yaxis": 2 + }, + { + "$$hashKey": "object:1093", + "alias": "consistency-check", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(tidb_log_backup_advancer_tick_duration_sec_count{step=\"get-regions-in-range\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{ step }} {{ instance }}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Get Region Operation Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + 
"watch_task": "orange" + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The variant of checkpoint group.", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 116 + }, + "hiddenSeries": false, + "id": 23763572734, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1091", + "alias": "get-checkpoints-of-store", + "yaxis": 2 + }, + { + "$$hashKey": "object:1092", + "alias": "get-checkpoints-in-range", + "yaxis": 2 + }, + { + "$$hashKey": "object:1093", + "alias": "consistency-check", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(tidb_log_backup_advancer_tick_duration_sec_count{step=\"try-advance\"}[$__interval])", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ step }} {{ instance }}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Try Advance Trigger Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:103", + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:104", + "format": "s", + "label": null, + "logBase": 1, 
+ "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Backup Log", From 21e3bd64e8de517e09836cda4991b446914f3f75 Mon Sep 17 00:00:00 2001 From: ekexium Date: Tue, 19 Jul 2022 11:05:07 +0800 Subject: [PATCH 092/676] make the smallest bucket size 10 us for many metrics (#13037) close tikv/tikv#13036 Change the smallest bucket size from 500us to 10 us for many metrics Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- .../engine_rocks/src/perf_context_metrics.rs | 4 ++-- components/external_storage/src/metrics.rs | 2 +- components/raftstore/src/store/metrics.rs | 16 ++++++++-------- components/raftstore/src/store/worker/metrics.rs | 4 ++-- src/coprocessor/metrics.rs | 8 ++++---- src/server/lock_manager/metrics.rs | 2 +- src/server/metrics.rs | 6 +++--- src/storage/metrics.rs | 10 +++++----- src/storage/txn/scheduler.rs | 6 +++--- 9 files changed, 29 insertions(+), 29 deletions(-) diff --git a/components/engine_rocks/src/perf_context_metrics.rs b/components/engine_rocks/src/perf_context_metrics.rs index cca9f551bc1..d384fc96dc9 100644 --- a/components/engine_rocks/src/perf_context_metrics.rs +++ b/components/engine_rocks/src/perf_context_metrics.rs @@ -26,14 +26,14 @@ lazy_static! 
{ "tikv_raftstore_apply_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = register_histogram_vec!( "tikv_raftstore_store_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref STORAGE_ROCKSDB_PERF_COUNTER: IntCounterVec = register_int_counter_vec!( diff --git a/components/external_storage/src/metrics.rs b/components/external_storage/src/metrics.rs index 1cb0c37cfa8..99dabca158e 100644 --- a/components/external_storage/src/metrics.rs +++ b/components/external_storage/src/metrics.rs @@ -8,7 +8,7 @@ lazy_static! { "tikv_external_storage_create_seconds", "Bucketed histogram of creating external storage duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 69d84f45056..a983feb7909 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -429,21 +429,21 @@ lazy_static! 
{ register_histogram!( "tikv_raftstore_commit_log_duration_seconds", "Bucketed histogram of peer commits logs duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_APPLY_LOG_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_apply_log_duration_seconds", "Bucketed histogram of peer applying log duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref APPLY_TASK_WAIT_TIME_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_apply_wait_time_duration_secs", "Bucketed histogram of apply task wait time duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_RAFT_READY_COUNTER_VEC: IntCounterVec = @@ -494,7 +494,7 @@ lazy_static! { "tikv_raftstore_raft_process_duration_secs", "Bucketed histogram of peer processing raft duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref PEER_PROPOSE_LOG_SIZE_HISTOGRAM: Histogram = @@ -525,7 +525,7 @@ lazy_static! { register_histogram!( "tikv_raftstore_request_wait_time_duration_secs", "Bucketed histogram of request wait time duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref PEER_GC_RAFT_LOG_COUNTER: IntCounter = @@ -655,7 +655,7 @@ lazy_static! { "tikv_raftstore_apply_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = @@ -663,7 +663,7 @@ lazy_static! 
{ "tikv_raftstore_store_perf_context_time_duration_secs", "Bucketed histogram of request wait time duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration= @@ -755,7 +755,7 @@ lazy_static! { "tikv_raftstore_inspect_duration_seconds", "Bucketed histogram of inspect duration.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref STORE_SLOW_SCORE_GAUGE: Gauge = diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 75ffc17c72b..e119fcdc3ab 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -72,7 +72,7 @@ lazy_static! { "tikv_raftstore_snapshot_duration_seconds", "Bucketed histogram of raftstore snapshot process duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SNAP_HISTOGRAM: SnapHistogram = @@ -80,7 +80,7 @@ lazy_static! { pub static ref CHECK_SPILT_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_check_split_duration_seconds", "Bucketed histogram of raftstore split check duration", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COMPACT_RANGE_CF: HistogramVec = register_histogram_vec!( diff --git a/src/coprocessor/metrics.rs b/src/coprocessor/metrics.rs index f95ff6ee4db..d757ec49d62 100644 --- a/src/coprocessor/metrics.rs +++ b/src/coprocessor/metrics.rs @@ -96,7 +96,7 @@ lazy_static! 
{ "tikv_coprocessor_request_duration_seconds", "Bucketed histogram of coprocessor request duration", &["req"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COPR_REQ_HISTOGRAM_STATIC: CoprReqHistogram = @@ -105,7 +105,7 @@ lazy_static! { "tikv_coprocessor_request_handle_seconds", "Bucketed histogram of coprocessor handle request duration", &["req"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COPR_REQ_HANDLE_TIME_STATIC: CoprReqHistogram = @@ -114,7 +114,7 @@ lazy_static! { "tikv_coprocessor_request_wait_seconds", "Bucketed histogram of coprocessor request wait duration", &["req", "type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COPR_REQ_WAIT_TIME_STATIC: ReqWaitHistogram = @@ -123,7 +123,7 @@ lazy_static! { "tikv_coprocessor_request_handler_build_seconds", "Bucketed histogram of coprocessor request handler build duration", &["req"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref COPR_REQ_HANDLER_BUILD_TIME_STATIC: CoprReqHistogram = diff --git a/src/server/lock_manager/metrics.rs b/src/server/lock_manager/metrics.rs index 10fac63b1b7..f400652966b 100644 --- a/src/server/lock_manager/metrics.rs +++ b/src/server/lock_manager/metrics.rs @@ -51,7 +51,7 @@ lazy_static! 
{ pub static ref WAITER_LIFETIME_HISTOGRAM: Histogram = register_histogram!( "tikv_lock_manager_waiter_lifetime_duration", "Duration of waiters' lifetime in seconds", - exponential_buckets(0.0005, 2.0, 20).unwrap() // 0.5ms ~ 524s + exponential_buckets(0.00001, 2.0, 26).unwrap() // 0.5ms ~ 524s ) .unwrap(); pub static ref DETECT_DURATION_HISTOGRAM: Histogram = register_histogram!( diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 9cd8631b275..0d24c9f798b 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -321,7 +321,7 @@ lazy_static! { "tikv_gcworker_gc_task_duration_vec", "Duration of gc tasks execution", &["task"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref GC_TOO_BUSY_COUNTER: IntCounter = register_int_counter!( @@ -360,7 +360,7 @@ lazy_static! { pub static ref TTL_CHECKER_COMPACT_DURATION_HISTOGRAM: Histogram = register_histogram!( "tikv_ttl_checker_compact_duration", "Duration of ttl checker compact files execution", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref TTL_CHECKER_POLL_INTERVAL_GAUGE: IntGauge = register_int_gauge!( @@ -492,7 +492,7 @@ lazy_static! { "tikv_storage_engine_async_request_duration_seconds", "Bucketed histogram of processing successful asynchronous requests.", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 95f5809ec9e..07f1143bcb0 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -461,13 +461,13 @@ lazy_static! 
{ register_histogram!( "tikv_scheduler_throttle_duration_seconds", "Bucketed histogram of peer commits logs duration.", - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ).unwrap(); pub static ref SCHED_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( "tikv_scheduler_command_duration_seconds", "Bucketed histogram of command execution", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SCHED_HISTOGRAM_VEC_STATIC: SchedDurationVec = @@ -476,7 +476,7 @@ lazy_static! { "tikv_scheduler_latch_wait_duration_seconds", "Bucketed histogram of latch wait", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SCHED_LATCH_HISTOGRAM_VEC: SchedLatchDurationVec = @@ -485,7 +485,7 @@ lazy_static! { "tikv_scheduler_processing_read_duration_seconds", "Bucketed histogram of processing read duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SCHED_PROCESSING_READ_HISTOGRAM_STATIC: ProcessingReadVec = @@ -494,7 +494,7 @@ lazy_static! 
{ "tikv_scheduler_processing_write_duration_seconds", "Bucketed histogram of processing write duration", &["type"], - exponential_buckets(0.0005, 2.0, 20).unwrap() + exponential_buckets(0.00001, 2.0, 26).unwrap() ) .unwrap(); pub static ref SCHED_TOO_BUSY_COUNTER: IntCounterVec = register_int_counter_vec!( diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index a9b34b9b189..e78dbdaa49d 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -155,10 +155,10 @@ impl TaskContext { owned: AtomicBool::new(false), write_bytes, tag, - latch_timer: Instant::now_coarse(), + latch_timer: Instant::now(), _cmd_timer: CmdTimer { tag, - begin: Instant::now_coarse(), + begin: Instant::now(), }, } } @@ -701,7 +701,7 @@ impl Scheduler { fail_point!("scheduler_async_snapshot_finish"); SCHED_STAGE_COUNTER_VEC.get(tag).process.inc(); - let timer = Instant::now_coarse(); + let timer = Instant::now(); let region_id = task.cmd.ctx().get_region_id(); let ts = task.cmd.ts(); From 0dff1be50281c72c55c5464751cd733032115ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 19 Jul 2022 12:23:07 +0800 Subject: [PATCH 093/676] log-backup: don't run the cond function if the entry state is removal (#13043) close tikv/tikv#13044 `SubscriptionManager::deregister_region_if` won't call the `cond` argument if the target has been removed by setting the state to `Removal`. 
This would fix some patterns like: let mut exists = false; subs.deregister_region_if(42,|_, _| exists = true); do_with(exists); Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- .../backup-stream/src/subscription_manager.rs | 9 ++++ .../backup-stream/src/subscription_track.rs | 23 ++++++--- components/backup-stream/tests/mod.rs | 48 +++++++++++++++++++ 3 files changed, 74 insertions(+), 6 deletions(-) diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 28c1ed6dd78..0b415f95bf6 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -598,6 +598,15 @@ where } async fn get_last_checkpoint_of(&self, task: &str, region: &Region) -> Result { + #[cfg(feature = "failpoints")] + fail::fail_point!("get_last_checkpoint_of", |hint| Err(Error::Other( + box_err!( + "get_last_checkpoint_of({}, {:?}) failed because {:?}", + task, + region, + hint + ) + ))); let meta_cli = self.meta_cli.clone(); let cp = meta_cli.get_region_checkpoint(task, region).await?; debug!("got region checkpoint"; "region_id" => %region.get_id(), "checkpoint" => ?cp); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index e8a22f9840e..30063089804 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -164,15 +164,26 @@ impl SubscriptionTracer { let region_id = region.get_id(); let remove_result = self.0.get_mut(®ion_id); match remove_result { - Some(mut o) if if_cond(o.value(), region) => { - if o.state != SubscriptionState::Removal { + Some(mut o) => { + // If the state is 'removal', we should act as if the region subscription + // has been removed: the callback should not be called because somebody may + // use this method to check whether a key exists: + // ``` + // let mut present = false; + // 
deregister_region_if(42, |..| { present = true; }); + // ``` + // At that time, if we call the callback with stale value, the called may get false positive. + if o.state == SubscriptionState::Removal { + return false; + } + if if_cond(o.value(), region) { TRACK_REGION.dec(); + o.value_mut().stop(); + info!("stop listen stream from store"; "observer" => ?o.value(), "region_id"=> %region_id); + return true; } - o.value_mut().stop(); - info!("stop listen stream from store"; "observer" => ?o.value(), "region_id"=> %region_id); - true + false } - Some(_) => false, None => { warn!("trying to deregister region not registered"; "region_id" => %region_id); false diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 9ba59a181b2..b9559d86c1f 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -870,4 +870,52 @@ mod test { keys.union(&keys2).map(|s| s.as_slice()), ); } + + #[test] + fn failed_during_refresh_region() { + defer! 
{ + fail::remove("get_last_checkpoint_of") + } + + let mut suite = SuiteBuilder::new_named("fail_to_refresh_region") + .nodes(1) + .use_v3() + .build(); + + suite.must_register_task(1, "fail_to_refresh_region"); + let keys = run_async_test(suite.write_records(0, 128, 1)); + fail::cfg( + "get_last_checkpoint_of", + "1*return(the stream handler wants to become a batch processor, and the batch processor wants to be a stream handler.)", + ).unwrap(); + + suite.must_split(b"SOLE"); + let keys2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("fail_to_refresh_region"); + suite.wait_for_flush(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys.union(&keys2).map(|s| s.as_slice()), + ); + let leader = suite.cluster.leader_of_region(1).unwrap().store_id; + let (tx, rx) = std::sync::mpsc::channel(); + suite.endpoints[&leader] + .scheduler() + .schedule(Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| { + let _ = tx.send(rs); + }), + ))) + .unwrap(); + + let regions = rx.recv_timeout(Duration::from_secs(10)).unwrap(); + assert!( + regions.iter().all(|item| { + matches!(item, GetCheckpointResult::Ok { checkpoint, .. } if checkpoint.into_inner() > 500) + }), + "{:?}", + regions + ); + } } From 82e8f865cbdaba5e08fc0fedcefec0b7ea877b70 Mon Sep 17 00:00:00 2001 From: MoCuishle28 <32541204+MoCuishle28@users.noreply.github.com> Date: Wed, 20 Jul 2022 13:03:08 +0800 Subject: [PATCH 094/676] br: Adjust the backup organization structure (#12958) close tikv/tikv#13063 Adjust the backup organization structure and add a store_id related prefix under the backup path. 
Signed-off-by: Gaoming Signed-off-by: MoCuishle28 <32541204+MoCuishle28@users.noreply.github.com> Signed-off-by: zhanggaoming Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/backup/Cargo.toml | 1 + components/backup/src/endpoint.rs | 104 ++++++++++++++++++----- components/backup/src/writer.rs | 4 +- components/cloud/aws/src/lib.rs | 2 +- components/cloud/aws/src/s3.rs | 2 +- components/external_storage/src/lib.rs | 2 +- components/external_storage/src/local.rs | 2 +- tests/integrations/backup/mod.rs | 6 +- 9 files changed, 96 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ddd904e637..0dd646d56e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -395,6 +395,7 @@ version = "0.0.1" dependencies = [ "api_version", "async-channel", + "aws", "causal_ts", "collections", "concurrency_manager", diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index 85131c8e68f..a59f8949b77 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -35,6 +35,7 @@ failpoints = ["tikv/failpoints"] [dependencies] api_version = { path = "../api_version", default-features = false } async-channel = "1.4" +aws = { path = "../cloud/aws" } causal_ts = { path = "../causal_ts" } collections = { path = "../collections" } concurrency_manager = { path = "../concurrency_manager", default-features = false } diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 0734af017d2..bbcf33d7899 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -294,6 +294,7 @@ impl BackupRange { backup_ts: TimeStamp, begin_ts: TimeStamp, saver: async_channel::Sender, + storage_name: &str, ) -> Result { assert!(!self.codec.is_raw_kv); @@ -363,7 +364,7 @@ impl BackupRange { .start_key .clone() .map_or_else(Vec::new, |k| k.into_raw().unwrap()); - let mut writer = writer_builder.build(next_file_start_key.clone())?; + let mut writer = writer_builder.build(next_file_start_key.clone(), 
storage_name)?; loop { if let Err(e) = scanner.scan_entries(&mut batch) { error!(?e; "backup scan entries failed"); @@ -397,7 +398,7 @@ impl BackupRange { send_to_worker_with_metrics(&saver, msg).await?; next_file_start_key = this_end_key; writer = writer_builder - .build(next_file_start_key.clone()) + .build(next_file_start_key.clone(), storage_name) .map_err(|e| { error_unknown!(?e; "backup writer failed"); e @@ -892,7 +893,7 @@ impl Endpoint { let input = brange.codec.decode_backup_key(Some(k)).unwrap_or_default(); file_system::sha256(&input).ok().map(hex::encode) }); - let name = backup_file_name(store_id, &brange.region, key); + let name = backup_file_name(store_id, &brange.region, key, _backend.name()); let ct = to_sst_compression_type(request.compression_type); let stat = if is_raw_kv { @@ -928,6 +929,7 @@ impl Endpoint { backup_ts, start_ts, saver_tx.clone(), + _backend.name(), ) .await }; @@ -1090,26 +1092,58 @@ fn get_max_start_key(start_key: Option<&Key>, region: &Region) -> Option { /// A name consists with five parts: store id, region_id, a epoch version, the hash of range start key and timestamp. /// range start key is used to keep the unique file name for file, to handle different tables exists on the same region. /// local unix timestamp is used to keep the unique file name for file, to handle receive the same request after connection reset. 
-pub fn backup_file_name(store_id: u64, region: &Region, key: Option) -> String { +pub fn backup_file_name( + store_id: u64, + region: &Region, + key: Option, + storage_name: &str, +) -> String { let start = SystemTime::now(); let since_the_epoch = start .duration_since(UNIX_EPOCH) .expect("Time went backwards"); - match key { - Some(k) => format!( - "{}_{}_{}_{}_{}", - store_id, - region.get_id(), - region.get_region_epoch().get_version(), - k, - since_the_epoch.as_millis() - ), - None => format!( - "{}_{}_{}", - store_id, - region.get_id(), - region.get_region_epoch().get_version() - ), + + match (key, storage_name) { + // See https://github.com/pingcap/tidb/issues/30087 + // To avoid 503 Slow Down error, if the backup storage is s3, + // organize the backup files by store_id (use slash (/) as delimiter). + (Some(k), aws::STORAGE_NAME | external_storage::local::STORAGE_NAME) => { + format!( + "{}/{}_{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version(), + k, + since_the_epoch.as_millis() + ) + } + (Some(k), _) => { + format!( + "{}_{}_{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version(), + k, + since_the_epoch.as_millis() + ) + } + + (None, aws::STORAGE_NAME | external_storage::local::STORAGE_NAME) => { + format!( + "{}/{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version() + ) + } + (None, _) => { + format!( + "{}_{}_{}", + store_id, + region.get_id(), + region.get_region_epoch().get_version() + ) + } } } @@ -1974,4 +2008,36 @@ pub mod tests { drop(pool); std::thread::sleep(Duration::from_millis(150)); } + + #[test] + fn test_backup_file_name() { + let region = metapb::Region::default(); + let store_id = 1; + let test_cases = vec!["s3", "local", "gcs", "azure", "hdfs"]; + let test_target = vec![ + "1/0_0_000", + "1/0_0_000", + "1_0_0_000", + "1_0_0_000", + "1_0_0_000", + ]; + + let delimiter = "_"; + for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { + 
let key = Some(String::from("000")); + let filename = backup_file_name(store_id, ®ion, key, storage_name); + + let mut prefix_arr: Vec<&str> = filename.split(delimiter).collect(); + prefix_arr.remove(prefix_arr.len() - 1); + + assert_eq!(target.to_string(), prefix_arr.join(delimiter)); + } + + let test_target = vec!["1/0_0", "1/0_0", "1_0_0", "1_0_0", "1_0_0"]; + for (storage_name, target) in test_cases.iter().zip(test_target.iter()) { + let key = None; + let filename = backup_file_name(store_id, ®ion, key, storage_name); + assert_eq!(target.to_string(), filename); + } + } } diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 8408fb7c002..4c4c6dc5ec7 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -198,10 +198,10 @@ impl BackupWriterBuilder { } } - pub fn build(&self, start_key: Vec) -> Result { + pub fn build(&self, start_key: Vec, storage_name: &str) -> Result { let key = file_system::sha256(&start_key).ok().map(hex::encode); let store_id = self.store_id; - let name = backup_file_name(store_id, &self.region, key); + let name = backup_file_name(store_id, &self.region, key, storage_name); BackupWriter::new( self.db.clone(), &name, diff --git a/components/cloud/aws/src/lib.rs b/components/cloud/aws/src/lib.rs index 345302d0534..b6af7d64b48 100644 --- a/components/cloud/aws/src/lib.rs +++ b/components/cloud/aws/src/lib.rs @@ -5,6 +5,6 @@ mod kms; pub use kms::{AwsKms, ENCRYPTION_VENDOR_NAME_AWS_KMS}; mod s3; -pub use s3::{Config, S3Storage, STORAGE_VENDOR_NAME_AWS}; +pub use s3::{Config, S3Storage, STORAGE_NAME, STORAGE_VENDOR_NAME_AWS}; mod util; diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index b5cacb2266e..e2e9919860b 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -515,7 +515,7 @@ impl<'client> S3Uploader<'client> { } } -const STORAGE_NAME: &str = "s3"; +pub const STORAGE_NAME: &str = "s3"; #[async_trait] impl BlobStorage for 
S3Storage { diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 0bad03cbcca..f1d1a617dc8 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -30,7 +30,7 @@ use tokio::time::timeout; mod hdfs; pub use hdfs::{HdfsConfig, HdfsStorage}; -mod local; +pub mod local; pub use local::LocalStorage; mod noop; pub use noop::NoopStorage; diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index 5fd899b17f9..3e307dca157 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -54,7 +54,7 @@ fn url_for(base: &Path) -> url::Url { u } -const STORAGE_NAME: &str = "local"; +pub const STORAGE_NAME: &str = "local"; #[async_trait] impl ExternalStorage for LocalStorage { diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index ccadcca674f..2990a983974 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -21,11 +21,11 @@ fn assert_same_file_name(s1: String, s2: String) { let tokens1: Vec<&str> = s1.split('_').collect(); let tokens2: Vec<&str> = s2.split('_').collect(); assert_eq!(tokens1.len(), tokens2.len()); - // 2_1_1_e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855_1609407693105_write.sst - // 2_1_1_e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855_1609407693199_write.sst + // 2/1_1_e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855_1609407693105_write.sst + // 2/1_1_e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855_1609407693199_write.sst // should be equal for i in 0..tokens1.len() { - if i != 4 { + if i != 3 { assert_eq!(tokens1[i], tokens2[i]); } } From 856caa1b30c152d82aa0d923fbc0a8253df1ffb4 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 19 Jul 2022 23:29:08 -0700 Subject: [PATCH 095/676] tikv: refactor StorageConfigManger to support multi-rocksdb (#12962) close 
tikv/tikv#12961 refactor StorageConfigManger to support multi-rocksdb Signed-off-by: qi.xu Co-authored-by: qi.xu --- components/engine_test/src/lib.rs | 17 ++++++-- components/engine_traits/src/engine.rs | 35 +++++++--------- components/server/src/server.rs | 9 ++-- src/config.rs | 5 ++- src/server/engine_factory.rs | 12 ++++-- src/server/engine_factory_v2.rs | 40 +++++++++++++++--- src/storage/config_manager.rs | 41 ++++++++----------- .../singleton_flow_controller.rs | 14 ++++++- tests/failpoints/cases/test_storage.rs | 3 +- 9 files changed, 112 insertions(+), 64 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index b670ef34500..82373ac8568 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -90,7 +90,9 @@ pub mod kv { RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; - use engine_traits::{Result, TabletAccessor, TabletFactory}; + use engine_traits::{ + CFOptionsExt, ColumnFamilyOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + }; use tikv_util::box_err; use crate::ctor::{CFOptions, DBOptions, KvEngineConstructorExt}; @@ -279,8 +281,17 @@ pub mod kv { new_engine } - fn clone(&self) -> Box + Send> { - Box::new(std::clone::Clone::clone(self)) + fn set_shared_block_cache_capacity( + &self, + capacity: u64, + ) -> std::result::Result<(), String> { + let reg = self.registry.lock().unwrap(); + // pick up any tablet and set the shared block cache capacity + if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity)?; + } + Ok(()) } } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index de99f924038..9b560bcd65b 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -204,10 
+204,6 @@ pub trait TabletFactory: TabletAccessor { /// Tablets root path fn tablets_path(&self) -> PathBuf; - /// Clone the tablet factory instance - /// Here we don't use Clone traint because it will break the trait's object safty - fn clone(&self) -> Box + Send>; - /// Load the tablet from path for id and suffix--for scenarios such as applying snapshot fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { unimplemented!(); @@ -222,11 +218,13 @@ pub trait TabletFactory: TabletAccessor { fn is_tombstoned(&self, _region_id: u64, _suffix: u64) -> bool { unimplemented!(); } + + fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String>; } pub struct DummyFactory where - EK: Clone + Send + 'static, + EK: CFOptionsExt + Clone + Send + 'static, { pub engine: Option, pub root_path: String, @@ -234,7 +232,7 @@ where impl TabletFactory for DummyFactory where - EK: Clone + Send + 'static, + EK: CFOptionsExt + Clone + Send + 'static, { fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { Ok(self.engine.as_ref().unwrap().clone()) @@ -258,22 +256,19 @@ where PathBuf::from(&self.root_path) } - fn clone(&self) -> Box + Send> { - if self.engine.is_none() { - return Box::>::new(DummyFactory { - engine: None, - root_path: self.root_path.clone(), - }); - } - Box::>::new(DummyFactory { - engine: Some(self.engine.as_ref().unwrap().clone()), - root_path: self.root_path.clone(), - }) + fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + let opt = self + .engine + .as_ref() + .unwrap() + .get_options_cf(CF_DEFAULT) + .unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity) } } impl TabletAccessor for DummyFactory where - EK: Clone + Send + 'static, + EK: CFOptionsExt + Clone + Send + 'static, { fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &EK)) { if let Some(engine) = &self.engine { @@ -288,14 +283,14 @@ where impl DummyFactory where - EK: Clone + Send + 
'static, + EK: CFOptionsExt + Clone + Send + 'static, { pub fn new(engine: Option, root_path: String) -> DummyFactory { DummyFactory { engine, root_path } } } -impl Default for DummyFactory { +impl Default for DummyFactory { fn default() -> Self { Self::new(None, "/tmp".to_string()) } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index c0ed12bf73c..7911447368e 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -232,6 +232,7 @@ struct TiKvServer { sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>>, // used for rawkv apiv2 + tablet_factory: Option + Send + Sync>>, } struct TiKvEngines { @@ -352,6 +353,7 @@ impl TiKvServer { sst_worker: None, quota_limiter, causal_ts_provider, + tablet_factory: None, } } @@ -735,7 +737,7 @@ impl TiKvServer { cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( - self.engines.as_ref().unwrap().engine.kv_engine(), + self.tablet_factory.as_ref().unwrap().clone(), self.config.storage.block_cache.shared, ttl_scheduler, flow_controller, @@ -1649,7 +1651,7 @@ impl TiKvServer { if let Some(cache) = block_cache { builder = builder.block_cache(cache); } - let factory = builder.build(); + let factory = Arc::new(builder.build()); let kv_engine = factory .create_shared_db() .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); @@ -1659,11 +1661,12 @@ impl TiKvServer { cfg_controller.register( tikv::config::Module::Rocksdb, Box::new(DBConfigManger::new( - Arc::new(factory), + factory.clone(), DBType::Kv, self.config.storage.block_cache.shared, )), ); + self.tablet_factory = Some(factory); engines .raft .register_config(cfg_controller, self.config.storage.block_cache.shared); diff --git a/src/config.rs b/src/config.rs index 9b06da58926..98aabb20369 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3990,7 +3990,8 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; use engine_traits::{ - 
ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as DBOptionsTrait, ALL_CFS, + ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as DBOptionsTrait, DummyFactory, + ALL_CFS, }; use futures::executor::block_on; use grpcio::ResourceQuota; @@ -4446,7 +4447,7 @@ mod tests { cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( - engine, + Arc::new(DummyFactory::new(Some(engine), "".to_string())), shared, scheduler, flow_controller.clone(), diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 04e1f72f05a..0de26bc43c4 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -11,7 +11,8 @@ use engine_rocks::{ RocksEventListener, }; use engine_traits::{ - CompactionJobInfo, Result, TabletAccessor, TabletFactory, CF_DEFAULT, CF_WRITE, + CFOptionsExt, ColumnFamilyOptions, CompactionJobInfo, Result, TabletAccessor, TabletFactory, + CF_DEFAULT, CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; @@ -245,8 +246,13 @@ impl TabletFactory for KvEngineFactory { fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { Ok(()) } - fn clone(&self) -> Box + Send> { - Box::new(std::clone::Clone::clone(self)) + + fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + if let Ok(db) = self.inner.root_db.lock() { + let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity)?; + } + Ok(()) } } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index d1cc29bc88f..2dfe297e5d8 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -7,7 +7,9 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; -use engine_traits::{Result, TabletAccessor, TabletFactory}; +use engine_traits::{ + CFOptionsExt, ColumnFamilyOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, +}; use 
crate::server::engine_factory::KvEngineFactory; @@ -158,8 +160,14 @@ impl TabletFactory for KvEngineFactoryV2 { new_engine } - fn clone(&self) -> Box + Send> { - Box::new(std::clone::Clone::clone(self)) + fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + let reg = self.registry.lock().unwrap(); + // pick up any tablet and set the shared block cache capacity + if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity)?; + } + Ok(()) } } @@ -180,7 +188,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[cfg(test)] mod tests { - use engine_traits::TabletFactory; + use engine_traits::{TabletFactory, CF_WRITE}; use super::*; use crate::{config::TiKvConfig, server::KvEngineFactoryBuilder}; @@ -212,10 +220,15 @@ mod tests { #[test] fn test_kvengine_factory() { let cfg = TEST_CONFIG.clone(); + assert!(cfg.storage.block_cache.shared); + let cache = cfg.storage.block_cache.build_shared_cache(); let dir = test_util::temp_dir("test_kvengine_factory", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + if let Some(cache) = cache { + builder = builder.block_cache(cache); + } let factory = builder.build(); let shared_db = factory.create_shared_db().unwrap(); let tablet = TabletFactory::create_tablet(&factory, 1, 10); @@ -240,15 +253,25 @@ mod tests { assert_eq!(count, 1); assert!(factory.is_single_engine()); assert!(shared_db.is_single_engine()); + factory + .set_shared_block_cache_capacity(1024 * 1024) + .unwrap(); + let opt = shared_db.get_options_cf(CF_DEFAULT).unwrap(); + assert_eq!(opt.get_block_cache_capacity(), 1024 * 1024); } #[test] fn test_kvengine_factory_v2() { let cfg = TEST_CONFIG.clone(); + assert!(cfg.storage.block_cache.shared); + let cache = 
cfg.storage.block_cache.build_shared_cache(); let dir = test_util::temp_dir("test_kvengine_factory_v2", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + if let Some(cache) = cache { + builder = builder.block_cache(cache); + } let inner_factory = builder.build(); let factory = KvEngineFactoryV2::new(inner_factory); let tablet = factory.create_tablet(1, 10); @@ -263,6 +286,11 @@ mod tests { let tablet_path = factory.tablet_path(1, 10); let result = factory.open_tablet_raw(&tablet_path, false); assert!(result.is_err()); + factory + .set_shared_block_cache_capacity(1024 * 1024) + .unwrap(); + let opt = tablet.get_options_cf(CF_WRITE).unwrap(); + assert_eq!(opt.get_block_cache_capacity(), 1024 * 1024); assert!(factory.exists(1, 10)); assert!(!factory.exists(1, 11)); diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index 217ebbb25c8..d3d051ac5f9 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -4,7 +4,7 @@ use std::{convert::TryInto, sync::Arc}; -use engine_traits::{CFNamesExt, CFOptionsExt, ColumnFamilyOptions, CF_DEFAULT}; +use engine_traits::{CFNamesExt, CFOptionsExt, TabletFactory, CF_DEFAULT}; use file_system::{get_io_rate_limiter, IOPriority, IOType}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use strum::IntoEnumIterator; @@ -20,7 +20,7 @@ use crate::{ }; pub struct StorageConfigManger { - kvdb: ::Local, + tablet_factory: Arc + Send + Sync>, shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, @@ -32,14 +32,14 @@ unsafe impl Sync for StorageConfigManger {} impl StorageConfigManger { pub fn new( - kvdb: ::Local, + tablet_factory: Arc + Send + Sync>, shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, ) -> Self { 
StorageConfigManger { - kvdb, + tablet_factory, shared_block_cache, ttl_checker_scheduler, flow_controller, @@ -57,12 +57,7 @@ impl ConfigManager for StorageConfigManger { if let Some(size) = block_cache.remove("capacity") { if size != ConfigValue::None { let s: ReadableSize = size.into(); - // Hack: since all CFs in both kvdb and raftdb share a block cache, we can change - // the size through any of them. Here we change it through default CF in kvdb. - // A better way to do it is to hold the cache reference somewhere, and use it to - // change cache size. - let opt = self.kvdb.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(s.0)?; + self.tablet_factory.set_shared_block_cache_capacity(s.0)?; // Write config to metric CONFIG_ROCKSDB_GAUGE .with_label_values(&[CF_DEFAULT, "block_cache_size"]) @@ -77,21 +72,17 @@ impl ConfigManager for StorageConfigManger { } else if let Some(ConfigValue::Module(mut flow_control)) = change.remove("flow_control") { if let Some(v) = flow_control.remove("enable") { let enable: bool = v.into(); - if enable { - for cf in self.kvdb.cf_names() { - self.kvdb - .set_options_cf(cf, &[("disable_write_stall", "true")]) - .unwrap(); - } - self.flow_controller.enable(true); - } else { - for cf in self.kvdb.cf_names() { - self.kvdb - .set_options_cf(cf, &[("disable_write_stall", "false")]) - .unwrap(); - } - self.flow_controller.enable(false); - } + let enable_str = if enable { "true" } else { "false" }; + self.tablet_factory.for_each_opened_tablet( + &mut |_region_id, _suffix, tablet: &EK::Local| { + for cf in tablet.cf_names() { + tablet + .set_options_cf(cf, &[("disable_write_stall", enable_str)]) + .unwrap(); + } + }, + ); + self.flow_controller.enable(enable); } } else if let Some(v) = change.get("scheduler_worker_pool_size") { let pool_size: usize = v.into(); diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 
76671412abc..edcac95aa00 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -989,7 +989,8 @@ impl FlowChecker { pub(super) mod tests { use std::sync::atomic::AtomicU64; - use engine_traits::Result; + use engine_rocks::RocksColumnFamilyOptions; + use engine_traits::{CFOptionsExt, Result}; use super::{super::FlowController, *}; @@ -1018,6 +1019,17 @@ pub(super) mod tests { } } + impl CFOptionsExt for EngineStub { + type ColumnFamilyOptions = RocksColumnFamilyOptions; + fn get_options_cf(&self, _cf: &str) -> Result { + unimplemented!(); + } + + fn set_options_cf(&self, _cf: &str, _options: &[(&str, &str)]) -> Result<()> { + unimplemented!(); + } + } + impl FlowControlFactorsExt for EngineStub { fn get_cf_num_files_at_level(&self, _cf: &str, _level: usize) -> Result> { Ok(Some(self.0.num_l0_files.load(Ordering::Relaxed))) diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 7d0bb8c0b74..c6872d22dab 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -12,6 +12,7 @@ use std::{ use api_version::KvFormat; use collections::HashMap; +use engine_traits::DummyFactory; use errors::{extract_key_error, extract_region_error}; use futures::executor::block_on; use grpcio::*; @@ -265,7 +266,7 @@ fn test_scale_scheduler_pool() { cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( - kv_engine, + Arc::new(DummyFactory::new(Some(kv_engine), "".to_string())), cfg.storage.block_cache.shared, scheduler, flow_controller, From cfe62ba99d073893839d357c6d3770ceb3f60107 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 20 Jul 2022 14:49:08 +0800 Subject: [PATCH 096/676] log: support dynamically change log level via sql (#13019) ref tikv/tikv#4935, ref tikv/tikv#12986 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- cmd/tikv-ctl/src/main.rs | 4 +- cmd/tikv-ctl/src/util.rs | 2 +- 
components/server/src/server.rs | 4 +- components/server/src/setup.rs | 12 +- components/tikv_util/src/logger/mod.rs | 8 +- src/config.rs | 177 ++++++++++++++++++------- src/server/status_server/mod.rs | 11 +- tests/integrations/config/mod.rs | 6 +- 8 files changed, 158 insertions(+), 66 deletions(-) diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 9609fffb9a5..67834db9c5d 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -62,7 +62,9 @@ fn main() { let cfg = cfg_path.map_or_else( || { let mut cfg = TiKvConfig::default(); - cfg.log.level = tikv_util::logger::get_level_by_string("warn").unwrap(); + cfg.log.level = tikv_util::logger::get_level_by_string("warn") + .unwrap() + .into(); cfg }, |path| { diff --git a/cmd/tikv-ctl/src/util.rs b/cmd/tikv-ctl/src/util.rs index c776f16f83d..36091b5a930 100644 --- a/cmd/tikv-ctl/src/util.rs +++ b/cmd/tikv-ctl/src/util.rs @@ -10,7 +10,7 @@ const LOG_DIR: &str = "./ctl-engine-info-log"; #[allow(clippy::field_reassign_with_default)] pub fn init_ctl_logger(level: &str) { let mut cfg = TiKvConfig::default(); - cfg.log.level = slog::Level::from_str(level).unwrap(); + cfg.log.level = slog::Level::from_str(level).unwrap().into(); cfg.rocksdb.info_log_dir = LOG_DIR.to_owned(); cfg.raftdb.info_log_dir = LOG_DIR.to_owned(); initial_logger(&cfg); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 7911447368e..8eca26404d9 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -81,7 +81,7 @@ use raftstore::{ }; use security::SecurityManager; use tikv::{ - config::{ConfigController, DBConfigManger, DBType, TiKvConfig}, + config::{ConfigController, DBConfigManger, DBType, LogConfigManager, TiKvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -622,6 +622,8 @@ impl TiKvServer { ))), ); + cfg_controller.register(tikv::config::Module::Log, 
Box::new(LogConfigManager)); + // Create cdc. let mut cdc_worker = Box::new(LazyWorker::new("cdc")); let cdc_scheduler = cdc_worker.scheduler(); diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index 0c657733f54..3e37d87242c 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -150,9 +150,11 @@ pub fn initial_logger(config: &TiKvConfig) { let drainer = logger::LogDispatcher::new(normal, rocksdb, raftdb, slow); let level = config.log.level; let slow_threshold = config.slow_log_threshold.as_millis(); - logger::init_log(drainer, level, true, true, vec![], slow_threshold).unwrap_or_else(|e| { - fatal!("failed to initialize log: {}", e); - }); + logger::init_log(drainer, level.into(), true, true, vec![], slow_threshold).unwrap_or_else( + |e| { + fatal!("failed to initialize log: {}", e); + }, + ); } macro_rules! do_build { @@ -235,8 +237,8 @@ pub fn initial_metric(cfg: &MetricConfig) { #[allow(dead_code)] pub fn overwrite_config_with_cmd_args(config: &mut TiKvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { - config.log.level = logger::get_level_by_string(level).unwrap(); - config.log_level = slog::Level::Info; + config.log.level = logger::get_level_by_string(level).unwrap().into(); + config.log_level = slog::Level::Info.into(); } if let Some(file) = matches.value_of("log-file") { diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 36a5cf95baf..f4fd936cddc 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -78,7 +78,7 @@ where .overflow_strategy(SLOG_CHANNEL_OVERFLOW_STRATEGY) .thread_name(thd_name!("slogger")) .build_with_guard(); - let drain = async_log.filter_level(level).fuse(); + let drain = async_log.fuse(); let drain = SlowLogFilter { threshold: slow_threshold, inner: drain, @@ -87,7 +87,7 @@ where (slog::Logger::root(filtered, slog_o!()), Some(guard)) } else { - let 
drain = LogAndFuse(Mutex::new(drain).filter_level(level)); + let drain = LogAndFuse(Mutex::new(drain)); let drain = SlowLogFilter { threshold: slow_threshold, inner: drain, @@ -287,7 +287,9 @@ pub fn get_log_level() -> Option { } pub fn set_log_level(new_level: Level) { - LOG_LEVEL.store(new_level.as_usize(), Ordering::SeqCst) + LOG_LEVEL.store(new_level.as_usize(), Ordering::SeqCst); + // also change std log to new level. + let _ = slog_global::redirect_std_log(Some(new_level)); } pub struct TikvFormat diff --git a/src/config.rs b/src/config.rs index 98aabb20369..1f64f53dc58 100644 --- a/src/config.rs +++ b/src/config.rs @@ -8,6 +8,7 @@ use std::{ cmp, collections::{HashMap, HashSet}, + convert::TryFrom, error::Error, fs, i32, io::{Error as IoError, ErrorKind, Write}, @@ -21,7 +22,7 @@ use api_version::ApiV1Ttl; use causal_ts::Config as CausalTsConfig; use encryption_export::DataKeyManager; use engine_rocks::{ - config::{self as rocks_config, BlobRunMode, CompressionType, LogLevel}, + config::{self as rocks_config, BlobRunMode, CompressionType, LogLevel as RocksLogLevel}, get_env, properties::MvccPropertiesCollectorFactory, raw::{ @@ -53,11 +54,16 @@ use raftstore::{ }; use resource_metering::Config as ResourceMeteringConfig; use security::SecurityConfig; +use serde::{ + de::{Error as DError, Unexpected}, + Deserialize, Deserializer, Serialize, Serializer, +}; use serde_json::{to_value, Map, Value}; use tikv_util::{ config::{ self, LogFormat, RaftDataStateMachine, ReadableDuration, ReadableSize, TomlWriter, GIB, MIB, }, + logger::{get_level_by_string, get_string_by_level, set_log_level}, sys::SysQuota, time::duration_to_sec, yatp_pool, @@ -1004,7 +1010,7 @@ impl TitanDBConfig { #[serde(rename_all = "kebab-case")] pub struct DbConfig { #[online_config(skip)] - pub info_log_level: LogLevel, + pub info_log_level: RocksLogLevel, #[serde(with = "rocks_config::recovery_mode_serde")] #[online_config(skip)] pub wal_recovery_mode: DBRecoveryMode, @@ -1101,7 +1107,7 @@ impl 
Default for DbConfig { info_log_roll_time: ReadableDuration::secs(0), info_log_keep_log_file_num: 10, info_log_dir: "".to_owned(), - info_log_level: LogLevel::Info, + info_log_level: RocksLogLevel::Info, rate_bytes_per_sec: ReadableSize::gb(10), rate_limiter_refill_period: ReadableDuration::millis(100), rate_limiter_mode: DBRateLimiterMode::WriteOnly, @@ -1364,7 +1370,7 @@ pub struct RaftDbConfig { #[online_config(skip)] pub info_log_dir: String, #[online_config(skip)] - pub info_log_level: LogLevel, + pub info_log_level: RocksLogLevel, #[online_config(skip)] pub max_sub_compactions: u32, pub writable_file_max_buffer_size: ReadableSize, @@ -1409,7 +1415,7 @@ impl Default for RaftDbConfig { info_log_roll_time: ReadableDuration::secs(0), info_log_keep_log_file_num: 10, info_log_dir: "".to_owned(), - info_log_level: LogLevel::Info, + info_log_level: RocksLogLevel::Info, max_sub_compactions: bg_job_limits.max_sub_compactions as u32, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, @@ -1804,33 +1810,6 @@ impl Default for MetricConfig { } } } - -pub mod log_level_serde { - use serde::{ - de::{Error, Unexpected}, - Deserialize, Deserializer, Serialize, Serializer, - }; - use slog::Level; - use tikv_util::logger::{get_level_by_string, get_string_by_level}; - - pub fn deserialize<'de, D>(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let string = String::deserialize(deserializer)?; - get_level_by_string(&string) - .ok_or_else(|| D::Error::invalid_value(Unexpected::Str(&string), &"a valid log level")) - } - - #[allow(clippy::trivially_copy_pass_by_ref)] - pub fn serialize(value: &Level, serializer: S) -> Result - where - S: Serializer, - { - get_string_by_level(*value).serialize(serializer) - } -} - #[derive(Clone, Copy, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -2690,21 +2669,86 @@ impl Default for File { } } -#[derive(Clone, Serialize, 
Deserialize, PartialEq, Debug)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct LogConfig { - #[serde(with = "log_level_serde")] - pub level: slog::Level, + pub level: LogLevel, + #[online_config(skip)] pub format: LogFormat, + #[online_config(skip)] pub enable_timestamp: bool, + #[online_config(skip)] pub file: File, } +/// LogLevel is a wrapper type of `slog::Level` +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct LogLevel(slog::Level); + +impl From for slog::Level { + fn from(l: LogLevel) -> Self { + l.0 + } +} + +impl From for LogLevel { + fn from(l: slog::Level) -> Self { + Self(l) + } +} + +impl Serialize for LogLevel { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + get_string_by_level(self.0).serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for LogLevel { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let string = String::deserialize(deserializer)?; + get_level_by_string(&string) + .map(LogLevel) + .ok_or_else(|| D::Error::invalid_value(Unexpected::Str(&string), &"a valid log level")) + } +} + +impl From for ConfigValue { + fn from(l: LogLevel) -> Self { + Self::String(get_string_by_level(l.0).into()) + } +} + +impl TryFrom for LogLevel { + type Error = String; + fn try_from(value: ConfigValue) -> Result { + if let ConfigValue::String(s) = value { + get_level_by_string(&s) + .map(LogLevel) + .ok_or_else(|| format!("invalid log level: '{}'", s)) + } else { + panic!("expect ConfigValue::String, found: {:?}", value) + } + } +} + +impl TryFrom<&ConfigValue> for LogLevel { + type Error = String; + fn try_from(value: &ConfigValue) -> Result { + Self::try_from(value.clone()) + } +} + impl Default for LogConfig { fn default() -> Self { Self { - level: slog::Level::Info, + level: LogLevel(slog::Level::Info), format: LogFormat::Text, enable_timestamp: true, file: File::default(), @@ -2721,6 
+2765,19 @@ impl LogConfig { } } +pub struct LogConfigManager; + +impl ConfigManager for LogConfigManager { + fn dispatch(&mut self, changes: ConfigChange) -> CfgResult<()> { + if let Some(v) = changes.get("level") { + let log_level = LogLevel::try_from(v)?; + set_log_level(log_level.0); + } + info!("update log config"; "config" => ?changes); + Ok(()) + } +} + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -2775,8 +2832,7 @@ pub struct TiKvConfig { // They are preserved for compatibility check. #[doc(hidden)] #[online_config(skip)] - #[serde(with = "log_level_serde")] - pub log_level: slog::Level, + pub log_level: LogLevel, #[doc(hidden)] #[online_config(skip)] pub log_file: String, @@ -2814,7 +2870,7 @@ pub struct TiKvConfig { #[online_config(skip)] pub memory_usage_high_water: f64, - #[online_config(skip)] + #[online_config(submodule)] pub log: LogConfig, #[online_config(submodule)] @@ -2895,7 +2951,7 @@ impl Default for TiKvConfig { fn default() -> TiKvConfig { TiKvConfig { cfg_path: "".to_owned(), - log_level: slog::Level::Info, + log_level: slog::Level::Info.into(), log_file: "".to_owned(), log_format: LogFormat::Text, log_rotation_timespan: ReadableDuration::hours(0), @@ -3838,6 +3894,7 @@ pub enum Module { ResourceMetering, BackupStream, Quota, + Log, Unknown(String), } @@ -3865,6 +3922,7 @@ impl From<&str> for Module { "resolved_ts" => Module::ResolvedTs, "resource_metering" => Module::ResourceMetering, "quota" => Module::Quota, + "log" => Module::Log, n => Module::Unknown(n.to_owned()), } } @@ -4003,6 +4061,7 @@ mod tests { use tikv_kv::RocksEngine as RocksDBEngine; use tikv_util::{ config::VersionTrack, + logger::get_log_level, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, sys::SysQuota, worker::{dummy_scheduler, ReceiverWrapper}, @@ -4139,7 +4198,7 @@ mod tests { assert_eq!(last_cfg_metadata.modified().unwrap(), first_modified); // write to file when config is 
the inequivalent of last one. - cfg.log_level = slog::Level::Warning; + cfg.log_level = slog::Level::Warning.into(); assert!(persist_config(&cfg).is_ok()); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_ne!(last_cfg_metadata.modified().unwrap(), first_modified); @@ -4255,8 +4314,7 @@ mod tests { fn test_parse_log_level() { #[derive(Serialize, Deserialize, Debug)] struct LevelHolder { - #[serde(with = "log_level_serde")] - v: Level, + v: LogLevel, } let legal_cases = vec![ @@ -4268,19 +4326,21 @@ mod tests { ("info", Level::Info), ]; for (serialized, deserialized) in legal_cases { - let holder = LevelHolder { v: deserialized }; + let holder = LevelHolder { + v: deserialized.into(), + }; let res_string = toml::to_string(&holder).unwrap(); let exp_string = format!("v = \"{}\"\n", serialized); assert_eq!(res_string, exp_string); let res_value: LevelHolder = toml::from_str(&exp_string).unwrap(); - assert_eq!(res_value.v, deserialized); + assert_eq!(res_value.v, deserialized.into()); } let compatibility_cases = vec![("warning", Level::Warning), ("critical", Level::Critical)]; for (serialized, deserialized) in compatibility_cases { let variant_string = format!("v = \"{}\"\n", serialized); let res_value: LevelHolder = toml::from_str(&variant_string).unwrap(); - assert_eq!(res_value.v, deserialized); + assert_eq!(res_value.v, deserialized.into()); } let illegal_cases = vec!["foobar", ""]; @@ -4709,6 +4769,31 @@ mod tests { ); } + #[test] + fn test_change_logconfig() { + let (cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let cfg_controller = ConfigController::new(cfg); + + cfg_controller.register(Module::Log, Box::new(LogConfigManager)); + + cfg_controller.update_config("log.level", "warn").unwrap(); + assert_eq!(get_log_level().unwrap(), Level::Warning); + assert_eq!( + cfg_controller.get_current().log.level, + LogLevel(Level::Warning) + ); + + assert!( + cfg_controller + .update_config("log.level", "invalid") + .is_err() + ); + assert_eq!( + 
cfg_controller.get_current().log.level, + LogLevel(Level::Warning) + ); + } + #[test] fn test_dispatch_titan_blob_run_mode_config() { let mut cfg = TiKvConfig::default(); diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 1bb066d1a2c..c4cb6a67fbb 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -62,7 +62,7 @@ use self::profile::{ read_file, start_one_cpu_profile, start_one_heap_profile, }; use crate::{ - config::{log_level_serde, ConfigController}, + config::{ConfigController, LogLevel}, server::Result, tikv_util::sys::thread::ThreadBuildWrapper, }; @@ -79,8 +79,7 @@ static FAIL_POINTS_REQUEST_PATH: &str = "/fail"; #[derive(Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] struct LogLevelRequest { - #[serde(with = "log_level_serde")] - pub log_level: slog::Level, + pub log_level: LogLevel, } pub struct StatusServer { @@ -403,7 +402,7 @@ where match log_level_request { Ok(req) => { - set_log_level(req.log_level); + set_log_level(req.log_level.into()); Ok(Response::new(Body::empty())) } Err(err) => Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), @@ -1464,7 +1463,7 @@ mod tests { .build() .unwrap(); - let new_log_level = slog::Level::Debug; + let new_log_level = slog::Level::Debug.into(); let mut log_level_request = Request::new(Body::from( serde_json::to_string(&LogLevelRequest { log_level: new_log_level, @@ -1484,7 +1483,7 @@ mod tests { .await .map(move |res| { assert_eq!(res.status(), StatusCode::OK); - assert_eq!(get_log_level(), Some(new_log_level)); + assert_eq!(get_log_level(), Some(new_log_level.into())); }) .unwrap() }); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 44a6ad8c989..d66ea96fb3b 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -64,7 +64,7 @@ fn read_file_in_project_dir(path: &str) -> String { fn test_serde_custom_tikv_config() { let mut value = TiKvConfig::default(); 
value.log_rotation_timespan = ReadableDuration::days(1); - value.log.level = Level::Critical; + value.log.level = Level::Critical.into(); value.log.file.filename = "foo".to_owned(); value.log.format = LogFormat::Json; value.log.file.max_size = 1; @@ -891,12 +891,12 @@ fn test_block_cache_backward_compatible() { fn test_log_backward_compatible() { let content = read_file_in_project_dir("integrations/config/test-log-compatible.toml"); let mut cfg: TiKvConfig = toml::from_str(&content).unwrap(); - assert_eq!(cfg.log.level, slog::Level::Info); + assert_eq!(cfg.log.level, slog::Level::Info.into()); assert_eq!(cfg.log.file.filename, ""); assert_eq!(cfg.log.format, LogFormat::Text); assert_eq!(cfg.log.file.max_size, 300); cfg.logger_compatible_adjust(); - assert_eq!(cfg.log.level, slog::Level::Critical); + assert_eq!(cfg.log.level, slog::Level::Critical.into()); assert_eq!(cfg.log.file.filename, "foo"); assert_eq!(cfg.log.format, LogFormat::Json); assert_eq!(cfg.log.file.max_size, 1024); From 0eec6009fb5d386437eee46063d7e461c25988d3 Mon Sep 17 00:00:00 2001 From: guo-shaoge Date: Wed, 20 Jul 2022 16:41:09 +0800 Subject: [PATCH 097/676] copr: fix get_valid_int_prefix() to be compatible with TiDB(#13045) (#13046) close tikv/tikv#13045 Signed-off-by: guo-shaoge --- .../tidb_query_datatype/src/codec/convert.rs | 153 ++++++++++++------ components/tidb_query_expr/src/impl_cast.rs | 3 +- 2 files changed, 103 insertions(+), 53 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 61ce14a0390..bcfc7bb2bbe 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -828,7 +828,17 @@ impl ConvertTo for Bytes { } pub fn get_valid_int_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result> { - if !ctx.cfg.flag.contains(Flag::IN_SELECT_STMT) { + get_valid_int_prefix_helper(ctx, s, false) +} + +// As TiDB code(getValidIntPrefix()), cast 
expr needs to give error/warning when input string +// is like float. +pub fn get_valid_int_prefix_helper<'a>( + ctx: &mut EvalContext, + s: &'a str, + is_cast_func: bool, +) -> Result> { + if !is_cast_func { let vs = get_valid_float_prefix(ctx, s)?; Ok(float_str_to_int_string(ctx, vs)) } else { @@ -855,51 +865,65 @@ pub fn get_valid_int_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result(ctx: &mut EvalContext, s: &'a str) -> Result<&'a str> { - let mut saw_dot = false; - let mut saw_digit = false; - let mut valid_len = 0; - let mut e_idx = 0; - for (i, c) in s.chars().enumerate() { - if c == '+' || c == '-' { - if i != 0 && (e_idx == 0 || i != e_idx + 1) { - // "1e+1" is valid. - break; - } - } else if c == '.' { - if saw_dot || e_idx > 0 { - // "1.1." or "1e1.1" + get_valid_float_prefix_helper(ctx, s, false) +} + +// As TiDB code(getValidFloatPrefix()), cast expr should not give error/warning when input is +// empty. +pub fn get_valid_float_prefix_helper<'a>( + ctx: &mut EvalContext, + s: &'a str, + is_cast_func: bool, +) -> Result<&'a str> { + if is_cast_func && s.is_empty() { + Ok("0") + } else { + let mut saw_dot = false; + let mut saw_digit = false; + let mut valid_len = 0; + let mut e_idx = 0; + for (i, c) in s.chars().enumerate() { + if c == '+' || c == '-' { + if i != 0 && (e_idx == 0 || i != e_idx + 1) { + // "1e+1" is valid. + break; + } + } else if c == '.' { + if saw_dot || e_idx > 0 { + // "1.1." or "1e1.1" + break; + } + saw_dot = true; + if saw_digit { + // "123." is valid. + valid_len = i + 1; + } + } else if c == 'e' || c == 'E' { + if !saw_digit { + // "+.e" + break; + } + if e_idx != 0 { + // "1e5e" + break; + } + e_idx = i + } else if !('0'..='9').contains(&c) { break; - } - saw_dot = true; - if saw_digit { - // "123." is valid. 
+ } else { + saw_digit = true; valid_len = i + 1; } - } else if c == 'e' || c == 'E' { - if !saw_digit { - // "+.e" - break; - } - if e_idx != 0 { - // "1e5e" - break; - } - e_idx = i - } else if !('0'..='9').contains(&c) { - break; + } + if valid_len == 0 || valid_len < s.len() { + ctx.handle_truncate_err(Error::truncated_wrong_val("INTEGER", s))?; + } + if valid_len == 0 { + Ok("0") } else { - saw_digit = true; - valid_len = i + 1; + Ok(&s[..valid_len]) } } - if valid_len == 0 || valid_len < s.len() { - ctx.handle_truncate_err(Error::truncated_wrong_val("INTEGER", s))?; - } - if valid_len == 0 { - Ok("0") - } else { - Ok(&s[..valid_len]) - } } /// the `s` must be a valid int_str @@ -1984,28 +2008,48 @@ mod tests { fn test_get_valid_float_prefix() { let cases = vec![ ("-100", "-100"), + ("1.", "1."), + (".1", ".1"), + ("123.23E-10", "123.23E-10"), + ]; + + let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( + Flag::TRUNCATE_AS_WARNING | Flag::OVERFLOW_AS_WARNING, + ))); + for (i, o) in cases { + assert_eq!(super::get_valid_float_prefix(&mut ctx, i).unwrap(), o); + } + assert_eq!(ctx.take_warnings().warnings.len(), 0); + + let warning_cases = vec![ ("1abc", "1"), ("-1-1", "-1"), ("+1+1", "+1"), ("123..34", "123."), - ("123.23E-10", "123.23E-10"), ("1.1e1.3", "1.1e1"), ("11e1.3", "11e1"), ("1.1e-13a", "1.1e-13"), - ("1.", "1."), - (".1", ".1"), - ("", "0"), ("123e+", "123"), ("123.e", "123."), ("1-1-", "1"), ("11-1-", "11"), ("-1-1-", "-1"), + ("", "0"), ]; - - let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); - for (i, o) in cases { + let warning_cnt = warning_cases.len(); + for (i, o) in warning_cases.clone() { assert_eq!(super::get_valid_float_prefix(&mut ctx, i).unwrap(), o); } + assert_eq!(ctx.take_warnings().warnings.len(), warning_cnt); + + // Test is cast expr. 
+ for (i, o) in warning_cases.clone() { + assert_eq!( + super::get_valid_float_prefix_helper(&mut ctx, i, true).unwrap(), + o + ); + } + assert_eq!(ctx.take_warnings().warnings.len(), warning_cnt - 1); } #[test] @@ -2093,11 +2137,8 @@ mod tests { } assert_eq!(ctx.take_warnings().warnings.len(), 0); - let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( - Flag::IN_SELECT_STMT | Flag::IGNORE_TRUNCATE | Flag::OVERFLOW_AS_WARNING, - ))); + let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); let cases = vec![ - ("+0.0", "+0"), ("100", "100"), ("+100", "+100"), ("-100", "-100"), @@ -2108,10 +2149,18 @@ mod tests { ]; for (i, e) in cases { - let o = super::get_valid_int_prefix(&mut ctx, i); + let o = super::get_valid_int_prefix_helper(&mut ctx, i, true); assert_eq!(o.unwrap(), *e, "{}, {}", i, e); } assert_eq!(ctx.take_warnings().warnings.len(), 0); + + let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); + let cases = vec![("+0.0", "+0"), ("0.5", "0"), ("+0.5", "+0")]; + for (i, e) in cases { + let o = super::get_valid_int_prefix_helper(&mut ctx, i, true); + assert_eq!(o.unwrap(), *e, "{}, {}", i, e); + } + assert_eq!(ctx.take_warnings().warnings.len(), 3); } #[test] diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index f6d6af4eb02..e283a78d245 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -344,7 +344,7 @@ fn cast_string_as_int( } else { // FIXME: if the err get_valid_int_prefix returned is overflow err, // it should be ERR_TRUNCATE_WRONG_VALUE but not others. 
- let valid_int_prefix = get_valid_int_prefix(ctx, val)?; + let valid_int_prefix = get_valid_int_prefix_helper(ctx, val, true)?; let parse_res = if !is_str_neg { valid_int_prefix.parse::().map(|x| x as i64) } else { @@ -2343,6 +2343,7 @@ mod tests { vec![ERR_TRUNCATE_WRONG_VALUE], Cond::Unsigned, ), + ("0.5", 0_i64, vec![ERR_TRUNCATE_WRONG_VALUE], Cond::None), ]; for (input, expected, mut err_code, cond) in cs { From 6c7f6ecf4a999b0c102d442061d0ce0bd8b7c969 Mon Sep 17 00:00:00 2001 From: Zwb Date: Wed, 20 Jul 2022 17:01:09 +0800 Subject: [PATCH 098/676] Fix panic when enable titan (#13051) close tikv/tikv#13038 Signed-off-by: Wenbo Zhang Co-authored-by: Xinye Tao --- Cargo.lock | 6 +++--- components/engine_rocks/src/write_batch.rs | 2 +- tests/integrations/raftstore/test_compact_after_delete.rs | 1 + tests/integrations/raftstore/test_merge.rs | 1 + tests/integrations/raftstore/test_snap.rs | 2 ++ 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0dd646d56e8..fb4e4d1e6a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2748,7 +2748,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" +source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2767,7 +2767,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" +source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" dependencies = [ "bzip2-sys", "cc", @@ -4584,7 +4584,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#2e00e78b945194e8a672e8e078b6c73956e9ace0" +source = 
"git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" dependencies = [ "libc 0.2.125", "librocksdb_sys", diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 1aa5c424521..77b8e65d3eb 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -98,7 +98,7 @@ impl RocksWriteBatchVec { impl engine_traits::WriteBatch for RocksWriteBatchVec { fn write_opt(&self, opts: &WriteOptions) -> Result<()> { let opt: RocksWriteOptions = opts.into(); - if self.index > 0 { + if self.support_write_batch_vec { self.get_db() .multi_batch_write(self.as_inner(), &opt.into_raw()) .map_err(Error::Engine) diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index 5a9a1521355..b31b86b3bfb 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -36,6 +36,7 @@ fn test_compact_after_delete(cluster: &mut Cluster) { cluster.cfg.raft_store.region_compact_min_tombstones = 500; cluster.cfg.raft_store.region_compact_tombstones_percent = 50; cluster.cfg.raft_store.region_compact_check_step = 1; + cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); for i in 0..1000 { diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index df739d825bc..4d7914429ab 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -24,6 +24,7 @@ use txn_types::{Key, PessimisticLock}; #[test] fn test_node_base_merge() { let mut cluster = new_node_cluster(0, 3); + cluster.cfg.rocksdb.titan.enabled = true; configure_for_merge(&mut cluster); cluster.run(); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index c75e07e7f3a..180e5fb1334 100644 --- a/tests/integrations/raftstore/test_snap.rs 
+++ b/tests/integrations/raftstore/test_snap.rs @@ -25,6 +25,7 @@ use tikv::server::snap::send_snap; use tikv_util::{config::*, time::Instant, HandyRwLock}; fn test_huge_snapshot(cluster: &mut Cluster, max_snapshot_file_size: u64) { + cluster.cfg.rocksdb.titan.enabled = true; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); cluster.cfg.raft_store.snap_apply_batch_size = ReadableSize(500); @@ -211,6 +212,7 @@ fn test_server_snap_gc() { /// when there are multiple snapshots which have overlapped region ranges /// arrive at the same raftstore. fn test_concurrent_snap(cluster: &mut Cluster) { + cluster.cfg.rocksdb.titan.enabled = true; // Disable raft log gc in this test case. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); From baf30d0ea62282c54212ecb3383de9eb2225e063 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 20 Jul 2022 18:13:09 +0800 Subject: [PATCH 099/676] log-backup: upload global checkpoint ts to etcd. 
(#13053) ref tikv/tikv#1, close tikv/tikv#13062 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 35 +++++++++++++++---- .../backup-stream/src/metadata/client.rs | 34 ++++++++++++++++++ components/backup-stream/src/metadata/keys.rs | 14 ++++++++ components/backup-stream/src/metadata/test.rs | 28 ++++++++++++++- components/backup-stream/src/router.rs | 30 ++++++++-------- components/external_storage/src/local.rs | 34 +++++++++++++++--- 6 files changed, 148 insertions(+), 27 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 51e04023d60..b4c49ea892a 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -806,15 +806,36 @@ where let ts = self.meta_client.global_progress_of_task(&task).await; match ts { Ok(global_checkpoint) => { - if let Err(e) = self + let r = self .range_router .update_global_checkpoint(&task, global_checkpoint, self.store_id) - .await - { - warn!("backup stream failed to update global checkpoint."; - "task" => ?task, - "err" => ?e - ); + .await; + match r { + Ok(true) => { + if let Err(err) = self + .meta_client + .set_storage_checkpoint(&task, global_checkpoint) + .await + { + warn!("backup stream failed to set global checkpoint."; + "task" => ?task, + "global-checkpoint" => global_checkpoint, + "err" => ?err, + ); + } + } + Ok(false) => { + debug!("backup stream no need update global checkpoint."; + "task" => ?task, + "global-checkpoint" => global_checkpoint, + ); + } + Err(e) => { + warn!("backup stream failed to update global checkpoint."; + "task" => ?task, + "err" => ?e + ); + } } } Err(e) => { diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 07d93162e00..dc21f86b526 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -421,6 +421,40 @@ impl 
MetadataClient { }) } + /// Set the storage checkpoint to metadata. + pub async fn set_storage_checkpoint(&self, task_name: &str, ts: u64) -> Result<()> { + let now = Instant::now(); + defer! { + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["storage_checkpoint"]).observe(now.saturating_elapsed().as_secs_f64()) + } + self.meta_store + .set(KeyValue( + MetaKey::storage_checkpoint_of(task_name, self.store_id), + ts.to_be_bytes().to_vec(), + )) + .await?; + Ok(()) + } + + /// Get the storage checkpoint from metadata. This function is justly used for test. + pub async fn get_storage_checkpoint(&self, task_name: &str) -> Result { + let now = Instant::now(); + defer! { + super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) + } + let snap = self.meta_store.snapshot().await?; + let ts = snap + .get(Keys::Key(MetaKey::storage_checkpoint_of( + task_name, + self.store_id, + ))) + .await?; + + match ts.as_slice() { + [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), + [] => Ok(self.get_task_start_ts_checkpoint(task_name).await?.ts), + } + } /// forward the progress of some task. 
pub async fn set_local_task_checkpoint(&self, task_name: &str, ts: u64) -> Result<()> { let now = Instant::now(); diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index 6920ba14a33..4db978c2cb6 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -5,6 +5,7 @@ use kvproto::metapb::Region; const PREFIX: &str = "/tidb/br-stream"; const PATH_INFO: &str = "/info"; const PATH_NEXT_BACKUP_TS: &str = "/checkpoint"; +const PATH_STORAGE_CHECKPOINT: &str = "/storage-checkpoint"; const PATH_RANGES: &str = "/ranges"; const PATH_PAUSE: &str = "/pause"; const PATH_LAST_ERROR: &str = "/last-error"; @@ -23,6 +24,8 @@ const TASKS_PREFIX: &str = "/tidb/br-stream/info/"; /// /checkpoint/// -> /// For the status of tasks: /// /pause/ -> "" +/// For the storage checkpoint ts of tasks: +/// /storage-checkpoint// -> /// ``` #[derive(Clone)] pub struct MetaKey(pub Vec); @@ -129,6 +132,17 @@ impl MetaKey { ) } + /// defines the key of storage checkpoint-ts of task in a store. 
+ pub fn storage_checkpoint_of(name: &str, store_id: u64) -> Self { + Self( + format!( + "{}{}/{}/{}", + PREFIX, PATH_STORAGE_CHECKPOINT, name, store_id + ) + .into_bytes(), + ) + } + pub fn pause_prefix_len() -> usize { Self::pause_prefix().0.len() } diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index e70ed78b32c..b9fb965033a 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -14,7 +14,7 @@ use kvproto::{ use tokio_stream::StreamExt; use txn_types::TimeStamp; -use super::{MetadataClient, StreamTask}; +use super::{keys::MetaKey, MetadataClient, StreamTask}; use crate::{ errors::Result, metadata::{ @@ -152,6 +152,32 @@ async fn test_progress() -> Result<()> { Ok(()) } +#[test] +fn test_storage_checkpoint_of() { + let task_name = "simple_task"; + let store_id: u64 = 5; + let key = MetaKey::storage_checkpoint_of(task_name, store_id); + assert_eq!( + &key.0, + "/tidb/br-stream/storage-checkpoint/simple_task/5".as_bytes() + ); +} + +#[tokio::test] +async fn test_set_storage_checkpoint() -> Result<()> { + let cli = test_meta_cli(); + let task = simple_task("simple_3"); + let storage_checkpoint_ts: u64 = 12345; + + // set storage checkpoint to metadata + cli.set_storage_checkpoint(task.info.get_name(), storage_checkpoint_ts) + .await?; + // get storage checkpoint from metadata + let ts = cli.get_storage_checkpoint(task.info.get_name()).await?; + assert_eq!(ts.into_inner(), storage_checkpoint_ts); + Ok(()) +} + #[tokio::test] async fn test_init() -> Result<()> { let cli = test_meta_cli(); diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 1ad4c4ad4ca..3e29592a9f4 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -569,11 +569,11 @@ impl RouterInner { task_name: &str, global_checkpoint: u64, store_id: u64, - ) -> Result<()> { - let t = 
self.get_task_info(task_name).await?; - t.update_global_checkpoint(global_checkpoint, store_id) - .await?; - Ok(()) + ) -> Result { + self.get_task_info(task_name) + .await? + .update_global_checkpoint(global_checkpoint, store_id) + .await } /// tick aims to flush log/meta to extern storage periodically. @@ -1122,7 +1122,7 @@ impl StreamTaskInfo { &self, global_checkpoint: u64, store_id: u64, - ) -> Result<()> { + ) -> Result { let last_global_checkpoint = self.global_checkpoint_ts.load(Ordering::SeqCst); if last_global_checkpoint < global_checkpoint { let r = self.global_checkpoint_ts.compare_exchange( @@ -1133,9 +1133,10 @@ impl StreamTaskInfo { ); if r.is_ok() { self.flush_global_checkpoint(store_id).await?; + return Ok(true); } } - Ok(()) + Ok(false) } } @@ -2014,7 +2015,7 @@ mod tests { } #[tokio::test] - async fn test_update_global_checkpoint() { + async fn test_update_global_checkpoint() -> Result<()> { // create local storage let tmp_dir = tempfile::tempdir().unwrap(); let backend = external_storage_export::make_local_backend(tmp_dir.path()); @@ -2039,18 +2040,18 @@ mod tests { // test no need to update global checkpoint let store_id = 3; let mut global_checkpoint = 10000; - let r = task + let is_updated = task .update_global_checkpoint(global_checkpoint, store_id) - .await; - assert_eq!(r.is_ok(), true); + .await?; + assert_eq!(is_updated, false); assert_eq!(task.global_checkpoint_ts.load(Ordering::SeqCst), 10001); // test update global checkpoint global_checkpoint = 10002; - let r = task + let is_updated = task .update_global_checkpoint(global_checkpoint, store_id) - .await; - assert_eq!(r.is_ok(), true); + .await?; + assert_eq!(is_updated, true); assert_eq!( task.global_checkpoint_ts.load(Ordering::SeqCst), global_checkpoint @@ -2067,5 +2068,6 @@ mod tests { ts.copy_from_slice(&buff); let ts = u64::from_le_bytes(ts); assert_eq!(ts, global_checkpoint); + Ok(()) } } diff --git a/components/external_storage/src/local.rs 
b/components/external_storage/src/local.rs index 3e307dca157..00cb42cf1a6 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -100,12 +100,11 @@ impl ExternalStorage for LocalStorage { } })?; } - // Sanitize check, do not save file if it is already exist. + + // Because s3 could support writing(put_object) a existed object. + // For the interface consistent with s3, local storage need also support write a existed file. if fs::metadata(self.base.join(name)).await.is_ok() { - return Err(io::Error::new( - io::ErrorKind::AlreadyExists, - format!("[{}] is already exists in {}", name, self.base.display()), - )); + info!("[{}] is already exists in {}", name, self.base.display()); } let tmp_path = self.tmp_path(Path::new(name)); let mut tmp_f = File::create(&tmp_path).await?; @@ -215,4 +214,29 @@ mod tests { fn test_url_of_backend() { assert_eq!(url_for(Path::new("/tmp/a")).to_string(), "local:///tmp/a"); } + + #[tokio::test] + async fn test_write_existed_file() { + let temp_dir = Builder::new().tempdir().unwrap(); + let path = temp_dir.path(); + let ls = LocalStorage::new(path).unwrap(); + + let filename = "existed.file"; + let buf1: &[u8] = b"pingcap"; + let buf2: &[u8] = b"tikv"; + let r = ls + .write(filename, UnpinReader(Box::new(buf1)), buf1.len() as _) + .await; + assert!(r.is_ok()); + let r = ls + .write(filename, UnpinReader(Box::new(buf2)), buf2.len() as _) + .await; + assert!(r.is_ok()); + + let mut read_buff: Vec = Vec::new(); + let r = ls.read(filename).read_to_end(&mut read_buff).await; + assert!(r.is_ok()); + assert_eq!(read_buff.len(), 4); + assert_eq!(&read_buff, buf2); + } } From 9b3a669a97c39e7603851c0b5c85754c5d2d0cf7 Mon Sep 17 00:00:00 2001 From: Jiarui Li <34512395+Willendless@users.noreply.github.com> Date: Wed, 20 Jul 2022 23:09:08 -0400 Subject: [PATCH 100/676] duration: keep duration parser compatible with tidb (#13031) close tikv/tikv#12932, ref tikv/tikv#35455 Signed-off-by: Willendless 
<317500141@qq.com> Co-authored-by: Liqi Geng --- .../src/codec/mysql/duration.rs | 88 ++++++++++++++----- 1 file changed, 66 insertions(+), 22 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 997983c2e49..e151c8fd0c5 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -150,28 +150,35 @@ mod parser { Ok((rest, hhmmss)) } - fn hhmmss_datetime<'a>( - ctx: &mut EvalContext, - input: &'a str, - fsp: u8, - ) -> IResult<&'a str, Duration, ()> { + /// A string can match datetime format only if it starts with a series of digits + /// whose length matches the full format of DateTime literal (12, 14) + /// or the string starts with a date literal. + fn format_can_match_datetime(input: &str) -> IResult<(), (), ()> { let (rest, digits) = digit1(input)?; + if digits.len() == 12 || digits.len() == 14 { - let datetime = DateTime::parse_datetime(ctx, input, fsp as i8, true) - .map_err(|_| nom::Err::Error(()))?; - return Ok(("", datetime.convert(ctx).map_err(|_| nom::Err::Error(()))?)); + return Ok(((), ())); } + let (rest, _) = anysep(rest)?; let (rest, _) = digit1(rest)?; let (rest, _) = anysep(rest)?; let (rest, _) = digit1(rest)?; - let has_datetime_sep = matches!(rest.chars().next(), Some(c) if c == 'T' || c == ' '); - - if !has_datetime_sep { - return Err(nom::Err::Error(())); + if matches!(rest.chars().next(), Some(c) if c == 'T' || c == ' ') { + Ok(((), ())) + } else { + Err(nom::Err::Error(())) } + } + /// Caller should make sure the input string can match datetime format + /// according to `format_can_match_datetime`. 
+ fn hhmmss_datetime<'a>( + ctx: &mut EvalContext, + input: &'a str, + fsp: u8, + ) -> IResult<&'a str, Duration, ()> { let datetime = DateTime::parse_datetime(ctx, input, fsp as i8, true) .map_err(|_| nom::Err::Error(()))?; Ok(("", datetime.convert(ctx).map_err(|_| nom::Err::Error(()))?)) @@ -208,16 +215,21 @@ mod parser { ctx: &mut EvalContext, input: &str, fsp: u8, - fallback_to_daytime: bool, + fallback_to_datetime: bool, overflow_as_null: bool, ) -> Option { let input = input.trim(); if input.is_empty() { - return Some(Duration::zero()); + return None; } let (rest, neg) = negative(input).ok()?; let (rest, _) = space0::<_, ()>(rest).ok()?; + + let chars_len = rest.len(); + let mut truncated_parse = false; + let fallback_to_datetime = fallback_to_datetime && format_can_match_datetime(rest).is_ok(); + let duration = day_hhmmss(rest) .ok() .and_then(|(rest, (day, [hh, mm, ss]))| { @@ -230,7 +242,10 @@ mod parser { let (rest, frac) = fraction(rest, fsp).ok()?; if !rest.is_empty() { - return None; + if chars_len >= 12 { + return None; + } + truncated_parse = true; } Some(Duration::new_from_parts( @@ -238,8 +253,17 @@ mod parser { )) }); + // In order to keep compatible with TiDB, when input string can only be partially parsed by `hhmmss_compact` + // and it can match the datetime format, we fallback to parse it using datetime format. 
+ if truncated_parse && fallback_to_datetime { + return hhmmss_datetime(ctx, rest, fsp).map_or(None, |(_, duration)| Some(duration)); + } + match duration { - Some(Ok(duration)) => Some(duration), + Some(Ok(duration)) => { + let _ = ctx.handle_truncate(truncated_parse); + Some(duration) + } Some(Err(err)) if err.is_overflow() => { if overflow_as_null { return None; @@ -249,7 +273,7 @@ mod parser { Some(Duration { nanos, fsp }) }) } - None if fallback_to_daytime => { + None if fallback_to_datetime => { hhmmss_datetime(ctx, rest, fsp).map_or(None, |(_, duration)| Some(duration)) } _ => None, @@ -809,7 +833,8 @@ mod tests { ("2011-11-11 00:00:01", 0, Some("00:00:01")), ("20111111000001", 0, Some("00:00:01")), ("201112110102", 0, Some("11:01:02")), - ("2011-11-11", 0, None), + ("2011-11-11", 0, Some("00:20:11")), + ("2012-08-x", 0, Some("00:20:12")), ("--23", 0, None), ("232 10", 0, None), ("-232 10", 0, None), @@ -818,7 +843,24 @@ mod tests { ("00:00:00.777777", 2, Some("00:00:00.78")), ("00:00:00.777777", 6, Some("00:00:00.777777")), ("00:00:00.001", 3, Some("00:00:00.001")), + ("0x", 6, Some("00:00:00.000000")), + ("1x", 6, Some("00:00:01.000000")), + ("0000-00-00", 6, Some("00:00:00.000000")), // NOTE: The following case is easy to fail. 
+ ("0000-00-00", 0, Some("00:00:00")), + ("1234abc", 0, Some("00:12:34")), + ("1234x", 0, Some("00:12:34")), + ("1234xxxxxxx", 0, Some("00:12:34")), + ("1234xxxxxxxx", 0, None), + ("-1234xxxxxxx", 0, Some("-00:12:34")), + ("-1234xxxxxxxx", 0, None), + ("1-----", 0, Some("00:00:01")), + ("20100000-02-12", 0, None), + ("20100-02-12", 0, Some("02:01:00")), + ("99999-99-99", 0, None), + ("99990000", 0, None), + ("0000-00-00", 0, Some("00:00:00")), + ("00-00-00", 0, Some("00:00:00")), ("- 1 ", 0, Some("-00:00:01")), ("1:2:3", 0, Some("01:02:03")), ("1 1:2:3", 0, Some("25:02:03")), @@ -835,8 +877,9 @@ mod tests { (" - 1 : 2 : 3 .123 ", 3, Some("-01:02:03.123")), (" - 1 .123 ", 3, Some("-00:00:01.123")), ("-", 0, None), + ("a", 0, None), ("- .1", 0, None), - ("", 0, Some("00:00:00")), + ("", 0, None), ("", 7, None), ("1.1", 1, Some("00:00:01.1")), ("-1.1", 1, Some("-00:00:01.1")), @@ -846,13 +889,13 @@ mod tests { ("4294967295 0:59:59", 0, None), ("4294967295 232:59:59", 0, None), ("-4294967295 232:59:59", 0, None), - ("1::2:3", 0, None), - ("1.23 3", 0, None), + ("1::2:3", 0, Some("00:00:01")), + ("1.23 3", 0, Some("00:00:01")), ("1:62:3", 0, None), ("1:02:63", 0, None), ("-231342080", 0, None), + ("2010-02-12", 0, Some("00:20:10")), // test fallback to datetime - ("2010-02-12", 0, None), ("2010-02-12t12:23:34", 0, None), ("2010-02-12T12:23:34", 0, Some("12:23:34")), ("2010-02-12 12:23:34", 0, Some("12:23:34")), @@ -871,6 +914,7 @@ mod tests { let cases: Vec<(&str, i8, Option<&'static str>, bool)> = vec![ ("-790822912", 0, None, true), ("-790822912", 0, Some("-838:59:59"), false), + ("99990000", 0, Some("838:59:59"), false), ]; for (input, fsp, expect, return_null) in cases { From 5fa87491244fa0356ec06b4d9681fac14b83ac79 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Thu, 21 Jul 2022 11:37:09 +0800 Subject: [PATCH 101/676] log-backup: modify the config (#13023) ref tikv/tikv#12895 Signed-off-by: joccau Co-authored-by: Ti Chi 
Robot --- components/backup-stream/src/endpoint.rs | 8 +++++--- etc/config-template.toml | 2 +- src/config.rs | 12 +++++------- tests/integrations/config/mod.rs | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index b4c49ea892a..2defb88b541 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -111,14 +111,14 @@ where concurrency_manager: ConcurrencyManager, ) -> Self { crate::metrics::STREAM_ENABLED.inc(); - let pool = create_tokio_runtime(config.io_threads, "backup-stream") + let pool = create_tokio_runtime((config.num_threads / 2).max(1), "backup-stream") .expect("failed to create tokio runtime for backup stream worker."); let meta_client = MetadataClient::new(store, store_id); let range_router = Router::new( PathBuf::from(config.temp_path.clone()), scheduler.clone(), - config.temp_file_size_limit_per_task.0, + config.file_size_limit.0, config.max_flush_interval.0, ); @@ -159,7 +159,7 @@ where observer.clone(), meta_client.clone(), pd_client.clone(), - config.num_threads, + ((config.num_threads + 1) / 2).max(1), ); pool.spawn(op_loop); Endpoint { @@ -935,6 +935,8 @@ where /// Create a standard tokio runtime /// (which allows io and time reactor, involve thread memory accessor), fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResult { + info!("create tokio runtime for backup stream"; "thread_name" => thread_name, "thread-count" => thread_count); + tokio::runtime::Builder::new_multi_thread() .thread_name(thread_name) // Maybe make it more configurable? diff --git a/etc/config-template.toml b/etc/config-template.toml index a19533b7847..795a82f371c 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1189,7 +1189,7 @@ [log-backup] ## Number of threads to perform backup stream tasks. -## The default value is set to min(CPU_NUM * 0.5, 8). 
+## The default value is CPU_NUM * 0.5, and limited to [2, 12]. # num-threads = 8 ## enable this feature. TiKV will starts watch related tasks in PD. and backup kv changes to storage accodring to task. diff --git a/src/config.rs b/src/config.rs index 1f64f53dc58..6ae622bd806 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2450,17 +2450,16 @@ pub struct BackupStreamConfig { #[online_config(skip)] pub num_threads: usize, #[online_config(skip)] - pub io_threads: usize, - #[online_config(skip)] pub enable: bool, #[online_config(skip)] pub temp_path: String, #[online_config(skip)] - pub temp_file_size_limit_per_task: ReadableSize, + pub file_size_limit: ReadableSize, #[online_config(skip)] pub initial_scan_pending_memory_quota: ReadableSize, #[online_config(skip)] pub initial_scan_rate_limit: ReadableSize, + #[serde(skip)] #[online_config(skip)] pub use_checkpoint_v3: bool, } @@ -2488,12 +2487,11 @@ impl Default for BackupStreamConfig { Self { max_flush_interval: ReadableDuration::minutes(5), // use at most 50% of vCPU by default - num_threads: (cpu_num * 0.5).clamp(1.0, 8.0) as usize, - io_threads: 2, + num_threads: (cpu_num * 0.5).clamp(2.0, 12.0) as usize, enable: false, // TODO: may be use raft store directory temp_path: String::new(), - temp_file_size_limit_per_task: ReadableSize::mb(128), + file_size_limit: ReadableSize::mb(256), initial_scan_pending_memory_quota: ReadableSize(quota_size as _), initial_scan_rate_limit: ReadableSize::mb(60), use_checkpoint_v3: true, @@ -3101,7 +3099,7 @@ impl TiKvConfig { if self.backup_stream.temp_path.is_empty() { self.backup_stream.temp_path = - config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-tmp")?; + config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-temp")?; } self.rocksdb.validate()?; diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index d66ea96fb3b..8c1be52be78 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -748,7 
+748,7 @@ fn test_serde_custom_tikv_config() { ..Default::default() }; value.backup_stream = BackupStreamConfig { - num_threads: 8, + num_threads: 12, ..Default::default() }; value.import = ImportConfig { From 92b223cfd4195d857219bac02d6a96f4ce4c03e3 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 21 Jul 2022 12:05:10 +0800 Subject: [PATCH 102/676] log: optimize log filter (#13080) ref tikv/tikv#12986 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- components/tikv_util/src/logger/mod.rs | 85 ++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index f4fd936cddc..35bf5f4c8e0 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -83,7 +83,7 @@ where threshold: slow_threshold, inner: drain, }; - let filtered = drain.filter(filter).fuse(); + let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); (slog::Logger::root(filtered, slog_o!()), Some(guard)) } else { @@ -92,7 +92,7 @@ where threshold: slow_threshold, inner: drain, }; - let filtered = drain.filter(filter).fuse(); + let filtered = GlobalLevelFilter::new(drain.filter(filter).fuse()); (slog::Logger::root(filtered, slog_o!()), None) }; @@ -407,17 +407,15 @@ where type Err = slog::Never; fn log(&self, record: &Record<'_>, values: &OwnedKVList) -> Result { - if record.level().as_usize() <= LOG_LEVEL.load(Ordering::Relaxed) { - if let Err(e) = self.0.log(record, values) { - let fatal_drainer = Mutex::new(text_format(term_writer(), true)).ignore_res(); - fatal_drainer.log(record, values).unwrap(); - let fatal_logger = slog::Logger::root(fatal_drainer, slog_o!()); - slog::slog_crit!( - fatal_logger, - "logger encountered error"; - "err" => %e, - ); - } + if let Err(e) = self.0.log(record, values) { + let fatal_drainer = Mutex::new(text_format(term_writer(), true)).ignore_res(); + fatal_drainer.log(record, values).unwrap(); + let fatal_logger 
= slog::Logger::root(fatal_drainer, slog_o!()); + slog::slog_crit!( + fatal_logger, + "logger encountered error"; + "err" => %e, + ); } Ok(()) } @@ -452,6 +450,36 @@ where } } +// GlobalLevelFilter is a filter that base on the global `LOG_LEVEL`'s value. +pub struct GlobalLevelFilter(pub D); + +impl GlobalLevelFilter { + /// Create `LevelFilter` + pub fn new(drain: D) -> Self { + Self(drain) + } +} + +impl Drain for GlobalLevelFilter +where + D: Drain, + D::Ok: Default, +{ + type Ok = D::Ok; + type Err = D::Err; + fn log(&self, record: &Record<'_>, logger_values: &OwnedKVList) -> Result { + if record.level().as_usize() <= LOG_LEVEL.load(Ordering::Relaxed) { + self.0.log(record, logger_values) + } else { + Ok(Default::default()) + } + } + #[inline] + fn is_enabled(&self, level: Level) -> bool { + level.as_usize() <= LOG_LEVEL.load(Ordering::Relaxed) && self.0.is_enabled(level) + } +} + struct SlowCostSerializer { // None means input record without key `takes` cost: Option, @@ -821,6 +849,37 @@ mod tests { }); } + #[test] + fn test_global_level_filter() { + let decorator = PlainSyncDecorator::new(TestWriter); + let drain = TikvFormat::new(decorator, true).fuse(); + let logger = + slog::Logger::root_typed(GlobalLevelFilter::new(drain), slog_o!()).into_erased(); + + let expected = "[2019/01/15 13:40:39.619 +08:00] [INFO] [mod.rs:871] [Welcome]\n"; + let check_log = |log: &str| { + BUFFER.with(|buffer| { + let mut buffer = buffer.borrow_mut(); + let output = from_utf8(&*buffer).unwrap(); + // only check the log len here as some field like timestamp, location may change. 
+ assert_eq!(output.len(), log.len()); + buffer.clear(); + }); + }; + + set_log_level(Level::Info); + slog_info!(logger, "Welcome"); + check_log(expected); + + set_log_level(Level::Warning); + slog_info!(logger, "Welcome"); + check_log(""); + + set_log_level(Level::Info); + slog_info!(logger, "Welcome"); + check_log(expected); + } + /// Removes the wrapping signs, peels `"[hello]"` to `"hello"`, or peels `"(hello)"` to `"hello"`, fn peel(output: &str) -> &str { assert!(output.len() >= 2); From 43f5f7ed2e5dff43d22e8be9e59fda2558c76597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 21 Jul 2022 14:17:09 +0800 Subject: [PATCH 103/676] log-backup: fixed bug of initial scanning rate limit doesn't take effect (#13069) close tikv/tikv#13068 Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- components/backup-stream/src/event_loader.rs | 74 ++++++++++++++++---- components/backup-stream/src/metrics.rs | 5 ++ components/backup-stream/src/utils.rs | 19 +++-- 3 files changed, 80 insertions(+), 18 deletions(-) diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 841f6ac75b6..40e0ab5c60b 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -92,8 +92,8 @@ impl EventLoader { let region_id = region.get_id(); let scanner = ScannerBuilder::new(snapshot, to_ts) .range( - Some(Key::from_encoded_slice(®ion.start_key)), - Some(Key::from_encoded_slice(®ion.end_key)), + (!region.start_key.is_empty()).then(|| Key::from_encoded_slice(®ion.start_key)), + (!region.end_key.is_empty()).then(|| Key::from_encoded_slice(®ion.end_key)), ) .hint_min_ts(Some(from_ts)) .fill_cache(false) @@ -123,7 +123,7 @@ impl EventLoader { /// Drain the internal buffer, converting them to the [`ApplyEvents`], /// and tracking the locks at the same time. 
- fn omit_entries_to( + fn emit_entries_to( &mut self, result: &mut ApplyEvents, resolver: &mut TwoPhaseResolver, @@ -178,19 +178,22 @@ impl EventLoader { /// The context for loading incremental data between range. /// Like [`cdc::Initializer`], but supports initialize over range. /// Note: maybe we can merge those two structures? +/// Note': maybe extract more fields to trait so it would be easier to test. #[derive(Clone)] pub struct InitialDataLoader { - pub(crate) router: RT, - pub(crate) regions: R, // Note: maybe we can make it an abstract thing like `EventSink` with // method `async (KvEvent) -> Result<()>`? pub(crate) sink: Router, pub(crate) tracing: SubscriptionTracer, pub(crate) scheduler: Scheduler, + // Note: this is only for `init_range`, maybe make it an argument? + pub(crate) regions: R, + // Note: Maybe move those fields about initial scanning into some trait? + pub(crate) router: RT, pub(crate) quota: PendingMemoryQuota, - pub(crate) handle: Handle, pub(crate) limit: Limiter, + pub(crate) handle: Handle, _engine: PhantomData, } @@ -381,14 +384,12 @@ where "{:?}", msg )))); let mut events = ApplyEvents::with_capacity(1024, region.id); - let stat = event_loader.fill_entries()?; - let disk_read = self.with_resolver(region, |r| { - let (result, byte_size) = utils::with_record_read_throughput(|| { - event_loader.omit_entries_to(&mut events, r) - }); - result?; - Result::Ok(byte_size) - })?; + // Note: the call of `fill_entries` is the only step which would read the disk. + // we only need to record the disk throughput of this. 
+ let (stat, disk_read) = + utils::with_record_read_throughput(|| event_loader.fill_entries()); + let stat = stat?; + self.with_resolver(region, |r| event_loader.emit_entries_to(&mut events, r))?; if events.is_empty() { metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); return Ok(stats.stat); @@ -402,6 +403,7 @@ where self.limit.blocking_consume(disk_read as _); debug!("sending events to router"; "size" => %event_size, "region" => %region_id); metrics::INCREMENTAL_SCAN_SIZE.observe(event_size as f64); + metrics::INCREMENTAL_SCAN_DISK_READ.inc_by(disk_read as f64); metrics::HEAP_MEMORY.add(event_size as _); join_handles.push(tokio::spawn(async move { utils::handle_on_event_result(&sched, sink.on_events(events).await); @@ -470,3 +472,47 @@ where Ok(()) } } + +#[cfg(test)] +mod tests { + use kvproto::metapb::*; + use tikv::storage::{txn::tests::*, Engine, TestEngineBuilder}; + use txn_types::TimeStamp; + + use super::EventLoader; + use crate::{ + router::ApplyEvents, subscription_track::TwoPhaseResolver, + utils::with_record_read_throughput, + }; + + #[test] + fn test_disk_read() { + let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + for i in 0..100 { + let owned_key = format!("{:06}", i); + let key = owned_key.as_bytes(); + let owned_value = [i as u8; 512]; + let value = owned_value.as_slice(); + must_prewrite_put(&engine, key, value, key, i * 2); + must_commit(&engine, key, i * 2, i * 2 + 1); + } + // let compact the memtable to disk so we can see the disk read. 
+ engine.get_rocksdb().as_inner().compact_range(None, None); + + let mut r = Region::new(); + r.set_id(42); + r.set_start_key(b"".to_vec()); + r.set_end_key(b"".to_vec()); + let snap = engine.snapshot_on_kv_engine(b"", b"").unwrap(); + let mut loader = + EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); + + let (r, data_load) = with_record_read_throughput(|| loader.fill_entries()); + r.unwrap(); + let mut events = ApplyEvents::with_capacity(1024, 42); + let mut res = TwoPhaseResolver::new(42, None); + loader.emit_entries_to(&mut events, &mut res).unwrap(); + assert_ne!(events.len(), 0); + assert_ne!(data_load, 0); + } +} diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index 24a044bb4fb..a94be6df7f6 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -56,6 +56,11 @@ lazy_static! { exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); + pub static ref INCREMENTAL_SCAN_DISK_READ: Counter = register_counter!( + "tikv_log_backup_initial_scan_disk_read", + "The total count of disk read bytes." + ) + .unwrap(); pub static ref INCREMENTAL_SCAN_SIZE: Histogram = register_histogram!( "tikv_stream_incremental_scan_bytes", "The size of scanning.", diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 486ce6ae0f8..0f09e747b80 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -519,7 +519,19 @@ impl ReadThroughputRecorder { let ins = self.ins.as_ref()?; let begin = self.begin.as_ref()?; let end = ins.io_stat().ok()??; - Some(end.read - begin.read) + let bytes_read = end.read - begin.read; + // FIXME: In our test environment, there may be too many caches hence + // the `bytes_read` is always zero :( + // For now, we eject here and let rocksDB prove that we did read something + // When the proc think we don't touch the block device (even in fact we didn't). 
+ // NOTE: In the real-world, we would accept the zero `bytes_read` value since the cache did exists. + #[cfg(test)] + if bytes_read == 0 { + // use println here so we can get this message even log doesn't enabled. + println!("ejecting in test since no read recorded in procfs"); + return None; + } + Some(bytes_read) } fn end(self) -> u64 { @@ -579,6 +591,8 @@ mod test { time::Duration, }; + use engine_rocks::raw::DBOptions; + use engine_traits::WriteOptions; use futures::executor::block_on; use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; @@ -720,8 +734,6 @@ mod test { } } - /// skip it currently. Test it at local env successfully but failed at pod. - #[cfg(FALSE)] #[test] fn test_recorder() { use engine_rocks::{raw::DB, RocksEngine}; @@ -731,7 +743,6 @@ mod test { let p = TempDir::new("test_db").unwrap(); let mut opt = DBOptions::default(); opt.create_if_missing(true); - opt.enable_multi_write_batch(true); let db = DB::open(opt.clone(), p.path().as_os_str().to_str().unwrap()).unwrap(); let engine = RocksEngine::from_db(Arc::new(db)); let mut wb = engine.write_batch(); From 1cc64cf4a1555efb28555ebaa6fc32b4918bdcce Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 21 Jul 2022 16:03:10 +0800 Subject: [PATCH 104/676] server: do not update background cpu limit when auto-tune is off (#13056) ref tikv/tikv#12679, close tikv/tikv#13055 ThreadTime is collected when the cpu limit in the QuotaLimiter is not infinity. #12679 updated the background cpu limit even if auto-tune is off, which is unnecessary. So, that causes additional cost of collecting thread CPU time in some critical paths. This commit sets cpu_time_limit of the QuotaLimiter only if auto-tune is enabled, so we don't waste effort to collect CPU time when auto-tune is not enabled. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- components/server/src/server.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 8eca26404d9..58a4dc61338 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1296,20 +1296,17 @@ impl TiKvServer { }; // Determine the base cpu quota - let base_cpu_quota = { + let base_cpu_quota = // if cpu quota is not specified, start from optimistic case if quota_limiter.cputime_limiter(false).is_infinite() { - let quota = 1000_f64 + 1000_f64 * f64::max( BACKGROUND_REQUEST_CORE_LOWER_BOUND, SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, - ); - quota_limiter.set_cpu_time_limit(quota as usize, false); - quota + ) } else { quota_limiter.cputime_limiter(false) / 1000_f64 - } - }; + }; // Calculate the celling and floor quota let celling_quota = f64::min( @@ -1326,7 +1323,12 @@ impl TiKvServer { DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, move || { if quota_limiter.auto_tune_enabled() { - let old_quota = quota_limiter.cputime_limiter(false) / 1000_f64; + let cputime_limit = quota_limiter.cputime_limiter(false); + let old_quota = if cputime_limit.is_infinite() { + base_cpu_quota + } else { + cputime_limit / 1000_f64 + }; let cpu_usage = match proc_stats.cpu_usage() { Ok(r) => r, Err(_e) => 0.0, From 1f0a1a3451302647f4de7fc06fe323e0cba67b98 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 21 Jul 2022 16:43:10 +0800 Subject: [PATCH 105/676] raftstore: fix high commit log duration when adding new peer (#13078) close tikv/tikv#13077 When adding a new peer, `alive_cache_idx` would not consider the new peer still in applying snapshot. Then it may trigger compacting entry cache due to `alive_cache_idx` being equal to `applied_idx`. After the snapshot is applied, the log gap of new peer is not in entry cache, which triggers async fetch to read disk. 
Considering raft engine's read performance is not as good as rocksdb's, once there are a lot of Regions triggering async fetch, the process of replicating log to new peer would be slow. If there is a conf change promoting the learner and demoting another peer, the commit index can't be advanced in joint state because the to-be-learner peer doesn't catch up logs in time. Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- components/engine_panic/src/raft_engine.rs | 4 -- components/engine_rocks/src/raft_engine.rs | 4 -- components/engine_traits/src/raft_engine.rs | 8 --- components/raft_log_engine/src/engine.rs | 6 -- components/raftstore/src/store/fsm/peer.rs | 43 +++++++------ components/raftstore/src/store/peer.rs | 4 +- .../raftstore/src/store/peer_storage.rs | 62 ++++--------------- tests/failpoints/cases/test_async_fetch.rs | 36 +++++++++++ 8 files changed, 73 insertions(+), 94 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index d6f82c7f646..2fffb544fe3 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -118,10 +118,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn has_builtin_entry_cache(&self) -> bool { - panic!() - } - fn flush_metrics(&self, instance: &str) { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 2f67904486f..607e0bfca17 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -312,10 +312,6 @@ impl RaftEngine for RocksEngine { Ok(vec![]) } - fn has_builtin_entry_cache(&self) -> bool { - false - } - fn flush_metrics(&self, instance: &str) { KvEngine::flush_metrics(self, instance) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 7773ee3245f..58a78f605f9 100644 --- a/components/engine_traits/src/raft_engine.rs +++ 
b/components/engine_traits/src/raft_engine.rs @@ -119,14 +119,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send /// which needs to be compacted ASAP. fn purge_expired_files(&self) -> Result>; - /// The `RaftEngine` has a builtin entry cache or not. - fn has_builtin_entry_cache(&self) -> bool { - false - } - - /// GC the builtin entry cache. - fn gc_entry_cache(&self, _raft_group_id: u64, _to: u64) {} - fn flush_metrics(&self, _instance: &str) {} fn flush_stats(&self) -> Option { None diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index d2f8b7cb4e1..22d2d645165 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -548,12 +548,6 @@ impl RaftEngine for RaftLogEngine { self.0.purge_expired_files().map_err(transfer_error) } - fn has_builtin_entry_cache(&self) -> bool { - false - } - - fn gc_entry_cache(&self, _raft_group_id: u64, _to: u64) {} - /// Flush current cache stats. fn flush_stats(&self) -> Option { None diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index fad93ac54d8..f3bcd56eabf 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1916,7 +1916,7 @@ where ); if self.fsm.peer.pending_remove { - self.fsm.peer.mut_store().flush_cache_metrics(); + self.fsm.peer.mut_store().flush_entry_cache_metrics(); return; } // When having pending snapshot, if election timeout is met, it can't pass @@ -1983,7 +1983,7 @@ where } self.fsm.peer.post_raft_group_tick(); - self.fsm.peer.mut_store().flush_cache_metrics(); + self.fsm.peer.mut_store().flush_entry_cache_metrics(); // Keep ticking if there are still pending read requests or this node is within hibernate timeout. 
if res.is_none() /* hibernate_region is false */ || @@ -3560,7 +3560,7 @@ where let compact_to = state.get_index() + 1; self.fsm.peer.schedule_raftlog_gc(self.ctx, compact_to); self.fsm.peer.last_compacted_idx = compact_to; - self.fsm.peer.mut_store().compact_to(compact_to); + self.fsm.peer.mut_store().on_compact_raftlog(compact_to); } fn on_ready_split_region( @@ -4900,7 +4900,7 @@ where // snapshot generating has already been cancelled when the role becomes follower. return; } - if !self.fsm.peer.get_store().is_cache_empty() || !self.ctx.cfg.hibernate_regions { + if !self.fsm.peer.get_store().is_entry_cache_empty() || !self.ctx.cfg.hibernate_regions { self.register_raft_gc_log_tick(); } fail_point!("on_raft_log_gc_tick_1", self.fsm.peer_id() == 1, |_| {}); @@ -4930,21 +4930,26 @@ where // `alive_cache_idx` is only used to gc cache. let applied_idx = self.fsm.peer.get_store().applied_index(); let truncated_idx = self.fsm.peer.get_store().truncated_index(); + let first_idx = self.fsm.peer.get_store().first_index(); let last_idx = self.fsm.peer.get_store().last_index(); + let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { if replicated_idx > p.matched { replicated_idx = p.matched; } if let Some(last_heartbeat) = self.fsm.peer.peer_heartbeats.get(peer_id) { - if alive_cache_idx > p.matched - && p.matched >= truncated_idx - && *last_heartbeat > cache_alive_limit - { - alive_cache_idx = p.matched; + if *last_heartbeat > cache_alive_limit { + if alive_cache_idx > p.matched && p.matched >= truncated_idx { + alive_cache_idx = p.matched; + } else if p.matched == 0 { + // the new peer is still applying snapshot, do not compact cache now + alive_cache_idx = 0; + } } } } + // When an election happened or a new peer is added, replicated_idx can be 0. 
if replicated_idx > 0 { assert!( @@ -4955,21 +4960,20 @@ where ); REGION_MAX_LOG_LAG.observe((last_idx - replicated_idx) as f64); } + + // leader may call `get_term()` on the latest replicated index, so compact + // entries before `alive_cache_idx` instead of `alive_cache_idx + 1`. self.fsm .peer .mut_store() - .maybe_gc_cache(alive_cache_idx, applied_idx); + .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { - self.fsm.peer.mut_store().evict_cache(true); - if !self.fsm.peer.get_store().cache_is_empty() { + self.fsm.peer.mut_store().evict_entry_cache(true); + if !self.fsm.peer.get_store().is_entry_cache_empty() { self.register_entry_cache_evict_tick(); } } - let mut total_gc_logs = 0; - - let first_idx = self.fsm.peer.get_store().first_index(); - let mut compact_idx = if force_compact && replicated_idx > first_idx { replicated_idx } else if (applied_idx > first_idx @@ -5007,7 +5011,6 @@ where .compact_idx_too_small += 1; return; } - total_gc_logs += compact_idx - first_idx; // Create a compact log request and notify directly. 
let region_id = self.fsm.peer.region().get_id(); @@ -5022,7 +5025,7 @@ where self.fsm.skip_gc_raft_log_ticks = 0; self.register_raft_gc_log_tick(); - PEER_GC_RAFT_LOG_COUNTER.inc_by(total_gc_logs); + PEER_GC_RAFT_LOG_COUNTER.inc_by(compact_idx - first_idx); } fn register_entry_cache_evict_tick(&mut self) { @@ -5032,11 +5035,11 @@ where fn on_entry_cache_evict_tick(&mut self) { fail_point!("on_entry_cache_evict_tick", |_| {}); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { - self.fsm.peer.mut_store().evict_cache(true); + self.fsm.peer.mut_store().evict_entry_cache(true); } let mut _usage = 0; if memory_usage_reaches_high_water(&mut _usage) - && !self.fsm.peer.get_store().cache_is_empty() + && !self.fsm.peer.get_store().is_entry_cache_empty() { self.register_entry_cache_evict_tick(); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 489db8b9600..1a7954ca037 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2778,7 +2778,7 @@ where .trace_cached_entries(apply.entries[0].clone()); if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. 
- self.mut_store().evict_cache(false); + self.mut_store().evict_entry_cache(false); } ctx.apply_router .schedule_task(self.region_id, ApplyTask::apply(apply)); @@ -3156,7 +3156,7 @@ where if !self.is_leader() { self.mut_store() - .compact_cache_to(apply_state.applied_index + 1); + .compact_entry_cache(apply_state.applied_index + 1); } let progress_to_be_updated = self.mut_store().applied_index_term() != applied_index_term; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index a6208b09f9e..58e35ff9084 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1383,50 +1383,23 @@ where self.last_term = last_term; } - pub fn compact_to(&mut self, idx: u64) { - self.compact_cache_to(idx); - + pub fn on_compact_raftlog(&mut self, idx: u64) { + self.compact_entry_cache(idx); self.cancel_generating_snap(Some(idx)); } - pub fn compact_cache_to(&mut self, idx: u64) { + pub fn compact_entry_cache(&mut self, idx: u64) { self.cache.compact_to(idx); - let rid = self.get_region_id(); - if self.engines.raft.has_builtin_entry_cache() { - self.engines.raft.gc_entry_cache(rid, idx); - } } #[inline] - pub fn is_cache_empty(&self) -> bool { + pub fn is_entry_cache_empty(&self) -> bool { self.cache.is_empty() } - pub fn maybe_gc_cache(&mut self, replicated_idx: u64, apply_idx: u64) { - if self.engines.raft.has_builtin_entry_cache() { - let rid = self.get_region_id(); - self.engines.raft.gc_entry_cache(rid, apply_idx + 1); - } - if replicated_idx == apply_idx { - // The region is inactive, clear the cache immediately. - self.cache.compact_to(apply_idx + 1); - return; - } - let cache_first_idx = match self.cache.first_index() { - None => return, - Some(idx) => idx, - }; - if cache_first_idx > replicated_idx + 1 { - // Catching up log requires accessing fs already, let's optimize for - // the common case. - // Maybe gc to second least replicated_idx is better. 
- self.cache.compact_to(apply_idx + 1); - } - } - /// Evict entries from the cache. - pub fn evict_cache(&mut self, half: bool) { - if !self.cache.cache.is_empty() { + pub fn evict_entry_cache(&mut self, half: bool) { + if !self.is_entry_cache_empty() { let cache = &mut self.cache; let cache_len = cache.cache.len(); let drain_to = if half { cache_len / 2 } else { cache_len - 1 }; @@ -1436,22 +1409,11 @@ where } } - pub fn cache_is_empty(&self) -> bool { - self.cache.cache.is_empty() - } - #[inline] - pub fn flush_cache_metrics(&mut self) { + pub fn flush_entry_cache_metrics(&mut self) { // NOTE: memory usage of entry cache is flushed realtime. self.cache.flush_stats(); self.raftlog_fetch_stats.flush_stats(); - if self.engines.raft.has_builtin_entry_cache() { - if let Some(stats) = self.engines.raft.flush_stats() { - RAFT_ENTRIES_CACHES_GAUGE.set(stats.cache_size as i64); - RAFT_ENTRY_FETCHES.hit.inc_by(stats.hit as u64); - RAFT_ENTRY_FETCHES.miss.inc_by(stats.miss as u64); - } - } } // Apply the peer with given snapshot. @@ -2457,7 +2419,7 @@ mod tests { router, store.engines.raft.clone(), )); - store.compact_cache_to(5); + store.compact_entry_cache(5); let mut e = store.entries(lo, hi, maxsize, GetEntriesContext::empty(true)); if e == Err(raft::Error::Store( raft::StorageError::LogTemporarilyUnavailable, @@ -3147,20 +3109,20 @@ mod tests { // compact to min(5 + 1, 7) store.cache.persisted = 5; - store.compact_to(7); + store.compact_entry_cache(7); exp_res = vec![new_entry(6, 7), new_entry(7, 8)]; validate_cache(&store, &exp_res); // compact to min(7 + 1, 7) store.cache.persisted = 7; - store.compact_to(7); + store.compact_entry_cache(7); exp_res = vec![new_entry(7, 8)]; validate_cache(&store, &exp_res); // compact all - store.compact_to(8); + store.compact_entry_cache(8); validate_cache(&store, &[]); // invalid compaction should be ignored. 
- store.compact_to(6); + store.compact_entry_cache(6); } #[test] diff --git a/tests/failpoints/cases/test_async_fetch.rs b/tests/failpoints/cases/test_async_fetch.rs index 28df1dba891..c6b8a693085 100644 --- a/tests/failpoints/cases/test_async_fetch.rs +++ b/tests/failpoints/cases/test_async_fetch.rs @@ -234,3 +234,39 @@ fn test_node_async_fetch_leader_change() { must_get_equal(&cluster.get_engine(1), &k, &v); } } + +// Test the case whether entry cache is reserved for the newly added peer. +#[test] +fn test_node_compact_entry_cache() { + let count = 5; + let mut cluster = new_node_cluster(0, count); + cluster.pd_client.disable_default_operator(); + + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster.cfg.raft_store.raft_log_reserve_max_ticks = 2; + cluster.run(); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k0", b"v0"); + cluster.pd_client.must_remove_peer(1, new_peer(5, 5)); + + // pause snapshot applied + fail::cfg("before_region_gen_snap", "pause").unwrap(); + fail::cfg("worker_async_fetch_raft_log", "pause").unwrap(); + // change one peer to learner + cluster.pd_client.add_peer(1, new_learner_peer(5, 5)); + + // cause log lag and pause async fetch to check if entry cache is reserved for the learner + for i in 1..6 { + let k = i.to_string().into_bytes(); + let v = k.clone(); + cluster.must_put(&k, &v); + } + std::thread::sleep(Duration::from_millis(100)); + + fail::remove("before_region_gen_snap"); + cluster.pd_client.must_have_peer(1, new_learner_peer(5, 5)); + + // if entry cache is not reserved, the learner will not be able to catch up. 
+ must_get_equal(&cluster.get_engine(5), b"5", b"5"); +} From dc7c48d1731e079ae0949694f88858554982136e Mon Sep 17 00:00:00 2001 From: BornChanger <97348524+BornChanger@users.noreply.github.com> Date: Thu, 21 Jul 2022 19:55:10 +0800 Subject: [PATCH 106/676] components, src: avoid cpu quota limitation contamination (#13085) close tikv/tikv#13084 Signed-off-by: BornChanger Co-authored-by: Ti Chi Robot --- components/tidb_query_executors/src/runner.rs | 2 +- components/tikv_util/src/quota_limiter.rs | 67 ++++++++++--------- src/config.rs | 12 ++-- src/coprocessor/statistics/analyze.rs | 10 +-- src/storage/mod.rs | 4 +- src/storage/txn/scheduler.rs | 2 +- 6 files changed, 50 insertions(+), 47 deletions(-) diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index dc88c1f6993..4a8a3a02851 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -453,7 +453,7 @@ impl BatchExecutorsRunner { let mut chunk = Chunk::default(); - let mut sample = self.quota_limiter.new_sample(); + let mut sample = self.quota_limiter.new_sample(true); let (drained, record_len) = { let _guard = sample.observe_cpu(); self.internal_handle_request( diff --git a/components/tikv_util/src/quota_limiter.rs b/components/tikv_util/src/quota_limiter.rs index c9a761f49de..f382964c4d1 100644 --- a/components/tikv_util/src/quota_limiter.rs +++ b/components/tikv_util/src/quota_limiter.rs @@ -234,21 +234,24 @@ impl QuotaLimiter { } // To generate a sampler. 
- pub fn new_sample(&self) -> Sample { + pub fn new_sample(&self, is_foreground: bool) -> Sample { Sample { read_bytes: 0, write_bytes: 0, cpu_time: Duration::ZERO, - enable_cpu_limit: !self - .foreground_limiters - .cputime_limiter - .speed_limit() - .is_infinite() - || !self + enable_cpu_limit: if is_foreground { + !self + .foreground_limiters + .cputime_limiter + .speed_limit() + .is_infinite() + } else { + !self .background_limiters .cputime_limiter .speed_limit() - .is_infinite(), + .is_infinite() + }, } } @@ -389,25 +392,25 @@ mod tests { ); }; - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(60)); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::ZERO); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(50)); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::from_millis(110)); std::thread::sleep(Duration::from_millis(10)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(20)); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); // should less 60+50+20 assert!(should_delay < Duration::from_millis(130)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(200)); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); @@ -417,25 +420,25 @@ mod tests { assert!(thread_start_time.elapsed() < Duration::from_millis(50)); quota_limiter.set_cpu_time_limit(2000, true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(200)); let should_delay 
= block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::from_millis(100)); quota_limiter.set_read_bandwidth_limit(ReadableSize(512), true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_read_bytes(128); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::from_millis(250)); quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(2), true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::from_millis(125)); quota_limiter.set_max_delay_duration(ReadableDuration::millis(40)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_read_bytes(256); sample.add_write_bytes(512); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); @@ -443,19 +446,19 @@ mod tests { // test change limiter to 0 quota_limiter.set_cpu_time_limit(0, true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_cpu_time(Duration::from_millis(100)); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::ZERO); quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(0), true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::ZERO); quota_limiter.set_read_bandwidth_limit(ReadableSize::kb(0), true); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_read_bytes(256); let should_delay = 
block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::ZERO); @@ -463,30 +466,30 @@ mod tests { // set bandwidth back quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(1), true); quota_limiter.set_max_delay_duration(ReadableDuration::millis(0)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(128); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); check_duration(should_delay, Duration::from_millis(125)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(60)); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(50)); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(110)); std::thread::sleep(Duration::from_millis(10)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(20)); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); // should less 60+50+20 assert!(should_delay < Duration::from_millis(130)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(200)); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); @@ -496,25 +499,25 @@ mod tests { assert!(thread_start_time.elapsed() < Duration::from_millis(50)); quota_limiter.set_cpu_time_limit(2000, false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(200)); let 
should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(100)); quota_limiter.set_read_bandwidth_limit(ReadableSize(512), false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_read_bytes(128); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(250)); quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(2), false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(125)); quota_limiter.set_max_delay_duration(ReadableDuration::millis(40)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_read_bytes(256); sample.add_write_bytes(512); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); @@ -522,19 +525,19 @@ mod tests { // test change limiter to 0 quota_limiter.set_cpu_time_limit(0, false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_cpu_time(Duration::from_millis(100)); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(0), false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(256); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); quota_limiter.set_read_bandwidth_limit(ReadableSize::kb(0), false); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_read_bytes(256); let should_delay = 
block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::ZERO); @@ -542,7 +545,7 @@ mod tests { // set bandwidth back quota_limiter.set_write_bandwidth_limit(ReadableSize::kb(1), false); quota_limiter.set_max_delay_duration(ReadableDuration::millis(0)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(128); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); check_duration(should_delay, Duration::from_millis(125)); diff --git a/src/config.rs b/src/config.rs index 6ae622bd806..ebf1e132777 100644 --- a/src/config.rs +++ b/src/config.rs @@ -4929,7 +4929,7 @@ mod tests { cfg.quota.foreground_write_bandwidth = ReadableSize::mb(256); assert_eq!(cfg_controller.get_current(), cfg); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_read_bytes(ReadableSize::mb(32).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); assert_eq!(should_delay, Duration::from_millis(125)); @@ -4939,7 +4939,7 @@ mod tests { .unwrap(); cfg.quota.foreground_read_bandwidth = ReadableSize::mb(512); assert_eq!(cfg_controller.get_current(), cfg); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); assert_eq!(should_delay, Duration::from_millis(500)); @@ -4956,7 +4956,7 @@ mod tests { cfg.quota.background_write_bandwidth = ReadableSize::mb(256); assert_eq!(cfg_controller.get_current(), cfg); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_read_bytes(ReadableSize::mb(32).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); assert_eq!(should_delay, Duration::from_millis(125)); @@ -4966,7 +4966,7 @@ mod tests { 
.unwrap(); cfg.quota.background_read_bandwidth = ReadableSize::mb(512); assert_eq!(cfg_controller.get_current(), cfg); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); assert_eq!(should_delay, Duration::from_millis(500)); @@ -4976,12 +4976,12 @@ mod tests { .unwrap(); cfg.quota.max_delay_duration = ReadableDuration::millis(50); assert_eq!(cfg_controller.get_current(), cfg); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); assert_eq!(should_delay, Duration::from_millis(50)); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); assert_eq!(should_delay, Duration::from_millis(50)); diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index bb0348be98f..05a30f64c4d 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -325,7 +325,7 @@ struct RowSampleBuilder { columns_info: Vec, column_groups: Vec, quota_limiter: Arc, - is_quota_auto_tune: bool, + is_auto_analyze: bool, } impl RowSampleBuilder { @@ -334,7 +334,7 @@ impl RowSampleBuilder { storage: TiKvStorage>, ranges: Vec, quota_limiter: Arc, - is_quota_auto_tune: bool, + is_auto_analyze: bool, ) -> Result { let columns_info: Vec<_> = req.take_columns_info().into(); if columns_info.is_empty() { @@ -359,7 +359,7 @@ impl RowSampleBuilder { columns_info, column_groups: req.take_column_groups().into(), quota_limiter, - is_quota_auto_tune, + is_auto_analyze, }) } @@ -391,7 +391,7 @@ impl RowSampleBuilder { 
time_slice_start = Instant::now(); } - let mut sample = self.quota_limiter.new_sample(); + let mut sample = self.quota_limiter.new_sample(!self.is_auto_analyze); { let _guard = sample.observe_cpu(); let result = self.data.next_batch(BATCH_MAX_SIZE); @@ -446,7 +446,7 @@ impl RowSampleBuilder { // Don't let analyze bandwidth limit the quota limiter, this is already limited in rate limiter. let quota_delay = { - if !self.is_quota_auto_tune { + if !self.is_auto_analyze { self.quota_limiter.consume_sample(sample, true).await } else { self.quota_limiter.consume_sample(sample, false).await diff --git a/src/storage/mod.rs b/src/storage/mod.rs index cb792d7aec2..0864c9edd2d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -569,7 +569,7 @@ impl Storage { let api_version = self.api_version; let quota_limiter = self.quota_limiter.clone(); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); let res = self.read_pool.spawn_handle( async move { @@ -898,7 +898,7 @@ impl Storage { let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; let quota_limiter = self.quota_limiter.clone(); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); let res = self.read_pool.spawn_handle( async move { let stage_scheduled_ts = Instant::now(); diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index e78dbdaa49d..2588e820d21 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -775,7 +775,7 @@ impl Scheduler { let ts = task.cmd.ts(); let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); - let mut sample = quota_limiter.new_sample(); + let mut sample = quota_limiter.new_sample(true); let pessimistic_lock_mode = self.pessimistic_lock_mode(); let pipelined = task.cmd.can_be_pipelined() && pessimistic_lock_mode == PessimisticLockMode::Pipelined; From 
b1952dcaf8af9ab218916b26eb5dd3ce72a8d638 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 22 Jul 2022 13:41:09 +0800 Subject: [PATCH 107/676] metrics: fix a expression error of unified read pool cpu (#13087) close tikv/tikv#13086 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index adb398824ca..eda4e88de66 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -6836,7 +6836,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified_read_po*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"unified_read_po.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, From 6e66f09a6f463b6586c9ba89eaf2a96c2106328c Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 22 Jul 2022 16:53:09 +0800 Subject: [PATCH 108/676] engine_trait: introduce status error (#13059) ref tikv/tikv#13058 This PR is the first step to prepare for tirocks, the new rocksdb wrapper. The status error is introduced and iterator trait is refactored to keep consistent with tirocks. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot Co-authored-by: Xinye Tao --- cmd/tikv-ctl/src/executor.rs | 4 +- components/backup-stream/src/utils.rs | 2 +- components/backup/src/writer.rs | 2 +- components/cdc/src/initializer.rs | 4 +- components/engine_panic/src/cf_options.rs | 2 +- components/engine_panic/src/engine.rs | 19 +-- components/engine_panic/src/snapshot.rs | 21 ++-- components/engine_panic/src/sst.rs | 21 ++-- components/engine_rocks/src/cf_options.rs | 6 +- components/engine_rocks/src/compact.rs | 6 +- components/engine_rocks/src/encryption.rs | 12 +- components/engine_rocks/src/engine.rs | 61 ++++----- .../engine_rocks/src/engine_iterator.rs | 55 ++++---- components/engine_rocks/src/file_system.rs | 23 ++-- components/engine_rocks/src/import.rs | 11 +- components/engine_rocks/src/lib.rs | 4 +- components/engine_rocks/src/misc.rs | 44 +++---- components/engine_rocks/src/raft_engine.rs | 11 +- components/engine_rocks/src/raw.rs | 6 +- components/engine_rocks/src/raw_util.rs | 19 +-- components/engine_rocks/src/snapshot.rs | 21 +--- components/engine_rocks/src/sst.rs | 76 ++++++----- components/engine_rocks/src/status.rs | 19 +++ .../engine_rocks/src/table_properties.rs | 12 +- components/engine_rocks/src/util.rs | 8 +- components/engine_rocks/src/write_batch.rs | 32 +++-- components/engine_test/src/lib.rs | 5 +- components/engine_traits/src/cf_options.rs | 2 +- components/engine_traits/src/engine.rs | 12 +- components/engine_traits/src/errors.rs | 118 ++++++++++++++++-- components/engine_traits/src/file_system.rs | 10 +- components/engine_traits/src/iterable.rs | 93 ++++---------- .../engine_traits_tests/src/iterator.rs | 78 ++++++------ .../src/read_consistency.rs | 6 +- components/engine_traits_tests/src/sst.rs | 27 ++-- .../src/coprocessor/consistency_check.rs | 2 +- .../src/coprocessor/split_check/table.rs | 6 +- components/raftstore/src/store/bootstrap.rs | 2 +- .../raftstore/src/store/compaction_guard.rs | 6 +- 
components/raftstore/src/store/fsm/store.rs | 2 +- .../raftstore/src/store/peer_storage.rs | 4 +- .../raftstore/src/store/region_snapshot.rs | 80 ++++-------- components/raftstore/src/store/snap.rs | 2 +- components/raftstore/src/store/snap/io.rs | 6 +- .../raftstore/src/store/worker/split_check.rs | 4 +- components/server/src/raft_engine_switch.rs | 11 +- components/sst_importer/src/import_file.rs | 3 +- components/sst_importer/src/sst_importer.rs | 40 +++--- components/test_backup/src/lib.rs | 2 +- components/test_raftstore/src/cluster.rs | 8 +- components/test_raftstore/src/util.rs | 2 +- components/tikv_kv/src/btree_engine.rs | 17 ++- components/tikv_kv/src/cursor.rs | 8 +- components/tikv_kv/src/lib.rs | 30 +++-- components/tikv_kv/src/raftstore_impls.rs | 11 +- components/tikv_kv/src/rocksdb_engine.rs | 19 ++- components/tikv_util/src/config.rs | 14 +-- src/config.rs | 39 +++--- src/import/duplicate_detect.rs | 4 +- src/server/debug.rs | 24 ++-- src/server/engine_factory.rs | 2 +- src/server/engine_factory_v2.rs | 2 +- src/server/gc_worker/compaction_filter.rs | 5 +- src/server/node.rs | 2 +- src/server/reset_to_version.rs | 24 ++-- src/storage/kv/test_engine_builder.rs | 8 +- src/storage/mod.rs | 8 +- src/storage/mvcc/consistency_check.rs | 12 +- src/storage/mvcc/reader/reader.rs | 4 +- src/storage/raw/encoded.rs | 11 +- src/storage/raw/raw_mvcc.rs | 25 ++-- src/storage/raw/store.rs | 11 +- src/storage/txn/commands/prewrite.rs | 5 +- src/storage/txn/store.rs | 5 +- .../cases/test_replica_stale_read.rs | 13 +- tests/failpoints/cases/test_ttl.rs | 2 +- .../raftstore/test_split_region.rs | 18 ++- .../integrations/raftstore/test_tombstone.rs | 2 +- tests/integrations/storage/test_raftkv.rs | 32 ++--- 79 files changed, 700 insertions(+), 689 deletions(-) create mode 100644 components/engine_rocks/src/status.rs diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 96b322936bc..401d96e5d8e 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ 
b/cmd/tikv-ctl/src/executor.rs @@ -1090,8 +1090,8 @@ impl DebugExecutor for Debugger { fn handle_engine_error(err: EngineError) -> ! { error!("error while open kvdb: {}", err); - if let EngineError::Engine(msg) = err { - if msg.starts_with(LOCK_FILE_ERROR) { + if let EngineError::Engine(s) = err { + if s.state().contains(LOCK_FILE_ERROR) { error!( "LOCK file conflict indicates TiKV process is running. \ Do NOT delete the LOCK file and force the command to run. \ diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 0f09e747b80..6ad26cb045c 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -759,7 +759,7 @@ mod test { let (items, size) = super::with_record_read_throughput(|| { let mut items = vec![]; let snap = engine.snapshot(); - snap.scan(b"", b"", false, |k, v| { + snap.scan(CF_DEFAULT, b"", b"", false, |k, v| { items.push((k.to_owned(), v.to_owned())); Ok(true) }) diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 4c4c6dc5ec7..99a907948ce 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -458,7 +458,7 @@ mod tests { } for (cf, kv) in kvs { let mut map = BTreeMap::new(); - db.scan_cf( + db.scan( cf, keys::DATA_MIN_KEY, keys::DATA_MAX_KEY, diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index e1feb0c9795..28b7e5f5d0a 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -240,7 +240,9 @@ impl Initializer { let (raw_key_prefix, raw_key_prefix_end) = ApiV2::get_rawkv_range(); iter_opt.set_lower_bound(&[raw_key_prefix], DATA_KEY_PREFIX_LEN); iter_opt.set_upper_bound(&[raw_key_prefix_end], DATA_KEY_PREFIX_LEN); - let mut iter = RawMvccSnapshot::from_snapshot(snap).iter(iter_opt).unwrap(); + let mut iter = RawMvccSnapshot::from_snapshot(snap) + .iter(CF_DEFAULT, iter_opt) + .unwrap(); iter.seek_to_first()?; Scanner::RawKvScanner(iter) diff 
--git a/components/engine_panic/src/cf_options.rs b/components/engine_panic/src/cf_options.rs index 918185b8183..f00db2eeb4f 100644 --- a/components/engine_panic/src/cf_options.rs +++ b/components/engine_panic/src/cf_options.rs @@ -44,7 +44,7 @@ impl ColumnFamilyOptions for PanicColumnFamilyOptions { fn get_block_cache_capacity(&self) -> u64 { panic!() } - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()> { panic!() } fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 5608b55ea00..128cb318ed6 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -1,7 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{ - IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SeekKey, SyncMutable, + IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SyncMutable, TabletAccessor, WriteOptions, }; @@ -75,10 +75,7 @@ impl SyncMutable for PanicEngine { impl Iterable for PanicEngine { type Iterator = PanicEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - panic!() - } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { panic!() } } @@ -86,10 +83,18 @@ impl Iterable for PanicEngine { pub struct PanicEngineIterator; impl Iterator for PanicEngineIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek(&mut self, key: &[u8]) -> Result { + panic!() + } + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_first(&mut self) -> Result { + panic!() + } + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_panic/src/snapshot.rs 
b/components/engine_panic/src/snapshot.rs index c65dc560326..e27ed42d093 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -2,9 +2,7 @@ use std::ops::Deref; -use engine_traits::{ - IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, SeekKey, Snapshot, -}; +use engine_traits::{IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot}; use crate::{db_vector::PanicDBVector, engine::PanicEngine}; @@ -36,10 +34,7 @@ impl Peekable for PanicSnapshot { impl Iterable for PanicSnapshot { type Iterator = PanicSnapshotIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - panic!() - } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { panic!() } } @@ -47,10 +42,18 @@ impl Iterable for PanicSnapshot { pub struct PanicSnapshotIterator; impl Iterator for PanicSnapshotIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek(&mut self, key: &[u8]) -> Result { + panic!() + } + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_first(&mut self) -> Result { + panic!() + } + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index 64aa5666fe1..d1e5f4b331c 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -3,8 +3,8 @@ use std::path::PathBuf; use engine_traits::{ - CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SeekKey, - SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, + CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SstCompressionType, + SstExt, SstReader, SstWriter, SstWriterBuilder, }; use crate::engine::PanicEngine; @@ -32,10 +32,7 @@ impl SstReader for PanicSstReader { impl Iterable for PanicSstReader { type Iterator = 
PanicSstReaderIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - panic!() - } - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { panic!() } } @@ -43,10 +40,18 @@ impl Iterable for PanicSstReader { pub struct PanicSstReaderIterator; impl Iterator for PanicSstReaderIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { + fn seek(&mut self, key: &[u8]) -> Result { + panic!() + } + fn seek_for_prev(&mut self, key: &[u8]) -> Result { panic!() } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { + + fn seek_to_first(&mut self) -> Result { + panic!() + } + + fn seek_to_last(&mut self) -> Result { panic!() } diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 49ba840bc00..87d05510f58 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -5,7 +5,7 @@ use rocksdb::ColumnFamilyOptions as RawCFOptions; use tikv_util::box_err; use crate::{ - db_options::RocksTitanDBOptions, engine::RocksEngine, + db_options::RocksTitanDBOptions, engine::RocksEngine, r2e, sst_partitioner::RocksSstPartitionerFactory, util, }; @@ -79,8 +79,8 @@ impl ColumnFamilyOptions for RocksColumnFamilyOptions { self.0.get_block_cache_capacity() } - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { - self.0.set_block_cache_capacity(capacity) + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()> { + self.0.set_block_cache_capacity(capacity).map_err(r2e) } fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index fef3af46f5c..0b50e0757c2 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -5,7 +5,7 @@ use std::cmp; use engine_traits::{CFNamesExt, CompactExt, Result}; use 
rocksdb::{CompactOptions, CompactionOptions, DBCompressionType}; -use crate::{engine::RocksEngine, util}; +use crate::{engine::RocksEngine, r2e, util}; impl CompactExt for RocksEngine { type CompactedEvent = crate::compact_listener::RocksCompactedEvent; @@ -130,8 +130,8 @@ impl CompactExt for RocksEngine { opts.set_max_subcompactions(max_subcompactions as i32); opts.set_output_file_size_limit(output_file_size_limit); - db.compact_files_cf(handle, &opts, &files, output_level)?; - Ok(()) + db.compact_files_cf(handle, &opts, &files, output_level) + .map_err(r2e) } } diff --git a/components/engine_rocks/src/encryption.rs b/components/engine_rocks/src/encryption.rs index 94c13e811a9..3caf07a0276 100644 --- a/components/engine_rocks/src/encryption.rs +++ b/components/engine_rocks/src/encryption.rs @@ -9,19 +9,19 @@ use rocksdb::{ FileEncryptionInfo as DBFileEncryptionInfo, }; -use crate::raw::Env; +use crate::{r2e, raw::Env}; // Use engine::Env directly since Env is not abstracted. pub(crate) fn get_env( base_env: Option>, key_manager: Option>, -) -> std::result::Result, String> { +) -> engine_traits::Result> { let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); if let Some(manager) = key_manager { - Ok(Arc::new(Env::new_key_managed_encrypted_env( - base_env, - WrappedEncryptionKeyManager { manager }, - )?)) + Ok(Arc::new( + Env::new_key_managed_encrypted_env(base_env, WrappedEncryptionKeyManager { manager }) + .map_err(r2e)?, + )) } else { Ok(base_env) } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 33af3b78036..e6a1cf4a6a7 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -3,14 +3,14 @@ use std::{any::Any, fs, path::Path, sync::Arc}; use engine_traits::{ - Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, - TabletAccessor, + IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, TabletAccessor, }; 
use rocksdb::{DBIterator, Writable, DB}; use crate::{ db_vector::RocksDBVector, options::RocksReadOptions, + r2e, rocks_metrics::{ flush_engine_histogram_metrics, flush_engine_iostall_properties, flush_engine_properties, flush_engine_ticker_metrics, @@ -82,7 +82,7 @@ impl KvEngine for RocksEngine { } fn sync(&self) -> Result<()> { - self.db.sync_wal().map_err(Error::Engine) + self.db.sync_wal().map_err(r2e) } fn flush_metrics(&self, instance: &str) { @@ -133,15 +133,7 @@ impl TabletAccessor for RocksEngine { impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - let opt: RocksReadOptions = opts.into(); - Ok(RocksEngineIterator::from_raw(DBIterator::new( - self.db.clone(), - opt.into_raw(), - ))) - } - - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { let handle = get_cf_handle(&self.db, cf)?; let opt: RocksReadOptions = opts.into(); Ok(RocksEngineIterator::from_raw(DBIterator::new_cf( @@ -157,7 +149,7 @@ impl Peekable for RocksEngine { fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let opt: RocksReadOptions = opts.into(); - let v = self.db.get_opt(key, &opt.into_raw())?; + let v = self.db.get_opt(key, &opt.into_raw()).map_err(r2e)?; Ok(v.map(RocksDBVector::from_raw)) } @@ -169,41 +161,42 @@ impl Peekable for RocksEngine { ) -> Result> { let opt: RocksReadOptions = opts.into(); let handle = get_cf_handle(&self.db, cf)?; - let v = self.db.get_cf_opt(handle, key, &opt.into_raw())?; + let v = self + .db + .get_cf_opt(handle, key, &opt.into_raw()) + .map_err(r2e)?; Ok(v.map(RocksDBVector::from_raw)) } } impl SyncMutable for RocksEngine { fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { - self.db.put(key, value).map_err(Error::Engine) + self.db.put(key, value).map_err(r2e) } fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; - 
self.db.put_cf(handle, key, value).map_err(Error::Engine) + self.db.put_cf(handle, key, value).map_err(r2e) } fn delete(&self, key: &[u8]) -> Result<()> { - self.db.delete(key).map_err(Error::Engine) + self.db.delete(key).map_err(r2e) } fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; - self.db.delete_cf(handle, key).map_err(Error::Engine) + self.db.delete_cf(handle, key).map_err(r2e) } fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { - self.db - .delete_range(begin_key, end_key) - .map_err(Error::Engine) + self.db.delete_range(begin_key, end_key).map_err(r2e) } fn delete_range_cf(&self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { let handle = get_cf_handle(&self.db, cf)?; self.db .delete_range_cf(handle, begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } } @@ -211,7 +204,7 @@ impl SyncMutable for RocksEngine { mod tests { use std::sync::Arc; - use engine_traits::{Iterable, KvEngine, Peekable, SyncMutable}; + use engine_traits::{Iterable, KvEngine, Peekable, SyncMutable, CF_DEFAULT}; use kvproto::metapb::Region; use tempfile::Builder; @@ -285,7 +278,7 @@ mod tests { let mut data = vec![]; engine - .scan(b"", &[0xFF, 0xFF], false, |key, value| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -300,7 +293,7 @@ mod tests { data.clear(); engine - .scan_cf(cf, b"", &[0xFF, 0xFF], false, |key, value| { + .scan(cf, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -314,16 +307,16 @@ mod tests { ); data.clear(); - let pair = engine.seek(b"a1").unwrap().unwrap(); + let pair = engine.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); - assert!(engine.seek(b"a3").unwrap().is_none()); - let pair_cf = engine.seek_cf(cf, b"a1").unwrap().unwrap(); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_none()); + let 
pair_cf = engine.seek(cf, b"a1").unwrap().unwrap(); assert_eq!(pair_cf, (b"a1".to_vec(), b"v1".to_vec())); - assert!(engine.seek_cf(cf, b"a3").unwrap().is_none()); + assert!(engine.seek(cf, b"a3").unwrap().is_none()); let mut index = 0; engine - .scan(b"", &[0xFF, 0xFF], false, |key, value| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); index += 1; Ok(index != 1) @@ -335,15 +328,15 @@ mod tests { let snap = RocksSnapshot::new(engine.get_sync_db()); engine.put(b"a3", b"v3").unwrap(); - assert!(engine.seek(b"a3").unwrap().is_some()); + assert!(engine.seek(CF_DEFAULT, b"a3").unwrap().is_some()); - let pair = snap.seek(b"a1").unwrap().unwrap(); + let pair = snap.seek(CF_DEFAULT, b"a1").unwrap().unwrap(); assert_eq!(pair, (b"a1".to_vec(), b"v1".to_vec())); - assert!(snap.seek(b"a3").unwrap().is_none()); + assert!(snap.seek(CF_DEFAULT, b"a3").unwrap().is_none()); data.clear(); - snap.scan(b"", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) diff --git a/components/engine_rocks/src/engine_iterator.rs b/components/engine_rocks/src/engine_iterator.rs index fcc10237510..de51b32c8f4 100644 --- a/components/engine_rocks/src/engine_iterator.rs +++ b/components/engine_rocks/src/engine_iterator.rs @@ -2,8 +2,10 @@ use std::sync::Arc; -use engine_traits::{self, Error, Result}; -use rocksdb::{DBIterator, SeekKey as RawSeekKey, DB}; +use engine_traits::{self, Result}; +use rocksdb::{DBIterator, DB}; + +use crate::r2e; // FIXME: Would prefer using &DB instead of Arc. As elsewhere in // this crate, it would require generic associated types. 
@@ -20,30 +22,38 @@ impl RocksEngineIterator { } impl engine_traits::Iterator for RocksEngineIterator { - fn seek(&mut self, key: engine_traits::SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek(k.into_raw()).map_err(Error::Engine) + fn seek(&mut self, key: &[u8]) -> Result { + self.0.seek(rocksdb::SeekKey::Key(key)).map_err(r2e) + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + self.0 + .seek_for_prev(rocksdb::SeekKey::Key(key)) + .map_err(r2e) } - fn seek_for_prev(&mut self, key: engine_traits::SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek_for_prev(k.into_raw()).map_err(Error::Engine) + fn seek_to_first(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::Start).map_err(r2e) + } + + fn seek_to_last(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::End).map_err(r2e) } fn prev(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? { - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.prev().map_err(Error::Engine) + self.0.prev().map_err(r2e) } fn next(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? 
{ - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.next().map_err(Error::Engine) + self.0.next().map_err(r2e) } fn key(&self) -> &[u8] { @@ -59,25 +69,6 @@ impl engine_traits::Iterator for RocksEngineIterator { } fn valid(&self) -> Result { - self.0.valid().map_err(Error::Engine) - } -} - -pub struct RocksSeekKey<'a>(RawSeekKey<'a>); - -impl<'a> RocksSeekKey<'a> { - pub fn into_raw(self) -> RawSeekKey<'a> { - self.0 - } -} - -impl<'a> From> for RocksSeekKey<'a> { - fn from(key: engine_traits::SeekKey<'a>) -> Self { - let k = match key { - engine_traits::SeekKey::Start => RawSeekKey::Start, - engine_traits::SeekKey::End => RawSeekKey::End, - engine_traits::SeekKey::Key(k) => RawSeekKey::Key(k), - }; - RocksSeekKey(k) + self.0.valid().map_err(r2e) } } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index 397eaead488..2fcbc405056 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -5,20 +5,23 @@ use std::sync::Arc; use engine_traits::{EngineFileSystemInspector, FileSystemInspector}; use rocksdb::FileSystemInspector as DBFileSystemInspector; -use crate::raw::Env; +use crate::{e2r, r2e, raw::Env}; // Use engine::Env directly since Env is not abstracted. 
pub(crate) fn get_env( base_env: Option>, limiter: Option>, -) -> Result, String> { +) -> engine_traits::Result> { let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); - Ok(Arc::new(Env::new_file_system_inspected_env( - base_env, - WrappedFileSystemInspector { - inspector: EngineFileSystemInspector::from_limiter(limiter), - }, - )?)) + Ok(Arc::new( + Env::new_file_system_inspected_env( + base_env, + WrappedFileSystemInspector { + inspector: EngineFileSystemInspector::from_limiter(limiter), + }, + ) + .map_err(r2e)?, + )) } pub struct WrappedFileSystemInspector { @@ -27,11 +30,11 @@ pub struct WrappedFileSystemInspector { impl DBFileSystemInspector for WrappedFileSystemInspector { fn read(&self, len: usize) -> Result { - self.inspector.read(len) + self.inspector.read(len).map_err(e2r) } fn write(&self, len: usize) -> Result { - self.inspector.write(len) + self.inspector.write(len).map_err(e2r) } } diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 1cfe24cb8e4..641e33f7bd8 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -7,7 +7,7 @@ use rocksdb::{ set_external_sst_file_global_seq_no, IngestExternalFileOptions as RawIngestExternalFileOptions, }; -use crate::{engine::RocksEngine, util}; +use crate::{engine::RocksEngine, r2e, util}; impl ImportExt for RocksEngine { type IngestExternalFileOptions = RocksIngestExternalFileOptions; @@ -22,10 +22,10 @@ impl ImportExt for RocksEngine { // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For backward // compatibility, in case TiKV is retrying an ingestion job generated by older // version, it needs to reset the global seqno to 0. 
- set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0)?; + set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0).map_err(r2e)?; f.sync_all() - .map_err(|e| format!("sync {}: {:?}", file, e))?; - Ok(()) + .map_err(|e| format!("sync {}: {:?}", file, e)) + .map_err(r2e) })?; // This is calling a specially optimized version of // ingest_external_file_cf. In cases where the memtable needs to be @@ -34,7 +34,8 @@ impl ImportExt for RocksEngine { // the manual memtable flush was taken. let _did_nonblocking_memtable_flush = self .as_inner() - .ingest_external_file_optimized(cf, &opts.0, files)?; + .ingest_external_file_optimized(cf, &opts.0, files) + .map_err(r2e)?; Ok(()) } } diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 7cf4d948d0d..b93d8cc7f36 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -47,6 +47,8 @@ mod sst; pub use crate::sst::*; mod sst_partitioner; pub use crate::sst_partitioner::*; +mod status; +pub use crate::status::*; mod table_properties; pub use crate::table_properties::*; mod write_batch; @@ -113,7 +115,7 @@ pub mod raw; pub fn get_env( key_manager: Option>, limiter: Option>, -) -> std::result::Result, String> { +) -> engine_traits::Result> { let env = encryption::get_env(None /*base_env*/, key_manager)?; file_system::get_env(Some(env), limiter) } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index d7741e98c26..ce608d353b7 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -8,7 +8,8 @@ use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, RocksSstWriter, + engine::RocksEngine, r2e, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, + RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -54,8 +55,8 @@ impl 
RocksEngine { } last_end_key = Some(r.end_key.to_owned()); - let mut it = self.iterator_cf_opt(cf, opts.clone())?; - let mut it_valid = it.seek(r.start_key.into())?; + let mut it = self.iterator_opt(cf, opts.clone())?; + let mut it_valid = it.seek(r.start_key)?; while it_valid { if it.key() >= r.end_key { break; @@ -106,8 +107,8 @@ impl RocksEngine { // to avoid referring to missing blob files. opts.set_key_only(true); } - let mut it = self.iterator_cf_opt(cf, opts)?; - let mut it_valid = it.seek(range.start_key.into())?; + let mut it = self.iterator_opt(cf, opts)?; + let mut it_valid = it.seek(range.start_key)?; let mut wb = self.write_batch(); while it_valid { wb.delete_cf(cf, it.key())?; @@ -127,12 +128,12 @@ impl RocksEngine { impl MiscExt for RocksEngine { fn flush(&self, sync: bool) -> Result<()> { - Ok(self.as_inner().flush(sync)?) + self.as_inner().flush(sync).map_err(r2e) } fn flush_cf(&self, cf: &str, sync: bool) -> Result<()> { let handle = util::get_cf_handle(self.as_inner(), cf)?; - Ok(self.as_inner().flush_cf(handle, sync)?) 
+ self.as_inner().flush_cf(handle, sync).map_err(r2e) } fn delete_ranges_cf( @@ -151,12 +152,9 @@ impl MiscExt for RocksEngine { if r.start_key >= r.end_key { continue; } - self.as_inner().delete_files_in_range_cf( - handle, - r.start_key, - r.end_key, - false, - )?; + self.as_inner() + .delete_files_in_range_cf(handle, r.start_key, r.end_key, false) + .map_err(r2e)?; } } DeleteStrategy::DeleteBlobs => { @@ -166,12 +164,9 @@ impl MiscExt for RocksEngine { if r.start_key >= r.end_key { continue; } - self.as_inner().delete_blob_files_in_range_cf( - handle, - r.start_key, - r.end_key, - false, - )?; + self.as_inner() + .delete_blob_files_in_range_cf(handle, r.start_key, r.end_key, false) + .map_err(r2e)?; } } } @@ -241,7 +236,8 @@ impl MiscExt for RocksEngine { for cf in db.cf_names() { let handle = util::get_cf_handle(db, cf)?; - db.delete_files_in_ranges_cf(handle, &delete_ranges, /* include_end */ false)?; + db.delete_files_in_ranges_cf(handle, &delete_ranges, /* include_end */ false) + .map_err(r2e)?; } Ok(()) @@ -252,7 +248,7 @@ impl MiscExt for RocksEngine { } fn sync_wal(&self) -> Result<()> { - Ok(self.as_inner().sync_wal()?) 
+ self.as_inner().sync_wal().map_err(r2e) } fn exists(path: &str) -> bool { @@ -340,7 +336,7 @@ mod tests { use std::sync::Arc; use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SeekKey, SyncMutable, WriteBatchExt, ALL_CFS, + DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, }; use tempfile::Builder; @@ -353,8 +349,8 @@ mod tests { fn check_data(db: &RocksEngine, cfs: &[&str], expected: &[(&[u8], &[u8])]) { for cf in cfs { - let mut iter = db.iterator_cf(cf).unwrap(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = db.iterator(cf).unwrap(); + iter.seek_to_first().unwrap(); for &(k, v) in expected { assert_eq!(k, iter.key()); assert_eq!(v, iter.value()); diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 607e0bfca17..b6a35f4a4e2 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -64,6 +64,7 @@ impl RaftEngineReadOnly for RocksEngine { let start_key = keys::raft_log_key(region_id, low); let end_key = keys::raft_log_key(region_id, high); self.scan( + CF_DEFAULT, &start_key, &end_key, true, // fill_cache @@ -108,6 +109,7 @@ impl RaftEngineReadOnly for RocksEngine { let start_key = keys::raft_log_key(region_id, 0); let end_key = keys::raft_log_key(region_id, u64::MAX); self.scan( + CF_DEFAULT, &start_key, &end_key, false, // fill_cache @@ -123,7 +125,7 @@ impl RaftEngineReadOnly for RocksEngine { fn is_empty(&self) -> Result { let mut is_empty = true; - self.scan_cf(CF_DEFAULT, b"", b"", false, |_, _| { + self.scan(CF_DEFAULT, b"", b"", false, |_, _| { is_empty = false; Ok(false) })?; @@ -158,6 +160,7 @@ impl RaftEngineDebug for RocksEngine { let start_key = keys::raft_log_key(raft_group_id, 0); let end_key = keys::raft_log_key(raft_group_id, u64::MAX); self.scan( + CF_DEFAULT, &start_key, &end_key, false, // fill_cache @@ -181,7 +184,7 @@ impl RocksEngine { if from == 0 { let start_key = 
keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); - match self.seek(&start_key)? { + match self.seek(CF_DEFAULT, &start_key)? { Some((k, _)) if k.starts_with(&prefix) => from = box_try!(keys::raft_log_index(&k)), // No need to gc. _ => return Ok(0), @@ -252,7 +255,7 @@ impl RaftEngine for RocksEngine { let seek_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); fail::fail_point!("engine_rocks_raft_engine_clean_seek", |_| Ok(())); - if let Some((key, _)) = self.seek(&seek_key)? { + if let Some((key, _)) = self.seek(CF_DEFAULT, &seek_key)? { if !key.starts_with(&prefix) { // No raft logs for the raft group. return Ok(()); @@ -343,7 +346,7 @@ impl RaftEngine for RocksEngine { let start_key = keys::REGION_META_MIN_KEY; let end_key = keys::REGION_META_MAX_KEY; let mut err = None; - self.scan(start_key, end_key, false, |key, _| { + self.scan(CF_DEFAULT, start_key, end_key, false, |key, _| { let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); if suffix != keys::REGION_STATE_SUFFIX { return Ok(true); diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index c7d2e3a0d31..c51c0187b2d 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -14,7 +14,7 @@ pub use rocksdb::{ DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBInfoLogLevel, DBIterator, DBOptions, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, - MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, ReadOptions, SeekKey, - SliceTransform, TableFilter, TablePropertiesCollector, TablePropertiesCollectorFactory, - TitanBlobIndex, TitanDBOptions, Writable, WriteOptions, DB, + MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, ReadOptions, SliceTransform, + TableFilter, TablePropertiesCollector, 
TablePropertiesCollectorFactory, TitanBlobIndex, + TitanDBOptions, Writable, WriteOptions, DB, }; diff --git a/components/engine_rocks/src/raw_util.rs b/components/engine_rocks/src/raw_util.rs index a9f1fcda781..e669f007276 100644 --- a/components/engine_rocks/src/raw_util.rs +++ b/components/engine_rocks/src/raw_util.rs @@ -13,6 +13,8 @@ use rocksdb::{ }; use tikv_util::warn; +use crate::r2e; + pub struct CFOptions<'a> { cf: &'a str, options: ColumnFamilyOptions, @@ -92,12 +94,13 @@ pub fn new_engine_opt( cfs_v.push(x.cf); cf_opts_v.push(x.options.clone()); } - let mut db = DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cf_opts_v).collect())?; + let mut db = + DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cf_opts_v).collect()).map_err(r2e)?; for x in cfs_opts { if x.cf == CF_DEFAULT { continue; } - db.create_cf((x.cf, x.options))?; + db.create_cf((x.cf, x.options)).map_err(r2e)?; } return Ok(db); @@ -106,7 +109,7 @@ pub fn new_engine_opt( db_opt.create_if_missing(false); // Lists all column families in current db. - let cfs_list = DB::list_column_families(&db_opt, path)?; + let cfs_list = DB::list_column_families(&db_opt, path).map_err(r2e)?; let existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); let needed: Vec<&str> = cfs_opts.iter().map(|x| x.cf).collect(); @@ -134,7 +137,8 @@ pub fn new_engine_opt( cfs_opts_v.push(x.options); } - let db = DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cfs_opts_v).collect())?; + let db = + DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cfs_opts_v).collect()).map_err(r2e)?; return Ok(db); } @@ -155,14 +159,14 @@ pub fn new_engine_opt( } } let cfds = cfs_v.into_iter().zip(cfs_opts_v).collect(); - let mut db = DB::open_cf(db_opt, path, cfds)?; + let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; // Drops discarded column families. // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == x).is_none()) { for cf in cfs_diff(&existed, &needed) { // Never drop default column families. 
if cf != CF_DEFAULT { - db.drop_cf(cf)?; + db.drop_cf(cf).map_err(r2e)?; } } @@ -176,7 +180,8 @@ pub fn new_engine_opt( .unwrap() .options .clone(), - ))?; + )) + .map_err(r2e)?; } Ok(db) } diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index e1a0f635286..94724b220f7 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -9,7 +9,8 @@ use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ - db_vector::RocksDBVector, options::RocksReadOptions, util::get_cf_handle, RocksEngineIterator, + db_vector::RocksDBVector, options::RocksReadOptions, r2e, util::get_cf_handle, + RocksEngineIterator, }; pub struct RocksSnapshot { @@ -54,19 +55,7 @@ impl Drop for RocksSnapshot { impl Iterable for RocksSnapshot { type Iterator = RocksEngineIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { - let opt: RocksReadOptions = opts.into(); - let mut opt = opt.into_raw(); - unsafe { - opt.set_snapshot(&self.snap); - } - Ok(RocksEngineIterator::from_raw(DBIterator::new( - self.db.clone(), - opt, - ))) - } - - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { @@ -90,7 +79,7 @@ impl Peekable for RocksSnapshot { unsafe { opt.set_snapshot(&self.snap); } - let v = self.db.get_opt(key, &opt)?; + let v = self.db.get_opt(key, &opt).map_err(r2e)?; Ok(v.map(RocksDBVector::from_raw)) } @@ -106,7 +95,7 @@ impl Peekable for RocksSnapshot { opt.set_snapshot(&self.snap); } let handle = get_cf_handle(self.db.as_ref(), cf)?; - let v = self.db.get_cf_opt(handle, key, &opt)?; + let v = self.db.get_cf_opt(handle, key, &opt).map_err(r2e)?; Ok(v.map(RocksDBVector::from_raw)) } } diff --git a/components/engine_rocks/src/sst.rs 
b/components/engine_rocks/src/sst.rs index c7eb52e0527..68182238161 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -3,8 +3,8 @@ use std::{path::PathBuf, rc::Rc, sync::Arc}; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SeekKey, - SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + Error, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SstCompressionType, + SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; use kvproto::import_sstpb::SstMeta; @@ -14,9 +14,7 @@ use rocksdb::{ SstFileWriter, DB, }; -// FIXME: Move RocksSeekKey into a common module since -// it's shared between multiple iterators -use crate::{engine::RocksEngine, engine_iterator::RocksSeekKey, options::RocksReadOptions}; +use crate::{engine::RocksEngine, options::RocksReadOptions, r2e}; impl SstExt for RocksEngine { type SstReader = RocksSstReader; @@ -51,7 +49,7 @@ impl RocksSstReader { cf_options.set_env(env); } let mut reader = SstFileReader::new(cf_options); - reader.open(path)?; + reader.open(path).map_err(r2e)?; let inner = Rc::new(reader); Ok(RocksSstReader { inner }) } @@ -70,7 +68,7 @@ impl SstReader for RocksSstReader { Self::open_with_env(path, None) } fn verify_checksum(&self) -> Result<()> { - self.inner.verify_checksum()?; + self.inner.verify_checksum().map_err(r2e)?; Ok(()) } fn iter(&self) -> Self::Iterator { @@ -81,7 +79,8 @@ impl SstReader for RocksSstReader { impl Iterable for RocksSstReader { type Iterator = RocksSstIterator; - fn iterator_opt(&self, opts: IterOptions) -> Result { + /// Cf is ignored as there is only one cf in sst. 
+ fn iterator_opt(&self, _cf: &str, opts: IterOptions) -> Result { let opt: RocksReadOptions = opts.into(); let opt = opt.into_raw(); Ok(RocksSstIterator(SstFileReader::iter_opt_rc( @@ -89,10 +88,6 @@ impl Iterable for RocksSstReader { opt, ))) } - - fn iterator_cf_opt(&self, _cf: &str, _opts: IterOptions) -> Result { - unimplemented!() // FIXME: What should happen here? - } } // FIXME: See comment on RocksSstReader for why this contains Rc @@ -103,30 +98,40 @@ pub struct RocksSstIterator(DBIterator>); unsafe impl Send for RocksSstIterator {} impl Iterator for RocksSstIterator { - fn seek(&mut self, key: SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek(k.into_raw()).map_err(Error::Engine) + fn seek(&mut self, key: &[u8]) -> Result { + self.0.seek(rocksdb::SeekKey::Key(key)).map_err(r2e) + } + + fn seek_for_prev(&mut self, key: &[u8]) -> Result { + self.0 + .seek_for_prev(rocksdb::SeekKey::Key(key)) + .map_err(r2e) } - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result { - let k: RocksSeekKey<'_> = key.into(); - self.0.seek_for_prev(k.into_raw()).map_err(Error::Engine) + /// Seek to the first key in the database. + fn seek_to_first(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::Start).map_err(r2e) + } + + /// Seek to the last key in the database. + fn seek_to_last(&mut self) -> Result { + self.0.seek(rocksdb::SeekKey::End).map_err(r2e) } fn prev(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? { - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.prev().map_err(Error::Engine) + self.0.prev().map_err(r2e) } fn next(&mut self) -> Result { #[cfg(not(feature = "nortcheck"))] if !self.valid()? 
{ - return Err(Error::Engine("Iterator invalid".to_string())); + return Err(r2e("Iterator invalid")); } - self.0.next().map_err(Error::Engine) + self.0.next().map_err(r2e) } fn key(&self) -> &[u8] { @@ -138,7 +143,7 @@ impl Iterator for RocksSstIterator { } fn valid(&self) -> Result { - self.0.valid().map_err(Error::Engine) + self.0.valid().map_err(r2e) } } @@ -192,7 +197,7 @@ impl SstWriterBuilder for RocksSstWriterBuilder { env = db.env(); let handle = db .cf_handle(self.cf.as_deref().unwrap_or(CF_DEFAULT)) - .ok_or_else(|| format!("CF {:?} is not found", self.cf))?; + .ok_or_else(|| r2e(format!("CF {:?} is not found", self.cf)))?; db.get_options_cf(handle) } else { ColumnFamilyOptions::new() @@ -240,7 +245,7 @@ impl SstWriterBuilder for RocksSstWriterBuilder { io_options.bottommost_compression(DBCompressionType::Disable); let mut writer = SstFileWriter::new(EnvOptions::new(), io_options); fail_point!("on_open_sst_writer"); - writer.open(path)?; + writer.open(path).map_err(r2e)?; Ok(RocksSstWriter { writer, env }) } } @@ -255,11 +260,11 @@ impl SstWriter for RocksSstWriter { type ExternalSstFileReader = SequentialFile; fn put(&mut self, key: &[u8], val: &[u8]) -> Result<()> { - Ok(self.writer.put(key, val)?) + self.writer.put(key, val).map_err(r2e) } fn delete(&mut self, key: &[u8]) -> Result<()> { - Ok(self.writer.delete(key)?) 
+ self.writer.delete(key).map_err(r2e) } fn file_size(&mut self) -> u64 { @@ -267,22 +272,25 @@ impl SstWriter for RocksSstWriter { } fn finish(mut self) -> Result { - Ok(RocksExternalSstFileInfo(self.writer.finish()?)) + Ok(RocksExternalSstFileInfo(self.writer.finish().map_err(r2e)?)) } fn finish_read(mut self) -> Result<(Self::ExternalSstFileInfo, Self::ExternalSstFileReader)> { - let env = self.env.take().ok_or_else(|| { - Error::Engine("failed to read sequential file no env provided".to_owned()) - })?; - let sst_info = self.writer.finish()?; + let env = self + .env + .take() + .ok_or_else(|| r2e("failed to read sequential file no env provided"))?; + let sst_info = self.writer.finish().map_err(r2e)?; let p = sst_info.file_path(); let path = p.as_os_str().to_str().ok_or_else(|| { - Error::Engine(format!( + r2e(format!( "failed to sequential file bad path {}", p.display() )) })?; - let seq_file = env.new_sequential_file(path, EnvOptions::new())?; + let seq_file = env + .new_sequential_file(path, EnvOptions::new()) + .map_err(r2e)?; Ok((RocksExternalSstFileInfo(sst_info), seq_file)) } } diff --git a/components/engine_rocks/src/status.rs b/components/engine_rocks/src/status.rs new file mode 100644 index 00000000000..1565e013834 --- /dev/null +++ b/components/engine_rocks/src/status.rs @@ -0,0 +1,19 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +/// A function that will transform a rocksdb error to engine trait error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn r2e(msg: impl Into) -> engine_traits::Error { + // TODO: use correct code. + engine_traits::Error::Engine(engine_traits::Status::with_error( + engine_traits::Code::IoError, + msg, + )) +} + +/// A function that will transform a engine trait error to rocksdb error. +/// +/// r stands for rocksdb, e stands for engine_trait. 
+pub fn e2r(s: engine_traits::Error) -> String { + format!("{:?}", s) +} diff --git a/components/engine_rocks/src/table_properties.rs b/components/engine_rocks/src/table_properties.rs index 3a3bbad6a04..19b2141483d 100644 --- a/components/engine_rocks/src/table_properties.rs +++ b/components/engine_rocks/src/table_properties.rs @@ -1,8 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{Error, Range, Result}; +use engine_traits::{Range, Result}; -use crate::{util, RangeProperties, RocksEngine}; +use crate::{r2e, util, RangeProperties, RocksEngine}; #[repr(transparent)] pub struct UserCollectedProperties(rocksdb::UserCollectedProperties); @@ -57,11 +57,9 @@ impl RocksEngine { let cf = util::get_cf_handle(self.as_inner(), cf)?; // FIXME: extra allocation let ranges: Vec<_> = ranges.iter().map(util::range_to_rocks_range).collect(); - let raw = self - .as_inner() - .get_properties_of_tables_in_range(cf, &ranges); - let raw = raw.map_err(Error::Engine)?; - Ok(raw) + self.as_inner() + .get_properties_of_tables_in_range(cf, &ranges) + .map_err(r2e) } pub fn get_range_properties_cf( diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 47e4016ebc6..81a2ccb497a 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -10,6 +10,7 @@ use crate::{ cf_options::RocksColumnFamilyOptions, db_options::RocksDBOptions, engine::RocksEngine, + r2e, raw_util::{new_engine as new_engine_raw, new_engine_opt as new_engine_opt_raw, CFOptions}, rocks_metrics_defs::*, }; @@ -86,10 +87,9 @@ pub fn new_engine_opt( } pub fn get_cf_handle<'a>(db: &'a DB, cf: &str) -> Result<&'a CFHandle> { - let handle = db - .cf_handle(cf) - .ok_or_else(|| Error::Engine(format!("cf {} not found", cf)))?; - Ok(handle) + db.cf_handle(cf) + .ok_or_else(|| format!("cf {} not found", cf)) + .map_err(r2e) } pub fn range_to_rocks_range<'a>(range: &Range<'a>) -> RocksRange<'a> { diff --git 
a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 77b8e65d3eb..f09761802e6 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -2,10 +2,10 @@ use std::sync::Arc; -use engine_traits::{self, Error, Mutable, Result, WriteBatchExt, WriteOptions}; +use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; -use crate::{engine::RocksEngine, options::RocksWriteOptions, util::get_cf_handle}; +use crate::{engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle}; const WRITE_BATCH_MAX_BATCH: usize = 16; const WRITE_BATCH_LIMIT: usize = 16; @@ -101,11 +101,11 @@ impl engine_traits::WriteBatch for RocksWriteBatchVec { if self.support_write_batch_vec { self.get_db() .multi_batch_write(self.as_inner(), &opt.into_raw()) - .map_err(Error::Engine) + .map_err(r2e) } else { self.get_db() .write_opt(&self.wbs[0], &opt.into_raw()) - .map_err(Error::Engine) + .map_err(r2e) } } @@ -153,9 +153,9 @@ impl engine_traits::WriteBatch for RocksWriteBatchVec { fn pop_save_point(&mut self) -> Result<()> { if let Some(x) = self.save_points.pop() { - return self.wbs[x].pop_save_point().map_err(Error::Engine); + return self.wbs[x].pop_save_point().map_err(r2e); } - Err(Error::Engine("no save point".into())) + Err(r2e("no save point")) } fn rollback_to_save_point(&mut self) -> Result<()> { @@ -164,9 +164,9 @@ impl engine_traits::WriteBatch for RocksWriteBatchVec { self.wbs[i].clear(); } self.index = x; - return self.wbs[x].rollback_to_save_point().map_err(Error::Engine); + return self.wbs[x].rollback_to_save_point().map_err(r2e); } - Err(Error::Engine("no save point".into())) + Err(r2e("no save point")) } fn merge(&mut self, other: Self) -> Result<()> { @@ -181,35 +181,31 @@ impl engine_traits::WriteBatch for RocksWriteBatchVec { impl Mutable for RocksWriteBatchVec { fn put(&mut self, key: &[u8], value: &[u8]) 
-> Result<()> { self.check_switch_batch(); - self.wbs[self.index].put(key, value).map_err(Error::Engine) + self.wbs[self.index].put(key, value).map_err(r2e) } fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wbs[self.index] - .put_cf(handle, key, value) - .map_err(Error::Engine) + self.wbs[self.index].put_cf(handle, key, value).map_err(r2e) } fn delete(&mut self, key: &[u8]) -> Result<()> { self.check_switch_batch(); - self.wbs[self.index].delete(key).map_err(Error::Engine) + self.wbs[self.index].delete(key).map_err(r2e) } fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { self.check_switch_batch(); let handle = get_cf_handle(self.db.as_ref(), cf)?; - self.wbs[self.index] - .delete_cf(handle, key) - .map_err(Error::Engine) + self.wbs[self.index].delete_cf(handle, key).map_err(r2e) } fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { self.check_switch_batch(); self.wbs[self.index] .delete_range(begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { @@ -217,7 +213,7 @@ impl Mutable for RocksWriteBatchVec { let handle = get_cf_handle(self.db.as_ref(), cf)?; self.wbs[self.index] .delete_range_cf(handle, begin_key, end_key) - .map_err(Error::Engine) + .map_err(r2e) } } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 82373ac8568..d6633139122 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -281,10 +281,7 @@ pub mod kv { new_engine } - fn set_shared_block_cache_capacity( - &self, - capacity: u64, - ) -> std::result::Result<(), String> { + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity if let Some(((_id, _suffix), tablet)) = 
(*reg).iter().next() { diff --git a/components/engine_traits/src/cf_options.rs b/components/engine_traits/src/cf_options.rs index 2e130cbf73c..6498238280f 100644 --- a/components/engine_traits/src/cf_options.rs +++ b/components/engine_traits/src/cf_options.rs @@ -21,7 +21,7 @@ pub trait ColumnFamilyOptions { fn get_soft_pending_compaction_bytes_limit(&self) -> u64; fn get_hard_pending_compaction_bytes_limit(&self) -> u64; fn get_block_cache_capacity(&self) -> u64; - fn set_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String>; + fn set_block_cache_capacity(&self, capacity: u64) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions); fn get_target_file_size_base(&self) -> u64; fn set_disable_auto_compactions(&mut self, v: bool); diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 9b560bcd65b..c143cf7a194 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -187,7 +187,7 @@ pub trait TabletFactory: TabletAccessor { fn create_shared_db(&self) -> Result; /// Destroy the tablet and its data - fn destroy_tablet(&self, id: u64, suffix: u64) -> crate::Result<()>; + fn destroy_tablet(&self, id: u64, suffix: u64) -> Result<()>; /// Check if the tablet with specified id/suffix exists #[inline] @@ -219,7 +219,7 @@ pub trait TabletFactory: TabletAccessor { unimplemented!(); } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String>; + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()>; } pub struct DummyFactory @@ -243,7 +243,7 @@ where fn create_shared_db(&self) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> crate::Result<()> { + fn destroy_tablet(&self, _id: u64, _suffix: u64) -> Result<()> { Ok(()) } fn exists_raw(&self, _path: &Path) -> bool { @@ -256,7 +256,7 @@ where PathBuf::from(&self.root_path) } - fn 
set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let opt = self .engine .as_ref() @@ -312,8 +312,8 @@ mod tests { fn test_tablet_error_collector_err() { let mut err = TabletErrorCollector::new(); err.add_result(1, 1, Ok(())); - err.add_result(1, 1, Err("this is an error1".to_string().into())); - err.add_result(1, 1, Err("this is an error2".to_string().into())); + err.add_result(1, 1, Err(Status::with_code(Code::Aborted).into())); + err.add_result(1, 1, Err(Status::with_code(Code::NotFound).into())); err.add_result(1, 1, Ok(())); let r = err.take_result(); assert!(r.is_err()); diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 12104e14a5c..6348db22174 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -6,11 +6,119 @@ use error_code::{self, ErrorCode, ErrorCodeExt}; use raft::{Error as RaftError, StorageError}; use thiserror::Error; +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub enum Code { + Ok = 0, + NotFound = 1, + Corruption = 2, + NotSupported = 3, + InvalidArgument = 4, + IoError = 5, + MergeInProgress = 6, + Incomplete = 7, + ShutdownInProgress = 8, + TimedOut = 9, + Aborted = 10, + Busy = 11, + Expired = 12, + TryAgain = 13, + CompactionTooLarge = 14, + ColumnFamilyDropped = 15, +} + +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub enum SubCode { + None = 0, + MutexTimeout = 1, + LockTimeout = 2, + LockLimit = 3, + NoSpace = 4, + Deadlock = 5, + StaleFile = 6, + MemoryLimit = 7, + SpaceLimit = 8, + PathNotFound = 9, +} + +#[repr(u8)] +#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +pub enum Severity { + NoError = 0, + SoftError = 1, + HardError = 2, + FatalError = 3, + UnrecoverableError = 4, +} + +#[repr(C)] +#[derive(Debug, Error)] +#[error("[{:?}] {:?}-{:?} {}", .code, .sub_code, .sev, .state)] +pub 
struct Status { + code: Code, + sub_code: SubCode, + sev: Severity, + state: String, +} + +impl Status { + pub fn with_code(code: Code) -> Status { + Self { + code, + sub_code: SubCode::None, + sev: Severity::NoError, + state: String::new(), + } + } + + pub fn with_error(code: Code, error: impl Into) -> Self { + Self { + code, + sub_code: SubCode::None, + sev: Severity::NoError, + state: error.into(), + } + } + + #[inline] + pub fn set_sub_code(&mut self, sub_code: SubCode) -> &mut Self { + self.sub_code = sub_code; + self + } + + #[inline] + pub fn set_severity(&mut self, sev: Severity) -> &mut Self { + self.sev = sev; + self + } + + #[inline] + pub fn code(&self) -> Code { + self.code + } + + #[inline] + pub fn sub_code(&self) -> SubCode { + self.sub_code + } + + #[inline] + pub fn severity(&self) -> Severity { + self.sev + } + + #[inline] + pub fn state(&self) -> &str { + &self.state + } +} + #[derive(Debug, Error)] pub enum Error { // Engine uses plain string as the error. - #[error("Storage Engine {0}")] - Engine(String), + #[error("Storage Engine {0:?}")] + Engine(#[from] Status), // FIXME: It should not know Region. 
#[error( "Key {} is out of [region {}] [{}, {})", @@ -38,12 +146,6 @@ pub enum Error { EntriesCompacted, } -impl From for Error { - fn from(err: String) -> Self { - Error::Engine(err) - } -} - pub type Result = result::Result; impl ErrorCodeExt for Error { diff --git a/components/engine_traits/src/file_system.rs b/components/engine_traits/src/file_system.rs index 9022aeb7dc2..1671c1f0aab 100644 --- a/components/engine_traits/src/file_system.rs +++ b/components/engine_traits/src/file_system.rs @@ -4,9 +4,11 @@ use std::sync::Arc; use file_system::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; +use crate::Result; + pub trait FileSystemInspector: Sync + Send { - fn read(&self, len: usize) -> Result; - fn write(&self, len: usize) -> Result; + fn read(&self, len: usize) -> Result; + fn write(&self, len: usize) -> Result; } pub struct EngineFileSystemInspector { @@ -33,7 +35,7 @@ impl Default for EngineFileSystemInspector { } impl FileSystemInspector for EngineFileSystemInspector { - fn read(&self, len: usize) -> Result { + fn read(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); Ok(limiter.request(io_type, IOOp::Read, len)) @@ -42,7 +44,7 @@ impl FileSystemInspector for EngineFileSystemInspector { } } - fn write(&self, len: usize) -> Result { + fn write(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); Ok(limiter.request(io_type, IOOp::Write, len)) diff --git a/components/engine_traits/src/iterable.rs b/components/engine_traits/src/iterable.rs index a6dbdd2d03f..9d45fc5b0ac 100644 --- a/components/engine_traits/src/iterable.rs +++ b/components/engine_traits/src/iterable.rs @@ -31,13 +31,6 @@ use tikv_util::keybuilder::KeyBuilder; use crate::*; -/// A token indicating where an iterator "seek" operation should stop. -pub enum SeekKey<'a> { - Start, - End, - Key(&'a [u8]), -} - /// An iterator over a consistent set of keys and values. 
/// /// Iterators are implemented for `KvEngine`s and for `Snapshot`s. They see a @@ -56,15 +49,8 @@ pub enum SeekKey<'a> { pub trait Iterator: Send { /// Move the iterator to a specific key. /// - /// When `key` is `SeekKey::Start` or `SeekKey::End`, - /// `seek` and `seek_for_prev` behave identically. - /// The difference between the two functions is how they - /// behave for `SeekKey::Key`, and only when an exactly - /// matching keys is not found: - /// - /// When seeking with `SeekKey::Key`, and an exact match is not found, - /// `seek` sets the iterator to the next key greater than that - /// specified as `key`, if such a key exists; + /// When an exact match is not found, `seek` sets the iterator to the next + /// key greater than that specified as `key`, if such a key exists; /// `seek_for_prev` sets the iterator to the previous key less than /// that specified as `key`, if such a key exists. /// @@ -72,7 +58,7 @@ pub trait Iterator: Send { /// /// `true` if seeking succeeded and the iterator is valid, /// `false` if seeking failed and the iterator is invalid. - fn seek(&mut self, key: SeekKey<'_>) -> Result; + fn seek(&mut self, key: &[u8]) -> Result; /// Move the iterator to a specific key. /// @@ -83,44 +69,40 @@ pub trait Iterator: Send { /// /// `true` if seeking succeeded and the iterator is valid, /// `false` if seeking failed and the iterator is invalid. - fn seek_for_prev(&mut self, key: SeekKey<'_>) -> Result; + fn seek_for_prev(&mut self, key: &[u8]) -> Result; - /// Short for `seek(SeekKey::Start)`. - fn seek_to_first(&mut self) -> Result { - self.seek(SeekKey::Start) - } + /// Seek to the first key in the engine. + fn seek_to_first(&mut self) -> Result; - /// Short for `seek(SeekKey::End)`. - fn seek_to_last(&mut self) -> Result { - self.seek(SeekKey::End) - } + /// Seek to the last key in the database. + fn seek_to_last(&mut self) -> Result; /// Move a valid iterator to the previous key. 
/// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, iterator may panic or aborted. fn prev(&mut self) -> Result; /// Move a valid iterator to the next key. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, iterator may panic or aborted. fn next(&mut self) -> Result; /// Retrieve the current key. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, iterator may panic or aborted. fn key(&self) -> &[u8]; /// Retrieve the current value. /// /// # Panics /// - /// If the iterator is invalid + /// If the iterator is invalid, iterator may panic or aborted. fn value(&self) -> &[u8]; /// Returns `true` if the iterator points to a `key`/`value` pair. @@ -130,32 +112,15 @@ pub trait Iterator: Send { pub trait Iterable { type Iterator: Iterator; - fn iterator_opt(&self, opts: IterOptions) -> Result; - fn iterator_cf_opt(&self, cf: &str, opts: IterOptions) -> Result; - - fn iterator(&self) -> Result { - self.iterator_opt(IterOptions::default()) - } + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result; - fn iterator_cf(&self, cf: &str) -> Result { - self.iterator_cf_opt(cf, IterOptions::default()) + fn iterator(&self, cf: &str) -> Result { + self.iterator_opt(cf, IterOptions::default()) } /// scan the key between start_key(inclusive) and end_key(exclusive), /// the upper bound is omitted if end_key is empty - fn scan(&self, start_key: &[u8], end_key: &[u8], fill_cache: bool, f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { - let start = KeyBuilder::from_slice(start_key, DATA_KEY_PREFIX_LEN, 0); - let end = - (!end_key.is_empty()).then(|| KeyBuilder::from_slice(end_key, DATA_KEY_PREFIX_LEN, 0)); - let iter_opt = IterOptions::new(Some(start), end, fill_cache); - scan_impl(self.iterator_opt(iter_opt)?, start_key, f) - } - - // like `scan`, only on a specific column family. 
- fn scan_cf( + fn scan( &self, cf: &str, start_key: &[u8], @@ -170,23 +135,13 @@ pub trait Iterable { let end = (!end_key.is_empty()).then(|| KeyBuilder::from_slice(end_key, DATA_KEY_PREFIX_LEN, 0)); let iter_opt = IterOptions::new(Some(start), end, fill_cache); - scan_impl(self.iterator_cf_opt(cf, iter_opt)?, start_key, f) - } - - // Seek the first key >= given key, if not found, return None. - fn seek(&self, key: &[u8]) -> Result, Vec)>> { - let mut iter = self.iterator()?; - if iter.seek(SeekKey::Key(key))? { - let (k, v) = (iter.key().to_vec(), iter.value().to_vec()); - return Ok(Some((k, v))); - } - Ok(None) + scan_impl(self.iterator_opt(cf, iter_opt)?, start_key, f) } // Seek the first key >= given key, if not found, return None. - fn seek_cf(&self, cf: &str, key: &[u8]) -> Result, Vec)>> { - let mut iter = self.iterator_cf(cf)?; - if iter.seek(SeekKey::Key(key))? { + fn seek(&self, cf: &str, key: &[u8]) -> Result, Vec)>> { + let mut iter = self.iterator(cf)?; + if iter.seek(key)? { return Ok(Some((iter.key().to_vec(), iter.value().to_vec()))); } Ok(None) @@ -198,19 +153,13 @@ where Iter: Iterator, F: FnMut(&[u8], &[u8]) -> Result, { - let mut remained = it.seek(SeekKey::Key(start_key))?; + let mut remained = it.seek(start_key)?; while remained { remained = f(it.key(), it.value())? && it.next()?; } Ok(()) } -impl<'a> From<&'a [u8]> for SeekKey<'a> { - fn from(bs: &'a [u8]) -> SeekKey<'a> { - SeekKey::Key(bs) - } -} - /// Collect all items of `it` into a vector, generally used for tests. /// /// # Panics diff --git a/components/engine_traits_tests/src/iterator.rs b/components/engine_traits_tests/src/iterator.rs index 00f7a974b52..96709c3fe29 100644 --- a/components/engine_traits_tests/src/iterator.rs +++ b/components/engine_traits_tests/src/iterator.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{Iterable, Iterator, KvEngine, SeekKey}; +use engine_traits::{Iterable, Iterator, KvEngine, CF_DEFAULT}; use panic_hook::recover_safe; use super::default_engine; @@ -30,24 +30,22 @@ where .is_err() ); - assert_eq!(iter.seek(SeekKey::Start).unwrap(), false); - assert_eq!(iter.seek(SeekKey::End).unwrap(), false); - assert_eq!(iter.seek(SeekKey::Key(b"foo")).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::Start).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::End).unwrap(), false); - assert_eq!(iter.seek_for_prev(SeekKey::Key(b"foo")).unwrap(), false); + assert_eq!(iter.seek_to_first().unwrap(), false); + assert_eq!(iter.seek_to_last().unwrap(), false); + assert_eq!(iter.seek(b"foo").unwrap(), false); + assert_eq!(iter.seek_for_prev(b"foo").unwrap(), false); } #[test] fn iter_empty_engine() { let db = default_engine(); - iter_empty(&db.engine, |e| e.iterator().unwrap()); + iter_empty(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_empty_snapshot() { let db = default_engine(); - iter_empty(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_empty(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_forward(e: &E, i: IF) @@ -64,7 +62,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); @@ -103,13 +101,13 @@ where #[test] fn iter_forward_engine() { let db = default_engine(); - iter_forward(&db.engine, |e| e.iterator().unwrap()); + iter_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_forward_snapshot() { let db = default_engine(); - iter_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse(e: &E, i: IF) @@ -126,7 +124,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::End).unwrap()); + 
assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -165,13 +163,13 @@ where #[test] fn iter_reverse_engine() { let db = default_engine(); - iter_reverse(&db.engine, |e| e.iterator().unwrap()); + iter_reverse(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_reverse_snapshot() { let db = default_engine(); - iter_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_forward(e: &E, i: IF) @@ -186,7 +184,7 @@ where let mut iter = i(e); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"b"); @@ -206,13 +204,13 @@ where #[test] fn seek_to_key_then_forward_engine() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| e.iterator().unwrap()); + seek_to_key_then_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_to_key_then_forward_snapshot() { let db = default_engine(); - seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_to_key_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_to_key_then_reverse(e: &E, i: IF) @@ -227,7 +225,7 @@ where let mut iter = i(e); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"b"); @@ -247,13 +245,13 @@ where #[test] fn seek_to_key_then_reverse_engine() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| e.iterator().unwrap()); + seek_to_key_then_reverse(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_to_key_then_reverse_snapshot() { let db = default_engine(); - seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_to_key_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn 
iter_forward_then_reverse(e: &E, i: IF) @@ -270,7 +268,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); @@ -308,13 +306,13 @@ where #[test] fn iter_forward_then_reverse_engine() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| e.iterator().unwrap()); + iter_forward_then_reverse(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_forward_then_reverse_snapshot() { let db = default_engine(); - iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_forward_then_reverse(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn iter_reverse_then_forward(e: &E, i: IF) @@ -331,7 +329,7 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::End).unwrap()); + assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -369,13 +367,13 @@ where #[test] fn iter_reverse_then_forward_engine() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| e.iterator().unwrap()); + iter_reverse_then_forward(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iter_reverse_then_forward_snapshot() { let db = default_engine(); - iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator().unwrap()); + iter_reverse_then_forward(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When seek finds an exact key then seek_for_prev behaves just like seek @@ -391,19 +389,19 @@ where let mut iter = i(e); - assert!(iter.seek_for_prev(SeekKey::Start).unwrap()); + assert!(iter.seek_to_first().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"a"); assert_eq!(iter.value(), b"a"); - assert!(iter.seek_for_prev(SeekKey::End).unwrap()); + assert!(iter.seek_to_last().unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); assert_eq!(iter.value(), b"c"); 
- assert!(iter.seek_for_prev(SeekKey::Key(b"c")).unwrap()); + assert!(iter.seek_for_prev(b"c").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); @@ -413,13 +411,13 @@ where #[test] fn seek_for_prev_engine() { let db = default_engine(); - seek_for_prev(&db.engine, |e| e.iterator().unwrap()); + seek_for_prev(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_for_prev_snapshot() { let db = default_engine(); - seek_for_prev(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_for_prev(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } // When Seek::Key doesn't find an exact match, @@ -437,24 +435,24 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek(SeekKey::Key(b"b")).unwrap()); + assert!(iter.seek(b"b").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); - assert!(!iter.seek(SeekKey::Key(b"d")).unwrap()); + assert!(!iter.seek(b"d").unwrap()); assert!(!iter.valid().unwrap()); } #[test] fn seek_key_miss_engine() { let db = default_engine(); - seek_key_miss(&db.engine, |e| e.iterator().unwrap()); + seek_key_miss(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn seek_key_miss_snapshot() { let db = default_engine(); - seek_key_miss(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_key_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } fn seek_key_prev_miss(e: &E, i: IF) @@ -469,22 +467,22 @@ where assert!(!iter.valid().unwrap()); - assert!(iter.seek_for_prev(SeekKey::Key(b"d")).unwrap()); + assert!(iter.seek_for_prev(b"d").unwrap()); assert!(iter.valid().unwrap()); assert_eq!(iter.key(), b"c"); - assert!(!iter.seek_for_prev(SeekKey::Key(b"b")).unwrap()); + assert!(!iter.seek_for_prev(b"b").unwrap()); assert!(!iter.valid().unwrap()); } #[test] fn seek_key_prev_miss_engine() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| e.iterator().unwrap()); + seek_key_prev_miss(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn 
seek_key_prev_miss_snapshot() { let db = default_engine(); - seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator().unwrap()); + seek_key_prev_miss(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/read_consistency.rs b/components/engine_traits_tests/src/read_consistency.rs index d80b6b3db7c..8c7ab50657f 100644 --- a/components/engine_traits_tests/src/read_consistency.rs +++ b/components/engine_traits_tests/src/read_consistency.rs @@ -2,7 +2,7 @@ //! Testing iterator and snapshot behavior in the presence of intermixed writes -use engine_traits::{Iterable, Iterator, KvEngine, Peekable, SyncMutable}; +use engine_traits::{Iterable, Iterator, KvEngine, Peekable, SyncMutable, CF_DEFAULT}; use super::default_engine; @@ -71,11 +71,11 @@ where #[test] fn iterator_with_writes_engine() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| e.iterator().unwrap()); + iterator_with_writes(&db.engine, |e| e.iterator(CF_DEFAULT).unwrap()); } #[test] fn iterator_with_writes_snapshot() { let db = default_engine(); - iterator_with_writes(&db.engine, |e| e.snapshot().iterator().unwrap()); + iterator_with_writes(&db.engine, |e| e.snapshot().iterator(CF_DEFAULT).unwrap()); } diff --git a/components/engine_traits_tests/src/sst.rs b/components/engine_traits_tests/src/sst.rs index 10104e752cc..231e12ea785 100644 --- a/components/engine_traits_tests/src/sst.rs +++ b/components/engine_traits_tests/src/sst.rs @@ -6,8 +6,7 @@ use std::fs; use engine_test::kv::KvTestEngine; use engine_traits::{ - Error, ExternalSstFileInfo, Iterator, Result, SeekKey, SstExt, SstReader, SstWriter, - SstWriterBuilder, + Error, ExternalSstFileInfo, Iterator, Result, SstExt, SstReader, SstWriter, SstWriterBuilder, }; use panic_hook::recover_safe; @@ -51,7 +50,7 @@ fn basic() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = 
iter.key(); let value = iter.value(); assert_eq!(b"k1", key); @@ -80,7 +79,7 @@ fn forward() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); @@ -117,7 +116,7 @@ fn reverse() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::End)?; + iter.seek_to_last()?; let key = iter.key(); let value = iter.value(); @@ -136,7 +135,7 @@ fn reverse() -> Result<()> { Ok(()) } -// todo test seek_for_prev(SeekKey::Key) +// todo test seek_for_prev(Key) #[test] fn delete() -> Result<()> { @@ -155,7 +154,7 @@ fn delete() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; assert_eq!(iter.valid()?, false); @@ -174,12 +173,10 @@ fn delete() -> Result<()> { .is_err() ); - assert_eq!(iter.seek(SeekKey::Start)?, false); - assert_eq!(iter.seek(SeekKey::End)?, false); - assert_eq!(iter.seek(SeekKey::Key(b"foo"))?, false); - assert_eq!(iter.seek_for_prev(SeekKey::Start)?, false); - assert_eq!(iter.seek_for_prev(SeekKey::End)?, false); - assert_eq!(iter.seek_for_prev(SeekKey::Key(b"foo"))?, false); + assert_eq!(iter.seek_to_first()?, false); + assert_eq!(iter.seek_to_last()?, false); + assert_eq!(iter.seek(b"foo")?, false); + assert_eq!(iter.seek_for_prev(b"foo")?, false); Ok(()) } @@ -215,7 +212,7 @@ fn same_key() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); assert_eq!(b"k1", key); @@ -257,7 +254,7 @@ fn reverse_key() -> Result<()> { let sst_reader = ::SstReader::open(&sst_path)?; let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start)?; + iter.seek_to_first()?; let key = iter.key(); let value = iter.value(); 
assert_eq!(b"k2", key); diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 16770595405..70b55db41f4 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ -66,7 +66,7 @@ fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); for cf in cf_names { - snap.scan_cf(cf, &start_key, &end_key, false, |k, v| { + snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); Ok(true) diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index a8a1ded4144..e377d4b550a 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -2,7 +2,7 @@ use std::cmp::Ordering; -use engine_traits::{IterOptions, Iterator, KvEngine, SeekKey, CF_WRITE}; +use engine_traits::{IterOptions, Iterator, KvEngine, CF_WRITE}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; use tidb_query_datatype::codec::table as table_codec; @@ -183,10 +183,10 @@ fn last_key_of_region(db: &impl KvEngine, region: &Region) -> Result = iter.seek(SeekKey::End).map_err(|e| box_err!(e)); + let found: Result = iter.seek_to_last().map_err(|e| box_err!(e)); if found? 
{ let key = iter.key().to_vec(); last_key = Some(key); diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index 12fb238dce8..561425d9d00 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -34,7 +34,7 @@ fn is_range_empty( end_key: &[u8], ) -> Result { let mut count: u32 = 0; - engine.scan_cf(cf, start_key, end_key, false, |_, _| { + engine.scan(cf, start_key, end_key, false, |_, _| { count += 1; Ok(false) })?; diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index dc5690a2b34..e7a59631ca1 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -202,9 +202,7 @@ mod tests { raw_util::{new_engine_opt, CFOptions}, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, }; - use engine_traits::{ - CompactExt, Iterator, MiscExt, SeekKey, SstReader, SyncMutable, CF_DEFAULT, - }; + use engine_traits::{CompactExt, Iterator, MiscExt, SstReader, SyncMutable, CF_DEFAULT}; use keys::DATA_PREFIX_KEY; use kvproto::metapb::Region; use tempfile::TempDir; @@ -404,7 +402,7 @@ mod tests { fn collect_keys(path: &str) -> Vec> { let mut sst_reader = RocksSstReader::open(path).unwrap().iter(); - let mut valid = sst_reader.seek(SeekKey::Start).unwrap(); + let mut valid = sst_reader.seek_to_first().unwrap(); let mut ret = vec![]; while valid { ret.push(sst_reader.key().to_owned()); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c46cafb7e48..f92d08dd3a4 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1084,7 +1084,7 @@ impl RaftPollerBuilder { let mut merging_count = 0; let mut meta = self.store_meta.lock().unwrap(); let mut replication_state = self.global_replication_state.lock().unwrap(); - kv_engine.scan_cf(CF_RAFT, 
start_key, end_key, false, |key, value| { + kv_engine.scan(CF_RAFT, start_key, end_key, false, |key, value| { let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); if suffix != keys::REGION_STATE_SUFFIX { return Ok(true); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 58e35ff9084..ec6cc3bcf11 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2238,7 +2238,7 @@ mod tests { store .engines .kv - .scan_cf(CF_RAFT, &meta_start, &meta_end, false, |_, _| { + .scan(CF_RAFT, &meta_start, &meta_end, false, |_, _| { count += 1; Ok(true) }) @@ -2251,7 +2251,7 @@ mod tests { store .engines .kv - .scan_cf(CF_RAFT, &raft_start, &raft_end, false, |_, _| { + .scan(CF_RAFT, &raft_start, &raft_end, false, |_, _| { count += 1; Ok(true) }) diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 390c0ee0f5c..cd2bc75d048 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -109,12 +109,8 @@ where } } - pub fn iter(&self, iter_opt: IterOptions) -> RegionIterator { - RegionIterator::new(&self.snap, Arc::clone(&self.region), iter_opt) - } - - pub fn iter_cf(&self, cf: &str, iter_opt: IterOptions) -> Result> { - Ok(RegionIterator::new_cf( + pub fn iter(&self, cf: &str, iter_opt: IterOptions) -> Result> { + Ok(RegionIterator::new( &self.snap, Arc::clone(&self.region), iter_opt, @@ -124,24 +120,13 @@ where // scan scans database using an iterator in range [start_key, end_key), calls function f for // each iteration, if f returns false, terminates this scan. 
- pub fn scan(&self, start_key: &[u8], end_key: &[u8], fill_cache: bool, f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { - let start = KeyBuilder::from_slice(start_key, DATA_PREFIX_KEY.len(), 0); - let end = KeyBuilder::from_slice(end_key, DATA_PREFIX_KEY.len(), 0); - let iter_opt = IterOptions::new(Some(start), Some(end), fill_cache); - self.scan_impl(self.iter(iter_opt), start_key, f) - } - - // like `scan`, only on a specific column family. - pub fn scan_cf( + pub fn scan( &self, cf: &str, start_key: &[u8], end_key: &[u8], fill_cache: bool, - f: F, + mut f: F, ) -> Result<()> where F: FnMut(&[u8], &[u8]) -> Result, @@ -149,13 +134,8 @@ where let start = KeyBuilder::from_slice(start_key, DATA_PREFIX_KEY.len(), 0); let end = KeyBuilder::from_slice(end_key, DATA_PREFIX_KEY.len(), 0); let iter_opt = IterOptions::new(Some(start), Some(end), fill_cache); - self.scan_impl(self.iter_cf(cf, iter_opt)?, start_key, f) - } - fn scan_impl(&self, mut it: RegionIterator, start_key: &[u8], mut f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, - { + let mut it = self.iter(cf, iter_opt)?; let mut it_valid = it.seek(start_key)?; while it_valid { it_valid = f(it.key(), it.value())? 
&& it.next()?; @@ -300,16 +280,7 @@ impl RegionIterator where S: Snapshot, { - pub fn new(snap: &S, region: Arc, mut iter_opt: IterOptions) -> RegionIterator { - update_lower_bound(&mut iter_opt, ®ion); - update_upper_bound(&mut iter_opt, ®ion); - let iter = snap - .iterator_opt(iter_opt) - .expect("creating snapshot iterator"); // FIXME error handling - RegionIterator { iter, region } - } - - pub fn new_cf( + pub fn new( snap: &S, region: Arc, mut iter_opt: IterOptions, @@ -318,7 +289,7 @@ where update_lower_bound(&mut iter_opt, ®ion); update_upper_bound(&mut iter_opt, ®ion); let iter = snap - .iterator_cf_opt(cf, iter_opt) + .iterator_opt(cf, iter_opt) .expect("creating snapshot iterator"); // FIXME error handling RegionIterator { iter, region } } @@ -337,15 +308,13 @@ where }); self.should_seekable(key)?; let key = keys::data_key(key); - self.iter.seek(key.as_slice().into()).map_err(Error::from) + self.iter.seek(&key).map_err(Error::from) } pub fn seek_for_prev(&mut self, key: &[u8]) -> Result { self.should_seekable(key)?; let key = keys::data_key(key); - self.iter - .seek_for_prev(key.as_slice().into()) - .map_err(Error::from) + self.iter.seek_for_prev(&key).map_err(Error::from) } pub fn prev(&mut self) -> Result { @@ -397,7 +366,7 @@ fn handle_check_key_in_region_error(e: crate::Error) -> Result<()> { #[cfg(test)] mod tests { use engine_test::{kv::KvTestSnapshot, new_temp_engine}; - use engine_traits::{Engines, KvEngine, Peekable, RaftEngine, SyncMutable}; + use engine_traits::{Engines, KvEngine, Peekable, RaftEngine, SyncMutable, CF_DEFAULT}; use keys::data_key; use kvproto::metapb::{Peer, Region}; use tempfile::Builder; @@ -548,7 +517,7 @@ mod tests { upper_bound.map(|v| KeyBuilder::from_slice(v, keys::DATA_PREFIX_KEY.len(), 0)), true, ); - let mut iter = snap.iter(iter_opt); + let mut iter = snap.iter(CF_DEFAULT, iter_opt).unwrap(); for (seek_key, in_range, seek_exp, prev_exp) in seek_table.clone() { let check_res = |iter: &RegionIterator, res: Result, @@ 
-650,7 +619,7 @@ mod tests { let snap = RegionSnapshot::::new(&store); let mut data = vec![]; - snap.scan(b"a2", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"a2", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -660,7 +629,7 @@ mod tests { assert_eq!(data, &base_data[1..3]); data.clear(); - snap.scan(b"a2", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"a2", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(false) }) @@ -668,7 +637,7 @@ mod tests { assert_eq!(data.len(), 1); - let mut iter = snap.iter(IterOptions::default()); + let mut iter = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); assert!(iter.seek_to_first().unwrap()); let mut res = vec![]; loop { @@ -685,7 +654,7 @@ mod tests { let store = new_peer_storage(engines.clone(), ®ion); let snap = RegionSnapshot::::new(&store); data.clear(); - snap.scan(b"", &[0xFF, 0xFF], false, |key, value| { + snap.scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |key, value| { data.push((key.to_vec(), value.to_vec())); Ok(true) }) @@ -694,7 +663,7 @@ mod tests { assert_eq!(data.len(), 5); assert_eq!(data, base_data); - let mut iter = snap.iter(IterOptions::default()); + let mut iter = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); assert!(iter.seek(b"a1").unwrap()); assert!(iter.seek_to_first().unwrap()); @@ -710,11 +679,16 @@ mod tests { // test iterator with upper bound let store = new_peer_storage(engines, ®ion); let snap = RegionSnapshot::::new(&store); - let mut iter = snap.iter(IterOptions::new( - None, - Some(KeyBuilder::from_slice(b"a5", DATA_PREFIX_KEY.len(), 0)), - true, - )); + let mut iter = snap + .iter( + CF_DEFAULT, + IterOptions::new( + None, + Some(KeyBuilder::from_slice(b"a5", DATA_PREFIX_KEY.len(), 0)), + true, + ), + ) + .unwrap(); assert!(iter.seek_to_first().unwrap()); let mut res = vec![]; loop { @@ -735,7 +709,7 @@ mod tests { let snap = RegionSnapshot::::new(&store); let mut 
iter_opt = IterOptions::default(); iter_opt.set_lower_bound(b"a3", 1); - let mut iter = snap.iter(iter_opt); + let mut iter = snap.iter(CF_DEFAULT, iter_opt).unwrap(); assert!(iter.seek_to_last().unwrap()); let mut res = vec![]; loop { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 7bcaeb5529b..cca1dfbda77 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2011,7 +2011,7 @@ pub mod tests { pub fn get_kv_count(snap: &impl EngineSnapshot) -> usize { let mut kv_count = 0; for cf in SNAPSHOT_CFS { - snap.scan_cf( + snap.scan( cf, &keys::data_key(b"a"), &keys::data_key(b"z"), diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 2baf191d749..4fb34f15341 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -81,7 +81,7 @@ where }; let mut stats = BuildStatistics::default(); - box_try!(snap.scan_cf(cf, start_key, end_key, false, |key, value| { + box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { stats.key_count += 1; stats.total_size += key.len() + value.len(); box_try!(BytesEncoder::encode_compact_bytes(&mut writer, key)); @@ -133,7 +133,7 @@ where .to_string(); let sst_writer = RefCell::new(create_sst_file_writer::(engine, cf, &path)?); let mut file_length: usize = 0; - box_try!(snap.scan_cf(cf, start_key, end_key, false, |key, value| { + box_try!(snap.scan(cf, start_key, end_key, false, |key, value| { let entry_len = key.len() + value.len(); if file_length + entry_len > raw_size_per_file as usize { cf_file.add_file(file_id); // add previous file @@ -375,7 +375,7 @@ mod tests { // Scan keys from db let mut keys_in_db: HashMap<_, Vec<_>> = HashMap::new(); for cf in SNAPSHOT_CFS { - snap.scan_cf( + snap.scan( cf, &keys::data_key(b"a"), &keys::data_end_key(b"z"), diff --git a/components/raftstore/src/store/worker/split_check.rs 
b/components/raftstore/src/store/worker/split_check.rs index 3822575fb8e..e5dde8a910c 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -98,8 +98,8 @@ where Some(KeyBuilder::from_slice(end_key, 0, 0)), fill_cache, ); - let mut iter = db.iterator_cf_opt(cf, iter_opt)?; - let found: Result = iter.seek(start_key.into()).map_err(|e| box_err!(e)); + let mut iter = db.iterator_opt(cf, iter_opt)?; + let found: Result = iter.seek(start_key).map_err(|e| box_err!(e)); if found? { heap.push(KeyEntry::new( iter.key().to_vec(), diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 586a3999b82..d011f9be93f 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -7,7 +7,7 @@ use std::sync::{ use crossbeam::channel::{unbounded, Receiver}; use engine_rocks::{self, RocksEngine}; -use engine_traits::{Iterable, Iterator, RaftEngine, RaftEngineReadOnly, RaftLogBatch, SeekKey}; +use engine_traits::{Iterable, Iterator, RaftEngine, RaftEngineReadOnly, RaftLogBatch, CF_DEFAULT}; use kvproto::raft_serverpb::RaftLocalState; use protobuf::Message; use raft::eraftpb::Entry; @@ -36,8 +36,8 @@ pub fn dump_raftdb_to_raft_engine(source: &RocksEngine, target: &RaftLogEngine, info!("Start to scan raft log from RocksEngine and dump into RaftLogEngine"); let consumed_time = tikv_util::time::Instant::now(); // Seek all region id from raftdb and send them to workers. 
- let mut it = source.iterator().unwrap(); - let mut valid = it.seek(SeekKey::Key(keys::REGION_RAFT_MIN_KEY)).unwrap(); + let mut it = source.iterator(CF_DEFAULT).unwrap(); + let mut valid = it.seek(keys::REGION_RAFT_MIN_KEY).unwrap(); while valid { match keys::decode_raft_key(it.key()) { Err(e) => { @@ -47,7 +47,7 @@ pub fn dump_raftdb_to_raft_engine(source: &RocksEngine, target: &RaftLogEngine, tx.send(id).unwrap(); count_region += 1; let next_key = keys::raft_log_prefix(id + 1); - valid = it.seek(SeekKey::Key(&next_key)).unwrap(); + valid = it.seek(&next_key).unwrap(); } } } @@ -115,7 +115,7 @@ fn check_raft_engine_is_empty(engine: &RaftLogEngine) { fn check_raft_db_is_empty(engine: &RocksEngine) { let mut count = 0; engine - .scan(b"", &[0xFF, 0xFF], false, |_, _| { + .scan(CF_DEFAULT, b"", &[0xFF, 0xFF], false, |_, _| { count += 1; Ok(false) }) @@ -138,6 +138,7 @@ fn run_dump_raftdb_worker( let mut entries = vec![]; old_engine .scan( + CF_DEFAULT, &keys::raft_log_prefix(id), &keys::raft_log_prefix(id + 1), false, diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index 7c02b058d1e..be93ded1554 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -316,7 +316,8 @@ impl ImportDir { for &(start, end) in TIDB_RANGES_COMPLEMENT { let mut unexpected_data_key = None; - sst_reader.scan(start, end, false, |key, _| { + // No CF in sst. 
+ sst_reader.scan("", start, end, false, |key, _| { unexpected_data_key = Some(key.to_vec()); Ok(false) })?; diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index d1ef399d6d0..1d4e2e916dc 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -15,7 +15,7 @@ use encryption::{encryption_method_to_db_encryption_method, DataKeyManager}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ name_to_cf, util::check_key_in_range, CfName, EncryptionKeyManager, FileEncryptionInfo, - Iterator, KvEngine, SeekKey, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, + Iterator, KvEngine, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; use file_system::{get_io_rate_limiter, OpenOptions}; @@ -554,7 +554,7 @@ impl SstImporter { // must iterate if we perform key rewrite return Ok(None); } - if !iter.seek(SeekKey::Start)? { + if !iter.seek_to_first()? { // the SST is empty, so no need to iterate at all (should be impossible?) return Ok(Some(meta.get_range().clone())); } @@ -566,7 +566,7 @@ impl SstImporter { let start_key = start_key.to_vec(); // seek to end and fetch the last (inclusive) key of the SST. 
- iter.seek(SeekKey::End)?; + iter.seek_to_last()?; let last_key = keys::origin_key(iter.key()); if is_after_end_bound(last_key, &range_end) { // SST's end is after the range to consume @@ -606,8 +606,8 @@ impl SstImporter { let mut first_key = None; match range_start { - Bound::Unbounded => iter.seek(SeekKey::Start)?, - Bound::Included(s) => iter.seek(SeekKey::Key(&keys::data_key(&s)))?, + Bound::Unbounded => iter.seek_to_first()?, + Bound::Included(s) => iter.seek(&keys::data_key(&s))?, Bound::Excluded(_) => unreachable!(), }; // SST writer must not be opened in gRPC threads, because it may be @@ -789,7 +789,7 @@ mod tests { use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, - SeekKey, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, + SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; use file_system::File; use openssl::hash::{Hasher, MessageDigest}; @@ -1333,7 +1333,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1392,7 +1392,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), Some(env)); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1440,7 +1440,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1485,7 +1485,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + 
iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1529,7 +1529,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1605,8 +1605,8 @@ mod tests { assert_eq!(meta_info.total_kvs, 4); // verifies the DB content is correct. - let mut iter = db.iterator_cf(cf).unwrap(); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = db.iterator(cf).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1670,7 +1670,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1714,7 +1714,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1746,8 +1746,8 @@ mod tests { db, ); match &result { - Err(Error::EngineTraits(TraitError::Engine(msg))) if msg.starts_with("Corruption:") => { - } + Err(Error::EngineTraits(TraitError::Engine(s))) + if s.state().starts_with("Corruption:") => {} _ => panic!("unexpected download result: {:?}", result), } } @@ -1849,7 +1849,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1907,7 +1907,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - 
iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ @@ -1962,7 +1962,7 @@ mod tests { let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); let mut iter = sst_reader.iter(); - iter.seek(SeekKey::Start).unwrap(); + iter.seek_to_first().unwrap(); assert_eq!( collect(iter), vec![ diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index bf14b86dfc8..afdcd279e19 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -392,7 +392,7 @@ impl TestSuite { if !end.is_empty() { iter_opt.set_upper_bound(&end, DATA_KEY_PREFIX_LEN); } - let mut iter = snapshot.iter_cf(cf, iter_opt).unwrap(); + let mut iter = snapshot.iter(cf, iter_opt).unwrap(); if !iter.seek(&start).unwrap() { return (0, 0, 0); diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 046d2396382..28112304496 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1260,12 +1260,12 @@ impl Cluster { let mut kv_wb = self.engines[&store_id].kv.write_batch(); self.engines[&store_id] .kv - .scan_cf(CF_RAFT, &meta_start, &meta_end, false, |k, _| { + .scan(CF_RAFT, &meta_start, &meta_end, false, |k, _| { kv_wb.delete(k).unwrap(); Ok(true) }) .unwrap(); - snap.scan_cf(CF_RAFT, &meta_start, &meta_end, false, |k, v| { + snap.scan(CF_RAFT, &meta_start, &meta_end, false, |k, v| { kv_wb.put(k, v).unwrap(); Ok(true) }) @@ -1277,12 +1277,12 @@ impl Cluster { ); self.engines[&store_id] .kv - .scan_cf(CF_RAFT, &raft_start, &raft_end, false, |k, _| { + .scan(CF_RAFT, &raft_start, &raft_end, false, |k, _| { kv_wb.delete(k).unwrap(); Ok(true) }) .unwrap(); - snap.scan_cf(CF_RAFT, &raft_start, &raft_end, false, |k, v| { + snap.scan(CF_RAFT, &raft_start, &raft_end, false, |k, v| { kv_wb.put(k, v).unwrap(); Ok(true) }) diff --git a/components/test_raftstore/src/util.rs 
b/components/test_raftstore/src/util.rs index 12ca8f9a867..1769ecc4154 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -107,7 +107,7 @@ pub fn must_region_cleared(engine: &Engines, region for cf in ALL_CFS { engine .kv - .scan_cf(cf, &start_key, &end_key, false, |k, v| { + .scan(cf, &start_key, &end_key, false, |k, v| { panic!( "[region {}] unexpected ({:?}, {:?}) in cf {:?}", id, k, v, cf diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 36a8aa58849..9557f945034 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -246,11 +246,8 @@ impl Snapshot for BTreeEngineSnapshot { fn get_cf_opt(&self, _: ReadOptions, cf: CfName, key: &Key) -> EngineResult> { self.get_cf(cf, key) } - fn iter(&self, iter_opt: IterOptions) -> EngineResult { - self.iter_cf(CF_DEFAULT, iter_opt) - } #[inline] - fn iter_cf(&self, cf: CfName, iter_opt: IterOptions) -> EngineResult { + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> EngineResult { let tree = self.inner_engine.get_cf(cf); Ok(BTreeEngineIterator::new(tree, iter_opt)) } @@ -341,13 +338,21 @@ pub mod tests { let mut iter_op = IterOptions::default(); iter_op.set_lower_bound(b"a7", 0); iter_op.set_upper_bound(b"a3", 0); - let mut cursor = Cursor::new(snap.iter(iter_op).unwrap(), ScanMode::Forward, false); + let mut cursor = Cursor::new( + snap.iter(CF_DEFAULT, iter_op).unwrap(), + ScanMode::Forward, + false, + ); assert!(!cursor.seek(&Key::from_raw(b"a5"), &mut statistics).unwrap()); let mut iter_op = IterOptions::default(); iter_op.set_lower_bound(b"a3", 0); iter_op.set_upper_bound(b"a7", 0); - let mut cursor = Cursor::new(snap.iter(iter_op).unwrap(), ScanMode::Forward, false); + let mut cursor = Cursor::new( + snap.iter(CF_DEFAULT, iter_op).unwrap(), + ScanMode::Forward, + false, + ); assert!(cursor.seek(&Key::from_raw(b"a5"), &mut statistics).unwrap()); 
assert!(!cursor.seek(&Key::from_raw(b"a8"), &mut statistics).unwrap()); diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index de29583444c..923a1878a42 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -561,7 +561,7 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { iter_opt.set_prefix_same_as_start(true); } Ok(Cursor::new( - self.snapshot.iter_cf(self.cf, iter_opt)?, + self.snapshot.iter(self.cf, iter_opt)?, self.scan_mode, self.prefix_seek, )) @@ -637,7 +637,7 @@ mod tests { let mut iter_opt = IterOptions::default(); iter_opt.use_prefix_seek(); iter_opt.set_prefix_same_as_start(true); - let it = snap.iter(iter_opt); + let it = snap.iter(CF_DEFAULT, iter_opt).unwrap(); let mut iter = Cursor::new(it, ScanMode::Mixed, true); assert!( @@ -677,7 +677,7 @@ mod tests { let snap = RegionSnapshot::::from_raw(engines.kv.clone(), region); let mut statistics = CfStatistics::default(); - let it = snap.iter(IterOptions::default()); + let it = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); let mut iter = Cursor::new(it, ScanMode::Mixed, false); assert!( !iter @@ -735,7 +735,7 @@ mod tests { let mut region = Region::default(); region.mut_peers().push(Peer::default()); let snap = RegionSnapshot::::from_raw(engines.kv, region); - let it = snap.iter(IterOptions::default()); + let it = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); let mut iter = Cursor::new(it, ScanMode::Mixed, false); assert!( !iter diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index adb04fc25cd..1d66f11ad74 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -359,8 +359,7 @@ pub trait Snapshot: Sync + Send + Clone { /// Get the value associated with `key` in `cf` column family, with Options in `opts` fn get_cf_opt(&self, opts: ReadOptions, cf: CfName, key: &Key) -> Result>; - fn iter(&self, iter_opt: IterOptions) -> Result; - fn iter_cf(&self, cf: CfName, iter_opt: 
IterOptions) -> Result; + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result; // The minimum key this snapshot can retrieve. #[inline] fn lower_bound(&self) -> Option<&[u8]> { @@ -706,7 +705,7 @@ pub mod tests { fn assert_seek(engine: &E, key: &[u8], pair: (&[u8], &[u8])) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -719,7 +718,7 @@ pub mod tests { fn assert_reverse_seek(engine: &E, key: &[u8], pair: (&[u8], &[u8])) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -817,7 +816,7 @@ pub mod tests { assert_reverse_seek(engine, b"z", (b"x", b"1")); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut iter = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -841,7 +840,7 @@ pub mod tests { must_put(engine, b"z", b"2"); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -864,7 +863,7 @@ pub mod tests { } let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -882,7 +881,7 @@ pub mod tests { fn test_empty_seek(engine: &E) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), 
ScanMode::Mixed, false, ); @@ -954,9 +953,16 @@ pub mod tests { start_idx: usize, step: usize, ) { - let mut cursor = Cursor::new(snapshot.iter(IterOptions::default()).unwrap(), mode, false); - let mut near_cursor = - Cursor::new(snapshot.iter(IterOptions::default()).unwrap(), mode, false); + let mut cursor = Cursor::new( + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), + mode, + false, + ); + let mut near_cursor = Cursor::new( + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), + mode, + false, + ); let limit = (SEEK_BOUND as usize * 10 + 50 - 1) * 2; for (_, mut i) in (start_idx..(SEEK_BOUND as usize * 30)) @@ -1092,7 +1098,7 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut iter = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Forward, false, ); diff --git a/components/tikv_kv/src/raftstore_impls.rs b/components/tikv_kv/src/raftstore_impls.rs index d93ddaf236c..c1384bdcd45 100644 --- a/components/tikv_kv/src/raftstore_impls.rs +++ b/components/tikv_kv/src/raftstore_impls.rs @@ -85,18 +85,11 @@ impl EngineSnapshot for RegionSnapshot { Ok(v.map(|v| v.to_vec())) } - fn iter(&self, iter_opt: IterOptions) -> kv::Result { + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> kv::Result { fail_point!("raftkv_snapshot_iter", |_| Err(box_err!( - "injected error for iter" - ))); - Ok(RegionSnapshot::iter(self, iter_opt)) - } - - fn iter_cf(&self, cf: CfName, iter_opt: IterOptions) -> kv::Result { - fail_point!("raftkv_snapshot_iter_cf", |_| Err(box_err!( "injected error for iter_cf" ))); - RegionSnapshot::iter_cf(self, cf, iter_opt).map_err(kv::Error::from) + RegionSnapshot::iter(self, cf, iter_opt).map_err(kv::Error::from) } #[inline] diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index ee220f7e31a..50059433553 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ 
b/components/tikv_kv/src/rocksdb_engine.rs @@ -15,7 +15,7 @@ use engine_rocks::{ RocksEngineIterator, }; use engine_traits::{ - CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, SeekKey, + CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, }; use file_system::IORateLimiter; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb}; @@ -287,14 +287,9 @@ impl Snapshot for Arc { Ok(v.map(|v| v.to_vec())) } - fn iter(&self, iter_opt: IterOptions) -> Result { - trace!("RocksSnapshot: create iterator"); - Ok(self.iterator_opt(iter_opt)?) - } - - fn iter_cf(&self, cf: CfName, iter_opt: IterOptions) -> Result { + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result { trace!("RocksSnapshot: create cf iterator"); - Ok(self.iterator_cf_opt(cf, iter_opt)?) + Ok(self.iterator_opt(cf, iter_opt)?) } fn ext(&self) -> DummySnapshotExt { @@ -312,19 +307,19 @@ impl EngineIterator for RocksEngineIterator { } fn seek(&mut self, key: &Key) -> Result { - Iterator::seek(self, key.as_encoded().as_slice().into()).map_err(Error::from) + Iterator::seek(self, key.as_encoded()).map_err(Error::from) } fn seek_for_prev(&mut self, key: &Key) -> Result { - Iterator::seek_for_prev(self, key.as_encoded().as_slice().into()).map_err(Error::from) + Iterator::seek_for_prev(self, key.as_encoded()).map_err(Error::from) } fn seek_to_first(&mut self) -> Result { - Iterator::seek(self, SeekKey::Start).map_err(Error::from) + Iterator::seek_to_first(self).map_err(Error::from) } fn seek_to_last(&mut self) -> Result { - Iterator::seek(self, SeekKey::End).map_err(Error::from) + Iterator::seek_to_last(self).map_err(Error::from) } fn valid(&self) -> Result { diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index aa981603d17..6982c66b67a 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -947,8 +947,7 @@ securityfs /sys/kernel/security securityfs 
rw,nosuid,nodev,noexec,relatime 0 0 #[test] fn test_check_data_dir() { // test invalid data_path - let ret = check_data_dir("/sys/invalid", "/proc/mounts"); - assert!(ret.is_err()); + check_data_dir("/sys/invalid", "/proc/mounts").unwrap_err(); // get real path's fs_info let tmp_dir = Builder::new() .prefix("test-check-data-dir") @@ -959,13 +958,15 @@ securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0 let fs_info = get_fs_info(&data_path, "/proc/mounts").unwrap(); // data_path may not mounted on a normal device on container - if !fs_info.fsname.starts_with("/dev") { + // /proc/mounts may contain host's device, which is not accessible in container. + if Path::new("/.dockerenv").exists() + && (!fs_info.fsname.starts_with("/dev") || !Path::new(&fs_info.fsname).exists()) + { return; } // test with real path - let ret = check_data_dir(&data_path, "/proc/mounts"); - assert!(ret.is_ok()); + check_data_dir(&data_path, "/proc/mounts").unwrap(); // test with device mapper // get real_path's rotational info @@ -985,8 +986,7 @@ securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0 let mnt_file = format!("{}/mnt.txt", tmp_dir.path().display()); create_file(&mnt_file, mninfo.as_bytes()); // check info - let res = check_data_dir(&data_path, &mnt_file); - assert!(res.is_ok()); + check_data_dir(&data_path, &mnt_file).unwrap(); // check rotational info let get = get_rotational_info(&tmp_device).unwrap(); assert_eq!(expect, get); diff --git a/src/config.rs b/src/config.rs index ebf1e132777..0df2e2a2101 100644 --- a/src/config.rs +++ b/src/config.rs @@ -37,8 +37,8 @@ use engine_rocks::{ DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptionsExt, TabletAccessor, - TabletErrorCollector, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as _, DBOptionsExt, + 
TabletAccessor, TabletErrorCollector, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::IORateLimiter; use keys::region_raft_prefix_len; @@ -1597,14 +1597,11 @@ impl> DBConfigManger { let mut error_collector = TabletErrorCollector::new(); self.tablet_accessor .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let r = db.get_options_cf(cf); - if let Ok(opt) = r { - let r = opt.set_block_cache_capacity(size.0); - if let Err(r) = r { - error_collector.add_result(region_id, suffix, Err(r.into())); - } - } else if let Err(r) = r { - error_collector.add_result(region_id, suffix, Err(r)); + let r = db + .get_options_cf(cf) + .and_then(|opt| opt.set_block_cache_capacity(size.0)); + if r.is_err() { + error_collector.add_result(region_id, suffix, r); } }); // Write config to metric @@ -1618,10 +1615,10 @@ impl> DBConfigManger { let mut error_collector = TabletErrorCollector::new(); self.tablet_accessor .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.as_inner().get_db_options(); + let mut opt = db.get_db_options(); let r = opt.set_rate_bytes_per_sec(rate_bytes_per_sec); - if let Err(r) = r { - error_collector.add_result(region_id, suffix, Err(r.into())); + if r.is_err() { + error_collector.add_result(region_id, suffix, r); } }); error_collector.take_result() @@ -1634,20 +1631,24 @@ impl> DBConfigManger { let mut error_collector = TabletErrorCollector::new(); self.tablet_accessor .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.as_inner().get_db_options(); - let r = opt.set_auto_tuned(rate_limiter_auto_tuned); - if let Err(r) = r { - error_collector.add_result(region_id, suffix, Err(r.into())); + let mut opt = db.get_db_options(); + let r = opt.set_rate_limiter_auto_tuned(rate_limiter_auto_tuned); + if r.is_err() { + error_collector.add_result(region_id, suffix, r); } else { // double check the new state - let new_auto_tuned = opt.get_auto_tuned(); + let new_auto_tuned 
= opt.get_rate_limiter_auto_tuned(); if new_auto_tuned.is_none() || new_auto_tuned.unwrap() != rate_limiter_auto_tuned { error_collector.add_result( region_id, suffix, - Err("fail to set rate_limiter_auto_tuned".to_string().into()), + Err(engine_traits::Status::with_error( + engine_traits::Code::IoError, + "fail to set rate_limiter_auto_tuned", + ) + .into()), ); } } diff --git a/src/import/duplicate_detect.rs b/src/import/duplicate_detect.rs index f3277f3f3ef..3ae9360e727 100644 --- a/src/import/duplicate_detect.rs +++ b/src/import/duplicate_detect.rs @@ -40,9 +40,7 @@ impl DuplicateDetector { }); let mut iter_opt = IterOptions::new(Some(l_bound), u_bound, false); iter_opt.set_key_only(key_only); - let mut iter = snapshot - .iter_cf(CF_WRITE, iter_opt) - .map_err(from_kv_error)?; + let mut iter = snapshot.iter(CF_WRITE, iter_opt).map_err(from_kv_error)?; iter.seek(&start_key).map_err(from_kv_error)?; Ok(DuplicateDetector { snapshot, diff --git a/src/server/debug.rs b/src/server/debug.rs index d10f58cc2ad..93732c9c580 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -17,8 +17,8 @@ use engine_rocks::{ }; use engine_traits::{ Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, - RaftEngine, Range, RangePropertiesExt, SeekKey, SyncMutable, WriteBatch, WriteBatchExt, - WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + RaftEngine, Range, RangePropertiesExt, SyncMutable, WriteBatch, WriteBatchExt, WriteOptions, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ debugpb::{self, Db as DBType}, @@ -155,7 +155,7 @@ impl Debugger { let start_key = keys::REGION_META_MIN_KEY; let end_key = keys::REGION_META_MAX_KEY; let mut regions = Vec::with_capacity(128); - box_try!(db.scan_cf(cf, start_key, end_key, false, |key, _| { + box_try!(db.scan(cf, start_key, end_key, false, |key, _| { let (id, suffix) = box_try!(keys::decode_region_meta_key(key)); if suffix != keys::REGION_STATE_SUFFIX { return Ok(true); @@ 
-241,7 +241,7 @@ impl Debugger { let mut sizes = vec![]; for cf in cfs { let mut size = 0; - box_try!(self.engines.kv.scan_cf( + box_try!(self.engines.kv.scan( cf.as_ref(), start_key, end_key, @@ -273,7 +273,7 @@ impl Debugger { MvccInfoIterator::new( |cf, opts| { let kv = &self.engines.kv; - kv.iterator_cf_opt(cf, opts).map_err(|e| box_err!(e)) + kv.iterator_opt(cf, opts).map_err(|e| box_err!(e)) }, if start.is_empty() { None } else { Some(start) }, if end.is_empty() { None } else { Some(end) }, @@ -298,7 +298,7 @@ impl Debugger { }; let iter_opt = IterOptions::new(Some(KeyBuilder::from_vec(start.to_vec(), 0, 0)), end, false); - let mut iter = box_try!(db.iterator_cf_opt(cf, iter_opt)); + let mut iter = box_try!(db.iterator_opt(cf, iter_opt)); if !iter.seek_to_first().unwrap() { return Ok(vec![]); } @@ -496,8 +496,8 @@ impl Debugger { Some(KeyBuilder::from_vec(to, 0, 0)), false, ); - let mut iter = box_try!(self.engines.kv.iterator_cf_opt(CF_RAFT, readopts)); - iter.seek(SeekKey::from(from.as_ref())).unwrap(); + let mut iter = box_try!(self.engines.kv.iterator_opt(CF_RAFT, readopts)); + iter.seek(&from).unwrap(); let fake_snap_worker = Worker::new("fake-snap-worker").lazy_build("fake-snap"); let fake_raftlog_fetch_worker = @@ -659,7 +659,7 @@ impl Debugger { } } } else { - box_try!(self.engines.kv.scan_cf( + box_try!(self.engines.kv.scan( CF_RAFT, keys::REGION_META_MIN_KEY, keys::REGION_META_MAX_KEY, @@ -759,7 +759,7 @@ impl Debugger { return Err(box_err!("Bad region: {:?}", region)); } - box_try!(self.engines.kv.scan_cf( + box_try!(self.engines.kv.scan( CF_RAFT, keys::REGION_META_MIN_KEY, keys::REGION_META_MAX_KEY, @@ -1061,8 +1061,8 @@ impl MvccChecker { Some(KeyBuilder::from_vec(to, 0, 0)), false, ); - let mut iter = box_try!(db.c().iterator_cf_opt(cf, readopts)); - iter.seek(SeekKey::Start).unwrap(); + let mut iter = box_try!(db.c().iterator_opt(cf, readopts)); + iter.seek_to_first().unwrap(); Ok(iter) }; diff --git a/src/server/engine_factory.rs 
b/src/server/engine_factory.rs index 0de26bc43c4..421c0c0f8ba 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -247,7 +247,7 @@ impl TabletFactory for KvEngineFactory { Ok(()) } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { if let Ok(db) = self.inner.root_db.lock() { let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity)?; diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 2dfe297e5d8..e5237187886 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -160,7 +160,7 @@ impl TabletFactory for KvEngineFactoryV2 { new_engine } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> std::result::Result<(), String> { + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 7d233430f70..8d914080279 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -467,7 +467,10 @@ impl WriteCompactionFilter { let _io_type_guard = WithIOType::new(IOType::Gc); fail_point!("write_compaction_filter_flush_write_batch", true, |_| { Err(engine_traits::Error::Engine( - "Ingested fail point".to_string(), + engine_traits::Status::with_error( + engine_traits::Code::IoError, + "Ingested fail point", + ), )) }); wb.write_opt(wopts) diff --git a/src/server/node.rs b/src/server/node.rs index dfed9459b1c..eb2cc72e432 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -304,7 +304,7 @@ where for cf in DATA_CFS { for (start, end) in TIDB_RANGES_COMPLEMENT { let mut 
unexpected_data_key = None; - snapshot.scan_cf( + snapshot.scan( cf, &keys::data_key(start), &keys::data_key(end), diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index 1a7443f6d08..94e3e38900d 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -8,8 +8,8 @@ use std::{ use engine_rocks::{RocksEngine, RocksEngineIterator, RocksWriteBatchVec}; use engine_traits::{ - IterOptions, Iterable, Iterator, Mutable, SeekKey, WriteBatch, WriteBatchExt, CF_DEFAULT, - CF_LOCK, CF_WRITE, + IterOptions, Iterable, Iterator, Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_LOCK, + CF_WRITE, }; use tikv_util::sys::thread::StdThreadBuildWrapper; use txn_types::{Key, TimeStamp, Write, WriteRef}; @@ -71,8 +71,8 @@ impl ResetToVersionWorker { .lock() .expect("failed to lock `state` in `ResetToVersionWorker::new`") = ResetToVersionState::RemovingWrite { scanned: 0 }; - write_iter.seek(SeekKey::Start).unwrap(); - lock_iter.seek(SeekKey::Start).unwrap(); + write_iter.seek_to_first().unwrap(); + lock_iter.seek_to_first().unwrap(); Self { write_iter, lock_iter, @@ -207,9 +207,9 @@ impl ResetToVersionManager { let readopts = IterOptions::new(None, None, false); let write_iter = self .engine - .iterator_cf_opt(CF_WRITE, readopts.clone()) + .iterator_opt(CF_WRITE, readopts.clone()) .unwrap(); - let lock_iter = self.engine.iterator_cf_opt(CF_LOCK, readopts).unwrap(); + let lock_iter = self.engine.iterator_opt(CF_LOCK, readopts).unwrap(); let mut worker = ResetToVersionWorker::new(write_iter, lock_iter, ts, self.state.clone()); let mut wb = self.engine.write_batch(); let props = tikv_util::thread_group::current_properties(); @@ -352,9 +352,9 @@ mod tests { let readopts = IterOptions::new(None, None, false); let mut write_iter = fake_engine .c() - .iterator_cf_opt(CF_WRITE, readopts.clone()) + .iterator_opt(CF_WRITE, readopts.clone()) .unwrap(); - write_iter.seek(SeekKey::Start).unwrap(); + write_iter.seek_to_first().unwrap(); let mut 
remaining_writes = vec![]; while write_iter.valid().unwrap() { let write = WriteRef::parse(write_iter.value()).unwrap().to_owned(); @@ -364,9 +364,9 @@ mod tests { } let mut default_iter = fake_engine .c() - .iterator_cf_opt(CF_DEFAULT, readopts.clone()) + .iterator_opt(CF_DEFAULT, readopts.clone()) .unwrap(); - default_iter.seek(SeekKey::Start).unwrap(); + default_iter.seek_to_first().unwrap(); let mut remaining_defaults = vec![]; while default_iter.valid().unwrap() { let key = default_iter.key().to_vec(); @@ -375,8 +375,8 @@ mod tests { remaining_defaults.push((key, value)); } - let mut lock_iter = fake_engine.c().iterator_cf_opt(CF_LOCK, readopts).unwrap(); - lock_iter.seek(SeekKey::Start).unwrap(); + let mut lock_iter = fake_engine.c().iterator_opt(CF_LOCK, readopts).unwrap(); + lock_iter.seek_to_first().unwrap(); let mut remaining_locks = vec![]; while lock_iter.valid().unwrap() { let lock = Lock::parse(lock_iter.value()).unwrap().to_owned(); diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index 94d750a20f7..d5c1180ddf0 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -232,7 +232,11 @@ mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut iter_opt = IterOptions::default(); iter_opt.set_max_skippable_internal_keys(1); - let mut iter = Cursor::new(snapshot.iter(iter_opt).unwrap(), ScanMode::Forward, false); + let mut iter = Cursor::new( + snapshot.iter(CF_DEFAULT, iter_opt).unwrap(), + ScanMode::Forward, + false, + ); let mut statistics = CfStatistics::default(); let res = iter.seek(&Key::from_raw(b"foo"), &mut statistics); @@ -258,7 +262,7 @@ mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut iter = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Forward, false, ); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 
0864c9edd2d..a43b5270875 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2728,16 +2728,12 @@ impl Snapshot for TxnTestSnapshot { self.snapshot.get_cf_opt(opts, cf, key) } - fn iter(&self, iter_opt: engine_traits::IterOptions) -> tikv_kv::Result { - self.snapshot.iter(iter_opt) - } - - fn iter_cf( + fn iter( &self, cf: CfName, iter_opt: engine_traits::IterOptions, ) -> tikv_kv::Result { - self.snapshot.iter_cf(cf, iter_opt) + self.snapshot.iter(cf, iter_opt) } fn ext(&self) -> Self::Ext<'_> { diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index f60147d9991..7881eb45903 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -12,8 +12,8 @@ use std::{ }; use engine_traits::{ - IterOptions, Iterable, Iterator as EngineIterator, KvEngine, Peekable, SeekKey, CF_DEFAULT, - CF_LOCK, CF_RAFT, CF_WRITE, + IterOptions, Iterable, Iterator as EngineIterator, KvEngine, Peekable, CF_DEFAULT, CF_LOCK, + CF_RAFT, CF_WRITE, }; use kvproto::kvrpcpb::{MvccInfo, MvccLock, MvccValue, MvccWrite, Op}; use raftstore::{ @@ -105,7 +105,7 @@ impl ConsistencyCheckObserver for Mvcc { } let mut scanner = MvccInfoScanner::new( - |cf, opts| snap.iterator_cf_opt(cf, opts).map_err(|e| box_err!(e)), + |cf, opts| snap.iterator_opt(cf, opts).map_err(|e| box_err!(e)), Some(&keys::data_key(region.get_start_key())), Some(&keys::data_end_key(region.get_end_key())), MvccChecksum::new(safe_point), @@ -162,7 +162,7 @@ impl MvccInfoScanner { let iter_opts = IterOptions::new(key_builder(from)?, key_builder(to)?, false); let gen_iter = |cf: &str| -> Result { let mut iter = f(cf, iter_opts.clone())?; - box_try!(iter.seek(SeekKey::Key(from))); + box_try!(iter.seek(from)); Ok(iter) }; @@ -464,7 +464,7 @@ mod tests { for &safe_point in &[150, 160, 100] { let raw = engine.get_rocksdb(); let mut scanner = MvccInfoScanner::new( - |cf, opts| raw.iterator_cf_opt(cf, opts).map_err(|e| box_err!(e)), + |cf, opts| 
raw.iterator_opt(cf, opts).map_err(|e| box_err!(e)), Some(&keys::data_key(b"")), Some(&keys::data_end_key(b"")), MvccChecksum::new(safe_point), @@ -556,7 +556,7 @@ mod tests { let scan_mvcc = |start: &[u8], end: &[u8], limit: u64| { MvccInfoIterator::new( - |cf, opts| engine.iterator_cf_opt(cf, opts).map_err(|e| box_err!(e)), + |cf, opts| engine.iterator_opt(cf, opts).map_err(|e| box_err!(e)), if start.is_empty() { None } else { Some(start) }, if end.is_empty() { None } else { Some(end) }, limit as usize, diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 4f36599b2f6..17b02c28ec9 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -972,7 +972,7 @@ pub mod tests { iopt.set_hint_min_ts(min); iopt.set_hint_max_ts(max); - let mut iter = snap.iter_cf(CF_WRITE, iopt).unwrap(); + let mut iter = snap.iter(CF_WRITE, iopt).unwrap(); for (i, expect_ts) in res.iter().enumerate() { if i == 0 { @@ -1020,7 +1020,7 @@ pub mod tests { iopt.set_hint_max_ts(Bound::Included(6)); let snap = RegionSnapshot::::from_raw(db.c().clone(), region); - let mut iter = snap.iter_cf(CF_WRITE, iopt).unwrap(); + let mut iter = snap.iter(CF_WRITE, iopt).unwrap(); // Must not omit the latest deletion of key1 to prevent seeing outdated record. 
assert_eq!(iter.seek_to_first().unwrap(), true); diff --git a/src/storage/raw/encoded.rs b/src/storage/raw/encoded.rs index b9b25015891..788d9a7ed02 100644 --- a/src/storage/raw/encoded.rs +++ b/src/storage/raw/encoded.rs @@ -75,16 +75,9 @@ impl Snapshot for RawEncodeSnapshot { self.map_value(self.snap.get_cf_opt(opts, cf, key)) } - fn iter(&self, iter_opt: IterOptions) -> Result { + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result { Ok(RawEncodeIterator::new( - self.snap.iter(iter_opt)?, - self.current_ts, - )) - } - - fn iter_cf(&self, cf: CfName, iter_opt: IterOptions) -> Result { - Ok(RawEncodeIterator::new( - self.snap.iter_cf(cf, iter_opt)?, + self.snap.iter(cf, iter_opt)?, self.current_ts, )) } diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 4212b1c56ef..4ddfa68a757 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{CfName, IterOptions, ReadOptions, DATA_KEY_PREFIX_LEN}; +use engine_traits::{CfName, IterOptions, ReadOptions, CF_DEFAULT, DATA_KEY_PREFIX_LEN}; use txn_types::{Key, TimeStamp, Value}; use crate::storage::kv::{Error, ErrorInner, Iterator, Result, Snapshot}; @@ -19,7 +19,7 @@ impl RawMvccSnapshot { pub fn seek_first_key_value_cf( &self, - cf: Option, + cf: CfName, opts: Option, key: &Key, ) -> Result> { @@ -29,10 +29,7 @@ impl RawMvccSnapshot { iter_opt.set_prefix_same_as_start(true); let upper_bound = key.clone().append_ts(TimeStamp::zero()).into_encoded(); iter_opt.set_vec_upper_bound(upper_bound, DATA_KEY_PREFIX_LEN); - let mut iter = match cf { - Some(cf_name) => self.iter_cf(cf_name, iter_opt)?, - None => self.iter(iter_opt)?, - }; + let mut iter = self.iter(cf, iter_opt)?; if iter.seek(key)? 
{ Ok(Some(iter.value().to_owned())) } else { @@ -46,23 +43,19 @@ impl Snapshot for RawMvccSnapshot { type Ext<'a> = S::Ext<'a> where S: 'a; fn get(&self, key: &Key) -> Result> { - self.seek_first_key_value_cf(None, None, key) + self.seek_first_key_value_cf(CF_DEFAULT, None, key) } fn get_cf(&self, cf: CfName, key: &Key) -> Result> { - self.seek_first_key_value_cf(Some(cf), None, key) + self.seek_first_key_value_cf(cf, None, key) } fn get_cf_opt(&self, opts: ReadOptions, cf: CfName, key: &Key) -> Result> { - self.seek_first_key_value_cf(Some(cf), Some(opts), key) - } - - fn iter(&self, iter_opt: IterOptions) -> Result { - Ok(RawMvccIterator::new(self.snap.iter(iter_opt)?)) + self.seek_first_key_value_cf(cf, Some(opts), key) } - fn iter_cf(&self, cf: CfName, iter_opt: IterOptions) -> Result { - Ok(RawMvccIterator::new(self.snap.iter_cf(cf, iter_opt)?)) + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result { + Ok(RawMvccIterator::new(self.snap.iter(cf, iter_opt)?)) } #[inline] @@ -315,7 +308,7 @@ mod tests { // seek let iter_opt = IterOptions::default(); - let mut iter = encode_snapshot.iter_cf(CF_DEFAULT, iter_opt).unwrap(); + let mut iter = encode_snapshot.iter(CF_DEFAULT, iter_opt).unwrap(); let mut pairs = vec![]; let raw_key = ApiV2::encode_raw_key_owned(b"r\0a".to_vec(), None); iter.seek(&raw_key).unwrap(); diff --git a/src/storage/raw/store.rs b/src/storage/raw/store.rs index b5b901d77a0..5caad0dfbb6 100644 --- a/src/storage/raw/store.rs +++ b/src/storage/raw/store.rs @@ -197,7 +197,7 @@ impl<'a, S: Snapshot, F: KvFormat> RawStoreInner { if limit == 0 { return Ok(vec![]); } - let mut cursor = Cursor::new(self.snapshot.iter_cf(cf, option)?, ScanMode::Forward, false); + let mut cursor = Cursor::new(self.snapshot.iter(cf, option)?, ScanMode::Forward, false); let statistics = statistics.mut_cf_statistics(cf); if !cursor.seek(start_key, statistics)? 
{ return Ok(vec![]); @@ -248,11 +248,7 @@ impl<'a, S: Snapshot, F: KvFormat> RawStoreInner { if limit == 0 { return Ok(vec![]); } - let mut cursor = Cursor::new( - self.snapshot.iter_cf(cf, option)?, - ScanMode::Backward, - false, - ); + let mut cursor = Cursor::new(self.snapshot.iter(cf, option)?, ScanMode::Backward, false); let statistics = statistics.mut_cf_statistics(cf); if !cursor.reverse_seek(start_key, statistics)? { return Ok(vec![]); @@ -303,8 +299,7 @@ impl<'a, S: Snapshot, F: KvFormat> RawStoreInner { let cf_stats = stats.mut_cf_statistics(cf); let mut opts = IterOptions::new(None, None, false); opts.set_upper_bound(r.get_end_key(), DATA_KEY_PREFIX_LEN); - let mut cursor = - Cursor::new(self.snapshot.iter_cf(cf, opts)?, ScanMode::Forward, false); + let mut cursor = Cursor::new(self.snapshot.iter(cf, opts)?, ScanMode::Forward, false); cursor.seek(&Key::from_encoded(r.get_start_key().to_vec()), cf_stats)?; while cursor.valid()? { row_count += 1; diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index cfe8f68c512..dd9e451e883 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1452,10 +1452,7 @@ mod tests { fn get_cf_opt(&self, _: ReadOptions, _: CfName, _: &Key) -> Result> { unimplemented!() } - fn iter(&self, _: IterOptions) -> Result { - unimplemented!() - } - fn iter_cf(&self, _: CfName, _: IterOptions) -> Result { + fn iter(&self, _: CfName, _: IterOptions) -> Result { unimplemented!() } fn ext(&self) -> MockSnapshotExt { diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 5ba658ff062..59f9f077aa2 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -812,10 +812,7 @@ mod tests { fn get_cf_opt(&self, _: ReadOptions, _: CfName, _: &Key) -> EngineResult> { Ok(None) } - fn iter(&self, _: IterOptions) -> EngineResult { - Ok(MockRangeSnapshotIter::default()) - } - fn iter_cf(&self, _: CfName, _: IterOptions) -> EngineResult { + fn 
iter(&self, _: CfName, _: IterOptions) -> EngineResult { Ok(MockRangeSnapshotIter::default()) } fn lower_bound(&self) -> Option<&[u8]> { diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index 83180d8156d..ab11b7039fd 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use kvproto::{kvrpcpb::Op, metapb::Peer}; use pd_client::PdClient; @@ -314,15 +314,15 @@ fn test_stale_read_while_applying_snapshot() { // Compact logs to force requesting snapshot after clearing send filters. let gc_limit = cluster.cfg.raft_store.raft_log_gc_count_limit(); - let state = cluster.truncated_state(1, 1); - for i in 1..gc_limit * 10 { + for i in 1..gc_limit * 2 { let (k, v) = ( format!("k{}", i).into_bytes(), format!("v{}", i).into_bytes(), ); leader_client.must_kv_write(&pd_client, vec![new_mutation(Op::Put, &k, &v)], k); } - cluster.wait_log_truncated(1, 1, state.get_index() + 5 * gc_limit); + let last_index_on_store_2 = cluster.raft_local_state(1, 2).last_index; + cluster.wait_log_truncated(1, 1, last_index_on_store_2 + 1); // Pasuse before applying snapshot is finish let raft_before_applying_snap_finished = "raft_before_applying_snap_finished"; @@ -330,7 +330,7 @@ fn test_stale_read_while_applying_snapshot() { cluster.clear_send_filters(); // Wait follower 2 start applying snapshot - cluster.wait_log_truncated(1, 2, state.get_index() + 5 * gc_limit); + cluster.wait_log_truncated(1, 2, last_index_on_store_2 + 1); sleep_ms(100); // We can't read while applying snapshot and the `safe_ts` should reset to 0 @@ -346,6 +346,9 @@ fn test_stale_read_while_applying_snapshot() { // Resume applying snapshot fail::remove(raft_before_applying_snap_finished); + let last_index_on_store_1 = cluster.raft_local_state(1, 
1).last_index; + cluster.wait_last_index(1, 2, last_index_on_store_1, Duration::from_secs(3)); + // We can read `key1` after applied snapshot follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), k1_commit_ts); // There is still lock on the region, we can't read `key1` with the newest ts diff --git a/tests/failpoints/cases/test_ttl.rs b/tests/failpoints/cases/test_ttl.rs index 9e6a8a3bcde..4748b1d0bbf 100644 --- a/tests/failpoints/cases/test_ttl.rs +++ b/tests/failpoints/cases/test_ttl.rs @@ -349,7 +349,7 @@ fn test_ttl_iterator_impl() { let snapshot = engine.snapshot(SnapContext::default()).unwrap(); let ttl_snapshot = RawEncodeSnapshot::<_, F>::from_snapshot(snapshot); let mut iter = ttl_snapshot - .iter(IterOptions::new(None, None, false)) + .iter(CF_DEFAULT, IterOptions::new(None, None, false)) .unwrap(); iter.seek_to_first().unwrap(); assert_eq!(iter.key(), b"r\0key1"); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index e7901bf9bf4..2d6657e5a90 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -8,7 +8,7 @@ use std::{ }; use engine_rocks::Compat; -use engine_traits::{Iterable, Peekable, CF_WRITE}; +use engine_traits::{Iterable, Peekable, CF_DEFAULT, CF_WRITE}; use keys::data_key; use kvproto::{metapb, pdpb, raft_cmdpb::*, raft_serverpb::RaftMessage}; use pd_client::PdClient; @@ -202,11 +202,17 @@ fn test_auto_split_region(cluster: &mut Cluster) { let mut size = 0; cluster.engines[&store_id] .kv - .scan(&data_key(b""), &data_key(middle_key), false, |k, v| { - size += k.len() as u64; - size += v.len() as u64; - Ok(true) - }) + .scan( + CF_DEFAULT, + &data_key(b""), + &data_key(middle_key), + false, + |k, v| { + size += k.len() as u64; + size += v.len() as u64; + Ok(true) + }, + ) .expect(""); assert!(size <= REGION_SPLIT_SIZE); // although size may be smaller than REGION_SPLIT_SIZE, but the diff should 
diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 158223d9a2c..18a1e5a96ca 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -50,7 +50,7 @@ fn test_tombstone(cluster: &mut Cluster) { for cf in engine_2.cf_names() { engine_2 .c() - .scan_cf(cf, b"", &[0xFF], false, |k, v| { + .scan(cf, b"", &[0xFF], false, |k, v| { existing_kvs.push((k.to_vec(), v.to_vec())); Ok(true) }) diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 5e41e3c2789..4f48cb72920 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -385,20 +385,7 @@ fn assert_none_cf(ctx: SnapContext<'_>, engine: &E, cf: CfName, key: assert_eq!(snapshot.get_cf(cf, &Key::from_raw(key)).unwrap(), None); } -fn assert_seek(ctx: SnapContext<'_>, engine: &E, key: &[u8], pair: (&[u8], &[u8])) { - let snapshot = engine.snapshot(ctx).unwrap(); - let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), - ScanMode::Mixed, - false, - ); - let mut statistics = CfStatistics::default(); - cursor.seek(&Key::from_raw(key), &mut statistics).unwrap(); - assert_eq!(cursor.key(&mut statistics), &*bytes::encode_bytes(pair.0)); - assert_eq!(cursor.value(&mut statistics), pair.1); -} - -fn assert_seek_cf( +fn assert_seek( ctx: SnapContext<'_>, engine: &E, cf: CfName, @@ -407,7 +394,7 @@ fn assert_seek_cf( ) { let snapshot = engine.snapshot(ctx).unwrap(); let mut cursor = Cursor::new( - snapshot.iter_cf(cf, IterOptions::default()).unwrap(), + snapshot.iter(cf, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -479,14 +466,14 @@ fn batch(ctx: SnapContext<'_>, engine: &E) { fn seek(ctx: SnapContext<'_>, engine: &E) { must_put(ctx.pb_ctx, engine, b"x", b"1"); - assert_seek(ctx.clone(), engine, b"x", (b"x", b"1")); - assert_seek(ctx.clone(), engine, b"a", (b"x", b"1")); + 
assert_seek(ctx.clone(), engine, CF_DEFAULT, b"x", (b"x", b"1")); + assert_seek(ctx.clone(), engine, CF_DEFAULT, b"a", (b"x", b"1")); must_put(ctx.pb_ctx, engine, b"z", b"2"); - assert_seek(ctx.clone(), engine, b"y", (b"z", b"2")); - assert_seek(ctx.clone(), engine, b"x\x00", (b"z", b"2")); + assert_seek(ctx.clone(), engine, CF_DEFAULT, b"y", (b"z", b"2")); + assert_seek(ctx.clone(), engine, CF_DEFAULT, b"x\x00", (b"z", b"2")); let snapshot = engine.snapshot(ctx.clone()).unwrap(); let mut iter = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -505,7 +492,7 @@ fn near_seek(ctx: SnapContext<'_>, engine: &E) { must_put(ctx.pb_ctx, engine, b"z", b"2"); let snapshot = engine.snapshot(ctx.clone()).unwrap(); let mut cursor = Cursor::new( - snapshot.iter(IterOptions::default()).unwrap(), + snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), ScanMode::Mixed, false, ); @@ -525,11 +512,12 @@ fn near_seek(ctx: SnapContext<'_>, engine: &E) { must_delete(ctx.pb_ctx, engine, b"z"); } +// TODO: remove following as the code path of cf is the same. 
fn cf(ctx: SnapContext<'_>, engine: &E) { assert_none_cf(ctx.clone(), engine, "default", b"key"); must_put_cf(ctx.pb_ctx, engine, "default", b"key", b"value"); assert_has_cf(ctx.clone(), engine, "default", b"key", b"value"); - assert_seek_cf(ctx.clone(), engine, "default", b"k", (b"key", b"value")); + assert_seek(ctx.clone(), engine, "default", b"k", (b"key", b"value")); must_delete_cf(ctx.pb_ctx, engine, "default", b"key"); assert_none_cf(ctx, engine, "default", b"key"); } From 5dc99a9ff0dfbc0641a589eaa44df0f2ae1ec2e1 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 22 Jul 2022 17:07:09 +0800 Subject: [PATCH 109/676] *: solve the problem that test_stale_read_while_applying_snapshot is unstable (#13091) close tikv/tikv#13057 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/engine_rocks/src/misc.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index ce608d353b7..ad1f385654f 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -202,9 +202,10 @@ impl MiscExt for RocksEngine { if let Some(n) = util::get_cf_num_files_at_level(self.as_inner(), handle, 0) { let options = self.as_inner().get_options_cf(handle); let slowdown_trigger = options.get_level_zero_slowdown_writes_trigger(); + let compaction_trigger = options.get_level_zero_file_num_compaction_trigger() as u64; // Leave enough buffer to tolerate heavy write workload, // which may flush some memtables in a short time. 
- if n > u64::from(slowdown_trigger) / 2 { + if n > u64::from(slowdown_trigger) / 2 && n >= compaction_trigger { return Ok(true); } } From b4a0e3cfd566644ad5b10ba5ba309c029b42f5a2 Mon Sep 17 00:00:00 2001 From: tison Date: Sat, 23 Jul 2022 11:51:09 +0800 Subject: [PATCH 110/676] *: migrate match-template to standalone crate (#13112) close tikv/tikv#13113 Signed-off-by: tison Co-authored-by: Ti Chi Robot --- Cargo.toml | 3 +- components/api_version/Cargo.toml | 2 +- components/match_template/Cargo.toml | 13 - components/match_template/src/lib.rs | 261 --------------------- components/tidb_query_aggr/Cargo.toml | 2 +- components/tidb_query_datatype/Cargo.toml | 2 +- components/tidb_query_executors/Cargo.toml | 2 +- components/tidb_query_expr/Cargo.toml | 2 +- scripts/check-bins.py | 2 +- 9 files changed, 7 insertions(+), 282 deletions(-) delete mode 100644 components/match_template/Cargo.toml delete mode 100644 components/match_template/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index b094c857d5e..fd7af73bdf4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -110,7 +110,7 @@ libc = "0.2" libloading = "0.7" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { path = "components/log_wrappers" } -match_template = { path = "components/match_template" } +match-template = "0.0.1" memory_trace_macros = { path = "components/memory_trace_macros" } mime = "0.3.13" more-asserts = "0.2" @@ -243,7 +243,6 @@ members = [ "components/into_other", "components/keys", "components/log_wrappers", - "components/match_template", "components/online_config", "components/panic_hook", "components/pd_client", diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index b6ce4bf54d5..e2d4beaacbf 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -12,7 +12,7 @@ bitflags = "1.0.1" codec = { path = "../codec", default-features = false } engine_traits = { path = "../engine_traits", 
default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -match_template = { path = "../match_template" } +match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { path = "../tikv_alloc" } tikv_util = { path = "../tikv_util", default-features = false } diff --git a/components/match_template/Cargo.toml b/components/match_template/Cargo.toml deleted file mode 100644 index 1f5f683ee92..00000000000 --- a/components/match_template/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "match_template" -version = "0.0.1" -edition = "2018" -publish = false - -[lib] -proc-macro = true - -[dependencies] -proc-macro2 = "1" -quote = "1" -syn = { version = "1", features = ["full", "extra-traits", "fold"] } diff --git a/components/match_template/src/lib.rs b/components/match_template/src/lib.rs deleted file mode 100644 index eb50d333379..00000000000 --- a/components/match_template/src/lib.rs +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -#[macro_use] -extern crate quote; - -use proc_macro2::{Group, TokenStream, TokenTree}; -use quote::ToTokens; -use syn::{ - parse::{Parse, ParseStream, Result}, - punctuated::Punctuated, - *, -}; - -/// This crate provides a macro that can be used to append a match expression with multiple -/// arms, where the tokens in the first arm, as a template, can be subsitituted and the template -/// arm will be expanded into multiple arms. -/// -/// For example, the following code -/// -/// ```ignore -/// match_template! 
{ -/// T = [Int, Real, Double], -/// match Foo { -/// EvalType::T => { panic!("{}", EvalType::T); }, -/// EvalType::Other => unreachable!(), -/// } -/// } -/// ``` -/// -/// generates -/// -/// ```ignore -/// match Foo { -/// EvalType::Int => { panic!("{}", EvalType::Int); }, -/// EvalType::Real => { panic!("{}", EvalType::Real); }, -/// EvalType::Double => { panic!("{}", EvalType::Double); }, -/// EvalType::Other => unreachable!(), -/// } -/// ``` -/// -/// In addition, substitution can vary on two sides of the arms. -/// -/// For example, -/// -/// ```ignore -/// match_template! { -/// T = [Foo, Bar => Baz], -/// match Foo { -/// EvalType::T => { panic!("{}", EvalType::T); }, -/// } -/// } -/// ``` -/// -/// generates -/// -/// ```ignore -/// match Foo { -/// EvalType::Foo => { panic!("{}", EvalType::Foo); }, -/// EvalType::Bar => { panic!("{}", EvalType::Baz); }, -/// } -/// ``` -/// -/// Wildcard match arm is also supported (but there will be no substitution). -#[proc_macro] -pub fn match_template(input: proc_macro::TokenStream) -> proc_macro::TokenStream { - let mt = parse_macro_input!(input as MatchTemplate); - mt.expand().into() -} -struct MatchTemplate { - template_ident: Ident, - substitutes: Punctuated, - match_exp: Box, - template_arm: Arm, - remaining_arms: Vec, -} - -impl Parse for MatchTemplate { - fn parse(input: ParseStream<'_>) -> Result { - let template_ident = input.parse()?; - input.parse::()?; - let substitutes_tokens; - bracketed!(substitutes_tokens in input); - let substitutes = - Punctuated::::parse_terminated(&substitutes_tokens)?; - input.parse::()?; - let m: ExprMatch = input.parse()?; - let mut arms = m.arms; - arms.iter_mut().for_each(|arm| arm.comma = None); - assert!(!arms.is_empty(), "Expect at least 1 match arm"); - let template_arm = arms.remove(0); - assert!(template_arm.guard.is_none(), "Expect no match arm guard"); - - Ok(Self { - template_ident, - substitutes, - match_exp: m.expr, - template_arm, - remaining_arms: arms, - }) - 
} -} - -impl MatchTemplate { - fn expand(self) -> TokenStream { - let Self { - template_ident, - substitutes, - match_exp, - template_arm, - remaining_arms, - } = self; - let match_arms = substitutes.into_iter().map(|substitute| { - let mut arm = template_arm.clone(); - let (left_tokens, right_tokens) = match substitute { - Substitution::Identical(ident) => { - (ident.clone().into_token_stream(), ident.into_token_stream()) - } - Substitution::Map(left_ident, right_tokens) => { - (left_ident.into_token_stream(), right_tokens) - } - }; - arm.pat = replace_in_token_stream(arm.pat, &template_ident, &left_tokens); - arm.body = replace_in_token_stream(arm.body, &template_ident, &right_tokens); - arm - }); - quote! { - match #match_exp { - #(#match_arms,)* - #(#remaining_arms,)* - } - } - } -} - -#[derive(Debug)] -enum Substitution { - Identical(Ident), - Map(Ident, TokenStream), -} - -impl Parse for Substitution { - fn parse(input: ParseStream<'_>) -> Result { - let left_ident = input.parse()?; - let fat_arrow: Option]> = input.parse()?; - if fat_arrow.is_some() { - let mut right_tokens: Vec = vec![]; - while !input.peek(Token![,]) && !input.is_empty() { - right_tokens.push(input.parse()?); - } - Ok(Substitution::Map( - left_ident, - right_tokens.into_iter().collect(), - )) - } else { - Ok(Substitution::Identical(left_ident)) - } - } -} - -fn replace_in_token_stream( - input: T, - from_ident: &Ident, - to_tokens: &TokenStream, -) -> T { - let mut tokens = TokenStream::new(); - input.to_tokens(&mut tokens); - - let tokens: TokenStream = tokens - .into_iter() - .flat_map(|token| match token { - TokenTree::Ident(ident) if ident == *from_ident => to_tokens.clone(), - TokenTree::Group(group) => Group::new( - group.delimiter(), - replace_in_token_stream(group.stream(), from_ident, to_tokens), - ) - .into_token_stream(), - other => other.into(), - }) - .collect(); - - syn::parse2(tokens).unwrap() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_basic() { - 
let input = r#" - T = [Int, Real, Double], - match foo() { - EvalType::T => { panic!("{}", EvalType::T); }, - EvalType::Other => unreachable!(), - } - "#; - - let expect_output = r#" - match foo() { - EvalType::Int => { panic!("{}", EvalType::Int); }, - EvalType::Real => { panic!("{}", EvalType::Real); }, - EvalType::Double => { panic!("{}", EvalType::Double); }, - EvalType::Other => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } - - #[test] - fn test_wildcard() { - let input = r#" - TT = [Foo, Bar], - match v { - VectorValue::TT => EvalType::TT, - _ => unreachable!(), - } - "#; - - let expect_output = r#" - match v { - VectorValue::Foo => EvalType::Foo, - VectorValue::Bar => EvalType::Bar, - _ => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } - - #[test] - fn test_map() { - let input = r#" - TT = [Foo, Bar => Baz, Bark => <&'static Whooh>()], - match v { - VectorValue::TT => EvalType::TT, - EvalType::Other => unreachable!(), - } - "#; - - let expect_output = r#" - match v { - VectorValue::Foo => EvalType::Foo, - VectorValue::Bar => EvalType::Baz, - VectorValue::Bark => EvalType:: < & 'static Whooh>(), - EvalType::Other => unreachable!(), - } - "#; - let expect_output_stream: TokenStream = expect_output.parse().unwrap(); - - let mt: MatchTemplate = syn::parse_str(input).unwrap(); - let output = mt.expand(); - assert_eq!(output.to_string(), expect_output_stream.to_string()); - } -} diff --git a/components/tidb_query_aggr/Cargo.toml b/components/tidb_query_aggr/Cargo.toml index 71025327e9a..e1642fb6f31 100644 --- 
a/components/tidb_query_aggr/Cargo.toml +++ b/components/tidb_query_aggr/Cargo.toml @@ -6,7 +6,7 @@ publish = false description = "Vector aggr functions of query engine to run TiDB pushed down executors" [dependencies] -match_template = { path = "../match_template" } +match-template = "0.0.1" tidb_query_codegen = { path = "../tidb_query_codegen" } tidb_query_common = { path = "../tidb_query_common", default-features = false } tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index 56acb353302..2e748d26d8d 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -20,7 +20,7 @@ hex = "0.4" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" log_wrappers = { path = "../log_wrappers" } -match_template = { path = "../match_template" } +match-template = "0.0.1" nom = { version = "5.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } num-derive = "0.3" diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index db4992b0306..923696606ed 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -13,7 +13,7 @@ futures = { version = "0.3", features = ["compat"] } itertools = "0.10" kvproto = { git = "https://github.com/pingcap/kvproto.git" } log_wrappers = { path = "../log_wrappers" } -match_template = { path = "../match_template" } +match-template = "0.0.1" protobuf = { version = "2.8", features = ["bytes"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/tidb_query_expr/Cargo.toml b/components/tidb_query_expr/Cargo.toml 
index 8458516390b..a04553b5b6d 100644 --- a/components/tidb_query_expr/Cargo.toml +++ b/components/tidb_query_expr/Cargo.toml @@ -14,7 +14,7 @@ file_system = { path = "../file_system", default-features = false } flate2 = { version = "=1.0.11", default-features = false, features = ["zlib"] } hex = "0.4" log_wrappers = { path = "../log_wrappers" } -match_template = { path = "../match_template" } +match-template = "0.0.1" num = { version = "0.3", default-features = false } num-traits = "0.2" openssl = { version = "0.10" } diff --git a/scripts/check-bins.py b/scripts/check-bins.py index 04a3b77c01d..e8c7bf03791 100644 --- a/scripts/check-bins.py +++ b/scripts/check-bins.py @@ -11,7 +11,7 @@ # NB: The fuzzer bins here are just placeholders due to the workspace # structure; they are not actual fuzzers. WHITE_LIST = { - "online_config", "online_config_derive", "match_template", "tidb_query_codegen", + "online_config", "online_config_derive", "tidb_query_codegen", "panic_hook", "fuzz", "fuzzer_afl", "fuzzer_honggfuzz", "fuzzer_libfuzzer", "coprocessor_plugin_api", "example_plugin", "memory_trace_macros", "case_macros", "tracker" From 24316a45e8416593ec0e101a4e85a695f651eeb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:17:10 +0800 Subject: [PATCH 111/676] log-backup: unify namespace for log backup metrics (#13105) ref tikv/tikv#12534 Renamed metrics in `backup-stream`: tikv_stream_(.*) => tikv_log_backup_$1 tikv_pending_initial_scan => tikv_log_backup_initial_scan Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- components/backup-stream/src/metrics.rs | 32 ++++++++--------- metrics/grafana/tikv_details.json | 46 ++++++++++++------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index a94be6df7f6..de150ef2395 100644 --- a/components/backup-stream/src/metrics.rs +++ 
b/components/backup-stream/src/metrics.rs @@ -44,14 +44,14 @@ lazy_static! { ) .unwrap(); pub static ref HANDLE_EVENT_DURATION_HISTOGRAM: HistogramVec = register_histogram_vec!( - "tikv_stream_event_handle_duration_sec", + "tikv_log_backup_event_handle_duration_sec", "The duration of handling an cmd batch.", &["stage"], exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref HANDLE_KV_HISTOGRAM: Histogram = register_histogram!( - "tikv_stream_handle_kv_batch", + "tikv_log_backup_handle_kv_batch", "The total kv pair change handle by the stream backup", exponential_buckets(1.0, 2.0, 16).unwrap() ) @@ -62,18 +62,18 @@ lazy_static! { ) .unwrap(); pub static ref INCREMENTAL_SCAN_SIZE: Histogram = register_histogram!( - "tikv_stream_incremental_scan_bytes", + "tikv_log_backup_incremental_scan_bytes", "The size of scanning.", exponential_buckets(64.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref SKIP_KV_COUNTER: Counter = register_counter!( - "tikv_stream_skip_kv_count", + "tikv_log_backup_skip_kv_count", "The total kv size skipped by the streaming", ) .unwrap(); pub static ref STREAM_ERROR: IntCounterVec = register_int_counter_vec!( - "tikv_stream_errors", + "tikv_log_backup_errors", "The errors during stream backup.", &["type"] ) @@ -85,61 +85,61 @@ lazy_static! { ) .unwrap(); pub static ref HEAP_MEMORY: IntGauge = register_int_gauge!( - "tikv_stream_heap_memory", + "tikv_log_backup_heap_memory", "The heap memory allocating by stream backup." 
) .unwrap(); pub static ref ON_EVENT_COST_HISTOGRAM: HistogramVec = register_histogram_vec!( - "tikv_stream_on_event_duration_seconds", + "tikv_log_backup_on_event_duration_seconds", "The time cost of handling events.", &["stage"], exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref STORE_CHECKPOINT_TS: IntGaugeVec = register_int_gauge_vec!( - "tikv_stream_store_checkpoint_ts", + "tikv_log_backup_store_checkpoint_ts", "The checkpoint ts (next backup ts) of task", &["task"], ) .unwrap(); pub static ref FLUSH_DURATION: HistogramVec = register_histogram_vec!( - "tikv_stream_flush_duration_sec", + "tikv_log_backup_flush_duration_sec", "The time cost of flushing a task.", &["stage"], exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref FLUSH_FILE_SIZE: Histogram = register_histogram!( - "tikv_stream_flush_file_size", + "tikv_log_backup_flush_file_size", "Some statistics of flushing of this run.", exponential_buckets(1024.0, 2.0, 16).unwrap() ) .unwrap(); pub static ref INITIAL_SCAN_DURATION: Histogram = register_histogram!( - "tikv_stream_initial_scan_duration_sec", + "tikv_log_backup_initial_scan_duration_sec", "The duration of initial scanning.", exponential_buckets(0.001, 2.0, 16).unwrap() ) .unwrap(); pub static ref SKIP_RETRY: IntCounterVec = register_int_counter_vec!( - "tikv_stream_skip_retry_observe", + "tikv_log_backup_skip_retry_observe", "The reason of giving up observing region when meeting error.", &["reason"], ) .unwrap(); pub static ref INITIAL_SCAN_STAT: IntCounterVec = register_int_counter_vec!( - "tikv_stream_initial_scan_operations", + "tikv_log_backup_initial_scan_operations", "The operations over rocksdb during initial scanning.", &["cf", "op"], ) .unwrap(); pub static ref STREAM_ENABLED: IntCounter = register_int_counter!( - "tikv_stream_enabled", + "tikv_log_backup_enabled", "When gt 0, this node enabled streaming." 
) .unwrap(); pub static ref TRACK_REGION: IntGauge = register_int_gauge!( - "tikv_stream_observed_region", + "tikv_log_backup_observed_region", "the region being observed by the current store.", ) .unwrap(); @@ -150,7 +150,7 @@ lazy_static! { ) .unwrap(); pub static ref PENDING_INITIAL_SCAN_LEN: IntGaugeVec = register_int_gauge_vec!( - "tikv_pending_initial_scan", + "tikv_log_backup_pending_initial_scan", "The pending initial scan", &["stage"] ) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index eda4e88de66..5da0ca7c0d3 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -40199,7 +40199,7 @@ "targets": [ { "exemplar": true, - "expr": "tikv_stream_enabled{instance=~\"$instance\"}", + "expr": "tikv_log_backup_enabled{instance=~\"$instance\"}", "instant": true, "interval": "", "legendFormat": "{{ instance }}", @@ -40261,7 +40261,7 @@ "targets": [ { "exemplar": true, - "expr": "increase(tikv_stream_flush_file_size_sum{instance=~\"$instance\"}[30m]) / on(instance) increase(tikv_stream_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])", + "expr": "increase(tikv_log_backup_flush_file_size_sum{instance=~\"$instance\"}[30m]) / on(instance) increase(tikv_log_backup_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])", "hide": false, "instant": true, "interval": "", @@ -40322,7 +40322,7 @@ "targets": [ { "exemplar": true, - "expr": "round(increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m]))", + "expr": "round(increase(tikv_log_backup_flush_file_size_count{instance=~\"$instance\"}[30m]))", "instant": true, "interval": "", "legendFormat": "{{ instance }}", @@ -40383,7 +40383,7 @@ "targets": [ { "exemplar": true, - "expr": "round(sum(increase(tikv_stream_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])))", + "expr": 
"round(sum(increase(tikv_log_backup_flush_duration_sec_count{stage=~\"save_files\",instance=~\"$instance\"}[30m])))", "hide": false, "instant": true, "interval": "", @@ -40444,7 +40444,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_flush_file_size_sum{instance=~\"$instance\"}[30m]))", + "expr": "sum(increase(tikv_log_backup_flush_file_size_sum{instance=~\"$instance\"}[30m]))", "hide": false, "instant": true, "interval": "", @@ -40663,7 +40663,7 @@ "targets": [ { "exemplar": true, - "expr": "round(sum(increase(tikv_stream_flush_file_size_count{instance=~\"$instance\"}[30m])))", + "expr": "round(sum(increase(tikv_log_backup_flush_file_size_count{instance=~\"$instance\"}[30m])))", "hide": false, "instant": true, "interval": "", @@ -40840,7 +40840,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(tikv_stream_handle_kv_batch_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "expr": "rate(tikv_log_backup_handle_kv_batch_sum{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", "format": "time_series", "instant": false, "interval": "", @@ -40941,7 +40941,7 @@ "targets": [ { "exemplar": true, - "expr": "rate(tikv_stream_incremental_scan_bytes_sum{instance=~\"$instance\"}[$__rate_interval])", + "expr": "rate(tikv_log_backup_incremental_scan_bytes_sum{instance=~\"$instance\"}[$__rate_interval])", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -41193,7 +41193,7 @@ "targets": [ { "exemplar": true, - "expr": "tikv_stream_heap_memory{instance=~\"$instance\"}", + "expr": "tikv_log_backup_heap_memory{instance=~\"$instance\"}", "format": "time_series", "instant": false, "interval": "", @@ -41298,7 +41298,7 @@ "targets": [ { "exemplar": true, - "expr": "tikv_stream_observed_region{instance=~\"$instance\"}", + "expr": "tikv_log_backup_observed_region{instance=~\"$instance\"}", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -41307,7 +41307,7 @@ }, { "exemplar": true, - 
"expr": "sum(tikv_stream_observed_region{instance=~\"$instance\"})", + "expr": "sum(tikv_log_backup_observed_region{instance=~\"$instance\"})", "hide": false, "interval": "", "legendFormat": "total", @@ -41407,7 +41407,7 @@ "targets": [ { "exemplar": true, - "expr": "increase(tikv_stream_errors{instance=~\"$instance\"}[$__interval])", + "expr": "increase(tikv_log_backup_errors{instance=~\"$instance\"}[$__interval])", "format": "time_series", "hide": false, "instant": false, @@ -41418,7 +41418,7 @@ }, { "exemplar": true, - "expr": "tikv_stream_errors{instance=~\"$instance\"}", + "expr": "tikv_log_backup_errors{instance=~\"$instance\"}", "hide": true, "interval": "1m", "intervalFactor": 2, @@ -41752,7 +41752,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_flush_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_files\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_flush_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_files\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -41835,7 +41835,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_initial_scan_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_initial_scan_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -41918,7 +41918,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"to_stream_event\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"to_stream_event\"}[$__interval])) by (le)", "format": 
"heatmap", "instant": false, "interval": "", @@ -42001,7 +42001,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"get_router_lock\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"get_router_lock\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -42084,7 +42084,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_handle_kv_batch_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_handle_kv_batch_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -42167,7 +42167,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_to_temp_file\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_event_handle_duration_sec_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=~\"save_to_temp_file\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -42250,7 +42250,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(increase(tikv_stream_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"write_to_tempfile\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"write_to_tempfile\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -42333,7 +42333,7 @@ "targets": [ { "exemplar": true, - "expr": 
"sum(increase(tikv_stream_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"syscall_write\"}[$__interval])) by (le)", + "expr": "sum(increase(tikv_log_backup_on_event_duration_seconds_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", stage=\"syscall_write\"}[$__interval])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -42716,7 +42716,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_stream_initial_scan_operations{instance=~\"$instance\", op=~\"read_bytes\"}[$__rate_interval])) BY (op, cf)", + "expr": "sum(rate(tikv_log_backup_initial_scan_operations{instance=~\"$instance\", op=~\"read_bytes\"}[$__rate_interval])) BY (op, cf)", "interval": "", "legendFormat": "{{ cf }}", "queryType": "randomWalk", @@ -42815,7 +42815,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(tikv_stream_initial_scan_operations{instance=~\"$instance\", op!~\"read_bytes\"}[$__rate_interval])) BY (op, cf) > 0", + "expr": "sum(rate(tikv_log_backup_initial_scan_operations{instance=~\"$instance\", op!~\"read_bytes\"}[$__rate_interval])) BY (op, cf) > 0", "interval": "", "legendFormat": "{{ cf }}/{{ op }}", "queryType": "randomWalk", From e278777f97cc71aa60cf21550c5e8f55bfa95b84 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 25 Jul 2022 16:43:11 +0800 Subject: [PATCH 112/676] raftstorev2: add apply batch system (#13013) ref tikv/tikv#12842 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/apply.rs | 181 +++++++++++++++++- components/raftstore-v2/src/batch/mod.rs | 1 + components/raftstore-v2/src/batch/store.rs | 31 ++- components/raftstore-v2/src/fsm/apply.rs | 72 +++++++ components/raftstore-v2/src/fsm/mod.rs | 1 + components/raftstore-v2/src/lib.rs | 1 + components/raftstore-v2/src/raft/apply.rs | 23 +++ components/raftstore-v2/src/raft/mod.rs | 2 + components/raftstore-v2/src/raft/peer.rs | 13 +- .../src/router/internal_message.rs | 3 + 
components/raftstore-v2/src/router/message.rs | 4 +- components/raftstore-v2/src/router/mod.rs | 2 + components/raftstore-v2/src/tablet.rs | 93 +++++++++ components/raftstore/src/store/fsm/apply.rs | 48 +++-- 14 files changed, 439 insertions(+), 36 deletions(-) create mode 100644 components/raftstore-v2/src/raft/apply.rs create mode 100644 components/raftstore-v2/src/router/internal_message.rs create mode 100644 components/raftstore-v2/src/tablet.rs diff --git a/components/raftstore-v2/src/batch/apply.rs b/components/raftstore-v2/src/batch/apply.rs index a7e392127d5..ab44d435e67 100644 --- a/components/raftstore-v2/src/batch/apply.rs +++ b/components/raftstore-v2/src/batch/apply.rs @@ -1,4 +1,183 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains all structs related to apply batch system. +//! +//! After being started, each thread will have its own `ApplyPoller` and poll +//! using `ApplyContext`. For more information, see the documentation of batch-system. 
+ +use std::{ + ops::{Deref, DerefMut}, + sync::Arc, +}; + +use batch_system::{ + BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, +}; +use engine_traits::{KvEngine, RaftEngine}; +use raftstore::store::{ + fsm::{ + apply::{ControlFsm, ControlMsg}, + ApplyNotifier, + }, + util::LatencyInspector, + Config, +}; +use slog::Logger; +use tikv_util::config::{Tracker, VersionTrack}; + +use crate::{ + fsm::{ApplyFsm, ApplyFsmDelegate}, + raft::{Apply, Peer}, + router::ApplyTask, +}; + +pub struct ApplyContext { + cfg: Config, +} + +impl ApplyContext { + pub fn new(cfg: Config) -> Self { + ApplyContext { cfg } + } +} + +pub struct ApplyPoller { + apply_task_buf: Vec, + pending_latency_inspect: Vec, + apply_ctx: ApplyContext, + cfg_tracker: Tracker, +} + +impl ApplyPoller { + pub fn new(apply_ctx: ApplyContext, cfg_tracker: Tracker) -> ApplyPoller { + ApplyPoller { + apply_task_buf: Vec::new(), + pending_latency_inspect: Vec::new(), + apply_ctx, + cfg_tracker, + } + } + + /// Updates the internal buffer to latest capacity. 
+ fn apply_buf_capacity(&mut self) { + let new_cap = self.messages_per_tick(); + tikv_util::set_vec_capacity(&mut self.apply_task_buf, new_cap); + } + + #[inline] + fn messages_per_tick(&self) -> usize { + self.apply_ctx.cfg.messages_per_tick + } +} + +impl PollHandler, ControlFsm> for ApplyPoller +where + EK: KvEngine, +{ + fn begin(&mut self, _batch_size: usize, update_cfg: F) + where + for<'a> F: FnOnce(&'a batch_system::Config), + { + let cfg = self.cfg_tracker.any_new().map(|c| c.clone()); + if let Some(cfg) = cfg { + let last_messages_per_tick = self.messages_per_tick(); + self.apply_ctx.cfg = cfg; + if self.apply_ctx.cfg.messages_per_tick != last_messages_per_tick { + self.apply_buf_capacity(); + } + update_cfg(&self.apply_ctx.cfg.apply_batch_system); + } + } + + fn handle_control(&mut self, control: &mut ControlFsm) -> Option { + control.handle_messages(&mut self.pending_latency_inspect); + for inspector in self.pending_latency_inspect.drain(..) { + // TODO: support apply duration. 
+ inspector.finish(); + } + Some(0) + } + + fn handle_normal( + &mut self, + normal: &mut impl DerefMut>, + ) -> batch_system::HandleResult { + let received_cnt = normal.recv(&mut self.apply_task_buf); + let handle_result = if received_cnt == self.messages_per_tick() { + HandleResult::KeepProcessing + } else { + HandleResult::stop_at(0, false) + }; + let mut delegate = ApplyFsmDelegate::new(normal, &mut self.apply_ctx); + delegate.handle_msgs(&mut self.apply_task_buf); + handle_result + } + + fn end(&mut self, batch: &mut [Option>>]) { + // TODO: support memory trace + } +} + +pub struct ApplyPollerBuilder { + cfg: Arc>, +} + +impl ApplyPollerBuilder { + pub fn new(cfg: Arc>) -> Self { + Self { cfg } + } +} + +impl HandlerBuilder, ControlFsm> for ApplyPollerBuilder { + type Handler = ApplyPoller; + + fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { + let apply_ctx = ApplyContext::new(self.cfg.value().clone()); + let cfg_tracker = self.cfg.clone().tracker("apply".to_string()); + ApplyPoller::new(apply_ctx, cfg_tracker) + } +} + /// Batch system for applying logs pipeline. 
-pub struct ApplySystem; +pub struct ApplySystem { + system: BatchSystem, ControlFsm>, +} + +impl Deref for ApplySystem { + type Target = BatchSystem, ControlFsm>; + + fn deref(&self) -> &BatchSystem, ControlFsm> { + &self.system + } +} + +impl DerefMut for ApplySystem { + fn deref_mut(&mut self) -> &mut BatchSystem, ControlFsm> { + &mut self.system + } +} + +impl ApplySystem { + pub fn schedule_all<'a, ER: RaftEngine>(&self, peers: impl Iterator>) { + let mut mailboxes = Vec::with_capacity(peers.size_hint().0); + for peer in peers { + let apply = Apply::new(peer); + let (tx, fsm) = ApplyFsm::new(apply); + mailboxes.push(( + peer.region_id(), + BasicMailbox::new(tx, fsm, self.router().state_cnt().clone()), + )); + } + self.router().register_all(mailboxes); + } +} + +pub type ApplyRouter = BatchRouter, ControlFsm>; + +pub fn create_apply_batch_system(cfg: &Config) -> (ApplyRouter, ApplySystem) { + let (control_tx, control_fsm) = ControlFsm::new(); + let (router, system) = + batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); + let system = ApplySystem { system }; + (router, system) +} diff --git a/components/raftstore-v2/src/batch/mod.rs b/components/raftstore-v2/src/batch/mod.rs index e856147220d..0f4b9fba3d3 100644 --- a/components/raftstore-v2/src/batch/mod.rs +++ b/components/raftstore-v2/src/batch/mod.rs @@ -8,4 +8,5 @@ mod apply; mod store; +pub(crate) use apply::ApplyContext; pub use store::{create_store_batch_system, StoreContext, StoreSystem}; diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 6a8974259ff..2dce4b54c2a 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -19,6 +19,7 @@ use tikv_util::{ timer::SteadyTimer, }; +use super::apply::{create_apply_batch_system, ApplyPollerBuilder, ApplyRouter, ApplySystem}; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate}, raft::Peer, @@ 
-92,10 +93,7 @@ impl StorePoller { } fn schedule_ticks(&mut self) { - assert_eq!( - PeerTick::get_all_ticks().len(), - self.poll_ctx.tick_batch.len() - ); + assert_eq!(PeerTick::all_ticks().len(), self.poll_ctx.tick_batch.len()); for batch in &mut self.poll_ctx.tick_batch { batch.schedule(&self.poll_ctx.timer); } @@ -111,10 +109,12 @@ impl PollHandler { system: BatchSystem, StoreFsm>, + apply_router: ApplyRouter, + apply_system: ApplySystem, logger: Logger, } @@ -280,7 +282,7 @@ impl StoreSystem { T: Transport + 'static, { let mut builder = StorePollerBuilder::new( - cfg, + cfg.clone(), store.get_id(), raft_engine, tablet_factory, @@ -288,6 +290,8 @@ impl StoreSystem { self.logger.clone(), ); let peers = builder.init()?; + self.apply_system + .schedule_all(peers.values().map(|pair| pair.1.peer())); // Choose a different name so we know what version is actually used. rs stands // for raft store. let tag = format!("rs-{}", store.get_id()); @@ -309,10 +313,15 @@ impl StoreSystem { router.force_send(addr, PeerMsg::Start).unwrap(); } router.send_control(StoreMsg::Start { store }).unwrap(); + + let apply_poller_builder = ApplyPollerBuilder::new(cfg); + self.apply_system + .spawn("apply".to_owned(), apply_poller_builder); Ok(()) } pub fn shutdown(&mut self) { + self.apply_system.shutdown(); self.system.shutdown(); } } @@ -332,6 +341,12 @@ where let (store_tx, store_fsm) = StoreFsm::new(cfg, store); let (router, system) = batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); - let system = StoreSystem { system, logger }; + let (apply_router, apply_system) = create_apply_batch_system(cfg); + let system = StoreSystem { + system, + apply_router, + apply_system, + logger, + }; (router, system) } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index bb3db8c75d3..43e3441528e 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -1 +1,73 @@ // Copyright 2022 
TiKV Project Authors. Licensed under Apache-2.0. + +use batch_system::Fsm; +use crossbeam::channel::TryRecvError; +use engine_traits::KvEngine; +use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; + +use crate::{batch::ApplyContext, raft::Apply, router::ApplyTask}; + +pub struct ApplyFsm { + apply: Apply, + receiver: Receiver, + is_stopped: bool, +} + +impl ApplyFsm { + pub fn new(apply: Apply) -> (LooseBoundedSender, Box) { + let (tx, rx) = mpsc::loose_bounded(usize::MAX); + ( + tx, + Box::new(Self { + apply, + receiver: rx, + is_stopped: false, + }), + ) + } + + /// Fetches tasks to `apply_task_buf`. It will stop when the buffer is full. + /// + /// Returns how many messages are fetched. + pub fn recv(&mut self, apply_task_buf: &mut Vec) -> usize { + let l = apply_task_buf.len(); + for i in l..apply_task_buf.capacity() { + match self.receiver.try_recv() { + Ok(msg) => apply_task_buf.push(msg), + Err(e) => { + if let TryRecvError::Disconnected = e { + self.is_stopped = true; + } + return i - l; + } + } + } + apply_task_buf.capacity() - l + } +} + +impl Fsm for ApplyFsm { + type Message = ApplyTask; + + #[inline] + fn is_stopped(&self) -> bool { + self.is_stopped + } +} + +pub struct ApplyFsmDelegate<'a, EK: KvEngine> { + fsm: &'a mut ApplyFsm, + apply_ctx: &'a mut ApplyContext, +} + +impl<'a, EK: KvEngine> ApplyFsmDelegate<'a, EK> { + pub fn new(fsm: &'a mut ApplyFsm, apply_ctx: &'a mut ApplyContext) -> Self { + Self { fsm, apply_ctx } + } + + pub fn handle_msgs(&self, apply_task_buf: &mut Vec) { + for task in apply_task_buf.drain(..) { + // TODO: handle the tasks. 
+ } + } +} diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 9f3bcefac46..02f788d3be2 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -9,5 +9,6 @@ mod apply; mod peer; mod store; +pub use apply::{ApplyFsm, ApplyFsmDelegate}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; pub use store::{StoreFsm, StoreFsmDelegate}; diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 220fa0b2d33..71062161384 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -16,6 +16,7 @@ mod fsm; mod operation; mod raft; mod router; +mod tablet; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreSystem}; diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs new file mode 100644 index 00000000000..0c7abf52b58 --- /dev/null +++ b/components/raftstore-v2/src/raft/apply.rs @@ -0,0 +1,23 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use slog::Logger; + +use super::Peer; +use crate::tablet::CachedTablet; + +/// Apply applies all the committed commands to kv db. +pub struct Apply { + tablet: CachedTablet, + logger: Logger, +} + +impl Apply { + #[inline] + pub fn new(peer: &Peer) -> Self { + Apply { + tablet: peer.tablet().clone(), + logger: peer.logger().clone(), + } + } +} diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs index 045e9ff89b3..c1d6a522d79 100644 --- a/components/raftstore-v2/src/raft/mod.rs +++ b/components/raftstore-v2/src/raft/mod.rs @@ -1,7 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+mod apply; mod peer; mod storage; +pub use apply::Apply; pub use peer::Peer; pub use storage::{write_initial_states, Storage}; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index e2ccb068cbc..e52ec322445 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -10,12 +10,15 @@ use slog::{o, Logger}; use tikv_util::{box_err, config::ReadableSize}; use super::storage::Storage; -use crate::Result; +use crate::{ + tablet::{self, CachedTablet}, + Result, +}; /// A peer that delegates commands between state machine and raft. pub struct Peer { raft_group: RawNode>, - tablet: Option, + tablet: CachedTablet, logger: Logger, } @@ -57,6 +60,8 @@ impl Peer { }; let tablet_index = s.region_state().get_tablet_index(); + // Another option is always create tablet even if tablet index is 0. But this can + // introduce race when gc old tablet and create new peer. let tablet = if tablet_index != 0 { if !tablet_factory.exists(region_id, tablet_index) { return Err(box_err!( @@ -73,7 +78,7 @@ impl Peer { Ok(Some(Peer { raft_group: RawNode::new(&raft_cfg, s, &logger)?, - tablet, + tablet: CachedTablet::new(tablet), logger, })) } @@ -94,7 +99,7 @@ impl Peer { } #[inline] - pub fn tablet(&self) -> &Option { + pub fn tablet(&self) -> &CachedTablet { &self.tablet } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs new file mode 100644 index 00000000000..f5ef72d8e30 --- /dev/null +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -0,0 +1,3 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +pub enum ApplyTask {} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 75011163e83..37d9515d301 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -148,7 +148,7 @@ pub enum PeerTick { } impl PeerTick { - pub const VARIANT_COUNT: usize = Self::get_all_ticks().len(); + pub const VARIANT_COUNT: usize = Self::all_ticks().len(); #[inline] pub fn tag(self) -> &'static str { @@ -166,7 +166,7 @@ impl PeerTick { } } - pub const fn get_all_ticks() -> &'static [PeerTick] { + pub const fn all_ticks() -> &'static [PeerTick] { const TICKS: &[PeerTick] = &[ PeerTick::Raft, PeerTick::RaftLogGc, diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index fd27349ef43..a7c7672b835 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -1,5 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +mod internal_message; mod message; +pub(crate) use internal_message::ApplyTask; pub use message::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/tablet.rs b/components/raftstore-v2/src/tablet.rs new file mode 100644 index 00000000000..2293eaed033 --- /dev/null +++ b/components/raftstore-v2/src/tablet.rs @@ -0,0 +1,93 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, +}; + +struct LatestTablet { + data: Mutex>, + version: AtomicU64, +} + +/// Tablet may change during split, merge and applying snapshot. So we need a shared value to +/// reflect the latest tablet. `CachedTablet` provide cache that can speed up common access. 
+#[derive(Clone)] +pub struct CachedTablet { + latest: Arc>, + cache: Option, + version: u64, +} + +impl CachedTablet { + #[inline] + pub fn new(data: Option) -> Self { + CachedTablet { + latest: Arc::new(LatestTablet { + data: Mutex::new(data.clone()), + version: AtomicU64::new(0), + }), + cache: data, + version: 0, + } + } + + pub fn set(&mut self, data: EK) { + let mut guard = self.latest.data.lock().unwrap(); + *guard = Some(data.clone()); + let v = self.latest.version.fetch_add(1, Ordering::Relaxed); + drop(guard); + self.cache = Some(data); + self.version = v; + } + + /// Get the tablet from cache without checking if it's up to date. + #[inline] + pub fn cache(&self) -> Option<&EK> { + self.cache.as_ref() + } + + /// Get the latest tablet. + #[inline] + pub fn latest(&mut self) -> Option<&EK> { + if self.latest.version.load(Ordering::Relaxed) > self.version { + let guard = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = guard.clone(); + } + self.cache() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cached_tablet() { + let mut cached_tablet = CachedTablet::new(None); + assert_eq!(cached_tablet.cache(), None); + assert_eq!(cached_tablet.latest(), None); + + cached_tablet = CachedTablet::new(Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + + // Setting tablet will refresh cache immediately. + cached_tablet.set(2); + assert_eq!(cached_tablet.cache().cloned(), Some(2)); + assert_eq!(cached_tablet.latest().cloned(), Some(2)); + + let mut cloned = cached_tablet.clone(); + // Clone should reuse cache. + assert_eq!(cloned.cache().cloned(), Some(2)); + cloned.set(1); + assert_eq!(cloned.cache().cloned(), Some(1)); + assert_eq!(cloned.latest().cloned(), Some(1)); + + // Local cache won't be refreshed until querying latest. 
+ assert_eq!(cached_tablet.cache().cloned(), Some(2)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + } +} diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 16e039dd640..88bff373760 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3831,7 +3831,7 @@ pub struct ControlFsm { } impl ControlFsm { - fn new() -> (LooseBoundedSender, Box) { + pub fn new() -> (LooseBoundedSender, Box) { let (tx, rx) = loose_bounded(std::usize::MAX); let fsm = Box::new(ControlFsm { stopped: false, @@ -3839,6 +3839,28 @@ impl ControlFsm { }); (tx, fsm) } + + pub fn handle_messages(&mut self, pending_latency_inspect: &mut Vec) { + // Usually there will be only 1 control message. + loop { + match self.receiver.try_recv() { + Ok(ControlMsg::LatencyInspect { + send_time, + mut inspector, + }) => { + inspector.record_apply_wait(send_time.saturating_elapsed()); + pending_latency_inspect.push(inspector); + } + Err(TryRecvError::Empty) => { + return; + } + Err(TryRecvError::Disconnected) => { + self.stopped = true; + return; + } + } + } + } } impl Fsm for ControlFsm { @@ -3887,27 +3909,11 @@ where } fn handle_control(&mut self, control: &mut ControlFsm) -> Option { - loop { - match control.receiver.try_recv() { - Ok(ControlMsg::LatencyInspect { - send_time, - mut inspector, - }) => { - if self.apply_ctx.timer.is_none() { - self.apply_ctx.timer = Some(Instant::now_coarse()); - } - inspector.record_apply_wait(send_time.saturating_elapsed()); - self.apply_ctx.pending_latency_inspect.push(inspector); - } - Err(TryRecvError::Empty) => { - return Some(0); - } - Err(TryRecvError::Disconnected) => { - control.stopped = true; - return Some(0); - } - } + control.handle_messages(&mut self.apply_ctx.pending_latency_inspect); + if !self.apply_ctx.pending_latency_inspect.is_empty() && self.apply_ctx.timer.is_none() { + 
self.apply_ctx.timer = Some(Instant::now_coarse()); } + Some(0) } fn handle_normal(&mut self, normal: &mut impl DerefMut>) -> HandleResult { From 50f6c6fc294c2aa425b4684b357a5713681895b7 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 27 Jul 2022 11:51:11 +0800 Subject: [PATCH 113/676] pd-client: pd client should update if the grpc stream sender failed. (#13094) close tikv/tikv#12934 Signed-off-by: bufferflies <1045931706@qq.com> --- components/error_code/src/pd.rs | 1 + components/pd_client/src/client.rs | 8 +++- components/pd_client/src/errors.rs | 6 ++- components/test_pd/src/mocker/service.rs | 1 + tests/integrations/pd/test_rpc_client.rs | 53 ++++++++++++++++++++++++ 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 60952e96922..018c86c3d39 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -8,6 +8,7 @@ define_error_codes!( CLUSTER_NOT_BOOTSTRAPPED => ("ClusterNotBootstraped", "", ""), INCOMPATIBLE => ("Imcompatible", "", ""), GRPC => ("gRPC", "", ""), + STREAM_DISCONNECT => ("StreamDisconnect","",""), REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), GLOBAL_CONFIG_NOT_FOUND => ("GlobalConfigNotFound","",""), diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index facf2e24b76..173b25357c4 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -619,6 +619,9 @@ impl PdClient for RpcClient { if last > last_report { last_report = last - 1; } + fail::fail_point!("region_heartbeat_send_failed", |_| { + Err(Error::Grpc(grpcio::Error::RemoteStopped)) + }); Ok((r, WriteFlags::default())) })) .await; @@ -643,7 +646,8 @@ impl PdClient for RpcClient { .expect("expect region heartbeat sender"); let ret = sender .unbounded_send(req) - .map_err(|e| Error::Other(Box::new(e))); + .map_err(|e| 
Error::StreamDisconnect(e.into_send_error())); + Box::pin(future::ready(ret)) as PdFuture<_> }; @@ -1048,7 +1052,7 @@ impl PdClient for RpcClient { .expect("expect region buckets sender"); let ret = sender .unbounded_send(req) - .map_err(|e| Error::Other(Box::new(e))); + .map_err(|e| Error::StreamDisconnect(e.into_send_error())); Box::pin(future::ready(ret)) as PdFuture<_> }; diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index a9e4ffe6266..61adceec391 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -3,6 +3,7 @@ use std::{error, result}; use error_code::{self, ErrorCode, ErrorCodeExt}; +use futures::channel::mpsc::SendError; use thiserror::Error; #[derive(Debug, Error)] @@ -15,6 +16,8 @@ pub enum Error { Incompatible, #[error("{0}")] Grpc(#[from] grpcio::Error), + #[error("{0}")] + StreamDisconnect(#[from] SendError), #[error("unknown error {0:?}")] Other(#[from] Box), #[error("region is not found for key {}", log_wrappers::Value::key(.0))] @@ -30,7 +33,7 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::ClusterNotBootstrapped(_) => true, + Error::Grpc(_) | Error::ClusterNotBootstrapped(_) | Error::StreamDisconnect(_) => true, Error::Other(_) | Error::RegionNotFound(_) | Error::StoreTombstone(_) @@ -48,6 +51,7 @@ impl ErrorCodeExt for Error { Error::ClusterNotBootstrapped(_) => error_code::pd::CLUSTER_NOT_BOOTSTRAPPED, Error::Incompatible => error_code::pd::INCOMPATIBLE, Error::Grpc(_) => error_code::pd::GRPC, + Error::StreamDisconnect(_) => error_code::pd::STREAM_DISCONNECT, Error::RegionNotFound(_) => error_code::pd::REGION_NOT_FOUND, Error::StoreTombstone(_) => error_code::pd::STORE_TOMBSTONE, Error::GlobalConfigNotFound(_) => error_code::pd::GLOBAL_CONFIG_NOT_FOUND, diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 572eb9534f9..2ff5c178c67 100644 --- 
a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -238,6 +238,7 @@ impl PdMocker for Service { .insert(region_id, req.get_leader().clone()); let mut resp = RegionHeartbeatResponse::default(); + resp.set_region_id(req.get_region().get_id()); let header = Service::header(); resp.set_header(header); Some(Ok(resp)) diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index eb0337f8a22..20fc6b70908 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -475,6 +475,59 @@ fn test_change_leader_async() { panic!("failed, leader should changed"); } +#[test] +fn test_pd_client_heartbeat_send_failed() { + let pd_client_send_fail_fp = "region_heartbeat_send_failed"; + fail::cfg(pd_client_send_fail_fp, "return()").unwrap(); + let server = MockServer::with_case(1, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = + client.handle_region_heartbeat_response(1, move |resp| tx.send(resp).unwrap_or_default()); + poller.spawn(f); + + let heartbeat_send_fail = |ok| { + let mut region = metapb::Region::default(); + region.set_id(1); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region, + metapb::Peer::default(), + RegionStat::default(), + None, + )); + let rsp = rx.recv_timeout(Duration::from_millis(100)); + if ok { + assert!(rsp.is_ok()); + assert_eq!(rsp.unwrap().get_region_id(), 1); + } else { + assert!(rsp.is_err()); + } + + let region = block_on(client.get_region_by_id(1)); + if ok { + assert!(region.is_ok()); + let r = region.unwrap(); + assert!(r.is_some()); + assert_eq!(1, r.unwrap().get_id()); + } else { + assert!(region.is_err()); + } + }; + // send fail if network is block. 
+ heartbeat_send_fail(false); + fail::remove(pd_client_send_fail_fp); + // send success after network recovered. + heartbeat_send_fail(true); +} + #[test] fn test_region_heartbeat_on_leader_change() { let eps_count = 3; From 6a9db360d9d49a03696473a4d6402606b68b2686 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 27 Jul 2022 14:09:10 +0800 Subject: [PATCH 114/676] raftstore: extract EntryStorage (#13115) ref tikv/tikv#12842 This PR extract part of `PeerStorage` as `EntryStorage`, which only serves entry access. It will be reused by raftstorev2. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- .../src/store/async_io/write_tests.rs | 9 +- .../raftstore/src/store/entry_storage.rs | 1417 ++++++++++++++++ components/raftstore/src/store/fsm/apply.rs | 67 +- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/mod.rs | 10 +- components/raftstore/src/store/peer.rs | 26 +- .../raftstore/src/store/peer_storage.rs | 1502 ++--------------- components/raftstore/src/store/worker/read.rs | 44 +- .../raftstore/src/store/worker/region.rs | 16 +- 9 files changed, 1603 insertions(+), 1490 deletions(-) create mode 100644 components/raftstore/src/store/entry_storage.rs diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 97d41824a62..04ece802a45 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -11,7 +11,7 @@ use tempfile::Builder; use super::*; use crate::{ - store::{Config, Transport}, + store::{peer_storage::tests::new_entry, Config, Transport}, Result, }; @@ -42,13 +42,6 @@ fn must_have_entries_and_state( } } -fn new_entry(index: u64, term: u64) -> Entry { - let mut e = Entry::default(); - e.set_index(index); - e.set_term(term); - e -} - fn new_raft_state(term: u64, vote: u64, commit: u64, last_index: u64) -> RaftLocalState { let mut raft_state = RaftLocalState::new(); 
raft_state.mut_hard_state().set_term(term); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs new file mode 100644 index 00000000000..4f751a35b17 --- /dev/null +++ b/components/raftstore/src/store/entry_storage.rs @@ -0,0 +1,1417 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the implementation of the `EntryStorage`, which covers a subset of +//! raft storage. This module will be shared between raftstore v1 and v2. + +use std::{ + cell::{Cell, RefCell}, + cmp, + collections::VecDeque, + mem, + ops::Range, + sync::{Arc, Mutex}, +}; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine, RAFT_LOG_MULTI_GET_CNT}; +use fail::fail_point; +use kvproto::raft_serverpb::{RaftApplyState, RaftLocalState}; +use protobuf::Message; +use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError}; +use tikv_alloc::TraceEvent; +use tikv_util::{debug, info, worker::Scheduler}; + +use super::{metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE}; +use crate::{bytes_capacity, store::worker::RaftlogFetchTask}; + +const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; +const SHRINK_CACHE_CAPACITY: usize = 64; +const ENTRY_MEM_SIZE: usize = mem::size_of::(); + +pub const MAX_INIT_ENTRY_COUNT: usize = 1024; + +#[inline] +pub fn first_index(state: &RaftApplyState) -> u64 { + state.get_truncated_state().get_index() + 1 +} + +#[inline] +pub fn last_index(state: &RaftLocalState) -> u64 { + state.get_last_index() +} + +/// Committed entries sent to apply threads. +#[derive(Clone)] +pub struct CachedEntries { + pub range: Range, + // Entries and dangle size for them. `dangle` means not in entry cache. 
+ entries: Arc, usize)>>, +} + +impl CachedEntries { + pub fn new(entries: Vec) -> Self { + assert!(!entries.is_empty()); + let start = entries.first().map(|x| x.index).unwrap(); + let end = entries.last().map(|x| x.index).unwrap() + 1; + let range = Range { start, end }; + CachedEntries { + entries: Arc::new(Mutex::new((entries, 0))), + range, + } + } + + /// Take cached entries and dangle size for them. `dangle` means not in entry cache. + pub fn take_entries(&self) -> (Vec, usize) { + mem::take(&mut *self.entries.lock().unwrap()) + } +} + +struct EntryCache { + // The last index of persisted entry. + // It should be equal to `RaftLog::persisted`. + persisted: u64, + cache: VecDeque, + trace: VecDeque, + hit: Cell, + miss: Cell, + #[cfg(test)] + size_change_cb: Option>, +} + +impl EntryCache { + fn first_index(&self) -> Option { + self.cache.front().map(|e| e.get_index()) + } + + fn fetch_entries_to( + &self, + begin: u64, + end: u64, + mut fetched_size: u64, + max_size: u64, + ents: &mut Vec, + ) { + if begin >= end { + return; + } + assert!(!self.cache.is_empty()); + let cache_low = self.cache.front().unwrap().get_index(); + let start_idx = begin.checked_sub(cache_low).unwrap() as usize; + let limit_idx = end.checked_sub(cache_low).unwrap() as usize; + + let mut end_idx = start_idx; + self.cache + .iter() + .skip(start_idx) + .take_while(|e| { + let cur_idx = end_idx as u64 + cache_low; + assert_eq!(e.get_index(), cur_idx); + let m = u64::from(e.compute_size()); + fetched_size += m; + if fetched_size == m { + end_idx += 1; + fetched_size <= max_size && end_idx < limit_idx + } else if fetched_size <= max_size { + end_idx += 1; + end_idx < limit_idx + } else { + false + } + }) + .count(); + // Cache either is empty or contains latest log. Hence we don't need to fetch log + // from rocksdb anymore. 
+ assert!(end_idx == limit_idx || fetched_size > max_size); + let (first, second) = tikv_util::slices_in_range(&self.cache, start_idx, end_idx); + ents.extend_from_slice(first); + ents.extend_from_slice(second); + } + + fn append(&mut self, region_id: u64, peer_id: u64, entries: &[Entry]) { + if !entries.is_empty() { + let mut mem_size_change = 0; + let old_capacity = self.cache.capacity(); + mem_size_change += self.append_impl(region_id, peer_id, entries); + let new_capacity = self.cache.capacity(); + mem_size_change += Self::cache_vec_mem_size_change(new_capacity, old_capacity); + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + } + } + + fn append_impl(&mut self, region_id: u64, peer_id: u64, entries: &[Entry]) -> i64 { + let mut mem_size_change = 0; + + if let Some(cache_last_index) = self.cache.back().map(|e| e.get_index()) { + let first_index = entries[0].get_index(); + if cache_last_index >= first_index { + let cache_len = self.cache.len(); + let truncate_to = cache_len + .checked_sub((cache_last_index - first_index + 1) as usize) + .unwrap_or_default(); + let trunc_to_idx = self.cache[truncate_to].index; + for e in self.cache.drain(truncate_to..) { + mem_size_change -= + (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + } + if let Some(cached) = self.trace.back() { + // Only committed entries can be traced, and only uncommitted entries + // can be truncated. So there won't be any overlaps. + let cached_last = cached.range.end - 1; + assert!(cached_last < trunc_to_idx); + } + } else if cache_last_index + 1 < first_index { + panic!( + "[region {}] {} unexpected hole: {} < {}", + region_id, peer_id, cache_last_index, first_index + ); + } + } + + for e in entries { + self.cache.push_back(e.to_owned()); + mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + } + // In the past, the entry cache will be truncated if its size exceeds a certain number. 
+ // However, after introducing async write io, the entry must stay in cache if it's not + // persisted to raft db because the raft-rs may need to read entries.(e.g. leader sends + // MsgAppend to followers) + + mem_size_change + } + + pub fn entry(&self, idx: u64) -> Option<&Entry> { + let cache_low = self.cache.front()?.get_index(); + if idx >= cache_low { + Some(&self.cache[(idx - cache_low) as usize]) + } else { + None + } + } + + /// Compact all entries whose indexes are less than `idx`. + pub fn compact_to(&mut self, mut idx: u64) -> u64 { + if idx > self.persisted + 1 { + // Only the persisted entries can be compacted + idx = self.persisted + 1; + } + + let mut mem_size_change = 0; + + // Clean cached entries which have been already sent to apply threads. For example, + // if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and `compact_to(15)` + // is called, only [20, 30) will still be kept in cache. + let old_trace_cap = self.trace.capacity(); + while let Some(cached_entries) = self.trace.pop_front() { + if cached_entries.range.start >= idx { + self.trace.push_front(cached_entries); + let trace_len = self.trace.len(); + let trace_cap = self.trace.capacity(); + if trace_len < SHRINK_CACHE_CAPACITY && trace_cap > SHRINK_CACHE_CAPACITY { + self.trace.shrink_to(SHRINK_CACHE_CAPACITY); + } + break; + } + let (_, dangle_size) = cached_entries.take_entries(); + mem_size_change -= dangle_size as i64; + idx = cmp::max(cached_entries.range.end, idx); + } + let new_trace_cap = self.trace.capacity(); + mem_size_change += Self::trace_vec_mem_size_change(new_trace_cap, old_trace_cap); + + let cache_first_idx = self.first_index().unwrap_or(u64::MAX); + if cache_first_idx >= idx { + self.flush_mem_size_change(mem_size_change); + assert!(mem_size_change <= 0); + return -mem_size_change as u64; + } + + let cache_last_idx = self.cache.back().unwrap().get_index(); + // Use `cache_last_idx + 1` to make sure cache can be cleared completely if necessary. 
+ let compact_to = (cmp::min(cache_last_idx + 1, idx) - cache_first_idx) as usize; + for e in self.cache.drain(..compact_to) { + mem_size_change -= (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64 + } + + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + assert!(mem_size_change <= 0); + -mem_size_change as u64 + } + + fn total_mem_size(&self) -> i64 { + let data_size: i64 = self + .cache + .iter() + .map(|e| (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64) + .sum(); + let cache_vec_size = Self::cache_vec_mem_size_change(self.cache.capacity(), 0); + let trace_vec_size = Self::trace_vec_mem_size_change(self.trace.capacity(), 0); + data_size + cache_vec_size + trace_vec_size + } + + fn cache_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { + ENTRY_MEM_SIZE as i64 * (new_capacity as i64 - old_capacity as i64) + } + + fn trace_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { + mem::size_of::() as i64 * (new_capacity as i64 - old_capacity as i64) + } + + fn flush_mem_size_change(&self, mem_size_change: i64) { + #[cfg(test)] + if let Some(size_change_cb) = self.size_change_cb.as_ref() { + size_change_cb(mem_size_change); + } + let event = if mem_size_change > 0 { + TraceEvent::Add(mem_size_change as usize) + } else { + TraceEvent::Sub(-mem_size_change as usize) + }; + MEMTRACE_ENTRY_CACHE.trace(event); + RAFT_ENTRIES_CACHES_GAUGE.add(mem_size_change); + } + + fn flush_stats(&self) { + let hit = self.hit.replace(0); + RAFT_ENTRY_FETCHES.hit.inc_by(hit); + let miss = self.miss.replace(0); + RAFT_ENTRY_FETCHES.miss.inc_by(miss); + } + + #[inline] + fn is_empty(&self) -> bool { + self.cache.is_empty() + } + + fn trace_cached_entries(&mut self, entries: CachedEntries) { + let dangle_size = { + let mut guard = entries.entries.lock().unwrap(); + + let last_idx = guard.0.last().map(|e| e.index).unwrap(); + let cache_front = match self.cache.front().map(|e| e.index) 
{ + Some(i) => i, + None => u64::MAX, + }; + + let dangle_range = if last_idx < cache_front { + // All entries are not in entry cache. + 0..guard.0.len() + } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { + // Some entries are in entry cache. + 0..i + } else { + // All entries are in entry cache. + 0..0 + }; + + let mut size = 0; + for e in &guard.0[dangle_range] { + size += bytes_capacity(&e.data) + bytes_capacity(&e.context); + } + guard.1 = size; + size + }; + + let old_capacity = self.trace.capacity(); + self.trace.push_back(entries); + let new_capacity = self.trace.capacity(); + let diff = Self::trace_vec_mem_size_change(new_capacity, old_capacity); + + self.flush_mem_size_change(diff + dangle_size as i64); + } + + fn shrink_if_necessary(&mut self) -> i64 { + if self.cache.len() < SHRINK_CACHE_CAPACITY && self.cache.capacity() > SHRINK_CACHE_CAPACITY + { + let old_capacity = self.cache.capacity(); + self.cache.shrink_to_fit(); + let new_capacity = self.cache.capacity(); + return Self::cache_vec_mem_size_change(new_capacity, old_capacity); + } + 0 + } + + fn update_persisted(&mut self, persisted: u64) { + self.persisted = persisted; + } +} + +impl Default for EntryCache { + fn default() -> Self { + let entry_cache = EntryCache { + persisted: 0, + cache: Default::default(), + trace: Default::default(), + hit: Cell::new(0), + miss: Cell::new(0), + #[cfg(test)] + size_change_cb: None, + }; + entry_cache.flush_mem_size_change(entry_cache.total_mem_size()); + entry_cache + } +} + +impl Drop for EntryCache { + fn drop(&mut self) { + let mem_size_change = self.total_mem_size(); + self.flush_mem_size_change(-mem_size_change); + self.flush_stats(); + } +} + +#[derive(Debug, PartialEq)] +pub enum RaftlogFetchState { + Fetching, + Fetched(Box), +} + +#[derive(Debug, PartialEq)] +pub struct RaftlogFetchResult { + pub ents: raft::Result>, + // because entries may be empty, so store the original low index that the task issued + pub low: u64, + 
// the original max size that the task issued + pub max_size: u64, + // if the ents hit max_size + pub hit_size_limit: bool, + // the times that async fetch have already tried + pub tried_cnt: usize, + // the term when the task issued + pub term: u64, +} + +#[derive(Default)] +struct AsyncFetchStats { + async_fetch: Cell, + sync_fetch: Cell, + fallback_fetch: Cell, + fetch_invalid: Cell, + fetch_unused: Cell, +} + +impl AsyncFetchStats { + fn flush_stats(&mut self) { + RAFT_ENTRY_FETCHES + .async_fetch + .inc_by(self.async_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .sync_fetch + .inc_by(self.sync_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .fallback_fetch + .inc_by(self.fallback_fetch.replace(0)); + RAFT_ENTRY_FETCHES + .fetch_invalid + .inc_by(self.fetch_invalid.replace(0)); + RAFT_ENTRY_FETCHES + .fetch_unused + .inc_by(self.fetch_unused.replace(0)); + } +} + +/// A subset of `PeerStorage` that focus on accessing log entries. +pub struct EntryStorage { + region_id: u64, + peer_id: u64, + raft_engine: ER, + cache: EntryCache, + raft_state: RaftLocalState, + apply_state: RaftApplyState, + last_term: u64, + applied_term: u64, + raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_stats: AsyncFetchStats, + async_fetch_results: RefCell>, +} + +impl EntryStorage { + pub fn new( + region_id: u64, + peer_id: u64, + raft_engine: ER, + raft_state: RaftLocalState, + apply_state: RaftApplyState, + last_term: u64, + applied_term: u64, + raftlog_fetch_scheduler: Scheduler, + ) -> Self { + EntryStorage { + region_id, + peer_id, + raft_engine, + cache: EntryCache::default(), + raft_state, + apply_state, + last_term, + applied_term, + raftlog_fetch_scheduler, + raftlog_fetch_stats: AsyncFetchStats::default(), + async_fetch_results: RefCell::new(HashMap::default()), + } + } + + fn check_range(&self, low: u64, high: u64) -> raft::Result<()> { + if low > high { + return Err(storage_error(format!( + "low: {} is greater that high: {}", + low, high + ))); + } else if low <= 
self.truncated_index() { + return Err(raft::Error::Store(StorageError::Compacted)); + } else if high > self.last_index() + 1 { + return Err(storage_error(format!( + "entries' high {} is out of bound lastindex {}", + high, + self.last_index() + ))); + } + Ok(()) + } + + pub fn clean_async_fetch_res(&mut self, low: u64) { + self.async_fetch_results.borrow_mut().remove(&low); + } + + // Update the async fetch result. + // None indicates cleanning the fetched result. + pub fn update_async_fetch_res(&mut self, low: u64, res: Option>) { + // If it's in fetching, don't clean the async fetch result. + if self.async_fetch_results.borrow().get(&low) == Some(&RaftlogFetchState::Fetching) + && res.is_none() + { + return; + } + + match res { + Some(res) => { + if let Some(RaftlogFetchState::Fetched(prev)) = self + .async_fetch_results + .borrow_mut() + .insert(low, RaftlogFetchState::Fetched(res)) + { + info!( + "unconsumed async fetch res"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "res" => ?prev, + "low" => low, + ); + } + } + None => { + let prev = self.async_fetch_results.borrow_mut().remove(&low); + if prev.is_some() { + self.raftlog_fetch_stats.fetch_unused.update(|m| m + 1); + } + } + } + } + + fn async_fetch( + &self, + region_id: u64, + low: u64, + high: u64, + max_size: u64, + context: GetEntriesContext, + buf: &mut Vec, + ) -> raft::Result { + if let Some(RaftlogFetchState::Fetching) = self.async_fetch_results.borrow().get(&low) { + // already an async fetch in flight + return Err(raft::Error::Store( + raft::StorageError::LogTemporarilyUnavailable, + )); + } + + let tried_cnt = if let Some(RaftlogFetchState::Fetched(res)) = + self.async_fetch_results.borrow_mut().remove(&low) + { + assert_eq!(res.low, low); + let mut ents = res.ents?; + let first = ents.first().map(|e| e.index).unwrap(); + assert_eq!(first, res.low); + let last = ents.last().map(|e| e.index).unwrap(); + + if last + 1 >= high { + // async fetch res covers [low, high) + 
ents.truncate((high - first) as usize); + assert_eq!(ents.last().map(|e| e.index).unwrap(), high - 1); + if max_size < res.max_size { + limit_size(&mut ents, Some(max_size)); + } + let count = ents.len(); + buf.append(&mut ents); + fail_point!("on_async_fetch_return"); + return Ok(count); + } else if res.hit_size_limit && max_size <= res.max_size { + // async fetch res doesn't cover [low, high) due to hit size limit + if max_size < res.max_size { + limit_size(&mut ents, Some(max_size)); + }; + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } else if last + RAFT_LOG_MULTI_GET_CNT > high - 1 + && res.tried_cnt + 1 == MAX_ASYNC_FETCH_TRY_CNT + { + let mut fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size() as u64); + if max_size <= fetched_size { + limit_size(&mut ents, Some(max_size)); + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } + + // the count of left entries isn't too large, fetch the remaining entries synchronously one by one + for idx in last + 1..high { + let ent = self.raft_engine.get_entry(region_id, idx)?; + match ent { + None => { + return Err(raft::Error::Store(raft::StorageError::Unavailable)); + } + Some(ent) => { + let size = ent.compute_size() as u64; + if fetched_size + size > max_size { + break; + } else { + fetched_size += size; + ents.push(ent); + } + } + } + } + let count = ents.len(); + buf.append(&mut ents); + return Ok(count); + } + info!( + "async fetch invalid"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "first" => first, + "last" => last, + "low" => low, + "high" => high, + "max_size" => max_size, + "res_max_size" => res.max_size, + ); + // low index or max size is changed, the result is not fit for the current range, so refetch again. 
+ self.raftlog_fetch_stats.fetch_invalid.update(|m| m + 1); + res.tried_cnt + 1 + } else { + 1 + }; + + // the first/second try: get [low, high) asynchronously + // the third try: + // - if term and low are matched: use result of [low, persisted) and get [persisted, high) synchronously + // - else: get [low, high) synchronously + if tried_cnt >= MAX_ASYNC_FETCH_TRY_CNT { + // even the larger range is invalid again, fallback to fetch in sync way + self.raftlog_fetch_stats.fallback_fetch.update(|m| m + 1); + let count = self.raft_engine.fetch_entries_to( + region_id, + low, + high, + Some(max_size as usize), + buf, + )?; + return Ok(count); + } + + self.raftlog_fetch_stats.async_fetch.update(|m| m + 1); + self.async_fetch_results + .borrow_mut() + .insert(low, RaftlogFetchState::Fetching); + self.raftlog_fetch_scheduler + .schedule(RaftlogFetchTask::PeerStorage { + region_id, + context, + low, + high, + max_size: (max_size as usize), + tried_cnt, + term: self.hard_state().get_term(), + }) + .unwrap(); + Err(raft::Error::Store( + raft::StorageError::LogTemporarilyUnavailable, + )) + } + + pub fn entries( + &self, + low: u64, + high: u64, + max_size: u64, + context: GetEntriesContext, + ) -> raft::Result> { + self.check_range(low, high)?; + let mut ents = + Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); + if low == high { + return Ok(ents); + } + let cache_low = self.cache.first_index().unwrap_or(u64::MAX); + if high <= cache_low { + self.cache.miss.update(|m| m + 1); + return if context.can_async() { + self.async_fetch(self.region_id, low, high, max_size, context, &mut ents)?; + Ok(ents) + } else { + self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); + self.raft_engine.fetch_entries_to( + self.region_id, + low, + high, + Some(max_size as usize), + &mut ents, + )?; + Ok(ents) + }; + } + let begin_idx = if low < cache_low { + self.cache.miss.update(|m| m + 1); + let fetched_count = if context.can_async() { + 
self.async_fetch(self.region_id, low, cache_low, max_size, context, &mut ents)? + } else { + self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); + self.raft_engine.fetch_entries_to( + self.region_id, + low, + cache_low, + Some(max_size as usize), + &mut ents, + )? + }; + if fetched_count < (cache_low - low) as usize { + // Less entries are fetched than expected. + return Ok(ents); + } + cache_low + } else { + low + }; + self.cache.hit.update(|h| h + 1); + let fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size()); + self.cache + .fetch_entries_to(begin_idx, high, fetched_size as u64, max_size, &mut ents); + Ok(ents) + } + + pub fn term(&self, idx: u64) -> raft::Result { + if idx == self.truncated_index() { + return Ok(self.truncated_term()); + } + self.check_range(idx, idx + 1)?; + if self.truncated_term() == self.last_term || idx == self.last_index() { + return Ok(self.last_term); + } + if let Some(e) = self.cache.entry(idx) { + Ok(e.get_term()) + } else { + Ok(self + .raft_engine + .get_entry(self.region_id, idx) + .unwrap() + .unwrap() + .get_term()) + } + } + + #[inline] + pub fn first_index(&self) -> u64 { + first_index(&self.apply_state) + } + + #[inline] + pub fn last_index(&self) -> u64 { + last_index(&self.raft_state) + } + + #[inline] + pub fn last_term(&self) -> u64 { + self.last_term + } + + #[inline] + pub fn set_last_term(&mut self, term: u64) { + self.last_term = term; + } + + #[inline] + pub fn set_applied_term(&mut self, applied_term: u64) { + self.applied_term = applied_term; + } + + #[inline] + pub fn applied_term(&self) -> u64 { + self.applied_term + } + + #[inline] + pub fn raft_state(&self) -> &RaftLocalState { + &self.raft_state + } + + #[inline] + pub fn raft_state_mut(&mut self) -> &mut RaftLocalState { + &mut self.raft_state + } + + #[inline] + pub fn applied_index(&self) -> u64 { + self.apply_state.get_applied_index() + } + + #[inline] + pub fn set_applied_state(&mut self, apply_state: RaftApplyState) { + self.apply_state 
= apply_state; + } + + #[inline] + pub fn apply_state(&self) -> &RaftApplyState { + &self.apply_state + } + + #[inline] + pub fn apply_state_mut(&mut self) -> &mut RaftApplyState { + &mut self.apply_state + } + + #[inline] + pub fn commit_index(&self) -> u64 { + self.raft_state.get_hard_state().get_commit() + } + + #[inline] + pub fn set_commit_index(&mut self, commit: u64) { + assert!(commit >= self.commit_index()); + self.raft_state.mut_hard_state().set_commit(commit); + } + + #[inline] + pub fn hard_state(&self) -> &HardState { + self.raft_state.get_hard_state() + } + + #[inline] + pub fn truncated_index(&self) -> u64 { + self.apply_state.get_truncated_state().get_index() + } + + #[inline] + pub fn truncated_term(&self) -> u64 { + self.apply_state.get_truncated_state().get_term() + } + + // Append the given entries to the raft log using previous last index or self.last_index. + pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { + if entries.is_empty() { + return; + } + debug!( + "append entries"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "count" => entries.len(), + ); + let prev_last_index = self.raft_state.get_last_index(); + + let (last_index, last_term) = { + let e = entries.last().unwrap(); + (e.get_index(), e.get_term()) + }; + + self.cache.append(self.region_id, self.peer_id, &entries); + + task.entries = entries; + // Delete any previously appended log entries which never committed. + task.cut_logs = Some((last_index + 1, prev_last_index + 1)); + + self.raft_state.set_last_index(last_index); + self.last_term = last_term; + } + + pub fn compact_entry_cache(&mut self, idx: u64) { + self.cache.compact_to(idx); + } + + #[inline] + pub fn is_entry_cache_empty(&self) -> bool { + self.cache.is_empty() + } + + /// Evict entries from the cache. 
+ pub fn evict_entry_cache(&mut self, half: bool) { + if !self.is_entry_cache_empty() { + let cache = &mut self.cache; + let cache_len = cache.cache.len(); + let drain_to = if half { cache_len / 2 } else { cache_len - 1 }; + let idx = cache.cache[drain_to].index; + let mem_size_change = cache.compact_to(idx + 1); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } + } + + #[inline] + pub fn flush_entry_cache_metrics(&mut self) { + // NOTE: memory usage of entry cache is flushed realtime. + self.cache.flush_stats(); + self.raftlog_fetch_stats.flush_stats(); + } + + pub fn raft_engine(&self) -> &ER { + &self.raft_engine + } + + pub fn update_cache_persisted(&mut self, persisted: u64) { + self.cache.update_persisted(persisted); + } + + pub fn trace_cached_entries(&mut self, entries: CachedEntries) { + self.cache.trace_cached_entries(entries); + } + + pub fn clear(&mut self) { + self.cache = EntryCache::default(); + } +} + +#[cfg(test)] +pub mod tests { + use std::sync::mpsc; + + use engine_test::raft::RaftTestEngine; + use engine_traits::RaftEngineReadOnly; + use protobuf::Message; + use raft::{GetEntriesContext, StorageError}; + use tempfile::Builder; + use tikv_util::worker::{dummy_scheduler, LazyWorker, Worker}; + + use super::*; + use crate::store::peer_storage::tests::{append_ents, new_entry, new_storage_from_ents}; + + impl EntryCache { + fn new_with_cb(cb: impl Fn(i64) + Send + 'static) -> Self { + let entry_cache = EntryCache { + persisted: 0, + cache: Default::default(), + trace: Default::default(), + hit: Cell::new(0), + miss: Cell::new(0), + size_change_cb: Some(Box::new(cb) as Box), + }; + entry_cache.flush_mem_size_change(entry_cache.total_mem_size()); + entry_cache + } + } + + pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { + assert_eq!(store.cache.cache, exp_ents); + for e in exp_ents { + let entry = store + .raft_engine + .get_entry(store.region_id, e.get_index()) + .unwrap() + .unwrap(); + assert_eq!(entry, *e); + } + } + + 
#[test] + fn test_storage_cache_size_change() { + let new_padded_entry = |index: u64, term: u64, pad_len: usize| { + let mut e = new_entry(index, term); + e.data = vec![b'x'; pad_len].into(); + e + }; + + // Test the initial data structure size. + let (tx, rx) = mpsc::sync_channel(8); + let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); + assert_eq!(rx.try_recv().unwrap(), 896); + + cache.append( + 0, + 0, + &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], + ); + assert_eq!(rx.try_recv().unwrap(), 3); + + // Test size change for one overlapped entry. + cache.append(0, 0, &[new_padded_entry(102, 2, 3)]); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test size change for all overlapped entries. + cache.append( + 0, + 0, + &[new_padded_entry(101, 3, 4), new_padded_entry(102, 3, 5)], + ); + assert_eq!(rx.try_recv().unwrap(), 5); + + cache.append(0, 0, &[new_padded_entry(103, 3, 6)]); + assert_eq!(rx.try_recv().unwrap(), 6); + + // Test trace a dangle entry. + let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); + cache.trace_cached_entries(cached_entries); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test trace an entry which is still in cache. + let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); + cache.trace_cached_entries(cached_entries); + assert_eq!(rx.try_recv().unwrap(), 0); + + // Test compare `cached_last` with `trunc_to_idx` in `EntryCache::append_impl`. + cache.append(0, 0, &[new_padded_entry(103, 4, 7)]); + assert_eq!(rx.try_recv().unwrap(), 1); + + // Test compact one traced dangle entry and one entry in cache. + cache.persisted = 101; + cache.compact_to(102); + assert_eq!(rx.try_recv().unwrap(), -5); + + // Test compact the last traced dangle entry. + cache.persisted = 102; + cache.compact_to(103); + assert_eq!(rx.try_recv().unwrap(), -5); + + // Test compact all entries. 
+ cache.persisted = 103; + cache.compact_to(104); + assert_eq!(rx.try_recv().unwrap(), -7); + + drop(cache); + assert_eq!(rx.try_recv().unwrap(), -896); + } + + #[test] + fn test_storage_cache_entry() { + let mut cache = EntryCache::default(); + let ents = vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 4), + new_entry(6, 6), + ]; + cache.append(0, 0, &ents); + assert!(cache.entry(1).is_none()); + assert!(cache.entry(2).is_none()); + for e in &ents { + assert_eq!(e, cache.entry(e.get_index()).unwrap()); + } + let res = panic_hook::recover_safe(|| cache.entry(7)); + assert!(res.is_err()); + } + + #[test] + fn test_async_fetch() { + let ents = vec![ + new_entry(2, 2), + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 6), + ]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + + let max_u64 = u64::max_value(); + let mut tests = vec![ + // already compacted + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Err(raft::Error::Store(StorageError::Compacted)), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::Compacted)), + vec![], + ), + // fetch partial entries due to max size limit + ( + 3, + 7, + 30, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: 30, + hit_size_limit: true, + tried_cnt: 1, + term: 1, + }, + Ok(3), + ents[1..4].to_vec(), + ), + // fetch all entries + ( + 2, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(5), + ents.clone(), + ), + // high is smaller than before + ( + 3, + 5, + max_u64, + 1, + 
RaftlogFetchResult { + ents: Ok(ents[1..].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(2), + ents[1..3].to_vec(), + ), + // high is larger than before, second try + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // high is larger than before, thrid try + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 2, + term: 1, + }, + Ok(4), + ents[1..].to_vec(), + ), + // max size is smaller than before + ( + 2, + 7, + 10, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(2), + ents[..2].to_vec(), + ), + // max size is larger than before but with lower high + ( + 2, + 5, + 40, + 1, + RaftlogFetchResult { + ents: Ok(ents.clone()), + low: 2, + max_size: 30, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Ok(3), + ents[..3].to_vec(), + ), + // low index is smaller than before + ( + 2, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Err(raft::Error::Store(StorageError::Compacted)), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // low index is larger than before + ( + 4, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(vec![]), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 1, + term: 1, + }, + Err(raft::Error::Store(StorageError::LogTemporarilyUnavailable)), + vec![], + ), + // hit tried several lmit + ( + 3, + 7, + max_u64, + 1, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: 
MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }, + Ok(4), + ents[1..5].to_vec(), + ), + // term is changed + ( + 3, + 7, + max_u64, + 2, + RaftlogFetchResult { + ents: Ok(ents[1..4].to_vec()), + low: 3, + max_size: max_u64, + hit_size_limit: false, + tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }, + Ok(4), + ents[1..5].to_vec(), + ), + ]; + + for (i, (lo, hi, maxsize, term, async_res, expected_res, expected_ents)) in + tests.drain(..).enumerate() + { + if async_res.low != lo { + store.clean_async_fetch_res(lo); + } else { + store.update_async_fetch_res(lo, Some(Box::new(async_res))); + } + let mut ents = vec![]; + store.raft_state.mut_hard_state().set_term(term); + let res = store.async_fetch( + store.get_region_id(), + lo, + hi, + maxsize, + GetEntriesContext::empty(true), + &mut ents, + ); + if res != expected_res { + panic!("#{}: expect result {:?}, got {:?}", i, expected_res, res); + } + if ents != expected_ents { + panic!("#{}: expect ents {:?}, got {:?}", i, expected_ents, ents); + } + } + } + + #[test] + fn test_storage_append() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut tests = vec![ + ( + vec![new_entry(4, 6), new_entry(5, 6)], + vec![new_entry(4, 6), new_entry(5, 6)], + ), + ( + vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + ), + // truncate the existing entries and append + (vec![new_entry(4, 5)], vec![new_entry(4, 5)]), + // direct append + ( + vec![new_entry(6, 5)], + vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], + ), + ]; + for (i, (entries, wentries)) in tests.drain(..).enumerate() { + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + append_ents(&mut store, &entries); + let li = store.last_index().unwrap(); + 
let actual_entries = store + .entries(4, li + 1, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + if actual_entries != wentries { + panic!("#{}: want {:?}, got {:?}", i, wentries, actual_entries); + } + } + } + + #[test] + fn test_storage_cache_fetch() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + store.cache.cache.clear(); + // empty cache should fetch data from rocksdb directly. + let mut res = store + .entries(4, 6, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(*res, ents[1..]); + + let entries = vec![new_entry(6, 5), new_entry(7, 5)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // direct cache access + res = store + .entries(6, 8, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, entries); + + // size limit should be supported correctly. + res = store + .entries(4, 8, 0, GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, vec![new_entry(4, 4)]); + let mut size: u64 = ents[1..].iter().map(|e| u64::from(e.compute_size())).sum(); + res = store + .entries(4, 8, size, GetEntriesContext::empty(false)) + .unwrap(); + let mut exp_res = ents[1..].to_vec(); + assert_eq!(res, exp_res); + for e in &entries { + size += u64::from(e.compute_size()); + exp_res.push(e.clone()); + res = store + .entries(4, 8, size, GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(res, exp_res); + } + + // range limit should be supported correctly. 
+ for low in 4..9 { + for high in low..9 { + let res = store + .entries(low, high, u64::max_value(), GetEntriesContext::empty(false)) + .unwrap(); + assert_eq!(*res, exp_res[low as usize - 4..high as usize - 4]); + } + } + } + + #[test] + fn test_storage_cache_update() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let worker = LazyWorker::new("snap-manager"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); + store.cache.cache.clear(); + + // initial cache + let mut entries = vec![new_entry(6, 5), new_entry(7, 5)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // rewrite + entries = vec![new_entry(6, 6), new_entry(7, 6)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // rewrite old entry + entries = vec![new_entry(5, 6), new_entry(6, 6)]; + append_ents(&mut store, &entries); + validate_cache(&store, &entries); + + // partial rewrite + entries = vec![new_entry(6, 7), new_entry(7, 7)]; + append_ents(&mut store, &entries); + let mut exp_res = vec![new_entry(5, 6), new_entry(6, 7), new_entry(7, 7)]; + validate_cache(&store, &exp_res); + + // direct append + entries = vec![new_entry(8, 7), new_entry(9, 7)]; + append_ents(&mut store, &entries); + exp_res.extend_from_slice(&entries); + validate_cache(&store, &exp_res); + + // rewrite middle + entries = vec![new_entry(7, 8)]; + append_ents(&mut store, &entries); + exp_res.truncate(2); + exp_res.push(new_entry(7, 8)); + validate_cache(&store, &exp_res); + + // compact to min(5 + 1, 7) + store.cache.persisted = 5; + store.compact_entry_cache(7); + exp_res = vec![new_entry(6, 7), new_entry(7, 8)]; + validate_cache(&store, &exp_res); + + // compact to min(7 + 1, 7) + store.cache.persisted = 7; + store.compact_entry_cache(7); + exp_res = vec![new_entry(7, 8)]; 
+ validate_cache(&store, &exp_res); + // compact all + store.compact_entry_cache(8); + validate_cache(&store, &[]); + // invalid compaction should be ignored. + store.compact_entry_cache(6); + } +} diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 88bff373760..98d12303b19 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -77,13 +77,14 @@ use crate::{ }, store::{ cmd_resp, + entry_storage::{self, CachedEntries}, fsm::RaftPollerBuilder, local_metrics::{RaftMetrics, TimeTracker}, memory::*, metrics::*, msg::{Callback, PeerMsg, ReadResponse, SignificantMsg}, peer::Peer, - peer_storage::{self, write_initial_apply_state, write_peer_state, CachedEntries}, + peer_storage::{write_initial_apply_state, write_peer_state}, util, util::{ admin_cmd_epoch_lookup, check_region_epoch, compare_region_epoch, is_learner, @@ -597,7 +598,7 @@ where apply_state: delegate.apply_state.clone(), exec_res: results, metrics: delegate.metrics.clone(), - applied_index_term: delegate.applied_index_term, + applied_term: delegate.applied_term, bucket_stat: delegate.buckets.clone().map(Box::new), }); } @@ -889,7 +890,7 @@ where /// to file, but KV data may not synced to file, so we will lose data. apply_state: RaftApplyState, /// The term of the raft log at applied index. - applied_index_term: u64, + applied_term: u64, /// The latest flushed applied index. last_flush_applied_index: u64, @@ -924,7 +925,7 @@ where pending_remove: false, last_flush_applied_index: reg.apply_state.get_applied_index(), apply_state: reg.apply_state, - applied_index_term: reg.applied_index_term, + applied_term: reg.applied_term, term: reg.term, stopped: false, handle_start: None, @@ -1094,7 +1095,7 @@ where apply_ctx.host.on_empty_cmd(&self.region, index, term); self.apply_state.set_applied_index(index); - self.applied_index_term = term; + self.applied_term = term; assert!(term > 0); // 1. 
When a peer become leader, it will send an empty entry. @@ -1302,7 +1303,7 @@ where } self.apply_state.set_applied_index(index); - self.applied_index_term = term; + self.applied_term = term; let cmd = Cmd::new(index, term, req.clone(), resp.clone()); let should_write = ctx.host.post_exec( @@ -2469,7 +2470,7 @@ where let prepare_merge = req.get_prepare_merge(); let index = prepare_merge.get_min_index(); - let first_index = peer_storage::first_index(&self.apply_state); + let first_index = entry_storage::first_index(&self.apply_state); if index < first_index { // We filter `CompactLog` command before. panic!( @@ -2713,7 +2714,7 @@ where let compact_index = req.get_compact_log().get_compact_index(); let resp = AdminResponse::default(); - let first_index = peer_storage::first_index(&self.apply_state); + let first_index = entry_storage::first_index(&self.apply_state); if compact_index <= first_index { debug!( "compact index <= first index, no need to compact"; @@ -3011,7 +3012,7 @@ pub struct Registration { pub id: u64, pub term: u64, pub apply_state: RaftApplyState, - pub applied_index_term: u64, + pub applied_term: u64, pub region: Region, pub pending_request_snapshot_count: Arc, pub is_merging: bool, @@ -3024,7 +3025,7 @@ impl Registration { id: peer.peer_id(), term: peer.term(), apply_state: peer.get_store().apply_state().clone(), - applied_index_term: peer.get_store().applied_index_term(), + applied_term: peer.get_store().applied_term(), region: peer.region().clone(), pending_request_snapshot_count: peer.pending_request_snapshot_count.clone(), is_merging: peer.pending_merge_state.is_some(), @@ -3110,7 +3111,7 @@ impl GenSnapTask { pub fn generate_and_schedule_snapshot( self, kv_snap: EK::Snapshot, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, region_sched: &Scheduler>, ) -> Result<()> @@ -3123,7 +3124,7 @@ impl GenSnapTask { region_id: self.region_id, notifier: self.snap_notifier, for_balance: self.for_balance, - 
last_applied_index_term, + last_applied_term, last_applied_state, canceled: self.canceled, // This snapshot may be held for a long time, which may cause too many @@ -3281,7 +3282,7 @@ where { pub region_id: u64, pub apply_state: RaftApplyState, - pub applied_index_term: u64, + pub applied_term: u64, pub exec_res: VecDeque>, pub metrics: ApplyMetrics, pub bucket_stat: Option>, @@ -3591,7 +3592,7 @@ where if let Err(e) = snap_task.generate_and_schedule_snapshot::( apply_ctx.engine.snapshot(), - self.delegate.applied_index_term, + self.delegate.applied_term, self.delegate.apply_state.clone(), &apply_ctx.region_scheduler, ) { @@ -4429,7 +4430,7 @@ mod tests { id: Default::default(), term: Default::default(), apply_state: Default::default(), - applied_index_term: Default::default(), + applied_term: Default::default(), region: Default::default(), pending_request_snapshot_count: Default::default(), is_merging: Default::default(), @@ -4444,7 +4445,7 @@ mod tests { id: self.id, term: self.term, apply_state: self.apply_state.clone(), - applied_index_term: self.applied_index_term, + applied_term: self.applied_term, region: self.region.clone(), pending_request_snapshot_count: self.pending_request_snapshot_count.clone(), is_merging: self.is_merging, @@ -4646,7 +4647,7 @@ mod tests { let mut reg = Registration { id: 1, term: 4, - applied_index_term: 5, + applied_term: 5, ..Default::default() }; reg.region.set_id(2); @@ -4659,7 +4660,7 @@ mod tests { assert!(!delegate.pending_remove); assert_eq!(delegate.apply_state, reg.apply_state); assert_eq!(delegate.term, reg.term); - assert_eq!(delegate.applied_index_term, reg.applied_index_term); + assert_eq!(delegate.applied_term, reg.applied_term); }); let (resp_tx, resp_rx) = mpsc::channel(); @@ -4735,10 +4736,10 @@ mod tests { assert!(apply_res.exec_res.is_empty()); // empty entry will make applied_index step forward and should write apply state to engine. 
assert_eq!(apply_res.metrics.written_keys, 1); - assert_eq!(apply_res.applied_index_term, 5); + assert_eq!(apply_res.applied_term, 5); validate(&router, 2, |delegate| { assert_eq!(delegate.term, 11); - assert_eq!(delegate.applied_index_term, 5); + assert_eq!(delegate.applied_term, 5); assert_eq!(delegate.apply_state.get_applied_index(), 5); assert_eq!( delegate.apply_state.get_applied_index(), @@ -5079,7 +5080,7 @@ mod tests { assert_eq!(engine.get_value(&dk_k2).unwrap().unwrap(), b"v1"); assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); validate(&router, 1, |delegate| { - assert_eq!(delegate.applied_index_term, 1); + assert_eq!(delegate.applied_term, 1); assert_eq!(delegate.apply_state.get_applied_index(), 1); }); fetch_apply_res(&rx); @@ -5092,7 +5093,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.region_id, 1); assert_eq!(apply_res.apply_state.get_applied_index(), 2); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert!(apply_res.exec_res.is_empty()); assert!(apply_res.metrics.written_bytes >= 5); assert_eq!(apply_res.metrics.written_keys, 2); @@ -5120,7 +5121,7 @@ mod tests { let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_epoch_not_match()); let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert_eq!(apply_res.apply_state.get_applied_index(), 3); let put_entry = EntryBuilder::new(4, 2) @@ -5141,7 +5142,7 @@ mod tests { let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_key_not_in_region()); let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 2); + assert_eq!(apply_res.applied_term, 2); assert_eq!(apply_res.apply_state.get_applied_index(), 4); // a writebatch should be atomic. 
assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); @@ -5235,7 +5236,7 @@ mod tests { assert!(apply_res.exec_res.is_empty()); // The entry should be applied now. let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 8); // UploadSST @@ -5312,15 +5313,15 @@ mod tests { // The region was rescheduled low-priority becasuee of ingest command, // only put entry has been applied; let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 9); // The region will yield after timeout. let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 10); // The third entry should be applied now. let apply_res = fetch_apply_res(&rx); - assert_eq!(apply_res.applied_index_term, 3); + assert_eq!(apply_res.applied_term, 3); assert_eq!(apply_res.apply_state.get_applied_index(), 11); let write_batch_max_keys = ::WRITE_BATCH_MAX_KEYS; @@ -5689,7 +5690,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); // applied_index can still be advanced. assert_eq!(apply_res.apply_state.get_applied_index(), index_id); - assert_eq!(apply_res.applied_index_term, 1); + assert_eq!(apply_res.applied_term, 1); // Executing CompactLog is filtered and takes no effect. assert_eq!(apply_res.exec_res.len(), 0); assert_eq!(apply_res.apply_state.get_truncated_state().get_index(), 0); @@ -5708,7 +5709,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); // applied_index can still be advanced. assert_eq!(apply_res.apply_state.get_applied_index(), index_id); - assert_eq!(apply_res.applied_index_term, 1); + assert_eq!(apply_res.applied_term, 1); // We can get exec result of CompactLog. 
assert_eq!(apply_res.exec_res.len(), 1); assert_eq!( @@ -5726,7 +5727,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); // applied_index can still be advanced. assert_eq!(apply_res.apply_state.get_applied_index(), index_id); - assert_eq!(apply_res.applied_index_term, 1); + assert_eq!(apply_res.applied_term, 1); // We can't get exec result of ComputeHash. assert_eq!(apply_res.exec_res.len(), 0); obs.filter_consistency_check.store(false, Ordering::SeqCst); @@ -5744,7 +5745,7 @@ mod tests { router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![split], vec![]))); let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.apply_state.get_applied_index(), index_id); - assert_eq!(apply_res.applied_index_term, 1); + assert_eq!(apply_res.applied_term, 1); let (_, r8) = if let ExecResult::SplitRegion { regions, derived: _, @@ -5768,7 +5769,7 @@ mod tests { router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![merge], vec![]))); let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.apply_state.get_applied_index(), index_id); - assert_eq!(apply_res.applied_index_term, 1); + assert_eq!(apply_res.applied_term, 1); // PrepareMerge will trigger commit. 
let state: RaftApplyState = engine .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f3bcd56eabf..baccd071690 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2110,7 +2110,7 @@ where self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, res.apply_state, - res.applied_index_term, + res.applied_term, &res.metrics, ); // After applying, several metrics are updated, report it to pd to diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 64c70bbc2e7..bd9564b1a63 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -2,6 +2,7 @@ pub mod cmd_resp; pub mod config; +pub mod entry_storage; pub mod fsm; pub mod memory; pub mod metrics; @@ -27,12 +28,17 @@ mod worker; #[cfg(any(test, feature = "testexport"))] pub use self::msg::PeerInternalStat; pub use self::{ + async_io::{ + write::{Worker as WriteWorker, WriteMsg, WriteTask}, + write_router::WriteRouter, + }, bootstrap::{ bootstrap_store, clear_prepare_bootstrap_cluster, clear_prepare_bootstrap_key, initial_region, prepare_bootstrap_cluster, }, compaction_guard::CompactionGuardGeneratorFactory, config::Config, + entry_storage::{EntryStorage, RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}, fsm::{check_sst_for_ingestion, DestroyPeerJob, RaftRouter, StoreInfo}, hibernate_state::{GroupState, HibernateState}, memory::*, @@ -45,8 +51,8 @@ pub use self::{ peer::{AbstractPeer, Peer, PeerStat, ProposalContext, RequestInspector, RequestPolicy}, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, - write_peer_state, PeerStorage, RaftlogFetchResult, SnapState, INIT_EPOCH_CONF_VER, - INIT_EPOCH_VER, MAX_INIT_ENTRY_COUNT, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + write_peer_state, PeerStorage, SnapState, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, + 
RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, read_queue::ReadIndexContext, region_snapshot::{RegionIterator, RegionSnapshot}, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 1a7954ca037..2bcaefff762 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2194,8 +2194,8 @@ where // TODO: It may cause read index to wait a long time. // There may be some values that are not applied by this leader yet but the old leader, - // if applied_index_term isn't equal to current term. - self.get_store().applied_index_term() == self.term() + // if applied_term isn't equal to current term. + self.get_store().applied_term() == self.term() // There may be stale read if the old leader splits really slow, // the new region may already elected a new leader while // the old leader still think it owns the split range. @@ -3136,7 +3136,7 @@ where &mut self, ctx: &mut PollContext, apply_state: RaftApplyState, - applied_index_term: u64, + applied_term: u64, apply_metrics: &ApplyMetrics, ) -> bool { let mut has_ready = false; @@ -3159,9 +3159,9 @@ where .compact_entry_cache(apply_state.applied_index + 1); } - let progress_to_be_updated = self.mut_store().applied_index_term() != applied_index_term; + let progress_to_be_updated = self.mut_store().applied_term() != applied_term; self.mut_store().set_applied_state(apply_state); - self.mut_store().set_applied_term(applied_index_term); + self.mut_store().set_applied_term(applied_term); self.peer_stat.written_keys += apply_metrics.written_keys; self.peer_stat.written_bytes += apply_metrics.written_bytes; @@ -3183,13 +3183,13 @@ where self.read_progress.update_applied(applied_index); - // Only leaders need to update applied_index_term. + // Only leaders need to update applied_term. 
if progress_to_be_updated && self.is_leader() { - if applied_index_term == self.term() { + if applied_term == self.term() { ctx.coprocessor_host .on_applied_current_term(StateRole::Leader, self.region()); } - let progress = ReadProgress::applied_index_term(applied_index_term); + let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id).unwrap(); self.maybe_update_read_progress(reader, progress); @@ -4223,7 +4223,7 @@ where return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", self.tag, - self.get_store().applied_index_term(), + self.get_store().applied_term(), self.term() )); } @@ -4437,11 +4437,11 @@ where // Actually, according to the implementation of conf change in raft-rs, this check must be // passed if the previous check that `pending_conf_index` should be less than or equal to // `self.get_store().applied_index()` is passed. - if self.get_store().applied_index_term() != self.term() { + if self.get_store().applied_term() != self.term() { return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", self.tag, - self.get_store().applied_index_term(), + self.get_store().applied_term(), self.term() )); } @@ -4908,7 +4908,7 @@ where let res = self.raft_group.raft.check_group_commit_consistent(); if Some(true) != res { let mut buffer: SmallVec<[(u64, u64, u64); 5]> = SmallVec::new(); - if self.get_store().applied_index_term() >= self.term() { + if self.get_store().applied_term() >= self.term() { let progress = self.raft_group.raft.prs(); for (id, p) in progress.iter() { if !progress.conf().voters().contains(*id) { @@ -5347,7 +5347,7 @@ where ER: RaftEngine, { fn has_applied_to_current_term(&mut self) -> bool { - self.get_store().applied_index_term() == self.term() + self.get_store().applied_term() == self.term() } fn inspect_lease(&mut self) -> LeaseState { diff --git 
a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index ec6cc3bcf11..8301c75e7c3 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2,23 +2,18 @@ // #[PerformanceCriticalPath] use std::{ - cell::{Cell, RefCell}, - cmp, - collections::VecDeque, - error, mem, - ops::Range, + cell::RefCell, + error, + ops::{Deref, DerefMut}, sync::{ atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, mpsc::{self, Receiver, TryRecvError}, - Arc, Mutex, + Arc, }, u64, }; -use collections::HashMap; -use engine_traits::{ - Engines, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, CF_RAFT, RAFT_LOG_MULTI_GET_CNT, -}; +use engine_traits::{Engines, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, CF_RAFT}; use fail::fail_point; use into_other::into_other; use keys::{self, enc_end_key, enc_start_key}; @@ -32,20 +27,20 @@ use protobuf::Message; use raft::{ self, eraftpb::{self, ConfState, Entry, HardState, Snapshot}, - util::limit_size, Error as RaftError, GetEntriesContext, RaftState, Ready, Storage, StorageError, }; -use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, }; -use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, SnapshotStatistics}; +use super::{ + entry_storage::last_index, metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, + SnapshotStatistics, +}; use crate::{ - bytes_capacity, store::{ - async_io::write::WriteTask, fsm::GenSnapTask, memory::*, peer::PersistSnapshotResult, util, - worker::RaftlogFetchTask, + async_io::write::WriteTask, entry_storage::EntryStorage, fsm::GenSnapTask, + peer::PersistSnapshotResult, util, worker::RaftlogFetchTask, }, Error, Result, }; @@ -55,17 +50,12 @@ use crate::{ pub const RAFT_INIT_LOG_TERM: u64 = 5; pub const RAFT_INIT_LOG_INDEX: u64 = 5; const MAX_SNAP_TRY_CNT: usize = 5; -const 
MAX_ASYNC_FETCH_TRY_CNT: usize = 3; - -pub const MAX_INIT_ENTRY_COUNT: usize = 1024; /// The initial region epoch version. pub const INIT_EPOCH_VER: u64 = 1; /// The initial region epoch conf_version. pub const INIT_EPOCH_CONF_VER: u64 = 1; -const SHRINK_CACHE_CAPACITY: usize = 64; - pub const JOB_STATUS_PENDING: usize = 0; pub const JOB_STATUS_RUNNING: usize = 1; pub const JOB_STATUS_CANCELLING: usize = 2; @@ -73,8 +63,6 @@ pub const JOB_STATUS_CANCELLED: usize = 3; pub const JOB_STATUS_FINISHED: usize = 4; pub const JOB_STATUS_FAILED: usize = 5; -const ENTRY_MEM_SIZE: usize = mem::size_of::(); - /// Possible status returned by `check_applying_snap`. #[derive(Debug, Clone, Copy, PartialEq)] pub enum CheckApplyingSnapStatus { @@ -112,312 +100,7 @@ impl PartialEq for SnapState { } } -#[inline] -pub fn first_index(state: &RaftApplyState) -> u64 { - state.get_truncated_state().get_index() + 1 -} - -#[inline] -pub fn last_index(state: &RaftLocalState) -> u64 { - state.get_last_index() -} - -struct EntryCache { - // The last index of persisted entry. - // It should be equal to `RaftLog::persisted`. 
- persisted: u64, - cache: VecDeque, - trace: VecDeque, - hit: Cell, - miss: Cell, - #[cfg(test)] - size_change_cb: Option>, -} - -impl EntryCache { - fn first_index(&self) -> Option { - self.cache.front().map(|e| e.get_index()) - } - - fn fetch_entries_to( - &self, - begin: u64, - end: u64, - mut fetched_size: u64, - max_size: u64, - ents: &mut Vec, - ) { - if begin >= end { - return; - } - assert!(!self.cache.is_empty()); - let cache_low = self.cache.front().unwrap().get_index(); - let start_idx = begin.checked_sub(cache_low).unwrap() as usize; - let limit_idx = end.checked_sub(cache_low).unwrap() as usize; - - let mut end_idx = start_idx; - self.cache - .iter() - .skip(start_idx) - .take_while(|e| { - let cur_idx = end_idx as u64 + cache_low; - assert_eq!(e.get_index(), cur_idx); - let m = u64::from(e.compute_size()); - fetched_size += m; - if fetched_size == m { - end_idx += 1; - fetched_size <= max_size && end_idx < limit_idx - } else if fetched_size <= max_size { - end_idx += 1; - end_idx < limit_idx - } else { - false - } - }) - .count(); - // Cache either is empty or contains latest log. Hence we don't need to fetch log - // from rocksdb anymore. 
- assert!(end_idx == limit_idx || fetched_size > max_size); - let (first, second) = tikv_util::slices_in_range(&self.cache, start_idx, end_idx); - ents.extend_from_slice(first); - ents.extend_from_slice(second); - } - - fn append(&mut self, tag: &str, entries: &[Entry]) { - if !entries.is_empty() { - let mut mem_size_change = 0; - let old_capacity = self.cache.capacity(); - mem_size_change += self.append_impl(tag, entries); - let new_capacity = self.cache.capacity(); - mem_size_change += Self::get_cache_vec_mem_size_change(new_capacity, old_capacity); - mem_size_change += self.shrink_if_necessary(); - self.flush_mem_size_change(mem_size_change); - } - } - - fn append_impl(&mut self, tag: &str, entries: &[Entry]) -> i64 { - let mut mem_size_change = 0; - - if let Some(cache_last_index) = self.cache.back().map(|e| e.get_index()) { - let first_index = entries[0].get_index(); - if cache_last_index >= first_index { - let cache_len = self.cache.len(); - let truncate_to = cache_len - .checked_sub((cache_last_index - first_index + 1) as usize) - .unwrap_or_default(); - let trunc_to_idx = self.cache[truncate_to].index; - for e in self.cache.drain(truncate_to..) { - mem_size_change -= - (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; - } - if let Some(cached) = self.trace.back() { - // Only committed entries can be traced, and only uncommitted entries - // can be truncated. So there won't be any overlaps. - let cached_last = cached.range.end - 1; - assert!(cached_last < trunc_to_idx); - } - } else if cache_last_index + 1 < first_index { - panic!( - "{} unexpected hole: {} < {}", - tag, cache_last_index, first_index - ); - } - } - - for e in entries { - self.cache.push_back(e.to_owned()); - mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; - } - // In the past, the entry cache will be truncated if its size exceeds a certain number. 
- // However, after introducing async write io, the entry must stay in cache if it's not - // persisted to raft db because the raft-rs may need to read entries.(e.g. leader sends - // MsgAppend to followers) - - mem_size_change - } - - pub fn entry(&self, idx: u64) -> Option<&Entry> { - let cache_low = self.cache.front()?.get_index(); - if idx >= cache_low { - Some(&self.cache[(idx - cache_low) as usize]) - } else { - None - } - } - - /// Compact all entries whose indexes are less than `idx`. - pub fn compact_to(&mut self, mut idx: u64) -> u64 { - if idx > self.persisted + 1 { - // Only the persisted entries can be compacted - idx = self.persisted + 1; - } - - let mut mem_size_change = 0; - - // Clean cached entries which have been already sent to apply threads. For example, - // if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and `compact_to(15)` - // is called, only [20, 30) will still be kept in cache. - let old_trace_cap = self.trace.capacity(); - while let Some(cached_entries) = self.trace.pop_front() { - if cached_entries.range.start >= idx { - self.trace.push_front(cached_entries); - let trace_len = self.trace.len(); - let trace_cap = self.trace.capacity(); - if trace_len < SHRINK_CACHE_CAPACITY && trace_cap > SHRINK_CACHE_CAPACITY { - self.trace.shrink_to(SHRINK_CACHE_CAPACITY); - } - break; - } - let (_, dangle_size) = cached_entries.take_entries(); - mem_size_change -= dangle_size as i64; - idx = cmp::max(cached_entries.range.end, idx); - } - let new_trace_cap = self.trace.capacity(); - mem_size_change += Self::get_trace_vec_mem_size_change(new_trace_cap, old_trace_cap); - - let cache_first_idx = self.first_index().unwrap_or(u64::MAX); - if cache_first_idx >= idx { - self.flush_mem_size_change(mem_size_change); - assert!(mem_size_change <= 0); - return -mem_size_change as u64; - } - - let cache_last_idx = self.cache.back().unwrap().get_index(); - // Use `cache_last_idx + 1` to make sure cache can be cleared completely if necessary. 
- let compact_to = (cmp::min(cache_last_idx + 1, idx) - cache_first_idx) as usize; - for e in self.cache.drain(..compact_to) { - mem_size_change -= (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64 - } - - mem_size_change += self.shrink_if_necessary(); - self.flush_mem_size_change(mem_size_change); - assert!(mem_size_change <= 0); - -mem_size_change as u64 - } - - fn get_total_mem_size(&self) -> i64 { - let data_size: i64 = self - .cache - .iter() - .map(|e| (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64) - .sum(); - let cache_vec_size = Self::get_cache_vec_mem_size_change(self.cache.capacity(), 0); - let trace_vec_size = Self::get_trace_vec_mem_size_change(self.trace.capacity(), 0); - data_size + cache_vec_size + trace_vec_size - } - - fn get_cache_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { - ENTRY_MEM_SIZE as i64 * (new_capacity as i64 - old_capacity as i64) - } - - fn get_trace_vec_mem_size_change(new_capacity: usize, old_capacity: usize) -> i64 { - mem::size_of::() as i64 * (new_capacity as i64 - old_capacity as i64) - } - - fn flush_mem_size_change(&self, mem_size_change: i64) { - #[cfg(test)] - if let Some(size_change_cb) = self.size_change_cb.as_ref() { - size_change_cb(mem_size_change); - } - let event = if mem_size_change > 0 { - TraceEvent::Add(mem_size_change as usize) - } else { - TraceEvent::Sub(-mem_size_change as usize) - }; - MEMTRACE_ENTRY_CACHE.trace(event); - RAFT_ENTRIES_CACHES_GAUGE.add(mem_size_change); - } - - fn flush_stats(&self) { - let hit = self.hit.replace(0); - RAFT_ENTRY_FETCHES.hit.inc_by(hit); - let miss = self.miss.replace(0); - RAFT_ENTRY_FETCHES.miss.inc_by(miss); - } - - #[inline] - fn is_empty(&self) -> bool { - self.cache.is_empty() - } - - fn trace_cached_entries(&mut self, entries: CachedEntries) { - let dangle_size = { - let mut guard = entries.entries.lock().unwrap(); - - let last_idx = guard.0.last().map(|e| e.index).unwrap(); - let cache_front = match 
self.cache.front().map(|e| e.index) { - Some(i) => i, - None => u64::MAX, - }; - - let dangle_range = if last_idx < cache_front { - // All entries are not in entry cache. - 0..guard.0.len() - } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { - // Some entries are in entry cache. - 0..i - } else { - // All entries are in entry cache. - 0..0 - }; - - let mut size = 0; - for e in &guard.0[dangle_range] { - size += bytes_capacity(&e.data) + bytes_capacity(&e.context); - } - guard.1 = size; - size - }; - - let old_capacity = self.trace.capacity(); - self.trace.push_back(entries); - let new_capacity = self.trace.capacity(); - let diff = Self::get_trace_vec_mem_size_change(new_capacity, old_capacity); - - self.flush_mem_size_change(diff + dangle_size as i64); - } - - fn shrink_if_necessary(&mut self) -> i64 { - if self.cache.len() < SHRINK_CACHE_CAPACITY && self.cache.capacity() > SHRINK_CACHE_CAPACITY - { - let old_capacity = self.cache.capacity(); - self.cache.shrink_to_fit(); - let new_capacity = self.cache.capacity(); - return Self::get_cache_vec_mem_size_change(new_capacity, old_capacity); - } - 0 - } - - fn update_persisted(&mut self, persisted: u64) { - self.persisted = persisted; - } -} - -impl Default for EntryCache { - fn default() -> Self { - let entry_cache = EntryCache { - persisted: 0, - cache: Default::default(), - trace: Default::default(), - hit: Cell::new(0), - miss: Cell::new(0), - #[cfg(test)] - size_change_cb: None, - }; - entry_cache.flush_mem_size_change(entry_cache.get_total_mem_size()); - entry_cache - } -} - -impl Drop for EntryCache { - fn drop(&mut self) { - let mem_size_change = self.get_total_mem_size(); - self.flush_mem_size_change(-mem_size_change); - self.flush_stats(); - } -} - -fn storage_error(error: E) -> raft::Error +pub fn storage_error(error: E) -> raft::Error where E: Into>, { @@ -480,7 +163,7 @@ pub fn recover_from_applying_state( Ok(()) } -fn init_applied_index_term( +fn init_applied_term( engines: 
&Engines, region: &Region, apply_state: &RaftApplyState, @@ -642,72 +325,30 @@ where peer_id: u64, region: metapb::Region, - raft_state: RaftLocalState, - apply_state: RaftApplyState, - applied_index_term: u64, - last_term: u64, snap_state: RefCell, gen_snap_task: RefCell>, region_scheduler: Scheduler>, snap_tried_cnt: RefCell, - cache: EntryCache, - - raftlog_fetch_scheduler: Scheduler, - raftlog_fetch_stats: AsyncFetchStats, - async_fetch_results: RefCell>, + entry_storage: EntryStorage, pub tag: String, } -#[derive(Debug, PartialEq)] -pub enum RaftlogFetchState { - Fetching, - Fetched(Box), -} - -#[derive(Debug, PartialEq)] -pub struct RaftlogFetchResult { - pub ents: raft::Result>, - // because entries may be empty, so store the original low index that the task issued - pub low: u64, - // the original max size that the task issued - pub max_size: u64, - // if the ents hit max_size - pub hit_size_limit: bool, - // the times that async fetch have already tried - pub tried_cnt: usize, - // the term when the task issued - pub term: u64, -} +impl Deref for PeerStorage { + type Target = EntryStorage; -#[derive(Default)] -struct AsyncFetchStats { - async_fetch: Cell, - sync_fetch: Cell, - fallback_fetch: Cell, - fetch_invalid: Cell, - fetch_unused: Cell, + #[inline] + fn deref(&self) -> &Self::Target { + &self.entry_storage + } } -impl AsyncFetchStats { - fn flush_stats(&mut self) { - RAFT_ENTRY_FETCHES - .async_fetch - .inc_by(self.async_fetch.replace(0)); - RAFT_ENTRY_FETCHES - .sync_fetch - .inc_by(self.sync_fetch.replace(0)); - RAFT_ENTRY_FETCHES - .fallback_fetch - .inc_by(self.fallback_fetch.replace(0)); - RAFT_ENTRY_FETCHES - .fetch_invalid - .inc_by(self.fetch_invalid.replace(0)); - RAFT_ENTRY_FETCHES - .fetch_unused - .inc_by(self.fetch_unused.replace(0)); +impl DerefMut for PeerStorage { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.entry_storage } } @@ -728,19 +369,20 @@ where context: GetEntriesContext, ) -> raft::Result> { let 
max_size = max_size.into(); - self.entries(low, high, max_size.unwrap_or(u64::MAX), context) + self.entry_storage + .entries(low, high, max_size.unwrap_or(u64::MAX), context) } fn term(&self, idx: u64) -> raft::Result { - self.term(idx) + self.entry_storage.term(idx) } fn first_index(&self) -> raft::Result { - Ok(self.first_index()) + Ok(self.entry_storage.first_index()) } fn last_index(&self) -> raft::Result { - Ok(self.last_index()) + Ok(self.entry_storage.last_index()) } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { @@ -773,25 +415,28 @@ where return Err(box_err!("{} validate state fail: {:?}", tag, e)); } let last_term = init_last_term(&engines, region, &raft_state, &apply_state)?; - let applied_index_term = init_applied_index_term(&engines, region, &apply_state)?; + let applied_term = init_applied_term(&engines, region, &apply_state)?; + let entry_storage = EntryStorage::new( + region.id, + peer_id, + engines.raft.clone(), + raft_state, + apply_state, + last_term, + applied_term, + raftlog_fetch_scheduler, + ); Ok(PeerStorage { engines, peer_id, region: region.clone(), - raft_state, - apply_state, snap_state: RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(None), region_scheduler, - raftlog_fetch_scheduler, snap_tried_cnt: RefCell::new(0), tag, - applied_index_term, - last_term, - cache: EntryCache::default(), - async_fetch_results: RefCell::new(HashMap::default()), - raftlog_fetch_stats: AsyncFetchStats::default(), + entry_storage, }) } @@ -800,14 +445,14 @@ where } pub fn initial_state(&self) -> raft::Result { - let hard_state = self.raft_state.get_hard_state().clone(); + let hard_state = self.raft_state().get_hard_state().clone(); if hard_state == HardState::default() { assert!( !self.is_initialized(), "peer for region {:?} is initialized but local state {:?} has empty hard \ state", self.region, - self.raft_state + self.raft_state() ); return Ok(RaftState::new(hard_state, ConfState::default())); @@ -818,348 +463,6 @@ where 
)) } - fn check_range(&self, low: u64, high: u64) -> raft::Result<()> { - if low > high { - return Err(storage_error(format!( - "low: {} is greater that high: {}", - low, high - ))); - } else if low <= self.truncated_index() { - return Err(RaftError::Store(StorageError::Compacted)); - } else if high > self.last_index() + 1 { - return Err(storage_error(format!( - "entries' high {} is out of bound lastindex {}", - high, - self.last_index() - ))); - } - Ok(()) - } - - pub fn clean_async_fetch_res(&mut self, low: u64) { - self.async_fetch_results.borrow_mut().remove(&low); - } - - // Update the async fetch result. - // None indicates cleanning the fetched result. - pub fn update_async_fetch_res(&mut self, low: u64, res: Option>) { - // If it's in fetching, don't clean the async fetch result. - if self.async_fetch_results.borrow().get(&low) == Some(&RaftlogFetchState::Fetching) - && res.is_none() - { - return; - } - - match res { - Some(res) => { - if let Some(RaftlogFetchState::Fetched(prev)) = self - .async_fetch_results - .borrow_mut() - .insert(low, RaftlogFetchState::Fetched(res)) - { - info!( - "unconsumed async fetch res"; - "region_id" => self.region.get_id(), - "peer_id" => self.peer_id, - "res" => ?prev, - "low" => low, - ); - } - } - None => { - let prev = self.async_fetch_results.borrow_mut().remove(&low); - if prev.is_some() { - self.raftlog_fetch_stats.fetch_unused.update(|m| m + 1); - } - } - } - } - - fn async_fetch( - &self, - region_id: u64, - low: u64, - high: u64, - max_size: u64, - context: GetEntriesContext, - buf: &mut Vec, - ) -> raft::Result { - if let Some(RaftlogFetchState::Fetching) = self.async_fetch_results.borrow().get(&low) { - // already an async fetch in flight - return Err(raft::Error::Store( - raft::StorageError::LogTemporarilyUnavailable, - )); - } - - let tried_cnt = if let Some(RaftlogFetchState::Fetched(res)) = - self.async_fetch_results.borrow_mut().remove(&low) - { - assert_eq!(res.low, low); - let mut ents = res.ents?; - let 
first = ents.first().map(|e| e.index).unwrap(); - assert_eq!(first, res.low); - let last = ents.last().map(|e| e.index).unwrap(); - - if last + 1 >= high { - // async fetch res covers [low, high) - ents.truncate((high - first) as usize); - assert_eq!(ents.last().map(|e| e.index).unwrap(), high - 1); - if max_size < res.max_size { - limit_size(&mut ents, Some(max_size)); - } - let count = ents.len(); - buf.append(&mut ents); - fail_point!("on_async_fetch_return"); - return Ok(count); - } else if res.hit_size_limit && max_size <= res.max_size { - // async fetch res doesn't cover [low, high) due to hit size limit - if max_size < res.max_size { - limit_size(&mut ents, Some(max_size)); - }; - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } else if last + RAFT_LOG_MULTI_GET_CNT > high - 1 - && res.tried_cnt + 1 == MAX_ASYNC_FETCH_TRY_CNT - { - let mut fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size() as u64); - if max_size <= fetched_size { - limit_size(&mut ents, Some(max_size)); - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } - - // the count of left entries isn't too large, fetch the remaining entries synchronously one by one - for idx in last + 1..high { - let ent = self.engines.raft.get_entry(region_id, idx)?; - match ent { - None => { - return Err(raft::Error::Store(raft::StorageError::Unavailable)); - } - Some(ent) => { - let size = ent.compute_size() as u64; - if fetched_size + size > max_size { - break; - } else { - fetched_size += size; - ents.push(ent); - } - } - } - } - let count = ents.len(); - buf.append(&mut ents); - return Ok(count); - } - info!( - "async fetch invalid"; - "region_id" => self.region.get_id(), - "peer_id" => self.peer_id, - "first" => first, - "last" => last, - "low" => low, - "high" => high, - "max_size" => max_size, - "res_max_size" => res.max_size, - ); - // low index or max size is changed, the result is not fit for the current range, so refetch again. 
- self.raftlog_fetch_stats.fetch_invalid.update(|m| m + 1); - res.tried_cnt + 1 - } else { - 1 - }; - - // the first/second try: get [low, high) asynchronously - // the third try: - // - if term and low are matched: use result of [low, persisted) and get [persisted, high) synchronously - // - else: get [low, high) synchronously - if tried_cnt >= MAX_ASYNC_FETCH_TRY_CNT { - // even the larger range is invalid again, fallback to fetch in sync way - self.raftlog_fetch_stats.fallback_fetch.update(|m| m + 1); - let count = self.engines.raft.fetch_entries_to( - region_id, - low, - high, - Some(max_size as usize), - buf, - )?; - return Ok(count); - } - - self.raftlog_fetch_stats.async_fetch.update(|m| m + 1); - self.async_fetch_results - .borrow_mut() - .insert(low, RaftlogFetchState::Fetching); - self.raftlog_fetch_scheduler - .schedule(RaftlogFetchTask::PeerStorage { - region_id, - context, - low, - high, - max_size: (max_size as usize), - tried_cnt, - term: self.hard_state().get_term(), - }) - .unwrap(); - Err(raft::Error::Store( - raft::StorageError::LogTemporarilyUnavailable, - )) - } - - pub fn entries( - &self, - low: u64, - high: u64, - max_size: u64, - context: GetEntriesContext, - ) -> raft::Result> { - self.check_range(low, high)?; - let mut ents = - Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); - if low == high { - return Ok(ents); - } - let region_id = self.get_region_id(); - let cache_low = self.cache.first_index().unwrap_or(u64::MAX); - if high <= cache_low { - self.cache.miss.update(|m| m + 1); - return if context.can_async() { - self.async_fetch(region_id, low, high, max_size, context, &mut ents)?; - Ok(ents) - } else { - self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); - self.engines.raft.fetch_entries_to( - region_id, - low, - high, - Some(max_size as usize), - &mut ents, - )?; - Ok(ents) - }; - } - let begin_idx = if low < cache_low { - self.cache.miss.update(|m| m + 1); - let fetched_count = if 
context.can_async() { - self.async_fetch(region_id, low, cache_low, max_size, context, &mut ents)? - } else { - self.raftlog_fetch_stats.sync_fetch.update(|m| m + 1); - self.engines.raft.fetch_entries_to( - region_id, - low, - cache_low, - Some(max_size as usize), - &mut ents, - )? - }; - if fetched_count < (cache_low - low) as usize { - // Less entries are fetched than expected. - return Ok(ents); - } - cache_low - } else { - low - }; - self.cache.hit.update(|h| h + 1); - let fetched_size = ents.iter().fold(0, |acc, e| acc + e.compute_size()); - self.cache - .fetch_entries_to(begin_idx, high, fetched_size as u64, max_size, &mut ents); - Ok(ents) - } - - pub fn term(&self, idx: u64) -> raft::Result { - if idx == self.truncated_index() { - return Ok(self.truncated_term()); - } - self.check_range(idx, idx + 1)?; - if self.truncated_term() == self.last_term || idx == self.last_index() { - return Ok(self.last_term); - } - if let Some(e) = self.cache.entry(idx) { - Ok(e.get_term()) - } else { - Ok(self - .engines - .raft - .get_entry(self.get_region_id(), idx) - .unwrap() - .unwrap() - .get_term()) - } - } - - #[inline] - pub fn first_index(&self) -> u64 { - first_index(&self.apply_state) - } - - #[inline] - pub fn last_index(&self) -> u64 { - last_index(&self.raft_state) - } - - #[inline] - pub fn last_term(&self) -> u64 { - self.last_term - } - - #[inline] - pub fn raft_state(&self) -> &RaftLocalState { - &self.raft_state - } - - #[inline] - pub fn applied_index(&self) -> u64 { - self.apply_state.get_applied_index() - } - - #[inline] - pub fn set_applied_state(&mut self, apply_state: RaftApplyState) { - self.apply_state = apply_state; - } - - #[inline] - pub fn set_applied_term(&mut self, applied_index_term: u64) { - self.applied_index_term = applied_index_term; - } - - #[inline] - pub fn apply_state(&self) -> &RaftApplyState { - &self.apply_state - } - - #[inline] - pub fn applied_index_term(&self) -> u64 { - self.applied_index_term - } - - #[inline] - pub fn 
commit_index(&self) -> u64 { - self.raft_state.get_hard_state().get_commit() - } - - #[inline] - pub fn set_commit_index(&mut self, commit: u64) { - assert!(commit >= self.commit_index()); - self.raft_state.mut_hard_state().set_commit(commit); - } - - #[inline] - pub fn hard_state(&self) -> &HardState { - self.raft_state.get_hard_state() - } - - #[inline] - pub fn truncated_index(&self) -> u64 { - self.apply_state.get_truncated_state().get_index() - } - - #[inline] - pub fn truncated_term(&self) -> u64 { - self.apply_state.get_truncated_state().get_term() - } - #[inline] pub fn region(&self) -> &metapb::Region { &self.region @@ -1181,7 +484,7 @@ where snapshot_index: u64, kv_wb: &mut impl Mutable, ) -> Result<()> { - let mut snapshot_raft_state = self.raft_state.clone(); + let mut snapshot_raft_state = self.raft_state().clone(); snapshot_raft_state .mut_hard_state() .set_commit(snapshot_index); @@ -1200,7 +503,7 @@ where kv_wb.put_msg_cf( CF_RAFT, &keys::apply_state_key(self.region.get_id()), - &self.apply_state, + self.apply_state(), )?; Ok(()) } @@ -1354,68 +657,11 @@ where self.gen_snap_task.get_mut().take() } - // Append the given entries to the raft log using previous last index or self.last_index. - pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { - if entries.is_empty() { - return; - } - let region_id = self.get_region_id(); - debug!( - "append entries"; - "region_id" => region_id, - "peer_id" => self.peer_id, - "count" => entries.len(), - ); - let prev_last_index = self.raft_state.get_last_index(); - - let (last_index, last_term) = { - let e = entries.last().unwrap(); - (e.get_index(), e.get_term()) - }; - - self.cache.append(&self.tag, &entries); - - task.entries = entries; - // Delete any previously appended log entries which never committed. 
- task.cut_logs = Some((last_index + 1, prev_last_index + 1)); - - self.raft_state.set_last_index(last_index); - self.last_term = last_term; - } - pub fn on_compact_raftlog(&mut self, idx: u64) { - self.compact_entry_cache(idx); + self.entry_storage.compact_entry_cache(idx); self.cancel_generating_snap(Some(idx)); } - pub fn compact_entry_cache(&mut self, idx: u64) { - self.cache.compact_to(idx); - } - - #[inline] - pub fn is_entry_cache_empty(&self) -> bool { - self.cache.is_empty() - } - - /// Evict entries from the cache. - pub fn evict_entry_cache(&mut self, half: bool) { - if !self.is_entry_cache_empty() { - let cache = &mut self.cache; - let cache_len = cache.cache.len(); - let drain_to = if half { cache_len / 2 } else { cache_len - 1 }; - let idx = cache.cache[drain_to].index; - let mem_size_change = cache.compact_to(idx + 1); - RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); - } - } - - #[inline] - pub fn flush_entry_cache_metrics(&mut self) { - // NOTE: memory usage of entry cache is flushed realtime. - self.cache.flush_stats(); - self.raftlog_fetch_stats.flush_stats(); - } - // Apply the peer with given snapshot. pub fn apply_snapshot( &mut self, @@ -1454,7 +700,7 @@ where if self.is_initialized() { // we can only delete the old data when the peer is initialized. - let first_index = self.first_index(); + let first_index = self.entry_storage.first_index(); // It's possible that logs between `last_compacted_idx` and `first_index` are // being deleted in raftlog_gc worker. But it's OK as: // 1. 
If the peer accepts a new snapshot, it must start with an index larger than @@ -1475,15 +721,18 @@ where let last_index = snap.get_metadata().get_index(); - self.raft_state.set_last_index(last_index); - self.last_term = snap.get_metadata().get_term(); - self.apply_state.set_applied_index(last_index); - self.applied_index_term = self.last_term; + self.raft_state_mut().set_last_index(last_index); + self.set_last_term(snap.get_metadata().get_term()); + self.apply_state_mut().set_applied_index(last_index); + let last_term = self.last_term(); + self.set_applied_term(last_term); // The snapshot only contains log which index > applied index, so // here the truncate state's (index, term) is in snapshot metadata. - self.apply_state.mut_truncated_state().set_index(last_index); - self.apply_state + self.apply_state_mut() + .mut_truncated_state() + .set_index(last_index); + self.apply_state_mut() .mut_truncated_state() .set_term(snap.get_metadata().get_term()); @@ -1502,7 +751,7 @@ where "region_id" => self.region.get_id(), "peer_id" => self.peer_id, "region" => ?region, - "state" => ?self.apply_state, + "state" => ?self.apply_state(), ); Ok(region) @@ -1522,9 +771,9 @@ where raft_wb, region_id, first_index, - &self.raft_state, + self.raft_state(), )?; - self.cache = EntryCache::default(); + self.entry_storage.clear(); Ok(()) } @@ -1575,8 +824,8 @@ where Ok(()) } - pub fn get_raft_engine(&self) -> ER { - self.engines.raft.clone() + pub fn raft_engine(&self) -> &ER { + self.entry_storage.raft_engine() } /// Check whether the storage has finished applying snapshot. 
@@ -1721,14 +970,14 @@ where destroy_regions: Vec, ) -> Result<(HandleReadyResult, WriteTask)> { let region_id = self.get_region_id(); - let prev_raft_state = self.raft_state.clone(); + let prev_raft_state = self.raft_state().clone(); let mut write_task = WriteTask::new(region_id, self.peer_id, ready.number()); let mut res = HandleReadyResult::SendIOTask; if !ready.snapshot().is_empty() { fail_point!("raft_before_apply_snap"); - let last_first_index = self.first_index(); + let last_first_index = self.first_index().unwrap(); let snap_region = self.apply_snapshot(ready.snapshot(), &mut write_task, &destroy_regions)?; @@ -1747,15 +996,15 @@ where // Last index is 0 means the peer is created from raft message // and has not applied snapshot yet, so skip persistent hard state. - if self.raft_state.get_last_index() > 0 { + if self.raft_state().get_last_index() > 0 { if let Some(hs) = ready.hs() { - self.raft_state.set_hard_state(hs.clone()); + self.raft_state_mut().set_hard_state(hs.clone()); } } // Save raft state if it has changed or there is a snapshot. - if prev_raft_state != self.raft_state || !ready.snapshot().is_empty() { - write_task.raft_state = Some(self.raft_state.clone()); + if prev_raft_state != *self.raft_state() || !ready.snapshot().is_empty() { + write_task.raft_state = Some(self.raft_state().clone()); } if !ready.snapshot().is_empty() { @@ -1777,10 +1026,6 @@ where Ok((res, write_task)) } - pub fn update_cache_persisted(&mut self, persisted: u64) { - self.cache.update_persisted(persisted); - } - pub fn persist_snapshot(&mut self, res: &PersistSnapshotResult) { // cleanup data before scheduling apply task if self.is_initialized() { @@ -1821,10 +1066,6 @@ where // See comments in `apply_snapshot` for more details. self.set_region(res.region.clone()); } - - pub fn trace_cached_entries(&mut self, entries: CachedEntries) { - self.cache.trace_cached_entries(entries); - } } /// Delete all meta belong to the region. Results are stored in `wb`. 
@@ -1865,7 +1106,7 @@ pub fn do_snapshot( engine: &E, kv_snap: E::Snapshot, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, for_balance: bool, allow_multi_files_snapshot: bool, @@ -1894,7 +1135,7 @@ where let key = SnapKey::new( region_id, - last_applied_index_term, + last_applied_term, apply_state.get_applied_index(), ); @@ -1999,34 +1240,8 @@ pub fn write_peer_state( Ok(()) } -/// Committed entries sent to apply threads. -#[derive(Clone)] -pub struct CachedEntries { - pub range: Range, - // Entries and dangle size for them. `dangle` means not in entry cache. - entries: Arc, usize)>>, -} - -impl CachedEntries { - pub fn new(entries: Vec) -> Self { - assert!(!entries.is_empty()); - let start = entries.first().map(|x| x.index).unwrap(); - let end = entries.last().map(|x| x.index).unwrap() + 1; - let range = Range { start, end }; - CachedEntries { - entries: Arc::new(Mutex::new((entries, 0))), - range, - } - } - - /// Take cached entries and dangle size for them. `dangle` means not in entry cache. 
- pub fn take_entries(&self) -> (Vec, usize) { - mem::take(&mut *self.entries.lock().unwrap()) - } -} - #[cfg(test)] -mod tests { +pub mod tests { use std::{ cell::RefCell, path::Path, @@ -2058,27 +1273,13 @@ mod tests { store::{ async_io::write::write_to_db_for_test, bootstrap_store, + entry_storage::tests::validate_cache, fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, worker::{RaftlogFetchRunner, RegionRunner, RegionTask}, }, }; - impl EntryCache { - fn new_with_cb(cb: impl Fn(i64) + Send + 'static) -> Self { - let entry_cache = EntryCache { - persisted: 0, - cache: Default::default(), - trace: Default::default(), - hit: Cell::new(0), - miss: Cell::new(0), - size_change_cb: Some(Box::new(cb) as Box), - }; - entry_cache.flush_mem_size_change(entry_cache.get_total_mem_size()); - entry_cache - } - } - fn new_storage( region_scheduler: Scheduler>, raftlog_fetch_scheduler: Scheduler, @@ -2113,7 +1314,7 @@ mod tests { .unwrap() } - fn new_storage_from_ents( + pub fn new_storage_from_ents( region_scheduler: Scheduler>, raftlog_fetch_scheduler: Scheduler, path: &TempDir, @@ -2124,15 +1325,15 @@ mod tests { store.append(ents[1..].to_vec(), &mut write_task); store.update_cache_persisted(ents.last().unwrap().get_index()); store - .apply_state + .apply_state_mut() .mut_truncated_state() .set_index(ents[0].get_index()); store - .apply_state + .apply_state_mut() .mut_truncated_state() .set_term(ents[0].get_term()); store - .apply_state + .apply_state_mut() .set_applied_index(ents.last().unwrap().get_index()); if write_task.kv_wb.is_none() { write_task.kv_wb = Some(store.engines.kv.write_batch()); @@ -2140,35 +1341,22 @@ mod tests { store .save_apply_state_to(write_task.kv_wb.as_mut().unwrap()) .unwrap(); - write_task.raft_state = Some(store.raft_state.clone()); + write_task.raft_state = Some(store.raft_state().clone()); write_to_db_for_test(&store.engines, write_task); store } - fn append_ents(store: &mut PeerStorage, ents: &[Entry]) { + pub fn 
append_ents(store: &mut PeerStorage, ents: &[Entry]) { if ents.is_empty() { return; } let mut write_task = WriteTask::new(store.get_region_id(), store.peer_id, 1); store.append(ents.to_vec(), &mut write_task); - write_task.raft_state = Some(store.raft_state.clone()); + write_task.raft_state = Some(store.raft_state().clone()); write_to_db_for_test(&store.engines, write_task); } - fn validate_cache(store: &PeerStorage, exp_ents: &[Entry]) { - assert_eq!(store.cache.cache, exp_ents); - for e in exp_ents { - let entry = store - .engines - .raft - .get_entry(store.get_region_id(), e.get_index()) - .unwrap() - .unwrap(); - assert_eq!(entry, *e); - } - } - - fn new_entry(index: u64, term: u64) -> Entry { + pub fn new_entry(index: u64, term: u64) -> Entry { let mut e = Entry::default(); e.set_index(index); e.set_term(term); @@ -2442,257 +1630,6 @@ mod tests { assert_ne!(count, 0); } - #[test] - fn test_async_fetch() { - let ents = vec![ - new_entry(2, 2), - new_entry(3, 3), - new_entry(4, 4), - new_entry(5, 5), - new_entry(6, 6), - ]; - - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); - let region_scheduler = region_worker.scheduler(); - let (dummy_scheduler, _rx) = dummy_scheduler(); - let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); - - let max_u64 = u64::max_value(); - let mut tests = vec![ - // already compacted - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Err(RaftError::Store(StorageError::Compacted)), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::Compacted)), - vec![], - ), - // fetch partial entries due to max size limit - ( - 3, - 7, - 30, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: 30, - hit_size_limit: true, - tried_cnt: 1, - term: 1, - }, - Ok(3), - ents[1..4].to_vec(), - ), - // fetch all 
entries - ( - 2, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(5), - ents.clone(), - ), - // high is smaller than before - ( - 3, - 5, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(2), - ents[1..3].to_vec(), - ), - // high is larger than before, second try - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // high is larger than before, thrid try - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 2, - term: 1, - }, - Ok(4), - ents[1..].to_vec(), - ), - // max size is smaller than before - ( - 2, - 7, - 10, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(2), - ents[..2].to_vec(), - ), - // max size is larger than before but with lower high - ( - 2, - 5, - 40, - 1, - RaftlogFetchResult { - ents: Ok(ents.clone()), - low: 2, - max_size: 30, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Ok(3), - ents[..3].to_vec(), - ), - // low index is smaller than before - ( - 2, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Err(RaftError::Store(StorageError::Compacted)), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // low index is larger than before - ( - 4, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(vec![]), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: 1, - term: 1, - }, - 
Err(RaftError::Store(StorageError::LogTemporarilyUnavailable)), - vec![], - ), - // hit tried several lmit - ( - 3, - 7, - max_u64, - 1, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, - term: 1, - }, - Ok(4), - ents[1..5].to_vec(), - ), - // term is changed - ( - 3, - 7, - max_u64, - 2, - RaftlogFetchResult { - ents: Ok(ents[1..4].to_vec()), - low: 3, - max_size: max_u64, - hit_size_limit: false, - tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, - term: 1, - }, - Ok(4), - ents[1..5].to_vec(), - ), - ]; - - for (i, (lo, hi, maxsize, term, async_res, expected_res, expected_ents)) in - tests.drain(..).enumerate() - { - if async_res.low != lo { - store.clean_async_fetch_res(lo); - } else { - store.update_async_fetch_res(lo, Some(Box::new(async_res))); - } - let mut ents = vec![]; - store.raft_state.mut_hard_state().set_term(term); - let res = store.async_fetch( - store.get_region_id(), - lo, - hi, - maxsize, - GetEntriesContext::empty(true), - &mut ents, - ); - if res != expected_res { - panic!("#{}: expect result {:?}, got {:?}", i, expected_res, res); - } - if ents != expected_ents { - panic!("#{}: expect ents {:?}, got {:?}", i, expected_ents, ents); - } - } - } - // last_index and first_index are not mutated by PeerStorage on its own, // so we don't test them here. @@ -2711,10 +1648,9 @@ mod tests { let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - let res = store - .term(idx) - .map_err(From::from) - .and_then(|term| compact_raft_log(&store.tag, &mut store.apply_state, idx, term)); + let res = store.term(idx).map_err(From::from).and_then(|term| { + compact_raft_log(&store.tag, store.entry_storage.apply_state_mut(), idx, term) + }); // TODO check exact error type after refactoring error. 
if res.is_err() ^ werr.is_err() { panic!("#{}: want {:?}, got {:?}", i, werr, res); @@ -2834,10 +1770,10 @@ mod tests { let mut hs = HardState::default(); hs.set_commit(7); hs.set_term(5); - s.raft_state.set_hard_state(hs); - s.raft_state.set_last_index(7); - s.apply_state.set_applied_index(7); - write_task.raft_state = Some(s.raft_state.clone()); + s.raft_state_mut().set_hard_state(hs); + s.raft_state_mut().set_last_index(7); + s.apply_state_mut().set_applied_index(7); + write_task.raft_state = Some(s.raft_state().clone()); if write_task.kv_wb.is_none() { write_task.kv_wb = Some(s.engines.kv.write_batch()); } @@ -2845,7 +1781,7 @@ mod tests { .unwrap(); write_to_db_for_test(&s.engines, write_task); let term = s.term(7).unwrap(); - compact_raft_log(&s.tag, &mut s.apply_state, 7, term).unwrap(); + compact_raft_log(&s.tag, s.entry_storage.apply_state_mut(), 7, term).unwrap(); let mut kv_wb = s.engines.kv.write_batch(); s.save_apply_state_to(&mut kv_wb).unwrap(); kv_wb.write().unwrap(); @@ -2970,246 +1906,6 @@ mod tests { test_storage_create_snapshot_for_role("tikv", 5); } - #[test] - fn test_storage_append() { - let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; - let mut tests = vec![ - ( - vec![new_entry(4, 6), new_entry(5, 6)], - vec![new_entry(4, 6), new_entry(5, 6)], - ), - ( - vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], - vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], - ), - // truncate the existing entries and append - (vec![new_entry(4, 5)], vec![new_entry(4, 5)]), - // direct append - ( - vec![new_entry(6, 5)], - vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 5)], - ), - ]; - for (i, (entries, wentries)) in tests.drain(..).enumerate() { - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); - let sched = worker.scheduler(); - let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - 
append_ents(&mut store, &entries); - let li = store.last_index(); - let actual_entries = store - .entries(4, li + 1, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - if actual_entries != wentries { - panic!("#{}: want {:?}, got {:?}", i, wentries, actual_entries); - } - } - } - - #[test] - fn test_storage_cache_fetch() { - let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); - let sched = worker.scheduler(); - let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - store.cache.cache.clear(); - // empty cache should fetch data from rocksdb directly. - let mut res = store - .entries(4, 6, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(*res, ents[1..]); - - let entries = vec![new_entry(6, 5), new_entry(7, 5)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // direct cache access - res = store - .entries(6, 8, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, entries); - - // size limit should be supported correctly. - res = store - .entries(4, 8, 0, GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, vec![new_entry(4, 4)]); - let mut size = ents[1..].iter().map(|e| u64::from(e.compute_size())).sum(); - res = store - .entries(4, 8, size, GetEntriesContext::empty(false)) - .unwrap(); - let mut exp_res = ents[1..].to_vec(); - assert_eq!(res, exp_res); - for e in &entries { - size += u64::from(e.compute_size()); - exp_res.push(e.clone()); - res = store - .entries(4, 8, size, GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(res, exp_res); - } - - // range limit should be supported correctly. 
- for low in 4..9 { - for high in low..9 { - let res = store - .entries(low, high, u64::max_value(), GetEntriesContext::empty(false)) - .unwrap(); - assert_eq!(*res, exp_res[low as usize - 4..high as usize - 4]); - } - } - } - - #[test] - fn test_storage_cache_update() { - let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; - let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let worker = LazyWorker::new("snap-manager"); - let sched = worker.scheduler(); - let (dummy_scheduler, _) = dummy_scheduler(); - let mut store = new_storage_from_ents(sched, dummy_scheduler, &td, &ents); - store.cache.cache.clear(); - - // initial cache - let mut entries = vec![new_entry(6, 5), new_entry(7, 5)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // rewrite - entries = vec![new_entry(6, 6), new_entry(7, 6)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // rewrite old entry - entries = vec![new_entry(5, 6), new_entry(6, 6)]; - append_ents(&mut store, &entries); - validate_cache(&store, &entries); - - // partial rewrite - entries = vec![new_entry(6, 7), new_entry(7, 7)]; - append_ents(&mut store, &entries); - let mut exp_res = vec![new_entry(5, 6), new_entry(6, 7), new_entry(7, 7)]; - validate_cache(&store, &exp_res); - - // direct append - entries = vec![new_entry(8, 7), new_entry(9, 7)]; - append_ents(&mut store, &entries); - exp_res.extend_from_slice(&entries); - validate_cache(&store, &exp_res); - - // rewrite middle - entries = vec![new_entry(7, 8)]; - append_ents(&mut store, &entries); - exp_res.truncate(2); - exp_res.push(new_entry(7, 8)); - validate_cache(&store, &exp_res); - - // compact to min(5 + 1, 7) - store.cache.persisted = 5; - store.compact_entry_cache(7); - exp_res = vec![new_entry(6, 7), new_entry(7, 8)]; - validate_cache(&store, &exp_res); - - // compact to min(7 + 1, 7) - store.cache.persisted = 7; - store.compact_entry_cache(7); - exp_res = vec![new_entry(7, 8)]; 
- validate_cache(&store, &exp_res); - // compact all - store.compact_entry_cache(8); - validate_cache(&store, &[]); - // invalid compaction should be ignored. - store.compact_entry_cache(6); - } - - #[test] - fn test_storage_cache_size_change() { - let new_padded_entry = |index: u64, term: u64, pad_len: usize| { - let mut e = new_entry(index, term); - e.data = vec![b'x'; pad_len].into(); - e - }; - - // Test the initial data structure size. - let (tx, rx) = mpsc::sync_channel(8); - let mut cache = EntryCache::new_with_cb(move |c: i64| tx.send(c).unwrap()); - assert_eq!(rx.try_recv().unwrap(), 896); - - cache.append( - "", - &[new_padded_entry(101, 1, 1), new_padded_entry(102, 1, 2)], - ); - assert_eq!(rx.try_recv().unwrap(), 3); - - // Test size change for one overlapped entry. - cache.append("", &[new_padded_entry(102, 2, 3)]); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test size change for all overlapped entries. - cache.append( - "", - &[new_padded_entry(101, 3, 4), new_padded_entry(102, 3, 5)], - ); - assert_eq!(rx.try_recv().unwrap(), 5); - - cache.append("", &[new_padded_entry(103, 3, 6)]); - assert_eq!(rx.try_recv().unwrap(), 6); - - // Test trace a dangle entry. - let cached_entries = CachedEntries::new(vec![new_padded_entry(100, 1, 1)]); - cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test trace an entry which is still in cache. - let cached_entries = CachedEntries::new(vec![new_padded_entry(102, 3, 5)]); - cache.trace_cached_entries(cached_entries); - assert_eq!(rx.try_recv().unwrap(), 0); - - // Test compare `cached_last` with `trunc_to_idx` in `EntryCache::append_impl`. - cache.append("", &[new_padded_entry(103, 4, 7)]); - assert_eq!(rx.try_recv().unwrap(), 1); - - // Test compact one traced dangle entry and one entry in cache. - cache.persisted = 101; - cache.compact_to(102); - assert_eq!(rx.try_recv().unwrap(), -5); - - // Test compact the last traced dangle entry. 
- cache.persisted = 102; - cache.compact_to(103); - assert_eq!(rx.try_recv().unwrap(), -5); - - // Test compact all entries. - cache.persisted = 103; - cache.compact_to(104); - assert_eq!(rx.try_recv().unwrap(), -7); - - drop(cache); - assert_eq!(rx.try_recv().unwrap(), -896); - } - - #[test] - fn test_storage_cache_entry() { - let mut cache = EntryCache::default(); - let ents = vec![ - new_entry(3, 3), - new_entry(4, 4), - new_entry(5, 4), - new_entry(6, 6), - ]; - cache.append("", &ents); - assert!(cache.entry(1).is_none()); - assert!(cache.entry(2).is_none()); - for e in &ents { - assert_eq!(e, cache.entry(e.get_index()).unwrap()); - } - let res = panic_hook::recover_safe(|| cache.entry(7)); - assert!(res.is_err()); - } - #[test] fn test_storage_apply_snapshot() { let ents = vec![ @@ -3256,18 +1952,18 @@ mod tests { let td2 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let mut s2 = new_storage(sched.clone(), dummy_scheduler.clone(), &td2); - assert_eq!(s2.first_index(), s2.applied_index() + 1); + assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); let mut write_task = WriteTask::new(s2.get_region_id(), s2.peer_id, 1); let snap_region = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); - assert_eq!(s2.last_term, snap1.get_metadata().get_term()); - assert_eq!(s2.apply_state.get_applied_index(), 6); - assert_eq!(s2.raft_state.get_last_index(), 6); - assert_eq!(s2.apply_state.get_truncated_state().get_index(), 6); - assert_eq!(s2.apply_state.get_truncated_state().get_term(), 6); - assert_eq!(s2.first_index(), s2.applied_index() + 1); + assert_eq!(s2.last_term(), snap1.get_metadata().get_term()); + assert_eq!(s2.apply_state().get_applied_index(), 6); + assert_eq!(s2.raft_state().get_last_index(), 6); + assert_eq!(s2.apply_state().get_truncated_state().get_index(), 6); + 
assert_eq!(s2.apply_state().get_truncated_state().get_term(), 6); + assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); validate_cache(&s2, &[]); let td3 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); @@ -3279,11 +1975,11 @@ mod tests { let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); - assert_eq!(s3.last_term, snap1.get_metadata().get_term()); - assert_eq!(s3.apply_state.get_applied_index(), 6); - assert_eq!(s3.raft_state.get_last_index(), 6); - assert_eq!(s3.apply_state.get_truncated_state().get_index(), 6); - assert_eq!(s3.apply_state.get_truncated_state().get_term(), 6); + assert_eq!(s3.last_term(), snap1.get_metadata().get_term()); + assert_eq!(s3.apply_state().get_applied_index(), 6); + assert_eq!(s3.raft_state().get_last_index(), 6); + assert_eq!(s3.apply_state().get_truncated_state().get_index(), 6); + assert_eq!(s3.apply_state().get_truncated_state().get_term(), 6); validate_cache(&s3, &[]); } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a506ab80f17..81358c989e0 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -149,7 +149,7 @@ pub struct ReadDelegate { pub region: Arc, pub peer_id: u64, pub term: u64, - pub applied_index_term: u64, + pub applied_term: u64, pub leader_lease: Option, pub last_valid_ts: Timespec, @@ -230,7 +230,7 @@ impl ReadDelegate { region: Arc::new(region), peer_id, term: peer.term(), - applied_index_term: peer.get_store().applied_index_term(), + applied_term: peer.get_store().applied_term(), leader_lease: None, last_valid_ts: Timespec::new(0, 0), tag: format!("[region {}] {}", region_id, peer_id), @@ -262,8 +262,8 @@ impl ReadDelegate { Progress::Term(term) => { self.term = term; } - Progress::AppliedIndexTerm(applied_index_term) => { - self.applied_index_term = applied_index_term; 
+ Progress::AppliedTerm(applied_term) => { + self.applied_term = applied_term; } Progress::LeaderLease(leader_lease) => { self.leader_lease = Some(leader_lease); @@ -358,7 +358,7 @@ impl ReadDelegate { region: Arc::new(region), peer_id: 1, term: 1, - applied_index_term: 1, + applied_term: 1, leader_lease: None, last_valid_ts: Timespec::new(0, 0), tag: format!("[region {}] {}", region_id, 1), @@ -377,11 +377,11 @@ impl Display for ReadDelegate { write!( f, "ReadDelegate for region {}, \ - leader {} at term {}, applied_index_term {}, has lease {}", + leader {} at term {}, applied_term {}, has lease {}", self.region.get_id(), self.peer_id, self.term, - self.applied_index_term, + self.applied_term, self.leader_lease.is_some(), ) } @@ -391,7 +391,7 @@ impl Display for ReadDelegate { pub enum Progress { Region(metapb::Region), Term(u64), - AppliedIndexTerm(u64), + AppliedTerm(u64), LeaderLease(RemoteLease), RegionBuckets(Arc), } @@ -405,8 +405,8 @@ impl Progress { Progress::Term(term) } - pub fn applied_index_term(applied_index_term: u64) -> Progress { - Progress::AppliedIndexTerm(applied_index_term) + pub fn applied_term(applied_term: u64) -> Progress { + Progress::AppliedTerm(applied_term) } pub fn leader_lease(lease: RemoteLease) -> Progress { @@ -752,13 +752,13 @@ struct Inspector<'r, 'm> { impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { fn has_applied_to_current_term(&mut self) -> bool { - if self.delegate.applied_index_term == self.delegate.term { + if self.delegate.applied_term == self.delegate.term { true } else { debug!( "rejected by term check"; "tag" => &self.delegate.tag, - "applied_index_term" => self.delegate.applied_index_term, + "applied_term" => self.delegate.applied_term, "delegate_term" => ?self.delegate.term, ); @@ -1078,7 +1078,7 @@ mod tests { // Register region 1 lease.renew(monotonic_raw_now()); let remote = lease.maybe_new_remote_lease(term6).unwrap(); - // But the applied_index_term is stale. + // But the applied_term is stale. 
{ let mut meta = store_meta.lock().unwrap(); let read_delegate = ReadDelegate { @@ -1086,7 +1086,7 @@ mod tests { region: Arc::new(region1.clone()), peer_id: leader2.get_id(), term: term6, - applied_index_term: term6 - 1, + applied_term: term6 - 1, leader_lease: Some(remote), last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), @@ -1099,13 +1099,13 @@ mod tests { meta.readers.insert(1, read_delegate); } - // The applied_index_term is stale + // The applied_term is stale must_redirect(&mut reader, &rx, cmd.clone()); assert_eq!(reader.metrics.rejected_by_cache_miss, 2); assert_eq!(reader.metrics.rejected_by_applied_term, 1); - // Make the applied_index_term matches current term. - let pg = Progress::applied_index_term(term6); + // Make the applied_term matches current term. + let pg = Progress::applied_term(term6); { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); @@ -1236,7 +1236,7 @@ mod tests { meta.readers .get_mut(&1) .unwrap() - .update(Progress::applied_index_term(term6 + 3)); + .update(Progress::applied_term(term6 + 3)); } reader.propose_raft_command( None, @@ -1329,7 +1329,7 @@ mod tests { region: Arc::new(region.clone()), peer_id: 1, term: 1, - applied_index_term: 1, + applied_term: 1, leader_lease: None, last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), @@ -1345,7 +1345,7 @@ mod tests { let d = reader.get_delegate(1).unwrap(); assert_eq!(&*d.region, ®ion); assert_eq!(d.term, 1); - assert_eq!(d.applied_index_term, 1); + assert_eq!(d.applied_term, 1); assert!(d.leader_lease.is_none()); drop(d); @@ -1370,9 +1370,9 @@ mod tests { meta.readers .get_mut(&1) .unwrap() - .update(Progress::applied_index_term(2)); + .update(Progress::applied_term(2)); } - assert_eq!(reader.get_delegate(1).unwrap().applied_index_term, 2); + assert_eq!(reader.get_delegate(1).unwrap().applied_term, 2); { let mut lease = 
Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 0ac92103129..4bc5cc032a3 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -72,7 +72,7 @@ const ENGINE: &str = "engine"; pub enum Task { Gen { region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: S, canceled: Arc, @@ -262,7 +262,7 @@ where fn generate_snap( &self, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: EK::Snapshot, notifier: SyncSender, @@ -275,7 +275,7 @@ where &self.engine, kv_snap, region_id, - last_applied_index_term, + last_applied_term, last_applied_state, for_balance, allow_multi_files_snapshot, @@ -301,7 +301,7 @@ where fn handle_gen( &self, region_id: u64, - last_applied_index_term: u64, + last_applied_term: u64, last_applied_state: RaftApplyState, kv_snap: EK::Snapshot, canceled: Arc, @@ -325,7 +325,7 @@ where if let Err(e) = self.generate_snap( region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, notifier, @@ -703,7 +703,7 @@ where match task { Task::Gen { region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, canceled, @@ -742,7 +742,7 @@ where tikv_alloc::add_thread_memory_accessor(); ctx.handle_gen( region_id, - last_applied_index_term, + last_applied_term, last_applied_state, kv_snap, canceled, @@ -1055,7 +1055,7 @@ mod tests { .schedule(Task::Gen { region_id: id, kv_snap: engine.kv.snapshot(), - last_applied_index_term: entry.get_term(), + last_applied_term: entry.get_term(), last_applied_state: apply_state, canceled: Arc::new(AtomicBool::new(false)), notifier: tx, From 0dc72407e87bf9861991b05d72391bc8fd149871 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 27 Jul 2022 
14:43:11 +0800 Subject: [PATCH 115/676] raftstorev2: fix cached tablet bug (#13127) ref tikv/tikv#12842 None Signed-off-by: tabokie Co-authored-by: Ti Chi Robot --- Cargo.lock | 16 +- components/raftstore-v2/src/batch/apply.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 16 +- components/raftstore-v2/src/bootstrap.rs | 169 +++++++++++--------- components/raftstore-v2/src/fsm/apply.rs | 3 +- components/raftstore-v2/src/fsm/peer.rs | 3 +- components/raftstore-v2/src/fsm/store.rs | 3 +- components/raftstore-v2/src/raft/peer.rs | 2 +- components/raftstore-v2/src/raft/storage.rs | 4 +- components/raftstore-v2/src/tablet.rs | 23 ++- components/tikv_util/src/lib.rs | 1 + 11 files changed, 137 insertions(+), 107 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb4e4d1e6a9..87dc15eb69a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,7 +84,7 @@ dependencies = [ "codec", "engine_traits", "kvproto", - "match_template", + "match-template", "panic_hook", "thiserror", "tikv_alloc", @@ -2852,8 +2852,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e88c3cbe8288f77f293e48a28b3232e3defd203a6d839fa7f68ea4329e83464" [[package]] -name = "match_template" +name = "match-template" version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c334ac67725febd94c067736ac46ef1c7cacf1c743ca14b9f917c2df2c20acd8" dependencies = [ "proc-macro2", "quote", @@ -5827,7 +5829,7 @@ dependencies = [ name = "tidb_query_aggr" version = "0.0.1" dependencies = [ - "match_template", + "match-template", "panic_hook", "tidb_query_codegen", "tidb_query_common", @@ -5886,7 +5888,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", - "match_template", + "match-template", "nom 5.1.0", "num 0.3.0", "num-derive", @@ -5918,7 +5920,7 @@ dependencies = [ "itertools", "kvproto", "log_wrappers", - "match_template", + "match-template", "protobuf", "slog", "slog-global", @@ -5947,7 +5949,7 @@ dependencies = [ "flate2", "hex 
0.4.2", "log_wrappers", - "match_template", + "match-template", "num 0.3.0", "num-traits", "openssl", @@ -6024,7 +6026,7 @@ dependencies = [ "libloading", "log", "log_wrappers", - "match_template", + "match-template", "memory_trace_macros", "mime", "more-asserts", diff --git a/components/raftstore-v2/src/batch/apply.rs b/components/raftstore-v2/src/batch/apply.rs index ab44d435e67..f71c98e5c86 100644 --- a/components/raftstore-v2/src/batch/apply.rs +++ b/components/raftstore-v2/src/batch/apply.rs @@ -50,7 +50,7 @@ pub struct ApplyPoller { impl ApplyPoller { pub fn new(apply_ctx: ApplyContext, cfg_tracker: Tracker) -> ApplyPoller { - ApplyPoller { + Self { apply_task_buf: Vec::new(), pending_latency_inspect: Vec::new(), apply_ctx, @@ -58,7 +58,7 @@ impl ApplyPoller { } } - /// Updates the internal buffer to latest capacity. + /// Updates the internal buffer to match the latest configuration. fn apply_buf_capacity(&mut self) { let new_cap = self.messages_per_tick(); tikv_util::set_vec_capacity(&mut self.apply_task_buf, new_cap); diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 2dce4b54c2a..1d84ba47302 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -26,7 +26,7 @@ use crate::{ Error, PeerMsg, PeerTick, Result, StoreMsg, }; -/// A per thread context used for handling raft messages. +/// A per-thread context used for handling raft messages. pub struct StoreContext { /// A logger without any KV. It's clean for creating new PeerFSM. pub logger: Logger, @@ -64,19 +64,17 @@ struct StorePoller { impl StorePoller { pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { - let mut poller = Self { + Self { store_msg_buf: Vec::new(), peer_msg_buf: Vec::new(), poll_ctx, cfg_tracker, last_flush_time: TiInstant::now(), need_flush_events: false, - }; - poller.apply_buf_capacity(); - poller + } } - /// Updates the internal buffer to latest capacity. 
+ /// Updates the internal buffer to match the latest configuration. fn apply_buf_capacity(&mut self) { let new_cap = self.messages_per_tick(); tikv_util::set_vec_capacity(&mut self.store_msg_buf, new_cap); @@ -119,6 +117,7 @@ impl PollHandler Option { + debug_assert!(self.store_msg_buf.is_empty()); let received_cnt = store.recv(&mut self.store_msg_buf); let expected_msg_count = if received_cnt == self.messages_per_tick() { None @@ -134,6 +133,7 @@ impl PollHandler>, ) -> HandleResult { + debug_assert!(self.peer_msg_buf.is_empty()); let received_cnt = peer.recv(&mut self.peer_msg_buf); let handle_result = if received_cnt == self.messages_per_tick() { HandleResult::KeepProcessing @@ -203,7 +203,7 @@ impl StorePollerBuilder { } } - /// Init all the existing raft machine and cleanup stale tablets. + /// Initializes all the existing raft machines and cleanup stale tablets. fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); @@ -328,7 +328,7 @@ impl StoreSystem { pub type StoreRouter = BatchRouter, StoreFsm>; -/// Create the batch system for polling raft activities. +/// Creates the batch system for polling raft activities. pub fn create_store_batch_system( cfg: &Config, store: Store, diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs index 55e1f6814c5..c3e2d2de6f7 100644 --- a/components/raftstore-v2/src/bootstrap.rs +++ b/components/raftstore-v2/src/bootstrap.rs @@ -22,9 +22,16 @@ const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs( /// A struct for bootstrapping the store. /// -/// A typical bootstrap process should follow following order: -/// 1. bootstrap the store to get a store ID. -/// 2. bootstrap the first region using the last store ID. +/// A typical bootstrap process should take the following steps: +/// +/// 1. Calls `bootstrap_store` to bootstrap the store. +/// 2. 
Calls `bootstrap_first_region` to bootstrap the first region using store +/// ID returned from last step. +/// +/// # Safety +/// +/// These steps are re-entrant, i.e. the caller can redo any steps whether or +/// not they fail or succeed. pub struct Bootstrap<'a, ER: RaftEngine> { engine: &'a ER, cluster_id: u64, @@ -33,8 +40,8 @@ pub struct Bootstrap<'a, ER: RaftEngine> { logger: Logger, } -// Although all methods won't change internal state, but they still receive `&mut self` as it's -// not thread safe to bootstrap concurrently. +// Although all methods won't change internal state, but they still receive +// `&mut self` as it's not thread safe to bootstrap concurrently. impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { pub fn new( engine: &'a ER, @@ -50,9 +57,9 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { } } - /// check store, return store id for the engine. - /// If the store is not bootstrapped, use None. - fn check_store(&mut self) -> Result> { + /// Gets and validates the store ID from engine if it's already + /// bootstrapped. + fn check_store_id_in_engine(&mut self) -> Result> { let ident = match self.engine.get_store_ident()? { Some(ident) => ident, None => return Ok(None), @@ -60,7 +67,8 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { if ident.get_cluster_id() != self.cluster_id { return Err(box_err!( "cluster ID mismatch, local {} != remote {}, \ - you are trying to connect to another cluster, please reconnect to the correct PD", + you are trying to connect to another cluster, \ + please reconnect to the correct PD", ident.get_cluster_id(), self.cluster_id )); @@ -71,13 +79,22 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { Ok(Some(ident.get_store_id())) } - fn inner_bootstrap_store(&mut self) -> Result { + /// Bootstraps the store and returns the store ID. + /// + /// The bootstrapping basically allocates a new store ID from PD and writes + /// it to engine with sync=true. + /// + /// If the store is already bootstrapped, return the store ID directly. 
+ pub fn bootstrap_store(&mut self) -> Result { + if let Some(id) = self.check_store_id_in_engine()? { + return Ok(id); + } + if !self.engine.is_empty()? { + return Err(box_err!("store is not empty and has already had data")); + } let id = self.pd_client.alloc_id()?; debug!(self.logger, "alloc store id"; "store_id" => id); let mut ident = StoreIdent::default(); - if !self.engine.is_empty()? { - return Err(box_err!("store is not empty and has already had data.")); - } ident.set_cluster_id(self.cluster_id); ident.set_store_id(id); self.engine.put_store_ident(&ident)?; @@ -88,18 +105,6 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { Ok(id) } - /// Bootstrap the store and return the store ID. - /// - /// If store is bootstrapped already, return the store ID directly. - pub fn bootstrap_store(&mut self) -> Result { - let store_id = match self.check_store()? { - Some(id) => id, - None => self.inner_bootstrap_store()?, - }; - - Ok(store_id) - } - fn prepare_bootstrap_first_region(&mut self, store_id: u64) -> Result { let region_id = self.pd_client.alloc_id()?; debug!( @@ -127,7 +132,7 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { Ok(region) } - fn check_first_region_bootstrapped(&mut self) -> Result { + fn check_pd_first_region_bootstrapped(&mut self) -> Result { for _ in 0..MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { match self.pd_client.is_cluster_bootstrapped() { Ok(b) => return Ok(b), @@ -140,21 +145,6 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { Err(box_err!("check cluster bootstrapped failed")) } - fn check_or_prepare_bootstrap_first_region(&mut self, store_id: u64) -> Result> { - if let Some(first_region) = self.engine.get_prepare_bootstrap_region()? { - // Bootstrap is aborted last time, resume. It may succeed or fail last time, no matter - // what, at least we need a way to clean up. - Ok(Some(first_region)) - } else if self.check_first_region_bootstrapped()? 
{ - // If other node has bootstrap the cluster, skip to avoid useless ID allocating and - // disk writes. - Ok(None) - } else { - // We are probably the first one triggering bootstrap. - self.prepare_bootstrap_first_region(store_id).map(Some) - } - } - fn clear_prepare_bootstrap(&mut self, first_region_id: Option) -> Result<()> { let mut wb = self.engine.log_batch(10); wb.remove_prepare_bootstrap_region()?; @@ -168,11 +158,44 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { Ok(()) } - fn inner_bootstrap_first_region( + /// Bootstraps the first region of this cluster. + /// + /// The bootstrapping starts by allocating a region ID from PD. Then it + /// initializes the region's state and writes a preparing marker to the + /// engine. After attempting to register itself as the first region to PD, + /// the preparing marker is deleted from the engine. + /// + /// On the occasion that the someone else bootstraps the first region + /// before us, the region state is cleared and `None` is returned. + pub fn bootstrap_first_region( &mut self, store: &Store, - first_region: &Region, - ) -> Result { + store_id: u64, + ) -> Result> { + let first_region = match self.engine.get_prepare_bootstrap_region()? { + // The last bootstrap aborts. We need to resume or clean it up. + Some(r) => r, + None => { + if self.check_pd_first_region_bootstrapped()? { + // If other node has bootstrap the cluster, skip to avoid + // useless ID allocating and disk writes. + return Ok(None); + } + self.prepare_bootstrap_first_region(store_id)? 
+ } + }; + + info!( + self.logger, + "trying to bootstrap first region"; + "store_id" => store_id, + "region" => ?first_region + ); + // cluster is not bootstrapped, and we choose first store to bootstrap + fail_point!("node_after_prepare_bootstrap_cluster", |_| Err(box_err!( + "injected error: node_after_prepare_bootstrap_cluster" + ))); + let region_id = first_region.get_id(); let mut retry = 0; while retry < MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT { @@ -181,23 +204,32 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { .bootstrap_cluster(store.clone(), first_region.clone()) { Ok(_) => { - info!(self.logger, "bootstrap cluster ok"; "cluster_id" => self.cluster_id); + info!( + self.logger, + "bootstrap cluster ok"; + "cluster_id" => self.cluster_id + ); fail_point!("node_after_bootstrap_cluster", |_| Err(box_err!( "injected error: node_after_bootstrap_cluster" ))); self.clear_prepare_bootstrap(None)?; - return Ok(true); + return Ok(Some(first_region)); } Err(pd_client::Error::ClusterBootstrapped(_)) => { match self.pd_client.get_region(b"") { Ok(region) => { - if region == *first_region { + if region == first_region { + // It is bootstrapped by us before. 
self.clear_prepare_bootstrap(None)?; - return Ok(true); + return Ok(Some(first_region)); } else { - info!(self.logger, "cluster is already bootstrapped"; "cluster_id" => self.cluster_id); + info!( + self.logger, + "cluster is already bootstrapped"; + "cluster_id" => self.cluster_id + ); self.clear_prepare_bootstrap(Some(region_id))?; - return Ok(false); + return Ok(None); } } Err(e) => { @@ -206,36 +238,21 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { } } Err(e) => { - error!(self.logger, "bootstrap cluster"; "cluster_id" => self.cluster_id, "err" => ?e, "err_code" => %e.error_code()) + error!( + self.logger, + "bootstrap cluster failed once"; + "cluster_id" => self.cluster_id, + "err" => ?e, + "err_code" => %e.error_code() + ); } } retry += 1; thread::sleep(CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL); } - Err(box_err!("bootstrapped cluster failed")) - } - - /// Bootstrap the first region. - /// - /// If the cluster is already bootstrapped, `None` is returned. - pub fn bootstrap_first_region( - &mut self, - store: &Store, - store_id: u64, - ) -> Result> { - let first_region = match self.check_or_prepare_bootstrap_first_region(store_id)? { - Some(r) => r, - None => return Ok(None), - }; - info!(self.logger, "trying to bootstrap first region"; "store_id" => store_id, "region" => ?first_region); - // cluster is not bootstrapped, and we choose first store to bootstrap - fail_point!("node_after_prepare_bootstrap_cluster", |_| Err(box_err!( - "injected error: node_after_prepare_bootstrap_cluster" - ))); - if self.inner_bootstrap_first_region(store, &first_region)? 
{ - Ok(Some(first_region)) - } else { - Ok(None) - } + Err(box_err!( + "bootstrapped cluster failed after {} attempts", + retry + )) } } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 43e3441528e..21646be4738 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -26,7 +26,8 @@ impl ApplyFsm { ) } - /// Fetches tasks to `apply_task_buf`. It will stop when the buffer is full. + /// Fetches messages to `apply_task_buf`. It will stop when the buffer + /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. pub fn recv(&mut self, apply_task_buf: &mut Vec) -> usize { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 8187575d658..88d7b479e49 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -47,7 +47,8 @@ impl PeerFsm { self.peer.logger() } - /// Fetches messages to `peer_msg_buf`. It will stop when the buffer is full. + /// Fetches messages to `peer_msg_buf`. It will stop when the buffer + /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. pub fn recv(&mut self, peer_msg_buf: &mut Vec>) -> usize { diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 091b3fe11e9..257028f1630 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -23,7 +23,8 @@ impl StoreFsm { (tx, fsm) } - /// Fetches messages to `store_msg_buf`. It will stop when the buffer is full. + /// Fetches messages to `store_msg_buf`. It will stop when the buffer + /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. 
pub fn recv(&self, store_msg_buf: &mut Vec) -> usize { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index e52ec322445..c3cede21ebc 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -25,7 +25,7 @@ pub struct Peer { impl Peer { /// Creates a new peer. /// - /// If peer is destroyed, None is returned. + /// If peer is destroyed, `None` is returned. pub fn new( cfg: &Config, region_id: u64, diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index fc25e12bad3..ff0bd64cd01 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -58,8 +58,8 @@ pub struct Storage { impl Storage { /// Creates a new storage. /// - /// All metadata should be initialized before calling this method. If the region is destroyed - /// `None` will be returned. + /// All metadata should be initialized before calling this method. If the + /// region is destroyed, `None` will be returned. pub fn new( region_id: u64, store_id: u64, diff --git a/components/raftstore-v2/src/tablet.rs b/components/raftstore-v2/src/tablet.rs index 2293eaed033..f4f5bdcbc6f 100644 --- a/components/raftstore-v2/src/tablet.rs +++ b/components/raftstore-v2/src/tablet.rs @@ -33,12 +33,12 @@ impl CachedTablet { } pub fn set(&mut self, data: EK) { - let mut guard = self.latest.data.lock().unwrap(); - *guard = Some(data.clone()); - let v = self.latest.version.fetch_add(1, Ordering::Relaxed); - drop(guard); + self.version = { + let mut latest_data = self.latest.data.lock().unwrap(); + *latest_data = Some(data.clone()); + self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 + }; self.cache = Some(data); - self.version = v; } /// Get the tablet from cache without checking if it's up to date. 
@@ -51,9 +51,9 @@ impl CachedTablet { #[inline] pub fn latest(&mut self) -> Option<&EK> { if self.latest.version.load(Ordering::Relaxed) > self.version { - let guard = self.latest.data.lock().unwrap(); + let latest_data = self.latest.data.lock().unwrap(); self.version = self.latest.version.load(Ordering::Relaxed); - self.cache = guard.clone(); + self.cache = latest_data.clone(); } self.cache() } @@ -76,7 +76,14 @@ mod tests { // Setting tablet will refresh cache immediately. cached_tablet.set(2); assert_eq!(cached_tablet.cache().cloned(), Some(2)); - assert_eq!(cached_tablet.latest().cloned(), Some(2)); + + // Test `latest()` will use cache. + // Unsafe modify the data. + let old_data = *cached_tablet.latest.data.lock().unwrap(); + *cached_tablet.latest.data.lock().unwrap() = Some(0); + assert_eq!(cached_tablet.latest().cloned(), old_data); + // Restore the data. + *cached_tablet.latest.data.lock().unwrap() = old_data; let mut cloned = cached_tablet.clone(); // Clone should reuse cache. diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 9b3e38aa9cc..1fec3722a64 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -586,6 +586,7 @@ pub fn build_on_master_branch() -> bool { } /// Set the capacity of a vector to the given capacity. 
+#[inline] pub fn set_vec_capacity(v: &mut Vec, cap: usize) { match cap.cmp(&v.capacity()) { cmp::Ordering::Less => v.shrink_to(cap), From f5adcb1cec9e8322be13b2313b6784a0aa0339dd Mon Sep 17 00:00:00 2001 From: WangLe1321 Date: Wed, 27 Jul 2022 16:17:11 +0800 Subject: [PATCH 116/676] log-backup: fix uploading to gcs error (#13107) close tikv/tikv#13106 Signed-off-by: WangLe1321 Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/backup-stream/Cargo.toml | 1 + components/backup-stream/src/router.rs | 62 ++++++++++++++++++++++++-- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87dc15eb69a..3917b836317 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -459,6 +459,7 @@ dependencies = [ "fail", "file_system", "futures 0.3.15", + "futures-io", "grpcio", "hex 0.4.2", "kvproto", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 9e8049e0ec0..e2b23ccf5db 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -35,6 +35,7 @@ external_storage_export = { path = "../external_storage/export", default-feature fail = { version = "0.5", optional = true } file_system = { path = "../file_system" } futures = "0.3" +futures-io = "0.3" grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } hex = "0.4" diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 3e29592a9f4..b236cefde77 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -972,8 +972,7 @@ impl StreamTaskInfo { let stat = reader.metadata().await?; let reader = UnpinReader(Box::new(limiter.limit(reader.compat()))); let filepath = &data_file.storage_path; - // Once we cannot get the stat of the file, use 4K I/O. 
- let est_len = stat.len().max(4096); + let est_len = stat.len(); let ret = storage.write(filepath, reader, est_len).await; match ret { @@ -1370,13 +1369,17 @@ struct TaskRange { #[cfg(test)] mod tests { - use std::{ffi::OsStr, time::Duration}; + use std::{ffi::OsStr, marker::Unpin, time::Duration}; + use external_storage::NoopStorage; + use futures::AsyncReadExt; + use futures_io::AsyncRead; use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; use tikv_util::{ codec::number::NumberEncoder, worker::{dummy_scheduler, ReceiverWrapper}, }; + use tokio::{fs::File, sync::Mutex}; use txn_types::{Write, WriteType}; use super::*; @@ -2070,4 +2073,57 @@ mod tests { assert_eq!(ts, global_checkpoint); Ok(()) } + + struct MockCheckContentStorage { + s: NoopStorage, + } + + #[async_trait::async_trait] + impl ExternalStorage for MockCheckContentStorage { + fn name(&self) -> &'static str { + self.s.name() + } + + fn url(&self) -> io::Result { + self.s.url() + } + + async fn write( + &self, + _name: &str, + mut reader: UnpinReader, + content_length: u64, + ) -> io::Result<()> { + let mut data = Vec::new(); + reader.0.read_to_end(&mut data).await?; + let data_len: u64 = data.len() as _; + + if data_len == content_length { + Ok(()) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + "the length of content in reader is not equal with content_length", + )) + } + } + + fn read(&self, name: &str) -> Box { + self.s.read(name) + } + } + + #[tokio::test] + async fn test_est_len_in_flush() -> Result<()> { + let noop_s = NoopStorage::default(); + let ms = MockCheckContentStorage { s: noop_s }; + let file_path = std::env::temp_dir().join(format!("{}", uuid::Uuid::new_v4())); + let mut f = File::create(file_path.clone()).await?; + f.write_all("test-data".as_bytes()).await?; + + let data_file = DataFile::new(file_path).await.unwrap(); + let result = StreamTaskInfo::flush_log_file_to(Arc::new(ms), &Mutex::new(data_file)).await; + assert_eq!(result.is_ok(), true); + 
Ok(()) + } } From 4152dbe02dfd2df11848e70908902c82f032018c Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 28 Jul 2022 11:17:11 +0800 Subject: [PATCH 117/676] *: remove engine_rocks raw_util (#13132) ref tikv/tikv#13058 raw_util is duplicated with util. By using util, this PR also makes `DB` type only in use in engine_rocks, which is helpful for adapting tirocks later. In addition, this PR fixes an unsound transforms between `Arc` and `RocksEngine`. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + Makefile | 2 +- cmd/tikv-ctl/src/executor.rs | 11 +- components/backup-stream/src/utils.rs | 8 +- components/backup/src/endpoint.rs | 10 +- components/backup/src/writer.rs | 24 +- components/engine_rocks/src/cf_options.rs | 38 +- components/engine_rocks/src/compact.rs | 51 +-- .../engine_rocks/src/compact_listener.rs | 7 +- components/engine_rocks/src/compat.rs | 25 -- components/engine_rocks/src/db_options.rs | 35 ++ components/engine_rocks/src/engine.rs | 27 +- components/engine_rocks/src/file_system.rs | 34 +- components/engine_rocks/src/import.rs | 17 +- components/engine_rocks/src/lib.rs | 4 - components/engine_rocks/src/misc.rs | 32 +- .../engine_rocks/src/perf_context_impl.rs | 6 +- components/engine_rocks/src/properties.rs | 31 +- components/engine_rocks/src/raw.rs | 15 +- components/engine_rocks/src/raw_util.rs | 336 ------------------ components/engine_rocks/src/rocks_metrics.rs | 3 +- components/engine_rocks/src/util.rs | 314 ++++++++++++---- components/engine_rocks/src/write_batch.rs | 7 +- components/engine_rocks_helper/Cargo.toml | 1 + .../engine_rocks_helper/src/sst_recovery.rs | 22 +- components/engine_test/src/lib.rs | 169 +++------ components/engine_traits/src/lib.rs | 4 - .../src/basic_read_write.rs | 15 +- .../engine_traits_tests/src/cf_names.rs | 19 +- components/engine_traits_tests/src/ctor.rs | 14 +- components/engine_traits_tests/src/lib.rs | 13 +- .../src/coprocessor/split_check/half.rs | 47 +-- 
.../src/coprocessor/split_check/keys.rs | 36 +- .../src/coprocessor/split_check/size.rs | 45 +-- .../src/coprocessor/split_check/table.rs | 4 +- components/raftstore/src/store/bootstrap.rs | 10 +- .../raftstore/src/store/compaction_guard.rs | 24 +- components/raftstore/src/store/fsm/apply.rs | 8 +- .../raftstore/src/store/peer_storage.rs | 6 +- components/raftstore/src/store/snap.rs | 36 +- .../raftstore/src/store/worker/compact.rs | 12 +- .../src/store/worker/consistency_check.rs | 8 +- .../raftstore/src/store/worker/raftlog_gc.rs | 3 +- components/raftstore/src/store/worker/read.rs | 3 +- .../raftstore/src/store/worker/region.rs | 12 +- components/server/src/raft_engine_switch.rs | 15 +- components/server/src/server.rs | 12 +- components/sst_importer/src/import_mode.rs | 6 +- components/sst_importer/src/util.rs | 26 +- components/test_backup/src/lib.rs | 2 +- components/test_raftstore/src/cluster.rs | 13 +- components/test_raftstore/src/util.rs | 16 +- components/test_sst_importer/src/lib.rs | 24 +- components/tikv_kv/src/cursor.rs | 19 +- components/tikv_kv/src/lib.rs | 2 +- components/tikv_kv/src/rocksdb_engine.rs | 21 +- src/config.rs | 66 ++-- src/server/debug.rs | 199 +++++------ src/server/engine_factory.rs | 7 +- src/server/gc_worker/mod.rs | 17 +- src/server/reset_to_version.rs | 29 +- src/storage/kv/test_engine_builder.rs | 22 +- src/storage/mod.rs | 22 +- src/storage/mvcc/consistency_check.rs | 10 +- src/storage/mvcc/reader/reader.rs | 79 ++-- .../singleton_flow_controller.rs | 4 +- tests/benches/misc/raftkv/mod.rs | 28 +- .../misc/writebatch/bench_writebatch.rs | 41 ++- tests/benches/raftstore/mod.rs | 10 +- tests/failpoints/cases/test_async_fetch.rs | 4 +- tests/failpoints/cases/test_merge.rs | 9 +- tests/failpoints/cases/test_replica_read.rs | 2 - tests/failpoints/cases/test_sst_recovery.rs | 69 ++-- .../integrations/config/dynamic/raftstore.rs | 27 +- .../config/dynamic/split_check.rs | 26 +- .../integrations/raftstore/test_bootstrap.rs | 26 +- 
.../raftstore/test_clear_stale_data.rs | 26 +- .../raftstore/test_compact_after_delete.rs | 3 +- .../raftstore/test_compact_log.rs | 8 +- .../raftstore/test_conf_change.rs | 4 - tests/integrations/raftstore/test_merge.rs | 3 - tests/integrations/raftstore/test_multi.rs | 25 +- .../raftstore/test_split_region.rs | 3 +- .../integrations/raftstore/test_stale_peer.rs | 25 +- tests/integrations/raftstore/test_stats.rs | 1 + .../integrations/raftstore/test_tombstone.rs | 10 +- tests/integrations/server/kv_service.rs | 14 +- tests/integrations/storage/test_titan.rs | 118 +++--- 88 files changed, 1024 insertions(+), 1618 deletions(-) delete mode 100644 components/engine_rocks/src/compat.rs delete mode 100644 components/engine_rocks/src/raw_util.rs diff --git a/Cargo.lock b/Cargo.lock index 3917b836317..9e0303726fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1504,6 +1504,7 @@ version = "0.1.0" dependencies = [ "engine_rocks", "engine_test", + "engine_traits", "fail", "futures 0.3.15", "keys", diff --git a/Makefile b/Makefile index 22c575abb8f..fb7bbf6052e 100644 --- a/Makefile +++ b/Makefile @@ -330,7 +330,7 @@ unset-override: pre-format: unset-override @rustup component add rustfmt - @cargo install -q cargo-sort + @which cargo-sort &> /dev/null || cargo install -q cargo-sort format: pre-format @cargo fmt diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 401d96e5d8e..19977924e69 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -6,10 +6,7 @@ use std::{ }; use encryption_export::data_key_manager_from_config; -use engine_rocks::{ - raw_util::{db_exist, new_engine_opt}, - RocksEngine, -}; +use engine_rocks::util::{db_exist, new_engine_opt}; use engine_traits::{ Engines, Error as EngineError, RaftEngine, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, }; @@ -78,11 +75,10 @@ pub fn new_debug_executor( .build_cf_opts(&cache, None, cfg.storage.api_version()); let kv_path = PathBuf::from(kv_path).canonicalize().unwrap(); 
let kv_path = kv_path.to_str().unwrap(); - let kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { + let mut kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - let mut kv_db = RocksEngine::from_db(Arc::new(kv_db)); kv_db.set_shared_block_cache(shared_block_cache); let cfg_controller = ConfigController::default(); @@ -95,11 +91,10 @@ pub fn new_debug_executor( error!("raft db not exists: {}", raft_path); tikv_util::logger::exit_process_gracefully(-1); } - let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + let mut raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - let mut raft_db = RocksEngine::from_db(Arc::new(raft_db)); raft_db.set_shared_block_cache(shared_block_cache); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 6ad26cb045c..5aed8f55f7f 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -591,7 +591,6 @@ mod test { time::Duration, }; - use engine_rocks::raw::DBOptions; use engine_traits::WriteOptions; use futures::executor::block_on; @@ -736,15 +735,12 @@ mod test { #[test] fn test_recorder() { - use engine_rocks::{raw::DB, RocksEngine}; use engine_traits::{Iterable, KvEngine, Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT}; use tempdir::TempDir; let p = TempDir::new("test_db").unwrap(); - let mut opt = DBOptions::default(); - opt.create_if_missing(true); - let db = DB::open(opt.clone(), p.path().as_os_str().to_str().unwrap()).unwrap(); - let engine = RocksEngine::from_db(Arc::new(db)); + let engine = + engine_rocks::util::new_engine(p.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); let mut wb = engine.write_batch(); for i in 0..100 { wb.put_cf(CF_DEFAULT, format!("hello{}", 
i).as_bytes(), b"world") diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index bbcf33d7899..8865aa4f94c 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -11,7 +11,7 @@ use std::{ use async_channel::SendError; use causal_ts::CausalTsProvider; use concurrency_manager::ConcurrencyManager; -use engine_rocks::raw::DB; +use engine_rocks::RocksEngine; use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, SstCompressionType}; use external_storage::{BackendConfig, HdfsConfig}; use external_storage_export::{create_storage, ExternalStorage}; @@ -505,7 +505,7 @@ impl BackupRange { async fn backup_raw_kv_to_file( &self, engine: E, - db: Arc, + db: RocksEngine, limiter: &Limiter, file_name: String, cf: CfNameWrap, @@ -659,7 +659,7 @@ pub struct Endpoint { store_id: u64, pool: RefCell, io_pool: Runtime, - db: Arc, + db: RocksEngine, config_manager: ConfigManager, concurrency_manager: ConcurrencyManager, softlimit: SoftLimitKeeper, @@ -782,7 +782,7 @@ impl Endpoint { store_id: u64, engine: E, region_info: R, - db: Arc, + db: RocksEngine, config: BackupConfig, concurrency_manager: ConcurrencyManager, api_version: ApiVersion, @@ -1280,7 +1280,7 @@ pub mod tests { .unwrap(); let concurrency_manager = ConcurrencyManager::new(1.into()); let need_encode_key = !is_raw_kv || api_version == ApiVersion::V2; - let db = rocks.get_rocksdb().get_sync_db(); + let db = rocks.get_rocksdb(); ( temp, Endpoint::new( diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 99a907948ce..7127d896314 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -1,9 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fmt::Display, io::Read, sync::Arc}; +use std::{fmt::Display, io::Read}; use encryption::{EncrypterReader, Iv}; -use engine_rocks::{raw::DB, RocksEngine, RocksSstWriter, RocksSstWriterBuilder}; +use engine_rocks::{RocksEngine, RocksSstWriter, RocksSstWriterBuilder}; use engine_traits::{ CfName, ExternalSstFileInfo, SstCompressionType, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, @@ -168,7 +168,7 @@ pub struct BackupWriterBuilder { store_id: u64, limiter: Limiter, region: Region, - db: Arc, + db: RocksEngine, compression_type: Option, compression_level: i32, sst_max_size: u64, @@ -180,7 +180,7 @@ impl BackupWriterBuilder { store_id: u64, limiter: Limiter, region: Region, - db: Arc, + db: RocksEngine, compression_type: Option, compression_level: i32, sst_max_size: u64, @@ -227,7 +227,7 @@ pub struct BackupWriter { impl BackupWriter { /// Create a new BackupWriter. pub fn new( - db: Arc, + db: RocksEngine, name: &str, compression_type: Option, compression_level: i32, @@ -238,14 +238,14 @@ impl BackupWriter { let default = RocksSstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_DEFAULT) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; let write = RocksSstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_WRITE) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; @@ -351,7 +351,7 @@ pub struct BackupRawKvWriter { impl BackupRawKvWriter { /// Create a new BackupRawKvWriter. 
pub fn new( - db: Arc, + db: RocksEngine, name: &str, cf: CfNameWrap, limiter: Limiter, @@ -363,7 +363,7 @@ impl BackupRawKvWriter { let writer = RocksSstWriterBuilder::new() .set_in_memory(true) .set_cf(cf.into()) - .set_db(RocksEngine::from_ref(&db)) + .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; @@ -498,7 +498,7 @@ mod tests { r.set_id(1); r.mut_peers().push(new_peer(1, 1)); let mut writer = BackupWriter::new( - db.get_sync_db(), + db.clone(), "foo", None, 0, @@ -516,7 +516,7 @@ mod tests { // Test write only txn. let mut writer = BackupWriter::new( - db.get_sync_db(), + db.clone(), "foo1", None, 0, @@ -555,7 +555,7 @@ mod tests { // Test write and default. let mut writer = BackupWriter::new( - db.get_sync_db(), + db, "foo2", None, 0, diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index 87d05510f58..c6a5390a063 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -1,7 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::ops::{Deref, DerefMut}; + use engine_traits::{CFOptionsExt, ColumnFamilyOptions, Result, SstPartitionerFactory}; -use rocksdb::ColumnFamilyOptions as RawCFOptions; +use rocksdb::ColumnFamilyOptions as RawCfOptions; use tikv_util::box_err; use crate::{ @@ -10,11 +12,11 @@ use crate::{ }; impl CFOptionsExt for RocksEngine { - type ColumnFamilyOptions = RocksColumnFamilyOptions; + type ColumnFamilyOptions = RocksCfOptions; fn get_options_cf(&self, cf: &str) -> Result { let handle = util::get_cf_handle(self.as_inner(), cf)?; - Ok(RocksColumnFamilyOptions::from_raw( + Ok(RocksCfOptions::from_raw( self.as_inner().get_options_cf(handle), )) } @@ -27,28 +29,40 @@ impl CFOptionsExt for RocksEngine { } } -#[derive(Clone)] -pub struct RocksColumnFamilyOptions(RawCFOptions); +#[derive(Default, Clone)] +pub struct RocksCfOptions(RawCfOptions); -impl RocksColumnFamilyOptions { - pub fn from_raw(raw: RawCFOptions) -> RocksColumnFamilyOptions { - RocksColumnFamilyOptions(raw) +impl RocksCfOptions { + pub fn from_raw(raw: RawCfOptions) -> RocksCfOptions { + RocksCfOptions(raw) } - pub fn into_raw(self) -> RawCFOptions { + pub fn into_raw(self) -> RawCfOptions { self.0 } +} + +impl Deref for RocksCfOptions { + type Target = RawCfOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} - pub fn as_raw_mut(&mut self) -> &mut RawCFOptions { +impl DerefMut for RocksCfOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } -impl ColumnFamilyOptions for RocksColumnFamilyOptions { +impl ColumnFamilyOptions for RocksCfOptions { type TitanDBOptions = RocksTitanDBOptions; fn new() -> Self { - RocksColumnFamilyOptions::from_raw(RawCFOptions::new()) + RocksCfOptions::from_raw(RawCfOptions::default()) } fn get_max_write_buffer_number(&self) -> u32 { diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 0b50e0757c2..393377149ff 100644 --- a/components/engine_rocks/src/compact.rs +++ 
b/components/engine_rocks/src/compact.rs @@ -137,16 +137,10 @@ impl CompactExt for RocksEngine { #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_traits::CompactExt; - use rocksdb::{ColumnFamilyOptions, Writable}; + use engine_traits::{CFNamesExt, CFOptionsExt, CompactExt, MiscExt, SyncMutable}; use tempfile::Builder; - use crate::{ - raw_util::{new_engine, CFOptions}, - Compat, - }; + use crate::{util, RocksCfOptions, RocksDBOptions}; #[test] fn test_compact_files_in_range() { @@ -155,29 +149,24 @@ mod tests { .tempdir() .unwrap(); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = vec![ - CFOptions::new("default", cf_opts.clone()), - CFOptions::new("test", cf_opts), - ]; - let db = new_engine( + let cfs_opts = vec![("default", cf_opts.clone()), ("test", cf_opts)]; + let db = util::new_engine_opt( temp_dir.path().to_str().unwrap(), - None, - &["default", "test"], - Some(cfs_opts), + RocksDBOptions::default(), + cfs_opts, ) .unwrap(); - let db = Arc::new(db); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); for i in 0..5 { - db.put_cf(cf, &[i], &[i]).unwrap(); - db.put_cf(cf, &[i + 1], &[i + 1]).unwrap(); - db.flush_cf(cf, true).unwrap(); + db.put_cf(cf_name, &[i], &[i]).unwrap(); + db.put_cf(cf_name, &[i + 1], &[i + 1]).unwrap(); + db.flush_cf(cf_name, true).unwrap(); } - let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let cf_levels = cf_meta.get_levels(); assert_eq!(cf_levels.first().unwrap().get_files().len(), 5); } @@ -187,13 +176,12 @@ mod tests { // # After // Level-0: [4-5] // Level-1: [0-4] - db.c() - .compact_files_in_range(None, Some(&[4]), Some(1)) + db.compact_files_in_range(None, Some(&[4]), Some(1)) .unwrap(); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); 
- let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let cf_levels = cf_meta.get_levels(); let level_0 = cf_levels[0].get_files(); assert_eq!(level_0.len(), 1); @@ -211,14 +199,13 @@ mod tests { // # After // Level-0: [4-5] // Level-N: [0-4] - db.c() - .compact_files_in_range(Some(&[2]), Some(&[4]), None) + db.compact_files_in_range(Some(&[2]), Some(&[4]), None) .unwrap(); for cf_name in db.cf_names() { - let cf = db.cf_handle(cf_name).unwrap(); - let cf_opts = db.get_options_cf(cf); - let cf_meta = db.get_column_family_meta_data(cf); + let cf = util::get_cf_handle(db.as_inner(), cf_name).unwrap(); + let cf_opts = db.get_options_cf(cf_name).unwrap(); + let cf_meta = db.as_inner().get_column_family_meta_data(cf); let cf_levels = cf_meta.get_levels(); let level_0 = cf_levels[0].get_files(); assert_eq!(level_0.len(), 1); diff --git a/components/engine_rocks/src/compact_listener.rs b/components/engine_rocks/src/compact_listener.rs index 2cfdb253eb0..5fc7a4e92f2 100644 --- a/components/engine_rocks/src/compact_listener.rs +++ b/components/engine_rocks/src/compact_listener.rs @@ -17,10 +17,7 @@ use rocksdb::{ }; use tikv_util::warn; -use crate::{ - properties::{RangeProperties, UserCollectedPropertiesDecoder}, - raw::EventListener, -}; +use crate::properties::{RangeProperties, UserCollectedPropertiesDecoder}; pub struct RocksCompactionJobInfo<'a>(&'a RawCompactionJobInfo); @@ -229,7 +226,7 @@ impl CompactionListener { } } -impl EventListener for CompactionListener { +impl rocksdb::EventListener for CompactionListener { fn on_compaction_completed(&self, info: &RawCompactionJobInfo) { let info = &RocksCompactionJobInfo::from_raw(info); if info.status().is_err() { diff --git a/components/engine_rocks/src/compat.rs b/components/engine_rocks/src/compat.rs deleted file mode 100644 index 96371fcf62b..00000000000 --- 
a/components/engine_rocks/src/compat.rs +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::Arc; - -use crate::{engine::RocksEngine, raw::DB}; - -/// A trait to enter the world of engine traits from a raw `Arc` -/// with as little syntax as possible. -/// -/// This will be used during the transition from RocksDB to the -/// `KvEngine` abstraction and then discarded. -pub trait Compat { - type Other; - - fn c(&self) -> &Self::Other; -} - -impl Compat for Arc { - type Other = RocksEngine; - - #[inline] - fn c(&self) -> &RocksEngine { - RocksEngine::from_ref(self) - } -} diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index 948ed469352..6aaccfee76b 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -1,5 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::ops::{Deref, DerefMut}; + use engine_traits::{DBOptions, DBOptionsExt, Result, TitanDBOptions}; use rocksdb::{DBOptions as RawDBOptions, TitanDBOptions as RawTitanDBOptions}; use tikv_util::box_err; @@ -19,6 +21,7 @@ impl DBOptionsExt for RocksEngine { } } +#[derive(Default)] pub struct RocksDBOptions(RawDBOptions); impl RocksDBOptions { @@ -35,6 +38,22 @@ impl RocksDBOptions { } } +impl Deref for RocksDBOptions { + type Target = RawDBOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RocksDBOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + impl DBOptions for RocksDBOptions { type TitanDBOptions = RocksTitanDBOptions; @@ -83,6 +102,22 @@ impl RocksTitanDBOptions { } } +impl Deref for RocksTitanDBOptions { + type Target = RawTitanDBOptions; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for RocksTitanDBOptions { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} 
+ impl TitanDBOptions for RocksTitanDBOptions { fn new() -> Self { RocksTitanDBOptions::from_raw(RawTitanDBOptions::new()) diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index e6a1cf4a6a7..6071f06a646 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -30,6 +30,10 @@ pub struct RocksEngine { } impl RocksEngine { + pub(crate) fn new(db: DB) -> RocksEngine { + RocksEngine::from_db(Arc::new(db)) + } + pub fn from_db(db: Arc) -> Self { RocksEngine { db: db.clone(), @@ -38,13 +42,6 @@ impl RocksEngine { } } - // Notice: After obtaining RocksEngine through this method, please make sure - // it has been initialized with db, otherwise do not call its member methods, - // as it'll contain garbage members. - pub fn from_ref(db: &Arc) -> &Self { - unsafe { &*(db as *const Arc as *const RocksEngine) } - } - pub fn as_inner(&self) -> &Arc { &self.db } @@ -202,21 +199,17 @@ impl SyncMutable for RocksEngine { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_traits::{Iterable, KvEngine, Peekable, SyncMutable, CF_DEFAULT}; use kvproto::metapb::Region; use tempfile::Builder; - use crate::{raw_util, RocksEngine, RocksSnapshot}; + use crate::{util, RocksSnapshot}; #[test] fn test_base() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); let mut r = Region::default(); r.set_id(10); @@ -251,9 +244,7 @@ mod tests { fn test_peekable() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); 
engine.put(b"k1", b"v1").unwrap(); engine.put_cf(cf, b"k1", b"v2").unwrap(); @@ -267,9 +258,7 @@ mod tests { fn test_scan() { let path = Builder::new().prefix("var").tempdir().unwrap(); let cf = "cf"; - let engine = RocksEngine::from_db(Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &[cf], None).unwrap(), - )); + let engine = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, cf]).unwrap(); engine.put(b"a1", b"v1").unwrap(); engine.put(b"a2", b"v2").unwrap(); diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index 2fcbc405056..c63edb8a117 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -42,32 +42,27 @@ impl DBFileSystemInspector for WrappedFileSystemInspecto mod tests { use std::sync::Arc; - use engine_traits::{CompactExt, CF_DEFAULT}; + use engine_traits::{CompactExt, MiscExt, SyncMutable, CF_DEFAULT}; use file_system::{IOOp, IORateLimiter, IORateLimiterStatistics, IOType}; use keys::data_key; - use rocksdb::{DBOptions, Writable, DB}; use tempfile::Builder; use super::*; use crate::{ - compat::Compat, - event_listener::RocksEventListener, - raw::{ColumnFamilyOptions, DBCompressionType}, - raw_util::{new_engine_opt, CFOptions}, + event_listener::RocksEventListener, raw::DBCompressionType, util::new_engine_opt, + RocksCfOptions, RocksDBOptions, RocksEngine, }; - fn new_test_db(dir: &str) -> (Arc, Arc) { + fn new_test_db(dir: &str) -> (RocksEngine, Arc) { let limiter = Arc::new(IORateLimiter::new_for_test()); - let mut db_opts = DBOptions::new(); + let mut db_opts = RocksDBOptions::default(); db_opts.add_event_listener(RocksEventListener::new("test_db", None)); let env = get_env(None, Some(limiter.clone())).unwrap(); db_opts.set_env(env); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_disable_auto_compactions(true); 
cf_opts.compression_per_level(&[DBCompressionType::No; 7]); - let db = Arc::new( - new_engine_opt(dir, db_opts, vec![CFOptions::new(CF_DEFAULT, cf_opts)]).unwrap(), - ); + let db = new_engine_opt(dir, db_opts, vec![(CF_DEFAULT, cf_opts)]).unwrap(); (db, limiter.statistics().unwrap()) } @@ -97,14 +92,13 @@ mod tests { assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); - db.c() - .compact_range( - CF_DEFAULT, None, /*start_key*/ - None, /*end_key*/ - false, /*exclusive_manual*/ - 1, /*max_subcompactions*/ - ) - .unwrap(); + db.compact_range( + CF_DEFAULT, None, /*start_key*/ + None, /*end_key*/ + false, /*exclusive_manual*/ + 1, /*max_subcompactions*/ + ) + .unwrap(); assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) > value_size * 4); assert!( stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 641e33f7bd8..79e6d6c0f49 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -62,8 +62,6 @@ impl IngestExternalFileOptions for RocksIngestExternalFileOptions { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_traits::{ FlowControlFactorsExt, MiscExt, Mutable, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, CF_DEFAULT, @@ -71,12 +69,7 @@ mod tests { use tempfile::Builder; use super::*; - use crate::{ - engine::RocksEngine, - raw::{ColumnFamilyOptions, DBOptions}, - raw_util::{new_engine_opt, CFOptions}, - RocksSstWriterBuilder, - }; + use crate::{util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksSstWriterBuilder}; #[test] fn test_ingest_multiple_file() { @@ -91,14 +84,12 @@ mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = RocksCfOptions::default(); opt.set_force_consistency_checks(true); - 
CFOptions::new(cf, opt) + (*cf, opt) }) .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); let mut wb = db.write_batch(); for i in 1000..5000 { let v = i.to_string(); diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index b93d8cc7f36..8ec581c6e86 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -67,12 +67,8 @@ mod engine_iterator; pub use crate::engine_iterator::*; mod options; -pub mod raw_util; pub mod util; -mod compat; -pub use compat::*; - mod compact_listener; pub use compact_listener::*; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index ad1f385654f..ff465d85dd1 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -253,7 +253,7 @@ impl MiscExt for RocksEngine { } fn exists(path: &str) -> bool { - crate::raw_util::db_exist(path) + crate::util::db_exist(path) } fn dump_stats(&self) -> Result { @@ -334,8 +334,6 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_traits::{ DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, }; @@ -344,8 +342,8 @@ mod tests { use super::*; use crate::{ engine::RocksEngine, - raw::{ColumnFamilyOptions, DBOptions, DB}, - raw_util::{new_engine_opt, CFOptions}, + util::{new_engine, new_engine_opt}, + RocksCfOptions, RocksDBOptions, }; fn check_data(db: &RocksEngine, cfs: &[&str], expected: &[(&[u8], &[u8])]) { @@ -372,13 +370,7 @@ mod tests { .unwrap(); let path_str = path.path().to_str().unwrap(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) - .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = 
RocksEngine::from_db(db); + let db = new_engine(path_str, ALL_CFS).unwrap(); let mut wb = db.write_batch(); let ts: u8 = 12; @@ -523,14 +515,12 @@ mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_level_zero_file_num_compaction_trigger(1); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); - let db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); let keys = vec![b"k1", b"k2", b"k3", b"k4"]; @@ -562,11 +552,11 @@ mod tests { .unwrap(); let path_str = path.path().to_str().unwrap(); - let mut opts = DBOptions::new(); + let mut opts = RocksDBOptions::default(); opts.create_if_missing(true); opts.enable_multi_batch_write(true); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); // Prefix extractor(trim the timestamp at tail) for write cf. cf_opts .set_prefix_extractor( @@ -577,9 +567,7 @@ mod tests { // Create prefix bloom filter for memtable. 
cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); let cf = "default"; - let db = DB::open_cf(opts, path_str, vec![(cf, cf_opts)]).unwrap(); - let db = Arc::new(db); - let db = RocksEngine::from_db(db); + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); let mut wb = db.write_batch(); let kvs: Vec<(&[u8], &[u8])> = vec![ (b"kabcdefg1", b"v1"), diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index 152a0a12785..fe747b21a49 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ b/components/engine_rocks/src/perf_context_impl.rs @@ -10,8 +10,8 @@ use tikv_util::time::Instant; use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; use crate::{ - perf_context_metrics::*, raw_util, set_perf_flags, set_perf_level, - PerfContext as RawPerfContext, PerfFlag, PerfFlags, + perf_context_metrics::*, set_perf_flags, set_perf_level, util, PerfContext as RawPerfContext, + PerfFlag, PerfFlags, }; macro_rules! 
report_write_perf_context { @@ -191,7 +191,7 @@ impl PerfContextStatistics { } } } else { - set_perf_level(raw_util::to_raw_perf_level(self.perf_level)); + set_perf_level(util::to_raw_perf_level(self.perf_level)); } } diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 47b48d2fc5c..1168182c58e 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -536,9 +536,7 @@ pub fn get_range_entries_and_versions( #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_traits::{CF_WRITE, LARGE_CFS}; + use engine_traits::{MiscExt, SyncMutable, CF_WRITE, LARGE_CFS}; use rand::Rng; use tempfile::Builder; use test::Bencher; @@ -546,9 +544,8 @@ mod tests { use super::*; use crate::{ - compat::Compat, - raw::{ColumnFamilyOptions, DBEntryType, DBOptions, TablePropertiesCollector, Writable}, - raw_util::CFOptions, + raw::{DBEntryType, TablePropertiesCollector}, + RocksCfOptions, RocksDBOptions, }; #[allow(clippy::many_single_char_names)] @@ -714,18 +711,15 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::new(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = RocksDBOptions::default(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.add_table_properties_collector_factory( "tikv.mvcc-properties-collector", MvccPropertiesCollectorFactory::default(), ); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let db = Arc::new(crate::raw_util::new_engine_opt(path_str, db_opts, cfs_opts).unwrap()); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); + let db = crate::util::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = ["a", "b", "c"]; for &key in &cases { @@ -734,22 +728,21 @@ mod tests { .append_ts(2.into()) .as_encoded(), ); - let write_cf = 
db.cf_handle(CF_WRITE).unwrap(); - db.put_cf(write_cf, &k1, b"v1").unwrap(); - db.delete_cf(write_cf, &k1).unwrap(); + db.put_cf(CF_WRITE, &k1, b"v1").unwrap(); + db.delete_cf(CF_WRITE, &k1).unwrap(); let key = keys::data_key( Key::from_raw(key.as_bytes()) .append_ts(3.into()) .as_encoded(), ); - db.put_cf(write_cf, &key, b"v2").unwrap(); - db.flush_cf(write_cf, true).unwrap(); + db.put_cf(CF_WRITE, &key, b"v2").unwrap(); + db.flush_cf(CF_WRITE, true).unwrap(); } let start_keys = keys::data_key(&[]); let end_keys = keys::data_end_key(&[]); let (entries, versions) = - get_range_entries_and_versions(db.c(), CF_WRITE, &start_keys, &end_keys).unwrap(); + get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); assert_eq!(entries, (cases.len() * 2) as u64); assert_eq!(versions, cases.len() as u64); } diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index c51c0187b2d..1a8718588b2 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -7,14 +7,13 @@ //! crate, but only until the engine interface is completely abstracted. 
pub use rocksdb::{ - new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, CFHandle, Cache, - ChecksumType, ColumnFamilyOptions, CompactOptions, CompactionFilter, CompactionFilterContext, + new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, + ChecksumType, CompactOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, CompactionJobInfo, CompactionOptions, CompactionPriority, DBBottommostLevelCompaction, - DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBInfoLogLevel, - DBIterator, DBOptions, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, - DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, - MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, ReadOptions, SliceTransform, - TableFilter, TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, - TitanDBOptions, Writable, WriteOptions, DB, + DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBRateLimiterMode, + DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, Env, EventListener, + IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, + PrepopulateBlockCache, Range, SliceTransform, TablePropertiesCollector, + TablePropertiesCollectorFactory, }; diff --git a/components/engine_rocks/src/raw_util.rs b/components/engine_rocks/src/raw_util.rs deleted file mode 100644 index e669f007276..00000000000 --- a/components/engine_rocks/src/raw_util.rs +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. - -//! Functions for constructing the rocksdb crate's `DB` type -//! -//! These are an artifact of refactoring the engine traits and will go away -//! eventually. Prefer to use the versions in the `util` module. 
- -use std::{fs, path::Path, sync::Arc}; - -use engine_traits::{Result, CF_DEFAULT}; -use rocksdb::{ - load_latest_options, CColumnFamilyDescriptor, ColumnFamilyOptions, DBOptions, Env, DB, -}; -use tikv_util::warn; - -use crate::r2e; - -pub struct CFOptions<'a> { - cf: &'a str, - options: ColumnFamilyOptions, -} - -impl<'a> CFOptions<'a> { - pub fn new(cf: &'a str, options: ColumnFamilyOptions) -> CFOptions<'a> { - CFOptions { cf, options } - } -} - -pub fn new_engine( - path: &str, - db_opts: Option, - cfs: &[&str], - opts: Option>>, -) -> Result { - let mut db_opts = match db_opts { - Some(opt) => opt, - None => DBOptions::new(), - }; - db_opts.enable_statistics(true); - let cf_opts = match opts { - Some(opts_vec) => opts_vec, - None => { - let mut default_cfs_opts = Vec::with_capacity(cfs.len()); - for cf in cfs { - default_cfs_opts.push(CFOptions::new(*cf, ColumnFamilyOptions::new())); - } - default_cfs_opts - } - }; - new_engine_opt(path, db_opts, cf_opts) -} - -/// Turns "dynamic level size" off for the existing column family which was off before. -/// Column families are small, HashMap isn't necessary. 
-fn adjust_dynamic_level_bytes( - cf_descs: &[CColumnFamilyDescriptor], - cf_options: &mut CFOptions<'_>, -) { - if let Some(cf_desc) = cf_descs - .iter() - .find(|cf_desc| cf_desc.name() == cf_options.cf) - { - let existed_dynamic_level_bytes = - cf_desc.options().get_level_compaction_dynamic_level_bytes(); - if existed_dynamic_level_bytes - != cf_options - .options - .get_level_compaction_dynamic_level_bytes() - { - warn!( - "change dynamic_level_bytes for existing column family is danger"; - "old_value" => existed_dynamic_level_bytes, - "new_value" => cf_options.options.get_level_compaction_dynamic_level_bytes(), - ); - } - cf_options - .options - .set_level_compaction_dynamic_level_bytes(existed_dynamic_level_bytes); - } -} - -pub fn new_engine_opt( - path: &str, - mut db_opt: DBOptions, - cfs_opts: Vec>, -) -> Result { - // Creates a new db if it doesn't exist. - if !db_exist(path) { - db_opt.create_if_missing(true); - - let mut cfs_v = vec![]; - let mut cf_opts_v = vec![]; - if let Some(x) = cfs_opts.iter().find(|x| x.cf == CF_DEFAULT) { - cfs_v.push(x.cf); - cf_opts_v.push(x.options.clone()); - } - let mut db = - DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cf_opts_v).collect()).map_err(r2e)?; - for x in cfs_opts { - if x.cf == CF_DEFAULT { - continue; - } - db.create_cf((x.cf, x.options)).map_err(r2e)?; - } - - return Ok(db); - } - - db_opt.create_if_missing(false); - - // Lists all column families in current db. - let cfs_list = DB::list_column_families(&db_opt, path).map_err(r2e)?; - let existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); - let needed: Vec<&str> = cfs_opts.iter().map(|x| x.cf).collect(); - - let cf_descs = if !existed.is_empty() { - let env = match db_opt.env() { - Some(env) => env, - None => Arc::new(Env::default()), - }; - // panic if OPTIONS not found for existing instance? 
- let (_, tmp) = load_latest_options(path, &env, true) - .unwrap_or_else(|e| panic!("failed to load_latest_options {:?}", e)) - .unwrap_or_else(|| panic!("couldn't find the OPTIONS file")); - tmp - } else { - vec![] - }; - - // If all column families exist, just open db. - if existed == needed { - let mut cfs_v = vec![]; - let mut cfs_opts_v = vec![]; - for mut x in cfs_opts { - adjust_dynamic_level_bytes(&cf_descs, &mut x); - cfs_v.push(x.cf); - cfs_opts_v.push(x.options); - } - - let db = - DB::open_cf(db_opt, path, cfs_v.into_iter().zip(cfs_opts_v).collect()).map_err(r2e)?; - return Ok(db); - } - - // Opens db. - let mut cfs_v: Vec<&str> = Vec::new(); - let mut cfs_opts_v: Vec = Vec::new(); - for cf in &existed { - cfs_v.push(cf); - match cfs_opts.iter().find(|x| x.cf == *cf) { - Some(x) => { - let mut tmp = CFOptions::new(x.cf, x.options.clone()); - adjust_dynamic_level_bytes(&cf_descs, &mut tmp); - cfs_opts_v.push(tmp.options); - } - None => { - cfs_opts_v.push(ColumnFamilyOptions::new()); - } - } - } - let cfds = cfs_v.into_iter().zip(cfs_opts_v).collect(); - let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; - - // Drops discarded column families. - // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == x).is_none()) { - for cf in cfs_diff(&existed, &needed) { - // Never drop default column families. - if cf != CF_DEFAULT { - db.drop_cf(cf).map_err(r2e)?; - } - } - - // Creates needed column families if they don't exist. 
- for cf in cfs_diff(&needed, &existed) { - db.create_cf(( - cf, - cfs_opts - .iter() - .find(|x| x.cf == cf) - .unwrap() - .options - .clone(), - )) - .map_err(r2e)?; - } - Ok(db) -} - -pub fn db_exist(path: &str) -> bool { - let path = Path::new(path); - if !path.exists() || !path.is_dir() { - return false; - } - let current_file_path = path.join("CURRENT"); - if !current_file_path.exists() || !current_file_path.is_file() { - return false; - } - - // If path is not an empty directory, and current file exists, we say db exists. If path is not an empty directory - // but db has not been created, `DB::list_column_families` fails and we can clean up - // the directory by this indication. - fs::read_dir(&path).unwrap().next().is_some() -} - -/// Returns a Vec of cf which is in `a' but not in `b'. -fn cfs_diff<'a>(a: &[&'a str], b: &[&str]) -> Vec<&'a str> { - a.iter() - .filter(|x| !b.iter().any(|y| *x == y)) - .cloned() - .collect() -} - -pub fn to_raw_perf_level(level: engine_traits::PerfLevel) -> rocksdb::PerfLevel { - match level { - engine_traits::PerfLevel::Uninitialized => rocksdb::PerfLevel::Uninitialized, - engine_traits::PerfLevel::Disable => rocksdb::PerfLevel::Disable, - engine_traits::PerfLevel::EnableCount => rocksdb::PerfLevel::EnableCount, - engine_traits::PerfLevel::EnableTimeExceptForMutex => { - rocksdb::PerfLevel::EnableTimeExceptForMutex - } - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { - rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex - } - engine_traits::PerfLevel::EnableTime => rocksdb::PerfLevel::EnableTime, - engine_traits::PerfLevel::OutOfBounds => rocksdb::PerfLevel::OutOfBounds, - } -} - -pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLevel { - match level { - rocksdb::PerfLevel::Uninitialized => engine_traits::PerfLevel::Uninitialized, - rocksdb::PerfLevel::Disable => engine_traits::PerfLevel::Disable, - rocksdb::PerfLevel::EnableCount => engine_traits::PerfLevel::EnableCount, - 
rocksdb::PerfLevel::EnableTimeExceptForMutex => { - engine_traits::PerfLevel::EnableTimeExceptForMutex - } - rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex - } - rocksdb::PerfLevel::EnableTime => engine_traits::PerfLevel::EnableTime, - rocksdb::PerfLevel::OutOfBounds => engine_traits::PerfLevel::OutOfBounds, - } -} - -#[cfg(test)] -mod tests { - use engine_traits::CF_DEFAULT; - use rocksdb::{ColumnFamilyOptions, DBOptions, DB}; - use tempfile::Builder; - - use super::*; - - #[test] - fn test_cfs_diff() { - let a = vec!["1", "2", "3"]; - let a_diff_a = cfs_diff(&a, &a); - assert!(a_diff_a.is_empty()); - let b = vec!["4"]; - assert_eq!(a, cfs_diff(&a, &b)); - let c = vec!["4", "5", "3", "6"]; - assert_eq!(vec!["1", "2"], cfs_diff(&a, &c)); - assert_eq!(vec!["4", "5", "6"], cfs_diff(&c, &a)); - let d = vec!["1", "2", "3", "4"]; - let a_diff_d = cfs_diff(&a, &d); - assert!(a_diff_d.is_empty()); - assert_eq!(vec!["4"], cfs_diff(&d, &a)); - } - - #[test] - fn test_new_engine_opt() { - let path = Builder::new() - .prefix("_util_rocksdb_test_check_column_families") - .tempdir() - .unwrap(); - let path_str = path.path().to_str().unwrap(); - - // create db when db not exist - let mut cfs_opts = vec![CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new())]; - let mut opts = ColumnFamilyOptions::new(); - opts.set_level_compaction_dynamic_level_bytes(true); - cfs_opts.push(CFOptions::new("cf_dynamic_level_bytes", opts.clone())); - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); - check_dynamic_level_bytes(&mut db); - } - - // add cf1. 
- let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, opts.clone()), - CFOptions::new("cf_dynamic_level_bytes", opts.clone()), - CFOptions::new("cf1", opts), - ]; - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); - check_dynamic_level_bytes(&mut db); - } - - // drop cf1. - let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new("cf_dynamic_level_bytes", ColumnFamilyOptions::new()), - ]; - { - let mut db = new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); - check_dynamic_level_bytes(&mut db); - } - - // never drop default cf - let cfs_opts = vec![]; - new_engine_opt(path_str, DBOptions::new(), cfs_opts).unwrap(); - column_families_must_eq(path_str, vec![CF_DEFAULT]); - } - - fn column_families_must_eq(path: &str, excepted: Vec<&str>) { - let opts = DBOptions::new(); - let cfs_list = DB::list_column_families(&opts, path).unwrap(); - - let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); - let mut cfs_excepted: Vec<&str> = excepted.clone(); - cfs_existed.sort_unstable(); - cfs_excepted.sort_unstable(); - assert_eq!(cfs_existed, cfs_excepted); - } - - fn check_dynamic_level_bytes(db: &mut DB) { - let cf_default = db.cf_handle(CF_DEFAULT).unwrap(); - let tmp_cf_opts = db.get_options_cf(cf_default); - assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); - let cf_test = db.cf_handle("cf_dynamic_level_bytes").unwrap(); - let tmp_cf_opts = db.get_options_cf(cf_test); - assert!(tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); - } -} diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 1ce4063298e..4529b6e9d27 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ 
-1618,8 +1618,7 @@ mod tests { #[test] fn test_flush() { let dir = Builder::new().prefix("test-flush").tempdir().unwrap(); - let engine = - crate::util::new_engine(dir.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = crate::util::new_engine(dir.path().to_str().unwrap(), ALL_CFS).unwrap(); for tp in ENGINE_TICKER_TYPES { flush_engine_ticker_metrics(*tp, 2, "kv"); } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 81a2ccb497a..a3b6a2bf4cf 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -1,89 +1,163 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{str::FromStr, sync::Arc}; +use std::{fs, path::Path, str::FromStr, sync::Arc}; -use engine_traits::{Engines, Error, Range, Result, CF_DEFAULT}; -use rocksdb::{CFHandle, Range as RocksRange, SliceTransform, DB}; -use tikv_util::box_err; +use engine_traits::{Engines, Range, Result, CF_DEFAULT}; +use rocksdb::{ + load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, Env, + Range as RocksRange, SliceTransform, DB, +}; +use slog_global::warn; use crate::{ - cf_options::RocksColumnFamilyOptions, - db_options::RocksDBOptions, - engine::RocksEngine, - r2e, - raw_util::{new_engine as new_engine_raw, new_engine_opt as new_engine_opt_raw, CFOptions}, + cf_options::RocksCfOptions, db_options::RocksDBOptions, engine::RocksEngine, r2e, rocks_metrics_defs::*, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { let raft_path = path.path().join(std::path::Path::new("raft")); Engines::new( - new_engine( - path.path().to_str().unwrap(), - None, - engine_traits::ALL_CFS, - None, - ) - .unwrap(), - new_engine( - raft_path.to_str().unwrap(), - None, - &[engine_traits::CF_DEFAULT], - None, - ) - .unwrap(), + new_engine(path.path().to_str().unwrap(), engine_traits::ALL_CFS).unwrap(), + new_engine(raft_path.to_str().unwrap(), &[engine_traits::CF_DEFAULT]).unwrap(), ) } 
pub fn new_default_engine(path: &str) -> Result { - let engine = - new_engine_raw(path, None, &[CF_DEFAULT], None).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) + new_engine(path, &[CF_DEFAULT]) } -pub struct RocksCFOptions<'a> { - cf: &'a str, - options: RocksColumnFamilyOptions, +pub fn new_engine(path: &str, cfs: &[&str]) -> Result { + let mut db_opts = RocksDBOptions::default(); + db_opts.enable_statistics(true); + let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); + new_engine_opt(path, db_opts, cf_opts) } -impl<'a> RocksCFOptions<'a> { - pub fn new(cf: &'a str, options: RocksColumnFamilyOptions) -> RocksCFOptions<'a> { - RocksCFOptions { cf, options } +pub fn new_engine_opt( + path: &str, + db_opt: RocksDBOptions, + cf_opts: Vec<(&str, RocksCfOptions)>, +) -> Result { + let mut db_opt = db_opt.into_raw(); + if cf_opts.iter().all(|(name, _)| *name != CF_DEFAULT) { + return Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "default cf must be specified", + ), + )); + } + let mut cf_opts: Vec<_> = cf_opts + .into_iter() + .map(|(name, opt)| (name, opt.into_raw())) + .collect(); + + // Creates a new db if it doesn't exist. + if !db_exist(path) { + db_opt.create_if_missing(true); + db_opt.create_missing_column_families(true); + + let db = DB::open_cf(db_opt, path, cf_opts.into_iter().collect()).map_err(r2e)?; + + return Ok(RocksEngine::new(db)); + } + + db_opt.create_if_missing(false); + + // Lists all column families in current db. 
+ let cfs_list = DB::list_column_families(&db_opt, path).map_err(r2e)?; + let existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); + let needed: Vec<&str> = cf_opts.iter().map(|(name, _)| *name).collect(); + + let cf_descs = if !existed.is_empty() { + let env = match db_opt.env() { + Some(env) => env, + None => Arc::new(Env::default()), + }; + // panic if OPTIONS not found for existing instance? + let (_, tmp) = load_latest_options(path, &env, true) + .unwrap_or_else(|e| panic!("failed to load_latest_options {:?}", e)) + .unwrap_or_else(|| panic!("couldn't find the OPTIONS file")); + tmp + } else { + vec![] + }; + + for cf in &existed { + if cf_opts.iter().all(|(name, _)| name != cf) { + cf_opts.push((cf, ColumnFamilyOptions::default())); + } + } + for (name, opt) in &mut cf_opts { + adjust_dynamic_level_bytes(&cf_descs, name, opt); } - pub fn into_raw(self) -> CFOptions<'a> { - CFOptions::new(self.cf, self.options.into_raw()) + // If all column families exist, just open db. + if existed == needed { + let db = DB::open_cf(db_opt, path, cf_opts.into_iter().collect()).map_err(r2e)?; + return Ok(RocksEngine::new(db)); } + + // Opens db. + let cfds = cf_opts.into_iter().collect(); + db_opt.create_missing_column_families(true); + let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; + + // Drops discarded column families. + // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == x).is_none()) { + for cf in cfs_diff(&existed, &needed) { + // Never drop default column families. 
+ if cf != CF_DEFAULT { + db.drop_cf(cf).map_err(r2e)?; + } + } + + Ok(RocksEngine::new(db)) } -pub fn new_engine( - path: &str, - db_opts: Option, - cfs: &[&str], - opts: Option>>, -) -> Result { - let db_opts = db_opts.map(RocksDBOptions::into_raw); - let opts = opts.map(|o| o.into_iter().map(RocksCFOptions::into_raw).collect()); - let engine = new_engine_raw(path, db_opts, cfs, opts).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) +/// Turns "dynamic level size" off for the existing column family which was off before. +/// Column families are small, HashMap isn't necessary. +fn adjust_dynamic_level_bytes( + cf_descs: &[CColumnFamilyDescriptor], + name: &str, + opt: &mut ColumnFamilyOptions, +) { + if let Some(cf_desc) = cf_descs.iter().find(|cf_desc| cf_desc.name() == name) { + let existed_dynamic_level_bytes = + cf_desc.options().get_level_compaction_dynamic_level_bytes(); + if existed_dynamic_level_bytes != opt.get_level_compaction_dynamic_level_bytes() { + warn!( + "change dynamic_level_bytes for existing column family is danger"; + "old_value" => existed_dynamic_level_bytes, + "new_value" => opt.get_level_compaction_dynamic_level_bytes(), + ); + } + opt.set_level_compaction_dynamic_level_bytes(existed_dynamic_level_bytes); + } } -pub fn new_engine_opt( - path: &str, - db_opt: RocksDBOptions, - cfs_opts: Vec>, -) -> Result { - let db_opt = db_opt.into_raw(); - let cfs_opts = cfs_opts.into_iter().map(RocksCFOptions::into_raw).collect(); - let engine = - new_engine_opt_raw(path, db_opt, cfs_opts).map_err(|e| Error::Other(box_err!(e)))?; - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); - Ok(engine) +pub fn db_exist(path: &str) -> bool { + let path = Path::new(path); + if !path.exists() || !path.is_dir() { + return false; + } + let current_file_path = path.join("CURRENT"); + if !current_file_path.exists() || !current_file_path.is_file() { + return 
false; + } + + // If path is not an empty directory, and current file exists, we say db exists. If path is not an empty directory + // but db has not been created, `DB::list_column_families` fails and we can clean up + // the directory by this indication. + fs::read_dir(&path).unwrap().next().is_some() +} + +/// Returns a Vec of cf which is in `a' but not in `b'. +fn cfs_diff<'a>(a: &[&'a str], b: &[&str]) -> Vec<&'a str> { + a.iter() + .filter(|x| !b.iter().any(|y| *x == y)) + .cloned() + .collect() } pub fn get_cf_handle<'a>(db: &'a DB, cf: &str) -> Result<&'a CFHandle> { @@ -223,3 +297,123 @@ impl SliceTransform for NoopSliceTransform { true } } + +pub fn to_raw_perf_level(level: engine_traits::PerfLevel) -> rocksdb::PerfLevel { + match level { + engine_traits::PerfLevel::Uninitialized => rocksdb::PerfLevel::Uninitialized, + engine_traits::PerfLevel::Disable => rocksdb::PerfLevel::Disable, + engine_traits::PerfLevel::EnableCount => rocksdb::PerfLevel::EnableCount, + engine_traits::PerfLevel::EnableTimeExceptForMutex => { + rocksdb::PerfLevel::EnableTimeExceptForMutex + } + engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { + rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex + } + engine_traits::PerfLevel::EnableTime => rocksdb::PerfLevel::EnableTime, + engine_traits::PerfLevel::OutOfBounds => rocksdb::PerfLevel::OutOfBounds, + } +} + +pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLevel { + match level { + rocksdb::PerfLevel::Uninitialized => engine_traits::PerfLevel::Uninitialized, + rocksdb::PerfLevel::Disable => engine_traits::PerfLevel::Disable, + rocksdb::PerfLevel::EnableCount => engine_traits::PerfLevel::EnableCount, + rocksdb::PerfLevel::EnableTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeExceptForMutex + } + rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex + } + rocksdb::PerfLevel::EnableTime => 
engine_traits::PerfLevel::EnableTime, + rocksdb::PerfLevel::OutOfBounds => engine_traits::PerfLevel::OutOfBounds, + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{CFOptionsExt, CF_DEFAULT}; + use rocksdb::DB; + use tempfile::Builder; + + use super::*; + + #[test] + fn test_cfs_diff() { + let a = vec!["1", "2", "3"]; + let a_diff_a = cfs_diff(&a, &a); + assert!(a_diff_a.is_empty()); + let b = vec!["4"]; + assert_eq!(a, cfs_diff(&a, &b)); + let c = vec!["4", "5", "3", "6"]; + assert_eq!(vec!["1", "2"], cfs_diff(&a, &c)); + assert_eq!(vec!["4", "5", "6"], cfs_diff(&c, &a)); + let d = vec!["1", "2", "3", "4"]; + let a_diff_d = cfs_diff(&a, &d); + assert!(a_diff_d.is_empty()); + assert_eq!(vec!["4"], cfs_diff(&d, &a)); + } + + #[test] + fn test_new_engine_opt() { + let path = Builder::new() + .prefix("_util_rocksdb_test_check_column_families") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + // create db when db not exist + let mut cfs_opts = vec![(CF_DEFAULT, RocksCfOptions::default())]; + let mut opts = RocksCfOptions::default(); + opts.set_level_compaction_dynamic_level_bytes(true); + cfs_opts.push(("cf_dynamic_level_bytes", opts.clone())); + let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); + check_dynamic_level_bytes(&db); + drop(db); + + // add cf1. + let cfs_opts = vec![ + (CF_DEFAULT, opts.clone()), + ("cf_dynamic_level_bytes", opts.clone()), + ("cf1", opts), + ]; + let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + drop(db); + + // drop cf1. + let cfs = vec![CF_DEFAULT, "cf_dynamic_level_bytes"]; + let db = new_engine(path_str, &cfs).unwrap(); + column_families_must_eq(path_str, cfs); + check_dynamic_level_bytes(&db); + drop(db); + + // drop all cfs. 
+ new_engine(path_str, &[CF_DEFAULT]).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT]); + + // not specifying default cf should error. + new_engine(path_str, &[]).unwrap_err(); + column_families_must_eq(path_str, vec![CF_DEFAULT]); + } + + fn column_families_must_eq(path: &str, excepted: Vec<&str>) { + let opts = RocksDBOptions::default(); + let cfs_list = DB::list_column_families(&opts, path).unwrap(); + + let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); + let mut cfs_excepted: Vec<&str> = excepted.clone(); + cfs_existed.sort_unstable(); + cfs_excepted.sort_unstable(); + assert_eq!(cfs_existed, cfs_excepted); + } + + fn check_dynamic_level_bytes(db: &RocksEngine) { + let tmp_cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); + assert!(!tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); + let tmp_cf_opts = db.get_options_cf("cf_dynamic_level_bytes").unwrap(); + assert!(tmp_cf_opts.get_level_compaction_dynamic_level_bytes()); + } +} diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index f09761802e6..892dd83321c 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -219,7 +219,7 @@ impl Mutable for RocksWriteBatchVec { #[cfg(test)] mod tests { - use engine_traits::{Peekable, WriteBatch}; + use engine_traits::{Peekable, WriteBatch, CF_DEFAULT}; use rocksdb::DBOptions as RawDBOptions; use tempfile::Builder; @@ -227,6 +227,7 @@ mod tests { super::{util::new_engine_opt, RocksDBOptions}, *, }; + use crate::RocksCfOptions; #[test] fn test_should_write_to_engine_with_pipeline_write_mode() { @@ -241,7 +242,7 @@ mod tests { let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), RocksDBOptions::from_raw(opt), - vec![], + vec![(CF_DEFAULT, RocksCfOptions::default())], ) .unwrap(); assert!( @@ -287,7 +288,7 @@ mod tests { let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), 
RocksDBOptions::from_raw(opt), - vec![], + vec![(CF_DEFAULT, RocksCfOptions::default())], ) .unwrap(); assert!( diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 74a0e8de47c..77133f09cbd 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -9,6 +9,7 @@ failpoints = ["fail/failpoints"] [dependencies] engine_rocks = { path = "../engine_rocks", default-features = false } +engine_traits = { path = "../engine_traits" } fail = "0.5" futures = "0.3" keys = { path = "../keys", default-features = false } diff --git a/components/engine_rocks_helper/src/sst_recovery.rs b/components/engine_rocks_helper/src/sst_recovery.rs index e7c1bae3a1c..bfd39e951b2 100644 --- a/components/engine_rocks_helper/src/sst_recovery.rs +++ b/components/engine_rocks_helper/src/sst_recovery.rs @@ -6,7 +6,7 @@ use std::{ time::{Duration, Instant}, }; -use engine_rocks::raw::*; +use engine_rocks::RocksEngine; use fail::fail_point; use raftstore::store::fsm::StoreMeta; use tikv_util::{self, set_panic_mark, warn, worker::*}; @@ -17,7 +17,7 @@ pub const DEFAULT_CHECK_INTERVAL: Duration = Duration::from_secs(10); const MAX_DAMAGED_FILES_NUM: usize = 2; pub struct RecoveryRunner { - db: Arc, + db: RocksEngine, store_meta: Arc>, // Considering that files will not be too much, it is enough to use `Vec`. damaged_files: Vec, @@ -68,7 +68,7 @@ impl RunnableWithTimer for RecoveryRunner { impl RecoveryRunner { pub fn new( - db: Arc, + db: RocksEngine, store_meta: Arc>, max_hang_duration: Duration, check_duration: Duration, @@ -87,7 +87,7 @@ impl RecoveryRunner { return; } - let live_files = self.db.get_live_files(); + let live_files = self.db.as_inner().get_live_files(); for i in 0..live_files.get_files_count() { if path == live_files.get_name(i as i32) { let f = FileInfo { @@ -167,6 +167,7 @@ impl RecoveryRunner { // file with the same largest key will be skipped. 
// Here store meta lock should be held to prevent peers from being added back. self.db + .as_inner() .delete_files_in_range(&file.smallest_key, &file.largest_key, true) .unwrap(); self.must_file_not_exist(&file.name); @@ -192,7 +193,7 @@ impl RecoveryRunner { } fn must_file_not_exist(&self, fname: &str) { - let live_files = self.db.get_live_files(); + let live_files = self.db.as_inner().get_live_files(); for i in 0..live_files.get_files_count() { if live_files.get_name(i as i32) == fname { // `delete_files_in_range` can't delete L0 files. @@ -206,7 +207,8 @@ impl RecoveryRunner { mod tests { use std::{collections::BTreeMap, sync::Arc}; - use engine_rocks::raw_util; + use engine_rocks::util; + use engine_traits::{CompactExt, SyncMutable, CF_DEFAULT}; use kvproto::metapb::{Peer, Region}; use tempfile::Builder; @@ -218,16 +220,14 @@ mod tests { .prefix("test_sst_recovery_runner") .tempdir() .unwrap(); - let db = Arc::new( - raw_util::new_engine(path.path().to_str().unwrap(), None, &["cf"], None).unwrap(), - ); + let db = util::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, "cf"]).unwrap(); db.put(b"z2", b"val").unwrap(); db.put(b"z7", b"val").unwrap(); // generate SST file. 
- db.compact_range(None, None); + db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); - let files = db.get_live_files(); + let files = db.as_inner().get_live_files(); assert_eq!(files.get_smallestkey(0), b"z2"); assert_eq!(files.get_largestkey(0), b"z7"); diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index d6633139122..20645823fd8 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -95,21 +95,16 @@ pub mod kv { }; use tikv_util::box_err; - use crate::ctor::{CFOptions, DBOptions, KvEngineConstructorExt}; + use crate::ctor::{ColumnFamilyOptions as KvTestCFOptions, DBOptions, KvEngineConstructorExt}; - pub fn new_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>, - ) -> Result { - KvTestEngine::new_kv_engine(path, db_opt, cfs, opts) + pub fn new_engine(path: &str, cfs: &[&str]) -> Result { + KvTestEngine::new_kv_engine(path, cfs) } pub fn new_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec, + cfs_opts: Vec<(&str, KvTestCFOptions)>, ) -> Result { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } @@ -119,24 +114,21 @@ pub mod kv { #[derive(Clone)] pub struct TestTabletFactory { root_path: String, - db_opt: Option, - cfs: Vec, - opts: Option>, + db_opt: DBOptions, + cf_opts: Vec<(&'static str, KvTestCFOptions)>, registry: Arc>>, } impl TestTabletFactory { pub fn new( root_path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>, + db_opt: DBOptions, + cf_opts: Vec<(&'static str, KvTestCFOptions)>, ) -> Self { Self { root_path: root_path.to_string(), db_opt, - cfs: cfs.iter().map(|s| s.to_string()).collect(), - opts, + cf_opts, registry: Arc::new(Mutex::new(HashMap::default())), } } @@ -165,13 +157,10 @@ pub mod kv { } let tablet_path = self.tablet_path(id, suffix); let tablet_path = tablet_path.to_str().unwrap(); - let mut cfs = vec![]; - self.cfs.iter().for_each(|s| cfs.push(s.as_str())); - let kv_engine = KvTestEngine::new_kv_engine( + let 
kv_engine = KvTestEngine::new_kv_engine_opt( tablet_path, self.db_opt.clone(), - cfs.as_slice(), - self.opts.clone(), + self.cf_opts.clone(), )?; reg.insert((id, suffix), kv_engine.clone()); Ok(kv_engine) @@ -343,12 +332,7 @@ pub mod ctor { /// /// The engine stores its data in the `path` directory. /// If that directory does not exist, then it is created. - fn new_kv_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>, - ) -> Result; + fn new_kv_engine(path: &str, cfs: &[&str]) -> Result; /// Create a new engine with specified column families and options /// @@ -357,7 +341,7 @@ pub mod ctor { fn new_kv_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec, + cf_opts: Vec<(&str, ColumnFamilyOptions)>, ) -> Result; } @@ -390,21 +374,6 @@ pub mod ctor { pub type RaftDBOptions = DBOptions; - #[derive(Clone)] - pub struct CFOptions { - pub cf: String, - pub options: ColumnFamilyOptions, - } - - impl CFOptions { - pub fn new(cf: &str, options: ColumnFamilyOptions) -> CFOptions { - CFOptions { - cf: cf.to_string(), - options, - } - } - } - /// Properties for a single column family /// /// All engines must emulate column families, but at present it is not clear @@ -501,22 +470,19 @@ pub mod ctor { use engine_panic::PanicEngine; use engine_traits::Result; - use super::{CFOptions, DBOptions, KvEngineConstructorExt, RaftEngineConstructorExt}; + use super::{ + ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftEngineConstructorExt, + }; impl KvEngineConstructorExt for engine_panic::PanicEngine { - fn new_kv_engine( - _path: &str, - _db_opt: Option, - _cfs: &[&str], - _opts: Option>, - ) -> Result { + fn new_kv_engine(_path: &str, _cfs: &[&str]) -> Result { Ok(PanicEngine) } fn new_kv_engine_opt( _path: &str, _db_opt: DBOptions, - _cfs_opts: Vec, + _cfs_opts: Vec<(&str, ColumnFamilyOptions)>, ) -> Result { Ok(PanicEngine) } @@ -533,71 +499,38 @@ pub mod ctor { use engine_rocks::{ get_env, properties::{MvccPropertiesCollectorFactory, 
RangePropertiesCollectorFactory}, - raw::{ - ColumnFamilyOptions as RawRocksColumnFamilyOptions, DBOptions as RawRocksDBOptions, - }, - util::{ - new_engine as rocks_new_engine, new_engine_opt as rocks_new_engine_opt, - RocksCFOptions, - }, - RocksColumnFamilyOptions, RocksDBOptions, + util::new_engine_opt as rocks_new_engine_opt, + RocksCfOptions, RocksDBOptions, }; - use engine_traits::{ColumnFamilyOptions as ColumnFamilyOptionsTrait, Result}; + use engine_traits::{ColumnFamilyOptions as ColumnFamilyOptionsTrait, Result, CF_DEFAULT}; use super::{ - CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions, + ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions, RaftEngineConstructorExt, }; impl KvEngineConstructorExt for engine_rocks::RocksEngine { - // FIXME this is duplicating behavior from engine_rocks::raw_util in order to + // FIXME this is duplicating behavior from engine_rocks::util in order to // call set_standard_cf_opts. - fn new_kv_engine( - path: &str, - db_opt: Option, - cfs: &[&str], - opts: Option>, - ) -> Result { - let rocks_db_opts = match db_opt { - Some(db_opt) => Some(get_rocks_db_opts(db_opt)?), - None => None, - }; - let cfs_opts = match opts { - Some(opts) => opts, - None => { - let mut default_cfs_opts = Vec::with_capacity(cfs.len()); - for cf in cfs { - default_cfs_opts.push(CFOptions::new(*cf, ColumnFamilyOptions::new())); - } - default_cfs_opts - } - }; - let rocks_cfs_opts = cfs_opts + fn new_kv_engine(path: &str, cfs: &[&str]) -> Result { + let rocks_db_opt = RocksDBOptions::default(); + let default_cf_opt = ColumnFamilyOptions::new(); + let rocks_cfs_opts = cfs .iter() - .map(|cf_opts| { - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts) - }) + .map(|cf_name| (*cf_name, get_rocks_cf_opts(&default_cf_opt))) 
.collect(); - rocks_new_engine(path, rocks_db_opts, &[], Some(rocks_cfs_opts)) + rocks_new_engine_opt(path, rocks_db_opt, rocks_cfs_opts) } fn new_kv_engine_opt( path: &str, db_opt: DBOptions, - cfs_opts: Vec, + cfs_opts: Vec<(&str, ColumnFamilyOptions)>, ) -> Result { let rocks_db_opts = get_rocks_db_opts(db_opt)?; let rocks_cfs_opts = cfs_opts .iter() - .map(|cf_opts| { - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts) - }) + .map(|(name, opt)| (*name, get_rocks_cf_opts(opt))) .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) } @@ -606,22 +539,17 @@ pub mod ctor { impl RaftEngineConstructorExt for engine_rocks::RocksEngine { fn new_raft_engine(path: &str, db_opt: Option) -> Result { let rocks_db_opts = match db_opt { - Some(db_opt) => Some(get_rocks_db_opts(db_opt)?), - None => None, + Some(db_opt) => get_rocks_db_opts(db_opt)?, + None => RocksDBOptions::default(), }; - let cf_opts = CFOptions::new(engine_traits::CF_DEFAULT, ColumnFamilyOptions::new()); - let mut rocks_cf_opts = RocksColumnFamilyOptions::new(); - set_standard_cf_opts(rocks_cf_opts.as_raw_mut(), &cf_opts.options); - set_cf_opts(&mut rocks_cf_opts, &cf_opts.options); - let default_cfs_opts = vec![RocksCFOptions::new(&cf_opts.cf, rocks_cf_opts)]; - rocks_new_engine(path, rocks_db_opts, &[], Some(default_cfs_opts)) + let rocks_cf_opts = get_rocks_cf_opts(&ColumnFamilyOptions::new()); + let default_cfs_opts = vec![(CF_DEFAULT, rocks_cf_opts)]; + rocks_new_engine_opt(path, rocks_db_opts, default_cfs_opts) } } - fn set_standard_cf_opts( - rocks_cf_opts: &mut RawRocksColumnFamilyOptions, - cf_opts: &ColumnFamilyOptions, - ) { + fn get_rocks_cf_opts(cf_opts: &ColumnFamilyOptions) -> RocksCfOptions { + let mut rocks_cf_opts = RocksCfOptions::new(); if !cf_opts.get_no_range_properties() { 
rocks_cf_opts.add_table_properties_collector_factory( "tikv.range-properties-collector", @@ -634,27 +562,21 @@ pub mod ctor { MvccPropertiesCollectorFactory::default(), ); } - } - fn set_cf_opts( - rocks_cf_opts: &mut RocksColumnFamilyOptions, - cf_opts: &ColumnFamilyOptions, - ) { if let Some(trigger) = cf_opts.get_level_zero_file_num_compaction_trigger() { rocks_cf_opts.set_level_zero_file_num_compaction_trigger(trigger); } if let Some(trigger) = cf_opts.get_level_zero_slowdown_writes_trigger() { - rocks_cf_opts - .as_raw_mut() - .set_level_zero_slowdown_writes_trigger(trigger); + rocks_cf_opts.set_level_zero_slowdown_writes_trigger(trigger); } if cf_opts.get_disable_auto_compactions() { rocks_cf_opts.set_disable_auto_compactions(true); } + rocks_cf_opts } fn get_rocks_db_opts(db_opts: DBOptions) -> Result { - let mut rocks_db_opts = RawRocksDBOptions::new(); + let mut rocks_db_opts = RocksDBOptions::default(); let env = get_env(db_opts.key_manager.clone(), db_opts.rate_limiter)?; rocks_db_opts.set_env(env); if db_opts.enable_multi_batch_write { @@ -662,7 +584,6 @@ pub mod ctor { rocks_db_opts.enable_pipelined_write(false); rocks_db_opts.enable_multi_batch_write(true); } - let rocks_db_opts = RocksDBOptions::from_raw(rocks_db_opts); Ok(rocks_db_opts) } } @@ -695,13 +616,7 @@ pub fn new_temp_engine( ) -> engine_traits::Engines { let raft_path = path.path().join(std::path::Path::new("raft")); engine_traits::Engines::new( - crate::kv::new_engine( - path.path().to_str().unwrap(), - None, - engine_traits::ALL_CFS, - None, - ) - .unwrap(), + crate::kv::new_engine(path.path().to_str().unwrap(), engine_traits::ALL_CFS).unwrap(), crate::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(), ) } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index c5b09fe59e1..6ba3da2b3d9 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -239,10 +239,6 @@ //! 
it in engine_traits and engine_rocks, replacing all the callers with calls //! into the traits, then delete the versions in the `engine` crate. //! -//! - Use the .c() method from engine_rocks::compat::Compat to get a -//! KvEngine reference from Arc in the fewest characters. It also -//! works on Snapshot, and can be adapted to other types. -//! //! - Use `IntoOther` to adapt between error types of dependencies that are not //! themselves interdependent. E.g. raft::Error can be created from //! engine_traits::Error even though neither `raft` tor `engine_traits` know diff --git a/components/engine_traits_tests/src/basic_read_write.rs b/components/engine_traits_tests/src/basic_read_write.rs index d5104ba57e3..38a1921dd85 100644 --- a/components/engine_traits_tests/src/basic_read_write.rs +++ b/components/engine_traits_tests/src/basic_read_write.rs @@ -2,7 +2,7 @@ //! Reading and writing -use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT, CF_WRITE}; +use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use super::engine_cfs; @@ -17,16 +17,3 @@ fn non_cf_methods_are_default_cf() { let value = value.expect("value"); assert_eq!(b"bar", &*value); } - -// CF_DEFAULT always exists -#[test] -fn non_cf_methods_implicit_default_cf() { - let db = engine_cfs(&[CF_WRITE]); - db.engine.put(b"foo", b"bar").unwrap(); - let value = db.engine.get_value(b"foo").unwrap(); - let value = value.expect("value"); - assert_eq!(b"bar", &*value); - let value = db.engine.get_value_cf(CF_DEFAULT, b"foo").unwrap(); - let value = value.expect("value"); - assert_eq!(b"bar", &*value); -} diff --git a/components/engine_traits_tests/src/cf_names.rs b/components/engine_traits_tests/src/cf_names.rs index 187df39a081..48031275b14 100644 --- a/components/engine_traits_tests/src/cf_names.rs +++ b/components/engine_traits_tests/src/cf_names.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{CFNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT, CF_WRITE}; +use engine_traits::{CFNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT}; use super::{default_engine, engine_cfs}; @@ -22,14 +22,6 @@ fn cf_names() { } } -#[test] -fn implicit_default_cf() { - let db = engine_cfs(&[CF_WRITE]); - let names = db.engine.cf_names(); - assert_eq!(names.len(), 2); - assert!(names.contains(&CF_DEFAULT)); -} - #[test] fn default_names_snapshot() { let db = default_engine(); @@ -49,12 +41,3 @@ fn cf_names_snapshot() { assert!(names.contains(cf)); } } - -#[test] -fn implicit_default_cf_snapshot() { - let db = engine_cfs(&[CF_WRITE]); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), 2); - assert!(names.contains(&CF_DEFAULT)); -} diff --git a/components/engine_traits_tests/src/ctor.rs b/components/engine_traits_tests/src/ctor.rs index b3338a46367..5f39ad4f3a7 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -5,7 +5,7 @@ use std::fs; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt}, + ctor::{ColumnFamilyOptions, DBOptions, KvEngineConstructorExt}, kv::KvTestEngine, }; use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; @@ -16,7 +16,7 @@ use super::tempdir; fn new_engine_basic() { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let _db = KvTestEngine::new_kv_engine(path, None, ALL_CFS, None).unwrap(); + let _db = KvTestEngine::new_kv_engine(path, ALL_CFS).unwrap(); } #[test] @@ -26,7 +26,7 @@ fn new_engine_opt_basic() { let db_opts = DBOptions::default(); let cf_opts = ALL_CFS .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) + .map(|cf| (*cf, ColumnFamilyOptions::new())) .collect(); let _db = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts).unwrap(); } @@ -37,7 +37,7 @@ fn new_engine_missing_dir() { let dir = tempdir(); let path = dir.path(); let path = 
path.join("missing").to_str().unwrap().to_owned(); - let db = KvTestEngine::new_kv_engine(&path, None, ALL_CFS, None).unwrap(); + let db = KvTestEngine::new_kv_engine(&path, ALL_CFS).unwrap(); db.put(b"foo", b"bar").unwrap(); db.sync().unwrap(); } @@ -50,7 +50,7 @@ fn new_engine_opt_missing_dir() { let db_opts = DBOptions::default(); let cf_opts = ALL_CFS .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) + .map(|cf| (*cf, ColumnFamilyOptions::new())) .collect(); let db = KvTestEngine::new_kv_engine_opt(&path, db_opts, cf_opts).unwrap(); db.put(b"foo", b"bar").unwrap(); @@ -71,7 +71,7 @@ fn new_engine_readonly_dir() { fs::set_permissions(&path, perms).unwrap(); let path = path.to_str().unwrap(); - let err = KvTestEngine::new_kv_engine(path, None, ALL_CFS, None); + let err = KvTestEngine::new_kv_engine(path, ALL_CFS); assert!(err.is_err()); } @@ -93,7 +93,7 @@ fn new_engine_opt_readonly_dir() { let db_opts = DBOptions::default(); let cf_opts = ALL_CFS .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) + .map(|cf| (*cf, ColumnFamilyOptions::new())) .collect(); let err = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts); diff --git a/components/engine_traits_tests/src/lib.rs b/components/engine_traits_tests/src/lib.rs index 0ddb39c61ac..73c741ff925 100644 --- a/components/engine_traits_tests/src/lib.rs +++ b/components/engine_traits_tests/src/lib.rs @@ -64,7 +64,7 @@ fn default_engine() -> TempDirEnginePair { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let engine = KvTestEngine::new_kv_engine(path, None, &[CF_DEFAULT], None).unwrap(); + let engine = KvTestEngine::new_kv_engine(path, &[CF_DEFAULT]).unwrap(); TempDirEnginePair { engine, tempdir: dir, @@ -74,7 +74,10 @@ fn default_engine() -> TempDirEnginePair { /// Create a multi batch write engine with only CF_DEFAULT fn multi_batch_write_engine() -> TempDirEnginePair { use engine_test::{ - ctor::{DBOptions as KvTestDBOptions, KvEngineConstructorExt}, + 
ctor::{ + ColumnFamilyOptions as KvTestCFOptions, DBOptions as KvTestDBOptions, + KvEngineConstructorExt, + }, kv::KvTestEngine, }; use engine_traits::CF_DEFAULT; @@ -83,7 +86,9 @@ fn multi_batch_write_engine() -> TempDirEnginePair { let path = dir.path().to_str().unwrap(); let mut opt = KvTestDBOptions::default(); opt.set_enable_multi_batch_write(true); - let engine = KvTestEngine::new_kv_engine(path, Some(opt), &[CF_DEFAULT], None).unwrap(); + let engine = + KvTestEngine::new_kv_engine_opt(path, opt, vec![(CF_DEFAULT, KvTestCFOptions::new())]) + .unwrap(); TempDirEnginePair { engine, tempdir: dir, @@ -96,7 +101,7 @@ fn engine_cfs(cfs: &[&str]) -> TempDirEnginePair { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let engine = KvTestEngine::new_kv_engine(path, None, cfs, None).unwrap(); + let engine = KvTestEngine::new_kv_engine(path, cfs).unwrap(); TempDirEnginePair { engine, tempdir: dir, diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index f6d207df875..57472b5cecf 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -125,7 +125,7 @@ pub fn get_region_approximate_middle( mod tests { use std::{iter, sync::mpsc}; - use engine_test::ctor::{CFOptions, ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; use engine_traits::{MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -148,15 +148,7 @@ mod tests { fn test_split_check() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + 
let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -201,15 +193,7 @@ mod tests { fn test_split_check_with_key_range() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -273,15 +257,7 @@ mod tests { fn test_generate_region_bucket_impl(mvcc: bool) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -405,15 +381,7 @@ mod tests { fn test_generate_region_bucket_with_deleting_data() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| { - let cf_opts = ColumnFamilyOptions::new(); - CFOptions::new(cf, cf_opts) - }) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -520,10 +488,7 @@ mod tests { let db_opts = DBOptions::default(); let mut cf_opts = 
ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let mut big_value = Vec::with_capacity(256); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 22a81e54f31..8c0d7aad86c 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -230,7 +230,7 @@ pub fn get_region_approximate_keys( mod tests { use std::{cmp, sync::mpsc, u64}; - use engine_test::ctor::{CFOptions, ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; use engine_traits::{KvEngine, MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, CF_WRITE, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -290,13 +290,7 @@ mod tests { fn test_split_check() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -400,13 +394,7 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = 
engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -466,10 +454,7 @@ mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = [("a", 1024), ("b", 2048), ("c", 4096)]; @@ -575,13 +560,7 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ColumnFamilyOptions::new(); - let cfs_opts = ALL_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); - let engine = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); + let engine = engine_test::kv::new_engine(path_str, ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -655,10 +634,7 @@ mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); // size >= 4194304 will insert a new point in range properties diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 30198cd2337..352e956d43e 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -259,7 +259,7 @@ pub mod tests { use collections::HashSet; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, 
DBOptions}, + ctor::{ColumnFamilyOptions, DBOptions}, kv::KvTestEngine, }; use engine_traits::{ @@ -446,9 +446,9 @@ pub mod tests { .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - CFOptions::new(cf, ColumnFamilyOptions::new()) + (*cf, ColumnFamilyOptions::new()) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -576,9 +576,9 @@ pub mod tests { if cfs_with_range_prop.contains(cf) { let mut opt = ColumnFamilyOptions::new(); opt.set_disable_auto_compactions(true); - CFOptions::new(cf, opt) + (*cf, opt) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -713,9 +713,9 @@ pub mod tests { if cfs_with_range_prop.contains(cf) { let mut opt = ColumnFamilyOptions::new(); opt.set_disable_auto_compactions(true); - CFOptions::new(cf, opt) + (*cf, opt) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -768,9 +768,9 @@ pub mod tests { .iter() .map(|cf| { if cf != &CF_LOCK { - CFOptions::new(cf, ColumnFamilyOptions::new()) + (*cf, ColumnFamilyOptions::new()) } else { - CFOptions::new(cf, cf_opt.clone()) + (*cf, cf_opt.clone()) } }) .collect(); @@ -829,7 +829,7 @@ pub mod tests { .map(|cf| { let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_no_range_properties(true); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); let engine = @@ -911,10 +911,7 @@ pub mod tests { cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.set_no_range_properties(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let region = make_region(1, vec![], vec![]); @@ -947,10 +944,7 @@ pub mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let 
cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); let mut big_value = Vec::with_capacity(256); @@ -1062,10 +1056,7 @@ pub mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let cases = [("a", 1024), ("b", 2048), ("c", 4096)]; @@ -1095,10 +1086,7 @@ pub mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let mut cf_size = 0; @@ -1133,10 +1121,7 @@ pub mod tests { let db_opts = DBOptions::default(); let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_disable_auto_compactions(true); - let cfs_opts = LARGE_CFS - .iter() - .map(|cf| CFOptions::new(cf, cf_opts.clone())) - .collect(); + let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); let mut cf_size = 0; diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index e377d4b550a..df2fa0fb7c6 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -256,7 +256,7 @@ mod tests 
{ .prefix("test_last_key_of_region") .tempdir() .unwrap(); - let engine = new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); @@ -309,7 +309,7 @@ mod tests { .prefix("test_table_check_observer") .tempdir() .unwrap(); - let engine = new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let engine = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.set_id(1); diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index 561425d9d00..e1c90a177c7 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -136,13 +136,9 @@ mod tests { fn test_bootstrap() { let path = Builder::new().prefix("var").tempdir().unwrap(); let raft_path = path.path().join("raft"); - let kv_engine = engine_test::kv::new_engine( - path.path().to_str().unwrap(), - None, - &[CF_DEFAULT, CF_RAFT], - None, - ) - .unwrap(); + let kv_engine = + engine_test::kv::new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, CF_RAFT]) + .unwrap(); let raft_engine = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_engine.clone(), raft_engine.clone()); let region = initial_region(1, 1, 1); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index e7a59631ca1..4fb4c7feb7a 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -195,12 +195,12 @@ impl SstPartitioner for CompactionGuardGenerator

{ #[cfg(test)] mod tests { - use std::{str, sync::Arc}; + use std::str; use engine_rocks::{ - raw::{BlockBasedOptions, ColumnFamilyOptions, DBCompressionType, DBOptions}, - raw_util::{new_engine_opt, CFOptions}, - RocksEngine, RocksSstPartitionerFactory, RocksSstReader, + raw::{BlockBasedOptions, DBCompressionType}, + util::new_engine_opt, + RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, }; use engine_traits::{CompactExt, Iterator, MiscExt, SstReader, SyncMutable, CF_DEFAULT}; use keys::DATA_PREFIX_KEY; @@ -367,7 +367,7 @@ mod tests { fn new_test_db(provider: MockRegionInfoProvider) -> (RocksEngine, TempDir) { let temp_dir = TempDir::new().unwrap(); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_target_file_size_base(MAX_OUTPUT_FILE_SIZE); cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory( CompactionGuardGeneratorFactory::new(CF_DEFAULT, provider, MIN_OUTPUT_FILE_SIZE) @@ -389,14 +389,12 @@ mod tests { block_based_opts.set_block_size(100); cf_opts.set_block_based_table_factory(&block_based_opts); - let db = RocksEngine::from_db(Arc::new( - new_engine_opt( - temp_dir.path().to_str().unwrap(), - DBOptions::new(), - vec![CFOptions::new(CF_DEFAULT, cf_opts)], - ) - .unwrap(), - )); + let db = new_engine_opt( + temp_dir.path().to_str().unwrap(), + RocksDBOptions::default(), + vec![(CF_DEFAULT, cf_opts)], + ) + .unwrap(); (db, temp_dir) } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 98d12303b19..ab73c0bc8c6 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -4370,13 +4370,7 @@ mod tests { pub fn create_tmp_engine(path: &str) -> (TempDir, KvTestEngine) { let path = Builder::new().prefix(path).tempdir().unwrap(); - let engine = new_engine( - path.path().join("db").to_str().unwrap(), - None, - ALL_CFS, - None, - ) - .unwrap(); + let engine 
= new_engine(path.path().join("db").to_str().unwrap(), ALL_CFS).unwrap(); (path, engine) } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 8301c75e7c3..76bb95b0d39 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1285,8 +1285,7 @@ pub mod tests { raftlog_fetch_scheduler: Scheduler, path: &TempDir, ) -> PeerStorage { - let kv_db = engine_test::kv::new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None) - .unwrap(); + let kv_db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let raft_path = path.path().join(Path::new("raft")); let raft_db = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_db, raft_db); @@ -2087,8 +2086,7 @@ pub mod tests { let region_sched = region_worker.scheduler(); let raftlog_fetch_worker = LazyWorker::new("raftlog-fetch-worker"); let raftlog_fetch_sched = raftlog_fetch_worker.scheduler(); - let kv_db = - engine_test::kv::new_engine(td.path().to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let kv_db = engine_test::kv::new_engine(td.path().to_str().unwrap(), ALL_CFS).unwrap(); let raft_path = td.path().join(Path::new("raft")); let raft_db = engine_test::raft::new_engine(raft_path.to_str().unwrap(), None).unwrap(); let engines = Engines::new(kv_db, raft_db); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index cca1dfbda77..6a8aa5ca3bf 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1883,7 +1883,7 @@ pub mod tests { use encryption::{DataKeyManager, EncryptionConfig, FileConfig, MasterKeyConfig}; use encryption_export::data_key_manager_from_config; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions}, + ctor::{ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, 
RaftDBOptions}, kv::KvTestEngine, raft::RaftTestEngine, }; @@ -1917,32 +1917,41 @@ pub mod tests { const TEST_META_FILE_BUFFER_SIZE: usize = 1000; const BYTE_SIZE: usize = 1; - type DBBuilder = - fn(p: &Path, db_opt: Option, cf_opts: Option>) -> Result; + type DBBuilder = fn( + p: &Path, + db_opt: Option, + cf_opts: Option>, + ) -> Result; pub fn open_test_empty_db( path: &Path, db_opt: Option, - cf_opts: Option>, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db_opt = db_opt.unwrap_or_default(); + let cf_opts = cf_opts.unwrap_or_else(|| { + ALL_CFS + .iter() + .map(|cf| (*cf, ColumnFamilyOptions::default())) + .collect() + }); + let db = E::new_kv_engine_opt(p, db_opt, cf_opts).unwrap(); Ok(db) } pub fn open_test_db( path: &Path, db_opt: Option, - cf_opts: Option>, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { - let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db = open_test_empty_db::(path, db_opt, cf_opts).unwrap(); let key = keys::data_key(TEST_KEY); // write some data into each cf for (i, cf) in db.cf_names().into_iter().enumerate() { @@ -1957,13 +1966,12 @@ pub mod tests { pub fn open_test_db_with_100keys( path: &Path, db_opt: Option, - cf_opts: Option>, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, { - let p = path.to_str().unwrap(); - let db = E::new_kv_engine(p, db_opt, ALL_CFS, cf_opts).unwrap(); + let db = open_test_empty_db::(path, db_opt, cf_opts).unwrap(); // write some data into each cf for (i, cf) in db.cf_names().into_iter().enumerate() { let mut p = Peer::default(); @@ -1981,7 +1989,7 @@ pub mod tests { path: &TempDir, raft_db_opt: Option, kv_db_opt: Option, - kv_cf_opts: Option>, + kv_cf_opts: Option>, regions: &[u64], ) -> Result> { let p = path.path(); @@ -2267,7 +2275,7 @@ pub mod tests { let 
dst_db_path = dst_db_dir.path().to_str().unwrap(); // Change arbitrarily the cf order of ALL_CFS at destination db. let dst_cfs = [CF_WRITE, CF_DEFAULT, CF_LOCK, CF_RAFT]; - let dst_db = engine_test::kv::new_engine(dst_db_path, None, &dst_cfs, None).unwrap(); + let dst_db = engine_test::kv::new_engine(dst_db_path, &dst_cfs).unwrap(); let options = ApplyOptions { db: dst_db.clone(), region, @@ -2816,7 +2824,7 @@ pub mod tests { let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_no_range_properties(true); cf_opts.set_no_table_properties(true); - CFOptions::new(cf, cf_opts) + (*cf, cf_opts) }) .collect(); let engine = diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index afa4d609da1..88222623084 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -247,7 +247,7 @@ mod tests { use std::{thread::sleep, time::Duration}; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions, DBOptions}, + ctor::{ColumnFamilyOptions, DBOptions}, kv::{new_engine, new_engine_opt, KvTestEngine}, }; use engine_traits::{ @@ -266,7 +266,7 @@ mod tests { .prefix("compact-range-test") .tempdir() .unwrap(); - let db = new_engine(path.path().to_str().unwrap(), None, &[CF_DEFAULT], None).unwrap(); + let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT]).unwrap(); let mut runner = Runner::new(db.clone()); @@ -323,10 +323,10 @@ mod tests { let mut cf_opts = ColumnFamilyOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(8); let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, cf_opts), + (CF_DEFAULT, ColumnFamilyOptions::new()), + (CF_RAFT, ColumnFamilyOptions::new()), + (CF_LOCK, ColumnFamilyOptions::new()), + (CF_WRITE, cf_opts), ]; new_engine_opt(path, db_opts, 
cfs_opts).unwrap() } diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index dfd2b527168..154f1816dbf 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -141,13 +141,7 @@ mod tests { #[test] fn test_consistency_check() { let path = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let db = new_engine( - path.path().to_str().unwrap(), - None, - &[CF_DEFAULT, CF_RAFT], - None, - ) - .unwrap(); + let db = new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, CF_RAFT]).unwrap(); let mut region = Region::default(); region.mut_peers().push(Peer::default()); diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index 71584a5e678..bf7debfb1d9 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -214,8 +214,7 @@ mod tests { let path_raft = dir.path().join("raft"); let path_kv = dir.path().join("kv"); let raft_db = engine_test::raft::new_engine(path_kv.to_str().unwrap(), None).unwrap(); - let kv_db = - engine_test::kv::new_engine(path_raft.to_str().unwrap(), None, ALL_CFS, None).unwrap(); + let kv_db = engine_test::kv::new_engine(path_raft.to_str().unwrap(), ALL_CFS).unwrap(); let engines = Engines::new(kv_db, raft_db.clone()); let (tx, rx) = mpsc::channel(); diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 81358c989e0..1be9cf8b4e9 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -981,8 +981,7 @@ mod tests { Receiver>, ) { let path = Builder::new().prefix(path).tempdir().unwrap(); - let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), None, ALL_CFS, None) - .unwrap(); + let db = 
engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let (ch, rx, _) = MockRouter::new(); let mut reader = LocalReader::new(db, store_meta, ch); reader.store_id = Cell::new(Some(store_id)); diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 4bc5cc032a3..cdd0ee5556b 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -813,12 +813,12 @@ mod tests { }; use engine_test::{ - ctor::{CFOptions, ColumnFamilyOptions}, + ctor::ColumnFamilyOptions, kv::{KvTestEngine, KvTestSnapshot}, }; use engine_traits::{ CompactExt, FlowControlFactorsExt, KvEngine, MiscExt, Mutable, Peekable, - RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, + RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_WRITE, }; use keys::data_key; use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; @@ -986,10 +986,10 @@ mod tests { cf_opts.set_level_zero_slowdown_writes_trigger(5); cf_opts.set_disable_auto_compactions(true); let kv_cfs_opts = vec![ - CFOptions::new("default", cf_opts.clone()), - CFOptions::new("write", cf_opts.clone()), - CFOptions::new("lock", cf_opts.clone()), - CFOptions::new("raft", cf_opts.clone()), + (CF_DEFAULT, cf_opts.clone()), + (CF_WRITE, cf_opts.clone()), + (CF_LOCK, cf_opts.clone()), + (CF_RAFT, cf_opts.clone()), ]; let engine = get_test_db_for_regions( &temp_dir, diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index d011f9be93f..bf06ecefcea 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -214,7 +214,6 @@ fn run_dump_raft_engine_worker( #[cfg(test)] mod tests { - use engine_rocks::raw::DBOptions; use tikv::config::TiKvConfig; use super::*; @@ -245,13 +244,12 @@ mod tests { { // Prepare some data for the RocksEngine. 
- let raftdb = engine_rocks::raw_util::new_engine_opt( + let raftdb = engine_rocks::util::new_engine_opt( &cfg.raft_store.raftdb_path, cfg.raftdb.build_opt(), cfg.raftdb.build_cf_opts(&None), ) .unwrap(); - let raftdb = RocksEngine::from_db(Arc::new(raftdb)); let mut batch = raftdb.log_batch(0); set_write_batch(1, &mut batch); raftdb.consume(&mut batch, false).unwrap(); @@ -271,15 +269,8 @@ mod tests { std::fs::remove_dir_all(&cfg.raft_store.raftdb_path).unwrap(); // Dump logs from RaftLogEngine to RocksEngine. - let raftdb = { - let db = engine_rocks::raw_util::new_engine_opt( - &cfg.raft_store.raftdb_path, - DBOptions::new(), - vec![], - ) - .unwrap(); - RocksEngine::from_db(Arc::new(db)) - }; + let raftdb = + engine_rocks::util::new_engine(&cfg.raft_store.raftdb_path, &[CF_DEFAULT]).unwrap(); dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 4); assert(1, &raftdb); assert(5, &raftdb); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 58a4dc61338..ad788f2ecec 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -649,7 +649,7 @@ impl TiKvServer { if let Some(sst_worker) = &mut self.sst_worker { let sst_runner = RecoveryRunner::new( - engines.engines.kv.get_sync_db(), + engines.engines.kv.clone(), engines.store_meta.clone(), self.config.storage.background_error_recovery_window.into(), DEFAULT_CHECK_INTERVAL, @@ -1186,7 +1186,7 @@ impl TiKvServer { servers.node.id(), engines.engine.clone(), self.region_info_accessor.clone(), - engines.engines.kv.as_inner().clone(), + engines.engines.kv.clone(), self.config.backup.clone(), self.concurrency_manager.clone(), self.config.storage.api_version(), @@ -1551,10 +1551,9 @@ impl ConfiguredRaftEngine for RocksEngine { let mut raft_db_opts = config_raftdb.build_opt(); raft_db_opts.set_env(env.clone()); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = - engine_rocks::raw_util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + 
let mut raftdb = + engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) .expect("failed to open raftdb"); - let mut raftdb = RocksEngine::from_db(Arc::new(raftdb)); raftdb.set_shared_block_cache(block_cache.is_some()); if should_dump { @@ -1609,13 +1608,12 @@ impl ConfiguredRaftEngine for RaftLogEngine { let mut raft_db_opts = config_raftdb.build_opt(); raft_db_opts.set_env(env.clone()); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::raw_util::new_engine_opt( + let raftdb = engine_rocks::util::new_engine_opt( &config.raft_store.raftdb_path, raft_db_opts, raft_cf_opts, ) .expect("failed to open raftdb for migration"); - let raftdb = RocksEngine::from_db(Arc::new(raftdb)); dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /*threads*/); raftdb.stop(); drop(raftdb); diff --git a/components/sst_importer/src/import_mode.rs b/components/sst_importer/src/import_mode.rs index 3123ed66da5..39dca3bea02 100644 --- a/components/sst_importer/src/import_mode.rs +++ b/components/sst_importer/src/import_mode.rs @@ -242,7 +242,7 @@ impl ImportModeCFOptions { mod tests { use std::thread; - use engine_traits::KvEngine; + use engine_traits::{KvEngine, CF_DEFAULT}; use futures::executor::ThreadPoolBuilder; use tempfile::Builder; use test_sst_importer::{new_test_engine, new_test_engine_with_options}; @@ -290,7 +290,7 @@ mod tests { .prefix("test_import_mode_switcher") .tempdir() .unwrap(); - let db = new_test_engine(temp_dir.path().to_str().unwrap(), &["a", "b"]); + let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, "a", "b"]); let normal_db_options = ImportModeDBOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); @@ -331,7 +331,7 @@ mod tests { .prefix("test_import_mode_timeout") .tempdir() .unwrap(); - let db = new_test_engine(temp_dir.path().to_str().unwrap(), &["a", "b"]); + let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, 
"a", "b"]); let normal_db_options = ImportModeDBOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index a3a71ba8144..042b430b811 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -69,13 +69,12 @@ mod tests { use encryption::DataKeyManager; use engine_rocks::{ - util::{new_engine, RocksCFOptions}, - RocksColumnFamilyOptions, RocksDBOptions, RocksEngine, RocksSstWriterBuilder, + util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstWriterBuilder, RocksTitanDBOptions, }; use engine_traits::{ CfName, ColumnFamilyOptions, DBOptions, EncryptionKeyManager, ImportExt, Peekable, - SstWriter, SstWriterBuilder, TitanDBOptions, + SstWriter, SstWriterBuilder, TitanDBOptions, CF_DEFAULT, }; use tempfile::Builder; use test_util::encryption::new_test_key_manager; @@ -116,7 +115,7 @@ mod tests { fn check_prepare_sst_for_ingestion( db_opts: Option, - cf_opts: Option>>, + cf_opts: Option>, key_manager: Option<&DataKeyManager>, was_encrypted: bool, ) { @@ -135,10 +134,11 @@ mod tests { let kvs = [("k1", "v1"), ("k2", "v2"), ("k3", "v3")]; - let cf_name = "default"; - let db = new_engine(path_str, db_opts, &[cf_name], cf_opts).unwrap(); + let db_opts = db_opts.unwrap_or_default(); + let cf_opts = cf_opts.unwrap_or_else(|| vec![(CF_DEFAULT, RocksCfOptions::default())]); + let db = new_engine_opt(path_str, db_opts, cf_opts).unwrap(); - gen_sst_with_kvs(&db, cf_name, sst_path.to_str().unwrap(), &kvs); + gen_sst_with_kvs(&db, CF_DEFAULT, sst_path.to_str().unwrap(), &kvs); if was_encrypted { // Add the file to key_manager to simulate an encrypted file. 
@@ -156,9 +156,9 @@ mod tests { prepare_sst_for_ingestion(&sst_path, &sst_clone, key_manager).unwrap(); check_hard_link(&sst_path, 2); check_hard_link(&sst_clone, 2); - db.ingest_external_file_cf(cf_name, &[sst_clone.to_str().unwrap()]) + db.ingest_external_file_cf(CF_DEFAULT, &[sst_clone.to_str().unwrap()]) .unwrap(); - check_db_with_kvs(&db, cf_name, &kvs); + check_db_with_kvs(&db, CF_DEFAULT, &kvs); assert!(!sst_clone.exists()); // Since we are not using key_manager in db, simulate the db deleting the file from // key_manager. @@ -171,9 +171,9 @@ mod tests { prepare_sst_for_ingestion(&sst_path, &sst_clone, key_manager).unwrap(); check_hard_link(&sst_path, 2); check_hard_link(&sst_clone, 1); - db.ingest_external_file_cf(cf_name, &[sst_clone.to_str().unwrap()]) + db.ingest_external_file_cf(CF_DEFAULT, &[sst_clone.to_str().unwrap()]) .unwrap(); - check_db_with_kvs(&db, cf_name, &kvs); + check_db_with_kvs(&db, CF_DEFAULT, &kvs); assert!(!sst_clone.exists()); } @@ -192,11 +192,11 @@ mod tests { // Force all values write out to blob files. 
titan_opts.set_min_blob_size(0); db_opts.set_titandb_options(&titan_opts); - let mut cf_opts = RocksColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::new(); cf_opts.set_titandb_options(&titan_opts); check_prepare_sst_for_ingestion( Some(db_opts), - Some(vec![RocksCFOptions::new("default", cf_opts)]), + Some(vec![(CF_DEFAULT, cf_opts)]), None, /*key_manager*/ false, /*was_encrypted*/ ); diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index afdcd279e19..dfdffd97105 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -85,7 +85,7 @@ impl TestSuite { *id, sim.storages[id].clone(), sim.region_info_accessors[id].clone(), - engines.kv.as_inner().clone(), + engines.kv.clone(), BackupConfig { num_threads: 4, batch_size: 8, diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 28112304496..0359952d237 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -12,7 +12,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{raw::DB, Compat, RocksEngine, RocksSnapshot}; +use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, @@ -371,8 +371,8 @@ impl Cluster { debug!("node {} stopped", node_id); } - pub fn get_engine(&self, node_id: u64) -> Arc { - Arc::clone(self.engines[&node_id].kv.as_inner()) + pub fn get_engine(&self, node_id: u64) -> RocksEngine { + self.engines[&node_id].kv.clone() } pub fn get_raft_engine(&self, node_id: u64) -> RaftTestEngine { @@ -736,14 +736,14 @@ impl Cluster { self.leaders.remove(®ion_id); } - pub fn assert_quorum) -> bool>(&self, mut condition: F) { + pub fn assert_quorum bool>(&self, mut condition: F) { if self.engines.is_empty() { 
return; } let half = self.engines.len() / 2; let mut qualified_cnt = 0; for (id, engines) in &self.engines { - if !condition(engines.kv.as_inner()) { + if !condition(&engines.kv) { debug!("store {} is not qualified yet.", id); continue; } @@ -1178,7 +1178,6 @@ impl Cluster { pub fn apply_state(&self, region_id: u64, store_id: u64) -> RaftApplyState { let key = keys::apply_state_key(region_id); self.get_engine(store_id) - .c() .get_msg_cf::(engine_traits::CF_RAFT, &key) .unwrap() .unwrap() @@ -1197,7 +1196,6 @@ impl Cluster { pub fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { self.get_engine(store_id) - .c() .get_msg_cf::( engine_traits::CF_RAFT, &keys::region_state_key(region_id), @@ -1210,7 +1208,6 @@ impl Cluster { for _ in 0..100 { let state = self .get_engine(store_id) - .c() .get_msg_cf::( engine_traits::CF_RAFT, &keys::region_state_key(region_id), diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 1769ecc4154..bdd7c08b7e8 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -13,7 +13,7 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{config::BlobRunMode, raw::DB, Compat, RocksEngine, RocksSnapshot}; +use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot}; use engine_test::raft::RaftTestEngine; use engine_traits::{ Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, TabletFactory, ALL_CFS, @@ -55,9 +55,9 @@ use txn_types::Key; use crate::{Cluster, Config, ServerCluster, Simulator, TestPdClient}; -pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { +pub fn must_get(engine: &RocksEngine, cf: &str, key: &[u8], value: Option<&[u8]>) { for _ in 1..300 { - let res = engine.c().get_value_cf(cf, &keys::data_key(key)).unwrap(); + let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); 
if let (Some(value), Some(res)) = (value, res.as_ref()) { assert_eq!(value, &res[..]); return; @@ -68,7 +68,7 @@ pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { thread::sleep(Duration::from_millis(20)); } debug!("last try to get {}", log_wrappers::hex_encode_upper(key)); - let res = engine.c().get_value_cf(cf, &keys::data_key(key)).unwrap(); + let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); if value.is_none() && res.is_none() || value.is_some() && res.is_some() && value.unwrap() == &*res.unwrap() { @@ -81,19 +81,19 @@ pub fn must_get(engine: &Arc, cf: &str, key: &[u8], value: Option<&[u8]>) { ) } -pub fn must_get_equal(engine: &Arc, key: &[u8], value: &[u8]) { +pub fn must_get_equal(engine: &RocksEngine, key: &[u8], value: &[u8]) { must_get(engine, "default", key, Some(value)); } -pub fn must_get_none(engine: &Arc, key: &[u8]) { +pub fn must_get_none(engine: &RocksEngine, key: &[u8]) { must_get(engine, "default", key, None); } -pub fn must_get_cf_equal(engine: &Arc, cf: &str, key: &[u8], value: &[u8]) { +pub fn must_get_cf_equal(engine: &RocksEngine, cf: &str, key: &[u8], value: &[u8]) { must_get(engine, cf, key, Some(value)); } -pub fn must_get_cf_none(engine: &Arc, cf: &str, key: &[u8]) { +pub fn must_get_cf_none(engine: &RocksEngine, cf: &str, key: &[u8]) { must_get(engine, cf, key, None); } diff --git a/components/test_sst_importer/src/lib.rs b/components/test_sst_importer/src/lib.rs index 9c9ef0496e9..65d2a3dc70a 100644 --- a/components/test_sst_importer/src/lib.rs +++ b/components/test_sst_importer/src/lib.rs @@ -3,12 +3,9 @@ use std::{collections::HashMap, fs, path::Path, sync::Arc}; use engine_rocks::{ - raw::{ - ColumnFamilyOptions, DBEntryType, DBOptions, Env, TablePropertiesCollector, - TablePropertiesCollectorFactory, - }, - raw_util::{new_engine, CFOptions}, - RocksEngine, RocksSstReader, RocksSstWriterBuilder, + raw::{DBEntryType, Env, TablePropertiesCollector, TablePropertiesCollectorFactory}, + 
util::new_engine_opt, + RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstReader, RocksSstWriterBuilder, }; pub use engine_rocks::{RocksEngine as TestEngine, RocksSstWriter}; use engine_traits::{KvEngine, SstWriter, SstWriterBuilder}; @@ -32,12 +29,12 @@ pub fn new_test_engine_with_options_and_env( env: Option>, ) -> RocksEngine where - F: FnMut(&str, &mut ColumnFamilyOptions), + F: FnMut(&str, &mut RocksCfOptions), { let cf_opts = cfs .iter() .map(|cf| { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = RocksCfOptions::default(); if let Some(ref env) = env { opt.set_env(env.clone()); } @@ -46,22 +43,21 @@ where "tikv.test_properties", TestPropertiesCollectorFactory::new(*cf), ); - CFOptions::new(*cf, opt) + (*cf, opt) }) .collect(); - let db_opts = env.map(|e| { - let mut opts = DBOptions::default(); + let db_opts = env.map_or_else(RocksDBOptions::default, |e| { + let mut opts = RocksDBOptions::default(); opts.set_env(e); opts }); - let db = new_engine(path, db_opts, cfs, Some(cf_opts)).expect("rocks test engine"); - RocksEngine::from_db(Arc::new(db)) + new_engine_opt(path, db_opts, cf_opts).expect("rocks test engine") } pub fn new_test_engine_with_options(path: &str, cfs: &[&str], apply: F) -> RocksEngine where - F: FnMut(&str, &mut ColumnFamilyOptions), + F: FnMut(&str, &mut RocksCfOptions), { new_test_engine_with_options_and_env(path, cfs, apply, None) } diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 923a1878a42..44437e60f4c 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -570,13 +570,9 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { #[cfg(test)] mod tests { - use std::sync::Arc; - use engine_rocks::{ - raw::ColumnFamilyOptions, - raw_util::{new_engine, CFOptions}, - util::{new_temp_engine, FixedPrefixSliceTransform}, - RocksEngine, RocksSnapshot, + util::{new_engine_opt, new_temp_engine, FixedPrefixSliceTransform}, + RocksCfOptions, RocksDBOptions, RocksEngine, 
RocksSnapshot, }; use engine_traits::{IterOptions, SyncMutable, CF_DEFAULT}; use keys::data_key; @@ -613,22 +609,19 @@ mod tests { #[test] fn test_seek_and_prev_with_prefix_seek() { let path = Builder::new().prefix("test-cursor").tempdir().unwrap(); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts .set_prefix_extractor( "FixedPrefixSliceTransform", FixedPrefixSliceTransform::new(3), ) .unwrap(); - let engine = new_engine( + let engine = new_engine_opt( path.path().to_str().unwrap(), - None, - &[CF_DEFAULT], - Some(vec![CFOptions::new(CF_DEFAULT, cf_opts)]), + RocksDBOptions::default(), + vec![(CF_DEFAULT, cf_opts)], ) .unwrap(); - let engine = Arc::new(engine); - let engine = RocksEngine::from_db(engine); let (region, _) = load_default_dataset(engine.clone()); diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 1d66f11ad74..c96d996dc5c 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -648,7 +648,7 @@ pub fn write_modifies(kv_engine: &impl LocalEngine, modifies: Vec) -> Re Ok(()) } -pub const TEST_ENGINE_CFS: &[CfName] = &["cf"]; +pub const TEST_ENGINE_CFS: &[CfName] = &[CF_DEFAULT, "cf"]; pub mod tests { use tikv_util::codec::bytes; diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 50059433553..f0331403725 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -11,8 +11,7 @@ use std::{ pub use engine_rocks::RocksSnapshot; use engine_rocks::{ - get_env, raw::DBOptions, raw_util::CFOptions, RocksEngine as BaseRocksEngine, - RocksEngineIterator, + get_env, RocksCfOptions, RocksDBOptions, RocksEngine as BaseRocksEngine, RocksEngineIterator, }; use engine_traits::{ CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, @@ -89,11 +88,10 @@ pub struct RocksEngine { impl RocksEngine { pub fn new( path: &str, - cfs: &[CfName], - 
cfs_opts: Option>>, + db_opts: Option, + cfs_opts: Vec<(CfName, RocksCfOptions)>, shared_block_cache: bool, io_rate_limiter: Option>, - db_opts: Option, ) -> Result { info!("RocksEngine: creating for path"; "path" => path); let (path, temp_dir) = match path { @@ -104,21 +102,16 @@ impl RocksEngine { _ => (path.to_owned(), None), }; let worker = Worker::new("engine-rocksdb"); - let mut db_opts = db_opts.unwrap_or_else(|| DBOptions::new()); + let mut db_opts = db_opts.unwrap_or_default(); if io_rate_limiter.is_some() { db_opts.set_env(get_env(None /*key_manager*/, io_rate_limiter).unwrap()); } - let db = Arc::new(engine_rocks::raw_util::new_engine( - &path, - Some(db_opts), - cfs, - cfs_opts, - )?); + let db = engine_rocks::util::new_engine_opt(&path, db_opts, cfs_opts)?; // It does not use the raft_engine, so it is ok to fill with the same // rocksdb. - let mut kv_engine = BaseRocksEngine::from_db(db.clone()); - let mut raft_engine = BaseRocksEngine::from_db(db); + let mut kv_engine = db.clone(); + let mut raft_engine = db; kv_engine.set_shared_block_cache(shared_block_cache); raft_engine.set_shared_block_cache(shared_block_cache); let engines = Engines::new(kv_engine, raft_engine); diff --git a/src/config.rs b/src/config.rs index 0df2e2a2101..489609d1196 100644 --- a/src/config.rs +++ b/src/config.rs @@ -26,19 +26,19 @@ use engine_rocks::{ get_env, properties::MvccPropertiesCollectorFactory, raw::{ - BlockBasedOptions, Cache, ChecksumType, ColumnFamilyOptions, CompactionPriority, - DBCompactionStyle, DBCompressionType, DBOptions, DBRateLimiterMode, DBRecoveryMode, Env, - LRUCacheOptions, PrepopulateBlockCache, TitanDBOptions, + BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, + DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, LRUCacheOptions, + PrepopulateBlockCache, }, - raw_util::CFOptions, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, - RaftDBLogger, RangePropertiesCollectorFactory, 
RocksEngine, RocksEventListener, - RocksSstPartitionerFactory, RocksdbLogger, TtlPropertiesCollectorFactory, + RaftDBLogger, RangePropertiesCollectorFactory, RocksCfOptions, RocksDBOptions, RocksEngine, + RocksEventListener, RocksTitanDBOptions, RocksdbLogger, TtlPropertiesCollectorFactory, DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as _, DBOptionsExt, - TabletAccessor, TabletErrorCollector, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + TabletAccessor, TabletErrorCollector, TitanDBOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, + CF_WRITE, }; use file_system::IORateLimiter; use keys::region_raft_prefix_len; @@ -178,8 +178,8 @@ impl Default for TitanCfConfig { } impl TitanCfConfig { - fn build_opts(&self) -> TitanDBOptions { - let mut opts = TitanDBOptions::new(); + fn build_opts(&self) -> RocksTitanDBOptions { + let mut opts = RocksTitanDBOptions::new(); opts.set_min_blob_size(self.min_blob_size.0 as u64); opts.set_blob_file_compression(self.blob_file_compression.into()); opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); @@ -533,7 +533,7 @@ macro_rules! build_cf_opt { block_base_opts.set_prepopulate_block_cache($opt.prepopulate_block_cache); block_base_opts.set_format_version($opt.format_version); block_base_opts.set_checksum($opt.checksum); - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_block_based_table_factory(&block_base_opts); cf_opts.set_num_levels($opt.num_levels); assert!($opt.compression_per_level.len() >= $opt.num_levels as usize); @@ -592,7 +592,7 @@ macro_rules! 
build_cf_opt { $opt.compaction_guard_min_output_file_size.0, ) .unwrap(); - cf_opts.set_sst_partitioner_factory(RocksSstPartitionerFactory(factory)); + cf_opts.set_sst_partitioner_factory(factory); cf_opts.set_target_file_size_base($opt.compaction_guard_max_output_file_size.0); } else { warn!("compaction guard is disabled due to region info provider not available") @@ -671,7 +671,7 @@ impl DefaultCfConfig { cache: &Option, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, - ) -> ColumnFamilyOptions { + ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, region_info_accessor); let f = RangePropertiesCollectorFactory { prop_size_index_distance: self.prop_size_index_distance, @@ -782,7 +782,7 @@ impl WriteCfConfig { &self, cache: &Option, region_info_accessor: Option<&RegionInfoAccessor>, - ) -> ColumnFamilyOptions { + ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!(self, CF_WRITE, cache, region_info_accessor); // Prefix extractor(trim the timestamp at tail) for write cf. 
cf_opts @@ -876,7 +876,7 @@ impl Default for LockCfConfig { } impl LockCfConfig { - pub fn build_opt(&self, cache: &Option) -> ColumnFamilyOptions { + pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_LOCK, cache, no_region_info_accessor); cf_opts @@ -952,7 +952,7 @@ impl Default for RaftCfConfig { } impl RaftCfConfig { - pub fn build_opt(&self, cache: &Option) -> ColumnFamilyOptions { + pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_RAFT, cache, no_region_info_accessor); cf_opts @@ -991,8 +991,8 @@ impl Default for TitanDBConfig { } impl TitanDBConfig { - fn build_opts(&self) -> TitanDBOptions { - let mut opts = TitanDBOptions::new(); + fn build_opts(&self) -> RocksTitanDBOptions { + let mut opts = RocksTitanDBOptions::new(); opts.set_dirname(&self.dirname); opts.set_disable_background_gc(self.disable_gc); opts.set_max_background_gc(self.max_background_gc); @@ -1131,8 +1131,8 @@ impl Default for DbConfig { } impl DbConfig { - pub fn build_opt(&self) -> DBOptions { - let mut opts = DBOptions::new(); + pub fn build_opt(&self) -> RocksDBOptions { + let mut opts = RocksDBOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { opts.set_wal_dir(&self.wal_dir); @@ -1195,20 +1195,20 @@ impl DbConfig { cache: &Option, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, - ) -> Vec> { + ) -> Vec<(&'static str, RocksCfOptions)> { vec![ - CFOptions::new( + ( CF_DEFAULT, self.defaultcf .build_opt(cache, region_info_accessor, api_version), ), - CFOptions::new(CF_LOCK, self.lockcf.build_opt(cache)), - CFOptions::new( + (CF_LOCK, self.lockcf.build_opt(cache)), + ( CF_WRITE, self.writecf.build_opt(cache, region_info_accessor), ), // TODO: remove CF_RAFT. 
- CFOptions::new(CF_RAFT, self.raftcf.build_opt(cache)), + (CF_RAFT, self.raftcf.build_opt(cache)), ] } @@ -1319,7 +1319,7 @@ impl Default for RaftDefaultCfConfig { } impl RaftDefaultCfConfig { - pub fn build_opt(&self, cache: &Option) -> ColumnFamilyOptions { + pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, no_region_info_accessor); let f = FixedPrefixSliceTransform::new(region_raft_prefix_len()); @@ -1431,8 +1431,8 @@ impl Default for RaftDbConfig { } impl RaftDbConfig { - pub fn build_opt(&self) -> DBOptions { - let mut opts = DBOptions::new(); + pub fn build_opt(&self) -> RocksDBOptions { + let mut opts = RocksDBOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { opts.set_wal_dir(&self.wal_dir); @@ -1473,8 +1473,8 @@ impl RaftDbConfig { opts } - pub fn build_cf_opts(&self, cache: &Option) -> Vec> { - vec![CFOptions::new(CF_DEFAULT, self.defaultcf.build_opt(cache))] + pub fn build_cf_opts(&self, cache: &Option) -> Vec<(&'static str, RocksCfOptions)> { + vec![(CF_DEFAULT, self.defaultcf.build_opt(cache))] } fn validate(&mut self) -> Result<(), Box> { @@ -4048,7 +4048,6 @@ mod tests { use case_macros::*; use engine_traits::{ ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as DBOptionsTrait, DummyFactory, - ALL_CFS, }; use futures::executor::block_on; use grpcio::ResourceQuota; @@ -4469,15 +4468,14 @@ mod tests { assert_eq!(F::TAG, cfg.storage.api_version()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - ALL_CFS, - Some(cfg.rocksdb.build_cf_opts( + Some(cfg.rocksdb.build_opt()), + cfg.rocksdb.build_cf_opts( &cfg.storage.block_cache.build_shared_cache(), None, cfg.storage.api_version(), - )), + ), true, None, - Some(cfg.rocksdb.build_opt()), ) .unwrap(); let storage = diff --git a/src/server/debug.rs b/src/server/debug.rs index 93732c9c580..7bfa2aa438e 100644 
--- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -5,15 +5,14 @@ use std::{ iter::FromIterator, path::Path, result, - sync::Arc, thread::{Builder as ThreadBuilder, JoinHandle}, }; use collections::HashSet; use engine_rocks::{ - raw::{CompactOptions, DBBottommostLevelCompaction, DB}, + raw::{CompactOptions, DBBottommostLevelCompaction}, util::get_cf_handle, - Compat, RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatchVec, + RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatchVec, }; use engine_traits::{ Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, @@ -167,9 +166,9 @@ impl Debugger { Ok(regions) } - fn get_db_from_type(&self, db: DBType) -> Result<&Arc> { + fn get_db_from_type(&self, db: DBType) -> Result<&RocksEngine> { match db { - DBType::Kv => Ok(self.engines.kv.as_inner()), + DBType::Kv => Ok(&self.engines.kv), DBType::Raft => Err(box_err!("Get raft db is not allowed")), _ => Err(box_err!("invalid DBType type")), } @@ -178,7 +177,7 @@ impl Debugger { pub fn get(&self, db: DBType, cf: &str, key: &[u8]) -> Result> { validate_db_and_cf(db, cf)?; let db = self.get_db_from_type(db)?; - match db.c().get_value_cf(cf, key) { + match db.get_value_cf(cf, key) { Ok(Some(v)) => Ok(v.to_vec()), Ok(None) => Err(Error::NotFound(format!( "value for key {:?} in db {:?}", @@ -323,7 +322,7 @@ impl Debugger { ) -> Result<()> { validate_db_and_cf(db, cf)?; let db = self.get_db_from_type(db)?; - let handle = box_try!(get_cf_handle(db, cf)); + let handle = box_try!(get_cf_handle(db.as_inner(), cf)); let start = if start.is_empty() { None } else { Some(start) }; let end = if end.is_empty() { None } else { Some(end) }; info!("Debugger starts manual compact"; "db" => ?db, "cf" => cf); @@ -331,7 +330,8 @@ impl Debugger { opts.set_max_subcompactions(threads as i32); opts.set_exclusive_manual_compaction(false); opts.set_bottommost_level_compaction(bottommost.0); - db.compact_range_cf_opt(handle, 
&opts, start, end); + db.as_inner() + .compact_range_cf_opt(handle, &opts, start, end); info!("Debugger finishes manual compact"; "db" => ?db, "cf" => cf); Ok(()) } @@ -346,7 +346,7 @@ impl Debugger { let mut errors = Vec::with_capacity(regions.len()); for region in regions { let region_id = region.get_id(); - if let Err(e) = set_region_tombstone(db.as_inner(), store_id, region, &mut wb) { + if let Err(e) = set_region_tombstone(db, store_id, region, &mut wb) { errors.push((region_id, e)); } } @@ -403,7 +403,7 @@ impl Debugger { for region in regions { let region_id = region.get_id(); if let Err(e) = recover_mvcc_for_range( - db.as_inner(), + db, region.get_start_key(), region.get_end_key(), read_only, @@ -417,18 +417,15 @@ impl Debugger { } pub fn recover_all(&self, threads: usize, read_only: bool) -> Result<()> { - let db = self.engines.kv.clone(); + let db = &self.engines.kv; info!("Calculating split keys..."); - let split_keys = divide_db(db.as_inner(), threads) - .unwrap() - .into_iter() - .map(|k| { - let k = Key::from_encoded(keys::origin_key(&k).to_vec()) - .truncate_ts() - .unwrap(); - k.as_encoded().clone() - }); + let split_keys = divide_db(db, threads).unwrap().into_iter().map(|k| { + let k = Key::from_encoded(keys::origin_key(&k).to_vec()) + .truncate_ts() + .unwrap(); + k.as_encoded().clone() + }); let mut range_borders = vec![b"".to_vec()]; range_borders.extend(split_keys); @@ -454,13 +451,8 @@ impl Debugger { log_wrappers::Value::key(&end_key) ); - let result = recover_mvcc_for_range( - db.as_inner(), - &start_key, - &end_key, - read_only, - thread_index, - ); + let result = + recover_mvcc_for_range(&db, &start_key, &end_key, read_only, thread_index); tikv_alloc::remove_thread_memory_accessor(); result }) @@ -861,8 +853,8 @@ impl Debugger { let start = keys::enc_start_key(region); let end = keys::enc_end_key(region); - let mut res = dump_write_cf_properties(self.engines.kv.as_inner(), &start, &end)?; - let mut res1 = 
dump_default_cf_properties(self.engines.kv.as_inner(), &start, &end)?; + let mut res = dump_write_cf_properties(&self.engines.kv, &start, &end)?; + let mut res1 = dump_default_cf_properties(&self.engines.kv, &start, &end)?; res.append(&mut res1); let middle_key = match box_try!(get_region_approximate_middle(&self.engines.kv, region)) { @@ -885,12 +877,12 @@ impl Debugger { pub fn get_range_properties(&self, start: &[u8], end: &[u8]) -> Result> { let mut props = dump_write_cf_properties( - self.engines.kv.as_inner(), + &self.engines.kv, &keys::data_key(start), &keys::data_end_key(end), )?; let mut props1 = dump_default_cf_properties( - self.engines.kv.as_inner(), + &self.engines.kv, &keys::data_key(start), &keys::data_end_key(end), )?; @@ -904,13 +896,13 @@ impl Debugger { } fn dump_default_cf_properties( - db: &Arc, + db: &RocksEngine, start: &[u8], end: &[u8], ) -> Result> { let mut num_entries = 0; // number of Rocksdb K/V entries. - let collection = box_try!(db.c().get_range_properties_cf(CF_DEFAULT, start, end)); + let collection = box_try!(db.get_range_properties_cf(CF_DEFAULT, start, end)); let num_files = collection.len(); for (_, v) in collection.iter() { @@ -937,13 +929,13 @@ fn dump_default_cf_properties( } fn dump_write_cf_properties( - db: &Arc, + db: &RocksEngine, start: &[u8], end: &[u8], ) -> Result> { let mut num_entries = 0; // number of Rocksdb K/V entries. 
- let collection = box_try!(db.c().get_range_properties_cf(CF_WRITE, start, end)); + let collection = box_try!(db.get_range_properties_cf(CF_WRITE, start, end)); let num_files = collection.len(); let mut mvcc_properties = MvccProperties::new(); @@ -998,19 +990,19 @@ fn dump_write_cf_properties( } fn recover_mvcc_for_range( - db: &Arc, + db: &RocksEngine, start_key: &[u8], end_key: &[u8], read_only: bool, thread_index: usize, ) -> Result<()> { - let mut mvcc_checker = box_try!(MvccChecker::new(Arc::clone(db), start_key, end_key)); + let mut mvcc_checker = box_try!(MvccChecker::new(db.clone(), start_key, end_key)); mvcc_checker.thread_index = thread_index; let wb_limit: usize = 10240; loop { - let mut wb = db.c().write_batch(); + let mut wb = db.write_batch(); mvcc_checker.check_mvcc(&mut wb, Some(wb_limit))?; let batch_size = wb.count(); @@ -1050,7 +1042,7 @@ pub struct MvccChecker { } impl MvccChecker { - fn new(db: Arc, start_key: &[u8], end_key: &[u8]) -> Result { + fn new(db: RocksEngine, start_key: &[u8], end_key: &[u8]) -> Result { let start_key = keys::data_key(start_key); let end_key = keys::data_end_key(end_key); let gen_iter = |cf: &str| -> Result<_> { @@ -1061,7 +1053,7 @@ impl MvccChecker { Some(KeyBuilder::from_vec(to, 0, 0)), false, ); - let mut iter = box_try!(db.c().iterator_opt(cf, readopts)); + let mut iter = box_try!(db.iterator_opt(cf, readopts)); iter.seek_to_first().unwrap(); Ok(iter) }; @@ -1330,7 +1322,7 @@ fn validate_db_and_cf(db: DBType, cf: &str) -> Result<()> { } fn set_region_tombstone( - db: &Arc, + db: &RocksEngine, store_id: u64, region: Region, wb: &mut RocksWriteBatchVec, @@ -1339,7 +1331,6 @@ fn set_region_tombstone( let key = keys::region_state_key(id); let region_state = db - .c() .get_msg_cf::(CF_RAFT, &key) .map_err(|e| box_err!(e)) .and_then(|s| s.ok_or_else(|| Error::Other("Can't find RegionLocalState".into())))?; @@ -1378,25 +1369,19 @@ fn set_region_tombstone( Ok(()) } -fn divide_db(db: &Arc, parts: usize) -> 
raftstore::Result>> { +fn divide_db(db: &RocksEngine, parts: usize) -> raftstore::Result>> { // Empty start and end key cover all range. let start = keys::data_key(b""); let end = keys::data_end_key(b""); let range = Range::new(&start, &end); Ok(box_try!( - RocksEngine::from_db(db.clone()).get_range_approximate_split_keys(range, parts - 1) + db.get_range_approximate_split_keys(range, parts - 1) )) } #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_rocks::{ - raw::{ColumnFamilyOptions, DBOptions}, - raw_util::{new_engine_opt, CFOptions}, - RocksEngine, - }; + use engine_rocks::{util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksEngine}; use engine_traits::{Mutable, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use kvproto::{ kvrpcpb::ApiVersion, @@ -1409,7 +1394,7 @@ mod tests { use crate::storage::mvcc::{Lock, LockType}; fn init_region_state( - engine: &Arc, + engine: &RocksEngine, region_id: u64, stores: &[u64], mut learner: usize, @@ -1430,7 +1415,7 @@ mod tests { region_state.set_state(PeerState::Normal); region_state.set_region(region.clone()); let key = keys::region_state_key(region_id); - engine.c().put_msg_cf(CF_RAFT, &key, ®ion_state).unwrap(); + engine.put_msg_cf(CF_RAFT, &key, ®ion_state).unwrap(); region } @@ -1456,10 +1441,9 @@ mod tests { raft_engine.put_msg(&raft_state_key, &raft_state).unwrap(); } - fn get_region_state(engine: &Arc, region_id: u64) -> RegionLocalState { + fn get_region_state(engine: &RocksEngine, region_id: u64) -> RegionLocalState { let key = keys::region_state_key(region_id); engine - .c() .get_msg_cf::(CF_RAFT, &key) .unwrap() .unwrap() @@ -1535,24 +1519,9 @@ mod tests { fn new_debugger() -> Debugger { let tmp = Builder::new().prefix("test_debug").tempdir().unwrap(); let path = tmp.path().to_str().unwrap(); - let engine = Arc::new( - engine_rocks::raw_util::new_engine_opt( - path, - DBOptions::new(), - vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, 
ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), - ], - ) - .unwrap(), - ); + let engine = engine_rocks::util::new_engine(path, ALL_CFS).unwrap(); - let engines = Engines::new( - RocksEngine::from_db(Arc::clone(&engine)), - RocksEngine::from_db(engine), - ); + let engines = Engines::new(engine.clone(), engine); Debugger::new(engines, ConfigController::default()) } @@ -1720,21 +1689,21 @@ mod tests { let engine = &debugger.engines.kv; // region 1 with peers at stores 11, 12, 13. - let region_1 = init_region_state(engine.as_inner(), 1, &[11, 12, 13], 0); + let region_1 = init_region_state(engine, 1, &[11, 12, 13], 0); // Got the target region from pd, which doesn't contains the store. let mut target_region_1 = region_1.clone(); target_region_1.mut_peers().remove(0); target_region_1.mut_region_epoch().set_conf_ver(100); // region 2 with peers at stores 11, 12, 13. - let region_2 = init_region_state(engine.as_inner(), 2, &[11, 12, 13], 0); + let region_2 = init_region_state(engine, 2, &[11, 12, 13], 0); // Got the target region from pd, which has different peer_id. let mut target_region_2 = region_2.clone(); target_region_2.mut_peers()[0].set_id(100); target_region_2.mut_region_epoch().set_conf_ver(100); // region 3 with peers at stores 21, 22, 23. - let region_3 = init_region_state(engine.as_inner(), 3, &[21, 22, 23], 0); + let region_3 = init_region_state(engine, 3, &[21, 22, 23], 0); // Got the target region from pd but the peers are not changed. 
let mut target_region_3 = region_3; target_region_3.mut_region_epoch().set_conf_ver(100); @@ -1748,21 +1717,15 @@ mod tests { let errors = debugger.set_region_tombstone(target_regions).unwrap(); assert_eq!(errors.len(), 1); assert_eq!(errors[0].0, 3); - assert_eq!( - get_region_state(engine.as_inner(), 1).take_region(), - region_1 - ); - assert_eq!( - get_region_state(engine.as_inner(), 2).take_region(), - region_2 - ); + assert_eq!(get_region_state(engine, 1).take_region(), region_1); + assert_eq!(get_region_state(engine, 2).take_region(), region_2); // After set_region_tombstone success, all region should be adjusted. let target_regions = vec![target_region_1, target_region_2]; let errors = debugger.set_region_tombstone(target_regions).unwrap(); assert!(errors.is_empty()); for ®ion_id in &[1, 2] { - let state = get_region_state(engine.as_inner(), region_id).get_state(); + let state = get_region_state(engine, region_id).get_state(); assert_eq!(state, PeerState::Tombstone); } } @@ -1778,19 +1741,19 @@ mod tests { assert!(!errors.is_empty()); // region 1 with peers at stores 11, 12, 13. - init_region_state(engine.as_inner(), 1, &[11, 12, 13], 0); - let mut expected_state = get_region_state(engine.as_inner(), 1); + init_region_state(engine, 1, &[11, 12, 13], 0); + let mut expected_state = get_region_state(engine, 1); expected_state.set_state(PeerState::Tombstone); // tombstone region 1. let errors = debugger.set_region_tombstone_by_id(vec![1]).unwrap(); assert!(errors.is_empty()); - assert_eq!(get_region_state(engine.as_inner(), 1), expected_state); + assert_eq!(get_region_state(engine, 1), expected_state); // tombstone region 1 again. 
let errors = debugger.set_region_tombstone_by_id(vec![1]).unwrap(); assert!(errors.is_empty()); - assert_eq!(get_region_state(engine.as_inner(), 1), expected_state); + assert_eq!(get_region_state(engine, 1), expected_state); } #[test] @@ -1799,7 +1762,7 @@ mod tests { debugger.set_store_id(100); let engine = &debugger.engines.kv; - let get_region_stores = |engine: &Arc, region_id: u64| { + let get_region_stores = |engine: &RocksEngine, region_id: u64| { get_region_state(engine, region_id) .get_region() .get_peers() @@ -1808,7 +1771,7 @@ mod tests { .collect::>() }; - let get_region_learner = |engine: &Arc, region_id: u64| { + let get_region_learner = |engine: &RocksEngine, region_id: u64| { get_region_state(engine, region_id) .get_region() .get_peers() @@ -1818,9 +1781,9 @@ mod tests { }; // region 1 with peers at stores 11, 12, 13 and 14. - init_region_state(engine.as_inner(), 1, &[11, 12, 13, 14], 0); + init_region_state(engine, 1, &[11, 12, 13, 14], 0); // region 2 with peers at stores 21, 22 and 23. - init_region_state(engine.as_inner(), 2, &[21, 22, 23], 0); + init_region_state(engine, 2, &[21, 22, 23], 0); // Only remove specified stores from region 1. debugger @@ -1828,43 +1791,43 @@ mod tests { .unwrap(); // 13 and 14 should be removed from region 1. - assert_eq!(get_region_stores(engine.as_inner(), 1), &[11, 12]); + assert_eq!(get_region_stores(engine, 1), &[11, 12]); // 21 and 23 shouldn't be removed from region 2. - assert_eq!(get_region_stores(engine.as_inner(), 2), &[21, 22, 23]); + assert_eq!(get_region_stores(engine, 2), &[21, 22, 23]); // Remove specified stores from all regions. debugger .remove_failed_stores(vec![11, 23], None, false) .unwrap(); - assert_eq!(get_region_stores(engine.as_inner(), 1), &[12]); - assert_eq!(get_region_stores(engine.as_inner(), 2), &[21, 22]); + assert_eq!(get_region_stores(engine, 1), &[12]); + assert_eq!(get_region_stores(engine, 2), &[21, 22]); // Should fail when the store itself is in the failed list. 
- init_region_state(engine.as_inner(), 3, &[100, 31, 32, 33], 0); + init_region_state(engine, 3, &[100, 31, 32, 33], 0); debugger .remove_failed_stores(vec![100], None, false) .unwrap_err(); // no learner, promote learner does nothing - init_region_state(engine.as_inner(), 4, &[41, 42, 43, 44], 0); + init_region_state(engine, 4, &[41, 42, 43, 44], 0); debugger.remove_failed_stores(vec![44], None, true).unwrap(); - assert_eq!(get_region_stores(engine.as_inner(), 4), &[41, 42, 43]); - assert_eq!(get_region_learner(engine.as_inner(), 4), 0); + assert_eq!(get_region_stores(engine, 4), &[41, 42, 43]); + assert_eq!(get_region_learner(engine, 4), 0); // promote learner - init_region_state(engine.as_inner(), 5, &[51, 52, 53, 54], 1); + init_region_state(engine, 5, &[51, 52, 53, 54], 1); debugger .remove_failed_stores(vec![52, 53, 54], None, true) .unwrap(); - assert_eq!(get_region_stores(engine.as_inner(), 5), &[51]); - assert_eq!(get_region_learner(engine.as_inner(), 5), 0); + assert_eq!(get_region_stores(engine, 5), &[51]); + assert_eq!(get_region_learner(engine, 5), 0); // no need to promote learner - init_region_state(engine.as_inner(), 6, &[61, 62, 63, 64], 1); + init_region_state(engine, 6, &[61, 62, 63, 64], 1); debugger.remove_failed_stores(vec![64], None, true).unwrap(); - assert_eq!(get_region_stores(engine.as_inner(), 6), &[61, 62, 63]); - assert_eq!(get_region_learner(engine.as_inner(), 6), 1); + assert_eq!(get_region_stores(engine, 6), &[61, 62, 63]); + assert_eq!(get_region_learner(engine, 6), 1); } #[test] @@ -1874,8 +1837,8 @@ mod tests { let kv_engine = &debugger.engines.kv; let raft_engine = &debugger.engines.raft; - init_region_state(kv_engine.as_inner(), 1, &[100, 101], 1); - init_region_state(kv_engine.as_inner(), 2, &[100, 103], 1); + init_region_state(kv_engine, 1, &[100, 101], 1); + init_region_state(kv_engine, 2, &[100, 103], 1); init_raft_state(kv_engine, raft_engine, 1, 100, 90, 80); init_raft_state(kv_engine, raft_engine, 2, 80, 80, 80); @@ 
-2026,10 +1989,7 @@ mod tests { remove_region_state(1); remove_region_state(2); assert!(debugger.recreate_region(region.clone()).is_ok()); - assert_eq!( - get_region_state(engine.as_inner(), 100).get_region(), - ®ion - ); + assert_eq!(get_region_state(engine, 100).get_region(), ®ion); region.set_start_key(b"z".to_vec()); region.set_end_key(b"".to_vec()); @@ -2177,27 +2137,28 @@ mod tests { let path_str = path.path().to_str().unwrap(); let cfs_opts = ALL_CFS .iter() - .map(|cf| CFOptions::new(cf, ColumnFamilyOptions::new())) + .map(|cf| (*cf, RocksCfOptions::default())) .collect(); - let db_opt = DBOptions::new(); + let db_opt = RocksDBOptions::default(); db_opt.enable_multi_batch_write(true); - let db = Arc::new(new_engine_opt(path_str, db_opt, cfs_opts).unwrap()); + let db = new_engine_opt(path_str, db_opt, cfs_opts).unwrap(); // Write initial KVs. - let mut wb = RocksEngine::from_db(db.clone()).write_batch(); + let mut wb = db.write_batch(); for &(cf, ref k, ref v, _) in &kv { wb.put_cf(cf, &keys::data_key(k.as_encoded()), v).unwrap(); } wb.write().unwrap(); // Fix problems. - let mut checker = MvccChecker::new(Arc::clone(&db), b"k", b"l").unwrap(); - let mut wb = db.c().write_batch(); + let mut checker = MvccChecker::new(db.clone(), b"k", b"l").unwrap(); + let mut wb = db.write_batch(); checker.check_mvcc(&mut wb, None).unwrap(); wb.write().unwrap(); // Check result. 
for (cf, k, _, expect) in kv { let data = db + .as_inner() .get_cf( - get_cf_handle(&db, cf).unwrap(), + get_cf_handle(db.as_inner(), cf).unwrap(), &keys::data_key(k.as_encoded()), ) .unwrap(); diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 421c0c0f8ba..854c1fdd356 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -144,19 +144,18 @@ impl KvEngineFactory { self.inner.region_info_accessor.as_ref(), self.inner.api_version, ); - let kv_engine = engine_rocks::raw_util::new_engine_opt( + let kv_engine = engine_rocks::util::new_engine_opt( tablet_path.to_str().unwrap(), kv_db_opts, kv_cfs_opts, ); - let kv_engine = match kv_engine { + let mut kv_engine = match kv_engine { Ok(e) => e, Err(e) => { error!("failed to create kv engine"; "path" => %tablet_path.display(), "err" => ?e); return Err(e); } }; - let mut kv_engine = RocksEngine::from_db(Arc::new(kv_engine)); let shared_block_cache = self.inner.block_cache.is_some(); kv_engine.set_shared_block_cache(shared_block_cache); Ok(kv_engine) @@ -184,7 +183,7 @@ impl KvEngineFactory { ); // TODOTODO: call rust-rocks or tirocks to destroy_engine; /* - engine_rocks::raw_util::destroy_engine( + engine_rocks::util::destroy_engine( tablet_path.to_str().unwrap(), kv_db_opts, kv_cfs_opts, diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index 4e2bc6e76de..20de36ef035 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -50,9 +50,7 @@ fn check_need_gc(safe_point: TimeStamp, ratio_threshold: f64, props: &MvccProper #[cfg(test)] mod tests { - use std::sync::Arc; - - use engine_rocks::{raw::DB, Compat}; + use engine_rocks::RocksEngine; use engine_traits::{MvccPropertiesExt, CF_WRITE}; use kvproto::metapb::Region; @@ -60,7 +58,7 @@ mod tests { use crate::storage::mvcc::reader_tests::{make_region, open_db, RegionEngine}; fn get_mvcc_properties_and_check_gc( - db: Arc, + db: &RocksEngine, region: Region, safe_point: impl Into, 
need_gc: bool, @@ -70,7 +68,6 @@ mod tests { let start = keys::data_key(region.get_start_key()); let end = keys::data_end_key(region.get_end_key()); let props = db - .c() .get_mvcc_properties_cf(CF_WRITE, safe_point, &start, &end) .unwrap(); assert_eq!(check_need_gc(safe_point, 1.0, &props), need_gc); @@ -100,7 +97,7 @@ mod tests { engine.put(&[3], 4, 4); engine.flush(); engine.compact(); - let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false); + let props = get_mvcc_properties_and_check_gc(&db, region.clone(), 10, false); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 4.into()); assert_eq!(props.num_rows, 4); @@ -116,7 +113,7 @@ mod tests { engine.flush(); // After this flush, keys 5,6 in the new SST file have more than one // versions, so we need gc. - let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true); + let props = get_mvcc_properties_and_check_gc(&db, region.clone(), 10, true); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 8.into()); assert_eq!(props.num_rows, 6); @@ -124,7 +121,7 @@ mod tests { assert_eq!(props.num_versions, 8); assert_eq!(props.max_row_versions, 2); // But if the `safe_point` is older than all versions, we don't need gc too. - let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 0, false); + let props = get_mvcc_properties_and_check_gc(&db, region.clone(), 0, false); assert_eq!(props.min_ts, TimeStamp::max()); assert_eq!(props.max_ts, TimeStamp::zero()); assert_eq!(props.num_rows, 0); @@ -138,7 +135,7 @@ mod tests { engine.compact(); // After this compact, all versions of keys 5,6 are deleted, // no keys have more than one versions, so we don't need gc. 
- let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, false); + let props = get_mvcc_properties_and_check_gc(&db, region.clone(), 10, false); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 4.into()); assert_eq!(props.num_rows, 4); @@ -149,7 +146,7 @@ mod tests { // A single lock version need gc. engine.lock(&[7], 9, 9); engine.flush(); - let props = get_mvcc_properties_and_check_gc(Arc::clone(&db), region.clone(), 10, true); + let props = get_mvcc_properties_and_check_gc(&db, region.clone(), 10, true); assert_eq!(props.min_ts, 1.into()); assert_eq!(props.max_ts, 9.into()); assert_eq!(props.num_rows, 5); diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index 94e3e38900d..de837bdb1cb 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -255,12 +255,7 @@ impl ResetToVersionManager { #[cfg(test)] mod tests { - use engine_rocks::{ - raw::{ColumnFamilyOptions, DBOptions}, - raw_util::CFOptions, - Compat, - }; - use engine_traits::{WriteBatch, WriteBatchExt, CF_LOCK, CF_RAFT}; + use engine_traits::{WriteBatch, WriteBatchExt, ALL_CFS, CF_LOCK}; use tempfile::Builder; use txn_types::{Lock, LockType, WriteType}; @@ -270,19 +265,7 @@ mod tests { fn test_basic() { let tmp = Builder::new().prefix("test_basic").tempdir().unwrap(); let path = tmp.path().to_str().unwrap(); - let fake_engine = Arc::new( - engine_rocks::raw_util::new_engine_opt( - path, - DBOptions::new(), - vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), - ], - ) - .unwrap(), - ); + let fake_engine = engine_rocks::util::new_engine(path, ALL_CFS).unwrap(); let write = vec![ // key, start_ts, commit_ts @@ -339,19 +322,18 @@ mod tests { ); kv.push((CF_LOCK, Key::from_raw(key), lock.to_bytes())); } - let mut wb = 
fake_engine.c().write_batch(); + let mut wb = fake_engine.write_batch(); for &(cf, ref k, ref v) in &kv { wb.put_cf(cf, &keys::data_key(k.as_encoded()), v).unwrap(); } wb.write().unwrap(); - let manager = ResetToVersionManager::new(fake_engine.c().clone()); + let manager = ResetToVersionManager::new(fake_engine.clone()); manager.start(100.into()); manager.wait(); let readopts = IterOptions::new(None, None, false); let mut write_iter = fake_engine - .c() .iterator_opt(CF_WRITE, readopts.clone()) .unwrap(); write_iter.seek_to_first().unwrap(); @@ -363,7 +345,6 @@ mod tests { remaining_writes.push((key, write)); } let mut default_iter = fake_engine - .c() .iterator_opt(CF_DEFAULT, readopts.clone()) .unwrap(); default_iter.seek_to_first().unwrap(); @@ -375,7 +356,7 @@ mod tests { remaining_defaults.push((key, value)); } - let mut lock_iter = fake_engine.c().iterator_opt(CF_LOCK, readopts).unwrap(); + let mut lock_iter = fake_engine.iterator_opt(CF_LOCK, readopts).unwrap(); lock_iter.seek_to_first().unwrap(); let mut remaining_locks = vec![]; while lock_iter.valid().unwrap() { diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index d5c1180ddf0..e3d1507224b 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -6,7 +6,7 @@ use std::{ }; use causal_ts::tests::DummyRawTsTracker; -use engine_rocks::{raw::ColumnFamilyOptions, raw_util::CFOptions}; +use engine_rocks::RocksCfOptions; use engine_traits::{CfName, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use file_system::IORateLimiter; use kvproto::kvrpcpb::ApiVersion; @@ -113,24 +113,18 @@ impl TestEngineBuilder { let cfs_opts = cfs .iter() .map(|cf| match *cf { - CF_DEFAULT => CFOptions::new( + CF_DEFAULT => ( CF_DEFAULT, cfg_rocksdb.defaultcf.build_opt(&cache, None, api_version), ), - CF_LOCK => CFOptions::new(CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - CF_WRITE => CFOptions::new(CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, 
None)), - CF_RAFT => CFOptions::new(CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), - _ => CFOptions::new(*cf, ColumnFamilyOptions::new()), + CF_LOCK => (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), + CF_WRITE => (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), + CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + _ => (*cf, RocksCfOptions::default()), }) .collect(); - let mut engine = RocksEngine::new( - &path, - &cfs, - Some(cfs_opts), - cache.is_some(), - self.io_rate_limiter, - None, /* CFOptions */ - )?; + let mut engine = + RocksEngine::new(&path, None, cfs_opts, cache.is_some(), self.io_rate_limiter)?; if let ApiVersion::V2 = api_version { Self::register_causal_observer(&mut engine); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index a43b5270875..aab89299641 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3101,8 +3101,7 @@ mod tests { use api_version::{test_kv_format_impl, ApiV2}; use collections::HashMap; - use engine_rocks::raw_util::CFOptions; - use engine_traits::{raw_ttl::ttl_current_ts, ALL_CFS, CF_LOCK, CF_RAFT, CF_WRITE}; + use engine_traits::{raw_ttl::ttl_current_ts, CF_LOCK, CF_RAFT, CF_WRITE}; use error_code::ErrorCodeExt; use errors::extract_key_error; use futures::executor::block_on; @@ -3232,7 +3231,10 @@ mod tests { #[test] fn test_cf_error() { // New engine lacks normal column families. 
- let engine = TestEngineBuilder::new().cfs(["foo"]).build().unwrap(); + let engine = TestEngineBuilder::new() + .cfs([CF_DEFAULT, "foo"]) + .build() + .unwrap(); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) .build() .unwrap(); @@ -3638,27 +3640,25 @@ mod tests { }; let engine = { let path = "".to_owned(); - let cfs = ALL_CFS.to_vec(); let cfg_rocksdb = db_config; let cache = BlockCacheConfig::default().build_shared_cache(); let cfs_opts = vec![ - CFOptions::new( + ( CF_DEFAULT, cfg_rocksdb .defaultcf .build_opt(&cache, None, ApiVersion::V1), ), - CFOptions::new(CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - CFOptions::new(CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), - CFOptions::new(CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), + (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), + (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), ]; RocksEngine::new( &path, - &cfs, - Some(cfs_opts), + None, + cfs_opts, cache.is_some(), None, /*io_rate_limiter*/ - None, /* CFOptions */ ) } .unwrap(); diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index 7881eb45903..eb788cb4dd3 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -480,7 +480,7 @@ mod tests { #[test] fn test_mvcc_info_collector() { - use engine_test::ctor::{CFOptions, ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; use engine_traits::SyncMutable; use txn_types::TimeStamp; @@ -495,10 +495,10 @@ mod tests { path, DBOptions::default(), vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), + (CF_DEFAULT, ColumnFamilyOptions::new()), + (CF_WRITE, ColumnFamilyOptions::new()), + (CF_LOCK, 
ColumnFamilyOptions::new()), + (CF_RAFT, ColumnFamilyOptions::new()), ], ) .unwrap(); diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 17b02c28ec9..614f8acb147 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -617,18 +617,16 @@ impl MvccReader { #[cfg(test)] pub mod tests { - use std::{ops::Bound, sync::Arc, u64}; + use std::{ops::Bound, u64}; use concurrency_manager::ConcurrencyManager; use engine_rocks::{ - properties::MvccPropertiesCollectorFactory, - raw::{ColumnFamilyOptions, DBOptions, DB}, - raw_util::CFOptions, - Compat, RocksSnapshot, + properties::MvccPropertiesCollectorFactory, RocksCfOptions, RocksDBOptions, RocksEngine, + RocksSnapshot, }; use engine_traits::{ - IterOptions, Mutable, WriteBatch, WriteBatchExt, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + CompactExt, IterOptions, MiscExt, Mutable, SyncMutable, WriteBatch, WriteBatchExt, ALL_CFS, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ kvrpcpb::{AssertionLevel, Context}, @@ -649,20 +647,20 @@ pub mod tests { }; pub struct RegionEngine { - db: Arc, + db: RocksEngine, region: Region, } impl RegionEngine { - pub fn new(db: &Arc, region: &Region) -> RegionEngine { + pub fn new(db: &RocksEngine, region: &Region) -> RegionEngine { RegionEngine { - db: Arc::clone(db), + db: db.clone(), region: region.clone(), } } pub fn snapshot(&self) -> RegionSnapshot { - let db = self.db.c().clone(); + let db = self.db.clone(); RegionSnapshot::::from_raw(db, self.region.clone()) } @@ -849,7 +847,7 @@ pub mod tests { pub fn write(&mut self, modifies: Vec) { let db = &self.db; - let mut wb = db.c().write_batch(); + let mut wb = db.write_batch(); for rev in modifies { match rev { Modify::Put(cf, k, v) => { @@ -879,22 +877,20 @@ pub mod tests { pub fn flush(&mut self) { for cf in ALL_CFS { - let cf = engine_rocks::util::get_cf_handle(&self.db, cf).unwrap(); self.db.flush_cf(cf, true).unwrap(); } } pub fn compact(&mut 
self) { for cf in ALL_CFS { - let cf = engine_rocks::util::get_cf_handle(&self.db, cf).unwrap(); - self.db.compact_range_cf(cf, None, None); + self.db.compact_range(cf, None, None, false, 1).unwrap(); } } } - pub fn open_db(path: &str, with_properties: bool) -> Arc { - let db_opts = DBOptions::new(); - let mut cf_opts = ColumnFamilyOptions::new(); + pub fn open_db(path: &str, with_properties: bool) -> RocksEngine { + let db_opt = RocksDBOptions::default(); + let mut cf_opts = RocksCfOptions::default(); cf_opts.set_write_buffer_size(32 * 1024 * 1024); if with_properties { cf_opts.add_table_properties_collector_factory( @@ -903,12 +899,12 @@ pub mod tests { ); } let cfs_opts = vec![ - CFOptions::new(CF_DEFAULT, ColumnFamilyOptions::new()), - CFOptions::new(CF_RAFT, ColumnFamilyOptions::new()), - CFOptions::new(CF_LOCK, ColumnFamilyOptions::new()), - CFOptions::new(CF_WRITE, cf_opts), + (CF_DEFAULT, RocksCfOptions::default()), + (CF_RAFT, RocksCfOptions::default()), + (CF_LOCK, RocksCfOptions::default()), + (CF_WRITE, cf_opts), ]; - Arc::new(engine_rocks::raw_util::new_engine_opt(path, db_opts, cfs_opts).unwrap()) + engine_rocks::util::new_engine_opt(path, db_opt, cfs_opts).unwrap() } pub fn make_region(id: u64, start_key: Vec, end_key: Vec) -> Region { @@ -945,7 +941,7 @@ pub mod tests { engine.put(&[12], 11, 12); engine.flush(); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db, region); let tests = vec![ // set nothing. @@ -1019,7 +1015,7 @@ pub mod tests { iopt.set_hint_min_ts(Bound::Included(1)); iopt.set_hint_max_ts(Bound::Included(6)); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db, region); let mut iter = snap.iter(CF_WRITE, iopt).unwrap(); // Must not omit the latest deletion of key1 to prevent seeing outdated record. 
@@ -1069,7 +1065,7 @@ pub mod tests { engine.prewrite_pessimistic_lock(m, k, 45); engine.commit(k, 45, 50); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); // Let's assume `50_45 PUT` means a commit version with start ts is 45 and commit ts @@ -1180,7 +1176,7 @@ pub mod tests { engine.prewrite_pessimistic_lock(m, k, 1); engine.commit(k, 1, 4); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); let (commit_ts, write_type) = reader @@ -1241,7 +1237,7 @@ pub mod tests { // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit ts // is 2. // Commit versions: [25_23 PUT, 20_10 PUT, 17_15 PUT, 7_7 Rollback, 5_1 PUT, 3_3 Rollback]. - let snap = RegionSnapshot::::from_raw(db.c().clone(), region.clone()); + let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, None, false); let k = Key::from_raw(k); @@ -1312,7 +1308,7 @@ pub mod tests { engine.prewrite(m2, k2, 1); engine.commit(k2, 1, 2); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db.clone(), region); let mut reader = MvccReader::new(snap, None, false); let (commit_ts, write) = reader @@ -1334,7 +1330,7 @@ pub mod tests { // Test seek_write touches region's end. 
let region1 = make_region(1, vec![], Key::from_raw(b"k1").into_encoded()); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region1); + let snap = RegionSnapshot::::from_raw(db, region1); let mut reader = MvccReader::new(snap, None, false); assert!(reader.seek_write(&k, 2.into()).unwrap().is_none()); @@ -1384,7 +1380,7 @@ pub mod tests { let m = Mutation::make_put(Key::from_raw(k), v.to_vec()); engine.prewrite(m, k, 24); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region); + let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit ts @@ -1524,7 +1520,7 @@ pub mod tests { limit, expect_res: &[_], expect_is_remain| { - let snap = RegionSnapshot::::from_raw(db.c().clone(), region.clone()); + let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, None, false); let res = reader .scan_locks( @@ -1691,7 +1687,7 @@ pub mod tests { for case in cases { engine.write(case.modifies); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region.clone()); + let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, case.scan_mode, false); let result = reader.load_data(&case.key, case.write); assert_eq!(format!("{:?}", result), format!("{:?}", case.expected)); @@ -1779,7 +1775,7 @@ pub mod tests { for case in cases { engine.write(case.modifies); - let snap = RegionSnapshot::::from_raw(db.c().clone(), region.clone()); + let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, None, false); let result = reader.get(&case.key, case.ts, case.gc_fence_limit); assert_eq!(format!("{:?}", result), format!("{:?}", case.expected)); @@ -1972,8 +1968,7 @@ pub mod tests { fn test_reader_prefix_seek() { let dir = tempfile::TempDir::new().unwrap(); let builder = TestEngineBuilder::new().path(dir.path()); - 
let db = builder.build().unwrap().kv_engine().get_sync_db(); - let cf = engine_rocks::util::get_cf_handle(&db, CF_WRITE).unwrap(); + let db = builder.build().unwrap().kv_engine(); let region = make_region(1, vec![], vec![]); let mut engine = RegionEngine::new(&db, ®ion); @@ -1983,24 +1978,22 @@ pub mod tests { let commit_ts = (i * 2 + 1).into(); let mut k = vec![b'z']; k.extend_from_slice(Key::from_raw(b"k1").append_ts(commit_ts).as_encoded()); - use engine_rocks::raw::Writable; - engine.db.delete_cf(cf, &k).unwrap(); + engine.db.delete_cf(CF_WRITE, &k).unwrap(); } engine.flush(); - #[allow(clippy::useless_vec)] - for (k, scan_mode, tombstones) in vec![ - (b"k0", Some(ScanMode::Forward), 99), + for (k, scan_mode, tombstones) in &[ + (b"k0" as &[u8], Some(ScanMode::Forward), 99), (b"k0", None, 0), (b"k1", Some(ScanMode::Forward), 99), (b"k1", None, 99), (b"k2", Some(ScanMode::Forward), 0), (b"k2", None, 0), ] { - let mut reader = MvccReader::new(engine.snapshot(), scan_mode, false); + let mut reader = MvccReader::new(engine.snapshot(), *scan_mode, false); let (k, ts) = (Key::from_raw(k), 199.into()); reader.seek_write(&k, ts).unwrap(); - assert_eq!(reader.statistics.write.seek_tombstone, tombstones); + assert_eq!(reader.statistics.write.seek_tombstone, *tombstones); } } } diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index edcac95aa00..056c447aced 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -989,7 +989,7 @@ impl FlowChecker { pub(super) mod tests { use std::sync::atomic::AtomicU64; - use engine_rocks::RocksColumnFamilyOptions; + use engine_rocks::RocksCfOptions; use engine_traits::{CFOptionsExt, Result}; use super::{super::FlowController, *}; @@ -1020,7 +1020,7 @@ pub(super) mod tests { } impl CFOptionsExt for EngineStub { - type ColumnFamilyOptions = RocksColumnFamilyOptions; + 
type ColumnFamilyOptions = RocksCfOptions; fn get_options_cf(&self, _cf: &str) -> Result { unimplemented!(); } diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index c861a251bba..4c94aeb1249 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -3,8 +3,8 @@ use std::sync::Arc; use crossbeam::channel::TrySendError; -use engine_rocks::{raw::DB, RocksEngine, RocksSnapshot}; -use engine_traits::{ALL_CFS, CF_DEFAULT}; +use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_traits::{KvEngine, ALL_CFS, CF_DEFAULT}; use kvproto::{ kvrpcpb::{Context, ExtraOp as TxnExtraOp}, metapb::Region, @@ -35,12 +35,12 @@ use crate::test; #[derive(Clone)] struct SyncBenchRouter { - db: Arc, + db: RocksEngine, region: Region, } impl SyncBenchRouter { - fn new(region: Region, db: Arc) -> SyncBenchRouter { + fn new(region: Region, db: RocksEngine) -> SyncBenchRouter { SyncBenchRouter { db, region } } } @@ -51,7 +51,7 @@ impl SyncBenchRouter { cmd_resp::bind_term(&mut response, 1); match cmd.callback { Callback::Read(cb) => { - let snapshot = RocksSnapshot::new(Arc::clone(&self.db)); + let snapshot = self.db.snapshot(); let region = Arc::new(self.region.to_owned()); cb(ReadResponse { response, @@ -129,18 +129,18 @@ impl LocalReadRouter for SyncBenchRouter { fn release_snapshot_cache(&self) {} } -fn new_engine() -> (TempDir, Arc) { +fn new_engine() -> (TempDir, RocksEngine) { let dir = Builder::new().prefix("bench_rafkv").tempdir().unwrap(); let path = dir.path().to_str().unwrap().to_string(); - let db = engine_rocks::raw_util::new_engine(&path, None, ALL_CFS, None).unwrap(); - (dir, Arc::new(db)) + let db = engine_rocks::util::new_engine(&path, ALL_CFS).unwrap(); + (dir, db) } // The lower limit of time a async_snapshot may take. 
#[bench] fn bench_async_snapshots_noop(b: &mut test::Bencher) { let (_dir, db) = new_engine(); - let snapshot = RocksSnapshot::new(Arc::clone(&db)); + let snapshot = db.snapshot(); let resp = ReadResponse { response: RaftCmdResponse::default(), snapshot: Some(RegionSnapshot::from_snapshot( @@ -179,10 +179,7 @@ fn bench_async_snapshot(b: &mut test::Bencher) { region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(5); let (_tmp, db) = new_engine(); - let kv = RaftKv::new( - SyncBenchRouter::new(region.clone(), db.clone()), - RocksEngine::from_db(db), - ); + let kv = RaftKv::new(SyncBenchRouter::new(region.clone(), db.clone()), db); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); @@ -211,10 +208,7 @@ fn bench_async_write(b: &mut test::Bencher) { region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(5); let (_tmp, db) = new_engine(); - let kv = RaftKv::new( - SyncBenchRouter::new(region.clone(), db.clone()), - RocksEngine::from_db(db), - ); + let kv = RaftKv::new(SyncBenchRouter::new(region.clone(), db.clone()), db); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); diff --git a/tests/benches/misc/writebatch/bench_writebatch.rs b/tests/benches/misc/writebatch/bench_writebatch.rs index 0c6e81a35ca..cde64280184 100644 --- a/tests/benches/misc/writebatch/bench_writebatch.rs +++ b/tests/benches/misc/writebatch/bench_writebatch.rs @@ -1,18 +1,12 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; - -use engine_rocks::{ - raw::{DBOptions, DB}, - RocksEngine, RocksWriteBatchVec, -}; -use engine_traits::{Mutable, WriteBatch, WriteBatchExt}; +use engine_rocks::{RocksCfOptions, RocksDBOptions, RocksEngine, RocksWriteBatchVec}; +use engine_traits::{Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT}; use tempfile::Builder; use test::Bencher; -fn writebatch(db: &Arc, round: usize, batch_keys: usize) { +fn writebatch(engine: &RocksEngine, round: usize, batch_keys: usize) { let v = b"operators are syntactic sugar for calls to methods of built-in traits"; - let engine = RocksEngine::from_db(db.clone()); for r in 0..round { let mut batch = engine.write_batch(); for i in 0..batch_keys { @@ -28,12 +22,17 @@ fn bench_writebatch_impl(b: &mut Bencher, batch_keys: usize) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = DBOptions::new(); + let mut opts = RocksDBOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); opts.enable_multi_batch_write(true); - let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); + let db = engine_rocks::util::new_engine_opt( + path.path().to_str().unwrap(), + opts, + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); let key_count = 1 << 13; let round = key_count / batch_keys; b.iter(|| { @@ -112,13 +111,17 @@ fn bench_writebatch_without_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = DBOptions::new(); + let mut opts = RocksDBOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); opts.enable_multi_batch_write(true); - let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); - let engine = RocksEngine::from_db(db); + let engine = engine_rocks::util::new_engine_opt( + path.path().to_str().unwrap(), + opts, + vec![(CF_DEFAULT, RocksCfOptions::default())], + 
) + .unwrap(); b.iter(|| { let mut wb = engine.write_batch(); fill_writebatch(&mut wb, 4096); @@ -131,13 +134,17 @@ fn bench_writebatch_with_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = DBOptions::new(); + let mut opts = RocksDBOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); opts.enable_multi_batch_write(true); - let db = Arc::new(DB::open(opts, path.path().to_str().unwrap()).unwrap()); - let engine = RocksEngine::from_db(db); + let engine = engine_rocks::util::new_engine_opt( + path.path().to_str().unwrap(), + opts, + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); b.iter(|| { let mut wb = engine.write_batch_with_cap(4096); fill_writebatch(&mut wb, 4096); diff --git a/tests/benches/raftstore/mod.rs b/tests/benches/raftstore/mod.rs index 58e674c9d11..05c602824c2 100644 --- a/tests/benches/raftstore/mod.rs +++ b/tests/benches/raftstore/mod.rs @@ -1,17 +1,17 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{fmt, sync::Arc}; +use std::fmt; use criterion::{Bencher, Criterion}; -use engine_rocks::{raw::DB, Compat}; +use engine_rocks::RocksEngine; use engine_traits::{Mutable, WriteBatch, WriteBatchExt}; use test_raftstore::*; use test_util::*; const DEFAULT_DATA_SIZE: usize = 100_000; -fn enc_write_kvs(db: &Arc, kvs: &[(Vec, Vec)]) { - let mut wb = db.c().write_batch(); +fn enc_write_kvs(db: &RocksEngine, kvs: &[(Vec, Vec)]) { + let mut wb = db.write_batch(); for &(ref k, ref v) in kvs { wb.put(&keys::data_key(k), v).unwrap(); } @@ -21,7 +21,7 @@ fn enc_write_kvs(db: &Arc, kvs: &[(Vec, Vec)]) { fn prepare_cluster(cluster: &mut Cluster, initial_kvs: &[(Vec, Vec)]) { cluster.run(); for engines in cluster.engines.values() { - enc_write_kvs(engines.kv.as_inner(), initial_kvs); + enc_write_kvs(&engines.kv, initial_kvs); } cluster.leader_of_region(1).unwrap(); } diff --git a/tests/failpoints/cases/test_async_fetch.rs b/tests/failpoints/cases/test_async_fetch.rs index c6b8a693085..638888e83e2 100644 --- a/tests/failpoints/cases/test_async_fetch.rs +++ b/tests/failpoints/cases/test_async_fetch.rs @@ -32,7 +32,7 @@ fn test_node_async_fetch() { let mut before_states = HashMap::default(); for (&id, engines) in &cluster.engines { - must_get_equal(engines.kv.as_inner(), b"k1", b"v1"); + must_get_equal(&engines.kv, b"k1", b"v1"); let mut state: RaftApplyState = engines .kv .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) @@ -88,7 +88,7 @@ fn test_node_async_fetch() { for i in 1..60u32 { let k = i.to_string().into_bytes(); let v = k.clone(); - must_get_equal(cluster.engines[&1].kv.as_inner(), &k, &v); + must_get_equal(&cluster.engines[&1].kv, &k, &v); } for i in 60..500u32 { diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index c341d801c9b..5cb7c79011f 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -9,7 +9,6 @@ use std::{ time::Duration, }; -use engine_rocks::Compat; use 
engine_traits::{Peekable, CF_RAFT}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -76,7 +75,6 @@ fn test_node_merge_rollback() { let state_key = keys::region_state_key(region.get_id()); let state: RegionLocalState = cluster .get_engine(i) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); @@ -105,7 +103,6 @@ fn test_node_merge_rollback() { let state_key = keys::region_state_key(region.get_id()); let state: RegionLocalState = cluster .get_engine(i) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); @@ -139,10 +136,10 @@ fn test_node_merge_restart() { cluster.shutdown(); let engine = cluster.get_engine(leader.get_store_id()); let state_key = keys::region_state_key(left.get_id()); - let state: RegionLocalState = engine.c().get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); + let state: RegionLocalState = engine.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Merging, "{:?}", state); let state_key = keys::region_state_key(right.get_id()); - let state: RegionLocalState = engine.c().get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); + let state: RegionLocalState = engine.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Normal, "{:?}", state); fail::remove(schedule_merge_fp); cluster.start().unwrap(); @@ -157,7 +154,6 @@ fn test_node_merge_restart() { let state_key = keys::region_state_key(left.get_id()); let state: RegionLocalState = cluster .get_engine(i) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); @@ -165,7 +161,6 @@ fn test_node_merge_restart() { let state_key = keys::region_state_key(right.get_id()); let state: RegionLocalState = cluster .get_engine(i) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index bd5003d23f2..e288828dc66 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ 
b/tests/failpoints/cases/test_replica_read.rs @@ -7,7 +7,6 @@ use std::{ }; use crossbeam::channel; -use engine_rocks::Compat; use engine_traits::{Peekable, RaftEngineReadOnly, CF_RAFT}; use futures::executor::block_on; use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState}; @@ -229,7 +228,6 @@ fn test_read_applying_snapshot() { let region_key = keys::region_state_key(r1); let region_state: RegionLocalState = cluster .get_engine(3) - .c() .get_msg_cf(CF_RAFT, ®ion_key) .unwrap() .unwrap(); diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs index e03e58bfa98..b15a43b3d35 100644 --- a/tests/failpoints/cases/test_sst_recovery.rs +++ b/tests/failpoints/cases/test_sst_recovery.rs @@ -1,17 +1,25 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{io::Write, path::Path, sync::Arc, time::Duration}; +use std::{fmt::Debug, io::Write, path::Path, sync::Arc, time::Duration}; -use engine_rocks::{ - raw::{CompactionOptions, DB}, - util::get_cf_handle, -}; +use engine_rocks::RocksEngine; use engine_rocks_helper::sst_recovery::*; -use engine_traits::CF_DEFAULT; +use engine_traits::{CompactExt, Peekable, CF_DEFAULT}; use test_raftstore::*; const CHECK_DURATION: Duration = Duration::from_millis(50); +#[track_caller] +fn assert_corruption(res: engine_traits::Result) { + match res { + Err(engine_traits::Error::Engine(s)) => { + // TODO: check code instead after using tirocks. 
+ assert!(s.state().contains("Corruption"), "{:?}", s); + } + _ => panic!("expected corruption, got {:?}", res), + } +} + #[test] fn test_sst_recovery_basic() { let (mut cluster, pd_client, engine1) = create_tikv_cluster_with_one_node_damaged(); @@ -43,19 +51,19 @@ fn test_sst_recovery_basic() { std::thread::sleep(CHECK_DURATION); - assert_eq!(&engine1.get(b"z1").unwrap().unwrap().to_owned(), b"val"); - assert_eq!(&engine1.get(b"z7").unwrap().unwrap().to_owned(), b"val"); - assert!(engine1.get(b"z4").unwrap_err().contains("Corruption")); + must_get_equal(&engine1, b"1", b"val"); + must_get_equal(&engine1, b"7", b"val"); + assert_corruption(engine1.get_value(b"z4")); fail::remove("sst_recovery_before_delete_files"); std::thread::sleep(CHECK_DURATION); - assert_eq!(&engine1.get(b"z1").unwrap().unwrap().to_owned(), b"val"); - assert_eq!(&engine1.get(b"z7").unwrap().unwrap().to_owned(), b"val"); - assert!(engine1.get(b"z4").unwrap().is_none()); + must_get_equal(&engine1, b"1", b"val"); + must_get_equal(&engine1, b"7", b"val"); + assert!(engine1.get_value(b"z4").unwrap().is_none()); // Damaged file has been deleted. - let files = engine1.get_live_files(); + let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 2); assert_eq!(store_meta.lock().unwrap().damaged_ranges.len(), 0); @@ -75,7 +83,7 @@ fn test_sst_recovery_overlap_range_sst_exist() { cluster.must_put_cf(CF_DEFAULT, b"7", b"val_1"); cluster.flush_data(); - let files = engine1.get_live_files(); + let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 4); // Remove peers for safe deletion of files in sst recovery. 
@@ -90,13 +98,13 @@ fn test_sst_recovery_overlap_range_sst_exist() { cluster.must_put_cf(CF_DEFAULT, b"4", b"val_2"); std::thread::sleep(CHECK_DURATION); - assert_eq!(&engine1.get(b"z1").unwrap().unwrap().to_owned(), b"val_1"); - assert_eq!(&engine1.get(b"z4").unwrap().unwrap().to_owned(), b"val_1"); - assert_eq!(&engine1.get(b"z7").unwrap().unwrap().to_owned(), b"val_1"); + must_get_equal(&engine1, b"1", b"val_1"); + must_get_equal(&engine1, b"4", b"val_1"); + must_get_equal(&engine1, b"7", b"val_1"); // Validate the damaged sst has been deleted. compact_files_to_target_level(&engine1, true, 3).unwrap(); - let files = engine1.get_live_files(); + let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 1); must_get_equal(&engine1, b"4", b"val_1"); @@ -119,10 +127,10 @@ fn test_sst_recovery_atomic_when_adding_peer() { pd_client.must_remove_peer(region.id, peer.clone()); std::thread::sleep(CHECK_DURATION); - assert_eq!(&engine1.get(b"z1").unwrap().unwrap().to_owned(), b"val"); - assert_eq!(&engine1.get(b"z7").unwrap().unwrap().to_owned(), b"val"); + must_get_equal(&engine1, b"1", b"val"); + must_get_equal(&engine1, b"7", b"val"); // delete file action is paused before. - assert!(engine1.get(b"z4").unwrap_err().contains("Corruption")); + assert_corruption(engine1.get_value(b"z4")); let region = cluster.get_region(b"3"); // add peer back on store 1 to validate atomic of sst recovery. @@ -148,11 +156,11 @@ fn disturb_sst_file(path: &Path) { // To trigger compaction and test background error. // set `compact_all` to `false` only compact the latest flushed file. 
fn compact_files_to_target_level( - engine: &Arc, + engine: &RocksEngine, compact_all: bool, level: i32, -) -> Result<(), String> { - let files = engine.get_live_files(); +) -> engine_traits::Result<()> { + let files = engine.as_inner().get_live_files(); let mut file_names = vec![]; if compact_all { for i in 0..files.get_files_count() { @@ -166,12 +174,11 @@ fn compact_files_to_target_level( file_names.push(name); } - let handle = get_cf_handle(engine, CF_DEFAULT).unwrap(); - engine.compact_files_cf(handle, &CompactionOptions::new(), &file_names, level) + engine.compact_files_cf(CF_DEFAULT, file_names, Some(level), 1, false) } fn create_tikv_cluster_with_one_node_damaged() --> (Cluster, Arc, Arc) { +-> (Cluster, Arc, RocksEngine) { let mut cluster = new_server_cluster(0, 3); let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); @@ -227,7 +234,7 @@ fn create_tikv_cluster_with_one_node_damaged() cluster.must_split(®ion, b"7"); // after 3 flushing and compacts, now 3 sst files exist. - let files = engine1.get_live_files(); + let files = engine1.as_inner().get_live_files(); assert_eq!(files.get_files_count(), 3); // disturb sst file range [3,5] @@ -243,11 +250,7 @@ fn create_tikv_cluster_with_one_node_damaged() disturb_sst_file(&sst_path); // The sst file is damaged, so this action will fail. 
- assert!( - compact_files_to_target_level(&engine1, true, 3) - .unwrap_err() - .contains("Corruption") - ); + assert_corruption(compact_files_to_target_level(&engine1, true, 3)); (cluster, pd_client, engine1) } diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 8f8238e27db..bae6262aeb4 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -8,7 +8,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; -use engine_traits::{Engines, ALL_CFS}; +use engine_traits::{Engines, ALL_CFS, CF_DEFAULT}; use kvproto::raft_serverpb::RaftMessage; use raftstore::{ coprocessor::CoprocessorHost, @@ -49,25 +49,12 @@ impl Transport for MockTransport { } fn create_tmp_engine(dir: &TempDir) -> Engines { - let db = Arc::new( - engine_rocks::raw_util::new_engine( - dir.path().join("db").to_str().unwrap(), - None, - ALL_CFS, - None, - ) - .unwrap(), - ); - let raft_db = Arc::new( - engine_rocks::raw_util::new_engine( - dir.path().join("raft").to_str().unwrap(), - None, - &[], - None, - ) - .unwrap(), - ); - Engines::new(RocksEngine::from_db(db), RocksEngine::from_db(raft_db)) + let db = + engine_rocks::util::new_engine(dir.path().join("db").to_str().unwrap(), ALL_CFS).unwrap(); + let raft_db = + engine_rocks::util::new_engine(dir.path().join("raft").to_str().unwrap(), &[CF_DEFAULT]) + .unwrap(); + Engines::new(db, raft_db) } fn start_raftstore( diff --git a/tests/integrations/config/dynamic/split_check.rs b/tests/integrations/config/dynamic/split_check.rs index 325ef8e9929..582ce8f115e 100644 --- a/tests/integrations/config/dynamic/split_check.rs +++ b/tests/integrations/config/dynamic/split_check.rs @@ -2,14 +2,12 @@ use std::{ path::Path, - sync::{ - mpsc::{self, sync_channel}, - Arc, - }, + sync::mpsc::{self, sync_channel}, time::Duration, }; -use engine_rocks::{raw::DB, Compat}; +use engine_rocks::RocksEngine; +use 
engine_traits::CF_DEFAULT; use raftstore::{ coprocessor::{ config::{Config, SplitCheckConfigManager}, @@ -20,22 +18,18 @@ use raftstore::{ use tikv::config::{ConfigController, Module, TiKvConfig}; use tikv_util::worker::{LazyWorker, Scheduler, Worker}; -fn tmp_engine>(path: P) -> Arc { - Arc::new( - engine_rocks::raw_util::new_engine( - path.as_ref().to_str().unwrap(), - None, - &["split-check-config"], - None, - ) - .unwrap(), +fn tmp_engine>(path: P) -> RocksEngine { + engine_rocks::util::new_engine( + path.as_ref().to_str().unwrap(), + &[CF_DEFAULT, "split-check-config"], ) + .unwrap() } -fn setup(cfg: TiKvConfig, engine: Arc) -> (ConfigController, LazyWorker) { +fn setup(cfg: TiKvConfig, engine: RocksEngine) -> (ConfigController, LazyWorker) { let (router, _) = sync_channel(1); let runner = Runner::new( - engine.c().clone(), + engine, router.clone(), CoprocessorHost::new(router, cfg.coprocessor.clone()), ); diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 058728cb0a3..f2019d04ea7 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -5,8 +5,7 @@ use std::{ }; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{Compat, RocksEngine}; -use engine_traits::{Engines, Peekable, ALL_CFS, CF_RAFT}; +use engine_traits::{Engines, Peekable, ALL_CFS, CF_DEFAULT, CF_RAFT}; use kvproto::{kvrpcpb::ApiVersion, metapb, raft_serverpb::RegionLocalState}; use raftstore::{ coprocessor::CoprocessorHost, @@ -44,19 +43,12 @@ fn test_node_bootstrap_with_prepared_data() { let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store); let simulate_trans = SimulateTransport::new(ChannelTransport::new()); let tmp_path = Builder::new().prefix("test_cluster").tempdir().unwrap(); - let engine = Arc::new( - engine_rocks::raw_util::new_engine(tmp_path.path().to_str().unwrap(), None, ALL_CFS, None) - .unwrap(), - ); + let engine = + 
engine_rocks::util::new_engine(tmp_path.path().to_str().unwrap(), ALL_CFS).unwrap(); let tmp_path_raft = tmp_path.path().join(Path::new("raft")); - let raft_engine = Arc::new( - engine_rocks::raw_util::new_engine(tmp_path_raft.to_str().unwrap(), None, &[], None) - .unwrap(), - ); - let engines = Engines::new( - RocksEngine::from_db(Arc::clone(&engine)), - RocksEngine::from_db(Arc::clone(&raft_engine)), - ); + let raft_engine = + engine_rocks::util::new_engine(tmp_path_raft.to_str().unwrap(), &[CF_DEFAULT]).unwrap(); + let engines = Engines::new(engine.clone(), raft_engine); let tmp_mgr = Builder::new().prefix("test_cluster").tempdir().unwrap(); let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); let mut node = Node::new( @@ -81,7 +73,6 @@ fn test_node_bootstrap_with_prepared_data() { let region = node.prepare_bootstrap_cluster(&engines, 1).unwrap(); assert!( engine - .c() .get_msg::(keys::PREPARE_BOOTSTRAP_KEY) .unwrap() .is_some() @@ -89,7 +80,6 @@ fn test_node_bootstrap_with_prepared_data() { let region_state_key = keys::region_state_key(region.get_id()); assert!( engine - .c() .get_msg_cf::(CF_RAFT, ®ion_state_key) .unwrap() .is_some() @@ -121,15 +111,13 @@ fn test_node_bootstrap_with_prepared_data() { ) .unwrap(); assert!( - Arc::clone(&engine) - .c() + engine .get_msg::(keys::PREPARE_BOOTSTRAP_KEY) .unwrap() .is_none() ); assert!( engine - .c() .get_msg_cf::(CF_RAFT, ®ion_state_key) .unwrap() .is_none() diff --git a/tests/integrations/raftstore/test_clear_stale_data.rs b/tests/integrations/raftstore/test_clear_stale_data.rs index b67148b473d..8010d4c956c 100644 --- a/tests/integrations/raftstore/test_clear_stale_data.rs +++ b/tests/integrations/raftstore/test_clear_stale_data.rs @@ -1,30 +1,31 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_rocks::raw::{CompactOptions, Writable, DB}; -use engine_traits::{CF_DEFAULT, CF_LOCK}; +use engine_rocks::{raw::CompactOptions, RocksEngine}; +use engine_traits::{MiscExt, Peekable, SyncMutable, CF_DEFAULT, CF_LOCK}; use test_raftstore::*; -fn init_db_with_sst_files(db: &DB, level: i32, n: u8) { +fn init_db_with_sst_files(db: &RocksEngine, level: i32, n: u8) { let mut opts = CompactOptions::new(); opts.set_change_level(true); opts.set_target_level(level); for cf_name in &[CF_DEFAULT, CF_LOCK] { - let handle = db.cf_handle(cf_name).unwrap(); + let handle = db.as_inner().cf_handle(cf_name).unwrap(); // Each SST file has only one kv. for i in 0..n { let k = keys::data_key(&[i]); - db.put_cf(handle, &k, &k).unwrap(); - db.flush_cf(handle, true).unwrap(); - db.compact_range_cf_opt(handle, &opts, None, None); + db.put_cf(cf_name, &k, &k).unwrap(); + db.flush_cf(cf_name, true).unwrap(); + db.as_inner() + .compact_range_cf_opt(handle, &opts, None, None); } } } -fn check_db_files_at_level(db: &DB, level: i32, num_files: u64) { +fn check_db_files_at_level(db: &RocksEngine, level: i32, num_files: u64) { for cf_name in &[CF_DEFAULT, CF_LOCK] { - let handle = db.cf_handle(cf_name).unwrap(); + let handle = db.as_inner().cf_handle(cf_name).unwrap(); let name = format!("rocksdb.num-files-at-level{}", level); - let value = db.get_property_int_cf(handle, &name).unwrap(); + let value = db.as_inner().get_property_int_cf(handle, &name).unwrap(); if value != num_files { panic!( "cf {} level {} should have {} files, got {}", @@ -34,11 +35,10 @@ fn check_db_files_at_level(db: &DB, level: i32, num_files: u64) { } } -fn check_kv_in_all_cfs(db: &DB, i: u8, found: bool) { +fn check_kv_in_all_cfs(db: &RocksEngine, i: u8, found: bool) { for cf_name in &[CF_DEFAULT, CF_LOCK] { - let handle = db.cf_handle(cf_name).unwrap(); let k = keys::data_key(&[i]); - let v = db.get_cf(handle, &k).unwrap(); + let v = db.get_value_cf(cf_name, &k).unwrap(); if found { assert_eq!(v.unwrap(), &k); } 
else { diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index b31b86b3bfb..13cfb535e97 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -63,8 +63,7 @@ fn test_compact_after_delete(cluster: &mut Cluster) { cluster.must_delete_cf(CF_WRITE, &k); } for engines in cluster.engines.values() { - let cf = get_cf_handle(engines.kv.as_inner(), CF_WRITE).unwrap(); - engines.kv.as_inner().flush_cf(cf, true).unwrap(); + engines.kv.flush_cf(CF_WRITE, true).unwrap(); } // wait for compaction. diff --git a/tests/integrations/raftstore/test_compact_log.rs b/tests/integrations/raftstore/test_compact_log.rs index abaa18b50fa..e7d14a6eb45 100644 --- a/tests/integrations/raftstore/test_compact_log.rs +++ b/tests/integrations/raftstore/test_compact_log.rs @@ -53,7 +53,7 @@ fn test_compact_count_limit(cluster: &mut Cluster) { let mut before_states = HashMap::default(); for (&id, engines) in &cluster.engines { - must_get_equal(engines.kv.as_inner(), b"k1", b"v1"); + must_get_equal(&engines.kv, b"k1", b"v1"); let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); let state = state.take_truncated_state(); // compact should not start @@ -119,7 +119,7 @@ fn test_compact_many_times(cluster: &mut Cluster) { let mut before_states = HashMap::default(); for (&id, engines) in &cluster.engines { - must_get_equal(engines.kv.as_inner(), b"k1", b"v1"); + must_get_equal(&engines.kv, b"k1", b"v1"); let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); let state = state.take_truncated_state(); // compact should not start @@ -190,7 +190,7 @@ fn test_compact_size_limit(cluster: &mut Cluster) { if id == 1 { continue; } - must_get_equal(engines.kv.as_inner(), b"k1", b"v1"); + must_get_equal(&engines.kv, b"k1", b"v1"); let mut state: RaftApplyState = 
get_raft_msg_or_default(engines, &keys::apply_state_key(1)); let state = state.take_truncated_state(); // compact should not start @@ -263,7 +263,7 @@ fn test_compact_reserve_max_ticks(cluster: &mut Cluster) { let mut before_states = HashMap::default(); for (&id, engines) in &cluster.engines { - must_get_equal(engines.kv.as_inner(), b"k1", b"v1"); + must_get_equal(&engines.kv, b"k1", b"v1"); let mut state: RaftApplyState = get_raft_msg_or_default(engines, &apply_key); let state = state.take_truncated_state(); // compact should not start diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index ab4166d5826..3778794387a 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -9,7 +9,6 @@ use std::{ time::Duration, }; -use engine_rocks::Compat; use engine_traits::{Peekable, CF_RAFT}; use futures::executor::block_on; use kvproto::{ @@ -176,7 +175,6 @@ fn test_pd_conf_change(cluster: &mut Cluster) { let engine_2 = cluster.get_engine(peer2.get_store_id()); assert!( engine_2 - .c() .get_value(&keys::data_key(b"k1")) .unwrap() .is_none() @@ -402,7 +400,6 @@ fn test_after_remove_itself(cluster: &mut Cluster) { for _ in 0..250 { let region: RegionLocalState = engine1 - .c() .get_msg_cf(CF_RAFT, &keys::region_state_key(r1)) .unwrap() .unwrap(); @@ -412,7 +409,6 @@ fn test_after_remove_itself(cluster: &mut Cluster) { sleep_ms(20); } let region: RegionLocalState = engine1 - .c() .get_msg_cf(CF_RAFT, &keys::region_state_key(r1)) .unwrap() .unwrap(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 4d7914429ab..1146e152681 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -2,7 +2,6 @@ use std::{iter::*, sync::*, thread, time::*}; -use engine_rocks::Compat; use engine_traits::{Peekable, CF_LOCK, CF_RAFT, CF_WRITE}; use kvproto::{ 
kvrpcpb::Context, @@ -93,7 +92,6 @@ fn test_node_base_merge() { for _ in 0..3 { state = cluster .get_engine(i) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); @@ -534,7 +532,6 @@ fn test_node_merge_brain_split() { let state_key = keys::region_state_key(left.get_id()); let state: RegionLocalState = cluster .get_engine(3) - .c() .get_msg_cf(CF_RAFT, &state_key) .unwrap() .unwrap(); diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index 00fb8f99e05..296d6f207cf 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -6,7 +6,6 @@ use std::{ time::Duration, }; -use engine_rocks::Compat; use engine_traits::Peekable; use kvproto::raft_cmdpb::RaftCmdResponse; use raft::eraftpb::MessageType; @@ -33,7 +32,7 @@ fn test_multi_base_after_bootstrap(cluster: &mut Cluster) { thread::sleep(Duration::from_millis(200)); cluster.assert_quorum( - |engine| match engine.c().get_value(&keys::data_key(key)).unwrap() { + |engine| match engine.get_value(&keys::data_key(key)).unwrap() { None => false, Some(v) => &*v == value, }, @@ -45,13 +44,7 @@ fn test_multi_base_after_bootstrap(cluster: &mut Cluster) { // sleep 200ms in case the commit packet is dropped by simulated transport. thread::sleep(Duration::from_millis(200)); - cluster.assert_quorum(|engine| { - engine - .c() - .get_value(&keys::data_key(key)) - .unwrap() - .is_none() - }); + cluster.assert_quorum(|engine| engine.get_value(&keys::data_key(key)).unwrap().is_none()); // TODO add epoch not match test cases. 
} @@ -79,12 +72,9 @@ fn test_multi_leader_crash(cluster: &mut Cluster) { cluster.must_put(key2, value2); cluster.must_delete(key1); - must_get_none( - cluster.engines[&last_leader.get_store_id()].kv.as_inner(), - key2, - ); + must_get_none(&cluster.engines[&last_leader.get_store_id()].kv, key2); must_get_equal( - cluster.engines[&last_leader.get_store_id()].kv.as_inner(), + &cluster.engines[&last_leader.get_store_id()].kv, key1, value1, ); @@ -93,14 +83,11 @@ fn test_multi_leader_crash(cluster: &mut Cluster) { cluster.run_node(last_leader.get_store_id()).unwrap(); must_get_equal( - cluster.engines[&last_leader.get_store_id()].kv.as_inner(), + &cluster.engines[&last_leader.get_store_id()].kv, key2, value2, ); - must_get_none( - cluster.engines[&last_leader.get_store_id()].kv.as_inner(), - key1, - ); + must_get_none(&cluster.engines[&last_leader.get_store_id()].kv, key1); } fn test_multi_cluster_restart(cluster: &mut Cluster) { diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 2d6657e5a90..53c56510574 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -7,7 +7,6 @@ use std::{ time::Duration, }; -use engine_rocks::Compat; use engine_traits::{Iterable, Peekable, CF_DEFAULT, CF_WRITE}; use keys::data_key; use kvproto::{metapb, pdpb, raft_cmdpb::*, raft_serverpb::RaftMessage}; @@ -295,7 +294,7 @@ fn check_cluster(cluster: &mut Cluster, k: &[u8], v: &[u8], all_ // Note that a follower can still commit the log by an empty MsgAppend // when bcast commit is disabled. A heartbeat response comes to leader // before MsgAppendResponse will trigger MsgAppend. 
- match engine.c().get_value(&keys::data_key(k)).unwrap() { + match engine.get_value(&keys::data_key(k)).unwrap() { Some(res) => assert_eq!(v, &res[..]), None => missing_count += 1, } diff --git a/tests/integrations/raftstore/test_stale_peer.rs b/tests/integrations/raftstore/test_stale_peer.rs index 92e9d6ac77b..e9edcc49966 100644 --- a/tests/integrations/raftstore/test_stale_peer.rs +++ b/tests/integrations/raftstore/test_stale_peer.rs @@ -4,7 +4,6 @@ use std::{sync::Arc, thread, time::*}; -use engine_rocks::Compat; use engine_traits::{Peekable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RegionLocalState}; use raft::eraftpb::MessageType; @@ -79,11 +78,7 @@ fn test_stale_peer_out_of_region(cluster: &mut Cluster) { must_get_none(&engine_2, key); must_get_none(&engine_2, key2); let state_key = keys::region_state_key(1); - let state: RegionLocalState = engine_2 - .c() - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state: RegionLocalState = engine_2.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } @@ -171,11 +166,7 @@ fn test_stale_peer_without_data(cluster: &mut Cluster, right_de // Before peer 4 is destroyed, a tombstone mark will be written into the engine. // So we could check the tombstone mark to make sure peer 4 is destroyed. let state_key = keys::region_state_key(new_region_id); - let state: RegionLocalState = engine3 - .c() - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); // other region should not be affected. @@ -258,11 +249,7 @@ fn test_stale_learner() { // Check not leader should fail, all data should be removed. 
must_get_none(&engine3, b"k1"); let state_key = keys::region_state_key(r1); - let state: RegionLocalState = engine3 - .c() - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } @@ -317,10 +304,6 @@ fn test_stale_learner_with_read_index() { // Stale learner should be destroyed due to interaction between leader must_get_none(&engine3, b"k1"); let state_key = keys::region_state_key(r1); - let state: RegionLocalState = engine3 - .c() - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state: RegionLocalState = engine3.get_msg_cf(CF_RAFT, &state_key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); } diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index c9f698edd65..03c0f0a82b2 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -7,6 +7,7 @@ use std::{ }; use api_version::{test_kv_format_impl, KvFormat}; +use engine_traits::MiscExt; use futures::{executor::block_on, SinkExt, StreamExt}; use grpcio::*; use kvproto::{kvrpcpb::*, pdpb::QueryKind, tikvpb::*, tikvpb_grpc::TikvClient}; diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 18a1e5a96ca..189587dea44 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -3,8 +3,7 @@ use std::{sync::Arc, thread, time::Duration}; use crossbeam::channel; -use engine_rocks::{raw::Writable, Compat}; -use engine_traits::{Iterable, Peekable, RaftEngineReadOnly, SyncMutable, CF_RAFT}; +use engine_traits::{CFNamesExt, Iterable, Peekable, RaftEngineReadOnly, SyncMutable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState, StoreIdent}; use protobuf::Message; use raft::eraftpb::MessageType; @@ -49,7 
+48,6 @@ fn test_tombstone(cluster: &mut Cluster) { let mut existing_kvs = vec![]; for cf in engine_2.cf_names() { engine_2 - .c() .scan(cf, b"", &[0xFF], false, |k, v| { existing_kvs.push((k.to_vec(), v.to_vec())); Ok(true) @@ -134,7 +132,7 @@ fn test_fast_destroy(cluster: &mut Cluster) { cluster.stop_node(3); let key = keys::region_state_key(1); - let state: RegionLocalState = engine_3.c().get_msg_cf(CF_RAFT, &key).unwrap().unwrap(); + let state: RegionLocalState = engine_3.get_msg_cf(CF_RAFT, &key).unwrap().unwrap(); assert_eq!(state.get_state(), PeerState::Tombstone); // Force add some dirty data. @@ -245,14 +243,12 @@ fn test_server_stale_meta() { let engine_3 = cluster.get_engine(3); let mut state: RegionLocalState = engine_3 - .c() .get_msg_cf(CF_RAFT, &keys::region_state_key(1)) .unwrap() .unwrap(); state.set_state(PeerState::Tombstone); engine_3 - .c() .put_msg_cf(CF_RAFT, &keys::region_state_key(1), &state) .unwrap(); cluster.clear_send_filters(); @@ -316,7 +312,7 @@ fn test_safe_tombstone_gc() { let mut state: Option = None; let timer = Instant::now(); while timer.saturating_elapsed() < Duration::from_secs(5) { - state = cluster.get_engine(4).c().get_msg_cf(CF_RAFT, &key).unwrap(); + state = cluster.get_engine(4).get_msg_cf(CF_RAFT, &key).unwrap(); if state.is_some() { break; } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 18f3f7278d5..367f38114f6 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -9,7 +9,6 @@ use std::{ use api_version::{ApiV1, ApiV1Ttl, ApiV2, KvFormat}; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{raw::Writable, Compat}; use engine_traits::{ MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, SyncMutable, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -709,7 +708,7 @@ fn test_debug_get() { let engine = cluster.get_engine(store_id); let key = keys::data_key(k); engine.put(&key, v).unwrap(); - 
assert_eq!(engine.get(&key).unwrap().unwrap(), v); + assert_eq!(engine.get_value(&key).unwrap().unwrap(), v); // Debug get let mut req = debugpb::GetRequest::default(); @@ -784,12 +783,10 @@ fn test_debug_region_info() { let mut apply_state = raft_serverpb::RaftApplyState::default(); apply_state.set_applied_index(42); kv_engine - .c() .put_msg_cf(CF_RAFT, &apply_state_key, &apply_state) .unwrap(); assert_eq!( kv_engine - .c() .get_msg_cf::(CF_RAFT, &apply_state_key) .unwrap() .unwrap(), @@ -800,12 +797,10 @@ fn test_debug_region_info() { let mut region_state = raft_serverpb::RegionLocalState::default(); region_state.set_state(raft_serverpb::PeerState::Tombstone); kv_engine - .c() .put_msg_cf(CF_RAFT, ®ion_state_key, ®ion_state) .unwrap(); assert_eq!( kv_engine - .c() .get_msg_cf::(CF_RAFT, ®ion_state_key) .unwrap() .unwrap(), @@ -844,7 +839,6 @@ fn test_debug_region_size() { let mut state = RegionLocalState::default(); state.set_region(region); engine - .c() .put_msg_cf(CF_RAFT, ®ion_state_key, &state) .unwrap(); @@ -852,8 +846,7 @@ fn test_debug_region_size() { // At lease 8 bytes for the WRITE cf. 
let (k, v) = (keys::data_key(b"kkkk_kkkk"), b"v"); for cf in &cfs { - let cf_handle = engine.cf_handle(cf).unwrap(); - engine.put_cf(cf_handle, k.as_slice(), v).unwrap(); + engine.put_cf(cf, k.as_slice(), v).unwrap(); } let mut req = debugpb::RegionSizeRequest::default(); @@ -938,8 +931,7 @@ fn test_debug_scan_mvcc() { TimeStamp::zero(), ) .to_bytes(); - let cf_handle = engine.cf_handle(CF_LOCK).unwrap(); - engine.put_cf(cf_handle, k.as_slice(), &v).unwrap(); + engine.put_cf(CF_LOCK, k.as_slice(), &v).unwrap(); } let mut req = debugpb::ScanMvccRequest::default(); diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index cfc250a8e15..cd311386769 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -2,19 +2,17 @@ use std::{ path::{Path, PathBuf}, - sync::Arc, thread, time::Duration, }; use engine_rocks::{ - raw::{IngestExternalFileOptions, Writable}, - util::{get_cf_handle, new_temp_engine}, - Compat, RocksEngine, RocksSnapshot, RocksSstWriterBuilder, + raw::IngestExternalFileOptions, util::new_temp_engine, RocksEngine, RocksSnapshot, + RocksSstWriterBuilder, }; use engine_traits::{ - CompactExt, DeleteStrategy, Engines, KvEngine, MiscExt, Range, SstWriter, SstWriterBuilder, - ALL_CFS, CF_DEFAULT, CF_WRITE, + CFOptionsExt, CompactExt, DeleteStrategy, Engines, KvEngine, MiscExt, Range, SstWriter, + SstWriterBuilder, SyncMutable, CF_DEFAULT, CF_WRITE, }; use keys::data_key; use kvproto::metapb::{Peer, Region}; @@ -64,7 +62,8 @@ fn test_turnoff_titan() { } cluster.must_flush_cf(CF_DEFAULT, true); for i in cluster.get_node_ids().into_iter() { - let db = cluster.get_engine(i); + let engine = cluster.get_engine(i); + let db = engine.as_inner(); assert_eq!( db.get_property_int("rocksdb.num-files-at-level0").unwrap(), 2 @@ -96,9 +95,8 @@ fn test_turnoff_titan() { assert_eq!(cluster.must_get(b"k1"), None); for i in cluster.get_node_ids().into_iter() { let db = 
cluster.get_engine(i); - let handle = get_cf_handle(&db, CF_DEFAULT).unwrap(); let opt = vec![("blob_run_mode", "kFallback")]; - assert!(db.set_options_cf(handle, &opt).is_ok()); + assert!(db.set_options_cf(CF_DEFAULT, &opt).is_ok()); } cluster.compact_data(); let mut all_check_pass = true; @@ -107,7 +105,8 @@ fn test_turnoff_titan() { sleep_ms(10); all_check_pass = true; for i in cluster.get_node_ids().into_iter() { - let db = cluster.get_engine(i); + let engine = cluster.get_engine(i); + let db = engine.as_inner(); if db.get_property_int("rocksdb.num-files-at-level0").unwrap() != 0 { all_check_pass = false; break; @@ -171,24 +170,9 @@ fn test_delete_files_in_range_for_titan() { let raft_path = path.path().join(Path::new("titan")); let engines = Engines::new( - RocksEngine::from_db(Arc::new( - engine_rocks::raw_util::new_engine( - path.path().to_str().unwrap(), - Some(kv_db_opts), - ALL_CFS, - Some(kv_cfs_opts), - ) - .unwrap(), - )), - RocksEngine::from_db(Arc::new( - engine_rocks::raw_util::new_engine( - raft_path.to_str().unwrap(), - None, - &[CF_DEFAULT], - None, - ) + engine_rocks::util::new_engine_opt(path.path().to_str().unwrap(), kv_db_opts, kv_cfs_opts) .unwrap(), - )), + engine_rocks::util::new_engine(raft_path.to_str().unwrap(), &[CF_DEFAULT]).unwrap(), ); // Write some mvcc keys and values into db @@ -197,37 +181,43 @@ fn test_delete_files_in_range_for_titan() { let start_ts = 7.into(); let commit_ts = 8.into(); let write = Write::new(WriteType::Put, start_ts, None); - let db = engines.kv.as_inner(); - let default_cf = db.cf_handle(CF_DEFAULT).unwrap(); - let write_cf = db.cf_handle(CF_WRITE).unwrap(); - db.put_cf( - default_cf, - &data_key(Key::from_raw(b"a").append_ts(start_ts).as_encoded()), - b"a_value", - ) - .unwrap(); - db.put_cf( - write_cf, - &data_key(Key::from_raw(b"a").append_ts(commit_ts).as_encoded()), - &write.as_ref().to_bytes(), - ) - .unwrap(); - db.put_cf( - default_cf, - 
&data_key(Key::from_raw(b"b").append_ts(start_ts).as_encoded()), - b"b_value", - ) - .unwrap(); - db.put_cf( - write_cf, - &data_key(Key::from_raw(b"b").append_ts(commit_ts).as_encoded()), - &write.as_ref().to_bytes(), - ) - .unwrap(); + engines + .kv + .put_cf( + CF_DEFAULT, + &data_key(Key::from_raw(b"a").append_ts(start_ts).as_encoded()), + b"a_value", + ) + .unwrap(); + engines + .kv + .put_cf( + CF_WRITE, + &data_key(Key::from_raw(b"a").append_ts(commit_ts).as_encoded()), + &write.as_ref().to_bytes(), + ) + .unwrap(); + engines + .kv + .put_cf( + CF_DEFAULT, + &data_key(Key::from_raw(b"b").append_ts(start_ts).as_encoded()), + b"b_value", + ) + .unwrap(); + engines + .kv + .put_cf( + CF_WRITE, + &data_key(Key::from_raw(b"b").append_ts(commit_ts).as_encoded()), + &write.as_ref().to_bytes(), + ) + .unwrap(); // Flush and compact the kvs into L6. - db.flush(true).unwrap(); - db.c().compact_files_in_range(None, None, None).unwrap(); + engines.kv.flush(true).unwrap(); + engines.kv.compact_files_in_range(None, None, None).unwrap(); + let db = engines.kv.as_inner(); let value = db.get_property_int("rocksdb.num-files-at-level0").unwrap(); assert_eq!(value, 0); let value = db.get_property_int("rocksdb.num-files-at-level6").unwrap(); @@ -247,7 +237,8 @@ fn test_delete_files_in_range_for_titan() { writer.finish().unwrap(); let mut opts = IngestExternalFileOptions::new(); opts.move_files(true); - db.ingest_external_file_cf(default_cf, &opts, &[sst_file_path.to_str().unwrap()]) + let cf_default = db.cf_handle(CF_DEFAULT).unwrap(); + db.ingest_external_file_cf(cf_default, &opts, &[sst_file_path.to_str().unwrap()]) .unwrap(); // Now the LSM structure of default cf is: @@ -265,12 +256,12 @@ fn test_delete_files_in_range_for_titan() { assert_eq!(value, 1); // Used to trigger titan gc - let db = engines.kv.as_inner(); - db.put(b"1", b"1").unwrap(); - db.flush(true).unwrap(); - db.put(b"2", b"2").unwrap(); - db.flush(true).unwrap(); - db.c() + let engine = &engines.kv; + 
engine.put(b"1", b"1").unwrap(); + engine.flush(true).unwrap(); + engine.put(b"2", b"2").unwrap(); + engine.flush(true).unwrap(); + engine .compact_files_in_range(Some(b"0"), Some(b"3"), Some(1)) .unwrap(); @@ -286,6 +277,7 @@ fn test_delete_files_in_range_for_titan() { // blob2: (1, 1) // blob3: (2, 2) // blob4: (b_7, b_value) + let db = engine.as_inner(); let value = db.get_property_int("rocksdb.num-files-at-level0").unwrap(); assert_eq!(value, 0); let value = db.get_property_int("rocksdb.num-files-at-level1").unwrap(); From 4f8f731485906465b868ee32a4ba7d550c0631e2 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 28 Jul 2022 14:27:11 +0800 Subject: [PATCH 118/676] *: limit comment width by rustfmt (#13139) close tikv/tikv#13150 limit comment width to 80 by rustfmt Signed-off-by: Connor1996 --- cmd/tikv-ctl/src/cmd.rs | 29 +- cmd/tikv-ctl/src/executor.rs | 4 +- cmd/tikv-ctl/src/main.rs | 9 +- components/api_version/src/api_v1.rs | 4 +- components/api_version/src/api_v1ttl.rs | 4 +- components/api_version/src/api_v2.rs | 19 +- components/api_version/src/lib.rs | 27 +- .../backup-stream/src/checkpoint_manager.rs | 15 +- components/backup-stream/src/endpoint.rs | 37 +- components/backup-stream/src/event_loader.rs | 34 +- .../backup-stream/src/metadata/client.rs | 20 +- components/backup-stream/src/metadata/keys.rs | 9 +- .../backup-stream/src/metadata/store/etcd.rs | 20 +- .../src/metadata/store/lazy_etcd.rs | 7 +- .../backup-stream/src/metadata/store/mod.rs | 3 +- .../src/metadata/store/slash_etc.rs | 3 +- components/backup-stream/src/observer.rs | 18 +- components/backup-stream/src/router.rs | 95 ++-- .../backup-stream/src/subscription_manager.rs | 35 +- .../backup-stream/src/subscription_track.rs | 56 +- components/backup-stream/src/utils.rs | 56 +- components/backup-stream/tests/mod.rs | 19 +- components/backup/src/endpoint.rs | 39 +- components/backup/src/softlimit.rs | 20 +- components/backup/src/utils.rs | 18 +- components/backup/src/writer.rs | 9 +- 
.../batch-system/benches/batch-system.rs | 4 +- components/batch-system/src/batch.rs | 43 +- components/batch-system/src/router.rs | 4 +- components/causal_ts/src/config.rs | 5 +- components/causal_ts/src/lib.rs | 3 +- components/causal_ts/src/observer.rs | 26 +- components/causal_ts/src/tso.rs | 23 +- components/cdc/src/channel.rs | 3 +- components/cdc/src/delegate.rs | 44 +- components/cdc/src/endpoint.rs | 38 +- components/cdc/src/initializer.rs | 13 +- components/cdc/src/metrics.rs | 6 +- components/cdc/src/observer.rs | 10 +- components/cdc/src/old_value.rs | 11 +- .../cdc/tests/failpoints/test_endpoint.rs | 16 +- components/cdc/tests/integrations/test_cdc.rs | 53 +- components/cloud/aws/src/kms.rs | 8 +- components/cloud/aws/src/s3.rs | 27 +- components/cloud/azure/src/azblob.rs | 9 +- components/cloud/gcp/src/gcs.rs | 6 +- components/cloud/src/blob.rs | 3 +- components/codec/src/buffer.rs | 24 +- components/codec/src/byte.rs | 119 +++-- components/codec/src/error.rs | 3 +- components/codec/src/number.rs | 58 ++- components/concurrency_manager/src/lib.rs | 8 +- .../concurrency_manager/src/lock_table.rs | 11 +- .../concurrency_manager/tests/memory_usage.rs | 3 +- .../coprocessor_plugin_api/src/allocator.rs | 16 +- .../coprocessor_plugin_api/src/errors.rs | 16 +- components/coprocessor_plugin_api/src/lib.rs | 25 +- .../coprocessor_plugin_api/src/plugin_api.rs | 31 +- .../coprocessor_plugin_api/src/storage_api.rs | 32 +- components/coprocessor_plugin_api/src/util.rs | 47 +- components/encryption/export/src/lib.rs | 3 +- components/encryption/src/crypter.rs | 4 +- .../encryption/src/encrypted_file/mod.rs | 4 +- components/encryption/src/file_dict_file.rs | 51 +- components/encryption/src/io.rs | 10 +- components/encryption/src/manager/mod.rs | 20 +- components/encryption/src/master_key/kms.rs | 10 +- components/encryption/src/master_key/mem.rs | 18 +- components/encryption/src/master_key/mod.rs | 5 +- components/engine_rocks/src/engine.rs | 6 +- 
components/engine_rocks/src/event_listener.rs | 2 + components/engine_rocks/src/file_system.rs | 12 +- components/engine_rocks/src/import.rs | 7 +- components/engine_rocks/src/lib.rs | 5 +- components/engine_rocks/src/misc.rs | 12 +- .../engine_rocks/src/perf_context_impl.rs | 7 +- components/engine_rocks/src/properties.rs | 12 +- components/engine_rocks/src/raft_engine.rs | 3 +- .../engine_rocks/src/range_properties.rs | 7 +- components/engine_rocks/src/rocks_metrics.rs | 8 +- components/engine_rocks/src/sst.rs | 12 +- components/engine_rocks/src/util.rs | 14 +- components/engine_rocks/src/write_batch.rs | 19 +- .../engine_rocks_helper/src/sst_recovery.rs | 10 +- components/engine_test/src/lib.rs | 3 +- components/engine_traits/src/compact.rs | 11 +- components/engine_traits/src/engine.rs | 29 +- components/engine_traits/src/lib.rs | 41 +- components/engine_traits/src/misc.rs | 34 +- components/engine_traits/src/peekable.rs | 3 +- components/engine_traits/src/perf_context.rs | 3 +- components/engine_traits/src/raft_engine.rs | 4 +- .../engine_traits/src/range_properties.rs | 3 +- .../engine_traits/src/sst_partitioner.rs | 4 +- .../external_storage/export/src/export.rs | 9 +- .../external_storage/export/src/request.rs | 3 +- components/external_storage/src/lib.rs | 6 +- components/external_storage/src/local.rs | 11 +- components/external_storage/src/request.rs | 3 +- components/file_system/src/file.rs | 3 +- .../file_system/src/io_stats/biosnoop.rs | 10 +- components/file_system/src/lib.rs | 16 +- components/file_system/src/metrics_manager.rs | 3 +- components/file_system/src/rate_limiter.rs | 80 +-- components/keys/src/lib.rs | 11 +- components/log_wrappers/src/lib.rs | 14 +- components/log_wrappers/src/test_util.rs | 7 +- components/online_config/src/lib.rs | 4 +- components/panic_hook/src/lib.rs | 3 +- components/pd_client/src/config.rs | 11 +- components/pd_client/src/feature_gate.rs | 8 +- components/pd_client/src/lib.rs | 27 +- components/pd_client/src/tso.rs 
| 53 +- components/pd_client/src/util.rs | 29 +- components/profiler/examples/prime.rs | 3 +- components/profiler/src/lib.rs | 9 +- components/profiler/src/profiler_unix.rs | 18 +- components/raft_log_engine/src/engine.rs | 3 +- components/raft_log_engine/src/lib.rs | 3 +- components/raftstore-v2/src/batch/apply.rs | 3 +- components/raftstore-v2/src/batch/store.rs | 6 +- components/raftstore-v2/src/lib.rs | 9 +- components/raftstore-v2/src/raft/peer.rs | 4 +- components/raftstore-v2/src/router/message.rs | 8 +- components/raftstore-v2/src/tablet.rs | 5 +- .../raftstore/src/coprocessor/config.rs | 3 +- .../raftstore/src/coprocessor/dispatcher.rs | 18 +- components/raftstore/src/coprocessor/mod.rs | 46 +- .../src/coprocessor/region_info_accessor.rs | 147 +++--- .../src/coprocessor/split_check/keys.rs | 10 +- .../src/coprocessor/split_check/mod.rs | 4 +- .../src/coprocessor/split_check/size.rs | 13 +- .../src/coprocessor/split_check/table.rs | 5 +- .../raftstore/src/store/async_io/write.rs | 17 +- .../src/store/async_io/write_router.rs | 31 +- components/raftstore/src/store/bootstrap.rs | 4 +- .../raftstore/src/store/compaction_guard.rs | 32 +- components/raftstore/src/store/config.rs | 49 +- .../raftstore/src/store/entry_storage.rs | 41 +- components/raftstore/src/store/fsm/apply.rs | 193 ++++--- components/raftstore/src/store/fsm/peer.rs | 477 ++++++++++-------- components/raftstore/src/store/fsm/store.rs | 80 +-- components/raftstore/src/store/msg.rs | 51 +- components/raftstore/src/store/peer.rs | 455 +++++++++-------- .../raftstore/src/store/peer_storage.rs | 74 +-- components/raftstore/src/store/read_queue.rs | 14 +- .../raftstore/src/store/region_snapshot.rs | 4 +- .../raftstore/src/store/replication_mode.rs | 11 +- components/raftstore/src/store/snap.rs | 47 +- components/raftstore/src/store/snap/io.rs | 7 +- components/raftstore/src/store/txn_ext.rs | 104 ++-- components/raftstore/src/store/util.rs | 147 +++--- .../src/store/worker/check_leader.rs | 11 +- 
.../raftstore/src/store/worker/compact.rs | 24 +- components/raftstore/src/store/worker/pd.rs | 25 +- .../raftstore/src/store/worker/raftlog_gc.rs | 5 +- components/raftstore/src/store/worker/read.rs | 23 +- .../raftstore/src/store/worker/region.rs | 30 +- .../raftstore/src/store/worker/split_check.rs | 9 +- .../src/store/worker/split_config.rs | 31 +- .../src/store/worker/split_controller.rs | 59 ++- components/resolved_ts/src/advance.rs | 16 +- components/resolved_ts/src/cmd.rs | 10 +- components/resolved_ts/src/endpoint.rs | 28 +- components/resolved_ts/src/lib.rs | 13 +- components/resolved_ts/src/observer.rs | 14 +- components/resolved_ts/src/resolver.rs | 6 +- components/resolved_ts/src/scanner.rs | 3 +- components/resource_metering/src/collector.rs | 3 +- components/resource_metering/src/lib.rs | 28 +- components/resource_metering/src/model.rs | 3 +- .../src/recorder/collector_reg.rs | 18 +- .../src/recorder/localstorage.rs | 7 +- .../resource_metering/src/recorder/mod.rs | 5 +- .../src/recorder/sub_recorder/mod.rs | 25 +- .../src/recorder/sub_recorder/summary.rs | 8 +- .../src/reporter/data_sink.rs | 5 +- .../resource_metering/src/reporter/mod.rs | 9 +- .../resource_metering/src/reporter/pubsub.rs | 5 +- .../src/reporter/single_target.rs | 8 +- .../resource_metering/tests/summary_test.rs | 2 +- components/security/src/lib.rs | 3 +- components/server/src/raft_engine_switch.rs | 7 +- components/server/src/server.rs | 45 +- components/server/src/setup.rs | 15 +- components/sst_importer/src/import_file.rs | 8 +- components/sst_importer/src/sst_importer.rs | 19 +- components/sst_importer/src/util.rs | 25 +- components/test_backup/src/lib.rs | 2 +- components/test_coprocessor/src/dag.rs | 3 +- components/test_coprocessor/src/fixture.rs | 3 +- components/test_coprocessor/src/table.rs | 6 +- components/test_raftstore/src/cluster.rs | 26 +- components/test_raftstore/src/node.rs | 4 +- components/test_raftstore/src/pd.rs | 13 +- 
components/test_raftstore/src/server.rs | 3 +- .../test_raftstore/src/transport_simulate.rs | 18 +- components/test_raftstore/src/util.rs | 8 +- components/test_util/src/lib.rs | 6 +- components/test_util/src/runner.rs | 6 +- components/tidb_query_aggr/src/impl_avg.rs | 3 +- components/tidb_query_aggr/src/impl_count.rs | 7 +- components/tidb_query_aggr/src/impl_first.rs | 13 +- .../tidb_query_aggr/src/impl_max_min.rs | 12 +- components/tidb_query_aggr/src/impl_sum.rs | 13 +- .../tidb_query_aggr/src/impl_variance.rs | 21 +- components/tidb_query_aggr/src/lib.rs | 81 +-- components/tidb_query_aggr/src/parser.rs | 29 +- components/tidb_query_aggr/src/util.rs | 9 +- components/tidb_query_codegen/src/lib.rs | 4 +- .../tidb_query_codegen/src/rpn_function.rs | 118 +++-- components/tidb_query_common/src/error.rs | 5 +- .../tidb_query_common/src/execute_stats.rs | 15 +- .../tidb_query_common/src/storage/mod.rs | 4 +- .../src/storage/ranges_iter.rs | 21 +- .../tidb_query_common/src/storage/scanner.rs | 23 +- .../src/storage/test_fixture.rs | 7 +- components/tidb_query_common/src/util.rs | 8 +- .../src/codec/batch/lazy_column.rs | 45 +- .../src/codec/batch/lazy_column_vec.rs | 23 +- .../src/codec/chunk/chunk.rs | 8 +- .../src/codec/chunk/column.rs | 5 +- .../codec/collation/collator/gbk_collation.rs | 15 +- .../codec/collation/collator/latin1_bin.rs | 3 +- .../src/codec/collation/collator/mod.rs | 6 +- .../collation/collator/utf8mb4_binary.rs | 3 +- .../collation/collator/utf8mb4_general_ci.rs | 3 +- .../collation/collator/utf8mb4_unicode_ci.rs | 3 +- .../src/codec/collation/mod.rs | 5 +- .../tidb_query_datatype/src/codec/convert.rs | 67 ++- .../src/codec/data_type/chunked_vec_bytes.rs | 10 +- .../src/codec/data_type/chunked_vec_json.rs | 11 +- .../src/codec/data_type/chunked_vec_set.rs | 3 +- .../src/codec/data_type/chunked_vec_sized.rs | 9 +- .../src/codec/data_type/logical_rows.rs | 3 +- .../src/codec/data_type/mod.rs | 23 +- .../src/codec/data_type/scalar.rs | 17 +- 
.../src/codec/data_type/vector.rs | 28 +- .../tidb_query_datatype/src/codec/datum.rs | 15 +- .../src/codec/datum_codec.rs | 5 +- .../src/codec/mysql/binary_literal.rs | 7 +- .../src/codec/mysql/charset.rs | 3 +- .../src/codec/mysql/decimal.rs | 63 ++- .../src/codec/mysql/duration.rs | 20 +- .../src/codec/mysql/json/comparison.rs | 4 +- .../src/codec/mysql/json/json_extract.rs | 7 +- .../src/codec/mysql/json/json_keys.rs | 3 +- .../src/codec/mysql/json/json_merge.rs | 3 +- .../src/codec/mysql/json/mod.rs | 13 +- .../src/codec/mysql/json/modifier.rs | 10 +- .../src/codec/mysql/json/path_expr.rs | 8 +- .../src/codec/mysql/json/serde.rs | 3 +- .../src/codec/mysql/time/extension.rs | 15 +- .../src/codec/mysql/time/mod.rs | 43 +- .../src/codec/mysql/time/tz.rs | 8 +- .../src/codec/row/v2/compat_v1.rs | 14 +- .../src/codec/row/v2/encoder_for_test.rs | 10 +- .../src/codec/row/v2/mod.rs | 3 +- .../src/codec/row/v2/row_slice.rs | 12 +- .../tidb_query_datatype/src/codec/table.rs | 12 +- .../tidb_query_datatype/src/def/eval_type.rs | 16 +- .../tidb_query_datatype/src/def/field_type.rs | 28 +- .../tidb_query_datatype/src/expr/ctx.rs | 32 +- .../src/fast_hash_aggr_executor.rs | 44 +- .../src/index_scan_executor.rs | 219 ++++---- .../tidb_query_executors/src/interface.rs | 91 ++-- components/tidb_query_executors/src/lib.rs | 11 +- .../src/projection_executor.rs | 13 +- components/tidb_query_executors/src/runner.rs | 43 +- .../src/selection_executor.rs | 43 +- .../src/simple_aggr_executor.rs | 37 +- .../src/slow_hash_aggr_executor.rs | 70 +-- .../src/stream_aggr_executor.rs | 25 +- .../src/table_scan_executor.rs | 99 ++-- .../src/top_n_executor.rs | 65 ++- .../src/util/aggr_executor.rs | 75 +-- .../src/util/hash_aggr_helper.rs | 5 +- .../src/util/mock_executor.rs | 4 +- .../tidb_query_executors/src/util/mod.rs | 4 +- .../src/util/scan_executor.rs | 28 +- components/tidb_query_expr/src/impl_cast.rs | 75 +-- .../tidb_query_expr/src/impl_compare_in.rs | 4 +- 
.../tidb_query_expr/src/impl_encryption.rs | 16 +- components/tidb_query_expr/src/impl_json.rs | 3 +- components/tidb_query_expr/src/impl_math.rs | 16 +- components/tidb_query_expr/src/impl_op.rs | 3 +- components/tidb_query_expr/src/impl_string.rs | 18 +- components/tidb_query_expr/src/impl_time.rs | 54 +- components/tidb_query_expr/src/lib.rs | 17 +- components/tidb_query_expr/src/types/expr.rs | 6 +- .../tidb_query_expr/src/types/expr_builder.rs | 28 +- .../tidb_query_expr/src/types/expr_eval.rs | 51 +- .../tidb_query_expr/src/types/function.rs | 64 ++- .../tidb_query_expr/src/types/test_util.rs | 27 +- components/tikv_alloc/src/error.rs | 3 +- components/tikv_alloc/src/lib.rs | 8 +- components/tikv_alloc/src/trace.rs | 26 +- components/tikv_kv/src/btree_engine.rs | 17 +- components/tikv_kv/src/cursor.rs | 21 +- components/tikv_kv/src/lib.rs | 46 +- components/tikv_kv/src/mock_engine.rs | 3 +- components/tikv_kv/src/rocksdb_engine.rs | 5 +- components/tikv_util/src/buffer_vec.rs | 27 +- components/tikv_util/src/callback.rs | 4 +- components/tikv_util/src/codec/bytes.rs | 38 +- components/tikv_util/src/codec/number.rs | 18 +- components/tikv_util/src/config.rs | 47 +- components/tikv_util/src/deadline.rs | 3 +- components/tikv_util/src/future.rs | 12 +- components/tikv_util/src/lib.rs | 9 +- components/tikv_util/src/log.rs | 11 +- components/tikv_util/src/logger/file_log.rs | 11 +- components/tikv_util/src/logger/formatter.rs | 18 +- components/tikv_util/src/logger/mod.rs | 20 +- components/tikv_util/src/macros.rs | 21 +- components/tikv_util/src/memory.rs | 3 +- .../tikv_util/src/metrics/process_linux.rs | 4 +- .../tikv_util/src/metrics/threads_dummy.rs | 8 +- .../tikv_util/src/metrics/threads_linux.rs | 6 +- components/tikv_util/src/mpsc/batch.rs | 17 +- components/tikv_util/src/mpsc/mod.rs | 13 +- components/tikv_util/src/stream.rs | 7 +- components/tikv_util/src/sys/cgroup.rs | 26 +- components/tikv_util/src/sys/inspector.rs | 4 +- 
components/tikv_util/src/sys/mod.rs | 7 +- components/tikv_util/src/sys/thread.rs | 13 +- components/tikv_util/src/time.rs | 7 +- components/tikv_util/src/timer.rs | 16 +- components/tikv_util/src/topn.rs | 3 +- components/tikv_util/src/worker/mod.rs | 23 +- components/tikv_util/src/worker/pool.rs | 3 +- .../tikv_util/src/yatp_pool/future_pool.rs | 14 +- .../tipb_helper/src/expr_def_builder.rs | 3 +- components/tracker/src/lib.rs | 4 +- components/txn_types/src/lock.rs | 18 +- components/txn_types/src/timestamp.rs | 23 +- components/txn_types/src/types.rs | 51 +- components/txn_types/src/write.rs | 128 ++--- fuzz/cli.rs | 5 +- fuzz/targets/mod.rs | 3 +- rustfmt.toml | 7 + src/config.rs | 141 +++--- src/coprocessor/dag/storage_impl.rs | 3 +- src/coprocessor/endpoint.rs | 81 +-- .../interceptors/concurrency_limiter.rs | 8 +- src/coprocessor/interceptors/deadline.rs | 4 +- src/coprocessor/mod.rs | 28 +- src/coprocessor/statistics/analyze.rs | 53 +- src/coprocessor/statistics/cmsketch.rs | 4 +- src/coprocessor/tracker.rs | 20 +- src/coprocessor_v2/endpoint.rs | 5 +- src/coprocessor_v2/mod.rs | 28 +- src/coprocessor_v2/plugin_registry.rs | 91 ++-- src/coprocessor_v2/raw_storage_impl.rs | 13 +- src/import/duplicate_detect.rs | 16 +- src/import/sst_service.rs | 14 +- src/server/config.rs | 11 +- src/server/debug.rs | 40 +- src/server/engine_factory.rs | 11 +- src/server/engine_factory_v2.rs | 4 +- .../gc_worker/applied_lock_collector.rs | 83 +-- src/server/gc_worker/compaction_filter.rs | 52 +- src/server/gc_worker/config.rs | 5 +- src/server/gc_worker/gc_manager.rs | 163 +++--- src/server/gc_worker/gc_worker.rs | 76 +-- .../gc_worker/rawkv_compaction_filter.rs | 27 +- src/server/load_statistics/linux.rs | 21 +- src/server/load_statistics/mod.rs | 3 +- src/server/lock_manager/client.rs | 3 +- src/server/lock_manager/config.rs | 6 +- src/server/lock_manager/deadlock.rs | 97 ++-- src/server/lock_manager/mod.rs | 19 +- src/server/lock_manager/waiter_manager.rs | 23 +- 
src/server/node.rs | 17 +- src/server/raft_client.rs | 32 +- src/server/raftkv.rs | 6 +- src/server/reset_to_version.rs | 11 +- src/server/server.rs | 3 +- src/server/service/debug.rs | 3 +- src/server/service/diagnostics/log.rs | 7 +- src/server/service/diagnostics/sys.rs | 2 +- src/server/service/kv.rs | 29 +- src/server/snap.rs | 3 +- src/server/status_server/mod.rs | 11 +- src/server/status_server/profile.rs | 6 +- src/storage/config.rs | 8 +- src/storage/errors.rs | 18 +- src/storage/kv/test_engine_builder.rs | 4 +- src/storage/lock_manager.rs | 25 +- src/storage/mod.rs | 251 +++++---- src/storage/mvcc/consistency_check.rs | 5 +- src/storage/mvcc/reader/mod.rs | 22 +- src/storage/mvcc/reader/point_getter.rs | 60 ++- src/storage/mvcc/reader/reader.rs | 126 +++-- src/storage/mvcc/reader/scanner/backward.rs | 97 ++-- src/storage/mvcc/reader/scanner/forward.rs | 107 ++-- src/storage/mvcc/reader/scanner/mod.rs | 105 ++-- src/storage/mvcc/txn.rs | 70 +-- src/storage/raw/raw_mvcc.rs | 11 +- src/storage/raw/store.rs | 18 +- src/storage/read_pool.rs | 6 +- .../txn/actions/acquire_pessimistic_lock.rs | 86 ++-- .../txn/actions/check_data_constraint.rs | 10 +- src/storage/txn/actions/check_txn_status.rs | 32 +- src/storage/txn/actions/cleanup.rs | 19 +- src/storage/txn/actions/commit.rs | 13 +- src/storage/txn/actions/mod.rs | 5 +- src/storage/txn/actions/prewrite.rs | 118 +++-- .../txn/commands/acquire_pessimistic_lock.rs | 5 +- .../txn/commands/check_secondary_locks.rs | 14 +- src/storage/txn/commands/check_txn_status.rs | 86 ++-- src/storage/txn/commands/cleanup.rs | 4 +- src/storage/txn/commands/compare_and_swap.rs | 7 +- src/storage/txn/commands/macros.rs | 14 +- src/storage/txn/commands/mod.rs | 58 ++- .../txn/commands/pessimistic_rollback.rs | 3 +- src/storage/txn/commands/prewrite.rs | 84 +-- src/storage/txn/commands/resolve_lock.rs | 11 +- src/storage/txn/commands/resolve_lock_lite.rs | 4 +- src/storage/txn/commands/rollback.rs | 4 +- 
src/storage/txn/commands/txn_heart_beat.rs | 3 +- src/storage/txn/flow_controller/mod.rs | 4 +- .../singleton_flow_controller.rs | 50 +- .../flow_controller/tablet_flow_controller.rs | 4 +- src/storage/txn/latch.rs | 56 +- src/storage/txn/sched_pool.rs | 3 +- src/storage/txn/scheduler.rs | 145 +++--- src/storage/txn/store.rs | 18 +- src/storage/types.rs | 10 +- .../coprocessor_executors/hash_aggr/mod.rs | 12 +- .../coprocessor_executors/hash_aggr/util.rs | 4 +- .../index_scan/fixture.rs | 4 +- .../coprocessor_executors/index_scan/mod.rs | 12 +- .../coprocessor_executors/integrated/mod.rs | 32 +- .../coprocessor_executors/selection/util.rs | 3 +- .../coprocessor_executors/simple_aggr/util.rs | 4 +- .../coprocessor_executors/stream_aggr/mod.rs | 8 +- .../coprocessor_executors/stream_aggr/util.rs | 4 +- .../table_scan/fixture.rs | 10 +- .../coprocessor_executors/table_scan/mod.rs | 30 +- .../coprocessor_executors/util/fixture.rs | 44 +- .../benches/coprocessor_executors/util/mod.rs | 4 +- .../coprocessor_executors/util/store.rs | 3 +- tests/benches/hierarchy/engine/mod.rs | 2 +- tests/benches/misc/storage/incremental_get.rs | 4 +- tests/failpoints/cases/test_async_fetch.rs | 7 +- .../cases/test_cmd_epoch_checker.rs | 11 +- tests/failpoints/cases/test_conf_change.rs | 3 +- tests/failpoints/cases/test_coprocessor.rs | 9 +- tests/failpoints/cases/test_disk_full.rs | 19 +- tests/failpoints/cases/test_early_apply.rs | 16 +- tests/failpoints/cases/test_encryption.rs | 7 +- tests/failpoints/cases/test_gc_worker.rs | 59 ++- tests/failpoints/cases/test_hibernate.rs | 14 +- tests/failpoints/cases/test_import_service.rs | 19 +- tests/failpoints/cases/test_kv_service.rs | 10 +- .../cases/test_memory_usage_limit.rs | 3 +- tests/failpoints/cases/test_merge.rs | 151 +++--- tests/failpoints/cases/test_pending_peers.rs | 4 +- tests/failpoints/cases/test_rawkv.rs | 6 +- tests/failpoints/cases/test_replica_read.rs | 77 +-- .../cases/test_replica_stale_read.rs | 95 ++-- 
tests/failpoints/cases/test_server.rs | 8 +- tests/failpoints/cases/test_snap.rs | 46 +- tests/failpoints/cases/test_split_region.rs | 61 ++- tests/failpoints/cases/test_sst_recovery.rs | 3 +- tests/failpoints/cases/test_stale_peer.rs | 18 +- tests/failpoints/cases/test_stale_read.rs | 14 +- tests/failpoints/cases/test_storage.rs | 17 +- tests/failpoints/cases/test_transaction.rs | 28 +- .../failpoints/cases/test_transfer_leader.rs | 7 +- .../failpoints/cases/test_unsafe_recovery.rs | 11 +- tests/integrations/backup/mod.rs | 14 +- .../integrations/config/test_config_client.rs | 3 +- tests/integrations/coprocessor/test_select.rs | 36 +- tests/integrations/pd/test_rpc_client.rs | 10 +- .../integrations/raftstore/test_bootstrap.rs | 14 +- .../raftstore/test_compact_lock_cf.rs | 6 +- .../raftstore/test_compact_log.rs | 12 +- .../raftstore/test_conf_change.rs | 16 +- .../raftstore/test_early_apply.rs | 11 +- .../integrations/raftstore/test_hibernate.rs | 23 +- .../integrations/raftstore/test_lease_read.rs | 79 +-- tests/integrations/raftstore/test_merge.rs | 124 +++-- tests/integrations/raftstore/test_multi.rs | 34 +- tests/integrations/raftstore/test_prevote.rs | 28 +- .../raftstore/test_region_change_observer.rs | 6 +- .../raftstore/test_region_heartbeat.rs | 6 +- .../raftstore/test_region_info_accessor.rs | 3 +- .../raftstore/test_replica_read.rs | 35 +- .../raftstore/test_replication_mode.rs | 19 +- tests/integrations/raftstore/test_snap.rs | 29 +- .../raftstore/test_split_region.rs | 33 +- .../integrations/raftstore/test_stale_peer.rs | 51 +- .../integrations/raftstore/test_tombstone.rs | 6 +- .../raftstore/test_transfer_leader.rs | 3 +- .../raftstore/test_unsafe_recovery.rs | 65 ++- tests/integrations/server/gc_worker.rs | 7 +- tests/integrations/server/kv_service.rs | 13 +- tests/integrations/server/lock_manager.rs | 23 +- tests/integrations/server/raft_client.rs | 18 +- tests/integrations/storage/test_storage.rs | 9 +- 502 files changed, 7245 insertions(+), 5434 
deletions(-) diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 74cc69034fc..7f459a4c127 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -373,7 +373,8 @@ pub enum Cmd { /// Skip write RocksDB read_only: bool, }, - /// Unsafely recover when the store can not start normally, this recover may lose data + /// Unsafely recover when the store can not start normally, this recover may + /// lose data UnsafeRecover { #[structopt(subcommand)] cmd: UnsafeRecoverCmd, @@ -404,7 +405,9 @@ pub enum Cmd { default_value = crate::executor::METRICS_PROMETHEUS, possible_values = &["prometheus", "jemalloc", "rocksdb_raft", "rocksdb_kv"], )] - /// Set the metrics tag, one of prometheus/jemalloc/rocksdb_raft/rocksdb_kv, if not specified, print prometheus + /// Set the metrics tag + /// Options: prometheus/jemalloc/rocksdb_raft/rocksdb_kv + /// If not specified, print prometheus tag: Vec, }, /// Force a consistency-check for a specified region @@ -415,10 +418,13 @@ pub enum Cmd { }, /// Get all regions with corrupt raft BadRegions {}, - /// Modify tikv config, eg. tikv-ctl --host ip:port modify-tikv-config -n rocksdb.defaultcf.disable-auto-compactions -v true + /// Modify tikv config. + /// Eg. tikv-ctl --host ip:port modify-tikv-config -n + /// rocksdb.defaultcf.disable-auto-compactions -v true ModifyTikvConfig { #[structopt(short = "n")] - /// The config name are same as the name used on config file, eg. raftstore.messages-per-tick, raftdb.max-background-jobs + /// The config name are same as the name used on config file. + /// eg. 
raftstore.messages-per-tick, raftdb.max-background-jobs config_name: String, #[structopt(short = "v")] @@ -431,7 +437,8 @@ pub enum Cmd { /// Output meta file path file: String, }, - /// Compact the whole cluster in a specified range in one or more column families + /// Compact the whole cluster in a specified range in one or more column + /// families CompactCluster { #[structopt( short = "d", @@ -449,7 +456,8 @@ pub enum Cmd { default_value = CF_DEFAULT, possible_values = &["default", "lock", "write"], )] - /// Column family names, for kv db, combine from default/lock/write; for raft db, can only be default + /// Column family names, for kv db, combine from default/lock/write; for + /// raft db, can only be default cf: Vec, #[structopt( @@ -529,12 +537,14 @@ pub enum Cmd { #[structopt(subcommand)] cmd: EncryptionMetaCmd, }, - /// Delete encryption keys that are no longer associated with physical files. + /// Delete encryption keys that are no longer associated with physical + /// files. CleanupEncryptionMeta {}, /// Print bad ssts related infos BadSsts { #[structopt(long)] - /// specify manifest, if not set, it will look up manifest file in db path + /// specify manifest, if not set, it will look up manifest file in db + /// path manifest: Option, #[structopt(long, value_delimiter = ",")] @@ -604,7 +614,8 @@ pub enum RaftCmd { pub enum FailCmd { /// Inject failures Inject { - /// Inject fail point and actions pairs. E.g. tikv-ctl fail inject a=off b=panic + /// Inject fail point and actions pairs. + /// E.g. 
tikv-ctl fail inject a=off b=panic args: Vec, #[structopt(short = "f")] diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 19977924e69..62ce325a130 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -64,7 +64,7 @@ pub fn new_debug_executor( let cache = cfg.storage.block_cache.build_shared_cache(); let shared_block_cache = cache.is_some(); let env = cfg - .build_shared_rocks_env(key_manager.clone(), None /*io_rate_limiter*/) + .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); let mut kv_db_opts = cfg.rocksdb.build_opt(); @@ -105,7 +105,7 @@ pub fn new_debug_executor( error!("raft engine not exists: {}", config.dir); tikv_util::logger::exit_process_gracefully(-1); } - let raft_db = RaftLogEngine::new(config, key_manager, None /*io_rate_limiter*/).unwrap(); + let raft_db = RaftLogEngine::new(config, key_manager, None /* io_rate_limiter */).unwrap(); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 67834db9c5d..00094af8dc6 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -675,7 +675,7 @@ fn run_ldb_command(args: Vec, cfg: &TiKvConfig) { let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); - let env = get_env(key_manager, None /*io_rate_limiter*/).unwrap(); + let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); let mut opts = cfg.rocksdb.build_opt(); opts.set_env(env); @@ -735,7 +735,9 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, for line in corruptions.lines() { println!("--------------------------------------------------------"); // The corruption format may like this: + // ```text // /path/to/db/057155.sst is corrupted: Corruption: block checksum mismatch: expected 3754995957, got 708533950 in 
/path/to/db/057155.sst offset 3126049 size 22724 + // ``` println!("corruption info:\n{}", line); let r = Regex::new(r"/\w*\.sst").unwrap(); @@ -795,8 +797,10 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, println!("\nsst meta:"); // The output may like this: + // ```text // --------------- Column family "write" (ID 2) -------------- // 63:132906243[3555338 .. 3555338]['7A311B40EFCC2CB4C5911ECF3937D728DED26AE53FA5E61BE04F23F2BE54EACC73' seq:3555338, type:1 .. '7A313030302E25CD5F57252E' seq:3555338, type:1] at level 0 + // ``` let column_r = Regex::new(r"--------------- (.*) --------------\n(.*)").unwrap(); if let Some(m) = column_r.captures(&output) { println!( @@ -848,7 +852,8 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, println!("unexpected key {}", log_wrappers::Value(&start)); } } else { - // it is expected when the sst is output of a compaction and the sst isn't added to manifest yet. + // it is expected when the sst is output of a compaction and the sst isn't added + // to manifest yet. 
println!( "sst {} is not found in manifest: {}", sst_file_number, output diff --git a/components/api_version/src/api_v1.rs b/components/api_version/src/api_v1.rs index 5b980ea75f1..1530124d245 100644 --- a/components/api_version/src/api_v1.rs +++ b/components/api_version/src/api_v1.rs @@ -45,7 +45,7 @@ impl KvFormat for ApiV1 { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), // reject apiv2 -> apiv1 conversion + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), /* reject apiv2 -> apiv1 conversion */ } } @@ -56,7 +56,7 @@ impl KvFormat for ApiV1 { ) -> Result<(Vec, Vec)> { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), - ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), // reject apiv2 -> apiv1 conversion + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1")), /* reject apiv2 -> apiv1 conversion */ } } } diff --git a/components/api_version/src/api_v1ttl.rs b/components/api_version/src/api_v1ttl.rs index 65c7f569aa6..2a2df6bfb33 100644 --- a/components/api_version/src/api_v1ttl.rs +++ b/components/api_version/src/api_v1ttl.rs @@ -70,7 +70,7 @@ impl KvFormat for ApiV1Ttl { ) -> Result { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok(Key::from_encoded_slice(key)), - ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), // reject apiv2 -> apiv1ttl conversion + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), /* reject apiv2 -> apiv1ttl conversion */ } } @@ -81,7 +81,7 @@ impl KvFormat for ApiV1Ttl { ) -> Result<(Vec, Vec)> { match src_api { ApiVersion::V1 | ApiVersion::V1ttl => Ok((start_key, end_key)), - ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to v1ttl")), // reject apiv2 -> apiv1ttl conversion + ApiVersion::V2 => Err(box_err!("unsupported conversion from v2 to 
v1ttl")), /* reject apiv2 -> apiv1ttl conversion */ } } } diff --git a/components/api_version/src/api_v2.rs b/components/api_version/src/api_v2.rs index a8a177596ad..712804b3b3a 100644 --- a/components/api_version/src/api_v2.rs +++ b/components/api_version/src/api_v2.rs @@ -143,8 +143,8 @@ impl KvFormat for ApiV2 { } // Note: `user_key` may not be `KeyMode::Raw`. - // E.g., `raw_xxx_range` interfaces accept an exclusive end key just beyond the scope of raw keys. - // The validity is ensured by client & Storage interfaces. + // E.g. `raw_xxx_range` interfaces accept an exclusive end key just beyond the + // scope of raw keys. The validity is ensured by client & Storage interfaces. fn encode_raw_key(user_key: &[u8], ts: Option) -> Key { let encoded_key = Key::from_raw(user_key); if let Some(ts) = ts { @@ -156,13 +156,14 @@ impl KvFormat for ApiV2 { } // Note: `user_key` may not be `KeyMode::Raw`. - // E.g., `raw_xxx_range` interfaces accept an exclusive end key just beyond the scope of raw keys. - // The validity is ensured by client & Storage interfaces. + // E.g. `raw_xxx_range` interfaces accept an exclusive end key just beyond the + // scope of raw keys. The validity is ensured by client & Storage interfaces. fn encode_raw_key_owned(mut user_key: Vec, ts: Option) -> Key { let src_len = user_key.len(); let encoded_len = MemComparableByteCodec::encoded_len(src_len); - // always reserve more U64_SIZE for ts, as it's likely to "append_ts" later, especially in raw write procedures. + // always reserve more U64_SIZE for ts, as it's likely to "append_ts" later, + // especially in raw write procedures. user_key.reserve(encoded_len - src_len + number::U64_SIZE); user_key.resize(encoded_len, 0u8); MemComparableByteCodec::encode_all_in_place(&mut user_key, src_len); @@ -248,8 +249,8 @@ impl ApiV2 { } // Note: `encoded_bytes` may not be `KeyMode::Raw`. -// E.g., backup service accept an exclusive end key just beyond the scope of raw keys. 
-// The validity is ensured by client & Storage interfaces. +// E.g., backup service accept an exclusive end key just beyond the scope of raw +// keys. The validity is ensured by client & Storage interfaces. #[inline] fn is_valid_encoded_bytes(mut encoded_bytes: &[u8], with_ts: bool) -> bool { bytes::decode_bytes(&mut encoded_bytes, false).is_ok() @@ -261,8 +262,8 @@ fn is_valid_encoded_key(encoded_key: &Key, with_ts: bool) -> bool { is_valid_encoded_bytes(encoded_key.as_encoded(), with_ts) } -/// TimeStamp::zero is not acceptable, as such entries can not be retrieved by RawKV MVCC. -/// See `RawMvccSnapshot::seek_first_key_value_cf`. +/// TimeStamp::zero is not acceptable, as such entries can not be retrieved by +/// RawKV MVCC. See `RawMvccSnapshot::seek_first_key_value_cf`. #[inline] fn is_valid_ts(ts: TimeStamp) -> bool { !ts.is_zero() diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index 0dbdc833b86..60f23455cc7 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -18,13 +18,15 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { const CLIENT_TAG: ApiVersion; const IS_TTL_ENABLED: bool; - /// Parse the key prefix and infer key mode. It's safe to parse either raw key or encoded key. + /// Parse the key prefix and infer key mode. It's safe to parse either raw + /// key or encoded key. fn parse_key_mode(key: &[u8]) -> KeyMode; fn parse_range_mode(range: (Option<&[u8]>, Option<&[u8]>)) -> KeyMode; /// Parse from the bytes from storage. fn decode_raw_value(bytes: &[u8]) -> Result>; - /// This is equivalent to `decode_raw_value()` but returns the owned user value. + /// This is equivalent to `decode_raw_value()` but returns the owned user + /// value. 
fn decode_raw_value_owned(mut bytes: Vec) -> Result>> { let (len, expire_ts, is_delete) = { let raw_value = Self::decode_raw_value(&bytes)?; @@ -47,8 +49,8 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { /// This is equivalent to `encode_raw_value` but reduced an allocation. fn encode_raw_value_owned(value: RawValue>) -> Vec; - /// Parse from the txn_types::Key from storage. Default implementation for API V1|V1TTL. - /// Return: (user key, optional timestamp) + /// Parse from the txn_types::Key from storage. Default implementation for + /// API V1|V1TTL. Return: (user key, optional timestamp) fn decode_raw_key(encoded_key: &Key, _with_ts: bool) -> Result<(Vec, Option)> { Ok((encoded_key.as_encoded().clone(), None)) } @@ -59,7 +61,8 @@ pub trait KvFormat: Clone + Copy + 'static + Send + Sync { ) -> Result<(Vec, Option)> { Ok((encoded_key.into_encoded(), None)) } - /// Encode the user key & optional timestamp into txn_types::Key. Default implementation for API V1|V1TTL. + /// Encode the user key & optional timestamp into txn_types::Key. Default + /// implementation for API V1|V1TTL. fn encode_raw_key(user_key: &[u8], _ts: Option) -> Key { Key::from_encoded_slice(user_key) } @@ -138,7 +141,8 @@ macro_rules! match_template_api_version { }} } -/// Dispatch an expression with type `kvproto::kvrpcpb::ApiVersion` to corresponding concrete type of `KvFormat` +/// Dispatch an expression with type `kvproto::kvrpcpb::ApiVersion` to +/// corresponding concrete type of `KvFormat` /// /// For example, the following code /// @@ -197,8 +201,8 @@ pub enum KeyMode { /// /// ### ApiVersion::V1ttl /// -/// 8 bytes representing the unix timestamp in seconds for expiring time will be append -/// to the value of all RawKV kv pairs. +/// 8 bytes representing the unix timestamp in seconds for expiring time will be +/// append to the value of all RawKV kv pairs. 
/// /// ```text /// ------------------------------------------------------------ @@ -221,8 +225,8 @@ pub enum KeyMode { /// ``` /// /// As shown in the example below, the least significant bit of the meta flag -/// indicates whether the value contains 8 bytes expire ts at the very left to the -/// meta flags. +/// indicates whether the value contains 8 bytes expire ts at the very left to +/// the meta flags. /// /// ```text /// -------------------------------------------------------------------------------- @@ -235,7 +239,8 @@ pub enum KeyMode { pub struct RawValue> { /// The user value. pub user_value: T, - /// The unix timestamp in seconds indicating the point of time that this key will be deleted. + /// The unix timestamp in seconds indicating the point of time that this key + /// will be deleted. pub expire_ts: Option, /// Logical deletion flag in ApiV2, should be `false` in ApiV1 and ApiV1Ttl pub is_delete: bool, diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 96e330f956d..7dae680fa05 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -20,8 +20,8 @@ use crate::{ /// A manager for maintaining the last flush ts. /// This information is provided for the `advancer` in checkpoint V3, -/// which involved a central node (typically TiDB) for collecting all regions' checkpoint -/// then advancing the global checkpoint. +/// which involved a central node (typically TiDB) for collecting all regions' +/// checkpoint then advancing the global checkpoint. #[derive(Debug, Default)] pub struct CheckpointManager { items: HashMap, @@ -165,11 +165,13 @@ pub struct LastFlushTsOfRegion { pub trait FlushObserver: Send + 'static { /// The callback when the flush has advanced the resolver. async fn before(&mut self, checkpoints: Vec<(Region, TimeStamp)>); - /// The callback when the flush is done. 
(Files are fully written to external storage.) + /// The callback when the flush is done. (Files are fully written to + /// external storage.) async fn after(&mut self, task: &str, rts: u64) -> Result<()>; /// The optional callback to rewrite the resolved ts of this flush. - /// Because the default method (collect all leader resolved ts in the store, and use the minimal TS.) - /// may lead to resolved ts rolling back, if we desire a stronger consistency, we can rewrite a safer resolved ts here. + /// Because the default method (collect all leader resolved ts in the store, + /// and use the minimal TS.) may lead to resolved ts rolling back, if we + /// desire a stronger consistency, we can rewrite a safer resolved ts here. /// Note the new resolved ts cannot be greater than the old resolved ts. async fn rewrite_resolved_ts( &mut self, @@ -282,7 +284,8 @@ where } } // Optionally upload the region checkpoint. - // Unless in some extreme condition, skipping upload the region checkpoint won't lead to data loss. + // Unless in some extreme condition, skipping upload the region checkpoint won't + // lead to data loss. if let Err(err) = self .meta_cli .upload_region_checkpoint(task, &self.checkpoints) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 2defb88b541..ff1e2a4e66c 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -63,7 +63,8 @@ use crate::{ }; const SLOW_EVENT_THRESHOLD: f64 = 120.0; -/// CHECKPOINT_SAFEPOINT_TTL_IF_ERROR specifies the safe point TTL(24 hour) if task has fatal error. +/// CHECKPOINT_SAFEPOINT_TTL_IF_ERROR specifies the safe point TTL(24 hour) if +/// task has fatal error. const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; pub struct Endpoint { @@ -255,7 +256,8 @@ where async fn starts_flush_ticks(router: Router) { loop { // check every 5s. 
- // TODO: maybe use global timer handle in the `tikv_utils::timer` (instead of enabling timing in the current runtime)? + // TODO: maybe use global timer handle in the `tikv_utils::timer` (instead of + // enabling timing in the current runtime)? tokio::time::sleep(Duration::from_secs(5)).await; debug!("backup stream trigger flush tick"); router.tick().await; @@ -415,7 +417,8 @@ where } } - /// Convert a batch of events to the cmd batch, and update the resolver status. + /// Convert a batch of events to the cmd batch, and update the resolver + /// status. fn record_batch(subs: SubscriptionTracer, batch: CmdBatch) -> Option { let region_id = batch.region_id; let mut resolver = match subs.get_subscription_of(region_id) { @@ -425,7 +428,9 @@ where return None; } }; - // Stale data is accpetable, while stale locks may block the checkpoint advancing. + // Stale data is acceptable, while stale locks may block the checkpoint + // advancing. + // ```text // Let L be the instant some key locked, U be the instant it unlocked, // +---------*-------L-----------U--*-------------+ // ^ ^----(1)----^ ^ We get the snapshot for initial scanning at here. @@ -434,6 +439,7 @@ where // ...note that (1) is the last cmd batch of first observing, so the unlock event would never be sent to us. // ...then the lock would get an eternal life in the resolver :| // (Before we refreshing the resolver for this region again) + // ``` if batch.pitr_id != resolver.value().handle.id { debug!("stale command"; "region_id" => %region_id, "now" => ?resolver.value().handle.id, "remote" => ?batch.pitr_id); return None; @@ -529,8 +535,10 @@ where "end_key" => utils::redact(&end_key), ); } - // Assuming the `region info provider` would read region info form `StoreMeta` directly and this would be fast. - // If this gets slow, maybe make it async again. (Will that bring race conditions? say `Start` handled after `ResfreshResolver` of some region.) 
+ // Assuming the `region info provider` would read region info form `StoreMeta` + // directly and this would be fast. If this gets slow, maybe make it async + // again. (Will that bring race conditions? say `Start` handled after + // `ResfreshResolver` of some region.) let range_init_result = init.initialize_range(start_key.clone(), end_key.clone()); match range_init_result { Ok(()) => { @@ -680,7 +688,8 @@ where ); } - /// unload a task from memory: this would stop observe the changes required by the task temporarily. + /// unload a task from memory: this would stop observe the changes required + /// by the task temporarily. fn unload_task(&self, task: &str) -> Option { let router = self.range_router.clone(); @@ -988,8 +997,9 @@ pub enum Task { /// FatalError pauses the task and set the error. FatalError(TaskSelector, Box), /// Run the callback when see this message. Only for test usage. - /// NOTE: Those messages for testing are not guared by `#[cfg(test)]` for now, because - /// the integration test would not enable test config when compiling (why?) + /// NOTE: Those messages for testing are not guarded by `#[cfg(test)]` for + /// now, because the integration test would not enable test config when + /// compiling (why?) Sync( // Run the closure if ... Box, @@ -998,8 +1008,9 @@ pub enum Task { ), /// Mark the store as a failover store. /// This would prevent store from updating its checkpoint ts for a while. - /// Because we are not sure whether the regions in the store have new leader -- - /// we keep a safe checkpoint so they can choose a safe `from_ts` for initial scanning. + /// Because we are not sure whether the regions in the store have new leader + /// -- we keep a safe checkpoint so they can choose a safe `from_ts` for + /// initial scanning. MarkFailover(Instant), /// Flush the task with name. Flush(String), @@ -1032,8 +1043,8 @@ pub enum ObserveOp { }, /// Destroy the region subscription. 
/// Unlike `Stop`, this will assume the region would never go back. - /// For now, the effect of "never go back" is that we won't try to hint other store - /// the checkpoint ts of this region. + /// For now, the effect of "never go back" is that we won't try to hint + /// other store the checkpoint ts of this region. Destroy { region: Region, }, diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 40e0ab5c60b..05b370e2985 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -110,7 +110,8 @@ impl EventLoader { }) } - /// Scan a batch of events from the snapshot, and save them into the internal buffer. + /// Scan a batch of events from the snapshot, and save them into the + /// internal buffer. fn fill_entries(&mut self) -> Result { assert!( self.entry_batch.is_empty(), @@ -278,11 +279,13 @@ where /// and return the current snapshot of that region. fn observe_over(&self, region: &Region, cmd: ChangeObserver) -> Result { // There are 2 ways for getting the initial snapshot of a region: - // 1. the BR method: use the interface in the RaftKv interface, read the key-values directly. - // 2. the CDC method: use the raftstore message `SignificantMsg::CaptureChange` to - // register the region to CDC observer and get a snapshot at the same time. - // Registering the observer to the raftstore is necessary because we should only listen events from leader. - // In CDC, the change observer is per-delegate(i.e. per-region), we can create the command per-region here too. + // - the BR method: use the interface in the RaftKv interface, read the + // key-values directly. + // - the CDC method: use the raftstore message `SignificantMsg::CaptureChange` + // to register the region to CDC observer and get a snapshot at the same time. + // Registering the observer to the raftstore is necessary because we should only + // listen events from leader. 
In CDC, the change observer is + // per-delegate(i.e. per-region), we can create the command per-region here too. let (callback, fut) = tikv_util::future::paired_future_callback::>(); @@ -351,7 +354,8 @@ where raftstore::store::util::compare_region_epoch( region.get_region_epoch(), &v.value().meta, - // No need for checking conf version because conf change won't cancel the observation. + // No need for checking conf version because conf change won't cancel the + // observation. false, true, false, @@ -359,8 +363,8 @@ where Ok(v) }) .map_err(|err| Error::Contextual { - // Both when we cannot find the region in the track and - // the epoch has changed means that we should cancel the current turn of initial scanning. + // Both when we cannot find the region in the track and the epoch has changed means + // that we should cancel the current turn of initial scanning. inner_error: Box::new(Error::ObserveCanceled( region_id, region.get_region_epoch().clone(), @@ -446,7 +450,8 @@ where Ok(stats) } - /// initialize a range: it simply scan the regions with leader role and send them to [`initialize_region`]. + /// initialize a range: it simply scan the regions with leader role and send + /// them to [`initialize_region`]. pub fn initialize_range(&self, start_key: Vec, end_key: Vec) -> Result<()> { let mut pager = RegionPager::scan_from(self.regions.clone(), start_key, end_key); loop { @@ -456,10 +461,11 @@ where break; } for r in regions { - // Note: Even we did the initial scanning, and blocking resolved ts from advancing, - // if the next_backup_ts was updated in some extreme condition, there is still little chance to lost data: - // For example, if a region cannot elect the leader for long time. (say, net work partition) - // At that time, we have nowhere to record the lock status of this region. 
+ // Note: Even we did the initial scanning, and blocking resolved ts from + // advancing, if the next_backup_ts was updated in some extreme condition, there + // is still little chance to lost data: For example, if a region cannot elect + // the leader for long time. (say, net work partition) At that time, we have + // nowhere to record the lock status of this region. let success = try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Start { region: r.region }) diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index dc21f86b526..2732952930c 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -244,7 +244,8 @@ impl MetadataClient { } /// Initialize a task: execute some general operations over the keys. - /// For now, it sets the checkpoint ts if there isn't one for the current store. + /// For now, it sets the checkpoint ts if there isn't one for the current + /// store. pub async fn init_task(&self, task: &StreamBackupTaskInfo) -> Result<()> { let if_present = Condition::new( MetaKey::next_backup_ts_of(&task.name, self.store_id), @@ -263,7 +264,8 @@ impl MetadataClient { } /// Upload the last error information to the etcd. - /// This won't pause the task. Even this method would usually be paired with `pause`. + /// This won't pause the task. Even this method would usually be paired with + /// `pause`. pub async fn report_last_error(&self, name: &str, last_error: StreamBackupError) -> Result<()> { use protobuf::Message; let now = Instant::now(); @@ -376,7 +378,8 @@ impl MetadataClient { } /// watch event stream from the revision(exclusive). - /// the revision would usually come from a WithRevision struct(which indices the revision of the inner item). + /// the revision would usually come from a WithRevision struct(which indices + /// the revision of the inner item). 
pub async fn events_from(&self, revision: i64) -> Result> { let watcher = self .meta_store @@ -436,7 +439,8 @@ impl MetadataClient { Ok(()) } - /// Get the storage checkpoint from metadata. This function is justly used for test. + /// Get the storage checkpoint from metadata. This function is justly used + /// for test. pub async fn get_storage_checkpoint(&self, task_name: &str) -> Result { let now = Instant::now(); defer! { @@ -508,8 +512,8 @@ impl MetadataClient { }) } - /// Perform a two-phase bisection search algorithm for the intersection of all ranges - /// and the specificated range (usually region range.) + /// Perform a two-phase bisection search algorithm for the intersection of + /// all ranges and the specificated range (usually region range.) /// TODO: explain the algorithm? pub async fn range_overlap_of_task( &self, @@ -637,8 +641,8 @@ impl MetadataClient { } /// insert a task with ranges into the metadata store. - /// the current abstraction of metadata store doesn't support transaction API. - /// Hence this function is non-transactional and only for testing. + /// the current abstraction of metadata store doesn't support transaction + /// API. Hence this function is non-transactional and only for testing. pub async fn insert_task_with_range( &self, task: &StreamTask, diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index 4db978c2cb6..32962ec36b0 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -9,7 +9,8 @@ const PATH_STORAGE_CHECKPOINT: &str = "/storage-checkpoint"; const PATH_RANGES: &str = "/ranges"; const PATH_PAUSE: &str = "/pause"; const PATH_LAST_ERROR: &str = "/last-error"; -// Note: maybe use something like `const_fmt` for concatenating constant strings? +// Note: maybe use something like `const_fmt` for concatenating constant +// strings? 
const TASKS_PREFIX: &str = "/tidb/br-stream/info/"; /// A key that associates to some metadata. @@ -60,7 +61,8 @@ impl KeyValue { } /// Take the start-key and end-key from a metadata key-value pair. - /// example: `KeyValue(/ranges/, ) -> (, )` + /// example: `KeyValue(/ranges/, ) -> + /// (, )` pub fn take_range(&mut self, task_name: &str) -> (Vec, Vec) { let prefix_len = MetaKey::ranges_prefix_len(task_name); (self.take_key()[prefix_len..].to_vec(), self.take_value()) @@ -160,7 +162,8 @@ impl MetaKey { Self(format!("{}{}/{}/{}", PREFIX, PATH_LAST_ERROR, name, store).into_bytes()) } - /// return the key that keeps the range [self, self.next()) contains only `self`. + /// return the key that keeps the range [self, self.next()) contains only + /// `self`. pub fn next(&self) -> Self { let mut next = self.clone(); next.0.push(0); diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs index 2b940c905cd..556661700f9 100644 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ b/components/backup-stream/src/metadata/store/etcd.rs @@ -61,13 +61,14 @@ impl From for KvEventType { impl From for KeyValue { fn from(kv: etcd_client::KeyValue) -> Self { - // TODO: we can move out the vector in the KeyValue struct here. (instead of copying.) - // But that isn't possible for now because: + // TODO: we can move out the vector in the KeyValue struct here. (instead of + // copying.) But that isn't possible for now because: // - The raw KV pair(defined by the protocol buffer of etcd) is private. - // - That did could be exported by `pub-fields` feature of the client. - // However that feature isn't published in theirs Cargo.toml (Is that a mistake?). - // - Indeed, we can use `mem::transmute` here because `etcd_client::KeyValue` has `#[repr(transparent)]`. - // But before here become a known bottle neck, I'm not sure whether it's worthwhile for involving unsafe code. 
+ // - That did could be exported by `pub-fields` feature of the client. However + // that feature isn't published in theirs Cargo.toml (Is that a mistake?). + // - Indeed, we can use `mem::transmute` here because `etcd_client::KeyValue` + // has `#[repr(transparent)]`. But before here become a known bottle neck, I'm + // not sure whether it's worthwhile for involving unsafe code. KeyValue(MetaKey(kv.key().to_owned()), kv.value().to_owned()) } } @@ -75,7 +76,7 @@ impl From for KeyValue { /// Prepare the etcd options required by the keys. /// Return the start key for requesting. macro_rules! prepare_opt { - ($opt: ident, $keys: expr) => { + ($opt:ident, $keys:expr) => { match $keys { Keys::Prefix(key) => { $opt = $opt.with_prefix(); @@ -203,7 +204,7 @@ impl EtcdStore { Compare::value(cond.over_key, op, cond.arg) } - /// Convert the transcation operations to etcd transcation ops. + /// Convert the transaction operations to etcd transaction ops. fn to_txn(ops: &mut [super::TransactionOp], leases: &HashMap) -> Vec { ops.iter_mut().map(|op| match op { TransactionOp::Put(key, opt) => { @@ -234,7 +235,8 @@ impl EtcdStore { /// Make a conditional txn. /// For now, this wouldn't split huge transaction into smaller ones, - /// so when playing with etcd in PD, conditional transaction should be small. + /// so when playing with etcd in PD, conditional transaction should be + /// small. 
async fn make_conditional_txn( cli: &mut Client, mut txn: super::CondTransaction, diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 7e1858b913e..97573ab756e 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -105,9 +105,10 @@ impl LazyEtcdClientInner { async fn connect(&self) -> Result { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control - // how to create channels when connecting, hence we cannot update the tls config at runtime. - // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt the `SecurityManager` API, - // instead of doing everything by own. + // how to create channels when connecting, hence we cannot update the tls config + // at runtime. + // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt + // the `SecurityManager` API, instead of doing everything by own. etcd_client::Client::connect(self.endpoints.clone(), Some(self.opt.clone())) }) .await diff --git a/components/backup-stream/src/metadata/store/mod.rs b/components/backup-stream/src/metadata/store/mod.rs index 0855582da59..e5d1f03e715 100644 --- a/components/backup-stream/src/metadata/store/mod.rs +++ b/components/backup-stream/src/metadata/store/mod.rs @@ -48,7 +48,8 @@ impl Condition { } /// A conditional transaction. -/// This would atomicly evalute the condition, and execute corresponding transaction. +/// This would atomically evaluate the condition, and execute corresponding +/// transaction. 
#[derive(Debug)] pub struct CondTransaction { cond: Condition, diff --git a/components/backup-stream/src/metadata/store/slash_etc.rs b/components/backup-stream/src/metadata/store/slash_etc.rs index 1a2f127501c..2ae4c05dfaf 100644 --- a/components/backup-stream/src/metadata/store/slash_etc.rs +++ b/components/backup-stream/src/metadata/store/slash_etc.rs @@ -83,7 +83,8 @@ impl Snapshot for WithRevision { } // use iterator operations (instead of collect all kv pairs in the range) - // if the test case get too slow. (How can we figure out whether there are more?) + // if the test case get too slow. (How can we figure out whether there are + // more?) let more = if extra.limit > 0 { let more = kvs.len() > extra.limit; kvs.truncate(extra.limit); diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index ad22b67e145..36c310d3532 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -19,13 +19,15 @@ use crate::{ }; /// The inflight `StartObserve` message count. -/// Currently, we handle the `StartObserve` message in the main loop(endpoint thread), which may -/// take longer time than expected. So when we are starting to observe many region (e.g. failover), -/// there may be many pending messages, those messages won't block the advancing of checkpoint ts. -/// So the checkpoint ts may be too late and losing some data. +/// Currently, we handle the `StartObserve` message in the main loop(endpoint +/// thread), which may take longer time than expected. So when we are starting +/// to observe many region (e.g. failover), there may be many pending messages, +/// those messages won't block the advancing of checkpoint ts. So the checkpoint +/// ts may be too late and losing some data. 
/// -/// This is a temporary solution for this problem: If this greater than (1), then it implies that there are some -/// inflight wait-for-initialized regions, we should block the resolved ts from advancing in that condition. +/// This is a temporary solution for this problem: If this is greater than (1), +/// then it implies that there are some inflight wait-for-initialized regions, +/// we should block the resolved ts from advancing in that condition. /// /// FIXME: Move handler of `ModifyObserve` to another thread, and remove this :( pub static IN_FLIGHT_START_OBSERVE_MESSAGE: AtomicUsize = AtomicUsize::new(0); @@ -99,8 +101,8 @@ impl BackupStreamObserver { impl Coprocessor for BackupStreamObserver {} impl CmdObserver for BackupStreamObserver { - // `BackupStreamObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` is not empty - // and only leader will trigger this. + // `BackupStreamObserver::on_flush_applied_cmd_batch` should only be invoked if + // `cmd_batches` is not empty and only leader will trigger this. fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index b236cefde77..05e49d232a9 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -58,9 +58,10 @@ use crate::{ const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 30; -/// FLUSH_LOG_CONCURRENT_BATCH_COUNT specifies the concurrent count to write to storage. -/// 'Log backup' will produce a large mount of small files during flush interval, -/// and storage could take mistaken if writing all of these files to storage concurrently. +/// FLUSH_LOG_CONCURRENT_BATCH_COUNT specifies the concurrent count to write to +/// storage. 'Log backup' will produce a large amount of small files during flush +/// interval, and storage could make mistakes if writing all of these files to +/// storage concurrently. 
const FLUSH_LOG_CONCURRENT_BATCH_COUNT: usize = 128; #[derive(Clone, Debug)] @@ -124,10 +125,11 @@ pub struct ApplyEvents { } impl ApplyEvents { - /// Convert a [CmdBatch] to a vector of events. Ignoring admin / error commands. - /// At the same time, advancing status of the `Resolver` by those keys. - /// Note: the resolved ts cannot be advanced if there is no command, - /// maybe we also need to update resolved_ts when flushing? + /// Convert a [CmdBatch] to a vector of events. Ignoring admin / error + /// commands. At the same time, advancing status of the `Resolver` by + /// those keys. + /// Note: the resolved ts cannot be advanced if there is no command, maybe + /// we also need to update resolved_ts when flushing? pub fn from_cmd_batch(cmd: CmdBatch, resolver: &mut TwoPhaseResolver) -> Self { let region_id = cmd.region_id; let mut result = vec![]; @@ -241,7 +243,8 @@ impl ApplyEvents { >::borrow(&item).clone(), ApplyEvents { events: { - // assuming the keys in the same region would probably be in one group. + // assuming the keys in the same region would probably be in one + // group. let mut v = Vec::with_capacity(event_len); v.push(event); v @@ -336,7 +339,8 @@ pub struct RouterInner { /// The temporary directory for all tasks. prefix: PathBuf, - /// The handle to Endpoint, we should send `Flush` to endpoint if there are too many temporary files. + /// The handle to Endpoint, we should send `Flush` to endpoint if there are + /// too many temporary files. scheduler: Scheduler, /// The size limit of temporary file per task. temp_file_size_limit: u64, @@ -371,8 +375,9 @@ impl RouterInner { } } - /// Find the task for a region. If `end_key` is empty, search from start_key to +inf. - /// It simply search for a random possible overlapping range and get its task. + /// Find the task for a region. If `end_key` is empty, search from start_key + /// to +inf. It simply search for a random possible overlapping range and + /// get its task. 
/// FIXME: If a region crosses many tasks, this can only find one of them. pub fn find_task_by_range(&self, start_key: &[u8], mut end_key: &[u8]) -> Option { let r = self.ranges.rl(); @@ -384,11 +389,13 @@ impl RouterInner { } /// Register some ranges associated to some task. - /// Because the observer interface yields encoded data key, the key should be ENCODED DATA KEY too. - /// (i.e. encoded by `Key::from_raw(key).into_encoded()`, [`utils::wrap_key`] could be a shortcut.). - /// We keep ranges in memory to filter kv events not in these ranges. + /// Because the observer interface yields encoded data key, the key should + /// be ENCODED DATA KEY too. (i.e. encoded by + /// `Key::from_raw(key).into_encoded()`, [`utils::wrap_key`] could be + /// a shortcut.). We keep ranges in memory to filter kv events not in + /// these ranges. fn register_ranges(&self, task_name: &str, ranges: Vec<(Vec, Vec)>) { - // TODO reigister ranges to filter kv event + // TODO register ranges to filter kv event // register ranges has two main purpose. // 1. filter kv event that no need to backup // 2. route kv event to the corresponding file. @@ -494,9 +501,9 @@ impl RouterInner { let task_info = self.get_task_info(&task).await?; task_info.on_events(events).await?; - // When this event make the size of temporary files exceeds the size limit, make a flush. - // Note that we only flush if the size is less than the limit before the event, - // or we may send multiplied flush requests. + // When this event make the size of temporary files exceeds the size limit, make + // a flush. Note that we only flush if the size is less than the limit before + // the event, or we may send multiplied flush requests. debug!( "backup stream statics size"; "task" => ?task, @@ -526,8 +533,8 @@ impl RouterInner { futures::future::join_all(tasks).await } - /// flush the specified task, once once success, return the min resolved ts of this flush. - /// returns `None` if failed. 
+ /// flush the specified task, once once success, return the min resolved ts + /// of this flush. returns `None` if failed. pub async fn do_flush( &self, task_name: &str, @@ -586,7 +593,8 @@ impl RouterInner { error!("backup stream schedule task failed"; "error" => ?e); } - // if stream task need flush this time, schedule Task::Flush, or update time justly. + // if stream task need flush this time, schedule Task::Flush, or update time + // justly. if task_info.should_flush() && task_info.set_flushing_status_cas(false, true).is_ok() { info!( "backup stream trigger flush task by tick"; @@ -618,14 +626,16 @@ pub enum FormatType { } impl TempFileKey { - /// Create the key for an event. The key can be used to find which temporary file the event should be stored. + /// Create the key for an event. The key can be used to find which temporary + /// file the event should be stored. fn of(kv: &ApplyEvent, region_id: u64) -> Self { let table_id = if kv.is_meta() { // Force table id of meta key be zero. 0 } else { - // When we cannot extract the table key, use 0 for the table key(perhaps we insert meta key here.). - // Can we elide the copy here(or at least, take a slice of key instead of decoding the whole key)? + // When we cannot extract the table key, use 0 for the table key(perhaps we + // insert meta key here.). Can we elide the copy here(or at least, + // take a slice of key instead of decoding the whole key)? Key::from_encoded_slice(&kv.key) .into_raw() .ok() @@ -700,11 +710,14 @@ impl TempFileKey { } /// path_to_log_file specifies the path of record log. - /// eg. 
"v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log" + /// ```text + /// v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log + /// ``` fn path_to_log_file(&self, store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( "v1/{}/{}/{}/t{:08}/{:012}-{}.log", - // We may delete a range of files, so using the max_ts for preventing remove some records wrong. + // We may delete a range of files, so using the max_ts for preventing remove some + // records wrong. Self::format_date_time(max_ts, FormatType::Date), Self::format_date_time(max_ts, FormatType::Hour), store_id, @@ -715,7 +728,9 @@ impl TempFileKey { } /// path_to_schema_file specifies the path of schema log. - /// eg. "v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log" + /// ```text + /// v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log + /// ``` fn path_to_schema_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( "v1/{}/{}/{}/schema-meta/{:012}-{}.log", @@ -744,7 +759,8 @@ pub struct StreamTaskInfo { ranges: Vec<(Vec, Vec)>, /// The parent directory of temporary files. temp_dir: PathBuf, - /// The temporary file index. Both meta (m prefixed keys) and data (t prefixed keys). + /// The temporary file index. Both meta (m prefixed keys) and data (t + /// prefixed keys). files: SlotMap, /// flushing_files contains files pending flush. flushing_files: RwLock, DataFileInfo)>>, @@ -756,9 +772,10 @@ pub struct StreamTaskInfo { min_resolved_ts: TimeStamp, /// Total size of all temporary files in byte. total_size: AtomicUsize, - /// This should only be set to `true` by `compare_and_set(current=false, value=ture)`. - /// The thread who setting it to `true` takes the responsibility of sending the request to the - /// scheduler for flushing the files then. 
+ /// This should only be set to `true` by `compare_and_set(current=false, + /// value=true)`. The thread who setting it to `true` takes the + /// responsibility of sending the request to the scheduler for flushing + /// the files then. /// /// If the request failed, that thread can set it to `false` back then. flushing: AtomicBool, @@ -837,7 +854,8 @@ impl StreamTaskInfo { let mut w = self.files.write().await; // double check before insert. there may be someone already insert that // when we are waiting for the write lock. - // slience the lint advising us to use the `Entry` API which may introduce copying. + // silence the lint advising us to use the `Entry` API which may introduce + // copying. #[allow(clippy::map_entry)] if !w.contains_key(&key) { let path = self.temp_dir.join(key.temp_file_name()); @@ -918,8 +936,9 @@ impl StreamTaskInfo { } pub fn should_flush(&self) -> bool { - // When it doesn't flush since 0.8x of auto-flush interval, we get ready to start flushing. - // So that we will get a buffer for the cost of actual flushing. + // When it doesn't flush since 0.8x of auto-flush interval, we get ready to + // start flushing. So that we will get a buffer for the cost of actual + // flushing. self.get_last_flush_time().saturating_elapsed_secs() >= self.flush_interval.as_secs_f64() * 0.8 } @@ -930,7 +949,8 @@ impl StreamTaskInfo { /// move need-flushing files to flushing_files. pub async fn move_to_flushing_files(&self, store_id: u64) -> Result<&Self> { - // if flushing_files is not empty, which represents this flush is a retry operation. + // if flushing_files is not empty, which represents this flush is a retry + // operation. if !self.flushing_files.read().await.is_empty() { return Ok(self); } @@ -1032,8 +1052,9 @@ impl StreamTaskInfo { /// execute the flush: copy local files to external storage. /// if success, return the last resolved ts of this flush. 
- /// The caller can try to advance the resolved ts and provide it to the function, - /// and we would use max(resolved_ts_provided, resolved_ts_from_file). + /// The caller can try to advance the resolved ts and provide it to the + /// function, and we would use `max(resolved_ts_provided, + /// resolved_ts_from_file)`. pub async fn do_flush( &self, store_id: u64, diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 0b415f95bf6..4555bdbf4ff 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -62,10 +62,10 @@ pub struct ResolvedRegions { } impl ResolvedRegions { - /// compose the calculated global checkpoint and region checkpoints. - /// note: maybe we can compute the global checkpoint internal and getting the interface clear. - /// however we must take the `min_ts` or we cannot provide valid global checkpoint if there - /// isn't any region checkpoint. + /// Compose the calculated global checkpoint and region checkpoints. + /// Note: Maybe we can compute the global checkpoint internal and getting + /// the interface clear. However we must take the `min_ts` or we cannot + /// provide valid global checkpoint if there isn't any region checkpoint. pub fn new(checkpoint: TimeStamp, checkpoints: Vec<(Region, TimeStamp)>) -> Self { Self { items: checkpoints, @@ -128,7 +128,8 @@ where handle: ObserveHandle, ) -> Result { let region_id = region.get_id(); - // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep retrying here? + // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep + // retrying here? let snap = self.observe_over_with_retry(region, move || { ChangeObserver::from_pitr(region_id, handle.clone()) })?; @@ -222,9 +223,11 @@ fn scan_executor_loop( } /// spawn the executors in the scan pool. 
-/// we make workers thread instead of spawn scan task directly into the pool because the [`InitialDataLoader`] isn't `Sync` hence -/// we must use it very carefully or rustc (along with tokio) would complain that we made a `!Send` future. -/// so we have moved the data loader to the synchronous context so its reference won't be shared between threads any more. +/// we make workers thread instead of spawn scan task directly into the pool +/// because the [`InitialDataLoader`] isn't `Sync` hence we must use it very +/// carefully or rustc (along with tokio) would complain that we made a `!Send` +/// future. so we have moved the data loader to the synchronous context so its +/// reference won't be shared between threads any more. fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> ScanPoolHandle { let (tx, rx) = crossbeam::channel::bounded(MESSAGE_BUFFER_SIZE); let pool = create_scan_pool(number); @@ -280,8 +283,9 @@ const MESSAGE_BUFFER_SIZE: usize = 4096; /// The operator for region subscription. /// It make a queue for operations over the `SubscriptionTracer`, generally, -/// we should only modify the `SubscriptionTracer` itself (i.e. insert records, remove records) at here. -/// So the order subscription / desubscription won't be broken. +/// we should only modify the `SubscriptionTracer` itself (i.e. insert records, +/// remove records) at here. So the order subscription / desubscription won't be +/// broken. pub struct RegionSubscriptionManager { // Note: these fields appear everywhere, maybe make them a `context` type? regions: R, @@ -337,7 +341,8 @@ where /// /// # returns /// - /// a two-tuple, the first is the handle to the manager, the second is the operator loop future. + /// a two-tuple, the first is the handle to the manager, the second is the + /// operator loop future. 
pub fn start( initial_loader: InitialDataLoader, observer: BackupStreamObserver, @@ -453,7 +458,8 @@ where } let cps = self.subs.resolve_with(min_ts); let min_region = cps.iter().min_by_key(|(_, rts)| rts); - // If there isn't any region observed, the `min_ts` can be used as resolved ts safely. + // If there isn't any region observed, the `min_ts` can be used as resolved ts + // safely. let rts = min_region.map(|(_, rts)| *rts).unwrap_or(min_ts); info!("getting checkpoint"; "defined_by_region" => ?min_region.map(|r| r.0.get_id()), "checkpoint" => %rts); self.subs.warn_if_gap_too_huge(rts); @@ -621,8 +627,9 @@ where fn spawn_scan(&self, cmd: ScanCmd) { // we should not spawn initial scanning tasks to the tokio blocking pool // because it is also used for converting sync File I/O to async. (for now!) - // In that condition, if we blocking for some resources(for example, the `MemoryQuota`) - // at the block threads, we may meet some ghosty deadlock. + // In that condition, if we blocking for some resources(for example, the + // `MemoryQuota`) at the block threads, we may meet some ghosty + // deadlock. let s = self.scan_pool_handle.request(cmd); if let Err(err) = s { let region_id = err.0.region.get_id(); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 30063089804..aa9f35705fb 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -17,7 +17,8 @@ pub struct SubscriptionTracer(Arc>); #[derive(Debug, Eq, PartialEq, Clone, Copy)] pub enum SubscriptionState { - /// When it is newly added (maybe after split or leader transfered from other store), without any flush. + /// When it is newly added (maybe after split or leader transfered from + /// other store), without any flush. Fresh, /// It has been flushed, and running normally. Normal, @@ -95,8 +96,9 @@ impl SubscriptionTracer { // Register a region as tracing. 
// The `start_ts` is used to tracking the progress of initial scanning. - // (Note: the `None` case of `start_ts` is for testing / refresh region status when split / merge, - // maybe we'd better provide some special API for those cases and remove the `Option`?) + // Note: the `None` case of `start_ts` is for testing / refresh region status + // when split / merge, maybe we'd better provide some special API for those + // cases and remove the `Option`? pub fn register_region( &self, region: &Region, @@ -132,7 +134,7 @@ impl SubscriptionTracer { pub fn warn_if_gap_too_huge(&self, ts: TimeStamp) { let gap = TimeStamp::physical_now() - ts.physical(); if gap >= 10 * 60 * 1000 - /* 10 mins */ + // 10 mins { let far_resolver = self .0 @@ -155,7 +157,8 @@ impl SubscriptionTracer { } /// try to mark a region no longer be tracked by this observer. - /// returns whether success (it failed if the region hasn't been observed when calling this.) + /// returns whether success (it failed if the region hasn't been observed + /// when calling this.) pub fn deregister_region_if( &self, region: &Region, @@ -170,9 +173,12 @@ impl SubscriptionTracer { // use this method to check whether a key exists: // ``` // let mut present = false; - // deregister_region_if(42, |..| { present = true; }); + // deregister_region_if(42, |..| { + // present = true; + // }); // ``` - // At that time, if we call the callback with stale value, the called may get false positive. + // At that time, if we call the callback with stale value, the called may get + // false positive. if o.state == SubscriptionState::Removal { return false; } @@ -195,7 +201,8 @@ impl SubscriptionTracer { /// /// # return /// - /// Whether the status can be updated internally without deregister-and-register. + /// Whether the status can be updated internally without + /// deregister-and-register. 
pub fn try_update_region(&self, new_region: &Region) -> bool { let mut sub = match self.get_subscription_of(new_region.get_id()) { Some(sub) => sub, @@ -282,7 +289,8 @@ impl SubscriptionTracer { } /// This enhanced version of `Resolver` allow some unordered lock events. -/// The name "2-phase" means this is used for 2 *concurrency* phases of observing a region: +/// The name "2-phase" means this is used for 2 *concurrency* phases of +/// observing a region: /// 1. Doing the initial scanning. /// 2. Listening at the incremental data. /// @@ -294,25 +302,31 @@ impl SubscriptionTracer { /// +-> Phase 1: Initial scanning scans writes between start ts and now. /// ``` /// -/// In backup-stream, we execute these two tasks parallel. Which may make some race conditions: -/// - When doing initial scanning, there may be a flush triggered, but the default resolver -/// would probably resolved to the tip of incremental events. -/// - When doing initial scanning, we meet and track a lock already meet by the incremental events, -/// then the default resolver cannot untrack this lock any more. +/// In backup-stream, we execute these two tasks parallel. Which may make some +/// race conditions: +/// - When doing initial scanning, there may be a flush triggered, but the +/// default resolver would probably resolved to the tip of incremental events. +/// - When doing initial scanning, we meet and track a lock already meet by the +/// incremental events, then the default resolver cannot untrack this lock any +/// more. /// /// This version of resolver did some change for solve these problems: -/// - The resolver won't advance the resolved ts to greater than `stable_ts` if there is some. This -/// can help us prevent resolved ts from advancing when initial scanning hasn't finished yet. -/// - When we `untrack` a lock haven't been tracked, this would record it, and skip this lock if we want to track it then. 
-/// This would be safe because: +/// - The resolver won't advance the resolved ts to greater than `stable_ts` if +/// there is some. This can help us prevent resolved ts from advancing when +/// initial scanning hasn't finished yet. +/// - When we `untrack` a lock haven't been tracked, this would record it, and +/// skip this lock if we want to track it then. This would be safe because: /// - untracking a lock not be tracked is no-op for now. -/// - tracking a lock have already being untracked (unordered call of `track` and `untrack`) wouldn't happen at phase 2 for same region. -/// but only when phase 1 and phase 2 happened concurrently, at that time, we wouldn't and cannot advance the resolved ts. +/// - tracking a lock have already being untracked (unordered call of `track` +/// and `untrack`) wouldn't happen at phase 2 for same region. but only when +/// phase 1 and phase 2 happened concurrently, at that time, we wouldn't and +/// cannot advance the resolved ts. pub struct TwoPhaseResolver { resolver: Resolver, future_locks: Vec, /// When `Some`, is the start ts of the initial scanning. - /// And implies the phase 1 (initial scanning) is keep running asynchronously. + /// And implies the phase 1 (initial scanning) is keep running + /// asynchronously. stable_ts: Option, } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 5aed8f55f7f..89f21567801 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -46,8 +46,9 @@ pub fn wrap_key(v: Vec) -> Vec { } /// Transform a str to a [`engine_traits::CfName`]\(`&'static str`). -/// If the argument isn't one of `""`, `"DEFAULT"`, `"default"`, `"WRITE"`, `"write"`, `"LOCK"`, `"lock"`... -/// returns "ERR_CF". (Which would be ignored then.) +/// If the argument isn't one of `""`, `"DEFAULT"`, `"default"`, `"WRITE"`, +/// `"write"`, `"LOCK"`, `"lock"`... returns "ERR_CF". (Which would be ignored +/// then.) 
pub fn cf_name(s: &str) -> CfName { match s { "" | "DEFAULT" | "default" => CF_DEFAULT, @@ -149,7 +150,8 @@ pub type Slot = Mutex; /// NOTE: Maybe we can use dashmap for replacing the RwLock. pub type SlotMap = RwLock, S>>; -/// Like `..=val`(a.k.a. `RangeToInclusive`), but allows `val` being a reference to DSTs. +/// Like `..=val`(a.k.a. `RangeToInclusive`), but allows `val` being a reference +/// to DSTs. struct RangeToInclusiveRef<'a, T: ?Sized>(&'a T); impl<'a, T: ?Sized> RangeBounds for RangeToInclusiveRef<'a, T> { @@ -191,7 +193,8 @@ pub type SegmentSet = SegmentMap; impl SegmentMap { /// Try to add a element into the segment tree, with default value. - /// (This is useful when using the segment tree as a `Set`, i.e. `SegmentMap`) + /// (This is useful when using the segment tree as a `Set`, i.e. + /// `SegmentMap`) /// /// - If no overlapping, insert the range into the tree and returns `true`. /// - If overlapping detected, do nothing and return `false`. @@ -267,8 +270,8 @@ impl SegmentMap { return Some(overlap_with_start); } // |--s----+-----+----e----| - // Otherwise, the possibility of being overlapping would be there are some sub range - // of the queried range... + // Otherwise, the possibility of being overlapping would be there are some sub + // range of the queried range... // |--s----+----e----+-----| // ...Or the end key is contained by some Range. // For faster query, we merged the two cases together. @@ -286,7 +289,8 @@ impl SegmentMap { covered_by_the_range.map(|(k, v)| (k, &v.range_end, &v.item)) } - /// Check whether the range is overlapping with any range in the segment tree. + /// Check whether the range is overlapping with any range in the segment + /// tree. pub fn is_overlapping(&self, range: (&R, &R)) -> bool where K: Borrow, @@ -301,8 +305,8 @@ impl SegmentMap { } /// transform a [`RaftCmdRequest`] to `(key, value, cf)` triple. 
-/// once it contains a write request, extract it, and return `Left((key, value, cf))`, -/// otherwise return the request itself via `Right`. +/// once it contains a write request, extract it, and return `Left((key, value, +/// cf))`, otherwise return the request itself via `Right`. pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), Request> { let (key, value, cf) = match req.get_cmd_type() { CmdType::Put => { @@ -319,11 +323,11 @@ pub fn request_to_triple(mut req: Request) -> Either<(Vec, Vec, CfName), } /// `try_send!(s: Scheduler, task: T)` tries to send a task to the scheduler, -/// once meet an error, would report it, with the current file and line (so it is made as a macro). -/// returns whether it success. +/// once meet an error, would report it, with the current file and line (so it +/// is made as a macro). returns whether it success. #[macro_export(crate)] macro_rules! try_send { - ($s: expr, $task: expr) => { + ($s:expr, $task:expr) => { match $s.schedule($task) { Err(err) => { $crate::errors::Error::from(err).report(concat!( @@ -341,9 +345,10 @@ macro_rules! try_send { }; } -/// a hacky macro which allow us enable all debug log via the feature `backup_stream_debug`. -/// because once we enable debug log for all crates, it would soon get too verbose to read. -/// using this macro now we can enable debug log level for the crate only (even compile time...). +/// a hacky macro which allow us enable all debug log via the feature +/// `backup_stream_debug`. because once we enable debug log for all crates, it +/// would soon get too verbose to read. using this macro now we can enable debug +/// log level for the crate only (even compile time...). #[macro_export(crate)] macro_rules! debug { ($($t: tt)+) => { @@ -391,7 +396,8 @@ pub fn record_cf_stat(cf_name: &str, stat: &CfStatistics) { ); } -/// a shortcut for handing the result return from `Router::on_events`, when any faliure, send a fatal error to the `doom_messenger`. 
+/// a shortcut for handing the result return from `Router::on_events`, when any +/// failure, send a fatal error to the `doom_messenger`. pub fn handle_on_event_result(doom_messenger: &Scheduler, result: Vec<(String, Result<()>)>) { for (task, res) in result.into_iter() { if let Err(err) = res { @@ -422,8 +428,8 @@ pub struct CallbackWaitGroup { on_finish_all: std::sync::Mutex>>, } -/// A shortcut for making an opaque future type for return type or argument type, -/// which is sendable and not borrowing any variables. +/// A shortcut for making an opaque future type for return type or argument +/// type, which is sendable and not borrowing any variables. /// /// `fut![T]` == `impl Future + Send + 'static` #[macro_export(crate)] @@ -469,7 +475,8 @@ impl CallbackWaitGroup { Box::pin(rx.map(|_| ())) } - /// make a work, as long as the return value held, mark a work in the group is running. + /// make a work, as long as the return value held, mark a work in the group + /// is running. pub fn work(self: Arc) -> Work { self.running.fetch_add(1, Ordering::SeqCst); Work(self) @@ -520,11 +527,12 @@ impl ReadThroughputRecorder { let begin = self.begin.as_ref()?; let end = ins.io_stat().ok()??; let bytes_read = end.read - begin.read; - // FIXME: In our test environment, there may be too many caches hence - // the `bytes_read` is always zero :( - // For now, we eject here and let rocksDB prove that we did read something - // When the proc think we don't touch the block device (even in fact we didn't). - // NOTE: In the real-world, we would accept the zero `bytes_read` value since the cache did exists. + // FIXME: In our test environment, there may be too many caches hence the + // `bytes_read` is always zero. + // For now, we eject here and let rocksDB prove that we did read something when + // the proc think we don't touch the block device (even in fact we didn't). + // NOTE: In the real-world, we would accept the zero `bytes_read` value since + // the cache did exists. 
#[cfg(test)] if bytes_read == 0 { // use println here so we can get this message even log doesn't enabled. diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index b9559d86c1f..671952dc40d 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -159,9 +159,10 @@ impl SuiteBuilder { suite.start_endpoint(id, use_v3); } // TODO: The current mock metastore (slash_etc) doesn't supports multi-version. - // We must wait until the endpoints get ready to watching the metastore, or some modifies may be lost. - // Either make Endpoint::with_client wait until watch did start or make slash_etc support multi-version, - // then we can get rid of this sleep. + // We must wait until the endpoints get ready to watching the metastore, or some + // modifies may be lost. Either make Endpoint::with_client wait until watch did + // start or make slash_etc support multi-version, then we can get rid of this + // sleep. std::thread::sleep(Duration::from_secs(1)); suite } @@ -671,8 +672,8 @@ mod test { } #[test] - /// This case tests whehter the checkpoint ts (next backup ts) can be advanced correctly - /// when async commit is enabled. + /// This case tests whether the checkpoint ts (next backup ts) can be + /// advanced correctly when async commit is enabled. fn async_commit() { let mut suite = super::SuiteBuilder::new_named("async_commit") .nodes(3) @@ -768,8 +769,9 @@ mod test { suite.force_flush_files("inflight_message"); fail::cfg("delay_on_start_observe", "pause").unwrap(); suite.must_shuffle_leader(1); - // Handling the `StartObserve` message and doing flush are executed asynchronously. - // Make a delay of unblocking flush thread for make sure we have handled the `StartObserve`. + // Handling the `StartObserve` message and doing flush are executed + // asynchronously. Make a delay of unblocking flush thread for make sure + // we have handled the `StartObserve`. 
std::thread::sleep(Duration::from_secs(1)); fail::cfg("delay_on_flush", "off").unwrap(); suite.wait_for_flush(); @@ -790,7 +792,8 @@ mod test { .global_progress_of_task("inflight_message"), ) .unwrap(); - // The checkpoint should be advanced as expection when the inflight message has been consumed. + // The checkpoint should be advanced as expected when the inflight message has + // been consumed. assert!(checkpoint > 512, "checkpoint = {}", checkpoint); } diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 8865aa4f94c..ada36a08615 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -345,7 +345,7 @@ impl BackupRange { snapshot, backup_ts, IsolationLevel::Si, - false, /* fill_cache */ + false, // fill_cache Default::default(), Default::default(), false, @@ -969,10 +969,11 @@ impl Endpoint { } return; } - // Flush causal timestamp to make sure that future writes will have larger timestamps. - // And help TiKV-BR acquire a backup-ts with intact data smaller than it. - // (Note that intactness is not fully ensured now, until the safe-ts of RawKV is implemented. - // TiKV-BR need a workaround by rewinding backup-ts to a small "safe interval"). + // Flush causal timestamp to make sure that future writes will have larger + // timestamps. And help TiKV-BR acquire a backup-ts with intact data + // smaller than it. (Note that intactness is not fully ensured now, + // until the safe-ts of RawKV is implemented. TiKV-BR need a workaround + // by rewinding backup-ts to a small "safe interval"). if request.is_raw_kv { if let Err(e) = self .causal_ts_provider @@ -1088,10 +1089,13 @@ fn get_max_start_key(start_key: Option<&Key>, region: &Region) -> Option { } } -/// Construct an backup file name based on the given store id, region, range start key and local unix timestamp. -/// A name consists with five parts: store id, region_id, a epoch version, the hash of range start key and timestamp. 
-/// range start key is used to keep the unique file name for file, to handle different tables exists on the same region. -/// local unix timestamp is used to keep the unique file name for file, to handle receive the same request after connection reset. +/// Construct an backup file name based on the given store id, region, range +/// start key and local unix timestamp. A name consists with five parts: store +/// id, region_id, a epoch version, the hash of range start key and timestamp. +/// range start key is used to keep the unique file name for file, to handle +/// different tables exists on the same region. local unix timestamp is used to +/// keep the unique file name for file, to handle receive the same request after +/// connection reset. pub fn backup_file_name( store_id: u64, region: &Region, @@ -1536,11 +1540,11 @@ pub mod tests { // flush to disk so that read requests can be traced by TiKV limiter. engine .get_rocksdb() - .flush_cf(engine_traits::CF_DEFAULT, true /*sync*/) + .flush_cf(engine_traits::CF_DEFAULT, true /* sync */) .unwrap(); engine .get_rocksdb() - .flush_cf(engine_traits::CF_WRITE, true /*sync*/) + .flush_cf(engine_traits::CF_WRITE, true /* sync */) .unwrap(); // TODO: check key number for each snapshot. @@ -1575,7 +1579,7 @@ pub mod tests { info!("{:?}", files); assert_eq!( files.len(), - file_len, /* default and write */ + file_len, // default and write "{:?}", resp ); @@ -1651,8 +1655,8 @@ pub mod tests { let start_key_idx: u64 = 100; let end_key_idx: u64 = 110; endpoint.region_info.set_regions(vec![( - vec![], //generate_test_raw_key(start_key_idx).into_bytes(), - vec![], //generate_test_raw_key(end_key_idx).into_bytes(), + vec![], // generate_test_raw_key(start_key_idx).into_bytes(), + vec![], // generate_test_raw_key(end_key_idx).into_bytes(), 1, )]); let ctx = Context::default(); @@ -1679,7 +1683,7 @@ pub mod tests { // flush to disk so that read requests can be traced by TiKV limiter. 
engine .get_rocksdb() - .flush_cf(engine_traits::CF_DEFAULT, true /*sync*/) + .flush_cf(engine_traits::CF_DEFAULT, true /* sync */) .unwrap(); // TODO: check key number for each snapshot. @@ -1730,7 +1734,7 @@ pub mod tests { let file_len = 1; let files = resp.get_files(); info!("{:?}", files); - assert_eq!(files.len(), file_len /* default cf*/, "{:?}", resp); + assert_eq!(files.len(), file_len /* default cf */, "{:?}", resp); assert_eq!(files[0].total_kvs, end_key_idx - start_key_idx); assert_eq!(files[0].crc64xor, checksum); assert_eq!(files[0].get_start_key(), file_start); @@ -2000,7 +2004,8 @@ pub mod tests { assert_eq!(responses.len(), 3, "{:?}", responses); // for testing whether dropping the pool before all tasks finished causes panic. - // but the panic must be checked manually... (It may panic at tokio runtime threads...) + // but the panic must be checked manually. (It may panic at tokio runtime + // threads) let mut pool = ControlThreadPool::new(); pool.adjust_with(1); pool.spawn(async { tokio::time::sleep(Duration::from_millis(100)).await }); diff --git a/components/backup/src/softlimit.rs b/components/backup/src/softlimit.rs index babc13326bd..c3a2fc7c796 100644 --- a/components/backup/src/softlimit.rs +++ b/components/backup/src/softlimit.rs @@ -89,9 +89,10 @@ impl SoftLimit { pub trait CpuStatistics { type Container: IntoIterator; // ThreadInfoStatistics needs &mut self to record the thread information. - // RefCell(internal mutability) would make SoftLimitByCpu !Sync, hence futures contains it become !Send (WHY?) - // Mutex would make this function async or blocking. - // Anyway, &mut here is acceptable, since SoftLimitByCpu won't be shared. (Even the &mut here is a little weird...) + // RefCell(internal mutability) would make SoftLimitByCpu !Sync, hence futures + // contains it become !Send (WHY?) Mutex would make this function async or + // blocking. Anyway, &mut here is acceptable, since SoftLimitByCpu won't be + // shared. 
(Even the &mut here is a little weird...) fn get_cpu_usages(&mut self) -> Self::Container; } @@ -119,7 +120,8 @@ impl SoftLimitByCpu { self.current_idle_exclude(|_| false) } - /// returns the current idle processor, ignoring threads with name matches the predicate. + /// returns the current idle processor, ignoring threads with name matches + /// the predicate. fn current_idle_exclude(&mut self, mut exclude: impl FnMut(&str) -> bool) -> f64 { let usages = self.metrics.get_cpu_usages(); let used = usages @@ -129,15 +131,17 @@ impl SoftLimitByCpu { self.total_time - used } - /// apply the limit to the soft limit according to the current CPU remaining. + /// apply the limit to the soft limit according to the current CPU + /// remaining. #[cfg(test)] pub async fn exec_over(&mut self, limit: &SoftLimit) -> Result<()> { self.exec_over_with_exclude(limit, |_| false).await } - /// apply the limit to the soft limit according to the current CPU remaining. - /// when calculating the CPU usage, ignore threads with name matched by the exclude predicate. - /// This would keep at least one thread working. + /// apply the limit to the soft limit according to the current CPU + /// remaining. when calculating the CPU usage, ignore threads with name + /// matched by the exclude predicate. This would keep at least one + /// thread working. #[cfg(test)] pub async fn exec_over_with_exclude( &mut self, diff --git a/components/backup/src/utils.rs b/components/backup/src/utils.rs index 64425b595c8..de57b9f9081 100644 --- a/components/backup/src/utils.rs +++ b/components/backup/src/utils.rs @@ -12,11 +12,13 @@ use txn_types::{Key, TimeStamp}; use crate::{metrics::*, Result}; -// BACKUP_V1_TO_V2_TS is used as causal timestamp to backup RawKV api version V1/V1Ttl data and save to V2 format. -// Use 1 other than 0 because 0 is not a acceptable value for causal timestamp. See api_version::ApiV2::is_valid_ts. 
+// BACKUP_V1_TO_V2_TS is used as causal timestamp to backup RawKV api version +// V1/V1Ttl data and save to V2 format. Use 1 other than 0 because 0 is not a +// acceptable value for causal timestamp. See api_version::ApiV2::is_valid_ts. pub const BACKUP_V1_TO_V2_TS: u64 = 1; /// DaemonRuntime is a "background" runtime, which contains "daemon" tasks: -/// any task spawn into it would run until finish even the runtime isn't referenced. +/// any task spawn into it would run until finish even the runtime isn't +/// referenced. pub struct DaemonRuntime(Option); impl DaemonRuntime { @@ -109,11 +111,12 @@ pub struct KeyValueCodec { } // Usage of the KeyValueCodec in backup process is as following: -// `new` -> `check_backup_api_version`, return false if not supported or input invalid. -// encode the backup range with `encode_backup_key` +// `new` -> `check_backup_api_version`, return false if not supported or input +// invalid. encode the backup range with `encode_backup_key` // In `backup_raw` process -> use `is_valid_raw_value` & // `convert_encoded_key_to_dst_version` & `convert_encoded_value_to_dst_version` -// In BackupResponse, call `decode_backup_key` & `convert_key_range_to_dst_version` +// In BackupResponse, call `decode_backup_key` & +// `convert_key_range_to_dst_version` impl KeyValueCodec { pub fn new(is_raw_kv: bool, cur_api_ver: ApiVersion, dst_api_ver: ApiVersion) -> Self { KeyValueCodec { @@ -204,7 +207,8 @@ impl KeyValueCodec { }) } - // Input key is encoded key for rawkv apiv2 and txnkv. return the decode dst apiversion key. + // Input key is encoded key for rawkv apiv2 and txnkv. return the decode dst + // apiversion key. 
pub fn decode_backup_key(&self, key: Option) -> Result> { if key.is_none() { return Ok(vec![]); diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 7127d896314..103ee9c6790 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -26,9 +26,8 @@ use crate::{backup_file_name, metrics::*, utils::KeyValueCodec, Error, Result}; #[derive(Debug, Clone, Copy)] /// CfNameWrap wraps the CfName type. -/// For removing the 'static lifetime bound in the async function, -/// which doesn't compile due to 'captures lifetime that does not appear in bounds' :(. -/// see https://github.com/rust-lang/rust/issues/63033 +/// For removing the 'static lifetime bound in the async function, which doesn't +/// compile due to 'captures lifetime that does not appear in bounds', see https://github.com/rust-lang/rust/issues/63033 /// FIXME: remove this. pub struct CfNameWrap(pub &'static str); @@ -99,8 +98,8 @@ impl Writer { Ok(()) } - // FIXME: we cannot get sst_info in [save_and_build_file], which may cause the !Send type - // [RocksEnternalSstFileInfo] sent between threads. + // FIXME: we cannot get sst_info in [save_and_build_file], which may cause the + // !Send type [RocksEnternalSstFileInfo] sent between threads. fn finish_read(writer: RocksSstWriter) -> Result<(u64, impl Read)> { let (sst_info, sst_reader) = writer.finish_read()?; Ok((sst_info.file_size(), sst_reader)) diff --git a/components/batch-system/benches/batch-system.rs b/components/batch-system/benches/batch-system.rs index b4e3ffd03ac..c248eabaf04 100644 --- a/components/batch-system/benches/batch-system.rs +++ b/components/batch-system/benches/batch-system.rs @@ -85,8 +85,8 @@ fn bench_imbalance(c: &mut Criterion) { system.shutdown(); } -/// Bench how it performs when scheduling a lot of quick tasks during an long-polling -/// tasks. +/// Bench how it performs when scheduling a lot of quick tasks during an +/// long-polling tasks. 
/// /// A good scheduling algorithm should not starve the quick tasks. fn bench_fairness(c: &mut Criterion) { diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 108058ee5f2..49433a73592 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -1,9 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This is the core implementation of a batch system. Generally there will be two -//! different kind of FSMs in TiKV's FSM system. One is normal FSM, which usually -//! represents a peer, the other is control FSM, which usually represents something -//! that controls how the former is created or metrics are collected. +//! This is the core implementation of a batch system. Generally there will be +//! two different kind of FSMs in TiKV's FSM system. One is normal FSM, which +//! usually represents a peer, the other is control FSM, which usually +//! represents something that controls how the former is created or metrics are +//! collected. // #[PerformanceCriticalPath] use std::{ @@ -39,7 +40,7 @@ pub enum FsmTypes { // A macro to introduce common definition of scheduler. macro_rules! impl_sched { - ($name:ident, $ty:path, Fsm = $fsm:tt) => { + ($name:ident, $ty:path,Fsm = $fsm:tt) => { pub struct $name { sender: channel::Sender>, low_sender: channel::Sender>, @@ -205,8 +206,9 @@ impl Batch { /// Schedule the normal FSM located at `index`. /// - /// If `inplace`, the relative position of all fsm will not be changed; otherwise, the fsm - /// will be popped and the last fsm will be swap in to reduce memory copy. + /// If `inplace`, the relative position of all fsm will not be changed; + /// otherwise, the fsm will be popped and the last fsm will be swap in + /// to reduce memory copy. 
pub fn schedule(&mut self, router: &BatchRouter, index: usize, inplace: bool) { let to_schedule = match self.normals[index].take() { Some(f) => f, @@ -267,8 +269,8 @@ pub enum HandleResult { KeepProcessing, /// The Fsm should stop at the progress. StopAt { - /// The count of messages that have been acknowledged by handler. The fsm should be - /// released until new messages arrive. + /// The count of messages that have been acknowledged by handler. The + /// fsm should be released until new messages arrive. progress: usize, /// Whether the fsm should be released before `end`. skip_end: bool, @@ -307,7 +309,7 @@ pub trait PollHandler: Send + 'static { /// This function is called when handling readiness for control FSM. /// /// If returned value is Some, then it represents a length of channel. This - /// function will only be called for the same fsm after channel's lengh is + /// function will only be called for the same fsm after channel's length is /// larger than the value. If it returns None, then this function will /// still be called for the same FSM in the next loop unless the FSM is /// stopped. @@ -318,8 +320,8 @@ pub trait PollHandler: Send + 'static { /// The returned value is handled in the same way as `handle_control`. fn handle_normal(&mut self, normal: &mut impl DerefMut) -> HandleResult; - /// This function is called after `handle_normal` is called for all fsm and before calling - /// `end`. The function is expected to run lightweight work. + /// This function is called after `handle_normal` is called for all fsm and + /// before calling `end`. The function is expected to run lightweight work. fn light_end(&mut self, _batch: &mut [Option>]) {} /// This function is called at the end of every round. @@ -389,13 +391,14 @@ impl> Poller { let mut to_skip_end = Vec::with_capacity(self.max_batch_size); // Fetch batch after every round is finished. It's helpful to protect regions - // from becoming hungry if some regions are hot points. 
Since we fetch new fsm every time - // calling `poll`, we do not need to configure a large value for `self.max_batch_size`. + // from becoming hungry if some regions are hot points. Since we fetch new fsm + // every time calling `poll`, we do not need to configure a large value for + // `self.max_batch_size`. let mut run = true; while run && self.fetch_fsm(&mut batch) { - // If there is some region wait to be deal, we must deal with it even if it has overhead - // max size of batch. It's helpful to protect regions from becoming hungry - // if some regions are hot points. + // If there is some region wait to be deal, we must deal with it even if it has + // overhead max size of batch. It's helpful to protect regions from becoming + // hungry if some regions are hot points. let mut max_batch_size = std::cmp::max(self.max_batch_size, batch.normals.len()); // update some online config if needed. { @@ -454,9 +457,9 @@ impl> Poller { if let Ok(fsm) = self.fsm_receiver.try_recv() { run = batch.push(fsm); } - // If we receive a ControlFsm, break this cycle and call `end`. Because ControlFsm - // may change state of the handler, we shall deal with it immediately after - // calling `begin` of `Handler`. + // If we receive a ControlFsm, break this cycle and call `end`. Because + // ControlFsm may change state of the handler, we shall deal with it immediately + // after calling `begin` of `Handler`. if !run || fsm_cnt >= batch.normals.len() { break; } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 43067ecb202..9975d66dfdc 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -346,8 +346,8 @@ where let state_unit = mem::size_of::>(); // Every message in crossbeam sender needs 8 bytes to store state. let message_unit = mem::size_of::() + 8; - // crossbeam unbounded channel sender has a list of blocks. Every block has 31 unit - // and every sender has at least one sender. 
+ // crossbeam unbounded channel sender has a list of blocks. Every block has 31 + // unit and every sender has at least one sender. let sender_block_unit = 31; RouterTrace { alive: (mailbox_unit * 8 / 7 // hashmap uses 7/8 of allocated memory. diff --git a/components/causal_ts/src/config.rs b/components/causal_ts/src/config.rs index a856b5b7358..e75bff62d47 100644 --- a/components/causal_ts/src/config.rs +++ b/components/causal_ts/src/config.rs @@ -16,8 +16,9 @@ pub struct Config { /// The minimal renew batch size of BatchTsoProvider. /// /// Default is 100. - /// One TSO is required for every batch of Raft put messages, so by default 1K tso/s should be enough. - /// Benchmark showed that with a 8.6w raw_put per second, the TSO requirement is 600 per second. + /// One TSO is required for every batch of Raft put messages, so by default + /// 1K tso/s should be enough. Benchmark showed that with a 8.6w raw_put + /// per second, the TSO requirement is 600 per second. pub renew_batch_min_size: u32, } diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 615f01365cd..05626ce7203 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -22,7 +22,8 @@ pub trait CausalTsProvider: Send + Sync { /// Get a new timestamp. fn get_ts(&self) -> Result; - /// Flush (cached) timestamps to keep causality on some events, such as "leader transfer". + /// Flush (cached) timestamps to keep causality on some events, such as + /// "leader transfer". fn flush(&self) -> Result<()> { Ok(()) } diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs index 8d2c5abc95c..aeb04bfabf5 100644 --- a/components/causal_ts/src/observer.rs +++ b/components/causal_ts/src/observer.rs @@ -20,8 +20,9 @@ use raftstore::{ use crate::{CausalTsProvider, RawTsTracker}; -/// CausalObserver appends timestamp for RawKV V2 data, -/// and invoke causal_ts_provider.flush() on specified event, e.g. 
leader transfer, snapshot apply. +/// CausalObserver appends timestamp for RawKV V2 data, and invoke +/// causal_ts_provider.flush() on specified event, e.g. leader +/// transfer, snapshot apply. /// Should be used ONLY when API v2 is enabled. pub struct CausalObserver { causal_ts_provider: Arc, @@ -37,7 +38,8 @@ impl Clone for CausalObserver { } } -// Causal observer's priority should be higher than all other observers, to avoid being bypassed. +// Causal observer's priority should be higher than all other observers, to +// avoid being bypassed. const CAUSAL_OBSERVER_PRIORITY: u32 = 0; impl CausalObserver { @@ -97,7 +99,8 @@ impl QueryObserver for CausalObserver RoleObserver for CausalObserver RegionChangeObserver for CausalObse return; } - // In the scenario of region merge, the target region would merge some entries from source - // region with larger timestamps (when leader of source region is in another store with - // larger TSO batch than the store of target region's leader). - // So we need a flush after commit merge. See issue #12680. - // TODO: do not need flush if leaders of source & target region are in the same store. + // In the scenario of region merge, the target region would merge some entries + // from source region with larger timestamps (when leader of source region is in + // another store with larger TSO batch than the store of target region's + // leader). So we need a flush after commit merge. See issue #12680. + // TODO: do not need flush if leaders of source & target region are in the same + // store. if let RegionChangeEvent::Update(RegionChangeReason::CommitMerge) = event { self.flush_timestamp(ctx.region(), REASON_REGION_MERGE); } diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 917353222fa..35e6bffd11b 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -31,8 +31,9 @@ use crate::{ // Renew on every 100ms, to adjust batch size rapidly enough. 
pub(crate) const TSO_BATCH_RENEW_INTERVAL_DEFAULT: u64 = 100; // Batch size on every renew interval. -// One TSO is required for every batch of Raft put messages, so by default 1K tso/s should be enough. -// Benchmark showed that with a 8.6w raw_put per second, the TSO requirement is 600 per second. +// One TSO is required for every batch of Raft put messages, so by default 1K +// tso/s should be enough. Benchmark showed that with a 8.6w raw_put per second, +// the TSO requirement is 600 per second. pub(crate) const TSO_BATCH_MIN_SIZE_DEFAULT: u32 = 100; // Max batch size of TSO requests. Space of logical timestamp is 262144, // exceed this space will cause PD to sleep, waiting for physical clock advance. @@ -89,7 +90,8 @@ impl TsoBatch { Ok(()) } - // Note: batch is "used up" in flush, and batch size will be enlarged in next renew. + // Note: batch is "used up" in flush, and batch size will be enlarged in next + // renew. pub fn flush(&self) { self.logical_start .store(self.logical_end, Ordering::Relaxed); @@ -114,7 +116,8 @@ impl TsoBatch { } } -/// MAX_RENEW_BATCH_SIZE is the batch size of TSO renew. It is an empirical value. +/// MAX_RENEW_BATCH_SIZE is the batch size of TSO renew. It is an empirical +/// value. const MAX_RENEW_BATCH_SIZE: usize = 64; type RenewError = Arc; @@ -363,8 +366,8 @@ impl CausalTsProvider for BatchTsoProvider { break; } if let Err(err) = block_on(self.renew_tso_batch(false, TSO_BATCH_RENEW_FOR_USED_UP)) { - // `renew_tso_batch` failure is likely to be caused by TSO timeout, which would mean that PD is quite busy. - // So do not retry any more. + // `renew_tso_batch` failure is likely to be caused by TSO timeout, which would + // mean that PD is quite busy. So do not retry any more. 
error!("BatchTsoProvider::get_ts, renew_tso_batch fail on batch used-up"; "err" => ?err); break; } @@ -477,8 +480,8 @@ pub mod tests { let pd_cli = Arc::new(TestPdClient::new(1, false)); pd_cli.set_tso(1000.into()); - // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to renew manually. - // allocated: [1001, 1100] + // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to + // renew manually. allocated: [1001, 1100] let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, @@ -539,8 +542,8 @@ pub mod tests { ); } - // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to renew manually. - // allocated: [1001, 1100] + // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to + // renew manually. allocated: [1001, 1100] let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index 94fe0f74c61..3b1894eb6fc 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -44,8 +44,9 @@ const CDC_RESP_MAX_BYTES: u32 = 6 * 1024 * 1024; /// Assume the average size of batched `CdcEvent::Event`s is 32KB and /// the average count of batched `CdcEvent::Event`s is 64. -/// +/// ```text /// 2 = (CDC_EVENT_MAX_BYTES * CDC_EVENT_MAX_COUNT / CDC_MAX_RESP_SIZE).ceil() + 1 /* reserve for ResolvedTs */; +/// ``` const CDC_RESP_MAX_BATCH_COUNT: usize = 2; pub enum CdcEvent { diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 752c068e72a..10de563c4fc 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -64,10 +64,11 @@ impl Default for DownstreamID { pub enum DownstreamState { /// It's just created and rejects change events and resolved timestamps. Uninitialized, - /// It has got a snapshot for incremental scan, and change events will be accepted. 
- /// However it still rejects resolved timestamps. + /// It has got a snapshot for incremental scan, and change events will be + /// accepted. However it still rejects resolved timestamps. Initializing, - /// Incremental scan is finished so that resolved timestamps are acceptable now. + /// Incremental scan is finished so that resolved timestamps are acceptable + /// now. Normal, Stopped, } @@ -78,7 +79,8 @@ impl Default for DownstreamState { } } -/// Shold only be called when it's uninitialized or stopped. Return false if it's stopped. +/// Should only be called when it's uninitialized or stopped. Return false if +/// it's stopped. pub(crate) fn on_init_downstream(s: &AtomicCell) -> bool { s.compare_exchange( DownstreamState::Uninitialized, @@ -87,7 +89,8 @@ pub(crate) fn on_init_downstream(s: &AtomicCell) -> bool { .is_ok() } -/// Shold only be called when it's initializing or stopped. Return false if it's stopped. +/// Should only be called when it's initializing or stopped. Return false if +/// it's stopped. pub(crate) fn post_init_downstream(s: &AtomicCell) -> bool { s.compare_exchange(DownstreamState::Initializing, DownstreamState::Normal) .is_ok() @@ -348,9 +351,10 @@ impl Delegate { let _ = self.broadcast(send); } - /// `txn_extra_op` returns a shared flag which is accessed in TiKV's transaction layer to - /// determine whether to capture modifications' old value or not. Unsubsribing all downstreams - /// or calling `Delegate::stop` will store it with `TxnExtraOp::Noop`. + /// `txn_extra_op` returns a shared flag which is accessed in TiKV's + /// transaction layer to determine whether to capture modifications' old + /// value or not. Unsubscribing all downstreams or calling + /// `Delegate::stop` will store it with `TxnExtraOp::Noop`. /// /// NOTE: Dropping a `Delegate` won't update this flag. pub fn txn_extra_op(&self) -> &AtomicCell { @@ -373,7 +377,8 @@ impl Delegate { Ok(()) } - /// Install a resolver. 
Return downstreams which fail because of the region's internal changes. + /// Install a resolver. Return downstreams which fail because of the + /// region's internal changes. pub fn on_region_ready( &mut self, mut resolver: Resolver, @@ -667,8 +672,8 @@ impl Delegate { ..Default::default() }; let send = move |downstream: &Downstream| { - // No ready downstream or a downstream that does not match the kv_api type, will be ignored. - // There will be one region that contains both Txn & Raw entries. + // No ready downstream or a downstream that does not match the kv_api type, will + // be ignored. There will be one region that contains both Txn & Raw entries. // The judgement here is for sending entries to downstreams with correct kv_api. if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { return Ok(()); @@ -877,9 +882,9 @@ impl Delegate { if let Err(e) = compare_region_epoch( &downstream.region_epoch, region, - false, /* check_conf_ver */ - true, /* check_ver */ - true, /* include_region */ + false, // check_conf_ver + true, // check_ver + true, // include_region ) { info!( "cdc fail to subscribe downstream"; @@ -918,9 +923,10 @@ fn make_overlapped_rollback(key: Key, row: &mut EventRow) { set_event_row_type(row, EventLogType::Rollback); } -/// Decodes the write record and store its information in `row`. This may be called both when -/// doing incremental scan of observing apply events. There's different behavior for the two -/// case, distinguished by the `is_apply` parameter. +/// Decodes the write record and store its information in `row`. This may be +/// called both when doing incremental scan of observing apply events. There's +/// different behavior for the two case, distinguished by the `is_apply` +/// parameter. 
fn decode_write( key: Vec, value: &[u8], @@ -932,8 +938,8 @@ fn decode_write( let write = WriteRef::parse(value).unwrap().to_owned(); // For scanning, ignore the GC fence and read the old data; - // For observed apply, drop the record it self but keep only the overlapped rollback information - // if gc_fence exists. + // For observed apply, drop the record it self but keep only the overlapped + // rollback information if gc_fence exists. if is_apply && write.gc_fence.is_some() { // `gc_fence` is set means the write record has been rewritten. // Currently the only case is writing overlapped_rollback. And in this case diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index fa6dcb97651..22cb5b94922 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -390,8 +390,8 @@ pub struct Endpoint { timer: SteadyTimer, tso_worker: Runtime, store_meta: Arc>, - /// The concurrency manager for transactions. It's needed for CDC to check locks when - /// calculating resolved_ts. + /// The concurrency manager for transactions. It's needed for CDC to check + /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, config: CdcConfig, @@ -458,7 +458,8 @@ impl, E: KvEngine> Endpoint { .build() .unwrap(); - // Initialized for the first time, subsequent adjustments will be made based on configuration updates. + // Initialized for the first time, subsequent adjustments will be made based on + // configuration updates. let scan_concurrency_semaphore = Arc::new(Semaphore::new(config.incremental_scan_concurrency)); let old_value_cache = OldValueCache::new(config.old_value_cache_memory_quota); @@ -534,7 +535,8 @@ impl, E: KvEngine> Endpoint { "current config" => ?self.config, "change" => ?change ); - // Update the config here. The following adjustments will all use the new values. + // Update the config here. The following adjustments will all use the new + // values. 
self.config.update(change.clone()).unwrap(); // Maybe the cache will be lost due to smaller capacity, @@ -544,8 +546,8 @@ impl, E: KvEngine> Endpoint { .resize(self.config.old_value_cache_memory_quota); } - // Maybe the limit will be exceeded for a while after the concurrency becomes smaller, - // but it is acceptable. + // Maybe the limit will be exceeded for a while after the concurrency becomes + // smaller, but it is acceptable. if change.get("incremental_scan_concurrency").is_some() { self.scan_concurrency_semaphore = Arc::new(Semaphore::new(self.config.incremental_scan_concurrency)) @@ -924,7 +926,8 @@ impl, E: KvEngine> Endpoint { // Reset resolved_regions to empty. let resolved_regions = &mut self.resolved_region_heap; resolved_regions.clear(); - // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be empty. + // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be + // empty. let mut raw_resolved_regions = ResolvedRegionVec { vec: vec![] }; let total_region_count = regions.len(); @@ -949,7 +952,8 @@ impl, E: KvEngine> Endpoint { } resolved_regions.push(region_id, resolved_ts.min()); // The judge of raw region is not accuracy here, and we may miss at most one - // "normal" raw region. But this will not break the correctness of outlier detection. + // "normal" raw region. But this will not break the correctness of outlier + // detection. if resolved_ts.is_min_ts_from_raw() { raw_resolved_regions.push(region_id, resolved_ts.raw_ts) } @@ -997,7 +1001,8 @@ impl, E: KvEngine> Endpoint { self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); self.broadcast_resolved_ts(normal_min_resolved_ts, normal_regions); - // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be empty. + // rawkv only, if user does not use rawkv apiv2, raw_resolved_regions should be + // empty. 
self.handle_raw_outlier_regions(&mut raw_resolved_regions, min_ts); } @@ -1124,8 +1129,8 @@ impl, E: KvEngine> Endpoint { let mut min_ts = min_ts_pd; let mut min_ts_min_lock = min_ts_pd; - // Sync with concurrency manager so that it can work correctly when optimizations - // like async commit is enabled. + // Sync with concurrency manager so that it can work correctly when + // optimizations like async commit is enabled. // Note: This step must be done before scheduling `Task::MinTS` task, and the // resolver must be checked in or after `Task::MinTS`' execution. cm.update_max_ts(min_ts); @@ -1707,7 +1712,8 @@ mod tests { let mut updated_cfg = cfg.clone(); { // Update it to be smaller than incremental_scan_threads, - // which will be an invalid change and will modified to incremental_scan_threads. + // which will be an invalid change and will modified to + // incremental_scan_threads. updated_cfg.incremental_scan_concurrency = 2; } let diff = cfg.diff(&updated_cfg); @@ -2104,7 +2110,8 @@ mod tests { suite.run(Task::RawTrackTs { region_id, ts }); } let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); - // region is not ready, so raw lock in resolver, raw ts is added to delegate.pending. + // region is not ready, so raw lock in resolver, raw ts is added to + // delegate.pending. assert_eq!(delegate.resolver.is_none(), true); // Schedule resolver ready (resolver is built by conn a). let mut region = Region::default(); @@ -2219,7 +2226,7 @@ mod tests { .recv_timeout(Duration::from_millis(500)) .unwrap() .unwrap(); - assert_matches!(task_recv, + assert_matches!(task_recv, Task::Deregister(Deregister::Delegate {region_id, observe_id, ..}) if region_id == dead_lock_region && observe_id == ob_id); let gap = Duration::from_millis(cur_tso.physical() - dead_lock_ts.physical()).as_secs_f64(); @@ -2873,7 +2880,8 @@ mod tests { .is_none(), true ); - // count become 1002, boundary: 2248 - 3 * 502 = 742, but ts gap is not larger than 60s. 
+ // count become 1002, boundary: 2248 - 3 * 502 = 742, but ts gap is not larger + // than 60s. region_vec2.push(741, TimeStamp::compose(741, 0)); assert_eq!( region_vec2 diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 28b7e5f5d0a..3be509e73d0 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -305,8 +305,9 @@ impl Initializer { Ok(()) } - // It's extracted from `Initializer::scan_batch` to avoid becoming an asynchronous block, - // so that we can limit scan speed based on the thread disk I/O or RocksDB block read bytes. + // It's extracted from `Initializer::scan_batch` to avoid becoming an + // asynchronous block, so that we can limit scan speed based on the thread + // disk I/O or RocksDB block read bytes. fn do_scan( &self, scanner: &mut Scanner, @@ -472,10 +473,10 @@ impl Initializer { pub(crate) fn deregister_downstream(&self, err: Error) { let deregister = if self.build_resolver || err.has_region_error() { // Deregister delegate on the conditions, - // * It fails to build a resolver. A delegate requires a resolver - // to advance resolved ts. - // * A region error. It usually mean a peer is not leader or - // a leader meets an error and can not serve. + // * It fails to build a resolver. A delegate requires a resolver to advance + // resolved ts. + // * A region error. It usually mean a peer is not leader or a leader meets an + // error and can not serve. Deregister::Delegate { region_id: self.region_id, observe_id: self.observe_id, diff --git a/components/cdc/src/metrics.rs b/components/cdc/src/metrics.rs index 969e3b371a4..5db91572112 100644 --- a/components/cdc/src/metrics.rs +++ b/components/cdc/src/metrics.rs @@ -8,9 +8,9 @@ use prometheus::*; use prometheus_static_metric::*; use tikv::storage::Statistics; -/// Installing a new capture contains 2 phases, one for incremental scanning and one for -/// fetching delta changes from raftstore. 
They can share some similar metrics, in which -/// case we can use this tag to distinct them. +/// Installing a new capture contains 2 phases, one for incremental scanning and +/// one for fetching delta changes from raftstore. They can share some similar +/// metrics, in which case we can use this tag to distinct them. pub const TAG_DELTA_CHANGE: &str = "delta_change"; pub const TAG_INCREMENTAL_SCAN: &str = "incremental_scan"; diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 6c0771cbc64..18b4d995077 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -45,8 +45,8 @@ impl CdcObserver { } pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - // use 0 as the priority of the cmd observer. CDC should have a higher priority than - // the `resolved-ts`'s cmd observer + // use 0 as the priority of the cmd observer. CDC should have a higher priority + // than the `resolved-ts`'s cmd observer coprocessor_host .registry .register_cmd_observer(0, BoxCmdObserver::new(self.clone())); @@ -96,7 +96,8 @@ impl CdcObserver { impl Coprocessor for CdcObserver {} impl CmdObserver for CdcObserver { - // `CdcObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` is not empty + // `CdcObserver::on_flush_applied_cmd_batch` should only invoke if `cmd_batches` + // is not empty fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -119,7 +120,8 @@ impl CmdObserver for CdcObserver { let mut region = Region::default(); region.mut_peers().push(Peer::default()); // Create a snapshot here for preventing the old value was GC-ed. - // TODO: only need it after enabling old value, may add a flag to indicate whether to get it. + // TODO: only need it after enabling old value, may add a flag to indicate + // whether to get it. 
let snapshot = RegionSnapshot::from_snapshot(Arc::new(engine.snapshot()), Arc::new(region)); let get_old_value = move |key, query_ts, diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index caf3060591e..89f78f694c3 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -104,8 +104,8 @@ impl OldValueCache { } } -/// Fetch old value for `key`. If it can't be found in `old_value_cache`, seek and retrieve it with -/// `query_ts` from `snapshot`. +/// Fetch old value for `key`. If it can't be found in `old_value_cache`, seek +/// and retrieve it with `query_ts` from `snapshot`. pub fn get_old_value( snapshot: &S, key: Key, @@ -171,9 +171,10 @@ pub fn new_old_value_cursor(snapshot: &S, cf: &'static str) - /// Gets the latest value to the key with an older or equal version. /// -/// The key passed in should be a key with a timestamp. This function will returns -/// the latest value of the entry if the user key is the same to the given key and -/// the timestamp is older than or equal to the timestamp in the given key. +/// The key passed in should be a key with a timestamp. This function will +/// returns the latest value of the entry if the user key is the same to the +/// given key and the timestamp is older than or equal to the timestamp in the +/// given key. /// /// `load_from_cf_data` indicates how to get value from `CF_DEFAULT`. pub fn near_seek_old_value( diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index a38c3988bcc..2e9375ce6a5 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -306,7 +306,8 @@ fn do_test_no_resolved_ts_before_downstream_initialized(version: &str) { } let th = thread::spawn(move || { - // The first downstream can receive timestamps but the second should receive nothing. 
+ // The first downstream can receive timestamps but the second should receive + // nothing. let mut rx = event_feeds[0].replace(None).unwrap(); assert!(recv_timeout(&mut rx, Duration::from_secs(1)).is_ok()); let mut rx = event_feeds[1].replace(None).unwrap(); @@ -318,11 +319,11 @@ fn do_test_no_resolved_ts_before_downstream_initialized(version: &str) { suite.stop(); } -// When a new CDC downstream is installed, delta changes for other downstreams on the same -// region should be flushed so that the new downstream can gets a fresh snapshot to performs -// a incremental scan. CDC can ensure that those delta changes are sent to CDC's `Endpoint` -// before the incremental scan, but `Sink` may break this rule. This case tests it won't -// happen any more. +// When a new CDC downstream is installed, delta changes for other downstreams +// on the same region should be flushed so that the new downstream can gets a +// fresh snapshot to performs a incremental scan. CDC can ensure that those +// delta changes are sent to CDC's `Endpoint` before the incremental scan, but +// `Sink` may break this rule. This case tests it won't happen any more. #[test] fn test_cdc_observed_before_incremental_scan_snapshot() { let cluster = new_server_cluster(0, 1); @@ -331,7 +332,8 @@ fn test_cdc_observed_before_incremental_scan_snapshot() { let region = suite.cluster.get_region(b""); let lead_client = PeerClient::new(&suite.cluster, region.id, new_peer(1, 1)); - // So that the second changefeed can get some delta changes elder than its snapshot. + // So that the second changefeed can get some delta changes elder than its + // snapshot. 
let (mut req_tx_0, event_feed_0, _) = new_event_feed(suite.get_region_cdc_client(region.id)); let req_0 = suite.new_changedata_request(region.id); block_on(req_tx_0.send((req_0, WriteFlags::default()))).unwrap(); diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 5f9f9bf7209..3be68c5905c 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -1177,7 +1177,8 @@ fn test_old_value_multi_changefeeds_impl() { } } - // The downstream 2 can also get old values because `req`.`extra_op` field is ignored now. + // The downstream 2 can also get old values because `req`.`extra_op` field is + // ignored now. event_count = 0; loop { let events = receive_event_2(false).events.to_vec(); @@ -1285,9 +1286,9 @@ fn test_cdc_resolve_ts_checking_concurrency_manager_impl() { } let _guard = lock_key(b"xa", 90); - // The resolved_ts should be blocked by the mem lock but it's already greater than 90. - // Retry until receiving an unchanged resolved_ts because the first several resolved ts received - // might be updated before acquiring the lock. + // The resolved_ts should be blocked by the mem lock but it's already greater + // than 90. Retry until receiving an unchanged resolved_ts because the first + // several resolved ts received might be updated before acquiring the lock. let mut last_resolved_ts = 0; let mut success = false; for _ in 0..5 { @@ -1840,9 +1841,10 @@ fn test_cdc_scan_ignore_gc_fence_impl() { let commit_ts2 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_kv_commit(1, vec![key.to_vec()], start_ts2, commit_ts2); - // Assume the first version above is written by async commit and it's commit_ts is not unique. - // Use it's commit_ts as another transaction's start_ts. - // Run check_txn_status on commit_ts1 so that gc_fence will be set on the first version. 
+ // Assume the first version above is written by async commit and it's commit_ts + // is not unique. Use it's commit_ts as another transaction's start_ts. + // Run check_txn_status on commit_ts1 so that gc_fence will be set on the first + // version. let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let action = suite.must_check_txn_status( 1, @@ -1940,9 +1942,10 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { let commit_ts2 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_kv_commit(1, vec![key.to_vec()], start_ts2, commit_ts2); - // We don't care about the events caused by the previous writings in this test case, and it's - // too complicated to check them. Just skip them here, and wait for resolved_ts to be pushed to - // a greater value than the two versions' commit_ts-es. + // We don't care about the events caused by the previous writings in this test + // case, and it's too complicated to check them. Just skip them here, and + // wait for resolved_ts to be pushed to a greater value than the two + // versions' commit_ts-es. let skip_to_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); loop { let e = receive_event(true); @@ -1953,9 +1956,10 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { } } - // Assume the two versions of the key are written by async commit transactions, and their - // commit_ts-es are also other transaction's start_ts-es. Run check_txn_status on the - // commit_ts-es of the two versions to cause overlapping rollback. + // Assume the two versions of the key are written by async commit transactions, + // and their commit_ts-es are also other transaction's start_ts-es. Run + // check_txn_status on the commit_ts-es of the two versions to cause + // overlapping rollback. 
let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_check_txn_status( 1, @@ -2007,9 +2011,9 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { other => panic!("unknown event {:?}", other), }); - // In some special cases, a newly committed record may carry an overlapped rollback initially. - // In this case, gc_fence shouldn't be set, and CDC ignores the rollback and handles the - // committing normally. + // In some special cases, a newly committed record may carry an overlapped + // rollback initially. In this case, gc_fence shouldn't be set, and CDC + // ignores the rollback and handles the committing normally. let start_ts3 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let mut mutation = Mutation::default(); mutation.set_op(Op::Put); @@ -2031,11 +2035,11 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { other => panic!("unknown event {:?}", other), }); - // Again, assume the transaction is committed with async commit protocol, and the commit_ts is - // also another transaction's start_ts. + // Again, assume the transaction is committed with async commit protocol, and + // the commit_ts is also another transaction's start_ts. let commit_ts3 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); - // Rollback another transaction before committing, then the rolling back information will be - // recorded in the lock. + // Rollback another transaction before committing, then the rolling back + // information will be recorded in the lock. let caller_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_check_txn_status( 1, @@ -2082,10 +2086,11 @@ fn test_cdc_extract_rollback_if_gc_fence_set_impl() { suite.stop(); } -// This test is created for covering the case that term was increased without leader change. 
-// Ideally leader id and term in StoreMeta should be updated together with a yielded SoftState, -// but sometimes the leader was transferred to another store and then changed back, -// a follower would not get a new SoftState. +// This test is created for covering the case that term was increased without +// leader change. Ideally leader id and term in StoreMeta should be updated +// together with a yielded SoftState, but sometimes the leader was transferred +// to another store and then changed back, a follower would not get a new +// SoftState. #[test] fn test_term_change() { let cluster = new_server_cluster(0, 3); diff --git a/components/cloud/aws/src/kms.rs b/components/cloud/aws/src/kms.rs index 11ecf88ddd9..3d5d6a3fdea 100644 --- a/components/cloud/aws/src/kms.rs +++ b/components/cloud/aws/src/kms.rs @@ -82,8 +82,8 @@ impl KmsProvider for AwsKms { ENCRYPTION_VENDOR_NAME_AWS_KMS } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. + // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. async fn decrypt_data_key(&self, data_key: &EncryptedKey) -> Result> { let decrypt_request = DecryptRequest { ciphertext_blob: bytes::Bytes::copy_from_slice(&*data_key), @@ -125,8 +125,8 @@ impl KmsProvider for AwsKms { } } -// Rusoto errors Display implementation just gives the cause message and discards the type. -// This is really bad when the cause message is empty! +// Rusoto errors Display implementation just gives the cause message and +// discards the type. This is really bad when the cause message is empty! 
// Use Debug instead: this will show both pub struct FixRusotoErrorDisplay( RusotoError, diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index e2e9919860b..fd5c07c5097 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -259,8 +259,9 @@ impl From> for UploadError { } /// try_read_exact tries to read exact length data as the buffer size. -/// like [`std::io::Read::read_exact`], but won't return `UnexpectedEof` when cannot read anything more from the `Read`. -/// once returning a size less than the buffer length, implies a EOF was meet, or nothing readed. +/// like [`std::io::Read::read_exact`], but won't return `UnexpectedEof` when +/// cannot read anything more from the `Read`. once returning a size less than +/// the buffer length, implies a EOF was meet, or nothing read. async fn try_read_exact( r: &mut R, buf: &mut [u8], @@ -283,7 +284,8 @@ async fn try_read_exact( const MINIMUM_PART_SIZE: usize = 5 * 1024 * 1024; impl<'client> S3Uploader<'client> { - /// Creates a new uploader with a given target location and upload configuration. + /// Creates a new uploader with a given target location and upload + /// configuration. fn new(client: &'client S3Client, config: &Config, key: String) -> Self { Self { client, @@ -370,7 +372,8 @@ impl<'client> S3Uploader<'client> { } } - /// Completes a multipart upload process, asking S3 to join all parts into a single file. + /// Completes a multipart upload process, asking S3 to join all parts into a + /// single file. async fn complete(&self) -> Result<(), RusotoError> { let res = timeout( Self::get_timeout(), @@ -452,8 +455,8 @@ impl<'client> S3Uploader<'client> { /// Uploads a file atomically. /// - /// This should be used only when the data is known to be short, and thus relatively cheap to - /// retry the entire upload. + /// This should be used only when the data is known to be short, and thus + /// relatively cheap to retry the entire upload. 
async fn upload(&self, data: &[u8]) -> Result<(), RusotoError> { let res = timeout(Self::get_timeout(), async { #[cfg(feature = "failpoints")] @@ -540,9 +543,9 @@ impl BlobStorage for S3Storage { } else { io::ErrorKind::Other }; - // Even we can check whether there is an `io::Error` internal and extract it directly, - // We still need to keep the message 'failed to put object' here for adapting the string-matching based - // retry logic in BR :( + // Even we can check whether there is an `io::Error` internal and extract it + // directly, We still need to keep the message 'failed to put object' here for + // adapting the string-matching based retry logic in BR :( io::Error::new(error_code, format!("failed to put object {}", e)) }) } @@ -628,7 +631,8 @@ mod tests { // set multi_part_size to use upload_part function config.multi_part_size = multi_part_size; - // split magic_contents into 3 parts, so we mock 5 requests here(1 begin + 3 part + 1 complete) + // split magic_contents into 3 parts, so we mock 5 requests here(1 begin + 3 + // part + 1 complete) let dispatcher = MultipleMockRequestDispatcher::new(vec![ MockRequestDispatcher::with_status(200).with_body( r#" @@ -904,7 +908,8 @@ mod tests { use self::try_read_exact; - /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each `read` call. + /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each + /// `read` call. struct ThrottleRead(R); impl Read for ThrottleRead { fn read(&mut self, buf: &mut [u8]) -> io::Result { diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index c322f1d0edc..2d7f2566509 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -247,8 +247,6 @@ impl RetryError for RequestError { const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); /// A helper for uploading a large file to Azure storage. 
-/// -/// struct AzureUploader { client_builder: Arc, name: String, @@ -257,7 +255,8 @@ struct AzureUploader { } impl AzureUploader { - /// Creates a new uploader with a given target location and upload configuration. + /// Creates a new uploader with a given target location and upload + /// configuration. fn new(client_builder: Arc, config: &Config, name: String) -> Self { AzureUploader { client_builder, @@ -288,8 +287,8 @@ impl AzureUploader { /// Uploads a file atomically. /// - /// This should be used only when the data is known to be short, and thus relatively cheap to - /// retry the entire upload. + /// This should be used only when the data is known to be short, and thus + /// relatively cheap to retry the entire upload. async fn upload(&self, data: &[u8]) -> Result<(), RequestError> { match timeout(Self::get_timeout(), async { self.client_builder diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index 08ee60a52bf..a3401dbf6c8 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -424,8 +424,8 @@ impl BlobStorage for GCSStorage { ..Default::default() }; - // FIXME: Switch to upload() API so we don't need to read the entire data into memory - // in order to retry. + // FIXME: Switch to upload() API so we don't need to read the entire data into + // memory in order to retry. 
let mut data = Vec::with_capacity(content_length as usize); reader.read_to_end(&mut data).await?; retry(|| async { @@ -456,7 +456,7 @@ impl BlobStorage for GCSStorage { Ok(oid) => oid, Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), }; - let request = match Object::download(&oid, None /*optional*/) { + let request = match Object::download(&oid, None /* optional */) { Ok(request) => request.map(|_: io::Empty| Body::empty()), Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::Other, e), }; diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index 4685b5ae851..2e38097e385 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -15,7 +15,8 @@ pub trait BlobConfig: 'static + Send + Sync { /// It is identity to [external_storage::UnpinReader], /// only for decoupling external_storage and cloud package. /// -/// See the documentation of [external_storage::UnpinReader] for why those wrappers exists. +/// See the documentation of [external_storage::UnpinReader] for why those +/// wrappers exists. pub struct PutResource(pub Box); impl AsyncRead for PutResource { diff --git a/components/codec/src/buffer.rs b/components/codec/src/buffer.rs index e19e66b91e1..4010ecdf04f 100644 --- a/components/codec/src/buffer.rs +++ b/components/codec/src/buffer.rs @@ -23,11 +23,13 @@ pub trait BufferReader { /// TODO: We should make the panic behaviour deterministic. fn advance(&mut self, count: usize); - /// Read next several bytes as a slice and advance the position of internal cursor. + /// Read next several bytes as a slice and advance the position of internal + /// cursor. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to read specified number of bytes. + /// Returns `Error::Io` if there is not enough space to read specified + /// number of bytes. 
fn read_bytes(&mut self, count: usize) -> Result<&[u8]>; } @@ -129,14 +131,16 @@ pub trait BufferWriter { /// The caller may hint the underlying buffer to grow according to `size` /// if the underlying buffer is dynamically sized (i.e. is capable to grow). /// - /// The size of the returned slice may be less than `size` given. For example, - /// when underlying buffer is fixed sized and there is no enough space any more. + /// The size of the returned slice may be less than `size` given. For + /// example, when underlying buffer is fixed sized and there is no + /// enough space any more. /// /// # Safety /// - /// The returned mutable slice is for writing only and should be never used for - /// reading since it might contain uninitialized memory when underlying buffer - /// is dynamically sized. For this reason, this function is marked `unsafe`. + /// The returned mutable slice is for writing only and should be never used + /// for reading since it might contain uninitialized memory when + /// underlying buffer is dynamically sized. For this reason, this + /// function is marked `unsafe`. unsafe fn bytes_mut(&mut self, size: usize) -> &mut [u8]; /// Advances the position of internal cursor for a previous write. @@ -490,7 +494,6 @@ mod tests { let mut buffer = base.clone(); let mut buf_slice = buffer.as_mut_slice(); - // let buffer_viewer = std::slice::from_raw_parts(buffer as *const u8, buffer.len()); buf_slice.bytes_mut(13)[..13].clone_from_slice(&base_write[0..13]); assert_eq!(&buf_slice[0..13], &base_write[0..13]); @@ -584,8 +587,8 @@ mod tests { } } - /// Test whether it is safe to store values in `Vec` after `len()`, i.e. during - /// reallocation these values are copied. + /// Test whether it is safe to store values in `Vec` after `len()`, + /// i.e. during reallocation these values are copied. #[test] // FIXME(#4331) Don't ignore this test. #[ignore] @@ -632,7 +635,6 @@ mod tests { // Re-allocate the vector space and ensure that the address is changed. 
vec.reserve(::std::cmp::max(payload_len * 3, 32)); - //assert_ne!(vec_ptr, vec.as_ptr()); if vec_ptr == vec.as_ptr() { in_place_reallocs += 1; } diff --git a/components/codec/src/byte.rs b/components/codec/src/byte.rs index 53b8091ac8c..63143938c13 100644 --- a/components/codec/src/byte.rs +++ b/components/codec/src/byte.rs @@ -21,9 +21,9 @@ impl MemComparableByteCodec { (src_len / MEMCMP_GROUP_SIZE + 1) * (MEMCMP_GROUP_SIZE + 1) } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the memory-comparable format. If the buffer is not complete, the length of buffer will be - /// returned. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the memory-comparable format. If the buffer is + /// not complete, the length of buffer will be returned. #[inline] fn get_first_encoded_len_internal(encoded: &[u8]) -> usize { let mut idx = MEMCMP_GROUP_SIZE; @@ -39,23 +39,25 @@ impl MemComparableByteCodec { } } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the ascending memory-comparable format. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the ascending memory-comparable format. pub fn get_first_encoded_len(encoded: &[u8]) -> usize { Self::get_first_encoded_len_internal::(encoded) } - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the descending memory-comparable format. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the descending memory-comparable format. pub fn get_first_encoded_len_desc(encoded: &[u8]) -> usize { Self::get_first_encoded_len_internal::(encoded) } - /// Encodes all bytes in the `src` into `dest` in ascending memory-comparable format. + /// Encodes all bytes in the `src` into `dest` in ascending + /// memory-comparable format. 
/// /// Returns the number of bytes encoded. /// - /// `dest` must not overlaps `src`, otherwise encoded results will be incorrect. + /// `dest` must not overlaps `src`, otherwise encoded results will be + /// incorrect. /// /// # Panics /// @@ -99,7 +101,8 @@ impl MemComparableByteCodec { } } - /// Encodes the bytes `src[..len]` in ascending memory-comparable format in place. + /// Encodes the bytes `src[..len]` in ascending memory-comparable format in + /// place. /// /// Returns the number of bytes encoded. /// @@ -159,11 +162,13 @@ impl MemComparableByteCodec { } } - /// Encodes all bytes in the `src` into `dest` in descending memory-comparable format. + /// Encodes all bytes in the `src` into `dest` in descending + /// memory-comparable format. /// /// Returns the number of bytes encoded. /// - /// `dest` must not overlaps `src`, otherwise encoded results will be incorrect. + /// `dest` must not overlaps `src`, otherwise encoded results will be + /// incorrect. /// /// # Panics /// @@ -176,7 +181,8 @@ impl MemComparableByteCodec { encoded_len } - /// Encodes the bytes `src[..len]` in descending memory-comparable format in place. + /// Encodes the bytes `src[..len]` in descending memory-comparable format in + /// place. /// /// Returns the number of bytes encoded. /// @@ -189,21 +195,25 @@ impl MemComparableByteCodec { encoded_len } - /// Decodes bytes in ascending memory-comparable format in the `src` into `dest`. + /// Decodes bytes in ascending memory-comparable format in the `src` into + /// `dest`. /// - /// If there are multiple encoded byte slices in `src`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `src`, only the first one + /// will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read in - /// `src` and `written_bytes` is the number of bytes written in `dest`. 
+ /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read in `src` and `written_bytes` is the number of bytes + /// written in `dest`. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// If `src == dest`, please use `try_decode_first_in_place`. /// /// # Panics /// - /// Panics if `dest.len() < src.len()`, although actual written data may be less. + /// Panics if `dest.len() < src.len()`, although actual written data may be + /// less. /// /// When there is a panic, `dest` may contain partially written data. /// @@ -223,21 +233,25 @@ impl MemComparableByteCodec { ) } - /// Decodes bytes in descending memory-comparable format in the `src` into `dest`. + /// Decodes bytes in descending memory-comparable format in the `src` into + /// `dest`. /// - /// If there are multiple encoded byte slices in `src`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `src`, only the first one + /// will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read in - /// `src` and `written_bytes` is the number of bytes written in `dest`. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read in `src` and `written_bytes` is the number of bytes + /// written in `dest`. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// If `src == dest`, please use `try_decode_first_in_place_desc`. 
/// /// # Panics /// - /// Panics if `dest.len() < src.len()`, although actual written data may be less. + /// Panics if `dest.len() < src.len()`, although actual written data may be + /// less. /// /// When there is a panic, `dest` may contain partially written data. /// @@ -259,16 +273,17 @@ impl MemComparableByteCodec { Ok((read_bytes, written_bytes)) } - /// Decodes bytes in ascending memory-comparable format in place, i.e. decoded data will - /// overwrite the encoded data. + /// Decodes bytes in ascending memory-comparable format in place, i.e. + /// decoded data will overwrite the encoded data. /// - /// If there are multiple encoded byte slices in `buffer`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `buffer`, only the first + /// one will be decoded. /// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read - /// and `written_bytes` is the number of bytes written. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read and `written_bytes` is the number of bytes written. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// # Errors /// @@ -286,16 +301,17 @@ impl MemComparableByteCodec { ) } - /// Decodes bytes in descending memory-comparable format in place, i.e. decoded data will - /// overwrite the encoded data. + /// Decodes bytes in descending memory-comparable format in place, i.e. + /// decoded data will overwrite the encoded data. /// - /// If there are multiple encoded byte slices in `buffer`, only the first one will be decoded. + /// If there are multiple encoded byte slices in `buffer`, only the first + /// one will be decoded. 
/// - /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number of bytes read - /// and `written_bytes` is the number of bytes written. + /// Returns `(read_bytes, written_bytes)` where `read_bytes` is the number + /// of bytes read and `written_bytes` is the number of bytes written. /// - /// Note that actual written data may be larger than `written_bytes`. Bytes more than - /// `written_bytes` are junk and should be ignored. + /// Note that actual written data may be larger than `written_bytes`. Bytes + /// more than `written_bytes` are junk and should be ignored. /// /// # Errors /// @@ -323,10 +339,12 @@ impl MemComparableByteCodec { /// /// This function uses pointers to accept the scenario that `src == dest`. /// - /// This function also uses generics to specialize different code path for ascending and - /// descending decoding, which performs better than inlining a flag. + /// This function also uses generics to specialize different code path for + /// ascending and descending decoding, which performs better than + /// inlining a flag. /// - /// Please refer to `try_decode_first` for the meaning of return values, panics and errors. + /// Please refer to `try_decode_first` for the meaning of return values, + /// panics and errors. #[inline] fn try_decode_first_internal( mut src_ptr: *const u8, @@ -395,7 +413,8 @@ impl MemComparableByteCodec { trait MemComparableCodecHelper { const PADDING: [u8; MEMCMP_GROUP_SIZE]; - /// Given a raw padding size byte, interprets the padding size according to correct order. + /// Given a raw padding size byte, interprets the padding size according to + /// correct order. fn parse_padding_size(raw_marker: u8) -> usize; } @@ -476,8 +495,9 @@ impl MemComparableByteDecoder for T {} pub struct CompactByteCodec; impl CompactByteCodec { - /// Gets the length of the first encoded byte sequence in the given buffer, which is encoded in - /// the compact format. 
If the buffer is not complete, the length of buffer will be returned. + /// Gets the length of the first encoded byte sequence in the given buffer, + /// which is encoded in the compact format. If the buffer is not complete, + /// the length of buffer will be returned. pub fn get_first_encoded_len(encoded: &[u8]) -> usize { let result = NumberCodec::try_decode_var_i64(encoded); match result { @@ -968,8 +988,9 @@ mod tests { fn test_memcmp_try_decode_first() { use super::MEMCMP_GROUP_SIZE as N; - // We have ensured correctness in `test_memcmp_encode_all`, so we use `encode_all` to - // generate fixtures in different length, used for decoding. + // We have ensured correctness in `test_memcmp_encode_all`, so we use + // `encode_all` to generate fixtures in different length, used for + // decoding. fn do_test( is_desc: bool, diff --git a/components/codec/src/error.rs b/components/codec/src/error.rs index 2483bd541de..b85d8dd078d 100644 --- a/components/codec/src/error.rs +++ b/components/codec/src/error.rs @@ -27,8 +27,7 @@ impl ErrorInner { } } -// ====== The code below is to box the error so that the it can be as small as possible ====== - +// Box the error so that the it can be as small as possible #[derive(Debug, Error)] #[error(transparent)] pub struct Error(#[from] pub Box); diff --git a/components/codec/src/number.rs b/components/codec/src/number.rs index 4cc114e7ea7..af47905334d 100644 --- a/components/codec/src/number.rs +++ b/components/codec/src/number.rs @@ -403,7 +403,8 @@ impl NumberCodec { } /// Encodes an unsigned 64 bit integer `v` to `buf` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: VarInt encoding is slow, try avoid using it. /// @@ -429,13 +430,15 @@ impl NumberCodec { } /// Decodes an unsigned 64 bit integer from `buf` in VarInt encoding. 
- /// Returns decoded result and the number of bytes that successfully decoded. + /// Returns decoded result and the number of bytes that successfully + /// decoded. /// /// This function is more efficient when `buf.len() >= 10`. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. pub fn try_decode_var_u64(buf: &[u8]) -> Result<(u64, usize)> { #[allow(clippy::cast_lossless)] unsafe { @@ -478,7 +481,8 @@ impl NumberCodec { } /// Encodes a signed 64 bit integer `v` to `buf` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: VarInt encoding is slow, try avoid using it. /// @@ -495,13 +499,15 @@ impl NumberCodec { } /// Decodes a signed 64 bit integer from `buf` in VarInt encoding. - /// Returns decoded result and the number of bytes that successfully decoded. + /// Returns decoded result and the number of bytes that successfully + /// decoded. /// /// This function is more efficient when `buf.len() >= 10`. /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] pub fn try_decode_var_i64(buf: &[u8]) -> Result<(i64, usize)> { let (uv, decoded_bytes) = Self::try_decode_var_u64(buf)?; @@ -514,8 +520,8 @@ impl NumberCodec { } } - /// Gets the length of the first encoded VarInt in the given buffer. If the buffer is not - /// complete, the length of buffer will be returned. + /// Gets the length of the first encoded VarInt in the given buffer. If the + /// buffer is not complete, the length of buffer will be returned. /// /// This function is more efficient when `buf.len() >= 10`. 
pub fn get_first_encoded_var_int_len(buf: &[u8]) -> usize { @@ -761,7 +767,8 @@ pub trait NumberDecoder: BufferReader { /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] fn read_var_u64(&mut self) -> Result { let (v, decoded_bytes) = { @@ -779,7 +786,8 @@ pub trait NumberDecoder: BufferReader { /// /// # Errors /// - /// Returns `Error::Io` if there is not enough space to decode the whole VarInt. + /// Returns `Error::Io` if there is not enough space to decode the whole + /// VarInt. #[inline] fn read_var_i64(&mut self) -> Result { let (v, decoded_bytes) = { @@ -1015,11 +1023,13 @@ pub trait NumberEncoder: BufferWriter { } /// Writes an unsigned 64 bit integer `v` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: /// - VarInt encoding is slow, try avoid using it. - /// - The buffer must reserve 10 bytes for writing, although actual written bytes may be less. + /// - The buffer must reserve 10 bytes for writing, although actual written + /// bytes may be less. /// - The buffer will be advanced by actual written bytes. /// /// # Errors @@ -1039,11 +1049,13 @@ pub trait NumberEncoder: BufferWriter { } /// Writes a signed 64 bit integer `v` in VarInt encoding, - /// which is not memory-comparable. Returns the number of bytes that encoded. + /// which is not memory-comparable. Returns the number of bytes that + /// encoded. /// /// Note: /// - VarInt encoding is slow, try avoid using it. - /// - The buffer must reserve 10 bytes for writing, although actual written bytes may be less. + /// - The buffer must reserve 10 bytes for writing, although actual written + /// bytes may be less. /// - The buffer will be advanced by actual written bytes. 
/// /// # Errors @@ -1818,7 +1830,8 @@ mod benches { use crate::ErrorInner; - /// Encode u64 little endian using `NumberCodec` and store position in extra variable. + /// Encode u64 little endian using `NumberCodec` and store position in extra + /// variable. #[bench] fn bench_encode_u64_le_number_codec(b: &mut test::Bencher) { let mut buf: [u8; 10] = [0; 10]; @@ -1834,7 +1847,8 @@ mod benches { }); } - /// Encode u64 little endian using `byteorder::WriteBytesExt` over a `Cursor<&mut [u8]>`. + /// Encode u64 little endian using `byteorder::WriteBytesExt` over a + /// `Cursor<&mut [u8]>`. #[bench] fn bench_encode_u64_le_byteorder(b: &mut test::Bencher) { use byteorder::WriteBytesExt; @@ -1852,7 +1866,8 @@ mod benches { }); } - /// Encode u64 little endian using `NumberEncoder` over a `Cursor<&mut [u8]>`. + /// Encode u64 little endian using `NumberEncoder` over a `Cursor<&mut + /// [u8]>`. #[bench] fn bench_encode_u64_le_buffer_encoder_slice(b: &mut test::Bencher) { use super::NumberEncoder; @@ -1881,7 +1896,8 @@ mod benches { }); } - /// Decode u64 little endian using `NumberCodec` and store position in extra variable. + /// Decode u64 little endian using `NumberCodec` and store position in extra + /// variable. #[bench] fn bench_decode_u64_le_number_codec(b: &mut test::Bencher) { let buf: [u8; 10] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]; @@ -1894,7 +1910,8 @@ mod benches { }); } - /// Decode u64 little endian using `NumberCodec` and store position via slice index. + /// Decode u64 little endian using `NumberCodec` and store position via + /// slice index. #[bench] fn bench_decode_u64_le_number_codec_over_slice(b: &mut test::Bencher) { let buf: Vec = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 0]; @@ -1907,7 +1924,8 @@ mod benches { }); } - /// Decode u64 little endian using `byteorder::ReadBytesExt` over a `Cursor<&[u8]>`. + /// Decode u64 little endian using `byteorder::ReadBytesExt` over a + /// `Cursor<&[u8]>`. 
#[bench] fn bench_decode_u64_le_byteorder(b: &mut test::Bencher) { use byteorder::ReadBytesExt; diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index 7865f43fc78..b80501b5433 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -58,8 +58,8 @@ impl ConcurrencyManager { } } - /// Acquires a mutex of the key and returns an RAII guard. When the guard goes - /// out of scope, the mutex will be unlocked. + /// Acquires a mutex of the key and returns an RAII guard. When the guard + /// goes out of scope, the mutex will be unlocked. /// /// The guard can be used to store Lock in the table. The stored lock /// is visible to `read_key_check` and `read_range_check`. @@ -67,8 +67,8 @@ impl ConcurrencyManager { self.lock_table.lock_key(key).await } - /// Acquires mutexes of the keys and returns the RAII guards. The order of the - /// guards is the same with the given keys. + /// Acquires mutexes of the keys and returns the RAII guards. The order of + /// the guards is the same with the given keys. /// /// The guards can be used to store Lock in the table. The stored lock /// is visible to `read_key_check` and `read_range_check`. diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index 2b9e87f8f39..da08d9983d1 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -33,12 +33,13 @@ impl LockTable { let entry = self.0.get_or_insert(key.clone(), weak); if entry.value().ptr_eq(&weak2) { // If the weak ptr returned by `get_or_insert` equals to the one we inserted, - // `guard` refers to the KeyHandle in the lock table. Now, we can bind the handle - // to the table. + // `guard` refers to the KeyHandle in the lock table. Now, we can bind the + // handle to the table. 
- // SAFETY: The `table` field in `KeyHandle` is only accessed through the `set_table` - // or the `drop` method. It's impossible to have a concurrent `drop` here and `set_table` - // is only called here. So there is no concurrent access to the `table` field in `KeyHandle`. + // SAFETY: The `table` field in `KeyHandle` is only accessed through the + // `set_table` or the `drop` method. It's impossible to have a concurrent `drop` + // here and `set_table` is only called here. So there is no concurrent access to + // the `table` field in `KeyHandle`. unsafe { guard.handle().set_table(self.clone()); } diff --git a/components/concurrency_manager/tests/memory_usage.rs b/components/concurrency_manager/tests/memory_usage.rs index b3b62ab5849..34ce9986a61 100644 --- a/components/concurrency_manager/tests/memory_usage.rs +++ b/components/concurrency_manager/tests/memory_usage.rs @@ -11,7 +11,8 @@ use rand::prelude::*; use txn_types::{Key, Lock, LockType}; // This test is heavy so we shouldn't run it daily. -// Run it with the following command (recommending release mode) and see the printed stats: +// Run it with the following command (recommending release mode) and see the +// printed stats: // // ``` // cargo test --package concurrency_manager --test memory_usage --features jemalloc --release -- test_memory_usage --exact --ignored --nocapture diff --git a/components/coprocessor_plugin_api/src/allocator.rs b/components/coprocessor_plugin_api/src/allocator.rs index 7d7140b6170..d8c2ab5062f 100644 --- a/components/coprocessor_plugin_api/src/allocator.rs +++ b/components/coprocessor_plugin_api/src/allocator.rs @@ -9,8 +9,8 @@ type DeallocFn = unsafe fn(*mut u8, Layout); /// Used to initialize the plugin's allocator. /// -/// A `HostAllocatorPtr` contains the relevant pointers to initialize the allocator of -/// to plugin. It will be passed from TiKV to the plugin. +/// A `HostAllocatorPtr` contains the relevant pointers to initialize the +/// allocator of to plugin. 
It will be passed from TiKV to the plugin. #[repr(C)] pub struct HostAllocatorPtr { pub alloc_fn: AllocFn, @@ -26,8 +26,9 @@ pub struct HostAllocator { impl HostAllocator { /// Creates a new [`HostAllocator`]. /// - /// The internal function pointers are initially `None`, so any attempt to allocate memory - /// before a call to [`set_allocator()`] will result in a panic. + /// The internal function pointers are initially `None`, so any attempt to + /// allocate memory before a call to [`set_allocator()`] will result in + /// a panic. pub const fn new() -> Self { HostAllocator { alloc_fn: Atomic::new(None), @@ -35,9 +36,10 @@ impl HostAllocator { } } - /// Updates the function pointers of the [`HostAllocator`] to the given [`HostAllocatorPtr`]. - /// This function needs to be called before _any_ allocation with this allocator is performed, - /// because otherwise the [`HostAllocator`] is in an invalid state. + /// Updates the function pointers of the [`HostAllocator`] to the given + /// [`HostAllocatorPtr`]. This function needs to be called before _any_ + /// allocation with this allocator is performed, because otherwise the + /// [`HostAllocator`] is in an invalid state. pub fn set_allocator(&self, allocator: HostAllocatorPtr) { self.alloc_fn .store(Some(allocator.alloc_fn), Ordering::SeqCst); diff --git a/components/coprocessor_plugin_api/src/errors.rs b/components/coprocessor_plugin_api/src/errors.rs index 7085fa98edd..78961d60df8 100644 --- a/components/coprocessor_plugin_api/src/errors.rs +++ b/components/coprocessor_plugin_api/src/errors.rs @@ -9,9 +9,10 @@ pub type PluginResult = std::result::Result; /// Error returned by operations on [`RawStorage`]. /// -/// If a plugin wants to return a custom error, e.g. an error in the business logic, the plugin should -/// return an appropriately encoded error in [`RawResponse`]; in other words, plugins are responsible -/// for their error handling by themselves. +/// If a plugin wants to return a custom error, e.g. 
an error in the business +/// logic, the plugin should return an appropriately encoded error in +/// [`RawResponse`]; in other words, plugins are responsible for their error +/// handling by themselves. #[derive(Debug)] pub enum PluginError { KeyNotInRegion { @@ -23,11 +24,12 @@ pub enum PluginError { Timeout(Duration), Canceled, - /// Errors that can not be handled by a coprocessor plugin but should instead be returned to the - /// client. + /// Errors that can not be handled by a coprocessor plugin but should + /// instead be returned to the client. /// - /// If such an error appears, plugins can run some cleanup code and return early from the - /// request. The error will be passed to the client and the client might retry the request. + /// If such an error appears, plugins can run some cleanup code and return + /// early from the request. The error will be passed to the client and + /// the client might retry the request. Other(String, Box), } diff --git a/components/coprocessor_plugin_api/src/lib.rs b/components/coprocessor_plugin_api/src/lib.rs index ca61b54c724..7f05840c072 100644 --- a/components/coprocessor_plugin_api/src/lib.rs +++ b/components/coprocessor_plugin_api/src/lib.rs @@ -1,25 +1,30 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! This crate contains some necessary types and traits for implementing a custom coprocessor plugin -//! for TiKV. +//! This crate contains some necessary types and traits for implementing a +//! custom coprocessor plugin for TiKV. //! -//! Most notably, if you want to write a custom plugin, your plugin needs to implement the -//! [`CoprocessorPlugin`] trait. The plugin then needs to be compiled to a `dylib`. +//! Most notably, if you want to write a custom plugin, your plugin needs to +//! implement the [`CoprocessorPlugin`] trait. The plugin then needs to be +//! compiled to a `dylib`. //! -//! > Note: Only `dylib` is supported, and not `cdylib` or `staticlib`, because the latter two are -//! 
> not able to use TiKV's allocator. See also the documentation in [`std::alloc`]. +//! > Note: Only `dylib` is supported, and not `cdylib` or `staticlib`, because +//! > the latter two are +//! > not able to use TiKV's allocator. See also the documentation in +//! > [`std::alloc`]. //! -//! In order to make your plugin callable, you need to declare a constructor with the -//! [`declare_plugin`] macro. +//! In order to make your plugin callable, you need to declare a constructor +//! with the [`declare_plugin`] macro. //! -//! A plugin can interact with the underlying storage via the [`RawStorage`] trait. +//! A plugin can interact with the underlying storage via the [`RawStorage`] +//! trait. //! //! # Example //! //! ```no_run -//! use coprocessor_plugin_api::*; //! use std::ops::Range; //! +//! use coprocessor_plugin_api::*; +//! //! #[derive(Default)] //! struct MyPlugin; //! diff --git a/components/coprocessor_plugin_api/src/plugin_api.rs b/components/coprocessor_plugin_api/src/plugin_api.rs index 31f87f3c822..f31c3f9bab2 100644 --- a/components/coprocessor_plugin_api/src/plugin_api.rs +++ b/components/coprocessor_plugin_api/src/plugin_api.rs @@ -7,31 +7,32 @@ use crate::PluginResult; /// Raw bytes of the request payload from the client to the coprocessor. pub type RawRequest = Vec; -/// The response from the coprocessor encoded as raw bytes that are sent back to the client. +/// The response from the coprocessor encoded as raw bytes that are sent back to +/// the client. pub type RawResponse = Vec; /// A plugin that allows users to execute arbitrary code on TiKV nodes. /// -/// If you want to implement a custom coprocessor plugin for TiKV, your plugin needs to implement -/// the [`CoprocessorPlugin`] trait. +/// If you want to implement a custom coprocessor plugin for TiKV, your plugin +/// needs to implement the [`CoprocessorPlugin`] trait. /// -/// Plugins can run setup code in their constructor and teardown code by implementing -/// [`std::ops::Drop`]. 
+/// Plugins can run setup code in their constructor and teardown code by +/// implementing [`std::ops::Drop`]. pub trait CoprocessorPlugin: Send + Sync { /// Handles a request to the coprocessor. /// - /// The data in the `request` parameter is exactly the same data that was passed with the - /// `RawCoprocessorRequest` in the `data` field. Each plugin is responsible to properly decode - /// the raw bytes by itself. - /// The same is true for the return parameter of this function. Upon successful completion, the - /// function should return a properly encoded result as raw bytes which is then sent back to - /// the client. + /// The data in the `request` parameter is exactly the same data that was + /// passed with the `RawCoprocessorRequest` in the `data` field. Each + /// plugin is responsible to properly decode the raw bytes by itself. + /// The same is true for the return parameter of this function. Upon + /// successful completion, the function should return a properly encoded + /// result as raw bytes which is then sent back to the client. /// - /// Most of the time, it's a good idea to use Protobuf for encoding/decoding, but in general you - /// can also send raw bytes. + /// Most of the time, it's a good idea to use Protobuf for + /// encoding/decoding, but in general you can also send raw bytes. /// - /// Plugins can read and write data from the underlying [`RawStorage`] via the `storage` - /// parameter. + /// Plugins can read and write data from the underlying [`RawStorage`] via + /// the `storage` parameter. fn on_raw_coprocessor_request( &self, ranges: Vec>, diff --git a/components/coprocessor_plugin_api/src/storage_api.rs b/components/coprocessor_plugin_api/src/storage_api.rs index 3adfa7c4a7e..08c09ca4a48 100644 --- a/components/coprocessor_plugin_api/src/storage_api.rs +++ b/components/coprocessor_plugin_api/src/storage_api.rs @@ -15,38 +15,44 @@ pub type KvPair = (Key, Value); /// Storage access for coprocessor plugins. 
/// -/// [`RawStorage`] allows coprocessor plugins to interact with TiKV storage on a low level. +/// [`RawStorage`] allows coprocessor plugins to interact with TiKV storage on a +/// low level. /// /// Batch operations should be preferred due to their better performance. #[async_trait(?Send)] pub trait RawStorage { - /// Retrieves the value for a given key from the storage on the current node. - /// Returns [`Option::None`] if the key is not present in the database. + /// Retrieves the value for a given key from the storage on the current + /// node. Returns [`Option::None`] if the key is not present in the + /// database. async fn get(&self, key: Key) -> PluginResult>; - /// Same as [`RawStorage::get()`], but retrieves values for multiple keys at once. + /// Same as [`RawStorage::get()`], but retrieves values for multiple keys at + /// once. async fn batch_get(&self, keys: Vec) -> PluginResult>; - /// Same as [`RawStorage::get()`], but accepts a `key_range` such that values for keys in - /// `[key_range.start, key_range.end)` are retrieved. + /// Same as [`RawStorage::get()`], but accepts a `key_range` such that + /// values for keys in `[key_range.start, key_range.end)` are retrieved. /// The upper bound of the `key_range` is exclusive. async fn scan(&self, key_range: Range) -> PluginResult>; /// Inserts a new key-value pair into the storage on the current node. async fn put(&self, key: Key, value: Value) -> PluginResult<()>; - /// Same as [`RawStorage::put()`], but inserts multiple key-value pairs at once. + /// Same as [`RawStorage::put()`], but inserts multiple key-value pairs at + /// once. async fn batch_put(&self, kv_pairs: Vec) -> PluginResult<()>; - /// Deletes a key-value pair from the storage on the current node given a `key`. - /// Returns [`Result::Ok]` if the key was successfully deleted. + /// Deletes a key-value pair from the storage on the current node given a + /// `key`. Returns [`Result::Ok]` if the key was successfully deleted. 
async fn delete(&self, key: Key) -> PluginResult<()>; - /// Same as [`RawStorage::delete()`], but deletes multiple key-value pairs at once. + /// Same as [`RawStorage::delete()`], but deletes multiple key-value pairs + /// at once. async fn batch_delete(&self, keys: Vec) -> PluginResult<()>; - /// Same as [`RawStorage::delete()`], but deletes multiple key-values pairs at once - /// given a `key_range`. All records with keys in `[key_range.start, key_range.end)` - /// will be deleted. The upper bound of the `key_range` is exclusive. + /// Same as [`RawStorage::delete()`], but deletes multiple key-values pairs + /// at once given a `key_range`. All records with keys in + /// `[key_range.start, key_range.end)` will be deleted. The upper bound + /// of the `key_range` is exclusive. async fn delete_range(&self, key_range: Range) -> PluginResult<()>; } diff --git a/components/coprocessor_plugin_api/src/util.rs b/components/coprocessor_plugin_api/src/util.rs index fd15a26a1c8..816b0d12162 100644 --- a/components/coprocessor_plugin_api/src/util.rs +++ b/components/coprocessor_plugin_api/src/util.rs @@ -2,33 +2,40 @@ use super::{allocator::HostAllocatorPtr, plugin_api::CoprocessorPlugin}; -/// Name of the exported constructor with signature [`PluginConstructorSignature`] for the plugin. +/// Name of the exported constructor with signature +/// [`PluginConstructorSignature`] for the plugin. pub static PLUGIN_CONSTRUCTOR_SYMBOL: &[u8] = b"_plugin_create"; -/// Name of the exported function with signature [`PluginGetBuildInfoSignature`] to get build -/// information about the plugin. +/// Name of the exported function with signature [`PluginGetBuildInfoSignature`] +/// to get build information about the plugin. pub static PLUGIN_GET_BUILD_INFO_SYMBOL: &[u8] = b"_plugin_get_build_info"; -/// Name of the exported function with signature [`PluginGetPluginInfoSignature`] to get some -/// information about the plugin. 
+/// Name of the exported function with signature +/// [`PluginGetPluginInfoSignature`] to get some information about the plugin. pub static PLUGIN_GET_PLUGIN_INFO_SYMBOL: &[u8] = b"_plugin_get_plugin_info"; -/// Type signature of the exported function with symbol [`PLUGIN_CONSTRUCTOR_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_CONSTRUCTOR_SYMBOL`]. pub type PluginConstructorSignature = unsafe fn(host_allocator: HostAllocatorPtr) -> *mut dyn CoprocessorPlugin; -/// Type signature of the exported function with symbol [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_GET_BUILD_INFO_SYMBOL`]. pub type PluginGetBuildInfoSignature = extern "C" fn() -> BuildInfo; -/// Type signature of the exported function with symbol [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. +/// Type signature of the exported function with symbol +/// [`PLUGIN_GET_PLUGIN_INFO_SYMBOL`]. pub type PluginGetPluginInfoSignature = extern "C" fn() -> PluginInfo; -/// Automatically collected build information about the plugin that is exposed from the library. +/// Automatically collected build information about the plugin that is exposed +/// from the library. /// -/// Will be automatically created when using [`declare_plugin!(...)`](declare_plugin) and will be -/// used by TiKV when a plugin is loaded to determine whether there are compilation mismatches. +/// Will be automatically created when using +/// [`declare_plugin!(...)`](declare_plugin) and will be used by TiKV when a +/// plugin is loaded to determine whether there are compilation mismatches. #[repr(C)] #[derive(Debug, Clone, PartialEq, Eq)] pub struct BuildInfo { - /// Version of the [`coprocessor_plugin_api`](crate) crate that was used to compile this plugin. + /// Version of the [`coprocessor_plugin_api`](crate) crate that was used to + /// compile this plugin. pub api_version: &'static str, /// Target triple for which platform this plugin was compiled. 
pub target: &'static str, @@ -59,11 +66,15 @@ pub struct PluginInfo { /// Declare a plugin for the library so that it can be loaded by TiKV. /// /// The macro has three different versions: -/// * `declare_plugin!(plugin_name, plugin_version, plugin_ctor)` which gives you full control. -/// * `declare_plugin!(plugin_name, plugin_ctor)` automatically fetches the version from `Cargo.toml`. -/// * `declare_plugin!(plugin_ctor)` automatically fetches plugin name and version from `Cargo.toml`. +/// * `declare_plugin!(plugin_name, plugin_version, plugin_ctor)` which gives +/// you full control. +/// * `declare_plugin!(plugin_name, plugin_ctor)` automatically fetches the +/// version from `Cargo.toml`. +/// * `declare_plugin!(plugin_ctor)` automatically fetches plugin name and +/// version from `Cargo.toml`. /// -/// The types of `plugin_name` and `plugin_version` have to be `&'static str` literals. +/// The types of `plugin_name` and `plugin_version` have to be `&'static str` +/// literals. /// /// # Notes /// This works by automatically generating an `extern "C"` function with a @@ -119,8 +130,8 @@ macro_rules! declare_plugin { /// Transforms the name of a package into the name of the compiled library. /// -/// The result of the function can be used to correctly locate build artifacts of `dylib` on -/// different platforms. +/// The result of the function can be used to correctly locate build artifacts +/// of `dylib` on different platforms. 
/// /// The name of the `dylib` is /// * `lib.so` on Linux diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index 5b84a4a0c34..537eb8785e5 100644 --- a/components/encryption/export/src/lib.rs +++ b/components/encryption/export/src/lib.rs @@ -82,7 +82,8 @@ fn create_backend_inner(config: &MasterKeyConfig) -> Result> { }) } -// CloudKMS adapts the KmsProvider definition from the cloud crate to that of the encryption crate +// CloudKMS adapts the KmsProvider definition from the cloud crate to that of +// the encryption crate #[derive(Debug, Deref)] struct CloudKms(Box); diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index f869817de2b..1268d0d88f2 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -153,7 +153,7 @@ impl<'k> AesGcmCrypter<'k> { cipher, &self.key.0, Some(self.iv.as_slice()), - &[], /* AAD */ + &[], // AAD pt, &mut tag.0, )?; @@ -166,7 +166,7 @@ impl<'k> AesGcmCrypter<'k> { cipher, &self.key.0, Some(self.iv.as_slice()), - &[], /* AAD */ + &[], // AAD ct, &tag.0, )?; diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index 7bf31225db8..57b5527b7bf 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -34,8 +34,8 @@ impl<'a> EncryptedFile<'a> { EncryptedFile { base, name } } - /// Read and decrypt the file. Caller need to handle the NotFound io error in case file not - /// exists. + /// Read and decrypt the file. Caller need to handle the NotFound io error + /// in case file not exists. 
pub fn read(&self, master_key: &dyn Backend) -> Result> { let start = Instant::now(); let res = OpenOptions::new() diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index e2dedfe534e..0884cb1ca04 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -120,7 +120,8 @@ impl FileDictionaryFile { self.base.join(&self.name) } - /// Rewrite the log file to reduce file size and reduce the time of next recovery. + /// Rewrite the log file to reduce file size and reduce the time of next + /// recovery. fn rewrite(&mut self) -> Result<()> { let file_dict_bytes = self.file_dict.write_to_bytes()?; if self.enable_log { @@ -397,7 +398,7 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ + 2, // file_rewrite_threshold ) .unwrap(); let info1 = create_file_info(1, EncryptionMethod::Aes256Ctr); @@ -440,12 +441,12 @@ mod tests { #[test] fn test_file_dict_file_normal_v1() { - test_file_dict_file_normal(false /*enable_log*/); + test_file_dict_file_normal(false /* enable_log */); } #[test] fn test_file_dict_file_normal_v2() { - test_file_dict_file_normal(true /*enable_log*/); + test_file_dict_file_normal(true /* enable_log */); } fn test_file_dict_file_existed(enable_log: bool) { @@ -454,7 +455,7 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ + 2, // file_rewrite_threshold ) .unwrap(); @@ -464,9 +465,9 @@ mod tests { let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + true, // enable_log + 2, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("info").unwrap(), info); @@ -474,12 +475,12 @@ mod tests { #[test] fn test_file_dict_file_existed_v1() { - test_file_dict_file_existed(false /*enable_log*/); + 
test_file_dict_file_existed(false /* enable_log */); } #[test] fn test_file_dict_file_existed_v2() { - test_file_dict_file_existed(true /*enable_log*/); + test_file_dict_file_existed(true /* enable_log */); } fn test_file_dict_file_not_existed(enable_log: bool) { @@ -488,20 +489,20 @@ mod tests { tempdir.path(), "test_file_dict_file", enable_log, - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + 2, // file_rewrite_threshold + false, // skip_rewrite ); assert!(matches!(ret, Err(Error::Io(_)))); } #[test] fn test_file_dict_file_not_existed_v1() { - test_file_dict_file_not_existed(false /*enable_log*/); + test_file_dict_file_not_existed(false /* enable_log */); } #[test] fn test_file_dict_file_not_existed_v2() { - test_file_dict_file_not_existed(true /*enable_log*/); + test_file_dict_file_not_existed(true /* enable_log */); } #[test] @@ -524,9 +525,9 @@ mod tests { let (_, file_dict_read) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 2, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + true, // enable_log + 2, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(file_dict, file_dict_read); @@ -544,8 +545,8 @@ mod tests { let mut file_dict = FileDictionaryFile::new( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ + true, // enable_log + 1000, // file_rewrite_threshold ) .unwrap(); @@ -571,9 +572,9 @@ mod tests { let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - true, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ - true, /*skip_rewrite*/ + true, // enable_log + 1000, // file_rewrite_threshold + true, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); @@ -586,9 +587,9 @@ mod tests { let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), "test_file_dict_file", - false, /*enable_log*/ - 1000, /*file_rewrite_threshold*/ - false, /*skip_rewrite*/ + false, // 
enable_log + 1000, // file_rewrite_threshold + false, // skip_rewrite ) .unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); diff --git a/components/encryption/src/io.rs b/components/encryption/src/io.rs index d62542cb16a..d2c5b6d1546 100644 --- a/components/encryption/src/io.rs +++ b/components/encryption/src/io.rs @@ -409,7 +409,8 @@ impl CrypterCore { } fn reset_buffer(&mut self, size: usize) { - // OCrypter require the output buffer to have block_size extra bytes, or it will panic. + // OCrypter require the output buffer to have block_size extra bytes, or it will + // panic. self.buffer.resize(size + self.block_size, 0); } @@ -436,9 +437,10 @@ impl CrypterCore { Ok(()) } - /// For simplicity, the following implementation rely on the fact that OpenSSL always - /// return exact same size as input in CTR mode. If it is not true in the future, or we - /// want to support other counter modes, this code needs to be updated. + /// For simplicity, the following implementation rely on the fact that + /// OpenSSL always return exact same size as input in CTR mode. If it is + /// not true in the future, or we want to support other counter modes, + /// this code needs to be updated. pub fn do_crypter_in_place(&mut self, buf: &mut [u8]) -> IoResult<()> { if self.crypter.is_none() { self.reset_crypter(0)?; diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 2240e212b84..79654d9d6a2 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -263,9 +263,9 @@ impl Dicts { return Ok(None); } }; - // When an encrypted file exists in the file system, the file_dict must have info about - // this file. But the opposite is not true, this is because the actual file operation - // and file_dict operation are not atomic. + // When an encrypted file exists in the file system, the file_dict must have + // info about this file. 
But the opposite is not true, this is because the + // actual file operation and file_dict operation are not atomic. check_stale_file_exist(dst_fname, &mut file_dict, &mut file_dict_file)?; let method = file.method; file_dict.files.insert(dst_fname.to_owned(), file.clone()); @@ -466,7 +466,8 @@ impl DataKeyManager { Ok(Some(Self::from_dicts(dicts, args.method, master_key)?)) } - /// Will block file operation for a considerable amount of time. Only used for debugging purpose. + /// Will block file operation for a considerable amount of time. Only used + /// for debugging purpose. pub fn retain_encrypted_files(&self, f: impl Fn(&str) -> bool) { let mut dict = self.dicts.file_dict.lock().unwrap(); let mut file_dict_file = self.dicts.file_dict_file.lock().unwrap(); @@ -592,7 +593,7 @@ impl DataKeyManager { pub fn create_file_for_write>(&self, path: P) -> Result> { let file_writer = File::create(&path)?; - self.open_file_with_writer(path, file_writer, true /*create*/) + self.open_file_with_writer(path, file_writer, true /* create */) } pub fn open_file_with_writer, W: std::io::Write>( @@ -683,9 +684,9 @@ impl DataKeyManager { let (_, file_dict) = FileDictionaryFile::open( dict_path, FILE_DICT_NAME, - true, /*enable_file_dictionary_log*/ + true, // enable_file_dictionary_log 1, - true, /*skip_rewrite*/ + true, // skip_rewrite )?; if let Some(file_path) = file_path { if let Some(info) = file_dict.files.get(file_path) { @@ -1294,7 +1295,8 @@ mod tests { let previous = Box::new(PlaintextBackend::default()) as Box; let result = new_key_manager(&tmp_dir, None, wrong_key, previous); - // When the master key is invalid, the key manager left a empty file dict and return errors. + // When the master key is invalid, the key manager left a empty file dict and + // return errors. 
assert!(result.is_err()); let previous = Box::new(PlaintextBackend::default()) as Box; let result = new_key_manager(&tmp_dir, None, right_key, previous); @@ -1317,7 +1319,7 @@ mod tests { { let raw = File::create(&path).unwrap(); let mut f = manager - .open_file_with_writer(&path, raw, false /*create*/) + .open_file_with_writer(&path, raw, false /* create */) .unwrap(); f.write_all(content.as_bytes()).unwrap(); f.sync_all().unwrap(); diff --git a/components/encryption/src/master_key/kms.rs b/components/encryption/src/master_key/kms.rs index da1b6d80e0a..8520e7a0cbe 100644 --- a/components/encryption/src/master_key/kms.rs +++ b/components/encryption/src/master_key/kms.rs @@ -124,17 +124,17 @@ impl KmsBackend { Ok(content) } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. + // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. fn decrypt_content(&self, content: &EncryptedContent) -> Result> { let vendor_name = self.kms_provider.name(); match content.metadata.get(MetadataKey::KmsVendor.as_str()) { Some(val) if val.as_slice() == vendor_name.as_bytes() => (), None => { return Err( - // If vender is missing in metadata, it could be the encrypted content is invalid - // or corrupted, but it is also possible that the content is encrypted using the - // FileBackend. Return WrongMasterKey anyway. + // If vender is missing in metadata, it could be the encrypted content is + // invalid or corrupted, but it is also possible that the content is encrypted + // using the FileBackend. Return WrongMasterKey anyway. 
Error::WrongMasterKey(box_err!("missing KMS vendor")), ); } diff --git a/components/encryption/src/master_key/mem.rs b/components/encryption/src/master_key/mem.rs index 92453dac5f2..8e65b85fff6 100644 --- a/components/encryption/src/master_key/mem.rs +++ b/components/encryption/src/master_key/mem.rs @@ -38,24 +38,25 @@ impl MemAesGcmBackend { Ok(content) } - // On decrypt failure, the rule is to return WrongMasterKey error in case it is possible that - // a wrong master key has been used, or other error otherwise. + // On decrypt failure, the rule is to return WrongMasterKey error in case it is + // possible that a wrong master key has been used, or other error otherwise. pub fn decrypt_content(&self, content: &EncryptedContent) -> Result> { let method = content .get_metadata() .get(MetadataKey::Method.as_str()) .ok_or_else(|| { - // Missing method in metadata. The metadata of the encrypted content is invalid or - // corrupted. + // Missing method in metadata. The metadata of the encrypted content is invalid + // or corrupted. Error::Other(box_err!( "metadata {} not found", MetadataKey::Method.as_str() )) })?; if method.as_slice() != MetadataMethod::Aes256Gcm.as_slice() { - // Currently we only support aes256-gcm. A different method could mean the encrypted - // content is written by a future version of TiKV, and we don't know how to handle it. - // Fail immediately instead of fallback to previous key. + // Currently we only support aes256-gcm. A different method could mean the + // encrypted content is written by a future version of TiKV, and we + // don't know how to handle it. Fail immediately instead of fallback + // to previous key. return Err(Error::Other(box_err!( "encryption method mismatch, expected {:?} vs actual {:?}", MetadataMethod::Aes256Gcm.as_slice(), @@ -75,7 +76,8 @@ impl MemAesGcmBackend { .get_metadata() .get(MetadataKey::AesGcmTag.as_str()) .ok_or_else(|| { - // Tag is missing. The metadata of the encrypted content is invalid or corrupted. 
+ // Tag is missing. The metadata of the encrypted content is invalid or + // corrupted. Error::Other(box_err!("gcm tag not found")) })?; let gcm_tag = AesGcmTag::from(tag.as_slice()); diff --git a/components/encryption/src/master_key/mod.rs b/components/encryption/src/master_key/mod.rs index f975e1de7b9..59578a2bcf0 100644 --- a/components/encryption/src/master_key/mod.rs +++ b/components/encryption/src/master_key/mod.rs @@ -106,8 +106,9 @@ pub mod tests { } impl MockBackend { - // Callers are responsible for enabling tracking on the MockBackend by calling this function - // This names the backend instance, allowiing later fine-grained recall + // Callers are responsible for enabling tracking on the MockBackend by calling + // this function This names the backend instance, allowing later fine-grained + // recall pub fn track(&mut self, name: String) { let track = make_track(&name); self.track = track.clone(); diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 6071f06a646..0e83eb2cdb3 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -56,9 +56,9 @@ impl RocksEngine { return false; } - // If path is not an empty directory, we say db exists. If path is not an empty directory - // but db has not been created, `DB::list_column_families` fails and we can clean up - // the directory by this indication. + // If path is not an empty directory, we say db exists. If path is not an empty + // directory but db has not been created, `DB::list_column_families` fails and + // we can clean up the directory by this indication. 
fs::read_dir(&path).unwrap().next().is_some() } diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 86b8e4fdcae..5b93ccba637 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -162,8 +162,10 @@ impl rocksdb::EventListener for RocksEventListener { } // Here are some expected error examples: +// ```text // 1. Corruption: Sst file size mismatch: /qps/data/tikv-10014/db/000398.sst. Size recorded in manifest 6975, actual size 6959 // 2. Corruption: Bad table magic number: expected 9863518390377041911, found 759105309091689679 in /qps/data/tikv-10014/db/000021.sst +// ``` // // We assume that only the corruption sst file path is printed inside error. fn resolve_sst_filename_from_err(err: &str) -> Option { diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index c63edb8a117..87f46893774 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -82,21 +82,21 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); assert_eq!(stats.fetch(IOType::Flush, IOOp::Write), 0); - db.flush(true /*sync*/).unwrap(); + db.flush(true /* sync */).unwrap(); assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); - db.flush(true /*sync*/).unwrap(); + db.flush(true /* sync */).unwrap(); assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.compact_range( - CF_DEFAULT, None, /*start_key*/ - None, /*end_key*/ - false, /*exclusive_manual*/ - 1, /*max_subcompactions*/ + CF_DEFAULT, None, // start_key + None, // 
end_key + false, // exclusive_manual + 1, // max_subcompactions ) .unwrap(); assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) > value_size * 4); diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index 79e6d6c0f49..a64da35ae67 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -19,9 +19,10 @@ impl ImportExt for RocksEngine { opts.set_write_global_seqno(false); files.iter().try_for_each(|file| -> Result<()> { let f = File::open(file)?; - // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For backward - // compatibility, in case TiKV is retrying an ingestion job generated by older - // version, it needs to reset the global seqno to 0. + // Prior to v5.2.0, TiKV use `write_global_seqno=true` for ingestion. For + // backward compatibility, in case TiKV is retrying an ingestion job + // generated by older version, it needs to reset the global seqno to + // 0. set_external_sst_file_global_seq_no(self.as_inner(), cf, file, 0).map_err(r2e)?; f.sync_all() .map_err(|e| format!("sync {}: {:?}", file, e)) diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 8ec581c6e86..a2e394bf8c8 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -10,7 +10,8 @@ //! Because there are so many similarly named types across the TiKV codebase, //! and so much "import renaming", this crate consistently explicitly names type //! that implement a trait as `RocksTraitname`, to avoid the need for import -//! renaming and make it obvious what type any particular module is working with. +//! renaming and make it obvious what type any particular module is working +//! with. //! //! Please read the engine_trait crate docs before hacking. 
@@ -112,6 +113,6 @@ pub fn get_env( key_manager: Option>, limiter: Option>, ) -> engine_traits::Result> { - let env = encryption::get_env(None /*base_env*/, key_manager)?; + let env = encryption::get_env(None /* base_env */, key_manager)?; file_system::get_env(Some(env), limiter) } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index ff465d85dd1..ea6d48adb35 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -19,8 +19,8 @@ impl RocksEngine { self.as_inner().is_titan() } - // We store all data which would be deleted in memory at first because the data of region will never be larger than - // max-region-size. + // We store all data which would be deleted in memory at first because the data + // of region will never be larger than max-region-size. fn delete_all_in_range_cf_by_ingest( &self, cf: &str, @@ -36,8 +36,8 @@ impl RocksEngine { let end = KeyBuilder::from_slice(max_end_key, 0, 0); let mut opts = IterOptions::new(Some(start), Some(end), false); if self.is_titan() { - // Cause DeleteFilesInRange may expose old blob index keys, setting key only for Titan - // to avoid referring to missing blob files. + // Cause DeleteFilesInRange may expose old blob index keys, setting key only for + // Titan to avoid referring to missing blob files. opts.set_key_only(true); } @@ -103,8 +103,8 @@ impl RocksEngine { let end = KeyBuilder::from_slice(range.end_key, 0, 0); let mut opts = IterOptions::new(Some(start), Some(end), false); if self.is_titan() { - // Cause DeleteFilesInRange may expose old blob index keys, setting key only for Titan - // to avoid referring to missing blob files. + // Cause DeleteFilesInRange may expose old blob index keys, setting key only for + // Titan to avoid referring to missing blob files. 
opts.set_key_only(true); } let mut it = self.iterator_opt(cf, opts)?; diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index fe747b21a49..543e116d8ac 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ b/components/engine_rocks/src/perf_context_impl.rs @@ -15,7 +15,7 @@ use crate::{ }; macro_rules! report_write_perf_context { - ($ctx: expr, $metric: ident) => { + ($ctx:expr, $metric:ident) => { if $ctx.perf_level != PerfLevel::Disable { $ctx.write = WritePerfContext::capture(); observe_write_time!($ctx, $metric, write_wal_time); @@ -31,7 +31,7 @@ macro_rules! report_write_perf_context { } macro_rules! observe_write_time { - ($ctx:expr, $metric: expr, $v:ident) => { + ($ctx:expr, $metric:expr, $v:ident) => { $metric.$v.observe(($ctx.write.$v) as f64 / 1e9); }; } @@ -169,7 +169,8 @@ pub struct PerfContextStatistics { const FLUSH_METRICS_INTERVAL: Duration = Duration::from_secs(2); impl PerfContextStatistics { - /// Create an instance which stores instant statistics values, retrieved at creation. + /// Create an instance which stores instant statistics values, retrieved at + /// creation. 
pub fn new(perf_level: PerfLevel, kind: PerfContextKind) -> Self { PerfContextStatistics { perf_level, diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 1168182c58e..c142ce01a74 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -563,15 +563,18 @@ mod tests { ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), - // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + 9),keys(4,5) + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) ("l", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), ("m", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), - //handle "m": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE + // handle "m": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = + // 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE ("n", 1, DEFAULT_PROP_KEYS_INDEX_DISTANCE), - //handle "n": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + // handle "n": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = + // 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE ("o", 1, 1), // handle "o": keys = 1, offset = 12 + 2*DEFAULT_PROP_KEYS_INDEX_DISTANCE ]; @@ -662,7 +665,8 @@ mod tests { ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), - // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + 9),keys(4,5) + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), // handle 
"k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index b6a35f4a4e2..fd52342002f 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -38,7 +38,8 @@ impl RaftEngineReadOnly for RocksEngine { let (max_size, mut total_size, mut count) = (max_size.unwrap_or(usize::MAX), 0, 0); if high - low <= RAFT_LOG_MULTI_GET_CNT { - // If election happens in inactive regions, they will just try to fetch one empty log. + // If election happens in inactive regions, they will just try to fetch one + // empty log. for i in low..high { if total_size > 0 && total_size >= max_size { break; diff --git a/components/engine_rocks/src/range_properties.rs b/components/engine_rocks/src/range_properties.rs index fcd0d2fa863..17d0805340d 100644 --- a/components/engine_rocks/src/range_properties.rs +++ b/components/engine_rocks/src/range_properties.rs @@ -191,8 +191,8 @@ impl RangePropertiesExt for RocksEngine { const SAMPLING_THRESHOLD: usize = 20000; const SAMPLE_RATIO: usize = 1000; - // If there are too many keys, reduce its amount before sorting, or it may take too much - // time to sort the keys. + // If there are too many keys, reduce its amount before sorting, or it may take + // too much time to sort the keys. if keys.len() > SAMPLING_THRESHOLD { let len = keys.len(); keys = keys.into_iter().step_by(len / SAMPLE_RATIO).collect(); @@ -204,7 +204,8 @@ impl RangePropertiesExt for RocksEngine { return Ok(keys); } - // Find `key_count` keys which divides the whole range into `parts` parts evenly. + // Find `key_count` keys which divides the whole range into `parts` parts + // evenly. 
let mut res = Vec::with_capacity(key_count); let section_len = (keys.len() as f64) / ((key_count + 1) as f64); for i in 1..=key_count { diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 4529b6e9d27..4a88c6675ed 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -934,8 +934,8 @@ pub fn flush_engine_iostall_properties(engine: &DB, name: &str) { pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool) { for cf in engine.cf_names() { let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - // It is important to monitor each cf's size, especially the "raft" and "lock" column - // families. + // It is important to monitor each cf's size, especially the "raft" and "lock" + // column families. let cf_used_size = crate::util::get_engine_cf_used_size(engine, handle); STORE_ENGINE_SIZE_GAUGE_VEC .with_label_values(&[name, cf]) @@ -1111,8 +1111,8 @@ pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool } if shared_block_cache { - // Since block cache is shared, getting cache size from any CF is fine. Here we get from - // default CF. + // Since block cache is shared, getting cache size from any CF is fine. Here we + // get from default CF. let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); let block_cache_usage = engine.get_block_cache_usage_cf(handle); STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 68182238161..66e0a974916 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -93,8 +93,8 @@ impl Iterable for RocksSstReader { // FIXME: See comment on RocksSstReader for why this contains Rc pub struct RocksSstIterator(DBIterator>); -// TODO(5kbpers): Temporarily force to add `Send` here, add a method for creating -// DBIterator> in rust-rocksdb later. 
+// TODO(5kbpers): Temporarily force to add `Send` here, add a method for +// creating DBIterator> in rust-rocksdb later. unsafe impl Send for RocksSstIterator {} impl Iterator for RocksSstIterator { @@ -231,10 +231,10 @@ impl SstWriterBuilder for RocksSstWriterBuilder { io_options.set_compression_options( -14, self.compression_level, - 0, /*strategy*/ - 0, /*max_dict_bytes*/ - 0, /*zstd_max_train_bytes*/ - 1, /*parallel_threads*/ + 0, // strategy + 0, // max_dict_bytes + 0, // zstd_max_train_bytes + 1, // parallel_threads ); } io_options.compression(compress_type); diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index a3b6a2bf4cf..4192eecfcae 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -104,7 +104,8 @@ pub fn new_engine_opt( let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; // Drops discarded column families. - // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == x).is_none()) { + // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == + // x).is_none()) { for cf in cfs_diff(&existed, &needed) { // Never drop default column families. if cf != CF_DEFAULT { @@ -115,8 +116,8 @@ pub fn new_engine_opt( Ok(RocksEngine::new(db)) } -/// Turns "dynamic level size" off for the existing column family which was off before. -/// Column families are small, HashMap isn't necessary. +/// Turns "dynamic level size" off for the existing column family which was off +/// before. Column families are small, HashMap isn't necessary. fn adjust_dynamic_level_bytes( cf_descs: &[CColumnFamilyDescriptor], name: &str, @@ -146,9 +147,10 @@ pub fn db_exist(path: &str) -> bool { return false; } - // If path is not an empty directory, and current file exists, we say db exists. If path is not an empty directory - // but db has not been created, `DB::list_column_families` fails and we can clean up - // the directory by this indication. 
+ // If path is not an empty directory, and current file exists, we say db exists. + // If path is not an empty directory but db has not been created, + // `DB::list_column_families` fails and we can clean up the directory by + // this indication. fs::read_dir(&path).unwrap().next().is_some() } diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 892dd83321c..f658fb046fb 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -29,12 +29,14 @@ impl WriteBatchExt for RocksEngine { } } -/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which splits a large WriteBatch -/// into many smaller ones and then any thread could help to deal with these small WriteBatch when it -/// is calling `MultiBatchCommit` and wait the front writer to finish writing. `MultiBatchWrite` will -/// perform much better than traditional `pipelined_write` when TiKV writes very large data into RocksDB. -/// We will remove this feature when `unordered_write` of RocksDB becomes more stable and becomes compatible -/// with Titan. +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. pub struct RocksWriteBatchVec { db: Arc, wbs: Vec, @@ -79,8 +81,9 @@ impl RocksWriteBatchVec { self.db.as_ref() } - /// `check_switch_batch` will split a large WriteBatch into many smaller ones. This is to avoid - /// a large WriteBatch blocking write_thread too long. 
+ /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. #[inline(always)] fn check_switch_batch(&mut self) { if self.support_write_batch_vec diff --git a/components/engine_rocks_helper/src/sst_recovery.rs b/components/engine_rocks_helper/src/sst_recovery.rs index bfd39e951b2..7a820e6a79b 100644 --- a/components/engine_rocks_helper/src/sst_recovery.rs +++ b/components/engine_rocks_helper/src/sst_recovery.rs @@ -132,7 +132,8 @@ impl RecoveryRunner { self.damaged_files.iter().any(|f| f.name == sst_path) } - // Cleans up obsolete damaged files and panics if some files are not handled in time. + // Cleans up obsolete damaged files and panics if some files are not handled in + // time. fn check_damaged_files(&mut self) { if self.damaged_files.is_empty() { return; @@ -153,7 +154,8 @@ impl RecoveryRunner { } // Check whether the StoreMeta contains the region range, if it contains, - // recorded fault region ids to report to PD and add file info into `damaged_files`. + // recorded fault region ids to report to PD and add file info into + // `damaged_files`. // // Acquire meta lock. fn check_overlap_damaged_regions(&self, file: &FileInfo) -> bool { @@ -163,8 +165,8 @@ impl RecoveryRunner { meta.update_overlap_damaged_ranges(&file.name, &file.smallest_key, &file.largest_key); if !overlap { fail_point!("sst_recovery_before_delete_files"); - // The sst file can be deleted safely and set `include_end` to `true` otherwise the - // file with the same largest key will be skipped. + // The sst file can be deleted safely and set `include_end` to `true` otherwise + // the file with the same largest key will be skipped. // Here store meta lock should be held to prevent peers from being added back. 
self.db .as_inner() diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 20645823fd8..ada430261e3 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -328,7 +328,8 @@ pub mod ctor { /// - The column families specified as `cfs`, with default options, or /// - The column families specified as `opts`, with options. /// - /// Note that if `opts` is not `None` then the `cfs` argument is completely ignored. + /// Note that if `opts` is not `None` then the `cfs` argument is + /// completely ignored. /// /// The engine stores its data in the `path` directory. /// If that directory does not exist, then it is created. diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index a7e8636769b..8dd1cc7d9b4 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -9,7 +9,8 @@ use crate::errors::Result; pub trait CompactExt { type CompactedEvent: CompactedEvent; - /// Checks whether any column family sets `disable_auto_compactions` to `True` or not. + /// Checks whether any column family sets `disable_auto_compactions` to + /// `True` or not. fn auto_compactions_is_disabled(&self) -> Result; /// Compacts the column families in the specified range by manual or not. @@ -24,7 +25,8 @@ pub trait CompactExt { /// Compacts files in the range and above the output level. /// Compacts all files if the range is not specified. - /// Compacts all files to the bottommost level if the output level is not specified. + /// Compacts all files to the bottommost level if the output level is not + /// specified. fn compact_files_in_range( &self, start: Option<&[u8]>, @@ -32,8 +34,9 @@ pub trait CompactExt { output_level: Option, ) -> Result<()>; - /// Compacts files in the range and above the output level of the given column family. - /// Compacts all files to the bottommost level if the output level is not specified. 
+ /// Compacts files in the range and above the output level of the given + /// column family. Compacts all files to the bottommost level if the + /// output level is not specified. fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index c143cf7a194..1ffbdec1df5 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -68,8 +68,9 @@ pub trait KvEngine: /// TabletAccessor is the trait to access all the tablets with provided accessor /// -/// For single rocksdb instance, it essentially accesses the global kvdb with the accessor -/// For multi rocksdb instances, it accesses all the tablets with the accessor +/// For single rocksdb instance, it essentially accesses the global kvdb with +/// the accessor For multi rocksdb instances, it accesses all the tablets with +/// the accessor pub trait TabletAccessor { /// Loop visit all opened tablets by the specified function. fn for_each_opened_tablet(&self, _f: &mut (dyn FnMut(u64, u64, &EK))); @@ -82,9 +83,11 @@ pub trait TabletAccessor { /// max error count to log const MAX_ERROR_COUNT: u32 = 5; -/// TabletErrorCollector is the facility struct to handle errors when using TabletAccessor::for_each_opened_tablet +/// TabletErrorCollector is the facility struct to handle errors when using +/// TabletAccessor::for_each_opened_tablet /// -/// It will choose the last failed result as the final result, meanwhile logging errors up to MAX_ERROR_COUNT. +/// It will choose the last failed result as the final result, meanwhile logging +/// errors up to MAX_ERROR_COUNT. pub struct TabletErrorCollector { errors: Vec, max_error_count: u32, @@ -151,14 +154,14 @@ impl Drop for TabletErrorCollector { } /// A factory trait to create new engine. -/// -// It should be named as `EngineFactory` for consistency, but we are about to rename -// engine to tablet, so always use tablet for new traits/types. 
+// It should be named as `EngineFactory` for consistency, but we are about to +// rename engine to tablet, so always use tablet for new traits/types. pub trait TabletFactory: TabletAccessor { /// Create an tablet by id and suffix. If the tablet exists, it will fail. - /// The id is likely the region Id, the suffix could be the current raft log index. - /// They together could specify a unique path for a region's tablet. - /// The reason to have suffix is that we can keep more than one tablet for a region. + /// The id is likely the region Id, the suffix could be the current raft log + /// index. They together could specify a unique path for a region's + /// tablet. The reason to have suffix is that we can keep more than one + /// tablet for a region. fn create_tablet(&self, id: u64, suffix: u64) -> Result; /// Open a tablet by id and suffix. If the tablet exists, it will open it. @@ -167,7 +170,8 @@ pub trait TabletFactory: TabletAccessor { self.open_tablet_raw(&self.tablet_path(id, suffix), false) } - /// Open a tablet by id and suffix from cache---that means it should already be opened. + /// Open a tablet by id and suffix from cache---that means it should already + /// be opened. fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { if let Ok(engine) = self.open_tablet_raw(&self.tablet_path(id, suffix), false) { return Some(engine); @@ -204,7 +208,8 @@ pub trait TabletFactory: TabletAccessor { /// Tablets root path fn tablets_path(&self) -> PathBuf; - /// Load the tablet from path for id and suffix--for scenarios such as applying snapshot + /// Load the tablet from path for id and suffix--for scenarios such as + /// applying snapshot fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { unimplemented!(); } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 6ba3da2b3d9..191e5dcb204 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -60,14 +60,15 @@ //! 
- [`SyncMutable`] and [`Mutable`] - types to which single key/value pairs //! can be written. This includes engines and write batches. //! -//! - [`WriteBatch`] - types that can commit multiple key/value pairs in batches. -//! A `WriteBatchExt::WriteBtach` commits all pairs in one atomic transaction. -//! A `WriteBatchExt::WriteBatchVec` does not (FIXME: is this correct?). +//! - [`WriteBatch`] - types that can commit multiple key/value pairs in +//! batches. A `WriteBatchExt::WriteBatch` commits all pairs in one atomic +//! transaction. A `WriteBatchExt::WriteBatchVec` does not (FIXME: is this +//! correct?). //! //! The `KvEngine` instance generally acts as a factory for types that implement //! other traits in the crate. These factory methods, associated types, and -//! other associated methods are defined in "extension" traits. For example, methods -//! on engines related to batch writes are in the `WriteBatchExt` trait. +//! other associated methods are defined in "extension" traits. For example, +//! methods on engines related to batch writes are in the `WriteBatchExt` trait. //! //! //! # Design notes @@ -75,19 +76,19 @@ //! - `KvEngine` is the main engine trait. It requires many other traits, which //! have many other associated types that implement yet more traits. //! -//! - Features should be grouped into their own modules with their own -//! traits. A common pattern is to have an associated type that implements -//! a trait, and an "extension" trait that associates that type with `KvEngine`, -//! which is part of `KvEngine's trait requirements. +//! - Features should be grouped into their own modules with their own traits. A +//! common pattern is to have an associated type that implements a trait, and +//! an "extension" trait that associates that type with `KvEngine`, which is +//! part of `KvEngine's trait requirements. //! //! - For now, for simplicity, all extension traits are required by `KvEngine`. //! 
In the future it may be feasible to separate them for engines with //! different feature sets. //! -//! - Associated types generally have the same name as the trait they -//! are required to implement. Engine extensions generally have the same -//! name suffixed with `Ext`. Concrete implementations usually have the -//! same name prefixed with the database name, i.e. `Rocks`. +//! - Associated types generally have the same name as the trait they are +//! required to implement. Engine extensions generally have the same name +//! suffixed with `Ext`. Concrete implementations usually have the same name +//! prefixed with the database name, i.e. `Rocks`. //! //! Example: //! @@ -121,9 +122,9 @@ //! use a standard new method). If future engines require factory methods, the //! traits can be converted then. //! -//! - Types that require a handle to the engine (or some other "parent" type) -//! do so with either Rc or Arc. An example is EngineIterator. The reason -//! for this is that associated types cannot contain lifetimes. That requires +//! - Types that require a handle to the engine (or some other "parent" type) do +//! so with either Rc or Arc. An example is EngineIterator. The reason for +//! this is that associated types cannot contain lifetimes. That requires //! "generic associated types". See //! //! - @@ -221,15 +222,15 @@ //! `RocksDB::from_ref` and `RocksDB::as_inner` methods. //! //! - Down follow the type system too far "down the rabbit hole". When you see -//! that another subsystem is blocking you from refactoring the system you -//! are trying to refactor, stop, stash your changes, and focus on the other +//! that another subsystem is blocking you from refactoring the system you are +//! trying to refactor, stop, stash your changes, and focus on the other //! system instead. //! //! - You will through away branches that lead to dead ends. Learn from the //! experience and try again from a different angle. //! -//! 
- For now, use the same APIs as the RocksDB bindings, as methods
-//! on the various engine traits, and with this crate's error type.
+//! - For now, use the same APIs as the RocksDB bindings, as methods on the
+//!   various engine traits, and with this crate's error type.
 //!
 //! - When new types are needed from the RocksDB API, add a new module, define a
 //!   new trait (possibly with the same name as the RocksDB type), then define a
diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs
index bc2c3a2b547..67e32e40bdd 100644
--- a/components/engine_traits/src/misc.rs
+++ b/components/engine_traits/src/misc.rs
@@ -11,17 +11,20 @@ use crate::{
 #[derive(Clone, Debug)]
 pub enum DeleteStrategy {
-    /// Delete the SST files that are fullly fit in range. However, the SST files that are partially
-    /// overlapped with the range will not be touched.
+    /// Delete the SST files that are fully fit in range. However, the SST
+    /// files that are partially overlapped with the range will not be
+    /// touched.
     DeleteFiles,
     /// Delete the data stored in Titan.
     DeleteBlobs,
-    /// Scan for keys and then delete. Useful when we know the keys in range are not too many.
+    /// Scan for keys and then delete. Useful when we know the keys in range are
+    /// not too many.
     DeleteByKey,
-    /// Delete by range. Note that this is experimental and you should check whether it is enbaled
-    /// in config before using it.
+    /// Delete by range. Note that this is experimental and you should check
+    /// whether it is enabled in config before using it.
     DeleteByRange,
-    /// Delete by ingesting a SST file with deletions. Useful when the number of ranges is too many.
+    /// Delete by ingesting a SST file with deletions. Useful when the number of
+    /// ranges is too many.
DeleteByWriter { sst_path: String }, } @@ -44,25 +47,26 @@ pub trait MiscExt: CFNamesExt + FlowControlFactorsExt { ranges: &[Range<'_>], ) -> Result<()>; - /// Return the approximate number of records and size in the range of memtables of the cf. + /// Return the approximate number of records and size in the range of + /// memtables of the cf. fn get_approximate_memtable_stats_cf(&self, cf: &str, range: &Range<'_>) -> Result<(u64, u64)>; fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result; /// Gets total used size of rocksdb engine, including: - /// * total size (bytes) of all SST files. - /// * total size (bytes) of active and unflushed immutable memtables. - /// * total size (bytes) of all blob files. - /// + /// * total size (bytes) of all SST files. + /// * total size (bytes) of active and unflushed immutable memtables. + /// * total size (bytes) of all blob files. fn get_engine_used_size(&self) -> Result; /// Roughly deletes files in multiple ranges. /// /// Note: - /// - After this operation, some keys in the range might still exist in the database. - /// - After this operation, some keys in the range might be removed from existing snapshot, - /// so you shouldn't expect to be able to read data from the range using existing snapshots - /// any more. + /// - After this operation, some keys in the range might still exist in + /// the database. + /// - After this operation, some keys in the range might be removed from + /// existing snapshot, so you shouldn't expect to be able to read data + /// from the range using existing snapshots any more. /// /// Ref: fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()>; diff --git a/components/engine_traits/src/peekable.rs b/components/engine_traits/src/peekable.rs index 7550568396c..23318b2a233 100644 --- a/components/engine_traits/src/peekable.rs +++ b/components/engine_traits/src/peekable.rs @@ -19,7 +19,8 @@ pub trait Peekable { /// Returns `None` if they key does not exist. 
fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result>; - /// Read a value for a key from a given column family, given a set of options. + /// Read a value for a key from a given column family, given a set of + /// options. /// /// Returns `None` if the key does not exist. fn get_value_cf_opt( diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index c46ec4a95c8..dfa5aa967b7 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -50,7 +50,8 @@ pub enum PerfContextKind { RaftstoreStore, /// Commands in tikv::storage, the inner str is the command tag. Storage(&'static str), - /// Coprocessor requests in tikv::coprocessor, the inner str is the request type. + /// Coprocessor requests in tikv::coprocessor, the inner str is the request + /// type. Coprocessor(&'static str), } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 58a78f605f9..d94d69fa335 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -103,8 +103,8 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; - /// Like `cut_logs` but the range could be very large. Return the deleted count. - /// Generally, `from` can be passed in `0`. + /// Like `cut_logs` but the range could be very large. Return the deleted + /// count. Generally, `from` can be passed in `0`. 
fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; fn batch_gc(&self, tasks: Vec) -> Result { diff --git a/components/engine_traits/src/range_properties.rs b/components/engine_traits/src/range_properties.rs index 8c326bd41c7..f97008dd929 100644 --- a/components/engine_traits/src/range_properties.rs +++ b/components/engine_traits/src/range_properties.rs @@ -32,7 +32,8 @@ pub trait RangePropertiesExt { large_threshold: u64, ) -> Result; - /// Get range approximate split keys to split range evenly into key_count + 1 parts . + /// Get range approximate split keys to split range evenly into key_count + + /// 1 parts . fn get_range_approximate_split_keys( &self, range: Range<'_>, diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index faedd4efb8b..f41664403d1 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -30,8 +30,8 @@ pub trait SstPartitioner { } pub trait SstPartitionerFactory: Sync + Send { - // Lifetime of the partitioner can be changed to be bounded by the factory's lifetime once - // generic associated types is supported. + // Lifetime of the partitioner can be changed to be bounded by the factory's + // lifetime once generic associated types is supported. // https://github.com/rust-lang/rfcs/blob/master/text/1598-generic_associated_types.md type Partitioner: SstPartitioner + 'static; diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index b9d4b098394..00048522752 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -1,7 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! To use External storage with protobufs as an application, import this module. -//! external_storage contains the actual library code +//! 
To use External storage with protobufs as an application, import this
+//! module. external_storage contains the actual library code
 //! Cloud provider backends are under components/cloud
 use std::{
     io::{self, Write},
@@ -55,8 +55,9 @@ pub fn create_storage(
     }
 }
 
-// when the flag cloud-storage-dylib or cloud-storage-grpc is set create_storage is automatically wrapped with a client
-// This function is used by the library/server to avoid any wrapping
+// when the flag cloud-storage-dylib or cloud-storage-grpc is set create_storage
+// is automatically wrapped with a client. This function is used by the
+// library/server to avoid any wrapping
 pub fn create_storage_no_client(
     storage_backend: &StorageBackend,
     config: BackendConfig,
diff --git a/components/external_storage/export/src/request.rs b/components/external_storage/export/src/request.rs
index eaf618746c0..5623c0732d7 100644
--- a/components/external_storage/export/src/request.rs
+++ b/components/external_storage/export/src/request.rs
@@ -58,7 +58,8 @@ pub async fn restore_inner(
     expected_length: u64,
 ) -> io::Result<()> {
     let storage = create_storage_no_client(&storage_backend)?;
-    // TODO: support encryption. The service must be launched with or sent a DataKeyManager
+    // TODO: support encryption. The service must be launched with or sent a
+    // DataKeyManager
     let output: &mut dyn io::Write = &mut File::create(file_name)?;
     // the minimum speed of reading data, in bytes/second.
     // if reading speed is slower than this rate, we will stop with
diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs
index f1d1a617dc8..8c9ea242b98 100644
--- a/components/external_storage/src/lib.rs
+++ b/components/external_storage/src/lib.rs
@@ -50,9 +50,9 @@ pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) {
 }
 
 /// UnpinReader is a simple wrapper for AsyncRead + Unpin + Send.
-/// This wrapper would remove the lifetime at the argument of the generted async function
-/// in order to make rustc happy. (And reduce the length of signture of write.)
-/// see https://github.com/rust-lang/rust/issues/63033
+/// This wrapper would remove the lifetime at the argument of the generated
+/// async function in order to make rustc happy. (And reduce the length of
+/// signature of write.) see https://github.com/rust-lang/rust/issues/63033
 pub struct UnpinReader(pub Box);
 
 #[derive(Debug, Default)]
diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs
index 00cb42cf1a6..f246c808b86 100644
--- a/components/external_storage/src/local.rs
+++ b/components/external_storage/src/local.rs
@@ -84,8 +84,9 @@ impl ExternalStorage for LocalStorage {
             ));
         }
         // create the parent dir if there isn't one.
-        // note: we may write to arbitrary directory here if the path contains things like '../'
-        // but internally the file name should be fully controlled by TiKV, so maybe it is OK?
+        // note: we may write to arbitrary directory here if the path contains things
+        // like '../' but internally the file name should be fully controlled by
+        // TiKV, so maybe it is OK?
         if let Some(parent) = Path::new(name).parent() {
             fs::create_dir_all(self.base.join(parent))
                 .await
@@ -102,7 +103,8 @@ impl ExternalStorage for LocalStorage {
         }
 
         // Because s3 could support writing(put_object) a existed object.
-        // For the interface consistent with s3, local storage need also support write a existed file.
+        // For the interface consistent with s3, local storage needs to also support
+        // writing an existing file.
if fs::metadata(self.base.join(name)).await.is_ok() { info!("[{}] is already exists in {}", name, self.base.display()); } @@ -120,7 +122,8 @@ impl ExternalStorage for LocalStorage { fn read(&self, name: &str) -> Box { debug!("read file from local storage"; "name" => %name, "base" => %self.base.display()); - // We used std i/o here for removing the requirement of tokio reactor when restoring. + // We used std i/o here for removing the requirement of tokio reactor when + // restoring. // FIXME: when restore side get ready, use tokio::fs::File for returning. match StdFile::open(self.base.join(name)) { Ok(file) => Box::new(AllowStdIo::new(file)) as _, diff --git a/components/external_storage/src/request.rs b/components/external_storage/src/request.rs index ef4fa54e448..7f1a81d49b7 100644 --- a/components/external_storage/src/request.rs +++ b/components/external_storage/src/request.rs @@ -24,7 +24,8 @@ pub fn write_sender( // currently it is copying into an intermediate buffer // Writing to a file here uses up disk space // But as a positive it gets the backup data out of the DB the fastest - // Currently this waits for the file to be completely written before sending to storage + // Currently this waits for the file to be completely written before sending to + // storage runtime.enter(|| { block_on(async { let msg = |action: &str| format!("{} file {:?}", action, &file_path); diff --git a/components/file_system/src/file.rs b/components/file_system/src/file.rs index 93269d5da10..1c56b240f1d 100644 --- a/components/file_system/src/file.rs +++ b/components/file_system/src/file.rs @@ -15,7 +15,8 @@ use fs2::FileExt; use super::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; -/// A wrapper around `std::fs::File` with capability to track and regulate IO flow. +/// A wrapper around `std::fs::File` with capability to track and regulate IO +/// flow. 
pub struct File { inner: fs::File, limiter: Option>, diff --git a/components/file_system/src/io_stats/biosnoop.rs b/components/file_system/src/io_stats/biosnoop.rs index cbe622f78f8..d156d94f77c 100644 --- a/components/file_system/src/io_stats/biosnoop.rs +++ b/components/file_system/src/io_stats/biosnoop.rs @@ -29,9 +29,9 @@ use crate::{metrics::*, IOBytes, IOType}; /// by address, then all the IO requests for that thread will be recorded in /// corresponding type's map in BCC. /// -/// With that information, every time calling `IOContext` it get the stored stats -/// from corresponding type's map in BCC. Thus it enables TiKV to get the latency and -/// bytes of read/write request per IO-type. +/// With that information, every time calling `IOContext` it get the stored +/// stats from corresponding type's map in BCC. Thus it enables TiKV to get the +/// latency and bytes of read/write request per IO-type. const MAX_THREAD_IDX: usize = 192; @@ -291,8 +291,8 @@ mod tests { #[test] fn test_biosnoop() { init().unwrap(); - // Test cases are running in parallel, while they depend on the same global variables. - // To make them not affect each other, run them in sequence. + // Test cases are running in parallel, while they depend on the same global + // variables. To make them not affect each other, run them in sequence. test_thread_idx_allocation(); test_io_context(); unsafe { diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index d5f8345cae3..104b7371537 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -281,7 +281,8 @@ pub fn copy, Q: AsRef>(from: P, to: Q) -> io::Result { copy_imp(from.as_ref(), to.as_ref(), false /* sync */) } -/// Copies the contents and permission bits of one file to another, then synchronizes. +/// Copies the contents and permission bits of one file to another, then +/// synchronizes. 
pub fn copy_and_sync, Q: AsRef>(from: P, to: Q) -> io::Result { copy_imp(from.as_ref(), to.as_ref(), true /* sync */) } @@ -296,8 +297,8 @@ pub fn file_exists>(file: P) -> bool { path.exists() && path.is_file() } -/// Deletes given path from file system. Returns `true` on success, `false` if the file doesn't exist. -/// Otherwise the raw error will be returned. +/// Deletes given path from file system. Returns `true` on success, `false` if +/// the file doesn't exist. Otherwise the raw error will be returned. pub fn delete_file_if_exist>(file: P) -> io::Result { match remove_file(&file) { Ok(_) => Ok(true), @@ -306,8 +307,8 @@ pub fn delete_file_if_exist>(file: P) -> io::Result { } } -/// Deletes given path from file system. Returns `true` on success, `false` if the directory doesn't -/// exist. Otherwise the raw error will be returned. +/// Deletes given path from file system. Returns `true` on success, `false` if +/// the directory doesn't exist. Otherwise the raw error will be returned. pub fn delete_dir_if_exist>(dir: P) -> io::Result { match remove_dir_all(&dir) { Ok(_) => Ok(true), @@ -316,8 +317,9 @@ pub fn delete_dir_if_exist>(dir: P) -> io::Result { } } -/// Creates a new, empty directory at the provided path. Returns `true` on success, -/// `false` if the directory already exists. Otherwise the raw error will be returned. +/// Creates a new, empty directory at the provided path. Returns `true` on +/// success, `false` if the directory already exists. Otherwise the raw error +/// will be returned. 
pub fn create_dir_if_not_exist>(dir: P) -> io::Result { match create_dir(&dir) { Ok(_) => Ok(true), diff --git a/components/file_system/src/metrics_manager.rs b/components/file_system/src/metrics_manager.rs index ddc48eb8f86..8ff4bddde47 100644 --- a/components/file_system/src/metrics_manager.rs +++ b/components/file_system/src/metrics_manager.rs @@ -12,7 +12,8 @@ use crate::{ }; pub enum BytesFetcher { - /// Fetch IO statistics from IO rate limiter, which records passed-through IOs in atomic counters. + /// Fetch IO statistics from IO rate limiter, which records passed-through + /// IOs in atomic counters. FromRateLimiter(Arc), /// Fetch IO statistics from OS I/O stats collector. FromIOStatsCollector(), diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index b6aa0730ac7..51fe8228aef 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -159,15 +159,15 @@ impl Default for IORateLimiterStatistics { } } -/// Used to dynamically adjust the proportion of total budgets allocated for rate limited -/// IO. This is needed when global IOs are only partially rate limited, e.g. when mode is -/// IORateLimitMode::WriteOnly. +/// Used to dynamically adjust the proportion of total budgets allocated for +/// rate limited IO. This is needed when global IOs are only partially rate +/// limited, e.g. when mode is IORateLimitMode::WriteOnly. pub trait IOBudgetAdjustor: Send + Sync { fn adjust(&self, threshold: usize) -> usize; } -/// Limit total IO flow below provided threshold by throttling lower-priority IOs. -/// Rate limit is disabled when total IO threshold is set to zero. +/// Limit total IO flow below provided threshold by throttling lower-priority +/// IOs. Rate limit is disabled when total IO threshold is set to zero. 
struct PriorityBasedIORateLimiter { // High-priority IOs are only limited when strict is true strict: bool, @@ -197,13 +197,13 @@ impl PriorityBasedIORateLimiterProtected { } macro_rules! do_sleep { - ($duration:expr, sync) => { + ($duration:expr,sync) => { std::thread::sleep($duration); }; - ($duration:expr, async) => { + ($duration:expr,async) => { tokio::time::sleep($duration).await; }; - ($duration:expr, skewed_sync) => { + ($duration:expr,skewed_sync) => { use rand::Rng; let mut rng = rand::thread_rng(); let subtraction: bool = rng.gen(); @@ -217,9 +217,10 @@ macro_rules! do_sleep { } /// Actual implementation for requesting IOs from PriorityBasedIORateLimiter. -/// An attempt will first be recorded. If the attempted amount exceeds the available quotas of -/// current epoch, the requester will be queued (logically) and sleep until served. -/// Macro is necessary to de-dup codes used both in async/sync functions. +/// An attempt will first be recorded. If the attempted amount exceeds the +/// available quotas of current epoch, the requester will be queued (logically) +/// and sleep until served. Macro is necessary to de-dup codes used both in +/// async/sync functions. macro_rules! request_imp { ($limiter:ident, $priority:ident, $amount:ident, $mode:tt) => {{ debug_assert!($amount > 0); @@ -244,7 +245,8 @@ macro_rules! request_imp { // The request is already partially fulfilled in current epoch when consumption // overflow bytes are smaller than requested amount. let remains = std::cmp::min(bytes_through - cached_bytes_per_epoch, amount); - // When there is a recent refill, double check if bytes consumption has been reset. + // When there is a recent refill, double check if bytes consumption has been + // reset. if now + DEFAULT_REFILL_PERIOD < locked.next_refill_time + Duration::from_millis(1) && $limiter.bytes_through[priority_idx].fetch_add(remains, Ordering::Relaxed) + remains @@ -252,8 +254,8 @@ macro_rules! 
request_imp { { return amount; } - // Enqueue itself by adding to pending_bytes, whose current value denotes a position - // of logical queue to wait in. + // Enqueue itself by adding to pending_bytes, whose current value denotes a + // position of logical queue to wait in. locked.pending_bytes[priority_idx] += remains; // Calculate wait duration by queue_len / served_per_epoch. let wait = if locked.next_refill_time <= now { @@ -343,11 +345,13 @@ impl PriorityBasedIORateLimiter { /// Updates and refills IO budgets for next epoch based on IO priority. /// Here we provide best-effort priority control: - /// 1) Limited IO budget is assigned to lower priority to ensure higher priority can at least - /// consume the same IO amount as the last few epochs without breaching global threshold. - /// 2) Higher priority may temporarily use lower priority's IO budgets. When this happens, - /// total IO flow could exceed global threshold. - /// 3) Highest priority IO alone must not exceed global threshold (in strict mode). + /// - Limited IO budget is assigned to lower priority to ensure higher + /// priority can at least consume the same IO amount as the last few + /// epochs without breaching global threshold. + /// - Higher priority may temporarily use lower priority's IO budgets. When + /// this happens, total IO flow could exceed global threshold. + /// - Highest priority IO alone must not exceed global threshold (in strict + /// mode). fn refill(&self, locked: &mut PriorityBasedIORateLimiterProtected, now: Instant) { let mut total_budgets = self.bytes_per_epoch[IOPriority::High as usize].load(Ordering::Relaxed); @@ -368,8 +372,8 @@ impl PriorityBasedIORateLimiter { let mut used_budgets = 0; for pri in &[IOPriority::High, IOPriority::Medium] { let p = *pri as usize; - // Skipped epochs can only serve pending requests rather that in-coming ones, catch up - // by subtracting them from pending_bytes. 
+ // Skipped epochs can only serve pending requests rather that in-coming ones, + // catch up by subtracting them from pending_bytes. let served_by_skipped_epochs = std::cmp::min( (remaining_budgets as f32 * skipped_epochs) as usize, locked.pending_bytes[p], @@ -460,8 +464,8 @@ impl IORateLimiter { pub fn new_for_test() -> Self { IORateLimiter::new( IORateLimitMode::AllIo, - true, /*strict*/ - true, /*enable_statistics*/ + true, // strict + true, // enable_statistics ) } @@ -629,15 +633,15 @@ mod tests { let t0 = Instant::now(); let _write_context = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + None, // interval ); let _compaction_context = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request(IOType::Compaction, IOOp::Write, 10), - None, /*interval*/ + None, // interval ); std::thread::sleep(Duration::from_secs(1)); let t1 = Instant::now(); @@ -679,9 +683,9 @@ mod tests { { let _context = start_background_jobs( limiter, - 2, /*job_count*/ + 2, // job_count Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + None, // interval ); std::thread::sleep(duration); } @@ -699,8 +703,8 @@ mod tests { let bytes_per_sec = 2000; let limiter = Arc::new(IORateLimiter::new( IORateLimitMode::AllIo, - false, /*strict*/ - true, /*enable_statistics*/ + false, // strict + true, // enable_statistics )); limiter.set_io_priority(IOType::ForegroundWrite, IOPriority::Medium); verify_rate_limit(&limiter, bytes_per_sec, Duration::from_secs(2)); @@ -712,9 +716,9 @@ mod tests { { let _context = start_background_jobs( &limiter, - 2, /*job_count*/ + 2, // job_count Request(IOType::ForegroundWrite, IOOp::Write, 10), - None, /*interval*/ + None, // interval ); std::thread::sleep(Duration::from_secs(2)); } @@ -750,7 +754,7 @@ mod tests { // each thread request at most 1000 bytes per second let _context = start_background_jobs( &limiter, - 
actual_kbytes_per_sec, /*job_count*/ + actual_kbytes_per_sec, // job_count Request(IOType::Compaction, IOOp::Write, 1), Some(Duration::from_millis(1)), ); @@ -781,7 +785,7 @@ mod tests { { let _write = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( IOType::ForegroundWrite, IOOp::Write, @@ -791,7 +795,7 @@ mod tests { ); let _compaction = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( IOType::Compaction, IOOp::Write, @@ -801,7 +805,7 @@ mod tests { ); let _import = start_background_jobs( &limiter, - 1, /*job_count*/ + 1, // job_count Request( IOType::Import, IOOp::Write, @@ -826,7 +830,7 @@ mod tests { #[bench] fn bench_critical_section(b: &mut test::Bencher) { - let inner_limiter = PriorityBasedIORateLimiter::new(true /*strict*/); + let inner_limiter = PriorityBasedIORateLimiter::new(true /* strict */); inner_limiter.set_bytes_per_sec(1024); let now = Instant::now_coarse(); b.iter(|| { diff --git a/components/keys/src/lib.rs b/components/keys/src/lib.rs index fa855bbe353..ecb2657de00 100644 --- a/components/keys/src/lib.rs +++ b/components/keys/src/lib.rs @@ -226,16 +226,16 @@ pub fn origin_key(key: &[u8]) -> &[u8] { /// Get the `start_key` of current region in encoded form. pub fn enc_start_key(region: &Region) -> Vec { - // only initialized region's start_key can be encoded, otherwise there must be bugs - // somewhere. + // only initialized region's start_key can be encoded, otherwise there must be + // bugs somewhere. assert!(!region.get_peers().is_empty()); data_key(region.get_start_key()) } /// Get the `end_key` of current region in encoded form. pub fn enc_end_key(region: &Region) -> Vec { - // only initialized region's end_key can be encoded, otherwise there must be bugs - // somewhere. + // only initialized region's end_key can be encoded, otherwise there must be + // bugs somewhere. 
assert!(!region.get_peers().is_empty()); data_end_key(region.get_end_key()) } @@ -439,7 +439,8 @@ mod tests { assert_eq!(buffer, data_key(b"cde")); let mut region = Region::default(); - // uninitialised region should not be passed in `enc_start_key` and `enc_end_key`. + // uninitialised region should not be passed in `enc_start_key` and + // `enc_end_key`. assert!(::panic_hook::recover_safe(|| enc_start_key(®ion)).is_err()); assert!(::panic_hook::recover_safe(|| enc_end_key(®ion)).is_err()); diff --git a/components/log_wrappers/src/lib.rs b/components/log_wrappers/src/lib.rs index 986c1710137..5361eaeee18 100644 --- a/components/log_wrappers/src/lib.rs +++ b/components/log_wrappers/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Provides wrappers for types that comes from 3rd-party and does not implement slog::Value. +//! Provides wrappers for types that comes from 3rd-party and does not implement +//! slog::Value. #[macro_use] extern crate slog; @@ -21,10 +22,11 @@ pub mod test_util; /// Wraps any `Display` type, use `Display` as `slog::Value`. /// -/// Usually this wrapper is useful in containers, e.g. `Option>`. +/// Usually this wrapper is useful in containers, e.g. +/// `Option>`. /// -/// If your type `val: T` is directly used as a field value, you may use `"key" => %value` syntax -/// instead. +/// If your type `val: T` is directly used as a field value, you may use `"key" +/// => %value` syntax instead. pub struct DisplayValue(pub T); impl slog::Value for DisplayValue { @@ -43,8 +45,8 @@ impl slog::Value for DisplayValue { /// /// Usually this wrapper is useful in containers, e.g. `Option>`. /// -/// If your type `val: T` is directly used as a field value, you may use `"key" => ?value` syntax -/// instead. +/// If your type `val: T` is directly used as a field value, you may use `"key" +/// => ?value` syntax instead. 
pub struct DebugValue(pub T); impl slog::Value for DebugValue { diff --git a/components/log_wrappers/src/test_util.rs b/components/log_wrappers/src/test_util.rs index a527ac379eb..d455e52c620 100644 --- a/components/log_wrappers/src/test_util.rs +++ b/components/log_wrappers/src/test_util.rs @@ -4,7 +4,8 @@ use std::{io, sync}; -/// A buffer which can be served as a logging destination while being able to access its content. +/// A buffer which can be served as a logging destination while being able to +/// access its content. #[derive(Clone, Default)] pub struct SyncLoggerBuffer(sync::Arc>>); @@ -14,8 +15,8 @@ impl SyncLoggerBuffer { Self::default() } - /// Builds a `slog::Logger` over this buffer which uses compact format and always output `TIME` - /// in the time field. + /// Builds a `slog::Logger` over this buffer which uses compact format and + /// always output `TIME` in the time field. pub fn build_logger(&self) -> slog::Logger { use slog::Drain; diff --git a/components/online_config/src/lib.rs b/components/online_config/src/lib.rs index fae347fee40..2388bf3b3ac 100644 --- a/components/online_config/src/lib.rs +++ b/components/online_config/src/lib.rs @@ -51,7 +51,7 @@ impl Debug for ConfigValue { } macro_rules! impl_from { - ($from: ty, $to: tt) => { + ($from:ty, $to:tt) => { impl From<$from> for ConfigValue { fn from(r: $from) -> ConfigValue { ConfigValue::$to(r) @@ -69,7 +69,7 @@ impl_from!(String, String); impl_from!(ConfigChange, Module); macro_rules! impl_into { - ($into: ty, $from: tt) => { + ($into:ty, $from:tt) => { impl From for $into { fn from(c: ConfigValue) -> $into { if let ConfigValue::$from(v) = c { diff --git a/components/panic_hook/src/lib.rs b/components/panic_hook/src/lib.rs index 12db221dbb5..7e95ea4071a 100644 --- a/components/panic_hook/src/lib.rs +++ b/components/panic_hook/src/lib.rs @@ -55,7 +55,8 @@ fn track_hook(p: &PanicInfo<'_>) { /// Recover from closure which may panic. 
/// -/// This function assumes the closure is able to be forced to implement `UnwindSafe`. +/// This function assumes the closure is able to be forced to implement +/// `UnwindSafe`. /// /// Also see [`AssertUnwindSafe`](https://doc.rust-lang.org/std/panic/struct.AssertUnwindSafe.html). pub fn recover_safe(f: F) -> std::thread::Result diff --git a/components/pd_client/src/config.rs b/components/pd_client/src/config.rs index f11608117e8..a02c2272490 100644 --- a/components/pd_client/src/config.rs +++ b/components/pd_client/src/config.rs @@ -6,8 +6,8 @@ use serde_derive::{Deserialize, Serialize}; use tikv_util::config::ReadableDuration; /// The configuration for a PD Client. /// -/// By default during initialization the client will attempt to reconnect every 300s -/// for infinity, logging only every 10th duplicate error. +/// By default during initialization the client will attempt to reconnect every +/// 300s for infinity, logging only every 10th duplicate error. #[derive(Clone, Serialize, Deserialize, PartialEq, Debug)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -24,8 +24,8 @@ pub struct Config { /// /// Default is isize::MAX, represented by -1. pub retry_max_count: isize, - /// If the client observes the same error message on retry, it can repeat the message only - /// every `n` times. + /// If the client observes the same error message on retry, it can repeat + /// the message only every `n` times. /// /// Default is 10. Set to 1 to disable this feature. pub retry_log_every: usize, @@ -33,7 +33,8 @@ pub struct Config { /// /// Default is 10m. pub update_interval: ReadableDuration, - /// The switch to support forwarding requests to follower when the network partition problem happens. + /// The switch to support forwarding requests to follower when the network + /// partition problem happens. /// /// Default is false. 
pub enable_forwarding: bool, diff --git a/components/pd_client/src/feature_gate.rs b/components/pd_client/src/feature_gate.rs index 64ee3067585..dc8bef853de 100644 --- a/components/pd_client/src/feature_gate.rs +++ b/components/pd_client/src/feature_gate.rs @@ -7,8 +7,8 @@ use std::sync::{ use semver::{SemVerError, Version}; -/// The function assumes only major, minor and patch are considered, and they are -/// all less than u16::MAX, which is 65535. +/// The function assumes only major, minor and patch are considered, and they +/// are all less than u16::MAX, which is 65535. const fn ver_to_val(major: u64, minor: u64, patch: u64) -> u64 { major << 32 | minor << 16 | patch } @@ -45,8 +45,8 @@ impl FeatureGate { /// /// # Safety /// - /// Correctness in FeatureGate depends on monotonic increasing of version number, - /// should use `set_version` instead. + /// Correctness in FeatureGate depends on monotonic increasing of version + /// number, should use `set_version` instead. pub unsafe fn reset_version(&self, version: &str) -> Result<(), SemVerError> { let new = Version::parse(version)?; let val = ver_to_val(new.major, new.minor, new.patch); diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index c68a97f1dec..21c53f07a34 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -224,10 +224,10 @@ pub trait PdClient: Send + Sync { } /// Creates the cluster with cluster ID, node, stores and first Region. - /// If the cluster is already bootstrapped, return ClusterBootstrapped error. - /// When a node starts, if it finds nothing in the node and - /// cluster is not bootstrapped, it begins to create node, stores, first Region - /// and then call bootstrap_cluster to let PD know it. + /// If the cluster is already bootstrapped, return ClusterBootstrapped + /// error. 
When a node starts, if it finds nothing in the node and + /// cluster is not bootstrapped, it begins to create node, stores, first + /// Region and then call bootstrap_cluster to let PD know it. /// It may happen that multi nodes start at same time to try to /// bootstrap, but only one can succeed, while others will fail /// and must remove their created local Region data themselves. @@ -263,11 +263,12 @@ pub trait PdClient: Send + Sync { /// - For bootstrapping, PD knows first Region with `bootstrap_cluster`. /// - For changing Peer, PD determines where to add a new Peer in some store /// for this Region. - /// - For Region splitting, PD determines the new Region id and Peer id for the - /// split Region. - /// - For Region merging, PD knows which two Regions will be merged and which Region - /// and Peers will be removed. - /// - For auto-balance, PD determines how to move the Region from one store to another. + /// - For Region splitting, PD determines the new Region id and Peer id for + /// the split Region. + /// - For Region merging, PD knows which two Regions will be merged and + /// which Region and Peers will be removed. + /// - For auto-balance, PD determines how to move the Region from one store + /// to another. /// Gets store information if it is not a tombstone store. fn get_store(&self, _store_id: u64) -> Result { @@ -380,7 +381,8 @@ pub trait PdClient: Send + Sync { unimplemented!(); } - /// Registers a handler to the client, which will be invoked after reconnecting to PD. + /// Registers a handler to the client, which will be invoked after + /// reconnecting to PD. /// /// Please note that this method should only be called once. fn handle_reconnect(&self, _: F) @@ -409,8 +411,9 @@ pub trait PdClient: Send + Sync { } /// Gets a batch of timestamps from PD. 
- /// Return a timestamp with (physical, logical), indicating that timestamps allocated are: - /// [Timestamp(physical, logical - count + 1), Timestamp(physical, logical)] + /// Return a timestamp with (physical, logical), indicating that timestamps + /// allocated are: [Timestamp(physical, logical - count + 1), + /// Timestamp(physical, logical)] fn batch_get_tso(&self, _count: u32) -> PdFuture { unimplemented!() } diff --git a/components/pd_client/src/tso.rs b/components/pd_client/src/tso.rs index 6c99e87e4e7..a19d7af8f06 100644 --- a/components/pd_client/src/tso.rs +++ b/components/pd_client/src/tso.rs @@ -3,13 +3,15 @@ //! This module is the low-level mechanisms for getting timestamps from a PD //! cluster. It should be used via the `get_tso` API in `PdClient`. //! -//! Once a `TimestampOracle` is created, there will be two futures running in a background working -//! thread created automatically. The `get_timestamp` method creates a oneshot channel whose -//! transmitter is served as a `TimestampRequest`. `TimestampRequest`s are sent to the working -//! thread through a bounded multi-producer, single-consumer channel. Every time the first future -//! is polled, it tries to exhaust the channel to get as many requests as possible and sends a -//! single `TsoRequest` to the PD server. The other future receives `TsoResponse`s from the PD -//! server and allocates timestamps for the requests. +//! Once a `TimestampOracle` is created, there will be two futures running in a +//! background working thread created automatically. The `get_timestamp` method +//! creates a oneshot channel whose transmitter is served as a +//! `TimestampRequest`. `TimestampRequest`s are sent to the working thread +//! through a bounded multi-producer, single-consumer channel. Every time the +//! first future is polled, it tries to exhaust the channel to get as many +//! requests as possible and sends a single `TsoRequest` to the PD server. The +//! 
other future receives `TsoResponse`s from the PD server and allocates +//! timestamps for the requests. use std::{cell::RefCell, collections::VecDeque, pin::Pin, rc::Rc, thread}; @@ -37,13 +39,14 @@ struct TimestampRequest { count: u32, } -/// The timestamp oracle (TSO) which provides monotonically increasing timestamps. +/// The timestamp oracle (TSO) which provides monotonically increasing +/// timestamps. pub struct TimestampOracle { - /// The transmitter of a bounded channel which transports requests of getting a single - /// timestamp to the TSO working thread. A bounded channel is used to prevent using - /// too much memory unexpectedly. - /// In the working thread, the `TimestampRequest`, which is actually a one channel sender, - /// is used to send back the timestamp result. + /// The transmitter of a bounded channel which transports requests of + /// getting a single timestamp to the TSO working thread. A bounded + /// channel is used to prevent using too much memory unexpectedly. + /// In the working thread, the `TimestampRequest`, which is actually a one + /// channel sender, is used to send back the timestamp result. request_tx: mpsc::Sender, close_rx: watch::Receiver<()>, } @@ -113,12 +116,14 @@ async fn run_tso( mut request_rx: mpsc::Receiver, close_tx: watch::Sender<()>, ) { - // The `TimestampRequest`s which are waiting for the responses from the PD server + // The `TimestampRequest`s which are waiting for the responses from the PD + // server let pending_requests = Rc::new(RefCell::new(VecDeque::with_capacity(MAX_PENDING_COUNT))); - // When there are too many pending requests, the `send_request` future will refuse to fetch - // more requests from the bounded channel. This waker is used to wake up the sending future - // if the queue containing pending requests is no longer full. + // When there are too many pending requests, the `send_request` future will + // refuse to fetch more requests from the bounded channel. 
This waker is + // used to wake up the sending future if the queue containing pending + // requests is no longer full. let sending_future_waker = Rc::new(AtomicWaker::new()); let mut request_stream = TsoRequestStream { @@ -139,8 +144,8 @@ async fn run_tso( while let Some(Ok(resp)) = rpc_receiver.next().await { let mut pending_requests = pending_requests.borrow_mut(); - // Wake up the sending future blocked by too many pending requests as we are consuming - // some of them here. + // Wake up the sending future blocked by too many pending requests as we are + // consuming some of them here. if pending_requests.len() >= MAX_PENDING_COUNT { sending_future_waker.wake(); } @@ -204,8 +209,8 @@ impl<'a> Stream for TsoRequestStream<'a> { let write_flags = WriteFlags::default().buffer_hint(false); Poll::Ready(Some((req, write_flags))) } else { - // Set the waker to the context, then the stream can be waked up after the pending queue - // is no longer full. + // Set the waker to the context, then the stream can be waked up after the + // pending queue is no longer full. self.self_waker.register(cx.waker()); Poll::Pending } @@ -216,9 +221,9 @@ fn allocate_timestamps( resp: &TsoResponse, pending_requests: &mut VecDeque, ) -> Result<()> { - // PD returns the timestamp with the biggest logical value. We can send back timestamps - // whose logical value is from `logical - count + 1` to `logical` using the senders - // in `pending`. + // PD returns the timestamp with the biggest logical value. We can send back + // timestamps whose logical value is from `logical - count + 1` to `logical` + // using the senders in `pending`. 
let tail_ts = resp .timestamp .as_ref() diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 5ec629aacdb..e4145f16c0d 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -43,7 +43,8 @@ use super::{ const RETRY_INTERVAL: Duration = Duration::from_secs(1); // 1s const MAX_RETRY_TIMES: u64 = 5; -// The max duration when retrying to connect to leader. No matter if the MAX_RETRY_TIMES is reached. +// The max duration when retrying to connect to leader. No matter if the +// MAX_RETRY_TIMES is reached. const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); // FIXME: Use a request-independent way to handle reconnection. @@ -317,7 +318,8 @@ impl Client { /// Re-establishes connection with PD leader in asynchronized fashion. /// /// If `force` is false, it will reconnect only when members change. - /// Note: Retrying too quickly will return an error due to cancellation. Please always try to reconnect after sending the request first. + /// Note: Retrying too quickly will return an error due to cancellation. + /// Please always try to reconnect after sending the request first. pub async fn reconnect(&self, force: bool) -> Result<()> { PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); let start = Instant::now(); @@ -477,9 +479,10 @@ where { loop { let ret = { - // Drop the read lock immediately to prevent the deadlock between the caller thread - // which may hold the read lock and wait for PD client thread completing the request - // and the PD client thread which may block on acquiring the write lock. + // Drop the read lock immediately to prevent the deadlock between the caller + // thread which may hold the read lock and wait for PD client thread + // completing the request and the PD client thread which may block + // on acquiring the write lock. 
let client_stub = client.inner.rl().client_stub.clone(); func(&client_stub).map_err(Error::Grpc) }; @@ -610,7 +613,8 @@ impl PdConnector { Ok((_, r)) => { let new_cluster_id = r.get_header().get_cluster_id(); if new_cluster_id == cluster_id { - // check whether the response have leader info, otherwise continue to loop the rest members + // check whether the response have leader info, otherwise continue to + // loop the rest members if r.has_leader() { return Ok(r); } @@ -635,9 +639,11 @@ impl PdConnector { } // There are 3 kinds of situations we will return the new client: - // 1. the force is true which represents the client is newly created or the original connection has some problem - // 2. the previous forwarded host is not empty and it can connect the leader now which represents the network partition problem to leader may be recovered - // 3. the member information of PD has been changed + // 1. the force is true which represents the client is newly created or the + // original connection has some problem 2. the previous forwarded host is + // not empty and it can connect the leader now which represents the network + // partition problem to leader may be recovered 3. the member information of + // PD has been changed async fn reconnect_pd( &self, members_resp: GetMembersResponse, @@ -844,8 +850,9 @@ pub fn find_bucket_index>(key: &[u8], bucket_keys: &[S]) -> Optio ) } -/// Merge incoming bucket stats. If a range in new buckets overlaps with multiple ranges in -/// current buckets, stats of the new range will be added to all stats of current ranges. +/// Merge incoming bucket stats. If a range in new buckets overlaps with +/// multiple ranges in current buckets, stats of the new range will be added to +/// all stats of current ranges. 
pub fn merge_bucket_stats, I: AsRef<[u8]>>( cur: &[C], cur_stats: &mut BucketStats, diff --git a/components/profiler/examples/prime.rs b/components/profiler/examples/prime.rs index fa54b2b2658..ede351acea5 100644 --- a/components/profiler/examples/prime.rs +++ b/components/profiler/examples/prime.rs @@ -24,7 +24,8 @@ //! valgrind --tool=callgrind --instr-atstart=no ../../target/debug/examples/prime //! ``` //! -//! You must not run example via `valgrind cargo run ...`. The framework won't detect Callgrind! +//! You must not run example via `valgrind cargo run ...`. The framework won't +//! detect Callgrind! #[inline(never)] fn is_prime_number(v: usize, prime_numbers: &[usize]) -> bool { diff --git a/components/profiler/src/lib.rs b/components/profiler/src/lib.rs index e3ea0d43a6a..2734d8f7877 100644 --- a/components/profiler/src/lib.rs +++ b/components/profiler/src/lib.rs @@ -30,11 +30,12 @@ //! //! Then, compile the code with `profiling` feature enabled. //! -//! By default, a profile called `app.profile` will be generated by CPU Profiler. -//! You can then analyze the profile using [pprof](https://github.com/google/pprof). +//! By default, a profile called `app.profile` will be generated by CPU +//! Profiler. You can then analyze the profile using [pprof](https://github.com/google/pprof). //! -//! If the application is running in Callgrind, a Callgrind profile dump will be generated instead. -//! Notice that you should run Callgrind with command line option `--instr-atstart=no`, e.g.: +//! If the application is running in Callgrind, a Callgrind profile dump will be +//! generated instead. Notice that you should run Callgrind with command line +//! option `--instr-atstart=no`, e.g.: //! //! ```bash //! 
valgrind --tool=callgrind --instr-atstart=no ./my_example diff --git a/components/profiler/src/profiler_unix.rs b/components/profiler/src/profiler_unix.rs index 822b89619a9..c53f32b3b44 100644 --- a/components/profiler/src/profiler_unix.rs +++ b/components/profiler/src/profiler_unix.rs @@ -16,14 +16,15 @@ lazy_static::lazy_static! { static ref ACTIVE_PROFILER: Mutex = Mutex::new(Profiler::None); } -/// Start profiling. Returns false if failed, i.e. there is already a profiling in progress. +/// Start profiling. Returns false if failed, i.e. there is already a profiling +/// in progress. /// -/// When `profiling` feature is not enabled, this function will do nothing and there is totally -/// zero cost. +/// When `profiling` feature is not enabled, this function will do nothing and +/// there is totally zero cost. /// /// When running in Callgrind, Callgrind instrumentation will be started -/// (`CALLGRIND_START_INSTRUMENTATION`). Otherwise, the CPU Profiler will be started and profile -/// will be generated to the file specified by `name`. +/// (`CALLGRIND_START_INSTRUMENTATION`). Otherwise, the CPU Profiler will be +/// started and profile will be generated to the file specified by `name`. // TODO: Better multi-thread support. #[inline] pub fn start(name: impl AsRef) -> bool { @@ -49,10 +50,11 @@ pub fn start(name: impl AsRef) -> bool { true } -/// Stop profiling. Returns false if failed, i.e. there is no profiling in progress. +/// Stop profiling. Returns false if failed, i.e. there is no profiling in +/// progress. /// -/// When `profiling` feature is not enabled, this function will do nothing and there is totally -/// zero cost. +/// When `profiling` feature is not enabled, this function will do nothing and +/// there is totally zero cost. 
#[inline] pub fn stop() -> bool { let mut profiler = ACTIVE_PROFILER.lock().unwrap(); diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 22d2d645165..628b066029d 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -311,7 +311,8 @@ impl RaftLogBatchTrait for RaftLogBatch { } fn cut_logs(&mut self, _: u64, _: u64, _: u64) { - // It's unnecessary because overlapped entries can be handled in `append`. + // It's unnecessary because overlapped entries can be handled in + // `append`. } fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 41ba961c48a..6156771afa8 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -10,7 +10,8 @@ //! Because there are so many similarly named types across the TiKV codebase, //! and so much "import renaming", this crate consistently explicitly names type //! that implement a trait as `RocksTraitname`, to avoid the need for import -//! renaming and make it obvious what type any particular module is working with. +//! renaming and make it obvious what type any particular module is working +//! with. //! //! Please read the engine_trait crate docs before hacking. diff --git a/components/raftstore-v2/src/batch/apply.rs b/components/raftstore-v2/src/batch/apply.rs index f71c98e5c86..ebc7696aa64 100644 --- a/components/raftstore-v2/src/batch/apply.rs +++ b/components/raftstore-v2/src/batch/apply.rs @@ -3,7 +3,8 @@ //! This module contains all structs related to apply batch system. //! //! After being started, each thread will have its own `ApplyPoller` and poll -//! using `ApplyContext`. For more information, see the documentation of batch-system. +//! using `ApplyContext`. For more information, see the documentation of +//! batch-system. 
use std::{ ops::{Deref, DerefMut}, diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1d84ba47302..ee063fc15dd 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -34,7 +34,8 @@ pub struct StoreContext { pub trans: T, /// The latest configuration. pub cfg: Config, - /// The tick batch for delay ticking. It will be flushed at the end of every round. + /// The tick batch for delay ticking. It will be flushed at the end of every + /// round. pub tick_batch: Vec, /// The precise timer for scheduling tick. pub timer: SteadyTimer, @@ -236,7 +237,8 @@ impl StorePollerBuilder { } fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { - // TODO: list all available tablets and destroy those which are not in the peers. + // TODO: list all available tablets and destroy those which are not in the + // peers. Ok(()) } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 71062161384..0739cd61cb7 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -2,10 +2,11 @@ //! Raftstore is the place where we implement multi-raft. //! -//! The thread module of raftstore is batch-system, more check components/batch-system. -//! All state machines are defined in [`fsm`] module. Everything that wrapping raft is -//! implemented in [`raft`] module. And the commands, including split/merge/confchange/read/write, -//! are implemented in [`operation`] module. All state machines are expected to communicate with +//! The thread module of raftstore is batch-system, more check +//! components/batch-system. All state machines are defined in [`fsm`] module. +//! Everything that wrapping raft is implemented in [`raft`] module. And the +//! commands, including split/merge/confchange/read/write, are implemented in +//! [`operation`] module. All state machines are expected to communicate with //! messages. 
They are defined in [`router`] module. #![allow(unused)] diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index c3cede21ebc..aebb1bf7406 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -60,8 +60,8 @@ impl Peer { }; let tablet_index = s.region_state().get_tablet_index(); - // Another option is always create tablet even if tablet index is 0. But this can - // introduce race when gc old tablet and create new peer. + // Another option is always create tablet even if tablet index is 0. But this + // can introduce race when gc old tablet and create new peer. let tablet = if tablet_index != 0 { if !tablet_factory.exists(region_id, tablet_index) { return Err(box_err!( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 37d9515d301..12041f56fe7 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -60,9 +60,9 @@ where /// Variants of channels for `Msg`. /// - `Read`: a channel for read only requests including `StatusRequest`, -/// `GetRequest` and `SnapRequest` +/// `GetRequest` and `SnapRequest` /// - `Write`: a channel for write only requests including `AdminRequest` -/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. /// Prefer channel rather than callback because: /// 1. channel can be reused, hence reduce allocations. /// 2. channel may not need dynamic dispatch. @@ -234,8 +234,8 @@ pub enum PeerMsg { /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. RaftCommand(RaftCommand), - /// Tick is periodical task. If target peer doesn't exist there is a potential - /// that the raft node will not work anymore. + /// Tick is periodical task. 
If target peer doesn't exist there is a + /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. ApplyRes { diff --git a/components/raftstore-v2/src/tablet.rs b/components/raftstore-v2/src/tablet.rs index f4f5bdcbc6f..8552b1a1f0f 100644 --- a/components/raftstore-v2/src/tablet.rs +++ b/components/raftstore-v2/src/tablet.rs @@ -10,8 +10,9 @@ struct LatestTablet { version: AtomicU64, } -/// Tablet may change during split, merge and applying snapshot. So we need a shared value to -/// reflect the latest tablet. `CachedTablet` provide cache that can speed up common access. +/// Tablet may change during split, merge and applying snapshot. So we need a +/// shared value to reflect the latest tablet. `CachedTablet` provide cache that +/// can speed up common access. #[derive(Clone)] pub struct CachedTablet { latest: Arc>, diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 1609cc3001a..1087b18c287 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -53,7 +53,8 @@ pub struct Config { #[online_config(skip)] pub prefer_approximate_bucket: bool, // ratio of region_bucket_size. (0, 0.5) - // The region_bucket_merge_size_ratio * region_bucket_size is threshold to merge with its left neighbor bucket + // The region_bucket_merge_size_ratio * region_bucket_size is threshold to merge with its left + // neighbor bucket pub region_bucket_merge_size_ratio: f64, } diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index cd370e332e3..8122f54b12d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -38,7 +38,7 @@ pub trait ClonableObserver: 'static + Send { } macro_rules! 
impl_box_observer { - ($name:ident, $ob: ident, $wrapper: ident) => { + ($name:ident, $ob:ident, $wrapper:ident) => { pub struct $name(Box + Send>); impl $name { pub fn new(observer: T) -> $name { @@ -82,7 +82,7 @@ macro_rules! impl_box_observer { // This is the same as impl_box_observer_g except $ob has a typaram macro_rules! impl_box_observer_g { - ($name:ident, $ob: ident, $wrapper: ident) => { + ($name:ident, $ob:ident, $wrapper:ident) => { pub struct $name(Box> + Send>); impl $name { pub fn new + Clone>(observer: T) -> $name { @@ -254,8 +254,9 @@ impl Registry { } } -/// A macro that loops over all observers and returns early when error is found or -/// bypass is set. `try_loop_ob` is expected to be used for hook that returns a `Result`. +/// A macro that loops over all observers and returns early when error is found +/// or bypass is set. `try_loop_ob` is expected to be used for hook that returns +/// a `Result`. macro_rules! try_loop_ob { ($r:expr, $obs:expr, $hook:ident, $($args:tt)*) => { loop_ob!(_imp _res, $r, $obs, $hook, $($args)*) @@ -439,10 +440,11 @@ impl CoprocessorHost { } } - /// `post_exec` should be called immediately after we executed one raft command. - /// It notifies observers side effects of this command before execution of the next command, - /// including req/resp, apply state, modified region state, etc. - /// Return true observers think a persistence is necessary. + /// `post_exec` should be called immediately after we executed one raft + /// command. It notifies observers side effects of this command before + /// execution of the next command, including req/resp, apply state, + /// modified region state, etc. Return true observers think a + /// persistence is necessary. 
pub fn post_exec( &self, region: &Region, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index b798c7577af..8a4975b1459 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -94,13 +94,15 @@ pub trait AdminObserver: Coprocessor { /// For now, the `region` in `ObserverContext` is an empty region. fn post_apply_admin(&self, _: &mut ObserverContext<'_>, _: &AdminResponse) {} - /// Hook before exec admin request, returns whether we should skip this admin. + /// Hook before exec admin request, returns whether we should skip this + /// admin. fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { false } /// Hook to call immediately after exec command - /// Will be a special persistence after this exec if a observer returns true. + /// Will be a special persistence after this exec if a observer returns + /// true. fn post_exec_admin( &self, _: &mut ObserverContext<'_>, @@ -113,7 +115,8 @@ pub trait AdminObserver: Coprocessor { } pub trait QueryObserver: Coprocessor { - /// Hook when observe applying empty cmd, probably caused by leadership change. + /// Hook when observe applying empty cmd, probably caused by leadership + /// change. fn on_empty_cmd(&self, _: &mut ObserverContext<'_>, _index: u64, _term: u64) {} /// Hook to call before proposing write request. @@ -130,13 +133,15 @@ pub trait QueryObserver: Coprocessor { /// For now, the `region` in `ObserverContext` is an empty region. fn post_apply_query(&self, _: &mut ObserverContext<'_>, _: &Cmd) {} - /// Hook before exec write request, returns whether we should skip this write. + /// Hook before exec write request, returns whether we should skip this + /// write. fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request]) -> bool { false } /// Hook to call immediately after exec command. - /// Will be a special persistence after this exec if a observer returns true. 
+ /// Will be a special persistence after this exec if a observer returns + /// true. fn post_exec_query( &self, _: &mut ObserverContext<'_>, @@ -150,12 +155,12 @@ pub trait QueryObserver: Coprocessor { pub trait ApplySnapshotObserver: Coprocessor { /// Hook to call after applying key from plain file. - /// This may be invoked multiple times for each plain file, and each time a batch of key-value - /// pairs will be passed to the function. + /// This may be invoked multiple times for each plain file, and each time a + /// batch of key-value pairs will be passed to the function. fn apply_plain_kvs(&self, _: &mut ObserverContext<'_>, _: CfName, _: &[(Vec, Vec)]) {} - /// Hook to call after applying sst file. Currently the content of the snapshot can't be - /// passed to the observer. + /// Hook to call after applying sst file. Currently the content of the + /// snapshot can't be passed to the observer. fn apply_sst(&self, _: &mut ObserverContext<'_>, _: CfName, _path: &str) {} } @@ -216,8 +221,8 @@ pub trait RoleObserver: Coprocessor { /// Hook to call when role of a peer changes. /// /// Please note that, this hook is not called at realtime. There maybe a - /// situation that the hook is not called yet, however the role of some peers - /// have changed. + /// situation that the hook is not called yet, however the role of some + /// peers have changed. 
fn on_role_change(&self, _: &mut ObserverContext<'_>, _: &RoleChange) {} } @@ -274,8 +279,9 @@ impl ObserveID { } } -/// ObserveHandle is the status of a term of observing, it contains the `ObserveID` -/// and the `observing` flag indicate whether the observing is ongoing +/// ObserveHandle is the status of a term of observing, it contains the +/// `ObserveID` and the `observing` flag indicate whether the observing is +/// ongoing #[derive(Clone, Default, Debug)] pub struct ObserveHandle { pub id: ObserveID, @@ -326,14 +332,15 @@ impl CmdObserveInfo { } } - /// Get the max observe level of the observer info by the observers currently registered. - /// Currently, TiKV uses a static strategy for managing observers. - /// There are a fixed number type of observer being registered in each TiKV node, - /// and normally, observers are singleton. + /// Get the max observe level of the observer info by the observers + /// currently registered. Currently, TiKV uses a static strategy for + /// managing observers. There are a fixed number type of observer being + /// registered in each TiKV node, and normally, observers are singleton. /// The types are: /// CDC: Observer supports the `ChangeData` service. /// PiTR: Observer supports the `backup-log` function. - /// RTS: Observer supports the `resolved-ts` advancing (and follower read, etc.). + /// RTS: Observer supports the `resolved-ts` advancing (and follower read, + /// etc.). 
fn observe_level(&self) -> ObserveLevel { let cdc = if self.cdc_id.is_observing() { // `cdc` observe all data @@ -449,7 +456,8 @@ pub trait CmdObserver: Coprocessor { cmd_batches: &mut Vec, engine: &E, ); - // TODO: maybe shoulde move `on_applied_current_term` to a separated `Coprocessor` + // TODO: maybe should move `on_applied_current_term` to a separated + // `Coprocessor` /// Hook to call at the first time the leader applied on its term fn on_applied_current_term(&self, role: StateRole, region: &Region); } diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index c38f1161a1f..e8a5b1ac1c9 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -24,20 +24,23 @@ use super::{ ObserverContext, RegionChangeEvent, RegionChangeObserver, Result, RoleChange, RoleObserver, }; -/// `RegionInfoAccessor` is used to collect all regions' information on this TiKV into a collection -/// so that other parts of TiKV can get region information from it. It registers a observer to -/// raftstore, which is named `RegionEventListener`. When the events that we are interested in -/// happen (such as creating and deleting regions), `RegionEventListener` simply sends the events -/// through a channel. -/// In the mean time, `RegionCollector` keeps fetching messages from the channel, and mutates -/// the collection according to the messages. When an accessor method of `RegionInfoAccessor` is -/// called, it also simply sends a message to `RegionCollector`, and the result will be sent -/// back through as soon as it's finished. -/// In fact, the channel mentioned above is actually a `util::worker::Worker`. +/// `RegionInfoAccessor` is used to collect all regions' information on this +/// TiKV into a collection so that other parts of TiKV can get region +/// information from it. 
It registers a observer to raftstore, which is named +/// `RegionEventListener`. When the events that we are interested in happen +/// (such as creating and deleting regions), `RegionEventListener` simply +/// sends the events through a channel. +/// In the mean time, `RegionCollector` keeps fetching messages from the +/// channel, and mutates the collection according to the messages. When an +/// accessor method of `RegionInfoAccessor` is called, it also simply sends a +/// message to `RegionCollector`, and the result will be sent back through as +/// soon as it's finished. In fact, the channel mentioned above is actually a +/// `util::worker::Worker`. /// -/// **Caution**: Note that the information in `RegionInfoAccessor` is not perfectly precise. Some -/// regions may be temporarily absent while merging or splitting is in progress. Also, -/// `RegionInfoAccessor`'s information may slightly lag the actual regions on the TiKV. +/// **Caution**: Note that the information in `RegionInfoAccessor` is not +/// perfectly precise. Some regions may be temporarily absent while merging or +/// splitting is in progress. Also, `RegionInfoAccessor`'s information may +/// slightly lag the actual regions on the TiKV. /// `RaftStoreEvent` Represents events dispatched from raftstore coprocessor. #[derive(Debug)] @@ -81,9 +84,10 @@ impl RegionInfo { type RegionsMap = HashMap; type RegionRangesMap = BTreeMap; -// RangeKey is a wrapper used to unify the comparsion between region start key -// and region end key. Region end key is special as empty stands for the infinite, -// so we need to take special care for cases where the end key is empty. +// RangeKey is a wrapper used to unify the comparison between region start key +// and region end key. Region end key is special as empty stands for the +// infinite, so we need to take special care for cases where the end key is +// empty. 
#[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] pub enum RangeKey { Finite(Vec), @@ -107,8 +111,8 @@ impl RangeKey { pub type Callback = Box; pub type SeekRegionCallback = Box) + Send>; -/// `RegionInfoAccessor` has its own thread. Queries and updates are done by sending commands to the -/// thread. +/// `RegionInfoAccessor` has its own thread. Queries and updates are done by +/// sending commands to the thread. pub enum RegionInfoQuery { RaftStoreEvent(RaftStoreEvent), SeekRegion { @@ -151,8 +155,8 @@ impl Display for RegionInfoQuery { } } -/// `RegionEventListener` implements observer traits. It simply send the events that we are interested in -/// through the `scheduler`. +/// `RegionEventListener` implements observer traits. It simply send the events +/// that we are interested in through the `scheduler`. #[derive(Clone)] struct RegionEventListener { scheduler: Scheduler, @@ -206,9 +210,10 @@ fn register_region_event_listener( .register_region_change_observer(1, BoxRegionChangeObserver::new(listener)); } -/// `RegionCollector` is the place where we hold all region information we collected, and the -/// underlying runner of `RegionInfoAccessor`. It listens on events sent by the `RegionEventListener` and -/// keeps information of all regions. Role of each region are also tracked. +/// `RegionCollector` is the place where we hold all region information we +/// collected, and the underlying runner of `RegionInfoAccessor`. It listens on +/// events sent by the `RegionEventListener` and keeps information of all +/// regions. Role of each region are also tracked. pub struct RegionCollector { // HashMap: region_id -> (Region, State) regions: RegionsMap, @@ -277,9 +282,10 @@ impl RegionCollector { } fn handle_create_region(&mut self, region: Region, role: StateRole) { - // During tests, we found that the `Create` event may arrive multiple times. And when we - // receive an `Update` message, the region may have been deleted for some reason. 
So we - // handle it according to whether the region exists in the collection. + // During tests, we found that the `Create` event may arrive multiple times. And + // when we receive an `Update` message, the region may have been deleted for + // some reason. So we handle it according to whether the region exists in the + // collection. if self.regions.contains_key(®ion.get_id()) { info!( "trying to create region but it already exists, try to update it"; @@ -324,8 +330,8 @@ impl RegionCollector { let removed_id = self.region_ranges.remove(&end_key).unwrap(); assert_eq!(removed_id, region.get_id()); } else { - // It's possible that the region is already removed because it's end_key is used by - // another newer region. + // It's possible that the region is already removed because it's end_key is used + // by another newer region. debug!( "destroying region but it doesn't exist"; "region_id" => region.get_id(), @@ -348,29 +354,33 @@ impl RegionCollector { self.create_region(region, new_role); } - /// Determines whether `region_to_check`'s epoch is stale compared to `current`'s epoch + /// Determines whether `region_to_check`'s epoch is stale compared to + /// `current`'s epoch #[inline] fn is_region_epoch_stale(&self, region_to_check: &Region, current: &Region) -> bool { let epoch = region_to_check.get_region_epoch(); let current_epoch = current.get_region_epoch(); // Only compare conf_ver when they have the same version. - // When a region A merges region B, region B may have a greater conf_ver. Then, the new - // merged region meta has larger version but smaller conf_ver than the original B's. In this - // case, the incoming region meta has a smaller conf_ver but is not stale. + // When a region A merges region B, region B may have a greater conf_ver. Then, + // the new merged region meta has larger version but smaller conf_ver than the + // original B's. In this case, the incoming region meta has a smaller conf_ver + // but is not stale. 
epoch.get_version() < current_epoch.get_version() || (epoch.get_version() == current_epoch.get_version() && epoch.get_conf_ver() < current_epoch.get_conf_ver()) } - /// For all regions whose range overlaps with the given `region` or region_id is the same as - /// `region`'s, checks whether the given `region`'s epoch is not older than theirs. + /// For all regions whose range overlaps with the given `region` or + /// region_id is the same as `region`'s, checks whether the given + /// `region`'s epoch is not older than theirs. /// - /// Returns false if the given `region` is stale, which means, at least one region above has - /// newer epoch. - /// If the given `region` is not stale, all other regions in the collection that overlaps with - /// the given `region` must be stale. Returns true in this case, and if `clear_regions_in_range` - /// is true, those out-of-date regions will be removed from the collection. + /// Returns false if the given `region` is stale, which means, at least one + /// region above has newer epoch. + /// If the given `region` is not stale, all other regions in the collection + /// that overlaps with the given `region` must be stale. Returns true in + /// this case, and if `clear_regions_in_range` is true, those out-of-date + /// regions will be removed from the collection. fn check_region_range(&mut self, region: &Region, clear_regions_in_range: bool) -> bool { if let Some(region_with_same_id) = self.regions.get(®ion.get_id()) { if self.is_region_epoch_stale(region, ®ion_with_same_id.region) { @@ -458,14 +468,14 @@ impl RegionCollector { let region = event.get_region(); if region.get_region_epoch().get_version() == 0 { // Ignore messages with version 0. - // In raftstore `Peer::replicate`, the region meta's fields are all initialized with - // default value except region_id. 
So if there is more than one region replicating - // when the TiKV just starts, the assertion "Any two region with different ids and - // overlapping ranges must have different version" fails. + // In raftstore `Peer::replicate`, the region meta's fields are all initialized + // with default value except region_id. So if there is more than one region + // replicating when the TiKV just starts, the assertion "Any two region with + // different ids and overlapping ranges must have different version" fails. // // Since 0 is actually an invalid value of version, we can simply ignore the - // messages with version 0. The region will be created later when the region's epoch - // is properly set and an Update message was sent. + // messages with version 0. The region will be created later when the region's + // epoch is properly set and an Update message was sent. return; } if !self.check_region_range(region, true) { @@ -564,7 +574,8 @@ impl RunnableWithTimer for RegionCollector { } } -/// `RegionInfoAccessor` keeps all region information separately from raftstore itself. +/// `RegionInfoAccessor` keeps all region information separately from raftstore +/// itself. #[derive(Clone)] pub struct RegionInfoAccessor { // We use a dedicated worker for region info accessor. If we later want to share a worker with @@ -578,8 +589,9 @@ pub struct RegionInfoAccessor { impl RegionInfoAccessor { /// Creates a new `RegionInfoAccessor` and register to `host`. - /// `RegionInfoAccessor` doesn't need, and should not be created more than once. If it's needed - /// in different places, just clone it, and their contents are shared. + /// `RegionInfoAccessor` doesn't need, and should not be created more than + /// once. If it's needed in different places, just clone it, and their + /// contents are shared. 
pub fn new(host: &mut CoprocessorHost) -> Self { let worker = WorkerBuilder::new("region-collector-worker").create(); let scheduler = worker.start_with_timer("region-collector-worker", RegionCollector::new()); @@ -605,8 +617,8 @@ impl RegionInfoAccessor { } pub trait RegionInfoProvider: Send + Sync { - /// Get a iterator of regions that contains `from` or have keys larger than `from`, and invoke - /// the callback to process the result. + /// Get a iterator of regions that contains `from` or have keys larger than + /// `from`, and invoke the callback to process the result. fn seek_region(&self, _from: &[u8], _callback: SeekRegionCallback) -> Result<()> { unimplemented!() } @@ -762,7 +774,8 @@ mod tests { } } - /// Adds a set of regions to an empty collection and check if it's successfully loaded. + /// Adds a set of regions to an empty collection and check if it's + /// successfully loaded. fn must_load_regions(c: &mut RegionCollector, regions: &[Region]) { assert!(c.regions.is_empty()); assert!(c.region_ranges.is_empty()); @@ -819,8 +832,9 @@ mod tests { .get_version(); assert!(region.get_region_epoch().get_version() < version); } - // If end_key is updated and the region_id corresponding to the `old_end_key` doesn't equals - // to `region_id`, it shouldn't be removed since it was used by another region. + // If end_key is updated and the region_id corresponding to the `old_end_key` + // doesn't equals to `region_id`, it shouldn't be removed since it was + // used by another region. if let Some(old_end_key) = old_end_key { if old_end_key.as_slice() != region.get_end_key() { assert!( @@ -849,8 +863,8 @@ mod tests { c.handle_raftstore_event(RaftStoreEvent::DestroyRegion { region }); assert!(c.regions.get(&id).is_none()); - // If the region_id corresponding to the end_key doesn't equals to `id`, it shouldn't be - // removed since it was used by another region. 
+ // If the region_id corresponding to the end_key doesn't equals to `id`, it + // shouldn't be removed since it was used by another region. if let Some(end_key) = end_key { assert!( c.region_ranges @@ -1100,9 +1114,10 @@ mod tests { ); } - /// Simulates splitting a region into 3 regions, and the region with old id will be the - /// `derive_index`-th region of them. The events are triggered in order indicated by `seq`. - /// This is to ensure the collection is correct, no matter what the events' order to happen is. + /// Simulates splitting a region into 3 regions, and the region with old id + /// will be the `derive_index`-th region of them. The events are triggered + /// in order indicated by `seq`. This is to ensure the collection is + /// correct, no matter what the events' order to happen is. /// Values in `seq` and of `derive_index` start from 1. fn test_split_impl(derive_index: usize, seq: &[usize]) { let mut c = RegionCollector::new(); @@ -1210,15 +1225,16 @@ mod tests { ]; must_load_regions(&mut c, init_regions); - // While splitting, region 4 created but region 2 still has an `update` event which haven't - // been handled. + // While splitting, region 4 created but region 2 still has an `update` event + // which haven't been handled. must_create_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); must_update_region(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Follower); must_change_role(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Leader); must_update_region(&mut c, &new_region(2, b"k1", b"k5", 2), StateRole::Leader); - // TODO: In fact, region 2's role should be follower. However because it's previous state was - // removed while creating updating region 4, it can't be successfully updated. Fortunately - // this case may hardly happen so it can be fixed later. + // TODO: In fact, region 2's role should be follower. 
However because it's + // previous state was removed while creating updating region 4, it can't be + // successfully updated. Fortunately this case may hardly happen so it can be + // fixed later. check_collection( &c, &[ @@ -1229,8 +1245,9 @@ mod tests { ], ); - // While merging, region 2 expanded and covered region 4 (and their end key become the same) - // but region 4 still has an `update` event which haven't been handled. + // While merging, region 2 expanded and covered region 4 (and their end key + // become the same) but region 4 still has an `update` event which haven't been + // handled. must_update_region(&mut c, &new_region(2, b"k1", b"k9", 3), StateRole::Leader); must_update_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); must_change_role(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Leader); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 8c0d7aad86c..892a38a7f48 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -62,7 +62,8 @@ where if self.current_count > self.split_threshold && !over_limit { self.split_keys.push(keys::origin_key(key.key()).to_vec()); // if for previous on_kv() self.current_count == self.split_threshold, - // the split key would be pushed this time, but the entry for this time should not be ignored. + // the split key would be pushed this time, but the entry for this time should + // not be ignored. 
self.current_count = 1; over_limit = self.split_keys.len() as u64 >= self.batch_split_limit; } @@ -184,7 +185,8 @@ where REGION_KEYS_HISTOGRAM.observe(region_keys as f64); // if bucket checker using scan is added, to utilize the scan, // add keys checker as well for free - // It has the assumption that the size's checker is before the keys's check in the host + // It has the assumption that the size's checker is before the keys's check in + // the host let need_split_region = region_keys >= host.cfg.region_max_keys(); if need_split_region { info!( @@ -608,8 +610,8 @@ mod tests { let region_size = get_region_approximate_size(&engine, ®ion, ReadableSize::mb(1000).0).unwrap(); // to make the region_max_size < region_split_size + region_size - // The split by keys should still work. But if the bug in on_kv() in size.rs exists, - // it will result in split by keys failed. + // The split by keys should still work. But if the bug in on_kv() in size.rs + // exists, it will result in split by keys failed. 
cfg.region_max_size = Some(ReadableSize(region_size * 6 / 5)); cfg.region_split_size = ReadableSize(region_size * 4 / 5); runnable = SplitCheckRunner::new(engine, tx.clone(), CoprocessorHost::new(tx, cfg)); diff --git a/components/raftstore/src/coprocessor/split_check/mod.rs b/components/raftstore/src/coprocessor/split_check/mod.rs index 9f1cbf17eb1..3978789db91 100644 --- a/components/raftstore/src/coprocessor/split_check/mod.rs +++ b/components/raftstore/src/coprocessor/split_check/mod.rs @@ -92,8 +92,8 @@ impl<'a, E> Host<'a, E> { const MIN_BUCKET_COUNT_PER_REGION: u64 = 2; if region_size >= self.cfg.region_bucket_size.0 * MIN_BUCKET_COUNT_PER_REGION { let mut bucket_checker = size::Checker::new( - self.cfg.region_bucket_size.0, /* not used */ - self.cfg.region_bucket_size.0, /* not used */ + self.cfg.region_bucket_size.0, // not used + self.cfg.region_bucket_size.0, // not used region_size / self.cfg.region_bucket_size.0, CheckPolicy::Approximate, ); diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 352e956d43e..faff7b77c0a 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -58,7 +58,8 @@ where if self.current_size > self.split_size && !over_limit { self.split_keys.push(keys::origin_key(entry.key()).to_vec()); // if for previous on_kv() self.current_size == self.split_size, - // the split key would be pushed this time, but the entry size for this time should not be ignored. + // the split key would be pushed this time, but the entry size for this time + // should not be ignored. 
self.current_size = if self.current_size - size == self.split_size { size } else { @@ -615,8 +616,9 @@ pub mod tests { let cop_host = CoprocessorHost::new(tx.clone(), cfg); let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); for i in 0..2000 { - // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has about 210 keys - // if mvcc, kv size is about 18*2 = 36, expect each bucket has about 80 keys + // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each + // bucket has about 210 keys if mvcc, kv size is about 18*2 = 36, expect each + // bucket has about 80 keys let s = key_gen(format!("{:04}00", i).as_bytes(), mvcc, i.into()); engine.put_cf(data_cf, &s, &s).unwrap(); if i % 10 == 0 && i > 0 { @@ -645,8 +647,9 @@ pub mod tests { // insert keys into 0000 ~ 0020 with 000000 ~ 002000 for i in 0..2000 { - // kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has about 210 keys - // if mvcc, kv size is about 18*2 = 36, expect each bucket has about 80 keys + // kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has + // about 210 keys if mvcc, kv size is about 18*2 = 36, expect each bucket has + // about 80 keys let s = key_gen(format!("{:06}", i).as_bytes(), mvcc, i.into()); engine.put_cf(data_cf, &s, &s).unwrap(); if i % 10 == 0 { diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index df2fa0fb7c6..9b5220938fd 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -26,8 +26,9 @@ where E: KvEngine, { /// Feed keys in order to find the split key. - /// If `current_data_key` does not belong to `status.first_encoded_table_prefix`. - /// it returns the encoded table prefix of `current_data_key`. + /// If `current_data_key` does not belong to + /// `status.first_encoded_table_prefix`. 
it returns the encoded table + /// prefix of `current_data_key`. fn on_kv(&mut self, _: &mut ObserverContext<'_>, entry: &KeyEntry) -> bool { if self.split_key.is_some() { return true; diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index c788f7c2d1e..6b652670138 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -81,7 +81,8 @@ where } } -/// WriteTask contains write tasks which need to be persisted to kv db and raft db. +/// WriteTask contains write tasks which need to be persisted to kv db and raft +/// db. pub struct WriteTask where EK: KvEngine, @@ -273,7 +274,8 @@ where } fn clear(&mut self) { - // raft_wb doesn't have clear interface and it should be consumed by raft db before + // raft_wb doesn't have clear interface and it should be consumed by raft db + // before self.kv_wb.clear(); self.raft_states.clear(); self.state_size = 0; @@ -582,11 +584,12 @@ where "error_code" => %e.error_code(), ); self.message_metrics.add(msg_type, false); - // If this msg is snapshot, it is unnecessary to send snapshot - // status to this peer because it has already become follower. - // (otherwise the snapshot msg should be sent in store thread other than here) - // Also, the follower don't need flow control, so don't send - // unreachable msg here. + // If this msg is snapshot, it is unnecessary to send + // snapshot status to this peer because it has already + // become follower. (otherwise the snapshot msg should be + // sent in store thread other than here) Also, the follower + // don't need flow control, so don't send unreachable msg + // here. 
} else { self.message_metrics.add(msg_type, true); } diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index 384273a97ad..6b19212c164 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -90,7 +90,8 @@ where } } - /// Send write msg to write worker or push into inner buffer and wait for rescheduling. + /// Send write msg to write worker or push into inner buffer and wait for + /// rescheduling. pub fn send_write_msg>( &mut self, ctx: &mut C, @@ -105,9 +106,9 @@ where } } - /// If there is some msgs need to be rescheduled, check the new persisted number and - /// sending these msgs to a new write worker if persisted number is greater than - /// `self.last_unpersisted`. + /// If there is some msgs need to be rescheduled, check the new persisted + /// number and sending these msgs to a new write worker if persisted + /// number is greater than `self.last_unpersisted`. pub fn check_new_persisted>( &mut self, ctx: &mut C, @@ -117,7 +118,8 @@ where return; } // The peer must be destroyed after all previous write tasks have been finished. - // So do not worry about a destroyed peer being counted in `io_reschedule_concurrent_count`. + // So do not worry about a destroyed peer being counted in + // `io_reschedule_concurrent_count`. ctx.io_reschedule_concurrent_count() .fetch_sub(1, Ordering::SeqCst); @@ -144,10 +146,12 @@ where } } - /// Check if write task can be sent to write worker or pushed into `self.pending_write_msgs`. + /// Check if write task can be sent to write worker or pushed into + /// `self.pending_write_msgs`. /// - /// Returns false if the task should be pushed into `self.pending_write_msgs`. - /// true means the task should be sent to the write worker. + /// Returns false if the task should be pushed into + /// `self.pending_write_msgs`. true means the task should be sent to the + /// write worker. 
fn should_send>( &mut self, ctx: &mut C, @@ -180,7 +184,8 @@ where } if self.next_writer_id.is_none() { // The hot write peers should not be rescheduled entirely. - // So it will not be rescheduled if the random id is the same as the original one. + // So it will not be rescheduled if the random id is the same as the original + // one. let new_id = rand::random::() % ctx.config().store_io_pool_size; if new_id == self.writer_id { // Reset the time @@ -191,8 +196,9 @@ where } // This peer should be rescheduled. // Try to add 1 to `io_reschedule_concurrent_count`. - // The `cfg.io_reschedule_concurrent_max_count` is used for controlling the concurrent count - // of rescheduling peer fsm because rescheduling will introduce performance penalty. + // The `cfg.io_reschedule_concurrent_max_count` is used for controlling the + // concurrent count of rescheduling peer fsm because rescheduling will + // introduce performance penalty. let success = ctx .io_reschedule_concurrent_count() .fetch_update(Ordering::SeqCst, Ordering::Relaxed, |c| { @@ -205,7 +211,8 @@ where .is_ok(); if success { STORE_IO_RESCHEDULE_PEER_TOTAL_GAUGE.inc(); - // Rescheduling succeeds. The task should be pushed into `self.pending_write_msgs`. + // Rescheduling succeeds. The task should be pushed into + // `self.pending_write_msgs`. self.last_unpersisted = last_unpersisted; info!("starts io reschedule"; "tag" => &self.tag); false diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index e1c90a177c7..1ee8e9ddc10 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -44,8 +44,8 @@ fn is_range_empty( // Bootstrap the store, the DB for this store must be empty and has no data. // -// FIXME: ER typaram should just be impl KvEngine, but RaftEngine doesn't support -// the `is_range_empty` query yet. 
+// FIXME: ER typaram should just be impl KvEngine, but RaftEngine doesn't +// support the `is_range_empty` query yet. pub fn bootstrap_store( engines: &Engines, cluster_id: u64, diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index 4fb4c7feb7a..1aee90b6463 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -47,8 +47,8 @@ impl CompactionGuardGeneratorFactory

{ } } -// Update to implement engine_traits::SstPartitionerFactory instead once we move to use abstracted -// ColumnFamilyOptions in src/config.rs. +// Update to implement engine_traits::SstPartitionerFactory instead once we move +// to use abstracted ColumnFamilyOptions in src/config.rs. impl SstPartitionerFactory for CompactionGuardGeneratorFactory

{ @@ -59,9 +59,9 @@ impl SstPartitionerFactory } fn create_partitioner(&self, context: &SstPartitionerContext<'_>) -> Option { - // create_partitioner can be called in RocksDB while holding db_mutex. It can block - // other operations on RocksDB. To avoid such caces, we defer region info query to - // the first time should_partition is called. + // create_partitioner can be called in RocksDB while holding db_mutex. It can + // block other operations on RocksDB. To avoid such cases, we defer + // region info query to the first time should_partition is called. Some(CompactionGuardGenerator { cf_name: self.cf_name, smallest_key: context.smallest_key.to_vec(), @@ -383,8 +383,8 @@ mod tests { DBCompressionType::No, DBCompressionType::No, ]); - // Make block size small to make sure current_output_file_size passed to SstPartitioner - // is accurate. + // Make block size small to make sure current_output_file_size passed to + // SstPartitioner is accurate. let mut block_based_opts = BlockBasedOptions::new(); block_based_opts.set_block_size(100); cf_opts.set_block_based_table_factory(&block_based_opts); @@ -437,26 +437,26 @@ mod tests { assert_eq!(b"z", DATA_PREFIX_KEY); // Create two overlapping SST files then force compaction. - // Region "a" will share a SST file with region "b", since region "a" is too small. - // Region "c" will be splitted into two SSTs, since its size is larger than - // target_file_size_base. + // Region "a" will share a SST file with region "b", since region "a" is too + // small. Region "c" will be splitted into two SSTs, since its size is + // larger than target_file_size_base. 
let value = vec![b'v'; 1024]; db.put(b"za1", b"").unwrap(); db.put(b"zb1", &value).unwrap(); db.put(b"zc1", &value).unwrap(); - db.flush(true /*sync*/).unwrap(); + db.flush(true /* sync */).unwrap(); db.put(b"zb2", &value).unwrap(); db.put(b"zc2", &value).unwrap(); db.put(b"zc3", &value).unwrap(); db.put(b"zc4", &value).unwrap(); db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); - db.flush(true /*sync*/).unwrap(); + db.flush(true /* sync */).unwrap(); db.compact_range( - CF_DEFAULT, None, /*start_key*/ - None, /*end_key*/ - false, /*exclusive_manual*/ - 1, /*max_subcompactions*/ + CF_DEFAULT, None, // start_key + None, // end_key + false, // exclusive_manual + 1, // max_subcompactions ) .unwrap(); diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index fdd47d6c2ae..5d7d89bbc7b 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -37,7 +37,8 @@ with_prefix!(prefix_store "store-"); #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { - // minimizes disruption when a partitioned node rejoins the cluster by using a two phase election. + // minimizes disruption when a partitioned node rejoins the cluster by using a two phase + // election. #[online_config(skip)] pub prevote: bool, #[online_config(skip)] @@ -120,12 +121,13 @@ pub struct Config { /// the peer is considered to be down and is reported to PD. pub max_peer_down_duration: ReadableDuration, - /// If the leader of a peer is missing for longer than max_leader_missing_duration, - /// the peer would ask pd to confirm whether it is valid in any region. - /// If the peer is stale and is not valid in any region, it will destroy itself. + /// If the leader of a peer is missing for longer than + /// max_leader_missing_duration, the peer would ask pd to confirm + /// whether it is valid in any region. If the peer is stale and is not + /// valid in any region, it will destroy itself. 
pub max_leader_missing_duration: ReadableDuration, - /// Similar to the max_leader_missing_duration, instead it will log warnings and - /// try to alert monitoring systems, if there is any. + /// Similar to the max_leader_missing_duration, instead it will log warnings + /// and try to alert monitoring systems, if there is any. pub abnormal_leader_missing_duration: ReadableDuration, pub peer_stale_state_check_interval: ReadableDuration, @@ -156,11 +158,11 @@ pub struct Config { #[online_config(hidden)] pub right_derive_when_split: bool, - /// This setting can only ensure conf remove will not be proposed by the peer - /// being removed. But it can't guarantee the remove is applied when the target - /// is not leader. That means we always need to check if it's working as expected - /// when a leader applies a self-remove conf change. Keep the configuration only - /// for convenient test. + /// This setting can only ensure conf remove will not be proposed by the + /// peer being removed. But it can't guarantee the remove is applied + /// when the target is not leader. That means we always need to check if + /// it's working as expected when a leader applies a self-remove conf + /// change. Keep the configuration only for convenient test. #[cfg(any(test, feature = "testexport"))] pub allow_remove_leader: bool, @@ -213,9 +215,10 @@ pub struct Config { #[doc(hidden)] #[online_config(skip)] /// Disable this feature by set to 0, logic will be removed in other pr. - /// When TiKV memory usage reaches `memory_usage_high_water` it will try to limit memory - /// increasing. For raftstore layer entries will be evicted from entry cache, if they - /// utilize memory more than `evict_cache_on_memory_ratio` * total. + /// When TiKV memory usage reaches `memory_usage_high_water` it will try to + /// limit memory increasing. For raftstore layer entries will be evicted + /// from entry cache, if they utilize memory more than + /// `evict_cache_on_memory_ratio` * total. 
/// /// Set it to 0 can disable cache evict. // By default it's 0.2. So for different system memory capacity, cache evict happens: @@ -226,13 +229,14 @@ pub struct Config { pub cmd_batch: bool, - /// When the count of concurrent ready exceeds this value, command will not be proposed - /// until the previous ready has been persisted. + /// When the count of concurrent ready exceeds this value, command will not + /// be proposed until the previous ready has been persisted. /// If `cmd_batch` is 0, this config will have no effect. /// If it is 0, it means no limit. pub cmd_batch_concurrent_ready_max_count: usize, - /// When the size of raft db writebatch exceeds this value, write will be triggered. + /// When the size of raft db writebatch exceeds this value, write will be + /// triggered. pub raft_write_size_limit: ReadableSize, pub waterfall_metrics: bool, @@ -256,7 +260,8 @@ pub struct Config { #[serde(skip_serializing)] #[online_config(skip)] pub region_split_size: ReadableSize, - // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot sequence number. + // Deprecated! The time to clean stale peer safely can be decided based on RocksDB snapshot + // sequence number. #[doc(hidden)] #[serde(skip_serializing)] #[online_config(skip)] @@ -268,8 +273,8 @@ pub struct Config { // Interval to report min resolved ts, if it is zero, it means disabled. pub report_min_resolved_ts_interval: ReadableDuration, - /// Interval to check whether to reactivate in-memory pessimistic lock after being disabled - /// before transferring leader. + /// Interval to check whether to reactivate in-memory pessimistic lock after + /// being disabled before transferring leader. pub reactive_memory_lock_tick_interval: ReadableDuration, /// Max tick count before reactivating in-memory pessimistic lock. 
pub reactive_memory_lock_timeout_tick: usize, @@ -460,8 +465,8 @@ impl Config { )); } - // The adjustment of this value is related to the number of regions, usually 16384 is - // already a large enough value + // The adjustment of this value is related to the number of regions, usually + // 16384 is already a large enough value if self.raft_max_inflight_msgs == 0 || self.raft_max_inflight_msgs > 16384 { return Err(box_err!( "raft max inflight msgs should be greater than 0 and less than or equal to 16384" diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 4f751a35b17..03054cfcc16 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1,7 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -//! This module contains the implementation of the `EntryStorage`, which covers a subset of -//! raft storage. This module will be shared between raftstore v1 and v2. +//! This module contains the implementation of the `EntryStorage`, which covers +//! a subset of raft storage. This module will be shared between raftstore v1 +//! and v2. use std::{ cell::{Cell, RefCell}, @@ -60,7 +61,8 @@ impl CachedEntries { } } - /// Take cached entries and dangle size for them. `dangle` means not in entry cache. + /// Take cached entries and dangle size for them. `dangle` means not in + /// entry cache. pub fn take_entries(&self) -> (Vec, usize) { mem::take(&mut *self.entries.lock().unwrap()) } @@ -119,8 +121,8 @@ impl EntryCache { } }) .count(); - // Cache either is empty or contains latest log. Hence we don't need to fetch log - // from rocksdb anymore. + // Cache either is empty or contains latest log. Hence we don't need to fetch + // log from rocksdb anymore. 
assert!(end_idx == limit_idx || fetched_size > max_size); let (first, second) = tikv_util::slices_in_range(&self.cache, start_idx, end_idx); ents.extend_from_slice(first); @@ -172,10 +174,10 @@ impl EntryCache { self.cache.push_back(e.to_owned()); mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; } - // In the past, the entry cache will be truncated if its size exceeds a certain number. - // However, after introducing async write io, the entry must stay in cache if it's not - // persisted to raft db because the raft-rs may need to read entries.(e.g. leader sends - // MsgAppend to followers) + // In the past, the entry cache will be truncated if its size exceeds a certain + // number. However, after introducing async write io, the entry must stay in + // cache if it's not persisted to raft db because the raft-rs may need to read + // entries.(e.g. leader sends MsgAppend to followers) mem_size_change } @@ -198,9 +200,9 @@ impl EntryCache { let mut mem_size_change = 0; - // Clean cached entries which have been already sent to apply threads. For example, - // if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and `compact_to(15)` - // is called, only [20, 30) will still be kept in cache. + // Clean cached entries which have been already sent to apply threads. For + // example, if entries [1, 10), [10, 20), [20, 30) are sent to apply threads and + // `compact_to(15)` is called, only [20, 30) will still be kept in cache. let old_trace_cap = self.trace.capacity(); while let Some(cached_entries) = self.trace.pop_front() { if cached_entries.range.start >= idx { @@ -227,7 +229,8 @@ impl EntryCache { } let cache_last_idx = self.cache.back().unwrap().get_index(); - // Use `cache_last_idx + 1` to make sure cache can be cleared completely if necessary. + // Use `cache_last_idx + 1` to make sure cache can be cleared completely if + // necessary. 
let compact_to = (cmp::min(cache_last_idx + 1, idx) - cache_first_idx) as usize; for e in self.cache.drain(..compact_to) { mem_size_change -= (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64 @@ -564,7 +567,8 @@ impl EntryStorage { return Ok(count); } - // the count of left entries isn't too large, fetch the remaining entries synchronously one by one + // the count of left entries isn't too large, fetch the remaining entries + // synchronously one by one for idx in last + 1..high { let ent = self.raft_engine.get_entry(region_id, idx)?; match ent { @@ -597,7 +601,8 @@ impl EntryStorage { "max_size" => max_size, "res_max_size" => res.max_size, ); - // low index or max size is changed, the result is not fit for the current range, so refetch again. + // low index or max size is changed, the result is not fit for the current + // range, so refetch again. self.raftlog_fetch_stats.fetch_invalid.update(|m| m + 1); res.tried_cnt + 1 } else { @@ -606,7 +611,8 @@ impl EntryStorage { // the first/second try: get [low, high) asynchronously // the third try: - // - if term and low are matched: use result of [low, persisted) and get [persisted, high) synchronously + // - if term and low are matched: use result of [low, persisted) and get + // [persisted, high) synchronously // - else: get [low, high) synchronously if tried_cnt >= MAX_ASYNC_FETCH_TRY_CNT { // even the larger range is invalid again, fallback to fetch in sync way @@ -807,7 +813,8 @@ impl EntryStorage { self.apply_state.get_truncated_state().get_term() } - // Append the given entries to the raft log using previous last index or self.last_index. + // Append the given entries to the raft log using previous last index or + // self.last_index. 
pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { if entries.is_empty() { return; diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index ab73c0bc8c6..284015b0eb8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -395,14 +395,17 @@ where store_id: u64, /// region_id -> (peer_id, is_splitting) /// Used for handling race between splitting and creating new peer. - /// An uninitialized peer can be replaced to the one from splitting iff they are exactly the same peer. + /// An uninitialized peer can be replaced to the one from splitting iff they + /// are exactly the same peer. pending_create_peers: Arc>>, - /// We must delete the ingested file before calling `callback` so that any ingest-request reaching this - /// peer could see this update if leader had changed. We must also delete them after the applied-index - /// has been persisted to kvdb because this entry may replay because of panic or power-off, which - /// happened before `WriteBatch::write` and after `SstImporter::delete`. We shall make sure that - /// this entry will never apply again at first, then we can delete the ssts files. + /// We must delete the ingested file before calling `callback` so that any + /// ingest-request reaching this peer could see this update if leader + /// had changed. We must also delete them after the applied-index + /// has been persisted to kvdb because this entry may replay because of + /// panic or power-off, which happened before `WriteBatch::write` and + /// after `SstImporter::delete`. We shall make sure that this entry will + /// never apply again at first, then we can delete the ssts files. delete_ssts: Vec, /// The priority of this Handler. @@ -484,10 +487,11 @@ where .push_batch(&delegate.observe_info, delegate.region.get_id()); } - /// Commits all changes have done for delegate. 
`persistent` indicates whether - /// write the changes into rocksdb. + /// Commits all changes have done for delegate. `persistent` indicates + /// whether write the changes into rocksdb. /// - /// This call is valid only when it's between a `prepare_for` and `finish_for`. + /// This call is valid only when it's between a `prepare_for` and + /// `finish_for`. pub fn commit(&mut self, delegate: &mut ApplyDelegate) { if delegate.last_flush_applied_index < delegate.apply_state.get_applied_index() { delegate.write_apply_state(self.kv_wb_mut()); @@ -547,7 +551,8 @@ where // Control the memory usage for the WriteBatch. self.kv_wb = self.engine.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE); } else { - // Clear data, reuse the WriteBatch, this can reduce memory allocations and deallocations. + // Clear data, reuse the WriteBatch, this can reduce memory allocations and + // deallocations. self.kv_wb_mut().clear(); } self.kv_wb_last_bytes = 0; @@ -567,7 +572,8 @@ where batch_max_level, mut cb_batch, } = mem::replace(&mut self.applied_batch, ApplyCallbackBatch::new()); - // Call it before invoking callback for preventing Commit is executed before Prewrite is observed. + // Call it before invoking callback for preventing Commit is executed before + // Prewrite is observed. self.host .on_flush_applied_cmd_batch(batch_max_level, cmd_batch, &self.engine); // Invoke callbacks @@ -750,9 +756,9 @@ fn has_high_latency_operation(cmd: &RaftCmdRequest) -> bool { fn should_sync_log(cmd: &RaftCmdRequest) -> bool { if cmd.has_admin_request() { if cmd.get_admin_request().get_cmd_type() == AdminCmdType::CompactLog { - // We do not need to sync WAL before compact log, because this request will send a msg to - // raft_gc_log thread to delete the entries before this index instead of deleting them in - // apply thread directly. 
+ // We do not need to sync WAL before compact log, because this request will send + // a msg to raft_gc_log thread to delete the entries before this + // index instead of deleting them in apply thread directly. return false; } return true; @@ -780,9 +786,9 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { /// this struct. /// TODO: check whether generator/coroutine is a good choice in this case. struct WaitSourceMergeState { - /// A flag that indicates whether the source peer has applied to the required - /// index. If the source peer is ready, this flag should be set to the region id - /// of source peer. + /// A flag that indicates whether the source peer has applied to the + /// required index. If the source peer is ready, this flag should be set + /// to the region id of source peer. logs_up_to_date: Arc, } @@ -859,12 +865,14 @@ where tag: String, /// If the delegate should be stopped from polling. - /// A delegate can be stopped in conf change, merge or requested by destroy message. + /// A delegate can be stopped in conf change, merge or requested by destroy + /// message. stopped: bool, /// The start time of the current round to execute commands. handle_start: Option, - /// Set to true when removing itself because of `ConfChangeType::RemoveNode`, and then - /// any following committed logs in same Ready should be applied failed. + /// Set to true when removing itself because of + /// `ConfChangeType::RemoveNode`, and then any following committed logs + /// in same Ready should be applied failed. pending_remove: bool, /// The commands waiting to be committed and applied @@ -872,22 +880,25 @@ where /// The counter of pending request snapshots. See more in `Peer`. pending_request_snapshot_count: Arc, - /// Indicates the peer is in merging, if that compact log won't be performed. + /// Indicates the peer is in merging, if that compact log won't be + /// performed. is_merging: bool, /// Records the epoch version after the last merge. 
last_merge_version: u64, yield_state: Option>, - /// A temporary state that keeps track of the progress of the source peer state when - /// CommitMerge is unable to be executed. + /// A temporary state that keeps track of the progress of the source peer + /// state when CommitMerge is unable to be executed. wait_merge_state: Option, // ID of last region that reports ready. ready_source_region_id: u64, - /// TiKV writes apply_state to KV RocksDB, in one write batch together with kv data. + /// TiKV writes apply_state to KV RocksDB, in one write batch together with + /// kv data. /// - /// If we write it to Raft RocksDB, apply_state and kv data (Put, Delete) are in - /// separate WAL file. When power failure, for current raft log, apply_index may synced - /// to file, but KV data may not synced to file, so we will lose data. + /// If we write it to Raft RocksDB, apply_state and kv data (Put, Delete) + /// are in separate WAL file. When power failure, for current raft log, + /// apply_index may synced to file, but KV data may not synced to file, + /// so we will lose data. apply_state: RaftApplyState, /// The term of the raft log at applied index. applied_term: u64, @@ -900,8 +911,9 @@ where /// The local metrics, and it will be flushed periodically. metrics: ApplyMetrics, - /// Priority in batch system. When applying some commands which have high latency, - /// we decrease the priority of current fsm to reduce the impact on other normal commands. + /// Priority in batch system. When applying some commands which have high + /// latency, we decrease the priority of current fsm to reduce the + /// impact on other normal commands. priority: Priority, /// To fetch Raft entries for applying if necessary. @@ -954,7 +966,8 @@ where self.id } - /// Handles all the committed_entries, namely, applies the committed entries. + /// Handles all the committed_entries, namely, applies the committed + /// entries. 
fn handle_raft_committed_entries( &mut self, apply_ctx: &mut ApplyContext, @@ -964,9 +977,9 @@ where return; } apply_ctx.prepare_for(self); - // If we send multiple ConfChange commands, only first one will be proposed correctly, - // others will be saved as a normal entry with no data, so we must re-propose these - // commands again. + // If we send multiple ConfChange commands, only first one will be proposed + // correctly, others will be saved as a normal entry with no data, so we + // must re-propose these commands again. apply_ctx.committed_count += committed_entries_drainer.len(); let mut results = VecDeque::new(); while let Some(entry) = committed_entries_drainer.next() { @@ -986,9 +999,10 @@ where ); } - // NOTE: before v5.0, `EntryType::EntryConfChangeV2` entry is handled by `unimplemented!()`, - // which can break compatibility (i.e. old version tikv running on data written by new version tikv), - // but PD will reject old version tikv join the cluster, so this should not happen. + // NOTE: before v5.0, `EntryType::EntryConfChangeV2` entry is handled by + // `unimplemented!()`, which can break compatibility (i.e. old version tikv + // running on data written by new version tikv), but PD will reject old version + // tikv join the cluster, so this should not happen. let res = match entry.get_entry_type() { EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &entry), EntryType::EntryConfChange | EntryType::EntryConfChangeV2 => { @@ -1238,11 +1252,13 @@ where /// Applies raft command. /// /// An apply operation can fail in the following situations: - /// 1. it encounters an error that will occur on all stores, it can continue - /// applying next entry safely, like epoch not match for example; - /// 2. it encounters an error that may not occur on all stores, in this case - /// we should try to apply the entry again or panic. Considering that this - /// usually due to disk operation fail, which is rare, so just panic is ok. 
+ /// - it encounters an error that will occur on all stores, it can + /// continue applying next entry safely, like epoch not match for + /// example; + /// - it encounters an error that may not occur on all stores, in this + /// case we should try to apply the entry again or panic. Considering + /// that this usually due to disk operation fail, which is rare, so just + /// panic is ok. fn apply_raft_cmd( &mut self, ctx: &mut ApplyContext, @@ -1359,7 +1375,8 @@ where if let Some(epoch) = origin_epoch { let cmd_type = req.get_admin_request().get_cmd_type(); let epoch_state = admin_cmd_epoch_lookup(cmd_type); - // The change-epoch behavior **MUST BE** equal to the settings in `admin_cmd_epoch_lookup` + // The change-epoch behavior **MUST BE** equal to the settings in + // `admin_cmd_epoch_lookup` if (epoch_state.change_ver && epoch.get_version() == self.region.get_region_epoch().get_version()) || (epoch_state.change_conf_ver @@ -1619,7 +1636,8 @@ where keys::data_key_with_buffer(key, &mut ctx.key_buffer); let key = ctx.key_buffer.as_slice(); - // since size_diff_hint is not accurate, so we just skip calculate the value size. + // since size_diff_hint is not accurate, so we just skip calculate the value + // size. self.metrics.size_diff_hint -= key.len() as i64; if !req.get_delete().get_cf().is_empty() { let cf = req.get_delete().get_cf(); @@ -2236,9 +2254,9 @@ where .mut_splits() .set_right_derive(split.get_right_derive()); admin_req.mut_splits().mut_requests().push(split); - // This method is executed only when there are unapplied entries after being restarted. - // So there will be no callback, it's OK to return a response that does not matched - // with its request. + // This method is executed only when there are unapplied entries after being + // restarted. So there will be no callback, it's OK to return a response + // that does not matched with its request. 
self.exec_batch_split(ctx, &admin_req) } @@ -2301,10 +2319,12 @@ where // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. if right_derive { - // So the range of new regions is [old_start_key, split_key1, ..., last_split_key]. + // So the range of new regions is [old_start_key, split_key1, ..., + // last_split_key]. keys.push_front(derived.get_start_key().to_vec()); } else { - // So the range of new regions is [split_key1, ..., last_split_key, old_end_key]. + // So the range of new regions is [split_key1, ..., last_split_key, + // old_end_key]. keys.push_back(derived.get_end_key().to_vec()); derived.set_end_key(keys.front().unwrap().to_vec()); regions.push(derived.clone()); @@ -2520,15 +2540,20 @@ where // The target peer should send missing log entries to the source peer. // // So, the merge process order would be: - // 1. `exec_commit_merge` in target apply fsm and send `CatchUpLogs` to source peer fsm - // 2. `on_catch_up_logs_for_merge` in source peer fsm - // 3. if the source peer has already executed the corresponding `on_ready_prepare_merge`, set pending_remove and jump to step 6 - // 4. ... (raft append and apply logs) - // 5. `on_ready_prepare_merge` in source peer fsm and set pending_remove (means source region has finished applying all logs) - // 6. `logs_up_to_date_for_merge` in source apply fsm (destroy its apply fsm and send Noop to trigger the target apply fsm) - // 7. resume `exec_commit_merge` in target apply fsm - // 8. `on_ready_commit_merge` in target peer fsm and send `MergeResult` to source peer fsm - // 9. `on_merge_result` in source peer fsm (destroy itself) + // - `exec_commit_merge` in target apply fsm and send `CatchUpLogs` to source + // peer fsm + // - `on_catch_up_logs_for_merge` in source peer fsm + // - if the source peer has already executed the corresponding + // `on_ready_prepare_merge`, set pending_remove and jump to step 6 + // - ... 
(raft append and apply logs) + // - `on_ready_prepare_merge` in source peer fsm and set pending_remove (means + // source region has finished applying all logs) + // - `logs_up_to_date_for_merge` in source apply fsm (destroy its apply fsm and + // send Noop to trigger the target apply fsm) + // - resume `exec_commit_merge` in target apply fsm + // - `on_ready_commit_merge` in target peer fsm and send `MergeResult` to source + // peer fsm + // - `on_merge_result` in source peer fsm (destroy itself) fn exec_commit_merge( &mut self, ctx: &mut ApplyContext, @@ -3043,7 +3068,8 @@ where pub index: u64, pub term: u64, pub cb: Callback, - /// `propose_time` is set to the last time when a peer starts to renew lease. + /// `propose_time` is set to the last time when a peer starts to renew + /// lease. pub propose_time: Option, pub must_pass_epoch_check: bool, } @@ -3055,8 +3081,8 @@ pub struct Destroy { merge_from_snapshot: bool, } -/// A message that asks the delegate to apply to the given logs and then reply to -/// target mailbox. +/// A message that asks the delegate to apply to the given logs and then reply +/// to target mailbox. #[derive(Default, Debug)] pub struct CatchUpLogs { /// The target region to be notified when given logs are applied. @@ -3337,7 +3363,8 @@ where ) } - /// Handles peer registration. When a peer is created, it will register an apply delegate. + /// Handles peer registration. When a peer is created, it will register an + /// apply delegate. fn handle_registration(&mut self, reg: Registration) { info!( "re-register to apply delegates"; @@ -3351,7 +3378,8 @@ where self.delegate = ApplyDelegate::from_registration(reg); } - /// Handles apply tasks, and uses the apply delegate to handle the committed entries. + /// Handles apply tasks, and uses the apply delegate to handle the committed + /// entries. 
fn handle_apply(&mut self, apply_ctx: &mut ApplyContext, mut apply: Apply) { if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); @@ -3474,7 +3502,8 @@ where self.delegate.destroy(ctx); } - /// Handles peer destroy. When a peer is destroyed, the corresponding apply delegate should be removed too. + /// Handles peer destroy. When a peer is destroyed, the corresponding apply + /// delegate should be removed too. fn handle_destroy(&mut self, ctx: &mut ApplyContext, d: Destroy) { assert_eq!(d.region_id, self.delegate.region_id()); if d.merge_from_snapshot { @@ -3545,8 +3574,9 @@ where "region_id" => region_id, "peer_id" => self.delegate.id(), ); - // The source peer fsm will be destroyed when the target peer executes `on_ready_commit_merge` - // and sends `merge result` to the source peer fsm. + // The source peer fsm will be destroyed when the target peer executes + // `on_ready_commit_merge` and sends `merge result` to the source peer + // fsm. self.destroy(ctx); catch_up_logs .logs_up_to_date @@ -3650,12 +3680,13 @@ where let resp = match compare_region_epoch( ®ion_epoch, &self.delegate.region, - false, /* check_conf_ver */ - true, /* check_ver */ - true, /* include_region */ + false, // check_conf_ver + true, // check_ver + true, // include_region ) { Ok(()) => { - // Commit the writebatch for ensuring the following snapshot can get all previous writes. + // Commit the writebatch for ensuring the following snapshot can get all + // previous writes. if apply_ctx.kv_wb().count() > 0 { apply_ctx.commit(&mut self.delegate); } @@ -4266,8 +4297,8 @@ mod memtrace { S: Snapshot, { fn heap_size(&self) -> usize { - // Some fields of `PendingCmd` are on stack, but ignore them because they are just - // some small boxed closures. + // Some fields of `PendingCmd` are on stack, but ignore them because they are + // just some small boxed closures. 
self.normals.capacity() * mem::size_of::>() } } @@ -4728,7 +4759,8 @@ mod tests { assert_eq!(apply_res.apply_state, apply_state); assert_eq!(apply_res.apply_state.get_applied_index(), 5); assert!(apply_res.exec_res.is_empty()); - // empty entry will make applied_index step forward and should write apply state to engine. + // empty entry will make applied_index step forward and should write apply state + // to engine. assert_eq!(apply_res.metrics.written_keys, 1); assert_eq!(apply_res.applied_term, 5); validate(&router, 2, |delegate| { @@ -5335,7 +5367,8 @@ mod tests { capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); } let index = write_batch_max_keys + 11; - // The region was rescheduled to normal-priority handler. Discard the first apply_res. + // The region was rescheduled to normal-priority handler. Discard the first + // apply_res. fetch_apply_res(&rx); let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.apply_state.get_applied_index(), index as u64); @@ -5391,9 +5424,10 @@ mod tests { reg.region.mut_region_epoch().set_version(3); router.schedule_task(1, Msg::Registration(reg)); - // Test whether put commands and ingest commands are applied to engine in a correct order. - // We will generate 5 entries which are put, ingest, put, ingest, put respectively. For a same key, - // it can exist in multiple entries or in a single entries. We will test all all the possible + // Test whether put commands and ingest commands are applied to engine in a + // correct order. We will generate 5 entries which are put, ingest, put, + // ingest, put respectively. For a same key, it can exist in multiple + // entries or in a single entries. We will test all all the possible // keys exsiting combinations. 
let mut keys = Vec::new(); let keys_count = 1 << 5; @@ -5510,8 +5544,8 @@ mod tests { assert!(!resp.get_header().has_error(), "{:?}", resp); } let mut res = fetch_apply_res(&rx); - // There may be one or two ApplyRes which depends on whether these two apply msgs - // are batched together. + // There may be one or two ApplyRes which depends on whether these two apply + // msgs are batched together. if res.apply_state.get_applied_index() == 3 { res = fetch_apply_res(&rx); } @@ -6276,7 +6310,8 @@ mod tests { let res = panic_hook::recover_safe(|| { let _cmd = PendingCmd::::new(1, 1, Callback::None); panic!("Don't abort"); - // It would abort and fail if there was a double-panic in PendingCmd dtor. + // It would abort and fail if there was a double-panic in PendingCmd + // dtor. }); res.unwrap_err(); } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index baccd071690..8d5369aaefa 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -113,7 +113,8 @@ enum DelayReason { /// Limits the maximum number of regions returned by error. /// -/// Another choice is using coprocessor batch limit, but 10 should be a good fit in most case. +/// Another choice is using coprocessor batch limit, but 10 should be a good fit +/// in most case. const MAX_REGIONS_IN_ERROR: usize = 10; const REGION_SPLIT_SKIP_MAX_COUNT: usize = 3; @@ -129,13 +130,15 @@ where ER: RaftEngine, { pub peer: Peer, - /// A registry for all scheduled ticks. This can avoid scheduling ticks twice accidentally. + /// A registry for all scheduled ticks. This can avoid scheduling ticks + /// twice accidentally. tick_registry: [bool; PeerTick::VARIANT_COUNT], /// Ticks for speed up campaign in chaos state. /// - /// Followers will keep ticking in Idle mode to measure how many ticks have been skipped. 
- /// Once it becomes chaos, those skipped ticks will be ticked so that it can campaign - /// quickly instead of waiting an election timeout. + /// Followers will keep ticking in Idle mode to measure how many ticks have + /// been skipped. Once it becomes chaos, those skipped ticks will be + /// ticked so that it can campaign quickly instead of waiting an + /// election timeout. /// /// This will be reset to 0 once it receives any messages from leader. missing_ticks: usize, @@ -144,11 +147,12 @@ where has_ready: bool, mailbox: Option>>, pub receiver: Receiver>, - /// when snapshot is generating or sending, skip split check at most REGION_SPLIT_SKIT_MAX_COUNT times. + /// when snapshot is generating or sending, skip split check at most + /// REGION_SPLIT_SKIT_MAX_COUNT times. skip_split_count: usize, - /// Sometimes applied raft logs won't be compacted in time, because less compact means less - /// sync-log in apply threads. Stale logs will be deleted if the skip time reaches this - /// `skip_gc_raft_log_ticks`. + /// Sometimes applied raft logs won't be compacted in time, because less + /// compact means less sync-log in apply threads. Stale logs will be + /// deleted if the skip time reaches this `skip_gc_raft_log_ticks`. skip_gc_raft_log_ticks: usize, reactivate_memory_lock_ticks: usize, @@ -160,8 +164,8 @@ where /// Destroy is delayed because of some unpersisted readies in Peer. /// Should call `destroy_peer` again after persisting all readies. delayed_destroy: Option, - /// Before actually destroying a peer, ensure all log gc tasks are finished, so we - /// can start destroying without seeking. + /// Before actually destroying a peer, ensure all log gc tasks are finished, + /// so we can start destroying without seeking. 
logs_gc_flushed: bool, } @@ -285,9 +289,9 @@ where )) } - // The peer can be created from another node with raft membership changes, and we only - // know the region_id and peer_id when creating this replicated peer, the region info - // will be retrieved later after applying snapshot. + // The peer can be created from another node with raft membership changes, and + // we only know the region_id and peer_id when creating this replicated peer, + // the region info will be retrieved later after applying snapshot. pub fn replicate( store_id: u64, cfg: &Config, @@ -458,8 +462,8 @@ where fn should_finish(&self, cfg: &Config) -> bool { if let Some(batch_req) = self.request.as_ref() { - // Limit the size of batch request so that it will not exceed raft_entry_max_size after - // adding header. + // Limit the size of batch request so that it will not exceed + // raft_entry_max_size after adding header. if self.batch_req_size > (cfg.raft_entry_max_size.0 as f64 * 0.4) as u64 { return true; } @@ -877,9 +881,9 @@ where return; } let target_index = if self.fsm.peer.force_leader.is_some() { - // For regions that lose quorum (or regions have force leader), whatever has been - // proposed will be committed. Based on that fact, we simply use "last index" here to - // avoid implementing another "wait commit" process. + // For regions that lose quorum (or regions have force leader), whatever has + // been proposed will be committed. Based on that fact, we simply use "last + // index" here to avoid implementing another "wait commit" process. 
self.fsm.peer.raft_group.raft.raft_log.last_index() } else { self.fsm.peer.raft_group.raft.raft_log.committed @@ -891,7 +895,7 @@ where }); self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ self.fsm.stopped); + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ self.fsm.stopped); } fn on_unsafe_recovery_fill_out_report(&mut self, syncer: UnsafeRecoveryFillOutReportSyncer) { @@ -989,8 +993,9 @@ where if is_learner(&self.fsm.peer.peer) { // FIXME: should use `bcast_check_stale_peer_message` instead. - // Sending a new enum type msg to a old tikv may cause panic during rolling update - // we should change the protobuf behavior and check if properly handled in all place + // Sending a new enum type msg to a old tikv may cause panic during rolling + // update we should change the protobuf behavior and check if properly handled + // in all place self.fsm.peer.bcast_wake_up_message(self.ctx); } } @@ -1358,8 +1363,9 @@ where ); return; } - // wait two rounds of election timeout to trigger check quorum to step down the leader - // note: check quorum is triggered every `election_timeout` instead of `randomized_election_timeout` + // wait two rounds of election timeout to trigger check quorum to step down the + // leader note: check quorum is triggered every `election_timeout` instead of + // `randomized_election_timeout` Some( self.fsm.peer.raft_group.raft.election_timeout() * 2 - self.fsm.peer.raft_group.raft.election_elapsed, @@ -1439,7 +1445,8 @@ where // When PD issues force leader on two different peer, it may cause // two force leader in same term. 
self.fsm.peer.raft_group.raft.pre_vote = false; - // trigger vote request to all voters, will check the vote result in `check_force_leader` + // trigger vote request to all voters, will check the vote result in + // `check_force_leader` if let Err(e) = self.fsm.peer.raft_group.campaign() { warn!( "Unsafe recovery, campaign failed"; @@ -1558,7 +1565,8 @@ where self.fsm.peer.raft_group.raft.set_check_quorum(true); self.fsm.peer.raft_group.raft.pre_vote = true; if self.fsm.peer.raft_group.raft.promotable() { - // Do not campaign directly here, otherwise on_role_changed() won't called for follower state + // Do not campaign directly here, otherwise on_role_changed() won't called for + // follower state let _ = self.ctx.router.send( self.region_id(), PeerMsg::CasualMessage(CasualMessage::Campaign), @@ -1939,17 +1947,18 @@ where if self.fsm.hibernate_state.group_state() == GroupState::Idle { // missing_ticks should be less than election timeout ticks otherwise // follower may tick more than an election timeout in chaos state. - // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` - 2 - // - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) - // and the follower's `election_elapsed` in raft-rs is 1. - // After the group state becomes Chaos, the next tick will call `raft_group.tick` - // `missing_tick` + 1 times(default 7). + // Before stopping tick, `missing_tick` should be `raft_election_timeout_ticks` + // - 2 - `raft_heartbeat_ticks` (default 10 - 2 - 2 = 6) and the follower's + // `election_elapsed` in raft-rs is 1. + // After the group state becomes Chaos, the next tick will call + // `raft_group.tick` `missing_tick` + 1 times(default 7). // Then the follower's `election_elapsed` will be 1 + `missing_tick` + 1 // (default 1 + 6 + 1 = 8) which is less than the min election timeout. 
- // The reason is that we don't want let all followers become (pre)candidate if one - // follower may receive a request, then becomes (pre)candidate and sends (pre)vote msg - // to others. As long as the leader can wake up and broadcast heartbeats in one `raft_heartbeat_ticks` - // time(default 2s), no more followers will wake up and sends vote msg again. + // The reason is that we don't want let all followers become (pre)candidate if + // one follower may receive a request, then becomes (pre)candidate and sends + // (pre)vote msg to others. As long as the leader can wake up and broadcast + // heartbeats in one `raft_heartbeat_ticks` time(default 2s), no more followers + // will wake up and sends vote msg again. if self.fsm.missing_ticks + 1 /* for the next tick after the peer isn't Idle */ + self.fsm.peer.raft_group.raft.election_elapsed + self.ctx.cfg.raft_heartbeat_ticks @@ -1985,7 +1994,8 @@ where self.fsm.peer.mut_store().flush_entry_cache_metrics(); - // Keep ticking if there are still pending read requests or this node is within hibernate timeout. + // Keep ticking if there are still pending read requests or this node is within + // hibernate timeout. if res.is_none() /* hibernate_region is false */ || !self.fsm.peer.check_after_tick(self.fsm.hibernate_state.group_state(), res.unwrap()) || (self.fsm.peer.is_leader() && !self.all_agree_to_hibernate()) @@ -2021,7 +2031,7 @@ where Some(UnsafeRecoveryState::WaitApply { .. }) => self .fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false), + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ false), Some(UnsafeRecoveryState::DemoteFailedVoters { syncer, failed_voters, @@ -2378,10 +2388,11 @@ where .retain(|r| self.fsm.region_id() != r.get_id()); } else { // This snapshot may be accepted by raft-rs. - // If it's rejected by raft-rs, the snapshot region in `pending_snapshot_regions` - // will be removed together with the latest snapshot region after applying that snapshot. 
- // But if `regions_to_destroy` is not empty, the pending snapshot must be this msg's snapshot - // because this kind of snapshot is exclusive. + // If it's rejected by raft-rs, the snapshot region in + // `pending_snapshot_regions` will be removed together with the latest snapshot + // region after applying that snapshot. + // But if `regions_to_destroy` is not empty, the pending snapshot must be this + // msg's snapshot because this kind of snapshot is exclusive. self.destroy_regions_for_snapshot(regions_to_destroy); } } @@ -2541,23 +2552,26 @@ where let from_store_id = msg.get_from_peer().get_store_id(); // Let's consider following cases with three nodes [1, 2, 3] and 1 is leader: - // a. 1 removes 2, 2 may still send MsgAppendResponse to 1. + // - 1 removes 2, 2 may still send MsgAppendResponse to 1. // We should ignore this stale message and let 2 remove itself after // applying the ConfChange log. - // b. 2 is isolated, 1 removes 2. When 2 rejoins the cluster, 2 will - // send stale MsgRequestVote to 1 and 3, at this time, we should tell 2 to gc itself. - // c. 2 is isolated but can communicate with 3. 1 removes 3. + // - 2 is isolated, 1 removes 2. When 2 rejoins the cluster, 2 will + // send stale MsgRequestVote to 1 and 3, at this time, we should tell 2 to gc + // itself. + // - 2 is isolated but can communicate with 3. 1 removes 3. // 2 will send stale MsgRequestVote to 3, 3 should ignore this message. - // d. 2 is isolated but can communicate with 3. 1 removes 2, then adds 4, remove 3. + // - 2 is isolated but can communicate with 3. 1 removes 2, then adds 4, remove + // 3. // 2 will send stale MsgRequestVote to 3, 3 should tell 2 to gc itself. - // e. 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader. + // - 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader. // After 2 rejoins the cluster, 2 may send stale MsgRequestVote to 1 and 3, // 1 and 3 will ignore this message. 
Later 4 will send messages to 2 and 2 will // rejoin the raft group again. - // f. 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader, and 4 removes 2. + // - 2 is isolated. 1 adds 4, 5, 6, removes 3, 1. Now assume 4 is leader, and 4 + // removes 2. // unlike case e, 2 will be stale forever. - // TODO: for case f, if 2 is stale for a long time, 2 will communicate with pd and pd will - // tell 2 is stale, so 2 can remove itself. + // TODO: for case f, if 2 is stale for a long time, 2 will communicate with pd + // and pd will tell 2 is stale, so 2 can remove itself. let self_epoch = self.fsm.peer.region().get_region_epoch(); if util::is_epoch_stale(from_epoch, self_epoch) && util::find_peer(self.fsm.peer.region(), from_store_id).is_none() @@ -2625,11 +2639,11 @@ where "merge_target" => ?merge_target, ); - // When receiving message that has a merge target, it indicates that the source peer on this - // store is stale, the peers on other stores are already merged. The epoch in merge target - // is the state of target peer at the time when source peer is merged. So here we record the - // merge target epoch version to let the target peer on this store to decide whether to - // destroy the source peer. + // When receiving message that has a merge target, it indicates that the source + // peer on this store is stale, the peers on other stores are already merged. + // The epoch in merge target is the state of target peer at the time when source + // peer is merged. So here we record the merge target epoch version to let the + // target peer on this store to decide whether to destroy the source peer. 
let mut meta = self.ctx.store_meta.lock().unwrap(); meta.targets_map.insert(self.region_id(), target_region_id); let v = meta @@ -2640,8 +2654,8 @@ where no_range_merge_target.clear_start_key(); no_range_merge_target.clear_end_key(); if let Some(pre_merge_target) = v.insert(self.region_id(), no_range_merge_target) { - // Merge target epoch records the version of target region when source region is merged. - // So it must be same no matter when receiving merge target. + // Merge target epoch records the version of target region when source region is + // merged. So it must be same no matter when receiving merge target. if pre_merge_target.get_region_epoch().get_version() != merge_target.get_region_epoch().get_version() { @@ -2654,7 +2668,8 @@ where } if let Some(r) = meta.regions.get(&target_region_id) { - // In the case that the source peer's range isn't overlapped with target's anymore: + // In the case that the source peer's range isn't overlapped with target's + // anymore: // | region 2 | region 3 | region 1 | // || merge 3 into 2 // \/ @@ -2668,8 +2683,8 @@ where // so the new target peer can't find the source peer. // e.g. new region 2 is overlapped with region 1 // - // If that, source peer still need to decide whether to destroy itself. When the target - // peer has already moved on, source peer can destroy itself. + // If that, source peer still need to decide whether to destroy itself. When the + // target peer has already moved on, source peer can destroy itself. if util::is_epoch_stale(merge_target.get_region_epoch(), r.get_region_epoch()) { return Ok(true); } @@ -2678,8 +2693,8 @@ where drop(meta); // All of the target peers must exist before merging which is guaranteed by PD. 
- // Now the target peer is not in region map, so if everything is ok, the merge target - // region should be staler than the local target region + // Now the target peer is not in region map, so if everything is ok, the merge + // target region should be staler than the local target region if self.is_merge_target_region_stale(merge_target)? { Ok(true) } else { @@ -2719,16 +2734,17 @@ where ); // Destroy peer in next round in order to apply more committed entries if any. - // It depends on the implementation that msgs which are handled in this round have already fetched. + // It depends on the implementation that msgs which are handled in this round + // have already fetched. let _ = self .ctx .router .force_send(self.fsm.region_id(), PeerMsg::Destroy(self.fsm.peer_id())); } - // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) if the `msg` - // doesn't contain a snapshot or this snapshot doesn't conflict with any other snapshots or regions. - // Otherwise a `SnapKey` is returned. + // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) + // if the `msg` doesn't contain a snapshot or this snapshot doesn't conflict + // with any other snapshots or regions. Otherwise a `SnapKey` is returned. fn check_snapshot(&mut self, msg: &RaftMessage) -> Result>> { if !msg.get_message().has_snapshot() { return Ok(Either::Right(vec![])); @@ -2829,13 +2845,14 @@ where let mut is_overlapped = false; let mut regions_to_destroy = vec![]; - // In some extreme cases, it may cause source peer destroyed improperly so that a later - // CommitMerge may panic because source is already destroyed, so just drop the message: - // 1. A new snapshot is received whereas a snapshot is still in applying, and the snapshot - // under applying is generated before merge and the new snapshot is generated after merge. - // After the applying snapshot is finished, the log may able to catch up and so a - // CommitMerge will be applied. - // 2. 
There is a CommitMerge pending in apply thread. + // In some extreme cases, it may cause source peer destroyed improperly so that + // a later CommitMerge may panic because source is already destroyed, so just + // drop the message: + // - A new snapshot is received whereas a snapshot is still in applying, and the + // snapshot under applying is generated before merge and the new snapshot is + // generated after merge. After the applying snapshot is finished, the log may + // able to catch up and so a CommitMerge will be applied. + // - There is a CommitMerge pending in apply thread. let ready = !self.fsm.peer.is_handling_snapshot() && !self.fsm.peer.has_pending_snapshot() // It must be ensured that all logs have been applied. @@ -2864,9 +2881,9 @@ where snap_region.get_region_epoch().to_owned(), ); if ready && can_destroy { - // The snapshot that we decide to whether destroy peer based on must can be applied. - // So here not to destroy peer immediately, or the snapshot maybe dropped in later - // check but the peer is already destroyed. + // The snapshot that we decide to whether destroy peer based on must can be + // applied. So here not to destroy peer immediately, or the snapshot maybe + // dropped in later check but the peer is already destroyed. regions_to_destroy.push((exist_region.get_id(), merge_to_this_peer)); continue; } @@ -2895,14 +2912,16 @@ where // Now all checking passed. if self.fsm.peer.local_first_replicate && !self.fsm.peer.is_initialized() { - // If the peer is not initialized and passes the snapshot range check, `is_splitting` flag must - // be false. - // 1. If `is_splitting` is set to true, then the uninitialized peer is created before split is applied - // and the peer id is the same as split one. So there should be no initialized peer before. - // 2. If the peer is also created by splitting, then the snapshot range is not overlapped with - // parent peer. It means leader has applied merge and split at least one time. 
However, - // the prerequisite of merge includes the initialization of all target peers and source peers, - // which is conflict with 1. + // If the peer is not initialized and passes the snapshot range check, + // `is_splitting` flag must be false. + // - If `is_splitting` is set to true, then the uninitialized peer is created + // before split is applied and the peer id is the same as split one. So there + // should be no initialized peer before. + // - If the peer is also created by splitting, then the snapshot range is not + // overlapped with parent peer. It means leader has applied merge and split at + // least one time. However, the prerequisite of merge includes the + // initialization of all target peers and source peers, which is conflict with + // 1. let pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); let status = pending_create_peers.get(®ion_id).cloned(); if status != Some((self.fsm.peer_id(), false)) { @@ -2951,8 +2970,8 @@ where } else { MergeResultKind::Stale }; - // Use `unwrap` is ok because the StoreMeta lock is held and these source peers still - // exist in regions and region_ranges map. + // Use `unwrap` is ok because the StoreMeta lock is held and these source peers + // still exist in regions and region_ranges map. // It depends on the implementation of `destroy_peer`. self.ctx .router @@ -3036,11 +3055,12 @@ where } } - // Returns whether we should propose another TransferLeader command. This is for: - // 1. Considering the amount of pessimistic locks can be big, it can reduce - // unavailable time caused by waiting for the transferree catching up logs. - // 2. Make transferring leader strictly after write commands that executes - // before proposing the locks, preventing unexpected lock loss. + // Returns whether we should propose another TransferLeader command. 
This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. fn propose_locks_before_transfer_leader(&mut self, msg: &eraftpb::Message) -> bool { // 1. Disable in-memory pessimistic locks. @@ -3053,20 +3073,22 @@ where // in the TransferringLeader status, we can safely initiate transferring leader // now. // If it's not in TransferringLeader status now, it is probably because several - // ticks have passed after proposing the locks in the last time and we reactivate - // the memory locks. Then, we should propose the locks again. + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX && pessimistic_locks.status == LocksStatus::TransferringLeader { return false; } - // If it is not writable, it's probably because it's a retried TransferLeader and the locks - // have been proposed. But we still need to return true to propose another TransferLeader - // command. Otherwise, some write requests that have marked some locks as deleted will fail - // because raft rejects more proposals. - // It is OK to return true here if it's in other states like MergingRegion or NotLeader. - // In those cases, the locks will fail to propose and nothing will happen. + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. 
In those cases, the locks will fail to propose and nothing will + // happen. if !pessimistic_locks.is_writable() { return true; } @@ -3078,11 +3100,12 @@ where if pessimistic_locks.is_empty() { return false; } - // FIXME: Raft command has size limit. Either limit the total size of pessimistic locks - // in a region, or split commands here. + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. let mut cmd = RaftCmdRequest::default(); { - // Downgrade to a read guard, do not block readers in the scheduler as far as possible. + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); fail_point!("invalidate_locks_before_transfer_leader"); for (key, (lock, deleted)) in &*pessimistic_locks { @@ -3100,9 +3123,10 @@ where } } if cmd.get_requests().is_empty() { - // If the map is not empty but all locks are deleted, it is possible that a write - // command has just marked locks deleted but not proposed yet. It might cause - // that command to fail if we skip proposing the extra TransferLeader command here. + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. return true; } cmd.mut_header().set_region_id(self.fsm.region_id()); @@ -3128,7 +3152,8 @@ where } } - /// Check if destroy can be executed immediately. If it can't, the reason is returned. + /// Check if destroy can be executed immediately. If it can't, the reason is + /// returned. 
fn maybe_delay_destroy(&mut self) -> Option { if self.fsm.peer.has_unpersisted_ready() { assert!(self.ctx.sync_write_worker.is_none()); @@ -3141,9 +3166,9 @@ where let is_initialized = self.fsm.peer.is_initialized(); if !is_initialized { // If the peer is uninitialized, then it can't receive any logs from leader. So - // no need to gc. If there was a peer with same region id on the store, and it had - // logs written, then it must be initialized, hence its log should be gc either - // before it's destroyed or during node restarts. + // no need to gc. If there was a peer with same region id on the store, and it + // had logs written, then it must be initialized, hence its log should be gc + // either before it's destroyed or during node restarts. self.fsm.logs_gc_flushed = true; } if !self.fsm.logs_gc_flushed { @@ -3262,7 +3287,7 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.fsm .peer - .unsafe_recovery_maybe_finish_wait_apply(/*force=*/ true); + .unsafe_recovery_maybe_finish_wait_apply(/* force= */ true); } let mut meta = self.ctx.store_meta.lock().unwrap(); @@ -3318,7 +3343,8 @@ where } // Some places use `force_send().unwrap()` if the StoreMeta lock is held. - // So in here, it's necessary to held the StoreMeta lock when closing the router. + // So in here, it's necessary to held the StoreMeta lock when closing the + // router. self.ctx.router.close(region_id); self.fsm.stop(); @@ -3361,8 +3387,10 @@ where .get_mut(&target) .unwrap() .remove(®ion_id); - // When the target doesn't exist(add peer but the store is isolated), source peer decide to destroy by itself. - // Without target, the `pending_merge_targets` for target won't be removed, so here source peer help target to clear. + // When the target doesn't exist(add peer but the store is isolated), source + // peer decide to destroy by itself. Without target, the + // `pending_merge_targets` for target won't be removed, so here source peer help + // target to clear. 
if meta.regions.get(&target).is_none() && meta.pending_merge_targets.get(&target).unwrap().is_empty() { @@ -3411,7 +3439,8 @@ where _ => unreachable!(), } } else { - // Please take a look at test case test_redundant_conf_change_by_snapshot. + // Please take a look at test case + // test_redundant_conf_change_by_snapshot. } self.update_region(cp.region); @@ -3526,9 +3555,10 @@ where // Most of these functions are only called when the peer is a leader. // (it's pretty reasonable because progress is used to track others' status) // The only exception is `Raft::restore` at the time of writing, which is ok - // because the raft msgs(including snapshot) don't be handled when `pending_remove` - // is true(it will be set in `destroy_peer`). - // TODO: totally avoid calling these raft-rs functions when `pending_remove` is true. + // because the raft msgs(including snapshot) don't be handled when + // `pending_remove` is true(it will be set in `destroy_peer`). + // TODO: totally avoid calling these raft-rs functions when `pending_remove` is + // true. self.fsm .peer .raft_group @@ -3573,9 +3603,10 @@ where let region_id = derived.get_id(); - // Group in-memory pessimistic locks in the original region into new regions. The locks of - // new regions will be put into the corresponding new regions later. And the locks belonging - // to the old region will stay in the original map. + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. 
let region_locks = { let mut pessimistic_locks = self.fsm.peer.txn_ext.pessimistic_locks.write(); info!("moving {} locks to new regions", pessimistic_locks.len(); "region_id" => region_id); @@ -3732,8 +3763,8 @@ where new_peer.peer.approximate_size = estimated_size; new_peer.peer.approximate_keys = estimated_keys; *new_peer.peer.txn_ext.pessimistic_locks.write() = locks; - // The new peer is likely to become leader, send a heartbeat immediately to reduce - // client query miss. + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. new_peer.peer.heartbeat_pd(self.ctx); } @@ -3785,8 +3816,9 @@ where /// Check if merge target region is staler than the local one in kv engine. /// It should be called when target region is not in region map in memory. - /// If everything is ok, the answer should always be true because PD should ensure all target peers exist. - /// So if not, error log will be printed and return false. + /// If everything is ok, the answer should always be true because PD should + /// ensure all target peers exist. So if not, error log will be printed + /// and return false. fn is_merge_target_region_stale(&self, target_region: &metapb::Region) -> Result { let target_region_id = target_region.get_id(); let target_peer_id = util::find_peer(target_region, self.ctx.store_id()) @@ -3805,8 +3837,9 @@ where return Ok(true); } // The local target region epoch is staler than target region's. - // In the case where the peer is destroyed by receiving gc msg rather than applying conf change, - // the epoch may staler but it's legal, so check peer id to assure that. + // In the case where the peer is destroyed by receiving gc msg rather than + // applying conf change, the epoch may staler but it's legal, so check peer id + // to assure that. 
if let Some(local_target_peer_id) = util::find_peer(target_state.get_region(), self.ctx.store_id()).map(|r| r.get_id()) { @@ -3830,8 +3863,8 @@ where // There is a new peer and it's destroyed without being initialised. return Ok(true); } - // The local target peer id is greater than the one in target region, but its epoch - // is staler than target_region's. That is contradictory. + // The local target peer id is greater than the one in target region, but + // its epoch is staler than target_region's. That is contradictory. panic!("{} local target peer id {} is greater than the one in target region {}, but its epoch is staler, local target region {:?}, target region {:?}", self.fsm.peer.tag, local_target_peer_id, target_peer_id, target_state.get_region(), target_region); } @@ -3847,7 +3880,8 @@ where } } } else { - // Can't get local target peer id probably because this target peer is removed by applying conf change + // Can't get local target peer id probably because this target peer is removed + // by applying conf change error!( "the local target peer does not exist in target region state"; "target_region" => ?target_region, @@ -3980,9 +4014,10 @@ where request.set_admin_request(admin); (request, target_id) }; - // Please note that, here assumes that the unit of network isolation is store rather than - // peer. So a quorum stores of source region should also be the quorum stores of target - // region. Otherwise we need to enable proposal forwarding. + // Please note that, here assumes that the unit of network isolation is store + // rather than peer. So a quorum stores of source region should also be the + // quorum stores of target region. Otherwise we need to enable proposal + // forwarding. 
self.ctx .router .force_send( @@ -4204,8 +4239,8 @@ where d.mark_pending_remove(); } - // After the region commit merged, the region's key range is extended and the region's `safe_ts` - // should reset to `min(source_safe_ts, target_safe_ts)` + // After the region commit merged, the region's key range is extended and the + // region's `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)` let source_read_progress = meta.region_read_progress.remove(&source.get_id()).unwrap(); self.fsm .peer @@ -4222,8 +4257,8 @@ where drop(meta); // make approximate size and keys updated in time. - // the reason why follower need to update is that there is a issue that after merge - // and then transfer leader, the new leader may have stale size and keys. + // the reason why follower need to update is that there is a issue that after + // merge and then transfer leader, the new leader may have stale size and keys. self.fsm.peer.size_diff_hint = self.ctx.cfg.region_split_check_diff().0; self.fsm.peer.reset_region_buckets(); if self.fsm.peer.is_leader() { @@ -4255,9 +4290,9 @@ where /// Handle rollbacking Merge result. /// - /// If commit is 0, it means that Merge is rollbacked by a snapshot; otherwise - /// it's rollbacked by a proposal, and its value should be equal to the commit - /// index of previous PrepareMerge. + /// If commit is 0, it means that Merge is rollbacked by a snapshot; + /// otherwise it's rollbacked by a proposal, and its value should be + /// equal to the commit index of previous PrepareMerge. fn on_ready_rollback_merge(&mut self, commit: u64, region: Option) { let pending_commit = self .fsm @@ -4328,9 +4363,9 @@ where ); } // Because of the checking before proposing `PrepareMerge`, which is - // no `CompactLog` proposal between the smallest commit index and the latest index. - // If the merge succeed, all source peers are impossible in apply snapshot state - // and must be initialized. 
+ // no `CompactLog` proposal between the smallest commit index and the latest + // index. If the merge succeed, all source peers are impossible in apply + // snapshot state and must be initialized. { let meta = self.ctx.store_meta.lock().unwrap(); if meta.atomic_snap_regions.contains_key(&self.region_id()) { @@ -4400,9 +4435,9 @@ where "merge_state" => ?self.fsm.peer.pending_merge_state, ); // Because of the checking before proposing `PrepareMerge`, which is - // no `CompactLog` proposal between the smallest commit index and the latest index. - // If the merge succeed, all source peers are impossible in apply snapshot state - // and must be initialized. + // no `CompactLog` proposal between the smallest commit index and the latest + // index. If the merge succeed, all source peers are impossible in apply + // snapshot state and must be initialized. // So `maybe_destroy` must succeed here. let job = self.fsm.peer.maybe_destroy(self.ctx).unwrap(); self.handle_destroy_peer(job); @@ -4442,8 +4477,9 @@ where ); // Remove this region's snapshot region from the `pending_snapshot_regions` - // The `pending_snapshot_regions` is only used to occupy the key range, so if this - // peer is added to `region_ranges`, it can be remove from `pending_snapshot_regions` + // The `pending_snapshot_regions` is only used to occupy the key range, so if + // this peer is added to `region_ranges`, it can be remove from + // `pending_snapshot_regions` meta.pending_snapshot_regions .retain(|r| self.fsm.region_id() != r.get_id()); @@ -4486,7 +4522,8 @@ where } } else if self.fsm.peer.local_first_replicate { // This peer is uninitialized previously. - // More accurately, the `RegionLocalState` has been persisted so the data can be removed from `pending_create_peers`. + // More accurately, the `RegionLocalState` has been persisted so the data can be + // removed from `pending_create_peers`. 
let mut pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); assert_eq!( pending_create_peers.remove(&self.fsm.region_id()), @@ -4576,14 +4613,15 @@ where } } - // Update metrics only when all exec_results are finished in case the metrics is counted multiple times - // when waiting for commit merge + // Update metrics only when all exec_results are finished in case the metrics is + // counted multiple times when waiting for commit merge self.ctx.store_stat.lock_cf_bytes_written += metrics.lock_cf_written_bytes; self.ctx.store_stat.engine_total_bytes_written += metrics.written_bytes; self.ctx.store_stat.engine_total_keys_written += metrics.written_keys; } - /// Check if a request is valid if it has valid prepare_merge/commit_merge proposal. + /// Check if a request is valid if it has valid prepare_merge/commit_merge + /// proposal. fn check_merge_proposal(&self, msg: &mut RaftCmdRequest) -> Result<()> { if !msg.get_admin_request().has_prepare_merge() && !msg.get_admin_request().has_commit_merge() @@ -4678,7 +4716,8 @@ where let request = msg.get_requests(); if self.fsm.peer.force_leader.is_some() { - // in force leader state, forbid requests to make the recovery progress less error-prone + // in force leader state, forbid requests to make the recovery progress less + // error-prone if !(msg.has_admin_request() && (msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeer || msg.get_admin_request().get_cmd_type() == AdminCmdType::ChangePeerV2)) @@ -4724,8 +4763,8 @@ where .region_not_initialized += 1; return Err(Error::RegionNotInitialized(region_id)); } - // If the peer is applying snapshot, it may drop some sending messages, that could - // make clients wait for response until timeout. + // If the peer is applying snapshot, it may drop some sending messages, that + // could make clients wait for response until timeout. 
if self.fsm.peer.is_handling_snapshot() { self.ctx.raft_metrics.invalid_proposal.is_applying_snapshot += 1; // TODO: replace to a more suitable error. @@ -4742,10 +4781,10 @@ where match util::check_region_epoch(msg, self.fsm.peer.region(), true) { Err(Error::EpochNotMatch(m, mut new_regions)) => { - // Attach the region which might be split from the current region. But it doesn't - // matter if the region is not split from the current region. If the region meta - // received by the TiKV driver is newer than the meta cached in the driver, the meta is - // updated. + // Attach the region which might be split from the current region. But it + // doesn't matter if the region is not split from the current region. If the + // region meta received by the TiKV driver is newer than the meta cached in the + // driver, the meta is updated. let requested_version = msg.get_header().get_region_epoch().version; self.collect_sibling_region(requested_version, &mut new_regions); self.ctx.raft_metrics.invalid_proposal.epoch_not_match += 1; @@ -4756,7 +4795,8 @@ where } } - /// Propose batched raft commands(if any) first, then propose the given raft command. + /// Propose batched raft commands(if any) first, then propose the given raft + /// command. fn propose_raft_command( &mut self, msg: RaftCmdRequest, @@ -4773,7 +4813,8 @@ where } /// Propose the raft command directly. - /// Note that this function introduces a reorder between this command and batched commands. + /// Note that this function introduces a reorder between this command and + /// batched commands. fn propose_raft_command_internal( &mut self, mut msg: RaftCmdRequest, @@ -4827,9 +4868,9 @@ where } // Note: - // The peer that is being checked is a leader. It might step down to be a follower later. It - // doesn't matter whether the peer is a leader or not. If it's not a leader, the proposing - // command log entry can't be committed. + // The peer that is being checked is a leader. 
It might step down to be a + // follower later. It doesn't matter whether the peer is a leader or not. If + // it's not a leader, the proposing command log entry can't be committed. let mut resp = RaftCmdResponse::default(); let term = self.fsm.peer.term(); @@ -4875,7 +4916,8 @@ where collect_cnt -= 1; // For example, A is split into B, A, and then B is split into C, B. if r.get_region_epoch().version >= max_version { - // It doesn't matter if it's a false positive, as it's limited by MAX_REGIONS_IN_ERROR. + // It doesn't matter if it's a false positive, as it's limited by + // MAX_REGIONS_IN_ERROR. collect_cnt += r.get_region_epoch().version - max_version; max_version = r.get_region_epoch().version; } @@ -4896,8 +4938,9 @@ where #[allow(clippy::if_same_then_else)] fn on_raft_gc_log_tick(&mut self, force_compact: bool) { if !self.fsm.peer.is_leader() { - // `compact_cache_to` is called when apply, there is no need to call `compact_to` here, - // snapshot generating has already been cancelled when the role becomes follower. + // `compact_cache_to` is called when apply, there is no need to call + // `compact_to` here, snapshot generating has already been cancelled + // when the role becomes follower. return; } if !self.fsm.peer.get_store().is_entry_cache_empty() || !self.ctx.cfg.hibernate_regions { @@ -4907,9 +4950,10 @@ where fail_point!("on_raft_gc_log_tick", |_| {}); debug_assert!(!self.fsm.stopped); - // As leader, we would not keep caches for the peers that didn't response heartbeat in the - // last few seconds. That happens probably because another TiKV is down. In this case if we - // do not clean up the cache, it may keep growing. + // As leader, we would not keep caches for the peers that didn't response + // heartbeat in the last few seconds. That happens probably because + // another TiKV is down. In this case if we do not clean up the cache, + // it may keep growing. 
let drop_cache_duration = self.ctx.cfg.raft_heartbeat_interval() + self.ctx.cfg.raft_entry_cache_life_time.0; let cache_alive_limit = Instant::now() - drop_cache_duration; @@ -4982,11 +5026,13 @@ where { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) } else if replicated_idx < first_idx || last_idx - first_idx < 3 { - // In the current implementation one compaction can't delete all stale Raft logs. - // There will be at least 3 entries left after one compaction: + // In the current implementation one compaction can't delete all stale Raft + // logs. There will be at least 3 entries left after one compaction: + // ``` // |------------- entries needs to be compacted ----------| // [entries...][the entry at `compact_idx`][the last entry][new compaction entry] // |-------------------- entries will be left ----------------------| + // ``` self.ctx.raft_metrics.raft_log_gc_skipped.reserve_log += 1; return; } else if replicated_idx - first_idx < self.ctx.cfg.raft_log_gc_threshold @@ -5073,13 +5119,13 @@ where return; } - // When restart, the may_skip_split_check will be false. The split check will first - // check the region size, and then check whether the region should split. This - // should work even if we change the region max size. + // When restart, the may_skip_split_check will be false. The split check will + // first check the region size, and then check whether the region should split. + // This should work even if we change the region max size. // If peer says should update approximate size, update region size and check // whether the region should split. - // We assume that `may_skip_split_check` is only set true after the split check task is - // scheduled. + // We assume that `may_skip_split_check` is only set true after the split check + // task is scheduled. 
if self.fsm.peer.may_skip_split_check && self.fsm.peer.compaction_declined_bytes < self.ctx.cfg.region_split_check_diff().0 && self.fsm.peer.size_diff_hint < self.ctx.cfg.region_split_check_diff().0 @@ -5097,19 +5143,20 @@ where return; } - // When Lightning or BR is importing data to TiKV, their ingest-request may fail because of - // region-epoch not matched. So we hope TiKV do not check region size and split region during - // importing. + // When Lightning or BR is importing data to TiKV, their ingest-request may fail + // because of region-epoch not matched. So we hope TiKV do not check region size + // and split region during importing. if self.ctx.importer.get_mode() == SwitchMode::Import { return; } - // bulk insert too fast may cause snapshot stale very soon, worst case it stale before - // sending. so when snapshot is generating or sending, skip split check at most 3 times. - // There is a trade off between region size and snapshot success rate. Split check is - // triggered every 10 seconds. If a snapshot can't be generated in 30 seconds, it might be - // just too large to be generated. Split it into smaller size can help generation. check - // issue 330 for more info. + // bulk insert too fast may cause snapshot stale very soon, worst case it stale + // before sending. so when snapshot is generating or sending, skip split check + // at most 3 times. There is a trade off between region size and snapshot + // success rate. Split check is triggered every 10 seconds. If a snapshot can't + // be generated in 30 seconds, it might be just too large to be generated. Split + // it into smaller size can help generation. check issue 330 for more + // info. if self.fsm.peer.get_store().is_generating_snapshot() && self.fsm.skip_split_count < self.region_split_skip_max_count() { @@ -5541,8 +5588,8 @@ where return; } - // Do not check the bucket ranges if we want to split the region with a given key range, - // this is to avoid compatibility issues. 
+ // Do not check the bucket ranges if we want to split the region with a given + // key range, this is to avoid compatibility issues. let split_check_bucket_ranges = if !is_key_range { self.gen_bucket_range_for_update() } else { @@ -5608,8 +5655,8 @@ where } if let Some(ForceLeaderState::ForceLeader { time, .. }) = self.fsm.peer.force_leader { - // Clean up the force leader state after a timeout, since the PD recovery process may - // have been aborted for some reasons. + // Clean up the force leader state after a timeout, since the PD recovery + // process may have been aborted for some reasons. if time.saturating_elapsed() > cmp::max( self.ctx.cfg.peer_stale_state_check_interval.0, @@ -5660,8 +5707,9 @@ where // from the cluster or probably destroyed. // Meantime, D, E, F would not reach B, since it's not in the cluster anymore. // In this case, peer B would notice that the leader is missing for a long time, - // and it would check with pd to confirm whether it's still a member of the cluster. - // If not, it destroys itself as a stale peer which is removed out already. + // and it would check with pd to confirm whether it's still a member of the + // cluster. If not, it destroys itself as a stale peer which is removed out + // already. let state = self.fsm.peer.check_stale_state(self.ctx); fail_point!("peer_check_stale_state", state != StaleState::Valid, |_| {}); match state { @@ -5719,8 +5767,8 @@ where fn on_reactivate_memory_lock_tick(&mut self) { let mut pessimistic_locks = self.fsm.peer.txn_ext.pessimistic_locks.write(); - // If it is not leader, we needn't reactivate by tick. In-memory pessimistic lock will - // be enabled when this region becomes leader again. + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. // And this tick is currently only used for the leader transfer failure case. 
if !self.fsm.peer.is_leader() || pessimistic_locks.status != LocksStatus::TransferringLeader { @@ -5729,8 +5777,8 @@ where self.fsm.reactivate_memory_lock_ticks += 1; let transferring_leader = self.fsm.peer.raft_group.raft.lead_transferee.is_some(); - // `lead_transferee` is not set immediately after the lock status changes. So, we need - // the tick count condition to avoid reactivating too early. + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. if !transferring_leader && self.fsm.reactivate_memory_lock_ticks >= self.ctx.cfg.reactive_memory_lock_timeout_tick @@ -5839,8 +5887,8 @@ where Some(self.fsm.peer.approximate_size.unwrap_or_default() + size); self.fsm.peer.approximate_keys = Some(self.fsm.peer.approximate_keys.unwrap_or_default() + keys); - // The ingested file may be overlapped with the data in engine, so we need to check it - // again to get the accurate value. + // The ingested file may be overlapped with the data in engine, so we need to + // check it again to get the accurate value. self.fsm.peer.may_skip_split_check = false; if self.fsm.peer.is_leader() { self.on_pd_heartbeat_tick(); @@ -5849,13 +5897,13 @@ where } fn on_transfer_leader(&mut self, term: u64) { - // If the term has changed between proposing and executing the TransferLeader request, - // ignore it because this request may be stale. + // If the term has changed between proposing and executing the TransferLeader + // request, ignore it because this request may be stale. if term != self.fsm.peer.term() { return; } - // As the leader can propose the TransferLeader request successfully, the disk of - // the leader is probably not full. + // As the leader can propose the TransferLeader request successfully, the disk + // of the leader is probably not full. 
self.fsm.peer.execute_transfer_leader( self.ctx, self.fsm.peer.leader_id(), @@ -5865,7 +5913,8 @@ where self.fsm.has_ready = true; } - /// Verify and store the hash to state. return true means the hash has been stored successfully. + /// Verify and store the hash to state. return true means the hash has been + /// stored successfully. // TODO: Consider context in the function. fn verify_and_store_hash( &mut self, @@ -5915,8 +5964,9 @@ where if self.fsm.peer.consistency_state.index != INVALID_INDEX && !self.fsm.peer.consistency_state.hash.is_empty() { - // Maybe computing is too slow or computed result is dropped due to channel full. - // If computing is too slow, miss count will be increased twice. + // Maybe computing is too slow or computed result is dropped due to channel + // full. If computing is too slow, miss count will be increased + // twice. REGION_HASH_COUNTER.verify.miss.inc(); warn!( "hash belongs to wrong index, skip."; @@ -5939,15 +5989,17 @@ where } } -/// Checks merge target, returns whether the source peer should be destroyed and whether the source peer is -/// merged to this target peer. +/// Checks merge target, returns whether the source peer should be destroyed and +/// whether the source peer is merged to this target peer. /// /// It returns (`can_destroy`, `merge_to_this_peer`). /// -/// `can_destroy` is true when there is a network isolation which leads to a follower of a merge target -/// Region's log falls behind and then receive a snapshot with epoch version after merge. +/// `can_destroy` is true when there is a network isolation which leads to a +/// follower of a merge target Region's log falls behind and then receive a +/// snapshot with epoch version after merge. /// -/// `merge_to_this_peer` is true when `can_destroy` is true and the source peer is merged to this target peer. +/// `merge_to_this_peer` is true when `can_destroy` is true and the source peer +/// is merged to this target peer. 
pub fn maybe_destroy_source( meta: &StoreMeta, target_region_id: u64, @@ -5964,8 +6016,8 @@ pub fn maybe_destroy_source( region_epoch, target_region.get_region_epoch(), ); - // The target peer will move on, namely, it will apply a snapshot generated after merge, - // so destroy source peer. + // The target peer will move on, namely, it will apply a snapshot generated + // after merge, so destroy source peer. if region_epoch.get_version() > target_region.get_region_epoch().get_version() { return ( true, @@ -5975,7 +6027,8 @@ pub fn maybe_destroy_source( .get_id(), ); } - // Wait till the target peer has caught up logs and source peer will be destroyed at that time. + // Wait till the target peer has caught up logs and source peer will be + // destroyed at that time. return (false, false); } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index f92d08dd3a4..635ff2c6693 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -124,25 +124,30 @@ pub struct StoreMeta { pub regions: HashMap, /// region_id -> reader pub readers: HashMap, - /// `MsgRequestPreVote`, `MsgRequestVote` or `MsgAppend` messages from newly split Regions shouldn't be - /// dropped if there is no such Region in this store now. So the messages are recorded temporarily and - /// will be handled later. + /// `MsgRequestPreVote`, `MsgRequestVote` or `MsgAppend` messages from newly + /// split Regions shouldn't be dropped if there is no such Region in this + /// store now. So the messages are recorded temporarily and will be handled + /// later. pub pending_msgs: RingQueue, /// The regions with pending snapshots. pub pending_snapshot_regions: Vec, - /// A marker used to indicate the peer of a Region has received a merge target message and waits to be destroyed. 
- /// target_region_id -> (source_region_id -> merge_target_region) + /// A marker used to indicate the peer of a Region has received a merge + /// target message and waits to be destroyed. target_region_id -> + /// (source_region_id -> merge_target_region) pub pending_merge_targets: HashMap>, - /// An inverse mapping of `pending_merge_targets` used to let source peer help target peer to clean up related entry. - /// source_region_id -> target_region_id + /// An inverse mapping of `pending_merge_targets` used to let source peer + /// help target peer to clean up related entry. source_region_id -> + /// target_region_id pub targets_map: HashMap, - /// `atomic_snap_regions` and `destroyed_region_for_snap` are used for making destroy overlapped regions - /// and apply snapshot atomically. + /// `atomic_snap_regions` and `destroyed_region_for_snap` are used for + /// making destroy overlapped regions and apply snapshot atomically. /// region_id -> wait_destroy_regions_map(source_region_id -> is_ready) - /// A target peer must wait for all source peer to ready before applying snapshot. + /// A target peer must wait for all source peer to ready before applying + /// snapshot. pub atomic_snap_regions: HashMap>, /// source_region_id -> need_atomic - /// Used for reminding the source peer to switch to ready in `atomic_snap_regions`. + /// Used for reminding the source peer to switch to ready in + /// `atomic_snap_regions`. pub destroyed_region_for_snap: HashMap, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, @@ -191,7 +196,8 @@ impl StoreMeta { /// end_key > file.smallestkey /// start_key <= file.largestkey pub fn update_overlap_damaged_ranges(&mut self, fname: &str, start: &[u8], end: &[u8]) -> bool { - // `region_ranges` is promised to have no overlap so just check the first region. + // `region_ranges` is promised to have no overlap so just check the first + // region. 
if let Some((_, id)) = self .region_ranges .range((Excluded(start.to_owned()), Unbounded::>)) @@ -471,11 +477,12 @@ where pub feature_gate: FeatureGate, /// region_id -> (peer_id, is_splitting) /// Used for handling race between splitting and creating new peer. - /// An uninitialized peer can be replaced to the one from splitting iff they are exactly the same peer. + /// An uninitialized peer can be replaced to the one from splitting iff they + /// are exactly the same peer. /// /// WARNING: - /// To avoid deadlock, if you want to use `store_meta` and `pending_create_peers` together, - /// the lock sequence MUST BE: + /// To avoid deadlock, if you want to use `store_meta` and + /// `pending_create_peers` together, the lock sequence MUST BE: /// 1. lock the store_meta. /// 2. lock the pending_create_peers. pub pending_create_peers: Arc>>, @@ -485,8 +492,8 @@ where pub timer: SteadyTimer, pub trans: T, /// WARNING: - /// To avoid deadlock, if you want to use `store_meta` and `global_replication_state` together, - /// the lock sequence MUST BE: + /// To avoid deadlock, if you want to use `store_meta` and + /// `global_replication_state` together, the lock sequence MUST BE: /// 1. lock the store_meta. /// 2. lock the global_replication_state. pub global_replication_state: Arc>, @@ -895,7 +902,8 @@ impl PollHandler, St let mut delegate = PeerFsmDelegate::new(peer, &mut self.poll_ctx); delegate.handle_msgs(&mut self.peer_msg_buf); - // No readiness is generated and using sync write, skipping calling ready and release early. + // No readiness is generated and using sync write, skipping calling ready and + // release early. if !delegate.collect_ready() && self.poll_ctx.sync_write_worker.is_some() { if let HandleResult::StopAt { skip_end, .. 
} = &mut handle_result { *skip_end = true; @@ -1805,8 +1813,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } else { let mut need_gc_msg = util::is_vote_msg(msg.get_message()); if msg.has_extra_msg() { - // A learner can't vote so it sends the check-stale-peer msg to others to find out whether - // it is removed due to conf change or merge. + // A learner can't vote so it sends the check-stale-peer msg to others to find + // out whether it is removed due to conf change or merge. need_gc_msg |= msg.get_extra_msg().get_type() == ExtraMessageType::MsgCheckStalePeer; // For backward compatibility @@ -1834,8 +1842,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER return Ok(CheckMsgStatus::DropMsg); } // A tombstone peer may not apply the conf change log which removes itself. - // In this case, the local epoch is stale and the local peer can be found from region. - // We can compare the local peer id with to_peer_id to verify whether it is correct to create a new peer. + // In this case, the local epoch is stale and the local peer can be found from + // region. We can compare the local peer id with to_peer_id to verify whether it + // is correct to create a new peer. if let Some(local_peer_id) = util::find_peer(region, self.ctx.store_id()).map(|r| r.get_id()) { @@ -1980,7 +1989,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } let res = self.maybe_create_peer_internal(region_id, msg, is_local_first); - // If failed, i.e. Err or Ok(false), remove this peer data from `pending_create_peers`. + // If failed, i.e. Err or Ok(false), remove this peer data from + // `pending_create_peers`. 
if res.as_ref().map_or(true, |b| !*b) && is_local_first { let mut pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); if let Some(status) = pending_create_peers.get(®ion_id) { @@ -2021,13 +2031,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let pending_create_peers = self.ctx.pending_create_peers.lock().unwrap(); match pending_create_peers.get(®ion_id) { Some(status) if *status == (msg.get_to_peer().get_id(), false) => (), - // If changed, it means this peer has been/will be replaced from the new one from splitting. + // If changed, it means this peer has been/will be replaced from the new one from + // splitting. _ => return Ok(false), } - // Note that `StoreMeta` lock is held and status is (peer_id, false) in `pending_create_peers` now. - // If this peer is created from splitting latter and then status in `pending_create_peers` is changed, - // that peer creation in `on_ready_split_region` must be executed **after** current peer creation - // because of the `StoreMeta` lock. + // Note that `StoreMeta` lock is held and status is (peer_id, false) + // in `pending_create_peers` now. If this peer is created from + // splitting latter and then status in `pending_create_peers` is + // changed, that peer creation in `on_ready_split_region` must be + // executed **after** current peer creation because of the + // `StoreMeta` lock. } if meta.overlap_damaged_range( @@ -2096,8 +2109,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER is_overlapped = true; if msg.get_region_epoch().get_version() > exist_region.get_region_epoch().get_version() { - // If new region's epoch version is greater than exist region's, the exist region - // may has been merged/splitted already. + // If new region's epoch version is greater than exist region's, the exist + // region may has been merged/splitted already. 
let _ = self.ctx.router.force_send( exist_region.get_id(), PeerMsg::CasualMessage(CasualMessage::RegionOverlapped), @@ -2538,9 +2551,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } } - // When there is an import job running, the region which this sst belongs may has not been - // split from the origin region because the apply thread is so busy that it can not apply - // SplitRequest as soon as possible. So we can not delete this sst file. + // When there is an import job running, the region which this sst belongs may + // has not been split from the origin region because the apply thread is so busy + // that it can not apply SplitRequest as soon as possible. So we can not + // delete this sst file. if !validate_ssts.is_empty() && self.ctx.importer.get_mode() != SwitchMode::Import { let task = CleanupSstTask::ValidateSst { ssts: validate_ssts, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index e3820a6d3ee..e552229aa0c 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -81,9 +81,9 @@ pub type TestCallback = Box; /// Variants of callbacks for `Msg`. /// - `Read`: a callback for read only requests including `StatusRequest`, -/// `GetRequest` and `SnapRequest` +/// `GetRequest` and `SnapRequest` /// - `Write`: a callback for write only requests including `AdminRequest` -/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. pub enum Callback { /// No callback. None, @@ -92,12 +92,14 @@ pub enum Callback { /// Write callback. Write { cb: WriteCallback, - /// `proposed_cb` is called after a request is proposed to the raft group successfully. - /// It's used to notify the caller to move on early because it's very likely the request - /// will be applied to the raftstore. + /// `proposed_cb` is called after a request is proposed to the raft + /// group successfully. 
It's used to notify the caller to move on early + /// because it's very likely the request will be applied to the + /// raftstore. proposed_cb: Option, - /// `committed_cb` is called after a request is committed and before it's being applied, and - /// it's guaranteed that the request will be successfully applied soon. + /// `committed_cb` is called after a request is committed and before + /// it's being applied, and it's guaranteed that the request will be + /// successfully applied soon. committed_cb: Option, trackers: SmallVec<[TimeTracker; 4]>, }, @@ -298,18 +300,20 @@ pub enum MergeResultKind { /// Its target peer applys `CommitMerge` log. FromTargetLog, /// Its target peer receives snapshot. - /// In step 1, this peer should mark `pending_move` is true and destroy its apply fsm. - /// Then its target peer will remove this peer data and apply snapshot atomically. + /// In step 1, this peer should mark `pending_move` is true and destroy its + /// apply fsm. Then its target peer will remove this peer data and apply + /// snapshot atomically. FromTargetSnapshotStep1, /// In step 2, this peer should destroy its peer fsm. FromTargetSnapshotStep2, - /// This peer is no longer needed by its target peer so it can be destroyed by itself. - /// It happens if and only if its target peer has been removed by conf change. + /// This peer is no longer needed by its target peer so it can be destroyed + /// by itself. It happens if and only if its target peer has been removed by + /// conf change. Stale, } -/// Some significant messages sent to raftstore. Raftstore will dispatch these messages to Raft -/// groups to update some important internal status. +/// Some significant messages sent to raftstore. Raftstore will dispatch these +/// messages to Raft groups to update some important internal status. #[derive(Debug)] pub enum SignificantMsg where @@ -389,7 +393,8 @@ pub enum CasualMessage { hash: Vec, }, - /// Approximate size of target region. 
This message can only be sent by split-check thread. + /// Approximate size of target region. This message can only be sent by + /// split-check thread. RegionApproximateSize { size: u64, }, @@ -578,15 +583,16 @@ pub enum PeerMsg { /// leader of the target raft group. If it's failed to be sent, callback /// usually needs to be called before dropping in case of resource leak. RaftCommand(RaftCommand), - /// Tick is periodical task. If target peer doesn't exist there is a potential - /// that the raft node will not work anymore. + /// Tick is periodical task. If target peer doesn't exist there is a + /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. ApplyRes { res: ApplyTaskRes, }, - /// Message that can't be lost but rarely created. If they are lost, real bad - /// things happen like some peers will be considered dead in the group. + /// Message that can't be lost but rarely created. If they are lost, real + /// bad things happen like some peers will be considered dead in the + /// group. SignificantMsg(SignificantMsg), /// Start the FSM. Start, @@ -636,8 +642,9 @@ impl fmt::Debug for PeerMsg { } impl PeerMsg { - /// For some specific kind of messages, it's actually acceptable if failed to send it by - /// `significant_send`. This function determine if the current message is acceptable to fail. + /// For some specific kind of messages, it's actually acceptable if failed + /// to send it by `significant_send`. This function determine if the + /// current message is acceptable to fail. pub fn is_send_failure_ignorable(&self) -> bool { matches!( self, @@ -656,8 +663,8 @@ where invalid_ssts: Vec, }, - // Clear region size and keys for all regions in the range, so we can force them to re-calculate - // their size later. + // Clear region size and keys for all regions in the range, so we can force them to + // re-calculate their size later. 
ClearRegionSizeInRange { start_key: Vec, end_key: Vec, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 2bcaefff762..62721b5c1c9 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -296,8 +296,9 @@ impl ProposedAdminCmd { } struct CmdEpochChecker { - // Although it's a deque, because of the characteristics of the settings from `admin_cmd_epoch_lookup`, - // the max size of admin cmd is 2, i.e. split/merge and change peer. + // Although it's a deque, because of the characteristics of the settings from + // `admin_cmd_epoch_lookup`, the max size of admin cmd is 2, i.e. split/merge and change + // peer. proposed_admin_cmd: VecDeque>, term: u64, } @@ -324,10 +325,11 @@ impl CmdEpochChecker { } } - /// Check if the proposal can be proposed on the basis of its epoch and previous proposed admin cmds. + /// Check if the proposal can be proposed on the basis of its epoch and + /// previous proposed admin cmds. /// - /// Returns None if passing the epoch check, otherwise returns a index which is the last - /// admin cmd index conflicted with this proposal. + /// Returns None if passing the epoch check, otherwise returns a index which + /// is the last admin cmd index conflicted with this proposal. fn propose_check_epoch(&mut self, req: &RaftCmdRequest, term: u64) -> Option { self.maybe_update_term(term); let (check_ver, check_conf_ver) = if !req.has_admin_request() { @@ -473,12 +475,13 @@ pub struct ReadyResult { #[derive(Debug)] /// ForceLeader process would be: -/// 1. If it's hibernated, enter wait ticks state, and wake up the peer -/// 2. Enter pre force leader state, become candidate and send request vote to all peers -/// 3. Wait for the responses of the request vote, no reject should be received. -/// 4. Enter force leader state, become leader without leader lease -/// 5. Execute recovery plan(some remove-peer commands) -/// 6. 
After the plan steps are all applied, exit force leader state +/// - If it's hibernated, enter wait ticks state, and wake up the peer +/// - Enter pre force leader state, become candidate and send request vote to +/// all peers +/// - Wait for the responses of the request vote, no reject should be received. +/// - Enter force leader state, become leader without leader lease +/// - Execute recovery plan(some remove-peer commands) +/// - After the plan steps are all applied, exit force leader state pub enum ForceLeaderState { WaitTicks { syncer: UnsafeRecoveryForceLeaderSyncer, @@ -495,32 +498,34 @@ pub enum ForceLeaderState { }, } -// Following shared states are used while reporting to PD for unsafe recovery and shared among -// all the regions per their life cycle. +// Following shared states are used while reporting to PD for unsafe recovery +// and shared among all the regions per their life cycle. // The work flow is like: -// 1. report phase -// start_unsafe_recovery_report -// -> broadcast wait-apply commands -// -> wait for all the peers' apply indices meet their targets -// -> broadcast fill out report commands -// -> wait for all the peers fill out the reports for themselves -// -> send a store report (through store heartbeat) -// 2. force leader phase -// dispatch force leader commands -// -> wait for all the peers that received the command become force leader -// -> start_unsafe_recovery_report -// 3. 
plan execution phase -// dispatch recovery plans -// -> wait for all the creates, deletes and demotes to finish, for the demotes, -// procedures are: -// -> exit joint state if it is already in joint state -// -> demote failed voters, and promote self to be a voter if it is a learner -// -> exit joint state -// -> start_unsafe_recovery_report - -// Intends to use RAII to sync unsafe recovery procedures between peers, in addition to that, -// it uses a closure to avoid having a raft router as a member variable, which is statically -// dispatched, thus needs to propagate the generics everywhere. +// 1. report phase +// - start_unsafe_recovery_report +// - broadcast wait-apply commands +// - wait for all the peers' apply indices meet their targets +// - broadcast fill out report commands +// - wait for all the peers fill out the reports for themselves +// - send a store report (through store heartbeat) +// 2. force leader phase +// - dispatch force leader commands +// - wait for all the peers that received the command become force leader +// - start_unsafe_recovery_report +// 3. plan execution phase +// - dispatch recovery plans +// - wait for all the creates, deletes and demotes to finish, for the +// demotes, procedures are: +// - exit joint state if it is already in joint state +// - demote failed voters, and promote self to be a voter if it is a +// learner +// - exit joint state +// - start_unsafe_recovery_report +// +// Intends to use RAII to sync unsafe recovery procedures between peers, in +// addition to that, it uses a closure to avoid having a raft router as a member +// variable, which is statically dispatched, thus needs to propagate the +// generics everywhere. pub struct InvokeClosureOnDrop(Box); impl fmt::Debug for InvokeClosureOnDrop { @@ -732,15 +737,17 @@ where pub should_wake_up: bool, /// Whether this peer is destroyed asynchronously. /// If it's true, - /// 1. when merging, its data in storeMeta will be removed early by the target peer. - /// 2. 
all read requests must be rejected. + /// - when merging, its data in storeMeta will be removed early by the + /// target peer. + /// - all read requests must be rejected. pub pending_remove: bool, /// Force leader state is only used in online recovery when the majority of - /// peers are missing. In this state, it forces one peer to become leader out - /// of accordance with Raft election rule, and forbids any read/write proposals. - /// With that, we can further propose remove failed-nodes conf-change, to make - /// the Raft group forms majority and works normally later on. + /// peers are missing. In this state, it forces one peer to become leader + /// out of accordance with Raft election rule, and forbids any + /// read/write proposals. With that, we can further propose remove + /// failed-nodes conf-change, to make the Raft group forms majority and + /// works normally later on. /// /// For details, see the comment of `ForceLeaderState`. pub force_leader: Option, @@ -757,16 +764,17 @@ where /// The count of deleted keys since last reset. delete_keys_hint: u64, /// An inaccurate difference in region size after compaction. - /// It is used to trigger check split to update approximate size and keys after space reclamation - /// of deleted entries. + /// It is used to trigger check split to update approximate size and keys + /// after space reclamation of deleted entries. pub compaction_declined_bytes: u64, /// Approximate size of the region. pub approximate_size: Option, /// Approximate keys of the region. pub approximate_keys: Option, - /// Whether this region has scheduled a split check task. If we just splitted - /// the region or ingested one file which may be overlapped with the existed data, - /// reset the flag so that the region can be splitted again. + /// Whether this region has scheduled a split check task. 
If we just + /// splitted the region or ingested one file which may be overlapped + /// with the existed data, reset the flag so that the region can be + /// splitted again. pub may_skip_split_check: bool, /// The state for consistency check. @@ -776,7 +784,8 @@ where pub pending_request_snapshot_count: Arc, /// The index of last scheduled committed raft log. pub last_applying_idx: u64, - /// The index of last compacted raft log. It is used for the next compact log task. + /// The index of last compacted raft log. It is used for the next compact + /// log task. pub last_compacted_idx: u64, /// The index of the latest urgent proposal index. last_urgent_proposal_idx: u64, @@ -788,9 +797,10 @@ where pub raft_log_size_hint: u64, /// The write fence index. - /// If there are pessimistic locks, PrepareMerge can be proposed after applying to - /// this index. When a pending PrepareMerge exists, no more write commands should be proposed. - /// This avoids proposing pessimistic locks that are already deleted before PrepareMerge. + /// If there are pessimistic locks, PrepareMerge can be proposed after + /// applying to this index. When a pending PrepareMerge exists, no more + /// write commands should be proposed. This avoids proposing pessimistic + /// locks that are already deleted before PrepareMerge. pub prepare_merge_fence: u64, pub pending_prepare_merge: Option, @@ -816,8 +826,8 @@ where pub replication_mode_version: u64, /// The required replication state at current version. pub dr_auto_sync_state: DrAutoSyncState, - /// A flag that caches sync state. It's set to true when required replication - /// state is reached for current region. + /// A flag that caches sync state. It's set to true when required + /// replication state is reached for current region. 
pub replication_sync: bool, /// The known newest conf version and its corresponding peer list @@ -1092,9 +1102,10 @@ where pub fn maybe_append_merge_entries(&mut self, merge: &CommitMergeRequest) -> Option { let mut entries = merge.get_entries(); if entries.is_empty() { - // Though the entries is empty, it is possible that one source peer has caught up the logs - // but commit index is not updated. If other source peers are already destroyed, so the raft - // group will not make any progress, namely the source peer can not get the latest commit index anymore. + // Though the entries is empty, it is possible that one source peer has caught + // up the logs but commit index is not updated. If other source peers are + // already destroyed, so the raft group will not make any progress, namely the + // source peer can not get the latest commit index anymore. // Here update the commit index to let source apply rest uncommitted entries. return if merge.get_commit() > self.raft_group.raft.raft_log.committed { self.raft_group.raft.raft_log.commit_to(merge.get_commit()); @@ -1113,9 +1124,9 @@ where "commit_index" => self.raft_group.raft.raft_log.committed, ); if log_idx < self.raft_group.raft.raft_log.committed { - // There are maybe some logs not included in CommitMergeRequest's entries, like CompactLog, - // so the commit index may exceed the last index of the entires from CommitMergeRequest. - // If that, no need to append + // There are maybe some logs not included in CommitMergeRequest's entries, like + // CompactLog, so the commit index may exceed the last index of the entires from + // CommitMergeRequest. If that, no need to append if self.raft_group.raft.raft_log.committed - log_idx >= entries.len() as u64 { return None; } @@ -1126,9 +1137,10 @@ where let last_log = entries.last().unwrap(); if last_log.term > self.term() { - // Hack: In normal flow, when leader sends the entries, it will use a term that's not less - // than the last log term. 
And follower will update its states correctly. For merge, we append - // the log without raft, so we have to take care of term explicitly to get correct metadata. + // Hack: In normal flow, when leader sends the entries, it will use a term + // that's not less than the last log term. And follower will update its states + // correctly. For merge, we append the log without raft, so we have to take care + // of term explicitly to get correct metadata. info!( "become follower for new logs"; "new_log_term" => last_log.term, @@ -1149,7 +1161,8 @@ where .map(|(_, last_index)| last_index) } - /// Tries to destroy itself. Returns a job (if needed) to do more cleaning tasks. + /// Tries to destroy itself. Returns a job (if needed) to do more cleaning + /// tasks. pub fn maybe_destroy(&mut self, ctx: &PollContext) -> Option { if self.pending_remove { info!( @@ -1193,15 +1206,15 @@ where // There is no applying snapshot or snapshot is canceled so the `apply_snap_ctx` // should be set to None. - // 1. If the snapshot is canceled, the `apply_snap_ctx` should be None. - // Remember the snapshot should not be canceled and the context should - // be None only after applying snapshot in normal case. But here is safe - // becasue this peer is about to destroy and `pending_remove` will be true, - // namely no more ready will be fetched. - // 2. If there is no applying snapshot, the `apply_snap_ctx` should also be None. - // It's possible that the snapshot was canceled successfully before but - // `cancel_applying_snap` returns false. If so, at this time, `apply_snap_ctx` - // is Some and should be set to None. + // - If the snapshot is canceled, the `apply_snap_ctx` should be None. Remember + // the snapshot should not be canceled and the context should be None only + // after applying snapshot in normal case. But here is safe because this peer + // is about to destroy and `pending_remove` will be true, namely no more ready + // will be fetched. 
+ // - If there is no applying snapshot, the `apply_snap_ctx` should also be None. + // It's possible that the snapshot was canceled successfully before but + // `cancel_applying_snap` returns false. If so, at this time, `apply_snap_ctx` + // is Some and should be set to None. self.apply_snap_ctx = None; self.pending_remove = true; @@ -1257,14 +1270,15 @@ where panic!("{} unexpected pending states {:?}", self.tag, status); } } else { - // The status is inserted when it's created. It will be removed in following cases: - // 1. By appy worker as it fails to split due to region state key. This is - // impossible to reach this code path because the delete write batch is not - // persisted yet. - // 2. By store fsm as it fails to create peer, which is also invalid obviously. - // 3. By peer fsm after persisting snapshot, then it should be initialized. - // 4. By peer fsm after split. - // 5. By peer fsm when destroy, which should go the above branch instead. + // The status is inserted when it's created. It will be removed in following + // cases: + // - By apply worker as it fails to split due to region state key. This is + // impossible to reach this code path because the delete write batch is not + // persisted yet. + // - By store fsm as it fails to create peer, which is also invalid obviously. + // - By peer fsm after persisting snapshot, then it should be initialized. + // - By peer fsm after split. + // - By peer fsm when destroy, which should go the above branch instead. (None, false) } } else { @@ -1274,16 +1288,16 @@ where // Set Tombstone state explicitly let mut kv_wb = engines.kv.write_batch(); let mut raft_wb = engines.raft.log_batch(1024); - // Raft log gc should be flushed before being destroyed, so last_compacted_idx has to be - // the minimal index that may still have logs. + // Raft log gc should be flushed before being destroyed, so last_compacted_idx + // has to be the minimal index that may still have logs. 
let last_compacted_idx = self.last_compacted_idx; self.mut_store() .clear_meta(last_compacted_idx, &mut kv_wb, &mut raft_wb)?; - // StoreFsmDelegate::check_msg use both epoch and region peer list to check whether - // a message is targing a staled peer. But for an uninitialized peer, both epoch and - // peer list are empty, so a removed peer will be created again. Saving current peer - // into the peer list of region will fix this problem. + // StoreFsmDelegate::check_msg use both epoch and region peer list to check + // whether a message is targeting a staled peer. But for an uninitialized peer, + // both epoch and peer list are empty, so a removed peer will be created again. + // Saving current peer into the peer list of region will fix this problem. if !self.get_store().is_initialized() { region.mut_peers().push(self.peer.clone()); } @@ -1371,8 +1385,8 @@ where let last_index = self.raft_group.raft.raft_log.last_index(); for (id, pr) in status.progress.unwrap().iter() { // Even a recent inactive node is also considered. If we put leader into sleep, - // followers or learners may not sync its logs for a long time and become unavailable. - // We choose availability instead of performance in this case. + // followers or learners may not sync its logs for a long time and become + // unavailable. We choose availability instead of performance in this case. if *id == self.peer.get_id() { continue; } @@ -1470,13 +1484,13 @@ where ) { if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() { - // Epoch version changed, disable read on the localreader for this region. + // Epoch version changed, disable read on the local reader for this region. self.leader_lease.expire_remote_lease(); } self.mut_store().set_region(region.clone()); let progress = ReadProgress::region(region); - // Always update read delegate's region to avoid stale region info after a follower - // becoming a leader. 
+ // Always update read delegate's region to avoid stale region info after a + // follower becoming a leader. self.maybe_update_read_progress(reader, progress); // Update leader info @@ -1535,7 +1549,8 @@ where self.apply_snap_ctx.is_some() || self.get_store().is_applying_snapshot() } - /// Returns `true` if the raft group has replicated a snapshot but not committed it yet. + /// Returns `true` if the raft group has replicated a snapshot but not + /// committed it yet. #[inline] pub fn has_pending_snapshot(&self) -> bool { self.get_pending_snapshot().is_some() @@ -1875,11 +1890,13 @@ where // 1. Current leader hasn't communicated with this peer. // 2. This peer does not exist yet(maybe it is created but not initialized) // - // The correctness of region merge depends on the fact that all target peers must exist during merging. - // (PD rely on `pending_peers` to check whether all target peers exist) + // The correctness of region merge depends on the fact that all target peers + // must exist during merging. (PD rely on `pending_peers` to check whether all + // target peers exist) // // So if the `matched` is 0, it must be a pending peer. - // It can be ensured because `truncated_index` must be greater than `RAFT_INIT_LOG_INDEX`(5). + // It can be ensured because `truncated_index` must be greater than + // `RAFT_INIT_LOG_INDEX`(5). if progress.matched < truncated_idx { if let Some(p) = self.get_peer_from_cache(id) { pending_peers.push(p); @@ -1999,8 +2016,8 @@ where // Updates the `leader_missing_time` according to the current state. // // If we are checking this it means we suspect the leader might be missing. - // Mark down the time when we are called, so we can check later if it's been longer than it - // should be. + // Mark down the time when we are called, so we can check later if it's been + // longer than it should be. 
match self.leader_missing_time { None => { self.leader_missing_time = Instant::now().into(); @@ -2102,27 +2119,30 @@ where self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); } - /// Correctness depends on the order between calling this function and notifying other peers - /// the new commit index. - /// It is due to the interaction between lease and split/merge.(details are decribed below) + /// Correctness depends on the order between calling this function and + /// notifying other peers the new commit index. + /// It is due to the interaction between lease and split/merge.(details are + /// described below) /// - /// Note that in addition to the hearbeat/append msg, the read index response also can notify - /// other peers the new commit index. There are three place where TiKV handles read index resquest. - /// The first place is in raft-rs, so it's like hearbeat/append msg, call this function and - /// then send the response. The second place is in `Step`, we should use the commit index - /// of `PeerStorage` which is the greatest commit index that can be observed outside. - /// The third place is in `read_index`, handle it like the second one. + /// Note that in addition to the heartbeat/append msg, the read index + /// response also can notify other peers the new commit index. There are + /// three place where TiKV handles read index request. The first place is in + /// raft-rs, so it's like heartbeat/append msg, call this function and then + /// send the response. The second place is in `Step`, we should use the + /// commit index of `PeerStorage` which is the greatest commit index that + /// can be observed outside. The third place is in `read_index`, handle it + /// like the second one. 
fn on_leader_commit_idx_changed(&mut self, pre_commit_index: u64, commit_index: u64) { if commit_index <= pre_commit_index || !self.is_leader() { return; } - // The admin cmds in `CmdEpochChecker` are proposed by the current leader so we can - // use it to get the split/prepare-merge cmds which was committed just now. + // The admin cmds in `CmdEpochChecker` are proposed by the current leader so we + // can use it to get the split/prepare-merge cmds which was committed just now. - // BatchSplit and Split cmd are mutually exclusive because they both change epoch's - // version so only one of them can be proposed and the other one will be rejected - // by `CmdEpochChecker`. + // BatchSplit and Split cmd are mutually exclusive because they both change + // epoch's version so only one of them can be proposed and the other one will be + // rejected by `CmdEpochChecker`. let last_split_idx = self .cmd_epoch_checker .last_cmd_index(AdminCmdType::BatchSplit) @@ -2179,7 +2199,8 @@ where // by apply worker. So we have to wait here. // Please note that commit_index can't be used here. When applying a snapshot, // a stale heartbeat can make the leader think follower has already applied - // the snapshot, and send remaining log entries, which may increase commit_index. + // the snapshot, and send remaining log entries, which may increase + // commit_index. // TODO: add more test self.last_applying_idx == self.get_store().applied_index() // Requesting snapshots also triggers apply workers to write @@ -2193,8 +2214,8 @@ where fn ready_to_handle_read(&self) -> bool { // TODO: It may cause read index to wait a long time. - // There may be some values that are not applied by this leader yet but the old leader, - // if applied_term isn't equal to current term. + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. 
self.get_store().applied_term() == self.term() // There may be stale read if the old leader splits really slow, // the new region may already elected a new leader while @@ -2209,9 +2230,9 @@ where fn ready_to_handle_unsafe_replica_read(&self, read_index: u64) -> bool { // Wait until the follower applies all values before the read. There is still a - // problem if the leader applies fewer values than the follower, the follower read - // could get a newer value, and after that, the leader may read a stale value, - // which violates linearizability. + // problem if the leader applies fewer values than the follower, the follower + // read could get a newer value, and after that, the leader may read a stale + // value, which violates linearizability. self.get_store().applied_index() >= read_index // If it is in pending merge state(i.e. applied PrepareMerge), the data may be stale. // TODO: Add a test to cover this case @@ -2271,17 +2292,19 @@ where /// Returns whether it's valid to handle raft ready. /// /// The snapshot process order would be: - /// 1. Get the snapshot from the ready - /// 2. Wait for the notify of persisting this ready through `Peer::on_persist_ready` - /// 3. Schedule the snapshot task to region worker through `schedule_applying_snapshot` - /// 4. Wait for applying snapshot to complete(`check_snap_status`) + /// - Get the snapshot from the ready + /// - Wait for the notify of persisting this ready through + /// `Peer::on_persist_ready` + /// - Schedule the snapshot task to region worker through + /// `schedule_applying_snapshot` + /// - Wait for applying snapshot to complete(`check_snap_status`) /// Then it's valid to handle the next ready. fn check_snap_status(&mut self, ctx: &mut PollContext) -> bool { if let Some(snap_ctx) = self.apply_snap_ctx.as_ref() { if !snap_ctx.scheduled { // There is a snapshot from ready but it is not scheduled because the ready has - // not been persisted yet. 
We should wait for the notification of persisting ready - // and do not get a new ready. + // not been persisted yet. We should wait for the notification of persisting + // ready and do not get a new ready. return false; } } @@ -2334,7 +2357,7 @@ where if self.unsafe_recovery_state.is_some() { debug!("unsafe recovery finishes applying a snapshot"); - self.unsafe_recovery_maybe_finish_wait_apply(/*force=*/ false); + self.unsafe_recovery_maybe_finish_wait_apply(/* force= */ false); } } // If `apply_snap_ctx` is none, it means this snapshot does not @@ -2402,9 +2425,9 @@ where } let meta = ctx.store_meta.lock().unwrap(); - // For merge process, the stale source peer is destroyed asynchronously when applying - // snapshot or creating new peer. So here checks whether there is any overlap, if so, - // wait and do not handle raft ready. + // For merge process, the stale source peer is destroyed asynchronously when + // applying snapshot or creating new peer. So here checks whether there is any + // overlap, if so, wait and do not handle raft ready. if let Some(wait_destroy_regions) = meta.atomic_snap_regions.get(&self.region_id) { for (source_region_id, is_ready) in wait_destroy_regions { if !is_ready { @@ -2596,8 +2619,9 @@ where last.raft_msgs.push(persisted_msgs); } } else { - // If this ready don't need to be persisted and there is no previous unpersisted ready, - // we can safely consider it is persisted so the persisted msgs can be sent immediately. + // If this ready don't need to be persisted and there is no previous unpersisted + // ready, we can safely consider it is persisted so the persisted msgs can be + // sent immediately. self.persisted_number = ready_number; if !persisted_msgs.is_empty() { @@ -2606,8 +2630,8 @@ where self.send_raft_messages(ctx, msgs); } - // The commit index and messages of light ready should be empty because no data needs - // to be persisted. 
+ // The commit index and messages of light ready should be empty because no data + // needs to be persisted. let mut light_rd = self.raft_group.advance_append(ready); self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); @@ -2703,9 +2727,9 @@ where .find_propose_time(entry.get_term(), entry.get_index()); if let Some(propose_time) = propose_time { // We must renew current_time because this value may be created a long time ago. - // If we do not renew it, this time may be smaller than propose_time of a command, - // which was proposed in another thread while this thread receives its AppendEntriesResponse - // and is ready to calculate its commit-log-duration. + // If we do not renew it, this time may be smaller than propose_time of a + // command, which was proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. ctx.current_time.replace(monotonic_raw_now()); ctx.raft_metrics.commit_log.observe(duration_to_sec( (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), @@ -2880,7 +2904,8 @@ where self.mut_store().update_cache_persisted(persist_index); if let Some(ForceLeaderState::ForceLeader { .. }) = self.force_leader { - // forward commit index, the committed entries will be applied in the next raft base tick round + // forward commit index, the committed entries will be applied in the next raft + // base tick round self.maybe_force_forward_commit_index(); } } @@ -2922,7 +2947,8 @@ where let persist_index = self.raft_group.raft.raft_log.persisted; if let Some(ForceLeaderState::ForceLeader { .. 
}) = self.force_leader { - // forward commit index, the committed entries will be applied in the next raft base tick round + // forward commit index, the committed entries will be applied in the next raft + // base tick round self.maybe_force_forward_commit_index(); } self.mut_store().update_cache_persisted(persist_index); @@ -3022,7 +3048,8 @@ where } } - /// Responses to the ready read index request on the replica, the replica is not a leader. + /// Responses to the ready read index request on the replica, the replica is + /// not a leader. fn post_pending_read_index_on_replica(&mut self, ctx: &mut PollContext) { while let Some(mut read) = self.pending_reads.pop_front() { // The response of this read index request is lost, but we need it for @@ -3101,9 +3128,9 @@ where // update the `read_index` of read request that before this successful // `ready`. if !self.is_leader() { - // NOTE: there could still be some pending reads proposed by the peer when it was - // leader. They will be cleared in `clear_uncommitted_on_role_change` later in - // the function. + // NOTE: there could still be some pending reads proposed by the peer when it + // was leader. They will be cleared in `clear_uncommitted_on_role_change` later + // in the function. self.pending_reads.advance_replica_reads(states); self.post_pending_read_index_on_replica(ctx); } else { @@ -3346,8 +3373,8 @@ where ) { self.propose_normal(ctx, req) } else { - // If leader node is disk full, try to transfer leader to a node with disk usage normal to - // keep write availablity not downback. + // If leader node is disk full, try to transfer leader to a node with disk usage + // normal to keep write availability not downback. // if majority node is disk full, to transfer leader or not is not necessary. // Note: Need to exclude learner node. 
if maybe_transfer_leader && !self.disk_full_peers.majority { @@ -3402,8 +3429,9 @@ where Ok(Either::Left(idx)) => { let has_applied_to_current_term = self.has_applied_to_current_term(); if has_applied_to_current_term { - // After this peer has applied to current term and passed above checking including `cmd_epoch_checker`, - // we can safely guarantee that this proposal will be committed if there is no abnormal leader transfer + // After this peer has applied to current term and passed above checking + // including `cmd_epoch_checker`, we can safely guarantee + // that this proposal will be committed if there is no abnormal leader transfer // in the near future. Thus proposed callback can be called. cb.invoke_proposed(); } @@ -3468,7 +3496,8 @@ where self.proposals.push(p); } - // TODO: set higher election priority of voter/incoming voter than demoting voter + // TODO: set higher election priority of voter/incoming voter than demoting + // voter /// Validate the `ConfChange` requests and check whether it's safe to /// propose these conf change requests. /// It's safe iff at least the quorum of the Raft group is still healthy @@ -3549,8 +3578,9 @@ where } } - // Multiple changes that only effect learner will not product `IncommingVoter` or `DemotingVoter` - // after apply, but raftstore layer and PD rely on these roles to detect joint state + // Multiple changes that only effect learner will not product `IncommingVoter` + // or `DemotingVoter` after apply, but raftstore layer and PD rely on these + // roles to detect joint state if kind != ConfChangeKind::Simple && only_learner_change { return Err(box_err!( "{} invalid conf change request, multiple changes that only effect learner", @@ -3630,8 +3660,8 @@ where msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); msg.set_from(self.peer_id()); // log term here represents the term of last log. For leader, the term of last - // log is always its current term. 
Not just set term because raft library forbids - // setting it for MsgTransferLeader messages. + // log is always its current term. Not just set term because raft library + // forbids setting it for MsgTransferLeader messages. msg.set_log_term(self.term()); self.raft_group.raft.msgs.push(msg); true @@ -3720,8 +3750,9 @@ where self.pending_reads.has_unresolved() } - /// `ReadIndex` requests could be lost in network, so on followers commands could queue in - /// `pending_reads` forever. Sending a new `ReadIndex` periodically can resolve this. + /// `ReadIndex` requests could be lost in network, so on followers commands + /// could queue in `pending_reads` forever. Sending a new `ReadIndex` + /// periodically can resolve this. pub fn retry_pending_reads(&mut self, cfg: &Config) { if self.is_leader() || !self.pending_reads.check_needs_retry(cfg) @@ -3779,11 +3810,11 @@ where let now = monotonic_raw_now(); if self.is_leader() { match self.inspect_lease() { - // Here combine the new read request with the previous one even if the lease expired is - // ok because in this case, the previous read index must be sent out with a valid - // lease instead of a suspect lease. So there must no pending transfer-leader proposals - // before or after the previous read index, and the lease can be renewed when get - // heartbeat responses. + // Here combine the new read request with the previous one even if the lease expired + // is ok because in this case, the previous read index must be sent out with a valid + // lease instead of a suspect lease. So there must no pending transfer-leader + // proposals before or after the previous read index, and the lease can be renewed + // when get heartbeat responses. LeaseState::Valid | LeaseState::Expired => { // Must use the commit index of `PeerStorage` instead of the commit index // in raft-rs which may be greater than the former one. 
@@ -3796,14 +3827,15 @@ where .get(0) .map(|req| req.has_read_index()) .unwrap_or_default(); - // A read index request or a read with addition request always needs the response of - // checking memory lock for async commit, so we cannot apply the optimization here + // A read index request or a read with addition request always needs the + // response of checking memory lock for async commit, so we cannot apply the + // optimization here if !is_read_index_request && read.addition_request.is_none() && read.propose_time + max_lease > now { - // A read request proposed in the current lease is found; combine the new - // read request to that previous one, so that no proposing needed. + // A read request proposed in the current lease is found; combine the + // new read request to that previous one, so that no proposing needed. read.push_command(req, cb, commit_index); return false; } @@ -3816,9 +3848,9 @@ where } } - // When a replica cannot detect any leader, `MsgReadIndex` will be dropped, which would - // cause a long time waiting for a read response. Then we should return an error directly - // in this situation. + // When a replica cannot detect any leader, `MsgReadIndex` will be dropped, + // which would cause a long time waiting for a read response. Then we + // should return an error directly in this situation. if !self.is_leader() && self.leader_id() == INVALID_ID { poll_ctx.raft_metrics.invalid_proposal.read_index_no_leader += 1; // The leader may be hibernated, send a message for trying to awaken the leader. 
@@ -3959,8 +3991,9 @@ where "min_matched" => min_m, "min_committed" => min_c, ); - // Reset `min_matched` to `min_committed`, since the raft log at `min_committed` is - // known to be committed in all peers, all of the peers should also have replicated it + // Reset `min_matched` to `min_committed`, since the raft log at `min_committed` + // is known to be committed in all peers, all of the peers should also have + // replicated it min_m = min_c; } Ok((min_m, min_c)) @@ -3976,7 +4009,8 @@ where if self.prepare_merge_fence > 0 { let applied_index = self.get_store().applied_index(); if applied_index >= self.prepare_merge_fence { - // Check passed, clear fence and start proposing pessimistic locks and PrepareMerge. + // Check passed, clear fence and start proposing pessimistic locks and + // PrepareMerge. self.prepare_merge_fence = 0; self.pending_prepare_merge = None; passed_merge_fence = true; @@ -4055,10 +4089,10 @@ where )); }; - // Record current proposed index. If there are some in-memory pessimistic locks, we should - // wait until applying to the proposed index before proposing pessimistic locks and - // PrepareMerge. Otherwise, if an already proposed command will remove a pessimistic lock, - // we will make some deleted locks appear again. + // Record current proposed index. If there are some in-memory pessimistic locks, + // we should wait until applying to the proposed index before proposing + // pessimistic locks and PrepareMerge. Otherwise, if an already proposed command + // will remove a pessimistic lock, we will make some deleted locks appear again. if !passed_merge_fence { let pessimistic_locks = self.txn_ext.pessimistic_locks.read(); if !pessimistic_locks.is_empty() { @@ -4104,9 +4138,10 @@ where pessimistic_locks.status = LocksStatus::MergingRegion; return Ok(()); } - // The proposed pessimistic locks here will also be carried in CommitMerge. Check the size - // to avoid CommitMerge exceeding the size limit of a raft entry. 
This check is a inaccurate - // check. We will check the size again accurately later using the protobuf encoding. + // The proposed pessimistic locks here will also be carried in CommitMerge. + // Check the size to avoid CommitMerge exceeding the size limit of a raft entry. + // This check is a inaccurate check. We will check the size again accurately + // later using the protobuf encoding. if pessimistic_locks.memory_size > size_limit { return Err(box_err!( "pessimistic locks size {} exceed size limit {}, skip merging.", @@ -4180,9 +4215,11 @@ where /// Propose normal request to raft /// - /// Returns Ok(Either::Left(index)) means the proposal is proposed successfully and is located on `index` position. - /// Ok(Either::Right(index)) means the proposal is rejected by `CmdEpochChecker` and the `index` is the position of - /// the last conflict admin cmd. + /// Returns Ok(Either::Left(index)) means the proposal is proposed + /// successfully and is located on `index` position. + /// Ok(Either::Right(index)) means the proposal is rejected by + /// `CmdEpochChecker` and the `index` is the position of the last + /// conflict admin cmd. fn propose_normal( &mut self, poll_ctx: &mut PollContext, @@ -4209,8 +4246,8 @@ where poll_ctx.raft_metrics.propose.normal += 1; if self.has_applied_to_current_term() { - // Only when applied index's term is equal to current leader's term, the information - // in epoch checker is up to date and can be used to check epoch. + // Only when applied index's term is equal to current leader's term, the + // information in epoch checker is up to date and can be used to check epoch. if let Some(index) = self .cmd_epoch_checker .propose_check_epoch(&req, self.term()) @@ -4218,8 +4255,9 @@ where return Ok(Either::Right(index)); } } else if req.has_admin_request() { - // The admin request is rejected because it may need to update epoch checker which - // introduces an uncertainty and may breaks the correctness of epoch checker. 
+ // The admin request is rejected because it may need to update epoch checker + // which introduces an uncertainty and may breaks the correctness of epoch + // checker. return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", self.tag, @@ -4232,7 +4270,8 @@ where let ctx = match self.pre_propose(poll_ctx, &mut req) { Ok(ctx) => ctx, Err(e) => { - // Skipping PrepareMerge is logged when the PendingPrepareMerge error is generated. + // Skipping PrepareMerge is logged when the PendingPrepareMerge error is + // generated. if !matches!(e, Error::PendingPrepareMerge) { warn!( "skip proposal"; @@ -4401,7 +4440,8 @@ where }; // transfer leader command doesn't need to replicate log and apply, so we - // return immediately. Note that this command may fail, we can view it just as an advice + // return immediately. Note that this command may fail, we can view it just as + // an advice cb.invoke_with_response(make_transfer_leader_response()); transferred @@ -4412,9 +4452,10 @@ where // 2. Removing the leader is not allowed in the configuration; // 3. The conf change makes the raft group not healthy; // 4. The conf change is dropped by raft group internally. - /// Returns Ok(Either::Left(index)) means the proposal is proposed successfully and is located on `index` position. - /// Ok(Either::Right(index)) means the proposal is rejected by `CmdEpochChecker` and the `index` is the position of - /// the last conflict admin cmd. + /// Returns Ok(Either::Left(index)) means the proposal is proposed + /// successfully and is located on `index` position. Ok(Either:: + /// Right(index)) means the proposal is rejected by `CmdEpochChecker` and + /// the `index` is the position of the last conflict admin cmd. 
fn propose_conf_change( &mut self, ctx: &mut PollContext, @@ -4434,9 +4475,10 @@ where self.tag )); } - // Actually, according to the implementation of conf change in raft-rs, this check must be - // passed if the previous check that `pending_conf_index` should be less than or equal to - // `self.get_store().applied_index()` is passed. + // Actually, according to the implementation of conf change in raft-rs, this + // check must be passed if the previous check that `pending_conf_index` + // should be less than or equal to `self.get_store().applied_index()` is + // passed. if self.get_store().applied_term() != self.term() { return Err(box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", @@ -4618,7 +4660,8 @@ where normal_peers.insert(peer_id); } if let Some(pr) = self.raft_group.raft.prs().get(peer_id) { - // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort func belowing. + // status 3-normal, 2-almostfull, 1-alreadyfull, only for simplying the sort + // func belowing. let mut status = 3; if let Some(usg) = usage { status = match usg { @@ -4653,7 +4696,8 @@ where return; } - // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, then try to get a potential quorum. + // Reverse sort peers based on `next_idx`, `usage` and `store healthy status`, + // then try to get a potential quorum. next_idxs.sort_by(|x, y| { if x.3 == y.3 { y.1.cmp(&x.1) @@ -4709,8 +4753,8 @@ where self.dangerous_majority_set = has_dangurous_set; - // For the Peer with AlreadFull in potential quorum set, we still need to send logs to it. - // To support incoming configure change. + // For the Peer with AlreadFull in potential quorum set, we still need to send + // logs to it. To support incoming configure change. 
if quorum_ok { for peer in potential_quorum { if let Some(x) = self.disk_full_peers.peers.get_mut(&peer) { @@ -4763,7 +4807,8 @@ where } } - // if there are some peers with disk already full status in the majority set, should not allowed. + // if there are some peers with disk already full status in the majority set, + // should not allowed. if self.dangerous_majority_set { return false; } @@ -4775,7 +4820,8 @@ where if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) && self.disk_full_peers.peers.values().any(|x| x.1) { - // Majority peers are in disk full status but the request carries a special flag. + // Majority peers are in disk full status but the request carries a special + // flag. return true; } false @@ -5039,13 +5085,14 @@ where } // There could be two cases: - // 1. Target peer already exists but has not established communication with leader yet - // 2. Target peer is added newly due to member change or region split, but it's not - // created yet - // For both cases the region start key and end key are attached in RequestVote and - // Heartbeat message for the store of that peer to check whether to create a new peer - // when receiving these messages, or just to wait for a pending region split to perform - // later. + // - Target peer already exists but has not established communication with + // leader yet + // - Target peer is added newly due to member change or region split, but it's + // not created yet + // For both cases the region start key and end key are attached in RequestVote + // and Heartbeat message for the store of that peer to check whether to create a + // new peer when receiving these messages, or just to wait for a pending region + // split to perform later. if self.get_store().is_initialized() && is_initial_msg(&msg) { let region = self.region(); send_msg.set_start_key(region.get_start_key().to_vec()); @@ -5247,7 +5294,8 @@ where Ok(()) } - /// Update states of the peer which can be changed in the previous raft tick. 
+ /// Update states of the peer which can be changed in the previous raft + /// tick. pub fn post_raft_group_tick(&mut self) { self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); } @@ -5446,7 +5494,8 @@ fn make_transfer_leader_response() -> RaftCmdResponse { resp } -// The Raft message context for a MsgTransferLeader if it is a reply of a TransferLeader command. +// The Raft message context for a MsgTransferLeader if it is a reply of a +// TransferLeader command. pub const TRANSFER_LEADER_COMMAND_REPLY_CTX: &[u8] = &[1]; /// A poor version of `Peer` to avoid port generic variables everywhere. diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 76bb95b0d39..cec0d44f081 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -147,17 +147,18 @@ pub fn recover_from_applying_state( let raft_state = box_try!(engines.raft.get_raft_state(region_id)).unwrap_or_default(); - // if we recv append log when applying snapshot, last_index in raft_local_state will - // larger than snapshot_index. since raft_local_state is written to raft engine, and - // raft write_batch is written after kv write_batch, raft_local_state may wrong if - // restart happen between the two write. so we copy raft_local_state to kv engine - // (snapshot_raft_state), and set snapshot_raft_state.last_index = snapshot_index. - // after restart, we need check last_index. + // if we recv append log when applying snapshot, last_index in raft_local_state + // will larger than snapshot_index. since raft_local_state is written to + // raft engine, and raft write_batch is written after kv write_batch, + // raft_local_state may wrong if restart happen between the two write. so we + // copy raft_local_state to kv engine (snapshot_raft_state), and set + // snapshot_raft_state.last_index = snapshot_index. after restart, we need + // check last_index. 
if last_index(&snapshot_raft_state) > last_index(&raft_state) { // There is a gap between existing raft logs and snapshot. Clean them up. engines .raft - .clean(region_id, 0 /*first_index*/, &raft_state, raft_wb)?; + .clean(region_id, 0 /* first_index */, &raft_state, raft_wb)?; raft_wb.put_raft_state(region_id, &snapshot_raft_state)?; } Ok(()) @@ -303,8 +304,9 @@ fn validate_states( state_str() )); } - // Since the entries must be persisted before applying, the term of raft state should also - // be persisted. So it should be greater than the commit term of apply state. + // Since the entries must be persisted before applying, the term of raft state + // should also be persisted. So it should be greater than the commit term of + // apply state. if raft_state.get_hard_state().get_term() < apply_state.get_commit_term() { return Err(box_err!( "term of raft state < commit term of apply state, {}", @@ -552,8 +554,8 @@ where true } - /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no unavailable - /// snapshot. + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no + /// unavailable snapshot. pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); @@ -703,14 +705,14 @@ where let first_index = self.entry_storage.first_index(); // It's possible that logs between `last_compacted_idx` and `first_index` are // being deleted in raftlog_gc worker. But it's OK as: - // 1. If the peer accepts a new snapshot, it must start with an index larger than - // this `first_index`; - // 2. If the peer accepts new entries after this snapshot or new snapshot, it must - // start with the new applied index, which is larger than `first_index`. 
+ // - If the peer accepts a new snapshot, it must start with an index larger than + // this `first_index`; + // - If the peer accepts new entries after this snapshot or new snapshot, it + // must start with the new applied index, which is larger than `first_index`. // So new logs won't be deleted by on going raftlog_gc task accidentally. // It's possible that there will be some logs between `last_compacted_idx` and - // `first_index` are not deleted. So a cleanup task for the range should be triggered - // after applying the snapshot. + // `first_index` are not deleted. So a cleanup task for the range should be + // triggered after applying the snapshot. self.clear_meta(first_index, kv_wb, raft_wb)?; } // Write its source peers' `RegionLocalState` together with itself for atomicity @@ -740,10 +742,10 @@ where // Although there is an interval that other metadata are updated while `region` // is not after handing snapshot from ready, at the time of writing, it's no // problem for now. - // The reason why the update of `region` is delayed is that we expect `region` stays - // consistent with the one in `StoreMeta::regions` which should be updated after - // persisting due to atomic snapshot and peer create process. So if we can fix - // these issues in future(maybe not?), the `region` and `StoreMeta::regions` + // The reason why the update of `region` is delayed is that we expect `region` + // stays consistent with the one in `StoreMeta::regions` which should be updated + // after persisting due to atomic snapshot and peer create process. So if we can + // fix these issues in future(maybe not?), the `region` and `StoreMeta::regions` // can updated here immediately. info!( @@ -865,7 +867,8 @@ where res } - /// Cancel applying snapshot, return true if the job can be considered not be run again. + /// Cancel applying snapshot, return true if the job can be considered not + /// be run again. 
pub fn cancel_applying_snap(&mut self) -> bool { let is_canceled = match *self.snap_state.borrow() { SnapState::Applying(ref status) => { @@ -1042,14 +1045,15 @@ where } } - // Note that the correctness depends on the fact that these source regions MUST NOT - // serve read request otherwise a corrupt data may be returned. + // Note that the correctness depends on the fact that these source regions MUST + // NOT serve read request otherwise a corrupt data may be returned. // For now, it is ensured by - // 1. After `PrepareMerge` log is committed, the source region leader's lease will be - // suspected immediately which makes the local reader not serve read request. - // 2. No read request can be responsed in peer fsm during merging. - // These conditions are used to prevent reading **stale** data in the past. - // At present, they are also used to prevent reading **corrupt** data. + // - After `PrepareMerge` log is committed, the source region leader's lease + // will be suspected immediately which makes the local reader not serve read + // request. + // - No read request can be responsed in peer fsm during merging. These + // conditions are used to prevent reading **stale** data in the past. At + // present, they are also used to prevent reading **corrupt** data. for r in &res.destroy_regions { if let Err(e) = self.clear_extra_data(r, &res.region) { error!(?e; @@ -1061,8 +1065,8 @@ where self.schedule_applying_snapshot(); - // The `region` is updated after persisting in order to stay consistent with the one - // in `StoreMeta::regions` (will be updated soon). + // The `region` is updated after persisting in order to stay consistent with the + // one in `StoreMeta::regions` (will be updated soon). // See comments in `apply_snapshot` for more details. self.set_region(res.region.clone()); } @@ -1189,7 +1193,8 @@ where Ok(snapshot) } -// When we bootstrap the region we must call this to initialize region local state first. 
+// When we bootstrap the region we must call this to initialize region local +// state first. pub fn write_initial_raft_state(raft_wb: &mut W, region_id: u64) -> Result<()> { let mut raft_state = RaftLocalState { last_index: RAFT_INIT_LOG_INDEX, @@ -1493,7 +1498,7 @@ pub mod tests { store .engines .raft - .consume(&mut raft_wb, false /*sync*/) + .consume(&mut raft_wb, false /* sync */) .unwrap(); assert_eq!(left, get_meta_key_count(&store)); @@ -1520,7 +1525,8 @@ pub mod tests { where EK: KvEngine, { - /// Sends a significant message. We should guarantee that the message can't be dropped. + /// Sends a significant message. We should guarantee that the message + /// can't be dropped. fn significant_send( &self, _: u64, diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index 9e6c9cf69f0..aa24b4bc3c7 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -162,8 +162,9 @@ where self.ready_cnt != self.reads.len() } - /// Clear all commands in the queue. if `notify_removed` contains an `region_id`, - /// notify the request's callback that the region is removed. + /// Clear all commands in the queue. if `notify_removed` contains an + /// `region_id`, notify the request's callback that the region is + /// removed. pub fn clear_all(&mut self, notify_removed: Option) { let mut removed = 0; for mut read in self.reads.drain(..) { @@ -349,7 +350,8 @@ where Some(res) } - /// Raft could have not been ready to handle the poped task. So put it back into the queue. + /// Raft could have not been ready to handle the poped task. So put it back + /// into the queue. pub fn push_front(&mut self, read: ReadIndexRequest) { debug_assert!(read.read_index.is_some()); self.reads.push_front(read); @@ -491,7 +493,8 @@ mod read_index_ctx_tests { } ); - // Old version TiKV should be able to parse context without lock checking fields. 
+ // Old version TiKV should be able to parse context without lock checking + // fields. let bytes = ctx.to_bytes(); assert_eq!(bytes, id.as_bytes()); } @@ -640,7 +643,8 @@ mod tests { ); queue.push_back(req, true); - // Advance on leader, but the peer is not ready to handle it (e.g. it's in merging). + // Advance on leader, but the peer is not ready to handle it (e.g. it's in + // merging). queue.advance_leader_reads("", vec![(id, None, 10)]); // The leader steps down to follower, clear uncommitted reads. diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index cd2bc75d048..056f1f4832d 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -118,8 +118,8 @@ where )) } - // scan scans database using an iterator in range [start_key, end_key), calls function f for - // each iteration, if f returns false, terminates this scan. + // scan scans database using an iterator in range [start_key, end_key), calls + // function f for each iteration, if f returns false, terminates this scan. pub fn scan( &self, cf: &str, diff --git a/components/raftstore/src/store/replication_mode.rs b/components/raftstore/src/store/replication_mode.rs index bf13b9e2364..1f163ccfb9f 100644 --- a/components/raftstore/src/store/replication_mode.rs +++ b/components/raftstore/src/store/replication_mode.rs @@ -93,11 +93,12 @@ impl StoreGroup { /// Gets the group ID of store. /// - /// Different version may indicates different label key. If version is less than - /// recorded one, then label key has to be changed, new value can't be mixed with - /// old values, so `None` is returned. If version is larger, then label key must - /// still matches. Because `recalculate` is called before updating regions' - /// replication status, so unchanged recorded version means unchanged label key. + /// Different version may indicates different label key. 
If version is less + /// than recorded one, then label key has to be changed, new value can't + /// be mixed with old values, so `None` is returned. If version is larger, + /// then label key must still matches. Because `recalculate` is called + /// before updating regions' replication status, so unchanged recorded + /// version means unchanged label key. #[inline] pub fn group_id(&self, version: u64, store_id: u64) -> Option { if version < self.version { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 6a8aa5ca3bf..aeaf70f5b03 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -371,7 +371,8 @@ impl CfFile { assert!(self.size.len() >= idx); let file_name = self.gen_file_name(idx); if self.size.len() > idx { - // Any logic similar to test_snap_corruption_on_size_or_checksum will trigger this branch + // Any logic similar to test_snap_corruption_on_size_or_checksum will trigger + // this branch self.size[idx] = size; self.checksum[idx] = checksum; self.file_names[idx] = file_name.clone(); @@ -645,8 +646,8 @@ impl Snapshot { Ok(s) } - // If all files of the snapshot exist, return `Ok` directly. Otherwise create a new file at - // the temporary meta file path, so that all other try will fail. + // If all files of the snapshot exist, return `Ok` directly. Otherwise create a + // new file at the temporary meta file path, so that all other try will fail. fn init_for_building(&mut self) -> RaftStoreResult<()> { if self.exists() { return Ok(()); @@ -820,10 +821,10 @@ impl Snapshot { fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { - // `meta_file` could be None for this case: in `init_for_building` the snapshot exists - // so no temporary meta file is created, and this field is None. 
However in `do_build` - // it's deleted so we build it again, and then call `save_meta_file` with `meta_file` - // as None. + // `meta_file` could be None for this case: in `init_for_building` the snapshot + // exists so no temporary meta file is created, and this field is + // None. However in `do_build` it's deleted so we build it again, + // and then call `save_meta_file` with `meta_file` as None. // FIXME: We can fix it later by introducing a better snapshot delete mechanism. f.write_all(&v[..])?; f.flush()?; @@ -895,8 +896,8 @@ impl Snapshot { }; cf_file.kv_count = cf_stat.key_count as u64; if cf_file.kv_count > 0 { - // Use `kv_count` instead of file size to check empty files because encrypted sst files - // contain some metadata so their sizes will never be 0. + // Use `kv_count` instead of file size to check empty files because encrypted + // sst files contain some metadata so their sizes will never be 0. self.mgr.rename_tmp_cf_file_for_send(cf_file)?; } else { for tmp_file_path in cf_file.tmp_file_paths() { @@ -936,7 +937,7 @@ impl Snapshot { fn delete(&self) { macro_rules! try_delete_snapshot_files { - ($cf_file: ident, $file_name_func: ident) => { + ($cf_file:ident, $file_name_func:ident) => { let mut file_id = 0; loop { let file_path = $cf_file.path.join($cf_file.$file_name_func(file_id)); @@ -948,7 +949,7 @@ impl Snapshot { } } }; - ($cf_file: ident) => { + ($cf_file:ident) => { let mut file_id = 0; loop { let file_path = $cf_file.path.join($cf_file.gen_file_name(file_id)); @@ -972,7 +973,8 @@ impl Snapshot { for cf_file in &self.cf_files { // Delete cloned files. 
let clone_file_paths = cf_file.clone_file_paths(); - // in case the meta file is corrupted or deleted, delete snapshot files with best effort + // in case the meta file is corrupted or deleted, delete snapshot files with + // best effort if clone_file_paths.is_empty() { try_delete_snapshot_files!(cf_file, gen_clone_file_name); } else { @@ -1409,8 +1411,8 @@ impl SnapManager { Ok(()) } - // [PerformanceCriticalPath]?? I/O involved API should be called in background thread - // Return all snapshots which is idle not being used. + // [PerformanceCriticalPath]?? I/O involved API should be called in background + // thread Return all snapshots which is idle not being used. pub fn list_idle_snap(&self) -> io::Result> { // Use a lock to protect the directory when scanning. let registry = self.core.registry.rl(); @@ -1489,7 +1491,8 @@ impl SnapManager { /// because only one caller can lock temporary disk files. /// /// NOTE: it calculates snapshot size by scanning the base directory. - /// Don't call it in raftstore thread until the size limitation mechanism gets refactored. + /// Don't call it in raftstore thread until the size limitation mechanism + /// gets refactored. pub fn get_snapshot_for_building(&self, key: &SnapKey) -> RaftStoreResult> { let mut old_snaps = None; while self.get_total_snap_size()? > self.max_total_snap_size() { @@ -1559,8 +1562,9 @@ impl SnapManager { Ok(Box::new(s)) } - /// Get a `Snapshot` can be used for writting and then `save`. Concurrent calls - /// are allowed because only one caller can lock temporary disk files. + /// Get a `Snapshot` can be used for writting and then `save`. Concurrent + /// calls are allowed because only one caller can lock temporary disk + /// files. pub fn get_snapshot_for_receiving( &self, key: &SnapKey, @@ -2378,7 +2382,8 @@ pub mod tests { } } - // Make all the snapshot in the specified dir corrupted to have incorrect checksum. 
+ // Make all the snapshot in the specified dir corrupted to have incorrect + // checksum. fn corrupt_snapshot_checksum_in>(dir: T) -> Vec { let dir_path = dir.into(); let mut res = Vec::new(); @@ -2423,7 +2428,8 @@ pub mod tests { res } - // Make all the snapshot meta files in the specified corrupted to have incorrect content. + // Make all the snapshot meta files in the specified corrupted to have incorrect + // content. fn corrupt_snapshot_meta_file>(dir: T) -> usize { let mut total = 0; let dir_path = dir.into(); @@ -2951,7 +2957,8 @@ pub mod tests { let key = SnapKey::new(1, 1, 1); let region = gen_test_region(1, 1, 1); - // Test one snapshot can be built multi times. DataKeyManager should be handled correctly. + // Test one snapshot can be built multi times. DataKeyManager should be handled + // correctly. for _ in 0..2 { let mut s1 = snap_mgr.get_snapshot_for_building(&key).unwrap(); let mut snap_data = RaftSnapshotData::default(); diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 4fb34f15341..c88c1bd3718 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -193,8 +193,8 @@ where Ok(stats) } -/// Apply the given snapshot file into a column family. `callback` will be invoked after each batch of -/// key value pairs written to db. +/// Apply the given snapshot file into a column family. `callback` will be +/// invoked after each batch of key value pairs written to db. pub fn apply_plain_cf_file( path: &str, key_mgr: Option<&Arc>, @@ -226,7 +226,8 @@ where Ok(()) }; - // Collect keys to a vec rather than wb so that we can invoke the callback less times. + // Collect keys to a vec rather than wb so that we can invoke the callback less + // times. 
let mut batch = Vec::with_capacity(1024); let mut batch_data_size = 0; diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 1d8e7ed1981..7b681506f63 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -15,13 +15,13 @@ use txn_types::{Key, PessimisticLock}; /// Transaction extensions related to a peer. #[derive(Default)] pub struct TxnExt { - /// The max timestamp recorded in the concurrency manager is only updated at leader. - /// So if a peer becomes leader from a follower, the max timestamp can be outdated. - /// We need to update the max timestamp with a latest timestamp from PD before this - /// peer can work. - /// From the least significant to the most, 1 bit marks whether the timestamp is - /// updated, 31 bits for the current epoch version, 32 bits for the current term. - /// The version and term are stored to prevent stale UpdateMaxTimestamp task from + /// The max timestamp recorded in the concurrency manager is only updated at + /// leader. So if a peer becomes leader from a follower, the max timestamp + /// can be outdated. We need to update the max timestamp with a latest + /// timestamp from PD before this peer can work. From the least significant + /// to the most, 1 bit marks whether the timestamp is updated, 31 bits for + /// the current epoch version, 32 bits for the current term. The version + /// and term are stored to prevent stale UpdateMaxTimestamp task from /// marking the lowest bit. pub max_ts_sync_status: AtomicU64, @@ -58,7 +58,8 @@ lazy_static! { const GLOBAL_MEM_SIZE_LIMIT: usize = 100 << 20; // 100 MiB -// 512 KiB, so pessimistic locks in one region can be proposed in a single command. +// 512 KiB, so pessimistic locks in one region can be proposed in a single +// command. const PEER_MEM_SIZE_LIMIT: usize = 512 << 10; /// Pessimistic locks of a region peer. 
@@ -66,51 +67,53 @@ const PEER_MEM_SIZE_LIMIT: usize = 512 << 10; pub struct PeerPessimisticLocks { /// The table that stores pessimistic locks. /// - /// The bool marks an ongoing write request (which has been sent to the raftstore while not - /// applied yet) will delete this lock. The lock will be really deleted after applying the - /// write request. The flag will decide whether this lock should be migrated to other peers - /// on leader or region changes: + /// The bool marks an ongoing write request (which has been sent to the + /// raftstore while not applied yet) will delete this lock. The lock will be + /// really deleted after applying the write request. The flag will decide + /// whether this lock should be migrated to other peers on leader or region + /// changes: /// - /// - Transfer leader - /// The lock with the deleted mark SHOULD NOT be proposed before transferring leader. - /// Considering the following cases with different orders: - /// 1. Propose write -> propose locks -> apply write -> apply locks -> transfer leader - /// Because the locks marking deleted will not be proposed. The lock will be deleted when - /// applying the write while not showing up again after applying the locks. - /// 2. Propose locks -> propose write -> transfer leader - /// No lock will be lost in normal cases because the write request has been sent to the - /// raftstore, it is likely to be proposed successfully, while the leader will need at - /// least another round to receive the transfer leader message from the transferree. + /// - Transfer leader The lock with the deleted mark SHOULD NOT be proposed + /// before transferring leader. Considering the following cases with + /// different orders: 1. Propose write -> propose locks -> apply write -> + /// apply locks -> transfer leader Because the locks marking deleted will + /// not be proposed. The lock will be deleted when applying the write + /// while not showing up again after applying the locks. 2. 
Propose locks + /// -> propose write -> transfer leader No lock will be lost in normal + /// cases because the write request has been sent to the raftstore, it is + /// likely to be proposed successfully, while the leader will need at + /// least another round to receive the transfer leader message from the + /// transferee. /// - /// - Split region - /// The lock with the deleted mark SHOULD be moved to new regions on region split. - /// Considering the following cases with different orders: - /// 1. Propose write -> propose split -> apply write -> execute split - /// The write will be applied earlier than split. So, the lock will be deleted earlier - /// than moving locks to new regions. - /// 2. Propose split -> propose write -> ready split -> apply write - /// The write will be skipped because its version is lower than the new region. So, no - /// lock should be deleted in this case. - /// 3. Propose split -> ready split -> propose write - /// The write proposal will be rejected because of version mismatch. + /// - Split region The lock with the deleted mark SHOULD be moved to new + /// regions on region split. Considering the following cases with + /// different orders: 1. Propose write -> propose split -> apply write -> + /// execute split The write will be applied earlier than split. So, the + /// lock will be deleted earlier than moving locks to new regions. 2. + /// Propose split -> propose write -> ready split -> apply write The write + /// will be skipped because its version is lower than the new region. So, + /// no lock should be deleted in this case. 3. Propose split -> ready + /// split -> propose write The write proposal will be rejected because of + /// version mismatch. /// - /// - Merge region - /// The lock with the deleted mark SHOULD be included in the catch up logs on region merge. - /// Considering the following cases with different orders: - /// 1. 
Propose write -> propose prepare merge -> apply write -> execute merge - /// The locks marked deleted will be deleted when applying the write request. So, the - /// deleted locks will not be included again in the commit merge request. - /// 2. Propose prepare merge -> propose write -> execute merge -> apply write - /// Applying the write will be skipped because of version mismatch. So, no lock should - /// be deleted. It's correct that we include the locks that are marked deleted in the - /// commit merge request. + /// - Merge region The lock with the deleted mark SHOULD be included in the + /// catch up logs on region merge. Considering the following cases with + /// different orders: 1. Propose write -> propose prepare merge -> apply + /// write -> execute merge The locks marked deleted will be deleted when + /// applying the write request. So, the deleted locks will not be included + /// again in the commit merge request. 2. Propose prepare merge -> propose + /// write -> execute merge -> apply write Applying the write will be + /// skipped because of version mismatch. So, no lock should be deleted. + /// It's correct that we include the locks that are marked deleted in the + /// commit merge request. map: HashMap, /// Status of the pessimistic lock map. /// The map is writable only in the Normal state. pub status: LocksStatus, /// Refers to the Raft term in which the pessimistic lock table is valid. pub term: u64, - /// Refers to the region version in which the pessimistic lock table is valid. + /// Refers to the region version in which the pessimistic lock table is + /// valid. pub version: u64, /// Estimated memory used by the pessimistic locks. pub memory_size: usize, @@ -158,8 +161,8 @@ impl PeerPessimisticLocks { for pair in &pairs { let (key, lock) = pair.as_pair(); // If the key already exists in the map, it's an overwrite. - // The primary lock does not change during an overwrite, so we don't need to update - // the memory size. 
+ // The primary lock does not change during an overwrite, so we don't need to + // update the memory size. if !self.map.contains_key(key) { incr += key.len() + lock.memory_size(); } @@ -215,11 +218,12 @@ impl PeerPessimisticLocks { /// Group pessimistic locks in the original region to the split regions. /// - /// The given regions MUST be sorted by key in the ascending order. The returned - /// `HashMap`s are in the same order of the given regions. + /// The given regions MUST be sorted by key in the ascending order. The + /// returned `HashMap`s are in the same order of the given regions. /// - /// The locks belonging to the derived region will be kept in the given `locks` map, - /// and the corresponding position in the returned `Vec` will be an empty map. + /// The locks belonging to the derived region will be kept in the given + /// `locks` map, and the corresponding position in the returned `Vec` + /// will be an empty map. pub fn group_by_regions( &mut self, regions: &[metapb::Region], diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 75c620ac12c..2bda7f4794f 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -105,10 +105,10 @@ pub fn check_key_in_region(key: &[u8], region: &metapb::Region) -> Result<()> { } } -/// `is_first_vote_msg` checks `msg` is the first vote (or prevote) message or not. It's used for -/// when the message is received but there is no such region in `Store::region_peers` and the -/// region overlaps with others. In this case we should put `msg` into `pending_msg` instead of -/// create the peer. +/// `is_first_vote_msg` checks `msg` is the first vote (or prevote) message or +/// not. It's used for when the message is received but there is no such region +/// in `Store::region_peers` and the region overlaps with others. In this case +/// we should put `msg` into `pending_msg` instead of create the peer. 
#[inline] fn is_first_vote_msg(msg: &eraftpb::Message) -> bool { match msg.get_msg_type() { @@ -119,10 +119,11 @@ fn is_first_vote_msg(msg: &eraftpb::Message) -> bool { } } -/// `is_first_append_entry` checks `msg` is the first append message or not. This meassge is the first -/// message that the learner peers of the new split region will receive from the leader. It's used for -/// when the message is received but there is no such region in `Store::region_peers`. In this case we -/// should put `msg` into `pending_msg` instead of create the peer. +/// `is_first_append_entry` checks `msg` is the first append message or not. +/// This meassge is the first message that the learner peers of the new split +/// region will receive from the leader. It's used for when the message is +/// received but there is no such region in `Store::region_peers`. In this case +/// we should put `msg` into `pending_msg` instead of create the peer. #[inline] fn is_first_append_entry(msg: &eraftpb::Message) -> bool { match msg.get_msg_type() { @@ -146,7 +147,8 @@ pub fn is_vote_msg(msg: &eraftpb::Message) -> bool { msg_type == MessageType::MsgRequestVote || msg_type == MessageType::MsgRequestPreVote } -/// `is_initial_msg` checks whether the `msg` can be used to initialize a new peer or not. +/// `is_initial_msg` checks whether the `msg` can be used to initialize a new +/// peer or not. // There could be two cases: // 1. Target peer already exists but has not established communication with leader yet // 2. Target peer is added newly due to member change or region split, but it's not @@ -207,12 +209,13 @@ impl AdminCmdEpochState { } /// WARNING: the existing settings below **MUST NOT** be changed!!! -/// Changing any admin cmd's `AdminCmdEpochState` or the epoch-change behavior during applying -/// will break upgrade compatibility and correctness dependency of `CmdEpochChecker`. -/// Please remember it is very difficult to fix the issues arising from not following this rule. 
+/// Changing any admin cmd's `AdminCmdEpochState` or the epoch-change behavior +/// during applying will break upgrade compatibility and correctness dependency +/// of `CmdEpochChecker`. Please remember it is very difficult to fix the issues +/// arising from not following this rule. /// -/// If you really want to change an admin cmd behavior, please add a new admin cmd and **DO NOT** -/// delete the old one. +/// If you really want to change an admin cmd behavior, please add a new admin +/// cmd and **DO NOT** delete the old one. pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochState { match admin_cmp_type { AdminCmdType::InvalidAdmin => AdminCmdEpochState::new(false, false, false, false), @@ -234,8 +237,8 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat } } -/// WARNING: `NORMAL_REQ_CHECK_VER` and `NORMAL_REQ_CHECK_CONF_VER` **MUST NOT** be changed. -/// The reason is the same as `admin_cmd_epoch_lookup`. +/// WARNING: `NORMAL_REQ_CHECK_VER` and `NORMAL_REQ_CHECK_CONF_VER` **MUST NOT** +/// be changed. The reason is the same as `admin_cmd_epoch_lookup`. pub static NORMAL_REQ_CHECK_VER: bool = true; pub static NORMAL_REQ_CHECK_CONF_VER: bool = false; @@ -396,14 +399,16 @@ pub fn is_region_initialized(r: &metapb::Region) -> bool { !r.get_peers().is_empty() } -/// Lease records an expired time, for examining the current moment is in lease or not. -/// It's dedicated to the Raft leader lease mechanism, contains either state of -/// 1. Suspect Timestamp -/// A suspicious leader lease timestamp, which marks the leader may still hold or lose -/// its lease until the clock time goes over this timestamp. -/// 2. Valid Timestamp -/// A valid leader lease timestamp, which marks the leader holds the lease for now. -/// The lease is valid until the clock time goes over this timestamp. +/// Lease records an expired time, for examining the current moment is in lease +/// or not. 
It's dedicated to the Raft leader lease mechanism, contains either +/// state of +/// - Suspect Timestamp +/// - A suspicious leader lease timestamp, which marks the leader may still +/// hold or lose its lease until the clock time goes over this timestamp. +/// - Valid Timestamp +/// - A valid leader lease timestamp, which marks the leader holds the lease +/// for now. The lease is valid until the clock time goes over this +/// timestamp. /// /// ```text /// Time @@ -419,18 +424,19 @@ pub fn is_region_initialized(r: &metapb::Region) -> bool { /// ``` /// /// Note: -/// - Valid timestamp would increase when raft log entries are applied in current term. -/// - Suspect timestamp would be set after the message `MsgTimeoutNow` is sent by current peer. -/// The message `MsgTimeoutNow` starts a leader transfer procedure. During this procedure, -/// current peer as an old leader may still hold its lease or lose it. -/// It's possible there is a new leader elected and current peer as an old leader -/// doesn't step down due to network partition from the new leader. In that case, -/// current peer lose its leader lease. -/// Within this suspect leader lease expire time, read requests could not be performed -/// locally. -/// - The valid leader lease should be `lease = max_lease - (commit_ts - send_ts)` -/// And the expired timestamp for that leader lease is `commit_ts + lease`, -/// which is `send_ts + max_lease` in short. +/// - Valid timestamp would increase when raft log entries are applied in +/// current term. +/// - Suspect timestamp would be set after the message `MsgTimeoutNow` is sent +/// by current peer. The message `MsgTimeoutNow` starts a leader transfer +/// procedure. During this procedure, current peer as an old leader may +/// still hold its lease or lose it. It's possible there is a new leader +/// elected and current peer as an old leader doesn't step down due to +/// network partition from the new leader. 
In that case, current peer lose +/// its leader lease. Within this suspect leader lease expire time, read +/// requests could not be performed locally. +/// - The valid leader lease should be `lease = max_lease - (commit_ts - +/// send_ts)` And the expired timestamp for that leader lease is `commit_ts +/// + lease`, which is `send_ts + max_lease` in short. pub struct Lease { // A suspect timestamp is in the Either::Left(_), // a valid timestamp is in the Either::Right(_). @@ -466,9 +472,9 @@ impl Lease { } } - /// The valid leader lease should be `lease = max_lease - (commit_ts - send_ts)` - /// And the expired timestamp for that leader lease is `commit_ts + lease`, - /// which is `send_ts + max_lease` in short. + /// The valid leader lease should be `lease = max_lease - (commit_ts - + /// send_ts)` And the expired timestamp for that leader lease is + /// `commit_ts + lease`, which is `send_ts + max_lease` in short. fn next_expired_time(&self, send_ts: Timespec) -> Timespec { send_ts + self.max_lease } @@ -595,8 +601,8 @@ impl fmt::Debug for Lease { } /// A remote lease, it can only be derived by `Lease`. It will be sent -/// to the local read thread, so name it remote. If Lease expires, the remote must -/// expire too. +/// to the local read thread, so name it remote. If Lease expires, the remote +/// must expire too. 
#[derive(Clone)] pub struct RemoteLease { expired_time: Arc, @@ -921,8 +927,8 @@ impl RegionReadProgressRegistry { .map(|rp| rp.safe_ts()) } - // Update `safe_ts` with the provided `LeaderInfo` and return the regions that have the - // same `LeaderInfo` + // Update `safe_ts` with the provided `LeaderInfo` and return the regions that + // have the same `LeaderInfo` pub fn handle_check_leaders(&self, leaders: Vec) -> Vec { let mut regions = Vec::with_capacity(leaders.len()); let registry = self.registry.lock().unwrap(); @@ -949,9 +955,9 @@ impl RegionReadProgressRegistry { info_map } - /// Invoke the provided callback with the registry, an internal lock will hold - /// while invoking the callback so it is important that *not* try to acquiring any - /// lock inside the callback to avoid dead lock + /// Invoke the provided callback with the registry, an internal lock will + /// hold while invoking the callback so it is important that *not* try + /// to acquiring any lock inside the callback to avoid dead lock pub fn with(&self, f: F) -> T where F: FnOnce(&HashMap>) -> T, @@ -967,9 +973,10 @@ impl Default for RegionReadProgressRegistry { } } -/// `RegionReadProgress` is used to keep track of the replica's `safe_ts`, the replica can handle a read -/// request directly without requiring leader lease or read index iff `safe_ts` >= `read_ts` (the `read_ts` -/// is usually stale i.e seconds ago). +/// `RegionReadProgress` is used to keep track of the replica's `safe_ts`, the +/// replica can handle a read request directly without requiring leader lease or +/// read index iff `safe_ts` >= `read_ts` (the `read_ts` is usually stale i.e +/// seconds ago). /// /// `safe_ts` is updated by the `(apply index, safe ts)` item: /// ```ignore @@ -978,13 +985,15 @@ impl Default for RegionReadProgressRegistry { /// } /// ``` /// -/// For the leader, the `(apply index, safe ts)` item is publish by the `resolved-ts` worker periodically. 
-/// For the followers, the item is sync periodically from the leader through the `CheckLeader` rpc. +/// For the leader, the `(apply index, safe ts)` item is publish by the +/// `resolved-ts` worker periodically. For the followers, the item is sync +/// periodically from the leader through the `CheckLeader` rpc. /// -/// The intend is to make the item's `safe ts` larger (more up to date) and `apply index` smaller (require less data) +/// The intend is to make the item's `safe ts` larger (more up to date) and +/// `apply index` smaller (require less data) // -/// TODO: the name `RegionReadProgress` is conflict with the leader lease's `ReadProgress`, shoule change it to another -/// more proper name +/// TODO: the name `RegionReadProgress` is conflict with the leader lease's +/// `ReadProgress`, shoule change it to another more proper name #[derive(Debug)] pub struct RegionReadProgress { // `core` used to keep track and update `safe_ts`, it should @@ -1036,13 +1045,13 @@ impl RegionReadProgress { } } - // Consume the provided `LeaderInfo` to update `safe_ts` and return whether the provided - // `LeaderInfo` is same as ours + // Consume the provided `LeaderInfo` to update `safe_ts` and return whether the + // provided `LeaderInfo` is same as ours pub fn consume_leader_info(&self, mut leader_info: LeaderInfo) -> bool { let mut core = self.core.lock().unwrap(); if leader_info.has_read_state() { - // It is okay to update `safe_ts` without checking the `LeaderInfo`, the `read_state` - // is guaranteed to be valid when it is published by the leader + // It is okay to update `safe_ts` without checking the `LeaderInfo`, the + // `read_state` is guaranteed to be valid when it is published by the leader let rs = leader_info.take_read_state(); let (apply_index, ts) = (rs.get_applied_index(), rs.get_safe_ts()); if apply_index != 0 && ts != 0 && !core.discard { @@ -1123,16 +1132,17 @@ struct RegionReadProgressCore { tag: String, region_id: u64, applied_index: u64, - // A wraper 
of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current `safe_ts` - // and the `read_state.idx` is the smallest `apply_index` required for that `safe_ts` + // A wraper of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current + // `safe_ts` and the `read_state.idx` is the smallest `apply_index` required for that `safe_ts` read_state: ReadState, // The local peer's acknowledge about the leader leader_info: LocalLeaderInfo, // `pending_items` is a *sorted* list of `(apply_index, safe_ts)` item pending_items: VecDeque, - // After the region commit merged, the region's key range is extended and the region's `safe_ts` - // should reset to `min(source_safe_ts, target_safe_ts)`, and start reject stale `read_state` item - // with index smaller than `last_merge_index` to avoid `safe_ts` undo the decrease + // After the region commit merged, the region's key range is extended and the region's + // `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)`, and start reject stale + // `read_state` item with index smaller than `last_merge_index` to avoid `safe_ts` undo the + // decrease last_merge_index: u64, // Stop update `safe_ts` pause: bool, @@ -1210,7 +1220,8 @@ impl RegionReadProgressCore { // The apply index should not decrease assert!(applied >= self.applied_index); self.applied_index = applied; - // Consume pending items with `apply_index` less or equal to `self.applied_index` + // Consume pending items with `apply_index` less or equal to + // `self.applied_index` let mut to_update = self.read_state.clone(); while let Some(item) = self.pending_items.pop_front() { if self.applied_index < item.idx { @@ -1279,7 +1290,8 @@ impl RegionReadProgressCore { } } -/// Represent the duration of all stages of raftstore recorded by one inspecting. +/// Represent the duration of all stages of raftstore recorded by one +/// inspecting. 
#[derive(Default, Debug)] pub struct RaftstoreDuration { pub store_wait_duration: Option, @@ -1432,7 +1444,8 @@ mod tests { let cases = vec![ (Timespec::new(0, 0), 0x0000_0000_0000_0000u64), (Timespec::new(0, 1), 0x0000_0000_0000_0000u64), // 1ns is round down to 0ms. - (Timespec::new(0, 999_999), 0x0000_0000_0000_0000u64), // 999_999ns is round down to 0ms. + (Timespec::new(0, 999_999), 0x0000_0000_0000_0000u64), /* 999_999ns is round down to + * 0ms. */ ( // 1_048_575ns is round down to 0ms. Timespec::new(0, 1_048_575 /* 0x0FFFFF */), @@ -1520,7 +1533,7 @@ mod tests { ) -> metapb::Region { let mut region = metapb::Region::default(); macro_rules! push_peer { - ($ids: ident, $role: expr) => { + ($ids:ident, $role:expr) => { for id in $ids { let mut peer = metapb::Peer::default(); peer.set_id(*id); diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index d5fd6f2c007..355dca4f168 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -56,7 +56,8 @@ impl Runner { } } - // Get the minimal `safe_ts` from regions overlap with the key range [`start_key`, `end_key`) + // Get the minimal `safe_ts` from regions overlap with the key range + // [`start_key`, `end_key`) fn get_range_safe_ts(&self, key_range: KeyRange) -> u64 { if key_range.get_start_key().is_empty() && key_range.get_end_key().is_empty() { // Fast path to get the min `safe_ts` of all regions in this store @@ -73,10 +74,10 @@ impl Runner { data_key(key_range.get_start_key()), data_end_key(key_range.get_end_key()), ); - // `store_safe_ts` won't be accessed frequently (like per-request or per-transaction), - // also this branch won't entry because the request key range is empty currently (in v5.1) - // keep this branch for robustness and future use, so it is okay getting `store_safe_ts` - // from `store_meta` (behide a mutex) + // `store_safe_ts` won't be accessed 
frequently (like per-request or + // per-transaction), also this branch won't entry because the request key range + // is empty currently (in v5.1) keep this branch for robustness and future use, + // so it is okay getting `store_safe_ts` from `store_meta` (behide a mutex) let meta = self.store_meta.lock().unwrap(); meta.region_read_progress.with(|registry| { meta.region_ranges diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 88222623084..a829d2fe01c 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -23,9 +23,12 @@ pub enum Task { }, CheckAndCompact { - cf_names: Vec, // Column families need to compact - ranges: Vec, // Ranges need to check - tombstones_num_threshold: u64, // The minimum RocksDB tombstones a range that need compacting has + // Column families need to compact + cf_names: Vec, + // Ranges need to check + ranges: Vec, + // The minimum RocksDB tombstones a range that need compacting has + tombstones_num_threshold: u64, tombstones_percent_threshold: u64, }, } @@ -181,7 +184,8 @@ fn need_compact( return false; } - // When the number of tombstones exceed threshold and ratio, this range need compacting. + // When the number of tombstones exceed threshold and ratio, this range need + // compacting. let estimate_num_del = num_entires - num_versions; estimate_num_del >= tombstones_num_threshold && estimate_num_del * 100 >= tombstones_percent_threshold * num_entires @@ -193,14 +197,15 @@ fn collect_ranges_need_compact( tombstones_num_threshold: u64, tombstones_percent_threshold: u64, ) -> Result, Error> { - // Check the SST properties for each range, and TiKV will compact a range if the range - // contains too many RocksDB tombstones. TiKV will merge multiple neighboring ranges - // that need compacting into a single range. 
+ // Check the SST properties for each range, and TiKV will compact a range if the + // range contains too many RocksDB tombstones. TiKV will merge multiple + // neighboring ranges that need compacting into a single range. let mut ranges_need_compact = VecDeque::new(); let mut compact_start = None; let mut compact_end = None; for range in ranges.windows(2) { - // Get total entries and total versions in this range and checks if it needs to be compacted. + // Get total entries and total versions in this range and checks if it needs to + // be compacted. if let Some((num_ent, num_ver)) = box_try!(engine.get_range_entries_and_versions(CF_WRITE, &range[0], &range[1])) { @@ -220,7 +225,8 @@ fn collect_ranges_need_compact( } } - // Current range doesn't need compacting, save previous range that need compacting. + // Current range doesn't need compacting, save previous range that need + // compacting. if compact_start.is_some() { assert!(compact_end.is_some()); } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index afd84ad16dd..d65cbcea8d4 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -738,11 +738,12 @@ fn hotspot_query_num_report_threshold() -> u64 { HOTSPOT_QUERY_RATE_THRESHOLD * 10 } -// Slow score is a value that represents the speed of a store and ranges in [1, 100]. -// It is maintained in the AIMD way. -// If there are some inspecting requests timeout during a round, by default the score -// will be increased at most 1x when above 10% inspecting requests timeout. -// If there is not any timeout inspecting requests, the score will go back to 1 in at least 5min. +// Slow score is a value that represents the speed of a store and ranges in [1, +// 100]. It is maintained in the AIMD way. +// If there are some inspecting requests timeout during a round, by default the +// score will be increased at most 1x when above 10% inspecting requests +// timeout. 
If there is not any timeout inspecting requests, the score will go +// back to 1 in at least 5min. struct SlowScore { value: OrderedFloat, last_record_time: Instant, @@ -1086,9 +1087,10 @@ where Default::default(), ); } - // When rolling update, there might be some old version tikvs that don't support batch split in cluster. - // In this situation, PD version check would refuse `ask_batch_split`. - // But if update time is long, it may cause large Regions, so call `ask_split` instead. + // When rolling update, there might be some old version tikvs that don't support + // batch split in cluster. In this situation, PD version check would refuse + // `ask_batch_split`. But if update time is long, it may cause large Regions, so + // call `ask_split` instead. Err(Error::Incompatible) => { let (region_id, peer_id) = (region.id, peer.id); info!( @@ -1237,7 +1239,8 @@ where stats.set_used_size(used_size); let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - // We only care about rocksdb SST file size, so we should check disk available here. + // We only care about rocksdb SST file size, so we should check disk available + // here. available = cmp::min(available, disk_stats.available_space()); if available == 0 { @@ -2074,8 +2077,8 @@ where self.slow_score.last_tick_finished = false; if self.slow_score.last_tick_id % self.slow_score.round_ticks == 0 { - // `last_update_time` is refreshed every round. If no update happens in a whole round, - // we set the status to unknown. + // `last_update_time` is refreshed every round. If no update happens in a whole + // round, we set the status to unknown. 
if self.curr_health_status == ServingStatus::Serving && self.slow_score.last_record_time < self.slow_score.last_update_time { diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index bf7debfb1d9..88e30e33104 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -108,7 +108,8 @@ impl Runner { return; } fail::fail_point!("worker_gc_raft_log_flush"); - // Sync wal of kv_db to make sure the data before apply_index has been persisted to disk. + // Sync wal of kv_db to make sure the data before apply_index has been persisted + // to disk. let start = Instant::now(); self.engines.kv.sync().unwrap_or_else(|e| { panic!("failed to sync kv_engine in raft_log_gc: {:?}", e); @@ -233,7 +234,7 @@ mod tests { e.set_index(i); raft_wb.append(region_id, vec![e]).unwrap(); } - raft_db.consume(&mut raft_wb, false /*sync*/).unwrap(); + raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); let tbls = vec![ (Task::gc(region_id, 0, 10), 10, (0, 10), (10, 100)), diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 1be9cf8b4e9..b7724789d4b 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -514,10 +514,11 @@ where cmd.callback.invoke_read(read_resp); } - // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the lifetime of - // the returned `&ReadDelegate` will bind to `self`, and make it impossible to use `&mut self` - // while the `&ReadDelegate` is alive, a better choice is use `Rc` but `LocalReader: Send` will be - // violated, which is required by `LocalReadRouter: Send`, use `Arc` will introduce extra cost but + // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the + // lifetime of the returned `&ReadDelegate` will bind to `self`, and make it + // impossible to use `&mut self` 
while the `&ReadDelegate` is alive, a better + // choice is use `Rc` but `LocalReader: Send` will be violated, which is + // required by `LocalReadRouter: Send`, use `Arc` will introduce extra cost but // make the logic clear fn get_delegate(&mut self, region_id: u64) -> Option> { let rd = match self.delegates.get(®ion_id) { @@ -669,7 +670,8 @@ where // Getting the snapshot let response = self.execute(&req, &delegate.region, None, read_id); - // Double check in case `safe_ts` change after the first check and before getting snapshot + // Double check in case `safe_ts` change after the first check and before + // getting snapshot if let Err(resp) = delegate.check_stale_read_safe(read_ts, &mut self.metrics) { @@ -705,11 +707,12 @@ where } } - /// If read requests are received at the same RPC request, we can create one snapshot for all - /// of them and check whether the time when the snapshot was created is in lease. We use - /// ThreadReadId to figure out whether this RaftCommand comes from the same RPC request with - /// the last RaftCommand which left a snapshot cached in LocalReader. ThreadReadId is composed - /// by thread_id and a thread_local incremental sequence. + /// If read requests are received at the same RPC request, we can create one + /// snapshot for all of them and check whether the time when the snapshot + /// was created is in lease. We use ThreadReadId to figure out whether this + /// RaftCommand comes from the same RPC request with the last RaftCommand + /// which left a snapshot cached in LocalReader. ThreadReadId is composed by + /// thread_id and a thread_local incremental sequence. 
#[inline] pub fn read( &mut self, diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index cdd0ee5556b..5e2cc8992f5 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -48,7 +48,8 @@ use crate::{ }, }; -// used to periodically check whether we should delete a stale peer's range in region runner +// used to periodically check whether we should delete a stale peer's range in +// region runner #[cfg(test)] pub const STALE_PEER_CHECK_TICK: usize = 1; // 1000 milliseconds @@ -137,7 +138,8 @@ struct StalePeerInfo { } /// A structure records all ranges to be deleted with some delay. -/// The delay is because there may be some coprocessor requests related to these ranges. +/// The delay is because there may be some coprocessor requests related to these +/// ranges. #[derive(Clone, Default)] struct PendingDeleteRanges { ranges: BTreeMap, StalePeerInfo>, // start_key -> StalePeerInfo @@ -202,7 +204,8 @@ impl PendingDeleteRanges { /// Inserts a new range waiting to be deleted. /// - /// Before an insert is called, it must call drain_overlap_ranges to clean the overlapping range. + /// Before an insert is called, it must call drain_overlap_ranges to clean + /// the overlapping range. fn insert(&mut self, region_id: u64, start_key: &[u8], end_key: &[u8], stale_sequence: u64) { if !self.find_overlap_ranges(start_key, end_key).is_empty() { panic!( @@ -290,14 +293,16 @@ where "err" => %e, ); } - // The error can be ignored as snapshot will be sent in next heartbeat in the end. + // The error can be ignored as snapshot will be sent in next heartbeat in the + // end. let _ = self .router .send(region_id, CasualMessage::SnapshotGenerated); Ok(()) } - /// Handles the task of generating snapshot of the Region. It calls `generate_snap` to do the actual work. + /// Handles the task of generating snapshot of the Region. 
It calls + /// `generate_snap` to do the actual work. fn handle_gen( &self, region_id: u64, @@ -425,7 +430,8 @@ where Ok(()) } - /// Tries to apply the snapshot of the specified Region. It calls `apply_snap` to do the actual work. + /// Tries to apply the snapshot of the specified Region. It calls + /// `apply_snap` to do the actual work. fn handle_apply(&mut self, region_id: u64, status: Arc) { let _ = status.compare_exchange( JOB_STATUS_PENDING, @@ -493,7 +499,8 @@ where let mut df_ranges = Vec::with_capacity(overlap_ranges.len()); for (region_id, start_key, end_key, stale_sequence) in overlap_ranges.iter() { // `DeleteFiles` may break current rocksdb snapshots consistency, - // so do not use it unless we can make sure there is no reader of the destroyed peer anymore. + // so do not use it unless we can make sure there is no reader of the destroyed + // peer anymore. if *stale_sequence < oldest_sequence { df_ranges.push(Range::new(start_key, end_key)); } else { @@ -588,8 +595,8 @@ where } } - /// Checks the number of files at level 0 to avoid write stall after ingesting sst. - /// Returns true if the ingestion causes write stall. + /// Checks the number of files at level 0 to avoid write stall after + /// ingesting sst. Returns true if the ingestion causes write stall. fn ingest_maybe_stall(&self) -> bool { for cf in SNAPSHOT_CFS { // no need to check lock cf @@ -679,8 +686,9 @@ where fn handle_pending_applies(&mut self) { fail_point!("apply_pending_snapshot", |_| {}); while !self.pending_applies.is_empty() { - // should not handle too many applies than the number of files that can be ingested. - // check level 0 every time because we can not make sure how does the number of level 0 files change. + // should not handle too many applies than the number of files that can be + // ingested. check level 0 every time because we can not make sure + // how does the number of level 0 files change. 
if self.ctx.ingest_maybe_stall() { break; } diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index e5dde8a910c..14a1a5b7bbc 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -339,7 +339,8 @@ where ); } - /// Checks a Region with split and bucket checkers to produce split keys and buckets keys and generates split admin command. + /// Checks a Region with split and bucket checkers to produce split keys and + /// buckets keys and generates split admin command. fn check_split_and_bucket( &mut self, region: &Region, @@ -553,7 +554,8 @@ where if bucket_range_idx == bucket_range_list.len() { skip_check_bucket = true; } else if origin_key >= bucket_range_list[bucket_range_idx].0.as_slice() { - // e.key() is between bucket_range_list[bucket_range_idx].0, bucket_range_list[bucket_range_idx].1 + // e.key() is between bucket_range_list[bucket_range_idx].0, + // bucket_range_list[bucket_range_idx].1 bucket_size += e.entry_size() as u64; if bucket_size >= host.region_bucket_size() { bucket.keys.push(origin_key.to_vec()); @@ -580,7 +582,8 @@ where } } - // if we scan the whole range, we can update approximate size and keys with accurate value. + // if we scan the whole range, we can update approximate size and keys with + // accurate value. if is_key_range { return; } diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 4fe00fff448..7857ae10d8e 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -14,22 +14,35 @@ pub(crate) const DEFAULT_SAMPLE_NUM: usize = 20; const DEFAULT_QPS_THRESHOLD: usize = 3000; const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; -// We get balance score by abs(sample.left-sample.right)/(sample.right+sample.left). 
It will be used to measure left and right balance +// We get balance score by +// abs(sample.left-sample.right)/(sample.right+sample.left). It will be used to +// measure left and right balance const DEFAULT_SPLIT_BALANCE_SCORE: f64 = 0.25; -// We get contained score by sample.contained/(sample.right+sample.left+sample.contained). It will be used to avoid to split regions requested by range. +// We get contained score by +// sample.contained/(sample.right+sample.left+sample.contained). It will be used +// to avoid to split regions requested by range. const DEFAULT_SPLIT_CONTAINED_SCORE: f64 = 0.5; -// If the `split_balance_score` and `split_contained_score` above could not be satisfied, we will try to split the region according to its CPU load, +// If the `split_balance_score` and `split_contained_score` above could not be +// satisfied, we will try to split the region according to its CPU load, // then these parameters below will start to work. -// When the gRPC poll thread CPU usage (over the past `detect_times` seconds by default) is higher than gRPC poll thread count * `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, -// the CPU-based split won't be triggered no matter if the `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO` and `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` are exceeded -// to prevent from increasing the gRPC poll CPU usage. +// When the gRPC poll thread CPU usage (over the past `detect_times` seconds by +// default) is higher than gRPC poll thread count * +// `DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, the CPU-based split won't +// be triggered no matter if the +// `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO` and +// `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` are exceeded to prevent from increasing +// the gRPC poll CPU usage. 
const DEFAULT_GRPC_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.5; -// When the Unified Read Poll thread CPU usage is higher than Unified Read Poll thread count * `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, +// When the Unified Read Poll thread CPU usage is higher than Unified Read Poll +// thread count * +// `DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO`, // the CPU-based split will try to check and record the top hot CPU region. const DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.8; -// When the Unified Read Poll is hot and the region's CPU usage reaches `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read Poll, -// it will be added into the hot region list and may be split later as the top hot CPU region. +// When the Unified Read Poll is hot and the region's CPU usage reaches +// `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read +// Poll, it will be added into the hot region list and may be split later as the +// top hot CPU region. pub(crate) const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; lazy_static! { diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 013ac705be9..0f15bcc4805 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -60,7 +60,8 @@ where } // This function uses the distributed/parallel reservoir sampling algorithm. -// It will sample min(sample_num, all_key_ranges_num) key ranges from multiple `key_ranges_provider` with the same possibility. +// It will sample min(sample_num, all_key_ranges_num) key ranges from multiple +// `key_ranges_provider` with the same possibility. fn sample( sample_num: usize, mut key_ranges_providers: Vec, @@ -72,7 +73,8 @@ where let mut sampled_key_ranges = vec![]; // Retain the non-empty key ranges. 
// `key_ranges_provider` may return an empty key ranges vector, which will cause - // the later sampling to fall into a dead loop. So we need to filter it out here. + // the later sampling to fall into a dead loop. So we need to filter it out + // here. key_ranges_providers .retain_mut(|key_ranges_provider| !key_ranges_getter(key_ranges_provider).is_empty()); if key_ranges_providers.is_empty() { @@ -109,8 +111,9 @@ where // Generate a random number in [1, all_key_ranges_num]. // Starting from 1 is to achieve equal probability. // For example, for a `prefix_sum` like [1, 2, 3, 4], - // if we generate a random number in [0, 4], the probability of choosing the first index is 0.4 - // rather than 0.25 due to that 0 and 1 will both make `binary_search` get the same result. + // if we generate a random number in [0, 4], the probability of choosing the + // first index is 0.4 rather than 0.25 due to that 0 and 1 will both + // make `binary_search` get the same result. let i = prefix_sum .binary_search(&rng.gen_range(1..=all_key_ranges_num)) .unwrap_or_else(|i| i); @@ -170,7 +173,8 @@ impl From> for Samples { } impl Samples { - // evaluate the samples according to the given key range, it will update the sample's left, right and contained counter. + // evaluate the samples according to the given key range, it will update the + // sample's left, right and contained counter. fn evaluate(&mut self, key_range: &KeyRange) { for mut sample in self.0.iter_mut() { let order_start = if key_range.start_key.is_empty() { @@ -210,8 +214,9 @@ impl Samples { } let evaluated_key_num = (sample.contained + evaluated_key_num_lr) as f64; - // The balance score is the difference in the number of requested keys between the left and right of a sample key. - // The smaller the balance score, the more balanced the load will be after this splitting. + // The balance score is the difference in the number of requested keys between + // the left and right of a sample key. 
The smaller the balance + // score, the more balanced the load will be after this splitting. let balance_score = (sample.left as f64 - sample.right as f64).abs() / evaluated_key_num_lr as f64; LOAD_BASE_SPLIT_SAMPLE_VEC @@ -222,8 +227,9 @@ impl Samples { continue; } - // The contained score is the ratio of a sample key that are contained in the requested key. - // The larger the contained score, the more RPCs the cluster will receive after this splitting. + // The contained score is the ratio of a sample key that are contained in the + // requested key. The larger the contained score, the more RPCs the + // cluster will receive after this splitting. let contained_score = sample.contained as f64 / evaluated_key_num; LOAD_BASE_SPLIT_SAMPLE_VEC .with_label_values(&["contained_score"]) @@ -233,8 +239,9 @@ impl Samples { continue; } - // We try to find a split key that has the smallest balance score and the smallest contained score - // to make the splitting keep the load balanced while not increasing too many RPCs. + // We try to find a split key that has the smallest balance score and the + // smallest contained score to make the splitting keep the load + // balanced while not increasing too many RPCs. let final_score = balance_score + contained_score; if final_score < best_score { best_index = index as i32; @@ -295,13 +302,14 @@ impl Recorder { // collect the split keys from the recorded key_ranges. // This will start a second-level sampling on the previous sampled key ranges, - // evaluate the samples according to the given key range, and compute the split keys finally. + // evaluate the samples according to the given key range, and compute the split + // keys finally. 
fn collect(&self, config: &SplitConfig) -> Vec { let sampled_key_ranges = sample(config.sample_num, self.key_ranges.clone(), |x| x); let mut samples = Samples::from(sampled_key_ranges); let recorded_key_ranges: Vec<&KeyRange> = self.key_ranges.iter().flatten().collect(); - // Because we need to observe the number of `no_enough_key` of all the actual keys, - // so we do this check after the samples are calculated. + // Because we need to observe the number of `no_enough_key` of all the actual + // keys, so we do this check after the samples are calculated. if (recorded_key_ranges.len() as u64) < config.sample_threshold { LOAD_BASE_SPLIT_EVENT .no_enough_sampled_key @@ -315,8 +323,8 @@ impl Recorder { } } -// RegionInfo will maintain key_ranges with sample_num length by reservoir sampling. -// And it will save qps num and peer. +// RegionInfo will maintain key_ranges with sample_num length by reservoir +// sampling. And it will save qps num and peer. #[derive(Debug, Clone)] pub struct RegionInfo { pub sample_num: usize, @@ -378,7 +386,8 @@ pub struct ReadStats { // 2. add_query_num_batch // 3. add_flow // Among these three methods, `add_flow` will not update `key_ranges` of `RegionInfo`, - // and due to this, an `RegionInfo` without `key_ranges` may occur. The caller should be aware of this. + // and due to this, an `RegionInfo` without `key_ranges` may occur. The caller should be aware + // of this. pub region_infos: HashMap, pub sample_num: usize, pub region_buckets: HashMap, @@ -525,7 +534,8 @@ impl SplitInfo { } // Create a SplitInfo with the given region_id, peer, start_key and end_key. - // This is used to split the region on half within the specified start and end keys later. + // This is used to split the region on half within the specified start and end + // keys later. 
fn with_start_end_key( region_id: u64, peer: Peer, @@ -643,7 +653,8 @@ impl AutoSplitController { >= self.cfg.region_cpu_overload_threshold_ratio } - // collect the read stats from read_stats_vec and dispatch them to a Region HashMap. + // collect the read stats from read_stats_vec and dispatch them to a Region + // HashMap. fn collect_read_stats(read_stats_vec: Vec) -> HashMap> { // RegionID -> Vec, collect the RegionInfo from different threads. let mut region_infos_map = HashMap::default(); @@ -659,12 +670,14 @@ impl AutoSplitController { region_infos_map } - // collect the CPU stats from cpu_stats_vec and dispatch them to a Region HashMap. + // collect the CPU stats from cpu_stats_vec and dispatch them to a Region + // HashMap. fn collect_cpu_stats( &self, cpu_stats_vec: Vec>, ) -> HashMap)> { - // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its hottest key range. + // RegionID -> (CPU usage, Hottest Key Range), calculate the CPU usage and its + // hottest key range. let mut region_cpu_map = HashMap::default(); if !self.should_check_region_cpu() { return region_cpu_map; @@ -730,8 +743,8 @@ impl AutoSplitController { / 100.0 } - // flush the read stats info into the recorder and check if the region needs to be split - // according to all the stats info the recorder has collected before. + // flush the read stats info into the recorder and check if the region needs to + // be split according to all the stats info the recorder has collected before. pub fn flush( &mut self, read_stats_vec: Vec, diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index ef683724429..e1c23652db8 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -44,8 +44,8 @@ pub struct AdvanceTsWorker { timer: SteadyTimer, worker: Runtime, scheduler: Scheduler>, - /// The concurrency manager for transactions. It's needed for CDC to check locks when - /// calculating resolved_ts. 
+ /// The concurrency manager for transactions. It's needed for CDC to check + /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, // store_id -> client tikv_clients: Arc>>, @@ -104,8 +104,8 @@ impl AdvanceTsWorker { // Ignore get tso errors since we will retry every `advance_ts_interval`. let mut min_ts = pd_client.get_tso().await.unwrap_or_default(); - // Sync with concurrency manager so that it can work correctly when optimizations - // like async commit is enabled. + // Sync with concurrency manager so that it can work correctly when + // optimizations like async commit is enabled. // Note: This step must be done before scheduling `Task::MinTS` task, and the // resolver must be checked in or after `Task::MinTS`' execution. cm.update_max_ts(min_ts); @@ -153,8 +153,9 @@ impl AdvanceTsWorker { } // Confirms leadership of region peer before trying to advance resolved ts. -// This function broadcasts a special message to all stores, gets the leader id of them to confirm whether -// current peer has a quorum which accepts its leadership. +// This function broadcasts a special message to all stores, gets the leader id +// of them to confirm whether current peer has a quorum which accepts its +// leadership. pub async fn region_resolved_ts_store( regions: Vec, store_meta: Arc>, @@ -290,7 +291,8 @@ pub async fn region_resolved_ts_store( .observe(start.saturating_elapsed_secs()); }); for _ in 0..store_count { - // Use `select_all` to avoid the process getting blocked when some TiKVs were down. + // Use `select_all` to avoid the process getting blocked when some TiKVs were + // down. 
let (res, _, remains) = select_all(stores).await; stores = remains; match res { diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index f561aa07e28..277a31e2001 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -135,7 +135,8 @@ impl ChangeLog { pub(crate) fn decode_write(key: &[u8], value: &[u8], is_apply: bool) -> Option { let write = WriteRef::parse(value).ok()?.to_owned(); - // Drop the record it self but keep only the overlapped rollback information if gc_fence exists. + // Drop the record it self but keep only the overlapped rollback information if + // gc_fence exists. if is_apply && write.gc_fence.is_some() { // `gc_fence` is set means the write record has been rewritten. // Currently the only case is writing overlapped_rollback. And in this case @@ -191,7 +192,8 @@ struct RowChange { fn group_row_changes(requests: Vec) -> HashMap { let mut changes: HashMap = HashMap::default(); - // The changes about default cf was recorded here and need to be matched with a `write` or a `lock`. + // The changes about default cf was recorded here and need to be matched with a + // `write` or a `lock`. 
let mut unmatched_default = HashMap::default(); for mut req in requests { match req.get_cmd_type() { @@ -254,8 +256,8 @@ fn group_row_changes(requests: Vec) -> HashMap { changes } -/// Filter non-lock related data (i.e `default_cf` data), the implement is subject to -/// how `group_row_changes` and `encode_rows` encode `ChangeRow` +/// Filter non-lock related data (i.e `default_cf` data), the implement is +/// subject to how `group_row_changes` and `encode_rows` encode `ChangeRow` pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { if cmd_batch.is_empty() { return None; diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 90e3a3b7912..5a180a9b6c8 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -63,8 +63,8 @@ enum PendingLock { } // Records information related to observed region. -// observe_id is used for avoiding ABA problems in incremental scan task, advance resolved ts task, -// and command observing. +// observe_id is used for avoiding ABA problems in incremental scan task, +// advance resolved ts task, and command observing. struct ObserveRegion { meta: Region, handle: ObserveHandle, @@ -106,8 +106,9 @@ impl ObserveRegion { continue; } ChangeLog::Admin(req_type) => { - // TODO: for admin cmd that won't change the region meta like peer list and key range - // (i.e. `CompactLog`, `ComputeHash`) we may not need to return error + // TODO: for admin cmd that won't change the region meta like peer list + // and key range (i.e. 
`CompactLog`, `ComputeHash`) we may not need to + // return error return Err(format!( "region met admin command {:?} while initializing resolver", req_type @@ -167,8 +168,9 @@ impl ObserveRegion { "region met split/merge command, stop tracking since key range changed, wait for re-register"; "req_type" => ?req_type, ); - // Stop tracking so that `tracked_index` larger than the split/merge command index won't be published - // untill `RegionUpdate` event trigger the region re-register and re-scan the new key range + // Stop tracking so that `tracked_index` larger than the split/merge + // command index won't be published until `RegionUpdate` event + // trigger the region re-register and re-scan the new key range self.resolver.stop_tracking(); } _ => { @@ -421,15 +423,17 @@ where return; } // TODO: may not need to re-register region for some cases: - // - `Split/BatchSplit`, which can be handled by remove out-of-range locks from the `Resolver`'s lock heap + // - `Split/BatchSplit`, which can be handled by remove out-of-range locks from + // the `Resolver`'s lock heap // - `PrepareMerge` and `RollbackMerge`, the key range is unchanged self.deregister_region(region_id); self.register_region(incoming_region); } } - // This function is corresponding to RegionDestroyed event that can be only scheduled by observer. - // To prevent destroying region for wrong peer, it should check the region epoch at first. + // This function is corresponding to RegionDestroyed event that can be only + // scheduled by observer. To prevent destroying region for wrong peer, it + // should check the region epoch at first. fn region_destroyed(&mut self, region: Region) { if let Some(observe_region) = self.regions.get(®ion.id) { if util::compare_region_epoch( @@ -501,7 +505,8 @@ where self.sinker.sink_resolved_ts(regions, ts); } - // Tracking or untracking locks with incoming commands that corresponding observe id is valid. 
+ // Tracking or untracking locks with incoming commands that corresponding + // observe id is valid. #[allow(clippy::drop_ref)] fn handle_change_log( &mut self, @@ -566,7 +571,8 @@ where } fn register_advance_event(&self, cfg_version: usize) { - // Ignore advance event that registered with previous `advance_ts_interval` config + // Ignore advance event that registered with previous `advance_ts_interval` + // config if self.cfg_version != cfg_version { return; } diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index 172efbb9c18..5ad2941dde2 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -1,13 +1,16 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Resolved TS is a timestamp that represents the lower bonud of incoming Commit TS +//! Resolved TS is a timestamp that represents the lower bound of incoming +//! Commit TS // and the upper bound of outgoing Commit TS. -//! Through this timestamp we can get a consistent view in the transaction level. +//! Through this timestamp we can get a consistent view in the transaction +//! level. //! //! To maintain a correct Resolved TS, these premises must be satisfied: -//! 1. Tracing all locks in the region, use the minimal Start TS as Resolved TS. -//! 2. If there is not any lock, use the latest timestamp as Resolved TS. -//! 3. Resolved TS must be advanced by the region leader after it has applied on its term. +//! - Tracing all locks in the region, use the minimal Start TS as Resolved TS. +//! - If there is not any lock, use the latest timestamp as Resolved TS. +//! - Resolved TS must be advanced by the region leader after it has applied on +//! its term. 
#![feature(box_patterns)] #![feature(result_flattening)] diff --git a/components/resolved_ts/src/observer.rs b/components/resolved_ts/src/observer.rs index c9decaadc77..9ff7b976ad4 100644 --- a/components/resolved_ts/src/observer.rs +++ b/components/resolved_ts/src/observer.rs @@ -18,8 +18,9 @@ impl Observer { } pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a low priority - // to let it be the last observer and avoid affecting other observers + // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a + // low priority to let it be the last observer and avoid affecting other + // observers coprocessor_host .registry .register_cmd_observer(1000, BoxCmdObserver::new(self.clone())); @@ -84,7 +85,8 @@ impl CmdObserver for Observer { impl RoleObserver for Observer { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { // Stop to advance resolved ts after peer steps down to follower or candidate. - // Do not need to check observe id because we expect all role change events are scheduled in order. + // Do not need to check observe id because we expect all role change events are + // scheduled in order. 
if role_change.state != StateRole::Leader { if let Err(e) = self.scheduler.schedule(Task::DeRegisterRegion { region_id: ctx.region().id, @@ -102,9 +104,9 @@ impl RegionChangeObserver for Observer { event: RegionChangeEvent, role: StateRole, ) { - // If the peer is not leader, it must has not registered the observe region or it is deregistering - // the observe region, so don't need to send `RegionUpdated`/`RegionDestroyed` to update the observe - // region + // If the peer is not leader, it must has not registered the observe region or + // it is deregistering the observe region, so don't need to send + // `RegionUpdated`/`RegionDestroyed` to update the observe region if role != StateRole::Leader { return; } diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 12c7cbe0c56..f1518784a33 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -181,7 +181,8 @@ impl Resolver { self.raw_lock_ts_heap.push(Reverse(ts)); } - // untrack all timestamps smaller than input ts, depend on the raw ts in one region is non-decreasing + // untrack all timestamps smaller than input ts, depend on the raw ts in one + // region is non-decreasing pub fn raw_untrack_lock(&mut self, ts: TimeStamp) { debug!("raw untrack ts before {}, region {}", ts, self.region_id); while let Some(&Reverse(min_ts)) = self.raw_lock_ts_heap.peek() { @@ -197,7 +198,8 @@ impl Resolver { /// `min_ts` advances the resolver even if there is no write. /// Return None means the resolver is not initialized. 
pub fn resolve(&mut self, min_ts: TimeStamp) -> ResolvedTs { - // The `Resolver` is stopped, not need to advance, just return the current `resolved_ts` + // The `Resolver` is stopped, not need to advance, just return the current + // `resolved_ts` if self.stopped { return self.resolved_ts; } diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 835de79c161..396fc7333da 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -212,7 +212,8 @@ impl, E: KvEngine> ScannerPool { let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { let err = resp.response.take_header().take_error(); - // These two errors can't handled by retrying since the epoch and observe id is unchanged + // These two errors can't handled by retrying since the epoch and observe id is + // unchanged if err.has_epoch_not_match() || err.get_message().contains("stale observe id") { return Err(Error::request(err)); } diff --git a/components/resource_metering/src/collector.rs b/components/resource_metering/src/collector.rs index 9e1830b8acb..bdadd638f2e 100644 --- a/components/resource_metering/src/collector.rs +++ b/components/resource_metering/src/collector.rs @@ -15,7 +15,8 @@ use crate::RawRecords; /// to the `Scheduler` for processing. /// /// `Reporter` implements [Runnable] and [RunnableWithTimer], aggregates the -/// data sent by the `Collector` internally, and reports it regularly through RPC. +/// data sent by the `Collector` internally, and reports it regularly through +/// RPC. 
/// /// [Recorder]: crate::recorder::Recorder /// [Reporter]: crate::reporter::Reporter diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index 9c1f25e4b0c..bd64d7202ae 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -51,9 +51,9 @@ pub const MAX_THREAD_REGISTER_RETRY: u32 = 10; /// This structure is used as a label to distinguish different request contexts. /// -/// In order to associate `ResourceMeteringTag` with a certain piece of code logic, -/// we added a function to [Future] to bind `ResourceMeteringTag` to the specified -/// future context. It is used in the main business logic of TiKV. +/// In order to associate `ResourceMeteringTag` with a certain piece of code +/// logic, we added a function to [Future] to bind `ResourceMeteringTag` to the +/// specified future context. It is used in the main business logic of TiKV. /// /// [Future]: futures::Future pub struct ResourceMeteringTag { @@ -214,14 +214,15 @@ impl ResourceTagFactory { /// This trait extends the standard [Future]. /// -/// When the user imports [FutureExt], all futures in its module (such as async block) -/// will additionally support the [FutureExt::in_resource_metering_tag] method. This method -/// can bind a [ResourceMeteringTag] to the scope of this future (actually, it is stored in -/// the local storage of the thread where `Future` is located). During the polling period of -/// the future, we can continue to observe the system resources used by the thread in which -/// it is located, which is associated with `ResourceMeteringTag` and is also stored in thread -/// local storage. There is a background thread that continuously summarizes the storage of -/// each thread and reports it regularly. +/// When the user imports [FutureExt], all futures in its module (such as async +/// block) will additionally support the [FutureExt::in_resource_metering_tag] +/// method. 
This method can bind a [ResourceMeteringTag] to the scope of this +/// future (actually, it is stored in the local storage of the thread where +/// `Future` is located). During the polling period of the future, we can +/// continue to observe the system resources used by the thread in which it is +/// located, which is associated with `ResourceMeteringTag` and is also stored +/// in thread local storage. There is a background thread that continuously +/// summarizes the storage of each thread and reports it regularly. /// /// [Future]: futures::Future pub trait FutureExt: Sized { @@ -245,8 +246,9 @@ pub trait StreamExt: Sized { impl StreamExt for T {} -/// This structure is the return value of the [FutureExt::in_resource_metering_tag] method, -/// which wraps the original [Future] with a [ResourceMeteringTag]. +/// This structure is the return value of the +/// [FutureExt::in_resource_metering_tag] method, which wraps the original +/// [Future] with a [ResourceMeteringTag]. /// /// see [FutureExt] for more information. /// diff --git a/components/resource_metering/src/model.rs b/components/resource_metering/src/model.rs index 0cacc6930d4..1359e6c3a45 100644 --- a/components/resource_metering/src/model.rs +++ b/components/resource_metering/src/model.rs @@ -71,7 +71,8 @@ impl Default for RawRecords { } impl RawRecords { - /// Keep a maximum of `k` self.records and aggregate the others into returned [RawRecord]. + /// Keep a maximum of `k` self.records and aggregate the others into + /// returned [RawRecord]. 
pub fn keep_top_k(&mut self, k: usize) -> RawRecord { let mut others = RawRecord::default(); if self.records.len() <= k { diff --git a/components/resource_metering/src/recorder/collector_reg.rs b/components/resource_metering/src/recorder/collector_reg.rs index 8205a2290cb..f166101dfe5 100644 --- a/components/resource_metering/src/recorder/collector_reg.rs +++ b/components/resource_metering/src/recorder/collector_reg.rs @@ -30,16 +30,16 @@ impl CollectorRegHandle { } } - /// Register a collector to the recorder. Dropping the returned [CollectorGuard] will - /// preform deregistering. + /// Register a collector to the recorder. Dropping the returned + /// [CollectorGuard] will preform deregistering. /// - /// The second argument `as_observer` indicates that whether the given `collector` will - /// control the enabled state of the recorder: - /// - When `as_observer` is false, the recorder will respect it and begin to profile if it's - /// off before. In other words, if there is at least one non-observed collector, the recorder - /// will keep running. - /// - When `as_observer` is true, whether the recorder to be on or off won't depend on if - /// the collector exists. + /// The second argument `as_observer` indicates that whether the given + /// `collector` will control the enabled state of the recorder: + /// - When `as_observer` is false, the recorder will respect it and begin to + /// profile if it's off before. In other words, if there is at least one + /// non-observed collector, the recorder will keep running. + /// - When `as_observer` is true, whether the recorder to be on or off won't + /// depend on if the collector exists. 
pub fn register(&self, collector: Box, as_observer: bool) -> CollectorGuard { static NEXT_COLLECTOR_ID: AtomicU64 = AtomicU64::new(1); let id = CollectorId(NEXT_COLLECTOR_ID.fetch_add(1, Ordering::SeqCst)); diff --git a/components/resource_metering/src/recorder/localstorage.rs b/components/resource_metering/src/recorder/localstorage.rs index afc9554a212..c9f0b25b478 100644 --- a/components/resource_metering/src/recorder/localstorage.rs +++ b/components/resource_metering/src/recorder/localstorage.rs @@ -16,10 +16,11 @@ thread_local! { pub static STORAGE: RefCell = RefCell::new(LocalStorage::default()); } -/// `LocalStorage` is a thread-local structure that contains all necessary data of submodules. +/// `LocalStorage` is a thread-local structure that contains all necessary data +/// of submodules. /// -/// In order to facilitate mutual reference, the thread-local data of all sub-modules -/// need to be stored centrally in `LocalStorage`. +/// In order to facilitate mutual reference, the thread-local data of all +/// sub-modules need to be stored centrally in `LocalStorage`. #[derive(Clone, Default)] pub struct LocalStorage { pub registered: bool, diff --git a/components/resource_metering/src/recorder/mod.rs b/components/resource_metering/src/recorder/mod.rs index 92e6d094274..9ed6acfb74f 100644 --- a/components/resource_metering/src/recorder/mod.rs +++ b/components/resource_metering/src/recorder/mod.rs @@ -288,8 +288,9 @@ impl ConfigChangeNotifier { } } -/// Constructs a default [Recorder], spawn it and return the corresponding [ConfigChangeNotifier], -/// [CollectorRegHandle], [ResourceTagFactory] and [LazyWorker]. +/// Constructs a default [Recorder], spawn it and return the corresponding +/// [ConfigChangeNotifier], [CollectorRegHandle], [ResourceTagFactory] and +/// [LazyWorker]. /// /// This function is intended to simplify external use. 
pub fn init_recorder( diff --git a/components/resource_metering/src/recorder/sub_recorder/mod.rs b/components/resource_metering/src/recorder/sub_recorder/mod.rs index e36acb26ddb..42647f3486d 100644 --- a/components/resource_metering/src/recorder/sub_recorder/mod.rs +++ b/components/resource_metering/src/recorder/sub_recorder/mod.rs @@ -8,19 +8,22 @@ use crate::{recorder::localstorage::LocalStorage, RawRecords}; pub mod cpu; pub mod summary; -/// This trait defines a general framework that works at a certain frequency. Typically, -/// it describes the recorder(sampler) framework for a specific resource. +/// This trait defines a general framework that works at a certain frequency. +/// Typically, it describes the recorder(sampler) framework for a specific +/// resource. /// -/// [Recorder] will maintain a list of sub-recorders, driving all sub-recorders to work -/// according to the behavior described in this trait. +/// [Recorder] will maintain a list of sub-recorders, driving all sub-recorders +/// to work according to the behavior described in this trait. pub trait SubRecorder: Send { - /// This function is called at a fixed frequency. (A typical frequency is 99hz.) + /// This function is called at a fixed frequency. (A typical frequency is + /// 99hz.) /// - /// The [RawRecords] and [LocalStorage] map of all threads will be passed in through - /// parameters. We need to collect resources (may be from each `LocalStorage`) and - /// write them into `RawRecords`. + /// The [RawRecords] and [LocalStorage] map of all threads will be passed in + /// through parameters. We need to collect resources (may be from each + /// `LocalStorage`) and write them into `RawRecords`. /// - /// The implementation needs to sample the resource in this function (in general). + /// The implementation needs to sample the resource in this function (in + /// general). 
/// /// [RawRecords]: crate::model::RawRecords /// [LocalStorage]: crate::localstorage::LocalStorage @@ -30,8 +33,8 @@ pub trait SubRecorder: Send { /// This function is called every time before reporting to Collector. /// The default period is 1 second. /// - /// The [RawRecords] and [LocalStorage] map of all threads will be passed in through parameters. - /// `usize` is thread_id without platform dependency. + /// The [RawRecords] and [LocalStorage] map of all threads will be passed in + /// through parameters. `usize` is thread_id without platform dependency. /// /// [RawRecords]: crate::model::RawRecords /// [LocalStorage]: crate::localstorage::LocalStorage diff --git a/components/resource_metering/src/recorder/sub_recorder/summary.rs b/components/resource_metering/src/recorder/sub_recorder/summary.rs index 34cf07f9caf..93ba95080e3 100644 --- a/components/resource_metering/src/recorder/sub_recorder/summary.rs +++ b/components/resource_metering/src/recorder/sub_recorder/summary.rs @@ -35,8 +35,9 @@ pub fn record_write_keys(count: u32) { /// An implementation of [SubRecorder] for collecting summary data. /// -/// `SummaryRecorder` uses some special methods ([record_read_keys]/[record_write_keys]) -/// to collect external statistical information. +/// `SummaryRecorder` uses some special methods +/// ([record_read_keys]/[record_write_keys]) to collect external statistical +/// information. /// /// See [SubRecorder] for more relevant designs. /// @@ -59,7 +60,8 @@ impl SubRecorder for SummaryRecorder { } // The request currently being polled has not yet been merged into the hashmap, // so it needs to be processed separately. 
(For example, a slow request that is - // blocking needs to reflect in real time how many keys have been read currently) + // blocking needs to reflect in real time how many keys have been read + // currently) if let Some(t) = ls.attached_tag.load_full() { if t.extra_attachment.is_empty() { return; diff --git a/components/resource_metering/src/reporter/data_sink.rs b/components/resource_metering/src/reporter/data_sink.rs index 1dadc2723bc..e453bdd3371 100644 --- a/components/resource_metering/src/reporter/data_sink.rs +++ b/components/resource_metering/src/reporter/data_sink.rs @@ -9,7 +9,8 @@ use crate::error::Result; /// This trait abstracts the interface to communicate with the remote. /// We can simply mock this interface to test without RPC. pub trait DataSink: Send { - // `try_send` pushes a report data into the sink, which will later be sent to a target - // by the sink. If the sink is kept full, or the sink is closed, an error will be returned. + // `try_send` pushes a report data into the sink, which will later be sent to a + // target by the sink. If the sink is kept full, or the sink is closed, an error + // will be returned. fn try_send(&mut self, records: Arc>) -> Result<()>; } diff --git a/components/resource_metering/src/reporter/mod.rs b/components/resource_metering/src/reporter/mod.rs index 024a79bde53..721fb570b22 100644 --- a/components/resource_metering/src/reporter/mod.rs +++ b/components/resource_metering/src/reporter/mod.rs @@ -30,9 +30,9 @@ use crate::{ /// A structure for reporting statistics through [Client]. /// -/// `Reporter` implements [Runnable] and [RunnableWithTimer] to handle [Task]s from -/// the [Scheduler]. It internally aggregates the reported [RawRecords] into [Records] -/// and upload them to the remote server through the `Client`. +/// `Reporter` implements [Runnable] and [RunnableWithTimer] to handle [Task]s +/// from the [Scheduler]. 
It internally aggregates the reported [RawRecords] +/// into [Records] and upload them to the remote server through the `Client`. /// /// [Runnable]: tikv_util::worker::Runnable /// [RunnableWithTimer]: tikv_util::worker::RunnableWithTimer @@ -205,7 +205,8 @@ impl ConfigChangeNotifier { } } -/// Constructs a default [Recorder], start it and return the corresponding [ConfigChangeNotifier], [DataSinkRegHandle] and [LazyWorker]. +/// Constructs a default [Recorder], start it and return the corresponding +/// [ConfigChangeNotifier], [DataSinkRegHandle] and [LazyWorker]. /// /// This function is intended to simplify external use. pub fn init_reporter( diff --git a/components/resource_metering/src/reporter/pubsub.rs b/components/resource_metering/src/reporter/pubsub.rs index 0112a8b17db..62144ec920c 100644 --- a/components/resource_metering/src/reporter/pubsub.rs +++ b/components/resource_metering/src/reporter/pubsub.rs @@ -22,8 +22,9 @@ use crate::{ /// `PubSubService` implements [ResourceMeteringPubSub]. /// -/// If a client subscribes to resource metering records, the `PubSubService` is responsible for -/// registering them to the reporter. Then the reporter sends data to the client periodically. +/// If a client subscribes to resource metering records, the `PubSubService` is +/// responsible for registering them to the reporter. Then the reporter sends +/// data to the client periodically. 
/// /// [ResourceMeteringPubSub]: kvproto::resource_usage_agent_grpc::ResourceMeteringPubSub #[derive(Clone)] diff --git a/components/resource_metering/src/reporter/single_target.rs b/components/resource_metering/src/reporter/single_target.rs index 69817bc847b..09609b84462 100644 --- a/components/resource_metering/src/reporter/single_target.rs +++ b/components/resource_metering/src/reporter/single_target.rs @@ -41,8 +41,8 @@ impl Runnable for SingleTargetDataSink { } } -/// `SingleTargetDataSink` is the default implementation of [DataSink], which uses gRPC -/// to report data to the remote end. +/// `SingleTargetDataSink` is the default implementation of [DataSink], which +/// uses gRPC to report data to the remote end. pub struct SingleTargetDataSink { scheduler: Scheduler, data_sink_reg: DataSinkRegHandle, @@ -246,8 +246,8 @@ impl Drop for Guard { } } -/// Constructs a default [SingleTargetDataSink], start it and return the corresponding [AddressChangeNotifier] -/// and [LazyWorker]. +/// Constructs a default [SingleTargetDataSink], start it and return the +/// corresponding [AddressChangeNotifier] and [LazyWorker]. /// /// This function is intended to simplify external use. pub fn init_single_target( diff --git a/components/resource_metering/tests/summary_test.rs b/components/resource_metering/tests/summary_test.rs index c5a9ae61ac3..ae647055206 100644 --- a/components/resource_metering/tests/summary_test.rs +++ b/components/resource_metering/tests/summary_test.rs @@ -53,7 +53,7 @@ fn test_summary() { let data_sink = MockDataSink::default(); - /* At this point we are ready for everything except turning on the switch. */ + // At this point we are ready for everything except turning on the switch. 
// expect no data { diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index ed5ff0d1fa4..d984ccb353d 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -41,7 +41,8 @@ pub struct SecurityConfig { /// /// # Arguments /// -/// - `tag`: only used in the error message, like "ca key", "cert key", "private key", etc. +/// - `tag`: only used in the error message, like "ca key", "cert key", +/// "private key", etc. fn check_key_file(tag: &str, path: &str) -> Result, Box> { if path.is_empty() { return Ok(None); diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index bf06ecefcea..7ada07d5206 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -158,7 +158,8 @@ fn run_dump_raftdb_worker( let mut state = RaftLocalState::default(); state.merge_from_bytes(value)?; batch.put_raft_state(region_id, &state).unwrap(); - // Assume that we always scan entry first and raft state at the end. + // Assume that we always scan entry first and raft state at the + // end. batch .append(region_id, std::mem::take(&mut entries)) .unwrap(); @@ -237,8 +238,8 @@ mod tests { // Dump logs from RocksEngine to RaftLogEngine. let raft_engine = RaftLogEngine::new( cfg.raft_engine.config(), - None, /*key_manager*/ - None, /*io_rate_limiter*/ + None, // key_manager + None, // io_rate_limiter ) .expect("open raft engine"); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ad788f2ecec..73269c3f07a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -2,13 +2,14 @@ //! This module startups all the components of a TiKV server. //! -//! It is responsible for reading from configs, starting up the various server components, -//! and handling errors (mostly by aborting and reporting to the user). +//! It is responsible for reading from configs, starting up the various server +//! 
components, and handling errors (mostly by aborting and reporting to the +//! user). //! //! The entry point is `run_tikv`. //! -//! Components are often used to initialize other components, and/or must be explicitly stopped. -//! We keep these components in the `TiKvServer` struct. +//! Components are often used to initialize other components, and/or must be +//! explicitly stopped. We keep these components in the `TiKvServer` struct. use std::{ cmp, @@ -363,7 +364,8 @@ impl TiKvServer { /// /// # Fatal errors /// - /// - If `dynamic config` feature is enabled and failed to register config to PD + /// - If `dynamic config` feature is enabled and failed to register config + /// to PD /// - If some critical configs (like data dir) are differrent from last run /// - If the config can't pass `validate()` /// - If the max open file descriptor limit is not high enough to support @@ -488,9 +490,10 @@ impl TiKvServer { ); } - // We truncate a big file to make sure that both raftdb and kvdb of TiKV have enough space - // to do compaction and region migration when TiKV recover. This file is created in - // data_dir rather than db_path, because we must not increase store size of db_path. + // We truncate a big file to make sure that both raftdb and kvdb of TiKV have + // enough space to do compaction and region migration when TiKV recover. + // This file is created in data_dir rather than db_path, because we must not + // increase store size of db_path. let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); let mut capacity = disk_stats.total_space(); if self.config.raft_store.capacity.0 > 0 { @@ -1017,7 +1020,8 @@ impl TiKvServer { ) .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); - // Start auto gc. Must after `Node::start` because `node_id` is initialized there. + // Start auto gc. Must after `Node::start` because `node_id` is initialized + // there. assert!(node.id() > 0); // Node id should never be 0. 
let auto_gc_config = AutoGcConfig::new( self.pd_client.clone(), @@ -1242,15 +1246,15 @@ impl TiKvServer { self.config .storage .io_rate_limit - .build(!stats_collector_enabled /*enable_statistics*/), + .build(!stats_collector_enabled /* enable_statistics */), ); let fetcher = if stats_collector_enabled { BytesFetcher::FromIOStatsCollector() } else { BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) }; - // Set up IO limiter even when rate limit is disabled, so that rate limits can be - // dynamically applied later on. + // Set up IO limiter even when rate limit is disabled, so that rate limits can + // be dynamically applied later on. set_io_rate_limiter(Some(limiter)); fetcher } @@ -1286,7 +1290,8 @@ impl TiKvServer { }); } - // Only background cpu quota tuning is implemented at present. iops and frontend quota tuning is on the way + // Only background cpu quota tuning is implemented at present. iops and frontend + // quota tuning is on the way fn init_quota_tuning_task(&self, quota_limiter: Arc) { // No need to do auto tune when capacity is really low if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO @@ -1335,9 +1340,11 @@ impl TiKvServer { }; // Try tuning quota when cpu_usage is correctly collected. // rule based tuning: - // 1) if instance is busy, shrink cpu quota for analyze by one quota pace until lower bound is hit; - // 2) if instance cpu usage is healthy, no op; - // 3) if instance is idle, increase cpu quota by one quota pace until upper bound is hit. + // - if instance is busy, shrink cpu quota for analyze by one quota pace until + // lower bound is hit; + // - if instance cpu usage is healthy, no op; + // - if instance is idle, increase cpu quota by one quota pace until upper + // bound is hit. 
if cpu_usage > 0.0f64 { let mut target_quota = old_quota; @@ -1560,7 +1567,7 @@ impl ConfiguredRaftEngine for RocksEngine { let raft_engine = RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) .expect("failed to open raft engine for migration"); - dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /*threads*/); + dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); raft_engine.stop(); drop(raft_engine); raft_data_state_machine.after_dump_data(); @@ -1614,7 +1621,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { raft_cf_opts, ) .expect("failed to open raftdb for migration"); - dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /*threads*/); + dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); raftdb.stop(); drop(raftdb); raft_data_state_machine.after_dump_data(); @@ -1674,7 +1681,7 @@ impl TiKvServer { .register_config(cfg_controller, self.config.storage.block_cache.shared); let engines_info = Arc::new(EnginesResourceInfo::new( - &engines, 180, /*max_samples_to_preserve*/ + &engines, 180, // max_samples_to_preserve )); (engines, engines_info) diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index 3e37d87242c..4f49f6fb86e 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -16,7 +16,8 @@ use tikv_util::{self, config, logger}; // A workaround for checking if log is initialized. pub static LOG_INITIALIZED: AtomicBool = AtomicBool::new(false); -// The info log file names does not end with ".log" since it conflict with rocksdb WAL files. +// The info log file names does not end with ".log" since it conflict with +// rocksdb WAL files. pub const DEFAULT_ROCKSDB_LOG_FILE: &str = "rocksdb.info"; pub const DEFAULT_RAFTDB_LOG_FILE: &str = "raftdb.info"; @@ -33,11 +34,12 @@ macro_rules! fatal { }) } -// TODO: There is a very small chance that duplicate files will be generated if there are -// a lot of logs written in a very short time. 
Consider rename the rotated file with a version -// number while rotate by size. +// TODO: There is a very small chance that duplicate files will be generated if +// there are a lot of logs written in a very short time. Consider rename the +// rotated file with a version number while rotate by size. // -// The file name format after rotated is as follows: "{original name}.{"%Y-%m-%dT%H-%M-%S%.3f"}" +// The file name format after rotated is as follows: +// "{original name}.{"%Y-%m-%dT%H-%M-%S%.3f"}" fn rename_by_timestamp(path: &Path) -> io::Result { let mut new_path = path.parent().unwrap().to_path_buf(); let mut new_fname = path.file_stem().unwrap().to_os_string(); @@ -76,7 +78,8 @@ pub fn initial_logger(config: &TiKvConfig) { let rocksdb_info_log_path = if !config.rocksdb.info_log_dir.is_empty() { make_engine_log_path(&config.rocksdb.info_log_dir, "", DEFAULT_ROCKSDB_LOG_FILE) } else { - // Don't use `DEFAULT_ROCKSDB_SUB_DIR`, because of the logic of `RocksEngine::exists`. + // Don't use `DEFAULT_ROCKSDB_SUB_DIR`, because of the logic of + // `RocksEngine::exists`. make_engine_log_path(&config.storage.data_dir, "", DEFAULT_ROCKSDB_LOG_FILE) }; let raftdb_info_log_path = if !config.raftdb.info_log_dir.is_empty() { diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index be93ded1554..60f72052b10 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -302,7 +302,8 @@ impl ImportDir { for meta in metas { match (api_version, meta.api_version) { (cur_version, meta_version) if cur_version == meta_version => continue, - // sometimes client do not know whether ttl is enabled, so a general V1 is accepted as V1ttl + // sometimes client do not know whether ttl is enabled, so a general V1 is accepted + // as V1ttl (ApiVersion::V1ttl, ApiVersion::V1) => continue, // import V1ttl as V1 will immediatly be rejected because it is never correct. 
(ApiVersion::V1, ApiVersion::V1ttl) => return Ok(false), @@ -451,8 +452,9 @@ pub fn path_to_sst_meta>(path: P) -> Result { meta.mut_region_epoch().set_conf_ver(elems[2].parse()?); meta.mut_region_epoch().set_version(elems[3].parse()?); if elems.len() > 4 { - // If we upgrade TiKV from 3.0.x to 4.0.x and higher version, we can not read cf_name from - // the file path, because TiKV 3.0.x does not encode cf_name to path. + // If we upgrade TiKV from 3.0.x to 4.0.x and higher version, we can not read + // cf_name from the file path, because TiKV 3.0.x does not encode + // cf_name to path. meta.set_cf_name(elems[4].to_owned()); } Ok(meta) diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 1d4e2e916dc..356541cebbb 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -326,8 +326,9 @@ impl SstImporter { path.temp.clone(), backend, expected_sha256, - // kv-files needn't are decrypted with KMS when download currently because these files are not encrypted when log-backup. - // It is different from sst-files because sst-files is encrypted when saved with rocksdb env with KMS. + // kv-files needn't are decrypted with KMS when download currently because these files + // are not encrypted when log-backup. It is different from sst-files + // because sst-files is encrypted when saved with rocksdb env with KMS. // to do: support KMS when log-backup and restore point. false, // don't support encrypt for now. @@ -431,7 +432,8 @@ impl SstImporter { } if check_key_in_range(&key, 0, start_key, end_key).is_err() { // key not in range, we can simply skip this key here. - // the client make sure the correct region will download and apply the same file. + // the client make sure the correct region will download and apply the same + // file. 
INPORTER_APPLY_COUNT .with_label_values(&["key_not_in_region"]) .inc(); @@ -573,7 +575,8 @@ impl SstImporter { return Ok(None); } - // range contained the entire SST, no need to iterate, just moving the file is ok + // range contained the entire SST, no need to iterate, just moving the file is + // ok let mut range = Range::default(); range.set_start(start_key); range.set_end(last_key.to_vec()); @@ -844,7 +847,7 @@ mod tests { // Test ImportDir::ingest() let db_path = temp_dir.path().join("db"); - let env = get_env(key_manager.clone(), None /*io_rate_limiter*/).unwrap(); + let env = get_env(key_manager.clone(), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), &[CF_DEFAULT], env); let cases = vec![(0, 10), (5, 15), (10, 20), (0, 100)]; @@ -1363,7 +1366,7 @@ mod tests { .unwrap(); let db_path = temp_dir.path().join("db"); - let env = get_env(Some(key_manager), None /*io_rate_limiter*/).unwrap(); + let env = get_env(Some(key_manager), None /* io_rate_limiter */).unwrap(); let db = new_test_engine_with_env(db_path.to_str().unwrap(), DATA_CFS, env.clone()); let range = importer @@ -1599,8 +1602,8 @@ mod tests { // key3 = "zt9102_r07", value3 = "pqrst", len = 15 // key4 = "zt9102_r13", value4 = "www", len = 13 // total_bytes = (13 + 13 + 15 + 13) + 4 * 8 = 86 - // don't no why each key has extra 8 byte length in raw_key_size(), but it seems tolerable. - // https://docs.rs/rocks/0.1.0/rocks/table_properties/struct.TableProperties.html#method.raw_key_size + // don't no why each key has extra 8 byte length in raw_key_size(), but it seems + // tolerable. 
https://docs.rs/rocks/0.1.0/rocks/table_properties/struct.TableProperties.html#method.raw_key_size assert_eq!(meta_info.total_bytes, 86); assert_eq!(meta_info.total_kvs, 4); diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index 042b430b811..6ba4d892717 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -9,8 +9,8 @@ use file_system::File; use super::Result; /// Prepares the SST file for ingestion. -/// The purpose is to make the ingestion retryable when using the `move_files` option. -/// Things we need to consider here: +/// The purpose is to make the ingestion retryable when using the `move_files` +/// option. Things we need to consider here: /// 1. We need to access the original file on retry, so we should make a clone /// before ingestion. /// 2. `RocksDB` will modified the global seqno of the ingested file, so we need @@ -32,8 +32,9 @@ pub fn prepare_sst_for_ingestion, Q: AsRef>( if Path::new(clone).exists() { file_system::remove_file(clone).map_err(|e| format!("remove {}: {:?}", clone, e))?; } - // always try to remove the file from key manager because the clean up in rocksdb is not atomic, - // thus the file may be deleted but key in key manager is not. + // always try to remove the file from key manager because the clean up in + // rocksdb is not atomic, thus the file may be deleted but key in key + // manager is not. if let Some(key_manager) = encryption_key_manager { key_manager.delete_file(clone)?; } @@ -160,8 +161,8 @@ mod tests { .unwrap(); check_db_with_kvs(&db, CF_DEFAULT, &kvs); assert!(!sst_clone.exists()); - // Since we are not using key_manager in db, simulate the db deleting the file from - // key_manager. + // Since we are not using key_manager in db, simulate the db deleting the file + // from key_manager. 
if let Some(manager) = key_manager { manager.delete_file(sst_clone.to_str().unwrap()).unwrap(); } @@ -180,8 +181,8 @@ mod tests { #[test] fn test_prepare_sst_for_ingestion() { check_prepare_sst_for_ingestion( - None, None, None, /*key_manager*/ - false, /* was encrypted*/ + None, None, None, // key_manager + false, // was encrypted ); } @@ -197,8 +198,8 @@ mod tests { check_prepare_sst_for_ingestion( Some(db_opts), Some(vec![(CF_DEFAULT, cf_opts)]), - None, /*key_manager*/ - false, /*was_encrypted*/ + None, // key_manager + false, // was_encrypted ); } @@ -207,7 +208,7 @@ mod tests { let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); let manager = Arc::new(key_manager.unwrap().unwrap()); - check_prepare_sst_for_ingestion(None, None, Some(&manager), false /*was_encrypted*/); + check_prepare_sst_for_ingestion(None, None, Some(&manager), false /* was_encrypted */); } #[test] @@ -215,6 +216,6 @@ mod tests { let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); let manager = Arc::new(key_manager.unwrap().unwrap()); - check_prepare_sst_for_ingestion(None, None, Some(&manager), true /*was_encrypted*/); + check_prepare_sst_for_ingestion(None, None, Some(&manager), true /* was_encrypted */); } } diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index dfdffd97105..e6622128243 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -52,7 +52,7 @@ pub struct TestSuite { // Retry if encounter error macro_rules! 
retry_req { - ($call_req: expr, $check_resp: expr, $resp:ident, $retry:literal, $timeout:literal) => { + ($call_req:expr, $check_resp:expr, $resp:ident, $retry:literal, $timeout:literal) => { let start = Instant::now(); let timeout = Duration::from_millis($timeout); let mut tried_times = 0; diff --git a/components/test_coprocessor/src/dag.rs b/components/test_coprocessor/src/dag.rs index 38476f694f5..4165d19bdb4 100644 --- a/components/test_coprocessor/src/dag.rs +++ b/components/test_coprocessor/src/dag.rs @@ -112,7 +112,8 @@ impl DAGSelect { col_expr.mut_val().encode_i64(col_offset).unwrap(); let mut expr = Expr::default(); let mut expr_ft = col.as_field_type(); - // Avg will contains two auxiliary columns (sum, count) and the sum should be a `Decimal` + // Avg will contains two auxiliary columns (sum, count) and the sum should be a + // `Decimal` if aggr_t == ExprType::Avg || aggr_t == ExprType::Sum { expr_ft.set_tp(0xf6); // FieldTypeTp::NewDecimal } diff --git a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index c7feacedbfe..55a7f72a07f 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -122,7 +122,8 @@ pub fn init_data_with_commit( init_data_with_engine_and_commit(Context::default(), engine, tbl, vals, commit) } -// This function will create a Product table and initialize with the specified data. +// This function will create a Product table and initialize with the specified +// data. pub fn init_with_data( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], diff --git a/components/test_coprocessor/src/table.rs b/components/test_coprocessor/src/table.rs index 91910d4c2bf..af070f62759 100644 --- a/components/test_coprocessor/src/table.rs +++ b/components/test_coprocessor/src/table.rs @@ -88,7 +88,8 @@ impl Table { range } - /// Create a `KeyRange` which select records in the range. The end_handle_id is included. 
+ /// Create a `KeyRange` which select records in the range. The end_handle_id + /// is included. pub fn get_record_range(&self, start_handle_id: i64, end_handle_id: i64) -> KeyRange { let mut range = KeyRange::default(); range.set_start(table::encode_row_key(self.id, start_handle_id)); @@ -103,7 +104,8 @@ impl Table { self.get_record_range(handle_id, handle_id) } - /// Create a `KeyRange` which select all index records of a specified index in current table. + /// Create a `KeyRange` which select all index records of a specified index + /// in current table. pub fn get_index_range_all(&self, idx: i64) -> KeyRange { let mut range = KeyRange::default(); let mut buf = Vec::with_capacity(8); diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 0359952d237..301647bf267 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -180,7 +180,8 @@ impl Cluster { pd_client: Arc, api_version: ApiVersion, ) -> Cluster { - // TODO: In the future, maybe it's better to test both case where `use_delete_range` is true and false + // TODO: In the future, maybe it's better to test both case where + // `use_delete_range` is true and false Cluster { cfg: Config { tikv: new_tikv_config_with_api_ver(id, api_version), @@ -221,11 +222,12 @@ impl Cluster { Ok(()) } - /// Engines in a just created cluster are not bootstraped, which means they are not associated - /// with a `node_id`. Call `Cluster::start` can bootstrap all nodes in the cluster. + /// Engines in a just created cluster are not bootstrapped, which means they + /// are not associated with a `node_id`. Call `Cluster::start` can bootstrap + /// all nodes in the cluster. /// - /// However sometimes a node can be bootstrapped externally. This function can be called to - /// mark them as bootstrapped in `Cluster`. + /// However sometimes a node can be bootstrapped externally. 
This function + /// can be called to mark them as bootstrapped in `Cluster`. pub fn set_bootstrapped(&mut self, node_id: u64, offset: usize) { let engines = self.dbs[offset].clone(); let key_mgr = self.key_managers[offset].clone(); @@ -248,7 +250,7 @@ impl Cluster { self.cfg .storage .io_rate_limit - .build(true /*enable_statistics*/), + .build(true /* enable_statistics */), )); for _ in 0..self.count { self.create_engine(None); @@ -304,7 +306,7 @@ impl Cluster { pub fn flush_data(&self) { for engine in self.engines.values() { let db = &engine.kv; - db.flush_cf(CF_DEFAULT, true /*sync*/).unwrap(); + db.flush_cf(CF_DEFAULT, true /* sync */).unwrap(); } } @@ -605,9 +607,9 @@ impl Cluster { assert_eq!(self.pd_client.get_regions_number() as u32, len) } - // For test when a node is already bootstraped the cluster with the first region - // But another node may request bootstrap at same time and get is_bootstrap false - // Add Region but not set bootstrap to true + // For test when a node is already bootstrapped the cluster with the first + // region But another node may request bootstrap at same time and get + // is_bootstrap false Add Region but not set bootstrap to true pub fn add_first_region(&self) -> Result<()> { let mut region = metapb::Region::default(); let region_id = self.pd_client.alloc_id().unwrap(); @@ -1347,8 +1349,8 @@ impl Cluster { } } - // It's similar to `ask_split`, the difference is the msg, it sends, is `Msg::SplitRegion`, - // and `region` will not be embedded to that msg. + // It's similar to `ask_split`, the difference is the msg, it sends, is + // `Msg::SplitRegion`, and `region` will not be embedded to that msg. // Caller must ensure that the `split_key` is in the `region`. 
pub fn split_region( &mut self, diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 27cbd367ba7..ac3e3a6cc6e 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -187,8 +187,8 @@ impl NodeCluster { .unwrap() } - // Set a function that will be invoked after creating each CoprocessorHost. The first argument - // of `op` is the node_id. + // Set a function that will be invoked after creating each CoprocessorHost. The + // first argument of `op` is the node_id. // Set this before invoking `run_node`. #[allow(clippy::type_complexity)] pub fn post_create_coprocessor_host( diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index 66823a29708..45a69896296 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -410,9 +410,9 @@ impl PdCluster { fn put_store(&mut self, store: metapb::Store) -> Result<()> { let store_id = store.get_id(); - // There is a race between put_store and handle_region_heartbeat_response. If store id is - // 0, it means it's a placeholder created by latter, we just need to update the meta. - // Otherwise we should overwrite it. + // There is a race between put_store and handle_region_heartbeat_response. If + // store id is 0, it means it's a placeholder created by latter, we just need to + // update the meta. Otherwise we should overwrite it. if self .stores .get(&store_id) @@ -538,8 +538,8 @@ impl PdCluster { && incoming_epoch.get_conf_ver() == 0; let overlaps = self.get_overlap(start_key, end_key); if created_by_unsafe_recovery { - // Allow recreated region by unsafe recover to overwrite other regions with a "older" - // epoch. + // Allow recreated region by unsafe recover to overwrite other regions with a + // "older" epoch. 
return Ok(overlaps); } for r in overlaps.iter() { @@ -1318,7 +1318,8 @@ impl TestPdClient { self.cluster.wl().check_merge_target_integrity = false; } - /// The next generated TSO will be `ts + 1`. See `get_tso()` and `batch_get_tso()`. + /// The next generated TSO will be `ts + 1`. See `get_tso()` and + /// `batch_get_tso()`. pub fn set_tso(&self, ts: TimeStamp) { let old = self.tso.swap(ts.into_inner(), Ordering::SeqCst); if old > ts.into_inner() { diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 5d85fff86bc..e22b730151a 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -167,7 +167,8 @@ impl ServerCluster { ); let security_mgr = Arc::new(SecurityManager::new(&Default::default()).unwrap()); let map = AddressMap::default(); - // We don't actually need to handle snapshot message, just create a dead worker to make it compile. + // We don't actually need to handle snapshot message, just create a dead worker + // to make it compile. let worker = LazyWorker::new("snap-worker"); let conn_builder = ConnectionBuilder::new( env.clone(), diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 9ebba64aa48..e8fba33f65f 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -507,10 +507,11 @@ impl Filter for SnapshotFilter { } } -/// `CollectSnapshotFilter` is a simulation transport filter to simulate the simultaneous delivery -/// of multiple snapshots from different peers. It collects the snapshots from different -/// peers and drop the subsequent snapshots from the same peers. Currently, if there are -/// more than 1 snapshots in this filter, all the snapshots will be dilivered at once. +/// `CollectSnapshotFilter` is a simulation transport filter to simulate the +/// simultaneous delivery of multiple snapshots from different peers. 
It +/// collects the snapshots from different peers and drop the subsequent +/// snapshots from the same peers. Currently, if there are more than 1 snapshots +/// in this filter, all the snapshots will be delivered at once. pub struct CollectSnapshotFilter { dropped: AtomicBool, stale: AtomicBool, @@ -753,10 +754,11 @@ impl Filter for LeadingDuplicatedSnapshotFilter { } } -/// `RandomLatencyFilter` is a transport filter to simulate randomized network latency. -/// Based on a randomized rate, `RandomLatencyFilter` will decide whether to delay -/// the sending of any message. It's could be used to simulate the message sending -/// in a network with random latency, where messages could be delayed, disordered or lost. +/// `RandomLatencyFilter` is a transport filter to simulate randomized network +/// latency. Based on a randomized rate, `RandomLatencyFilter` will decide +/// whether to delay the sending of any message. It's could be used to simulate +/// the message sending in a network with random latency, where messages could +/// be delayed, disordered or lost. pub struct RandomLatencyFilter { delay_rate: u32, delayed_msgs: Mutex>, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index bdd7c08b7e8..c399b4813f2 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -724,8 +724,9 @@ pub fn configure_for_lease_read( // Adjust max leader lease. cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(election_timeout - base_tick_interval); - // Use large peer check interval, abnormal and max leader missing duration to make a valid config, - // that is election timeout x 2 < peer stale state check < abnormal < max leader missing duration. + // Use large peer check interval, abnormal and max leader missing duration to + // make a valid config, that is election timeout x 2 < peer stale state + // check < abnormal < max leader missing duration. 
cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration(election_timeout * 4); @@ -1169,7 +1170,8 @@ pub fn check_compacted( compact_count: u64, must_compacted: bool, ) -> bool { - // Every peer must have compacted logs, so the truncate log state index/term must > than before. + // Every peer must have compacted logs, so the truncate log state index/term + // must > than before. let mut compacted_idx = HashMap::default(); for (&id, engines) in all_engines { diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index dc053bd6d20..d2096e74c82 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -32,9 +32,9 @@ pub use crate::{ }; pub fn setup_for_ci() { - // We use backtrace in tests to record suspicious problems. And loading backtrace - // the first time can take several seconds. Spawning a thread and load it ahead - // of time to avoid causing timeout. + // We use backtrace in tests to record suspicious problems. And loading + // backtrace the first time can take several seconds. Spawning a thread and + // load it ahead of time to avoid causing timeout. thread::Builder::new() .name(tikv_util::thd_name!("backtrace-loader")) .spawn_wrapper(::backtrace::Backtrace::new) diff --git a/components/test_util/src/runner.rs b/components/test_util/src/runner.rs index e7ef1ba0cb5..d05f7e98879 100644 --- a/components/test_util/src/runner.rs +++ b/components/test_util/src/runner.rs @@ -99,9 +99,9 @@ impl TestHook for FailpointHook { } } -/// During panic, due to drop order, failpoints will not be cleared before tests exit. -/// If tests wait for a sleep failpoint, the whole tests will hang. So we need a method -/// to clear failpoints explicitly besides teardown. +/// During panic, due to drop order, failpoints will not be cleared before tests +/// exit. 
If tests wait for a sleep failpoint, the whole tests will hang. So we +/// need a method to clear failpoints explicitly besides teardown. pub fn clear_failpoints() { FS.with(|s| s.borrow_mut().take()); } diff --git a/components/tidb_query_aggr/src/impl_avg.rs b/components/tidb_query_aggr/src/impl_avg.rs index ec4784b24e4..6337c8de6c5 100644 --- a/components/tidb_query_aggr/src/impl_avg.rs +++ b/components/tidb_query_aggr/src/impl_avg.rs @@ -73,7 +73,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserAvg { /// The AVG aggregate function. /// -/// Note that there are `AVG(Decimal) -> (Int, Decimal)` and `AVG(Double) -> (Int, Double)`. +/// Note that there are `AVG(Decimal) -> (Int, Decimal)` and `AVG(Double) -> +/// (Int, Double)`. #[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnStateAvg::::new())] pub struct AggrFnAvg diff --git a/components/tidb_query_aggr/src/impl_count.rs b/components/tidb_query_aggr/src/impl_count.rs index 0e17f1adfb6..3d49d8b25af 100644 --- a/components/tidb_query_aggr/src/impl_count.rs +++ b/components/tidb_query_aggr/src/impl_count.rs @@ -111,9 +111,10 @@ impl AggrFnStateCount { } } -// Here we manually implement `AggrFunctionStateUpdatePartial` so that `update_repeat` and -// `update_vector` can be faster. Also note that we support all kind of -// `AggrFunctionStateUpdatePartial` for the COUNT aggregate function. +// Here we manually implement `AggrFunctionStateUpdatePartial` so that +// `update_repeat` and `update_vector` can be faster. Also note that we support +// all kind of `AggrFunctionStateUpdatePartial` for the COUNT aggregate +// function. 
impl super::AggrFunctionStateUpdatePartial for AggrFnStateCount where diff --git a/components/tidb_query_aggr/src/impl_first.rs b/components/tidb_query_aggr/src/impl_first.rs index f01546cc5ef..b7ccd077598 100644 --- a/components/tidb_query_aggr/src/impl_first.rs +++ b/components/tidb_query_aggr/src/impl_first.rs @@ -155,19 +155,22 @@ where } } -// Here we manually implement `AggrFunctionStateUpdatePartial` instead of implementing -// `ConcreteAggrFunctionState` so that `update_repeat` and `update_vector` can be faster. +// Here we manually implement `AggrFunctionStateUpdatePartial` instead of +// implementing `ConcreteAggrFunctionState` so that `update_repeat` and +// `update_vector` can be faster. impl super::AggrFunctionStateUpdatePartial for AggrFnStateFirst where T: EvaluableRef<'static> + 'static, VectorValue: VectorValueExt, { - // ChunkedType has been implemented in AggrFunctionStateUpdatePartial for AggrFnStateFirst + // ChunkedType has been implemented in AggrFunctionStateUpdatePartial for + // AggrFnStateFirst impl_state_update_partial! { T } } -// In order to make `AggrFnStateFirst` satisfy the `AggrFunctionState` trait, we default impl all -// `AggrFunctionStateUpdatePartial` of `Evaluable` for all `AggrFnStateFirst`. +// In order to make `AggrFnStateFirst` satisfy the `AggrFunctionState` trait, we +// default impl all `AggrFunctionStateUpdatePartial` of `Evaluable` for all +// `AggrFnStateFirst`. impl_unmatched_function_state! { AggrFnStateFirst } impl super::AggrFunctionState for AggrFnStateFirst diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index 49eb4d911b8..31ff6acc8aa 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -242,9 +242,9 @@ where /// # Notes /// - /// For MAX(), MySQL currently compares ENUM and SET columns by their string value rather - /// than by the string's relative position in the set. 
This differs from how ORDER BY - /// compares them. + /// For MAX(), MySQL currently compares ENUM and SET columns by their string + /// value rather than by the string's relative position in the set. This + /// differs from how ORDER BY compares them. /// /// ref: https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_max #[inline] @@ -331,9 +331,9 @@ where /// # Notes /// - /// For MAX(), MySQL currently compares ENUM and SET columns by their string value rather - /// than by the string's relative position in the set. This differs from how ORDER BY - /// compares them. + /// For MAX(), MySQL currently compares ENUM and SET columns by their string + /// value rather than by the string's relative position in the set. This + /// differs from how ORDER BY compares them. /// /// ref: https://dev.mysql.com/doc/refman/5.7/en/aggregate-functions.html#function_max #[inline] diff --git a/components/tidb_query_aggr/src/impl_sum.rs b/components/tidb_query_aggr/src/impl_sum.rs index 5b0e8334e86..85f31b8f459 100644 --- a/components/tidb_query_aggr/src/impl_sum.rs +++ b/components/tidb_query_aggr/src/impl_sum.rs @@ -52,7 +52,8 @@ impl super::parser::AggrDefinitionParser for AggrFnDefinitionParserSum { out_schema.push(out_ft); out_exp.push(exp); - // Choose a type-aware SUM implementation based on the eval type after rewriting exp. + // Choose a type-aware SUM implementation based on the eval type after rewriting + // exp. Ok(match rewritten_eval_type { EvalType::Decimal => Box::new(AggrFnSum::::new()), EvalType::Real => Box::new(AggrFnSum::::new()), @@ -190,8 +191,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() that expect a numeric argument cast the argument to a - /// number if necessary. For ENUM values, the index number is used in the calculation. + /// Functions such as SUM() or AVG() that expect a numeric argument cast the + /// argument to a number if necessary. For ENUM values, the index number is + /// used in the calculation. 
/// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] @@ -266,8 +268,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() that expect a numeric argument cast the argument to a - /// number if necessary. For ENUM values, the index number is used in the calculation. + /// Functions such as SUM() or AVG() that expect a numeric argument cast the + /// argument to a number if necessary. For ENUM values, the index number is + /// used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] diff --git a/components/tidb_query_aggr/src/impl_variance.rs b/components/tidb_query_aggr/src/impl_variance.rs index f5b7fcc3bc8..190446c3809 100644 --- a/components/tidb_query_aggr/src/impl_variance.rs +++ b/components/tidb_query_aggr/src/impl_variance.rs @@ -80,7 +80,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari let out_ft = root_expr.take_field_type(); let out_et = box_try!(EvalType::try_from(out_ft.as_accessor().tp())); - // Rewrite expression to insert CAST() if needed. The rewrite should always succeed. + // Rewrite expression to insert CAST() if needed. The rewrite should always + // succeed. super::util::rewrite_exp_for_sum_avg(src_schema, &mut exp).unwrap(); let rewritten_eval_type = @@ -103,7 +104,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari out_schema.push(out_ft); out_exp.push(exp); - // Choose a type-aware VARIANCE implementation based on the eval type after rewriting exp. + // Choose a type-aware VARIANCE implementation based on the eval type after + // rewriting exp. Ok(match rewritten_eval_type { EvalType::Decimal => Box::new(AggrFnVariance::::new()), EvalType::Real => Box::new(AggrFnVariance::::new()), @@ -117,7 +119,8 @@ impl super::AggrDefinitionParser for AggrFnDefinitionParserVari /// The VARIANCE aggregate function. /// -/// Note that there are `VARIANCE(Decimal) -> Decimal` and `VARIANCE(Double) -> Double`. 
+/// Note that there are `VARIANCE(Decimal) -> Decimal` and `VARIANCE(Double) -> +/// Double`. #[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnStateVariance::::new())] pub struct AggrFnVariance @@ -276,9 +279,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric argument cast the - /// argument to a number if necessary. For ENUM values, the index number is used in the - /// calculation. + /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric + /// argument cast the argument to a number if necessary. For ENUM values, + /// the index number is used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] @@ -387,9 +390,9 @@ where /// # Notes /// - /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric argument cast the - /// argument to a number if necessary. For ENUM values, the index number is used in the - /// calculation. + /// Functions such as SUM() or AVG() or VARIANCE() that expect a numeric + /// argument cast the argument to a number if necessary. For ENUM values, + /// the index number is used in the calculation. /// /// ref: https://dev.mysql.com/doc/refman/8.0/en/enum.html #[inline] diff --git a/components/tidb_query_aggr/src/lib.rs b/components/tidb_query_aggr/src/lib.rs index 65b2da55d03..b9d73b2773a 100644 --- a/components/tidb_query_aggr/src/lib.rs +++ b/components/tidb_query_aggr/src/lib.rs @@ -30,16 +30,18 @@ pub use self::parser::{AggrDefinitionParser, AllAggrDefinitionParser}; /// A trait for all single parameter aggregate functions. /// -/// Unlike ordinary function, aggregate function calculates a summary value over multiple rows. To -/// save memory, this functionality is provided via an incremental update model: +/// Unlike ordinary function, aggregate function calculates a summary value over +/// multiple rows. To save memory, this functionality is provided via an +/// incremental update model: /// -/// 1. 
Each aggregate function associates a state structure, storing partially computed aggregate -/// results. +/// - Each aggregate function associates a state structure, storing partially +/// computed aggregate results. /// -/// 2. The caller calls `update()` or `update_vector()` for each row to update the state. +/// - The caller calls `update()` or `update_vector()` for each row to update +/// the state. /// -/// 3. The caller finally calls `push_result()` to aggregate a summary value and push it into the -/// given data container. +/// - The caller finally calls `push_result()` to aggregate a summary value and +/// push it into the given data container. /// /// This trait can be auto derived by using `tidb_query_codegen::AggrFunction`. pub trait AggrFunction: std::fmt::Debug + Send + 'static { @@ -52,13 +54,15 @@ pub trait AggrFunction: std::fmt::Debug + Send + 'static { /// A trait for all single parameter aggregate function states. /// -/// Aggregate function states are created by corresponding aggregate functions. For each state, -/// it can be updated or aggregated (to finalize a result) independently. +/// Aggregate function states are created by corresponding aggregate functions. +/// For each state, it can be updated or aggregated (to finalize a result) +/// independently. /// -/// Note that aggregate function states are strongly typed, that is, the caller must provide the -/// parameter in the correct data type for an aggregate function states that calculates over this -/// data type. To be safely boxed and placed in a vector, interfaces are provided in a form that -/// accept all kinds of data type. However, unmatched types will result in panics in runtime. +/// Note that aggregate function states are strongly typed, that is, the caller +/// must provide the parameter in the correct data type for an aggregate +/// function states that calculates over this data type. 
To be safely boxed and +/// placed in a vector, interfaces are provided in a form that accept all kinds +/// of data type. However, unmatched types will result in panics in runtime. pub trait AggrFunctionState: std::fmt::Debug + Send @@ -73,17 +77,19 @@ pub trait AggrFunctionState: + AggrFunctionStateUpdatePartial> + AggrFunctionStateUpdatePartial> { - // TODO: A better implementation is to specialize different push result targets. However - // current aggregation executor cannot utilize it. + // TODO: A better implementation is to specialize different push result targets. + // However current aggregation executor cannot utilize it. fn push_result(&self, ctx: &mut EvalContext, target: &mut [VectorValue]) -> Result<()>; } -/// A helper trait for single parameter aggregate function states that only work over concrete eval -/// types. This is the actual and only trait that normal aggregate function states will implement. +/// A helper trait for single parameter aggregate function states that only work +/// over concrete eval types. This is the actual and only trait that normal +/// aggregate function states will implement. /// -/// Unlike `AggrFunctionState`, this trait only provides specialized `update()` and `push_result()` -/// functions according to the associated type. `update()` and `push_result()` functions that accept -/// any eval types (but will panic when eval type does not match expectation) will be generated via +/// Unlike `AggrFunctionState`, this trait only provides specialized `update()` +/// and `push_result()` functions according to the associated type. `update()` +/// and `push_result()` functions that accept any eval types (but will panic +/// when eval type does not match expectation) will be generated via /// implementations over this trait. 
pub trait ConcreteAggrFunctionState: std::fmt::Debug + Send + 'static { type ParameterType: EvaluableRef<'static>; @@ -102,14 +108,14 @@ pub trait ConcreteAggrFunctionState: std::fmt::Debug + Send + 'static { #[macro_export] macro_rules! update_concrete { - ( $state:expr, $ctx:expr, $value:expr ) => { + ($state:expr, $ctx:expr, $value:expr) => { unsafe { $state.update_concrete_unsafe($ctx, $value.unsafe_into()) } }; } #[macro_export] macro_rules! update_vector { - ( $state:expr, $ctx:expr, $physical_values:expr, $logical_rows:expr ) => { + ($state:expr, $ctx:expr, $physical_values:expr, $logical_rows:expr) => { unsafe { $state.update_vector_unsafe( $ctx, @@ -123,21 +129,21 @@ macro_rules! update_vector { #[macro_export] macro_rules! update_repeat { - ( $state:expr, $ctx:expr, $value:expr, $repeat_times:expr ) => { + ($state:expr, $ctx:expr, $value:expr, $repeat_times:expr) => { unsafe { $state.update_repeat_unsafe($ctx, $value.unsafe_into(), $repeat_times) } }; } #[macro_export] macro_rules! update { - ( $state:expr, $ctx:expr, $value:expr ) => { + ($state:expr, $ctx:expr, $value:expr) => { unsafe { $state.update_unsafe($ctx, $value.unsafe_into()) } }; } #[macro_export] macro_rules! impl_state_update_partial { - ( $ty:tt ) => { + ($ty:tt) => { #[inline] unsafe fn update_unsafe( &mut self, @@ -172,7 +178,7 @@ macro_rules! impl_state_update_partial { #[macro_export] macro_rules! impl_concrete_state { - ( $ty:ty ) => { + ($ty:ty) => { #[inline] unsafe fn update_concrete_unsafe( &mut self, @@ -186,7 +192,7 @@ macro_rules! impl_concrete_state { #[macro_export] macro_rules! impl_unmatched_function_state { - ( $ty:ty ) => { + ($ty:ty) => { impl super::AggrFunctionStateUpdatePartial for $ty where T1: EvaluableRef<'static> + 'static, @@ -226,15 +232,15 @@ macro_rules! impl_unmatched_function_state { }; } -/// A helper trait that provides `update()` and `update_vector()` over a concrete type, which will -/// be relied in `AggrFunctionState`. 
+/// A helper trait that provides `update()` and `update_vector()` over a +/// concrete type, which will be relied in `AggrFunctionState`. pub trait AggrFunctionStateUpdatePartial> { /// Updates the internal state giving one row data. /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -245,8 +251,8 @@ pub trait AggrFunctionStateUpdatePartial> { /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -262,8 +268,8 @@ pub trait AggrFunctionStateUpdatePartial> { /// /// # Panics /// - /// Panics if the aggregate function does not support the supplied concrete data type as its - /// parameter. + /// Panics if the aggregate function does not support the supplied concrete + /// data type as its parameter. /// /// # Safety /// @@ -281,8 +287,9 @@ impl, State> AggrFunctionStateUpdatePartial for Stat where State: ConcreteAggrFunctionState, { - // All `ConcreteAggrFunctionState` implement `AggrFunctionStateUpdatePartial`, which is - // one of the trait bound that `AggrFunctionState` requires. + // All `ConcreteAggrFunctionState` implement + // `AggrFunctionStateUpdatePartial`, which is one of the trait bound that + // `AggrFunctionState` requires. 
#[inline] default unsafe fn update_unsafe( diff --git a/components/tidb_query_aggr/src/parser.rs b/components/tidb_query_aggr/src/parser.rs index 5cbc19961d8..600326edb2f 100644 --- a/components/tidb_query_aggr/src/parser.rs +++ b/components/tidb_query_aggr/src/parser.rs @@ -9,26 +9,29 @@ use crate::{impl_bit_op::*, impl_max_min::*, impl_variance::*, AggrFunction}; /// Parse a specific aggregate function definition from protobuf. /// -/// All aggregate function implementations should include an impl for this trait as well as -/// add a match arm in `map_pb_sig_to_aggr_func_parser` so that the aggregate function can be -/// actually utilized. +/// All aggregate function implementations should include an impl for this trait +/// as well as add a match arm in `map_pb_sig_to_aggr_func_parser` so that the +/// aggregate function can be actually utilized. pub trait AggrDefinitionParser { - /// Checks whether the inner expression of the aggregate function definition is supported. - /// It is ensured that `aggr_def.tp` maps the current parser instance. + /// Checks whether the inner expression of the aggregate function definition + /// is supported. It is ensured that `aggr_def.tp` maps the current + /// parser instance. fn check_supported(&self, aggr_def: &Expr) -> Result<()>; /// Parses and transforms the aggregate function definition. /// - /// The schema of this aggregate function will be appended in `out_schema` and the final - /// RPN expression (maybe wrapped by some casting according to types) will be appended in - /// `out_exp`. + /// The schema of this aggregate function will be appended in `out_schema` + /// and the final RPN expression (maybe wrapped by some casting + /// according to types) will be appended in `out_exp`. /// - /// The parser may choose particular aggregate function implementation based on the data - /// type, so `schema` is also needed in case of data type depending on the column. 
+ /// The parser may choose particular aggregate function implementation based + /// on the data type, so `schema` is also needed in case of data type + /// depending on the column. /// /// # Panic /// - /// May panic if the aggregate function definition is not supported by this parser. + /// May panic if the aggregate function definition is not supported by this + /// parser. fn parse( &self, mut aggr_def: Expr, @@ -100,8 +103,8 @@ impl AggrDefinitionParser for AllAggrDefinitionParser { }) } - /// Parses and transforms the aggregate function definition to generate corresponding - /// `AggrFunction` instance. + /// Parses and transforms the aggregate function definition to generate + /// corresponding `AggrFunction` instance. /// /// # Panic /// diff --git a/components/tidb_query_aggr/src/util.rs b/components/tidb_query_aggr/src/util.rs index 0e9ae390cf1..c4ba7a05766 100644 --- a/components/tidb_query_aggr/src/util.rs +++ b/components/tidb_query_aggr/src/util.rs @@ -7,7 +7,8 @@ use tidb_query_datatype::{builder::FieldTypeBuilder, EvalType, FieldTypeAccessor use tidb_query_expr::{impl_cast::get_cast_fn_rpn_node, RpnExpression, RpnExpressionBuilder}; use tipb::{Expr, FieldType}; -/// Checks whether or not there is only one child and the child expression is supported. +/// Checks whether or not there is only one child and the child expression is +/// supported. pub fn check_aggr_exp_supported_one_child(aggr_def: &Expr) -> Result<()> { if aggr_def.get_children().len() != 1 { return Err(other_err!( @@ -23,7 +24,8 @@ pub fn check_aggr_exp_supported_one_child(aggr_def: &Expr) -> Result<()> { Ok(()) } -/// Rewrites the expression to insert necessary cast functions for SUM and AVG aggregate functions. +/// Rewrites the expression to insert necessary cast functions for SUM and AVG +/// aggregate functions. /// /// See `typeInfer4Sum` and `typeInfer4Avg` in TiDB. 
/// @@ -63,7 +65,8 @@ pub fn rewrite_exp_for_sum_avg(schema: &[FieldType], exp: &mut RpnExpression) -> Ok(()) } -/// Rewrites the expression to insert necessary cast functions for Bit operation family functions. +/// Rewrites the expression to insert necessary cast functions for Bit operation +/// family functions. pub fn rewrite_exp_for_bit_op(schema: &[FieldType], exp: &mut RpnExpression) -> Result<()> { let ret_field_type = exp.ret_field_type(schema); let ret_eval_type = box_try!(EvalType::try_from(ret_field_type.as_accessor().tp())); diff --git a/components/tidb_query_codegen/src/lib.rs b/components/tidb_query_codegen/src/lib.rs index baa9d8522ab..feee1c6afb3 100644 --- a/components/tidb_query_codegen/src/lib.rs +++ b/components/tidb_query_codegen/src/lib.rs @@ -8,8 +8,8 @@ //! //! This crate exports a custom derive for [`AggrFunction`](https://github.com/tikv/tikv/blob/master/components/tidb_query_aggr/src/mod.rs) //! and an attribute macro called `rpn_fn` for use on functions which provide -//! coprocessor functionality. `rpn_fn` is documented in the [rpn_function](rpn_function.rs) -//! module. +//! coprocessor functionality. `rpn_fn` is documented in the +//! [rpn_function](rpn_function.rs) module. #![feature(proc_macro_diagnostic)] #![feature(iter_order_by)] diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index 8025fc01588..864fce9afd8 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -16,13 +16,13 @@ //! ## Arguments to macro //! //! If neither `varg` or `raw_varg` are supplied, then the generated arguments -//! follow from the supplied function's arguments. Each argument must have a type -//! `Option<&T>` for some `T`. +//! follow from the supplied function's arguments. Each argument must have a +//! type `Option<&T>` for some `T`. //! //! ### `varg` //! -//! The RPN operator takes a variable number of arguments. 
The arguments are passed -//! as a `&[Option<&T>]`. E.g., +//! The RPN operator takes a variable number of arguments. The arguments are +//! passed as a `&[Option<&T>]`. E.g., //! //! ```ignore //! #[rpn_fn(varg)] @@ -33,8 +33,8 @@ //! //! ### `raw_varg` //! -//! The RPN operator takes a variable number of arguments. The arguments are passed -//! as a `&[ScalarValueRef]`. E.g., +//! The RPN operator takes a variable number of arguments. The arguments are +//! passed as a `&[ScalarValueRef]`. E.g., //! //! ```ignore //! #[rpn_fn(raw_varg)] @@ -43,8 +43,8 @@ //! } //! ``` //! -//! Use `raw_varg` where the function takes a variable number of arguments and the types -//! are not the same, for example, RPN function `case_when`. +//! Use `raw_varg` where the function takes a variable number of arguments and +//! the types are not the same, for example, RPN function `case_when`. //! //! ### `max_args` //! @@ -61,34 +61,40 @@ //! ### `extra_validator` //! //! A function name for custom validation code to be run when an operation is -//! validated. The validator function should have the signature `&tipb::Expr -> Result<()>`. -//! E.g., `#[rpn_fn(raw_varg, extra_validator = json_object_validator)]` +//! validated. The validator function should have the signature `&tipb::Expr -> +//! Result<()>`. E.g., `#[rpn_fn(raw_varg, extra_validator = +//! json_object_validator)]` //! //! ### `metadata_type` //! //! The type of the metadata structure defined in tipb. -//! If `metadata_mapper` is not specified, the protobuf metadata structure will be used as the metadata directly. +//! If `metadata_mapper` is not specified, the protobuf metadata structure will +//! be used as the metadata directly. //! //! ### `metadata_mapper` //! -//! A function name to construct a new metadata or transform a protobuf metadata structure into a desired form. -//! The function signatures varies according to the existence of `metadata_mapper` and `metadata_type` as follows. +//! 
A function name to construct a new metadata or transform a protobuf metadata +//! structure into a desired form. The function signatures varies according to +//! the existence of `metadata_mapper` and `metadata_type` as follows. //! -//! - `metadata_mapper ` exists, `metadata_type` missing: `fn(&mut tipb::Expr) -> T` +//! - `metadata_mapper ` exists, `metadata_type` missing: `fn(&mut tipb::Expr) +//! -> T` //! //! Constructs a new metadata in type `T`. //! -//! - `metadata_mapper ` exists, `metadata_type` exists: `fn(MetaDataType, &mut tipb::Expr) -> T` +//! - `metadata_mapper ` exists, `metadata_type` exists: `fn(MetaDataType, &mut +//! tipb::Expr) -> T` //! -//! Transforms a protobuf metadata type `MetaDataType` specified by `metadata_type` into a new type `T`. +//! Transforms a protobuf metadata type `MetaDataType` specified by +//! `metadata_type` into a new type `T`. //! //! ### `capture` //! //! An array of argument names which are passed from the caller to the expanded -//! function. The argument names must be in scope in the generated `eval` or `run` -//! methods. Currently, that includes the following arguments (the supplied -//! function must accept these arguments with the corresponding types, in -//! addition to any other arguments): +//! function. The argument names must be in scope in the generated `eval` or +//! `run` methods. Currently, that includes the following arguments (the +//! supplied function must accept these arguments with the corresponding types, +//! in addition to any other arguments): //! //! * `ctx: &mut expr::EvalContext` //! * `output_rows: usize` @@ -111,35 +117,42 @@ //! This includes `varg` and `raw_varg`. //! //! The supplied function is preserved and a constructor function is generated -//! with a `_fn_meta` suffix, e.g., `#[rpn_fn] fn foo ...` will preserve `foo` and -//! generate `foo_fn_meta`. The constructor function returns an `rpn_expr::RpnFnMeta` -//! value. +//! 
with a `_fn_meta` suffix, e.g., `#[rpn_fn] fn foo ...` will preserve `foo` +//! and generate `foo_fn_meta`. The constructor function returns an +//! `rpn_expr::RpnFnMeta` value. //! -//! The constructor function will include code for validating the runtime arguments -//! and running the function, pointers to these functions are stored in the result. +//! The constructor function will include code for validating the runtime +//! arguments and running the function, pointers to these functions are stored +//! in the result. //! //! ### Non-vararg functions //! -//! Generate the following (examples assume a supplied function called `foo_bar`: +//! Generate the following (examples assume a supplied function called +//! `foo_bar`: //! -//! * A trait to represent the function (`FooBar_Fn`) with a single function `eval`. +//! * A trait to represent the function (`FooBar_Fn`) with a single function +//! `eval`. //! - An impl of that trait for all argument types which panics -//! - An impl of that trait for the supported argument type which calls the supplied function. -//! * An evaluator struct (`FooBar_Evaluator`) which implements `rpn_expr::function::Evaluator`, -//! which includes an `eval` method which dispatches to `FooBar_Fn::eval`. +//! - An impl of that trait for the supported argument type which calls the +//! supplied function. +//! * An evaluator struct (`FooBar_Evaluator`) which implements +//! `rpn_expr::function::Evaluator`, which includes an `eval` method which +//! dispatches to `FooBar_Fn::eval`. //! * A constructor function similar to the vararg case. //! //! The supplied function is preserved. //! -//! The supported argument type is represented as a type-level list, for example, a -//! a function which takes two unsigned ints has an argument representation -//! something like `Arg>`. See documentation in -//! `components/tidb_query_expr/src/types/function.rs` for more details. +//! The supported argument type is represented as a type-level list, for +//! 
example, a a function which takes two unsigned ints has an argument +//! representation something like `Arg>`. See +//! documentation in `components/tidb_query_expr/src/types/function.rs` for more +//! details. //! -//! The `_Fn` trait can be customised by implementing it manually. -//! For example, you are going to implement an RPN function called `regex_match` taking two -//! arguments, the regex and the string to match. You want to build the regex only once if the -//! first argument is a scalar. The code may look like: +//! The `_Fn` trait can be customized by implementing it manually. +//! For example, you are going to implement an RPN function called `regex_match` +//! taking two arguments, the regex and the string to match. You want to build +//! the regex only once if the first argument is a scalar. The code may look +//! like: //! //! ```ignore //! fn regex_match_impl(regex: &Regex, text: Option<&Bytes>) -> Result> { @@ -175,8 +188,9 @@ //! } //! ``` //! -//! If the RPN function accepts variable number of arguments and all arguments have the same eval -//! type, like RPN function `coalesce`, you can use `#[rpn_fn(varg)]` like: +//! If the RPN function accepts variable number of arguments and all arguments +//! have the same eval type, like RPN function `coalesce`, you can use +//! `#[rpn_fn(varg)]` like: //! //! ```ignore //! #[rpn_fn(varg)] @@ -220,10 +234,12 @@ mod kw { /// Parses an attribute like `#[rpn_fn(varg, capture = [ctx, output_rows])`. #[derive(Debug)] struct RpnFnAttr { - /// Whether or not the function is a varg function. Varg function accepts `&[&Option]`. + /// Whether or not the function is a varg function. Varg function accepts + /// `&[&Option]`. is_varg: bool, - /// Whether or not the function is a raw varg function. Raw varg function accepts `&[ScalarValueRef]`. + /// Whether or not the function is a raw varg function. Raw varg function + /// accepts `&[ScalarValueRef]`. 
is_raw_varg: bool, /// Whether or not the function needs extra logic on `None` value. @@ -234,8 +250,9 @@ struct RpnFnAttr { /// The maximum accepted arguments, which will be checked by the validator. /// - /// Only varg or raw_varg function accepts a range of number of arguments. Other kind of - /// function strictly stipulates number of arguments according to the function definition. + /// Only varg or raw_varg function accepts a range of number of arguments. + /// Other kind of function strictly stipulates number of arguments + /// according to the function definition. max_args: Option, /// The minimal accepted arguments, which will be checked by the validator. @@ -411,7 +428,8 @@ impl parse::Parse for RpnFnAttr { } } -/// Parses an evaluable type like `Option<&T>`, `Option`, `Option`, `Option` or `Option`. +/// Parses an evaluable type like `Option<&T>`, `Option`, +/// `Option`, `Option` or `Option`. struct RpnFnRefEvaluableTypeWithOption(RpnFnRefEvaluableType); impl parse::Parse for RpnFnRefEvaluableTypeWithOption { @@ -504,8 +522,8 @@ impl parse::Parse for RpnFnRefEvaluableType { } /// Parses a function signature parameter like `val: &Option` or `val: &T`. -/// If input has &Option, set has_option to true; otherwise, set has_option to false. -/// Caller can use has_option to check if input is valid. +/// If input has &Option, set has_option to true; otherwise, set has_option +/// to false. Caller can use has_option to check if input is valid. struct RpnFnSignatureParam { _pat: Pat, has_option: bool, @@ -531,9 +549,9 @@ impl parse::Parse for RpnFnSignatureParam { } } -/// Parses a function signature parameter like `val: &[&Option]` or `val: &[&T]`. -/// If input has &Option, set has_option to true; otherwise, set has_option to false. -/// Caller can use has_option to check if input is valid. +/// Parses a function signature parameter like `val: &[&Option]` or `val: +/// &[&T]`. 
If input has &Option, set has_option to true; otherwise, set +/// has_option to false. Caller can use has_option to check if input is valid. struct VargsRpnFnSignatureParam { _pat: Pat, has_option: bool, diff --git a/components/tidb_query_common/src/error.rs b/components/tidb_query_common/src/error.rs index 8697413f69c..046e2f02059 100644 --- a/components/tidb_query_common/src/error.rs +++ b/components/tidb_query_common/src/error.rs @@ -90,8 +90,9 @@ impl ErrorCodeExt for EvaluateError { #[error(transparent)] pub struct StorageError(#[from] pub anyhow::Error); -/// We want to restrict the type of errors to be either a `StorageError` or `EvaluateError`, thus -/// `failure::Error` is not used. Instead, we introduce our own error enum. +/// We want to restrict the type of errors to be either a `StorageError` or +/// `EvaluateError`, thus `failure::Error` is not used. Instead, we introduce +/// our own error enum. #[derive(Debug, Error)] pub enum ErrorInner { #[error("Storage error: {0}")] diff --git a/components/tidb_query_common/src/execute_stats.rs b/components/tidb_query_common/src/execute_stats.rs index 2318ad43e16..b2740212df0 100644 --- a/components/tidb_query_common/src/execute_stats.rs +++ b/components/tidb_query_common/src/execute_stats.rs @@ -76,7 +76,8 @@ impl ExecSummaryCollector for ExecSummaryCollectorEnabled { } } -/// A `ExecSummaryCollector` that does not collect anything. Acts like `collect = false`. +/// A `ExecSummaryCollector` that does not collect anything. Acts like `collect +/// = false`. pub struct ExecSummaryCollectorDisabled; impl ExecSummaryCollector for ExecSummaryCollectorDisabled { @@ -105,11 +106,11 @@ pub struct WithSummaryCollector { pub inner: T, } -/// Execution statistics to be flowed between parent and child executors at once during -/// `collect_exec_stats()` invocation. +/// Execution statistics to be flowed between parent and child executors at once +/// during `collect_exec_stats()` invocation. 
pub struct ExecuteStats { - /// The execution summary of each executor. If execution summary is not needed, it will - /// be zero sized. + /// The execution summary of each executor. If execution summary is not + /// needed, it will be zero sized. pub summary_per_executor: Vec, /// For each range given in the request, how many rows are scanned. @@ -119,8 +120,8 @@ pub struct ExecuteStats { impl ExecuteStats { /// Creates a new statistics instance. /// - /// If execution summary does not need to be collected, it is safe to pass 0 to the `executors` - /// argument, which will avoid one allocation. + /// If execution summary does not need to be collected, it is safe to pass 0 + /// to the `executors` argument, which will avoid one allocation. pub fn new(executors_len: usize) -> Self { Self { summary_per_executor: vec![ExecSummary::default(); executors_len], diff --git a/components/tidb_query_common/src/storage/mod.rs b/components/tidb_query_common/src/storage/mod.rs index 818b863d0a4..f8d9f37723d 100644 --- a/components/tidb_query_common/src/storage/mod.rs +++ b/components/tidb_query_common/src/storage/mod.rs @@ -11,8 +11,8 @@ pub type Result = std::result::Result; pub type OwnedKvPair = (Vec, Vec); -/// The abstract storage interface. The table scan and index scan executor relies on a `Storage` -/// implementation to provide source data. +/// The abstract storage interface. The table scan and index scan executor +/// relies on a `Storage` implementation to provide source data. pub trait Storage: Send { type Statistics; diff --git a/components/tidb_query_common/src/storage/ranges_iter.rs b/components/tidb_query_common/src/storage/ranges_iter.rs index 061cd339129..6f99249336b 100644 --- a/components/tidb_query_common/src/storage/ranges_iter.rs +++ b/components/tidb_query_common/src/storage/ranges_iter.rs @@ -7,12 +7,12 @@ pub enum IterStatus { /// All ranges are consumed. 
Drained, - /// Last range is drained or this iteration is a fresh start so that caller should scan - /// on a new range. + /// Last range is drained or this iteration is a fresh start so that caller + /// should scan on a new range. NewRange(Range), - /// Last interval range is not drained and the caller should continue scanning without changing - /// the scan range. + /// Last interval range is not drained and the caller should continue + /// scanning without changing the scan range. Continue, } @@ -23,13 +23,14 @@ pub enum IterStatus { /// - a flag indicating continuing last interval range /// - a flag indicating that all ranges are consumed /// -/// If a new range is returned, caller can then scan unknown amount of key(s) within this new range. -/// The caller must inform the structure so that it will emit a new range next time by calling -/// `notify_drained()` after current range is drained. Multiple `notify_drained()` without `next()` -/// will have no effect. +/// If a new range is returned, caller can then scan unknown amount of key(s) +/// within this new range. The caller must inform the structure so that it will +/// emit a new range next time by calling `notify_drained()` after current range +/// is drained. Multiple `notify_drained()` without `next()` will have no +/// effect. pub struct RangesIterator { - /// Whether or not we are processing a valid range. If we are not processing a range, or there - /// is no range any more, this field is `false`. + /// Whether or not we are processing a valid range. If we are not processing + /// a range, or there is no range any more, this field is `false`. 
in_range: bool, iter: std::vec::IntoIter, diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index 1c1a1cea111..851220307b9 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -5,8 +5,8 @@ use crate::error::StorageError; const KEY_BUFFER_CAPACITY: usize = 64; -/// A scanner that scans over multiple ranges. Each range can be a point range containing only -/// one row, or an interval range containing multiple rows. +/// A scanner that scans over multiple ranges. Each range can be a point range +/// containing only one row, or an interval range containing multiple rows. pub struct RangesScanner { storage: T, ranges_iter: RangesIterator, @@ -69,7 +69,8 @@ impl RangesScanner { } /// Fetches next row. - /// Note: `update_scanned_range` can control whether update the scanned range when `is_scanned_range_aware` is true. + /// Note: `update_scanned_range` can control whether update the scanned + /// range when `is_scanned_range_aware` is true. pub fn next_opt( &mut self, update_scanned_range: bool, @@ -119,14 +120,14 @@ impl RangesScanner { } } - /// Appends storage statistics collected so far to the given container and clears the - /// collected statistics. + /// Appends storage statistics collected so far to the given container and + /// clears the collected statistics. pub fn collect_storage_stats(&mut self, dest: &mut T::Statistics) { self.storage.collect_statistics(dest) } - /// Appends scanned rows of each range so far to the given container and clears the - /// collected statistics. + /// Appends scanned rows of each range so far to the given container and + /// clears the collected statistics. 
pub fn collect_scanned_rows_per_range(&mut self, dest: &mut Vec) { dest.append(&mut self.scanned_rows_per_range); self.scanned_rows_per_range.push(0); @@ -503,8 +504,8 @@ mod tests { assert_eq!(&r.upper_exclusive, b"foo_8"); // Multiple ranges - // TODO: caller should not pass in unordered ranges otherwise scanned ranges would be - // unsound. + // TODO: caller should not pass in unordered ranges otherwise scanned ranges + // would be unsound. let ranges = vec![ IntervalRange::from(("foo", "foo_3")).into(), IntervalRange::from(("foo_5", "foo_50")).into(), @@ -718,8 +719,8 @@ mod tests { assert_eq!(&r.upper_exclusive, b"foo_8"); // Multiple ranges - // TODO: caller should not pass in unordered ranges otherwise scanned ranges would be - // unsound. + // TODO: caller should not pass in unordered ranges otherwise scanned ranges + // would be unsound. let ranges = vec![ IntervalRange::from(("foo", "foo_3")).into(), IntervalRange::from(("foo_5", "foo_50")).into(), diff --git a/components/tidb_query_common/src/storage/test_fixture.rs b/components/tidb_query_common/src/storage/test_fixture.rs index a10726b5347..305bc5bf168 100644 --- a/components/tidb_query_common/src/storage/test_fixture.rs +++ b/components/tidb_query_common/src/storage/test_fixture.rs @@ -11,7 +11,8 @@ type ErrorBuilder = Box crate::error::StorageError>; type FixtureValue = std::result::Result, ErrorBuilder>; -/// A `Storage` implementation that returns fixed source data (i.e. fixture). Useful in tests. +/// A `Storage` implementation that returns fixed source data (i.e. fixture). +/// Useful in tests. #[derive(Clone)] pub struct FixtureStorage { data: Arc, FixtureValue>>, @@ -69,8 +70,8 @@ impl super::Storage for FixtureStorage { fn scan_next(&mut self) -> Result> { let value = if !self.is_backward_scan { - // During the call of this function, `data` must be valid and we are only returning - // data clones to outside, so this access is safe. 
+ // During the call of this function, `data` must be valid and we are only + // returning data clones to outside, so this access is safe. self.data_view_unsafe.as_mut().unwrap().next() } else { self.data_view_unsafe.as_mut().unwrap().next_back() diff --git a/components/tidb_query_common/src/util.rs b/components/tidb_query_common/src/util.rs index 9ee2a059073..9f9b60bf9f7 100644 --- a/components/tidb_query_common/src/util.rs +++ b/components/tidb_query_common/src/util.rs @@ -40,8 +40,8 @@ pub fn is_prefix_next(key: &[u8], next: &[u8]) -> bool { let mut carry_pos = len; loop { if carry_pos == 0 { - // All bytes of `key` are 255. `next` couldn't be `key`'s prefix_next since their - // lengths are equal. + // All bytes of `key` are 255. `next` couldn't be `key`'s prefix_next since + // their lengths are equal. return false; } @@ -71,8 +71,8 @@ pub fn is_prefix_next(key: &[u8], next: &[u8]) -> bool { && next[carry_pos + 1..].iter().all(|byte| *byte == 0) && key[..carry_pos] == next[..carry_pos] } else if len + 1 == next_len { - // `next` must has one more 0 than `key`, and the first `len` bytes must be all 255. - // The case that `len == 0` is also covered here. + // `next` must has one more 0 than `key`, and the first `len` bytes must be all + // 255. The case that `len == 0` is also covered here. *next.last().unwrap() == 0 && key.iter().all(|byte| *byte == 255) && next.iter().take(len).all(|byte| *byte == 255) diff --git a/components/tidb_query_datatype/src/codec/batch/lazy_column.rs b/components/tidb_query_datatype/src/codec/batch/lazy_column.rs index dcd6328ca18..11d290f9c31 100644 --- a/components/tidb_query_datatype/src/codec/batch/lazy_column.rs +++ b/components/tidb_query_datatype/src/codec/batch/lazy_column.rs @@ -16,13 +16,14 @@ use crate::{ match_template_evaltype, EvalType, FieldTypeAccessor, }; -/// A container stores an array of datums, which can be either raw (not decoded), or decoded into -/// the `VectorValue` type. 
+/// A container stores an array of datums, which can be either raw (not +/// decoded), or decoded into the `VectorValue` type. /// /// TODO: -/// Since currently the data format in response can be the same as in storage, we use this structure -/// to avoid unnecessary repeated serialization / deserialization. In future, Coprocessor will -/// respond all data in Chunk format which is different to the format in storage. At that time, +/// Since currently the data format in response can be the same as in storage, +/// we use this structure to avoid unnecessary repeated serialization / +/// deserialization. In future, Coprocessor will respond all data in Chunk +/// format which is different to the format in storage. At that time, /// this structure is no longer useful and should be removed. #[derive(Clone, Debug)] pub enum LazyBatchColumn { @@ -42,14 +43,16 @@ impl LazyBatchColumn { #[inline] pub fn raw_with_capacity(capacity: usize) -> Self { use codec::number::MAX_VARINT64_LENGTH; - // We assume that each element *may* has a size of MAX_VAR_INT_LEN + Datum Flag (1 byte). + // We assume that each element *may* has a size of MAX_VAR_INT_LEN + Datum Flag + // (1 byte). LazyBatchColumn::Raw(BufferVec::with_capacity( capacity, capacity * (MAX_VARINT64_LENGTH + 1), )) } - /// Creates a new `LazyBatchColumn::Decoded` with specified capacity and eval type. + /// Creates a new `LazyBatchColumn::Decoded` with specified capacity and + /// eval type. #[inline] pub fn decoded_with_capacity_and_tp(capacity: usize, eval_tp: EvalType) -> Self { LazyBatchColumn::Decoded(VectorValue::with_capacity(capacity, eval_tp)) @@ -150,14 +153,16 @@ impl LazyBatchColumn { } } - /// Decodes this column if the column is not decoded, according to the given logical rows map. - /// After decoding, the decoded column will have the same physical layout as the encoded one - /// (i.e. the same logical rows), but elements in unnecessary positions will not be decoded - /// and will be `None`. 
+ /// Decodes this column if the column is not decoded, according to the given + /// logical rows map. After decoding, the decoded column will have the same + /// physical layout as the encoded one (i.e. the same logical rows), but + /// elements in unnecessary positions will not be decoded and will be + /// `None`. /// - /// The field type is needed because we use the same `DateTime` structure when handling - /// Date, Time or Timestamp. - // TODO: Maybe it's a better idea to assign different eval types for different date types. + /// The field type is needed because we use the same `DateTime` structure + /// when handling Date, Time or Timestamp. + // TODO: Maybe it's a better idea to assign different eval types for different + // date types. pub fn ensure_decoded( &mut self, ctx: &mut EvalContext, @@ -358,7 +363,8 @@ mod tests { assert!(col.is_decoded()); assert_eq!(col.len(), 3); assert_eq!(col.capacity(), 3); - // Element 1 is None because it is not referred in `logical_rows` and we don't decode it. + // Element 1 is None because it is not referred in `logical_rows` and we don't + // decode it. assert_eq!(col.decoded().to_int_vec(), &[Some(32), None, Some(10)]); { @@ -370,7 +376,8 @@ mod tests { assert_eq!(col.decoded().to_int_vec(), &[Some(32), None, Some(10)]); } - // Decode a decoded column, even using a different logical rows, does not have effect. + // Decode a decoded column, even using a different logical rows, does not have + // effect. col.ensure_decoded( &mut ctx, &FieldTypeTp::Long.into(), @@ -435,7 +442,8 @@ mod benches { /// Bench performance of decoding a raw batch column. /// - /// Note that there is a clone in the bench suite, whose cost should be excluded. + /// Note that there is a clone in the bench suite, whose cost should be + /// excluded. #[bench] fn bench_lazy_batch_column_clone_and_decode(b: &mut test::Bencher) { use crate::{ @@ -471,7 +479,8 @@ mod benches { /// Bench performance of decoding a decoded lazy batch column. 
/// - /// Note that there is a clone in the bench suite, whose cost should be excluded. + /// Note that there is a clone in the bench suite, whose cost should be + /// excluded. #[bench] fn bench_lazy_batch_column_clone_and_decode_decoded(b: &mut test::Bencher) { use crate::{ diff --git a/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs b/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs index d4f7ea9044a..55a07e72ae7 100644 --- a/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs +++ b/components/tidb_query_datatype/src/codec/batch/lazy_column_vec.rs @@ -13,7 +13,8 @@ use crate::{ /// Stores multiple `LazyBatchColumn`s. Each column has an equal length. #[derive(Clone, Debug)] pub struct LazyBatchColumnVec { - /// Multiple lazy batch columns. Each column is either decoded, or not decoded. + /// Multiple lazy batch columns. Each column is either decoded, or not + /// decoded. /// /// For decoded columns, they may be in different types. If the column is in /// type `LazyBatchColumn::Raw`, it means that it is not decoded. @@ -37,9 +38,11 @@ impl From> for LazyBatchColumnVec { } impl LazyBatchColumnVec { - /// Creates a new empty `LazyBatchColumnVec`, which does not have columns and rows. + /// Creates a new empty `LazyBatchColumnVec`, which does not have columns + /// and rows. /// - /// Because column numbers won't change, it means constructed instance will be always empty. + /// Because column numbers won't change, it means constructed instance will + /// be always empty. #[inline] pub fn empty() -> Self { Self { @@ -47,7 +50,8 @@ impl LazyBatchColumnVec { } } - /// Creates a new empty `LazyBatchColumnVec` with the same number of columns and schema. + /// Creates a new empty `LazyBatchColumnVec` with the same number of columns + /// and schema. 
#[inline] #[must_use] pub fn clone_empty(&self, capacity: usize) -> Self { @@ -60,7 +64,8 @@ impl LazyBatchColumnVec { } } - /// Creates a new `LazyBatchColumnVec`, which contains `columns_count` number of raw columns. + /// Creates a new `LazyBatchColumnVec`, which contains `columns_count` + /// number of raw columns. #[cfg(test)] #[must_use] pub fn with_raw_columns(columns_count: usize) -> Self { @@ -160,8 +165,8 @@ impl LazyBatchColumnVec { Ok(()) } - /// Truncates columns into equal length. The new length of all columns would be the length of - /// the shortest column before calling this function. + /// Truncates columns into equal length. The new length of all columns would + /// be the length of the shortest column before calling this function. pub fn truncate_into_equal_length(&mut self) { let mut min_len = self.rows_len(); for col in &self.columns { @@ -184,8 +189,8 @@ impl LazyBatchColumnVec { } } -// Do not implement Deref, since we want to forbid some misleading function calls like -// `LazyBatchColumnVec.len()`. +// Do not implement Deref, since we want to forbid some misleading function +// calls like `LazyBatchColumnVec.len()`. impl Index for LazyBatchColumnVec { type Output = LazyBatchColumn; diff --git a/components/tidb_query_datatype/src/codec/chunk/chunk.rs b/components/tidb_query_datatype/src/codec/chunk/chunk.rs index 2cf1261f7dc..ee111d11f77 100644 --- a/components/tidb_query_datatype/src/codec/chunk/chunk.rs +++ b/components/tidb_query_datatype/src/codec/chunk/chunk.rs @@ -10,8 +10,9 @@ use super::{ use crate::{codec::Datum, FieldTypeAccessor}; /// `Chunk` stores multiple rows of data. -/// Values are appended in compact format and can be directly accessed without decoding. -/// When the chunk is done processing, we can reuse the allocated memory by resetting it. +/// Values are appended in compact format and can be directly accessed without +/// decoding. 
When the chunk is done processing, we can reuse the allocated +/// memory by resetting it. pub struct Chunk { columns: Vec, } @@ -32,7 +33,8 @@ impl Chunk { } /// Reset the chunk, so the memory it allocated can be reused. - /// Make sure all the data in the chunk is not used anymore before you reuse this chunk. + /// Make sure all the data in the chunk is not used anymore before you reuse + /// this chunk. pub fn reset(&mut self) { for column in &mut self.columns { column.reset(); diff --git a/components/tidb_query_datatype/src/codec/chunk/column.rs b/components/tidb_query_datatype/src/codec/chunk/column.rs index b8f7e4b9da6..f7f13363686 100644 --- a/components/tidb_query_datatype/src/codec/chunk/column.rs +++ b/components/tidb_query_datatype/src/codec/chunk/column.rs @@ -402,7 +402,8 @@ impl Column { self.null_cnt = 0; self.null_bitmap.clear(); if !self.var_offsets.is_empty() { - // The first offset is always 0, it makes slicing the data easier, we need to keep it. + // The first offset is always 0, it makes slicing the data easier, we need to + // keep it. self.var_offsets.truncate(1); } self.data.clear(); @@ -1006,7 +1007,7 @@ pub trait ChunkColumnEncoder: NumberEncoder { } // offsets if !col.is_fixed() { - //let length = (col.length+1)*4; + // let length = (col.length+1)*4; for v in &col.var_offsets { self.write_i64_le(*v as i64)?; } diff --git a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs index 9c2dd2497f1..31685ca08d5 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/gbk_collation.rs @@ -15,8 +15,8 @@ impl Collator for T { #[inline] fn char_weight(ch: char) -> Self::Weight { - // All GBK code point are in BMP, if the incoming character is not, convert it to '?'. - // This should not happened. 
+ // All GBK code point are in BMP, if the incoming character is not, convert it + // to '?'. This should not happened. let r = ch as usize; if r > 0xFFFF { return '?' as u16; @@ -71,7 +71,8 @@ impl GbkCollator for CollatorGbkBin { const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_BIN_TABLE; } -/// Collator for `gbk_chinese_ci` collation with padding behavior (trims right spaces). +/// Collator for `gbk_chinese_ci` collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorGbkChineseCi; @@ -80,10 +81,12 @@ impl GbkCollator for CollatorGbkChineseCi { const WEIGHT_TABLE: &'static [u8; (0xffff + 1) * 2] = GBK_CHINESE_CI_TABLE; } -// GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally the same with golang's GBK encoding. -// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. +// GBK_BIN_TABLE are the encoding tables from Unicode to GBK code, it is totally +// the same with golang's GBK encoding. If there is no mapping code in GBK, use +// 0x3F(?) instead. It should not happened. const GBK_BIN_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_bin.data"); // GBK_CHINESE_CI_TABLE are the sort key tables for GBK codepoint. -// If there is no mapping code in GBK, use 0x3F(?) instead. It should not happened. +// If there is no mapping code in GBK, use 0x3F(?) instead. It should not +// happened. 
const GBK_CHINESE_CI_TABLE: &[u8; (0xffff + 1) * 2] = include_bytes!("gbk_chinese_ci.data"); diff --git a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs index c74ed3687a9..c70deb08cd1 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/latin1_bin.rs @@ -4,7 +4,8 @@ use bstr::{ByteSlice, B}; use super::*; -/// Collator for latin1_bin collation with padding behavior (trims right spaces). +/// Collator for latin1_bin collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorLatin1Bin; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs index e12114d9cea..bac55eabea7 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/mod.rs @@ -45,7 +45,8 @@ mod tests { (Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) + // (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, + // Latin1, GBKBin, GbkChineseCi]) ( "a".as_bytes(), "a".as_bytes(), @@ -232,7 +233,8 @@ mod tests { (Collation::GbkChineseCi, 6), ]; let cases = vec![ - // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, GBKBin, GbkChineseCi]) + // (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi, Latin1, + // GBKBin, GbkChineseCi]) ( "a", [ diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs index bbd7e60a047..959664b1854 100644 --- 
a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_binary.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for utf8mb4_bin collation with padding behavior (trims right spaces). +/// Collator for utf8mb4_bin collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorUtf8Mb4Bin; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs index 50770550f19..2cc9a738372 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_general_ci.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for utf8mb4_general_ci collation with padding behavior (trims right spaces). +/// Collator for utf8mb4_general_ci collation with padding behavior (trims right +/// spaces). #[derive(Debug)] pub struct CollatorUtf8Mb4GeneralCi; diff --git a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs index 9bb44382f53..5a529d48144 100644 --- a/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs +++ b/components/tidb_query_datatype/src/codec/collation/collator/utf8mb4_unicode_ci.rs @@ -2,7 +2,8 @@ use super::*; -/// Collator for `utf8mb4_unicode_ci` collation with padding behavior (trims right spaces). +/// Collator for `utf8mb4_unicode_ci` collation with padding behavior (trims +/// right spaces). 
#[derive(Debug)] pub struct CollatorUtf8Mb4UnicodeCi; diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 7d73cce2192..0d6a8e6d9ea 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -149,8 +149,9 @@ where /// /// # Panic /// - /// The `Ord`, `Hash`, `PartialEq` and more implementations assume that the bytes are - /// valid for the certain collator. The violation will cause panic. + /// The `Ord`, `Hash`, `PartialEq` and more implementations assume that the + /// bytes are valid for the certain collator. The violation will cause + /// panic. #[inline] pub fn new_unchecked(inner: T) -> Self { Self { diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index bcfc7bb2bbe..c576f14ee5f 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -280,11 +280,13 @@ impl ToInt for u64 { impl ToInt for f64 { /// This function is ported from TiDB's types.ConvertFloatToInt, - /// which checks whether the number overflows the signed lower and upper boundaries of `tp` + /// which checks whether the number overflows the signed lower and upper + /// boundaries of `tp` /// /// # Notes /// - /// It handles overflows using `ctx` so that the caller would not handle it anymore. + /// It handles overflows using `ctx` so that the caller would not handle it + /// anymore. 
fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { #![allow(clippy::float_cmp)] let val = self.round(); @@ -307,11 +309,13 @@ impl ToInt for f64 { } /// This function is ported from TiDB's types.ConvertFloatToUint, - /// which checks whether the number overflows the unsigned upper boundaries of `tp` + /// which checks whether the number overflows the unsigned upper boundaries + /// of `tp` /// /// # Notes /// - /// It handles overflows using `ctx` so that the caller would not handle it anymore. + /// It handles overflows using `ctx` so that the caller would not handle it + /// anymore. #[allow(clippy::float_cmp)] fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let val = self.round(); @@ -444,8 +448,12 @@ impl ToInt for Decimal { impl ToInt for DateTime { // FiXME - // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4).unwrap().round_frac(DEFAULT_FSP) - // will get 2000-01-01T12:13:14, this is a bug + // ``` + // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4) + // .unwrap() + // .round_frac(DEFAULT_FSP) + // ``` + // will get 2000-01-01T12:13:14, this is a bug #[inline] fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let t = self.round_frac(ctx, DEFAULT_FSP)?; @@ -664,8 +672,8 @@ pub fn produce_dec_with_specified_tp( } } -/// `produce_float_with_specified_tp`(`ProduceFloatWithSpecifiedTp` in TiDB) produces -/// a new float64 according to `flen` and `decimal` in `self.tp`. +/// `produce_float_with_specified_tp`(`ProduceFloatWithSpecifiedTp` in TiDB) +/// produces a new float64 according to `flen` and `decimal` in `self.tp`. /// TODO port tests from TiDB(TiDB haven't implemented now) pub fn produce_float_with_specified_tp( ctx: &mut EvalContext, @@ -692,8 +700,8 @@ pub fn produce_float_with_specified_tp( Ok(res) } -/// `produce_str_with_specified_tp`(`ProduceStrWithSpecifiedTp` in TiDB) produces -/// a new string according to `flen` and `chs`. 
+/// `produce_str_with_specified_tp`(`ProduceStrWithSpecifiedTp` in TiDB) +/// produces a new string according to `flen` and `chs`. pub fn produce_str_with_specified_tp<'a>( ctx: &mut EvalContext, s: Cow<'a, [u8]>, @@ -705,8 +713,8 @@ pub fn produce_str_with_specified_tp<'a>( return Ok(s); } let flen = flen as usize; - // flen is the char length, not byte length, for UTF8 charset, we need to calculate the - // char count and truncate to flen chars if it is too long. + // flen is the char length, not byte length, for UTF8 charset, we need to + // calculate the char count and truncate to flen chars if it is too long. if chs == charset::CHARSET_UTF8 || chs == charset::CHARSET_UTF8MB4 { let (char_count, truncate_pos) = { let s = &String::from_utf8_lossy(&s); @@ -767,7 +775,8 @@ pub fn pad_zero_for_binary_type(s: &mut Vec, ft: &FieldType) { .unwrap_or(false) && s.len() < flen { - // it seems MaxAllowedPacket has not push down to tikv, so we needn't to handle it + // it seems MaxAllowedPacket has not push down to tikv, so we needn't to handle + // it s.resize(flen, 0); } } @@ -831,8 +840,8 @@ pub fn get_valid_int_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result( ctx: &mut EvalContext, s: &'a str, @@ -868,8 +877,8 @@ pub fn get_valid_float_prefix<'a>(ctx: &mut EvalContext, s: &'a str) -> Result<& get_valid_float_prefix_helper(ctx, s, false) } -// As TiDB code(getValidFloatPrefix()), cast expr should not give error/warning when input is -// empty. +// As TiDB code(getValidFloatPrefix()), cast expr should not give error/warning +// when input is empty. pub fn get_valid_float_prefix_helper<'a>( ctx: &mut EvalContext, s: &'a str, @@ -961,14 +970,14 @@ fn round_int_str(num_next_dot: char, s: &str) -> Cow<'_, str> { } /// It converts a valid float string into valid integer string which can be -/// parsed by `i64::from_str`, we can't parse float first then convert it to string -/// because precision will be lost. 
+/// parsed by `i64::from_str`, we can't parse float first then convert it to +/// string because precision will be lost. /// /// When the float string indicating a value that is overflowing the i64, /// the original float string is returned and an overflow warning is attached. /// -/// This func will find serious overflow such as the len of result > 20 (without prefix `+/-`) -/// however, it will not check whether the result overflow BIGINT. +/// This func will find serious overflow such as the len of result > 20 (without +/// prefix `+/-`) however, it will not check whether the result overflow BIGINT. fn float_str_to_int_string<'a>(ctx: &mut EvalContext, valid_float: &'a str) -> Cow<'a, str> { // this func is complex, to make it same as TiDB's version, // we impl it like TiDB's version(https://github.com/pingcap/tidb/blob/9b521342bf/types/convert.go#L400) @@ -1531,7 +1540,8 @@ mod tests { ("{}", ERR_TRUNCATE_WRONG_VALUE), ("[]", ERR_TRUNCATE_WRONG_VALUE), ]; - // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as true + // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as + // true let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); for (jstr, exp) in test_cases { let json: Json = jstr.parse().unwrap(); @@ -1865,7 +1875,8 @@ mod tests { ("{}", ERR_TRUNCATE_WRONG_VALUE), ("[]", ERR_TRUNCATE_WRONG_VALUE), ]; - // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as true + // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as + // true let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); for (jstr, exp) in test_cases { let json: Json = jstr.parse().unwrap(); @@ -2089,7 +2100,8 @@ mod tests { assert_eq!(o.unwrap(), i); } - // Secondly, make sure warnings are attached when the float string cannot be casted to a valid int string + // Secondly, make sure warnings are attached when the float string cannot be + // casted to a valid int string let 
warnings = ctx.take_warnings().warnings; assert_eq!(warnings.len(), 2); for warning in warnings { @@ -2359,8 +2371,8 @@ mod tests { // origin, // (origin_flen, origin_decimal), (res_flen, res_decimal), is_unsigned, // expect, warning_err_code, - // ((InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, truncate_as_warning) - // ) + // ((InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, + // truncate_as_warning) ) // // The origin_flen, origin_decimal field is to // let the programmer clearly know what the flen and decimal of the decimal is. @@ -2646,7 +2658,8 @@ mod tests { // zero // FIXME: // according to Decimal::prec_and_frac, - // the decimals' prec(the number of all digits) and frac(the number of digit after number point) are + // the decimals' prec(the number of all digits) and frac(the number of digit after + // number point) are: // Decimal::zero()'s is (1, 0) // Decimal::from_bytes(b"00.00")'s is (2, 2) // Decimal::from_bytes(b"000.00")'s is (2, 2) diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs index 7086e97c23b..4bad0fcc129 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs @@ -13,11 +13,11 @@ pub struct ChunkedVecBytes { /// A vector storing `Option` with a compact layout. /// -/// Inside `ChunkedVecBytes`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. Bytes data are stored adjacent to each other in -/// `data`. If element at a given index is null, then it takes no space in `data`. -/// Otherwise, contents of the `Bytes` are stored, and `var_offset` indicates the starting -/// position of each element. +/// Inside `ChunkedVecBytes`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. 
Bytes data are stored adjacent to each +/// other in `data`. If element at a given index is null, then it takes no space +/// in `data`. Otherwise, contents of the `Bytes` are stored, and `var_offset` +/// indicates the starting position of each element. impl ChunkedVecBytes { #[inline] pub fn push_data_ref(&mut self, value: BytesRef<'_>) { diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs index 52279c5a439..9ef17dc61eb 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_json.rs @@ -7,11 +7,12 @@ use crate::impl_chunked_vec_common; /// A vector storing `Option` with a compact layout. /// -/// Inside `ChunkedVecJson`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. Json data are stored adjacent to each other in -/// `data`. If element at a given index is null, then it takes no space in `data`. -/// Otherwise, a one byte `json_type` and variable size json data is stored in `data`, -/// and `var_offset` indicates the starting position of each element. +/// Inside `ChunkedVecJson`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. Json data are stored adjacent to each +/// other in `data`. If element at a given index is null, then it takes no space +/// in `data`. Otherwise, a one byte `json_type` and variable size json data is +/// stored in `data`, and `var_offset` indicates the starting position of each +/// element. 
#[derive(Debug, PartialEq, Clone)] pub struct ChunkedVecJson { data: Vec, diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs index 41b523391c2..1a3f6838e96 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_set.rs @@ -20,7 +20,8 @@ use crate::impl_chunked_vec_common; /// stored representation issue /// /// TODO: add way to set set column data -/// TODO: code fot set/enum looks nearly the same, considering refactor them using macro +/// TODO: code fot set/enum looks nearly the same, considering refactor them +/// using macro #[derive(Debug, Clone)] pub struct ChunkedVecSet { data: Arc, diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs index 45e2665ec31..4f614d00be0 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_sized.rs @@ -9,10 +9,11 @@ use crate::impl_chunked_vec_common; /// in that structure itself. This includes `Int`, `Real`, `Decimal`, /// `DateTime` and `Duration` in copr framework. /// -/// Inside `ChunkedVecSized`, `bitmap` indicates if an element at given index is null, -/// and `data` stores actual data. If the element at given index is null (or `None`), -/// the corresponding `bitmap` bit is false, and `data` stores zero value for -/// that element. Otherwise, `data` stores actual data, and `bitmap` bit is true. +/// Inside `ChunkedVecSized`, `bitmap` indicates if an element at given index is +/// null, and `data` stores actual data. If the element at given index is null +/// (or `None`), the corresponding `bitmap` bit is false, and `data` stores zero +/// value for that element. 
Otherwise, `data` stores actual data, and `bitmap` +/// bit is true. #[derive(Debug, PartialEq, Clone)] pub struct ChunkedVecSized { data: Vec, diff --git a/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs b/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs index d27a030b817..46b5a64b010 100644 --- a/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs +++ b/components/tidb_query_datatype/src/codec/data_type/logical_rows.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -// TODO: This value is chosen based on MonetDB/X100's research without our own benchmarks. +// TODO: This value is chosen based on MonetDB/X100's research without our own +// benchmarks. pub const BATCH_MAX_SIZE: usize = 1024; /// Identical logical row is a special case in expression evaluation that diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index 8397a8d2ab5..278ef48469a 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -50,8 +50,8 @@ pub use crate::codec::mysql::{ }; use crate::{codec::convert::ConvertTo, expr::EvalContext, EvalType}; -/// A trait of evaluating current concrete eval type into a MySQL logic value, represented by -/// Rust's `bool` type. +/// A trait of evaluating current concrete eval type into a MySQL logic value, +/// represented by Rust's `bool` type. pub trait AsMySQLBool { /// Evaluates into a MySQL logic value. fn as_mysql_bool(&self, context: &mut EvalContext) -> Result; @@ -187,27 +187,28 @@ pub trait Evaluable: Clone + std::fmt::Debug + Send + Sync + 'static { /// panics if the varient mismatches. fn borrow_scalar_value_ref(v: ScalarValueRef<'_>) -> Option<&Self>; - /// Borrows a slice of this concrete type from a `VectorValue` in the same type; - /// panics if the varient mismatches. 
+ /// Borrows a slice of this concrete type from a `VectorValue` in the same + /// type; panics if the varient mismatches. fn borrow_vector_value(v: &VectorValue) -> &ChunkedVecSized; } pub trait EvaluableRet: Clone + std::fmt::Debug + Send + Sync + 'static { const EVAL_TYPE: EvalType; type ChunkedType: ChunkedVec; - /// Converts a vector of this concrete type into a `VectorValue` in the same type; - /// panics if the varient mismatches. + /// Converts a vector of this concrete type into a `VectorValue` in the same + /// type; panics if the varient mismatches. fn cast_chunk_into_vector_value(vec: Self::ChunkedType) -> VectorValue; } /// # Notes /// -/// Make sure operating `bitmap` and `value` together, so while `bitmap` is 0 and the -/// corresponding value is None. +/// Make sure operating `bitmap` and `value` together, so while `bitmap` is 0 +/// and the corresponding value is None. /// /// With this guaranty, we can avoid the following issue: /// -/// For Data [Some(1), Some(2), None], we could have different stored representation: +/// For Data [Some(1), Some(2), None], we could have different stored +/// representation: /// /// Bitmap: 110, Value: 1, 2, 0 /// Bitmap: 110, Value: 1, 2, 1 @@ -368,8 +369,8 @@ pub trait EvaluableRef<'a>: Clone + std::fmt::Debug + Send + Sync { /// panics if the varient mismatches. fn borrow_scalar_value_ref(v: ScalarValueRef<'a>) -> Option; - /// Borrows a slice of this concrete type from a `VectorValue` in the same type; - /// panics if the varient mismatches. + /// Borrows a slice of this concrete type from a `VectorValue` in the same + /// type; panics if the varient mismatches. 
fn borrow_vector_value(v: &'a VectorValue) -> Self::ChunkedType; /// Convert this reference to owned type diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index 7bf36935f3b..b95dbb63342 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -13,17 +13,19 @@ use crate::{ /// A scalar value container, a.k.a. datum, for all concrete eval types. /// -/// In many cases, for example, at the framework level, the concrete eval type is unknown at compile -/// time. So we use this enum container to represent types dynamically. It is similar to trait -/// object `Box` where `T` is a concrete eval type but faster. +/// In many cases, for example, at the framework level, the concrete eval type +/// is unknown at compile time. So we use this enum container to represent types +/// dynamically. It is similar to trait object `Box` where `T` is a concrete +/// eval type but faster. /// /// Like `VectorValue`, the inner concrete value is immutable. /// /// Compared to `VectorValue`, it only contains a single concrete value. -/// Compared to `Datum`, it is a newer encapsulation that naturally wraps `Option<..>`. +/// Compared to `Datum`, it is a newer encapsulation that naturally wraps +/// `Option<..>`. /// -/// TODO: Once we removed the `Option<..>` wrapper, it will be much like `Datum`. At that time, -/// we only need to preserve one of them. +/// TODO: Once we removed the `Option<..>` wrapper, it will be much like +/// `Datum`. At that time, we only need to preserve one of them. #[derive(Clone, Debug, PartialEq)] pub enum ScalarValue { Int(Option), @@ -170,7 +172,8 @@ impl From for Option { } } -/// A scalar value reference container. Can be created from `ScalarValue` or `VectorValue`. +/// A scalar value reference container. Can be created from `ScalarValue` or +/// `VectorValue`. 
#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ScalarValueRef<'a> { Int(Option<&'a super::Int>), diff --git a/components/tidb_query_datatype/src/codec/data_type/vector.rs b/components/tidb_query_datatype/src/codec/data_type/vector.rs index d26067d8219..c7eecf92fa0 100644 --- a/components/tidb_query_datatype/src/codec/data_type/vector.rs +++ b/components/tidb_query_datatype/src/codec/data_type/vector.rs @@ -8,8 +8,8 @@ use crate::{ /// A vector value container, a.k.a. column, for all concrete eval types. /// -/// The inner concrete value is immutable. However it is allowed to push and remove values from -/// this vector container. +/// The inner concrete value is immutable. However it is allowed to push and +/// remove values from this vector container. #[derive(Debug, PartialEq, Clone)] pub enum VectorValue { Int(ChunkedVecSized), @@ -25,8 +25,8 @@ pub enum VectorValue { } impl VectorValue { - /// Creates an empty `VectorValue` according to `eval_tp` and reserves capacity according - /// to `capacity`. + /// Creates an empty `VectorValue` according to `eval_tp` and reserves + /// capacity according to `capacity`. #[inline] pub fn with_capacity(capacity: usize, eval_tp: EvalType) -> Self { match_template_evaltype! { @@ -116,9 +116,11 @@ impl VectorValue { self.len() == 0 } - /// Shortens the column, keeping the first `len` datums and dropping the rest. + /// Shortens the column, keeping the first `len` datums and dropping the + /// rest. /// - /// If `len` is greater than the column's current length, this has no effect. + /// If `len` is greater than the column's current length, this has no + /// effect. #[inline] pub fn truncate(&mut self, len: usize) { match_template_evaltype! { @@ -134,7 +136,8 @@ impl VectorValue { self.truncate(0); } - /// Returns the number of elements this column can hold without reallocating. + /// Returns the number of elements this column can hold without + /// reallocating. 
#[inline] pub fn capacity(&self) -> usize { match_template_evaltype! { @@ -165,7 +168,8 @@ impl VectorValue { /// Evaluates values into MySQL logic values. /// - /// The caller must provide an output buffer which is large enough for holding values. + /// The caller must provide an output buffer which is large enough for + /// holding values. pub fn eval_as_mysql_bools( &self, ctx: &mut EvalContext, @@ -464,7 +468,8 @@ impl VectorValue { macro_rules! impl_as_slice { ($ty:tt, $name:ident) => { impl VectorValue { - /// Extracts a slice of values in specified concrete type from current column. + /// Extracts a slice of values in specified concrete type from current + /// column. /// /// # Panics /// @@ -494,8 +499,9 @@ impl_as_slice! { Json, to_json_vec } impl_as_slice! { Enum, to_enum_vec } impl_as_slice! { Set, to_set_vec } -/// Additional `VectorValue` methods available via generics. These methods support different -/// concrete types but have same names and should be specified via the generic parameter type. +/// Additional `VectorValue` methods available via generics. These methods +/// support different concrete types but have same names and should be specified +/// via the generic parameter type. pub trait VectorValueExt { /// The generic version for `VectorValue::push_xxx()`. fn push(&mut self, v: Option); diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index a1cc6460ae2..8d2e62b6ac0 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -162,7 +162,8 @@ pub fn cmp_f64(l: f64, r: f64) -> Result { .ok_or_else(|| invalid_type!("{} and {} can't be compared", l, r)) } -/// `checked_add_i64` checks and adds `r` to the `l`. Return None if the sum is negative. +/// `checked_add_i64` checks and adds `r` to the `l`. Return None if the sum is +/// negative. 
#[inline] fn checked_add_i64(l: u64, r: i64) -> Option { if r >= 0 { @@ -908,8 +909,8 @@ pub trait DatumDecoder: NIL_FLAG => Datum::Null, FLOAT_FLAG => self.read_f64().map(Datum::F64)?, DURATION_FLAG => { - // Decode the i64 into `Duration` with `MAX_FSP`, then unflatten it with concrete - // `FieldType` information + // Decode the i64 into `Duration` with `MAX_FSP`, then unflatten it with + // concrete `FieldType` information let nanos = self.read_i64()?; let dur = Duration::from_nanos(nanos, MAX_FSP)?; Datum::Dur(dur) @@ -1010,7 +1011,7 @@ pub trait DatumEncoder: self.write_u8(JSON_FLAG)?; self.write_json(j.as_ref())?; } - //TODO: implement datum write here. + // TODO: implement datum write here. Datum::Enum(_) => unimplemented!(), Datum::Set(_) => unimplemented!(), } @@ -1073,7 +1074,8 @@ pub fn encode(ctx: &mut EvalContext, values: &[Datum], comparable: bool) -> Resu Ok(buf) } -/// `encode_key` encodes a datum slice into a memory comparable buffer as the key. +/// `encode_key` encodes a datum slice into a memory comparable buffer as the +/// key. pub fn encode_key(ctx: &mut EvalContext, values: &[Datum]) -> Result> { encode(ctx, values, true) } @@ -1134,7 +1136,8 @@ pub fn split_datum(buf: &[u8], desc: bool) -> Result<(&[u8], &[u8])> { /// `skip_n_datum_slices` skip `n` datum slices within `buf` /// and advances the buffer pointer. -/// If the datum buffer contains less than `n` slices, an error will be returned. +/// If the datum buffer contains less than `n` slices, an error will be +/// returned. pub fn skip_n(buf: &mut &[u8], n: usize) -> Result<()> { let origin = *buf; for i in 0..n { diff --git a/components/tidb_query_datatype/src/codec/datum_codec.rs b/components/tidb_query_datatype/src/codec/datum_codec.rs index 6710029ec99..9d3f5058d0b 100644 --- a/components/tidb_query_datatype/src/codec/datum_codec.rs +++ b/components/tidb_query_datatype/src/codec/datum_codec.rs @@ -1,7 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! 
The unified entry for encoding and decoding an evaluable type to / from datum bytes. -//! Datum bytes consists of 1 byte datum flag and variable bytes datum payload. +//! The unified entry for encoding and decoding an evaluable type to / from +//! datum bytes. Datum bytes consists of 1 byte datum flag and variable bytes +//! datum payload. use codec::prelude::*; use tipb::FieldType; diff --git a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs index 9904ead1098..8d1f5fdd8bb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs @@ -44,9 +44,10 @@ pub fn to_uint(ctx: &mut EvalContext, bytes: &[u8]) -> Result { } impl BinaryLiteral { - /// from_u64 creates a new BinaryLiteral instance by the given uint value in BigEndian. - /// byte size will be used as the length of the new BinaryLiteral, with leading bytes filled to zero. - /// If byte size is -1, the leading zeros in new BinaryLiteral will be trimmed. + /// from_u64 creates a new BinaryLiteral instance by the given uint value in + /// BigEndian. byte size will be used as the length of the new + /// BinaryLiteral, with leading bytes filled to zero. If byte size is -1, + /// the leading zeros in new BinaryLiteral will be trimmed. pub fn from_u64(val: u64, byte_size: isize) -> Result { if byte_size != -1 && !(1..=8).contains(&byte_size) { return Err(box_err!("invalid byte size: {}", byte_size)); diff --git a/components/tidb_query_datatype/src/codec/mysql/charset.rs b/components/tidb_query_datatype/src/codec/mysql/charset.rs index 27ad1b2a44f..0ac2655c619 100644 --- a/components/tidb_query_datatype/src/codec/mysql/charset.rs +++ b/components/tidb_query_datatype/src/codec/mysql/charset.rs @@ -4,7 +4,8 @@ pub const CHARSET_BIN: &str = "binary"; /// `CHARSET_UTF8` is the default charset for string types. 
pub const CHARSET_UTF8: &str = "utf8"; -/// `CHARSET_UTF8MB4` represents 4 bytes utf8, which works the same way as utf8 in Rust. +/// `CHARSET_UTF8MB4` represents 4 bytes utf8, which works the same way as utf8 +/// in Rust. pub const CHARSET_UTF8MB4: &str = "utf8mb4"; /// `CHARSET_ASCII` is a subset of UTF8. pub const CHARSET_ASCII: &str = "ascii"; diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 2eec85b7e34..a172d2e2723 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -58,10 +58,11 @@ impl Res { matches!(*self, Res::Truncated(_)) } - /// Convert `Res` into `Result` with an `EvalContext` that handling the errors - /// If `truncated_err` is None, `ctx` will try to handle the default truncated error: `Error::truncated()`, - /// otherwise handle the specified error inside `truncated_err`. - /// Same does `overflow_err` means. + /// Convert `Res` into `Result` with an `EvalContext` that handling the + /// errors If `truncated_err` is None, `ctx` will try to handle the + /// default truncated error: `Error::truncated()`, otherwise handle the + /// specified error inside `truncated_err`. Same does `overflow_err` + /// means. fn into_result_impl( self, ctx: &mut EvalContext, @@ -186,7 +187,8 @@ pub fn dec_encoded_len(encoded: &[u8]) -> Result { Ok(int_len + frac_len + 2) } -/// `count_leading_zeroes` returns the number of leading zeroes that can be removed from int. +/// `count_leading_zeroes` returns the number of leading zeroes that can be +/// removed from int. fn count_leading_zeroes(i: u8, word: u32) -> u8 { let (mut c, mut i) = (0, i as usize); while TEN_POW[i] > word { @@ -196,7 +198,8 @@ fn count_leading_zeroes(i: u8, word: u32) -> u8 { c } -/// `count_trailing_zeroes` returns the number of trailing zeroes that can be removed from fraction. 
+/// `count_trailing_zeroes` returns the number of trailing zeroes that can be +/// removed from fraction. fn count_trailing_zeroes(i: u8, word: u32) -> u8 { let (mut c, mut i) = (0, i as usize); while word % TEN_POW[i] == 0 { @@ -259,14 +262,15 @@ fn sub2(lhs: u32, rhs: u32, carry: &mut i32, res: &mut u32) { type SubTmp = (usize, usize, u8); -/// calculate the carry for lhs - rhs, returns the carry and needed temporary results for -/// beginning a subtraction. +/// calculate the carry for lhs - rhs, returns the carry and needed temporary +/// results for beginning a subtraction. /// /// The new carry can be: /// 1. None if lhs is equals to rhs. /// 2. Some(0) if abs(lhs) > abs(rhs), /// 3. Some(1) if abs(lhs) < abs(rhs). -/// l_frac_word_cnt and r_frac_word_cnt do not contain the suffix 0 when r_int_word_cnt == l_int_word_cnt. +/// l_frac_word_cnt and r_frac_word_cnt do not contain the suffix 0 when +/// r_int_word_cnt == l_int_word_cnt. #[inline] fn calc_sub_carry(lhs: &Decimal, rhs: &Decimal) -> (Option, u8, SubTmp, SubTmp) { let (l_int_word_cnt, mut l_frac_word_cnt) = (word_cnt!(lhs.int_cnt), word_cnt!(lhs.frac_cnt)); @@ -303,9 +307,11 @@ fn calc_sub_carry(lhs: &Decimal, rhs: &Decimal) -> (Option, u8, SubTmp, Sub while r_idx as isize <= r_end && rhs.word_buf[r_end as usize] == 0 { r_end -= 1; } - // here l_end is the last nonzero index in l.word_buf, attention:it may in the range of (0,l_int_word_cnt) + // here l_end is the last nonzero index in l.word_buf, attention:it may in the + // range of (0,l_int_word_cnt) l_frac_word_cnt = cmp::max(0, l_end + 1 - l_stop as isize) as u8; - // here r_end is the last nonzero index in r.word_buf, attention:it may in the range of (0,r_int_word_cnt) + // here r_end is the last nonzero index in r.word_buf, attention:it may in the + // range of (0,r_int_word_cnt) r_frac_word_cnt = cmp::max(0, r_end + 1 - r_stop as isize) as u8; while l_idx as isize <= l_end && r_idx as isize <= r_end @@ -976,10 +982,10 @@ impl Decimal { } /// 
Given a precision count 'prec', get: - /// 1. the index of first non-zero word in self.word_buf to hold the leading 'prec' number of - /// digits - /// 2. the number of remained digits if we remove all leading zeros for the leading 'prec' - /// number of digits + /// 1. the index of first non-zero word in self.word_buf to hold the + /// leading 'prec' number of digits + /// 2. the number of remained digits if we remove all leading zeros for the + /// leading 'prec' number of digits fn remove_leading_zeroes(&self, prec: u8) -> (usize, u8) { let mut cnt = prec; let mut i = ((cnt + DIGITS_PER_WORD - 1) % DIGITS_PER_WORD) + 1; @@ -1016,7 +1022,8 @@ impl Decimal { (buf, word_start_idx, int_len, int_cnt, frac_cnt) } - /// Get the least precision and fraction count to encode this decimal completely. + /// Get the least precision and fraction count to encode this decimal + /// completely. pub fn prec_and_frac(&self) -> (u8, u8) { let (_, int_cnt) = self.remove_leading_zeroes(self.int_cnt); let prec = int_cnt + self.frac_cnt; @@ -1338,8 +1345,9 @@ impl Decimal { dec } - /// `shift` shifts decimal digits in given number (with rounding if it need), - /// shift > 0 means shift to left shift, shift < 0 means right shift. + /// `shift` shifts decimal digits in given number (with rounding if it + /// need), shift > 0 means shift to left shift, shift < 0 means right + /// shift. /// /// In fact it is multiplying on 10^shift. 
pub fn shift(self, shift: isize) -> Res { @@ -1564,7 +1572,8 @@ impl Decimal { Decimal::from_bytes_with_word_buf(s, WORD_BUF_LEN) } - /// Returns a `Decimal` from a given bytes slice buffer and specified buffer length + /// Returns a `Decimal` from a given bytes slice buffer and specified buffer + /// length /// /// # Notes /// @@ -1574,7 +1583,7 @@ impl Decimal { fn from_bytes_with_word_buf(s: &[u8], word_buf_len: u8) -> Result> { // trim whitespace let mut bs = match s.iter().position(|c| !c.is_ascii_whitespace()) { - //TODO: return badnumber + // TODO: return badnumber None => return Err(box_err!("\"{}\" is empty", escape(s))), Some(pos) => &s[pos..], }; @@ -1618,7 +1627,7 @@ impl Decimal { word += u32::from(c - b'0') * TEN_POW[inner_idx]; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { - //TODO overflow + // TODO overflow word_idx -= 1; d.word_buf[word_idx] = word; word = 0; @@ -2245,7 +2254,8 @@ pub trait DecimalDecoder: NumberDecoder { Ok(d) } - /// `read_decimal_from_chunk` decode Decimal encoded by `write_decimal_to_chunk`. + /// `read_decimal_from_chunk` decode Decimal encoded by + /// `write_decimal_to_chunk`. 
fn read_decimal_from_chunk(&mut self) -> Result { let buf = self.read_bytes(DECIMAL_STRUCT_SIZE)?; let d = unsafe { @@ -2457,12 +2467,15 @@ mod tests { Ok(Decimal::from_str("-18446744073709552000").unwrap()), ), // FIXME: because of rust's bug, - // (1<<64)(18446744073709551616), (1<<65)(36893488147419103232) can not be represent by f64 - // so these cases can not pass + // (1<<64)(18446744073709551616), (1<<65)(36893488147419103232) can not be represent + // by f64 so these cases can not pass // (18446744073709551616.0, Ok(Decimal::from_str("18446744073709551616").unwrap())), // (-18446744073709551616.0, Ok(Decimal::from_str("-18446744073709551616").unwrap())), // (36893488147419103000.0, Ok(Decimal::from_str("36893488147419103000.0").unwrap())), - // (-36893488147419103000.0, Ok(Decimal::from_str("-36893488147419103000.0").unwrap())), + // ( + // -36893488147419103000.0, + // Ok(Decimal::from_str("-36893488147419103000.0").unwrap()) + // ), ( 36893488147419103000.0, Ok(Decimal::from_str("36893488147419103000.0").unwrap()), diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index e151c8fd0c5..370467b9928 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -150,9 +150,9 @@ mod parser { Ok((rest, hhmmss)) } - /// A string can match datetime format only if it starts with a series of digits - /// whose length matches the full format of DateTime literal (12, 14) - /// or the string starts with a date literal. + /// A string can match datetime format only if it starts with a series of + /// digits whose length matches the full format of DateTime literal (12, + /// 14) or the string starts with a date literal. 
fn format_can_match_datetime(input: &str) -> IResult<(), (), ()> { let (rest, digits) = digit1(input)?; @@ -253,8 +253,9 @@ mod parser { )) }); - // In order to keep compatible with TiDB, when input string can only be partially parsed by `hhmmss_compact` - // and it can match the datetime format, we fallback to parse it using datetime format. + // In order to keep compatible with TiDB, when input string can only be + // partially parsed by `hhmmss_compact` and it can match the datetime + // format, we fallback to parse it using datetime format. if truncated_parse && fallback_to_datetime { return hhmmss_datetime(ctx, rest, fsp).map_or(None, |(_, duration)| Some(duration)); } @@ -363,7 +364,8 @@ impl Duration { } /// Returns the number of seconds contained by this Duration as f64. - /// The returned value does include the fractional (nanosecond) part of the duration. + /// The returned value does include the fractional (nanosecond) part of the + /// duration. #[inline] pub fn to_secs_f64(self) -> f64 { self.nanos as f64 / NANOS_PER_SEC as f64 @@ -507,7 +509,8 @@ impl Duration { Ok(Duration { nanos, fsp }) } - /// Checked duration addition. Computes self + rhs, returning None if overflow occurred. + /// Checked duration addition. Computes self + rhs, returning None if + /// overflow occurred. pub fn checked_add(self, rhs: Duration) -> Option { let nanos = self.nanos.checked_add(rhs.nanos)?; check_nanos(nanos).ok()?; @@ -517,7 +520,8 @@ impl Duration { }) } - /// Checked duration subtraction. Computes self - rhs, returning None if overflow occurred. + /// Checked duration subtraction. Computes self - rhs, returning None if + /// overflow occurred. 
pub fn checked_sub(self, rhs: Duration) -> Option { let nanos = self.nanos.checked_sub(rhs.nanos)?; check_nanos(nanos).ok()?; diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index 1cad179b475..fe8bb2c35d7 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -145,8 +145,8 @@ impl<'a> PartialOrd for JsonRef<'a> { let left_data = self.as_f64(); let right_data = right.as_f64(); - // tidb treats boolean as integer, but boolean is different from integer in JSON. - // so we need convert them to same type and then compare. + // tidb treats boolean as integer, but boolean is different from integer in + // JSON. so we need convert them to same type and then compare. if let (Ok(left), Ok(right)) = (left_data, right_data) { return left.partial_cmp(&right); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs index bc867904fd6..f7c1198c542 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs @@ -7,9 +7,10 @@ use super::{ }; impl<'a> JsonRef<'a> { - /// `extract` receives several path expressions as arguments, matches them in j, and returns - /// the target JSON matched any path expressions, which may be autowrapped as an array. - /// If there is no any expression matched, it returns None. + /// `extract` receives several path expressions as arguments, matches them + /// in j, and returns the target JSON matched any path expressions, which + /// may be autowrapped as an array. If there is no any expression matched, + /// it returns None. 
/// /// See `Extract()` in TiDB `json.binary_function.go` pub fn extract(&self, path_expr_list: &[PathExpression]) -> Result> { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs index 96bc9aaf56e..68c361321ad 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_keys.rs @@ -5,7 +5,8 @@ use std::str; use super::{super::Result, path_expr::PathExpression, Json, JsonRef, JsonType}; impl<'a> JsonRef<'a> { - /// Evaluates a (possibly empty) list of values and returns a JSON array containing those values specified by `path_expr_list` + /// Evaluates a (possibly empty) list of values and returns a JSON array + /// containing those values specified by `path_expr_list` pub fn keys(&self, path_expr_list: &[PathExpression]) -> Result> { if !path_expr_list.is_empty() { if path_expr_list.len() > 1 { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs index 3bccdce7017..627daf77722 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_merge.rs @@ -13,7 +13,8 @@ impl Json { /// 1. adjacent arrays are merged to a single array; /// 2. adjacent object are merged to a single object; /// 3. a scalar value is autowrapped as an array before merge; - /// 4. an adjacent array and object are merged by autowrapping the object as an array. + /// 4. an adjacent array and object are merged by autowrapping the object as + /// an array. 
/// /// See `MergeBinary()` in TiDB `json/binary_function.go` #[allow(clippy::comparison_chain)] diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 2b36a4b89d0..7251f5477f6 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -54,7 +54,6 @@ //! // lengths up to 127, 2 bytes to represent //! // lengths up to 16383, and so on... //! ``` -//! mod binary; mod comparison; @@ -432,7 +431,8 @@ impl ConvertTo for i64 { impl ConvertTo for f64 { #[inline] fn convert(&self, _: &mut EvalContext) -> Result { - // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we return `DOUBLE` now. + // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we + // return `DOUBLE` now. let mut value = vec![0; F64_SIZE]; NumberCodec::encode_f64_le(&mut value, *self); Ok(Json { @@ -445,7 +445,8 @@ impl ConvertTo for f64 { impl ConvertTo for Real { #[inline] fn convert(&self, _: &mut EvalContext) -> Result { - // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we return `DOUBLE` now. + // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we + // return `DOUBLE` now. let mut value = vec![0; F64_SIZE]; NumberCodec::encode_f64_le(&mut value, self.into_inner()); Ok(Json { @@ -458,7 +459,8 @@ impl ConvertTo for Real { impl ConvertTo for Decimal { #[inline] fn convert(&self, ctx: &mut EvalContext) -> Result { - // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we return `DOUBLE` now. + // FIXME: `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we + // return `DOUBLE` now. 
let val: f64 = self.convert(ctx)?; val.convert(ctx) } @@ -589,7 +591,8 @@ mod tests { ("{}", ERR_TRUNCATE_WRONG_VALUE), ("[]", ERR_TRUNCATE_WRONG_VALUE), ]; - // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as true + // avoid to use EvalConfig::default_for_test() that set Flag::IGNORE_TRUNCATE as + // true let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); for (jstr, exp) in test_cases { let json: Json = jstr.parse().unwrap(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs index 5118da55377..ecdec8adad4 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs @@ -52,7 +52,8 @@ impl<'a> BinaryModifier<'a> { self.rebuild() } - /// Replaces the existing value JSON specified by the expression path with `new` + /// Replaces the existing value JSON specified by the expression path with + /// `new` pub fn replace(mut self, path: &PathExpression, new: Json) -> Result { let result = extract_json(self.old, path.legs.as_slice())?; if result.is_empty() { @@ -63,8 +64,8 @@ impl<'a> BinaryModifier<'a> { self.rebuild() } - /// Inserts a `new` into `old` JSON document by given expression path without replacing - /// existing values + /// Inserts a `new` into `old` JSON document by given expression path + /// without replacing existing values pub fn insert(mut self, path: &PathExpression, new: Json) -> Result { let result = extract_json(self.old, path.legs.as_slice())?; if !result.is_empty() { @@ -97,7 +98,8 @@ impl<'a> BinaryModifier<'a> { for i in 0..elem_count { elems.push(parent_node.array_get_elem(i)?); } - // We can ignore the idx in the PathLeg here since we have checked the path-value existence + // We can ignore the idx in the PathLeg here since we have checked the + // path-value existence elems.push(new.as_ref()); self.new_value = 
Some(Json::from_ref_array(elems)?); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs index 09e524fe373..afb9cafff67 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs @@ -15,6 +15,7 @@ // 2) double asterisk(**) could not be last leg; // // Examples: +// ``` // select json_extract('{"a": "b", "c": [1, "2"]}', '$.a') -> "b" // select json_extract('{"a": "b", "c": [1, "2"]}', '$.c') -> [1, "2"] // select json_extract('{"a": "b", "c": [1, "2"]}', '$.a', '$.c') -> ["b", [1, "2"]] @@ -22,6 +23,7 @@ // select json_extract('{"a": "b", "c": [1, "2"]}', '$.c[2]') -> NULL // select json_extract('{"a": "b", "c": [1, "2"]}', '$.c[*]') -> [1, "2"] // select json_extract('{"a": "b", "c": [1, "2"]}', '$.*') -> ["b", [1, "2"]] +// ``` use std::ops::Index; @@ -33,7 +35,8 @@ use crate::codec::Result; pub const PATH_EXPR_ASTERISK: &str = "*"; // [a-zA-Z_][a-zA-Z0-9_]* matches any identifier; -// "[^"\\]*(\\.[^"\\]*)*" matches any string literal which can carry escaped quotes. +// "[^"\\]*(\\.[^"\\]*)*" matches any string literal which can carry escaped +// quotes. const PATH_EXPR_LEG_RE_STR: &str = r#"(\.\s*([a-zA-Z_][a-zA-Z0-9_]*|\*|"[^"\\]*(\\.[^"\\]*)*")|(\[\s*([0-9]+|\*)\s*\])|\*\*)"#; @@ -135,7 +138,8 @@ pub fn parse_json_path_expr(path_expr: &str) -> Result { legs.push(PathLeg::DoubleAsterisk); } } - // Check `!expr.is_empty()` here because "$" is a valid path to specify the current JSON. + // Check `!expr.is_empty()` here because "$" is a valid path to specify the + // current JSON. 
if (last_end == 0) && (!expr.is_empty()) { return Err(box_err!("Invalid JSON path: {}", path_expr)); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index 984bb151323..1b848c3534f 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -58,7 +58,8 @@ impl MySQLFormatter { } impl<'a> ToString for JsonRef<'a> { - /// This function is a simple combination and rewrite of serde_json's `to_writer_pretty` + /// This function is a simple combination and rewrite of serde_json's + /// `to_writer_pretty` fn to_string(&self) -> String { let mut writer = Vec::with_capacity(128); let mut ser = JsonSerializer::with_formatter(&mut writer, MySQLFormatter::new()); diff --git a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs index 816d189c999..7cc233e92d1 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs @@ -59,11 +59,12 @@ impl DateTimeExtension for Time { } /// returns the week of year and year. should not be called directly. - /// when monday_first == true, Monday is considered as the first day in the week, - /// otherwise Sunday. - /// when week_year == true, week is from 1 to 53, otherwise from 0 to 53. - /// when first_weekday == true, the week that contains the first 'first-day-of-week' is week 1, - /// otherwise weeks are numbered according to ISO 8601:1988. + /// - when monday_first == true, Monday is considered as the first day in + /// the week, otherwise Sunday. + /// - when week_year == true, week is from 1 to 53, otherwise from 0 to 53. + /// - when first_weekday == true, the week that contains the first + /// 'first-day-of-week' is week 1, otherwise weeks are numbered according + /// to ISO 8601:1988. 
fn calc_year_week( &self, monday_first: bool, @@ -104,8 +105,8 @@ impl DateTimeExtension for Time { (year, week) } - /// returns the week of year according to week mode. should not be called directly. - /// implements TiDB calcWeek() + /// returns the week of year according to week mode. should not be called + /// directly. implements TiDB calcWeek() fn calc_year_week_by_week_mode(&self, week_mode: WeekMode) -> (i32, i32) { let mode = week_mode.to_normalized(); let monday_first = mode.contains(WeekMode::BEHAVIOR_MONDAY_FIRST); diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 29b66725e2a..5d387f1cdff 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -73,9 +73,9 @@ fn last_day_of_month(year: u32, month: u32) -> u32 { /// assert_eq!([2019, 12, 2, 0, 0, 0, 0], parts); /// ``` /// When year, month or day is zero, there can not have a carry. -/// e.g.: `"1998-11-00 23:59:59.999" (fsp = 2, round = true)`, in `hms` it contains a carry, -/// however, the `day` is 0, which is invalid in `MySQL`. When thoese cases encountered, return -/// None. +/// e.g.: `"1998-11-00 23:59:59.999" (fsp = 2, round = true)`, in `hms` it +/// contains a carry, however, the `day` is 0, which is invalid in `MySQL`. When +/// thoese cases encountered, return None. fn round_components(parts: &mut [u32]) -> Option<()> { debug_assert_eq!(parts.len(), 7); let modulus = [ @@ -113,9 +113,10 @@ fn chrono_datetime( second: u32, micro: u32, ) -> Result> { - // NOTE: We are not using `tz::from_ymd_opt` as suggested in chrono's README due to - // chronotope/chrono-tz #23. - // As a workaround, we first build a NaiveDate, then attach time zone information to it. + // NOTE: We are not using `tz::from_ymd_opt` as suggested in chrono's README due + // to chronotope/chrono-tz #23. 
+ // As a workaround, we first build a NaiveDate, then attach time zone + // information to it. NaiveDate::from_ymd_opt(year as i32, month, day) .and_then(|date| date.and_hms_opt(hour, minute, second)) .and_then(|t| t.checked_add_signed(chrono::Duration::microseconds(i64::from(micro)))) @@ -344,7 +345,8 @@ mod parser { /// ```ignore /// split_components_with_tz(b"2020-12-24T15:37:50+0800")?.1 == Some(480*60) /// ``` - /// the second value if not None indicates the offset in seconds of the timezone parsed + /// the second value if not None indicates the offset in seconds of the + /// timezone parsed fn split_components_with_tz(input: &str) -> Option<(Vec<&[u8]>, Option)> { let mut buffer = input.as_bytes(); @@ -508,8 +510,9 @@ mod parser { } } - /// Try to parse a datetime string `input` without fractional part and separators. - /// return an array that stores `[year, month, day, hour, minute, second, 0]` + /// Try to parse a datetime string `input` without fractional part and + /// separators. return an array that stores `[year, month, day, hour, + /// minute, second, 0]` fn parse_whole(input: &[u8]) -> Option<[u32; 7]> { let mut parts = [0u32; 7]; @@ -535,8 +538,8 @@ mod parser { Some(parts) } - /// Try to parse a fractional part from `input` with `fsp`, round the result if `round` is - /// true. + /// Try to parse a fractional part from `input` with `fsp`, round the result + /// if `round` is true. /// NOTE: This function assumes that `fsp` is in range: [0, 6]. 
fn parse_frac(input: &[u8], fsp: u8, round: bool) -> Option<(bool, u32)> { debug_assert!(fsp < 7); @@ -568,8 +571,8 @@ mod parser { let trimmed = input.trim(); (!trimmed.is_empty()).as_option()?; - // to support ISO8601 and MySQL's time zone support, we further parse the following formats - // 2020-12-17T11:55:55Z + // to support ISO8601 and MySQL's time zone support, we further parse the + // following formats 2020-12-17T11:55:55Z // 2020-12-17T11:55:55+0800 // 2020-12-17T11:55:55-08 // 2020-12-17T11:55:55+02:00 @@ -835,8 +838,8 @@ fn handle_invalid_date(ctx: &mut EvalContext, mut args: TimeArgs) -> Result for Time { return Ok(Duration::zero()); } let seconds = i64::from(self.hour() * 3600 + self.minute() * 60 + self.second()); - // `microsecond` returns the number of microseconds since the whole non-leap second. - // Such as for 2019-09-22 07:21:22.670936103 UTC, + // `microsecond` returns the number of microseconds since the whole non-leap + // second. Such as for 2019-09-22 07:21:22.670936103 UTC, // it will return 670936103. let microsecond = i64::from(self.micro()); Duration::from_micros(seconds * 1_000_000 + microsecond, self.fsp() as i8) @@ -2606,7 +2609,8 @@ mod tests { for case in cases { // Enable NO_ZERO_DATE, STRICT_MODE and ALLOW_INVALID_DATE. - // If an invalid date (converted to zero-date) is encountered, an error is returned. + // If an invalid date (converted to zero-date) is encountered, an error is + // returned. let mut ctx = EvalContext::from(TimeEnv { no_zero_date: true, strict_mode: true, @@ -2623,7 +2627,8 @@ mod tests { let cases = vec!["2019-01-00", "2019-00-01"]; for &case in cases.iter() { - // Enable NO_ZERO_IN_DATE only. If zero-date is encountered, a warning is produced. + // Enable NO_ZERO_IN_DATE only. If zero-date is encountered, a warning is + // produced. 
let mut ctx = EvalContext::from(TimeEnv { no_zero_in_date: true, ..TimeEnv::default() diff --git a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs index 1efb2f3997c..7b90e96b78c 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs @@ -4,8 +4,8 @@ use std::{fmt, str::FromStr}; use chrono::*; -/// A time zone represented by either offset (i.e. +8) or name (i.e. Asia/Shanghai). In addition, -/// local time zone is also valid. +/// A time zone represented by either offset (i.e. +8) or name (i.e. +/// Asia/Shanghai). In addition, local time zone is also valid. #[derive(Clone)] pub enum Tz { /// A time zone specified by offset seconds. @@ -26,8 +26,8 @@ impl Tz { FixedOffset::east_opt(secs as i32).map(Tz::Offset) } - /// Constructs a time zone from the name. If the specified time zone name is `system`, - /// a local time zone will be constructed. + /// Constructs a time zone from the name. If the specified time zone name is + /// `system`, a local time zone will be constructed. pub fn from_tz_name(name: &str) -> Option { if name.to_lowercase() == "system" { Some(Tz::local()) diff --git a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs index 25c651e1243..2e4a0703d4a 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs @@ -123,15 +123,17 @@ pub trait V1CompatibleEncoder: DatumFlagAndPayloadEncoder { impl V1CompatibleEncoder for T {} -/// These tests mainly focus on transfer the v2 encoding to v1-compatible encoding. +/// These tests mainly focus on transfer the v2 encoding to v1-compatible +/// encoding. /// /// The test path is: -/// 1. Encode value using v2 -/// 2. Use `V1CompatibleEncoder` to transfer the encoded bytes from v2 to v1-compatible -/// 3. 
Use `RawDatumDecoder` decode the encoded bytes, check the result. +/// - Encode value using v2 +/// - Use `V1CompatibleEncoder` to transfer the encoded bytes from v2 to +/// v1-compatible +/// - Use `RawDatumDecoder` decode the encoded bytes, check the result. /// -/// Note: a value encoded using v2 then transfer to v1-compatible encoding, is not always equals the -/// encoded-bytes using v1 directly. +/// Note: a value encoded using v2 then transfer to v1-compatible encoding, is +/// not always equals the encoded-bytes using v1 directly. #[cfg(test)] mod tests { use std::{f64, i16, i32, i64, i8, u16, u32, u64, u8}; diff --git a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs index 09611adfbf6..1ee5104b723 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs @@ -1,18 +1,20 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! This `encoder` module is only used for test, so the implementation is very straightforward. +//! This `encoder` module is only used for test, so the implementation is very +//! straightforward. //! //! According to //! //! The row format is: -//! +//! ``` //! | version | flag | number_of_non_null_columns | number_of_null_columns | non_null_column_ids | null_column_ids | value_offsets | values | //! |---------| ---- | -------------------------- | ---------------------- | ------------------- | --------------- | ------------- | ------ | -//! +//! ``` //! length about each field: //! //! * version: 1 byte -//! * flag: 1 byte, when there's id greater than 255 or the total size of the values is greater than 65535 , value is 1, otherwise 0 +//! * flag: 1 byte, when there's id greater than 255 or the total size of the +//! values is greater than 65535 , value is 1, otherwise 0 //! * number of non-null values: 2 bytes //! 
* number of null values: 2 bytes //! * non-null column ids: when flag == 1 (big), id is 4 bytes, otherwise 1 byte diff --git a/components/tidb_query_datatype/src/codec/row/v2/mod.rs b/components/tidb_query_datatype/src/codec/row/v2/mod.rs index 2265cd3803d..b0cec291410 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/mod.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/mod.rs @@ -3,7 +3,8 @@ use bitflags::bitflags; // Prior to v2, the first byte is not version code, but datum type. -// From v2, it's used for version code, and the value starts from 128, to be compatible. +// From v2, it's used for version code, and the value starts from 128, to be +// compatible. pub const CODEC_VERSION: u8 = 128; bitflags! { diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index 66aa4df0902..94e9dd0a9ae 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -61,12 +61,13 @@ impl RowSlice<'_> { /// Search `id` in non-null ids /// - /// Returns the `start` position and `offset` in `values` field if found, otherwise returns `None` + /// Returns the `start` position and `offset` in `values` field if found, + /// otherwise returns `None` /// /// # Errors /// - /// If the id is found with no offset(It will only happen when the row data is broken), - /// `Error::ColumnOffset` will be returned. + /// If the id is found with no offset(It will only happen when the row data + /// is broken), `Error::ColumnOffset` will be returned. pub fn search_in_non_null_ids(&self, id: i64) -> Result> { if !self.id_valid(id) { return Ok(None); @@ -170,7 +171,8 @@ impl RowSlice<'_> { /// Decodes `len` number of ints from `buf` in little endian /// /// Note: -/// This method is only implemented on little endianness currently, since x86 use little endianness. 
+/// This method is only implemented on little endianness currently, since x86 +/// use little endianness. #[cfg(target_endian = "little")] #[inline] fn read_le_bytes<'a, T>(buf: &mut &'a [u8], len: usize) -> Result> @@ -280,7 +282,7 @@ mod tests { let cols = vec![ Column::new(1, 1000), Column::new(356, 2), - Column::new(33, ScalarValue::Int(None)), //0x21 + Column::new(33, ScalarValue::Int(None)), // 0x21 Column::new(3, 3), Column::new(64123, 5), ]; diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index c49fefb4e73..2cb2f055842 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -100,8 +100,8 @@ pub fn check_index_key(key: &[u8]) -> Result<()> { check_key_type(key, INDEX_PREFIX_SEP) } -/// `check_key_type` checks if the key is the type we want, `wanted_type` should be -/// `table::RECORD_PREFIX_SEP` or `table::INDEX_PREFIX_SEP` . +/// `check_key_type` checks if the key is the type we want, `wanted_type` should +/// be `table::RECORD_PREFIX_SEP` or `table::INDEX_PREFIX_SEP` . #[inline] fn check_key_type(key: &[u8], wanted_type: &[u8]) -> Result<()> { let mut buf = key; @@ -184,7 +184,8 @@ pub fn encode_common_handle_for_test(table_id: i64, handle: &[u8]) -> Vec { key } -/// `encode_column_key` encodes the table id, row handle and column id into a byte array. +/// `encode_column_key` encodes the table id, row handle and column id into a +/// byte array. pub fn encode_column_key(table_id: i64, handle: i64, column_id: i64) -> Vec { let mut key = Vec::with_capacity(RECORD_ROW_KEY_LEN + ID_LEN); key.append_table_record_prefix(table_id).unwrap(); @@ -391,7 +392,8 @@ impl RowColsDict { self.cols.insert(cid, RowColMeta::new(offset, length)); } - /// Gets binary of cols, keeps the original order, and returns one slice and cols' end offsets. 
+ /// Gets binary of cols, keeps the original order, and returns one slice and + /// cols' end offsets. pub fn get_column_values_and_end_offsets(&self) -> (&[u8], Vec) { let mut start = self.value.len(); let mut length = 0; @@ -789,7 +791,7 @@ mod tests { range.set_start(small_key.clone()); range.set_end(large_key.clone()); assert!(check_table_ranges(&[range]).is_ok()); - //test range.start > range.end + // test range.start > range.end let mut range = KeyRange::default(); range.set_end(small_key.clone()); range.set_start(large_key); diff --git a/components/tidb_query_datatype/src/def/eval_type.rs b/components/tidb_query_datatype/src/def/eval_type.rs index 16ec996b531..9addab99e56 100644 --- a/components/tidb_query_datatype/src/def/eval_type.rs +++ b/components/tidb_query_datatype/src/def/eval_type.rs @@ -4,9 +4,9 @@ use std::fmt; /// Function implementations' parameter data types. /// -/// It is similar to the `EvalType` in TiDB, but doesn't provide type `Timestamp`, which is -/// handled by the same type as `DateTime` here, instead of a new type. Also, `String` is -/// called `Bytes` here to be less confusing. +/// It is similar to the `EvalType` in TiDB, but doesn't provide type +/// `Timestamp`, which is handled by the same type as `DateTime` here, instead +/// of a new type. Also, `String` is called `Bytes` here to be less confusing. #[derive(Debug, PartialEq, Clone, Copy)] pub enum EvalType { Int, @@ -23,8 +23,8 @@ pub enum EvalType { impl EvalType { /// Converts `EvalType` into one of the compatible `FieldTypeTp`s. /// - /// This function should be only useful in test scenarios that only cares about `EvalType` but - /// accepts a `FieldTypeTp`. + /// This function should be only useful in test scenarios that only cares + /// about `EvalType` but accepts a `FieldTypeTp`. 
pub fn into_certain_field_type_tp_for_test(self) -> crate::FieldTypeTp { match self { EvalType::Int => crate::FieldTypeTp::LongLong, @@ -49,7 +49,8 @@ impl fmt::Display for EvalType { impl std::convert::TryFrom for EvalType { type Error = crate::DataTypeError; - // Succeeds for all field types supported as eval types, fails for unsupported types. + // Succeeds for all field types supported as eval types, fails for unsupported + // types. fn try_from(tp: crate::FieldTypeTp) -> Result { let eval_type = match tp { crate::FieldTypeTp::Tiny @@ -76,7 +77,8 @@ impl std::convert::TryFrom for EvalType { | crate::FieldTypeTp::Null => EvalType::Bytes, crate::FieldTypeTp::Enum => EvalType::Enum, _ => { - // TODO: we need to handle FieldTypeTp::{Enum, Set} after we implement encode and decode. + // TODO: we need to handle FieldTypeTp::{Enum, Set} after we implement encode + // and decode. return Err(crate::DataTypeError::UnsupportedType { name: tp.to_string(), }); diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index b52065d8a72..ac89ad53318 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -10,9 +10,10 @@ use crate::error::DataTypeError; /// /// `FieldType` is the field type of a column defined by schema. /// -/// `ColumnInfo` describes a column. It contains `FieldType` and some other column specific -/// information. However for historical reasons, fields in `FieldType` (for example, `tp`) -/// are flattened into `ColumnInfo`. Semantically these fields are identical. +/// `ColumnInfo` describes a column. It contains `FieldType` and some other +/// column specific information. However for historical reasons, fields in +/// `FieldType` (for example, `tp`) are flattened into `ColumnInfo`. +/// Semantically these fields are identical. 
/// /// Please refer to [mysql/type.go](https://github.com/pingcap/parser/blob/master/mysql/type.go). #[derive(PartialEq, Debug, Clone, Copy)] @@ -117,9 +118,9 @@ pub enum Collation { impl Collation { /// Parse from collation id. /// - /// These are magic numbers defined in tidb, where positive numbers are for legacy - /// compatibility, and all new clusters with padding configuration enabled will - /// use negative numbers to indicate the padding behavior. + /// These are magic numbers defined in tidb, where positive numbers are for + /// legacy compatibility, and all new clusters with padding configuration + /// enabled will use negative numbers to indicate the padding behavior. pub fn from_i32(n: i32) -> Result { match n { -33 | -45 => Ok(Collation::Utf8Mb4GeneralCi), @@ -215,8 +216,9 @@ pub trait FieldTypeAccessor { fn set_collation(&mut self, collation: Collation) -> &mut dyn FieldTypeAccessor; - /// Convert reference to `FieldTypeAccessor` interface. Useful when an implementer - /// provides inherent methods with the same name as the accessor trait methods. + /// Convert reference to `FieldTypeAccessor` interface. Useful when an + /// implementer provides inherent methods with the same name as the accessor + /// trait methods. fn as_accessor(&self) -> &dyn FieldTypeAccessor where Self: Sized, @@ -232,8 +234,8 @@ pub trait FieldTypeAccessor { self as &mut dyn FieldTypeAccessor } - /// Whether this type is a hybrid type, which can represent different types of value in - /// specific context. + /// Whether this type is a hybrid type, which can represent different types + /// of value in specific context. /// /// Please refer to `Hybrid` in TiDB. #[inline] @@ -254,7 +256,8 @@ pub trait FieldTypeAccessor { || tp == FieldTypeTp::LongBlob } - /// Whether this type is a char-like type like a string type or a varchar type. + /// Whether this type is a char-like type like a string type or a varchar + /// type. /// /// Please refer to `IsTypeChar` in TiDB. 
#[inline] @@ -263,7 +266,8 @@ pub trait FieldTypeAccessor { tp == FieldTypeTp::String || tp == FieldTypeTp::VarChar } - /// Whether this type is a varchar-like type like a varstring type or a varchar type. + /// Whether this type is a varchar-like type like a varstring type or a + /// varchar type. /// /// Please refer to `IsTypeVarchar` in TiDB. #[inline] diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index f92c561b013..0e488689fce 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -89,8 +89,8 @@ impl EvalConfig { } else if req.has_time_zone_offset() { box_try!(eval_cfg.set_time_zone_by_offset(req.get_time_zone_offset())); } else { - // This should not be reachable. However we will not panic here in case - // of compatibility issues. + // This should not be reachable. However we will not panic here in + // case of compatibility issues. } if req.has_max_warning_count() { eval_cfg.set_max_warning_cnt(req.get_max_warning_count() as usize); @@ -316,8 +316,8 @@ impl EvalContext { } /// Indicates whether values less than 0 should be clipped to 0 for unsigned - /// integer types. This is the case for `insert`, `update`, `alter table` and - /// `load data infile` statements, when not in strict SQL mode. + /// integer types. This is the case for `insert`, `update`, `alter table` + /// and `load data infile` statements, when not in strict SQL mode. 
/// see pub fn should_clip_to_zero(&self) -> bool { self.cfg.flag.contains(Flag::IN_INSERT_STMT) @@ -370,37 +370,37 @@ mod tests { fn test_handle_division_by_zero() { let cases = vec![ //(flag,sql_mode,is_ok,is_empty) - (Flag::empty(), SqlMode::empty(), true, false), //warning + (Flag::empty(), SqlMode::empty(), true, false), // warning ( Flag::IN_INSERT_STMT, SqlMode::ERROR_FOR_DIVISION_BY_ZERO, true, false, - ), //warning + ), // warning ( Flag::IN_UPDATE_OR_DELETE_STMT, SqlMode::ERROR_FOR_DIVISION_BY_ZERO, true, false, - ), //warning + ), // warning ( Flag::IN_UPDATE_OR_DELETE_STMT, SqlMode::ERROR_FOR_DIVISION_BY_ZERO | SqlMode::STRICT_ALL_TABLES, false, true, - ), //error + ), // error ( Flag::IN_UPDATE_OR_DELETE_STMT, SqlMode::STRICT_ALL_TABLES, true, true, - ), //ok + ), // ok ( Flag::IN_UPDATE_OR_DELETE_STMT | Flag::DIVIDED_BY_ZERO_AS_WARNING, SqlMode::ERROR_FOR_DIVISION_BY_ZERO | SqlMode::STRICT_ALL_TABLES, true, false, - ), //warning + ), // warning ]; for (flag, sql_mode, is_ok, is_empty) in cases { let mut cfg = EvalConfig::new(); @@ -415,12 +415,12 @@ mod tests { fn test_handle_invalid_time_error() { let cases = vec![ //(flag,strict_sql_mode,is_ok,is_empty) - (Flag::empty(), false, true, false), //warning - (Flag::empty(), true, true, false), //warning - (Flag::IN_INSERT_STMT, false, true, false), //warning - (Flag::IN_UPDATE_OR_DELETE_STMT, false, true, false), //warning - (Flag::IN_UPDATE_OR_DELETE_STMT, true, false, true), //error - (Flag::IN_INSERT_STMT, true, false, true), //error + (Flag::empty(), false, true, false), // warning + (Flag::empty(), true, true, false), // warning + (Flag::IN_INSERT_STMT, false, true, false), // warning + (Flag::IN_UPDATE_OR_DELETE_STMT, false, true, false), // warning + (Flag::IN_UPDATE_OR_DELETE_STMT, true, false, true), // error + (Flag::IN_INSERT_STMT, true, false, true), // error ]; for (flag, strict_sql_mode, is_ok, is_empty) in cases { let err = Error::invalid_time_format(""); diff --git 
a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs index c5859e48338..942e61087d3 100644 --- a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs @@ -32,8 +32,8 @@ macro_rules! match_template_hashable { }} } -/// Fast Hash Aggregation Executor uses hash when comparing group key. It only supports one -/// group by column. +/// Fast Hash Aggregation Executor uses hash when comparing group key. It only +/// supports one group by column. pub struct BatchFastHashAggregationExecutor( AggregationExecutor, ); @@ -72,8 +72,8 @@ impl BatchExecutor for BatchFastHashAggregationExecutor } } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchFastHashAggregationExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -198,8 +198,8 @@ impl BatchFastHashAggregationExecutor { /// All groups. enum Groups { // The value of each hash table is the start index in `FastHashAggregationImpl::states` - // field. When there are new groups (i.e. new entry in the hash table), the states of the groups - // will be appended to `states`. + // field. When there are new groups (i.e. new entry in the hash table), the states of the + // groups will be appended to `states`. Int(HashMap, usize>), Real(HashMap, usize>), Bytes(HashMap, usize>), @@ -388,7 +388,8 @@ impl AggregationExecutorImpl for FastHashAggregationImp Ok(vec![group_by_column]) } - /// Fast hash aggregation can output aggregate results only if the source is drained. + /// Fast hash aggregation can output aggregate results only if the source is + /// drained. 
#[inline] fn is_partial_results_ready(&self) -> bool { false @@ -481,7 +482,8 @@ mod tests { #[test] fn test_it_works_integration() { - // This test creates a hash aggregation executor with the following aggregate functions: + // This test creates a hash aggregation executor with the following aggregate + // functions: // - COUNT(1) // - COUNT(col_1 + 5.0) // - AVG(col_0) @@ -548,18 +550,20 @@ mod tests { assert!(!r.is_drained.unwrap()); let mut r = exec.next_batch(1); - // col_0 + col_1 can result in [NULL, 9.0, 6.0], thus there will be three groups. + // col_0 + col_1 can result in [NULL, 9.0, 6.0], thus there will be three + // groups. assert_eq!(&r.logical_rows, &[0, 1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert_eq!(r.physical_columns.columns_len(), 5); // 4 result column, 1 group by column - // Let's check group by column first. Group by column is decoded in fast hash agg, - // but not decoded in slow hash agg. So decode it anyway. + // Let's check group by column first. Group by column is decoded in fast hash + // agg, but not decoded in slow hash agg. So decode it anyway. r.physical_columns[4] .ensure_all_decoded_for_test(&mut EvalContext::default(), &exec.schema()[4]) .unwrap(); - // The row order is not defined. Let's sort it by the group by column before asserting. + // The row order is not defined. Let's sort it by the group by column before + // asserting. 
let mut sort_column: Vec<(usize, _)> = r.physical_columns[4] .decoded() .to_real_vec() @@ -611,7 +615,8 @@ mod tests { #[test] fn test_group_by_a_constant() { - // This test creates a hash aggregation executor with the following aggregate functions: + // This test creates a hash aggregation executor with the following aggregate + // functions: // - COUNT(1) // - COUNT(col_1 + 5.0) // - AVG(col_0) @@ -707,7 +712,8 @@ mod tests { use tipb::ExprType; use tipb_helper::ExprDefBuilder; - // This test creates a hash aggregation executor with the following aggregate functions: + // This test creates a hash aggregation executor with the following aggregate + // functions: // - COUNT(col_0) // - AVG(col_1) // And group by: @@ -769,13 +775,14 @@ mod tests { assert_eq!(r.physical_columns.rows_len(), 3); assert_eq!(r.physical_columns.columns_len(), 4); // 3 result column, 1 group by column - // Let's check group by column first. Group by column is decoded in fast hash agg, - // but not decoded in slow hash agg. So decode it anyway. + // Let's check group by column first. Group by column is decoded in fast hash + // agg, but not decoded in slow hash agg. So decode it anyway. r.physical_columns[3] .ensure_all_decoded_for_test(&mut EvalContext::default(), &exec.schema()[3]) .unwrap(); - // The row order is not defined. Let's sort it by the group by column before asserting. + // The row order is not defined. Let's sort it by the group by column before + // asserting. 
let mut sort_column: Vec<(usize, _)> = r.physical_columns[3] .decoded() .to_bytes_vec() @@ -1079,7 +1086,8 @@ mod tests { #[test] fn test_group_by_enum_column() { - // This test creates a hash aggregation executor with the following aggregate functions: + // This test creates a hash aggregation executor with the following aggregate + // functions: // - COUNT(1) // And group by: // - col_0(enum_type) diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index ccf57f1235f..bcbf2b8f92b 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -31,8 +31,8 @@ use crate::interface::*; pub struct BatchIndexScanExecutor(ScanExecutor); -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we can +// omit the type when calling `check_supported`. impl BatchIndexScanExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -52,32 +52,33 @@ impl BatchIndexScanExecutor { unique: bool, is_scanned_range_aware: bool, ) -> Result { - // Note 1: `unique = true` doesn't completely mean that it is a unique index scan. Instead - // it just means that we can use point-get for this index. In the following scenarios - // `unique` will be `false`: + // Note 1: `unique = true` doesn't completely mean that it is a unique index + // scan. Instead it just means that we can use point-get for this index. + // In the following scenarios `unique` will be `false`: // - scan from a non-unique index // - scan from a unique index with like: where unique-index like xxx // - // Note 2: Unlike table scan executor, the accepted `columns_info` of index scan executor is - // strictly stipulated. The order of columns in the schema must be the same as index data - // stored and if PK handle is needed it must be placed as the last one. 
+ // Note 2: Unlike table scan executor, the accepted `columns_info` of index scan + // executor is strictly stipulated. The order of columns in the schema must be + // the same as index data stored and if PK handle is needed it must be placed as + // the last one. // - // Note 3: Currently TiDB may send multiple PK handles to TiKV (but only the last one is - // real). We accept this kind of request for compatibility considerations, but will be - // forbidden soon. + // Note 3: Currently TiDB may send multiple PK handles to TiKV (but only the + // last one is real). We accept this kind of request for compatibility + // considerations, but will be forbidden soon. // - // Note 4: When process global indexes, an extra partition ID column with column ID - // `table::EXTRA_PARTITION_ID_COL_ID` will append to column info to indicate which partiton - // handles belong to. See https://github.com/pingcap/parser/pull/1010 for more information. + // Note 4: When process global indexes, an extra partition ID column with column + // ID `table::EXTRA_PARTITION_ID_COL_ID` will append to column info to indicate which partiton handles belong to. See https://github.com/pingcap/parser/pull/1010 for more information. // - // Note 5: When process a partitioned table's index under tidb_partition_prune_mode = 'dynamic' - // and with either an active transaction buffer or with a SelectLock/pessimistic lock, we - // need to return the physical table id since several partitions may be included in the - // range. + // Note 5: When process a partitioned table's index under + // tidb_partition_prune_mode = 'dynamic' and with either an active transaction + // buffer or with a SelectLock/pessimistic lock, we need to return the physical + // table id since several partitions may be included in the range. // // Note 6: Also int_handle (-1), EXTRA_PARTITION_ID_COL_ID (-2) and - // EXTRA_PHYSICAL_TABLE_ID_COL_ID (-3) must be requested in this order in columns_info! 
- // since current implementation looks for them backards for -3, -2, -1. + // EXTRA_PHYSICAL_TABLE_ID_COL_ID (-3) must be requested in this order in + // columns_info! since current implementation looks for them backwards for -3, + // -2, -1. let physical_table_id_column_cnt = columns_info.last().map_or(0, |ci| { (ci.get_column_id() == table::EXTRA_PHYSICAL_TABLE_ID_COL_ID) as usize }); @@ -209,7 +210,8 @@ struct IndexScanExecutorImpl { decode_handle_strategy: DecodeHandleStrategy, /// Number of partition ID columns, now it can only be 0 or 1. - /// Must be after all normal columns and handle, but before physical_table_id_column + /// Must be after all normal columns and handle, but before + /// physical_table_id_column pid_column_cnt: usize, /// Number of Physical Table ID columns, can only be 0 or 1. @@ -230,10 +232,11 @@ impl ScanExecutorImpl for IndexScanExecutorImpl { &mut self.context } - /// Constructs empty columns, with PK containing int handle in decoded format and the rest in raw format. + /// Constructs empty columns, with PK containing int handle in decoded + /// format and the rest in raw format. /// - /// Note: the structure of the constructed column is the same as table scan executor but due - /// to different reasons. + /// Note: the structure of the constructed column is the same as table scan + /// executor but due to different reasons. 
fn build_column_vec(&self, scan_rows: usize) -> LazyBatchColumnVec { let columns_len = self.schema.len(); let mut columns = Vec::with_capacity(columns_len); @@ -278,53 +281,55 @@ impl ScanExecutorImpl for IndexScanExecutorImpl { } // Value layout: (see https://docs.google.com/document/d/1Co5iMiaxitv3okJmLYLJxZYCNChcjzswJMRr-_45Eqg/edit?usp=sharing) - // +-- IndexValueVersion0 (with restore data, or common handle, or index is global) - // | - // | Layout: TailLen | Options | Padding | [IntHandle] | [UntouchedFlag] - // | Length: 1 | len(options) | len(padding) | 8 | 1 - // | - // | TailLen: len(padding) + len(IntHandle) + len(UntouchedFlag) - // | Options: Encode some value for new features, such as common handle, new collations or global index. - // | See below for more information. - // | Padding: Ensure length of value always >= 10. (or >= 11 if UntouchedFlag exists.) - // | IntHandle: Only exists when table use int handles and index is unique. - // | UntouchedFlag: Only exists when index is untouched. - // | - // +-- Old Encoding (without restore data, integer handle, local) - // | - // | Layout: [Handle] | [UntouchedFlag] - // | Length: 8 | 1 - // | - // | Handle: Only exists in unique index. - // | UntouchedFlag: Only exists when index is untouched. - // | - // | If neither Handle nor UntouchedFlag exists, value will be one single byte '0' (i.e. []byte{'0'}). - // | Length of value <= 9, use to distinguish from the new encoding. + // ```text + // +-- IndexValueVersion0 (with restore data, or common handle, or index is global) // | - // +-- IndexValueForClusteredIndexVersion1 - // | - // | Layout: TailLen | VersionFlag | Version | Options | [UntouchedFlag] - // | Length: 1 | 1 | 1 | len(options) | 1 - // | - // | TailLen: len(UntouchedFlag) - // | Options: Encode some value for new features, such as common handle, new collations or global index. - // | See below for more information. - // | UntouchedFlag: Only exists when index is untouched. 
- // | - // | Layout of Options: - // | - // | Segment: Common Handle | Global Index | New Collation + // | Layout: TailLen | Options | Padding | [IntHandle] | [UntouchedFlag] + // | Length: 1 | len(options) | len(padding) | 8 | 1 + // | + // | TailLen: len(padding) + len(IntHandle) + len(UntouchedFlag) + // | Options: Encode some value for new features, such as common handle, new collations or global index. + // | See below for more information. + // | Padding: Ensure length of value always >= 10. (or >= 11 if UntouchedFlag exists.) + // | IntHandle: Only exists when table use int handles and index is unique. + // | UntouchedFlag: Only exists when index is untouched. + // | + // +-- Old Encoding (without restore data, integer handle, local) + // | + // | Layout: [Handle] | [UntouchedFlag] + // | Length: 8 | 1 + // | + // | Handle: Only exists in unique index. + // | UntouchedFlag: Only exists when index is untouched. + // | + // | If neither Handle nor UntouchedFlag exists, value will be one single byte '0' (i.e. []byte{'0'}). + // | Length of value <= 9, use to distinguish from the new encoding. + // | + // +-- IndexValueForClusteredIndexVersion1 + // | + // | Layout: TailLen | VersionFlag | Version | Options | [UntouchedFlag] + // | Length: 1 | 1 | 1 | len(options) | 1 + // | + // | TailLen: len(UntouchedFlag) + // | Options: Encode some value for new features, such as common handle, new collations or global index. + // | See below for more information. + // | UntouchedFlag: Only exists when index is untouched. + // | + // | Layout of Options: + // | + // | Segment: Common Handle | Global Index | New Collation // | Layout: CHandle Flag | CHandle Len | CHandle | PidFlag | PartitionID | restoreData - // | Length: 1 | 2 | len(CHandle) | 1 | 8 | len(restoreData) - // | - // | Common Handle Segment: Exists when unique index used common handles. - // | Global Index Segment: Exists when index is global. 
- // | New Collation Segment: Exists when new collation is used and index or handle contains non-binary string. - // | In v4.0, restored data contains all the index values. For example, (a int, b char(10)) and index (a, b). - // | The restored data contains both the values of a and b. - // | In v5.0, restored data contains only non-binary data(except for char and _bin). In the above example, the restored data contains only the value of b. - // | Besides, if the collation of b is _bin, then restored data is an integer indicate the spaces are truncated. Then we use sortKey - // | and the restored data together to restore original data. + // | Length: 1 | 2 | len(CHandle) | 1 | 8 | len(restoreData) + // | + // | Common Handle Segment: Exists when unique index used common handles. + // | Global Index Segment: Exists when index is global. + // | New Collation Segment: Exists when new collation is used and index or handle contains non-binary string. + // | In v4.0, restored data contains all the index values. For example, (a int, b char(10)) and index (a, b). + // | The restored data contains both the values of a and b. + // | In v5.0, restored data contains only non-binary data(except for char and _bin). In the above example, the restored data contains only the value of b. + // | Besides, if the collation of b is _bin, then restored data is an integer indicate the spaces are truncated. Then we use sortKey + // | and the restored data together to restore original data. + // ``` #[inline] fn process_kv_pair( &mut self, @@ -435,8 +440,9 @@ impl IndexScanExecutorImpl { } // Process index values that are in old collation. - // NOTE: We should extract the index columns from the key first, and extract the handles from value if there is no handle in the key. - // Otherwise, extract the handles from the key. + // NOTE: We should extract the index columns from the key first, and extract the + // handles from value if there is no handle in the key. 
Otherwise, extract the + // handles from the key. fn process_old_collation_kv( &mut self, mut key_payload: &[u8], @@ -478,16 +484,23 @@ impl IndexScanExecutorImpl { Ok(()) } - // restore_original_data restores the index values whose format is introduced in TiDB 5.0. - // Unlike the format in TiDB 4.0, the new format is optimized for storage space: - // 1. If the index is a composed index, only the non-binary string column's value need to write to value, not all. - // 2. If a string column's collation is _bin, then we only write the number of the truncated spaces to value. - // 3. If a string column is char, not varchar, then we use the sortKey directly. + // restore_original_data restores the index values whose format is introduced in + // TiDB 5.0. Unlike the format in TiDB 4.0, the new format is optimized for + // storage space: + // - If the index is a composed index, only the non-binary string column's value + // need to write to value, not all. + // - If a string column's collation is _bin, then we only write the number of + // the truncated spaces to value. + // - If a string column is char, not varchar, then we use the sortKey directly. + // // The whole logic of this function is: - // 1. For each column pass in, check if it needs the restored data to get to original data. If not, check the next column. - // 2. Skip if the `sort key` is NULL, because the original data must be NULL. - // 3. Depend on the collation if `_bin` or not. Process them differently to get the correct original data. - // 4. Write the original data into the column, we need to make sure pop() is called. + // - For each column pass in, check if it needs the restored data to get to + // original data. If not, check the next column. + // - Skip if the `sort key` is NULL, because the original data must be NULL. + // - Depend on the collation if `_bin` or not. Process them differently to get + // the correct original data. 
+ // - Write the original data into the column, we need to make sure pop() is + // called. fn restore_original_data<'a>( &self, restored_values: &[u8], @@ -518,7 +531,8 @@ impl IndexScanExecutorImpl { column.mut_raw().pop(); let original_data = if is_bin_collation { - // _bin collation, we need to combine data from key and value to form the original data. + // _bin collation, we need to combine data from key and value to form the + // original data. // Unwrap as checked by `decoded_value.read_datum() == Datum::Null` let truncate_str = decoded_value.as_string()?.unwrap(); @@ -551,7 +565,8 @@ impl IndexScanExecutorImpl { // get_index_version is the same as getIndexVersion() in the TiDB repo. fn get_index_version(value: &[u8]) -> Result { if value.len() == 3 || value.len() == 4 { - // For the unique index with null value or non-unique index, the length can be 3 or 4 if <= 9. + // For the unique index with null value or non-unique index, the length can be 3 + // or 4 if <= 9. return Ok(1); } if value.len() <= MAX_OLD_ENCODED_VALUE_LEN { @@ -689,11 +704,13 @@ impl IndexScanExecutorImpl { // If there are some restore data, we need to process them to get the original data. RestoreData::V4(rst) => { - // 4.0 version format, use the restore data directly. The restore data contain all the indexed values. + // 4.0 version format, use the restore data directly. The restore data contain + // all the indexed values. self.extract_columns_from_row_format(rst, columns)?; } RestoreData::V5(rst) => { - // Extract the data from key, then use the restore data to get the original data. + // Extract the data from key, then use the restore data to get the original + // data. Self::extract_columns_from_datum_format( &mut key_payload, &mut columns[..self.columns_id_without_handle.len()], @@ -924,8 +941,9 @@ mod tests { // Case 1. Normal index. - // For a normal index, the PK handle is stored in the key and nothing interesting is stored - // in the value. 
So let's build corresponding KV data. + // For a normal index, the PK handle is stored in the key and nothing + // interesting is stored in the value. So let's build corresponding KV + // data. let store = { let kv: Vec<_> = data @@ -2023,9 +2041,10 @@ mod tests { fn test_int_handle_char_index() { use tidb_query_datatype::builder::FieldTypeBuilder; - // Schema: create table t(a int, b char(10) collate utf8mb4_bin, c char(10) collate utf8mb4_unicode_ci, key i_a(a), key i_b(b), key i_c(c), key i_abc(a, b, c), unique key i_ua(a), - // unique key i_ub(b), unique key i_uc(c), unique key i_uabc(a,b,c)); - // insert into t values (1, "a ", "A "); + // Schema: create table t(a int, b char(10) collate utf8mb4_bin, c char(10) + // collate utf8mb4_unicode_ci, key i_a(a), key i_b(b), key i_c(c), key i_abc(a, + // b, c), unique key i_ua(a), unique key i_ub(b), unique key i_uc(c), + // unique key i_uabc(a,b,c)); insert into t values (1, "a ", "A "); // i_a and i_ua let mut idx_exe = IndexScanExecutorImpl { @@ -2259,9 +2278,11 @@ mod tests { fn test_int_handle_varchar_index() { use tidb_query_datatype::builder::FieldTypeBuilder; - // Schema: create table t(a int, b varchar(10) collate utf8mb4_bin, c varchar(10) collate utf8mb4_unicode_ci, key i_a(a), key i_b(b), key i_c(c), key i_abc(a, b, c), unique key i_ua(a), - // unique key i_ub(b), unique key i_uc(c), unique key i_uabc(a,b,c)); - // insert into t values (1, "a ", "A "); + // Schema: create table t(a int, b varchar(10) collate utf8mb4_bin, c + // varchar(10) collate utf8mb4_unicode_ci, key i_a(a), key i_b(b), key i_c(c), + // key i_abc(a, b, c), unique key i_ua(a), unique key i_ub(b), unique + // key i_uc(c), unique key i_uabc(a,b,c)); insert into t values (1, "a + // ", "A "); // i_a and i_ua let mut idx_exe = IndexScanExecutorImpl { @@ -2502,9 +2523,12 @@ mod tests { fn test_common_handle_index() { use tidb_query_datatype::builder::FieldTypeBuilder; - // create table t(a int, b char(10) collate utf8mb4_bin, c char(10) 
collate utf8mb4_unicode_ci, d varchar(10) collate utf8mb4_bin, e varchar(10) collate utf8mb4_general_ci - // , primary key(a, b, c, d, e), key i_a(a), key i_b(b), key i_c(c), key i_d(d), key i_e(e), key i_abcde(a, b, c, d, e), unique key i_ua(a), unique key i_ub(b), unique key i_uc( - // c), unique key i_ud(d), unique key i_ue(e), unique key i_uabcde(a,b,c, d, e)); + // create table t(a int, b char(10) collate utf8mb4_bin, c char(10) collate + // utf8mb4_unicode_ci, d varchar(10) collate utf8mb4_bin, e varchar(10) collate + // utf8mb4_general_ci , primary key(a, b, c, d, e), key i_a(a), key + // i_b(b), key i_c(c), key i_d(d), key i_e(e), key i_abcde(a, b, c, d, e), + // unique key i_ua(a), unique key i_ub(b), unique key i_uc( c), unique + // key i_ud(d), unique key i_ue(e), unique key i_uabcde(a,b,c, d, e)); // // CREATE TABLE `t` ( // `a` int(11) NOT NULL, @@ -3277,8 +3301,9 @@ mod tests { fn test_common_handle_index_latin1_bin() { use tidb_query_datatype::builder::FieldTypeBuilder; - // create table t(c1 varchar(200) CHARACTER SET latin1 COLLATE latin1_bin, c2 int, primary key(c1) clustered, key kk(c2)); - // idx_exec for index kk(c2), its columns will be + // create table t(c1 varchar(200) CHARACTER SET latin1 COLLATE latin1_bin, c2 + // int, primary key(c1) clustered, key kk(c2)); idx_exec for index + // kk(c2), its columns will be let mut idx_exe = IndexScanExecutorImpl { context: Default::default(), schema: vec![ diff --git a/components/tidb_query_executors/src/interface.rs b/components/tidb_query_executors/src/interface.rs index cbbe77943b1..1ea5038a2d6 100644 --- a/components/tidb_query_executors/src/interface.rs +++ b/components/tidb_query_executors/src/interface.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -// TODO: Maybe we can find a better place to put these interfaces, e.g. naming it as prelude? +// TODO: Maybe we can find a better place to put these interfaces, e.g. naming +// it as prelude? //! 
Batch executor common structures. @@ -13,8 +14,8 @@ use tidb_query_common::{ use tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalWarnings}; use tipb::FieldType; -/// The interface for pull-based executors. It is similar to the Volcano Iterator model, but -/// pulls data in batch and stores data by column. +/// The interface for pull-based executors. It is similar to the Volcano +/// Iterator model, but pulls data in batch and stores data by column. pub trait BatchExecutor: Send { type StorageStats; @@ -23,26 +24,30 @@ pub trait BatchExecutor: Send { /// Pulls next several rows of data (stored by column). /// - /// This function might return zero rows, which doesn't mean that there is no more result. - /// See `is_drained` in `BatchExecuteResult`. + /// This function might return zero rows, which doesn't mean that there is + /// no more result. See `is_drained` in `BatchExecuteResult`. fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult; - /// Collects execution statistics (including but not limited to metrics and execution summaries) - /// accumulated during execution and prepares for next collection. + /// Collects execution statistics (including but not limited to metrics and + /// execution summaries) accumulated during execution and prepares for + /// next collection. /// - /// The executor implementation must invoke this function for each children executor. However - /// the invocation order of children executors is not stipulated. + /// The executor implementation must invoke this function for each children + /// executor. However the invocation order of children executors is not + /// stipulated. /// - /// This function may be invoked several times during execution. For each invocation, it should - /// not contain accumulated meta data in last invocation. Normally the invocation frequency of - /// this function is less than `next_batch()`. + /// This function may be invoked several times during execution. 
For each + /// invocation, it should not contain accumulated meta data in last + /// invocation. Normally the invocation frequency of this function is + /// less than `next_batch()`. fn collect_exec_stats(&mut self, dest: &mut ExecuteStats); - /// Collects underlying storage statistics accumulated during execution and prepares for - /// next collection. + /// Collects underlying storage statistics accumulated during execution and + /// prepares for next collection. /// - /// Similar to `collect_exec_stats()`, the implementation must invoke this function for each - /// children executor and this function may be invoked several times during execution. + /// Similar to `collect_exec_stats()`, the implementation must invoke this + /// function for each children executor and this function may be invoked + /// several times during execution. fn collect_storage_stats(&mut self, dest: &mut Self::StorageStats); fn take_scanned_range(&mut self) -> IntervalRange; @@ -127,26 +132,31 @@ impl BatchExecutor } } -/// Data to be flowed between parent and child executors' single `next_batch()` invocation. +/// Data to be flowed between parent and child executors' single `next_batch()` +/// invocation. /// -/// Note: there are other data flow between executors, like metrics and output statistics. -/// However they are flowed at once, just before response, instead of each step during execution. -/// Hence they are not covered by this structure. See `BatchExecuteMetaData`. +/// Note: there are other data flow between executors, like metrics and output +/// statistics. However they are flowed at once, just before response, instead +/// of each step during execution. Hence they are not covered by this structure. +/// See `BatchExecuteMetaData`. /// -/// It is only `Send` but not `Sync` because executor returns its own data copy. However `Send` -/// enables executors to live in different threads. +/// It is only `Send` but not `Sync` because executor returns its own data copy. 
+/// However `Send` enables executors to live in different threads. /// -/// It is designed to be used in new generation executors, i.e. executors support batch execution. -/// The old executors will not be refined to return this kind of result. +/// It is designed to be used in new generation executors, i.e. executors +/// support batch execution. The old executors will not be refined to return +/// this kind of result. pub struct BatchExecuteResult { /// The *physical* columns data generated during this invocation. /// - /// Note 1: Empty column data doesn't mean that there is no more data. See `is_drained`. + /// Note 1: Empty column data doesn't mean that there is no more data. See + /// `is_drained`. /// - /// Note 2: This is only a *physical* store of data. The data may not be in desired order and - /// there could be filtered out data stored inside. You should access the *logical* - /// data via the `logical_rows` field. For the same reason, `rows_len() > 0` doesn't - /// mean that there is logical data inside. + /// Note 2: This is only a *physical* store of data. The data may not be in + /// desired order and there could be filtered out data stored inside. You + /// should access the *logical* data via the `logical_rows` field. For the + /// same reason, `rows_len() > 0` doesn't mean that there is logical data + /// inside. pub physical_columns: LazyBatchColumnVec, /// Valid row offsets in `physical_columns`, placed in the logical order. @@ -160,16 +170,17 @@ pub struct BatchExecuteResult { /// Whether or not there is no more data. /// /// This structure is a `Result`. When it is: - /// - `Ok(false)`: The normal case, means that there could be more data. The caller should - /// continue calling `next_batch()` although for each call the returned data may - /// be empty. - /// - `Ok(true)`: Means that the executor is drained and no more data will be returned in - /// future. However there could be some (last) data in the `data` field this - /// time. 
The caller should NOT call `next_batch()` any more. - /// - `Err(_)`: Means that there is an error when trying to retrieve more data. In this case, - /// the error is returned and the executor is also drained. Similar to - /// `Ok(true)`, there could be some remaining data in the `data` field which is - /// valid data and should be processed. The caller should NOT call `next_batch()` - /// any more. + /// - `Ok(false)`: The normal case, means that there could be more data. The + /// caller should continue calling `next_batch()` although for each call + /// the returned data may be empty. + /// - `Ok(true)`: Means that the executor is drained and no more data will + /// be returned in future. However there could be some (last) data in the + /// `data` field this time. The caller should NOT call `next_batch()` any + /// more. + /// - `Err(_)`: Means that there is an error when trying to retrieve more + /// data. In this case, the error is returned and the executor is also + /// drained. Similar to `Ok(true)`, there could be some remaining data in + /// the `data` field which is valid data and should be processed. The + /// caller should NOT call `next_batch()` any more. pub is_drained: Result, } diff --git a/components/tidb_query_executors/src/lib.rs b/components/tidb_query_executors/src/lib.rs index b32518c600b..ad86f94f9b8 100644 --- a/components/tidb_query_executors/src/lib.rs +++ b/components/tidb_query_executors/src/lib.rs @@ -1,11 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This crate implements a simple SQL query engine to work with TiDB pushed down executors. +//! This crate implements a simple SQL query engine to work with TiDB pushed +//! down executors. //! -//! The query engine is able to scan and understand rows stored by TiDB, run against a -//! series of executors and then return the execution result. The query engine is provided via -//! TiKV Coprocessor interface. 
However standalone UDF functions are also exported and can be used -//! standalone. +//! The query engine is able to scan and understand rows stored by TiDB, run +//! against a series of executors and then return the execution result. The +//! query engine is provided via TiKV Coprocessor interface. However standalone +//! UDF functions are also exported and can be used standalone. #![allow(incomplete_features)] #![feature(proc_macro_hygiene)] diff --git a/components/tidb_query_executors/src/projection_executor.rs b/components/tidb_query_executors/src/projection_executor.rs index 680800859f3..1d6892731ff 100644 --- a/components/tidb_query_executors/src/projection_executor.rs +++ b/components/tidb_query_executors/src/projection_executor.rs @@ -20,8 +20,8 @@ pub struct BatchProjectionExecutor { exprs: Vec, } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchProjectionExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -209,8 +209,9 @@ mod tests { ], ); - // When source executor returns empty rows, projection executor should process correctly. - // No errors should be generated and the expression functions should not be called. + // When source executor returns empty rows, projection executor should process + // correctly. No errors should be generated and the expression functions + // should not be called. let r = exec.next_batch(1); // The scan rows parameter has no effect for mock executor. We don't care. @@ -507,8 +508,8 @@ mod tests { ], ); - // When evaluating expr[0], there will be no error. However we will meet errors for - // expr[1]. + // When evaluating expr[0], there will be no error. However we will meet errors + // for expr[1]. 
let exprs = (0..=1) .map(|offset| { diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 4a8a3a02851..073fade4b29 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -32,32 +32,34 @@ use super::{ *, }; -// TODO: The value is chosen according to some very subjective experience, which is not tuned -// carefully. We need to benchmark to find a best value. Also we may consider accepting this value -// from TiDB side. +// TODO: The value is chosen according to some very subjective experience, which +// is not tuned carefully. We need to benchmark to find a best value. Also we +// may consider accepting this value from TiDB side. const BATCH_INITIAL_SIZE: usize = 32; -// TODO: This value is chosen based on MonetDB/X100's research without our own benchmarks. +// TODO: This value is chosen based on MonetDB/X100's research without our own +// benchmarks. pub use tidb_query_expr::types::BATCH_MAX_SIZE; // TODO: Maybe there can be some better strategy. Needs benchmarks and tunes. const BATCH_GROW_FACTOR: usize = 2; -/// Batch executors are run in coroutines. `MAX_TIME_SLICE` is the maximum time a coroutine -/// can run without being yielded. +/// Batch executors are run in coroutines. `MAX_TIME_SLICE` is the maximum time +/// a coroutine can run without being yielded. pub const MAX_TIME_SLICE: Duration = Duration::from_millis(1); pub struct BatchExecutorsRunner { - /// The deadline of this handler. For each check point (e.g. each iteration) we need to check - /// whether or not the deadline is exceeded and break the process if so. + /// The deadline of this handler. For each check point (e.g. each iteration) + /// we need to check whether or not the deadline is exceeded and break + /// the process if so. // TODO: Deprecate it using a better deadline mechanism. 
deadline: Deadline, out_most_executor: Box>, - /// The offset of the columns need to be outputted. For example, TiDB may only needs a subset - /// of the columns in the result so that unrelated columns don't need to be encoded and - /// returned back. + /// The offset of the columns need to be outputted. For example, TiDB may + /// only needs a subset of the columns in the result so that unrelated + /// columns don't need to be encoded and returned back. output_offsets: Vec, config: Arc, @@ -76,16 +78,18 @@ pub struct BatchExecutorsRunner { /// 2. chunk: result is encoded column by column using chunk format. encode_type: EncodeType, - /// If it's a paging request, paging_size indicates to the required size for current page. + /// If it's a paging request, paging_size indicates to the required size for + /// current page. paging_size: Option, quota_limiter: Arc, } -// We assign a dummy type `()` so that we can omit the type when calling `check_supported`. +// We assign a dummy type `()` so that we can omit the type when calling +// `check_supported`. impl BatchExecutorsRunner<()> { - /// Given a list of executor descriptors and checks whether all executor descriptors can - /// be used to build batch executors. + /// Given a list of executor descriptors and checks whether all executor + /// descriptors can be used to build batch executors. 
pub fn check_supported(exec_descriptors: &[tipb::Executor]) -> Result<()> { for ed in exec_descriptors { match ed.get_tp() { @@ -387,7 +391,9 @@ impl BatchExecutorsRunner { storage, ranges, config.clone(), - is_streaming || paging_size.is_some(), // For streaming and paging request, executors will continue scan from range end where last scan is finished + is_streaming || paging_size.is_some(), /* For streaming and paging request, + * executors will continue scan from range + * end where last scan is finished */ )?; let encode_type = if !is_arrow_encodable(out_most_executor.schema()) { @@ -434,8 +440,9 @@ impl BatchExecutorsRunner { /// handle_request returns the response of selection and an optional range, /// only paging request will return Some(IntervalRange), /// this should be used when calculating ranges of the next batch. - /// IntervalRange records whole range scanned though there are gaps in multi ranges. - /// e.g.: [(k1 -> k2), (k4 -> k5)] may got response (k1, k2, k4) with IntervalRange like (k1, k4). + /// IntervalRange records whole range scanned though there are gaps in multi + /// ranges. e.g.: [(k1 -> k2), (k4 -> k5)] may got response (k1, k2, k4) + /// with IntervalRange like (k1, k4). pub async fn handle_request(&mut self) -> Result<(SelectResponse, Option)> { let mut chunks = vec![]; let mut batch_size = Self::batch_initial_size(); diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index e930d6f9d89..61030e593e0 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -20,8 +20,8 @@ pub struct BatchSelectionExecutor { conditions: Vec, } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. 
impl BatchSelectionExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -62,10 +62,12 @@ impl BatchSelectionExecutor { }) } - /// Accepts source result and mutates its `logical_rows` according to predicates. + /// Accepts source result and mutates its `logical_rows` according to + /// predicates. /// - /// When errors are returned, it means there are errors during the evaluation. Currently - /// we treat this situation as "completely failed". + /// When errors are returned, it means there are errors during the + /// evaluation. Currently we treat this situation as "completely + /// failed". fn handle_src_result(&mut self, src_result: &mut BatchExecuteResult) -> Result<()> { // We handle errors in next_batch, so we can ingore it here. @@ -139,9 +141,9 @@ where let mut err_result = Ok(()); let mut logical_index = 0; logical_rows.retain(|_| { - // We don't care the physical index indicated by `logical_rows`, since what's in there - // does not affect the filtering. Instead, the eval result in corresponding logical index - // matters. + // We don't care the physical index indicated by `logical_rows`, since what's in + // there does not affect the filtering. Instead, the eval result in + // corresponding logical index matters. let eval_result_physical_index = eval_result_logical_rows.get_idx(logical_index); logical_index += 1; @@ -261,8 +263,9 @@ mod tests { ], ); - // When source executor returns empty rows, selection executor should process correctly. - // No errors should be generated and the predicate function should not be called. + // When source executor returns empty rows, selection executor should process + // correctly. No errors should be generated and the predicate function + // should not be called. let r = exec.next_batch(1); // The scan rows parameter has no effect for mock executor. We don't care. 
@@ -330,8 +333,8 @@ mod tests { ) } - /// Tests the scenario that there is no predicate or there is a predicate but always returns - /// true (no data is filtered). + /// Tests the scenario that there is no predicate or there is a predicate + /// but always returns true (no data is filtered). #[test] fn test_no_predicate_or_predicate_always_true() { // Build a selection executor without predicate. @@ -462,8 +465,8 @@ mod tests { ) } - /// Tests the scenario that the predicate returns both true and false. Rows that predicate - /// returns false should be removed from the result. + /// Tests the scenario that the predicate returns both true and false. Rows + /// that predicate returns false should be removed from the result. #[test] fn test_predicate_1() { let src_exec = make_src_executor_using_fixture_2(); @@ -514,8 +517,8 @@ mod tests { assert!(r.is_drained.unwrap()); } - /// Tests the scenario that there are multiple predicates. Only the row that all predicates - /// return true should be remained. + /// Tests the scenario that there are multiple predicates. Only the row that + /// all predicates return true should be remained. #[test] fn test_multiple_predicate_1() { // Use [is_even(column[0]), is_even(column[1])] as the predicate. @@ -634,8 +637,8 @@ mod tests { ], ); - // When evaluating predicates[0], there will be no error. However we will meet errors for - // predicates[1]. + // When evaluating predicates[0], there will be no error. However we will meet + // errors for predicates[1]. let predicates = (0..=1) .map(|offset| { @@ -647,8 +650,8 @@ mod tests { .collect(); let mut exec = BatchSelectionExecutor::new_for_test(src_exec, predicates); - // TODO: A more precise result is that the first two rows are returned and error starts from - // the third row. + // TODO: A more precise result is that the first two rows are returned and error + // starts from the third row. 
let r = exec.next_batch(1); assert!(r.logical_rows.is_empty()); diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index 1e1dd48929b..d26d293a274 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -1,7 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Simple aggregation is an aggregation that do not have `GROUP BY`s. It is more even more simpler -//! than stream aggregation. +//! Simple aggregation is an aggregation that do not have `GROUP BY`s. It is +//! more even more simpler than stream aggregation. use std::sync::Arc; @@ -58,8 +58,8 @@ impl BatchExecutor for BatchSimpleAggregationExecutor { } } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchSimpleAggregationExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -104,8 +104,8 @@ impl BatchSimpleAggregationExecutor { aggr_defs: Vec, aggr_def_parser: impl AggrDefinitionParser, ) -> Result { - // Empty states is fine because it will be re-initialized later according to the content - // in entities. + // Empty states is fine because it will be re-initialized later according to the + // content in entities. let aggr_impl = SimpleAggregationImpl { states: Vec::new(), has_input_rows: false, @@ -222,7 +222,8 @@ impl AggregationExecutorImpl for SimpleAggregationImpl Ok(Vec::new()) } - /// Simple aggregation can output aggregate results only if the source is drained. + /// Simple aggregation can output aggregate results only if the source is + /// drained. 
#[inline] fn is_partial_results_ready(&self) -> bool { false @@ -243,9 +244,11 @@ mod tests { #[test] fn test_it_works_unit() { - /// Aggregate function `Foo` accepts a Bytes column, returns a Int datum. + /// Aggregate function `Foo` accepts a Bytes column, returns a Int + /// datum. /// - /// The returned data is the sum of the length of all accepted bytes datums. + /// The returned data is the sum of the length of all accepted bytes + /// datums. #[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnFooState::new())] struct AggrFnFoo; @@ -290,9 +293,9 @@ mod tests { output.push(FieldTypeTp::LongLong.into()); } - /// Aggregate function `Bar` accepts a Real column, returns `(a: Int, b: Int, c: Real)`, - /// where `a` is the number of rows including nulls, `b` is the number of rows excluding - /// nulls, `c` is the sum of all values. + /// Aggregate function `Bar` accepts a Real column, returns `(a: Int, b: + /// Int, c: Real)`, where `a` is the number of rows including nulls, `b` + /// is the number of rows excluding nulls, `c` is the sum of all values. #[derive(Debug, AggrFunction)] #[aggr_function(state = AggrFnBarState::new())] struct AggrFnBar; @@ -349,7 +352,8 @@ mod tests { output.push(FieldTypeTp::Double.into()); } - // This test creates a simple aggregation executor with the following aggregate functions: + // This test creates a simple aggregation executor with the following aggregate + // functions: // - Foo("abc") // - Foo(NULL) // - Bar(42.5) @@ -360,8 +364,8 @@ mod tests { let src_exec = make_src_executor_1(); - // As a unit test, let's use the most simple way to build the executor. No complex parsers - // involved. + // As a unit test, let's use the most simple way to build the executor. No + // complex parsers involved. 
let aggr_definitions: Vec<_> = (0..6) .map(|index| { @@ -503,7 +507,8 @@ mod tests { use tipb::ExprType; use tipb_helper::ExprDefBuilder; - // This test creates a simple aggregation executor with the following aggregate functions: + // This test creates a simple aggregation executor with the following aggregate + // functions: // - COUNT(1) // - COUNT(4.5) // - COUNT(NULL) diff --git a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs index bd1e5cf8a80..2502e28f570 100644 --- a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs @@ -23,8 +23,8 @@ use crate::{ util::{aggr_executor::*, hash_aggr_helper::HashAggregationHelper, *}, }; -/// Slow Hash Aggregation Executor supports multiple groups but uses less efficient ways to -/// store group keys in hash tables. +/// Slow Hash Aggregation Executor supports multiple groups but uses less +/// efficient ways to store group keys in hash tables. /// /// FIXME: It is not correct to just store the serialized data as the group key. /// See pingcap/tidb#10467. @@ -66,8 +66,8 @@ impl BatchExecutor for BatchSlowHashAggregationExecutor } } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchSlowHashAggregationExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -195,34 +195,37 @@ impl BatchSlowHashAggregationExecutor { pub struct SlowHashAggregationImpl { states: Vec>, - /// The value is the group index. `states` and `group_key_offsets` are stored in - /// the order of group index. + /// The value is the group index. `states` and `group_key_offsets` are + /// stored in the order of group index. 
groups: HashMap, group_by_exps: Vec, - /// Extra group by columns store the bytes columns in original data form while - /// default columns store them in sortkey form. - /// The sortkey form is used to aggr on while the original form is to be returned - /// as results. + /// Extra group by columns store the bytes columns in original data form + /// while default columns store them in sortkey form. + /// The sortkey form is used to aggr on while the original form is to be + /// returned as results. /// - /// For example, the bytes column at index i will be stored in sortkey form at column i - /// and in original data form at column `extra_group_by_col_index[i]`. + /// For example, the bytes column at index i will be stored in sortkey form + /// at column i and in original data form at column + /// `extra_group_by_col_index[i]`. extra_group_by_col_index: Vec, - /// The sequence of group by column index which are in original form and are in the - /// same order as group_by_exps by substituting bytes columns index for extra group by column index. + /// The sequence of group by column index which are in original form and are + /// in the same order as group_by_exps by substituting bytes columns + /// index for extra group by column index. original_group_by_col_index: Vec, - /// Encoded group keys are stored in this buffer sequentially. Offsets of each encoded - /// element are stored in `group_key_offsets`. + /// Encoded group keys are stored in this buffer sequentially. Offsets of + /// each encoded element are stored in `group_key_offsets`. /// /// `GroupKeyRefUnsafe` contains a raw pointer to this buffer. #[allow(clippy::box_collection)] group_key_buffer: Box>, - /// The offsets of encoded keys in `group_key_buffer`. This `Vec` always has a leading `0` - /// element. 
Then, the begin and end offsets of the "i"-th column of the group key whose group - /// index is "j" are `group_key_offsets[j * group_by_col_len + i]` and + /// The offsets of encoded keys in `group_key_buffer`. This `Vec` always has + /// a leading `0` element. Then, the begin and end offsets of the "i"-th + /// column of the group key whose group index is "j" are + /// `group_key_offsets[j * group_by_col_len + i]` and /// `group_key_offsets[j * group_by_col_len + i + 1]`. /// /// group_by_col_len = group_by_exps.len() + extra_group_by_col_index.len() @@ -231,8 +234,9 @@ pub struct SlowHashAggregationImpl { states_offset_each_logical_row: Vec, /// Stores evaluation results of group by expressions. - /// It is just used to reduce allocations. The lifetime is not really 'static. The elements - /// are only valid in the same batch where they are added. + /// It is just used to reduce allocations. The lifetime is not really + /// 'static. The elements are only valid in the same batch where they + /// are added. 
group_by_results_unsafe: Vec>, /// Cached encoded results for calculated Scalar results @@ -267,8 +271,8 @@ impl AggregationExecutorImpl for SlowHashAggregationImp let logical_rows_len = input_logical_rows.len(); let aggr_fn_len = entities.each_aggr_fn.len(); - // Decode columns with mutable input first, so subsequent access to input can be immutable - // (and the borrow checker will be happy) + // Decode columns with mutable input first, so subsequent access to input can be + // immutable (and the borrow checker will be happy) ensure_columns_decoded( context, &self.group_by_exps, @@ -330,8 +334,8 @@ impl AggregationExecutorImpl for SlowHashAggregationImp // End of the sortkey columns let group_key_ref_end = self.group_key_buffer.len(); - // Encode bytes column in original form to extra group by columns, which is to be returned - // as group by results + // Encode bytes column in original form to extra group by columns, which is to + // be returned as group by results for (i, col_index) in self.extra_group_by_col_index.iter().enumerate() { let group_by_result = &self.group_by_results_unsafe[*col_index]; match group_by_result { @@ -468,17 +472,19 @@ impl AggregationExecutorImpl for SlowHashAggregationImp Ok(group_by_columns) } - /// Slow hash aggregation can output aggregate results only if the source is drained. + /// Slow hash aggregation can output aggregate results only if the source is + /// drained. #[inline] fn is_partial_results_ready(&self) -> bool { false } } -/// A reference to a group key slice in the `group_key_buffer` of `SlowHashAggregationImpl`. +/// A reference to a group key slice in the `group_key_buffer` of +/// `SlowHashAggregationImpl`. /// -/// It is safe as soon as it doesn't outlive the `SlowHashAggregationImpl` that creates this -/// reference. +/// It is safe as soon as it doesn't outlive the `SlowHashAggregationImpl` that +/// creates this reference. 
struct GroupKeyRefUnsafe { /// Points to the `group_key_buffer` of `SlowHashAggregationImpl` buffer_ptr: NonNull>, @@ -521,7 +527,8 @@ mod tests { use tipb::ExprType; use tipb_helper::ExprDefBuilder; - // This test creates a hash aggregation executor with the following aggregate functions: + // This test creates a hash aggregation executor with the following aggregate + // functions: // - COUNT(1) // - AVG(col_0 + 5.0) // And group by: @@ -596,7 +603,8 @@ mod tests { .ensure_all_decoded_for_test(&mut EvalContext::default(), &exec.schema()[5]) .unwrap(); - // The row order is not defined. Let's sort it by the group by column before asserting. + // The row order is not defined. Let's sort it by the group by column before + // asserting. let mut sort_column: Vec<(usize, _)> = r.physical_columns[3] .decoded() .to_bytes_vec() diff --git a/components/tidb_query_executors/src/stream_aggr_executor.rs b/components/tidb_query_executors/src/stream_aggr_executor.rs index 52f55751442..4b768cd65fe 100644 --- a/components/tidb_query_executors/src/stream_aggr_executor.rs +++ b/components/tidb_query_executors/src/stream_aggr_executor.rs @@ -58,8 +58,8 @@ impl BatchExecutor for BatchStreamAggregationExecutor { } } -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchStreamAggregationExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -95,13 +95,15 @@ pub struct BatchStreamAggregationImpl { states: Vec>, /// Stores evaluation results of group by expressions. - /// It is just used to reduce allocations. The lifetime is not really 'static. The elements - /// are only valid in the same batch where they are added. + /// It is just used to reduce allocations. The lifetime is not really + /// 'static. The elements are only valid in the same batch where they + /// are added. 
group_by_results_unsafe: Vec>, /// Stores evaluation results of aggregate expressions. - /// It is just used to reduce allocations. The lifetime is not really 'static. The elements - /// are only valid in the same batch where they are added. + /// It is just used to reduce allocations. The lifetime is not really + /// 'static. The elements are only valid in the same batch where they + /// are added. aggr_expr_results_unsafe: Vec>, } @@ -226,8 +228,8 @@ impl AggregationExecutorImpl for BatchStreamAggregation let group_by_len = self.group_by_exps.len(); let aggr_fn_len = entities.each_aggr_fn.len(); - // Decode columns with mutable input first, so subsequent access to input can be immutable - // (and the borrow checker will be happy) + // Decode columns with mutable input first, so subsequent access to input can be + // immutable (and the borrow checker will be happy) ensure_columns_decoded( context, &self.group_by_exps, @@ -391,8 +393,8 @@ impl AggregationExecutorImpl for BatchStreamAggregation Ok(group_by_columns) } - /// We cannot ensure the last group is complete, so we can output partial results - /// only if group count >= 2. + /// We cannot ensure the last group is complete, so we can output partial + /// results only if group count >= 2. 
#[inline] fn is_partial_results_ready(&self) -> bool { AggregationExecutorImpl::::groups_len(self) >= 2 @@ -469,7 +471,8 @@ mod tests { use tipb::ExprType; use tipb_helper::ExprDefBuilder; - // This test creates a stream aggregation executor with the following aggregate functions: + // This test creates a stream aggregation executor with the following aggregate + // functions: // - COUNT(1) // - AVG(col_1 + 1.0) // And group by: diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 908f0a7146a..c2c310b4018 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -26,8 +26,8 @@ pub struct BatchTableScanExecutor(ScanExecutor; -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we can +// omit the type when calling `check_supported`. impl BatchTableScanExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -80,8 +80,9 @@ impl BatchTableScanExecutor { column_id_index.insert(ci.get_column_id(), index); } - // Note: if two PK handles are given, we will only preserve the *last* one. Also if two - // columns with the same column id are given, we will only preserve the *last* one. + // Note: if two PK handles are given, we will only preserve the + // *last* one. Also if two columns with the same column + // id are given, we will only preserve the *last* one. } let no_common_handle = primary_column_ids.is_empty(); @@ -142,30 +143,32 @@ impl BatchExecutor for BatchTableScanExecutor { } struct TableScanExecutorImpl { - /// Note: Although called `EvalContext`, it is some kind of execution context instead. + /// Note: Although called `EvalContext`, it is some kind of execution + /// context instead. // TODO: Rename EvalContext to ExecContext. context: EvalContext, - /// The schema of the output. 
All of the output come from specific columns in the underlying - /// storage. + /// The schema of the output. All of the output come from specific columns + /// in the underlying storage. schema: Vec, - /// The default value of corresponding columns in the schema. When column data is missing, - /// the default value will be used to fill the output. + /// The default value of corresponding columns in the schema. When column + /// data is missing, the default value will be used to fill the output. columns_default_value: Vec>, /// The output position in the schema giving the column id. column_id_index: HashMap, - /// Vec of indices in output row to put the handle. The indices must be sorted in the vec. + /// Vec of indices in output row to put the handle. The indices must be + /// sorted in the vec. handle_indices: HandleIndicesVec, /// Vec of Primary key column's IDs. primary_column_ids: Vec, - /// A vector of flags indicating whether corresponding column is filled in `next_batch`. - /// It is a struct level field in order to prevent repeated memory allocations since its length - /// is fixed for each `next_batch` call. + /// A vector of flags indicating whether corresponding column is filled in + /// `next_batch`. It is a struct level field in order to prevent repeated + /// memory allocations since its length is fixed for each `next_batch` call. is_column_filled: Vec, } @@ -193,8 +196,8 @@ impl TableScanExecutorImpl { remaining = &remaining[1..]; let column_id = box_try!(remaining.read_var_i64()); let (val, new_remaining) = datum::split_datum(remaining, false)?; - // Note: The produced columns may be not in the same length if there is error due - // to corrupted data. It will be handled in `ScanExecutor`. + // Note: The produced columns may be not in the same length if there is error + // due to corrupted data. It will be handled in `ScanExecutor`. 
let some_index = self.column_id_index.get(&column_id); if let Some(index) = some_index { let index = *index; @@ -246,7 +249,8 @@ impl TableScanExecutorImpl { *decoded_columns += 1; self.is_column_filled[*idx] = true; } else { - // This column is missing. It will be filled with default values later. + // This column is missing. It will be filled with default values + // later. } } Ok(()) @@ -264,13 +268,14 @@ impl ScanExecutorImpl for TableScanExecutorImpl { &mut self.context } - /// Constructs empty columns, with PK in decoded format and the rest in raw format. + /// Constructs empty columns, with PK in decoded format and the rest in raw + /// format. fn build_column_vec(&self, scan_rows: usize) -> LazyBatchColumnVec { let columns_len = self.schema.len(); let mut columns = Vec::with_capacity(columns_len); - // If there are any PK columns, for each of them, fill non-PK columns before it and push the - // PK column. + // If there are any PK columns, for each of them, fill non-PK columns before it + // and push the PK column. // For example, consider: // non-pk non-pk non-pk pk non-pk non-pk pk pk non-pk non-pk // handle_indices: ^3 ^6 ^7 @@ -309,9 +314,10 @@ impl ScanExecutorImpl for TableScanExecutorImpl { last_index = *handle_index + 1; } - // Then fill remaining columns after the last handle column. If there are no PK columns, - // the previous loop will be skipped and this loop will be run on 0..columns_len. - // For the example above, this loop will push: [non-pk, non-pk] + // Then fill remaining columns after the last handle column. If there are no PK + // columns, the previous loop will be skipped and this loop will be run + // on 0..columns_len. 
For the example above, this loop will push: + // [non-pk, non-pk] for i in last_index..columns_len { if Some(i) == physical_table_id_column_idx { columns.push(LazyBatchColumn::decoded_with_capacity_and_tp( @@ -352,8 +358,9 @@ impl ScanExecutorImpl for TableScanExecutorImpl { let handle = table::decode_int_handle(key)?; for handle_index in &self.handle_indices { - // TODO: We should avoid calling `push_int` repeatedly. Instead we should specialize - // a `&mut Vec` first. However it is hard to program due to lifetime restriction. + // TODO: We should avoid calling `push_int` repeatedly. Instead we should + // specialize a `&mut Vec` first. However it is hard to program + // due to lifetime restriction. if !self.is_column_filled[*handle_index] { columns[*handle_index].mut_decoded().push_int(Some(handle)); decoded_columns += 1; @@ -361,14 +368,16 @@ impl ScanExecutorImpl for TableScanExecutorImpl { } } } else if !self.primary_column_ids.is_empty() { - // Otherwise, if `primary_column_ids` is not empty, we try to extract the values of the columns from the common handle. + // Otherwise, if `primary_column_ids` is not empty, we try to extract the values + // of the columns from the common handle. let mut handle = table::decode_common_handle(key)?; for primary_id in self.primary_column_ids.iter() { let index = self.column_id_index.get(primary_id); let (datum, remain) = datum::split_datum(handle, false)?; handle = remain; - // If the column info of the corresponding primary column id is missing, we ignore this slice of the datum. + // If the column info of the corresponding primary column id is missing, we + // ignore this slice of the datum. if let Some(&index) = index { if !self.is_column_filled[index] { columns[index].mut_raw().push(datum); @@ -390,8 +399,8 @@ impl ScanExecutorImpl for TableScanExecutorImpl { self.is_column_filled[*idx] = true; } - // Some fields may be missing in the row, we push corresponding default value to make all - // columns in same length. 
+ // Some fields may be missing in the row, we push corresponding default value to + // make all columns in same length. for i in 0..columns_len { if !self.is_column_filled[i] { // Missing fields must not be a primary key, so it must be @@ -585,7 +594,8 @@ mod tests { .collect() } - /// Returns whole table's ranges which include point range and non-point range. + /// Returns whole table's ranges which include point range and non-point + /// range. fn mixed_ranges_for_whole_table(&self) -> Vec { vec![ self.table_range(i64::MIN, 3), @@ -743,9 +753,9 @@ mod tests { vec![0, 1], vec![0, 2], vec![1, 2], - //PK is the last column in schema + // PK is the last column in schema vec![2, 1, 0], - //PK is the first column in schema + // PK is the first column in schema vec![0, 1, 2], // PK is in the middle of the schema vec![1, 0, 2], @@ -802,7 +812,8 @@ mod tests { executor.collect_exec_stats(&mut s); - // Collected statistics remain unchanged because of no newly generated delta statistics. + // Collected statistics remain unchanged because of no newly generated delta + // statistics. assert_eq!(s.scanned_rows_per_range.len(), 2); assert_eq!(s.scanned_rows_per_range[0], 3); assert_eq!(s.scanned_rows_per_range[1], 0); @@ -811,7 +822,8 @@ mod tests { assert_eq!(3, exec_summary.num_produced_rows); assert_eq!(2, exec_summary.num_iterations); - // Reset collected statistics so that now we will only collect statistics in this round. + // Reset collected statistics so that now we will only collect statistics in + // this round. s.clear(); executor.next_batch(10); executor.collect_exec_stats(&mut s); @@ -907,7 +919,8 @@ mod tests { let store = FixtureStorage::from(kv); - // For row 0 + row 1 + (row 2 ~ row 4), we should only get row 0, row 1 and an error. + // For row 0 + row 1 + (row 2 ~ row 4), we should only get row 0, row 1 and an + // error. 
for corrupted_row_index in 2..=4 { let mut executor = BatchTableScanExecutor::new( store.clone(), @@ -1013,8 +1026,8 @@ mod tests { let store = FixtureStorage::new(kv.into_iter().collect()); // Case 1: row 0 + row 1 + row 2 - // We should get row 0 and error because no further rows should be scanned when there is - // an error. + // We should get row 0 and error because no further rows should be scanned when + // there is an error. { let mut executor = BatchTableScanExecutor::new( store.clone(), @@ -1052,8 +1065,8 @@ mod tests { } // Case 1b: row 0 + row 1 + row 2 - // We should get row 0 and error because no further rows should be scanned when there is - // an error. With EXTRA_PHYSICAL_TABLE_ID_COL + // We should get row 0 and error because no further rows should be scanned when + // there is an error. With EXTRA_PHYSICAL_TABLE_ID_COL { let mut columns_info = columns_info.clone(); columns_info.push({ @@ -1228,8 +1241,8 @@ mod tests { // This test makes a pk column with id = 1 and non-pk columns with id // in 10 to 10 + columns_is_pk.len(). - // PK columns will be set to column 1 and others will be set to column 10 + i, where i is - // the index of each column. + // PK columns will be set to column 1 and others will be set to column 10 + i, + // where i is the index of each column. let mut columns_info = Vec::new(); for (i, is_pk) in columns_is_pk.iter().enumerate() { @@ -1378,7 +1391,8 @@ mod tests { assert_eq!(result.is_drained.unwrap(), true); assert_eq!(result.logical_rows.len(), 1); - // We expect we fill the primary column with the value embedded in the common handle. + // We expect we fill the primary column with the value embedded in the common + // handle. 
for i in 0..result.physical_columns.columns_len() { result.physical_columns[i] .ensure_all_decoded_for_test(&mut EvalContext::default(), &schema[i]) @@ -1563,7 +1577,8 @@ mod tests { result.physical_columns.columns_len(), columns.len() - missed_columns_info.len() ); - // We expect we fill the primary column with the value embedded in the common handle. + // We expect we fill the primary column with the value embedded in the common + // handle. for i in 0..result.physical_columns.columns_len() { result.physical_columns[i] .ensure_all_decoded_for_test(&mut EvalContext::default(), &schema[i]) diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 112a3f3c33b..39f009784f0 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -18,32 +18,34 @@ use crate::{interface::*, util::*}; pub struct BatchTopNExecutor { /// The heap, which contains N rows at most. /// - /// This field is placed before `eval_columns_buffer_unsafe`, `order_exprs`, `order_is_desc` - /// and `src` because it relies on data in those fields and we want this field to be dropped - /// first. + /// This field is placed before `eval_columns_buffer_unsafe`, `order_exprs`, + /// `order_is_desc` and `src` because it relies on data in those fields + /// and we want this field to be dropped first. heap: BinaryHeap, - /// A collection of all evaluated columns. This is to avoid repeated allocations in - /// each `next_batch()`. + /// A collection of all evaluated columns. This is to avoid repeated + /// allocations in each `next_batch()`. /// - /// DO NOT EVER try to read the content of the elements directly, since it is highly unsafe. - /// The lifetime of elements is not really 'static. 
Certain elements are valid only if both - /// of the following conditions are satisfied: + /// DO NOT EVER try to read the content of the elements directly, since it + /// is highly unsafe. The lifetime of elements is not really 'static. + /// Certain elements are valid only if both of the following conditions + /// are satisfied: /// /// 1. `BatchTopNExecutor` is valid (i.e. not dropped). /// - /// 2. The referenced `LazyBatchColumnVec` of the element must be valid, which only happens - /// when at least one of the row is in the `heap`. Note that rows may be swapped out from - /// `heap` at any time. + /// 2. The referenced `LazyBatchColumnVec` of the element must be valid, + /// which only happens when at least one of the row is in the `heap`. + /// Note that rows may be swapped out from `heap` at any time. /// - /// This field is placed before `order_exprs` and `src` because it relies on data in - /// those fields and we want this field to be dropped first. + /// This field is placed before `order_exprs` and `src` because it relies on + /// data in those fields and we want this field to be dropped first. #[allow(clippy::box_collection)] eval_columns_buffer_unsafe: Box>>, order_exprs: Box<[RpnExpression]>, - /// This field stores the field type of the results evaluated by the exprs in `order_exprs`. + /// This field stores the field type of the results evaluated by the exprs + /// in `order_exprs`. order_exprs_field_type: Box<[FieldType]>, /// Whether or not it is descending order for each order by column. @@ -56,13 +58,14 @@ pub struct BatchTopNExecutor { is_ended: bool, } -/// All `NonNull` pointers in `BatchTopNExecutor` cannot be accessed out of the struct and -/// `BatchTopNExecutor` doesn't leak the pointers to other threads. Therefore, with those `NonNull` -/// pointers, BatchTopNExecutor still remains `Send`. 
+/// All `NonNull` pointers in `BatchTopNExecutor` cannot be accessed out of the +/// struct and `BatchTopNExecutor` doesn't leak the pointers to other threads. +/// Therefore, with those `NonNull` pointers, BatchTopNExecutor still remains +/// `Send`. unsafe impl Send for BatchTopNExecutor {} -// We assign a dummy type `Box>` so that we can omit the type -// when calling `check_supported`. +// We assign a dummy type `Box>` so that we +// can omit the type when calling `check_supported`. impl BatchTopNExecutor>> { /// Checks whether this executor can be used. #[inline] @@ -208,8 +211,8 @@ impl BatchTopNExecutor { &logical_rows, )?; - // Pin data behind an Arc, so that they won't be dropped as long as this `pinned_data` - // is kept somewhere. + // Pin data behind an Arc, so that they won't be dropped as long as this + // `pinned_data` is kept somewhere. let pinned_source_data = Arc::new(HeapItemSourceData { physical_columns, logical_rows, @@ -404,8 +407,8 @@ struct HeapItemSourceData { /// The item in the heap of `BatchTopNExecutor`. /// -/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is valid (i.e. -/// not dropped). Thus it is called unsafe. +/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is +/// valid (i.e. not dropped). Thus it is called unsafe. struct HeapItemUnsafe { /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. order_is_desc_ptr: NonNull<[bool]>, @@ -424,7 +427,8 @@ struct HeapItemUnsafe { /// The length of evaluated columns in the buffer is `order_is_desc.len()`. eval_columns_offset: usize, - /// Which logical row in the evaluated columns this heap item is representing. + /// Which logical row in the evaluated columns this heap item is + /// representing. 
logical_row_index: usize, } @@ -460,8 +464,9 @@ impl HeapItemUnsafe { let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); - // There is panic inside, but will never panic, since the data type of corresponding - // column should be consistent for each `HeapItemUnsafe`. + // There is panic inside, but will never panic, since the data type of + // corresponding column should be consistent for each + // `HeapItemUnsafe`. let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; if ord == Ordering::Equal { @@ -478,8 +483,8 @@ impl HeapItemUnsafe { } } -/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator fails to parse. -/// So make sure that it is valid before putting it into a heap. +/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator +/// fails to parse. So make sure that it is valid before putting it into a heap. impl Ord for HeapItemUnsafe { fn cmp(&self, other: &Self) -> Ordering { self.cmp_sort_key(other).unwrap() @@ -884,6 +889,7 @@ mod tests { /// Builds an executor that will return these data: /// + /// ```text /// == Schema == /// Col0 (Bytes[Utf8Mb4GeneralCi]) Col1(Bytes[Utf8Mb4Bin]) Col2(Bytes[Binary]) /// == Call #1 == @@ -897,6 +903,7 @@ mod tests { /// "Aa" NULL "aaa" /// "aaa" "Aa" "áa" /// (drained) + /// ``` fn make_bytes_src_executor() -> MockExecutor { MockExecutor::new( vec![ @@ -1139,6 +1146,7 @@ mod tests { /// Builds an executor that will return these data: /// + /// ```text /// == Schema == /// Col0 (LongLong(Unsigned)) Col1(LongLong[Signed]) Col2(Long[Unsigned]) /// == Call #1 == @@ -1152,6 +1160,7 @@ mod tests { /// 300 300 300 /// 9,223,372,036,854,775,808 -9,223,372,036,854,775,808 2,147,483,648 /// (drained) (drained) (drained) + /// ``` fn make_src_executor_unsigned() -> MockExecutor { MockExecutor::new( vec![ diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs 
b/components/tidb_query_executors/src/util/aggr_executor.rs index 74a9429b390..a40c0c9aec4 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -9,7 +9,8 @@ //! ^^^^^ ^^^^ : Group By Expressions //! ``` //! -//! The SQL above has 2 GROUP BY columns, so we say it's *group by cardinality* is 2. +//! The SQL above has 2 GROUP BY columns, so we say it's *group by cardinality* +//! is 2. //! //! In the result: //! @@ -22,9 +23,9 @@ //! ^^^^^^ ^^^^^ : Group By Column //! ``` //! -//! Some aggregate function output multiple results, for example, `AVG(Int)` output two results: -//! count and sum. In this case we say that the result of `AVG(Int)` has a *cardinality* of 2. -//! +//! Some aggregate function output multiple results, for example, `AVG(Int)` +//! output two results: count and sum. In this case we say that the result of +//! `AVG(Int)` has a *cardinality* of 2. use std::{convert::TryFrom, sync::Arc}; @@ -44,17 +45,20 @@ use tipb::{Expr, FieldType}; use crate::interface::*; pub trait AggregationExecutorImpl: Send { - /// Accepts entities without any group by columns and modifies them optionally. + /// Accepts entities without any group by columns and modifies them + /// optionally. /// - /// Implementors should modify the `schema` entity when there are group by columns. + /// Implementors should modify the `schema` entity when there are group by + /// columns. /// /// This function will be called only once. fn prepare_entities(&mut self, entities: &mut Entities); - /// Processes a set of columns which are emitted from the underlying executor. + /// Processes a set of columns which are emitted from the underlying + /// executor. /// - /// Implementors should update the aggregate function states according to the data of - /// these columns. + /// Implementors should update the aggregate function states according to + /// the data of these columns. 
fn process_batch_input( &mut self, entities: &mut Entities, @@ -64,19 +68,20 @@ pub trait AggregationExecutorImpl: Send { /// Returns the current number of groups. /// - /// Note that this number can be inaccurate because it is a hint for the capacity of the vector. + /// Note that this number can be inaccurate because it is a hint for the + /// capacity of the vector. fn groups_len(&self) -> usize; /// Iterates aggregate function states for each available group. /// - /// Implementors should call `iteratee` for each group with the aggregate function states of - /// that group as the argument. + /// Implementors should call `iteratee` for each group with the aggregate + /// function states of that group as the argument. /// - /// Implementors may return the content of each group as extra columns in the return value - /// if there are group by columns. + /// Implementors may return the content of each group as extra columns in + /// the return value if there are group by columns. /// - /// Implementors should not iterate the same group multiple times for the same partial - /// input data. + /// Implementors should not iterate the same group multiple times for the + /// same partial input data. fn iterate_available_groups( &mut self, entities: &mut Entities, @@ -84,10 +89,12 @@ pub trait AggregationExecutorImpl: Send { iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result>; - /// Returns whether we can now output partial aggregate results when the source is not drained. + /// Returns whether we can now output partial aggregate results when the + /// source is not drained. /// - /// This method is called only when the source is not drained because aggregate result is always - /// ready if the source is drained and no error occurs. + /// This method is called only when the source is not drained because + /// aggregate result is always ready if the source is drained and no + /// error occurs. 
fn is_partial_results_ready(&self) -> bool; } @@ -97,8 +104,8 @@ pub struct Entities { pub src: Src, pub context: EvalContext, - /// The schema of the aggregation executor. It consists of aggregate result columns and - /// group by columns. + /// The schema of the aggregation executor. It consists of aggregate result + /// columns and group by columns. pub schema: Vec, /// The aggregate function. @@ -110,13 +117,14 @@ pub struct Entities { /// The (input) expression of each aggregate function. pub each_aggr_exprs: Vec, - /// The eval type of the result columns of all aggregate functions. One aggregate function - /// may have multiple result columns. + /// The eval type of the result columns of all aggregate functions. One + /// aggregate function may have multiple result columns. pub all_result_column_types: Vec, } -/// A shared executor implementation for simple aggregation, hash aggregation and -/// stream aggregation. Implementation differences are further given via `AggregationExecutorImpl`. +/// A shared executor implementation for simple aggregation, hash aggregation +/// and stream aggregation. Implementation differences are further given via +/// `AggregationExecutorImpl`. pub struct AggregationExecutor> { imp: I, is_ended: bool, @@ -154,7 +162,8 @@ impl> AggregationExecutor schema_len); - // Currently only support 1 parameter aggregate functions, so let's simply assert it. + // Currently only support 1 parameter aggregate functions, so let's simply + // assert it. 
assert_eq!(each_aggr_exprs.len(), each_aggr_exprs_len + 1); each_aggr_fn.push(aggr_fn); @@ -190,7 +199,8 @@ impl> AggregationExecutor Result<(Option, bool)> { // Use max batch size from the beginning because aggregation @@ -199,12 +209,13 @@ impl> AggregationExecutor MockExecutor { MockExecutor::new( vec![ diff --git a/components/tidb_query_executors/src/util/hash_aggr_helper.rs b/components/tidb_query_executors/src/util/hash_aggr_helper.rs index 7795b1c1062..e357d065030 100644 --- a/components/tidb_query_executors/src/util/hash_aggr_helper.rs +++ b/components/tidb_query_executors/src/util/hash_aggr_helper.rs @@ -16,8 +16,9 @@ pub struct HashAggregationHelper; impl HashAggregationHelper { /// Updates states for each row. /// - /// Each row may belong to a different group. States of all groups should be passed in altogether - /// in a single vector and the states of each row should be specified by an offset vector. + /// Each row may belong to a different group. States of all groups should be + /// passed in altogether in a single vector and the states of each row + /// should be specified by an offset vector. pub fn update_each_row_states_by_offset( entities: &mut Entities, input_physical_columns: &mut LazyBatchColumnVec, diff --git a/components/tidb_query_executors/src/util/mock_executor.rs b/components/tidb_query_executors/src/util/mock_executor.rs index 1f61f811b8c..ae20695033f 100644 --- a/components/tidb_query_executors/src/util/mock_executor.rs +++ b/components/tidb_query_executors/src/util/mock_executor.rs @@ -9,8 +9,8 @@ use tipb::FieldType; use crate::interface::*; -/// A simple mock executor that will return batch data according to a fixture without any -/// modification. +/// A simple mock executor that will return batch data according to a fixture +/// without any modification. /// /// Normally this should be only used in tests. 
pub struct MockExecutor { diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index cd6c0e1ed5e..6aa578459e2 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -25,8 +25,8 @@ pub fn ensure_columns_decoded( Ok(()) } -/// Evaluates expressions and outputs the result into the given Vec. Lifetime of the expressions -/// are erased. +/// Evaluates expressions and outputs the result into the given Vec. Lifetime of +/// the expressions are erased. pub unsafe fn eval_exprs_decoded_no_lifetime<'a>( ctx: &mut EvalContext, exprs: &[RpnExpression], diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 114bc77ee1a..c9a88fb820e 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -25,8 +25,9 @@ pub trait ScanExecutorImpl: Send { /// Accepts a key value pair and fills the column vector. /// - /// The column vector does not need to be regular when there are errors during this process. - /// However if there is no error, the column vector must be regular. + /// The column vector does not need to be regular when there are errors + /// during this process. However if there is no error, the column vector + /// must be regular. fn process_kv_pair( &mut self, key: &[u8], @@ -35,8 +36,9 @@ pub trait ScanExecutorImpl: Send { ) -> Result<()>; } -/// A shared executor implementation for both table scan and index scan. Implementation differences -/// between table scan and index scan are further given via `ScanExecutorImpl`. +/// A shared executor implementation for both table scan and index scan. +/// Implementation differences between table scan and index scan are further +/// given via `ScanExecutorImpl`. pub struct ScanExecutor { /// The internal scanning implementation. 
imp: I, @@ -44,9 +46,9 @@ pub struct ScanExecutor { /// The scanner that scans over ranges. scanner: RangesScanner, - /// A flag indicating whether this executor is ended. When table is drained or there was an - /// error scanning the table, this flag will be set to `true` and `next_batch` should be never - /// called again. + /// A flag indicating whether this executor is ended. When table is drained + /// or there was an error scanning the table, this flag will be set to + /// `true` and `next_batch` should be never called again. is_ended: bool, } @@ -94,7 +96,8 @@ impl ScanExecutor { /// Fills a column vector and returns whether or not all ranges are drained. /// - /// The columns are ensured to be regular even if there are errors during the process. + /// The columns are ensured to be regular even if there are errors during + /// the process. fn fill_column_vec( &mut self, scan_rows: usize, @@ -129,7 +132,8 @@ impl ScanExecutor { } /// Extracts `FieldType` from `ColumnInfo`. -// TODO: Embed FieldType in ColumnInfo directly in Cop DAG v2 to remove this function. +// TODO: Embed FieldType in ColumnInfo directly in Cop DAG v2 to remove this +// function. pub fn field_type_from_column_info(ci: &ColumnInfo) -> FieldType { let mut field_type = FieldType::default(); field_type.set_tp(ci.get_tp()); @@ -176,9 +180,9 @@ impl BatchExecutor for ScanExecutor { let logical_rows = (0..logical_columns.rows_len()).collect(); // TODO - // If `is_drained.is_err()`, it means that there is an error after *successfully* retrieving - // these rows. After that, if we only consumes some of the rows (TopN / Limit), we should - // ignore this error. + // If `is_drained.is_err()`, it means that there is an error after + // *successfully* retrieving these rows. After that, if we only consumes + // some of the rows (TopN / Limit), we should ignore this error. match &is_drained { // Note: `self.is_ended` is only used for assertion purpose. 
diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index e283a78d245..16f6a8f66c2 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -223,8 +223,8 @@ fn get_cast_fn_rpn_meta( /// Gets the cast function between specified data types. /// -/// TODO: This function supports some internal casts performed by TiKV. However it would be better -/// to be done in TiDB. +/// TODO: This function supports some internal casts performed by TiKV. However +/// it would be better to be done in TiDB. pub fn get_cast_fn_rpn_node( is_from_constant: bool, from_field_type: &FieldType, @@ -333,8 +333,9 @@ fn cast_string_as_int( match val { None => Ok(None), Some(val) => { - // TODO: in TiDB, if `b.args[0].GetType().Hybrid()` || `IsBinaryLiteral(b.args[0])`, - // then it will return res from EvalInt() directly. + // TODO: in TiDB, if `b.args[0].GetType().Hybrid()` || + // `IsBinaryLiteral(b.args[0])`, then it will return res from + // EvalInt() directly. 
let is_unsigned = extra.ret_field_type.is_unsigned(); let val = get_valid_utf8_prefix(ctx, val)?; let val = val.trim(); @@ -480,8 +481,8 @@ fn cast_signed_int_as_unsigned_real( } } -// because we needn't to consider if uint overflow upper boundary of signed real, -// so we can merge uint to signed/unsigned real in one function +// because we needn't to consider if uint overflow upper boundary of signed +// real, so we can merge uint to signed/unsigned real in one function #[rpn_fn(nullable)] #[inline] fn cast_unsigned_int_as_signed_or_unsigned_real(val: Option<&Int>) -> Result> { @@ -710,9 +711,10 @@ fn cast_float_real_as_string( } } -// FIXME: We cannot use specialization in current Rust version, so impl ConvertTo for Bytes cannot -// pass compile because of we have impl Convert for T where T: ToString + Evaluable -// Refactor this part after https://github.com/rust-lang/rust/issues/31844 closed +// FIXME: We cannot use specialization in current Rust version, so impl +// ConvertTo for Bytes cannot pass compile because of we have impl +// Convert for T where T: ToString + Evaluable +// Refactor this part after https://github.com/rust-lang/rust/issues/31844 closed #[rpn_fn(nullable, capture = [ctx, extra])] #[inline] fn cast_string_as_string( @@ -841,7 +843,8 @@ fn cast_string_as_unsigned_decimal( match val { None => Ok(None), Some(val) => { - // FIXME: in TiDB, if the param IsBinaryLiteral, then return the result of `evalDecimal` directly + // FIXME: in TiDB, if the param IsBinaryLiteral, then return the result of + // `evalDecimal` directly let d: Decimal = val.convert(ctx)?; let d = if metadata.get_in_union() && d.is_negative() { Decimal::zero() @@ -1302,7 +1305,8 @@ fn cast_string_as_json(extra: &RpnFnCallExtra<'_>, val: Option) -> Res let val: Json = s.parse()?; Ok(Some(val)) } else { - // FIXME: port `JSONBinary` from TiDB to adapt if the bytes is not a valid utf8 string + // FIXME: port `JSONBinary` from TiDB to adapt if the bytes is not a valid utf8 + // 
string let val = unsafe { String::from_utf8_unchecked(val.to_owned()) }; Ok(Some(Json::from_string(val)?)) } @@ -2308,9 +2312,10 @@ mod tests { // and `show warnings` will show // `| Warning | 1292 | Truncated incorrect INTEGER value: '18446744073709551616'` // fix this cast_string_as_int after fix TiDB's - // ("18446744073709551616", 18446744073709551615 as i64, Some(ERR_TRUNCATE_WRONG_VALUE) , Cond::Unsigned) - // FIXME: our cast_string_as_int's err handle is not exactly same as TiDB's - // ("18446744073709551616", 18446744073709551615u64 as i64, Some(ERR_TRUNCATE_WRONG_VALUE), Cond::InSelectStmt), + // ("18446744073709551616", 18446744073709551615 as i64, Some(ERR_TRUNCATE_WRONG_VALUE) + // , Cond::Unsigned) FIXME: our cast_string_as_int's err handle is not + // exactly same as TiDB's ("18446744073709551616", 18446744073709551615u64 + // as i64, Some(ERR_TRUNCATE_WRONG_VALUE), Cond::InSelectStmt), // has prefix `-` and in_union and unsigned ("-10", 0, vec![], Cond::InUnionAndUnsigned), @@ -2559,7 +2564,8 @@ mod tests { fn test_time_as_int_and_uint() { let mut ctx = EvalContext::default(); // TODO: add more test case - // TODO: add test that make cast_any_as_any:: returning truncated error + // TODO: add test that make cast_any_as_any:: returning truncated + // error let cs: Vec<(Time, i64)> = vec![ ( Time::parse_datetime(&mut ctx, "2000-01-01T12:13:14", 0, true).unwrap(), @@ -2570,8 +2576,12 @@ mod tests { 20000101121315, ), // FiXME - // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4).unwrap().round_frac(DEFAULT_FSP) - // will get 2000-01-01T12:13:14, this is a bug + // ``` + // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4) + // .unwrap() + // .round_frac(DEFAULT_FSP) + // ``` + // will get 2000-01-01T12:13:14, this is a bug // ( // Time::parse_utc_datetime("2000-01-01T12:13:14.6666", 4).unwrap(), // 20000101121315, @@ -2775,7 +2785,7 @@ mod tests { for (input, expected, fsp) in cases { let mut ctx = EvalContext::default(); let time = - 
Time::parse_timestamp(&mut ctx, input, MAX_FSP, /* Enable round*/ true).unwrap(); + Time::parse_timestamp(&mut ctx, input, MAX_FSP, /* Enable round */ true).unwrap(); let actual: Time = RpnFnScalarEvaluator::new() .push_param(time) @@ -3528,9 +3538,11 @@ mod tests { vec![ERR_TRUNCATE_WRONG_VALUE, ERR_DATA_OUT_OF_RANGE], ), // the case below has 3 warning - // 1. from getValidFloatPrefix, because of `-1234abc`'s `abc`, (ERR_TRUNCATE_WRONG_VALUE) - // 2. from ProduceFloatWithSpecifiedTp, because of TruncateFloat (ERR_DATA_OUT_OF_RANGE) - // 3. from ProduceFloatWithSpecifiedTp, because of unsigned but negative (ERR_DATA_OUT_OF_RANGE) + // - from getValidFloatPrefix, because of `-1234abc`'s `abc`, + // (ERR_TRUNCATE_WRONG_VALUE) + // - from ProduceFloatWithSpecifiedTp, because of TruncateFloat (ERR_DATA_OUT_OF_RANGE) + // - from ProduceFloatWithSpecifiedTp, because of unsigned but negative + // (ERR_DATA_OUT_OF_RANGE) ( String::from("-1234abc"), 0.0, @@ -3865,8 +3877,8 @@ mod tests { } /// base_cs: - /// vector of (T, T to bytes(without any other handle do by cast_as_string_helper), - /// T to string for debug output), + /// vector of (T, T to bytes(without any other handle do by + /// cast_as_string_helper), T to string for debug output), /// the object should not be zero len. #[allow(clippy::type_complexity)] fn test_as_string_helper( @@ -4627,8 +4639,8 @@ mod tests { // ( // origin, origin_flen, origin_decimal, res_flen, res_decimal, is_unsigned, // expect, warning_err_code, - // (InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, truncate_as_warning - // ) + // (InInsertStmt || InUpdateStmt || InDeleteStmt), overflow_as_warning, + // truncate_as_warning ) // // The origin_flen, origin_decimal here is // to let the programmer clearly know what the flen and decimal of the decimal is. 
@@ -4983,8 +4995,9 @@ mod tests { } // These test depend on the correctness of - // Decimal::from(u64), Decimal::from(i64), Decimal::from_f64(), Decimal::from_bytes() - // Decimal::zero(), Decimal::round, max_or_min_dec, max_decimal + // Decimal::from(u64), Decimal::from(i64), Decimal::from_f64(), + // Decimal::from_bytes() Decimal::zero(), Decimal::round, max_or_min_dec, + // max_decimal #[test] fn test_unsigned_int_as_signed_or_unsigned_decimal() { test_none_with_ctx_and_extra(cast_unsigned_int_as_signed_or_unsigned_decimal); @@ -6087,8 +6100,9 @@ mod tests { { // cast_real_as_duration call `Duration::parse`, directly, // and `Duration::parse`, is test in duration.rs. - // Our test here is to make sure that the result is same as calling `Duration::parse`, - // no matter whether call_real_as_duration call `Duration::parse`, directly. + // Our test here is to make sure that the result is same as calling + // `Duration::parse`, no matter whether call_real_as_duration call + // `Duration::parse`, directly. for val in base_cs { for fsp in MIN_FSP..=MAX_FSP { let mut ctx = CtxConfig { @@ -6756,7 +6770,8 @@ mod tests { // TODO: add more case for other TimeType let cs = vec![ - // Add time_type filed here is to make maintainer know clearly that what is the type of the time. + // Add time_type filed here is to make maintainer know clearly that what is the type of + // the time. ( Time::parse_datetime(&mut ctx, "2000-01-01T12:13:14", 0, true).unwrap(), TimeType::DateTime, diff --git a/components/tidb_query_expr/src/impl_compare_in.rs b/components/tidb_query_expr/src/impl_compare_in.rs index 03b5919b410..d518c9061a0 100644 --- a/components/tidb_query_expr/src/impl_compare_in.rs +++ b/components/tidb_query_expr/src/impl_compare_in.rs @@ -161,8 +161,8 @@ impl InByCompare for Int {} impl InByCompare for Real {} impl InByCompare for Decimal {} impl InByCompare for Duration {} -// DateTime requires TZInfo in context, and we cannot acquire it during metadata_mapper. 
-// TODO: implement InByHash for DateTime. +// DateTime requires TZInfo in context, and we cannot acquire it during +// metadata_mapper. TODO: implement InByHash for DateTime. impl InByCompare for DateTime {} #[derive(Debug)] diff --git a/components/tidb_query_expr/src/impl_encryption.rs b/components/tidb_query_expr/src/impl_encryption.rs index 9bf99d9f52a..3a51f798442 100644 --- a/components/tidb_query_expr/src/impl_encryption.rs +++ b/components/tidb_query_expr/src/impl_encryption.rs @@ -77,8 +77,9 @@ pub fn compress(input: BytesRef, writer: BytesWriter) -> Result { return Ok(writer.write_ref(Some(b""))); } let mut e = ZlibEncoder::new(input, Compression::default()); - // preferred capacity is input length plus four bytes length header and one extra end "." - // max capacity is isize::max_value(), or will panic with "capacity overflow" + // preferred capacity is input length plus four bytes length header and one + // extra end "." max capacity is isize::max_value(), or will panic with + // "capacity overflow" let mut vec = Vec::with_capacity((input.len() + 5).min(isize::max_value() as usize)); vec.resize(4, 0); LittleEndian::write_u32(&mut vec, input.len() as u32); @@ -116,10 +117,11 @@ pub fn uncompress( let mut d = ZlibDecoder::new(&input[4..]); let mut vec = Vec::with_capacity(len); - // if the length of uncompressed string is greater than the length we read from the first - // four bytes, return null and generate a length corrupted warning. - // if the length of uncompressed string is zero or uncompress fail, return null and generate - // a data corrupted warning + // - if the length of uncompressed string is greater than the length we read + // from the first four bytes, return null and generate a length corrupted + // warning. 
+ // - if the length of uncompressed string is zero or uncompress fail, return + // null and generate a data corrupted warning match d.read_to_end(&mut vec) { Ok(decoded_len) if len >= decoded_len && decoded_len != 0 => { Ok(writer.write_ref(Some(vec.as_ref()))) } @@ -458,7 +460,7 @@ mod tests { ); } - //test NULL case + // test NULL case assert!( RpnFnScalarEvaluator::new() .push_param(ScalarValue::Int(None)) diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 0b42c953712..5e5595bd3ed 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -66,7 +66,8 @@ fn json_modify(args: &[ScalarValueRef], mt: ModifyType) -> Result> Ok(Some(base.as_ref().modify(&path_expr_list, values, mt)?)) } -/// validate the arguments are `(Option, &[(Option, Option)])` +/// validate the arguments are `(Option, &[(Option, +/// Option)])` fn json_modify_validator(expr: &tipb::Expr) -> Result<()> { let children = expr.get_children(); assert!(children.len() >= 2); diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index 798ca2b9c6a..80484c224c4 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -639,15 +639,22 @@ fn extract_num(num_s: &str, is_neg: bool, from_base: IntWithSign) -> IntWithSign } } -// Returns (isize, is_positive): convert an i64 to usize, and whether the input is positive +// Returns (isize, is_positive): convert an i64 to usize, and whether the input +// is positive // // # Examples // ``` // assert_eq!(i64_to_usize(1_i64, false), (1_usize, true)); // assert_eq!(i64_to_usize(1_i64, false), (1_usize, true)); // assert_eq!(i64_to_usize(-1_i64, false), (1_usize, false)); -// assert_eq!(i64_to_usize(u64::max_value() as i64, true), (u64::max_value() as usize, true)); -// assert_eq!(i64_to_usize(u64::max_value() as i64, false),
(1_usize, false)); +// assert_eq!( +// i64_to_usize(u64::max_value() as i64, true), +// (u64::max_value() as usize, true) +// ); +// assert_eq!( +// i64_to_usize(u64::max_value() as i64, false), +// (1_usize, false) +// ); // ``` #[inline] pub fn i64_to_usize(i: i64, is_unsigned: bool) -> (usize, bool) { @@ -1272,7 +1279,8 @@ mod tests { (std::f64::consts::PI, 0.0_f64), ( (std::f64::consts::PI * 3.0) / 4.0, - f64::tan((std::f64::consts::PI * 3.0) / 4.0), //in mysql and rust, it equals -1.0000000000000002, not -1 + f64::tan((std::f64::consts::PI * 3.0) / 4.0), /* in mysql and rust, it equals + * -1.0000000000000002, not -1 */ ), ]; for (input, expect) in test_cases { diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index dce8920a545..5ecb4e9a7dc 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -29,7 +29,8 @@ pub fn logical_or(arg0: Option<&i64>, arg1: Option<&i64>) -> Result> #[rpn_fn(nullable)] #[inline] pub fn logical_xor(arg0: Option<&i64>, arg1: Option<&i64>) -> Result> { - // evaluates to 1 if an odd number of operands is nonzero, otherwise 0 is returned. + // evaluates to 1 if an odd number of operands is nonzero, otherwise 0 is + // returned. 
Ok(match (arg0, arg1) { (Some(arg0), Some(arg1)) => Some(((*arg0 == 0) ^ (*arg1 == 0)) as i64), _ => None, diff --git a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index c43b0ff2f1f..9ebba24ed43 100644 --- a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -16,10 +16,12 @@ const SPACE: u8 = 0o40u8; const MAX_BLOB_WIDTH: i32 = 16_777_216; // FIXME: Should be isize // see https://dev.mysql.com/doc/refman/5.7/en/string-functions.html#function_to-base64 -// mysql base64 doc: A newline is added after each 76 characters of encoded output +// mysql base64 doc: A newline is added after each 76 characters of encoded +// output const BASE64_LINE_WRAP_LENGTH: usize = 76; -// mysql base64 doc: Each 3 bytes of the input data are encoded using 4 characters. +// mysql base64 doc: Each 3 bytes of the input data are encoded using 4 +// characters. const BASE64_INPUT_CHUNK_LENGTH: usize = 3; const BASE64_ENCODED_CHUNK_LENGTH: usize = 4; const BASE64_LINE_WRAP: u8 = b'\n'; @@ -379,11 +381,13 @@ pub fn rpad_utf8( } } -// when target_len is 0, return Some(0), means the pad function should return empty string -// currently there are three conditions it return None, which means pad function should return Null -// 1. target_len is negative -// 2. target_len of type in byte is larger then MAX_BLOB_WIDTH -// 3. 
target_len is greater than length of input string, *and* pad string is empty +// when target_len is 0, return Some(0), means the pad function should return +// empty string currently there are three conditions it return None, which means +// pad function should return Null +// - target_len is negative +// - target_len of type in byte is larger then MAX_BLOB_WIDTH +// - target_len is greater than length of input string, *and* pad string is +// empty // otherwise return Some(target_len) #[inline] fn validate_target_len_for_pad( diff --git a/components/tidb_query_expr/src/impl_time.rs b/components/tidb_query_expr/src/impl_time.rs index 5914740c8fa..80912fd6526 100644 --- a/components/tidb_query_expr/src/impl_time.rs +++ b/components/tidb_query_expr/src/impl_time.rs @@ -179,7 +179,8 @@ pub fn week_of_year(ctx: &mut EvalContext, t: Option<&DateTime>) -> Result 198652, here the first 4 digits represents year, and the last 2 digits represents week. +// e.g.: SELECT YEARWEEK('1987-01-01'); -- -> 198652, here the first 4 digits +// represents year, and the last 2 digits represents week. #[rpn_fn(capture = [ctx])] #[inline] pub fn year_week_with_mode(ctx: &mut EvalContext, t: &DateTime, mode: &Int) -> Result> { @@ -810,8 +811,8 @@ pub fn duration_duration_time_diff( ) -> Result> { let res = match arg1.checked_sub(*arg2) { Some(res) => res, - // `check_sub` returns `None` if the sub operation overflow/underflow i64 bound or mysql_time_value bound. - // and we need to treat these two case separately. + // `check_sub` returns `None` if the sub operation overflow/underflow i64 bound or + // mysql_time_value bound. and we need to treat these two case separately. // if `arg1 - arg2` is in (`MAX_NANOS`, `i64::MAX`], return max value of mysql `TIME` type. // if `arg1 - arg2` is in [`i64::MIN`, `-MAX_NANOS`), return min value of mysql `TIME` type. // if `arg1 - arg2` is overflow or underflow i64, return `None`. 
@@ -1075,23 +1076,26 @@ mod tests { assert_eq!(output, expect, "{:?} {:?}", date, format); } - // // TODO: pass this test after refactoring the issue #3953 is fixed. - // { - // let format: Option = Some("abc%b %M %m %c %D %d %e %j".as_bytes().to_vec()); - // let time: Option = Some( DateTime::parse_utc_datetime("0000-00-00 00:00:00", 6).unwrap()); - // - // let mut cfg = EvalConfig::new(); - // cfg.set_flag(Flag::IN_UPDATE_OR_DELETE_STMT) - // .set_sql_mode(SqlMode::NO_ZERO_DATE | SqlMode::STRICT_ALL_TABLES); - // let ctx = EvalContext::new(Arc::new(cfg)); - // - // let output = RpnFnScalarEvaluator::new() - // .context(ctx) - // .push_param(time.clone()) - // .push_param(format) - // .evaluate::(ScalarFuncSig::DateFormatSig); - // assert!(output.is_err()); - // } + // TODO: pass this test after refactoring the issue #3953 is fixed. + // { + // let format: Option = Some( + // "abc%b %M %m %c %D %d %e %j".as_bytes().to_vec()); + // let time: Option = + // Some(DateTime::parse_utc_datetime( + // "0000-00-00 00:00:00", 6).unwrap()); + + // let mut cfg = EvalConfig::new(); + // cfg.set_flag(Flag::IN_UPDATE_OR_DELETE_STMT) + // .set_sql_mode(SqlMode::NO_ZERO_DATE | SqlMode::STRICT_ALL_TABLES); + // let ctx = EvalContext::new(Arc::new(cfg)); + + // let output = RpnFnScalarEvaluator::new() + // .context(ctx) + // .push_param(time.clone()) + // .push_param(format) + // .evaluate::(ScalarFuncSig::DateFormatSig); + // assert!(output.is_err()); + // } { let mut cfg = EvalConfig::new(); @@ -1868,8 +1872,10 @@ mod tests { #[test] fn test_from_days() { let cases = vec![ - (ScalarValue::Int(Some(-140)), Some("0000-00-00")), // mysql FROM_DAYS returns 0000-00-00 for any day <= 365. - (ScalarValue::Int(Some(140)), Some("0000-00-00")), // mysql FROM_DAYS returns 0000-00-00 for any day <= 365. + (ScalarValue::Int(Some(-140)), Some("0000-00-00")), /* mysql FROM_DAYS returns + * 0000-00-00 for any day <= + * 365. 
*/ + (ScalarValue::Int(Some(140)), Some("0000-00-00")), /* mysql FROM_DAYS returns 0000-00-00 for any day <= 365. */ (ScalarValue::Int(Some(735_000)), Some("2012-05-12")), // Leap year. (ScalarValue::Int(Some(735_030)), Some("2012-06-11")), (ScalarValue::Int(Some(735_130)), Some("2012-09-19")), @@ -1882,7 +1888,9 @@ mod tests { (ScalarValue::Int(Some(734_544)), Some("2011-02-11")), (ScalarValue::Int(Some(734_513)), Some("2011-01-11")), (ScalarValue::Int(Some(3_652_424)), Some("9999-12-31")), - (ScalarValue::Int(Some(3_652_425)), Some("0000-00-00")), // mysql FROM_DAYS returns 0000-00-00 for any day >= 3652425 + (ScalarValue::Int(Some(3_652_425)), Some("0000-00-00")), /* mysql FROM_DAYS returns + * 0000-00-00 for any day + * >= 3652425 */ (ScalarValue::Int(None), None), ]; let mut ctx = EvalContext::default(); diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index eec5bdad844..679d4e003f8 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -1,11 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This crate implements a simple SQL query engine to work with TiDB pushed down executors. +//! This crate implements a simple SQL query engine to work with TiDB pushed +//! down executors. //! -//! The query engine is able to scan and understand rows stored by TiDB, run against a -//! series of executors and then return the execution result. The query engine is provided via -//! TiKV Coprocessor interface. However standalone UDF functions are also exported and can be used -//! standalone. +//! The query engine is able to scan and understand rows stored by TiDB, run +//! against a series of executors and then return the execution result. The +//! query engine is provided via TiKV Coprocessor interface. However standalone +//! UDF functions are also exported and can be used standalone. 
#![allow(elided_lifetimes_in_paths)] // Necessary until rpn_fn accepts functions annotated with lifetimes. #![allow(incomplete_features)] @@ -141,7 +142,8 @@ fn map_int_sig(value: ScalarFuncSig, children: &[Expr], mapper: F) -> Result< where F: Fn(bool, bool) -> RpnFnMeta, { - // FIXME: The signature for different signed / unsigned int should be inferred at TiDB side. + // FIXME: The signature for different signed / unsigned int should be inferred + // at TiDB side. if children.len() != 2 { return Err(other_err!( "ScalarFunction {:?} (params = {}) is not supported in batch mode", @@ -220,7 +222,8 @@ fn map_rhs_int_sig(value: ScalarFuncSig, children: &[Expr], mapper: F) -> Res where F: Fn(bool) -> RpnFnMeta, { - // FIXME: The signature for different signed / unsigned int should be inferred at TiDB side. + // FIXME: The signature for different signed / unsigned int should be inferred + // at TiDB side. if children.len() != 2 { return Err(other_err!( "ScalarFunction {:?} (params = {}) is not supported in batch mode", diff --git a/components/tidb_query_expr/src/types/expr.rs b/components/tidb_query_expr/src/types/expr.rs index b94c17f8cdf..26689e762ff 100644 --- a/components/tidb_query_expr/src/types/expr.rs +++ b/components/tidb_query_expr/src/types/expr.rs @@ -24,7 +24,8 @@ pub enum RpnExpressionNode { field_type: FieldType, }, - /// Represents a reference to a column in the columns specified in evaluation. + /// Represents a reference to a column in the columns specified in + /// evaluation. ColumnRef { offset: usize }, } @@ -80,7 +81,8 @@ impl RpnExpressionNode { } } -/// An expression in Reverse Polish notation, which is simply a list of RPN expression nodes. +/// An expression in Reverse Polish notation, which is simply a list of RPN +/// expression nodes. /// /// You may want to build it using `RpnExpressionBuilder`. 
#[derive(Debug)] diff --git a/components/tidb_query_expr/src/types/expr_builder.rs b/components/tidb_query_expr/src/types/expr_builder.rs index d6c8aebb0c1..33c9d48de67 100644 --- a/components/tidb_query_expr/src/types/expr_builder.rs +++ b/components/tidb_query_expr/src/types/expr_builder.rs @@ -26,9 +26,9 @@ pub struct RpnExpressionBuilder(Vec); impl RpnExpressionBuilder { /// Checks whether the given expression definition tree is supported. pub fn check_expr_tree_supported(c: &Expr) -> Result<()> { - // TODO: This logic relies on the correctness of the passed in GROUP BY eval type. However - // it can be different from the one we calculated (e.g. pass a column / fn with different - // type). + // TODO: This logic relies on the correctness of the passed in GROUP BY eval + // type. However it can be different from the one we calculated (e.g. + // pass a column / fn with different type). box_try!(EvalType::try_from(c.get_field_type().as_accessor().tp())); match c.get_tp() { @@ -54,8 +54,8 @@ impl RpnExpressionBuilder { Ok(()) } - /// Gets the result type when expression tree is converted to RPN expression and evaluated. - /// The result type will be either scalar or vector. + /// Gets the result type when expression tree is converted to RPN expression + /// and evaluated. The result type will be either scalar or vector. pub fn is_expr_eval_to_scalar(c: &Expr) -> Result { match c.get_tp() { ExprType::Null @@ -157,8 +157,9 @@ impl RpnExpressionBuilder { self } - /// Pushes a `Constant` node. The field type will be auto inferred by choosing an arbitrary - /// field type that matches the field type of the given value. + /// Pushes a `Constant` node. The field type will be auto inferred by + /// choosing an arbitrary field type that matches the field type of the + /// given value. 
#[must_use] pub fn push_constant_for_test(mut self, value: impl Into) -> Self { let value = value.into(); @@ -241,8 +242,8 @@ impl AsRef<[RpnExpressionNode]> for RpnExpressionBuilder { /// B E F G C D A /// ``` /// -/// The transform process is very much like a post-order traversal. This function does it -/// recursively. +/// The transform process is very much like a post-order traversal. This +/// function does it recursively. fn append_rpn_nodes_recursively( tree_node: Expr, rpn_nodes: &mut Vec, @@ -315,7 +316,8 @@ where let args: Vec<_> = tree_node.take_children().into(); let args_len = args.len(); - // Visit children first, then push current node, so that it is a post-order traversal. + // Visit children first, then push current node, so that it is a post-order + // traversal. for arg in args { append_rpn_nodes_recursively(arg, rpn_nodes, ctx, fn_mapper, max_columns)?; } @@ -550,9 +552,9 @@ mod tests { unreachable!() } - /// For testing `append_rpn_nodes_recursively`. It accepts protobuf function sig enum, which - /// cannot be modified by us in tests to support fn_a ~ fn_d. So let's just hard code some - /// substitute. + /// For testing `append_rpn_nodes_recursively`. It accepts protobuf function + /// sig enum, which cannot be modified by us in tests to support fn_a ~ + /// fn_d. So let's just hard code some substitute. fn fn_mapper(expr: &Expr) -> Result { // fn_a: CastIntAsInt // fn_b: CastIntAsReal diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index c8f9083f64f..2ba3b030ef0 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -22,8 +22,8 @@ use super::{ /// /// It can be either an owned node or a reference node. /// -/// When node comes from a column reference, it is a reference node (both value and field_type -/// are references). 
+/// When node comes from a column reference, it is a reference node (both value +/// and field_type are references). /// /// When nodes comes from an evaluated result, it is an owned node. #[derive(Debug)] @@ -64,17 +64,20 @@ impl<'a> RpnStackNodeVectorValue<'a> { } } -/// A type for each node in the RPN evaluation stack. It can be one of a scalar value node or a -/// vector value node. The vector value node can be either an owned vector value or a reference. +/// A type for each node in the RPN evaluation stack. It can be one of a scalar +/// value node or a vector value node. The vector value node can be either an +/// owned vector value or a reference. #[derive(Debug)] pub enum RpnStackNode<'a> { - /// Represents a scalar value. Comes from a constant node in expression list. + /// Represents a scalar value. Comes from a constant node in expression + /// list. Scalar { value: &'a ScalarValue, field_type: &'a FieldType, }, - /// Represents a vector value. Comes from a column reference or evaluated result. + /// Represents a vector value. Comes from a column reference or evaluated + /// result. Vector { value: RpnStackNodeVectorValue<'a>, field_type: &'a FieldType, @@ -123,7 +126,8 @@ impl<'a> RpnStackNode<'a> { /// Gets a reference of the element by logical index. /// - /// If this is a `Scalar` variant, the returned reference will be the same for any index. + /// If this is a `Scalar` variant, the returned reference will be the same + /// for any index. /// /// # Panics /// @@ -145,13 +149,15 @@ impl<'a> RpnStackNode<'a> { impl RpnExpression { /// Evaluates the expression into a vector. /// - /// If referred columns are not decoded, they will be decoded according to the given schema. + /// If referred columns are not decoded, they will be decoded according to + /// the given schema. /// /// # Panics /// /// Panics if the expression is not valid. /// - /// Panics when referenced column does not have equal length as specified in `rows`. 
+ /// Panics when referenced column does not have equal length as specified in + /// `rows`. pub fn eval<'a>( &'a self, ctx: &mut EvalContext, @@ -160,9 +166,10 @@ impl RpnExpression { input_logical_rows: &'a [usize], output_rows: usize, ) -> Result> { - // We iterate two times. The first time we decode all referred columns. The second time - // we evaluate. This is to make Rust's borrow checker happy because there will be - // mutable reference during the first iteration and we can't keep these references. + // We iterate two times. The first time we decode all referred columns. The + // second time we evaluate. This is to make Rust's borrow checker happy + // because there will be mutable reference during the first iteration + // and we can't keep these references. self.ensure_columns_decoded(ctx, schema, input_physical_columns, input_logical_rows)?; self.eval_decoded( ctx, @@ -194,11 +201,13 @@ impl RpnExpression { Ok(()) } - /// Evaluates the expression into a stack node. The input columns must be already decoded. + /// Evaluates the expression into a stack node. The input columns must be + /// already decoded. /// - /// It differs from `eval` in that `eval_decoded` needn't receive a mutable reference - /// to `LazyBatchColumnVec`. However, since `eval_decoded` doesn't decode columns, - /// it will panic if referred columns are not decoded. + /// It differs from `eval` in that `eval_decoded` needn't receive a mutable + /// reference to `LazyBatchColumnVec`. However, since `eval_decoded` + /// doesn't decode columns, it will panic if referred columns are not + /// decoded. /// /// # Panics /// @@ -206,7 +215,8 @@ impl RpnExpression { /// /// Panics if referred columns are not decoded. /// - /// Panics when referenced column does not have equal length as specified in `rows`. + /// Panics when referenced column does not have equal length as specified in + /// `rows`. 
pub fn eval_decoded<'a>( &'a self, ctx: &mut EvalContext, @@ -400,7 +410,8 @@ mod tests { assert_eq!(val.field_type().as_accessor().tp(), FieldTypeTp::Double); } - /// Single column node but row numbers in `eval()` does not match column length, should panic. + /// Single column node but row numbers in `eval()` does not match column + /// length, should panic. #[test] fn test_eval_single_column_node_mismatch_rows() { let (columns, logical_rows, schema) = new_single_column_node_fixture(); @@ -725,8 +736,8 @@ mod tests { assert_eq!(val.field_type().as_accessor().tp(), FieldTypeTp::LongLong); } - /// Binary function (arguments are both raw columns). The same column is referred multiple times - /// and it should be Ok. + /// Binary function (arguments are both raw columns). The same column is + /// referred multiple times and it should be Ok. #[test] fn test_eval_binary_function_raw_column() { /// foo(v1, v2) performs v1 * v2. diff --git a/components/tidb_query_expr/src/types/function.rs b/components/tidb_query_expr/src/types/function.rs index e657b9fe262..dee74d2a434 100644 --- a/components/tidb_query_expr/src/types/function.rs +++ b/components/tidb_query_expr/src/types/function.rs @@ -1,8 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! People implementing RPN functions with fixed argument type and count don't necessarily -//! need to understand how `Evaluator` and `RpnDef` work. There's a procedural macro called -//! `rpn_fn` defined in `tidb_query_codegen` to help you create RPN functions. For example: +//! People implementing RPN functions with fixed argument type and count don't +//! necessarily need to understand how `Evaluator` and `RpnDef` work. There's a +//! procedural macro called `rpn_fn` defined in `tidb_query_codegen` to help you +//! create RPN functions. For example: //! //! ```ignore //! use tidb_query_codegen::rpn_fn; @@ -13,9 +14,10 @@ //! } //! ``` //! -//! 
You can still call the `foo` function directly; the macro preserves the original function -//! It creates a `foo_fn_meta()` function (simply add `_fn_meta` to the original -//! function name) which generates an `RpnFnMeta` struct. +//! You can still call the `foo` function directly; the macro preserves the +//! original function It creates a `foo_fn_meta()` function (simply add +//! `_fn_meta` to the original function name) which generates an `RpnFnMeta` +//! struct. //! //! For more information on the procedural macro, see the documentation in //! `components/tidb_query_codegen/src/rpn_function`. @@ -96,7 +98,8 @@ impl<'a, T: EvaluableRef<'a>> ScalarArg<'a, T> { impl<'a, T: EvaluableRef<'a>> RpnFnArg for ScalarArg<'a, T> { type Type = Option; - /// Gets the value in the given row. All rows of a `ScalarArg` share the same value. + /// Gets the value in the given row. All rows of a `ScalarArg` share the + /// same value. #[inline] fn get(&self, _row: usize) -> Option { self.0.clone() @@ -137,17 +140,19 @@ impl<'a, T: EvaluableRef<'a>, C: 'a + ChunkRef<'a, T>> RpnFnArg for VectorArg<'a /// Partial or complete argument definition of an RPN function. /// -/// `ArgDef` is constructed at the beginning of evaluating an RPN function. The types of -/// `RpnFnArg`s are determined at this stage. So there won't be dynamic dispatch or enum matches -/// when the function is applied to each row of the input. +/// `ArgDef` is constructed at the beginning of evaluating an RPN function. The +/// types of `RpnFnArg`s are determined at this stage. So there won't be dynamic +/// dispatch or enum matches when the function is applied to each row of the +/// input. pub trait ArgDef: std::fmt::Debug {} /// RPN function argument definitions in the form of a linked list. /// -/// For example, if an RPN function foo(Int, Real, Decimal) is applied to input of a scalar of -/// integer, a vector of reals and a vector of decimals, the constructed `ArgDef` will be -/// `Arg, Arg, Arg, Null>>>`. 
`Null` -/// indicates the end of the argument list. +/// For example, if an RPN function foo(Int, Real, Decimal) is applied to input +/// of a scalar of integer, a vector of reals and a vector of decimals, the +/// constructed `ArgDef` will be `Arg, Arg, +/// Arg, Null>>>`. `Null` indicates the end of the argument +/// list. #[derive(Debug)] pub struct Arg { arg: A, @@ -157,8 +162,8 @@ pub struct Arg { impl ArgDef for Arg {} impl Arg { - /// Gets the value of the head argument in the given row and returns the remaining argument - /// list. + /// Gets the value of the head argument in the given row and returns the + /// remaining argument list. #[inline] pub fn extract(&self, row: usize) -> (A::Type, &Rem) { (self.arg.get(row), &self.rem) @@ -179,16 +184,18 @@ impl ArgDef for Null {} /// A generic evaluator of an RPN function. /// -/// For every RPN function, the evaluator should be created first. Then, call its `eval` method -/// with the input to get the result vector. +/// For every RPN function, the evaluator should be created first. Then, call +/// its `eval` method with the input to get the result vector. /// /// There are two kinds of evaluators in general: -/// - `ArgConstructor`: It's a provided `Evaluator`. It is used in the `rpn_fn` attribute macro -/// to generate the `ArgDef`. The `def` parameter of its eval method is the already constructed -/// `ArgDef`. If it is the outmost evaluator, `def` should be `Null`. -/// - Custom evaluators which do the actual execution of the RPN function. The `def` parameter of -/// its eval method is the constructed `ArgDef`. Implementors can then extract values from the -/// arguments, execute the RPN function and fill the result vector. +/// - `ArgConstructor`: It's a provided `Evaluator`. It is used in the `rpn_fn` +/// attribute macro to generate the `ArgDef`. The `def` parameter of its eval +/// method is the already constructed `ArgDef`. If it is the outmost +/// evaluator, `def` should be `Null`. 
+/// - Custom evaluators which do the actual execution of the RPN function. The +/// `def` parameter of its eval method is the constructed `ArgDef`. +/// Implementors can then extract values from the arguments, execute the RPN +/// function and fill the result vector. pub trait Evaluator<'a> { fn eval( self, @@ -271,7 +278,8 @@ pub fn validate_expr_return_type(expr: &Expr, et: EvalType) -> Result<()> { } } -/// Validates whether the number of arguments of an expression node meets expectation. +/// Validates whether the number of arguments of an expression node meets +/// expectation. pub fn validate_expr_arguments_eq(expr: &Expr, args: usize) -> Result<()> { let received_args = expr.get_children().len(); if received_args == args { @@ -285,7 +293,8 @@ pub fn validate_expr_arguments_eq(expr: &Expr, args: usize) -> Result<()> { } } -/// Validates whether the number of arguments of an expression node >= expectation. +/// Validates whether the number of arguments of an expression node >= +/// expectation. pub fn validate_expr_arguments_gte(expr: &Expr, args: usize) -> Result<()> { let received_args = expr.get_children().len(); if received_args >= args { @@ -299,7 +308,8 @@ pub fn validate_expr_arguments_gte(expr: &Expr, args: usize) -> Result<()> { } } -/// Validates whether the number of arguments of an expression node <= expectation. +/// Validates whether the number of arguments of an expression node <= +/// expectation. 
pub fn validate_expr_arguments_lte(expr: &Expr, args: usize) -> Result<()> { let received_args = expr.get_children().len(); if received_args <= args { diff --git a/components/tidb_query_expr/src/types/test_util.rs b/components/tidb_query_expr/src/types/test_util.rs index e1f44b6553d..88ec11debc6 100644 --- a/components/tidb_query_expr/src/types/test_util.rs +++ b/components/tidb_query_expr/src/types/test_util.rs @@ -16,7 +16,8 @@ use crate::{types::function::RpnFnMeta, RpnExpressionBuilder}; /// Helper utility to evaluate RPN function over scalar inputs. /// -/// This structure should be only useful in tests because it is not very efficient. +/// This structure should be only useful in tests because it is not very +/// efficient. pub struct RpnFnScalarEvaluator { rpn_expr_builder: RpnExpressionBuilder, return_field_type: Option, @@ -35,9 +36,9 @@ impl RpnFnScalarEvaluator { } } - /// Pushes a parameter as the value of an argument for evaluation. The field type will be auto - /// inferred by choosing an arbitrary field type that matches the field type of the given - /// value. + /// Pushes a parameter as the value of an argument for evaluation. The field + /// type will be auto inferred by choosing an arbitrary field type that + /// matches the field type of the given value. #[must_use] pub fn push_param(mut self, value: impl Into) -> Self { self.rpn_expr_builder = self.rpn_expr_builder.push_constant_for_test(value); @@ -52,7 +53,8 @@ impl RpnFnScalarEvaluator { self } - /// Pushes a parameter as the value of an argument for evaluation using a specified field type. + /// Pushes a parameter as the value of an argument for evaluation using a + /// specified field type. #[must_use] pub fn push_param_with_field_type( mut self, @@ -67,8 +69,9 @@ impl RpnFnScalarEvaluator { /// Sets the return field type. 
/// - /// If not set, the evaluation will use an inferred return field type by choosing an arbitrary - /// field type that matches the field type of the generic type `T` when calling `evaluate()`. + /// If not set, the evaluation will use an inferred return field type by + /// choosing an arbitrary field type that matches the field type of the + /// generic type `T` when calling `evaluate()`. #[must_use] pub fn return_field_type(mut self, field_type: impl Into) -> Self { self.return_field_type = Some(field_type.into()); @@ -93,10 +96,11 @@ impl RpnFnScalarEvaluator { /// Evaluates the given function. /// - /// Note that this function does not respect previous `return_field_type()` call. + /// Note that this function does not respect previous `return_field_type()` + /// call. /// - /// This function exposes low-level evaluate results. Prefer to use `evaluate()` instead for - /// normal use case. + /// This function exposes low-level evaluate results. Prefer to use + /// `evaluate()` instead for normal use case. pub fn evaluate_raw( self, ret_field_type: impl Into, @@ -107,7 +111,8 @@ impl RpnFnScalarEvaluator { None => EvalContext::default(), }; - // Children expr descriptors are needed to map the signature into the actual function impl. + // Children expr descriptors are needed to map the signature into the actual + // function impl. 
let children_ed: Vec<_> = self .rpn_expr_builder .as_ref() diff --git a/components/tikv_alloc/src/error.rs b/components/tikv_alloc/src/error.rs index 68c5338ab7e..c098a387c2e 100644 --- a/components/tikv_alloc/src/error.rs +++ b/components/tikv_alloc/src/error.rs @@ -7,7 +7,8 @@ pub enum ProfError { MemProfilingNotEnabled, IOError(std::io::Error), JemallocError(String), - PathEncodingError(std::ffi::OsString), // When temp files are in a non-unicode directory, OsString.into_string() will cause this error, + PathEncodingError(std::ffi::OsString), /* When temp files are in a non-unicode directory, + * OsString.into_string() will cause this error, */ PathWithNulError(std::ffi::NulError), } diff --git a/components/tikv_alloc/src/lib.rs b/components/tikv_alloc/src/lib.rs index 1435ca2bbd0..507a1195a38 100644 --- a/components/tikv_alloc/src/lib.rs +++ b/components/tikv_alloc/src/lib.rs @@ -26,8 +26,7 @@ //! //! This crate accepts five cargo features: //! -//! - mem-profiling - compiles jemalloc and this crate with profiling -//! capability +//! - mem-profiling - compiles jemalloc and this crate with profiling capability //! //! - jemalloc - compiles tikv-jemallocator (default) //! @@ -134,8 +133,9 @@ mod runner { extern crate test; use test::*; - /// Check for ignored test cases with ignore message "#ifdef ". The test - /// case will be enabled if the specific environment variable is set. + /// Check for ignored test cases with ignore message "#ifdef ". + /// The test case will be enabled if the specific environment variable + /// is set. pub fn run_env_conditional_tests(cases: &[&TestDescAndFn]) { let cases: Vec<_> = cases .iter() diff --git a/components/tikv_alloc/src/trace.rs b/components/tikv_alloc/src/trace.rs index a55988450ee..f58bf31fd06 100644 --- a/components/tikv_alloc/src/trace.rs +++ b/components/tikv_alloc/src/trace.rs @@ -1,20 +1,22 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! 
This module provides some utilities to define the tree hierarchy to trace memory. +//! This module provides some utilities to define the tree hierarchy to trace +//! memory. //! -//! A memory trace is a tree that records how much memory its children and itself -//! uses, It doesn't need to match any function stacktrace, instead it should -//! have logically meaningful layout. +//! A memory trace is a tree that records how much memory its children and +//! itself uses, It doesn't need to match any function stacktrace, instead it +//! should have logically meaningful layout. //! -//! For example, memory usage should be divided into several components under the -//! root scope: TiDB EndPoint, Transaction, Raft, gRPC etc. TiDB EndPoint can divide -//! its children by queries, while Raft can divide memory by store and apply. Name -//! are defined as number for better performance. In practice, it can be mapped to -//! enumerates instead. +//! For example, memory usage should be divided into several components under +//! the root scope: TiDB EndPoint, Transaction, Raft, gRPC etc. TiDB EndPoint +//! can divide its children by queries, while Raft can divide memory by store +//! and apply. Name are defined as number for better performance. In practice, +//! it can be mapped to enumerates instead. //! -//! To define a memory trace tree, we can use the `mem_trace` macro. The `mem_trace` -//! macro constructs every node as a `MemoryTrace` which implements `MemoryTrace` trait. -//! We can also define a specified tree node by implementing `MemoryTrace` trait. +//! To define a memory trace tree, we can use the `mem_trace` macro. The +//! `mem_trace` macro constructs every node as a `MemoryTrace` which implements +//! `MemoryTrace` trait. We can also define a specified tree node by +//! implementing `MemoryTrace` trait. 
use std::{ fmt::{self, Debug, Display, Formatter}, diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 9557f945034..b80c32e7088 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -24,9 +24,10 @@ use crate::{ type RwLockTree = RwLock>; -/// The BTreeEngine(based on `BTreeMap`) is in memory and only used in tests and benchmarks. -/// Note: The `snapshot()` and `async_snapshot()` methods are fake, the returned snapshot is not isolated, -/// they will be affected by the later modifies. +/// The BTreeEngine(based on `BTreeMap`) is in memory and only used in tests and +/// benchmarks. Note: The `snapshot()` and `async_snapshot()` methods are fake, +/// the returned snapshot is not isolated, they will be affected by the later +/// modifies. #[derive(Clone)] pub struct BTreeEngine { cf_names: Vec, @@ -102,7 +103,8 @@ impl Engine for BTreeEngine { Ok(()) } - /// warning: It returns a fake snapshot whose content will be affected by the later modifies! + /// warning: It returns a fake snapshot whose content will be affected by + /// the later modifies! fn async_snapshot( &self, _ctx: SnapContext<'_>, @@ -155,9 +157,10 @@ impl BTreeEngineIterator { } } - /// In general, there are 2 endpoints in a range, the left one and the right one. - /// This method will seek to the left one if left is `true`, else seek to the right one. - /// Returns true when the endpoint is valid, which means the endpoint exist and in `self.bounds`. + /// In general, there are 2 endpoints in a range, the left one and the right + /// one. This method will seek to the left one if left is `true`, else seek + /// to the right one. Returns true when the endpoint is valid, which means + /// the endpoint exist and in `self.bounds`. 
fn seek_to_range_endpoint(&mut self, range: (Bound, Bound), left: bool) -> bool { let tree = self.tree.read().unwrap(); let mut range = tree.range(range); diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 44437e60f4c..995f2ed0e21 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -63,13 +63,15 @@ impl Cursor { self.cur_value_has_read.set(false); } - /// Mark key as read. Returns whether key was marked as read before this call. + /// Mark key as read. Returns whether key was marked as read before this + /// call. #[inline] fn mark_key_read(&self) -> bool { self.cur_key_has_read.replace(true) } - /// Mark value as read. Returns whether value was marked as read before this call. + /// Mark value as read. Returns whether value was marked as read before this + /// call. #[inline] fn mark_value_read(&self) -> bool { self.cur_value_has_read.replace(true) @@ -148,7 +150,8 @@ impl Cursor { } } else if self.prefix_seek { // When prefixed seek and prefix_same_as_start enabled - // seek_to_first may return false due to no key's prefix is same as iter lower bound's + // seek_to_first may return false due to no key's prefix is same as iter lower + // bound's return self.seek(key, statistics); } else { assert!(self.seek_to_first(statistics)); @@ -375,9 +378,9 @@ impl Cursor { } #[inline] - // As Rocksdb described, if Iterator::Valid() is false, there are two possibilities: - // (1) We reached the end of the data. In this case, status() is OK(); - // (2) there is an error. In this case status() is not OK(). + // As Rocksdb described, if Iterator::Valid() is false, there are two + // possibilities: (1) We reached the end of the data. In this case, status() + // is OK(); (2) there is an error. In this case status() is not OK(). // So check status when iterator is invalidated. 
pub fn valid(&self) -> Result { match self.iter.valid() { @@ -418,7 +421,8 @@ impl Cursor { } } -/// A handy utility to build a snapshot cursor according to various configurations. +/// A handy utility to build a snapshot cursor according to various +/// configurations. pub struct CursorBuilder<'a, S: Snapshot> { snapshot: &'a S, cf: CfName, @@ -555,7 +559,8 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { iter_opt.set_key_only(self.key_only); iter_opt.set_max_skippable_internal_keys(self.max_skippable_internal_keys); - // prefix_seek is only used for single key, so set prefix_same_as_start for safety. + // prefix_seek is only used for single key, so set prefix_same_as_start for + // safety. if self.prefix_seek { iter_opt.use_prefix_seek(); iter_opt.set_prefix_same_as_start(true); diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index c96d996dc5c..e26318d7b4e 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -1,7 +1,8 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -//! There are multiple [`Engine`](kv::Engine) implementations, [`RaftKv`](crate::server::raftkv::RaftKv) -//! is used by the [`Server`](crate::server::Server). The [`BTreeEngine`](kv::BTreeEngine) and +//! There are multiple [`Engine`](kv::Engine) implementations, +//! [`RaftKv`](crate::server::raftkv::RaftKv) is used by the +//! [`Server`](crate::server::Server). The [`BTreeEngine`](kv::BTreeEngine) and //! [`RocksEngine`](RocksEngine) are used for testing only. #![feature(min_specialization)] @@ -150,7 +151,8 @@ impl From for raft_cmdpb::Request { } // For test purpose only. -// It's used to simulate observer actions in `rocksdb_engine`. See `RocksEngine::async_write_ext()`. +// It's used to simulate observer actions in `rocksdb_engine`. See +// `RocksEngine::async_write_ext()`. 
impl From for Modify { fn from(mut req: raft_cmdpb::Request) -> Modify { let name_to_cf = |name: &str| -> Option { @@ -280,8 +282,8 @@ pub trait Engine: Send + Clone + 'static { /// Writes data to the engine asynchronously with some extensions. /// - /// When the write request is proposed successfully, the `proposed_cb` is invoked. - /// When the write request is finished, the `write_cb` is invoked. + /// When the write request is proposed successfully, the `proposed_cb` is + /// invoked. When the write request is finished, the `write_cb` is invoked. fn async_write_ext( &self, ctx: &Context, @@ -341,10 +343,12 @@ pub trait Engine: Send + Clone + 'static { fn schedule_txn_extra(&self, _txn_extra: TxnExtra) {} } -/// A Snapshot is a consistent view of the underlying engine at a given point in time. +/// A Snapshot is a consistent view of the underlying engine at a given point in +/// time. /// -/// Note that this is not an MVCC snapshot, that is a higher level abstraction of a view of TiKV -/// at a specific timestamp. This snapshot is lower-level, a view of the underlying storage. +/// Note that this is not an MVCC snapshot, that is a higher level abstraction +/// of a view of TiKV at a specific timestamp. This snapshot is lower-level, a +/// view of the underlying storage. pub trait Snapshot: Sync + Send + Clone { type Iter: Iterator; type Ext<'a>: SnapshotExt @@ -357,7 +361,8 @@ pub trait Snapshot: Sync + Send + Clone { /// Get the value associated with `key` in `cf` column family fn get_cf(&self, cf: CfName, key: &Key) -> Result>; - /// Get the value associated with `key` in `cf` column family, with Options in `opts` + /// Get the value associated with `key` in `cf` column family, with Options + /// in `opts` fn get_cf_opt(&self, opts: ReadOptions, cf: CfName, key: &Key) -> Result>; fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result; // The minimum key this snapshot can retrieve. 
@@ -365,7 +370,8 @@ pub trait Snapshot: Sync + Send + Clone { fn lower_bound(&self) -> Option<&[u8]> { None } - // The maximum key can be fetched from the snapshot should less than the upper bound. + // The maximum key can be fetched from the snapshot should less than the upper + // bound. #[inline] fn upper_bound(&self) -> Option<&[u8]> { None @@ -375,8 +381,9 @@ pub trait Snapshot: Sync + Send + Clone { } pub trait SnapshotExt { - /// Retrieves a version that represents the modification status of the underlying data. - /// Version should be changed when underlying data is changed. + /// Retrieves a version that represents the modification status of the + /// underlying data. Version should be changed when underlying data is + /// changed. /// /// If the engine does not support data version, then `None` is returned. fn get_data_version(&self) -> Option { @@ -533,8 +540,8 @@ where /// /// Postcondition: `TLS_ENGINE_ANY` is non-null. pub fn set_tls_engine(engine: E) { - // Safety: we check that `TLS_ENGINE_ANY` is null to ensure we don't leak an existing - // engine; we ensure there are no other references to `engine`. + // Safety: we check that `TLS_ENGINE_ANY` is null to ensure we don't leak an + // existing engine; we ensure there are no other references to `engine`. TLS_ENGINE_ANY.with(move |e| unsafe { if (*e.get()).is_null() { let engine = Box::into_raw(Box::new(engine)) as *mut (); @@ -552,8 +559,9 @@ pub fn set_tls_engine(engine: E) { /// The current tls engine must have the same type as `E` (or at least /// there destructors must be compatible). pub unsafe fn destroy_tls_engine() { - // Safety: we check that `TLS_ENGINE_ANY` is non-null, we must ensure that references - // to `TLS_ENGINE_ANY` can never be stored outside of `TLS_ENGINE_ANY`. + // Safety: we check that `TLS_ENGINE_ANY` is non-null, we must ensure that + // references to `TLS_ENGINE_ANY` can never be stored outside of + // `TLS_ENGINE_ANY`. 
TLS_ENGINE_ANY.with(|e| { let ptr = *e.get(); if !ptr.is_null() { @@ -856,7 +864,8 @@ pub mod tests { .near_seek(&Key::from_raw(b"z\x00"), &mut statistics) .unwrap() ); - // Insert many key-values between 'x' and 'z' then near_seek will fallback to seek. + // Insert many key-values between 'x' and 'z' then near_seek will fallback to + // seek. for i in 0..super::SEEK_BOUND { let key = format!("y{}", i); must_put(engine, key.as_bytes(), b"3"); @@ -945,7 +954,8 @@ pub mod tests { ForPrev, } - // use step to control the distance between target key and current key in cursor. + // use step to control the distance between target key and current key in + // cursor. fn test_linear_seek( snapshot: &S, mode: ScanMode, diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index 3851f5148f4..bec883c1f71 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -81,7 +81,8 @@ impl ExpectedWrite { } } -/// `ExpectedWriteList` represents a list of writes expected to write to the engine +/// `ExpectedWriteList` represents a list of writes expected to write to the +/// engine struct ExpectedWriteList(Mutex>); // We implement drop here instead of on MockEngine diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index f0331403725..0cc90730acd 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -104,7 +104,7 @@ impl RocksEngine { let worker = Worker::new("engine-rocksdb"); let mut db_opts = db_opts.unwrap_or_default(); if io_rate_limiter.is_some() { - db_opts.set_env(get_env(None /*key_manager*/, io_rate_limiter).unwrap()); + db_opts.set_env(get_env(None /* key_manager */, io_rate_limiter).unwrap()); } let db = engine_rocks::util::new_engine_opt(&path, db_opts, cfs_opts)?; @@ -151,7 +151,8 @@ impl RocksEngine { } /// `pre_propose` is called before propose. 
- /// It's used to trigger "pre_propose_query" observers for RawKV API V2 by now. + /// It's used to trigger "pre_propose_query" observers for RawKV API V2 by + /// now. fn pre_propose(&self, mut batch: WriteData) -> Result { let requests = batch .modifies diff --git a/components/tikv_util/src/buffer_vec.rs b/components/tikv_util/src/buffer_vec.rs index c337e9e1659..d2247c011ec 100644 --- a/components/tikv_util/src/buffer_vec.rs +++ b/components/tikv_util/src/buffer_vec.rs @@ -4,8 +4,8 @@ use std::iter::*; use codec::prelude::BufferWriter; -/// A vector like container storing multiple buffers. Each buffer is a `[u8]` slice in -/// arbitrary length. +/// A vector like container storing multiple buffers. Each buffer is a `[u8]` +/// slice in arbitrary length. #[derive(Default, Clone)] pub struct BufferVec { data: Vec, @@ -38,7 +38,8 @@ impl BufferVec { Self::default() } - /// Constructs a new, empty `BufferVec` with the specified element capacity and data capacity. + /// Constructs a new, empty `BufferVec` with the specified element capacity + /// and data capacity. #[inline] pub fn with_capacity(elements_capacity: usize, data_capacity: usize) -> Self { Self { @@ -47,15 +48,15 @@ impl BufferVec { } } - /// Returns the number of buffers this `BufferVec` can hold without reallocating the - /// offsets array. + /// Returns the number of buffers this `BufferVec` can hold without + /// reallocating the offsets array. #[inline] pub fn capacity(&self) -> usize { self.offsets.capacity() } - /// Returns the number of buffers this `BufferVec` can hold without reallocating the - /// data array. + /// Returns the number of buffers this `BufferVec` can hold without + /// reallocating the data array. #[inline] pub fn data_capacity(&self) -> usize { self.data.capacity() @@ -100,11 +101,12 @@ impl BufferVec { } } - /// Returns a delegator that provides `extend` appends buffers together as one buffer - /// to the back. 
+ /// Returns a delegator that provides `extend` appends buffers together as + /// one buffer to the back. /// - /// Note that this function always creates a new buffer even if you don't call `extend` - /// on the delegator later, which simply results in appending a new empty buffer. + /// Note that this function always creates a new buffer even if you don't + /// call `extend` on the delegator later, which simply results in + /// appending a new empty buffer. #[inline] pub fn begin_concat_extend(&mut self) -> WithConcatExtend<'_> { WithConcatExtend::init(self) @@ -171,7 +173,8 @@ impl BufferVec { } } - /// Shortens the `BufferVec`, keeping the first `n` buffers and dropping the rest. + /// Shortens the `BufferVec`, keeping the first `n` buffers and dropping the + /// rest. /// /// If `n` >= current length, this has no effect. #[inline] diff --git a/components/tikv_util/src/callback.rs b/components/tikv_util/src/callback.rs index 62a39c7d06f..5f33ce10696 100644 --- a/components/tikv_util/src/callback.rs +++ b/components/tikv_util/src/callback.rs @@ -10,8 +10,8 @@ pub type Callback = Box; /// Note that leaking the callback can cause it to be never called but it /// rarely happens. /// -/// Also note that because `callback` and `arg_on_drop` may be called in the `drop` -/// method, do not panic inside them or use `safe_panic` instead. +/// Also note that because `callback` and `arg_on_drop` may be called in the +/// `drop` method, do not panic inside them or use `safe_panic` instead. pub fn must_call( callback: impl FnOnce(T) + Send + 'static, arg_on_drop: impl FnOnce() -> T + Send + 'static, diff --git a/components/tikv_util/src/codec/bytes.rs b/components/tikv_util/src/codec/bytes.rs index 36990ba7d08..034e8e73375 100644 --- a/components/tikv_util/src/codec/bytes.rs +++ b/components/tikv_util/src/codec/bytes.rs @@ -55,8 +55,8 @@ pub trait BytesEncoder: NumberEncoder { } /// Joins bytes with its length into a byte slice. 
It is more - /// efficient in both space and time compared to `encode_bytes`. Note that the encoded - /// result is not memcomparable. + /// efficient in both space and time compared to `encode_bytes`. Note that + /// the encoded result is not memcomparable. fn encode_compact_bytes(&mut self, data: &[u8]) -> Result<()> { self.encode_var_i64(data.len() as i64)?; self.write_all(data).map_err(From::from) @@ -95,13 +95,14 @@ fn encode_order_bytes(bs: &[u8], desc: bool) -> Vec { /// Gets the first encoded bytes' length in compactly encoded data. /// -/// Compact-encoding includes a VarInt encoded length prefix (1 ~ 9 bytes) and N bytes payload. -/// This function gets the total bytes length of compact-encoded data, including the length prefix. +/// Compact-encoding includes a VarInt encoded length prefix (1 ~ 9 bytes) and N +/// bytes payload. This function gets the total bytes length of compact-encoded +/// data, including the length prefix. /// /// Note: /// - This function won't check whether the bytes are encoded correctly. -/// - There can be multiple compact-encoded data, placed one by one. This function only returns -/// the length of the first one. +/// - There can be multiple compact-encoded data, placed one by one. This +/// function only returns the length of the first one. pub fn encoded_compact_len(mut encoded: &[u8]) -> usize { let last_encoded = encoded.as_ptr() as usize; let total_len = encoded.len(); @@ -137,13 +138,14 @@ impl CompactBytesFromFileDecoder for T {} /// Gets the first encoded bytes' length in memcomparable-encoded data. /// -/// Memcomparable-encoding includes a VarInt encoded length prefix (1 ~ 9 bytes) and N bytes payload. -/// This function gets the total bytes length of memcomparable-encoded data, including the length prefix. +/// Memcomparable-encoding includes a VarInt encoded length prefix (1 ~ 9 bytes) +/// and N bytes payload. 
This function gets the total bytes length of +/// memcomparable-encoded data, including the length prefix. /// /// Note: /// - This function won't check whether the bytes are encoded correctly. -/// - There can be multiple memcomparable-encoded data, placed one by one. This function only returns -/// the length of the first one. +/// - There can be multiple memcomparable-encoded data, placed one by one. +/// This function only returns the length of the first one. pub fn encoded_bytes_len(encoded: &[u8], desc: bool) -> usize { let mut idx = ENC_GROUP_SIZE; loop { @@ -221,8 +223,8 @@ pub fn decode_bytes(data: &mut BytesSlice<'_>, desc: bool) -> Result> { } } -/// Decodes bytes which are encoded by `encode_bytes` before just in place without malloc. -/// Please use this instead of `decode_bytes` if possible. +/// Decodes bytes which are encoded by `encode_bytes` before just in place +/// without malloc. Please use this instead of `decode_bytes` if possible. pub fn decode_bytes_in_place(data: &mut Vec, desc: bool) -> Result<()> { let mut write_offset = 0; let mut read_offset = 0; @@ -281,7 +283,8 @@ pub fn decode_bytes_in_place(data: &mut Vec, desc: bool) -> Result<()> { } } -/// Returns whether `encoded` bytes is encoded from `raw`. Returns `false` if `encoded` is invalid. +/// Returns whether `encoded` bytes is encoded from `raw`. Returns `false` if +/// `encoded` is invalid. pub fn is_encoded_from(encoded: &[u8], raw: &[u8], desc: bool) -> bool { let check_single_chunk = |encoded: &[u8], raw: &[u8]| { let len = raw.len(); @@ -310,8 +313,8 @@ pub fn is_encoded_from(encoded: &[u8], raw: &[u8], desc: bool) -> bool { return false; } - // Bytes are compared in reverse order because in real cases like TiDB, if two keys - // are different, the last a few bytes are more likely to be different. + // Bytes are compared in reverse order because in real cases like TiDB, if two + // keys are different, the last a few bytes are more likely to be different. 
let raw_chunks = raw.chunks_exact(ENC_GROUP_SIZE); // Check the last chunk first @@ -320,8 +323,9 @@ pub fn is_encoded_from(encoded: &[u8], raw: &[u8], desc: bool) -> bool { _ => return false, } - // The count of the remaining chunks must be the same. Using `size_hint` here is both safe and - // efficient because chunk iterators implement trait `TrustedLen`. + // The count of the remaining chunks must be the same. Using `size_hint` here is + // both safe and efficient because chunk iterators implement trait + // `TrustedLen`. if rev_encoded_chunks.size_hint() != raw_chunks.size_hint() { return false; } diff --git a/components/tikv_util/src/codec/number.rs b/components/tikv_util/src/codec/number.rs index 2f0b3fbcf3a..840da1cf85d 100644 --- a/components/tikv_util/src/codec/number.rs +++ b/components/tikv_util/src/codec/number.rs @@ -44,27 +44,31 @@ fn order_decode_f64(u: u64) -> f64 { pub trait NumberEncoder: Write { /// Writes the encoded value to buf. - /// It guarantees that the encoded value is in ascending order for comparison. + /// It guarantees that the encoded value is in ascending order for + /// comparison. fn encode_i64(&mut self, v: i64) -> Result<()> { let u = order_encode_i64(v); self.encode_u64(u) } /// Writes the encoded value to buf. - /// It guarantees that the encoded value is in descending order for comparison. + /// It guarantees that the encoded value is in descending order for + /// comparison. fn encode_i64_desc(&mut self, v: i64) -> Result<()> { let u = order_encode_i64(v); self.encode_u64_desc(u) } /// Writes the encoded value to slice buf. - /// It guarantees that the encoded value is in ascending order for comparison. + /// It guarantees that the encoded value is in ascending order for + /// comparison. fn encode_u64(&mut self, v: u64) -> Result<()> { self.write_u64::(v).map_err(From::from) } /// Writes the encoded value to slice buf. - /// It guarantees that the encoded value is in descending order for comparison. 
+ /// It guarantees that the encoded value is in descending order for + /// comparison. fn encode_u64_desc(&mut self, v: u64) -> Result<()> { self.write_u64::(!v).map_err(From::from) } @@ -100,14 +104,16 @@ pub trait NumberEncoder: Write { } /// Writes the encoded value to slice buf. - /// It guarantees that the encoded value is in ascending order for comparison. + /// It guarantees that the encoded value is in ascending order for + /// comparison. fn encode_f64(&mut self, f: f64) -> Result<()> { let u = order_encode_f64(f); self.encode_u64(u) } /// Writes the encoded value to slice buf. - /// It guarantees that the encoded value is in descending order for comparison. + /// It guarantees that the encoded value is in descending order for + /// comparison. fn encode_f64_desc(&mut self, f: f64) -> Result<()> { let u = order_encode_f64(f); self.encode_u64_desc(u) diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 6982c66b67a..6655531c294 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -539,7 +539,8 @@ pub fn normalize_path>(path: P) -> PathBuf { ret } -/// Normalizes the path and canonicalizes its longest physically existing sub-path. +/// Normalizes the path and canonicalizes its longest physically existing +/// sub-path. 
fn canonicalize_non_existing_path>(path: P) -> std::io::Result { fn try_canonicalize_normalized_path(path: &Path) -> std::io::Result { use std::path::Component; @@ -591,7 +592,8 @@ fn canonicalize_non_existing_path>(path: P) -> std::io::Result>(path: P) -> std::io::Result { match path.as_ref().canonicalize() { Err(e) if e.kind() == std::io::ErrorKind::NotFound => canonicalize_non_existing_path(path), @@ -714,7 +716,8 @@ mod check_kernel { Ok(()) } - /// `check_kernel_params` checks kernel parameters, following are checked so far: + /// `check_kernel_params` checks kernel parameters, following are checked so + /// far: /// - `net.core.somaxconn` should be greater or equal to 32768. /// - `net.ipv4.tcp_syncookies` should be 0 /// - `vm.swappiness` shoud be 0 @@ -1034,7 +1037,8 @@ fn get_file_count(data_path: &str, extension: &str) -> Result Result<(), ConfigError> { let op = "data-dir.empty.check"; let dir = Path::new(data_path); @@ -1052,7 +1056,8 @@ pub fn check_data_dir_empty(data_path: &str, extension: &str) -> Result<(), Conf } /// `check_addr` validates an address. Addresses are formed like "Host:Port". -/// More details about **Host** and **Port** can be found in WHATWG URL Standard. +/// More details about **Host** and **Port** can be found in WHATWG URL +/// Standard. /// /// Return whether the address is unspecified, i.e. `0.0.0.0` or `::0` pub fn check_addr(addr: &str) -> Result { @@ -1238,9 +1243,9 @@ impl TomlLine { } } -/// TomlWriter use to update the config file and only cover the most commom toml -/// format that used by tikv config file, toml format like: quoted keys, multi-line -/// value, inline table, etc, are not supported, see +/// TomlWriter use to update the config file and only cover the most common toml +/// format that used by tikv config file, toml format like: quoted keys, +/// multi-line value, inline table, etc, are not supported, see /// for more detail. pub struct TomlWriter { dst: Vec, @@ -1402,14 +1407,15 @@ macro_rules! 
numeric_enum_serializing_mod { } /// Helper for migrating Raft data safely. Such migration is defined as -/// multiple states that can be uniquely distinguished. And the transtions +/// multiple states that can be uniquely distinguished. And the transitions /// between these states are atomic. /// /// States: /// 1. Init - Only source directory contains Raft data. -/// 2. Migrating - A marker file contains the path of source directory. The source -/// directory contains a complete copy of Raft data. Target directory may exist. -/// 3. Completed - Only target directory contains Raft data. Marker file may exist. +/// 2. Migrating - A marker file contains the path of source directory. The +/// source directory contains a complete copy of Raft data. Target +/// directory may exist. 3. Completed - Only target directory contains Raft +/// data. Marker file may exist. pub struct RaftDataStateMachine { root: PathBuf, in_progress_marker: PathBuf, @@ -1454,8 +1460,9 @@ impl RaftDataStateMachine { Ok(()) } - /// Returns whether a migration is needed. When it's needed, enters the `Migrating` - /// state. Otherwise prepares the target directory for opening. + /// Returns whether a migration is needed. When it's needed, enters the + /// `Migrating` state. Otherwise prepares the target directory for + /// opening. pub fn before_open_target(&mut self) -> bool { // Clean up trash directory if there is any. for p in [&self.source, &self.target] { @@ -1478,8 +1485,8 @@ impl RaftDataStateMachine { Self::must_remove(&self.source); return false; } - // It's actually in Completed state, just in the reverse direction. - // Equivalent to Init state. + // It's actually in Completed state, just in the reverse + // direction. Equivalent to Init state. 
} else { assert!(real_source == self.source); Self::must_remove(&self.target); @@ -1503,8 +1510,8 @@ impl RaftDataStateMachine { Self::must_remove(&self.in_progress_marker); } - // `after_dump_data` involves two atomic operations, insert a check point between - // them to test crash safety. + // `after_dump_data` involves two atomic operations, insert a check point + // between them to test crash safety. #[cfg(test)] fn after_dump_data_with_check(&mut self, check: &F) { assert!(Self::data_exists(&self.source)); @@ -1525,8 +1532,8 @@ impl RaftDataStateMachine { Self::sync_dir(&self.root); } - // Assumes there is a marker file. Returns None when the content of marker file is - // incomplete. + // Assumes there is a marker file. Returns None when the content of marker file + // is incomplete. fn read_marker(&self) -> Option { let marker = fs::read_to_string(&self.in_progress_marker).unwrap(); if marker.ends_with("//") { diff --git a/components/tikv_util/src/deadline.rs b/components/tikv_util/src/deadline.rs index c02d0a19fa9..84463f507b9 100644 --- a/components/tikv_util/src/deadline.rs +++ b/components/tikv_util/src/deadline.rs @@ -31,7 +31,8 @@ impl Deadline { Self { deadline } } - /// Creates a new `Deadline` that will reach after specified amount of time in future. + /// Creates a new `Deadline` that will reach after specified amount of time + /// in future. pub fn from_now(after_duration: Duration) -> Self { let deadline = Instant::now_coarse() + after_duration; Self { deadline } diff --git a/components/tikv_util/src/future.rs b/components/tikv_util/src/future.rs index 8f639a9e5ef..61d6f33ad4c 100644 --- a/components/tikv_util/src/future.rs +++ b/components/tikv_util/src/future.rs @@ -17,8 +17,8 @@ use futures::{ use crate::callback::must_call; -/// Generates a paired future and callback so that when callback is being called, its result -/// is automatically passed as a future result. 
+/// Generates a paired future and callback so that when callback is being +/// called, its result is automatically passed as a future result. pub fn paired_future_callback() -> (Box, futures_oneshot::Receiver) where T: Send + 'static, @@ -52,8 +52,9 @@ where (callback, future) } -/// Create a stream proxy with buffer representing the remote stream. The returned task -/// will receive messages from the remote stream as much as possible. +/// Create a stream proxy with buffer representing the remote stream. The +/// returned task will receive messages from the remote stream as much as +/// possible. pub fn create_stream_with_buffer( s: S, size: usize, @@ -165,7 +166,8 @@ impl PollAtWake { Ok(_) => return, Err(s) => { if s == NOTIFIED { - // Only this thread can change the state from NOTIFIED, so it has to succeed. + // Only this thread can change the state from NOTIFIED, so it has to + // succeed. match arc_self.state.compare_exchange( NOTIFIED, POLLING, diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 1fec3722a64..ecfeb7253fd 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -498,10 +498,11 @@ pub fn set_panic_hook(panic_abort: bool, data_dir: &str) { ); // There might be remaining logs in the async logger. - // To collect remaining logs and also collect future logs, replace the old one with a - // terminal logger. - // When the old global async logger is replaced, the old async guard will be taken and dropped. - // In the drop() the async guard, it waits for the finish of the remaining logs in the async logger. + // To collect remaining logs and also collect future logs, replace the old one + // with a terminal logger. + // When the old global async logger is replaced, the old async guard will be + // taken and dropped. In the drop() the async guard, it waits for the + // finish of the remaining logs in the async logger. 
if let Some(level) = ::log::max_level().to_level() { let drainer = logger::text_format(logger::term_writer(), true); let _ = logger::init_log( diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index 208280519e8..10facfa2287 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -6,10 +6,13 @@ macro_rules! crit( ($($args:tt)+) => { ::slog_global::crit!($($args)+) };); -/// Logs a error level message using the slog global logger. /// Use '?' to output error in debug format or '%' to ouput error in display format. -/// As the third and forth rules shown, the last log field should follow a ',' to seperate the 'err' field. eg. `error!(?e, "msg"; "foo" => foo,);` -/// If you don't want to output error code, just use the common form like other macros. -/// Require `slog_global` dependency and `#![feature(min_speacilization)]` in all crates. +/// Logs a error level message using the slog global logger. /// Use '?' to +/// output error in debug format or '%' to output error in display format. As +/// the third and forth rules shown, the last log field should follow a ',' to +/// separate the 'err' field. eg. `error!(?e, "msg"; "foo" => foo,);` +/// If you don't want to output error code, just use the common form like other +/// macros. Require `slog_global` dependency and +/// `#![feature(min_speacilization)]` in all crates. #[macro_export] macro_rules! error { (?$e:expr; $l:literal) => { diff --git a/components/tikv_util/src/logger/file_log.rs b/components/tikv_util/src/logger/file_log.rs index 3b8d4ae3ff0..5d0300ccdc5 100644 --- a/components/tikv_util/src/logger/file_log.rs +++ b/components/tikv_util/src/logger/file_log.rs @@ -41,18 +41,21 @@ pub trait Rotator: Send { /// Return if the file need to be rotated. fn should_rotate(&self) -> bool; - /// Call by operator, update rotators' state while the operator try to write some data. 
+ /// Call by operator, update rotators' state while the operator try to write + /// some data. fn on_write(&mut self, data: &[u8]) -> io::Result<()>; - /// Call by operator, update rotators' state while the operator execute a rotation. + /// Call by operator, update rotators' state while the operator execute a + /// rotation. fn on_rotate(&mut self) -> io::Result<()>; } /// This `FileLogger` will iterate over a series of `Rotators`, /// once the context trigger the `Rotator`, it will execute a rotation. /// -/// After rotating, the original log file would be renamed to "{original name}.{"%Y-%m-%dT%H-%M-%S%.3f"}". -/// Note: log file will *not* be compressed or otherwise modified. +/// After rotating, the original log file would be renamed to "{original +/// name}.{"%Y-%m-%dT%H-%M-%S%.3f"}". Note: log file will *not* be compressed or +/// otherwise modified. pub struct RotatingFileLogger { path: PathBuf, file: File, diff --git a/components/tikv_util/src/logger/formatter.rs b/components/tikv_util/src/logger/formatter.rs index fe536eff2b0..c53c5896519 100644 --- a/components/tikv_util/src/logger/formatter.rs +++ b/components/tikv_util/src/logger/formatter.rs @@ -2,7 +2,8 @@ use std::io; -/// Writes file name into the writer, removes the character which not match `[a-zA-Z0-9\.-_]` +/// Writes file name into the writer, removes the character which not match +/// `[a-zA-Z0-9\.-_]` pub fn write_file_name(writer: &mut W, file_name: &str) -> io::Result<()> where W: io::Write + ?Sized, @@ -30,9 +31,9 @@ where Ok(()) } -/// According to [RFC: Unified Log Format], it returns `true` when this byte stream contains -/// the following characters, which means this input stream needs to be JSON encoded. -/// Otherwise, it returns `false`. +/// According to [RFC: Unified Log Format], it returns `true` when this byte +/// stream contains the following characters, which means this input stream +/// needs to be JSON encoded. Otherwise, it returns `false`. 
/// /// - U+0000 (NULL) ~ U+0020 (SPACE) /// - U+0022 (QUOTATION MARK) @@ -41,7 +42,6 @@ where /// - U+005D (RIGHT SQUARE BRACKET) /// /// [RFC: Unified Log Format]: (https://github.com/tikv/rfcs/blob/master/text/2018-12-19-unified-log-format.md) -/// #[inline] fn need_json_encode(bytes: &[u8]) -> bool { for &byte in bytes { @@ -52,13 +52,13 @@ fn need_json_encode(bytes: &[u8]) -> bool { false } -/// According to [RFC: Unified Log Format], escapes the given data and writes it into a writer. -/// If there is no character [`need json encode`], it writes the data into the writer directly. -/// Else, it serializes the given data structure as JSON into a writer. +/// According to [RFC: Unified Log Format], escapes the given data and writes it +/// into a writer. If there is no character [`need json encode`], it writes the +/// data into the writer directly. Else, it serializes the given data structure +/// as JSON into a writer. /// /// [RFC: Unified Log Format]: (https://github.com/tikv/rfcs/blob/master/text/2018-12-19-unified-log-format.md) /// [`need json encode`]: #method.need_json_encode -/// pub fn write_escaped_str(writer: &mut W, value: &str) -> io::Result<()> where W: io::Write + ?Sized, diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 35bf5f4c8e0..dc5d4a3b862 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -179,7 +179,8 @@ where TikvFormat::new(decorator, true) } -/// Same as text_format, but is adjusted to be closer to vanilla RocksDB logger format. +/// Same as text_format, but is adjusted to be closer to vanilla RocksDB logger +/// format. pub fn rocks_text_format(io: W, enable_timestamp: bool) -> RocksFormat> where W: io::Write, @@ -237,8 +238,8 @@ pub fn get_level_by_string(lv: &str) -> Option { } } -// The `to_string()` function of `slog::Level` produces values like `erro` and `trce` instead of -// the full words. This produces the full word. 
+// The `to_string()` function of `slog::Level` produces values like `erro` and +// `trce` instead of the full words. This produces the full word. pub fn get_string_by_level(lv: Level) -> &'static str { match lv { Level::Critical => "fatal", @@ -421,7 +422,8 @@ where } } -// Filters logs with operation cost lower than threshold. Otherwise output logs to inner drainer +// Filters logs with operation cost lower than threshold. Otherwise output logs +// to inner drainer struct SlowLogFilter { threshold: u64, inner: D, @@ -686,8 +688,8 @@ mod tests { use super::*; - // Due to the requirements of `Logger::root*` on a writer with a 'static lifetime - // we need to make a Thread Local, + // Due to the requirements of `Logger::root*` on a writer with a 'static + // lifetime we need to make a Thread Local, // and implement a custom writer. thread_local! { static BUFFER: RefCell> = RefCell::new(Vec::new()); @@ -861,7 +863,8 @@ mod tests { BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); let output = from_utf8(&*buffer).unwrap(); - // only check the log len here as some field like timestamp, location may change. + // only check the log len here as some field like timestamp, location may + // change. assert_eq!(output.len(), log.len()); buffer.clear(); }); @@ -880,7 +883,8 @@ mod tests { check_log(expected); } - /// Removes the wrapping signs, peels `"[hello]"` to `"hello"`, or peels `"(hello)"` to `"hello"`, + /// Removes the wrapping signs, peels `"[hello]"` to `"hello"`, or peels + /// `"(hello)"` to `"hello"`, fn peel(output: &str) -> &str { assert!(output.len() >= 2); &(output[1..output.len() - 1]) diff --git a/components/tikv_util/src/macros.rs b/components/tikv_util/src/macros.rs index ff32d255276..10889046a3b 100644 --- a/components/tikv_util/src/macros.rs +++ b/components/tikv_util/src/macros.rs @@ -41,7 +41,8 @@ macro_rules! slow_log { } -/// Makes a thread name with an additional tag inherited from the current thread. 
+/// Makes a thread name with an additional tag inherited from the current +/// thread. #[macro_export] macro_rules! thd_name { ($name:expr) => {{ @@ -54,7 +55,8 @@ macro_rules! thd_name { /// Simulates Go's defer. /// /// Please note that, different from go, this defer is bound to scope. -/// When exiting the scope, its deferred calls are executed in last-in-first-out order. +/// When exiting the scope, its deferred calls are executed in last-in-first-out +/// order. #[macro_export] macro_rules! defer { ($t:expr) => { @@ -62,8 +64,8 @@ macro_rules! defer { }; } -/// Waits for async operation. It returns `Option` after the expression gets executed. -/// It only accepts a `Result` expression. +/// Waits for async operation. It returns `Option` after the expression +/// gets executed. It only accepts a `Result` expression. #[macro_export] macro_rules! wait_op { ($expr:expr) => { @@ -87,7 +89,8 @@ macro_rules! wait_op { }}; } -/// Checks `Result>`, and returns early when it meets `Err` or `Ok(None)`. +/// Checks `Result>`, and returns early when it meets `Err` or +/// `Ok(None)`. #[macro_export] macro_rules! try_opt { ($expr:expr) => {{ @@ -99,8 +102,8 @@ macro_rules! try_opt { }}; } -/// Checks `Result>`, and returns early when it meets `Err` or `Ok(None)`. -/// return `Ok(or)` when met `Ok(None)`. +/// Checks `Result>`, and returns early when it meets `Err` or +/// `Ok(None)`. return `Ok(or)` when met `Ok(None)`. #[macro_export] macro_rules! try_opt_or { ($expr:expr, $or:expr) => {{ @@ -115,8 +118,8 @@ macro_rules! try_opt_or { /// A safe panic macro that prevents double panic. /// /// You probably want to use this macro instead of `panic!` in a `drop` method. -/// It checks whether the current thread is unwinding because of panic. If it is, -/// log an error message instead of causing double panic. +/// It checks whether the current thread is unwinding because of panic. If it +/// is, log an error message instead of causing double panic. 
#[macro_export] macro_rules! safe_panic { () => ({ diff --git a/components/tikv_util/src/memory.rs b/components/tikv_util/src/memory.rs index cd067f2c382..0a2f49461c5 100644 --- a/components/tikv_util/src/memory.rs +++ b/components/tikv_util/src/memory.rs @@ -33,7 +33,8 @@ impl HeapSize for Region { let mut size = self.start_key.capacity() + self.end_key.capacity(); size += mem::size_of::(); size += self.peers.capacity() * mem::size_of::(); - // There is still a `bytes` in `EncryptionMeta`. Ignore it becaure it could be shared. + // There is still a `bytes` in `EncryptionMeta`. Ignore it because it could be + // shared. size += mem::size_of::(); size } diff --git a/components/tikv_util/src/metrics/process_linux.rs b/components/tikv_util/src/metrics/process_linux.rs index 0b1c9777b09..9d661d1d434 100644 --- a/components/tikv_util/src/metrics/process_linux.rs +++ b/components/tikv_util/src/metrics/process_linux.rs @@ -1,7 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This module is a subset of rust-prometheus's process collector, without the fd collector -//! to avoid memory fragmentation issues when open fd is large. +//! This module is a subset of rust-prometheus's process collector, without the +//! fd collector to avoid memory fragmentation issues when open fd is large. use std::io::{Error, ErrorKind, Result}; diff --git a/components/tikv_util/src/metrics/threads_dummy.rs b/components/tikv_util/src/metrics/threads_dummy.rs index 3bc60a4f5d4..bd718b34b00 100644 --- a/components/tikv_util/src/metrics/threads_dummy.rs +++ b/components/tikv_util/src/metrics/threads_dummy.rs @@ -1,11 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -/*! - -Currently we does not support collecting CPU usage of threads for systems -other than Linux. PRs are welcome! - -*/ +//! Currently we does not support collecting CPU usage of threads for systems +//! other than Linux. PRs are welcome! 
use std::io; diff --git a/components/tikv_util/src/metrics/threads_linux.rs b/components/tikv_util/src/metrics/threads_linux.rs index 4eae41b0f06..608b60949e8 100644 --- a/components/tikv_util/src/metrics/threads_linux.rs +++ b/components/tikv_util/src/metrics/threads_linux.rs @@ -214,7 +214,8 @@ impl Collector for ThreadsCollector { } } -/// Sanitizes the thread name. Keeps `a-zA-Z0-9_:`, replaces `-` and ` ` with `_`, and drops the others. +/// Sanitizes the thread name. Keeps `a-zA-Z0-9_:`, replaces `-` and ` ` with +/// `_`, and drops the others. /// /// Examples: /// @@ -368,7 +369,8 @@ impl ThreadInfoStatistics { self.tid_names.entry(tid).or_insert(name); // To get a percentage result, - // we pre-multiply `cpu_time` by 100 here rather than inside the `update_metric`. + // we pre-multiply `cpu_time` by 100 here rather than inside the + // `update_metric`. let cpu_time = thread::linux::cpu_total(&stat) * 100.0; update_metric( &mut self.metrics_total.cpu_times, diff --git a/components/tikv_util/src/mpsc/batch.rs b/components/tikv_util/src/mpsc/batch.rs index f135c938e42..a635a75d4e4 100644 --- a/components/tikv_util/src/mpsc/batch.rs +++ b/components/tikv_util/src/mpsc/batch.rs @@ -196,8 +196,9 @@ impl Receiver { } } -/// Creates a unbounded channel with a given `notify_size`, which means if there are more pending -/// messages in the channel than `notify_size`, the `Sender` will auto notify the `Receiver`. +/// Creates a unbounded channel with a given `notify_size`, which means if there +/// are more pending messages in the channel than `notify_size`, the `Sender` +/// will auto notify the `Receiver`. /// /// # Panics /// if `notify_size` equals to 0. @@ -215,8 +216,9 @@ pub fn unbounded(notify_size: usize) -> (Sender, Receiver) { ) } -/// Creates a bounded channel with a given `notify_size`, which means if there are more pending -/// messages in the channel than `notify_size`, the `Sender` will auto notify the `Receiver`. 
+/// Creates a bounded channel with a given `notify_size`, which means if there +/// are more pending messages in the channel than `notify_size`, the `Sender` +/// will auto notify the `Receiver`. /// /// # Panics /// if `notify_size` equals to 0. @@ -285,9 +287,10 @@ where I: Fn() -> E + Unpin, C: BatchCollector + Unpin, { - /// Creates a new `BatchReceiver` with given `initializer` and `collector`. `initializer` is - /// used to generate a initial value, and `collector` will collect every (at most - /// `max_batch_size`) raw items into the batched value. + /// Creates a new `BatchReceiver` with given `initializer` and `collector`. + /// `initializer` is used to generate a initial value, and `collector` + /// will collect every (at most `max_batch_size`) raw items into the + /// batched value. pub fn new(rx: Receiver, max_batch_size: usize, initializer: I, collector: C) -> Self { BatchReceiver { rx, diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 99dd6b3e5d0..fbd089ebb9e 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -1,12 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -/*! - -This module provides an implementation of mpsc channel based on -crossbeam_channel. Comparing to the crossbeam_channel, this implementation -supports closed detection and try operations. - -*/ +//! This module provides an implementation of mpsc channel based on +//! crossbeam_channel. Comparing to the crossbeam_channel, this implementation +//! supports closed detection and try operations. pub mod batch; use std::{ @@ -99,7 +95,8 @@ impl Sender { self.sender.is_empty() } - /// Blocks the current thread until a message is sent or the channel is disconnected. + /// Blocks the current thread until a message is sent or the channel is + /// disconnected. 
#[inline] pub fn send(&self, t: T) -> Result<(), SendError> { if self.state.is_sender_connected() { diff --git a/components/tikv_util/src/stream.rs b/components/tikv_util/src/stream.rs index d491b73c1b2..b7ba46c45bf 100644 --- a/components/tikv_util/src/stream.rs +++ b/components/tikv_util/src/stream.rs @@ -71,7 +71,8 @@ pub fn error_stream(e: io::Error) -> impl Stream> + Unp /// otherwise the executor's states may be disrupted. /// /// This means the future must only use async functions. -// FIXME: get rid of this function, so that futures_executor::block_on is sufficient. +// FIXME: get rid of this function, so that futures_executor::block_on is +// sufficient. pub fn block_on_external_io(f: F) -> F::Output { // we need a Tokio runtime, Tokio futures require Tokio executor. Builder::new_current_thread() @@ -90,8 +91,8 @@ pub trait RetryError { /// Retries a future execution. /// -/// This method implements truncated exponential back-off retry strategies outlined in -/// and +/// This method implements truncated exponential back-off retry strategies +/// outlined in and /// /// Since rusoto does not have transparent auto-retry /// (), we need to implement this manually. diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index f475cf3ddda..59830748382 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -37,18 +37,20 @@ use procfs::process::{MountInfo, Process}; // For more details about cgrop v2, PTAL // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html. // -// The above examples are implicitly based on a premise that paths in `/proc/self/cgroup` -// can be appended to `/sys/fs/cgroup` directly to get the final paths. Generally it's -// correct for Linux hosts but maybe wrong for containers. For containers, cgroup file systems -// can be based on other mount points. 
For example: +// The above examples are implicitly based on a premise that paths in +// `/proc/self/cgroup` can be appended to `/sys/fs/cgroup` directly to get the +// final paths. Generally it's correct for Linux hosts but maybe wrong for +// containers. For containers, cgroup file systems can be based on other mount +// points. For example: // // /proc/self/cgroup: // 4:memory:/path/to/the/controller // /proc/self/mountinfo: -// 34 25 0:30 /path/to/the/controller /sys/fs/cgroup/memory relatime - cgroup cgroup memory -// `path/to/the/controller` is possible to be not accessable in the container. However from the -// `mountinfo` file we can know the path is mounted on `sys/fs/cgroup/memory`, then we can build -// the absolute path based on the mountinfo file. +// 34 25 0:30 /path/to/the/controller /sys/fs/cgroup/memory relatime - cgroup +// cgroup memory `path/to/the/controller` is possible to be not accessable in +// the container. However from the `mountinfo` file we can know the path is +// mounted on `sys/fs/cgroup/memory`, then we can build the absolute path based +// on the mountinfo file. // // For the format of the mountinfo file, PTAL https://man7.org/linux/man-pages/man5/proc.5.html. @@ -175,10 +177,12 @@ fn is_cgroup2_unified_mode() -> Result { } // From cgroup spec: -// "/proc/$PID/cgroup" lists a process’s cgroup membership. If legacy cgroup is in use in -// the system, this file may contain multiple lines, one for each hierarchy. +// "/proc/$PID/cgroup" lists a process’s cgroup membership. If legacy cgroup is +// in use in the system, this file may contain multiple lines, one for each +// hierarchy. // -// The format is "::". For example, "10:cpuset:/test-cpuset". +// The format is "::". For example, +// "10:cpuset:/test-cpuset". 
fn parse_proc_cgroup_v1(lines: &str) -> HashMap { let mut subsystems = HashMap::new(); for line in lines.lines().map(|s| s.trim()).filter(|s| !s.is_empty()) { diff --git a/components/tikv_util/src/sys/inspector.rs b/components/tikv_util/src/sys/inspector.rs index addb99c58d2..7b49b647706 100644 --- a/components/tikv_util/src/sys/inspector.rs +++ b/components/tikv_util/src/sys/inspector.rs @@ -20,8 +20,8 @@ pub struct DiskStat { pub trait ThreadInspector { type DiskID; - /// Disk read and write bytes from the backend storage layer. `None` means it's not available - /// for the platform. + /// Disk read and write bytes from the backend storage layer. `None` means + /// it's not available for the platform. fn io_stat(&self) -> Result, String> { Ok(None) } diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 2f5d3c98133..8dd7aefa77c 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -91,8 +91,8 @@ impl SysQuota { } } -/// Get the current global memory usage in bytes. Users need to call `record_global_memory_usage` -/// to refresh it periodically. +/// Get the current global memory usage in bytes. Users need to call +/// `record_global_memory_usage` to refresh it periodically. pub fn get_global_memory_usage() -> u64 { GLOBAL_MEMORY_USAGE.load(Ordering::Acquire) } @@ -110,7 +110,8 @@ pub fn record_global_memory_usage() { GLOBAL_MEMORY_USAGE.store(0, Ordering::Release); } -/// Register the high water mark so that `memory_usage_reaches_high_water` is available. +/// Register the high water mark so that `memory_usage_reaches_high_water` is +/// available. 
pub fn register_memory_usage_high_water(mark: u64) { MEMORY_USAGE_HIGH_WATER.store(mark, Ordering::Release); } diff --git a/components/tikv_util/src/sys/thread.rs b/components/tikv_util/src/sys/thread.rs index 445fc93974e..00a6e47b409 100644 --- a/components/tikv_util/src/sys/thread.rs +++ b/components/tikv_util/src/sys/thread.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -//! This module provides unified APIs for accessing thread/process related information. -//! Only Linux platform is implemented correctly, for other platform, it only guarantees -//! successful compilation. +//! This module provides unified APIs for accessing thread/process related +//! information. Only Linux platform is implemented correctly, for other +//! platform, it only guarantees successful compilation. use std::{io, io::Result, sync::Mutex, thread}; @@ -82,7 +82,8 @@ mod imp { } /// Gets thread ids of the given process id. - /// WARN: Don't call this function frequently. Otherwise there will be a lot of memory fragments. + /// WARN: Don't call this function frequently. Otherwise there will be a lot + /// of memory fragments. pub fn thread_ids>(pid: Pid) -> io::Result { let dir = fs::read_dir(format!("/proc/{}/task", pid))?; Ok(dir @@ -216,8 +217,8 @@ mod imp { pub command: String, } - /// Unlike Linux, the unit of `stime` and `utime` is microseconds instead of ticks. - /// See [`full_thread_stat()`] + /// Unlike Linux, the unit of `stime` and `utime` is microseconds instead of + /// ticks. 
See [`full_thread_stat()`] #[inline] pub fn ticks_per_second() -> i64 { MICRO_SEC_PER_SEC diff --git a/components/tikv_util/src/time.rs b/components/tikv_util/src/time.rs index 57e9e261444..0ab8240c4f2 100644 --- a/components/tikv_util/src/time.rs +++ b/components/tikv_util/src/time.rs @@ -342,10 +342,11 @@ impl Instant { } } - /// It is similar to `duration_since`, but it won't panic when `self` is less than `other`, - /// and `None` will be returned in this case. + /// It is similar to `duration_since`, but it won't panic when `self` is + /// less than `other`, and `None` will be returned in this case. /// - /// Callers need to ensure that `self` and `other` are same type of Instants. + /// Callers need to ensure that `self` and `other` are same type of + /// Instants. pub fn checked_sub(&self, other: Instant) -> Option { if *self >= other { Some(self.duration_since(other)) diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index 50cfa48f9aa..56a00e01a50 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -48,11 +48,11 @@ impl Timer { self.pending.peek().map(|task| task.0.next_tick) } - /// Pops a `TimeoutTask` from the `Timer`, which should be ticked before `instant`. - /// Returns `None` if no tasks should be ticked any more. + /// Pops a `TimeoutTask` from the `Timer`, which should be ticked before + /// `instant`. Returns `None` if no tasks should be ticked any more. /// - /// The normal use case is keeping `pop_task_before` until get `None` in order - /// to retrieve all available events. + /// The normal use case is keeping `pop_task_before` until get `None` in + /// order to retrieve all available events. pub fn pop_task_before(&mut self, instant: Instant) -> Option { if self .pending @@ -121,8 +121,8 @@ fn start_global_timer() -> Handle { struct TimeZero { /// An arbitrary time used as the zero time. 
/// - /// Note that `zero` doesn't have to be related to `steady_time_point`, as what's - /// observed here is elapsed time instead of time point. + /// Note that `zero` doesn't have to be related to `steady_time_point`, as + /// what's observed here is elapsed time instead of time point. zero: std::time::Instant, /// A base time point. /// @@ -135,8 +135,8 @@ struct TimeZero { /// Time produced by the clock is not affected by clock jump or time adjustment. /// Internally it uses CLOCK_MONOTONIC_RAW to get a steady time source. /// -/// `Instant`s produced by this clock can't be compared or used to calculate elapse -/// unless they are produced using the same zero time. +/// `Instant`s produced by this clock can't be compared or used to calculate +/// elapse unless they are produced using the same zero time. #[derive(Clone)] pub struct SteadyClock { zero: Arc, diff --git a/components/tikv_util/src/topn.rs b/components/tikv_util/src/topn.rs index 5147f0d9b86..d6e059d8c42 100644 --- a/components/tikv_util/src/topn.rs +++ b/components/tikv_util/src/topn.rs @@ -58,7 +58,8 @@ impl IntoIterator for TopN { #[allow(clippy::type_complexity)] type IntoIter = iter::Map>, fn(Reverse) -> T>; - // note: IntoIterator doesn't require the result in order, there is an `IntoIterSorted`, implement that if necessary + // note: IntoIterator doesn't require the result in order, there is an + // `IntoIterSorted`, implement that if necessary fn into_iter(self) -> Self::IntoIter { self.heap.into_iter().map(|Reverse(x)| x) } diff --git a/components/tikv_util/src/worker/mod.rs b/components/tikv_util/src/worker/mod.rs index 4c2e3d2473f..a8196dca054 100644 --- a/components/tikv_util/src/worker/mod.rs +++ b/components/tikv_util/src/worker/mod.rs @@ -1,18 +1,15 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. -/*! - -`Worker` provides a mechanism to run tasks asynchronously (i.e. in the background) with some -additional features, for example, ticks. 
- -A worker contains: - -- A runner (which should implement the `Runnable` trait): to run tasks one by one or in batch. -- A scheduler: to send tasks to the runner, returns immediately. - -Briefly speaking, this is a mpsc (multiple-producer-single-consumer) model. - -*/ +//! `Worker` provides a mechanism to run tasks asynchronously (i.e. in the +//! background) with some additional features, for example, ticks. +//! +//! A worker contains: +//! +//! - A runner (which should implement the `Runnable` trait): to run tasks one +//! by one or in batch. +//! - A scheduler: to send tasks to the runner, returns immediately. +//! +//! Briefly speaking, this is a mpsc (multiple-producer-single-consumer) model. mod future; mod metrics; diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index 841a8a2229d..621ac730c30 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -117,7 +117,8 @@ impl Scheduler { /// Schedules a task to run. /// - /// If the worker is stopped or number pending tasks exceeds capacity, an error will return. + /// If the worker is stopped or number pending tasks exceeds capacity, an + /// error will return. pub fn schedule(&self, task: T) -> Result<(), ScheduleError> { debug!("scheduling task {}", task); if self.counter.load(Ordering::Acquire) >= self.pending_capacity { diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index a40221e3b6d..6962ae30756 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -72,8 +72,8 @@ impl FuturePool { /// Gets current running task count. #[inline] pub fn get_running_task_count(&self) -> usize { - // As long as different future pool has different name prefix, we can safely use the value - // in metrics. 
+ // As long as different future pool has different name prefix, we can safely use + // the value in metrics. self.inner.get_running_task_count() } @@ -85,7 +85,8 @@ impl FuturePool { self.inner.spawn(TrackedFuture::new(future)) } - /// Spawns a future in the pool and returns a handle to the result of the future. + /// Spawns a future in the pool and returns a handle to the result of the + /// future. /// /// The future will not be executed if the handle is not polled. pub fn spawn_handle( @@ -116,8 +117,8 @@ impl PoolInner { } fn get_running_task_count(&self) -> usize { - // As long as different future pool has different name prefix, we can safely use the value - // in metrics. + // As long as different future pool has different name prefix, we can safely use + // the value in metrics. self.env.metrics_running_task_count.get() as usize } @@ -298,7 +299,8 @@ mod tests { // So far we have only elapsed TICK_INTERVAL * 0.2, so no ticks so far. assert!(try_recv_tick().is_err()); - // Even if long enough time has elapsed, tick is not emitted until next task arrives + // Even if long enough time has elapsed, tick is not emitted until next task + // arrives thread::sleep(TICK_INTERVAL * 2); assert!(try_recv_tick().is_err()); diff --git a/components/tipb_helper/src/expr_def_builder.rs b/components/tipb_helper/src/expr_def_builder.rs index 589ee1afbd6..f6c1d26a1ff 100644 --- a/components/tipb_helper/src/expr_def_builder.rs +++ b/components/tipb_helper/src/expr_def_builder.rs @@ -4,7 +4,8 @@ use codec::prelude::NumberEncoder; use tidb_query_datatype::{FieldTypeAccessor, FieldTypeFlag, FieldTypeTp}; use tipb::{Expr, ExprType, FieldType, ScalarFuncSig}; -/// A helper utility to build `tipb::Expr` (a.k.a. expression definition) easily. +/// A helper utility to build `tipb::Expr` (a.k.a. expression definition) +/// easily. 
pub struct ExprDefBuilder(Expr); impl ExprDefBuilder { diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index dbefbbe770c..c37fcde86d1 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -125,8 +125,8 @@ pub struct RequestMetrics { pub block_read_nanos: u64, pub internal_key_skipped_count: u64, pub deleted_key_skipped_count: u64, - // temp instant used in raftstore metrics, first be the instant when creating the write callback, - // then reset when it is ready to apply + // temp instant used in raftstore metrics, first be the instant when creating the write + // callback, then reset when it is ready to apply pub write_instant: Option, pub wf_batch_wait_nanos: u64, pub wf_send_to_queue_nanos: u64, diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 1a48a59308b..e0570d900ac 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -331,7 +331,8 @@ impl Lock { info } - /// Checks whether the lock conflicts with the given `ts`. If `ts == TimeStamp::max()`, the primary lock will be ignored. + /// Checks whether the lock conflicts with the given `ts`. If `ts == + /// TimeStamp::max()`, the primary lock will be ignored. fn check_ts_conflict_si( lock: Cow<'_, Self>, key: &Key, @@ -358,8 +359,9 @@ impl Lock { let raw_key = key.to_raw()?; if ts == TimeStamp::max() && raw_key == lock.primary && !lock.use_async_commit { - // When `ts == TimeStamp::max()` (which means to get latest committed version for - // primary key), and current key is the primary key, we ignore this lock. + // When `ts == TimeStamp::max()` (which means to get latest committed version + // for primary key), and current key is the primary key, we ignore + // this lock. return Ok(()); } @@ -421,8 +423,8 @@ impl Lock { } } -/// A specialized lock only for pessimistic lock. This saves memory for cases that only -/// pessimistic locks exist. +/// A specialized lock only for pessimistic lock. 
This saves memory for cases +/// that only pessimistic locks exist. #[derive(Clone, PartialEq, Eq)] pub struct PessimisticLock { /// The primary key in raw format. @@ -803,7 +805,8 @@ mod tests { ) .unwrap(); - // Ignore the primary lock when reading the latest committed version by setting u64::MAX as ts + // Ignore the primary lock when reading the latest committed version by setting + // u64::MAX as ts lock.lock_type = LockType::Put; lock.primary = b"foo".to_vec(); Lock::check_ts_conflict( @@ -815,7 +818,8 @@ mod tests { ) .unwrap(); - // Should not ignore the primary lock of an async commit transaction even if setting u64::MAX as ts + // Should not ignore the primary lock of an async commit transaction even if + // setting u64::MAX as ts let async_commit_lock = lock.clone().use_async_commit(vec![]); Lock::check_ts_conflict( Cow::Borrowed(&async_commit_lock), diff --git a/components/txn_types/src/timestamp.rs b/components/txn_types/src/timestamp.rs index dcb6f6b03dd..593fa2e1d41 100644 --- a/components/txn_types/src/timestamp.rs +++ b/components/txn_types/src/timestamp.rs @@ -122,10 +122,11 @@ const TS_SET_USE_VEC_LIMIT: usize = 8; pub enum TsSet { /// When the set is empty, avoid the useless cloning of Arc. Empty, - /// `Vec` is suitable when the set is small or the set is barely used, and it doesn't worth - /// converting a `Vec` into a `HashSet`. + /// `Vec` is suitable when the set is small or the set is barely used, and + /// it doesn't worth converting a `Vec` into a `HashSet`. Vec(Arc<[TimeStamp]>), - /// `Set` is suitable when there are many timestamps **and** it will be queried multiple times. + /// `Set` is suitable when there are many timestamps **and** it will be + /// queried multiple times. Set(Arc>), } @@ -137,14 +138,15 @@ impl Default for TsSet { } impl TsSet { - /// Create a `TsSet` from the given vec of timestamps. It will select the proper internal - /// collection type according to the size. 
+ /// Create a `TsSet` from the given vec of timestamps. It will select the + /// proper internal collection type according to the size. #[inline] pub fn new(ts: Vec) -> Self { if ts.is_empty() { TsSet::Empty } else if ts.len() <= TS_SET_USE_VEC_LIMIT { - // If there are too few elements in `ts`, use Vec directly instead of making a Set. + // If there are too few elements in `ts`, use Vec directly instead of making a + // Set. TsSet::Vec(ts.into()) } else { TsSet::Set(Arc::new(ts.into_iter().collect())) @@ -161,10 +163,11 @@ impl TsSet { Self::vec(unsafe { tikv_util::memory::vec_transmute(ts) }) } - /// Create a `TsSet` from the given vec of timestamps, but it will be forced to use `Vec` as the - /// internal collection type. When it's sure that the set will be queried at most once, use this - /// is better than `TsSet::new`, since both the querying on `Vec` and the conversion from `Vec` - /// to `HashSet` is O(N). + /// Create a `TsSet` from the given vec of timestamps, but it will be forced + /// to use `Vec` as the internal collection type. When it's sure that the + /// set will be queried at most once, use this is better than `TsSet::new`, + /// since both the querying on `Vec` and the conversion from `Vec` to + /// `HashSet` is O(N). #[inline] pub fn vec(ts: Vec) -> Self { if ts.is_empty() { diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 432f1eafc34..1d3fd775f1b 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -59,7 +59,8 @@ impl Key { Key(encoded) } - /// Creates a key from raw bytes but returns None if the key is an empty slice. + /// Creates a key from raw bytes but returns None if the key is an empty + /// slice. #[inline] pub fn from_raw_maybe_unbounded(key: &[u8]) -> Option { if key.is_empty() { @@ -89,7 +90,8 @@ impl Key { Key(encoded_key) } - /// Creates a key with reserved capacity for timestamp from encoded bytes slice. 
+ /// Creates a key with reserved capacity for timestamp from encoded bytes + /// slice. #[inline] pub fn from_encoded_slice(encoded_key: &[u8]) -> Key { let mut k = Vec::with_capacity(encoded_key.len() + number::U64_SIZE); @@ -128,7 +130,8 @@ impl Key { /// Creates a new key by truncating the timestamp from this key. /// - /// Preconditions: the caller must ensure this is actually a timestamped key. + /// Preconditions: the caller must ensure this is actually a timestamped + /// key. #[inline] pub fn truncate_ts(mut self) -> Result { let len = self.0.len(); @@ -183,14 +186,14 @@ impl Key { Ok(number::decode_u64_desc(&mut ts)?.into()) } - /// Whether the user key part of a ts encoded key `ts_encoded_key` equals to the encoded - /// user key `user_key`. + /// Whether the user key part of a ts encoded key `ts_encoded_key` equals to + /// the encoded user key `user_key`. /// - /// There is an optimization in this function, which is to compare the last 8 encoded bytes - /// first before comparing the rest. It is because in TiDB many records are ended with an 8 - /// byte row id and in many situations only this part is different when calling this function. - // - // TODO: If the last 8 byte is memory aligned, it would be better. + /// There is an optimization in this function, which is to compare the last + /// 8 encoded bytes first before comparing the rest. It is because in TiDB + /// many records are ended with an 8 byte row id and in many situations only + /// this part is different when calling this function. TODO: If the last + /// 8 byte is memory aligned, it would be better. #[inline] pub fn is_user_key_eq(ts_encoded_key: &[u8], user_key: &[u8]) -> bool { let user_key_len = user_key.len(); @@ -199,8 +202,8 @@ impl Key { } if user_key_len >= number::U64_SIZE { // We compare last 8 bytes as u64 first, then compare the rest. - // TODO: Can we just use == to check the left part and right part? `memcmp` might - // be smart enough. 
+ // TODO: Can we just use == to check the left part and right part? `memcmp` + // might be smart enough. let left = NativeEndian::read_u64(&ts_encoded_key[user_key_len - 8..]); let right = NativeEndian::read_u64(&user_key[user_key_len - 8..]); if left != right { @@ -262,10 +265,11 @@ pub enum MutationType { /// A row mutation. /// -/// It may also carry an `Assertion` field, which means it has such an *assertion* to the data -/// (the key already exist or not exist). The assertion should pass if the mutation (in a prewrite -/// request) is going to be finished successfully, otherwise it indicates there should be some bug -/// causing the attempt to write wrong data. +/// It may also carry an `Assertion` field, which means it has such an +/// *assertion* to the data (the key already exist or not exist). The assertion +/// should pass if the mutation (in a prewrite request) is going to be finished +/// successfully, otherwise it indicates there should be some bug causing the +/// attempt to write wrong data. #[derive(Clone)] pub enum Mutation { /// Put `Value` into `Key`, overwriting any existing value. @@ -429,8 +433,8 @@ impl From for Mutation { } } -/// `OldValue` is used by cdc to read the previous value associated with some key during the -/// prewrite process. +/// `OldValue` is used by cdc to read the previous value associated with some +/// key during the prewrite process. #[derive(Debug, Clone, PartialEq)] pub enum OldValue { /// A real `OldValue`. @@ -441,8 +445,8 @@ pub enum OldValue { None, /// The user doesn't care about the previous value. Unspecified, - /// Not sure whether the old value exists or not. users can seek CF_WRITE to the give position - /// to take a look. + /// Not sure whether the old value exists or not. users can seek CF_WRITE to + /// the give position to take a look. SeekWrite(Key), } @@ -470,7 +474,8 @@ impl OldValue { } } - /// The finalized `OldValue::Value` content, or `None` for `OldValue::Unspecified`. 
+ /// The finalized `OldValue::Value` content, or `None` for + /// `OldValue::Unspecified`. /// /// # Panics /// @@ -496,8 +501,8 @@ impl OldValue { } // Returned by MvccTxn when extra_op is set to kvrpcpb::ExtraOp::ReadOldValue. -// key with current ts -> (short value of the prev txn, start ts of the prev txn). -// The value of the map will be None when the mutation is `Insert`. +// key with current ts -> (short value of the prev txn, start ts of the prev +// txn). The value of the map will be None when the mutation is `Insert`. // MutationType is the type of mutation of the current write. pub type OldValues = HashMap)>; diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 73871bf8abe..755207ed3f3 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -29,7 +29,8 @@ const FLAG_OVERLAPPED_ROLLBACK: u8 = b'R'; const GC_FENCE_PREFIX: u8 = b'F'; -/// The short value for rollback records which are protected from being collapsed. +/// The short value for rollback records which are protected from being +/// collapsed. const PROTECTED_ROLLBACK_SHORT_VALUE: &[u8] = b"p"; impl WriteType { @@ -68,20 +69,22 @@ pub struct Write { pub start_ts: TimeStamp, pub short_value: Option, - /// The `commit_ts` of transactions can be non-globally-unique. But since we store Rollback - /// records in the same CF where Commit records is, and Rollback records are saved with - /// `user_key{start_ts}` as the internal key, the collision between Commit and Rollback - /// records can't be avoided. In this case, we keep the Commit record, and set the - /// `has_overlapped_rollback` flag to indicate that there's also a Rollback record. - /// Also note that `has_overlapped_rollback` field is only necessary when the Rollback record - /// should be protected. + /// The `commit_ts` of transactions can be non-globally-unique. 
But since we + /// store Rollback records in the same CF where Commit records is, and + /// Rollback records are saved with `user_key{start_ts}` as the internal + /// key, the collision between Commit and Rollback records can't be avoided. + /// In this case, we keep the Commit record, and set the + /// `has_overlapped_rollback` flag to indicate that there's also a Rollback + /// record. Also note that `has_overlapped_rollback` field is only necessary + /// when the Rollback record should be protected. pub has_overlapped_rollback: bool, - /// Records the next version after this version when overlapping rollback happens on an already - /// existed commit record. + /// Records the next version after this version when overlapping rollback + /// happens on an already existed commit record. /// - /// When a rollback flag is written on an already-written commit record, it causes rewriting - /// the commit record. It may cause problems with the GC compaction filter. Consider this case: + /// When a rollback flag is written on an already-written commit record, it + /// causes rewriting the commit record. It may cause problems with the + /// GC compaction filter. Consider this case: /// /// ```text /// Key_100_put, Key_120_del @@ -93,51 +96,59 @@ pub struct Write { /// Key_100_put_R, Key_120_del /// ``` /// - /// Then GC with safepoint = 130 may happen. However a follower may not have finished applying - /// the change. So on the follower, it's possible that: + /// Then GC with safepoint = 130 may happen. However a follower may not have + /// finished applying the change. So on the follower, it's possible + /// that: /// /// 1. `Key_100_put`, `Key_120_del` applied - /// 2. GC with safepoint = 130 started and `Key_100_put`, `Key_120_del` are deleted - /// 3. Finished applying `Key_100_put_R`, which means to rewrite `Key_100_put` - /// 4. Read at `140` should get nothing (since it's MVCC-deleted at 120) but finds `Key_100_put` + /// 2. 
GC with safepoint = 130 started and `Key_100_put`, `Key_120_del` are + /// deleted 3. Finished applying `Key_100_put_R`, which means to rewrite + /// `Key_100_put` 4. Read at `140` should get nothing (since it's + /// MVCC-deleted at 120) but finds `Key_100_put` /// - /// To solve the problem, when marking `has_overlapped_rollback` on an already-existed commit - /// record, add a special field `gc_fence` on it. If there is a newer version after the record - /// being rewritten, the next version's `commit_ts` will be recorded. When MVCC reading finds - /// a commit record with a GC fence timestamp but the corresponding version that matches that ts - /// doesn't exist, the current version will be believed to be already GC-ed and ignored. + /// To solve the problem, when marking `has_overlapped_rollback` on an + /// already-existed commit record, add a special field `gc_fence` on it. If + /// there is a newer version after the record being rewritten, the next + /// version's `commit_ts` will be recorded. When MVCC reading finds a commit + /// record with a GC fence timestamp but the corresponding version + /// that matches that ts doesn't exist, the current version will be + /// believed to be already GC-ed and ignored. /// - /// Therefore, for the example above, in the 3rd step it will record the version `120` to the - /// `gc_fence` field: + /// Therefore, for the example above, in the 3rd step it will record the + /// version `120` to the `gc_fence` field: /// /// ```text /// Key_100_put_R_120, Key_120_del /// ``` /// - /// And when the reading in the 4th step finds the `PUT` record but the version at 120 doesn't - /// exist, it will be regarded as already GC-ed and ignored. + /// And when the reading in the 4th step finds the `PUT` record but the + /// version at 120 doesn't exist, it will be regarded as already GC-ed + /// and ignored. 
/// - /// For CDC and TiFlash, when they receives a commit record with `gc_fence` field set, it can - /// determine that it must be caused by an overlapped rollback instead of an actual commit. + /// For CDC and TiFlash, when they receives a commit record with `gc_fence` + /// field set, it can determine that it must be caused by an overlapped + /// rollback instead of an actual commit. /// - /// Note: GC fence will only be written on `PUT` and `DELETE` versions, and may only point to - /// a `PUT` or `DELETE` version. If there are other `Lock` and `Rollback` records after the - /// record that's being rewritten, they will be skipped. For example, in this case: + /// Note: GC fence will only be written on `PUT` and `DELETE` versions, and + /// may only point to a `PUT` or `DELETE` version. If there are other `Lock` + /// and `Rollback` records after the record that's being rewritten, they + /// will be skipped. For example, in this case: /// /// ```text /// Key_100_put, Key_105_lock, Key_110_rollback, Key_120_del /// ``` /// - /// If overlapped rollback happens at 100, the `Key_100_put` will be rewritten as - /// `Key_100_put_R_120`. It points to version 120 instead of the nearest 105. + /// If overlapped rollback happens at 100, the `Key_100_put` will be + /// rewritten as `Key_100_put_R_120`. It points to version 120 instead + /// of the nearest 105. /// /// /// The meaning of the field: /// * `None`: A record that haven't been rewritten - /// * `Some(0)`: A commit record that has been rewritten due to overlapping rollback, but it - /// doesn't have an newer version. - /// * `Some(ts)`: A commit record that has been rewritten due to overlapping rollback, - /// and it's next version's `commit_ts` is `ts` + /// * `Some(0)`: A commit record that has been rewritten due to overlapping + /// rollback, but it doesn't have an newer version. 
+ /// * `Some(ts)`: A commit record that has been rewritten due to overlapping + /// rollback, and it's next version's `commit_ts` is `ts` pub gc_fence: Option, } @@ -229,17 +240,18 @@ pub struct WriteRef<'a> { pub write_type: WriteType, pub start_ts: TimeStamp, pub short_value: Option<&'a [u8]>, - /// The `commit_ts` of transactions can be non-globally-unique. But since we store Rollback - /// records in the same CF where Commit records is, and Rollback records are saved with - /// `user_key{start_ts}` as the internal key, the collision between Commit and Rollback - /// records can't be avoided. In this case, we keep the Commit record, and set the - /// `has_overlapped_rollback` flag to indicate that there's also a Rollback record. - /// Also note that `has_overlapped_rollback` field is only necessary when the Rollback record - /// should be protected. + /// The `commit_ts` of transactions can be non-globally-unique. But since we + /// store Rollback records in the same CF where Commit records is, and + /// Rollback records are saved with `user_key{start_ts}` as the internal + /// key, the collision between Commit and Rollback records can't be avoided. + /// In this case, we keep the Commit record, and set the + /// `has_overlapped_rollback` flag to indicate that there's also a Rollback + /// record. Also note that `has_overlapped_rollback` field is only necessary + /// when the Rollback record should be protected. pub has_overlapped_rollback: bool, - /// Records the next version after this version when overlapping rollback happens on an already - /// existed commit record. + /// Records the next version after this version when overlapping rollback + /// happens on an already existed commit record. /// /// See [`Write::gc_fence`] for more detail. 
pub gc_fence: Option, @@ -333,21 +345,23 @@ impl WriteRef<'_> { } /// Prev Conditions: - /// * The `Write` record `self` is referring to is the latest version found by reading at `read_ts` - /// * The `read_ts` is safe, which means, it's not earlier than the current GC safepoint. + /// * The `Write` record `self` is referring to is the latest version + /// found by reading at `read_ts` + /// * The `read_ts` is safe, which means, it's not earlier than the + /// current GC safepoint. /// Return: - /// Whether the `Write` record is valid, ie. there's no GC fence or GC fence doesn't points to any other - /// version. + /// Whether the `Write` record is valid, ie. there's no GC fence or GC + /// fence doesn't points to any other version. pub fn check_gc_fence_as_latest_version(&self, read_ts: TimeStamp) -> bool { - // It's a valid write record if there's no GC fence or GC fence doesn't points to any other - // version. + // It's a valid write record if there's no GC fence or GC fence doesn't points + // to any other version. // If there is a GC fence that's points to another version, there are two cases: // * If `gc_fence_ts > read_ts`, then since `read_ts` didn't expire the GC - // safepoint, so the current version must be a not-expired version or the latest version - // before safepoint, so it must be a valid version - // * If `gc_fence_ts <= read_ts`, since the current version is the latest version found by - // reading at `read_ts`, the version at `gc_fence_ts` must be missing, so the current - // version must be invalid. + // safepoint, so the current version must be a not-expired version or the + // latest version before safepoint, so it must be a valid version + // * If `gc_fence_ts <= read_ts`, since the current version is the latest + // version found by reading at `read_ts`, the version at `gc_fence_ts` must be + // missing, so the current version must be invalid. 
if let Some(gc_fence_ts) = self.gc_fence { if !gc_fence_ts.is_zero() && gc_fence_ts <= read_ts { return false; diff --git a/fuzz/cli.rs b/fuzz/cli.rs index f70551ac084..3a804be7d17 100644 --- a/fuzz/cli.rs +++ b/fuzz/cli.rs @@ -212,7 +212,10 @@ fn run_afl(target: &str) -> Result<()> { )); } - // 2. cargo afl fuzz -i {seed_dir} -o {corpus_dir} target/debug/{instrumented_binary} + // 2. + // ``` + // cargo afl fuzz -i {seed_dir} -o {corpus_dir} target/debug/{instrumented_binary} + // ``` let instrumented_bin = WORKSPACE_ROOT.join("target/debug").join(target); let fuzzer_bin = Command::new("cargo") .args(&["afl", "fuzz"]) diff --git a/fuzz/targets/mod.rs b/fuzz/targets/mod.rs index 25799ff618f..73e29bef568 100644 --- a/fuzz/targets/mod.rs +++ b/fuzz/targets/mod.rs @@ -1,6 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -//! DO NOT MOVE THIS FILE. IT WILL BE PARSED BY `fuzz/cli.rs`. SEE `discover_fuzz_targets()`. +//! DO NOT MOVE THIS FILE. IT WILL BE PARSED BY `fuzz/cli.rs`. SEE +//! `discover_fuzz_targets()`. mod util; diff --git a/rustfmt.toml b/rustfmt.toml index ccc70980180..68b82c22bd1 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,6 +1,13 @@ version = "Two" unstable_features = true +comment_width = 80 +wrap_comments = true +format_code_in_doc_comments = true +format_macro_bodies = true +format_macro_matchers = true +normalize_comments = true +normalize_doc_attributes = true condense_wildcard_suffixes = true license_template_path = "etc/license.template" newline_style = "Unix" diff --git a/src/config.rs b/src/config.rs index 489609d1196..0fe367c1349 100644 --- a/src/config.rs +++ b/src/config.rs @@ -88,7 +88,8 @@ pub const BLOCK_CACHE_RATE: f64 = 0.45; /// By default, TiKV will try to limit memory usage to 75% of system memory. pub const MEMORY_USAGE_LIMIT_RATE: f64 = 0.75; -/// Min block cache shard's size. If a shard is too small, the index/filter data may not fit one shard +/// Min block cache shard's size. 
If a shard is too small, the index/filter data +/// may not fit one shard pub const MIN_BLOCK_CACHE_SHARD_SIZE: usize = 128 * MIB as usize; /// Maximum of 15% of system memory can be used by Raft Engine. Normally its @@ -240,11 +241,12 @@ fn get_background_job_limits_impl( cpu_num: u32, defaults: &BackgroundJobLimits, ) -> BackgroundJobLimits { - // At the minimum, we should have two background jobs: one for flush and one for compaction. - // Otherwise, the number of background jobs should not exceed cpu_num - 1. + // At the minimum, we should have two background jobs: one for flush and one for + // compaction. Otherwise, the number of background jobs should not exceed + // cpu_num - 1. let max_background_jobs = cmp::max(2, cmp::min(defaults.max_background_jobs, cpu_num - 1)); - // Scale flush threads proportionally to cpu cores. Also make sure the number of flush - // threads doesn't exceed total jobs. + // Scale flush threads proportionally to cpu cores. Also make sure the number of + // flush threads doesn't exceed total jobs. let max_background_flushes = cmp::min( (max_background_jobs + 3) / 4, defaults.max_background_flushes, @@ -540,15 +542,16 @@ macro_rules! build_cf_opt { let compression_per_level = $opt.compression_per_level[..$opt.num_levels as usize].to_vec(); cf_opts.compression_per_level(compression_per_level.as_slice()); cf_opts.bottommost_compression($opt.bottommost_level_compression); - // To set for bottommost level sst compression. The first 3 parameters refer to the - // default value in `CompressionOptions` in `rocksdb/include/rocksdb/advanced_options.h`. + // To set for bottommost level sst compression. The first 3 parameters refer to + // the default value in `CompressionOptions` in + // `rocksdb/include/rocksdb/advanced_options.h`. 
cf_opts.set_bottommost_level_compression_options( - -14, /* window_bits */ - 32767, /* level */ - 0, /* strategy */ + -14, // window_bits + 32767, // level + 0, // strategy $opt.bottommost_zstd_compression_dict_size, $opt.bottommost_zstd_compression_sample_size, - 1, /* parallel_threads */ + 1, // parallel_threads ); cf_opts.set_write_buffer_size($opt.write_buffer_size.0); cf_opts.set_max_write_buffer_number($opt.max_write_buffer_number); @@ -967,8 +970,8 @@ impl RaftCfConfig { #[derive(Clone, Serialize, Deserialize, PartialEq, Debug)] #[serde(default)] #[serde(rename_all = "kebab-case")] -// Note that Titan is still an experimental feature. Once enabled, it can't fall back. -// Forced fallback may result in data loss. +// Note that Titan is still an experimental feature. Once enabled, it can't fall +// back. Forced fallback may result in data loss. pub struct TitanDBConfig { pub enabled: bool, pub dirname: String, @@ -1063,8 +1066,8 @@ pub struct DbConfig { #[online_config(skip)] pub enable_pipelined_write: bool, // deprecated. TiKV will use a new write mode when set `enable_pipelined_write` false and fall - // back to write mode in 3.0 when set `enable_pipelined_write` true. The code of multi-batch-write - // in RocksDB has been removed. + // back to write mode in 3.0 when set `enable_pipelined_write` true. The code of + // multi-batch-write in RocksDB has been removed. #[online_config(skip)] #[serde(skip_serializing)] pub enable_multi_batch_write: bool, @@ -1331,10 +1334,12 @@ impl RaftDefaultCfConfig { } } -// RocksDB Env associate thread pools of multiple instances from the same process. -// When construct Options, options.env is set to same singleton Env::Default() object. -// So total max_background_jobs = max(rocksdb.max_background_jobs, raftdb.max_background_jobs) -// But each instance will limit their background jobs according to their own max_background_jobs +// RocksDB Env associate thread pools of multiple instances from the same +// process. 
When construct Options, options.env is set to same singleton +// Env::Default() object. So total max_background_jobs = +// max(rocksdb.max_background_jobs, raftdb.max_background_jobs) +// But each instance will limit their background jobs according to their own +// max_background_jobs #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] @@ -1592,7 +1597,8 @@ impl> DBConfigManger { block-cache.capacity in storage module instead" .into()); } - // for multi-rocks, shared block cache has to be enabled and thus should shortcut in the above if statement. + // for multi-rocks, shared block cache has to be enabled and thus should + // shortcut in the above if statement. assert!(self.tablet_accessor.is_single_engine()); let mut error_collector = TabletErrorCollector::new(); self.tablet_accessor @@ -1764,7 +1770,8 @@ fn config_to_slice(config_change: &[(String, String)]) -> Vec<(&str, &str)> { .collect() } -// Convert `ConfigValue` to formatted String that can pass to `DB::set_db_options` +// Convert `ConfigValue` to formatted String that can pass to +// `DB::set_db_options` fn config_value_to_string(config_change: Vec<(String, ConfigValue)>) -> Vec<(String, String)> { config_change .into_iter() @@ -2130,10 +2137,10 @@ macro_rules! readpool_config { const DEFAULT_STORAGE_READPOOL_MIN_CONCURRENCY: usize = 4; const DEFAULT_STORAGE_READPOOL_MAX_CONCURRENCY: usize = 8; -// Assume a request can be finished in 1ms, a request at position x will wait about -// 0.001 * x secs to be actual started. A server-is-busy error will trigger 2 seconds -// backoff. So when it needs to wait for more than 2 seconds, return error won't causse -// larger latency. +// Assume a request can be finished in 1ms, a request at position x will wait +// about 0.001 * x secs to be actual started. A server-is-busy error will +// trigger 2 seconds backoff. 
So when it needs to wait for more than 2 seconds, +// return error won't causse larger latency. const DEFAULT_READPOOL_MAX_TASKS_PER_WORKER: usize = 2 * 1000; const MIN_READPOOL_STACK_SIZE_MB: u64 = 2; @@ -2511,17 +2518,19 @@ pub struct CdcConfig { pub incremental_scan_threads: usize, pub incremental_scan_concurrency: usize, pub incremental_scan_speed_limit: ReadableSize, - /// `TsFilter` can increase speed and decrease resource usage when incremental content is much - /// less than total content. However in other cases, `TsFilter` can make performance worse - /// because it needs to re-fetch old row values if they are required. + /// `TsFilter` can increase speed and decrease resource usage when + /// incremental content is much less than total content. However in + /// other cases, `TsFilter` can make performance worse because it needs + /// to re-fetch old row values if they are required. /// - /// `TsFilter` will be enabled if `incremental/total <= incremental_scan_ts_filter_ratio`. + /// `TsFilter` will be enabled if `incremental/total <= + /// incremental_scan_ts_filter_ratio`. /// Set `incremental_scan_ts_filter_ratio` to 0 will disable it. pub incremental_scan_ts_filter_ratio: f64, - /// Count of threads to confirm Region leadership in TiKV instances, 1 by default. - /// Please consider to increase it if count of regions on one TiKV instance is - /// greater than 20k. + /// Count of threads to confirm Region leadership in TiKV instances, 1 by + /// default. Please consider to increase it if count of regions on one + /// TiKV instance is greater than 20k. #[online_config(skip)] pub tso_worker_threads: usize, @@ -3254,8 +3263,8 @@ impl TiKvConfig { Ok(()) } - // As the init of `logger` is very early, this adjust needs to be separated and called - // immediately after parsing the command line. + // As the init of `logger` is very early, this adjust needs to be separated and + // called immediately after parsing the command line. 
pub fn logger_compatible_adjust(&mut self) { let default_tikv_cfg = TiKvConfig::default(); let default_log_cfg = LogConfig::default(); @@ -3373,8 +3382,8 @@ impl TiKvConfig { "server.end-point-max-tasks", "readpool.coprocessor.max-tasks-per-worker-xxx", ); // Note: - // Our `end_point_max_tasks` is mostly mistakenly configured, so we don't override - // new configuration using old values. + // Our `end_point_max_tasks` is mostly mistakenly configured, so we don't + // override new configuration using old values. self.server.end_point_max_tasks = None; } if self.raft_store.clean_stale_peer_delay.as_secs() > 0 { @@ -3390,9 +3399,9 @@ impl TiKvConfig { ); self.rocksdb.auto_tuned = None; } - // When shared block cache is enabled, if its capacity is set, it overrides individual - // block cache sizes. Otherwise use the sum of block cache size of all column families - // as the shared cache size. + // When shared block cache is enabled, if its capacity is set, it overrides + // individual block cache sizes. Otherwise use the sum of block cache + // size of all column families as the shared cache size. let cache_cfg = &mut self.storage.block_cache; if cache_cfg.shared && cache_cfg.capacity.is_none() { cache_cfg.capacity = Some(ReadableSize( @@ -3679,7 +3688,8 @@ pub fn to_flatten_config_info(cfg: &TiKvConfig) -> Vec { Value::Object(res) } - // configs that should not be flatten because the config type is HashMap instead of submodule. + // configs that should not be flatten because the config type is HashMap instead + // of submodule. lazy_static! { static ref NO_FLATTEN_CFGS: HashSet<&'static str> = { let mut set = HashSet::new(); @@ -3989,7 +3999,8 @@ impl ConfigController { // dispatched to corresponding config manager, to avoid dispatch change twice if let Some(mgr) = inner.config_mgrs.get_mut(&Module::from(name.as_str())) { if let Err(e) = mgr.dispatch(change.clone()) { - // we already verified the correctness at the beginning of this function. 
+ // we already verified the correctness at the beginning of this + // function. inner.current.update(to_update).unwrap(); return Err(e); } @@ -5129,7 +5140,7 @@ mod tests { ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /*cache*/, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); assert_eq!( config.target_file_size_base.0, cf_opts.get_target_file_size_base() @@ -5143,7 +5154,7 @@ mod tests { ..Default::default() }; let provider: Option = None; - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /*cache*/, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); assert_eq!( config.target_file_size_base.0, cf_opts.get_target_file_size_base() @@ -5159,7 +5170,7 @@ mod tests { ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /*cache*/, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); assert_eq!( config.compaction_guard_max_output_file_size.0, cf_opts.get_target_file_size_base() @@ -5190,7 +5201,8 @@ mod tests { assert!(cfg.validate().is_ok()); assert_eq!(cfg.memory_usage_limit.unwrap(), ReadableSize(5 * GIB)); - // Test memory_usage_limit will fallback to system memory capacity with huge block cache. + // Test memory_usage_limit will fallback to system memory capacity with huge + // block cache. 
cfg.memory_usage_limit = None; let system = SysQuota::memory_limit_in_bytes(); cfg.storage.block_cache.capacity = Some(ReadableSize(system * 3 / 4)); @@ -5263,7 +5275,10 @@ mod tests { fn test_background_job_limits() { // cpu num = 1 assert_eq!( - get_background_job_limits_impl(1 /*cpu_num*/, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS), + get_background_job_limits_impl( + 1, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), BackgroundJobLimits { max_background_jobs: 2, max_background_flushes: 1, @@ -5273,7 +5288,7 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( - 1, /*cpu_num*/ + 1, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { @@ -5285,7 +5300,10 @@ mod tests { ); // cpu num = 2 assert_eq!( - get_background_job_limits_impl(2 /*cpu_num*/, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS), + get_background_job_limits_impl( + 2, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), BackgroundJobLimits { max_background_jobs: 2, max_background_flushes: 1, @@ -5295,7 +5313,7 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( - 2, /*cpu_num*/ + 2, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { @@ -5307,7 +5325,10 @@ mod tests { ); // cpu num = 4 assert_eq!( - get_background_job_limits_impl(4 /*cpu_num*/, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS), + get_background_job_limits_impl( + 4, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), BackgroundJobLimits { max_background_jobs: 3, max_background_flushes: 1, @@ -5317,7 +5338,7 @@ mod tests { ); assert_eq!( get_background_job_limits_impl( - 4, /*cpu_num*/ + 4, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), BackgroundJobLimits { @@ -5329,7 +5350,10 @@ mod tests { ); // cpu num = 8 assert_eq!( - get_background_job_limits_impl(8 /*cpu_num*/, &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS), + get_background_job_limits_impl( + 8, // cpu_num + &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS + ), BackgroundJobLimits { max_background_jobs: 7, max_background_flushes: 2, @@ -5339,7 +5363,7 
@@ mod tests { ); assert_eq!( get_background_job_limits_impl( - 8, /*cpu_num*/ + 8, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS, @@ -5347,14 +5371,14 @@ mod tests { // cpu num = 16 assert_eq!( get_background_job_limits_impl( - 16, /*cpu_num*/ + 16, // cpu_num &KVDB_DEFAULT_BACKGROUND_JOB_LIMITS ), KVDB_DEFAULT_BACKGROUND_JOB_LIMITS, ); assert_eq!( get_background_job_limits_impl( - 16, /*cpu_num*/ + 16, // cpu_num &RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS ), RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS, @@ -5401,8 +5425,8 @@ mod tests { let mut default_cfg = TiKvConfig::default(); // Some default values are computed based on the environment. - // Because we can't set config values for these in `config-template.toml`, we will handle - // them manually. + // Because we can't set config values for these in `config-template.toml`, we + // will handle them manually. cfg.readpool.unified.max_thread_count = default_cfg.readpool.unified.max_thread_count; cfg.readpool.storage.high_concurrency = default_cfg.readpool.storage.high_concurrency; cfg.readpool.storage.normal_concurrency = default_cfg.readpool.storage.normal_concurrency; @@ -5428,7 +5452,8 @@ mod tests { cfg.backup_stream.num_threads = default_cfg.backup_stream.num_threads; // There is another set of config values that we can't directly compare: - // When the default values are `None`, but are then resolved to `Some(_)` later on. + // When the default values are `None`, but are then resolved to `Some(_)` later + // on. 
default_cfg.readpool.storage.adjust_use_unified_pool(); default_cfg.readpool.coprocessor.adjust_use_unified_pool(); default_cfg.security.redact_info_log = Some(false); diff --git a/src/coprocessor/dag/storage_impl.rs b/src/coprocessor/dag/storage_impl.rs index 883507452ec..46dcf7f570e 100644 --- a/src/coprocessor/dag/storage_impl.rs +++ b/src/coprocessor/dag/storage_impl.rs @@ -68,7 +68,8 @@ impl Storage for TiKvStorage { } fn scan_next(&mut self) -> QEResult> { - // Unwrap is fine because we must have called `reset_range` before calling `scan_next`. + // Unwrap is fine because we must have called `reset_range` before calling + // `scan_next`. let kv = self.scanner.as_mut().unwrap().next().map_err(Error::from)?; Ok(kv.map(|(k, v)| (k.into_raw().unwrap(), v))) } diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 2b2ae03caa2..d07d9bd5bd6 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -32,8 +32,9 @@ use crate::{ }, }; -/// Requests that need time of less than `LIGHT_TASK_THRESHOLD` is considered as light ones, -/// which means they don't need a permit from the semaphore before execution. +/// Requests that need time of less than `LIGHT_TASK_THRESHOLD` is considered as +/// light ones, which means they don't need a permit from the semaphore before +/// execution. const LIGHT_TASK_THRESHOLD: Duration = Duration::from_millis(5); /// A pool to build and run Coprocessor request handlers. @@ -79,9 +80,9 @@ impl Endpoint { resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, ) -> Self { - // FIXME: When yatp is used, we need to limit coprocessor requests in progress to avoid - // using too much memory. However, if there are a number of large requests, small requests - // will still be blocked. This needs to be improved. + // FIXME: When yatp is used, we need to limit coprocessor requests in progress + // to avoid using too much memory. 
However, if there are a number of large + // requests, small requests will still be blocked. This needs to be improved. let semaphore = match &read_pool { ReadPoolHandle::Yatp { .. } => { Some(Arc::new(Semaphore::new(cfg.end_point_max_concurrency))) @@ -139,8 +140,8 @@ impl Endpoint { Ok(()) } - /// Parse the raw `Request` to create `RequestHandlerBuilder` and `ReqContext`. - /// Returns `Err` if fails. + /// Parse the raw `Request` to create `RequestHandlerBuilder` and + /// `ReqContext`. Returns `Err` if fails. /// /// It also checks if there are locks in memory blocking this read request. fn parse_request_and_check_memory_locks( @@ -373,16 +374,17 @@ impl Endpoint { /// The real implementation of handling a unary request. /// - /// It first retrieves a snapshot, then builds the `RequestHandler` over the snapshot and - /// the given `handler_builder`. Finally, it calls the unary request interface of the - /// `RequestHandler` to process the request and produce a result. + /// It first retrieves a snapshot, then builds the `RequestHandler` over the + /// snapshot and the given `handler_builder`. Finally, it calls the unary + /// request interface of the `RequestHandler` to process the request and + /// produce a result. async fn handle_unary_request_impl( semaphore: Option>, mut tracker: Box>, handler_builder: RequestHandlerBuilder, ) -> Result> { - // When this function is being executed, it may be queued for a long time, so that - // deadline may exceed. + // When this function is being executed, it may be queued for a long time, so + // that deadline may exceed. tracker.on_scheduled(); tracker.req_ctx.deadline.check()?; @@ -445,8 +447,8 @@ impl Endpoint { /// Handle a unary request and run on the read pool. /// - /// Returns `Err(err)` if the read pool is full. Returns `Ok(future)` in other cases. - /// The future inside may be an error however. + /// Returns `Err(err)` if the read pool is full. Returns `Ok(future)` in + /// other cases. 
The future inside may be an error however. fn handle_unary_request( &self, req_ctx: ReqContext, @@ -477,9 +479,9 @@ impl Endpoint { async move { res.await? } } - /// Parses and handles a unary request. Returns a future that will never fail. If there are - /// errors during parsing or handling, they will be converted into a `Response` as the success - /// result of the future. + /// Parses and handles a unary request. Returns a future that will never + /// fail. If there are errors during parsing or handling, they will be + /// converted into a `Response` as the success result of the future. #[inline] pub fn parse_and_handle_unary_request( &self, @@ -510,9 +512,10 @@ impl Endpoint { /// The real implementation of handling a stream request. /// - /// It first retrieves a snapshot, then builds the `RequestHandler` over the snapshot and - /// the given `handler_builder`. Finally, it calls the stream request interface of the - /// `RequestHandler` multiple times to process the request and produce multiple results. + /// It first retrieves a snapshot, then builds the `RequestHandler` over the + /// snapshot and the given `handler_builder`. Finally, it calls the stream + /// request interface of the `RequestHandler` multiple times to process the + /// request and produce multiple results. fn handle_stream_request_impl( semaphore: Option>, mut tracker: Box>, @@ -585,8 +588,8 @@ impl Endpoint { /// Handle a stream request and run on the read pool. /// - /// Returns `Err(err)` if the read pool is full. Returns `Ok(stream)` in other cases. - /// The stream inside may produce errors however. + /// Returns `Err(err)` if the read pool is full. Returns `Ok(stream)` in + /// other cases. The stream inside may produce errors however. fn handle_stream_request( &self, req_ctx: ReqContext, @@ -621,9 +624,10 @@ impl Endpoint { Ok(rx) } - /// Parses and handles a stream request. Returns a stream that produce each result in a - /// `Response` and will never fail. 
If there are errors during parsing or handling, they will - /// be converted into a `Response` as the only stream item. + /// Parses and handles a stream request. Returns a stream that produce each + /// result in a `Response` and will never fail. If there are errors during + /// parsing or handling, they will be converted into a `Response` as the + /// only stream item. #[inline] pub fn parse_and_handle_stream_request( &self, @@ -1292,12 +1296,13 @@ mod tests { /// Asserted that the snapshot can be retrieved in 500ms. const SNAPSHOT_DURATION_MS: u64 = 500; - /// Asserted that the delay caused by OS scheduling other tasks is smaller than 200ms. - /// This is mostly for CI. + /// Asserted that the delay caused by OS scheduling other tasks is + /// smaller than 200ms. This is mostly for CI. const HANDLE_ERROR_MS: u64 = 200; - /// The acceptable error range for a coarse timer. Note that we use CLOCK_MONOTONIC_COARSE - /// which can be slewed by time adjustment code (e.g., NTP, PTP). + /// The acceptable error range for a coarse timer. Note that we use + /// CLOCK_MONOTONIC_COARSE which can be slewed by time + /// adjustment code (e.g., NTP, PTP). const COARSE_ERROR_MS: u64 = 50; /// The duration that payload executes. @@ -1460,11 +1465,12 @@ mod tests { // Response 1 // - // Note: `process_wall_time_ms` includes `total_process_time` and `total_suspend_time`. - // Someday it will be separated, but for now, let's just consider the combination. + // Note: `process_wall_time_ms` includes `total_process_time` and + // `total_suspend_time`. Someday it will be separated, but for now, + // let's just consider the combination. // - // In the worst case, `total_suspend_time` could be totally req2 payload. So here: - // req1 payload <= process time <= (req1 payload + req2 payload) + // In the worst case, `total_suspend_time` could be totally req2 payload. 
+ // So here: req1 payload <= process time <= (req1 payload + req2 payload) let resp = &rx.recv().unwrap()[0]; assert!(resp.get_other_error().is_empty()); assert_ge!( @@ -1482,11 +1488,12 @@ mod tests { // Response 2 // - // Note: `process_wall_time_ms` includes `total_process_time` and `total_suspend_time`. - // Someday it will be separated, but for now, let's just consider the combination. + // Note: `process_wall_time_ms` includes `total_process_time` and + // `total_suspend_time`. Someday it will be separated, but for now, + // let's just consider the combination. // - // In the worst case, `total_suspend_time` could be totally req1 payload. So here: - // req2 payload <= process time <= (req1 payload + req2 payload) + // In the worst case, `total_suspend_time` could be totally req1 payload. + // So here: req2 payload <= process time <= (req1 payload + req2 payload) let resp = &rx.recv().unwrap()[0]; assert!(!resp.get_other_error().is_empty()); assert_ge!( diff --git a/src/coprocessor/interceptors/concurrency_limiter.rs b/src/coprocessor/interceptors/concurrency_limiter.rs index d9da8b472bc..aa8b5c72f13 100644 --- a/src/coprocessor/interceptors/concurrency_limiter.rs +++ b/src/coprocessor/interceptors/concurrency_limiter.rs @@ -15,8 +15,8 @@ use tokio::sync::{Semaphore, SemaphorePermit}; use crate::coprocessor::metrics::*; -/// Limits the concurrency of heavy tasks by limiting the time spent on executing `fut` -/// before forcing to acquire a semaphore permit. +/// Limits the concurrency of heavy tasks by limiting the time spent on +/// executing `fut` before forcing to acquire a semaphore permit. /// /// The future `fut` can always run for at least `time_limit_without_permit`, /// but it needs to acquire a permit from the semaphore before it can continue. @@ -159,8 +159,8 @@ mod tests { .is_ok() ); - // Both t1 and t2 need a semaphore permit to finish. 
Although t2 is much shorter than t1, - // it starts with t1 + // Both t1 and t2 need a semaphore permit to finish. Although t2 is much shorter + // than t1, it starts with t1 smp.add_permits(1); let smp2 = smp.clone(); let mut t1 = diff --git a/src/coprocessor/interceptors/deadline.rs b/src/coprocessor/interceptors/deadline.rs index 7c7d44a6b4f..29b673aa487 100644 --- a/src/coprocessor/interceptors/deadline.rs +++ b/src/coprocessor/interceptors/deadline.rs @@ -9,8 +9,8 @@ use std::{ use pin_project::pin_project; use tikv_util::deadline::{Deadline, DeadlineError}; -/// Checks the deadline before every poll of the future. If the deadline is exceeded, -/// `DeadlineError` is returned. +/// Checks the deadline before every poll of the future. If the deadline is +/// exceeded, `DeadlineError` is returned. pub fn check_deadline( fut: F, deadline: Deadline, diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 834033a60e1..0cde193a606 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -6,16 +6,18 @@ //! By doing so, the CPU of TiKV nodes can be utilized for computing and the //! amount of data to transfer can be reduced (i.e. filtered at TiKV side). //! -//! Notice that Coprocessor handles more than simple SQL query executors (DAG request). It also -//! handles analyzing requests and checksum requests. +//! Notice that Coprocessor handles more than simple SQL query executors (DAG +//! request). It also handles analyzing requests and checksum requests. //! -//! The entry point of handling all coprocessor requests is `Endpoint`. Common steps are: -//! 1. Parse the request into a DAG request, Checksum request or Analyze request. -//! 2. Retrieve a snapshot from the underlying engine according to the given timestamp. -//! 3. Build corresponding request handlers from the snapshot and request detail. -//! 4. Run request handlers once (for unary requests) or multiple times (for streaming requests) -//! on a future thread pool. -//! 5. 
Return handling result as a response. +//! The entry point of handling all coprocessor requests is `Endpoint`. Common +//! steps are: +//! - Parse the request into a DAG request, Checksum request or Analyze request. +//! - Retrieve a snapshot from the underlying engine according to the given +//! timestamp. +//! - Build corresponding request handlers from the snapshot and request detail. +//! - Run request handlers once (for unary requests) or multiple times (for +//! streaming requests) on a future thread pool. +//! - Return handling result as a response. //! //! Please refer to `Endpoint` for more details. @@ -117,11 +119,13 @@ pub struct ReqContext { pub txn_start_ts: TimeStamp, /// The set of timestamps of locks that can be bypassed during the reading - /// because either they will be rolled back or their commit_ts > read request's start_ts. + /// because either they will be rolled back or their commit_ts > read + /// request's start_ts. pub bypass_locks: TsSet, - /// The set of timestamps of locks that value in it can be accessed during the reading - /// because they will be committed and their commit_ts <= read request's start_ts. + /// The set of timestamps of locks that value in it can be accessed during + /// the reading because they will be committed and their commit_ts <= + /// read request's start_ts. pub access_locks: TsSet, /// The data version to match. If it matches the underlying data version, diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 05a30f64c4d..70144f47ce1 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -138,9 +138,9 @@ impl AnalyzeContext { let mut row_count = 0; let mut time_slice_start = Instant::now(); let mut topn_heap = BinaryHeap::new(); - // cur_val recording the current value's data and its counts when iterating index's rows. 
- // Once we met a new value, the old value will be pushed into the topn_heap to maintain the - // top-n information. + // cur_val recording the current value's data and its counts when iterating + // index's rows. Once we met a new value, the old value will be pushed + // into the topn_heap to maintain the top-n information. let mut cur_val: (u32, Vec) = (0, vec![]); let top_n_size = req.get_top_n_size() as usize; let stats_version = if req.has_version() { @@ -444,7 +444,8 @@ impl RowSampleBuilder { } } - // Don't let analyze bandwidth limit the quota limiter, this is already limited in rate limiter. + // Don't let analyze bandwidth limit the quota limiter, this is already limited + // in rate limiter. let quota_delay = { if !self.is_auto_analyze { self.quota_limiter.consume_sample(sample, true).await @@ -858,10 +859,10 @@ impl SampleBuilder { }) } - // `collect_columns_stats` returns the sample collectors which contain total count, - // null count, distinct values count and count-min sketch. And it also returns the statistic - // builder for PK which contains the histogram. When PK is common handle, it returns index stats - // for PK. + // `collect_columns_stats` returns the sample collectors which contain total + // count, null count, distinct values count and count-min sketch. And it + // also returns the statistic builder for PK which contains the histogram. + // When PK is common handle, it returns index stats for PK. // See https://en.wikipedia.org/wiki/Reservoir_sampling async fn collect_columns_stats( &mut self, @@ -871,8 +872,8 @@ impl SampleBuilder { self.columns_info.len() - self.columns_info[0].get_pk_handle() as usize; // The number of columns need to be sampled is `columns_without_handle_len`. - // It equals to `columns_info.len()` if the first column doesn't contain a handle. - // Otherwise, it equals to `columns_info.len() - 1`. + // It equals to `columns_info.len()` if the first column doesn't contain a + // handle. 
Otherwise, it equals to `columns_info.len() - 1`. let mut pk_builder = Histogram::new(self.max_bucket_size); let mut collectors = vec![ SampleCollector::new( @@ -915,9 +916,9 @@ impl SampleBuilder { } if self.analyze_common_handle { - // cur_val recording the current value's data and its counts when iterating index's rows. - // Once we met a new value, the old value will be pushed into the topn_heap to maintain the - // top-n information. + // cur_val recording the current value's data and its counts when iterating + // index's rows. Once we met a new value, the old value will be pushed into the + // topn_heap to maintain the top-n information. let mut cur_val: (u32, Vec) = (0, vec![]); let mut topn_heap = BinaryHeap::new(); for logical_row in &result.logical_rows { @@ -979,16 +980,21 @@ impl SampleBuilder { &mut val, )?; - // This is a workaround for different encoding methods used by TiDB and TiKV for CM Sketch. - // We need this because we must ensure we are using the same encoding method when we are querying values from - // CM Sketch (in TiDB) and inserting values into CM Sketch (here). - // We are inserting raw bytes from TableScanExecutor into CM Sketch here and query CM Sketch using bytes - // encoded by tablecodec.EncodeValue() in TiDB. Their results are different after row format becomes ver 2. + // This is a workaround for different encoding methods used by TiDB and TiKV for + // CM Sketch. We need this because we must ensure we are using the same encoding + // method when we are querying values from CM Sketch (in TiDB) and inserting + // values into CM Sketch (here). + // We are inserting raw bytes from TableScanExecutor into CM Sketch here and + // query CM Sketch using bytes encoded by tablecodec.EncodeValue() in TiDB. + // Their results are different after row format becomes ver 2. 
// - // Here we (1) convert INT bytes to VAR_INT bytes, (2) convert UINT bytes to VAR_UINT bytes, - // and (3) "flatten" the duration value from DURATION bytes into i64 value, then convert it to VAR_INT bytes. - // These are the only 3 cases we need to care about according to TiDB's tablecodec.EncodeValue() and - // TiKV's V1CompatibleEncoder::write_v2_as_datum(). + // Here we: + // - convert INT bytes to VAR_INT bytes + // - convert UINT bytes to VAR_UINT bytes + // - "flatten" the duration value from DURATION bytes into i64 value, then + // convert it to VAR_INT bytes. + // These are the only 3 cases we need to care about according to TiDB's + // tablecodec.EncodeValue() and TiKV's V1CompatibleEncoder::write_v2_as_datum(). val = match val[0] { INT_FLAG | UINT_FLAG | DURATION_FLAG => { let mut mut_val = &val[..]; @@ -1037,7 +1043,8 @@ impl SampleBuilder { } } -/// `SampleCollector` will collect Samples and calculate the count, ndv and total size of an attribute. +/// `SampleCollector` will collect Samples and calculate the count, ndv and +/// total size of an attribute. #[derive(Clone)] struct SampleCollector { samples: Vec>, diff --git a/src/coprocessor/statistics/cmsketch.rs b/src/coprocessor/statistics/cmsketch.rs index e9da9c8a91d..6a3042c8ee7 100644 --- a/src/coprocessor/statistics/cmsketch.rs +++ b/src/coprocessor/statistics/cmsketch.rs @@ -36,8 +36,8 @@ impl CmSketch { } // `insert` inserts the data into cm sketch. For each row i, the position at - // (h1 + h2*i) % width will be incremented by one, where the (h1, h2) is the hash value - // of data. + // (h1 + h2*i) % width will be incremented by one, where the (h1, h2) is the + // hash value of data. 
pub fn insert(&mut self, bytes: &[u8]) { self.count = self.count.wrapping_add(1); let (h1, h2) = CmSketch::hash(bytes); diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 064073825f4..f9b908979b8 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -18,7 +18,8 @@ enum TrackerState { /// The tracker is initialized. Initialized, - /// The tracker is notified that the task is scheduled on a thread pool and start running. + /// The tracker is notified that the task is scheduled on a thread pool and + /// start running. Scheduled(Instant), /// The tracker is notified that the snapshot needed by the task is ready. @@ -36,7 +37,8 @@ enum TrackerState { /// The tracker is notified that all items just finished. AllItemFinished, - /// The tracker has finished all tracking and there will be no future operations. + /// The tracker has finished all tracking and there will be no future + /// operations. Tracked, } @@ -50,7 +52,8 @@ pub struct Tracker { wait_time: Duration, // Total wait time schedule_wait_time: Duration, // Wait time spent on waiting for scheduling snapshot_wait_time: Duration, // Wait time spent on waiting for a snapshot - handler_build_time: Duration, // Time spent on building the handler (not included in total wait time) + handler_build_time: Duration, /* Time spent on building the handler (not included in total + * wait time) */ req_lifetime: Duration, // Suspend time between processing two items @@ -75,9 +78,9 @@ pub struct Tracker { } impl Tracker { - /// Initialize the tracker. Normally it is called outside future pool's factory context, - /// because the future pool might be full and we need to wait it. This kind of wait time - /// has to be recorded. + /// Initialize the tracker. Normally it is called outside future pool's + /// factory context, because the future pool might be full and we need + /// to wait it. This kind of wait time has to be recorded. 
pub fn new(req_ctx: ReqContext, slow_log_threshold: Duration) -> Self { let now = Instant::now_coarse(); Tracker { @@ -386,8 +389,9 @@ impl Tracker { } impl Drop for Tracker { - /// `Tracker` may be dropped without even calling `on_begin_all_items`. For example, if - /// get snapshot failed. So we fast-forward if some steps are missing. + /// `Tracker` may be dropped without even calling `on_begin_all_items`. For + /// example, if get snapshot failed. So we fast-forward if some steps + /// are missing. fn drop(&mut self) { if self.current_stage == TrackerState::Initialized { self.on_scheduled(); diff --git a/src/coprocessor_v2/endpoint.rs b/src/coprocessor_v2/endpoint.rs index 6a8f3e8a5f8..da6d4aa8521 100644 --- a/src/coprocessor_v2/endpoint.rs +++ b/src/coprocessor_v2/endpoint.rs @@ -49,8 +49,9 @@ impl Endpoint { /// Handles a request to the coprocessor framework. /// - /// Each request is dispatched to the corresponding coprocessor plugin based on it's `copr_name` - /// field. A plugin with a matching name must be loaded by TiKV, otherwise an error is returned. + /// Each request is dispatched to the corresponding coprocessor plugin based + /// on it's `copr_name` field. A plugin with a matching name must be loaded + /// by TiKV, otherwise an error is returned. #[inline] pub fn handle_request( &self, diff --git a/src/coprocessor_v2/mod.rs b/src/coprocessor_v2/mod.rs index bcadbb72cfc..d1a045b7b0f 100644 --- a/src/coprocessor_v2/mod.rs +++ b/src/coprocessor_v2/mod.rs @@ -2,25 +2,27 @@ //! # TiKV's Coprocessor Framework //! -//! A coprocessor framework that allows custom, pluggable coprocessor plugins to execute arbitrary -//! user requests directly on TiKV nodes. +//! A coprocessor framework that allows custom, pluggable coprocessor plugins to +//! execute arbitrary user requests directly on TiKV nodes. //! -//! *Note: While there currently also exists a different [coprocessor][super::coprocessor] that is -//! 
designed to execute a defined set of functions on TiKV nodes, this coprocessor framework allows -//! to register "coprocessor plugins" that can execute arbitrary code directly on TiKV nodes. -//! The long-term goal is to fully replace the existing coprocessor with an equivalent plugin for -//! this coprocessor.* +//! *Note: While there currently also exists a different +//! [coprocessor][super::coprocessor] that is designed to execute a defined set +//! of functions on TiKV nodes, this coprocessor framework allows to register +//! "coprocessor plugins" that can execute arbitrary code directly on TiKV +//! nodes. The long-term goal is to fully replace the existing coprocessor with +//! an equivalent plugin for this coprocessor.* //! //! ## Background //! //! The design of the coprocessor framework follows closely the principles of -//! [HBase's coprocessor][hbase-copr] which in turn is built on the ideas of the coprocessor -//! framework in Google's BigTable. +//! [HBase's coprocessor][hbase-copr] which in turn is built on the ideas of the +//! coprocessor framework in Google's BigTable. //! -//! By registering new coprocessor plugins, users are able to extend the functionality of TiKV and -//! run code directly on storage nodes. This usually leads to dramatically increased performance -//! because the CPU of TiKV nodes can be utilized for computation and the amount of data transfer -//! can be reduced. +//! By registering new coprocessor plugins, users are able to extend the +//! functionality of TiKV and run code directly on storage nodes. This usually +//! leads to dramatically increased performance because the CPU of TiKV nodes +//! can be utilized for computation and the amount of data transfer can be +//! reduced. //! //! //! 
[hbase-copr]: https://blogs.apache.org/hbase/entry/coprocessor_introduction diff --git a/src/coprocessor_v2/plugin_registry.rs b/src/coprocessor_v2/plugin_registry.rs index bfdc5ac2fc7..c02a652fc88 100644 --- a/src/coprocessor_v2/plugin_registry.rs +++ b/src/coprocessor_v2/plugin_registry.rs @@ -100,13 +100,15 @@ impl PluginRegistry { /// Hot-reloads plugins from a given directory. /// /// All plugins that are already present in the directory will be loaded. - /// A background thread is spawned to watch file system events. If the library file of a loaded - /// plugin is deleted, the corresponding plugin is automatically unloaded; if a new library file - /// is placed into the directory, it will be automatically loaded into TiKV's coprocessor plugin - /// system. + /// A background thread is spawned to watch file system events. If the + /// library file of a loaded plugin is deleted, the corresponding plugin + /// is automatically unloaded; if a new library file is placed into the + /// directory, it will be automatically loaded into TiKV's coprocessor + /// plugin system. /// - /// A file will only be loaded if it has the proper file ending of dynamic link libraries for - /// the current platform (`.so` for Linux, `.dylib` for macOS, `.dll` for Windows). + /// A file will only be loaded if it has the proper file ending of dynamic + /// link libraries for the current platform (`.so` for Linux, `.dylib` + /// for macOS, `.dll` for Windows). pub fn start_hot_reloading( &mut self, plugin_directory: impl Into, @@ -116,9 +118,9 @@ impl PluginRegistry { // Create plugin directory if it doesn't exist. std::fs::create_dir_all(&plugin_directory)?; - // If this is the first call to `start_hot_reloading()`, create a new file system watcher - // and background thread for loading plugins. For later invocations, the same watcher and - // thread will be used. 
+ // If this is the first call to `start_hot_reloading()`, create a new file + // system watcher and background thread for loading plugins. For later + // invocations, the same watcher and thread will be used. if self.fs_watcher.is_none() { let (tx, rx) = mpsc::channel(); let fs_watcher = notify::watcher(tx, Duration::from_secs(3)).unwrap(); @@ -150,7 +152,8 @@ impl PluginRegistry { warn!("a loaded coprocessor plugin is removed. Be aware that original plugin is still running"; "plugin_path" => ?file); } Ok(DebouncedEvent::Rename(old_file, new_file)) => { - // If the file is renamed with a different parent directory, we will receive a `Remove` instead. + // If the file is renamed with a different parent directory, we will + // receive a `Remove` instead. debug_assert!(old_file.parent() == new_file.parent()); rename(&old_file, &new_file); } @@ -180,18 +183,21 @@ impl PluginRegistry { Ok(()) } - /// Finds a plugin by its name. The plugin must have been loaded before with [`load_plugin()`]. + /// Finds a plugin by its name. The plugin must have been loaded before with + /// [`load_plugin()`]. /// - /// Plugins are indexed by the name that is returned by [`CoprocessorPlugin::name()`]. + /// Plugins are indexed by the name that is returned by + /// [`CoprocessorPlugin::name()`]. pub fn get_plugin(&self, plugin_name: &str) -> Option> { self.inner.read().unwrap().get_plugin(plugin_name) } /// finds a plugin by its associated file path, similar to [`get_plugin()`]. /// - /// The given path has to be exactly the same as the one the plugin with loaded with, e.g. - /// `"./coprocessors/plugin1.so"` would be *different* from `"coprocessors/plugin1.so"` - /// (note the leading `./`). The same applies when the associated path was changed with + /// The given path has to be exactly the same as the one the plugin with + /// loaded with, e.g. `"./coprocessors/plugin1.so"` would be *different* + /// from `"coprocessors/plugin1.so"` (note the leading `./`). 
The same + /// applies when the associated path was changed with /// [`update_plugin_path()`]. pub fn get_plugin_by_path>(&self, plugin_path: P) -> Option> { self.inner.read().unwrap().get_plugin_by_path(plugin_path) @@ -200,7 +206,8 @@ impl PluginRegistry { /// Returns the names of the currently loaded plugins. /// The order of plugin names is arbitrary. pub fn loaded_plugin_names(&self) -> Vec { - // Collect names into vector so we can release the `RwLockReadGuard` before we return. + // Collect names into vector so we can release the `RwLockReadGuard` before we + // return. self.inner .read() .unwrap() @@ -211,9 +218,9 @@ impl PluginRegistry { /// Loads a [`CoprocessorPlugin`] from a `dylib`. /// - /// After this function has successfully finished, the plugin is registered with the - /// [`PluginRegistry`] and can later be obtained by calling [`get_plugin()`] with the proper - /// name. + /// After this function has successfully finished, the plugin is registered + /// with the [`PluginRegistry`] and can later be obtained by calling + /// [`get_plugin()`] with the proper name. /// /// Returns the name of the loaded plugin. pub fn load_plugin>(&self, file_name: P) -> Result { @@ -223,10 +230,12 @@ impl PluginRegistry { /// Attempts to load all plugins from a given directory. /// /// Returns a list of the names of all successfully loaded plugins. - /// If a file could not be successfully loaded as a plugin, it will be discarded. + /// If a file could not be successfully loaded as a plugin, it will be + /// discarded. /// - /// The plugins have to follow the system's naming convention in order to be loaded, e.g. `.so` - /// for Linux, `.dylib` for macOS and `.dll` for Windows. + /// The plugins have to follow the system's naming convention in order to be + /// loaded, e.g. `.so` for Linux, `.dylib` for macOS and `.dll` for + /// Windows. 
pub fn load_plugins_from_dir( &self, dir_name: impl Into, @@ -255,8 +264,8 @@ impl PluginRegistry { /// Updates the associated file path for plugin. /// - /// This function should be used to maintain consistent state when the underlying file of a - /// plugin was renamed or moved. + /// This function should be used to maintain consistent state when the + /// underlying file of a plugin was renamed or moved. pub fn update_plugin_path>(&self, plugin_name: &str, new_path: P) { self.inner .write() @@ -264,7 +273,8 @@ impl PluginRegistry { .update_plugin_path(plugin_name, new_path) } - /// Returns the associated file path for the plugin for the given `plugin_name`. + /// Returns the associated file path for the plugin for the given + /// `plugin_name`. pub fn get_path_for_plugin(&self, plugin_name: &str) -> Option { self.inner .read() @@ -368,24 +378,26 @@ pub struct LoadedPlugin { } impl LoadedPlugin { - /// Creates a new `LoadedPlugin` by loading a `dylib` from a file into memory. + /// Creates a new `LoadedPlugin` by loading a `dylib` from a file into + /// memory. /// /// The `file_path` argument may be any of: - /// * A simple filename of a library if the library is in any of the platform-specific locations - /// from where libraries are usually loaded, e.g. the current directory or in - /// `LD_LIBRARY_PATH` on unix systems. + /// * A simple filename of a library if the library is in any of the + /// platform-specific locations from where libraries are usually loaded, + /// e.g. the current directory or in `LD_LIBRARY_PATH` on unix systems. /// * Absolute path to the library /// * Relative (to the current working directory) path to the library /// - /// The function instantiates the plugin by calling `_plugin_create()` to obtain a - /// [`CoprocessorPlugin`]. + /// The function instantiates the plugin by calling `_plugin_create()` to + /// obtain a [`CoprocessorPlugin`]. 
/// /// # Safety /// - /// The library **must** contain a function with name [`PLUGIN_CONSTRUCTOR_SYMBOL`] and the - /// signature of [`PluginConstructorSignature`]. Otherwise, behavior is undefined. - /// See also [`libloading::Library::get()`] for more information on what restrictions apply to - /// [`PLUGIN_CONSTRUCTOR_SYMBOL`]. + /// The library **must** contain a function with name + /// [`PLUGIN_CONSTRUCTOR_SYMBOL`] and the signature of + /// [`PluginConstructorSignature`]. Otherwise, behavior is undefined. + /// See also [`libloading::Library::get()`] for more information on what + /// restrictions apply to [`PLUGIN_CONSTRUCTOR_SYMBOL`]. pub unsafe fn new>(file_path: P) -> Result { let lib = Library::new(&file_path)?; @@ -559,7 +571,8 @@ mod tests { // trigger loading std::fs::copy(&original_library_path, &library_path).unwrap(); - // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so as to make sure the watcher is triggered. + // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so + // as to make sure the watcher is triggered. std::thread::sleep(Duration::from_secs(4)); assert!(registry.get_plugin(plugin_name).is_some()); @@ -570,7 +583,8 @@ mod tests { // trigger rename std::fs::rename(&library_path, &library_path_2).unwrap(); - // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so as to make sure the watcher is triggered. + // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so + // as to make sure the watcher is triggered. std::thread::sleep(Duration::from_secs(4)); assert!(registry.get_plugin(plugin_name).is_some()); @@ -580,7 +594,8 @@ mod tests { ); std::fs::remove_file(&library_path_2).unwrap(); - // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so as to make sure the watcher is triggered. + // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so + // as to make sure the watcher is triggered. 
std::thread::sleep(Duration::from_secs(4)); // plugin will not be unloadad diff --git a/src/coprocessor_v2/raw_storage_impl.rs b/src/coprocessor_v2/raw_storage_impl.rs index 7ef7b59010a..fc505c50312 100644 --- a/src/coprocessor_v2/raw_storage_impl.rs +++ b/src/coprocessor_v2/raw_storage_impl.rs @@ -19,16 +19,18 @@ use crate::storage::{ /// Implementation of the [`RawStorage`] trait. /// -/// It wraps TiKV's [`Storage`] into an API that is exposed to coprocessor plugins. -/// The `RawStorageImpl` should be constructed for every invocation of a [`CoprocessorPlugin`] as -/// it wraps a [`Context`] that is unique for every request. +/// It wraps TiKV's [`Storage`] into an API that is exposed to coprocessor +/// plugins. The `RawStorageImpl` should be constructed for every invocation of +/// a [`CoprocessorPlugin`] as it wraps a [`Context`] that is unique for every +/// request. pub struct RawStorageImpl<'a, E: Engine, L: LockManager, F: KvFormat> { context: Context, storage: &'a Storage, } impl<'a, E: Engine, L: LockManager, F: KvFormat> RawStorageImpl<'a, E, L, F> { - /// Constructs a new `RawStorageImpl` that wraps a given [`Context`] and [`Storage`]. + /// Constructs a new `RawStorageImpl` that wraps a given [`Context`] and + /// [`Storage`]. pub fn new(context: Context, storage: &'a Storage) -> Self { RawStorageImpl { context, storage } } @@ -190,7 +192,8 @@ impl From for PluginErrorShim { storage::errors::ErrorInner::Kv(KvError(box KvErrorInner::Timeout(duration))) => { PluginError::Timeout(duration) } - // Other errors are passed as-is inside their `Result` so we get a `&Result` when using `Any::downcast_ref`. + // Other errors are passed as-is inside their `Result` so we get a `&Result` when using + // `Any::downcast_ref`. 
_ => PluginError::Other( format!("{}", &error), Box::new(storage::Result::<()>::Err(error)), diff --git a/src/import/duplicate_detect.rs b/src/import/duplicate_detect.rs index 3ae9360e727..86e955c6cd2 100644 --- a/src/import/duplicate_detect.rs +++ b/src/import/duplicate_detect.rs @@ -397,13 +397,15 @@ mod tests { } // There are 40 key-value pairs in db, there are - // [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] with commit timestamp 10 - // [104, 105, 106, 107, 108, 109, 110, 111, 112, 113] with commit timestamp 14, these 20 keys - // have existed in db before importing. So we do not think (105,10) is repeated with (105,14). - // [108, 109, 110, 111, 112, 113, 114, 115, 116, 117] with commit timestamp 18 - // [112, 113, 114, 115, 116, 117, 118, 119, 120, 121] with commit timestamp 22, these 20 keys - // are imported by lightning. So (108,18) is repeated with (108,14), but (108,18) is not repeated - // with (108,10). + // - [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] with commit timestamp 10 + // - [104, 105, 106, 107, 108, 109, 110, 111, 112, 113] with commit timestamp + // 14, these 20 keys have existed in db before importing. So we do not think + // (105,10) is repeated with (105,14). + // - [108, 109, 110, 111, 112, 113, 114, 115, 116, 117] with commit timestamp 18 + // - [112, 113, 114, 115, 116, 117, 118, 119, 120, 121] with commit timestamp + // 22, these 20 keys + // are imported by lightning. So (108,18) is repeated with (108,14), but + // (108,18) is not repeated with (108,10). #[test] fn test_duplicate_detect_incremental() { let storage = TestStorageBuilderApiV1::new(DummyLockManager) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index a81a34b1e71..36089e41fd1 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -422,7 +422,8 @@ where self.threads.spawn_ok(handle_task); } - // Downloads KV file and performs key-rewrite then apply kv into this tikv store. 
+ // Downloads KV file and performs key-rewrite then apply kv into this tikv + // store. fn apply( &mut self, _ctx: RpcContext<'_>, @@ -629,7 +630,6 @@ where } /// Ingest multiple files by sending a raft command to raftstore. - /// fn multi_ingest( &mut self, ctx: RpcContext<'_>, @@ -858,7 +858,8 @@ fn pb_error_inc(type_: &str, e: &errorpb::Error) { enum RequestCollector { /// Retain the last ts of each key in each request. - /// This is used for write CF because resolved ts observer hates duplicated key in the same request. + /// This is used for write CF because resolved ts observer hates duplicated + /// key in the same request. RetainLastTs(HashMap, (Request, u64)>), /// Collector favor that simple collect all items. /// This is used for default CF. @@ -941,9 +942,10 @@ fn make_request(reqs: &mut RequestCollector, context: Context) -> RaftCmdRequest let mut cmd = RaftCmdRequest::default(); let mut header = make_request_header(context); // Set the UUID of header to prevent raftstore batching our requests. - // The current `resolved_ts` observer assumes that each batch of request doesn't has - // two writes to the same key. (Even with 2 different TS). That was true for normal cases - // because the latches reject concurrency write to keys. However we have bypassed the latch layer :( + // The current `resolved_ts` observer assumes that each batch of request doesn't + // has two writes to the same key. (Even with 2 different TS). That was true + // for normal cases because the latches reject concurrency write to keys. 
+ // However we have bypassed the latch layer :( header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); cmd.set_header(header); cmd.set_requests(reqs.drain().into()); diff --git a/src/server/config.rs b/src/server/config.rs index 8a581d5eeba..88d167d2e64 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -164,9 +164,10 @@ pub struct Config { #[doc(hidden)] #[online_config(skip)] - /// When TiKV memory usage reaches `memory_usage_high_water` it will try to limit memory - /// increasing. For server layer some messages will be rejected or droped, if they utilize - /// memory more than `reject_messages_on_memory_ratio` * total. + /// When TiKV memory usage reaches `memory_usage_high_water` it will try to + /// limit memory increasing. For server layer some messages will be rejected + /// or dropped, if they utilize memory more than + /// `reject_messages_on_memory_ratio` * total. /// /// Set it to 0 can disable message rejecting. // By default it's 0.2. So for different memory capacity, messages are rejected when: @@ -384,8 +385,8 @@ impl Config { } if self.heavy_load_threshold > 100 { - // The configuration has been changed to describe CPU usage of a single thread instead - // of all threads. So migrate from the old style. + // The configuration has been changed to describe CPU usage of a single thread + // instead of all threads. So migrate from the old style. self.heavy_load_threshold = 75; } diff --git a/src/server/debug.rs b/src/server/debug.rs index 7bfa2aa438e..03630cf930a 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -337,7 +337,8 @@ impl Debugger { } /// Set regions to tombstone by manual, and apply other status(such as - /// peers, version, and key range) from `region` which comes from PD normally. + /// peers, version, and key range) from `region` which comes from PD + /// normally. 
pub fn set_region_tombstone(&self, regions: Vec) -> Result> { let store_id = self.get_store_ident()?.get_store_id(); let db = &self.engines.kv; @@ -675,8 +676,9 @@ impl Debugger { for region_id in region_ids { let region_state = self.region_info(region_id)?; - // It's safe to unwrap region_local_state here, because get_all_regions_in_store() - // guarantees that the region state exists in kvdb. + // It's safe to unwrap region_local_state here, because + // get_all_regions_in_store() guarantees that the region state + // exists in kvdb. if region_state.region_local_state.unwrap().state == PeerState::Tombstone { continue; } @@ -1933,7 +1935,8 @@ mod tests { // last index < commit index mock_raft_state(&mut wb1, 10, 100, 110); - // commit index < last index < apply index, or commit index < apply index < last index. + // commit index < last index < apply index, or commit index < apply index < last + // index. mock_raft_state(&mut wb1, 11, 100, 90); mock_apply_state(&mut wb2, 11, 110); mock_raft_state(&mut wb1, 12, 100, 90); @@ -2012,10 +2015,14 @@ mod tests { lock.extend(vec![ // key, start_ts, for_update_ts, lock_type, short_value, check (b"k1", 100, 0, LockType::Put, false, Expect::Remove), // k1: remove orphan lock. - (b"k2", 100, 0, LockType::Delete, false, Expect::Keep), // k2: Delete doesn't need default. - (b"k3", 100, 0, LockType::Put, true, Expect::Keep), // k3: short value doesn't need default. - (b"k4", 100, 0, LockType::Put, false, Expect::Keep), // k4: corresponding default exists. - (b"k5", 100, 0, LockType::Put, false, Expect::Remove), // k5: duplicated lock and write. + (b"k2", 100, 0, LockType::Delete, false, Expect::Keep), /* k2: Delete doesn't need + * default. */ + (b"k3", 100, 0, LockType::Put, true, Expect::Keep), /* k3: short value doesn't need + * default. */ + (b"k4", 100, 0, LockType::Put, false, Expect::Keep), /* k4: corresponding default + * exists. 
*/ + (b"k5", 100, 0, LockType::Put, false, Expect::Remove), /* k5: duplicated lock and + * write. */ ]); write.extend(vec![ // key, start_ts, commit_ts, write_type, short_value, check @@ -2032,11 +2039,14 @@ mod tests { ]); write.extend(vec![ // key, start_ts, commit_ts, write_type, short_value - (b"k6", 100, 101, WriteType::Put, true, Expect::Keep), // short value doesn't need default. - (b"k6", 99, 99, WriteType::Rollback, false, Expect::Keep), // rollback doesn't need default. - (b"k6", 97, 98, WriteType::Delete, false, Expect::Keep), // delete doesn't need default. - (b"k6", 94, 94, WriteType::Put, false, Expect::Keep), // ok. - (b"k6", 92, 93, WriteType::Put, false, Expect::Remove), // extra write. + (b"k6", 100, 101, WriteType::Put, true, Expect::Keep), /* short value doesn't need + * default. */ + (b"k6", 99, 99, WriteType::Rollback, false, Expect::Keep), /* rollback doesn't need + * default. */ + (b"k6", 97, 98, WriteType::Delete, false, Expect::Keep), /* delete doesn't need + * default. */ + (b"k6", 94, 94, WriteType::Put, false, Expect::Keep), // ok. + (b"k6", 92, 93, WriteType::Put, false, Expect::Remove), // extra write. 
(b"k6", 90, 91, WriteType::Delete, false, Expect::Keep), (b"k6", 88, 89, WriteType::Put, true, Expect::Keep), ]); @@ -2066,7 +2076,9 @@ mod tests { lock.extend(vec![ // key, start_ts, for_update_ts, lock_type, short_value, check (b"k8", 90, 105, LockType::Pessimistic, false, Expect::Remove), // newer writes exist - (b"k9", 90, 115, LockType::Put, true, Expect::Keep), // prewritten lock from a pessimistic txn + (b"k9", 90, 115, LockType::Put, true, Expect::Keep), /* prewritten lock + * from a pessimistic + * txn */ ]); write.extend(vec![ // key, start_ts, commit_ts, write_type, short_value diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 854c1fdd356..59315b4732d 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -182,12 +182,11 @@ impl KvEngineFactory { self.inner.api_version, ); // TODOTODO: call rust-rocks or tirocks to destroy_engine; - /* - engine_rocks::util::destroy_engine( - tablet_path.to_str().unwrap(), - kv_db_opts, - kv_cfs_opts, - )?;*/ + // engine_rocks::util::destroy_engine( + // tablet_path.to_str().unwrap(), + // kv_db_opts, + // kv_cfs_opts, + // )?; let _ = std::fs::remove_dir_all(tablet_path); Ok(()) } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index e5237187886..80366cc17d1 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -300,8 +300,8 @@ mod tests { assert!(!factory.is_tombstoned(1, 10)); assert!(factory.load_tablet(&tablet_path, 1, 10).is_err()); assert!(factory.load_tablet(&tablet_path, 1, 20).is_ok()); - // After we load it as with the new id or suffix, we should be unable to get it with - // the old id and suffix in the cache. + // After we load it as with the new id or suffix, we should be unable to get it + // with the old id and suffix in the cache. 
assert!(factory.open_tablet_cache(1, 10).is_none()); assert!(factory.open_tablet_cache(1, 20).is_some()); diff --git a/src/server/gc_worker/applied_lock_collector.rs b/src/server/gc_worker/applied_lock_collector.rs index 9c30afc350b..a013d742890 100644 --- a/src/server/gc_worker/applied_lock_collector.rs +++ b/src/server/gc_worker/applied_lock_collector.rs @@ -34,9 +34,10 @@ const MAX_COLLECT_SIZE: usize = 1024; struct LockObserverState { max_ts: AtomicU64, - /// `is_clean` is true, only it's sure that all applying of stale locks (locks with start_ts <= - /// specified max_ts) are monitored and collected. If there are too many stale locks or any - /// error happens, `is_clean` must be set to `false`. + /// `is_clean` is true, only it's sure that all applying of stale locks + /// (locks with start_ts <= specified max_ts) are monitored and collected. + /// If there are too many stale locks or any error happens, `is_clean` + /// must be set to `false`. is_clean: AtomicBool, } @@ -112,9 +113,10 @@ impl Display for LockCollectorTask { } } -/// `LockObserver` observes apply events and apply snapshot events. If it happens in CF_LOCK, it -/// checks the `start_ts`s of the locks being written. If a lock's `start_ts` <= specified `max_ts` -/// in the `state`, it will send the lock to through the `sender`, so the receiver can collect it. +/// `LockObserver` observes apply events and apply snapshot events. If it +/// happens in CF_LOCK, it checks the `start_ts`s of the locks being written. If +/// a lock's `start_ts` <= specified `max_ts` in the `state`, it will send the +/// lock to through the `sender`, so the receiver can collect it. #[derive(Clone)] struct LockObserver { state: Arc, @@ -310,9 +312,9 @@ impl LockCollectorRunner { Greater => { info!("start collecting locks"; "max_ts" => max_ts); self.collected_locks.clear(); - // TODO: `is_clean` may be unexpectedly set to false here, if any error happens on a - // previous observing. 
It need to be solved, although it's very unlikely to happen and - // doesn't affect correctness of data. + // TODO: `is_clean` may be unexpectedly set to false here, if any error happens + // on a previous observing. It need to be solved, although it's very unlikely to + // happen and doesn't affect correctness of data. self.observer_state.mark_clean(); self.observer_state.store_max_ts(max_ts); Ok(()) @@ -420,21 +422,22 @@ impl AppliedLockCollector { self.worker.lock().unwrap().stop(); } - /// Starts collecting applied locks whose `start_ts` <= `max_ts`. Only one `max_ts` is valid - /// at one time. + /// Starts collecting applied locks whose `start_ts` <= `max_ts`. Only one + /// `max_ts` is valid at one time. pub fn start_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { - // Before starting collecting, check the concurrency manager to avoid later prewrite - // requests uses a min_commit_ts less than the safepoint. + // Before starting collecting, check the concurrency manager to avoid later + // prewrite requests uses a min_commit_ts less than the safepoint. // `max_ts` here is the safepoint of the current round of GC. - // Ths is similar to that we update max_ts and check memory lock when handling other - // transactional read requests. However this is done at start_collecting instead of - // physical_scan_locks. The reason is that, to fully scan a TiKV store, it might needs more - // than one physical_scan_lock requests. However memory lock needs to be checked before - // scanning the locks, and we can't know the `end_key` of the scan range at that time. As - // a result, each physical_scan_lock request will cause scanning memory lock from the - // start_key to the very-end of the TiKV node, which is a waste. But since we always start - // collecting applied locks before physical scan lock, so a better idea is to check the - // memory lock before physical_scan_lock. 
+ // Ths is similar to that we update max_ts and check memory lock when handling + // other transactional read requests. However this is done at start_collecting + // instead of physical_scan_locks. The reason is that, to fully scan a TiKV + // store, it might needs more than one physical_scan_lock requests. However + // memory lock needs to be checked before scanning the locks, and we can't know + // the `end_key` of the scan range at that time. As a result, each + // physical_scan_lock request will cause scanning memory lock from the start_key + // to the very-end of the TiKV node, which is a waste. But since we always start + // collecting applied locks before physical scan lock, so a better idea is to + // check the memory lock before physical_scan_lock. self.concurrency_manager.update_max_ts(max_ts); self.concurrency_manager .read_range_check(None, None, |key, lock| { @@ -453,10 +456,11 @@ impl AppliedLockCollector { .map_err(|e| box_err!("failed to schedule task: {:?}", e)) } - /// Get the collected locks after `start_collecting`. Only valid when `max_ts` matches the - /// `max_ts` provided to `start_collecting`. - /// Collects at most `MAX_COLLECT_SIZE` locks. If there are (even potentially) more locks than - /// `MAX_COLLECT_SIZE` or any error happens, the flag `is_clean` will be unset, which represents + /// Get the collected locks after `start_collecting`. Only valid when + /// `max_ts` matches the `max_ts` provided to `start_collecting`. + /// Collects at most `MAX_COLLECT_SIZE` locks. If there are (even + /// potentially) more locks than `MAX_COLLECT_SIZE` or any error happens, + /// the flag `is_clean` will be unset, which represents /// `AppliedLockCollector` cannot collect all locks. pub fn get_collected_locks( &self, @@ -468,8 +472,8 @@ impl AppliedLockCollector { .map_err(|e| box_err!("failed to schedule task: {:?}", e)) } - /// Stop collecting locks. Only valid when `max_ts` matches the `max_ts` provided to - /// `start_collecting`. 
+ /// Stop collecting locks. Only valid when `max_ts` matches the `max_ts` + /// provided to `start_collecting`. pub fn stop_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { self.scheduler .schedule(LockCollectorTask::StopCollecting { max_ts, callback }) @@ -588,8 +592,8 @@ mod tests { get_collected_locks(&c, 2).unwrap_err(); stop_collecting(&c, 2).unwrap_err(); - // When start_collecting is invoked with a larger ts, the later one will ovewrite the - // previous one. + // When start_collecting is invoked with a larger ts, the later one will + // ovewrite the previous one. start_collecting(&c, 3).unwrap(); assert_eq!(c.concurrency_manager.max_ts(), 3.into()); get_collected_locks(&c, 3).unwrap(); @@ -703,7 +707,8 @@ mod tests { (expected_result.clone(), true) ); - // When start collecting with the same max_ts again, shouldn't clean up the observer state. + // When start collecting with the same max_ts again, shouldn't clean up the + // observer state. start_collecting(&c, 100).unwrap(); assert_eq!( get_collected_locks(&c, 100).unwrap(), @@ -727,8 +732,8 @@ mod tests { (expected_result, true) ); - // When start_collecting is double-invoked again with larger ts, the previous results are - // dropped. + // When start_collecting is double-invoked again with larger ts, the previous + // results are dropped. start_collecting(&c, 110).unwrap(); assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req)); @@ -789,8 +794,8 @@ mod tests { (expected_locks.clone(), true) ); - // When stale start_collecting request arrives, the previous collected results shouldn't - // be dropped. + // When stale start_collecting request arrives, the previous collected results + // shouldn't be dropped. 
start_collecting(&c, 100).unwrap(); assert_eq!( get_collected_locks(&c, 100).unwrap(), @@ -802,8 +807,8 @@ mod tests { (expected_locks, true) ); - // When start_collecting is double-invoked again with larger ts, the previous results are - // dropped. + // When start_collecting is double-invoked again with larger ts, the previous + // results are dropped. start_collecting(&c, 110).unwrap(); assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); coprocessor_host.post_apply_plain_kvs_from_snapshot(&Region::default(), CF_LOCK, &lock_kvs); @@ -813,8 +818,8 @@ mod tests { coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_DEFAULT, ""); assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks.clone(), true)); - // Apply SST file to lock cf is not supported. This will cause error and therefore - // `is_clean` will be set to false. + // Apply SST file to lock cf is not supported. This will cause error and + // therefore `is_clean` will be set to false. coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_LOCK, ""); assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks, false)); } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 8d914080279..165a1f62ddf 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -41,13 +41,15 @@ use crate::{ const DEFAULT_DELETE_BATCH_SIZE: usize = 256 * 1024; pub const DEFAULT_DELETE_BATCH_COUNT: usize = 128; -// The default version that can enable compaction filter for GC. This is necessary because after -// compaction filter is enabled, it's impossible to fallback to ealier version which modifications -// of GC are distributed to other replicas by Raft. +// The default version that can enable compaction filter for GC. 
This is +// necessary because after compaction filter is enabled, it's impossible to +// fallback to earlier version which modifications of GC are distributed to +// other replicas by Raft. const COMPACTION_FILTER_GC_FEATURE: Feature = Feature::require(5, 0, 0); -// Global context to create a compaction filter for write CF. It's necessary as these fields are -// not available when constructing `WriteCompactionFilterFactory`. +// Global context to create a compaction filter for write CF. It's necessary as +// these fields are not available when constructing +// `WriteCompactionFilterFactory`. pub struct GcContext { pub(crate) db: RocksEngine, pub(crate) store_id: u64, @@ -338,8 +340,8 @@ impl WriteCompactionFilter { } } - // `log_on_error` indicates whether to print an error log on scheduling failures. - // It's only enabled for `GcTask::OrphanVersions`. + // `log_on_error` indicates whether to print an error log on scheduling + // failures. It's only enabled for `GcTask::OrphanVersions`. fn schedule_gc_task(&self, task: GcTask, log_on_error: bool) { match self.gc_scheduler.schedule(task) { Ok(_) => {} @@ -432,7 +434,7 @@ impl WriteCompactionFilter { } self.filtered += 1; self.handle_filtered_write(write)?; - self.flush_pending_writes_if_need(false /*force*/)?; + self.flush_pending_writes_if_need(false /* force */)?; let decision = if self.remove_older { // Use `Decision::RemoveAndSkipUntil` instead of `Decision::Remove` to avoid // leaving tombstones, which can only be freed at the bottommost level. @@ -566,8 +568,8 @@ thread_local! { } impl Drop for WriteCompactionFilter { - // NOTE: it's required that `CompactionFilter` is dropped before the compaction result - // becomes installed into the DB instance. + // NOTE: it's required that `CompactionFilter` is dropped before the compaction + // result becomes installed into the DB instance. 
fn drop(&mut self) { if self.mvcc_deletion_overlaps.take() == Some(0) { self.handle_bottommost_delete(); @@ -652,7 +654,7 @@ fn check_need_gc( ratio_threshold: f64, context: &CompactionFilterContext, ) -> bool { - let check_props = |props: &MvccProperties| -> (bool, bool /*skip_more_checks*/) { + let check_props = |props: &MvccProperties| -> (bool, bool /* skip_more_checks */) { if props.min_ts > safe_point { return (false, false); } @@ -668,8 +670,9 @@ fn check_need_gc( return (true, false); } - // When comparing `num_versions` with `num_puts`, trait internal levels specially - // because MVCC-deletion marks can't be handled at those levels. + // When comparing `num_versions` with `num_puts`, trait internal levels + // specially because MVCC-deletion marks can't be handled at those + // levels. let num_rollback_and_locks = (props.num_versions - props.num_deletes) as f64; if num_rollback_and_locks > props.num_puts as f64 * ratio_threshold { return (true, false); @@ -973,7 +976,8 @@ pub mod tests { must_prewrite_delete(&engine, b"zkey", b"zkey", 120); must_commit(&engine, b"zkey", 120, 130); - // No GC task should be emit because the mvcc-deletion mark covers some older versions. + // No GC task should be emit because the mvcc-deletion mark covers some older + // versions. gc_and_check(false, b"zkey"); // A GC task should be emit after older versions are cleaned. gc_and_check(true, b"zkey"); @@ -995,14 +999,15 @@ pub mod tests { must_prewrite_put(&engine, b"zkey2", &value, b"zkey2", 220); must_commit(&engine, b"zkey2", 220, 230); - // No GC task should be emit because the mvcc-deletion mark covers some older versions. + // No GC task should be emit because the mvcc-deletion mark covers some older + // versions. gc_and_check(false, b"zkey1"); // A GC task should be emit after older versions are cleaned. gc_and_check(true, b"zkey1"); } - // Test if there are not enought garbage in SST files involved by a compaction, no compaction - // filter will be created. 
+ // Test if there are not enought garbage in SST files involved by a compaction, + // no compaction filter will be created. #[test] fn test_mvcc_properties() { let mut cfg = DbConfig::default(); @@ -1031,7 +1036,8 @@ pub mod tests { gc_runner.target_level = Some(6); gc_runner.safe_point(100).gc(&raw_engine); - // Can perform GC at the bottommost level even if the threshold can't be reached. + // Can perform GC at the bottommost level even if the threshold can't be + // reached. gc_runner.ratio_threshold = Some(10.0); gc_runner.target_level = Some(6); gc_runner.safe_point(140).gc(&raw_engine); @@ -1062,12 +1068,12 @@ pub mod tests { } } - // If we use `CompactionFilterDecision::RemoveAndSkipUntil` in compaction filters, - // deletion marks can only be handled in the bottommost level. Otherwise dirty - // versions could be exposed incorrectly. + // If we use `CompactionFilterDecision::RemoveAndSkipUntil` in compaction + // filters, deletion marks can only be handled in the bottommost level. + // Otherwise dirty versions could be exposed incorrectly. // - // This case tests that deletion marks won't be handled at internal levels, and at - // the bottommost levels, dirty versions still can't be exposed. + // This case tests that deletion marks won't be handled at internal levels, and + // at the bottommost levels, dirty versions still can't be exposed. #[test] fn test_remove_and_skip_until() { let mut cfg = DbConfig::default(); diff --git a/src/server/gc_worker/config.rs b/src/server/gc_worker/config.rs index 9406e39d993..1816dd845e1 100644 --- a/src/server/gc_worker/config.rs +++ b/src/server/gc_worker/config.rs @@ -18,8 +18,9 @@ pub struct GcConfig { pub batch_keys: usize, pub max_write_bytes_per_sec: ReadableSize, pub enable_compaction_filter: bool, - /// By default compaction_filter can only works if `cluster_version` is greater than 5.0.0. - /// Change `compaction_filter_skip_version_check` can enable it by force. 
+ /// By default compaction_filter can only works if `cluster_version` is + /// greater than 5.0.0. Change `compaction_filter_skip_version_check` + /// can enable it by force. pub compaction_filter_skip_version_check: bool, } diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index b009c80b728..7fdc440527f 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -36,18 +36,19 @@ pub struct AutoGcConfig { pub safe_point_provider: S, pub region_info_provider: R, - /// Used to find which peer of a region is on this TiKV, so that we can compose a `Context`. + /// Used to find which peer of a region is on this TiKV, so that we can + /// compose a `Context`. pub self_store_id: u64, pub poll_safe_point_interval: Duration, - /// If this is set, safe_point will be checked before doing GC on every region while working. - /// Otherwise safe_point will be only checked when `poll_safe_point_interval` has past since - /// last checking. + /// If this is set, safe_point will be checked before doing GC on every + /// region while working. Otherwise safe_point will be only checked when + /// `poll_safe_point_interval` has past since last checking. pub always_check_safe_point: bool, - /// This will be called when a round of GC has finished and goes back to idle state. - /// This field is for test purpose. + /// This will be called when a round of GC has finished and goes back to + /// idle state. This field is for test purpose. pub post_a_round_of_gc: Option>, } @@ -64,8 +65,8 @@ impl AutoGcConfig { } } - /// Creates a config for test purpose. The interval to poll safe point is as short as 0.1s and - /// during GC it never skips checking safe point. + /// Creates a config for test purpose. The interval to poll safe point is as + /// short as 0.1s and during GC it never skips checking safe point. 
pub fn new_test_cfg( safe_point_provider: S, region_info_provider: R, @@ -82,8 +83,8 @@ impl AutoGcConfig { } } -/// The only error that will break `GcManager`'s process is that the `GcManager` is interrupted by -/// others, maybe due to TiKV shutting down. +/// The only error that will break `GcManager`'s process is that the `GcManager` +/// is interrupted by others, maybe due to TiKV shutting down. #[derive(Debug)] enum GcManagerError { Stopped, @@ -93,11 +94,12 @@ type GcManagerResult = std::result::Result; /// Used to check if `GcManager` should be stopped. /// -/// When `GcManager` is running, it might take very long time to GC a round. It should be able to -/// break at any time so that we can shut down TiKV in time. +/// When `GcManager` is running, it might take very long time to GC a round. It +/// should be able to break at any time so that we can shut down TiKV in time. pub(super) struct GcManagerContext { - /// Used to receive stop signal. The sender side is hold in `GcManagerHandle`. - /// If this field is `None`, the `GcManagerContext` will never stop. + /// Used to receive stop signal. The sender side is hold in + /// `GcManagerHandle`. If this field is `None`, the `GcManagerContext` + /// will never stop. stop_signal_receiver: Option>, /// Whether an stop signal is received. is_stopped: bool, @@ -111,14 +113,15 @@ impl GcManagerContext { } } - /// Sets the receiver that used to receive the stop signal. `GcManagerContext` will be - /// considered to be stopped as soon as a message is received from the receiver. + /// Sets the receiver that used to receive the stop signal. + /// `GcManagerContext` will be considered to be stopped as soon as a + /// message is received from the receiver. pub fn set_stop_signal_receiver(&mut self, rx: mpsc::Receiver<()>) { self.stop_signal_receiver = Some(rx); } - /// Sleeps for a while. if a stop message is received, returns immediately with - /// `GcManagerError::Stopped`. + /// Sleeps for a while. 
if a stop message is received, returns immediately + /// with `GcManagerError::Stopped`. fn sleep_or_stop(&mut self, timeout: Duration) -> GcManagerResult<()> { if self.is_stopped { return Err(GcManagerError::Stopped); @@ -141,8 +144,8 @@ impl GcManagerContext { } } - /// Checks if a stop message has been fired. Returns `GcManagerError::Stopped` if there's such - /// a message. + /// Checks if a stop message has been fired. Returns + /// `GcManagerError::Stopped` if there's such a message. fn check_stopped(&mut self) -> GcManagerResult<()> { if self.is_stopped { return Err(GcManagerError::Stopped); @@ -197,7 +200,8 @@ fn set_status_metrics(state: GcManagerState) { } } -/// Wraps `JoinHandle` of `GcManager` and helps to stop the `GcManager` synchronously. +/// Wraps `JoinHandle` of `GcManager` and helps to stop the `GcManager` +/// synchronously. pub(super) struct GcManagerHandle { join_handle: JoinHandle<()>, stop_signal_sender: mpsc::Sender<()>, @@ -218,13 +222,15 @@ impl GcManagerHandle { } /// Controls how GC runs automatically on the TiKV. -/// It polls safe point periodically, and when the safe point is updated, `GcManager` will start to -/// scan all regions (whose leader is on this TiKV), and does GC on all those regions. +/// It polls safe point periodically, and when the safe point is updated, +/// `GcManager` will start to scan all regions (whose leader is on this TiKV), +/// and does GC on all those regions. pub(super) struct GcManager { cfg: AutoGcConfig, - /// The current safe point. `GcManager` will try to update it periodically. When `safe_point` is - /// updated, `GCManager` will start to do GC on all regions. + /// The current safe point. `GcManager` will try to update it periodically. + /// When `safe_point` is updated, `GCManager` will start to do GC on all + /// regions. safe_point: Arc, safe_point_last_check_time: Instant, @@ -232,7 +238,8 @@ pub(super) struct GcManager>, - /// Holds the running status. 
It will tell us if `GcManager` should stop working and exit. + /// Holds the running status. It will tell us if `GcManager` should stop + /// working and exit. gc_manager_ctx: GcManagerContext, cfg_tracker: GcWorkerConfigManager, @@ -268,8 +275,8 @@ impl GcMan .store(ts.into_inner(), AtomicOrdering::Relaxed); } - /// Starts working in another thread. This function moves the `GcManager` and returns a handler - /// of it. + /// Starts working in another thread. This function moves the `GcManager` + /// and returns a handler of it. pub fn start(mut self) -> Result { set_status_metrics(GcManagerState::Init); self.initialize(); @@ -292,8 +299,8 @@ impl GcMan }) } - /// Polls safe point and does GC in a loop, again and again, until interrupted by invoking - /// `GcManagerHandle::stop`. + /// Polls safe point and does GC in a loop, again and again, until + /// interrupted by invoking `GcManagerHandle::stop`. fn run(&mut self) { debug!("gc-manager is started"); self.run_impl().unwrap_err(); @@ -325,9 +332,10 @@ impl GcMan } /// Sets the initial state of the `GCManger`. - /// The only task of initializing is to simply get the current safe point as the initial value - /// of `safe_point`. TiKV won't do any GC automatically until the first time `safe_point` was - /// updated to a greater value than initial value. + /// The only task of initializing is to simply get the current safe point as + /// the initial value of `safe_point`. TiKV won't do any GC + /// automatically until the first time `safe_point` was updated to a + /// greater value than initial value. fn initialize(&mut self) { debug!("gc-manager is initializing"); self.save_safe_point(TimeStamp::zero()); @@ -347,8 +355,9 @@ impl GcMan } } - /// Tries to update the safe point. Returns true if safe point has been updated to a greater - /// value. Returns false if safe point didn't change or we encountered an error. + /// Tries to update the safe point. 
Returns true if safe point has been + /// updated to a greater value. Returns false if safe point didn't + /// change or we encountered an error. fn try_update_safe_point(&mut self) -> bool { self.safe_point_last_check_time = Instant::now(); @@ -380,13 +389,13 @@ impl GcMan } } - /// Scans all regions on the TiKV whose leader is this TiKV, and does GC on all of them. - /// Regions are scanned and GC-ed in lexicographical order. + /// Scans all regions on the TiKV whose leader is this TiKV, and does GC on + /// all of them. Regions are scanned and GC-ed in lexicographical order. /// - /// While the `gc_a_round` function is running, it will periodically check whether safe_point is - /// updated before the function `gc_a_round` finishes. If so, *Rewinding* will occur. For - /// example, when we just starts to do GC, our progress is like this: ('^' means our current - /// progress) + /// While the `gc_a_round` function is running, it will periodically check + /// whether safe_point is updated before the function `gc_a_round` finishes. + /// If so, *Rewinding* will occur. For example, when we just starts to do + /// GC, our progress is like this: ('^' means our current progress) /// /// ```text /// | region 1 | region 2 | region 3| region 4 | region 5 | region 6 | @@ -400,17 +409,18 @@ impl GcMan /// ----------------------^ /// ``` /// - /// At this time we found that safe point was updated, so rewinding will happen. First we - /// continue working to the end: ('#' indicates the position that safe point updates) + /// At this time we found that safe point was updated, so rewinding will + /// happen. First we continue working to the end: ('#' indicates the + /// position that safe point updates) /// /// ```text /// | region 1 | region 2 | region 3| region 4 | region 5 | region 6 | /// ----------------------#------------------------------------------^ /// ``` /// - /// Then region 1-2 were GC-ed with the old safe point and region 3-6 were GC-ed with the new - /// new one. 
Then, we *rewind* to the very beginning and continue GC to the position that safe - /// point updates: + /// Then region 1-2 were GC-ed with the old safe point and region 3-6 were + /// GC-ed with the new new one. Then, we *rewind* to the very beginning + /// and continue GC to the position that safe point updates: /// /// ```text /// | region 1 | region 2 | region 3| region 4 | region 5 | region 6 | @@ -419,12 +429,14 @@ impl GcMan /// ``` /// /// Then GC finishes. - /// If safe point updates again at some time, it will still try to GC all regions with the - /// latest safe point. If safe point always updates before `gc_a_round` finishes, `gc_a_round` - /// may never stop, but it doesn't matter. + /// If safe point updates again at some time, it will still try to GC all + /// regions with the latest safe point. If safe point always updates + /// before `gc_a_round` finishes, `gc_a_round` may never stop, but it + /// doesn't matter. fn gc_a_round(&mut self) -> GcManagerResult<()> { let mut need_rewind = false; - // Represents where we should stop doing GC. `None` means the very end of the TiKV. + // Represents where we should stop doing GC. `None` means the very end of the + // TiKV. let mut end = None; // Represents where we have GC-ed to. `None` means the very end of the TiKV. let mut progress = Some(Key::from_encoded(BEGIN_KEY.to_vec())); @@ -434,17 +446,17 @@ impl GcMan info!("gc_worker: auto gc starts"; "safe_point" => self.curr_safe_point()); - // The following loop iterates all regions whose leader is on this TiKV and does GC on them. - // At the same time, check whether safe_point is updated periodically. If it's updated, - // rewinding will happen. + // The following loop iterates all regions whose leader is on this TiKV and does + // GC on them. At the same time, check whether safe_point is updated + // periodically. If it's updated, rewinding will happen. 
loop { self.gc_manager_ctx.check_stopped()?; if is_compaction_filter_allowed(&*self.cfg_tracker.value(), &self.feature_gate) { return Ok(()); } - // Check the current GC progress and determine if we are going to rewind or we have - // finished the round of GC. + // Check the current GC progress and determine if we are going to rewind or we + // have finished the round of GC. if need_rewind { if progress.is_none() { // We have worked to the end and we need to rewind. Restart from beginning. @@ -469,8 +481,8 @@ impl GcMan _ => false, }; if finished { - // We have worked to the end of the TiKV or our progress has reached `end`, and we - // don't need to rewind. In this case, the round of GC has finished. + // We have worked to the end of the TiKV or our progress has reached `end`, and + // we don't need to rewind. In this case, the round of GC has finished. info!("gc_worker: auto gc finishes"; "processed_regions" => processed_regions); return Ok(()); } @@ -478,15 +490,16 @@ impl GcMan assert!(progress.is_some()); - // Before doing GC, check whether safe_point is updated periodically to determine if - // rewinding is needed. + // Before doing GC, check whether safe_point is updated periodically to + // determine if rewinding is needed. self.check_if_need_rewind(&progress, &mut need_rewind, &mut end); progress = self.gc_next_region(progress.unwrap(), &mut processed_regions)?; } } - /// Checks whether we need to rewind in this round of GC. Only used in `gc_a_round`. + /// Checks whether we need to rewind in this round of GC. Only used in + /// `gc_a_round`. fn check_if_need_rewind( &mut self, progress: &Option, @@ -523,8 +536,9 @@ impl GcMan } } - /// Does GC on the next region after `from_key`. Returns the end key of the region it processed. - /// If we have processed to the end of all regions, returns `None`. + /// Does GC on the next region after `from_key`. Returns the end key of the + /// region it processed. 
If we have processed to the end of all regions, + /// returns `None`. fn gc_next_region( &mut self, from_key: Key, @@ -663,8 +677,8 @@ mod tests { impl GcSafePointProvider for MockSafePointProvider { fn get_safe_point(&self) -> Result { - // Error will be ignored by `GcManager`, which is equivalent to that the safe_point - // is not updated. + // Error will be ignored by `GcManager`, which is equivalent to that the + // safe_point is not updated. self.rx.try_recv().map_err(|e| box_err!(e)) } } @@ -755,13 +769,16 @@ mod tests { /// Run a round of auto GC and check if it correctly GC regions as expected. /// - /// Param `regions` is a `Vec` of tuples which is `(start_key, end_key, region_id)` + /// Param `regions` is a `Vec` of tuples which is `(start_key, end_key, + /// region_id)` /// - /// The first value in param `safe_points` will be used to initialize the GcManager, and the remaining - /// values will be checked before every time GC-ing a region. If the length of `safe_points` is - /// less than executed GC tasks, the last value will be used for extra GC tasks. + /// The first value in param `safe_points` will be used to initialize the + /// GcManager, and the remaining values will be checked before every time + /// GC-ing a region. If the length of `safe_points` is less than executed GC + /// tasks, the last value will be used for extra GC tasks. /// - /// Param `expected_gc_tasks` is a `Vec` of tuples which is `(region_id, safe_point)`. + /// Param `expected_gc_tasks` is a `Vec` of tuples which is `(region_id, + /// safe_point)`. fn test_auto_gc( regions: Vec<(Vec, Vec, u64)>, safe_points: Vec + Copy>, @@ -865,7 +882,8 @@ mod tests { vec![(1, 233), (2, 233), (3, 233), (4, 233)], ); - // First region doesn't starts with empty and last region doesn't ends with empty. + // First region doesn't starts with empty and last region doesn't ends with + // empty. 
let regions = vec![ (b"0".to_vec(), b"1".to_vec(), 1), (b"1".to_vec(), b"2".to_vec(), 2), @@ -935,8 +953,9 @@ mod tests { ); let mut safe_points = vec![233, 233, 233, 234, 234, 234, 235]; - // The logic of `gc_a_round` wastes a loop when the last region's end_key is not null, so it - // will check safe point one more time before GC-ing the first region after rewinding. + // The logic of `gc_a_round` wastes a loop when the last region's end_key is not + // null, so it will check safe point one more time before GC-ing the first + // region after rewinding. if !regions.last().unwrap().1.is_empty() { safe_points.insert(5, 234); } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 7242a984d0d..fe409be3ae4 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -59,12 +59,12 @@ use crate::{ }, }; -/// After the GC scan of a key, output a message to the log if there are at least this many -/// versions of the key. +/// After the GC scan of a key, output a message to the log if there are at +/// least this many versions of the key. const GC_LOG_FOUND_VERSION_THRESHOLD: usize = 30; -/// After the GC delete versions of a key, output a message to the log if at least this many -/// versions are deleted. +/// After the GC delete versions of a key, output a message to the log if at +/// least this many versions are deleted. const GC_LOG_DELETED_VERSION_THRESHOLD: usize = 30; pub const GC_MAX_EXECUTING_TASKS: usize = 10; @@ -120,13 +120,14 @@ where limit: usize, callback: Callback>, }, - /// If GC in compaction filter is enabled, versions on default CF will be handled with - /// `DB::delete` in write CF's compaction filter. However if the compaction filter finds - /// the DB is stalled, it will send the task to GC worker to ensure the compaction can be - /// continued. + /// If GC in compaction filter is enabled, versions on default CF will be + /// handled with `DB::delete` in write CF's compaction filter. 
However if + /// the compaction filter finds the DB is stalled, it will send the task + /// to GC worker to ensure the compaction can be continued. /// - /// NOTE: It's possible that the TiKV instance fails after a compaction result is installed - /// but its orphan versions are not deleted. Those orphan versions will never get cleaned + /// NOTE: It's possible that the TiKV instance fails after a compaction + /// result is installed but its orphan versions are not deleted. Those + /// orphan versions will never get cleaned /// until `DefaultCompactionFilter` is introduced. /// /// The tracking issue: . @@ -308,8 +309,8 @@ where } /// Check need gc without getting snapshot. - /// If this is not supported or any error happens, returns true to do further check after - /// getting snapshot. + /// If this is not supported or any error happens, returns true to do + /// further check after getting snapshot. fn need_gc(&self, start_key: &[u8], end_key: &[u8], safe_point: TimeStamp) -> bool { let props = match self .engine @@ -629,8 +630,8 @@ where let local_storage = self.engine.kv_engine(); // Convert keys to RocksDB layer form - // TODO: Logic coupled with raftstore's implementation. Maybe better design is to do it in - // somewhere of the same layer with apply_worker. + // TODO: Logic coupled with raftstore's implementation. Maybe better design is + // to do it in somewhere of the same layer with apply_worker. let start_data_key = keys::data_key(start_key.as_encoded()); let end_data_key = keys::data_end_key(end_key.as_encoded()); @@ -904,7 +905,8 @@ where } } -/// When we failed to schedule a `GcTask` to `GcRunner`, use this to handle the `ScheduleError`. +/// When we failed to schedule a `GcTask` to `GcRunner`, use this to handle the +/// `ScheduleError`. 
fn handle_gc_task_schedule_error(e: ScheduleError>) -> Result<()> { error!("failed to schedule gc task"; "err" => %e); let res = Err(box_err!("failed to schedule gc task: {:?}", e)); @@ -915,7 +917,8 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res GcTask::PhysicalScanLock { callback, .. } => { callback(Err(Error::from(ErrorInner::GcWorkerTooBusy))) } - // Attention: If you are adding a new GcTask, do not forget to call the callback if it has a callback. + // Attention: If you are adding a new GcTask, do not forget to call the callback if it has a + // callback. GcTask::GcKeys { .. } | GcTask::RawGcKeys { .. } | GcTask::OrphanVersions { .. } => {} #[cfg(any(test, feature = "testexport"))] GcTask::Validate(_) => {} @@ -968,7 +971,8 @@ where { engine: E, - /// `raft_store_router` is useful to signal raftstore clean region size informations. + /// `raft_store_router` is useful to signal raftstore clean region size + /// informations. raft_store_router: RR, /// Used to signal unsafe destroy range is executed. flow_info_sender: Option>, @@ -1150,11 +1154,12 @@ where .or_else(handle_gc_task_schedule_error) } - /// Cleans up all keys in a range and quickly free the disk space. The range might span over - /// multiple regions, and the `ctx` doesn't indicate region. The request will be done directly - /// on RocksDB, bypassing the Raft layer. User must promise that, after calling `destroy_range`, - /// the range will never be accessed any more. However, `destroy_range` is allowed to be called - /// multiple times on an single range. + /// Cleans up all keys in a range and quickly free the disk space. The range + /// might span over multiple regions, and the `ctx` doesn't indicate region. + /// The request will be done directly on RocksDB, bypassing the Raft layer. + /// User must promise that, after calling `destroy_range`, the range will + /// never be accessed any more. However, `destroy_range` is allowed to be + /// called multiple times on an single range. 
pub fn unsafe_destroy_range( &self, ctx: Context, @@ -1287,10 +1292,11 @@ mod tests { }; /// A wrapper of engine that adds the 'z' prefix to keys internally. - /// For test engines, they writes keys into db directly, but in production a 'z' prefix will be - /// added to keys by raftstore layer before writing to db. Some functionalities of `GCWorker` - /// bypasses Raft layer, so they needs to know how data is actually represented in db. This - /// wrapper allows test engines write 'z'-prefixed keys to db. + /// For test engines, they writes keys into db directly, but in production a + /// 'z' prefix will be added to keys by raftstore layer before writing to + /// db. Some functionalities of `GCWorker` bypasses Raft layer, so they + /// needs to know how data is actually represented in db. This wrapper + /// allows test engines write 'z'-prefixed keys to db. #[derive(Clone)] struct PrefixedEngine(kv::RocksEngine); @@ -1388,8 +1394,8 @@ mod tests { } } - /// Assert the data in `storage` is the same as `expected_data`. Keys in `expected_data` should - /// be encoded form without ts. + /// Assert the data in `storage` is the same as `expected_data`. Keys in + /// `expected_data` should be encoded form without ts. fn check_data( storage: &Storage, expected_data: &BTreeMap, Vec>, @@ -1988,13 +1994,15 @@ mod tests { .unwrap(); assert_eq!(runner.stats.write.seek_tombstone, 0); - // Test rebuilding snapshot when GC write batch limit reached (gc_info.is_completed == false). - // Build a key with versions that will just reach the limit `MAX_TXN_WRITE_SIZE`. + // Test rebuilding snapshot when GC write batch limit reached + // (gc_info.is_completed == false). Build a key with versions that will + // just reach the limit `MAX_TXN_WRITE_SIZE`. let key_size = Modify::Delete(CF_WRITE, Key::from_raw(b"k2").append_ts(1.into())).size(); // versions = ceil(MAX_TXN_WRITE_SIZE/write_size) + 3 // Write CF: Put@N, Put@N-2, Put@N-4, ... 
Put@5, Put@3 // ^ ^^^^^^^^^^^^^^^^^^^ - // safepoint=N-1 Deleted in the first batch, `ceil(MAX_TXN_WRITE_SIZE/write_size)` versions. + // safepoint=N-1 Deleted in the first batch, + // `ceil(MAX_TXN_WRITE_SIZE/write_size)` versions. let versions = (MAX_TXN_WRITE_SIZE - 1) / key_size + 4; for start_ts in (1..versions).map(|x| x as u64 * 2) { let commit_ts = start_ts + 1; @@ -2012,9 +2020,9 @@ mod tests { Some((1, ri_provider)), ) .unwrap(); - // The first batch will leave tombstones that will be seen while processing the second - // batch, but it will be seen in `next` after seeking the latest unexpired version, - // therefore `seek_tombstone` is not affected. + // The first batch will leave tombstones that will be seen while processing the + // second batch, but it will be seen in `next` after seeking the latest + // unexpired version, therefore `seek_tombstone` is not affected. assert_eq!(runner.stats.write.seek_tombstone, 0); // ... and next_tombstone indicates there's indeed more than one batches. assert_eq!(runner.stats.write.next_tombstone, versions - 3); diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index ac16c30bb03..3ed206408e4 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -98,8 +98,8 @@ thread_local! { } impl Drop for RawCompactionFilter { - // NOTE: it's required that `CompactionFilter` is dropped before the compaction result - // becomes installed into the DB instance. + // NOTE: it's required that `CompactionFilter` is dropped before the compaction + // result becomes installed into the DB instance. fn drop(&mut self) { self.raw_gc_mvcc_deletions(); @@ -181,7 +181,8 @@ impl RawCompactionFilter { return Ok(CompactionFilterDecision::Keep); } - // If the key mode is not KeyMode::Raw or value_type is not CompactionFilterValueType::Value, it's needed to be retained. 
+ // If the key mode is not KeyMode::Raw or value_type is not + // CompactionFilterValueType::Value, it's needed to be retained. let key_mode = ApiV2::parse_key_mode(keys::origin_key(key)); if key_mode != KeyMode::Raw || value_type != CompactionFilterValueType::Value { return Ok(CompactionFilterDecision::Keep); @@ -199,15 +200,19 @@ impl RawCompactionFilter { self.versions += 1; let raw_value = ApiV2::decode_raw_value(value)?; - // If it's the latest version, and it's deleted or expired, it needs to be sent to GCWorker to be processed asynchronously. + // If it's the latest version, and it's deleted or expired, it needs to be sent + // to GCWorker to be processed asynchronously. if !raw_value.is_valid(self.current_ts) { self.raw_handle_delete(); if self.mvcc_deletions.len() >= DEFAULT_DELETE_BATCH_COUNT { self.raw_gc_mvcc_deletions(); } } - // 1. If it's the latest version, and it's neither deleted nor expired, it's needed to be retained. - // 2. If it's the latest version, and it's deleted or expired, while we do async gctask to deleted or expired records, both put records and deleted/expired records are actually kept within the compaction filter. + // 1. If it's the latest version, and it's neither deleted nor expired, it's + // needed to be retained. 2. If it's the latest version, and it's + // deleted or expired, while we do async gctask to deleted or expired records, + // both put records and deleted/expired records are actually kept within the + // compaction filter. Ok(CompactionFilterDecision::Keep) } else { if commit_ts.into_inner() >= self.safe_point { @@ -216,7 +221,8 @@ impl RawCompactionFilter { self.versions += 1; self.filtered += 1; - // If it's ts < safepoint, and it's not the latest version, it's need to be removed. + // If it's ts < safepoint, and it's not the latest version, it's need to be + // removed. 
Ok(CompactionFilterDecision::Remove) } } @@ -234,8 +240,8 @@ impl RawCompactionFilter { } } - // `log_on_error` indicates whether to print an error log on scheduling failures. - // It's only enabled for `GcTask::OrphanVersions`. + // `log_on_error` indicates whether to print an error log on scheduling + // failures. It's only enabled for `GcTask::OrphanVersions`. fn schedule_gc_task(&self, task: GcTask, log_on_error: bool) { match self.gc_scheduler.schedule(task) { Ok(_) => {} @@ -363,7 +369,8 @@ pub mod tests { gc_runner.safe_point(80).gc_raw(&raw_engine); - // If ts(70) < safepoint(80), and this userkey's latest verion is not deleted or expired, this version will be removed in do_filter. + // If ts(70) < safepoint(80), and this userkey's latest version is not deleted + // or expired, this version will be removed in do_filter. let entry70 = raw_engine .get_value_cf(CF_DEFAULT, make_key(b"r\0a", 70).as_slice()) .unwrap(); diff --git a/src/server/load_statistics/linux.rs b/src/server/load_statistics/linux.rs index f3a12593a51..e0a9b950944 100644 --- a/src/server/load_statistics/linux.rs +++ b/src/server/load_statistics/linux.rs @@ -23,11 +23,12 @@ pub struct ThreadLoadStatistics { } impl ThreadLoadStatistics { - /// Create a thread load statistics for all threads with `prefix`. `ThreadLoad` is stored into - /// `thread_loads` for each thread. At most `slots` old records will be kept, to make the curve - /// more smooth. + /// Create a thread load statistics for all threads with `prefix`. + /// `ThreadLoad` is stored into `thread_loads` for each thread. At most + /// `slots` old records will be kept, to make the curve more smooth. /// - /// Note: call this after the target threads are initialized, otherwise it can't catch them. + /// Note: call this after the target threads are initialized, otherwise it + /// can't catch them. 
pub fn new(slots: usize, prefix: &str, thread_loads: Arc) -> Self { let pid = thread::process_id(); let mut tids = vec![]; @@ -56,17 +57,19 @@ impl ThreadLoadStatistics { } } - /// For every threads with the name prefix given in `ThreadLoadStatistics::new`, - /// gather cpu usage from `/proc//task/` and store it in `thread_load` + /// For every threads with the name prefix given in + /// `ThreadLoadStatistics::new`, gather cpu usage from + /// `/proc//task/` and store it in `thread_load` /// passed in `ThreadLoadStatistics::new`. /// - /// Some old usages and instants (at most `slots`) will be kept internal to make - /// the usage curve more smooth. + /// Some old usages and instants (at most `slots`) will be kept internal to + /// make the usage curve more smooth. pub fn record(&mut self, instant: Instant) { self.instants[self.cur_pos] = instant; self.cpu_usages[self.cur_pos].clear(); for tid in &self.tids { - // TODO: if monitored threads exited and restarted then, we should update `self.tids`. + // TODO: if monitored threads exited and restarted then, we should update + // `self.tids`. if let Ok(stat) = thread::full_thread_stat(self.pid, *tid) { let total = thread::linux::cpu_total(&stat); self.cpu_usages[self.cur_pos].insert(*tid, total); diff --git a/src/server/load_statistics/mod.rs b/src/server/load_statistics/mod.rs index 3b792def94d..5cb856e2948 100644 --- a/src/server/load_statistics/mod.rs +++ b/src/server/load_statistics/mod.rs @@ -44,7 +44,8 @@ impl ThreadLoadPool { }) } - /// Gets the current load. For example, 200 means the threads consuming 200% of the CPU resources. + /// Gets the current load. For example, 200 means the threads consuming 200% + /// of the CPU resources. 
pub fn total_load(&self) -> usize { self.total_load.load(Ordering::Relaxed) } diff --git a/src/server/lock_manager/client.rs b/src/server/lock_manager/client.rs index f3b59c4e97b..c71bec0b63a 100644 --- a/src/server/lock_manager/client.rs +++ b/src/server/lock_manager/client.rs @@ -21,7 +21,8 @@ pub type Callback = Box; const CQ_COUNT: usize = 1; const CLIENT_PREFIX: &str = "deadlock"; -/// Builds the `Environment` of deadlock clients. All clients should use the same instance. +/// Builds the `Environment` of deadlock clients. All clients should use the +/// same instance. pub fn env() -> Arc { Arc::new( EnvBuilder::new() diff --git a/src/server/lock_manager/config.rs b/src/server/lock_manager/config.rs index 8d391e874de..aba08f3d2e7 100644 --- a/src/server/lock_manager/config.rs +++ b/src/server/lock_manager/config.rs @@ -27,9 +27,9 @@ pub struct Config { /// Whether to enable the pipelined pessimistic lock feature. pub pipelined: bool, /// Whether to enable the in-memory pessimistic lock feature. - /// It will take effect only if the `pipelined` config is true because - /// we assume that the success rate of pessimistic transactions is important to - /// people who disable the pipelined pessimistic lock feature. + /// It will take effect only if the `pipelined` config is true because we + /// assume that the success rate of pessimistic transactions is important + /// to people who disable the pipelined pessimistic lock feature. pub in_memory: bool, } diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 7cc8978d735..4fee40138c1 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -112,9 +112,11 @@ impl Locks { /// Used to detect the deadlock of wait-for-lock in the cluster. pub struct DetectTable { - /// Keeps the DAG of wait-for-lock. Every edge from `txn_ts` to `lock_ts` has a survival time -- `ttl`. 
- /// When checking the deadlock, if the ttl has elpased, the corresponding edge will be removed. - /// `last_detect_time` is the start time of the edge. `Detect` requests will refresh it. + /// Keeps the DAG of wait-for-lock. Every edge from `txn_ts` to `lock_ts` + /// has a survival time -- `ttl`. When checking the deadlock, if the ttl + /// has elpased, the corresponding edge will be removed. + /// `last_detect_time` is the start time of the edge. `Detect` requests will + /// refresh it. // txn_ts => (lock_ts => Locks) wait_for_map: HashMap>, @@ -138,11 +140,12 @@ impl DetectTable { } } - /// Returns the key hash which causes deadlock, and the current wait chain that forms the - /// deadlock with `txn_ts`'s waiting for txn at `lock_ts`. - /// Note that the current detecting edge is not included in the returned wait chain. This is - /// intended to reduce RPC message size since the information about current detecting txn is - /// included in a separated field. + /// Returns the key hash which causes deadlock, and the current wait chain + /// that forms the deadlock with `txn_ts`'s waiting for txn at + /// `lock_ts`. Note that the current detecting edge is not included in + /// the returned wait chain. This is intended to reduce RPC message size + /// since the information about current detecting txn is included in a + /// separated field. pub fn detect( &mut self, txn_ts: TimeStamp, @@ -181,12 +184,12 @@ impl DetectTable { let ttl = self.ttl; let mut stack = vec![wait_for_ts]; - // Memorize the pushed vertexes to avoid duplicate search, and maps to the predecessor of - // the vertex. - // Since the graph is a DAG instead of a tree, a vertex may have multiple predecessors. But - // it's ok if we only remember one: for each vertex, if it has a route to the goal (txn_ts), - // we must be able to find the goal and exit this function before visiting the vertex one - // more time. 
+ // Memorize the pushed vertexes to avoid duplicate search, and maps to the + // predecessor of the vertex. + // Since the graph is a DAG instead of a tree, a vertex may have multiple + // predecessors. But it's ok if we only remember one: for each vertex, + // if it has a route to the goal (txn_ts), we must be able to find the + // goal and exit this function before visiting the vertex one more time. let mut pushed: HashMap = HashMap::default(); pushed.insert(wait_for_ts, TimeStamp::zero()); while let Some(curr_ts) = stack.pop() { @@ -220,18 +223,20 @@ impl DetectTable { None } - /// Generate the wait chain after deadlock is detected. This function is part of implementation - /// of `do_detect`. It assumes there's a path from `start` to `end` in the waiting graph, and - /// every single edge `V1 -> V2` has an entry in `vertex_predecessors_map` so that - /// `vertex_predecessors_map[V2] == V1`, and `vertex_predecessors_map[V1] == 0`. + /// Generate the wait chain after deadlock is detected. This function is + /// part of implementation of `do_detect`. It assumes there's a path + /// from `start` to `end` in the waiting graph, and every single edge + /// `V1 -> V2` has an entry in `vertex_predecessors_map` so that + /// `vertex_predecessors_map[V2] == V1`, and `vertex_predecessors_map[V1] == + /// 0`. fn generate_wait_chain( &self, start: TimeStamp, end: TimeStamp, vertex_predecessors_map: HashMap, ) -> Vec { - // It's rare that a deadlock formed by too many transactions. Preallocating a few elements - // should be enough in most cases. + // It's rare that a deadlock formed by too many transactions. Preallocating a + // few elements should be enough in most cases. let mut wait_chain = Vec::with_capacity(3); let mut lock_ts = end; @@ -259,9 +264,9 @@ impl DetectTable { wait_chain } - /// Returns true and adds to the detect table if `txn_ts` is waiting for `lock_ts`. 
- /// When the function returns true, `key` and `resource_group_tag` may be taken to store in the - /// waiting graph. + /// Returns true and adds to the detect table if `txn_ts` is waiting for + /// `lock_ts`. When the function returns true, `key` and + /// `resource_group_tag` may be taken to store in the waiting graph. fn register_if_existed( &mut self, txn_ts: TimeStamp, @@ -280,7 +285,8 @@ impl DetectTable { false } - /// Adds to the detect table. The edge from `txn_ts` to `lock_ts` must not exist. + /// Adds to the detect table. The edge from `txn_ts` to `lock_ts` must not + /// exist. fn register( &mut self, txn_ts: TimeStamp, @@ -402,7 +408,8 @@ pub enum Task { /// If the node has the leader region and the role of the node changes, /// a `ChangeRole` task will be scheduled. /// - /// It's the only way to change the node from leader to follower, and vice versa. + /// It's the only way to change the node from leader to follower, and vice + /// versa. ChangeRole(Role), /// Change the ttl of DetectTable ChangeTtl(Duration), @@ -434,8 +441,8 @@ impl Display for Task { } } -/// `Scheduler` is the wrapper of the `FutureScheduler` to simplify scheduling tasks -/// to the deadlock detector. +/// `Scheduler` is the wrapper of the `FutureScheduler` to simplify +/// scheduling tasks to the deadlock detector. #[derive(Clone)] pub struct Scheduler(FutureScheduler); @@ -498,14 +505,15 @@ impl Scheduler { } } -/// The leader region is the region containing the LEADER_KEY and the leader of the -/// leader region is also the leader of the deadlock detector. +/// The leader region is the region containing the LEADER_KEY and the leader of +/// the leader region is also the leader of the deadlock detector. const LEADER_KEY: &[u8] = b""; -/// `RoleChangeNotifier` observes region or role change events of raftstore. If the -/// region is the leader region and the role of this node is changed, a `ChangeRole` -/// task will be scheduled to the deadlock detector. 
It's the only way to change the -/// node from the leader of deadlock detector to follower, and vice versa. +/// `RoleChangeNotifier` observes region or role change events of raftstore. If +/// the region is the leader region and the role of this node is changed, a +/// `ChangeRole` task will be scheduled to the deadlock detector. It's the only +/// way to change the node from the leader of deadlock detector to follower, and +/// vice versa. #[derive(Clone)] pub(crate) struct RoleChangeNotifier { /// The id of the valid leader region. @@ -755,8 +763,9 @@ where } } } - // If the node is a follower, it will receive a `ChangeRole(Follower)` msg when the leader - // is changed. It should reset itself even if the role of the node is not changed. + // If the node is a follower, it will receive a `ChangeRole(Follower)` msg when + // the leader is changed. It should reset itself even if the role of the + // node is not changed. self.reset(role); } @@ -794,8 +803,9 @@ where /// Returns true if sends successfully. /// - /// If the client is None, reconnects the leader first, then sends the request to the leader. - /// If sends failed, sets the client to None for retry. + /// If the client is None, reconnects the leader first, then sends the + /// request to the leader. If sends failed, sets the client to None for + /// retry. fn send_request_to_leader( &mut self, tp: DetectType, @@ -889,11 +899,13 @@ where if self.send_request_to_leader(tp, txn_ts, lock, diag_ctx.clone()) { return; } - // Because the client is asynchronous, it won't be closed until failing to send a - // request. So retry to refresh the leader info and send it again. + // Because the client is asynchronous, it won't be closed until + // failing to send a request. So retry to + // refresh the leader info and send it again. } - // If a request which causes deadlock is dropped, it leads to the waiter timeout. - // TiDB will retry to acquire the lock and detect deadlock again. 
+ // If a request which causes deadlock is dropped, it leads to the waiter + // timeout. TiDB will retry to acquire the lock and detect deadlock + // again. warn!("detect request dropped"; "tp" => ?tp, "txn_ts" => txn_ts, "lock" => ?lock); ERROR_COUNTER_METRICS.dropped.inc(); } @@ -1304,7 +1316,8 @@ pub mod tests { tag, }; - // Detect specified edges sequentially, and expects the last one will cause the deadlock. + // Detect specified edges sequentially, and expects the last one will cause the + // deadlock. let test_once = |edges: &[Edge<'_>]| { let mut detect_table = DetectTable::new(Duration::from_millis(100)); let mut edge_map = HashMap::default(); diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 7ce6b50e6c0..91e25a2edeb 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -54,7 +54,8 @@ fn detected_slot_idx(txn_ts: TimeStamp) -> usize { /// `LockManager` has two components working in two threads: /// * One is the `WaiterManager` which manages transactions waiting for locks. -/// * The other one is the `Detector` which detects deadlocks between transactions. +/// * The other one is the `Detector` which detects deadlocks between +/// transactions. pub struct LockManager { waiter_mgr_worker: Option>, detector_worker: Option>, @@ -198,8 +199,9 @@ impl LockManager { } } - /// Creates a `RoleChangeNotifier` of the deadlock detector worker and registers it to - /// the `CoprocessorHost` to observe the role change events of the leader region. + /// Creates a `RoleChangeNotifier` of the deadlock detector worker and + /// registers it to the `CoprocessorHost` to observe the role change + /// events of the leader region. pub fn register_detector_role_change_observer( &self, host: &mut CoprocessorHost, @@ -208,7 +210,8 @@ impl LockManager { role_change_notifier.register(host); } - /// Creates a `DeadlockService` to handle deadlock detect requests from other nodes. 
+ /// Creates a `DeadlockService` to handle deadlock detect requests from + /// other nodes. pub fn deadlock_service(&self) -> DeadlockService { DeadlockService::new( self.waiter_mgr_scheduler.clone(), @@ -268,7 +271,8 @@ impl LockManagerTrait for LockManager { self.waiter_mgr_scheduler .wait_for(start_ts, cb, pr, lock, timeout, diag_ctx.clone()); - // If it is the first lock the transaction tries to lock, it won't cause deadlock. + // If it is the first lock the transaction tries to lock, it won't cause + // deadlock. if !is_first_lock { self.add_to_detected(start_ts); self.detector_scheduler.detect(start_ts, lock, diag_ctx); @@ -288,8 +292,9 @@ impl LockManagerTrait for LockManager { self.waiter_mgr_scheduler .wake_up(lock_ts, hashes, commit_ts); } - // If a pessimistic transaction is committed or rolled back and it once sent requests to - // detect deadlock, clean up its wait-for entries in the deadlock detector. + // If a pessimistic transaction is committed or rolled back and it once sent + // requests to detect deadlock, clean up its wait-for entries in the + // deadlock detector. if is_pessimistic_txn && self.remove_from_detected(lock_ts) { self.detector_scheduler.clean_up(lock_ts); } diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 8c3d2c7749d..8e5225bef76 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -40,11 +40,13 @@ struct DelayInner { cancelled: bool, } -/// `Delay` is a wrapper of `tokio_timer::Delay` which has a resolution of one millisecond. -/// It has some extra features than `tokio_timer::Delay` used by `WaiterManager`. +/// `Delay` is a wrapper of `tokio_timer::Delay` which has a resolution of one +/// millisecond. It has some extra features than `tokio_timer::Delay` used by +/// `WaiterManager`. /// -/// `Delay` performs no work and completes with `true` once the specified deadline has been reached. 
-/// If it has been cancelled, it will complete with `false` at arbitrary time. +/// `Delay` performs no work and completes with `true` once the specified +/// deadline has been reached. If it has been cancelled, it will complete with +/// `false` at arbitrary time. // FIXME: Use `tokio_timer::DelayQueue` instead if https://github.com/tokio-rs/tokio/issues/1700 is fixed. #[derive(Clone)] struct Delay { @@ -325,7 +327,8 @@ impl WaitTable { WAIT_TABLE_STATUS_GAUGE.txns.inc(); None } - // Here we don't increase waiter_count because it's already updated in LockManager::wait_for() + // Here we don't increase waiter_count because it's already updated in + // LockManager::wait_for() } /// Removes all waiters waiting for the lock. @@ -348,10 +351,11 @@ impl WaitTable { Some(waiter) } - /// Removes the `Waiter` with the smallest start ts and returns it with remaining waiters. + /// Removes the `Waiter` with the smallest start ts and returns it with + /// remaining waiters. /// - /// NOTE: Due to the borrow checker, it doesn't remove the entry in the `WaitTable` - /// even if there is no remaining waiter. + /// NOTE: Due to the borrow checker, it doesn't remove the entry in the + /// `WaitTable` even if there is no remaining waiter. 
fn remove_oldest_waiter(&mut self, lock: Lock) -> Option<(Waiter, &mut Waiters)> { let waiters = self.wait_table.get_mut(&lock.hash)?; let oldest_idx = waiters @@ -823,7 +827,8 @@ pub mod tests { waiter_ts: TimeStamp, mut lock_info: LockInfo, deadlock_hash: u64, - expect_wait_chain: &[(u64, u64, &[u8], &[u8])], // (waiter_ts, wait_for_ts, key, resource_group_tag) + expect_wait_chain: &[(u64, u64, &[u8], &[u8])], /* (waiter_ts, wait_for_ts, key, + * resource_group_tag) */ ) { match res { Err(StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( diff --git a/src/server/node.rs b/src/server/node.rs index eb2cc72e432..84aeb89377d 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -241,7 +241,8 @@ where self.store.get_id() } - /// Gets the Scheduler of RaftstoreConfigTask, it must be called after start. + /// Gets the Scheduler of RaftstoreConfigTask, it must be called after + /// start. pub fn refresh_config_scheduler(&mut self) -> Scheduler { self.system.refresh_config_scheduler() } @@ -251,7 +252,8 @@ where pub fn get_router(&self) -> RaftRouter { self.system.router() } - /// Gets a transmission end of a channel which is used send messages to apply worker. + /// Gets a transmission end of a channel which is used send messages to + /// apply worker. pub fn get_apply_router(&self) -> ApplyRouter { self.system.apply_router() } @@ -289,11 +291,12 @@ where .kv .get_msg::(keys::STORE_IDENT_KEY)? .expect("Store should have bootstrapped"); - // API version is not written into `StoreIdent` in legacy TiKV, thus it will be V1 in - // `StoreIdent` regardless of `storage.enable_ttl`. To allow upgrading from legacy V1 - // TiKV, the config switch between V1 and V1ttl are not checked here. - // It's safe to do so because `storage.enable_ttl` is impossible to change thanks to the - // config check. + // API version is not written into `StoreIdent` in legacy TiKV, thus it will be + // V1 in `StoreIdent` regardless of `storage.enable_ttl`. 
To allow upgrading + // from legacy V1 TiKV, the config switch between V1 and V1ttl are not checked + // here. It's safe to do so because `storage.enable_ttl` is impossible to change + // thanks to the config check. let should_check = match (ident.api_version, + // self.api_version) { let should_check = match (ident.api_version, self.api_version) { (ApiVersion::V1, ApiVersion::V1ttl) | (ApiVersion::V1ttl, ApiVersion::V1) => false, (left, right) => left != right, diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 4b2815f5d73..bc0e8a59303 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -133,7 +133,8 @@ impl Queue { self.buf.pop() } - /// Same as `try_pop` but register interest on readiness when `None` is returned. + /// Same as `try_pop` but register interest on readiness when `None` is + /// returned. /// /// The method should be called in polling context. If the queue is empty, /// it will register current polling task for notifications. @@ -244,8 +245,8 @@ impl Buffer for BatchMessageBuffer { #[inline] fn push(&mut self, msg: RaftMessage) { let msg_size = Self::message_size(&msg); - // To avoid building too large batch, we limit each batch's size. Since `msg_size` - // is estimated, `GRPC_SEND_MSG_BUF` is reserved for errors. + // To avoid building too large batch, we limit each batch's size. Since + // `msg_size` is estimated, `GRPC_SEND_MSG_BUF` is reserved for errors. if self.size > 0 && (self.size + msg_size + self.cfg.raft_client_grpc_send_msg_buffer >= self.cfg.max_grpc_send_msg_len as usize @@ -276,9 +277,10 @@ impl Buffer for BatchMessageBuffer { self.push(more); } - // try refresh config after flush. `max_grpc_send_msg_len` and `raft_msg_max_batch_size` - // can impact the buffer push logic, but since they are soft restriction, we check config change - // at here to avoid affact performance since `push` is a hot path. + // try refresh config after flush. 
`max_grpc_send_msg_len` and + // `raft_msg_max_batch_size` can impact the buffer push logic, but since + // they are soft restriction, we check config change at here to avoid + // affact performance since `push` is a hot path. self.maybe_refresh_config(); res @@ -533,7 +535,8 @@ where RAFT_MESSAGE_FLUSH_COUNTER.full.inc_by(1); } - // So either enough messages are batched up or don't need to wait or wait timeouts. + // So either enough messages are batched up or don't need to wait or wait + // timeouts. s.flush_timeout.take(); ready!(Poll::Ready(s.buffer.flush(&mut s.sender)))?; continue; @@ -823,9 +826,9 @@ async fn start( let f = back_end.batch_call(&client, addr.clone()); let mut res = f.await; if res == Ok(()) { - // If the call is setup successfully, it will never finish. Returning `Ok(())` means the - // batch_call is not supported, we are probably connect to an old version of TiKV. So we - // need to fallback to use legacy API. + // If the call is setup successfully, it will never finish. Returning `Ok(())` + // means the batch_call is not supported, we are probably connect to + // an old version of TiKV. So we need to fallback to use legacy API. let f = back_end.call(&client, addr.clone()); res = f.await; } @@ -836,7 +839,8 @@ async fn start( Err(_) => { error!("connection abort"; "store_id" => back_end.store_id, "addr" => addr); if retry_times > 1 { - // Clears pending messages to avoid consuming high memory when one node is shutdown. + // Clears pending messages to avoid consuming high memory when one node is + // shutdown. back_end.clear_pending_message("unreachable"); } else { // At least report failure in metrics. @@ -990,9 +994,9 @@ where /// Sends a message. /// - /// If the message fails to be sent, false is returned. Returning true means the message is - /// enqueued to buffer. Caller is expected to call `flush` to ensure all buffered messages - /// are sent out. + /// If the message fails to be sent, false is returned. 
Returning true means + /// the message is enqueued to buffer. Caller is expected to call `flush` to + /// ensure all buffered messages are sent out. pub fn send(&mut self, msg: RaftMessage) -> result::Result<(), DiscardReason> { let store_id = msg.get_to_peer().store_id; let grpc_raft_conn_num = self.builder.cfg.value().grpc_raft_conn_num as u64; diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index e8c06c220b8..ab60f969493 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -514,7 +514,8 @@ impl Coprocessor for ReplicaReadLockChecker {} impl ReadIndexObserver for ReplicaReadLockChecker { fn on_step(&self, msg: &mut eraftpb::Message, role: StateRole) { // Only check and return result if the current peer is a leader. - // If it's not a leader, the read index request will be redirected to the leader later. + // If it's not a leader, the read index request will be redirected to the leader + // later. if msg.get_msg_type() != MessageType::MsgReadIndex || role != StateRole::Leader { return; } @@ -574,7 +575,8 @@ mod tests { use super::*; - // This test ensures `ReplicaReadLockChecker` won't change UUID context of read index. + // This test ensures `ReplicaReadLockChecker` won't change UUID context of read + // index. #[test] fn test_replica_read_lock_checker_for_single_uuid() { let cm = ConcurrencyManager::new(1.into()); diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index de837bdb1cb..20bd65ac17a 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -22,7 +22,8 @@ const BATCH_SIZE: usize = 256; /// todo: Report this to the user. 
#[derive(Debug, Clone)] pub enum ResetToVersionState { - /// `RemovingWrite` means we are removing stale data in the `WRITE` and `DEFAULT` cf + /// `RemovingWrite` means we are removing stale data in the `WRITE` and + /// `DEFAULT` cf RemovingWrite { scanned: usize }, /// `RemovingWrite` means we are removing stale data in the `LOCK` cf RemovingLock { scanned: usize }, @@ -40,7 +41,8 @@ impl ResetToVersionState { } } -/// `ResetToVersionWorker` is the worker that does the actual reset-to-version work. +/// `ResetToVersionWorker` is the worker that does the actual reset-to-version +/// work. pub struct ResetToVersionWorker { /// `ts` is the timestamp to reset to. ts: TimeStamp, @@ -168,8 +170,9 @@ impl ResetToVersionWorker { } } -/// `ResetToVersionManager` is the manager that manages the reset-to-version process. -/// User should interact with `ResetToVersionManager` instead of using `ResetToVersionWorker` directly. +/// `ResetToVersionManager` is the manager that manages the reset-to-version +/// process. User should interact with `ResetToVersionManager` instead of using +/// `ResetToVersionWorker` directly. pub struct ResetToVersionManager { /// Current state of the reset-to-version process. state: Arc>, diff --git a/src/server/server.rs b/src/server/server.rs index 196a6584be7..c5aa6311193 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -477,7 +477,8 @@ mod tests { } } - // if this failed, unset the environmental variables 'http_proxy' and 'https_proxy', and retry. + // if this failed, unset the environmental variables 'http_proxy' and + // 'https_proxy', and retry. #[test] fn test_peer_resolve() { let cfg = Config { diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index 740e597e5e2..e66bb3ec40c 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -61,7 +61,8 @@ pub struct Service> { } impl> Service { - /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a `GcWorker`. 
+ /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a + /// `GcWorker`. pub fn new( engines: Engines, pool: Handle, diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index a79ca0c4e8a..4ab02f819da 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -173,7 +173,8 @@ impl Iterator for LogIterator { if self.pre_log.time < self.begin_time { continue; } - // treat the invalid log with the pre valid log time and level but its own whole line content + // treat the invalid log with the pre valid log time and level but its own + // whole line content item.set_time(self.pre_log.time); item.set_level(self.pre_log.get_level()); item.set_message(input.to_owned()); @@ -267,8 +268,8 @@ fn parse(input: &str) -> Result<(&str, (i64, LogLevel)), Error> { Ok((content, (timestamp, level))) } -/// Parses the start time and end time of a log file and return the maximal and minimal -/// timestamp in unix milliseconds. +/// Parses the start time and end time of a log file and return the maximal and +/// minimal timestamp in unix milliseconds. fn parse_time_range(file: &std::fs::File) -> Result<(i64, i64), Error> { let file_start_time = parse_start_time(file, 10)?; let file_end_time = parse_end_time(file, 10)?; diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index c0cc3eb1c6a..9eb88016424 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -37,7 +37,7 @@ impl NicSnapshot { fn into_pairs(self, prev: &NicSnapshot) -> Vec { macro_rules! 
pair { - ($label: literal, $value: expr, $old_value: expr) => {{ + ($label:literal, $value:expr, $old_value:expr) => {{ let mut pair = ServerInfoPair::default(); pair.set_key($label.to_owned()); pair.set_value(format!("{:.2}", ($value - $old_value) as f64)); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 878a138aafe..1ad81ec8900 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -218,8 +218,8 @@ macro_rules! handle_request { } macro_rules! set_total_time { - ($resp: ident, $duration: expr, no_time_detail) => {}; - ($resp: ident, $duration: expr, has_time_detail) => { + ($resp:ident, $duration:expr,no_time_detail) => {}; + ($resp:ident, $duration:expr,has_time_detail) => { let mut $resp = $resp; $resp .mut_exec_details_v2() @@ -627,8 +627,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ) { let begin_instant = Instant::now(); - // DestroyRange is a very dangerous operation. We don't allow passing MIN_KEY as start, or - // MAX_KEY as end here. + // DestroyRange is a very dangerous operation. We don't allow passing MIN_KEY as + // start, or MAX_KEY as end here. assert!(!req.get_start_key().is_empty()); assert!(!req.get_end_key().is_empty()); @@ -726,8 +726,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor if let Err(err @ RaftStoreError::StoreNotMatch { .. }) = Self::handle_raft_message(store_id, &ch, msg, reject) { - // Return an error here will break the connection, only do that for `StoreNotMatch` to - // let tikv to resolve a correct address from PD + // Return an error here will break the connection, only do that for + // `StoreNotMatch` to let tikv to resolve a correct address from PD return Err(Error::from(err)); } } @@ -772,8 +772,8 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor if let Err(err @ RaftStoreError::StoreNotMatch { .. 
}) = Self::handle_raft_message(store_id, &ch, msg, reject) { - // Return an error here will break the connection, only do that for `StoreNotMatch` to - // let tikv to resolve a correct address from PD + // Return an error here will break the connection, only do that for + // `StoreNotMatch` to let tikv to resolve a correct address from PD return Err(Error::from(err)); } } @@ -1412,8 +1412,9 @@ async fn future_handle_empty( ) -> ServerResult { let mut res = BatchCommandsEmptyResponse::default(); res.set_test_id(req.get_test_id()); - // `BatchCommandsWaker` processes futures in notify. If delay_time is too small, notify - // can be called immediately, so the future is polled recursively and lead to deadlock. + // `BatchCommandsWaker` processes futures in notify. If delay_time is too small, + // notify can be called immediately, so the future is polled recursively and + // lead to deadlock. if req.get_delay_time() >= 10 { let _ = tikv_util::timer::GLOBAL_TIMER_HANDLE .delay( @@ -1733,9 +1734,11 @@ fn future_raw_batch_put( let pairs_len = req.get_pairs().len(); // The TTL for each key in seconds. // - // In some TiKV of old versions, only one TTL can be provided and the TTL will be applied to all keys in - // the request. For compatibility reasons, if the length of `ttls` is exactly one, then the TTL will be applied - // to all keys. Otherwise, the length mismatch between `ttls` and `pairs` will return an error. + // In some TiKV of old versions, only one TTL can be provided and the TTL will + // be applied to all keys in the request. For compatibility reasons, if the + // length of `ttls` is exactly one, then the TTL will be applied to all keys. + // Otherwise, the length mismatch between `ttls` and `pairs` will return an + // error. 
let ttls = if req.get_ttls().is_empty() { vec![0; pairs_len] } else if req.get_ttls().len() == 1 { diff --git a/src/server/snap.rs b/src/server/snap.rs index 15304c51cdd..f451b6b70e9 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -120,7 +120,8 @@ pub struct SendStat { /// Send the snapshot to specified address. /// -/// It will first send the normal raft snapshot message and then send the snapshot file. +/// It will first send the normal raft snapshot message and then send the +/// snapshot file. pub fn send_snap( env: Arc, mgr: SnapManager, diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index c4cb6a67fbb..13b7b94297d 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -352,7 +352,8 @@ where Ok(val) => val, Err(err) => return Ok(make_response(StatusCode::BAD_REQUEST, err.to_string())), }, - None => 99, // Default frequency of sampling. 99Hz to avoid coincide with special periods + None => 99, /* Default frequency of sampling. 99Hz to avoid coincide with special + * periods */ }; let prototype_content_type: hyper::http::HeaderValue = @@ -565,8 +566,9 @@ where } // 1. POST "/config" will modify the configuration of TiKV. - // 2. GET "/region" will get start key and end key. These keys could be actual - // user data since in some cases the data itself is stored in the key. + // 2. GET "/region" will get start key and end key. These keys could be + // actual user data since in some cases the data itself is stored in the + // key. 
let should_check_cert = !matches!( (&method, path.as_ref()), (&Method::GET, "/metrics") @@ -858,7 +860,8 @@ async fn handle_fail_points_request(req: Request) -> hyper::Result { - // In this scope the path must be like /fail...(/...), which starts with FAIL_POINTS_REQUEST_PATH and may or may not have a sub path + // In this scope the path must be like /fail...(/...), which starts with + // FAIL_POINTS_REQUEST_PATH and may or may not have a sub path // Now we return 404 when path is neither /fail nor /fail/ if path != FAIL_POINTS_REQUEST_PATH && path != fail_path { return Ok(Response::builder() diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index 88f45a9ca9e..a37712dfd68 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -122,7 +122,8 @@ where } /// Activate heap profile and call `callback` if successfully. -/// `deactivate_heap_profile` can only be called after it's notified from `callback`. +/// `deactivate_heap_profile` can only be called after it's notified from +/// `callback`. pub async fn activate_heap_profile( dump_period: S, store_path: PathBuf, @@ -299,7 +300,8 @@ fn extract_thread_name(thread_name: &str) -> String { .unwrap_or_else(|| thread_name.to_owned()) } -// Re-define some heap profiling functions because heap-profiling is not enabled for tests. +// Re-define some heap profiling functions because heap-profiling is not enabled +// for tests. #[cfg(test)] mod test_utils { use std::sync::Mutex; diff --git a/src/storage/config.rs b/src/storage/config.rs index 78850c9964c..2a5ac4840e0 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -282,10 +282,10 @@ pub struct IORateLimitConfig { pub max_bytes_per_sec: ReadableSize, #[online_config(skip)] pub mode: IORateLimitMode, - /// When this flag is off, high-priority IOs are counted but not limited. 
Default - /// set to false because the optimal throughput target provided by user might not be - /// the maximum available bandwidth. For multi-tenancy use case, this flag should be - /// turned on. + /// When this flag is off, high-priority IOs are counted but not limited. + /// Default set to false because the optimal throughput target provided by + /// user might not be the maximum available bandwidth. For multi-tenancy + /// use case, this flag should be turned on. #[online_config(skip)] pub strict: bool, pub foreground_read_priority: IOPriority, diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 8c3ca2c4116..dae61653f07 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -21,8 +21,9 @@ use crate::storage::{ }; #[derive(Debug, Error)] -/// Detailed errors for storage operations. This enum also unifies code for basic error -/// handling functionality in a single place instead of being spread out. +/// Detailed errors for storage operations. This enum also unifies code for +/// basic error handling functionality in a single place instead of being spread +/// out. pub enum ErrorInner { #[error("{0}")] Kv(#[from] kv::Error), @@ -177,8 +178,9 @@ pub enum ErrorHeaderKind { } impl ErrorHeaderKind { - /// TODO: This function is only used for bridging existing & legacy metric tags. - /// It should be removed once Coprocessor starts using new static metrics. + /// TODO: This function is only used for bridging existing & legacy metric + /// tags. It should be removed once Coprocessor starts using new static + /// metrics. pub fn get_str(&self) -> &'static str { match *self { ErrorHeaderKind::NotLeader => "not_leader", @@ -204,8 +206,8 @@ const SCHEDULER_IS_BUSY: &str = "scheduler is busy"; const GC_WORKER_IS_BUSY: &str = "gc worker is busy"; const DEADLINE_EXCEEDED: &str = "deadline is exceeded"; -/// Get the `ErrorHeaderKind` enum that corresponds to the error in the protobuf message. -/// Returns `ErrorHeaderKind::Other` if no match found. 
+/// Get the `ErrorHeaderKind` enum that corresponds to the error in the protobuf +/// message. Returns `ErrorHeaderKind::Other` if no match found. pub fn get_error_kind_from_header(header: &errorpb::Error) -> ErrorHeaderKind { if header.has_not_leader() { ErrorHeaderKind::NotLeader @@ -266,8 +268,8 @@ pub fn extract_region_error(res: &Result) -> Option { Some(err) } Err(Error(box ErrorInner::Closed)) => { - // TiKV is closing, return an RegionError to tell the client that this region is unavailable - // temporarily, the client should retry the request in other TiKVs. + // TiKV is closing, return an RegionError to tell the client that this region is + // unavailable temporarily, the client should retry the request in other TiKVs. let mut err = errorpb::Error::default(); err.set_message("TiKV is Closing".to_string()); Some(err) diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index e3d1507224b..0867c30fb31 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -68,8 +68,8 @@ impl TestEngineBuilder { } /// Register causal observer for RawKV API V2. - // TODO: `RocksEngine` is coupling with RawKV features including GC (compaction filter) & CausalObserver. - // Consider decoupling them. + // TODO: `RocksEngine` is coupling with RawKV features including GC (compaction + // filter) & CausalObserver. Consider decoupling them. fn register_causal_observer(engine: &mut RocksEngine) { let causal_ts_provider = Arc::new(causal_ts::tests::TestProvider::default()); let causal_ob = diff --git a/src/storage/lock_manager.rs b/src/storage/lock_manager.rs index 61d99f1a4dd..def756c921e 100644 --- a/src/storage/lock_manager.rs +++ b/src/storage/lock_manager.rs @@ -20,8 +20,9 @@ pub struct Lock { pub struct DiagnosticContext { /// The key we care about pub key: Vec, - /// This tag is used for aggregate related kv requests (eg. 
generated from same statement) - /// Currently it is the encoded SQL digest if the client is TiDB + /// This tag is used for aggregate related kv requests (eg. generated from + /// same statement) Currently it is the encoded SQL digest if the client + /// is TiDB pub resource_group_tag: Vec, } @@ -41,8 +42,8 @@ impl WaitTimeout { } } - /// Timeouts are encoded as i64s in protobufs where 0 means using default timeout. - /// Negative means no wait. + /// Timeouts are encoded as i64s in protobufs where 0 means using default + /// timeout. Negative means no wait. pub fn from_encoded(i: i64) -> Option { use std::cmp::Ordering::*; @@ -60,15 +61,18 @@ impl From for WaitTimeout { } } -/// `LockManager` manages transactions waiting for locks held by other transactions. -/// It has responsibility to handle deadlocks between transactions. +/// `LockManager` manages transactions waiting for locks held by other +/// transactions. It has responsibility to handle deadlocks between +/// transactions. pub trait LockManager: Clone + Send + 'static { /// Transaction with `start_ts` waits for `lock` released. /// - /// If the lock is released or waiting times out or deadlock occurs, the transaction - /// should be waken up and call `cb` with `pr` to notify the caller. + /// If the lock is released or waiting times out or deadlock occurs, the + /// transaction should be waken up and call `cb` with `pr` to notify the + /// caller. /// - /// If the lock is the first lock the transaction waits for, it won't result in deadlock. + /// If the lock is the first lock the transaction waits for, it won't result + /// in deadlock. fn wait_for( &self, start_ts: TimeStamp, @@ -80,7 +84,8 @@ pub trait LockManager: Clone + Send + 'static { diag_ctx: DiagnosticContext, ); - /// The locks with `lock_ts` and `hashes` are released, tries to wake up transactions. + /// The locks with `lock_ts` and `hashes` are released, tries to wake up + /// transactions. 
fn wake_up( &self, lock_ts: TimeStamp, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index aab89299641..6338525ab02 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2,40 +2,50 @@ // #[PerformanceCriticalPath] -//! This module contains TiKV's transaction layer. It lowers high-level, transactional -//! commands to low-level (raw key-value) interactions with persistent storage. +//! This module contains TiKV's transaction layer. It lowers high-level, +//! transactional commands to low-level (raw key-value) interactions with +//! persistent storage. //! -//! This module is further split into layers: [`txn`](txn) lowers transactional commands to -//! key-value operations on an MVCC abstraction. [`mvcc`](mvcc) is our MVCC implementation. -//! [`kv`](kv) is an abstraction layer over persistent storage. +//! This module is further split into layers: [`txn`](txn) lowers transactional +//! commands to key-value operations on an MVCC abstraction. [`mvcc`](mvcc) is +//! our MVCC implementation. [`kv`](kv) is an abstraction layer over persistent +//! storage. //! -//! Other responsibilities of this module are managing latches (see [`latch`](txn::latch)), deadlock -//! and wait handling (see [`lock_manager`](lock_manager)), sche -//! duling command execution (see -//! [`txn::scheduler`](txn::scheduler)), and handling commands from the raw and versioned APIs (in -//! the [`Storage`](Storage) struct). +//! Other responsibilities of this module are managing latches (see +//! [`latch`](txn::latch)), deadlock and wait handling (see +//! [`lock_manager`](lock_manager)), sche duling command execution (see +//! [`txn::scheduler`](txn::scheduler)), and handling commands from the raw and +//! versioned APIs (in the [`Storage`](Storage) struct). //! //! For more information about TiKV's transactions, see the [sig-txn docs](https://github.com/tikv/sig-transaction/tree/master/doc). //! //! Some important types are: //! -//! 
* the [`Engine`](kv::Engine) trait and related traits, which abstracts over underlying storage, -//! * the [`MvccTxn`](mvcc::txn::MvccTxn) struct, which is the primary object in the MVCC -//! implementation, -//! * the commands in the [`commands`](txn::commands) module, which are how each command is implemented, -//! * the [`Storage`](Storage) struct, which is the primary entry point for this module. +//! * the [`Engine`](kv::Engine) trait and related traits, which abstracts over +//! underlying storage, +//! * the [`MvccTxn`](mvcc::txn::MvccTxn) struct, which is the primary object in +//! the MVCC implementation, +//! * the commands in the [`commands`](txn::commands) module, which are how each +//! command is implemented, +//! * the [`Storage`](Storage) struct, which is the primary entry point for this +//! module. //! //! Related code: //! -//! * the [`kv`](crate::server::service::kv) module, which is the interface for TiKV's APIs, -//! * the [`lock_manager](crate::server::lock_manager), which takes part in lock and deadlock -//! management, -//! * [`gc_worker`](crate::server::gc_worker), which drives garbage collection of old values, -//! * the [`txn_types](::txn_types) crate, some important types for this module's interface, -//! * the [`kvproto`](::kvproto) crate, which defines TiKV's protobuf API and includes some -//! documentation of the commands implemented here, -//! * the [`test_storage`](::test_storage) crate, integration tests for this module, -//! * the [`engine_traits`](::engine_traits) crate, more detail of the engine abstraction. +//! * the [`kv`](crate::server::service::kv) module, which is the interface for +//! TiKV's APIs, +//! * the [`lock_manager](crate::server::lock_manager), which takes part in lock +//! and deadlock management, +//! * [`gc_worker`](crate::server::gc_worker), which drives garbage collection +//! of old values, +//! * the [`txn_types](::txn_types) crate, some important types for this +//! module's interface, +//! 
* the [`kvproto`](::kvproto) crate, which defines TiKV's protobuf API and +//! includes some documentation of the commands implemented here, +//! * the [`test_storage`](::test_storage) crate, integration tests for this +//! module, +//! * the [`engine_traits`](::engine_traits) crate, more detail of the engine +//! abstraction. pub mod config; pub mod config_manager; @@ -119,27 +129,31 @@ use crate::{ pub type Result = std::result::Result; pub type Callback = Box) + Send>; -/// [`Storage`](Storage) implements transactional KV APIs and raw KV APIs on a given [`Engine`]. -/// An [`Engine`] provides low level KV functionality. [`Engine`] has multiple implementations. -/// When a TiKV server is running, a [`RaftKv`](crate::server::raftkv::RaftKv) will be the -/// underlying [`Engine`] of [`Storage`]. The other two types of engines are for test purpose. +/// [`Storage`](Storage) implements transactional KV APIs and raw KV APIs on a +/// given [`Engine`]. An [`Engine`] provides low level KV functionality. +/// [`Engine`] has multiple implementations. When a TiKV server is running, a +/// [`RaftKv`](crate::server::raftkv::RaftKv) will be the underlying [`Engine`] +/// of [`Storage`]. The other two types of engines are for test purpose. /// -///[`Storage`] is reference counted and cloning [`Storage`] will just increase the reference counter. -/// Storage resources (i.e. threads, engine) will be released when all references are dropped. +/// [`Storage`] is reference counted and cloning [`Storage`] will just increase +/// the reference counter. Storage resources (i.e. threads, engine) will be +/// released when all references are dropped. /// -/// Notice that read and write methods may not be performed over full data in most cases, i.e. when -/// underlying engine is [`RaftKv`](crate::server::raftkv::RaftKv), -/// which limits data access in the range of a single region -/// according to specified `ctx` parameter. 
However, -/// [`unsafe_destroy_range`](crate::server::gc_worker::GcTask::UnsafeDestroyRange) is the only exception. -/// It's always performed on the whole TiKV. +/// Notice that read and write methods may not be performed over full data in +/// most cases, i.e. when underlying engine is +/// [`RaftKv`](crate::server::raftkv::RaftKv), which limits data access in the +/// range of a single region according to specified `ctx` parameter. However, +/// [`unsafe_destroy_range`](crate::server::gc_worker::GcTask:: +/// UnsafeDestroyRange) is the only exception. It's always performed on the +/// whole TiKV. /// -/// Operations of [`Storage`](Storage) can be divided into two types: MVCC operations and raw operations. -/// MVCC operations uses MVCC keys, which usually consist of several physical keys in different -/// CFs. In default CF and write CF, the key will be memcomparable-encoded and append the timestamp -/// to it, so that multiple versions can be saved at the same time. -/// Raw operations use raw keys, which are saved directly to the engine without memcomparable- -/// encoding and appending timestamp. +/// Operations of [`Storage`](Storage) can be divided into two types: MVCC +/// operations and raw operations. MVCC operations uses MVCC keys, which usually +/// consist of several physical keys in different CFs. In default CF and write +/// CF, the key will be memcomparable-encoded and append the timestamp to it, so +/// that multiple versions can be saved at the same time. Raw operations use raw +/// keys, which are saved directly to the engine without memcomparable- encoding +/// and appending timestamp. pub struct Storage { // TODO: Too many Arcs, would be slow when clone. engine: E, @@ -214,7 +228,7 @@ impl Drop for Storage { } macro_rules! 
check_key_size { - ($key_iter: expr, $max_key_size: expr, $callback: ident) => { + ($key_iter:expr, $max_key_size:expr, $callback:ident) => { for k in $key_iter { let key_size = k.len(); if key_size > $max_key_size { @@ -332,7 +346,8 @@ impl Storage { unsafe { with_tls_engine(f) } } - /// Check the given raw kv CF name. If the given cf is empty, CF_DEFAULT will be returned. + /// Check the given raw kv CF name. If the given cf is empty, CF_DEFAULT + /// will be returned. // TODO: refactor to use `Api` parameter. fn rawkv_cf(cf: &str, api_version: ApiVersion) -> Result { match api_version { @@ -360,8 +375,10 @@ impl Storage { /// Check if key range is valid /// - /// - If `reverse` is true, `end_key` is less than `start_key`. `end_key` is the lower bound. - /// - If `reverse` is false, `end_key` is greater than `start_key`. `end_key` is the upper bound. + /// - If `reverse` is true, `end_key` is less than `start_key`. `end_key` is + /// the lower bound. + /// - If `reverse` is false, `end_key` is greater than `start_key`. + /// `end_key` is the upper bound. fn check_key_ranges(ranges: &[KeyRange], reverse: bool) -> bool { let ranges_len = ranges.len(); for i in 0..ranges_len { @@ -415,7 +432,8 @@ impl Storage { /// * Request of V2 with legal prefix. /// See the following for detail: /// * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. - /// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, enum APIVersion. + /// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, + /// enum APIVersion. // TODO: refactor to use `Api` parameter. fn check_api_version( storage_api_version: ApiVersion, @@ -696,9 +714,11 @@ impl Storage { } } - /// Get values of a set of keys with separate context from a snapshot, return a list of `Result`s. + /// Get values of a set of keys with separate context from a snapshot, + /// return a list of `Result`s. 
/// - /// Only writes that are committed before their respective `start_ts` are visible. + /// Only writes that are committed before their respective `start_ts` are + /// visible. pub fn batch_get_command>, Statistics)>>( &self, requests: Vec, @@ -713,15 +733,17 @@ impl Storage { let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; - // The resource tags of these batched requests are not the same, and it is quite expensive - // to distinguish them, so we can find random one of them as a representative. + // The resource tags of these batched requests are not the same, and it is quite + // expensive to distinguish them, so we can find random one of them as a + // representative. let rand_index = rand::thread_rng().gen_range(0, requests.len()); let rand_ctx = requests[rand_index].get_context(); let rand_key = requests[rand_index].get_key().to_vec(); let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(rand_ctx, vec![(rand_key.clone(), rand_key)]); - // Unset the TLS tracker because the future below does not belong to any specific request + // Unset the TLS tracker because the future below does not belong to any + // specific request clear_tls_tracker_token(); let res = self.read_pool.spawn_handle( async move { @@ -1044,9 +1066,10 @@ impl Storage { } } - /// Scan keys in [`start_key`, `end_key`) up to `limit` keys from the snapshot. - /// If `reverse_scan` is true, it scans [`end_key`, `start_key`) in descending order. - /// If `end_key` is `None`, it means the upper bound or the lower bound if reverse scan is unbounded. + /// Scan keys in [`start_key`, `end_key`) up to `limit` keys from the + /// snapshot. If `reverse_scan` is true, it scans [`end_key`, + /// `start_key`) in descending order. If `end_key` is `None`, it means + /// the upper bound or the lower bound if reverse scan is unbounded. /// /// Only writes committed before `start_ts` are visible. 
pub fn scan( @@ -1270,15 +1293,16 @@ impl Storage { .inc(); // Do not check_api_version in scan_lock, to be compatible with TiDB gc-worker, - // which resolves locks on regions, and boundary of regions will be out of range of TiDB keys. + // which resolves locks on regions, and boundary of regions will be out of range + // of TiDB keys. let command_duration = tikv_util::time::Instant::now(); concurrency_manager.update_max_ts(max_ts); let begin_instant = Instant::now(); - // TODO: Though it's very unlikely to find a conflicting memory lock here, it's not - // a good idea to return an error to the client, making the GC fail. A better - // approach is to wait for these locks to be unlocked. + // TODO: Though it's very unlikely to find a conflicting memory lock here, it's + // not a good idea to return an error to the client, making the GC fail. A + // better approach is to wait for these locks to be unlocked. concurrency_manager.read_range_check( start_key.as_ref(), end_key.as_ref(), @@ -1364,7 +1388,8 @@ impl Storage { } } - // The entry point of the storage scheduler. Not only transaction commands need to access keys serially. + // The entry point of the storage scheduler. Not only transaction commands need + // to access keys serially. pub fn sched_txn_command( &self, cmd: TypedCommand, @@ -1423,11 +1448,13 @@ impl Storage { /// Delete all keys in the range [`start_key`, `end_key`). /// - /// All keys in the range will be deleted permanently regardless of their timestamps. - /// This means that deleted keys will not be retrievable by specifying an older timestamp. - /// If `notify_only` is set, the data will not be immediately deleted, but the operation will - /// still be replicated via Raft. This is used to notify that the data will be deleted by - /// [`unsafe_destroy_range`](crate::server::gc_worker::GcTask::UnsafeDestroyRange) soon. + /// All keys in the range will be deleted permanently regardless of their + /// timestamps. 
This means that deleted keys will not be retrievable by + /// specifying an older timestamp. If `notify_only` is set, the data will + /// not be immediately deleted, but the operation will still be replicated + /// via Raft. This is used to notify that the data will be deleted by + /// [`unsafe_destroy_range`](crate::server::gc_worker::GcTask:: + /// UnsafeDestroyRange) soon. pub fn delete_range( &self, ctx: Context, @@ -1502,7 +1529,8 @@ impl Storage { let begin_instant = Instant::now(); let mut stats = Statistics::default(); let key = F::encode_raw_key_owned(key, None); - // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. + // Keys pass to `tls_collect_query` should be encoded, to get correct keys for + // region split. tls_collect_query( ctx.get_region_id(), ctx.get_peer(), @@ -1555,8 +1583,9 @@ impl Storage { let priority_tag = get_priority_tag(priority); let api_version = self.api_version; - // The resource tags of these batched requests are not the same, and it is quite expensive - // to distinguish them, so we can find random one of them as a representative. + // The resource tags of these batched requests are not the same, and it is quite + // expensive to distinguish them, so we can find random one of them as a + // representative. let rand_index = rand::thread_rng().gen_range(0, gets.len()); let rand_ctx = gets[rand_index].get_context(); let rand_key = gets[rand_index].get_key().to_vec(); @@ -1590,9 +1619,9 @@ impl Storage { for (mut req, id) in gets.into_iter().zip(ids) { let ctx = req.take_context(); let key = F::encode_raw_key_owned(req.take_key(), None); - // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. - // Don't place in loop of `snaps`, otherwise `snap.wait` may run in another thread, - // and cause the `thread-local` statistics unstable for test. + // Keys pass to `tls_collect_query` should be encoded, to get correct keys for + // region split. 
Don't place in loop of `snaps`, otherwise `snap.wait` may run + // in another thread, and cause the `thread-local` statistics unstable for test. tls_collect_query( ctx.get_region_id(), ctx.get_peer(), @@ -1890,7 +1919,8 @@ impl Storage { } /// Delete a raw key from the storage. - /// In API V2, data is "logical" deleted, to enable CDC of delete operations. + /// In API V2, data is "logical" deleted, to enable CDC of delete + /// operations. pub fn raw_delete( &self, ctx: Context, @@ -1921,8 +1951,9 @@ impl Storage { } /// Delete all raw keys in [`start_key`, `end_key`). - /// Note that in API V2, data is still "physical" deleted, as "logical" delete for a range will be quite expensive. - /// Notification of range delete operations will be through a special channel (unimplemented yet). + /// Note that in API V2, data is still "physical" deleted, as "logical" + /// delete for a range will be quite expensive. Notification of range delete + /// operations will be through a special channel (unimplemented yet). pub fn raw_delete_range( &self, ctx: Context, @@ -1959,7 +1990,8 @@ impl Storage { } /// Delete some raw keys in a batch. - /// In API V2, data is "logical" deleted, to enable CDC of delete operations. + /// In API V2, data is "logical" deleted, to enable CDC of delete + /// operations. pub fn raw_batch_delete( &self, ctx: Context, @@ -1995,14 +2027,16 @@ impl Storage { /// Scan raw keys in a range. /// - /// If `reverse_scan` is false, the range is [`start_key`, `end_key`); otherwise, the range is - /// [`end_key`, `start_key`) and it scans from `start_key` and goes backwards. If `end_key` is `None`, it - /// means unbounded. + /// If `reverse_scan` is false, the range is [`start_key`, `end_key`); + /// otherwise, the range is [`end_key`, `start_key`) and it scans from + /// `start_key` and goes backwards. If `end_key` is `None`, it means + /// unbounded. /// /// This function scans at most `limit` keys. 
/// /// If `key_only` is true, the value - /// corresponding to the key will not be read out. Only scanned keys will be returned. + /// corresponding to the key will not be read out. Only scanned keys will be + /// returned. pub fn raw_scan( &self, ctx: Context, @@ -2048,7 +2082,8 @@ impl Storage { let start_key = F::encode_raw_key_owned(start_key, None); let end_key = end_key.map(|k| F::encode_raw_key_owned(k, None)); - // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. + // Keys pass to `tls_collect_query` should be encoded, to get correct keys for + // region split. tls_collect_query( ctx.get_region_id(), ctx.get_peer(), @@ -2324,7 +2359,8 @@ impl Storage { let begin_instant = Instant::now(); let mut stats = Statistics::default(); let key = F::encode_raw_key_owned(key, None); - // Keys pass to `tls_collect_query` should be encoded, to get correct keys for region split. + // Keys pass to `tls_collect_query` should be encoded, to get correct keys for + // region split. tls_collect_query( ctx.get_region_id(), ctx.get_peer(), @@ -3658,7 +3694,7 @@ mod tests { None, cfs_opts, cache.is_some(), - None, /*io_rate_limiter*/ + None, // io_rate_limiter ) } .unwrap(); @@ -4599,7 +4635,8 @@ mod tests { #[test] fn test_raw_v2_multi_versions() { - // Test update on the same key to verify multi-versions implementation of RawKV V2. + // Test update on the same key to verify multi-versions implementation of RawKV + // V2. let test_data = vec![Some(b"v1"), Some(b"v2"), None, Some(b"v3")]; let k = b"r\0k".to_vec(); @@ -5502,7 +5539,8 @@ mod tests { false ); - // if end_key is omitted, the next start_key is used instead. so, false is returned. + // if end_key is omitted, the next start_key is used instead. so, false is + // returned. 
let ranges = make_ranges(vec![ (b"c".to_vec(), vec![]), (b"b".to_vec(), vec![]), @@ -6386,8 +6424,8 @@ mod tests { }, ); - // We should be able to resolve all locks for transaction ts=100 when there are this - // many locks. + // We should be able to resolve all locks for transaction ts=100 when there are + // this many locks. let scanned_locks_coll = vec![ 1, RESOLVE_LOCK_BATCH_SIZE, @@ -6609,7 +6647,8 @@ mod tests { ) }; - // `advise_ttl` = 90, which is less than current ttl 100. The lock's ttl will remains 100. + // `advise_ttl` = 90, which is less than current ttl 100. The lock's ttl will + // remains 100. storage .sched_txn_command( commands::TxnHeartBeat::new(k.clone(), 10.into(), 90, Context::default()), @@ -6618,8 +6657,8 @@ mod tests { .unwrap(); rx.recv().unwrap(); - // `advise_ttl` = 110, which is greater than current ttl. The lock's ttl will be updated to - // 110. + // `advise_ttl` = 110, which is greater than current ttl. The lock's ttl will be + // updated to 110. storage .sched_txn_command( commands::TxnHeartBeat::new(k.clone(), 10.into(), 110, Context::default()), @@ -6684,8 +6723,8 @@ mod tests { assert_eq!(cm.max_ts(), ts(9, 1)); - // No lock and no commit info. If specified rollback_if_not_exist, the key will be rolled - // back. + // No lock and no commit info. If specified rollback_if_not_exist, the key will + // be rolled back. storage .sched_txn_command( commands::CheckTxnStatus::new( @@ -7959,9 +7998,9 @@ mod tests { } // This is one of the series of tests to test overlapped timestamps. - // Overlapped ts means there is a rollback record and a commit record with the same ts. - // In this test we check that if rollback happens before commit, then they should not have overlapped ts, - // which is an expected property. + // Overlapped ts means there is a rollback record and a commit record with the + // same ts. 
In this test we check that if rollback happens before commit, then + // they should not have overlapped ts, which is an expected property. #[test] fn test_overlapped_ts_rollback_before_prewrite() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -8114,8 +8153,9 @@ mod tests { .unwrap(); assert!(rx.recv().unwrap() > 10); } - // this test shows that the scheduler take `response_policy` in `WriteResult` serious, - // ie. call the callback at expected stage when writing to the engine + // this test shows that the scheduler take `response_policy` in `WriteResult` + // serious, ie. call the callback at expected stage when writing to the + // engine #[test] fn test_scheduler_response_policy() { struct Case { @@ -8279,8 +8319,8 @@ mod tests { .unwrap(); let (tx, rx) = channel(); - // Pessimistically lock k1, k2, k3, k4, after the pessimistic retry k2 is no longer needed - // and the pessimistic lock on k2 is left. + // Pessimistically lock k1, k2, k3, k4, after the pessimistic retry k2 is no + // longer needed and the pessimistic lock on k2 is left. storage .sched_txn_command( new_acquire_pessimistic_lock_command( @@ -8352,7 +8392,8 @@ mod tests { rx.recv().unwrap(); // Pessimistically rollback the k2 lock. - // Non lite lock resolve on k1 and k2, there should no errors as lock on k2 is pessimistic type. + // Non lite lock resolve on k1 and k2, there should no errors as lock on k2 is + // pessimistic type. must_rollback(&storage.engine, b"k2", 10, false); let mut temp_map = HashMap::default(); temp_map.insert(10.into(), 20.into()); @@ -8489,7 +8530,8 @@ mod tests { // Test check_api_version. // See the following for detail: // * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. - // * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, enum APIVersion. + // * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, + // enum APIVersion. 
#[test] fn test_check_api_version() { use error_code::storage::*; @@ -8871,7 +8913,8 @@ mod tests { } let (tx, rx) = channel(); - // The written in-memory pessimistic lock should be visible, so the new lock request should fail. + // The written in-memory pessimistic lock should be visible, so the new lock + // request should fail. storage .sched_txn_command( new_acquire_pessimistic_lock_command( @@ -8886,7 +8929,8 @@ mod tests { }), ) .unwrap(); - // DummyLockManager just drops the callback, so it will fail to receive anything. + // DummyLockManager just drops the callback, so it will fail to receive + // anything. assert!(rx.recv().is_err()); let (tx, rx) = channel(); @@ -8943,7 +8987,8 @@ mod tests { ) .unwrap(); rx.recv().unwrap(); - // When disabling in-memory pessimistic lock, the lock map should remain unchanged. + // When disabling in-memory pessimistic lock, the lock map should remain + // unchanged. assert!(txn_ext.pessimistic_locks.read().is_empty()); let (tx, rx) = channel(); diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index eb788cb4dd3..d715ec598c2 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -28,8 +28,9 @@ use crate::storage::mvcc::{Lock, LockType, WriteRef, WriteType}; const PHYSICAL_SHIFT_BITS: usize = 18; const SAFE_POINT_WINDOW: usize = 120; -// When leader broadcasts a ComputeHash command to followers, it's possible that the safe point -// becomes stale when the command reaches followers. So use a 2 minutes window to reduce this. +// When leader broadcasts a ComputeHash command to followers, it's possible that +// the safe point becomes stale when the command reaches followers. So use a 2 +// minutes window to reduce this. fn get_safe_point_for_check(mut safe_point: u64) -> u64 { safe_point >>= PHYSICAL_SHIFT_BITS; safe_point += (SAFE_POINT_WINDOW * 1000) as u64; // 120s * 1000ms/s. 
diff --git a/src/storage/mvcc/reader/mod.rs b/src/storage/mvcc/reader/mod.rs index 440a1650ca3..2e7d20ccf2b 100644 --- a/src/storage/mvcc/reader/mod.rs +++ b/src/storage/mvcc/reader/mod.rs @@ -24,23 +24,25 @@ pub enum NewerTsCheckState { NotMetYet, } -/// The result of `get_txn_commit_record`, which is used to get the status of a specified -/// transaction from write cf. +/// The result of `get_txn_commit_record`, which is used to get the status of a +/// specified transaction from write cf. #[derive(Debug)] pub enum TxnCommitRecord { - /// The commit record of the given transaction is not found. But it's possible that there's - /// another transaction's commit record, whose `commit_ts` equals to the current transaction's - /// `start_ts`. That kind of record will be returned via the `overlapped_write` field. - /// In this case, if the current transaction is to be rolled back, the `overlapped_write` must not - /// be overwritten. + /// The commit record of the given transaction is not found. But it's + /// possible that there's another transaction's commit record, whose + /// `commit_ts` equals to the current transaction's `start_ts`. That + /// kind of record will be returned via the `overlapped_write` field. + /// In this case, if the current transaction is to be rolled back, the + /// `overlapped_write` must not be overwritten. None { overlapped_write: Option, }, /// Found the transaction's write record. SingleRecord { commit_ts: TimeStamp, write: Write }, - /// The transaction's status is found in another transaction's record's `overlapped_rollback` - /// field. This may happen when the current transaction's `start_ts` is the same as the - /// `commit_ts` of another transaction on this key. + /// The transaction's status is found in another transaction's record's + /// `overlapped_rollback` field. This may happen when the current + /// transaction's `start_ts` is the same as the `commit_ts` of another + /// transaction on this key. 
OverlappedRollback { commit_ts: TimeStamp }, } diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index a9ce84aada7..434d0948310 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -50,8 +50,8 @@ impl PointGetterBuilder { self } - /// Set whether values of the user key should be omitted. When `omit_value` is `true`, the - /// length of returned value will be 0. + /// Set whether values of the user key should be omitted. When `omit_value` + /// is `true`, the length of returned value will be 0. /// /// Previously this option is called `key_only`. /// @@ -93,8 +93,8 @@ impl PointGetterBuilder { self } - /// Check whether there is data with newer ts. The result of `met_newer_ts_data` is Unknown - /// if this option is not set. + /// Check whether there is data with newer ts. The result of + /// `met_newer_ts_data` is Unknown if this option is not set. /// /// Default is false. #[inline] @@ -132,8 +132,9 @@ impl PointGetterBuilder { } } -/// This struct can be used to get the value of user keys. Internally, rollbacks are ignored and -/// smaller version will be tried. If the isolation level is Si, locks will be checked first. +/// This struct can be used to get the value of user keys. Internally, rollbacks +/// are ignored and smaller version will be tried. If the isolation level is Si, +/// locks will be checked first. /// /// Use `PointGetterBuilder` to build `PointGetter`. pub struct PointGetter { @@ -169,7 +170,8 @@ impl PointGetter { fail_point!("point_getter_get"); if need_check_locks(self.isolation_level) { - // Check locks that signal concurrent writes for `Si` or more recent writes for `RcCheckTs`. + // Check locks that signal concurrent writes for `Si` or more recent writes for + // `RcCheckTs`. if let Some(lock) = self.load_and_check_lock(user_key)? 
{ return self.load_data_from_lock(user_key, lock); } @@ -178,13 +180,14 @@ impl PointGetter { self.load_data(user_key) } - /// Get a lock of a user key in the lock CF. If lock exists, it will be checked to - /// see whether it conflicts with the given `ts` and return an error if so. If the - /// lock is in access_locks, it will be returned and caller can read through it. + /// Get a lock of a user key in the lock CF. If lock exists, it will be + /// checked to see whether it conflicts with the given `ts` and return + /// an error if so. If the lock is in access_locks, it will be returned + /// and caller can read through it. /// - /// In common cases we expect to get nothing in lock cf. Using a `get_cf` instead of `seek` - /// is fast in such cases due to no need for RocksDB to continue move and skip deleted entries - /// until find a user key. + /// In common cases we expect to get nothing in lock cf. Using a `get_cf` + /// instead of `seek` is fast in such cases due to no need for RocksDB + /// to continue move and skip deleted entries until find a user key. fn load_and_check_lock(&mut self, user_key: &Key) -> Result> { self.statistics.lock.get += 1; let lock_value = self.snapshot.get_cf(CF_LOCK, user_key)?; @@ -216,8 +219,8 @@ impl PointGetter { /// Load the value. /// - /// First, a correct version info in the Write CF will be sought. Then, value will be loaded - /// from Default CF if necessary. + /// First, a correct version info in the Write CF will be sought. Then, + /// value will be loaded from Default CF if necessary. fn load_data(&mut self, user_key: &Key) -> Result> { let mut use_near_seek = false; let mut seek_key = user_key.clone(); @@ -323,9 +326,10 @@ impl PointGetter { /// Load the value from default CF. /// - /// We assume that mostly the keys given to batch get keys are not very close to each other. - /// `near_seek` will likely fall back to `seek` in such scenario, which takes 2x time - /// compared to `get_cf`. 
Thus we use `get_cf` directly here. + /// We assume that mostly the keys given to batch get keys are not very + /// close to each other. `near_seek` will likely fall back to `seek` in + /// such scenario, which takes 2x time compared to `get_cf`. Thus we use + /// `get_cf` directly here. fn load_data_from_default_cf( &mut self, write_start_ts: TimeStamp, @@ -350,7 +354,8 @@ impl PointGetter { /// Load the value from the lock. /// - /// The lock belongs to a committed transaction and its commit_ts <= read's start_ts. + /// The lock belongs to a committed transaction and its commit_ts <= read's + /// start_ts. fn load_data_from_lock(&mut self, user_key: &Key, lock: Lock) -> Result> { debug_assert!(lock.ts < self.ts && lock.min_commit_ts <= self.ts); match lock.lock_type { @@ -373,8 +378,8 @@ impl PointGetter { } LockType::Delete => Ok(None), LockType::Lock | LockType::Pessimistic => { - // Only when fails to call `Lock::check_ts_conflict()`, the function is called, so it's - // unreachable here. + // Only when fails to call `Lock::check_ts_conflict()`, the function is called, + // so it's unreachable here. unreachable!() } } @@ -552,8 +557,8 @@ mod tests { engine } - /// Builds a sample engine that contains transactions on the way and some short - /// values embedded in the write CF. The data is as follows: + /// Builds a sample engine that contains transactions on the way and some + /// short values embedded in the write CF. The data is as follows: /// DELETE bar (start at 4) /// PUT bar -> barval (commit at 3) /// PUT foo1 -> foo1vv... (commit at 3) @@ -919,8 +924,8 @@ mod tests { must_get_err(&mut getter, key); must_rollback(&engine, key, 40, false); - // Should get the latest committed value if there is a primary lock with a ts less than - // the latest Write's commit_ts. + // Should get the latest committed value if there is a primary lock with a ts + // less than the latest Write's commit_ts. 
// // write.start_ts(10) < primary_lock.start_ts(15) < write.commit_ts(20) must_acquire_pessimistic_lock(&engine, key, key, 15, 50); @@ -1016,7 +1021,7 @@ mod tests { 100, 80.into(), 1, - 100.into(), /* min_commit_ts */ + 100.into(), // min_commit_ts TimeStamp::default(), false, Assertion::None, @@ -1229,7 +1234,8 @@ mod tests { must_get_value(&mut batch_getter, key2, val22); must_get_err(&mut batch_getter, key3); - // Test batch point get. Error should not be reported if the lock type is rollback or lock. + // Test batch point get. Error should not be reported if the lock type is + // rollback or lock. let mut batch_getter_ok = new_point_getter_with_iso(&engine, 70.into(), IsolationLevel::RcCheckTs); must_get_value(&mut batch_getter_ok, key4, val4); diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 614f8acb147..377d2c94022 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -20,15 +20,16 @@ use crate::storage::{ }, }; -/// Read from an MVCC snapshot, i.e., a logical view of the database at a specific timestamp (the -/// start_ts). +/// Read from an MVCC snapshot, i.e., a logical view of the database at a +/// specific timestamp (the start_ts). /// /// This represents the view of the database from a single transaction. /// -/// Confusingly, there are two meanings of the word 'snapshot' here. In the name of the struct, -/// 'snapshot' means an mvcc snapshot. In the type parameter bound (of `S`), 'snapshot' means a view -/// of the underlying storage engine at a given point in time. This latter snapshot will include -/// values for keys at multiple timestamps. +/// Confusingly, there are two meanings of the word 'snapshot' here. In the name +/// of the struct, 'snapshot' means an mvcc snapshot. In the type parameter +/// bound (of `S`), 'snapshot' means a view of the underlying storage engine at +/// a given point in time. 
This latter snapshot will include values for keys at +/// multiple timestamps. pub struct SnapshotReader { pub reader: MvccReader, pub start_ts: TimeStamp, @@ -123,11 +124,12 @@ pub struct MvccReader { lock_cursor: Option>, write_cursor: Option>, - /// None means following operations are performed on a single user key, i.e., - /// different versions of the same key. It can use prefix seek to speed up reads - /// from the write-cf. + /// None means following operations are performed on a single user key, + /// i.e., different versions of the same key. It can use prefix seek to + /// speed up reads from the write-cf. scan_mode: Option, - // Records the current key for prefix seek. Will Reset the write cursor when switching to another key. + // Records the current key for prefix seek. Will Reset the write cursor when switching to + // another key. current_key: Option, fill_cache: bool, @@ -266,28 +268,31 @@ impl MvccReader { } /// Return: - /// (commit_ts, write_record) of the write record for `key` committed before or equal to`ts` - /// Post Condition: - /// leave the write_cursor at the first record which key is less or equal to the `ts` encoded version of `key` + /// (commit_ts, write_record) of the write record for `key` committed + /// before or equal to`ts` Post Condition: + /// leave the write_cursor at the first record which key is less or equal + /// to the `ts` encoded version of `key` pub fn seek_write(&mut self, key: &Key, ts: TimeStamp) -> Result> { // Get the cursor for write record // - // When it switches to another key in prefix seek mode, creates a new cursor for it - // because the current position of the cursor is seldom around `key`. + // When it switches to another key in prefix seek mode, creates a new cursor for + // it because the current position of the cursor is seldom around `key`. 
if self.scan_mode.is_none() && self.current_key.as_ref().map_or(true, |k| k != key) { self.current_key = Some(key.clone()); self.write_cursor.take(); } self.create_write_cursor()?; let cursor = self.write_cursor.as_mut().unwrap(); - // find a `ts` encoded key which is less than the `ts` encoded version of the `key` + // find a `ts` encoded key which is less than the `ts` encoded version of the + // `key` let found = cursor.near_seek(&key.clone().append_ts(ts), &mut self.statistics.write)?; if !found { return Ok(None); } let write_key = cursor.key(&mut self.statistics.write); let commit_ts = Key::decode_ts_from(write_key)?; - // check whether the found written_key's "real key" part equals the `key` we want to find + // check whether the found written_key's "real key" part equals the `key` we + // want to find if !Key::is_user_key_eq(write_key, key.as_encoded()) { return Ok(None); } @@ -296,17 +301,19 @@ impl MvccReader { Ok(Some((commit_ts, write))) } - /// Gets the value of the specified key's latest version before specified `ts`. + /// Gets the value of the specified key's latest version before specified + /// `ts`. /// - /// It tries to ensure the write record's `gc_fence`'s ts, if any, greater than specified - /// `gc_fence_limit`. Pass `None` to `gc_fence_limit` to skip the check. - /// The caller must guarantee that there's no other `PUT` or `DELETE` versions whose `commit_ts` - /// is between the found version and the provided `gc_fence_limit` (`gc_fence_limit` is - /// inclusive). + /// It tries to ensure the write record's `gc_fence`'s ts, if any, greater + /// than specified `gc_fence_limit`. Pass `None` to `gc_fence_limit` to + /// skip the check. The caller must guarantee that there's no other `PUT` or + /// `DELETE` versions whose `commit_ts` is between the found version and + /// the provided `gc_fence_limit` (`gc_fence_limit` is inclusive). /// - /// For transactional reads, the `gc_fence_limit` must be provided to ensure the result is - /// correct. 
Generally, it should be the read_ts of the current transaction, which might be - /// different from the `ts` passed to this function. + /// For transactional reads, the `gc_fence_limit` must be provided to ensure + /// the result is correct. Generally, it should be the read_ts of the + /// current transaction, which might be different from the `ts` passed to + /// this function. /// /// Note that this function does not check for locks on `key`. fn get( @@ -321,15 +328,17 @@ impl MvccReader { }) } - /// Gets the write record of the specified key's latest version before specified `ts`. - /// It tries to ensure the write record's `gc_fence`'s ts, if any, greater than specified - /// `gc_fence_limit`. Pass `None` to `gc_fence_limit` to skip the check. - /// The caller must guarantee that there's no other `PUT` or `DELETE` versions whose `commit_ts` - /// is between the found version and the provided `gc_fence_limit` (`gc_fence_limit` is + /// Gets the write record of the specified key's latest version before + /// specified `ts`. It tries to ensure the write record's `gc_fence`'s + /// ts, if any, greater than specified `gc_fence_limit`. Pass `None` to + /// `gc_fence_limit` to skip the check. The caller must guarantee that + /// there's no other `PUT` or `DELETE` versions whose `commit_ts` is between + /// the found version and the provided `gc_fence_limit` (`gc_fence_limit` is /// inclusive). - /// For transactional reads, the `gc_fence_limit` must be provided to ensure the result is - /// correct. Generally, it should be the read_ts of the current transaction, which might be - /// different from the `ts` passed to this function. + /// For transactional reads, the `gc_fence_limit` must be provided to ensure + /// the result is correct. Generally, it should be the read_ts of the + /// current transaction, which might be different from the `ts` passed to + /// this function. 
pub fn get_write( &mut self, key: &Key, @@ -341,8 +350,8 @@ impl MvccReader { .map(|(w, _)| w)) } - /// Gets the write record of the specified key's latest version before specified `ts`, and - /// additionally the write record's `commit_ts`, if any. + /// Gets the write record of the specified key's latest version before + /// specified `ts`, and additionally the write record's `commit_ts`, if any. /// /// See also [`MvccReader::get_write`]. pub fn get_write_with_commit_ts( @@ -375,8 +384,8 @@ impl MvccReader { } fn get_txn_commit_record(&mut self, key: &Key, start_ts: TimeStamp) -> Result { - // It's possible a txn with a small `start_ts` has a greater `commit_ts` than a txn with - // a greater `start_ts` in pessimistic transaction. + // It's possible a txn with a small `start_ts` has a greater `commit_ts` than a + // txn with a greater `start_ts` in pessimistic transaction. // I.e., txn_1.commit_ts > txn_2.commit_ts > txn_2.start_ts > txn_1.start_ts. // // Scan all the versions from `TimeStamp::max()` to `start_ts`. @@ -462,11 +471,12 @@ impl MvccReader { Ok(None) } - /// Scan locks that satisfies `filter(lock)` returns true, from the given start key `start`. - /// At most `limit` locks will be returned. If `limit` is set to `0`, it means unlimited. + /// Scan locks that satisfies `filter(lock)` returns true, from the given + /// start key `start`. At most `limit` locks will be returned. If `limit` is + /// set to `0`, it means unlimited. /// - /// The return type is `(locks, is_remain)`. `is_remain` indicates whether there MAY be - /// remaining locks that can be scanned. + /// The return type is `(locks, is_remain)`. `is_remain` indicates whether + /// there MAY be remaining locks that can be scanned. 
pub fn scan_locks( &mut self, start: Option<&Key>, @@ -505,7 +515,8 @@ impl MvccReader { cursor.next(&mut self.statistics.lock); } self.statistics.lock.processed_keys += locks.len(); - // If we reach here, `cursor.valid()` is `false`, so there MUST be no more locks. + // If we reach here, `cursor.valid()` is `false`, so there MUST be no more + // locks. Ok((locks, false)) } @@ -1068,9 +1079,10 @@ pub mod tests { let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); - // Let's assume `50_45 PUT` means a commit version with start ts is 45 and commit ts - // is 50. - // Commit versions: [50_45 PUT, 45_40 PUT, 40_35 PUT, 30_25 PUT, 20_20 Rollback, 10_1 PUT, 5_5 Rollback]. + // Let's assume `50_45 PUT` means a commit version with start ts is 45 and + // commit ts is 50. + // Commit versions: [50_45 PUT, 45_40 PUT, 40_35 PUT, 30_25 PUT, 20_20 Rollback, + // 10_1 PUT, 5_5 Rollback]. let key = Key::from_raw(k); let overlapped_write = reader .get_txn_commit_record(&key, 55.into()) @@ -1078,8 +1090,8 @@ pub mod tests { .unwrap_none(); assert!(overlapped_write.is_none()); - // When no such record is found but a record of another txn has a write record with - // its commit_ts equals to current start_ts, it + // When no such record is found but a record of another txn has a write record + // with its commit_ts equals to current start_ts, it let overlapped_write = reader .get_txn_commit_record(&key, 50.into()) .unwrap() @@ -1234,9 +1246,10 @@ pub mod tests { engine.prewrite(m, k, 23); engine.commit(k, 23, 25); - // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit ts - // is 2. - // Commit versions: [25_23 PUT, 20_10 PUT, 17_15 PUT, 7_7 Rollback, 5_1 PUT, 3_3 Rollback]. + // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit + // ts is 2. + // Commit versions: [25_23 PUT, 20_10 PUT, 17_15 PUT, 7_7 Rollback, 5_1 PUT, 3_3 + // Rollback]. 
let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, None, false); @@ -1383,10 +1396,10 @@ pub mod tests { let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); - // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit ts - // is 2. - // Commit versions: [21_17 LOCK, 20_18 PUT, 15_13 LOCK, 14_12 PUT, 9_8 DELETE, 7_6 LOCK, - // 5_5 Rollback, 2_1 PUT]. + // Let's assume `2_1 PUT` means a commit version with start ts is 1 and commit + // ts is 2. + // Commit versions: [21_17 LOCK, 20_18 PUT, 15_13 LOCK, 14_12 PUT, 9_8 DELETE, + // 7_6 LOCK, 5_5 Rollback, 2_1 PUT]. let key = Key::from_raw(k); assert!(reader.get_write(&key, 1.into(), None).unwrap().is_none()); @@ -1947,7 +1960,8 @@ pub mod tests { } } - // Must return Oldvalue::None when prev_write_loaded is true and prev_write is None. + // Must return Oldvalue::None when prev_write_loaded is true and prev_write is + // None. let engine = TestEngineBuilder::new().build().unwrap(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); diff --git a/src/storage/mvcc/reader/scanner/backward.rs b/src/storage/mvcc/reader/scanner/backward.rs index 0b20c94a819..7e3d677ea52 100644 --- a/src/storage/mvcc/reader/scanner/backward.rs +++ b/src/storage/mvcc/reader/scanner/backward.rs @@ -22,11 +22,11 @@ use crate::storage::{ // RocksDB, so don't set REVERSE_SEEK_BOUND too small. const REVERSE_SEEK_BOUND: u64 = 16; -/// This struct can be used to scan keys starting from the given user key in the reverse order -/// (less than). +/// This struct can be used to scan keys starting from the given user key in the +/// reverse order (less than). /// -/// Internally, for each key, rollbacks are ignored and smaller version will be tried. If the -/// isolation level is SI, locks will be checked first. 
+/// Internally, for each key, rollbacks are ignored and smaller version will be +/// tried. If the isolation level is SI, locks will be checked first. /// /// Use `ScannerBuilder` to build `BackwardKvScanner`. pub struct BackwardKvScanner { @@ -81,8 +81,8 @@ impl BackwardKvScanner { // TODO: `seek_to_last` is better, however it has performance issues currently. // TODO: We have no guarantee about whether or not the upper_bound has a // timestamp suffix, so currently it is not safe to change write_cursor's - // reverse_seek to seek_for_prev. However in future, once we have different types - // for them, this can be done safely. + // reverse_seek to seek_for_prev. However in future, once we have different + // types for them, this can be done safely. self.write_cursor.reverse_seek( self.cfg.upper_bound.as_ref().unwrap(), &mut self.statistics.write, @@ -131,9 +131,9 @@ impl BackwardKvScanner { let write_user_key = Key::truncate_ts_for(wk)?; match write_user_key.cmp(lk) { Ordering::Less => { - // We are scanning from largest user key to smallest user key, so this - // indicate that we meet a lock first, thus its corresponding write - // does not exist. + // We are scanning from largest user key to smallest user key, so + // this indicate that we meet a lock first, thus its corresponding + // write does not exist. (lk, false, true) } Ordering::Greater => { @@ -145,8 +145,8 @@ impl BackwardKvScanner { } }; - // Use `from_encoded_slice` to reserve space for ts, so later we can append ts to - // the key or its clones without reallocation. + // Use `from_encoded_slice` to reserve space for ts, so later we can append ts + // to the key or its clones without reallocation. (Key::from_encoded_slice(res.0), res.1, res.2) }; @@ -188,7 +188,8 @@ impl BackwardKvScanner { &mut self.statistics, ); if has_write { - // Skip current_user_key because this key is either blocked or handled. + // Skip current_user_key because this key is either blocked or + // handled. 
has_write = false; self.move_write_cursor_to_prev_user_key(¤t_user_key)?; } @@ -218,9 +219,9 @@ impl BackwardKvScanner { } } - /// Attempt to get the value of a key specified by `user_key` and `self.cfg.ts` in reverse order. - /// This function requires that the write cursor is currently pointing to the earliest version - /// of `user_key`. + /// Attempt to get the value of a key specified by `user_key` and + /// `self.cfg.ts` in reverse order. This function requires that the write + /// cursor is currently pointing to the earliest version of `user_key`. #[inline] fn reverse_get( &mut self, @@ -232,8 +233,8 @@ impl BackwardKvScanner { // At first, we try to use several `prev()` to get the desired version. - // We need to save last desired version, because when we may move to an unwanted version - // at any time. + // We need to save last desired version, because when we may move to an unwanted + // version at any time. let mut last_version = None; let mut last_checked_commit_ts = TimeStamp::zero(); @@ -310,8 +311,8 @@ impl BackwardKvScanner { } assert!(ts > last_checked_commit_ts); - // After several `prev()`, we still not get the latest version for the specified ts, - // use seek to locate the latest version. + // After several `prev()`, we still not get the latest version for the specified + // ts, use seek to locate the latest version. // Check whether newer version exists. let mut use_near_seek = false; @@ -336,8 +337,8 @@ impl BackwardKvScanner { } } - // `user_key` must have reserved space here, so its clone `seek_key` has reserved space - // too. Thus no reallocation happens in `append_ts`. + // `user_key` must have reserved space here, so its clone `seek_key` has + // reserved space too. Thus no reallocation happens in `append_ts`. 
seek_key = seek_key.append_ts(ts); if use_near_seek { self.write_cursor @@ -349,9 +350,9 @@ impl BackwardKvScanner { assert!(self.write_cursor.valid()?); loop { - // After seek, or after some `next()`, we may reach `last_checked_commit_ts` again. It - // means we have checked all versions for this user key. We use `last_version` as - // return. + // After seek, or after some `next()`, we may reach `last_checked_commit_ts` + // again. It means we have checked all versions for this user key. + // We use `last_version` as return. let current_ts = { let current_key = self.write_cursor.key(&mut self.statistics.write); // We should never reach another user key. @@ -387,8 +388,8 @@ impl BackwardKvScanner { } } - /// Handle last version. Last version may be PUT or DELETE. If it is a PUT, value should be - /// load. + /// Handle last version. Last version may be PUT or DELETE. If it is a PUT, + /// value should be load. #[inline] fn handle_last_version( &mut self, @@ -410,8 +411,9 @@ impl BackwardKvScanner { } } - /// Load the value by the given `some_write`. If value is carried in `some_write`, it will be - /// returned directly. Otherwise there will be a default CF look up. + /// Load the value by the given `some_write`. If value is carried in + /// `some_write`, it will be returned directly. Otherwise there will be a + /// default CF look up. /// /// The implementation is similar to `PointGetter::load_data_by_write`. #[inline] @@ -438,13 +440,13 @@ impl BackwardKvScanner { } } - /// After `self.reverse_get()`, our write cursor may be pointing to current user key (if we - /// found a desired version), or previous user key (if there is no desired version), or - /// out of bound. + /// After `self.reverse_get()`, our write cursor may be pointing to current + /// user key (if we found a desired version), or previous user key (if there + /// is no desired version), or out of bound. 
/// - /// If it is pointing to current user key, we need to step it until we meet a new - /// key. We first try to `prev()` a few times. If still not reaching another user - /// key, we `seek_for_prev()`. + /// If it is pointing to current user key, we need to step it until we meet + /// a new key. We first try to `prev()` a few times. If still not reaching + /// another user key, we `seek_for_prev()`. #[inline] fn move_write_cursor_to_prev_user_key(&mut self, current_user_key: &Key) -> Result<()> { for i in 0..SEEK_BOUND { @@ -520,7 +522,8 @@ mod tests { must_commit(&engine, k, ts, ts); } - // Generate REVERSE_SEEK_BOUND / 2 Put and REVERSE_SEEK_BOUND / 2 + 1 Rollback for key [8]. + // Generate REVERSE_SEEK_BOUND / 2 Put and REVERSE_SEEK_BOUND / 2 + 1 Rollback + // for key [8]. let k = &[8_u8]; for ts in 0..=REVERSE_SEEK_BOUND { must_prewrite_put(&engine, k, &[ts as u8], k, ts); @@ -540,8 +543,8 @@ mod tests { } } - // Generate REVERSE_SEEK_BOUND / 2 Put, 1 Delete and REVERSE_SEEK_BOUND / 2 Rollback - // for key [7]. + // Generate REVERSE_SEEK_BOUND / 2 Put, 1 Delete and REVERSE_SEEK_BOUND / 2 + // Rollback for key [7]. let k = &[7_u8]; for ts in 0..REVERSE_SEEK_BOUND / 2 { must_prewrite_put(&engine, k, &[ts as u8], k, ts); @@ -796,8 +799,8 @@ mod tests { assert_eq!(statistics.processed_size, 0); } - /// Check whether everything works as usual when `BackwardKvScanner::reverse_get()` goes - /// out of bound. + /// Check whether everything works as usual when + /// `BackwardKvScanner::reverse_get()` goes out of bound. /// /// Case 1. prev out of bound, next_version is None. #[test] @@ -880,8 +883,8 @@ mod tests { assert_eq!(statistics.processed_size, 0); } - /// Check whether everything works as usual when `BackwardKvScanner::reverse_get()` goes - /// out of bound. + /// Check whether everything works as usual when + /// `BackwardKvScanner::reverse_get()` goes out of bound. /// /// Case 2. prev out of bound, next_version is Some. 
#[test] @@ -973,7 +976,8 @@ mod tests { } /// Check whether everything works as usual when - /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of bound. + /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of + /// bound. /// /// Case 1. prev() out of bound #[test] @@ -1054,7 +1058,8 @@ mod tests { } /// Check whether everything works as usual when - /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of bound. + /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of + /// bound. /// /// Case 2. seek_for_prev() out of bound #[test] @@ -1141,7 +1146,8 @@ mod tests { } /// Check whether everything works as usual when - /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of bound. + /// `BackwardKvScanner::move_write_cursor_to_prev_user_key()` goes out of + /// bound. /// /// Case 3. a more complicated case #[test] @@ -1167,7 +1173,8 @@ mod tests { .build() .unwrap(); - // The following illustration comments assume that SEEK_BOUND = 4, REVERSE_SEEK_BOUND = 6. + // The following illustration comments assume that SEEK_BOUND = 4, + // REVERSE_SEEK_BOUND = 6. // Initial position: 1 seek_to_last: // b_11 b_10 b_9 b_8 b_7 b_6 b_5 b_4 b_3 b_2 b_1 c_1 diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 1e5163dcd78..d2c5e8b6a1b 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -94,8 +94,8 @@ impl Cursors { // We have not found another user key for now, so we directly `seek()`. // After that, we must pointing to another key, or out of bound. - // `current_user_key` must have reserved space here, so its clone has reserved space too. - // So no reallocation happens in `append_ts`. + // `current_user_key` must have reserved space here, so its clone has reserved + // space too. So no reallocation happens in `append_ts`. 
self.write.internal_seek( ¤t_user_key.clone().append_ts(TimeStamp::zero()), &mut statistics.write, @@ -194,17 +194,17 @@ impl> ForwardScanner { loop { // `current_user_key` is `min(user_key(write_cursor), lock_cursor)`, indicating - // the encoded user key we are currently dealing with. It may not have a write, or - // may not have a lock. It is not a slice to avoid data being invalidated after - // cursor moving. + // the encoded user key we are currently dealing with. It may not have a write, + // or may not have a lock. It is not a slice to avoid data being invalidated + // after cursor moving. // - // `has_write` indicates whether `current_user_key` has at least one corresponding - // `write`. If there is one, it is what current write cursor pointing to. The pointed - // `write` must be the most recent (i.e. largest `commit_ts`) write of - // `current_user_key`. + // `has_write` indicates whether `current_user_key` has at least one + // corresponding `write`. If there is one, it is what current write cursor + // pointing to. The pointed `write` must be the most recent (i.e. largest + // `commit_ts`) write of `current_user_key`. // - // `has_lock` indicates whether `current_user_key` has a corresponding `lock`. If - // there is one, it is what current lock cursor pointing to. + // `has_lock` indicates whether `current_user_key` has a corresponding `lock`. + // If there is one, it is what current lock cursor pointing to. let (mut current_user_key, has_write, has_lock) = { let w_key = if self.cursors.write.valid()? { Some(self.cursors.write.key(&mut self.statistics.write)) @@ -261,8 +261,8 @@ impl> ForwardScanner { } }; - // Use `from_encoded_slice` to reserve space for ts, so later we can append ts to - // the key or its clones without reallocation. + // Use `from_encoded_slice` to reserve space for ts, so later we can append ts + // to the key or its clones without reallocation. 
(Key::from_encoded_slice(res.0), res.1, res.2) }; @@ -303,10 +303,10 @@ impl> ForwardScanner { } } - /// Try to move the write cursor to the `self.cfg.ts` version of the given key. - /// Because it is possible that the cursor is moved to the next user key or - /// the end of key space, the method returns whether the write cursor still - /// points to the given user key. + /// Try to move the write cursor to the `self.cfg.ts` version of the given + /// key. Because it is possible that the cursor is moved to the next user + /// key or the end of key space, the method returns whether the write cursor + /// still points to the given user key. fn move_write_cursor_to_ts(&mut self, user_key: &Key) -> Result { assert!(self.cursors.write.valid()?); @@ -339,7 +339,8 @@ impl> ForwardScanner { self.met_newer_ts_data = NewerTsCheckState::Met; } - // Report error if there's a more recent version if the isolation level is RcCheckTs. + // Report error if there's a more recent version if the isolation level is + // RcCheckTs. if self.cfg.isolation_level == IsolationLevel::RcCheckTs { // TODO: the more write recent version with `LOCK` or `ROLLBACK` write type // could be skipped. @@ -354,10 +355,11 @@ impl> ForwardScanner { } } } - // If we have not found `${user_key}_${ts}` in a few `next()`, directly `seek()`. + // If we have not found `${user_key}_${ts}` in a few `next()`, directly + // `seek()`. if needs_seek { - // `user_key` must have reserved space here, so its clone has reserved space too. So no - // reallocation happens in `append_ts`. + // `user_key` must have reserved space here, so its clone has reserved space + // too. So no reallocation happens in `append_ts`. self.cursors.write.seek( &user_key.clone().append_ts(self.cfg.ts), &mut self.statistics.write, @@ -536,8 +538,9 @@ impl ScanPolicy for LatestEntryPolicy { cursors: &mut Cursors, statistics: &mut Statistics, ) -> Result> { - // Now we must have reached the first key >= `${user_key}_${ts}`. 
However, we may - // meet `Lock` or `Rollback`. In this case, more versions needs to be looked up. + // Now we must have reached the first key >= `${user_key}_${ts}`. However, we + // may meet `Lock` or `Rollback`. In this case, more versions needs to be looked + // up. let mut write_key = cursors.write.key(&mut statistics.write); let entry: Option = loop { if Key::decode_ts_from(write_key)? <= self.after_ts { @@ -648,7 +651,8 @@ fn scan_latest_handle_lock( .map(|_| HandleRes::Skip(current_user_key)) } -/// The ScanPolicy for outputting `TxnEntry` for every locks or commits in specified ts range. +/// The ScanPolicy for outputting `TxnEntry` for every locks or commits in +/// specified ts range. /// /// The `ForwardScanner` with this policy scans all entries whose `commit_ts`s /// (or locks' `start_ts`s) in range (`from_ts`, `cfg.ts`]. @@ -745,8 +749,8 @@ impl ScanPolicy for DeltaEntryPolicy { let write_value = cursors.write.value(&mut statistics.write); let commit_ts = Key::decode_ts_from(cursors.write.key(&mut statistics.write))?; - // commit_ts > cfg.ts never happens since the ForwardScanner will skip those greater - // versions. + // commit_ts > cfg.ts never happens since the ForwardScanner will skip those + // greater versions. if commit_ts <= self.from_ts { cursors.move_write_cursor_to_next_user_key(¤t_user_key, statistics)?; @@ -755,8 +759,9 @@ impl ScanPolicy for DeltaEntryPolicy { let (write_type, start_ts, short_value) = { // DeltaEntryScanner only returns commit records between `from_ts` and `cfg.ts`. - // We can assume that it must ensure GC safepoint doesn't exceed `from_ts`, so GC - // fence checking can be skipped. But it's still needed when loading the old value. + // We can assume that it must ensure GC safepoint doesn't exceed `from_ts`, so + // GC fence checking can be skipped. But it's still needed when loading the old + // value. 
let write_ref = WriteRef::parse(write_value)?; ( write_ref.write_type, @@ -832,10 +837,11 @@ impl ScanPolicy for DeltaEntryPolicy { } } -/// This type can be used to scan keys starting from the given user key (greater than or equal). +/// This type can be used to scan keys starting from the given user key (greater +/// than or equal). /// -/// Internally, for each key, rollbacks are ignored and smaller version will be tried. If the -/// isolation level is SI, locks will be checked first. +/// Internally, for each key, rollbacks are ignored and smaller version will be +/// tried. If the isolation level is SI, locks will be checked first. /// /// Use `ScannerBuilder` to build `ForwardKvScanner`. pub type ForwardKvScanner = ForwardScanner; @@ -843,8 +849,8 @@ pub type ForwardKvScanner = ForwardScanner; /// This scanner is like `ForwardKvScanner` but outputs `TxnEntry`. pub type EntryScanner = ForwardScanner; -/// This scanner scans all entries whose commit_ts (or locks' start_ts) is in range -/// (from_ts, cfg.ts]. +/// This scanner scans all entries whose commit_ts (or locks' start_ts) is in +/// range (from_ts, cfg.ts]. pub type DeltaScanner = ForwardScanner; impl TxnEntryScanner for ForwardScanner @@ -1109,7 +1115,8 @@ mod latest_kv_tests { Scanner, }; - /// Check whether everything works as usual when `ForwardKvScanner::get()` goes out of bound. + /// Check whether everything works as usual when `ForwardKvScanner::get()` + /// goes out of bound. #[test] fn test_get_out_of_bound() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -1175,7 +1182,8 @@ mod latest_kv_tests { } /// Check whether everything works as usual when - /// `ForwardKvScanner::move_write_cursor_to_next_user_key()` goes out of bound. + /// `ForwardKvScanner::move_write_cursor_to_next_user_key()` goes out of + /// bound. /// /// Case 1. 
next() out of bound #[test] @@ -1232,7 +1240,7 @@ mod latest_kv_tests { // a_8 b_2 b_1 b_0 // ^cursor // We should be able to get wanted value without any operation. - // After get the value, use SEEK_BOUND / 2 + 1 next to reach next user key and stop: + // After get the value, use SEEK_BOUND/2+1 next to reach next user key and stop: // a_8 b_2 b_1 b_0 // ^cursor assert_eq!( @@ -1256,7 +1264,8 @@ mod latest_kv_tests { } /// Check whether everything works as usual when - /// `ForwardKvScanner::move_write_cursor_to_next_user_key()` goes out of bound. + /// `ForwardKvScanner::move_write_cursor_to_next_user_key()` goes out of + /// bound. /// /// Case 2. seek() out of bound #[test] @@ -1593,7 +1602,8 @@ mod latest_entry_tests { Engine, Modify, TestEngineBuilder, }; - /// Check whether everything works as usual when `EntryScanner::get()` goes out of bound. + /// Check whether everything works as usual when `EntryScanner::get()` goes + /// out of bound. #[test] fn test_get_out_of_bound() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -1721,7 +1731,7 @@ mod latest_entry_tests { // a_8 b_2 b_1 b_0 // ^cursor // We should be able to get wanted value without any operation. - // After get the value, use SEEK_BOUND / 2 + 1 next to reach next user key and stop: + // After get the value, use SEEK_BOUND/2+1 next to reach next user key and stop: // a_8 b_2 b_1 b_0 // ^cursor let entry = EntryBuilder::default() @@ -2024,7 +2034,8 @@ mod delta_entry_tests { use super::{super::ScannerBuilder, test_util::*, *}; use crate::storage::{mvcc::tests::write, txn::tests::*, Engine, Modify, TestEngineBuilder}; - /// Check whether everything works as usual when `Delta::get()` goes out of bound. + /// Check whether everything works as usual when `Delta::get()` goes out of + /// bound. 
#[test] fn test_get_out_of_bound() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -2151,7 +2162,7 @@ mod delta_entry_tests { // a_8 b_2 b_1 b_0 // ^cursor // We should be able to get wanted value without any operation. - // After get the value, use SEEK_BOUND / 2 + 1 next to reach next user key and stop: + // After get the value, use SEEK_BOUND/2+1 next to reach next user key and stop: // a_8 b_2 b_1 b_0 // ^cursor let entry = EntryBuilder::default() @@ -2189,8 +2200,8 @@ mod delta_entry_tests { must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND rollback and 1 put for [b] . - // It differs from EntryScanner that this will try to fetch multiple versions of each key. - // So in this test it needs one more next than EntryScanner. + // It differs from EntryScanner that this will try to fetch multiple versions of + // each key. So in this test it needs one more next than EntryScanner. for ts in 1..=SEEK_BOUND { let modifies = vec![ // ts is rather small, so it is ok to `as u8` @@ -2341,8 +2352,8 @@ mod delta_entry_tests { fn test_mess() { // TODO: non-pessimistic lock should be returned enven if its ts < from_ts. // (key, lock, [commit1, commit2, ...]) - // Values ends with 'L' will be made larger than `SHORT_VALUE_MAX_LEN` so it will be saved - // in default cf. + // Values ends with 'L' will be made larger than `SHORT_VALUE_MAX_LEN` so it + // will be saved in default cf. let test_data = vec![ ( b"a" as &[u8], @@ -2555,7 +2566,8 @@ mod delta_entry_tests { while let Some(entry) = scanner.next_entry().unwrap() { actual.push(entry); } - // Do assertions one by one so that if it fails it won't print too long panic message. + // Do assertions one by one so that if it fails it won't print too long panic + // message. 
for i in 0..std::cmp::max(actual.len(), expected.len()) { assert_eq!( actual[i], expected[i], @@ -2695,7 +2707,8 @@ mod delta_entry_tests { // Scanning entries in (10, max] should get all prewrites check(10, vec![&entry_a_5, &entry_b_15, &entry_c_5]); - // Scanning entries include delete in (7, max] should get a_5, b_10, b_15 and c_5 + // Scanning entries include delete in (7, max] should get a_5, b_10, b_15 and + // c_5 check(7, vec![&entry_a_5, &entry_b_15, &entry_b_10, &entry_c_5]); // Scanning entries in (0, max] should get a_1, a_3, a_5, b_2, b_10, and b_15 check( diff --git a/src/storage/mvcc/reader/scanner/mod.rs b/src/storage/mvcc/reader/scanner/mod.rs index a3f759191f0..21626d2b61c 100644 --- a/src/storage/mvcc/reader/scanner/mod.rs +++ b/src/storage/mvcc/reader/scanner/mod.rs @@ -42,8 +42,8 @@ impl ScannerBuilder { self } - /// Set whether values of the user key should be omitted. When `omit_value` is `true`, the - /// length of returned value will be 0. + /// Set whether values of the user key should be omitted. When `omit_value` + /// is `true`, the length of returned value will be 0. /// /// Previously this option is called `key_only`. /// @@ -75,8 +75,8 @@ impl ScannerBuilder { self } - /// Limit the range to `[lower_bound, upper_bound)` in which the `ForwardKvScanner` should scan. - /// `None` means unbounded. + /// Limit the range to `[lower_bound, upper_bound)` in which the + /// `ForwardKvScanner` should scan. `None` means unbounded. /// /// Default is `(None, None)`. #[inline] @@ -87,8 +87,8 @@ impl ScannerBuilder { self } - /// Set locks that the scanner can bypass. Locks with start_ts in the specified set will be - /// ignored during scanning. + /// Set locks that the scanner can bypass. Locks with start_ts in the + /// specified set will be ignored during scanning. /// /// Default is empty. #[inline] @@ -98,8 +98,8 @@ impl ScannerBuilder { self } - /// Set locks that the scanner can read through. 
Locks with start_ts in the specified set will be - /// accessed during scanning. + /// Set locks that the scanner can read through. Locks with start_ts in the + /// specified set will be accessed during scanning. /// /// Default is empty. #[inline] @@ -133,8 +133,8 @@ impl ScannerBuilder { self } - /// Check whether there is data with newer ts. The result of `met_newer_ts_data` is Unknown - /// if this option is not set. + /// Check whether there is data with newer ts. The result of + /// `met_newer_ts_data` is Unknown if this option is not set. /// /// Default is false. #[inline] @@ -237,8 +237,8 @@ impl StoreScanner for Scanner { } } - /// Returns whether data with newer ts is found. The result is meaningful only when - /// `check_has_newer_ts_data` is set to true. + /// Returns whether data with newer ts is found. The result is meaningful + /// only when `check_has_newer_ts_data` is set to true. fn met_newer_ts_data(&self) -> NewerTsCheckState { match self { Scanner::Forward(scanner) => scanner.met_newer_ts_data(), @@ -253,9 +253,10 @@ pub struct ScannerConfig { omit_value: bool, isolation_level: IsolationLevel, - /// `lower_bound` and `upper_bound` is used to create `default_cursor`. `upper_bound` - /// is used in initial seek(or `lower_bound` in initial backward seek) as well. They will be consumed after `default_cursor` is being - /// created. + /// `lower_bound` and `upper_bound` is used to create `default_cursor`. + /// `upper_bound` is used in initial seek(or `lower_bound` in initial + /// backward seek) as well. They will be consumed after `default_cursor` is + /// being created. lower_bound: Option, upper_bound: Option, // hint for we will only scan data with commit ts >= hint_min_ts @@ -306,7 +307,8 @@ impl ScannerConfig { self.create_cf_cursor_with_scan_mode(cf, self.scan_mode()) } - /// Create the cursor with specified scan_mode, instead of inferring scan_mode from the config. 
+ /// Create the cursor with specified scan_mode, instead of inferring + /// scan_mode from the config. #[inline] fn create_cf_cursor_with_scan_mode( &mut self, @@ -340,14 +342,15 @@ impl ScannerConfig { /// /// Internally, there will be a `near_seek` operation. /// -/// Notice that the value may be already carried in the `write` (short value). In this -/// case, you should not call this function. +/// Notice that the value may be already carried in the `write` (short value). +/// In this case, you should not call this function. /// /// # Panics /// /// Panics if there is a short value carried in the given `write`. /// -/// Panics if key in default CF does not exist. This means there is a data corruption. +/// Panics if key in default CF does not exist. This means there is a data +/// corruption. pub fn near_load_data_by_write( default_cursor: &mut Cursor, // TODO: make it `ForwardCursor`. user_key: &Key, @@ -429,14 +432,15 @@ pub fn has_data_in_range( } /// Seek for the next valid (write type == Put or Delete) write record. -/// The write cursor must indicate a data key of the user key of which ts <= after_ts. -/// Return None if cannot find any valid write record. +/// The write cursor must indicate a data key of the user key of which ts <= +/// after_ts. Return None if cannot find any valid write record. /// -/// GC fence will be checked against the specified `gc_fence_limit`. If `gc_fence_limit` is greater -/// than the `commit_ts` of the current write record pointed by the cursor, The caller must -/// guarantee that there are no other versions in range `(current_commit_ts, gc_fence_limit]`. Note -/// that if a record is determined as invalid by checking GC fence, the `write_cursor`'s position -/// will be left remain on it. +/// GC fence will be checked against the specified `gc_fence_limit`. 
If +/// `gc_fence_limit` is greater than the `commit_ts` of the current write record +/// pointed by the cursor, The caller must guarantee that there are no other +/// versions in range `(current_commit_ts, gc_fence_limit]`. Note that if a +/// record is determined as invalid by checking GC fence, the `write_cursor`'s +/// position will be left remain on it. pub fn seek_for_valid_write( write_cursor: &mut Cursor, user_key: &Key, @@ -477,18 +481,21 @@ where } /// Seek for the last written value. -/// The write cursor must indicate a data key of the user key of which ts <= after_ts. -/// Return None if cannot find any valid write record or found a delete record. +/// The write cursor must indicate a data key of the user key of which ts <= +/// after_ts. Return None if cannot find any valid write record or found a +/// delete record. /// -/// GC fence will be checked against the specified `gc_fence_limit`. If `gc_fence_limit` is greater -/// than the `commit_ts` of the current write record pointed by the cursor, The caller must -/// guarantee that there are no other versions in range `(current_commit_ts, gc_fence_limit]`. Note -/// that if a record is determined as invalid by checking GC fence, the `write_cursor`'s position -/// will be left remain on it. +/// GC fence will be checked against the specified `gc_fence_limit`. If +/// `gc_fence_limit` is greater than the `commit_ts` of the current write record +/// pointed by the cursor, The caller must guarantee that there are no other +/// versions in range `(current_commit_ts, gc_fence_limit]`. Note that if a +/// record is determined as invalid by checking GC fence, the `write_cursor`'s +/// position will be left remain on it. /// -/// `write_cursor` maybe created with an `TsFilter`, which can filter out some key-value pairs with -/// less `commit_ts` than `ts_filter`. So if the got value has a less timestamp than `ts_filter`, it -/// should be replaced by None because the real wanted value can have been filtered. 
+/// `write_cursor` maybe created with an `TsFilter`, which can filter out some +/// key-value pairs with less `commit_ts` than `ts_filter`. So if the got value +/// has a less timestamp than `ts_filter`, it should be replaced by None because +/// the real wanted value can have been filtered. pub fn seek_for_valid_value( write_cursor: &mut Cursor, default_cursor: &mut Cursor, @@ -570,8 +577,8 @@ pub(crate) fn load_data_by_lock( } LockType::Delete => Ok(None), LockType::Lock | LockType::Pessimistic => { - // Only when fails to call `Lock::check_ts_conflict()`, the function is called, so it's - // unreachable here. + // Only when fails to call `Lock::check_ts_conflict()`, the function is called, + // so it's unreachable here. unreachable!() } } @@ -592,8 +599,8 @@ mod tests { }, }; - // Collect data from the scanner and assert it equals to `expected`, which is a collection of - // (raw_key, value). + // Collect data from the scanner and assert it equals to `expected`, which is a + // collection of (raw_key, value). // `None` value in `expected` means the key is locked. 
fn check_scan_result( mut scanner: Scanner, @@ -842,15 +849,15 @@ mod tests { let access_locks = TsSet::from_u64s(vec![30, 40, 50, 60, 90]); let mut expected_result = vec![ - (vec![0], Some(vec![b'v', 0, 0])), /* access put if not delete_bound */ - (vec![1], Some(vec![b'v', 1, 1])), /* access put */ - /* vec![2] access delete */ - (vec![3], Some(vec![b'v', 3])), /* ignore LockType::Lock */ - (vec![4], None), /* locked */ - (vec![5], Some(vec![b'v', 5])), /* bypass */ - (vec![6], Some(vec![b'v', 6])), /* ignore lock with larger ts */ - (vec![7], Some(vec![b'v', 7])), /* no lock */ - (vec![8], Some(vec![b'v', 8, 8])), /* access put if not delete_bound*/ + (vec![0], Some(vec![b'v', 0, 0])), // access put if not delete_bound + (vec![1], Some(vec![b'v', 1, 1])), // access put + // vec![2] access delete + (vec![3], Some(vec![b'v', 3])), // ignore LockType::Lock + (vec![4], None), // locked + (vec![5], Some(vec![b'v', 5])), // bypass + (vec![6], Some(vec![b'v', 6])), // ignore lock with larger ts + (vec![7], Some(vec![b'v', 7])), // no lock + (vec![8], Some(vec![b'v', 8, 8])), // access put if not delete_bound ]; if desc { expected_result.reverse(); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index bf8add1abfd..a5343b234ac 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -28,8 +28,9 @@ impl GcInfo { } } -/// `ReleasedLock` contains the information of the lock released by `commit`, `rollback` and so on. -/// It's used by `LockManager` to wake up transactions waiting for locks. +/// `ReleasedLock` contains the information of the lock released by `commit`, +/// `rollback` and so on. It's used by `LockManager` to wake up transactions +/// waiting for locks. #[derive(Debug, PartialEq)] pub struct ReleasedLock { /// The hash value of the lock. 
@@ -52,8 +53,8 @@ pub struct MvccTxn { pub(crate) start_ts: TimeStamp, pub(crate) write_size: usize, pub(crate) modifies: Vec, - // When 1PC is enabled, locks will be collected here instead of marshalled and put into `writes`, - // so it can be further processed. The elements are tuples representing + // When 1PC is enabled, locks will be collected here instead of marshalled and put into + // `writes`, so it can be further processed. The elements are tuples representing // (key, lock, remove_pessimistic_lock) pub(crate) locks_for_1pc: Vec<(Key, Lock, bool)>, // `concurrency_manager` is used to set memory locks for prewritten keys. @@ -141,14 +142,15 @@ impl MvccTxn { self.modifies.push(write); } - /// Add the timestamp of the current rollback operation to another transaction's lock if - /// necessary. + /// Add the timestamp of the current rollback operation to another + /// transaction's lock if necessary. /// - /// When putting rollback record on a key that's locked by another transaction, the second - /// transaction may overwrite the current rollback record when it's committed. Sometimes it may - /// break consistency. To solve the problem, add the timestamp of the current rollback to the - /// lock. So when the lock is committed, it can check if it will overwrite a rollback record - /// by checking the information in the lock. + /// When putting rollback record on a key that's locked by another + /// transaction, the second transaction may overwrite the current rollback + /// record when it's committed. Sometimes it may break consistency. To solve + /// the problem, add the timestamp of the current rollback to the lock. So + /// when the lock is committed, it can check if it will overwrite a rollback + /// record by checking the information in the lock. 
pub(crate) fn mark_rollback_on_mismatching_lock( &mut self, key: &Key, @@ -158,18 +160,20 @@ impl MvccTxn { assert_ne!(lock.ts, self.start_ts); if !is_protected { - // A non-protected rollback record is ok to be overwritten, so do nothing in this case. + // A non-protected rollback record is ok to be overwritten, so do nothing in + // this case. return; } if self.start_ts < lock.min_commit_ts { - // The rollback will surely not be overwritten by committing the lock. Do nothing. + // The rollback will surely not be overwritten by committing the lock. Do + // nothing. return; } if !lock.use_async_commit { - // Currently only async commit may use calculated commit_ts. Do nothing if it's not a - // async commit transaction. + // Currently only async commit may use calculated commit_ts. Do nothing if it's + // not a async commit transaction. return; } @@ -563,8 +567,8 @@ pub(crate) mod tests { assert_eq!(w1r.set_overlapped_rollback(false, None), w1); let w2r = must_written(&engine, k2, 11, 20, WriteType::Put); - // Rollback is invoked on secondaries, so the rollback is not protected and overlapped_rollback - // won't be set. + // Rollback is invoked on secondaries, so the rollback is not protected and + // overlapped_rollback won't be set. assert_eq!(w2r, w2); } @@ -951,8 +955,8 @@ pub(crate) mod tests { let (k, v) = (b"k", b"v"); - // Pessimistic prewrite keeps the larger TTL of the prewrite request and the original - // pessimisitic lock. + // Pessimistic prewrite keeps the larger TTL of the prewrite request and the + // original pessimisitic lock. must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 10, 10, 100); must_pessimistic_locked(&engine, k, 10, 10); must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 10, 10, true, 110); @@ -960,8 +964,8 @@ pub(crate) mod tests { must_rollback(&engine, k, 10, false); - // TTL not changed if the pessimistic lock's TTL is larger than that provided in the - // prewrite request. 
+ // TTL not changed if the pessimistic lock's TTL is larger than that provided in + // the prewrite request. must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 20, 20, 100); must_pessimistic_locked(&engine, k, 20, 20); must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 20, 20, true, 90); @@ -1115,8 +1119,8 @@ pub(crate) mod tests { must_pessimistic_prewrite_put(&engine, k3, v3, k1, 10, 20, true); // Write a non-pessimistic lock with for_update_ts 20. must_pessimistic_prewrite_put(&engine, k2, v2, k1, 10, 20, false); - // Roll back the primary key due to timeout, but the non-pessimistic lock is not rolled - // back. + // Roll back the primary key due to timeout, but the non-pessimistic lock is not + // rolled back. must_rollback(&engine, k1, 10, false); // Txn-15 acquires pessimistic locks on k1. @@ -1188,7 +1192,8 @@ pub(crate) mod tests { #[test] fn test_async_prewrite_primary() { - // copy must_prewrite_put_impl, check that the key is written with the correct secondaries and the right timestamp + // copy must_prewrite_put_impl, check that the key is written with the correct + // secondaries and the right timestamp let engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); @@ -1239,7 +1244,8 @@ pub(crate) mod tests { // max_ts in the concurrency manager is 42, so the min_commit_ts is 43. assert_eq!(lock.min_commit_ts, TimeStamp::new(43)); - // A duplicate prewrite request should return the min_commit_ts in the primary key + // A duplicate prewrite request should return the min_commit_ts in the primary + // key assert_eq!(do_prewrite(), 43.into()); } @@ -1296,7 +1302,8 @@ pub(crate) mod tests { // max_ts in the concurrency manager is 42, so the min_commit_ts is 43. 
assert_eq!(lock.min_commit_ts, TimeStamp::new(43)); - // A duplicate prewrite request should return the min_commit_ts in the primary key + // A duplicate prewrite request should return the min_commit_ts in the primary + // key assert_eq!(do_pessimistic_prewrite(), 43.into()); } @@ -1345,8 +1352,8 @@ pub(crate) mod tests { must_unlocked(&engine, k); must_written(&engine, k, 10, 20, WriteType::Put); - // Optimistic transaction allows the start_ts equals to another transaction's commit_ts - // on the same key. + // Optimistic transaction allows the start_ts equals to another transaction's + // commit_ts on the same key. must_prewrite_put(&engine, k, v, k, 20); must_locked(&engine, k, 20); must_commit(&engine, k, 20, 30); @@ -1418,15 +1425,16 @@ pub(crate) mod tests { assert!(w.has_overlapped_rollback); assert!(w.gc_fence.is_none()); - // Do not commit with overlapped_rollback if the rollback ts doesn't equal to commit_ts. + // Do not commit with overlapped_rollback if the rollback ts doesn't equal to + // commit_ts. must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 40, 0); must_cleanup(&engine, k, 44, 0); must_commit(&engine, k, 40, 45); let w = must_written(&engine, k, 40, 45, WriteType::Put); assert!(!w.has_overlapped_rollback); - // Do not put rollback mark to the lock if the lock is not async commit or if lock.ts is - // before start_ts or min_commit_ts. + // Do not put rollback mark to the lock if the lock is not async commit or if + // lock.ts is before start_ts or min_commit_ts. must_prewrite_put(&engine, k, v, k, 50); must_cleanup(&engine, k, 55, 0); let l = must_locked(&engine, k, 50); diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 4ddfa68a757..59dd5e8f13d 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -151,8 +151,9 @@ impl RawMvccIterator { } // RawMvccIterator always return the latest ts of user key. 
-// ts is desc encoded after user key, so it's placed the first one for the same user key. -// Only one-way direction scan is supported. Like `seek` then `next` or `seek_for_prev` then `prev` +// ts is desc encoded after user key, so it's placed the first one for the same +// user key. Only one-way direction scan is supported. Like `seek` then `next` +// or `seek_for_prev` then `prev` impl Iterator for RawMvccIterator { fn next(&mut self) -> Result { if !self.is_forward { @@ -217,7 +218,8 @@ impl Iterator for RawMvccIterator { } fn key(&self) -> &[u8] { - // need map_or_else to lazy evaluate the default func, as it will abort when invalid. + // need map_or_else to lazy evaluate the default func, as it will abort when + // invalid. self.cur_key.as_deref().unwrap_or_else(|| self.inner.key()) } @@ -259,7 +261,8 @@ mod tests { let (tx, rx) = channel(); let ctx = Context::default(); - // TODO: Consider another way other than hard coding, to generate keys' prefix of test data. + // TODO: Consider another way other than hard coding, to generate keys' prefix + // of test data. let test_data = vec![ (b"r\0a".to_vec(), b"aa".to_vec(), 10), (b"r\0aa".to_vec(), b"aaa".to_vec(), 20), diff --git a/src/storage/raw/store.rs b/src/storage/raw/store.rs index 5caad0dfbb6..4d70c2bf5ff 100644 --- a/src/storage/raw/store.rs +++ b/src/storage/raw/store.rs @@ -21,7 +21,8 @@ use crate::{ const MAX_TIME_SLICE: Duration = Duration::from_millis(2); const MAX_BATCH_SIZE: usize = 1024; -// TODO: refactor to utilize generic type `KvFormat` and eliminate matching `api_version`. +// TODO: refactor to utilize generic type `KvFormat` and eliminate matching +// `api_version`. pub enum RawStore { V1(RawStoreInner), V1Ttl(RawStoreInner, ApiV1Ttl>), @@ -180,11 +181,11 @@ impl<'a, S: Snapshot, F: KvFormat> RawStoreInner { }) } - /// Scan raw keys in [`start_key`, `end_key`), returns at most `limit` keys. If `end_key` is - /// `None`, it means unbounded. 
+ /// Scan raw keys in [`start_key`, `end_key`), returns at most `limit` keys. + /// If `end_key` is `None`, it means unbounded. /// - /// If `key_only` is true, the value corresponding to the key will not be read. Only scanned - /// keys will be returned. + /// If `key_only` is true, the value corresponding to the key will not be + /// read. Only scanned keys will be returned. pub async fn forward_raw_scan( &'a self, cf: CfName, @@ -231,11 +232,12 @@ impl<'a, S: Snapshot, F: KvFormat> RawStoreInner { Ok(pairs) } - /// Scan raw keys in [`end_key`, `start_key`) in reverse order, returns at most `limit` keys. If - /// `start_key` is `None`, it means it's unbounded. + /// Scan raw keys in [`end_key`, `start_key`) in reverse order, returns at + /// most `limit` keys. If `start_key` is `None`, it means it's unbounded. /// /// If `key_only` is true, the value - /// corresponding to the key will not be read out. Only scanned keys will be returned. + /// corresponding to the key will not be read out. Only scanned keys will be + /// returned. pub async fn reverse_raw_scan( &'a self, cf: CfName, diff --git a/src/storage/read_pool.rs b/src/storage/read_pool.rs index f93497b2905..c25ae15d46b 100644 --- a/src/storage/read_pool.rs +++ b/src/storage/read_pool.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -//! Distinct thread pools to handle read commands having different priority levels. +//! Distinct thread pools to handle read commands having different priority +//! levels. use std::sync::{Arc, Mutex}; @@ -26,7 +27,8 @@ impl PoolTicker for FuturePoolTicker { } } -/// Build respective thread pools to handle read commands of different priority levels. +/// Build respective thread pools to handle read commands of different priority +/// levels. 
pub fn build_read_pool( config: &StorageReadPoolConfig, reporter: R, diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 9cca49c9323..792ed8fcb9a 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -12,15 +12,18 @@ use crate::storage::{ Snapshot, }; -/// Acquires pessimistic lock on a single key. Optionally reads the previous value by the way. +/// Acquires pessimistic lock on a single key. Optionally reads the previous +/// value by the way. /// -/// When `need_value` is set, the first return value will be the previous value of the key (possibly -/// `None`). When `need_value` is not set but `need_check_existence` is set, the first return value -/// will be an empty value (`Some(vec![])`) if the key exists before or `None` if not. If neither -/// `need_value` nor `need_check_existence` is set, the first return value is always `None`. +/// When `need_value` is set, the first return value will be the previous value +/// of the key (possibly `None`). When `need_value` is not set but +/// `need_check_existence` is set, the first return value will be an empty value +/// (`Some(vec![])`) if the key exists before or `None` if not. If neither +/// `need_value` nor `need_check_existence` is set, the first return value is +/// always `None`. /// -/// The second return value will also contains the previous value of the key if `need_old_value` is -/// set, or `OldValue::Unspecified` otherwise. +/// The second return value will also contains the previous value of the key if +/// `need_old_value` is set, or `OldValue::Unspecified` otherwise. 
pub fn acquire_pessimistic_lock( txn: &mut MvccTxn, reader: &mut SnapshotReader, @@ -38,14 +41,16 @@ pub fn acquire_pessimistic_lock( crate::storage::mvcc::txn::make_txn_error(err, &key, reader.start_ts).into() )); - // Update max_ts for Insert operation to guarante linearizability and snapshot isolation + // Update max_ts for Insert operation to guarantee linearizability and snapshot + // isolation if should_not_exist { txn.concurrency_manager.update_max_ts(for_update_ts); } - // When `need_value` is set, the value need to be loaded of course. If `need_check_existence` - // and `need_old_value` are both set, we also load the value even if `need_value` is false, - // so that it avoids `load_old_value` doing repeated work. + // When `need_value` is set, the value need to be loaded of course. If + // `need_check_existence` and `need_old_value` are both set, we also load + // the value even if `need_value` is false, so that it avoids + // `load_old_value` doing repeated work. let need_load_value = need_value || (need_check_existence && need_old_value); fn load_old_value( @@ -72,7 +77,8 @@ pub fn acquire_pessimistic_lock( } } - /// Returns proper result according to the loaded value (if any) the specified settings. + /// Returns proper result according to the loaded value (if any) the + /// specified settings. #[inline] fn ret_val(need_value: bool, need_check_existence: bool, val: Option) -> Option { if need_value { @@ -160,8 +166,8 @@ pub fn acquire_pessimistic_lock( } // Handle rollback. - // The rollback information may come from either a Rollback record or a record with - // `has_overlapped_rollback` flag. + // The rollback information may come from either a Rollback record or a record + // with `has_overlapped_rollback` flag. 
if commit_ts == reader.start_ts && (write.write_type == WriteType::Rollback || write.has_overlapped_rollback) { @@ -172,7 +178,8 @@ pub fn acquire_pessimistic_lock( } .into()); } - // If `commit_ts` we seek is already before `start_ts`, the rollback must not exist. + // If `commit_ts` we seek is already before `start_ts`, the rollback must not + // exist. if commit_ts > reader.start_ts { if let Some((older_commit_ts, older_write)) = reader.seek_write(&key, reader.start_ts)? @@ -480,8 +487,8 @@ pub mod tests { let k = b"k1"; let v = b"v1"; - // TODO: Some corner cases don't give proper results. Although they are not important, we - // should consider whether they are better to be fixed. + // TODO: Some corner cases don't give proper results. Although they are not + // important, we should consider whether they are better to be fixed. // Normal must_succeed(&engine, k, k, 1, 1); @@ -630,8 +637,9 @@ pub mod tests { must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, true); must_locked(&engine, k, 35); - // Commit pessimistic transaction's key but with smaller commit_ts than for_update_ts. - // Currently not checked, so in this case it will actually be successfully committed. + // Commit pessimistic transaction's key but with smaller commit_ts than + // for_update_ts. Currently not checked, so in this case it will + // actually be successfully committed. must_commit(&engine, k, 35, 36); must_unlocked(&engine, k); must_get_commit_ts(&engine, k, 35, 36); @@ -661,17 +669,18 @@ pub mod tests { must_commit(&engine, k, 46, 50); must_unlocked(&engine, k); - // Prewrite on non-pessimistic key meets write with larger commit_ts than current - // for_update_ts (non-pessimistic data conflict). - // Normally non-pessimistic keys in pessimistic transactions are used when we are sure that - // there won't be conflicts. So this case is also not checked, and prewrite will succeeed. 
+ // Prewrite on non-pessimistic key meets write with larger commit_ts than + // current for_update_ts (non-pessimistic data conflict). + // Normally non-pessimistic keys in pessimistic transactions are used when we + // are sure that there won't be conflicts. So this case is also not checked, and + // prewrite will succeeed. must_pessimistic_prewrite_put(&engine, k, v, k, 47, 48, false); must_locked(&engine, k, 47); must_cleanup(&engine, k, 47, 0); must_unlocked(&engine, k); - // The rollback of the primary key in a pessimistic transaction should be protected from - // being collapsed. + // The rollback of the primary key in a pessimistic transaction should be + // protected from being collapsed. must_succeed(&engine, k, k, 49, 60); must_pessimistic_prewrite_put(&engine, k, v, k, 49, 60, true); must_locked(&engine, k, 49); @@ -681,8 +690,9 @@ pub mod tests { must_rollback(&engine, k, 51, false); must_err(&engine, k, k, 49, 60); - // Overlapped rollback record will be written when the current start_ts equals to another write - // records' commit ts. Now there is a commit record with commit_ts = 50. + // Overlapped rollback record will be written when the current start_ts equals + // to another write records' commit ts. Now there is a commit record with + // commit_ts = 50. must_succeed(&engine, k, k, 50, 61); must_pessimistic_prewrite_put(&engine, k, v, k, 50, 61, true); must_locked(&engine, k, 50); @@ -846,9 +856,9 @@ pub mod tests { // PUT, LOCK, READ // `----------^ - // Note that this case is special because usually the `LOCK` is the first write already got - // during prewrite/acquire_pessimistic_lock and will continue searching an older version - // from the `LOCK` record. + // Note that this case is special because usually the `LOCK` is the first write + // already got during prewrite/acquire_pessimistic_lock and will continue + // searching an older version from the `LOCK` record. 
must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); must_commit(&engine, b"k7", 16, 30); must_prewrite_lock(&engine, b"k7", b"k7", 37); @@ -1072,7 +1082,8 @@ pub mod tests { must_pessimistic_prewrite_put(&engine, key, value, key, 3, 3, true); must_commit(&engine, key, 3, 5); - // T2: start_ts = 15, acquire pessimistic lock on k, with should_not_exist flag set. + // T2: start_ts = 15, acquire pessimistic lock on k, with should_not_exist flag + // set. let snapshot = engine.snapshot(Default::default()).unwrap(); let min_commit_ts = TimeStamp::zero(); let cm = ConcurrencyManager::new(min_commit_ts); @@ -1100,12 +1111,14 @@ pub mod tests { assert_eq!(cm.max_ts().into_inner(), 15); - // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on k + // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on + // k must_succeed(&engine, key, key, 8, 8); must_pessimistic_prewrite_delete(&engine, key, key, 8, 8, true); must_commit(&engine, key, 8, cm.max_ts().into_inner() + 1); - // T1: start_ts = 10, repeatedly acquire pessimistic lock on k, with should_not_exist flag set + // T1: start_ts = 10, repeatedly acquire pessimistic lock on k, with + // should_not_exist flag set let snapshot = engine.snapshot(Default::default()).unwrap(); let start_ts = TimeStamp::new(10); let for_update_ts = TimeStamp::new(10); @@ -1157,9 +1170,10 @@ pub mod tests { // k5: GC fence invalid must_prewrite_put(&engine, b"k5", b"v5", b"k5", 5); must_commit(&engine, b"k5", 5, 6); - // A invalid gc fence is assumed never pointing to a ts greater than GC safepoint, and - // a read operation's ts is assumed never less than the GC safepoint. Therefore since we - // will read at ts=10 later, we can't put a version greater than 10 in this case. + // A invalid gc fence is assumed never pointing to a ts greater than GC + // safepoint, and a read operation's ts is assumed never less than the + // GC safepoint. 
Therefore since we will read at ts=10 later, we can't + // put a version greater than 10 in this case. must_cleanup_with_gc_fence(&engine, b"k5", 6, 0, 8, true); for &need_value in &[false, true] { diff --git a/src/storage/txn/actions/check_data_constraint.rs b/src/storage/txn/actions/check_data_constraint.rs index 3b28d3e4214..35999ee6cb2 100644 --- a/src/storage/txn/actions/check_data_constraint.rs +++ b/src/storage/txn/actions/check_data_constraint.rs @@ -10,7 +10,8 @@ use crate::storage::{ /// Checks the existence of the key according to `should_not_exist`. /// If not, returns an `AlreadyExist` error. -/// The caller must guarantee that the given `write` is the latest version of the key. +/// The caller must guarantee that the given `write` is the latest version of +/// the key. pub(crate) fn check_data_constraint( reader: &mut SnapshotReader, should_not_exist: bool, @@ -18,8 +19,8 @@ pub(crate) fn check_data_constraint( write_commit_ts: TimeStamp, key: &Key, ) -> MvccResult<()> { - // Here we assume `write` is the latest version of the key. So it should not contain a - // GC fence ts. Otherwise, it must be an already-deleted version. + // Here we assume `write` is the latest version of the key. So it should not + // contain a GC fence ts. Otherwise, it must be an already-deleted version. let write_is_invalid = matches!(write.gc_fence, Some(gc_fence_ts) if !gc_fence_ts.is_zero()); if !should_not_exist || write.write_type == WriteType::Delete || write_is_invalid { @@ -28,7 +29,8 @@ pub(crate) fn check_data_constraint( // The current key exists under any of the following conditions: // 1.The current write type is `PUT` - // 2.The current write type is `Rollback` or `Lock`, and the key have an older version. + // 2.The current write type is `Rollback` or `Lock`, and the key have an older + // version. if write.write_type == WriteType::Put || reader.key_exist(key, write_commit_ts.prev())? { return Err(ErrorInner::AlreadyExist { key: key.to_raw()? 
}.into()); } diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 295124fde37..2f3a2c84b11 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -11,8 +11,9 @@ use crate::storage::{ Snapshot, TxnStatus, }; -// Check whether there's an overlapped write record, and then perform rollback. The actual behavior -// to do the rollback differs according to whether there's an overlapped write record. +// Check whether there's an overlapped write record, and then perform rollback. +// The actual behavior to do the rollback differs according to whether there's +// an overlapped write record. pub fn check_txn_status_lock_exists( txn: &mut MvccTxn, reader: &mut SnapshotReader, @@ -23,8 +24,9 @@ pub fn check_txn_status_lock_exists( force_sync_commit: bool, resolving_pessimistic_lock: bool, ) -> Result<(TxnStatus, Option)> { - // Never rollback or push forward min_commit_ts in check_txn_status if it's using async commit. - // Rollback of async-commit locks are done during ResolveLock. + // Never rollback or push forward min_commit_ts in check_txn_status if it's + // using async commit. Rollback of async-commit locks are done during + // ResolveLock. if lock.use_async_commit { if force_sync_commit { info!( @@ -40,8 +42,8 @@ pub fn check_txn_status_lock_exists( let is_pessimistic_txn = !lock.for_update_ts.is_zero(); if lock.ts.physical() + lock.ttl < current_ts.physical() { // If the lock is expired, clean it up. - // If the resolving and primary key lock are both pessimistic locks, just unlock the - // primary pessimistic lock and do not write rollback records. + // If the resolving and primary key lock are both pessimistic locks, just unlock + // the primary pessimistic lock and do not write rollback records. 
return if resolving_pessimistic_lock && lock.lock_type == LockType::Pessimistic { let released = txn.unlock_key(primary_key, is_pessimistic_txn); MVCC_CHECK_TXN_STATUS_COUNTER_VEC.pessimistic_rollback.inc(); @@ -54,9 +56,9 @@ pub fn check_txn_status_lock_exists( }; } - // If lock.min_commit_ts is 0, it's not a large transaction and we can't push forward - // its min_commit_ts otherwise the transaction can't be committed by old version TiDB - // during rolling update. + // If lock.min_commit_ts is 0, it's not a large transaction and we can't push + // forward its min_commit_ts otherwise the transaction can't be committed by + // old version TiDB during rolling update. if !lock.min_commit_ts.is_zero() && !caller_start_ts.is_max() // Push forward the min_commit_ts so that reading won't be blocked by locks. @@ -72,8 +74,9 @@ pub fn check_txn_status_lock_exists( MVCC_CHECK_TXN_STATUS_COUNTER_VEC.update_ts.inc(); } - // As long as the primary lock's min_commit_ts > caller_start_ts, locks belong to the same transaction - // can't block reading. Return MinCommitTsPushed result to the client to let it bypass locks. + // As long as the primary lock's min_commit_ts > caller_start_ts, locks belong + // to the same transaction can't block reading. Return MinCommitTsPushed + // result to the client to let it bypass locks. let min_commit_ts_pushed = (!caller_start_ts.is_zero() && lock.min_commit_ts > caller_start_ts) // If the caller_start_ts is max, it's a point get in the autocommit transaction. // We don't push forward lock's min_commit_ts and the point get can ignore the lock @@ -157,7 +160,8 @@ pub fn rollback_lock( _ => return Ok(txn.unlock_key(key, is_pessimistic_txn)), }; - // If prewrite type is DEL or LOCK or PESSIMISTIC, it is no need to delete value. + // If prewrite type is DEL or LOCK or PESSIMISTIC, it is no need to delete + // value. 
if lock.short_value.is_none() && lock.lock_type == LockType::Put { txn.delete_value(key.clone(), lock.ts); } @@ -188,8 +192,8 @@ pub fn collapse_prev_rollback( Ok(()) } -/// Generate the Write record that should be written that means to perform a specified rollback -/// operation. +/// Generate the Write record that should be written that means to perform a +/// specified rollback operation. pub fn make_rollback( start_ts: TimeStamp, protected: bool, diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index be8dc60a768..461b8e2d432 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -12,12 +12,13 @@ use crate::storage::{ Snapshot, TxnStatus, }; -/// Cleanup the lock if it's TTL has expired, comparing with `current_ts`. If `current_ts` is 0, -/// cleanup the lock without checking TTL. If the lock is the primary lock of a pessimistic -/// transaction, the rollback record is protected from being collapsed. +/// Cleanup the lock if it's TTL has expired, comparing with `current_ts`. If +/// `current_ts` is 0, cleanup the lock without checking TTL. If the lock is the +/// primary lock of a pessimistic transaction, the rollback record is protected +/// from being collapsed. /// -/// Returns the released lock. Returns error if the key is locked or has already been -/// committed. +/// Returns the released lock. Returns error if the key is locked or has already +/// been committed. pub fn cleanup( txn: &mut MvccTxn, reader: &mut SnapshotReader, @@ -193,8 +194,8 @@ pub mod tests { #[test] fn test_cleanup() { - // Cleanup's logic is mostly similar to rollback, except the TTL check. Tests that not - // related to TTL check should be covered by other test cases. + // Cleanup's logic is mostly similar to rollback, except the TTL check. Tests + // that not related to TTL check should be covered by other test cases. let engine = TestEngineBuilder::new().build().unwrap(); // Shorthand for composing ts. 
@@ -214,8 +215,8 @@ pub mod tests { // Try to cleanup another transaction's lock. Does nothing. must_succeed(&engine, k, ts(10, 1), ts(120, 0)); - // If there is no exisiting lock when cleanup, it may be a pessimistic transaction, - // so the rollback should be protected. + // If there is no existing lock when cleanup, it may be a pessimistic + // transaction, so the rollback should be protected. must_get_rollback_protected(&engine, k, ts(10, 1), true); must_locked(&engine, k, ts(10, 0)); diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index 028241155ec..8435479991e 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -41,11 +41,11 @@ pub fn commit( .into()); } - // It's an abnormal routine since pessimistic locks shouldn't be committed in our - // transaction model. But a pessimistic lock will be left if the pessimistic - // rollback request fails to send and the transaction need not to acquire - // this lock again(due to WriteConflict). If the transaction is committed, we - // should commit this pessimistic lock too. + // It's an abnormal routine since pessimistic locks shouldn't be committed in + // our transaction model. But a pessimistic lock will be left if the pessimistic + // rollback request fails to send and the transaction need not to acquire this + // lock again(due to WriteConflict). If the transaction is committed, we should + // commit this pessimistic lock too. if lock.lock_type == LockType::Pessimistic { warn!( "commit a pessimistic lock with Lock type"; @@ -254,7 +254,8 @@ pub mod tests { ); must_succeed(&engine, k, ts(30, 0), ts(50, 0)); - // If the min_commit_ts of the pessimistic lock is greater than prewrite's, use it. + // If the min_commit_ts of the pessimistic lock is greater than prewrite's, use + // it. 
must_acquire_pessimistic_lock_for_large_txn(&engine, k, k, ts(60, 0), ts(60, 0), 100); check_txn_status::tests::must_success( &engine, diff --git a/src/storage/txn/actions/mod.rs b/src/storage/txn/actions/mod.rs index 518afb5a449..58c27721f56 100644 --- a/src/storage/txn/actions/mod.rs +++ b/src/storage/txn/actions/mod.rs @@ -1,7 +1,8 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -//! This file contains the "actions" we perform on a [`crate::storage::mvcc::MvccTxn`] and related -//! tests. "Actions" here means a group of more basic operations, eg. +//! This file contains the "actions" we perform on a +//! [`crate::storage::mvcc::MvccTxn`] and related tests. "Actions" here means a +//! group of more basic operations, eg. //! [`crate::storage::mvcc::MvccReader::load_lock`], //! [`crate::storage::mvcc::MvccTxn::put_write`], which are methods on //! [`crate::storage::mvcc::MvccTxn`], for archiving a certain target. diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index a96c5eabc8d..e7ca85c8137 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -33,7 +33,8 @@ pub fn prewrite( let mut mutation = PrewriteMutation::from_mutation(mutation, secondary_keys, is_pessimistic_lock, txn_props)?; - // Update max_ts for Insert operation to guarante linearizability and snapshot isolation + // Update max_ts for Insert operation to guarantee linearizability and snapshot + // isolation if mutation.should_not_exist { txn.concurrency_manager.update_max_ts(txn_props.start_ts); } @@ -76,12 +77,13 @@ pub fn prewrite( }; // Check assertion if necessary. There are couple of different cases: - // * If the write is already loaded, then assertion can be checked without introducing too much - // performance overhead. So do assertion in this case. - // * If `amend_pessimistic_lock` has happened, assertion can be done during amending. Skip it. 
- // * If constraint check is skipped thus `prev_write` is not loaded, doing assertion here - // introduces too much overhead. However, we'll do it anyway if `assertion_level` is set to - // `Strict` level. + // * If the write is already loaded, then assertion can be checked without + // introducing too much performance overhead. So do assertion in this case. + // * If `amend_pessimistic_lock` has happened, assertion can be done during + // amending. Skip it. + // * If constraint check is skipped thus `prev_write` is not loaded, doing + // assertion here introduces too much overhead. However, we'll do it anyway if + // `assertion_level` is set to `Strict` level. // Assertion level will be checked within the `check_assertion` function. if !lock_amended { let (reloaded_prev_write, reloaded) = @@ -95,11 +97,13 @@ pub fn prewrite( let prev_write = prev_write.map(|(w, _)| w); if mutation.should_not_write { - // `checkNotExists` is equivalent to a get operation, so it should update the max_ts. + // `checkNotExists` is equivalent to a get operation, so it should update the + // max_ts. txn.concurrency_manager.update_max_ts(txn_props.start_ts); let min_commit_ts = if mutation.need_min_commit_ts() { - // Don't calculate the min_commit_ts according to the concurrency manager's max_ts - // for a should_not_write mutation because it's not persisted and doesn't change data. + // Don't calculate the min_commit_ts according to the concurrency manager's + // max_ts for a should_not_write mutation because it's not persisted and doesn't + // change data. cmp::max(txn_props.min_commit_ts, txn_props.start_ts.next()) } else { TimeStamp::zero() @@ -273,10 +277,11 @@ impl<'a> PrewriteMutation<'a> { }) } - // Pessimistic transactions only acquire pessimistic locks on row keys and unique index keys. - // The corresponding secondary index keys are not locked until pessimistic prewrite. 
- // It's possible that lock conflict occurs on them, but the isolation is - // guaranteed by pessimistic locks, so let TiDB resolves these locks immediately. + // Pessimistic transactions only acquire pessimistic locks on row keys and + // unique index keys. The corresponding secondary index keys are not locked + // until pessimistic prewrite. It's possible that lock conflict occurs on + // them, but the isolation is guaranteed by pessimistic locks, so let TiDB + // resolves these locks immediately. fn lock_info(&self, lock: Lock) -> Result { let mut info = lock.into_lock_info(self.key.to_raw()?); if self.txn_props.is_pessimistic() { @@ -343,8 +348,8 @@ impl<'a> PrewriteMutation<'a> { match reader.seek_write(&self.key, TimeStamp::max())? { Some((commit_ts, write)) => { // Abort on writes after our start/for_update timestamp ... - // If exists a commit version whose commit timestamp is larger than current start/for_update - // timestamp, we should abort current prewrite. + // If exists a commit version whose commit timestamp is larger than current + // start/for_update timestamp, we should abort current prewrite. match self.txn_props.kind { TransactionKind::Optimistic(_) => { if commit_ts > self.txn_props.start_ts { @@ -380,8 +385,8 @@ impl<'a> PrewriteMutation<'a> { // TODO: Maybe we need to add a new error for the rolled back case. self.write_conflict_error(&write, commit_ts)?; } - // Should check it when no lock exists, otherwise it can report error when there is - // a lock belonging to a committed transaction which deletes the key. + // Should check it when no lock exists, otherwise it can report error when there + // is a lock belonging to a committed transaction which deletes the key. 
check_data_constraint(reader, self.should_not_exist, &write, commit_ts, &self.key)?; Ok(Some((write, commit_ts))) @@ -491,12 +496,13 @@ impl<'a> PrewriteMutation<'a> { |(w, _)| matches!(w.gc_fence, Some(gc_fence_ts) if !gc_fence_ts.is_zero()), ) { - // The previously-loaded write record has an invalid gc_fence. Regard it as none. + // The previously-loaded write record has an invalid gc_fence. Regard it as + // none. write = &None; } - // Load the most recent version if prev write is not loaded yet, or the prev write is not - // a data version (`Put` or `Delete`) + // Load the most recent version if prev write is not loaded yet, or the prev + // write is not a data version (`Put` or `Delete`) let need_reload = !write_loaded || write.as_ref().map_or(false, |(w, _)| { w.write_type != WriteType::Put && w.write_type != WriteType::Delete @@ -533,7 +539,8 @@ impl<'a> PrewriteMutation<'a> { _ => Ok(()), }; - // Assertion error can be caused by a rollback. So make up a constraint check if the check was skipped before. + // Assertion error can be caused by a rollback. So make up a constraint check if + // the check was skipped before. if assertion_err.is_err() { if self.skip_constraint_check() { self.check_for_newer_version(reader)?; @@ -583,8 +590,8 @@ impl<'a> PrewriteMutation<'a> { } } -// The final_min_commit_ts will be calculated if either async commit or 1PC is enabled. -// It's allowed to enable 1PC without enabling async commit. +// The final_min_commit_ts will be calculated if either async commit or 1PC is +// enabled. It's allowed to enable 1PC without enabling async commit. fn async_commit_timestamps( key: &Key, lock: &mut Lock, @@ -642,7 +649,8 @@ fn async_commit_timestamps( } // TiKV may fails to write pessimistic locks due to pipelined process. -// If the data is not changed after acquiring the lock, we can still prewrite the key. +// If the data is not changed after acquiring the lock, we can still prewrite +// the key. 
fn amend_pessimistic_lock( mutation: &PrewriteMutation<'_>, reader: &mut SnapshotReader, @@ -652,11 +660,14 @@ fn amend_pessimistic_lock( // The invariants of pessimistic locks are: // 1. lock's for_update_ts >= key's latest commit_ts // 2. lock's for_update_ts >= txn's start_ts - // 3. If the data is changed after acquiring the pessimistic lock, key's new commit_ts > lock's for_update_ts + // 3. If the data is changed after acquiring the pessimistic lock, key's new + // commit_ts > lock's for_update_ts // - // So, if the key's latest commit_ts is still less than or equal to lock's for_update_ts, the data is not changed. - // However, we can't get lock's for_update_ts in current implementation (txn's for_update_ts is updated for each DML), - // we can only use txn's start_ts to check -- If the key's commit_ts is less than txn's start_ts, it's less than + // So, if the key's latest commit_ts is still less than or equal to lock's + // for_update_ts, the data is not changed. However, we can't get lock's + // for_update_ts in current implementation (txn's for_update_ts is updated for + // each DML), we can only use txn's start_ts to check -- If the key's + // commit_ts is less than txn's start_ts, it's less than // lock's for_update_ts too. if *commit_ts >= reader.start_ts { warn!( @@ -676,7 +687,8 @@ fn amend_pessimistic_lock( } } // Used pipelined pessimistic lock acquiring in this txn but failed - // Luckily no other txn modified this lock, amend it by treat it as optimistic txn. + // Luckily no other txn modified this lock, amend it by treat it as optimistic + // txn. MVCC_CONFLICT_COUNTER .pipelined_acquire_pessimistic_lock_amend_success .inc(); @@ -858,8 +870,9 @@ pub mod tests { let cm = ConcurrencyManager::new(41.into()); let snapshot = engine.snapshot(Default::default()).unwrap(); - // should_not_write mutations don't write locks or change data so that they needn't ask - // the concurrency manager for max_ts. 
Its min_commit_ts may be less than or equal to max_ts. + // should_not_write mutations don't write locks or change data so that they + // needn't ask the concurrency manager for max_ts. Its min_commit_ts may + // be less than or equal to max_ts. let mut props = optimistic_async_props(b"k0", 10.into(), 50.into(), 2, false); props.min_commit_ts = 11.into(); let mut txn = MvccTxn::new(10.into(), cm.clone()); @@ -878,7 +891,8 @@ pub mod tests { assert!(min_ts < 41.into()); assert_eq!(old_value, OldValue::Unspecified); - // `checkNotExists` is equivalent to a get operation, so it should update the max_ts. + // `checkNotExists` is equivalent to a get operation, so it should update the + // max_ts. let mut props = optimistic_txn_props(b"k0", 42.into()); props.min_commit_ts = 43.into(); let mut txn = MvccTxn::new(42.into(), cm.clone()); @@ -1220,9 +1234,9 @@ pub mod tests { // PUT, LOCK, READ // `----------^ - // Note that this case is special because usually the `LOCK` is the first write already got - // during prewrite/acquire_pessimistic_lock and will continue searching an older version - // from the `LOCK` record. + // Note that this case is special because usually the `LOCK` is the first write + // already got during prewrite/acquire_pessimistic_lock and will continue + // searching an older version from the `LOCK` record. must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); must_commit(&engine, b"k7", 16, 30); must_prewrite_lock(&engine, b"k7", b"k7", 37); @@ -1373,9 +1387,9 @@ pub mod tests { must_commit(&engine, b"k1", 10, 20); must_commit(&engine, b"k2", 10, 20); - // This is a re-sent prewrite. It should report a PessimisticLockNotFound. In production, the caller - // will need to check if the current transaction is already committed before, in order to - // provide the idempotency. + // This is a re-sent prewrite. It should report a PessimisticLockNotFound. 
In + // production, the caller will need to check if the current transaction is + // already committed before, in order to provide the idempotency. let err = must_retry_pessimistic_prewrite_put_err( &engine, b"k2", @@ -1405,8 +1419,8 @@ pub mod tests { must_commit(&engine, b"k1", 35, 40); must_commit(&engine, b"k2", 35, 40); - // A retrying non-pessimistic-lock prewrite request should not skip constraint checks. - // It reports a PessimisticLockNotFound. + // A retrying non-pessimistic-lock prewrite request should not skip constraint + // checks. It reports a PessimisticLockNotFound. let err = must_retry_pessimistic_prewrite_put_err( &engine, b"k2", @@ -1434,7 +1448,8 @@ pub mod tests { must_unlocked(&engine, b"k2"); // Committing still does nothing. must_commit(&engine, b"k2", 10, 25); - // Try a different txn start ts (which haven't been successfully committed before). + // Try a different txn start ts (which haven't been successfully committed + // before). let err = must_retry_pessimistic_prewrite_put_err( &engine, b"k2", b"v2", b"k1", &None, 11, 11, false, 0, ); @@ -1443,7 +1458,8 @@ pub mod tests { Error(box ErrorInner::PessimisticLockNotFound { .. }) )); must_unlocked(&engine, b"k2"); - // However conflict still won't be checked if there's a non-retry request arriving. + // However conflict still won't be checked if there's a non-retry request + // arriving. must_prewrite_put_impl( &engine, b"k2", @@ -1464,8 +1480,9 @@ pub mod tests { must_locked(&engine, b"k2", 12); must_rollback(&engine, b"k2", 12, false); - // And conflict check is according to the for_update_ts for pessimistic prewrite. - // So, it will not report error if for_update_ts is large enough. + // And conflict check is according to the for_update_ts for pessimistic + // prewrite. So, it will not report error if for_update_ts is large + // enough. 
must_prewrite_put_impl( &engine, b"k2", @@ -1896,8 +1913,8 @@ pub mod tests { must_rollback(&engine, &k1, 30, true); must_rollback(&engine, &k3, 30, true); - // Pessimistic transaction assertion fail on fast/strict level if assertion happens - // during amending pessimistic lock. + // Pessimistic transaction assertion fail on fast/strict level if assertion + // happens during amending pessimistic lock. let pass = assertion_level == AssertionLevel::Off; prewrite_put( &k2, @@ -1922,7 +1939,8 @@ pub mod tests { must_rollback(&engine, &k2, 30, true); must_rollback(&engine, &k4, 30, true); - // Pessimistic transaction fail on strict level no matter whether `is_pessimistic_lock`. + // Pessimistic transaction fail on strict level no matter whether + // `is_pessimistic_lock`. let pass = assertion_level != AssertionLevel::Strict; prewrite_put( &k1, @@ -1990,8 +2008,8 @@ pub mod tests { must_cleanup_with_gc_fence(&engine, k, 5, 0, 7, true); }; - // Test multiple cases without recreating the engine. So use a increasing key prefix to - // avoid each case interfering each other. + // Test multiple cases without recreating the engine. So use a increasing key + // prefix to avoid each case interfering each other. let mut key_prefix = b'a'; let mut test_all_levels = |prepare| { diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index d49d759f3a5..1db991f70eb 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -89,8 +89,9 @@ impl WriteCommand for AcquirePessimisticLock let mut res = if self.return_values { Ok(PessimisticLockRes::Values(vec![])) } else if self.check_existence { - // If return_value is set, the existence status is implicitly included in the result. - // So check_existence only need to be explicitly handled if `return_values` is not set. 
+ // If return_value is set, the existence status is implicitly included in the + // result. So check_existence only need to be explicitly handled if + // `return_values` is not set. Ok(PessimisticLockRes::Existence(vec![])) } else { Ok(PessimisticLockRes::Empty) diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index c27e8dc1bc0..7f6f4879a3d 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -56,8 +56,8 @@ enum SecondaryLockStatus { impl WriteCommand for CheckSecondaryLocks { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { - // It is not allowed for commit to overwrite a protected rollback. So we update max_ts - // to prevent this case from happening. + // It is not allowed for commit to overwrite a protected rollback. So we update + // max_ts to prevent this case from happening. context.concurrency_manager.update_max_ts(self.start_ts); let mut txn = MvccTxn::new(self.start_ts, context.concurrency_manager); @@ -83,8 +83,8 @@ impl WriteCommand for CheckSecondaryLocks { (SecondaryLockStatus::Locked(lock), false, None) } } - // Searches the write CF for the commit record of the lock and returns the commit timestamp - // (0 if the lock is not committed). + // Searches the write CF for the commit record of the lock and returns the commit + // timestamp (0 if the lock is not committed). l => { mismatch_lock = l; match reader.get_txn_commit_record(&key)? { @@ -96,9 +96,9 @@ impl WriteCommand for CheckSecondaryLocks { }; // We needn't write a rollback once there is a write record for it: // If it's a committed record, it cannot be changed. - // If it's a rollback record, it either comes from another check_secondary_lock - // (thus protected) or the client stops commit actively. So we don't need - // to make it protected again. 
+ // If it's a rollback record, it either comes from another + // check_secondary_lock (thus protected) or the client stops commit + // actively. So we don't need to make it protected again. (status, false, None) } TxnCommitRecord::OverlappedRollback { .. } => { diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 5ec0ae5c503..7fd4a45ff8a 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -64,12 +64,13 @@ impl CommandExt for CheckTxnStatus { } impl WriteCommand for CheckTxnStatus { - /// checks whether a transaction has expired its primary lock's TTL, rollback the - /// transaction if expired, or update the transaction's min_commit_ts according to the metadata - /// in the primary lock. - /// When transaction T1 meets T2's lock, it may invoke this on T2's primary key. In this - /// situation, `self.start_ts` is T2's `start_ts`, `caller_start_ts` is T1's `start_ts`, and - /// the `current_ts` is literally the timestamp when this function is invoked; it may not be + /// checks whether a transaction has expired its primary lock's TTL, + /// rollback the transaction if expired, or update the transaction's + /// min_commit_ts according to the metadata in the primary lock. + /// When transaction T1 meets T2's lock, it may invoke this on T2's primary + /// key. In this situation, `self.start_ts` is T2's `start_ts`, + /// `caller_start_ts` is T1's `start_ts`, and the `current_ts` is + /// literally the timestamp when this function is invoked; it may not be /// accurate. fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { let mut new_max_ts = self.lock_ts; @@ -122,7 +123,8 @@ impl WriteCommand for CheckTxnStatus { let mut released_locks = ReleasedLocks::new(self.lock_ts, TimeStamp::zero()); released_locks.push(released); - // The lock is released here only when the `check_txn_status` returns `TtlExpire`. 
+ // The lock is released here only when the `check_txn_status` returns + // `TtlExpire`. if let TxnStatus::TtlExpire = txn_status { released_locks.wake_up(context.lock_mgr); } @@ -477,7 +479,8 @@ pub mod tests { must_unlocked(&engine, b"k2"); must_get_rollback_protected(&engine, b"k2", 15, true); - // case 3: pessimistic transaction with two keys (large txn), secondary is prewritten first + // case 3: pessimistic transaction with two keys (large txn), secondary is + // prewritten first must_acquire_pessimistic_lock_for_large_txn(&engine, b"k3", b"k3", 20, 20, 100); must_acquire_pessimistic_lock_for_large_txn(&engine, b"k4", b"k3", 20, 25, 100); must_pessimistic_prewrite_put_async_commit( @@ -491,7 +494,8 @@ pub mod tests { true, 28, ); - // the client must call check_txn_status with caller_start_ts == current_ts == 0, should not push + // the client must call check_txn_status with caller_start_ts == current_ts == + // 0, should not push must_success( &engine, b"k3", @@ -504,7 +508,8 @@ pub mod tests { uncommitted(100, 21, false), ); - // case 4: pessimistic transaction with two keys (not large txn), secondary is prewritten first + // case 4: pessimistic transaction with two keys (not large txn), secondary is + // prewritten first must_acquire_pessimistic_lock_with_ttl(&engine, b"k5", b"k5", 30, 30, 100); must_acquire_pessimistic_lock_with_ttl(&engine, b"k6", b"k5", 30, 35, 100); must_pessimistic_prewrite_put_async_commit( @@ -518,7 +523,8 @@ pub mod tests { true, 36, ); - // the client must call check_txn_status with caller_start_ts == current_ts == 0, should not push + // the client must call check_txn_status with caller_start_ts == current_ts == + // 0, should not push must_success( &engine, b"k5", @@ -569,8 +575,8 @@ pub mod tests { // The initial min_commit_ts is start_ts + 1. 
must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(5, 1), false); - // CheckTxnStatus with caller_start_ts = 0 and current_ts = 0 should just return the - // information of the lock without changing it. + // CheckTxnStatus with caller_start_ts = 0 and current_ts = 0 should just return + // the information of the lock without changing it. must_success( &engine, k, @@ -613,8 +619,8 @@ pub mod tests { must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(9, 1), false); // caller_start_ts < lock.min_commit_ts < current_ts - // When caller_start_ts < lock.min_commit_ts, no need to update it, but pushed should be - // true. + // When caller_start_ts < lock.min_commit_ts, no need to update it, but pushed + // should be true. must_success( &engine, k, @@ -642,7 +648,8 @@ pub mod tests { ); must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(11, 1), false); - // For same caller_start_ts and current_ts, update min_commit_ts to caller_start_ts + 1 + // For same caller_start_ts and current_ts, update min_commit_ts to + // caller_start_ts + 1 must_success( &engine, k, @@ -689,7 +696,8 @@ pub mod tests { must_prewrite_put_for_large_txn(&engine, k, v, k, ts(20, 0), 100, 0); - // Check a committed transaction when there is another lock. Expect getting the commit ts. + // Check a committed transaction when there is another lock. Expect getting the + // commit ts. must_success( &engine, k, @@ -702,8 +710,8 @@ pub mod tests { committed(ts(15, 0)), ); - // Check a not existing transaction, the result depends on whether `rollback_if_not_exist` - // is set. + // Check a not existing transaction, the result depends on whether + // `rollback_if_not_exist` is set. if r { must_success( &engine, @@ -729,8 +737,8 @@ pub mod tests { must_err(&engine, k, ts(6, 0), ts(12, 0), ts(12, 0), r, false, false); } - // TTL check is based on physical time (in ms). When logical time's difference is larger - // than TTL, the lock won't be resolved. + // TTL check is based on physical time (in ms). 
When logical time's difference + // is larger than TTL, the lock won't be resolved. must_success( &engine, k, @@ -936,8 +944,10 @@ pub mod tests { 100, TimeStamp::zero(), 1, - /* min_commit_ts */ TimeStamp::zero(), - /* max_commit_ts */ TimeStamp::zero(), + // min_commit_ts + TimeStamp::zero(), + // max_commit_ts + TimeStamp::zero(), false, kvproto::kvrpcpb::Assertion::None, kvproto::kvrpcpb::AssertionLevel::Off, @@ -958,7 +968,8 @@ pub mod tests { must_prewrite_put_for_large_txn(&engine, k, v, k, ts(310, 0), 100, 0); must_large_txn_locked(&engine, k, ts(310, 0), 100, ts(310, 1), false); - // Don't push forward the min_commit_ts if caller_start_ts is max, but pushed should be true. + // Don't push forward the min_commit_ts if caller_start_ts is max, but pushed + // should be true. must_success( &engine, k, @@ -998,7 +1009,8 @@ pub mod tests { let ts = TimeStamp::compose; // Check with resolving_pessimistic_lock flag. - // Path: there is no commit or rollback record, no rollback record should be written. + // Path: there is no commit or rollback record, no rollback record should be + // written. must_success( &engine, k, @@ -1031,8 +1043,9 @@ pub mod tests { uncommitted(10, TimeStamp::zero(), false), ); - // Path: the pessimistic primary key lock does exist, and it's expired, the primary lock will - // be pessimistically rolled back but there will not be a rollback record. + // Path: the pessimistic primary key lock does exist, and it's expired, the + // primary lock will be pessimistically rolled back but there will not + // be a rollback record. 
must_success( &engine, k, @@ -1060,8 +1073,10 @@ pub mod tests { 10, TimeStamp::zero(), 1, - /* min_commit_ts */ TimeStamp::zero(), - /* max_commit_ts */ TimeStamp::zero(), + // min_commit_ts + TimeStamp::zero(), + // max_commit_ts + TimeStamp::zero(), false, kvproto::kvrpcpb::Assertion::None, kvproto::kvrpcpb::AssertionLevel::Off, @@ -1078,8 +1093,9 @@ pub mod tests { uncommitted(10, TimeStamp::zero(), false), ); - // Path: the prewrite primary key expired and the solving key is a pessimistic lock, - // rollback record should be written and the transaction status is certain. + // Path: the prewrite primary key expired and the solving key is a pessimistic + // lock, rollback record should be written and the transaction status is + // certain. must_success( &engine, k, @@ -1094,8 +1110,9 @@ pub mod tests { must_unlocked(&engine, k); must_get_rollback_ts(&engine, k, ts(30, 0)); - // Path: the resolving_pessimistic_lock is false and the primary key lock is pessimistic - // lock, the transaction is in commit phase and the rollback record should be written. + // Path: the resolving_pessimistic_lock is false and the primary key lock is + // pessimistic lock, the transaction is in commit phase and the rollback + // record should be written. 
must_acquire_pessimistic_lock_with_ttl(&engine, k, k, ts(50, 0), ts(50, 0), 10); must_pessimistic_locked(&engine, k, ts(50, 0), ts(50, 0)); must_success( @@ -1106,7 +1123,8 @@ pub mod tests { ts(61, 0), true, false, - /* resolving_pessimistic_lock */ false, + // resolving_pessimistic_lock + false, |s| s == TtlExpire, ); must_unlocked(&engine, k); diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 62c0aaa98c1..c810c749bd6 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -46,8 +46,8 @@ impl CommandExt for Cleanup { impl WriteCommand for Cleanup { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { - // It is not allowed for commit to overwrite a protected rollback. So we update max_ts - // to prevent this case from happening. + // It is not allowed for commit to overwrite a protected rollback. So we update + // max_ts to prevent this case from happening. context.concurrency_manager.update_max_ts(self.start_ts); let mut txn = MvccTxn::new(self.start_ts, context.concurrency_manager); diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 3d3b62ea156..161db528c19 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -116,9 +116,10 @@ mod tests { test_kv_format_impl!(test_cas_basic_impl); } - /// Note: for API V2, TestEngine don't support MVCC reading, so `pre_propose` observer is ignored, - /// and no timestamp will be append to key. - /// The full test of `RawCompareAndSwap` is in `src/storage/mod.rs`. + /// Note: for API V2, TestEngine don't support MVCC reading, so + /// `pre_propose` observer is ignored, and no timestamp will be append + /// to key. The full test of `RawCompareAndSwap` is in + /// `src/storage/mod.rs`. 
fn test_cas_basic_impl() { let engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); diff --git a/src/storage/txn/commands/macros.rs b/src/storage/txn/commands/macros.rs index c505714f2a4..c57e7bcb5fb 100644 --- a/src/storage/txn/commands/macros.rs +++ b/src/storage/txn/commands/macros.rs @@ -23,8 +23,8 @@ macro_rules! ctx { /// value of `cmd` and which accepts one parameter whose type name matches /// the value of `cmd`. /// cmd_ty -> The type of the result of executing this command. -/// display -> Information needed to implement the `Display` trait for the command. -/// content -> The fields of the struct definition for the command. +/// display -> Information needed to implement the `Display` trait for the +/// command. content -> The fields of the struct definition for the command. macro_rules! command { ( $(#[$outer_doc: meta])* @@ -148,12 +148,12 @@ macro_rules! request_type { } macro_rules! write_bytes { - ($field: ident) => { + ($field:ident) => { fn write_bytes(&self) -> usize { self.$field.as_encoded().len() } }; - ($field: ident: multiple) => { + ($field:ident : multiple) => { fn write_bytes(&self) -> usize { self.$field.iter().map(|x| x.as_encoded().len()).sum() } @@ -166,17 +166,17 @@ macro_rules! 
gen_lock { crate::storage::txn::latch::Lock::new::<(), _>(vec![]) } }; - ($field: ident) => { + ($field:ident) => { fn gen_lock(&self) -> crate::storage::txn::latch::Lock { crate::storage::txn::latch::Lock::new(std::iter::once(&self.$field)) } }; - ($field: ident: multiple) => { + ($field:ident : multiple) => { fn gen_lock(&self) -> crate::storage::txn::latch::Lock { crate::storage::txn::latch::Lock::new(&self.$field) } }; - ($field: ident: multiple$transform: tt) => { + ($field:ident : multiple $transform:tt) => { fn gen_lock(&self) -> crate::storage::txn::latch::Lock { #![allow(unused_parens)] let keys = self.$field.iter().map($transform); diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 5cd94b172ff..7f748c352f7 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -70,7 +70,8 @@ use crate::storage::{ /// Learn more about our transaction system at /// [Deep Dive TiKV: Distributed Transactions](https://tikv.org/docs/deep-dive/distributed-transaction/introduction/) /// -/// These are typically scheduled and used through the [`Storage`](crate::storage::Storage) with functions like +/// These are typically scheduled and used through the +/// [`Storage`](crate::storage::Storage) with functions like /// [`prewrite`](prewrite::Prewrite) trait and are executed asynchronously. pub enum Command { Prewrite(Prewrite), @@ -95,22 +96,23 @@ pub enum Command { /// A `Command` with its return type, reified as the generic parameter `T`. /// -/// Incoming grpc requests (like `CommitRequest`, `PrewriteRequest`) are converted to -/// this type via a series of transformations. That process is described below using -/// `CommitRequest` as an example: -/// 1. A `CommitRequest` is handled by the `future_commit` method in kv.rs, where it -/// needs to be transformed to a `TypedCommand` before being passed to the -/// `storage.sched_txn_command` method. -/// 2. 
The `From` impl for `TypedCommand` gets chosen, and its generic -/// parameter indicates that the result type for this instance of `TypedCommand` is -/// going to be `TxnStatus` - one of the variants of the `StorageCallback` enum. -/// 3. In the above `from` method, the details of the commit request are captured by -/// creating an instance of the struct `storage::txn::commands::commit::Command` -/// via its `new` method. -/// 4. This struct is wrapped in a variant of the enum `storage::txn::commands::Command`. -/// This enum exists to facilitate generic operations over different commands. -/// 5. Finally, the `Command` enum variant for `Commit` is converted to the `TypedCommand` -/// using the `From` impl for `TypedCommand`. +/// Incoming grpc requests (like `CommitRequest`, `PrewriteRequest`) are +/// converted to this type via a series of transformations. That process is +/// described below using `CommitRequest` as an example: +/// 1. A `CommitRequest` is handled by the `future_commit` method in kv.rs, +/// where it needs to be transformed to a `TypedCommand` before being passed to +/// the `storage.sched_txn_command` method. +/// 2. The `From` impl for `TypedCommand` gets chosen, and its +/// generic parameter indicates that the result type for this instance of +/// `TypedCommand` is going to be `TxnStatus` - one of the variants of the +/// `StorageCallback` enum. 3. In the above `from` method, the details of the +/// commit request are captured by creating an instance of the struct +/// `storage::txn::commands::commit::Command` via its `new` method. +/// 4. This struct is wrapped in a variant of the enum +/// `storage::txn::commands::Command`. This enum exists to facilitate generic +/// operations over different commands. 5. Finally, the `Command` enum variant +/// for `Commit` is converted to the `TypedCommand` using the `From` +/// impl for `TypedCommand`. 
/// /// For other requests, see the corresponding `future_` method, the `From` trait /// implementation and so on. @@ -350,16 +352,18 @@ pub(super) struct ReleasedLocks { pessimistic: bool, } -/// Represents for a scheduler command, when should the response sent to the client. -/// For most cases, the response should be sent after the result being successfully applied to -/// the storage (if needed). But in some special cases, some optimizations allows the response to be -/// returned at an earlier phase. +/// Represents for a scheduler command, when should the response sent to the +/// client. For most cases, the response should be sent after the result being +/// successfully applied to the storage (if needed). But in some special cases, +/// some optimizations allows the response to be returned at an earlier phase. /// -/// Note that this doesn't affect latch releasing. The latch and the memory lock (if any) are always -/// released after applying, regardless of when the response is sent. +/// Note that this doesn't affect latch releasing. The latch and the memory lock +/// (if any) are always released after applying, regardless of when the response +/// is sent. #[derive(Clone, Copy, Debug, PartialEq)] pub enum ResponsePolicy { - /// Return the response to the client when the command has finished applying. + /// Return the response to the client when the command has finished + /// applying. OnApplied, /// Return the response after finishing Raft committing. OnCommitted, @@ -695,12 +699,14 @@ impl Debug for Command { } } -/// Commands that do not need to modify the database during execution will implement this trait. +/// Commands that do not need to modify the database during execution will +/// implement this trait. pub trait ReadCommand: CommandExt { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result; } -/// Commands that need to modify the database during execution will implement this trait. 
+/// Commands that need to modify the database during execution will implement +/// this trait. pub trait WriteCommand: CommandExt { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result; } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index bcafed8b0e6..010238426ee 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -45,7 +45,8 @@ impl CommandExt for PessimisticRollback { } impl WriteCommand for PessimisticRollback { - /// Delete any pessimistic lock with small for_update_ts belongs to this transaction. + /// Delete any pessimistic lock with small for_update_ts belongs to this + /// transaction. fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { let mut txn = MvccTxn::new(self.start_ts, context.concurrency_manager); let mut reader = ReaderWithStats::new( diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index dd9e451e883..a6aa8af6f87 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1,10 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -//! Functionality for handling optimistic and pessimistic prewrites. These are separate commands -//! (although maybe they shouldn't be since there is only one protobuf), but -//! handling of the commands is similar. We therefore have a single type (Prewriter) to handle both -//! kinds of prewrite. +//! Functionality for handling optimistic and pessimistic prewrites. These are +//! separate commands (although maybe they shouldn't be since there is only one +//! protobuf), but handling of the commands is similar. We therefore have a +//! single type (Prewriter) to handle both kinds of prewrite. 
use std::mem; @@ -410,7 +410,8 @@ impl WriteCommand for PrewritePessimistic { } } -/// Handles both kinds of prewrite (K statically indicates either optimistic or pessimistic). +/// Handles both kinds of prewrite (K statically indicates either optimistic or +/// pessimistic). struct Prewriter { kind: K, mutations: Vec, @@ -444,7 +445,8 @@ impl Prewriter { SnapshotReader::new_with_ctx(self.start_ts, snapshot, &self.ctx), context.statistics, ); - // Set extra op here for getting the write record when check write conflict in prewrite. + // Set extra op here for getting the write record when check write conflict in + // prewrite. let rows = self.mutations.len(); let res = self.prewrite(&mut txn, &mut reader, context.extra_op); @@ -460,9 +462,10 @@ impl Prewriter { )) } - // Async commit requires the max timestamp in the concurrency manager to be up-to-date. - // If it is possibly stale due to leader transfer or region merge, return an error. - // TODO: Fallback to non-async commit if not synced instead of returning an error. + // Async commit requires the max timestamp in the concurrency manager to be + // up-to-date. If it is possibly stale due to leader transfer or region + // merge, return an error. TODO: Fallback to non-async commit if not synced + // instead of returning an error. fn check_max_ts_synced(&self, snapshot: &impl Snapshot) -> Result<()> { if (self.secondary_keys.is_some() || self.try_one_pc) && !snapshot.ext().is_max_ts_synced() { @@ -476,9 +479,10 @@ impl Prewriter { } } - /// The core part of the prewrite action. In the abstract, this method iterates over the mutations - /// in the prewrite and prewrites each one. It keeps track of any locks encountered and (if it's - /// an async commit transaction) the min_commit_ts, these are returned by the method. + /// The core part of the prewrite action. In the abstract, this method + /// iterates over the mutations in the prewrite and prewrites each one. 
+ /// It keeps track of any locks encountered and (if it's an async commit + /// transaction) the min_commit_ts, these are returned by the method. fn prewrite( &mut self, txn: &mut MvccTxn, @@ -710,10 +714,11 @@ impl Prewriter { } } -/// Encapsulates things which must be done differently for optimistic or pessimistic transactions. +/// Encapsulates things which must be done differently for optimistic or +/// pessimistic transactions. trait PrewriteKind { - /// The type of mutation and, optionally, its extra information, differing for the - /// optimistic and pessimistic transaction. + /// The type of mutation and, optionally, its extra information, differing + /// for the optimistic and pessimistic transaction. type Mutation: MutationLock; fn txn_kind(&self) -> TransactionKind; @@ -783,8 +788,8 @@ impl PrewriteKind for Pessimistic { } } -/// The type of mutation and, optionally, its extra information, differing for the -/// optimistic and pessimistic transaction. +/// The type of mutation and, optionally, its extra information, differing for +/// the optimistic and pessimistic transaction. /// For optimistic txns, this is `Mutation`. /// For pessimistic txns, this is `(Mutation, bool)`, where the bool indicates /// whether the mutation takes a pessimistic lock or not. @@ -845,7 +850,8 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { txn.start_ts, lock.short_value, ); - // Transactions committed with 1PC should be impossible to overwrite rollback records. + // Transactions committed with 1PC should be impossible to overwrite rollback + // records. txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); if delete_pessimistic_lock { released_locks.push(txn.unlock_key(key, true)); @@ -1044,8 +1050,8 @@ mod tests { .unwrap(); // Rollback to make tombstones in lock-cf. 
rollback(&engine, &mut statistic, keys, 100).unwrap(); - // Gc rollback flags store in write-cf to make sure the next prewrite operation will skip - // seek write cf. + // Gc rollback flags store in write-cf to make sure the next prewrite operation + // will skip seek write cf. gc_by_compact(&engine, pri_key, 101); set_perf_level(PerfLevel::EnableTimeExceptForMutex); let perf = ReadPerfInstant::new(); @@ -1132,9 +1138,9 @@ mod tests { ) .unwrap(); - // Test a 1PC request should not be partially written when encounters error on the halfway. - // If some of the keys are successfully written as committed state, the atomicity will be - // broken. + // Test a 1PC request should not be partially written when encounters error on + // the halfway. If some of the keys are successfully written as committed state, + // the atomicity will be broken. let (k1, v1) = (b"k1", b"v1"); let (k2, v2) = (b"k2", b"v2"); // Lock k2. @@ -1248,9 +1254,9 @@ mod tests { must_rollback(&engine, k1, 20, true); - // Test a 1PC request should not be partially written when encounters error on the halfway. - // If some of the keys are successfully written as committed state, the atomicity will be - // broken. + // Test a 1PC request should not be partially written when encounters error on + // the halfway. If some of the keys are successfully written as committed state, + // the atomicity will be broken. // Lock k2 with a optimistic lock. let mut statistics = Statistics::default(); @@ -1473,7 +1479,7 @@ mod tests { } macro_rules! assert_max_ts_err { - ($e: expr) => { + ($e:expr) => { match $e { Err(Error(box ErrorInner::MaxTimestampNotSynced { .. 
})) => {} _ => panic!("Should have returned an error"), @@ -1676,11 +1682,12 @@ mod tests { assert_eq!(cm.max_ts().into_inner(), 15); - // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on k + // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on + // k must_prewrite_delete(&engine, key, key, 8); must_commit(&engine, key, 8, cm.max_ts().into_inner() + 1); - // T1: start_ts = 10, reapeatly prewrite on k, with should_not_exist flag set + // T1: start_ts = 10, repeatedly prewrite on k, with should_not_exist flag set let res = prewrite_with_cm( &engine, cm, @@ -2019,8 +2026,8 @@ mod tests { must_commit(&engine, b"k1", 35, 40); must_commit(&engine, b"k2", 35, 40); - // A retrying non-pessimistic-lock prewrite request should not skip constraint checks. - // Here it should take no effect, even there's already a newer version + // A retrying non-pessimistic-lock prewrite request should not skip constraint + // checks. Here it should take no effect, even there's already a newer version // after it. (No matter if it's async commit). prewrite_with_retry_flag(b"k2", b"v2", b"k1", Some(vec![]), 10, false, true).unwrap(); must_unlocked(&engine, b"k2"); @@ -2029,8 +2036,8 @@ mod tests { must_unlocked(&engine, b"k2"); // Committing still does nothing. must_commit(&engine, b"k2", 10, 25); - // Try a different txn start ts (which haven't been successfully committed before). - // It should report a PessimisticLockNotFound. + // Try a different txn start ts (which haven't been successfully committed + // before). It should report a PessimisticLockNotFound. let err = prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 11, false, true).unwrap_err(); assert!(matches!( err, @@ -2039,7 +2046,8 @@ mod tests { ))) )); must_unlocked(&engine, b"k2"); - // However conflict still won't be checked if there's a non-retry request arriving. + // However conflict still won't be checked if there's a non-retry request + // arriving. 
prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, false, false).unwrap(); must_locked(&engine, b"k2", 10); } @@ -2108,8 +2116,8 @@ mod tests { fn test_assertion_fail_on_conflicting_index_key() { let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - // Simulate two transactions that tries to insert the same row with a secondary index, and - // the second one canceled the first one (by rolling back its lock). + // Simulate two transactions that tries to insert the same row with a secondary + // index, and the second one canceled the first one (by rolling back its lock). let t1_start_ts = TimeStamp::compose(1, 0); let t2_start_ts = TimeStamp::compose(2, 0); @@ -2222,8 +2230,8 @@ mod tests { ))) )); - // If the two keys are sent in different requests, it would be the client's duty to ignore - // the assertion error. + // If the two keys are sent in different requests, it would be the client's duty + // to ignore the assertion error. let err = must_prewrite_put_err_impl( &engine, b"row", diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index 6638fe5cffd..1d2bfbf49d8 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -100,9 +100,9 @@ impl WriteCommand for ResolveLock { false, )? } else if commit_ts > current_lock.ts { - // Continue to resolve locks if the not found committed locks are pessimistic type. - // They could be left if the transaction is finally committed and pessimistic conflict - // retry happens during execution. + // Continue to resolve locks if the not found committed locks are pessimistic + // type. They could be left if the transaction is finally committed and + // pessimistic conflict retry happens during execution. match commit(&mut txn, &mut reader, current_key.clone(), commit_ts) { Ok(res) => res, Err(MvccError(box MvccErrorInner::TxnLockNotFound { .. 
})) @@ -160,6 +160,7 @@ impl WriteCommand for ResolveLock { } } -// To resolve a key, the write size is about 100~150 bytes, depending on key and value length. -// The write batch will be around 32KB if we scan 256 keys each time. +// To resolve a key, the write size is about 100~150 bytes, depending on key and +// value length. The write batch will be around 32KB if we scan 256 keys each +// time. pub const RESOLVE_LOCK_BATCH_SIZE: usize = 256; diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index f69d4a107fc..5a0f636d2f6 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -52,8 +52,8 @@ impl WriteCommand for ResolveLockLite { ); let rows = self.resolve_keys.len(); - // ti-client guarantees the size of resolve_keys will not too large, so no necessary - // to control the write_size as ResolveLock. + // ti-client guarantees the size of resolve_keys will not too large, so no + // necessary to control the write_size as ResolveLock. let mut released_locks = ReleasedLocks::new(self.start_ts, self.commit_ts); for key in self.resolve_keys { released_locks.push(if !self.commit_ts.is_zero() { diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 70e7fc4a49d..ad22e966590 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -52,8 +52,8 @@ impl WriteCommand for Rollback { let rows = self.keys.len(); let mut released_locks = ReleasedLocks::new(self.start_ts, TimeStamp::zero()); for k in self.keys { - // Rollback is called only if the transaction is known to fail. Under the circumstances, - // the rollback record needn't be protected. + // Rollback is called only if the transaction is known to fail. Under the + // circumstances, the rollback record needn't be protected. 
let released_lock = cleanup(&mut txn, &mut reader, k, TimeStamp::zero(), false)?; released_locks.push(released_lock); } diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index e894cc6835e..2149d5571da 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -210,7 +210,8 @@ pub mod tests { must_err(&engine, k, 5, 100); // Create a lock with TTL=100. - // The initial TTL will be set to 0 after calling must_prewrite_put. Update it first. + // The initial TTL will be set to 0 after calling must_prewrite_put. Update it + // first. must_prewrite_put(&engine, k, v, k, 5); must_locked(&engine, k, 5); must_success(&engine, k, 5, 100, 100); diff --git a/src/storage/txn/flow_controller/mod.rs b/src/storage/txn/flow_controller/mod.rs index f109b9896a3..c0faeac6328 100644 --- a/src/storage/txn/flow_controller/mod.rs +++ b/src/storage/txn/flow_controller/mod.rs @@ -13,7 +13,7 @@ pub enum FlowController { } macro_rules! flow_controller_fn { - ($fn_name: ident, $region_id: ident, $type: ident) => { + ($fn_name:ident, $region_id:ident, $type:ident) => { pub fn $fn_name(&self, $region_id: u64) -> $type { match self { FlowController::Singleton(ref controller) => controller.$fn_name($region_id), @@ -21,7 +21,7 @@ macro_rules! 
flow_controller_fn { } } }; - ($fn_name: ident, $region_id: ident, $bytes: ident, $type: ident) => { + ($fn_name:ident, $region_id:ident, $bytes:ident, $type:ident) => { pub fn $fn_name(&self, $region_id: u64, $bytes: usize) -> $type { match self { FlowController::Singleton(ref controller) => { diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 056c447aced..40bb50a88c8 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -45,8 +45,9 @@ enum Trend { NoTrend, } -/// Flow controller is used to throttle the write rate at scheduler level, aiming -/// to substitute the write stall mechanism of RocksDB. It features in two points: +/// Flow controller is used to throttle the write rate at scheduler level, +/// aiming to substitute the write stall mechanism of RocksDB. It features in +/// two points: /// * throttle at scheduler, so raftstore and apply won't be blocked anymore /// * better control on the throttle rate to avoid QPS drop under heavy write /// @@ -54,22 +55,22 @@ enum Trend { /// is limited to 16MB/s by default which doesn't take real disk ability into /// account. It may underestimate the disk's throughout that 16MB/s is too small /// at once, causing a very large jitter on the write duration. -/// Also, it decreases the delayed write rate further if the factors still exceed -/// the threshold. So under heavy write load, the write rate may be throttled to -/// a very low rate from time to time, causing QPS drop eventually. -/// +/// Also, it decreases the delayed write rate further if the factors still +/// exceed the threshold. So under heavy write load, the write rate may be +/// throttled to a very low rate from time to time, causing QPS drop eventually. 
/// For compaction pending bytes, we use discardable ratio to do flow control -/// which is separated mechanism from throttle speed. Compaction pending bytes is -/// a approximate value, usually, changes up and down dramatically, so it's unwise -/// to map compaction pending bytes to a specified throttle speed. Instead, -/// mapping it from soft limit to hard limit as 0% to 100% discardable ratio. With -/// this, there must be a point that foreground write rate is equal to the -/// background compaction pending bytes consuming rate so that compaction pending -/// bytes is kept around a steady level. +/// which is separated mechanism from throttle speed. Compaction pending bytes +/// is a approximate value, usually, changes up and down dramatically, so it's +/// unwise to map compaction pending bytes to a specified throttle speed. +/// Instead, mapping it from soft limit to hard limit as 0% to 100% discardable +/// ratio. With this, there must be a point that foreground write rate is equal +/// to the background compaction pending bytes consuming rate so that compaction +/// pending bytes is kept around a steady level. /// /// Here is a brief flow showing where the mechanism works: -/// grpc -> check should drop(discardable ratio) -> limiter -> async write to raftstore +/// grpc -> check should drop(discardable ratio) -> limiter -> async write to +/// raftstore pub struct EngineFlowController { discard_ratio: Arc, limiter: Arc, @@ -702,7 +703,8 @@ impl FlowChecker { .with_label_values(&[&cf]) .set((checker.long_term_pending_bytes.get_avg() * RATIO_SCALE_FACTOR as f64) as i64); - // do special check on start, see the comment of the variable definition for detail. + // do special check on start, see the comment of the variable definition for + // detail. 
if checker.on_start_pending_bytes { if num < soft || checker.long_term_pending_bytes.trend() == Trend::Increasing { // the write is accumulating, still need to throttle @@ -766,7 +768,8 @@ impl FlowChecker { let prev = checker.last_num_memtables.get_recent(); checker.last_num_memtables.observe(num_memtables); - // do special check on start, see the comment of the variable definition for detail. + // do special check on start, see the comment of the variable definition for + // detail. if checker.on_start_memtable { if num_memtables < self.memtables_threshold || checker.last_num_memtables.trend() == Trend::Increasing @@ -904,7 +907,8 @@ impl FlowChecker { let checker = self.cf_checkers.get_mut(&cf).unwrap(); let num_l0_files = checker.long_term_num_l0_files.get_recent(); - // do special check on start, see the comment of the variable definition for detail. + // do special check on start, see the comment of the variable definition for + // detail. if checker.on_start_l0_files { if num_l0_files < self.l0_files_threshold || checker.long_term_num_l0_files.trend() == Trend::Increasing @@ -1132,7 +1136,8 @@ pub(super) mod tests { tablet_suffix, )) .unwrap(); - // not throttle when the average of the sliding window doesn't exceeds the threshold + // not throttle when the average of the sliding window doesn't exceeds the + // threshold stub.0.num_memtables.store(6, Ordering::Relaxed); tx.send(FlowInfo::Flush( "default".to_string(), @@ -1523,7 +1528,8 @@ pub(super) mod tests { smoother.observe_with_time(4, now); assert_eq!(smoother.trend(), Trend::NoTrend); - // Incresing trend, the left range contains 3 records, the right range contains 1 records. + // Increasing trend, the left range contains 3 records, the right range contains + // 1 records. 
let mut smoother = Smoother::< f64, 6, @@ -1545,7 +1551,8 @@ pub(super) mod tests { smoother.observe_with_time(4.0, now); assert_eq!(smoother.trend(), Trend::Increasing); - // Decreasing trend, the left range contains 1 records, the right range contains 3 records. + // Decreasing trend, the left range contains 1 records, the right range contains + // 3 records. let mut smoother = Smoother::< f32, 6, @@ -1561,7 +1568,8 @@ pub(super) mod tests { smoother.observe_with_time(1.0, now); assert_eq!(smoother.trend(), Trend::Decreasing); - // No trend, the left range contains 1 records, the right range contains 3 records. + // No trend, the left range contains 1 records, the right range contains 3 + // records. let mut smoother = Smoother::< f32, 6, diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index d177c203ba1..14819127389 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -163,7 +163,8 @@ impl FlowInfoDispatcher { .entry(region_id) .or_insert_with(|| insert_limiter_and_checker(region_id, suffix)); // check if the checker's engine is exactly (region_id, suffix) - // if checker.suffix < suffix, it means its tablet is old and needs the refresh + // if checker.suffix < suffix, it means its tablet is old and needs the + // refresh if checker.tablet_suffix() < suffix { let engine = tablet_factory.open_tablet_cache(region_id, suffix).unwrap(); @@ -332,7 +333,6 @@ mod tests { tablet_suffix, )) .unwrap(); - //assert!(!flow_controller.tablet_exist(region_id)); } #[test] diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index 0c2ca7951ff..86d16858bd3 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -13,13 +13,16 @@ use parking_lot::{Mutex, MutexGuard}; const WAITING_LIST_SHRINK_SIZE: usize = 8; const WAITING_LIST_MAX_CAPACITY: usize = 16; -/// Latch which is used to serialize 
accesses to resources hashed to the same slot. +/// Latch which is used to serialize accesses to resources hashed to the same +/// slot. /// -/// Latches are indexed by slot IDs. The keys of a command are hashed into unsigned numbers, -/// then the command is added to the waiting queues of the latches. +/// Latches are indexed by slot IDs. The keys of a command are hashed into +/// unsigned numbers, then the command is added to the waiting queues of the +/// latches. /// -/// If command A is ahead of command B in one latch, it must be ahead of command B in all the -/// overlapping latches. This is an invariant ensured by the `gen_lock`, `acquire` and `release`. +/// If command A is ahead of command B in one latch, it must be ahead of command +/// B in all the overlapping latches. This is an invariant ensured by the +/// `gen_lock`, `acquire` and `release`. #[derive(Clone)] struct Latch { // store hash value of the key and command ID which requires this key. @@ -34,7 +37,8 @@ impl Latch { } } - /// Find the first command ID in the queue whose hash value is equal to hash. + /// Find the first command ID in the queue whose hash value is equal to + /// hash. pub fn get_first_req_by_hash(&self, hash: u64) -> Option { for (h, cid) in self.waiting.iter().flatten() { if *h == hash { @@ -44,10 +48,11 @@ impl Latch { None } - /// Remove the first command ID in the queue whose hash value is equal to hash_key. - /// If the element which would be removed does not appear at the front of the queue, it will leave - /// a hole in the queue. So we must remove consecutive hole when remove the head of the - /// queue to make the queue not too long. + /// Remove the first command ID in the queue whose hash value is equal to + /// hash_key. If the element which would be removed does not appear at the + /// front of the queue, it will leave a hole in the queue. So we must remove + /// consecutive hole when remove the head of the queue to make the queue not + /// too long. 
pub fn pop_front(&mut self, key_hash: u64) -> Option<(u64, u64)> { if let Some(item) = self.waiting.pop_front() { if let Some((k, _)) = item.as_ref() { @@ -74,8 +79,8 @@ impl Latch { self.waiting.push_back(Some((key_hash, cid))); } - /// For some hot keys, the waiting list maybe very long, so we should shrink the waiting - /// VecDeque after pop. + /// For some hot keys, the waiting list maybe very long, so we should shrink + /// the waiting VecDeque after pop. fn maybe_shrink(&mut self) { // Pop item which is none to make queue not too long. while let Some(item) = self.waiting.front() { @@ -95,7 +100,8 @@ impl Latch { /// Lock required for a command. #[derive(Clone)] pub struct Lock { - /// The hash value of the keys that a command must acquire before being able to be processed. + /// The hash value of the keys that a command must acquire before being able + /// to be processed. pub required_hashes: Vec, /// The number of latches that the command has acquired. @@ -126,7 +132,8 @@ impl Lock { } } - /// Returns true if all the required latches have be acquired, false otherwise. + /// Returns true if all the required latches have be acquired, false + /// otherwise. pub fn acquired(&self) -> bool { self.required_hashes.len() == self.owned_count } @@ -138,8 +145,9 @@ impl Lock { /// Latches which are used for concurrency control in the scheduler. /// -/// Each latch is indexed by a slot ID, hence the term latch and slot are used interchangeably, but -/// conceptually a latch is a queue, and a slot is an index to the queue. +/// Each latch is indexed by a slot ID, hence the term latch and slot are used +/// interchangeably, but conceptually a latch is a queue, and a slot is an index +/// to the queue. pub struct Latches { slots: Vec>>, size: usize, @@ -156,11 +164,13 @@ impl Latches { Latches { slots, size } } - /// Tries to acquire the latches specified by the `lock` for command with ID `who`. 
+ /// Tries to acquire the latches specified by the `lock` for command with ID + /// `who`. /// - /// This method will enqueue the command ID into the waiting queues of the latches. A latch is - /// considered acquired if the command ID is the first one of elements in the queue which have - /// the same hash value. Returns true if all the Latches are acquired, false otherwise. + /// This method will enqueue the command ID into the waiting queues of the + /// latches. A latch is considered acquired if the command ID is the first + /// one of elements in the queue which have the same hash value. Returns + /// true if all the Latches are acquired, false otherwise. pub fn acquire(&self, lock: &mut Lock, who: u64) -> bool { let mut acquired_count: usize = 0; for &key_hash in &lock.required_hashes[lock.owned_count..] { @@ -184,9 +194,11 @@ impl Latches { lock.acquired() } - /// Releases all latches owned by the `lock` of command with ID `who`, returns the wakeup list. + /// Releases all latches owned by the `lock` of command with ID `who`, + /// returns the wakeup list. /// - /// Preconditions: the caller must ensure the command is at the front of the latches. + /// Preconditions: the caller must ensure the command is at the front of the + /// latches. pub fn release(&self, lock: &Lock, who: u64) -> Vec { let mut wakeup_list: Vec = vec![]; for &key_hash in &lock.required_hashes[..lock.owned_count] { diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 12ff44bbd61..5894efc3226 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -61,7 +61,8 @@ impl SchedPool { name_prefix: &str, ) -> Self { let engine = Arc::new(Mutex::new(engine)); - // for low cpu quota env, set the max-thread-count as 4 to allow potential cases that we need more thread than cpu num. + // for low cpu quota env, set the max-thread-count as 4 to allow potential cases + // that we need more thread than cpu num. 
let max_pool_size = std::cmp::max( pool_size, std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 2588e820d21..fb32f767bd5 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -3,23 +3,25 @@ // #[PerformanceCriticalPath //! Scheduler which schedules the execution of `storage::Command`s. //! -//! There is one scheduler for each store. It receives commands from clients, executes them against -//! the MVCC layer storage engine. +//! There is one scheduler for each store. It receives commands from clients, +//! executes them against the MVCC layer storage engine. //! -//! Logically, the data organization hierarchy from bottom to top is row -> region -> store -> -//! database. But each region is replicated onto N stores for reliability, the replicas form a Raft -//! group, one of which acts as the leader. When the client read or write a row, the command is -//! sent to the scheduler which is on the region leader's store. +//! Logically, the data organization hierarchy from bottom to top is row -> +//! region -> store -> database. But each region is replicated onto N stores for +//! reliability, the replicas form a Raft group, one of which acts as the +//! leader. When the client read or write a row, the command is sent to the +//! scheduler which is on the region leader's store. //! -//! Scheduler runs in a single-thread event loop, but command executions are delegated to a pool of -//! worker thread. +//! Scheduler runs in a single-thread event loop, but command executions are +//! delegated to a pool of worker thread. //! -//! Scheduler keeps track of all the running commands and uses latches to ensure serialized access -//! to the overlapping rows involved in concurrent commands. But note that scheduler only ensures -//! serialized access to the overlapping rows at command level, but a transaction may consist of -//! 
multiple commands, therefore conflicts may happen at transaction level. Transaction semantics -//! is ensured by the transaction protocol implemented in the client library, which is transparent -//! to the scheduler. +//! Scheduler keeps track of all the running commands and uses latches to ensure +//! serialized access to the overlapping rows involved in concurrent commands. +//! But note that scheduler only ensures serialized access to the overlapping +//! rows at command level, but a transaction may consist of multiple commands, +//! therefore conflicts may happen at transaction level. Transaction semantics +//! is ensured by the transaction protocol implemented in the client library, +//! which is transparent to the scheduler. use std::{ marker::PhantomData, @@ -75,8 +77,8 @@ use crate::{ const TASKS_SLOTS_NUM: usize = 1 << 12; // 4096 slots. -// The default limit is set to be very large. Then, requests without `max_exectuion_duration` -// will not be aborted unexpectedly. +// The default limit is set to be very large. Then, requests without +// `max_exectuion_duration` will not be aborted unexpectedly. pub const DEFAULT_EXECUTION_DURATION_LIMIT: Duration = Duration::from_secs(24 * 60 * 60); const IN_MEMORY_PESSIMISTIC_LOCK: Feature = Feature::require(6, 0, 0); @@ -279,18 +281,19 @@ impl SchedulerInner { /// Tries to acquire all the required latches for a command when waken up by /// another finished command. /// - /// Returns a deadline error if the deadline is exceeded. Returns the `Task` if - /// all latches are acquired, returns `None` otherwise. + /// Returns a deadline error if the deadline is exceeded. Returns the `Task` + /// if all latches are acquired, returns `None` otherwise. fn acquire_lock_on_wakeup(&self, cid: u64) -> Result, StorageError> { let mut task_slot = self.get_task_slot(cid); let tctx = task_slot.get_mut(&cid).unwrap(); - // Check deadline early during acquiring latches to avoid expired requests blocking - // other requests. 
+ // Check deadline early during acquiring latches to avoid expired requests + // blocking other requests. if let Err(e) = tctx.task.as_ref().unwrap().cmd.deadline().check() { - // `acquire_lock_on_wakeup` is called when another command releases its locks and wakes up - // command `cid`. This command inserted its lock before and now the lock is at the - // front of the queue. The actual acquired count is one more than the `owned_count` - // recorded in the lock, so we increase one to make `release` work. + // `acquire_lock_on_wakeup` is called when another command releases its locks + // and wakes up command `cid`. This command inserted its lock before + // and now the lock is at the front of the queue. The actual + // acquired count is one more than the `owned_count` recorded in the + // lock, so we increase one to make `release` work. tctx.lock.owned_count += 1; return Err(e.into()); } @@ -463,8 +466,9 @@ impl Scheduler { fail_point!("txn_scheduler_acquire_fail"); } - /// Tries to acquire all the necessary latches. If all the necessary latches are acquired, - /// the method initiates a get snapshot operation for further processing. + /// Tries to acquire all the necessary latches. If all the necessary latches + /// are acquired, the method initiates a get snapshot operation for further + /// processing. fn try_to_wake_up(&self, cid: u64) { match self.inner.acquire_lock_on_wakeup(cid) { Ok(Some(task)) => { @@ -579,8 +583,8 @@ impl Scheduler { /// Event handler for the success of read. /// - /// If a next command is present, continues to execute; otherwise, delivers the result to the - /// callback. + /// If a next command is present, continues to execute; otherwise, delivers + /// the result to the callback. 
fn on_read_finished(&self, cid: u64, pr: ProcessResult, tag: CommandKind) { SCHED_STAGE_COUNTER_VEC.get(tag).read_finish.inc(); @@ -627,9 +631,9 @@ impl Scheduler { drop(lock_guards); let tctx = self.inner.dequeue_task_context(cid); - // If pipelined pessimistic lock or async apply prewrite takes effect, it's not guaranteed - // that the proposed or committed callback is surely invoked, which takes and invokes - // `tctx.cb(tctx.pr)`. + // If pipelined pessimistic lock or async apply prewrite takes effect, it's not + // guaranteed that the proposed or committed callback is surely invoked, which + // takes and invokes `tctx.cb(tctx.pr)`. if let Some(cb) = tctx.cb { let pr = match result { Ok(()) => pr.or(tctx.pr).unwrap(), @@ -742,8 +746,8 @@ impl Scheduler { .await; } - /// Processes a read command within a worker thread, then posts `ReadFinished` message back to the - /// `Scheduler`. + /// Processes a read command within a worker thread, then posts + /// `ReadFinished` message back to the `Scheduler`. fn process_read(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_read"); debug!("process read cmd in worker pool"; "cid" => task.cid); @@ -764,8 +768,9 @@ impl Scheduler { self.on_read_finished(task.cid, pr, tag); } - /// Processes a write command within a worker thread, then posts either a `WriteFinished` - /// message if successful or a `FinishedWithErr` message back to the `Scheduler`. + /// Processes a write command within a worker thread, then posts either a + /// `WriteFinished` message if successful or a `FinishedWithErr` message + /// back to the `Scheduler`. async fn process_write(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_write"); let write_bytes = task.cmd.write_bytes(); @@ -806,7 +811,8 @@ impl Scheduler { }; if write_result.is_ok() { - // TODO: write bytes can be a bit inaccurate due to error requests or in-memory pessimistic locks. 
+ // TODO: write bytes can be a bit inaccurate due to error requests or in-memory + // pessimistic locks. sample.add_write_bytes(write_bytes); } let read_bytes = statistics.cf_statistics(CF_DEFAULT).flow_stats.read_bytes @@ -833,16 +839,16 @@ impl Scheduler { .map_err(StorageError::from) .and(write_result) { - // Write prepare failure typically means conflicting transactions are detected. Delivers the - // error to the callback, and releases the latches. + // Write prepare failure typically means conflicting transactions are detected. Delivers + // the error to the callback, and releases the latches. Err(err) => { SCHED_STAGE_COUNTER_VEC.get(tag).prepare_write_err.inc(); debug!("write command failed"; "cid" => cid, "err" => ?err); scheduler.finish_with_err(cid, err); return; } - // Initiates an async write operation on the storage engine, there'll be a `WriteFinished` - // message when it finishes. + // Initiates an async write operation on the storage engine, there'll be a + // `WriteFinished` message when it finishes. Ok(res) => res, }; let region_id = ctx.get_region_id(); @@ -963,10 +969,10 @@ impl Scheduler { let _ = self.inner.flow_controller.consume(region_id, write_size); } else { let start = Instant::now_coarse(); - // Control mutex is used to ensure there is only one request consuming the quota. - // The delay may exceed 1s, and the speed limit is changed every second. - // If the speed of next second is larger than the one of first second, - // without the mutex, the write flow can't throttled strictly. + // Control mutex is used to ensure there is only one request consuming the + // quota. The delay may exceed 1s, and the speed limit is changed every second. + // If the speed of next second is larger than the one of first second, without + // the mutex, the write flow can't throttled strictly. 
let control_mutex = self.inner.control_mutex.clone(); let _guard = control_mutex.lock().await; let delay = self.inner.flow_controller.consume(region_id, write_size); @@ -994,8 +1000,9 @@ impl Scheduler { let (version, term) = (ctx.get_region_epoch().get_version(), ctx.get_term()); // Mutations on the lock CF should overwrite the memory locks. - // We only set a deleted flag here, and the lock will be finally removed when it finishes - // applying. See the comments in `PeerPessimisticLocks` for how this flag is used. + // We only set a deleted flag here, and the lock will be finally removed when it + // finishes applying. See the comments in `PeerPessimisticLocks` for how this + // flag is used. let txn_ext2 = txn_ext.clone(); let mut pessimistic_locks_guard = txn_ext2 .as_ref() @@ -1021,29 +1028,33 @@ impl Scheduler { } _ => vec![], }; - // Keep the read lock guard of the pessimistic lock table until the request is sent to the raftstore. + // Keep the read lock guard of the pessimistic lock table until the request is + // sent to the raftstore. // - // If some in-memory pessimistic locks need to be proposed, we will propose another TransferLeader - // command. Then, we can guarentee even if the proposed locks don't include the locks deleted here, - // the response message of the transfer leader command must be later than this write command because - // this write command has been sent to the raftstore. Then, we don't need to worry this request will - // fail due to the voluntary leader transfer. + // If some in-memory pessimistic locks need to be proposed, we will propose + // another TransferLeader command. Then, we can guarentee even if the proposed + // locks don't include the locks deleted here, the response message of the + // transfer leader command must be later than this write command because this + // write command has been sent to the raftstore. Then, we don't need to worry + // this request will fail due to the voluntary leader transfer. 
let _downgraded_guard = pessimistic_locks_guard.and_then(|guard| { (!removed_pessimistic_locks.is_empty()).then(|| RwLockWriteGuard::downgrade(guard)) }); - // The callback to receive async results of write prepare from the storage engine. + // The callback to receive async results of write prepare from the storage + // engine. let engine_cb = Box::new(move |result: EngineResult<()>| { let ok = result.is_ok(); if ok && !removed_pessimistic_locks.is_empty() { - // Removing pessimistic locks when it succeeds to apply. This should be done in the apply - // thread, to make sure it happens before other admin commands are executed. + // Removing pessimistic locks when it succeeds to apply. This should be done in + // the apply thread, to make sure it happens before other admin commands are + // executed. if let Some(mut pessimistic_locks) = txn_ext .as_ref() .map(|txn_ext| txn_ext.pessimistic_locks.write()) { - // If epoch version or term does not match, region or leader change has happened, - // so we needn't remove the key. + // If epoch version or term does not match, region or leader change has + // happened, so we needn't remove the key. if pessimistic_locks.term == term && pessimistic_locks.version == version { for key in removed_pessimistic_locks { pessimistic_locks.remove(&key); @@ -1070,8 +1081,9 @@ impl Scheduler { .observe(rows as f64); if !ok { - // Only consume the quota when write succeeds, otherwise failed write requests may exhaust - // the quota and other write requests would be in long delay. + // Only consume the quota when write succeeds, otherwise failed write + // requests may exhaust the quota and other write requests would be in long + // delay. if sched.inner.flow_controller.enabled() { sched.inner.flow_controller.unconsume(region_id, write_size); } @@ -1095,7 +1107,8 @@ impl Scheduler { } } - /// Returns whether it succeeds to write pessimistic locks to the in-memory lock table. 
+ /// Returns whether it succeeds to write pessimistic locks to the in-memory + /// lock table. fn try_write_in_memory_pessimistic_locks( &self, txn_ext: Option<&TxnExt>, @@ -1107,10 +1120,11 @@ impl Scheduler { None => return false, }; let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); - // When not writable, it only means we cannot write locks to the in-memory lock table, - // but it is still possible for the region to propose request. - // When term or epoch version has changed, the request must fail. To be simple, here we just - // let the request fallback to propose and let raftstore generate an appropriate error. + // When not writable, it only means we cannot write locks to the in-memory lock + // table, but it is still possible for the region to propose request. + // When term or epoch version has changed, the request must fail. To be simple, + // here we just let the request fallback to propose and let raftstore generate + // an appropriate error. if !pessimistic_locks.is_writable() || pessimistic_locks.term != context.get_term() || pessimistic_locks.version != context.get_region_epoch().get_version() @@ -1550,7 +1564,8 @@ mod tests { // time limit is 100ms. thread::sleep(Duration::from_millis(200)); - // When releasing the lock, the queuing tasks should be all waken up without stack overflow. + // When releasing the lock, the queuing tasks should be all waken up without + // stack overflow. scheduler.release_lock(&lock, cid); // A new request should not be blocked. diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 59f9f077aa2..2cd4afaf932 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -20,7 +20,8 @@ pub trait Store: Send { /// Fetch the provided key. fn get(&self, key: &Key, statistics: &mut Statistics) -> Result>; - /// Re-use last cursor to incrementally (if possible) fetch the provided key. + /// Re-use last cursor to incrementally (if possible) fetch the provided + /// key. 
fn incremental_get(&mut self, key: &Key) -> Result>; /// Take the statistics. Currently only available for `incremental_get`. @@ -49,13 +50,15 @@ pub trait Store: Send { /// [`Scanner`]s allow retrieving items or batches from a scan result. /// -/// Commonly they are obtained as a result of a [`scanner`](Store::scanner) operation. +/// Commonly they are obtained as a result of a [`scanner`](Store::scanner) +/// operation. pub trait Scanner: Send { /// Get the next [`KvPair`](KvPair) if it exists. fn next(&mut self) -> Result>; /// Get the next [`KvPair`](KvPair)s up to `limit` if they exist. - /// If `sample_step` is greater than 0, skips `sample_step - 1` number of keys after each returned key. + /// If `sample_step` is greater than 0, skips `sample_step - 1` number of + /// keys after each returned key. fn scan(&mut self, limit: usize, sample_step: usize) -> Result>> { let mut row_count = 0; let mut results = Vec::with_capacity(limit); @@ -1162,12 +1165,14 @@ mod tests { Some((Key::from_raw(b"z"), b"beta".to_vec())) ); assert!(scanner.next().is_err()); - // note: mvcc impl does not guarantee to work any more after meeting a non lock error + // note: mvcc impl does not guarantee to work any more after meeting a non lock + // error assert_eq!(scanner.next().unwrap(), None); let mut scanner = store.scanner(true, false, false, None, None).unwrap(); assert!(scanner.next().is_err()); - // note: mvcc impl does not guarantee to work any more after meeting a non lock error + // note: mvcc impl does not guarantee to work any more after meeting a non lock + // error assert_eq!( scanner.next().unwrap(), Some((Key::from_raw(b"z"), b"beta".to_vec())) @@ -1224,7 +1229,8 @@ mod tests { ); assert_eq!(scanner.next().unwrap(), Some((Key::from_raw(b"z"), vec![]))); assert!(scanner.next().is_err()); - // note: mvcc impl does not guarantee to work any more after meeting a non lock error + // note: mvcc impl does not guarantee to work any more after meeting a non lock + // error 
assert_eq!(scanner.next().unwrap(), None); let mut scanner = store diff --git a/src/storage/types.rs b/src/storage/types.rs index fe4319da97c..70cd7d2d991 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -123,11 +123,13 @@ pub struct PrewriteResult { #[derive(Clone, Debug, PartialEq)] pub enum PessimisticLockRes { - /// The previous value is loaded while handling the `AcquirePessimisticLock` command. The i-th - /// item is the value of the i-th key in the `AcquirePessimisticLock` command. + /// The previous value is loaded while handling the `AcquirePessimisticLock` + /// command. The i-th item is the value of the i-th key in the + /// `AcquirePessimisticLock` command. Values(Vec>), - /// Checked whether the key exists while handling the `AcquirePessimisticLock` command. The i-th - /// item is true if the i-th key in the `AcquirePessimisticLock` command exists. + /// Checked whether the key exists while handling the + /// `AcquirePessimisticLock` command. The i-th item is true if the i-th key + /// in the `AcquirePessimisticLock` command exists. Existence(Vec), Empty, } diff --git a/tests/benches/coprocessor_executors/hash_aggr/mod.rs b/tests/benches/coprocessor_executors/hash_aggr/mod.rs index f7893e66bdc..07f28c22d63 100644 --- a/tests/benches/coprocessor_executors/hash_aggr/mod.rs +++ b/tests/benches/coprocessor_executors/hash_aggr/mod.rs @@ -40,8 +40,8 @@ fn bench_hash_aggr_count_1_group_by_int_col_2_groups( } /// COUNT(1) GROUP BY COL > X. -/// Half of the row belong to one group and the rest belong to another group. Thus there are -/// totally two groups. +/// Half of the row belong to one group and the rest belong to another group. +/// Thus there are totally two groups. 
fn bench_hash_aggr_count_1_group_by_fn_2_groups( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -94,8 +94,8 @@ fn bench_hash_aggr_count_1_group_by_decimal_col_2_groups( input.bencher.bench(b, &fb, &group_by, &[expr]); } -/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real column. -/// Each row is a new group. +/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real +/// column. Each row is a new group. fn bench_hash_aggr_count_1_group_by_int_col_real_col( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -115,8 +115,8 @@ fn bench_hash_aggr_count_1_group_by_int_col_real_col( input.bencher.bench(b, &fb, &group_by, &[expr]); } -/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real column. -/// There will be two groups totally. +/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real +/// column. There will be two groups totally. fn bench_hash_aggr_count_1_group_by_int_col_real_col_2_groups( b: &mut criterion::Bencher<'_, M>, input: &Input, diff --git a/tests/benches/coprocessor_executors/hash_aggr/util.rs b/tests/benches/coprocessor_executors/hash_aggr/util.rs index efa92ebf0cb..b799276b193 100644 --- a/tests/benches/coprocessor_executors/hash_aggr/util.rs +++ b/tests/benches/coprocessor_executors/hash_aggr/util.rs @@ -39,8 +39,8 @@ where } } -/// A bencher that will use batch hash aggregation executor to bench the giving aggregate -/// expression. +/// A bencher that will use batch hash aggregation executor to bench the giving +/// aggregate expression. 
pub struct BatchBencher; impl HashAggrBencher for BatchBencher diff --git a/tests/benches/coprocessor_executors/index_scan/fixture.rs b/tests/benches/coprocessor_executors/index_scan/fixture.rs index 286a2a22e1e..20ee6d41369 100644 --- a/tests/benches/coprocessor_executors/index_scan/fixture.rs +++ b/tests/benches/coprocessor_executors/index_scan/fixture.rs @@ -3,8 +3,8 @@ use test_coprocessor::*; use tikv::storage::RocksEngine; -/// Builds a fixture table, which contains two columns: id, foo and there is an index over -/// `foo` column. +/// Builds a fixture table, which contains two columns: id, foo and there is an +/// index over `foo` column. pub fn table_with_2_columns_and_one_index(rows: usize) -> (i64, Table, Store) { let index_id = next_id(); let id = ColumnBuilder::new() diff --git a/tests/benches/coprocessor_executors/index_scan/mod.rs b/tests/benches/coprocessor_executors/index_scan/mod.rs index 93a9cd4a3fb..ba29f08bb87 100644 --- a/tests/benches/coprocessor_executors/index_scan/mod.rs +++ b/tests/benches/coprocessor_executors/index_scan/mod.rs @@ -11,8 +11,8 @@ const ROWS: usize = 5000; /// 1 interested column, which is PK (which is in the key). /// -/// This kind of scanner is used in SQLs like `SELECT * FROM .. WHERE index = X`, an index lookup -/// will be performed so that PK is needed. +/// This kind of scanner is used in SQLs like `SELECT * FROM .. WHERE index = +/// X`, an index lookup will be performed so that PK is needed. fn bench_index_scan_primary_key(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement + 'static, @@ -27,10 +27,12 @@ where ); } -/// 1 interested column, which is the column of the index itself (which is in the key). +/// 1 interested column, which is the column of the index itself (which is in +/// the key). /// -/// This kind of scanner is used in SQLs like `SELECT COUNT(*) FROM .. WHERE index = X` or -/// `SELECT index FROM .. WHERE index = X`. There is no double read. 
+/// This kind of scanner is used in SQLs like `SELECT COUNT(*) FROM .. WHERE +/// index = X` or `SELECT index FROM .. WHERE index = X`. There is no double +/// read. fn bench_index_scan_index(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement + 'static, diff --git a/tests/benches/coprocessor_executors/integrated/mod.rs b/tests/benches/coprocessor_executors/integrated/mod.rs index e3e64709625..cb7e48f3bd7 100644 --- a/tests/benches/coprocessor_executors/integrated/mod.rs +++ b/tests/benches/coprocessor_executors/integrated/mod.rs @@ -19,7 +19,8 @@ where { let (table, store) = crate::table_scan::fixture::table_with_2_columns(input.rows); - // TODO: Change to use `DAGSelect` helper when it no longer place unnecessary columns. + // TODO: Change to use `DAGSelect` helper when it no longer place unnecessary + // columns. let executors = &[ table_scan(&[table["id"].as_column_info()]), simple_aggregate(&[ @@ -260,7 +261,8 @@ fn bench_select_count_1_group_by_int_col_group_few_stream( bench_select_count_1_group_by_int_col_stream_impl(table, store, b, input); } -/// SELECT COUNT(1) FROM Table GROUP BY int_col (n groups, n = row_count, stream aggregation) +/// SELECT COUNT(1) FROM Table GROUP BY int_col (n groups, n = row_count, stream +/// aggregation) fn bench_select_count_1_group_by_int_col_group_many_stream( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -365,7 +367,8 @@ fn bench_select_count_1_group_by_2_col_group_few( bench_select_count_1_group_by_2_col_impl(table, store, b, input); } -/// SELECT COUNT(1) FROM Table GROUP BY int_col, int_col + 1 (n groups, n = row_count) +/// SELECT COUNT(1) FROM Table GROUP BY int_col, int_col + 1 (n groups, n = +/// row_count) fn bench_select_count_1_group_by_2_col_group_many( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -407,7 +410,8 @@ fn bench_select_count_1_group_by_2_col_stream_impl( .bench(b, executors, &[table.get_record_range_all()], &store); } -/// SELECT COUNT(1) FROM Table GROUP BY int_col, 
int_col + 1 (2 groups, stream aggregation) +/// SELECT COUNT(1) FROM Table GROUP BY int_col, int_col + 1 (2 groups, stream +/// aggregation) fn bench_select_count_1_group_by_2_col_group_few_stream( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -418,7 +422,8 @@ fn bench_select_count_1_group_by_2_col_group_few_stream( bench_select_count_1_group_by_2_col_stream_impl(table, store, b, input); } -/// SELECT COUNT(1) FROM Table GROUP BY int_col, int_col + 1 (n groups, n = row_count, stream aggregation) +/// SELECT COUNT(1) FROM Table GROUP BY int_col, int_col + 1 (n groups, n = +/// row_count, stream aggregation) fn bench_select_count_1_group_by_2_col_group_many_stream( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -429,7 +434,8 @@ fn bench_select_count_1_group_by_2_col_group_many_stream( bench_select_count_1_group_by_2_col_stream_impl(table, store, b, input); } -/// SELECT COUNT(1) FROM Table WHERE id > X GROUP BY int_col (2 groups, selectivity = 5%) +/// SELECT COUNT(1) FROM Table WHERE id > X GROUP BY int_col (2 groups, +/// selectivity = 5%) fn bench_select_count_1_where_fn_group_by_int_col_group_few_sel_l( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -531,7 +537,8 @@ fn bench_select_order_by_3_col_impl( .bench(b, executors, &[table.get_record_range_all()], &store); } -/// SELECT id, col1, col2 FROM Table ORDER BY isnull(col1), col1, col2 DESC LIMIT 10 +/// SELECT id, col1, col2 FROM Table ORDER BY isnull(col1), col1, col2 DESC +/// LIMIT 10 fn bench_select_order_by_3_col_limit_small(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -539,7 +546,8 @@ where bench_select_order_by_3_col_impl(10, b, input); } -/// SELECT id, col1, col2 FROM Table ORDER BY isnull(col1), col1, col2 DESC LIMIT 4000 +/// SELECT id, col1, col2 FROM Table ORDER BY isnull(col1), col1, col2 DESC +/// LIMIT 4000 fn bench_select_order_by_3_col_limit_large(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -591,8 +599,8 @@ fn 
bench_select_where_fn_order_by_3_col_impl( .bench(b, executors, &[table.get_record_range_all()], &store); } -/// SELECT id, col1, col2 FROM Table WHERE id > X ORDER BY isnull(col1), col1, col2 DESC LIMIT 10 -/// (selectivity = 0%) +/// SELECT id, col1, col2 FROM Table WHERE id > X ORDER BY isnull(col1), col1, +/// col2 DESC LIMIT 10 (selectivity = 0%) fn bench_select_where_fn_order_by_3_col_limit_small( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -602,8 +610,8 @@ fn bench_select_where_fn_order_by_3_col_limit_small( bench_select_where_fn_order_by_3_col_impl(10, b, input); } -/// SELECT id, col1, col2 FROM Table WHERE id > X ORDER BY isnull(col1), col1, col2 DESC LIMIT 4000 -/// (selectivity = 0%) +/// SELECT id, col1, col2 FROM Table WHERE id > X ORDER BY isnull(col1), col1, +/// col2 DESC LIMIT 4000 (selectivity = 0%) fn bench_select_where_fn_order_by_3_col_limit_large( b: &mut criterion::Bencher<'_, M>, input: &Input, diff --git a/tests/benches/coprocessor_executors/selection/util.rs b/tests/benches/coprocessor_executors/selection/util.rs index ef2548a3c42..85e39f49cfe 100644 --- a/tests/benches/coprocessor_executors/selection/util.rs +++ b/tests/benches/coprocessor_executors/selection/util.rs @@ -31,7 +31,8 @@ where } } -/// A bencher that will use batch selection aggregation executor to bench the giving expressions. +/// A bencher that will use batch selection aggregation executor to bench the +/// giving expressions. pub struct BatchBencher; impl SelectionBencher for BatchBencher diff --git a/tests/benches/coprocessor_executors/simple_aggr/util.rs b/tests/benches/coprocessor_executors/simple_aggr/util.rs index e3cbe14dd37..e13d1be503f 100644 --- a/tests/benches/coprocessor_executors/simple_aggr/util.rs +++ b/tests/benches/coprocessor_executors/simple_aggr/util.rs @@ -31,8 +31,8 @@ where } } -/// A bencher that will use batch simple aggregation executor to bench the giving aggregate -/// expression. 
+/// A bencher that will use batch simple aggregation executor to bench the +/// giving aggregate expression. pub struct BatchBencher; impl SimpleAggrBencher for BatchBencher diff --git a/tests/benches/coprocessor_executors/stream_aggr/mod.rs b/tests/benches/coprocessor_executors/stream_aggr/mod.rs index 9f0f3a34e66..fa82fa620a7 100644 --- a/tests/benches/coprocessor_executors/stream_aggr/mod.rs +++ b/tests/benches/coprocessor_executors/stream_aggr/mod.rs @@ -74,8 +74,8 @@ fn bench_stream_aggr_count_1_group_by_decimal_col_2_groups( input.bencher.bench(b, &fb, &group_by, &[expr]); } -/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real column. -/// Each row is a new group. +/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real +/// column. Each row is a new group. fn bench_stream_aggr_count_1_group_by_int_col_real_col( b: &mut criterion::Bencher<'_, M>, input: &Input, @@ -95,8 +95,8 @@ fn bench_stream_aggr_count_1_group_by_int_col_real_col( input.bencher.bench(b, &fb, &group_by, &[expr]); } -/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real column. -/// There will be two groups totally. +/// COUNT(1) GROUP BY COL1, COL2 where COL1 is a int column and COL2 is a real +/// column. There will be two groups totally. fn bench_stream_aggr_count_1_group_by_int_col_real_col_2_groups( b: &mut criterion::Bencher<'_, M>, input: &Input, diff --git a/tests/benches/coprocessor_executors/stream_aggr/util.rs b/tests/benches/coprocessor_executors/stream_aggr/util.rs index b31a220b837..cba952150bb 100644 --- a/tests/benches/coprocessor_executors/stream_aggr/util.rs +++ b/tests/benches/coprocessor_executors/stream_aggr/util.rs @@ -37,8 +37,8 @@ where } } -/// A bencher that will use batch stream aggregation executor to bench the giving aggregate -/// expression. +/// A bencher that will use batch stream aggregation executor to bench the +/// giving aggregate expression. 
pub struct BatchBencher; impl StreamAggrBencher for BatchBencher diff --git a/tests/benches/coprocessor_executors/table_scan/fixture.rs b/tests/benches/coprocessor_executors/table_scan/fixture.rs index 8005f6fab8a..7e3dd2bfc32 100644 --- a/tests/benches/coprocessor_executors/table_scan/fixture.rs +++ b/tests/benches/coprocessor_executors/table_scan/fixture.rs @@ -23,7 +23,8 @@ pub fn table_with_2_columns(rows: usize) -> (Table, Store) { (table, store) } -/// Builds a fixture table, which contains specified number of columns: col0, col1, col2, ... +/// Builds a fixture table, which contains specified number of columns: col0, +/// col1, col2, ... pub fn table_with_multi_columns(rows: usize, columns: usize) -> (Table, Store) { let mut table = TableBuilder::new(); for idx in 0..columns { @@ -44,8 +45,8 @@ pub fn table_with_multi_columns(rows: usize, columns: usize) -> (Table, Store (Table, Store) { let mut table = TableBuilder::new(); for idx in 0..columns { @@ -67,7 +68,8 @@ pub fn table_with_missing_column(rows: usize, columns: usize) -> (Table, Store (Table, Store) { let id = ColumnBuilder::new() .col_type(TYPE_LONG) diff --git a/tests/benches/coprocessor_executors/table_scan/mod.rs b/tests/benches/coprocessor_executors/table_scan/mod.rs index 288374ae741..b030a236cbd 100644 --- a/tests/benches/coprocessor_executors/table_scan/mod.rs +++ b/tests/benches/coprocessor_executors/table_scan/mod.rs @@ -26,7 +26,8 @@ where ); } -/// 1 interested column, at the front of each row. Each row contains 100 columns. +/// 1 interested column, at the front of each row. Each row contains 100 +/// columns. /// /// This kind of scanner is used in SQLs like `SELECT COUNT(column)`. fn bench_table_scan_datum_front(b: &mut criterion::Bencher<'_, M>, input: &Input) @@ -43,7 +44,8 @@ where ); } -/// 2 interested columns, at the front of each row. Each row contains 100 columns. +/// 2 interested columns, at the front of each row. Each row contains 100 +/// columns. 
fn bench_table_scan_datum_multi_front(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -76,8 +78,8 @@ where ); } -/// 100 interested columns, all columns in the row are interested (i.e. there are totally 100 -/// columns in the row). +/// 100 interested columns, all columns in the row are interested (i.e. there +/// are totally 100 columns in the row). fn bench_table_scan_datum_all(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -92,7 +94,8 @@ where ); } -/// 3 columns in the row and the last column is very long but only PK is interested. +/// 3 columns in the row and the last column is very long but only PK is +/// interested. fn bench_table_scan_long_datum_primary_key(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -107,7 +110,8 @@ where ); } -/// 3 columns in the row and the last column is very long but a short column is interested. +/// 3 columns in the row and the last column is very long but a short column is +/// interested. fn bench_table_scan_long_datum_normal(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -122,7 +126,8 @@ where ); } -/// 3 columns in the row and the last column is very long and the long column is interested. +/// 3 columns in the row and the last column is very long and the long column is +/// interested. fn bench_table_scan_long_datum_long(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -137,7 +142,8 @@ where ); } -/// 3 columns in the row and the last column is very long and the all columns are interested. +/// 3 columns in the row and the last column is very long and the all columns +/// are interested. fn bench_table_scan_long_datum_all(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -156,8 +162,8 @@ where ); } -/// 1 interested column, but the column is missing from each row (i.e. it's default value is -/// used instead). Each row contains totally 10 columns. 
+/// 1 interested column, but the column is missing from each row (i.e. it's +/// default value is used instead). Each row contains totally 10 columns. fn bench_table_scan_datum_absent(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, @@ -172,8 +178,8 @@ where ); } -/// 1 interested column, but the column is missing from each row (i.e. it's default value is -/// used instead). Each row contains totally 100 columns. +/// 1 interested column, but the column is missing from each row (i.e. it's +/// default value is used instead). Each row contains totally 100 columns. fn bench_table_scan_datum_absent_large_row(b: &mut criterion::Bencher<'_, M>, input: &Input) where M: Measurement, diff --git a/tests/benches/coprocessor_executors/util/fixture.rs b/tests/benches/coprocessor_executors/util/fixture.rs index 0836be732f7..5910ab4fc69 100644 --- a/tests/benches/coprocessor_executors/util/fixture.rs +++ b/tests/benches/coprocessor_executors/util/fixture.rs @@ -65,7 +65,8 @@ impl FixtureBuilder { self } - /// Pushes a i64 column that values are randomly sampled from the giving values. + /// Pushes a i64 column that values are randomly sampled from the giving + /// values. pub fn push_column_i64_sampled(mut self, samples: &[i64]) -> Self { let mut rng: XorShiftRng = SeedableRng::seed_from_u64(SEED_1); let mut col = Vec::with_capacity(self.rows); @@ -77,10 +78,12 @@ impl FixtureBuilder { self } - /// Pushes a i64 column that values are filled according to the given values in order. + /// Pushes a i64 column that values are filled according to the given values + /// in order. /// - /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 values in the column are - /// `a`, the second 1/3 values are `b` and the last 1/3 values are `c`. + /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 + /// values in the column are `a`, the second 1/3 values are `b` and the + /// last 1/3 values are `c`. 
pub fn push_column_i64_ordered(mut self, samples: &[i64]) -> Self { let mut col = Vec::with_capacity(self.rows); for i in 0..self.rows { @@ -117,7 +120,8 @@ impl FixtureBuilder { self } - /// Pushes a f64 column that values are randomly sampled from the giving values. + /// Pushes a f64 column that values are randomly sampled from the giving + /// values. pub fn push_column_f64_sampled(mut self, samples: &[f64]) -> Self { let mut rng: XorShiftRng = SeedableRng::seed_from_u64(SEED_1); let mut col = Vec::with_capacity(self.rows); @@ -129,10 +133,12 @@ impl FixtureBuilder { self } - /// Pushes a f64 column that values are filled according to the given values in order. + /// Pushes a f64 column that values are filled according to the given values + /// in order. /// - /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 values in the column are - /// `a`, the second 1/3 values are `b` and the last 1/3 values are `c`. + /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 + /// values in the column are `a`, the second 1/3 values are `b` and the + /// last 1/3 values are `c`. pub fn push_column_f64_ordered(mut self, samples: &[f64]) -> Self { let mut col = Vec::with_capacity(self.rows); for i in 0..self.rows { @@ -157,7 +163,8 @@ impl FixtureBuilder { /// Pushes a decimal column that values are randomly generated. /// - /// Generated decimals have 1 to 30 integer digits and 1 to 20 fractional digits. + /// Generated decimals have 1 to 30 integer digits and 1 to 20 fractional + /// digits. pub fn push_column_decimal_random(mut self) -> Self { let mut rng: XorShiftRng = SeedableRng::seed_from_u64(SEED_2); let mut col = Vec::with_capacity(self.rows); @@ -180,7 +187,8 @@ impl FixtureBuilder { self } - /// Pushes a decimal column that values are randomly sampled from the giving values. + /// Pushes a decimal column that values are randomly sampled from the giving + /// values. 
pub fn push_column_decimal_sampled(mut self, samples: &[&str]) -> Self { let mut rng: XorShiftRng = SeedableRng::seed_from_u64(SEED_2); let mut col = Vec::with_capacity(self.rows); @@ -193,10 +201,12 @@ impl FixtureBuilder { self } - /// Pushes a decimal column that values are filled according to the given values in order. + /// Pushes a decimal column that values are filled according to the given + /// values in order. /// - /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 values in the column are - /// `a`, the second 1/3 values are `b` and the last 1/3 values are `c`. + /// For example, if 3 values `[a, b, c]` are given, then the first 1/3 + /// values in the column are `a`, the second 1/3 values are `b` and the + /// last 1/3 values are `c`. pub fn push_column_decimal_ordered(mut self, samples: &[&str]) -> Self { let mut col = Vec::with_capacity(self.rows); for i in 0..self.rows { @@ -209,8 +219,8 @@ impl FixtureBuilder { self } - /// Pushes a bytes column that values are randomly generated and each value has the same length - /// as specified. + /// Pushes a bytes column that values are randomly generated and each value + /// has the same length as specified. pub fn push_column_bytes_random_fixed_len(mut self, len: usize) -> Self { let mut rng: XorShiftRng = SeedableRng::seed_from_u64(SEED_3); let mut col = Vec::with_capacity(self.rows); @@ -327,8 +337,8 @@ impl BatchExecutor for BatchFixtureExecutor { } } -/// Benches the performance of the batch fixture executor itself. When using it as the source -/// executor in other benchmarks, we need to take out these costs. +/// Benches the performance of the batch fixture executor itself. When using it +/// as the source executor in other benchmarks, we need to take out these costs. 
fn bench_util_batch_fixture_executor_next_1024(b: &mut criterion::Bencher<'_, M>) where M: Measurement, diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index f0a64a7e5dd..5ef442a25cd 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -20,8 +20,8 @@ use tipb::Executor as PbExecutor; pub use self::fixture::FixtureBuilder; -/// Gets the value of `TIKV_BENCH_LEVEL`. The larger value it is, the more comprehensive benchmarks -/// will be. +/// Gets the value of `TIKV_BENCH_LEVEL`. The larger value it is, the more +/// comprehensive benchmarks will be. pub fn bench_level() -> usize { if let Ok(s) = std::env::var("TIKV_BENCH_LEVEL") { s.parse::().unwrap() diff --git a/tests/benches/coprocessor_executors/util/store.rs b/tests/benches/coprocessor_executors/util/store.rs index 057bb2133b4..134b0e1e8d2 100644 --- a/tests/benches/coprocessor_executors/util/store.rs +++ b/tests/benches/coprocessor_executors/util/store.rs @@ -10,7 +10,8 @@ use tikv::storage::{ /// `MemStore` is a store provider that operates directly over a BTreeMap. pub type MemStore = FixtureStore; -/// `RocksStore` is a store provider that operates over a disk-based RocksDB storage. +/// `RocksStore` is a store provider that operates over a disk-based RocksDB +/// storage. 
pub type RocksStore = SnapshotStore>; pub trait StoreDescriber { diff --git a/tests/benches/hierarchy/engine/mod.rs b/tests/benches/hierarchy/engine/mod.rs index f248882a74e..85e6ce77e33 100644 --- a/tests/benches/hierarchy/engine/mod.rs +++ b/tests/benches/hierarchy/engine/mod.rs @@ -48,7 +48,7 @@ fn bench_engine_snapshot>( }); } -//exclude snapshot +// exclude snapshot fn bench_engine_get>( bencher: &mut Bencher<'_>, config: &BenchConfig, diff --git a/tests/benches/misc/storage/incremental_get.rs b/tests/benches/misc/storage/incremental_get.rs index 5c7b8e837a9..eb65f55fd72 100644 --- a/tests/benches/misc/storage/incremental_get.rs +++ b/tests/benches/misc/storage/incremental_get.rs @@ -47,8 +47,8 @@ fn table_lookup_gen_data() -> (SnapshotStore>, Vec) { false, ); - // Keys are given in order, and are far away from each other to simulate a normal table lookup - // scenario. + // Keys are given in order, and are far away from each other to simulate a + // normal table lookup scenario. let mut get_keys = Vec::new(); for i in (0..30000).step_by(30) { get_keys.push(Key::from_raw(&table::encode_row_key(5, i))); diff --git a/tests/failpoints/cases/test_async_fetch.rs b/tests/failpoints/cases/test_async_fetch.rs index 638888e83e2..78517dca8e3 100644 --- a/tests/failpoints/cases/test_async_fetch.rs +++ b/tests/failpoints/cases/test_async_fetch.rs @@ -103,7 +103,7 @@ fn test_node_async_fetch() { &cluster.engines, &before_states, 1, - false, /*must_compacted*/ + false, // must_compacted ) { return; @@ -113,7 +113,7 @@ fn test_node_async_fetch() { &cluster.engines, &before_states, 1, - true, /*must_compacted*/ + true, // must_compacted ); } @@ -256,7 +256,8 @@ fn test_node_compact_entry_cache() { // change one peer to learner cluster.pd_client.add_peer(1, new_learner_peer(5, 5)); - // cause log lag and pause async fetch to check if entry cache is reserved for the learner + // cause log lag and pause async fetch to check if entry cache is reserved for + // the learner for i 
in 1..6 { let k = i.to_string().into_bytes(); let v = k.clone(); diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index 00b8cd286da..1068b35f8d5 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -362,8 +362,8 @@ fn test_reject_proposal_during_leader_transfer() { cluster.must_put(b"k", b"v"); cluster.transfer_leader(r, new_peer(2, 2)); - // The leader can't change to transferring state immediately due to pre-transfer-leader - // feature, so wait for a while. + // The leader can't change to transferring state immediately due to + // pre-transfer-leader feature, so wait for a while. sleep_ms(100); assert_ne!(cluster.leader_of_region(r).unwrap(), new_peer(2, 2)); @@ -441,7 +441,8 @@ fn test_not_invoke_committed_cb_when_fail_to_commit() { cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k", b"v"); - // Partiton the leader and followers to let the leader fails to commit the proposal. + // Partition the leader and followers to let the leader fails to commit the + // proposal. cluster.partition(vec![1], vec![2, 3]); let write_req = make_write_req(&mut cluster, b"k1"); let (cb, cb_receivers) = make_cb(&write_req); @@ -462,8 +463,8 @@ fn test_not_invoke_committed_cb_when_fail_to_commit() { * cluster.cfg.raft_store.raft_election_timeout_ticks as u32; std::thread::sleep(2 * election_timeout); - // Make sure a new leader is elected and will discard the previous proposal when partition is - // recovered. + // Make sure a new leader is elected and will discard the previous proposal when + // partition is recovered. 
cluster.must_put(b"k2", b"v"); cluster.clear_send_filters(); diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index ef85fde1886..70194b194ac 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -211,7 +211,8 @@ fn test_stale_peer_cache() { // 4. peer 1 sends a snapshot with latest configuration [1, 2, 3] to peer 3; // 5. peer 3 restores the snapshot into memory; // 6. then peer 3 calling `Raft::apply_conf_change` to add peer 4; -// 7. so the disk configuration `[1, 2, 3]` is different from memory configuration `[1, 2, 3, 4]`. +// 7. so the disk configuration `[1, 2, 3]` is different from memory +// configuration `[1, 2, 3, 4]`. #[test] fn test_redundant_conf_change_by_snapshot() { let mut cluster = new_node_cluster(0, 4); diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 60f45ae957a..818c7ba2739 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -35,7 +35,8 @@ fn test_deadline() { #[test] fn test_deadline_2() { - // It should not even take any snapshots when request is outdated from the beginning. + // It should not even take any snapshots when request is outdated from the + // beginning. let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); let req = DAGSelect::from(&product).build(); @@ -198,7 +199,8 @@ fn test_paging_scan() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - // set batch size and grow size to 1, so that only 1 row will be scanned in each batch. + // set batch size and grow size to 1, so that only 1 row will be scanned in each + // batch. 
fail::cfg("copr_batch_initial_size", "return(1)").unwrap(); fail::cfg("copr_batch_grow_size", "return(1)").unwrap(); for desc in [false, true] { @@ -263,7 +265,8 @@ fn test_paging_scan_multi_ranges() { ]; let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - // set batch size and grow size to 1, so that only 1 row will be scanned in each batch. + // set batch size and grow size to 1, so that only 1 row will be scanned in each + // batch. fail::cfg("copr_batch_initial_size", "return(1)").unwrap(); fail::cfg("copr_batch_grow_size", "return(1)").unwrap(); diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index 5fb4ac7b1ca..be027ae7217 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -303,7 +303,8 @@ fn test_majority_disk_full() { let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); - // Proposals with special `DiskFullOpt`s can be accepted even if all peers are disk full. + // Proposals with special `DiskFullOpt`s can be accepted even if all peers are + // disk full. fail::cfg(get_fp(DiskUsage::AlmostFull, 1), "return").unwrap(); let reqs = vec![new_put_cmd(b"k3", b"v3")]; let put = new_request(1, epoch.clone(), reqs, false); @@ -313,8 +314,9 @@ fn test_majority_disk_full() { let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); assert!(!resp.get_header().has_error()); - // Reset disk full status for peer 2 and 3. 2 follower reads must success because the leader - // will continue to append entries to followers after the new disk usages are reported. + // Reset disk full status for peer 2 and 3. 2 follower reads must success + // because the leader will continue to append entries to followers after the + // new disk usages are reported. 
for i in 1..3 { fail::remove(get_fp(DiskUsage::AlmostFull, i + 1)); ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); @@ -327,8 +329,8 @@ fn test_majority_disk_full() { ensure_disk_usage_is_reported(&mut cluster, i + 1, i + 1, ®ion); } - // Proposals with special `DiskFullOpt`s will still be rejected if majority peers are already - // disk full. + // Proposals with special `DiskFullOpt`s will still be rejected if majority + // peers are already disk full. let reqs = vec![new_put_cmd(b"k3", b"v3")]; let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); @@ -346,8 +348,8 @@ fn test_majority_disk_full() { cluster.pd_client.must_remove_peer(1, new_peer(2, 2)); // After the last configuration change is applied, the raft group will be like - // `[(1, DiskUsage::AlmostFull), (3, DiskUsage::AlreadyFull)]`. So no more proposals - // should be allowed. + // `[(1, DiskUsage::AlmostFull), (3, DiskUsage::AlreadyFull)]`. So no more + // proposals should be allowed. let reqs = vec![new_put_cmd(b"k4", b"v4")]; let put = new_request(1, epoch, reqs, false); let mut opts = RaftCmdExtraOpts::default(); @@ -383,7 +385,8 @@ fn test_disk_full_followers_with_hibernate_regions() { fail::remove(get_fp(DiskUsage::AlmostFull, 2)); thread::sleep(tick_dur * 2); - // The leader should know peer 2's disk usage changes, because it's keeping to tick. + // The leader should know peer 2's disk usage changes, because it's keeping to + // tick. cluster.must_put(b"k2", b"v2"); must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } diff --git a/tests/failpoints/cases/test_early_apply.rs b/tests/failpoints/cases/test_early_apply.rs index b6ddf136a89..acac65cd397 100644 --- a/tests/failpoints/cases/test_early_apply.rs +++ b/tests/failpoints/cases/test_early_apply.rs @@ -82,16 +82,16 @@ fn test_multi_early_apply() { } /// Test if the commit state check of apply msg is ok. 
-/// In the previous implementation, the commit state check uses the state of last -/// committed entry and it relies on the guarantee that the commit index and term -/// of the last committed entry must be monotonically increasing even between restarting. -/// However, this guarantee can be broken by +/// In the previous implementation, the commit state check uses the state of +/// last committed entry and it relies on the guarantee that the commit index +/// and term of the last committed entry must be monotonically increasing even +/// between restarting. However, this guarantee can be broken by /// 1. memory limitation of fetching committed entries /// 2. batching apply msg -/// Now the commit state uses the minimum of persist index and commit index from the peer -/// to fix this issue. -/// For simplicity, this test uses region merge to ensure that the apply state will be written -/// to kv db before crash. +/// Now the commit state uses the minimum of persist index and commit index from +/// the peer to fix this issue. +/// For simplicity, this test uses region merge to ensure that the apply state +/// will be written to kv db before crash. #[test] fn test_early_apply_yield_followed_with_many_entries() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_encryption.rs b/tests/failpoints/cases/test_encryption.rs index c99674aae1e..ccb4d698e3f 100644 --- a/tests/failpoints/cases/test_encryption.rs +++ b/tests/failpoints/cases/test_encryption.rs @@ -10,13 +10,14 @@ fn test_file_dict_file_record_corrupted() { tempdir.path(), "test_file_dict_file_record_corrupted_1", true, - 10, /*file_rewrite_threshold*/ + 10, // file_rewrite_threshold ) .unwrap(); let info1 = create_file_info(1, EncryptionMethod::Aes256Ctr); let info2 = create_file_info(2, EncryptionMethod::Unknown); // 9 represents that the first 9 bytes will be discarded. 
- // Crc32 (4 bytes) + File name length (2 bytes) + FileInfo length (2 bytes) + Log type (1 bytes) + // Crc32 (4 bytes) + File name length (2 bytes) + FileInfo length (2 bytes) + + // Log type (1 bytes) fail::cfg("file_dict_log_append_incomplete", "return(9)").unwrap(); file_dict_file.insert("info1", &info1).unwrap(); fail::remove("file_dict_log_append_incomplete"); @@ -28,7 +29,7 @@ fn test_file_dict_file_record_corrupted() { tempdir.path(), "test_file_dict_file_record_corrupted_2", true, - 10, /*file_rewrite_threshold*/ + 10, // file_rewrite_threshold ) .unwrap(); let info1 = create_file_info(1, EncryptionMethod::Aes256Ctr); diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 09308646421..c4e3e4dee71 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -28,12 +28,12 @@ use tikv::{ use tikv_util::HandyRwLock; use txn_types::{Key, TimeStamp}; -// In theory, raft can propose conf change as long as there is no pending one. Replicas -// don't apply logs synchronously, so it's possible the old leader is removed before the new -// leader applies all logs. -// In the current implementation, the new leader rejects conf change until it applies all logs. -// It guarantees the correctness of green GC. This test is to prevent breaking it in the -// future. +// In theory, raft can propose conf change as long as there is no pending one. +// Replicas don't apply logs synchronously, so it's possible the old leader is +// removed before the new leader applies all logs. +// In the current implementation, the new leader rejects conf change until it +// applies all logs. It guarantees the correctness of green GC. This test is to +// prevent breaking it in the future. 
#[test] fn test_collect_lock_from_stale_leader() { let mut cluster = new_server_cluster(0, 2); @@ -62,7 +62,8 @@ fn test_collect_lock_from_stale_leader() { ctx.set_peer(leader.clone()); ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - // Pause the new peer applying so that when it becomes the leader, it doesn't apply all logs. + // Pause the new peer applying so that when it becomes the leader, it doesn't + // apply all logs. let new_leader_apply_fp = "on_handle_apply_1003"; fail::cfg(new_leader_apply_fp, "pause").unwrap(); must_kv_prewrite( @@ -73,7 +74,8 @@ fn test_collect_lock_from_stale_leader() { 10, ); - // Leader election only considers the progress of appending logs, so it can succeed. + // Leader election only considers the progress of appending logs, so it can + // succeed. cluster.must_transfer_leader(region_id, new_peer.clone()); // It shouldn't succeed in the current implementation. cluster.pd_client.remove_peer(region_id, leader.clone()); @@ -157,7 +159,8 @@ fn test_notify_observer_after_apply() { 10, ); }); - // We can use physical_scan_lock to get the lock because we notify the lock observer after writing data to the rocskdb. + // We can use physical_scan_lock to get the lock because we notify the lock + // observer after writing data to the rocskdb. let mut locks = vec![]; retry_until(|| { assert!(must_check_lock_observer(&client, max_ts, true).is_empty()); @@ -189,7 +192,8 @@ fn test_notify_observer_after_apply() { cluster .pd_client .must_add_peer(ctx.get_region_id(), new_peer(store_id, store_id)); - // We can use physical_scan_lock to get the lock because we notify the lock observer after writing data to the rocksdb. + // We can use physical_scan_lock to get the lock because we notify the lock + // observer after writing data to the rocksdb. 
let mut locks = vec![]; retry_until(|| { assert!(must_check_lock_observer(&replica_client, max_ts, true).is_empty()); @@ -213,13 +217,19 @@ fn test_notify_observer_after_apply() { ); } -// It may cause locks missing during green GC if the raftstore notifies the lock observer before writing data to the rocksdb: -// 1. Store-1 transfers a region to store-2 and store-2 is applying logs. -// 2. GC worker registers lock observer on store-2 after calling lock observer's callback and before finishing applying which means the lock won't be observed. -// 3. GC worker scans locks on each store independently. It's possible GC worker has scanned all locks on store-2 and hasn't scanned locks on store-1. -// 4. Store-2 applies all logs and removes the peer on store-1. -// 5. GC worker can't scan the lock on store-1 because the peer has been destroyed. -// 6. GC worker can't get the lock from store-2 because it can't observe the lock and has scanned it. +// It may cause locks missing during green GC if the raftstore notifies the lock +// observer before writing data to the rocksdb: +// - Store-1 transfers a region to store-2 and store-2 is applying logs. +// - GC worker registers lock observer on store-2 after calling lock observer's +// callback and before finishing applying which means the lock won't be +// observed. +// - GC worker scans locks on each store independently. It's possible GC worker +// has scanned all locks on store-2 and hasn't scanned locks on store-1. +// - Store-2 applies all logs and removes the peer on store-1. +// - GC worker can't scan the lock on store-1 because the peer has been +// destroyed. +// - GC worker can't get the lock from store-2 because it can't observe the lock +// and has scanned it. 
#[test] fn test_collect_applying_locks() { let mut cluster = new_server_cluster(0, 2); @@ -248,7 +258,8 @@ fn test_collect_applying_locks() { ctx.set_peer(leader.clone()); ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - // Pause store-2 after calling observer callbacks and before writing to the rocksdb. + // Pause store-2 after calling observer callbacks and before writing to the + // rocksdb. let new_leader_apply_fp = "post_handle_apply_1003"; fail::cfg(new_leader_apply_fp, "pause").unwrap(); @@ -300,7 +311,8 @@ fn test_collect_applying_locks() { assert_eq!(locks[0].get_key(), b"k1"); } -// Test write CF's compaction filter can call `orphan_versions_handler` correctly. +// Test write CF's compaction filter can call `orphan_versions_handler` +// correctly. #[test] fn test_error_in_compaction_filter() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -333,8 +345,8 @@ fn test_error_in_compaction_filter() { fail::remove(fp); } -// Test GC worker can receive and handle orphan versions emit from write CF's compaction filter -// correctly. +// Test GC worker can receive and handle orphan versions emit from write CF's +// compaction filter correctly. #[test] fn test_orphan_versions_from_compaction_filter() { let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { @@ -390,8 +402,9 @@ fn test_orphan_versions_from_compaction_filter() { fail::remove(fp); } -// Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine compaction filter and GC -// worker so that GC worker can help to process orphan versions on default CF. +// Call `start_auto_gc` like `cmd/src/server.rs` does. It will combine +// compaction filter and GC worker so that GC worker can help to process orphan +// versions on default CF. 
fn init_compaction_filter(cluster: &Cluster, store_id: u64) { #[derive(Clone)] struct MockSafePointProvider; diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 94721d0cef5..8ef0f08f19e 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -19,7 +19,8 @@ fn test_break_leadership_on_restart() { cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); cluster.cfg.raft_store.raft_heartbeat_ticks = 2; cluster.cfg.raft_store.raft_election_timeout_ticks = 10; - // So the random election timeout will always be 10, which makes the case more stable. + // So the random election timeout will always be 10, which makes the case more + // stable. cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; configure_for_hibernate(&mut cluster); @@ -38,8 +39,8 @@ fn test_break_leadership_on_restart() { // Peer 3 will: // 1. steps a heartbeat message from its leader and then ticks 1 time. - // 2. ticks a peer_stale_state_check, which will change state from Idle to PreChaos. - // 3. continues to tick until it hibernates totally. + // 2. ticks a peer_stale_state_check, which will change state from Idle to + // PreChaos. 3. continues to tick until it hibernates totally. let (tx, rx) = mpsc::sync_channel(128); fail::cfg_callback("on_raft_base_tick_idle", move || tx.send(0).unwrap()).unwrap(); let mut raft_msg = RaftMessage::default(); @@ -65,8 +66,8 @@ fn test_break_leadership_on_restart() { // Until here, peer 3 will be like `election_elapsed=3 && missing_ticks=6`. thread::sleep(Duration::from_millis(base_tick_ms * 10)); - // Restart the peer 2 and it will broadcast `MsgRequestPreVote` later, which will wake up - // peer 1 and 3. + // Restart the peer 2 and it will broadcast `MsgRequestPreVote` later, which + // will wake up peer 1 and 3. 
let (tx, rx) = mpsc::sync_channel(128); let filter = RegionPacketFilter::new(1, 3) .direction(Direction::Send) @@ -76,6 +77,7 @@ fn test_break_leadership_on_restart() { cluster.add_send_filter(CloneFilterFactory(filter)); cluster.run_node(2).unwrap(); - // Peer 3 shouldn't start a new election, otherwise the leader may step down incorrectly. + // Peer 3 shouldn't start a new election, otherwise the leader may step down + // incorrectly. assert!(rx.recv_timeout(Duration::from_secs(2)).is_err()); } diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs index ec83d8eae75..3fdb464c718 100644 --- a/tests/failpoints/cases/test_import_service.rs +++ b/tests/failpoints/cases/test_import_service.rs @@ -128,8 +128,8 @@ fn test_ingest_reentrant() { let checksum2 = calc_crc32(save_path).unwrap(); // TODO: Remove this once write_global_seqno is deprecated. - // Checksums are the same since the global seqno in the SST file no longer gets updated with the - // default setting, which is write_global_seqno=false. + // Checksums are the same since the global seqno in the SST file no longer gets + // updated with the default setting, which is write_global_seqno=false. assert_eq!(checksum1, checksum2); // Do ingest again and it can be reentrant let resp = import.ingest(&ingest).unwrap(); @@ -155,12 +155,13 @@ fn test_ingest_key_manager_delete_file_failed() { let deregister_fp = "key_manager_fails_before_delete_file"; // the first delete is in check before ingest, the second is in ingest cleanup - // set the ingest clean up failed to trigger remove file but not remove key condition + // set the ingest clean up failed to trigger remove file but not remove key + // condition fail::cfg(deregister_fp, "1*off->1*return->off").unwrap(); - // Do an ingest and verify the result is correct. 
Though the ingest succeeded, the clone file is - // still in the key manager - //TODO: how to check the key manager contains the clone key + // Do an ingest and verify the result is correct. Though the ingest succeeded, + // the clone file is still in the key manager + // TODO: how to check the key manager contains the clone key let mut ingest = IngestRequest::default(); ingest.set_context(ctx.clone()); ingest.set_sst(meta.clone()); @@ -178,7 +179,8 @@ fn test_ingest_key_manager_delete_file_failed() { .get(&node_id) .unwrap() .get_path(&meta); - // wait up to 5 seconds to make sure raw uploaded file is deleted by the async clean up task. + // wait up to 5 seconds to make sure raw uploaded file is deleted by the async + // clean up task. for _ in 0..50 { if !save_path.as_path().exists() { break; @@ -187,7 +189,8 @@ fn test_ingest_key_manager_delete_file_failed() { } assert!(!save_path.as_path().exists()); - // Do upload and ingest again, though key manager contains this file, the ingest action should success. + // Do upload and ingest again, though key manager contains this file, the ingest + // action should success. upload_sst(&import, &meta, &data).unwrap(); let mut ingest = IngestRequest::default(); ingest.set_context(ctx); diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index bde6e8bb123..1f7e35b5691 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -17,7 +17,8 @@ fn test_batch_get_memory_lock() { fail::cfg("raftkv_async_snapshot_err", "return").unwrap(); let resp = client.kv_batch_get(&req).unwrap(); - // the injected error should be returned at both places for backward compatibility. + // the injected error should be returned at both places for backward + // compatibility. 
assert!(!resp.pairs[0].get_error().get_abort().is_empty()); assert!(!resp.get_error().get_abort().is_empty()); fail::remove("raftkv_async_snapshot_err"); @@ -34,7 +35,8 @@ fn test_kv_scan_memory_lock() { fail::cfg("raftkv_async_snapshot_err", "return").unwrap(); let resp = client.kv_scan(&req).unwrap(); - // the injected error should be returned at both places for backward compatibility. + // the injected error should be returned at both places for backward + // compatibility. assert!(!resp.pairs[0].get_error().get_abort().is_empty()); assert!(!resp.get_error().get_abort().is_empty()); fail::remove("raftkv_async_snapshot_err"); @@ -64,8 +66,8 @@ fn test_scan_lock_push_async_commit() { let k1 = b"k1"; let v1 = b"v1"; - // The following code simulates another case: prewrite is locking the memlock, and then - // another scan lock operation request meets the memlock. + // The following code simulates another case: prewrite is locking the memlock, + // and then another scan lock operation request meets the memlock. fail::cfg("before-set-lock-in-memory", "pause").unwrap(); let client1 = client.clone(); diff --git a/tests/failpoints/cases/test_memory_usage_limit.rs b/tests/failpoints/cases/test_memory_usage_limit.rs index 08c37fb330e..82aa9d5148d 100644 --- a/tests/failpoints/cases/test_memory_usage_limit.rs +++ b/tests/failpoints/cases/test_memory_usage_limit.rs @@ -13,7 +13,8 @@ use raftstore::store::MEMTRACE_ENTRY_CACHE; use test_raftstore::*; use tikv_util::config::ReadableDuration; -// Test even if memory usage reaches high water, committed entries can still get applied slowly. +// Test even if memory usage reaches high water, committed entries can still get +// applied slowly. 
#[test] fn test_memory_usage_reaches_high_water() { let mut cluster = new_node_cluster(0, 1); diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 5cb7c79011f..713ab4c5a5d 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -95,7 +95,8 @@ fn test_node_merge_rollback() { // Wait till rollback. cluster.must_put(b"k12", b"v12"); - // After premerge and rollback, conf_ver becomes 3 + 1 = 4, version becomes 4 + 2 = 6; + // After premerge and rollback, conf_ver becomes 3 + 1 = 4, version becomes 4 + + // 2 = 6; region.mut_region_epoch().set_conf_ver(4); region.mut_region_epoch().set_version(6); for i in 1..3 { @@ -195,7 +196,8 @@ fn test_node_merge_restart() { must_get_none(&cluster.get_engine(3), b"k3"); } -/// Test if merge is still working when restart a cluster during catching up logs for merge. +/// Test if merge is still working when restart a cluster during catching up +/// logs for merge. #[test] fn test_node_merge_catch_up_logs_restart() { let mut cluster = new_node_cluster(0, 3); @@ -340,8 +342,9 @@ fn test_node_merge_catch_up_logs_no_need() { // let source region not merged fail::cfg("before_handle_catch_up_logs_for_merge", "pause").unwrap(); fail::cfg("after_handle_catch_up_logs_for_merge", "pause").unwrap(); - // due to `before_handle_catch_up_logs_for_merge` failpoint, we already pass `apply_index < catch_up_logs.merge.get_commit()` - // so now can let apply index make progress. + // due to `before_handle_catch_up_logs_for_merge` failpoint, we already pass + // `apply_index < catch_up_logs.merge.get_commit()` so now can let apply + // index make progress. 
fail::remove("apply_after_prepare_merge"); // make sure all the logs are committed, including the compact command @@ -405,15 +408,15 @@ fn test_node_merge_recover_snapshot() { cluster.must_put(b"k40", b"v5"); } -// Test if a merge handled properly when there are two different snapshots of one region arrive -// in one raftstore tick. +// Test if a merge handled properly when there are two different snapshots of +// one region arrive in one raftstore tick. #[test] fn test_node_merge_multiple_snapshots_together() { test_node_merge_multiple_snapshots(true) } -// Test if a merge handled properly when there are two different snapshots of one region arrive -// in different raftstore tick. +// Test if a merge handled properly when there are two different snapshots of +// one region arrive in different raftstore tick. #[test] fn test_node_merge_multiple_snapshots_not_together() { test_node_merge_multiple_snapshots(false) @@ -471,7 +474,8 @@ fn test_node_merge_multiple_snapshots(together: bool) { .msg_type(MessageType::MsgAppend), )); - // Add a collect snapshot filter, it will delay snapshots until have collected multiple snapshots from different peers + // Add a collect snapshot filter, it will delay snapshots until have collected + // multiple snapshots from different peers cluster.sim.wl().add_recv_filter( 3, Box::new(LeadingDuplicatedSnapshotFilter::new( @@ -488,17 +492,20 @@ fn test_node_merge_multiple_snapshots(together: bool) { // Wait for snapshot to generate and send thread::sleep(Duration::from_millis(100)); - // Merge left and right region, due to isolation, the regions on store 3 are not merged yet. + // Merge left and right region, due to isolation, the regions on store 3 are not + // merged yet. pd_client.must_merge(left.get_id(), right.get_id()); thread::sleep(Duration::from_millis(200)); - // Let peer of right region on store 3 to make append response to trigger a new snapshot - // one is snapshot before merge, the other is snapshot after merge. 
- // Here blocks raftstore for a while to make it not to apply snapshot and receive new log now. + // Let peer of right region on store 3 to make append response to trigger a new + // snapshot one is snapshot before merge, the other is snapshot after merge. + // Here blocks raftstore for a while to make it not to apply snapshot and + // receive new log now. fail::cfg("on_raft_ready", "sleep(100)").unwrap(); cluster.clear_send_filters(); thread::sleep(Duration::from_millis(200)); - // Filter message again to make sure peer on store 3 can not catch up CommitMerge log + // Filter message again to make sure peer on store 3 can not catch up + // CommitMerge log cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(left.get_id(), 3) .direction(Direction::Recv) @@ -605,7 +612,8 @@ fn test_node_merge_restart_after_apply_premerge_before_apply_compact_log() { must_get_equal(&cluster.get_engine(3), b"k123", b"v2"); } -/// Tests whether stale merge is rollback properly if it merges to the same target region again later. +/// Tests whether stale merge is rollback properly if it merges to the same +/// target region again later. #[test] fn test_node_failed_merge_before_succeed_merge() { let mut cluster = new_node_cluster(0, 3); @@ -668,9 +676,10 @@ fn test_node_failed_merge_before_succeed_merge() { // Wait right region to send CatchUpLogs to left region. sleep_ms(100); // After executing CatchUpLogs in source peer fsm, the committed log will send - // to apply fsm in the end of this batch. So even the first `on_ready_prepare_merge` - // is executed after CatchUplogs, the latter committed logs is still sent to apply fsm - // if CatchUpLogs and `on_ready_prepare_merge` is in different batch. + // to apply fsm in the end of this batch. So even the first + // `on_ready_prepare_merge` is executed after CatchUplogs, the latter + // committed logs is still sent to apply fsm if CatchUpLogs and + // `on_ready_prepare_merge` is in different batch. 
// // In this case, the data is complete because the wrong up-to-date msg from the // first `on_ready_prepare_merge` is sent after all committed log. @@ -688,10 +697,12 @@ fn test_node_failed_merge_before_succeed_merge() { } } -/// Tests whether the source peer is destroyed correctly when transferring leader during committing merge. +/// Tests whether the source peer is destroyed correctly when transferring +/// leader during committing merge. /// -/// In the previous merge flow, target peer deletes meta of source peer without marking it as pending remove. -/// If source peer becomes leader at the same time, it will panic due to corrupted meta. +/// In the previous merge flow, target peer deletes meta of source peer without +/// marking it as pending remove. If source peer becomes leader at the same +/// time, it will panic due to corrupted meta. #[test] fn test_node_merge_transfer_leader() { let mut cluster = new_node_cluster(0, 3); @@ -703,8 +714,8 @@ fn test_node_merge_transfer_leader() { cluster.run(); - // To ensure the region has applied to its current term so that later `split` can success - // without any retries. Then, `left_peer_3` will must be `1003`. + // To ensure the region has applied to its current term so that later `split` + // can success without any retries. Then, `left_peer_3` will must be `1003`. 
let region = pd_client.get_region(b"k1").unwrap(); let peer_1 = find_peer(®ion, 1).unwrap().to_owned(); cluster.must_transfer_leader(region.get_id(), peer_1); @@ -791,7 +802,8 @@ fn test_node_merge_cascade_merge_with_apply_yield() { } } -// Test if the rollback merge proposal is proposed before the majority of peers want to rollback +// Test if the rollback merge proposal is proposed before the majority of peers +// want to rollback #[test] fn test_node_multiple_rollback_merge() { let mut cluster = new_node_cluster(0, 3); @@ -832,8 +844,8 @@ fn test_node_multiple_rollback_merge() { // Only the source leader is running `on_check_merge` fail::cfg(on_check_merge_not_1001_fp, "return()").unwrap(); fail::remove(on_schedule_merge_fp); - // In previous implementation, rollback merge proposal can be proposed by leader itself - // So wait for the leader propose rollback merge if possible + // In previous implementation, rollback merge proposal can be proposed by leader + // itself So wait for the leader propose rollback merge if possible sleep_ms(100); // Check if the source region is still in merging mode. let mut l_r = pd_client.get_region(b"k1").unwrap(); @@ -869,14 +881,14 @@ fn test_node_multiple_rollback_merge() { // In the previous implementation, the source peer will propose rollback merge // after the local target peer's epoch is larger than recorded previously. -// But it's wrong. This test constructs a case that writing data to the source region -// after merging. This operation can succeed in the previous implementation which -// causes data loss. -// In the current implementation, the rollback merge proposal can be proposed only when -// the number of peers who want to rollback merge is greater than the majority of all -// peers. If so, this merge is impossible to succeed. -// PS: A peer who wants to rollback merge means its local target peer's epoch is larger -// than recorded. +// But it's wrong. 
This test constructs a case that writing data to the source +// region after merging. This operation can succeed in the previous +// implementation which causes data loss. +// In the current implementation, the rollback merge proposal can be proposed +// only when the number of peers who want to rollback merge is greater than the +// majority of all peers. If so, this merge is impossible to succeed. +// PS: A peer who wants to rollback merge means its local target peer's epoch is +// larger than recorded. #[test] fn test_node_merge_write_data_to_source_region_after_merging() { let mut cluster = new_node_cluster(0, 3); @@ -971,13 +983,14 @@ fn test_node_merge_write_data_to_source_region_after_merging() { fail::remove(on_handle_apply_2_fp); } -/// In previous implementation, destroying its source peer(s) and applying snapshot is not **atomic**. -/// It may break the rule of our merging process. +/// In previous implementation, destroying its source peer(s) and applying +/// snapshot is not **atomic**. It may break the rule of our merging process. /// -/// A tikv crash after its source peers have destroyed but this target peer does not become to -/// `Applying` state which means it will not apply snapshot after this tikv restarts. -/// After this tikv restarts, a new leader may send logs to this target peer, then the panic may happen -/// because it can not find its source peers when applying `CommitMerge` log. +/// A tikv crash after its source peers have destroyed but this target peer does +/// not become to `Applying` state which means it will not apply snapshot after +/// this tikv restarts. After this tikv restarts, a new leader may send logs to +/// this target peer, then the panic may happen because it can not find its +/// source peers when applying `CommitMerge` log. /// /// This test is to reproduce above situation. 
#[test] @@ -1020,13 +1033,14 @@ fn test_node_merge_crash_before_snapshot_then_catch_up_logs() { pd_client.must_merge(left.get_id(), right.get_id()); region = pd_client.get_region(b"k1").unwrap(); - // Write some logs and the logs' number is greater than `raft_log_gc_count_limit` - // for latter log compaction + // Write some logs and the logs' number is greater than + // `raft_log_gc_count_limit` for latter log compaction for i in 2..15 { cluster.must_put(format!("k{}", i).as_bytes(), b"v"); } - // Aim at making peer 2 only know the compact log but do not know it is committed + // Aim at making peer 2 only know the compact log but do not know it is + // committed let condition = Arc::new(AtomicBool::new(false)); let recv_filter = Box::new( RegionPacketFilter::new(region.get_id(), 2) @@ -1052,15 +1066,16 @@ fn test_node_merge_crash_before_snapshot_then_catch_up_logs() { let peer_on_store3 = find_peer(®ion, 3).unwrap().to_owned(); assert_eq!(peer_on_store3.get_id(), 3); // Make peer 3 do not handle snapshot ready - // In previous implementation, destroying its source peer and applying snapshot is not atomic. - // So making its source peer be destroyed and do not apply snapshot to reproduce the problem + // In previous implementation, destroying its source peer and applying snapshot + // is not atomic. So making its source peer be destroyed and do not apply + // snapshot to reproduce the problem let before_handle_snapshot_ready_3_fp = "before_handle_snapshot_ready_3"; fail::cfg(before_handle_snapshot_ready_3_fp, "return()").unwrap(); cluster.clear_send_filters(); // Peer 1 will send snapshot to peer 3 - // Source peer sends msg to others to get target region info until the election timeout. - // The max election timeout is 2 * 10 * 10 = 200ms + // Source peer sends msg to others to get target region info until the election + // timeout. 
The max election timeout is 2 * 10 * 10 = 200ms let election_timeout = 2 * cluster.cfg.raft_store.raft_base_tick_interval.as_millis() * cluster.cfg.raft_store.raft_election_timeout_ticks as u64; @@ -1245,8 +1260,8 @@ fn test_prewrite_before_max_ts_is_synced() { assert!(!resp.get_region_error().has_max_timestamp_not_synced()); } -/// Testing that the source peer's read delegate should not be removed by the target peer -/// and only removed when the peer is destroyed +/// Testing that the source peer's read delegate should not be removed by the +/// target peer and only removed when the peer is destroyed #[test] fn test_source_peer_read_delegate_after_apply() { let mut cluster = new_node_cluster(0, 3); @@ -1266,10 +1281,12 @@ fn test_source_peer_read_delegate_after_apply() { let on_destroy_peer_fp = "destroy_peer"; fail::cfg(on_destroy_peer_fp, "pause").unwrap(); - // Merge finish means the leader of the target region have call `on_ready_commit_merge` + // Merge finish means the leader of the target region have call + // `on_ready_commit_merge` pd_client.must_merge(source.get_id(), target.get_id()); - // The source peer's `ReadDelegate` should not be removed yet and mark as `pending_remove` + // The source peer's `ReadDelegate` should not be removed yet and mark as + // `pending_remove` assert!( cluster.store_metas[&1] .lock() @@ -1312,8 +1329,8 @@ fn test_merge_with_concurrent_pessimistic_locking() { let left = cluster.get_region(b"k1"); let right = cluster.get_region(b"k3"); - // Transfer the leader of the right region to store 2. The leaders of source and target - // regions don't need to be on the same store. + // Transfer the leader of the right region to store 2. The leaders of source and + // target regions don't need to be on the same store. 
cluster.must_transfer_leader(right.id, new_peer(2, 2)); let snapshot = cluster.must_get_snapshot_of_region(left.id); @@ -1342,7 +1359,8 @@ fn test_merge_with_concurrent_pessimistic_locking() { fail::cfg("before_propose_locks_on_region_merge", "pause").unwrap(); - // 1. Locking before proposing pessimistic locks in the source region can succeed. + // 1. Locking before proposing pessimistic locks in the source region can + // succeed. let client2 = client.clone(); let mut mutation = Mutation::default(); mutation.set_op(Op::PessimisticLock); @@ -1453,7 +1471,8 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { thread::sleep(Duration::from_millis(500)); assert!(txn_ext.pessimistic_locks.read().is_writable()); - // But a later prewrite request should fail because we have already banned all later proposals. + // But a later prewrite request should fail because we have already banned all + // later proposals. req.mut_mutations()[0].set_key(b"k1".to_vec()); let resp2 = thread::spawn(move || client.kv_prewrite(&req).unwrap()); @@ -1515,14 +1534,15 @@ fn test_retry_pending_prepare_merge_fail() { propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); assert!(rx.recv_timeout(Duration::from_millis(200)).is_err()); - // Then, start merging. PrepareMerge should become pending because applied_index is smaller - // than proposed_index. + // Then, start merging. PrepareMerge should become pending because applied_index + // is smaller than proposed_index. cluster.merge_region(left.id, right.id, Callback::None); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); thread::sleep(Duration::from_millis(200)); assert!(txn_ext.pessimistic_locks.read().is_writable()); - // Set disk full error to let PrepareMerge fail. (Set both peer to full to avoid transferring leader) + // Set disk full error to let PrepareMerge fail. 
(Set both peer to full to avoid + // transferring leader) fail::cfg("disk_already_full_peer_1", "return").unwrap(); fail::cfg("disk_already_full_peer_2", "return").unwrap(); fail::remove("on_handle_apply"); @@ -1590,7 +1610,8 @@ fn test_merge_pessimistic_locks_propose_fail() { LocksStatus::MergingRegion ); - // With the fail point set, we will fail to propose the locks or the PrepareMerge request. + // With the fail point set, we will fail to propose the locks or the + // PrepareMerge request. fail::cfg("raft_propose", "return()").unwrap(); // But after that, the pessimistic locks status should remain unchanged. @@ -1606,8 +1627,9 @@ fn test_merge_pessimistic_locks_propose_fail() { ); } -// Testing that when the source peer is destroyed while merging, it should not persist the `merge_state` -// thus won't generate gc message to destroy other peers +// Testing that when the source peer is destroyed while merging, it should not +// persist the `merge_state` thus won't generate gc message to destroy other +// peers #[test] fn test_destroy_source_peer_while_merging() { let mut cluster = new_node_cluster(0, 5); @@ -1671,9 +1693,10 @@ fn test_destroy_source_peer_while_merging() { pd_client.must_add_peer(right.get_id(), new_peer(4, 7)); must_get_equal(&cluster.get_engine(4), b"k4", b"v4"); - // if store 5 have persist the merge state, peer 2 and peer 3 will be destroyed because - // store 5 will response their request vote message with a gc message, and peer 7 will cause - // store 5 panic because peer 7 have larger peer id than the peer in the merge state + // if store 5 have persist the merge state, peer 2 and peer 3 will be destroyed + // because store 5 will response their request vote message with a gc + // message, and peer 7 will cause store 5 panic because peer 7 have larger + // peer id than the peer in the merge state cluster.clear_send_filters(); cluster.add_send_filter(IsolationFilterFactory::new(1)); diff --git a/tests/failpoints/cases/test_pending_peers.rs 
b/tests/failpoints/cases/test_pending_peers.rs index 08f028d8fcb..5618bc9ab8e 100644 --- a/tests/failpoints/cases/test_pending_peers.rs +++ b/tests/failpoints/cases/test_pending_peers.rs @@ -36,8 +36,8 @@ fn test_pending_peers() { assert!(pending_peers.is_empty()); } -// Tests if raftstore and apply worker write truncated_state concurrently could lead to -// dirty write. +// Tests if raftstore and apply worker write truncated_state concurrently could +// lead to dirty write. #[test] fn test_pending_snapshot() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index 30d0c1d995f..6db06dee35f 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -19,7 +19,8 @@ struct TestSuite { impl TestSuite { pub fn new(count: usize, api_version: ApiVersion) -> Self { let mut cluster = new_server_cluster_with_api_ver(1, count, api_version); - // Disable background renew by setting `renew_interval` to 0, to make timestamp allocation predictable. + // Disable background renew by setting `renew_interval` to 0, to make timestamp + // allocation predictable. configure_for_causal_ts(&mut cluster, "0s", 100); configure_for_merge(&mut cluster); cluster.run(); @@ -200,7 +201,8 @@ fn test_region_merge() { // Disable CausalObserver::flush_timestamp to produce causality issue. fail::cfg(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP, "return").unwrap(); - // Transfer leaders: region 1 -> store 1, region 3 -> store 2, region 5 -> store 3. + // Transfer leaders: region 1 -> store 1, region 3 -> store 2, region 5 -> store + // 3. 
suite.must_transfer_leader(®ion1, 1); suite.must_transfer_leader(®ion3, 2); suite.must_transfer_leader(®ion5, 3); diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index e288828dc66..7a6da017d99 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ b/tests/failpoints/cases/test_replica_read.rs @@ -41,8 +41,8 @@ fn test_wait_for_apply_index() { cluster.must_put(b"k1", b"v1"); must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - // Peer 3 does not apply the cmd of putting 'k1' right now, then the follower read must - // be blocked. + // Peer 3 does not apply the cmd of putting 'k1' right now, then the follower + // read must be blocked. must_get_none(&cluster.get_engine(3), b"k1"); let mut request = new_request( region.get_id(), @@ -354,12 +354,14 @@ fn test_read_after_cleanup_range_for_snap() { rx1.recv_timeout(Duration::from_secs(5)).unwrap(); } -/// Tests the learner of new split region will know its leader without waiting for the leader heartbeat timeout. +/// Tests the learner of new split region will know its leader without waiting +/// for the leader heartbeat timeout. /// /// Before https://github.com/tikv/tikv/pull/8820, -/// the learner of a new split region may not know its leader if it applies log slowly and drops the no-op -/// entry from the new leader, and it had to wait for a heartbeat timeout to know its leader before that it -/// can't handle any read request. +/// the learner of a new split region may not know its leader if it applies log +/// slowly and drops the no-op entry from the new leader, and it had to wait for +/// a heartbeat timeout to know its leader before that it can't handle any read +/// request. 
#[test] fn test_new_split_learner_can_not_find_leader() { let mut cluster = new_node_cluster(0, 4); @@ -383,9 +385,10 @@ fn test_new_split_learner_can_not_find_leader() { let region = cluster.get_region(b"k3"); cluster.must_split(®ion, b"k3"); - // This `put` will not inform learner leadership because the The learner is paused at apply split command, - // so the learner peer of the new split region is not create yet. Also, the leader will not send another - // append request before the previous one response as all peer is initiated with the `Probe` mod + // This `put` will not inform learner leadership because the The learner is + // paused at apply split command, so the learner peer of the new split region is + // not create yet. Also, the leader will not send another append request before + // the previous one response as all peer is initiated with the `Probe` mod cluster.must_put(b"k2", b"v2"); assert_eq!(cluster.get(b"k2"), Some(b"v2".to_vec())); @@ -402,8 +405,8 @@ fn test_new_split_learner_can_not_find_leader() { assert_eq!(exp_value, b"v2"); } -/// Test if the read index request can get a correct response when the commit index of leader -/// if not up-to-date after transferring leader. +/// Test if the read index request can get a correct response when the commit +/// index of leader if not up-to-date after transferring leader. 
#[test] fn test_replica_read_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); @@ -454,7 +457,8 @@ fn test_replica_read_after_transfer_leader() { // Wait peer 1 and 3 to send heartbeat response to peer 2 sleep_ms(100); - // Pause before collecting message to make the these message be handled in one loop + // Pause before collecting message to make the these message be handled in one + // loop let on_peer_collect_message_2 = "on_peer_collect_message_2"; fail::cfg(on_peer_collect_message_2, "pause").unwrap(); @@ -477,8 +481,8 @@ fn test_replica_read_after_transfer_leader() { assert_eq!(exp_value, b"v2"); } -// This test is for reproducing the bug that some replica reads was sent to a leader and shared a same -// read index because of the optimization on leader. +// This test is for reproducing the bug that some replica reads was sent to a +// leader and shared a same read index because of the optimization on leader. #[test] fn test_read_index_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); @@ -511,7 +515,8 @@ fn test_read_index_after_transfer_leader() { async_read_index_on_peer(&mut cluster, new_peer(2, 2), region.clone(), b"k1", true); responses.push(resp); } - // Try to split the region to change the peer into `splitting` state then can not handle read requests. + // Try to split the region to change the peer into `splitting` state then can + // not handle read requests. cluster.split_region(®ion, b"k2", raftstore::store::Callback::None); // Wait the split command be sent. sleep_ms(100); @@ -525,12 +530,15 @@ fn test_read_index_after_transfer_leader() { let msg_type = msg.get_message().get_msg_type(); matches!(msg_type, MessageType::MsgAppendResponse) }); - // Transfer leader to peer 1, peer 2 should not change role since we added a recv filter. + // Transfer leader to peer 1, peer 2 should not change role since we added a + // recv filter. 
cluster.transfer_leader(region_id, new_peer(1, 1)); - // Pause before collecting peer messages to make sure all messages can be handled in one batch. + // Pause before collecting peer messages to make sure all messages can be + // handled in one batch. let on_peer_collect_message_2 = "on_peer_collect_message_2"; fail::cfg(on_peer_collect_message_2, "pause").unwrap(); - // Pause apply worker to stop the split command so peer 2 would keep in `splitting` state. + // Pause apply worker to stop the split command so peer 2 would keep in + // `splitting` state. let on_handle_apply_2 = "on_handle_apply_2"; fail::cfg(on_handle_apply_2, "pause").unwrap(); // Send heartbeat and append responses to advance read index. @@ -544,8 +552,8 @@ fn test_read_index_after_transfer_leader() { fail::remove(on_peer_collect_message_2); // Wait for read index has been advanced. sleep_ms(100); - // Filter and send vote message, peer 2 would step down to follower and try to handle read requests - // as a follower. + // Filter and send vote message, peer 2 would step down to follower and try to + // handle read requests as a follower. let msgs = std::mem::take(&mut *dropped_msgs.lock().unwrap()); let vote_msgs = msgs.iter().filter(|msg| { let msg_type = msg.get_message().get_msg_type(); @@ -566,8 +574,8 @@ fn test_read_index_after_transfer_leader() { fail::remove(on_handle_apply_2); } -/// Test if the read index request can get a correct response when the commit index of leader -/// if not up-to-date after transferring leader. +/// Test if the read index request can get a correct response when the commit +/// index of leader if not up-to-date after transferring leader. 
#[test] fn test_batch_read_index_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); @@ -598,7 +606,8 @@ fn test_batch_read_index_after_transfer_leader() { cluster.must_transfer_leader(1, new_peer(2, 2)); - // Pause before collecting message to make the these message be handled in one loop + // Pause before collecting message to make the these message be handled in one + // loop let on_peer_collect_message_2 = "on_peer_collect_message_2"; fail::cfg(on_peer_collect_message_2, "pause").unwrap(); @@ -627,7 +636,8 @@ fn test_batch_read_index_after_transfer_leader() { .map(|x| x.recv_timeout(Duration::from_secs(5)).unwrap()) .collect::>(); - // `term` in the header is `current_term`, not term of the entry at `read_index`. + // `term` in the header is `current_term`, not term of the entry at + // `read_index`. let term = resps[0].get_header().get_current_term(); assert_eq!(term, resps[1].get_header().get_current_term()); assert_eq!(term, pd_client.get_region_last_report_term(1).unwrap()); @@ -636,8 +646,9 @@ fn test_batch_read_index_after_transfer_leader() { let index = resps[i].responses[0].get_read_index().read_index; let raft_engine = cluster.get_raft_engine(2); let entry = raft_engine.get_entry(1, index).unwrap().unwrap(); - // According to Raft, a peer shouldn't be able to perform read index until it commits - // to the current term. So term of `read_index` must equal to the current one. + // According to Raft, a peer shouldn't be able to perform read index until it + // commits to the current term. So term of `read_index` must equal to + // the current one. assert_eq!(entry.get_term(), term); } } @@ -701,8 +712,8 @@ fn test_read_index_lock_checking_on_follower() { let guard = block_on(leader_cm.lock_key(&Key::from_raw(b"k1"))); guard.with_lock(|l| *l = Some(lock.clone())); - // Now, the leader has been transferred to peer 3. The original read index request - // will be first sent to peer 1 and then redirected to peer 3. 
+ // Now, the leader has been transferred to peer 3. The original read index + // request will be first sent to peer 1 and then redirected to peer 3. // We must make sure the lock check is done on peer 3. fail::remove("before_propose_readindex"); @@ -779,14 +790,14 @@ fn test_read_index_lock_checking_on_false_leader() { let guard = block_on(leader_cm.lock_key(&Key::from_raw(b"k1"))); guard.with_lock(|l| *l = Some(lock.clone())); - // Read index from peer 2, the read index message will be sent to the old leader peer 1. - // But the lease of peer 1 has expired and it cannot get majority of heartbeat. - // So, we cannot get the result here. + // Read index from peer 2, the read index message will be sent to the old leader + // peer 1. But the lease of peer 1 has expired and it cannot get majority of + // heartbeat. So, we cannot get the result here. let resp = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k1", true); assert!(resp.recv_timeout(Duration::from_millis(300)).is_err()); - // Now, restore the network partition. Peer 1 should now become follower and drop its - // pending read index request. Peer 2 cannot get the result now. + // Now, restore the network partition. Peer 1 should now become follower and + // drop its pending read index request. Peer 2 cannot get the result now. let recv_filter = Box::new( RegionPacketFilter::new(rid, 2) .direction(Direction::Recv) diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index ab11b7039fd..a8aaa030bfc 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -111,10 +111,12 @@ fn test_stale_read_basic_flow_lock() { b"key1".to_vec(), ); - // Assert `(key1, value2)` can't be readed with `commit_ts2` due to it's larger than the `start_ts` of `key2`. + // Assert `(key1, value2)` can't be read with `commit_ts2` due to it's larger + // than the `start_ts` of `key2`. 
let resp = follower_client2.kv_read(b"key1".to_vec(), commit_ts2); assert!(resp.get_region_error().has_data_is_not_ready()); - // Still can read `(key1, value1)` since `commit_ts1` is less than the `key2` lock's `start_ts` + // Still can read `(key1, value1)` since `commit_ts1` is less than the `key2` + // lock's `start_ts` follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); // Prewrite on `key3` but not commit yet @@ -129,7 +131,8 @@ fn test_stale_read_basic_flow_lock() { leader_client.must_kv_commit(vec![b"key2".to_vec()], k2_prewrite_ts, k2_commit_ts); // Although there is still lock on the region, but the min lock is refreshed - // to the `key3`'s lock, now we can read `(key1, value2)` but not `(key2, value1)` + // to the `key3`'s lock, now we can read `(key1, value2)` but not `(key2, + // value1)` follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), commit_ts2); let resp = follower_client2.kv_read(b"key2".to_vec(), k2_commit_ts); assert!(resp.get_region_error().has_data_is_not_ready()); @@ -144,9 +147,9 @@ fn test_stale_read_basic_flow_lock() { follower_client2.must_kv_read_equal(b"key3".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); } -// Testing that even leader's `apply_index` updated before sync the `(apply_index, safe_ts)` -// item to other replica, the `apply_index` in the `(apply_index, safe_ts)` item should not -// be updated +// Testing that even leader's `apply_index` updated before sync the +// `(apply_index, safe_ts)` item to other replica, the `apply_index` in the +// `(apply_index, safe_ts)` item should not be updated #[test] fn test_update_apply_index_before_sync_read_state() { let (mut cluster, pd_client, mut leader_client) = prepare_for_stale_read(new_peer(1, 1)); @@ -195,9 +198,9 @@ fn test_update_apply_index_before_sync_read_state() { follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); } -// Testing that if `resolved_ts` updated before 
`apply_index` update, the `safe_ts` -// won't be updated, hence the leader won't broadcast a wrong `(apply_index, safe_ts)` -// item to other replicas +// Testing that if `resolved_ts` updated before `apply_index` update, the +// `safe_ts` won't be updated, hence the leader won't broadcast a wrong +// `(apply_index, safe_ts)` item to other replicas #[test] fn test_update_resoved_ts_before_apply_index() { let (mut cluster, pd_client, mut leader_client) = prepare_for_stale_read(new_peer(1, 1)); @@ -213,7 +216,8 @@ fn test_update_resoved_ts_before_apply_index() { ); follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); - // Return before handling `apply_res`, to stop the leader updating the apply index + // Return before handling `apply_res`, to stop the leader updating the apply + // index let on_apply_res_fp = "on_apply_res"; fail::cfg(on_apply_res_fp, "return()").unwrap(); // Stop replicate data to follower 2 @@ -249,7 +253,8 @@ fn test_update_resoved_ts_before_apply_index() { follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), commit_ts2); } -// Testing that the new elected leader should initialize the `resolver` correctly +// Testing that the new elected leader should initialize the `resolver` +// correctly #[test] fn test_new_leader_init_resolver() { let (mut cluster, pd_client, mut peer_client1) = prepare_for_stale_read(new_peer(1, 1)); @@ -264,8 +269,8 @@ fn test_new_leader_init_resolver() { b"key1".to_vec(), ); - // There are no lock in the region, the `safe_ts` should keep updating by the new leader, - // so we can read `key1` with the newest ts + // There are no lock in the region, the `safe_ts` should keep updating by the + // new leader, so we can read `key1` with the newest ts cluster.must_transfer_leader(1, new_peer(2, 2)); peer_client1.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); @@ -276,8 +281,8 @@ fn test_new_leader_init_resolver() { get_tso(&pd_client), ); - // 
There are locks in the region, the `safe_ts` can't be updated, so we can't read - // `key1` with the newest ts + // There are locks in the region, the `safe_ts` can't be updated, so we can't + // read `key1` with the newest ts cluster.must_transfer_leader(1, new_peer(1, 1)); let resp = peer_client2.kv_read(b"key1".to_vec(), get_tso(&pd_client)); assert!(resp.get_region_error().has_data_is_not_ready()); @@ -285,8 +290,9 @@ fn test_new_leader_init_resolver() { peer_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); } -// Testing that while applying snapshot the follower should reset its `safe_ts` to 0 and -// reject incoming stale read request, then resume the `safe_ts` after applying snapshot +// Testing that while applying snapshot the follower should reset its `safe_ts` +// to 0 and reject incoming stale read request, then resume the `safe_ts` after +// applying snapshot #[test] fn test_stale_read_while_applying_snapshot() { let (mut cluster, pd_client, leader_client) = @@ -398,15 +404,17 @@ fn test_stale_read_while_region_merge() { b"key5".to_vec(), ); - // Merge source region into target region, the lock on source region should also merge - // into the target region and cause the target region's `safe_ts` decrease + // Merge source region into target region, the lock on source region should also + // merge into the target region and cause the target region's `safe_ts` + // decrease pd_client.must_merge(source.get_id(), target.get_id()); let mut follower_client2 = PeerClient::new(&cluster, target.get_id(), new_peer(2, 2)); follower_client2.ctx.set_stale_read(true); // We can read `(key5, value1)` with `k1_prewrite_ts` follower_client2.must_kv_read_equal(b"key5".to_vec(), b"value1".to_vec(), k1_prewrite_ts); - // Can't read `key5` with `k5_commit_ts` because `k1_prewrite_ts` is smaller than `k5_commit_ts` + // Can't read `key5` with `k5_commit_ts` because `k1_prewrite_ts` is smaller + // than `k5_commit_ts` let resp = 
follower_client2.kv_read(b"key5".to_vec(), k5_commit_ts); assert!(resp.get_region_error().has_data_is_not_ready()); @@ -417,7 +425,8 @@ fn test_stale_read_while_region_merge() { follower_client2.must_kv_read_equal(b"key5".to_vec(), b"value2".to_vec(), get_tso(&pd_client)); } -// Testing that after region merge, the `safe_ts` could be advanced even without any incoming write +// Testing that after region merge, the `safe_ts` could be advanced even without +// any incoming write #[test] fn test_stale_read_after_merge() { let (mut cluster, pd_client, _) = @@ -444,9 +453,9 @@ fn test_stale_read_after_merge() { follower_client2.must_kv_read_equal(b"key5".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); } -// Testing that during the merge, the leader of the source region won't not update the -// `safe_ts` since it can't know when the merge is completed and whether there are new -// kv write into its key range +// Testing that during the merge, the leader of the source region won't +// update the `safe_ts` since it can't know when the merge is completed and +// whether there are new kv write into its key range #[test] fn test_read_source_region_after_target_region_merged() { let (mut cluster, pd_client, leader_client) = @@ -462,7 +471,8 @@ fn test_read_source_region_after_target_region_merged() { cluster.must_split(&cluster.get_region(&[]), b"key3"); let source = pd_client.get_region(b"key1").unwrap(); let target = pd_client.get_region(b"key5").unwrap(); - // Transfer the target region leader to store 1 and the source region leader to store 2 + // Transfer the target region leader to store 1 and the source region leader to + // store 2 cluster.must_transfer_leader(target.get_id(), new_peer(1, 1)); cluster.must_transfer_leader(source.get_id(), find_peer(&source, 2).unwrap().clone()); // Get the source region follower on store 3 @@ -481,7 +491,8 @@ fn test_read_source_region_after_target_region_merged() { // Merge source region into target region
pd_client.must_merge(source.get_id(), target.get_id()); - // Leave a lock on the original source region key range through the target region leader + // Leave a lock on the original source region key range through the target + // region leader let target_leader = PeerClient::new(&cluster, target.get_id(), new_peer(1, 1)); let k1_prewrite_ts2 = get_tso(&pd_client); target_leader.must_kv_prewrite( @@ -495,17 +506,17 @@ fn test_read_source_region_after_target_region_merged() { // We still can read `key1` with `k1_commit_ts1` through source region source_follower_client3.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), k1_commit_ts1); - // But can't read `key2` with `k1_prewrite_ts2` because the source leader can't update - // `safe_ts` after source region is merged into target region even though the source leader - // didn't know the merge is complement + // But can't read `key2` with `k1_prewrite_ts2` because the source leader can't + // update `safe_ts` after source region is merged into target region even + // though the source leader didn't know the merge is complete let resp = source_follower_client3.kv_read(b"key1".to_vec(), k1_prewrite_ts2); assert!(resp.get_region_error().has_data_is_not_ready()); fail::remove(apply_before_prepare_merge_2_3); } -// Testing that altough the source region's `safe_ts` wont't be updated during merge, after merge -// rollbacked it should resume updating +// Testing that although the source region's `safe_ts` won't be updated during +// merge, after the merge is rolled back it should resume updating #[test] fn test_stale_read_after_rollback_merge() { let (mut cluster, pd_client, leader_client) = @@ -539,12 +550,13 @@ fn test_stale_read_after_rollback_merge() { find_peer(&source, 3).unwrap().clone(), ); source_client3.ctx.set_stale_read(true); - // the `safe_ts` should resume updating after merge rollback so we can read `key1` with the newest ts + // the `safe_ts` should resume updating after merge rollback so we can read + // 
`key1` with the newest ts source_client3.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); } -// Testing that the new leader should ignore the pessimistic lock that wrote by the previous -// leader and keep updating the `safe_ts` +// Testing that the new leader should ignore the pessimistic lock that wrote by +// the previous leader and keep updating the `safe_ts` #[test] fn test_new_leader_ignore_pessimistic_lock() { let (mut cluster, pd_client, leader_client) = prepare_for_stale_read(new_peer(1, 1)); @@ -564,7 +576,8 @@ fn test_new_leader_ignore_pessimistic_lock() { let mut follower_client3 = PeerClient::new(&cluster, 1, new_peer(3, 3)); follower_client3.ctx.set_stale_read(true); - // The new leader should be able to update `safe_ts` so we can read `key1` with the newest ts + // The new leader should be able to update `safe_ts` so we can read `key1` with + // the newest ts follower_client3.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); } @@ -590,7 +603,8 @@ fn test_stale_read_on_learner() { learner_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); } -// Testing that stale read request with a future ts should not update the `concurency_manager`'s `max_ts` +// Testing that stale read request with a future ts should not update the +// `concurrency_manager`'s `max_ts` #[test] fn test_stale_read_future_ts_not_update_max_ts() { let (_cluster, pd_client, mut leader_client) = prepare_for_stale_read(new_peer(1, 1)); @@ -608,8 +622,9 @@ fn test_stale_read_future_ts_not_update_max_ts() { let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); assert!(resp.get_region_error().has_data_is_not_ready()); - // The `max_ts` should not updated by the stale read request, so we can prewrite and commit - // `async_commit` transaction with a ts that smaller than the `read_ts` + // The `max_ts` should not updated by the stale read request, so we can prewrite + // and commit 
`async_commit` transaction with a ts that is smaller than the + // `read_ts` let prewrite_ts = get_tso(&pd_client); assert!(prewrite_ts < read_ts); leader_client.must_kv_prewrite_async_commit( @@ -627,8 +642,8 @@ fn test_stale_read_future_ts_not_update_max_ts() { let resp = leader_client.kv_read(b"key1".to_vec(), read_ts); assert!(resp.get_region_error().has_data_is_not_ready()); - // The `max_ts` should not updated by the stale read request, so 1pc transaction with a ts that smaller - // than the `read_ts` should not be fallbacked to 2pc + // The `max_ts` should not be updated by the stale read request, so 1pc transaction + // with a ts that is smaller than the `read_ts` should not fall back to 2pc let prewrite_ts = get_tso(&pd_client); assert!(prewrite_ts < read_ts); leader_client.must_kv_prewrite_one_pc( diff --git a/tests/failpoints/cases/test_server.rs b/tests/failpoints/cases/test_server.rs index 9d552eadee3..9c34fd13529 100644 --- a/tests/failpoints/cases/test_server.rs +++ b/tests/failpoints/cases/test_server.rs @@ -9,10 +9,10 @@ use raft::eraftpb::MessageType; use test_raftstore::*; use tikv_util::{config::ReadableDuration, HandyRwLock}; -/// When encountering raft/batch_raft mismatch store id error, the service is expected -/// to drop connections in order to let raft_client re-resolve store address from PD -/// This will make the mismatch error be automatically corrected. -/// Ths test verified this case. +/// When encountering raft/batch_raft mismatch store id error, the service is +/// expected to drop connections in order to let raft_client re-resolve store +/// address from PD. This will make the mismatch error be automatically +/// corrected. This test verifies this case.
#[test] fn test_mismatch_store_node() { let count = 3; diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index a899af3466e..3507fc268d4 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -46,8 +46,8 @@ fn test_overlap_cleanup() { cluster.must_split(®ion1, b"k2"); // Wait till the snapshot of split region is applied, whose range is ["", "k2"). must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); - // Resume the fail point and pause it again. So only the paused snapshot is generated. - // And the paused snapshot's range is ["", ""), hence overlap. + // Resume the fail point and pause it again. So only the paused snapshot is + // generated. And the paused snapshot's range is ["", ""), hence overlap. fail::cfg(gen_snapshot_fp, "pause").unwrap(); // Overlap snapshot should be deleted. assert_snapshot(&cluster.get_snap_dir(3), region_id, false); @@ -186,11 +186,12 @@ fn assert_snapshot(snap_dir: &str, region_id: u64, exist: bool) { } } -// A peer on store 3 is isolated and is applying snapshot. (add failpoint so it's always pending) -// Then two conf change happens, this peer is removed and a new peer is added on store 3. -// Then isolation clear, this peer will be destroyed because of a bigger peer id in msg. -// In previous implementation, peer fsm can be destroyed synchronously because snapshot state is -// pending and can be canceled, but panic may happen if the applyfsm runs very slow. +// A peer on store 3 is isolated and is applying snapshot. (add failpoint so +// it's always pending) Then two conf change happens, this peer is removed and a +// new peer is added on store 3. Then isolation clear, this peer will be +// destroyed because of a bigger peer id in msg. In previous implementation, +// peer fsm can be destroyed synchronously because snapshot state is pending and +// can be canceled, but panic may happen if the applyfsm runs very slow. 
#[test] fn test_destroy_peer_on_pending_snapshot() { let mut cluster = new_server_cluster(0, 3); @@ -252,10 +253,11 @@ fn test_destroy_peer_on_pending_snapshot() { } // The peer 3 in store 3 is isolated for a while and then recovered. -// During its applying snapshot, however the peer is destroyed and thus applying snapshot is canceled. -// And when it's destroyed (destroy is not finished either), the machine restarted. -// After the restart, the snapshot should be applied successfully.println! -// And new data should be written to store 3 successfully. +// During its applying snapshot, however the peer is destroyed and thus applying +// snapshot is canceled. And when it's destroyed (destroy is not finished +// either), the machine restarted. After the restart, the snapshot should be +// applied successfully. And new data should be written to store 3 +// successfully. #[test] fn test_destroy_peer_on_pending_snapshot_and_restart() { let mut cluster = new_server_cluster(0, 3); @@ -315,7 +317,8 @@ fn test_destroy_peer_on_pending_snapshot_and_restart() { must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); // After peer 3 has applied snapshot, data should be got. must_get_equal(&cluster.get_engine(3), b"k119", b"v1"); - // In the end the snapshot file should be gc-ed anyway, either by new peer or by store + // In the end the snapshot file should be gc-ed anyway, either by new peer or by + // store let now = Instant::now(); loop { let mut snap_files = vec![]; @@ -464,8 +467,9 @@ fn test_receive_old_snapshot() { pd_client.must_add_peer(left.get_id(), new_peer(2, 4)); cluster.must_put(b"k11", b"v1"); - // If peer 2 handles previous old snapshot properly and does not leave over metadata - // in `pending_snapshot_regions`, peer 4 should be created normally. + // If peer 2 handles previous old snapshot properly and does not leave over + // metadata in `pending_snapshot_regions`, peer 4 should be created + // normally.
must_get_equal(&cluster.get_engine(2), b"k11", b"v1"); fail::remove(peer_2_handle_snap_mgr_gc_fp); @@ -509,7 +513,8 @@ fn test_gen_snapshot_with_no_committed_entries_ready() { // 1. pause snapshot generating with a failpoint, and then add a new peer; // 2. append more Raft logs to the region to trigger raft log compactions; // 3. disable the failpoint to continue snapshot generating; -// 4. the generated snapshot should have a larger index than the latest `truncated_idx`. +// 4. the generated snapshot should have a larger index than the latest +// `truncated_idx`. #[test] fn test_cancel_snapshot_generating() { let mut cluster = new_node_cluster(0, 5); @@ -670,15 +675,17 @@ fn test_sending_fail_with_net_error() { // need to wait receiver handle the snapshot request sleep_ms(100); - // peer2 will not become learner so ti will has k1 key and receiving count will zero + // peer2 will not become learner so ti will has k1 key and receiving count will + // zero let engine2 = cluster.get_engine(2); must_get_none(&engine2, b"k1"); assert_eq!(cluster.get_snap_mgr(2).stats().receiving_count, 0); } /// Logs scan are now moved to raftlog gc threads. The case is to test if logs -/// are still cleaned up when there is stale logs before first index during applying -/// snapshot. It's expected to schedule a gc task after applying snapshot. +/// are still cleaned up when there is stale logs before first index during +/// applying snapshot. It's expected to schedule a gc task after applying +/// snapshot. #[test] fn test_snapshot_clean_up_logs_with_unfinished_log_gc() { let mut cluster = new_node_cluster(0, 3); @@ -730,7 +737,8 @@ fn test_snapshot_clean_up_logs_with_unfinished_log_gc() { assert!(dest[0].get_index() > truncated_index, "{:?}", dest); } -/// Redo snapshot apply after restart when kvdb state is updated but raftdb state is not. +/// Redo snapshot apply after restart when kvdb state is updated but raftdb +/// state is not. 
#[test] fn test_snapshot_recover_from_raft_write_failure() { let mut cluster = new_server_cluster(0, 3); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 09eb603ff8e..92aee023fa5 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -259,9 +259,10 @@ impl Filter for PrevoteRangeFilter { } } -// Test if a peer is created from splitting when another initialized peer with the same -// region id has already existed. In previous implementation, it can be created and panic -// will happen because there are two initialized peer with the same region id. +// Test if a peer is created from splitting when another initialized peer with +// the same region id has already existed. In previous implementation, it can be +// created and panic will happen because there are two initialized peer with the +// same region id. #[test] fn test_split_not_to_split_existing_region() { let mut cluster = new_node_cluster(0, 4); @@ -333,8 +334,8 @@ fn test_split_not_to_split_existing_region() { must_get_none(&cluster.get_engine(3), b"k0"); } -// Test if a peer is created from splitting when another initialized peer with the same -// region id existed before and has been destroyed now. +// Test if a peer is created from splitting when another initialized peer with +// the same region id existed before and has been destroyed now. #[test] fn test_split_not_to_split_existing_tombstone_region() { let mut cluster = new_node_cluster(0, 3); @@ -401,8 +402,8 @@ fn test_split_not_to_split_existing_tombstone_region() { } // TiKV uses memory lock to control the order between spliting and creating -// new peer. This case test if tikv continues split if the peer is destroyed after -// memory lock check. +// new peer. This case test if tikv continues split if the peer is destroyed +// after memory lock check. 
#[test] fn test_split_continue_when_destroy_peer_after_mem_check() { let mut cluster = new_node_cluster(0, 3); @@ -478,8 +479,8 @@ fn test_split_continue_when_destroy_peer_after_mem_check() { // If value of `k22` is equal to `v22`, the previous split log must be applied. must_get_equal(&cluster.get_engine(2), b"k22", b"v22"); - // Once it's marked split in memcheck, destroy should not write tombstone otherwise it will - // break the region states. Hence split should continue. + // Once it's marked split in memcheck, destroy should not write tombstone + // otherwise it will break the region states. Hence split should continue. must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); cluster.clear_send_filters(); @@ -488,8 +489,8 @@ fn test_split_continue_when_destroy_peer_after_mem_check() { must_get_none(&cluster.get_engine(2), b"k1"); } -// Test if a peer can be created from splitting when another uninitialied peer with the same -// peer id has been created on this store. +// Test if a peer can be created from splitting when another uninitialied peer +// with the same peer id has been created on this store. #[test] fn test_split_should_split_existing_same_uninitialied_peer() { let mut cluster = new_node_cluster(0, 3); @@ -541,8 +542,8 @@ fn test_split_should_split_existing_same_uninitialied_peer() { must_get_equal(&cluster.get_engine(2), b"k11", b"v11"); } -// Test if a peer can be created from splitting when another uninitialied peer with different -// peer id has been created on this store. +// Test if a peer can be created from splitting when another uninitialied peer +// with different peer id has been created on this store. 
#[test] fn test_split_not_to_split_existing_different_uninitialied_peer() { let mut cluster = new_node_cluster(0, 3); @@ -597,7 +598,8 @@ fn test_split_not_to_split_existing_different_uninitialied_peer() { // peer 2 applied snapshot must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); - // But only the right part because there is a peer 4 of region 1000 on local store + // But only the right part because there is a peer 4 of region 1000 on local + // store must_get_none(&cluster.get_engine(2), b"k1"); fail::remove(before_check_snapshot_1000_2_fp); @@ -657,9 +659,9 @@ impl Filter for CollectSnapshotFilter { } } -/// If the uninitialized peer and split peer are fetched into one batch, and the first -/// one doesn't generate ready, the second one does, ready should not be mapped to the -/// first one. +/// If the uninitialized peer and split peer are fetched into one batch, and the +/// first one doesn't generate ready, the second one does, ready should not be +/// mapped to the first one. #[test] fn test_split_duplicated_batch() { let mut cluster = new_node_cluster(0, 3); @@ -696,7 +698,8 @@ fn test_split_duplicated_batch() { if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { panic!("the snapshot is not sent before split, e: {:?}", e); } - // Split the region range and then there should be another snapshot for the split ranges. + // Split the region range and then there should be another snapshot for the + // split ranges. cluster.must_split(®ion, b"k2"); // Ensure second is also sent and piled in filter. if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { @@ -764,8 +767,8 @@ fn test_split_duplicated_batch() { must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); } -/// We depend on split-check task to update approximate size of region even if this region does not -/// need to split. +/// We depend on split-check task to update approximate size of region even if +/// this region does not need to split. 
#[test] fn test_report_approximate_size_after_split_check() { let mut cluster = new_server_cluster(0, 3); @@ -881,7 +884,8 @@ fn test_split_with_concurrent_pessimistic_locking() { assert!(resp.get_region_error().has_epoch_not_match(), "{:?}", resp); // 2. Locking happens when split has finished - // It needs to be rejected due to incorrect epoch, otherwise the lock may be written to the wrong region. + // It needs to be rejected due to incorrect epoch, otherwise the lock may be + // written to the wrong region. fail::cfg("txn_before_process_write", "pause").unwrap(); req.set_context(cluster.get_ctx(b"key")); let res = thread::spawn(move || client.kv_pessimistic_lock(&req).unwrap()); @@ -979,7 +983,8 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { cluster.split_region(&cluster.get_region(b"key"), b"a", Callback::None); thread::sleep(Duration::from_millis(300)); - // PrewriteResponse should contain an EpochNotMatch instead of PessimisticLockNotFound. + // PrewriteResponse should contain an EpochNotMatch instead of + // PessimisticLockNotFound. fail::remove("txn_before_process_write"); let resp = resp.join().unwrap(); assert!(resp.get_region_error().has_epoch_not_match(), "{:?}", resp); @@ -987,10 +992,11 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { fail::remove("on_split_invalidate_locks"); } -/// Logs are gced asynchronously. If an uninitialized peer is destroyed before being replaced by -/// split, then the asynchronous log gc response may arrive after the peer is replaced, hence -/// it will lead to incorrect memory state. Actually, there is nothing to be gc for uninitialized -/// peer. The case is to guarantee such incorrect state will not happen. +/// Logs are gced asynchronously. If an uninitialized peer is destroyed before +/// being replaced by split, then the asynchronous log gc response may arrive +/// after the peer is replaced, hence it will lead to incorrect memory state. 
+/// Actually, there is nothing to be gc for uninitialized peer. The case is to +/// guarantee such incorrect state will not happen. #[test] fn test_split_replace_skip_log_gc() { let mut cluster = new_node_cluster(0, 3); @@ -1023,7 +1029,8 @@ fn test_split_replace_skip_log_gc() { cluster.must_put(b"k3", b"v3"); - // Because a is not initialized, so b must be created using heartbeat on store 3. + // Because a is not initialized, so b must be created using heartbeat on store + // 3. // Simulate raft log gc stall. let gc_fp = "worker_gc_raft_log_flush"; diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs index b15a43b3d35..f5dadc4205a 100644 --- a/tests/failpoints/cases/test_sst_recovery.rs +++ b/tests/failpoints/cases/test_sst_recovery.rs @@ -24,7 +24,8 @@ fn assert_corruption(res: engine_traits::Result) { fn test_sst_recovery_basic() { let (mut cluster, pd_client, engine1) = create_tikv_cluster_with_one_node_damaged(); - // Test that only sst recovery can delete the sst file, remove peer don't delete it. + // Test that only sst recovery can delete the sst file, remove peer don't delete + // it. fail::cfg("sst_recovery_before_delete_files", "pause").unwrap(); let store_meta = cluster.store_metas.get(&1).unwrap().clone(); diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 0fba036417b..0321772661d 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -25,8 +25,9 @@ fn test_one_node_leader_missing() { let election_timeout = base_tick_interval * 5; cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(election_timeout - base_tick_interval); - // Use large peer check interval, abnormal and max leader missing duration to make a valid config, - // that is election timeout x 2 < peer stale state check < abnormal < max leader missing duration. 
+ // Use large peer check interval, abnormal and max leader missing duration to + // make a valid config, that is election timeout x 2 < peer stale state + // check < abnormal < max leader missing duration. cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration(election_timeout * 4); @@ -132,7 +133,8 @@ fn test_stale_learner_restart() { must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); } -/// Test if a peer can be destroyed through tombstone msg when applying snapshot. +/// Test if a peer can be destroyed through tombstone msg when applying +/// snapshot. #[test] fn test_stale_peer_destroy_when_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); @@ -208,7 +210,8 @@ fn test_stale_peer_destroy_when_apply_snapshot() { must_get_none(&cluster.get_engine(3), b"k1"); } -/// Test if destroy a uninitialized peer through tombstone msg would allow a staled peer be created again. +/// Test if destroy a uninitialized peer through tombstone msg would allow a +/// staled peer be created again. #[test] fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { // 4 stores cluster. @@ -286,7 +289,8 @@ fn test_destroy_uninitialized_peer_when_there_exists_old_peer() { } /// Logs scan are now moved to raftlog gc threads. The case is to test if logs -/// are still cleaned up when there is stale logs before first index during destroy. +/// are still cleaned up when there is stale logs before first index during +/// destroy. #[test] fn test_destroy_clean_up_logs_with_unfinished_log_gc() { let mut cluster = new_node_cluster(0, 3); @@ -319,8 +323,8 @@ fn test_destroy_clean_up_logs_with_unfinished_log_gc() { must_get_equal(&cluster.get_engine(1), b"k30", b"v30"); fail::remove(fp); - // So peer (3, 3) will be destroyed by gc message. And all stale logs before first - // index should be cleaned up. + // So peer (3, 3) will be destroyed by gc message. 
And all stale logs before + // first index should be cleaned up. cluster.run_node(3).unwrap(); must_get_none(&cluster.get_engine(3), b"k29"); diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 6e504e2f834..9a88a73508c 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -338,8 +338,9 @@ fn test_read_index_when_transfer_leader_2() { must_get_equal(&cluster.get_engine(2), b"k0", b"v0"); must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); - // Put and test again to ensure that peer 3 get the latest writes by message append - // instead of snapshot, so that transfer leader to peer 3 can 100% success. + // Put and test again to ensure that peer 3 get the latest writes by message + // append instead of snapshot, so that transfer leader to peer 3 can 100% + // success. cluster.must_put(b"k1", b"v1"); must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); @@ -403,8 +404,8 @@ fn test_read_index_when_transfer_leader_2() { } } - // Resume reserved messages in one batch to make sure the old leader can get read and role - // change in one `Ready`. + // Resume reserved messages in one batch to make sure the old leader can get + // read and role change in one `Ready`. fail::cfg("pause_on_peer_collect_message", "pause").unwrap(); for raft_msg in reserved_msgs { router.send_raft_message(raft_msg).unwrap(); @@ -472,8 +473,9 @@ fn test_read_after_peer_destroyed() { ); } -/// In previous implementation, we suspect the leader lease at the position of `leader_commit_prepare_merge` -/// failpoint when `PrepareMerge` log is committed, which is too late to prevent stale read. +/// In previous implementation, we suspect the leader lease at the position of +/// `leader_commit_prepare_merge` failpoint when `PrepareMerge` log is +/// committed, which is too late to prevent stale read. 
#[test] fn test_stale_read_during_merging_2() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index c6872d22dab..85dfe054c63 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -479,8 +479,9 @@ fn test_pipelined_pessimistic_lock() { fail::remove(scheduler_async_write_finish_fp); delete_pessimistic_lock(&storage, key.clone(), 50, 50); - // The proposed callback, which is responsible for returning response, is not guaranteed to be - // invoked. In this case it should still be continued properly. + // The proposed callback, which is responsible for returning response, is not + // guaranteed to be invoked. In this case it should still be continued + // properly. fail::cfg(before_pipelined_write_finish_fp, "return()").unwrap(); storage .sched_txn_command( @@ -1336,10 +1337,11 @@ fn test_resolve_lock_deadline() { /// Checks if concurrent transaction works correctly during shutdown. /// -/// During shutdown, all pending writes will fail with error so its latch will be released. -/// Then other writes in the latch queue will be continued to be processed, which can break -/// the correctness of latch: underlying command result is always determined, it should be -/// either always success written or never be written. +/// During shutdown, all pending writes will fail with error so its latch will +/// be released. Then other writes in the latch queue will be continued to be +/// processed, which can break the correctness of latch: underlying command +/// result is always determined, it should be either always success written or +/// never be written. 
#[test] fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() { let (mut cluster, mut client, mut ctx) = must_new_cluster_and_kv_client_mul(3); @@ -1407,7 +1409,8 @@ fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() { ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); client = TikvClient::new(channel); - // The first request is commit, the second is rollback, the first one should succeed. + // The first request is commit, the second is rollback, the first one should + // succeed. ts += 1; let get_version = ts; let mut get_req = GetRequest::default(); diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 1435fbbe88c..419d923b0d7 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -121,7 +121,8 @@ fn test_snapshot_must_be_later_than_updating_max_ts() { .build() .unwrap(); - // Suppose snapshot was before updating max_ts, after sleeping for 500ms the following prewrite should complete. + // Suppose snapshot was before updating max_ts, after sleeping for 500ms the + // following prewrite should complete. fail::cfg("after-snapshot", "sleep(500)").unwrap(); let read_ts = 20.into(); let get_fut = storage.get(Context::default(), Key::from_raw(b"j"), read_ts); @@ -151,7 +152,8 @@ fn test_snapshot_must_be_later_than_updating_max_ts() { .unwrap(); let has_lock = block_on(get_fut).is_err(); let res = prewrite_rx.recv().unwrap().unwrap(); - // We must make sure either the lock is visible to the reader or min_commit_ts > read_ts. + // We must make sure either the lock is visible to the reader or min_commit_ts > + // read_ts. 
assert!(res.min_commit_ts > read_ts || has_lock); } @@ -197,10 +199,17 @@ fn test_update_max_ts_before_scan_memory_locks() { assert_eq!(res.min_commit_ts, 101.into()); } -/// Generates a test that checks the correct behavior of holding and dropping locks, -/// during the process of a single prewrite command. +/// Generates a test that checks the correct behavior of holding and dropping +/// locks, during the process of a single prewrite command. macro_rules! lock_release_test { - ($test_name:ident, $lock_exists:ident, $before_actions:expr, $middle_actions:expr, $after_actions:expr, $should_succeed:expr) => { + ( + $test_name:ident, + $lock_exists:ident, + $before_actions:expr, + $middle_actions:expr, + $after_actions:expr, + $should_succeed:expr + ) => { #[test] fn $test_name() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -262,7 +271,8 @@ lock_release_test!( false ); -// Must hold lock until prewrite ends. Must release lock after prewrite succeeds. +// Must hold lock until prewrite ends. Must release lock after prewrite +// succeeds. lock_release_test!( test_lock_lifetime_on_prewrite_success, lock_exists, @@ -395,7 +405,8 @@ fn test_exceed_max_commit_ts_in_the_middle_of_prewrite() { assert_eq!(locks[1].get_key(), b"k2"); assert!(!locks[1].get_use_async_commit()); - // Send a duplicated request to test the idempotency of prewrite when falling back to 2PC. + // Send a duplicated request to test the idempotency of prewrite when falling + // back to 2PC. let (prewrite_tx, prewrite_rx) = channel(); storage .sched_txn_command( @@ -583,7 +594,8 @@ fn test_concurrent_write_after_transfer_leader_invalidates_locks() { let mut req = PrewriteRequest::default(); req.set_context(ctx); req.set_mutations(vec![mutation].into()); - // Set a different start_ts. It should fail because the memory lock is still visible. + // Set a different start_ts. It should fail because the memory lock is still + // visible. 
req.set_start_version(20); req.set_primary_lock(b"key".to_vec()); diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 028ef9f2cef..87b05042a30 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -273,8 +273,8 @@ fn test_read_lock_after_become_follower() { let start_ts = block_on(cluster.pd_client.get_tso()).unwrap(); - // put kv after get start ts, then this commit will cause a PessimisticLockNotFound - // if the pessimistic lock get missing. + // put kv after get start ts, then this commit will cause a + // PessimisticLockNotFound if the pessimistic lock get missing. cluster.must_put(b"key", b"value"); let leader = cluster.leader_of_region(region_id).unwrap(); @@ -334,6 +334,7 @@ fn test_read_lock_after_become_follower() { // Transfer leader will not make the command fail. fail::remove("txn_before_process_write"); let resp = resp_rx.recv().unwrap(); - // The term has changed, so we should get a stale command error instead a PessimisticLockNotFound. + // The term has changed, so we should get a stale command error instead a + // PessimisticLockNotFound. assert!(resp.get_region_error().has_stale_command()); } diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index 290a3561be9..c70ac41d902 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -34,7 +34,8 @@ fn test_unsafe_recovery_send_report() { }) .unwrap(); - // Mannually makes an update, and wait for the apply to be triggered, to simulate "some entries are commited but not applied" scenario. + // Manually makes an update, and wait for the apply to be triggered, to + // simulate "some entries are committed but not applied" scenario. 
cluster.put(b"random_key2", b"random_val2").unwrap(); apply_triggered_rx .recv_timeout(Duration::from_secs(1)) @@ -88,8 +89,8 @@ fn test_unsafe_recovery_execution_result_report() { cluster.must_transfer_leader(region.get_id(), store2_peer); cluster.put(b"random_key1", b"random_val1").unwrap(); - // Split the region into 2, and remove one of them, so that we can test both region peer list - // update and region creation. + // Split the region into 2, and remove one of them, so that we can test both + // region peer list update and region creation. pd_client.must_split_region( region, pdpb::CheckPolicy::Usekey, @@ -382,8 +383,8 @@ fn test_unsafe_recovery_create_destroy_reentrancy() { cluster.must_transfer_leader(region.get_id(), store2_peer); cluster.put(b"random_key1", b"random_val1").unwrap(); - // Split the region into 2, and remove one of them, so that we can test both region peer list - // update and region creation. + // Split the region into 2, and remove one of them, so that we can test both + // region peer list update and region creation. pd_client.must_split_region( region, pdpb::CheckPolicy::Usekey, diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index 2990a983974..ff07d8a712a 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -33,9 +33,11 @@ fn assert_same_file_name(s1: String, s2: String) { fn assert_same_files(mut files1: Vec, mut files2: Vec) { assert_eq!(files1.len(), files2.len()); - // Sort here by start key in case of unordered response (by pipelined write + scan) - // `sort_by_key` couldn't be used here -- rustc would complain that `file.start_key.as_slice()` - // may not live long enough. (Is that a bug of rustc?) + // Sort here by start key in case of unordered response (by pipelined write + + // scan). + // `sort_by_key` couldn't be used here -- rustc would complain that + // `file.start_key.as_slice()` may not live long enough. (Is that a + // bug of rustc?) 
files1.sort_by(|f1, f2| f1.start_key.cmp(&f2.start_key)); files2.sort_by(|f1, f2| f1.start_key.cmp(&f2.start_key)); @@ -52,7 +54,8 @@ fn assert_same_files(mut files1: Vec, mut files2: Vec 0) when the test failed suite.stop(); diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index 52cdc9cb012..96299de22a3 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -224,7 +224,8 @@ raft-log-gc-threshold = 2000 ); // config update from config file assert!(cfg_controller.update_from_toml_file().is_ok()); - // after update this configration item should be constant with the modified configuration file + // after update this configuration item should be constant with the modified + // configuration file assert_eq!( cfg_controller .get_current() diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 317e811ec50..69ce131ec8b 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -37,9 +37,10 @@ fn check_chunk_datum_count(chunks: &[Chunk], datum_limit: usize) { } } -/// sort_by sorts the `$v`(a vector of `Vec`) by the $index elements in `Vec` +/// sort_by sorts the `$v`(a vector of `Vec`) by the $index elements in +/// `Vec` macro_rules! sort_by { - ($v:ident, $index:expr, $t:ident) => { + ($v:ident, $index:expr, $t:ident) => { $v.sort_by(|a, b| match (&a[$index], &b[$index]) { (Datum::Null, Datum::Null) => std::cmp::Ordering::Equal, (Datum::$t(a), Datum::$t(b)) => a.cmp(&b), @@ -1732,8 +1733,8 @@ fn test_cache() { // Cache version must be >= 5 because Raft apply index must be >= 5. assert!(cache_version >= 5); - // Send the request again using is_cache_enabled == false (default) and a matching version. - // The request should be processed as usual. 
+ // Send the request again using is_cache_enabled == false (default) and a + // matching version. The request should be processed as usual. let mut req2 = req.clone(); req2.set_cache_if_match_version(cache_version); @@ -1746,8 +1747,8 @@ fn test_cache() { ); assert_eq!(resp.get_data(), resp2.get_data()); - // Send the request again using is_cached_enabled == true and a matching version. - // The request should be skipped. + // Send the request again using is_cached_enabled == true and a matching + // version. The request should be skipped. let mut req3 = req.clone(); req3.set_is_cache_enabled(true); @@ -1757,7 +1758,8 @@ fn test_cache() { assert!(resp3.get_is_cache_hit()); assert!(resp3.get_data().is_empty()); - // Send the request using a non-matching version. The request should be processed. + // Send the request using a non-matching version. The request should be + // processed. let mut req4 = req; req4.set_is_cache_enabled(true); @@ -1775,12 +1777,12 @@ fn test_cache() { #[test] fn test_copr_bypass_or_access_locks() { let data = vec![ - (1, Some("name:1"), 1), /* no lock */ - (2, Some("name:2"), 2), /* bypass lock */ - (3, Some("name:3"), 3), /* access lock(range) */ - (4, Some("name:4"), 4), /* access lock(range) */ - (6, Some("name:6"), 6), /* access lock(point) */ - (8, Some("name:8"), 8), /* not conflict lock */ + (1, Some("name:1"), 1), // no lock + (2, Some("name:2"), 2), // bypass lock + (3, Some("name:3"), 3), // access lock(range) + (4, Some("name:4"), 4), // access lock(range) + (6, Some("name:6"), 6), // access lock(point) + (8, Some("name:8"), 8), // not conflict lock ]; let product = ProductTable::new(); @@ -1894,10 +1896,10 @@ fn test_copr_bypass_or_access_locks() { #[test] fn test_rc_read() { let data = vec![ - (1, Some("name:1"), 1), /* no lock */ - (2, Some("name:2"), 2), /* no lock */ - (3, Some("name:3"), 3), /* update lock */ - (4, Some("name:4"), 4), /* delete lock */ + (1, Some("name:1"), 1), // no lock + (2, Some("name:2"), 2), // no 
lock + (3, Some("name:3"), 3), // update lock + (4, Some("name:4"), 4), // delete lock ]; let product = ProductTable::new(); diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 20fc6b70908..3a3967c25a8 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -128,7 +128,7 @@ fn test_rpc_client() { block_on(client.store_heartbeat( pdpb::StoreStats::default(), - /*store_report=*/ None, + None, // store_report None, )) .unwrap(); @@ -353,7 +353,8 @@ fn test_retry_sync() { fn test_not_retry(func: F) { let eps_count = 1; - // NotRetry mocker returns Ok() with error header first, and next returns Ok() without any error header. + // NotRetry mocker returns Ok() with error header first, and next returns Ok() + // without any error header. let not_retry = Arc::new(NotRetry::new()); let server = MockServer::with_case(eps_count, not_retry); let eps = server.bind_addrs(); @@ -586,7 +587,8 @@ fn test_region_heartbeat_on_leader_change() { // Change PD leader once then heartbeat PD. heartbeat_on_leader_change(1); - // Change PD leader twice without update the heartbeat sender, then heartbeat PD. + // Change PD leader twice without update the heartbeat sender, then heartbeat + // PD. 
heartbeat_on_leader_change(2); } @@ -631,7 +633,7 @@ fn test_cluster_version() { let emit_heartbeat = || { let req = pdpb::StoreStats::default(); - block_on(client.store_heartbeat(req, /*store_report=*/ None, None)).unwrap(); + block_on(client.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); }; let set_cluster_version = |version: &str| { diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index f2019d04ea7..e74f0979241 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -21,10 +21,11 @@ use tikv_util::{ }; fn test_bootstrap_idempotent(cluster: &mut Cluster) { - // assume that there is a node bootstrap the cluster and add region in pd successfully + // assume that there is a node bootstrap the cluster and add region in pd + // successfully cluster.add_first_region().unwrap(); - // now at same time start the another node, and will recive cluster is not bootstrap - // it will try to bootstrap with a new region, but will failed + // now at same time start the another node, and will receive `cluster is not + // bootstrap` it will try to bootstrap with a new region, but will failed // the region number still 1 cluster.start().unwrap(); cluster.check_regions_number(1); @@ -64,11 +65,12 @@ fn test_node_bootstrap_with_prepared_data() { let snap_mgr = SnapManager::new(tmp_mgr.path().to_str().unwrap()); let pd_worker = LazyWorker::new("test-pd-worker"); - // assume there is a node has bootstrapped the cluster and add region in pd successfully + // assume there is a node has bootstrapped the cluster and add region in pd + // successfully bootstrap_with_first_region(Arc::clone(&pd_client)).unwrap(); - // now another node at same time begin bootstrap node, but panic after prepared bootstrap - // now rocksDB must have some prepare data + // now another node at same time begin bootstrap node, but panic after prepared + // bootstrap now rocksDB 
must have some prepare data bootstrap_store(&engines, 0, 1).unwrap(); let region = node.prepare_bootstrap_cluster(&engines, 1).unwrap(); assert!( diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index 703e49169ef..c8ee96c7c67 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -37,12 +37,14 @@ fn test_compact_lock_cf(cluster: &mut Cluster) { cluster.cfg.rocksdb.lockcf.disable_auto_compactions = true; cluster.run(); - // Write 40 bytes, not reach lock_cf_compact_bytes_threshold, so there is no compaction. + // Write 40 bytes, not reach lock_cf_compact_bytes_threshold, so there is no + // compaction. for i in 0..5 { let (k, v) = (format!("k{}", i), format!("value{}", i)); cluster.must_put_cf(CF_LOCK, k.as_bytes(), v.as_bytes()); } - // Generate one sst, if there are datas only in one memtable, no compactions will be triggered. + // Generate one sst, if there are datas only in one memtable, no compactions + // will be triggered. 
flush(cluster); // Write more 40 bytes, still not reach lock_cf_compact_bytes_threshold, diff --git a/tests/integrations/raftstore/test_compact_log.rs b/tests/integrations/raftstore/test_compact_log.rs index e7d14a6eb45..bc097dd27e9 100644 --- a/tests/integrations/raftstore/test_compact_log.rs +++ b/tests/integrations/raftstore/test_compact_log.rs @@ -27,7 +27,7 @@ fn test_compact_log(cluster: &mut Cluster) { &cluster.engines, &before_states, 1, - false, /*must_compacted*/ + false, // must_compacted ) { return; @@ -38,7 +38,7 @@ fn test_compact_log(cluster: &mut Cluster) { &cluster.engines, &before_states, 1, - true, /*must_compacted*/ + true, // must_compacted ); } @@ -93,7 +93,7 @@ fn test_compact_count_limit(cluster: &mut Cluster) { &cluster.engines, &before_states, 1, - false, /*must_compacted*/ + false, // must_compacted ) { return; @@ -103,7 +103,7 @@ fn test_compact_count_limit(cluster: &mut Cluster) { &cluster.engines, &before_states, 1, - true, /*must_compacted*/ + true, // must_compacted ); } @@ -140,7 +140,7 @@ fn test_compact_many_times(cluster: &mut Cluster) { &cluster.engines, &before_states, gc_limit * 2, - false, /*must_compacted*/ + false, // must_compacted ) { return; @@ -151,7 +151,7 @@ fn test_compact_many_times(cluster: &mut Cluster) { &cluster.engines, &before_states, gc_limit * 2, - true, /*must_compacted*/ + true, // must_compacted ); } diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index 3778794387a..b37b207ac11 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -574,8 +574,8 @@ fn test_conf_change_safe(cluster: &mut Cluster) { cluster.must_put(b"k3", b"v3"); // Ensure the conf change is safe: - // The "RemoveNode" request which asks to remove one healthy node will be rejected - // if there are only 2 healthy nodes in a cluster of 3 nodes. 
+ // The "RemoveNode" request which asks to remove one healthy node will be + // rejected if there are only 2 healthy nodes in a cluster of 3 nodes. pd_client.remove_peer(region_id, new_peer(2, 2)); cluster.must_put(b"k4", b"v4"); pd_client.must_have_peer(region_id, new_peer(2, 2)); @@ -583,7 +583,8 @@ fn test_conf_change_safe(cluster: &mut Cluster) { // In this case, it's fine to remove one unhealthy node. pd_client.must_remove_peer(region_id, new_peer(1, 1)); - // Ensure it works to remove one node from the cluster that has only two healthy nodes. + // Ensure it works to remove one node from the cluster that has only two healthy + // nodes. pd_client.must_remove_peer(region_id, new_peer(2, 2)); } @@ -913,16 +914,17 @@ where #[test] fn test_conf_change_fast() { let mut cluster = new_server_cluster(0, 3); - // Sets heartbeat timeout to more than 5 seconds. It also changes the election timeout, - // but it's OK as the cluster starts with only one peer, it will campaigns immediately. + // Sets heartbeat timeout to more than 5 seconds. It also changes the election + // timeout, but it's OK as the cluster starts with only one peer, it will + // campaigns immediately. configure_for_lease_read(&mut cluster, Some(5000), None); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); cluster.must_put(b"k1", b"v1"); let timer = Instant::now(); - // If conf change relies on heartbeat, it will take more than 5 seconds to finish, - // hence it must timeout. + // If conf change relies on heartbeat, it will take more than 5 seconds to + // finish, hence it must timeout. 
pd_client.must_add_peer(r1, new_learner_peer(2, 2)); pd_client.must_add_peer(r1, new_peer(2, 2)); must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); diff --git a/tests/integrations/raftstore/test_early_apply.rs b/tests/integrations/raftstore/test_early_apply.rs index 4b9a1e40d8b..a88032671a3 100644 --- a/tests/integrations/raftstore/test_early_apply.rs +++ b/tests/integrations/raftstore/test_early_apply.rs @@ -22,9 +22,9 @@ fn delete_old_data(engine: &E, id: u64) { ..Default::default() }; engine - .clean(id, 0 /*first_index*/, &state, &mut deleter) + .clean(id, 0 /* first_index */, &state, &mut deleter) .unwrap(); - engine.consume(&mut deleter, true /*sync*/).unwrap(); + engine.consume(&mut deleter, true /* sync */).unwrap(); } /// Allow lost situation. @@ -89,7 +89,7 @@ where delete_old_data(&cluster.get_raft_engine(*id), *id); cluster .get_raft_engine(*id) - .consume(&mut batch, true /*sync*/) + .consume(&mut batch, true /* sync */) .unwrap(); } for id in &ids { @@ -160,7 +160,8 @@ fn test_follower_commit_early_apply() { test_early_apply(DataLost::FollowerCommit) } -/// Tests whether the cluster can recover from all nodes lost their commit index. +/// Tests whether the cluster can recover from all nodes lost their commit +/// index. 
#[test] fn test_all_node_crash() { test_early_apply(DataLost::AllLost) @@ -202,7 +203,7 @@ fn test_update_internal_apply_index() { delete_old_data(&cluster.get_raft_engine(id), id); cluster .get_raft_engine(id) - .consume(&mut batch, true /*sync*/) + .consume(&mut batch, true /* sync */) .unwrap(); cluster.run_node(id).unwrap(); } diff --git a/tests/integrations/raftstore/test_hibernate.rs b/tests/integrations/raftstore/test_hibernate.rs index 602efc2d9c3..23c859a21bd 100644 --- a/tests/integrations/raftstore/test_hibernate.rs +++ b/tests/integrations/raftstore/test_hibernate.rs @@ -231,8 +231,9 @@ fn test_transfer_leader_delay() { panic!("failed to request after 3 seconds"); } -/// If a learner is isolated before split and then catch up logs by snapshot, then the -/// range for split learner will be missing on the node until leader is waken. +/// If a learner is isolated before split and then catch up logs by snapshot, +/// then the range for split learner will be missing on the node until leader is +/// waken. #[test] fn test_split_delay() { let mut cluster = new_server_cluster(0, 4); @@ -354,9 +355,9 @@ fn test_inconsistent_configuration() { assert_eq!(cluster.leader_of_region(1), Some(new_peer(3, 3))); } -/// Negotiating hibernation is implemented after 5.0.0, for older version binaries, -/// negotiating can cause connection reset due to new enum type. The test ensures -/// negotiation won't happen until cluster is upgraded. +/// Negotiating hibernation is implemented after 5.0.0, for older version +/// binaries, negotiating can cause connection reset due to new enum type. The +/// test ensures negotiation won't happen until cluster is upgraded. #[test] fn test_hibernate_feature_gate() { let mut cluster = new_node_cluster(0, 3); @@ -405,7 +406,8 @@ fn test_hibernate_feature_gate() { assert!(!awakened.load(Ordering::SeqCst)); } -/// Tests when leader is demoted in a hibernated region, the region can recover automatically. 
+/// Tests when leader is demoted in a hibernated region, the region can recover +/// automatically. #[test] fn test_leader_demoted_when_hibernated() { let mut cluster = new_node_cluster(0, 4); @@ -489,10 +491,11 @@ fn test_leader_demoted_when_hibernated() { } cluster.clear_send_filters(); - // If there is no leader in the region, the cluster can't write two kvs successfully. - // The first one is possible to succeed if it's committed with the conf change at the - // same time, but the second one can't be committed or accepted because conf change - // should be applied and the leader should be demoted as learner. + // If there is no leader in the region, the cluster can't write two kvs + // successfully. The first one is possible to succeed if it's committed with + // the conf change at the same time, but the second one can't be committed + // or accepted because conf change should be applied and the leader should + // be demoted as learner. cluster.must_put(b"k1", b"v1"); cluster.must_put(b"k2", b"v2"); } diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 140cbb98fcd..ae04c0d12f2 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -22,15 +22,16 @@ use tikv_util::{config::*, time::Instant, HandyRwLock}; // The leader keeps a record of its leader lease, and uses the system's // monotonic raw clocktime to check whether its lease has expired. // If the leader lease has not expired, when the leader receives a read request -// 1. with `read_quorum == false`, the leader will serve it by reading local data. -// This way of handling request is called "lease read". -// 2. with `read_quorum == true`, the leader will serve it by doing index read (see raft's doc). -// This way of handling request is called "index read". 
-// If the leader lease has expired, leader will serve both kinds of requests by index read, and -// propose an no-op entry to raft quorum to renew the lease. -// No matter what status the leader lease is, a write request is always served by writing a Raft -// log to the Raft quorum. It is called "consistent write". All writes are consistent writes. -// Every time the leader performs a consistent read/write, it will try to renew its lease. +// - with `read_quorum == false`, the leader will serve it by reading local +// data. This way of handling request is called "lease read". +// - with `read_quorum == true`, the leader will serve it by doing index read +// (see raft's doc). This way of handling request is called "index read". +// If the leader lease has expired, leader will serve both kinds of requests by +// index read, and propose an no-op entry to raft quorum to renew the lease. +// No matter what status the leader lease is, a write request is always served +// by writing a Raft log to the Raft quorum. It is called "consistent write". +// All writes are consistent writes. Every time the leader performs a consistent +// read/write, it will try to renew its lease. fn test_renew_lease(cluster: &mut Cluster) { // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; @@ -163,11 +164,12 @@ fn test_node_lease_expired() { test_lease_expired(&mut cluster); } -// A helper function for testing the leader holds unsafe lease during the leader transfer -// procedure, so it will not do lease read. -// Since raft will not propose any request during leader transfer procedure, consistent read/write -// could not be performed neither. -// When leader transfer procedure aborts later, the leader would use and update the lease as usual. +// A helper function for testing the leader holds unsafe lease during the leader +// transfer procedure, so it will not do lease read. 
+// Since raft will not propose any request during leader transfer procedure, +// consistent read/write could not be performed neither. +// When leader transfer procedure aborts later, the leader would use and update +// the lease as usual. fn test_lease_unsafe_during_leader_transfers(cluster: &mut Cluster) { // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; @@ -215,7 +217,8 @@ fn test_lease_unsafe_during_leader_transfers(cluster: &mut Cluster // Ensure peer 3 is ready to transfer leader. must_get_equal(&cluster.get_engine(3), key, b"v1"); - // Drop MsgTimeoutNow to `peer3` so that the leader transfer procedure would abort later. + // Drop MsgTimeoutNow to `peer3` so that the leader transfer procedure would + // abort later. cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(region_id, peer3_store_id) .msg_type(MessageType::MsgTimeoutNow) @@ -225,7 +228,8 @@ fn test_lease_unsafe_during_leader_transfers(cluster: &mut Cluster // Issue a transfer leader request to transfer leader from `peer` to `peer3`. cluster.transfer_leader(region_id, peer3); - // Delay a while to ensure transfer leader procedure is triggered inside raft module. + // Delay a while to ensure transfer leader procedure is triggered inside raft + // module. thread::sleep(election_timeout / 2); // Issue a read request and it will fall back to read index. @@ -239,8 +243,8 @@ fn test_lease_unsafe_during_leader_transfers(cluster: &mut Cluster // Make sure the leader transfer procedure timeouts. thread::sleep(election_timeout * 2); - // Then the leader transfer procedure aborts, now the leader could do lease read or consistent - // read/write and renew/reuse the lease as usual. + // Then the leader transfer procedure aborts, now the leader could do lease read + // or consistent read/write and renew/reuse the lease as usual. // Issue a read request and check the value on response. 
must_read_on_peer(cluster, peer.clone(), region.clone(), key, b"v1"); @@ -354,7 +358,8 @@ fn test_batch_id_in_lease(cluster: &mut Cluster) { }) .collect(); - // Snapshot 0 and 1 will use one RocksSnapshot because we have renew their lease. + // Snapshot 0 and 1 will use one RocksSnapshot because we have renew their + // lease. assert!(std::ptr::eq( snaps[0].get_snapshot(), snaps[1].get_snapshot() @@ -389,8 +394,9 @@ fn test_batch_id_in_lease(cluster: &mut Cluster) { )); } -/// test whether the read index callback will be handled when a region is destroyed. -/// If it's not handled properly, it will cause dead lock in transaction scheduler. +/// test whether the read index callback will be handled when a region is +/// destroyed. If it's not handled properly, it will cause dead lock in +/// transaction scheduler. #[test] fn test_node_callback_when_destroyed() { let count = 3; @@ -465,9 +471,10 @@ fn test_lease_read_callback_destroy() { cluster.must_put(b"k2", b"v2"); } -/// A read index request will be appended to waiting list when there is an on-going request -/// to reduce heartbeat messages. But when leader is in suspect lease, requests should not -/// be batched because lease can be expired at anytime. +/// A read index request will be appended to waiting list when there is an +/// on-going request to reduce heartbeat messages. But when leader is in suspect +/// lease, requests should not be batched because lease can be expired at +/// anytime. #[test] fn test_read_index_stale_in_suspect_lease() { let mut cluster = new_node_cluster(0, 3); @@ -485,8 +492,9 @@ fn test_read_index_stale_in_suspect_lease() { cluster.pd_client.must_add_peer(r1, new_peer(3, 3)); let r1 = cluster.get_region(b"k1"); - // Put and test again to ensure that peer 3 get the latest writes by message append - // instead of snapshot, so that transfer leader to peer 3 can 100% success. 
+ // Put and test again to ensure that peer 3 get the latest writes by message + // append instead of snapshot, so that transfer leader to peer 3 can 100% + // success. cluster.must_put(b"k1", b"v1"); must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); cluster.must_put(b"k2", b"v2"); @@ -650,15 +658,16 @@ fn test_not_leader_read_lease() { } /// Test whether read index is greater than applied index. -/// 1. Add hearbeat msg filter. +/// 1. Add heartbeat msg filter. /// 2. Propose a read index request. /// 3. Put a key and get the latest applied index. /// 4. Propose another read index request. -/// 5. Remove the filter and check whether the latter read index is greater than applied index. +/// 5. Remove the filter and check whether the latter read index is greater than +/// applied index. /// /// In previous implementation, these two read index request will be batched and -/// will get the same read index which breaks the correctness because the latter one -/// is proposed after the applied index has increased and replied to client. +/// will get the same read index which breaks the correctness because the latter +/// one is proposed after the applied index has increased and replied to client. #[test] fn test_read_index_after_write() { let mut cluster = new_node_cluster(0, 3); @@ -675,7 +684,8 @@ fn test_read_index_after_write() { cluster.must_transfer_leader(region.get_id(), region_on_store1.clone()); cluster.add_send_filter(IsolationFilterFactory::new(3)); - // Add heartbeat msg filter to prevent the leader to reply the read index response. + // Add heartbeat msg filter to prevent the leader to reply the read index + // response. let filter = Box::new( RegionPacketFilter::new(region.get_id(), 2) .direction(Direction::Recv) @@ -766,7 +776,8 @@ fn test_infinite_lease() { assert_eq!(cluster.leader_of_region(region_id), Some(peer)); assert_eq!(detector.ctx.rl().len(), 1); - // renew-lease-tick shouldn't propose any request if the leader lease is not expired. 
+ // renew-lease-tick shouldn't propose any request if the leader lease is not + // expired. for _ in 0..4 { cluster.must_put(key, b"v0"); thread::sleep(max_lease / 4); @@ -774,8 +785,8 @@ fn test_infinite_lease() { assert_eq!(detector.ctx.rl().len(), 1); } -// LocalReader will try to renew lease in advance, so the region that has continuous reads -// should not go to hibernate. +// LocalReader will try to renew lease in advance, so the region that has +// continuous reads should not go to hibernate. #[test] fn test_node_local_read_renew_lease() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 1146e152681..9cff738fdfe 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -205,7 +205,8 @@ fn test_node_merge_prerequisites_check() { cluster.must_transfer_leader(right.get_id(), right_on_store1); // first MsgAppend will append log, second MsgAppend will set commit index, - // So only allowing first MsgAppend to make source peer have uncommitted entries. + // So only allowing first MsgAppend to make source peer have uncommitted + // entries. cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(left.get_id(), 3) .direction(Direction::Recv) @@ -334,7 +335,8 @@ fn test_node_merge_slow_split_left() { test_node_merge_slow_split(false); } -// Test if a merge handled properly when there is a unfinished slow split before merge. +// Test if a merge handled properly when there is a unfinished slow split before +// merge. fn test_node_merge_slow_split(is_right_derive: bool) { let mut cluster = new_node_cluster(0, 3); configure_for_merge(&mut cluster); @@ -635,7 +637,8 @@ fn test_merge_approximate_size_and_keys() { keys ); - // after merge and then transfer leader, if not update new leader's approximate size, it maybe be stale. 
+ // after merge and then transfer leader, if not update new leader's approximate + // size, it maybe be stale. cluster.must_transfer_leader(region.get_id(), region.get_peers()[0].clone()); // make sure split check is invoked thread::sleep(Duration::from_millis(100)); @@ -731,7 +734,8 @@ fn test_node_merge_update_region() { assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v3"); } -/// Test if merge is working properly when merge entries is empty but commit index is not updated. +/// Test if merge is working properly when merge entries is empty but commit +/// index is not updated. #[test] fn test_node_merge_catch_up_logs_empty_entries() { let mut cluster = new_node_cluster(0, 3); @@ -754,20 +758,23 @@ fn test_node_merge_catch_up_logs_empty_entries() { must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); // first MsgAppend will append log, second MsgAppend will set commit index, - // So only allowing first MsgAppend to make source peer have uncommitted entries. + // So only allowing first MsgAppend to make source peer have uncommitted + // entries. cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(left.get_id(), 3) .direction(Direction::Recv) .msg_type(MessageType::MsgAppend) .allow(1), )); - // make the source peer have no way to know the uncommitted entries can be applied from heartbeat. + // make the source peer have no way to know the uncommitted entries can be + // applied from heartbeat. cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(left.get_id(), 3) .msg_type(MessageType::MsgHeartbeat) .direction(Direction::Recv), )); - // make the source peer have no way to know the uncommitted entries can be applied from target region. + // make the source peer have no way to know the uncommitted entries can be + // applied from target region. 
cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(right.get_id(), 3) .msg_type(MessageType::MsgAppend) @@ -820,11 +827,12 @@ fn test_merge_with_slow_promote() { /// Test whether a isolated store recover properly if there is no target peer /// on this store before isolated. -/// A (-∞, k2), B [k2, +∞) on store 1,2,4 -/// store 4 is isolated -/// B merge to A (target peer A is not created on store 4. It‘s just exist logically) -/// A split => C (-∞, k3), A [k3, +∞) -/// Then network recovery +/// - A (-∞, k2), B [k2, +∞) on store 1,2,4 +/// - store 4 is isolated +/// - B merge to A (target peer A is not created on store 4. It‘s just exist +/// logically) +/// - A split => C (-∞, k3), A [k3, +∞) +/// - Then network recovery #[test] fn test_merge_isolated_store_with_no_target_peer() { let mut cluster = new_node_cluster(0, 4); @@ -882,7 +890,8 @@ fn test_merge_isolated_store_with_no_target_peer() { must_get_equal(&cluster.get_engine(4), b"k345", b"v345"); } -/// Test whether a isolated peer can recover when two other regions merge to its region +/// Test whether a isolated peer can recover when two other regions merge to its +/// region #[test] fn test_merge_cascade_merge_isolated() { let mut cluster = new_node_cluster(0, 3); @@ -932,8 +941,8 @@ fn test_merge_cascade_merge_isolated() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } -// Test if a learner can be destroyed properly when it's isolated and removed by conf change -// before its region merge to another region +// Test if a learner can be destroyed properly when it's isolated and removed by +// conf change before its region merge to another region #[test] fn test_merge_isolated_not_in_merge_learner() { let mut cluster = new_node_cluster(0, 3); @@ -967,7 +976,8 @@ fn test_merge_isolated_not_in_merge_learner() { pd_client.must_remove_peer(right.get_id(), right_on_store1); pd_client.must_merge(left.get_id(), right.get_id()); - // Add a new learner on store 2 to trigger peer 2 send 
check-stale-peer msg to other peers + // Add a new learner on store 2 to trigger peer 2 send check-stale-peer msg to + // other peers pd_client.must_add_peer(right.get_id(), new_learner_peer(2, 5)); cluster.must_put(b"k123", b"v123"); @@ -977,8 +987,8 @@ fn test_merge_isolated_not_in_merge_learner() { must_get_equal(&cluster.get_engine(2), b"k123", b"v123"); } -// Test if a learner can be destroyed properly when it's isolated and removed by conf change -// before another region merge to its region +// Test if a learner can be destroyed properly when it's isolated and removed by +// conf change before another region merge to its region #[test] fn test_merge_isolated_stale_learner() { let mut cluster = new_node_cluster(0, 3); @@ -1015,7 +1025,8 @@ fn test_merge_isolated_stale_learner() { let new_left = pd_client.get_region(b"k1").unwrap(); assert_ne!(left.get_id(), new_left.get_id()); - // Add a new learner on store 2 to trigger peer 2 send check-stale-peer msg to other peers + // Add a new learner on store 2 to trigger peer 2 send check-stale-peer msg to + // other peers pd_client.must_add_peer(new_left.get_id(), new_learner_peer(2, 5)); cluster.must_put(b"k123", b"v123"); @@ -1064,15 +1075,16 @@ fn test_merge_isolated_not_in_merge_learner_2() { pd_client.must_merge(left.get_id(), right.get_id()); cluster.run_node(2).unwrap(); - // When the abnormal leader missing duration has passed, the check-stale-peer msg will be sent to peer 1001. - // After that, a new peer list will be returned (2, 2) (3, 3). - // Then peer 2 sends the check-stale-peer msg to peer 3 and it will get a tombstone response. - // Finally peer 2 will be destroyed. + // When the abnormal leader missing duration has passed, the check-stale-peer + // msg will be sent to peer 1001. After that, a new peer list will be + // returned (2, 2) (3, 3). Then peer 2 sends the check-stale-peer msg to + // peer 3 and it will get a tombstone response. Finally peer 2 will be + // destroyed. 
must_get_none(&cluster.get_engine(2), b"k1"); } -/// Test if a peer can be removed if its target peer has been removed and doesn't apply the -/// CommitMerge log. +/// Test if a peer can be removed if its target peer has been removed and +/// doesn't apply the CommitMerge log. #[test] fn test_merge_remove_target_peer_isolated() { let mut cluster = new_node_cluster(0, 4); @@ -1109,7 +1121,8 @@ fn test_merge_remove_target_peer_isolated() { cluster.add_send_filter(IsolationFilterFactory::new(3)); // Make region r2's epoch > r2 peer on store 3. - // r2 peer on store 3 will be removed whose epoch is staler than the epoch when r1 merge to r2. + // r2 peer on store 3 will be removed whose epoch is staler than the epoch when + // r1 merge to r2. pd_client.must_add_peer(r2.get_id(), new_peer(4, 4)); pd_client.must_remove_peer(r2.get_id(), new_peer(4, 4)); @@ -1191,8 +1204,8 @@ fn test_sync_max_ts_after_region_merge() { assert!(new_max_ts > max_ts); } -/// If a follower is demoted by a snapshot, its meta will be changed. The case is to ensure -/// asserts in code can tolerate the change. +/// If a follower is demoted by a snapshot, its meta will be changed. The case +/// is to ensure asserts in code can tolerate the change. #[test] fn test_merge_snapshot_demote() { let mut cluster = new_node_cluster(0, 4); @@ -1267,8 +1280,8 @@ fn test_propose_in_memory_pessimistic_locks() { let left = cluster.get_region(b"k1"); let right = cluster.get_region(b"k3"); - // Transfer the leader of the right region to store 2. The leaders of source and target - // regions don't need to be on the same store. + // Transfer the leader of the right region to store 2. The leaders of source and + // target regions don't need to be on the same store. 
cluster.must_transfer_leader(right.id, new_peer(2, 2)); // Insert lock l1 into the left region @@ -1310,8 +1323,8 @@ fn test_propose_in_memory_pessimistic_locks() { // Merge left region into the right region pd_client.must_merge(left.id, right.id); - // After the left region is merged into the right region, its pessimistic locks should be - // proposed and applied to the storage. + // After the left region is merged into the right region, its pessimistic locks + // should be proposed and applied to the storage. let snapshot = cluster.must_get_snapshot_of_region(right.id); let value = snapshot .get_cf(CF_LOCK, &Key::from_raw(b"k1")) @@ -1334,7 +1347,8 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { configure_for_merge(&mut cluster); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; - // Set raft_entry_max_size to 64 KiB. We will try to make the gap larger than the limit later. + // Set raft_entry_max_size to 64 KiB. We will try to make the gap larger than + // the limit later. cluster.cfg.raft_store.raft_entry_max_size = ReadableSize::kb(64); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1367,8 +1381,8 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { cluster.merge_region(left.id, right.id, Callback::None); thread::sleep(Duration::from_millis(150)); - // The gap is too large, so the previous merge should fail. And this new put request - // should be allowed. + // The gap is too large, so the previous merge should fail. And this new put + // request should be allowed. let res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); @@ -1442,8 +1456,8 @@ fn test_merge_pessimistic_locks_repeated_merge() { assert_eq!(value, lock.into_lock().to_bytes()); } -/// Check if merge is cleaned up if the merge target is destroyed several times before it's ever -/// scheduled. 
+/// Check if merge is cleaned up if the merge target is destroyed several times +/// before it's ever scheduled. #[test] fn test_node_merge_long_isolated() { let mut cluster = new_node_cluster(0, 3); @@ -1478,7 +1492,8 @@ fn test_node_merge_long_isolated() { let right = pd_client.get_region(b"k1").unwrap(); cluster.must_split(&right, b"k2"); cluster.must_put(b"k4", b"v4"); - // Ensure the node is removed, so it will not catch up any logs but just destroy itself. + // Ensure the node is removed, so it will not catch up any logs but just destroy + // itself. must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); must_get_equal(&cluster.get_engine(2), b"k4", b"v4"); @@ -1527,14 +1542,19 @@ fn test_stale_message_after_merge() { pd_client.must_add_peer(left.get_id(), new_peer(3, 1004)); pd_client.must_merge(left.get_id(), right.get_id()); - // Such stale message can be sent due to network error, consider the following example: - // 1. Store 1 and Store 3 can't reach each other, so peer 1003 start election and send `RequestVote` - // message to peer 1001, and fail due to network error, but this message is keep backoff-retry to send out - // 2. Peer 1002 become the new leader and remove peer 1003 and add peer 1004 on store 3, then the region is - // merged into other region, the merge can success because peer 1002 can reach both peer 1001 and peer 1004 - // 3. 
Network recover, so peer 1003's `RequestVote` message is sent to peer 1001 after it is merged + // Such stale message can be sent due to network error, consider the following + // example: + // - Store 1 and Store 3 can't reach each other, so peer 1003 + // start election and send `RequestVote` message to peer 1001, and fail + // due to network error, but this message is keep backoff-retry to send out + // - Peer 1002 become the new leader and remove peer 1003 and add peer 1004 on + // store 3, then the region is merged into other region, the merge can + // success because peer 1002 can reach both peer 1001 and peer 1004 + // - Network recover, so peer 1003's `RequestVote` message is sent to peer 1001 + // after it is merged // - // the backoff-retry of a stale message is hard to simulated in test, so here just send this stale message directly + // the backoff-retry of a stale message is hard to simulated in test, so here + // just send this stale message directly let mut raft_msg = RaftMessage::default(); raft_msg.set_region_id(left.get_id()); raft_msg.set_from_peer(find_peer(&left, 3).unwrap().to_owned()); @@ -1546,7 +1566,8 @@ fn test_stale_message_after_merge() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } -/// Check whether merge should be prevented if follower may not have enough logs. +/// Check whether merge should be prevented if follower may not have enough +/// logs. #[test] fn test_prepare_merge_with_reset_matched() { let mut cluster = new_server_cluster(0, 3); @@ -1586,15 +1607,16 @@ fn test_prepare_merge_with_reset_matched() { cluster.must_transfer_leader(left.get_id(), left_on_store1); let res = cluster.try_merge(left.get_id(), right.get_id()); // Now leader still knows peer(2, 2) has committed i0 - 1, so the min_match will - // become i0 - 1. But i0 - 1 is not a safe index as peer(3, 3) starts from i0 + 1. + // become i0 - 1. But i0 - 1 is not a safe index as peer(3, 3) starts from i0 + + // 1. 
assert!(res.get_header().has_error(), "{:?}", res); cluster.clear_send_filters(); // Now leader should replicate more logs and figure out a safe index. pd_client.must_merge(left.get_id(), right.get_id()); } -/// Check if prepare merge min index is chosen correctly even if all match indexes are -/// correct. +/// Check if prepare merge min index is chosen correctly even if all match +/// indexes are correct. #[test] fn test_prepare_merge_with_5_nodes_snapshot() { let mut cluster = new_server_cluster(0, 5); @@ -1634,8 +1656,8 @@ fn test_prepare_merge_with_5_nodes_snapshot() { cluster.add_send_filter(IsolationFilterFactory::new(4)); must_get_equal(&cluster.get_engine(5), b"k13", b"v13"); let res = cluster.try_merge(left.get_id(), right.get_id()); - // min_matched from peer 4 is beyond the first index of peer 5, it should not be chosen - // for prepare merge. + // min_matched from peer 4 is beyond the first index of peer 5, it should not be + // chosen for prepare merge. assert!(res.get_header().has_error(), "{:?}", res); cluster.clear_send_filters(); // Now leader should replicate more logs and figure out a safe index. diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index 296d6f207cf..d7c527b5fd9 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -324,8 +324,9 @@ fn test_leader_change_with_uncommitted_log(cluster: &mut Cluster(cluster: &mut Cluster) { // guarantee peer 1 is leader cluster.must_transfer_leader(1, new_peer(1, 1)); - // if peer 2 is unreachable, leader will not send MsgAppend to peer 2, and the leader will - // send MsgAppend with committed information to peer 2 after network recovered, and peer 2 - // will apply the entry regardless of we add an filter, so we put k0/v0 to make sure the - // network is reachable. 
+ // if peer 2 is unreachable, leader will not send MsgAppend to peer 2, and the + // leader will send MsgAppend with committed information to peer 2 after + // network recovered, and peer 2 will apply the entry regardless of we add + // an filter, so we put k0/v0 to make sure the network is reachable. let (k0, v0) = (b"k0", b"v0"); cluster.must_put(k0, v0); @@ -507,8 +508,9 @@ fn test_read_leader_with_unapplied_log(cluster: &mut Cluster) { must_get_equal(&cluster.get_engine(i), k0, v0); } - // hack: first MsgAppend will append log, second MsgAppend will set commit index, - // So only allowing first MsgAppend to make peer 2 have uncommitted entries. + // hack: first MsgAppend will append log, second MsgAppend will set commit + // index, So only allowing first MsgAppend to make peer 2 have uncommitted + // entries. cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(1, 2) .msg_type(MessageType::MsgAppend) @@ -540,12 +542,13 @@ fn test_read_leader_with_unapplied_log(cluster: &mut Cluster) { cluster.must_transfer_leader(1, util::new_peer(2, 2)); - // leader's term not equal applied index's term, if we read local, we may get old value - // in this situation we need use raft read + // leader's term not equal applied index's term, if we read local, we may get + // old value in this situation we need use raft read must_get_none(&cluster.get_engine(2), k); - // internal read will use raft read no matter read_quorum is false or true, cause applied - // index's term not equal leader's term, and will failed with timeout + // internal read will use raft read no matter read_quorum is false or true, + // cause applied index's term not equal leader's term, and will failed with + // timeout let req = get_with_timeout(cluster, k, false, Duration::from_secs(10)).unwrap(); assert!( req.get_header().get_error().has_stale_command(), @@ -691,8 +694,8 @@ fn test_node_dropped_proposal() { ); put_req.mut_header().set_peer(new_peer(1, 1)); // peer (3, 3) won't become leader 
and transfer leader request will be canceled - // after about an election timeout. Before it's canceled, all proposal will be dropped - // silently. + // after about an election timeout. Before it's canceled, all proposal will be + // dropped silently. cluster.transfer_leader(1, new_peer(3, 3)); let (tx, rx) = mpsc::channel(); @@ -841,7 +844,8 @@ fn test_leader_drop_with_pessimistic_lock() { cluster.must_put(b"k1", b"v1"); assert_ne!(cluster.leader_of_region(1).unwrap().id, 1); - // When peer 1 becomes leader again, the pessimistic locks should be cleared before. + // When peer 1 becomes leader again, the pessimistic locks should be cleared + // before. cluster.clear_send_filters(); cluster.must_transfer_leader(1, new_peer(1, 1)); assert!(txn_ext.pessimistic_locks.read().is_empty()); diff --git a/tests/integrations/raftstore/test_prevote.rs b/tests/integrations/raftstore/test_prevote.rs index 6128e8e7dbf..a4336e9f3ed 100644 --- a/tests/integrations/raftstore/test_prevote.rs +++ b/tests/integrations/raftstore/test_prevote.rs @@ -35,7 +35,8 @@ fn attach_prevote_notifiers(cluster: &Cluster, peer: u64) -> mp rx } -// Validate that prevote is used in elections after partition or reboot of some nodes. +// Validate that prevote is used in elections after partition or reboot of some +// nodes. fn test_prevote( cluster: &mut Cluster, failure_type: FailureType<'_>, @@ -44,8 +45,8 @@ fn test_prevote( detect_during_recovery: impl Into>, ) { cluster.cfg.raft_store.prevote = true; - // Disable this feature because the test could run slow, in which case peers shouldn't - // hibernate, otherwise it's possible to detect no vote messages. + // Disable this feature because the test could run slow, in which case peers + // shouldn't hibernate, otherwise it's possible to detect no vote messages. 
cluster.cfg.raft_store.hibernate_regions = false; // To stable the test, we use a large election timeout to make // leader's readiness get handle within an election timeout @@ -149,8 +150,8 @@ fn test_prevote_partition_leader_in_majority_detect_in_majority() { #[test] fn test_prevote_partition_leader_in_majority_detect_in_minority() { let mut cluster = new_node_cluster(0, 5); - // The follower is in the minority and is part of a prevote process. On rejoin it adopts the - // old leader. + // The follower is in the minority and is part of a prevote process. On rejoin + // it adopts the old leader. test_prevote( &mut cluster, FailureType::Partition(&[1, 2, 3], &[4, 5]), @@ -164,8 +165,8 @@ fn test_prevote_partition_leader_in_majority_detect_in_minority() { #[test] fn test_prevote_partition_leader_in_minority_detect_in_majority() { let mut cluster = new_node_cluster(0, 5); - // The follower is in the minority and is part of a prevote process. On rejoin it adopts the - // old leader. + // The follower is in the minority and is part of a prevote process. On rejoin + // it adopts the old leader. test_prevote( &mut cluster, FailureType::Partition(&[1, 2], &[3, 4, 5]), @@ -179,8 +180,8 @@ fn test_prevote_partition_leader_in_minority_detect_in_majority() { #[test] fn test_prevote_partition_leader_in_minority_detect_in_minority() { let mut cluster = new_node_cluster(0, 5); - // The follower is in the minority and is part of a prevote process. On rejoin it adopts the - // old leader. + // The follower is in the minority and is part of a prevote process. On rejoin + // it adopts the old leader. test_prevote( &mut cluster, FailureType::Partition(&[1, 2, 3], &[3, 4, 5]), @@ -216,18 +217,21 @@ fn test_prevote_reboot_minority_followers() { ); } -// Test isolating a minority of the cluster and make sure that the remove themselves. +// Test isolating a minority of the cluster and make sure that the remove +// themselves. 
fn test_pair_isolated(cluster: &mut Cluster) { let region = 1; let pd_client = Arc::clone(&cluster.pd_client); - // Given some nodes A, B, C, D, E, we partition the cluster such that D, E are isolated from the rest. + // Given some nodes A, B, C, D, E, we partition the cluster such that D, E are + // isolated from the rest. cluster.run(); // Choose a predictable leader so we don't accidentally partition the leader. cluster.must_transfer_leader(region, new_peer(1, 1)); cluster.partition(vec![1, 2, 3], vec![4, 5]); - // Then, add a policy to PD that it should ask the Raft leader to remove the peer from the group. + // Then, add a policy to PD that it should ask the Raft leader to remove the + // peer from the group. pd_client.must_remove_peer(region, new_peer(4, 4)); pd_client.must_remove_peer(region, new_peer(5, 5)); diff --git a/tests/integrations/raftstore/test_region_change_observer.rs b/tests/integrations/raftstore/test_region_change_observer.rs index 3a1437e1868..261b1f2370e 100644 --- a/tests/integrations/raftstore/test_region_change_observer.rs +++ b/tests/integrations/raftstore/test_region_change_observer.rs @@ -97,7 +97,8 @@ fn test_region_change_observer_impl(mut cluster: Cluster) { cluster.must_split(&add_peer_event.0, b"k2"); let mut split_update = receiver.recv().unwrap(); let mut split_create = receiver.recv().unwrap(); - // We should receive an `Update` and a `Create`. The order of them is not important. + // We should receive an `Update` and a `Create`. The order of them is not + // important. if split_update.1 != RegionChangeEvent::Update(RegionChangeReason::Split) { mem::swap(&mut split_update, &mut split_create); } @@ -135,7 +136,8 @@ fn test_region_change_observer_impl(mut cluster: Cluster) { ); let mut merge_update = receiver.recv().unwrap(); let mut merge_destroy = receiver.recv().unwrap(); - // We should receive an `Update` and a `Destroy`. The order of them is not important. + // We should receive an `Update` and a `Destroy`. 
The order of them is not + // important. if merge_update.1 != RegionChangeEvent::Update(RegionChangeReason::CommitMerge) { mem::swap(&mut merge_update, &mut merge_destroy); } diff --git a/tests/integrations/raftstore/test_region_heartbeat.rs b/tests/integrations/raftstore/test_region_heartbeat.rs index b558f0800e7..117c10a3d19 100644 --- a/tests/integrations/raftstore/test_region_heartbeat.rs +++ b/tests/integrations/raftstore/test_region_heartbeat.rs @@ -47,9 +47,9 @@ fn test_down_peers(cluster: &mut Cluster) { cluster.stop_node(1); cluster.must_put(b"k1", b"v1"); - // max peer down duration is 500 millis, but we only report down time in seconds, - // so sleep 1 second to make the old down second is always larger than new down second - // by at lease 1 second. + // max peer down duration is 500 millis, but we only report down time in + // seconds, so sleep 1 second to make the old down second is always larger + // than new down second by at lease 1 second. sleep_ms(1000); wait_down_peers(cluster, 1, Some(1)); diff --git a/tests/integrations/raftstore/test_region_info_accessor.rs b/tests/integrations/raftstore/test_region_info_accessor.rs index 45df18d01a2..838e2ea492c 100644 --- a/tests/integrations/raftstore/test_region_info_accessor.rs +++ b/tests/integrations/raftstore/test_region_info_accessor.rs @@ -190,7 +190,8 @@ fn test_node_cluster_region_info_accessor() { })); cluster.run_conf_change(); let c = rx.recv().unwrap(); - // We only created it on the node whose id == 1 so we shouldn't receive more than one item. + // We only created it on the node whose id == 1 so we shouldn't receive more + // than one item. 
assert!(rx.try_recv().is_err()); test_region_info_accessor_impl(&mut cluster, &c); diff --git a/tests/integrations/raftstore/test_replica_read.rs b/tests/integrations/raftstore/test_replica_read.rs index 45e17ae37cf..8961008d4a5 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -61,10 +61,10 @@ fn test_replica_read_not_applied() { configure_for_lease_read(&mut cluster, Some(50), Some(30)); let max_lease = Duration::from_secs(1); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); - // After the leader has committed to its term, pending reads on followers can be responsed. - // However followers can receive `ReadIndexResp` after become candidate if the leader has - // hibernated. So, disable the feature to avoid read requests on followers to be cleared as - // stale. + // After the leader has committed to its term, pending reads on followers can be + // responsed. However followers can receive `ReadIndexResp` after become + // candidate if the leader has hibernated. So, disable the feature to avoid + // read requests on followers to be cleared as stale. cluster.cfg.raft_store.hibernate_regions = false; cluster.pd_client.disable_default_operator(); @@ -103,13 +103,15 @@ fn test_replica_read_not_applied() { let resp1_ch = async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k1", true, true); assert!(resp1_ch.recv_timeout(Duration::from_secs(1)).is_err()); - // Unpark all append responses so that the new leader can commit its first entry. + // Unpark all append responses so that the new leader can commit its first + // entry. let router = cluster.sim.wl().get_router(2).unwrap(); for raft_msg in mem::take::>(dropped_msgs.lock().unwrap().as_mut()) { router.send_raft_message(raft_msg).unwrap(); } - // The old read index request won't be blocked forever as it's retried internally. 
+ // The old read index request won't be blocked forever as it's retried + // internally. cluster.sim.wl().clear_send_filters(1); cluster.sim.wl().clear_recv_filters(2); let resp1 = resp1_ch.recv_timeout(Duration::from_secs(6)).unwrap(); @@ -128,8 +130,6 @@ fn test_replica_read_on_hibernate() { let mut cluster = new_node_cluster(0, 3); configure_for_lease_read(&mut cluster, Some(50), Some(20)); - // let max_lease = Duration::from_secs(2); - // cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); cluster.pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); @@ -418,7 +418,8 @@ fn test_split_isolation() { let r1 = cluster.get_region(b"k2"); cluster.must_split(&r1, b"k2"); let idx = cluster.truncated_state(1, 1).get_index(); - // Trigger a log compaction, so the left region ['', 'k2'] cannot created through split cmd. + // Trigger a log compaction, so the left region ['', 'k2'] cannot created + // through split cmd. for i in 2..cluster.cfg.raft_store.raft_log_gc_count_limit() * 2 { cluster.must_put(format!("k{}", i).as_bytes(), format!("v{}", i).as_bytes()); } @@ -439,7 +440,8 @@ fn test_split_isolation() { } let peer = peer.unwrap(); cluster.run_node(2).unwrap(); - // Originally leader of region ['', 'k2'] will go to sleep, so the learner peer cannot be created. + // Originally leader of region ['', 'k2'] will go to sleep, so the learner peer + // cannot be created. 
for _ in 0..10 { let resp = async_read_on_peer(&mut cluster, peer.clone(), r2.clone(), b"k1", true, true); let resp = resp.recv_timeout(Duration::from_secs(1)).unwrap(); @@ -451,8 +453,9 @@ fn test_split_isolation() { panic!("test failed"); } -/// Testing after applying snapshot, the `ReadDelegate` stored at `StoreMeta` will be replace with -/// the new `ReadDelegate`, and the `ReadDelegate` stored at `LocalReader` should also be updated +/// Testing after applying snapshot, the `ReadDelegate` stored at `StoreMeta` +/// will be replace with the new `ReadDelegate`, and the `ReadDelegate` stored +/// at `LocalReader` should also be updated #[test] fn test_read_local_after_snapshpot_replace_peer() { let mut cluster = new_node_cluster(0, 3); @@ -472,8 +475,9 @@ fn test_read_local_after_snapshpot_replace_peer() { must_get_equal(&cluster.get_engine(i), b"k1", b"v1"); } - // send read request to peer 3, so the local reader will cache the `ReadDelegate` of peer 3 - // it is okey only send one request because the read pool thread count is 1 + // send read request to peer 3, so the local reader will cache the + // `ReadDelegate` of peer 3 it is okay only send one request because the + // read pool thread count is 1 let r = cluster.get_region(b"k1"); // wait applying snapshot finish sleep_ms(100); @@ -516,7 +520,8 @@ fn test_read_local_after_snapshpot_replace_peer() { assert_eq!(exp_value, b"v3"); } -/// The case checks if a malformed request should not corrupt the leader's read queue. +/// The case checks if a malformed request should not corrupt the leader's read +/// queue. 
#[test] fn test_malformed_read_index() { let mut cluster = new_node_cluster(0, 3); diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index dc496ef9637..3eddc7ce40d 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -38,8 +38,8 @@ fn run_cluster(cluster: &mut Cluster) { cluster.must_put(b"k1", b"v0"); } -/// When using DrAutoSync replication mode, data should be replicated to different labels -/// before committed. +/// When using DrAutoSync replication mode, data should be replicated to +/// different labels before committed. #[test] fn test_dr_auto_sync() { let mut cluster = prepare_cluster(); @@ -212,22 +212,22 @@ fn test_update_group_id() { cluster.must_split(®ion, b"k2"); let left = pd_client.get_region(b"k0").unwrap(); let right = pd_client.get_region(b"k2").unwrap(); - // When a node is started, all store information are loaded at once, so we need an extra node - // to verify resolve will assign group id. + // When a node is started, all store information are loaded at once, so we need + // an extra node to verify resolve will assign group id. cluster.add_label(3, "zone", "WS"); cluster.add_new_engine(); pd_client.must_add_peer(left.id, new_peer(2, 2)); pd_client.must_add_peer(left.id, new_learner_peer(3, 3)); pd_client.must_add_peer(left.id, new_peer(3, 3)); - // If node 3's group id is not assigned, leader will make commit index as the smallest last - // index of all followers. + // If node 3's group id is not assigned, leader will make commit index as the + // smallest last index of all followers. cluster.add_send_filter(IsolationFilterFactory::new(2)); cluster.must_put(b"k11", b"v11"); must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); must_get_equal(&cluster.get_engine(1), b"k11", b"v11"); - // So both node 1 and node 3 have fully resolved all stores. 
Further updates to group ID have - // to be done when applying conf change and snapshot. + // So both node 1 and node 3 have fully resolved all stores. Further updates to + // group ID have to be done when applying conf change and snapshot. cluster.clear_send_filters(); pd_client.must_add_peer(right.id, new_peer(2, 4)); pd_client.must_add_peer(right.id, new_learner_peer(3, 5)); @@ -348,7 +348,8 @@ fn test_replication_mode_allowlist() { must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } -/// Ensures hibernate region still works properly when switching replication mode. +/// Ensures hibernate region still works properly when switching replication +/// mode. #[test] fn test_switching_replication_mode_hibernate() { let mut cluster = new_server_cluster(0, 3); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 180e5fb1334..49ecf13c1d9 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -165,11 +165,13 @@ fn test_server_snap_gc_internal(version: &str) { let actual_max_per_file_size = cluster.get_snap_mgr(1).get_actual_max_per_file_size(true); - // version > 6.0.0 should enable multi_snapshot_file feature, which means actual max_per_file_size equals the config + // version > 6.0.0 should enable multi_snapshot_file feature, which means actual + // max_per_file_size equals the config if version == "6.5.0" { assert!(actual_max_per_file_size == cluster.cfg.raft_store.max_snapshot_file_raw_size.0); } else { - // the feature is disabled, and the actual_max_per_file_size should be u64::MAX (so that only one file is generated) + // the feature is disabled, and the actual_max_per_file_size should be u64::MAX + // (so that only one file is generated) assert!(actual_max_per_file_size == u64::MAX); } @@ -243,7 +245,8 @@ fn test_concurrent_snap(cluster: &mut Cluster) { if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { panic!("the snapshot is not sent before split, e: 
{:?}", e); } - // Split the region range and then there should be another snapshot for the split ranges. + // Split the region range and then there should be another snapshot for the + // split ranges. cluster.must_split(®ion, b"k2"); must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); // Ensure the regions work after split. @@ -521,10 +524,11 @@ fn test_inspected_snapshot() { // Test snapshot generating and receiving can share one I/O limiter fairly. // 1. Bootstrap a 1 Region, 1 replica cluster; -// 2. Add a peer on store 2 for the Region, so that there is a snapshot received on store 2; -// 3. Rename the received snapshot on store 2, and then keep sending it back to store 1; -// 4. Add another peer for the Region, so store 1 will generate a new snapshot; -// 5. Test the generating can success while the store keeps receiving snapshots from store 2. +// 2. Add a peer on store 2 for the Region, so that there is a snapshot received +// on store 2; 3. Rename the received snapshot on store 2, and then keep sending +// it back to store 1; 4. Add another peer for the Region, so store 1 will +// generate a new snapshot; 5. Test the generating can success while the store +// keeps receiving snapshots from store 2. #[test] fn test_gen_during_heavy_recv() { let mut cluster = new_server_cluster(0, 3); @@ -608,7 +612,8 @@ fn test_gen_during_heavy_recv() { } }); - // While store 1 keeps receiving snapshots, it should still can generate a snapshot on time. + // While store 1 keeps receiving snapshots, it should still can generate a + // snapshot on time. pd_client.must_add_peer(r1, new_learner_peer(3, 3)); sleep_ms(500); must_get_equal(&cluster.get_engine(3), b"zzz-0000", b"value"); @@ -653,8 +658,8 @@ fn random_long_vec(length: usize) -> Vec { value } -/// Snapshot is generated using apply term from apply thread, which should be set -/// correctly otherwise lead to unconsistency. 
+/// Snapshot is generated using apply term from apply thread, which should be +/// set correctly otherwise lead to inconsistency. #[test] fn test_correct_snapshot_term() { // Use five replicas so leader can send a snapshot to a new peer without @@ -697,8 +702,8 @@ fn test_correct_snapshot_term() { // Clears send filters so peer 4 can accept snapshot from peer 5. If peer 5 // didn't set apply index correctly using snapshot in apply worker, the snapshot // will be generated as term 0. Raft consider term of missing index as 0, so - // peer 4 will accept the snapshot and think it has already applied it, hence fast - // forward it then panic. + // peer 4 will accept the snapshot and think it has already applied it, hence + // fast forward it then panic. cluster.clear_send_filters(); must_get_equal(&cluster.get_engine(4), b"k0", b"v0"); cluster.clear_send_filters(); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 53c56510574..91022892f96 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -572,16 +572,17 @@ fn test_split_region_diff_check(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); - // The default size index distance is too large for small data, - // we flush multiple times to generate more size index handles. + // The default size index distance is too large for small data, we flush + // multiple times to generate more size index handles. for _ in 0..10 { put_till_size(cluster, region_max_size, &mut range); } - // Peer will split when size of region meet region_max_size, - // so assume the last region_max_size of data is not involved in split, - // there will be at least (region_max_size * 10 - region_max_size) / region_split_size regions. - // But region_max_size of data should be split too, so there will be at least 2 more regions. 
+ // Peer will split when size of region meet region_max_size, so assume the last + // region_max_size of data is not involved in split, there will be at least + // `(region_max_size * 10 - region_max_size) / region_split_size` regions. + // But region_max_size of data should be split too, so there will be at + // least 2 more regions. let min_region_cnt = (region_max_size * 10 - region_max_size) / region_split_size + 2; let mut try_cnt = 0; @@ -757,9 +758,10 @@ fn test_node_split_epoch_not_match_right_derive() { test_split_epoch_not_match(&mut cluster, true); } -// For the peer which is the leader of the region before split, -// it should campaigns immediately. and then this peer may take the leadership earlier. -// `test_quick_election_after_split` is a helper function for testing this feature. +// For the peer which is the leader of the region before split, it should +// campaigns immediately. and then this peer may take the leadership +// earlier. `test_quick_election_after_split` is a helper function for testing +// this feature. fn test_quick_election_after_split(cluster: &mut Cluster) { // Calculate the reserved time before a new campaign after split. let reserved_time = @@ -778,8 +780,8 @@ fn test_quick_election_after_split(cluster: &mut Cluster) { // The campaign should always succeeds in the ideal test environment. let new_region = cluster.get_region(b"k3"); - // Ensure the new leader is established for the newly split region, and it shares the - // same store with the leader of old region. + // Ensure the new leader is established for the newly split region, and it + // shares the same store with the leader of old region. let new_leader = cluster.query_leader( old_leader.get_store_id(), new_region.get_id(), @@ -1070,7 +1072,8 @@ fn test_refresh_region_bucket_keys() { ); assert_eq!(bucket_version3, bucket_version2); - // now the buckets is ["", "k12", ""]. 
further split ["", k12], [k12, ""] buckets into more buckets + // now the buckets is ["", "k12", ""]. further split ["", k12], [k12, ""] + // buckets into more buckets let region = pd_client.get_region(b"k11").unwrap(); let bucket_ranges = vec![ BucketRange(vec![], b"k12".to_vec()), @@ -1202,7 +1205,8 @@ fn test_gen_split_check_bucket_ranges() { let mut cluster = new_server_cluster(0, count); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); cluster.cfg.coprocessor.enable_region_bucket = true; - // disable report buckets; as it will reset the user traffic stats to randmize the test result + // disable report buckets; as it will reset the user traffic stats to randomize + // the test result cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::secs(5); // Make merge check resume quickly. cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); @@ -1248,7 +1252,8 @@ fn test_gen_split_check_bucket_ranges() { Option::None, Some(expected_buckets.clone()), ); - // because the diff between last_bucket_regions and bucket_regions is zero, bucket range for split check should be empty. + // because the diff between last_bucket_regions and bucket_regions is zero, + // bucket range for split check should be empty. let expected_bucket_ranges = vec![]; cluster.send_half_split_region_message(®ion, Some(expected_bucket_ranges)); diff --git a/tests/integrations/raftstore/test_stale_peer.rs b/tests/integrations/raftstore/test_stale_peer.rs index e9edcc49966..e12584d6c60 100644 --- a/tests/integrations/raftstore/test_stale_peer.rs +++ b/tests/integrations/raftstore/test_stale_peer.rs @@ -15,17 +15,19 @@ use tikv_util::{config::ReadableDuration, HandyRwLock}; /// If a peer detects the leader is missing for a specified long time, /// it should consider itself as a stale peer which is removed from the region. /// This test case covers the following scenario: -/// At first, there are three peer A, B, C in the cluster, and A is leader. 
-/// Peer B gets down. And then A adds D, E, F into the cluster. -/// Peer D becomes leader of the new cluster, and then removes peer A, B, C. -/// After all these peer in and out, now the cluster has peer D, E, F. -/// If peer B goes up at this moment, it still thinks it is one of the cluster -/// and has peers A, C. However, it could not reach A, C since they are removed from -/// the cluster or probably destroyed. -/// Meantime, D, E, F would not reach B, Since it's not in the cluster anymore. -/// In this case, Peer B would notice that the leader is missing for a long time, -/// and it would check with pd to confirm whether it's still a member of the cluster. -/// If not, it should destroy itself as a stale peer which is removed out already. +/// - At first, there are three peer A, B, C in the cluster, and A is leader. +/// - Peer B gets down. And then A adds D, E, F into the cluster. +/// - Peer D becomes leader of the new cluster, and then removes peer A, B, C. +/// - After all these peer in and out, now the cluster has peer D, E, F. +/// - If peer B goes up at this moment, it still thinks it is one of the +/// cluster and has peers A, C. However, it could not reach A, C since they +/// are removed from the cluster or probably destroyed. +/// - Meantime, D, E, F would not reach B, Since it's not in the cluster +/// anymore. +/// In this case, Peer B would notice that the leader is missing for a long +/// time, and it would check with pd to confirm whether it's still a member of +/// the cluster. If not, it should destroy itself as a stale peer which is +/// removed out already. fn test_stale_peer_out_of_region(cluster: &mut Cluster) { let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. @@ -47,7 +49,8 @@ fn test_stale_peer_out_of_region(cluster: &mut Cluster) { cluster.add_send_filter(IsolationFilterFactory::new(2)); // In case 2 is leader, it will fail to pass the healthy nodes check, - // so remove isolated node first. 
Because 2 is isolated, so it can't remove itself. + // so remove isolated node first. Because 2 is isolated, so it can't remove + // itself. pd_client.must_remove_peer(r1, new_peer(2, 2)); // Add peer [(4, 4), (5, 5), (6, 6)]. @@ -96,18 +99,18 @@ fn test_server_stale_peer_out_of_region() { test_stale_peer_out_of_region(&mut cluster); } -/// A help function for testing the behaviour of the gc of stale peer -/// which is out or region. -/// If a peer detects the leader is missing for a specified long time, -/// it should consider itself as a stale peer which is removed from the region. -/// This test case covers the following scenario: -/// A peer, B is initialized as a replicated peer without data after -/// receiving a single raft AE message. But then it goes through some process like -/// the case of `test_stale_peer_out_of_region`, it's removed out of the region -/// and wouldn't be contacted anymore. -/// In both cases, peer B would notice that the leader is missing for a long time, -/// and it's an initialized peer without any data. It would destroy itself as -/// as stale peer directly and should not impact other region data on the same store. +/// A help function for testing the behaviour of the gc of stale peer which is +/// out or region. If a peer detects the leader is missing for a specified long +/// time, it should consider itself as a stale peer which is removed from the +/// region. This test case covers the following scenario: +/// - A peer, B is initialized as a replicated peer without data after receiving +/// a single raft AE message. But then it goes through some process like the +/// case of `test_stale_peer_out_of_region`, it's removed out of the region +/// and wouldn't be contacted anymore. +/// In both cases, peer B would notice that the leader is missing for a long +/// time, and it's an initialized peer without any data. It would destroy itself +/// as stale peer directly and should not impact other region data on the +/// same store. 
fn test_stale_peer_without_data(cluster: &mut Cluster, right_derive: bool) { cluster.cfg.raft_store.right_derive_when_split = right_derive; diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 189587dea44..21adc354295 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -263,9 +263,9 @@ fn test_server_stale_meta() { /// Tests a tombstone peer won't trigger wrong gc message. /// -/// An uninitialized peer's peer list is empty. If a message from a healthy peer passes -/// all the other checks accidentally, it may trigger a tombstone message which will -/// make the healthy peer destroy all its data. +/// An uninitialized peer's peer list is empty. If a message from a healthy peer +/// passes all the other checks accidentally, it may trigger a tombstone message +/// which will make the healthy peer destroy all its data. #[test] fn test_safe_tombstone_gc() { let mut cluster = new_node_cluster(0, 5); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index cb1c970914d..86789fc8f7f 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -361,7 +361,8 @@ fn test_memory_pessimistic_locks_status_after_transfer_leader_failure() { LocksStatus::TransferringLeader ); - // After several ticks, in-memory pessimistic locks should become available again. + // After several ticks, in-memory pessimistic locks should become available + // again. 
thread::sleep(Duration::from_secs(1)); assert_eq!(txn_ext.pessimistic_locks.read().status, LocksStatus::Normal); cluster.reset_leader_of_region(1); diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index ebeb99ddfe7..cf2361ebc8e 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -305,8 +305,9 @@ fn test_unsafe_recovery_already_in_joint_state() { assert!(promoted); } -// Tests whether unsafe recovery behaves correctly when the failed region is already in the -// middle of a joint state, once exit, it recovers itself without any further demotions. +// Tests whether unsafe recovery behaves correctly when the failed region is +// already in the middle of a joint state, once exit, it recovers itself without +// any further demotions. #[test] fn test_unsafe_recovery_early_return_after_exit_joint_state() { let mut cluster = new_server_cluster(0, 3); @@ -789,16 +790,19 @@ fn test_force_leader_trigger_snapshot() { find_peer(®ion, 3).unwrap().clone(), ); let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); - // Though it has a force leader now, but the command can't committed because the log is not replicated to all the alive peers. + // Though it has a force leader now, but the command can't committed because the + // log is not replicated to all the alive peers. assert!( cluster .call_command_on_leader(req, Duration::from_millis(1000)) .unwrap() .get_header() - .has_error() // error "there is a pending conf change" indicating no committed log after being the leader + .has_error() /* error "there is a pending conf change" indicating no committed log + * after being the leader */ ); - // Permit snapshot message, snapshot should be applied and advance commit index now. + // Permit snapshot message, snapshot should be applied and advance commit index + // now. 
cluster.sim.wl().clear_recv_filters(2); cluster .pd_client @@ -863,7 +867,8 @@ fn test_force_leader_with_uncommitted_conf_change() { * 2, )); cluster.must_enter_force_leader(region.get_id(), 1, vec![3, 4, 5]); - // the uncommitted conf-change is committed successfully after being force leader + // the uncommitted conf-change is committed successfully after being force + // leader cluster .pd_client .must_none_peer(region.get_id(), find_peer(®ion, 2).unwrap().clone()); @@ -885,12 +890,13 @@ fn test_force_leader_with_uncommitted_conf_change() { assert_eq!(cluster.must_get(b"k4"), Some(b"v4".to_vec())); } -// Test the case that none of five nodes fails and force leader on one of the nodes. -// Note: It still can't defend extreme misuse cases. For example, a group of a, -// b and c. c is isolated from a, a is the leader. If c has increased its term -// by 2 somehow (for example false prevote success twice) and force leader is -// sent to b and break lease constrain, then b will reject a's heartbeat while -// can vote for c. So c becomes leader and there are two leaders in the group. +// Test the case that none of five nodes fails and force leader on one of the +// nodes. Note: It still can't defend extreme misuse cases. For example, a group +// of a, b and c. c is isolated from a, a is the leader. If c has increased its +// term by 2 somehow (for example false prevote success twice) and force leader +// is sent to b and break lease constrain, then b will reject a's heartbeat +// while can vote for c. So c becomes leader and there are two leaders in the +// group. 
#[test] fn test_force_leader_on_healthy_region() { let mut cluster = new_node_cluster(0, 5); @@ -920,7 +926,8 @@ fn test_force_leader_on_healthy_region() { assert_eq!(cluster.must_get(b"k1"), Some(b"v1".to_vec())); cluster.must_put(b"k2", b"v2"); - // try to exit force leader, it will be ignored silently as it's not in the force leader state + // try to exit force leader, it will be ignored silently as it's not in the + // force leader state cluster.exit_force_leader(region.get_id(), 1); cluster.must_put(b"k4", b"v4"); @@ -1147,15 +1154,17 @@ fn test_force_leader_multiple_election_rounds() { } // Tests whether unsafe recovery report sets has_commit_merge correctly. -// This field is used by PD to issue force leader command in order, so that the recovery process -// does not break the merge accidentally, when: -// * The source region and the target region lost their quorum. -// * The living peer(s) of the source region does not have prepare merge message replicated. -// * The living peer(s) of the target region has commit merge messages replicated but -// uncommitted. -// If the living peer(s) of the source region in the above example enters force leader state before -// the peer(s) of the target region, thus proposes a no-op entry (while becoming the leader) which -// is conflict with part of the catch up logs, there will be data loss. +// This field is used by PD to issue force leader command in order, so that the +// recovery process does not break the merge accidentally, when: +// * The source region and the target region lost their quorum. +// * The living peer(s) of the source region does not have prepare merge +// message replicated. +// * The living peer(s) of the target region has commit merge messages +// replicated but uncommitted. 
+// If the living peer(s) of the source region in the above example enters force +// leader state before the peer(s) of the target region, thus proposes a no-op +// entry (while becoming the leader) which is conflict with part of the catch up +// logs, there will be data loss. #[test] fn test_unsafe_recovery_has_commit_merge() { let mut cluster = new_node_cluster(0, 3); @@ -1178,8 +1187,8 @@ fn test_unsafe_recovery_has_commit_merge() { let right_on_store1 = find_peer(&right, 1).unwrap(); cluster.must_transfer_leader(right.get_id(), right_on_store1.clone()); - // Block the target region from receiving MsgAppendResponse, so that the commit merge message - // will only be replicated but not committed. + // Block the target region from receiving MsgAppendResponse, so that the commit + // merge message will only be replicated but not committed. let recv_filter = Box::new( RegionPacketFilter::new(right.get_id(), 1) .direction(Direction::Recv) @@ -1236,15 +1245,15 @@ fn test_unsafe_recovery_during_merge() { let right_on_store1 = find_peer(&right, 1).unwrap(); cluster.must_transfer_leader(right.get_id(), right_on_store1.clone()); - // Blocks the replication of prepare merge message, so that the commit merge back fills it - // in CatchUpLogs. + // Blocks the replication of prepare merge message, so that the commit merge + // back fills it in CatchUpLogs. let append_filter = Box::new( RegionPacketFilter::new(left.get_id(), 2) .direction(Direction::Recv) .msg_type(MessageType::MsgAppend), ); - // Blocks the target region from receiving MsgAppendResponse, so that the commit merge message - // will only be replicated but not committed. + // Blocks the target region from receiving MsgAppendResponse, so that the commit + // merge message will only be replicated but not committed. 
let commit_filter = Box::new( RegionPacketFilter::new(right.get_id(), 1) .direction(Direction::Recv) diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index 1ce3cc6415a..4f521cb1da7 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -226,7 +226,8 @@ fn test_applied_lock_collector() { assert_eq!(resp.get_locks().len(), 1024); }); - // Register lock observer at a later safe point. Lock observer should reset its state. + // Register lock observer at a later safe point. Lock observer should reset its + // state. safe_point += 1; clients.iter().for_each(|(_, c)| { must_register_lock_observer(c, safe_point); @@ -266,8 +267,8 @@ fn test_applied_lock_collector() { }); } -// Since v5.0 GC bypasses Raft, which means GC scans/deletes records with `keys::DATA_PREFIX`. -// This case ensures it's performed correctly. +// Since v5.0 GC bypasses Raft, which means GC scans/deletes records with +// `keys::DATA_PREFIX`. This case ensures it's performed correctly. #[test] fn test_gc_bypass_raft() { let (cluster, leader, ctx) = must_new_cluster_mul(1); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 367f38114f6..95d1494c660 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1444,7 +1444,7 @@ macro_rules! test_func { macro_rules! 
test_func_init { ($client:ident, $ctx:ident, $call_opt:ident, $func:ident, $req:ident) => {{ test_func!($client, $ctx, $call_opt, $func, $req::default()) }}; - ($client:ident, $ctx:ident, $call_opt:ident, $func:ident, $req:ident, batch) => {{ + ($client:ident, $ctx:ident, $call_opt:ident, $func:ident, $req:ident,batch) => {{ test_func!($client, $ctx, $call_opt, $func, { let mut req = $req::default(); req.set_keys(vec![b"key".to_vec()].into()); @@ -1664,7 +1664,8 @@ fn test_tikv_forwarding() { } } -/// Test if forwarding works correctly if the target node is shutdown and restarted. +/// Test if forwarding works correctly if the target node is shutdown and +/// restarted. #[test] fn test_forwarding_reconnect() { let (mut cluster, client, call_opt, ctx) = setup_cluster(); @@ -1753,7 +1754,8 @@ fn test_get_lock_wait_info_api() { // Test API version verification for transaction requests. // See the following for detail: // * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. -// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, enum APIVersion. +// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, +// enum APIVersion. #[test] fn test_txn_api_version() { const TIDB_KEY_CASE: &[u8] = b"t_a"; @@ -1831,7 +1833,7 @@ fn test_txn_api_version() { let expect_prefix = format!("Error({}", errcode); assert!(!errs.is_empty(), "case {}", i); assert!( - errs[0].get_abort().starts_with(&expect_prefix), // e.g. Error(ApiVersionNotMatched { storage_api_version: V1, req_api_version: V2 }) + errs[0].get_abort().starts_with(&expect_prefix), /* e.g. 
Error(ApiVersionNotMatched { storage_api_version: V1, req_api_version: V2 }) */ "case {}: errs[0]: {:?}, expected: {}", i, errs[0], @@ -1956,7 +1958,8 @@ fn test_txn_api_version() { #[test] fn test_storage_with_quota_limiter_enable() { let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { - // write_bandwidth is limited to 1, which means that every write request will trigger the limit. + // write_bandwidth is limited to 1, which means that every write request will + // trigger the limit. let quota_config = QuotaConfig { foreground_cpu_time: 2000, foreground_write_bandwidth: ReadableSize(10), diff --git a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index 4fe3b98ebe1..d796d9c1f66 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -20,8 +20,8 @@ fn deadlock(client: &TikvClient, ctx: Context, key1: &[u8], ts: u64) -> bool { let (client_clone, mut ctx_clone, key1_clone) = (client.clone(), ctx.clone(), key1.clone()); let handle = thread::spawn(move || { - // `resource_group_tag` is set to check if the wait chain reported by the deadlock error - // carries the correct information. + // `resource_group_tag` is set to check if the wait chain reported by the + // deadlock error carries the correct information. ctx_clone.set_resource_group_tag(b"tag1".to_vec()); let resp = kv_pessimistic_lock( &client_clone, @@ -80,8 +80,8 @@ fn build_leader_client(cluster: &mut Cluster, key: &[u8]) -> (Tik /// Creates a deadlock on the store containing key. fn must_detect_deadlock(cluster: &mut Cluster, key: &[u8], ts: u64) { - // Sometimes, deadlocks can't be detected at once due to leader change, but it will be - // detected. + // Sometimes, deadlocks can't be detected at once due to leader change, but it + // will be detected. 
for _ in 0..5 { let (client, ctx) = build_leader_client(cluster, key); if deadlock(&client, ctx, key, ts) { @@ -118,8 +118,8 @@ fn must_transfer_leader(cluster: &mut Cluster, region_key: &[u8], /// Transfers the region containing region_key from source store to target peer. /// -/// REQUIRE: The source store must be the leader the region and the target store must not have -/// this region. +/// REQUIRE: The source store must be the leader the region and the target store +/// must not have this region. fn must_transfer_region( cluster: &mut Cluster, region_key: &[u8], @@ -168,7 +168,8 @@ fn find_peer_of_store(region: &Region, store_id: u64) -> Peer { .clone() } -/// Creates a cluster with only one region and store(1) is the leader of the region. +/// Creates a cluster with only one region and store(1) is the leader of the +/// region. fn new_cluster_for_deadlock_test(count: usize) -> Cluster { let mut cluster = new_server_cluster(0, count); cluster.cfg.pessimistic_txn.wait_for_lock_timeout = ReadableDuration::millis(500); @@ -229,8 +230,8 @@ fn test_detect_deadlock_when_split_region() { #[test] fn test_detect_deadlock_when_transfer_region() { let mut cluster = new_cluster_for_deadlock_test(4); - // Transfer the leader region to store(4) and the leader of deadlock detector should be - // also transfered. + // Transfer the leader region to store(4) and the leader of deadlock detector + // should be also transferred. must_transfer_region(&mut cluster, b"k", 1, 4, 4); deadlock_detector_leader_must_be(&mut cluster, 4); must_detect_deadlock(&mut cluster, b"k", 10); @@ -242,8 +243,8 @@ fn test_detect_deadlock_when_transfer_region() { must_detect_deadlock(&mut cluster, b"k", 10); must_detect_deadlock(&mut cluster, b"k1", 10); - // Transfer the new region back to store(4) which will send a role change message with empty - // key range. It shouldn't affect deadlock detector. 
+ // Transfer the new region back to store(4) which will send a role change + // message with empty key range. It shouldn't affect deadlock detector. must_transfer_region(&mut cluster, b"k1", 1, 4, 6); deadlock_detector_leader_must_be(&mut cluster, 4); must_detect_deadlock(&mut cluster, b"k", 10); diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index de7c238e2c3..c3964ab39d8 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -236,8 +236,8 @@ fn test_batch_size_limit() { assert_eq!(msg_count.load(Ordering::SeqCst), 10); } -/// In edge case that the estimated size may be inaccurate, we need to ensure connection -/// will not be broken in this case. +/// In edge case that the estimated size may be inaccurate, we need to ensure +/// connection will not be broken in this case. #[test] fn test_batch_size_edge_limit() { let msg_count = Arc::new(AtomicUsize::new(0)); @@ -247,13 +247,14 @@ fn test_batch_size_edge_limit() { let mut raft_client = get_raft_client_by_port(port); - // Put them in buffer so sibling messages will be likely be batched during sending. + // Put them in buffer so sibling messages will be likely be batched during + // sending. let mut msgs = Vec::with_capacity(5); for _ in 0..5 { let mut raft_m = RaftMessage::default(); - // Magic number, this can make estimated size about 4940000, hence two messages will be - // batched together, but the total size will be way largher than 10MiB as there are many - // indexes and terms. + // Magic number, this can make estimated size about 4940000, hence two messages + // will be batched together, but the total size will be way larger than + // 10MiB as there are many indexes and terms. for _ in 0..38000 { let mut e = Entry::default(); e.set_term(1); @@ -275,8 +276,9 @@ fn test_batch_size_edge_limit() { assert_eq!(msg_count.load(Ordering::SeqCst), 5); } -// Try to create a mock server with `service`. 
The server will be binded wiht a random -// port chosen between [`min_port`, `max_port`]. Return `None` if no port is available. +// Try to create a mock server with `service`. The server will be bounded with a +// random port chosen between [`min_port`, `max_port`]. Return `None` if no port +// is available. fn create_mock_server(service: T, min_port: u16, max_port: u16) -> Option<(Server, u16)> where T: Tikv + Clone + Send + 'static, diff --git a/tests/integrations/storage/test_storage.rs b/tests/integrations/storage/test_storage.rs index 72eabdb7828..21c9db6fe42 100644 --- a/tests/integrations/storage/test_storage.rs +++ b/tests/integrations/storage/test_storage.rs @@ -913,7 +913,8 @@ const RAW_KEY_CASE: &[u8] = b"r\0_a"; // Test API version verification for txnkv requests. // See the following for detail: // * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. -// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, enum APIVersion. +// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, +// enum APIVersion. #[test] fn test_txn_store_txnkv_api_version() { let test_data = vec![ @@ -967,7 +968,8 @@ fn test_txn_store_txnkv_api_version() { store.scan_err(key, None, 100, 10); - // To compatible with TiDB gc-worker, we remove check_api_version_ranges in scan_lock + // To compatible with TiDB gc-worker, we remove check_api_version_ranges in + // scan_lock store.scan_locks_ok(20, key, &end_key, 10, vec![]); store.delete_range_err(key, key); @@ -979,7 +981,8 @@ fn test_txn_store_txnkv_api_version() { // Test API version verification for rawkv requests. // See the following for detail: // * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. -// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, enum APIVersion. +// * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, +// enum APIVersion. 
#[test] fn test_txn_store_rawkv_api_version() { let test_data = vec![ From b22be438650b914231c6ea4b0afb9c72538044c9 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 28 Jul 2022 16:53:11 +0800 Subject: [PATCH 119/676] *: rename all DB and CF (#13149) close tikv/tikv#12394 to Db and Cf, and ColumnFamily to Cf, IO to Io Signed-off-by: Jay Lee --- cmd/tikv-ctl/src/executor.rs | 16 +- cmd/tikv-ctl/src/main.rs | 14 +- .../backup-stream/src/subscription_manager.rs | 2 +- components/backup/src/endpoint.rs | 18 +- components/backup/src/utils.rs | 4 +- components/batch-system/src/batch.rs | 4 +- components/cloud/gcp/src/gcs.rs | 14 +- components/cloud/gcp/src/lib.rs | 2 +- components/encryption/export/src/lib.rs | 4 +- components/encryption/src/crypter.rs | 34 +-- components/encryption/src/file_dict_file.rs | 4 +- components/encryption/src/lib.rs | 5 +- components/encryption/src/manager/mod.rs | 34 +-- components/engine_panic/src/cf_names.rs | 4 +- components/engine_panic/src/cf_options.rs | 18 +- components/engine_panic/src/db_options.rs | 20 +- components/engine_panic/src/db_vector.rs | 10 +- components/engine_panic/src/engine.rs | 8 +- components/engine_panic/src/snapshot.rs | 8 +- components/engine_rocks/src/cf_names.rs | 4 +- components/engine_rocks/src/cf_options.rs | 16 +- components/engine_rocks/src/compact.rs | 8 +- components/engine_rocks/src/db_options.rs | 46 ++-- components/engine_rocks/src/db_vector.rs | 18 +- components/engine_rocks/src/engine.rs | 12 +- components/engine_rocks/src/event_listener.rs | 28 +- components/engine_rocks/src/file_system.rs | 30 +- components/engine_rocks/src/import.rs | 4 +- components/engine_rocks/src/lib.rs | 2 +- components/engine_rocks/src/logger.rs | 4 +- components/engine_rocks/src/misc.rs | 8 +- components/engine_rocks/src/properties.rs | 4 +- components/engine_rocks/src/raft_engine.rs | 4 +- components/engine_rocks/src/snapshot.rs | 12 +- components/engine_rocks/src/util.rs | 52 ++-- components/engine_rocks/src/write_batch.rs | 
6 +- components/engine_test/src/lib.rs | 89 +++--- components/engine_traits/src/cf_names.rs | 2 +- components/engine_traits/src/cf_options.rs | 14 +- components/engine_traits/src/db_options.rs | 14 +- components/engine_traits/src/db_vector.rs | 2 +- components/engine_traits/src/engine.rs | 16 +- components/engine_traits/src/file_system.rs | 10 +- components/engine_traits/src/lib.rs | 2 +- components/engine_traits/src/misc.rs | 4 +- components/engine_traits/src/peekable.rs | 10 +- components/engine_traits/src/raft_engine.rs | 4 +- .../engine_traits_tests/src/cf_names.rs | 2 +- components/engine_traits_tests/src/ctor.rs | 23 +- components/engine_traits_tests/src/lib.rs | 7 +- .../src/scenario_writes.rs | 2 +- .../external_storage/export/src/export.rs | 10 +- components/external_storage/src/lib.rs | 4 +- components/file_system/src/file.rs | 22 +- .../file_system/src/io_stats/biosnoop.rs | 76 +++--- components/file_system/src/io_stats/mod.rs | 20 +- components/file_system/src/io_stats/proc.rs | 56 ++-- components/file_system/src/lib.rs | 86 +++--- components/file_system/src/metrics.rs | 30 +- components/file_system/src/metrics_manager.rs | 22 +- components/file_system/src/rate_limiter.rs | 258 +++++++++--------- components/raft_log_engine/src/engine.rs | 22 +- .../src/coprocessor/split_check/half.rs | 6 +- .../src/coprocessor/split_check/keys.rs | 10 +- .../src/coprocessor/split_check/size.rs | 50 ++-- .../raftstore/src/store/compaction_guard.rs | 6 +- components/raftstore/src/store/peer.rs | 4 +- .../raftstore/src/store/peer_storage.rs | 8 +- .../raftstore/src/store/region_snapshot.rs | 6 +- components/raftstore/src/store/snap.rs | 44 ++- components/raftstore/src/store/snap/io.rs | 7 +- .../raftstore/src/store/worker/compact.rs | 12 +- .../raftstore/src/store/worker/raftlog_gc.rs | 10 +- .../src/store/worker/refresh_config.rs | 4 +- .../raftstore/src/store/worker/region.rs | 12 +- .../raftstore/src/store/worker/split_check.rs | 4 +- components/security/src/lib.rs | 6 
+- components/server/src/server.rs | 24 +- components/sst_importer/src/import_mode.rs | 56 ++-- components/sst_importer/src/sst_importer.rs | 10 +- components/sst_importer/src/util.rs | 14 +- components/test_coprocessor/src/dag.rs | 10 +- components/test_raftstore/src/cluster.rs | 4 +- components/test_raftstore/src/util.rs | 4 +- components/test_sst_importer/src/lib.rs | 6 +- .../src/codec/collation/encoding/utf8.rs | 14 +- .../src/codec/collation/mod.rs | 4 +- .../src/codec/data_type/mod.rs | 30 +- .../src/codec/data_type/scalar.rs | 2 +- .../tidb_query_datatype/src/codec/datum.rs | 2 +- .../src/codec/mysql/decimal.rs | 2 +- .../src/codec/mysql/duration.rs | 2 +- .../src/codec/mysql/enums.rs | 2 +- .../src/codec/mysql/json/mod.rs | 2 +- .../src/codec/mysql/json/serde.rs | 10 +- .../src/codec/mysql/set.rs | 2 +- .../src/codec/mysql/time/mod.rs | 2 +- .../src/codec/row/v2/row_slice.rs | 28 +- .../src/index_scan_executor.rs | 6 +- .../src/selection_executor.rs | 2 +- components/tikv_kv/src/cursor.rs | 4 +- components/tikv_kv/src/rocksdb_engine.rs | 8 +- components/tikv_util/src/logger/mod.rs | 6 +- scripts/clippy | 2 +- src/config.rs | 79 +++--- src/coprocessor/dag/mod.rs | 8 +- src/coprocessor/dag/storage_impl.rs | 8 +- src/coprocessor/readpool_impl.rs | 6 +- src/import/sst_service.rs | 6 +- src/read_pool.rs | 4 +- src/server/debug.rs | 54 ++-- src/server/engine_factory.rs | 4 +- src/server/engine_factory_v2.rs | 4 +- .../gc_worker/applied_lock_collector.rs | 2 +- src/server/gc_worker/compaction_filter.rs | 22 +- src/server/gc_worker/gc_manager.rs | 2 +- src/server/gc_worker/gc_worker.rs | 12 +- src/server/gc_worker/mod.rs | 4 +- .../gc_worker/rawkv_compaction_filter.rs | 8 +- src/server/snap.rs | 14 +- src/storage/config.rs | 104 +++---- src/storage/config_manager.rs | 8 +- src/storage/kv/test_engine_builder.rs | 6 +- src/storage/mod.rs | 4 +- src/storage/mvcc/consistency_check.rs | 12 +- src/storage/mvcc/reader/reader.rs | 4 +- src/storage/read_pool.rs | 6 +- 
.../singleton_flow_controller.rs | 18 +- .../flow_controller/tablet_flow_controller.rs | 6 +- src/storage/txn/sched_pool.rs | 4 +- .../misc/writebatch/bench_writebatch.rs | 8 +- tests/failpoints/cases/test_coprocessor.rs | 6 +- tests/failpoints/cases/test_encryption.rs | 4 +- tests/failpoints/cases/test_gc_worker.rs | 6 +- tests/integrations/config/mod.rs | 32 +-- tests/integrations/coprocessor/test_select.rs | 70 ++--- tests/integrations/raftstore/test_snap.rs | 18 +- .../integrations/raftstore/test_tombstone.rs | 2 +- tests/integrations/storage/test_titan.rs | 2 +- 139 files changed, 1141 insertions(+), 1151 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 62ce325a130..df2c3cfbadf 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -13,7 +13,7 @@ use engine_traits::{ use futures::{executor::block_on, future, stream, Stream, StreamExt, TryStreamExt}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ - debugpb::{Db as DBType, *}, + debugpb::{Db as DbType, *}, kvrpcpb::MvccInfo, metapb::{Peer, Region}, raft_cmdpb::RaftCmdRequest, @@ -464,7 +464,7 @@ pub trait DebugExecutor { fn compact( &self, address: Option<&str>, - db: DBType, + db: DbType, cf: &str, from: Option>, to: Option>, @@ -487,7 +487,7 @@ pub trait DebugExecutor { fn compact_region( &self, address: Option<&str>, - db: DBType, + db: DbType, cf: &str, region_id: u64, threads: u32, @@ -604,7 +604,7 @@ pub trait DebugExecutor { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], @@ -649,7 +649,7 @@ impl DebugExecutor for DebugClient { fn get_value_by_key(&self, cf: &str, key: Vec) -> Vec { let mut req = GetRequest::default(); - req.set_db(DBType::Kv); + req.set_db(DbType::Kv); req.set_cf(cf.to_owned()); req.set_key(key); self.get(&req) @@ -718,7 +718,7 @@ impl DebugExecutor for DebugClient { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], @@ -858,7 +858,7 @@ 
impl DebugExecutor for Debugger { } fn get_value_by_key(&self, cf: &str, key: Vec) -> Vec { - self.get(DBType::Kv, cf, &key) + self.get(DbType::Kv, cf, &key) .unwrap_or_else(|e| perror_and_exit("Debugger::get", e)) } @@ -902,7 +902,7 @@ impl DebugExecutor for Debugger { fn do_compaction( &self, - db: DBType, + db: DbType, cf: &str, from: &[u8], to: &[u8], diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 00094af8dc6..d37336cbd36 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -23,8 +23,8 @@ use std::{ }; use encryption_export::{ - create_backend, data_key_manager_from_config, encryption_method_from_db_encryption_method, - DataKeyManager, DecrypterReader, Iv, + create_backend, data_key_manager_from_config, from_engine_encryption_method, DataKeyManager, + DecrypterReader, Iv, }; use engine_rocks::get_env; use engine_traits::EncryptionKeyManager; @@ -33,7 +33,7 @@ use futures::executor::block_on; use gag::BufferRedirect; use grpcio::{CallOption, ChannelBuilder, Environment}; use kvproto::{ - debugpb::{Db as DBType, *}, + debugpb::{Db as DbType, *}, encryptionpb::EncryptionMethod, kvrpcpb::SplitRegionRequest, raft_serverpb::SnapshotMeta, @@ -151,7 +151,7 @@ fn main() { let infile1 = Path::new(infile).canonicalize().unwrap(); let file_info = key_manager.get_file(infile1.to_str().unwrap()).unwrap(); - let mthd = encryption_method_from_db_encryption_method(file_info.method); + let mthd = from_engine_encryption_method(file_info.method); if mthd == EncryptionMethod::Plaintext { println!( "{} is not encrypted, skip to decrypt it into {}", @@ -218,7 +218,7 @@ fn main() { bottommost, } => { let pd_client = get_pd_rpc_client(opt.pd, Arc::clone(&mgr)); - let db_type = if db == "kv" { DBType::Kv } else { DBType::Raft }; + let db_type = if db == "kv" { DbType::Kv } else { DbType::Raft }; let cfs = cf.iter().map(|s| s.as_ref()).collect(); let from_key = from.map(|k| unescape(&k)); let to_key = to.map(|k| unescape(&k)); @@ -347,7 +347,7 
@@ fn main() { threads, bottommost, } => { - let db_type = if db == "kv" { DBType::Kv } else { DBType::Raft }; + let db_type = if db == "kv" { DbType::Kv } else { DbType::Raft }; let from_key = from.map(|k| unescape(&k)); let to_key = to.map(|k| unescape(&k)); let bottommost = BottommostLevelCompaction::from(Some(bottommost.as_ref())); @@ -610,7 +610,7 @@ fn compact_whole_cluster( pd_client: &RpcClient, cfg: &TiKvConfig, mgr: Arc, - db_type: DBType, + db_type: DbType, cfs: Vec<&str>, from: Option>, to: Option>, diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 4555bdbf4ff..c6e928b8201 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -238,7 +238,7 @@ fn spawn_executors(init: impl InitialScan + Send + 'static, number: usize) -> Sc let stopped = stopped.clone(); pool.spawn(move |_: &mut YatpHandle<'_>| { tikv_alloc::add_thread_memory_accessor(); - let _io_guard = file_system::WithIOType::new(file_system::IOType::Replication); + let _io_guard = file_system::WithIoType::new(file_system::IoType::Replication); scan_executor_loop(init, rx, stopped); tikv_alloc::remove_thread_memory_accessor(); }) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index ada36a08615..7af38d12ac4 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -1181,7 +1181,7 @@ pub mod tests { use api_version::{api_v2::RAW_KEY_PREFIX, dispatch_api_version, KvFormat, RawValue}; use engine_traits::MiscExt; use external_storage_export::{make_local_backend, make_noop_backend}; - use file_system::{IOOp, IORateLimiter, IOType}; + use file_system::{IoOp, IoRateLimiter, IoType}; use futures::{executor::block_on, stream::StreamExt}; use kvproto::metapb; use raftstore::{ @@ -1265,7 +1265,7 @@ pub mod tests { } pub fn new_endpoint_with_limiter( - limiter: Option>, + limiter: Option>, 
api_version: ApiVersion, is_raw_kv: bool, causal_ts_provider: Option>, @@ -1508,7 +1508,7 @@ pub mod tests { #[test] fn test_handle_backup_task() { - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false, None); let engine = endpoint.engine.clone(); @@ -1585,8 +1585,8 @@ pub mod tests { ); let (none, _rx) = block_on(rx.into_future()); assert!(none.is_none(), "{:?}", none); - assert_eq!(stats.fetch(IOType::Export, IOOp::Write), 0); - assert_ne!(stats.fetch(IOType::Export, IOOp::Read), 0); + assert_eq!(stats.fetch(IoType::Export, IoOp::Write), 0); + assert_ne!(stats.fetch(IoType::Export, IoOp::Read), 0); } } @@ -1647,7 +1647,7 @@ pub mod tests { } fn test_handle_backup_raw_task_impl(cur_api_ver: ApiVersion, dst_api_ver: ApiVersion) -> bool { - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), cur_api_ver, true, None); let engine = endpoint.engine.clone(); @@ -1759,8 +1759,8 @@ pub mod tests { ); let (none, _rx) = block_on(rx.into_future()); assert!(none.is_none(), "{:?}", none); - assert_eq!(stats.fetch(IOType::Export, IOOp::Write), 0); - assert_ne!(stats.fetch(IOType::Export, IOOp::Read), 0); + assert_eq!(stats.fetch(IoType::Export, IoOp::Write), 0); + assert_ne!(stats.fetch(IoType::Export, IoOp::Read), 0); true } @@ -1788,7 +1788,7 @@ pub mod tests { #[test] fn test_backup_raw_apiv2_causal_ts() { - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); let ts_provider = Arc::new(causal_ts::tests::TestProvider::default()); let start_ts = ts_provider.get_ts().unwrap(); let (tmp, endpoint) = new_endpoint_with_limiter( diff --git a/components/backup/src/utils.rs 
b/components/backup/src/utils.rs index de57b9f9081..1b8fda5570e 100644 --- a/components/backup/src/utils.rs +++ b/components/backup/src/utils.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use api_version::{dispatch_api_version, ApiV2, KeyMode, KvFormat}; -use file_system::IOType; +use file_system::IoType; use futures::Future; use kvproto::kvrpcpb::ApiVersion; use tikv_util::{error, sys::thread::ThreadBuildWrapper}; @@ -94,7 +94,7 @@ pub fn create_tokio_runtime(thread_count: usize, thread_name: &str) -> TokioResu .enable_time() .after_start_wrapper(|| { tikv_alloc::add_thread_memory_accessor(); - file_system::set_io_type(IOType::Export); + file_system::set_io_type(IoType::Export); }) .before_stop_wrapper(|| { tikv_alloc::remove_thread_memory_accessor(); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 49433a73592..f868b4bfc94 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -17,7 +17,7 @@ use std::{ use crossbeam::channel::{self, SendError}; use fail::fail_point; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use tikv_util::{ debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, time::Instant, warn, @@ -589,7 +589,7 @@ where .name(name) .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); poller.poll(); }) .unwrap(); diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index a3401dbf6c8..799d1b02ee9 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -127,7 +127,7 @@ impl BlobConfig for Config { // GCS compatible storage #[derive(Clone)] -pub struct GCSStorage { +pub struct GcsStorage { config: Config, svc_access: Option>, client: Client, Body>, @@ -228,7 +228,7 @@ impl RetryError for RequestError { } } -impl GCSStorage { +impl GcsStorage { pub fn 
from_input(input: InputConfig) -> io::Result { Self::new(Config::from_input(input)?) } @@ -238,7 +238,7 @@ impl GCSStorage { } /// Create a new GCS storage for the given config. - pub fn new(config: Config) -> io::Result { + pub fn new(config: Config) -> io::Result { let svc_access = if let Some(si) = &config.svc_info { Some( ServiceAccountAccess::new(si.clone()) @@ -249,7 +249,7 @@ impl GCSStorage { }; let client = Client::builder().build(HttpsConnector::new()); - Ok(GCSStorage { + Ok(GcsStorage { config, svc_access: svc_access.map(Arc::new), client, @@ -392,7 +392,7 @@ fn parse_predefined_acl(acl: &str) -> Result, &str> { const STORAGE_NAME: &str = "gcs"; #[async_trait] -impl BlobStorage for GCSStorage { +impl BlobStorage for GcsStorage { fn config(&self) -> Box { Box::new(self.config.clone()) as Box } @@ -454,11 +454,11 @@ impl BlobStorage for GCSStorage { debug!("read file from GCS storage"; "key" => %name); let oid = match ObjectId::new(bucket, name) { Ok(oid) => oid, - Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), }; let request = match Object::download(&oid, None /* optional */) { Ok(request) => request.map(|_: io::Empty| Body::empty()), - Err(e) => return GCSStorage::error_to_async_read(io::ErrorKind::Other, e), + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::Other, e), }; Box::new( self.make_request(request, tame_gcs::Scopes::ReadOnly) diff --git a/components/cloud/gcp/src/lib.rs b/components/cloud/gcp/src/lib.rs index e023ca9c6eb..4652bbf5b74 100644 --- a/components/cloud/gcp/src/lib.rs +++ b/components/cloud/gcp/src/lib.rs @@ -4,4 +4,4 @@ extern crate slog_global; mod gcs; -pub use gcs::{Config, GCSStorage}; +pub use gcs::{Config, GcsStorage}; diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index 537eb8785e5..e29a41cd07e 100644 --- a/components/encryption/export/src/lib.rs 
+++ b/components/encryption/export/src/lib.rs @@ -14,8 +14,8 @@ use derive_more::Deref; #[cfg(feature = "cloud-aws")] pub use encryption::KmsBackend; pub use encryption::{ - encryption_method_from_db_encryption_method, Backend, DataKeyManager, DataKeyManagerArgs, - DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, KmsConfig, MasterKeyConfig, Result, + from_engine_encryption_method, Backend, DataKeyManager, DataKeyManagerArgs, DecrypterReader, + EncryptionConfig, Error, FileConfig, Iv, KmsConfig, MasterKeyConfig, Result, }; use encryption::{ DataKeyPair, EncryptedKey, FileBackend, KmsProvider, PlainKey, PlaintextBackend, diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index 1268d0d88f2..13286e416c9 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -2,7 +2,7 @@ use byteorder::{BigEndian, ByteOrder}; use derive_more::Deref; -use engine_traits::EncryptionMethod as DBEncryptionMethod; +use engine_traits::EncryptionMethod as EtEncryptionMethod; use kvproto::encryptionpb::EncryptionMethod; use openssl::symm::{self, Cipher as OCipher}; use rand::{rngs::OsRng, RngCore}; @@ -10,32 +10,28 @@ use tikv_util::{box_err, impl_display_as_debug}; use crate::{Error, Result}; -pub fn encryption_method_to_db_encryption_method(method: EncryptionMethod) -> DBEncryptionMethod { +pub fn to_engine_encryption_method(method: EncryptionMethod) -> EtEncryptionMethod { match method { - EncryptionMethod::Plaintext => DBEncryptionMethod::Plaintext, - EncryptionMethod::Aes128Ctr => DBEncryptionMethod::Aes128Ctr, - EncryptionMethod::Aes192Ctr => DBEncryptionMethod::Aes192Ctr, - EncryptionMethod::Aes256Ctr => DBEncryptionMethod::Aes256Ctr, - EncryptionMethod::Sm4Ctr => DBEncryptionMethod::Sm4Ctr, - EncryptionMethod::Unknown => DBEncryptionMethod::Unknown, + EncryptionMethod::Plaintext => EtEncryptionMethod::Plaintext, + EncryptionMethod::Aes128Ctr => EtEncryptionMethod::Aes128Ctr, + 
EncryptionMethod::Aes192Ctr => EtEncryptionMethod::Aes192Ctr, + EncryptionMethod::Aes256Ctr => EtEncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr => EtEncryptionMethod::Sm4Ctr, + EncryptionMethod::Unknown => EtEncryptionMethod::Unknown, } } -pub fn encryption_method_from_db_encryption_method(method: DBEncryptionMethod) -> EncryptionMethod { +pub fn from_engine_encryption_method(method: EtEncryptionMethod) -> EncryptionMethod { match method { - DBEncryptionMethod::Plaintext => EncryptionMethod::Plaintext, - DBEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, - DBEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, - DBEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, - DBEncryptionMethod::Sm4Ctr => EncryptionMethod::Sm4Ctr, - DBEncryptionMethod::Unknown => EncryptionMethod::Unknown, + EtEncryptionMethod::Plaintext => EncryptionMethod::Plaintext, + EtEncryptionMethod::Aes128Ctr => EncryptionMethod::Aes128Ctr, + EtEncryptionMethod::Aes192Ctr => EncryptionMethod::Aes192Ctr, + EtEncryptionMethod::Aes256Ctr => EncryptionMethod::Aes256Ctr, + EtEncryptionMethod::Sm4Ctr => EncryptionMethod::Sm4Ctr, + EtEncryptionMethod::Unknown => EncryptionMethod::Unknown, } } -pub fn compat(method: EncryptionMethod) -> EncryptionMethod { - method -} - pub fn get_method_key_length(method: EncryptionMethod) -> usize { match method { EncryptionMethod::Plaintext => 0, diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index 0884cb1ca04..653fbf8dbbb 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -390,7 +390,7 @@ mod tests { use kvproto::encryptionpb::EncryptionMethod; use super::*; - use crate::{crypter::compat, encrypted_file::EncryptedFile, Error}; + use crate::{encrypted_file::EncryptedFile, Error}; fn test_file_dict_file_normal(enable_log: bool) { let tempdir = tempfile::tempdir().unwrap(); @@ -614,7 +614,7 @@ mod tests { fn 
create_file_info(id: u64, method: EncryptionMethod) -> FileInfo { FileInfo { key_id: id, - method: compat(method), + method, ..Default::default() } } diff --git a/components/encryption/src/lib.rs b/components/encryption/src/lib.rs index e6498e5d3ab..7f9079ed030 100644 --- a/components/encryption/src/lib.rs +++ b/components/encryption/src/lib.rs @@ -13,9 +13,8 @@ mod metrics; pub use self::{ config::*, crypter::{ - compat, encryption_method_from_db_encryption_method, - encryption_method_to_db_encryption_method, verify_encryption_config, AesGcmCrypter, Iv, - PlainKey, + from_engine_encryption_method, to_engine_encryption_method, verify_encryption_config, + AesGcmCrypter, Iv, PlainKey, }, encrypted_file::EncryptedFile, errors::{Error, Result, RetryCodedError}, diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 79654d9d6a2..a45f6153358 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -13,7 +13,7 @@ use std::{ use crossbeam::channel::{self, select, tick}; use engine_traits::{ - EncryptionKeyManager, EncryptionMethod as DBEncryptionMethod, FileEncryptionInfo, + EncryptionKeyManager, EncryptionMethod as EtEncryptionMethod, FileEncryptionInfo, }; use fail::fail_point; use file_system::File; @@ -23,7 +23,7 @@ use tikv_util::{box_err, debug, error, info, sys::thread::StdThreadBuildWrapper, use crate::{ config::EncryptionConfig, - crypter::{self, compat, Iv}, + crypter::{self, Iv}, encrypted_file::EncryptedFile, file_dict_file::FileDictionaryFile, io::{DecrypterReader, EncrypterWriter}, @@ -198,7 +198,7 @@ impl Dicts { let file = FileInfo { iv: iv.as_slice().to_vec(), key_id: self.current_key_id.load(Ordering::SeqCst), - method: compat(method), + method, ..Default::default() }; let file_num = { @@ -243,7 +243,7 @@ impl Dicts { file_dict_file.remove(fname)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); - if file.method != compat(EncryptionMethod::Plaintext) { + if file.method 
!= EncryptionMethod::Plaintext { debug!("delete encrypted file"; "fname" => fname); } else { debug!("delete plaintext file"; "fname" => fname); @@ -275,7 +275,7 @@ impl Dicts { file_dict_file.insert(dst_fname, &file)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); - if method != compat(EncryptionMethod::Plaintext) { + if method != EncryptionMethod::Plaintext { info!("link encrypted file"; "src" => src_fname, "dst" => dst_fname); } else { info!("link plaintext file"; "src" => src_fname, "dst" => dst_fname); @@ -312,7 +312,7 @@ impl Dicts { // Generate a new data key if // 1. encryption method is not the same, or // 2. the current data key was exposed and the new master key is secure. - if compat(method) == key.method && !(key.was_exposed && master_key.is_secure()) { + if method == key.method && !(key.was_exposed && master_key.is_secure()) { let creation_time = UNIX_EPOCH + Duration::from_secs(key.creation_time); match now.duration_since(creation_time) { Ok(duration) => { @@ -336,7 +336,7 @@ impl Dicts { let (key_id, key) = generate_data_key(method); let data_key = DataKey { key, - method: compat(method), + method, creation_time, was_exposed: false, ..Default::default() @@ -615,9 +615,9 @@ impl DataKeyManager { }; EncrypterWriter::new( writer, - crypter::encryption_method_from_db_encryption_method(file.method), + crypter::from_engine_encryption_method(file.method), &file.key, - if file.method == DBEncryptionMethod::Plaintext { + if file.method == EtEncryptionMethod::Plaintext { debug_assert!(file.iv.is_empty()); Iv::Empty } else { @@ -645,9 +645,9 @@ impl DataKeyManager { let file = self.get_file(fname)?; DecrypterReader::new( reader, - crypter::encryption_method_from_db_encryption_method(file.method), + crypter::from_engine_encryption_method(file.method), &file.key, - if file.method == DBEncryptionMethod::Plaintext { + if file.method == EtEncryptionMethod::Plaintext { debug_assert!(file.iv.is_empty()); Iv::Empty } else { @@ -723,7 +723,7 @@ impl DataKeyManager { }; let 
encrypted_file = FileEncryptionInfo { key, - method: crypter::encryption_method_to_db_encryption_method(method), + method: crypter::to_engine_encryption_method(method), iv, }; Ok(Some(encrypted_file)) @@ -750,10 +750,10 @@ impl EncryptionKeyManager for DataKeyManager { // Return Plaintext if file is not found // RocksDB requires this let file = FileInfo::default(); - let method = compat(EncryptionMethod::Plaintext); + let method = EncryptionMethod::Plaintext; Ok(FileEncryptionInfo { key: vec![], - method: crypter::encryption_method_to_db_encryption_method(method), + method: crypter::to_engine_encryption_method(method), iv: file.iv, }) } @@ -767,7 +767,7 @@ impl EncryptionKeyManager for DataKeyManager { let file = self.dicts.new_file(fname, self.method)?; let encrypted_file = FileEncryptionInfo { key, - method: crypter::encryption_method_to_db_encryption_method(file.method), + method: crypter::to_engine_encryption_method(file.method), iv: file.get_iv().to_owned(), }; Ok(encrypted_file) @@ -789,7 +789,7 @@ impl EncryptionKeyManager for DataKeyManager { #[cfg(test)] mod tests { - use engine_traits::EncryptionMethod as DBEncryptionMethod; + use engine_traits::EncryptionMethod as EtEncryptionMethod; use file_system::{remove_file, File}; use matches::assert_matches; use tempfile::TempDir; @@ -912,7 +912,7 @@ mod tests { let foo3 = manager.get_file("foo").unwrap(); assert_eq!(foo1, foo3); let bar = manager.new_file("bar").unwrap(); - assert_eq!(bar.method, DBEncryptionMethod::Plaintext); + assert_eq!(bar.method, EtEncryptionMethod::Plaintext); } // When enabling encryption, using insecure master key is not allowed. diff --git a/components/engine_panic/src/cf_names.rs b/components/engine_panic/src/cf_names.rs index 8697634586b..ee71210f229 100644 --- a/components/engine_panic/src/cf_names.rs +++ b/components/engine_panic/src/cf_names.rs @@ -1,10 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::CFNamesExt; +use engine_traits::CfNamesExt; use crate::engine::PanicEngine; -impl CFNamesExt for PanicEngine { +impl CfNamesExt for PanicEngine { fn cf_names(&self) -> Vec<&str> { panic!() } diff --git a/components/engine_panic/src/cf_options.rs b/components/engine_panic/src/cf_options.rs index f00db2eeb4f..1da2473bdaa 100644 --- a/components/engine_panic/src/cf_options.rs +++ b/components/engine_panic/src/cf_options.rs @@ -1,13 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{CFOptionsExt, ColumnFamilyOptions, Result, SstPartitionerFactory}; +use engine_traits::{CfOptions, CfOptionsExt, Result, SstPartitionerFactory}; -use crate::{db_options::PanicTitanDBOptions, engine::PanicEngine}; +use crate::{db_options::PanicTitanDbOptions, engine::PanicEngine}; -impl CFOptionsExt for PanicEngine { - type ColumnFamilyOptions = PanicColumnFamilyOptions; +impl CfOptionsExt for PanicEngine { + type CfOptions = PanicCfOptions; - fn get_options_cf(&self, cf: &str) -> Result { + fn get_options_cf(&self, cf: &str) -> Result { panic!() } fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()> { @@ -15,10 +15,10 @@ impl CFOptionsExt for PanicEngine { } } -pub struct PanicColumnFamilyOptions; +pub struct PanicCfOptions; -impl ColumnFamilyOptions for PanicColumnFamilyOptions { - type TitanDBOptions = PanicTitanDBOptions; +impl CfOptions for PanicCfOptions { + type TitanDbOptions = PanicTitanDbOptions; fn new() -> Self { panic!() @@ -47,7 +47,7 @@ impl ColumnFamilyOptions for PanicColumnFamilyOptions { fn set_block_cache_capacity(&self, capacity: u64) -> Result<()> { panic!() } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } fn get_target_file_size_base(&self) -> u64 { diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index f28741ce4c2..4b8eb562018 100644 
--- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -1,13 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{DBOptions, DBOptionsExt, Result, TitanDBOptions}; +use engine_traits::{DbOptions, DbOptionsExt, Result, TitanDbOptions}; use crate::engine::PanicEngine; -impl DBOptionsExt for PanicEngine { - type DBOptions = PanicDBOptions; +impl DbOptionsExt for PanicEngine { + type DbOptions = PanicDbOptions; - fn get_db_options(&self) -> Self::DBOptions { + fn get_db_options(&self) -> Self::DbOptions { panic!() } fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { @@ -15,10 +15,10 @@ impl DBOptionsExt for PanicEngine { } } -pub struct PanicDBOptions; +pub struct PanicDbOptions; -impl DBOptions for PanicDBOptions { - type TitanDBOptions = PanicTitanDBOptions; +impl DbOptions for PanicDbOptions { + type TitanDbOptions = PanicTitanDbOptions; fn new() -> Self { panic!() @@ -44,14 +44,14 @@ impl DBOptions for PanicDBOptions { panic!() } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } } -pub struct PanicTitanDBOptions; +pub struct PanicTitanDbOptions; -impl TitanDBOptions for PanicTitanDBOptions { +impl TitanDbOptions for PanicTitanDbOptions { fn new() -> Self { panic!() } diff --git a/components/engine_panic/src/db_vector.rs b/components/engine_panic/src/db_vector.rs index 83d615dbc4c..3daf6dc9500 100644 --- a/components/engine_panic/src/db_vector.rs +++ b/components/engine_panic/src/db_vector.rs @@ -2,14 +2,14 @@ use std::ops::Deref; -use engine_traits::DBVector; +use engine_traits::DbVector; #[derive(Debug)] -pub struct PanicDBVector; +pub struct PanicDbVector; -impl DBVector for PanicDBVector {} +impl DbVector for PanicDbVector {} -impl Deref for PanicDBVector { +impl Deref for PanicDbVector { type Target = [u8]; fn deref(&self) -> &[u8] { @@ -17,7 +17,7 @@ impl Deref for 
PanicDBVector { } } -impl<'a> PartialEq<&'a [u8]> for PanicDBVector { +impl<'a> PartialEq<&'a [u8]> for PanicDbVector { fn eq(&self, rhs: &&[u8]) -> bool { **rhs == **self } diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 128cb318ed6..a296c3df9d8 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -5,7 +5,7 @@ use engine_traits::{ TabletAccessor, WriteOptions, }; -use crate::{db_vector::PanicDBVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; +use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; #[derive(Clone, Debug)] pub struct PanicEngine; @@ -35,9 +35,9 @@ impl TabletAccessor for PanicEngine { } impl Peekable for PanicEngine { - type DBVector = PanicDBVector; + type DbVector = PanicDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { panic!() } fn get_value_cf_opt( @@ -45,7 +45,7 @@ impl Peekable for PanicEngine { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { panic!() } } diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index e27ed42d093..e573402c6d2 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -4,7 +4,7 @@ use std::ops::Deref; use engine_traits::{IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot}; -use crate::{db_vector::PanicDBVector, engine::PanicEngine}; +use crate::{db_vector::PanicDbVector, engine::PanicEngine}; #[derive(Clone, Debug)] pub struct PanicSnapshot; @@ -16,9 +16,9 @@ impl Snapshot for PanicSnapshot { } impl Peekable for PanicSnapshot { - type DBVector = PanicDBVector; + type DbVector = PanicDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { 
panic!() } fn get_value_cf_opt( @@ -26,7 +26,7 @@ impl Peekable for PanicSnapshot { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { panic!() } } diff --git a/components/engine_rocks/src/cf_names.rs b/components/engine_rocks/src/cf_names.rs index b45a3960328..3b2512d0def 100644 --- a/components/engine_rocks/src/cf_names.rs +++ b/components/engine_rocks/src/cf_names.rs @@ -1,10 +1,10 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::CFNamesExt; +use engine_traits::CfNamesExt; use crate::engine::RocksEngine; -impl CFNamesExt for RocksEngine { +impl CfNamesExt for RocksEngine { fn cf_names(&self) -> Vec<&str> { self.as_inner().cf_names() } diff --git a/components/engine_rocks/src/cf_options.rs b/components/engine_rocks/src/cf_options.rs index c6a5390a063..6b3bdcaa11b 100644 --- a/components/engine_rocks/src/cf_options.rs +++ b/components/engine_rocks/src/cf_options.rs @@ -2,19 +2,19 @@ use std::ops::{Deref, DerefMut}; -use engine_traits::{CFOptionsExt, ColumnFamilyOptions, Result, SstPartitionerFactory}; +use engine_traits::{CfOptions, CfOptionsExt, Result, SstPartitionerFactory}; use rocksdb::ColumnFamilyOptions as RawCfOptions; use tikv_util::box_err; use crate::{ - db_options::RocksTitanDBOptions, engine::RocksEngine, r2e, + db_options::RocksTitanDbOptions, engine::RocksEngine, r2e, sst_partitioner::RocksSstPartitionerFactory, util, }; -impl CFOptionsExt for RocksEngine { - type ColumnFamilyOptions = RocksCfOptions; +impl CfOptionsExt for RocksEngine { + type CfOptions = RocksCfOptions; - fn get_options_cf(&self, cf: &str) -> Result { + fn get_options_cf(&self, cf: &str) -> Result { let handle = util::get_cf_handle(self.as_inner(), cf)?; Ok(RocksCfOptions::from_raw( self.as_inner().get_options_cf(handle), @@ -58,8 +58,8 @@ impl DerefMut for RocksCfOptions { } } -impl ColumnFamilyOptions for RocksCfOptions { - type TitanDBOptions = RocksTitanDBOptions; +impl CfOptions for RocksCfOptions { + type 
TitanDbOptions = RocksTitanDbOptions; fn new() -> Self { RocksCfOptions::from_raw(RawCfOptions::default()) @@ -97,7 +97,7 @@ impl ColumnFamilyOptions for RocksCfOptions { self.0.set_block_cache_capacity(capacity).map_err(r2e) } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index 393377149ff..b9e3e5fe558 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -2,7 +2,7 @@ use std::cmp; -use engine_traits::{CFNamesExt, CompactExt, Result}; +use engine_traits::{CfNamesExt, CompactExt, Result}; use rocksdb::{CompactOptions, CompactionOptions, DBCompressionType}; use crate::{engine::RocksEngine, r2e, util}; @@ -137,10 +137,10 @@ impl CompactExt for RocksEngine { #[cfg(test)] mod tests { - use engine_traits::{CFNamesExt, CFOptionsExt, CompactExt, MiscExt, SyncMutable}; + use engine_traits::{CfNamesExt, CfOptionsExt, CompactExt, MiscExt, SyncMutable}; use tempfile::Builder; - use crate::{util, RocksCfOptions, RocksDBOptions}; + use crate::{util, RocksCfOptions, RocksDbOptions}; #[test] fn test_compact_files_in_range() { @@ -154,7 +154,7 @@ mod tests { let cfs_opts = vec![("default", cf_opts.clone()), ("test", cf_opts)]; let db = util::new_engine_opt( temp_dir.path().to_str().unwrap(), - RocksDBOptions::default(), + RocksDbOptions::default(), cfs_opts, ) .unwrap(); diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index 6aaccfee76b..dea87dbb135 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -2,17 +2,17 @@ use std::ops::{Deref, DerefMut}; -use engine_traits::{DBOptions, DBOptionsExt, Result, TitanDBOptions}; +use engine_traits::{DbOptions, DbOptionsExt, Result, TitanDbOptions}; use rocksdb::{DBOptions as 
RawDBOptions, TitanDBOptions as RawTitanDBOptions}; use tikv_util::box_err; use crate::engine::RocksEngine; -impl DBOptionsExt for RocksEngine { - type DBOptions = RocksDBOptions; +impl DbOptionsExt for RocksEngine { + type DbOptions = RocksDbOptions; - fn get_db_options(&self) -> Self::DBOptions { - RocksDBOptions::from_raw(self.as_inner().get_db_options()) + fn get_db_options(&self) -> Self::DbOptions { + RocksDbOptions::from_raw(self.as_inner().get_db_options()) } fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()> { self.as_inner() @@ -22,11 +22,11 @@ impl DBOptionsExt for RocksEngine { } #[derive(Default)] -pub struct RocksDBOptions(RawDBOptions); +pub struct RocksDbOptions(RawDBOptions); -impl RocksDBOptions { - pub fn from_raw(raw: RawDBOptions) -> RocksDBOptions { - RocksDBOptions(raw) +impl RocksDbOptions { + pub fn from_raw(raw: RawDBOptions) -> RocksDbOptions { + RocksDbOptions(raw) } pub fn into_raw(self) -> RawDBOptions { @@ -38,7 +38,7 @@ impl RocksDBOptions { } } -impl Deref for RocksDBOptions { +impl Deref for RocksDbOptions { type Target = RawDBOptions; #[inline] @@ -47,18 +47,18 @@ impl Deref for RocksDBOptions { } } -impl DerefMut for RocksDBOptions { +impl DerefMut for RocksDbOptions { #[inline] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } -impl DBOptions for RocksDBOptions { - type TitanDBOptions = RocksTitanDBOptions; +impl DbOptions for RocksDbOptions { + type TitanDbOptions = RocksTitanDbOptions; fn new() -> Self { - RocksDBOptions::from_raw(RawDBOptions::new()) + RocksDbOptions::from_raw(RawDBOptions::new()) } fn get_max_background_jobs(&self) -> i32 { @@ -85,16 +85,16 @@ impl DBOptions for RocksDBOptions { .map_err(|e| box_err!(e)) } - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions) { + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } } -pub struct RocksTitanDBOptions(RawTitanDBOptions); +pub struct 
RocksTitanDbOptions(RawTitanDBOptions); -impl RocksTitanDBOptions { - pub fn from_raw(raw: RawTitanDBOptions) -> RocksTitanDBOptions { - RocksTitanDBOptions(raw) +impl RocksTitanDbOptions { + pub fn from_raw(raw: RawTitanDBOptions) -> RocksTitanDbOptions { + RocksTitanDbOptions(raw) } pub fn as_raw(&self) -> &RawTitanDBOptions { @@ -102,7 +102,7 @@ impl RocksTitanDBOptions { } } -impl Deref for RocksTitanDBOptions { +impl Deref for RocksTitanDbOptions { type Target = RawTitanDBOptions; #[inline] @@ -111,16 +111,16 @@ impl Deref for RocksTitanDBOptions { } } -impl DerefMut for RocksTitanDBOptions { +impl DerefMut for RocksTitanDbOptions { #[inline] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.0 } } -impl TitanDBOptions for RocksTitanDBOptions { +impl TitanDbOptions for RocksTitanDbOptions { fn new() -> Self { - RocksTitanDBOptions::from_raw(RawTitanDBOptions::new()) + RocksTitanDbOptions::from_raw(RawTitanDBOptions::new()) } fn set_min_blob_size(&mut self, size: u64) { diff --git a/components/engine_rocks/src/db_vector.rs b/components/engine_rocks/src/db_vector.rs index cf48bd8da0e..97fa65b7072 100644 --- a/components/engine_rocks/src/db_vector.rs +++ b/components/engine_rocks/src/db_vector.rs @@ -5,20 +5,20 @@ use std::{ ops::Deref, }; -use engine_traits::DBVector; +use engine_traits::DbVector; use rocksdb::DBVector as RawDBVector; -pub struct RocksDBVector(RawDBVector); +pub struct RocksDbVector(RawDBVector); -impl RocksDBVector { - pub fn from_raw(raw: RawDBVector) -> RocksDBVector { - RocksDBVector(raw) +impl RocksDbVector { + pub fn from_raw(raw: RawDBVector) -> RocksDbVector { + RocksDbVector(raw) } } -impl DBVector for RocksDBVector {} +impl DbVector for RocksDbVector {} -impl Deref for RocksDBVector { +impl Deref for RocksDbVector { type Target = [u8]; fn deref(&self) -> &[u8] { @@ -26,13 +26,13 @@ impl Deref for RocksDBVector { } } -impl Debug for RocksDBVector { +impl Debug for RocksDbVector { fn fmt(&self, formatter: &mut Formatter<'_>) -> 
fmt::Result { write!(formatter, "{:?}", &**self) } } -impl<'a> PartialEq<&'a [u8]> for RocksDBVector { +impl<'a> PartialEq<&'a [u8]> for RocksDbVector { fn eq(&self, rhs: &&[u8]) -> bool { **rhs == **self } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 0e83eb2cdb3..9c995144efa 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -8,7 +8,7 @@ use engine_traits::{ use rocksdb::{DBIterator, Writable, DB}; use crate::{ - db_vector::RocksDBVector, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, rocks_metrics::{ @@ -142,12 +142,12 @@ impl Iterable for RocksEngine { } impl Peekable for RocksEngine { - type DBVector = RocksDBVector; + type DbVector = RocksDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let opt: RocksReadOptions = opts.into(); let v = self.db.get_opt(key, &opt.into_raw()).map_err(r2e)?; - Ok(v.map(RocksDBVector::from_raw)) + Ok(v.map(RocksDbVector::from_raw)) } fn get_value_cf_opt( @@ -155,14 +155,14 @@ impl Peekable for RocksEngine { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { let opt: RocksReadOptions = opts.into(); let handle = get_cf_handle(&self.db, cf)?; let v = self .db .get_cf_opt(handle, key, &opt.into_raw()) .map_err(r2e)?; - Ok(v.map(RocksDBVector::from_raw)) + Ok(v.map(RocksDbVector::from_raw)) } } diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 5b93ccba637..ad7a9de455f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use file_system::{get_io_type, set_io_type, IOType}; +use file_system::{get_io_type, set_io_type, IoType}; use regex::Regex; use rocksdb::{ CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MutableStatus, @@ -32,23 +32,23 @@ impl RocksEventListener { impl rocksdb::EventListener for RocksEventListener { fn on_flush_begin(&self, _info: &FlushJobInfo) { - set_io_type(IOType::Flush); + set_io_type(IoType::Flush); } fn on_flush_completed(&self, info: &FlushJobInfo) { STORE_ENGINE_EVENT_COUNTER_VEC .with_label_values(&[&self.db_name, info.cf_name(), "flush"]) .inc(); - if get_io_type() == IOType::Flush { - set_io_type(IOType::Other); + if get_io_type() == IoType::Flush { + set_io_type(IoType::Other); } } fn on_compaction_begin(&self, info: &CompactionJobInfo) { if info.base_input_level() == 0 { - set_io_type(IOType::LevelZeroCompaction); + set_io_type(IoType::LevelZeroCompaction); } else { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); } } @@ -69,26 +69,26 @@ impl rocksdb::EventListener for RocksEventListener { &info.compaction_reason().to_string(), ]) .inc(); - if info.base_input_level() == 0 && get_io_type() == IOType::LevelZeroCompaction - || info.base_input_level() != 0 && get_io_type() == IOType::Compaction + if info.base_input_level() == 0 && get_io_type() == IoType::LevelZeroCompaction + || info.base_input_level() != 0 && get_io_type() == IoType::Compaction { - set_io_type(IOType::Other); + set_io_type(IoType::Other); } } fn on_subcompaction_begin(&self, info: &SubcompactionJobInfo) { if info.base_input_level() == 0 { - set_io_type(IOType::LevelZeroCompaction); + set_io_type(IoType::LevelZeroCompaction); } else { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); } } fn on_subcompaction_completed(&self, info: &SubcompactionJobInfo) { - if info.base_input_level() == 0 && get_io_type() == IOType::LevelZeroCompaction - || info.base_input_level() != 0 && get_io_type() == IOType::Compaction + if 
info.base_input_level() == 0 && get_io_type() == IoType::LevelZeroCompaction + || info.base_input_level() != 0 && get_io_type() == IoType::Compaction { - set_io_type(IOType::Other); + set_io_type(IoType::Other); } } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index 87f46893774..614611bc40e 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -10,7 +10,7 @@ use crate::{e2r, r2e, raw::Env}; // Use engine::Env directly since Env is not abstracted. pub(crate) fn get_env( base_env: Option>, - limiter: Option>, + limiter: Option>, ) -> engine_traits::Result> { let base_env = base_env.unwrap_or_else(|| Arc::new(Env::default())); Ok(Arc::new( @@ -43,19 +43,19 @@ mod tests { use std::sync::Arc; use engine_traits::{CompactExt, MiscExt, SyncMutable, CF_DEFAULT}; - use file_system::{IOOp, IORateLimiter, IORateLimiterStatistics, IOType}; + use file_system::{IoOp, IoRateLimiter, IoRateLimiterStatistics, IoType}; use keys::data_key; use tempfile::Builder; use super::*; use crate::{ event_listener::RocksEventListener, raw::DBCompressionType, util::new_engine_opt, - RocksCfOptions, RocksDBOptions, RocksEngine, + RocksCfOptions, RocksDbOptions, RocksEngine, }; - fn new_test_db(dir: &str) -> (RocksEngine, Arc) { - let limiter = Arc::new(IORateLimiter::new_for_test()); - let mut db_opts = RocksDBOptions::default(); + fn new_test_db(dir: &str) -> (RocksEngine, Arc) { + let limiter = Arc::new(IoRateLimiter::new_for_test()); + let mut db_opts = RocksDbOptions::default(); db_opts.add_event_listener(RocksEventListener::new("test_db", None)); let env = get_env(None, Some(limiter.clone())).unwrap(); db_opts.set_env(env); @@ -81,16 +81,16 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); - assert_eq!(stats.fetch(IOType::Flush, IOOp::Write), 0); + assert_eq!(stats.fetch(IoType::Flush, IoOp::Write), 0); db.flush(true /* sync 
*/).unwrap(); - assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); + assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); + assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); db.flush(true /* sync */).unwrap(); - assert!(stats.fetch(IOType::Flush, IOOp::Write) > value_size * 2); - assert!(stats.fetch(IOType::Flush, IOOp::Write) < value_size * 2 + amplification_bytes); + assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); + assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.compact_range( CF_DEFAULT, None, // start_key @@ -99,14 +99,14 @@ mod tests { 1, // max_subcompactions ) .unwrap(); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) > value_size * 4); + assert!(stats.fetch(IoType::LevelZeroCompaction, IoOp::Read) > value_size * 4); assert!( - stats.fetch(IOType::LevelZeroCompaction, IOOp::Read) + stats.fetch(IoType::LevelZeroCompaction, IoOp::Read) < value_size * 4 + amplification_bytes ); - assert!(stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) > value_size * 3); + assert!(stats.fetch(IoType::LevelZeroCompaction, IoOp::Write) > value_size * 3); assert!( - stats.fetch(IOType::LevelZeroCompaction, IOOp::Write) + stats.fetch(IoType::LevelZeroCompaction, IoOp::Write) < value_size * 3 + amplification_bytes ); } diff --git a/components/engine_rocks/src/import.rs b/components/engine_rocks/src/import.rs index a64da35ae67..1aa65ec07fa 100644 --- a/components/engine_rocks/src/import.rs +++ b/components/engine_rocks/src/import.rs @@ -70,7 +70,7 @@ mod tests { use tempfile::Builder; use super::*; - use crate::{util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksSstWriterBuilder}; + use crate::{util::new_engine_opt, 
RocksCfOptions, RocksDbOptions, RocksSstWriterBuilder}; #[test] fn test_ingest_multiple_file() { @@ -90,7 +90,7 @@ mod tests { (*cf, opt) }) .collect(); - let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); let mut wb = db.write_batch(); for i in 1000..5000 { let v = i.to_string(); diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index a2e394bf8c8..f8b32c72a59 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -111,7 +111,7 @@ pub mod raw; pub fn get_env( key_manager: Option>, - limiter: Option>, + limiter: Option>, ) -> engine_traits::Result> { let env = encryption::get_env(None /* base_env */, key_manager)?; file_system::get_env(Some(env), limiter) diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index 9482dd12d25..b7b196448c5 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -21,9 +21,9 @@ impl Logger for RocksdbLogger { } #[derive(Default)] -pub struct RaftDBLogger; +pub struct RaftDbLogger; -impl Logger for RaftDBLogger { +impl Logger for RaftDbLogger { fn logv(&self, log_level: InfoLogLevel, log: &str) { match log_level { InfoLogLevel::Header => info!(#"raftdb_log_header", "{}", log), diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index ea6d48adb35..fd695bb4d2c 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -1,7 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{ - CFNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, + CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, }; use rocksdb::Range as RocksRange; @@ -343,7 +343,7 @@ mod tests { use crate::{ engine::RocksEngine, util::{new_engine, new_engine_opt}, - RocksCfOptions, RocksDBOptions, + RocksCfOptions, RocksDbOptions, }; fn check_data(db: &RocksEngine, cfs: &[&str], expected: &[(&[u8], &[u8])]) { @@ -520,7 +520,7 @@ mod tests { (*cf, cf_opts) }) .collect(); - let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); let keys = vec![b"k1", b"k2", b"k3", b"k4"]; @@ -552,7 +552,7 @@ mod tests { .unwrap(); let path_str = path.path().to_str().unwrap(); - let mut opts = RocksDBOptions::default(); + let mut opts = RocksDbOptions::default(); opts.create_if_missing(true); opts.enable_multi_batch_write(true); diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index c142ce01a74..8d049112f92 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -545,7 +545,7 @@ mod tests { use super::*; use crate::{ raw::{DBEntryType, TablePropertiesCollector}, - RocksCfOptions, RocksDBOptions, + RocksCfOptions, RocksDbOptions, }; #[allow(clippy::many_single_char_names)] @@ -715,7 +715,7 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = RocksDBOptions::default(); + let db_opts = RocksDbOptions::default(); let mut cf_opts = RocksCfOptions::default(); cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.add_table_properties_collector_factory( diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 
fd52342002f..f1e86903e9d 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -3,7 +3,7 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGCTask, Result, SyncMutable, WriteBatch, + RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, SyncMutable, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, }; use kvproto::{ @@ -289,7 +289,7 @@ impl RaftEngine for RocksEngine { self.put_msg(&keys::raft_state_key(raft_group_id), state) } - fn batch_gc(&self, groups: Vec) -> Result { + fn batch_gc(&self, groups: Vec) -> Result { let mut total = 0; let mut raft_wb = self.write_batch_with_cap(4 * 1024); for task in groups { diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index 94724b220f7..c107601c5d6 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -9,7 +9,7 @@ use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ - db_vector::RocksDBVector, options::RocksReadOptions, r2e, util::get_cf_handle, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, util::get_cf_handle, RocksEngineIterator, }; @@ -71,16 +71,16 @@ impl Iterable for RocksSnapshot { } impl Peekable for RocksSnapshot { - type DBVector = RocksDBVector; + type DbVector = RocksDbVector; - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { opt.set_snapshot(&self.snap); } let v = self.db.get_opt(key, &opt).map_err(r2e)?; - Ok(v.map(RocksDBVector::from_raw)) + Ok(v.map(RocksDbVector::from_raw)) } fn get_value_cf_opt( @@ -88,7 +88,7 @@ impl 
Peekable for RocksSnapshot { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result> { + ) -> Result> { let opt: RocksReadOptions = opts.into(); let mut opt = opt.into_raw(); unsafe { @@ -96,6 +96,6 @@ impl Peekable for RocksSnapshot { } let handle = get_cf_handle(self.db.as_ref(), cf)?; let v = self.db.get_cf_opt(handle, key, &opt).map_err(r2e)?; - Ok(v.map(RocksDBVector::from_raw)) + Ok(v.map(RocksDbVector::from_raw)) } } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 4192eecfcae..ebb18e92de5 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -10,7 +10,7 @@ use rocksdb::{ use slog_global::warn; use crate::{ - cf_options::RocksCfOptions, db_options::RocksDBOptions, engine::RocksEngine, r2e, + cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, rocks_metrics_defs::*, }; @@ -27,7 +27,7 @@ pub fn new_default_engine(path: &str) -> Result { } pub fn new_engine(path: &str, cfs: &[&str]) -> Result { - let mut db_opts = RocksDBOptions::default(); + let mut db_opts = RocksDbOptions::default(); db_opts.enable_statistics(true); let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); new_engine_opt(path, db_opts, cf_opts) @@ -35,7 +35,7 @@ pub fn new_engine(path: &str, cfs: &[&str]) -> Result { pub fn new_engine_opt( path: &str, - db_opt: RocksDBOptions, + db_opt: RocksDbOptions, cf_opts: Vec<(&str, RocksCfOptions)>, ) -> Result { let mut db_opt = db_opt.into_raw(); @@ -92,25 +92,24 @@ pub fn new_engine_opt( adjust_dynamic_level_bytes(&cf_descs, name, opt); } - // If all column families exist, just open db. - if existed == needed { - let db = DB::open_cf(db_opt, path, cf_opts.into_iter().collect()).map_err(r2e)?; + let cfds: Vec<_> = cf_opts.into_iter().collect(); + // We have added all missing options by iterating `existed`. If two vecs still + // have same length, then they must have same column families dispite their + // orders. 
So just open db. + if needed.len() == existed.len() && needed.len() == cfds.len() { + let db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; return Ok(RocksEngine::new(db)); } // Opens db. - let cfds = cf_opts.into_iter().collect(); db_opt.create_missing_column_families(true); let mut db = DB::open_cf(db_opt, path, cfds).map_err(r2e)?; // Drops discarded column families. - // for cf in existed.iter().filter(|x| needed.iter().find(|y| y == - // x).is_none()) { for cf in cfs_diff(&existed, &needed) { - // Never drop default column families. - if cf != CF_DEFAULT { - db.drop_cf(cf).map_err(r2e)?; - } + // We have checked it at the very beginning, so it must be needed. + assert_ne!(cf, CF_DEFAULT); + db.drop_cf(cf).map_err(r2e)?; } Ok(RocksEngine::new(db)) @@ -334,7 +333,7 @@ pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLeve #[cfg(test)] mod tests { - use engine_traits::{CFOptionsExt, CF_DEFAULT}; + use engine_traits::{CfOptionsExt, Peekable, SyncMutable, CF_DEFAULT}; use rocksdb::DB; use tempfile::Builder; @@ -369,7 +368,7 @@ mod tests { let mut opts = RocksCfOptions::default(); opts.set_level_compaction_dynamic_level_bytes(true); cfs_opts.push(("cf_dynamic_level_bytes", opts.clone())); - let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes"]); check_dynamic_level_bytes(&db); drop(db); @@ -378,11 +377,28 @@ mod tests { let cfs_opts = vec![ (CF_DEFAULT, opts.clone()), ("cf_dynamic_level_bytes", opts.clone()), - ("cf1", opts), + ("cf1", opts.clone()), ]; - let db = new_engine_opt(path_str, RocksDBOptions::default(), cfs_opts).unwrap(); + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); check_dynamic_level_bytes(&db); + for cf in 
&[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + db.put_cf(cf, b"k", b"v").unwrap(); + } + drop(db); + + // change order should not cause data corruption. + let cfs_opts = vec![ + ("cf_dynamic_level_bytes", opts.clone()), + ("cf1", opts.clone()), + (CF_DEFAULT, opts), + ]; + let db = new_engine_opt(path_str, RocksDbOptions::default(), cfs_opts).unwrap(); + column_families_must_eq(path_str, vec![CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"]); + check_dynamic_level_bytes(&db); + for cf in &[CF_DEFAULT, "cf_dynamic_level_bytes", "cf1"] { + assert_eq!(db.get_value_cf(cf, b"k").unwrap().unwrap(), b"v"); + } drop(db); // drop cf1. @@ -402,7 +418,7 @@ mod tests { } fn column_families_must_eq(path: &str, excepted: Vec<&str>) { - let opts = RocksDBOptions::default(); + let opts = RocksDbOptions::default(); let cfs_list = DB::list_column_families(&opts, path).unwrap(); let mut cfs_existed: Vec<&str> = cfs_list.iter().map(|v| v.as_str()).collect(); diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index f658fb046fb..e4028feb411 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -227,7 +227,7 @@ mod tests { use tempfile::Builder; use super::{ - super::{util::new_engine_opt, RocksDBOptions}, + super::{util::new_engine_opt, RocksDbOptions}, *, }; use crate::RocksCfOptions; @@ -244,7 +244,7 @@ mod tests { opt.enable_multi_batch_write(false); let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), - RocksDBOptions::from_raw(opt), + RocksDbOptions::from_raw(opt), vec![(CF_DEFAULT, RocksCfOptions::default())], ) .unwrap(); @@ -290,7 +290,7 @@ mod tests { opt.enable_multi_batch_write(true); let engine = new_engine_opt( path.path().join("db").to_str().unwrap(), - RocksDBOptions::from_raw(opt), + RocksDbOptions::from_raw(opt), vec![(CF_DEFAULT, RocksCfOptions::default())], ) .unwrap(); diff --git a/components/engine_test/src/lib.rs 
b/components/engine_test/src/lib.rs index ada430261e3..979fbda17d0 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -65,9 +65,9 @@ pub mod raft { #[cfg(feature = "test-engine-raft-raft-engine")] pub use raft_log_engine::RaftLogEngine as RaftTestEngine; - use crate::ctor::{RaftDBOptions, RaftEngineConstructorExt}; + use crate::ctor::{RaftDbOptions, RaftEngineConstructorExt}; - pub fn new_engine(path: &str, db_opt: Option) -> Result { + pub fn new_engine(path: &str, db_opt: Option) -> Result { RaftTestEngine::new_raft_engine(path, db_opt) } } @@ -91,11 +91,11 @@ pub mod kv { RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + CfOptions, CfOptionsExt, Result, TabletAccessor, TabletFactory, CF_DEFAULT, }; use tikv_util::box_err; - use crate::ctor::{ColumnFamilyOptions as KvTestCFOptions, DBOptions, KvEngineConstructorExt}; + use crate::ctor::{CfOptions as KvTestCfOptions, DbOptions, KvEngineConstructorExt}; pub fn new_engine(path: &str, cfs: &[&str]) -> Result { KvTestEngine::new_kv_engine(path, cfs) @@ -103,8 +103,8 @@ pub mod kv { pub fn new_engine_opt( path: &str, - db_opt: DBOptions, - cfs_opts: Vec<(&str, KvTestCFOptions)>, + db_opt: DbOptions, + cfs_opts: Vec<(&str, KvTestCfOptions)>, ) -> Result { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } @@ -114,16 +114,16 @@ pub mod kv { #[derive(Clone)] pub struct TestTabletFactory { root_path: String, - db_opt: DBOptions, - cf_opts: Vec<(&'static str, KvTestCFOptions)>, + db_opt: DbOptions, + cf_opts: Vec<(&'static str, KvTestCfOptions)>, registry: Arc>>, } impl TestTabletFactory { pub fn new( root_path: &str, - db_opt: DBOptions, - cf_opts: Vec<(&'static str, KvTestCFOptions)>, + db_opt: DbOptions, + cf_opts: Vec<(&'static str, KvTestCfOptions)>, ) -> Self { Self { root_path: root_path.to_string(), @@ -312,7 +312,7 @@ pub mod ctor { 
use encryption::DataKeyManager; use engine_traits::Result; - use file_system::IORateLimiter; + use file_system::IoRateLimiter; /// Kv engine construction /// @@ -341,30 +341,30 @@ pub mod ctor { /// If that directory does not exist, then it is created. fn new_kv_engine_opt( path: &str, - db_opt: DBOptions, - cf_opts: Vec<(&str, ColumnFamilyOptions)>, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, ) -> Result; } /// Raft engine construction pub trait RaftEngineConstructorExt: Sized { /// Create a new raft engine. - fn new_raft_engine(path: &str, db_opt: Option) -> Result; + fn new_raft_engine(path: &str, db_opt: Option) -> Result; } #[derive(Clone, Default)] - pub struct DBOptions { + pub struct DbOptions { key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, enable_multi_batch_write: bool, } - impl DBOptions { + impl DbOptions { pub fn set_key_manager(&mut self, key_manager: Option>) { self.key_manager = key_manager; } - pub fn set_rate_limiter(&mut self, rate_limiter: Option>) { + pub fn set_rate_limiter(&mut self, rate_limiter: Option>) { self.rate_limiter = rate_limiter; } @@ -373,7 +373,7 @@ pub mod ctor { } } - pub type RaftDBOptions = DBOptions; + pub type RaftDbOptions = DbOptions; /// Properties for a single column family /// @@ -397,7 +397,7 @@ pub mod ctor { /// In the future TiKV will probably have engine-specific configuration /// options. 
#[derive(Clone)] - pub struct ColumnFamilyOptions { + pub struct CfOptions { disable_auto_compactions: bool, level_zero_file_num_compaction_trigger: Option, level_zero_slowdown_writes_trigger: Option, @@ -409,9 +409,9 @@ pub mod ctor { no_table_properties: bool, } - impl ColumnFamilyOptions { - pub fn new() -> ColumnFamilyOptions { - ColumnFamilyOptions { + impl CfOptions { + pub fn new() -> CfOptions { + CfOptions { disable_auto_compactions: false, level_zero_file_num_compaction_trigger: None, level_zero_slowdown_writes_trigger: None, @@ -461,7 +461,7 @@ pub mod ctor { } } - impl Default for ColumnFamilyOptions { + impl Default for CfOptions { fn default() -> Self { Self::new() } @@ -471,9 +471,7 @@ pub mod ctor { use engine_panic::PanicEngine; use engine_traits::Result; - use super::{ - ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftEngineConstructorExt, - }; + use super::{CfOptions, DbOptions, KvEngineConstructorExt, RaftEngineConstructorExt}; impl KvEngineConstructorExt for engine_panic::PanicEngine { fn new_kv_engine(_path: &str, _cfs: &[&str]) -> Result { @@ -482,15 +480,15 @@ pub mod ctor { fn new_kv_engine_opt( _path: &str, - _db_opt: DBOptions, - _cfs_opts: Vec<(&str, ColumnFamilyOptions)>, + _db_opt: DbOptions, + _cfs_opts: Vec<(&str, CfOptions)>, ) -> Result { Ok(PanicEngine) } } impl RaftEngineConstructorExt for engine_panic::PanicEngine { - fn new_raft_engine(_path: &str, _db_opt: Option) -> Result { + fn new_raft_engine(_path: &str, _db_opt: Option) -> Result { Ok(PanicEngine) } } @@ -501,21 +499,20 @@ pub mod ctor { get_env, properties::{MvccPropertiesCollectorFactory, RangePropertiesCollectorFactory}, util::new_engine_opt as rocks_new_engine_opt, - RocksCfOptions, RocksDBOptions, + RocksCfOptions, RocksDbOptions, }; - use engine_traits::{ColumnFamilyOptions as ColumnFamilyOptionsTrait, Result, CF_DEFAULT}; + use engine_traits::{CfOptions as _, Result, CF_DEFAULT}; use super::{ - ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, 
RaftDBOptions, - RaftEngineConstructorExt, + CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions, RaftEngineConstructorExt, }; impl KvEngineConstructorExt for engine_rocks::RocksEngine { // FIXME this is duplicating behavior from engine_rocks::util in order to // call set_standard_cf_opts. fn new_kv_engine(path: &str, cfs: &[&str]) -> Result { - let rocks_db_opt = RocksDBOptions::default(); - let default_cf_opt = ColumnFamilyOptions::new(); + let rocks_db_opt = RocksDbOptions::default(); + let default_cf_opt = CfOptions::new(); let rocks_cfs_opts = cfs .iter() .map(|cf_name| (*cf_name, get_rocks_cf_opts(&default_cf_opt))) @@ -525,8 +522,8 @@ pub mod ctor { fn new_kv_engine_opt( path: &str, - db_opt: DBOptions, - cfs_opts: Vec<(&str, ColumnFamilyOptions)>, + db_opt: DbOptions, + cfs_opts: Vec<(&str, CfOptions)>, ) -> Result { let rocks_db_opts = get_rocks_db_opts(db_opt)?; let rocks_cfs_opts = cfs_opts @@ -538,18 +535,18 @@ pub mod ctor { } impl RaftEngineConstructorExt for engine_rocks::RocksEngine { - fn new_raft_engine(path: &str, db_opt: Option) -> Result { + fn new_raft_engine(path: &str, db_opt: Option) -> Result { let rocks_db_opts = match db_opt { Some(db_opt) => get_rocks_db_opts(db_opt)?, - None => RocksDBOptions::default(), + None => RocksDbOptions::default(), }; - let rocks_cf_opts = get_rocks_cf_opts(&ColumnFamilyOptions::new()); + let rocks_cf_opts = get_rocks_cf_opts(&CfOptions::new()); let default_cfs_opts = vec![(CF_DEFAULT, rocks_cf_opts)]; rocks_new_engine_opt(path, rocks_db_opts, default_cfs_opts) } } - fn get_rocks_cf_opts(cf_opts: &ColumnFamilyOptions) -> RocksCfOptions { + fn get_rocks_cf_opts(cf_opts: &CfOptions) -> RocksCfOptions { let mut rocks_cf_opts = RocksCfOptions::new(); if !cf_opts.get_no_range_properties() { rocks_cf_opts.add_table_properties_collector_factory( @@ -576,8 +573,8 @@ pub mod ctor { rocks_cf_opts } - fn get_rocks_db_opts(db_opts: DBOptions) -> Result { - let mut rocks_db_opts = RocksDBOptions::default(); + fn 
get_rocks_db_opts(db_opts: DbOptions) -> Result { + let mut rocks_db_opts = RocksDbOptions::default(); let env = get_env(db_opts.key_manager.clone(), db_opts.rate_limiter)?; rocks_db_opts.set_env(env); if db_opts.enable_multi_batch_write { @@ -593,10 +590,10 @@ pub mod ctor { use engine_traits::Result; use raft_log_engine::{RaftEngineConfig, RaftLogEngine}; - use super::{RaftDBOptions, RaftEngineConstructorExt}; + use super::{RaftDbOptions, RaftEngineConstructorExt}; impl RaftEngineConstructorExt for raft_log_engine::RaftLogEngine { - fn new_raft_engine(path: &str, db_opts: Option) -> Result { + fn new_raft_engine(path: &str, db_opts: Option) -> Result { let mut config = RaftEngineConfig::default(); config.dir = path.to_owned(); RaftLogEngine::new( diff --git a/components/engine_traits/src/cf_names.rs b/components/engine_traits/src/cf_names.rs index 714139c8530..c33ac11081a 100644 --- a/components/engine_traits/src/cf_names.rs +++ b/components/engine_traits/src/cf_names.rs @@ -1,5 +1,5 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -pub trait CFNamesExt { +pub trait CfNamesExt { fn cf_names(&self) -> Vec<&str>; } diff --git a/components/engine_traits/src/cf_options.rs b/components/engine_traits/src/cf_options.rs index 6498238280f..a43b01670ed 100644 --- a/components/engine_traits/src/cf_options.rs +++ b/components/engine_traits/src/cf_options.rs @@ -1,17 +1,17 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use crate::{db_options::TitanDBOptions, sst_partitioner::SstPartitionerFactory, Result}; +use crate::{db_options::TitanDbOptions, sst_partitioner::SstPartitionerFactory, Result}; /// Trait for engines with column family options -pub trait CFOptionsExt { - type ColumnFamilyOptions: ColumnFamilyOptions; +pub trait CfOptionsExt { + type CfOptions: CfOptions; - fn get_options_cf(&self, cf: &str) -> Result; + fn get_options_cf(&self, cf: &str) -> Result; fn set_options_cf(&self, cf: &str, options: &[(&str, &str)]) -> Result<()>; } -pub trait ColumnFamilyOptions { - type TitanDBOptions: TitanDBOptions; +pub trait CfOptions { + type TitanDbOptions: TitanDbOptions; fn new() -> Self; fn get_max_write_buffer_number(&self) -> u32; @@ -22,7 +22,7 @@ pub trait ColumnFamilyOptions { fn get_hard_pending_compaction_bytes_limit(&self) -> u64; fn get_block_cache_capacity(&self) -> u64; fn set_block_cache_capacity(&self, capacity: u64) -> Result<()>; - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions); + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); fn get_target_file_size_base(&self) -> u64; fn set_disable_auto_compactions(&mut self, v: bool); fn get_disable_auto_compactions(&self) -> bool; diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index 7a6042d3db4..6fbd61b4833 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -3,16 +3,16 @@ use crate::errors::Result; /// A trait for engines that support setting global options -pub trait DBOptionsExt { - type DBOptions: DBOptions; +pub trait DbOptionsExt { + type DbOptions: DbOptions; - fn get_db_options(&self) -> Self::DBOptions; + fn get_db_options(&self) -> Self::DbOptions; fn set_db_options(&self, options: &[(&str, &str)]) -> Result<()>; } /// A handle to a database's options -pub trait DBOptions { - type TitanDBOptions: TitanDBOptions; +pub trait DbOptions { + type TitanDbOptions: TitanDbOptions; 
fn new() -> Self; fn get_max_background_jobs(&self) -> i32; @@ -20,11 +20,11 @@ pub trait DBOptions { fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()>; fn get_rate_limiter_auto_tuned(&self) -> Option; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; - fn set_titandb_options(&mut self, opts: &Self::TitanDBOptions); + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); } /// Titan-specefic options -pub trait TitanDBOptions { +pub trait TitanDbOptions { fn new() -> Self; fn set_min_blob_size(&mut self, size: u64); } diff --git a/components/engine_traits/src/db_vector.rs b/components/engine_traits/src/db_vector.rs index 9caf55d9e22..08bea9f11e5 100644 --- a/components/engine_traits/src/db_vector.rs +++ b/components/engine_traits/src/db_vector.rs @@ -6,4 +6,4 @@ use std::{fmt::Debug, ops::Deref}; /// /// The database may optimize this type to be a view into /// its own cache. -pub trait DBVector: Debug + Deref + for<'a> PartialEq<&'a [u8]> {} +pub trait DbVector: Debug + Deref + for<'a> PartialEq<&'a [u8]> {} diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 1ffbdec1df5..276fb1ed19a 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -23,9 +23,9 @@ pub trait KvEngine: + SyncMutable + Iterable + WriteBatchExt - + DBOptionsExt - + CFNamesExt - + CFOptionsExt + + DbOptionsExt + + CfNamesExt + + CfOptionsExt + ImportExt + SstExt + CompactExt @@ -229,7 +229,7 @@ pub trait TabletFactory: TabletAccessor { pub struct DummyFactory where - EK: CFOptionsExt + Clone + Send + 'static, + EK: CfOptionsExt + Clone + Send + 'static, { pub engine: Option, pub root_path: String, @@ -237,7 +237,7 @@ where impl TabletFactory for DummyFactory where - EK: CFOptionsExt + Clone + Send + 'static, + EK: CfOptionsExt + Clone + Send + 'static, { fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { 
Ok(self.engine.as_ref().unwrap().clone()) @@ -273,7 +273,7 @@ where } impl TabletAccessor for DummyFactory where - EK: CFOptionsExt + Clone + Send + 'static, + EK: CfOptionsExt + Clone + Send + 'static, { fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &EK)) { if let Some(engine) = &self.engine { @@ -288,14 +288,14 @@ where impl DummyFactory where - EK: CFOptionsExt + Clone + Send + 'static, + EK: CfOptionsExt + Clone + Send + 'static, { pub fn new(engine: Option, root_path: String) -> DummyFactory { DummyFactory { engine, root_path } } } -impl Default for DummyFactory { +impl Default for DummyFactory { fn default() -> Self { Self::new(None, "/tmp".to_string()) } diff --git a/components/engine_traits/src/file_system.rs b/components/engine_traits/src/file_system.rs index 1671c1f0aab..51911b1f58e 100644 --- a/components/engine_traits/src/file_system.rs +++ b/components/engine_traits/src/file_system.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use file_system::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; +use file_system::{get_io_rate_limiter, get_io_type, IoOp, IoRateLimiter}; use crate::Result; @@ -12,7 +12,7 @@ pub trait FileSystemInspector: Sync + Send { } pub struct EngineFileSystemInspector { - limiter: Option>, + limiter: Option>, } impl EngineFileSystemInspector { @@ -23,7 +23,7 @@ impl EngineFileSystemInspector { } } - pub fn from_limiter(limiter: Option>) -> Self { + pub fn from_limiter(limiter: Option>) -> Self { EngineFileSystemInspector { limiter } } } @@ -38,7 +38,7 @@ impl FileSystemInspector for EngineFileSystemInspector { fn read(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); - Ok(limiter.request(io_type, IOOp::Read, len)) + Ok(limiter.request(io_type, IoOp::Read, len)) } else { Ok(len) } @@ -47,7 +47,7 @@ impl FileSystemInspector for EngineFileSystemInspector { fn write(&self, len: usize) -> Result { if let Some(limiter) = &self.limiter { let io_type = get_io_type(); - 
Ok(limiter.request(io_type, IOOp::Write, len)) + Ok(limiter.request(io_type, IoOp::Write, len)) } else { Ok(len) } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 191e5dcb204..0e709d164bd 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -328,7 +328,7 @@ pub use crate::range::*; mod raft_engine; pub use raft_engine::{ - CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGCTask, + CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, RAFT_LOG_MULTI_GET_CNT, }; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 67e32e40bdd..0e6b9600da6 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -6,7 +6,7 @@ //! FIXME: Things here need to be moved elsewhere. use crate::{ - cf_names::CFNamesExt, errors::Result, flow_control_factors::FlowControlFactorsExt, range::Range, + cf_names::CfNamesExt, errors::Result, flow_control_factors::FlowControlFactorsExt, range::Range, }; #[derive(Clone, Debug)] @@ -28,7 +28,7 @@ pub enum DeleteStrategy { DeleteByWriter { sst_path: String }, } -pub trait MiscExt: CFNamesExt + FlowControlFactorsExt { +pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn flush(&self, sync: bool) -> Result<()>; fn flush_cf(&self, cf: &str, sync: bool) -> Result<()>; diff --git a/components/engine_traits/src/peekable.rs b/components/engine_traits/src/peekable.rs index 23318b2a233..fe9e3600abe 100644 --- a/components/engine_traits/src/peekable.rs +++ b/components/engine_traits/src/peekable.rs @@ -10,14 +10,14 @@ use crate::*; /// to read from, or to encode the value as a protobuf message. pub trait Peekable { /// The byte-vector type through which the database returns read values. - type DBVector: DBVector; + type DbVector: DbVector; /// Read a value for a key, given a set of options. 
/// /// Reads from the default column family. /// /// Returns `None` if they key does not exist. - fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result>; + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result>; /// Read a value for a key from a given column family, given a set of /// options. @@ -28,14 +28,14 @@ pub trait Peekable { opts: &ReadOptions, cf: &str, key: &[u8], - ) -> Result>; + ) -> Result>; /// Read a value for a key. /// /// Uses the default options and column family. /// /// Returns `None` if the key does not exist. - fn get_value(&self, key: &[u8]) -> Result> { + fn get_value(&self, key: &[u8]) -> Result> { self.get_value_opt(&ReadOptions::default(), key) } @@ -44,7 +44,7 @@ pub trait Peekable { /// Uses the default options. /// /// Returns `None` if the key does not exist. - fn get_value_cf(&self, cf: &str, key: &[u8]) -> Result> { + fn get_value_cf(&self, cf: &str, key: &[u8]) -> Result> { self.get_value_cf_opt(&ReadOptions::default(), cf, key) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index d94d69fa335..a7bd66d3230 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -59,7 +59,7 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { } } -pub struct RaftLogGCTask { +pub struct RaftLogGcTask { pub raft_group_id: u64, pub from: u64, pub to: u64, @@ -107,7 +107,7 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send /// count. Generally, `from` can be passed in `0`. 
fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; - fn batch_gc(&self, tasks: Vec) -> Result { + fn batch_gc(&self, tasks: Vec) -> Result { let mut total = 0; for task in tasks { total += self.gc(task.raft_group_id, task.from, task.to)?; diff --git a/components/engine_traits_tests/src/cf_names.rs b/components/engine_traits_tests/src/cf_names.rs index 48031275b14..2cac1eaff73 100644 --- a/components/engine_traits_tests/src/cf_names.rs +++ b/components/engine_traits_tests/src/cf_names.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{CFNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT}; +use engine_traits::{CfNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT}; use super::{default_engine, engine_cfs}; diff --git a/components/engine_traits_tests/src/ctor.rs b/components/engine_traits_tests/src/ctor.rs index 5f39ad4f3a7..2ab7a7360a7 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -5,7 +5,7 @@ use std::fs; use engine_test::{ - ctor::{ColumnFamilyOptions, DBOptions, KvEngineConstructorExt}, + ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, kv::KvTestEngine, }; use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; @@ -23,11 +23,8 @@ fn new_engine_basic() { fn new_engine_opt_basic() { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| (*cf, ColumnFamilyOptions::new())) - .collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let _db = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts).unwrap(); } @@ -47,11 +44,8 @@ fn new_engine_opt_missing_dir() { let dir = tempdir(); let path = dir.path(); let path = path.join("missing").to_str().unwrap().to_owned(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| (*cf, ColumnFamilyOptions::new())) - 
.collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let db = KvTestEngine::new_kv_engine_opt(&path, db_opts, cf_opts).unwrap(); db.put(b"foo", b"bar").unwrap(); db.sync().unwrap(); @@ -90,11 +84,8 @@ fn new_engine_opt_readonly_dir() { fs::set_permissions(&path, perms).unwrap(); let path = path.to_str().unwrap(); - let db_opts = DBOptions::default(); - let cf_opts = ALL_CFS - .iter() - .map(|cf| (*cf, ColumnFamilyOptions::new())) - .collect(); + let db_opts = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let err = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts); assert!(err.is_err()); diff --git a/components/engine_traits_tests/src/lib.rs b/components/engine_traits_tests/src/lib.rs index 73c741ff925..d9b6af12f09 100644 --- a/components/engine_traits_tests/src/lib.rs +++ b/components/engine_traits_tests/src/lib.rs @@ -75,8 +75,7 @@ fn default_engine() -> TempDirEnginePair { fn multi_batch_write_engine() -> TempDirEnginePair { use engine_test::{ ctor::{ - ColumnFamilyOptions as KvTestCFOptions, DBOptions as KvTestDBOptions, - KvEngineConstructorExt, + CfOptions as KvTestCfOptions, DbOptions as KvTestDbOptions, KvEngineConstructorExt, }, kv::KvTestEngine, }; @@ -84,10 +83,10 @@ fn multi_batch_write_engine() -> TempDirEnginePair { let dir = tempdir(); let path = dir.path().to_str().unwrap(); - let mut opt = KvTestDBOptions::default(); + let mut opt = KvTestDbOptions::default(); opt.set_enable_multi_batch_write(true); let engine = - KvTestEngine::new_kv_engine_opt(path, opt, vec![(CF_DEFAULT, KvTestCFOptions::new())]) + KvTestEngine::new_kv_engine_opt(path, opt, vec![(CF_DEFAULT, KvTestCfOptions::new())]) .unwrap(); TempDirEnginePair { engine, diff --git a/components/engine_traits_tests/src/scenario_writes.rs b/components/engine_traits_tests/src/scenario_writes.rs index 3e250c21198..c9b1b1d5fb7 100644 --- 
a/components/engine_traits_tests/src/scenario_writes.rs +++ b/components/engine_traits_tests/src/scenario_writes.rs @@ -105,7 +105,7 @@ impl WriteScenarioEngine { } } - fn get_value(&self, key: &[u8]) -> Result::DBVector>> { + fn get_value(&self, key: &[u8]) -> Result::DbVector>> { use WriteScenario::*; match self.scenario { NoCf | DefaultCf | WriteBatchNoCf | WriteBatchDefaultCf => { diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index 00048522752..0fb24ef48ce 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -29,7 +29,7 @@ pub use external_storage::{ }; use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] -pub use gcp::{Config as GCSConfig, GCSStorage}; +pub use gcp::{Config as GcsConfig, GcsStorage}; pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; #[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature = "cloud-azure"))] use kvproto::brpb::{AzureBlobStorage, Gcs, S3}; @@ -139,7 +139,7 @@ fn create_config(backend: &Backend) -> Option>> { } #[cfg(feature = "cloud-gcp")] Backend::Gcs(config) => { - let conf = GCSConfig::from_input(config.clone()); + let conf = GcsConfig::from_input(config.clone()); Some(conf.map(|c| Box::new(c) as Box)) } #[cfg(feature = "cloud-azure")] @@ -155,7 +155,7 @@ fn create_config(backend: &Backend) -> Option>> { } #[cfg(feature = "cloud-gcp")] "gcp" | "gcs" => { - let conf = GCSConfig::from_cloud_dynamic(&dyn_backend); + let conf = GcsConfig::from_cloud_dynamic(&dyn_backend); Some(conf.map(|c| Box::new(c) as Box)) } #[cfg(feature = "cloud-azure")] @@ -191,14 +191,14 @@ fn create_backend_inner( blob_store(s) } #[cfg(feature = "cloud-gcp")] - Backend::Gcs(config) => blob_store(GCSStorage::from_input(config.clone())?), + Backend::Gcs(config) => blob_store(GcsStorage::from_input(config.clone())?), #[cfg(feature = "cloud-azure")] Backend::AzureBlobStorage(config) => 
blob_store(AzureStorage::from_input(config.clone())?), Backend::CloudDynamic(dyn_backend) => match dyn_backend.provider_name.as_str() { #[cfg(feature = "cloud-aws")] "aws" | "s3" => blob_store(S3Storage::from_cloud_dynamic(dyn_backend)?), #[cfg(feature = "cloud-gcp")] - "gcp" | "gcs" => blob_store(GCSStorage::from_cloud_dynamic(dyn_backend)?), + "gcp" | "gcs" => blob_store(GcsStorage::from_cloud_dynamic(dyn_backend)?), #[cfg(feature = "cloud-azure")] "azure" | "azblob" => blob_store(AzureStorage::from_cloud_dynamic(dyn_backend)?), _ => { diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 8c9ea242b98..afae433e54a 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -16,7 +16,7 @@ use std::{ }; use async_trait::async_trait; -use encryption::{encryption_method_from_db_encryption_method, DecrypterReader, Iv}; +use encryption::{from_engine_encryption_method, DecrypterReader, Iv}; use engine_traits::FileEncryptionInfo; use file_system::File; use futures_io::AsyncRead; @@ -152,7 +152,7 @@ pub fn encrypt_wrap_reader<'a>( let input = match file_crypter { Some(x) => Box::new(DecrypterReader::new( reader, - encryption_method_from_db_encryption_method(x.method), + from_engine_encryption_method(x.method), &x.key, Iv::from_slice(&x.iv)?, )?), diff --git a/components/file_system/src/file.rs b/components/file_system/src/file.rs index 1c56b240f1d..c072b8f852f 100644 --- a/components/file_system/src/file.rs +++ b/components/file_system/src/file.rs @@ -13,13 +13,13 @@ use std::{ // Extention Traits use fs2::FileExt; -use super::{get_io_rate_limiter, get_io_type, IOOp, IORateLimiter}; +use super::{get_io_rate_limiter, get_io_type, IoOp, IoRateLimiter}; /// A wrapper around `std::fs::File` with capability to track and regulate IO /// flow. 
pub struct File { inner: fs::File, - limiter: Option>, + limiter: Option>, } impl Debug for File { @@ -40,7 +40,7 @@ impl File { #[cfg(test)] pub fn open_with_limiter>( path: P, - limiter: Option>, + limiter: Option>, ) -> io::Result { let inner = fs::File::open(path)?; Ok(File { inner, limiter }) @@ -57,7 +57,7 @@ impl File { #[cfg(test)] pub fn create_with_limiter>( path: P, - limiter: Option>, + limiter: Option>, ) -> io::Result { let inner = fs::File::create(path)?; Ok(File { inner, limiter }) @@ -105,7 +105,7 @@ impl Read for File { let mut remains = buf.len(); let mut pos = 0; while remains > 0 { - let allowed = limiter.request(get_io_type(), IOOp::Read, remains); + let allowed = limiter.request(get_io_type(), IoOp::Read, remains); let read = self.inner.read(&mut buf[pos..pos + allowed])?; pos += read; remains -= read; @@ -132,7 +132,7 @@ impl Write for File { let mut remains = buf.len(); let mut pos = 0; while remains > 0 { - let allowed = limiter.request(get_io_type(), IOOp::Write, remains); + let allowed = limiter.request(get_io_type(), IoOp::Write, remains); let written = self.inner.write(&buf[pos..pos + allowed])?; pos += written; remains -= written; @@ -262,7 +262,7 @@ mod tests { .prefix("test_instrumented_file") .tempdir() .unwrap(); - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); // make sure read at most one bytes at a time limiter.set_io_rate_limit(20 /* 1s / refill_period */); let stats = limiter.statistics().unwrap(); @@ -270,24 +270,24 @@ mod tests { let tmp_file = tmp_dir.path().join("instrumented.txt"); let content = String::from("drink full and descend"); { - let _guard = WithIOType::new(IOType::ForegroundWrite); + let _guard = WithIoType::new(IoType::ForegroundWrite); let mut f = File::create_with_limiter(&tmp_file, Some(limiter.clone())).unwrap(); f.write_all(content.as_bytes()).unwrap(); f.sync_all().unwrap(); assert_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write), 
+ stats.fetch(IoType::ForegroundWrite, IoOp::Write), content.len() ); } { - let _guard = WithIOType::new(IOType::Export); + let _guard = WithIoType::new(IoType::Export); let mut buffer = String::new(); let mut f = File::open_with_limiter(&tmp_file, Some(limiter)).unwrap(); assert_eq!(f.read_to_string(&mut buffer).unwrap(), content.len()); assert_eq!(buffer, content); // read_to_string only exit when file.read() returns zero, which means // it requires two EOF reads to finish the call. - assert_eq!(stats.fetch(IOType::Export, IOOp::Read), content.len() + 2); + assert_eq!(stats.fetch(IoType::Export, IoOp::Read), content.len() + 2); } } diff --git a/components/file_system/src/io_stats/biosnoop.rs b/components/file_system/src/io_stats/biosnoop.rs index d156d94f77c..6b804bfed87 100644 --- a/components/file_system/src/io_stats/biosnoop.rs +++ b/components/file_system/src/io_stats/biosnoop.rs @@ -14,7 +14,7 @@ use crossbeam_utils::CachePadded; use strum::{EnumCount, IntoEnumIterator}; use tikv_util::sys::thread; -use crate::{metrics::*, IOBytes, IOType}; +use crate::{metrics::*, IoBytes, IoType}; /// Biosnoop leverages BCC to make use of eBPF to get disk IO of TiKV requests. /// The BCC code is in `biosnoop.c` which is compiled and attached kernel on @@ -29,7 +29,7 @@ use crate::{metrics::*, IOBytes, IOType}; /// by address, then all the IO requests for that thread will be recorded in /// corresponding type's map in BCC. /// -/// With that information, every time calling `IOContext` it get the stored +/// With that information, every time calling `IoContext` it get the stored /// stats from corresponding type's map in BCC. Thus it enables TiKV to get the /// latency and bytes of read/write request per IO-type. @@ -37,9 +37,9 @@ const MAX_THREAD_IDX: usize = 192; // Hold the BPF to keep it not dropped. // The two tables are `stats_by_type` and `type_by_pid` respectively. 
-static mut BPF_CONTEXT: Option = None; +static mut BPF_CONTEXT: Option = None; -struct BPFContext { +struct BpfContext { bpf: BPF, stats_table: Table, type_table: Table, @@ -56,9 +56,9 @@ struct BPFContext { // and kernel. Thus no need to make the elements atomic. Also use padding to // avoid false sharing. // Leave the last element as reserved, when there is no available index, all -// other threads will be allocated to that index with IOType::Other always. -static mut IO_TYPE_ARRAY: [CachePadded; MAX_THREAD_IDX + 1] = - [CachePadded::new(IOType::Other); MAX_THREAD_IDX + 1]; +// other threads will be allocated to that index with IoType::Other always. +static mut IO_TYPE_ARRAY: [CachePadded; MAX_THREAD_IDX + 1] = + [CachePadded::new(IoType::Other); MAX_THREAD_IDX + 1]; // The index of the element of IO_TYPE_ARRAY for this thread to access. thread_local! { @@ -71,7 +71,7 @@ thread_local! { &mut tid.to_ne_bytes(), std::slice::from_raw_parts_mut( ptr as *mut u8, - std::mem::size_of::<*const IOType>(), + std::mem::size_of::<*const IoType>(), ), ).unwrap(); } @@ -83,7 +83,7 @@ struct IdxWrapper(usize); impl Drop for IdxWrapper { fn drop(&mut self) { - unsafe { *IO_TYPE_ARRAY[self.0] = IOType::Other }; + unsafe { *IO_TYPE_ARRAY[self.0] = IoType::Other }; IDX_ALLOCATOR.free(self.0); // drop() of static variables won't be called when program exits. 
@@ -134,10 +134,10 @@ impl IdxAllocator { } } -pub fn set_io_type(new_io_type: IOType) { +pub fn set_io_type(new_io_type: IoType) { unsafe { IDX.with(|idx| { - // if MAX_THREAD_IDX, keep IOType::Other always + // if MAX_THREAD_IDX, keep IoType::Other always if idx.0 != MAX_THREAD_IDX { *IO_TYPE_ARRAY[idx.0] = new_io_type; } @@ -145,22 +145,22 @@ pub fn set_io_type(new_io_type: IOType) { }; } -pub fn get_io_type() -> IOType { +pub fn get_io_type() -> IoType { unsafe { *IDX.with(|idx| IO_TYPE_ARRAY[idx.0]) } } -pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { +pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { let mut bytes = Default::default(); unsafe { if let Some(ctx) = BPF_CONTEXT.as_mut() { - for io_type in IOType::iter() { - let io_type_buf_ptr = &mut io_type as *mut IOType as *mut u8; + for io_type in IoType::iter() { + let io_type_buf_ptr = &mut io_type as *mut IoType as *mut u8; let mut io_type_buf = - std::slice::from_raw_parts_mut(io_type_buf_ptr, std::mem::size_of::()); + std::slice::from_raw_parts_mut(io_type_buf_ptr, std::mem::size_of::()); if let Ok(e) = ctx.stats_table.get(&mut io_type_buf) { - assert!(e.len() == std::mem::size_of::()); + assert!(e.len() == std::mem::size_of::()); bytes[io_type as usize] = - std::ptr::read_unaligned(e.as_ptr() as *const IOBytes); + std::ptr::read_unaligned(e.as_ptr() as *const IoBytes); } } } @@ -210,7 +210,7 @@ pub fn init() -> Result<(), String> { let stats_table = bpf.table("stats_by_type").map_err(|e| e.to_string())?; let type_table = bpf.table("type_by_pid").map_err(|e| e.to_string())?; unsafe { - BPF_CONTEXT = Some(BPFContext { + BPF_CONTEXT = Some(BpfContext { bpf, stats_table, type_table, @@ -286,7 +286,7 @@ mod tests { fetch_io_bytes, flush_io_latency_metrics, get_io_type, init, set_io_type, BPF_CONTEXT, MAX_THREAD_IDX, }; - use crate::{metrics::*, IOType, OpenOptions}; + use crate::{metrics::*, IoType, OpenOptions}; #[test] fn test_biosnoop() { @@ -301,8 +301,8 @@ mod tests { } fn test_io_context() { 
- set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Compaction); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Compaction); let tmp = TempDir::new().unwrap(); let file_path = tmp.path().join("test_io_context"); let mut f = OpenOptions::new() @@ -313,18 +313,18 @@ mod tests { .unwrap(); let mut w = vec![A512::default(); 2]; w.as_bytes_mut()[512] = 42; - let mut compaction_bytes_before = fetch_io_bytes()[IOType::Compaction as usize]; + let mut compaction_bytes_before = fetch_io_bytes()[IoType::Compaction as usize]; f.write(w.as_bytes()).unwrap(); f.sync_all().unwrap(); - let compaction_bytes = fetch_io_bytes()[IOType::Compaction as usize]; + let compaction_bytes = fetch_io_bytes()[IoType::Compaction as usize]; assert_ne!((compaction_bytes - compaction_bytes_before).write, 0); assert_eq!((compaction_bytes - compaction_bytes_before).read, 0); compaction_bytes_before = compaction_bytes; drop(f); - let other_bytes_before = fetch_io_bytes()[IOType::Other as usize]; + let other_bytes_before = fetch_io_bytes()[IoType::Other as usize]; std::thread::spawn(move || { - set_io_type(IOType::Other); + set_io_type(IoType::Other); let mut f = OpenOptions::new() .read(true) .custom_flags(O_DIRECT) @@ -337,8 +337,8 @@ mod tests { .join() .unwrap(); - let compaction_bytes = fetch_io_bytes()[IOType::Compaction as usize]; - let other_bytes = fetch_io_bytes()[IOType::Other as usize]; + let compaction_bytes = fetch_io_bytes()[IoType::Compaction as usize]; + let other_bytes = fetch_io_bytes()[IoType::Other as usize]; assert_eq!((compaction_bytes - compaction_bytes_before).write, 0); assert_eq!((compaction_bytes - compaction_bytes_before).read, 0); assert_eq!((other_bytes - other_bytes_before).write, 0); @@ -353,7 +353,7 @@ mod tests { // the thread indexes should be recycled. 
for _ in 1..=MAX_THREAD_IDX * 2 { std::thread::spawn(|| { - set_io_type(IOType::Other); + set_io_type(IoType::Other); }) .join() .unwrap(); @@ -365,7 +365,7 @@ mod tests { for _ in 1..=MAX_THREAD_IDX { let pair1 = pair.clone(); let h = std::thread::spawn(move || { - set_io_type(IOType::Compaction); + set_io_type(IoType::Compaction); let (lock, cvar) = &*pair1; let mut stop = lock.lock().unwrap(); while !*stop { @@ -375,11 +375,11 @@ mod tests { handles.push(h); } - // the reserved index is used, io type should be IOType::Other + // the reserved index is used, io type should be IoType::Other for _ in 1..=MAX_THREAD_IDX { std::thread::spawn(|| { - set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Other); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Other); }) .join() .unwrap(); @@ -399,8 +399,8 @@ mod tests { // the thread indexes should be available again. for _ in 1..=MAX_THREAD_IDX { std::thread::spawn(|| { - set_io_type(IOType::Compaction); - assert_eq!(get_io_type(), IOType::Compaction); + set_io_type(IoType::Compaction); + assert_eq!(get_io_type(), IoType::Compaction); }) .join() .unwrap(); @@ -439,7 +439,7 @@ mod tests { #[ignore] fn bench_flush_io_latency_metrics(b: &mut Bencher) { init().unwrap(); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); let tmp = TempDir::new().unwrap(); let file_path = tmp.path().join("bench_flush_io_latency_metrics"); @@ -476,7 +476,7 @@ mod tests { w.as_bytes_mut()[64] = 42; b.iter(|| { - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); f.write(w.as_bytes()).unwrap(); f.sync_all().unwrap(); }); @@ -509,7 +509,7 @@ mod tests { .unwrap(); let mut r = vec![A512::default(); 2]; b.iter(|| { - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); f.seek(SeekFrom::Start(rng.gen_range(0..100) * 512)) .unwrap(); assert_ne!(f.read(&mut r.as_bytes_mut()).unwrap(), 0); diff --git 
a/components/file_system/src/io_stats/mod.rs b/components/file_system/src/io_stats/mod.rs index d9c7ae9d519..e4c0017451f 100644 --- a/components/file_system/src/io_stats/mod.rs +++ b/components/file_system/src/io_stats/mod.rs @@ -6,27 +6,27 @@ mod stub { use strum::EnumCount; - use crate::{IOBytes, IOType}; + use crate::{IoBytes, IoType}; pub fn init() -> Result<(), String> { Err("No I/O tracing tool available".to_owned()) } thread_local! { - static IO_TYPE: Cell = Cell::new(IOType::Other); + static IO_TYPE: Cell = Cell::new(IoType::Other); } - pub fn set_io_type(new_io_type: IOType) { + pub fn set_io_type(new_io_type: IoType) { IO_TYPE.with(|io_type| { io_type.set(new_io_type); }); } - pub fn get_io_type() -> IOType { + pub fn get_io_type() -> IoType { IO_TYPE.with(|io_type| io_type.get()) } - pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { + pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { Default::default() } } @@ -48,7 +48,7 @@ mod tests { use tikv_util::sys::thread::StdThreadBuildWrapper; use super::*; - use crate::IOType; + use crate::IoType; #[bench] fn bench_fetch_io_bytes(b: &mut test::Bencher) { @@ -57,7 +57,7 @@ mod tests { .map(|_| { let tx_clone = tx.clone(); std::thread::Builder::new().spawn_wrapper(move || { - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); tx_clone.send(()).unwrap(); }) }) @@ -75,14 +75,14 @@ mod tests { .map(|_| { let tx_clone = tx.clone(); std::thread::Builder::new().spawn_wrapper(move || { - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); tx_clone.send(()).unwrap(); }) }) .collect::>(); b.iter(|| match get_io_type() { - IOType::ForegroundWrite => set_io_type(IOType::ForegroundRead), - _ => set_io_type(IOType::ForegroundWrite), + IoType::ForegroundWrite => set_io_type(IoType::ForegroundRead), + _ => set_io_type(IoType::ForegroundWrite), }); for _ in 0..8 { rx.recv().unwrap(); diff --git a/components/file_system/src/io_stats/proc.rs 
b/components/file_system/src/io_stats/proc.rs index 836b5f5fdf0..07856ebe9c0 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -18,18 +18,18 @@ use tikv_util::{ warn, }; -use crate::{IOBytes, IOType}; +use crate::{IoBytes, IoType}; lazy_static! { /// Total I/O bytes read/written by each I/O type. - static ref GLOBAL_IO_STATS: [AtomicIOBytes; IOType::COUNT] = Default::default(); + static ref GLOBAL_IO_STATS: [AtomicIoBytes; IoType::COUNT] = Default::default(); /// Incremental I/O bytes read/written by the thread's own I/O type. - static ref LOCAL_IO_STATS: ThreadLocal>> = ThreadLocal::new(); + static ref LOCAL_IO_STATS: ThreadLocal>> = ThreadLocal::new(); } thread_local! { /// A private copy of I/O type. Optimized for local access. - static IO_TYPE: Cell = Cell::new(IOType::Other); + static IO_TYPE: Cell = Cell::new(IoType::Other); } #[derive(Debug)] @@ -50,7 +50,7 @@ impl ThreadID { } } - fn fetch_io_bytes(&mut self) -> Option { + fn fetch_io_bytes(&mut self) -> Option { if self.proc_reader.is_none() { let path = PathBuf::from("/proc") .join(format!("{}", self.pid)) @@ -73,7 +73,7 @@ impl ThreadID { warn!("failed to seek proc file: {}", e); }) .ok()?; - let mut io_bytes = IOBytes::default(); + let mut io_bytes = IoBytes::default(); for line in reader.lines() { let line = line .map_err(|e| { @@ -101,37 +101,37 @@ impl ThreadID { } } -struct LocalIOStats { +struct LocalIoStats { id: ThreadID, - io_type: IOType, - last_flushed: IOBytes, + io_type: IoType, + last_flushed: IoBytes, } -impl LocalIOStats { +impl LocalIoStats { fn current() -> Self { - LocalIOStats { + LocalIoStats { id: ThreadID::current(), - io_type: IOType::Other, - last_flushed: IOBytes::default(), + io_type: IoType::Other, + last_flushed: IoBytes::default(), } } } #[derive(Default)] -struct AtomicIOBytes { +struct AtomicIoBytes { read: AtomicU64, write: AtomicU64, } -impl AtomicIOBytes { - fn load(&self, order: Ordering) -> IOBytes { - 
IOBytes { +impl AtomicIoBytes { + fn load(&self, order: Ordering) -> IoBytes { + IoBytes { read: self.read.load(order), write: self.write.load(order), } } - fn fetch_add(&self, other: IOBytes, order: Ordering) { + fn fetch_add(&self, other: IoBytes, order: Ordering) { self.read.fetch_add(other.read, order); self.write.fetch_add(other.write, order); } @@ -139,7 +139,7 @@ impl AtomicIOBytes { /// Flushes the local I/O stats to global I/O stats. #[inline] -fn flush_thread_io(sentinel: &mut LocalIOStats) { +fn flush_thread_io(sentinel: &mut LocalIoStats) { if let Some(io_bytes) = sentinel.id.fetch_io_bytes() { GLOBAL_IO_STATS[sentinel.io_type as usize] .fetch_add(io_bytes - sentinel.last_flushed, Ordering::Relaxed); @@ -151,11 +151,11 @@ pub fn init() -> Result<(), String> { Ok(()) } -pub fn set_io_type(new_io_type: IOType) { +pub fn set_io_type(new_io_type: IoType) { IO_TYPE.with(|io_type| { if io_type.get() != new_io_type { let mut sentinel = LOCAL_IO_STATS - .get_or(|| CachePadded::new(Mutex::new(LocalIOStats::current()))) + .get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))) .lock(); flush_thread_io(&mut sentinel); sentinel.io_type = new_io_type; @@ -164,16 +164,16 @@ pub fn set_io_type(new_io_type: IOType) { }); } -pub fn get_io_type() -> IOType { +pub fn get_io_type() -> IoType { IO_TYPE.with(|io_type| io_type.get()) } -pub fn fetch_io_bytes() -> [IOBytes; IOType::COUNT] { - let mut bytes: [IOBytes; IOType::COUNT] = Default::default(); +pub fn fetch_io_bytes() -> [IoBytes; IoType::COUNT] { + let mut bytes: [IoBytes; IoType::COUNT] = Default::default(); LOCAL_IO_STATS.iter().for_each(|sentinel| { flush_thread_io(&mut sentinel.lock()); }); - for i in 0..IOType::COUNT { + for i in 0..IoType::COUNT { bytes[i] = GLOBAL_IO_STATS[i].load(Ordering::Relaxed); } bytes @@ -191,14 +191,14 @@ mod tests { use tempfile::{tempdir, tempdir_in}; use super::*; - use crate::{OpenOptions, WithIOType}; + use crate::{OpenOptions, WithIoType}; #[test] fn test_read_bytes() 
{ let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_read_bytes.txt"); let mut id = ThreadID::current(); - let _type = WithIOType::new(IOType::Compaction); + let _type = WithIoType::new(IoType::Compaction); { let mut f = OpenOptions::new() .write(true) @@ -230,7 +230,7 @@ mod tests { let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_write_bytes.txt"); let mut id = ThreadID::current(); - let _type = WithIOType::new(IOType::Compaction); + let _type = WithIoType::new(IoType::Compaction); let mut f = OpenOptions::new() .write(true) .create(true) diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 104b7371537..0bacbdef428 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -42,21 +42,21 @@ use openssl::{ hash::{self, Hasher, MessageDigest}, }; pub use rate_limiter::{ - get_io_rate_limiter, set_io_rate_limiter, IOBudgetAdjustor, IORateLimitMode, IORateLimiter, - IORateLimiterStatistics, + get_io_rate_limiter, set_io_rate_limiter, IoBudgetAdjustor, IoRateLimitMode, IoRateLimiter, + IoRateLimiterStatistics, }; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use strum::{EnumCount, EnumIter}; #[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum IOOp { +pub enum IoOp { Read, Write, } #[repr(C)] #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, EnumCount, EnumIter)] -pub enum IOType { +pub enum IoType { Other = 0, // Including coprocessor and storage read. 
ForegroundRead = 1, @@ -74,37 +74,37 @@ pub enum IOType { Export = 10, } -impl IOType { +impl IoType { pub fn as_str(&self) -> &str { match *self { - IOType::Other => "other", - IOType::ForegroundRead => "foreground_read", - IOType::ForegroundWrite => "foreground_write", - IOType::Flush => "flush", - IOType::LevelZeroCompaction => "level_zero_compaction", - IOType::Compaction => "compaction", - IOType::Replication => "replication", - IOType::LoadBalance => "load_balance", - IOType::Gc => "gc", - IOType::Import => "import", - IOType::Export => "export", + IoType::Other => "other", + IoType::ForegroundRead => "foreground_read", + IoType::ForegroundWrite => "foreground_write", + IoType::Flush => "flush", + IoType::LevelZeroCompaction => "level_zero_compaction", + IoType::Compaction => "compaction", + IoType::Replication => "replication", + IoType::LoadBalance => "load_balance", + IoType::Gc => "gc", + IoType::Import => "import", + IoType::Export => "export", } } } -pub struct WithIOType { - previous_io_type: IOType, +pub struct WithIoType { + previous_io_type: IoType, } -impl WithIOType { - pub fn new(new_io_type: IOType) -> WithIOType { +impl WithIoType { + pub fn new(new_io_type: IoType) -> WithIoType { let previous_io_type = get_io_type(); set_io_type(new_io_type); - WithIOType { previous_io_type } + WithIoType { previous_io_type } } } -impl Drop for WithIOType { +impl Drop for WithIoType { fn drop(&mut self) { set_io_type(self.previous_io_type); } @@ -112,12 +112,12 @@ impl Drop for WithIOType { #[repr(C)] #[derive(Debug, Copy, Clone, Default)] -pub struct IOBytes { +pub struct IoBytes { read: u64, write: u64, } -impl std::ops::Sub for IOBytes { +impl std::ops::Sub for IoBytes { type Output = Self; fn sub(self, other: Self) -> Self::Output { @@ -130,18 +130,18 @@ impl std::ops::Sub for IOBytes { #[repr(u32)] #[derive(Debug, Clone, PartialEq, Eq, Copy, EnumCount)] -pub enum IOPriority { +pub enum IoPriority { Low = 0, Medium = 1, High = 2, } -impl IOPriority { 
+impl IoPriority { pub fn as_str(&self) -> &str { match *self { - IOPriority::Low => "low", - IOPriority::Medium => "medium", - IOPriority::High => "high", + IoPriority::Low => "low", + IoPriority::Medium => "medium", + IoPriority::High => "high", } } @@ -150,19 +150,19 @@ impl IOPriority { } } -impl std::str::FromStr for IOPriority { +impl std::str::FromStr for IoPriority { type Err = String; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { match s { - "low" => Ok(IOPriority::Low), - "medium" => Ok(IOPriority::Medium), - "high" => Ok(IOPriority::High), + "low" => Ok(IoPriority::Low), + "medium" => Ok(IoPriority::Medium), + "high" => Ok(IoPriority::High), s => Err(format!("expect: low, medium or high, got: {:?}", s)), } } } -impl Serialize for IOPriority { +impl Serialize for IoPriority { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -171,7 +171,7 @@ impl Serialize for IOPriority { } } -impl<'de> Deserialize<'de> for IOPriority { +impl<'de> Deserialize<'de> for IoPriority { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, @@ -179,17 +179,17 @@ impl<'de> Deserialize<'de> for IOPriority { use serde::de::{Error, Unexpected, Visitor}; struct StrVistor; impl<'de> Visitor<'de> for StrVistor { - type Value = IOPriority; + type Value = IoPriority; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "a IO priority") } - fn visit_str(self, value: &str) -> Result + fn visit_str(self, value: &str) -> Result where E: Error, { - let p = match IOPriority::from_str(&*value.trim().to_lowercase()) { + let p = match IoPriority::from_str(&*value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( @@ -206,15 +206,15 @@ impl<'de> Deserialize<'de> for IOPriority { } } -impl From for ConfigValue { - fn from(mode: IOPriority) -> ConfigValue { +impl From for ConfigValue { + fn from(mode: IoPriority) -> ConfigValue { 
ConfigValue::String(mode.as_str().to_owned()) } } -impl TryFrom for IOPriority { +impl TryFrom for IoPriority { type Error = String; - fn try_from(c: ConfigValue) -> Result { + fn try_from(c: ConfigValue) -> Result { if let ConfigValue::String(s) = c { Self::from_str(s.as_str()) } else { diff --git a/components/file_system/src/metrics.rs b/components/file_system/src/metrics.rs index e968eaaece6..8aecc6b21c7 100644 --- a/components/file_system/src/metrics.rs +++ b/components/file_system/src/metrics.rs @@ -6,7 +6,7 @@ use prometheus::{local::*, *}; use prometheus_static_metric::*; make_static_metric! { - pub label_enum IOType { + pub label_enum IoType { other, foreground_read, foreground_write, @@ -20,29 +20,29 @@ make_static_metric! { export, } - pub label_enum IOOp { + pub label_enum IoOp { read, write, } - pub label_enum IOPriority { + pub label_enum IoPriority { low, medium, high, } - pub struct IOLatencyVec : Histogram { - "type" => IOType, - "op" => IOOp, + pub struct IoLatencyVec : Histogram { + "type" => IoType, + "op" => IoOp, } - pub struct IOBytesVec : IntCounter { - "type" => IOType, - "op" => IOOp, + pub struct IoBytesVec : IntCounter { + "type" => IoType, + "op" => IoOp, } - pub struct IOPriorityIntGaugeVec : IntGauge { - "type" => IOPriority, + pub struct IoPriorityIntGaugeVec : IntGauge { + "type" => IoPriority, } } @@ -53,9 +53,9 @@ lazy_static! { &["type", "op"] ).unwrap(); - pub static ref IO_LATENCY_MICROS_VEC: IOLatencyVec = + pub static ref IO_LATENCY_MICROS_VEC: IoLatencyVec = register_static_histogram_vec!( - IOLatencyVec, + IoLatencyVec, "tikv_io_latency_micros", "Duration of disk tikv io.", &["type", "op"], @@ -70,8 +70,8 @@ lazy_static! 
{ ) .unwrap(); - pub static ref RATE_LIMITER_MAX_BYTES_PER_SEC: IOPriorityIntGaugeVec = register_static_int_gauge_vec!( - IOPriorityIntGaugeVec, + pub static ref RATE_LIMITER_MAX_BYTES_PER_SEC: IoPriorityIntGaugeVec = register_static_int_gauge_vec!( + IoPriorityIntGaugeVec, "tikv_rate_limiter_max_bytes_per_sec", "Maximum IO bytes per second", &["type"] diff --git a/components/file_system/src/metrics_manager.rs b/components/file_system/src/metrics_manager.rs index 8ff4bddde47..89e822b24e7 100644 --- a/components/file_system/src/metrics_manager.rs +++ b/components/file_system/src/metrics_manager.rs @@ -8,36 +8,36 @@ use tikv_util::time::Instant; use crate::{ io_stats::fetch_io_bytes, metrics::{tls_flush, IO_BYTES_VEC}, - IOBytes, IOOp, IORateLimiterStatistics, IOType, + IoBytes, IoOp, IoRateLimiterStatistics, IoType, }; pub enum BytesFetcher { /// Fetch IO statistics from IO rate limiter, which records passed-through /// IOs in atomic counters. - FromRateLimiter(Arc), + FromRateLimiter(Arc), /// Fetch IO statistics from OS I/O stats collector. 
- FromIOStatsCollector(), + FromIoStatsCollector(), } impl BytesFetcher { - fn fetch(&self) -> [IOBytes; IOType::COUNT] { + fn fetch(&self) -> [IoBytes; IoType::COUNT] { match *self { BytesFetcher::FromRateLimiter(ref stats) => { - let mut bytes: [IOBytes; IOType::COUNT] = Default::default(); - for t in IOType::iter() { - bytes[t as usize].read = stats.fetch(t, IOOp::Read) as u64; - bytes[t as usize].write = stats.fetch(t, IOOp::Write) as u64; + let mut bytes: [IoBytes; IoType::COUNT] = Default::default(); + for t in IoType::iter() { + bytes[t as usize].read = stats.fetch(t, IoOp::Read) as u64; + bytes[t as usize].write = stats.fetch(t, IoOp::Write) as u64; } bytes } - BytesFetcher::FromIOStatsCollector() => fetch_io_bytes(), + BytesFetcher::FromIoStatsCollector() => fetch_io_bytes(), } } } pub struct MetricsManager { fetcher: BytesFetcher, - last_fetch: [IOBytes; IOType::COUNT], + last_fetch: [IoBytes; IoType::COUNT], } impl MetricsManager { @@ -51,7 +51,7 @@ impl MetricsManager { pub fn flush(&mut self, _now: Instant) { tls_flush(); let latest = self.fetcher.fetch(); - for t in IOType::iter() { + for t in IoType::iter() { let delta_bytes = latest[t as usize] - self.last_fetch[t as usize]; IO_BYTES_VEC .with_label_values(&[t.as_str(), "read"]) diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index 51fe8228aef..da7fe5fe75c 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -17,7 +17,7 @@ use tikv_util::time::Instant; use super::{ metrics::{tls_collect_rate_limiter_request_wait, RATE_LIMITER_MAX_BYTES_PER_SEC}, - IOOp, IOPriority, IOType, + IoOp, IoPriority, IoType, }; const DEFAULT_REFILL_PERIOD: Duration = Duration::from_millis(50); @@ -25,38 +25,38 @@ const DEFAULT_REFILLS_PER_SEC: usize = (1.0 / DEFAULT_REFILL_PERIOD.as_secs_f32( const MAX_WAIT_DURATION_PER_REQUEST: Duration = Duration::from_millis(500); #[derive(Debug, Clone, PartialEq, Eq, Copy)] -pub 
enum IORateLimitMode { +pub enum IoRateLimitMode { WriteOnly, ReadOnly, AllIo, } -impl IORateLimitMode { +impl IoRateLimitMode { pub fn as_str(&self) -> &str { match *self { - IORateLimitMode::WriteOnly => "write-only", - IORateLimitMode::ReadOnly => "read-only", - IORateLimitMode::AllIo => "all-io", + IoRateLimitMode::WriteOnly => "write-only", + IoRateLimitMode::ReadOnly => "read-only", + IoRateLimitMode::AllIo => "all-io", } } #[inline] - pub fn contains(&self, op: IOOp) -> bool { + pub fn contains(&self, op: IoOp) -> bool { match *self { - IORateLimitMode::WriteOnly => op == IOOp::Write, - IORateLimitMode::ReadOnly => op == IOOp::Read, + IoRateLimitMode::WriteOnly => op == IoOp::Write, + IoRateLimitMode::ReadOnly => op == IoOp::Read, _ => true, } } } -impl FromStr for IORateLimitMode { +impl FromStr for IoRateLimitMode { type Err = String; - fn from_str(s: &str) -> Result { + fn from_str(s: &str) -> Result { match s { - "write-only" => Ok(IORateLimitMode::WriteOnly), - "read-only" => Ok(IORateLimitMode::ReadOnly), - "all-io" => Ok(IORateLimitMode::AllIo), + "write-only" => Ok(IoRateLimitMode::WriteOnly), + "read-only" => Ok(IoRateLimitMode::ReadOnly), + "all-io" => Ok(IoRateLimitMode::AllIo), s => Err(format!( "expect: write-only, read-only or all-io, got: {:?}", s @@ -65,7 +65,7 @@ impl FromStr for IORateLimitMode { } } -impl Serialize for IORateLimitMode { +impl Serialize for IoRateLimitMode { fn serialize(&self, serializer: S) -> Result where S: Serializer, @@ -74,7 +74,7 @@ impl Serialize for IORateLimitMode { } } -impl<'de> Deserialize<'de> for IORateLimitMode { +impl<'de> Deserialize<'de> for IoRateLimitMode { fn deserialize(deserializer: D) -> Result where D: Deserializer<'de>, @@ -82,17 +82,17 @@ impl<'de> Deserialize<'de> for IORateLimitMode { use serde::de::{Error, Unexpected, Visitor}; struct StrVistor; impl<'de> Visitor<'de> for StrVistor { - type Value = IORateLimitMode; + type Value = IoRateLimitMode; fn expecting(&self, formatter: &mut 
std::fmt::Formatter<'_>) -> std::fmt::Result { write!(formatter, "a IO rate limit mode") } - fn visit_str(self, value: &str) -> Result + fn visit_str(self, value: &str) -> Result where E: Error, { - let p = match IORateLimitMode::from_str(&*value.trim().to_lowercase()) { + let p = match IoRateLimitMode::from_str(&*value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( @@ -112,48 +112,48 @@ impl<'de> Deserialize<'de> for IORateLimitMode { /// Record accumulated bytes through of different types. /// Used for testing and metrics. #[derive(Debug)] -pub struct IORateLimiterStatistics { - read_bytes: [CachePadded; IOType::COUNT], - write_bytes: [CachePadded; IOType::COUNT], +pub struct IoRateLimiterStatistics { + read_bytes: [CachePadded; IoType::COUNT], + write_bytes: [CachePadded; IoType::COUNT], } -impl IORateLimiterStatistics { +impl IoRateLimiterStatistics { pub fn new() -> Self { - IORateLimiterStatistics { + IoRateLimiterStatistics { read_bytes: Default::default(), write_bytes: Default::default(), } } - pub fn fetch(&self, io_type: IOType, io_op: IOOp) -> usize { + pub fn fetch(&self, io_type: IoType, io_op: IoOp) -> usize { let io_type_idx = io_type as usize; match io_op { - IOOp::Read => self.read_bytes[io_type_idx].load(Ordering::Relaxed), - IOOp::Write => self.write_bytes[io_type_idx].load(Ordering::Relaxed), + IoOp::Read => self.read_bytes[io_type_idx].load(Ordering::Relaxed), + IoOp::Write => self.write_bytes[io_type_idx].load(Ordering::Relaxed), } } - pub fn record(&self, io_type: IOType, io_op: IOOp, bytes: usize) { + pub fn record(&self, io_type: IoType, io_op: IoOp, bytes: usize) { let io_type_idx = io_type as usize; match io_op { - IOOp::Read => { + IoOp::Read => { self.read_bytes[io_type_idx].fetch_add(bytes, Ordering::Relaxed); } - IOOp::Write => { + IoOp::Write => { self.write_bytes[io_type_idx].fetch_add(bytes, Ordering::Relaxed); } } } pub fn reset(&self) { - for i in 0..IOType::COUNT { + for i in 0..IoType::COUNT { 
self.read_bytes[i].store(0, Ordering::Relaxed); self.write_bytes[i].store(0, Ordering::Relaxed); } } } -impl Default for IORateLimiterStatistics { +impl Default for IoRateLimiterStatistics { fn default() -> Self { Self::new() } @@ -161,36 +161,36 @@ impl Default for IORateLimiterStatistics { /// Used to dynamically adjust the proportion of total budgets allocated for /// rate limited IO. This is needed when global IOs are only partially rate -/// limited, e.g. when mode is IORateLimitMode::WriteOnly. -pub trait IOBudgetAdjustor: Send + Sync { +/// limited, e.g. when mode is IoRateLimitMode::WriteOnly. +pub trait IoBudgetAdjustor: Send + Sync { fn adjust(&self, threshold: usize) -> usize; } /// Limit total IO flow below provided threshold by throttling lower-priority /// IOs. Rate limit is disabled when total IO threshold is set to zero. -struct PriorityBasedIORateLimiter { +struct PriorityBasedIoRateLimiter { // High-priority IOs are only limited when strict is true strict: bool, // Total bytes passed through during current epoch - bytes_through: [CachePadded; IOPriority::COUNT], + bytes_through: [CachePadded; IoPriority::COUNT], // Maximum bytes permitted during current epoch - bytes_per_epoch: [CachePadded; IOPriority::COUNT], - protected: Mutex, + bytes_per_epoch: [CachePadded; IoPriority::COUNT], + protected: Mutex, } -struct PriorityBasedIORateLimiterProtected { +struct PriorityBasedIoRateLimiterProtected { next_refill_time: Instant, // Bytes that can't be fulfilled in current epoch - pending_bytes: [usize; IOPriority::COUNT], + pending_bytes: [usize; IoPriority::COUNT], // Adjust low priority IO flow based on system backlog - adjustor: Option>, + adjustor: Option>, } -impl PriorityBasedIORateLimiterProtected { +impl PriorityBasedIoRateLimiterProtected { fn new() -> Self { - PriorityBasedIORateLimiterProtected { + PriorityBasedIoRateLimiterProtected { next_refill_time: Instant::now_coarse() + DEFAULT_REFILL_PERIOD, - pending_bytes: [0; IOPriority::COUNT], + 
pending_bytes: [0; IoPriority::COUNT], adjustor: None, } } @@ -216,7 +216,7 @@ macro_rules! do_sleep { }; } -/// Actual implementation for requesting IOs from PriorityBasedIORateLimiter. +/// Actual implementation for requesting IOs from PriorityBasedIoRateLimiter. /// An attempt will first be recorded. If the attempted amount exceeds the /// available quotas of current epoch, the requester will be queued (logically) /// and sleep until served. Macro is necessary to de-dup codes used both in @@ -235,7 +235,7 @@ macro_rules! request_imp { $limiter.bytes_through[priority_idx].fetch_add(amount, Ordering::Relaxed) + amount; // We prefer not to partially return only a portion of requested bytes. if bytes_through <= cached_bytes_per_epoch - || !$limiter.strict && $priority == IOPriority::High + || !$limiter.strict && $priority == IoPriority::High { return amount; } @@ -296,50 +296,50 @@ macro_rules! request_imp { }}; } -impl PriorityBasedIORateLimiter { +impl PriorityBasedIoRateLimiter { fn new(strict: bool) -> Self { - PriorityBasedIORateLimiter { + PriorityBasedIoRateLimiter { strict, bytes_through: Default::default(), bytes_per_epoch: Default::default(), - protected: Mutex::new(PriorityBasedIORateLimiterProtected::new()), + protected: Mutex::new(PriorityBasedIoRateLimiterProtected::new()), } } /// Dynamically changes the total IO flow threshold. fn set_bytes_per_sec(&self, bytes_per_sec: usize) { let now = (bytes_per_sec as f64 * DEFAULT_REFILL_PERIOD.as_secs_f64()) as usize; - let before = self.bytes_per_epoch[IOPriority::High as usize].swap(now, Ordering::Relaxed); + let before = self.bytes_per_epoch[IoPriority::High as usize].swap(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC .high .set(bytes_per_sec as i64); if now == 0 || before == 0 { // Toggle on or off rate limit. 
let _locked = self.protected.lock(); - self.bytes_per_epoch[IOPriority::Medium as usize].store(now, Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::Medium as usize].store(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC .medium .set(bytes_per_sec as i64); - self.bytes_per_epoch[IOPriority::Low as usize].store(now, Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::Low as usize].store(now, Ordering::Relaxed); RATE_LIMITER_MAX_BYTES_PER_SEC.low.set(bytes_per_sec as i64); } } - fn set_low_priority_io_adjustor(&self, adjustor: Option>) { + fn set_low_priority_io_adjustor(&self, adjustor: Option>) { let mut locked = self.protected.lock(); locked.adjustor = adjustor; } - fn request(&self, priority: IOPriority, amount: usize) -> usize { + fn request(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, sync) } - async fn async_request(&self, priority: IOPriority, amount: usize) -> usize { + async fn async_request(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, async) } #[cfg(test)] - fn request_with_skewed_clock(&self, priority: IOPriority, amount: usize) -> usize { + fn request_with_skewed_clock(&self, priority: IoPriority, amount: usize) -> usize { request_imp!(self, priority, amount, skewed_sync) } @@ -352,9 +352,9 @@ impl PriorityBasedIORateLimiter { /// this happens, total IO flow could exceed global threshold. /// - Highest priority IO alone must not exceed global threshold (in strict /// mode). - fn refill(&self, locked: &mut PriorityBasedIORateLimiterProtected, now: Instant) { + fn refill(&self, locked: &mut PriorityBasedIoRateLimiterProtected, now: Instant) { let mut total_budgets = - self.bytes_per_epoch[IOPriority::High as usize].load(Ordering::Relaxed); + self.bytes_per_epoch[IoPriority::High as usize].load(Ordering::Relaxed); if total_budgets == 0 { // It's possible that rate limit is toggled off in the meantime. 
return; @@ -365,12 +365,12 @@ impl PriorityBasedIORateLimiter { locked.next_refill_time = now + DEFAULT_REFILL_PERIOD; debug_assert!( - IOPriority::High as usize == IOPriority::Medium as usize + 1 - && IOPriority::Medium as usize == IOPriority::Low as usize + 1 + IoPriority::High as usize == IoPriority::Medium as usize + 1 + && IoPriority::Medium as usize == IoPriority::Low as usize + 1 ); let mut remaining_budgets = total_budgets; let mut used_budgets = 0; - for pri in &[IOPriority::High, IOPriority::Medium] { + for pri in &[IoPriority::High, IoPriority::Medium] { let p = *pri as usize; // Skipped epochs can only serve pending requests rather that in-coming ones, // catch up by subtracting them from pending_bytes. @@ -390,7 +390,7 @@ impl PriorityBasedIORateLimiter { used_budgets += ((served_by_first_epoch + served_by_skipped_epochs) as f32 / (skipped_epochs + 1.0)) as usize; // Only apply rate limit adjustments on low-priority IOs. - if *pri == IOPriority::Medium { + if *pri == IoPriority::Medium { if let Some(adjustor) = &locked.adjustor { total_budgets = adjustor.adjust(total_budgets); } @@ -400,7 +400,7 @@ impl PriorityBasedIORateLimiter { } else { 1 // A small positive value so not to disable flow control. 
}; - if *pri == IOPriority::High { + if *pri == IoPriority::High { RATE_LIMITER_MAX_BYTES_PER_SEC .medium .set((remaining_budgets * DEFAULT_REFILLS_PER_SEC) as i64); @@ -411,7 +411,7 @@ impl PriorityBasedIORateLimiter { } self.bytes_per_epoch[p - 1].store(remaining_budgets, Ordering::Relaxed); } - let p = IOPriority::Low as usize; + let p = IoPriority::Low as usize; let to_serve_pending_bytes = std::cmp::min(locked.pending_bytes[p], remaining_budgets); locked.pending_bytes[p] -= to_serve_pending_bytes; self.bytes_through[p].store(to_serve_pending_bytes, Ordering::Relaxed); @@ -427,7 +427,7 @@ impl PriorityBasedIORateLimiter { #[cfg(test)] fn reset(&self) { let mut locked = self.protected.lock(); - for p in &[IOPriority::High, IOPriority::Medium] { + for p in &[IoPriority::High, IoPriority::Medium] { let p = *p as usize; locked.pending_bytes[p] = 0; } @@ -435,26 +435,26 @@ impl PriorityBasedIORateLimiter { } /// A high-performance IO rate limiter used for prioritized flow control. -/// An instance of `IORateLimiter` can be safely shared between threads. -pub struct IORateLimiter { - mode: IORateLimitMode, - priority_map: [CachePadded; IOType::COUNT], - throughput_limiter: Arc, - stats: Option>, +/// An instance of `IoRateLimiter` can be safely shared between threads. 
+pub struct IoRateLimiter { + mode: IoRateLimitMode, + priority_map: [CachePadded; IoType::COUNT], + throughput_limiter: Arc, + stats: Option>, } -impl IORateLimiter { - pub fn new(mode: IORateLimitMode, strict: bool, enable_statistics: bool) -> Self { - let priority_map: [CachePadded; IOType::COUNT] = Default::default(); +impl IoRateLimiter { + pub fn new(mode: IoRateLimitMode, strict: bool, enable_statistics: bool) -> Self { + let priority_map: [CachePadded; IoType::COUNT] = Default::default(); for p in priority_map.iter() { - p.store(IOPriority::High as u32, Ordering::Relaxed); + p.store(IoPriority::High as u32, Ordering::Relaxed); } - IORateLimiter { + IoRateLimiter { mode, priority_map, - throughput_limiter: Arc::new(PriorityBasedIORateLimiter::new(strict)), + throughput_limiter: Arc::new(PriorityBasedIoRateLimiter::new(strict)), stats: if enable_statistics { - Some(Arc::new(IORateLimiterStatistics::new())) + Some(Arc::new(IoRateLimiterStatistics::new())) } else { None }, @@ -462,14 +462,14 @@ impl IORateLimiter { } pub fn new_for_test() -> Self { - IORateLimiter::new( - IORateLimitMode::AllIo, + IoRateLimiter::new( + IoRateLimitMode::AllIo, true, // strict true, // enable_statistics ) } - pub fn statistics(&self) -> Option> { + pub fn statistics(&self) -> Option> { self.stats.clone() } @@ -477,15 +477,15 @@ impl IORateLimiter { self.throughput_limiter.set_bytes_per_sec(rate); } - pub fn set_io_priority(&self, io_type: IOType, io_priority: IOPriority) { + pub fn set_io_priority(&self, io_type: IoType, io_priority: IoPriority) { self.priority_map[io_type as usize].store(io_priority as u32, Ordering::Relaxed); } pub fn set_low_priority_io_adjustor_if_needed( &self, - adjustor: Option>, + adjustor: Option>, ) { - if self.mode != IORateLimitMode::AllIo { + if self.mode != IoRateLimitMode::AllIo { self.throughput_limiter .set_low_priority_io_adjustor(adjustor); } @@ -494,10 +494,10 @@ impl IORateLimiter { /// Requests for token for bytes and potentially update 
statistics. If this /// request can not be satisfied, the call is blocked. Granted token can be /// less than the requested bytes, but must be greater than zero. - pub fn request(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + pub fn request(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request( - IOPriority::unsafe_from_u32( + IoPriority::unsafe_from_u32( self.priority_map[io_type as usize].load(Ordering::Relaxed), ), bytes, @@ -513,12 +513,12 @@ impl IORateLimiter { /// statistics. If this request can not be satisfied, the call is blocked. /// Granted token can be less than the requested bytes, but must be greater /// than zero. - pub async fn async_request(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + pub async fn async_request(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self .throughput_limiter .async_request( - IOPriority::unsafe_from_u32( + IoPriority::unsafe_from_u32( self.priority_map[io_type as usize].load(Ordering::Relaxed), ), bytes, @@ -532,10 +532,10 @@ impl IORateLimiter { } #[cfg(test)] - fn request_with_skewed_clock(&self, io_type: IOType, io_op: IOOp, mut bytes: usize) -> usize { + fn request_with_skewed_clock(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request_with_skewed_clock( - IOPriority::unsafe_from_u32( + IoPriority::unsafe_from_u32( self.priority_map[io_type as usize].load(Ordering::Relaxed), ), bytes, @@ -549,15 +549,15 @@ impl IORateLimiter { } lazy_static! { - static ref IO_RATE_LIMITER: Mutex>> = Mutex::new(None); + static ref IO_RATE_LIMITER: Mutex>> = Mutex::new(None); } // Do NOT use this method in test environment. 
-pub fn set_io_rate_limiter(limiter: Option>) { +pub fn set_io_rate_limiter(limiter: Option>) { *IO_RATE_LIMITER.lock() = limiter; } -pub fn get_io_rate_limiter() -> Option> { +pub fn get_io_rate_limiter() -> Option> { (*IO_RATE_LIMITER.lock()).clone() } @@ -591,10 +591,10 @@ mod tests { } #[derive(Debug, Clone, Copy)] - struct Request(IOType, IOOp, usize); + struct Request(IoType, IoOp, usize); fn start_background_jobs( - limiter: &Arc, + limiter: &Arc, job_count: usize, request: Request, interval: Option, @@ -624,8 +624,8 @@ mod tests { #[test] fn test_rate_limit_toggle() { let bytes_per_sec = 2000; - let limiter = IORateLimiter::new_for_test(); - limiter.set_io_priority(IOType::Compaction, IOPriority::Low); + let limiter = IoRateLimiter::new_for_test(); + limiter.set_io_priority(IoType::Compaction, IoPriority::Low); let limiter = Arc::new(limiter); let stats = limiter.statistics().unwrap(); // enable rate limit @@ -634,19 +634,19 @@ mod tests { let _write_context = start_background_jobs( &limiter, 1, // job_count - Request(IOType::ForegroundWrite, IOOp::Write, 10), + Request(IoType::ForegroundWrite, IoOp::Write, 10), None, // interval ); let _compaction_context = start_background_jobs( &limiter, 1, // job_count - Request(IOType::Compaction, IOOp::Write, 10), + Request(IoType::Compaction, IoOp::Write, 10), None, // interval ); std::thread::sleep(Duration::from_secs(1)); let t1 = Instant::now(); approximate_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * (t1 - t0).as_secs_f64() ); // disable rate limit @@ -655,11 +655,11 @@ mod tests { std::thread::sleep(Duration::from_secs(1)); let t2 = Instant::now(); assert!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64 + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64 > bytes_per_sec as f64 * (t2 - t1).as_secs_f64() * 4.0 ); assert!( - stats.fetch(IOType::Compaction, IOOp::Write) as f64 + 
stats.fetch(IoType::Compaction, IoOp::Write) as f64 > bytes_per_sec as f64 * (t2 - t1).as_secs_f64() * 4.0 ); // enable rate limit @@ -668,12 +668,12 @@ mod tests { std::thread::sleep(Duration::from_secs(1)); let t3 = Instant::now(); approximate_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * (t3 - t2).as_secs_f64() ); } - fn verify_rate_limit(limiter: &Arc, bytes_per_sec: usize, duration: Duration) { + fn verify_rate_limit(limiter: &Arc, bytes_per_sec: usize, duration: Duration) { let stats = limiter.statistics().unwrap(); limiter.set_io_rate_limit(bytes_per_sec); stats.reset(); @@ -684,7 +684,7 @@ mod tests { let _context = start_background_jobs( limiter, 2, // job_count - Request(IOType::ForegroundWrite, IOOp::Write, 10), + Request(IoType::ForegroundWrite, IoOp::Write, 10), None, // interval ); std::thread::sleep(duration); @@ -693,7 +693,7 @@ mod tests { end.duration_since(begin) }; approximate_eq!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64, + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64, bytes_per_sec as f64 * actual_duration.as_secs_f64() ); } @@ -701,14 +701,14 @@ mod tests { #[test] fn test_rate_limit_dynamic_priority() { let bytes_per_sec = 2000; - let limiter = Arc::new(IORateLimiter::new( - IORateLimitMode::AllIo, + let limiter = Arc::new(IoRateLimiter::new( + IoRateLimitMode::AllIo, false, // strict true, // enable_statistics )); - limiter.set_io_priority(IOType::ForegroundWrite, IOPriority::Medium); + limiter.set_io_priority(IoType::ForegroundWrite, IoPriority::Medium); verify_rate_limit(&limiter, bytes_per_sec, Duration::from_secs(2)); - limiter.set_io_priority(IOType::ForegroundWrite, IOPriority::High); + limiter.set_io_priority(IoType::ForegroundWrite, IoPriority::High); let stats = limiter.statistics().unwrap(); stats.reset(); let duration = { @@ -717,7 +717,7 @@ mod tests { let _context = start_background_jobs( &limiter, 
2, // job_count - Request(IOType::ForegroundWrite, IOOp::Write, 10), + Request(IoType::ForegroundWrite, IoOp::Write, 10), None, // interval ); std::thread::sleep(Duration::from_secs(2)); @@ -726,7 +726,7 @@ mod tests { end.duration_since(begin) }; assert!( - stats.fetch(IOType::ForegroundWrite, IOOp::Write) as f64 + stats.fetch(IoType::ForegroundWrite, IoOp::Write) as f64 > bytes_per_sec as f64 * duration.as_secs_f64() * 1.5 ); } @@ -735,7 +735,7 @@ mod tests { fn test_rate_limited_heavy_flow() { let low_bytes_per_sec = 2000; let high_bytes_per_sec = 10000; - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); verify_rate_limit(&limiter, low_bytes_per_sec, Duration::from_secs(2)); verify_rate_limit(&limiter, high_bytes_per_sec, Duration::from_secs(2)); verify_rate_limit(&limiter, low_bytes_per_sec, Duration::from_secs(2)); @@ -745,7 +745,7 @@ mod tests { fn test_rate_limited_light_flow() { let kbytes_per_sec = 3; let actual_kbytes_per_sec = 2; - let limiter = Arc::new(IORateLimiter::new_for_test()); + let limiter = Arc::new(IoRateLimiter::new_for_test()); limiter.set_io_rate_limit(kbytes_per_sec * 1000); let stats = limiter.statistics().unwrap(); let duration = { @@ -755,7 +755,7 @@ mod tests { let _context = start_background_jobs( &limiter, actual_kbytes_per_sec, // job_count - Request(IOType::Compaction, IOOp::Write, 1), + Request(IoType::Compaction, IoOp::Write, 1), Some(Duration::from_millis(1)), ); std::thread::sleep(Duration::from_secs(2)); @@ -764,7 +764,7 @@ mod tests { end.duration_since(begin) }; approximate_eq!( - stats.fetch(IOType::Compaction, IOOp::Write) as f64, + stats.fetch(IoType::Compaction, IoOp::Write) as f64, actual_kbytes_per_sec as f64 * duration.as_secs_f64() * 1000.0 ); } @@ -775,10 +775,10 @@ mod tests { let write_work = 50; let compaction_work = 80; let import_work = 50; - let limiter = IORateLimiter::new_for_test(); + let limiter = IoRateLimiter::new_for_test(); 
limiter.set_io_rate_limit(bytes_per_sec); - limiter.set_io_priority(IOType::Compaction, IOPriority::Medium); - limiter.set_io_priority(IOType::Import, IOPriority::Low); + limiter.set_io_priority(IoType::Compaction, IoPriority::Medium); + limiter.set_io_priority(IoType::Import, IoPriority::Low); let stats = limiter.statistics().unwrap(); let limiter = Arc::new(limiter); let begin = Instant::now(); @@ -787,8 +787,8 @@ mod tests { &limiter, 1, // job_count Request( - IOType::ForegroundWrite, - IOOp::Write, + IoType::ForegroundWrite, + IoOp::Write, write_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), @@ -797,8 +797,8 @@ mod tests { &limiter, 1, // job_count Request( - IOType::Compaction, - IOOp::Write, + IoType::Compaction, + IoOp::Write, compaction_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), @@ -807,8 +807,8 @@ mod tests { &limiter, 1, // job_count Request( - IOType::Import, - IOOp::Write, + IoType::Import, + IoOp::Write, import_work * bytes_per_sec / 100 / 1000, ), Some(Duration::from_millis(1)), @@ -817,20 +817,20 @@ mod tests { } let end = Instant::now(); let duration = end.duration_since(begin); - let write_bytes = stats.fetch(IOType::ForegroundWrite, IOOp::Write); + let write_bytes = stats.fetch(IoType::ForegroundWrite, IoOp::Write); approximate_eq!( write_bytes as f64, (write_work * bytes_per_sec / 100) as f64 * duration.as_secs_f64() ); - let compaction_bytes = stats.fetch(IOType::Compaction, IOOp::Write); - let import_bytes = stats.fetch(IOType::Import, IOOp::Write); + let compaction_bytes = stats.fetch(IoType::Compaction, IoOp::Write); + let import_bytes = stats.fetch(IoType::Import, IoOp::Write); let total_bytes = write_bytes + import_bytes + compaction_bytes; approximate_eq!((compaction_bytes + write_bytes) as f64, total_bytes as f64); } #[bench] fn bench_critical_section(b: &mut test::Bencher) { - let inner_limiter = PriorityBasedIORateLimiter::new(true /* strict */); + let inner_limiter = 
PriorityBasedIoRateLimiter::new(true /* strict */); inner_limiter.set_bytes_per_sec(1024); let now = Instant::now_coarse(); b.iter(|| { diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 628b066029d..49183245785 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -11,9 +11,9 @@ use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, - RaftLogGCTask, Result, + RaftLogGcTask, Result, }; -use file_system::{IOOp, IORateLimiter, IOType}; +use file_system::{IoOp, IoRateLimiter, IoType}; use kvproto::{ metapb::Region, raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, @@ -47,7 +47,7 @@ pub struct ManagedReader { ::Reader, DecrypterReader<::Reader>, >, - rate_limiter: Option>, + rate_limiter: Option>, } impl Seek for ManagedReader { @@ -63,7 +63,7 @@ impl Read for ManagedReader { fn read(&mut self, buf: &mut [u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IOType::ForegroundRead, IOOp::Read, size); + size = limiter.request(IoType::ForegroundRead, IoOp::Read, size); } match self.inner.as_mut() { Either::Left(reader) => reader.read(&mut buf[..size]), @@ -77,7 +77,7 @@ pub struct ManagedWriter { ::Writer, EncrypterWriter<::Writer>, >, - rate_limiter: Option>, + rate_limiter: Option>, } impl Seek for ManagedWriter { @@ -93,7 +93,7 @@ impl Write for ManagedWriter { fn write(&mut self, buf: &[u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IOType::ForegroundWrite, IOOp::Write, size); + size = limiter.request(IoType::ForegroundWrite, IoOp::Write, size); } match self.inner.as_mut() { 
Either::Left(writer) => writer.write(&buf[..size]), @@ -133,13 +133,13 @@ impl WriteExt for ManagedWriter { pub struct ManagedFileSystem { base_file_system: DefaultFileSystem, key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, } impl ManagedFileSystem { pub fn new( key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, ) -> Self { Self { base_file_system: DefaultFileSystem, @@ -256,7 +256,7 @@ impl RaftLogEngine { pub fn new( config: RaftEngineConfig, key_manager: Option>, - rate_limiter: Option>, + rate_limiter: Option>, ) -> Result { let file_system = Arc::new(ManagedFileSystem::new(key_manager, rate_limiter)); Ok(RaftLogEngine(Arc::new( @@ -516,14 +516,14 @@ impl RaftEngine for RaftLogEngine { } fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - self.batch_gc(vec![RaftLogGCTask { + self.batch_gc(vec![RaftLogGcTask { raft_group_id, from, to, }]) } - fn batch_gc(&self, tasks: Vec) -> Result { + fn batch_gc(&self, tasks: Vec) -> Result { let mut batch = self.log_batch(tasks.len()); let mut old_first_index = Vec::with_capacity(tasks.len()); for task in &tasks { diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 57472b5cecf..8f572eb1f9f 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -125,7 +125,7 @@ pub fn get_region_approximate_middle( mod tests { use std::{iter, sync::mpsc}; - use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{CfOptions, DbOptions}; use engine_traits::{MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -485,8 +485,8 @@ mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); 
cf_opts.set_level_zero_file_num_compaction_trigger(10); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 892a38a7f48..e2e58933e57 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -232,7 +232,7 @@ pub fn get_region_approximate_keys( mod tests { use std::{cmp, sync::mpsc, u64}; - use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{CfOptions, DbOptions}; use engine_traits::{KvEngine, MiscExt, SyncMutable, ALL_CFS, CF_DEFAULT, CF_WRITE, LARGE_CFS}; use kvproto::{ metapb::{Peer, Region}, @@ -453,8 +453,8 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); @@ -633,8 +633,8 @@ mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index faff7b77c0a..bc9fd855038 100644 --- 
a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -260,7 +260,7 @@ pub mod tests { use collections::HashSet; use engine_test::{ - ctor::{ColumnFamilyOptions, DBOptions}, + ctor::{CfOptions, DbOptions}, kv::KvTestEngine, }; use engine_traits::{ @@ -438,16 +438,16 @@ pub mod tests { fn test_split_check_impl(cfs_with_range_prop: &[CfName], data_cf: CfName) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); + let db_opts = DbOptions::default(); let cfs_with_range_prop: HashSet<_> = cfs_with_range_prop.iter().cloned().collect(); - let mut cf_opt = ColumnFamilyOptions::new(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); let cfs_opts = ALL_CFS .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - (*cf, ColumnFamilyOptions::new()) + (*cf, CfOptions::new()) } else { (*cf, cf_opt.clone()) } @@ -565,9 +565,9 @@ pub mod tests { fn test_generate_bucket_impl(cfs_with_range_prop: &[CfName], data_cf: CfName, mvcc: bool) { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); + let db_opts = DbOptions::default(); let cfs_with_range_prop: HashSet<_> = cfs_with_range_prop.iter().cloned().collect(); - let mut cf_opt = ColumnFamilyOptions::new(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); cf_opt.set_disable_auto_compactions(true); @@ -575,7 +575,7 @@ pub mod tests { .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = CfOptions::new(); opt.set_disable_auto_compactions(true); (*cf, opt) } else { @@ -704,9 +704,9 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); + let db_opts = DbOptions::default(); let 
cfs_with_range_prop: HashSet<_> = LARGE_CFS.iter().cloned().collect(); - let mut cf_opt = ColumnFamilyOptions::new(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); cf_opt.set_disable_auto_compactions(true); @@ -714,7 +714,7 @@ pub mod tests { .iter() .map(|cf| { if cfs_with_range_prop.contains(cf) { - let mut opt = ColumnFamilyOptions::new(); + let mut opt = CfOptions::new(); opt.set_disable_auto_compactions(true); (*cf, opt) } else { @@ -763,15 +763,15 @@ pub mod tests { fn test_cf_lock_without_range_prop() { let path = Builder::new().prefix("test-raftstore").tempdir().unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opt = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opt = CfOptions::new(); cf_opt.set_no_range_properties(true); let cfs_opts = ALL_CFS .iter() .map(|cf| { if cf != &CF_LOCK { - (*cf, ColumnFamilyOptions::new()) + (*cf, CfOptions::new()) } else { (*cf, cf_opt.clone()) } @@ -830,13 +830,13 @@ pub mod tests { let cfs_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_no_range_properties(true); (*cf, cf_opts) }) .collect(); let engine = - engine_test::kv::new_engine_opt(path_str, DBOptions::default(), cfs_opts).unwrap(); + engine_test::kv::new_engine_opt(path_str, DbOptions::default(), cfs_opts).unwrap(); let mut runnable = SplitCheckRunner::new(engine.clone(), tx.clone(), CoprocessorHost::new(tx, cfg)); @@ -909,8 +909,8 @@ pub mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); cf_opts.set_no_range_properties(true); @@ -944,8 +944,8 @@ pub mod tests { .unwrap(); let path = tmp.path().to_str().unwrap(); - let db_opts = 
DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let engine = engine_test::kv::new_engine_opt(path, db_opts, cfs_opts).unwrap(); @@ -1056,8 +1056,8 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(10); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); @@ -1086,8 +1086,8 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_disable_auto_compactions(true); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); @@ -1121,8 +1121,8 @@ pub mod tests { .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); - let db_opts = DBOptions::default(); - let mut cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_disable_auto_compactions(true); let cfs_opts = LARGE_CFS.iter().map(|cf| (*cf, cf_opts.clone())).collect(); let db = engine_test::kv::new_engine_opt(path_str, db_opts, cfs_opts).unwrap(); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index 1aee90b6463..c8fb02d424b 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ 
b/components/raftstore/src/store/compaction_guard.rs @@ -48,7 +48,7 @@ impl CompactionGuardGeneratorFactory

{ } // Update to implement engine_traits::SstPartitionerFactory instead once we move -// to use abstracted ColumnFamilyOptions in src/config.rs. +// to use abstracted CfOptions in src/config.rs. impl SstPartitionerFactory for CompactionGuardGeneratorFactory

{ @@ -200,7 +200,7 @@ mod tests { use engine_rocks::{ raw::{BlockBasedOptions, DBCompressionType}, util::new_engine_opt, - RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, }; use engine_traits::{CompactExt, Iterator, MiscExt, SstReader, SyncMutable, CF_DEFAULT}; use keys::DATA_PREFIX_KEY; @@ -391,7 +391,7 @@ mod tests { let db = new_engine_opt( temp_dir.path().to_str().unwrap(), - RocksDBOptions::default(), + RocksDbOptions::default(), vec![(CF_DEFAULT, cf_opts)], ) .unwrap(); diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 62721b5c1c9..6d309afa17f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2571,7 +2571,7 @@ where let persisted_msgs = ready.take_persisted_messages(); let mut has_write_ready = false; match &res { - HandleReadyResult::SendIOTask | HandleReadyResult::Snapshot { .. } => { + HandleReadyResult::SendIoTask | HandleReadyResult::Snapshot { .. } => { if !persisted_msgs.is_empty() { task.messages = self.build_raft_messages(ctx, persisted_msgs); } @@ -2602,7 +2602,7 @@ where self.raft_group.advance_append_async(ready); } } - HandleReadyResult::NoIOTask => { + HandleReadyResult::NoIoTask => { if let Some(last) = self.unpersisted_readies.back_mut() { // Attach to the last unpersisted ready so that it can be considered to be // persisted with the last ready at the same time. 
diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index cec0d44f081..aec48c1756f 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -115,7 +115,7 @@ impl From for RaftError { #[derive(PartialEq, Debug)] pub enum HandleReadyResult { - SendIOTask, + SendIoTask, Snapshot { msgs: Vec, snap_region: metapb::Region, @@ -124,7 +124,7 @@ pub enum HandleReadyResult { /// The first index before applying the snapshot. last_first_index: u64, }, - NoIOTask, + NoIoTask, } pub fn recover_from_applying_state( @@ -977,7 +977,7 @@ where let mut write_task = WriteTask::new(region_id, self.peer_id, ready.number()); - let mut res = HandleReadyResult::SendIOTask; + let mut res = HandleReadyResult::SendIoTask; if !ready.snapshot().is_empty() { fail_point!("raft_before_apply_snap"); let last_first_index = self.first_index().unwrap(); @@ -1023,7 +1023,7 @@ where } if !write_task.has_data() { - res = HandleReadyResult::NoIOTask; + res = HandleReadyResult::NoIoTask; } Ok((res, write_task)) diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 056f1f4832d..64bde3cf88b 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -175,13 +175,13 @@ impl Peekable for RegionSnapshot where S: Snapshot, { - type DBVector = ::DBVector; + type DbVector = ::DbVector; fn get_value_opt( &self, opts: &ReadOptions, key: &[u8], - ) -> EngineResult> { + ) -> EngineResult> { check_key_in_range( key, self.region.get_id(), @@ -200,7 +200,7 @@ where opts: &ReadOptions, cf: &str, key: &[u8], - ) -> EngineResult> { + ) -> EngineResult> { check_key_in_range( key, self.region.get_id(), diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index aeaf70f5b03..e7b024c38eb 100644 --- a/components/raftstore/src/store/snap.rs +++ 
b/components/raftstore/src/store/snap.rs @@ -15,9 +15,7 @@ use std::{ }; use collections::{HashMap, HashMapEntry as Entry}; -use encryption::{ - create_aes_ctr_crypter, encryption_method_from_db_encryption_method, DataKeyManager, Iv, -}; +use encryption::{create_aes_ctr_crypter, from_engine_encryption_method, DataKeyManager, Iv}; use engine_traits::{CfName, EncryptionKeyManager, KvEngine, CF_DEFAULT, CF_LOCK, CF_WRITE}; use error_code::{self, ErrorCode, ErrorCodeExt}; use fail::fail_point; @@ -617,7 +615,7 @@ impl Snapshot { if let Some(mgr) = &s.mgr.encryption_key_manager { let enc_info = mgr.new_file(&file_paths[idx])?; - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); if mthd != EncryptionMethod::Plaintext { let file_for_recving = cf_file.file_for_recving.last_mut().unwrap(); file_for_recving.encrypter = Some( @@ -1887,7 +1885,7 @@ pub mod tests { use encryption::{DataKeyManager, EncryptionConfig, FileConfig, MasterKeyConfig}; use encryption_export::data_key_manager_from_config; use engine_test::{ - ctor::{ColumnFamilyOptions, DBOptions, KvEngineConstructorExt, RaftDBOptions}, + ctor::{CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions}, kv::KvTestEngine, raft::RaftTestEngine, }; @@ -1921,16 +1919,16 @@ pub mod tests { const TEST_META_FILE_BUFFER_SIZE: usize = 1000; const BYTE_SIZE: usize = 1; - type DBBuilder = fn( + type DbBuilder = fn( p: &Path, - db_opt: Option, - cf_opts: Option>, + db_opt: Option, + cf_opts: Option>, ) -> Result; pub fn open_test_empty_db( path: &Path, - db_opt: Option, - cf_opts: Option>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1940,7 +1938,7 @@ pub mod tests { let cf_opts = cf_opts.unwrap_or_else(|| { ALL_CFS .iter() - .map(|cf| (*cf, ColumnFamilyOptions::default())) + .map(|cf| (*cf, CfOptions::default())) .collect() }); let db = E::new_kv_engine_opt(p, db_opt, cf_opts).unwrap(); @@ 
-1949,8 +1947,8 @@ pub mod tests { pub fn open_test_db( path: &Path, - db_opt: Option, - cf_opts: Option>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1969,8 +1967,8 @@ pub mod tests { pub fn open_test_db_with_100keys( path: &Path, - db_opt: Option, - cf_opts: Option>, + db_opt: Option, + cf_opts: Option>, ) -> Result where E: KvEngine + KvEngineConstructorExt, @@ -1991,9 +1989,9 @@ pub mod tests { pub fn get_test_db_for_regions( path: &TempDir, - raft_db_opt: Option, - kv_db_opt: Option, - kv_cf_opts: Option>, + raft_db_opt: Option, + kv_db_opt: Option, + kv_cf_opts: Option>, regions: &[u64], ) -> Result> { let p = path.path(); @@ -2116,9 +2114,9 @@ pub mod tests { (dir, key_manager.unwrap()) } - pub fn gen_db_options_with_encryption(prefix: &str) -> (TempDir, DBOptions) { + pub fn gen_db_options_with_encryption(prefix: &str) -> (TempDir, DbOptions) { let (_enc_dir, key_manager) = create_encryption_key_manager(prefix); - let mut db_opts = DBOptions::default(); + let mut db_opts = DbOptions::default(); db_opts.set_key_manager(Some(key_manager)); (_enc_dir, db_opts) } @@ -2193,7 +2191,7 @@ pub mod tests { test_snap_file(open_test_db_with_100keys, 500); } - fn test_snap_file(get_db: DBBuilder, max_file_size: u64) { + fn test_snap_file(get_db: DbBuilder, max_file_size: u64) { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let src_db_dir = Builder::new() @@ -2312,7 +2310,7 @@ pub mod tests { test_snap_validation(open_test_db_with_100keys, 500); } - fn test_snap_validation(get_db: DBBuilder, max_file_size: u64) { + fn test_snap_validation(get_db: DbBuilder, max_file_size: u64) { let region_id = 1; let region = gen_test_region(region_id, 1, 1); let db_dir = Builder::new() @@ -2827,7 +2825,7 @@ pub mod tests { let kv_cf_opts = ALL_CFS .iter() .map(|cf| { - let mut cf_opts = ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_no_range_properties(true); 
cf_opts.set_no_table_properties(true); (*cf, cf_opts) diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index c88c1bd3718..61986ffcd78 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -9,8 +9,7 @@ use std::{ }; use encryption::{ - encryption_method_from_db_encryption_method, DataKeyManager, DecrypterReader, EncrypterWriter, - Iv, + from_engine_encryption_method, DataKeyManager, DecrypterReader, EncrypterWriter, Iv, }; use engine_traits::{ CfName, EncryptionKeyManager, Error as EngineError, Iterable, KvEngine, Mutable, @@ -61,7 +60,7 @@ where if let Some(key_mgr) = key_mgr { let enc_info = box_try!(key_mgr.new_file(path)); - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); if mthd != EncryptionMethod::Plaintext { let writer = box_try!(EncrypterWriter::new( file.take().unwrap(), @@ -284,7 +283,7 @@ pub fn get_decrypter_reader( encryption_key_manager: &DataKeyManager, ) -> Result, Error> { let enc_info = box_try!(encryption_key_manager.get_file(file)); - let mthd = encryption_method_from_db_encryption_method(enc_info.method); + let mthd = from_engine_encryption_method(enc_info.method); debug!( "get_decrypter_reader gets enc_info for {:?}, method: {:?}", file, mthd diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index a829d2fe01c..958da2adaa6 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -253,7 +253,7 @@ mod tests { use std::{thread::sleep, time::Duration}; use engine_test::{ - ctor::{ColumnFamilyOptions, DBOptions}, + ctor::{CfOptions, DbOptions}, kv::{new_engine, new_engine_opt, KvTestEngine}, }; use engine_traits::{ @@ -325,13 +325,13 @@ mod tests { } fn open_db(path: &str) -> KvTestEngine { - let db_opts = DBOptions::default(); - let mut 
cf_opts = ColumnFamilyOptions::new(); + let db_opts = DbOptions::default(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_file_num_compaction_trigger(8); let cfs_opts = vec![ - (CF_DEFAULT, ColumnFamilyOptions::new()), - (CF_RAFT, ColumnFamilyOptions::new()), - (CF_LOCK, ColumnFamilyOptions::new()), + (CF_DEFAULT, CfOptions::new()), + (CF_RAFT, CfOptions::new()), + (CF_LOCK, CfOptions::new()), (CF_WRITE, cf_opts), ]; new_engine_opt(path, db_opts, cfs_opts).unwrap() diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index 88e30e33104..f93213dfa0d 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -6,8 +6,8 @@ use std::{ sync::mpsc::Sender, }; -use engine_traits::{Engines, KvEngine, RaftEngine, RaftLogGCTask}; -use file_system::{IOType, WithIOType}; +use engine_traits::{Engines, KvEngine, RaftEngine, RaftLogGcTask}; +use file_system::{IoType, WithIoType}; use thiserror::Error; use tikv_util::{ box_try, debug, error, @@ -88,7 +88,7 @@ impl Runner { } /// Does the GC job and returns the count of logs collected. - fn gc_raft_log(&mut self, regions: Vec) -> Result { + fn gc_raft_log(&mut self, regions: Vec) -> Result { fail::fail_point!("worker_gc_raft_log", |s| { Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) }); @@ -137,7 +137,7 @@ impl Runner { "end_index" => t.end_idx, ); } - groups.push(RaftLogGCTask { + groups.push(RaftLogGcTask { raft_group_id: t.region_id, from: t.start_idx, to: t.end_idx, @@ -171,7 +171,7 @@ where type Task = Task; fn run(&mut self, task: Task) { - let _io_type_guard = WithIOType::new(IOType::ForegroundWrite); + let _io_type_guard = WithIoType::new(IoType::ForegroundWrite); let flush_now = task.flush; self.tasks.push(task); // TODO: maybe they should also be batched even `flush_now` is true. 
diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index d3681654975..6555e96f102 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -6,7 +6,7 @@ use std::{ }; use batch_system::{BatchRouter, Fsm, FsmTypes, HandlerBuilder, Poller, PoolState, Priority}; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use tikv_util::{ debug, error, info, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, worker::Runnable, }; @@ -74,7 +74,7 @@ where ))) .spawn_wrapper(move || { tikv_util::thread_group::set_properties(props); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); poller.poll(); }) .unwrap(); diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 5e2cc8992f5..d15e40e6f5e 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -18,7 +18,7 @@ use std::{ use engine_traits::{DeleteStrategy, KvEngine, Mutable, Range, WriteBatch, CF_LOCK, CF_RAFT}; use fail::fail_point; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; use pd_client::PdClient; use raft::eraftpb::Snapshot as RaftSnapshot; @@ -322,10 +322,10 @@ where } let start = Instant::now(); - let _io_type_guard = WithIOType::new(if for_balance { - IOType::LoadBalance + let _io_type_guard = WithIoType::new(if for_balance { + IoType::LoadBalance } else { - IOType::Replication + IoType::Replication }); if let Err(e) = self.generate_snap( @@ -821,7 +821,7 @@ mod tests { }; use engine_test::{ - ctor::ColumnFamilyOptions, + ctor::CfOptions, kv::{KvTestEngine, KvTestSnapshot}, }; use engine_traits::{ @@ -990,7 +990,7 @@ mod tests { .tempdir() .unwrap(); - let mut cf_opts = 
ColumnFamilyOptions::new(); + let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_slowdown_writes_trigger(5); cf_opts.set_disable_auto_compactions(true); let kv_cfs_opts = vec![ diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 14a1a5b7bbc..81fa843ace0 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -8,7 +8,7 @@ use std::{ }; use engine_traits::{CfName, IterOptions, Iterable, Iterator, KvEngine, CF_WRITE, LARGE_CFS}; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use itertools::Itertools; use kvproto::{ metapb::{Region, RegionEpoch}, @@ -639,7 +639,7 @@ where { type Task = Task; fn run(&mut self, task: Task) { - let _io_type_guard = WithIOType::new(IOType::LoadBalance); + let _io_type_guard = WithIoType::new(IoType::LoadBalance); match task { Task::SplitCheckTask { region, diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index d984ccb353d..c0be3ba276b 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -166,7 +166,7 @@ impl SecurityManager { sb.bind(addr, port) } else { if !self.cfg.cert_allowed_cn.is_empty() { - let cn_checker = CNChecker { + let cn_checker = CnChecker { allowed_cn: Arc::new(self.cfg.cert_allowed_cn.clone()), }; sb = sb.add_checker(cn_checker); @@ -186,11 +186,11 @@ impl SecurityManager { } #[derive(Clone)] -struct CNChecker { +struct CnChecker { allowed_cn: Arc>, } -impl ServerChecker for CNChecker { +impl ServerChecker for CnChecker { fn check(&mut self, ctx: &RpcContext<'_>) -> CheckResult { match check_common_name(&self.allowed_cn, ctx) { Ok(()) => CheckResult::Continue, diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 73269c3f07a..425acf6e15c 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -43,13 +43,13 @@ use 
engine_rocks::{ }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - RaftEngine, TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, + CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, + TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ - get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IOBudgetAdjustor, - MetricsManager as IOMetricsManager, + get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor, + MetricsManager as IoMetricsManager, }; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; @@ -82,7 +82,7 @@ use raftstore::{ }; use security::SecurityManager; use tikv::{ - config::{ConfigController, DBConfigManger, DBType, LogConfigManager, TiKvConfig}, + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TiKvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -1249,7 +1249,7 @@ impl TiKvServer { .build(!stats_collector_enabled /* enable_statistics */), ); let fetcher = if stats_collector_enabled { - BytesFetcher::FromIOStatsCollector() + BytesFetcher::FromIoStatsCollector() } else { BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) }; @@ -1267,7 +1267,7 @@ impl TiKvServer { let mut engine_metrics = EngineMetricsManager::::new( self.engines.as_ref().unwrap().engines.clone(), ); - let mut io_metrics = IOMetricsManager::new(fetcher); + let mut io_metrics = IoMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); self.background_worker .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { @@ -1582,9 +1582,9 @@ impl ConfiguredRaftEngine for RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { cfg_controller.register( 
tikv::config::Module::Raftdb, - Box::new(DBConfigManger::new( + Box::new(DbConfigManger::new( Arc::new(self.clone()), - DBType::Raft, + DbType::Raft, share_cache, )), ); @@ -1669,9 +1669,9 @@ impl TiKvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DBConfigManger::new( + Box::new(DbConfigManger::new( factory.clone(), - DBType::Kv, + DbType::Kv, self.config.storage.block_cache.shared, )), ); @@ -1887,7 +1887,7 @@ impl EnginesResourceInfo { } } -impl IOBudgetAdjustor for EnginesResourceInfo { +impl IoBudgetAdjustor for EnginesResourceInfo { fn adjust(&self, total_budgets: usize) -> usize { let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 / Self::SCALE_FACTOR as f32; diff --git a/components/sst_importer/src/import_mode.rs b/components/sst_importer/src/import_mode.rs index 39dca3bea02..98a4aae7fe8 100644 --- a/components/sst_importer/src/import_mode.rs +++ b/components/sst_importer/src/import_mode.rs @@ -8,7 +8,7 @@ use std::{ time::{Duration, Instant}, }; -use engine_traits::{ColumnFamilyOptions, DBOptions, KvEngine}; +use engine_traits::{CfOptions, DbOptions, KvEngine}; use futures::executor::ThreadPool; use futures_util::compat::Future01CompatExt; use kvproto::import_sstpb::*; @@ -16,19 +16,19 @@ use tikv_util::timer::GLOBAL_TIMER_HANDLE; use super::{Config, Result}; -pub type RocksDBMetricsFn = fn(cf: &str, name: &str, v: f64); +pub type RocksDbMetricsFn = fn(cf: &str, name: &str, v: f64); struct ImportModeSwitcherInner { is_import: Arc, - backup_db_options: ImportModeDBOptions, - backup_cf_options: Vec<(String, ImportModeCFOptions)>, + backup_db_options: ImportModeDbOptions, + backup_cf_options: Vec<(String, ImportModeCfOptions)>, timeout: Duration, next_check: Instant, - metrics_fn: RocksDBMetricsFn, + metrics_fn: RocksDbMetricsFn, } impl ImportModeSwitcherInner { - fn enter_normal_mode(&mut self, db: &E, mf: RocksDBMetricsFn) -> Result { + fn 
enter_normal_mode(&mut self, db: &E, mf: RocksDbMetricsFn) -> Result { if !self.is_import.load(Ordering::Acquire) { return Ok(false); } @@ -43,18 +43,18 @@ impl ImportModeSwitcherInner { Ok(true) } - fn enter_import_mode(&mut self, db: &E, mf: RocksDBMetricsFn) -> Result { + fn enter_import_mode(&mut self, db: &E, mf: RocksDbMetricsFn) -> Result { if self.is_import.load(Ordering::Acquire) { return Ok(false); } - self.backup_db_options = ImportModeDBOptions::new_options(db); + self.backup_db_options = ImportModeDbOptions::new_options(db); self.backup_cf_options.clear(); let import_db_options = self.backup_db_options.optimized_for_import_mode(); import_db_options.set_options(db)?; for cf_name in db.cf_names() { - let cf_opts = ImportModeCFOptions::new_options(db, cf_name); + let cf_opts = ImportModeCfOptions::new_options(db, cf_name); let import_cf_options = cf_opts.optimized_for_import_mode(); self.backup_cf_options.push((cf_name.to_owned(), cf_opts)); import_cf_options.set_options(db, cf_name, mf)?; @@ -79,7 +79,7 @@ impl ImportModeSwitcher { let is_import = Arc::new(AtomicBool::new(false)); let inner = Arc::new(Mutex::new(ImportModeSwitcherInner { is_import: is_import.clone(), - backup_db_options: ImportModeDBOptions::new(), + backup_db_options: ImportModeDbOptions::new(), backup_cf_options: Vec::new(), timeout, next_check: Instant::now() + timeout, @@ -120,14 +120,14 @@ impl ImportModeSwitcher { executor.spawn_ok(timer_loop); } - pub fn enter_normal_mode(&self, db: &E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_normal_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { if !self.is_import.load(Ordering::Acquire) { return Ok(false); } self.inner.lock().unwrap().enter_normal_mode(db, mf) } - pub fn enter_import_mode(&self, db: &E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_import_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { let mut inner = self.inner.lock().unwrap(); let ret = inner.enter_import_mode(db, mf)?; inner.next_check = Instant::now() 
+ inner.timeout; @@ -144,11 +144,11 @@ impl ImportModeSwitcher { } } -struct ImportModeDBOptions { +struct ImportModeDbOptions { max_background_jobs: i32, } -impl ImportModeDBOptions { +impl ImportModeDbOptions { fn new() -> Self { Self { max_background_jobs: 32, @@ -161,9 +161,9 @@ impl ImportModeDBOptions { } } - fn new_options(db: &impl KvEngine) -> ImportModeDBOptions { + fn new_options(db: &impl KvEngine) -> ImportModeDbOptions { let db_opts = db.get_db_options(); - ImportModeDBOptions { + ImportModeDbOptions { max_background_jobs: db_opts.get_max_background_jobs(), } } @@ -179,14 +179,14 @@ impl ImportModeDBOptions { } } -struct ImportModeCFOptions { +struct ImportModeCfOptions { level0_stop_writes_trigger: u32, level0_slowdown_writes_trigger: u32, soft_pending_compaction_bytes_limit: u64, hard_pending_compaction_bytes_limit: u64, } -impl ImportModeCFOptions { +impl ImportModeCfOptions { fn optimized_for_import_mode(&self) -> Self { Self { level0_stop_writes_trigger: self.level0_stop_writes_trigger.max(1 << 30), @@ -196,10 +196,10 @@ impl ImportModeCFOptions { } } - fn new_options(db: &impl KvEngine, cf_name: &str) -> ImportModeCFOptions { + fn new_options(db: &impl KvEngine, cf_name: &str) -> ImportModeCfOptions { let cf_opts = db.get_options_cf(cf_name).unwrap(); //FIXME unwrap - ImportModeCFOptions { + ImportModeCfOptions { level0_stop_writes_trigger: cf_opts.get_level_zero_stop_writes_trigger(), level0_slowdown_writes_trigger: cf_opts.get_level_zero_slowdown_writes_trigger(), soft_pending_compaction_bytes_limit: cf_opts.get_soft_pending_compaction_bytes_limit(), @@ -207,7 +207,7 @@ impl ImportModeCFOptions { } } - fn set_options(&self, db: &impl KvEngine, cf_name: &str, mf: RocksDBMetricsFn) -> Result<()> { + fn set_options(&self, db: &impl KvEngine, cf_name: &str, mf: RocksDbMetricsFn) -> Result<()> { let opts = [ ( "level0_stop_writes_trigger".to_owned(), @@ -252,8 +252,8 @@ mod tests { fn check_import_options( db: &E, - expected_db_opts: 
&ImportModeDBOptions, - expected_cf_opts: &ImportModeCFOptions, + expected_db_opts: &ImportModeDbOptions, + expected_cf_opts: &ImportModeCfOptions, ) where E: KvEngine, { @@ -292,9 +292,9 @@ mod tests { .unwrap(); let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, "a", "b"]); - let normal_db_options = ImportModeDBOptions::new_options(&db); + let normal_db_options = ImportModeDbOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); let import_cf_options = normal_cf_options.optimized_for_import_mode(); assert!( @@ -333,9 +333,9 @@ mod tests { .unwrap(); let db = new_test_engine(temp_dir.path().to_str().unwrap(), &[CF_DEFAULT, "a", "b"]); - let normal_db_options = ImportModeDBOptions::new_options(&db); + let normal_db_options = ImportModeDbOptions::new_options(&db); let import_db_options = normal_db_options.optimized_for_import_mode(); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); let import_cf_options = normal_cf_options.optimized_for_import_mode(); fn mf(_cf: &str, _name: &str, _v: f64) {} @@ -374,7 +374,7 @@ mod tests { |_, opt| opt.set_level_zero_stop_writes_trigger(2_000_000_000), ); - let normal_cf_options = ImportModeCFOptions::new_options(&db, "default"); + let normal_cf_options = ImportModeCfOptions::new_options(&db, "default"); assert_eq!(normal_cf_options.level0_stop_writes_trigger, 2_000_000_000); let import_cf_options = normal_cf_options.optimized_for_import_mode(); assert_eq!(import_cf_options.level0_stop_writes_trigger, 2_000_000_000); diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 356541cebbb..b6d13ac9761 100644 --- a/components/sst_importer/src/sst_importer.rs +++ 
b/components/sst_importer/src/sst_importer.rs @@ -11,7 +11,7 @@ use std::{ }; use dashmap::DashMap; -use encryption::{encryption_method_to_db_encryption_method, DataKeyManager}; +use encryption::{to_engine_encryption_method, DataKeyManager}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ name_to_cf, util::check_key_in_range, CfName, EncryptionKeyManager, FileEncryptionInfo, @@ -33,7 +33,7 @@ use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ import_file::{ImportDir, ImportFile}, - import_mode::{ImportModeSwitcher, RocksDBMetricsFn}, + import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, sst_writer::{RawSstWriter, TxnSstWriter}, Config, Error, Result, @@ -211,11 +211,11 @@ impl SstImporter { } } - pub fn enter_normal_mode(&self, db: E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_normal_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { self.switcher.enter_normal_mode(&db, mf) } - pub fn enter_import_mode(&self, db: E, mf: RocksDBMetricsFn) -> Result { + pub fn enter_import_mode(&self, db: E, mf: RocksDbMetricsFn) -> Result { self.switcher.enter_import_mode(&db, mf) } @@ -488,7 +488,7 @@ impl SstImporter { let path = self.dir.join(meta)?; let file_crypter = crypter.map(|c| FileEncryptionInfo { - method: encryption_method_to_db_encryption_method(c.cipher_type), + method: to_engine_encryption_method(c.cipher_type), key: c.cipher_key, iv: meta.cipher_iv.to_owned(), }); diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index 6ba4d892717..9266378845d 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -70,12 +70,12 @@ mod tests { use encryption::DataKeyManager; use engine_rocks::{ - util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstWriterBuilder, - RocksTitanDBOptions, + util::new_engine_opt, RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstWriterBuilder, + RocksTitanDbOptions, }; use engine_traits::{ - CfName, 
ColumnFamilyOptions, DBOptions, EncryptionKeyManager, ImportExt, Peekable, - SstWriter, SstWriterBuilder, TitanDBOptions, CF_DEFAULT, + CfName, CfOptions, DbOptions, EncryptionKeyManager, ImportExt, Peekable, SstWriter, + SstWriterBuilder, TitanDbOptions, CF_DEFAULT, }; use tempfile::Builder; use test_util::encryption::new_test_key_manager; @@ -115,7 +115,7 @@ mod tests { } fn check_prepare_sst_for_ingestion( - db_opts: Option, + db_opts: Option, cf_opts: Option>, key_manager: Option<&DataKeyManager>, was_encrypted: bool, @@ -188,8 +188,8 @@ mod tests { #[test] fn test_prepare_sst_for_ingestion_titan() { - let mut db_opts = RocksDBOptions::new(); - let mut titan_opts = RocksTitanDBOptions::new(); + let mut db_opts = RocksDbOptions::new(); + let mut titan_opts = RocksTitanDbOptions::new(); // Force all values write out to blob files. titan_opts.set_min_blob_size(0); db_opts.set_titandb_options(&titan_opts); diff --git a/components/test_coprocessor/src/dag.rs b/components/test_coprocessor/src/dag.rs index 4165d19bdb4..740ece83e1a 100644 --- a/components/test_coprocessor/src/dag.rs +++ b/components/test_coprocessor/src/dag.rs @@ -277,15 +277,15 @@ impl DAGSelect { } } -pub struct DAGChunkSpliter { +pub struct DagChunkSpliter { chunks: Vec, datums: Vec, col_cnt: usize, } -impl DAGChunkSpliter { - pub fn new(chunks: Vec, col_cnt: usize) -> DAGChunkSpliter { - DAGChunkSpliter { +impl DagChunkSpliter { + pub fn new(chunks: Vec, col_cnt: usize) -> DagChunkSpliter { + DagChunkSpliter { chunks, col_cnt, datums: Vec::with_capacity(0), @@ -293,7 +293,7 @@ impl DAGChunkSpliter { } } -impl Iterator for DAGChunkSpliter { +impl Iterator for DagChunkSpliter { type Item = Vec; fn next(&mut self) -> Option> { diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 301647bf267..097e74f157b 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -18,7 +18,7 @@ use engine_traits::{ CompactExt, 
Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, }; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use futures::executor::block_on; use kvproto::{ errorpb::Error as PbError, @@ -160,7 +160,7 @@ pub struct Cluster { pub dbs: Vec>, pub store_metas: HashMap>>, key_managers: Vec>>, - pub io_rate_limiter: Option>, + pub io_rate_limiter: Option>, pub engines: HashMap>, key_managers_map: HashMap>>, pub labels: HashMap>, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index c399b4813f2..e33837ebd76 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -19,7 +19,7 @@ use engine_traits::{ Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, TabletFactory, ALL_CFS, CF_DEFAULT, CF_RAFT, }; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -625,7 +625,7 @@ pub fn must_contains_error(resp: &RaftCmdResponse, msg: &str) { pub fn create_test_engine( // TODO: pass it in for all cases. 
router: Option>, - limiter: Option>, + limiter: Option>, cfg: &Config, ) -> ( Engines, diff --git a/components/test_sst_importer/src/lib.rs b/components/test_sst_importer/src/lib.rs index 65d2a3dc70a..9397a6bb35b 100644 --- a/components/test_sst_importer/src/lib.rs +++ b/components/test_sst_importer/src/lib.rs @@ -5,7 +5,7 @@ use std::{collections::HashMap, fs, path::Path, sync::Arc}; use engine_rocks::{ raw::{DBEntryType, Env, TablePropertiesCollector, TablePropertiesCollectorFactory}, util::new_engine_opt, - RocksCfOptions, RocksDBOptions, RocksEngine, RocksSstReader, RocksSstWriterBuilder, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstReader, RocksSstWriterBuilder, }; pub use engine_rocks::{RocksEngine as TestEngine, RocksSstWriter}; use engine_traits::{KvEngine, SstWriter, SstWriterBuilder}; @@ -47,8 +47,8 @@ where }) .collect(); - let db_opts = env.map_or_else(RocksDBOptions::default, |e| { - let mut opts = RocksDBOptions::default(); + let db_opts = env.map_or_else(RocksDbOptions::default, |e| { + let mut opts = RocksDbOptions::default(); opts.set_env(e); opts }); diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs index b1539e7c581..d06bf49c025 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs @@ -2,11 +2,11 @@ use super::*; -pub trait UTF8CompatibleEncoding { +pub trait Utf8CompatibleEncoding { const NAME: &'static str; } -impl Encoding for T { +impl Encoding for T { #[inline] fn decode(data: BytesRef<'_>) -> Result { match str::from_utf8(data) { @@ -17,22 +17,22 @@ impl Encoding for T { } #[derive(Debug)] -pub struct EncodingUTF8Mb4; +pub struct EncodingUtf8Mb4; -impl UTF8CompatibleEncoding for EncodingUTF8Mb4 { +impl Utf8CompatibleEncoding for EncodingUtf8Mb4 { const NAME: &'static str = "utf8mb4"; } #[derive(Debug)] -pub struct EncodingUTF8; +pub 
struct EncodingUtf8; -impl UTF8CompatibleEncoding for EncodingUTF8 { +impl Utf8CompatibleEncoding for EncodingUtf8 { const NAME: &'static str = "utf8"; } #[derive(Debug)] pub struct EncodingLatin1; -impl UTF8CompatibleEncoding for EncodingLatin1 { +impl Utf8CompatibleEncoding for EncodingLatin1 { const NAME: &'static str = "latin1"; } diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index 0d6a8e6d9ea..b3033c06d84 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -49,8 +49,8 @@ macro_rules! match_template_charset { match_template::match_template! { $t = [ - UTF8 => EncodingUTF8, - UTF8Mb4 => EncodingUTF8Mb4, + UTF8 => EncodingUtf8, + UTF8Mb4 => EncodingUtf8Mb4, Latin1 => EncodingLatin1, GBK => EncodingGBK, Binary => EncodingBinary, diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index 278ef48469a..930070e87a2 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -52,49 +52,49 @@ use crate::{codec::convert::ConvertTo, expr::EvalContext, EvalType}; /// A trait of evaluating current concrete eval type into a MySQL logic value, /// represented by Rust's `bool` type. -pub trait AsMySQLBool { +pub trait AsMySqlBool { /// Evaluates into a MySQL logic value. 
fn as_mysql_bool(&self, context: &mut EvalContext) -> Result; } -impl AsMySQLBool for Int { +impl AsMySqlBool for Int { #[inline] fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(*self != 0) } } -impl AsMySQLBool for Real { +impl AsMySqlBool for Real { #[inline] fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(self.into_inner() != 0f64) } } -impl<'a, T: AsMySQLBool> AsMySQLBool for &'a T { +impl<'a, T: AsMySqlBool> AsMySqlBool for &'a T { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { (**self).as_mysql_bool(context) } } -impl AsMySQLBool for Bytes { +impl AsMySqlBool for Bytes { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { self.as_slice().as_mysql_bool(context) } } -impl<'a> AsMySQLBool for BytesRef<'a> { +impl<'a> AsMySqlBool for BytesRef<'a> { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { Ok(!self.is_empty() && ConvertTo::::convert(self, context)? != 0f64) } } -impl<'a, T> AsMySQLBool for Option<&'a T> +impl<'a, T> AsMySqlBool for Option<&'a T> where - T: AsMySQLBool, + T: AsMySqlBool, { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { @@ -104,25 +104,25 @@ where } } -impl<'a> AsMySQLBool for JsonRef<'a> { +impl<'a> AsMySqlBool for JsonRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_zero()) } } -impl<'a> AsMySQLBool for EnumRef<'a> { +impl<'a> AsMySqlBool for EnumRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_empty()) } } -impl<'a> AsMySQLBool for SetRef<'a> { +impl<'a> AsMySqlBool for SetRef<'a> { fn as_mysql_bool(&self, _context: &mut EvalContext) -> Result { Ok(!self.is_empty()) } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -131,7 +131,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { 
+impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -140,7 +140,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), @@ -149,7 +149,7 @@ impl<'a> AsMySQLBool for Option> { } } -impl<'a> AsMySQLBool for Option> { +impl<'a> AsMySqlBool for Option> { fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match self { None => Ok(false), diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index b95dbb63342..d476fd2d370 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -83,7 +83,7 @@ impl ScalarValue { } } -impl AsMySQLBool for ScalarValue { +impl AsMySqlBool for ScalarValue { #[inline] fn as_mysql_bool(&self, context: &mut EvalContext) -> Result { match_template_evaltype! 
{ diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index 8d2e62b6ac0..9d791d911cd 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -24,7 +24,7 @@ use super::{ use crate::{ codec::{ convert::{ConvertTo, ToInt}, - data_type::AsMySQLBool, + data_type::AsMySqlBool, }, expr::EvalContext, FieldTypeTp, diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index a172d2e2723..2518e003ba3 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -1943,7 +1943,7 @@ impl Display for Decimal { } } -impl crate::codec::data_type::AsMySQLBool for Decimal { +impl crate::codec::data_type::AsMySqlBool for Decimal { #[inline] fn as_mysql_bool(&self, _ctx: &mut EvalContext) -> crate::codec::Result { Ok(!self.is_zero()) diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 370467b9928..3869f773020 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -703,7 +703,7 @@ pub trait DurationDecoder: NumberDecoder { impl DurationDecoder for T {} -impl crate::codec::data_type::AsMySQLBool for Duration { +impl crate::codec::data_type::AsMySqlBool for Duration { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { Ok(!self.is_zero()) diff --git a/components/tidb_query_datatype/src/codec/mysql/enums.rs b/components/tidb_query_datatype/src/codec/mysql/enums.rs index 9a591cf750a..fecada58b1d 100644 --- a/components/tidb_query_datatype/src/codec/mysql/enums.rs +++ b/components/tidb_query_datatype/src/codec/mysql/enums.rs @@ -84,7 +84,7 @@ impl PartialOrd for Enum { } } -impl 
crate::codec::data_type::AsMySQLBool for Enum { +impl crate::codec::data_type::AsMySqlBool for Enum { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { Ok(self.value != 0) diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 7251f5477f6..2e5abc6f87a 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -487,7 +487,7 @@ impl ConvertTo for Duration { } } -impl crate::codec::data_type::AsMySQLBool for Json { +impl crate::codec::data_type::AsMySqlBool for Json { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { // TODO: This logic is not correct. See pingcap/tidb#9593 diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index 1b848c3534f..19fec765d1c 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -15,9 +15,9 @@ use crate::codec::Error; /// https://github.com/pingcap/tidb/blob/master/types/json/binary.go /// We add a space after `,` and `:`. 
#[derive(Clone, Debug)] -pub struct MySQLFormatter {} +pub struct MySqlFormatter {} -impl serde_json::ser::Formatter for MySQLFormatter { +impl serde_json::ser::Formatter for MySqlFormatter { #[inline] fn begin_object_value(&mut self, writer: &mut W) -> std::io::Result<()> where @@ -51,9 +51,9 @@ impl serde_json::ser::Formatter for MySQLFormatter { } } -impl MySQLFormatter { +impl MySqlFormatter { pub fn new() -> Self { - MySQLFormatter {} + MySqlFormatter {} } } @@ -62,7 +62,7 @@ impl<'a> ToString for JsonRef<'a> { /// `to_writer_pretty` fn to_string(&self) -> String { let mut writer = Vec::with_capacity(128); - let mut ser = JsonSerializer::with_formatter(&mut writer, MySQLFormatter::new()); + let mut ser = JsonSerializer::with_formatter(&mut writer, MySqlFormatter::new()); self.serialize(&mut ser).unwrap(); unsafe { // serde_json will not emit invalid UTF-8 diff --git a/components/tidb_query_datatype/src/codec/mysql/set.rs b/components/tidb_query_datatype/src/codec/mysql/set.rs index 0d5a28e2ba5..62539c1ff2c 100644 --- a/components/tidb_query_datatype/src/codec/mysql/set.rs +++ b/components/tidb_query_datatype/src/codec/mysql/set.rs @@ -69,7 +69,7 @@ impl PartialOrd for Set { } } -impl crate::codec::data_type::AsMySQLBool for Set { +impl crate::codec::data_type::AsMySqlBool for Set { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) -> crate::codec::Result { Ok(self.value > 0) diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 5d387f1cdff..79068b38118 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -1961,7 +1961,7 @@ pub trait TimeDecoder: NumberDecoder { impl TimeDecoder for T {} -impl crate::codec::data_type::AsMySQLBool for Time { +impl crate::codec::data_type::AsMySqlBool for Time { #[inline] fn as_mysql_bool(&self, _context: &mut crate::expr::EvalContext) 
-> crate::codec::Result { Ok(!self.is_zero()) diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index 94e9dd0a9ae..463a969284d 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -13,17 +13,17 @@ use crate::codec::{Error, Result}; pub enum RowSlice<'a> { Small { origin: &'a [u8], - non_null_ids: LEBytes<'a, u8>, - null_ids: LEBytes<'a, u8>, - offsets: LEBytes<'a, u16>, - values: LEBytes<'a, u8>, + non_null_ids: LeBytes<'a, u8>, + null_ids: LeBytes<'a, u8>, + offsets: LeBytes<'a, u16>, + values: LeBytes<'a, u8>, }, Big { origin: &'a [u8], - non_null_ids: LEBytes<'a, u32>, - null_ids: LEBytes<'a, u32>, - offsets: LEBytes<'a, u32>, - values: LEBytes<'a, u8>, + non_null_ids: LeBytes<'a, u32>, + null_ids: LeBytes<'a, u32>, + offsets: LeBytes<'a, u32>, + values: LeBytes<'a, u8>, }, } @@ -45,7 +45,7 @@ impl RowSlice<'_> { non_null_ids: read_le_bytes(&mut data, non_null_cnt)?, null_ids: read_le_bytes(&mut data, null_cnt)?, offsets: read_le_bytes(&mut data, non_null_cnt)?, - values: LEBytes::new(data), + values: LeBytes::new(data), } } else { RowSlice::Small { @@ -53,7 +53,7 @@ impl RowSlice<'_> { non_null_ids: read_le_bytes(&mut data, non_null_cnt)?, null_ids: read_le_bytes(&mut data, null_cnt)?, offsets: read_le_bytes(&mut data, non_null_cnt)?, - values: LEBytes::new(data), + values: LeBytes::new(data), } }; Ok(row) @@ -175,7 +175,7 @@ impl RowSlice<'_> { /// use little endianness. 
#[cfg(target_endian = "little")] #[inline] -fn read_le_bytes<'a, T>(buf: &mut &'a [u8], len: usize) -> Result> +fn read_le_bytes<'a, T>(buf: &mut &'a [u8], len: usize) -> Result> where T: PrimInt, { @@ -185,17 +185,17 @@ where } let slice = &buf[..bytes_len]; buf.advance(bytes_len); - Ok(LEBytes::new(slice)) + Ok(LeBytes::new(slice)) } #[cfg(target_endian = "little")] -pub struct LEBytes<'a, T: PrimInt> { +pub struct LeBytes<'a, T: PrimInt> { slice: &'a [u8], _marker: PhantomData, } #[cfg(target_endian = "little")] -impl<'a, T: PrimInt> LEBytes<'a, T> { +impl<'a, T: PrimInt> LeBytes<'a, T> { fn new(slice: &'a [u8]) -> Self { Self { slice, diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index bcbf2b8f92b..9f23d434a6c 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -371,7 +371,7 @@ enum RestoreData<'a> { #[derive(PartialEq, Debug, Copy, Clone)] enum DecodePartitionIdOp<'a> { Nop, - PID(&'a [u8]), + Pid(&'a [u8]), } impl IndexScanExecutorImpl { @@ -662,7 +662,7 @@ impl IndexScanExecutorImpl { } else if partition_id_bytes.is_empty() { DecodePartitionIdOp::Nop } else { - DecodePartitionIdOp::PID(partition_id_bytes) + DecodePartitionIdOp::Pid(partition_id_bytes) } }; @@ -803,7 +803,7 @@ impl IndexScanExecutorImpl { ) -> Result<()> { match decode_pid { DecodePartitionIdOp::Nop => {} - DecodePartitionIdOp::PID(pid) => { + DecodePartitionIdOp::Pid(pid) => { // If need partition id, append partition id to the last column // before physical table id column if exists. 
let pid = NumberCodec::decode_i64(pid); diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index 61030e593e0..b7a19da9026 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -136,7 +136,7 @@ fn update_logical_rows_by_vector_value<'a, TT: EvaluableRef<'a>, T: 'a + ChunkRe eval_result_logical_rows: LogicalRows<'_>, ) -> tidb_query_common::error::Result<()> where - Option: AsMySQLBool, + Option: AsMySqlBool, { let mut err_result = Ok(()); let mut logical_index = 0; diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 995f2ed0e21..cfa171054c9 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -577,7 +577,7 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { mod tests { use engine_rocks::{ util::{new_engine_opt, new_temp_engine, FixedPrefixSliceTransform}, - RocksCfOptions, RocksDBOptions, RocksEngine, RocksSnapshot, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksSnapshot, }; use engine_traits::{IterOptions, SyncMutable, CF_DEFAULT}; use keys::data_key; @@ -623,7 +623,7 @@ mod tests { .unwrap(); let engine = new_engine_opt( path.path().to_str().unwrap(), - RocksDBOptions::default(), + RocksDbOptions::default(), vec![(CF_DEFAULT, cf_opts)], ) .unwrap(); diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 0cc90730acd..44d5e698f5c 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -11,12 +11,12 @@ use std::{ pub use engine_rocks::RocksSnapshot; use engine_rocks::{ - get_env, RocksCfOptions, RocksDBOptions, RocksEngine as BaseRocksEngine, RocksEngineIterator, + get_env, RocksCfOptions, RocksDbOptions, RocksEngine as BaseRocksEngine, RocksEngineIterator, }; use engine_traits::{ CfName, Engines, IterOptions, Iterable, Iterator, 
KvEngine, Peekable, ReadOptions, }; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb}; use raftstore::coprocessor::CoprocessorHost; use tempfile::{Builder, TempDir}; @@ -88,10 +88,10 @@ pub struct RocksEngine { impl RocksEngine { pub fn new( path: &str, - db_opts: Option, + db_opts: Option, cfs_opts: Vec<(CfName, RocksCfOptions)>, shared_block_cache: bool, - io_rate_limiter: Option>, + io_rate_limiter: Option>, ) -> Result { info!("RocksEngine: creating for path"; "path" => path); let (path, temp_dir) = match path { diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index dc5d4a3b862..91ecd803b89 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -1014,8 +1014,8 @@ mod tests { } } - struct RaftDBWriter; - impl Write for RaftDBWriter { + struct RaftDbWriter; + impl Write for RaftDbWriter { fn write(&mut self, buf: &[u8]) -> io::Result { RAFTDB_BUFFER.with(|buffer| buffer.borrow_mut().write(buf)) } @@ -1029,7 +1029,7 @@ mod tests { let normal = TikvFormat::new(PlainSyncDecorator::new(NormalWriter), true); let slow = TikvFormat::new(PlainSyncDecorator::new(SlowLogWriter), true); let rocksdb = TikvFormat::new(PlainSyncDecorator::new(RocksdbLogWriter), true); - let raftdb = TikvFormat::new(PlainSyncDecorator::new(RaftDBWriter), true); + let raftdb = TikvFormat::new(PlainSyncDecorator::new(RaftDbWriter), true); let drain = LogDispatcher::new(normal, rocksdb, raftdb, Some(slow)).fuse(); let drain = SlowLogFilter { threshold: 200, diff --git a/scripts/clippy b/scripts/clippy index 58bdafb817b..491362410c1 100755 --- a/scripts/clippy +++ b/scripts/clippy @@ -20,7 +20,6 @@ CLIPPY_LINTS=(-A clippy::module_inception \ -A clippy::too_many_arguments \ -A clippy::blacklisted_name \ -A clippy::redundant_closure \ - -A clippy::upper_case_acronyms \ -A clippy::field_reassign_with_default \ -A clippy::wrong_self_convention \ -A 
clippy::needless_range_loop \ @@ -33,6 +32,7 @@ CLIPPY_LINTS=(-A clippy::module_inception \ -A clippy::enum_variant_names \ -W clippy::dbg_macro \ -W clippy::todo \ + -D clippy::upper_case_acronyms \ -D clippy::disallowed-methods \ -D rust-2018-idioms) diff --git a/src/config.rs b/src/config.rs index 0fe367c1349..6c345b8b773 100644 --- a/src/config.rs +++ b/src/config.rs @@ -31,16 +31,15 @@ use engine_rocks::{ PrepopulateBlockCache, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, - RaftDBLogger, RangePropertiesCollectorFactory, RocksCfOptions, RocksDBOptions, RocksEngine, - RocksEventListener, RocksTitanDBOptions, RocksdbLogger, TtlPropertiesCollectorFactory, + RaftDbLogger, RangePropertiesCollectorFactory, RocksCfOptions, RocksDbOptions, RocksEngine, + RocksEventListener, RocksTitanDbOptions, RocksdbLogger, TtlPropertiesCollectorFactory, DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as _, DBOptionsExt, - TabletAccessor, TabletErrorCollector, TitanDBOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt, TabletAccessor, + TabletErrorCollector, TitanDbOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use keys::region_raft_prefix_len; use kvproto::kvrpcpb::ApiVersion; use online_config::{ConfigChange, ConfigManager, ConfigValue, OnlineConfig, Result as CfgResult}; @@ -179,8 +178,8 @@ impl Default for TitanCfConfig { } impl TitanCfConfig { - fn build_opts(&self) -> RocksTitanDBOptions { - let mut opts = RocksTitanDBOptions::new(); + fn build_opts(&self) -> RocksTitanDbOptions { + let mut opts = RocksTitanDbOptions::new(); opts.set_min_blob_size(self.min_blob_size.0 as u64); opts.set_blob_file_compression(self.blob_file_compression.into()); 
opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); @@ -972,7 +971,7 @@ impl RaftCfConfig { #[serde(rename_all = "kebab-case")] // Note that Titan is still an experimental feature. Once enabled, it can't fall // back. Forced fallback may result in data loss. -pub struct TitanDBConfig { +pub struct TitanDbConfig { pub enabled: bool, pub dirname: String, pub disable_gc: bool, @@ -981,7 +980,7 @@ pub struct TitanDBConfig { pub purge_obsolete_files_period: ReadableDuration, } -impl Default for TitanDBConfig { +impl Default for TitanDbConfig { fn default() -> Self { Self { enabled: false, @@ -993,9 +992,9 @@ impl Default for TitanDBConfig { } } -impl TitanDBConfig { - fn build_opts(&self) -> RocksTitanDBOptions { - let mut opts = RocksTitanDBOptions::new(); +impl TitanDbConfig { + fn build_opts(&self) -> RocksTitanDbOptions { + let mut opts = RocksTitanDbOptions::new(); opts.set_dirname(&self.dirname); opts.set_disable_background_gc(self.disable_gc); opts.set_max_background_gc(self.max_background_gc); @@ -1082,13 +1081,13 @@ pub struct DbConfig { #[online_config(submodule)] pub raftcf: RaftCfConfig, #[online_config(skip)] - pub titan: TitanDBConfig, + pub titan: TitanDbConfig, } impl Default for DbConfig { fn default() -> DbConfig { let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); - let titan_config = TitanDBConfig { + let titan_config = TitanDbConfig { max_background_gc: bg_job_limits.max_titan_background_gc as i32, ..Default::default() }; @@ -1134,8 +1133,8 @@ impl Default for DbConfig { } impl DbConfig { - pub fn build_opt(&self) -> RocksDBOptions { - let mut opts = RocksDBOptions::default(); + pub fn build_opt(&self) -> RocksDbOptions { + let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { opts.set_wal_dir(&self.wal_dir); @@ -1392,13 +1391,13 @@ pub struct RaftDbConfig { #[online_config(submodule)] pub defaultcf: RaftDefaultCfConfig, 
#[online_config(skip)] - pub titan: TitanDBConfig, + pub titan: TitanDbConfig, } impl Default for RaftDbConfig { fn default() -> RaftDbConfig { let bg_job_limits = get_background_job_limits(&RAFTDB_DEFAULT_BACKGROUND_JOB_LIMITS); - let titan_config = TitanDBConfig { + let titan_config = TitanDbConfig { max_background_gc: bg_job_limits.max_titan_background_gc as i32, ..Default::default() }; @@ -1436,8 +1435,8 @@ impl Default for RaftDbConfig { } impl RaftDbConfig { - pub fn build_opt(&self) -> RocksDBOptions { - let mut opts = RocksDBOptions::default(); + pub fn build_opt(&self) -> RocksDbOptions { + let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { opts.set_wal_dir(&self.wal_dir); @@ -1457,7 +1456,7 @@ impl RaftDbConfig { opts.set_max_log_file_size(self.info_log_max_size.0); opts.set_log_file_time_to_roll(self.info_log_roll_time.as_secs()); opts.set_keep_log_file_num(self.info_log_keep_log_file_num); - opts.set_info_log(RaftDBLogger::default()); + opts.set_info_log(RaftDbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); opts.set_max_subcompactions(self.max_sub_compactions); opts.set_writable_file_max_buffer_size(self.writable_file_max_buffer_size.0 as i32); @@ -1536,20 +1535,20 @@ impl RaftEngineConfig { } #[derive(Clone, Copy, Debug)] -pub enum DBType { +pub enum DbType { Kv, Raft, } -pub struct DBConfigManger> { +pub struct DbConfigManger> { tablet_accessor: Arc, - db_type: DBType, + db_type: DbType, shared_block_cache: bool, } -impl> DBConfigManger { - pub fn new(tablet_accessor: Arc, db_type: DBType, shared_block_cache: bool) -> Self { - DBConfigManger { +impl> DbConfigManger { + pub fn new(tablet_accessor: Arc, db_type: DbType, shared_block_cache: bool) -> Self { + DbConfigManger { tablet_accessor, db_type, shared_block_cache, @@ -1681,17 +1680,17 @@ impl> DBConfigManger { fn validate_cf(&self, cf: &str) -> Result<(), Box> { match (self.db_type, cf) { - 
(DBType::Kv, CF_DEFAULT) - | (DBType::Kv, CF_WRITE) - | (DBType::Kv, CF_LOCK) - | (DBType::Kv, CF_RAFT) - | (DBType::Raft, CF_DEFAULT) => Ok(()), + (DbType::Kv, CF_DEFAULT) + | (DbType::Kv, CF_WRITE) + | (DbType::Kv, CF_LOCK) + | (DbType::Kv, CF_RAFT) + | (DbType::Raft, CF_DEFAULT) => Ok(()), _ => Err(format!("invalid cf {:?} for db {:?}", cf, self.db_type).into()), } } } -impl + Send + Sync> ConfigManager for DBConfigManger { +impl + Send + Sync> ConfigManager for DbConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); @@ -3556,7 +3555,7 @@ impl TiKvConfig { pub fn build_shared_rocks_env( &self, key_manager: Option>, - limiter: Option>, + limiter: Option>, ) -> Result, String> { let env = get_env(key_manager, limiter)?; if !self.raft_engine.enable { @@ -4057,9 +4056,7 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; - use engine_traits::{ - ColumnFamilyOptions as ColumnFamilyOptionsTrait, DBOptions as DBOptionsTrait, DummyFactory, - }; + use engine_traits::{CfOptions as _, DbOptions as _, DummyFactory}; use futures::executor::block_on; use grpcio::ResourceQuota; use itertools::Itertools; @@ -4379,7 +4376,7 @@ mod tests { incoming.coprocessor.region_split_keys = Some(10000); incoming.gc.max_write_bytes_per_sec = ReadableSize::mb(100); incoming.rocksdb.defaultcf.block_cache_size = ReadableSize::mb(500); - incoming.storage.io_rate_limit.import_priority = file_system::IOPriority::High; + incoming.storage.io_rate_limit.import_priority = file_system::IoPriority::High; let diff = old.diff(&incoming); let mut change = HashMap::new(); change.insert( @@ -4505,9 +4502,9 @@ mod tests { let (shared, cfg_controller) = (cfg.storage.block_cache.shared, ConfigController::new(cfg)); cfg_controller.register( Module::Rocksdb, - Box::new(DBConfigManger::new( + Box::new(DbConfigManger::new( Arc::new(engine.clone()), - 
DBType::Kv, + DbType::Kv, shared, )), ); diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index d0b9d7c381a..8b3f561ce5f 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -65,7 +65,7 @@ impl DagHandlerBuilder { pub fn build(self) -> Result> { COPR_DAG_REQ_COUNT.with_label_values(&["batch"]).inc(); - Ok(BatchDAGHandler::new( + Ok(BatchDagHandler::new( self.req, self.ranges, self.store, @@ -81,12 +81,12 @@ impl DagHandlerBuilder { } } -pub struct BatchDAGHandler { +pub struct BatchDagHandler { runner: tidb_query_executors::runner::BatchExecutorsRunner, data_version: Option, } -impl BatchDAGHandler { +impl BatchDagHandler { pub fn new( req: DagRequest, ranges: Vec, @@ -116,7 +116,7 @@ impl BatchDAGHandler { } #[async_trait] -impl RequestHandler for BatchDAGHandler { +impl RequestHandler for BatchDagHandler { async fn handle_request(&mut self) -> Result> { let result = self.runner.handle_request().await; handle_qe_response(result, self.runner.can_be_cached(), self.data_version).map(|x| x.into()) diff --git a/src/coprocessor/dag/storage_impl.rs b/src/coprocessor/dag/storage_impl.rs index 46dcf7f570e..7f5e60081e7 100644 --- a/src/coprocessor/dag/storage_impl.rs +++ b/src/coprocessor/dag/storage_impl.rs @@ -1,7 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
use tidb_query_common::storage::{ - IntervalRange, OwnedKvPair, PointRange, Result as QEResult, Storage, + IntervalRange, OwnedKvPair, PointRange, Result as QeResult, Storage, }; use txn_types::Key; @@ -41,7 +41,7 @@ impl Storage for TiKvStorage { is_backward_scan: bool, is_key_only: bool, range: IntervalRange, - ) -> QEResult<()> { + ) -> QeResult<()> { if let Some(scanner) = &mut self.scanner { self.cf_stats_backlog.add(&scanner.take_statistics()); if scanner.met_newer_ts_data() == NewerTsCheckState::Met { @@ -67,14 +67,14 @@ impl Storage for TiKvStorage { Ok(()) } - fn scan_next(&mut self) -> QEResult> { + fn scan_next(&mut self) -> QeResult> { // Unwrap is fine because we must have called `reset_range` before calling // `scan_next`. let kv = self.scanner.as_mut().unwrap().next().map_err(Error::from)?; Ok(kv.map(|(k, v)| (k.into_raw().unwrap(), v))) } - fn get(&mut self, _is_key_only: bool, range: PointRange) -> QEResult> { + fn get(&mut self, _is_key_only: bool, range: PointRange) -> QeResult> { // TODO: Default CF does not need to be accessed if KeyOnly. // TODO: No need to check newer ts data if self.scanner has met newer ts data. let key = range.0; diff --git a/src/coprocessor/readpool_impl.rs b/src/coprocessor/readpool_impl.rs index b47ee388f22..45f6b9bcc73 100644 --- a/src/coprocessor/readpool_impl.rs +++ b/src/coprocessor/readpool_impl.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, Mutex}; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use tikv_util::yatp_pool::{Config, DefaultTicker, FuturePool, PoolTicker, YatpPoolBuilder}; use super::metrics::*; @@ -45,7 +45,7 @@ pub fn build_read_pool( .name_prefix(name) .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); }) .before_stop(move || unsafe { // Safety: we call `set_` and `destroy_` with the same engine type. 
@@ -71,7 +71,7 @@ pub fn build_read_pool_for_test( .config(config) .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); }) // Safety: we call `set_` and `destroy_` with the same engine type. .before_stop(|| unsafe { destroy_tls_engine::() }) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 36089e41fd1..fea333903a6 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -9,7 +9,7 @@ use std::{ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use futures::{ executor::{ThreadPool, ThreadPoolBuilder}, future::join_all, @@ -87,7 +87,7 @@ where .after_start_wrapper(move || { tikv_util::thread_group::set_properties(props.clone()); tikv_alloc::add_thread_memory_accessor(); - set_io_type(IOType::Import); + set_io_type(IoType::Import); }) .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .create() @@ -587,7 +587,7 @@ where /// /// If the ingestion fails because the region is not found or the epoch does /// not match, the remaining files will eventually be cleaned up by - /// CleanupSSTWorker. + /// CleanupSstWorker. 
fn ingest( &mut self, ctx: RpcContext<'_>, diff --git a/src/read_pool.rs b/src/read_pool.rs index 7409c9a4b6e..9c413de60a7 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -5,7 +5,7 @@ use std::{ sync::{mpsc::SyncSender, Arc, Mutex}, }; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use futures::{channel::oneshot, future::TryFutureExt}; use kvproto::kvrpcpb::CommandPri; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; @@ -261,7 +261,7 @@ pub fn build_yatp_read_pool( .after_start(move || { let engine = raftkv.lock().unwrap().clone(); set_tls_engine(engine); - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); }) .before_stop(|| unsafe { destroy_tls_engine::(); diff --git a/src/server/debug.rs b/src/server/debug.rs index 03630cf930a..933f4308245 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -20,7 +20,7 @@ use engine_traits::{ CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ - debugpb::{self, Db as DBType}, + debugpb::{self, Db as DbType}, metapb::{PeerRole, Region}, raft_serverpb::*, }; @@ -166,15 +166,15 @@ impl Debugger { Ok(regions) } - fn get_db_from_type(&self, db: DBType) -> Result<&RocksEngine> { + fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { match db { - DBType::Kv => Ok(&self.engines.kv), - DBType::Raft => Err(box_err!("Get raft db is not allowed")), - _ => Err(box_err!("invalid DBType type")), + DbType::Kv => Ok(&self.engines.kv), + DbType::Raft => Err(box_err!("Get raft db is not allowed")), + _ => Err(box_err!("invalid DB type")), } } - pub fn get(&self, db: DBType, cf: &str, key: &[u8]) -> Result> { + pub fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { validate_db_and_cf(db, cf)?; let db = self.get_db_from_type(db)?; match db.get_value_cf(cf, key) { @@ -313,7 +313,7 @@ impl Debugger { /// Compact the cf[start..end) in the db. 
pub fn compact( &self, - db: DBType, + db: DbType, cf: &str, start: &[u8], end: &[u8], @@ -1309,13 +1309,13 @@ fn region_overlap(r1: &Region, r2: &Region) -> bool { && (start_key_2 < end_key_1 || end_key_1.is_empty()) } -fn validate_db_and_cf(db: DBType, cf: &str) -> Result<()> { +fn validate_db_and_cf(db: DbType, cf: &str) -> Result<()> { match (db, cf) { - (DBType::Kv, CF_DEFAULT) - | (DBType::Kv, CF_WRITE) - | (DBType::Kv, CF_LOCK) - | (DBType::Kv, CF_RAFT) - | (DBType::Raft, CF_DEFAULT) => Ok(()), + (DbType::Kv, CF_DEFAULT) + | (DbType::Kv, CF_WRITE) + | (DbType::Kv, CF_LOCK) + | (DbType::Kv, CF_RAFT) + | (DbType::Raft, CF_DEFAULT) => Ok(()), _ => Err(Error::InvalidArgument(format!( "invalid cf {:?} for db {:?}", cf, db @@ -1383,7 +1383,7 @@ fn divide_db(db: &RocksEngine, parts: usize) -> raftstore::Result>> #[cfg(test)] mod tests { - use engine_rocks::{util::new_engine_opt, RocksCfOptions, RocksDBOptions, RocksEngine}; + use engine_rocks::{util::new_engine_opt, RocksCfOptions, RocksDbOptions, RocksEngine}; use engine_traits::{Mutable, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use kvproto::{ kvrpcpb::ApiVersion, @@ -1496,22 +1496,22 @@ mod tests { #[test] fn test_validate_db_and_cf() { let valid_cases = vec![ - (DBType::Kv, CF_DEFAULT), - (DBType::Kv, CF_WRITE), - (DBType::Kv, CF_LOCK), - (DBType::Kv, CF_RAFT), - (DBType::Raft, CF_DEFAULT), + (DbType::Kv, CF_DEFAULT), + (DbType::Kv, CF_WRITE), + (DbType::Kv, CF_LOCK), + (DbType::Kv, CF_RAFT), + (DbType::Raft, CF_DEFAULT), ]; for (db, cf) in valid_cases { validate_db_and_cf(db, cf).unwrap(); } let invalid_cases = vec![ - (DBType::Raft, CF_WRITE), - (DBType::Raft, CF_LOCK), - (DBType::Raft, CF_RAFT), - (DBType::Invalid, CF_DEFAULT), - (DBType::Invalid, "BAD_CF"), + (DbType::Raft, CF_WRITE), + (DbType::Raft, CF_LOCK), + (DbType::Raft, CF_RAFT), + (DbType::Invalid, CF_DEFAULT), + (DbType::Invalid, "BAD_CF"), ]; for (db, cf) in invalid_cases { validate_db_and_cf(db, cf).unwrap_err(); @@ -1558,10 
+1558,10 @@ mod tests { engine.put(k, v).unwrap(); assert_eq!(&*engine.get_value(k).unwrap().unwrap(), v); - let got = debugger.get(DBType::Kv, CF_DEFAULT, k).unwrap(); + let got = debugger.get(DbType::Kv, CF_DEFAULT, k).unwrap(); assert_eq!(&got, v); - match debugger.get(DBType::Kv, CF_DEFAULT, b"foo") { + match debugger.get(DbType::Kv, CF_DEFAULT, b"foo") { Err(Error::NotFound(_)) => (), _ => panic!("expect Error::NotFound(_)"), } @@ -2151,7 +2151,7 @@ mod tests { .iter() .map(|cf| (*cf, RocksCfOptions::default())) .collect(); - let db_opt = RocksDBOptions::default(); + let db_opt = RocksDbOptions::default(); db_opt.enable_multi_batch_write(true); let db = new_engine_opt(path_str, db_opt, cfs_opts).unwrap(); // Write initial KVs. diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 59315b4732d..4e2edc13569 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -11,8 +11,8 @@ use engine_rocks::{ RocksEventListener, }; use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions, CompactionJobInfo, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, CF_WRITE, + CfOptions, CfOptionsExt, CompactionJobInfo, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 80366cc17d1..b47fc34cf27 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -7,9 +7,7 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; -use engine_traits::{ - CFOptionsExt, ColumnFamilyOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, -}; +use engine_traits::{CfOptions, CfOptionsExt, Result, TabletAccessor, TabletFactory, CF_DEFAULT}; use crate::server::engine_factory::KvEngineFactory; diff --git a/src/server/gc_worker/applied_lock_collector.rs b/src/server/gc_worker/applied_lock_collector.rs index a013d742890..9d0e16f4286 100644 --- 
a/src/server/gc_worker/applied_lock_collector.rs +++ b/src/server/gc_worker/applied_lock_collector.rs @@ -20,7 +20,7 @@ use raftstore::coprocessor::{ use tikv_util::worker::{Builder as WorkerBuilder, Runnable, ScheduleError, Scheduler, Worker}; use txn_types::Key; -// TODO: Use new error type for GCWorker instead of storage::Error. +// TODO: Use new error type for GcWorker instead of storage::Error. use super::{Error, ErrorInner, Result}; use crate::storage::{ mvcc::{ErrorInner as MvccErrorInner, Lock, TimeStamp}, diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 165a1f62ddf..1c50b56bed1 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -23,7 +23,7 @@ use engine_rocks::{ use engine_traits::{ KvEngine, MiscExt, Mutable, MvccProperties, WriteBatch, WriteBatchExt, WriteOptions, }; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use pd_client::{Feature, FeatureGate}; use prometheus::{local::*, *}; use raftstore::coprocessor::RegionInfoProvider; @@ -466,7 +466,7 @@ impl WriteCompactionFilter { wb: &RocksWriteBatchVec, wopts: &WriteOptions, ) -> Result<(), engine_traits::Error> { - let _io_type_guard = WithIOType::new(IOType::Gc); + let _io_type_guard = WithIoType::new(IoType::Gc); fail_point!("write_compaction_filter_flush_write_batch", true, |_| { Err(engine_traits::Error::Engine( engine_traits::Status::with_error( @@ -725,7 +725,7 @@ pub mod test_utils { // Put a new key-value pair to ensure compaction can be triggered correctly. engine.delete_cf("write", b"znot-exists-key").unwrap(); - TestGCRunner::new(safe_point).gc(&engine); + TestGcRunner::new(safe_point).gc(&engine); } lazy_static! 
{ @@ -740,7 +740,7 @@ pub mod test_utils { compact_opts } - pub struct TestGCRunner<'a> { + pub struct TestGcRunner<'a> { pub safe_point: u64, pub ratio_threshold: Option, pub start: Option<&'a [u8]>, @@ -751,11 +751,11 @@ pub mod test_utils { pub(super) callbacks_on_drop: Vec>, } - impl<'a> TestGCRunner<'a> { + impl<'a> TestGcRunner<'a> { pub fn new(safe_point: u64) -> Self { let (gc_scheduler, gc_receiver) = dummy_scheduler(); - TestGCRunner { + TestGcRunner { safe_point, ratio_threshold: None, start: None, @@ -768,7 +768,7 @@ pub mod test_utils { } } - impl<'a> TestGCRunner<'a> { + impl<'a> TestGcRunner<'a> { pub fn safe_point(&mut self, sp: u64) -> &mut Self { self.safe_point = sp; self @@ -915,7 +915,7 @@ pub mod tests { let engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); let value = vec![b'v'; 512]; - let mut gc_runner = TestGCRunner::new(0); + let mut gc_runner = TestGcRunner::new(0); // GC can't delete keys after the given safe point. must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); @@ -948,7 +948,7 @@ pub mod tests { let value = vec![b'v'; 512]; let engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); - let mut gc_runner = TestGCRunner::new(0); + let mut gc_runner = TestGcRunner::new(0); let mut gc_and_check = |expect_tasks: bool, prefix: &[u8]| { gc_runner.safe_point(500).gc(&raw_engine); @@ -1018,7 +1018,7 @@ pub mod tests { let engine = builder.build_with_cfg(&cfg).unwrap(); let raw_engine = engine.get_rocksdb(); let value = vec![b'v'; 512]; - let mut gc_runner = TestGCRunner::new(0); + let mut gc_runner = TestGcRunner::new(0); for start_ts in &[100, 110, 120, 130] { must_prewrite_put(&engine, b"zkey", &value, b"zkey", *start_ts); @@ -1084,7 +1084,7 @@ pub mod tests { let builder = TestEngineBuilder::new().path(dir.path()); let engine = builder.build_with_cfg(&cfg).unwrap(); let raw_engine = engine.get_rocksdb(); - let mut gc_runner = TestGCRunner::new(0); + let 
mut gc_runner = TestGcRunner::new(0); // So the construction of SST files will be: // L6: |key_110| diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index 7fdc440527f..bcfe87d6783 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -331,7 +331,7 @@ impl GcMan } } - /// Sets the initial state of the `GCManger`. + /// Sets the initial state of the `GcManger`. /// The only task of initializing is to simply get the current safe point as /// the initial value of `safe_point`. TiKV won't do any GC /// automatically until the first time `safe_point` was updated to a diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index fe409be3ae4..7e695430d10 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -19,7 +19,7 @@ use engine_traits::{ raw_ttl::ttl_current_ts, DeleteStrategy, Error as EngineError, KvEngine, MiscExt, Range, WriteBatch, WriteOptions, CF_DEFAULT, CF_LOCK, CF_WRITE, }; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use futures::executor::block_on; use kvproto::{ kvrpcpb::{Context, LockInfo}, @@ -767,7 +767,7 @@ where #[inline] fn run(&mut self, task: GcTask) { - let _io_type_guard = WithIOType::new(IOType::Gc); + let _io_type_guard = WithIoType::new(IoType::Gc); let enum_label = task.get_enum_label(); GC_GCTASK_COUNTER_STATIC.get(enum_label).inc(); @@ -1293,10 +1293,10 @@ mod tests { /// A wrapper of engine that adds the 'z' prefix to keys internally. /// For test engines, they writes keys into db directly, but in production a - /// 'z' prefix will be added to keys by raftstore layer before writing to - /// db. Some functionalities of `GCWorker` bypasses Raft layer, so they - /// needs to know how data is actually represented in db. This wrapper - /// allows test engines write 'z'-prefixed keys to db. + /// 'z' prefix will be added to keys by raftstore layer before writing + /// to db. 
Some functionalities of `GcWorker` bypasses Raft layer, so + /// they needs to know how data is actually represented in db. This + /// wrapper allows test engines write 'z'-prefixed keys to db. #[derive(Clone)] struct PrefixedEngine(kv::RocksEngine); diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index 20de36ef035..d6114a5875c 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -7,9 +7,9 @@ mod gc_manager; mod gc_worker; mod rawkv_compaction_filter; -// TODO: Use separated error type for GCWorker instead. +// TODO: Use separated error type for GcWorker instead. #[cfg(any(test, feature = "failpoints"))] -pub use compaction_filter::test_utils::{gc_by_compact, TestGCRunner}; +pub use compaction_filter::test_utils::{gc_by_compact, TestGcRunner}; pub use compaction_filter::WriteCompactionFilterFactory; pub use config::{GcConfig, GcWorkerConfigManager, DEFAULT_GC_BATCH_KEYS}; use engine_traits::MvccProperties; diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index 3ed206408e4..49758f5793b 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -201,7 +201,7 @@ impl RawCompactionFilter { self.versions += 1; let raw_value = ApiV2::decode_raw_value(value)?; // If it's the latest version, and it's deleted or expired, it needs to be sent - // to GCWorker to be processed asynchronously. + // to GcWorker to be processed asynchronously. 
if !raw_value.is_valid(self.current_ts) { self.raw_handle_delete(); if self.mvcc_deletions.len() >= DEFAULT_DELETE_BATCH_COUNT { @@ -314,7 +314,7 @@ pub mod tests { use super::*; use crate::{ - config::DbConfig, server::gc_worker::TestGCRunner, storage::kv::TestEngineBuilder, + config::DbConfig, server::gc_worker::TestGcRunner, storage::kv::TestEngineBuilder, }; pub fn make_key(key: &[u8], ts: u64) -> Vec { @@ -334,7 +334,7 @@ pub mod tests { .build_with_cfg(&cfg) .unwrap(); let raw_engine = engine.get_rocksdb(); - let mut gc_runner = TestGCRunner::new(0); + let mut gc_runner = TestGcRunner::new(0); let user_key = b"r\0aaaaaaaaaaa"; @@ -399,7 +399,7 @@ pub mod tests { .build() .unwrap(); let raw_engine = engine.get_rocksdb(); - let mut gc_runner = TestGCRunner::new(0); + let mut gc_runner = TestGcRunner::new(0); let mut gc_and_check = |expect_tasks: bool, prefix: &[u8]| { gc_runner.safe_point(500).gc_raw(&raw_engine); diff --git a/src/server/snap.rs b/src/server/snap.rs index f451b6b70e9..b785c455921 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -13,7 +13,7 @@ use std::{ }; use engine_traits::KvEngine; -use file_system::{IOType, WithIOType}; +use file_system::{IoType, WithIoType}; use futures::{ future::{Future, TryFutureExt}, sink::SinkExt, @@ -206,7 +206,7 @@ struct RecvSnapContext { key: SnapKey, file: Option>, raft_msg: RaftMessage, - io_type: IOType, + io_type: IoType, } impl RecvSnapContext { @@ -227,11 +227,11 @@ impl RecvSnapContext { let mut snapshot = RaftSnapshotData::default(); snapshot.merge_from_bytes(data)?; let io_type = if snapshot.get_meta().get_for_balance() { - IOType::LoadBalance + IoType::LoadBalance } else { - IOType::Replication + IoType::Replication }; - let _with_io_type = WithIOType::new(io_type); + let _with_io_type = WithIoType::new(io_type); let snap = { let s = match snap_mgr.get_snapshot_for_receiving(&key, data) { @@ -257,7 +257,7 @@ impl RecvSnapContext { } fn finish>(self, raft_router: R) -> Result<()> { - let 
_with_io_type = WithIOType::new(self.io_type); + let _with_io_type = WithIoType::new(self.io_type); let key = self.key; if let Some(mut file) = self.file { info!("saving snapshot file"; "snap_key" => %key, "file" => file.path()); @@ -300,7 +300,7 @@ fn recv_snap + 'static>( return Err(box_err!("{} receive chunk with empty data", context.key)); } let f = context.file.as_mut().unwrap(); - let _with_io_type = WithIOType::new(context.io_type); + let _with_io_type = WithIoType::new(context.io_type); if let Err(e) = Write::write_all(&mut *f, &data) { let key = &context.key; let path = context.file.as_mut().unwrap().path(); diff --git a/src/storage/config.rs b/src/storage/config.rs index 2a5ac4840e0..9a359310178 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -5,7 +5,7 @@ use std::{cmp::max, error::Error}; use engine_rocks::raw::{Cache, LRUCacheOptions, MemoryAllocator}; -use file_system::{IOPriority, IORateLimitMode, IORateLimiter, IOType}; +use file_system::{IoPriority, IoRateLimitMode, IoRateLimiter, IoType}; use kvproto::kvrpcpb::ApiVersion; use libc::c_int; use online_config::OnlineConfig; @@ -64,7 +64,7 @@ pub struct Config { #[online_config(submodule)] pub block_cache: BlockCacheConfig, #[online_config(submodule)] - pub io_rate_limit: IORateLimitConfig, + pub io_rate_limit: IoRateLimitConfig, } impl Default for Config { @@ -88,7 +88,7 @@ impl Default for Config { ttl_check_poll_interval: ReadableDuration::hours(12), flow_control: FlowControlConfig::default(), block_cache: BlockCacheConfig::default(), - io_rate_limit: IORateLimitConfig::default(), + io_rate_limit: IoRateLimitConfig::default(), background_error_recovery_window: ReadableDuration::hours(1), } } @@ -278,82 +278,82 @@ impl BlockCacheConfig { #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] -pub struct IORateLimitConfig { +pub struct IoRateLimitConfig { pub max_bytes_per_sec: ReadableSize, 
#[online_config(skip)] - pub mode: IORateLimitMode, + pub mode: IoRateLimitMode, /// When this flag is off, high-priority IOs are counted but not limited. - /// Default set to false because the optimal throughput target provided by - /// user might not be the maximum available bandwidth. For multi-tenancy - /// use case, this flag should be turned on. + /// Default set to false because the optimal throughput target provided + /// by user might not be the maximum available bandwidth. For + /// multi-tenancy use case, this flag should be turned on. #[online_config(skip)] pub strict: bool, - pub foreground_read_priority: IOPriority, - pub foreground_write_priority: IOPriority, - pub flush_priority: IOPriority, - pub level_zero_compaction_priority: IOPriority, - pub compaction_priority: IOPriority, - pub replication_priority: IOPriority, - pub load_balance_priority: IOPriority, - pub gc_priority: IOPriority, - pub import_priority: IOPriority, - pub export_priority: IOPriority, - pub other_priority: IOPriority, + pub foreground_read_priority: IoPriority, + pub foreground_write_priority: IoPriority, + pub flush_priority: IoPriority, + pub level_zero_compaction_priority: IoPriority, + pub compaction_priority: IoPriority, + pub replication_priority: IoPriority, + pub load_balance_priority: IoPriority, + pub gc_priority: IoPriority, + pub import_priority: IoPriority, + pub export_priority: IoPriority, + pub other_priority: IoPriority, } -impl Default for IORateLimitConfig { - fn default() -> IORateLimitConfig { - IORateLimitConfig { +impl Default for IoRateLimitConfig { + fn default() -> IoRateLimitConfig { + IoRateLimitConfig { max_bytes_per_sec: ReadableSize::mb(0), - mode: IORateLimitMode::WriteOnly, + mode: IoRateLimitMode::WriteOnly, strict: false, - foreground_read_priority: IOPriority::High, - foreground_write_priority: IOPriority::High, - flush_priority: IOPriority::High, - level_zero_compaction_priority: IOPriority::Medium, - compaction_priority: IOPriority::Low, - 
replication_priority: IOPriority::High, - load_balance_priority: IOPriority::High, - gc_priority: IOPriority::High, - import_priority: IOPriority::Medium, - export_priority: IOPriority::Medium, - other_priority: IOPriority::High, + foreground_read_priority: IoPriority::High, + foreground_write_priority: IoPriority::High, + flush_priority: IoPriority::High, + level_zero_compaction_priority: IoPriority::Medium, + compaction_priority: IoPriority::Low, + replication_priority: IoPriority::High, + load_balance_priority: IoPriority::High, + gc_priority: IoPriority::High, + import_priority: IoPriority::Medium, + export_priority: IoPriority::Medium, + other_priority: IoPriority::High, } } } -impl IORateLimitConfig { - pub fn build(&self, enable_statistics: bool) -> IORateLimiter { - let limiter = IORateLimiter::new(self.mode, self.strict, enable_statistics); +impl IoRateLimitConfig { + pub fn build(&self, enable_statistics: bool) -> IoRateLimiter { + let limiter = IoRateLimiter::new(self.mode, self.strict, enable_statistics); limiter.set_io_rate_limit(self.max_bytes_per_sec.0 as usize); - limiter.set_io_priority(IOType::ForegroundRead, self.foreground_read_priority); - limiter.set_io_priority(IOType::ForegroundWrite, self.foreground_write_priority); - limiter.set_io_priority(IOType::Flush, self.flush_priority); + limiter.set_io_priority(IoType::ForegroundRead, self.foreground_read_priority); + limiter.set_io_priority(IoType::ForegroundWrite, self.foreground_write_priority); + limiter.set_io_priority(IoType::Flush, self.flush_priority); limiter.set_io_priority( - IOType::LevelZeroCompaction, + IoType::LevelZeroCompaction, self.level_zero_compaction_priority, ); - limiter.set_io_priority(IOType::Compaction, self.compaction_priority); - limiter.set_io_priority(IOType::Replication, self.replication_priority); - limiter.set_io_priority(IOType::LoadBalance, self.load_balance_priority); - limiter.set_io_priority(IOType::Gc, self.gc_priority); - 
limiter.set_io_priority(IOType::Import, self.import_priority); - limiter.set_io_priority(IOType::Export, self.export_priority); - limiter.set_io_priority(IOType::Other, self.other_priority); + limiter.set_io_priority(IoType::Compaction, self.compaction_priority); + limiter.set_io_priority(IoType::Replication, self.replication_priority); + limiter.set_io_priority(IoType::LoadBalance, self.load_balance_priority); + limiter.set_io_priority(IoType::Gc, self.gc_priority); + limiter.set_io_priority(IoType::Import, self.import_priority); + limiter.set_io_priority(IoType::Export, self.export_priority); + limiter.set_io_priority(IoType::Other, self.other_priority); limiter } fn validate(&mut self) -> Result<(), Box> { - if self.other_priority != IOPriority::High { + if self.other_priority != IoPriority::High { warn!( "Occasionally some critical IO operations are tagged as IOType::Other, \ e.g. IOs are fired from unmanaged threads, thread-local type storage exceeds \ capacity. To be on the safe side, change priority for IOType::Other from \ {:?} to {:?}", self.other_priority, - IOPriority::High + IoPriority::High ); - self.other_priority = IOPriority::High; + self.other_priority = IoPriority::High; } if self.gc_priority != self.foreground_write_priority { warn!( @@ -363,7 +363,7 @@ impl IORateLimitConfig { ); self.gc_priority = self.foreground_write_priority; } - if self.mode != IORateLimitMode::WriteOnly { + if self.mode != IoRateLimitMode::WriteOnly { return Err( "storage.io-rate-limit.mode other than write-only is not supported.".into(), ); diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index d3d051ac5f9..8bc92a7f697 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -4,8 +4,8 @@ use std::{convert::TryInto, sync::Arc}; -use engine_traits::{CFNamesExt, CFOptionsExt, TabletFactory, CF_DEFAULT}; -use file_system::{get_io_rate_limiter, IOPriority, IOType}; +use engine_traits::{CfNamesExt, CfOptionsExt, TabletFactory, 
CF_DEFAULT}; +use file_system::{get_io_rate_limiter, IoPriority, IoType}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use strum::IntoEnumIterator; use tikv_kv::Engine; @@ -98,10 +98,10 @@ impl ConfigManager for StorageConfigManger { limiter.set_io_rate_limit(limit.0 as usize); } - for t in IOType::iter() { + for t in IoType::iter() { if let Some(priority) = io_rate_limit.remove(&(t.as_str().to_owned() + "_priority")) { - let priority: IOPriority = priority.try_into()?; + let priority: IoPriority = priority.try_into()?; limiter.set_io_priority(t, priority); } } diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index 0867c30fb31..b1b727f898c 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -8,7 +8,7 @@ use std::{ use causal_ts::tests::DummyRawTsTracker; use engine_rocks::RocksCfOptions; use engine_traits::{CfName, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use file_system::IORateLimiter; +use file_system::IoRateLimiter; use kvproto::kvrpcpb::ApiVersion; use tikv_util::config::ReadableSize; @@ -27,7 +27,7 @@ const TEMP_DIR: &str = ""; pub struct TestEngineBuilder { path: Option, cfs: Option>, - io_rate_limiter: Option>, + io_rate_limiter: Option>, api_version: ApiVersion, } @@ -62,7 +62,7 @@ impl TestEngineBuilder { self } - pub fn io_rate_limiter(mut self, limiter: Option>) -> Self { + pub fn io_rate_limiter(mut self, limiter: Option>) -> Self { self.io_rate_limiter = limiter; self } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 6338525ab02..620bca80b32 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3152,7 +3152,7 @@ mod tests { *, }; use crate::{ - config::TitanDBConfig, + config::TitanDbConfig, coprocessor::checksum_crc64_xor, storage::{ config::BlockCacheConfig, @@ -3668,7 +3668,7 @@ mod tests { #[test] fn test_scan_with_key_only() { let db_config = crate::config::DbConfig { - titan: TitanDBConfig { + 
titan: TitanDbConfig { enabled: true, ..Default::default() }, diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index d715ec598c2..c27b96840d0 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -481,7 +481,7 @@ mod tests { #[test] fn test_mvcc_info_collector() { - use engine_test::ctor::{ColumnFamilyOptions, DBOptions}; + use engine_test::ctor::{CfOptions, DbOptions}; use engine_traits::SyncMutable; use txn_types::TimeStamp; @@ -494,12 +494,12 @@ mod tests { let path = tmp.path().to_str().unwrap(); let engine = engine_test::kv::new_engine_opt( path, - DBOptions::default(), + DbOptions::default(), vec![ - (CF_DEFAULT, ColumnFamilyOptions::new()), - (CF_WRITE, ColumnFamilyOptions::new()), - (CF_LOCK, ColumnFamilyOptions::new()), - (CF_RAFT, ColumnFamilyOptions::new()), + (CF_DEFAULT, CfOptions::new()), + (CF_WRITE, CfOptions::new()), + (CF_LOCK, CfOptions::new()), + (CF_RAFT, CfOptions::new()), ], ) .unwrap(); diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 377d2c94022..c45fabe2540 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -632,7 +632,7 @@ pub mod tests { use concurrency_manager::ConcurrencyManager; use engine_rocks::{ - properties::MvccPropertiesCollectorFactory, RocksCfOptions, RocksDBOptions, RocksEngine, + properties::MvccPropertiesCollectorFactory, RocksCfOptions, RocksDbOptions, RocksEngine, RocksSnapshot, }; use engine_traits::{ @@ -900,7 +900,7 @@ pub mod tests { } pub fn open_db(path: &str, with_properties: bool) -> RocksEngine { - let db_opt = RocksDBOptions::default(); + let db_opt = RocksDbOptions::default(); let mut cf_opts = RocksCfOptions::default(); cf_opts.set_write_buffer_size(32 * 1024 * 1024); if with_properties { diff --git a/src/storage/read_pool.rs b/src/storage/read_pool.rs index c25ae15d46b..a0aee5a185f 100644 --- a/src/storage/read_pool.rs +++ 
b/src/storage/read_pool.rs @@ -5,7 +5,7 @@ use std::sync::{Arc, Mutex}; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use tikv_util::yatp_pool::{Config, DefaultTicker, FuturePool, PoolTicker, YatpPoolBuilder}; use crate::{ @@ -49,7 +49,7 @@ pub fn build_read_pool( .config(config) .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); }) .before_stop(move || unsafe { // Safety: we call `set_` and `destroy_` with the same engine type. @@ -79,7 +79,7 @@ pub fn build_read_pool_for_test( .name_prefix(name) .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IOType::ForegroundRead); + set_io_type(IoType::ForegroundRead); }) // Safety: we call `set_` and `destroy_` with the same engine type. .before_stop(|| unsafe { destroy_tls_engine::() }) diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 40bb50a88c8..8cb901187dd 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -17,7 +17,7 @@ use std::{ use collections::HashMap; use engine_rocks::FlowInfo; -use engine_traits::{CFNamesExt, FlowControlFactorsExt}; +use engine_traits::{CfNamesExt, FlowControlFactorsExt}; use getset::{CopyGetters, Setters}; use num_traits::cast::{AsPrimitive, FromPrimitive}; use rand::Rng; @@ -115,7 +115,7 @@ impl EngineFlowController { } } - pub fn new( + pub fn new( config: &FlowControlConfig, engine: E, flow_info_receiver: Receiver, @@ -443,7 +443,7 @@ impl Default for CfFlowChecker { } #[derive(CopyGetters, Setters)] -pub(super) struct FlowChecker { +pub(super) struct FlowChecker { pub soft_pending_compaction_bytes_limit: u64, hard_pending_compaction_bytes_limit: u64, memtables_threshold: u64, @@ -473,7 +473,7 @@ pub(super) struct FlowChecker FlowChecker { 
+impl FlowChecker { pub fn new( config: &FlowControlConfig, engine: E, @@ -994,7 +994,7 @@ pub(super) mod tests { use std::sync::atomic::AtomicU64; use engine_rocks::RocksCfOptions; - use engine_traits::{CFOptionsExt, Result}; + use engine_traits::{CfOptionsExt, Result}; use super::{super::FlowController, *}; @@ -1017,15 +1017,15 @@ pub(super) mod tests { } } - impl CFNamesExt for EngineStub { + impl CfNamesExt for EngineStub { fn cf_names(&self) -> Vec<&str> { vec!["default"] } } - impl CFOptionsExt for EngineStub { - type ColumnFamilyOptions = RocksCfOptions; - fn get_options_cf(&self, _cf: &str) -> Result { + impl CfOptionsExt for EngineStub { + type CfOptions = RocksCfOptions; + fn get_options_cf(&self, _cf: &str) -> Result { unimplemented!(); } diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 14819127389..a35517246c5 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -13,7 +13,7 @@ use std::{ use collections::HashMap; use engine_rocks::FlowInfo; -use engine_traits::{CFNamesExt, FlowControlFactorsExt, TabletFactory}; +use engine_traits::{CfNamesExt, FlowControlFactorsExt, TabletFactory}; use rand::Rng; use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; @@ -47,7 +47,7 @@ impl Drop for TabletFlowController { } impl TabletFlowController { - pub fn new( + pub fn new( config: &FlowControlConfig, tablet_factory: Arc + Send + Sync>, flow_info_receiver: Receiver, @@ -86,7 +86,7 @@ impl TabletFlowController { struct FlowInfoDispatcher; impl FlowInfoDispatcher { - fn start( + fn start( rx: Receiver, flow_info_receiver: Receiver, tablet_factory: Arc + Send + Sync>, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 5894efc3226..78a891b650e 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -7,7 +7,7 @@ use std::{ }; use 
collections::HashMap; -use file_system::{set_io_type, IOType}; +use file_system::{set_io_type, IoType}; use kvproto::pdpb::QueryKind; use prometheus::local::*; use raftstore::store::WriteStats; @@ -74,7 +74,7 @@ impl SchedPool { // the tls_engine invariants. .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IOType::ForegroundWrite); + set_io_type(IoType::ForegroundWrite); }) .before_stop(move || unsafe { // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. diff --git a/tests/benches/misc/writebatch/bench_writebatch.rs b/tests/benches/misc/writebatch/bench_writebatch.rs index cde64280184..f396976e3c1 100644 --- a/tests/benches/misc/writebatch/bench_writebatch.rs +++ b/tests/benches/misc/writebatch/bench_writebatch.rs @@ -1,6 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::{RocksCfOptions, RocksDBOptions, RocksEngine, RocksWriteBatchVec}; +use engine_rocks::{RocksCfOptions, RocksDbOptions, RocksEngine, RocksWriteBatchVec}; use engine_traits::{Mutable, WriteBatch, WriteBatchExt, CF_DEFAULT}; use tempfile::Builder; use test::Bencher; @@ -22,7 +22,7 @@ fn bench_writebatch_impl(b: &mut Bencher, batch_keys: usize) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = RocksDBOptions::default(); + let mut opts = RocksDbOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); @@ -111,7 +111,7 @@ fn bench_writebatch_without_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = RocksDBOptions::default(); + let mut opts = RocksDbOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); @@ -134,7 +134,7 @@ fn bench_writebatch_with_capacity(b: &mut Bencher) { .prefix("/tmp/rocksdb_write_batch_bench") .tempdir() .unwrap(); - let mut opts = RocksDBOptions::default(); 
+ let mut opts = RocksDbOptions::default(); opts.create_if_missing(true); opts.enable_unordered_write(false); opts.enable_pipelined_write(false); diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 818c7ba2739..10192db7bf0 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -219,7 +219,7 @@ fn test_paging_scan() { select_resp.merge_from_bytes(resp.get_data()).unwrap(); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(select_resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(exp) { let name_datum = name.unwrap().as_bytes().into(); let expected_encoded = datum::encode_value( @@ -293,7 +293,7 @@ fn test_paging_scan_multi_ranges() { select_resp.merge_from_bytes(resp.get_data()).unwrap(); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(select_resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(exp) { let name_datum = name.unwrap().as_bytes().into(); let expected_encoded = datum::encode_value( @@ -349,7 +349,7 @@ fn test_paging_scan_multi_ranges() { select_resp.merge_from_bytes(resp.get_data()).unwrap(); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(select_resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(exp) { let name_datum = name.unwrap().as_bytes().into(); let expected_encoded = datum::encode_value( diff --git a/tests/failpoints/cases/test_encryption.rs b/tests/failpoints/cases/test_encryption.rs index ccb4d698e3f..502e31afff9 100644 --- a/tests/failpoints/cases/test_encryption.rs +++ b/tests/failpoints/cases/test_encryption.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use encryption::{compat, FileDictionaryFile}; +use encryption::FileDictionaryFile; use kvproto::encryptionpb::{EncryptionMethod, FileInfo}; #[test] @@ -47,7 +47,7 @@ fn test_file_dict_file_record_corrupted() { fn create_file_info(id: u64, method: EncryptionMethod) -> FileInfo { FileInfo { key_id: id, - method: compat(method), + method, ..Default::default() } } diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index c4e3e4dee71..73031b10283 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -17,7 +17,7 @@ use raftstore::coprocessor::{ use test_raftstore::*; use tikv::{ server::gc_worker::{ - AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGCRunner, + AutoGcConfig, GcSafePointProvider, GcTask, Result as GcWorkerResult, TestGcRunner, }, storage::{ kv::TestEngineBuilder, @@ -329,7 +329,7 @@ fn test_error_in_compaction_filter() { let fp = "write_compaction_filter_flush_write_batch"; fail::cfg(fp, "return").unwrap(); - let mut gc_runner = TestGCRunner::new(200); + let mut gc_runner = TestGcRunner::new(200); gc_runner.gc(&raw_engine); match gc_runner.gc_receiver.recv().unwrap() { @@ -382,7 +382,7 @@ fn test_orphan_versions_from_compaction_filter() { let fp = "write_compaction_filter_flush_write_batch"; fail::cfg(fp, "return").unwrap(); - let mut gc_runner = TestGCRunner::new(100); + let mut gc_runner = TestGcRunner::new(100); gc_runner.gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); gc_runner.gc(&engine.kv); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 8c1be52be78..e8449624a0f 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -14,7 +14,7 @@ use engine_rocks::{ }, }; use engine_traits::PerfLevel; -use file_system::{IOPriority, IORateLimitMode}; +use file_system::{IoPriority, IoRateLimitMode}; use kvproto::encryptionpb::EncryptionMethod; use 
pd_client::Config as PdConfig; use raft_log_engine::{ReadableSize as RaftEngineReadableSize, RecoveryMode}; @@ -32,7 +32,7 @@ use tikv::{ lock_manager::Config as PessimisticTxnConfig, Config as ServerConfig, }, storage::config::{ - BlockCacheConfig, Config as StorageConfig, FlowControlConfig, IORateLimitConfig, + BlockCacheConfig, Config as StorageConfig, FlowControlConfig, IoRateLimitConfig, }, }; use tikv_util::config::{LogFormat, ReadableDuration, ReadableSize}; @@ -261,7 +261,7 @@ fn test_serde_custom_tikv_config() { max_sorted_runs: 100, gc_merge_rewrite: false, }; - let titan_db_config = TitanDBConfig { + let titan_db_config = TitanDbConfig { enabled: true, dirname: "bar".to_owned(), disable_gc: false, @@ -681,21 +681,21 @@ fn test_serde_custom_tikv_config() { high_pri_pool_ratio: 0.8, memory_allocator: Some(String::from("nodump")), }, - io_rate_limit: IORateLimitConfig { + io_rate_limit: IoRateLimitConfig { max_bytes_per_sec: ReadableSize::mb(1000), - mode: IORateLimitMode::AllIo, + mode: IoRateLimitMode::AllIo, strict: true, - foreground_read_priority: IOPriority::Low, - foreground_write_priority: IOPriority::Low, - flush_priority: IOPriority::Low, - level_zero_compaction_priority: IOPriority::Low, - compaction_priority: IOPriority::High, - replication_priority: IOPriority::Low, - load_balance_priority: IOPriority::Low, - gc_priority: IOPriority::High, - import_priority: IOPriority::High, - export_priority: IOPriority::High, - other_priority: IOPriority::Low, + foreground_read_priority: IoPriority::Low, + foreground_write_priority: IoPriority::Low, + flush_priority: IoPriority::Low, + level_zero_compaction_priority: IoPriority::Low, + compaction_priority: IoPriority::High, + replication_priority: IoPriority::Low, + load_balance_priority: IoPriority::Low, + gc_priority: IoPriority::High, + import_priority: IoPriority::High, + export_priority: IoPriority::High, + other_priority: IoPriority::Low, }, background_error_recovery_window: ReadableDuration::hours(1), 
}; diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 69ce131ec8b..024ebddbdea 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -65,7 +65,7 @@ fn test_select() { // for dag selection let req = DAGSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -100,7 +100,7 @@ fn test_batch_row_limit() { let req = DAGSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); check_chunk_datum_count(resp.get_chunks(), chunk_datum_limit); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -171,7 +171,7 @@ fn test_stream_batch_row_limit() { let chunk_data_limit = stream_row_limit * 3; // we have 3 fields. 
check_chunk_datum_count(&chunks, chunk_data_limit); - let spliter = DAGChunkSpliter::new(chunks, 3); + let spliter = DagChunkSpliter::new(chunks, 3); let j = cmp::min((i + 1) * stream_row_limit, data.len()); let cur_data = &data[i * stream_row_limit..j]; for (row, &(id, name, cnt)) in spliter.zip(cur_data) { @@ -205,7 +205,7 @@ fn test_select_after_lease() { thread::sleep(cluster.cfg.raft_store.raft_store_max_leader_lease.0); let req = DAGSelect::from(&product).build_with(ctx, &[0]); let mut resp = handle_select(&endpoint, req); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -279,7 +279,7 @@ fn test_group_by() { let mut resp = handle_select(&endpoint, req); // should only have name:0, name:2 and name:1 let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); let mut results = spliter.collect::>>(); sort_by!(results, 0, Bytes); for (row, name) in results.iter().zip(&[b"name:0", b"name:1", b"name:2"]) { @@ -322,7 +322,7 @@ fn test_aggr_count() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, cnt)) in results.iter().zip(exp) { @@ -351,7 +351,7 @@ fn test_aggr_count() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for 
(row, (gk_data, cnt)) in results.iter().zip(exp) { @@ -400,7 +400,7 @@ fn test_aggr_first() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, id)) in results.iter().zip(exp) { @@ -431,7 +431,7 @@ fn test_aggr_first() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 0, Bytes); for (row, (count, name)) in results.iter().zip(exp) { @@ -483,7 +483,7 @@ fn test_aggr_avg() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 2, Bytes); for (row, (name, (sum, cnt))) in results.iter().zip(exp) { @@ -526,7 +526,7 @@ fn test_aggr_sum() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, cnt)) in results.iter().zip(exp) { @@ -594,7 +594,7 @@ fn test_aggr_extre() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 2, Bytes); for (row, (name, max, 
min)) in results.iter().zip(exp) { @@ -672,7 +672,7 @@ fn test_aggr_bit_ops() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 4); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 4); let mut results = spliter.collect::>>(); sort_by!(results, 3, Bytes); for (row, (name, bitand, bitor, bitxor)) in results.iter().zip(exp) { @@ -716,7 +716,7 @@ fn test_order_by_column() { .build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(exp) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -753,7 +753,7 @@ fn test_order_by_pk_with_select_from_index() { .build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(expect) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -786,7 +786,7 @@ fn test_limit() { let req = DAGSelect::from(&product).limit(5).build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(expect) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -823,7 +823,7 @@ fn test_reverse() { .build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, 
(id, name, cnt)) in spliter.zip(expect) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -855,7 +855,7 @@ fn test_index() { let req = DAGSelect::from_index(&product, &product["id"]).build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); for (row, (id, ..)) in spliter.zip(data) { let expected_encoded = datum::encode_value(&mut EvalContext::default(), &[id.into()]).unwrap(); @@ -889,7 +889,7 @@ fn test_index_reverse_limit() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); for (row, (id, ..)) in spliter.zip(expect) { let expected_encoded = datum::encode_value(&mut EvalContext::default(), &[id.into()]).unwrap(); @@ -919,7 +919,7 @@ fn test_limit_oom() { .build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); for (row, (id, ..)) in spliter.zip(data) { let expected_encoded = datum::encode_value(&mut EvalContext::default(), &[id.into()]).unwrap(); @@ -959,7 +959,7 @@ fn test_del_select() { let resp = handle_request(&endpoint, req); let mut sel_resp = SelectResponse::default(); sel_resp.merge_from_bytes(resp.get_data()).unwrap(); - let spliter = DAGChunkSpliter::new(sel_resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(sel_resp.take_chunks().into(), 1); let mut row_count = 0; for _ in spliter { row_count += 1; @@ -992,7 +992,7 @@ fn test_index_group_by() { let mut resp = handle_select(&endpoint, req); // should only have name:0, name:2 and name:1 let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + 
let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); let mut results = spliter.collect::>>(); sort_by!(results, 0, Bytes); for (row, name) in results.iter().zip(&[b"name:0", b"name:1", b"name:2"]) { @@ -1025,7 +1025,7 @@ fn test_index_aggr_count() { .output_offsets(Some(vec![0])) .build(); let mut resp = handle_select(&endpoint, req); - let mut spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); let expected_encoded = datum::encode_value( &mut EvalContext::default(), &[Datum::U64(data.len() as u64)], @@ -1053,7 +1053,7 @@ fn test_index_aggr_count() { resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, cnt)) in results.iter().zip(exp) { @@ -1080,7 +1080,7 @@ fn test_index_aggr_count() { resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (gk_data, cnt)) in results.iter().zip(exp) { @@ -1124,7 +1124,7 @@ fn test_index_aggr_first() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, id)) in results.iter().zip(exp) { @@ -1182,7 +1182,7 @@ fn test_index_aggr_avg() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = 
DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 2, Bytes); for (row, (name, (sum, cnt))) in results.iter().zip(exp) { @@ -1225,7 +1225,7 @@ fn test_index_aggr_sum() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 2); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 2); let mut results = spliter.collect::>>(); sort_by!(results, 1, Bytes); for (row, (name, cnt)) in results.iter().zip(exp) { @@ -1292,7 +1292,7 @@ fn test_index_aggr_extre() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let exp_len = exp.len(); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let mut results = spliter.collect::>>(); sort_by!(results, 2, Bytes); for (row, (name, max, min)) in results.iter().zip(exp) { @@ -1359,7 +1359,7 @@ fn test_where() { let req = DAGSelect::from(&product).where_expr(cond).build(); let mut resp = handle_select(&endpoint, req); - let mut spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let row = spliter.next().unwrap(); let (id, name, cnt) = data[2]; let name_datum = name.map(|s| s.as_bytes()).into(); @@ -1504,7 +1504,7 @@ fn test_handle_truncate() { assert!(!resp.has_error()); assert!(!resp.get_warnings().is_empty()); // check data - let mut spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let row = spliter.next().unwrap(); let (id, name, cnt) = data[2]; let name_datum = name.map(|s| s.as_bytes()).into(); @@ -1554,7 +1554,7 @@ fn test_default_val() { let req = DAGSelect::from(&tbl).limit(5).build(); let mut resp = handle_select(&endpoint, 
req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 4); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 4); for (row, (id, name, cnt)) in spliter.zip(expect) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -1585,7 +1585,7 @@ fn test_output_offsets() { .output_offsets(Some(vec![1])) .build(); let mut resp = handle_select(&endpoint, req); - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 1); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); for (row, (_, name, _)) in spliter.zip(data) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = @@ -1845,7 +1845,7 @@ fn test_copr_bypass_or_access_locks() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(expected_data) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( @@ -1950,7 +1950,7 @@ fn test_rc_read() { let mut resp = handle_select(&endpoint, req); let mut row_count = 0; - let spliter = DAGChunkSpliter::new(resp.take_chunks().into(), 3); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(expected_data.clone()) { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 49ecf13c1d9..2bc05726bfc 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -12,7 +12,7 @@ use std::{ }; use engine_traits::{KvEngine, RaftEngineReadOnly}; -use file_system::{IOOp, IOType}; +use file_system::{IoOp, IoType}; use futures::executor::block_on; use grpcio::Environment; use 
kvproto::raft_serverpb::*; @@ -503,23 +503,23 @@ fn test_inspected_snapshot() { .unwrap() .statistics() .unwrap(); - assert_eq!(stats.fetch(IOType::Replication, IOOp::Read), 0); - assert_eq!(stats.fetch(IOType::Replication, IOOp::Write), 0); + assert_eq!(stats.fetch(IoType::Replication, IoOp::Read), 0); + assert_eq!(stats.fetch(IoType::Replication, IoOp::Write), 0); // Make sure snapshot read hits disk cluster.flush_data(); // Let store 3 inform leader to generate a snapshot. cluster.run_node(3).unwrap(); must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); - assert_ne!(stats.fetch(IOType::Replication, IOOp::Read), 0); - assert_ne!(stats.fetch(IOType::Replication, IOOp::Write), 0); + assert_ne!(stats.fetch(IoType::Replication, IoOp::Read), 0); + assert_ne!(stats.fetch(IoType::Replication, IoOp::Write), 0); pd_client.must_remove_peer(1, new_peer(2, 2)); - assert_eq!(stats.fetch(IOType::LoadBalance, IOOp::Read), 0); - assert_eq!(stats.fetch(IOType::LoadBalance, IOOp::Write), 0); + assert_eq!(stats.fetch(IoType::LoadBalance, IoOp::Read), 0); + assert_eq!(stats.fetch(IoType::LoadBalance, IoOp::Write), 0); pd_client.must_add_peer(1, new_peer(2, 2)); must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); - assert_ne!(stats.fetch(IOType::LoadBalance, IOOp::Read), 0); - assert_ne!(stats.fetch(IOType::LoadBalance, IOOp::Write), 0); + assert_ne!(stats.fetch(IoType::LoadBalance, IoOp::Read), 0); + assert_ne!(stats.fetch(IoType::LoadBalance, IoOp::Write), 0); } // Test snapshot generating and receiving can share one I/O limiter fairly. 
diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 21adc354295..3d7fc235cad 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -3,7 +3,7 @@ use std::{sync::Arc, thread, time::Duration}; use crossbeam::channel; -use engine_traits::{CFNamesExt, Iterable, Peekable, RaftEngineReadOnly, SyncMutable, CF_RAFT}; +use engine_traits::{CfNamesExt, Iterable, Peekable, RaftEngineReadOnly, SyncMutable, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState, StoreIdent}; use protobuf::Message; use raft::eraftpb::MessageType; diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index cd311386769..ec8bf906e1c 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -11,7 +11,7 @@ use engine_rocks::{ RocksSstWriterBuilder, }; use engine_traits::{ - CFOptionsExt, CompactExt, DeleteStrategy, Engines, KvEngine, MiscExt, Range, SstWriter, + CfOptionsExt, CompactExt, DeleteStrategy, Engines, KvEngine, MiscExt, Range, SstWriter, SstWriterBuilder, SyncMutable, CF_DEFAULT, CF_WRITE, }; use keys::data_key; From 84654c87d6def968a197d47babad8e08acdf685a Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 29 Jul 2022 09:01:11 +0800 Subject: [PATCH 120/676] raftstore: add more arguments to pre_exec observer (#13158) ref tikv/tikv#12849 Add more arguments to pre_exec observer Signed-off-by: CalvinNeo --- .../raftstore/src/coprocessor/dispatcher.rs | 27 ++++++++++++++----- components/raftstore/src/coprocessor/mod.rs | 10 +++++-- components/raftstore/src/store/fsm/apply.rs | 10 +++++-- 3 files changed, 36 insertions(+), 11 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 8122f54b12d..c752e629af1 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ 
b/components/raftstore/src/coprocessor/dispatcher.rs @@ -417,13 +417,14 @@ impl CoprocessorHost { } } - pub fn pre_exec(&self, region: &Region, cmd: &RaftCmdRequest) -> bool { + // (index, term) is for the applying entry. + pub fn pre_exec(&self, region: &Region, cmd: &RaftCmdRequest, index: u64, term: u64) -> bool { let mut ctx = ObserverContext::new(region); if !cmd.has_admin_request() { let query = cmd.get_requests(); for observer in &self.registry.query_observers { let observer = observer.observer.inner(); - if observer.pre_exec_query(&mut ctx, query) { + if observer.pre_exec_query(&mut ctx, query, index, term) { return true; } } @@ -432,7 +433,7 @@ impl CoprocessorHost { let admin = cmd.get_admin_request(); for observer in &self.registry.admin_observers { let observer = observer.observer.inner(); - if observer.pre_exec_admin(&mut ctx, admin) { + if observer.pre_exec_admin(&mut ctx, admin, index, term) { return true; } } @@ -665,7 +666,13 @@ mod tests { ctx.bypass = self.bypass.load(Ordering::SeqCst); } - fn pre_exec_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { + fn pre_exec_admin( + &self, + ctx: &mut ObserverContext<'_>, + _: &AdminRequest, + _: u64, + _: u64, + ) -> bool { self.called.fetch_add(16, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); false @@ -696,7 +703,13 @@ mod tests { ctx.bypass = self.bypass.load(Ordering::SeqCst); } - fn pre_exec_query(&self, ctx: &mut ObserverContext<'_>, _: &[Request]) -> bool { + fn pre_exec_query( + &self, + ctx: &mut ObserverContext<'_>, + _: &[Request], + _: u64, + _: u64, + ) -> bool { self.called.fetch_add(15, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); false @@ -839,12 +852,12 @@ mod tests { let mut query_req = RaftCmdRequest::default(); query_req.set_requests(vec![Request::default()].into()); - host.pre_exec(®ion, &query_req); + host.pre_exec(®ion, &query_req, 0, 0); assert_all!([&ob.called], &[103]); // 15 let mut admin_req = 
RaftCmdRequest::default(); admin_req.set_admin_request(AdminRequest::default()); - host.pre_exec(®ion, &admin_req); + host.pre_exec(®ion, &admin_req, 0, 0); assert_all!([&ob.called], &[119]); // 16 } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 8a4975b1459..e7c351262fa 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -96,7 +96,13 @@ pub trait AdminObserver: Coprocessor { /// Hook before exec admin request, returns whether we should skip this /// admin. - fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, _: &AdminRequest) -> bool { + fn pre_exec_admin( + &self, + _: &mut ObserverContext<'_>, + _: &AdminRequest, + _: u64, + _: u64, + ) -> bool { false } @@ -135,7 +141,7 @@ pub trait QueryObserver: Coprocessor { /// Hook before exec write request, returns whether we should skip this /// write. - fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request]) -> bool { + fn pre_exec_query(&self, _: &mut ObserverContext<'_>, _: &[Request], _: u64, _: u64) -> bool { false } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 284015b0eb8..1b64c9a2787 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1273,7 +1273,7 @@ where // E.g. `RaftApplyState` must not be changed. let mut origin_epoch = None; - let (resp, exec_result) = if ctx.host.pre_exec(&self.region, req) { + let (resp, exec_result) = if ctx.host.pre_exec(&self.region, req, index, term) { // One of the observers want to filter execution of the command. 
let mut resp = RaftCmdResponse::default(); if !req.get_header().get_uuid().is_empty() { @@ -5000,7 +5000,13 @@ mod tests { } } - fn pre_exec_admin(&self, _: &mut ObserverContext<'_>, req: &AdminRequest) -> bool { + fn pre_exec_admin( + &self, + _: &mut ObserverContext<'_>, + req: &AdminRequest, + _: u64, + _: u64, + ) -> bool { let cmd_type = req.get_cmd_type(); if cmd_type == AdminCmdType::CompactLog && self.filter_compact_log.deref().load(Ordering::SeqCst) From 1e13ddf3bf12c00afb4d049d05978fecae9a6067 Mon Sep 17 00:00:00 2001 From: Zwb Date: Fri, 29 Jul 2022 15:11:12 +0800 Subject: [PATCH 121/676] Make max_subcompactions dynamically changeable (#13151) close tikv/tikv#13145 Make max_subcompactions dynamically changeable Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- src/config.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index 6c345b8b773..23dea43d47a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1239,6 +1239,15 @@ impl DbConfig { ) .into()); } + if self.max_sub_compactions == 0 + || self.max_sub_compactions as i32 > self.max_background_jobs + { + return Err(format!( + "max_sub_compactions should be greater than 0 and less than or equal to {:?}", + self.max_background_jobs, + ) + .into()); + } if self.max_background_flushes <= 0 || self.max_background_flushes > limit { return Err(format!( "max_background_flushes should be greater than 0 and less than or equal to {:?}", @@ -1375,7 +1384,6 @@ pub struct RaftDbConfig { pub info_log_dir: String, #[online_config(skip)] pub info_log_level: RocksLogLevel, - #[online_config(skip)] pub max_sub_compactions: u32, pub writable_file_max_buffer_size: ReadableSize, #[online_config(skip)] @@ -1678,6 +1686,11 @@ impl> DbConfigManger { Ok(()) } + fn set_max_subcompactions(&self, max_subcompactions: u32) -> Result<(), Box> { + self.set_db_config(&[("max_subcompactions", &max_subcompactions.to_string())])?; + Ok(()) + } + fn 
validate_cf(&self, cf: &str) -> Result<(), Box> { match (self.db_type, cf) { (DbType::Kv, CF_DEFAULT) @@ -1740,6 +1753,14 @@ impl + Send + Sync> ConfigManager for DbConfigMan self.set_max_background_jobs(max_background_jobs)?; } + if let Some(background_subcompactions_config) = change + .drain_filter(|(name, _)| name == "max_sub_compactions") + .next() + { + let max_subcompactions = background_subcompactions_config.1.into(); + self.set_max_subcompactions(max_subcompactions)?; + } + if let Some(background_flushes_config) = change .drain_filter(|(name, _)| name == "max_background_flushes") .next() From 2f42bc9ce1a2e457ec2a49820ab56ae52adcc6d2 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 29 Jul 2022 17:47:13 +0800 Subject: [PATCH 122/676] tests: do not use `assert!(r.is_ok())` in tests (#13009) ref tikv/tikv#13008 None Signed-off-by: tabokie --- components/backup-stream/src/errors.rs | 5 +- components/backup/src/endpoint.rs | 3 +- .../cdc/tests/failpoints/test_endpoint.rs | 4 +- components/cloud/aws/src/s3.rs | 31 +++---- .../concurrency_manager/src/lock_table.rs | 28 +++--- components/encryption/src/file_dict_file.rs | 5 +- components/encryption/src/io.rs | 19 ++-- components/encryption/src/manager/mod.rs | 3 +- components/engine_traits/src/engine.rs | 2 +- .../src/scenario_writes.rs | 3 +- components/external_storage/src/local.rs | 17 ++-- .../raftstore/src/coprocessor/config.rs | 12 +-- .../src/coprocessor/split_observer.rs | 7 +- components/raftstore/src/store/bootstrap.rs | 10 +-- components/raftstore/src/store/config.rs | 32 +++---- components/raftstore/src/store/fsm/apply.rs | 2 +- components/raftstore/src/store/txn_ext.rs | 18 ++-- components/resource_metering/src/config.rs | 4 +- components/sst_importer/src/sst_importer.rs | 1 - components/tidb_query_aggr/src/lib.rs | 58 +++++------- .../tidb_query_datatype/src/codec/convert.rs | 7 +- .../src/codec/mysql/decimal.rs | 16 ++-- .../src/codec/mysql/json/serde.rs | 8 +- 
.../tidb_query_datatype/src/codec/table.rs | 6 +- .../tidb_query_datatype/src/expr/ctx.rs | 16 ++-- .../src/table_scan_executor.rs | 4 +- .../tidb_query_expr/src/impl_compare_in.rs | 17 ++-- .../tidb_query_expr/src/types/expr_builder.rs | 90 ++++++------------- .../tidb_query_expr/src/types/expr_eval.rs | 51 ++++++----- components/tikv_util/src/config.rs | 15 ++-- components/tikv_util/src/mpsc/batch.rs | 8 +- components/tikv_util/src/worker/mod.rs | 2 +- .../tikv_util/src/yatp_pool/future_pool.rs | 8 +- src/config.rs | 48 +++++----- .../interceptors/concurrency_limiter.rs | 13 ++- src/coprocessor/interceptors/deadline.rs | 10 ++- src/read_pool.rs | 18 ++-- src/server/debug.rs | 2 +- src/server/engine_factory_v2.rs | 12 +-- src/server/gc_worker/gc_worker.rs | 46 +++++----- src/server/resolve.rs | 4 +- src/server/server.rs | 2 +- src/server/service/diagnostics/log.rs | 1 - src/server/status_server/mod.rs | 4 +- src/server/status_server/profile.rs | 6 +- src/storage/config.rs | 4 +- src/storage/mod.rs | 22 ++--- src/storage/mvcc/consistency_check.rs | 2 +- src/storage/mvcc/mod.rs | 5 +- src/storage/mvcc/txn.rs | 10 +-- src/storage/txn/scheduler.rs | 8 +- src/storage/txn/store.rs | 48 +++++----- tests/benches/misc/raftkv/mod.rs | 2 +- tests/failpoints/cases/test_disk_full.rs | 2 +- tests/failpoints/cases/test_merge.rs | 72 +++++++-------- tests/failpoints/cases/test_pd_client.rs | 1 - tests/failpoints/cases/test_split_region.rs | 18 ++-- tests/failpoints/cases/test_storage.rs | 2 +- tests/failpoints/cases/test_transaction.rs | 14 ++- .../failpoints/cases/test_transfer_leader.rs | 90 +++++++++---------- .../integrations/config/test_config_client.rs | 4 +- tests/integrations/pd/test_rpc_client.rs | 4 +- tests/integrations/raftstore/test_merge.rs | 38 ++++---- tests/integrations/raftstore/test_multi.rs | 30 +++---- .../raftstore/test_replica_read.rs | 4 +- .../raftstore/test_split_region.rs | 14 ++- .../raftstore/test_transfer_leader.rs | 22 ++--- 
tests/integrations/server/gc_worker.rs | 2 +- tests/integrations/server/kv_service.rs | 2 +- tests/integrations/server/security.rs | 3 +- tests/integrations/server/status_server.rs | 2 +- .../integrations/storage/test_raft_storage.rs | 5 +- tests/integrations/storage/test_titan.rs | 30 +++---- 73 files changed, 502 insertions(+), 636 deletions(-) diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index b049b0a29be..493cf28babc 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -285,8 +285,9 @@ mod test { b.iter(|| { let result: Result<()> = Ok(()); let lucky_number = rand::random::(); - let result = result.context_with(|| format!("lucky: the number is {}", lucky_number)); - assert!(result.is_ok()); + result + .context_with(|| format!("lucky: the number is {}", lucky_number)) + .unwrap(); }) } } diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 7af38d12ac4..35a08c81a2d 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -1676,8 +1676,7 @@ pub mod tests { dst_user_key.as_encoded(), dst_value, ); - let ret = engine.put(&ctx, key, value); - assert!(ret.is_ok()); + engine.put(&ctx, key, value).unwrap(); i += 1; } // flush to disk so that read requests can be traced by TiKV limiter. diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 2e9375ce6a5..9a1053681f1 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -309,9 +309,9 @@ fn do_test_no_resolved_ts_before_downstream_initialized(version: &str) { // The first downstream can receive timestamps but the second should receive // nothing. 
let mut rx = event_feeds[0].replace(None).unwrap(); - assert!(recv_timeout(&mut rx, Duration::from_secs(1)).is_ok()); + recv_timeout(&mut rx, Duration::from_secs(1)).unwrap(); let mut rx = event_feeds[1].replace(None).unwrap(); - assert!(recv_timeout(&mut rx, Duration::from_secs(3)).is_err()); + recv_timeout(&mut rx, Duration::from_secs(3)).unwrap_err(); }); th.join().unwrap(); diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index fd5c07c5097..ef13749ccea 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -651,14 +651,13 @@ mod tests { let s = S3Storage::new_creds_dispatcher(config, dispatcher, credentials_provider).unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; - assert!(resp.is_ok()); + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap(); assert_eq!( CLOUD_REQUEST_HISTOGRAM_VEC .get_metric_with_label_values(&["s3", "upload_part"]) @@ -739,17 +738,15 @@ mod tests { // inject 50ms delay fail::cfg(s3_sleep_injected_fp, "return(50)").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap(); fail::remove(s3_sleep_injected_fp); fail::remove(s3_timeout_injected_fp); - // no timeout - assert!(resp.is_ok()); } #[test] diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index da08d9983d1..4169537840e 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -172,7 +172,7 @@ mod test { let key_k = Key::from_raw(b"k"); // no lock found - assert!(lock_table.check_key(&key_k, |_| Err(())).is_ok()); + 
lock_table.check_key(&key_k, |_| Err(())).unwrap(); let lock = Lock::new( LockType::Lock, @@ -190,7 +190,7 @@ mod test { }); // lock passes check_fn - assert!(lock_table.check_key(&key_k, |l| ts_check(l, 5)).is_ok()); + lock_table.check_key(&key_k, |l| ts_check(l, 5)).unwrap(); // lock does not pass check_fn assert_eq!(lock_table.check_key(&key_k, |l| ts_check(l, 20)), Err(lock)); @@ -231,22 +231,18 @@ mod test { }); // no lock found - assert!( - lock_table - .check_range( - Some(&Key::from_raw(b"m")), - Some(&Key::from_raw(b"n")), - |_, _| Err(()) - ) - .is_ok() - ); + lock_table + .check_range( + Some(&Key::from_raw(b"m")), + Some(&Key::from_raw(b"n")), + |_, _| Err(()), + ) + .unwrap(); // lock passes check_fn - assert!( - lock_table - .check_range(None, Some(&Key::from_raw(b"z")), |_, l| ts_check(l, 5)) - .is_ok() - ); + lock_table + .check_range(None, Some(&Key::from_raw(b"z")), |_, l| ts_check(l, 5)) + .unwrap(); // first lock does not pass check_fn assert_eq!( diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index 653fbf8dbbb..4a2609cacb5 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -600,10 +600,9 @@ mod tests { // Try open as v1 file. Should success. 
{ let file_dict_file = EncryptedFile::new(tempdir.path(), "test_file_dict_file"); - let file_bytes = file_dict_file.read(&PlaintextBackend::default()); - assert!(file_bytes.is_ok()); + let file_bytes = file_dict_file.read(&PlaintextBackend::default()).unwrap(); let mut file_dict = FileDictionary::default(); - file_dict.merge_from_bytes(&file_bytes.unwrap()).unwrap(); + file_dict.merge_from_bytes(&file_bytes).unwrap(); assert_eq!(*file_dict.files.get("f1").unwrap(), info1); assert_eq!(file_dict.files.get("f2"), None); assert_eq!(file_dict.files.get("f3"), None); diff --git a/components/encryption/src/io.rs b/components/encryption/src/io.rs index d2c5b6d1546..e02aafabe88 100644 --- a/components/encryption/src/io.rs +++ b/components/encryption/src/io.rs @@ -694,9 +694,8 @@ mod tests { buf: &mut [u8], ) -> Poll> { let len = min(self.read_maxsize_once, buf.len()); - let r = self.cursor.read(&mut buf[..len]); - assert!(r.is_ok()); - Poll::Ready(IoResult::Ok(r.unwrap())) + let r = self.cursor.read(&mut buf[..len]).unwrap(); + Poll::Ready(IoResult::Ok(r)) } } @@ -727,11 +726,10 @@ mod tests { let mut encrypt_read_len = 0; loop { - let s = encrypt_reader + let read_len = encrypt_reader .read(&mut encrypt_text[encrypt_read_len..]) - .await; - assert!(s.is_ok()); - let read_len = s.unwrap(); + .await + .unwrap(); if read_len == 0 { break; } @@ -757,11 +755,10 @@ mod tests { .unwrap(); loop { - let s = decrypt_reader + let read_len = decrypt_reader .read(&mut decrypt_text[decrypt_read_len..]) - .await; - assert!(s.is_ok()); - let read_len = s.unwrap(); + .await + .unwrap(); if read_len == 0 { break; } diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index a45f6153358..0dcdbffdb95 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -1299,8 +1299,7 @@ mod tests { // return errors. 
assert!(result.is_err()); let previous = Box::new(PlaintextBackend::default()) as Box; - let result = new_key_manager(&tmp_dir, None, right_key, previous); - assert!(result.is_ok()); + new_key_manager(&tmp_dir, None, right_key, previous).unwrap(); } #[test] diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 276fb1ed19a..dc09b54fb6e 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -309,7 +309,7 @@ mod tests { fn test_tablet_error_collector_ok() { let mut err = TabletErrorCollector::new(); err.add_result(1, 1, Ok(())); - assert!(err.take_result().is_ok()); + err.take_result().unwrap(); assert_eq!(err.get_error_count(), 0); } diff --git a/components/engine_traits_tests/src/scenario_writes.rs b/components/engine_traits_tests/src/scenario_writes.rs index c9b1b1d5fb7..eb05c107c1d 100644 --- a/components/engine_traits_tests/src/scenario_writes.rs +++ b/components/engine_traits_tests/src/scenario_writes.rs @@ -213,8 +213,7 @@ scenario_test! { put_get { scenario_test! { delete_none { let db = write_scenario_engine(); - let res = db.delete(b"foo"); - assert!(res.is_ok()); + db.delete(b"foo").unwrap(); }} scenario_test! 
{ delete { diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index f246c808b86..80c22929525 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -227,18 +227,15 @@ mod tests { let filename = "existed.file"; let buf1: &[u8] = b"pingcap"; let buf2: &[u8] = b"tikv"; - let r = ls - .write(filename, UnpinReader(Box::new(buf1)), buf1.len() as _) - .await; - assert!(r.is_ok()); - let r = ls - .write(filename, UnpinReader(Box::new(buf2)), buf2.len() as _) - .await; - assert!(r.is_ok()); + ls.write(filename, UnpinReader(Box::new(buf1)), buf1.len() as _) + .await + .unwrap(); + ls.write(filename, UnpinReader(Box::new(buf2)), buf2.len() as _) + .await + .unwrap(); let mut read_buff: Vec = Vec::new(); - let r = ls.read(filename).read_to_end(&mut read_buff).await; - assert!(r.is_ok()); + ls.read(filename).read_to_end(&mut read_buff).await.unwrap(); assert_eq!(read_buff.len(), 4); assert_eq!(&read_buff, buf2); } diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 1087b18c287..fb1fc35345f 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -209,34 +209,34 @@ mod tests { cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); cfg.region_split_size = ReadableSize(20); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; cfg.region_split_size = ReadableSize(20); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); cfg = Config::default(); cfg.region_max_keys = Some(10); cfg.region_split_keys = Some(20); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg = Config::default(); cfg.region_max_keys = None; cfg.region_split_keys = Some(20); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); 
assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); cfg.enable_region_bucket = false; cfg.region_split_size = ReadableSize(20); cfg.region_bucket_size = ReadableSize(30); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); cfg = Config::default(); cfg.region_split_size = ReadableSize::mb(20); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } } diff --git a/components/raftstore/src/coprocessor/split_observer.rs b/components/raftstore/src/coprocessor/split_observer.rs index e763c83a37c..7f844f4b069 100644 --- a/components/raftstore/src/coprocessor/split_observer.rs +++ b/components/raftstore/src/coprocessor/split_observer.rs @@ -240,14 +240,13 @@ mod tests { let observer = SplitObserver; - let resp = observer.pre_propose_admin(&mut ctx, &mut req); // since no split is defined, actual coprocessor won't be invoke. - assert!(resp.is_ok()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap(); assert!(!req.has_split(), "only split req should be handle."); req = new_split_request(new_row_key(1, 2, 0)); // For compatible reason, split should supported too. - assert!(observer.pre_propose_admin(&mut ctx, &mut req).is_ok()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap(); // Empty key should be skipped. let mut split_keys = vec![vec![]]; @@ -257,7 +256,7 @@ mod tests { req = new_batch_split_request(split_keys.clone()); // Although invalid keys should be skipped, but if all keys are // invalid, errors should be reported. 
- assert!(observer.pre_propose_admin(&mut ctx, &mut req).is_err()); + observer.pre_propose_admin(&mut ctx, &mut req).unwrap_err(); let mut key = new_row_key(1, 2, 0); let mut expected_key = key[..key.len() - 8].to_vec(); diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index 1ee8e9ddc10..f6e3a266f01 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -143,10 +143,10 @@ mod tests { let engines = Engines::new(kv_engine.clone(), raft_engine.clone()); let region = initial_region(1, 1, 1); - assert!(bootstrap_store(&engines, 1, 1).is_ok()); - assert!(bootstrap_store(&engines, 1, 1).is_err()); + bootstrap_store(&engines, 1, 1).unwrap(); + bootstrap_store(&engines, 1, 1).unwrap_err(); - assert!(prepare_bootstrap_cluster(&engines, ®ion).is_ok()); + prepare_bootstrap_cluster(&engines, ®ion).unwrap(); assert!( kv_engine .get_value(keys::PREPARE_BOOTSTRAP_KEY) @@ -167,8 +167,8 @@ mod tests { ); assert!(raft_engine.get_raft_state(1).unwrap().is_some()); - assert!(clear_prepare_bootstrap_key(&engines).is_ok()); - assert!(clear_prepare_bootstrap_cluster(&engines, 1).is_ok()); + clear_prepare_bootstrap_key(&engines).unwrap(); + clear_prepare_bootstrap_cluster(&engines, 1).unwrap(); assert!( is_range_empty( &kv_engine, diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 5d7d89bbc7b..6b59eaf71bb 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -1023,10 +1023,8 @@ mod tests { cfg = Config::new(); cfg.raft_log_gc_size_limit = None; - assert!( - cfg.validate(ReadableSize(20), false, ReadableSize(0)) - .is_ok() - ); + cfg.validate(ReadableSize(20), false, ReadableSize(0)) + .unwrap(); assert_eq!(cfg.raft_log_gc_size_limit, Some(ReadableSize(15))); cfg = Config::new(); @@ -1042,10 +1040,8 @@ mod tests { cfg = Config::new(); cfg.raft_log_gc_count_limit = None; - assert!( - 
cfg.validate(ReadableSize::mb(1), false, ReadableSize(0)) - .is_ok() - ); + cfg.validate(ReadableSize::mb(1), false, ReadableSize(0)) + .unwrap(); assert_eq!(cfg.raft_log_gc_count_limit, Some(768)); cfg = Config::new(); @@ -1098,13 +1094,13 @@ mod tests { cfg = Config::new(); cfg.hibernate_regions = true; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(256)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); cfg = Config::new(); cfg.hibernate_regions = false; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(1024)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); @@ -1112,7 +1108,7 @@ mod tests { cfg.hibernate_regions = true; cfg.store_batch_system.max_batch_size = Some(123); cfg.apply_batch_system.max_batch_size = Some(234); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(123)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(234)); @@ -1134,7 +1130,7 @@ mod tests { cfg.hibernate_regions = true; cfg.max_peer_down_duration = ReadableDuration::minutes(5); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(5); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.max_peer_down_duration, ReadableDuration::minutes(10)); cfg = Config::new(); @@ -1143,7 +1139,7 @@ mod tests { cfg.raft_max_size_per_msg = ReadableSize::gb(64); assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); cfg.raft_max_size_per_msg = ReadableSize::gb(3); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, 
false, ReadableSize(0)).unwrap(); cfg = Config::new(); cfg.raft_entry_max_size = ReadableSize(0); @@ -1151,23 +1147,23 @@ mod tests { cfg.raft_entry_max_size = ReadableSize::mb(3073); assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); cfg.raft_entry_max_size = ReadableSize::gb(3); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_ok()); + cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); - assert!(cfg.validate(split_size, true, split_size / 8).is_ok()); + cfg.validate(split_size, true, split_size / 8).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); - assert!(cfg.validate(split_size, true, split_size / 20).is_ok()); + cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 20); cfg = Config::new(); cfg.region_split_check_diff = Some(ReadableSize(1)); - assert!(cfg.validate(split_size, true, split_size / 20).is_ok()); + cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), ReadableSize(1)); } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 1b64c9a2787..3b9546a460c 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -4728,7 +4728,7 @@ mod tests { }); let cc_resp = cc_rx.try_recv().unwrap(); assert!(cc_resp.get_header().get_error().has_stale_command()); - assert!(rx.recv_timeout(Duration::from_secs(3)).is_ok()); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); // Make sure Apply and Snapshot are in the same batch. 
let (snap_tx, _) = mpsc::sync_channel(0); diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 7b681506f63..078d3114060 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -338,10 +338,10 @@ mod tests { let k3 = Key::from_raw(b"k333"); // Test the memory size of peer pessimistic locks after inserting. - assert!(locks1.insert(vec![(k1.clone(), lock(b"k1"))]).is_ok()); + locks1.insert(vec![(k1.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks1.get(&k1), Some(&(lock(b"k1"), false))); assert_eq!(locks1.memory_size, k1.len() + lock(b"k1").memory_size()); - assert!(locks1.insert(vec![(k2.clone(), lock(b"k1"))]).is_ok()); + locks1.insert(vec![(k2.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k1"), false))); assert_eq!( locks1.memory_size, @@ -349,7 +349,7 @@ mod tests { ); // Test the global memory size after inserting. - assert!(locks2.insert(vec![(k3.clone(), lock(b"k1"))]).is_ok()); + locks2.insert(vec![(k3.clone(), lock(b"k1"))]).unwrap(); assert_eq!(locks2.get(&k3), Some(&(lock(b"k1"), false))); assert_eq!( GLOBAL_MEM_SIZE.get() as usize, @@ -357,7 +357,7 @@ mod tests { ); // Test the memory size after replacing, it should not change. 
- assert!(locks1.insert(vec![(k2.clone(), lock(b"k2"))]).is_ok()); + locks1.insert(vec![(k2.clone(), lock(b"k2"))]).unwrap(); assert_eq!(locks1.get(&k2), Some(&(lock(b"k2"), false))); assert_eq!( locks1.memory_size, @@ -395,12 +395,14 @@ mod tests { defer!(GLOBAL_MEM_SIZE.set(0)); let mut locks = PeerPessimisticLocks::default(); - let res = locks.insert(vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))]); - assert!(res.is_ok()); + locks + .insert(vec![(Key::from_raw(b"k1"), lock(&[0; 512000]))]) + .unwrap(); // Exceeding the region limit - let res = locks.insert(vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))]); - assert!(res.is_err()); + locks + .insert(vec![(Key::from_raw(b"k2"), lock(&[0; 32000]))]) + .unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); // Not exceeding the region limit, but exceeding the global limit diff --git a/components/resource_metering/src/config.rs b/components/resource_metering/src/config.rs index 90b09588e3a..69d7c78cb2f 100644 --- a/components/resource_metering/src/config.rs +++ b/components/resource_metering/src/config.rs @@ -133,14 +133,14 @@ mod tests { #[test] fn test_config_validate() { let cfg = Config::default(); - assert!(cfg.validate().is_ok()); // Empty address is allowed. + cfg.validate().unwrap(); // Empty address is allowed. 
let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: 2000, precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::days(999), // invalid diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index b6d13ac9761..71a58a33dc3 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -966,7 +966,6 @@ mod tests { // test with tde let tmp_dir = tempfile::TempDir::new().unwrap(); let key_manager = new_test_key_manager(&tmp_dir, None, None, None); - assert!(key_manager.is_ok()); (tmp_dir, Arc::new(key_manager.unwrap().unwrap())) } diff --git a/components/tidb_query_aggr/src/lib.rs b/components/tidb_query_aggr/src/lib.rs index b9d73b2773a..1eda14a0697 100644 --- a/components/tidb_query_aggr/src/lib.rs +++ b/components/tidb_query_aggr/src/lib.rs @@ -416,22 +416,18 @@ mod tests { let mut s = AggrFnStateFoo::new(); // Update using `Int` should success. - assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&1) - ) - .is_ok() - ); - assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&3) - ) - .is_ok() - ); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&1) + ) + .unwrap(); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&3) + ) + .unwrap(); // Update using other data type should panic. let result = panic_hook::recover_safe(|| { @@ -457,27 +453,21 @@ mod tests { // Push result to Real VectorValue should success. 
let mut target = vec![VectorValue::with_capacity(0, EvalType::Real)]; - assert!( - (&mut s as &mut dyn AggrFunctionState) - .push_result(&mut ctx, &mut target) - .is_ok() - ); + (&mut s as &mut dyn AggrFunctionState) + .push_result(&mut ctx, &mut target) + .unwrap(); assert_eq!(target[0].to_real_vec(), &[Real::new(4.0).ok()]); // Calling push result multiple times should also success. - assert!( - update!( - &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, - &mut ctx, - Some(&1) - ) - .is_ok() - ); - assert!( - (&mut s as &mut dyn AggrFunctionState) - .push_result(&mut ctx, &mut target) - .is_ok() - ); + update!( + &mut s as &mut dyn AggrFunctionStateUpdatePartial<_>, + &mut ctx, + Some(&1) + ) + .unwrap(); + (&mut s as &mut dyn AggrFunctionState) + .push_result(&mut ctx, &mut target) + .unwrap(); assert_eq!( target[0].to_real_vec(), &[Real::new(4.0).ok(), Real::new(5.0).ok()] diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index c576f14ee5f..41f0794950d 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -1965,20 +1965,17 @@ mod tests { let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); let val: Result = b"".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 0.0); assert_eq!(ctx.warnings.warnings.len(), 1); let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); let val: Result = b"1.1a".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 1.1); assert_eq!(ctx.warnings.warnings.len(), 1); // IGNORE_TRUNCATE let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::IGNORE_TRUNCATE))); let val: Result = b"1.2a".to_vec().convert(&mut ctx); - assert!(val.is_ok()); assert_eq!(val.unwrap(), 1.2); assert_eq!(ctx.warnings.warnings.len(), 0); } @@ -2356,9 +2353,7 @@ mod tests { for (dec, 
flen, decimal, want) in cases { ft.set_flen(flen); ft.set_decimal(decimal); - let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft); - assert!(nd.is_ok()); - let nd = nd.unwrap(); + let nd = produce_dec_with_specified_tp(&mut ctx, dec, &ft).unwrap(); assert_eq!(nd, want, "{}, {}, {}, {}, {}", dec, nd, want, flen, decimal); } } diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 2518e003ba3..7cd1c239bb1 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -3734,11 +3734,9 @@ mod tests { ))); let truncated_res = Res::Truncated(2333); - assert!( - truncated_res - .into_result_impl(&mut ctx, Some(Error::truncated()), None) - .is_ok() - ); + truncated_res + .into_result_impl(&mut ctx, Some(Error::truncated()), None) + .unwrap(); // Overflow cases let mut ctx = EvalContext::default(); @@ -3757,10 +3755,8 @@ mod tests { Flag::OVERFLOW_AS_WARNING, ))); let error = Error::overflow("", ""); - assert!( - overflow_res - .into_result_impl(&mut ctx, None, Some(error)) - .is_ok() - ); + overflow_res + .into_result_impl(&mut ctx, None, Some(error)) + .unwrap(); } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index 19fec765d1c..a4c33944e21 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -240,8 +240,7 @@ mod tests { ]; for json_str in legal_cases { - let resp = Json::from_str(json_str); - assert!(resp.is_ok()); + Json::from_str(json_str).unwrap(); } let cases = vec![ @@ -256,9 +255,8 @@ mod tests { ]; for (json_str, json) in cases { - let resp = Json::from_str(json_str); - assert!(resp.is_ok()); - assert_eq!(resp.unwrap(), json.unwrap()); + let resp = Json::from_str(json_str).unwrap(); + assert_eq!(resp, json.unwrap()); } let 
illegal_cases = vec!["[pxx,apaa]", "hpeheh", ""]; diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 2cb2f055842..052ad8bf927 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -790,7 +790,7 @@ mod tests { let mut range = KeyRange::default(); range.set_start(small_key.clone()); range.set_end(large_key.clone()); - assert!(check_table_ranges(&[range]).is_ok()); + check_table_ranges(&[range]).unwrap(); // test range.start > range.end let mut range = KeyRange::default(); range.set_end(small_key.clone()); @@ -819,13 +819,13 @@ mod tests { #[test] fn test_check_key_type() { let record_key = encode_row_key(TABLE_ID, 1); - assert!(check_key_type(record_key.as_slice(), RECORD_PREFIX_SEP).is_ok()); + check_key_type(record_key.as_slice(), RECORD_PREFIX_SEP).unwrap(); assert!(check_key_type(record_key.as_slice(), INDEX_PREFIX_SEP).is_err()); let (_, index_key) = generate_index_data_for_test(TABLE_ID, INDEX_ID, 1, &Datum::I64(1), true); assert!(check_key_type(index_key.as_slice(), RECORD_PREFIX_SEP).is_err()); - assert!(check_key_type(index_key.as_slice(), INDEX_PREFIX_SEP).is_ok()); + check_key_type(index_key.as_slice(), INDEX_PREFIX_SEP).unwrap(); let too_small_key = vec![0]; assert!(check_key_type(too_small_key.as_slice(), RECORD_PREFIX_SEP).is_err()); diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index 0e488689fce..a3e175a3867 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -335,19 +335,19 @@ mod tests { fn test_handle_truncate() { // ignore_truncate = false, truncate_as_warning = false let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); - assert!(ctx.handle_truncate(false).is_ok()); + ctx.handle_truncate(false).unwrap(); assert!(ctx.handle_truncate(true).is_err()); 
assert!(ctx.take_warnings().warnings.is_empty()); // ignore_truncate = false; let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); - assert!(ctx.handle_truncate(false).is_ok()); - assert!(ctx.handle_truncate(true).is_ok()); + ctx.handle_truncate(false).unwrap(); + ctx.handle_truncate(true).unwrap(); assert!(ctx.take_warnings().warnings.is_empty()); // ignore_truncate = false, truncate_as_warning = true let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); - assert!(ctx.handle_truncate(false).is_ok()); - assert!(ctx.handle_truncate(true).is_ok()); + ctx.handle_truncate(false).unwrap(); + ctx.handle_truncate(true).unwrap(); assert!(!ctx.take_warnings().warnings.is_empty()); } @@ -355,11 +355,11 @@ mod tests { fn test_max_warning_cnt() { let eval_cfg = Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING)); let mut ctx = EvalContext::new(Arc::clone(&eval_cfg)); - assert!(ctx.handle_truncate(true).is_ok()); - assert!(ctx.handle_truncate(true).is_ok()); + ctx.handle_truncate(true).unwrap(); + ctx.handle_truncate(true).unwrap(); assert_eq!(ctx.take_warnings().warnings.len(), 2); for _ in 0..2 * DEFAULT_MAX_WARNING_CNT { - assert!(ctx.handle_truncate(true).is_ok()); + ctx.handle_truncate(true).unwrap(); } let warnings = ctx.take_warnings(); assert_eq!(warnings.warning_cnt, 2 * DEFAULT_MAX_WARNING_CNT); diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index c2c310b4018..a4f7e957663 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -1136,7 +1136,7 @@ mod tests { .unwrap(); let mut result = executor.next_batch(1); - assert!(result.is_drained.is_ok()); + result.is_drained.unwrap(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_decoded()); @@ 
-1196,7 +1196,7 @@ mod tests { .unwrap(); let mut result = executor.next_batch(10); - assert!(result.is_drained.is_ok()); + result.is_drained.unwrap(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 2); assert!(result.physical_columns[0].is_decoded()); diff --git a/components/tidb_query_expr/src/impl_compare_in.rs b/components/tidb_query_expr/src/impl_compare_in.rs index d518c9061a0..312943a276a 100644 --- a/components/tidb_query_expr/src/impl_compare_in.rs +++ b/components/tidb_query_expr/src/impl_compare_in.rs @@ -821,14 +821,15 @@ mod tests { let logical_rows: &[usize] = &(0..1024).collect::>(); profiler::start("./bench_compare_in.profile"); b.iter(|| { - let result = black_box(&exp).eval( - black_box(&mut ctx), - black_box(schema), - black_box(&mut columns), - black_box(logical_rows), - black_box(1024), - ); - assert!(result.is_ok()); + black_box(&exp) + .eval( + black_box(&mut ctx), + black_box(schema), + black_box(&mut columns), + black_box(logical_rows), + black_box(1024), + ) + .unwrap(); }); profiler::stop(); } diff --git a/components/tidb_query_expr/src/types/expr_builder.rs b/components/tidb_query_expr/src/types/expr_builder.rs index 33c9d48de67..0546fe43f08 100644 --- a/components/tidb_query_expr/src/types/expr_builder.rs +++ b/components/tidb_query_expr/src/types/expr_builder.rs @@ -584,39 +584,34 @@ mod tests { .push_child(ExprDefBuilder::constant_int(1)) .push_child(ExprDefBuilder::constant_real(3.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); // Incorrect return type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsTime, FieldTypeTp::LongLong) .push_child(ExprDefBuilder::constant_int(1)) .push_child(ExprDefBuilder::constant_real(3.0)) .build(); - let exp = 
RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect number of arguments let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsTime, FieldTypeTp::VarChar) .push_child(ExprDefBuilder::constant_int(1)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsTime, FieldTypeTp::VarChar) .push_child(ExprDefBuilder::constant_int(1)) .push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_real(1.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect argument type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsTime, FieldTypeTp::VarChar) .push_child(ExprDefBuilder::constant_int(1)) .push_child(ExprDefBuilder::constant_int(5)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); } #[test] @@ -626,16 +621,14 @@ mod tests { ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) .push_child(ExprDefBuilder::constant_int(1)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) .push_child(ExprDefBuilder::constant_int(1)) 
.push_child(ExprDefBuilder::constant_int(5)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) @@ -643,40 +636,35 @@ mod tests { .push_child(ExprDefBuilder::constant_int(5)) .push_child(ExprDefBuilder::constant_int(4)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); // Incorrect return type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::LongLong) .push_child(ExprDefBuilder::constant_int(1)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect argument type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) .push_child(ExprDefBuilder::constant_real(1.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) .push_child(ExprDefBuilder::constant_int(1)) .push_child(ExprDefBuilder::constant_real(1.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) 
.push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_real(1.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsDuration, FieldTypeTp::Double) @@ -684,8 +672,7 @@ mod tests { .push_child(ExprDefBuilder::constant_real(1.0)) .push_child(ExprDefBuilder::constant_int(1)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); } #[test] @@ -695,23 +682,20 @@ mod tests { .push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_real(5.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); // Insufficient arguments let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsJson, FieldTypeTp::LongLong) .push_child(ExprDefBuilder::constant_real(3.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect return type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsJson, FieldTypeTp::Double) .push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_real(5.0)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect types let node = 
ExprDefBuilder::scalar_func(ScalarFuncSig::CastIntAsJson, FieldTypeTp::LongLong) @@ -719,8 +703,7 @@ mod tests { .push_child(ExprDefBuilder::constant_real(5.0)) .push_child(ExprDefBuilder::constant_int(42)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); } #[test] @@ -730,22 +713,19 @@ mod tests { .push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_int(5)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_ok()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap(); // Insufficient arguments let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastRealAsInt, FieldTypeTp::Double).build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); // Incorrect return type let node = ExprDefBuilder::scalar_func(ScalarFuncSig::CastRealAsInt, FieldTypeTp::LongLong) .push_child(ExprDefBuilder::constant_real(3.0)) .push_child(ExprDefBuilder::constant_int(5)) .build(); - let exp = RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0); - assert!(exp.is_err()); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node, fn_mapper, 0).unwrap_err(); } #[test] @@ -851,14 +831,8 @@ mod tests { .is_err() ); for i in 1..10 { - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper( - node.clone(), - fn_mapper, - i - ) - .is_ok() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) + .unwrap(); } // Col offset = 3. The minimum success max_columns is 4. 
@@ -874,14 +848,8 @@ mod tests { ); } for i in 4..10 { - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper( - node.clone(), - fn_mapper, - i - ) - .is_ok() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) + .unwrap(); } // Col offset = 1, 2, 5. The minimum success max_columns is 6. @@ -903,14 +871,8 @@ mod tests { ); } for i in 6..10 { - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper( - node.clone(), - fn_mapper, - i - ) - .is_ok() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) + .unwrap(); } } diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index 2ba3b030ef0..442c0f8486b 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -1246,14 +1246,15 @@ mod tests { profiler::start("./bench_eval_plus_1024_rows.profile"); b.iter(|| { - let result = black_box(&exp).eval( - black_box(&mut ctx), - black_box(schema), - black_box(&mut columns), - black_box(&logical_rows), - black_box(1024), - ); - assert!(result.is_ok()); + black_box(&exp) + .eval( + black_box(&mut ctx), + black_box(schema), + black_box(&mut columns), + black_box(&logical_rows), + black_box(1024), + ) + .unwrap(); }); profiler::stop(); } @@ -1283,14 +1284,15 @@ mod tests { profiler::start("./eval_compare_1024_rows.profile"); b.iter(|| { - let result = black_box(&exp).eval( - black_box(&mut ctx), - black_box(schema), - black_box(&mut columns), - black_box(&logical_rows), - black_box(1024), - ); - assert!(result.is_ok()); + black_box(&exp) + .eval( + black_box(&mut ctx), + black_box(schema), + black_box(&mut columns), + black_box(&logical_rows), + black_box(1024), + ) + .unwrap(); }); profiler::stop(); } @@ -1320,14 +1322,15 @@ mod tests { profiler::start("./bench_eval_compare_5_rows.profile"); b.iter(|| { - let result = black_box(&exp).eval( - 
black_box(&mut ctx), - black_box(schema), - black_box(&mut columns), - black_box(&logical_rows), - black_box(5), - ); - assert!(result.is_ok()); + black_box(&exp) + .eval( + black_box(&mut ctx), + black_box(schema), + black_box(&mut columns), + black_box(&logical_rows), + black_box(5), + ) + .unwrap(); }); profiler::stop(); } diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 6655531c294..8fa7c8492d0 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -1933,25 +1933,20 @@ mod tests { #[test] fn test_check_data_dir_empty() { // test invalid data_path - let ret = check_data_dir_empty("/sys/invalid", "txt"); - assert!(ret.is_ok()); + check_data_dir_empty("/sys/invalid", "txt").unwrap(); // test empty data_path let tmp_path = Builder::new() .prefix("test-get-file-count") .tempdir() .unwrap() .into_path(); - let ret = check_data_dir_empty(tmp_path.to_str().unwrap(), "txt"); - assert!(ret.is_ok()); + check_data_dir_empty(tmp_path.to_str().unwrap(), "txt").unwrap(); // test non-empty data_path let tmp_file = format!("{}", tmp_path.join("test-get-file-count.txt").display()); create_file(&tmp_file, b""); - let ret = check_data_dir_empty(tmp_path.to_str().unwrap(), ""); - assert!(ret.is_err()); - let ret = check_data_dir_empty(tmp_path.to_str().unwrap(), "txt"); - assert!(ret.is_err()); - let ret = check_data_dir_empty(tmp_path.to_str().unwrap(), "xt"); - assert!(ret.is_ok()); + check_data_dir_empty(tmp_path.to_str().unwrap(), "").unwrap_err(); + check_data_dir_empty(tmp_path.to_str().unwrap(), "txt").unwrap_err(); + check_data_dir_empty(tmp_path.to_str().unwrap(), "xt").unwrap(); } #[test] diff --git a/components/tikv_util/src/mpsc/batch.rs b/components/tikv_util/src/mpsc/batch.rs index a635a75d4e4..e8d54c514a1 100644 --- a/components/tikv_util/src/mpsc/batch.rs +++ b/components/tikv_util/src/mpsc/batch.rs @@ -391,7 +391,7 @@ mod tests { } // Send without notify, the receiver can't get batched 
messages. - assert!(tx.send(0).is_ok()); + tx.send(0).unwrap(); thread::sleep(time::Duration::from_millis(10)); assert_eq!(msg_counter.load(Ordering::Acquire), 0); @@ -404,7 +404,7 @@ mod tests { // Auto notify with more sendings. for _ in 0..4 { - assert!(tx.send(0).is_ok()); + tx.send(0).unwrap(); } thread::sleep(time::Duration::from_millis(10)); assert_eq!(msg_counter.load(Ordering::Acquire), 5); @@ -442,7 +442,7 @@ mod tests { polled.recv().unwrap(); // Send without notify, the receiver can't get batched messages. - assert!(tx.send(0).is_ok()); + tx.send(0).unwrap(); thread::sleep(time::Duration::from_millis(10)); assert_eq!(msg_counter.load(Ordering::Acquire), 0); @@ -455,7 +455,7 @@ mod tests { // Auto notify with more sendings. for _ in 0..16 { - assert!(tx.send(0).is_ok()); + tx.send(0).unwrap(); } thread::sleep(time::Duration::from_millis(10)); assert_eq!(msg_counter.load(Ordering::Acquire), 17); diff --git a/components/tikv_util/src/worker/mod.rs b/components/tikv_util/src/worker/mod.rs index a8196dca054..cba3a9989cb 100644 --- a/components/tikv_util/src/worker/mod.rs +++ b/components/tikv_util/src/worker/mod.rs @@ -134,7 +134,7 @@ mod tests { let (tx, rx) = mpsc::channel(); lazy_worker.start(BatchRunner { ch: tx }); - assert!(rx.recv_timeout(Duration::from_secs(3)).is_ok()); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); worker.stop(); drop(rx); diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 6962ae30756..1f9c74dd709 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -474,10 +474,10 @@ mod tests { // full assert!(spawn_long_time_future(&read_pool, 8, 100).is_err()); - assert!(rx.recv().is_ok()); - assert!(rx.recv().is_ok()); - assert!(rx.recv().is_ok()); - assert!(rx.recv().is_ok()); + rx.recv().unwrap().unwrap(); + rx.recv().unwrap().unwrap(); + rx.recv().unwrap().unwrap(); + rx.recv().unwrap().unwrap(); // 
no more results assert!(rx.recv_timeout(Duration::from_millis(500)).is_err()); diff --git a/src/config.rs b/src/config.rs index 23dea43d47a..80e763e6981 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1920,7 +1920,7 @@ mod unified_read_pool_tests { stack_size: ReadableSize::mb(2), max_tasks_per_worker: 2000, }; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let cfg = UnifiedReadPoolConfig { min_thread_count: 1, max_thread_count: cmp::max( @@ -1929,7 +1929,7 @@ mod unified_read_pool_tests { ), ..cfg }; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let invalid_cfg = UnifiedReadPoolConfig { min_thread_count: 0, @@ -2103,7 +2103,7 @@ macro_rules! readpool_config { #[test] fn test_validate() { let cfg = $struct_name::default(); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let mut invalid_cfg = cfg.clone(); invalid_cfg.high_concurrency = 0; @@ -2127,7 +2127,7 @@ macro_rules! readpool_config { invalid_cfg.max_tasks_per_worker_high = 1; assert!(invalid_cfg.validate().is_err()); invalid_cfg.max_tasks_per_worker_high = 100; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let mut invalid_cfg = cfg.clone(); invalid_cfg.max_tasks_per_worker_normal = 0; @@ -2135,7 +2135,7 @@ macro_rules! readpool_config { invalid_cfg.max_tasks_per_worker_normal = 1; assert!(invalid_cfg.validate().is_err()); invalid_cfg.max_tasks_per_worker_normal = 100; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let mut invalid_cfg = cfg.clone(); invalid_cfg.max_tasks_per_worker_low = 0; @@ -2143,12 +2143,12 @@ macro_rules! 
readpool_config { invalid_cfg.max_tasks_per_worker_low = 1; assert!(invalid_cfg.validate().is_err()); invalid_cfg.max_tasks_per_worker_low = 100; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let mut invalid_but_unified = cfg.clone(); invalid_but_unified.use_unified_pool = Some(true); invalid_but_unified.low_concurrency = 0; - assert!(invalid_but_unified.validate().is_ok()); + invalid_but_unified.validate().unwrap(); } } }; @@ -2263,23 +2263,23 @@ mod readpool_tests { use_unified_pool: Some(false), ..Default::default() }; - assert!(storage.validate().is_ok()); + storage.validate().unwrap(); let coprocessor = CoprReadPoolConfig { use_unified_pool: Some(false), ..Default::default() }; - assert!(coprocessor.validate().is_ok()); + coprocessor.validate().unwrap(); let cfg = ReadPoolConfig { unified, storage, coprocessor, }; assert!(!cfg.is_unified_pool_enabled()); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); // Storage and coprocessor config must be valid when yatp is not used. 
let unified = UnifiedReadPoolConfig::default(); - assert!(unified.validate().is_ok()); + unified.validate().unwrap(); let storage = StorageReadPoolConfig { use_unified_pool: Some(false), high_concurrency: 0, @@ -2312,9 +2312,9 @@ mod readpool_tests { use_unified_pool: Some(true), ..Default::default() }; - assert!(storage.validate().is_ok()); + storage.validate().unwrap(); let coprocessor = CoprReadPoolConfig::default(); - assert!(coprocessor.validate().is_ok()); + coprocessor.validate().unwrap(); let mut cfg = ReadPoolConfig { unified, storage, @@ -2368,7 +2368,7 @@ mod readpool_tests { assert!(cfg.is_unified_pool_enabled()); assert!(cfg.validate().is_err()); cfg.storage.low_concurrency = 1; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let storage = StorageReadPoolConfig { use_unified_pool: Some(true), @@ -2389,7 +2389,7 @@ mod readpool_tests { assert!(cfg.is_unified_pool_enabled()); assert!(cfg.validate().is_err()); cfg.coprocessor.low_concurrency = 1; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); } } @@ -4220,13 +4220,13 @@ mod tests { let first_modified = last_cfg_metadata.modified().unwrap(); // not write to file when config is the equivalent of last one. - assert!(persist_config(&cfg).is_ok()); + persist_config(&cfg).unwrap(); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_eq!(last_cfg_metadata.modified().unwrap(), first_modified); // write to file when config is the inequivalent of last one. 
cfg.log_level = slog::Level::Warning.into(); - assert!(persist_config(&cfg).is_ok()); + persist_config(&cfg).unwrap(); last_cfg_metadata = last_cfg_path.metadata().unwrap(); assert_ne!(last_cfg_metadata.modified().unwrap(), first_modified); } @@ -4305,7 +4305,7 @@ mod tests { let mut tikv_cfg = TiKvConfig::default(); tikv_cfg.storage.data_dir = path.as_path().to_str().unwrap().to_owned(); - assert!(persist_config(&tikv_cfg).is_ok()); + persist_config(&tikv_cfg).unwrap(); } #[test] @@ -5199,11 +5199,11 @@ mod tests { #[test] fn test_validate_tikv_config() { let mut cfg = TiKvConfig::default(); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let default_region_split_check_diff = cfg.raft_store.region_split_check_diff().0; cfg.raft_store.region_split_check_diff = Some(ReadableSize(cfg.raft_store.region_split_check_diff().0 + 1)); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!( cfg.raft_store.region_split_check_diff().0, default_region_split_check_diff + 1 @@ -5216,7 +5216,7 @@ mod tests { // Test memory_usage_limit is based on block cache size if it's not configured. 
cfg.memory_usage_limit = None; cfg.storage.block_cache.capacity = Some(ReadableSize(3 * GIB)); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!(cfg.memory_usage_limit.unwrap(), ReadableSize(5 * GIB)); // Test memory_usage_limit will fallback to system memory capacity with huge @@ -5224,7 +5224,7 @@ mod tests { cfg.memory_usage_limit = None; let system = SysQuota::memory_limit_in_bytes(); cfg.storage.block_cache.capacity = Some(ReadableSize(system * 3 / 4)); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); assert_eq!(cfg.memory_usage_limit.unwrap(), ReadableSize(system)); } @@ -5243,7 +5243,7 @@ mod tests { { let mut cfg = TiKvConfig::default(); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); } { @@ -5285,7 +5285,7 @@ mod tests { tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); cfg.rocksdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "kvdb", "db"); cfg.raftdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); } } diff --git a/src/coprocessor/interceptors/concurrency_limiter.rs b/src/coprocessor/interceptors/concurrency_limiter.rs index aa8b5c72f13..c77eab86f16 100644 --- a/src/coprocessor/interceptors/concurrency_limiter.rs +++ b/src/coprocessor/interceptors/concurrency_limiter.rs @@ -151,13 +151,12 @@ mod tests { // Light tasks should run without any semaphore permit let smp2 = smp.clone(); - assert!( - tokio::spawn(timeout(Duration::from_millis(250), async move { - limit_concurrency(work(2), &*smp2, Duration::from_millis(500)).await - })) - .await - .is_ok() - ); + tokio::spawn(timeout(Duration::from_millis(250), async move { + limit_concurrency(work(2), &*smp2, Duration::from_millis(500)).await + })) + .await + .unwrap() + .unwrap(); // Both t1 and t2 need a semaphore permit to finish. 
Although t2 is much shorter // than t1, it starts with t1 diff --git a/src/coprocessor/interceptors/deadline.rs b/src/coprocessor/interceptors/deadline.rs index 29b673aa487..b88e6d5f0c9 100644 --- a/src/coprocessor/interceptors/deadline.rs +++ b/src/coprocessor/interceptors/deadline.rs @@ -57,10 +57,12 @@ mod tests { } } - let res = check_deadline(work(5), Deadline::from_now(Duration::from_millis(500))).await; - assert!(res.is_ok()); + check_deadline(work(5), Deadline::from_now(Duration::from_millis(500))) + .await + .unwrap(); - let res = check_deadline(work(100), Deadline::from_now(Duration::from_millis(500))).await; - assert!(res.is_err()); + check_deadline(work(100), Deadline::from_now(Duration::from_millis(500))) + .await + .unwrap_err(); } } diff --git a/src/read_pool.rs b/src/read_pool.rs index 9c413de60a7..ded1308beb2 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -380,8 +380,8 @@ mod tests { let (task3, _tx3) = gen_task(); let (task4, _tx4) = gen_task(); - assert!(handle.spawn(task1, CommandPri::Normal, 1).is_ok()); - assert!(handle.spawn(task2, CommandPri::Normal, 2).is_ok()); + handle.spawn(task1, CommandPri::Normal, 1).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2).unwrap(); thread::sleep(Duration::from_millis(300)); match handle.spawn(task3, CommandPri::Normal, 3) { @@ -391,7 +391,7 @@ mod tests { tx1.send(()).unwrap(); thread::sleep(Duration::from_millis(300)); - assert!(handle.spawn(task4, CommandPri::Normal, 4).is_ok()); + handle.spawn(task4, CommandPri::Normal, 4).unwrap(); } #[test] @@ -422,8 +422,8 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - assert!(handle.spawn(task1, CommandPri::Normal, 1).is_ok()); - assert!(handle.spawn(task2, CommandPri::Normal, 2).is_ok()); + handle.spawn(task1, CommandPri::Normal, 1).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2).unwrap(); thread::sleep(Duration::from_millis(300)); match handle.spawn(task3, CommandPri::Normal, 3) { @@ -434,7 +434,7 @@ mod tests 
{ handle.scale_pool_size(3); assert_eq!(handle.get_normal_pool_size(), 3); - assert!(handle.spawn(task4, CommandPri::Normal, 4).is_ok()); + handle.spawn(task4, CommandPri::Normal, 4).unwrap(); thread::sleep(Duration::from_millis(300)); match handle.spawn(task5, CommandPri::Normal, 5) { @@ -471,8 +471,8 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - assert!(handle.spawn(task1, CommandPri::Normal, 1).is_ok()); - assert!(handle.spawn(task2, CommandPri::Normal, 2).is_ok()); + handle.spawn(task1, CommandPri::Normal, 1).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2).unwrap(); thread::sleep(Duration::from_millis(300)); match handle.spawn(task3, CommandPri::Normal, 3) { @@ -487,7 +487,7 @@ mod tests { handle.scale_pool_size(1); assert_eq!(handle.get_normal_pool_size(), 1); - assert!(handle.spawn(task4, CommandPri::Normal, 4).is_ok()); + handle.spawn(task4, CommandPri::Normal, 4).unwrap(); thread::sleep(Duration::from_millis(300)); match handle.spawn(task5, CommandPri::Normal, 5) { diff --git a/src/server/debug.rs b/src/server/debug.rs index 933f4308245..831a2b85255 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -1991,7 +1991,7 @@ mod tests { remove_region_state(1); remove_region_state(2); - assert!(debugger.recreate_region(region.clone()).is_ok()); + debugger.recreate_region(region.clone()).unwrap(); assert_eq!(get_region_state(engine, 100).get_region(), ®ion); region.set_start_key(b"z".to_vec()); diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index b47fc34cf27..094f6f5d5e6 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -229,9 +229,7 @@ mod tests { } let factory = builder.build(); let shared_db = factory.create_shared_db().unwrap(); - let tablet = TabletFactory::create_tablet(&factory, 1, 10); - assert!(tablet.is_ok()); - let tablet = tablet.unwrap(); + let tablet = TabletFactory::create_tablet(&factory, 1, 10).unwrap(); let tablet2 = 
factory.open_tablet(1, 10).unwrap(); assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); @@ -272,9 +270,7 @@ mod tests { } let inner_factory = builder.build(); let factory = KvEngineFactoryV2::new(inner_factory); - let tablet = factory.create_tablet(1, 10); - assert!(tablet.is_ok()); - let tablet = tablet.unwrap(); + let tablet = factory.create_tablet(1, 10).unwrap(); let tablet2 = factory.open_tablet(1, 10).unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); let tablet2 = factory.open_tablet_cache(1, 10).unwrap(); @@ -296,8 +292,8 @@ mod tests { assert!(!factory.exists(2, 11)); assert!(factory.exists_raw(&tablet_path)); assert!(!factory.is_tombstoned(1, 10)); - assert!(factory.load_tablet(&tablet_path, 1, 10).is_err()); - assert!(factory.load_tablet(&tablet_path, 1, 20).is_ok()); + factory.load_tablet(&tablet_path, 1, 10).unwrap_err(); + factory.load_tablet(&tablet_path, 1, 20).unwrap(); // After we load it as with the new id or suffix, we should be unable to get it // with the old id and suffix in the cache. assert!(factory.open_tablet_cache(1, 10).is_none()); diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 7e695430d10..dcdb075d256 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -2051,18 +2051,16 @@ mod tests { // Before starting gc_worker, fill the scheduler to full. for _ in 0..GC_MAX_PENDING_TASKS { - assert!( - gc_worker - .scheduler() - .schedule(GcTask::Gc { - region_id: 0, - start_key: vec![], - end_key: vec![], - safe_point: TimeStamp::from(100), - callback: Box::new(|_res| {}) - }) - .is_ok() - ); + gc_worker + .scheduler() + .schedule(GcTask::Gc { + region_id: 0, + start_key: vec![], + end_key: vec![], + safe_point: TimeStamp::from(100), + callback: Box::new(|_res| {}), + }) + .unwrap(); } // Then, it will fail to schedule another gc command. 
let (tx, rx) = mpsc::channel(); @@ -2081,24 +2079,22 @@ mod tests { let (tx, rx) = mpsc::channel(); // When the gc_worker is full, scheduling an unsafe destroy range task should be // still allowed. - assert!( - gc_worker - .unsafe_destroy_range( - Context::default(), - Key::from_raw(b"a"), - Key::from_raw(b"z"), - Box::new(move |res| { - tx.send(res).unwrap(); - }) - ) - .is_ok() - ); + gc_worker + .unsafe_destroy_range( + Context::default(), + Key::from_raw(b"a"), + Key::from_raw(b"z"), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap(); gc_worker.start().unwrap(); // After the worker starts running, the destroy range task should run, // and the key in the range will be deleted. - assert!(rx.recv_timeout(Duration::from_secs(10)).unwrap().is_ok()); + rx.recv_timeout(Duration::from_secs(10)).unwrap().unwrap(); must_get_none(&engine, b"key", 30); } } diff --git a/src/server/resolve.rs b/src/server/resolve.rs index ccee5c52f82..404cee0e613 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -256,14 +256,14 @@ mod tests { fn test_resolve_store_state_up() { let store = new_store(STORE_ADDR, metapb::StoreState::Up); let runner = new_runner(store); - assert!(runner.get_address(0).is_ok()); + runner.get_address(0).unwrap(); } #[test] fn test_resolve_store_state_offline() { let store = new_store(STORE_ADDR, metapb::StoreState::Offline); let runner = new_runner(store); - assert!(runner.get_address(0).is_ok()); + runner.get_address(0).unwrap(); } #[test] diff --git a/src/server/server.rs b/src/server/server.rs index c5aa6311193..5c0ace9d7b1 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -581,7 +581,7 @@ mod tests { trans.send(msg.clone()).unwrap(); trans.flush(); - assert!(rx.recv_timeout(Duration::from_secs(5)).is_ok()); + rx.recv_timeout(Duration::from_secs(5)).unwrap(); msg.mut_to_peer().set_store_id(2); msg.set_region_id(2); diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 
4ab02f819da..232ddd58b4b 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -481,7 +481,6 @@ mod tests { ]; for (input, time, level, content) in cs.into_iter() { let result = parse(input); - assert!(result.is_ok(), "expected OK, but got: {:?}", result); let timestamp = timestamp(time); let log = result.unwrap(); assert_eq!(log.0, content); diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 13b7b94297d..7911808e86b 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -1420,7 +1420,7 @@ mod tests { let resp = block_on(handle).unwrap(); assert_eq!(resp.status(), StatusCode::OK); let body_bytes = block_on(hyper::body::to_bytes(resp.into_body())).unwrap(); - assert!(String::from_utf8(body_bytes.as_ref().to_owned()).is_ok()); + String::from_utf8(body_bytes.as_ref().to_owned()).unwrap(); // test gzip let handle = status_server.thread_pool.spawn(async move { @@ -1440,7 +1440,7 @@ mod tests { GzDecoder::new(body_bytes.reader()) .read_to_end(&mut decoded_bytes) .unwrap(); - assert!(String::from_utf8(decoded_bytes).is_ok()); + String::from_utf8(decoded_bytes).unwrap(); status_server.stop(); } diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index a37712dfd68..446711bef30 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -409,7 +409,7 @@ mod tests { let (tx, rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); drop(tx); - assert!(block_on(res).unwrap().is_ok()); + block_on(res).unwrap().unwrap(); // Test activated profiling can be stopped by the handle. 
let (tx, rx) = sync_channel::(1); @@ -424,7 +424,7 @@ mod tests { )); assert!(check_activated()); assert!(deactivate_heap_profile()); - assert!(block_on(res).unwrap().is_ok()); + block_on(res).unwrap().unwrap(); } #[test] @@ -454,6 +454,6 @@ mod tests { )); assert!(check_activated()); assert!(deactivate_heap_profile()); - assert!(block_on(res).unwrap().is_ok()); + block_on(res).unwrap().unwrap(); } } diff --git a/src/storage/config.rs b/src/storage/config.rs index 9a359310178..4bfc664629f 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -379,11 +379,11 @@ mod tests { #[test] fn test_validate_storage_config() { let mut cfg = Config::default(); - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); let max_pool_size = std::cmp::max(4, SysQuota::cpu_cores_quota() as usize); cfg.scheduler_worker_pool_size = max_pool_size; - assert!(cfg.validate().is_ok()); + cfg.validate().unwrap(); cfg.scheduler_worker_pool_size = 0; assert!(cfg.validate().is_err()); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 620bca80b32..ef9aecf02ad 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -7758,7 +7758,7 @@ mod tests { assert_eq!(key_error.get_locked().get_key(), b"key"); // Ignore memory locks in resolved or committed locks. ctx.set_resolved_locks(vec![10]); - assert!(block_on(storage.get(ctx.clone(), Key::from_raw(b"key"), 100.into())).is_ok()); + block_on(storage.get(ctx.clone(), Key::from_raw(b"key"), 100.into())).unwrap(); ctx.take_resolved_locks(); // Test batch_get @@ -7773,7 +7773,7 @@ mod tests { assert_eq!(key_error.get_locked().get_key(), b"key"); // Ignore memory locks in resolved locks. 
ctx.set_resolved_locks(vec![10]); - assert!(batch_get(ctx.clone()).is_ok()); + batch_get(ctx.clone()).unwrap(); ctx.take_resolved_locks(); // Test scan @@ -7784,13 +7784,13 @@ mod tests { extract_key_error(&scan(ctx.clone(), Key::from_raw(b"a"), None, false).unwrap_err()); assert_eq!(key_error.get_locked().get_key(), b"key"); ctx.set_resolved_locks(vec![10]); - assert!(scan(ctx.clone(), Key::from_raw(b"a"), None, false).is_ok()); + scan(ctx.clone(), Key::from_raw(b"a"), None, false).unwrap(); ctx.take_resolved_locks(); let key_error = extract_key_error(&scan(ctx.clone(), Key::from_raw(b"\xff"), None, true).unwrap_err()); assert_eq!(key_error.get_locked().get_key(), b"key"); ctx.set_resolved_locks(vec![10]); - assert!(scan(ctx.clone(), Key::from_raw(b"\xff"), None, false).is_ok()); + scan(ctx.clone(), Key::from_raw(b"\xff"), None, false).unwrap(); ctx.take_resolved_locks(); // Ignore memory locks in resolved or committed locks. @@ -7816,14 +7816,14 @@ mod tests { consumer.take_data() }; let res = batch_get_command(req2.clone()); - assert!(res[0].is_ok()); + res[0].as_ref().unwrap(); let key_error = extract_key_error(res[1].as_ref().unwrap_err()); assert_eq!(key_error.get_locked().get_key(), b"key"); // Ignore memory locks in resolved or committed locks. 
req2.mut_context().set_resolved_locks(vec![10]); let res = batch_get_command(req2.clone()); - assert!(res[0].is_ok()); - assert!(res[1].is_ok()); + res[0].as_ref().unwrap(); + res[1].as_ref().unwrap(); req2.mut_context().take_resolved_locks(); } @@ -8661,7 +8661,7 @@ mod tests { assert!(res.is_err(), "case {}", i); assert_eq!(res.unwrap_err().error_code(), err, "case {}", i); } else { - assert!(res.is_ok(), "case {}", i); + assert!(res.is_ok(), "case {} {:?}", i, res); } } } @@ -8717,7 +8717,7 @@ mod tests { assert!(res.is_err()); assert_eq!(res.unwrap_err().error_code(), err); } else { - assert!(res.is_ok()); + res.unwrap(); } }; @@ -8955,7 +8955,7 @@ mod tests { }), ) .unwrap(); - assert!(rx.recv().unwrap().is_ok()); + rx.recv().unwrap().unwrap(); // After prewrite, the memory lock should be removed. { let pessimistic_locks = txn_ext.pessimistic_locks.read(); @@ -9014,6 +9014,6 @@ mod tests { ) .unwrap(); // Prewrite still succeeds - assert!(rx.recv().unwrap().is_ok()); + rx.recv().unwrap().unwrap(); } } diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index c27b96840d0..fba4f207054 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -567,7 +567,7 @@ mod tests { let mut count = 0; for key_and_mvcc in scan_mvcc(b"z", &[], 30) { - assert!(key_and_mvcc.is_ok()); + key_and_mvcc.unwrap(); count += 1; } assert_eq!(count, 7); diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 31631f34152..07d0093e71c 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -631,9 +631,8 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = SnapshotReader::new(start_ts.into(), snapshot, true); - let ret = reader.get_txn_commit_record(&Key::from_raw(key)); - assert!(ret.is_ok()); - match ret.unwrap().info() { + let ret = reader.get_txn_commit_record(&Key::from_raw(key)).unwrap(); + match ret.info() { None => {} Some((_, 
write_type)) => { assert_eq!(write_type, WriteType::Rollback); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a5343b234ac..3dd95d4045d 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -413,7 +413,7 @@ pub(crate) mod tests { must_commit(&engine, k1, 4, 5); // After delete "k1", insert returns ok. - assert!(try_prewrite_insert(&engine, k1, v2, k1, 6).is_ok()); + try_prewrite_insert(&engine, k1, v2, k1, 6).unwrap(); must_commit(&engine, k1, 6, 7); // Rollback @@ -434,7 +434,7 @@ pub(crate) mod tests { must_rollback(&engine, k1, 12, false); // After delete "k1", insert returns ok. - assert!(try_prewrite_insert(&engine, k1, v2, k1, 13).is_ok()); + try_prewrite_insert(&engine, k1, v2, k1, 13).unwrap(); must_commit(&engine, k1, 13, 14); } @@ -453,9 +453,9 @@ pub(crate) mod tests { must_commit(&engine, k1, 4, 5); // After delete "k1", check_not_exists returns ok. - assert!(try_prewrite_check_not_exists(&engine, k1, k1, 6).is_ok()); + try_prewrite_check_not_exists(&engine, k1, k1, 6).unwrap(); - assert!(try_prewrite_insert(&engine, k1, v2, k1, 7).is_ok()); + try_prewrite_insert(&engine, k1, v2, k1, 7).unwrap(); must_commit(&engine, k1, 7, 8); // Rollback @@ -472,7 +472,7 @@ pub(crate) mod tests { must_rollback(&engine, k1, 13, false); // After delete "k1", check_not_exists returns ok. 
- assert!(try_prewrite_check_not_exists(&engine, k1, k1, 14).is_ok()); + try_prewrite_check_not_exists(&engine, k1, k1, 14).unwrap(); } #[test] diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index fb32f767bd5..66194cd08fa 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1392,7 +1392,7 @@ mod tests { let cmd: TypedCommand<()> = req.into(); let (cb, f) = paired_future_callback(); scheduler.run_cmd(cmd.cmd, StorageCallback::Boolean(cb)); - assert!(block_on(f).unwrap().is_ok()); + block_on(f).unwrap().unwrap(); } #[test] @@ -1450,7 +1450,7 @@ mod tests { let cmd: TypedCommand<()> = req.into(); let (cb, f) = paired_future_callback(); scheduler.run_cmd(cmd.cmd, StorageCallback::Boolean(cb)); - assert!(block_on(f).unwrap().is_ok()); + block_on(f).unwrap().unwrap(); } #[test] @@ -1516,7 +1516,7 @@ mod tests { let cmd: TypedCommand = req.into(); let (cb, f) = paired_future_callback(); scheduler.run_cmd(cmd.cmd, StorageCallback::TxnStatus(cb)); - assert!(block_on(f).unwrap().is_ok()); + block_on(f).unwrap().unwrap(); } #[test] @@ -1574,7 +1574,7 @@ mod tests { let cmd: TypedCommand<()> = req.into(); let (cb, f) = paired_future_callback(); scheduler.run_cmd(cmd.cmd, StorageCallback::Boolean(cb)); - assert!(block_on(f).is_ok()); + block_on(f).unwrap().unwrap(); } #[test] diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 2cd4afaf932..c85bd828c08 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -970,18 +970,16 @@ mod tests { let bound_b = Key::from_encoded(b"b".to_vec()); let bound_c = Key::from_encoded(b"c".to_vec()); let bound_d = Key::from_encoded(b"d".to_vec()); - assert!(store.scanner(false, false, false, None, None).is_ok()); - assert!( - store - .scanner( - false, - false, - false, - Some(bound_b.clone()), - Some(bound_c.clone()) - ) - .is_ok() - ); + store.scanner(false, false, false, None, None).unwrap(); + store + .scanner( + false, + false, + false, + 
Some(bound_b.clone()), + Some(bound_c.clone()), + ) + .unwrap(); assert!( store .scanner( @@ -1021,22 +1019,16 @@ mod tests { Default::default(), false, ); - assert!(store2.scanner(false, false, false, None, None).is_ok()); - assert!( - store2 - .scanner(false, false, false, Some(bound_a.clone()), None) - .is_ok() - ); - assert!( - store2 - .scanner(false, false, false, Some(bound_a), Some(bound_b)) - .is_ok() - ); - assert!( - store2 - .scanner(false, false, false, None, Some(bound_c)) - .is_ok() - ); + store2.scanner(false, false, false, None, None).unwrap(); + store2 + .scanner(false, false, false, Some(bound_a.clone()), None) + .unwrap(); + store2 + .scanner(false, false, false, Some(bound_a), Some(bound_b)) + .unwrap(); + store2 + .scanner(false, false, false, None, Some(bound_c)) + .unwrap(); } fn gen_fixture_store() -> FixtureStore { diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index 4c94aeb1249..c97bdd72fac 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -152,7 +152,7 @@ fn bench_async_snapshots_noop(b: &mut test::Bencher) { b.iter(|| { let cb1: EngineCallback> = Box::new(move |res| { - assert!(res.is_ok()); + res.unwrap(); }); let cb2: EngineCallback> = Box::new(move |res| { if let Ok(CmdRes::Snap(snap)) = res { diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index be027ae7217..f1b135ef86a 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -67,7 +67,7 @@ fn ensure_disk_usage_is_reported( let peer = new_peer(store_id, peer_id); let key = region.get_start_key(); let ch = async_read_on_peer(cluster, peer, region.clone(), key, true, true); - assert!(ch.recv_timeout(Duration::from_secs(1)).is_ok()); + ch.recv_timeout(Duration::from_secs(1)).unwrap(); } fn test_disk_full_leader_behaviors(usage: DiskUsage) { diff --git a/tests/failpoints/cases/test_merge.rs 
b/tests/failpoints/cases/test_merge.rs index 713ab4c5a5d..92785fcfa1e 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1335,22 +1335,20 @@ fn test_merge_with_concurrent_pessimistic_locking() { let snapshot = cluster.must_get_snapshot_of_region(left.id); let txn_ext = snapshot.txn_ext.unwrap(); - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![( - Key::from_raw(b"k0"), - PessimisticLock { - primary: b"k0".to_vec().into_boxed_slice(), - start_ts: 10.into(), - ttl: 3000, - for_update_ts: 20.into(), - min_commit_ts: 30.into(), - }, - )]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![( + Key::from_raw(b"k0"), + PessimisticLock { + primary: b"k0".to_vec().into_boxed_slice(), + start_ts: 10.into(), + ttl: 3000, + for_update_ts: 20.into(), + min_commit_ts: 30.into(), + }, + )]) + .unwrap(); let addr = cluster.sim.rl().get_addr(1); let env = Arc::new(Environment::new(1)); @@ -1436,16 +1434,14 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![ - (Key::from_raw(b"k0"), lock.clone()), - (Key::from_raw(b"k1"), lock), - ]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![ + (Key::from_raw(b"k0"), lock.clone()), + (Key::from_raw(b"k1"), lock), + ]) + .unwrap(); let mut mutation = Mutation::default(); mutation.set_op(Op::Put); @@ -1517,13 +1513,11 @@ fn test_retry_pending_prepare_merge_fail() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"k1"), l1)]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"k1"), l1)]) + .unwrap(); // Pause apply and write some data to the left region fail::cfg("on_handle_apply", "pause").unwrap(); @@ -1593,13 +1587,11 @@ fn test_merge_pessimistic_locks_propose_fail() { 
for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"k1"), lock)]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"k1"), lock)]) + .unwrap(); fail::cfg("raft_propose", "pause").unwrap(); diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 5eba2b298a1..22871994f82 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -118,7 +118,6 @@ fn test_load_global_config() { ) .await }); - assert!(res.is_ok()); for (k, v) in res.unwrap() { assert_eq!(k, format!("/global/config/{}", v)) } diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 92aee023fa5..6a67e83ef1b 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -80,7 +80,7 @@ fn test_follower_slow_split() { // After the follower split success, it will response to the pending vote. fail::cfg("apply_before_split_1_3", "off").unwrap(); - assert!(rx.recv_timeout(Duration::from_millis(100)).is_ok()); + rx.recv_timeout(Duration::from_millis(100)).unwrap(); } #[test] @@ -164,7 +164,7 @@ fn test_split_lost_request_vote() { // After the follower split success, it will response to the pending vote. 
fail::cfg("apply_after_split_1_3", "off").unwrap(); - assert!(rx.recv_timeout(Duration::from_millis(100)).is_ok()); + rx.recv_timeout(Duration::from_millis(100)).unwrap(); } fn gen_split_region() -> (Region, Region, Region) { @@ -951,14 +951,12 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { }; { let mut locks = txn_ext.pessimistic_locks.write(); - assert!( - locks - .insert(vec![ - (Key::from_raw(b"a"), lock_a), - (Key::from_raw(b"c"), lock_c) - ]) - .is_ok() - ); + locks + .insert(vec![ + (Key::from_raw(b"a"), lock_a), + (Key::from_raw(b"c"), lock_c), + ]) + .unwrap(); } let mut mutation = Mutation::default(); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 85dfe054c63..17e9957d947 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -1311,7 +1311,7 @@ fn test_resolve_lock_deadline() { }), ) .unwrap(); - assert!(rx.recv().unwrap().is_ok()); + rx.recv().unwrap().unwrap(); // Resolve lock, this needs two rounds, two process_read and two process_write. // So it needs more than 400ms. It will exceed the deadline. diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 419d923b0d7..c9f7a70ee09 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -334,7 +334,7 @@ fn test_max_commit_ts_error() { assert!(res.one_pc_commit_ts.is_zero()); // There should not be any memory lock left. - assert!(cm.read_range_check(None, None, |_, _| Err(())).is_ok()); + cm.read_range_check(None, None, |_, _| Err(())).unwrap(); // Two locks should be written, the second one does not async commit. 
let l1 = must_locked(&storage.get_engine(), b"k1", 10); @@ -566,13 +566,11 @@ fn test_concurrent_write_after_transfer_leader_invalidates_locks() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"key"), lock.clone())]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"key"), lock.clone())]) + .unwrap(); let region = cluster.get_region(b""); let leader = region.get_peers()[0].clone(); diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 87b05042a30..9ad2816d3d3 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -118,22 +118,20 @@ fn test_delete_lock_proposed_after_proposing_locks_impl(transfer_msg_count: usiz let snapshot = cluster.must_get_snapshot_of_region(region_id); let txn_ext = snapshot.txn_ext.unwrap(); - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![( - Key::from_raw(b"key"), - PessimisticLock { - primary: b"key".to_vec().into_boxed_slice(), - start_ts: 10.into(), - ttl: 1000, - for_update_ts: 10.into(), - min_commit_ts: 20.into(), - }, - )]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![( + Key::from_raw(b"key"), + PessimisticLock { + primary: b"key".to_vec().into_boxed_slice(), + start_ts: 10.into(), + ttl: 1000, + for_update_ts: 10.into(), + min_commit_ts: 20.into(), + }, + )]) + .unwrap(); let addr = cluster.sim.rl().get_addr(1); let env = Arc::new(Environment::new(1)); @@ -197,22 +195,20 @@ fn test_delete_lock_proposed_before_proposing_locks() { let snapshot = cluster.must_get_snapshot_of_region(region_id); let txn_ext = snapshot.txn_ext.unwrap(); - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![( - Key::from_raw(b"key"), - PessimisticLock { - primary: b"key".to_vec().into_boxed_slice(), - start_ts: 10.into(), - ttl: 1000, - 
for_update_ts: 10.into(), - min_commit_ts: 20.into(), - }, - )]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![( + Key::from_raw(b"key"), + PessimisticLock { + primary: b"key".to_vec().into_boxed_slice(), + start_ts: 10.into(), + ttl: 1000, + for_update_ts: 10.into(), + min_commit_ts: 20.into(), + }, + )]) + .unwrap(); let addr = cluster.sim.rl().get_addr(1); let env = Arc::new(Environment::new(1)); @@ -281,22 +277,20 @@ fn test_read_lock_after_become_follower() { let snapshot = cluster.must_get_snapshot_of_region(region_id); let txn_ext = snapshot.txn_ext.unwrap(); let for_update_ts = block_on(cluster.pd_client.get_tso()).unwrap(); - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![( - Key::from_raw(b"key"), - PessimisticLock { - primary: b"key".to_vec().into_boxed_slice(), - start_ts, - ttl: 1000, - for_update_ts, - min_commit_ts: for_update_ts, - }, - )]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![( + Key::from_raw(b"key"), + PessimisticLock { + primary: b"key".to_vec().into_boxed_slice(), + start_ts, + ttl: 1000, + for_update_ts, + min_commit_ts: for_update_ts, + }, + )]) + .unwrap(); let addr = cluster.sim.rl().get_addr(3); let env = Arc::new(Environment::new(1)); diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index 96299de22a3..fa45d08b24a 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -223,8 +223,8 @@ raft-log-gc-threshold = 2000 50 ); // config update from config file - assert!(cfg_controller.update_from_toml_file().is_ok()); - // after update this configuration item should be constant with the modified + cfg_controller.update_from_toml_file().unwrap(); + // after update this configration item should be constant with the modified // configuration file assert_eq!( cfg_controller diff --git a/tests/integrations/pd/test_rpc_client.rs 
b/tests/integrations/pd/test_rpc_client.rs index 3a3967c25a8..a6ac43235f3 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -32,11 +32,11 @@ fn test_retry_rpc_client() { server.stop(); let child = thread::spawn(move || { let cfg = new_config(m_eps); - assert_eq!(RpcClient::new(&cfg, None, m_mgr).is_ok(), true); + RpcClient::new(&cfg, None, m_mgr).unwrap(); }); thread::sleep(Duration::from_millis(500)); server.start(&mgr, eps); - assert_eq!(child.join().is_ok(), true); + child.join().unwrap(); } #[test] diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 9cff738fdfe..d378c55c5e6 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1294,13 +1294,11 @@ fn test_propose_in_memory_pessimistic_locks() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"k1"), l1.clone())]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"k1"), l1.clone())]) + .unwrap(); // Insert lock l2 into the right region let snapshot = cluster.must_get_snapshot_of_region(right.id); @@ -1312,13 +1310,11 @@ fn test_propose_in_memory_pessimistic_locks() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"k3"), l2.clone())]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"k3"), l2.clone())]) + .unwrap(); // Merge left region into the right region pd_client.must_merge(left.id, right.id); @@ -1386,7 +1382,7 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { let res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); - assert!(res.recv().is_ok()); + res.recv().unwrap(); assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } @@ -1421,13 +1417,11 @@ 
fn test_merge_pessimistic_locks_repeated_merge() { for_update_ts: 20.into(), min_commit_ts: 30.into(), }; - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"k1"), lock.clone())]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"k1"), lock.clone())]) + .unwrap(); // Filter MsgAppend, so the proposed PrepareMerge will not succeed cluster.add_send_filter(CloneFilterFactory( diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index d7c527b5fd9..656f6d57d2d 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -822,22 +822,20 @@ fn test_leader_drop_with_pessimistic_lock() { .get_txn_ext() .unwrap() .clone(); - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![( - Key::from_raw(b"k1"), - PessimisticLock { - primary: b"k1".to_vec().into_boxed_slice(), - start_ts: 10.into(), - ttl: 1000, - for_update_ts: 10.into(), - min_commit_ts: 10.into(), - }, - )]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![( + Key::from_raw(b"k1"), + PessimisticLock { + primary: b"k1".to_vec().into_boxed_slice(), + start_ts: 10.into(), + ttl: 1000, + for_update_ts: 10.into(), + min_commit_ts: 10.into(), + }, + )]) + .unwrap(); // Isolate node 1, leader should be transferred to another node. cluster.add_send_filter(IsolationFilterFactory::new(1)); diff --git a/tests/integrations/raftstore/test_replica_read.rs b/tests/integrations/raftstore/test_replica_read.rs index 8961008d4a5..a2ae4ab0f31 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -317,8 +317,8 @@ fn test_read_index_out_of_order() { // After peer 2 is removed, we can get 2 read responses. 
let resp2 = async_read_on_peer(&mut cluster, new_peer(1, 1), r1, b"k1", true, true); - assert!(resp2.recv_timeout(Duration::from_secs(1)).is_ok()); - assert!(resp1.recv_timeout(Duration::from_secs(1)).is_ok()); + resp2.recv_timeout(Duration::from_secs(1)).unwrap(); + resp1.recv_timeout(Duration::from_secs(1)).unwrap(); } #[test] diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 91022892f96..a7664e8ccf0 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -973,14 +973,12 @@ fn test_split_with_in_memory_pessimistic_locks() { }; { let mut locks = txn_ext.pessimistic_locks.write(); - assert!( - locks - .insert(vec![ - (Key::from_raw(b"a"), lock_a.clone()), - (Key::from_raw(b"c"), lock_c.clone()) - ]) - .is_ok() - ); + locks + .insert(vec![ + (Key::from_raw(b"a"), lock_a.clone()), + (Key::from_raw(b"c"), lock_c.clone()), + ]) + .unwrap(); } let region = cluster.get_region(b""); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index 86789fc8f7f..b360bd3da58 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -215,7 +215,7 @@ fn test_transfer_leader_during_snapshot(cluster: &mut Cluster) cluster.transfer_leader(r1, new_peer(2, 2)); let resp = cluster.call_command_on_leader(put, Duration::from_secs(5)); // if it's transferring leader, resp will timeout. 
- assert!(resp.is_ok(), "{:?}", resp); + resp.unwrap(); must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); } @@ -299,11 +299,9 @@ fn test_propose_in_memory_pessimistic_locks() { { let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); assert!(pessimistic_locks.is_writable()); - assert!( - pessimistic_locks - .insert(vec![(Key::from_raw(b"key"), lock.clone())]) - .is_ok() - ); + pessimistic_locks + .insert(vec![(Key::from_raw(b"key"), lock.clone())]) + .unwrap(); } cluster.must_transfer_leader(1, new_peer(2, 2)); @@ -338,13 +336,11 @@ fn test_memory_pessimistic_locks_status_after_transfer_leader_failure() { min_commit_ts: 30.into(), }; // Write a pessimistic lock to the in-memory pessimistic lock table. - assert!( - txn_ext - .pessimistic_locks - .write() - .insert(vec![(Key::from_raw(b"key"), lock)]) - .is_ok() - ); + txn_ext + .pessimistic_locks + .write() + .insert(vec![(Key::from_raw(b"key"), lock)]) + .unwrap(); // Make it fail to transfer leader cluster.add_send_filter(CloneFilterFactory( diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index 4f521cb1da7..59dc776dcca 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -301,7 +301,7 @@ fn test_gc_bypass_raft() { } let gc_sched = cluster.sim.rl().get_gc_worker(1).scheduler(); - assert!(sync_gc(&gc_sched, 0, b"k1".to_vec(), b"k2".to_vec(), 200.into()).is_ok()); + sync_gc(&gc_sched, 0, b"k1".to_vec(), b"k2".to_vec(), 200.into()).unwrap(); for &start_ts in &[10, 20, 30] { let commit_ts = start_ts + 5; diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 95d1494c660..366de3c0493 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1345,7 +1345,7 @@ fn test_prewrite_check_max_commit_ts() { } // There shouldn't be locks remaining in the lock table. 
- assert!(cm.read_range_check(None, None, |_, _| Err(())).is_ok()); + cm.read_range_check(None, None, |_, _| Err(())).unwrap(); } #[test] diff --git a/tests/integrations/server/security.rs b/tests/integrations/server/security.rs index 8243aca6c46..71a0979a005 100644 --- a/tests/integrations/server/security.rs +++ b/tests/integrations/server/security.rs @@ -24,8 +24,7 @@ fn test_check_cn_success() { let channel = ChannelBuilder::new(env).secure_connect(&addr, cred); let client = TikvClient::new(channel); - let status = client.kv_get(&GetRequest::default()); - assert!(status.is_ok()); + client.kv_get(&GetRequest::default()).unwrap(); } #[test] diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index ac9139a6374..84a4de39b25 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -52,7 +52,7 @@ fn test_region_meta_endpoint() { ) .unwrap(); let addr = format!("127.0.0.1:{}", test_util::alloc_port()); - assert!(status_server.start(addr).is_ok()); + status_server.start(addr).unwrap(); let check_task = check(status_server.listening_addr(), region_id); let rt = tokio::runtime::Runtime::new().unwrap(); if let Err(err) = rt.block_on(check_task) { diff --git a/tests/integrations/storage/test_raft_storage.rs b/tests/integrations/storage/test_raft_storage.rs index f828870e964..ef1ee5402e6 100644 --- a/tests/integrations/storage/test_raft_storage.rs +++ b/tests/integrations/storage/test_raft_storage.rs @@ -98,8 +98,9 @@ fn test_raft_storage_get_after_lease() { #[test] fn test_raft_storage_rollback_before_prewrite() { let (_cluster, storage, ctx) = new_raft_storage(); - let ret = storage.rollback(ctx.clone(), vec![Key::from_raw(b"key")], 10); - assert!(ret.is_ok()); + storage + .rollback(ctx.clone(), vec![Key::from_raw(b"key")], 10) + .unwrap(); let ret = storage.prewrite( ctx, vec![Mutation::make_put(Key::from_raw(b"key"), b"value".to_vec())], diff --git 
a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index ec8bf906e1c..7b1aab71183 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -40,25 +40,21 @@ fn test_turnoff_titan() { let size = 5; for i in 0..size { - assert!( - cluster - .put( - format!("k{:02}0", i).as_bytes(), - format!("v{}", i).as_bytes(), - ) - .is_ok() - ); + cluster + .put( + format!("k{:02}0", i).as_bytes(), + format!("v{}", i).as_bytes(), + ) + .unwrap(); } cluster.must_flush_cf(CF_DEFAULT, true); for i in 0..size { - assert!( - cluster - .put( - format!("k{:02}1", i).as_bytes(), - format!("v{}", i).as_bytes(), - ) - .is_ok() - ); + cluster + .put( + format!("k{:02}1", i).as_bytes(), + format!("v{}", i).as_bytes(), + ) + .unwrap(); } cluster.must_flush_cf(CF_DEFAULT, true); for i in cluster.get_node_ids().into_iter() { @@ -96,7 +92,7 @@ fn test_turnoff_titan() { for i in cluster.get_node_ids().into_iter() { let db = cluster.get_engine(i); let opt = vec![("blob_run_mode", "kFallback")]; - assert!(db.set_options_cf(CF_DEFAULT, &opt).is_ok()); + db.set_options_cf(CF_DEFAULT, &opt).unwrap(); } cluster.compact_data(); let mut all_check_pass = true; From 940e1395869e2d92aa91eb2d59380ce894125b70 Mon Sep 17 00:00:00 2001 From: 5kbpers Date: Fri, 29 Jul 2022 18:41:13 +0800 Subject: [PATCH 123/676] raftstore: use force_send to send ApplyRes (#13168) close tikv/tikv#13160 Use force_send to send ApplyRes Signed-off-by: 5kbpers Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/apply.rs | 1 + components/raftstore/src/store/fsm/store.rs | 14 +++++--- components/tikv_util/src/mpsc/mod.rs | 7 +++- tests/failpoints/cases/test_split_region.rs | 39 ++++++++++++++++++++- 4 files changed, 55 insertions(+), 6 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 3b9546a460c..938ea526894 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ 
b/components/raftstore/src/store/fsm/apply.rs @@ -644,6 +644,7 @@ where let is_synced = self.write_to_db(); if !self.apply_res.is_empty() { + fail_point!("before_nofity_apply_res"); let apply_res = mem::take(&mut self.apply_res); self.notifier.notify(apply_res); } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 635ff2c6693..28abf24083b 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -292,16 +292,21 @@ where { fn notify(&self, apply_res: Vec>) { for r in apply_res { - self.router.try_send( - r.region_id, + let region_id = r.region_id; + if let Err(e) = self.router.force_send( + region_id, PeerMsg::ApplyRes { res: ApplyTaskRes::Apply(r), }, - ); + ) { + error!("failed to send apply result"; "region_id" => region_id, "err" => ?e); + } } } fn notify_one(&self, region_id: u64, msg: PeerMsg) { - self.router.try_send(region_id, msg); + if let Err(e) = self.router.force_send(region_id, msg) { + error!("failed to notify apply msg"; "region_id" => region_id, "err" => ?e); + } } fn clone_box(&self) -> Box> { @@ -795,6 +800,7 @@ impl PollHandler, St where for<'a> F: FnOnce(&'a BatchSystemConfig), { + fail_point!("begin_raft_poller"); self.previous_metrics = self.poll_ctx.raft_metrics.ready.clone(); self.poll_ctx.pending_count = 0; self.poll_ctx.ready_count = 0; diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index fbd089ebb9e..ccec5448d0b 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -17,6 +17,7 @@ use std::{ use crossbeam::channel::{ self, RecvError, RecvTimeoutError, SendError, TryRecvError, TrySendError, }; +use fail::fail_point; struct State { sender_cnt: AtomicIsize, @@ -236,7 +237,11 @@ impl LooseBoundedSender { #[inline] pub fn try_send(&self, t: T) -> Result<(), TrySendError> { let cnt = self.tried_cnt.get(); - if cnt < CHECK_INTERVAL { + let check_interval = 
|| { + fail_point!("loose_bounded_sender_check_interval", |_| 0); + CHECK_INTERVAL + }; + if cnt < check_interval() { self.tried_cnt.set(cnt + 1); } else if self.len() < self.limit { self.tried_cnt.set(1); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 6a67e83ef1b..aab1fe3d879 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -21,7 +21,7 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::{ - store::{config::Config as RaftstoreConfig, util::is_vote_msg, Callback}, + store::{config::Config as RaftstoreConfig, util::is_vote_msg, Callback, PeerMsg}, Result, }; use test_raftstore::*; @@ -1061,3 +1061,40 @@ fn test_split_replace_skip_log_gc() { cluster.must_put(b"k4", b"v4"); must_get_equal(&cluster.get_engine(2), b"k4", b"v4"); } + +#[test] +fn test_split_store_channel_full() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.notify_capacity = 10; + cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); + cluster.cfg.raft_store.messages_per_tick = 1; + let pd_client = cluster.pd_client.clone(); + pd_client.disable_default_operator(); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k2", b"v2"); + let region = pd_client.get_region(b"k2").unwrap(); + let apply_fp = "before_nofity_apply_res"; + fail::cfg(apply_fp, "pause").unwrap(); + let (tx, rx) = mpsc::channel(); + cluster.split_region( + ®ion, + b"k2", + Callback::write(Box::new(move |_| tx.send(()).unwrap())), + ); + rx.recv().unwrap(); + let sender_fp = "loose_bounded_sender_check_interval"; + fail::cfg(sender_fp, "return").unwrap(); + let store_fp = "begin_raft_poller"; + fail::cfg(store_fp, "pause").unwrap(); + let raft_router = cluster.sim.read().unwrap().get_router(1).unwrap(); + for _ in 0..50 { + raft_router.force_send(1, PeerMsg::Noop).unwrap(); + } + fail::remove(apply_fp); + 
fail::remove(store_fp); + sleep_ms(300); + let region = pd_client.get_region(b"k1").unwrap(); + assert_ne!(region.id, 1); + fail::remove(sender_fp); +} From f96c66015da0961a5e2836c1e72d165004b471e6 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 29 Jul 2022 21:45:13 +0800 Subject: [PATCH 124/676] metrics: fix wrong expression for cdc cpu usage (#13148) close tikv/tikv#13147 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 5da0ca7c0d3..8189e45d3d2 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -7717,7 +7717,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdcwkr.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cdcwkr.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - worker", @@ -7725,7 +7725,7 @@ "step": 4 }, { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"tso\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"tso\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - tso", @@ -7823,7 +7823,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", name=~\"cdc_.*\"}[1m])) by (instance)", + "expr": 
"sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"cdc_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", From 9120fe675cf6389d880959caf101726869a9a92e Mon Sep 17 00:00:00 2001 From: haojinming Date: Mon, 1 Aug 2022 10:18:04 +0800 Subject: [PATCH 125/676] CDC: fix rawkv resolved ts issue (#13142) close tikv/tikv#13144 Signed-off-by: haojinming --- components/cdc/src/delegate.rs | 57 ++++++- components/cdc/src/endpoint.rs | 84 +++++++-- components/cdc/src/initializer.rs | 1 + components/cdc/src/observer.rs | 273 +++++++++++++++++++++++++++++- components/cdc/tests/mod.rs | 2 +- components/server/src/server.rs | 2 +- 6 files changed, 391 insertions(+), 28 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 10de563c4fc..f6ef0659fe0 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -24,7 +24,7 @@ use kvproto::{ }, }; use raftstore::{ - coprocessor::{Cmd, CmdBatch, ObserveHandle}, + coprocessor::{Cmd, CmdBatch, ObserveHandle, ObserveID}, store::util::compare_region_epoch, Error as RaftStoreError, }; @@ -614,19 +614,20 @@ impl Delegate { rows.push(v); } self.sink_downstream(rows, index, ChangeDataRequestKvApi::TiDb)?; - self.sink_raw_downstream(raw_rows, index) + self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) } - fn sink_raw_downstream(&mut self, entries: Vec, index: u64) -> Result<()> { - if entries.is_empty() { - return Ok(()); + pub fn raw_untrack_ts(&mut self, cdc_id: ObserveID, max_ts: TimeStamp) { + // Stale CmdBatch, drop it silently. + if cdc_id != self.handle.id { + return; } // the entry's timestamp is non-decreasing, the last has the max ts. 
- let max_raw_ts = TimeStamp::from(entries.last().unwrap().commit_ts); + // use prev ts, see reason at CausalObserver::pre_propose_query + let max_raw_ts = max_ts.prev(); match self.resolver { Some(ref mut resolver) => { - // use prev ts, see reason at CausalObserver::pre_propose_query - resolver.raw_untrack_lock(max_raw_ts.prev()); + resolver.raw_untrack_lock(max_raw_ts); } None => { assert!(self.pending.is_some(), "region resolver not ready"); @@ -636,7 +637,6 @@ impl Delegate { .push(PendingLock::RawUntrack { ts: max_raw_ts }); } } - self.sink_downstream(entries, index, ChangeDataRequestKvApi::RawKv) } pub fn raw_track_ts(&mut self, ts: TimeStamp) { @@ -908,6 +908,16 @@ impl Delegate { // To inform transaction layer no more old values are required for the region. self.txn_extra_op.store(TxnExtraOp::Noop); } + + // if raw data and tidb data both exist in this region, it will return false. + pub fn is_raw_region(&self) -> bool { + if let Some(region) = &self.region { + ApiV2::parse_range_mode((Some(®ion.start_key), Some(®ion.end_key))) + == KeyMode::Raw + } else { + false + } + } } fn set_event_row_type(row: &mut EventRow, ty: EventLogType) { @@ -1265,4 +1275,33 @@ mod tests { } } } + + #[test] + fn test_is_raw_region() { + let region_id = 10; + let mut region = Region::default(); + region.set_id(region_id); + + // start-key, end-key, is_raw + let test_cases = vec![ + (vec![b'r', 0, 0, 0, b'a'], vec![b'r', 0, 0, 0, b'z'], true), + (vec![b'a', 0, 0, 0, b'a'], vec![b'r', 0, 0, 0, b'z'], false), + (vec![b'r', 0, 0, 0, b'a'], vec![b'z', 0, 0, 0, b'z'], false), + (vec![b'r', 0, 0, 0, b'a'], vec![b's'], true), + (vec![b'r', 0, 0, 0, b'a'], vec![], false), + (vec![], vec![], false), + ]; + for (start_key, end_key, is_raw) in &test_cases { + region.set_start_key(start_key.clone()); + region.set_end_key(end_key.clone()); + let resolver = Resolver::new(region_id); + let mut delegate = Delegate::new(region_id, Default::default()); + assert!( + delegate + 
.on_region_ready(resolver, region.clone()) + .is_empty() + ); + assert_eq!(delegate.is_raw_region(), *is_raw); + } + } } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 22cb5b94922..4a957774a23 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -59,6 +59,7 @@ use crate::{ delegate::{on_init_downstream, Delegate, Downstream, DownstreamID, DownstreamState}, initializer::Initializer, metrics::*, + observer::RawRegionTs, old_value::{OldValueCache, OldValueCallback}, service::{Conn, ConnID, FeatureGate}, CdcObserver, Error, @@ -177,6 +178,9 @@ pub enum Task { region_id: u64, ts: TimeStamp, }, + RawUntrackTs { + raw_region_ts: Vec, + }, } impl_display_as_debug!(Task); @@ -256,6 +260,10 @@ impl fmt::Debug for Task { .field("region_id", ®ion_id) .field("ts", &ts) .finish(), + Task::RawUntrackTs { ref raw_region_ts } => de + .field("type", &"raw_untrack_ts") + .field("raw_ts", raw_region_ts) + .finish(), } } } @@ -859,6 +867,19 @@ impl, E: KvEngine> Endpoint { flush_oldvalue_stats(&statistics, TAG_DELTA_CHANGE); } + pub fn on_raw_untrack_ts(&mut self, batch_region_ts: Vec) { + for region_ts in batch_region_ts { + let region_id = region_ts.region_id; + if let Some(delegate) = self.capture_regions.get_mut(®ion_id) { + if delegate.has_failed() { + // Skip the batch if the delegate has failed. + continue; + } + delegate.raw_untrack_ts(region_ts.cdc_id, region_ts.max_ts); + } + } + } + fn on_region_ready(&mut self, observe_id: ObserveID, resolver: Resolver, region: Region) { let region_id = region.get_id(); let mut failed_downstreams = Vec::new(); @@ -954,7 +975,7 @@ impl, E: KvEngine> Endpoint { // The judge of raw region is not accuracy here, and we may miss at most one // "normal" raw region. But this will not break the correctness of outlier // detection. 
- if resolved_ts.is_min_ts_from_raw() { + if resolved_ts.is_min_ts_from_raw() || delegate.is_raw_region() { raw_resolved_regions.push(region_id, resolved_ts.raw_ts) } @@ -1335,6 +1356,7 @@ impl, E: KvEngine> Runnable for Endpoint { }, Task::ChangeConfig(change) => self.on_change_cfg(change), Task::RawTrackTs { region_id, ts } => self.on_raw_track_ts(region_id, ts), + Task::RawUntrackTs { raw_region_ts } => self.on_raw_untrack_ts(raw_region_ts), } } } @@ -1507,7 +1529,7 @@ mod tests { .unwrap() .kv_engine() }), - CdcObserver::new(task_sched), + CdcObserver::new(task_sched, api_version), Arc::new(StdMutex::new(StoreMeta::new(0))), ConcurrencyManager::new(1.into()), Arc::new(Environment::new(1)), @@ -2109,6 +2131,13 @@ mod tests { let ts = TimeStamp::compose(i, 0); suite.run(Task::RawTrackTs { region_id, ts }); } + suite.run(Task::RawUntrackTs { + raw_region_ts: vec![RawRegionTs { + region_id, + cdc_id: observe_id, + max_ts: TimeStamp::compose(125, 0), + }], + }); // untrack ts before 125 let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); // region is not ready, so raw lock in resolver, raw ts is added to // delegate.pending. 
@@ -2131,7 +2160,7 @@ mod tests { let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); let resolver = delegate.resolver.as_mut().unwrap(); let raw_resolved_ts = resolver.resolve(TimeStamp::compose(200, 0)).min(); - assert_eq!(raw_resolved_ts, TimeStamp::compose(100, 0)); + assert_eq!(raw_resolved_ts, TimeStamp::compose(125, 0)); } #[test] @@ -2144,7 +2173,7 @@ mod tests { let quota = crate::channel::MemoryQuota::new(usize::MAX); let (tx, _) = channel::channel(1, quota); let mut region_cnt = 0; - let mut start_ts: u64 = 200; + let start_ts: u64 = 200; let region_ids: Vec = (1..50).collect(); let dead_lock_region = 1; let dead_lock_ts = TimeStamp::compose(1, 0); @@ -2185,6 +2214,8 @@ mod tests { let mut region = Region::default(); region.id = region_id; region.set_region_epoch(region_epoch); + region.set_start_key(vec![b'r', 0, 0, 0, b'a']); + region.set_end_key(vec![b'r', 0, 0, 0, b'z']); let resolver = Resolver::new(region_id); suite.run(Task::ResolverReady { observe_id, @@ -2200,14 +2231,17 @@ mod tests { let ts = if region_id == dead_lock_region { dead_lock_ts } else { - TimeStamp::compose(start_ts, 0) + TimeStamp::compose(start_ts + 1, 0) }; - start_ts += 1; - suite.run(Task::RawTrackTs { region_id, ts }); - let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); - let resolver = delegate.resolver.as_mut().unwrap(); - let raw_resolved_ts = resolver.resolve(cur_tso).min(); - assert_eq!(raw_resolved_ts, ts); + // Only 9 region is min_ts_from_raw, but other regions are raw regions, + // Them can also be counted. 
+ if region_id < 10 { + suite.run(Task::RawTrackTs { region_id, ts }); + let delegate = suite.endpoint.capture_regions.get_mut(®ion_id).unwrap(); + let resolver = delegate.resolver.as_mut().unwrap(); + let raw_resolved_ts = resolver.resolve(cur_tso).min(); + assert_eq!(raw_resolved_ts, ts); + } } let ob_id = suite .endpoint @@ -2245,6 +2279,34 @@ mod tests { .is_none(), true ); + let untrack_region_id = 20; + let cdc_id = suite + .endpoint + .capture_regions + .get(&untrack_region_id) + .unwrap() + .handle + .id; + let region_ts = RawRegionTs { + region_id: untrack_region_id, + cdc_id, + max_ts: TimeStamp::compose(1000, 0), + }; + suite.run(Task::RawUntrackTs { + raw_region_ts: vec![region_ts], + }); + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + let delegate = suite + .endpoint + .capture_regions + .get_mut(&untrack_region_id) + .unwrap(); + let resolver = delegate.resolver.as_mut().unwrap(); + let raw_resolved_ts = resolver.resolve(cur_tso).min(); + assert_eq!(raw_resolved_ts, cur_tso); // region is untracked. 
} #[test] diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 3be509e73d0..98720b7cf0c 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -237,6 +237,7 @@ impl Initializer { Scanner::TxnKvScanner(txnkv_scanner) } else { let mut iter_opt = IterOptions::default(); + iter_opt.set_fill_cache(false); let (raw_key_prefix, raw_key_prefix_end) = ApiV2::get_rawkv_range(); iter_opt.set_lower_bound(&[raw_key_prefix], DATA_KEY_PREFIX_LEN); iter_opt.set_upper_bound(&[raw_key_prefix_end], DATA_KEY_PREFIX_LEN); diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 18b4d995077..124757d7697 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -2,16 +2,21 @@ use std::sync::{Arc, RwLock}; +use api_version::{ApiV2, KeyMode, KvFormat}; use causal_ts::{Error as CausalTsError, RawTsTracker, Result as CausalTsResult}; use collections::HashMap; use engine_traits::KvEngine; use fail::fail_point; -use kvproto::metapb::{Peer, Region}; +use kvproto::{ + kvrpcpb::ApiVersion, + metapb::{Peer, Region}, + raft_cmdpb::CmdType, +}; use raft::StateRole; use raftstore::{coprocessor::*, store::RegionSnapshot, Error as RaftStoreError}; use tikv::storage::Statistics; -use tikv_util::{box_err, error, warn, worker::Scheduler}; -use txn_types::TimeStamp; +use tikv_util::{box_err, defer, error, warn, worker::Scheduler}; +use txn_types::{Key, TimeStamp}; use crate::{ endpoint::{Deregister, Task}, @@ -19,6 +24,14 @@ use crate::{ Error as CdcError, }; +// max_ts presents the max ts in one batch. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct RawRegionTs { + pub region_id: u64, + pub cdc_id: ObserveID, + pub max_ts: TimeStamp, +} + /// An Observer for CDC. /// /// It observes raftstore internal events, such as: @@ -30,6 +43,7 @@ pub struct CdcObserver { // A shared registry for managing observed regions. 
// TODO: it may become a bottleneck, find a better way to manage the registry. observe_regions: Arc>>, + api_version: ApiVersion, } impl CdcObserver { @@ -37,10 +51,11 @@ impl CdcObserver { /// /// Events are strong ordered, so `sched` must be implemented as /// a FIFO queue. - pub fn new(sched: Scheduler) -> CdcObserver { + pub fn new(sched: Scheduler, api_version: ApiVersion) -> CdcObserver { CdcObserver { sched, observe_regions: Arc::default(), + api_version, } } @@ -91,6 +106,66 @@ impl CdcObserver { .get(®ion_id) .cloned() } + + fn untrack_raw_ts(&self, raw_region_ts: Vec) { + if raw_region_ts.is_empty() { + return; + } + if let Err(e) = self.sched.schedule(Task::RawUntrackTs { raw_region_ts }) { + warn!("cdc schedule task failed"; "error" => ?e); + } + } + + // parse rawkv cmd from CmdBatch Vec and return the max ts of every region. + pub fn get_raw_region_ts(&self, cmd_batches: &Vec) -> Vec { + if self.api_version != ApiVersion::V2 { + return vec![]; + } + let mut region_ts = vec![]; + for batch in cmd_batches { + if batch.is_empty() { + continue; + } + let region_id = batch.region_id; + let cdc_id = batch.cdc_id; + if !self + .is_subscribed(region_id) + .map_or(false, |ob_id| ob_id == cdc_id) + { + continue; + } + // Find the max ts in one batch + // The raw request's ts is non-decreasing, only need find the last one. 
+ batch.cmds.iter().rfind(|cmd| { + if let Some(last_key) = cmd + .request + .get_requests() + .iter() + .rfind(|req| { + CmdType::Put == req.get_cmd_type() + && ApiV2::parse_key_mode(req.get_put().get_key()) == KeyMode::Raw + }) + .map(|req| req.get_put().get_key()) + { + match ApiV2::decode_raw_key_owned(Key::from_encoded_slice(last_key), true) { + Ok((_, ts)) => { + region_ts.push(RawRegionTs { + region_id, + cdc_id, + max_ts: ts.unwrap(), + }); + } + // error is ignored, raw dead lock is resolved in Endpoint::on_min_ts + Err(e) => warn!("decode raw key fails"; "err" => ?e), + } + true + } else { + false + } + }); + } + region_ts + } } impl Coprocessor for CdcObserver {} @@ -106,6 +181,13 @@ impl CmdObserver for CdcObserver { ) { assert!(!cmd_batches.is_empty()); fail_point!("before_cdc_flush_apply"); + + // Untrack raw ts regardless of the ob level. + // Because RawKV locks is tracked regardless of observe level as it is in Raft + // propose procedure and can not get an accurate observe level. 
+ let raw_region_ts = self.get_raw_region_ts(cmd_batches); + defer!(self.untrack_raw_ts(raw_region_ts)); + if max_level < ObserveLevel::All { return; } @@ -219,7 +301,11 @@ mod tests { use std::time::Duration; use engine_rocks::RocksEngine; - use kvproto::metapb::Region; + use engine_traits::CF_WRITE; + use kvproto::{ + metapb::Region, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse, Request}, + }; use raftstore::{coprocessor::RoleChange, store::util::new_peer}; use tikv::storage::kv::TestEngineBuilder; @@ -228,7 +314,7 @@ mod tests { #[test] fn test_register_and_deregister() { let (scheduler, mut rx) = tikv_util::worker::dummy_scheduler(); - let observer = CdcObserver::new(scheduler); + let observer = CdcObserver::new(scheduler, ApiVersion::V1); let observe_info = CmdObserveInfo::from_handle( ObserveHandle::new(), ObserveHandle::new(), @@ -368,4 +454,179 @@ mod tests { observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); } + + fn put_cf(cf: &str, key: &[u8], value: &[u8]) -> Request { + let mut cmd = Request::default(); + cmd.set_cmd_type(CmdType::Put); + cmd.mut_put().set_cf(cf.to_owned()); + cmd.mut_put().set_key(key.to_vec()); + cmd.mut_put().set_value(value.to_vec()); + cmd + } + + #[test] + fn test_get_raw_region_ts() { + let (scheduler, mut rx) = tikv_util::worker::dummy_scheduler(); + let observer = CdcObserver::new(scheduler, ApiVersion::V2); + let region_id = 1; + let mut cmd = Cmd::new(0, 0, RaftCmdRequest::default(), RaftCmdResponse::default()); + cmd.request.mut_requests().clear(); + // Both cdc and resolved-ts worker are observing + let observe_info = CmdObserveInfo::from_handle( + ObserveHandle::new(), + ObserveHandle::new(), + ObserveHandle::default(), + ); + let mut cb = CmdBatch::new(&observe_info, region_id); + cb.push(&observe_info, region_id, cmd.clone()); + let cmd_batches = vec![cb]; + let ret = observer.get_raw_region_ts(&cmd_batches); + assert!(ret.is_empty()); + + 
let data = vec![put_cf(CF_WRITE, b"k7", b"v"), put_cf(CF_WRITE, b"k8", b"v")]; + for put in &data { + cmd.request.mut_requests().push(put.clone()); + } + let mut cb = CmdBatch::new(&observe_info, region_id); + cb.push(&observe_info, region_id, cmd.clone()); + let cmd_batches = vec![cb]; + let ret = observer.get_raw_region_ts(&cmd_batches); + assert!(ret.is_empty()); // no apiv2 key + cmd.request.mut_requests().clear(); + let data = vec![ + put_cf( + CF_WRITE, + ApiV2::encode_raw_key(b"ra", Some(TimeStamp::from(100))).as_encoded(), + b"v1", + ), + put_cf( + CF_WRITE, + ApiV2::encode_raw_key(b"rb", Some(TimeStamp::from(200))).as_encoded(), + b"v2", + ), + ]; + for put in &data { + cmd.request.mut_requests().push(put.clone()); + } + let mut cb1 = CmdBatch::new(&observe_info, region_id); + cb1.push(&observe_info, region_id, cmd.clone()); + let mut cmd2 = Cmd::new(0, 0, RaftCmdRequest::default(), RaftCmdResponse::default()); + cmd2.request.mut_requests().clear(); + let data2 = vec![ + put_cf( + CF_WRITE, + ApiV2::encode_raw_key(b"ra", Some(TimeStamp::from(300))).as_encoded(), + b"v1", + ), + put_cf( + CF_WRITE, + ApiV2::encode_raw_key(b"rb", Some(TimeStamp::from(400))).as_encoded(), + b"v2", + ), + ]; + for put in &data2 { + cmd2.request.mut_requests().push(put.clone()); + } + let mut cb2 = CmdBatch::new(&observe_info, region_id + 1); + cb2.push(&observe_info, region_id + 1, cmd2.clone()); + let mut cmd_batches = vec![cb1.clone(), cb2.clone()]; + let ret = observer.get_raw_region_ts(&cmd_batches); + assert_eq!(ret.len(), 0); // region is not subscribed. + observer.subscribe_region(region_id, observe_info.cdc_id.id); + observer.subscribe_region(region_id + 1, observe_info.cdc_id.id); + let ret = observer.get_raw_region_ts(&cmd_batches); + assert_eq!(ret.len(), 2); // two batch and both subscribed. 
+ assert_eq!( + ret[0], + RawRegionTs { + region_id, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(200) + } + ); + assert_eq!( + ret[1], + RawRegionTs { + region_id: region_id + 1, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(400) + } + ); + let engine = TestEngineBuilder::new().build().unwrap().get_rocksdb(); + >::on_flush_applied_cmd_batch( + &observer, + ObserveLevel::LockRelated, + &mut cmd_batches, + &engine, + ); + // schedule task even if max level is not `All`. + match rx + .recv_timeout(Duration::from_millis(100)) + .unwrap() + .unwrap() + { + Task::RawUntrackTs { raw_region_ts } => { + assert_eq!(raw_region_ts.len(), 2); // two batch and both subscribed. + assert_eq!( + raw_region_ts[0], + RawRegionTs { + region_id, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(200) + } + ); + assert_eq!( + raw_region_ts[1], + RawRegionTs { + region_id: region_id + 1, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(400) + } + ); + } + _ => panic!("unexpected task"), + }; + + // non-rawkv + let data3 = vec![ + put_cf( + CF_WRITE, + ApiV2::encode_raw_key(b"ra", Some(TimeStamp::from(500))).as_encoded(), + b"v1", + ), + put_cf( + CF_WRITE, // this is non-rawkv + ApiV2::encode_raw_key(b"b", Some(TimeStamp::from(600))).as_encoded(), + b"v2", + ), + ]; + let mut cmd3 = Cmd::new(0, 0, RaftCmdRequest::default(), RaftCmdResponse::default()); + for put in &data3 { + cmd3.request.mut_requests().push(put.clone()); + } + cb2.push(&observe_info, region_id + 1, cmd3.clone()); + let cmd_batches = vec![cb1, cb2]; + let ret = observer.get_raw_region_ts(&cmd_batches); + assert_eq!(ret.len(), 2); // two batch and both subscribed. 
+ assert_eq!( + ret[0], + RawRegionTs { + region_id, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(200) + } + ); + assert_eq!( + ret[1], + RawRegionTs { + region_id: region_id + 1, + cdc_id: observe_info.cdc_id.id, + max_ts: TimeStamp::from(500) // 600 is not rawkey + } + ); + let (scheduler, _) = tikv_util::worker::dummy_scheduler(); + let observer = CdcObserver::new(scheduler, ApiVersion::V1); + let ret = observer.get_raw_region_ts(&cmd_batches); + assert!(ret.is_empty()); // v1 does nothing. + } } diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 25283951450..63c06551a80 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -156,7 +156,7 @@ impl TestSuiteBuilder { Arc::new(cdc::CdcTxnExtraScheduler::new(worker.scheduler().clone())), ); let scheduler = worker.scheduler(); - let cdc_ob = cdc::CdcObserver::new(scheduler.clone()); + let cdc_ob = cdc::CdcObserver::new(scheduler.clone(), ApiVersion::V1); obs.insert(id, cdc_ob.clone()); sim.coprocessor_hooks.entry(id).or_default().push(Box::new( move |host: &mut CoprocessorHost| { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 425acf6e15c..d8824453a24 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -802,7 +802,7 @@ impl TiKvServer { } // Register cdc. - let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); + let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone(), F::TAG); cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. 
cfg_controller.register( From 829e5396cb8741ae8b5b33a7d2ebed33d46fd7ed Mon Sep 17 00:00:00 2001 From: xiongjiwei Date: Mon, 1 Aug 2022 15:04:05 +0800 Subject: [PATCH 126/676] charset: update the error message about can not convert error (#13155) close tikv/tikv#13156 Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot --- .../src/codec/collation/encoding/ascii.rs | 5 ++++- .../src/codec/collation/encoding/gbk.rs | 5 ++++- .../src/codec/collation/encoding/mod.rs | 21 +++++++++++++++++++ .../src/codec/collation/encoding/utf8.rs | 5 ++++- .../tidb_query_datatype/src/codec/error.rs | 4 ++-- 5 files changed, 35 insertions(+), 5 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs b/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs index fac8c8f3b58..be1b91ae1ea 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/ascii.rs @@ -20,7 +20,10 @@ impl Encoding for EncodingAscii { fn decode(data: BytesRef<'_>) -> Result { for x in data { if !x.is_ascii() { - return Err(Error::cannot_convert_string("ascii")); + return Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + "ascii", + )); } } Ok(Bytes::from(data)) diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs index 26f61da7536..43a6289e640 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs @@ -13,7 +13,10 @@ impl Encoding for EncodingGBK { fn decode(data: BytesRef<'_>) -> Result { match GBK.decode_without_bom_handling_and_without_replacement(data) { Some(v) => Ok(Bytes::from(v.as_bytes())), - None => Err(Error::cannot_convert_string("gbk")), + None => Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + "gbk", + )), } } diff --git 
a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs index 2647446ab7f..b2434105ce5 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/mod.rs @@ -15,3 +15,24 @@ use crate::codec::{ data_type::{Bytes, BytesRef}, Error, Result, }; + +fn format_invalid_char(data: BytesRef<'_>) -> String { + // Max length of the invalid string is '\x00\x00\x00\x00\x00...'(25) we set 32 + // here. + let mut buf = String::with_capacity(32); + const MAX_BYTES_TO_SHOW: usize = 5; + buf.push('\''); + for i in 0..data.len() { + if i > MAX_BYTES_TO_SHOW { + buf.push_str("..."); + break; + } + if data[i].is_ascii() { + buf.push(char::from(data[i])); + } else { + buf.push_str(format!("\\x{:X}", data[i]).as_str()); + } + } + buf.push('\''); + buf +} diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs index d06bf49c025..e83d6e3eb22 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/utf8.rs @@ -11,7 +11,10 @@ impl Encoding for T { fn decode(data: BytesRef<'_>) -> Result { match str::from_utf8(data) { Ok(v) => Ok(Bytes::from(v)), - Err(_) => Err(Error::cannot_convert_string(T::NAME)), + Err(_) => Err(Error::cannot_convert_string( + format_invalid_char(data).as_str(), + T::NAME, + )), } } } diff --git a/components/tidb_query_datatype/src/codec/error.rs b/components/tidb_query_datatype/src/codec/error.rs index 9cb0ee50d18..23e76a124b8 100644 --- a/components/tidb_query_datatype/src/codec/error.rs +++ b/components/tidb_query_datatype/src/codec/error.rs @@ -95,8 +95,8 @@ impl Error { } } - pub fn cannot_convert_string(charset: &str) -> Error { - let msg = format!("cannot convert string from binary to {}", charset); + pub fn 
cannot_convert_string(s: &str, charset: &str) -> Error { + let msg = format!("Cannot convert string {} from binary to {}", s, charset); Error::Eval(msg, ERR_CANNOT_CONVERT_STRING) } From 677548c4ea1676a944cc650eb82275eaee41f551 Mon Sep 17 00:00:00 2001 From: 5kbpers Date: Mon, 1 Aug 2022 23:42:05 +0800 Subject: [PATCH 127/676] raftstore: make `UNREACHABLE_BACKOFF` configurable (#13193) close tikv/tikv#13054 make `UNREACHABLE_BACKOFF` configurable. Signed-off-by: 5kbpers --- components/raftstore/src/store/config.rs | 3 +++ components/raftstore/src/store/fsm/store.rs | 6 +++--- components/raftstore/src/store/worker/check_leader.rs | 2 +- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 6b59eaf71bb..32141a23542 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -283,6 +283,8 @@ pub struct Config { #[doc(hidden)] pub max_snapshot_file_raw_size: ReadableSize, + + pub unreachable_backoff: ReadableDuration, } impl Default for Config { @@ -372,6 +374,7 @@ impl Default for Config { renew_leader_lease_advance_duration: ReadableDuration::secs(0), report_region_buckets_tick_interval: ReadableDuration::secs(10), max_snapshot_file_raw_size: ReadableSize::mb(100), + unreachable_backoff: ReadableDuration::secs(10), } } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 28abf24083b..52d9bebd0ab 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -105,7 +105,6 @@ use crate::{ type Key = Vec; pub const PENDING_MSG_CAP: usize = 100; -const UNREACHABLE_BACKOFF: Duration = Duration::from_secs(10); const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = 
Feature::require(6, 1, 0); // it only makes sense for large region @@ -2682,13 +2681,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_store_unreachable(&mut self, store_id: u64) { let now = Instant::now(); + let unreachable_backoff = self.ctx.cfg.unreachable_backoff.0; if self .fsm .store .last_unreachable_report .get(&store_id) - .map_or(UNREACHABLE_BACKOFF, |t| now.saturating_duration_since(*t)) - < UNREACHABLE_BACKOFF + .map_or(unreachable_backoff, |t| now.saturating_duration_since(*t)) + < unreachable_backoff { return; } diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index 355dca4f168..8821bb6118d 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -83,7 +83,7 @@ impl Runner { meta.region_ranges // get overlapped regions .range((Excluded(start_key), Unbounded)) - .take_while(|(_, id)| end_key > enc_start_key(&meta.regions[id])) + .take_while(|(_, id)| end_key > enc_start_key(&meta.regions[*id])) // get the min `safe_ts` .map(|(_, id)| { registry.get(id).unwrap().safe_ts() diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index e8449624a0f..2988b0cf0a3 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -244,6 +244,7 @@ fn test_serde_custom_tikv_config() { reactive_memory_lock_timeout_tick: 8, report_region_buckets_tick_interval: ReadableDuration::secs(1234), max_snapshot_file_raw_size: ReadableSize::gb(10), + unreachable_backoff: ReadableDuration::secs(111), }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index ea9cf8e4062..0221446683a 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -219,6 
+219,7 @@ reactive-memory-lock-timeout-tick = 8 report-min-resolved-ts-interval = "233ms" report-region-buckets-tick-interval = "1234s" max-snapshot-file-raw-size = "10GB" +unreachable-backoff = "111s" [coprocessor] split-region-on-table = false From 4dbb057238b45b31658eba1111978d6c87fb09b8 Mon Sep 17 00:00:00 2001 From: cosven Date: Tue, 2 Aug 2022 11:30:05 +0800 Subject: [PATCH 128/676] raftstore: add metrics/logs to help debug high commit log duration (#13120) ref tikv/tikv#13060, ref tikv/tikv#13078 In some cases, such as the one mentioned in #13078, the commit log duration became high. In the case, the needed log is not in entry cache and there are many raftlog async fetch tasks. This commit adds a log to show the cache first index and peers' progress when there is any long uncommitted proposal. It also adds a metric to show the duration of the async fetch tasks. Signed-off-by: cosven Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/router/message.rs | 3 + components/raftstore/src/store/config.rs | 15 ++ .../raftstore/src/store/entry_storage.rs | 54 ++-- components/raftstore/src/store/fsm/peer.rs | 14 + components/raftstore/src/store/fsm/store.rs | 2 + components/raftstore/src/store/metrics.rs | 8 + components/raftstore/src/store/msg.rs | 3 + components/raftstore/src/store/peer.rs | 62 +++++ metrics/grafana/tikv_details.json | 252 ++++++++++++++++++ tests/integrations/config/mod.rs | 2 + tests/integrations/config/test-custom.toml | 6 +- 11 files changed, 402 insertions(+), 19 deletions(-) diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 12041f56fe7..87187b30e75 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -145,6 +145,7 @@ pub enum PeerTick { CheckLeaderLease = 7, ReactivateMemoryLock = 8, ReportBuckets = 9, + CheckLongUncommitted = 10, } impl PeerTick { @@ -163,6 +164,7 @@ impl PeerTick { PeerTick::CheckLeaderLease => 
"check_leader_lease", PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", PeerTick::ReportBuckets => "report_buckets", + PeerTick::CheckLongUncommitted => "check_long_uncommitted", } } @@ -178,6 +180,7 @@ impl PeerTick { PeerTick::CheckLeaderLease, PeerTick::ReactivateMemoryLock, PeerTick::ReportBuckets, + PeerTick::CheckLongUncommitted, ]; TICKS } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 32141a23542..ad89d5e7e70 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -281,6 +281,13 @@ pub struct Config { // Interval of scheduling a tick to report region buckets. pub report_region_buckets_tick_interval: ReadableDuration, + /// Interval to check long uncommitted proposals. + #[doc(hidden)] + pub check_long_uncommitted_interval: ReadableDuration, + /// Base threshold of long uncommitted proposal. + #[doc(hidden)] + pub long_uncommitted_base_threshold: ReadableDuration, + #[doc(hidden)] pub max_snapshot_file_raw_size: ReadableSize, @@ -363,6 +370,14 @@ impl Default for Config { raft_msg_flush_interval: ReadableDuration::micros(250), reactive_memory_lock_tick_interval: ReadableDuration::secs(2), reactive_memory_lock_timeout_tick: 5, + check_long_uncommitted_interval: ReadableDuration::secs(10), + /// In some cases, such as rolling upgrade, some regions' commit log + /// duration can be 12 seconds. Before #13078 is merged, + /// the commit log duration can be 2.8 minutes. So maybe + /// 20s is a relatively reasonable base threshold. Generally, + /// the log commit duration is less than 1s. Feel free to adjust + /// this config :) + long_uncommitted_base_threshold: ReadableDuration::secs(20), // They are preserved for compatibility check. 
region_max_size: ReadableSize(0), diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 03054cfcc16..c73e12013fe 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -20,7 +20,7 @@ use kvproto::raft_serverpb::{RaftApplyState, RaftLocalState}; use protobuf::Message; use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError}; use tikv_alloc::TraceEvent; -use tikv_util::{debug, info, worker::Scheduler}; +use tikv_util::{debug, info, time::Instant, warn, worker::Scheduler}; use super::{metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE}; use crate::{bytes_capacity, store::worker::RaftlogFetchTask}; @@ -364,9 +364,10 @@ impl Drop for EntryCache { } } -#[derive(Debug, PartialEq)] +#[derive(Debug)] pub enum RaftlogFetchState { - Fetching, + // The Instant records the start time of the fetching. + Fetching(Instant), Fetched(Box), } @@ -481,26 +482,40 @@ impl EntryStorage { // None indicates cleanning the fetched result. pub fn update_async_fetch_res(&mut self, low: u64, res: Option>) { // If it's in fetching, don't clean the async fetch result. 
- if self.async_fetch_results.borrow().get(&low) == Some(&RaftlogFetchState::Fetching) - && res.is_none() - { - return; + if let Some(RaftlogFetchState::Fetching(_)) = self.async_fetch_results.borrow().get(&low) { + if res.is_none() { + return; + } } match res { Some(res) => { - if let Some(RaftlogFetchState::Fetched(prev)) = self + match self .async_fetch_results .borrow_mut() .insert(low, RaftlogFetchState::Fetched(res)) { - info!( - "unconsumed async fetch res"; - "region_id" => self.region_id, - "peer_id" => self.peer_id, - "res" => ?prev, - "low" => low, - ); + Some(RaftlogFetchState::Fetching(start)) => { + RAFT_ENTRY_FETCHES_TASK_DURATION_HISTOGRAM + .observe(start.saturating_elapsed_secs()); + } + Some(RaftlogFetchState::Fetched(prev)) => { + info!( + "unconsumed async fetch res"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "res" => ?prev, + "low" => low, + ); + } + _ => { + warn!( + "unknown async fetch res"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "low" => low, + ); + } } } None => { @@ -521,7 +536,7 @@ impl EntryStorage { context: GetEntriesContext, buf: &mut Vec, ) -> raft::Result { - if let Some(RaftlogFetchState::Fetching) = self.async_fetch_results.borrow().get(&low) { + if let Some(RaftlogFetchState::Fetching(_)) = self.async_fetch_results.borrow().get(&low) { // already an async fetch in flight return Err(raft::Error::Store( raft::StorageError::LogTemporarilyUnavailable, @@ -630,7 +645,7 @@ impl EntryStorage { self.raftlog_fetch_stats.async_fetch.update(|m| m + 1); self.async_fetch_results .borrow_mut() - .insert(low, RaftlogFetchState::Fetching); + .insert(low, RaftlogFetchState::Fetching(Instant::now_coarse())); self.raftlog_fetch_scheduler .schedule(RaftlogFetchTask::PeerStorage { region_id, @@ -851,6 +866,11 @@ impl EntryStorage { self.cache.is_empty() } + #[inline] + pub fn entry_cache_first_index(&self) -> Option { + self.cache.first_index() + } + /// Evict entries from the cache. 
pub fn evict_entry_cache(&mut self, half: bool) { if !self.is_entry_cache_empty() { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 8d5369aaefa..1d02b723cf6 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1076,6 +1076,7 @@ where PeerTick::CheckLeaderLease => self.on_check_leader_lease_tick(), PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), } } @@ -5091,6 +5092,19 @@ where } } + fn register_check_long_uncommitted_tick(&mut self) { + self.schedule_tick(PeerTick::CheckLongUncommitted) + } + + fn on_check_long_uncommitted_tick(&mut self) { + if !self.fsm.peer.is_leader() || self.fsm.hibernate_state.group_state() == GroupState::Idle + { + return; + } + self.fsm.peer.check_long_uncommitted_proposals(self.ctx); + self.register_check_long_uncommitted_tick(); + } + fn register_check_leader_lease_tick(&mut self) { self.schedule_tick(PeerTick::CheckLeaderLease) } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 52d9bebd0ab..5235f90e156 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -555,6 +555,8 @@ where self.cfg.reactive_memory_lock_tick_interval.0; self.tick_batch[PeerTick::ReportBuckets as usize].wait_duration = self.cfg.report_region_buckets_tick_interval.0; + self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = + self.cfg.check_long_uncommitted_interval.0; } } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index a983feb7909..9691d5be0db 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -605,6 +605,14 @@ lazy_static! 
{ pub static ref RAFT_ENTRY_FETCHES: RaftEntryFetches = auto_flush_from!(RAFT_ENTRY_FETCHES_VEC, RaftEntryFetches); + // The max task duration can be a few minutes. + pub static ref RAFT_ENTRY_FETCHES_TASK_DURATION_HISTOGRAM: Histogram = + register_histogram!( + "tikv_raftstore_entry_fetches_task_duration_seconds", + "Bucketed histogram of raft entry fetches task duration.", + exponential_buckets(0.0005, 2.0, 21).unwrap() // 500us ~ 8.7m + ).unwrap(); + pub static ref LEADER_MISSING: IntGauge = register_int_gauge!( "tikv_raftstore_leader_missing", diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index e552229aa0c..43126d1def5 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -233,6 +233,7 @@ pub enum PeerTick { CheckLeaderLease = 7, ReactivateMemoryLock = 8, ReportBuckets = 9, + CheckLongUncommitted = 10, } impl PeerTick { @@ -251,6 +252,7 @@ impl PeerTick { PeerTick::CheckLeaderLease => "check_leader_lease", PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", PeerTick::ReportBuckets => "report_buckets", + PeerTick::CheckLongUncommitted => "check_long_uncommitted", } } @@ -266,6 +268,7 @@ impl PeerTick { PeerTick::CheckLeaderLease, PeerTick::ReactivateMemoryLock, PeerTick::ReportBuckets, + PeerTick::CheckLongUncommitted, ]; TICKS } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 6d309afa17f..9a8fd7d0605 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -192,6 +192,11 @@ impl ProposalQueue { None } + #[inline] + fn oldest(&self) -> Option<&Proposal> { + self.queue.front() + } + fn push(&mut self, p: Proposal) { if let Some(f) = self.queue.back() { // The term must be increasing among all log entries and the index @@ -730,6 +735,11 @@ where #[getset(get = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue, + /// Threshold of long uncommitted proposals. 
+ /// + /// Note that this is a dynamically changing value. Check the + /// `has_long_uncommitted_proposals` method for details. + long_uncommitted_threshold: Duration, /// If it fails to send messages to leader. pub leader_unreachable: bool, @@ -937,6 +947,7 @@ where raft_max_inflight_msgs: cfg.raft_max_inflight_msgs, proposals: ProposalQueue::new(tag.clone()), pending_reads: Default::default(), + long_uncommitted_threshold: cfg.long_uncommitted_base_threshold.0, peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), peers_start_pending_time: vec![], @@ -2810,6 +2821,57 @@ where fail_point!("after_send_to_apply_1003", self.peer_id() == 1003, |_| {}); } + /// Check long uncommitted proposals and log some info to help find why. + pub fn check_long_uncommitted_proposals(&mut self, ctx: &mut PollContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group.status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + "found long uncommitted proposals"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "progress" => ?buffer, + "cache_first_index" => ?self.get_store().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold, + ); + } + } + + /// Check if there is long uncommitted proposal. + /// + /// This will increase the threshold when a long uncommitted proposal is + /// detected, and reset the threshold when there is no long uncommitted + /// proposal. + fn has_long_uncommitted_proposals(&mut self, ctx: &mut PollContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals.oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // some. 
+ let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected. + if elapsed >= self.long_uncommitted_threshold { + has_long_uncommitted = true; + self.long_uncommitted_threshold += base_threshold; + } else if elapsed < base_threshold { + self.long_uncommitted_threshold = base_threshold; + } + } else { + self.long_uncommitted_threshold = base_threshold; + } + has_long_uncommitted + } + fn on_persist_snapshot( &mut self, ctx: &mut PollContext, diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 8189e45d3d2..0291aa87590 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -7872,6 +7872,111 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 55 + }, + "hiddenSeries": false, + "id": 23763572511, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "paceLength": 10, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", name=~\"raftlog_fetch.*\"}[1m])) by (instance)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Raftlog fetch Worker CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -17210,6 +17315,153 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "hiddenSeries": false, + "id": 23763572555, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": false + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:521", + "alias": "/pending-task/", + "transform": "negative-Y", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, 
+ "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "99%", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "95%", + "refId": "B", + "step": 10 + }, + { + "exemplar": true, + "expr": "sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_entry_fetches_task_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "avg", + "refId": "C", + "step": 10 + }, + { + "exemplar": true, + "expr": "sum(tikv_worker_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=~\"$tidb_cluster.*\", instance=~\"$instance\", name=~\"raftlog-fetch-worker\"})", + "hide": false, + "interval": "", + "legendFormat": "pending-task", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Raft log async fetch task duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:86", + "decimals": null, + "format": "s", + "label": null, + 
"logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:87", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Raft Log", diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 2988b0cf0a3..98bb55625fa 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -243,6 +243,8 @@ fn test_serde_custom_tikv_config() { reactive_memory_lock_tick_interval: ReadableDuration::millis(566), reactive_memory_lock_timeout_tick: 8, report_region_buckets_tick_interval: ReadableDuration::secs(1234), + check_long_uncommitted_interval: ReadableDuration::secs(1), + long_uncommitted_base_threshold: ReadableDuration::secs(1), max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), }; diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 0221446683a..c653e9c500d 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -216,6 +216,8 @@ check-leader-lease-interval = "123ms" renew-leader-lease-advance-duration = "456ms" reactive-memory-lock-tick-interval = "566ms" reactive-memory-lock-timeout-tick = 8 +check-long-uncommitted-interval = "1s" +long-uncommitted-base-threshold = "1s" report-min-resolved-ts-interval = "233ms" report-region-buckets-tick-interval = "1234s" max-snapshot-file-raw-size = "10GB" @@ -229,7 +231,7 @@ region-split-size = "12MB" region-max-keys = 100000 region-split-keys = 100000 consistency-check-method = "raw" -enable-region-bucket = true +enable-region-bucket = true region-bucket-size = "1MB" region-size-threshold-for-approximate = "3MB" region-bucket-merge-size-ratio = 0.4 @@ -374,7 +376,7 @@ num-levels = 4 max-bytes-for-level-multiplier = 8 compaction-style = "universal" disable-auto-compactions = 
true -disable-write-stall = true +disable-write-stall = true soft-pending-compaction-bytes-limit = "12GB" hard-pending-compaction-bytes-limit = "12GB" force-consistency-checks = true From 0576484eed99a6126511136f1af9ded029b9154c Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 2 Aug 2022 04:32:05 -0700 Subject: [PATCH 129/676] raftstore: refactor async write to allow write all states to raft engine (#13157) ref tikv/tikv#12842 In v2, all states are moved to raft engine, so it doesn't need to write to kv db anymore. Signed-off-by: Jay Lee --- .../raftstore/src/store/async_io/write.rs | 334 ++++++++++++++---- .../src/store/async_io/write_tests.rs | 188 +++++++--- .../raftstore/src/store/entry_storage.rs | 147 +++++++- components/raftstore/src/store/fsm/store.rs | 15 +- components/raftstore/src/store/mod.rs | 13 +- .../raftstore/src/store/peer_storage.rs | 157 +------- 6 files changed, 573 insertions(+), 281 deletions(-) diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 6b652670138..72fd52ea4d4 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -8,7 +8,7 @@ //! raft db and then invoking callback or sending msgs if any. 
use std::{ - fmt, + fmt, mem, sync::Arc, thread::{self, JoinHandle}, }; @@ -16,12 +16,11 @@ use std::{ use collections::HashMap; use crossbeam::channel::{bounded, Receiver, Sender, TryRecvError}; use engine_traits::{ - Engines, KvEngine, PerfContext, PerfContextKind, RaftEngine, RaftLogBatch, WriteBatch, - WriteOptions, + KvEngine, PerfContext, PerfContextKind, RaftEngine, RaftLogBatch, WriteBatch, WriteOptions, }; use error_code::ErrorCodeExt; use fail::fail_point; -use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; +use kvproto::raft_serverpb::{RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState}; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{ @@ -53,16 +52,16 @@ const RAFT_WB_SHRINK_SIZE: usize = 10 * 1024 * 1024; const RAFT_WB_DEFAULT_SIZE: usize = 256 * 1024; /// Notify the event to the specified region. -pub trait Notifier: Clone + Send + 'static { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64); +pub trait PersistedNotifier: Clone + Send + 'static { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64); } -impl Notifier for RaftRouter +impl PersistedNotifier for RaftRouter where EK: KvEngine, ER: RaftEngine, { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64) { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { if let Err(e) = self.force_send( region_id, PeerMsg::Persisted { @@ -81,6 +80,79 @@ where } } +/// Extra writes besides raft engine. +/// +/// For now, applying snapshot needs to persist some extra states. For v1, +/// these states are written to KvEngine. For v2, they are written to +/// RaftEngine. +// TODO: perhaps we should always pass states instead of a write batch even +// for v1. 
+pub enum ExtraWrite { + None, + V1(W), + V2(ExtraStates), +} + +impl ExtraWrite { + #[inline] + pub fn is_empty(&self) -> bool { + match self { + ExtraWrite::None => true, + ExtraWrite::V1(w) => w.is_empty(), + _ => false, + } + } + + #[inline] + fn data_size(&self) -> usize { + match self { + ExtraWrite::None => 0, + ExtraWrite::V1(w) => w.data_size(), + ExtraWrite::V2(m) => mem::size_of_val(m), + } + } + + #[inline] + pub fn ensure_v1(&mut self, write_batch: impl FnOnce() -> W) -> &mut W { + if let ExtraWrite::None = self { + *self = ExtraWrite::V1(write_batch()); + } else if let ExtraWrite::V2(_) = self { + unreachable!("v1 and v2 are mixed used"); + } + match self { + ExtraWrite::V1(w) => w, + _ => unreachable!(), + } + } + + #[inline] + pub fn v1_mut(&mut self) -> Option<&mut W> { + if let ExtraWrite::V1(w) = self { + Some(w) + } else { + None + } + } + + #[inline] + pub fn set_v2(&mut self, extra_states: ExtraStates) { + if let ExtraWrite::V1(_) = self { + unreachable!("v1 and v2 are mixed used"); + } else { + *self = ExtraWrite::V2(extra_states); + } + } + + #[inline] + pub fn v2_mut(&mut self) -> Option<&mut ExtraStates> { + if let ExtraWrite::V2(m) = self { + Some(m) + } else { + None + } + } +} + /// WriteTask contains write tasks which need to be persisted to kv db and raft /// db. 
pub struct WriteTask @@ -92,11 +164,11 @@ where peer_id: u64, ready_number: u64, pub send_time: Instant, - pub kv_wb: Option, pub raft_wb: Option, pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, + pub extra_write: ExtraWrite, pub messages: Vec, pub trackers: Vec, } @@ -112,11 +184,11 @@ where peer_id, ready_number, send_time: Instant::now(), - kv_wb: None, raft_wb: None, entries: vec![], cut_logs: None, raft_state: None, + extra_write: ExtraWrite::None, messages: vec![], trackers: vec![], } @@ -126,10 +198,15 @@ where !(self.raft_state.is_none() && self.entries.is_empty() && self.cut_logs.is_none() - && self.kv_wb.as_ref().map_or(true, |wb| wb.is_empty()) + && self.extra_write.is_empty() && self.raft_wb.as_ref().map_or(true, |wb| wb.is_empty())) } + #[inline] + pub fn ready_number(&self) -> u64 { + self.ready_number + } + /// Sanity check for robustness. pub fn valid(&self) -> Result<()> { if self.region_id == 0 || self.peer_id == 0 || self.ready_number == 0 { @@ -189,16 +266,96 @@ where } } +/// These states are set only in raftstore V2. +#[derive(Default)] +pub struct ExtraStates { + apply_state: RaftApplyState, + region_state: Option, +} + +impl ExtraStates { + #[inline] + pub fn new(apply_state: RaftApplyState) -> Self { + Self { + apply_state, + region_state: None, + } + } + + #[inline] + pub fn set_region_state(&mut self, region_state: RegionLocalState) { + self.region_state = Some(region_state); + } +} + +pub enum ExtraBatchWrite { + None, + V1(W), + V2(HashMap), +} + +impl ExtraBatchWrite { + #[inline] + fn clear(&mut self) { + match self { + ExtraBatchWrite::None => {} + ExtraBatchWrite::V1(w) => w.clear(), + ExtraBatchWrite::V2(m) => m.clear(), + } + } + + /// Merge the extra_write with this batch. + /// + /// If there is any new states inserted, return the size of the state. 
+ fn merge(&mut self, region_id: u64, extra_write: &mut ExtraWrite) -> usize { + let mut inserted = false; + match mem::replace(extra_write, ExtraWrite::None) { + ExtraWrite::None => (), + ExtraWrite::V1(wb) => match self { + ExtraBatchWrite::None => *self = ExtraBatchWrite::V1(wb), + ExtraBatchWrite::V1(kv_wb) => kv_wb.merge(wb).unwrap(), + ExtraBatchWrite::V2(_) => unreachable!("v2 and v1 are mixed used"), + }, + ExtraWrite::V2(extra_states) => match self { + ExtraBatchWrite::None => { + let mut map = HashMap::default(); + map.insert(region_id, extra_states); + *self = ExtraBatchWrite::V2(map); + inserted = true; + } + ExtraBatchWrite::V1(_) => unreachable!("v2 and v1 are mixed used"), + ExtraBatchWrite::V2(extra_states_map) => match extra_states_map.entry(region_id) { + collections::HashMapEntry::Occupied(mut slot) => { + slot.get_mut().apply_state = extra_states.apply_state; + if let Some(region_state) = extra_states.region_state { + slot.get_mut().region_state = Some(region_state); + } + } + collections::HashMapEntry::Vacant(slot) => { + slot.insert(extra_states); + inserted = true; + } + }, + }, + }; + if inserted { + std::mem::size_of::() + } else { + 0 + } + } +} + /// WriteTaskBatch is used for combining several WriteTask into one. 
struct WriteTaskBatch where EK: KvEngine, ER: RaftEngine, { - pub kv_wb: EK::WriteBatch, pub raft_wb: ER::LogBatch, // Write raft state once for a region everytime writing to disk pub raft_states: HashMap, + pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, // region_id -> (peer_id, ready_number) @@ -210,11 +367,11 @@ where EK: KvEngine, ER: RaftEngine, { - fn new(kv_wb: EK::WriteBatch, raft_wb: ER::LogBatch) -> Self { + fn new(raft_wb: ER::LogBatch) -> Self { Self { - kv_wb, raft_wb, raft_states: HashMap::default(), + extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], readies: HashMap::default(), @@ -226,9 +383,6 @@ where if let Err(e) = task.valid() { panic!("task is not valid: {:?}", e); } - if let Some(kv_wb) = task.kv_wb.take() { - self.kv_wb.merge(kv_wb).unwrap(); - } if let Some(raft_wb) = task.raft_wb.take() { self.raft_wb.merge(raft_wb).unwrap(); } @@ -249,6 +403,10 @@ where } } + self.state_size += self + .extra_batch_write + .merge(task.region_id, &mut task.extra_write); + if let Some(prev_readies) = self .readies .insert(task.region_id, (task.peer_id, task.ready_number)) @@ -276,8 +434,8 @@ where fn clear(&mut self) { // raft_wb doesn't have clear interface and it should be consumed by raft db // before - self.kv_wb.clear(); self.raft_states.clear(); + self.extra_batch_write.clear(); self.state_size = 0; self.tasks.clear(); self.readies.clear(); @@ -298,6 +456,18 @@ where for (region_id, state) in self.raft_states.drain() { self.raft_wb.put_raft_state(region_id, &state).unwrap(); } + if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { + for (region_id, state) in extra_states_map.drain() { + self.raft_wb + .put_apply_state(region_id, &state.apply_state) + .unwrap(); + if let Some(region_state) = state.region_state { + self.raft_wb + .put_region_state(region_id, ®ion_state) + .unwrap(); + } + } + } self.state_size = 0; if metrics.waterfall_metrics { let now = 
std::time::Instant::now(); @@ -342,11 +512,12 @@ pub struct Worker where EK: KvEngine, ER: RaftEngine, - N: Notifier, + N: PersistedNotifier, { store_id: u64, tag: String, - engines: Engines, + raft_engine: ER, + kv_engine: Option, receiver: Receiver>, notifier: N, trans: T, @@ -363,30 +534,28 @@ impl Worker where EK: KvEngine, ER: RaftEngine, - N: Notifier, + N: PersistedNotifier, T: Transport, { pub fn new( store_id: u64, tag: String, - engines: Engines, + raft_engine: ER, + kv_engine: Option, receiver: Receiver>, notifier: N, trans: T, cfg: &Arc>, ) -> Self { - let batch = WriteTaskBatch::new( - engines.kv.write_batch_with_cap(KV_WB_DEFAULT_SIZE), - engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), - ); - let perf_context = engines - .raft - .get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); + let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); + let perf_context = + raft_engine.get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { store_id, tag, - engines, + raft_engine, + kv_engine, receiver, notifier, trans, @@ -455,7 +624,7 @@ where "region_id" => task.region_id, "peer_id" => task.peer_id, "ready_number" => task.ready_number, - "kv_wb_size" => task.kv_wb.as_ref().map_or(0, |wb| wb.data_size()), + "extra_write_size" => task.extra_write.data_size(), "raft_wb_size" => task.raft_wb.as_ref().map_or(0, |wb| wb.persist_size()), "entry_count" => task.entries.len(), ); @@ -491,29 +660,37 @@ where fail_point!("raft_before_save"); let mut write_kv_time = 0f64; - if !self.batch.kv_wb.is_empty() { - let raft_before_save_kv_on_store_3 = || { - fail_point!("raft_before_save_kv_on_store_3", self.store_id == 3, |_| {}); - }; - raft_before_save_kv_on_store_3(); - let now = Instant::now(); - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - // TODO: Add perf context - self.batch.kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { - 
panic!( - "store {}: {} failed to write to kv engine: {:?}", - self.store_id, self.tag, e - ); - }); - if self.batch.kv_wb.data_size() > KV_WB_SHRINK_SIZE { - self.batch.kv_wb = self.engines.kv.write_batch_with_cap(KV_WB_DEFAULT_SIZE); + if let ExtraBatchWrite::V1(kv_wb) = &mut self.batch.extra_batch_write { + if !kv_wb.is_empty() { + let store_id = self.store_id; + let raft_before_save_kv_on_store_3 = || { + fail_point!("raft_before_save_kv_on_store_3", store_id == 3, |_| {}); + }; + raft_before_save_kv_on_store_3(); + let now = Instant::now(); + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + // TODO: Add perf context + let tag = &self.tag; + kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { + panic!( + "store {}: {} failed to write to kv engine: {:?}", + store_id, tag, e + ); + }); + if kv_wb.data_size() > KV_WB_SHRINK_SIZE { + *kv_wb = self + .kv_engine + .as_ref() + .unwrap() + .write_batch_with_cap(KV_WB_DEFAULT_SIZE); + } + write_kv_time = duration_to_sec(now.saturating_elapsed()); + STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); } - write_kv_time = duration_to_sec(now.saturating_elapsed()); - STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); - } - self.batch.after_write_to_kv_db(&self.metrics); + self.batch.after_write_to_kv_db(&self.metrics); + } fail_point!("raft_between_save"); @@ -523,8 +700,7 @@ where let now = Instant::now(); self.perf_context.start_observe(); - self.engines - .raft + self.raft_engine .consume_and_shrink( &mut self.batch.raft_wb, true, @@ -606,8 +782,7 @@ where let mut callback_time = 0f64; if notify { for (region_id, (peer_id, ready_number)) in &self.batch.readies { - self.notifier - .notify_persisted(*region_id, *peer_id, *ready_number); + self.notifier.notify(*region_id, *peer_id, *ready_number); } now = Instant::now(); callback_time = duration_to_sec(now.saturating_duration_since(now2)); @@ -665,26 +840,29 @@ where handlers: Vec>, } -impl StoreWriters -where - EK: KvEngine, - ER: 
RaftEngine, -{ - pub fn new() -> Self { +impl Default for StoreWriters { + fn default() -> Self { Self { writers: vec![], handlers: vec![], } } +} +impl StoreWriters +where + EK: KvEngine, + ER: RaftEngine, +{ pub fn senders(&self) -> &Vec>> { &self.writers } - pub fn spawn( + pub fn spawn( &mut self, store_id: u64, - engines: &Engines, + raft_engine: ER, + kv_engine: Option, notifier: &N, trans: &T, cfg: &Arc>, @@ -696,7 +874,8 @@ where let mut worker = Worker::new( store_id, tag.clone(), - engines.clone(), + raft_engine.clone(), + kv_engine.clone(), rx, notifier.clone(), trans.clone(), @@ -726,23 +905,24 @@ where /// Used for test to write task to kv db and raft db. #[cfg(test)] -pub fn write_to_db_for_test(engines: &Engines, task: WriteTask) -where +pub fn write_to_db_for_test( + engines: &engine_traits::Engines, + task: WriteTask, +) where EK: KvEngine, ER: RaftEngine, { - let mut batch = WriteTaskBatch::new( - engines.kv.write_batch(), - engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE), - ); + let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(task); batch.before_write_to_db(&StoreWriteMetrics::new(false)); - if !batch.kv_wb.is_empty() { - let mut write_opts = WriteOptions::new(); - write_opts.set_sync(true); - batch.kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { - panic!("test failed to write to kv engine: {:?}", e); - }); + if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { + if !kv_wb.is_empty() { + let mut write_opts = WriteOptions::new(); + write_opts.set_sync(true); + kv_wb.write_opt(&write_opts).unwrap_or_else(|e| { + panic!("test failed to write to kv engine: {:?}", e); + }); + } } if !batch.raft_wb.is_empty() { engines diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 04ece802a45..aaaed69c555 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ 
b/components/raftstore/src/store/async_io/write_tests.rs @@ -5,7 +5,7 @@ use std::time::Duration; use collections::HashSet; use crossbeam::channel::unbounded; use engine_test::{kv::KvTestEngine, new_temp_engine, raft::RaftTestEngine}; -use engine_traits::{Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; +use engine_traits::{Engines, Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; use kvproto::raft_serverpb::RaftMessage; use tempfile::Builder; @@ -15,6 +15,9 @@ use crate::{ Result, }; +type TestKvWriteBatch = ::WriteBatch; +type TestRaftLogBatch = ::LogBatch; + fn must_have_entries_and_state( raft_engine: &RaftTestEngine, entries_state: Vec<(u64, Vec, RaftLocalState)>, @@ -56,8 +59,8 @@ struct TestNotifier { tx: Sender<(u64, (u64, u64))>, } -impl Notifier for TestNotifier { - fn notify_persisted(&self, region_id: u64, peer_id: u64, ready_number: u64) { +impl PersistedNotifier for TestNotifier { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { self.tx.send((region_id, (peer_id, ready_number))).unwrap() } } @@ -146,42 +149,30 @@ fn init_write_batch( engines: &Engines, task: &mut WriteTask, ) { - task.kv_wb = Some(engines.kv.write_batch()); + task.extra_write.ensure_v1(|| engines.kv.write_batch()); task.raft_wb = Some(engines.raft.log_batch(0)); } /// Help function for less code /// Option must not be none -fn put_kv(wb: &mut Option<::WriteBatch>, key: &[u8], value: &[u8]) { - wb.as_mut().unwrap().put(key, value).unwrap(); +fn put_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8], value: &[u8]) { + wb.unwrap().put(key, value).unwrap(); } /// Help function for less code /// Option must not be none -fn delete_kv(wb: &mut Option<::WriteBatch>, key: &[u8]) { - wb.as_mut().unwrap().delete(key).unwrap(); +fn delete_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8]) { + wb.unwrap().delete(key).unwrap(); } /// Simulate kv puts on raft engine. 
-fn put_raft_kv(wb: &mut Option<::LogBatch>, key: u64) { - wb.as_mut() - .unwrap() - .append(key, vec![new_entry(key, key)]) - .unwrap(); +fn put_raft_kv(wb: Option<&mut TestRaftLogBatch>, key: u64) { + wb.unwrap().append(key, vec![new_entry(key, key)]).unwrap(); } -fn delete_raft_kv( - engine: &RaftTestEngine, - wb: &mut Option<::LogBatch>, - key: u64, -) { +fn delete_raft_kv(engine: &RaftTestEngine, wb: Option<&mut TestRaftLogBatch>, key: u64) { engine - .clean( - key, - key, - &new_raft_state(key, key, key, key), - wb.as_mut().unwrap(), - ) + .clean(key, key, &new_raft_state(key, key, key, key), wb.unwrap()) .unwrap(); } @@ -212,7 +203,8 @@ impl TestWorker { worker: Worker::new( 1, "writer".to_string(), - engines.clone(), + engines.raft.clone(), + Some(engines.kv.clone()), task_rx, notifier, trans, @@ -236,11 +228,12 @@ impl TestWriters { let trans = TestTransport { tx: msg_tx }; let (notify_tx, notify_rx) = unbounded(); let notifier = TestNotifier { tx: notify_tx }; - let mut writers = StoreWriters::new(); + let mut writers = StoreWriters::default(); writers .spawn( 1, - engines, + engines.raft.clone(), + Some(engines.kv.clone()), ¬ifier, &trans, &Arc::new(VersionTrack::new(cfg.clone())), @@ -269,8 +262,8 @@ fn test_worker() { let mut task_1 = WriteTask::::new(region_1, 1, 10); init_write_batch(&engines, &mut task_1); - put_kv(&mut task_1.kv_wb, b"kv_k1", b"kv_v1"); - put_raft_kv(&mut task_1.raft_wb, 17); + put_kv(task_1.extra_write.v1_mut(), b"kv_k1", b"kv_v1"); + put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1.entries.append(&mut vec![ new_entry(5, 5), new_entry(6, 5), @@ -284,8 +277,8 @@ fn test_worker() { let mut task_2 = WriteTask::::new(region_2, 2, 15); init_write_batch(&engines, &mut task_2); - put_kv(&mut task_2.kv_wb, b"kv_k2", b"kv_v2"); - put_raft_kv(&mut task_2.raft_wb, 27); + put_kv(task_2.extra_write.v1_mut(), b"kv_k2", b"kv_v2"); + put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries .append(&mut vec![new_entry(20, 15), new_entry(21, 
15)]); @@ -298,9 +291,9 @@ fn test_worker() { let mut task_3 = WriteTask::::new(region_1, 1, 11); init_write_batch(&engines, &mut task_3); - put_kv(&mut task_3.kv_wb, b"kv_k3", b"kv_v3"); - put_raft_kv(&mut task_3.raft_wb, 37); - delete_raft_kv(&engines.raft, &mut task_3.raft_wb, 17); + put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); task_3 .entries .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); @@ -357,8 +350,8 @@ fn test_basic_flow() { let mut task_1 = WriteTask::::new(region_1, 1, 10); init_write_batch(&engines, &mut task_1); - put_kv(&mut task_1.kv_wb, b"kv_k1", b"kv_v1"); - put_raft_kv(&mut task_1.raft_wb, 17); + put_kv(task_1.extra_write.v1_mut(), b"kv_k1", b"kv_v1"); + put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1 .entries .append(&mut vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]); @@ -371,8 +364,8 @@ fn test_basic_flow() { let mut task_2 = WriteTask::::new(2, 2, 20); init_write_batch(&engines, &mut task_2); - put_kv(&mut task_2.kv_wb, b"kv_k2", b"kv_v2"); - put_raft_kv(&mut task_2.raft_wb, 27); + put_kv(task_2.extra_write.v1_mut(), b"kv_k2", b"kv_v2"); + put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries .append(&mut vec![new_entry(50, 12), new_entry(51, 13)]); @@ -385,10 +378,10 @@ fn test_basic_flow() { let mut task_3 = WriteTask::::new(region_1, 1, 15); init_write_batch(&engines, &mut task_3); - put_kv(&mut task_3.kv_wb, b"kv_k3", b"kv_v3"); - delete_kv(&mut task_3.kv_wb, b"kv_k1"); - put_raft_kv(&mut task_3.raft_wb, 37); - delete_raft_kv(&engines.raft, &mut task_3.raft_wb, 17); + put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); + delete_kv(task_3.extra_write.v1_mut(), b"kv_k1"); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); task_3.entries.append(&mut vec![new_entry(6, 6)]); task_3.cut_logs = Some((7, 8)); task_3.raft_state = 
Some(new_raft_state(6, 345, 6, 6)); @@ -429,3 +422,114 @@ fn test_basic_flow() { t.writers.shutdown(); } + +#[test] +fn test_basic_flow_with_states() { + let region_1 = 1; + let region_2 = 2; + + let path = Builder::new() + .prefix("async-io-basic-states") + .tempdir() + .unwrap(); + let engines = new_temp_engine(&path); + let mut cfg = Config::default(); + cfg.store_io_pool_size = 2; + let mut t = TestWriters::new(&cfg, &engines); + + let mut task_1 = WriteTask::::new(region_1, 1, 10); + task_1.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_1 = RaftApplyState::default(); + apply_state_1.applied_index = 2; + let mut extra_state = ExtraStates::new(apply_state_1); + let mut region_state_1 = RegionLocalState::default(); + region_state_1 + .mut_region() + .mut_region_epoch() + .set_version(3); + extra_state.region_state = Some(region_state_1.clone()); + task_1.extra_write.set_v2(extra_state); + put_raft_kv(task_1.raft_wb.as_mut(), 17); + task_1 + .entries + .append(&mut vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]); + task_1.raft_state = Some(new_raft_state(5, 234, 6, 7)); + task_1 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(0).send(WriteMsg::WriteTask(task_1)).unwrap(); + + let mut task_2 = WriteTask::::new(2, 2, 20); + task_2.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_2 = RaftApplyState::default(); + apply_state_2.applied_index = 30; + let extra_state = ExtraStates::new(apply_state_2.clone()); + task_2.extra_write.set_v2(extra_state); + put_raft_kv(task_2.raft_wb.as_mut(), 27); + task_2 + .entries + .append(&mut vec![new_entry(50, 12), new_entry(51, 13)]); + task_2.raft_state = Some(new_raft_state(13, 567, 49, 51)); + task_2 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(1).send(WriteMsg::WriteTask(task_2)).unwrap(); + + let mut task_3 = WriteTask::::new(region_1, 1, 15); + task_3.raft_wb = 
Some(engines.raft.log_batch(0)); + let mut apply_state_3 = RaftApplyState::default(); + apply_state_3.applied_index = 5; + let extra_state = ExtraStates::new(apply_state_3.clone()); + task_3.extra_write.set_v2(extra_state); + put_raft_kv(task_3.raft_wb.as_mut(), 37); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); + task_3.entries.append(&mut vec![new_entry(6, 6)]); + task_3.cut_logs = Some((7, 8)); + task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); + task_3 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + + t.write_sender(0).send(WriteMsg::WriteTask(task_3)).unwrap(); + + must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); + + assert_eq!(test_raft_kv(&engines.raft, 17), false); + assert_eq!(test_raft_kv(&engines.raft, 27), true); + assert_eq!(test_raft_kv(&engines.raft, 37), true); + + must_have_entries_and_state( + &engines.raft, + vec![ + ( + region_1, + vec![new_entry(5, 5), new_entry(6, 6)], + new_raft_state(6, 345, 6, 6), + ), + ( + region_2, + vec![new_entry(50, 12), new_entry(51, 13)], + new_raft_state(13, 567, 49, 51), + ), + ], + ); + assert_eq!( + engines.raft.get_apply_state(region_1).unwrap().unwrap(), + apply_state_3 + ); + assert_eq!( + engines.raft.get_apply_state(region_2).unwrap().unwrap(), + apply_state_2 + ); + assert_eq!( + engines.raft.get_region_state(region_1).unwrap().unwrap(), + region_state_1 + ); + assert_eq!(engines.raft.get_region_state(region_2).unwrap(), None); + + must_have_same_count_msg(6, &t.msg_rx); + + t.writers.shutdown(); +} diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index c73e12013fe..33b504127f8 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -16,14 +16,20 @@ use std::{ use collections::HashMap; use engine_traits::{KvEngine, RaftEngine, RAFT_LOG_MULTI_GET_CNT}; use fail::fail_point; -use 
kvproto::raft_serverpb::{RaftApplyState, RaftLocalState}; +use kvproto::{ + metapb, + raft_serverpb::{RaftApplyState, RaftLocalState}, +}; use protobuf::Message; use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError}; use tikv_alloc::TraceEvent; -use tikv_util::{debug, info, time::Instant, warn, worker::Scheduler}; +use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; -use super::{metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE}; -use crate::{bytes_capacity, store::worker::RaftlogFetchTask}; +use super::{ + metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, +}; +use crate::{bytes_capacity, store::worker::RaftlogFetchTask, Result}; const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; @@ -415,6 +421,115 @@ impl AsyncFetchStats { } } +fn validate_states( + region_id: u64, + raft_engine: &ER, + raft_state: &mut RaftLocalState, + apply_state: &RaftApplyState, +) -> Result<()> { + let last_index = raft_state.get_last_index(); + let mut commit_index = raft_state.get_hard_state().get_commit(); + let recorded_commit_index = apply_state.get_commit_index(); + let state_str = || -> String { + format!( + "region {}, raft state {:?}, apply state {:?}", + region_id, raft_state, apply_state + ) + }; + // The commit index of raft state may be less than the recorded commit index. + // If so, forward the commit index. 
+ if commit_index < recorded_commit_index { + let entry = raft_engine.get_entry(region_id, recorded_commit_index)?; + if entry.map_or(true, |e| e.get_term() != apply_state.get_commit_term()) { + return Err(box_err!( + "log at recorded commit index [{}] {} doesn't exist, may lose data, {}", + apply_state.get_commit_term(), + recorded_commit_index, + state_str() + )); + } + info!("updating commit index"; "region_id" => region_id, "old" => commit_index, "new" => recorded_commit_index); + commit_index = recorded_commit_index; + } + // Invariant: applied index <= max(commit index, recorded commit index) + if apply_state.get_applied_index() > commit_index { + return Err(box_err!( + "applied index > max(commit index, recorded commit index), {}", + state_str() + )); + } + // Invariant: max(commit index, recorded commit index) <= last index + if commit_index > last_index { + return Err(box_err!( + "max(commit index, recorded commit index) > last index, {}", + state_str() + )); + } + // Since the entries must be persisted before applying, the term of raft state + // should also be persisted. So it should be greater than the commit term of + // apply state. 
+ if raft_state.get_hard_state().get_term() < apply_state.get_commit_term() { + return Err(box_err!( + "term of raft state < commit term of apply state, {}", + state_str() + )); + } + + raft_state.mut_hard_state().set_commit(commit_index); + + Ok(()) +} + +pub fn init_last_term( + raft_engine: &ER, + region: &metapb::Region, + raft_state: &RaftLocalState, + apply_state: &RaftApplyState, +) -> Result { + let last_idx = raft_state.get_last_index(); + if last_idx == 0 { + return Ok(0); + } else if last_idx == RAFT_INIT_LOG_INDEX { + return Ok(RAFT_INIT_LOG_TERM); + } else if last_idx == apply_state.get_truncated_state().get_index() { + return Ok(apply_state.get_truncated_state().get_term()); + } else { + assert!(last_idx > RAFT_INIT_LOG_INDEX); + } + let entry = raft_engine.get_entry(region.get_id(), last_idx)?; + match entry { + None => Err(box_err!( + "[region {}] entry at {} doesn't exist, may lose data.", + region.get_id(), + last_idx + )), + Some(e) => Ok(e.get_term()), + } +} + +pub fn init_applied_term( + raft_engine: &ER, + region: &metapb::Region, + apply_state: &RaftApplyState, +) -> Result { + if apply_state.applied_index == RAFT_INIT_LOG_INDEX { + return Ok(RAFT_INIT_LOG_TERM); + } + let truncated_state = apply_state.get_truncated_state(); + if apply_state.applied_index == truncated_state.get_index() { + return Ok(truncated_state.get_term()); + } + + match raft_engine.get_entry(region.get_id(), apply_state.applied_index)? { + Some(e) => Ok(e.term), + None => Err(box_err!( + "[region {}] entry at apply index {} doesn't exist, may lose data.", + region.get_id(), + apply_state.applied_index + )), + } +} + /// A subset of `PeerStorage` that focus on accessing log entries. 
pub struct EntryStorage { region_id: u64, @@ -432,17 +547,25 @@ pub struct EntryStorage { impl EntryStorage { pub fn new( - region_id: u64, peer_id: u64, raft_engine: ER, - raft_state: RaftLocalState, + mut raft_state: RaftLocalState, apply_state: RaftApplyState, - last_term: u64, - applied_term: u64, + region: &metapb::Region, raftlog_fetch_scheduler: Scheduler, - ) -> Self { - EntryStorage { - region_id, + ) -> Result { + if let Err(e) = validate_states(region.id, &raft_engine, &mut raft_state, &apply_state) { + return Err(box_err!( + "[region {}] {} validate state fail: {:?}", + region.id, + peer_id, + e + )); + } + let last_term = init_last_term(&raft_engine, region, &raft_state, &apply_state)?; + let applied_term = init_applied_term(&raft_engine, region, &apply_state)?; + Ok(Self { + region_id: region.id, peer_id, raft_engine, cache: EntryCache::default(), @@ -453,7 +576,7 @@ impl EntryStorage { raftlog_fetch_scheduler, raftlog_fetch_stats: AsyncFetchStats::default(), async_fetch_results: RefCell::new(HashMap::default()), - } + }) } fn check_range(&self, low: u64, high: u64) -> raft::Result<()> { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 5235f90e156..d4bb0a32266 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1261,7 +1261,8 @@ where Some(WriteWorker::new( self.store.get_id(), "sync-writer".to_string(), - self.engines.clone(), + self.engines.raft.clone(), + Some(self.engines.kv.clone()), rx, self.router.clone(), self.trans.clone(), @@ -1526,8 +1527,14 @@ impl RaftBatchSystem { .background_worker .start("consistency-check", consistency_check_runner); - self.store_writers - .spawn(meta.get_id(), &engines, &self.router, &trans, &cfg)?; + self.store_writers.spawn( + meta.get_id(), + engines.raft.clone(), + Some(engines.kv.clone()), + &self.router, + &trans, + &cfg, + )?; let region_read_progress = 
store_meta.lock().unwrap().region_read_progress.clone(); let mut builder = RaftPollerBuilder { @@ -1717,7 +1724,7 @@ pub fn create_raft_batch_system( apply_router, apply_system, router: raft_router.clone(), - store_writers: StoreWriters::new(), + store_writers: StoreWriters::default(), }; (raft_router, system) } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index bd9564b1a63..d75fef94323 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -15,7 +15,7 @@ mod async_io; mod bootstrap; mod compaction_guard; mod hibernate_state; -mod local_metrics; +pub mod local_metrics; mod peer; mod peer_storage; mod read_queue; @@ -29,8 +29,8 @@ mod worker; pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ - write::{Worker as WriteWorker, WriteMsg, WriteTask}, - write_router::WriteRouter, + write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, + write_router::{WriteRouter, WriteRouterContext}, }, bootstrap::{ bootstrap_store, clear_prepare_bootstrap_cluster, clear_prepare_bootstrap_key, @@ -68,8 +68,9 @@ pub use self::{ util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ AutoSplitController, Bucket, BucketRange, CheckLeaderRunner, CheckLeaderTask, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReader, PdTask, QueryStats, ReadDelegate, - ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, - SplitConfigManager, TrackVer, WriteStats, + FlowStatistics, FlowStatsReporter, KeyEntry, LocalReader, PdTask, QueryStats, + RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, ReadStats, RefreshConfigTask, + RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, TrackVer, + WriteStats, }, }; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index aec48c1756f..83363d65ac8 100644 --- 
a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -164,32 +164,6 @@ pub fn recover_from_applying_state( Ok(()) } -fn init_applied_term( - engines: &Engines, - region: &Region, - apply_state: &RaftApplyState, -) -> Result { - if apply_state.applied_index == RAFT_INIT_LOG_INDEX { - return Ok(RAFT_INIT_LOG_TERM); - } - let truncated_state = apply_state.get_truncated_state(); - if apply_state.applied_index == truncated_state.get_index() { - return Ok(truncated_state.get_term()); - } - - match engines - .raft - .get_entry(region.get_id(), apply_state.applied_index)? - { - Some(e) => Ok(e.term), - None => Err(box_err!( - "[region {}] entry at apply index {} doesn't exist, may lose data.", - region.get_id(), - apply_state.applied_index - )), - } -} - fn init_raft_state( engines: &Engines, region: &Region, @@ -233,92 +207,6 @@ fn init_apply_state( ) } -fn init_last_term( - engines: &Engines, - region: &Region, - raft_state: &RaftLocalState, - apply_state: &RaftApplyState, -) -> Result { - let last_idx = raft_state.get_last_index(); - if last_idx == 0 { - return Ok(0); - } else if last_idx == RAFT_INIT_LOG_INDEX { - return Ok(RAFT_INIT_LOG_TERM); - } else if last_idx == apply_state.get_truncated_state().get_index() { - return Ok(apply_state.get_truncated_state().get_term()); - } else { - assert!(last_idx > RAFT_INIT_LOG_INDEX); - } - let entry = engines.raft.get_entry(region.get_id(), last_idx)?; - match entry { - None => Err(box_err!( - "[region {}] entry at {} doesn't exist, may lose data.", - region.get_id(), - last_idx - )), - Some(e) => Ok(e.get_term()), - } -} - -fn validate_states( - region_id: u64, - engines: &Engines, - raft_state: &mut RaftLocalState, - apply_state: &RaftApplyState, -) -> Result<()> { - let last_index = raft_state.get_last_index(); - let mut commit_index = raft_state.get_hard_state().get_commit(); - let recorded_commit_index = apply_state.get_commit_index(); - let state_str = || -> String { - 
format!( - "region {}, raft state {:?}, apply state {:?}", - region_id, raft_state, apply_state - ) - }; - // The commit index of raft state may be less than the recorded commit index. - // If so, forward the commit index. - if commit_index < recorded_commit_index { - let entry = engines.raft.get_entry(region_id, recorded_commit_index)?; - if entry.map_or(true, |e| e.get_term() != apply_state.get_commit_term()) { - return Err(box_err!( - "log at recorded commit index [{}] {} doesn't exist, may lose data, {}", - apply_state.get_commit_term(), - recorded_commit_index, - state_str() - )); - } - info!("updating commit index"; "region_id" => region_id, "old" => commit_index, "new" => recorded_commit_index); - commit_index = recorded_commit_index; - } - // Invariant: applied index <= max(commit index, recorded commit index) - if apply_state.get_applied_index() > commit_index { - return Err(box_err!( - "applied index > max(commit index, recorded commit index), {}", - state_str() - )); - } - // Invariant: max(commit index, recorded commit index) <= last index - if commit_index > last_index { - return Err(box_err!( - "max(commit index, recorded commit index) > last index, {}", - state_str() - )); - } - // Since the entries must be persisted before applying, the term of raft state - // should also be persisted. So it should be greater than the commit term of - // apply state. 
- if raft_state.get_hard_state().get_term() < apply_state.get_commit_term() { - return Err(box_err!( - "term of raft state < commit term of apply state, {}", - state_str() - )); - } - - raft_state.mut_hard_state().set_commit(commit_index); - - Ok(()) -} - pub struct PeerStorage where EK: KvEngine, @@ -411,23 +299,17 @@ where "peer_id" => peer_id, "path" => ?engines.kv.path(), ); - let mut raft_state = init_raft_state(&engines, region)?; + let raft_state = init_raft_state(&engines, region)?; let apply_state = init_apply_state(&engines, region)?; - if let Err(e) = validate_states(region.get_id(), &engines, &mut raft_state, &apply_state) { - return Err(box_err!("{} validate state fail: {:?}", tag, e)); - } - let last_term = init_last_term(&engines, region, &raft_state, &apply_state)?; - let applied_term = init_applied_term(&engines, region, &apply_state)?; + let entry_storage = EntryStorage::new( - region.id, peer_id, engines.raft.clone(), raft_state, apply_state, - last_term, - applied_term, + region, raftlog_fetch_scheduler, - ); + )?; Ok(PeerStorage { engines, @@ -694,11 +576,8 @@ where if task.raft_wb.is_none() { task.raft_wb = Some(self.engines.raft.log_batch(64)); } - if task.kv_wb.is_none() { - task.kv_wb = Some(self.engines.kv.write_batch()); - } let raft_wb = task.raft_wb.as_mut().unwrap(); - let kv_wb = task.kv_wb.as_mut().unwrap(); + let kv_wb = task.extra_write.ensure_v1(|| self.engines.kv.write_batch()); if self.is_initialized() { // we can only delete the old data when the peer is initialized. @@ -1017,9 +896,9 @@ where // in case of recv raft log after snapshot. 
self.save_snapshot_raft_state_to( ready.snapshot().get_metadata().get_index(), - write_task.kv_wb.as_mut().unwrap(), + write_task.extra_write.v1_mut().unwrap(), )?; - self.save_apply_state_to(write_task.kv_wb.as_mut().unwrap())?; + self.save_apply_state_to(write_task.extra_write.v1_mut().unwrap())?; } if !write_task.has_data() { @@ -1325,7 +1204,8 @@ pub mod tests { ents: &[Entry], ) -> PeerStorage { let mut store = new_storage(region_scheduler, raftlog_fetch_scheduler, path); - let mut write_task = WriteTask::new(store.get_region_id(), store.peer_id, 1); + let mut write_task: WriteTask = + WriteTask::new(store.get_region_id(), store.peer_id, 1); store.append(ents[1..].to_vec(), &mut write_task); store.update_cache_persisted(ents.last().unwrap().get_index()); store @@ -1339,12 +1219,10 @@ pub mod tests { store .apply_state_mut() .set_applied_index(ents.last().unwrap().get_index()); - if write_task.kv_wb.is_none() { - write_task.kv_wb = Some(store.engines.kv.write_batch()); - } - store - .save_apply_state_to(write_task.kv_wb.as_mut().unwrap()) - .unwrap(); + let kv_wb = write_task + .extra_write + .ensure_v1(|| store.engines.kv.write_batch()); + store.save_apply_state_to(kv_wb).unwrap(); write_task.raft_state = Some(store.raft_state().clone()); write_to_db_for_test(&store.engines, write_task); store @@ -1779,11 +1657,10 @@ pub mod tests { s.raft_state_mut().set_last_index(7); s.apply_state_mut().set_applied_index(7); write_task.raft_state = Some(s.raft_state().clone()); - if write_task.kv_wb.is_none() { - write_task.kv_wb = Some(s.engines.kv.write_batch()); - } - s.save_apply_state_to(write_task.kv_wb.as_mut().unwrap()) - .unwrap(); + let kv_wb = write_task + .extra_write + .ensure_v1(|| s.engines.kv.write_batch()); + s.save_apply_state_to(kv_wb).unwrap(); write_to_db_for_test(&s.engines, write_task); let term = s.term(7).unwrap(); compact_raft_log(&s.tag, s.entry_storage.apply_state_mut(), 7, term).unwrap(); From 8479cebfaac80672e7ef5a1ad40e940f7f1b7aba Mon Sep 17 
00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 3 Aug 2022 12:00:06 +0800 Subject: [PATCH 130/676] engine_test: add single-rocksdb TestTabletFactory (#13163) close tikv/tikv#13162 Signed-off-by: SpadeA-Tang --- components/engine_test/src/lib.rs | 127 +++++++++++++++++++++---- components/engine_traits/src/engine.rs | 26 +++-- src/server/engine_factory.rs | 43 ++++++--- src/server/engine_factory_v2.rs | 30 ++---- 4 files changed, 165 insertions(+), 61 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 979fbda17d0..7bdd87827e7 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -116,7 +116,7 @@ pub mod kv { root_path: String, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, - registry: Arc>>, + root_db: Arc>>, } impl TestTabletFactory { @@ -129,7 +129,104 @@ pub mod kv { root_path: root_path.to_string(), db_opt, cf_opts, - registry: Arc::new(Mutex::new(HashMap::default())), + root_db: Arc::new(Mutex::default()), + } + } + + fn create_tablet(&self, tablet_path: &Path) -> Result { + let kv_engine = KvTestEngine::new_kv_engine_opt( + tablet_path.to_str().unwrap(), + self.db_opt.clone(), + self.cf_opts.clone(), + )?; + Ok(kv_engine) + } + } + + impl TabletFactory for TestTabletFactory { + fn create_shared_db(&self) -> Result { + let tablet_path = self.tablet_path(0, 0); + let tablet = self.create_tablet(&tablet_path)?; + let mut root_db = self.root_db.lock().unwrap(); + root_db.replace(tablet.clone()); + Ok(tablet) + } + + fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { + let db = self.root_db.lock().unwrap(); + if let Some(cp) = db.as_ref() { + return Ok(cp.clone()); + } + + self.create_shared_db() + } + + fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { + self.open_tablet_raw(&self.tablet_path(0, 0), false).ok() + } + + fn open_tablet_cache_any(&self, _id: u64) -> Option { + self.open_tablet_cache(0, 
0) + } + + fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { + TabletFactory::create_tablet(self, 0, 0) + } + + fn exists_raw(&self, _path: &Path) -> bool { + false + } + + #[inline] + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + Path::new(&self.root_path).join(format!("tablets/{}_{}", id, suffix)) + } + + #[inline] + fn tablets_path(&self) -> PathBuf { + Path::new(&self.root_path).join("tablets") + } + + #[inline] + fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { + Ok(()) + } + + fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { + let db = self.root_db.lock().unwrap(); + let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity)?; + Ok(()) + } + } + + impl TabletAccessor for TestTabletFactory { + fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { + let db = self.root_db.lock().unwrap(); + let db = db.as_ref().unwrap(); + f(0, 0, db); + } + + fn is_single_engine(&self) -> bool { + true + } + } + + #[derive(Clone)] + pub struct TestTabletFactoryV2 { + inner: TestTabletFactory, + registry: Arc>>, + } + + impl TestTabletFactoryV2 { + pub fn new( + root_path: &str, + db_opt: DbOptions, + cf_opts: Vec<(&'static str, KvTestCfOptions)>, + ) -> Self { + Self { + inner: TestTabletFactory::new(root_path, db_opt, cf_opts), + registry: Arc::default(), } } } @@ -145,7 +242,7 @@ pub mod kv { (tablet_id, tablet_suffix) } - impl TabletFactory for TestTabletFactory { + impl TabletFactory for TestTabletFactoryV2 { fn create_tablet(&self, id: u64, suffix: u64) -> Result { let mut reg = self.registry.lock().unwrap(); if let Some(db) = reg.get(&(id, suffix)) { @@ -155,35 +252,27 @@ pub mod kv { db.as_inner().path() )); } + let tablet_path = self.tablet_path(id, suffix); - let tablet_path = tablet_path.to_str().unwrap(); - let kv_engine = KvTestEngine::new_kv_engine_opt( - tablet_path, - self.db_opt.clone(), 
- self.cf_opts.clone(), - )?; + let kv_engine = self.inner.create_tablet(&tablet_path)?; reg.insert((id, suffix), kv_engine.clone()); + Ok(kv_engine) } fn open_tablet(&self, id: u64, suffix: u64) -> Result { - let mut reg = self.registry.lock().unwrap(); + let reg = self.registry.lock().unwrap(); if let Some(db) = reg.get(&(id, suffix)) { return Ok(db.clone()); } let db_path = self.tablet_path(id, suffix); let db = self.open_tablet_raw(db_path.as_path(), false)?; - reg.insert((id, suffix), db.clone()); Ok(db) } fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { - let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Some(db.clone()); - } - None + self.registry.lock().unwrap().get(&(id, suffix)).cloned() } fn open_tablet_cache_any(&self, id: u64) -> Option { @@ -217,12 +306,12 @@ pub mod kv { #[inline] fn tablets_path(&self) -> PathBuf { - Path::new(&self.root_path).join("tablets") + Path::new(&self.inner.root_path).join("tablets") } #[inline] fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { - Path::new(&self.root_path).join(format!("tablets/{}_{}", id, suffix)) + Path::new(&self.inner.root_path).join(format!("tablets/{}_{}", id, suffix)) } #[inline] @@ -281,7 +370,7 @@ pub mod kv { } } - impl TabletAccessor for TestTabletFactory { + impl TabletAccessor for TestTabletFactoryV2 { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { let reg = self.registry.lock().unwrap(); diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index dc09b54fb6e..7add5e4d9b2 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -172,17 +172,10 @@ pub trait TabletFactory: TabletAccessor { /// Open a tablet by id and suffix from cache---that means it should already /// be opened. 
- fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { - if let Ok(engine) = self.open_tablet_raw(&self.tablet_path(id, suffix), false) { - return Some(engine); - } - None - } + fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option; /// Open a tablet by id and any suffix from cache - fn open_tablet_cache_any(&self, id: u64) -> Option { - self.open_tablet_cache(id, 0) - } + fn open_tablet_cache_any(&self, id: u64) -> Option; /// Open tablet by path and readonly flag fn open_tablet_raw(&self, path: &Path, readonly: bool) -> Result; @@ -242,21 +235,35 @@ where fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } + fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } + + fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { + Some(self.engine.as_ref().unwrap().clone()) + } + + fn open_tablet_cache_any(&self, _id: u64) -> Option { + Some(self.engine.as_ref().unwrap().clone()) + } + fn create_shared_db(&self) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } + fn destroy_tablet(&self, _id: u64, _suffix: u64) -> Result<()> { Ok(()) } + fn exists_raw(&self, _path: &Path) -> bool { true } + fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { PathBuf::from(&self.root_path) } + fn tablets_path(&self) -> PathBuf { PathBuf::from(&self.root_path) } @@ -271,6 +278,7 @@ where opt.set_block_cache_capacity(capacity) } } + impl TabletAccessor for DummyFactory where EK: CfOptionsExt + Clone + Send + 'static, diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 4e2edc13569..7ddf338d870 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -18,6 +18,7 @@ use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; +use super::engine_factory_v2::KvEngineFactoryV2; use crate::config::{DbConfig, TiKvConfig, DEFAULT_ROCKSDB_SUB_DIR}; struct 
FactoryInner { @@ -89,6 +90,17 @@ impl KvEngineFactoryBuilder { compact_event_sender: self.compact_event_sender.clone(), } } + + pub fn build_v2(self) -> KvEngineFactoryV2 { + let factory = KvEngineFactory { + inner: Arc::new(self.inner), + compact_event_sender: self.compact_event_sender.clone(), + }; + KvEngineFactoryV2 { + inner: factory, + registry: Arc::default(), + } + } } #[derive(Clone)] @@ -219,13 +231,22 @@ impl TabletFactory for KvEngineFactory { } fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { - if let Ok(db) = self.inner.root_db.lock() { - let cp = db.as_ref().unwrap().clone(); - return Ok(cp); + let db = self.inner.root_db.lock().unwrap(); + if let Some(cp) = db.as_ref() { + return Ok(cp.clone()); } + self.create_shared_db() } + fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { + self.open_tablet_raw(&self.tablet_path(0, 0), false).ok() + } + + fn open_tablet_cache_any(&self, _id: u64) -> Option { + self.open_tablet_cache(0, 0) + } + fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { TabletFactory::create_tablet(self, 0, 0) } @@ -233,9 +254,11 @@ impl TabletFactory for KvEngineFactory { fn exists_raw(&self, _path: &Path) -> bool { false } + fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { self.kv_engine_path() } + fn tablets_path(&self) -> PathBuf { self.kv_engine_path() } @@ -246,20 +269,18 @@ impl TabletFactory for KvEngineFactory { } fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - if let Ok(db) = self.inner.root_db.lock() { - let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - } + let db = self.inner.root_db.lock().unwrap(); + let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity)?; Ok(()) } } impl TabletAccessor for KvEngineFactory { fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - if let 
Ok(db) = self.inner.root_db.lock() { - let db = db.as_ref().unwrap(); - f(0, 0, db); - } + let db = self.inner.root_db.lock().unwrap(); + let db = db.as_ref().unwrap(); + f(0, 0, db); } fn is_single_engine(&self) -> bool { diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 094f6f5d5e6..5d26958ea41 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -15,8 +15,8 @@ const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; #[derive(Clone)] pub struct KvEngineFactoryV2 { - inner: KvEngineFactory, - registry: Arc>>, + pub inner: KvEngineFactory, + pub registry: Arc>>, } // Extract tablet id and tablet suffix from the path. @@ -49,7 +49,7 @@ impl TabletFactory for KvEngineFactoryV2 { } fn open_tablet(&self, id: u64, suffix: u64) -> Result { - let mut reg = self.registry.lock().unwrap(); + let reg = self.registry.lock().unwrap(); if let Some(db) = reg.get(&(id, suffix)) { return Ok(db.clone()); } @@ -57,16 +57,11 @@ impl TabletFactory for KvEngineFactoryV2 { let db_path = self.tablet_path(id, suffix); let db = self.open_tablet_raw(db_path.as_path(), false)?; debug!("open tablet"; "key" => ?(id, suffix)); - reg.insert((id, suffix), db.clone()); Ok(db) } fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { - let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Some(db.clone()); - } - None + self.registry.lock().unwrap().get(&(id, suffix)).cloned() } fn open_tablet_cache_any(&self, id: u64) -> Option { @@ -153,6 +148,7 @@ impl TabletFactory for KvEngineFactoryV2 { let new_engine = self.open_tablet_raw(db_path.as_path(), false); if new_engine.is_ok() { let (old_id, old_suffix) = get_id_and_suffix_from_path(path); + assert!(suffix > old_suffix); self.registry.lock().unwrap().remove(&(old_id, old_suffix)); } new_engine @@ -206,15 +202,6 @@ mod tests { }; } - impl KvEngineFactoryV2 { - pub fn new(inner: KvEngineFactory) -> Self { - KvEngineFactoryV2 { - inner, - 
registry: Arc::new(Mutex::new(HashMap::default())), - } - } - } - #[test] fn test_kvengine_factory() { let cfg = TEST_CONFIG.clone(); @@ -268,8 +255,8 @@ mod tests { if let Some(cache) = cache { builder = builder.block_cache(cache); } - let inner_factory = builder.build(); - let factory = KvEngineFactoryV2::new(inner_factory); + + let factory = builder.build_v2(); let tablet = factory.create_tablet(1, 10).unwrap(); let tablet2 = factory.open_tablet(1, 10).unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); @@ -314,8 +301,7 @@ mod tests { let env = cfg.build_shared_rocks_env(None, None).unwrap(); let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); - let inner_factory = builder.build(); - let factory = KvEngineFactoryV2::new(inner_factory); + let factory = builder.build_v2(); factory.create_tablet(1, 10).unwrap(); factory.create_tablet(2, 10).unwrap(); let mut count = 0; From 5c866c4685b27bf52296dc1d38e2671dc04fbe01 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 3 Aug 2022 14:32:06 +0800 Subject: [PATCH 131/676] raftstore: Implement observer on_compute_engine_size (#12948) ref tikv/tikv#12849 Implement observer on_compute_engine_size Signed-off-by: CalvinNeo Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 27 ++++ components/raftstore/src/coprocessor/mod.rs | 22 ++- components/raftstore/src/store/fsm/store.rs | 2 + components/raftstore/src/store/worker/pd.rs | 149 ++++++++++++------ 4 files changed, 152 insertions(+), 48 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index c752e629af1..6297722a996 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -143,6 +143,7 @@ impl_box_observer_g!( SplitCheckObserver, WrappedSplitCheckObserver ); +impl_box_observer!(BoxPdTaskObserver, PdTaskObserver, WrappedPdTaskObserver); 
impl_box_observer!(BoxRoleObserver, RoleObserver, WrappedRoleObserver); impl_box_observer!( BoxRegionChangeObserver, @@ -176,6 +177,7 @@ where region_change_observers: Vec>, cmd_observers: Vec>>, read_index_observers: Vec>, + pd_task_observers: Vec>, // TODO: add endpoint } @@ -191,6 +193,7 @@ impl Default for Registry { region_change_observers: Default::default(), cmd_observers: Default::default(), read_index_observers: Default::default(), + pd_task_observers: Default::default(), } } } @@ -237,6 +240,10 @@ impl Registry { push!(priority, cco, self.consistency_check_observers); } + pub fn register_pd_task_observer(&mut self, priority: u32, ro: BoxPdTaskObserver) { + push!(priority, ro, self.pd_task_observers); + } + pub fn register_role_observer(&mut self, priority: u32, ro: BoxRoleObserver) { push!(priority, ro, self.role_observers); } @@ -548,6 +555,15 @@ impl CoprocessorHost { Ok(hashes) } + pub fn on_compute_engine_size(&self) -> Option { + let mut store_size = None; + for observer in &self.registry.pd_task_observers { + let observer = observer.observer.inner(); + observer.on_compute_engine_size(&mut store_size); + } + store_size + } + pub fn on_role_change(&self, region: &Region, role_change: RoleChange) { loop_ob!( region, @@ -721,6 +737,12 @@ mod tests { } } + impl PdTaskObserver for TestCoprocessor { + fn on_compute_engine_size(&self, _: &mut Option) { + self.called.fetch_add(19, Ordering::SeqCst); + } + } + impl RoleObserver for TestCoprocessor { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, _: &RoleChange) { self.called.fetch_add(7, Ordering::SeqCst); @@ -795,6 +817,8 @@ mod tests { .register_query_observer(1, BoxQueryObserver::new(ob.clone())); host.registry .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(ob.clone())); + host.registry + .register_pd_task_observer(1, BoxPdTaskObserver::new(ob.clone())); host.registry .register_role_observer(1, BoxRoleObserver::new(ob.clone())); host.registry @@ -859,6 +883,9 @@ mod tests { 
admin_req.set_admin_request(AdminRequest::default()); host.pre_exec(®ion, &admin_req, 0, 0); assert_all!([&ob.called], &[119]); // 16 + + host.on_compute_engine_size(); + assert_all!([&ob.called], &[138]); // 19 } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index e7c351262fa..9f82c90968b 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -32,8 +32,8 @@ pub use self::{ consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, BoxSplitCheckObserver, - CoprocessorHost, Registry, + BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, + BoxSplitCheckObserver, CoprocessorHost, Registry, }, error::{Error, Result}, region_info_accessor::{ @@ -203,6 +203,24 @@ pub trait SplitCheckObserver: Coprocessor { ); } +/// Describes size information about all stores. +/// There is guarantee that capacity >= used + avail. +/// since some space can be reserved. +#[derive(Debug, Default)] +pub struct StoreSizeInfo { + /// The capacity of the store. + pub capacity: u64, + /// Size of actual data. + pub used: u64, + /// Available space that can be written with actual data. + pub avail: u64, +} + +pub trait PdTaskObserver: Coprocessor { + /// Compute capacity/used/available size of this store. 
+ fn on_compute_engine_size(&self, _: &mut Option) {} +} + pub struct RoleChange { pub state: StateRole, pub leader_id: u64, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index d4bb0a32266..b058d0bb35e 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1622,6 +1622,7 @@ impl RaftBatchSystem { let (raft_builder, apply_builder) = (builder.clone(), apply_poller_builder.clone()); let tag = format!("raftstore-{}", store.get_id()); + let coprocessor_host = builder.coprocessor_host.clone(); self.system.spawn(tag, builder); let mut mailboxes = Vec::with_capacity(region_peers.len()); let mut address = Vec::with_capacity(region_peers.len()); @@ -1669,6 +1670,7 @@ impl RaftBatchSystem { collector_reg_handle, region_read_progress, health_service, + coprocessor_host, ); assert!(workers.pd_worker.start_with_timer(pd_runner)); diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index d65cbcea8d4..9e5e54c185e 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -49,19 +49,22 @@ use tikv_util::{ }; use yatp::Remote; -use crate::store::{ - cmd_resp::new_error, - metrics::*, - peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, - transport::SignificantRouter, - util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}, - worker::{ - query_stats::QueryStats, - split_controller::{SplitInfo, TOP_N}, - AutoSplitController, ReadStats, SplitConfigChange, WriteStats, +use crate::{ + coprocessor::CoprocessorHost, + store::{ + cmd_resp::new_error, + metrics::*, + peer::{UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryForceLeaderSyncer}, + transport::SignificantRouter, + util::{is_epoch_stale, KeysInfoFormatter, LatencyInspector, RaftstoreDuration}, + worker::{ + query_stats::QueryStats, + split_controller::{SplitInfo, TOP_N}, + 
AutoSplitController, ReadStats, SplitConfigChange, WriteStats, + }, + Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, + RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }, - Callback, CasualMessage, Config, PeerMsg, RaftCmdExtraOpts, RaftCommand, RaftRouter, - RegionReadProgressRegistry, SignificantMsg, SnapManager, StoreInfo, StoreMsg, TxnExt, }; type RecordPairVec = Vec; @@ -902,6 +905,7 @@ where // The health status of the store is updated by the slow score mechanism. health_service: Option, curr_health_status: ServingStatus, + coprocessor_host: CoprocessorHost, } impl Runner @@ -926,6 +930,7 @@ where collector_reg_handle: CollectorRegHandle, region_read_progress: RegionReadProgressRegistry, health_service: Option, + coprocessor_host: CoprocessorHost, ) -> Runner { // Register the region CPU records collector. let mut region_cpu_records_collector = None; @@ -969,6 +974,7 @@ where slow_score: SlowScore::new(cfg.inspect_interval.0), health_service, curr_health_status: ServingStatus::Serving, + coprocessor_host, } } @@ -1179,18 +1185,6 @@ where store_report: Option, dr_autosync_status: Option, ) { - let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { - Err(e) => { - error!( - "get disk stat for rocksdb failed"; - "engine_path" => store_info.kv_engine.path(), - "err" => ?e - ); - return; - } - Ok(stats) => stats, - }; - let mut report_peers = HashMap::default(); for (region_id, region_peer) in &mut self.region_peers { let read_bytes = region_peer.read_bytes - region_peer.last_store_report_read_bytes; @@ -1218,35 +1212,21 @@ where } stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); - - let disk_cap = disk_stats.total_space(); - let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { - disk_cap - } else { - store_info.capacity + let (capacity, used_size, available) = match collect_engine_size( + &self.coprocessor_host, + 
Some(&store_info), + self.snap_mgr.get_total_snap_size().unwrap(), + ) { + Some((capacity, used_size, available)) => (capacity, used_size, available), + None => return, }; - stats.set_capacity(capacity); - let used_size = self.snap_mgr.get_total_snap_size().unwrap() - + store_info - .kv_engine - .get_engine_used_size() - .expect("kv engine used size") - + store_info - .raft_engine - .get_engine_size() - .expect("raft engine used size"); + stats.set_capacity(capacity); stats.set_used_size(used_size); - let mut available = capacity.checked_sub(used_size).unwrap_or_default(); - // We only care about rocksdb SST file size, so we should check disk available - // here. - available = cmp::min(available, disk_stats.available_space()); - if available == 0 { warn!("no available space"); } - stats.set_available(available); stats.set_bytes_read( self.store_stat.engine_total_bytes_read - self.store_stat.engine_last_total_bytes_read, @@ -2304,6 +2284,48 @@ fn collect_report_read_peer_stats( stats } +fn collect_engine_size( + coprocessor_host: &CoprocessorHost, + store_info: Option<&StoreInfo>, + snap_mgr_size: u64, +) -> Option<(u64, u64, u64)> { + if let Some(engine_size) = coprocessor_host.on_compute_engine_size() { + return Some((engine_size.capacity, engine_size.used, engine_size.avail)); + } + let store_info = store_info.unwrap(); + let disk_stats = match fs2::statvfs(store_info.kv_engine.path()) { + Err(e) => { + error!( + "get disk stat for rocksdb failed"; + "engine_path" => store_info.kv_engine.path(), + "err" => ?e + ); + return None; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let capacity = if store_info.capacity == 0 || disk_cap < store_info.capacity { + disk_cap + } else { + store_info.capacity + }; + let used_size = snap_mgr_size + + store_info + .kv_engine + .get_engine_used_size() + .expect("kv engine used size") + + store_info + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + let mut available = 
capacity.checked_sub(used_size).unwrap_or_default(); + // We only care about rocksdb SST file size, so we should check disk available + // here. + available = cmp::min(available, disk_stats.available_space()); + Some((capacity, used_size, available)) +} + fn get_read_query_num(stat: &pdpb::QueryStats) -> u64 { stat.get_get() + stat.get_coprocessor() + stat.get_scan() } @@ -2494,9 +2516,12 @@ mod tests { ); } + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use metapb::Peer; use resource_metering::{RawRecord, TagInfos}; + use crate::coprocessor::{BoxPdTaskObserver, Coprocessor, PdTaskObserver, StoreSizeInfo}; + #[test] fn test_calculate_region_cpu_records() { // region_id -> total_cpu_time_ms @@ -2600,4 +2625,36 @@ mod tests { assert_eq!(report.stats.get_read_qps(), expected); } } + + #[derive(Debug, Clone, Default)] + struct PdObserver {} + + impl Coprocessor for PdObserver {} + + impl PdTaskObserver for PdObserver { + fn on_compute_engine_size(&self, s: &mut Option) { + let _ = s.insert(StoreSizeInfo { + capacity: 444, + used: 111, + avail: 333, + }); + } + } + + #[test] + fn test_pd_task_observer() { + let mut host = CoprocessorHost::::default(); + let obs = PdObserver::default(); + host.registry + .register_pd_task_observer(1, BoxPdTaskObserver::new(obs)); + let store_size = collect_engine_size::(&host, None, 0); + let (cap, used, avail) = if let Some((cap, used, avail)) = store_size { + (cap, used, avail) + } else { + panic!("store_size should not be none"); + }; + assert_eq!(cap, 444); + assert_eq!(used, 111); + assert_eq!(avail, 333); + } } From 8be0b14d34382eaa7ff9814714af19043547466e Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 3 Aug 2022 18:26:06 +0800 Subject: [PATCH 132/676] *: update toolchain to 2022-07-31 (#13190) ref tikv/tikv#13008, ref tikv/tikv#13009 Signed-off-by: tabokie --- Cargo.lock | 8 +- Makefile | 1 + cmd/tikv-ctl/src/cmd.rs | 4 +- components/api_version/src/lib.rs | 4 +- .../backup-stream/src/checkpoint_manager.rs | 2 
+- .../backup-stream/src/metadata/client.rs | 4 +- .../src/metadata/store/slash_etc.rs | 2 +- components/backup-stream/src/metrics.rs | 2 +- components/backup-stream/src/router.rs | 17 +-- .../backup-stream/src/subscription_track.rs | 2 +- components/backup-stream/src/utils.rs | 2 +- components/batch-system/src/fsm.rs | 4 +- components/causal_ts/src/tso.rs | 14 +- components/cdc/src/channel.rs | 2 +- components/cdc/src/delegate.rs | 4 +- components/cdc/src/endpoint.rs | 2 +- .../cdc/tests/failpoints/test_resolve.rs | 2 +- components/cloud/aws/src/kms.rs | 2 +- components/cloud/aws/src/s3.rs | 32 +++-- components/codec/src/buffer.rs | 10 +- components/codec/src/byte.rs | 8 +- .../concurrency_manager/src/key_handle.rs | 4 +- components/concurrency_manager/src/lib.rs | 8 +- .../concurrency_manager/src/lock_table.rs | 2 +- components/coprocessor_plugin_api/src/util.rs | 4 +- components/encryption/src/config.rs | 6 +- .../encryption/src/encrypted_file/header.rs | 4 +- components/encryption/src/manager/mod.rs | 4 +- .../encryption/src/master_key/metadata.rs | 4 +- .../engine_rocks/src/compact_listener.rs | 2 +- components/engine_rocks/src/engine.rs | 2 +- components/engine_rocks/src/misc.rs | 5 +- .../engine_rocks/src/perf_context_impl.rs | 4 +- components/engine_rocks/src/properties.rs | 2 +- .../engine_rocks/src/range_properties.rs | 8 +- components/engine_rocks/src/ttl_properties.rs | 4 +- components/engine_traits/src/cf_defs.rs | 8 +- components/engine_traits/src/encryption.rs | 4 +- components/engine_traits/src/engine.rs | 3 +- components/engine_traits/src/errors.rs | 6 +- components/engine_traits/src/perf_context.rs | 2 +- .../engine_traits/src/sst_partitioner.rs | 6 +- components/engine_traits_tests/src/ctor.rs | 4 +- .../engine_traits_tests/src/delete_range.rs | 10 +- .../engine_traits_tests/src/iterator.rs | 64 ++++----- .../src/scenario_writes.rs | 6 +- components/engine_traits_tests/src/sst.rs | 24 ++-- .../engine_traits_tests/src/write_batch.rs | 40 +++--- 
components/error_code/src/lib.rs | 2 +- components/file_system/src/lib.rs | 10 +- components/file_system/src/rate_limiter.rs | 4 +- components/keys/src/lib.rs | 10 +- components/keys/src/rewrite.rs | 2 +- .../online_config_derive/src/lib.rs | 6 +- components/online_config/src/lib.rs | 4 +- components/raftstore-v2/src/router/message.rs | 2 +- .../src/coprocessor/region_info_accessor.rs | 2 +- .../src/coprocessor/split_check/size.rs | 2 +- components/raftstore/src/lib.rs | 1 - components/raftstore/src/store/config.rs | 75 +++++++---- .../raftstore/src/store/entry_storage.rs | 2 +- components/raftstore/src/store/fsm/apply.rs | 20 +-- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 8 +- components/raftstore/src/store/msg.rs | 2 +- components/raftstore/src/store/peer.rs | 4 +- .../raftstore/src/store/peer_storage.rs | 6 +- components/raftstore/src/store/read_queue.rs | 2 +- .../raftstore/src/store/region_snapshot.rs | 2 +- .../raftstore/src/store/replication_mode.rs | 3 +- components/raftstore/src/store/snap.rs | 22 ++-- components/raftstore/src/store/txn_ext.rs | 2 +- components/raftstore/src/store/util.rs | 4 +- .../src/store/worker/cleanup_snapshot.rs | 2 +- components/raftstore/src/store/worker/pd.rs | 12 +- .../raftstore/src/store/worker/split_check.rs | 2 +- .../src/store/worker/split_controller.rs | 2 +- components/resource_metering/src/config.rs | 6 +- components/resource_metering/src/lib.rs | 15 +-- components/resource_metering/src/model.rs | 4 +- components/sst_importer/src/sst_importer.rs | 10 +- components/sst_importer/src/sst_writer.rs | 6 +- components/test_raftstore/src/pd.rs | 2 +- .../test_raftstore/src/transport_simulate.rs | 4 +- components/test_sst_importer/src/lib.rs | 2 +- components/test_storage/src/assert_storage.rs | 73 +++++------ .../tidb_query_aggr/src/impl_max_min.rs | 2 +- components/tidb_query_aggr/src/lib.rs | 8 +- .../tidb_query_common/src/execute_stats.rs | 2 +- 
.../tidb_query_common/src/storage/range.rs | 6 +- .../src/storage/ranges_iter.rs | 2 +- .../tidb_query_datatype/src/codec/convert.rs | 6 +- .../src/codec/data_type/chunked_vec_bytes.rs | 2 +- .../src/codec/data_type/mod.rs | 6 +- .../src/codec/data_type/vector.rs | 2 +- .../tidb_query_datatype/src/codec/datum.rs | 4 +- .../src/codec/mysql/binary_literal.rs | 12 +- .../src/codec/mysql/decimal.rs | 2 +- .../src/codec/mysql/duration.rs | 2 +- .../src/codec/mysql/enums.rs | 6 +- .../src/codec/mysql/json/mod.rs | 4 +- .../src/codec/mysql/json/modifier.rs | 4 +- .../src/codec/mysql/json/serde.rs | 2 +- .../src/codec/mysql/time/mod.rs | 16 +-- .../src/codec/mysql/time/tz.rs | 2 +- .../src/codec/row/v2/row_slice.rs | 4 +- .../tidb_query_datatype/src/codec/table.rs | 14 +- .../tidb_query_datatype/src/def/eval_type.rs | 2 +- .../tidb_query_datatype/src/def/field_type.rs | 4 +- .../tidb_query_datatype/src/expr/ctx.rs | 2 +- .../src/index_scan_executor.rs | 38 +++--- .../src/limit_executor.rs | 2 +- .../src/projection_executor.rs | 2 +- .../src/selection_executor.rs | 2 +- .../src/table_scan_executor.rs | 12 +- .../tidb_query_expr/src/impl_arithmetic.rs | 14 +- components/tidb_query_expr/src/impl_cast.rs | 6 +- .../tidb_query_expr/src/impl_compare.rs | 2 +- .../tidb_query_expr/src/impl_compare_in.rs | 4 +- .../tidb_query_expr/src/impl_encryption.rs | 10 +- components/tidb_query_expr/src/impl_json.rs | 4 +- components/tidb_query_expr/src/impl_math.rs | 24 ++-- components/tidb_query_expr/src/impl_op.rs | 32 ++--- components/tidb_query_expr/src/impl_string.rs | 20 +-- components/tidb_query_expr/src/impl_time.rs | 4 +- .../tidb_query_expr/src/types/expr_builder.rs | 26 +--- .../tidb_query_expr/src/types/expr_eval.rs | 12 +- components/tikv_kv/src/btree_engine.rs | 2 +- components/tikv_kv/src/cursor.rs | 24 ++-- components/tikv_kv/src/lib.rs | 14 +- components/tikv_util/src/codec/bytes.rs | 4 +- components/tikv_util/src/config.rs | 8 +- components/tikv_util/src/future.rs | 2 +- 
components/tikv_util/src/lib.rs | 2 +- components/tikv_util/src/logger/file_log.rs | 2 +- components/tikv_util/src/logger/mod.rs | 10 +- components/tikv_util/src/metrics/mod.rs | 2 +- .../tikv_util/src/metrics/threads_linux.rs | 2 +- components/tikv_util/src/mpsc/batch.rs | 2 +- components/tikv_util/src/time.rs | 2 +- components/tikv_util/src/timer.rs | 2 +- components/tikv_util/src/worker/pool.rs | 2 +- .../tikv_util/src/yatp_pool/future_pool.rs | 30 ++--- components/tracker/src/lib.rs | 4 +- components/tracker/src/slab.rs | 2 +- components/txn_types/src/lock.rs | 6 +- components/txn_types/src/timestamp.rs | 2 +- components/txn_types/src/types.rs | 20 ++- fuzz/cli.rs | 2 +- rust-toolchain | 2 +- rustfmt.toml | 1 - scripts/check-docker-build | 22 ++-- scripts/check-license | 14 ++ scripts/check-redact-log | 16 +-- scripts/clippy | 58 ++++---- scripts/clippy-all | 18 +-- scripts/run-cargo.sh | 2 +- src/config.rs | 124 ++++++++---------- src/coprocessor/endpoint.rs | 4 +- .../interceptors/concurrency_limiter.rs | 6 +- src/server/config.rs | 22 ++-- src/server/debug.rs | 12 +- src/server/engine_factory_v2.rs | 4 +- src/server/gc_worker/gc_manager.rs | 4 +- src/server/gc_worker/gc_worker.rs | 20 ++- src/server/lock_manager/client.rs | 7 +- src/server/raftkv.rs | 10 +- src/server/resolve.rs | 2 +- src/server/server.rs | 2 +- src/server/snap.rs | 4 +- src/server/status_server/profile.rs | 6 +- src/storage/config.rs | 4 +- src/storage/mod.rs | 2 +- src/storage/mvcc/mod.rs | 2 +- src/storage/mvcc/reader/point_getter.rs | 10 +- src/storage/mvcc/reader/scanner/backward.rs | 4 +- src/storage/mvcc/reader/scanner/forward.rs | 4 +- src/storage/mvcc/txn.rs | 28 ++-- src/storage/txn/actions/check_txn_status.rs | 2 +- src/storage/txn/actions/commit.rs | 2 +- src/storage/txn/actions/tests.rs | 38 +++--- .../singleton_flow_controller.rs | 2 +- src/storage/txn/store.rs | 24 ++-- tests/failpoints/cases/test_conf_change.rs | 2 +- tests/failpoints/cases/test_encryption.rs | 2 +- 
tests/failpoints/cases/test_hibernate.rs | 2 +- tests/failpoints/cases/test_merge.rs | 2 +- tests/failpoints/cases/test_pd_client.rs | 2 +- tests/failpoints/cases/test_replica_read.rs | 6 +- tests/failpoints/cases/test_split_region.rs | 2 +- tests/failpoints/cases/test_storage.rs | 2 +- tests/failpoints/cases/test_transaction.rs | 6 +- .../failpoints/cases/test_transfer_leader.rs | 6 +- tests/failpoints/cases/test_ttl.rs | 4 +- .../failpoints/cases/test_unsafe_recovery.rs | 24 ++-- .../integrations/config/dynamic/gc_worker.rs | 2 +- .../config/dynamic/pessimistic_txn.rs | 2 +- .../integrations/config/dynamic/raftstore.rs | 2 +- .../integrations/config/test_config_client.rs | 12 +- tests/integrations/pd/test_rpc_client.rs | 4 +- .../integrations/raftstore/test_bootstrap.rs | 2 +- .../raftstore/test_early_apply.rs | 2 +- .../integrations/raftstore/test_lease_read.rs | 4 +- tests/integrations/raftstore/test_merge.rs | 6 +- .../raftstore/test_replica_read.rs | 14 +- .../raftstore/test_split_region.rs | 2 +- .../raftstore/test_unsafe_recovery.rs | 24 ++-- tests/integrations/server/kv_service.rs | 2 +- tests/integrations/server/raft_client.rs | 2 +- tests/integrations/server/security.rs | 2 +- .../integrations/storage/test_raft_storage.rs | 28 ++-- tests/integrations/storage/test_raftkv.rs | 2 +- tests/integrations/storage/test_titan.rs | 2 +- 213 files changed, 869 insertions(+), 970 deletions(-) create mode 100755 scripts/check-license diff --git a/Cargo.lock b/Cargo.lock index 9e0303726fb..c5d22fc6e61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3676,18 +3676,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ad3879ad3baf4e44784bc6a718a8698867bb991f8ce24d1bcbe2cfb4c3a75e" +checksum = "78203e83c48cffbe01e4a2d35d566ca4de445d79a85372fc64e378bfc812a260" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = 
"1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744b6f092ba29c3650faf274db506afd39944f48420f6c86b17cfe0ee1cb36bb" +checksum = "710faf75e1b33345361201d36d04e98ac1ed8909151a017ed384700836104c74" dependencies = [ "proc-macro2", "quote", diff --git a/Makefile b/Makefile index fb7bbf6052e..3229a307e7f 100644 --- a/Makefile +++ b/Makefile @@ -347,6 +347,7 @@ pre-clippy: unset-override clippy: pre-clippy @./scripts/check-redact-log @./scripts/check-docker-build + @./scripts/check-license @./scripts/clippy-all pre-audit: diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 7f459a4c127..2fec7ea9cef 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -1,13 +1,13 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::{borrow::ToOwned, lazy::SyncLazy, str, string::ToString, u64}; +use std::{borrow::ToOwned, str, string::ToString, sync::LazyLock, u64}; use clap::{crate_authors, AppSettings}; use engine_traits::CF_DEFAULT; use structopt::StructOpt; const RAW_KEY_HINT: &str = "Raw key (generally starts with \"z\") in escaped form"; -static VERSION_INFO: SyncLazy = SyncLazy::new(|| { +static VERSION_INFO: LazyLock = LazyLock::new(|| { let build_timestamp = option_env!("TIKV_BUILD_TIME"); tikv::tikv_version_info(build_timestamp) }); diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index 60f23455cc7..fb8fd13cbfd 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -176,7 +176,7 @@ macro_rules! dispatch_api_version { } /// The key mode inferred from the key prefix. -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum KeyMode { /// Raw key. 
Raw, @@ -235,7 +235,7 @@ pub enum KeyMode { /// | 0x12 0x34 0x56 | 0x00 0x00 0x00 0x00 0x00 0x00 0xff 0xff | 0x01 (0b00000001) | /// -------------------------------------------------------------------------------- /// ``` -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub struct RawValue> { /// The user value. pub user_value: T, diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 7dae680fa05..2874d548c5a 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -138,7 +138,7 @@ fn epoch_not_match(id: u64, sent: u64, real: u64) -> PbError { err } -#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)] +#[derive(Debug, PartialEq, Hash, Clone, Copy)] /// A simple region id, but versioned. pub struct RegionIdWithVersion { pub region_id: u64, diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 2732952930c..e92addd2992 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -72,7 +72,7 @@ impl PartialEq for MetadataEvent { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum CheckpointProvider { Store(u64), Region { id: u64, version: u64 }, @@ -82,7 +82,7 @@ pub enum CheckpointProvider { /// The polymorphic checkpoint. /// The global checkpoint should be the minimal checkpoint of all checkpoints. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub struct Checkpoint { pub provider: CheckpointProvider, pub ts: TimeStamp, diff --git a/components/backup-stream/src/metadata/store/slash_etc.rs b/components/backup-stream/src/metadata/store/slash_etc.rs index 2ae4c05dfaf..0d6484b0c1e 100644 --- a/components/backup-stream/src/metadata/store/slash_etc.rs +++ b/components/backup-stream/src/metadata/store/slash_etc.rs @@ -49,7 +49,7 @@ impl std::fmt::Debug for Key { } /// A value (maybe tombstone.) -#[derive(Debug, Eq, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone)] enum Value { Val(Vec), Del, diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index de150ef2395..c3f99b8617e 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -6,7 +6,7 @@ use prometheus::*; /// The status of a task. /// The ordering of this imples the priority for presenting to the user. /// max(TASK_STATUS) of all stores would be probably the state of the task. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum TaskStatus { Running = 0, Paused, diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 05e49d232a9..d5486cecddb 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -76,8 +76,8 @@ impl TaskSelector { pub fn reference(&self) -> TaskSelectorRef<'_> { match self { TaskSelector::ByName(s) => TaskSelectorRef::ByName(s), - TaskSelector::ByKey(k) => TaskSelectorRef::ByKey(&*k), - TaskSelector::ByRange(s, e) => TaskSelectorRef::ByRange(&*s, &*e), + TaskSelector::ByKey(k) => TaskSelectorRef::ByKey(k), + TaskSelector::ByRange(s, e) => TaskSelectorRef::ByRange(s, e), TaskSelector::All => TaskSelectorRef::All, } } @@ -99,9 +99,9 @@ impl<'a> TaskSelectorRef<'a> { ) -> bool { match self { TaskSelectorRef::ByName(name) => task_name == name, - TaskSelectorRef::ByKey(k) => task_range.any(|(s, e)| utils::is_in_range(k, (&*s, &*e))), + TaskSelectorRef::ByKey(k) => task_range.any(|(s, e)| utils::is_in_range(k, (s, e))), TaskSelectorRef::ByRange(x1, y1) => { - task_range.any(|(x2, y2)| utils::is_overlapping((x1, y1), (&*x2, &*y2))) + task_range.any(|(x2, y2)| utils::is_overlapping((x1, y1), (x2, y2))) } TaskSelectorRef::All => true, } @@ -652,15 +652,14 @@ impl TempFileKey { } fn get_file_type(&self) -> FileType { - let file_type = match self.cmd_type { + match self.cmd_type { CmdType::Put => FileType::Put, CmdType::Delete => FileType::Delete, _ => { warn!("error cmdtype"; "cmdtype" => ?self.cmd_type); panic!("error CmdType"); } - }; - file_type + } } /// The full name of the file owns the key. 
@@ -1787,9 +1786,7 @@ mod tests { reader: UnpinReader, content_length: u64, ) -> io::Result<()> { - if let Err(e) = (self.error_on_write)() { - return Err(e); - } + (self.error_on_write)()?; self.inner.write(name, reader, content_length).await } diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index aa9f35705fb..2287dedc6c5 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -15,7 +15,7 @@ use crate::{debug, metrics::TRACK_REGION, utils}; #[derive(Clone, Default, Debug)] pub struct SubscriptionTracer(Arc>); -#[derive(Debug, Eq, PartialEq, Clone, Copy)] +#[derive(Debug, PartialEq, Clone, Copy)] pub enum SubscriptionState { /// When it is newly added (maybe after split or leader transfered from /// other store), without any flush. diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 89f21567801..ac1b3dec168 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -698,7 +698,7 @@ mod test { drop(work); }); } - let _ = block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); + block_on(tokio::time::timeout(Duration::from_secs(20), wg.wait())).unwrap(); assert_eq!(cnt.load(Ordering::SeqCst), 0, "{:?}@{}", c, i); } } diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index cee3a7b4020..6fb4fe91539 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -19,7 +19,7 @@ const NOTIFYSTATE_IDLE: usize = 1; // The FSM is expected to be dropped. 
const NOTIFYSTATE_DROP: usize = 2; -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub enum Priority { Low, Normal, @@ -155,7 +155,7 @@ impl FsmState { let ptr = self.data.swap(ptr::null_mut(), Ordering::SeqCst); if !ptr.is_null() { unsafe { - Box::from_raw(ptr); + let _ = Box::from_raw(ptr); } } } diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 35e6bffd11b..b6ee5d177e1 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -433,7 +433,7 @@ pub mod tests { batch.renew(10, TimeStamp::compose(1, 110)).unwrap(); // timestamp fall back - assert!(batch.renew(10, TimeStamp::compose(1, 119)).is_err()); + batch.renew(10, TimeStamp::compose(1, 119)).unwrap_err(); batch.renew(10, TimeStamp::compose(1, 200)).unwrap(); for logical in 191..=195 { @@ -500,7 +500,7 @@ pub mod tests { for ts in 1101..=1200u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); + provider.get_ts().unwrap_err(); provider.flush().unwrap(); // allocated: [1201, 1400] assert_eq!(provider.batch_size(), 200); @@ -517,7 +517,7 @@ pub mod tests { for ts in 1401..=1500u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); + provider.get_ts().unwrap_err(); // renew on used-up for ts in 1501..=2500u64 { @@ -560,23 +560,23 @@ pub mod tests { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.flush().is_err()); + provider.flush().unwrap_err(); for ts in 1101..=1300u64 { // renew on used-up, allocated: [1101, 1300] assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } pd_cli.trigger_tso_failure(); - assert!(provider.get_ts().is_err()); // renew fail on used-up + provider.get_ts().unwrap_err(); // renew fail on used-up pd_cli.trigger_tso_failure(); - assert!(provider.flush().is_err()); + provider.flush().unwrap_err(); provider.flush().unwrap(); // allocated: [1301, 
1700] pd_cli.trigger_tso_failure(); // make renew fail to verify used-up for ts in 1301..=1700u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } - assert!(provider.get_ts().is_err()); + provider.get_ts().unwrap_err(); } } diff --git a/components/cdc/src/channel.rs b/components/cdc/src/channel.rs index 3b1894eb6fc..595632c306e 100644 --- a/components/cdc/src/channel.rs +++ b/components/cdc/src/channel.rs @@ -266,7 +266,7 @@ pub fn channel(buffer: usize, memory_quota: MemoryQuota) -> (Sink, Drain) { ) } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub enum SendError { Full, Disconnected, diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index f6ef0659fe0..fc379916232 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -45,7 +45,7 @@ use crate::{ static DOWNSTREAM_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier of a Downstream. -#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Hash)] pub struct DownstreamID(usize); impl DownstreamID { @@ -1229,7 +1229,7 @@ mod tests { assert!(delegate.handle.is_observing()); // Subscribe with an invalid epoch. - assert!(delegate.subscribe(new_downstream(1, 2)).is_err()); + delegate.subscribe(new_downstream(1, 2)).unwrap_err(); assert_eq!(delegate.downstreams().len(), 1); // Unsubscribe all downstreams. 
diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 4a957774a23..9d15c347e32 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -2568,7 +2568,7 @@ mod tests { err: Some(Error::request(err_header.clone())), }; suite.run(Task::Deregister(deregister)); - assert!(channel::recv_timeout(&mut rx, Duration::from_millis(200)).is_err()); + channel::recv_timeout(&mut rx, Duration::from_millis(200)).unwrap_err(); assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = Deregister::Downstream { diff --git a/components/cdc/tests/failpoints/test_resolve.rs b/components/cdc/tests/failpoints/test_resolve.rs index 75326ac0fb5..560eb68ba44 100644 --- a/components/cdc/tests/failpoints/test_resolve.rs +++ b/components/cdc/tests/failpoints/test_resolve.rs @@ -260,7 +260,7 @@ fn test_joint_confchange() { receive_resolved_ts(&receive_event); tx.send(()).unwrap(); }); - assert!(rx.recv_timeout(Duration::from_secs(2)).is_err()); + rx.recv_timeout(Duration::from_secs(2)).unwrap_err(); fail::remove(update_region_fp); fail::remove(deregister_fp); diff --git a/components/cloud/aws/src/kms.rs b/components/cloud/aws/src/kms.rs index 3d5d6a3fdea..040db46bb53 100644 --- a/components/cloud/aws/src/kms.rs +++ b/components/cloud/aws/src/kms.rs @@ -86,7 +86,7 @@ impl KmsProvider for AwsKms { // possible that a wrong master key has been used, or other error otherwise. async fn decrypt_data_key(&self, data_key: &EncryptedKey) -> Result> { let decrypt_request = DecryptRequest { - ciphertext_blob: bytes::Bytes::copy_from_slice(&*data_key), + ciphertext_blob: bytes::Bytes::copy_from_slice(data_key), // Use default algorithm SYMMETRIC_DEFAULT. encryption_algorithm: None, // Use key_id encoded in ciphertext. 
diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index ef13749ccea..25499d89c61 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -707,15 +707,14 @@ mod tests { // inject put error let s3_put_obj_err_fp = "s3_put_obj_err"; fail::cfg(s3_put_obj_err_fp, "return").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap_err(); fail::remove(s3_put_obj_err_fp); - assert!(resp.is_err()); // test timeout let s3_timeout_injected_fp = "s3_timeout_injected"; @@ -725,16 +724,15 @@ mod tests { fail::cfg(s3_timeout_injected_fp, "return(100)").unwrap(); // inject 200ms delay fail::cfg(s3_sleep_injected_fp, "return(200)").unwrap(); - let resp = s - .put( - "mykey", - PutResource(Box::new(magic_contents.as_bytes())), - magic_contents.len() as u64, - ) - .await; - fail::remove(s3_sleep_injected_fp); // timeout occur due to delay 200ms - assert!(resp.is_err()); + s.put( + "mykey", + PutResource(Box::new(magic_contents.as_bytes())), + magic_contents.len() as u64, + ) + .await + .unwrap_err(); + fail::remove(s3_sleep_injected_fp); // inject 50ms delay fail::cfg(s3_sleep_injected_fp, "return(50)").unwrap(); diff --git a/components/codec/src/buffer.rs b/components/codec/src/buffer.rs index 4010ecdf04f..f40ee1fae4f 100644 --- a/components/codec/src/buffer.rs +++ b/components/codec/src/buffer.rs @@ -343,7 +343,7 @@ mod tests { // Read more bytes than available buffer.set_position(39); - assert!(buffer.read_bytes(2).is_err()); + buffer.read_bytes(2).unwrap_err(); assert_eq!(buffer.position(), 39); assert_eq!(buffer.bytes(), &base[39..40]); } @@ -378,14 +378,14 @@ mod tests { assert_eq!(buffer, &base[21..40]); assert_eq!(buffer.bytes(), &base[21..40]); - assert!(buffer.read_bytes(20).is_err()); + 
buffer.read_bytes(20).unwrap_err(); buffer.advance(19); assert_eq!(buffer, &[]); assert_eq!(buffer.bytes(), &[]); assert_eq!(buffer.read_bytes(0).unwrap(), &[]); - assert!(buffer.read_bytes(1).is_err()); + buffer.read_bytes(1).unwrap_err(); } #[test] @@ -424,7 +424,7 @@ mod tests { assert_eq!(buffer.position(), 20); // Write more bytes than available size - assert!(buffer.write_bytes(&base_write[20..]).is_err()); + buffer.write_bytes(&base_write[20..]).unwrap_err(); assert_eq!(&buffer.get_ref()[0..20], &base_write[0..20]); assert_eq!(&buffer.get_ref()[20..], &base[20..]); assert_eq!(buffer.position(), 20); @@ -522,7 +522,7 @@ mod tests { let mut buf_slice = &mut buffer[20..]; // Buffer remain 20, write 21 bytes shall fail. - assert!(buf_slice.write_bytes(&base_write[20..41]).is_err()); + buf_slice.write_bytes(&base_write[20..41]).unwrap_err(); // Write remaining 20 bytes buf_slice.bytes_mut(20)[..20].clone_from_slice(&base_write[20..40]); diff --git a/components/codec/src/byte.rs b/components/codec/src/byte.rs index 63143938c13..aa7baba9e75 100644 --- a/components/codec/src/byte.rs +++ b/components/codec/src/byte.rs @@ -971,7 +971,7 @@ mod tests { let result = panic_hook::recover_safe(move || { let _ = MemComparableByteCodec::encode_all(src.as_slice(), dest.as_mut_slice()); }); - assert!(result.is_err()); + result.unwrap_err(); let mut src_in_place = vec![0; dest_len]; let result = panic_hook::recover_safe(move || { @@ -980,7 +980,7 @@ mod tests { src_len, ); }); - assert!(result.is_err()); + result.unwrap_err(); } } @@ -1141,7 +1141,7 @@ mod tests { invalid_src.as_slice(), dest.as_mut_slice(), ); - assert!(result.is_err()); + result.unwrap_err(); } } @@ -1162,7 +1162,7 @@ mod tests { dest.as_mut_slice(), ); }); - assert!(result.is_err()); + result.unwrap_err(); } { let mut dest = vec![0; src.len()]; diff --git a/components/concurrency_manager/src/key_handle.rs b/components/concurrency_manager/src/key_handle.rs index f34b29b0f37..c7aebbc49e0 100644 --- 
a/components/concurrency_manager/src/key_handle.rs +++ b/components/concurrency_manager/src/key_handle.rs @@ -39,7 +39,7 @@ impl KeyHandle { } pub fn with_lock(&self, f: impl FnOnce(&Option) -> T) -> T { - f(&*self.lock_store.lock()) + f(&self.lock_store.lock()) } /// Set the LockTable that the KeyHandle is in. @@ -80,7 +80,7 @@ impl KeyHandleGuard { } pub fn with_lock(&self, f: impl FnOnce(&mut Option) -> T) -> T { - f(&mut *self.handle.lock_store.lock()) + f(&mut self.handle.lock_store.lock()) } pub(crate) fn handle(&self) -> &Arc { diff --git a/components/concurrency_manager/src/lib.rs b/components/concurrency_manager/src/lib.rs index b80501b5433..342f2139e08 100644 --- a/components/concurrency_manager/src/lib.rs +++ b/components/concurrency_manager/src/lib.rs @@ -137,7 +137,8 @@ mod tests { let concurrency_manager = ConcurrencyManager::new(1.into()); let keys: Vec<_> = [b"c", b"a", b"b"] .iter() - .map(|k| Key::from_raw(*k)) + .copied() + .map(|k| Key::from_raw(k)) .collect(); let guards = concurrency_manager.lock_keys(keys.iter()).await; for (key, guard) in keys.iter().zip(&guards) { @@ -181,8 +182,9 @@ mod tests { vec![20, 40, 30], vec![30, 20, 40], ]; - let keys: Vec<_> = vec![b"a", b"b", b"c"] - .into_iter() + let keys: Vec<_> = [b"a", b"b", b"c"] + .iter() + .copied() .map(|k| Key::from_raw(k)) .collect(); diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index 4169537840e..bf7a224aa28 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -57,7 +57,7 @@ impl LockTable { ) -> Result<(), E> { if let Some(lock_ref) = self.get(key) { return lock_ref.with_lock(|lock| { - if let Some(lock) = &*lock { + if let Some(lock) = lock { return check_fn(lock); } Ok(()) diff --git a/components/coprocessor_plugin_api/src/util.rs b/components/coprocessor_plugin_api/src/util.rs index 816b0d12162..606082c0c4e 100644 --- 
a/components/coprocessor_plugin_api/src/util.rs +++ b/components/coprocessor_plugin_api/src/util.rs @@ -32,7 +32,7 @@ pub type PluginGetPluginInfoSignature = extern "C" fn() -> PluginInfo; /// [`declare_plugin!(...)`](declare_plugin) and will be used by TiKV when a /// plugin is loaded to determine whether there are compilation mismatches. #[repr(C)] -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct BuildInfo { /// Version of the [`coprocessor_plugin_api`](crate) crate that was used to /// compile this plugin. @@ -55,7 +55,7 @@ impl BuildInfo { /// Information about the plugin, like its name and version. #[repr(C)] -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq)] pub struct PluginInfo { /// The name of the plugin. pub name: &'static str, diff --git a/components/encryption/src/config.rs b/components/encryption/src/config.rs index 4f83a72855f..3fff9064f58 100644 --- a/components/encryption/src/config.rs +++ b/components/encryption/src/config.rs @@ -39,14 +39,14 @@ impl Default for EncryptionConfig { } } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct FileConfig { pub path: String, } -#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, Eq, OnlineConfig)] +#[derive(Clone, Default, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct KmsConfig { @@ -68,7 +68,7 @@ impl KmsConfig { } } -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "kebab-case", tag = "type")] pub enum MasterKeyConfig { // Store encryption metadata as plaintext. Data still get encrypted. 
Not allowed to use if diff --git a/components/encryption/src/encrypted_file/header.rs b/components/encryption/src/encrypted_file/header.rs index 1456f451f62..420b3076adb 100644 --- a/components/encryption/src/encrypted_file/header.rs +++ b/components/encryption/src/encrypted_file/header.rs @@ -7,7 +7,7 @@ use tikv_util::box_err; use crate::Result; -#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[derive(Clone, Copy, PartialEq, Debug)] pub enum Version { // The content only contains the encrypted part. V1 = 1, @@ -39,7 +39,7 @@ impl Version { /// | | Reserved (3 bytes) /// | Version (1 bytes) /// ``` -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct Header { version: Version, crc32: u32, diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0dcdbffdb95..58a3a7a66e5 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -494,7 +494,7 @@ impl DataKeyManager { Dicts::open( &args.dict_path, args.rotation_period, - &*master_key, + master_key, args.enable_file_dictionary_log, args.file_dictionary_rewrite_threshold, ), @@ -560,7 +560,7 @@ impl DataKeyManager { )) })?; // Rewrite key_dict after replace master key. - dicts.save_key_dict(&*master_key)?; + dicts.save_key_dict(master_key)?; info!("encryption: persisted result after replace master key."); Ok(dicts) diff --git a/components/encryption/src/master_key/metadata.rs b/components/encryption/src/master_key/metadata.rs index 8537a2416e3..38518cf0b34 100644 --- a/components/encryption/src/master_key/metadata.rs +++ b/components/encryption/src/master_key/metadata.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialEq)] pub enum MetadataKey { Method, Iv, @@ -27,7 +27,7 @@ impl MetadataKey { } } -#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Copy, Clone, Debug, Hash, PartialEq)] pub enum MetadataMethod { Plaintext, Aes256Gcm, diff --git a/components/engine_rocks/src/compact_listener.rs b/components/engine_rocks/src/compact_listener.rs index 5fc7a4e92f2..e679410c8b9 100644 --- a/components/engine_rocks/src/compact_listener.rs +++ b/components/engine_rocks/src/compact_listener.rs @@ -197,7 +197,7 @@ impl CompactedEvent for RocksCompactedEvent { } fn cf(&self) -> &str { - &*self.cf + &self.cf } } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 9c995144efa..13ae38b6afb 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -250,7 +250,7 @@ mod tests { engine.put_cf(cf, b"k1", b"v2").unwrap(); assert_eq!(&*engine.get_value(b"k1").unwrap().unwrap(), b"v1"); - assert!(engine.get_value_cf("foo", b"k1").is_err()); + engine.get_value_cf("foo", b"k1").unwrap_err(); assert_eq!(&*engine.get_value_cf(cf, b"k1").unwrap().unwrap(), b"v2"); } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index fd695bb4d2c..3e204bbc49f 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -400,10 +400,7 @@ mod tests { let mut kvs_left: Vec<_> = kvs; for r in ranges { - kvs_left = kvs_left - .into_iter() - .filter(|k| k.0 < r.start_key || k.0 >= r.end_key) - .collect(); + kvs_left.retain(|k| k.0 < r.start_key || k.0 >= r.end_key); } check_data(&db, ALL_CFS, kvs_left.as_slice()); } diff --git a/components/engine_rocks/src/perf_context_impl.rs b/components/engine_rocks/src/perf_context_impl.rs index 543e116d8ac..59086127154 100644 --- a/components/engine_rocks/src/perf_context_impl.rs +++ 
b/components/engine_rocks/src/perf_context_impl.rs @@ -185,10 +185,10 @@ impl PerfContextStatistics { if self.perf_level == PerfLevel::Uninitialized { match self.kind { PerfContextKind::Storage(_) | PerfContextKind::Coprocessor(_) => { - set_perf_flags(&*DEFAULT_READ_PERF_FLAGS) + set_perf_flags(&DEFAULT_READ_PERF_FLAGS) } PerfContextKind::RaftstoreStore | PerfContextKind::RaftstoreApply => { - set_perf_flags(&*DEFAULT_WRITE_PERF_FLAGS) + set_perf_flags(&DEFAULT_WRITE_PERF_FLAGS) } } } else { diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index 8d049112f92..41e13a813e6 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -130,7 +130,7 @@ impl<'a> DecodeProperties for UserCollectedPropertiesDecoder<'a> { } } -#[derive(Debug, Clone, PartialEq, Eq, Copy)] +#[derive(Debug, Clone, PartialEq, Copy)] pub enum RangeOffsetKind { Size, Keys, diff --git a/components/engine_rocks/src/range_properties.rs b/components/engine_rocks/src/range_properties.rs index 17d0805340d..101a004982a 100644 --- a/components/engine_rocks/src/range_properties.rs +++ b/components/engine_rocks/src/range_properties.rs @@ -58,10 +58,10 @@ impl RangePropertiesExt for RocksEngine { let keys = props.get_approximate_keys_in_range(start_key, end_key); format!( "{}:{}", - Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k), + .unwrap_or(k), keys ) }) @@ -118,10 +118,10 @@ impl RangePropertiesExt for RocksEngine { let size = props.get_approximate_size_in_range(start_key, end_key); format!( "{}:{}", - Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k), + .unwrap_or(k), size ) }) diff --git a/components/engine_rocks/src/ttl_properties.rs b/components/engine_rocks/src/ttl_properties.rs index 5dd51d8cd97..eb4641cc102 100644 --- a/components/engine_rocks/src/ttl_properties.rs +++ b/components/engine_rocks/src/ttl_properties.rs @@ 
-182,10 +182,10 @@ mod tests { } let case2 = [("zr\0a", 0)]; - assert!(get_properties(&case2).is_err()); + get_properties(&case2).unwrap_err(); let case3 = []; - assert!(get_properties(&case3).is_err()); + get_properties(&case3).unwrap_err(); let case4 = [("zr\0a", 1)]; let props = get_properties(&case4).unwrap(); diff --git a/components/engine_traits/src/cf_defs.rs b/components/engine_traits/src/cf_defs.rs index f47a63e69e3..e3fe95ec3b6 100644 --- a/components/engine_traits/src/cf_defs.rs +++ b/components/engine_traits/src/cf_defs.rs @@ -14,11 +14,5 @@ pub fn name_to_cf(name: &str) -> Option { if name.is_empty() { return Some(CF_DEFAULT); } - for c in ALL_CFS { - if name == *c { - return Some(c); - } - } - - None + ALL_CFS.iter().copied().find(|c| name == *c) } diff --git a/components/engine_traits/src/encryption.rs b/components/engine_traits/src/encryption.rs index 41a0f97fb36..16f29d16d75 100644 --- a/components/engine_traits/src/encryption.rs +++ b/components/engine_traits/src/encryption.rs @@ -12,7 +12,7 @@ pub trait EncryptionKeyManager: Sync + Send { fn link_file(&self, src_fname: &str, dst_fname: &str) -> Result<()>; } -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq)] pub struct FileEncryptionInfo { pub method: EncryptionMethod, pub key: Vec, @@ -46,7 +46,7 @@ impl FileEncryptionInfo { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq)] pub enum EncryptionMethod { Unknown = 0, Plaintext = 1, diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 7add5e4d9b2..8d991f1cfeb 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -328,8 +328,7 @@ mod tests { err.add_result(1, 1, Err(Status::with_code(Code::Aborted).into())); err.add_result(1, 1, Err(Status::with_code(Code::NotFound).into())); err.add_result(1, 1, Ok(())); - let r = err.take_result(); - assert!(r.is_err()); + err.take_result().unwrap_err(); 
assert_eq!(err.get_error_count(), 2); } } diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 6348db22174..6784891921b 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -7,7 +7,7 @@ use raft::{Error as RaftError, StorageError}; use thiserror::Error; #[repr(u8)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] pub enum Code { Ok = 0, NotFound = 1, @@ -28,7 +28,7 @@ pub enum Code { } #[repr(u8)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] pub enum SubCode { None = 0, MutexTimeout = 1, @@ -43,7 +43,7 @@ pub enum SubCode { } #[repr(u8)] -#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] +#[derive(Debug, Copy, Clone, Hash, PartialEq)] pub enum Severity { NoError = 0, SoftError = 1, diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index dfa5aa967b7..56351fbeca5 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -44,7 +44,7 @@ pub trait PerfContextExt { /// /// This is a leaky abstraction that supports the encapsulation of metrics /// reporting by the subsystems that use PerfContext. 
-#[derive(Eq, PartialEq, Copy, Clone, Debug)] +#[derive(PartialEq, Copy, Clone, Debug)] pub enum PerfContextKind { RaftstoreApply, RaftstoreStore, diff --git a/components/engine_traits/src/sst_partitioner.rs b/components/engine_traits/src/sst_partitioner.rs index f41664403d1..bc6ec13a4eb 100644 --- a/components/engine_traits/src/sst_partitioner.rs +++ b/components/engine_traits/src/sst_partitioner.rs @@ -2,20 +2,20 @@ use std::ffi::CString; -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct SstPartitionerRequest<'a> { pub prev_user_key: &'a [u8], pub current_user_key: &'a [u8], pub current_output_file_size: u64, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub enum SstPartitionerResult { NotRequired, Required, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(Clone, Debug, PartialEq)] pub struct SstPartitionerContext<'a> { pub is_full_compaction: bool, pub is_manual_compaction: bool, diff --git a/components/engine_traits_tests/src/ctor.rs b/components/engine_traits_tests/src/ctor.rs index 2ab7a7360a7..ab1eea4d958 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -67,7 +67,7 @@ fn new_engine_readonly_dir() { let path = path.to_str().unwrap(); let err = KvTestEngine::new_kv_engine(path, ALL_CFS); - assert!(err.is_err()); + err.unwrap_err(); } #[test] @@ -88,5 +88,5 @@ fn new_engine_opt_readonly_dir() { let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let err = KvTestEngine::new_kv_engine_opt(path, db_opts, cf_opts); - assert!(err.is_err()); + err.unwrap_err(); } diff --git a/components/engine_traits_tests/src/delete_range.rs b/components/engine_traits_tests/src/delete_range.rs index c2b87395d6a..bdfba737048 100644 --- a/components/engine_traits_tests/src/delete_range.rs +++ b/components/engine_traits_tests/src/delete_range.rs @@ -8,10 +8,8 @@ use super::default_engine; #[test] fn delete_range_cf_bad_cf() { let db = 
default_engine(); - assert!( - recover_safe(|| { - db.engine.delete_range_cf("bogus", b"a", b"b").unwrap(); - }) - .is_err() - ); + recover_safe(|| { + db.engine.delete_range_cf("bogus", b"a", b"b").unwrap(); + }) + .unwrap_err(); } diff --git a/components/engine_traits_tests/src/iterator.rs b/components/engine_traits_tests/src/iterator.rs index 96709c3fe29..714ca4cb0b4 100644 --- a/components/engine_traits_tests/src/iterator.rs +++ b/components/engine_traits_tests/src/iterator.rs @@ -15,20 +15,16 @@ where assert_eq!(iter.valid().unwrap(), false); - assert!(iter.prev().is_err()); - assert!(iter.next().is_err()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + iter.prev().unwrap_err(); + iter.next().unwrap_err(); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); assert_eq!(iter.seek_to_first().unwrap(), false); assert_eq!(iter.seek_to_last().unwrap(), false); @@ -84,18 +80,14 @@ where assert!(!iter.valid().unwrap()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); } #[test] @@ -146,18 +138,14 @@ where assert!(!iter.valid().unwrap()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); } #[test] diff --git a/components/engine_traits_tests/src/scenario_writes.rs b/components/engine_traits_tests/src/scenario_writes.rs index eb05c107c1d..1e52f9400d2 100644 --- a/components/engine_traits_tests/src/scenario_writes.rs +++ b/components/engine_traits_tests/src/scenario_writes.rs @@ -10,7 +10,7 @@ use panic_hook::recover_safe; use 
super::engine_cfs; #[allow(clippy::enum_variant_names)] -#[derive(Eq, PartialEq)] +#[derive(PartialEq)] enum WriteScenario { NoCf, DefaultCf, @@ -279,9 +279,9 @@ scenario_test! { delete_range_reverse_range { db.put(b"c", b"").unwrap(); db.put(b"d", b"").unwrap(); - assert!(recover_safe(|| { + recover_safe(|| { db.delete_range(b"d", b"b").unwrap(); - }).is_err()); + }).unwrap_err(); assert!(db.get_value(b"b").unwrap().is_some()); assert!(db.get_value(b"c").unwrap().is_some()); diff --git a/components/engine_traits_tests/src/sst.rs b/components/engine_traits_tests/src/sst.rs index 231e12ea785..ce4160e5ddc 100644 --- a/components/engine_traits_tests/src/sst.rs +++ b/components/engine_traits_tests/src/sst.rs @@ -158,20 +158,16 @@ fn delete() -> Result<()> { assert_eq!(iter.valid()?, false); - assert!(iter.prev().is_err()); - assert!(iter.next().is_err()); - assert!( - recover_safe(|| { - iter.key(); - }) - .is_err() - ); - assert!( - recover_safe(|| { - iter.value(); - }) - .is_err() - ); + iter.prev().unwrap_err(); + iter.next().unwrap_err(); + recover_safe(|| { + iter.key(); + }) + .unwrap_err(); + recover_safe(|| { + iter.value(); + }) + .unwrap_err(); assert_eq!(iter.seek_to_first()?, false); assert_eq!(iter.seek_to_last()?, false); diff --git a/components/engine_traits_tests/src/write_batch.rs b/components/engine_traits_tests/src/write_batch.rs index dc966cf03b6..e99245adb4b 100644 --- a/components/engine_traits_tests/src/write_batch.rs +++ b/components/engine_traits_tests/src/write_batch.rs @@ -717,12 +717,10 @@ fn write_batch_delete_range_backward_range() { let mut wb = db.engine.write_batch(); wb.delete_range(b"c", b"a").unwrap(); - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); @@ -745,12 +743,10 @@ fn write_batch_delete_range_backward_range() { 
wb.delete_range(&256_usize.to_be_bytes(), &0_usize.to_be_bytes()) .unwrap(); - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); @@ -787,12 +783,10 @@ fn write_batch_delete_range_backward_range_partial_commit() { wb.put(b"f", b"").unwrap(); wb.delete(b"a").unwrap(); - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); @@ -835,12 +829,10 @@ fn write_batch_delete_range_backward_range_partial_commit() { wb.delete(&i.to_be_bytes()).unwrap(); } - assert!( - recover_safe(|| { - wb.write().unwrap(); - }) - .is_err() - ); + recover_safe(|| { + wb.write().unwrap(); + }) + .unwrap_err(); assert!(db.engine.get_value(b"a").unwrap().is_some()); assert!(db.engine.get_value(b"b").unwrap().is_some()); diff --git a/components/error_code/src/lib.rs b/components/error_code/src/lib.rs index 8ad7f3e1f23..0747b3fd2fb 100644 --- a/components/error_code/src/lib.rs +++ b/components/error_code/src/lib.rs @@ -43,7 +43,7 @@ pub mod storage; use std::fmt::{self, Display, Formatter}; -#[derive(PartialEq, Eq, Debug, Clone, Copy)] +#[derive(PartialEq, Debug, Clone, Copy)] pub struct ErrorCode { pub code: &'static str, pub description: &'static str, diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 0bacbdef428..36acbc65a91 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -48,14 +48,14 @@ pub use rate_limiter::{ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use strum::{EnumCount, EnumIter}; -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub enum IoOp { Read, Write, } 
#[repr(C)] -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, EnumCount, EnumIter)] +#[derive(Clone, Copy, Debug, PartialEq, Hash, EnumCount, EnumIter)] pub enum IoType { Other = 0, // Including coprocessor and storage read. @@ -129,7 +129,7 @@ impl std::ops::Sub for IoBytes { } #[repr(u32)] -#[derive(Debug, Clone, PartialEq, Eq, Copy, EnumCount)] +#[derive(Debug, Clone, PartialEq, Copy, EnumCount)] pub enum IoPriority { Low = 0, Medium = 1, @@ -189,7 +189,7 @@ impl<'de> Deserialize<'de> for IoPriority { where E: Error, { - let p = match IoPriority::from_str(&*value.trim().to_lowercase()) { + let p = match IoPriority::from_str(&value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( @@ -483,7 +483,7 @@ mod tests { // Ensure it works for non-existent file. let non_existent_file = dir_path.join("non_existent_file"); - assert!(get_file_size(&non_existent_file).is_err()); + get_file_size(&non_existent_file).unwrap_err(); } #[test] diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index da7fe5fe75c..f3ec05a4314 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -24,7 +24,7 @@ const DEFAULT_REFILL_PERIOD: Duration = Duration::from_millis(50); const DEFAULT_REFILLS_PER_SEC: usize = (1.0 / DEFAULT_REFILL_PERIOD.as_secs_f32()) as usize; const MAX_WAIT_DURATION_PER_REQUEST: Duration = Duration::from_millis(500); -#[derive(Debug, Clone, PartialEq, Eq, Copy)] +#[derive(Debug, Clone, PartialEq, Copy)] pub enum IoRateLimitMode { WriteOnly, ReadOnly, @@ -92,7 +92,7 @@ impl<'de> Deserialize<'de> for IoRateLimitMode { where E: Error, { - let p = match IoRateLimitMode::from_str(&*value.trim().to_lowercase()) { + let p = match IoRateLimitMode::from_str(&value.trim().to_lowercase()) { Ok(p) => p, _ => { return Err(E::invalid_value( diff --git a/components/keys/src/lib.rs b/components/keys/src/lib.rs index ecb2657de00..f62ffc6f8ab 100644 --- 
a/components/keys/src/lib.rs +++ b/components/keys/src/lib.rs @@ -415,17 +415,17 @@ mod tests { let state_key = raft_state_key(1); // invalid length - assert!(decode_raft_log_key(&state_key).is_err()); + decode_raft_log_key(&state_key).unwrap_err(); let mut state_key = state_key.to_vec(); state_key.write_u64::(2).unwrap(); // invalid suffix - assert!(decode_raft_log_key(&state_key).is_err()); + decode_raft_log_key(&state_key).unwrap_err(); let mut region_state_key = region_state_key(1).to_vec(); region_state_key.write_u64::(2).unwrap(); // invalid prefix - assert!(decode_raft_log_key(®ion_state_key).is_err()); + decode_raft_log_key(®ion_state_key).unwrap_err(); } #[test] @@ -441,8 +441,8 @@ mod tests { let mut region = Region::default(); // uninitialised region should not be passed in `enc_start_key` and // `enc_end_key`. - assert!(::panic_hook::recover_safe(|| enc_start_key(®ion)).is_err()); - assert!(::panic_hook::recover_safe(|| enc_end_key(®ion)).is_err()); + ::panic_hook::recover_safe(|| enc_start_key(®ion)).unwrap_err(); + ::panic_hook::recover_safe(|| enc_end_key(®ion)).unwrap_err(); region.mut_peers().push(Peer::default()); assert_eq!(enc_start_key(®ion), vec![DATA_PREFIX]); diff --git a/components/keys/src/rewrite.rs b/components/keys/src/rewrite.rs index 03b6ea27c4f..51f588e9732 100644 --- a/components/keys/src/rewrite.rs +++ b/components/keys/src/rewrite.rs @@ -8,7 +8,7 @@ use std::ops::Bound::{self, *}; /// An error indicating the key cannot be rewritten because it does not start /// with the given prefix. -#[derive(PartialEq, Eq, Debug, Clone)] +#[derive(PartialEq, Debug, Clone)] pub struct WrongPrefix; /// Rewrites the prefix of a byte array. 
diff --git a/components/online_config/online_config_derive/src/lib.rs b/components/online_config/online_config_derive/src/lib.rs index ed37aeac40c..5518aa0e5e6 100644 --- a/components/online_config/online_config_derive/src/lib.rs +++ b/components/online_config/online_config_derive/src/lib.rs @@ -123,11 +123,7 @@ fn encoder( } }; // Only reserve attributes that related to `serde` - field.attrs = field - .attrs - .into_iter() - .filter(|f| is_attr("serde", f)) - .collect(); + field.attrs.retain(|f| is_attr("serde", f)); serialize_fields.push(field); } // Only reserve attributes that related to `serde` diff --git a/components/online_config/src/lib.rs b/components/online_config/src/lib.rs index 2388bf3b3ac..18d9cc0fd71 100644 --- a/components/online_config/src/lib.rs +++ b/components/online_config/src/lib.rs @@ -296,7 +296,7 @@ mod tests { ); } - #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)] + #[derive(Clone, Copy, Debug, PartialEq, Serialize)] pub enum TestEnum { First, Second, @@ -364,6 +364,6 @@ mod tests { let mut diff = HashMap::new(); diff.insert("e".to_owned(), ConfigValue::String("invalid".into())); - assert!(config.update(diff).is_err()); + config.update(diff).unwrap_err(); } } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 87187b30e75..37b34bcb666 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -132,7 +132,7 @@ where } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] pub enum PeerTick { Raft = 0, diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index e8a5b1ac1c9..fb6defbc375 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -1167,7 +1167,7 @@ mod tests { for index in indices { 
for order in orders { - test_split_impl(*index, *order); + test_split_impl(*index, order.as_slice()); } } } diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index bc9fd855038..44318a27b60 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -1045,7 +1045,7 @@ pub mod tests { #[test] fn test_get_approximate_split_keys() { for cf in LARGE_CFS { - test_get_approximate_split_keys_impl(*cf); + test_get_approximate_split_keys_impl(cf); } } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index f26022efe64..e5906719109 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,7 +6,6 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] -#![feature(let_chains)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index ad89d5e7e70..89b5cfc1ac9 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -1011,12 +1011,14 @@ mod tests { ); cfg.raft_heartbeat_ticks = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_election_timeout_ticks = 10; cfg.raft_heartbeat_ticks = 10; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_min_election_timeout_ticks = 5; @@ -1029,15 +1031,18 @@ mod tests { cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg.raft_heartbeat_ticks = 11; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); 
cfg.raft_log_gc_threshold = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = Some(ReadableSize(0)); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = None; @@ -1049,12 +1054,14 @@ mod tests { cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.raft_store_max_leader_lease = ReadableDuration::secs(20); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = Some(100); cfg.merge_max_log_gap = 110; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = None; @@ -1064,51 +1071,62 @@ mod tests { cfg = Config::new(); cfg.merge_check_tick_interval = ReadableDuration::secs(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.peer_stale_state_check_interval = ReadableDuration::secs(5); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(2); cfg.abnormal_leader_missing_duration = ReadableDuration::minutes(1); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.abnormal_leader_missing_duration = 
ReadableDuration::minutes(2); cfg.max_leader_missing_duration = ReadableDuration::minutes(1); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.local_read_batch_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.pool_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.pool_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(10241); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(10241); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.hibernate_regions = true; @@ -1132,17 +1150,20 @@ mod tests { cfg = Config::new(); cfg.future_poll_size = 0; - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.snap_generator_pool_size = 0; - 
assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 11; cfg.raft_store_max_leader_lease = ReadableDuration::secs(11); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg = Config::new(); cfg.hibernate_regions = true; @@ -1153,17 +1174,21 @@ mod tests { cfg = Config::new(); cfg.raft_max_size_per_msg = ReadableSize(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(64); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(3); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); cfg.raft_entry_max_size = ReadableSize(0); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_entry_max_size = ReadableSize::mb(3073); - assert!(cfg.validate(split_size, false, ReadableSize(0)).is_err()); + cfg.validate(split_size, false, ReadableSize(0)) + .unwrap_err(); cfg.raft_entry_max_size = ReadableSize::gb(3); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 33b504127f8..e5c617ec91b 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1155,7 +1155,7 @@ pub mod tests { assert_eq!(e, cache.entry(e.get_index()).unwrap()); } let res = panic_hook::recover_safe(|| cache.entry(7)); - assert!(res.is_err()); + res.unwrap_err(); } #[test] diff 
--git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 938ea526894..e2db05db143 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3794,7 +3794,7 @@ where } => self.handle_change(apply_ctx, cmd, region_epoch, cb), #[cfg(any(test, feature = "testexport"))] Msg::Validate(_, f) => { - let delegate: *const u8 = unsafe { mem::transmute(&self.delegate) }; + let delegate = &self.delegate as *const ApplyDelegate as *const u8; f(delegate) } } @@ -4705,7 +4705,7 @@ mod tests { // unregistered region should be ignored and notify failed. let resp = resp_rx.recv_timeout(Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_region_not_found()); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); let (cc_tx, cc_rx) = mpsc::channel(); let pops = vec![ @@ -4808,7 +4808,7 @@ mod tests { "{:?}", resp ); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); system.shutdown(); } @@ -5982,29 +5982,29 @@ mod tests { let mut region = Region::default(); // Check uuid and cf name - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.set_uuid(Uuid::new_v4().as_bytes().to_vec()); sst.set_cf_name(CF_DEFAULT.to_owned()); check_sst_for_ingestion(&sst, ®ion).unwrap(); sst.set_cf_name("test".to_owned()); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.set_cf_name(CF_WRITE.to_owned()); check_sst_for_ingestion(&sst, ®ion).unwrap(); // Check region id region.set_id(1); sst.set_region_id(2); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.set_region_id(1); check_sst_for_ingestion(&sst, ®ion).unwrap(); // Check region epoch region.mut_region_epoch().set_conf_ver(1); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, 
®ion).unwrap_err(); sst.mut_region_epoch().set_conf_ver(1); check_sst_for_ingestion(&sst, ®ion).unwrap(); region.mut_region_epoch().set_version(1); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.mut_region_epoch().set_version(1); check_sst_for_ingestion(&sst, ®ion).unwrap(); @@ -6013,9 +6013,9 @@ mod tests { region.set_end_key(vec![8]); sst.mut_range().set_start(vec![1]); sst.mut_range().set_end(vec![8]); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.mut_range().set_start(vec![2]); - assert!(check_sst_for_ingestion(&sst, ®ion).is_err()); + check_sst_for_ingestion(&sst, ®ion).unwrap_err(); sst.mut_range().set_end(vec![7]); check_sst_for_ingestion(&sst, ®ion).unwrap(); } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1d02b723cf6..3ae6b74a13c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3742,7 +3742,7 @@ where } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - new_peer.peer.init_replication_mode(&mut *replication_state); + new_peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); let meta_peer = new_peer.peer.peer.clone(); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index b058d0bb35e..9e126d4d141 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1138,7 +1138,7 @@ impl RaftPollerBuilder { self.engines.clone(), region, )); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { info!("region is merging"; "region" => ?region, "store_id" => store_id); merging_count += 1; @@ -1178,7 +1178,7 @@ impl RaftPollerBuilder { 
self.engines.clone(), ®ion, )?; - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); meta.region_ranges .insert(enc_end_key(®ion), region.get_id()); @@ -2168,7 +2168,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // Now all checking passed let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); peer.peer.local_first_replicate = is_local_first; @@ -2790,7 +2790,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } }; let mut replication_state = self.ctx.global_replication_state.lock().unwrap(); - peer.peer.init_replication_mode(&mut *replication_state); + peer.peer.init_replication_mode(&mut replication_state); drop(replication_state); peer.peer.activate(self.ctx); diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 43126d1def5..ce812d5ef24 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -220,7 +220,7 @@ where } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] pub enum PeerTick { Raft = 0, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9a8fd7d0605..7c57eeb9ae4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -113,7 +113,7 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; /// The returned states of the peer after checking whether it is stale -#[derive(Debug, PartialEq, Eq)] +#[derive(Debug, PartialEq)] pub enum StaleState { Valid, ToValidate, @@ -5808,7 +5808,7 @@ mod tests { applied_to_index_term: true, 
lease_state: LeaseState::Valid, }; - assert!(inspector.inspect(&req).is_err()); + inspector.inspect(&req).unwrap_err(); } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 83363d65ac8..5ad6395dd33 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1818,7 +1818,7 @@ pub mod tests { Option::>::None, ); worker.start(runner); - assert!(s1.snapshot(0, 0).is_err()); + s1.snapshot(0, 0).unwrap_err(); let gen_task = s1.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s1.engines, &sched).unwrap(); @@ -1909,7 +1909,7 @@ pub mod tests { JOB_STATUS_FAILED, )))); let res = panic_hook::recover_safe(|| s.cancel_applying_snap()); - assert!(res.is_err()); + res.unwrap_err(); } #[test] @@ -1959,7 +1959,7 @@ pub mod tests { JOB_STATUS_FAILED, )))); let res = panic_hook::recover_safe(|| s.check_applying_snap()); - assert!(res.is_err()); + res.unwrap_err(); } #[test] diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index aa24b4bc3c7..d9261b9fde3 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -82,7 +82,7 @@ where } pub fn cmds(&self) -> &[(RaftCmdRequest, Callback, Option)] { - &*self.cmds + &self.cmds } pub fn take_cmds(&mut self) -> MustConsumeVec<(RaftCmdRequest, Callback, Option)> { diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 64bde3cf88b..86d89fad051 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -492,7 +492,7 @@ mod tests { assert!(v0.is_none()); let v4 = snap.get_value(b"key5"); - assert!(v4.is_err()); + v4.unwrap_err(); } #[allow(clippy::type_complexity)] diff --git a/components/raftstore/src/store/replication_mode.rs 
b/components/raftstore/src/store/replication_mode.rs index 1f163ccfb9f..5cc0364b79a 100644 --- a/components/raftstore/src/store/replication_mode.rs +++ b/components/raftstore/src/store/replication_mode.rs @@ -192,7 +192,6 @@ impl GlobalReplicationState { #[cfg(test)] mod tests { - use std::panic; use kvproto::{ metapb, @@ -334,6 +333,6 @@ mod tests { .group .register_store(1, vec![label1.clone(), label3.clone()]) }); - assert!(res.is_err(), "existing group id can't be changed."); + res.unwrap_err(); } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index e7b024c38eb..9a279029fd5 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -445,7 +445,7 @@ pub struct Snapshot { mgr: SnapManagerCore, } -#[derive(PartialEq, Eq, Clone, Copy)] +#[derive(PartialEq, Clone, Copy)] enum CheckPolicy { ErrAllowed, ErrNotAllowed, @@ -2516,7 +2516,7 @@ pub mod tests { corrupt_snapshot_size_in(dir.path()); - assert!(Snapshot::new_for_sending(dir.path(), &key, &mgr_core,).is_err()); + Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); @@ -2563,11 +2563,11 @@ pub mod tests { write_batch_size: TEST_WRITE_BATCH_SIZE, coprocessor_host: CoprocessorHost::::default(), }; - assert!(s5.apply(options).is_err()); + s5.apply(options).unwrap_err(); corrupt_snapshot_size_in(dst_dir.path()); - assert!(Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta,).is_err()); - assert!(Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).is_err()); + Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_meta).unwrap_err(); + Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); } #[test] @@ -2607,7 +2607,7 @@ pub mod tests { assert_eq!(1, corrupt_snapshot_meta_file(dir.path())); - assert!(Snapshot::new_for_sending(dir.path(), &key, 
&mgr_core,).is_err()); + Snapshot::new_for_sending(dir.path(), &key, &mgr_core).unwrap_err(); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); @@ -2637,11 +2637,9 @@ pub mod tests { assert_eq!(1, corrupt_snapshot_meta_file(dst_dir.path())); - assert!(Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core,).is_err()); - assert!( - Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_data.take_meta(),) - .is_err() - ); + Snapshot::new_for_applying(dst_dir.path(), &key, &mgr_core).unwrap_err(); + Snapshot::new_for_receiving(dst_dir.path(), &key, &mgr_core, snap_data.take_meta()) + .unwrap_err(); } #[test] @@ -2663,7 +2661,7 @@ pub mod tests { let path2 = temp_path2.to_str().unwrap().to_owned(); File::create(temp_path2).unwrap(); mgr = SnapManager::new(path2); - assert!(mgr.init().is_err()); + mgr.init().unwrap_err(); } #[test] diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 078d3114060..1270ae104c9 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -408,7 +408,7 @@ mod tests { // Not exceeding the region limit, but exceeding the global limit GLOBAL_MEM_SIZE.set(101 << 20); let res = locks.insert(vec![(Key::from_raw(b"k2"), lock(b"abc"))]); - assert!(res.is_err()); + res.unwrap_err(); assert!(locks.get(&Key::from_raw(b"k2")).is_none()); } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2bda7f4794f..1b707a42921 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -449,7 +449,7 @@ pub struct Lease { remote: Option, } -#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[derive(Clone, Copy, PartialEq, Debug)] pub enum LeaseState { /// The lease is suspicious, may be invalid. 
Suspect, @@ -794,7 +794,7 @@ impl< } } -#[derive(PartialEq, Eq, Debug)] +#[derive(PartialEq, Debug)] pub enum ConfChangeKind { // Only contains one configuration change Simple, diff --git a/components/raftstore/src/store/worker/cleanup_snapshot.rs b/components/raftstore/src/store/worker/cleanup_snapshot.rs index 07d2ac001d4..c84d6ddb4d3 100644 --- a/components/raftstore/src/store/worker/cleanup_snapshot.rs +++ b/components/raftstore/src/store/worker/cleanup_snapshot.rs @@ -25,7 +25,7 @@ pub enum Task { impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match &*self { + match self { Task::GcSnapshot => write!(f, "Gc Snapshot"), Task::DeleteSnapshotFiles { key, .. } => write!(f, "Delete Snapshot Files for {}", key), } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 9e5e54c185e..6a6aa53103d 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -1955,12 +1955,12 @@ where unix_secs_now.into_inner() - last_report_ts.into_inner(); // Keep consistent with the calculation of cpu_usages in a store heartbeat. // See components/tikv_util/src/metrics/threads_linux.rs for more details. 
- (interval_second > 0) - .then(|| { - ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) - as u64 - }) - .unwrap_or(0) + if interval_second > 0 { + ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) + as u64 + } else { + 0 + } }; ( read_bytes_delta, diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index 81fa843ace0..d1c531070ac 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -105,7 +105,7 @@ where iter.key().to_vec(), pos, iter.value().len(), - *cf, + cf, )); } iters.push((*cf, iter)); diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 0f15bcc4805..addedc3d653 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -251,7 +251,7 @@ impl Samples { if best_index >= 0 { return self.0[best_index as usize].key.clone(); } - return vec![]; + vec![] } } diff --git a/components/resource_metering/src/config.rs b/components/resource_metering/src/config.rs index 69d7c78cb2f..090768a9493 100644 --- a/components/resource_metering/src/config.rs +++ b/components/resource_metering/src/config.rs @@ -147,20 +147,20 @@ mod tests { max_resource_groups: 2000, precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: usize::MAX, // invalid precision: ReadableDuration::secs(1), }; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); let cfg = Config { receiver_address: "127.0.0.1:6666".to_string(), report_receiver_interval: ReadableDuration::minutes(1), max_resource_groups: 2000, precision: ReadableDuration::days(999), // invalid }; - 
assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } } diff --git a/components/resource_metering/src/lib.rs b/components/resource_metering/src/lib.rs index bd64d7202ae..ba8e2174e19 100644 --- a/components/resource_metering/src/lib.rs +++ b/components/resource_metering/src/lib.rs @@ -143,15 +143,12 @@ impl Drop for Guard { return; } let mut records = ls.summary_records.lock().unwrap(); - match records.get(&tag) { - Some(record) => { - record.merge(&cur_record); - } - None => { - // See MAX_SUMMARY_RECORDS_LEN. - if records.len() < MAX_SUMMARY_RECORDS_LEN { - records.insert(tag, cur_record); - } + if let Some(record) = records.get(&tag) { + record.merge(&cur_record); + } else { + // See MAX_SUMMARY_RECORDS_LEN. + if records.len() < MAX_SUMMARY_RECORDS_LEN { + records.insert(tag, cur_record); } } }) diff --git a/components/resource_metering/src/model.rs b/components/resource_metering/src/model.rs index 1359e6c3a45..6f7118ef9e1 100644 --- a/components/resource_metering/src/model.rs +++ b/components/resource_metering/src/model.rs @@ -20,7 +20,7 @@ thread_local! { } /// Raw resource statistics record. -#[derive(Debug, Default, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Default, Copy, Clone, PartialEq)] pub struct RawRecord { pub cpu_time: u32, // ms pub read_keys: u32, @@ -48,7 +48,7 @@ impl RawRecord { /// [Recorder]: crate::recorder::Recorder /// [Reporter]: crate::reporter::Reporter /// [Collector]: crate::collector::Collector -#[derive(Debug, Eq, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct RawRecords { pub begin_unix_time_secs: u64, pub duration: Duration, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 71a58a33dc3..ce55e7beb41 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -823,7 +823,7 @@ mod tests { check_file_not_exists(&path.clone, key_manager.as_deref()); // Cannot create the same file again. 
- assert!(dir.create(&meta, key_manager.clone()).is_err()); + dir.create(&meta, key_manager.clone()).unwrap_err(); } // Test ImportDir::delete() @@ -912,12 +912,10 @@ mod tests { let mut f = ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).unwrap(); // Cannot create the same file again. - assert!( - ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).is_err() - ); + ImportFile::create(meta.clone(), path.clone(), data_key_manager.clone()).unwrap_err(); f.append(data).unwrap(); // Invalid crc32 and length. - assert!(f.finish().is_err()); + f.finish().unwrap_err(); check_file_exists(&path.temp, data_key_manager.as_deref()); check_file_not_exists(&path.save, data_key_manager.as_deref()); } @@ -1595,7 +1593,7 @@ mod tests { meta.set_length(0); // disable validation. meta.set_crc32(0); let meta_info = importer.validate(&meta).unwrap(); - let _ = importer.ingest(&[meta_info.clone()], &db).unwrap(); + importer.ingest(&[meta_info.clone()], &db).unwrap(); // key1 = "zt9102_r01", value1 = "abc", len = 13 // key2 = "zt9102_r04", value2 = "xyz", len = 13 // key3 = "zt9102_r07", value3 = "pqrst", len = 15 diff --git a/components/sst_importer/src/sst_writer.rs b/components/sst_importer/src/sst_writer.rs index 60fc1b9e2ab..210f17fc168 100644 --- a/components/sst_importer/src/sst_writer.rs +++ b/components/sst_importer/src/sst_writer.rs @@ -434,7 +434,7 @@ mod tests { let (mut w, _handle) = new_writer(SstImporter::new_raw_writer, ApiVersion::V1); let mut batch = RawWriteBatch::default(); batch.set_ttl(10); - assert!(w.write(batch).is_err()); + w.write(batch).unwrap_err(); } #[test] @@ -462,7 +462,7 @@ mod tests { let pairs = vec![pair]; batch.set_pairs(pairs.into()); - assert!(w.write(batch).is_err()); + w.write(batch).unwrap_err(); } #[test] @@ -478,7 +478,7 @@ mod tests { let pairs = vec![pair]; batch.set_pairs(pairs.into()); - assert!(w.write(batch.clone()).is_err()); + w.write(batch.clone()).unwrap_err(); // put a valid key let mut 
pair = Pair::default(); diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index 45a69896296..33241862e07 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -811,7 +811,7 @@ pub struct TestPdClient { pub gc_safepoints: RwLock>, } -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub struct GcSafePoint { pub serivce: String, pub ttl: Duration, diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index e8fba33f65f..0aa778d01b0 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -314,9 +314,9 @@ impl FilterFactory for PartitionFilterFactory { node_ids: self.s2.clone(), })]; } - return vec![Box::new(PartitionFilter { + vec![Box::new(PartitionFilter { node_ids: self.s1.clone(), - })]; + })] } } diff --git a/components/test_sst_importer/src/lib.rs b/components/test_sst_importer/src/lib.rs index 9397a6bb35b..2f8c195a6bf 100644 --- a/components/test_sst_importer/src/lib.rs +++ b/components/test_sst_importer/src/lib.rs @@ -38,7 +38,7 @@ where if let Some(ref env) = env { opt.set_env(env.clone()); } - apply(*cf, &mut opt); + apply(cf, &mut opt); opt.add_table_properties_collector_factory( "tikv.test_properties", TestPropertiesCollectorFactory::new(*cf), diff --git a/components/test_storage/src/assert_storage.rs b/components/test_storage/src/assert_storage.rs index 7f057971785..5cb6e43d8cb 100644 --- a/components/test_storage/src/assert_storage.rs +++ b/components/test_storage/src/assert_storage.rs @@ -240,7 +240,9 @@ impl AssertionStorage { pub fn get_err(&self, key: &[u8], ts: impl Into) { let key = Key::from_raw(key); - assert!(self.store.get(self.ctx.clone(), &key, ts.into()).is_err()); + self.store + .get(self.ctx.clone(), &key, ts.into()) + .unwrap_err(); } pub fn get_ok(&self, key: &[u8], ts: impl Into, expect: &[u8]) { @@ 
-271,11 +273,9 @@ impl AssertionStorage { pub fn batch_get_err(&self, keys: &[&[u8]], ts: impl Into) { let keys: Vec = keys.iter().map(|x| Key::from_raw(x)).collect(); - assert!( - self.store - .batch_get(self.ctx.clone(), &keys, ts.into()) - .is_err() - ); + self.store + .batch_get(self.ctx.clone(), &keys, ts.into()) + .unwrap_err(); } pub fn batch_get_command_ok(&self, keys: &[&[u8]], ts: u64, expect: Vec<&[u8]>) { @@ -293,11 +293,9 @@ impl AssertionStorage { } pub fn batch_get_command_err(&self, keys: &[&[u8]], ts: u64) { - assert!( - self.store - .batch_get_command(self.ctx.clone(), keys, ts) - .is_err() - ); + self.store + .batch_get_command(self.ctx.clone(), keys, ts) + .unwrap_err(); } fn expect_not_leader_or_stale_command(&self, err: storage::Error) { @@ -332,7 +330,6 @@ impl AssertionStorage { ) where T: std::fmt::Debug, { - assert!(resp.is_err()); let err = resp.unwrap_err(); match err { StorageError(box StorageErrorInner::Txn(TxnError( @@ -384,16 +381,14 @@ impl AssertionStorage { _commit_ts: impl Into, ) { let start_ts = start_ts.into(); - assert!( - self.store - .prewrite( - self.ctx.clone(), - vec![Mutation::make_put(Key::from_raw(key), value.to_vec())], - key.to_vec(), - start_ts, - ) - .is_err() - ); + self.store + .prewrite( + self.ctx.clone(), + vec![Mutation::make_put(Key::from_raw(key), value.to_vec())], + key.to_vec(), + start_ts, + ) + .unwrap_err(); } pub fn delete_ok( @@ -683,16 +678,14 @@ impl AssertionStorage { start_ts: impl Into, current_ts: impl Into, ) { - assert!( - self.store - .cleanup( - self.ctx.clone(), - Key::from_raw(key), - start_ts.into(), - current_ts.into() - ) - .is_err() - ); + self.store + .cleanup( + self.ctx.clone(), + Key::from_raw(key), + start_ts.into(), + current_ts.into(), + ) + .unwrap_err(); } pub fn rollback_ok(&self, keys: Vec<&[u8]>, start_ts: impl Into) { @@ -704,11 +697,9 @@ impl AssertionStorage { pub fn rollback_err(&self, keys: Vec<&[u8]>, start_ts: impl Into) { let keys: Vec = keys.iter().map(|x| 
Key::from_raw(x)).collect(); - assert!( - self.store - .rollback(self.ctx.clone(), keys, start_ts.into()) - .is_err() - ); + self.store + .rollback(self.ctx.clone(), keys, start_ts.into()) + .unwrap_err(); } pub fn scan_locks_ok( @@ -890,11 +881,9 @@ impl AssertionStorage { } pub fn raw_batch_get_command_err(&self, cf: String, keys: Vec>) { - assert!( - self.store - .raw_batch_get_command(self.ctx.clone(), cf, keys) - .is_err() - ); + self.store + .raw_batch_get_command(self.ctx.clone(), cf, keys) + .unwrap_err(); } pub fn raw_put_ok(&self, cf: String, key: Vec, value: Vec) { diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index 31ff6acc8aa..f4046c35440 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -937,7 +937,7 @@ mod tests { min_state.push_result(&mut ctx, &mut aggr_result).unwrap(); } - assert_eq!(aggr_result[0].to_int_vec(), &(*expected_res)); + assert_eq!(aggr_result[0].to_int_vec(), expected_res); } #[test] diff --git a/components/tidb_query_aggr/src/lib.rs b/components/tidb_query_aggr/src/lib.rs index 1eda14a0697..c6ddfb96d2f 100644 --- a/components/tidb_query_aggr/src/lib.rs +++ b/components/tidb_query_aggr/src/lib.rs @@ -438,7 +438,7 @@ mod tests { Real::new(1.0).ok().as_ref() ); }); - assert!(result.is_err()); + result.unwrap_err(); let result = panic_hook::recover_safe(|| { let mut s = s.clone(); @@ -448,7 +448,7 @@ mod tests { Some(&[1u8] as BytesRef<'_>) ); }); - assert!(result.is_err()); + result.unwrap_err(); // Push result to Real VectorValue should success. 
let mut target = vec![VectorValue::with_capacity(0, EvalType::Real)]; @@ -479,13 +479,13 @@ mod tests { let mut target: Vec = Vec::new(); let _ = (&mut s as &mut dyn AggrFunctionState).push_result(&mut ctx, &mut target[..]); }); - assert!(result.is_err()); + result.unwrap_err(); let result = panic_hook::recover_safe(|| { let mut s = s.clone(); let mut target: Vec = vec![VectorValue::with_capacity(0, EvalType::Int)]; let _ = (&mut s as &mut dyn AggrFunctionState).push_result(&mut ctx, &mut target[..]); }); - assert!(result.is_err()); + result.unwrap_err(); } } diff --git a/components/tidb_query_common/src/execute_stats.rs b/components/tidb_query_common/src/execute_stats.rs index b2740212df0..55d31dfb8f5 100644 --- a/components/tidb_query_common/src/execute_stats.rs +++ b/components/tidb_query_common/src/execute_stats.rs @@ -4,7 +4,7 @@ use derive_more::{Add, AddAssign}; /// Execution summaries to support `EXPLAIN ANALYZE` statements. We don't use /// `ExecutorExecutionSummary` directly since it is less efficient. -#[derive(Debug, Default, Copy, Clone, Add, AddAssign, PartialEq, Eq)] +#[derive(Debug, Default, Copy, Clone, Add, AddAssign, PartialEq)] pub struct ExecSummary { /// Total time cost in this executor. pub time_processed_ns: usize, diff --git a/components/tidb_query_common/src/storage/range.rs b/components/tidb_query_common/src/storage/range.rs index b4075fb3b60..b826f55fe46 100644 --- a/components/tidb_query_common/src/storage/range.rs +++ b/components/tidb_query_common/src/storage/range.rs @@ -4,7 +4,7 @@ use kvproto::coprocessor::KeyRange; // TODO: Remove this module after switching to DAG v2. 
-#[derive(PartialEq, Eq, Clone)] +#[derive(PartialEq, Clone)] pub enum Range { Point(PointRange), Interval(IntervalRange), @@ -41,7 +41,7 @@ impl From for Range { } } -#[derive(Default, PartialEq, Eq, Clone)] +#[derive(Default, PartialEq, Clone)] pub struct IntervalRange { pub lower_inclusive: Vec, pub upper_exclusive: Vec, @@ -87,7 +87,7 @@ impl<'a, 'b> From<(&'a str, &'b str)> for IntervalRange { } } -#[derive(Default, PartialEq, Eq, Clone)] +#[derive(Default, PartialEq, Clone)] pub struct PointRange(pub Vec); impl std::fmt::Debug for PointRange { diff --git a/components/tidb_query_common/src/storage/ranges_iter.rs b/components/tidb_query_common/src/storage/ranges_iter.rs index 6f99249336b..b872d8c5bc5 100644 --- a/components/tidb_query_common/src/storage/ranges_iter.rs +++ b/components/tidb_query_common/src/storage/ranges_iter.rs @@ -2,7 +2,7 @@ use super::range::Range; -#[derive(PartialEq, Eq, Clone, Debug)] +#[derive(PartialEq, Clone, Debug)] pub enum IterStatus { /// All ranges are consumed. 
Drained, diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 41f0794950d..67620510ef8 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -1589,7 +1589,7 @@ mod tests { // SHOULD_CLIP_TO_ZERO let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::IN_INSERT_STMT))); let r = (-12345_i64).to_uint(&mut ctx, FieldTypeTp::LongLong); - assert!(r.is_err()); + r.unwrap_err(); // SHOULD_CLIP_TO_ZERO | OVERFLOW_AS_WARNING let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag( @@ -1928,11 +1928,11 @@ mod tests { // test overflow let mut ctx = EvalContext::default(); let val: Result = f64::INFINITY.to_string().as_bytes().convert(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); let mut ctx = EvalContext::default(); let val: Result = f64::NEG_INFINITY.to_string().as_bytes().convert(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); // TRUNCATE_AS_WARNING let mut ctx = EvalContext::new(Arc::new(EvalConfig::from_flag(Flag::TRUNCATE_AS_WARNING))); diff --git a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs index 4bad0fcc129..c4f5abbc122 100644 --- a/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs +++ b/components/tidb_query_datatype/src/codec/data_type/chunked_vec_bytes.rs @@ -177,7 +177,7 @@ impl BytesWriter { } } -impl<'a> PartialBytesWriter { +impl PartialBytesWriter { pub fn partial_write(&mut self, data: BytesRef<'_>) { self.chunked_vec.data.extend_from_slice(data); } diff --git a/components/tidb_query_datatype/src/codec/data_type/mod.rs b/components/tidb_query_datatype/src/codec/data_type/mod.rs index 930070e87a2..8ca36790824 100644 --- a/components/tidb_query_datatype/src/codec/data_type/mod.rs +++ b/components/tidb_query_datatype/src/codec/data_type/mod.rs @@ -410,7 +410,7 @@ 
impl<'a, T: Evaluable + EvaluableRet> EvaluableRef<'a> for &'a T { } } -impl<'a, A: UnsafeRefInto, B> UnsafeRefInto> for Option { +impl, B> UnsafeRefInto> for Option { unsafe fn unsafe_into(self) -> Option { self.map(|x| x.unsafe_into()) } @@ -698,7 +698,7 @@ mod tests { .as_bytes() .to_vec() .as_mysql_bool(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); let mut ctx = EvalContext::default(); let val: Result = f64::NEG_INFINITY @@ -706,7 +706,7 @@ mod tests { .as_bytes() .to_vec() .as_mysql_bool(&mut ctx); - assert!(val.is_err()); + val.unwrap_err(); } #[test] diff --git a/components/tidb_query_datatype/src/codec/data_type/vector.rs b/components/tidb_query_datatype/src/codec/data_type/vector.rs index c7eecf92fa0..49a4e3a1cff 100644 --- a/components/tidb_query_datatype/src/codec/data_type/vector.rs +++ b/components/tidb_query_datatype/src/codec/data_type/vector.rs @@ -366,7 +366,7 @@ impl VectorValue { output.write_evaluable_datum_null()?; } Some(val) => { - output.write_evaluable_datum_decimal(*val)?; + output.write_evaluable_datum_decimal(val)?; } } Ok(()) diff --git a/components/tidb_query_datatype/src/codec/datum.rs b/components/tidb_query_datatype/src/codec/datum.rs index 9d791d911cd..c953e9e7269 100644 --- a/components/tidb_query_datatype/src/codec/datum.rs +++ b/components/tidb_query_datatype/src/codec/datum.rs @@ -1975,7 +1975,7 @@ mod tests { ]; for d in illegal_cases { - assert!(d.cast_as_json().is_err()); + d.cast_as_json().unwrap_err(); } } @@ -1996,7 +1996,7 @@ mod tests { let illegal_cases = vec![Datum::Max, Datum::Min]; for d in illegal_cases { - assert!(d.into_json().is_err()); + d.into_json().unwrap_err(); } } diff --git a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs index 8d1f5fdd8bb..3ab44ad40df 100644 --- a/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/binary_literal.rs @@ -277,7 
+277,7 @@ mod tests { } let lit = BinaryLiteral::from_u64(100, -2); - assert!(lit.is_err()); + lit.unwrap_err(); } #[test] @@ -463,12 +463,10 @@ mod tests { let mut ctx = EvalContext::default(); for (s, expected, err) in cs { if err { - assert!( - BinaryLiteral::from_hex_str(s) - .unwrap() - .to_uint(&mut ctx) - .is_err() - ); + BinaryLiteral::from_hex_str(s) + .unwrap() + .to_uint(&mut ctx) + .unwrap_err(); } else { let lit = BinaryLiteral::from_hex_str(s).unwrap(); assert_eq!(lit.to_uint(&mut ctx).unwrap(), expected) diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 7cd1c239bb1..135a3cd2ce7 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -3045,7 +3045,7 @@ mod tests { // error cases let cases = vec![b"1e18446744073709551620"]; for case in cases { - assert!(Decimal::from_bytes(case).is_err()); + Decimal::from_bytes(case).unwrap_err(); } } diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 3869f773020..520c985f4b5 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -81,7 +81,7 @@ fn check_nanos_part(nanos: u32) -> Result { #[inline] fn check_nanos(nanos: i64) -> Result { - if nanos < -MAX_NANOS || nanos > MAX_NANOS { + if !(-MAX_NANOS..=MAX_NANOS).contains(&nanos) { Err(Error::truncated_wrong_val("NANOS", nanos)) } else { Ok(nanos) diff --git a/components/tidb_query_datatype/src/codec/mysql/enums.rs b/components/tidb_query_datatype/src/codec/mysql/enums.rs index fecada58b1d..6c39d7f8a95 100644 --- a/components/tidb_query_datatype/src/codec/mysql/enums.rs +++ b/components/tidb_query_datatype/src/codec/mysql/enums.rs @@ -467,7 +467,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - 
dest.write_enum_to_chunk_by_datum_payload_compact_bytes(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_compact_bytes(data, &field_type) .expect("write_enum_to_chunk_by_payload_compact_bytes"); } assert_eq!(&dest, res); @@ -490,7 +490,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - dest.write_enum_to_chunk_by_datum_payload_uint(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_uint(data, &field_type) .expect("write_enum_to_chunk_by_payload_uint"); } assert_eq!(&dest, res); @@ -513,7 +513,7 @@ mod tests { 1, 0, 0, 0, 0, 0, 0, 0, 99, // 3rd ]; for data in &src { - dest.write_enum_to_chunk_by_datum_payload_var_uint(*data, &field_type) + dest.write_enum_to_chunk_by_datum_payload_var_uint(data, &field_type) .expect("write_enum_to_chunk_by_payload_var_uint"); } assert_eq!(&dest, res); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 2e5abc6f87a..8967ab71eeb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -100,7 +100,7 @@ use crate::{ const ERR_CONVERT_FAILED: &str = "Can not covert from "; /// The types of `Json` which follows -#[derive(Eq, PartialEq, FromPrimitive, Clone, Debug, Copy)] +#[derive(PartialEq, FromPrimitive, Clone, Debug, Copy)] pub enum JsonType { Object = 0x01, Array = 0x03, @@ -536,7 +536,7 @@ mod tests { ], ]; for d in cases { - assert!(json_object(d).is_err()); + json_object(d).unwrap_err(); } let cases = vec![ diff --git a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs index ecdec8adad4..8d1b5c0d453 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs @@ -87,7 +87,7 @@ impl<'a> BinaryModifier<'a> { return Ok(()); } let parent_node = 
&result[0]; - match &*last_leg { + match last_leg { PathLeg::Index(_) => { // Record the parent node value offset, as it's actually relative to `old` self.to_be_modified_ptr = parent_node.as_ptr(); @@ -167,7 +167,7 @@ impl<'a> BinaryModifier<'a> { return Ok(()); } let parent_node = &result[0]; - match &*last_leg { + match last_leg { PathLeg::Index(remove_idx) => { if parent_node.get_type() == JsonType::Array { self.to_be_modified_ptr = parent_node.as_ptr(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index a4c33944e21..b2b2f421bcb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -262,7 +262,7 @@ mod tests { let illegal_cases = vec!["[pxx,apaa]", "hpeheh", ""]; for json_str in illegal_cases { let resp = Json::from_str(json_str); - assert!(resp.is_err()); + resp.unwrap_err(); } } } diff --git a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs index 79068b38118..88c08f16b20 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/mod.rs @@ -2044,7 +2044,7 @@ mod tests { let should_fail = vec![-1111, 1, 100, 700_100, 100_000_000, 100_000_101_000_000]; for case in should_fail { - assert!(Time::parse_from_i64(&mut ctx, case, TimeType::DateTime, 0).is_err()); + Time::parse_from_i64(&mut ctx, case, TimeType::DateTime, 0).unwrap_err(); } Ok(()) } @@ -2079,9 +2079,7 @@ mod tests { ]; for case in should_fail { let case: Decimal = case.parse().unwrap(); - assert!( - Time::parse_from_decimal(&mut ctx, &case, TimeType::DateTime, 0, true).is_err() - ); + Time::parse_from_decimal(&mut ctx, &case, TimeType::DateTime, 0, true).unwrap_err(); } Ok(()) } @@ -2155,7 +2153,7 @@ mod tests { ]; for case in should_fail { - assert!(Time::parse_date(&mut ctx, 
case).is_err()); + Time::parse_date(&mut ctx, case).unwrap_err(); } Ok(()) } @@ -2287,7 +2285,7 @@ mod tests { ]; for (case, fsp) in should_fail { - assert!(Time::parse_datetime(&mut ctx, case, fsp, false).is_err()); + Time::parse_datetime(&mut ctx, case, fsp, false).unwrap_err(); } Ok(()) } @@ -2583,7 +2581,7 @@ mod tests { ..TimeEnv::default() }); - assert!(Time::parse_datetime(&mut ctx, "0000-00-00 00:00:00", 0, false).is_err()); + Time::parse_datetime(&mut ctx, "0000-00-00 00:00:00", 0, false).unwrap_err(); // Enable NO_ZERO_DATE, STRICT_MODE and IGNORE_TRUNCATE. // If zero-date is encountered, an error is returned. @@ -2616,7 +2614,7 @@ mod tests { strict_mode: true, ..TimeEnv::default() }); - assert!(Time::parse_datetime(&mut ctx, case, 0, false).is_err()); + Time::parse_datetime(&mut ctx, case, 0, false).unwrap_err(); } Ok(()) @@ -2663,7 +2661,7 @@ mod tests { strict_mode: true, ..TimeEnv::default() }); - assert!(Time::parse_datetime(&mut ctx, case, 0, false).is_err()); + Time::parse_datetime(&mut ctx, case, 0, false).unwrap_err(); } Ok(()) diff --git a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs index 7b90e96b78c..25b35a90fc0 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/tz.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/tz.rs @@ -188,7 +188,7 @@ impl TimeZone for Tz { /// `Tz::Local` -> `TzOffset::Local` /// `Tz::Offset` -> `TzOffset::Fixed` /// `Tz::Name` -> `TzOffset::NonFixed` -#[derive(Copy, Clone, PartialEq, Eq, Debug)] +#[derive(Copy, Clone, PartialEq, Debug)] pub enum TzOffset { Local(FixedOffset), Fixed(FixedOffset), diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index 463a969284d..5d0c7329d54 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -151,8 +151,8 @@ impl 
RowSlice<'_> { #[inline] pub fn origin(&self) -> &[u8] { match self { - RowSlice::Big { origin, .. } => *origin, - RowSlice::Small { origin, .. } => *origin, + RowSlice::Big { origin, .. } => origin, + RowSlice::Small { origin, .. } => origin, } } diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 052ad8bf927..7155748571f 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -795,13 +795,13 @@ mod tests { let mut range = KeyRange::default(); range.set_end(small_key.clone()); range.set_start(large_key); - assert!(check_table_ranges(&[range]).is_err()); + check_table_ranges(&[range]).unwrap_err(); // test invalid end let mut range = KeyRange::default(); range.set_start(small_key); range.set_end(b"xx".to_vec()); - assert!(check_table_ranges(&[range]).is_err()); + check_table_ranges(&[range]).unwrap_err(); } #[test] @@ -812,7 +812,7 @@ mod tests { assert_eq!(tid, decode_table_id(&k).unwrap()); let k = encode_index_seek_key(tid, 1, &k); assert_eq!(tid, decode_table_id(&k).unwrap()); - assert!(decode_table_id(b"xxx").is_err()); + decode_table_id(b"xxx").unwrap_err(); } } @@ -820,15 +820,15 @@ mod tests { fn test_check_key_type() { let record_key = encode_row_key(TABLE_ID, 1); check_key_type(record_key.as_slice(), RECORD_PREFIX_SEP).unwrap(); - assert!(check_key_type(record_key.as_slice(), INDEX_PREFIX_SEP).is_err()); + check_key_type(record_key.as_slice(), INDEX_PREFIX_SEP).unwrap_err(); let (_, index_key) = generate_index_data_for_test(TABLE_ID, INDEX_ID, 1, &Datum::I64(1), true); - assert!(check_key_type(index_key.as_slice(), RECORD_PREFIX_SEP).is_err()); + check_key_type(index_key.as_slice(), RECORD_PREFIX_SEP).unwrap_err(); check_key_type(index_key.as_slice(), INDEX_PREFIX_SEP).unwrap(); let too_small_key = vec![0]; - assert!(check_key_type(too_small_key.as_slice(), RECORD_PREFIX_SEP).is_err()); - 
assert!(check_key_type(too_small_key.as_slice(), INDEX_PREFIX_SEP).is_err()); + check_key_type(too_small_key.as_slice(), RECORD_PREFIX_SEP).unwrap_err(); + check_key_type(too_small_key.as_slice(), INDEX_PREFIX_SEP).unwrap_err(); } } diff --git a/components/tidb_query_datatype/src/def/eval_type.rs b/components/tidb_query_datatype/src/def/eval_type.rs index 9addab99e56..855802119b9 100644 --- a/components/tidb_query_datatype/src/def/eval_type.rs +++ b/components/tidb_query_datatype/src/def/eval_type.rs @@ -137,7 +137,7 @@ mod tests { if let Some(etype) = etype { assert_eq!(ftt.unwrap(), etype); } else { - assert!(ftt.is_err()); + ftt.unwrap_err(); } } } diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index ac89ad53318..417d7b0d146 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -548,7 +548,7 @@ mod tests { if let Some(c) = expected { assert_eq!(coll.unwrap(), c); } else { - assert!(coll.is_err()); + coll.unwrap_err(); } } } @@ -574,7 +574,7 @@ mod tests { if let Some(c) = expected { assert_eq!(charset.unwrap(), c); } else { - assert!(charset.is_err()); + charset.unwrap_err(); } } } diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index a3e175a3867..ffaf63a9774 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -336,7 +336,7 @@ mod tests { // ignore_truncate = false, truncate_as_warning = false let mut ctx = EvalContext::new(Arc::new(EvalConfig::new())); ctx.handle_truncate(false).unwrap(); - assert!(ctx.handle_truncate(true).is_err()); + ctx.handle_truncate(true).unwrap_err(); assert!(ctx.take_warnings().warnings.is_empty()); // ignore_truncate = false; let mut ctx = EvalContext::new(Arc::new(EvalConfig::default_for_test())); diff --git 
a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index 9f23d434a6c..8492a928a8d 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -817,7 +817,7 @@ impl IndexScanExecutorImpl { #[inline] fn split_common_handle(value: &[u8]) -> Result<(&[u8], &[u8])> { if value - .get(0) + .first() .map_or(false, |c| *c == table::INDEX_VALUE_COMMON_HANDLE_FLAG) { let handle_len = (&value[1..]).read_u16().map_err(|_| { @@ -839,7 +839,7 @@ impl IndexScanExecutorImpl { #[inline] fn split_partition_id(value: &[u8]) -> Result<(&[u8], &[u8])> { if value - .get(0) + .first() .map_or(false, |c| *c == table::INDEX_VALUE_PARTITION_ID_FLAG) { if value.len() < 9 { @@ -858,7 +858,7 @@ impl IndexScanExecutorImpl { fn split_restore_data(value: &[u8]) -> Result<(&[u8], &[u8])> { Ok( if value - .get(0) + .first() .map_or(false, |c| *c == table::INDEX_VALUE_RESTORED_DATA_FLAG) { (value, &value[value.len()..]) @@ -1106,17 +1106,13 @@ mod tests { assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 3); assert!(result.physical_columns[0].is_raw()); - assert!( - result.physical_columns[0] - .ensure_all_decoded_for_test(&mut ctx, &schema[1]) - .is_err() - ); + result.physical_columns[0] + .ensure_all_decoded_for_test(&mut ctx, &schema[1]) + .unwrap_err(); assert!(result.physical_columns[1].is_raw()); - assert!( - result.physical_columns[1] - .ensure_all_decoded_for_test(&mut ctx, &schema[0]) - .is_err() - ); + result.physical_columns[1] + .ensure_all_decoded_for_test(&mut ctx, &schema[0]) + .unwrap_err(); } { @@ -1163,17 +1159,13 @@ mod tests { &[Some(5), Some(5), Some(-5)] ); assert!(result.physical_columns[1].is_raw()); - assert!( - result.physical_columns[1] - .ensure_all_decoded_for_test(&mut ctx, &schema[3]) - .is_err() - ); + result.physical_columns[1] + .ensure_all_decoded_for_test(&mut ctx, 
&schema[3]) + .unwrap_err(); assert!(result.physical_columns[2].is_raw()); - assert!( - result.physical_columns[2] - .ensure_all_decoded_for_test(&mut ctx, &schema[1]) - .is_err() - ); + result.physical_columns[2] + .ensure_all_decoded_for_test(&mut ctx, &schema[1]) + .unwrap_err(); } { diff --git a/components/tidb_query_executors/src/limit_executor.rs b/components/tidb_query_executors/src/limit_executor.rs index 864b32ecd6b..a1917e1b17b 100644 --- a/components/tidb_query_executors/src/limit_executor.rs +++ b/components/tidb_query_executors/src/limit_executor.rs @@ -124,7 +124,7 @@ mod tests { let r = exec.next_batch(1); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(r.is_drained.is_err()); + r.is_drained.unwrap_err(); } #[test] diff --git a/components/tidb_query_executors/src/projection_executor.rs b/components/tidb_query_executors/src/projection_executor.rs index 1d6892731ff..7304ed1b1e3 100644 --- a/components/tidb_query_executors/src/projection_executor.rs +++ b/components/tidb_query_executors/src/projection_executor.rs @@ -523,6 +523,6 @@ mod tests { let r = exec.next_batch(1); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.is_err()); + r.is_drained.unwrap_err(); } } diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index b7a19da9026..d3a2d97ef4b 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -655,6 +655,6 @@ mod tests { let r = exec.next_batch(1); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.is_err()); + r.is_drained.unwrap_err(); } } diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index a4f7e957663..3ddb20b3e4d 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ 
b/components/tidb_query_executors/src/table_scan_executor.rs @@ -939,7 +939,7 @@ mod tests { .unwrap(); let mut result = executor.next_batch(10); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); assert!(result.physical_columns[0].is_decoded()); @@ -1046,7 +1046,7 @@ mod tests { .unwrap(); let mut result = executor.next_batch(10); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_decoded()); @@ -1094,7 +1094,7 @@ mod tests { .unwrap(); let mut result = executor.next_batch(10); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_decoded()); @@ -1154,7 +1154,7 @@ mod tests { ); let result = executor.next_batch(1); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); } @@ -1175,7 +1175,7 @@ mod tests { .unwrap(); let result = executor.next_batch(10); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); } @@ -1230,7 +1230,7 @@ mod tests { .unwrap(); let result = executor.next_batch(10); - assert!(result.is_drained.is_err()); + result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); } diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 2500ebc311c..01776c1ad7a 100644 --- a/components/tidb_query_expr/src/impl_arithmetic.rs +++ 
b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -1200,13 +1200,11 @@ mod tests { let overflow = vec![(f64::MAX, 0.0001)]; for (lhs, rhs) in overflow { - assert!( - RpnFnScalarEvaluator::new() - .push_param(lhs) - .push_param(rhs) - .evaluate::(ScalarFuncSig::DivideReal) - .is_err() - ) + RpnFnScalarEvaluator::new() + .push_param(lhs) + .push_param(rhs) + .evaluate::(ScalarFuncSig::DivideReal) + .unwrap_err(); } } @@ -1275,7 +1273,7 @@ mod tests { if is_ok { assert!(result.unwrap().is_none()); } else { - assert!(result.is_err()); + result.unwrap_err(); } if has_warning { diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 16f6a8f66c2..7fb118dfbec 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -1189,7 +1189,7 @@ fn cast_string_as_time( let val = String::from_utf8_lossy(val); Time::parse( ctx, - &*val, + &val, extra.ret_field_type.as_accessor().tp().try_into()?, extra.ret_field_type.get_decimal() as i8, // Enable round @@ -2425,7 +2425,7 @@ mod tests { assert!(output.is_ok(), "input: {:?}", input); assert_eq!(output.unwrap().unwrap(), exp, "input={:?}", input); } else { - assert!(output.is_err()); + output.unwrap_err(); } } } @@ -3661,7 +3661,7 @@ mod tests { input ); } else { - assert!(output.is_err()); + output.unwrap_err(); } } } diff --git a/components/tidb_query_expr/src/impl_compare.rs b/components/tidb_query_expr/src/impl_compare.rs index 858e1bcb3ec..350b36a3a99 100644 --- a/components/tidb_query_expr/src/impl_compare.rs +++ b/components/tidb_query_expr/src/impl_compare.rs @@ -545,7 +545,7 @@ mod tests { use super::*; use crate::test_util::RpnFnScalarEvaluator; - #[derive(Clone, Copy, PartialEq, Eq)] + #[derive(Clone, Copy, PartialEq)] enum TestCaseCmpOp { GT, GE, diff --git a/components/tidb_query_expr/src/impl_compare_in.rs b/components/tidb_query_expr/src/impl_compare_in.rs index 312943a276a..6de0ba33cfb 100644 --- 
a/components/tidb_query_expr/src/impl_compare_in.rs +++ b/components/tidb_query_expr/src/impl_compare_in.rs @@ -65,11 +65,11 @@ pub trait Extract: Sized { #[inline] fn type_error(eval_type: EvalType, expr_type: ExprType) -> Error { - return other_err!( + other_err!( "Unexpected ExprType {:?} and EvalType {:?}", expr_type, eval_type - ); + ) } impl Extract for Int { diff --git a/components/tidb_query_expr/src/impl_encryption.rs b/components/tidb_query_expr/src/impl_encryption.rs index 3a51f798442..9c26826c03b 100644 --- a/components/tidb_query_expr/src/impl_encryption.rs +++ b/components/tidb_query_expr/src/impl_encryption.rs @@ -452,12 +452,10 @@ mod tests { ]; for len in overflow_tests { - assert!( - RpnFnScalarEvaluator::new() - .push_param(len) - .evaluate::(ScalarFuncSig::RandomBytes) - .is_err(), - ); + RpnFnScalarEvaluator::new() + .push_param(len) + .evaluate::(ScalarFuncSig::RandomBytes) + .unwrap_err(); } // test NULL case diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 5e5595bd3ed..60f784dc604 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -584,7 +584,7 @@ mod tests { .push_params(err_args) .evaluate(ScalarFuncSig::JsonObjectSig); - assert!(output.is_err()); + output.unwrap_err(); } } @@ -948,7 +948,7 @@ mod tests { if is_success { assert_eq!(output.unwrap(), expected, "{:?}", vargs); } else { - assert!(output.is_err()); + output.unwrap_err(); } } } diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index 80484c224c4..55e86ee14d0 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -1204,7 +1204,7 @@ mod tests { let output: Result> = RpnFnScalarEvaluator::new() .push_param(Some(Real::new(x).unwrap())) .evaluate(ScalarFuncSig::Exp); - assert!(output.is_err()); + output.unwrap_err(); } } @@ -1317,12 +1317,10 @@ mod tests { 
.unwrap(); assert!((output.unwrap().into_inner() - expect).abs() < f64::EPSILON); } - assert!( - RpnFnScalarEvaluator::new() - .push_param(Some(Real::new(0.0_f64).unwrap())) - .evaluate::(ScalarFuncSig::Cot) - .is_err() - ); + RpnFnScalarEvaluator::new() + .push_param(Some(Real::new(0.0_f64).unwrap())) + .evaluate::(ScalarFuncSig::Cot) + .unwrap_err(); } #[test] @@ -1374,13 +1372,11 @@ mod tests { ]; for (lhs, rhs) in invalid_cases { - assert!( - RpnFnScalarEvaluator::new() - .push_param(lhs) - .push_param(rhs) - .evaluate::(ScalarFuncSig::Pow) - .is_err() - ); + RpnFnScalarEvaluator::new() + .push_param(lhs) + .push_param(rhs) + .evaluate::(ScalarFuncSig::Pow) + .unwrap_err(); } } diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 5ecb4e9a7dc..9081f623b8e 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -402,18 +402,16 @@ mod tests { .unwrap(); assert_eq!(output, expect_output, "{:?}", arg); } - assert!( - RpnFnScalarEvaluator::new() - .push_param_with_field_type( - Some((i64::MAX as u64 + 2) as i64), - FieldTypeBuilder::new() - .tp(FieldTypeTp::LongLong) - .flag(FieldTypeFlag::UNSIGNED) - .build() - ) - .evaluate::(ScalarFuncSig::UnaryMinusInt) - .is_err() - ); + RpnFnScalarEvaluator::new() + .push_param_with_field_type( + Some((i64::MAX as u64 + 2) as i64), + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .flag(FieldTypeFlag::UNSIGNED) + .build(), + ) + .evaluate::(ScalarFuncSig::UnaryMinusInt) + .unwrap_err(); let signed_test_cases = vec![ (None, None), @@ -429,12 +427,10 @@ mod tests { .unwrap(); assert_eq!(output, expect_output, "{:?}", arg); } - assert!( - RpnFnScalarEvaluator::new() - .push_param(i64::MIN) - .evaluate::(ScalarFuncSig::UnaryMinusInt) - .is_err() - ); + RpnFnScalarEvaluator::new() + .push_param(i64::MIN) + .evaluate::(ScalarFuncSig::UnaryMinusInt) + .unwrap_err(); } #[test] diff --git 
a/components/tidb_query_expr/src/impl_string.rs b/components/tidb_query_expr/src/impl_string.rs index 9ebba24ed43..f3b9b03c287 100644 --- a/components/tidb_query_expr/src/impl_string.rs +++ b/components/tidb_query_expr/src/impl_string.rs @@ -284,8 +284,8 @@ pub fn lpad_utf8( pad: BytesRef, writer: BytesWriter, ) -> Result { - let input = str::from_utf8(&*arg)?; - let pad = str::from_utf8(&*pad)?; + let input = str::from_utf8(arg)?; + let pad = str::from_utf8(pad)?; let input_len = input.chars().count(); let pad_len = pad.chars().count(); @@ -350,8 +350,8 @@ pub fn rpad_utf8( pad: BytesRef, writer: BytesWriter, ) -> Result { - let input = str::from_utf8(&*arg)?; - let pad = str::from_utf8(&*pad)?; + let input = str::from_utf8(arg)?; + let pad = str::from_utf8(pad)?; let input_len = input.chars().count(); let pad_len = pad.chars().count(); @@ -451,7 +451,7 @@ pub fn left_utf8(lhs: BytesRef, rhs: &Int, writer: BytesWriter) -> Result Result { - let s = str::from_utf8(&*s_utf8)?; - let newstr = str::from_utf8(&*newstr_utf8)?; + let s = str::from_utf8(s_utf8)?; + let newstr = str::from_utf8(newstr_utf8)?; let pos = *pos; let len = *len; let upos: usize = pos as usize; @@ -543,7 +543,7 @@ pub fn right_utf8(lhs: BytesRef, rhs: &Int, writer: BytesWriter) -> Result(ScalarFuncSig::CharLengthUtf8); - assert!(output.is_err()); + output.unwrap_err(); } } diff --git a/components/tidb_query_expr/src/impl_time.rs b/components/tidb_query_expr/src/impl_time.rs index 80912fd6526..0f55e21bab5 100644 --- a/components/tidb_query_expr/src/impl_time.rs +++ b/components/tidb_query_expr/src/impl_time.rs @@ -523,7 +523,7 @@ pub fn from_days(ctx: &mut EvalContext, arg: &Int) -> Result> { pub fn make_date(ctx: &mut EvalContext, year: &Int, day: &Int) -> Result> { let mut year = *year; let mut day = *day; - if day <= 0 || year < 0 || year > 9999 || day > 366 * 9999 { + if !(1..=366 * 9999).contains(&day) || !(0..=9999).contains(&year) { return Ok(None); } if year < 70 { @@ -2394,7 +2394,7 @@ mod 
tests { .build(), ) .evaluate::(ScalarFuncSig::MakeTime); - assert!(output.is_err()); + output.unwrap_err(); } } diff --git a/components/tidb_query_expr/src/types/expr_builder.rs b/components/tidb_query_expr/src/types/expr_builder.rs index 0546fe43f08..5311a2c03d9 100644 --- a/components/tidb_query_expr/src/types/expr_builder.rs +++ b/components/tidb_query_expr/src/types/expr_builder.rs @@ -826,10 +826,8 @@ mod tests { fn test_max_columns_check() { // Col offset = 0. The minimum success max_columns is 1. let node = ExprDefBuilder::column_ref(0, FieldTypeTp::LongLong).build(); - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, 0) - .is_err() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, 0) + .unwrap_err(); for i in 1..10 { RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) .unwrap(); @@ -838,14 +836,8 @@ mod tests { // Col offset = 3. The minimum success max_columns is 4. 
let node = ExprDefBuilder::column_ref(3, FieldTypeTp::LongLong).build(); for i in 0..=3 { - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper( - node.clone(), - fn_mapper, - i - ) - .is_err() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) + .unwrap_err(); } for i in 4..10 { RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) @@ -861,14 +853,8 @@ mod tests { .build(); for i in 0..=5 { - assert!( - RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper( - node.clone(), - fn_mapper, - i - ) - .is_err() - ); + RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) + .unwrap_err(); } for i in 6..10 { RpnExpressionBuilder::build_from_expr_tree_with_fn_mapper(node.clone(), fn_mapper, i) diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index 442c0f8486b..078bbf1bb80 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -43,7 +43,7 @@ impl<'a> RpnStackNodeVectorValue<'a> { pub fn as_ref(&self) -> &VectorValue { match self { RpnStackNodeVectorValue::Generated { physical_value, .. } => physical_value, - RpnStackNodeVectorValue::Ref { physical_value, .. } => *physical_value, + RpnStackNodeVectorValue::Ref { physical_value, .. } => physical_value, } } @@ -425,7 +425,7 @@ mod tests { // smaller row number let _ = exp.eval(&mut ctx, &schema, &mut c, &logical_rows, 4); }); - assert!(hooked_eval.is_err()); + hooked_eval.unwrap_err(); let mut c = columns; let exp = RpnExpressionBuilder::new_for_test() @@ -436,7 +436,7 @@ mod tests { // larger row number let _ = exp.eval(&mut ctx, &schema, &mut c, &logical_rows, 6); }); - assert!(hooked_eval.is_err()); + hooked_eval.unwrap_err(); } /// Single function call node (i.e. 
nullary function) @@ -930,7 +930,7 @@ mod tests { let hooked_eval = panic_hook::recover_safe(|| { let _ = exp.eval(&mut ctx, &[], &mut columns, &[], 3); }); - assert!(hooked_eval.is_err()); + hooked_eval.unwrap_err(); } /// Irregular RPN expression (contains unused node). Should panic. @@ -954,7 +954,7 @@ mod tests { let hooked_eval = panic_hook::recover_safe(|| { let _ = exp.eval(&mut ctx, &[], &mut columns, &[], 3); }); - assert!(hooked_eval.is_err()); + hooked_eval.unwrap_err(); } /// Eval type does not match. Should panic. @@ -976,7 +976,7 @@ mod tests { let hooked_eval = panic_hook::recover_safe(|| { let _ = exp.eval(&mut ctx, &[], &mut columns, &[], 3); }); - assert!(hooked_eval.is_err()); + hooked_eval.unwrap_err(); } /// Parse from an expression tree then evaluate. diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index b80c32e7088..757c3e2c378 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -428,6 +428,6 @@ pub mod tests { #[test] fn test_get_not_exist_cf() { let engine = BTreeEngine::new(&[]); - assert!(::panic_hook::recover_safe(|| engine.get_cf("not_exist_cf")).is_err()); + ::panic_hook::recover_safe(|| engine.get_cf("not_exist_cf")).unwrap_err(); } } diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index cfa171054c9..2d0dd77e9d3 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -647,10 +647,8 @@ mod tests { iter.seek(&Key::from_encoded_slice(b"a3"), &mut statistics) .unwrap() ); - assert!( - iter.seek(&Key::from_encoded_slice(b"a9"), &mut statistics) - .is_err() - ); + iter.seek(&Key::from_encoded_slice(b"a9"), &mut statistics) + .unwrap_err(); assert!( !iter @@ -661,10 +659,8 @@ mod tests { iter.seek_for_prev(&Key::from_encoded_slice(b"a3"), &mut statistics) .unwrap() ); - assert!( - iter.seek_for_prev(&Key::from_encoded_slice(b"a1"), &mut statistics) - .is_err() - ); + 
iter.seek_for_prev(&Key::from_encoded_slice(b"a1"), &mut statistics) + .unwrap_err(); } #[test] @@ -705,14 +701,10 @@ mod tests { .reverse_seek(&Key::from_encoded_slice(b"a3"), &mut statistics) .unwrap() ); - assert!( - iter.reverse_seek(&Key::from_encoded_slice(b"a1"), &mut statistics) - .is_err() - ); - assert!( - iter.reverse_seek(&Key::from_encoded_slice(b"a8"), &mut statistics) - .is_err() - ); + iter.reverse_seek(&Key::from_encoded_slice(b"a1"), &mut statistics) + .unwrap_err(); + iter.reverse_seek(&Key::from_encoded_slice(b"a8"), &mut statistics) + .unwrap_err(); assert!(iter.seek_to_last(&mut statistics)); let mut res = vec![]; diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index e26318d7b4e..dea3c0dc745 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -67,7 +67,7 @@ pub type Callback = Box) + Send>; pub type ExtCallback = Box; pub type Result = result::Result; -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Clone)] pub enum Modify { Delete(CfName, Key), Put(CfName, Key, Value), @@ -156,14 +156,8 @@ impl From for raft_cmdpb::Request { impl From for Modify { fn from(mut req: raft_cmdpb::Request) -> Modify { let name_to_cf = |name: &str| -> Option { - engine_traits::name_to_cf(name).or_else(|| { - for c in TEST_ENGINE_CFS { - if name == *c { - return Some(c); - } - } - None - }) + engine_traits::name_to_cf(name) + .or_else(|| TEST_ENGINE_CFS.iter().copied().find(|c| name == *c)) }; match req.get_cmd_type() { @@ -947,7 +941,7 @@ pub mod tests { }}; } - #[derive(PartialEq, Eq, Clone, Copy)] + #[derive(PartialEq, Clone, Copy)] enum SeekMode { Normal, Reverse, diff --git a/components/tikv_util/src/codec/bytes.rs b/components/tikv_util/src/codec/bytes.rs index 034e8e73375..df23090c9c7 100644 --- a/components/tikv_util/src/codec/bytes.rs +++ b/components/tikv_util/src/codec/bytes.rs @@ -448,8 +448,8 @@ mod tests { ]; for mut x in invalid_bytes { - assert!(decode_bytes(&mut 
x.as_slice(), false).is_err()); - assert!(decode_bytes_in_place(&mut x, false).is_err()); + decode_bytes(&mut x.as_slice(), false).unwrap_err(); + decode_bytes_in_place(&mut x, false).unwrap_err(); } } diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 8fa7c8492d0..7e9f22dcb01 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -59,7 +59,7 @@ const MINUTE: u64 = SECOND * TIME_MAGNITUDE_2; const HOUR: u64 = MINUTE * TIME_MAGNITUDE_2; const DAY: u64 = HOUR * TIME_MAGNITUDE_3; -#[derive(Clone, Copy, Debug, Serialize, Deserialize, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq)] #[serde(rename_all = "kebab-case")] pub enum LogFormat { Text, @@ -937,14 +937,14 @@ securityfs /sys/kernel/security securityfs rw,nosuid,nodev,noexec,relatime 0 0 // not found let f2 = get_fs_info("/tmp", &mnt_file); - assert!(f2.is_err()); + f2.unwrap_err(); } #[test] fn test_get_rotational_info() { // test device not exist let ret = get_rotational_info("/dev/invalid"); - assert!(ret.is_err()); + ret.unwrap_err(); } #[test] @@ -1823,7 +1823,7 @@ mod tests { { File::create(&path2).unwrap(); } - assert!(canonicalize_path(&path2).is_err()); + canonicalize_path(&path2).unwrap_err(); assert!(Path::new(&path2).exists()); } diff --git a/components/tikv_util/src/future.rs b/components/tikv_util/src/future.rs index 61d6f33ad4c..5f4c5b43817 100644 --- a/components/tikv_util/src/future.rs +++ b/components/tikv_util/src/future.rs @@ -147,7 +147,7 @@ impl PollAtWake { }; let waker = task::waker_ref(arc_self); - let cx = &mut Context::from_waker(&*waker); + let cx = &mut Context::from_waker(&waker); loop { match fut.as_mut().poll(cx) { // Likely pending diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index ecfeb7253fd..a75c4756b9c 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -728,7 +728,7 @@ mod tests { match foo(&mu.rl()) { 
Some(_) | None => { let res = mu.try_write(); - assert!(res.is_err()); + res.unwrap_err(); } } } diff --git a/components/tikv_util/src/logger/file_log.rs b/components/tikv_util/src/logger/file_log.rs index 5d0300ccdc5..5b575638c19 100644 --- a/components/tikv_util/src/logger/file_log.rs +++ b/components/tikv_util/src/logger/file_log.rs @@ -376,7 +376,7 @@ mod tests { // Rename failed. logger.write_all(&[0xff; 1025]).unwrap(); - assert!(logger.flush().is_err()); + logger.flush().unwrap_err(); // dropping the logger still should not panic. drop(logger); diff --git a/components/tikv_util/src/logger/mod.rs b/components/tikv_util/src/logger/mod.rs index 91ecd803b89..5ebe9468a50 100644 --- a/components/tikv_util/src/logger/mod.rs +++ b/components/tikv_util/src/logger/mod.rs @@ -781,7 +781,7 @@ mod tests { BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&*buffer).unwrap(); + let output = from_utf8(&buffer).unwrap(); assert_eq!(output.lines().count(), expect.lines().count()); let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); @@ -829,7 +829,7 @@ mod tests { BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&*buffer).unwrap(); + let output = from_utf8(&buffer).unwrap(); assert_eq!(output.lines().count(), expect.lines().count()); for (output_line, expect_line) in output.lines().zip(expect.lines()) { @@ -862,7 +862,7 @@ mod tests { let check_log = |log: &str| { BUFFER.with(|buffer| { let mut buffer = buffer.borrow_mut(); - let output = from_utf8(&*buffer).unwrap(); + let output = from_utf8(&buffer).unwrap(); // only check the log len here as some field like timestamp, location may // change. 
assert_eq!(output.len(), log.len()); @@ -1048,7 +1048,7 @@ mod tests { let re = Regex::new(r"(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); NORMAL_BUFFER.with(|buffer| { let buffer = buffer.borrow_mut(); - let output = from_utf8(&*buffer).unwrap(); + let output = from_utf8(&buffer).unwrap(); let output_segments = re.captures(output).unwrap(); assert_eq!(output_segments["msg"].to_owned(), r#"["Hello World"]"#); }); @@ -1060,7 +1060,7 @@ mod tests { "#; SLOW_BUFFER.with(|buffer| { let buffer = buffer.borrow_mut(); - let output = from_utf8(&*buffer).unwrap(); + let output = from_utf8(&buffer).unwrap(); let expect_re = Regex::new(r"(?P\[.*?\])\s?(?P\[.*\])?").unwrap(); assert_eq!(output.lines().count(), slow_expect.lines().count()); for (output, expect) in output.lines().zip(slow_expect.lines()) { diff --git a/components/tikv_util/src/metrics/mod.rs b/components/tikv_util/src/metrics/mod.rs index 4b5a9abc2f7..3a9964bd8d2 100644 --- a/components/tikv_util/src/metrics/mod.rs +++ b/components/tikv_util/src/metrics/mod.rs @@ -46,7 +46,7 @@ pub fn dump_to(w: &mut impl Write, should_simplify: bool) { let encoder = TextEncoder::new(); let metric_families = prometheus::gather(); if !should_simplify { - if let Err(e) = encoder.encode(&*metric_families, w) { + if let Err(e) = encoder.encode(&metric_families, w) { warn!("prometheus encoding error"; "err" => ?e); } return; diff --git a/components/tikv_util/src/metrics/threads_linux.rs b/components/tikv_util/src/metrics/threads_linux.rs index 608b60949e8..9f85425b0ba 100644 --- a/components/tikv_util/src/metrics/threads_linux.rs +++ b/components/tikv_util/src/metrics/threads_linux.rs @@ -706,7 +706,7 @@ mod tests { let (raw_name, _) = get_thread_name("(@#)").unwrap(); assert_eq!(sanitize_thread_name(1, raw_name), "1"); - assert!(get_thread_name("invalid_stat").is_err()); + get_thread_name("invalid_stat").unwrap_err(); } #[test] diff --git a/components/tikv_util/src/mpsc/batch.rs 
b/components/tikv_util/src/mpsc/batch.rs index e8d54c514a1..0415f9376af 100644 --- a/components/tikv_util/src/mpsc/batch.rs +++ b/components/tikv_util/src/mpsc/batch.rs @@ -489,7 +489,7 @@ mod tests { let mut future_slot = self.future.lock().unwrap(); if let Some(mut future) = future_slot.take() { let waker = task::waker_ref(&task); - let cx = &mut Context::from_waker(&*waker); + let cx = &mut Context::from_waker(&waker); match future.as_mut().poll(cx) { Poll::Pending => { *future_slot = Some(future); diff --git a/components/tikv_util/src/time.rs b/components/tikv_util/src/time.rs index 0ab8240c4f2..0df4ed4adac 100644 --- a/components/tikv_util/src/time.rs +++ b/components/tikv_util/src/time.rs @@ -506,7 +506,7 @@ pub type Limiter = async_speed_limit::Limiter; pub type Consume = async_speed_limit::limiter::Consume; /// ReadId to judge whether the read requests come from the same GRPC stream. -#[derive(Eq, PartialEq, Clone, Debug)] +#[derive(PartialEq, Clone, Debug)] pub struct ThreadReadId { sequence: u64, pub create_time: Timespec, diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index 56a00e01a50..f47cdaf21e9 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -221,7 +221,7 @@ mod tests { use super::*; - #[derive(Debug, PartialEq, Eq, Copy, Clone)] + #[derive(Debug, PartialEq, Copy, Clone)] enum Task { A, B, diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index 621ac730c30..ba4b1e27f41 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -29,7 +29,7 @@ use crate::{ yatp_pool::{DefaultTicker, YatpPoolBuilder}, }; -#[derive(Eq, PartialEq)] +#[derive(PartialEq)] pub enum ScheduleError { Stopped(T), Full(T), diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 1f9c74dd709..9de2d49cb07 100644 --- 
a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -187,7 +187,7 @@ impl PoolInner { } } -#[derive(Clone, Copy, PartialEq, Eq, Debug)] +#[derive(Clone, Copy, PartialEq, Debug)] pub struct Full { pub current_tasks: usize, pub max_tasks: usize, @@ -285,11 +285,11 @@ mod tests { .unwrap() }; - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); // Tick is emitted because long enough time has elapsed since pool is created spawn_future_and_wait(&pool, TICK_INTERVAL / 20); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); spawn_future_and_wait(&pool, TICK_INTERVAL / 20); spawn_future_and_wait(&pool, TICK_INTERVAL / 20); @@ -297,30 +297,30 @@ mod tests { spawn_future_and_wait(&pool, TICK_INTERVAL / 20); // So far we have only elapsed TICK_INTERVAL * 0.2, so no ticks so far. - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); // Even if long enough time has elapsed, tick is not emitted until next task // arrives thread::sleep(TICK_INTERVAL * 2); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); spawn_future_and_wait(&pool, TICK_INTERVAL / 20); assert_eq!(try_recv_tick().unwrap(), 0); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); // Tick is not emitted if there is no task thread::sleep(TICK_INTERVAL * 2); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); // Tick is emitted since long enough time has passed spawn_future_and_wait(&pool, TICK_INTERVAL / 20); assert_eq!(try_recv_tick().unwrap(), 1); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); // Tick is emitted immediately after a long task spawn_future_and_wait(&pool, TICK_INTERVAL * 2); assert_eq!(try_recv_tick().unwrap(), 2); - assert!(try_recv_tick().is_err()); + try_recv_tick().unwrap_err(); } #[test] @@ -337,18 +337,18 @@ mod tests { .thread_count(2, 2, 2) .build_future_pool(); - assert!(rx.try_recv().is_err()); + 
rx.try_recv().unwrap_err(); // Spawn two tasks, each will be processed in one worker thread. spawn_future_without_wait(&pool, TICK_INTERVAL / 2); spawn_future_without_wait(&pool, TICK_INTERVAL / 2); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); // Wait long enough time to trigger a tick. thread::sleep(TICK_INTERVAL * 2); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); // These two tasks should both trigger a tick. spawn_future_without_wait(&pool, TICK_INTERVAL); @@ -359,7 +359,7 @@ mod tests { assert_eq!(rx.try_recv().unwrap(), 0); assert_eq!(rx.try_recv().unwrap(), 1); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); } #[test] @@ -457,7 +457,7 @@ mod tests { spawn_long_time_future(&read_pool, 4, 400).unwrap(), ); // no available results (running = 4) - assert!(rx.recv_timeout(Duration::from_millis(50)).is_err()); + rx.recv_timeout(Duration::from_millis(50)).unwrap_err(); // full assert!(spawn_long_time_future(&read_pool, 5, 100).is_err()); @@ -480,7 +480,7 @@ mod tests { rx.recv().unwrap().unwrap(); // no more results - assert!(rx.recv_timeout(Duration::from_millis(500)).is_err()); + rx.recv_timeout(Duration::from_millis(500)).unwrap_err(); } #[test] diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index c37fcde86d1..e0a9b9de24f 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-#![feature(array_from_fn)] - mod metrics; mod slab; mod tls; @@ -92,7 +90,7 @@ impl RequestInfo { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Default)] pub enum RequestType { #[default] Unknown, diff --git a/components/tracker/src/slab.rs b/components/tracker/src/slab.rs index f737ee1ed1e..9b4be50796b 100644 --- a/components/tracker/src/slab.rs +++ b/components/tracker/src/slab.rs @@ -144,7 +144,7 @@ struct SlabEntry { pub const INVALID_TRACKER_TOKEN: TrackerToken = TrackerToken(u64::MAX); -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq)] pub struct TrackerToken(u64); impl TrackerToken { diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index e0570d900ac..4c784e31318 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -425,7 +425,7 @@ impl Lock { /// A specialized lock only for pessimistic lock. This saves memory for cases /// that only pessimistic locks exist. -#[derive(Clone, PartialEq, Eq)] +#[derive(Clone, PartialEq)] pub struct PessimisticLock { /// The primary key in raw format. pub primary: Box<[u8]>, @@ -695,7 +695,7 @@ mod tests { } // Test `Lock::parse()` handles incorrect input. - assert!(Lock::parse(b"").is_err()); + Lock::parse(b"").unwrap_err(); let lock = Lock::new( LockType::Lock, @@ -708,7 +708,7 @@ mod tests { TimeStamp::zero(), ); let mut v = lock.to_bytes(); - assert!(Lock::parse(&v[..4]).is_err()); + Lock::parse(&v[..4]).unwrap_err(); // Test `Lock::parse()` ignores unknown bytes. 
v.extend(b"unknown"); let l = Lock::parse(&v).unwrap(); diff --git a/components/txn_types/src/timestamp.rs b/components/txn_types/src/timestamp.rs index 593fa2e1d41..946ccfbbdcb 100644 --- a/components/txn_types/src/timestamp.rs +++ b/components/txn_types/src/timestamp.rs @@ -211,7 +211,7 @@ mod tests { fn test_split_ts() { let k = b"k"; let ts = TimeStamp(123); - assert!(Key::split_on_ts_for(k).is_err()); + Key::split_on_ts_for(k).unwrap_err(); let enc = Key::from_encoded_slice(k).append_ts(ts); let res = Key::split_on_ts_for(enc.as_encoded()).unwrap(); assert_eq!(res, (k.as_ref(), ts)); diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 1d3fd775f1b..75df337f80c 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -572,17 +572,15 @@ mod tests { #[test] fn test_flags_panic() { for _ in 0..100 { - assert!( - panic_hook::recover_safe(|| { - // r must be an invalid flags if it is not zero - let r = rand::random::() & !WriteBatchFlags::all().bits(); - WriteBatchFlags::from_bits_check(r); - if r == 0 { - panic!("panic for zero"); - } - }) - .is_err() - ); + panic_hook::recover_safe(|| { + // r must be an invalid flags if it is not zero + let r = rand::random::() & !WriteBatchFlags::all().bits(); + WriteBatchFlags::from_bits_check(r); + if r == 0 { + panic!("panic for zero"); + } + }) + .unwrap_err(); } } diff --git a/fuzz/cli.rs b/fuzz/cli.rs index 3a804be7d17..96972d94565 100644 --- a/fuzz/cli.rs +++ b/fuzz/cli.rs @@ -57,7 +57,7 @@ enum Cli { } arg_enum! 
{ - #[derive(Debug, PartialEq, Eq, Clone, Copy)] + #[derive(Debug, PartialEq, Clone, Copy)] enum Fuzzer { Afl, Honggfuzz, diff --git a/rust-toolchain b/rust-toolchain index b91c1b17580..2181086f8d2 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-05-01 +nightly-2022-07-31 diff --git a/rustfmt.toml b/rustfmt.toml index 68b82c22bd1..3de3c63c441 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -9,7 +9,6 @@ format_macro_matchers = true normalize_comments = true normalize_doc_attributes = true condense_wildcard_suffixes = true -license_template_path = "etc/license.template" newline_style = "Unix" use_field_init_shorthand = true use_try_shorthand = true diff --git a/scripts/check-docker-build b/scripts/check-docker-build index 26a53cc1ef6..6a505f31a89 100755 --- a/scripts/check-docker-build +++ b/scripts/check-docker-build @@ -2,18 +2,16 @@ # This script checks if all cargo targets have path specifications. set -euo pipefail -for i in $(find . -type f -name 'Cargo.toml'); do - # These folders are excluded from docker build. - if echo $i | grep -q "./fuzz/\|./profiler/"; then - continue - fi - for target in "test" "bench" "bin" "example"; do - matches=$(sed -n "/\[\[$target\]\]/,/^$/ p" $i) - if [ $(echo "$matches" | grep -c "[[$target]]") != $(echo "$matches" | grep -c "^path =") ]; then - echo "Path has not been specified for a $target target in $i, this will break docker build." - exit 1 - fi - done +for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/\|./profiler/'); do + for target in "test" "bench" "bin" "example"; do + # from "[[test]]" to the first trailing empty line + matches=$(sed -n "/\[\[$target\]\]/,/^$/ p" $i) + # check equal amount of "[[test]]" and "path =" + if [ $(echo "$matches" | grep -c "[[$target]]") != $(echo "$matches" | grep -c "^path =") ]; then + echo "Path has not been specified for a $target target in $i, this will break docker build." + exit 1 + fi + done done echo "Docker build check passed." 
diff --git a/scripts/check-license b/scripts/check-license new file mode 100755 index 00000000000..0b35ef67177 --- /dev/null +++ b/scripts/check-license @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Check all source files have a license header. +set -euo pipefail + +for i in $(git ls-files | grep "\.rs"); do + # first line -> match -> print line -> quit + matches=$(sed -n "1{/Copyright [0-9]\{4\} TiKV Project Authors. Licensed under Apache-2.0./p;};q;" $i) + if [ -z "${matches}" ]; then + echo "License header is missing from $i." + exit 1 + fi +done + +echo "License check passed." diff --git a/scripts/check-redact-log b/scripts/check-redact-log index 880de323700..8ec3141ad4a 100755 --- a/scripts/check-redact-log +++ b/scripts/check-redact-log @@ -3,19 +3,19 @@ set -euo pipefail function error_msg() { - echo "To print user data into info logs or error messages, use log_wrappers::Value() instead of hex::encode_upper. The former will respect \`security.redact-info-log\` config and filter out user data from info log if needed. Otherwise, use \`log_wrappers::hex_encode_upper\` to get around the lint error. See https://github.com/tikv/tikv/pull/9250 for more information." >&2 + echo "To print user data into info logs or error messages, use log_wrappers::Value() instead of hex::encode_upper. The former will respect \`security.redact-info-log\` config and filter out user data from info log if needed. Otherwise, use \`log_wrappers::hex_encode_upper\` to get around the lint error. See https://github.com/tikv/tikv/pull/9250 for more information." >&2 } if [[ "$(uname)" == "Darwin" ]] ; then - if grep -r -n --color=always --include '*.rs' --exclude hex.rs --exclude-dir tikv-ctl --exclude-dir target 'encode_upper' . | grep -v log_wrappers ; then - error_msg - exit 1 - fi + if grep -r -n --color=always --include '*.rs' --exclude hex.rs --exclude-dir tikv-ctl --exclude-dir target 'encode_upper' . 
| grep -v log_wrappers ; then + error_msg + exit 1 + fi else if grep -r -n -P '(?/dev/null +# cd $pkg +# cargo clippy --all-targets --no-default-features \ +# --features "${TIKV_ENABLE_FEATURES}" -- "${ALLOWED_CLIPPY_LINTS[@]}" +# cd - >/dev/null # done # for pkg in "fuzz"; do -# cd $pkg -# cargo clippy --all-targets -- "${ALLOWED_CLIPPY_LINTS[@]}" -# cd - >/dev/null +# cd $pkg +# cargo clippy --all-targets -- "${ALLOWED_CLIPPY_LINTS[@]}" +# cd - >/dev/null # done diff --git a/scripts/run-cargo.sh b/scripts/run-cargo.sh index 8c309645f6a..0002c054927 100644 --- a/scripts/run-cargo.sh +++ b/scripts/run-cargo.sh @@ -16,7 +16,7 @@ set -e if [[ -e .cargo/config ]]; then - rm .cargo/config + rm .cargo/config fi args="" diff --git a/src/config.rs b/src/config.rs index 80e763e6981..8a9bf2d2468 100644 --- a/src/config.rs +++ b/src/config.rs @@ -213,7 +213,7 @@ impl TitanCfConfig { } } -#[derive(Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] struct BackgroundJobLimits { max_background_jobs: u32, max_background_flushes: u32, @@ -1935,26 +1935,26 @@ mod unified_read_pool_tests { min_thread_count: 0, ..cfg }; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let invalid_cfg = UnifiedReadPoolConfig { min_thread_count: 2, max_thread_count: 1, ..cfg }; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let invalid_cfg = UnifiedReadPoolConfig { stack_size: ReadableSize::mb(1), ..cfg }; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let invalid_cfg = UnifiedReadPoolConfig { max_tasks_per_worker: 1, ..cfg }; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let invalid_cfg = UnifiedReadPoolConfig { min_thread_count: 1, max_thread_count: cmp::max( @@ -1963,7 +1963,7 @@ mod unified_read_pool_tests { ) + 1, ..cfg }; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); } } @@ -2258,7 +2258,7 @@ 
mod readpool_tests { stack_size: ReadableSize::mb(0), max_tasks_per_worker: 0, }; - assert!(unified.validate().is_err()); + unified.validate().unwrap_err(); let storage = StorageReadPoolConfig { use_unified_pool: Some(false), ..Default::default() @@ -2285,7 +2285,7 @@ mod readpool_tests { high_concurrency: 0, ..Default::default() }; - assert!(storage.validate().is_err()); + storage.validate().unwrap_err(); let coprocessor = CoprReadPoolConfig { use_unified_pool: Some(false), ..Default::default() @@ -2296,7 +2296,7 @@ mod readpool_tests { coprocessor, }; assert!(!invalid_cfg.is_unified_pool_enabled()); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); } #[test] @@ -2307,7 +2307,7 @@ mod readpool_tests { max_thread_count: 0, ..Default::default() }; - assert!(unified.validate().is_err()); + unified.validate().unwrap_err(); let storage = StorageReadPoolConfig { use_unified_pool: Some(true), ..Default::default() @@ -2322,7 +2322,7 @@ mod readpool_tests { }; cfg.adjust_use_unified_pool(); assert!(cfg.is_unified_pool_enabled()); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } #[test] @@ -2366,7 +2366,7 @@ mod readpool_tests { ..Default::default() }; assert!(cfg.is_unified_pool_enabled()); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg.storage.low_concurrency = 1; cfg.validate().unwrap(); @@ -2387,7 +2387,7 @@ mod readpool_tests { ..Default::default() }; assert!(cfg.is_unified_pool_enabled()); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg.coprocessor.low_concurrency = 1; cfg.validate().unwrap(); } @@ -2711,7 +2711,7 @@ pub struct LogConfig { } /// LogLevel is a wrapper type of `slog::Level` -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq)] pub struct LogLevel(slog::Level); impl From for slog::Level { @@ -4131,7 +4131,7 @@ mod tests { let mut last_cfg = TiKvConfig::default(); tikv_cfg.rocksdb.wal_dir = "/data/wal_dir".to_owned(); 
tikv_cfg.validate().unwrap(); - assert!(tikv_cfg.check_critical_cfg_with(&last_cfg).is_err()); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); last_cfg.rocksdb.wal_dir = "/data/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); @@ -4141,7 +4141,7 @@ mod tests { let mut last_cfg = TiKvConfig::default(); tikv_cfg.storage.data_dir = "/data1".to_owned(); tikv_cfg.validate().unwrap(); - assert!(tikv_cfg.check_critical_cfg_with(&last_cfg).is_err()); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); last_cfg.storage.data_dir = "/data1".to_owned(); tikv_cfg.validate().unwrap(); @@ -4155,7 +4155,7 @@ mod tests { tikv_cfg.raft_engine.mut_config().dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); - assert!(tikv_cfg.check_critical_cfg_with(&last_cfg).is_err()); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); last_cfg.raft_engine.mut_config().dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); @@ -4169,7 +4169,7 @@ mod tests { tikv_cfg.raftdb.wal_dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); - assert!(tikv_cfg.check_critical_cfg_with(&last_cfg).is_err()); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); last_cfg.raftdb.wal_dir = "/raft/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); @@ -4177,7 +4177,7 @@ mod tests { tikv_cfg.raft_store.raftdb_path = "/raft_path".to_owned(); tikv_cfg.validate().unwrap(); - assert!(tikv_cfg.check_critical_cfg_with(&last_cfg).is_err()); + tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); last_cfg.raft_store.raftdb_path = "/raft_path".to_owned(); tikv_cfg.validate().unwrap(); @@ -4314,7 +4314,7 @@ mod tests { tikv_cfg.pd.endpoints = vec!["".to_owned()]; let dur = tikv_cfg.raft_store.raft_heartbeat_interval(); tikv_cfg.server.grpc_keepalive_time = ReadableDuration(dur); - assert!(tikv_cfg.validate().is_err()); + tikv_cfg.validate().unwrap_err(); tikv_cfg.server.grpc_keepalive_time = ReadableDuration(dur * 2); tikv_cfg.validate().unwrap(); } @@ -4328,7 
+4328,7 @@ mod tests { tikv_cfg.rocksdb.writecf.block_size = ReadableSize::gb(10); tikv_cfg.rocksdb.raftcf.block_size = ReadableSize::gb(10); tikv_cfg.raftdb.defaultcf.block_size = ReadableSize::gb(10); - assert!(tikv_cfg.validate().is_err()); + tikv_cfg.validate().unwrap_err(); tikv_cfg.rocksdb.defaultcf.block_size = ReadableSize::kb(10); tikv_cfg.rocksdb.lockcf.block_size = ReadableSize::kb(10); tikv_cfg.rocksdb.writecf.block_size = ReadableSize::kb(10); @@ -4442,7 +4442,7 @@ mod tests { for (name, value) in cases { let mut change = HashMap::new(); change.insert(name, value); - assert!(to_config_change(change).is_err()); + to_config_change(change).unwrap_err(); } } @@ -4610,21 +4610,15 @@ mod tests { cfg_controller.register(Module::ResolvedTs, Box::new(TestConfigManager(tx))); // Return error if try to update not support config or unknow config - assert!( - cfg_controller - .update_config("resolved-ts.enable", "false") - .is_err() - ); - assert!( - cfg_controller - .update_config("resolved-ts.scan-lock-pool-size", "10") - .is_err() - ); - assert!( - cfg_controller - .update_config("resolved-ts.xxx", "false") - .is_err() - ); + cfg_controller + .update_config("resolved-ts.enable", "false") + .unwrap_err(); + cfg_controller + .update_config("resolved-ts.scan-lock-pool-size", "10") + .unwrap_err(); + cfg_controller + .update_config("resolved-ts.xxx", "false") + .unwrap_err(); let mut resolved_ts_cfg = cfg_controller.get_current().resolved_ts; // Default value @@ -4644,11 +4638,9 @@ mod tests { ); // Return error if try to update `advance-ts-interval` to an invalid value - assert!( - cfg_controller - .update_config("resolved-ts.advance-ts-interval", "0m") - .is_err() - ); + cfg_controller + .update_config("resolved-ts.advance-ts-interval", "0m") + .unwrap_err(); assert_eq!( resolved_ts_cfg.advance_ts_interval, ReadableDuration::millis(100) @@ -4738,11 +4730,9 @@ mod tests { // Can not update block cache through storage module // when shared block cache is disabled - 
assert!( - cfg_controller - .update_config("storage.block-cache.capacity", "512MB") - .is_err() - ); + cfg_controller + .update_config("storage.block-cache.capacity", "512MB") + .unwrap_err(); } #[test] @@ -4778,11 +4768,9 @@ mod tests { let db = storage.get_engine().get_rocksdb(); // Can not update shared block cache through rocksdb module - assert!( - cfg_controller - .update_config("rocksdb.defaultcf.block-cache-size", "256MB") - .is_err() - ); + cfg_controller + .update_config("rocksdb.defaultcf.block-cache-size", "256MB") + .unwrap_err(); cfg_controller .update_config("storage.block-cache.capacity", "256MB") @@ -4809,11 +4797,9 @@ mod tests { LogLevel(Level::Warning) ); - assert!( - cfg_controller - .update_config("log.level", "invalid") - .is_err() - ); + cfg_controller + .update_config("log.level", "invalid") + .unwrap_err(); assert_eq!( cfg_controller.get_current().log.level, LogLevel(Level::Warning) @@ -4882,7 +4868,7 @@ mod tests { res.unwrap(); (size, std::cmp::max(size / 2, 1)) } else { - assert!(res.is_err()); + res.unwrap_err(); (origin_pool_size, origin_pool_size_high) }; assert_eq!( @@ -4938,11 +4924,9 @@ mod tests { assert_eq!(cfg_controller.get_current(), cfg); // u64::MAX ns casts to 213503d. - assert!( - cfg_controller - .update_config("quota.max-delay-duration", "213504d") - .is_err() - ); + cfg_controller + .update_config("quota.max-delay-duration", "213504d") + .unwrap_err(); assert_eq!(cfg_controller.get_current(), cfg); cfg_controller @@ -5211,7 +5195,7 @@ mod tests { // Test validating memory_usage_limit when it's greater than max. cfg.memory_usage_limit = Some(ReadableSize(SysQuota::memory_limit_in_bytes() * 2)); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); // Test memory_usage_limit is based on block cache size if it's not configured. 
cfg.memory_usage_limit = None; @@ -5250,7 +5234,7 @@ mod tests { let mut cfg = TiKvConfig::default(); cfg.storage.data_dir = tmp_path_string_generate!(tmp_path, "data"); cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "db"); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } { @@ -5259,7 +5243,7 @@ mod tests { cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); cfg.rocksdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } { @@ -5268,14 +5252,14 @@ mod tests { cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); cfg.raftdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "kvdb", "db"); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } { let mut cfg = TiKvConfig::default(); cfg.rocksdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "wal"); cfg.raftdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "wal"); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } { @@ -5643,7 +5627,7 @@ mod tests { let r = panic_hook::recover_safe(|| { let _: DefaultCfConfig = toml::from_str(bad_string_config).unwrap(); }); - assert!(r.is_err()); + r.unwrap_err(); let bad_string_config = r#" compaction-style = 4 @@ -5651,7 +5635,7 @@ mod tests { let r = panic_hook::recover_safe(|| { let _: DefaultCfConfig = toml::from_str(bad_string_config).unwrap(); }); - assert!(r.is_err()); + r.unwrap_err(); // rate-limiter-mode default values is 2 let config_str = r#" diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index d07d9bd5bd6..677490a4b31 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -887,7 +887,7 @@ mod tests { None, PerfLevel::EnableCount, ); - assert!(block_on(copr.handle_unary_request(outdated_req_ctx, handler_builder)).is_err()); + 
block_on(copr.handle_unary_request(outdated_req_ctx, handler_builder)).unwrap_err(); } #[test] @@ -1038,7 +1038,7 @@ mod tests { // verify for _ in 2..5 { - assert!(rx.recv().unwrap().is_err()); + rx.recv().unwrap().unwrap_err(); } for i in 0..2 { let resp = rx.recv().unwrap().unwrap(); diff --git a/src/coprocessor/interceptors/concurrency_limiter.rs b/src/coprocessor/interceptors/concurrency_limiter.rs index c77eab86f16..590dd5d7180 100644 --- a/src/coprocessor/interceptors/concurrency_limiter.rs +++ b/src/coprocessor/interceptors/concurrency_limiter.rs @@ -152,7 +152,7 @@ mod tests { // Light tasks should run without any semaphore permit let smp2 = smp.clone(); tokio::spawn(timeout(Duration::from_millis(250), async move { - limit_concurrency(work(2), &*smp2, Duration::from_millis(500)).await + limit_concurrency(work(2), &smp2, Duration::from_millis(500)).await })) .await .unwrap() @@ -164,7 +164,7 @@ mod tests { let smp2 = smp.clone(); let mut t1 = tokio::spawn( - async move { limit_concurrency(work(8), &*smp2, Duration::default()).await }, + async move { limit_concurrency(work(8), &smp2, Duration::default()).await }, ) .fuse(); @@ -172,7 +172,7 @@ mod tests { let smp2 = smp.clone(); let mut t2 = tokio::spawn( - async move { limit_concurrency(work(2), &*smp2, Duration::default()).await }, + async move { limit_concurrency(work(2), &smp2, Duration::default()).await }, ) .fuse(); diff --git a/src/server/config.rs b/src/server/config.rs index 88d167d2e64..1959b77df00 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -495,27 +495,27 @@ mod tests { let mut invalid_cfg = cfg.clone(); invalid_cfg.concurrent_send_snap_limit = 0; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let mut invalid_cfg = cfg.clone(); invalid_cfg.concurrent_recv_snap_limit = 0; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let mut invalid_cfg = cfg.clone(); invalid_cfg.end_point_recursion_limit = 0; - 
assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let mut invalid_cfg = cfg.clone(); invalid_cfg.grpc_memory_pool_quota = ReadableSize::mb(0); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); let mut invalid_cfg = cfg.clone(); invalid_cfg.end_point_request_max_handle_duration = ReadableDuration::secs(0); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); invalid_cfg = Config::default(); invalid_cfg.addr = "0.0.0.0:1000".to_owned(); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); invalid_cfg.advertise_addr = "127.0.0.1:1000".to_owned(); invalid_cfg.validate().unwrap(); @@ -526,25 +526,25 @@ mod tests { } assert!(invalid_cfg.advertise_status_addr.is_empty()); invalid_cfg.advertise_status_addr = "0.0.0.0:1000".to_owned(); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); invalid_cfg = Config::default(); invalid_cfg.advertise_addr = "127.0.0.1:1000".to_owned(); invalid_cfg.advertise_status_addr = "127.0.0.1:1000".to_owned(); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); invalid_cfg = Config::default(); invalid_cfg.max_grpc_send_msg_len = 0; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); invalid_cfg = Config::default(); invalid_cfg.grpc_stream_initial_window_size = ReadableSize(i32::MAX as u64 + 1); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); cfg.labels.insert("k1".to_owned(), "v1".to_owned()); cfg.validate().unwrap(); cfg.labels.insert("k2".to_owned(), "v2?".to_owned()); - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } #[test] diff --git a/src/server/debug.rs b/src/server/debug.rs index 831a2b85255..77f6962deb9 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -913,10 +913,10 @@ fn dump_default_cf_properties( let sst_files = collection .iter() .map(|(k, _)| { - 
Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k) + .unwrap_or(k) .to_string() }) .collect::>() @@ -950,10 +950,10 @@ fn dump_write_cf_properties( let sst_files = collection .iter() .map(|(k, _)| { - Path::new(&*k) + Path::new(k) .file_name() .map(|f| f.to_str().unwrap()) - .unwrap_or(&*k) + .unwrap_or(k) .to_string() }) .collect::>() @@ -1987,7 +1987,7 @@ mod tests { region.set_start_key(b"k".to_vec()); region.set_end_key(b"z".to_vec()); - assert!(debugger.recreate_region(region.clone()).is_err()); + debugger.recreate_region(region.clone()).unwrap_err(); remove_region_state(1); remove_region_state(2); @@ -1996,7 +1996,7 @@ mod tests { region.set_start_key(b"z".to_vec()); region.set_end_key(b"".to_vec()); - assert!(debugger.recreate_region(region).is_err()); + debugger.recreate_region(region).unwrap_err(); } #[test] diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 5d26958ea41..cf988f9da37 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -266,7 +266,7 @@ mod tests { assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); let tablet_path = factory.tablet_path(1, 10); let result = factory.open_tablet_raw(&tablet_path, false); - assert!(result.is_err()); + result.unwrap_err(); factory .set_shared_block_cache_capacity(1024 * 1024) .unwrap(); @@ -290,7 +290,7 @@ mod tests { assert!(factory.is_tombstoned(1, 20)); factory.destroy_tablet(1, 20).unwrap(); let result = factory.open_tablet(1, 20); - assert!(result.is_err()); + result.unwrap_err(); assert!(!factory.is_single_engine()); } diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index bcfe87d6783..b2a6a9d02dc 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -321,7 +321,7 @@ impl GcMan self.wait_for_next_safe_point()?; // Don't need to run GC any more if compaction filter is enabled. 
- if !is_compaction_filter_allowed(&*self.cfg_tracker.value(), &self.feature_gate) { + if !is_compaction_filter_allowed(&self.cfg_tracker.value(), &self.feature_gate) { set_status_metrics(GcManagerState::Working); self.gc_a_round()?; if let Some(on_finished) = self.cfg.post_a_round_of_gc.as_ref() { @@ -451,7 +451,7 @@ impl GcMan // periodically. If it's updated, rewinding will happen. loop { self.gc_manager_ctx.check_stopped()?; - if is_compaction_filter_allowed(&*self.cfg_tracker.value(), &self.feature_gate) { + if is_compaction_filter_allowed(&self.cfg_tracker.value(), &self.feature_gate) { return Ok(()); } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index dcdb075d256..131efd68fac 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -2064,17 +2064,15 @@ mod tests { } // Then, it will fail to schedule another gc command. let (tx, rx) = mpsc::channel(); - assert!( - gc_worker - .gc( - TimeStamp::from(1), - Box::new(move |res| { - tx.send(res).unwrap(); - }) - ) - .is_err() - ); - assert!(rx.recv().unwrap().is_err()); + gc_worker + .gc( + TimeStamp::from(1), + Box::new(move |res| { + tx.send(res).unwrap(); + }), + ) + .unwrap_err(); + rx.recv().unwrap().unwrap_err(); let (tx, rx) = mpsc::channel(); // When the gc_worker is full, scheduling an unsafe destroy range task should be diff --git a/src/server/lock_manager/client.rs b/src/server/lock_manager/client.rs index c71bec0b63a..ba4e77810c3 100644 --- a/src/server/lock_manager/client.rs +++ b/src/server/lock_manager/client.rs @@ -59,14 +59,13 @@ impl Client { let (sink, receiver) = self.client.detect().unwrap(); let send_task = Box::pin(async move { let mut sink = sink.sink_map_err(Error::Grpc); - let res = sink - .send_all(&mut rx.map(|r| Ok((r, WriteFlags::default())))) + + sink.send_all(&mut rx.map(|r| Ok((r, WriteFlags::default())))) .await .map(|_| { info!("cancel detect sender"); sink.get_mut().cancel(); - }); - res + }) }); self.sender 
= Some(tx); diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index ab60f969493..de72a642837 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -128,9 +128,7 @@ fn on_write_result(mut write_resp: WriteResponse) -> Result> where S: Snapshot, { - if let Err(e) = check_raft_cmd_response(&mut write_resp.response) { - return Err(e); - } + check_raft_cmd_response(&mut write_resp.response)?; let resps = write_resp.response.take_responses(); Ok(CmdRes::Resp(resps.into())) } @@ -139,9 +137,7 @@ fn on_read_result(mut read_resp: ReadResponse) -> Result> where S: Snapshot, { - if let Err(e) = check_raft_cmd_response(&mut read_resp.response) { - return Err(e); - } + check_raft_cmd_response(&mut read_resp.response)?; let resps = read_resp.response.take_responses(); if let Some(mut snapshot) = read_resp.snapshot { snapshot.term = NonZeroU64::new(read_resp.response.get_header().get_current_term()); @@ -201,7 +197,7 @@ where req: Request, cb: Callback>, ) -> Result<()> { - let mut header = self.new_request_header(&*ctx.pb_ctx); + let mut header = self.new_request_header(ctx.pb_ctx); if ctx.pb_ctx.get_stale_read() && !ctx.start_ts.is_zero() { let mut data = [0u8; 8]; (&mut data[..]) diff --git a/src/server/resolve.rs b/src/server/resolve.rs index 404cee0e613..acf60ae783f 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -270,7 +270,7 @@ mod tests { fn test_resolve_store_state_tombstone() { let store = new_store(STORE_ADDR, metapb::StoreState::Tombstone); let runner = new_runner(store); - assert!(runner.get_address(0).is_err()); + runner.get_address(0).unwrap_err(); } #[test] diff --git a/src/server/server.rs b/src/server/server.rs index 5c0ace9d7b1..f202e30e761 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -320,7 +320,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En server.shutdown(); } if let Some(pool) = self.stats_pool.take() { - let _ = pool.shutdown_background(); + pool.shutdown_background(); } let _ = 
self.yatp_read_pool.take(); self.health_service.shutdown(); diff --git a/src/server/snap.rs b/src/server/snap.rs index b785c455921..e88fbd21fc9 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -186,7 +186,7 @@ pub fn send_snap( match recv_result { Ok(_) => { fail_point!("snapshot_delete_after_send"); - mgr.delete_snapshot(&key, &*chunks.snap, true); + mgr.delete_snapshot(&key, &chunks.snap, true); // TODO: improve it after rustc resolves the bug. // Call `info` in the closure directly will cause rustc // panic with `Cannot create local mono-item for DefId`. @@ -292,7 +292,7 @@ fn recv_snap + 'static>( defer!(snap_mgr.deregister(&context_key, &SnapEntry::Receiving)); while let Some(item) = stream.next().await { fail_point!("receiving_snapshot_net_error", |_| { - return Err(box_err!("{} failed to receive snapshot", context_key)); + Err(box_err!("{} failed to receive snapshot", context_key)) }); let mut chunk = item?; let data = chunk.take_data(); diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index 446711bef30..3419c7df0c8 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -278,7 +278,7 @@ where { let mut id = 0; while let Some(res) = period.next().await { - let _ = res?; + res?; id += 1; let path = format!("{}/{:0>6}{}", dir, id, HEAP_PROFILE_SUFFIX); dump_prof(&path).map_err(|e| format!("dump_prof: {}", e))?; @@ -394,7 +394,7 @@ mod tests { assert_eq!(block_on(res2).unwrap().unwrap_err(), expected); drop(tx1); - assert!(block_on(res1).unwrap().is_err()); + block_on(res1).unwrap().unwrap_err(); } #[test] @@ -439,7 +439,7 @@ mod tests { let (mut tx, rx) = mpsc::channel(1); let res = rt.spawn(activate_heap_profile(rx, std::env::temp_dir(), || {})); block_on(tx.send(Err("test".to_string()))).unwrap(); - assert!(block_on(res).unwrap().is_err()); + block_on(res).unwrap().unwrap_err(); // Test heap profiling can be activated again. 
let (tx, rx) = sync_channel::(1); diff --git a/src/storage/config.rs b/src/storage/config.rs index 4bfc664629f..7f2e6820201 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -386,10 +386,10 @@ mod tests { cfg.validate().unwrap(); cfg.scheduler_worker_pool_size = 0; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); cfg.scheduler_worker_pool_size = max_pool_size + 1; - assert!(cfg.validate().is_err()); + cfg.validate().unwrap_err(); } #[test] diff --git a/src/storage/mod.rs b/src/storage/mod.rs index ef9aecf02ad..966b6095310 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -8931,7 +8931,7 @@ mod tests { .unwrap(); // DummyLockManager just drops the callback, so it will fail to receive // anything. - assert!(rx.recv().is_err()); + rx.recv().unwrap_err(); let (tx, rx) = channel(); storage diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 07d0093e71c..1a554a4410b 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -488,7 +488,7 @@ pub mod tests { if check_lock(&mut reader, key, ts).is_err() { return; } - assert!(reader.get(key, ts).is_err()); + reader.get(key, ts).unwrap_err(); } pub fn must_locked(engine: &E, key: &[u8], start_ts: impl Into) -> Lock { diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 434d0948310..7c521bb5952 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -465,7 +465,7 @@ mod tests { } fn must_get_err(point_getter: &mut PointGetter, key: &[u8]) { - assert!(point_getter.get(&Key::from_raw(key)).is_err()); + point_getter.get(&Key::from_raw(key)).unwrap_err(); } fn assert_seek_next_prev(stat: &CfStatistics, seek: usize, next: usize, prev: usize) { @@ -1152,15 +1152,15 @@ mod tests { (b"k9", None), ]; - for (k, v) in &expected_results { + for (k, v) in expected_results.iter().copied() { let mut single_getter = new_point_getter(&engine, 40.into()); - let value = 
single_getter.get(&Key::from_raw(*k)).unwrap(); + let value = single_getter.get(&Key::from_raw(k)).unwrap(); assert_eq!(value, v.map(|v| v.to_vec())); } let mut getter = new_point_getter(&engine, 40.into()); - for (k, v) in &expected_results { - let value = getter.get(&Key::from_raw(*k)).unwrap(); + for (k, v) in expected_results { + let value = getter.get(&Key::from_raw(k)).unwrap(); assert_eq!(value, v.map(|v| v.to_vec())); } } diff --git a/src/storage/mvcc/reader/scanner/backward.rs b/src/storage/mvcc/reader/scanner/backward.rs index 7e3d677ea52..6ade614e848 100644 --- a/src/storage/mvcc/reader/scanner/backward.rs +++ b/src/storage/mvcc/reader/scanner/backward.rs @@ -1499,7 +1499,7 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(key2), val22.to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); // Scanner has met a lock though lock.ts > read_ts. let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1529,6 +1529,6 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(key1), val1.to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); } } diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index d2c5e8b6a1b..a7a839cf2e7 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -1550,7 +1550,7 @@ mod latest_kv_tests { scanner.next().unwrap(), Some((Key::from_raw(key1), val1.to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); // Scanner has met a lock though lock.ts > read_ts. 
let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1583,7 +1583,7 @@ mod latest_kv_tests { scanner.next().unwrap(), Some((Key::from_raw(key5), val5.to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); } } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 3dd95d4045d..1517ad67c78 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -446,7 +446,7 @@ pub(crate) mod tests { must_commit(&engine, k1, 1, 2); // "k1" already exist, returns AlreadyExist error. - assert!(try_prewrite_check_not_exists(&engine, k1, k1, 3).is_err()); + try_prewrite_check_not_exists(&engine, k1, k1, 3).unwrap_err(); // Delete "k1" must_prewrite_delete(&engine, k1, k1, 4); @@ -461,7 +461,7 @@ pub(crate) mod tests { // Rollback must_prewrite_put(&engine, k1, v3, k1, 9); must_rollback(&engine, k1, 9, false); - assert!(try_prewrite_check_not_exists(&engine, k1, k1, 10).is_err()); + try_prewrite_check_not_exists(&engine, k1, k1, 10).unwrap_err(); // Delete "k1" again must_prewrite_delete(&engine, k1, k1, 11); @@ -479,7 +479,7 @@ pub(crate) mod tests { fn test_mvcc_txn_pessmistic_prewrite_check_not_exist() { let engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; - assert!(try_pessimistic_prewrite_check_not_exists(&engine, k, k, 3).is_err()) + try_pessimistic_prewrite_check_not_exists(&engine, k, k, 3).unwrap_err(); } #[test] @@ -792,17 +792,15 @@ pub(crate) mod tests { let cm = ConcurrencyManager::new(10.into()); let mut txn = MvccTxn::new(5.into(), cm.clone()); let mut reader = SnapshotReader::new(5.into(), snapshot, true); - assert!( - prewrite( - &mut txn, - &mut reader, - &txn_props(5.into(), key, CommitKind::TwoPc, None, 0, false), - Mutation::make_put(Key::from_raw(key), value.to_vec()), - &None, - false, - ) - .is_err() - ); + prewrite( + &mut txn, + &mut reader, + &txn_props(5.into(), key, CommitKind::TwoPc, None, 0, false), + Mutation::make_put(Key::from_raw(key), value.to_vec()), + &None, + false, + 
) + .unwrap_err(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut txn = MvccTxn::new(5.into(), cm); @@ -990,7 +988,7 @@ pub(crate) mod tests { // start_ts = 5, commit_ts = 15, Lock must_get(&engine, k, 19, v); - assert!(try_prewrite_insert(&engine, k, v, k, 20).is_err()); + try_prewrite_insert(&engine, k, v, k, 20).unwrap_err(); } #[test] diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 2f3a2c84b11..f80e61f93ad 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -213,7 +213,7 @@ pub fn make_rollback( } } -#[derive(Debug, Copy, Clone, Eq, PartialEq)] +#[derive(Debug, Copy, Clone, PartialEq)] pub enum MissingLockAction { Rollback, ProtectedRollback, diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index 8435479991e..456757285e0 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -151,7 +151,7 @@ pub mod tests { let cm = ConcurrencyManager::new(start_ts); let mut txn = MvccTxn::new(start_ts, cm); let mut reader = SnapshotReader::new(start_ts, snapshot, true); - assert!(commit(&mut txn, &mut reader, Key::from_raw(key), commit_ts.into()).is_err()); + commit(&mut txn, &mut reader, Key::from_raw(key), commit_ts.into()).unwrap_err(); } #[cfg(test)] diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index acbd7a7f1a7..e5e4b57054c 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -486,17 +486,15 @@ pub fn must_prewrite_lock_err( let mut txn = MvccTxn::new(ts, cm); let mut reader = SnapshotReader::new(ts, snapshot, true); - assert!( - prewrite( - &mut txn, - &mut reader, - &default_txn_props(ts, pk, TimeStamp::zero()), - Mutation::make_lock(Key::from_raw(key)), - &None, - false, - ) - .is_err() - ); + prewrite( + &mut txn, + &mut reader, + &default_txn_props(ts, pk, TimeStamp::zero()), + 
Mutation::make_lock(Key::from_raw(key)), + &None, + false, + ) + .unwrap_err(); } pub fn must_pessimistic_prewrite_lock( @@ -539,14 +537,12 @@ pub fn must_rollback_err(engine: &E, key: &[u8], start_ts: impl Into< let cm = ConcurrencyManager::new(start_ts); let mut txn = MvccTxn::new(start_ts, cm); let mut reader = SnapshotReader::new(start_ts, snapshot, true); - assert!( - txn::cleanup( - &mut txn, - &mut reader, - Key::from_raw(key), - TimeStamp::zero(), - false, - ) - .is_err() - ); + txn::cleanup( + &mut txn, + &mut reader, + Key::from_raw(key), + TimeStamp::zero(), + false, + ) + .unwrap_err(); } diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 8cb901187dd..2b36d6d8821 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -38,7 +38,7 @@ const MAX_THROTTLE_SPEED: f64 = 200.0 * 1024.0 * 1024.0; // 200MB const EMA_FACTOR: f64 = 0.6; // EMA stands for Exponential Moving Average -#[derive(Eq, PartialEq, Debug)] +#[derive(PartialEq, Debug)] enum Trend { Increasing, Decreasing, diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index c85bd828c08..0cd6c5b173b 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -1084,7 +1084,9 @@ mod tests { store.get(&Key::from_raw(b"ca"), &mut statistics).unwrap(), Some(b"hello".to_vec()) ); - assert!(store.get(&Key::from_raw(b"bba"), &mut statistics).is_err()); + store + .get(&Key::from_raw(b"bba"), &mut statistics) + .unwrap_err(); assert_eq!( store.get(&Key::from_raw(b"bbaa"), &mut statistics).unwrap(), None @@ -1115,7 +1117,9 @@ mod tests { store.get(&Key::from_raw(b"ab"), &mut statistics).unwrap(), Some(b"bar".to_vec()) ); - assert!(store.get(&Key::from_raw(b"zz"), &mut statistics).is_err()); + store + .get(&Key::from_raw(b"zz"), &mut statistics) + .unwrap_err(); assert_eq!( store.get(&Key::from_raw(b"z"), &mut 
statistics).unwrap(), Some(b"beta".to_vec()) @@ -1147,7 +1151,7 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(b"bb"), b"alphaalpha".to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); assert_eq!( scanner.next().unwrap(), Some((Key::from_raw(b"ca"), b"hello".to_vec())) @@ -1156,13 +1160,13 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(b"z"), b"beta".to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); // note: mvcc impl does not guarantee to work any more after meeting a non lock // error assert_eq!(scanner.next().unwrap(), None); let mut scanner = store.scanner(true, false, false, None, None).unwrap(); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); // note: mvcc impl does not guarantee to work any more after meeting a non lock // error assert_eq!( @@ -1173,7 +1177,7 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(b"ca"), b"hello".to_vec())) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); assert_eq!( scanner.next().unwrap(), Some((Key::from_raw(b"bb"), b"alphaalpha".to_vec())) @@ -1214,13 +1218,13 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(b"bb"), vec![])) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); assert_eq!( scanner.next().unwrap(), Some((Key::from_raw(b"ca"), vec![])) ); assert_eq!(scanner.next().unwrap(), Some((Key::from_raw(b"z"), vec![]))); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); // note: mvcc impl does not guarantee to work any more after meeting a non lock // error assert_eq!(scanner.next().unwrap(), None); @@ -1278,7 +1282,7 @@ mod tests { scanner.next().unwrap(), Some((Key::from_raw(b"bb"), vec![])) ); - assert!(scanner.next().is_err()); + scanner.next().unwrap_err(); assert_eq!(scanner.next().unwrap(), None); let mut scanner = store @@ -1316,7 +1320,7 @@ mod tests { Some(Key::from_raw(b"bba")), ) .unwrap(); - assert!(scanner.next().is_err()); + 
scanner.next().unwrap_err(); assert_eq!( scanner.next().unwrap(), Some((Key::from_raw(b"bb"), vec![])) diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index 70194b194ac..d4219808af0 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -270,7 +270,7 @@ fn test_redundant_conf_change_by_snapshot() { fail::cfg("apply_on_conf_change_3_1", "off").unwrap(); cluster.must_transfer_leader(1, new_peer(3, 3)); - assert!(rx.try_recv().is_err()); + rx.try_recv().unwrap_err(); fail::remove("apply_on_conf_change_3_1"); } diff --git a/tests/failpoints/cases/test_encryption.rs b/tests/failpoints/cases/test_encryption.rs index 502e31afff9..8b73188e569 100644 --- a/tests/failpoints/cases/test_encryption.rs +++ b/tests/failpoints/cases/test_encryption.rs @@ -23,7 +23,7 @@ fn test_file_dict_file_record_corrupted() { fail::remove("file_dict_log_append_incomplete"); file_dict_file.insert("info2", &info2).unwrap(); // Intermediate record damage is not allowed. - assert!(file_dict_file.recovery().is_err()); + file_dict_file.recovery().unwrap_err(); let mut file_dict_file = FileDictionaryFile::new( tempdir.path(), diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 8ef0f08f19e..6bbed4ac641 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -79,5 +79,5 @@ fn test_break_leadership_on_restart() { // Peer 3 shouldn't start a new election, otherwise the leader may step down // incorrectly. 
- assert!(rx.recv_timeout(Duration::from_secs(2)).is_err()); + rx.recv_timeout(Duration::from_secs(2)).unwrap_err(); } diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 92785fcfa1e..32bd2f05228 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1526,7 +1526,7 @@ fn test_retry_pending_prepare_merge_fail() { let rx = cluster.async_put(b"k1", b"v11").unwrap(); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); - assert!(rx.recv_timeout(Duration::from_millis(200)).is_err()); + rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); // Then, start merging. PrepareMerge should become pending because applied_index // is smaller than proposed_index. diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 22871994f82..eb22ac29e45 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -129,7 +129,7 @@ fn test_watch_global_config_on_closed_server() { let client = Arc::new(client); use futures::StreamExt; let j = std::thread::spawn(move || { - let _ = futures::executor::block_on(async move { + futures::executor::block_on(async move { let mut r = client.watch_global_config().unwrap(); let mut i: usize = 0; while let Some(r) = r.next().await { diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index 7a6da017d99..5fe71834e45 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ b/tests/failpoints/cases/test_replica_read.rs @@ -59,7 +59,7 @@ fn test_wait_for_apply_index() { .async_command_on_node(3, request, cb) .unwrap(); // Must timeout here - assert!(rx.recv_timeout(Duration::from_millis(500)).is_err()); + rx.recv_timeout(Duration::from_millis(500)).unwrap_err(); fail::remove("on_apply_write_cmd"); // After write cmd applied, the follower read will be executed. 
@@ -794,7 +794,7 @@ fn test_read_index_lock_checking_on_false_leader() { // peer 1. But the lease of peer 1 has expired and it cannot get majority of // heartbeat. So, we cannot get the result here. let resp = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k1", true); - assert!(resp.recv_timeout(Duration::from_millis(300)).is_err()); + resp.recv_timeout(Duration::from_millis(300)).unwrap_err(); // Now, restore the network partition. Peer 1 should now become follower and // drop its pending read index request. Peer 2 cannot get the result now. @@ -805,7 +805,7 @@ fn test_read_index_lock_checking_on_false_leader() { ); cluster.sim.wl().add_recv_filter(2, recv_filter); cluster.clear_send_filters(); - assert!(resp.recv_timeout(Duration::from_millis(300)).is_err()); + resp.recv_timeout(Duration::from_millis(300)).unwrap_err(); // After cleaning all filters, peer 2 will retry and will get error. cluster.sim.wl().clear_recv_filters(2); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index aab1fe3d879..bf23267a06a 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -131,7 +131,7 @@ fn test_split_lost_request_vote() { assert_eq!(range.1, b"k2"); // Make sure the message has sent to peer 3. 
- let _sent = after_sent_rx + after_sent_rx .recv_timeout(Duration::from_millis(100)) .unwrap(); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 17e9957d947..7502fe6be4e 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -324,7 +324,7 @@ fn test_scale_scheduler_pool() { scale_pool(1); fail::cfg(snapshot_fp, "1*pause").unwrap(); // propose one prewrite to block the only worker - assert!(do_prewrite(b"k1", b"v1").is_err()); + do_prewrite(b"k1", b"v1").unwrap_err(); scale_pool(2); diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index c9f7a70ee09..de19d1a790c 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -323,10 +323,8 @@ fn test_max_commit_ts_error() { ) .unwrap(); thread::sleep(Duration::from_millis(200)); - assert!( - cm.read_key_check(&Key::from_raw(b"k1"), |_| Err(())) - .is_err() - ); + cm.read_key_check(&Key::from_raw(b"k1"), |_| Err(())) + .unwrap_err(); cm.update_max_ts(200.into()); let res = prewrite_rx.recv().unwrap().unwrap(); diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 9ad2816d3d3..556549b8141 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -154,7 +154,7 @@ fn test_delete_lock_proposed_after_proposing_locks_impl(transfer_msg_count: usiz thread::spawn(move || tx.send(client.kv_cleanup(&req).unwrap()).unwrap()); thread::sleep(Duration::from_millis(200)); - assert!(resp_rx.try_recv().is_err()); + resp_rx.try_recv().unwrap_err(); for _ in 0..transfer_msg_count { cluster.transfer_leader(1, new_peer(2, 2)); @@ -231,7 +231,7 @@ fn test_delete_lock_proposed_before_proposing_locks() { thread::spawn(move || tx.send(client.kv_cleanup(&req).unwrap()).unwrap()); thread::sleep(Duration::from_millis(200)); - 
assert!(resp_rx.try_recv().is_err()); + resp_rx.try_recv().unwrap_err(); cluster.transfer_leader(1, new_peer(2, 2)); thread::sleep(Duration::from_millis(200)); @@ -318,7 +318,7 @@ fn test_read_lock_after_become_follower() { thread::spawn(move || tx.send(client.kv_prewrite(&req).unwrap()).unwrap()); thread::sleep(Duration::from_millis(200)); - assert!(resp_rx.try_recv().is_err()); + resp_rx.try_recv().unwrap_err(); // And pause applying the write on the leader. fail::cfg("on_apply_write_cmd", "pause").unwrap(); diff --git a/tests/failpoints/cases/test_ttl.rs b/tests/failpoints/cases/test_ttl.rs index 4748b1d0bbf..25ffcf6ff4c 100644 --- a/tests/failpoints/cases/test_ttl.rs +++ b/tests/failpoints/cases/test_ttl.rs @@ -87,14 +87,14 @@ fn test_ttl_checker_impl() { assert!(kvdb.get_value_cf(CF_DEFAULT, key4).unwrap().is_some()); assert!(kvdb.get_value_cf(CF_DEFAULT, key5).unwrap().is_some()); - let _ = check_ttl_and_compact_files(&kvdb, b"zr\0key1", b"zr\0key25", false); + check_ttl_and_compact_files(&kvdb, b"zr\0key1", b"zr\0key25", false); assert!(kvdb.get_value_cf(CF_DEFAULT, key1).unwrap().is_none()); assert!(kvdb.get_value_cf(CF_DEFAULT, key2).unwrap().is_some()); assert!(kvdb.get_value_cf(CF_DEFAULT, key3).unwrap().is_none()); assert!(kvdb.get_value_cf(CF_DEFAULT, key4).unwrap().is_some()); assert!(kvdb.get_value_cf(CF_DEFAULT, key5).unwrap().is_some()); - let _ = check_ttl_and_compact_files(&kvdb, b"zr\0key2", b"zr\0key6", false); + check_ttl_and_compact_files(&kvdb, b"zr\0key2", b"zr\0key6", false); assert!(kvdb.get_value_cf(CF_DEFAULT, key1).unwrap().is_none()); assert!(kvdb.get_value_cf(CF_DEFAULT, key2).unwrap().is_some()); assert!(kvdb.get_value_cf(CF_DEFAULT, key3).unwrap().is_none()); diff --git a/tests/failpoints/cases/test_unsafe_recovery.rs b/tests/failpoints/cases/test_unsafe_recovery.rs index c70ac41d902..20bb666ff3e 100644 --- a/tests/failpoints/cases/test_unsafe_recovery.rs +++ b/tests/failpoints/cases/test_unsafe_recovery.rs @@ -114,11 +114,9 @@ fn 
test_unsafe_recovery_execution_result_report() { true, ); // marjority is lost, can't propose command successfully. - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); } cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); @@ -303,11 +301,9 @@ fn test_unsafe_recovery_demotion_reentrancy() { true, ); // marjority is lost, can't propose command successfully. - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); } cluster.must_enter_force_leader(region.get_id(), nodes[0], vec![nodes[1], nodes[2]]); @@ -408,11 +404,9 @@ fn test_unsafe_recovery_create_destroy_reentrancy() { true, ); // marjority is lost, can't propose command successfully. - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); } cluster.must_enter_force_leader(region2.get_id(), nodes[0], vec![nodes[1], nodes[2]]); diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index 19e97058616..e3603d8cbab 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -17,7 +17,7 @@ fn test_gc_config_validate() { let mut invalid_cfg = GcConfig::default(); invalid_cfg.batch_keys = 0; - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); } fn setup_cfg_controller( diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index 78824d6ee95..b7496de182d 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -24,7 +24,7 @@ fn 
test_config_validate() { let mut invalid_cfg = Config::default(); invalid_cfg.wait_for_lock_timeout = ReadableDuration::millis(0); - assert!(invalid_cfg.validate().is_err()); + invalid_cfg.validate().unwrap_err(); } #[derive(Clone)] diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index bae6262aeb4..d1b34a3a498 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -191,7 +191,7 @@ fn test_update_raftstore_config() { ]; for cfg in invalid_cfgs { let change = new_changes(vec![cfg]); - assert!(cfg_controller.update(change).is_err()); + cfg_controller.update(change).unwrap_err(); // update failed, original config should not be changed. validate_store_cfg(&raft_store); diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index fa45d08b24a..4ceb5d3affc 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -33,23 +33,23 @@ fn test_update_config() { // update not support config let res = cfg_controller.update(change("server.addr", "localhost:3000")); - assert!(res.is_err()); + res.unwrap_err(); assert_eq!(cfg_controller.get_current(), cfg); // update to invalid config let res = cfg_controller.update(change("raftstore.raft-log-gc-threshold", "0")); - assert!(res.is_err()); + res.unwrap_err(); assert_eq!(cfg_controller.get_current(), cfg); // bad update request let res = cfg_controller.update(change("xxx.yyy", "0")); - assert!(res.is_err()); + res.unwrap_err(); let res = cfg_controller.update(change("raftstore.xxx", "0")); - assert!(res.is_err()); + res.unwrap_err(); let res = cfg_controller.update(change("raftstore.raft-log-gc-threshold", "10MB")); - assert!(res.is_err()); + res.unwrap_err(); let res = cfg_controller.update(change("raft-log-gc-threshold", "10MB")); - assert!(res.is_err()); + res.unwrap_err(); 
assert_eq!(cfg_controller.get_current(), cfg); } diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index a6ac43235f3..57566b91e75 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -509,7 +509,7 @@ fn test_pd_client_heartbeat_send_failed() { assert!(rsp.is_ok()); assert_eq!(rsp.unwrap().get_region_id(), 1); } else { - assert!(rsp.is_err()); + rsp.unwrap_err(); } let region = block_on(client.get_region_by_id(1)); @@ -519,7 +519,7 @@ fn test_pd_client_heartbeat_send_failed() { assert!(r.is_some()); assert_eq!(1, r.unwrap().get_id()); } else { - assert!(region.is_err()); + region.unwrap_err(); } }; // send fail if network is block. diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index e74f0979241..1caf4e31ea3 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -181,7 +181,7 @@ fn test_node_switch_api_version() { cluster.shutdown(); } else { // Should not be able to switch to `to_api`. - assert!(cluster.start().is_err()); + cluster.start().unwrap_err(); } } } diff --git a/tests/integrations/raftstore/test_early_apply.rs b/tests/integrations/raftstore/test_early_apply.rs index a88032671a3..b30a861e2fe 100644 --- a/tests/integrations/raftstore/test_early_apply.rs +++ b/tests/integrations/raftstore/test_early_apply.rs @@ -28,7 +28,7 @@ fn delete_old_data(engine: &E, id: u64) { } /// Allow lost situation. -#[derive(PartialEq, Eq, Clone, Copy)] +#[derive(PartialEq, Clone, Copy)] enum DataLost { /// The leader loses commit index. 
/// diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index ae04c0d12f2..4b69bd4129e 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -302,8 +302,8 @@ fn test_batch_id_in_lease(cluster: &mut Cluster) { let (split_key1, split_key2) = (b"k22", b"k44"); let keys = vec![b"k11", b"k33", b"k55"]; - let _ = keys.iter().map(|key| { - cluster.must_put(*key, b"v1"); + let _ = keys.iter().map(|&key| { + cluster.must_put(key, b"v1"); }); let region = pd_client.get_region(keys[0]).unwrap(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index d378c55c5e6..f44b2f99642 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1636,9 +1636,9 @@ fn test_prepare_merge_with_5_nodes_snapshot() { pd_client.add_peer(left.get_id(), new_peer(5, 16)); // Make sure there will be no admin entries after min_matched. - for (k, v) in &[(b"k11", b"v11"), (b"k12", b"v12")] { - cluster.must_put(*k, *v); - must_get_equal(&cluster.get_engine(4), *k, *v); + for (k, v) in [(b"k11", b"v11"), (b"k12", b"v12")] { + cluster.must_put(k, v); + must_get_equal(&cluster.get_engine(4), k, v); } cluster.add_send_filter(IsolationFilterFactory::new(4)); // So index of peer 4 becomes min_matched. diff --git a/tests/integrations/raftstore/test_replica_read.rs b/tests/integrations/raftstore/test_replica_read.rs index a2ae4ab0f31..6deccad3a5e 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -101,7 +101,7 @@ fn test_replica_read_not_applied() { // Read index on follower should be blocked instead of get an old value. 
let resp1_ch = async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k1", true, true); - assert!(resp1_ch.recv_timeout(Duration::from_secs(1)).is_err()); + resp1_ch.recv_timeout(Duration::from_secs(1)).unwrap_err(); // Unpark all append responses so that the new leader can commit its first // entry. @@ -151,7 +151,7 @@ fn test_replica_read_on_hibernate() { // Read index on follower should be blocked. let resp1_ch = async_read_on_peer(&mut cluster, new_peer(1, 1), r1, b"k1", true, true); - assert!(resp1_ch.recv_timeout(Duration::from_secs(1)).is_err()); + resp1_ch.recv_timeout(Duration::from_secs(1)).unwrap_err(); let (tx, rx) = mpsc::sync_channel(1024); let cb = Arc::new(move |msg: &RaftMessage| { @@ -278,7 +278,9 @@ fn test_replica_read_on_stale_peer() { cluster.must_put(b"k2", b"v2"); let resp1_ch = async_read_on_peer(&mut cluster, peer_on_store3, region, b"k2", true, true); // must be timeout - assert!(resp1_ch.recv_timeout(Duration::from_micros(100)).is_err()); + resp1_ch + .recv_timeout(Duration::from_micros(100)) + .unwrap_err(); } #[test] @@ -311,7 +313,7 @@ fn test_read_index_out_of_order() { // Can't get read resonse because heartbeat responses are blocked. 
let r1 = cluster.get_region(b"k1"); let resp1 = async_read_on_peer(&mut cluster, new_peer(1, 1), r1.clone(), b"k1", true, true); - assert!(resp1.recv_timeout(Duration::from_secs(2)).is_err()); + resp1.recv_timeout(Duration::from_secs(2)).unwrap_err(); pd_client.must_remove_peer(rid, new_peer(2, 2)); @@ -353,8 +355,8 @@ fn test_read_index_retry_lock_checking() { let r1 = cluster.get_region(b"k1"); let resp1 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1.clone(), b"k1", true); let resp2 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k2", true); - assert!(resp1.recv_timeout(Duration::from_secs(2)).is_err()); - assert!(resp2.try_recv().is_err()); + resp1.recv_timeout(Duration::from_secs(2)).unwrap_err(); + resp2.try_recv().unwrap_err(); // k1 has a memory lock let leader_cm = cluster.sim.rl().get_concurrency_manager(1); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index a7664e8ccf0..6ac72f668db 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -689,7 +689,7 @@ fn test_split_epoch_not_match(cluster: &mut Cluster, right_deri cluster.must_split(&r, b"k4"); let regions: Vec<_> = [b"k0", b"k2", b"k3", b"k4"] .iter() - .map(|k| pd_client.get_region(*k).unwrap()) + .map(|&k| pd_client.get_region(k).unwrap()) .collect(); let new = regions[3].clone(); diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index cf2361ebc8e..a9cd40d2fff 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -19,11 +19,9 @@ fn confirm_quorum_is_lost(cluster: &mut Cluster, region: &metap true, ); // marjority is lost, can't propose command successfully. 
- assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); } #[test] @@ -854,11 +852,9 @@ fn test_force_leader_with_uncommitted_conf_change() { find_peer(®ion, 2).unwrap().clone(), ); let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); // wait election timeout std::thread::sleep(Duration::from_millis( @@ -973,11 +969,9 @@ fn test_force_leader_on_wrong_leader() { find_peer(®ion, 3).unwrap().clone(), ); let req = new_admin_request(region.get_id(), region.get_region_epoch(), cmd); - assert!( - cluster - .call_command_on_leader(req, Duration::from_millis(10)) - .is_err() - ); + cluster + .call_command_on_leader(req, Duration::from_millis(10)) + .unwrap_err(); cluster.exit_force_leader(region.get_id(), 2); // peer on node2 still doesn't have the latest committed log. 
diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 366de3c0493..17b1e49f2e0 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1936,7 +1936,7 @@ fn test_txn_api_version() { // Pessimistic Lock ts += 1; let lock_ts = ts; - let _resp = must_kv_pessimistic_lock(&client, ctx.clone(), k.clone(), lock_ts); + must_kv_pessimistic_lock(&client, ctx.clone(), k.clone(), lock_ts); // Prewrite Pessimistic let mut mutation = Mutation::default(); diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index c3964ab39d8..7ee38a72c87 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -423,7 +423,7 @@ fn test_store_allowlist() { for _ in 0..3 { let mut raft_m = RaftMessage::default(); raft_m.mut_to_peer().set_store_id(1); - assert!(raft_client.send(raft_m).is_err()); + raft_client.send(raft_m).unwrap_err(); } for _ in 0..5 { let mut raft_m = RaftMessage::default(); diff --git a/tests/integrations/server/security.rs b/tests/integrations/server/security.rs index 71a0979a005..a0d7d53186d 100644 --- a/tests/integrations/server/security.rs +++ b/tests/integrations/server/security.rs @@ -44,5 +44,5 @@ fn test_check_cn_fail() { let client = TikvClient::new(channel); let status = client.kv_get(&GetRequest::default()); - assert!(status.is_err()); + status.unwrap_err(); } diff --git a/tests/integrations/storage/test_raft_storage.rs b/tests/integrations/storage/test_raft_storage.rs index ef1ee5402e6..98e60386884 100644 --- a/tests/integrations/storage/test_raft_storage.rs +++ b/tests/integrations/storage/test_raft_storage.rs @@ -56,10 +56,14 @@ fn test_raft_storage() { // Test wrong region id. 
let region_id = ctx.get_region_id(); ctx.set_region_id(region_id + 1); - assert!(storage.get(ctx.clone(), &key, 20).is_err()); - assert!(storage.batch_get(ctx.clone(), &[key.clone()], 20).is_err()); - assert!(storage.scan(ctx.clone(), key, None, 1, false, 20).is_err()); - assert!(storage.scan_locks(ctx, 20, None, None, 100).is_err()); + storage.get(ctx.clone(), &key, 20).unwrap_err(); + storage + .batch_get(ctx.clone(), &[key.clone()], 20) + .unwrap_err(); + storage + .scan(ctx.clone(), key, None, 1, false, 20) + .unwrap_err(); + storage.scan_locks(ctx, 20, None, None, 100).unwrap_err(); } #[test] @@ -147,7 +151,7 @@ fn test_raft_storage_store_not_match() { peer.set_store_id(store_id + 1); ctx.set_peer(peer); - assert!(storage.get(ctx.clone(), &key, 20).is_err()); + storage.get(ctx.clone(), &key, 20).unwrap_err(); let res = storage.get(ctx.clone(), &key, 20); if let StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Engine(KvError( box KvErrorInner::Request(ref e), @@ -157,9 +161,13 @@ fn test_raft_storage_store_not_match() { } else { panic!("expect store_not_match, but got {:?}", res); } - assert!(storage.batch_get(ctx.clone(), &[key.clone()], 20).is_err()); - assert!(storage.scan(ctx.clone(), key, None, 1, false, 20).is_err()); - assert!(storage.scan_locks(ctx, 20, None, None, 100).is_err()); + storage + .batch_get(ctx.clone(), &[key.clone()], 20) + .unwrap_err(); + storage + .scan(ctx.clone(), key, None, 1, false, 20) + .unwrap_err(); + storage.scan_locks(ctx, 20, None, None, 100).unwrap_err(); } #[test] @@ -350,8 +358,8 @@ fn test_auto_gc() { let split_keys: &[&[u8]] = &[b"k2", b"k4", b"k6", b"k8"]; for k in split_keys { - let region = cluster.get_region(*k); - cluster.must_split(®ion, *k); + let region = cluster.get_region(k); + cluster.must_split(®ion, k); } check_data(&mut cluster, &storages, &test_data, 50, true); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 
4f48cb72920..f99d9348616 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -530,5 +530,5 @@ fn wrong_context(ctx: &Context, engine: &E) { let region_id = ctx.get_region_id(); let mut ctx = ctx.to_owned(); ctx.set_region_id(region_id + 1); - assert!(engine.write(&ctx, WriteData::default()).is_err()); + engine.write(&ctx, WriteData::default()).unwrap_err(); } diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 7b1aab71183..b0c95eb9f7a 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -83,7 +83,7 @@ fn test_turnoff_titan() { // try reopen db when titan isn't properly turned off. configure_for_disable_titan(&mut cluster); - assert!(cluster.pre_start_check().is_err()); + cluster.pre_start_check().unwrap_err(); configure_for_enable_titan(&mut cluster, ReadableSize::kb(0)); cluster.pre_start_check().unwrap(); From dcb5e2ccd2582fc2f8d3425c27f9891368658154 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 4 Aug 2022 12:24:06 +0800 Subject: [PATCH 133/676] tikv_kv: support tablet level snapshot acquisition (#13039) close tikv/tikv#13050 Signed-off-by: SpadeA-Tang --- Cargo.lock | 7 + components/raftstore-v2/Cargo.toml | 6 + components/raftstore-v2/src/fsm/mod.rs | 2 +- components/raftstore-v2/src/fsm/store.rs | 30 +- components/raftstore-v2/src/operation/mod.rs | 2 + .../raftstore-v2/src/operation/read/mod.rs | 3 + .../raftstore-v2/src/operation/read/read.rs | 256 ++++++++++ components/raftstore-v2/src/tablet.rs | 3 +- components/raftstore/src/router.rs | 23 +- components/raftstore/src/store/fsm/store.rs | 1 - components/raftstore/src/store/mod.rs | 11 +- components/raftstore/src/store/peer.rs | 12 +- components/raftstore/src/store/worker/mod.rs | 5 +- components/raftstore/src/store/worker/read.rs | 460 +++++++++++++++--- components/server/src/server.rs | 13 +- 
components/test_raftstore/src/node.rs | 6 +- components/test_raftstore/src/server.rs | 8 +- components/tikv_kv/Cargo.toml | 1 + src/server/engine_factory.rs | 5 +- src/server/engine_factory_v2.rs | 13 +- 20 files changed, 760 insertions(+), 107 deletions(-) create mode 100644 components/raftstore-v2/src/operation/read/mod.rs create mode 100644 components/raftstore-v2/src/operation/read/read.rs diff --git a/Cargo.lock b/Cargo.lock index c5d22fc6e61..52b39154e91 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4183,12 +4183,15 @@ dependencies = [ "batch-system", "collections", "crossbeam", + "engine_rocks", "engine_test", "engine_traits", "error_code", "fail", "futures-util", + "keys", "kvproto", + "log_wrappers", "pd_client", "raft", "raft-proto", @@ -4199,7 +4202,10 @@ dependencies = [ "tempfile", "test_pd", "test_util", + "tikv_kv", "tikv_util", + "time", + "txn_types", ] [[package]] @@ -6217,6 +6223,7 @@ dependencies = [ "backtrace", "engine_panic", "engine_rocks", + "engine_test", "engine_traits", "error_code", "fail", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 5cdd2ee747f..f6a827d7424 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -32,18 +32,24 @@ cloud-azure = ["raftstore/cloud-azure"] batch-system = { path = "../batch-system", default-features = false } collections = { path = "../collections" } crossbeam = "0.8" +engine_rocks = { path = "../engine_rocks", default-features = false } engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } fail = "0.5" futures-util = { version = "0.3", features = ["compat"] } +keys = { path = "../keys", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } +log_wrappers = { path = "../log_wrappers" } pd_client = { path = "../pd_client" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { path = 
"../raftstore" } slog = "2.3" smallvec = "1.4" +tikv_kv = { path = "../tikv_kv", default-features = false } tikv_util = { path = "../tikv_util", default-features = false } +time = "0.1" +txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] engine_test = { path = "../engine_test", default-features = false } diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 02f788d3be2..8126c8a868a 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -11,4 +11,4 @@ mod store; pub use apply::{ApplyFsm, ApplyFsmDelegate}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; -pub use store::{StoreFsm, StoreFsmDelegate}; +pub use store::{StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 257028f1630..886478a3036 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,12 +1,38 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use batch_system::Fsm; +use collections::HashMap; use crossbeam::channel::TryRecvError; +use engine_traits::KvEngine; use kvproto::metapb::Store; -use raftstore::store::Config; +use raftstore::store::{Config, ReadDelegate}; use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; -use crate::{batch::StoreContext, StoreMsg}; +use crate::{batch::StoreContext, tablet::CachedTablet, StoreMsg}; + +pub struct StoreMeta +where + E: KvEngine, +{ + pub store_id: Option, + /// region_id -> reader + pub readers: HashMap, + /// region_id -> tablet cache + pub tablet_caches: HashMap>, +} + +impl StoreMeta +where + E: KvEngine, +{ + pub fn new() -> StoreMeta { + StoreMeta { + store_id: None, + readers: HashMap::default(), + tablet_caches: HashMap::default(), + } + } +} pub struct StoreFsm { store: Store, diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index bb3db8c75d3..8c427378da3 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -1 +1,3 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod read; diff --git a/components/raftstore-v2/src/operation/read/mod.rs b/components/raftstore-v2/src/operation/read/mod.rs new file mode 100644 index 00000000000..8c427378da3 --- /dev/null +++ b/components/raftstore-v2/src/operation/read/mod.rs @@ -0,0 +1,3 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod read; diff --git a/components/raftstore-v2/src/operation/read/read.rs b/components/raftstore-v2/src/operation/read/read.rs new file mode 100644 index 00000000000..63878beeb22 --- /dev/null +++ b/components/raftstore-v2/src/operation/read/read.rs @@ -0,0 +1,256 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// #[PerformanceCriticalPath] +use std::{ + cell::Cell, + collections::HashMap, + fmt::{self, Display, Formatter}, + marker::PhantomData, + ops::Deref, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use crossbeam::{atomic::AtomicCell, channel::TrySendError}; +use engine_traits::{KvEngine, RaftEngine, Snapshot, TabletFactory}; +use fail::fail_point; +use kvproto::{ + errorpb, + kvrpcpb::ExtraOp as TxnExtraOp, + metapb, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, ReadIndexResponse, Request, Response}, +}; +use pd_client::BucketMeta; +use raftstore::{ + errors::RAFTSTORE_IS_BUSY, + store::{ + cmd_resp, + util::{self, LeaseState, RegionReadProgress, RemoteLease}, + ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadMetrics, ReadProgress, ReadResponse, + RegionSnapshot, RequestInspector, RequestPolicy, TrackVer, TxnExt, + }, + Error, Result, +}; +use slog::{debug, error, info, o, warn, Logger}; +use tikv_util::{ + codec::number::decode_u64, + lru::LruCache, + time::{monotonic_raw_now, Instant, ThreadReadId}, +}; +use time::Timespec; + +use crate::{fsm::StoreMeta, tablet::CachedTablet}; + +/// CachedReadDelegate is a wrapper the ReadDelegate and CachedTablet. +/// CachedTablet can fetch the latest tablet of this ReadDelegate's region. The +/// main purpose of this wrapping is to implement ReadExecutor where the latest +/// tablet is needed. 
+pub struct CachedReadDelegate +where + E: KvEngine, +{ + // The reason for this to be Arc, see the comment on get_delegate in + // raftstore/src/store/worker/read.rs + delegate: Arc, + cached_tablet: CachedTablet, +} + +impl Deref for CachedReadDelegate +where + E: KvEngine, +{ + type Target = ReadDelegate; + + fn deref(&self) -> &Self::Target { + self.delegate.as_ref() + } +} + +impl Clone for CachedReadDelegate +where + E: KvEngine, +{ + fn clone(&self) -> Self { + CachedReadDelegate { + delegate: Arc::clone(&self.delegate), + cached_tablet: self.cached_tablet.clone(), + } + } +} + +impl ReadExecutor for CachedReadDelegate +where + E: KvEngine, +{ + fn get_tablet(&mut self) -> &E { + self.cached_tablet.latest().unwrap() + } + + fn get_snapshot( + &mut self, + _: Option, + _: &mut Option>, + ) -> Arc { + Arc::new(self.cached_tablet.latest().unwrap().snapshot()) + } +} + +#[derive(Clone)] +struct StoreMetaDelegate +where + E: KvEngine, +{ + store_meta: Arc>>, +} + +impl StoreMetaDelegate +where + E: KvEngine, +{ + pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta } + } +} + +impl ReadExecutorProvider for StoreMetaDelegate +where + E: KvEngine, +{ + type Executor = CachedReadDelegate; + + fn store_id(&self) -> Option { + self.store_meta.as_ref().lock().unwrap().store_id + } + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { + let meta = self.store_meta.as_ref().lock().unwrap(); + let reader = meta.readers.get(®ion_id).cloned(); + if let Some(reader) = reader { + // If reader is not None, cache must not be None. 
+ let cached_tablet = meta.tablet_caches.get(®ion_id).cloned().unwrap(); + return ( + meta.readers.len(), + Some(CachedReadDelegate { + delegate: Arc::new(reader), + cached_tablet, + }), + ); + } + (meta.readers.len(), None) + } +} + +#[cfg(test)] +mod tests { + use std::{borrow::Borrow, sync::mpsc::*, thread}; + + use crossbeam::channel::TrySendError; + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, KvTestSnapshot, TestTabletFactoryV2}, + }; + use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; + use kvproto::{metapb::Region, raft_cmdpb::*}; + use raftstore::store::{ + util::Lease, Callback, CasualMessage, CasualRouter, LocalReader, ProposalRouter, + RaftCommand, + }; + use tempfile::{Builder, TempDir}; + use tikv_kv::Snapshot; + use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; + use time::Duration; + use txn_types::{Key, Lock, LockType, WriteBatchFlags}; + + use super::*; + + fn new_read_delegate( + region: &Region, + peer_id: u64, + term: u64, + applied_index_term: u64, + ) -> ReadDelegate { + let mut read_delegate_core = ReadDelegate::mock(region.id); + read_delegate_core.peer_id = peer_id; + read_delegate_core.term = term; + read_delegate_core.applied_term = applied_index_term; + read_delegate_core.region = Arc::new(region.clone()); + read_delegate_core + } + + #[test] + fn test_read_delegate() { + // Building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let path = Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path().to_str().unwrap(), + ops, + cf_opts, + )); + + let store_meta = + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::::new()))); + + let tablet1; + let tablet2; + { + let mut meta = store_meta.store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let mut read_delegate = 
ReadDelegate::mock(1); + meta.readers.insert(1, read_delegate); + + // create tablet with region_id 1 and prepare some data + tablet1 = factory.create_tablet(1, 10).unwrap(); + tablet1.put_cf(CF_DEFAULT, b"a1", b"val1").unwrap(); + let cache = CachedTablet::new(Some(tablet1.clone())); + meta.tablet_caches.insert(1, cache); + + // Create read_delegate with region id 1 + let mut read_delegate = ReadDelegate::mock(2); + let cache = CachedTablet::new(Some(read_delegate.clone())); + meta.readers.insert(2, read_delegate); + + // create tablet with region_id 1 and prepare some data + tablet2 = factory.create_tablet(2, 10).unwrap(); + tablet2.put_cf(CF_DEFAULT, b"a2", b"val2").unwrap(); + let cache = CachedTablet::new(Some(tablet2.clone())); + meta.tablet_caches.insert(2, cache); + } + + let (_, delegate) = store_meta.get_executor_and_len(1); + let mut delegate = delegate.unwrap(); + let tablet = delegate.get_tablet(); + assert_eq!(tablet1.as_inner().path(), tablet.as_inner().path()); + let snapshot = delegate.get_snapshot(None, &mut None); + assert_eq!( + b"val1".to_vec(), + snapshot + .get(&Key::from_encoded(b"a1".to_vec())) + .unwrap() + .unwrap() + ); + + let (_, delegate) = store_meta.get_executor_and_len(2); + let mut delegate = delegate.unwrap(); + let tablet = delegate.get_tablet(); + assert_eq!(tablet2.as_inner().path(), tablet.as_inner().path()); + let snapshot = delegate.get_snapshot(None, &mut None); + assert_eq!( + b"val2".to_vec(), + snapshot + .get(&Key::from_encoded(b"a2".to_vec())) + .unwrap() + .unwrap() + ); + } +} diff --git a/components/raftstore-v2/src/tablet.rs b/components/raftstore-v2/src/tablet.rs index 8552b1a1f0f..7765f5c07b6 100644 --- a/components/raftstore-v2/src/tablet.rs +++ b/components/raftstore-v2/src/tablet.rs @@ -5,6 +5,7 @@ use std::sync::{ Arc, Mutex, }; +#[derive(Debug)] struct LatestTablet { data: Mutex>, version: AtomicU64, @@ -13,7 +14,7 @@ struct LatestTablet { /// Tablet may change during split, merge and applying snapshot. 
So we need a /// shared value to reflect the latest tablet. `CachedTablet` provide cache that /// can speed up common access. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct CachedTablet { latest: Arc>, cache: Option, diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 72d2bf8ca2b..400fee65813 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -12,9 +12,9 @@ use tikv_util::time::ThreadReadId; use crate::{ store::{ fsm::RaftRouter, - transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter}, - Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, RaftCommand, - SignificantMsg, StoreMsg, + transport::{CasualRouter, ProposalRouter, SignificantRouter}, + CachedReadDelegate, Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, + RaftCommand, SignificantMsg, StoreMetaDelegate, StoreMsg, StoreRouter, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -168,12 +168,21 @@ where } /// A router that routes messages to the raftstore -pub struct ServerRaftStoreRouter { +pub struct ServerRaftStoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ router: RaftRouter, - local_reader: RefCell, EK>>, + local_reader: + RefCell, EK, CachedReadDelegate, StoreMetaDelegate>>, } -impl Clone for ServerRaftStoreRouter { +impl Clone for ServerRaftStoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ fn clone(&self) -> Self { ServerRaftStoreRouter { router: self.router.clone(), @@ -186,7 +195,7 @@ impl ServerRaftStoreRouter { /// Creates a new router. 
pub fn new( router: RaftRouter, - reader: LocalReader, EK>, + reader: LocalReader, EK, CachedReadDelegate, StoreMetaDelegate>, ) -> ServerRaftStoreRouter { let local_reader = RefCell::new(reader); ServerRaftStoreRouter { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 9e126d4d141..ecdb8653147 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -115,7 +115,6 @@ pub struct StoreInfo { } pub struct StoreMeta { - /// store id pub store_id: Option, /// region_end_key -> region_id pub region_ranges: BTreeMap, u64>, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index d75fef94323..d47cc892033 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -67,10 +67,11 @@ pub use self::{ txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - AutoSplitController, Bucket, BucketRange, CheckLeaderRunner, CheckLeaderTask, - FlowStatistics, FlowStatsReporter, KeyEntry, LocalReader, PdTask, QueryStats, - RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, ReadStats, RefreshConfigTask, - RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, TrackVer, - WriteStats, + AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, + CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, + LocalReader, PdTask, QueryStats, RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadMetrics, ReadProgress, ReadStats, + RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, + SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, }, }; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 7c57eeb9ae4..99287ca493c 100644 --- 
a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -79,7 +79,7 @@ use super::{ self, check_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, - DestroyPeerJob, + DestroyPeerJob, LocalReadContext, }; use crate::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason, RoleChange}, @@ -4653,7 +4653,7 @@ where } } - let mut resp = ctx.execute(&req, &Arc::new(region), read_index, None); + let mut resp = ctx.execute(&req, &Arc::new(region), read_index, None, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); @@ -5485,11 +5485,15 @@ where EK: KvEngine, ER: RaftEngine, { - fn get_engine(&self) -> &EK { + fn get_tablet(&mut self) -> &EK { &self.engines.kv } - fn get_snapshot(&mut self, _: Option) -> Arc { + fn get_snapshot( + &mut self, + _: Option, + _: &mut Option>, + ) -> Arc { Arc::new(self.engines.kv.snapshot()) } } diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 583e9341f0d..1651183f976 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -32,7 +32,10 @@ pub use self::{ query_stats::QueryStats, raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, - read::{LocalReader, Progress as ReadProgress, ReadDelegate, ReadExecutor, TrackVer}, + read::{ + CachedReadDelegate, LocalReadContext, LocalReader, Progress as ReadProgress, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadMetrics, StoreMetaDelegate, TrackVer, + }, refresh_config::{ BatchComponent as RaftStoreBatchComponent, Runner as RefreshConfigRunner, Task as RefreshConfigTask, diff --git a/components/raftstore/src/store/worker/read.rs 
b/components/raftstore/src/store/worker/read.rs index b7724789d4b..f3d52be5044 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -4,6 +4,7 @@ use std::{ cell::Cell, fmt::{self, Display, Formatter}, + ops::Deref, sync::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, @@ -42,16 +43,21 @@ use crate::{ Error, Result, }; +/// #[RaftstoreCommon] pub trait ReadExecutor { - fn get_engine(&self) -> &E; - fn get_snapshot(&mut self, ts: Option) -> Arc; + fn get_tablet(&mut self) -> &E; + fn get_snapshot( + &mut self, + ts: Option, + read_context: &mut Option>, + ) -> Arc; - fn get_value(&self, req: &Request, region: &metapb::Region) -> Result { + fn get_value(&mut self, req: &Request, region: &metapb::Region) -> Result { let key = req.get_get().get_key(); // region key range has no data prefix, so we must use origin key to check. util::check_key_in_region(key, region)?; - let engine = self.get_engine(); + let engine = self.get_tablet(); let mut resp = Response::default(); let res = if !req.get_get().get_cf().is_empty() { let cf = req.get_get().get_cf(); @@ -89,6 +95,7 @@ pub trait ReadExecutor { region: &Arc, read_index: Option, mut ts: Option, + mut read_context: Option>, ) -> ReadResponse { let requests = msg.get_requests(); let mut response = ReadResponse { @@ -112,8 +119,10 @@ pub trait ReadExecutor { } }, CmdType::Snap => { - let snapshot = - RegionSnapshot::from_snapshot(self.get_snapshot(ts.take()), region.clone()); + let snapshot = RegionSnapshot::from_snapshot( + self.get_snapshot(ts.take(), &mut read_context), + region.clone(), + ); response.snapshot = Some(snapshot); Response::default() } @@ -143,7 +152,7 @@ pub trait ReadExecutor { } } -/// A read only delegate of `Peer`. +/// #[RaftstoreCommon]: A read only delegate of `Peer`. 
#[derive(Clone, Debug)] pub struct ReadDelegate { pub region: Arc, @@ -165,6 +174,50 @@ pub struct ReadDelegate { pub track_ver: TrackVer, } +/// CachedReadDelegate is a wrapper the ReadDelegate and kv_engine. LocalReader +/// dispatch local read requests to ReadDeleage according to the region_id where +/// ReadDelegate needs kv_engine to read data or fetch snapshot. +pub struct CachedReadDelegate +where + E: KvEngine, +{ + delegate: Arc, + kv_engine: E, +} + +impl Deref for CachedReadDelegate +where + E: KvEngine, +{ + type Target = ReadDelegate; + + fn deref(&self) -> &Self::Target { + self.delegate.as_ref() + } +} + +impl Clone for CachedReadDelegate +where + E: KvEngine, +{ + fn clone(&self) -> Self { + CachedReadDelegate { + delegate: Arc::clone(&self.delegate), + kv_engine: self.kv_engine.clone(), + } + } +} + +/// #[RaftstoreCommon]: LocalReadContext combines some LocalReader's fields for temporary usage. +pub struct LocalReadContext<'a, E> +where + E: KvEngine, +{ + metrics: &'a mut ReadMetrics, + read_id: &'a mut ThreadReadId, + snap_cache: &'a mut Box>>, +} + impl Drop for ReadDelegate { fn drop(&mut self) { // call `inc` to notify the source `ReadDelegate` is dropped @@ -172,6 +225,70 @@ impl Drop for ReadDelegate { } } +/// #[RaftstoreCommon] +pub trait ReadExecutorProvider: Send + Clone + 'static +where + E: KvEngine, +{ + type Executor: ReadExecutor; + + fn store_id(&self) -> Option; + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option); +} + +#[derive(Clone)] +pub struct StoreMetaDelegate +where + E: KvEngine, +{ + store_meta: Arc>, + kv_engine: E, +} + +impl StoreMetaDelegate +where + E: KvEngine, +{ + pub fn new(store_meta: Arc>, kv_engine: E) -> Self { + StoreMetaDelegate { + store_meta, + kv_engine, + } + } +} + +impl ReadExecutorProvider for StoreMetaDelegate +where + E: KvEngine, +{ + type Executor = CachedReadDelegate; + + fn 
store_id(&self) -> Option { + self.store_meta.as_ref().lock().unwrap().store_id + } + + /// get the ReadDelegate with region_id and the number of delegates in the + /// StoreMeta + fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { + let meta = self.store_meta.as_ref().lock().unwrap(); + let reader = meta.readers.get(®ion_id).cloned(); + if let Some(reader) = reader { + return ( + meta.readers.len(), + Some(CachedReadDelegate { + delegate: Arc::new(reader), + kv_engine: self.kv_engine.clone(), + }), + ); + } + (meta.readers.len(), None) + } +} + +/// #[RaftstoreCommon] #[derive(Debug)] pub struct TrackVer { version: Arc, @@ -193,14 +310,14 @@ impl TrackVer { } // Take `&mut self` to prevent calling `inc` and `clone` at the same time - fn inc(&mut self) { + pub fn inc(&mut self) { // Only the source `TrackVer` can increase version if self.source { self.version.fetch_add(1, Ordering::Relaxed); } } - fn any_new(&self) -> bool { + pub fn any_new(&self) -> bool { self.version.load(Ordering::Relaxed) > self.local_ver } } @@ -243,7 +360,7 @@ impl ReadDelegate { } } - fn fresh_valid_ts(&mut self) { + pub fn fresh_valid_ts(&mut self) { self.last_valid_ts = monotonic_raw_now(); } @@ -276,7 +393,7 @@ impl ReadDelegate { // If the remote lease will be expired in near future send message // to `raftstore` renew it - fn maybe_renew_lease_advance( + pub fn maybe_renew_lease_advance( &self, router: &dyn CasualRouter, ts: Timespec, @@ -301,7 +418,7 @@ impl ReadDelegate { } } - fn is_in_leader_lease(&self, ts: Timespec, metrics: &mut ReadMetrics) -> bool { + pub fn is_in_leader_lease(&self, ts: Timespec, metrics: &mut ReadMetrics) -> bool { if let Some(ref lease) = self.leader_lease { let term = lease.term(); if term == self.term { @@ -320,7 +437,7 @@ impl ReadDelegate { false } - fn check_stale_read_safe( + pub fn check_stale_read_safe( &self, read_ts: u64, metrics: &mut ReadMetrics, @@ -387,6 +504,7 @@ impl Display for ReadDelegate { } } +/// #[RaftstoreCommon] 
#[derive(Debug)] pub enum Progress { Region(metapb::Region), @@ -418,63 +536,74 @@ impl Progress { } } -pub struct LocalReader +/// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the +/// relevant regions by LocalReader so that these requests can be handled by the +/// relevant ReadDelegate respectively. +pub struct LocalReader where C: ProposalRouter + CasualRouter, E: KvEngine, + D: ReadExecutor + Deref, + S: ReadExecutorProvider, { - store_id: Cell>, - store_meta: Arc>, + pub store_id: Cell>, + store_meta: S, kv_engine: E, - metrics: ReadMetrics, + pub metrics: ReadMetrics, // region id -> ReadDelegate // The use of `Arc` here is a workaround, see the comment at `get_delegate` - delegates: LruCache>, - snap_cache: Option>, + pub delegates: LruCache, + snap_cache: Box>>, cache_read_id: ThreadReadId, // A channel to raftstore. router: C, } -impl ReadExecutor for LocalReader +impl ReadExecutor for CachedReadDelegate where - C: ProposalRouter + CasualRouter, E: KvEngine, { - fn get_engine(&self) -> &E { + fn get_tablet(&mut self) -> &E { &self.kv_engine } - fn get_snapshot(&mut self, create_time: Option) -> Arc { - self.metrics.local_executed_requests += 1; + fn get_snapshot( + &mut self, + create_time: Option, + read_context: &mut Option>, + ) -> Arc { + let ctx = read_context.as_mut().unwrap(); + ctx.metrics.local_executed_requests += 1; if let Some(ts) = create_time { - if ts == self.cache_read_id { - if let Some(snap) = self.snap_cache.as_ref() { - self.metrics.local_executed_snapshot_cache_hit += 1; + if ts == *ctx.read_id { + if let Some(snap) = ctx.snap_cache.as_ref().as_ref() { + ctx.metrics.local_executed_snapshot_cache_hit += 1; return snap.clone(); } } let snap = Arc::new(self.kv_engine.snapshot()); - self.cache_read_id = ts; - self.snap_cache = Some(snap.clone()); + *ctx.read_id = ts; + *ctx.snap_cache = Box::new(Some(snap.clone())); return snap; } Arc::new(self.kv_engine.snapshot()) } } -impl LocalReader +impl 
LocalReader where C: ProposalRouter + CasualRouter, E: KvEngine, + D: ReadExecutor + Deref + Clone, + S: ReadExecutorProvider, { - pub fn new(kv_engine: E, store_meta: Arc>, router: C) -> Self { + pub fn new(kv_engine: E, store_meta: S, router: C) -> Self { let cache_read_id = ThreadReadId::new(); LocalReader { store_meta, kv_engine, router, - snap_cache: None, + snap_cache: Box::new(None), cache_read_id, store_id: Cell::new(None), metrics: Default::default(), @@ -520,28 +649,22 @@ where // choice is use `Rc` but `LocalReader: Send` will be violated, which is // required by `LocalReadRouter: Send`, use `Arc` will introduce extra cost but // make the logic clear - fn get_delegate(&mut self, region_id: u64) -> Option> { + pub fn get_delegate(&mut self, region_id: u64) -> Option { let rd = match self.delegates.get(®ion_id) { // The local `ReadDelegate` is up to date - Some(d) if !d.track_ver.any_new() => Some(Arc::clone(d)), + Some(d) if !d.track_ver.any_new() => Some(d.clone()), _ => { debug!("update local read delegate"; "region_id" => region_id); self.metrics.rejected_by_cache_miss += 1; - let (meta_len, meta_reader) = { - let meta = self.store_meta.lock().unwrap(); - ( - meta.readers.len(), - meta.readers.get(®ion_id).cloned().map(Arc::new), - ) - }; + let (meta_len, meta_reader) = { self.store_meta.get_executor_and_len(region_id) }; // Remove the stale delegate self.delegates.remove(®ion_id); self.delegates.resize(meta_len); match meta_reader { Some(reader) => { - self.delegates.insert(region_id, Arc::clone(&reader)); + self.delegates.insert(region_id, reader.clone()); Some(reader) } None => None, @@ -552,13 +675,13 @@ where rd.filter(|r| !r.pending_remove) } - fn pre_propose_raft_command( + pub fn pre_propose_raft_command( &mut self, req: &RaftCmdRequest, - ) -> Result, RequestPolicy)>> { + ) -> Result> { // Check store id. 
if self.store_id.get().is_none() { - let store_id = self.store_meta.lock().unwrap().store_id; + let store_id = self.store_meta.store_id(); self.store_id.set(store_id); } let store_id = self.store_id.get().unwrap(); @@ -627,7 +750,8 @@ where cb: Callback, ) { match self.pre_propose_raft_command(&req) { - Ok(Some((delegate, policy))) => { + Ok(Some((mut delegate, policy))) => { + let delegate_ext: LocalReadContext<'_, E>; let mut response = match policy { // Leader can read local if and only if it is in lease. RequestPolicy::ReadLocal => { @@ -647,8 +771,18 @@ where self.redirect(RaftCommand::new(req, cb)); return; } - let response = self.execute(&req, &delegate.region, None, read_id); + + delegate_ext = LocalReadContext { + metrics: &mut self.metrics, + snap_cache: &mut self.snap_cache, + read_id: &mut self.cache_read_id, + }; + + let region = Arc::clone(&delegate.region); + let response = + delegate.execute(&req, ®ion, None, read_id, Some(delegate_ext)); // Try renew lease in advance + delegate.maybe_renew_lease_advance( &self.router, snapshot_ts, @@ -667,8 +801,16 @@ where return; } + delegate_ext = LocalReadContext { + metrics: &mut self.metrics, + snap_cache: &mut self.snap_cache, + read_id: &mut self.cache_read_id, + }; + + let region = Arc::clone(&delegate.region); // Getting the snapshot - let response = self.execute(&req, &delegate.region, None, read_id); + let response = + delegate.execute(&req, ®ion, None, read_id, Some(delegate_ext)); // Double check in case `safe_ts` change after the first check and before // getting snapshot @@ -725,14 +867,16 @@ where } pub fn release_snapshot_cache(&mut self) { - self.snap_cache.take(); + self.snap_cache.as_mut().take(); } } -impl Clone for LocalReader +impl Clone for LocalReader where C: ProposalRouter + CasualRouter + Clone, E: KvEngine, + D: ReadExecutor + Deref, + S: ReadExecutorProvider, { fn clone(&self) -> Self { LocalReader { @@ -748,6 +892,7 @@ where } } +/// #[RaftstoreCommon] struct Inspector<'r, 'm> { 
delegate: &'r ReadDelegate, metrics: &'m mut ReadMetrics, @@ -786,26 +931,27 @@ impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { const METRICS_FLUSH_INTERVAL: u64 = 15_000; // 15s +/// #[RaftstoreCommon] #[derive(Clone)] -struct ReadMetrics { - local_executed_requests: u64, - local_executed_stale_read_requests: u64, - local_executed_snapshot_cache_hit: u64, +pub struct ReadMetrics { + pub local_executed_requests: u64, + pub local_executed_stale_read_requests: u64, + pub local_executed_snapshot_cache_hit: u64, // TODO: record rejected_by_read_quorum. - rejected_by_store_id_mismatch: u64, - rejected_by_peer_id_mismatch: u64, - rejected_by_term_mismatch: u64, - rejected_by_lease_expire: u64, - rejected_by_no_region: u64, - rejected_by_no_lease: u64, - rejected_by_epoch: u64, - rejected_by_applied_term: u64, - rejected_by_channel_full: u64, - rejected_by_cache_miss: u64, - rejected_by_safe_timestamp: u64, - renew_lease_advance: u64, - - last_flush_time: Instant, + pub rejected_by_store_id_mismatch: u64, + pub rejected_by_peer_id_mismatch: u64, + pub rejected_by_term_mismatch: u64, + pub rejected_by_lease_expire: u64, + pub rejected_by_no_region: u64, + pub rejected_by_no_lease: u64, + pub rejected_by_epoch: u64, + pub rejected_by_applied_term: u64, + pub rejected_by_channel_full: u64, + pub rejected_by_cache_miss: u64, + pub rejected_by_safe_timestamp: u64, + pub renew_lease_advance: u64, + + pub last_flush_time: Instant, } impl Default for ReadMetrics { @@ -923,7 +1069,7 @@ mod tests { use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; - use engine_traits::ALL_CFS; + use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use kvproto::raft_cmdpb::*; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; @@ -980,13 +1126,18 @@ mod tests { store_meta: Arc>, ) -> ( TempDir, - LocalReader, + LocalReader< + MockRouter, + KvTestEngine, + CachedReadDelegate, + 
StoreMetaDelegate, + >, Receiver>, ) { let path = Builder::new().prefix(path).tempdir().unwrap(); let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let (ch, rx, _) = MockRouter::new(); - let mut reader = LocalReader::new(db, store_meta, ch); + let mut reader = LocalReader::new(db.clone(), StoreMetaDelegate::new(store_meta, db), ch); reader.store_id = Cell::new(Some(store_id)); (path, reader, rx) } @@ -1004,7 +1155,12 @@ mod tests { } fn must_redirect( - reader: &mut LocalReader, + reader: &mut LocalReader< + MockRouter, + KvTestEngine, + CachedReadDelegate, + StoreMetaDelegate, + >, rx: &Receiver>, cmd: RaftCmdRequest, ) { @@ -1024,7 +1180,12 @@ mod tests { } fn must_not_redirect( - reader: &mut LocalReader, + reader: &mut LocalReader< + MockRouter, + KvTestEngine, + CachedReadDelegate, + StoreMetaDelegate, + >, rx: &Receiver>, task: RaftCommand, ) { @@ -1386,4 +1547,157 @@ mod tests { let d = reader.get_delegate(1).unwrap(); assert_eq!(d.leader_lease.clone().unwrap().term(), 3); } + + #[test] + fn test_read_delegate() { + let path = Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let kv_engine = + engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); + kv_engine.put_cf(CF_DEFAULT, b"a1", b"val1").unwrap(); + let store_meta = + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(0))), kv_engine.clone()); + + { + let mut meta = store_meta.store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate::mock(1); + meta.readers.insert(1, read_delegate); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate::mock(2); + meta.readers.insert(2, read_delegate); + } + + let mut read_id = ThreadReadId::new(); + let mut read_metrics = ReadMetrics::default(); + let mut snap_cache = Box::new(None); + + let read_id_copy = Some(read_id.clone()); + + let mut read_context = Some(LocalReadContext { + metrics: &mut 
read_metrics, + read_id: &mut read_id, + snap_cache: &mut snap_cache, + }); + + let (_, delegate) = store_meta.get_executor_and_len(1); + let mut delegate = delegate.unwrap(); + let tablet = delegate.get_tablet(); + assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); + let snapshot = delegate.get_snapshot(read_id_copy.clone(), &mut read_context); + let val = snapshot.get_value(b"a1").unwrap().unwrap(); + assert_eq!(b"val1", val.deref()); + + let (_, delegate) = store_meta.get_executor_and_len(2); + let mut delegate = delegate.unwrap(); + let tablet = delegate.get_tablet(); + assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); + let snapshot = delegate.get_snapshot(read_id_copy, &mut read_context); + let val = snapshot.get_value(b"a1").unwrap().unwrap(); + assert_eq!(b"val1", val.deref()); + + assert!(snap_cache.as_ref().is_some()); + assert_eq!(read_metrics.local_executed_requests, 2); + assert_eq!(read_metrics.local_executed_snapshot_cache_hit, 1); + } + + #[test] + fn test_snap_cache_hit() { + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, _) = new_reader("test-local-reader", 1, store_meta.clone()); + + let mut region1 = metapb::Region::default(); + region1.set_id(1); + + // Register region 1 + { + let mut meta = store_meta.lock().unwrap(); + let read_delegate = ReadDelegate { + tag: String::new(), + region: Arc::new(region1.clone()), + peer_id: 1, + term: 1, + applied_term: 1, + leader_lease: None, + last_valid_ts: Timespec::new(0, 0), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), + txn_ext: Arc::new(TxnExt::default()), + read_progress: Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())), + pending_remove: false, + track_ver: TrackVer::new(), + bucket_meta: None, + }; + meta.readers.insert(1, read_delegate); + } + + let mut delegate = reader.get_delegate(region1.id).unwrap(); + let read_id = Some(ThreadReadId::new()); + + { + let mut read_context = 
Some(LocalReadContext { + metrics: &mut reader.metrics, + snap_cache: &mut reader.snap_cache, + read_id: &mut reader.cache_read_id, + }); + + for _ in 0..10 { + // Different region id should reuse the cache + let _ = delegate.get_snapshot(read_id.clone(), &mut read_context); + } + } + // We should hit cache 9 times + assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 9); + + let read_id = Some(ThreadReadId::new()); + + { + let read_context = LocalReadContext { + metrics: &mut reader.metrics, + snap_cache: &mut reader.snap_cache, + read_id: &mut reader.cache_read_id, + }; + + let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); + } + // This time, we will miss the cache + assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 9); + + { + let read_context = LocalReadContext { + metrics: &mut reader.metrics, + snap_cache: &mut reader.snap_cache, + read_id: &mut reader.cache_read_id, + }; + let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); + // We can hit it again. + assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 10); + } + + reader.release_snapshot_cache(); + { + let read_context = LocalReadContext { + metrics: &mut reader.metrics, + snap_cache: &mut reader.snap_cache, + read_id: &mut reader.cache_read_id, + }; + let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); + } + // After release, we will mss the cache even with the prevsiou read_id. + assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 10); + + { + let read_context = LocalReadContext { + metrics: &mut reader.metrics, + snap_cache: &mut reader.snap_cache, + read_id: &mut reader.cache_read_id, + }; + let _ = delegate.get_snapshot(read_id, &mut Some(read_context)); + } + // We can hit it again. 
+ assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 11); + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index d8824453a24..fe2b0dd1c26 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -76,7 +76,7 @@ use raftstore::{ }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, + SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, RaftRouterCompactedEventSender, }; @@ -257,7 +257,10 @@ type LocalServer = Server, resolve::PdStoreAddrResolver, LocalRaftKv>; type LocalRaftKv = RaftKv>; -impl TiKvServer { +impl TiKvServer +where + ER: RaftEngine, +{ fn init(mut config: TiKvConfig) -> TiKvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, @@ -561,7 +564,11 @@ impl TiKvServer { let engine = RaftKv::new( ServerRaftStoreRouter::new( self.router.clone(), - LocalReader::new(engines.kv.clone(), store_meta.clone(), self.router.clone()), + LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), + self.router.clone(), + ), ), engines.kv.clone(), ); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index ac3e3a6cc6e..2584d29629e 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -290,7 +290,11 @@ impl Simulator for NodeCluster { Arc::new(SstImporter::new(&cfg.import, dir, None, cfg.storage.api_version()).unwrap()) }; - let local_reader = LocalReader::new(engines.kv.clone(), store_meta.clone(), router.clone()); + let local_reader = LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), + router.clone(), + ); let cfg_controller = 
ConfigController::new(cfg.tikv.clone()); let split_check_runner = diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index e22b730151a..7107c668c3d 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -38,7 +38,7 @@ use raftstore::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, AutoSplitController, Callback, CheckLeaderRunner, LocalReader, RegionSnapshot, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, + SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, Result, }; @@ -284,7 +284,11 @@ impl ServerCluster { } } - let local_reader = LocalReader::new(engines.kv.clone(), store_meta.clone(), router.clone()); + let local_reader = LocalReader::new( + engines.kv.clone(), + StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), + router.clone(), + ); let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader); let sim_router = SimulateTransport::new(raft_router.clone()); diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 5b640d3b0b7..cd6543dafe8 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -28,6 +28,7 @@ test-engines-panic = [ backtrace = "0.3" engine_panic = { path = "../engine_panic", default-features = false } engine_rocks = { path = "../engine_rocks", default-features = false } +engine_test = { path = "../engine_test", default-features = false } engine_traits = { path = "../engine_traits", default-features = false } error_code = { path = "../error_code", default-features = false } fail = "0.5" diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 7ddf338d870..bd94a3638d4 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -96,10 +96,7 @@ impl KvEngineFactoryBuilder { inner: Arc::new(self.inner), compact_event_sender: 
self.compact_event_sender.clone(), }; - KvEngineFactoryV2 { - inner: factory, - registry: Arc::default(), - } + KvEngineFactoryV2::new(factory) } } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index cf988f9da37..4132b2e4c25 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -15,8 +15,17 @@ const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; #[derive(Clone)] pub struct KvEngineFactoryV2 { - pub inner: KvEngineFactory, - pub registry: Arc>>, + inner: KvEngineFactory, + registry: Arc>>, +} + +impl KvEngineFactoryV2 { + pub fn new(inner: KvEngineFactory) -> Self { + KvEngineFactoryV2 { + inner, + registry: Arc::new(Mutex::new(HashMap::default())), + } + } } // Extract tablet id and tablet suffix from the path. From 1c87fbe045b5b34f446101142a7ebb26b0dc4c92 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 4 Aug 2022 17:34:07 +0800 Subject: [PATCH 134/676] *: all in camel case (#13215) close tikv/tikv#13216 Signed-off-by: tabokie --- clippy.toml | 4 +- cmd/tikv-ctl/src/executor.rs | 6 +- cmd/tikv-ctl/src/main.rs | 14 +- cmd/tikv-ctl/src/util.rs | 4 +- cmd/tikv-server/src/main.rs | 8 +- components/api_version/src/api_v2.rs | 2 +- components/api_version/src/lib.rs | 14 +- components/backup/src/errors.rs | 6 +- components/cdc/src/delegate.rs | 38 +-- components/cdc/src/endpoint.rs | 74 +++--- components/cdc/src/initializer.rs | 18 +- components/cdc/src/observer.rs | 18 +- components/cdc/src/service.rs | 30 +-- components/engine_rocks/src/util.rs | 4 +- components/engine_traits/src/errors.rs | 4 +- components/engine_traits/src/lib.rs | 2 +- components/engine_traits/src/perf_context.rs | 4 +- components/error_code/src/backup_stream.rs | 6 +- components/error_code/src/causal_ts.rs | 6 +- components/error_code/src/cloud.rs | 4 +- components/error_code/src/encryption.rs | 2 +- components/error_code/src/engine.rs | 4 +- components/error_code/src/pd.rs | 6 +- components/error_code/src/raftstore.rs | 2 +- 
components/error_code/src/sst_importer.rs | 6 +- components/error_code/src/storage.rs | 6 +- components/file_system/src/io_stats/proc.rs | 18 +- components/raftstore/src/coprocessor/mod.rs | 22 +- components/raftstore/src/store/util.rs | 2 +- components/raftstore/src/store/worker/pd.rs | 42 ++-- .../src/store/worker/split_controller.rs | 10 +- components/resolved_ts/src/advance.rs | 4 +- components/resolved_ts/src/endpoint.rs | 10 +- components/resolved_ts/src/scanner.rs | 4 +- components/resolved_ts/src/sinker.rs | 4 +- components/server/src/raft_engine_switch.rs | 4 +- components/server/src/server.rs | 44 ++-- components/server/src/setup.rs | 10 +- components/sst_importer/src/errors.rs | 8 +- components/sst_importer/src/sst_writer.rs | 2 +- components/test_backup/src/lib.rs | 4 +- components/test_coprocessor/src/dag.rs | 48 ++-- components/test_raftstore/src/config.rs | 10 +- components/test_raftstore/src/util.rs | 8 +- .../src/codec/chunk/chunk.rs | 8 +- .../src/codec/chunk/column.rs | 4 +- .../src/codec/collation/encoding/gbk.rs | 4 +- .../src/codec/collation/mod.rs | 6 +- .../src/codec/row/v2/compat_v1.rs | 4 +- .../tidb_query_datatype/src/codec/table.rs | 4 +- .../tidb_query_datatype/src/def/eval_type.rs | 6 +- .../tidb_query_datatype/src/def/field_type.rs | 26 +- .../tidb_query_expr/src/impl_compare.rs | 224 +++++++++--------- components/tidb_query_expr/src/impl_math.rs | 18 +- components/tidb_query_expr/src/lib.rs | 98 ++++---- .../tidb_query_expr/src/types/expr_eval.rs | 4 +- components/tikv_alloc/src/error.rs | 6 +- etc/error_code.toml | 8 +- src/config.rs | 168 ++++++------- src/coprocessor/checksum.rs | 6 +- src/coprocessor/dag/mod.rs | 4 +- src/coprocessor/dag/storage_impl.rs | 6 +- src/coprocessor/statistics/analyze.rs | 16 +- src/import/duplicate_detect.rs | 4 +- src/server/engine_factory.rs | 4 +- src/server/engine_factory_v2.rs | 6 +- src/server/errors.rs | 4 +- src/server/service/diagnostics/log.rs | 4 +- src/server/service/kv.rs | 2 +- 
src/server/status_server/mod.rs | 6 +- src/storage/mod.rs | 4 +- .../coprocessor_executors/index_scan/mod.rs | 8 +- .../coprocessor_executors/index_scan/util.rs | 12 +- .../coprocessor_executors/integrated/mod.rs | 10 +- .../coprocessor_executors/integrated/util.rs | 12 +- .../coprocessor_executors/table_scan/mod.rs | 8 +- .../coprocessor_executors/table_scan/util.rs | 12 +- .../coprocessor_executors/util/bencher.rs | 8 +- .../util/scan_bencher.rs | 12 +- .../misc/coprocessor/codec/chunk/mod.rs | 2 +- tests/failpoints/cases/test_coprocessor.rs | 28 +-- .../integrations/config/dynamic/gc_worker.rs | 10 +- .../config/dynamic/pessimistic_txn.rs | 4 +- .../integrations/config/dynamic/raftstore.rs | 6 +- tests/integrations/config/dynamic/snap.rs | 6 +- .../config/dynamic/split_check.rs | 6 +- tests/integrations/config/mod.rs | 24 +- .../integrations/config/test_config_client.rs | 8 +- .../integrations/coprocessor/test_checksum.rs | 4 +- tests/integrations/coprocessor/test_select.rs | 94 ++++---- tests/integrations/import/test_sst_service.rs | 4 +- tests/integrations/import/util.rs | 10 +- .../resource_metering/test_cpu.rs | 4 +- .../resource_metering/test_read_keys.rs | 4 +- .../resource_metering/test_suite/mod.rs | 4 +- tests/integrations/storage/test_titan.rs | 4 +- 96 files changed, 753 insertions(+), 751 deletions(-) diff --git a/clippy.toml b/clippy.toml index 2a4bb3e82b2..1530b3cb60b 100644 --- a/clippy.toml +++ b/clippy.toml @@ -6,4 +6,6 @@ disallowed-methods = [ { path = "futures_executor::thread_pool::ThreadPoolBuilder::after_start", reason = "Wrapper function `::after_start_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." }, { path = "futures_executor::thread_pool::ThreadPoolBuilder::before_stop", reason = "Wrapper function `::before_stop_wrapper` should be used instead, refer to https://github.com/tikv/tikv/pull/12442 for more details." 
}, -] \ No newline at end of file +] +avoid-breaking-exported-api = false +upper-case-acronyms-aggressive = true diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index df2c3cfbadf..aa2f604b547 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -27,7 +27,7 @@ use raftstore::store::INIT_EPOCH_CONF_VER; use security::SecurityManager; use serde_json::json; use tikv::{ - config::{ConfigController, TiKvConfig}, + config::{ConfigController, TikvConfig}, server::debug::{BottommostLevelCompaction, Debugger, RegionInfo}, }; use tikv_util::escape; @@ -43,7 +43,7 @@ pub const LOCK_FILE_ERROR: &str = "IO error: While lock file"; type MvccInfoStream = Pin, MvccInfo), String>>>>; pub fn new_debug_executor( - cfg: &TiKvConfig, + cfg: &TikvConfig, data_dir: Option<&str>, skip_paranoid_checks: bool, host: Option<&str>, @@ -359,7 +359,7 @@ pub trait DebugExecutor { region: u64, to_host: Option<&str>, to_data_dir: Option<&str>, - to_config: &TiKvConfig, + to_config: &TikvConfig, mgr: Arc, ) { let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, false, to_host, mgr); diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index d37336cbd36..ce39c121300 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -45,7 +45,7 @@ use raft_log_engine::ManagedFileSystem; use regex::Regex; use security::{SecurityConfig, SecurityManager}; use structopt::{clap::ErrorKind, StructOpt}; -use tikv::{config::TiKvConfig, server::debug::BottommostLevelCompaction}; +use tikv::{config::TikvConfig, server::debug::BottommostLevelCompaction}; use tikv_util::{escape, run_and_wait_child_process, sys::thread::StdThreadBuildWrapper, unescape}; use txn_types::Key; @@ -61,7 +61,7 @@ fn main() { let cfg_path = opt.config.as_ref(); let cfg = cfg_path.map_or_else( || { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.log.level = tikv_util::logger::get_level_by_string("warn") .unwrap() .into(); 
@@ -332,7 +332,7 @@ fn main() { } => { let to_data_dir = to_data_dir.as_deref(); let to_host = to_host.as_deref(); - let to_config = to_config.map_or_else(TiKvConfig::default, |path| { + let to_config = to_config.map_or_else(TikvConfig::default, |path| { let s = fs::read_to_string(&path).unwrap(); toml::from_str(&s).unwrap() }); @@ -608,7 +608,7 @@ fn split_region(pd_client: &RpcClient, mgr: Arc, region_id: u64 fn compact_whole_cluster( pd_client: &RpcClient, - cfg: &TiKvConfig, + cfg: &TikvConfig, mgr: Arc, db_type: DbType, cfs: Vec<&str>, @@ -671,7 +671,7 @@ fn read_fail_file(path: &str) -> Vec<(String, String)> { list } -fn run_ldb_command(args: Vec, cfg: &TiKvConfig) { +fn run_ldb_command(args: Vec, cfg: &TikvConfig) { let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); @@ -682,12 +682,12 @@ fn run_ldb_command(args: Vec, cfg: &TiKvConfig) { engine_rocks::raw::run_ldb_tool(&args, &opts); } -fn run_sst_dump_command(args: Vec, cfg: &TiKvConfig) { +fn run_sst_dump_command(args: Vec, cfg: &TikvConfig) { let opts = cfg.rocksdb.build_opt(); engine_rocks::raw::run_sst_dump_tool(&args, &opts); } -fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TiKvConfig) { +fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TikvConfig) { let db = &cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); println!( "\nstart to print bad ssts; data_dir:{}; db:{}", diff --git a/cmd/tikv-ctl/src/util.rs b/cmd/tikv-ctl/src/util.rs index 36091b5a930..d7e83511d3e 100644 --- a/cmd/tikv-ctl/src/util.rs +++ b/cmd/tikv-ctl/src/util.rs @@ -3,13 +3,13 @@ use std::{borrow::ToOwned, error::Error, str, str::FromStr, u64}; use server::setup::initial_logger; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; const LOG_DIR: &str = "./ctl-engine-info-log"; #[allow(clippy::field_reassign_with_default)] pub fn init_ctl_logger(level: &str) { - let mut cfg = 
TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.log.level = slog::Level::from_str(level).unwrap().into(); cfg.rocksdb.info_log_dir = LOG_DIR.to_owned(); cfg.raftdb.info_log_dir = LOG_DIR.to_owned(); diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 0d6e472a602..b366cd7849f 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -7,7 +7,7 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; use serde_json::{Map, Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; -use tikv::config::{to_flatten_config_info, TiKvConfig}; +use tikv::config::{to_flatten_config_info, TikvConfig}; fn main() { let build_timestamp = option_env!("TIKV_BUILD_TIME"); @@ -157,7 +157,7 @@ fn main() { .get_matches(); if matches.is_present("print-sample-config") { - let config = TiKvConfig::default(); + let config = TikvConfig::default(); println!("{}", toml::to_string_pretty(&config).unwrap()); process::exit(0); } @@ -167,9 +167,9 @@ fn main() { let mut config = matches .value_of_os("config") - .map_or_else(TiKvConfig::default, |path| { + .map_or_else(TikvConfig::default, |path| { let path = Path::new(path); - TiKvConfig::from_file( + TikvConfig::from_file( path, if is_config_check { Some(&mut unrecognized_keys) diff --git a/components/api_version/src/api_v2.rs b/components/api_version/src/api_v2.rs index 712804b3b3a..a56d5deac30 100644 --- a/components/api_version/src/api_v2.rs +++ b/components/api_version/src/api_v2.rs @@ -50,7 +50,7 @@ impl KvFormat for ApiV2 { match key[0] { RAW_KEY_PREFIX => KeyMode::Raw, TXN_KEY_PREFIX => KeyMode::Txn, - TIDB_META_KEY_PREFIX | TIDB_TABLE_KEY_PREFIX => KeyMode::TiDB, + TIDB_META_KEY_PREFIX | TIDB_TABLE_KEY_PREFIX => KeyMode::Tidb, _ => KeyMode::Unknown, } } diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index fb8fd13cbfd..ceb18b4bddb 100644 --- a/components/api_version/src/lib.rs +++ 
b/components/api_version/src/lib.rs @@ -188,7 +188,7 @@ pub enum KeyMode { /// TiDB, but instead, it means that the key matches the definition of /// TiDB key in API V2, therefore, the key is treated as TiDB data in /// order to fulfill compatibility. - TiDB, + Tidb, /// Unrecognised key mode. Unknown, } @@ -271,8 +271,8 @@ mod tests { ); assert_eq!(ApiV2::parse_key_mode(&[RAW_KEY_PREFIX]), KeyMode::Raw); assert_eq!(ApiV2::parse_key_mode(&[TXN_KEY_PREFIX]), KeyMode::Txn); - assert_eq!(ApiV2::parse_key_mode(&b"t_a"[..]), KeyMode::TiDB); - assert_eq!(ApiV2::parse_key_mode(&b"m"[..]), KeyMode::TiDB); + assert_eq!(ApiV2::parse_key_mode(&b"t_a"[..]), KeyMode::Tidb); + assert_eq!(ApiV2::parse_key_mode(&b"m"[..]), KeyMode::Tidb); assert_eq!(ApiV2::parse_key_mode(&b"ot"[..]), KeyMode::Unknown); } @@ -289,19 +289,19 @@ mod tests { ); assert_eq!( ApiV2::parse_range_mode((Some(b"t_a"), Some(b"t_z"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"t"), Some(b"u"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"m"), Some(b"n"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"m_a"), Some(b"m_z"))), - KeyMode::TiDB + KeyMode::Tidb ); assert_eq!( ApiV2::parse_range_mode((Some(b"x\0a"), Some(b"x\0z"))), diff --git a/components/backup/src/errors.rs b/components/backup/src/errors.rs index 4f290262c57..413f4ee77f9 100644 --- a/components/backup/src/errors.rs +++ b/components/backup/src/errors.rs @@ -24,7 +24,7 @@ impl From for ErrorPb { fn from(e: Error) -> ErrorPb { let mut err = ErrorPb::default(); match e { - Error::ClusterID { current, request } => { + Error::ClusterId { current, request } => { BACKUP_RANGE_ERROR_VEC .with_label_values(&["cluster_mismatch"]) .inc(); @@ -114,8 +114,8 @@ pub enum Error { EngineTrait(#[from] EngineTraitError), #[error("Transaction error {0}")] Txn(#[from] TxnError), - #[error("ClusterID error current {current}, request {request}")] - ClusterID { 
current: u64, request: u64 }, + #[error("ClusterId error current {current}, request {request}")] + ClusterId { current: u64, request: u64 }, #[error("Invalid cf {cf}")] InvalidCf { cf: String }, #[error("Failed to acquire the semaphore {0}")] diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index fc379916232..1928cd3257a 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -24,7 +24,7 @@ use kvproto::{ }, }; use raftstore::{ - coprocessor::{Cmd, CmdBatch, ObserveHandle, ObserveID}, + coprocessor::{Cmd, CmdBatch, ObserveHandle, ObserveId}, store::util::compare_region_epoch, Error as RaftStoreError, }; @@ -38,7 +38,7 @@ use crate::{ initializer::KvEntry, metrics::*, old_value::{OldValueCache, OldValueCallback}, - service::ConnID, + service::ConnId, Error, Result, }; @@ -46,15 +46,15 @@ static DOWNSTREAM_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier of a Downstream. #[derive(Clone, Copy, Debug, PartialEq, Hash)] -pub struct DownstreamID(usize); +pub struct DownstreamId(usize); -impl DownstreamID { - pub fn new() -> DownstreamID { - DownstreamID(DOWNSTREAM_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl DownstreamId { + pub fn new() -> DownstreamId { + DownstreamId(DOWNSTREAM_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } -impl Default for DownstreamID { +impl Default for DownstreamId { fn default() -> Self { Self::new() } @@ -119,10 +119,10 @@ impl DownstreamState { pub struct Downstream { // TODO: include cdc request. /// A unique identifier of the Downstream. - id: DownstreamID, + id: DownstreamId, // The request ID set by CDC to identify events corresponding different requests. req_id: u64, - conn_id: ConnID, + conn_id: ConnId, // The IP address of downstream. 
peer: String, region_epoch: RegionEpoch, @@ -140,11 +140,11 @@ impl Downstream { peer: String, region_epoch: RegionEpoch, req_id: u64, - conn_id: ConnID, + conn_id: ConnId, kv_api: ChangeDataRequestKvApi, ) -> Downstream { Downstream { - id: DownstreamID::new(), + id: DownstreamId::new(), req_id, conn_id, peer, @@ -199,7 +199,7 @@ impl Downstream { self.sink = Some(sink); } - pub fn get_id(&self) -> DownstreamID { + pub fn get_id(&self) -> DownstreamId { self.id } @@ -207,7 +207,7 @@ impl Downstream { self.state.clone() } - pub fn get_conn_id(&self) -> ConnID { + pub fn get_conn_id(&self) -> ConnId { self.conn_id } } @@ -277,7 +277,7 @@ impl Delegate { Ok(()) } - pub fn downstream(&self, downstream_id: DownstreamID) -> Option<&Downstream> { + pub fn downstream(&self, downstream_id: DownstreamId) -> Option<&Downstream> { self.downstreams().iter().find(|d| d.id == downstream_id) } @@ -297,7 +297,7 @@ impl Delegate { /// Let downstream unsubscribe the delegate. /// Return whether the delegate is empty or not. - pub fn unsubscribe(&mut self, id: DownstreamID, err: Option) -> bool { + pub fn unsubscribe(&mut self, id: DownstreamId, err: Option) -> bool { let error_event = err.map(|err| err.into_error_event(self.region_id)); let region_id = self.region_id; if let Some(d) = self.remove_downstream(id) { @@ -617,7 +617,7 @@ impl Delegate { self.sink_downstream(raw_rows, index, ChangeDataRequestKvApi::RawKv) } - pub fn raw_untrack_ts(&mut self, cdc_id: ObserveID, max_ts: TimeStamp) { + pub fn raw_untrack_ts(&mut self, cdc_id: ObserveId, max_ts: TimeStamp) { // Stale CmdBatch, drop it silently. 
if cdc_id != self.handle.id { return; @@ -863,7 +863,7 @@ impl Delegate { self.txn_extra_op.store(TxnExtraOp::ReadOldValue); } - fn remove_downstream(&mut self, id: DownstreamID) -> Option { + fn remove_downstream(&mut self, id: DownstreamId) -> Option { let downstreams = self.downstreams_mut(); if let Some(index) = downstreams.iter().position(|x| x.id == id) { let downstream = downstreams.swap_remove(index); @@ -1070,7 +1070,7 @@ mod tests { String::new(), region_epoch, request_id, - ConnID::new(), + ConnId::new(), ChangeDataRequestKvApi::TiDb, ); downstream.set_sink(sink); @@ -1189,7 +1189,7 @@ mod tests { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(region_version); epoch.set_version(region_version); - Downstream::new(peer, epoch, id, ConnID::new(), ChangeDataRequestKvApi::TiDb) + Downstream::new(peer, epoch, id, ConnId::new(), ChangeDataRequestKvApi::TiDb) }; // Create a new delegate. diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 9d15c347e32..d9938006ca1 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -29,7 +29,7 @@ use kvproto::{ use online_config::{ConfigChange, OnlineConfig}; use pd_client::{Feature, PdClient}; use raftstore::{ - coprocessor::{CmdBatch, ObserveID}, + coprocessor::{CmdBatch, ObserveId}, router::RaftStoreRouter, store::{ fsm::{ChangeObserver, StoreMeta}, @@ -56,12 +56,12 @@ use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; use crate::{ channel::{CdcEvent, MemoryQuota, SendError}, - delegate::{on_init_downstream, Delegate, Downstream, DownstreamID, DownstreamState}, + delegate::{on_init_downstream, Delegate, Downstream, DownstreamId, DownstreamState}, initializer::Initializer, metrics::*, observer::RawRegionTs, old_value::{OldValueCache, OldValueCallback}, - service::{Conn, ConnID, FeatureGate}, + service::{Conn, ConnId, FeatureGate}, CdcObserver, Error, }; @@ -78,16 +78,16 @@ const RAW_RESOLVED_TS_OUTLIER_COUNT_THRESHOLD: usize = 10; pub enum 
Deregister { Downstream { region_id: u64, - downstream_id: DownstreamID, - conn_id: ConnID, + downstream_id: DownstreamId, + conn_id: ConnId, err: Option, }, Delegate { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, err: Error, }, - Conn(ConnID), + Conn(ConnId), } impl_display_as_debug!(Deregister); @@ -137,7 +137,7 @@ pub enum Task { Register { request: ChangeDataRequest, downstream: Downstream, - conn_id: ConnID, + conn_id: ConnId, version: semver::Version, }, Deregister(Deregister), @@ -148,13 +148,13 @@ pub enum Task { multi: Vec, old_value_cb: OldValueCallback, }, - MinTS { + MinTs { regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp, }, ResolverReady { - observe_id: ObserveID, + observe_id: ObserveId, region: Region, resolver: Resolver, }, @@ -163,7 +163,7 @@ pub enum Task { // the downstream switches to Normal after the previous commands was sunk. InitDownstream { region_id: u64, - downstream_id: DownstreamID, + downstream_id: DownstreamId, downstream_state: Arc>, // `incremental_scan_barrier` will be sent into `sink` to ensure all delta changes // are delivered to the downstream. And then incremental scan can start. @@ -215,7 +215,7 @@ impl fmt::Debug for Task { .field("type", &"multi_batch") .field("multi_batch", &multi.len()) .finish(), - Task::MinTS { + Task::MinTs { ref min_ts, ref current_ts, .. @@ -388,7 +388,7 @@ pub struct Endpoint { cluster_id: u64, capture_regions: HashMap, - connections: HashMap, + connections: HashMap, scheduler: Scheduler, raft_router: T, engine: E, @@ -611,7 +611,7 @@ impl, E: KvEngine> Endpoint { let oid = self.observer.unsubscribe_region(region_id, id); assert!( oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, id ); @@ -624,7 +624,7 @@ impl, E: KvEngine> Endpoint { } => { // Something went wrong, deregister all downstreams of the region. - // To avoid ABA problem, we must check the unique ObserveID. 
+ // To avoid ABA problem, we must check the unique ObserveId. let need_remove = self .capture_regions .get(®ion_id) @@ -642,7 +642,7 @@ impl, E: KvEngine> Endpoint { assert_eq!( need_remove, oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, observe_id ); @@ -661,7 +661,7 @@ impl, E: KvEngine> Endpoint { let oid = self.observer.unsubscribe_region(region_id, id); assert!( oid.is_some(), - "unsubscribe region {} failed, ObserveID {:?}", + "unsubscribe region {} failed, ObserveId {:?}", region_id, id ); @@ -678,7 +678,7 @@ impl, E: KvEngine> Endpoint { &mut self, mut request: ChangeDataRequest, mut downstream: Downstream, - conn_id: ConnID, + conn_id: ConnId, version: semver::Version, ) { let region_id = request.region_id; @@ -784,7 +784,7 @@ impl, E: KvEngine> Endpoint { let old_observe_id = self.observer.subscribe_region(region_id, observe_id); assert!( old_observe_id.is_none(), - "region {} must not be observed twice, old ObserveID {:?}, new ObserveID {:?}", + "region {} must not be observed twice, old ObserveId {:?}, new ObserveId {:?}", region_id, old_observe_id, observe_id @@ -880,7 +880,7 @@ impl, E: KvEngine> Endpoint { } } - fn on_region_ready(&mut self, observe_id: ObserveID, resolver: Resolver, region: Region) { + fn on_region_ready(&mut self, observe_id: ObserveId, resolver: Resolver, region: Region) { let region_id = region.get_id(); let mut failed_downstreams = Vec::new(); if let Some(delegate) = self.capture_regions.get_mut(®ion_id) { @@ -1129,7 +1129,7 @@ impl, E: KvEngine> Endpoint { let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let raft_router = self.raft_router.clone(); - let regions: Vec<(u64, ObserveID)> = self + let regions: Vec<(u64, ObserveId)> = self .capture_regions .iter() .map(|(region_id, delegate)| (*region_id, delegate.handle.id)) @@ -1152,8 +1152,8 @@ impl, E: KvEngine> Endpoint { // Sync with concurrency manager so that it can 
work correctly when // optimizations like async commit is enabled. - // Note: This step must be done before scheduling `Task::MinTS` task, and the - // resolver must be checked in or after `Task::MinTS`' execution. + // Note: This step must be done before scheduling `Task::MinTs` task, and the + // resolver must be checked in or after `Task::MinTs`' execution. cm.update_max_ts(min_ts); if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { if min_mem_lock_ts < min_ts { @@ -1169,7 +1169,7 @@ impl, E: KvEngine> Endpoint { Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), } - // If flush_causal_timestamp fails, cannot schedule MinTS task + // If flush_causal_timestamp fails, cannot schedule MinTs task // as new coming raw data may use timestamp smaller than min_ts if let Err(e) = causal_ts_provider.map_or(Ok(()), |provider| provider.flush()) { error!("cdc flush causal timestamp failed"; "err" => ?e); @@ -1202,7 +1202,7 @@ impl, E: KvEngine> Endpoint { }; if !regions.is_empty() { - match scheduler.schedule(Task::MinTS { + match scheduler.schedule(Task::MinTs { regions, min_ts, current_ts: min_ts_pd, @@ -1225,7 +1225,7 @@ impl, E: KvEngine> Endpoint { } async fn region_resolved_ts_raft( - regions: Vec<(u64, ObserveID)>, + regions: Vec<(u64, ObserveId)>, scheduler: &Scheduler, raft_router: T, min_ts: TimeStamp, @@ -1293,7 +1293,7 @@ impl, E: KvEngine> Runnable for Endpoint { debug!("cdc run task"; "task" => %task); match task { - Task::MinTS { + Task::MinTs { regions, min_ts, current_ts, @@ -2250,7 +2250,7 @@ mod tests { .unwrap() .handle .id; - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: region_ids, min_ts: cur_tso, current_ts: cur_tso, @@ -2371,7 +2371,7 @@ mod tests { let resolver = Resolver::new(1); let observe_id = suite.endpoint.capture_regions[&1].handle.id; suite.on_region_ready(observe_id, resolver, region.clone()); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1], min_ts: TimeStamp::from(1), 
current_ts: TimeStamp::zero(), @@ -2407,7 +2407,7 @@ mod tests { region.set_id(2); let observe_id = suite.endpoint.capture_regions[&2].handle.id; suite.on_region_ready(observe_id, resolver, region); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2], min_ts: TimeStamp::from(2), current_ts: TimeStamp::zero(), @@ -2452,7 +2452,7 @@ mod tests { region.set_id(3); let observe_id = suite.endpoint.capture_regions[&3].handle.id; suite.on_region_ready(observe_id, resolver, region); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), current_ts: TimeStamp::zero(), @@ -2612,8 +2612,8 @@ mod tests { assert_eq!(suite.endpoint.capture_regions.len(), 1); let deregister = Deregister::Delegate { region_id: 1, - // A stale ObserveID (different from the actual one). - observe_id: ObserveID::new(), + // A stale ObserveId (different from the actual one). + observe_id: ObserveId::new(), err: Error::request(err_header), }; suite.run(Task::Deregister(deregister)); @@ -2686,7 +2686,7 @@ mod tests { } }; - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1], min_ts: TimeStamp::from(1), current_ts: TimeStamp::zero(), @@ -2700,7 +2700,7 @@ mod tests { ) .unwrap_err(); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2], min_ts: TimeStamp::from(2), current_ts: TimeStamp::zero(), @@ -2714,7 +2714,7 @@ mod tests { ) .unwrap_err(); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 2, 3], min_ts: TimeStamp::from(3), current_ts: TimeStamp::zero(), @@ -2724,7 +2724,7 @@ mod tests { // conn b must receive a resolved ts that contains region 3. 
assert_batch_resolved_ts(conn_rxs.get_mut(1).unwrap(), vec![3], 3); - suite.run(Task::MinTS { + suite.run(Task::MinTs { regions: vec![1, 3], min_ts: TimeStamp::from(4), current_ts: TimeStamp::zero(), diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 98720b7cf0c..f6a2ce2885c 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -16,7 +16,7 @@ use kvproto::{ metapb::{Region, RegionEpoch}, }; use raftstore::{ - coprocessor::ObserveID, + coprocessor::ObserveId, router::RaftStoreRouter, store::{ fsm::ChangeObserver, @@ -47,11 +47,11 @@ use txn_types::{Key, KvPair, Lock, LockType, OldValue, TimeStamp}; use crate::{ channel::CdcEvent, - delegate::{post_init_downstream, Delegate, DownstreamID, DownstreamState}, + delegate::{post_init_downstream, Delegate, DownstreamId, DownstreamState}, endpoint::Deregister, metrics::*, old_value::{near_seek_old_value, new_old_value_cursor, OldValueCursors}, - service::ConnID, + service::ConnId, Error, Result, Task, }; @@ -81,10 +81,10 @@ pub(crate) struct Initializer { pub(crate) region_id: u64, pub(crate) region_epoch: RegionEpoch, - pub(crate) observe_id: ObserveID, - pub(crate) downstream_id: DownstreamID, + pub(crate) observe_id: ObserveId, + pub(crate) downstream_id: DownstreamId, pub(crate) downstream_state: Arc>, - pub(crate) conn_id: ConnID, + pub(crate) conn_id: ConnId, pub(crate) request_id: u64, pub(crate) checkpoint_ts: TimeStamp, @@ -632,10 +632,10 @@ mod tests { region_id: 1, region_epoch: RegionEpoch::default(), - observe_id: ObserveID::new(), - downstream_id: DownstreamID::new(), + observe_id: ObserveId::new(), + downstream_id: DownstreamId::new(), downstream_state, - conn_id: ConnID::new(), + conn_id: ConnId::new(), request_id: 0, checkpoint_ts: 1.into(), speed_limiter: Limiter::new(speed_limit as _), diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 124757d7697..d7da79c0361 100644 --- 
a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -28,7 +28,7 @@ use crate::{ #[derive(Clone, Debug, Eq, PartialEq)] pub struct RawRegionTs { pub region_id: u64, - pub cdc_id: ObserveID, + pub cdc_id: ObserveId, pub max_ts: TimeStamp, } @@ -42,7 +42,7 @@ pub struct CdcObserver { sched: Scheduler, // A shared registry for managing observed regions. // TODO: it may become a bottleneck, find a better way to manage the registry. - observe_regions: Arc>>, + observe_regions: Arc>>, api_version: ApiVersion, } @@ -76,8 +76,8 @@ impl CdcObserver { /// Subscribe an region, the observer will sink events of the region into /// its scheduler. /// - /// Return previous ObserveID if there is one. - pub fn subscribe_region(&self, region_id: u64, observe_id: ObserveID) -> Option { + /// Return previous ObserveId if there is one. + pub fn subscribe_region(&self, region_id: u64, observe_id: ObserveId) -> Option { self.observe_regions .write() .unwrap() @@ -87,9 +87,9 @@ impl CdcObserver { /// Stops observe the region. /// /// Return ObserverID if unsubscribe successfully. - pub fn unsubscribe_region(&self, region_id: u64, observe_id: ObserveID) -> Option { + pub fn unsubscribe_region(&self, region_id: u64, observe_id: ObserveId) -> Option { let mut regions = self.observe_regions.write().unwrap(); - // To avoid ABA problem, we must check the unique ObserveID. + // To avoid ABA problem, we must check the unique ObserveId. if let Some(oid) = regions.get(®ion_id) { if *oid == observe_id { return regions.remove(®ion_id); @@ -99,7 +99,7 @@ impl CdcObserver { } /// Check whether the region is subscribed or not. 
- pub fn is_subscribed(&self, region_id: u64) -> Option { + pub fn is_subscribed(&self, region_id: u64) -> Option { self.observe_regions .read() .unwrap() @@ -364,7 +364,7 @@ mod tests { observer.on_role_change(&mut ctx, &RoleChange::new(StateRole::Follower)); rx.recv_timeout(Duration::from_millis(10)).unwrap_err(); - let oid = ObserveID::new(); + let oid = ObserveId::new(); observer.subscribe_region(1, oid); let mut ctx = ObserverContext::new(®ion); @@ -440,7 +440,7 @@ mod tests { }; // unsubscribed fail if observer id is different. - assert_eq!(observer.unsubscribe_region(1, ObserveID::new()), None); + assert_eq!(observer.unsubscribe_region(1, ObserveId::new()), None); // No event if it is unsubscribed. let oid_ = observer.unsubscribe_region(1, oid).unwrap(); diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index 80d0f8c47a4..e7bec568f67 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -26,7 +26,7 @@ use tikv_util::{error, info, warn, worker::*}; use crate::{ channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY}, - delegate::{Downstream, DownstreamID, DownstreamState}, + delegate::{Downstream, DownstreamId, DownstreamState}, endpoint::{Deregister, Task}, }; @@ -34,15 +34,15 @@ static CONNECTION_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier of a Connection. 
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] -pub struct ConnID(usize); +pub struct ConnId(usize); -impl ConnID { - pub fn new() -> ConnID { - ConnID(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl ConnId { + pub fn new() -> ConnId { + ConnId(CONNECTION_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } -impl Default for ConnID { +impl Default for ConnId { fn default() -> Self { Self::new() } @@ -74,10 +74,10 @@ impl FeatureGate { } pub struct Conn { - id: ConnID, + id: ConnId, sink: Sink, - // region id -> DownstreamID - downstreams: HashMap>)>, + // region id -> DownstreamId + downstreams: HashMap>)>, peer: String, version: Option<(semver::Version, FeatureGate)>, } @@ -85,7 +85,7 @@ pub struct Conn { impl Conn { pub fn new(sink: Sink, peer: String) -> Conn { Conn { - id: ConnID::new(), + id: ConnId::new(), sink, downstreams: HashMap::default(), version: None, @@ -132,19 +132,19 @@ impl Conn { &self.peer } - pub fn get_id(&self) -> ConnID { + pub fn get_id(&self) -> ConnId { self.id } pub fn get_downstreams( &self, - ) -> &HashMap>)> { + ) -> &HashMap>)> { &self.downstreams } pub fn take_downstreams( self, - ) -> HashMap>)> { + ) -> HashMap>)> { self.downstreams } @@ -155,7 +155,7 @@ impl Conn { pub fn subscribe( &mut self, region_id: u64, - downstream_id: DownstreamID, + downstream_id: DownstreamId, downstream_state: Arc>, ) -> bool { match self.downstreams.entry(region_id) { @@ -171,7 +171,7 @@ impl Conn { self.downstreams.remove(®ion_id); } - pub fn downstream_id(&self, region_id: u64) -> Option { + pub fn downstream_id(&self, region_id: u64) -> Option { self.downstreams.get(®ion_id).map(|x| x.0) } } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index ebb18e92de5..f749f78851c 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -307,7 +307,7 @@ pub fn to_raw_perf_level(level: engine_traits::PerfLevel) -> rocksdb::PerfLevel 
engine_traits::PerfLevel::EnableTimeExceptForMutex => { rocksdb::PerfLevel::EnableTimeExceptForMutex } - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex => { rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex } engine_traits::PerfLevel::EnableTime => rocksdb::PerfLevel::EnableTime, @@ -324,7 +324,7 @@ pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLeve engine_traits::PerfLevel::EnableTimeExceptForMutex } rocksdb::PerfLevel::EnableTimeAndCPUTimeExceptForMutex => { - engine_traits::PerfLevel::EnableTimeAndCPUTimeExceptForMutex + engine_traits::PerfLevel::EnableTimeAndCpuTimeExceptForMutex } rocksdb::PerfLevel::EnableTime => engine_traits::PerfLevel::EnableTime, rocksdb::PerfLevel::OutOfBounds => engine_traits::PerfLevel::OutOfBounds, diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index 6784891921b..c9960b50753 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -137,7 +137,7 @@ pub enum Error { #[error("{0:?}")] Other(#[from] Box), #[error("CF {0} not found")] - CFName(String), + CfName(String), #[error("Codec {0}")] Codec(#[from] tikv_util::codec::Error), #[error("The entries of region is unavailable")] @@ -155,7 +155,7 @@ impl ErrorCodeExt for Error { Error::NotInRange { .. 
} => error_code::engine::NOT_IN_RANGE, Error::Protobuf(_) => error_code::engine::PROTOBUF, Error::Io(_) => error_code::engine::IO, - Error::CFName(_) => error_code::engine::CF_NAME, + Error::CfName(_) => error_code::engine::CF_NAME, Error::Codec(_) => error_code::engine::CODEC, Error::Other(_) => error_code::UNKNOWN, Error::EntriesUnavailable => error_code::engine::DATALOSS, diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 0e709d164bd..72794fba5cd 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -191,7 +191,7 @@ //! //! At the end of this phase the `engine` crate will be deleted. //! -//! ## 3) "Pulling up" the generic abstractions through TiKv +//! ## 3) "Pulling up" the generic abstractions through TiKV //! //! With all of TiKV using the `engine_traits` traits in conjunction with the //! concrete `engine_rocks` types, we can push generic type parameters up diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index 56351fbeca5..ba48974a460 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -8,7 +8,7 @@ pub enum PerfLevel { Disable, EnableCount, EnableTimeExceptForMutex, - EnableTimeAndCPUTimeExceptForMutex, + EnableTimeAndCpuTimeExceptForMutex, EnableTime, OutOfBounds, } @@ -18,7 +18,7 @@ numeric_enum_serializing_mod! {perf_level_serde PerfLevel { Disable = 1, EnableCount = 2, EnableTimeExceptForMutex = 3, - EnableTimeAndCPUTimeExceptForMutex = 4, + EnableTimeAndCpuTimeExceptForMutex = 4, EnableTime = 5, OutOfBounds = 6, }} diff --git a/components/error_code/src/backup_stream.rs b/components/error_code/src/backup_stream.rs index fa11ff5b37d..9448169cc05 100644 --- a/components/error_code/src/backup_stream.rs +++ b/components/error_code/src/backup_stream.rs @@ -3,7 +3,7 @@ define_error_codes! 
{ "KV:LogBackup:", - ETCD => ("ETCD", + ETCD => ("Etcd", "Error during requesting the meta store(etcd)", "Please check the connectivity between TiKV and PD."), PROTO => ("Proto", @@ -23,7 +23,7 @@ define_error_codes! { "Malformed metadata found.", "The metadata format is unexpected, please check the compatibility between TiKV / BR." ), - IO => ("IO", + IO => ("Io", "Error during doing Input / Output operations.", "This is a generic error, please check the error message for further information." ), @@ -35,7 +35,7 @@ define_error_codes! { "Error during scheduling internal task.", "This is an internal error, and may happen if there are too many changes to observe, please ask the community for help." ), - PD => ("PD", + PD => ("Pd", "Error during requesting the Placement Driver.", "Please check the connectivity between TiKV and PD." ), diff --git a/components/error_code/src/causal_ts.rs b/components/error_code/src/causal_ts.rs index a5b2884a151..3f7f4e2a17e 100644 --- a/components/error_code/src/causal_ts.rs +++ b/components/error_code/src/causal_ts.rs @@ -4,9 +4,9 @@ define_error_codes!( "KV:CausalTs:", PD => ("PdClient", "", ""), - TSO => ("TSO", "", ""), - TSO_BATCH_USED_UP => ("TSO batch used up", "", ""), - BATCH_RENEW => ("Batch renew", "", ""), + TSO => ("Tso", "", ""), + TSO_BATCH_USED_UP => ("TsoBatchUsedUp", "", ""), + BATCH_RENEW => ("BatchRenew", "", ""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/error_code/src/cloud.rs b/components/error_code/src/cloud.rs index 63841761e7c..510481679dd 100644 --- a/components/error_code/src/cloud.rs +++ b/components/error_code/src/cloud.rs @@ -3,8 +3,8 @@ define_error_codes!( "KV:Cloud:", - IO => ("IO", "", ""), - SSL => ("SSL", "", ""), + IO => ("Io", "", ""), + SSL => ("Ssl", "", ""), PROTO => ("Proto", "", ""), UNKNOWN => ("Unknown", "", ""), TIMEOUT => ("Timeout", "", ""), diff --git a/components/error_code/src/encryption.rs b/components/error_code/src/encryption.rs index 069e98e3e6c..4204db84864 100644 
--- a/components/error_code/src/encryption.rs +++ b/components/error_code/src/encryption.rs @@ -4,7 +4,7 @@ define_error_codes!( "KV:Encryption:", ROCKS => ("Rocks", "", ""), - IO => ("IO", "", ""), + IO => ("Io", "", ""), CRYPTER => ("Crypter", "", ""), PROTO => ("Proto", "", ""), UNKNOWN_ENCRYPTION => ("UnknownEncryption", "", ""), diff --git a/components/error_code/src/engine.rs b/components/error_code/src/engine.rs index d29d658cb69..4bb66f09753 100644 --- a/components/error_code/src/engine.rs +++ b/components/error_code/src/engine.rs @@ -6,8 +6,8 @@ define_error_codes!( ENGINE => ("Engine", "", ""), NOT_IN_RANGE => ("NotInRange", "", ""), PROTOBUF => ("Protobuf", "", ""), - IO => ("IO", "", ""), - CF_NAME => ("CFName", "", ""), + IO => ("Io", "", ""), + CF_NAME => ("CfName", "", ""), CODEC => ("Codec", "", ""), DATALOSS => ("DataLoss", "", ""), DATACOMPACTED => ("DataCompacted", "", "") diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 018c86c3d39..3ca2ac0b29f 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -1,13 +1,13 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
define_error_codes!( - "KV:PD:", + "KV:Pd:", - IO => ("IO", "", ""), + IO => ("Io", "", ""), CLUSTER_BOOTSTRAPPED => ("ClusterBootstraped", "", ""), CLUSTER_NOT_BOOTSTRAPPED => ("ClusterNotBootstraped", "", ""), INCOMPATIBLE => ("Imcompatible", "", ""), - GRPC => ("gRPC", "", ""), + GRPC => ("Grpc", "", ""), STREAM_DISCONNECT => ("StreamDisconnect","",""), REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 4d38de92284..2926c69c21e 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -19,7 +19,7 @@ define_error_codes!( STALE_COMMAND => ("StaleCommand", "", ""), TRANSPORT => ("Transport", "", ""), COPROCESSOR => ("Coprocessor", "", ""), - IO => ("IO", "", ""), + IO => ("Io", "", ""), PROTOBUF => ("Protobuf", "", ""), ADDR_PARSE => ("AddressParse", "", ""), TIMEOUT => ("Timeout", "", ""), diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index e24209c92a1..2eb6177458b 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -1,13 +1,13 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
define_error_codes!( - "KV:SSTImporter:", + "KV:SstImporter:", IO => ("Io", "", ""), - GRPC => ("gRPC", "", ""), + GRPC => ("Grpc", "", ""), UUID => ("Uuid", "", ""), FUTURE => ("Future", "", ""), - ROCKSDB => ("RocksDB", "", ""), + ROCKSDB => ("RocksDb", "", ""), PARSE_INT_ERROR => ("ParseIntError", "", ""), FILE_EXISTS => ("FileExists", "", ""), FILE_CORRUPTED => ("FileCorrupted", "", ""), diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index 5336ab80bb0..61b81215438 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -10,12 +10,12 @@ define_error_codes!( SCHED_TOO_BUSY => ("SchedTooBusy", "", ""), GC_WORKER_TOO_BUSY => ("GcWorkerTooBusy", "", ""), KEY_TOO_LARGE => ("KeyTooLarge", "", ""), - INVALID_CF => ("InvalidCF", "", ""), - CF_DEPRECATED => ("CFDeprecated", "", ""), + INVALID_CF => ("InvalidCf", "", ""), + CF_DEPRECATED => ("CfDeprecated", "", ""), TTL_NOT_ENABLED => ("TtlNotEnabled", "", ""), TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), PROTOBUF => ("Protobuf", "", ""), - INVALID_TXN_TSO => ("INVALIDTXNTSO", "", ""), + INVALID_TXN_TSO => ("InvalidTxnTso", "", ""), INVALID_REQ_RANGE => ("InvalidReqRange", "", ""), BAD_FORMAT_LOCK => ("BadFormatLock", "", ""), BAD_FORMAT_WRITE => ("BadFormatWrite", "",""), diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 07856ebe9c0..ceb772bee6e 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -33,17 +33,17 @@ thread_local! 
{ } #[derive(Debug)] -struct ThreadID { +struct ThreadId { pid: Pid, tid: Pid, proc_reader: Option>, } -impl ThreadID { - fn current() -> ThreadID { +impl ThreadId { + fn current() -> ThreadId { let pid = thread::process_id(); let tid = thread::thread_id(); - ThreadID { + ThreadId { pid, tid, proc_reader: None, @@ -102,7 +102,7 @@ impl ThreadID { } struct LocalIoStats { - id: ThreadID, + id: ThreadId, io_type: IoType, last_flushed: IoBytes, } @@ -110,7 +110,7 @@ struct LocalIoStats { impl LocalIoStats { fn current() -> Self { LocalIoStats { - id: ThreadID::current(), + id: ThreadId::current(), io_type: IoType::Other, last_flushed: IoBytes::default(), } @@ -197,7 +197,7 @@ mod tests { fn test_read_bytes() { let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_read_bytes.txt"); - let mut id = ThreadID::current(); + let mut id = ThreadId::current(); let _type = WithIoType::new(IoType::Compaction); { let mut f = OpenOptions::new() @@ -229,7 +229,7 @@ mod tests { fn test_write_bytes() { let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); let file_path = tmp.path().join("test_write_bytes.txt"); - let mut id = ThreadID::current(); + let mut id = ThreadId::current(); let _type = WithIoType::new(IoType::Compaction); let mut f = OpenOptions::new() .write(true) @@ -250,7 +250,7 @@ mod tests { #[bench] fn bench_fetch_thread_io_bytes(b: &mut test::Bencher) { - let mut id = ThreadID::current(); + let mut id = ThreadId::current(); b.iter(|| id.fetch_io_bytes().unwrap()); } } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 9f82c90968b..82313ae7d4e 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -295,34 +295,34 @@ static OBSERVE_ID_ALLOC: AtomicUsize = AtomicUsize::new(0); /// A unique identifier for checking stale observed commands. 
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] -pub struct ObserveID(usize); +pub struct ObserveId(usize); -impl ObserveID { - pub fn new() -> ObserveID { - ObserveID(OBSERVE_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) +impl ObserveId { + pub fn new() -> ObserveId { + ObserveId(OBSERVE_ID_ALLOC.fetch_add(1, Ordering::SeqCst)) } } /// ObserveHandle is the status of a term of observing, it contains the -/// `ObserveID` and the `observing` flag indicate whether the observing is +/// `ObserveId` and the `observing` flag indicate whether the observing is /// ongoing #[derive(Clone, Default, Debug)] pub struct ObserveHandle { - pub id: ObserveID, + pub id: ObserveId, observing: Arc, } impl ObserveHandle { pub fn new() -> ObserveHandle { ObserveHandle { - id: ObserveID::new(), + id: ObserveId::new(), observing: Arc::new(AtomicBool::new(true)), } } pub fn with_id(id: usize) -> ObserveHandle { ObserveHandle { - id: ObserveID(id), + id: ObserveId(id), observing: Arc::new(AtomicBool::new(true)), } } @@ -412,9 +412,9 @@ pub enum ObserveLevel { #[derive(Clone, Debug)] pub struct CmdBatch { pub level: ObserveLevel, - pub cdc_id: ObserveID, - pub rts_id: ObserveID, - pub pitr_id: ObserveID, + pub cdc_id: ObserveId, + pub rts_id: ObserveId, + pub pitr_id: ObserveId, pub region_id: u64, pub cmds: Vec, } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 1b707a42921..db62674e6a5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -692,7 +692,7 @@ fn timespec_to_u64(ts: Timespec) -> u64 { /// /// # Panics /// -/// If nsec is negative or GE than 1_000_000_000(nano seconds pre second). +/// If nsec (nano seconds pre second) is not in [0, 1_000_000_000) range. 
#[inline] pub(crate) fn u64_to_timespec(u: u64) -> Timespec { let sec = u >> TIMESPEC_SEC_SHIFT; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 6a6aa53103d..97e8ee85d86 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -186,9 +186,9 @@ where id: u64, duration: RaftstoreDuration, }, - UpdateRegionCPUCollector(bool), - RegionCPURecords(Arc), - ReportMinResolvedTS { + UpdateRegionCpuCollector(bool), + RegionCpuRecords(Arc), + ReportMinResolvedTs { store_id: u64, min_resolved_ts: u64, }, @@ -409,16 +409,16 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } - Task::UpdateRegionCPUCollector(is_register) => { + Task::UpdateRegionCpuCollector(is_register) => { if is_register { return write!(f, "register region cpu collector"); } write!(f, "deregister region cpu collector") } - Task::RegionCPURecords(ref cpu_records) => { + Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } - Task::ReportMinResolvedTS { + Task::ReportMinResolvedTs { store_id, min_resolved_ts, } => { @@ -625,8 +625,8 @@ where ) { let start_time = TiInstant::now(); match auto_split_controller.refresh_and_check_cfg() { - SplitConfigChange::UpdateRegionCPUCollector(is_register) => { - if let Err(e) = scheduler.schedule(Task::UpdateRegionCPUCollector(is_register)) { + SplitConfigChange::UpdateRegionCpuCollector(is_register) => { + if let Err(e) = scheduler.schedule(Task::UpdateRegionCpuCollector(is_register)) { error!( "failed to register or deregister the region cpu collector"; "is_register" => is_register, @@ -680,7 +680,7 @@ where .min() .unwrap_or(0) }); - let task = Task::ReportMinResolvedTS { + let task = Task::ReportMinResolvedTs { store_id, min_resolved_ts, }; @@ -839,8 +839,8 @@ impl SlowScore { } } -// RegionCPUMeteringCollector is used to collect the 
region-related CPU info. -struct RegionCPUMeteringCollector +// RegionCpuMeteringCollector is used to collect the region-related CPU info. +struct RegionCpuMeteringCollector where EK: KvEngine, ER: RaftEngine, @@ -848,24 +848,24 @@ where scheduler: Scheduler>, } -impl RegionCPUMeteringCollector +impl RegionCpuMeteringCollector where EK: KvEngine, ER: RaftEngine, { - fn new(scheduler: Scheduler>) -> RegionCPUMeteringCollector { - RegionCPUMeteringCollector { scheduler } + fn new(scheduler: Scheduler>) -> RegionCpuMeteringCollector { + RegionCpuMeteringCollector { scheduler } } } -impl Collector for RegionCPUMeteringCollector +impl Collector for RegionCpuMeteringCollector where EK: KvEngine, ER: RaftEngine, { fn collect(&self, records: Arc) { self.scheduler - .schedule(Task::RegionCPURecords(records)) + .schedule(Task::RegionCpuRecords(records)) .ok(); } } @@ -940,7 +940,7 @@ where > 0.0 { region_cpu_records_collector = Some(collector_reg_handle.register( - Box::new(RegionCPUMeteringCollector::new(scheduler.clone())), + Box::new(RegionCpuMeteringCollector::new(scheduler.clone())), false, )); } @@ -1040,7 +1040,7 @@ where return; } self.region_cpu_records_collector = Some(self.collector_reg_handle.register( - Box::new(RegionCPUMeteringCollector::new(self.scheduler.clone())), + Box::new(RegionCpuMeteringCollector::new(self.scheduler.clone())), false, )); } @@ -2015,11 +2015,11 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), - Task::UpdateRegionCPUCollector(is_register) => { + Task::UpdateRegionCpuCollector(is_register) => { self.handle_update_region_cpu_collector(is_register) } - Task::RegionCPURecords(records) => self.handle_region_cpu_records(records), - Task::ReportMinResolvedTS { + Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), + 
Task::ReportMinResolvedTs { store_id, min_resolved_ts, } => self.handle_report_min_resolved_ts(store_id, min_resolved_ts), diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index addedc3d653..7c698905b72 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -555,7 +555,7 @@ impl SplitInfo { #[derive(PartialEq, Debug)] pub enum SplitConfigChange { Noop, - UpdateRegionCPUCollector(bool), + UpdateRegionCpuCollector(bool), } pub struct AutoSplitController { @@ -927,12 +927,12 @@ impl AutoSplitController { if self.cfg.region_cpu_overload_threshold_ratio <= 0.0 && incoming.region_cpu_overload_threshold_ratio > 0.0 { - cfg_change = SplitConfigChange::UpdateRegionCPUCollector(true); + cfg_change = SplitConfigChange::UpdateRegionCpuCollector(true); } if self.cfg.region_cpu_overload_threshold_ratio > 0.0 && incoming.region_cpu_overload_threshold_ratio <= 0.0 { - cfg_change = SplitConfigChange::UpdateRegionCPUCollector(false); + cfg_change = SplitConfigChange::UpdateRegionCpuCollector(false); } self.cfg = incoming.clone(); } @@ -1638,7 +1638,7 @@ mod tests { ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), - SplitConfigChange::UpdateRegionCPUCollector(false), + SplitConfigChange::UpdateRegionCpuCollector(false), ); assert_eq!( auto_split_controller @@ -1658,7 +1658,7 @@ mod tests { ); assert_eq!( auto_split_controller.refresh_and_check_cfg(), - SplitConfigChange::UpdateRegionCPUCollector(true), + SplitConfigChange::UpdateRegionCpuCollector(true), ); assert_eq!( auto_split_controller diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index e1c23652db8..57bf20e7d0b 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -106,8 +106,8 @@ impl AdvanceTsWorker { // Sync with concurrency manager so that it can work correctly 
when // optimizations like async commit is enabled. - // Note: This step must be done before scheduling `Task::MinTS` task, and the - // resolver must be checked in or after `Task::MinTS`' execution. + // Note: This step must be done before scheduling `Task::MinTs` task, and the + // resolver must be checked in or after `Task::MinTs`' execution. cm.update_max_ts(min_ts); if let Some(min_mem_lock_ts) = cm.global_min_lock_ts() { if min_mem_lock_ts < min_ts { diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 5a180a9b6c8..f2920e2af69 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -18,7 +18,7 @@ use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ - coprocessor::{CmdBatch, ObserveHandle, ObserveID}, + coprocessor::{CmdBatch, ObserveHandle, ObserveId}, router::RaftStoreRouter, store::{ fsm::StoreMeta, @@ -458,7 +458,7 @@ where } // Deregister current observed region and try to register it again. 
- fn re_register_region(&mut self, region_id: u64, observe_id: ObserveID, cause: String) { + fn re_register_region(&mut self, region_id: u64, observe_id: ObserveId, cause: String) { if let Some(observe_region) = self.regions.get(®ion_id) { if observe_region.handle.id != observe_id { warn!("resolved ts deregister region failed due to observe_id not match"); @@ -554,7 +554,7 @@ where fn handle_scan_locks( &mut self, region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, entries: Vec, apply_index: u64, ) { @@ -622,7 +622,7 @@ pub enum Task { }, ReRegisterRegion { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, cause: String, }, RegisterAdvanceEvent { @@ -638,7 +638,7 @@ pub enum Task { }, ScanLocks { region_id: u64, - observe_id: ObserveID, + observe_id: ObserveId, entries: Vec, apply_index: u64, }, diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 396fc7333da..4266103933f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -6,7 +6,7 @@ use engine_traits::KvEngine; use futures::compat::Future01CompatExt; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; use raftstore::{ - coprocessor::{ObserveHandle, ObserveID}, + coprocessor::{ObserveHandle, ObserveId}, router::RaftStoreRouter, store::{ fsm::ChangeObserver, @@ -33,7 +33,7 @@ const GET_SNAPSHOT_RETRY_TIME: u32 = 3; const GET_SNAPSHOT_RETRY_BACKOFF_STEP: Duration = Duration::from_millis(25); pub type BeforeStartCallback = Box; -pub type OnErrorCallback = Box; +pub type OnErrorCallback = Box; pub type OnEntriesCallback = Box, u64) + Send>; pub type IsCancelledCallback = Box bool + Send>; diff --git a/components/resolved_ts/src/sinker.rs b/components/resolved_ts/src/sinker.rs index 29eebce02ed..383e5f7acc7 100644 --- a/components/resolved_ts/src/sinker.rs +++ b/components/resolved_ts/src/sinker.rs @@ -3,14 +3,14 @@ use std::marker::PhantomData; use engine_traits::Snapshot; -use 
raftstore::{coprocessor::ObserveID, store::RegionSnapshot}; +use raftstore::{coprocessor::ObserveId, store::RegionSnapshot}; use txn_types::TimeStamp; use crate::cmd::ChangeLog; pub struct SinkCmd { pub region_id: u64, - pub observe_id: ObserveID, + pub observe_id: ObserveId, pub logs: Vec, } diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 7ada07d5206..29144c8ca18 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -215,7 +215,7 @@ fn run_dump_raft_engine_worker( #[cfg(test)] mod tests { - use tikv::config::TiKvConfig; + use tikv::config::TikvConfig; use super::*; @@ -230,7 +230,7 @@ mod tests { raftdb_wal_path.push("test-wal"); } - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.raft_store.raftdb_path = raftdb_path.to_str().unwrap().to_owned(); cfg.raftdb.wal_dir = raftdb_wal_path.to_str().unwrap().to_owned(); cfg.raft_engine.mut_config().dir = raft_engine_path.to_str().unwrap().to_owned(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index fe2b0dd1c26..1cb6a9b3b65 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -9,7 +9,7 @@ //! The entry point is `run_tikv`. //! //! Components are often used to initialize other components, and/or must be -//! explicitly stopped. We keep these components in the `TiKvServer` struct. +//! explicitly stopped. We keep these components in the `TikvServer` struct. 
use std::{ cmp, @@ -82,7 +82,7 @@ use raftstore::{ }; use security::SecurityManager; use tikv::{ - config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TiKvConfig}, + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, import::{ImportSstService, SstImporter}, @@ -140,10 +140,10 @@ const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu #[inline] -fn run_impl(config: TiKvConfig) { - let mut tikv = TiKvServer::::init::(config); +fn run_impl(config: TikvConfig) { + let mut tikv = TikvServer::::init::(config); - // Must be called after `TiKvServer::init`. + // Must be called after `TikvServer::init`. let memory_limit = tikv.config.memory_usage_limit.unwrap().0; let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; register_memory_usage_high_water(high_water); @@ -170,7 +170,7 @@ fn run_impl(config: TiKvConfig) { /// Run a TiKV server. Returns when the server is shutdown by the user, in which /// case the server will be properly stopped. -pub fn run_tikv(config: TiKvConfig) { +pub fn run_tikv(config: TikvConfig) { // Sets the global logger ASAP. // It is okay to use the config w/o `validate()`, // because `initial_logger()` handles various conditions. @@ -207,8 +207,8 @@ const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. -struct TiKvServer { - config: TiKvConfig, +struct TikvServer { + config: TikvConfig, cfg_controller: Option, security_mgr: Arc, pd_client: Arc, @@ -221,7 +221,7 @@ struct TiKvServer { store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. 
encryption_key_manager: Option>, - engines: Option>, + engines: Option>, servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, @@ -236,7 +236,7 @@ struct TiKvServer { tablet_factory: Option + Send + Sync>>, } -struct TiKvEngines { +struct TikvEngines { engines: Engines, store_meta: Arc>, engine: RaftKv>, @@ -257,11 +257,11 @@ type LocalServer = Server, resolve::PdStoreAddrResolver, LocalRaftKv>; type LocalRaftKv = RaftKv>; -impl TiKvServer +impl TikvServer where ER: RaftEngine, { - fn init(mut config: TiKvConfig) -> TiKvServer { + fn init(mut config: TikvConfig) -> TikvServer { tikv_util::thread_group::set_properties(Some(GroupProperties::default())); // It is okay use pd config and security config before `init_config`, // because these configs must be provided by command line, and only @@ -331,7 +331,7 @@ where info!("Causal timestamp provider startup."); } - TiKvServer { + TikvServer { config, cfg_controller: Some(cfg_controller), security_mgr, @@ -373,7 +373,7 @@ where /// - If the config can't pass `validate()` /// - If the max open file descriptor limit is not high enough to support /// the main database and the raft database. - fn init_config(mut config: TiKvConfig) -> ConfigController { + fn init_config(mut config: TikvConfig) -> ConfigController { validate_and_persist_config(&mut config, true); ensure_dir_exist(&config.storage.data_dir).unwrap(); @@ -408,7 +408,7 @@ where } fn connect_to_pd_cluster( - config: &mut TiKvConfig, + config: &mut TikvConfig, env: Arc, security_mgr: Arc, ) -> Arc { @@ -573,7 +573,7 @@ where engines.kv.clone(), ); - self.engines = Some(TiKvEngines { + self.engines = Some(TikvEngines { engines, store_meta, engine, @@ -813,7 +813,7 @@ where cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. 
cfg_controller.register( - tikv::config::Module::CDC, + tikv::config::Module::Cdc, Box::new(CdcConfigManager(cdc_worker.scheduler())), ); @@ -1535,7 +1535,7 @@ where pub trait ConfiguredRaftEngine: RaftEngine { fn build( - _: &TiKvConfig, + _: &TikvConfig, _: &Arc, _: &Option>, _: &Option, @@ -1548,7 +1548,7 @@ pub trait ConfiguredRaftEngine: RaftEngine { impl ConfiguredRaftEngine for RocksEngine { fn build( - config: &TiKvConfig, + config: &TikvConfig, env: &Arc, key_manager: &Option>, block_cache: &Option, @@ -1600,7 +1600,7 @@ impl ConfiguredRaftEngine for RocksEngine { impl ConfiguredRaftEngine for RaftLogEngine { fn build( - config: &TiKvConfig, + config: &TikvConfig, env: &Arc, key_manager: &Option>, block_cache: &Option, @@ -1637,7 +1637,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { } } -impl TiKvServer { +impl TikvServer { fn init_raw_engines( &mut self, flow_listener: engine_rocks::FlowListener, @@ -1724,7 +1724,7 @@ fn pre_start() { } } -fn check_system_config(config: &TiKvConfig) { +fn check_system_config(config: &TikvConfig) { info!("beginning system configuration check"); let mut rocksdb_max_open_files = config.rocksdb.max_open_files; if config.rocksdb.titan.enabled { diff --git a/components/server/src/setup.rs b/components/server/src/setup.rs index 4f49f6fb86e..5742eda8bc8 100644 --- a/components/server/src/setup.rs +++ b/components/server/src/setup.rs @@ -10,7 +10,7 @@ use std::{ use chrono::Local; use clap::ArgMatches; use collections::HashMap; -use tikv::config::{check_critical_config, persist_config, MetricConfig, TiKvConfig}; +use tikv::config::{check_critical_config, persist_config, MetricConfig, TikvConfig}; use tikv_util::{self, config, logger}; // A workaround for checking if log is initialized. 
@@ -74,7 +74,7 @@ fn make_engine_log_path(path: &str, sub_path: &str, filename: &str) -> String { } #[allow(dead_code)] -pub fn initial_logger(config: &TiKvConfig) { +pub fn initial_logger(config: &TikvConfig) { let rocksdb_info_log_path = if !config.rocksdb.info_log_dir.is_empty() { make_engine_log_path(&config.rocksdb.info_log_dir, "", DEFAULT_ROCKSDB_LOG_FILE) } else { @@ -142,7 +142,7 @@ pub fn initial_logger(config: &TiKvConfig) { rocksdb: R, raftdb: T, slow: Option, - config: &TiKvConfig, + config: &TikvConfig, ) where N: slog::Drain + Send + 'static, R: slog::Drain + Send + 'static, @@ -238,7 +238,7 @@ pub fn initial_metric(cfg: &MetricConfig) { } #[allow(dead_code)] -pub fn overwrite_config_with_cmd_args(config: &mut TiKvConfig, matches: &ArgMatches<'_>) { +pub fn overwrite_config_with_cmd_args(config: &mut TikvConfig, matches: &ArgMatches<'_>) { if let Some(level) = matches.value_of("log-level") { config.log.level = logger::get_level_by_string(level).unwrap().into(); config.log_level = slog::Level::Info.into(); @@ -303,7 +303,7 @@ pub fn overwrite_config_with_cmd_args(config: &mut TiKvConfig, matches: &ArgMatc } #[allow(dead_code)] -pub fn validate_and_persist_config(config: &mut TiKvConfig, persist: bool) { +pub fn validate_and_persist_config(config: &mut TikvConfig, persist: bool) { config.compatible_adjust(); if let Err(e) = config.validate() { fatal!("invalid configuration: {}", e); diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 3fc229aa6ee..51aabcbec01 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -19,7 +19,7 @@ pub fn error_inc(type_: &str, err: &Error) { Error::Io(..) => "io", Error::Grpc(..) => "grpc", Error::Uuid(..) => "uuid", - Error::RocksDB(..) => "rocksdb", + Error::RocksDb(..) => "rocksdb", Error::EngineTraits(..) => "engine_traits", Error::ParseIntError(..) => "parse_int", Error::FileExists(..) 
=> "file_exists", @@ -52,7 +52,7 @@ pub enum Error { // FIXME: Remove concrete 'rocks' type #[error("RocksDB {0}")] - RocksDB(String), + RocksDb(String), #[error("Engine {0:?}")] EngineTraits(#[from] engine_traits::Error), @@ -140,7 +140,7 @@ impl Error { impl From for Error { fn from(msg: String) -> Self { - Self::RocksDB(msg) + Self::RocksDb(msg) } } @@ -161,7 +161,7 @@ impl ErrorCodeExt for Error { Error::Grpc(_) => error_code::sst_importer::GRPC, Error::Uuid(_) => error_code::sst_importer::UUID, Error::Future(_) => error_code::sst_importer::FUTURE, - Error::RocksDB(_) => error_code::sst_importer::ROCKSDB, + Error::RocksDb(_) => error_code::sst_importer::ROCKSDB, Error::EngineTraits(e) => e.error_code(), Error::ParseIntError(_) => error_code::sst_importer::PARSE_INT_ERROR, Error::FileExists(..) => error_code::sst_importer::FILE_EXISTS, diff --git a/components/sst_importer/src/sst_writer.rs b/components/sst_importer/src/sst_writer.rs index 210f17fc168..70d30569557 100644 --- a/components/sst_importer/src/sst_writer.rs +++ b/components/sst_importer/src/sst_writer.rs @@ -61,7 +61,7 @@ impl TxnSstWriter { fn check_api_version(&self, key: &[u8]) -> Result<()> { let mode = K::parse_key_mode(key); - if self.api_version == ApiVersion::V2 && mode != KeyMode::Txn && mode != KeyMode::TiDB { + if self.api_version == ApiVersion::V2 && mode != KeyMode::Txn && mode != KeyMode::Tidb { return Err(Error::invalid_key_mode( SstWriterType::Txn, self.api_version, diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index e6622128243..d7bed05eddd 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -24,7 +24,7 @@ use tidb_query_common::storage::{ }; use tikv::{ config::BackupConfig, - coprocessor::{checksum_crc64_xor, dag::TiKvStorage}, + coprocessor::{checksum_crc64_xor, dag::TikvStorage}, storage::{ kv::{Engine, SnapContext}, SnapshotStore, @@ -355,7 +355,7 @@ impl TestSuite { false, ); let mut scanner = 
RangesScanner::new(RangesScannerOptions { - storage: TiKvStorage::new(snap_store, false), + storage: TikvStorage::new(snap_store, false), ranges: vec![Range::Interval(IntervalRange::from((start, end)))], scan_backward_in_range: false, is_key_only: false, diff --git a/components/test_coprocessor/src/dag.rs b/components/test_coprocessor/src/dag.rs index 740ece83e1a..76e91cc6ef5 100644 --- a/components/test_coprocessor/src/dag.rs +++ b/components/test_coprocessor/src/dag.rs @@ -15,7 +15,7 @@ use tipb::{ use super::*; -pub struct DAGSelect { +pub struct DagSelect { pub execs: Vec, pub cols: Vec, pub order_by: Vec, @@ -27,8 +27,8 @@ pub struct DAGSelect { pub paging_size: Option, } -impl DAGSelect { - pub fn from(table: &Table) -> DAGSelect { +impl DagSelect { + pub fn from(table: &Table) -> DagSelect { let mut exec = Executor::default(); exec.set_tp(ExecType::TypeTableScan); let mut tbl_scan = TableScan::default(); @@ -38,7 +38,7 @@ impl DAGSelect { tbl_scan.set_columns(columns_info); exec.set_tbl_scan(tbl_scan); - DAGSelect { + DagSelect { execs: vec![exec], cols: table.columns_info(), order_by: vec![], @@ -51,7 +51,7 @@ impl DAGSelect { } } - pub fn from_index(table: &Table, index: &Column) -> DAGSelect { + pub fn from_index(table: &Table, index: &Column) -> DagSelect { let idx = index.index; let mut exec = Executor::default(); exec.set_tp(ExecType::TypeIndexScan); @@ -65,7 +65,7 @@ impl DAGSelect { exec.set_idx_scan(scan); let range = table.get_index_range_all(idx); - DAGSelect { + DagSelect { execs: vec![exec], cols: columns_info.to_vec(), order_by: vec![], @@ -79,13 +79,13 @@ impl DAGSelect { } #[must_use] - pub fn limit(mut self, n: u64) -> DAGSelect { + pub fn limit(mut self, n: u64) -> DagSelect { self.limit = Some(n); self } #[must_use] - pub fn order_by(mut self, col: &Column, desc: bool) -> DAGSelect { + pub fn order_by(mut self, col: &Column, desc: bool) -> DagSelect { let col_offset = offset_for_column(&self.cols, col.id); let mut item = ByItem::default(); 
let mut expr = Expr::default(); @@ -99,12 +99,12 @@ impl DAGSelect { } #[must_use] - pub fn count(self, col: &Column) -> DAGSelect { + pub fn count(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Count) } #[must_use] - pub fn aggr_col(mut self, col: &Column, aggr_t: ExprType) -> DAGSelect { + pub fn aggr_col(mut self, col: &Column, aggr_t: ExprType) -> DagSelect { let col_offset = offset_for_column(&self.cols, col.id); let mut col_expr = Expr::default(); col_expr.set_field_type(col.as_field_type()); @@ -125,47 +125,47 @@ impl DAGSelect { } #[must_use] - pub fn first(self, col: &Column) -> DAGSelect { + pub fn first(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::First) } #[must_use] - pub fn sum(self, col: &Column) -> DAGSelect { + pub fn sum(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Sum) } #[must_use] - pub fn avg(self, col: &Column) -> DAGSelect { + pub fn avg(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Avg) } #[must_use] - pub fn max(self, col: &Column) -> DAGSelect { + pub fn max(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Max) } #[must_use] - pub fn min(self, col: &Column) -> DAGSelect { + pub fn min(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::Min) } #[must_use] - pub fn bit_and(self, col: &Column) -> DAGSelect { + pub fn bit_and(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitAnd) } #[must_use] - pub fn bit_or(self, col: &Column) -> DAGSelect { + pub fn bit_or(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitOr) } #[must_use] - pub fn bit_xor(self, col: &Column) -> DAGSelect { + pub fn bit_xor(self, col: &Column) -> DagSelect { self.aggr_col(col, ExprType::AggBitXor) } #[must_use] - pub fn group_by(mut self, cols: &[&Column]) -> DAGSelect { + pub fn group_by(mut self, cols: &[&Column]) -> DagSelect { for col in cols { let offset = offset_for_column(&self.cols, col.id); let mut expr = Expr::default(); @@ 
-178,13 +178,13 @@ impl DAGSelect { } #[must_use] - pub fn output_offsets(mut self, output_offsets: Option>) -> DAGSelect { + pub fn output_offsets(mut self, output_offsets: Option>) -> DagSelect { self.output_offsets = output_offsets; self } #[must_use] - pub fn where_expr(mut self, expr: Expr) -> DAGSelect { + pub fn where_expr(mut self, expr: Expr) -> DagSelect { let mut exec = Executor::default(); exec.set_tp(ExecType::TypeSelection); let mut selection = Selection::default(); @@ -195,20 +195,20 @@ impl DAGSelect { } #[must_use] - pub fn desc(mut self, desc: bool) -> DAGSelect { + pub fn desc(mut self, desc: bool) -> DagSelect { self.execs[0].mut_tbl_scan().set_desc(desc); self } #[must_use] - pub fn paging_size(mut self, paging_size: u64) -> DAGSelect { + pub fn paging_size(mut self, paging_size: u64) -> DagSelect { assert_ne!(paging_size, 0); self.paging_size = Some(paging_size); self } #[must_use] - pub fn key_ranges(mut self, key_ranges: Vec) -> DAGSelect { + pub fn key_ranges(mut self, key_ranges: Vec) -> DagSelect { self.key_ranges = key_ranges; self } diff --git a/components/test_raftstore/src/config.rs b/components/test_raftstore/src/config.rs index 15748773409..a86b8eb1bf0 100644 --- a/components/test_raftstore/src/config.rs +++ b/components/test_raftstore/src/config.rs @@ -2,25 +2,25 @@ use std::ops::{Deref, DerefMut}; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; #[derive(Clone)] pub struct Config { - pub tikv: TiKvConfig, + pub tikv: TikvConfig, pub prefer_mem: bool, } impl Deref for Config { - type Target = TiKvConfig; + type Target = TikvConfig; #[inline] - fn deref(&self) -> &TiKvConfig { + fn deref(&self) -> &TikvConfig { &self.tikv } } impl DerefMut for Config { #[inline] - fn deref_mut(&mut self) -> &mut TiKvConfig { + fn deref_mut(&mut self) -> &mut TikvConfig { &mut self.tikv } } diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index e33837ebd76..eaeaf6a4e0f 100644 --- 
a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -131,10 +131,10 @@ pub fn must_region_cleared(engine: &Engines, region } lazy_static! { - static ref TEST_CONFIG: TiKvConfig = { + static ref TEST_CONFIG: TikvConfig = { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let common_test_cfg = manifest_dir.join("src/common-test.toml"); - TiKvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", manifest_dir.display(), @@ -144,13 +144,13 @@ lazy_static! { }; } -pub fn new_tikv_config(cluster_id: u64) -> TiKvConfig { +pub fn new_tikv_config(cluster_id: u64) -> TikvConfig { let mut cfg = TEST_CONFIG.clone(); cfg.server.cluster_id = cluster_id; cfg } -pub fn new_tikv_config_with_api_ver(cluster_id: u64, api_ver: ApiVersion) -> TiKvConfig { +pub fn new_tikv_config_with_api_ver(cluster_id: u64, api_ver: ApiVersion) -> TikvConfig { let mut cfg = TEST_CONFIG.clone(); cfg.server.cluster_id = cluster_id; cfg.storage.set_api_version(api_ver); diff --git a/components/tidb_query_datatype/src/codec/chunk/chunk.rs b/components/tidb_query_datatype/src/codec/chunk/chunk.rs index ee111d11f77..b4478c8a4d3 100644 --- a/components/tidb_query_datatype/src/codec/chunk/chunk.rs +++ b/components/tidb_query_datatype/src/codec/chunk/chunk.rs @@ -188,7 +188,7 @@ mod tests { FieldTypeTp::DateTime.into(), FieldTypeTp::Duration.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), FieldTypeTp::String.into(), ]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); @@ -229,7 +229,7 @@ mod tests { FieldTypeTp::DateTime.into(), FieldTypeTp::Duration.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), FieldTypeTp::String.into(), ]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); @@ -329,7 +329,7 @@ mod tests { fn 
bench_encode_from_raw_json_datum(b: &mut Bencher) { let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); let datum = Datum::Json(json); - bench_encode_from_raw_datum_impl(b, datum, FieldTypeTp::JSON); + bench_encode_from_raw_datum_impl(b, datum, FieldTypeTp::Json); } #[test] @@ -341,7 +341,7 @@ mod tests { FieldTypeTp::VarChar.into(), FieldTypeTp::VarChar.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), ]; let mut chunk = Chunk::new(&fields, rows); diff --git a/components/tidb_query_datatype/src/codec/chunk/column.rs b/components/tidb_query_datatype/src/codec/chunk/column.rs index f7f13363686..ef1c2602864 100644 --- a/components/tidb_query_datatype/src/codec/chunk/column.rs +++ b/components/tidb_query_datatype/src/codec/chunk/column.rs @@ -316,7 +316,7 @@ impl Column { } FieldTypeTp::Duration => Datum::Dur(self.get_duration(idx, field_type.decimal())?), FieldTypeTp::NewDecimal => Datum::Dec(self.get_decimal(idx)?), - FieldTypeTp::JSON => Datum::Json(self.get_json(idx)?), + FieldTypeTp::Json => Datum::Json(self.get_json(idx)?), FieldTypeTp::Enum => Datum::Enum(self.get_enum(idx)?), FieldTypeTp::Bit => Datum::Bytes(self.get_bytes(idx).to_vec()), FieldTypeTp::Set => { @@ -1142,7 +1142,7 @@ mod tests { #[test] fn test_column_json() { - let fields: Vec = vec![FieldTypeTp::JSON.into()]; + let fields: Vec = vec![FieldTypeTp::Json.into()]; let json: Json = r#"{"k1":"v1"}"#.parse().unwrap(); let data = vec![Datum::Null, Datum::Json(json)]; diff --git a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs index 43a6289e640..6f27475ff2c 100644 --- a/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs +++ b/components/tidb_query_datatype/src/codec/collation/encoding/gbk.rs @@ -6,9 +6,9 @@ use super::*; use crate::codec::data_type::{BytesGuard, BytesWriter}; #[derive(Debug)] -pub struct EncodingGBK; +pub struct EncodingGbk; -impl 
Encoding for EncodingGBK { +impl Encoding for EncodingGbk { #[inline] fn decode(data: BytesRef<'_>) -> Result { match GBK.decode_without_bom_handling_and_without_replacement(data) { diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index b3033c06d84..cdc21cbe35a 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -49,10 +49,10 @@ macro_rules! match_template_charset { match_template::match_template! { $t = [ - UTF8 => EncodingUtf8, - UTF8Mb4 => EncodingUtf8Mb4, + Utf8 => EncodingUtf8, + Utf8Mb4 => EncodingUtf8Mb4, Latin1 => EncodingLatin1, - GBK => EncodingGBK, + Gbk => EncodingGbk, Binary => EncodingBinary, Ascii => EncodingAscii, ], diff --git a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs index 2e4a0703d4a..79c08ec5404 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs @@ -102,7 +102,7 @@ pub trait V1CompatibleEncoder: DatumFlagAndPayloadEncoder { // Copy datum payload as it is self.write_bytes(src)?; } - FieldTypeTp::JSON => { + FieldTypeTp::Json => { self.write_u8(datum::JSON_FLAG)?; // Copy datum payload as it is self.write_bytes(src)?; @@ -288,7 +288,7 @@ mod tests { let mut ctx = EvalContext::default(); for value in cases { - let col = Column::new(1, value.clone()).with_tp(FieldTypeTp::JSON); + let col = Column::new(1, value.clone()).with_tp(FieldTypeTp::Json); let buf = encode_to_v1_compatible(&mut ctx, &col); let got: Json = buf.decode(col.ft(), &mut ctx).unwrap().unwrap(); assert_eq!(value, got); diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 7155748571f..0c995487b3d 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ 
b/components/tidb_query_datatype/src/codec/table.rs @@ -284,7 +284,7 @@ fn unflatten( FieldTypeTp::VarChar, FieldTypeTp::String, FieldTypeTp::NewDecimal, - FieldTypeTp::JSON + FieldTypeTp::Json, ] .contains(&t), "unknown type {} {}", @@ -631,7 +631,7 @@ mod tests { (1, FieldTypeTp::LongLong.into()), (2, FieldTypeTp::VarChar.into()), (3, FieldTypeTp::NewDecimal.into()), - (5, FieldTypeTp::JSON.into()), + (5, FieldTypeTp::Json.into()), (6, duration_col), ]); diff --git a/components/tidb_query_datatype/src/def/eval_type.rs b/components/tidb_query_datatype/src/def/eval_type.rs index 855802119b9..e6cd7da1b6a 100644 --- a/components/tidb_query_datatype/src/def/eval_type.rs +++ b/components/tidb_query_datatype/src/def/eval_type.rs @@ -33,7 +33,7 @@ impl EvalType { EvalType::Bytes => crate::FieldTypeTp::String, EvalType::DateTime => crate::FieldTypeTp::DateTime, EvalType::Duration => crate::FieldTypeTp::Duration, - EvalType::Json => crate::FieldTypeTp::JSON, + EvalType::Json => crate::FieldTypeTp::Json, EvalType::Enum => crate::FieldTypeTp::Enum, EvalType::Set => crate::FieldTypeTp::Set, } @@ -66,7 +66,7 @@ impl std::convert::TryFrom for EvalType { | crate::FieldTypeTp::Date | crate::FieldTypeTp::DateTime => EvalType::DateTime, crate::FieldTypeTp::Duration => EvalType::Duration, - crate::FieldTypeTp::JSON => EvalType::Json, + crate::FieldTypeTp::Json => EvalType::Json, crate::FieldTypeTp::VarChar | crate::FieldTypeTp::TinyBlob | crate::FieldTypeTp::MediumBlob @@ -115,7 +115,7 @@ mod tests { (NewDate, None), (VarChar, Some(EvalType::Bytes)), (Bit, Some(EvalType::Int)), - (JSON, Some(EvalType::Json)), + (Json, Some(EvalType::Json)), (NewDecimal, Some(EvalType::Decimal)), (Enum, Some(EvalType::Enum)), (Set, None), diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index 417d7b0d146..903ec738e89 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ 
b/components/tidb_query_datatype/src/def/field_type.rs @@ -36,7 +36,7 @@ pub enum FieldTypeTp { NewDate = 14, VarChar = 15, Bit = 16, - JSON = 0xf5, + Json = 0xf5, NewDecimal = 0xf6, Enum = 0xf7, Set = 0xf8, @@ -52,7 +52,7 @@ pub enum FieldTypeTp { impl FieldTypeTp { fn from_i32(i: i32) -> Option { if (i >= FieldTypeTp::Unspecified as i32 && i <= FieldTypeTp::Bit as i32) - || (i >= FieldTypeTp::JSON as i32 && i <= FieldTypeTp::Geometry as i32) + || (i >= FieldTypeTp::Json as i32 && i <= FieldTypeTp::Geometry as i32) { Some(unsafe { ::std::mem::transmute::(i) }) } else { @@ -61,7 +61,7 @@ impl FieldTypeTp { } pub fn from_u8(i: u8) -> Option { - if i <= FieldTypeTp::Bit as u8 || i >= FieldTypeTp::JSON as u8 { + if i <= FieldTypeTp::Bit as u8 || i >= FieldTypeTp::Json as u8 { Some(unsafe { ::std::mem::transmute::(i32::from(i)) }) } else { None @@ -148,10 +148,10 @@ impl fmt::Display for Collation { #[derive(PartialEq, Debug, Clone, Copy)] pub enum Charset { - UTF8, - UTF8Mb4, + Utf8, + Utf8Mb4, Latin1, - GBK, + Gbk, Binary, Ascii, } @@ -159,10 +159,10 @@ pub enum Charset { impl Charset { pub fn from_name(name: &str) -> Result { match name { - "utf8mb4" => Ok(Charset::UTF8Mb4), - "utf8" => Ok(Charset::UTF8), + "utf8mb4" => Ok(Charset::Utf8Mb4), + "utf8" => Ok(Charset::Utf8), "latin1" => Ok(Charset::Latin1), - "gbk" => Ok(Charset::GBK), + "gbk" => Ok(Charset::Gbk), "binary" => Ok(Charset::Binary), "ascii" => Ok(Charset::Ascii), _ => Err(DataTypeError::UnsupportedCharset { @@ -471,7 +471,7 @@ mod tests { FieldTypeTp::NewDate, FieldTypeTp::VarChar, FieldTypeTp::Bit, - FieldTypeTp::JSON, + FieldTypeTp::Json, FieldTypeTp::NewDecimal, FieldTypeTp::Enum, FieldTypeTp::Set, @@ -556,9 +556,9 @@ mod tests { #[test] fn test_charset_from_str() { let cases = vec![ - ("gbk", Some(Charset::GBK)), - ("utf8mb4", Some(Charset::UTF8Mb4)), - ("utf8", Some(Charset::UTF8)), + ("gbk", Some(Charset::Gbk)), + ("utf8mb4", Some(Charset::Utf8Mb4)), + ("utf8", Some(Charset::Utf8)), ("binary", 
Some(Charset::Binary)), ("latin1", Some(Charset::Latin1)), ("ascii", Some(Charset::Ascii)), diff --git a/components/tidb_query_expr/src/impl_compare.rs b/components/tidb_query_expr/src/impl_compare.rs index 350b36a3a99..a8dbf96d1cb 100644 --- a/components/tidb_query_expr/src/impl_compare.rs +++ b/components/tidb_query_expr/src/impl_compare.rs @@ -153,63 +153,63 @@ pub trait CmpOp { fn compare_order(ordering: std::cmp::Ordering) -> bool; } -pub struct CmpOpLT; +pub struct CmpOpLt; -impl CmpOp for CmpOpLT { +impl CmpOp for CmpOpLt { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering == Ordering::Less } } -pub struct CmpOpLE; +pub struct CmpOpLe; -impl CmpOp for CmpOpLE { +impl CmpOp for CmpOpLe { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering != Ordering::Greater } } -pub struct CmpOpGT; +pub struct CmpOpGt; -impl CmpOp for CmpOpGT { +impl CmpOp for CmpOpGt { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering == Ordering::Greater } } -pub struct CmpOpGE; +pub struct CmpOpGe; -impl CmpOp for CmpOpGE { +impl CmpOp for CmpOpGe { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering != Ordering::Less } } -pub struct CmpOpNE; +pub struct CmpOpNe; -impl CmpOp for CmpOpNE { +impl CmpOp for CmpOpNe { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering != Ordering::Equal } } -pub struct CmpOpEQ; +pub struct CmpOpEq; -impl CmpOp for CmpOpEQ { +impl CmpOp for CmpOpEq { #[inline] fn compare_order(ordering: Ordering) -> bool { ordering == Ordering::Equal } } -pub struct CmpOpNullEQ; +pub struct CmpOpNullEq; -impl CmpOp for CmpOpNullEQ { +impl CmpOp for CmpOpNullEq { #[inline] fn compare_null() -> Option { Some(1) @@ -547,220 +547,220 @@ mod tests { #[derive(Clone, Copy, PartialEq)] enum TestCaseCmpOp { - GT, - GE, - LT, - LE, - EQ, - NE, - NullEQ, + Gt, + Ge, + Lt, + Le, + Eq, + Ne, + NullEq, } #[allow(clippy::type_complexity)] fn generate_numeric_compare_cases() -> Vec<(Option, Option, TestCaseCmpOp, 
Option)> { vec![ - (None, None, TestCaseCmpOp::GT, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::GT, None), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::GT, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::GT, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::GT, None), + (None, None, TestCaseCmpOp::Gt, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Gt, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Gt, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Gt, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Gt, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::GT, + TestCaseCmpOp::Gt, Some(1), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::GT, + TestCaseCmpOp::Gt, Some(0), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::GT, + TestCaseCmpOp::Gt, Some(0), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::GT, + TestCaseCmpOp::Gt, Some(0), ), - (None, None, TestCaseCmpOp::GE, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::GE, None), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::GE, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::GE, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::GE, None), + (None, None, TestCaseCmpOp::Ge, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Ge, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Ge, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Ge, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Ge, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::GE, + TestCaseCmpOp::Ge, Some(1), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::GE, + TestCaseCmpOp::Ge, Some(0), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::GE, + TestCaseCmpOp::Ge, Some(1), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::GE, + TestCaseCmpOp::Ge, Some(1), ), - (None, None, TestCaseCmpOp::LT, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::LT, None), - (Real::new(-2.1).ok(), 
None, TestCaseCmpOp::LT, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::LT, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::LT, None), + (None, None, TestCaseCmpOp::Lt, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Lt, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Lt, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Lt, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Lt, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::LT, + TestCaseCmpOp::Lt, Some(0), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::LT, + TestCaseCmpOp::Lt, Some(1), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::LT, + TestCaseCmpOp::Lt, Some(0), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::LT, + TestCaseCmpOp::Lt, Some(0), ), - (None, None, TestCaseCmpOp::LE, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::LE, None), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::LE, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::LE, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::LE, None), + (None, None, TestCaseCmpOp::Le, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Le, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Le, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Le, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Le, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::LE, + TestCaseCmpOp::Le, Some(0), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::LE, + TestCaseCmpOp::Le, Some(1), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::LE, + TestCaseCmpOp::Le, Some(1), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::LE, + TestCaseCmpOp::Le, Some(1), ), - (None, None, TestCaseCmpOp::EQ, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::EQ, None), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::EQ, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::EQ, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::EQ, 
None), + (None, None, TestCaseCmpOp::Eq, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Eq, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Eq, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Eq, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Eq, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::EQ, + TestCaseCmpOp::Eq, Some(0), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::EQ, + TestCaseCmpOp::Eq, Some(0), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::EQ, + TestCaseCmpOp::Eq, Some(1), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::EQ, + TestCaseCmpOp::Eq, Some(1), ), - (None, None, TestCaseCmpOp::NE, None), - (Real::new(3.5).ok(), None, TestCaseCmpOp::NE, None), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::NE, None), - (None, Real::new(3.5).ok(), TestCaseCmpOp::NE, None), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::NE, None), + (None, None, TestCaseCmpOp::Ne, None), + (Real::new(3.5).ok(), None, TestCaseCmpOp::Ne, None), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::Ne, None), + (None, Real::new(3.5).ok(), TestCaseCmpOp::Ne, None), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::Ne, None), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::NE, + TestCaseCmpOp::Ne, Some(1), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::NE, + TestCaseCmpOp::Ne, Some(1), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::NE, + TestCaseCmpOp::Ne, Some(0), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::NE, + TestCaseCmpOp::Ne, Some(0), ), - (None, None, TestCaseCmpOp::NullEQ, Some(1)), - (Real::new(3.5).ok(), None, TestCaseCmpOp::NullEQ, Some(0)), - (Real::new(-2.1).ok(), None, TestCaseCmpOp::NullEQ, Some(0)), - (None, Real::new(3.5).ok(), TestCaseCmpOp::NullEQ, Some(0)), - (None, Real::new(-2.1).ok(), TestCaseCmpOp::NullEQ, Some(0)), + (None, None, TestCaseCmpOp::NullEq, Some(1)), + (Real::new(3.5).ok(), None, 
TestCaseCmpOp::NullEq, Some(0)), + (Real::new(-2.1).ok(), None, TestCaseCmpOp::NullEq, Some(0)), + (None, Real::new(3.5).ok(), TestCaseCmpOp::NullEq, Some(0)), + (None, Real::new(-2.1).ok(), TestCaseCmpOp::NullEq, Some(0)), ( Real::new(3.5).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::NullEQ, + TestCaseCmpOp::NullEq, Some(0), ), ( Real::new(-2.1).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::NullEQ, + TestCaseCmpOp::NullEq, Some(0), ), ( Real::new(3.5).ok(), Real::new(3.5).ok(), - TestCaseCmpOp::NullEQ, + TestCaseCmpOp::NullEq, Some(1), ), ( Real::new(-2.1).ok(), Real::new(-2.1).ok(), - TestCaseCmpOp::NullEQ, + TestCaseCmpOp::NullEq, Some(1), ), ] @@ -770,13 +770,13 @@ mod tests { fn test_compare_real() { for (arg0, arg1, cmp_op, expect_output) in generate_numeric_compare_cases() { let sig = match cmp_op { - TestCaseCmpOp::GT => ScalarFuncSig::GtReal, - TestCaseCmpOp::GE => ScalarFuncSig::GeReal, - TestCaseCmpOp::LT => ScalarFuncSig::LtReal, - TestCaseCmpOp::LE => ScalarFuncSig::LeReal, - TestCaseCmpOp::EQ => ScalarFuncSig::EqReal, - TestCaseCmpOp::NE => ScalarFuncSig::NeReal, - TestCaseCmpOp::NullEQ => ScalarFuncSig::NullEqReal, + TestCaseCmpOp::Gt => ScalarFuncSig::GtReal, + TestCaseCmpOp::Ge => ScalarFuncSig::GeReal, + TestCaseCmpOp::Lt => ScalarFuncSig::LtReal, + TestCaseCmpOp::Le => ScalarFuncSig::LeReal, + TestCaseCmpOp::Eq => ScalarFuncSig::EqReal, + TestCaseCmpOp::Ne => ScalarFuncSig::NeReal, + TestCaseCmpOp::NullEq => ScalarFuncSig::NullEqReal, }; let output = RpnFnScalarEvaluator::new() .push_param(arg0) @@ -795,13 +795,13 @@ mod tests { for (arg0, arg1, cmp_op, expect_output) in generate_numeric_compare_cases() { let sig = match cmp_op { - TestCaseCmpOp::GT => ScalarFuncSig::GtDuration, - TestCaseCmpOp::GE => ScalarFuncSig::GeDuration, - TestCaseCmpOp::LT => ScalarFuncSig::LtDuration, - TestCaseCmpOp::LE => ScalarFuncSig::LeDuration, - TestCaseCmpOp::EQ => ScalarFuncSig::EqDuration, - TestCaseCmpOp::NE => ScalarFuncSig::NeDuration, - TestCaseCmpOp::NullEQ 
=> ScalarFuncSig::NullEqDuration, + TestCaseCmpOp::Gt => ScalarFuncSig::GtDuration, + TestCaseCmpOp::Ge => ScalarFuncSig::GeDuration, + TestCaseCmpOp::Lt => ScalarFuncSig::LtDuration, + TestCaseCmpOp::Le => ScalarFuncSig::LeDuration, + TestCaseCmpOp::Eq => ScalarFuncSig::EqDuration, + TestCaseCmpOp::Ne => ScalarFuncSig::NeDuration, + TestCaseCmpOp::NullEq => ScalarFuncSig::NullEqDuration, }; let output = RpnFnScalarEvaluator::new() .push_param(arg0.map(map_double_to_duration)) @@ -822,13 +822,13 @@ mod tests { let mut ctx = EvalContext::default(); for (arg0, arg1, cmp_op, expect_output) in generate_numeric_compare_cases() { let sig = match cmp_op { - TestCaseCmpOp::GT => ScalarFuncSig::GtDecimal, - TestCaseCmpOp::GE => ScalarFuncSig::GeDecimal, - TestCaseCmpOp::LT => ScalarFuncSig::LtDecimal, - TestCaseCmpOp::LE => ScalarFuncSig::LeDecimal, - TestCaseCmpOp::EQ => ScalarFuncSig::EqDecimal, - TestCaseCmpOp::NE => ScalarFuncSig::NeDecimal, - TestCaseCmpOp::NullEQ => ScalarFuncSig::NullEqDecimal, + TestCaseCmpOp::Gt => ScalarFuncSig::GtDecimal, + TestCaseCmpOp::Ge => ScalarFuncSig::GeDecimal, + TestCaseCmpOp::Lt => ScalarFuncSig::LtDecimal, + TestCaseCmpOp::Le => ScalarFuncSig::LeDecimal, + TestCaseCmpOp::Eq => ScalarFuncSig::EqDecimal, + TestCaseCmpOp::Ne => ScalarFuncSig::NeDecimal, + TestCaseCmpOp::NullEq => ScalarFuncSig::NullEqDecimal, }; let output = RpnFnScalarEvaluator::new() .push_param(arg0.map(|v| f64_to_decimal(&mut ctx, v.into_inner()).unwrap())) @@ -843,13 +843,13 @@ mod tests { fn test_compare_signed_int() { for (arg0, arg1, cmp_op, expect_output) in generate_numeric_compare_cases() { let sig = match cmp_op { - TestCaseCmpOp::GT => ScalarFuncSig::GtInt, - TestCaseCmpOp::GE => ScalarFuncSig::GeInt, - TestCaseCmpOp::LT => ScalarFuncSig::LtInt, - TestCaseCmpOp::LE => ScalarFuncSig::LeInt, - TestCaseCmpOp::EQ => ScalarFuncSig::EqInt, - TestCaseCmpOp::NE => ScalarFuncSig::NeInt, - TestCaseCmpOp::NullEQ => ScalarFuncSig::NullEqInt, + TestCaseCmpOp::Gt => 
ScalarFuncSig::GtInt, + TestCaseCmpOp::Ge => ScalarFuncSig::GeInt, + TestCaseCmpOp::Lt => ScalarFuncSig::LtInt, + TestCaseCmpOp::Le => ScalarFuncSig::LeInt, + TestCaseCmpOp::Eq => ScalarFuncSig::EqInt, + TestCaseCmpOp::Ne => ScalarFuncSig::NeInt, + TestCaseCmpOp::NullEq => ScalarFuncSig::NullEqInt, }; let output = RpnFnScalarEvaluator::new() .push_param(arg0.map(|v| v.into_inner() as i64)) diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index 55e86ee14d0..abd190d077a 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -345,7 +345,7 @@ fn rand() -> Result> { #[inline] #[rpn_fn(nullable)] fn rand_with_seed_first_gen(seed: Option<&i64>) -> Result> { - let mut rng = MySQLRng::new_with_seed(seed.cloned().unwrap_or(0)); + let mut rng = MySqlRng::new_with_seed(seed.cloned().unwrap_or(0)); let res = rng.gen(); Ok(Real::new(res).ok()) } @@ -548,7 +548,7 @@ pub fn round_with_frac_real(arg0: &Real, arg1: &Int) -> Result> { } thread_local! 
{ - static MYSQL_RNG: RefCell = RefCell::new(MySQLRng::new()) + static MYSQL_RNG: RefCell = RefCell::new(MySqlRng::new()) } #[derive(Copy, Clone)] @@ -672,12 +672,12 @@ pub fn i64_to_usize(i: i64, is_unsigned: bool) -> (usize, bool) { } } -pub struct MySQLRng { +pub struct MySqlRng { seed1: u32, seed2: u32, } -impl MySQLRng { +impl MySqlRng { fn new() -> Self { let current_time = time::get_time(); let nsec = i64::from(current_time.nsec); @@ -687,7 +687,7 @@ impl MySQLRng { fn new_with_seed(seed: i64) -> Self { let seed1 = (seed.wrapping_mul(0x10001).wrapping_add(55555555)) as u32 % MAX_RAND_VALUE; let seed2 = (seed.wrapping_mul(0x10000001)) as u32 % MAX_RAND_VALUE; - MySQLRng { seed1, seed2 } + MySqlRng { seed1, seed2 } } fn gen(&mut self) -> f64 { @@ -697,7 +697,7 @@ impl MySQLRng { } } -impl Default for MySQLRng { +impl Default for MySqlRng { fn default() -> Self { Self::new() } @@ -2030,9 +2030,9 @@ mod tests { #[test] #[allow(clippy::float_cmp)] fn test_rand_new() { - let mut rng1 = MySQLRng::new(); + let mut rng1 = MySqlRng::new(); std::thread::sleep(std::time::Duration::from_millis(100)); - let mut rng2 = MySQLRng::new(); + let mut rng2 = MySqlRng::new(); let got1 = rng1.gen(); let got2 = rng2.gen(); assert!(got1 < 1.0); @@ -2054,7 +2054,7 @@ mod tests { (9223372036854775807, 0.9050373219931845, 0.37014932126752037), ]; for (seed, exp1, exp2) in tests { - let mut rand = MySQLRng::new_with_seed(seed); + let mut rand = MySqlRng::new_with_seed(seed); let res1 = rand.gen(); assert_eq!(res1, exp1); let res2 = rand.gen(); diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 679d4e003f8..b5a2ce226c5 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -430,20 +430,20 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::FromBinary => map_from_binary_fn_sig(expr)?, // impl_compare - ScalarFuncSig::LtInt => map_int_sig(value, children, compare_mapper::)?, - 
ScalarFuncSig::LtReal => compare_fn_meta::>(), - ScalarFuncSig::LtDecimal => compare_fn_meta::>(), - ScalarFuncSig::LtString => map_string_compare_sig::(ft)?, - ScalarFuncSig::LtTime => compare_fn_meta::>(), - ScalarFuncSig::LtDuration => compare_fn_meta::>(), - ScalarFuncSig::LtJson => compare_json_fn_meta::(), - ScalarFuncSig::LeInt => map_int_sig(value, children, compare_mapper::)?, - ScalarFuncSig::LeReal => compare_fn_meta::>(), - ScalarFuncSig::LeDecimal => compare_fn_meta::>(), - ScalarFuncSig::LeString => map_string_compare_sig::(ft)?, - ScalarFuncSig::LeTime => compare_fn_meta::>(), - ScalarFuncSig::LeDuration => compare_fn_meta::>(), - ScalarFuncSig::LeJson => compare_json_fn_meta::(), + ScalarFuncSig::LtInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::LtReal => compare_fn_meta::>(), + ScalarFuncSig::LtDecimal => compare_fn_meta::>(), + ScalarFuncSig::LtString => map_string_compare_sig::(ft)?, + ScalarFuncSig::LtTime => compare_fn_meta::>(), + ScalarFuncSig::LtDuration => compare_fn_meta::>(), + ScalarFuncSig::LtJson => compare_json_fn_meta::(), + ScalarFuncSig::LeInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::LeReal => compare_fn_meta::>(), + ScalarFuncSig::LeDecimal => compare_fn_meta::>(), + ScalarFuncSig::LeString => map_string_compare_sig::(ft)?, + ScalarFuncSig::LeTime => compare_fn_meta::>(), + ScalarFuncSig::LeDuration => compare_fn_meta::>(), + ScalarFuncSig::LeJson => compare_json_fn_meta::(), ScalarFuncSig::GreatestInt => greatest_int_fn_meta(), ScalarFuncSig::GreatestDecimal => greatest_decimal_fn_meta(), ScalarFuncSig::GreatestString => greatest_string_fn_meta(), @@ -464,41 +464,41 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::LeastCmpStringAsTime=> least_cmp_string_as_time_fn_meta(), ScalarFuncSig::LeastDuration => least_duration_fn_meta(), ScalarFuncSig::IntervalReal => interval_real_fn_meta(), - ScalarFuncSig::GtInt => map_int_sig(value, children, 
compare_mapper::)?, - ScalarFuncSig::GtReal => compare_fn_meta::>(), - ScalarFuncSig::GtDecimal => compare_fn_meta::>(), - ScalarFuncSig::GtString => map_string_compare_sig::(ft)?, - ScalarFuncSig::GtTime => compare_fn_meta::>(), - ScalarFuncSig::GtDuration => compare_fn_meta::>(), - ScalarFuncSig::GtJson => compare_json_fn_meta::(), - ScalarFuncSig::GeInt => map_int_sig(value, children, compare_mapper::)?, - ScalarFuncSig::GeReal => compare_fn_meta::>(), - ScalarFuncSig::GeDecimal => compare_fn_meta::>(), - ScalarFuncSig::GeString => map_string_compare_sig::(ft)?, - ScalarFuncSig::GeTime => compare_fn_meta::>(), - ScalarFuncSig::GeDuration => compare_fn_meta::>(), - ScalarFuncSig::GeJson => compare_json_fn_meta::(), - ScalarFuncSig::NeInt => map_int_sig(value, children, compare_mapper::)?, - ScalarFuncSig::NeReal => compare_fn_meta::>(), - ScalarFuncSig::NeDecimal => compare_fn_meta::>(), - ScalarFuncSig::NeString => map_string_compare_sig::(ft)?, - ScalarFuncSig::NeTime => compare_fn_meta::>(), - ScalarFuncSig::NeDuration => compare_fn_meta::>(), - ScalarFuncSig::NeJson => compare_json_fn_meta::(), - ScalarFuncSig::EqInt => map_int_sig(value, children, compare_mapper::)?, - ScalarFuncSig::EqReal => compare_fn_meta::>(), - ScalarFuncSig::EqDecimal => compare_fn_meta::>(), - ScalarFuncSig::EqString => map_string_compare_sig::(ft)?, - ScalarFuncSig::EqTime => compare_fn_meta::>(), - ScalarFuncSig::EqDuration => compare_fn_meta::>(), - ScalarFuncSig::EqJson => compare_json_fn_meta::(), - ScalarFuncSig::NullEqInt => map_int_sig(value, children, compare_mapper::)?, - ScalarFuncSig::NullEqReal => compare_fn_meta::>(), - ScalarFuncSig::NullEqDecimal => compare_fn_meta::>(), - ScalarFuncSig::NullEqString => map_string_compare_sig::(ft)?, - ScalarFuncSig::NullEqTime => compare_fn_meta::>(), - ScalarFuncSig::NullEqDuration => compare_fn_meta::>(), - ScalarFuncSig::NullEqJson => compare_json_fn_meta::(), + ScalarFuncSig::GtInt => map_int_sig(value, children, 
compare_mapper::)?, + ScalarFuncSig::GtReal => compare_fn_meta::>(), + ScalarFuncSig::GtDecimal => compare_fn_meta::>(), + ScalarFuncSig::GtString => map_string_compare_sig::(ft)?, + ScalarFuncSig::GtTime => compare_fn_meta::>(), + ScalarFuncSig::GtDuration => compare_fn_meta::>(), + ScalarFuncSig::GtJson => compare_json_fn_meta::(), + ScalarFuncSig::GeInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::GeReal => compare_fn_meta::>(), + ScalarFuncSig::GeDecimal => compare_fn_meta::>(), + ScalarFuncSig::GeString => map_string_compare_sig::(ft)?, + ScalarFuncSig::GeTime => compare_fn_meta::>(), + ScalarFuncSig::GeDuration => compare_fn_meta::>(), + ScalarFuncSig::GeJson => compare_json_fn_meta::(), + ScalarFuncSig::NeInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::NeReal => compare_fn_meta::>(), + ScalarFuncSig::NeDecimal => compare_fn_meta::>(), + ScalarFuncSig::NeString => map_string_compare_sig::(ft)?, + ScalarFuncSig::NeTime => compare_fn_meta::>(), + ScalarFuncSig::NeDuration => compare_fn_meta::>(), + ScalarFuncSig::NeJson => compare_json_fn_meta::(), + ScalarFuncSig::EqInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::EqReal => compare_fn_meta::>(), + ScalarFuncSig::EqDecimal => compare_fn_meta::>(), + ScalarFuncSig::EqString => map_string_compare_sig::(ft)?, + ScalarFuncSig::EqTime => compare_fn_meta::>(), + ScalarFuncSig::EqDuration => compare_fn_meta::>(), + ScalarFuncSig::EqJson => compare_json_fn_meta::(), + ScalarFuncSig::NullEqInt => map_int_sig(value, children, compare_mapper::)?, + ScalarFuncSig::NullEqReal => compare_fn_meta::>(), + ScalarFuncSig::NullEqDecimal => compare_fn_meta::>(), + ScalarFuncSig::NullEqString => map_string_compare_sig::(ft)?, + ScalarFuncSig::NullEqTime => compare_fn_meta::>(), + ScalarFuncSig::NullEqDuration => compare_fn_meta::>(), + ScalarFuncSig::NullEqJson => compare_json_fn_meta::(), ScalarFuncSig::CoalesceInt => coalesce_fn_meta::(), 
ScalarFuncSig::CoalesceReal => coalesce_fn_meta::(), ScalarFuncSig::CoalesceString => coalesce_bytes_fn_meta(), diff --git a/components/tidb_query_expr/src/types/expr_eval.rs b/components/tidb_query_expr/src/types/expr_eval.rs index 078bbf1bb80..b892333b0ef 100644 --- a/components/tidb_query_expr/src/types/expr_eval.rs +++ b/components/tidb_query_expr/src/types/expr_eval.rs @@ -1274,7 +1274,7 @@ mod tests { .push_column_ref_for_test(0) .push_column_ref_for_test(0) .push_fn_call_for_test( - compare_fn_meta::>(), + compare_fn_meta::>(), 2, FieldTypeTp::LongLong, ) @@ -1312,7 +1312,7 @@ mod tests { .push_column_ref_for_test(0) .push_column_ref_for_test(0) .push_fn_call_for_test( - compare_fn_meta::>(), + compare_fn_meta::>(), 2, FieldTypeTp::LongLong, ) diff --git a/components/tikv_alloc/src/error.rs b/components/tikv_alloc/src/error.rs index c098a387c2e..a8912389784 100644 --- a/components/tikv_alloc/src/error.rs +++ b/components/tikv_alloc/src/error.rs @@ -5,7 +5,7 @@ use std::{error, fmt}; #[derive(Debug)] pub enum ProfError { MemProfilingNotEnabled, - IOError(std::io::Error), + IoError(std::io::Error), JemallocError(String), PathEncodingError(std::ffi::OsString), /* When temp files are in a non-unicode directory, * OsString.into_string() will cause this error, */ @@ -18,7 +18,7 @@ impl fmt::Display for ProfError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { ProfError::MemProfilingNotEnabled => write!(f, "mem-profiling was not enabled"), - ProfError::IOError(e) => write!(f, "io error occurred {:?}", e), + ProfError::IoError(e) => write!(f, "io error occurred {:?}", e), ProfError::JemallocError(e) => write!(f, "jemalloc error {}", e), ProfError::PathEncodingError(path) => { write!(f, "Dump target path {:?} is not unicode encoding", path) @@ -32,7 +32,7 @@ impl fmt::Display for ProfError { impl From for ProfError { fn from(e: std::io::Error) -> Self { - ProfError::IOError(e) + ProfError::IoError(e) } } diff --git a/etc/error_code.toml 
b/etc/error_code.toml index 20887f27abd..9a42cc3769a 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -198,9 +198,9 @@ error = ''' KV:Engine:IO ''' -["KV:Engine:CFName"] +["KV:Engine:CfName"] error = ''' -KV:Engine:CFName +KV:Engine:CfName ''' ["KV:Engine:Codec"] @@ -463,9 +463,9 @@ error = ''' KV:SSTImporter:Future ''' -["KV:SSTImporter:RocksDB"] +["KV:SSTImporter:RocksDb"] error = ''' -KV:SSTImporter:RocksDB +KV:SSTImporter:RocksDb ''' ["KV:SSTImporter:ParseIntError"] diff --git a/src/config.rs b/src/config.rs index 8a9bf2d2468..3f609f6c10d 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2,7 +2,7 @@ //! Configuration for the entire server. //! -//! TiKV is configured through the `TiKvConfig` type, which is in turn +//! TiKV is configured through the `TikvConfig` type, which is in turn //! made up of many other configuration types. use std::{ @@ -2850,7 +2850,7 @@ impl QuotaConfig { #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] -pub struct TiKvConfig { +pub struct TikvConfig { #[doc(hidden)] #[serde(skip_serializing)] #[online_config(hidden)] @@ -2975,9 +2975,9 @@ pub struct TiKvConfig { pub causal_ts: CausalTsConfig, } -impl Default for TiKvConfig { - fn default() -> TiKvConfig { - TiKvConfig { +impl Default for TikvConfig { + fn default() -> TikvConfig { + TikvConfig { cfg_path: "".to_owned(), log_level: slog::Level::Info.into(), log_file: "".to_owned(), @@ -3019,7 +3019,7 @@ impl Default for TiKvConfig { } } -impl TiKvConfig { +impl TikvConfig { pub fn infer_raft_db_path(&self, data_dir: Option<&str>) -> Result> { if self.raft_store.raftdb_path.is_empty() { let data_dir = data_dir.unwrap_or(&self.storage.data_dir); @@ -3286,7 +3286,7 @@ impl TiKvConfig { // As the init of `logger` is very early, this adjust needs to be separated and // called immediately after parsing the command line. 
pub fn logger_compatible_adjust(&mut self) { - let default_tikv_cfg = TiKvConfig::default(); + let default_tikv_cfg = TikvConfig::default(); let default_log_cfg = LogConfig::default(); if self.log_level != default_tikv_cfg.log_level { eprintln!("deprecated configuration, log-level has been moved to log.level"); @@ -3538,7 +3538,7 @@ impl TiKvConfig { let mut cfg = if let Some(keys) = unrecognized_keys { serde_ignored::deserialize(&mut deserializer, |key| keys.push(key.to_string())) } else { - ::deserialize(&mut deserializer) + ::deserialize(&mut deserializer) }?; deserializer.end()?; cfg.cfg_path = path.display().to_string(); @@ -3559,9 +3559,9 @@ impl TiKvConfig { self.rocksdb.write_into_metrics(); } - pub fn with_tmp() -> Result<(TiKvConfig, tempfile::TempDir), IoError> { + pub fn with_tmp() -> Result<(TikvConfig, tempfile::TempDir), IoError> { let tmp = tempfile::tempdir()?; - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.storage.data_dir = tmp.path().display().to_string(); cfg.cfg_path = tmp.path().join(LAST_CONFIG_FILE).display().to_string(); Ok((cfg, tmp)) @@ -3597,7 +3597,7 @@ impl TiKvConfig { /// Loads the previously-loaded configuration from `last_tikv.toml`, /// compares key configuration items and fails if they are not /// identical. -pub fn check_critical_config(config: &TiKvConfig) -> Result<(), String> { +pub fn check_critical_config(config: &TikvConfig) -> Result<(), String> { // Check current critical configurations with last time, if there are some // changes, user must guarantee relevant works have been done. 
if let Some(mut cfg) = get_last_config(&config.storage.data_dir) { @@ -3610,12 +3610,12 @@ pub fn check_critical_config(config: &TiKvConfig) -> Result<(), String> { Ok(()) } -fn get_last_config(data_dir: &str) -> Option { +fn get_last_config(data_dir: &str) -> Option { let store_path = Path::new(data_dir); let last_cfg_path = store_path.join(LAST_CONFIG_FILE); if last_cfg_path.exists() { return Some( - TiKvConfig::from_file(&last_cfg_path, None).unwrap_or_else(|e| { + TikvConfig::from_file(&last_cfg_path, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", last_cfg_path.display(), @@ -3628,7 +3628,7 @@ fn get_last_config(data_dir: &str) -> Option { } /// Persists config to `last_tikv.toml` -pub fn persist_config(config: &TiKvConfig) -> Result<(), String> { +pub fn persist_config(config: &TikvConfig) -> Result<(), String> { let store_path = Path::new(&config.storage.data_dir); let last_cfg_path = store_path.join(LAST_CONFIG_FILE); let tmp_cfg_path = store_path.join(TMP_CONFIG_FILE); @@ -3694,7 +3694,7 @@ pub fn write_config>(path: P, content: &[u8]) -> CfgResult<()> { } // convert tikv config to a flatten array. -pub fn to_flatten_config_info(cfg: &TiKvConfig) -> Vec { +pub fn to_flatten_config_info(cfg: &TikvConfig) -> Vec { fn to_cfg_value(default_value: &Value, cfg_value: Option<&Value>, key: &str) -> Value { let mut res = Map::with_capacity(2); res.insert("Name".into(), Value::String(key.into())); @@ -3746,7 +3746,7 @@ pub fn to_flatten_config_info(cfg: &TiKvConfig) -> Vec { } let cfg_value = to_value(cfg).unwrap(); - let default_value = to_value(TiKvConfig::default()).unwrap(); + let default_value = to_value(TikvConfig::default()).unwrap(); let mut key_buf = String::new(); let mut res = Vec::new(); @@ -3760,7 +3760,7 @@ pub fn to_flatten_config_info(cfg: &TiKvConfig) -> Vec { } lazy_static! 
{ - pub static ref TIKVCONFIG_TYPED: ConfigChange = TiKvConfig::default().typed(); + pub static ref TIKVCONFIG_TYPED: ConfigChange = TikvConfig::default().typed(); } fn serde_to_online_config(name: String) -> String { @@ -3918,7 +3918,7 @@ pub enum Module { PessimisticTxn, Gc, Split, - CDC, + Cdc, ResolvedTs, ResourceMetering, BackupStream, @@ -3947,7 +3947,7 @@ impl From<&str> for Module { "backup_stream" => Module::BackupStream, "pessimistic_txn" => Module::PessimisticTxn, "gc" => Module::Gc, - "cdc" => Module::CDC, + "cdc" => Module::Cdc, "resolved_ts" => Module::ResolvedTs, "resource_metering" => Module::ResourceMetering, "quota" => Module::Quota, @@ -3967,12 +3967,12 @@ pub struct ConfigController { #[derive(Default)] struct ConfigInner { - current: TiKvConfig, + current: TikvConfig, config_mgrs: HashMap>, } impl ConfigController { - pub fn new(current: TiKvConfig) -> Self { + pub fn new(current: TikvConfig) -> Self { ConfigController { inner: Arc::new(RwLock::new(ConfigInner { current, @@ -3988,7 +3988,7 @@ impl ConfigController { pub fn update_from_toml_file(&self) -> CfgResult<()> { let current = self.get_current(); - match TiKvConfig::from_file(Path::new(¤t.cfg_path), None) { + match TikvConfig::from_file(Path::new(¤t.cfg_path), None) { Ok(incoming) => { let diff = current.diff(&incoming); self.update_impl(diff, None) @@ -4066,7 +4066,7 @@ impl ConfigController { } } - pub fn get_current(&self) -> TiKvConfig { + pub fn get_current(&self) -> TikvConfig { self.inner.read().unwrap().current.clone() } } @@ -4122,13 +4122,13 @@ mod tests { #[test] fn test_check_critical_cfg_with() { - let mut tikv_cfg = TiKvConfig::default(); - let last_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); + let last_cfg = TikvConfig::default(); tikv_cfg.validate().unwrap(); tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); - let mut tikv_cfg = TiKvConfig::default(); - let mut last_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); + 
let mut last_cfg = TikvConfig::default(); tikv_cfg.rocksdb.wal_dir = "/data/wal_dir".to_owned(); tikv_cfg.validate().unwrap(); tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); @@ -4137,8 +4137,8 @@ mod tests { tikv_cfg.validate().unwrap(); tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); - let mut tikv_cfg = TiKvConfig::default(); - let mut last_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); + let mut last_cfg = TikvConfig::default(); tikv_cfg.storage.data_dir = "/data1".to_owned(); tikv_cfg.validate().unwrap(); tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap_err(); @@ -4148,8 +4148,8 @@ mod tests { tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); // Enable Raft Engine. - let mut tikv_cfg = TiKvConfig::default(); - let mut last_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); + let mut last_cfg = TikvConfig::default(); tikv_cfg.raft_engine.enable = true; last_cfg.raft_engine.enable = true; @@ -4162,8 +4162,8 @@ mod tests { tikv_cfg.check_critical_cfg_with(&last_cfg).unwrap(); // Disable Raft Engine and uses RocksDB. 
- let mut tikv_cfg = TiKvConfig::default(); - let mut last_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); + let mut last_cfg = TikvConfig::default(); tikv_cfg.raft_engine.enable = false; last_cfg.raft_engine.enable = false; @@ -4210,7 +4210,7 @@ mod tests { #[test] fn test_last_cfg_modified() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); let store_path = Path::new(&cfg.storage.data_dir); let last_cfg_path = store_path.join(LAST_CONFIG_FILE); @@ -4238,12 +4238,12 @@ mod tests { let file = path_buf.as_path(); let (s1, s2) = ("/xxx/wal_dir".to_owned(), "/yyy/wal_dir".to_owned()); - let mut tikv_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.wal_dir = s1.clone(); tikv_cfg.raftdb.wal_dir = s2.clone(); tikv_cfg.write_to_file(file).unwrap(); - let cfg_from_file = TiKvConfig::from_file(file, None).unwrap_or_else(|e| { + let cfg_from_file = TikvConfig::from_file(file, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", file.display(), @@ -4257,7 +4257,7 @@ mod tests { tikv_cfg.rocksdb.wal_dir = s2.clone(); tikv_cfg.raftdb.wal_dir = s1.clone(); tikv_cfg.write_to_file(file).unwrap(); - let cfg_from_file = TiKvConfig::from_file(file, None).unwrap_or_else(|e| { + let cfg_from_file = TikvConfig::from_file(file, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", file.display(), @@ -4270,7 +4270,7 @@ mod tests { #[test] fn test_flatten_cfg() { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.server.labels.insert("zone".into(), "test".into()); cfg.raft_store.raft_log_gc_count_limit = Some(123); @@ -4303,14 +4303,14 @@ mod tests { .unwrap(); let path = root_path.path().join("not_exist_dir"); - let mut tikv_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); tikv_cfg.storage.data_dir = 
path.as_path().to_str().unwrap().to_owned(); persist_config(&tikv_cfg).unwrap(); } #[test] fn test_keepalive_check() { - let mut tikv_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); tikv_cfg.pd.endpoints = vec!["".to_owned()]; let dur = tikv_cfg.raft_store.raft_heartbeat_interval(); tikv_cfg.server.grpc_keepalive_time = ReadableDuration(dur); @@ -4321,7 +4321,7 @@ mod tests { #[test] fn test_block_size() { - let mut tikv_cfg = TiKvConfig::default(); + let mut tikv_cfg = TikvConfig::default(); tikv_cfg.pd.endpoints = vec!["".to_owned()]; tikv_cfg.rocksdb.defaultcf.block_size = ReadableSize::gb(10); tikv_cfg.rocksdb.lockcf.block_size = ReadableSize::gb(10); @@ -4392,8 +4392,8 @@ mod tests { ConfigValue::from(10000u64) ); - let old = TiKvConfig::default(); - let mut incoming = TiKvConfig::default(); + let old = TikvConfig::default(); + let mut incoming = TikvConfig::default(); incoming.coprocessor.region_split_keys = Some(10000); incoming.gc.max_write_bytes_per_sec = ReadableSize::mb(100); incoming.rocksdb.defaultcf.block_cache_size = ReadableSize::mb(500); @@ -4487,7 +4487,7 @@ mod tests { #[allow(clippy::type_complexity)] fn new_engines( - cfg: TiKvConfig, + cfg: TikvConfig, ) -> ( Storage, ConfigController, @@ -4545,7 +4545,7 @@ mod tests { #[test] fn test_flow_control() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.storage.flow_control.l0_files_threshold = 50; cfg.validate().unwrap(); let (storage, cfg_controller, _, flow_controller) = new_engines::(cfg); @@ -4604,7 +4604,7 @@ mod tests { } } - let (cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); let (tx, rx) = channel::unbounded(); cfg_controller.register(Module::ResolvedTs, Box::new(TestConfigManager(tx))); @@ -4659,7 +4659,7 @@ mod tests { #[test] fn test_change_rocksdb_config() { - let (mut cfg, _dir) = 
TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.rocksdb.max_background_jobs = 4; cfg.rocksdb.max_background_flushes = 2; cfg.rocksdb.defaultcf.disable_auto_compactions = false; @@ -4737,7 +4737,7 @@ mod tests { #[test] fn test_change_rate_limiter_auto_tuned() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); // vanilla limiter does not support dynamically changing auto-tuned mode. cfg.rocksdb.rate_limiter_auto_tuned = true; cfg.validate().unwrap(); @@ -4761,7 +4761,7 @@ mod tests { #[test] fn test_change_shared_block_cache() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (storage, cfg_controller, ..) = new_engines::(cfg); @@ -4785,7 +4785,7 @@ mod tests { #[test] fn test_change_logconfig() { - let (cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); cfg_controller.register(Module::Log, Box::new(LogConfigManager)); @@ -4808,7 +4808,7 @@ mod tests { #[test] fn test_dispatch_titan_blob_run_mode_config() { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); let mut incoming = cfg.clone(); cfg.rocksdb.defaultcf.titan.blob_run_mode = BlobRunMode::Normal; incoming.rocksdb.defaultcf.titan.blob_run_mode = BlobRunMode::Fallback; @@ -4828,7 +4828,7 @@ mod tests { #[test] fn test_change_ttl_check_poll_interval() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (_, cfg_controller, mut rx, _) = new_engines::(cfg); @@ -4845,7 +4845,7 @@ mod tests { #[test] fn test_change_store_scheduler_worker_pool_size() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, 
_dir) = TikvConfig::with_tmp().unwrap(); cfg.storage.scheduler_worker_pool_size = 4; cfg.validate().unwrap(); let (storage, cfg_controller, ..) = new_engines::(cfg); @@ -4895,7 +4895,7 @@ mod tests { #[test] fn test_change_quota_config() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.quota.foreground_cpu_time = 1000; cfg.quota.foreground_write_bandwidth = ReadableSize::mb(128); cfg.quota.foreground_read_bandwidth = ReadableSize::mb(256); @@ -5008,7 +5008,7 @@ mod tests { #[test] fn test_change_server_config() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let cfg_controller = ConfigController::new(cfg.clone()); let (scheduler, _receiver) = dummy_scheduler(); @@ -5022,7 +5022,7 @@ mod tests { )), ); - let check_cfg = |cfg: &TiKvConfig| { + let check_cfg = |cfg: &TikvConfig| { assert_eq!(&cfg_controller.get_current(), cfg); assert_eq!(&*version_tracker.value(), &cfg.server); }; @@ -5045,7 +5045,7 @@ mod tests { fn test_compatible_adjust_validate_equal() { // After calling many time of `compatible_adjust` and `validate` should has // the same effect as calling `compatible_adjust` and `validate` one time - let mut c = TiKvConfig::default(); + let mut c = TikvConfig::default(); let mut cfg = c.clone(); c.compatible_adjust(); c.validate().unwrap(); @@ -5063,7 +5063,7 @@ mod tests { [readpool.storage] [readpool.coprocessor] "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.compatible_adjust(); assert_eq!(cfg.readpool.storage.use_unified_pool, Some(true)); assert_eq!(cfg.readpool.coprocessor.use_unified_pool, Some(true)); @@ -5074,7 +5074,7 @@ mod tests { [readpool.coprocessor] normal-concurrency = 1 "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); 
cfg.compatible_adjust(); assert_eq!(cfg.readpool.storage.use_unified_pool, Some(false)); assert_eq!(cfg.readpool.coprocessor.use_unified_pool, Some(false)); @@ -5104,7 +5104,7 @@ mod tests { temp_config_writer.sync_data().unwrap(); let mut unrecognized_keys = Vec::new(); - let _ = TiKvConfig::from_file(temp_config_file.path(), Some(&mut unrecognized_keys)); + let _ = TikvConfig::from_file(temp_config_file.path(), Some(&mut unrecognized_keys)); assert_eq!( unrecognized_keys, @@ -5124,7 +5124,7 @@ mod tests { [raft-engine] enable = true "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); assert_eq!( cfg.raft_engine.config.dir, @@ -5182,7 +5182,7 @@ mod tests { #[test] fn test_validate_tikv_config() { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.validate().unwrap(); let default_region_split_check_diff = cfg.raft_store.region_split_check_diff().0; cfg.raft_store.region_split_check_diff = @@ -5226,19 +5226,19 @@ mod tests { } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.validate().unwrap(); } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.storage.data_dir = tmp_path_string_generate!(tmp_path, "data"); cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "db"); cfg.validate().unwrap_err(); } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.storage.data_dir = tmp_path_string_generate!(tmp_path, "data", "kvdb"); cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); @@ -5247,7 +5247,7 @@ mod tests { } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.storage.data_dir = tmp_path_string_generate!(tmp_path, "data", "kvdb"); cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); @@ -5256,14 +5256,14 @@ mod 
tests { } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.rocksdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "wal"); cfg.raftdb.wal_dir = tmp_path_string_generate!(tmp_path, "data", "wal"); cfg.validate().unwrap_err(); } { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.storage.data_dir = tmp_path_string_generate!(tmp_path, "data", "kvdb"); cfg.raft_store.raftdb_path = tmp_path_string_generate!(tmp_path, "data", "raftdb", "db"); @@ -5394,7 +5394,7 @@ mod tests { .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); - let mut cfg: TiKvConfig = toml::from_str(&template_config).unwrap(); + let mut cfg: TikvConfig = toml::from_str(&template_config).unwrap(); cfg.validate().unwrap(); } @@ -5407,7 +5407,7 @@ mod tests { let mut deserializer = toml::Deserializer::new(&template_config); let mut unrecognized_keys = Vec::new(); - let _: TiKvConfig = serde_ignored::deserialize(&mut deserializer, |key| { + let _: TikvConfig = serde_ignored::deserialize(&mut deserializer, |key| { unrecognized_keys.push(key.to_string()) }) .unwrap(); @@ -5423,8 +5423,8 @@ mod tests { .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); - let mut cfg: TiKvConfig = toml::from_str(&template_config).unwrap(); - let mut default_cfg = TiKvConfig::default(); + let mut cfg: TikvConfig = toml::from_str(&template_config).unwrap(); + let mut default_cfg = TikvConfig::default(); // Some default values are computed based on the environment. 
// Because we can't set config values for these in `config-template.toml`, we @@ -5514,7 +5514,7 @@ mod tests { .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); - let _: TiKvConfig = toml::from_str(&template_config).unwrap(); + let _: TikvConfig = toml::from_str(&template_config).unwrap(); } Err(e) => { if e.is_timeout() { @@ -5531,7 +5531,7 @@ mod tests { let content = r#" [cdc] "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); // old-value-cache-size is deprecated, 0 must not report error. @@ -5539,28 +5539,28 @@ mod tests { [cdc] old-value-cache-size = 0 "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); let content = r#" [cdc] min-ts-interval = "0s" "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); let content = r#" [cdc] incremental-scan-threads = 0 "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); let content = r#" [cdc] incremental-scan-concurrency = 0 "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); let content = r#" @@ -5568,7 +5568,7 @@ mod tests { incremental-scan-concurrency = 1 incremental-scan-threads = 2 "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); } @@ -5591,7 +5591,7 @@ mod tests { ("backup_stream", Module::BackupStream), ("pessimistic_txn", Module::PessimisticTxn), ("gc", Module::Gc), - ("cdc", Module::CDC), + ("cdc", Module::Cdc), ("resolved_ts", Module::ResolvedTs), ("resource_metering", Module::ResourceMetering), 
("unknown", Module::Unknown("unknown".to_string())), @@ -5722,7 +5722,7 @@ mod tests { l0-files-threshold = 77 soft-pending-compaction-bytes-limit = "777GB" "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); assert_eq!( cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, @@ -5744,7 +5744,7 @@ mod tests { soft-pending-compaction-bytes-limit = "888GB" [rocksdb.writecf] "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); assert_eq!( cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, @@ -5767,7 +5767,7 @@ mod tests { level0-slowdown-writes-trigger = 66 soft-pending-compaction-bytes-limit = "666GB" "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); assert_eq!( cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, @@ -5788,7 +5788,7 @@ mod tests { level0-slowdown-writes-trigger = 88 soft-pending-compaction-bytes-limit = "888GB" "#; - let mut cfg: TiKvConfig = toml::from_str(content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); assert_eq!( cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger, diff --git a/src/coprocessor/checksum.rs b/src/coprocessor/checksum.rs index 32819879188..f208b87ee0f 100644 --- a/src/coprocessor/checksum.rs +++ b/src/coprocessor/checksum.rs @@ -15,14 +15,14 @@ use tipb::{ChecksumAlgorithm, ChecksumRequest, ChecksumResponse}; use yatp::task::future::reschedule; use crate::{ - coprocessor::{dag::TiKvStorage, *}, + coprocessor::{dag::TikvStorage, *}, storage::{Snapshot, SnapshotStore, Statistics}, }; // `ChecksumContext` is used to handle `ChecksumRequest` pub struct ChecksumContext { req: ChecksumRequest, - scanner: RangesScanner>>, + scanner: RangesScanner>>, } impl ChecksumContext { @@ -43,7 
+43,7 @@ impl ChecksumContext { false, ); let scanner = RangesScanner::new(RangesScannerOptions { - storage: TiKvStorage::new(store, false), + storage: TikvStorage::new(store, false), ranges: ranges .into_iter() .map(|r| Range::from_pb_range(r, false)) diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index 8b3f561ce5f..5b06638f244 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -11,7 +11,7 @@ use tidb_query_common::{execute_stats::ExecSummary, storage::IntervalRange}; use tikv_alloc::trace::MemoryTraceGuard; use tipb::{DagRequest, SelectResponse, StreamResponse}; -pub use self::storage_impl::TiKvStorage; +pub use self::storage_impl::TikvStorage; use crate::{ coprocessor::{metrics::*, Deadline, RequestHandler, Result}, storage::{Statistics, Store}, @@ -103,7 +103,7 @@ impl BatchDagHandler { runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request( req, ranges, - TiKvStorage::new(store, is_cache_enabled), + TikvStorage::new(store, is_cache_enabled), deadline, streaming_batch_limit, is_streaming, diff --git a/src/coprocessor/dag/storage_impl.rs b/src/coprocessor/dag/storage_impl.rs index 7f5e60081e7..6d819b7b94f 100644 --- a/src/coprocessor/dag/storage_impl.rs +++ b/src/coprocessor/dag/storage_impl.rs @@ -11,14 +11,14 @@ use crate::{ }; /// A `Storage` implementation over TiKV's storage. 
-pub struct TiKvStorage { +pub struct TikvStorage { store: S, scanner: Option, cf_stats_backlog: Statistics, met_newer_ts_data_backlog: NewerTsCheckState, } -impl TiKvStorage { +impl TikvStorage { pub fn new(store: S, check_can_be_cached: bool) -> Self { Self { store, @@ -33,7 +33,7 @@ impl TiKvStorage { } } -impl Storage for TiKvStorage { +impl Storage for TikvStorage { type Statistics = Statistics; fn begin_scan( diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 70144f47ce1..e11558e73b3 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -37,7 +37,7 @@ use yatp::task::future::reschedule; use super::{cmsketch::CmSketch, fmsketch::FmSketch, histogram::Histogram}; use crate::{ - coprocessor::{dag::TiKvStorage, MEMTRACE_ANALYZE, *}, + coprocessor::{dag::TikvStorage, MEMTRACE_ANALYZE, *}, storage::{Snapshot, SnapshotStore, Statistics}, }; @@ -47,7 +47,7 @@ const ANALYZE_VERSION_V2: i32 = 2; // `AnalyzeContext` is used to handle `AnalyzeReq` pub struct AnalyzeContext { req: AnalyzeReq, - storage: Option>>, + storage: Option>>, ranges: Vec, storage_stats: Statistics, quota_limiter: Arc, @@ -76,7 +76,7 @@ impl AnalyzeContext { Ok(Self { req, - storage: Some(TiKvStorage::new(store, false)), + storage: Some(TikvStorage::new(store, false)), ranges, storage_stats: Statistics::default(), quota_limiter, @@ -126,7 +126,7 @@ impl AnalyzeContext { // it would build a histogram and count-min sketch of index values. 
async fn handle_index( req: AnalyzeIndexReq, - scanner: &mut RangesScanner>>, + scanner: &mut RangesScanner>>, is_common_handle: bool, ) -> Result> { let mut hist = Histogram::new(req.get_bucket_size() as usize); @@ -317,7 +317,7 @@ impl RequestHandler for AnalyzeContext { } struct RowSampleBuilder { - data: BatchTableScanExecutor>>, + data: BatchTableScanExecutor>>, max_sample_size: usize, max_fm_sketch_size: usize, @@ -331,7 +331,7 @@ struct RowSampleBuilder { impl RowSampleBuilder { fn new( mut req: AnalyzeColumnsReq, - storage: TiKvStorage>, + storage: TikvStorage>, ranges: Vec, quota_limiter: Arc, is_auto_analyze: bool, @@ -797,7 +797,7 @@ impl Drop for BaseRowSampleCollector { } struct SampleBuilder { - data: BatchTableScanExecutor>>, + data: BatchTableScanExecutor>>, max_bucket_size: usize, max_sample_size: usize, @@ -818,7 +818,7 @@ impl SampleBuilder { fn new( mut req: AnalyzeColumnsReq, common_handle_req: Option, - storage: TiKvStorage>, + storage: TikvStorage>, ranges: Vec, ) -> Result { let columns_info: Vec<_> = req.take_columns_info().into(); diff --git a/src/import/duplicate_detect.rs b/src/import/duplicate_detect.rs index 86e955c6cd2..c5429315938 100644 --- a/src/import/duplicate_detect.rs +++ b/src/import/duplicate_detect.rs @@ -181,7 +181,7 @@ impl DuplicateDetector { .map_err(from_kv_error)?; match value { Some(val) => pair.set_value(val.to_vec()), - None => return Err(Error::RocksDB("Not found defaultcf value".to_owned())), + None => return Err(Error::RocksDb("Not found defaultcf value".to_owned())), } } } @@ -217,7 +217,7 @@ impl Iterator for DuplicateDetector { fn from_kv_error(e: tikv_kv::Error) -> Error { match e { tikv_kv::Error(box tikv_kv::ErrorInner::Other(err)) => Error::Engine(err), - _ => Error::RocksDB("unkown error when request rocksdb".to_owned()), + _ => Error::RocksDb("unkown error when request rocksdb".to_owned()), } } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index bd94a3638d4..fad5cd25ba8 100644 
--- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -19,7 +19,7 @@ use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; use super::engine_factory_v2::KvEngineFactoryV2; -use crate::config::{DbConfig, TiKvConfig, DEFAULT_ROCKSDB_SUB_DIR}; +use crate::config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}; struct FactoryInner { env: Arc, @@ -39,7 +39,7 @@ pub struct KvEngineFactoryBuilder { } impl KvEngineFactoryBuilder { - pub fn new(env: Arc, config: &TiKvConfig, store_path: impl Into) -> Self { + pub fn new(env: Arc, config: &TikvConfig, store_path: impl Into) -> Self { Self { inner: FactoryInner { env, diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 4132b2e4c25..7f3bcaafe4f 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -194,14 +194,14 @@ mod tests { use engine_traits::{TabletFactory, CF_WRITE}; use super::*; - use crate::{config::TiKvConfig, server::KvEngineFactoryBuilder}; + use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; lazy_static! 
{ - static ref TEST_CONFIG: TiKvConfig = { + static ref TEST_CONFIG: TikvConfig = { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let common_test_cfg = manifest_dir.join("components/test_raftstore/src/common-test.toml"); - TiKvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { panic!( "invalid auto generated configuration file {}, err {}", manifest_dir.display(), diff --git a/src/server/errors.rs b/src/server/errors.rs index 8932de2dc38..c7a41947f79 100644 --- a/src/server/errors.rs +++ b/src/server/errors.rs @@ -6,7 +6,7 @@ use engine_traits::Error as EngineTraitError; use futures::channel::oneshot::Canceled; use grpcio::Error as GrpcError; use hyper::Error as HttpError; -use openssl::error::ErrorStack as OpenSSLError; +use openssl::error::ErrorStack as OpenSslError; use pd_client::Error as PdError; use protobuf::ProtobufError; use raftstore::Error as RaftServerError; @@ -65,7 +65,7 @@ pub enum Error { Http(#[from] HttpError), #[error("{0:?}")] - OpenSSL(#[from] OpenSSLError), + OpenSsl(#[from] OpenSslError), } pub type Result = result::Result; diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 232ddd58b4b..6f06bf17b30 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -41,12 +41,12 @@ pub enum Error { InvalidRequest(String), ParseError(String), SearchError(String), - IOError(std::io::Error), + IoError(std::io::Error), } impl From for Error { fn from(err: std::io::Error) -> Self { - Error::IOError(err) + Error::IoError(err) } } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 1ad81ec8900..8f0f9a23cae 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1,6 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-// #[PerformanceCriticalPath]: Tikv gRPC APIs implementation +// #[PerformanceCriticalPath]: TiKV gRPC APIs implementation use std::{mem, sync::Arc}; use api_version::KvFormat; diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 7911808e86b..3df7bf212d9 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -952,7 +952,7 @@ mod tests { use tikv_util::logger::get_log_level; use crate::{ - config::{ConfigController, TiKvConfig}, + config::{ConfigController, TikvConfig}, server::status_server::{profile::TEST_PROFILE_MUTEX, LogLevelRequest, StatusServer}, }; @@ -1045,12 +1045,12 @@ mod tests { .await .unwrap(); let resp_json = String::from_utf8_lossy(&v).to_string(); - let cfg = TiKvConfig::default(); + let cfg = TikvConfig::default(); serde_json::to_string(&cfg.get_encoder()) .map(|cfg_json| { assert_eq!(resp_json, cfg_json); }) - .expect("Could not convert TiKvConfig to string"); + .expect("Could not convert TikvConfig to string"); }); block_on(handle).unwrap(); status_server.stop(); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 966b6095310..6c4374f7c76 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -449,7 +449,7 @@ impl Storage { (ApiVersion::V2, ApiVersion::V1) if Self::is_txn_command(cmd) => { // For compatibility, accept TiDB request only. 
for key in keys { - if ApiV2::parse_key_mode(key.as_ref()) != KeyMode::TiDB { + if ApiV2::parse_key_mode(key.as_ref()) != KeyMode::Tidb { return Err(ErrorInner::invalid_key_mode( cmd, storage_api_version, @@ -513,7 +513,7 @@ impl Storage { range.0.as_ref().map(AsRef::as_ref), range.1.as_ref().map(AsRef::as_ref), ); - if ApiV2::parse_range_mode(range) != KeyMode::TiDB { + if ApiV2::parse_range_mode(range) != KeyMode::Tidb { return Err(ErrorInner::invalid_key_range_mode( cmd, storage_api_version, diff --git a/tests/benches/coprocessor_executors/index_scan/mod.rs b/tests/benches/coprocessor_executors/index_scan/mod.rs index ba29f08bb87..eb9f98ae73b 100644 --- a/tests/benches/coprocessor_executors/index_scan/mod.rs +++ b/tests/benches/coprocessor_executors/index_scan/mod.rs @@ -76,14 +76,14 @@ where { let mut inputs = vec![ Input::new(util::BatchIndexScanNext1024Bencher::::new()), - Input::new(util::IndexScanDAGBencher::::new(false, ROWS)), - Input::new(util::IndexScanDAGBencher::::new(true, ROWS)), + Input::new(util::IndexScanDagBencher::::new(false, ROWS)), + Input::new(util::IndexScanDagBencher::::new(true, ROWS)), ]; if crate::util::bench_level() >= 2 { let mut additional_inputs = vec![ Input::new(util::BatchIndexScanNext1024Bencher::::new()), - Input::new(util::IndexScanDAGBencher::::new(false, ROWS)), - Input::new(util::IndexScanDAGBencher::::new(true, ROWS)), + Input::new(util::IndexScanDagBencher::::new(false, ROWS)), + Input::new(util::IndexScanDagBencher::::new(true, ROWS)), ]; inputs.append(&mut additional_inputs); } diff --git a/tests/benches/coprocessor_executors/index_scan/util.rs b/tests/benches/coprocessor_executors/index_scan/util.rs index 87ca1086353..19c2be94195 100644 --- a/tests/benches/coprocessor_executors/index_scan/util.rs +++ b/tests/benches/coprocessor_executors/index_scan/util.rs @@ -8,7 +8,7 @@ use test_coprocessor::*; use tidb_query_datatype::expr::EvalConfig; use tidb_query_executors::{interface::*, BatchIndexScanExecutor}; use tikv::{ - 
coprocessor::{dag::TiKvStorage, RequestHandler}, + coprocessor::{dag::TikvStorage, RequestHandler}, storage::{RocksEngine, Statistics, Store as TxnStore}, }; use tipb::ColumnInfo; @@ -33,7 +33,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan unique: bool, ) -> Self::E { let mut executor = BatchIndexScanExecutor::new( - black_box(TiKvStorage::new( + black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, )), @@ -53,12 +53,12 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan } } -pub struct IndexScanExecutorDAGBuilder { +pub struct IndexScanExecutorDagBuilder { _phantom: PhantomData, } -impl scan_bencher::ScanExecutorDAGHandlerBuilder - for IndexScanExecutorDAGBuilder +impl scan_bencher::ScanExecutorDagHandlerBuilder + for IndexScanExecutorDagBuilder { type T = T; type P = IndexScanParam; @@ -77,4 +77,4 @@ impl scan_bencher::ScanExecutorDAGHandlerBuilder pub type BatchIndexScanNext1024Bencher = scan_bencher::BatchScanNext1024Bencher>; -pub type IndexScanDAGBencher = scan_bencher::ScanDAGBencher>; +pub type IndexScanDagBencher = scan_bencher::ScanDagBencher>; diff --git a/tests/benches/coprocessor_executors/integrated/mod.rs b/tests/benches/coprocessor_executors/integrated/mod.rs index cb7e48f3bd7..0b3d638e854 100644 --- a/tests/benches/coprocessor_executors/integrated/mod.rs +++ b/tests/benches/coprocessor_executors/integrated/mod.rs @@ -19,7 +19,7 @@ where { let (table, store) = crate::table_scan::fixture::table_with_2_columns(input.rows); - // TODO: Change to use `DAGSelect` helper when it no longer place unnecessary + // TODO: Change to use `DagSelect` helper when it no longer place unnecessary // columns. 
let executors = &[ table_scan(&[table["id"].as_column_info()]), @@ -706,15 +706,15 @@ where rows_options.push(1); } let mut bencher_options: Vec>> = vec![ - Box::new(util::DAGBencher::::new(false)), - Box::new(util::DAGBencher::::new(true)), + Box::new(util::DagBencher::::new(false)), + Box::new(util::DagBencher::::new(true)), ]; if crate::util::bench_level() >= 2 { let mut additional_inputs: Vec>> = vec![ Box::new(util::BatchBencher::::new()), Box::new(util::BatchBencher::::new()), - Box::new(util::DAGBencher::::new(false)), - Box::new(util::DAGBencher::::new(true)), + Box::new(util::DagBencher::::new(false)), + Box::new(util::DagBencher::::new(true)), ]; bencher_options.append(&mut additional_inputs); } diff --git a/tests/benches/coprocessor_executors/integrated/util.rs b/tests/benches/coprocessor_executors/integrated/util.rs index d0c6bedaecd..d9cb5fd2138 100644 --- a/tests/benches/coprocessor_executors/integrated/util.rs +++ b/tests/benches/coprocessor_executors/integrated/util.rs @@ -7,7 +7,7 @@ use kvproto::coprocessor::KeyRange; use test_coprocessor::*; use tidb_query_datatype::expr::EvalConfig; use tikv::{ - coprocessor::dag::TiKvStorage, + coprocessor::dag::TikvStorage, storage::{RocksEngine, Store as TxnStore}, }; use tipb::Executor as PbExecutor; @@ -73,7 +73,7 @@ where crate::util::bencher::BatchNextAllBencher::new(|| { tidb_query_executors::runner::build_executors( black_box(executors.to_vec()), - black_box(TiKvStorage::new(ToTxnStore::::to_store(store), false)), + black_box(TikvStorage::new(ToTxnStore::::to_store(store), false)), black_box(ranges.to_vec()), black_box(Arc::new(EvalConfig::default())), black_box(false), @@ -88,12 +88,12 @@ where } } -pub struct DAGBencher { +pub struct DagBencher { pub batch: bool, _phantom: PhantomData, } -impl DAGBencher { +impl DagBencher { pub fn new(batch: bool) -> Self { Self { batch, @@ -102,7 +102,7 @@ impl DAGBencher { } } -impl IntegratedBencher for DAGBencher +impl IntegratedBencher for DagBencher where T: 
TxnStore + 'static, M: Measurement, @@ -119,7 +119,7 @@ where ranges: &[KeyRange], store: &Store, ) { - crate::util::bencher::DAGHandleBencher::new(|| { + crate::util::bencher::DagHandleBencher::new(|| { crate::util::build_dag_handler::(executors, ranges, store) }) .bench(b); diff --git a/tests/benches/coprocessor_executors/table_scan/mod.rs b/tests/benches/coprocessor_executors/table_scan/mod.rs index b030a236cbd..63cba5f1d7e 100644 --- a/tests/benches/coprocessor_executors/table_scan/mod.rs +++ b/tests/benches/coprocessor_executors/table_scan/mod.rs @@ -240,14 +240,14 @@ where { let mut inputs = vec![ Input::new(util::BatchTableScanNext1024Bencher::::new()), - Input::new(util::TableScanDAGBencher::::new(false, ROWS)), - Input::new(util::TableScanDAGBencher::::new(true, ROWS)), + Input::new(util::TableScanDagBencher::::new(false, ROWS)), + Input::new(util::TableScanDagBencher::::new(true, ROWS)), ]; if crate::util::bench_level() >= 2 { let mut additional_inputs = vec![ Input::new(util::BatchTableScanNext1024Bencher::::new()), - Input::new(util::TableScanDAGBencher::::new(false, ROWS)), - Input::new(util::TableScanDAGBencher::::new(true, ROWS)), + Input::new(util::TableScanDagBencher::::new(false, ROWS)), + Input::new(util::TableScanDagBencher::::new(true, ROWS)), ]; inputs.append(&mut additional_inputs); } diff --git a/tests/benches/coprocessor_executors/table_scan/util.rs b/tests/benches/coprocessor_executors/table_scan/util.rs index e66af09dc67..7bcfe436d62 100644 --- a/tests/benches/coprocessor_executors/table_scan/util.rs +++ b/tests/benches/coprocessor_executors/table_scan/util.rs @@ -8,7 +8,7 @@ use test_coprocessor::*; use tidb_query_datatype::expr::EvalConfig; use tidb_query_executors::{interface::*, BatchTableScanExecutor}; use tikv::{ - coprocessor::{dag::TiKvStorage, RequestHandler}, + coprocessor::{dag::TikvStorage, RequestHandler}, storage::{RocksEngine, Statistics, Store as TxnStore}, }; use tipb::ColumnInfo; @@ -33,7 +33,7 @@ impl 
scan_bencher::ScanExecutorBuilder for BatchTableScan _: (), ) -> Self::E { let mut executor = BatchTableScanExecutor::new( - black_box(TiKvStorage::new( + black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, )), @@ -53,12 +53,12 @@ impl scan_bencher::ScanExecutorBuilder for BatchTableScan } } -pub struct TableScanExecutorDAGBuilder { +pub struct TableScanExecutorDagBuilder { _phantom: PhantomData, } -impl scan_bencher::ScanExecutorDAGHandlerBuilder - for TableScanExecutorDAGBuilder +impl scan_bencher::ScanExecutorDagHandlerBuilder + for TableScanExecutorDagBuilder { type T = T; type P = TableScanParam; @@ -77,4 +77,4 @@ impl scan_bencher::ScanExecutorDAGHandlerBuilder pub type BatchTableScanNext1024Bencher = scan_bencher::BatchScanNext1024Bencher>; -pub type TableScanDAGBencher = scan_bencher::ScanDAGBencher>; +pub type TableScanDagBencher = scan_bencher::ScanDagBencher>; diff --git a/tests/benches/coprocessor_executors/util/bencher.rs b/tests/benches/coprocessor_executors/util/bencher.rs index cfbd2c90bc2..64862582bd8 100644 --- a/tests/benches/coprocessor_executors/util/bencher.rs +++ b/tests/benches/coprocessor_executors/util/bencher.rs @@ -76,17 +76,17 @@ impl E> Bencher for BatchNextAllBencher { } /// Invoke handle request for a DAG handler. 
-pub struct DAGHandleBencher Box> { +pub struct DagHandleBencher Box> { handler_builder: F, } -impl Box> DAGHandleBencher { +impl Box> DagHandleBencher { pub fn new(handler_builder: F) -> Self { Self { handler_builder } } } -impl Box> Bencher for DAGHandleBencher { +impl Box> Bencher for DagHandleBencher { fn bench(&mut self, b: &mut criterion::Bencher<'_, M>) where M: Measurement, @@ -94,7 +94,7 @@ impl Box> Bencher for DAGHandleBencher { b.iter_batched_ref( &mut self.handler_builder, |handler| { - profiler::start("./DAGHandleBencher.profile"); + profiler::start("./DagHandleBencher.profile"); black_box(block_on(handler.handle_request()).unwrap()); profiler::stop(); }, diff --git a/tests/benches/coprocessor_executors/util/scan_bencher.rs b/tests/benches/coprocessor_executors/util/scan_bencher.rs index 64f65712d54..affc19436bb 100644 --- a/tests/benches/coprocessor_executors/util/scan_bencher.rs +++ b/tests/benches/coprocessor_executors/util/scan_bencher.rs @@ -26,7 +26,7 @@ pub trait ScanExecutorBuilder: 'static { ) -> Self::E; } -pub trait ScanExecutorDAGHandlerBuilder: 'static { +pub trait ScanExecutorDagHandlerBuilder: 'static { type T: TxnStore + 'static; type P: Copy + 'static; fn build( @@ -118,13 +118,13 @@ where } } -pub struct ScanDAGBencher { +pub struct ScanDagBencher { batch: bool, display_table_rows: usize, _phantom: PhantomData, } -impl ScanDAGBencher { +impl ScanDagBencher { pub fn new(batch: bool, display_table_rows: usize) -> Self { Self { batch, @@ -134,9 +134,9 @@ impl ScanDAGBencher { } } -impl ScanBencher for ScanDAGBencher +impl ScanBencher for ScanDagBencher where - B: ScanExecutorDAGHandlerBuilder, + B: ScanExecutorDagHandlerBuilder, M: Measurement, { fn name(&self) -> String { @@ -157,7 +157,7 @@ where store: &Store, parameters: B::P, ) { - crate::util::bencher::DAGHandleBencher::new(|| { + crate::util::bencher::DagHandleBencher::new(|| { B::build(self.batch, columns, ranges, store, parameters) }) .bench(b); diff --git 
a/tests/benches/misc/coprocessor/codec/chunk/mod.rs b/tests/benches/misc/coprocessor/codec/chunk/mod.rs index 84e524031d5..f956e2cb14e 100644 --- a/tests/benches/misc/coprocessor/codec/chunk/mod.rs +++ b/tests/benches/misc/coprocessor/codec/chunk/mod.rs @@ -22,7 +22,7 @@ fn bench_encode_chunk(b: &mut Bencher) { FieldTypeTp::VarChar.into(), FieldTypeTp::VarChar.into(), FieldTypeTp::NewDecimal.into(), - FieldTypeTp::JSON.into(), + FieldTypeTp::Json.into(), ]; let mut chunk = Chunk::new(&fields, rows); for row_id in 0..rows { diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 10192db7bf0..4371e8999ce 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -25,7 +25,7 @@ use txn_types::{Key, Lock, LockType}; fn test_deadline() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("deadline_check_fail", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -39,7 +39,7 @@ fn test_deadline_2() { // beginning. 
let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("rockskv_async_snapshot", "panic").unwrap(); fail::cfg("deadline_check_fail", "return()").unwrap(); @@ -68,7 +68,7 @@ fn test_deadline_3() { }; init_data_with_details(Context::default(), engine, &product, &data, true, &cfg) }; - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("kv_cursor_seek", "sleep(2000)").unwrap(); fail::cfg("copr_batch_initial_size", "return(1)").unwrap(); @@ -89,7 +89,7 @@ fn test_deadline_3() { fn test_parse_request_failed() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("coprocessor_parse_request", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -102,7 +102,7 @@ fn test_parse_request_failed_2() { // It should not even take any snapshots when parse failed. 
let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("rockskv_async_snapshot", "panic").unwrap(); fail::cfg("coprocessor_parse_request", "return()").unwrap(); @@ -115,7 +115,7 @@ fn test_parse_request_failed_2() { fn test_readpool_full() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("future_pool_spawn_full", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -127,7 +127,7 @@ fn test_readpool_full() { fn test_snapshot_failed() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("rockskv_async_snapshot", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -139,7 +139,7 @@ fn test_snapshot_failed() { fn test_snapshot_failed_2() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &[]); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("rockskv_async_snapshot_not_leader", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -153,7 +153,7 @@ fn test_storage_error() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); fail::cfg("kv_cursor_seek", "return()").unwrap(); let resp = handle_request(&endpoint, req); @@ -178,7 +178,7 @@ fn test_region_error_in_scan() { init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); fail::cfg("region_snapshot_seek", "return()").unwrap(); - let req = DAGSelect::from(&product).build_with(ctx, &[0]); + let req = 
DagSelect::from(&product).build_with(ctx, &[0]); let resp = handle_request(&endpoint, req); assert!( @@ -210,7 +210,7 @@ fn test_paging_scan() { exp.reverse(); } - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .paging_size(paging_size as u64) .desc(desc) .build(); @@ -278,7 +278,7 @@ fn test_paging_scan_multi_ranges() { exp.reverse(); } - let builder = DAGSelect::from(&product) + let builder = DagSelect::from(&product) .paging_size(paging_size) .desc(desc); let mut range1 = builder.key_ranges[0].clone(); @@ -334,7 +334,7 @@ fn test_paging_scan_multi_ranges() { exp.reverse(); } - let builder = DAGSelect::from(&product) + let builder = DagSelect::from(&product) .paging_size(paging_size) .desc(desc); let mut range1 = builder.key_ranges[0].clone(); @@ -409,7 +409,7 @@ fn test_read_index_lock_checking_on_follower() { ctx.set_replica_read(true); let product = ProductTable::new(); - let mut req = DAGSelect::from(&product).build(); + let mut req = DagSelect::from(&product).build(); req.set_context(ctx); req.set_start_ts(100); diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index e3603d8cbab..3014ebc3ba2 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -4,7 +4,7 @@ use std::{sync::mpsc::channel, time::Duration}; use raftstore::router::RaftStoreBlackHole; use tikv::{ - config::{ConfigController, Module, TiKvConfig}, + config::{ConfigController, Module, TikvConfig}, server::gc_worker::{GcConfig, GcTask, GcWorker}, storage::kv::TestEngineBuilder, }; @@ -21,7 +21,7 @@ fn test_gc_config_validate() { } fn setup_cfg_controller( - cfg: TiKvConfig, + cfg: TikvConfig, ) -> ( GcWorker, ConfigController, @@ -62,7 +62,7 @@ where #[allow(clippy::float_cmp)] #[test] fn test_gc_worker_config_update() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); 
cfg.validate().unwrap(); let (gc_worker, cfg_controller) = setup_cfg_controller(cfg); let scheduler = gc_worker.scheduler(); @@ -96,7 +96,7 @@ fn test_gc_worker_config_update() { #[test] #[allow(clippy::float_cmp)] fn test_change_io_limit_by_config_manager() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let (gc_worker, cfg_controller) = setup_cfg_controller(cfg); let scheduler = gc_worker.scheduler(); @@ -134,7 +134,7 @@ fn test_change_io_limit_by_config_manager() { #[allow(clippy::float_cmp)] fn test_change_io_limit_by_debugger() { // Debugger use GcWorkerConfigManager to change io limit - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let (gc_worker, _) = setup_cfg_controller(cfg); let scheduler = gc_worker.scheduler(); diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index b7496de182d..49bedd38c73 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -36,7 +36,7 @@ impl StoreAddrResolver for MockResolver { } fn setup( - cfg: TiKvConfig, + cfg: TikvConfig, ) -> ( ConfigController, WaiterMgrScheduler, @@ -95,7 +95,7 @@ where fn test_lock_manager_cfg_update() { const DEFAULT_TIMEOUT: u64 = 3000; const DEFAULT_DELAY: u64 = 100; - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.pessimistic_txn.wait_for_lock_timeout = ReadableDuration::millis(DEFAULT_TIMEOUT); cfg.pessimistic_txn.wake_up_delay_duration = ReadableDuration::millis(DEFAULT_DELAY); cfg.pessimistic_txn.pipelined = false; diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index d1b34a3a498..35d5fe23e49 100644 --- 
a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -23,7 +23,7 @@ use resource_metering::CollectorRegHandle; use tempfile::TempDir; use test_raftstore::TestPdClient; use tikv::{ - config::{ConfigController, Module, TiKvConfig}, + config::{ConfigController, Module, TikvConfig}, import::SstImporter, }; use tikv_util::{ @@ -58,7 +58,7 @@ fn create_tmp_engine(dir: &TempDir) -> Engines { } fn start_raftstore( - cfg: TiKvConfig, + cfg: TikvConfig, dir: &TempDir, ) -> ( ConfigController, @@ -142,7 +142,7 @@ where #[test] fn test_update_raftstore_config() { - let (mut config, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut config, _dir) = TikvConfig::with_tmp().unwrap(); config.validate().unwrap(); let (cfg_controller, router, _, mut system) = start_raftstore(config.clone(), &_dir); diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 2594c4ffcaf..5b9ef72b4c3 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -12,7 +12,7 @@ use raftstore::store::{fsm::create_raft_batch_system, SnapManager}; use security::SecurityManager; use tempfile::TempDir; use tikv::{ - config::{ConfigController, TiKvConfig}, + config::{ConfigController, TikvConfig}, server::{ config::{Config as ServerConfig, ServerConfigManager}, snap::{Runner as SnapHandler, Task as SnapTask}, @@ -24,7 +24,7 @@ use tikv_util::{ }; fn start_server( - cfg: TiKvConfig, + cfg: TikvConfig, dir: &TempDir, ) -> (ConfigController, LazyWorker, SnapManager) { let snap_mgr = { @@ -85,7 +85,7 @@ where #[test] fn test_update_server_config() { - let (mut config, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut config, _dir) = TikvConfig::with_tmp().unwrap(); config.validate().unwrap(); let (cfg_controller, snap_worker, snap_mgr) = start_server(config.clone(), &_dir); let mut svr_cfg = config.server.clone(); diff --git 
a/tests/integrations/config/dynamic/split_check.rs b/tests/integrations/config/dynamic/split_check.rs index 582ce8f115e..eb9b1a63986 100644 --- a/tests/integrations/config/dynamic/split_check.rs +++ b/tests/integrations/config/dynamic/split_check.rs @@ -15,7 +15,7 @@ use raftstore::{ }, store::{SplitCheckRunner as Runner, SplitCheckTask as Task}, }; -use tikv::config::{ConfigController, Module, TiKvConfig}; +use tikv::config::{ConfigController, Module, TikvConfig}; use tikv_util::worker::{LazyWorker, Scheduler, Worker}; fn tmp_engine>(path: P) -> RocksEngine { @@ -26,7 +26,7 @@ fn tmp_engine>(path: P) -> RocksEngine { .unwrap() } -fn setup(cfg: TiKvConfig, engine: RocksEngine) -> (ConfigController, LazyWorker) { +fn setup(cfg: TikvConfig, engine: RocksEngine) -> (ConfigController, LazyWorker) { let (router, _) = sync_channel(1); let runner = Runner::new( engine, @@ -62,7 +62,7 @@ where #[test] fn test_update_split_check_config() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let engine = tmp_engine(&cfg.storage.data_dir); let (cfg_controller, mut worker) = setup(cfg.clone(), engine); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 98bb55625fa..b8899a1de4f 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -42,7 +42,7 @@ mod test_config_client; #[test] fn test_toml_serde() { - let value = TiKvConfig::default(); + let value = TikvConfig::default(); let dump = toml::to_string_pretty(&value).unwrap(); let load = toml::from_str(&dump).unwrap(); assert_eq!(value, load); @@ -62,7 +62,7 @@ fn read_file_in_project_dir(path: &str) -> String { #[test] fn test_serde_custom_tikv_config() { - let mut value = TiKvConfig::default(); + let mut value = TikvConfig::default(); value.log_rotation_timespan = ReadableDuration::days(1); value.log.level = Level::Critical.into(); value.log.file.filename = "foo".to_owned(); 
@@ -808,7 +808,7 @@ fn test_serde_custom_tikv_config() { } } -fn diff_config(lhs: &TiKvConfig, rhs: &TiKvConfig) { +fn diff_config(lhs: &TikvConfig, rhs: &TikvConfig) { let lhs_str = format!("{:?}", lhs); let rhs_str = format!("{:?}", rhs); @@ -840,12 +840,12 @@ fn diff_config(lhs: &TiKvConfig, rhs: &TiKvConfig) { #[test] fn test_serde_default_config() { - let cfg: TiKvConfig = toml::from_str("").unwrap(); - assert_eq!(cfg, TiKvConfig::default()); + let cfg: TikvConfig = toml::from_str("").unwrap(); + assert_eq!(cfg, TikvConfig::default()); let content = read_file_in_project_dir("integrations/config/test-default.toml"); - let cfg: TiKvConfig = toml::from_str(&content).unwrap(); - assert_eq!(cfg, TiKvConfig::default()); + let cfg: TikvConfig = toml::from_str(&content).unwrap(); + assert_eq!(cfg, TikvConfig::default()); } #[test] @@ -854,8 +854,8 @@ fn test_readpool_default_config() { [readpool.unified] max-thread-count = 1 "#; - let cfg: TiKvConfig = toml::from_str(content).unwrap(); - let mut expected = TiKvConfig::default(); + let cfg: TikvConfig = toml::from_str(content).unwrap(); + let mut expected = TikvConfig::default(); expected.readpool.unified.max_thread_count = 1; assert_eq!(cfg, expected); } @@ -869,14 +869,14 @@ fn test_do_not_use_unified_readpool_with_legacy_config() { [readpool.coprocessor] normal-concurrency = 1 "#; - let cfg: TiKvConfig = toml::from_str(content).unwrap(); + let cfg: TikvConfig = toml::from_str(content).unwrap(); assert!(!cfg.readpool.is_unified_pool_enabled()); } #[test] fn test_block_cache_backward_compatible() { let content = read_file_in_project_dir("integrations/config/test-cache-compatible.toml"); - let mut cfg: TiKvConfig = toml::from_str(&content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(&content).unwrap(); assert!(cfg.storage.block_cache.shared); assert!(cfg.storage.block_cache.capacity.is_none()); cfg.compatible_adjust(); @@ -893,7 +893,7 @@ fn test_block_cache_backward_compatible() { #[test] fn 
test_log_backward_compatible() { let content = read_file_in_project_dir("integrations/config/test-log-compatible.toml"); - let mut cfg: TiKvConfig = toml::from_str(&content).unwrap(); + let mut cfg: TikvConfig = toml::from_str(&content).unwrap(); assert_eq!(cfg.log.level, slog::Level::Info.into()); assert_eq!(cfg.log.file.filename, ""); assert_eq!(cfg.log.format, LogFormat::Text); diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index 4ceb5d3affc..6faa68f3932 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -19,7 +19,7 @@ fn change(name: &str, value: &str) -> HashMap { #[test] fn test_update_config() { - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let cfg_controller = ConfigController::new(cfg); let mut cfg = cfg_controller.get_current(); @@ -68,7 +68,7 @@ fn test_dispatch_change() { } } - let (mut cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); cfg.validate().unwrap(); let cfg_controller = ConfigController::new(cfg); let mut cfg = cfg_controller.get_current(); @@ -89,7 +89,7 @@ fn test_dispatch_change() { #[test] fn test_write_update_to_file() { - let (mut cfg, tmp_dir) = TiKvConfig::with_tmp().unwrap(); + let (mut cfg, tmp_dir) = TikvConfig::with_tmp().unwrap(); cfg.cfg_path = tmp_dir.path().join("cfg_file").to_str().unwrap().to_owned(); { let c = r#" @@ -201,7 +201,7 @@ fn test_update_from_toml_file() { } } - let (cfg, _dir) = TiKvConfig::with_tmp().unwrap(); + let (cfg, _dir) = TikvConfig::with_tmp().unwrap(); let cfg_controller = ConfigController::new(cfg); let cfg = cfg_controller.get_current(); let mgr = CfgManager(Arc::new(Mutex::new(cfg.raft_store.clone()))); diff --git a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index 
3e08cfd22e9..db96393c860 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -13,7 +13,7 @@ use tidb_query_common::storage::{ Range, }; use tikv::{ - coprocessor::{dag::TiKvStorage, *}, + coprocessor::{dag::TikvStorage, *}, storage::{Engine, SnapshotStore}, }; use tipb::{ChecksumAlgorithm, ChecksumRequest, ChecksumResponse, ChecksumScanOn}; @@ -79,7 +79,7 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> false, ); let mut scanner = RangesScanner::new(RangesScannerOptions { - storage: TiKvStorage::new(store, false), + storage: TikvStorage::new(store, false), ranges: vec![Range::from_pb_range(range, false)], scan_backward_in_range: true, is_key_only: false, diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 024ebddbdea..660e88905e4 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -63,7 +63,7 @@ fn test_select() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag selection - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { @@ -97,7 +97,7 @@ fn test_batch_row_limit() { }; // for dag selection - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); check_chunk_datum_count(resp.get_chunks(), chunk_datum_limit); let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); @@ -132,7 +132,7 @@ fn test_stream_batch_row_limit() { init_data_with_details(Context::default(), engine, &product, &data, true, &cfg) }; - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); 
assert_eq!(req.get_ranges().len(), 1); // only ignore first 7 bytes of the row id @@ -203,7 +203,7 @@ fn test_select_after_lease() { // Sleep until the leader lease is expired. thread::sleep(cluster.cfg.raft_store.raft_store_max_leader_lease.0); - let req = DAGSelect::from(&product).build_with(ctx, &[0]); + let req = DagSelect::from(&product).build_with(ctx, &[0]); let mut resp = handle_select(&endpoint, req); let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { @@ -236,8 +236,8 @@ fn test_scan_detail() { }; let reqs = vec![ - DAGSelect::from(&product).build(), - DAGSelect::from_index(&product, &product["name"]).build(), + DagSelect::from(&product).build(), + DagSelect::from_index(&product, &product["name"]).build(), ]; for mut req in reqs { @@ -272,7 +272,7 @@ fn test_group_by() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0])) .build(); @@ -314,7 +314,7 @@ fn test_aggr_count() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .count(&product["count"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -344,7 +344,7 @@ fn test_aggr_count() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .count(&product["id"]) .group_by(&[&product["name"], &product["count"]]) .build(); @@ -392,7 +392,7 @@ fn test_aggr_first() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .first(&product["id"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -423,7 +423,7 @@ fn test_aggr_first() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .first(&product["name"]) .group_by(&[&product["count"]]) .output_offsets(Some(vec![0, 1])) @@ -476,7 
+476,7 @@ fn test_aggr_avg() { (Datum::Bytes(b"name:5".to_vec()), (Datum::Dec(8.into()), 2)), ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .avg(&product["count"]) .group_by(&[&product["name"]]) .build(); @@ -518,7 +518,7 @@ fn test_aggr_sum() { (Datum::Bytes(b"name:5".to_vec()), 8), ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .sum(&product["count"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -586,7 +586,7 @@ fn test_aggr_extre() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .max(&product["count"]) .min(&product["count"]) .group_by(&[&product["name"]]) @@ -662,7 +662,7 @@ fn test_aggr_bit_ops() { ]; // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .bit_and(&product["count"]) .bit_or(&product["count"]) .bit_xor(&product["count"]) @@ -709,7 +709,7 @@ fn test_order_by_column() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .order_by(&product["count"], true) .order_by(&product["name"], false) .limit(5) @@ -747,7 +747,7 @@ fn test_order_by_pk_with_select_from_index() { let (_, endpoint) = init_with_data(&product, &data); let expect: Vec<_> = data.drain(..5).collect(); // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .order_by(&product["id"], true) .limit(5) .build(); @@ -783,7 +783,7 @@ fn test_limit() { let (_, endpoint) = init_with_data(&product, &data); let expect: Vec<_> = data.drain(..5).collect(); // for dag - let req = DAGSelect::from(&product).limit(5).build(); + let req = DagSelect::from(&product).limit(5).build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); @@ 
-817,7 +817,7 @@ fn test_reverse() { data.reverse(); let expect: Vec<_> = data.drain(..5).collect(); // for dag - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .limit(5) .order_by(&product["id"], true) .build(); @@ -852,7 +852,7 @@ fn test_index() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from_index(&product, &product["id"]).build(); + let req = DagSelect::from_index(&product, &product["id"]).build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 1); @@ -882,7 +882,7 @@ fn test_index_reverse_limit() { data.reverse(); let expect: Vec<_> = data.drain(..5).collect(); // for dag - let req = DAGSelect::from_index(&product, &product["id"]) + let req = DagSelect::from_index(&product, &product["id"]) .limit(5) .order_by(&product["id"], true) .build(); @@ -914,7 +914,7 @@ fn test_limit_oom() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from_index(&product, &product["id"]) + let req = DagSelect::from_index(&product, &product["id"]) .limit(100000000) .build(); let mut resp = handle_select(&endpoint, req); @@ -953,7 +953,7 @@ fn test_del_select() { store.commit(); // for dag - let mut req = DAGSelect::from_index(&product, &product["id"]).build(); + let mut req = DagSelect::from_index(&product, &product["id"]).build(); req.mut_context().set_record_scan_stat(true); let resp = handle_request(&endpoint, req); @@ -985,7 +985,7 @@ fn test_index_group_by() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0])) .build(); @@ -1020,7 +1020,7 @@ fn test_index_aggr_count() { let product = 
ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .count(&product["id"]) .output_offsets(Some(vec![0])) .build(); @@ -1045,7 +1045,7 @@ fn test_index_aggr_count() { (Datum::Bytes(b"name:5".to_vec()), 2), ]; // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .count(&product["id"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -1073,7 +1073,7 @@ fn test_index_aggr_count() { (vec![Datum::Bytes(b"name:3".to_vec()), Datum::I64(3)], 1), (vec![Datum::Bytes(b"name:5".to_vec()), Datum::I64(4)], 2), ]; - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .count(&product["id"]) .group_by(&[&product["name"], &product["count"]]) .build(); @@ -1116,7 +1116,7 @@ fn test_index_aggr_first() { (Datum::Bytes(b"name:5".to_vec()), 5), ]; // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .first(&product["id"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -1175,7 +1175,7 @@ fn test_index_aggr_avg() { (Datum::Bytes(b"name:5".to_vec()), (Datum::Dec(8.into()), 2)), ]; // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .avg(&product["count"]) .group_by(&[&product["name"]]) .build(); @@ -1217,7 +1217,7 @@ fn test_index_aggr_sum() { (Datum::Bytes(b"name:5".to_vec()), 8), ]; // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .sum(&product["count"]) .group_by(&[&product["name"]]) .output_offsets(Some(vec![0, 1])) @@ -1284,7 +1284,7 @@ fn test_index_aggr_extre() { 
(Datum::Bytes(b"name:6".to_vec()), Datum::Null, Datum::Null), ]; // for dag - let req = DAGSelect::from_index(&product, &product["name"]) + let req = DagSelect::from_index(&product, &product["name"]) .max(&product["count"]) .min(&product["count"]) .group_by(&[&product["name"]]) @@ -1357,7 +1357,7 @@ fn test_where() { cond }; - let req = DAGSelect::from(&product).where_expr(cond).build(); + let req = DagSelect::from(&product).where_expr(cond).build(); let mut resp = handle_select(&endpoint, req); let mut spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); let row = spliter.next().unwrap(); @@ -1489,7 +1489,7 @@ fn test_handle_truncate() { for cond in cases { // Ignore truncate error. - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .where_expr(cond.clone()) .build_with(Context::default(), &[FLAG_IGNORE_TRUNCATE]); let resp = handle_select(&endpoint, req); @@ -1497,7 +1497,7 @@ fn test_handle_truncate() { assert!(resp.get_warnings().is_empty()); // truncate as warning - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .where_expr(cond.clone()) .build_with(Context::default(), &[FLAG_TRUNCATE_AS_WARNING]); let mut resp = handle_select(&endpoint, req); @@ -1518,7 +1518,7 @@ fn test_handle_truncate() { assert_eq!(spliter.next().is_none(), true); // Do NOT ignore truncate error. 
- let req = DAGSelect::from(&product).where_expr(cond.clone()).build(); + let req = DagSelect::from(&product).where_expr(cond.clone()).build(); let resp = handle_select(&endpoint, req); assert!(resp.has_error()); assert!(resp.get_warnings().is_empty()); @@ -1551,7 +1551,7 @@ fn test_default_val() { let (_, endpoint) = init_with_data(&product, &data); let expect: Vec<_> = data.drain(..5).collect(); - let req = DAGSelect::from(&tbl).limit(5).build(); + let req = DagSelect::from(&tbl).limit(5).build(); let mut resp = handle_select(&endpoint, req); let mut row_count = 0; let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 4); @@ -1581,7 +1581,7 @@ fn test_output_offsets() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - let req = DAGSelect::from(&product) + let req = DagSelect::from(&product) .output_offsets(Some(vec![1])) .build(); let mut resp = handle_select(&endpoint, req); @@ -1607,7 +1607,7 @@ fn test_key_is_locked_for_primary() { let product = ProductTable::new(); let (_, endpoint) = init_data_with_commit(&product, &data, false); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); let resp = handle_request(&endpoint, req); assert!(resp.get_data().is_empty(), "{:?}", resp); assert!(resp.has_locked(), "{:?}", resp); @@ -1625,7 +1625,7 @@ fn test_key_is_locked_for_index() { let product = ProductTable::new(); let (_, endpoint) = init_data_with_commit(&product, &data, false); - let req = DAGSelect::from_index(&product, &product["name"]).build(); + let req = DagSelect::from_index(&product, &product["name"]).build(); let resp = handle_request(&endpoint, req); assert!(resp.get_data().is_empty(), "{:?}", resp); assert!(resp.has_locked(), "{:?}", resp); @@ -1643,7 +1643,7 @@ fn test_output_counts() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - let req = DAGSelect::from(&product).build(); + let req = DagSelect::from(&product).build(); 
let resp = handle_select(&endpoint, req); assert_eq!(resp.get_output_counts(), &[data.len() as i64]); } @@ -1663,7 +1663,7 @@ fn test_exec_details() { let flags = &[0]; let ctx = Context::default(); - let req = DAGSelect::from(&product).build_with(ctx, flags); + let req = DagSelect::from(&product).build_with(ctx, flags); let resp = handle_request(&endpoint, req); assert!(resp.has_exec_details()); let exec_details = resp.get_exec_details(); @@ -1687,7 +1687,7 @@ fn test_invalid_range() { let product = ProductTable::new(); let (_, endpoint) = init_with_data(&product, &data); - let mut select = DAGSelect::from(&product); + let mut select = DagSelect::from(&product); select.key_ranges[0].set_start(b"xxx".to_vec()); select.key_ranges[0].set_end(b"zzz".to_vec()); let req = select.build(); @@ -1703,7 +1703,7 @@ fn test_snapshot_failed() { let (_, endpoint) = init_data_with_engine_and_commit(ctx, raft_engine, &product, &[], true); // Use an invalid context to make errors. - let req = DAGSelect::from(&product).build_with(Context::default(), &[0]); + let req = DagSelect::from(&product).build_with(Context::default(), &[0]); let resp = handle_request(&endpoint, req); assert!(resp.get_region_error().has_store_not_match()); @@ -1724,7 +1724,7 @@ fn test_cache() { let (_, endpoint) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); - let req = DAGSelect::from(&product).build_with(ctx, &[0]); + let req = DagSelect::from(&product).build_with(ctx, &[0]); let resp = handle_request(&endpoint, req.clone()); assert!(!resp.get_is_cache_hit()); @@ -1839,7 +1839,7 @@ fn test_copr_bypass_or_access_locks() { // DAG { - let mut req = DAGSelect::from(&product).build_with(ctx.clone(), &[0]); + let mut req = DagSelect::from(&product).build_with(ctx.clone(), &[0]); req.set_start_ts(read_ts.into_inner()); req.set_ranges(ranges.clone().into()); @@ -1944,7 +1944,7 @@ fn test_rc_read() { ctx.set_isolation_level(IsolationLevel::Rc); let ranges = 
vec![product.get_record_range(1, 4)]; - let mut req = DAGSelect::from(&product).build_with(ctx.clone(), &[0]); + let mut req = DagSelect::from(&product).build_with(ctx.clone(), &[0]); req.set_start_ts(u64::MAX - 1); req.set_ranges(ranges.into()); @@ -1973,7 +1973,7 @@ fn test_buckets() { let (_, endpoint) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &[], true); - let req = DAGSelect::from(&product).build_with(ctx, &[0]); + let req = DagSelect::from(&product).build_with(ctx, &[0]); let resp = handle_request(&endpoint, req.clone()); assert_eq!(resp.get_latest_buckets_version(), 0); diff --git a/tests/integrations/import/test_sst_service.rs b/tests/integrations/import/test_sst_service.rs index 0174d0ef53f..a47c817d2af 100644 --- a/tests/integrations/import/test_sst_service.rs +++ b/tests/integrations/import/test_sst_service.rs @@ -5,7 +5,7 @@ use kvproto::{import_sstpb::*, kvrpcpb::Context, tikvpb::*}; use pd_client::PdClient; use tempfile::Builder; use test_sst_importer::*; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; use super::util::*; @@ -84,7 +84,7 @@ fn test_write_and_ingest_with_tde() { #[test] fn test_ingest_sst() { - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); cfg.server.grpc_concurrency = 1; let (_cluster, ctx, _tikv, import) = open_cluster_and_tikv_import_client(Some(cfg)); diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index 363e3292ec6..e757e7685ba 100644 --- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -7,13 +7,13 @@ use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; use kvproto::{import_sstpb::*, kvrpcpb::*, tikvpb::*}; use security::SecurityConfig; use test_raftstore::*; -use tikv::config::TiKvConfig; +use tikv::config::TikvConfig; use tikv_util::HandyRwLock; use uuid::Uuid; const CLEANUP_SST_MILLIS: u64 = 10; -pub fn new_cluster(cfg: TiKvConfig) -> (Cluster, Context) { +pub fn new_cluster(cfg: 
TikvConfig) -> (Cluster, Context) { let count = 1; let mut cluster = new_server_cluster(0, count); cluster.cfg = Config { @@ -34,10 +34,10 @@ pub fn new_cluster(cfg: TiKvConfig) -> (Cluster, Context) { } pub fn open_cluster_and_tikv_import_client( - cfg: Option, + cfg: Option, ) -> (Cluster, Context, TikvClient, ImportSstClient) { let cfg = cfg.unwrap_or_else(|| { - let mut config = TiKvConfig::default(); + let mut config = TikvConfig::default(); config.server.addr = "127.0.0.1:0".to_owned(); let cleanup_interval = Duration::from_millis(CLEANUP_SST_MILLIS); config.raft_store.cleanup_import_sst_interval.0 = cleanup_interval; @@ -84,7 +84,7 @@ pub fn new_cluster_and_tikv_import_client_tde() -> ( let encryption_cfg = test_util::new_file_security_config(&tmp_dir); let mut security = test_util::new_security_cfg(None); security.encryption = encryption_cfg; - let mut config = TiKvConfig::default(); + let mut config = TikvConfig::default(); config.server.addr = "127.0.0.1:0".to_owned(); let cleanup_interval = Duration::from_millis(CLEANUP_SST_MILLIS); config.raft_store.cleanup_import_sst_interval.0 = cleanup_interval; diff --git a/tests/integrations/resource_metering/test_cpu.rs b/tests/integrations/resource_metering/test_cpu.rs index abbfcdf3d17..9ead51f5ef5 100644 --- a/tests/integrations/resource_metering/test_cpu.rs +++ b/tests/integrations/resource_metering/test_cpu.rs @@ -12,7 +12,7 @@ use std::{ use concurrency_manager::ConcurrencyManager; use futures::{executor::block_on, StreamExt}; use kvproto::kvrpcpb::Context; -use test_coprocessor::{DAGSelect, Insert, ProductTable, Store}; +use test_coprocessor::{DagSelect, Insert, ProductTable, Store}; use tidb_query_datatype::codec::Datum; use tikv::{ config::CoprReadPoolConfig, @@ -92,7 +92,7 @@ pub fn test_reschedule_coprocessor() { insert.execute(); store.commit(); - let mut req = DAGSelect::from(&table).build(); + let mut req = DagSelect::from(&table).build(); let mut ctx = Context::default(); 
ctx.set_resource_group_tag(tag.as_bytes().to_vec()); req.set_context(ctx); diff --git a/tests/integrations/resource_metering/test_read_keys.rs b/tests/integrations/resource_metering/test_read_keys.rs index d5306ef21f5..87ad50024ad 100644 --- a/tests/integrations/resource_metering/test_read_keys.rs +++ b/tests/integrations/resource_metering/test_read_keys.rs @@ -8,7 +8,7 @@ use grpcio::{ChannelBuilder, Environment}; use kvproto::{coprocessor, kvrpcpb::*, resource_usage_agent::ResourceUsageRecord, tikvpb::*}; use protobuf::Message; use resource_metering::ResourceTagFactory; -use test_coprocessor::{DAGSelect, ProductTable, Store}; +use test_coprocessor::{DagSelect, ProductTable, Store}; use test_raftstore::*; use test_util::alloc_port; use tidb_query_datatype::codec::Datum; @@ -202,7 +202,7 @@ fn test_read_keys_coprocessor() { .unwrap(); // Do DAG select to register runtime thread. - let mut req = DAGSelect::from(&product).build(); + let mut req = DagSelect::from(&product).build(); let mut ctx = Context::default(); ctx.set_resource_group_tag("TEST-TAG".into()); req.set_context(ctx); diff --git a/tests/integrations/resource_metering/test_suite/mod.rs b/tests/integrations/resource_metering/test_suite/mod.rs index 88ffa9494ab..667c86d230a 100644 --- a/tests/integrations/resource_metering/test_suite/mod.rs +++ b/tests/integrations/resource_metering/test_suite/mod.rs @@ -19,7 +19,7 @@ use resource_metering::{Config, ResourceTagFactory}; use tempfile::TempDir; use test_util::alloc_port; use tikv::{ - config::{ConfigController, TiKvConfig}, + config::{ConfigController, TikvConfig}, storage::{ lock_manager::DummyLockManager, RocksEngine, StorageApiV1, TestEngineBuilder, TestStorageBuilderApiV1, @@ -50,7 +50,7 @@ pub struct TestSuite { impl TestSuite { pub fn new(cfg: resource_metering::Config) -> Self { - let (mut tikv_cfg, dir) = TiKvConfig::with_tmp().unwrap(); + let (mut tikv_cfg, dir) = TikvConfig::with_tmp().unwrap(); tikv_cfg.resource_metering = cfg.clone(); let 
cfg_controller = ConfigController::new(tikv_cfg); diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index b0c95eb9f7a..f5e642f161b 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -20,7 +20,7 @@ use raftstore::store::{apply_sst_cf_file, build_sst_cf_file_list, CfFile, Region use tempfile::Builder; use test_raftstore::*; use tikv::{ - config::TiKvConfig, + config::TikvConfig, storage::{mvcc::ScannerBuilder, txn::Scanner}, }; use tikv_util::{ @@ -148,7 +148,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); // Set configs and create engines - let mut cfg = TiKvConfig::default(); + let mut cfg = TikvConfig::default(); let cache = cfg.storage.block_cache.build_shared_cache(); cfg.rocksdb.titan.enabled = true; cfg.rocksdb.titan.disable_gc = true; From 71caf10cb7c35a3dcfeea945d3bc6437e3a314b2 Mon Sep 17 00:00:00 2001 From: Jarvis Date: Thu, 4 Aug 2022 17:56:07 +0800 Subject: [PATCH 135/676] encryption: Set Iv to empty if using plaintext encryption (#13083) close tikv/tikv#13081 Using empty IV for plaintext encryption Signed-off-by: Jarvis Zheng Co-authored-by: Xinye Tao --- components/encryption/src/manager/mod.rs | 114 ++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 58a3a7a66e5..fb6b2312027 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -194,7 +194,11 @@ impl Dicts { fn new_file(&self, fname: &str, method: EncryptionMethod) -> Result { let mut file_dict_file = self.file_dict_file.lock().unwrap(); - let iv = Iv::new_ctr(); + let iv = if method != EncryptionMethod::Plaintext { + Iv::new_ctr() + } else { + Iv::Empty + }; let file = FileInfo { iv: iv.as_slice().to_vec(), key_id: self.current_key_id.load(Ordering::SeqCst), @@ -1336,4 +1340,112 @@ mod tests { assert_eq!(buffer, 
content); } } + + fn generate_mock_file>(dkm: Option<&DataKeyManager>, path: P, content: &String) { + use std::io::Write; + match dkm { + Some(manager) => { + // Encryption enabled. Use DataKeyManager to manage file. + let mut f = manager.create_file_for_write(&path).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + None => { + // Encryption disabled. Write content in plaintext. + let mut f = File::create(&path).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + f.sync_all().unwrap(); + } + } + } + + fn check_mock_file_content>( + dkm: Option<&DataKeyManager>, + path: P, + expected: &String, + ) { + use std::io::Read; + + match dkm { + Some(manager) => { + let mut buffer = String::new(); + let mut f = manager.open_file_for_read(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), expected.len()); + assert_eq!(buffer, expected.to_string()); + } + None => { + let mut buffer = String::new(); + let mut f = File::open(&path).unwrap(); + assert_eq!(f.read_to_string(&mut buffer).unwrap(), expected.len()); + assert_eq!(buffer, expected.to_string()); + } + } + } + + fn test_change_method(from: EncryptionMethod, to: EncryptionMethod) { + if from == to { + return; + } + + let generate_file_name = |method| format!("{:?}", method); + let generate_file_content = |method| format!("Encrypted with {:?}", method); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let (key_path, _tmp_key_dir) = create_key_file("key"); + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let previous = new_mock_backend() as Box; + let path_to_file1 = tmp_dir.path().join(generate_file_name(from)); + let content1 = generate_file_content(from); + + if from == EncryptionMethod::Plaintext { + // encryption not enabled. 
+ let mut args = def_data_key_args(&tmp_dir); + args.method = EncryptionMethod::Plaintext; + let manager = + DataKeyManager::new(master_key_backend, Box::new(move || Ok(previous)), args) + .unwrap(); + assert!(manager.is_none()); + generate_mock_file(None, &path_to_file1, &content1); + check_mock_file_content(None, &path_to_file1, &content1); + } else { + let manager = + new_key_manager(&tmp_dir, Some(from), master_key_backend, previous).unwrap(); + + generate_mock_file(Some(&manager), &path_to_file1, &content1); + check_mock_file_content(Some(&manager), &path_to_file1, &content1); + // Close old manager + drop(manager); + } + + // re-open with new encryption/plaintext algorithm. + let master_key_backend = + Box::new(FileBackend::new(key_path.as_path()).unwrap()) as Box; + let previous = new_mock_backend() as Box; + let manager = new_key_manager(&tmp_dir, Some(to), master_key_backend, previous).unwrap(); + let path_to_file2 = tmp_dir.path().join(generate_file_name(to)); + + let content2 = generate_file_content(to); + generate_mock_file(Some(&manager), &path_to_file2, &content2); + check_mock_file_content(Some(&manager), &path_to_file2, &content2); + // check old file content + check_mock_file_content(Some(&manager), &path_to_file1, &content1); + } + + #[test] + fn test_encryption_algorithm_switch() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + + let method_list = [ + EncryptionMethod::Plaintext, + EncryptionMethod::Aes128Ctr, + EncryptionMethod::Aes192Ctr, + EncryptionMethod::Aes256Ctr, + EncryptionMethod::Sm4Ctr, + ]; + for from in method_list { + for to in method_list { + test_change_method(from, to) + } + } + } } From 25b45d939800089cdaf4c19c2d54e3420e75038e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boqin=20Qin=28=E7=A7=A6=20=E4=BC=AF=E9=92=A6=29?= Date: Thu, 4 Aug 2022 20:26:06 +0800 Subject: [PATCH 136/676] components/pd_client: fix double-read-lock in client (#13188) close tikv/tikv#12933 Signed-off-by: Burton Qin Co-authored-by: Shirly Co-authored-by: Ti 
Chi Robot --- components/pd_client/src/client.rs | 193 ++++++++++++++++------------- 1 file changed, 107 insertions(+), 86 deletions(-) diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 173b25357c4..04fd6350ca1 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -37,7 +37,7 @@ use yatp::{task::future::TaskCell, ThreadPool}; use super::{ metrics::*, - util::{check_resp_header, sync_request, Client, PdConnector}, + util::{check_resp_header, sync_request, Client, Inner, PdConnector}, BucketStat, Config, Error, FeatureGate, PdClient, PdFuture, RegionInfo, RegionStat, Result, UnixSecs, REQUEST_TIMEOUT, }; @@ -192,9 +192,12 @@ impl RpcClient { /// Creates a new call option with default request timeout. #[inline] pub fn call_option(client: &Client) -> CallOption { - client - .inner - .rl() + Self::call_option_inner(&client.inner.rl()) + } + + #[inline] + fn call_option_inner(inner: &Inner) -> CallOption { + inner .target_info() .call_option() .timeout(Duration::from_secs(REQUEST_TIMEOUT)) @@ -214,14 +217,15 @@ impl RpcClient { req.set_region_key(key.to_vec()); let executor = move |client: &Client, req: pdpb::GetRegionRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) + }) + }; Box::pin(async move { let mut resp = handler.await?; @@ -253,12 +257,15 @@ impl RpcClient { req.set_store_id(store_id); let executor = move |client: &Client, req: pdpb::GetStoreRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_store_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| 
panic!("fail to request PD {} err {:?}", "get_store_async", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_store_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_store_async", e) + }) + }; Box::pin(async move { let mut resp = handler.await?; @@ -496,14 +503,15 @@ impl PdClient for RpcClient { req.set_region_id(region_id); let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_by_id_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_by_id_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + }; Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -534,14 +542,15 @@ impl PdClient for RpcClient { req.set_region_id(region_id); let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = client - .inner - .rl() - .client_stub - .get_region_by_id_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_by_id_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e) + }) + }; Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -671,12 +680,13 @@ impl PdClient for RpcClient { req.set_region(region); let executor = move |client: &Client, req: pdpb::AskSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .ask_split_async_opt(&req, Self::call_option(client)) - 
.unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_split", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .ask_split_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_split", e)) + }; Box::pin(async move { let resp = handler.await?; @@ -706,12 +716,15 @@ impl PdClient for RpcClient { req.set_split_count(count as u32); let executor = move |client: &Client, req: pdpb::AskBatchSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .ask_batch_split_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_batch_split", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .ask_batch_split_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_batch_split", e) + }) + }; Box::pin(async move { let resp = handler.await?; @@ -750,12 +763,15 @@ impl PdClient for RpcClient { } let executor = move |client: &Client, req: pdpb::StoreHeartbeatRequest| { let feature_gate = client.feature_gate.clone(); - let handler = client - .inner - .rl() - .client_stub - .store_heartbeat_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "store_heartbeat", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .store_heartbeat_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "store_heartbeat", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -784,14 +800,15 @@ impl PdClient for RpcClient { req.set_regions(regions.into()); let executor = move |client: &Client, req: pdpb::ReportBatchSplitRequest| { - let handler = client - .inner - .rl() - .client_stub - .report_batch_split_async_opt(&req, Self::call_option(client)) - 
.unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "report_batch_split", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .report_batch_split_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_batch_split", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -837,15 +854,15 @@ impl PdClient for RpcClient { req.set_header(self.header()); let executor = move |client: &Client, req: pdpb::GetGcSafePointRequest| { - let option = Self::call_option(client); - let handler = client - .inner - .rl() - .client_stub - .get_gc_safe_point_async_opt(&req, option) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_gc_safe_point_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -925,17 +942,18 @@ impl PdClient for RpcClient { req.set_ttl(ttl.as_secs() as _); req.set_safe_point(safe_point.into_inner()); let executor = move |client: &Client, r: pdpb::UpdateServiceGcSafePointRequest| { - let handler = client - .inner - .rl() - .client_stub - .update_service_gc_safe_point_async_opt(&r, Self::call_option(client)) - .unwrap_or_else(|e| { - panic!( - "fail to request PD {} err {:?}", - "update_service_safe_point", e - ) - }); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .update_service_gc_safe_point_async_opt(&r, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "update_service_safe_point", e + ) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC @@ -963,12 +981,15 @@ impl PdClient for RpcClient { 
req.set_min_resolved_ts(min_resolved_ts); let executor = move |client: &Client, req: pdpb::ReportMinResolvedTsRequest| { - let handler = client - .inner - .rl() - .client_stub - .report_min_resolved_ts_async_opt(&req, Self::call_option(client)) - .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "min_resolved_ts", e)); + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .report_min_resolved_ts_async_opt(&req, Self::call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "min_resolved_ts", e) + }) + }; Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC From 83d17c39cd0dc3bdb3c6b1f7e6206d4b322c3a37 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 5 Aug 2022 14:52:06 +0800 Subject: [PATCH 137/676] server: raise error when bootstrap with a zero store-id (#13010) close tikv/tikv#13011 raise error when bootstrap with a zero store-id Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/pd_client/src/client.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 04fd6350ca1..ca997e473e9 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -398,7 +398,11 @@ impl PdClient for RpcClient { })?; check_resp_header(resp.get_header())?; - Ok(resp.get_id()) + let id = resp.get_id(); + if id == 0 { + return Err(box_err!("pd alloc weird id 0")); + } + Ok(id) } fn put_store(&self, store: metapb::Store) -> Result> { From 9df16e04376952b34936209e8476350dc273d007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boqin=20Qin=28=E7=A7=A6=20=E4=BC=AF=E9=92=A6=29?= Date: Fri, 5 Aug 2022 15:20:06 +0800 Subject: [PATCH 138/676] components/engine_test: fix double-lock in open_tablet (#13187) ref tikv/tikv#13186 Signed-off-by: Burton Qin Co-authored-by: Ti Chi Robot --- components/engine_test/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 7bdd87827e7..c3c8cc598ad 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -261,8 +261,7 @@ pub mod kv { } fn open_tablet(&self, id: u64, suffix: u64) -> Result { - let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { + if let Some(db) = self.registry.lock().unwrap().get(&(id, suffix)) { return Ok(db.clone()); } From 3800412c49479441738da7caf408bba88a92c62d Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 5 Aug 2022 15:44:06 +0800 Subject: [PATCH 139/676] server: Support default metapb::Store when register to pd (#13192) ref tikv/tikv#12849 Support default metapb::Store when register to pd Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/server/src/server.rs | 1 + components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs | 1 + src/server/node.rs | 22 +++++++++++++------ .../integrations/raftstore/test_bootstrap.rs | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 1cb6a9b3b65..4a4cadeb639 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -867,6 +867,7 @@ where self.state.clone(), self.background_worker.clone(), Some(health_service.clone()), + None, ); node.try_bootstrap_store(engines.engines.clone()) .unwrap_or_else(|e| fatal!("failed to bootstrap node id: {}", e)); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 2584d29629e..be361db3185 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -247,6 +247,7 @@ impl Simulator for NodeCluster { Arc::default(), bg_worker.clone(), None, + None, ); let (snap_mgr, snap_mgr_path) = if node_id == 0 diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 
7107c668c3d..da81606d2dd 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -497,6 +497,7 @@ impl ServerCluster { state, bg_worker.clone(), Some(health_service.clone()), + None, ); node.try_bootstrap_store(engines.clone())?; let node_id = node.id(); diff --git a/src/server/node.rs b/src/server/node.rs index 84aeb89377d..a282bcded37 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -112,8 +112,12 @@ where state: Arc>, bg_worker: Worker, health_service: Option, + default_store: Option, ) -> Node { - let mut store = metapb::Store::default(); + let mut store = match default_store { + None => metapb::Store::default(), + Some(s) => s, + }; store.set_id(INVALID_ID); if cfg.advertise_addr.is_empty() { store.set_address(cfg.addr.clone()); @@ -125,7 +129,9 @@ where } else { store.set_status_address(cfg.advertise_status_addr.clone()) } - store.set_version(env!("CARGO_PKG_VERSION").to_string()); + if store.get_version() == "" { + store.set_version(env!("CARGO_PKG_VERSION").to_string()); + } if let Ok(path) = std::env::current_exe() { if let Some(path) = path.parent() { @@ -134,11 +140,13 @@ where }; store.set_start_timestamp(chrono::Local::now().timestamp()); - store.set_git_hash( - option_env!("TIKV_BUILD_GIT_HASH") - .unwrap_or("Unknown git hash") - .to_string(), - ); + if store.get_git_hash() == "" { + store.set_git_hash( + option_env!("TIKV_BUILD_GIT_HASH") + .unwrap_or("Unknown git hash") + .to_string(), + ); + } let mut labels = Vec::new(); for (k, v) in &cfg.labels { diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 1caf4e31ea3..92e4422c57f 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -61,6 +61,7 @@ fn test_node_bootstrap_with_prepared_data() { Arc::default(), bg_worker, None, + None, ); let snap_mgr = SnapManager::new(tmp_mgr.path().to_str().unwrap()); let pd_worker = 
LazyWorker::new("test-pd-worker"); From 68397e8c7fe1842635b29675f9cc01f534fc1d3e Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 5 Aug 2022 16:46:07 +0800 Subject: [PATCH 140/676] tablet: fix the potential dead lock in open_tablet (#13165) close tikv/tikv#13213 Signed-off-by: SpadeA-Tang --- components/engine_test/src/lib.rs | 156 +++++++---- components/engine_traits/src/engine.rs | 111 ++++++-- .../raftstore-v2/src/operation/read/read.rs | 10 +- components/raftstore-v2/src/raft/peer.rs | 8 +- src/server/engine_factory.rs | 51 ++-- src/server/engine_factory_v2.rs | 251 +++++++++++++----- .../flow_controller/tablet_flow_controller.rs | 19 +- 7 files changed, 428 insertions(+), 178 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index c3c8cc598ad..18d89b1c2fb 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -91,7 +91,7 @@ pub mod kv { RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; use engine_traits::{ - CfOptions, CfOptionsExt, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + CfOptions, CfOptionsExt, OpenOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, }; use tikv_util::box_err; @@ -134,12 +134,11 @@ pub mod kv { } fn create_tablet(&self, tablet_path: &Path) -> Result { - let kv_engine = KvTestEngine::new_kv_engine_opt( + KvTestEngine::new_kv_engine_opt( tablet_path.to_str().unwrap(), self.db_opt.clone(), self.cf_opts.clone(), - )?; - Ok(kv_engine) + ) } } @@ -152,25 +151,36 @@ pub mod kv { Ok(tablet) } - fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { - let db = self.root_db.lock().unwrap(); - if let Some(cp) = db.as_ref() { - return Ok(cp.clone()); + /// See the comment above the same name method in KvEngineFactory + fn open_tablet( + &self, + _id: u64, + _suffix: Option, + options: OpenOptions, + ) -> Result { + if let Some(db) = self.root_db.lock().unwrap().as_ref() { + 
if options.create_new() { + return Err(box_err!( + "root tablet {} already exists", + db.as_inner().path() + )); + } + return Ok(db.clone()); + } else if options.create_new() || options.create() { + return self.create_shared_db(); } - self.create_shared_db() + Err(box_err!("root tablet has not been initialized")) } - fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { - self.open_tablet_raw(&self.tablet_path(0, 0), false).ok() - } - - fn open_tablet_cache_any(&self, _id: u64) -> Option { - self.open_tablet_cache(0, 0) - } - - fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { - TabletFactory::create_tablet(self, 0, 0) + fn open_tablet_raw( + &self, + _path: &Path, + _id: u64, + _suffix: u64, + _options: OpenOptions, + ) -> Result { + self.create_shared_db() } fn exists_raw(&self, _path: &Path) -> bool { @@ -243,59 +253,86 @@ pub mod kv { } impl TabletFactory for TestTabletFactoryV2 { - fn create_tablet(&self, id: u64, suffix: u64) -> Result { - let mut reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Err(box_err!( - "region {} {} already exists", - id, - db.as_inner().path() - )); - } - - let tablet_path = self.tablet_path(id, suffix); - let kv_engine = self.inner.create_tablet(&tablet_path)?; - reg.insert((id, suffix), kv_engine.clone()); - - Ok(kv_engine) - } - - fn open_tablet(&self, id: u64, suffix: u64) -> Result { - if let Some(db) = self.registry.lock().unwrap().get(&(id, suffix)) { - return Ok(db.clone()); + /// See the comment above the same name method in KvEngineFactoryV2 + fn open_tablet( + &self, + id: u64, + suffix: Option, + mut options: OpenOptions, + ) -> Result { + if options.create_new() || options.create() { + options = options.set_cache_only(false); } - let db_path = self.tablet_path(id, suffix); - let db = self.open_tablet_raw(db_path.as_path(), false)?; - Ok(db) - } - - fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { - self.registry.lock().unwrap().get(&(id, 
suffix)).cloned() - } + let mut reg = self.registry.lock().unwrap(); + if let Some(suffix) = suffix { + if let Some(tablet) = reg.get(&(id, suffix)) { + // Target tablet exist in the cache + + if options.create_new() { + return Err(box_err!( + "region {} {} already exists", + id, + tablet.as_inner().path() + )); + } + return Ok(tablet.clone()); + } else if !options.cache_only() { + let tablet_path = self.tablet_path(id, suffix); + let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; + if !options.skip_cache() { + reg.insert((id, suffix), tablet.clone()); + } + return Ok(tablet); + } + } else if options.cache_only() { + // This branch reads an arbitrary tablet with region id `id` - fn open_tablet_cache_any(&self, id: u64) -> Option { - let reg = self.registry.lock().unwrap(); - if let Some(k) = reg.keys().find(|k| k.0 == id) { - return Some(reg.get(k).unwrap().clone()); + if let Some(k) = reg.keys().find(|k| k.0 == id) { + return Ok(reg.get(k).unwrap().clone()); + } } - None - } - fn open_tablet_raw(&self, path: &Path, _readonly: bool) -> Result { - if !KvTestEngine::exists(path.to_str().unwrap_or_default()) { + Err(box_err!( + "tablet with region id {} suffix {:?} does not exist", + id, + suffix + )) + } + + fn open_tablet_raw( + &self, + path: &Path, + id: u64, + _suffix: u64, + options: OpenOptions, + ) -> Result { + let engine_exist = KvTestEngine::exists(path.to_str().unwrap_or_default()); + // Even though neither options.create nor options.create_new are true, if the + // tablet files already exists, we will open it by calling + // inner.create_tablet. In this case, the tablet exists but not in the cache + // (registry). 
+ if !options.create() && !options.create_new() && !engine_exist { return Err(box_err!( "path {} does not have db", path.to_str().unwrap_or_default() )); + }; + + if options.create_new() && engine_exist { + return Err(box_err!( + "region {} {} already exists", + id, + path.to_str().unwrap() + )); } - let (tablet_id, tablet_suffix) = get_id_and_suffix_from_path(path); - self.create_tablet(tablet_id, tablet_suffix) + + self.inner.create_tablet(path) } #[inline] fn create_shared_db(&self) -> Result { - self.create_tablet(0, 0) + self.open_tablet(0, Some(0), OpenOptions::default().set_create_new(true)) } #[inline] @@ -350,7 +387,8 @@ pub mod kv { let db_path = self.tablet_path(id, suffix); std::fs::rename(path, &db_path)?; - let new_engine = self.open_tablet_raw(db_path.as_path(), false); + let new_engine = + self.open_tablet(id, Some(suffix), OpenOptions::default().set_create(true)); if new_engine.is_ok() { let (old_id, old_suffix) = get_id_and_suffix_from_path(path); self.registry.lock().unwrap().remove(&(old_id, old_suffix)); diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 8d991f1cfeb..e59d9104e56 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -153,32 +153,89 @@ impl Drop for TabletErrorCollector { } } +/// OpenOptionsn is used for specifiying the way of opening a tablet. +#[derive(Default, Clone)] +pub struct OpenOptions { + // create tablet if non-exist + create: bool, + create_new: bool, + read_only: bool, + cache_only: bool, + skip_cache: bool, +} + +impl OpenOptions { + /// Sets the option to create a tablet, or open it if it already exists. + pub fn set_create(mut self, create: bool) -> Self { + self.create = create; + self + } + + /// Sets the option to create a new tablet, failing if it already exists. 
+ pub fn set_create_new(mut self, create_new: bool) -> Self { + self.create_new = create_new; + self + } + + /// Sets the option for read only + pub fn set_read_only(mut self, read_only: bool) -> Self { + self.read_only = read_only; + self + } + + /// Sets the option for only reading from cache. + pub fn set_cache_only(mut self, cache_only: bool) -> Self { + self.cache_only = cache_only; + self + } + + /// Sets the option to open a tablet without updating the cache. + pub fn set_skip_cache(mut self, skip_cache: bool) -> Self { + self.skip_cache = skip_cache; + self + } + + pub fn create(&self) -> bool { + self.create + } + + pub fn create_new(&self) -> bool { + self.create_new + } + + pub fn read_only(&self) -> bool { + self.read_only + } + + pub fn cache_only(&self) -> bool { + self.cache_only + } + + pub fn skip_cache(&self) -> bool { + self.skip_cache + } +} + /// A factory trait to create new engine. // It should be named as `EngineFactory` for consistency, but we are about to // rename engine to tablet, so always use tablet for new traits/types. pub trait TabletFactory: TabletAccessor { - /// Create an tablet by id and suffix. If the tablet exists, it will fail. + /// Open the tablet with id and suffix according to the OpenOptions. + /// /// The id is likely the region Id, the suffix could be the current raft log /// index. They together could specify a unique path for a region's /// tablet. The reason to have suffix is that we can keep more than one /// tablet for a region. - fn create_tablet(&self, id: u64, suffix: u64) -> Result; - - /// Open a tablet by id and suffix. If the tablet exists, it will open it. - /// If the tablet does not exist, it will create it. - fn open_tablet(&self, id: u64, suffix: u64) -> Result { - self.open_tablet_raw(&self.tablet_path(id, suffix), false) - } + fn open_tablet(&self, id: u64, suffix: Option, options: OpenOptions) -> Result; - /// Open a tablet by id and suffix from cache---that means it should already - /// be opened. 
- fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option; - - /// Open a tablet by id and any suffix from cache - fn open_tablet_cache_any(&self, id: u64) -> Option; - - /// Open tablet by path and readonly flag - fn open_tablet_raw(&self, path: &Path, readonly: bool) -> Result; + /// Open tablet by raw path without updating cache. + fn open_tablet_raw( + &self, + path: &Path, + id: u64, + suffix: u64, + options: OpenOptions, + ) -> Result; /// Create the shared db for v1 fn create_shared_db(&self) -> Result; @@ -232,23 +289,21 @@ impl TabletFactory for DummyFactory where EK: CfOptionsExt + Clone + Send + 'static, { - fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { + fn create_shared_db(&self) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } - fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { + fn open_tablet(&self, _id: u64, _suffix: Option, _options: OpenOptions) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } - fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { - Some(self.engine.as_ref().unwrap().clone()) - } - - fn open_tablet_cache_any(&self, _id: u64) -> Option { - Some(self.engine.as_ref().unwrap().clone()) - } - - fn create_shared_db(&self) -> Result { + fn open_tablet_raw( + &self, + _path: &Path, + _id: u64, + _suffix: u64, + _options: OpenOptions, + ) -> Result { Ok(self.engine.as_ref().unwrap().clone()) } diff --git a/components/raftstore-v2/src/operation/read/read.rs b/components/raftstore-v2/src/operation/read/read.rs index 63878beeb22..bc3903e12fd 100644 --- a/components/raftstore-v2/src/operation/read/read.rs +++ b/components/raftstore-v2/src/operation/read/read.rs @@ -154,7 +154,7 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, KvTestSnapshot, TestTabletFactoryV2}, }; - use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; + use engine_traits::{OpenOptions, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use kvproto::{metapb::Region, raft_cmdpb::*}; use 
raftstore::store::{ util::Lease, Callback, CasualMessage, CasualRouter, LocalReader, ProposalRouter, @@ -210,7 +210,9 @@ mod tests { meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - tablet1 = factory.create_tablet(1, 10).unwrap(); + tablet1 = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); tablet1.put_cf(CF_DEFAULT, b"a1", b"val1").unwrap(); let cache = CachedTablet::new(Some(tablet1.clone())); meta.tablet_caches.insert(1, cache); @@ -221,7 +223,9 @@ mod tests { meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data - tablet2 = factory.create_tablet(2, 10).unwrap(); + tablet2 = factory + .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); tablet2.put_cf(CF_DEFAULT, b"a2", b"val2").unwrap(); let cache = CachedTablet::new(Some(tablet2.clone())); meta.tablet_caches.insert(2, cache); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index aebb1bf7406..70dccd284fa 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raft::{RawNode, INVALID_ID}; use raftstore::store::{util::find_peer, Config}; @@ -71,7 +71,11 @@ impl Peer { )); } // TODO: Perhaps we should stop create the tablet automatically. - Some(tablet_factory.open_tablet(region_id, tablet_index)?) + Some(tablet_factory.open_tablet( + region_id, + Some(tablet_index), + OpenOptions::default().set_create(true), + )?) 
} else { None }; diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index fad5cd25ba8..968e8fa04d8 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -11,8 +11,8 @@ use engine_rocks::{ RocksEventListener, }; use engine_traits::{ - CfOptions, CfOptionsExt, CompactionJobInfo, Result, TabletAccessor, TabletFactory, CF_DEFAULT, - CF_WRITE, + CfOptions, CfOptionsExt, CompactionJobInfo, OpenOptions, Result, TabletAccessor, TabletFactory, + CF_DEFAULT, CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; @@ -227,25 +227,42 @@ impl TabletFactory for KvEngineFactory { Ok(tablet) } - fn create_tablet(&self, _id: u64, _suffix: u64) -> Result { - let db = self.inner.root_db.lock().unwrap(); - if let Some(cp) = db.as_ref() { - return Ok(cp.clone()); + /// Open the root tablet according to the OpenOptions. + /// + /// If options.create_new is true, create the root tablet. If the tablet + /// exists, it will fail. + /// + /// If options.create is true, open the the root tablet if it exists or + /// create it otherwise. 
+ fn open_tablet( + &self, + _id: u64, + _suffix: Option, + options: OpenOptions, + ) -> Result { + if let Some(db) = self.inner.root_db.lock().unwrap().as_ref() { + if options.create_new() { + return Err(box_err!( + "root tablet {} already exists", + db.as_inner().path() + )); + } + return Ok(db.clone()); + } else if options.create_new() || options.create() { + return self.create_shared_db(); } - self.create_shared_db() - } - - fn open_tablet_cache(&self, _id: u64, _suffix: u64) -> Option { - self.open_tablet_raw(&self.tablet_path(0, 0), false).ok() - } - - fn open_tablet_cache_any(&self, _id: u64) -> Option { - self.open_tablet_cache(0, 0) + Err(box_err!("root tablet has not been initialized")) } - fn open_tablet_raw(&self, _path: &Path, _readonly: bool) -> Result { - TabletFactory::create_tablet(self, 0, 0) + fn open_tablet_raw( + &self, + _path: &Path, + _id: u64, + _suffix: u64, + _options: OpenOptions, + ) -> Result { + self.create_shared_db() } fn exists_raw(&self, _path: &Path) -> bool { diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 7f3bcaafe4f..e3f57d4f244 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -7,7 +7,9 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; -use engine_traits::{CfOptions, CfOptionsExt, Result, TabletAccessor, TabletFactory, CF_DEFAULT}; +use engine_traits::{ + CfOptions, CfOptionsExt, OpenOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, +}; use crate::server::engine_factory::KvEngineFactory; @@ -40,62 +42,104 @@ fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { } impl TabletFactory for KvEngineFactoryV2 { - fn create_tablet(&self, id: u64, suffix: u64) -> Result { - let mut reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Err(box_err!( - "region {} {} already exists", - id, - db.as_inner().path() - )); - } - let tablet_path = self.tablet_path(id, suffix); - let kv_engine = 
self.inner.create_tablet(&tablet_path, id, suffix)?; - debug!("inserting tablet"; "key" => ?(id, suffix)); - reg.insert((id, suffix), kv_engine.clone()); - self.inner.on_tablet_created(id, suffix); - Ok(kv_engine) - } - - fn open_tablet(&self, id: u64, suffix: u64) -> Result { - let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Ok(db.clone()); + /// open a tablet according to the OpenOptions. + /// + /// If options.cache_only is true, only open the relevant tablet from + /// `registry`, and if suffix is None, return an arbitrary tablet with the + /// target region id if there are any. + /// + /// If options.create_new is true, create a tablet by id and suffix. If the + /// tablet exists, it will fail. + /// + /// If options.create is true, open the tablet with id and suffix if it + /// exists or create it otherwise. + /// + /// Note: options.cache_only and options.create and/or options.create_new + /// cannot be true simultaneously + fn open_tablet( + &self, + id: u64, + suffix: Option, + mut options: OpenOptions, + ) -> Result { + if options.create() || options.create_new() { + options = options.set_cache_only(false); } - let db_path = self.tablet_path(id, suffix); - let db = self.open_tablet_raw(db_path.as_path(), false)?; - debug!("open tablet"; "key" => ?(id, suffix)); - Ok(db) - } - - fn open_tablet_cache(&self, id: u64, suffix: u64) -> Option { - self.registry.lock().unwrap().get(&(id, suffix)).cloned() - } + let mut reg = self.registry.lock().unwrap(); + if let Some(suffix) = suffix { + if let Some(tablet) = reg.get(&(id, suffix)) { + // Target tablet exist in the cache + + if options.create_new() { + return Err(box_err!( + "region {} {} already exists", + id, + tablet.as_inner().path() + )); + } + return Ok(tablet.clone()); + } else if !options.cache_only() { + let tablet_path = self.tablet_path(id, suffix); + let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; + if 
!options.skip_cache() { + debug!("Insert a tablet"; "key" => ?(id, suffix)); + reg.insert((id, suffix), tablet.clone()); + } + return Ok(tablet); + } + } else if options.cache_only() { + // This branch reads an arbitrary tablet with region id `id` - fn open_tablet_cache_any(&self, id: u64) -> Option { - let reg = self.registry.lock().unwrap(); - if let Some(k) = reg.keys().find(|k| k.0 == id) { - debug!("choose a random tablet"; "key" => ?k); - return Some(reg.get(k).unwrap().clone()); + if let Some(k) = reg.keys().find(|k| k.0 == id) { + debug!("choose a random tablet"; "key" => ?k); + return Ok(reg.get(k).unwrap().clone()); + } } - None + + Err(box_err!( + "tablet with region id {} suffix {:?} does not exist", + id, + suffix + )) } - fn open_tablet_raw(&self, path: &Path, _readonly: bool) -> Result { - if !RocksEngine::exists(path.to_str().unwrap_or_default()) { + fn open_tablet_raw( + &self, + path: &Path, + id: u64, + suffix: u64, + options: OpenOptions, + ) -> Result { + let engine_exist = RocksEngine::exists(path.to_str().unwrap_or_default()); + // Even though neither options.create nor options.create_new are true, if the + // tablet files already exists, we will open it by calling + // inner.create_tablet. In this case, the tablet exists but not in the cache + // (registry). 
+ if !options.create() && !options.create_new() && !engine_exist { return Err(box_err!( "path {} does not have db", path.to_str().unwrap_or_default() )); + }; + + if options.create_new() && engine_exist { + return Err(box_err!( + "region {} {} already exists", + id, + path.to_str().unwrap() + )); } - let (tablet_id, tablet_suffix) = get_id_and_suffix_from_path(path); - self.create_tablet(tablet_id, tablet_suffix) + + let tablet = self.inner.create_tablet(path, id, suffix)?; + debug!("open tablet"; "key" => ?(id, suffix)); + self.inner.on_tablet_created(id, suffix); + Ok(tablet) } #[inline] fn create_shared_db(&self) -> Result { - self.create_tablet(0, 0) + self.open_tablet(0, Some(0), OpenOptions::default().set_create_new(true)) } #[inline] @@ -154,10 +198,10 @@ impl TabletFactory for KvEngineFactoryV2 { let db_path = self.tablet_path(id, suffix); std::fs::rename(path, &db_path)?; - let new_engine = self.open_tablet_raw(db_path.as_path(), false); + let new_engine = + self.open_tablet(id, Some(suffix), OpenOptions::default().set_create(true)); if new_engine.is_ok() { let (old_id, old_suffix) = get_id_and_suffix_from_path(path); - assert!(suffix > old_suffix); self.registry.lock().unwrap().remove(&(old_id, old_suffix)); } new_engine @@ -191,7 +235,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[cfg(test)] mod tests { - use engine_traits::{TabletFactory, CF_WRITE}; + use engine_traits::{OpenOptions, TabletFactory, CF_WRITE}; use super::*; use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; @@ -225,17 +269,24 @@ mod tests { } let factory = builder.build(); let shared_db = factory.create_shared_db().unwrap(); - let tablet = TabletFactory::create_tablet(&factory, 1, 10).unwrap(); - let tablet2 = factory.open_tablet(1, 10).unwrap(); + + // V1 can only create tablet once + factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap_err(); + + let tablet = factory + .open_tablet(1, Some(10), 
OpenOptions::default().set_create(true)) + .unwrap(); + assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap(); + assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); + let tablet = factory + .open_tablet(1, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet2 = factory.open_tablet_cache(1, 10).unwrap(); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet2 = factory.open_tablet_cache_any(1).unwrap(); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet_path = factory.tablet_path(1, 10); - let tablet2 = factory.open_tablet_raw(&tablet_path, false).unwrap(); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); let mut count = 0; factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { assert!(id == 0); @@ -266,16 +317,26 @@ mod tests { } let factory = builder.build_v2(); - let tablet = factory.create_tablet(1, 10).unwrap(); - let tablet2 = factory.open_tablet(1, 10).unwrap(); + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + let tablet2 = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) + .unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet2 = factory.open_tablet_cache(1, 10).unwrap(); + let tablet2 = factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet2 = factory.open_tablet_cache_any(1).unwrap(); + let tablet2 = factory + .open_tablet(1, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + 
let tablet_path = factory.tablet_path(1, 10); - let result = factory.open_tablet_raw(&tablet_path, false); + let result = factory.open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)); result.unwrap_err(); + factory .set_shared_block_cache_capacity(1024 * 1024) .unwrap(); @@ -292,17 +353,73 @@ mod tests { factory.load_tablet(&tablet_path, 1, 20).unwrap(); // After we load it as with the new id or suffix, we should be unable to get it // with the old id and suffix in the cache. - assert!(factory.open_tablet_cache(1, 10).is_none()); - assert!(factory.open_tablet_cache(1, 20).is_some()); + factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + factory + .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) + .unwrap(); factory.mark_tombstone(1, 20); assert!(factory.is_tombstoned(1, 20)); factory.destroy_tablet(1, 20).unwrap(); - let result = factory.open_tablet(1, 20); + + let result = factory.open_tablet(1, Some(20), OpenOptions::default()); result.unwrap_err(); + assert!(!factory.is_single_engine()); } + #[test] + fn test_existed_db_not_in_registry() { + let cfg = TEST_CONFIG.clone(); + assert!(cfg.storage.block_cache.shared); + let cache = cfg.storage.block_cache.build_shared_cache(); + let dir = test_util::temp_dir("test_kvengine_factory_v2", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + if let Some(cache) = cache { + builder = builder.block_cache(cache); + } + + let factory = builder.build_v2(); + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + drop(tablet); + let tablet = factory.registry.lock().unwrap().remove(&(1, 10)).unwrap(); + drop(tablet); + factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + + let tablet_path = factory.tablet_path(1, 10); + let tablet = factory + 
.open_tablet_raw(&tablet_path, 1, 10, OpenOptions::default()) + .unwrap(); + // the tablet will not inserted in the cache + factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + drop(tablet); + + let tablet_path = factory.tablet_path(1, 20); + // No such tablet, so error will be returned. + factory + .open_tablet_raw(&tablet_path, 1, 10, OpenOptions::default()) + .unwrap_err(); + + let _ = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) + .unwrap(); + + // Now, it should be in the cache. + factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap(); + } + #[test] fn test_get_live_tablets() { let cfg = TEST_CONFIG.clone(); @@ -311,8 +428,12 @@ mod tests { let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); let factory = builder.build_v2(); - factory.create_tablet(1, 10).unwrap(); - factory.create_tablet(2, 10).unwrap(); + factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + factory + .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); let mut count = 0; factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { assert!(id == 1 || id == 2); diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index a35517246c5..17a5900bea7 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -13,7 +13,7 @@ use std::{ use collections::HashMap; use engine_rocks::FlowInfo; -use engine_traits::{CfNamesExt, FlowControlFactorsExt, TabletFactory}; +use engine_traits::{CfNamesExt, FlowControlFactorsExt, OpenOptions, TabletFactory}; use rand::Rng; use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; @@ -117,7 +117,13 @@ impl FlowInfoDispatcher { } let insert_limiter_and_checker = |region_id, suffix| -> FlowChecker { - let 
engine = tablet_factory.open_tablet_cache(region_id, suffix).unwrap(); + let engine = tablet_factory + .open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_cache_only(true), + ) + .unwrap(); let mut v = limiters.as_ref().write().unwrap(); let discard_ratio = Arc::new(AtomicU32::new(0)); let limiter = v.entry(region_id).or_insert(( @@ -166,8 +172,13 @@ impl FlowInfoDispatcher { // if checker.suffix < suffix, it means its tablet is old and needs the // refresh if checker.tablet_suffix() < suffix { - let engine = - tablet_factory.open_tablet_cache(region_id, suffix).unwrap(); + let engine = tablet_factory + .open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_cache_only(true), + ) + .unwrap(); checker.set_engine(engine); checker.set_tablet_suffix(suffix); } From bec2627bb826b9e3c6266ec82f5aef5cdb4de7bd Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Mon, 8 Aug 2022 11:08:47 +0800 Subject: [PATCH 141/676] diagnostics: support cgroup limit memory (#13237) close tikv/tikv#13217, ref tikv/tikv#13217 support cgroup limit memory in diagnostics service Signed-off-by: Lloyd-Pottiger Co-authored-by: Lloyd-Pottiger --- src/server/service/diagnostics/sys.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 9eb88016424..f39da646ad1 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -350,7 +350,7 @@ fn mem_hardware_info(collector: &mut Vec) { system.refresh_memory(); let mut pair = ServerInfoPair::default(); pair.set_key("capacity".to_string()); - pair.set_value((system.get_total_memory() * KIB).to_string()); + pair.set_value(SysQuota::memory_limit_in_bytes().to_string()); let mut item = ServerInfoItem::default(); item.set_tp("memory".to_string()); item.set_name("memory".to_string()); From dcbeb16f8cf5fb9e8811a9b030b75cd71710949f Mon Sep 17 
00:00:00 2001 From: lhy1024 Date: Mon, 8 Aug 2022 11:50:47 +0800 Subject: [PATCH 142/676] dr-auto-sync: judge whether `RocksWriteBatchVec` is empty (#13238) close tikv/tikv#13194 Signed-off-by: lhy1024 --- src/server/reset_to_version.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index 20bd65ac17a..e1faccd9b3f 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -134,8 +134,10 @@ impl ResetToVersionWorker { box_try!(wb.delete_cf(CF_WRITE, &key)); box_try!(wb.delete_cf(CF_DEFAULT, default_key.as_encoded())); } - wb.write().unwrap(); - wb.clear(); + if !wb.is_empty() { + wb.write().unwrap(); + wb.clear(); + } Ok(has_more) } @@ -165,7 +167,10 @@ impl ResetToVersionWorker { break; } } - wb.write().unwrap(); + if !wb.is_empty() { + wb.write().unwrap(); + wb.clear(); + } Ok(has_more) } } From 0b4231ac9ed76575d0fe4e3b6ba93efac4b50431 Mon Sep 17 00:00:00 2001 From: Ping Yu Date: Mon, 8 Aug 2022 12:14:46 +0800 Subject: [PATCH 143/676] causal_ts: Implement TSO batch list (#12970) close tikv/tikv#12794, ref tikv/tikv#12794 Implement TSO batch list to improve tolerance to TSO service fault. 
Signed-off-by: pingyu Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + Cargo.toml | 2 +- components/causal_ts/Cargo.toml | 6 +- components/causal_ts/src/config.rs | 33 +- components/causal_ts/src/lib.rs | 3 + components/causal_ts/src/metrics.rs | 61 +- components/causal_ts/src/observer.rs | 12 +- components/causal_ts/src/tso.rs | 753 ++++++++++++++++++------ components/server/src/server.rs | 2 + components/test_raftstore/src/server.rs | 2 + metrics/grafana/tikv_raw.json | 243 ++++++++ tests/integrations/config/mod.rs | 2 + 12 files changed, 933 insertions(+), 187 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 52b39154e91..f258fbdcf69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -733,6 +733,7 @@ dependencies = [ "parking_lot 0.12.0", "pd_client", "prometheus", + "prometheus-static-metric", "raft", "raftstore", "serde", diff --git a/Cargo.toml b/Cargo.toml index fd7af73bdf4..9bbea00262c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,7 +38,7 @@ cloud-azure = [ "encryption_export/cloud-azure", "sst_importer/cloud-azure", ] -testexport = ["raftstore/testexport", "api_version/testexport"] +testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport"] test-engine-kv-rocksdb = [ "engine_test/test-engine-kv-rocksdb" ] diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index 08027941f03..b1ad4ed449a 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" edition = "2018" publish = false +[features] +testexport = [] + [dependencies] api_version = { path = "../api_version", default-features = false } engine_rocks = { path = "../engine_rocks", default-features = false } @@ -17,6 +20,7 @@ log_wrappers = { path = "../log_wrappers" } parking_lot = "0.12" pd_client = { path = "../pd_client", default-features = false } prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" raft = { version = "0.7.0", default-features = 
false, features = ["protobuf-codec"] } raftstore = { path = "../raftstore", default-features = false } serde = "1.0" @@ -30,4 +34,4 @@ tokio = { version = "1", features = ["sync"] } txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] -test_raftstore = { path = "../test_raftstore", default-features = false } +test_raftstore = { path = "../test_raftstore" } diff --git a/components/causal_ts/src/config.rs b/components/causal_ts/src/config.rs index e75bff62d47..0b08fecc7d6 100644 --- a/components/causal_ts/src/config.rs +++ b/components/causal_ts/src/config.rs @@ -20,6 +20,21 @@ pub struct Config { /// 1K tso/s should be enough. Benchmark showed that with a 8.6w raw_put /// per second, the TSO requirement is 600 per second. pub renew_batch_min_size: u32, + /// The maximum renew batch size of BatchTsoProvider. + /// + /// Default is 8192. + /// PD provides 262144 TSO per 50ms for the whole cluster. Exceed this space + /// will cause PD to sleep for 50ms, waiting for physical update + /// interval. The 50ms limitation can not be broken through now (see + /// `tso-update-physical-interval`). + pub renew_batch_max_size: u32, + /// The available interval of BatchTsoProvider. + /// + /// Default is 3s. + /// The longer of the value can provide better "high-availability" against + /// PD failure, but more overhead of `TsoBatchList` & pressure to TSO + /// service. 
+ pub available_interval: ReadableDuration, } impl Config { @@ -28,7 +43,13 @@ impl Config { return Err("causal-ts.renew_interval can't be zero".into()); } if self.renew_batch_min_size == 0 { - return Err("causal-ts.renew_batch_init_size should be greater than 0".into()); + return Err("causal-ts.renew_batch_min_size should be greater than 0".into()); + } + if self.renew_batch_max_size == 0 { + return Err("causal-ts.renew_batch_max_size should be greater than 0".into()); + } + if self.available_interval.is_zero() { + return Err("causal-ts.available-interval can't be zero".into()); } Ok(()) } @@ -37,8 +58,14 @@ impl Config { impl Default for Config { fn default() -> Self { Self { - renew_interval: ReadableDuration::millis(crate::tso::TSO_BATCH_RENEW_INTERVAL_DEFAULT), - renew_batch_min_size: crate::tso::TSO_BATCH_MIN_SIZE_DEFAULT, + renew_interval: ReadableDuration::millis( + crate::tso::DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS, + ), + renew_batch_min_size: crate::tso::DEFAULT_TSO_BATCH_MIN_SIZE, + renew_batch_max_size: crate::tso::DEFAULT_TSO_BATCH_MAX_SIZE, + available_interval: ReadableDuration::millis( + crate::tso::DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS, + ), } } } diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 05626ce7203..9d77818d253 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -1,5 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(map_first_last)] // For `BTreeMap::pop_first`. +#![feature(div_duration)] + #[macro_use] extern crate tikv_util; diff --git a/components/causal_ts/src/metrics.rs b/components/causal_ts/src/metrics.rs index 072f7325dc0..52f352ccfe5 100644 --- a/components/causal_ts/src/metrics.rs +++ b/components/causal_ts/src/metrics.rs @@ -2,6 +2,7 @@ use lazy_static::*; use prometheus::*; +use prometheus_static_metric::*; lazy_static! 
{ pub static ref TS_PROVIDER_TSO_BATCH_SIZE: IntGauge = register_int_gauge!( @@ -20,7 +21,65 @@ lazy_static! { "tikv_causal_ts_provider_tso_batch_renew_duration_seconds", "Histogram of the duration of TSO batch renew", &["result", "reason"], - exponential_buckets(1e-6, 2.0, 20).unwrap() // 1us ~ 1s + exponential_buckets(1e-4, 2.0, 20).unwrap() // 0.1ms ~ 104s ) .unwrap(); + pub static ref TS_PROVIDER_TSO_BATCH_LIST_COUNTING: HistogramVec = register_histogram_vec!( + "tikv_causal_ts_provider_tso_batch_list_counting", + "Histogram of TSO batch list counting", + &["type"], + exponential_buckets(10.0, 2.0, 20).unwrap() // 10 ~ 10,000,000 + ) + .unwrap(); +} + +make_auto_flush_static_metric! { + pub label_enum TsoBatchRenewReason { + init, + background, + used_up, + flush, + } + + pub label_enum TsoBatchCountingKind { + tso_usage, + tso_remain, + new_batch_size, + } + + pub label_enum ResultKind { + ok, + err, + } + + pub struct TsProviderGetTsDurationVec: LocalHistogram { + "result" => ResultKind, + } + + pub struct TsoBatchRenewDurationVec: LocalHistogram { + "result" => ResultKind, + "reason" => TsoBatchRenewReason, + } + + pub struct TsoBatchListCountingVec: LocalHistogram { + "type" => TsoBatchCountingKind, + } +} + +impl From<&std::result::Result> for ResultKind { + #[inline] + fn from(res: &std::result::Result) -> Self { + if res.is_ok() { Self::ok } else { Self::err } + } +} + +lazy_static! 
{ + pub static ref TS_PROVIDER_GET_TS_DURATION_STATIC: TsProviderGetTsDurationVec = + auto_flush_from!(TS_PROVIDER_GET_TS_DURATION, TsProviderGetTsDurationVec); + pub static ref TS_PROVIDER_TSO_BATCH_RENEW_DURATION_STATIC: TsoBatchRenewDurationVec = auto_flush_from!( + TS_PROVIDER_TSO_BATCH_RENEW_DURATION, + TsoBatchRenewDurationVec + ); + pub static ref TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC: TsoBatchListCountingVec = + auto_flush_from!(TS_PROVIDER_TSO_BATCH_LIST_COUNTING, TsoBatchListCountingVec); } diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs index aeb04bfabf5..f648d8cba08 100644 --- a/components/causal_ts/src/observer.rs +++ b/components/causal_ts/src/observer.rs @@ -175,8 +175,16 @@ pub mod tests { fn init() -> CausalObserver, DummyRawTsTracker> { let pd_cli = Arc::new(TestPdClient::new(0, true)); pd_cli.set_tso(100.into()); - let causal_ts_provider = - Arc::new(block_on(BatchTsoProvider::new_opt(pd_cli, Duration::ZERO, 100)).unwrap()); + let causal_ts_provider = Arc::new( + block_on(BatchTsoProvider::new_opt( + pd_cli, + Duration::ZERO, + Duration::from_secs(3), + 100, + 8192, + )) + .unwrap(), + ); CausalObserver::new(causal_ts_provider, DummyRawTsTracker::default()) } diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index b6ee5d177e1..6eabf8bf351 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -1,9 +1,31 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! ## The algorithm to make the TSO cache tolerate failure of TSO service +//! +//! 1. The scale of High-Available is specified by config item +//! `causal-ts.available-interval`. +//! +//! 2. Count usage of TSO on every renew interval. +//! +//! 3. Calculate `cache_multiplier` by `causal-ts.available-interval / +//! causal-ts.renew-interval`. +//! +//! 4. Then `tso_usage x cache_multiplier` is the expected number of TSO should +//! be cached. +//! +//! 5. 
And `tso_usage x cache_multiplier - tso_remain` is the expected number of +//! TSO to be requested from TSO service (if it's not a flush). +//! +//! Others: +//! * `cache_multiplier` is also used as capacity of TSO batch list, as we +//! append an item to the list on every renew. + use std::{ + borrow::Borrow, + collections::BTreeMap, error, result, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicI32, AtomicU32, AtomicU64, Ordering}, Arc, }, }; @@ -28,24 +50,28 @@ use crate::{ CausalTsProvider, }; -// Renew on every 100ms, to adjust batch size rapidly enough. -pub(crate) const TSO_BATCH_RENEW_INTERVAL_DEFAULT: u64 = 100; -// Batch size on every renew interval. -// One TSO is required for every batch of Raft put messages, so by default 1K -// tso/s should be enough. Benchmark showed that with a 8.6w raw_put per second, -// the TSO requirement is 600 per second. -pub(crate) const TSO_BATCH_MIN_SIZE_DEFAULT: u32 = 100; -// Max batch size of TSO requests. Space of logical timestamp is 262144, -// exceed this space will cause PD to sleep, waiting for physical clock advance. -const TSO_BATCH_MAX_SIZE: u32 = 20_0000; - -const TSO_BATCH_RENEW_ON_INITIALIZE: &str = "init"; -const TSO_BATCH_RENEW_BY_BACKGROUND: &str = "background"; -const TSO_BATCH_RENEW_FOR_USED_UP: &str = "used-up"; -const TSO_BATCH_RENEW_FOR_FLUSH: &str = "flush"; +/// Renew on every 100ms, to adjust batch size rapidly enough. +pub(crate) const DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS: u64 = 100; +/// Minimal batch size of TSO requests. This is an empirical value. +pub(crate) const DEFAULT_TSO_BATCH_MIN_SIZE: u32 = 100; +/// Maximum batch size of TSO requests. +/// As PD provides 262144 TSO per 50ms, conservatively set to 1/16 of 262144. +/// Exceed this space will cause PD to sleep for 50ms, waiting for physical +/// update interval. The 50ms limitation can not be broken through now (see +/// `tso-update-physical-interval`). 
+pub(crate) const DEFAULT_TSO_BATCH_MAX_SIZE: u32 = 8192; +/// Maximum available interval of TSO cache. +/// It means the duration that TSO we cache would be available despite failure +/// of PD. The longer of the value can provide better "High-Availability" +/// against PD failure, but more overhead of `TsoBatchList` & pressure to TSO +/// service. +pub(crate) const DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS: u64 = 3000; +/// Just a limitation for safety, in case user specify a too big +/// `available_interval`. +const MAX_TSO_BATCH_LIST_CAPACITY: u32 = 1024; /// TSO range: [(physical, logical_start), (physical, logical_end)) -#[derive(Default, Debug)] +#[derive(Debug)] struct TsoBatch { size: u32, physical: u64, @@ -54,7 +80,7 @@ struct TsoBatch { } impl TsoBatch { - pub fn pop(&self) -> Option { + pub fn pop(&self) -> Option<(TimeStamp, bool /* is_used_up */)> { let mut logical = self.logical_start.load(Ordering::Relaxed); while logical < self.logical_end { match self.logical_start.compare_exchange_weak( @@ -63,7 +89,12 @@ impl TsoBatch { Ordering::Relaxed, Ordering::Relaxed, ) { - Ok(_) => return Some(TimeStamp::compose(self.physical, logical)), + Ok(_) => { + return Some(( + TimeStamp::compose(self.physical, logical), + logical + 1 == self.logical_end, + )); + } Err(x) => logical = x, } } @@ -71,48 +102,177 @@ impl TsoBatch { } // `last_ts` is the last timestamp of the new batch. 
- pub fn renew(&mut self, batch_size: u32, last_ts: TimeStamp) -> Result<()> { - let (physical, logical) = (last_ts.physical(), last_ts.logical() + 1); - let logical_start = logical.checked_sub(batch_size as u64).unwrap(); + pub fn new(batch_size: u32, last_ts: TimeStamp) -> Self { + let (physical, logical_end) = (last_ts.physical(), last_ts.logical() + 1); + let logical_start = logical_end.checked_sub(batch_size as u64).unwrap(); + + Self { + size: batch_size, + physical, + logical_end, + logical_start: AtomicU64::new(logical_start), + } + } + + /// Number of remaining (available) TSO in the batch. + pub fn remain(&self) -> u32 { + self.logical_end + .saturating_sub(self.logical_start.load(Ordering::Relaxed)) as u32 + } + + /// The original start timestamp in the batch. + pub fn original_start(&self) -> TimeStamp { + TimeStamp::compose(self.physical, self.logical_end - self.size as u64) + } + + /// The excluded end timestamp after the last in batch. + pub fn excluded_end(&self) -> TimeStamp { + TimeStamp::compose(self.physical, self.logical_end) + } +} + +/// `TsoBatchList` is a ordered list of `TsoBatch`. It aims to: +/// +/// 1. Cache more number of TSO to improve high availability. See issue #12794. +/// `TsoBatch` can only cache at most 262144 TSO as logical clock is 18 bits. +/// +/// 2. Fully utilize cached TSO when some regions require latest TSO (e.g. in +/// the scenario of leader transfer). Other regions without the requirement can +/// still use older TSO cache. +#[derive(Default, Debug)] +struct TsoBatchList { + inner: RwLock, + + /// Number of remaining (available) TSO. + /// Using signed integer for avoiding a wrap around huge value as it's not + /// precisely counted. + tso_remain: AtomicI32, + + /// Statistics of TSO usage. + tso_usage: AtomicU32, - if physical < self.physical - || (physical == self.physical && logical_start < self.logical_end) + /// Length of batch list. It is used to limit size for efficiency, and keep + /// batches fresh. 
+ capacity: u32, +} + +/// Inner data structure of batch list. +/// The reasons why `crossbeam_skiplist::SkipMap` is not chosen: +/// +/// 1. In `flush()` procedure, a reader of `SkipMap` can still acquire a batch +/// after the it is removed, which would violate the causality requirement. +/// The `RwLock` avoid this scenario by lock synchronization. +/// +/// 2. It is a scenario with much more reads than writes. The `RwLock` would not +/// be less efficient than lock free implementation. +type TsoBatchListInner = BTreeMap; + +impl TsoBatchList { + pub fn new(capacity: u32) -> Self { + Self { + capacity: std::cmp::min(capacity, MAX_TSO_BATCH_LIST_CAPACITY), + ..Default::default() + } + } + + pub fn remain(&self) -> u32 { + std::cmp::max(self.tso_remain.load(Ordering::Relaxed), 0) as u32 + } + + pub fn usage(&self) -> u32 { + self.tso_usage.load(Ordering::Relaxed) + } + + pub fn take_and_report_usage(&self) -> u32 { + let usage = self.tso_usage.swap(0, Ordering::Relaxed); + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .tso_usage + .observe(usage as f64); + usage + } + + // TODO: make it async + fn remove_batch(&self, key: u64) { + if let Some(batch) = self.inner.write().remove(&key) { + self.tso_remain + .fetch_sub(batch.remain() as i32, Ordering::Relaxed); + } + } + + /// Pop timestamp. + /// When `after_ts.is_some()`, it will pop timestamp larger that `after_ts`. + /// It is used for the scenario that some regions have causality + /// requirement (e.g. after transfer, the next timestamp of new leader + /// should be larger than the store where it is transferred from). + /// `after_ts` is included. 
+ pub fn pop(&self, after_ts: Option) -> Option { + let inner = self.inner.read(); + let range = match after_ts { + Some(after_ts) => inner.range(&after_ts.into_inner()..), + None => inner.range(..), + }; + for (key, batch) in range { + if let Some((ts, is_used_up)) = batch.pop() { + let key = *key; + drop(inner); + self.tso_usage.fetch_add(1, Ordering::Relaxed); + self.tso_remain.fetch_sub(1, Ordering::Relaxed); + if is_used_up { + // TODO: make it async + self.remove_batch(key); + } + return Some(ts); + } + } + None + } + + pub fn push(&self, batch_size: u32, last_ts: TimeStamp, need_flush: bool) -> Result { + let new_batch = TsoBatch::new(batch_size, last_ts); + + if let Some((_, last_batch)) = self.inner.read().iter().next_back() { + if new_batch.original_start() < last_batch.excluded_end() { + error!("timestamp fall back"; "batch_size" => batch_size, "last_ts" => ?last_ts, + "last_batch" => ?last_batch, "new_batch" => ?new_batch); + return Err(box_err!("timestamp fall back")); + } + } + + let key = new_batch.original_start().into_inner(); { - error!("timestamp fall back"; "last_ts" => ?last_ts, "batch" => ?self, - "physical" => physical, "logical" => logical, "logical_start" => logical_start); - return Err(box_err!("timestamp fall back")); + // Hold the write lock until new batch is inserted. + // Otherwise a `pop()` would acquire the lock, meet no TSO available, and invoke + // renew request. + let mut inner = self.inner.write(); + if need_flush { + self.flush_internal(&mut inner); + } + + inner.insert(key, new_batch); + self.tso_remain + .fetch_add(batch_size as i32, Ordering::Relaxed); } - self.size = batch_size; - self.physical = physical; - self.logical_end = logical; - self.logical_start.store(logical_start, Ordering::Relaxed); - Ok(()) + // remove items out of capacity limitation. 
+ // TODO: make it async + if self.inner.read().len() > self.capacity as usize { + if let Some((_, batch)) = self.inner.write().pop_first() { + self.tso_remain + .fetch_sub(batch.remain() as i32, Ordering::Relaxed); + } + } + + Ok(key) + } + + fn flush_internal(&self, inner: &mut TsoBatchListInner) { + inner.clear(); + self.tso_remain.store(0, Ordering::Relaxed); } - // Note: batch is "used up" in flush, and batch size will be enlarged in next - // renew. pub fn flush(&self) { - self.logical_start - .store(self.logical_end, Ordering::Relaxed); - } - - // Return None if TsoBatch is empty. - // Note that `logical_start` will be larger than `logical_end`. See `pop()`. - pub fn used_size(&self) -> Option { - if self.size > 0 { - Some( - self.size - .checked_sub( - self.logical_end - .saturating_sub(self.logical_start.load(Ordering::Relaxed)) - as u32, - ) - .unwrap(), - ) - } else { - None - } + let mut inner = self.inner.write(); + self.flush_internal(&mut inner); } } @@ -128,53 +288,92 @@ struct RenewRequest { sender: oneshot::Sender, } +#[derive(Clone, Copy, Debug)] +struct RenewParameter { + batch_min_size: u32, + batch_max_size: u32, + // `cache_multiplier` indicates that times on usage of TSO it should cache. + // It is also used as capacity of `TsoBatchList`. 
+ cache_multiplier: u32, +} + pub struct BatchTsoProvider { pd_client: Arc, - batch: Arc>, - batch_min_size: u32, + batch_list: Arc, causal_ts_worker: Worker, renew_interval: Duration, - renew_request_tx: mpsc::Sender, + renew_parameter: RenewParameter, + renew_request_tx: Sender, +} + +impl std::fmt::Debug for BatchTsoProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BatchTsoProvider") + .field("batch_list", &self.batch_list) + .field("renew_interval", &self.renew_interval) + .field("renew_parameter", &self.renew_parameter) + .finish() + } } impl BatchTsoProvider { pub async fn new(pd_client: Arc) -> Result { Self::new_opt( pd_client, - Duration::from_millis(TSO_BATCH_RENEW_INTERVAL_DEFAULT), - TSO_BATCH_MIN_SIZE_DEFAULT, + Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS), + Duration::from_millis(DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS), + DEFAULT_TSO_BATCH_MIN_SIZE, + DEFAULT_TSO_BATCH_MAX_SIZE, ) .await } + #[allow(unused_mut)] + fn calc_cache_multiplier(mut renew_interval: Duration, available_interval: Duration) -> u32 { + #[cfg(any(test, feature = "testexport"))] + if renew_interval.is_zero() { + // Should happen in test only. 
+ renew_interval = Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS); + } + available_interval.div_duration_f64(renew_interval).ceil() as u32 + } + pub async fn new_opt( pd_client: Arc, renew_interval: Duration, + available_interval: Duration, batch_min_size: u32, + batch_max_size: u32, ) -> Result { + let cache_multiplier = Self::calc_cache_multiplier(renew_interval, available_interval); + let renew_parameter = RenewParameter { + batch_min_size, + batch_max_size, + cache_multiplier, + }; let (renew_request_tx, renew_request_rx) = mpsc::channel(MAX_RENEW_BATCH_SIZE); let s = Self { pd_client: pd_client.clone(), - batch: Arc::new(RwLock::new(TsoBatch::default())), - batch_min_size, + batch_list: Arc::new(TsoBatchList::new(cache_multiplier)), causal_ts_worker: WorkerBuilder::new("causal_ts_batch_tso_worker").create(), renew_interval, + renew_parameter, renew_request_tx, }; s.init(renew_request_rx).await?; Ok(s) } - async fn renew_tso_batch(&self, need_flush: bool, reason: &str) -> Result<()> { + async fn renew_tso_batch(&self, need_flush: bool, reason: TsoBatchRenewReason) -> Result<()> { Self::renew_tso_batch_internal(self.renew_request_tx.clone(), need_flush, reason).await } async fn renew_tso_batch_internal( renew_request_tx: Sender, need_flush: bool, - reason: &str, + reason: TsoBatchRenewReason, ) -> Result<()> { - let start = Instant::now(); + let start = Instant::now_coarse(); let (request, response) = oneshot::channel(); renew_request_tx .send(RenewRequest { @@ -188,60 +387,70 @@ impl BatchTsoProvider { .map_err(|_| box_err!("renew response channel is dropped")) .and_then(|r| r.map_err(|err| Error::BatchRenew(err))); - let label = if res.is_ok() { "ok" } else { "err" }; - TS_PROVIDER_TSO_BATCH_RENEW_DURATION - .with_label_values(&[label, reason]) + TS_PROVIDER_TSO_BATCH_RENEW_DURATION_STATIC + .get(res.borrow().into()) + .get(reason) .observe(start.saturating_elapsed_secs()); res } async fn renew_tso_batch_impl( pd_client: Arc, - tso_batch: Arc>, - 
batch_min_size: u32, + tso_batch_list: Arc, + renew_parameter: RenewParameter, need_flush: bool, ) -> Result<()> { - let new_batch_size = { - let batch = tso_batch.read(); - match batch.used_size() { - None => batch_min_size, - Some(used_size) => { - debug!("CachedTsoProvider::renew_tso_batch"; "batch before" => ?batch, "need_flush" => need_flush, "used size" => used_size); - Self::calc_new_batch_size(batch.size, used_size, batch_min_size) - } - } - }; - - match pd_client.batch_get_tso(new_batch_size).await { + let tso_remain = tso_batch_list.remain(); + let new_batch_size = + Self::calc_new_batch_size(tso_batch_list.clone(), renew_parameter, need_flush); + + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .tso_remain + .observe(tso_remain as f64); + TS_PROVIDER_TSO_BATCH_LIST_COUNTING_STATIC + .new_batch_size + .observe(new_batch_size as f64); + + let res = match pd_client.batch_get_tso(new_batch_size).await { Err(err) => { - warn!("BatchTsoProvider::renew_tso_batch, pd_client.batch_get_tso error"; "error" => ?err, "need_flash" => need_flush); + warn!("BatchTsoProvider::renew_tso_batch, pd_client.batch_get_tso error"; + "new_batch_size" => new_batch_size, "error" => ?err, "need_flash" => need_flush); if need_flush { - let batch = tso_batch.write(); - batch.flush(); + tso_batch_list.flush(); } Err(err.into()) } Ok(ts) => { - { - let mut batch = tso_batch.write(); - batch.renew(new_batch_size, ts).map_err(|e| { + tso_batch_list + .push(new_batch_size, ts, need_flush) + .map_err(|e| { if need_flush { - batch.flush(); + tso_batch_list.flush(); } e })?; - debug!("BatchTsoProvider::renew_tso_batch"; "batch renew" => ?batch, "ts" => ?ts); - } - TS_PROVIDER_TSO_BATCH_SIZE.set(new_batch_size as i64); + debug!("BatchTsoProvider::renew_tso_batch"; + "tso_batch_list.remain" => tso_batch_list.remain(), "ts" => ?ts); + + // Should only be invoked after successful renew. Otherwise the TSO usage will + // be lost, and batch size requirement will be less than expected. 
Note that + // invoked here is not precise. There would be `get_ts()` before here after + // above `tso_batch_list.push()`, and make `tso_usage` a little bigger. This + // error is acceptable. + tso_batch_list.take_and_report_usage(); + Ok(()) } - } + }; + let total_batch_size = tso_batch_list.remain() + tso_batch_list.usage(); + TS_PROVIDER_TSO_BATCH_SIZE.set(total_batch_size as i64); + res } async fn renew_thread( pd_client: Arc, - tso_batch: Arc>, - batch_min_size: u32, + tso_batch_list: Arc, + renew_parameter: RenewParameter, mut rx: Receiver, ) { loop { @@ -270,8 +479,8 @@ impl BatchTsoProvider { let res = Self::renew_tso_batch_impl( pd_client.clone(), - tso_batch.clone(), - batch_min_size, + tso_batch_list.clone(), + renew_parameter, need_flush, ) .await @@ -286,28 +495,36 @@ impl BatchTsoProvider { } } - fn calc_new_batch_size(batch_size: u32, used_size: u32, batch_min_size: u32) -> u32 { - if used_size > batch_size * 3 / 4 { - // Enlarge to double if used more than 3/4. - std::cmp::min(batch_size << 1, TSO_BATCH_MAX_SIZE) - } else if used_size < batch_size / 4 { - // Shrink to half if used less than 1/4. - std::cmp::max(batch_size >> 1, batch_min_size) - } else { - batch_size + fn calc_new_batch_size( + tso_batch_list: Arc, + renew_parameter: RenewParameter, + need_flush: bool, + ) -> u32 { + // The expected number of TSO is `cache_multiplier` times on latest usage. + // Note: There is a `batch_max_size` limitation, so the request batch size will + // be less than expected, and will be fulfill in next renew. + // TODO: consider schedule TSO requests exceed `batch_max_size` limitation to + // fulfill requirement in time. 
+ let mut new_batch_size = tso_batch_list.usage() * renew_parameter.cache_multiplier; + if !need_flush { + new_batch_size = new_batch_size.saturating_sub(tso_batch_list.remain()) } + std::cmp::min( + std::cmp::max(new_batch_size, renew_parameter.batch_min_size), + renew_parameter.batch_max_size, + ) } async fn init(&self, renew_request_rx: Receiver) -> Result<()> { // Spawn renew thread. let pd_client = self.pd_client.clone(); - let tso_batch = self.batch.clone(); - let batch_min_size = self.batch_min_size; + let tso_batch_list = self.batch_list.clone(); + let renew_parameter = self.renew_parameter; self.causal_ts_worker.remote().spawn(async move { - Self::renew_thread(pd_client, tso_batch, batch_min_size, renew_request_rx).await; + Self::renew_thread(pd_client, tso_batch_list, renew_parameter, renew_request_rx).await; }); - self.renew_tso_batch(true, TSO_BATCH_RENEW_ON_INITIALIZE) + self.renew_tso_batch(true, TsoBatchRenewReason::init) .await?; let request_tx = self.renew_request_tx.clone(); @@ -317,7 +534,7 @@ impl BatchTsoProvider { let _ = Self::renew_tso_batch_internal( request_tx, false, - TSO_BATCH_RENEW_BY_BACKGROUND, + TsoBatchRenewReason::background, ) .await; } @@ -331,33 +548,38 @@ impl BatchTsoProvider { Ok(()) } - // Get current batch_size, for test purpose. - pub fn batch_size(&self) -> u32 { - self.batch.read().size + #[cfg(test)] + pub fn tso_remain(&self) -> u32 { + self.batch_list.remain() + } + + #[cfg(test)] + pub fn tso_usage(&self) -> u32 { + self.batch_list.usage() } } const GET_TS_MAX_RETRY: u32 = 3; impl CausalTsProvider for BatchTsoProvider { + // TODO: support `after_ts` argument. 
fn get_ts(&self) -> Result { let start = Instant::now(); let mut retries = 0; let mut last_batch_size: u32; loop { { - let batch = self.batch.read(); - last_batch_size = batch.size; - match batch.pop() { + last_batch_size = self.batch_list.remain() + self.batch_list.usage(); + match self.batch_list.pop(None) { Some(ts) => { trace!("BatchTsoProvider::get_ts: {:?}", ts); - TS_PROVIDER_GET_TS_DURATION - .with_label_values(&["ok"]) + TS_PROVIDER_GET_TS_DURATION_STATIC + .ok .observe(start.saturating_elapsed_secs()); return Ok(ts); } None => { - warn!("BatchTsoProvider::get_ts, batch used up"; "batch.size" => batch.size, "retries" => retries); + warn!("BatchTsoProvider::get_ts, batch used up"; "last_batch_size" => last_batch_size, "retries" => retries); } } } @@ -365,7 +587,7 @@ impl CausalTsProvider for BatchTsoProvider { if retries >= GET_TS_MAX_RETRY { break; } - if let Err(err) = block_on(self.renew_tso_batch(false, TSO_BATCH_RENEW_FOR_USED_UP)) { + if let Err(err) = block_on(self.renew_tso_batch(false, TsoBatchRenewReason::used_up)) { // `renew_tso_batch` failure is likely to be caused by TSO timeout, which would // mean that PD is quite busy. So do not retry any more. 
error!("BatchTsoProvider::get_ts, renew_tso_batch fail on batch used-up"; "err" => ?err); @@ -373,15 +595,16 @@ impl CausalTsProvider for BatchTsoProvider { } retries += 1; } - error!("BatchTsoProvider::get_ts, batch used up"; "batch.size" => last_batch_size, "retries" => retries); - TS_PROVIDER_GET_TS_DURATION - .with_label_values(&["err"]) + error!("BatchTsoProvider::get_ts, batch used up"; "last_batch_size" => last_batch_size, "retries" => retries); + TS_PROVIDER_GET_TS_DURATION_STATIC + .err .observe(start.saturating_elapsed_secs()); Err(Error::TsoBatchUsedUp(last_batch_size)) } + // TODO: provide asynchronous method fn flush(&self) -> Result<()> { - block_on(self.renew_tso_batch(true, TSO_BATCH_RENEW_FOR_FLUSH)) + block_on(self.renew_tso_batch(true, TsoBatchRenewReason::flush)) } } @@ -413,57 +636,209 @@ pub mod tests { #[test] fn test_tso_batch() { - let mut batch = TsoBatch::default(); + let batch = TsoBatch::new(10, TimeStamp::compose(1, 100)); - assert_eq!(batch.used_size(), None); - assert_eq!(batch.pop(), None); - batch.flush(); + assert_eq!(batch.original_start(), TimeStamp::compose(1, 91)); + assert_eq!(batch.excluded_end(), TimeStamp::compose(1, 101)); + assert_eq!(batch.remain(), 10); - batch.renew(10, TimeStamp::compose(1, 100)).unwrap(); - for logical in 91..=95 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); + for logical in 91..=93 { + assert_eq!(batch.pop(), Some((TimeStamp::compose(1, logical), false))); } - assert_eq!(batch.used_size(), Some(5)); + assert_eq!(batch.remain(), 7); - for logical in 96..=100 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); + for logical in 94..=99 { + assert_eq!(batch.pop(), Some((TimeStamp::compose(1, logical), false))); } - assert_eq!(batch.used_size(), Some(10)); - assert_eq!(batch.pop(), None); + assert_eq!(batch.remain(), 1); - batch.renew(10, TimeStamp::compose(1, 110)).unwrap(); - // timestamp fall back - batch.renew(10, TimeStamp::compose(1, 119)).unwrap_err(); - - 
batch.renew(10, TimeStamp::compose(1, 200)).unwrap(); - for logical in 191..=195 { - assert_eq!(batch.pop(), Some(TimeStamp::compose(1, logical))); - } - batch.flush(); - assert_eq!(batch.used_size(), Some(10)); + assert_eq!(batch.pop(), Some((TimeStamp::compose(1, 100), true))); assert_eq!(batch.pop(), None); + assert_eq!(batch.remain(), 0); } #[test] fn test_cals_new_batch_size() { + let cache_multiplier = 30; let cases = vec![ - (100, 0, 100), - (100, 76, 200), - (200, 49, 100), - (200, 50, 200), - (200, 150, 200), - (200, 151, 400), - (200, 200, 400), - (TSO_BATCH_MAX_SIZE, TSO_BATCH_MAX_SIZE, TSO_BATCH_MAX_SIZE), + (0, 0, true, 100), + (50, 0, true, 100), + (1000, 100, true, 3000), + ( + 1000, + DEFAULT_TSO_BATCH_MAX_SIZE, + true, + DEFAULT_TSO_BATCH_MAX_SIZE, + ), + (0, 0, false, 100), + (1000, 0, false, 100), + (1000, 100, false, 2000), + (5000, 100, false, 100), + ( + 1000, + DEFAULT_TSO_BATCH_MAX_SIZE, + false, + DEFAULT_TSO_BATCH_MAX_SIZE, + ), ]; - for (i, (batch_size, used_size, expected)) in cases.into_iter().enumerate() { - let new_size = - BatchTsoProvider::::calc_new_batch_size(batch_size, used_size, 100); + for (i, (remain, usage, need_flush, expected)) in cases.into_iter().enumerate() { + let batch_list = Arc::new(TsoBatchList { + inner: Default::default(), + tso_remain: AtomicI32::new(remain as i32), + tso_usage: AtomicU32::new(usage), + capacity: cache_multiplier, + }); + let renew_parameter = RenewParameter { + batch_min_size: DEFAULT_TSO_BATCH_MIN_SIZE, + batch_max_size: DEFAULT_TSO_BATCH_MAX_SIZE, + cache_multiplier, + }; + let new_size = BatchTsoProvider::::calc_new_batch_size( + batch_list, + renew_parameter, + need_flush, + ); assert_eq!(new_size, expected, "case {}", i); } } + #[test] + fn test_tso_batch_list_basic() { + let batch_list = TsoBatchList::new(10); + + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 0); + assert_eq!(batch_list.pop(None), None); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) 
+ .unwrap(); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 0); + + for logical in 91..=94 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 6); + assert_eq!(batch_list.usage(), 4); + + for logical in 95..=100 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + assert_eq!(batch_list.pop(None), None); + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + + batch_list + .push(10, TimeStamp::compose(1, 110), false) + .unwrap(); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 10); + // timestamp fall back + batch_list + .push(10, TimeStamp::compose(1, 119), false) + .unwrap_err(); + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + assert_eq!(batch_list.remain(), 20); + assert_eq!(batch_list.usage(), 10); + + for logical in 101..=110 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + for logical in 191..=195 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 5); + assert_eq!(batch_list.usage(), 25); + + batch_list.flush(); + assert_eq!(batch_list.pop(None), None); + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.take_and_report_usage(), 25); + assert_eq!(batch_list.usage(), 0); + + // need_flush + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + let key391 = batch_list + .push(10, TimeStamp::compose(1, 400), true) + .unwrap(); + assert_eq!(key391, TimeStamp::compose(1, 391).into_inner()); + assert_eq!(batch_list.remain(), 10); + assert_eq!(batch_list.usage(), 0); + + for logical in 391..=400 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 0); + assert_eq!(batch_list.usage(), 10); + } + + #[test] + fn 
test_tso_batch_list_max_batch_count() { + let batch_list = TsoBatchList::new(3); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) + .unwrap(); // will be remove after the 4th push. + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 400), false) + .unwrap(); + + for logical in 191..=195 { + assert_eq!(batch_list.pop(None), Some(TimeStamp::compose(1, logical))); + } + assert_eq!(batch_list.remain(), 25); + assert_eq!(batch_list.usage(), 5); + } + + #[test] + fn test_tso_batch_list_pop_after_ts() { + let batch_list = TsoBatchList::new(10); + + batch_list + .push(10, TimeStamp::compose(1, 100), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 200), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 300), false) + .unwrap(); + batch_list + .push(10, TimeStamp::compose(1, 400), false) + .unwrap(); + + let after_ts = TimeStamp::compose(1, 291); + for logical in 291..=300 { + assert_eq!( + batch_list.pop(Some(after_ts)), + Some(TimeStamp::compose(1, logical)) + ); + } + for logical in 391..=400 { + assert_eq!( + batch_list.pop(Some(after_ts)), + Some(TimeStamp::compose(1, logical)) + ); + } + assert_eq!(batch_list.pop(Some(after_ts)), None); + assert_eq!(batch_list.remain(), 20); + assert_eq!(batch_list.usage(), 20); + } + #[test] fn test_simple_tso_provider() { let pd_cli = Arc::new(TestPdClient::new(1, false)); @@ -485,44 +860,62 @@ pub mod tests { let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 100, + 80000, )) .unwrap(); - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.tso_remain(), 100); + assert_eq!(provider.tso_usage(), 0); + for ts in 1001..=1010u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + assert_eq!(provider.tso_remain(), 90); + 
assert_eq!(provider.tso_usage(), 10); provider.flush().unwrap(); // allocated: [1101, 1200] - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.tso_remain(), 100); + assert_eq!(provider.tso_usage(), 0); // used up pd_cli.trigger_tso_failure(); // make renew fail to verify used-up for ts in 1101..=1200u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 100); provider.get_ts().unwrap_err(); + assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 100); - provider.flush().unwrap(); // allocated: [1201, 1400] - assert_eq!(provider.batch_size(), 200); - - // used < 20% - for ts in 1201..=1249u64 { + provider.flush().unwrap(); // allocated: [1201, 2200] + for ts in 1201..=1260u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + assert_eq!(provider.tso_remain(), 940); + assert_eq!(provider.tso_usage(), 60); - provider.flush().unwrap(); // allocated: [1401, 1500] - assert_eq!(provider.batch_size(), 100); + // allocated: [2201, 2300] + block_on(provider.renew_tso_batch(false, TsoBatchRenewReason::background)).unwrap(); + assert_eq!(provider.tso_remain(), 1040); // 940 + 100 + assert_eq!(provider.tso_usage(), 0); pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1401..=1500u64 { + for ts in 1261..=2300u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } provider.get_ts().unwrap_err(); + assert_eq!(provider.tso_remain(), 0); + assert_eq!(provider.tso_usage(), 1040); // renew on used-up - for ts in 1501..=2500u64 { + for ts in 2301..=100_000u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } + // batch size: 10400, 80000, 80000 + // batch boundary: 2301, 12700, 92700, 100_000 + assert_eq!(provider.tso_remain(), 72700); + assert_eq!(provider.tso_usage(), 7300); } #[test] @@ -532,14 +925,14 @@ pub mod tests { { pd_cli.trigger_tso_failure(); - assert!( - 
block_on(BatchTsoProvider::new_opt( - pd_cli.clone(), - Duration::ZERO, - 100 - )) - .is_err() - ); + block_on(BatchTsoProvider::new_opt( + pd_cli.clone(), + Duration::ZERO, + Duration::from_secs(3), + 100, + 8192, + )) + .unwrap_err(); } // Set `renew_interval` to 0 to disable background renew. Invoke `flush()` to @@ -547,10 +940,12 @@ pub mod tests { let provider = block_on(BatchTsoProvider::new_opt( pd_cli.clone(), Duration::ZERO, + Duration::from_secs(1), // cache_multiplier=10 100, + 8192, )) .unwrap(); - assert_eq!(provider.batch_size(), 100); + assert_eq!(provider.tso_remain(), 100); for ts in 1001..=1010u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } @@ -572,9 +967,9 @@ pub mod tests { pd_cli.trigger_tso_failure(); provider.flush().unwrap_err(); - provider.flush().unwrap(); // allocated: [1301, 1700] + provider.flush().unwrap(); // allocated: [1301, 3300] pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1301..=1700u64 { + for ts in 1301..=3300u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } provider.get_ts().unwrap_err(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 4a4cadeb639..fd079764027 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -322,7 +322,9 @@ where let tso = block_on(causal_ts::BatchTsoProvider::new_opt( pd_client.clone(), config.causal_ts.renew_interval.0, + config.causal_ts.available_interval.0, config.causal_ts.renew_batch_min_size, + config.causal_ts.renew_batch_max_size, )); if let Err(e) = tso { fatal!("Causal timestamp provider initialize failed: {:?}", e); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index da81606d2dd..f69ef253e5b 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -364,7 +364,9 @@ impl ServerCluster { block_on(causal_ts::BatchTsoProvider::new_opt( self.pd_client.clone(), 
cfg.causal_ts.renew_interval.0, + cfg.causal_ts.available_interval.0, cfg.causal_ts.renew_batch_min_size, + cfg.causal_ts.renew_batch_max_size, )) .unwrap(), ); diff --git a/metrics/grafana/tikv_raw.json b/metrics/grafana/tikv_raw.json index f81ac801173..6664dad2734 100644 --- a/metrics/grafana/tikv_raw.json +++ b/metrics/grafana/tikv_raw.json @@ -464,6 +464,26 @@ "legendFormat": "{{result}}-P99", "refId": "A", "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_causal_ts_provider_get_ts_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, result))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{result}}-P999", + "refId": "B", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(1, sum(rate(tikv_causal_ts_provider_get_ts_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, result))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{result}}-MAX", + "refId": "C", + "step": 10 } ], "thresholds": [], @@ -720,6 +740,229 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The TSO batch list counting", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_causal_ts_provider_tso_batch_list_counting_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}-P99", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.5, sum(rate(tikv_causal_ts_provider_tso_batch_list_counting_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}-P50", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TSO batch list counting", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "TSO batch list counting frequency", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 65, + "legend": 
{ + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 300, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_causal_ts_provider_tso_batch_list_counting_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{result}}", + "metric": "tikv_causal_ts_provider_tso_batch_list_counting_count", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TSO batch list counting frequency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index b8899a1de4f..247b06834b0 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -794,6 +794,8 @@ fn test_serde_custom_tikv_config() { value.causal_ts = CausalTsConfig { renew_interval: ReadableDuration::millis(100), renew_batch_min_size: 100, + 
renew_batch_max_size: 8192, + available_interval: ReadableDuration::millis(3000), }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); From 2d2f6d50477d70d210f95e7f53eeb6aa173ded8f Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 9 Aug 2022 14:10:47 +0800 Subject: [PATCH 144/676] file_system: detect procfs accessibility before using it (#13117) close tikv/tikv#13116 None Signed-off-by: tabokie --- components/file_system/src/io_stats/proc.rs | 72 +++++++++------------ 1 file changed, 31 insertions(+), 41 deletions(-) diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index ceb772bee6e..60c8cac9c36 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -13,10 +13,7 @@ use crossbeam_utils::CachePadded; use parking_lot::Mutex; use strum::EnumCount; use thread_local::ThreadLocal; -use tikv_util::{ - sys::thread::{self, Pid}, - warn, -}; +use tikv_util::sys::thread::{self, Pid}; use crate::{IoBytes, IoType}; @@ -50,54 +47,44 @@ impl ThreadId { } } - fn fetch_io_bytes(&mut self) -> Option { + fn fetch_io_bytes(&mut self) -> Result { if self.proc_reader.is_none() { let path = PathBuf::from("/proc") .join(format!("{}", self.pid)) .join("task") .join(format!("{}", self.tid)) .join("io"); - match File::open(path) { - Ok(file) => { - self.proc_reader = Some(BufReader::new(file)); - } - Err(e) => { - warn!("failed to open proc file: {}", e); - } - } + self.proc_reader = Some(BufReader::new( + File::open(path).map_err(|e| format!("open: {}", e))?, + )); } - if let Some(ref mut reader) = self.proc_reader { - reader - .seek(std::io::SeekFrom::Start(0)) - .map_err(|e| { - warn!("failed to seek proc file: {}", e); - }) - .ok()?; - let mut io_bytes = IoBytes::default(); - for line in reader.lines() { - let line = line - .map_err(|e| { - // ESRCH 3 No such process - if e.raw_os_error() != Some(3) { - warn!("failed to read proc file: {}", e); - } - 
}) - .ok()?; - if line.len() > 11 { - let mut s = line.split_whitespace(); - if let (Some(field), Some(value)) = (s.next(), s.next()) { - if field.starts_with("read_bytes") { - io_bytes.read = u64::from_str(value).ok()?; - } else if field.starts_with("write_bytes") { - io_bytes.write = u64::from_str(value).ok()?; + let reader = self.proc_reader.as_mut().unwrap(); + reader + .seek(std::io::SeekFrom::Start(0)) + .map_err(|e| format!("seek: {}", e))?; + let mut io_bytes = IoBytes::default(); + for line in reader.lines() { + match line { + Ok(line) => { + if line.len() > 11 { + let mut s = line.split_whitespace(); + if let (Some(field), Some(value)) = (s.next(), s.next()) { + if field.starts_with("read_bytes") { + io_bytes.read = u64::from_str(value) + .map_err(|e| format!("parse read_bytes: {}", e))?; + } else if field.starts_with("write_bytes") { + io_bytes.write = u64::from_str(value) + .map_err(|e| format!("parse write_bytes: {}", e))?; + } } } } + // ESRCH 3 No such process + Err(e) if e.raw_os_error() == Some(3) => break, + Err(e) => return Err(format!("read: {}", e)), } - Some(io_bytes) - } else { - None } + Ok(io_bytes) } } @@ -140,7 +127,7 @@ impl AtomicIoBytes { /// Flushes the local I/O stats to global I/O stats. 
#[inline] fn flush_thread_io(sentinel: &mut LocalIoStats) { - if let Some(io_bytes) = sentinel.id.fetch_io_bytes() { + if let Ok(io_bytes) = sentinel.id.fetch_io_bytes() { GLOBAL_IO_STATS[sentinel.io_type as usize] .fetch_add(io_bytes - sentinel.last_flushed, Ordering::Relaxed); sentinel.last_flushed = io_bytes; @@ -148,6 +135,9 @@ fn flush_thread_io(sentinel: &mut LocalIoStats) { } pub fn init() -> Result<(), String> { + ThreadId::current() + .fetch_io_bytes() + .map_err(|e| format!("failed to fetch I/O bytes from proc: {}", e))?; Ok(()) } From 3d521a08f0be88a43062fbbe3c15e784c939942f Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 10 Aug 2022 15:18:49 +0800 Subject: [PATCH 145/676] raftstore-v2: add module docs and remove inappropriate module dependencies (#13241) close tikv/tikv#13050 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 -- components/raftstore-v2/Cargo.toml | 2 -- .../src/operation/read/{read.rs => local.rs} | 15 ++++----------- .../raftstore-v2/src/operation/read/mod.rs | 8 +++++++- components/raftstore/src/store/worker/read.rs | 16 ++++++++++------ 5 files changed, 21 insertions(+), 22 deletions(-) rename components/raftstore-v2/src/operation/read/{read.rs => local.rs} (94%) diff --git a/Cargo.lock b/Cargo.lock index f258fbdcf69..1537c75bbff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4184,7 +4184,6 @@ dependencies = [ "batch-system", "collections", "crossbeam", - "engine_rocks", "engine_test", "engine_traits", "error_code", @@ -4203,7 +4202,6 @@ dependencies = [ "tempfile", "test_pd", "test_util", - "tikv_kv", "tikv_util", "time", "txn_types", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index f6a827d7424..8551864a444 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -32,7 +32,6 @@ cloud-azure = ["raftstore/cloud-azure"] batch-system = { path = "../batch-system", default-features = false } 
collections = { path = "../collections" } crossbeam = "0.8" -engine_rocks = { path = "../engine_rocks", default-features = false } engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } fail = "0.5" @@ -46,7 +45,6 @@ raft-proto = { version = "0.7.0" } raftstore = { path = "../raftstore" } slog = "2.3" smallvec = "1.4" -tikv_kv = { path = "../tikv_kv", default-features = false } tikv_util = { path = "../tikv_util", default-features = false } time = "0.1" txn_types = { path = "../txn_types", default-features = false } diff --git a/components/raftstore-v2/src/operation/read/read.rs b/components/raftstore-v2/src/operation/read/local.rs similarity index 94% rename from components/raftstore-v2/src/operation/read/read.rs rename to components/raftstore-v2/src/operation/read/local.rs index bc3903e12fd..56a5f01a7fd 100644 --- a/components/raftstore-v2/src/operation/read/read.rs +++ b/components/raftstore-v2/src/operation/read/local.rs @@ -161,7 +161,6 @@ mod tests { RaftCommand, }; use tempfile::{Builder, TempDir}; - use tikv_kv::Snapshot; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; use time::Duration; use txn_types::{Key, Lock, LockType, WriteBatchFlags}; @@ -213,7 +212,7 @@ mod tests { tablet1 = factory .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) .unwrap(); - tablet1.put_cf(CF_DEFAULT, b"a1", b"val1").unwrap(); + tablet1.put(b"a1", b"val1").unwrap(); let cache = CachedTablet::new(Some(tablet1.clone())); meta.tablet_caches.insert(1, cache); @@ -226,7 +225,7 @@ mod tests { tablet2 = factory .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) .unwrap(); - tablet2.put_cf(CF_DEFAULT, b"a2", b"val2").unwrap(); + tablet2.put(b"a2", b"val2").unwrap(); let cache = CachedTablet::new(Some(tablet2.clone())); meta.tablet_caches.insert(2, cache); } @@ -238,10 +237,7 @@ mod tests { let snapshot = delegate.get_snapshot(None, &mut None); assert_eq!( b"val1".to_vec(), - snapshot - 
.get(&Key::from_encoded(b"a1".to_vec())) - .unwrap() - .unwrap() + *snapshot.get_value(b"a1").unwrap().unwrap() ); let (_, delegate) = store_meta.get_executor_and_len(2); @@ -251,10 +247,7 @@ mod tests { let snapshot = delegate.get_snapshot(None, &mut None); assert_eq!( b"val2".to_vec(), - snapshot - .get(&Key::from_encoded(b"a2".to_vec())) - .unwrap() - .unwrap() + *snapshot.get_value(b"a2").unwrap().unwrap() ); } } diff --git a/components/raftstore-v2/src/operation/read/mod.rs b/components/raftstore-v2/src/operation/read/mod.rs index 8c427378da3..efbe6af1a5a 100644 --- a/components/raftstore-v2/src/operation/read/mod.rs +++ b/components/raftstore-v2/src/operation/read/mod.rs @@ -1,3 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -mod read; +//! There are two types of read: +//! - If the ReadDelegate is in the leader lease status, the read is operated +//! locally and need not to go through the raft layer (namely local read). +//! - Otherwise, redirect the request to the raftstore and proposed as a +//! RaftCommand in the raft layer. 
+ +mod local; diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index f3d52be5044..de1cb1011ae 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -1069,7 +1069,7 @@ mod tests { use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; - use engine_traits::{Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; + use engine_traits::{Peekable, SyncMutable, ALL_CFS}; use kvproto::raft_cmdpb::*; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; @@ -1556,7 +1556,7 @@ mod tests { .unwrap(); let kv_engine = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); - kv_engine.put_cf(CF_DEFAULT, b"a1", b"val1").unwrap(); + kv_engine.put(b"a1", b"val1").unwrap(); let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(0))), kv_engine.clone()); @@ -1589,16 +1589,20 @@ mod tests { let tablet = delegate.get_tablet(); assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); let snapshot = delegate.get_snapshot(read_id_copy.clone(), &mut read_context); - let val = snapshot.get_value(b"a1").unwrap().unwrap(); - assert_eq!(b"val1", val.deref()); + assert_eq!( + b"val1".to_vec(), + *snapshot.get_value(b"a1").unwrap().unwrap() + ); let (_, delegate) = store_meta.get_executor_and_len(2); let mut delegate = delegate.unwrap(); let tablet = delegate.get_tablet(); assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); let snapshot = delegate.get_snapshot(read_id_copy, &mut read_context); - let val = snapshot.get_value(b"a1").unwrap().unwrap(); - assert_eq!(b"val1", val.deref()); + assert_eq!( + b"val1".to_vec(), + *snapshot.get_value(b"a1").unwrap().unwrap() + ); assert!(snap_cache.as_ref().is_some()); assert_eq!(read_metrics.local_executed_requests, 2); From 6b3ca45c8e828d5ef8b36492791237ea9d62cfca Mon Sep 17 00:00:00 2001 From: 
Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 11 Aug 2022 12:00:48 +0800 Subject: [PATCH 146/676] server: make EnginesResourcesInfo be compatible with Multi-Rocks DB version. (#13206) close tikv/tikv#13214 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/server/src/server.rs | 65 +++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 12 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index fd079764027..e925a663943 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -13,6 +13,7 @@ use std::{ cmp, + collections::HashMap, convert::TryFrom, env, fmt, net::SocketAddr, @@ -44,7 +45,7 @@ use engine_rocks::{ use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, - TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, + TabletAccessor, TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -97,8 +98,8 @@ use tikv::{ service::{DebugService, DiagnosticsService}, status_server::StatusServer, ttl::TtlChecker, - KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, - GRPC_THREAD_PREFIX, + KvEngineFactory, KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, + DEFAULT_CLUSTER_ID, GRPC_THREAD_PREFIX, }, storage::{ self, @@ -1685,13 +1686,15 @@ impl TikvServer { self.config.storage.block_cache.shared, )), ); - self.tablet_factory = Some(factory); + self.tablet_factory = Some(factory.clone()); engines .raft .register_config(cfg_controller, self.config.storage.block_cache.shared); let engines_info = Arc::new(EnginesResourceInfo::new( - &engines, 180, // max_samples_to_preserve + factory, + engines.raft.as_rocks_engine().cloned(), + 180, // max_samples_to_preserve )); (engines, engines_info) @@ -1841,8 +1844,13 @@ impl 
EngineMetricsManager { } pub struct EnginesResourceInfo { - kv_engine: RocksEngine, + tablet_factory: Arc, raft_engine: Option, + // region_id -> (suffix, tablet) + // `update` is called perodically which needs this map for recording the latest tablet for each + // region and cached_latest_tablets is used to avoid memory allocation each time when + // calling `update`. + cached_latest_tablets: Arc>>, latest_normalized_pending_bytes: AtomicU32, normalized_pending_bytes_collector: MovingAvgU32, } @@ -1850,14 +1858,15 @@ pub struct EnginesResourceInfo { impl EnginesResourceInfo { const SCALE_FACTOR: u64 = 100; - fn new( - engines: &Engines, + fn new( + tablet_factory: Arc, + raft_engine: Option, max_samples_to_preserve: usize, ) -> Self { - let raft_engine = engines.raft.as_rocks_engine().cloned(); EnginesResourceInfo { - kv_engine: engines.kv.clone(), + tablet_factory, raft_engine, + cached_latest_tablets: Arc::default(), latest_normalized_pending_bytes: AtomicU32::new(0), normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), } @@ -1884,9 +1893,41 @@ impl EnginesResourceInfo { if let Some(raft_engine) = &self.raft_engine { fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); } - for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { - fetch_engine_cf(&self.kv_engine, cf, &mut normalized_pending_bytes); + + let mut cached_latest_tablets = self.cached_latest_tablets.as_ref().lock().unwrap(); + + self.tablet_factory + .for_each_opened_tablet( + &mut |id, suffix, db: &RocksEngine| match cached_latest_tablets.entry(id) { + collections::HashMapEntry::Occupied(mut slot) => { + if slot.get().0 < suffix { + slot.insert((suffix, db.clone())); + } + } + collections::HashMapEntry::Vacant(slot) => { + slot.insert((suffix, db.clone())); + } + }, + ); + + // todo(SpadeA): Now, there's a potential race condition problem where the + // tablet could be destroyed after the clone and before the fetching + // which could result in programme panic. 
It's okay now as the single global + // kv_engine will not be destroyed in normal operation and v2 is not + // ready for operation. Furthermore, this race condition is general to v2 as + // tablet clone is not a case exclusively happened here. We should + // propose another PR to tackle it such as destory tablet lazily in a GC + // thread. + + for (_, (_, tablet)) in cached_latest_tablets.iter() { + for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { + fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); + } } + + // Clear ensures that these tablets are not hold forever. + cached_latest_tablets.clear(); + let (_, avg) = self .normalized_pending_bytes_collector .add(normalized_pending_bytes); From 1ec844528144ddb900a39960e30207624df40571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 11 Aug 2022 16:50:49 +0800 Subject: [PATCH 147/676] log-backup: add timeout for operations that may stuck (#13255) close tikv/tikv#13251 Added a timeout of `30s` for every etcd gRPC request. Moved `on_update_global_checkpoint` to background. 
Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- Cargo.lock | 3 +- components/backup-stream/Cargo.toml | 4 +- components/backup-stream/src/endpoint.rs | 43 +++++++++++++++---- .../src/metadata/store/lazy_etcd.rs | 8 +++- components/backup-stream/tests/mod.rs | 34 ++++++++++++++- 5 files changed, 78 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1537c75bbff..41a5df4c1ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1620,8 +1620,7 @@ dependencies = [ [[package]] name = "etcd-client" version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76b9f5b0b4f53cf836bef05b22cd5239479700bc8d44a04c3c77f1ba6c2c73e9" +source = "git+https://github.com/yujuncen/etcd-client?rev=e0321a1990ee561cf042973666c0db61c8d82364#e0321a1990ee561cf042973666c0db61c8d82364" dependencies = [ "http", "prost", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index e2b23ccf5db..7fe221842ce 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -29,7 +29,9 @@ dashmap = "5" engine_rocks = { path = "../engine_rocks", default-features = false } engine_traits = { path = "../engine_traits", default-features = false } error_code = { path = "../error_code" } -etcd-client = { version = "0.7", features = ["pub-response-field", "tls"] } +# We cannot update the etcd-client to latest version because of the cyclic requirement. +# Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. 
+etcd-client = { git = "https://github.com/yujuncen/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } external_storage = { path = "../external_storage", default-features = false } external_storage_export = { path = "../external_storage/export", default-features = false } fail = { version = "0.5", optional = true } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index ff1e2a4e66c..281bf2e77f6 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -66,6 +66,10 @@ const SLOW_EVENT_THRESHOLD: f64 = 120.0; /// CHECKPOINT_SAFEPOINT_TTL_IF_ERROR specifies the safe point TTL(24 hour) if /// task has fatal error. const CHECKPOINT_SAFEPOINT_TTL_IF_ERROR: u64 = 24; +/// The timeout for tick updating the checkpoint. +/// Generally, it would take ~100ms. +/// 5s would be enough for it. +const TICK_UPDATE_TIMEOUT: Duration = Duration::from_secs(5); pub struct Endpoint { // Note: those fields are more like a shared context between components. @@ -810,19 +814,29 @@ where })); } - fn on_update_global_checkpoint(&self, task: String) { - self.pool.block_on(async move { - let ts = self.meta_client.global_progress_of_task(&task).await; + fn update_global_checkpoint(&self, task: String) -> future![()] { + let meta_client = self.meta_client.clone(); + let router = self.range_router.clone(); + let store_id = self.store_id; + async move { + #[cfg(feature = "failpoints")] + { + // fail-rs doesn't support async code blocks now. 
+ // let's borrow the feature name and do it ourselves :3 + if std::env::var("LOG_BACKUP_UGC_SLEEP_AND_RETURN").is_ok() { + tokio::time::sleep(Duration::from_secs(100)).await; + return; + } + } + let ts = meta_client.global_progress_of_task(&task).await; match ts { Ok(global_checkpoint) => { - let r = self - .range_router - .update_global_checkpoint(&task, global_checkpoint, self.store_id) + let r = router + .update_global_checkpoint(&task, global_checkpoint, store_id) .await; match r { Ok(true) => { - if let Err(err) = self - .meta_client + if let Err(err) = meta_client .set_storage_checkpoint(&task, global_checkpoint) .await { @@ -854,7 +868,18 @@ where ); } } - }); + } + } + + fn on_update_global_checkpoint(&self, task: String) { + let _guard = self.pool.handle().enter(); + let result = self.pool.block_on(tokio::time::timeout( + TICK_UPDATE_TIMEOUT, + self.update_global_checkpoint(task), + )); + if let Err(err) = result { + warn!("log backup update global checkpoint timed out"; "err" => %err) + } } /// Modify observe over some region. 
diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 97573ab756e..8cd6b87ec71 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -10,6 +10,8 @@ use tokio::sync::OnceCell; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; use crate::errors::{ContextualResultExt, Result}; +const RPC_TIMEOUT: Duration = Duration::from_secs(30); + #[derive(Clone)] pub struct LazyEtcdClient(Arc); @@ -26,7 +28,11 @@ impl ConnectionConfig { if let Some(tls) = &self.tls { opts = opts.with_tls(tls.clone()) } - opts = opts.with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout); + opts = opts + .with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout) + .with_timeout(RPC_TIMEOUT) + .keep_alive_while_idle(false); + opts } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 671952dc40d..f838e96ddbf 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -598,7 +598,7 @@ fn run_async_test(test: impl Future) -> T { #[cfg(test)] mod test { - use std::time::Duration; + use std::time::{Duration, Instant}; use backup_stream::{ errors::Error, metadata::MetadataClient, router::TaskSelector, GetCheckpointResult, @@ -874,6 +874,38 @@ mod test { ); } + #[test] + fn upload_checkpoint_exits_in_time() { + defer! 
{{ + std::env::remove_var("LOG_BACKUP_UGC_SLEEP_AND_RETURN"); + }} + let suite = SuiteBuilder::new_named("upload_checkpoint_exits_in_time") + .nodes(1) + .build(); + std::env::set_var("LOG_BACKUP_UGC_SLEEP_AND_RETURN", "meow"); + let (_, victim) = suite.endpoints.iter().next().unwrap(); + let sched = victim.scheduler(); + sched + .schedule(Task::UpdateGlobalCheckpoint("greenwoods".to_owned())) + .unwrap(); + let start = Instant::now(); + let (tx, rx) = tokio::sync::oneshot::channel(); + sched + .schedule(Task::Sync( + Box::new(move || { + tx.send(Instant::now()).unwrap(); + }), + Box::new(|_| true), + )) + .unwrap(); + let end = run_async_test(rx).unwrap(); + assert!( + end - start < Duration::from_secs(10), + "take = {:?}", + end - start + ); + } + #[test] fn failed_during_refresh_region() { defer! { From 693ae46f2739d3f8a493589aff57edcdafc8e12a Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Thu, 11 Aug 2022 18:10:49 +0800 Subject: [PATCH 148/676] pd-client: tikv should continue if cluster-id is zero. (#13242) close tikv/tikv#13240 using warn to replace panic. 
Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ti Chi Robot --- components/pd_client/src/util.rs | 47 ++++++++++++++++++------ tests/integrations/pd/test_rpc_client.rs | 14 +++++++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index e4145f16c0d..7f7ef9a5db5 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -585,6 +585,12 @@ impl PdConnector { .keepalive_timeout(Duration::from_secs(3)); self.security_mgr.connect(cb, addr_trim) }; + fail_point!("cluster_id_is_not_ready", |_| { + Ok(( + PdClientStub::new(channel.clone()), + GetMembersResponse::default(), + )) + }); let client = PdClientStub::new(channel); let option = CallOption::default().timeout(Duration::from_secs(REQUEST_TIMEOUT)); let response = client @@ -597,6 +603,13 @@ impl PdConnector { } } + // load_members returns the PD members by calling getMember, there are two + // abnormal scenes for the reponse: + // 1. header has an error: the PD is not ready to serve. + // 2. cluster id is zero: etcd start server but the follower did not get + // cluster id yet. + // In this case, load_members should return an error, so the client + // will not update client address. pub async fn load_members(&self, previous: &GetMembersResponse) -> Result { let previous_leader = previous.get_leader(); let members = previous.get_members(); @@ -611,18 +624,30 @@ impl PdConnector { for ep in m.get_client_urls() { match self.connect(ep.as_str()).await { Ok((_, r)) => { - let new_cluster_id = r.get_header().get_cluster_id(); - if new_cluster_id == cluster_id { - // check whether the response have leader info, otherwise continue to - // loop the rest members - if r.has_leader() { - return Ok(r); - } + let header = r.get_header(); + // Try next follower endpoint if the cluster has not ready since this pr: + // pd#5412. 
+ if let Err(e) = check_resp_header(header) { + error!("connect pd failed";"endpoints" => ep, "error" => ?e); } else { - panic!( - "{} no longer belongs to cluster {}, it is in {}", - ep, cluster_id, new_cluster_id - ); + let new_cluster_id = header.get_cluster_id(); + // it is new cluster if the new cluster id is zero. + if cluster_id == 0 || new_cluster_id == cluster_id { + // check whether the response have leader info, otherwise continue + // to loop the rest members + if r.has_leader() { + return Ok(r); + } + // Try next endpoint if PD server returns the + // cluster id is zero without any error. + } else if new_cluster_id == 0 { + error!("{} connect success, but cluster id is not ready", ep); + } else { + panic!( + "{} no longer belongs to cluster {}, it is in {}", + ep, cluster_id, new_cluster_id + ); + } } } Err(e) => { diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 57566b91e75..5f44cc0137b 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -476,6 +476,20 @@ fn test_change_leader_async() { panic!("failed, leader should changed"); } +#[test] +fn test_pd_client_ok_when_cluster_not_ready() { + let pd_client_cluster_id_zero = "cluster_id_is_not_ready"; + let server = MockServer::with_case(3, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + fail::cfg(pd_client_cluster_id_zero, "return()").unwrap(); + // wait 100ms to let client load member. 
+ thread::sleep(Duration::from_millis(101)); + assert_eq!(client.reconnect().is_err(), true); + fail::remove(pd_client_cluster_id_zero); +} + #[test] fn test_pd_client_heartbeat_send_failed() { let pd_client_send_fail_fp = "region_heartbeat_send_failed"; From 1ffa3034bec416268af751d319c8d7a2bc2ca464 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Thu, 11 Aug 2022 18:32:49 +0800 Subject: [PATCH 149/676] pd-client: remove `call_option` to avoid deadlock(RWR). (#13249) close tikv/tikv#13191, ref rust-lang/rust#93740 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ti Chi Robot --- components/pd_client/src/client.rs | 78 ++++++++++++------------------ components/pd_client/src/util.rs | 17 +++++-- 2 files changed, 46 insertions(+), 49 deletions(-) diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index ca997e473e9..942ab0269be 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -19,7 +19,7 @@ use futures::{ sink::SinkExt, stream::StreamExt, }; -use grpcio::{CallOption, EnvBuilder, Environment, WriteFlags}; +use grpcio::{EnvBuilder, Environment, WriteFlags}; use kvproto::{ metapb, pdpb::{self, Member}, @@ -37,7 +37,7 @@ use yatp::{task::future::TaskCell, ThreadPool}; use super::{ metrics::*, - util::{check_resp_header, sync_request, Client, Inner, PdConnector}, + util::{call_option_inner, check_resp_header, sync_request, Client, PdConnector}, BucketStat, Config, Error, FeatureGate, PdClient, PdFuture, RegionInfo, RegionStat, Result, UnixSecs, REQUEST_TIMEOUT, }; @@ -189,20 +189,6 @@ impl RpcClient { block_on(self.pd_client.reconnect(true)) } - /// Creates a new call option with default request timeout. 
- #[inline] - pub fn call_option(client: &Client) -> CallOption { - Self::call_option_inner(&client.inner.rl()) - } - - #[inline] - fn call_option_inner(inner: &Inner) -> CallOption { - inner - .target_info() - .call_option() - .timeout(Duration::from_secs(REQUEST_TIMEOUT)) - } - /// Gets given key's Region and Region's leader from PD. fn get_region_and_leader( &self, @@ -221,7 +207,7 @@ impl RpcClient { let inner = client.inner.rl(); inner .client_stub - .get_region_async_opt(&req, Self::call_option_inner(&inner)) + .get_region_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) }) @@ -261,7 +247,7 @@ impl RpcClient { let inner = client.inner.rl(); inner .client_stub - .get_store_async_opt(&req, Self::call_option_inner(&inner)) + .get_store_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "get_store_async", e) }) @@ -339,7 +325,7 @@ impl PdClient for RpcClient { ) -> Result> { use kvproto::pdpb::WatchGlobalConfigRequest; let req = WatchGlobalConfigRequest::default(); - sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { + sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, _| { client.watch_global_config(&req) }) } @@ -362,8 +348,8 @@ impl PdClient for RpcClient { req.set_store(stores); req.set_region(region); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.bootstrap_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.bootstrap_opt(&req, option) })?; check_resp_header(resp.get_header())?; Ok(resp.replication_status.take()) @@ -377,8 +363,8 @@ impl PdClient for RpcClient { let mut req = pdpb::IsBootstrappedRequest::default(); req.set_header(self.header()); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.is_bootstrapped_opt(&req, 
Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.is_bootstrapped_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -393,8 +379,8 @@ impl PdClient for RpcClient { let mut req = pdpb::AllocIdRequest::default(); req.set_header(self.header()); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.alloc_id_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.alloc_id_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -414,8 +400,8 @@ impl PdClient for RpcClient { req.set_header(self.header()); req.set_store(store); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.put_store_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.put_store_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -431,8 +417,8 @@ impl PdClient for RpcClient { req.set_header(self.header()); req.set_store_id(store_id); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_store_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_store_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -457,8 +443,8 @@ impl PdClient for RpcClient { req.set_header(self.header()); req.set_exclude_tombstone_stores(exclude_tombstone); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_all_stores_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_all_stores_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -473,8 +459,8 @@ impl PdClient for RpcClient { let mut req = 
pdpb::GetClusterConfigRequest::default(); req.set_header(self.header()); - let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_cluster_config_opt(&req, Self::call_option(&self.pd_client)) + let mut resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_cluster_config_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -511,7 +497,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .get_region_by_id_async_opt(&req, Self::call_option_inner(&inner)) + .get_region_by_id_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "get_region_by_id", e); }) @@ -550,7 +536,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .get_region_by_id_async_opt(&req, Self::call_option_inner(&inner)) + .get_region_by_id_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "get_region_by_id", e) }) @@ -688,7 +674,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .ask_split_async_opt(&req, Self::call_option_inner(&inner)) + .ask_split_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "ask_split", e)) }; @@ -724,7 +710,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .ask_batch_split_async_opt(&req, Self::call_option_inner(&inner)) + .ask_batch_split_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "ask_batch_split", e) }) @@ -771,7 +757,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .store_heartbeat_async_opt(&req, Self::call_option_inner(&inner)) + .store_heartbeat_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "store_heartbeat", e) }) @@ -808,7 +794,7 
@@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .report_batch_split_async_opt(&req, Self::call_option_inner(&inner)) + .report_batch_split_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "report_batch_split", e) }) @@ -841,8 +827,8 @@ impl PdClient for RpcClient { } req.set_region(region.region); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.scatter_region_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.scatter_region_opt(&req, option) })?; check_resp_header(resp.get_header()) } @@ -862,7 +848,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .get_gc_safe_point_async_opt(&req, Self::call_option_inner(&inner)) + .get_gc_safe_point_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e) }) @@ -895,8 +881,8 @@ impl PdClient for RpcClient { req.set_header(self.header()); req.set_region_id(region_id); - let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client| { - client.get_operator_opt(&req, Self::call_option(&self.pd_client)) + let resp = sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, option| { + client.get_operator_opt(&req, option) })?; check_resp_header(resp.get_header())?; @@ -950,7 +936,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .update_service_gc_safe_point_async_opt(&r, Self::call_option_inner(&inner)) + .update_service_gc_safe_point_async_opt(&r, call_option_inner(&inner)) .unwrap_or_else(|e| { panic!( "fail to request PD {} err {:?}", @@ -989,7 +975,7 @@ impl PdClient for RpcClient { let inner = client.inner.rl(); inner .client_stub - .report_min_resolved_ts_async_opt(&req, Self::call_option_inner(&inner)) + .report_min_resolved_ts_async_opt(&req, 
call_option_inner(&inner)) .unwrap_or_else(|e| { panic!("fail to request PD {} err {:?}", "min_resolved_ts", e) }) diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 7f7ef9a5db5..fec63383891 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -472,10 +472,17 @@ where } } +pub fn call_option_inner(inner: &Inner) -> CallOption { + inner + .target_info() + .call_option() + .timeout(Duration::from_secs(REQUEST_TIMEOUT)) +} + /// Do a request in synchronized fashion. pub fn sync_request(client: &Client, mut retry: usize, func: F) -> Result where - F: Fn(&PdClientStub) -> GrpcResult, + F: Fn(&PdClientStub, CallOption) -> GrpcResult, { loop { let ret = { @@ -483,8 +490,12 @@ where // thread which may hold the read lock and wait for PD client thread // completing the request and the PD client thread which may block // on acquiring the write lock. - let client_stub = client.inner.rl().client_stub.clone(); - func(&client_stub).map_err(Error::Grpc) + let (client_stub, option) = { + let inner = client.inner.rl(); + (inner.client_stub.clone(), call_option_inner(&inner)) + }; + + func(&client_stub, option).map_err(Error::Grpc) }; match ret { Ok(r) => { From 38655bff985289560d6a6095bff54665d9cfd254 Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 12 Aug 2022 09:46:49 +0800 Subject: [PATCH 150/676] raftstore: Use thread-local metrics for local read (#13244) ref tikv/tikv#12876 Use tls local read metrics Signed-off-by: Connor1996 --- .../raftstore-v2/src/operation/read/local.rs | 2 +- components/raftstore/src/store/mod.rs | 6 +- .../raftstore/src/store/worker/metrics.rs | 51 ++- components/raftstore/src/store/worker/mod.rs | 2 +- components/raftstore/src/store/worker/read.rs | 348 +++++++----------- 5 files changed, 183 insertions(+), 226 deletions(-) diff --git a/components/raftstore-v2/src/operation/read/local.rs b/components/raftstore-v2/src/operation/read/local.rs index 56a5f01a7fd..2e694f11ebc 100644 --- 
a/components/raftstore-v2/src/operation/read/local.rs +++ b/components/raftstore-v2/src/operation/read/local.rs @@ -29,7 +29,7 @@ use raftstore::{ store::{ cmd_resp, util::{self, LeaseState, RegionReadProgress, RemoteLease}, - ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadMetrics, ReadProgress, ReadResponse, + ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, ReadResponse, RegionSnapshot, RequestInspector, RequestPolicy, TrackVer, TxnExt, }, Error, Result, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index d47cc892033..ad730206175 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -70,8 +70,8 @@ pub use self::{ AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, LocalReader, PdTask, QueryStats, RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, - ReadExecutor, ReadExecutorProvider, ReadMetrics, ReadProgress, ReadStats, - RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, - SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, + TrackVer, WriteStats, }, }; diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index e119fcdc3ab..fa27ea340b8 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -1,8 +1,11 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. +use std::{cell::RefCell, time::Duration}; + use lazy_static::lazy_static; -use prometheus::*; +use prometheus::{local::LocalIntCounter, *}; use prometheus_static_metric::*; +use tikv_util::time::Instant; make_auto_flush_static_metric! 
{ pub label_enum SnapType { @@ -44,14 +47,54 @@ make_static_metric! { epoch, applied_term, channel_full, + cache_miss, safe_ts, } - pub struct ReadRejectCounter : IntCounter { - "reason" => RejectReason + pub struct LocalReadRejectCounter : LocalIntCounter { + "reason" => RejectReason, } } +pub struct LocalReadMetrics { + pub local_executed_requests: LocalIntCounter, + pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_snapshot_cache_hit: LocalIntCounter, + pub reject_reason: LocalReadRejectCounter, + pub renew_lease_advance: LocalIntCounter, + last_flush_time: Instant, +} + +thread_local! { + pub static TLS_LOCAL_READ_METRICS: RefCell = RefCell::new( + LocalReadMetrics { + local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), + local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), + reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), + renew_lease_advance: LOCAL_READ_RENEW_LEASE_ADVANCE_COUNTER.local(), + last_flush_time: Instant::now_coarse(), + } + ); +} + +const METRICS_FLUSH_INTERVAL: u64 = 15_000; // 15s + +pub fn maybe_tls_local_read_metrics_flush() { + TLS_LOCAL_READ_METRICS.with(|m| { + let mut m = m.borrow_mut(); + + if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { + m.local_executed_requests.flush(); + m.local_executed_stale_read_requests.flush(); + m.local_executed_snapshot_cache_hit.flush(); + m.reject_reason.flush(); + m.renew_lease_advance.flush(); + m.last_flush_time = Instant::now_coarse(); + } + }); +} + lazy_static! { pub static ref SNAP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_raftstore_snapshot_total", @@ -111,8 +154,6 @@ lazy_static! 
{ &["reason"] ) .unwrap(); - pub static ref LOCAL_READ_REJECT: ReadRejectCounter = - ReadRejectCounter::from(&LOCAL_READ_REJECT_VEC); pub static ref LOCAL_READ_EXECUTED_REQUESTS: IntCounter = register_int_counter!( "tikv_raftstore_local_read_executed_requests", "Total number of requests directly executed by local reader." diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 1651183f976..2298710ad63 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -34,7 +34,7 @@ pub use self::{ raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ CachedReadDelegate, LocalReadContext, LocalReader, Progress as ReadProgress, ReadDelegate, - ReadExecutor, ReadExecutorProvider, ReadMetrics, StoreMetaDelegate, TrackVer, + ReadExecutor, ReadExecutorProvider, StoreMetaDelegate, TrackVer, }, refresh_config::{ BatchComponent as RaftStoreBatchComponent, Runner as RefreshConfigRunner, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index de1cb1011ae..3c5c05f4717 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -9,7 +9,6 @@ use std::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, - time::Duration, }; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; @@ -26,7 +25,7 @@ use tikv_util::{ codec::number::decode_u64, debug, error, lru::LruCache, - time::{monotonic_raw_now, Instant, ThreadReadId}, + time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; @@ -213,7 +212,6 @@ pub struct LocalReadContext<'a, E> where E: KvEngine, { - metrics: &'a mut ReadMetrics, read_id: &'a mut ThreadReadId, snap_cache: &'a mut Box>>, } @@ -397,7 +395,6 @@ impl ReadDelegate { &self, router: &dyn CasualRouter, ts: Timespec, - metrics: &mut ReadMetrics, ) { if !self .leader_lease @@ -407,7 +404,7 @@ impl ReadDelegate { { return; } - 
metrics.renew_lease_advance += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); let region_id = self.region.get_id(); if let Err(e) = router.send(region_id, CasualMessage::RenewLease) { debug!( @@ -418,18 +415,19 @@ impl ReadDelegate { } } - pub fn is_in_leader_lease(&self, ts: Timespec, metrics: &mut ReadMetrics) -> bool { + pub fn is_in_leader_lease(&self, ts: Timespec) -> bool { if let Some(ref lease) = self.leader_lease { let term = lease.term(); if term == self.term { if lease.inspect(Some(ts)) == LeaseState::Valid { return true; } else { - metrics.rejected_by_lease_expire += 1; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().reject_reason.lease_expire.inc()); debug!("rejected by lease expire"; "tag" => &self.tag); } } else { - metrics.rejected_by_term_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.term_mismatch.inc()); debug!("rejected by term mismatch"; "tag" => &self.tag); } } @@ -440,7 +438,6 @@ impl ReadDelegate { pub fn check_stale_read_safe( &self, read_ts: u64, - metrics: &mut ReadMetrics, ) -> std::result::Result<(), ReadResponse> { let safe_ts = self.read_progress.safe_ts(); if safe_ts >= read_ts { @@ -452,7 +449,7 @@ impl ReadDelegate { "safe ts" => safe_ts, "read ts" => read_ts ); - metrics.rejected_by_safe_timestamp += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.safe_ts.inc()); let mut response = cmd_resp::new_error(Error::DataIsNotReady { region_id: self.region.get_id(), peer_id: self.peer_id, @@ -549,7 +546,6 @@ where pub store_id: Cell>, store_meta: S, kv_engine: E, - pub metrics: ReadMetrics, // region id -> ReadDelegate // The use of `Arc` here is a workaround, see the comment at `get_delegate` pub delegates: LruCache, @@ -573,11 +569,12 @@ where read_context: &mut Option>, ) -> Arc { let ctx = read_context.as_mut().unwrap(); - ctx.metrics.local_executed_requests += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); if let 
Some(ts) = create_time { if ts == *ctx.read_id { if let Some(snap) = ctx.snap_cache.as_ref().as_ref() { - ctx.metrics.local_executed_snapshot_cache_hit += 1; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); return snap.clone(); } } @@ -606,7 +603,6 @@ where snap_cache: Box::new(None), cache_read_id, store_id: Cell::new(None), - metrics: Default::default(), delegates: LruCache::with_capacity_and_sample(0, 7), } } @@ -618,14 +614,14 @@ where match ProposalRouter::send(&self.router, cmd) { Ok(()) => return, Err(TrySendError::Full(c)) => { - self.metrics.rejected_by_channel_full += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); err.set_message(RAFTSTORE_IS_BUSY.to_owned()); err.mut_server_is_busy() .set_reason(RAFTSTORE_IS_BUSY.to_owned()); cmd = c; } Err(TrySendError::Disconnected(c)) => { - self.metrics.rejected_by_no_region += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); err.set_message(format!("region {} is missing", region_id)); err.mut_region_not_found().set_region_id(region_id); cmd = c; @@ -655,7 +651,7 @@ where Some(d) if !d.track_ver.any_new() => Some(d.clone()), _ => { debug!("update local read delegate"; "region_id" => region_id); - self.metrics.rejected_by_cache_miss += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.cache_miss.inc()); let (meta_len, meta_reader) = { self.store_meta.get_executor_and_len(region_id) }; @@ -687,7 +683,7 @@ where let store_id = self.store_id.get().unwrap(); if let Err(e) = util::check_store_id(req, store_id) { - self.metrics.rejected_by_store_id_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.store_id_mismatch.inc()); debug!("rejected by store id not match"; "err" => %e); return Err(e); } @@ -697,7 +693,7 @@ where let delegate = match self.get_delegate(region_id) { Some(d) => d, None => { - self.metrics.rejected_by_no_region += 1; + 
TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); debug!("rejected by no region"; "region_id" => region_id); return Ok(None); } @@ -707,7 +703,7 @@ where // Check peer id. if let Err(e) = util::check_peer_id(req, delegate.peer_id) { - self.metrics.rejected_by_peer_id_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.peer_id_mismatch.inc()); return Err(e); } @@ -718,13 +714,13 @@ where "delegate_term" => delegate.term, "header_term" => req.get_header().get_term(), ); - self.metrics.rejected_by_term_mismatch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.term_mismatch.inc()); return Err(e); } // Check region epoch. if util::check_region_epoch(req, &delegate.region, false).is_err() { - self.metrics.rejected_by_epoch += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.epoch.inc()); // Stale epoch, redirect it to raftstore to get the latest region. debug!("rejected by epoch not match"; "tag" => &delegate.tag); return Ok(None); @@ -732,7 +728,6 @@ where let mut inspector = Inspector { delegate: &delegate, - metrics: &mut self.metrics, }; match inspector.inspect(req) { Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), @@ -766,14 +761,13 @@ where } None => monotonic_raw_now(), }; - if !delegate.is_in_leader_lease(snapshot_ts, &mut self.metrics) { + if !delegate.is_in_leader_lease(snapshot_ts) { // Forward to raftstore. 
self.redirect(RaftCommand::new(req, cb)); return; } delegate_ext = LocalReadContext { - metrics: &mut self.metrics, snap_cache: &mut self.snap_cache, read_id: &mut self.cache_read_id, }; @@ -782,27 +776,19 @@ where let response = delegate.execute(&req, ®ion, None, read_id, Some(delegate_ext)); // Try renew lease in advance - - delegate.maybe_renew_lease_advance( - &self.router, - snapshot_ts, - &mut self.metrics, - ); + delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); response } // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); assert!(read_ts > 0); - if let Err(resp) = - delegate.check_stale_read_safe(read_ts, &mut self.metrics) - { + if let Err(resp) = delegate.check_stale_read_safe(read_ts) { cb.invoke_read(resp); return; } delegate_ext = LocalReadContext { - metrics: &mut self.metrics, snap_cache: &mut self.snap_cache, read_id: &mut self.cache_read_id, }; @@ -814,13 +800,12 @@ where // Double check in case `safe_ts` change after the first check and before // getting snapshot - if let Err(resp) = - delegate.check_stale_read_safe(read_ts, &mut self.metrics) - { + if let Err(resp) = delegate.check_stale_read_safe(read_ts) { cb.invoke_read(resp); return; } - self.metrics.local_executed_stale_read_requests += 1; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); response } _ => unreachable!(), @@ -863,7 +848,7 @@ where cb: Callback, ) { self.propose_raft_command(read_id, req, cb); - self.metrics.maybe_flush(); + maybe_tls_local_read_metrics_flush(); } pub fn release_snapshot_cache(&mut self) { @@ -884,7 +869,6 @@ where kv_engine: self.kv_engine.clone(), router: self.router.clone(), store_id: self.store_id.clone(), - metrics: Default::default(), delegates: LruCache::with_capacity_and_sample(0, 7), snap_cache: self.snap_cache.clone(), cache_read_id: self.cache_read_id.clone(), @@ 
-893,12 +877,11 @@ where } /// #[RaftstoreCommon] -struct Inspector<'r, 'm> { +struct Inspector<'r> { delegate: &'r ReadDelegate, - metrics: &'m mut ReadMetrics, } -impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { +impl<'r> RequestInspector for Inspector<'r> { fn has_applied_to_current_term(&mut self) -> bool { if self.delegate.applied_term == self.delegate.term { true @@ -911,7 +894,7 @@ impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { ); // only for metric. - self.metrics.rejected_by_applied_term += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.applied_term.inc()); false } } @@ -923,146 +906,12 @@ impl<'r, 'm> RequestInspector for Inspector<'r, 'm> { LeaseState::Valid } else { debug!("rejected by leader lease"; "tag" => &self.delegate.tag); - self.metrics.rejected_by_no_lease += 1; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_lease.inc()); LeaseState::Expired } } } -const METRICS_FLUSH_INTERVAL: u64 = 15_000; // 15s - -/// #[RaftstoreCommon] -#[derive(Clone)] -pub struct ReadMetrics { - pub local_executed_requests: u64, - pub local_executed_stale_read_requests: u64, - pub local_executed_snapshot_cache_hit: u64, - // TODO: record rejected_by_read_quorum. 
- pub rejected_by_store_id_mismatch: u64, - pub rejected_by_peer_id_mismatch: u64, - pub rejected_by_term_mismatch: u64, - pub rejected_by_lease_expire: u64, - pub rejected_by_no_region: u64, - pub rejected_by_no_lease: u64, - pub rejected_by_epoch: u64, - pub rejected_by_applied_term: u64, - pub rejected_by_channel_full: u64, - pub rejected_by_cache_miss: u64, - pub rejected_by_safe_timestamp: u64, - pub renew_lease_advance: u64, - - pub last_flush_time: Instant, -} - -impl Default for ReadMetrics { - fn default() -> ReadMetrics { - ReadMetrics { - local_executed_requests: 0, - local_executed_stale_read_requests: 0, - local_executed_snapshot_cache_hit: 0, - rejected_by_store_id_mismatch: 0, - rejected_by_peer_id_mismatch: 0, - rejected_by_term_mismatch: 0, - rejected_by_lease_expire: 0, - rejected_by_no_region: 0, - rejected_by_no_lease: 0, - rejected_by_epoch: 0, - rejected_by_applied_term: 0, - rejected_by_channel_full: 0, - rejected_by_cache_miss: 0, - rejected_by_safe_timestamp: 0, - renew_lease_advance: 0, - last_flush_time: Instant::now(), - } - } -} - -impl ReadMetrics { - pub fn maybe_flush(&mut self) { - if self.last_flush_time.saturating_elapsed() - >= Duration::from_millis(METRICS_FLUSH_INTERVAL) - { - self.flush(); - self.last_flush_time = Instant::now(); - } - } - - fn flush(&mut self) { - if self.rejected_by_store_id_mismatch > 0 { - LOCAL_READ_REJECT - .store_id_mismatch - .inc_by(self.rejected_by_store_id_mismatch); - self.rejected_by_store_id_mismatch = 0; - } - if self.rejected_by_peer_id_mismatch > 0 { - LOCAL_READ_REJECT - .peer_id_mismatch - .inc_by(self.rejected_by_peer_id_mismatch); - self.rejected_by_peer_id_mismatch = 0; - } - if self.rejected_by_term_mismatch > 0 { - LOCAL_READ_REJECT - .term_mismatch - .inc_by(self.rejected_by_term_mismatch); - self.rejected_by_term_mismatch = 0; - } - if self.rejected_by_lease_expire > 0 { - LOCAL_READ_REJECT - .lease_expire - .inc_by(self.rejected_by_lease_expire); - self.rejected_by_lease_expire = 0; 
- } - if self.rejected_by_no_region > 0 { - LOCAL_READ_REJECT - .no_region - .inc_by(self.rejected_by_no_region); - self.rejected_by_no_region = 0; - } - if self.rejected_by_no_lease > 0 { - LOCAL_READ_REJECT.no_lease.inc_by(self.rejected_by_no_lease); - self.rejected_by_no_lease = 0; - } - if self.rejected_by_epoch > 0 { - LOCAL_READ_REJECT.epoch.inc_by(self.rejected_by_epoch); - self.rejected_by_epoch = 0; - } - if self.rejected_by_applied_term > 0 { - LOCAL_READ_REJECT - .applied_term - .inc_by(self.rejected_by_applied_term); - self.rejected_by_applied_term = 0; - } - if self.rejected_by_channel_full > 0 { - LOCAL_READ_REJECT - .channel_full - .inc_by(self.rejected_by_channel_full); - self.rejected_by_channel_full = 0; - } - if self.rejected_by_safe_timestamp > 0 { - LOCAL_READ_REJECT - .safe_ts - .inc_by(self.rejected_by_safe_timestamp); - self.rejected_by_safe_timestamp = 0; - } - if self.local_executed_snapshot_cache_hit > 0 { - LOCAL_READ_EXECUTED_CACHE_REQUESTS.inc_by(self.local_executed_snapshot_cache_hit); - self.local_executed_snapshot_cache_hit = 0; - } - if self.local_executed_requests > 0 { - LOCAL_READ_EXECUTED_REQUESTS.inc_by(self.local_executed_requests); - self.local_executed_requests = 0; - } - if self.local_executed_stale_read_requests > 0 { - LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.inc_by(self.local_executed_stale_read_requests); - self.local_executed_stale_read_requests = 0; - } - if self.renew_lease_advance > 0 { - LOCAL_READ_RENEW_LEASE_ADVANCE_COUNTER.inc_by(self.renew_lease_advance); - self.renew_lease_advance = 0; - } - } -} - #[cfg(test)] mod tests { use std::{sync::mpsc::*, thread}; @@ -1234,8 +1083,14 @@ mod tests { // The region is not register yet. 
must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_no_region, 1); - assert_eq!(reader.metrics.rejected_by_cache_miss, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 1 + ); assert!(reader.delegates.get(&1).is_none()); // Register region 1 @@ -1264,8 +1119,14 @@ mod tests { // The applied_term is stale must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_cache_miss, 2); - assert_eq!(reader.metrics.rejected_by_applied_term, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 2 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.applied_term.get()), + 1 + ); // Make the applied_term matches current term. let pg = Progress::applied_term(term6); @@ -1276,7 +1137,10 @@ mod tests { let task = RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_cache_miss, 3); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); // Let's read. let task = RaftCommand::::new( @@ -1291,7 +1155,10 @@ mod tests { // Wait for expiration. thread::sleep(Duration::seconds(1).to_std().unwrap()); must_redirect(&mut reader, &rx, cmd.clone()); - assert_eq!(reader.metrics.rejected_by_lease_expire, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); // Renew lease. 
lease.renew(monotonic_raw_now()); @@ -1311,8 +1178,14 @@ mod tests { assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_store_id_mismatch, 1); - assert_eq!(reader.metrics.rejected_by_cache_miss, 3); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.store_id_mismatch.get()), + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); // metapb::Peer id mismatch. let mut cmd_peer_id = cmd.clone(); @@ -1332,7 +1205,10 @@ mod tests { assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_peer_id_mismatch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.peer_id_mismatch.get()), + 1 + ); // Read quorum. let mut cmd_read_quorum = cmd.clone(); @@ -1351,7 +1227,10 @@ mod tests { assert!(resp.snapshot.is_none()); })), ); - assert_eq!(reader.metrics.rejected_by_term_mismatch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()), + 1 + ); // Stale epoch. let mut epoch12 = epoch13; @@ -1359,15 +1238,19 @@ mod tests { let mut cmd_epoch = cmd.clone(); cmd_epoch.mut_header().set_region_epoch(epoch12); must_redirect(&mut reader, &rx, cmd_epoch); - assert_eq!(reader.metrics.rejected_by_epoch, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.epoch.get()), + 1 + ); // Expire lease manually, and it can not be renewed. 
- let previous_lease_rejection = reader.metrics.rejected_by_lease_expire; + let previous_lease_rejection = + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()); lease.expire(); lease.renew(monotonic_raw_now()); must_redirect(&mut reader, &rx, cmd.clone()); assert_eq!( - reader.metrics.rejected_by_lease_expire, + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), previous_lease_rejection + 1 ); @@ -1384,10 +1267,14 @@ mod tests { ); rx.try_recv().unwrap(); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); - assert_eq!(reader.metrics.rejected_by_channel_full, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.channel_full.get()), + 1 + ); // Reject by term mismatch in lease. - let previous_term_rejection = reader.metrics.rejected_by_term_mismatch; + let previous_term_rejection = + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()); let mut cmd9 = cmd.clone(); cmd9.mut_header().set_term(term6 + 3); { @@ -1415,10 +1302,13 @@ mod tests { cmd9 ); assert_eq!( - reader.metrics.rejected_by_term_mismatch, - previous_term_rejection + 1, + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.term_mismatch.get()), + previous_term_rejection + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 4 ); - assert_eq!(reader.metrics.rejected_by_cache_miss, 4); // Stale local ReadDelegate cmd.mut_header().set_term(term6 + 3); @@ -1432,10 +1322,16 @@ mod tests { let task = RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_cache_miss, 5); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 5 + ); // Stale read - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 0); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 0 + ); 
read_progress.update_safe_ts(1, 1); assert_eq!(read_progress.safe_ts(), 1); @@ -1456,13 +1352,19 @@ mod tests { })), ); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); read_progress.update_safe_ts(1, 2); assert_eq!(read_progress.safe_ts(), 2); let task = RaftCommand::::new(cmd, Callback::Read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); - assert_eq!(reader.metrics.rejected_by_safe_timestamp, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); // Remove invalid delegate let reader_clone = store_meta.lock().unwrap().readers.get(&1).unwrap().clone(); @@ -1573,13 +1475,11 @@ mod tests { } let mut read_id = ThreadReadId::new(); - let mut read_metrics = ReadMetrics::default(); let mut snap_cache = Box::new(None); let read_id_copy = Some(read_id.clone()); let mut read_context = Some(LocalReadContext { - metrics: &mut read_metrics, read_id: &mut read_id, snap_cache: &mut snap_cache, }); @@ -1605,8 +1505,14 @@ mod tests { ); assert!(snap_cache.as_ref().is_some()); - assert_eq!(read_metrics.local_executed_requests, 2); - assert_eq!(read_metrics.local_executed_snapshot_cache_hit, 1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_requests.get()), + 2 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 1 + ); } #[test] @@ -1643,7 +1549,6 @@ mod tests { { let mut read_context = Some(LocalReadContext { - metrics: &mut reader.metrics, snap_cache: &mut reader.snap_cache, read_id: &mut reader.cache_read_id, }); @@ -1654,13 +1559,15 @@ mod tests { } } // We should hit cache 9 times - assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 9); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 9 + ); let read_id = 
Some(ThreadReadId::new()); { let read_context = LocalReadContext { - metrics: &mut reader.metrics, snap_cache: &mut reader.snap_cache, read_id: &mut reader.cache_read_id, }; @@ -1668,40 +1575,49 @@ mod tests { let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); } // This time, we will miss the cache - assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 9); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 9 + ); { let read_context = LocalReadContext { - metrics: &mut reader.metrics, snap_cache: &mut reader.snap_cache, read_id: &mut reader.cache_read_id, }; let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); // We can hit it again. - assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 10); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 10 + ); } reader.release_snapshot_cache(); { let read_context = LocalReadContext { - metrics: &mut reader.metrics, snap_cache: &mut reader.snap_cache, read_id: &mut reader.cache_read_id, }; let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); } // After release, we will mss the cache even with the prevsiou read_id. - assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 10); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 10 + ); { let read_context = LocalReadContext { - metrics: &mut reader.metrics, snap_cache: &mut reader.snap_cache, read_id: &mut reader.cache_read_id, }; let _ = delegate.get_snapshot(read_id, &mut Some(read_context)); } // We can hit it again. 
- assert_eq!(reader.metrics.local_executed_snapshot_cache_hit, 11); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), + 11 + ); } } From a8cd9645ef27617e12f73b2e25de1ba9793ecf82 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Fri, 12 Aug 2022 16:40:50 +0800 Subject: [PATCH 151/676] metric: fix the panel description for gc compaction filter (#13275) close tikv/tikv#13274 fix the panel description for gc compaction filter Signed-off-by: cfzjywxk --- metrics/grafana/tikv_details.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 0291aa87590..b47c226cb02 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -22518,7 +22518,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, - "description": "SafePoint used for TiKV's Auto GC", + "description": "Keys handled in GC compaction filter", "fill": 0, "gridPos": { "h": 7, @@ -22569,14 +22569,14 @@ "expr": "sum(rate(tikv_gc_compaction_filter_skip{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "skip", + "legendFormat": "skipped", "refId": "B" }, { "expr": "sum(rate(tikv_gc_compaction_mvcc_rollback{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "rollback/lock", + "legendFormat": "mvcc-rollback/mvcc-lock", "refId": "C" }, { @@ -22590,7 +22590,7 @@ "expr": "sum(rate(tikv_gc_compaction_filter_perform{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, - "legendFormat": "perform", + "legendFormat": "performed-times", "refId": "E" }, { From 594fca7348fb756bb44b2a6778a3e938aa5215b4 Mon Sep 17 00:00:00 2001 From: Xinye Tao 
Date: Fri, 12 Aug 2022 18:10:49 +0800 Subject: [PATCH 152/676] raftstore: simplify v1 propose path (#13230) ref tikv/tikv#12842 Signed-off-by: tabokie --- components/backup-stream/Cargo.toml | 4 +- components/backup-stream/src/endpoint.rs | 1 - components/backup-stream/src/event_loader.rs | 1 - components/backup-stream/src/router.rs | 14 -- .../backup-stream/src/subscription_manager.rs | 36 ++--- components/batch-system/src/batch.rs | 120 +++++++------- components/batch-system/src/fsm.rs | 56 ++++--- components/batch-system/src/mailbox.rs | 21 ++- components/batch-system/src/router.rs | 32 ++-- components/cloud/aws/src/s3.rs | 1 - components/raftstore-v2/src/batch/store.rs | 45 +++--- components/raftstore-v2/src/fsm/peer.rs | 4 +- components/raftstore/src/store/fsm/apply.rs | 10 +- components/raftstore/src/store/fsm/peer.rs | 100 ++++++------ components/raftstore/src/store/peer.rs | 153 ++++++++---------- components/raftstore/src/store/worker/pd.rs | 6 - .../src/store/worker/split_controller.rs | 3 - src/server/raft_client.rs | 1 - .../cases/test_cmd_epoch_checker.rs | 28 ++-- 19 files changed, 307 insertions(+), 329 deletions(-) diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 7fe221842ce..b0b6fc3f13f 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -8,7 +8,7 @@ default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] test-engine-kv-rocksdb = ["tikv/test-engine-kv-rocksdb"] test-engine-raft-raft-engine = ["tikv/test-engine-raft-raft-engine"] test-engines-rocksdb = ["tikv/test-engines-rocksdb"] -failpoints = ["tikv/failpoints", "fail/failpoints", "fail"] +failpoints = ["tikv/failpoints", "fail/failpoints"] backup-stream-debug = [] [[test]] @@ -34,7 +34,7 @@ error_code = { path = "../error_code" } etcd-client = { git = "https://github.com/yujuncen/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } 
external_storage = { path = "../external_storage", default-features = false } external_storage_export = { path = "../external_storage/export", default-features = false } -fail = { version = "0.5", optional = true } +fail = "0.5" file_system = { path = "../file_system" } futures = "0.3" futures-io = "0.3" diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 281bf2e77f6..81374484463 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -766,7 +766,6 @@ where async move { let mut resolved = get_rts.await?; let mut new_rts = resolved.global_checkpoint(); - #[cfg(feature = "failpoints")] fail::fail_point!("delay_on_flush"); flush_ob.before(resolved.take_region_checkpoints()).await; if let Some(rewritten_rts) = flush_ob.rewrite_resolved_ts(&task).await { diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 05b370e2985..61e227af1ac 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -383,7 +383,6 @@ where let mut stats = StatisticsSummary::default(); let start = Instant::now(); loop { - #[cfg(feature = "failpoints")] fail::fail_point!("scan_and_async_send", |msg| Err(Error::Other(box_err!( "{:?}", msg )))); diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index d5486cecddb..f1280103e89 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -688,20 +688,6 @@ impl TempFileKey { use chrono::prelude::*; let millis = TimeStamp::physical(ts.into()); let dt = Utc.timestamp_millis(millis as _); - - #[cfg(feature = "failpoints")] - { - fail::fail_point!("stream_format_date_time", |s| { - return dt - .format(&s.unwrap_or_else(|| "%Y%m".to_owned())) - .to_string(); - }); - match t { - FormatType::Date => dt.format("%Y%m%d").to_string(), - FormatType::Hour => 
dt.format("%H").to_string(), - } - } - #[cfg(not(feature = "failpoints"))] match t { FormatType::Date => dt.format("%Y%m%d"), FormatType::Hour => dt.format("%H"), diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index c6e928b8201..751f41ee587 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -202,7 +202,6 @@ fn scan_executor_loop( canceled: Arc, ) { while let Ok(cmd) = cmds.recv() { - #[cfg(feature = "failpoints")] fail::fail_point!("execute_scan_command"); debug!("handling initial scan request"; "region_id" => %cmd.region.get_id()); metrics::PENDING_INITIAL_SCAN_LEN @@ -393,7 +392,6 @@ where info!("backup stream: on_modify_observe"; "op" => ?op); match op { ObserveOp::Start { region } => { - #[cfg(feature = "failpoints")] fail::fail_point!("delay_on_start_observe"); self.start_observe(region).await; metrics::INITIAL_SCAN_REASON @@ -522,7 +520,6 @@ where } Some(for_task) => { - #[cfg(feature = "failpoints")] fail::fail_point!("try_start_observe", |_| { Err(Error::Other(box_err!("Nature is boring"))) }); @@ -604,7 +601,6 @@ where } async fn get_last_checkpoint_of(&self, task: &str, region: &Region) -> Result { - #[cfg(feature = "failpoints")] fail::fail_point!("get_last_checkpoint_of", |hint| Err(Error::Other( box_err!( "get_last_checkpoint_of({}, {:?}) failed because {:?}", @@ -666,8 +662,6 @@ mod test { use tikv::storage::Statistics; use super::InitialScan; - #[cfg(feature = "failpoints")] - use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; #[derive(Clone, Copy)] struct NoopInitialScan; @@ -687,27 +681,27 @@ mod test { } } - #[cfg(feature = "failpoints")] - fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { - let (tx, rx) = futures::channel::oneshot::channel(); - std::thread::spawn(move || { - f(); - tx.send(()).unwrap(); - }); - let pool = 
tokio::runtime::Builder::new_current_thread() - .enable_time() - .build() - .unwrap(); - let _e = pool.handle().enter(); - pool.block_on(tokio::time::timeout(d, rx)).unwrap().unwrap(); - } - #[test] #[cfg(feature = "failpoints")] fn test_message_delay_and_exit() { use std::time::Duration; use super::ScanCmd; + use crate::{subscription_manager::spawn_executors, utils::CallbackWaitGroup}; + + fn should_finish_in(f: impl FnOnce() + Send + 'static, d: std::time::Duration) { + let (tx, rx) = futures::channel::oneshot::channel(); + std::thread::spawn(move || { + f(); + tx.send(()).unwrap(); + }); + let pool = tokio::runtime::Builder::new_current_thread() + .enable_time() + .build() + .unwrap(); + let _e = pool.handle().enter(); + pool.block_on(tokio::time::timeout(d, rx)).unwrap().unwrap(); + } let pool = spawn_executors(NoopInitialScan, 1); let wg = CallbackWaitGroup::new(); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index f868b4bfc94..4d935ad4819 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -132,7 +132,7 @@ pub struct Batch { } impl Batch { - /// Create a a batch with given batch size. + /// Creates a batch with given batch size. pub fn with_capacity(cap: usize) -> Batch { Batch { normals: Vec::with_capacity(cap), @@ -163,15 +163,16 @@ impl Batch { self.control.take(); } - /// Put back the FSM located at index. + /// Releases the ownership of `fsm` so that it can be scheduled in another + /// poller. /// - /// Only when channel length is larger than `checked_len` will trigger - /// further notification. This function may fail if channel length is - /// larger than the given value before FSM is released. - fn release(&mut self, mut fsm: NormalFsm, checked_len: usize) -> Option> { + /// When pending messages of the FSM is different than `expected_len`, + /// attempts to schedule it in this poller again. Returns the `fsm` if the + /// re-scheduling suceeds. 
+ fn release(&mut self, mut fsm: NormalFsm, expected_len: usize) -> Option> { let mailbox = fsm.take_mailbox().unwrap(); mailbox.release(fsm.fsm); - if mailbox.len() == checked_len { + if mailbox.len() == expected_len { None } else { match mailbox.take_fsm() { @@ -186,7 +187,7 @@ impl Batch { } } - /// Remove the normal FSM located at `index`. + /// Removes the normal FSM. /// /// This method should only be called when the FSM is stopped. /// If there are still messages in channel, the FSM is untouched and @@ -204,18 +205,11 @@ impl Batch { } } - /// Schedule the normal FSM located at `index`. - /// - /// If `inplace`, the relative position of all fsm will not be changed; - /// otherwise, the fsm will be popped and the last fsm will be swap in - /// to reduce memory copy. - pub fn schedule(&mut self, router: &BatchRouter, index: usize, inplace: bool) { + /// Schedules the normal FSM located at `index`. + pub fn schedule(&mut self, router: &BatchRouter, index: usize) { let to_schedule = match self.normals[index].take() { Some(f) => f, None => { - if !inplace { - self.normals.swap_remove(index); - } return; } }; @@ -232,12 +226,19 @@ impl Batch { // failed to reschedule f.policy.take(); self.normals[index] = res; - } else if !inplace { + } + } + + /// Reclaims the slot storage if there is no FSM located at `index`. It will + /// alter the positions of some other FSMs with index larger than `index`. + #[inline] + pub fn swap_reclaim(&mut self, index: usize) { + if self.normals[index].is_none() { self.normals.swap_remove(index); } } - /// Same as `release`, but working on control FSM. + /// Same as [`release`], but works with control FSM. pub fn release_control(&mut self, control_box: &BasicMailbox, checked_len: usize) -> bool { let s = self.control.take().unwrap(); control_box.release(s); @@ -254,7 +255,7 @@ impl Batch { } } - /// Same as `remove`, but working on control FSM. + /// Same as [`remove`], but works with control FSM. 
pub fn remove_control(&mut self, control_box: &BasicMailbox) { if control_box.is_empty() { let s = self.control.take().unwrap(); @@ -265,14 +266,14 @@ impl Batch { /// The result for `PollHandler::handle_control`. pub enum HandleResult { - /// The Fsm still needs to be processed. + /// The FSM still needs to be handled in the next run. KeepProcessing, - /// The Fsm should stop at the progress. + /// The FSM should stop at the progress. StopAt { - /// The count of messages that have been acknowledged by handler. The - /// fsm should be released until new messages arrive. + /// The amount of messages acknowledged by the handler. The FSM + /// should be released unless new messages arrive. progress: usize, - /// Whether the fsm should be released before `end`. + /// Whether the FSM should be passed in to `end` call. skip_end: bool, }, } @@ -284,9 +285,10 @@ impl HandleResult { } } -/// A handler that poll all FSM in ready. +/// A handler that polls all FSMs in ready. +/// +/// A general process works like the following: /// -/// A General process works like following: /// ```text /// loop { /// begin @@ -294,34 +296,34 @@ impl HandleResult { /// handle_control /// foreach ready normal: /// handle_normal +/// light_end /// end /// } /// ``` /// -/// Note that, every poll thread has its own handler, which doesn't have to be -/// Sync. +/// A [`PollHandler`] doesn't have to be [`Sync`] because each poll thread has +/// its own handler. pub trait PollHandler: Send + 'static { /// This function is called at the very beginning of every round. fn begin(&mut self, _batch_size: usize, update_cfg: F) where for<'a> F: FnOnce(&'a Config); - /// This function is called when handling readiness for control FSM. + /// This function is called when the control FSM is ready. /// - /// If returned value is Some, then it represents a length of channel. This - /// function will only be called for the same fsm after channel's length is - /// larger than the value. 
If it returns None, then this function will - /// still be called for the same FSM in the next loop unless the FSM is - /// stopped. + /// If `Some(len)` is returned, this function will not be called again until + /// there are more than `len` pending messages in `control` FSM. + /// + /// If `None` is returned, this function will be called again with the same + /// FSM `control` in the next round, unless it is stopped. fn handle_control(&mut self, control: &mut C) -> Option; - /// This function is called when handling readiness for normal FSM. - /// - /// The returned value is handled in the same way as `handle_control`. + /// This function is called when some normal FSMs are ready. fn handle_normal(&mut self, normal: &mut impl DerefMut) -> HandleResult; - /// This function is called after `handle_normal` is called for all fsm and - /// before calling `end`. The function is expected to run lightweight work. + /// This function is called after [`handle_normal`] is called for all FSMs + /// and before calling [`end`]. The function is expected to run lightweight + /// works. fn light_end(&mut self, _batch: &mut [Option>]) {} /// This function is called at the end of every round. @@ -383,7 +385,8 @@ impl> Poller { !batch.is_empty() } - // Poll for readiness and forward to handler. Remove stale peer if necessary. + /// Polls for readiness and forwards them to handler. Removes stale peers if + /// necessary. pub fn poll(&mut self) { fail_point!("poll"); let mut batch = Batch::with_capacity(self.max_batch_size); @@ -391,7 +394,7 @@ impl> Poller { let mut to_skip_end = Vec::with_capacity(self.max_batch_size); // Fetch batch after every round is finished. It's helpful to protect regions - // from becoming hungry if some regions are hot points. Since we fetch new fsm + // from becoming hungry if some regions are hot points. Since we fetch new FSM // every time calling `poll`, we do not need to configure a large value for // `self.max_batch_size`. 
let mut run = true; @@ -400,7 +403,7 @@ impl> Poller { // overhead max size of batch. It's helpful to protect regions from becoming // hungry if some regions are hot points. let mut max_batch_size = std::cmp::max(self.max_batch_size, batch.normals.len()); - // update some online config if needed. + // Update some online config if needed. { // TODO: rust 2018 does not support capture disjoint field within a closure. // See https://github.com/rust-lang/rust/issues/53488 for more details. @@ -457,9 +460,11 @@ impl> Poller { if let Ok(fsm) = self.fsm_receiver.try_recv() { run = batch.push(fsm); } - // If we receive a ControlFsm, break this cycle and call `end`. Because - // ControlFsm may change state of the handler, we shall deal with it immediately - // after calling `begin` of `Handler`. + // When `fsm_cnt >= batch.normals.len()`: + // - No more FSMs in `fsm_receiver`. + // - We receive a control FSM. Break the loop because ControlFsm may change + // state of the handler, we shall deal with it immediately after calling + // `begin` of `Handler`. if !run || fsm_cnt >= batch.normals.len() { break; } @@ -478,17 +483,19 @@ impl> Poller { fsm_cnt += 1; } self.handler.light_end(&mut batch.normals); - for offset in &to_skip_end { - batch.schedule(&self.router, *offset, true); + for index in &to_skip_end { + batch.schedule(&self.router, *index); } to_skip_end.clear(); self.handler.end(&mut batch.normals); - // Because release use `swap_remove` internally, so using pop here - // to remove the correct FSM. - while let Some(r) = reschedule_fsms.pop() { - batch.schedule(&self.router, r, false); + // Iterate larger index first, so that `swap_reclaim` won't affect other FSMs + // in the list. 
+ for index in reschedule_fsms.iter().rev() { + batch.schedule(&self.router, *index); + batch.swap_reclaim(*index); } + reschedule_fsms.clear(); } if let Some(fsm) = batch.control.take() { self.router.control_scheduler.schedule(fsm); @@ -521,9 +528,9 @@ pub trait HandlerBuilder { /// A system that can poll FSMs concurrently and in batch. /// -/// To use the system, two type of FSMs and their PollHandlers need -/// to be defined: Normal and Control. Normal FSM handles the general -/// task while Control FSM creates normal FSM instances. +/// To use the system, two type of FSMs and their PollHandlers need to be +/// defined: Normal and Control. Normal FSM handles the general task while +/// Control FSM creates normal FSM instances. pub struct BatchSystem { name_prefix: Option, router: BatchRouter, @@ -694,7 +701,8 @@ pub type BatchRouter = Router, ControlSchedule /// Create a batch system with the given thread name prefix and pool size. /// -/// `sender` and `controller` should be paired. +/// `sender` and `controller` should be paired: all messages sent on the +/// `sender` will become available to the `controller`. pub fn create_system( cfg: &Config, sender: mpsc::LooseBoundedSender, diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 6fb4fe91539..09e32333c96 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -12,44 +12,37 @@ use std::{ use crate::mailbox::BasicMailbox; -// The FSM is notified. -const NOTIFYSTATE_NOTIFIED: usize = 0; -// The FSM is idle. -const NOTIFYSTATE_IDLE: usize = 1; -// The FSM is expected to be dropped. -const NOTIFYSTATE_DROP: usize = 2; - #[derive(Clone, Copy, Debug, PartialEq)] pub enum Priority { Low, Normal, } -/// `FsmScheduler` schedules `Fsm` for later handles. +/// `FsmScheduler` schedules `Fsm` for later handling. pub trait FsmScheduler { type Fsm: Fsm; - /// Schedule a Fsm for later handles. + /// Schedule a Fsm for later handling. 
fn schedule(&self, fsm: Box); /// Shutdown the scheduler, which indicates that resources like /// background thread pool should be released. fn shutdown(&self); } -/// A Fsm is a finite state machine. It should be able to be notified for +/// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. pub trait Fsm { type Message: Send; fn is_stopped(&self) -> bool; - /// Set a mailbox to Fsm, which should be used to send message to itself. + /// Set a mailbox to FSM, which should be used to send message to itself. fn set_mailbox(&mut self, _mailbox: Cow<'_, BasicMailbox>) where Self: Sized, { } - /// Take the mailbox from Fsm. Implementation should ensure there will be + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> where @@ -63,17 +56,30 @@ pub trait Fsm { } } +/// A holder of FSM. +/// +/// There are three possible states: +/// +/// 1. NOTIFYSTATE_NOTIFIED: The FSM is taken by an external executor. `data` +/// holds a null pointer. +/// 2. NOTIFYSTATE_IDLE: No actor is using the FSM. `data` owns the FSM. +/// 3. NOTIFYSTATE_DROP: The FSM is dropped. `data` holds a null pointer. pub struct FsmState { status: AtomicUsize, data: AtomicPtr, + /// A counter shared with other `FsmState`s. state_cnt: Arc, } impl FsmState { + const NOTIFYSTATE_NOTIFIED: usize = 0; + const NOTIFYSTATE_IDLE: usize = 1; + const NOTIFYSTATE_DROP: usize = 2; + pub fn new(data: Box, state_cnt: Arc) -> FsmState { state_cnt.fetch_add(1, Ordering::Relaxed); FsmState { - status: AtomicUsize::new(NOTIFYSTATE_IDLE), + status: AtomicUsize::new(Self::NOTIFYSTATE_IDLE), data: AtomicPtr::new(Box::into_raw(data)), state_cnt, } @@ -82,8 +88,8 @@ impl FsmState { /// Take the fsm if it's IDLE. 
pub fn take_fsm(&self) -> Option> { let res = self.status.compare_exchange( - NOTIFYSTATE_IDLE, - NOTIFYSTATE_NOTIFIED, + Self::NOTIFYSTATE_IDLE, + Self::NOTIFYSTATE_NOTIFIED, Ordering::AcqRel, Ordering::Acquire, ); @@ -99,7 +105,7 @@ impl FsmState { } } - /// Notify fsm via a `FsmScheduler`. + /// Notifies FSM via a `FsmScheduler`. #[inline] pub fn notify>( &self, @@ -115,25 +121,25 @@ impl FsmState { } } - /// Put the owner back to the state. + /// Releases the FSM ownership back to this state. /// /// It's not required that all messages should be consumed before - /// releasing a fsm. However, a fsm is guaranteed to be notified only + /// releasing a FSM. However, a FSM is guaranteed to be notified only /// when new messages arrives after it's released. #[inline] pub fn release(&self, fsm: Box) { let previous = self.data.swap(Box::into_raw(fsm), Ordering::AcqRel); - let mut previous_status = NOTIFYSTATE_NOTIFIED; + let mut previous_status = Self::NOTIFYSTATE_NOTIFIED; if previous.is_null() { let res = self.status.compare_exchange( - NOTIFYSTATE_NOTIFIED, - NOTIFYSTATE_IDLE, + Self::NOTIFYSTATE_NOTIFIED, + Self::NOTIFYSTATE_IDLE, Ordering::AcqRel, Ordering::Acquire, ); previous_status = match res { Ok(_) => return, - Err(NOTIFYSTATE_DROP) => { + Err(Self::NOTIFYSTATE_DROP) => { let ptr = self.data.swap(ptr::null_mut(), Ordering::AcqRel); unsafe { Box::from_raw(ptr) }; return; @@ -144,11 +150,11 @@ impl FsmState { panic!("invalid release state: {:?} {}", previous, previous_status); } - /// Clear the fsm. + /// Clears the FSM. 
#[inline] pub fn clear(&self) { - match self.status.swap(NOTIFYSTATE_DROP, Ordering::AcqRel) { - NOTIFYSTATE_NOTIFIED | NOTIFYSTATE_DROP => return, + match self.status.swap(Self::NOTIFYSTATE_DROP, Ordering::AcqRel) { + Self::NOTIFYSTATE_NOTIFIED | Self::NOTIFYSTATE_DROP => return, _ => {} } diff --git a/components/batch-system/src/mailbox.rs b/components/batch-system/src/mailbox.rs index 219edb2e2af..5afddf73c14 100644 --- a/components/batch-system/src/mailbox.rs +++ b/components/batch-system/src/mailbox.rs @@ -13,12 +13,21 @@ use crate::fsm::{Fsm, FsmScheduler, FsmState}; /// A basic mailbox. /// -/// Every mailbox should have one and only one owner, who will receive all -/// messages sent to this mailbox. +/// A mailbox holds an FSM owner, and the sending end of a channel to send +/// messages to that owner. Multiple producers share the same mailbox to +/// communicate with a FSM. /// -/// When a message is sent to a mailbox, its owner will be checked whether it's -/// idle. An idle owner will be scheduled via `FsmScheduler` immediately, which -/// will drive the fsm to poll for messages. +/// The mailbox's FSM owner needs to be scheduled to a [`Poller`] to handle its +/// pending messages. Therefore, the producer of messages also needs to provide +/// a channel to a poller ([`FsmScheduler`]), so that the mailbox can schedule +/// its FSM owner. When a message is sent to a mailbox, the mailbox will check +/// whether its FSM owner is idle, i.e. not already taken and scheduled. If the +/// FSM is idle, it will be scheduled immediately. By doing so, the mailbox +/// temporarily transfers its ownership of the FSM to the poller. The +/// implementation must make sure the same FSM is returned afterwards via the +/// [`release`] method. +/// +/// [`Poller`]: crate::batch::Poller pub struct BasicMailbox { sender: mpsc::LooseBoundedSender, state: Arc>, @@ -103,7 +112,7 @@ impl Clone for BasicMailbox { } } -/// A more high level mailbox. 
+/// A more high level mailbox that is paired with a [`FsmScheduler`]. pub struct Mailbox where Owner: Fsm, diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 9975d66dfdc..8b0936a9faa 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -39,17 +39,20 @@ enum CheckDoResult { Valid(T), } -/// Router route messages to its target mailbox. -/// -/// Every fsm has a mailbox, hence it's necessary to have an address book -/// that can deliver messages to specified fsm, which is exact router. +/// Router routes messages to its target FSM's mailbox. /// /// In our abstract model, every batch system has two different kind of -/// fsms. First is normal fsm, which does the common work like peers in a -/// raftstore model or apply delegate in apply model. Second is control fsm, +/// FSMs. First is normal FSM, which does the common work like peers in a +/// raftstore model or apply delegate in apply model. Second is control FSM, /// which does some work that requires a global view of resources or creates -/// missing fsm for specified address. Normal fsm and control fsm can have -/// different scheduler, but this is not required. +/// missing FSM for specified address. +/// +/// There are one control FSM and multiple normal FSMs in a system. Each FSM +/// has its own mailbox. We maintain an address book to deliver messages to the +/// specified normal FSM. +/// +/// Normal FSM and control FSM can have different scheduler, but this is not +/// required. pub struct Router { normals: Arc>>, caches: Cell>>, @@ -60,8 +63,9 @@ pub struct Router { pub(crate) normal_scheduler: Ns, pub(crate) control_scheduler: Cs, - // Count of Mailboxes that is not destroyed. - // Added when a Mailbox created, and subtracted it when a Mailbox destroyed. + // Number of active mailboxes. + // Added when a mailbox is created, and subtracted it when a mailbox is + // destroyed. 
state_cnt: Arc, // Indicates the router is shutdown down or not. shutdown: Arc, @@ -198,7 +202,7 @@ where } } - /// Get the mailbox of control fsm. + /// Get the mailbox of control FSM. pub fn control_mailbox(&self) -> Mailbox { Mailbox::new(self.control_box.clone(), self.control_scheduler.clone()) } @@ -269,7 +273,7 @@ where } } - /// Force sending message to control fsm. + /// Force sending message to control FSM. #[inline] pub fn send_control(&self, msg: C::Message) -> Result<(), TrySendError> { match self.control_box.try_send(msg, &self.control_scheduler) { @@ -284,7 +288,7 @@ where } } - /// Try to notify all normal fsm a message. + /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { let mailboxes = self.normals.lock().unwrap(); for mailbox in mailboxes.map.values() { @@ -292,7 +296,7 @@ where } } - /// Try to notify all fsm that the cluster is being shutdown. + /// Try to notify all FSMs that the cluster is being shutdown. pub fn broadcast_shutdown(&self) { info!("broadcasting shutdown"); self.shutdown.store(true, Ordering::SeqCst); diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 25499d89c61..991ae154427 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -474,7 +474,6 @@ impl<'client> S3Uploader<'client> { sleep(delay_duration).await; } - #[cfg(feature = "failpoints")] fail_point!("s3_put_obj_err", |_| { Err(RusotoError::ParseError("failed to put object".to_owned())) }); diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index ee063fc15dd..9c1f60ba947 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -26,9 +26,9 @@ use crate::{ Error, PeerMsg, PeerTick, Result, StoreMsg, }; -/// A per-thread context used for handling raft messages. +/// A per-thread context shared by the [`StoreFsm`] and multiple [`PeerFsm`]s. 
pub struct StoreContext { - /// A logger without any KV. It's clean for creating new PeerFSM. + /// A logger without any KV. It's clean for creating new PeerFsm. pub logger: Logger, /// The transport for sending messages to peers on other stores. pub trans: T, @@ -53,12 +53,20 @@ impl StoreContext { } } -/// Poller for polling raft state machines. +/// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. +/// +/// It is responsible for: +/// +/// - Keeping the local [`StoreContext`] up-to-date. +/// - Receiving and sending messages in and out of these FSMs. struct StorePoller { - store_msg_buf: Vec, - peer_msg_buf: Vec>, poll_ctx: StoreContext, cfg_tracker: Tracker, + /// Buffers to hold in-coming messages. + store_msg_buf: Vec, + peer_msg_buf: Vec>, + /// These fields controls the timing of flushing messages generated by + /// FSMs. last_flush_time: TiInstant, need_flush_events: bool, } @@ -66,10 +74,10 @@ struct StorePoller { impl StorePoller { pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { Self { - store_msg_buf: Vec::new(), - peer_msg_buf: Vec::new(), poll_ctx, cfg_tracker, + store_msg_buf: Vec::new(), + peer_msg_buf: Vec::new(), last_flush_time: TiInstant::now(), need_flush_events: false, } @@ -106,8 +114,8 @@ impl PollHandler F: FnOnce(&'a batch_system::Config), { - let cfg = self.cfg_tracker.any_new().map(|c| c.clone()); - if let Some(cfg) = cfg { + // Apply configuration changes. 
+ if let Some(cfg) = self.cfg_tracker.any_new().map(|c| c.clone()) { let last_messages_per_tick = self.messages_per_tick(); self.poll_ctx.cfg = cfg; if self.poll_ctx.cfg.messages_per_tick != last_messages_per_tick { @@ -117,31 +125,28 @@ impl PollHandler Option { + fn handle_control(&mut self, fsm: &mut StoreFsm) -> Option { debug_assert!(self.store_msg_buf.is_empty()); - let received_cnt = store.recv(&mut self.store_msg_buf); + let received_cnt = fsm.recv(&mut self.store_msg_buf); let expected_msg_count = if received_cnt == self.messages_per_tick() { None } else { Some(0) }; - let mut delegate = StoreFsmDelegate::new(store, &mut self.poll_ctx); + let mut delegate = StoreFsmDelegate::new(fsm, &mut self.poll_ctx); delegate.handle_msgs(&mut self.store_msg_buf); expected_msg_count } - fn handle_normal( - &mut self, - peer: &mut impl DerefMut>, - ) -> HandleResult { + fn handle_normal(&mut self, fsm: &mut impl DerefMut>) -> HandleResult { debug_assert!(self.peer_msg_buf.is_empty()); - let received_cnt = peer.recv(&mut self.peer_msg_buf); + let received_cnt = fsm.recv(&mut self.peer_msg_buf); let handle_result = if received_cnt == self.messages_per_tick() { HandleResult::KeepProcessing } else { HandleResult::stop_at(0, false) }; - let mut delegate = PeerFsmDelegate::new(peer, &mut self.poll_ctx); + let mut delegate = PeerFsmDelegate::new(fsm, &mut self.poll_ctx); delegate.handle_msgs(&mut self.peer_msg_buf); handle_result } @@ -204,7 +209,7 @@ impl StorePollerBuilder { } } - /// Initializes all the existing raft machines and cleanup stale tablets. + /// Initializes all the existing raft machines and cleans up stale tablets. fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); @@ -262,7 +267,7 @@ where } } -/// The system used for poll raft activities. +/// The system used for polling Raft activities. 
pub struct StoreSystem { system: BatchSystem, StoreFsm>, apply_router: ApplyRouter, diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 88d7b479e49..a8fb67aa121 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -76,7 +76,7 @@ impl Fsm for PeerFsm { self.is_stopped } - /// Set a mailbox to Fsm, which should be used to send message to itself. + /// Set a mailbox to FSM, which should be used to send message to itself. fn set_mailbox(&mut self, mailbox: Cow<'_, BasicMailbox>) where Self: Sized, @@ -84,7 +84,7 @@ impl Fsm for PeerFsm { self.mailbox = Some(mailbox.into_owned()); } - /// Take the mailbox from Fsm. Implementation should ensure there will be + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> where diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index e2db05db143..0d97137bab1 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3594,19 +3594,21 @@ where } } - #[allow(unused_mut, clippy::redundant_closure_call)] fn handle_snapshot(&mut self, apply_ctx: &mut ApplyContext, snap_task: GenSnapTask) { if self.delegate.pending_remove || self.delegate.stopped { return; } let applied_index = self.delegate.apply_state.get_applied_index(); - let mut need_sync = apply_ctx + let need_sync = apply_ctx .apply_res .iter() .any(|res| res.region_id == self.delegate.region_id()) && self.delegate.last_flush_applied_index != applied_index; - (|| fail_point!("apply_on_handle_snapshot_sync", |_| { need_sync = true }))(); - if need_sync { + let force_sync_fp = || { + fail_point!("apply_on_handle_snapshot_sync", |_| true); + false + }; + if need_sync || force_sync_fp() { if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); } diff 
--git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 3ae6b74a13c..e4707947fbb 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -572,7 +572,7 @@ where self.stopped } - /// Set a mailbox to Fsm, which should be used to send message to itself. + /// Set a mailbox to FSM, which should be used to send message to itself. #[inline] fn set_mailbox(&mut self, mailbox: Cow<'_, BasicMailbox>) where @@ -581,7 +581,7 @@ where self.mailbox = Some(mailbox.into_owned()); } - /// Take the mailbox from Fsm. Implementation should ensure there will be + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. #[inline] fn take_mailbox(&mut self) -> Option> @@ -631,6 +631,7 @@ where .propose .request_wait_time .observe(duration_to_sec(cmd.send_time.saturating_elapsed()) as f64); + if let Some(Err(e)) = cmd.extra_opts.deadline.map(|deadline| deadline.check()) { cmd.callback.invoke_with_response(new_error(e.into())); continue; @@ -648,14 +649,14 @@ where { self.fsm.batch_req_builder.add(cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { - self.propose_batch_raft_command(true); + self.propose_pending_batch_raft_command(); } } else { self.propose_raft_command( cmd.request, cmd.callback, cmd.extra_opts.disk_full_opt, - ) + ); } } PeerMsg::Tick(tick) => self.on_tick(tick), @@ -688,53 +689,57 @@ where } } } + self.on_loop_finished(); + } + + #[inline] + fn on_loop_finished(&mut self) { + let ready_concurrency = self.ctx.cfg.cmd_batch_concurrent_ready_max_count; + let should_propose = self.ctx.sync_write_worker.is_some() + || ready_concurrency == 0 + || self.fsm.peer.unpersisted_ready_len() < ready_concurrency; + let force_delay_fp = || { + fail_point!( + "force_delay_propose_batch_raft_command", + self.ctx.sync_write_worker.is_none(), + |_| true + ); + false + }; // Propose batch request 
which may be still waiting for more raft-command - if self.ctx.sync_write_worker.is_some() { - self.propose_batch_raft_command(true); - } else { - self.propose_batch_raft_command(false); - self.check_batch_cmd_and_proposed_cb(); + if should_propose && !force_delay_fp() { + self.propose_pending_batch_raft_command(); + } else if self.fsm.batch_req_builder.has_proposed_cb + && self.fsm.batch_req_builder.propose_checked.is_none() + && let Some(cmd) = self.fsm.batch_req_builder.request.take() + { + // We are delaying these requests to next loop. Try to fulfill their + // proposed callback early. + self.fsm.batch_req_builder.propose_checked = Some(false); + if let Ok(None) = self.pre_propose_raft_command(&cmd) { + if self.fsm.peer.will_likely_propose(&cmd) { + self.fsm.batch_req_builder.propose_checked = Some(true); + for cb in &mut self.fsm.batch_req_builder.callbacks { + cb.invoke_proposed(); + } + } + } + self.fsm.batch_req_builder.request = Some(cmd); } } - fn propose_batch_raft_command(&mut self, force: bool) { + /// Flushes all pending raft commands for immediate execution. 
+ #[inline] + fn propose_pending_batch_raft_command(&mut self) { if self.fsm.batch_req_builder.request.is_none() { return; } - if !force - && self.ctx.cfg.cmd_batch_concurrent_ready_max_count != 0 - && self.fsm.peer.unpersisted_ready_len() - >= self.ctx.cfg.cmd_batch_concurrent_ready_max_count - { - return; - } - fail_point!("propose_batch_raft_command", !force, |_| {}); let (request, callback) = self .fsm .batch_req_builder .build(&mut self.ctx.raft_metrics) .unwrap(); - self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull) - } - - fn check_batch_cmd_and_proposed_cb(&mut self) { - if self.fsm.batch_req_builder.request.is_none() - || !self.fsm.batch_req_builder.has_proposed_cb - || self.fsm.batch_req_builder.propose_checked.is_some() - { - return; - } - let cmd = self.fsm.batch_req_builder.request.take().unwrap(); - self.fsm.batch_req_builder.propose_checked = Some(false); - if let Ok(None) = self.pre_propose_raft_command(&cmd) { - if self.fsm.peer.will_likely_propose(&cmd) { - self.fsm.batch_req_builder.propose_checked = Some(true); - for cb in &mut self.fsm.batch_req_builder.callbacks { - cb.invoke_proposed(); - } - } - } - self.fsm.batch_req_builder.request = Some(cmd); + self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull); } fn on_update_replication_mode(&mut self) { @@ -3016,9 +3021,7 @@ where ); } None => { - if self.fsm.batch_req_builder.request.is_some() { - self.propose_batch_raft_command(true); - } + self.propose_pending_batch_raft_command(); if self.propose_locks_before_transfer_leader(msg) { // If some pessimistic locks are just proposed, we propose another // TransferLeader command instead of transferring leader immediately. @@ -4796,20 +4799,17 @@ where } } - /// Propose batched raft commands(if any) first, then propose the given raft - /// command. + /// Proposes pending batch raft commands (if any), then proposes the + /// provided raft command. 
+ #[inline] fn propose_raft_command( &mut self, msg: RaftCmdRequest, cb: Callback, diskfullopt: DiskFullOpt, ) { - if let Some((request, callback)) = - self.fsm.batch_req_builder.build(&mut self.ctx.raft_metrics) - { - self.propose_raft_command_internal(request, callback, DiskFullOpt::NotAllowedOnFull); - } - + // Propose pending commands before processing new one. + self.propose_pending_batch_raft_command(); self.propose_raft_command_internal(msg, cb, diskfullopt); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 99287ca493c..17fe22926d1 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -3385,16 +3385,16 @@ where true } - /// Propose a request. + /// Proposes a request. /// - /// Return true means the request has been proposed successfully. + /// Return whether the request has been proposed successfully. pub fn propose( &mut self, ctx: &mut PollContext, mut cb: Callback, req: RaftCmdRequest, mut err_resp: RaftCmdResponse, - disk_full_opt: DiskFullOpt, + mut disk_full_opt: DiskFullOpt, ) -> bool { if self.pending_remove { return false; @@ -3421,53 +3421,11 @@ where } Ok(RequestPolicy::ProposeNormal) => { // For admin cmds, only region split/merge comes here. - let mut stores = Vec::new(); - let mut opt = disk_full_opt; - let mut maybe_transfer_leader = false; if req.has_admin_request() { - opt = DiskFullOpt::AllowedOnAlmostFull; - } - if self.check_proposal_normal_with_disk_usage( - ctx, - opt, - &mut stores, - &mut maybe_transfer_leader, - ) { - self.propose_normal(ctx, req) - } else { - // If leader node is disk full, try to transfer leader to a node with disk usage - // normal to keep write availability not downback. - // if majority node is disk full, to transfer leader or not is not necessary. - // Note: Need to exclude learner node. 
- if maybe_transfer_leader && !self.disk_full_peers.majority { - let target_peer = self - .get_store() - .region() - .get_peers() - .iter() - .find(|x| { - !self.disk_full_peers.has(x.get_id()) - && x.get_id() != self.peer.get_id() - && !self.down_peer_ids.contains(&x.get_id()) - && !matches!(x.get_role(), PeerRole::Learner) - }) - .cloned(); - if let Some(p) = target_peer { - debug!( - "try to transfer leader because of current leader disk full: region id = {}, peer id = {}; target peer id = {}", - self.region_id, - self.peer.get_id(), - p.get_id() - ); - self.pre_transfer_leader(&p); - } - } - let errmsg = format!( - "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", - disk_full_opt, ctx.self_disk_usage - ); - Err(Error::DiskFull(stores, errmsg)) + disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; } + self.check_normal_proposal_with_disk_full_opt(ctx, disk_full_opt) + .and_then(|_| self.propose_normal(ctx, req)) } Ok(RequestPolicy::ProposeConfChange) => self.propose_conf_change(ctx, &req), Err(e) => Err(e), @@ -4837,56 +4795,74 @@ where // Check disk usages for the peer itself and other peers in the raft group. // The return value indicates whether the proposal is allowed or not. - fn check_proposal_normal_with_disk_usage( + fn check_normal_proposal_with_disk_full_opt( &mut self, ctx: &mut PollContext, disk_full_opt: DiskFullOpt, - disk_full_stores: &mut Vec, - maybe_transfer_leader: &mut bool, - ) -> bool { - // check self disk status. - let allowed = match ctx.self_disk_usage { + ) -> Result<()> { + let leader_allowed = match ctx.self_disk_usage { DiskUsage::Normal => true, DiskUsage::AlmostFull => !matches!(disk_full_opt, DiskFullOpt::NotAllowedOnFull), DiskUsage::AlreadyFull => false, }; - - if !allowed { + let mut disk_full_stores = Vec::new(); + if !leader_allowed { disk_full_stores.push(ctx.store.id); - *maybe_transfer_leader = true; - return false; - } - - // If all followers diskusage normal, then allowed. 
- if self.disk_full_peers.is_empty() { - return true; - } - - for peer in self.get_store().region().get_peers() { - let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); - if self.disk_full_peers.peers.get(&peer_id).is_some() { - disk_full_stores.push(store_id); + // Try to transfer leader to a node with disk usage normal to maintain write + // availability. If majority node is disk full, to transfer leader or not is not + // necessary. Note: Need to exclude learner node. + if !self.disk_full_peers.majority { + let target_peer = self + .get_store() + .region() + .get_peers() + .iter() + .find(|x| { + !self.disk_full_peers.has(x.get_id()) + && x.get_id() != self.peer.get_id() + && !self.down_peer_ids.contains(&x.get_id()) + && !matches!(x.get_role(), PeerRole::Learner) + }) + .cloned(); + if let Some(p) = target_peer { + debug!( + "try to transfer leader because of current leader disk full"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + "target_peer_id" => p.get_id(), + ); + self.pre_transfer_leader(&p); + } + } + } else { + // Check followers. + if self.disk_full_peers.is_empty() { + return Ok(()); + } + if !self.dangerous_majority_set { + if !self.disk_full_peers.majority { + return Ok(()); + } + // Majority peers are in disk full status but the request carries a special + // flag. + if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) + && self.disk_full_peers.peers.values().any(|x| x.1) + { + return Ok(()); + } + } + for peer in self.get_store().region().get_peers() { + let (peer_id, store_id) = (peer.get_id(), peer.get_store_id()); + if self.disk_full_peers.peers.get(&peer_id).is_some() { + disk_full_stores.push(store_id); + } } } - - // if there are some peers with disk already full status in the majority set, - // should not allowed. 
- if self.dangerous_majority_set { - return false; - } - - if !self.disk_full_peers.majority { - return true; - } - - if matches!(disk_full_opt, DiskFullOpt::AllowedOnAlmostFull) - && self.disk_full_peers.peers.values().any(|x| x.1) - { - // Majority peers are in disk full status but the request carries a special - // flag. - return true; - } - false + let errmsg = format!( + "propose failed: tikv disk full, cmd diskFullOpt={:?}, leader diskUsage={:?}", + disk_full_opt, ctx.self_disk_usage + ); + Err(Error::DiskFull(disk_full_stores, errmsg)) } /// Check if the command will be likely to pass all the check and propose. @@ -5322,6 +5298,7 @@ where self.raft_group.raft.r.max_msg_size = ctx.cfg.raft_max_size_per_msg.0; } + #[inline] fn maybe_inject_propose_error( &self, #[allow(unused_variables)] req: &RaftCmdRequest, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 97e8ee85d86..4ac03e2578b 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -17,7 +17,6 @@ use std::{ use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine}; -#[cfg(feature = "failpoints")] use fail::fail_point; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::{HealthService, ServingStatus}; @@ -439,7 +438,6 @@ const DEFAULT_LOAD_BASE_SPLIT_CHECK_INTERVAL: Duration = Duration::from_secs(1); const DEFAULT_COLLECT_TICK_INTERVAL: Duration = Duration::from_secs(1); fn default_collect_tick_interval() -> Duration { - #[cfg(feature = "failpoints")] fail_point!("mock_collect_tick_interval", |_| { Duration::from_millis(1) }); @@ -447,7 +445,6 @@ fn default_collect_tick_interval() -> Duration { } fn config(interval: Duration) -> Duration { - #[cfg(feature = "failpoints")] fail_point!("mock_min_resolved_ts_interval", |_| { Duration::from_millis(50) }); @@ -721,21 +718,18 @@ const HOTSPOT_REPORT_CAPACITY: usize 
= 1000; // TODO: support dynamic configure threshold in future. fn hotspot_key_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_KEY_RATE_THRESHOLD * 10 } fn hotspot_byte_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_BYTE_RATE_THRESHOLD * 10 } fn hotspot_query_num_report_threshold() -> u64 { - #[cfg(feature = "failpoints")] fail_point!("mock_hotspot_threshold", |_| { 0 }); HOTSPOT_QUERY_RATE_THRESHOLD * 10 diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 7c698905b72..fc984dd1a50 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -616,7 +616,6 @@ impl AutoSplitController { } fn is_grpc_poll_busy(&self, avg_grpc_thread_usage: f64) -> bool { - #[cfg(feature = "failpoints")] fail::fail_point!("mock_grpc_poll_is_not_busy", |_| { false }); if self.max_grpc_thread_count == 0 { return false; @@ -629,7 +628,6 @@ impl AutoSplitController { } fn is_unified_read_pool_busy(&self, unified_read_pool_thread_usage: f64) -> bool { - #[cfg(feature = "failpoints")] fail::fail_point!("mock_unified_read_pool_is_busy", |_| { true }); if self.max_unified_read_pool_thread_count == 0 { return false; @@ -644,7 +642,6 @@ impl AutoSplitController { } fn is_region_busy(&self, unified_read_pool_thread_usage: f64, region_cpu_usage: f64) -> bool { - #[cfg(feature = "failpoints")] fail::fail_point!("mock_region_is_busy", |_| { true }); if unified_read_pool_thread_usage <= 0.0 || !self.should_check_region_cpu() { return false; diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index bc0e8a59303..df1a18ab06d 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1012,7 +1012,6 @@ where self.last_hash.1 as usize }; - #[allow(unused_mut)] let mut 
transport_on_send_store_fp = || { fail_point!( "transport_on_send_snapshot", diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index 1068b35f8d5..9de8911754b 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -111,12 +111,12 @@ fn test_reject_proposal_during_region_split() { .unwrap_err(); // Try to put a key. - let propose_batch_raft_command_fp = "propose_batch_raft_command"; + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; let mut receivers = vec![]; for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. - fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k1"); let (cb, cb_receivers) = make_cb(&write_req); @@ -190,12 +190,12 @@ fn test_reject_proposal_during_region_merge() { .unwrap_err(); // Try to put a key on the source region. - let propose_batch_raft_command_fp = "propose_batch_raft_command"; + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; let mut receivers = vec![]; for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. - fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); let (cb, cb_receivers) = make_cb(&write_req); @@ -231,7 +231,7 @@ fn test_reject_proposal_during_region_merge() { for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. 
- fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); let (cb, cb_receivers) = make_cb(&write_req); @@ -248,7 +248,7 @@ fn test_reject_proposal_during_region_merge() { for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. - fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k"); let (cb, cb_receivers) = make_cb(&write_req); @@ -314,11 +314,11 @@ fn test_reject_proposal_during_rollback_region_merge() { // Write request is rejected because the source region is merging. // It's not handled by epoch checker now. - let propose_batch_raft_command_fp = "propose_batch_raft_command"; + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. - fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); let (cb, cb_receivers) = make_cb(&write_req); @@ -367,11 +367,11 @@ fn test_reject_proposal_during_leader_transfer() { sleep_ms(100); assert_ne!(cluster.leader_of_region(r).unwrap(), new_peer(2, 2)); - let propose_batch_raft_command_fp = "propose_batch_raft_command"; + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; for i in 0..2 { if i == 1 { // Test another path of calling proposed callback. 
- fail::cfg(propose_batch_raft_command_fp, "2*return").unwrap(); + fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k"); let (cb, cb_receivers) = make_cb(&write_req); @@ -485,8 +485,8 @@ fn test_propose_before_transfer_leader() { cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k", b"v"); - let propose_batch_raft_command_fp = "propose_batch_raft_command"; - fail::cfg(propose_batch_raft_command_fp, "return").unwrap(); + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; + fail::cfg(force_delay_propose_batch_raft_command_fp, "return").unwrap(); let write_req = make_write_req(&mut cluster, b"k1"); let (cb, cb_receivers) = make_cb(&write_req); @@ -514,8 +514,8 @@ fn test_propose_before_split_and_merge() { cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k", b"v"); - let propose_batch_raft_command_fp = "propose_batch_raft_command"; - fail::cfg(propose_batch_raft_command_fp, "return").unwrap(); + let force_delay_propose_batch_raft_command_fp = "force_delay_propose_batch_raft_command"; + fail::cfg(force_delay_propose_batch_raft_command_fp, "return").unwrap(); let write_req = make_write_req(&mut cluster, b"k1"); let (cb, cb_receivers) = make_cb(&write_req); From 2780bbaf812d36b59e4586f2c6ce5b9c4d5ec03f Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 12 Aug 2022 18:38:44 +0800 Subject: [PATCH 153/676] engine: upgrade raft-engine to support log recycling (#13231) * Update dependency on RaftEngine for supporting to open `Recycle Log Files` feature. Signed-off-by: Lucasliang * Supply extra implementations to the ENV::APIs in raft_log_engine. Signed-off-by: lucasliang * Refine the code-path in engine.rs. Signed-off-by: lucasliang * Update Cargo.toml. Signed-off-by: lucasliang * Bugfix for lacking atomicities in the processing of `rename` and `reuse`. 
This commit includes: * Fix the bug of locking atocmicity in the operations of `rename` and `reuse`; * Meanwhile, the related callings of `link_file` have been enhanced with safety. Signed-off-by: lucasliang * Refine the format of annoations in etc/config-template.toml. Signed-off-by: lucasliang * Make the annotations of `reuse` and configurations in RaftEngine more readable. Signed-off-by: lucasliang * Refine the annotation in `reuse`. Signed-off-by: lucasliang * Remove unnecessary restraints of raft-engine lib in cargo.toml. Signed-off-by: lucasliang Signed-off-by: Lucasliang Signed-off-by: lucasliang Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- Cargo.lock | 52 +++++++++++++++++++-- components/raft_log_engine/src/engine.rs | 42 +++++++++++++++++ components/raftstore/src/store/snap.rs | 12 +++-- components/sst_importer/src/import_file.rs | 22 +++++++-- components/sst_importer/src/sst_importer.rs | 10 +++- etc/config-template.toml | 17 +++++++ 6 files changed, 141 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 41a5df4c1ed..802a0e19487 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1770,7 +1770,7 @@ dependencies = [ "serde", "slog", "slog-global", - "strum", + "strum 0.20.0", "tempfile", "thread_local", "tikv_alloc", @@ -2465,6 +2465,12 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "if_chain" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb56e1aa765b4b4f3aadfab769793b7087bb03a4ea4920644a6d238e2df5b9ed" + [[package]] name = "indexmap" version = "1.6.2" @@ -4036,7 +4042,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#7a436eae40a6b62371123c96941e058b7fe52b63" +source = "git+https://github.com/tikv/raft-engine.git#6a6fe3bd2e0a1ca0b4fc643800ddc93abe74cd87" dependencies = [ "byteorder", "crc32fast", @@ -4045,6 +4051,7 @@ dependencies = [ "fs2", "hashbrown 0.12.0", "hex 0.4.2", + 
"if_chain", "lazy_static", "libc 0.2.125", "log", @@ -4061,13 +4068,15 @@ dependencies = [ "rhai", "scopeguard", "serde", + "serde_repr", + "strum 0.24.1", "thiserror", ] [[package]] name = "raft-engine-ctl" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#7a436eae40a6b62371123c96941e058b7fe52b63" +source = "git+https://github.com/tikv/raft-engine.git#6a6fe3bd2e0a1ca0b4fc643800ddc93abe74cd87" dependencies = [ "clap 3.1.6", "env_logger", @@ -5001,6 +5010,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_repr" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fe39d9fbb0ebf5eb2c7cb7e2a47e4f462fad1379f1166b8ae49ad9eae89a7ca" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_urlencoded" version = "0.7.0" @@ -5380,7 +5400,16 @@ version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7318c509b5ba57f18533982607f24070a55d353e90d4cae30c467cdb2ad5ac5c" dependencies = [ - "strum_macros", + "strum_macros 0.20.1", +] + +[[package]] +name = "strum" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros 0.24.2", ] [[package]] @@ -5395,6 +5424,19 @@ dependencies = [ "syn", ] +[[package]] +name = "strum_macros" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4faebde00e8ff94316c01800f9054fd2ba77d30d9e922541913051d1d978918b" +dependencies = [ + "heck 0.4.0", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "subtle" version = "2.3.0" @@ -6072,7 +6114,7 @@ dependencies = [ "slog", "slog-global", "sst_importer", - "strum", + "strum 0.20.0", "sysinfo", "tempfile", "test_sst_importer", diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 49183245785..dd7c222845c 
100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -194,6 +194,48 @@ impl FileSystem for ManagedFileSystem { self.base_file_system.delete(path) } + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: `rename` will reuse the old entryption info from `src_path`. + let src_str = src_path.as_ref().to_str().unwrap(); + let dst_str = dst_path.as_ref().to_str().unwrap(); + manager.link_file(src_str, dst_str)?; + let r = self + .base_file_system + .rename(src_path.as_ref(), dst_path.as_ref()); + let del_file = if r.is_ok() { src_str } else { dst_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'rename'"; "err" => ?e); + } + r + } else { + self.base_file_system.rename(src_path, dst_path) + } + } + + fn reuse>(&self, src_path: P, dst_path: P) -> IoResult<()> { + if let Some(ref manager) = self.key_manager { + // Note: In contrast to `rename`, `reuse` will make sure the encryption + // metadata is properly updated by rotating the encryption key for safety, + // when encryption flag is true. It won't rewrite the data blocks with + // the updated encryption metadata. Therefore, the old encrypted data + // won't be accessible after this calling. 
+ let src_str = src_path.as_ref().to_str().unwrap(); + let dst_str = dst_path.as_ref().to_str().unwrap(); + manager.new_file(dst_path.as_ref().to_str().unwrap())?; + let r = self + .base_file_system + .rename(src_path.as_ref(), dst_path.as_ref()); + let del_file = if r.is_ok() { src_str } else { dst_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'reuse'"; "err" => ?e); + } + r + } else { + self.base_file_system.rename(src_path, dst_path) + } + } + fn exists_metadata>(&self, path: P) -> bool { if let Some(ref manager) = self.key_manager { if let Ok(info) = manager.get_file(path.as_ref().to_str().unwrap()) { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 9a279029fd5..74cfd5ab0d6 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1771,8 +1771,6 @@ impl SnapManagerCore { let tmp_file_paths = cf_file.tmp_file_paths(); let file_paths = cf_file.file_paths(); for (i, tmp_file_path) in tmp_file_paths.iter().enumerate() { - file_system::rename(&tmp_file_path, &file_paths[i])?; - let mgr = self.encryption_key_manager.as_ref(); if let Some(mgr) = &mgr { let src = &tmp_file_path; @@ -1786,7 +1784,15 @@ impl SnapManagerCore { } return Err(e.into()); } - mgr.delete_file(src)?; + let r = file_system::rename(src, dst); + let del_file = if r.is_ok() { src } else { dst }; + if let Err(e) = mgr.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'rename_tmp_cf_file_for_send'"; + "err" => ?e); + } + r?; + } else { + file_system::rename(&tmp_file_path, &file_paths[i])?; } let file = Path::new(&file_paths[i]); let (checksum, size) = calc_checksum_and_size(file, mgr)?; diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index 60f72052b10..e83255942fd 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ 
-50,7 +50,6 @@ pub struct ImportPath { impl ImportPath { // move file from temp to save. pub fn save(mut self, key_manager: Option<&DataKeyManager>) -> Result<()> { - file_system::rename(&self.temp, &self.save)?; if let Some(key_manager) = key_manager { let temp_str = self .temp @@ -61,7 +60,15 @@ impl ImportPath { .to_str() .ok_or_else(|| Error::InvalidSstPath(self.save.clone()))?; key_manager.link_file(temp_str, save_str)?; - key_manager.delete_file(temp_str)?; + let r = file_system::rename(&self.temp, &self.save); + let del_file = if r.is_ok() { temp_str } else { save_str }; + if let Err(e) = key_manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'save'"; + "file" => ?self, "err" => ?e); + } + r?; + } else { + file_system::rename(&self.temp, &self.save)?; } // sync the directory after rename self.save.pop(); @@ -137,12 +144,19 @@ impl ImportFile { "finalize SST write cache", )); } - file_system::rename(&self.path.temp, &self.path.save)?; if let Some(ref manager) = self.key_manager { let tmp_str = self.path.temp.to_str().unwrap(); let save_str = self.path.save.to_str().unwrap(); manager.link_file(tmp_str, save_str)?; - manager.delete_file(self.path.temp.to_str().unwrap())?; + let r = file_system::rename(&self.path.temp, &self.path.save); + let del_file = if r.is_ok() { tmp_str } else { save_str }; + if let Err(e) = manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during finishing importing files."; + "err" => ?e); + } + r?; + } else { + file_system::rename(&self.path.temp, &self.path.save)?; } Ok(()) } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index ce55e7beb41..7e40859b127 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -584,7 +584,6 @@ impl SstImporter { })()?; if let Some(range) = direct_retval { - file_system::rename(&path.temp, &path.save)?; if let Some(key_manager) = 
&self.key_manager { let temp_str = path .temp @@ -595,7 +594,14 @@ impl SstImporter { .to_str() .ok_or_else(|| Error::InvalidSstPath(path.save.clone()))?; key_manager.link_file(temp_str, save_str)?; - key_manager.delete_file(temp_str)?; + let r = file_system::rename(&path.temp, &path.save); + let del_file = if r.is_ok() { temp_str } else { save_str }; + if let Err(e) = key_manager.delete_file(del_file) { + warn!("fail to remove encryption metadata during 'do_download'"; "err" => ?e); + } + r?; + } else { + file_system::rename(&path.temp, &path.save)?; } IMPORTER_DOWNLOAD_DURATION .with_label_values(&["rename"]) diff --git a/etc/config-template.toml b/etc/config-template.toml index 795a82f371c..558612151ec 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1086,6 +1086,23 @@ ## When it's not set, 15% of available system memory will be used. # memory-limit = "1GB" +## Version of the log file in Raft Engine. +## +## Candidates: +## 1: Can be read by TiKV release 6.1 and above. +## 2: Can be read by TiKV release 6.3 and above. Supports log recycling. +## +## Default: 1. +# format-version = 1 + +## Whether to recycle stale log files in Raft Engine. +## If `true`, logically purged log files will be reserved for recycling. +## Only available for `format-version` >= 2. This option is only +## available when TiKV >= 6.3.x. +## +## Default: false. +# enable-log-recycle = false + [security] ## The path for TLS certificates. Empty string means disabling secure connections. # ca-path = "" From 8be9d449b6cf86a5e4731d032ba1d8cd7f736da8 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 15 Aug 2022 01:46:51 -0700 Subject: [PATCH 154/676] engine_traits: clean up sst iterator (#13277) ref tikv/tikv#13058 There are 3 notable changes: - Correctly implement iterator for `SstReader`. - Remove unnecessary methods. - Make interface taking mutable references correctly. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/lib.rs | 1 + components/engine_panic/src/misc.rs | 4 +- components/engine_panic/src/snapshot.rs | 6 +-- components/engine_panic/src/sst.rs | 21 +++++----- components/engine_panic/src/write_batch.rs | 2 +- components/engine_rocks/src/file_system.rs | 4 +- components/engine_rocks/src/lib.rs | 1 + components/engine_rocks/src/misc.rs | 12 ++++-- components/engine_rocks/src/snapshot.rs | 6 +-- components/engine_rocks/src/sst.rs | 39 +++++++------------ components/engine_rocks/src/write_batch.rs | 2 +- components/engine_traits/src/engines.rs | 4 +- components/engine_traits/src/errors.rs | 5 +++ components/engine_traits/src/iterable.rs | 25 ++++++++++-- components/engine_traits/src/lib.rs | 1 + components/engine_traits/src/misc.rs | 4 +- components/engine_traits/src/snapshot.rs | 1 - components/engine_traits/src/sst.rs | 6 +-- components/engine_traits/src/write_batch.rs | 4 +- .../engine_traits_tests/src/cf_names.rs | 22 +---------- components/engine_traits_tests/src/sst.rs | 15 +++---- .../engine_traits_tests/src/write_batch.rs | 4 +- .../src/coprocessor/consistency_check.rs | 6 +-- .../raftstore/src/store/compaction_guard.rs | 11 ++++-- components/raftstore/src/store/fsm/apply.rs | 2 +- .../raftstore/src/store/region_snapshot.rs | 2 +- .../src/store/worker/consistency_check.rs | 4 +- components/sst_importer/src/import_file.rs | 17 ++++---- components/sst_importer/src/sst_importer.rs | 28 ++++++------- src/server/gc_worker/compaction_filter.rs | 4 +- src/server/gc_worker/gc_worker.rs | 2 +- tests/integrations/raftstore/test_stats.rs | 6 +-- .../raftstore/test_update_region_size.rs | 2 +- tests/integrations/storage/test_titan.rs | 6 +-- 34 files changed, 132 insertions(+), 147 deletions(-) diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 761b31af1d8..0573c936135 100644 --- a/components/engine_panic/src/lib.rs +++ 
b/components/engine_panic/src/lib.rs @@ -9,6 +9,7 @@ //! with your engine's own name; then fill in the implementations; remove //! the allow(unused) attribute; +#![feature(generic_associated_types)] #![allow(unused)] mod cf_names; diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 9a5cc310fc3..5a78ea66e5a 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -5,11 +5,11 @@ use engine_traits::{DeleteStrategy, MiscExt, Range, Result}; use crate::engine::PanicEngine; impl MiscExt for PanicEngine { - fn flush(&self, sync: bool) -> Result<()> { + fn flush_cfs(&self, wait: bool) -> Result<()> { panic!() } - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()> { + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { panic!() } diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index e573402c6d2..cf651db4956 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -9,11 +9,7 @@ use crate::{db_vector::PanicDbVector, engine::PanicEngine}; #[derive(Clone, Debug)] pub struct PanicSnapshot; -impl Snapshot for PanicSnapshot { - fn cf_names(&self) -> Vec<&str> { - panic!() - } -} +impl Snapshot for PanicSnapshot {} impl Peekable for PanicSnapshot { type DbVector = PanicDbVector; diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index d1e5f4b331c..a0f1479604c 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -1,10 +1,10 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::path::PathBuf; +use std::{marker::PhantomData, path::PathBuf}; use engine_traits::{ - CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SstCompressionType, - SstExt, SstReader, SstWriter, SstWriterBuilder, + CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, RefIterable, Result, + SstCompressionType, SstExt, SstReader, SstWriter, SstWriterBuilder, }; use crate::engine::PanicEngine; @@ -24,22 +24,21 @@ impl SstReader for PanicSstReader { fn verify_checksum(&self) -> Result<()> { panic!() } - fn iter(&self) -> Self::Iterator { - panic!() - } } -impl Iterable for PanicSstReader { - type Iterator = PanicSstReaderIterator; +impl RefIterable for PanicSstReader { + type Iterator<'a> = PanicSstReaderIterator<'a>; - fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { + fn iter(&self, opts: IterOptions) -> Result> { panic!() } } -pub struct PanicSstReaderIterator; +pub struct PanicSstReaderIterator<'a> { + _phantom: PhantomData<&'a ()>, +} -impl Iterator for PanicSstReaderIterator { +impl Iterator for PanicSstReaderIterator<'_> { fn seek(&mut self, key: &[u8]) -> Result { panic!() } diff --git a/components/engine_panic/src/write_batch.rs b/components/engine_panic/src/write_batch.rs index d2dc866ca31..e8ba326590c 100644 --- a/components/engine_panic/src/write_batch.rs +++ b/components/engine_panic/src/write_batch.rs @@ -20,7 +20,7 @@ impl WriteBatchExt for PanicEngine { pub struct PanicWriteBatch; impl WriteBatch for PanicWriteBatch { - fn write_opt(&self, _: &WriteOptions) -> Result<()> { + fn write_opt(&mut self, _: &WriteOptions) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index 614611bc40e..f3211d52d68 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -82,13 +82,13 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); 
assert_eq!(stats.fetch(IoType::Flush, IoOp::Write), 0); - db.flush(true /* sync */).unwrap(); + db.flush_cfs(true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); - db.flush(true /* sync */).unwrap(); + db.flush_cfs(true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index f8b32c72a59..b0e7012bad7 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -16,6 +16,7 @@ //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] +#![feature(generic_associated_types)] #[allow(unused_extern_crates)] extern crate tikv_alloc; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 3e204bbc49f..7cf5d771486 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -127,13 +127,17 @@ impl RocksEngine { } impl MiscExt for RocksEngine { - fn flush(&self, sync: bool) -> Result<()> { - self.as_inner().flush(sync).map_err(r2e) + fn flush_cfs(&self, wait: bool) -> Result<()> { + let mut handles = vec![]; + for cf in self.cf_names() { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + self.as_inner().flush_cfs(&handles, wait).map_err(r2e) } - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()> { + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()> { let handle = util::get_cf_handle(self.as_inner(), cf)?; - self.as_inner().flush_cf(handle, sync).map_err(r2e) + self.as_inner().flush_cf(handle, wait).map_err(r2e) } fn delete_ranges_cf( diff --git 
a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index c107601c5d6..b19a32fd739 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -32,11 +32,7 @@ impl RocksSnapshot { } } -impl Snapshot for RocksSnapshot { - fn cf_names(&self) -> Vec<&str> { - self.db.cf_names() - } -} +impl Snapshot for RocksSnapshot {} impl Debug for RocksSnapshot { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 66e0a974916..0518dd7feb5 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -1,9 +1,9 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{path::PathBuf, rc::Rc, sync::Arc}; +use std::{path::PathBuf, sync::Arc}; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterable, Iterator, Result, SstCompressionType, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; @@ -22,11 +22,8 @@ impl SstExt for RocksEngine { type SstWriterBuilder = RocksSstWriterBuilder; } -// FIXME: like in RocksEngineIterator and elsewhere, here we are using -// Rc to avoid putting references in an associated type, which -// requires generic associated types. 
pub struct RocksSstReader { - inner: Rc, + inner: SstFileReader, } impl RocksSstReader { @@ -50,8 +47,7 @@ impl RocksSstReader { } let mut reader = SstFileReader::new(cf_options); reader.open(path).map_err(r2e)?; - let inner = Rc::new(reader); - Ok(RocksSstReader { inner }) + Ok(RocksSstReader { inner: reader }) } pub fn compression_name(&self) -> String { @@ -71,33 +67,26 @@ impl SstReader for RocksSstReader { self.inner.verify_checksum().map_err(r2e)?; Ok(()) } - fn iter(&self) -> Self::Iterator { - RocksSstIterator(SstFileReader::iter_rc(self.inner.clone())) - } } -impl Iterable for RocksSstReader { - type Iterator = RocksSstIterator; +impl RefIterable for RocksSstReader { + type Iterator<'a> = RocksSstIterator<'a>; - /// Cf is ignored as there is only one cf in sst. - fn iterator_opt(&self, _cf: &str, opts: IterOptions) -> Result { + #[inline] + fn iter(&self, opts: IterOptions) -> Result> { let opt: RocksReadOptions = opts.into(); let opt = opt.into_raw(); - Ok(RocksSstIterator(SstFileReader::iter_opt_rc( - self.inner.clone(), - opt, - ))) + Ok(RocksSstIterator(SstFileReader::iter_opt(&self.inner, opt))) } } -// FIXME: See comment on RocksSstReader for why this contains Rc -pub struct RocksSstIterator(DBIterator>); +pub struct RocksSstIterator<'a>(DBIterator<&'a SstFileReader>); -// TODO(5kbpers): Temporarily force to add `Send` here, add a method for -// creating DBIterator> in rust-rocksdb later. -unsafe impl Send for RocksSstIterator {} +// It's OK to send the iterator around. +// TODO: remove this when using tirocks. 
+unsafe impl Send for RocksSstIterator<'_> {} -impl Iterator for RocksSstIterator { +impl Iterator for RocksSstIterator<'_> { fn seek(&mut self, key: &[u8]) -> Result { self.0.seek(rocksdb::SeekKey::Key(key)).map_err(r2e) } diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index e4028feb411..f617608119b 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -99,7 +99,7 @@ impl RocksWriteBatchVec { } impl engine_traits::WriteBatch for RocksWriteBatchVec { - fn write_opt(&self, opts: &WriteOptions) -> Result<()> { + fn write_opt(&mut self, opts: &WriteOptions) -> Result<()> { let opt: RocksWriteOptions = opts.into(); if self.support_write_batch_vec { self.get_db() diff --git a/components/engine_traits/src/engines.rs b/components/engine_traits/src/engines.rs index 4e4089d52dc..d5928a9783a 100644 --- a/components/engine_traits/src/engines.rs +++ b/components/engine_traits/src/engines.rs @@ -20,11 +20,11 @@ impl Engines { } } - pub fn write_kv(&self, wb: &K::WriteBatch) -> Result<()> { + pub fn write_kv(&self, wb: &mut K::WriteBatch) -> Result<()> { wb.write() } - pub fn write_kv_opt(&self, wb: &K::WriteBatch, opts: &WriteOptions) -> Result<()> { + pub fn write_kv_opt(&self, wb: &mut K::WriteBatch, opts: &WriteOptions) -> Result<()> { wb.write_opt(opts) } diff --git a/components/engine_traits/src/errors.rs b/components/engine_traits/src/errors.rs index c9960b50753..6ef46ff7a70 100644 --- a/components/engine_traits/src/errors.rs +++ b/components/engine_traits/src/errors.rs @@ -40,6 +40,11 @@ pub enum SubCode { MemoryLimit = 7, SpaceLimit = 8, PathNotFound = 9, + MergeOperandsInsufficientCapacity = 10, + ManualCompactionPaused = 11, + Overwritten = 12, + TxnNotPrepared = 13, + IoFenced = 14, } #[repr(u8)] diff --git a/components/engine_traits/src/iterable.rs b/components/engine_traits/src/iterable.rs index 9d45fc5b0ac..50fcfc2344b 100644 --- 
a/components/engine_traits/src/iterable.rs +++ b/components/engine_traits/src/iterable.rs @@ -109,6 +109,14 @@ pub trait Iterator: Send { fn valid(&self) -> Result; } +pub trait RefIterable { + type Iterator<'a>: Iterator + where + Self: 'a; + + fn iter(&self, opts: IterOptions) -> Result>; +} + pub trait Iterable { type Iterator: Iterator; @@ -131,10 +139,7 @@ pub trait Iterable { where F: FnMut(&[u8], &[u8]) -> Result, { - let start = KeyBuilder::from_slice(start_key, DATA_KEY_PREFIX_LEN, 0); - let end = - (!end_key.is_empty()).then(|| KeyBuilder::from_slice(end_key, DATA_KEY_PREFIX_LEN, 0)); - let iter_opt = IterOptions::new(Some(start), end, fill_cache); + let iter_opt = iter_option(start_key, end_key, fill_cache); scan_impl(self.iterator_opt(cf, iter_opt)?, start_key, f) } @@ -175,3 +180,15 @@ pub fn collect(mut it: I) -> Vec<(Vec, Vec)> { } v } + +/// Build an `IterOptions` using giving data key bound. Empty upper bound will +/// be ignored. +pub fn iter_option(lower_bound: &[u8], upper_bound: &[u8], fill_cache: bool) -> IterOptions { + let lower_bound = Some(KeyBuilder::from_slice(lower_bound, 0, 0)); + let upper_bound = if upper_bound.is_empty() { + None + } else { + Some(KeyBuilder::from_slice(upper_bound, 0, 0)) + }; + IterOptions::new(lower_bound, upper_bound, fill_cache) +} diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 72794fba5cd..b140da14969 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -250,6 +250,7 @@ //! Likewise `engine_rocks` can temporarily call code from inside `engine`. 
#![feature(min_specialization)] #![feature(assert_matches)] +#![feature(generic_associated_types)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 0e6b9600da6..f0ba9d03c39 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -29,9 +29,9 @@ pub enum DeleteStrategy { } pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { - fn flush(&self, sync: bool) -> Result<()>; + fn flush_cfs(&self, wait: bool) -> Result<()>; - fn flush_cf(&self, cf: &str, sync: bool) -> Result<()>; + fn flush_cf(&self, cf: &str, wait: bool) -> Result<()>; fn delete_all_in_range(&self, strategy: DeleteStrategy, ranges: &[Range<'_>]) -> Result<()> { for cf in self.cf_names() { diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 93ef451209c..7907abd1445 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -12,5 +12,4 @@ pub trait Snapshot where Self: 'static + Peekable + Iterable + Send + Sync + Sized + Debug, { - fn cf_names(&self) -> Vec<&str>; } diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index fb37c918886..a97fe7a8b87 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -4,7 +4,7 @@ use std::path::PathBuf; use kvproto::import_sstpb::SstMeta; -use crate::{errors::Result, iterable::Iterable}; +use crate::{errors::Result, RefIterable}; #[derive(Clone, Debug)] pub struct SstMetaInfo { @@ -20,11 +20,9 @@ pub trait SstExt: Sized { } /// SstReader is used to read an SST file. -pub trait SstReader: Iterable + Sized { +pub trait SstReader: RefIterable + Sized { fn open(path: &str) -> Result; fn verify_checksum(&self) -> Result<()>; - // FIXME: Shouldn't this me a method on Iterable? 
- fn iter(&self) -> Self::Iterator; } /// SstWriter is used to create sst files that can be added to database later. diff --git a/components/engine_traits/src/write_batch.rs b/components/engine_traits/src/write_batch.rs index 5d6824a7207..4dc8e47e823 100644 --- a/components/engine_traits/src/write_batch.rs +++ b/components/engine_traits/src/write_batch.rs @@ -71,10 +71,10 @@ pub trait Mutable: Send { /// save point, and pops the save point from the stack. pub trait WriteBatch: Mutable { /// Commit the WriteBatch to disk with the given options - fn write_opt(&self, opts: &WriteOptions) -> Result<()>; + fn write_opt(&mut self, opts: &WriteOptions) -> Result<()>; /// Commit the WriteBatch to disk atomically - fn write(&self) -> Result<()> { + fn write(&mut self) -> Result<()> { self.write_opt(&WriteOptions::default()) } diff --git a/components/engine_traits_tests/src/cf_names.rs b/components/engine_traits_tests/src/cf_names.rs index 2cac1eaff73..f85c2f5df97 100644 --- a/components/engine_traits_tests/src/cf_names.rs +++ b/components/engine_traits_tests/src/cf_names.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{CfNamesExt, KvEngine, Snapshot, ALL_CFS, CF_DEFAULT}; +use engine_traits::{CfNamesExt, ALL_CFS, CF_DEFAULT}; use super::{default_engine, engine_cfs}; @@ -21,23 +21,3 @@ fn cf_names() { assert!(names.contains(cf)); } } - -#[test] -fn default_names_snapshot() { - let db = default_engine(); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), 1); - assert_eq!(names[0], CF_DEFAULT); -} - -#[test] -fn cf_names_snapshot() { - let db = engine_cfs(ALL_CFS); - let snapshot = db.engine.snapshot(); - let names = snapshot.cf_names(); - assert_eq!(names.len(), ALL_CFS.len()); - for cf in ALL_CFS { - assert!(names.contains(cf)); - } -} diff --git a/components/engine_traits_tests/src/sst.rs b/components/engine_traits_tests/src/sst.rs index ce4160e5ddc..26ed686aad4 100644 --- a/components/engine_traits_tests/src/sst.rs +++ b/components/engine_traits_tests/src/sst.rs @@ -6,7 +6,8 @@ use std::fs; use engine_test::kv::KvTestEngine; use engine_traits::{ - Error, ExternalSstFileInfo, Iterator, Result, SstExt, SstReader, SstWriter, SstWriterBuilder, + Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstExt, SstReader, + SstWriter, SstWriterBuilder, }; use panic_hook::recover_safe; @@ -48,7 +49,7 @@ fn basic() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; let key = iter.key(); @@ -77,7 +78,7 @@ fn forward() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -114,7 +115,7 @@ fn reverse() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); 
iter.seek_to_last()?; @@ -152,7 +153,7 @@ fn delete() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; @@ -206,7 +207,7 @@ fn same_key() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; let key = iter.key(); @@ -248,7 +249,7 @@ fn reverse_key() -> Result<()> { sst_writer.finish()?; let sst_reader = ::SstReader::open(&sst_path)?; - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first()?; let key = iter.key(); diff --git a/components/engine_traits_tests/src/write_batch.rs b/components/engine_traits_tests/src/write_batch.rs index e99245adb4b..f13cec0845a 100644 --- a/components/engine_traits_tests/src/write_batch.rs +++ b/components/engine_traits_tests/src/write_batch.rs @@ -20,11 +20,11 @@ fn write_batch_none_no_commit() { #[test] fn write_batch_none() { let db = default_engine(); - let wb = db.engine.write_batch(); + let mut wb = db.engine.write_batch(); wb.write().unwrap(); let db = multi_batch_write_engine(); - let wb = db.engine.write_batch_with_cap(1024); + let mut wb = db.engine.write_batch_with_cap(1024); wb.write().unwrap(); } diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 70b55db41f4..5ba97089f85 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; -use engine_traits::{KvEngine, Snapshot, CF_RAFT}; +use engine_traits::{KvEngine, Snapshot, ALL_CFS, CF_RAFT}; use kvproto::metapb::Region; use crate::{ @@ -60,12 +60,10 @@ impl ConsistencyCheckObserver for Raw { 
fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let region_id = region.get_id(); let mut digest = crc32fast::Hasher::new(); - let mut cf_names = snap.cf_names(); - cf_names.sort_unstable(); let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); - for cf in cf_names { + for cf in ALL_CFS { snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index c8fb02d424b..78dbccbf585 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -202,7 +202,9 @@ mod tests { util::new_engine_opt, RocksCfOptions, RocksDbOptions, RocksEngine, RocksSstPartitionerFactory, RocksSstReader, }; - use engine_traits::{CompactExt, Iterator, MiscExt, SstReader, SyncMutable, CF_DEFAULT}; + use engine_traits::{ + CompactExt, IterOptions, Iterator, MiscExt, RefIterable, SstReader, SyncMutable, CF_DEFAULT, + }; use keys::DATA_PREFIX_KEY; use kvproto::metapb::Region; use tempfile::TempDir; @@ -399,7 +401,8 @@ mod tests { } fn collect_keys(path: &str) -> Vec> { - let mut sst_reader = RocksSstReader::open(path).unwrap().iter(); + let reader = RocksSstReader::open(path).unwrap(); + let mut sst_reader = reader.iter(IterOptions::default()).unwrap(); let mut valid = sst_reader.seek_to_first().unwrap(); let mut ret = vec![]; while valid { @@ -444,14 +447,14 @@ mod tests { db.put(b"za1", b"").unwrap(); db.put(b"zb1", &value).unwrap(); db.put(b"zc1", &value).unwrap(); - db.flush(true /* sync */).unwrap(); + db.flush_cfs(true /* wait */).unwrap(); db.put(b"zb2", &value).unwrap(); db.put(b"zc2", &value).unwrap(); db.put(b"zc3", &value).unwrap(); db.put(b"zc4", &value).unwrap(); db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); - db.flush(true /* sync */).unwrap(); + db.flush_cfs(true /* wait */).unwrap(); db.compact_range( CF_DEFAULT, 
None, // start_key None, // end_key diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 0d97137bab1..d33a262cf6a 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -533,7 +533,7 @@ where self.perf_context.start_observe(); let mut write_opts = engine_traits::WriteOptions::new(); write_opts.set_sync(need_sync); - self.kv_wb().write_opt(&write_opts).unwrap_or_else(|e| { + self.kv_wb_mut().write_opt(&write_opts).unwrap_or_else(|e| { panic!("failed to write to engine: {:?}", e); }); let trackers: Vec<_> = self diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index 86d89fad051..fe58a2587a7 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -460,7 +460,7 @@ mod tests { let db = &engines.kv; for &(ref k, level) in &levels { db.put(&data_key(k), k).unwrap(); - db.flush(true).unwrap(); + db.flush_cfs(true).unwrap(); data.push((k.to_vec(), k.to_vec())); db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) .unwrap(); diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index 154f1816dbf..b3bd7ef32d0 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -128,7 +128,7 @@ mod tests { use byteorder::{BigEndian, WriteBytesExt}; use engine_test::kv::{new_engine, KvTestEngine}; - use engine_traits::{KvEngine, SyncMutable, CF_DEFAULT, CF_RAFT}; + use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; use kvproto::metapb::*; use tempfile::Builder; use tikv_util::worker::Runnable; @@ -141,7 +141,7 @@ mod tests { #[test] fn test_consistency_check() { let path = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); - let db = 
new_engine(path.path().to_str().unwrap(), &[CF_DEFAULT, CF_RAFT]).unwrap(); + let db = new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let mut region = Region::default(); region.mut_peers().push(Peer::default()); diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index e83255942fd..f5292b70075 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -11,7 +11,9 @@ use std::{ use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; use engine_rocks::{get_env, RocksSstReader}; -use engine_traits::{EncryptionKeyManager, Iterable, KvEngine, SstMetaInfo, SstReader}; +use engine_traits::{ + iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, +}; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; @@ -330,19 +332,14 @@ impl ImportDir { let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; for &(start, end) in TIDB_RANGES_COMPLEMENT { - let mut unexpected_data_key = None; - // No CF in sst. - sst_reader.scan("", start, end, false, |key, _| { - unexpected_data_key = Some(key.to_vec()); - Ok(false) - })?; - - if let Some(unexpected_data_key) = unexpected_data_key { + let opt = iter_option(start, end, false); + let mut iter = sst_reader.iter(opt)?; + if iter.seek(start)? 
{ error!( "unable to import: switch api version with non-tidb key"; "sst" => ?meta.api_version, "current" => ?api_version, - "key" => ?log_wrappers::hex_encode_upper(&unexpected_data_key) + "key" => ?log_wrappers::hex_encode_upper(iter.key()) ); return Ok(false); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 7e40859b127..806066bd202 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -15,8 +15,8 @@ use encryption::{to_engine_encryption_method, DataKeyManager}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ name_to_cf, util::check_key_in_range, CfName, EncryptionKeyManager, FileEncryptionInfo, - Iterator, KvEngine, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, - SstWriterBuilder, CF_DEFAULT, CF_WRITE, + IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, + SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; use file_system::{get_io_rate_limiter, OpenOptions}; use futures::executor::ThreadPool; @@ -548,7 +548,7 @@ impl SstImporter { let start_rename_rewrite = Instant::now(); // read the first and last keys from the SST, determine if we could // simply move the entire SST instead of iterating and generate a new one. - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default())?; let direct_retval = (|| -> Result> { if rewrite_rule.old_key_prefix != rewrite_rule.new_key_prefix || rewrite_rule.new_timestamp != 0 @@ -798,7 +798,7 @@ mod tests { use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, - SstReader, SstWriter, CF_DEFAULT, DATA_CFS, + RefIterable, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; use file_system::File; use openssl::hash::{Hasher, MessageDigest}; @@ -1338,7 +1338,7 @@ mod tests { // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1397,7 +1397,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), Some(env)); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1445,7 +1445,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1490,7 +1490,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1534,7 +1534,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1675,7 +1675,7 @@ mod tests { // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1719,7 +1719,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1854,7 +1854,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1912,7 +1912,7 @@ mod tests { // verifies the SST content is correct. let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), @@ -1967,7 +1967,7 @@ mod tests { // verifies the SST content is correct. 
let sst_reader = new_sst_reader(sst_file_path.to_str().unwrap(), None); sst_reader.verify_checksum().unwrap(); - let mut iter = sst_reader.iter(); + let mut iter = sst_reader.iter(IterOptions::default()).unwrap(); iter.seek_to_first().unwrap(); assert_eq!( collect(iter), diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 1c50b56bed1..ef190f4760e 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -463,7 +463,7 @@ impl WriteCompactionFilter { } fn do_flush( - wb: &RocksWriteBatchVec, + wb: &mut RocksWriteBatchVec, wopts: &WriteOptions, ) -> Result<(), engine_traits::Error> { let _io_type_guard = WithIoType::new(IoType::Gc); @@ -481,7 +481,7 @@ impl WriteCompactionFilter { if self.write_batch.count() > DEFAULT_DELETE_BATCH_COUNT || force { let mut wopts = WriteOptions::default(); wopts.set_no_slowdown(true); - if let Err(e) = do_flush(&self.write_batch, &wopts) { + if let Err(e) = do_flush(&mut self.write_batch, &wopts) { let wb = mem::replace( &mut self.write_batch, self.engine.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE), diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 131efd68fac..eaa55c9c69c 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -882,7 +882,7 @@ where limit, ); } - GcTask::OrphanVersions { wb, id } => { + GcTask::OrphanVersions { mut wb, id } => { info!("handling GcTask::OrphanVersions"; "id" => id); let mut wopts = WriteOptions::default(); wopts.set_sync(true); diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 03c0f0a82b2..2af595c4e5a 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -28,7 +28,7 @@ fn check_available(cluster: &mut Cluster) { for i in 0..1000 { let last_available = stats.get_available(); cluster.must_put(format!("k{}", i).as_bytes(), 
&value); - engine.flush(true).unwrap(); + engine.flush_cfs(true).unwrap(); sleep_ms(20); let stats = pd_client.get_store_stats(1).unwrap(); @@ -59,7 +59,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { } let engine = cluster.get_engine(1); - engine.flush(true).unwrap(); + engine.flush_cfs(true).unwrap(); let last_stats = pd_client.get_store_stats(1).unwrap(); assert_eq!(last_stats.get_region_count(), 1); @@ -68,7 +68,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { let region = pd_client.get_region(b"").unwrap(); cluster.must_split(®ion, b"k2"); - engine.flush(true).unwrap(); + engine.flush_cfs(true).unwrap(); // wait report region count after split for _ in 0..100 { diff --git a/tests/integrations/raftstore/test_update_region_size.rs b/tests/integrations/raftstore/test_update_region_size.rs index 4aab144ff27..ee4fb79ac62 100644 --- a/tests/integrations/raftstore/test_update_region_size.rs +++ b/tests/integrations/raftstore/test_update_region_size.rs @@ -9,7 +9,7 @@ use tikv_util::config::*; fn flush(cluster: &mut Cluster) { for engines in cluster.engines.values() { - engines.kv.flush(true).unwrap(); + engines.kv.flush_cfs(true).unwrap(); } } diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index f5e642f161b..5b957b88822 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -211,7 +211,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); // Flush and compact the kvs into L6. 
- engines.kv.flush(true).unwrap(); + engines.kv.flush_cfs(true).unwrap(); engines.kv.compact_files_in_range(None, None, None).unwrap(); let db = engines.kv.as_inner(); let value = db.get_property_int("rocksdb.num-files-at-level0").unwrap(); @@ -254,9 +254,9 @@ fn test_delete_files_in_range_for_titan() { // Used to trigger titan gc let engine = &engines.kv; engine.put(b"1", b"1").unwrap(); - engine.flush(true).unwrap(); + engine.flush_cfs(true).unwrap(); engine.put(b"2", b"2").unwrap(); - engine.flush(true).unwrap(); + engine.flush_cfs(true).unwrap(); engine .compact_files_in_range(Some(b"0"), Some(b"3"), Some(1)) .unwrap(); From 73c0bacf2be12c93bbb26727f9d713783dbaa05f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 16 Aug 2022 14:14:51 +0800 Subject: [PATCH 155/676] log-backup: fix early return (#13288) close tikv/tikv#13281 Fixed a bug that may cause data loss in log backup. Signed-off-by: Yu Juncen --- Cargo.lock | 2 +- components/backup-stream/Cargo.toml | 2 +- components/backup-stream/src/event_loader.rs | 9 ++- components/backup-stream/tests/mod.rs | 73 +++++++++++++++++++- 4 files changed, 80 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 802a0e19487..1efe0607541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1620,7 +1620,7 @@ dependencies = [ [[package]] name = "etcd-client" version = "0.7.2" -source = "git+https://github.com/yujuncen/etcd-client?rev=e0321a1990ee561cf042973666c0db61c8d82364#e0321a1990ee561cf042973666c0db61c8d82364" +source = "git+https://github.com/pingcap/etcd-client?rev=e0321a1990ee561cf042973666c0db61c8d82364#e0321a1990ee561cf042973666c0db61c8d82364" dependencies = [ "http", "prost", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index b0b6fc3f13f..e5bb889420d 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -31,7 +31,7 @@ engine_traits = { path = 
"../engine_traits", default-features = false } error_code = { path = "../error_code" } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. -etcd-client = { git = "https://github.com/yujuncen/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } external_storage = { path = "../external_storage", default-features = false } external_storage_export = { path = "../external_storage/export", default-features = false } fail = "0.5" diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 61e227af1ac..5aade374249 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -391,9 +391,16 @@ where // we only need to record the disk throughput of this. let (stat, disk_read) = utils::with_record_read_throughput(|| event_loader.fill_entries()); + // We must use the size of entry batch here to check whether we have progress. + // Or we may exit too early if there are only records: + // - can be inlined to `write` CF (hence it won't be written to default CF) + // - are prewritten. (hence it will only contains `Prewrite` records). 
+ // In this condition, ALL records generate no ApplyEvent(only lock change), + // and we would exit after the first run of loop :( + let no_progress = event_loader.entry_batch.is_empty(); let stat = stat?; self.with_resolver(region, |r| event_loader.emit_entries_to(&mut events, r))?; - if events.is_empty() { + if no_progress { metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); return Ok(stats.stat); } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index f838e96ddbf..c5d3442fb84 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -158,7 +158,6 @@ impl SuiteBuilder { for id in 1..=(n as u64) { suite.start_endpoint(id, use_v3); } - // TODO: The current mock metastore (slash_etc) doesn't supports multi-version. // We must wait until the endpoints get ready to watching the metastore, or some // modifies may be lost. Either make Endpoint::with_client wait until watch did // start or make slash_etc support multi-version, then we can get rid of this @@ -318,6 +317,19 @@ impl Suite { inserted } + fn commit_keys(&mut self, keys: Vec>, start_ts: TimeStamp, commit_ts: TimeStamp) { + let mut region_keys = HashMap::>>::new(); + for k in keys { + let enc_key = Key::from_raw(&k).into_encoded(); + let region = self.cluster.get_region_id(&enc_key); + region_keys.entry(region).or_default().push(k); + } + + for (region, keys) in region_keys { + self.must_kv_commit(region, keys, start_ts, commit_ts); + } + } + fn just_commit_a_key(&mut self, key: Vec, start_ts: TimeStamp, commit_ts: TimeStamp) { let enc_key = Key::from_raw(&key).into_encoded(); let region = self.cluster.get_region_id(&enc_key); @@ -604,10 +616,13 @@ mod test { errors::Error, metadata::MetadataClient, router::TaskSelector, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; + use pd_client::PdClient; use tikv_util::{box_err, defer, info, HandyRwLock}; - use txn_types::TimeStamp; + use 
txn_types::{Key, TimeStamp}; - use crate::{make_record_key, make_split_key_at_record, run_async_test, SuiteBuilder}; + use crate::{ + make_record_key, make_split_key_at_record, mutation, run_async_test, SuiteBuilder, + }; #[test] fn basic() { @@ -650,6 +665,58 @@ mod test { suite.cluster.shutdown(); } + /// This test tests whether we can handle some weird transactions and their + /// race with initial scanning. + /// Generally, those transactions: + /// - Has N mutations, which's values are all short enough to be inlined in + /// the `Write` CF. (N > 1024) + /// - Commit the mutation set M first. (for all m in M: Nth-Of-Key(m) > + /// 1024) + /// ```text + /// |--...-----^------*---*-*--*-*-*-> (The line is the Key Space - from "" to inf) + /// +The 1024th key (* = committed mutation) + /// ``` + /// - Before committing remaining mutations, PiTR triggered initial + /// scanning. + /// - The remaining mutations are committed before the instant when initial + /// scanning get the snapshot. 
+ #[test] + fn with_split_txn() { + let mut suite = super::SuiteBuilder::new_named("split_txn").use_v3().build(); + run_async_test(async { + let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); + let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); + suite.must_kv_prewrite( + 1, + keys.clone() + .into_iter() + .map(|k| mutation(k, b"hello, world".to_vec())) + .collect(), + make_record_key(1, 1913), + start_ts, + ); + let commit_ts = suite.cluster.pd_client.get_tso().await.unwrap(); + suite.commit_keys(keys[1913..].to_vec(), start_ts, commit_ts); + suite.must_register_task(1, "test_split_txn"); + suite.commit_keys(keys[..1913].to_vec(), start_ts, commit_ts); + suite.force_flush_files("test_split_txn"); + suite.wait_for_flush(); + let keys_encoded = keys + .iter() + .map(|v| { + Key::from_raw(v.as_slice()) + .append_ts(commit_ts) + .into_encoded() + }) + .collect::>(); + suite.check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ); + }); + suite.cluster.shutdown(); + } + #[test] /// This case tests whether the backup can continue when the leader failes. 
fn leader_down() { From dabf29e4178f3f9b86f0a0eb5cac1a131df2d377 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Tue, 16 Aug 2022 15:32:51 +0800 Subject: [PATCH 156/676] raftstore: allow exec observers delay deletion of applied ssts (#13061) ref tikv/tikv#12849 allow exec observers delay deletion of applied ssts Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 166 ++++++++++++++---- components/raftstore/src/coprocessor/mod.rs | 13 +- components/raftstore/src/store/fsm/apply.rs | 146 +++++++++++++-- components/raftstore/src/store/metrics.rs | 6 + components/sst_importer/src/import_file.rs | 4 + 5 files changed, 285 insertions(+), 50 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 6297722a996..7eea973997b 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -459,12 +459,13 @@ impl CoprocessorHost { cmd: &Cmd, apply_state: &RaftApplyState, region_state: &RegionState, + apply_ctx: &mut ApplyCtxInfo<'_>, ) -> bool { let mut ctx = ObserverContext::new(region); if !cmd.response.has_admin_response() { for observer in &self.registry.query_observers { let observer = observer.observer.inner(); - if observer.post_exec_query(&mut ctx, cmd, apply_state, region_state) { + if observer.post_exec_query(&mut ctx, cmd, apply_state, region_state, apply_ctx) { return true; } } @@ -472,7 +473,7 @@ impl CoprocessorHost { } else { for observer in &self.registry.admin_observers { let observer = observer.observer.inner(); - if observer.post_exec_admin(&mut ctx, cmd, apply_state, region_state) { + if observer.post_exec_admin(&mut ctx, cmd, apply_state, region_state, apply_ctx) { return true; } } @@ -656,6 +657,26 @@ mod tests { return_err: Arc, } + enum ObserverIndex { + PreProposeAdmin = 1, + PreApplyAdmin = 2, + PostApplyAdmin = 3, + PreProposeQuery = 4, + PreApplyQuery = 5, + 
PostApplyQuery = 6, + OnRoleChange = 7, + OnRegionChanged = 8, + ApplyPlainKvs = 9, + ApplySst = 10, + OnFlushAppliedCmdBatch = 13, + OnEmptyCmd = 14, + PreExecQuery = 15, + PreExecAdmin = 16, + PostExecQuery = 17, + PostExecAdmin = 18, + OnComputeEngineSize = 19, + } + impl Coprocessor for TestCoprocessor {} impl AdminObserver for TestCoprocessor { @@ -664,7 +685,8 @@ mod tests { ctx: &mut ObserverContext<'_>, _: &mut AdminRequest, ) -> Result<()> { - self.called.fetch_add(1, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreProposeAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); if self.return_err.load(Ordering::SeqCst) { return Err(box_err!("error")); @@ -673,12 +695,14 @@ mod tests { } fn pre_apply_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminRequest) { - self.called.fetch_add(2, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreApplyAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn post_apply_admin(&self, ctx: &mut ObserverContext<'_>, _: &AdminResponse) { - self.called.fetch_add(3, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PostApplyAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } @@ -689,7 +713,22 @@ mod tests { _: u64, _: u64, ) -> bool { - self.called.fetch_add(16, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreExecAdmin as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + false + } + + fn post_exec_admin( + &self, + ctx: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PostExecAdmin as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); false } @@ -701,7 +740,8 @@ mod tests { ctx: &mut ObserverContext<'_>, _: &mut Vec, ) -> Result<()> { - self.called.fetch_add(4, Ordering::SeqCst); + self.called + 
.fetch_add(ObserverIndex::PreProposeQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); if self.return_err.load(Ordering::SeqCst) { return Err(box_err!("error")); @@ -710,12 +750,14 @@ mod tests { } fn pre_apply_query(&self, ctx: &mut ObserverContext<'_>, _: &[Request]) { - self.called.fetch_add(5, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreApplyQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn post_apply_query(&self, ctx: &mut ObserverContext<'_>, _: &Cmd) { - self.called.fetch_add(6, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PostApplyQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } @@ -726,26 +768,46 @@ mod tests { _: u64, _: u64, ) -> bool { - self.called.fetch_add(15, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::PreExecQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); false } fn on_empty_cmd(&self, ctx: &mut ObserverContext<'_>, _index: u64, _term: u64) { - self.called.fetch_add(14, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::OnEmptyCmd as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn post_exec_query( + &self, + ctx: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + _: &mut ApplyCtxInfo<'_>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PostExecQuery as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); + false } } impl PdTaskObserver for TestCoprocessor { fn on_compute_engine_size(&self, _: &mut Option) { - self.called.fetch_add(19, Ordering::SeqCst); + self.called.fetch_add( + ObserverIndex::OnComputeEngineSize as usize, + Ordering::SeqCst, + ); } } impl RoleObserver for TestCoprocessor { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, _: &RoleChange) { - self.called.fetch_add(7, Ordering::SeqCst); + self.called + 
.fetch_add(ObserverIndex::OnRoleChange as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } } @@ -757,7 +819,8 @@ mod tests { _: RegionChangeEvent, _: StateRole, ) { - self.called.fetch_add(8, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::OnRegionChanged as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } } @@ -769,12 +832,14 @@ mod tests { _: CfName, _: &[(Vec, Vec)], ) { - self.called.fetch_add(9, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::ApplyPlainKvs as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } fn apply_sst(&self, ctx: &mut ObserverContext<'_>, _: CfName, _: &str) { - self.called.fetch_add(10, Ordering::SeqCst); + self.called + .fetch_add(ObserverIndex::ApplySst as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } } @@ -786,7 +851,10 @@ mod tests { _: &mut Vec, _: &PanicEngine, ) { - self.called.fetch_add(13, Ordering::SeqCst); + self.called.fetch_add( + ObserverIndex::OnFlushAppliedCmdBatch as usize, + Ordering::SeqCst, + ); } fn on_applied_current_term(&self, _: StateRole, _: &Region) {} } @@ -825,38 +893,50 @@ mod tests { .register_region_change_observer(1, BoxRegionChangeObserver::new(ob.clone())); host.registry .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); + + let mut index: usize = 0; let region = Region::default(); let mut admin_req = RaftCmdRequest::default(); admin_req.set_admin_request(AdminRequest::default()); host.pre_propose(®ion, &mut admin_req).unwrap(); - assert_all!([&ob.called], &[1]); + index += ObserverIndex::PreProposeAdmin as usize; + assert_all!([&ob.called], &[index]); host.pre_apply(®ion, &admin_req); - assert_all!([&ob.called], &[3]); + index += ObserverIndex::PreApplyAdmin as usize; + assert_all!([&ob.called], &[index]); let mut admin_resp = RaftCmdResponse::default(); admin_resp.set_admin_response(AdminResponse::default()); host.post_apply(®ion, &Cmd::new(0, 0, admin_req, 
admin_resp)); - assert_all!([&ob.called], &[6]); + index += ObserverIndex::PostApplyAdmin as usize; + assert_all!([&ob.called], &[index]); let mut query_req = RaftCmdRequest::default(); query_req.set_requests(vec![Request::default()].into()); host.pre_propose(®ion, &mut query_req).unwrap(); - assert_all!([&ob.called], &[10]); + index += ObserverIndex::PreProposeQuery as usize; + assert_all!([&ob.called], &[index]); + index += ObserverIndex::PreApplyQuery as usize; host.pre_apply(®ion, &query_req); - assert_all!([&ob.called], &[15]); + assert_all!([&ob.called], &[index]); let query_resp = RaftCmdResponse::default(); host.post_apply(®ion, &Cmd::new(0, 0, query_req, query_resp)); - assert_all!([&ob.called], &[21]); + index += ObserverIndex::PostApplyQuery as usize; + assert_all!([&ob.called], &[index]); host.on_role_change(®ion, RoleChange::new(StateRole::Leader)); - assert_all!([&ob.called], &[28]); + index += ObserverIndex::OnRoleChange as usize; + assert_all!([&ob.called], &[index]); host.on_region_changed(®ion, RegionChangeEvent::Create, StateRole::Follower); - assert_all!([&ob.called], &[36]); + index += ObserverIndex::OnRegionChanged as usize; + assert_all!([&ob.called], &[index]); host.post_apply_plain_kvs_from_snapshot(®ion, "default", &[]); - assert_all!([&ob.called], &[45]); + index += ObserverIndex::ApplyPlainKvs as usize; + assert_all!([&ob.called], &[index]); host.post_apply_sst_from_snapshot(®ion, "default", ""); - assert_all!([&ob.called], &[55]); + index += ObserverIndex::ApplySst as usize; + assert_all!([&ob.called], &[index]); let observe_info = CmdObserveInfo::from_handle( ObserveHandle::new(), @@ -866,26 +946,46 @@ mod tests { let mut cb = CmdBatch::new(&observe_info, 0); cb.push(&observe_info, 0, Cmd::default()); host.on_flush_applied_cmd_batch(cb.level, vec![cb], &PanicEngine); - // `post_apply` + `on_flush_applied_cmd_batch` => 13 + 6 = 19 - assert_all!([&ob.called], &[74]); + index += ObserverIndex::PostApplyQuery as usize; + index += 
ObserverIndex::OnFlushAppliedCmdBatch as usize; + assert_all!([&ob.called], &[index]); let mut empty_req = RaftCmdRequest::default(); empty_req.set_requests(vec![Request::default()].into()); host.on_empty_cmd(®ion, 0, 0); - assert_all!([&ob.called], &[88]); // 14 + index += ObserverIndex::OnEmptyCmd as usize; + assert_all!([&ob.called], &[index]); let mut query_req = RaftCmdRequest::default(); query_req.set_requests(vec![Request::default()].into()); host.pre_exec(®ion, &query_req, 0, 0); - assert_all!([&ob.called], &[103]); // 15 + index += ObserverIndex::PreExecQuery as usize; + assert_all!([&ob.called], &[index]); let mut admin_req = RaftCmdRequest::default(); admin_req.set_admin_request(AdminRequest::default()); host.pre_exec(®ion, &admin_req, 0, 0); - assert_all!([&ob.called], &[119]); // 16 + index += ObserverIndex::PreExecAdmin as usize; + assert_all!([&ob.called], &[index]); host.on_compute_engine_size(); - assert_all!([&ob.called], &[138]); // 19 + index += ObserverIndex::OnComputeEngineSize as usize; + assert_all!([&ob.called], &[index]); + + let mut pending_handle_ssts = None; + let mut delete_ssts = vec![]; + let mut pending_delete_ssts = vec![]; + let mut info = ApplyCtxInfo { + pending_handle_ssts: &mut pending_handle_ssts, + pending_delete_ssts: &mut pending_delete_ssts, + delete_ssts: &mut delete_ssts, + }; + let apply_state = RaftApplyState::default(); + let region_state = RegionState::default(); + let cmd = Cmd::default(); + host.post_exec(®ion, &cmd, &apply_state, ®ion_state, &mut info); + index += ObserverIndex::PostExecQuery as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 82313ae7d4e..fcbfcfc98ff 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -9,7 +9,7 @@ use std::{ vec::IntoIter, }; -use engine_traits::CfName; +use engine_traits::{CfName, SstMetaInfo}; use 
kvproto::{ metapb::Region, pdpb::CheckPolicy, @@ -75,12 +75,21 @@ impl<'a> ObserverContext<'a> { } } +/// Context of a region provided for observers. +#[derive(Default, Clone)] pub struct RegionState { pub peer_id: u64, pub pending_remove: bool, pub modified_region: Option, } +/// Context for exec observers of mutation to be applied to ApplyContext. +pub struct ApplyCtxInfo<'a> { + pub pending_handle_ssts: &'a mut Option>, + pub delete_ssts: &'a mut Vec, + pub pending_delete_ssts: &'a mut Vec, +} + pub trait AdminObserver: Coprocessor { /// Hook to call before proposing admin request. fn pre_propose_admin(&self, _: &mut ObserverContext<'_>, _: &mut AdminRequest) -> Result<()> { @@ -115,6 +124,7 @@ pub trait AdminObserver: Coprocessor { _: &Cmd, _: &RaftApplyState, _: &RegionState, + _: &mut ApplyCtxInfo<'_>, ) -> bool { false } @@ -154,6 +164,7 @@ pub trait QueryObserver: Coprocessor { _: &Cmd, _: &RaftApplyState, _: &RegionState, + _: &mut ApplyCtxInfo<'_>, ) -> bool { false } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index d33a262cf6a..3f841e699bb 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -73,7 +73,8 @@ use super::metrics::*; use crate::{ bytes_capacity, coprocessor::{ - Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel, RegionState, + ApplyCtxInfo, Cmd, CmdBatch, CmdObserveInfo, CoprocessorHost, ObserveHandle, ObserveLevel, + RegionState, }, store::{ cmd_resp, @@ -408,6 +409,11 @@ where /// never apply again at first, then we can delete the ssts files. delete_ssts: Vec, + /// A self-defined engine may be slow to ingest ssts. + /// It may move some elements of `delete_ssts` into `pending_delete_ssts` to + /// delay deletion. Otherwise we may lost data. + pending_delete_ssts: Vec, + /// The priority of this Handler. priority: Priority, /// Whether to yield high-latency operation to low-priority handler. 
@@ -465,6 +471,7 @@ where perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, delete_ssts: vec![], + pending_delete_ssts: vec![], store_id, pending_create_peers, priority, @@ -1244,7 +1251,6 @@ where .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); if should_write { - debug!("persist data and apply state"; "region_id" => self.region_id(), "peer_id" => self.id(), "state" => ?self.apply_state); apply_ctx.commit(self); } exec_result @@ -1323,6 +1329,22 @@ where self.applied_term = term; let cmd = Cmd::new(index, term, req.clone(), resp.clone()); + let (modified_region, mut pending_handle_ssts) = match exec_result { + ApplyResult::Res(ref e) => match e { + ExecResult::SplitRegion { ref derived, .. } => (Some(derived.clone()), None), + ExecResult::PrepareMerge { ref region, .. } => (Some(region.clone()), None), + ExecResult::CommitMerge { ref region, .. } => (Some(region.clone()), None), + ExecResult::RollbackMerge { ref region, .. } => (Some(region.clone()), None), + ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), + _ => (None, None), + }, + _ => (None, None), + }; + let mut apply_ctx_info = ApplyCtxInfo { + pending_handle_ssts: &mut pending_handle_ssts, + delete_ssts: &mut ctx.delete_ssts, + pending_delete_ssts: &mut ctx.pending_delete_ssts, + }; let should_write = ctx.host.post_exec( &self.region, &cmd, @@ -1330,18 +1352,25 @@ where &RegionState { peer_id: self.id(), pending_remove: self.pending_remove, - modified_region: match exec_result { - ApplyResult::Res(ref e) => match e { - ExecResult::SplitRegion { ref derived, .. } => Some(derived.clone()), - ExecResult::PrepareMerge { ref region, .. } => Some(region.clone()), - ExecResult::CommitMerge { ref region, .. } => Some(region.clone()), - ExecResult::RollbackMerge { ref region, .. 
} => Some(region.clone()), - _ => None, - }, - _ => None, - }, + modified_region, }, + &mut apply_ctx_info, ); + match pending_handle_ssts { + None => (), + Some(mut v) => { + if !v.is_empty() { + // All elements in `pending_handle_ssts` should be moved into either + // `delete_ssts` or `pending_delete_ssts`, once handled by by any of the + // `post_exec` observers. So a non-empty + // `pending_handle_ssts` here indicates no `post_exec` handled. + ctx.delete_ssts.append(&mut v); + } + RAFT_APPLYING_SST_GAUGE + .with_label_values(&["pending_delete"]) + .set(ctx.pending_delete_ssts.len() as i64); + } + } if let ApplyResult::Res(ref exec_result) = exec_result { match *exec_result { @@ -1564,7 +1593,6 @@ where }; dont_delete_ingested_sst_fp(); } - ctx.delete_ssts.append(&mut ssts.clone()); ApplyResult::Res(ExecResult::IngestSst { ssts }) } else { ApplyResult::None @@ -4967,6 +4995,10 @@ mod tests { cmd_sink: Option>>>, filter_compact_log: Arc, filter_consistency_check: Arc, + delay_remove_ssts: Arc, + last_delete_sst_count: Arc, + last_pending_delete_sst_count: Arc, + last_pending_handle_sst_count: Arc, } impl Coprocessor for ApplyObserver {} @@ -4979,6 +5011,43 @@ mod tests { fn post_apply_query(&self, _: &mut ObserverContext<'_>, _: &Cmd) { self.post_query_count.fetch_add(1, Ordering::SeqCst); } + + fn post_exec_query( + &self, + _: &mut ObserverContext<'_>, + _: &Cmd, + _: &RaftApplyState, + _: &RegionState, + apply_info: &mut ApplyCtxInfo<'_>, + ) -> bool { + match apply_info.pending_handle_ssts { + Some(v) => { + // If it is a ingest sst + let mut ssts = std::mem::take(v); + assert_ne!(ssts.len(), 0); + if self.delay_remove_ssts.load(Ordering::SeqCst) { + apply_info.pending_delete_ssts.append(&mut ssts); + } else { + apply_info.delete_ssts.append(&mut ssts); + } + } + None => (), + } + self.last_delete_sst_count + .store(apply_info.delete_ssts.len() as u64, Ordering::SeqCst); + self.last_pending_delete_sst_count.store( + apply_info.pending_delete_ssts.len() as 
u64, + Ordering::SeqCst, + ); + self.last_pending_handle_sst_count.store( + match apply_info.pending_handle_ssts { + Some(ref v) => v.len() as u64, + None => 0, + }, + Ordering::SeqCst, + ); + false + } } impl AdminObserver for ApplyObserver { @@ -4988,6 +5057,7 @@ mod tests { cmd: &Cmd, _: &RaftApplyState, region_state: &RegionState, + _: &mut ApplyCtxInfo<'_>, ) -> bool { let request = cmd.request.get_admin_request(); match request.get_cmd_type() { @@ -5664,11 +5734,13 @@ mod tests { #[test] fn test_exec_observer() { let (_path, engine) = create_tmp_engine("test-exec-observer"); - let (_import_dir, importer) = create_tmp_importer("test-exec-observer"); + let (import_dir, importer) = create_tmp_importer("test-exec-observer"); let mut host = CoprocessorHost::::default(); let obs = ApplyObserver::default(); host.registry .register_admin_observer(1, BoxAdminObserver::new(obs.clone())); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs.clone())); let (tx, rx) = mpsc::channel(); let (region_scheduler, _) = dummy_scheduler(); @@ -5682,7 +5754,7 @@ mod tests { sender, region_scheduler, coprocessor_host: host, - importer, + importer: importer.clone(), engine: engine.clone(), router: router.clone(), store_id: 1, @@ -5783,7 +5855,7 @@ mod tests { let apply_res = fetch_apply_res(&rx); assert_eq!(apply_res.apply_state.get_applied_index(), index_id); assert_eq!(apply_res.applied_term, 1); - let (_, r8) = if let ExecResult::SplitRegion { + let (r1, r8) = if let ExecResult::SplitRegion { regions, derived: _, new_split_regions: _, @@ -5814,6 +5886,48 @@ mod tests { .unwrap_or_default(); assert_eq!(apply_res.apply_state, state); + // Phase 3: we test if we can delay deletion of some sst files. 
+ let r1_epoch = r1.get_region_epoch(); + index_id += 1; + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"2")]; + let sst_path = import_dir.path().join("test.sst"); + let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + meta.set_region_id(1); + meta.set_region_epoch(r1_epoch.clone()); + let mut file = importer.create(&meta).unwrap(); + file.append(&data).unwrap(); + file.finish().unwrap(); + let src = sst_path.clone(); + let dst = file.get_import_path().save.to_str().unwrap(); + std::fs::copy(src, dst).unwrap(); + assert!(sst_path.as_path().exists()); + let ingestsst = EntryBuilder::new(index_id, 1) + .ingest_sst(&meta) + .epoch(r1_epoch.get_conf_ver(), r1_epoch.get_version()) + .build(); + + obs.delay_remove_ssts.store(true, Ordering::SeqCst); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![ingestsst], vec![]))); + fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!(obs.last_pending_handle_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_delete_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_pending_delete_sst_count.load(Ordering::SeqCst), 1); + + index_id += 1; + let ingestsst = EntryBuilder::new(index_id, 1) + .ingest_sst(&meta) + .epoch(r1_epoch.get_conf_ver(), r1_epoch.get_version()) + .build(); + obs.delay_remove_ssts.store(false, Ordering::SeqCst); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![ingestsst], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.exec_res.len(), 1); + assert_eq!(obs.last_pending_handle_sst_count.load(Ordering::SeqCst), 0); + assert_eq!(obs.last_delete_sst_count.load(Ordering::SeqCst), 1); + assert_eq!(obs.last_pending_delete_sst_count.load(Ordering::SeqCst), 1); + system.shutdown(); } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 9691d5be0db..587b9ad3af7 100644 --- a/components/raftstore/src/store/metrics.rs +++ 
b/components/raftstore/src/store/metrics.rs @@ -777,4 +777,10 @@ lazy_static! { .unwrap(); pub static ref RAFT_LOG_GC_SKIPPED: RaftLogGcSkippedVec = auto_flush_from!(RAFT_LOG_GC_SKIPPED_VEC, RaftLogGcSkippedVec); + + pub static ref RAFT_APPLYING_SST_GAUGE: IntGaugeVec = register_int_gauge_vec!( + "tikv_raft_applying_sst", + "Sum of applying sst.", + &["type"] + ).unwrap(); } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index f5292b70075..c4a0498a9a6 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -183,6 +183,10 @@ impl ImportFile { } Ok(()) } + + pub fn get_import_path(&self) -> &ImportPath { + &self.path + } } impl fmt::Debug for ImportFile { From 117805f8ebf9eede138ce7a7345445cac251b45d Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 16 Aug 2022 16:04:52 +0800 Subject: [PATCH 157/676] server: make tablet cache a local variable (#13273) ref tikv/tikv#13214 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/server/src/server.rs | 123 ++++++++++++++++++++++++++++---- 1 file changed, 108 insertions(+), 15 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e925a663943..f61d981a912 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -45,7 +45,7 @@ use engine_rocks::{ use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, - TabletAccessor, TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, + TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -98,8 +98,8 @@ use tikv::{ service::{DebugService, DiagnosticsService}, status_server::StatusServer, ttl::TtlChecker, - KvEngineFactory, KvEngineFactoryBuilder, Node, RaftKv, Server, 
CPU_CORES_QUOTA_GAUGE, - DEFAULT_CLUSTER_ID, GRPC_THREAD_PREFIX, + KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, + GRPC_THREAD_PREFIX, }, storage::{ self, @@ -1280,12 +1280,19 @@ where ); let mut io_metrics = IoMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); + + // region_id -> (suffix, tablet) + // `update` of EnginesResourceInfo is called perodically which needs this map + // for recording the latest tablet for each region. + // `cached_latest_tablets` is passed to `update` to avoid memory + // allocation each time when calling `update`. + let mut cached_latest_tablets: HashMap = HashMap::new(); self.background_worker .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { let now = Instant::now(); engine_metrics.flush(now); io_metrics.flush(now); - engines_info_clone.update(now); + engines_info_clone.update(now, &mut cached_latest_tablets); }); if let Some(limiter) = get_io_rate_limiter() { limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); @@ -1844,13 +1851,8 @@ impl EngineMetricsManager { } pub struct EnginesResourceInfo { - tablet_factory: Arc, + tablet_factory: Arc + Sync + Send>, raft_engine: Option, - // region_id -> (suffix, tablet) - // `update` is called perodically which needs this map for recording the latest tablet for each - // region and cached_latest_tablets is used to avoid memory allocation each time when - // calling `update`. 
- cached_latest_tablets: Arc>>, latest_normalized_pending_bytes: AtomicU32, normalized_pending_bytes_collector: MovingAvgU32, } @@ -1859,20 +1861,23 @@ impl EnginesResourceInfo { const SCALE_FACTOR: u64 = 100; fn new( - tablet_factory: Arc, + tablet_factory: Arc + Sync + Send>, raft_engine: Option, max_samples_to_preserve: usize, ) -> Self { EnginesResourceInfo { tablet_factory, raft_engine, - cached_latest_tablets: Arc::default(), latest_normalized_pending_bytes: AtomicU32::new(0), normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), } } - pub fn update(&self, _now: Instant) { + pub fn update( + &self, + _now: Instant, + cached_latest_tablets: &mut HashMap, + ) { let mut normalized_pending_bytes = 0; fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { @@ -1894,8 +1899,6 @@ impl EnginesResourceInfo { fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); } - let mut cached_latest_tablets = self.cached_latest_tablets.as_ref().lock().unwrap(); - self.tablet_factory .for_each_opened_tablet( &mut |id, suffix, db: &RocksEngine| match cached_latest_tablets.entry(id) { @@ -1955,3 +1958,93 @@ impl IoBudgetAdjustor for EnginesResourceInfo { (total_budgets as f32 * score) as usize } } + +#[cfg(test)] +mod test { + use std::{ + collections::HashMap, + sync::{atomic::Ordering, Arc}, + }; + + use engine_rocks::{raw::Env, RocksEngine}; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, OpenOptions, SyncMutable, TabletFactory, CF_DEFAULT, + }; + use tempfile::Builder; + use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; + use tikv_util::{config::ReadableSize, time::Instant}; + + use super::EnginesResourceInfo; + + #[test] + fn test_engines_resource_info_update() { + let mut config = TikvConfig::default(); + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + 
config.rocksdb.writecf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + let env = Arc::new(Env::default()); + let path = Builder::new().prefix("test-update").tempdir().unwrap(); + + let builder = KvEngineFactoryBuilder::new(env, &config, path.path()); + let factory = builder.build_v2(); + + for i in 1..6 { + let _ = factory + .open_tablet(i, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + } + + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) + .unwrap(); + // Prepare some data for two tablets of the same region. So we can test whether + // we fetch the bytes from the latest one. + for i in 1..21 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let old_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + let tablet = factory + .open_tablet(1, Some(20), OpenOptions::default().set_create_new(true)) + .unwrap(); + + for i in 1..11 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let new_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); + + let engines_info = Arc::new(EnginesResourceInfo::new(Arc::new(factory), None, 10)); + + let mut cached_latest_tablets: HashMap = HashMap::new(); + engines_info.update(Instant::now(), &mut cached_latest_tablets); + + // The memory allocation should be reserved + assert!(cached_latest_tablets.capacity() >= 5); + // The tablet cache should be cleared + assert!(cached_latest_tablets.is_empty()); + + // The latest_normalized_pending_bytes should be equal to the pending compaction + // bytes of tablet_1_20 + assert_eq!( + 
(new_pending_compaction_bytes * 100) as u32, + engines_info + .latest_normalized_pending_bytes + .load(Ordering::Relaxed) + ); + } +} From 73c5d13455e0f07e5b38dd5de2d346afa2f1ddef Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 16 Aug 2022 19:32:50 -0700 Subject: [PATCH 158/676] raftstore-v2: add ready processing (#13227) ref tikv/tikv#12842 This PR adds the basic ready processing for v2. Note test case can't be run for now as there is still API missing from raft engine. Compared to v1, v2 always use async raft. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/engine_test/src/lib.rs | 14 +- components/raft_log_engine/src/engine.rs | 25 +- components/raftstore-v2/Cargo.toml | 5 + components/raftstore-v2/src/batch/mod.rs | 2 +- components/raftstore-v2/src/batch/store.rs | 146 ++++++++- components/raftstore-v2/src/fsm/peer.rs | 126 +++++++- components/raftstore-v2/src/fsm/store.rs | 10 +- components/raftstore-v2/src/lib.rs | 15 +- components/raftstore-v2/src/operation/mod.rs | 3 + .../raftstore-v2/src/operation/read/local.rs | 6 +- .../src/operation/ready/async_writer.rs | 199 ++++++++++++ .../raftstore-v2/src/operation/ready/mod.rs | 304 ++++++++++++++++++ components/raftstore-v2/src/raft/apply.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 124 ++++++- components/raftstore-v2/src/raft/storage.rs | 144 ++++++--- components/raftstore-v2/src/router/imp.rs | 12 + components/raftstore-v2/src/router/message.rs | 5 +- components/raftstore-v2/src/router/mod.rs | 1 + .../raftstore-v2/tests/integrations/mod.rs | 174 ++++++++++ .../tests/integrations/test_election.rs | 10 + .../raftstore/src/store/async_io/write.rs | 5 +- .../src/store/async_io/write_router.rs | 72 +++-- .../src/store/async_io/write_tests.rs | 4 +- components/raftstore/src/store/fsm/peer.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 23 +- components/raftstore/src/store/mod.rs | 12 +- components/raftstore/src/store/msg.rs | 11 +- 
.../raftstore/src/store/peer_storage.rs | 49 +-- components/raftstore/src/store/transport.rs | 9 + components/raftstore/src/store/worker/mod.rs | 4 +- .../src/store/worker/raftlog_fetch.rs | 42 +-- 32 files changed, 1346 insertions(+), 217 deletions(-) create mode 100644 components/raftstore-v2/src/operation/ready/async_writer.rs create mode 100644 components/raftstore-v2/src/operation/ready/mod.rs create mode 100644 components/raftstore-v2/src/router/imp.rs create mode 100644 components/raftstore-v2/tests/integrations/mod.rs create mode 100644 components/raftstore-v2/tests/integrations/test_election.rs diff --git a/Cargo.lock b/Cargo.lock index 1efe0607541..93ac7ddd600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4201,6 +4201,7 @@ dependencies = [ "kvproto", "log_wrappers", "pd_client", + "protobuf", "raft", "raft-proto", "raftstore", diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 18d89b1c2fb..bc723dbb76a 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -113,7 +113,7 @@ pub mod kv { #[derive(Clone)] pub struct TestTabletFactory { - root_path: String, + root_path: PathBuf, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, root_db: Arc>>, @@ -121,12 +121,12 @@ pub mod kv { impl TestTabletFactory { pub fn new( - root_path: &str, + root_path: &Path, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, ) -> Self { Self { - root_path: root_path.to_string(), + root_path: root_path.to_path_buf(), db_opt, cf_opts, root_db: Arc::new(Mutex::default()), @@ -230,7 +230,7 @@ pub mod kv { impl TestTabletFactoryV2 { pub fn new( - root_path: &str, + root_path: &Path, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, ) -> Self { @@ -342,12 +342,14 @@ pub mod kv { #[inline] fn tablets_path(&self) -> PathBuf { - Path::new(&self.inner.root_path).join("tablets") + self.inner.root_path.join("tablets") } #[inline] fn tablet_path(&self, id: u64, suffix: u64) -> 
PathBuf { - Path::new(&self.inner.root_path).join(format!("tablets/{}_{}", id, suffix)) + self.inner + .root_path + .join(format!("tablets/{}_{}", id, suffix)) } #[inline] diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index dd7c222845c..8991a6f6838 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -28,8 +28,8 @@ use tikv_util::Either; use crate::perf_context::RaftEnginePerfContext; -// A special region ID representing global state. -const STORE_REGION_ID: u64 = 0; +// A special region ID representing store state. +const STORE_STATE_ID: u64 = 0; #[derive(Clone)] pub struct MessageExtTyped; @@ -377,14 +377,14 @@ impl RaftLogBatchTrait for RaftLogBatch { fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { self.0 - .put_message(STORE_REGION_ID, STORE_IDENT_KEY.to_vec(), ident) + .put_message(STORE_STATE_ID, STORE_IDENT_KEY.to_vec(), ident) .map_err(transfer_error) } fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { self.0 .put_message( - STORE_REGION_ID, + STORE_STATE_ID, PREPARE_BOOTSTRAP_REGION_KEY.to_vec(), region, ) @@ -393,7 +393,7 @@ impl RaftLogBatchTrait for RaftLogBatch { fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { self.0 - .delete(STORE_REGION_ID, PREPARE_BOOTSTRAP_REGION_KEY.to_vec()); + .delete(STORE_STATE_ID, PREPARE_BOOTSTRAP_REGION_KEY.to_vec()); Ok(()) } @@ -451,13 +451,13 @@ impl RaftEngineReadOnly for RaftLogEngine { fn get_store_ident(&self) -> Result> { self.0 - .get_message(STORE_REGION_ID, STORE_IDENT_KEY) + .get_message(STORE_STATE_ID, STORE_IDENT_KEY) .map_err(transfer_error) } fn get_prepare_bootstrap_region(&self) -> Result> { self.0 - .get_message(STORE_REGION_ID, PREPARE_BOOTSTRAP_REGION_KEY) + .get_message(STORE_STATE_ID, PREPARE_BOOTSTRAP_REGION_KEY) .map_err(transfer_error) } @@ -541,7 +541,7 @@ impl RaftEngine for RaftLogEngine { let mut batch = Self::LogBatch::default(); batch 
.0 - .put_message(STORE_REGION_ID, STORE_IDENT_KEY.to_vec(), ident) + .put_message(STORE_STATE_ID, STORE_IDENT_KEY.to_vec(), ident) .map_err(transfer_error)?; self.0.write(&mut batch.0, true).map_err(transfer_error)?; Ok(()) @@ -605,12 +605,17 @@ impl RaftEngine for RaftLogEngine { Ok(self.0.get_used_size() as u64) } - fn for_each_raft_group(&self, _f: &mut F) -> std::result::Result<(), E> + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, E: From, { - unimplemented!() + for id in self.0.raft_groups() { + if id != STORE_STATE_ID { + f(id)?; + } + } + Ok(()) } } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 8551864a444..29e68517441 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -40,6 +40,7 @@ keys = { path = "../keys", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } log_wrappers = { path = "../log_wrappers" } pd_client = { path = "../pd_client" } +protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { path = "../raftstore" } @@ -60,3 +61,7 @@ test_util = { path = "../test_util" } name = "raftstore-v2-failpoints" path = "tests/failpoints/mod.rs" required-features = ["failpoints"] + +[[test]] +name = "raftstore-v2-integrations" +path = "tests/integrations/mod.rs" diff --git a/components/raftstore-v2/src/batch/mod.rs b/components/raftstore-v2/src/batch/mod.rs index 0f4b9fba3d3..7e00932d1e1 100644 --- a/components/raftstore-v2/src/batch/mod.rs +++ b/components/raftstore-v2/src/batch/mod.rs @@ -9,4 +9,4 @@ mod apply; mod store; pub(crate) use apply::ApplyContext; -pub use store::{create_store_batch_system, StoreContext, StoreSystem}; +pub use store::{create_store_batch_system, StoreContext, StoreRouter, StoreSystem}; diff --git 
a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 9c1f60ba947..739240f84e0 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,15 +1,25 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{mem, ops::DerefMut, sync::Arc, time::Duration}; +use std::{ + mem, + ops::{Deref, DerefMut}, + sync::{atomic::AtomicUsize, Arc}, + time::Duration, +}; use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, }; use collections::HashMap; +use crossbeam::channel::Sender; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; use futures_util::{compat::Future01CompatExt, FutureExt}; use kvproto::{metapb::Store, raft_serverpb::PeerState}; -use raftstore::store::{fsm::store::PeerTickBatch, Config, Transport}; +use raft::INVALID_ID; +use raftstore::store::{ + fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, RaftlogFetchRunner, + RaftlogFetchTask, StoreWriters, Transport, WriteMsg, WriteSenders, +}; use slog::Logger; use tikv_util::{ box_err, @@ -17,6 +27,7 @@ use tikv_util::{ future::poll_future_notify, time::Instant as TiInstant, timer::SteadyTimer, + worker::{Scheduler, Worker}, }; use super::apply::{create_apply_batch_system, ApplyPollerBuilder, ApplyRouter, ApplySystem}; @@ -27,28 +38,42 @@ use crate::{ }; /// A per-thread context shared by the [`StoreFsm`] and multiple [`PeerFsm`]s. -pub struct StoreContext { - /// A logger without any KV. It's clean for creating new PeerFsm. +pub struct StoreContext { + /// A logger without any KV. It's clean for creating new PeerFSM. pub logger: Logger, /// The transport for sending messages to peers on other stores. pub trans: T, + pub has_ready: bool, + pub raft_metrics: RaftMetrics, /// The latest configuration. pub cfg: Config, + pub router: StoreRouter, /// The tick batch for delay ticking. It will be flushed at the end of every /// round. 
pub tick_batch: Vec, /// The precise timer for scheduling tick. pub timer: SteadyTimer, + pub write_senders: WriteSenders, } -impl StoreContext { - fn new(cfg: Config, trans: T, logger: Logger) -> Self { +impl StoreContext { + fn new( + cfg: Config, + trans: T, + router: StoreRouter, + write_senders: WriteSenders, + logger: Logger, + ) -> Self { Self { logger, trans, + has_ready: false, + raft_metrics: RaftMetrics::new(cfg.waterfall_metrics), cfg, + router, tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], timer: SteadyTimer::default(), + write_senders, } } } @@ -59,8 +84,8 @@ impl StoreContext { /// /// - Keeping the local [`StoreContext`] up-to-date. /// - Receiving and sending messages in and out of these FSMs. -struct StorePoller { - poll_ctx: StoreContext, +struct StorePoller { + poll_ctx: StoreContext, cfg_tracker: Tracker, /// Buffers to hold in-coming messages. store_msg_buf: Vec, @@ -71,8 +96,8 @@ struct StorePoller { need_flush_events: bool, } -impl StorePoller { - pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { +impl StorePoller { + pub fn new(poll_ctx: StoreContext, cfg_tracker: Tracker) -> Self { Self { poll_ctx, cfg_tracker, @@ -108,7 +133,7 @@ impl StorePoller { } impl PollHandler, StoreFsm> - for StorePoller + for StorePoller { fn begin(&mut self, _batch_size: usize, update_cfg: F) where @@ -147,7 +172,11 @@ impl PollHandler { engine: ER, tablet_factory: Arc>, trans: T, + router: StoreRouter, + log_fetch_scheduler: Scheduler, + write_senders: WriteSenders, logger: Logger, } @@ -197,6 +229,9 @@ impl StorePollerBuilder { engine: ER, tablet_factory: Arc>, trans: T, + router: StoreRouter, + log_fetch_scheduler: Scheduler, + store_writers: &mut StoreWriters, logger: Logger, ) -> Self { StorePollerBuilder { @@ -205,7 +240,10 @@ impl StorePollerBuilder { engine, tablet_factory, trans, + router, + log_fetch_scheduler, logger, + write_senders: store_writers.senders(), } } @@ -215,12 +253,14 @@ impl StorePollerBuilder { 
let cfg = self.cfg.value(); self.engine .for_each_raft_group::(&mut |region_id| { + assert_ne!(region_id, INVALID_ID); let peer = match Peer::new( &cfg, region_id, self.store_id, self.tablet_factory.as_ref(), self.engine.clone(), + self.log_fetch_scheduler.clone(), &self.logger, )? { Some(peer) => peer, @@ -254,12 +294,14 @@ where EK: KvEngine, T: Transport + 'static, { - type Handler = StorePoller; + type Handler = StorePoller; fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { let poll_ctx = StoreContext::new( self.cfg.value().clone(), self.trans.clone(), + self.router.clone(), + self.write_senders.clone(), self.logger.clone(), ); let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); @@ -267,11 +309,29 @@ where } } +/// A set of background threads that will processing offloaded work from +/// raftstore. +struct Workers { + /// Worker for fetching raft logs asynchronously + log_fetch_worker: Worker, + store_writers: StoreWriters, +} + +impl Default for Workers { + fn default() -> Self { + Self { + log_fetch_worker: Worker::new("raftlog-fetch-worker"), + store_writers: StoreWriters::default(), + } + } +} + /// The system used for polling Raft activities. 
pub struct StoreSystem { system: BatchSystem, StoreFsm>, apply_router: ApplyRouter, apply_system: ApplySystem, + workers: Option>, logger: Logger, } @@ -288,14 +348,32 @@ impl StoreSystem { where T: Transport + 'static, { + let mut workers = Workers::default(); + workers.store_writers.spawn( + store.get_id(), + raft_engine.clone(), + None, + router, + &trans, + &cfg, + )?; + let log_fetch_scheduler = workers.log_fetch_worker.start( + "raftlog-fetch-worker", + RaftlogFetchRunner::new(router.clone(), raft_engine.clone()), + ); + let mut builder = StorePollerBuilder::new( cfg.clone(), store.get_id(), raft_engine, tablet_factory, trans, + router.clone(), + log_fetch_scheduler, + &mut workers.store_writers, self.logger.clone(), ); + self.workers = Some(workers); let peers = builder.init()?; self.apply_system .schedule_all(peers.values().map(|pair| pair.1.peer())); @@ -328,12 +406,47 @@ impl StoreSystem { } pub fn shutdown(&mut self) { + if self.workers.is_none() { + return; + } + let mut workers = self.workers.take().unwrap(); + self.apply_system.shutdown(); self.system.shutdown(); + + workers.store_writers.shutdown(); + workers.log_fetch_worker.stop(); + } +} + +#[derive(Clone)] +pub struct StoreRouter { + router: BatchRouter, StoreFsm>, + logger: Logger, +} + +impl StoreRouter { + #[inline] + pub fn logger(&self) -> &Logger { + &self.logger } } -pub type StoreRouter = BatchRouter, StoreFsm>; +impl Deref for StoreRouter { + type Target = BatchRouter, StoreFsm>; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.router + } +} + +impl DerefMut for StoreRouter { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.router + } +} /// Creates the batch system for polling raft activities. 
pub fn create_store_batch_system( @@ -353,7 +466,8 @@ where system, apply_router, apply_system, - logger, + workers: None, + logger: logger.clone(), }; - (router, system) + (StoreRouter { router, logger }, system) } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index a8fb67aa121..696a1e5ddf4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -1,37 +1,43 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains the peer implementation for batch system. + use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine}; use kvproto::metapb; -use raftstore::store::Config; -use slog::{info, Logger}; -use tikv_util::mpsc::{self, LooseBoundedSender, Receiver, Sender}; +use raftstore::store::{Config, Transport}; +use slog::{debug, error, info, trace, Logger}; +use tikv_util::{ + is_zero_duration, + mpsc::{self, LooseBoundedSender, Receiver, Sender}, +}; -use crate::{batch::StoreContext, raft::Peer, PeerMsg, Result}; +use crate::{batch::StoreContext, raft::Peer, PeerMsg, PeerTick, Result}; pub type SenderFsmPair = (LooseBoundedSender>, Box>); pub struct PeerFsm { peer: Peer, - logger: Logger, mailbox: Option>>, receiver: Receiver>, + /// A registry for all scheduled ticks. This can avoid scheduling ticks + /// twice accidentally. 
+ tick_registry: u16, is_stopped: bool, } impl PeerFsm { pub fn new(cfg: &Config, peer: Peer) -> Result> { - let logger = peer.logger().clone(); - info!(logger, "create peer"); + info!(peer.logger, "create peer"); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { - logger, peer, mailbox: None, receiver: rx, + tick_registry: 0, is_stopped: false, }); Ok((tx, fsm)) @@ -42,9 +48,14 @@ impl PeerFsm { &self.peer } + #[inline] + pub fn peer_mut(&mut self) -> &mut Peer { + &mut self.peer + } + #[inline] pub fn logger(&self) -> &Logger { - self.peer.logger() + &self.peer.logger } /// Fetches messages to `peer_msg_buf`. It will stop when the buffer @@ -95,18 +106,103 @@ impl Fsm for PeerFsm { } pub struct PeerFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { - fsm: &'a mut PeerFsm, - store_ctx: &'a mut StoreContext, + pub fsm: &'a mut PeerFsm, + pub store_ctx: &'a mut StoreContext, } -impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { - pub fn new(fsm: &'a mut PeerFsm, store_ctx: &'a mut StoreContext) -> Self { +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + pub fn new(fsm: &'a mut PeerFsm, store_ctx: &'a mut StoreContext) -> Self { Self { fsm, store_ctx } } - pub fn handle_msgs(&self, peer_msgs_buf: &mut Vec>) { + pub fn schedule_tick(&mut self, tick: PeerTick) { + assert!(PeerTick::VARIANT_COUNT <= u16::BITS as usize); + let idx = tick as usize; + let key = 1u16 << (idx as u16); + if self.fsm.tick_registry & key != 0 { + return; + } + if is_zero_duration(&self.store_ctx.tick_batch[idx].wait_duration) { + return; + } + trace!( + self.fsm.logger(), + "schedule tick"; + "tick" => ?tick, + "timeout" => ?self.store_ctx.tick_batch[idx].wait_duration, + ); + + let region_id = self.fsm.peer.region_id(); + let mb = match self.store_ctx.router.mailbox(region_id) { + Some(mb) => mb, + None => { + error!( + self.fsm.logger(), + "failed to get mailbox"; + "tick" => ?tick, + ); + 
return; + } + }; + self.fsm.tick_registry |= key; + let logger = self.fsm.logger().clone(); + // TODO: perhaps following allocation can be removed. + let cb = Box::new(move || { + // This can happen only when the peer is about to be destroyed + // or the node is shutting down. So it's OK to not to clean up + // registry. + if let Err(e) = mb.force_send(PeerMsg::Tick(tick)) { + debug!( + logger, + "failed to schedule peer tick"; + "tick" => ?tick, + "err" => %e, + ); + } + }); + self.store_ctx.tick_batch[idx].ticks.push(cb); + } + + fn on_start(&mut self) { + self.schedule_tick(PeerTick::Raft); + } + + fn on_tick(&mut self, tick: PeerTick) { + match tick { + PeerTick::Raft => self.on_raft_tick(), + PeerTick::RaftLogGc => unimplemented!(), + PeerTick::SplitRegionCheck => unimplemented!(), + PeerTick::PdHeartbeat => unimplemented!(), + PeerTick::CheckMerge => unimplemented!(), + PeerTick::CheckPeerStaleState => unimplemented!(), + PeerTick::EntryCacheEvict => unimplemented!(), + PeerTick::CheckLeaderLease => unimplemented!(), + PeerTick::ReactivateMemoryLock => unimplemented!(), + PeerTick::ReportBuckets => unimplemented!(), + PeerTick::CheckLongUncommitted => unimplemented!(), + } + } + + pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec>) { for msg in peer_msgs_buf.drain(..) { - // TODO: handle the messages. 
+ match msg { + PeerMsg::RaftMessage(_) => unimplemented!(), + PeerMsg::RaftCommand(_) => unimplemented!(), + PeerMsg::Tick(tick) => self.on_tick(tick), + PeerMsg::ApplyRes { res } => unimplemented!(), + PeerMsg::Start => self.on_start(), + PeerMsg::Noop => unimplemented!(), + PeerMsg::Persisted { + peer_id, + ready_number, + } => self + .fsm + .peer_mut() + .on_persisted(self.store_ctx, peer_id, ready_number), + PeerMsg::FetchedLogs(fetched_logs) => { + self.fsm.peer_mut().on_fetched_logs(fetched_logs) + } + } } } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 886478a3036..d80cd90d80b 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -3,7 +3,7 @@ use batch_system::Fsm; use collections::HashMap; use crossbeam::channel::TryRecvError; -use engine_traits::KvEngine; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::metapb::Store; use raftstore::store::{Config, ReadDelegate}; use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; @@ -74,13 +74,13 @@ impl Fsm for StoreFsm { } } -pub struct StoreFsmDelegate<'a, T> { +pub struct StoreFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { fsm: &'a mut StoreFsm, - store_ctx: &'a mut StoreContext, + store_ctx: &'a mut StoreContext, } -impl<'a, T> StoreFsmDelegate<'a, T> { - pub fn new(fsm: &'a mut StoreFsm, store_ctx: &'a mut StoreContext) -> Self { +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + pub fn new(fsm: &'a mut StoreFsm, store_ctx: &'a mut StoreContext) -> Self { Self { fsm, store_ctx } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 0739cd61cb7..43998160638 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -9,7 +9,20 @@ //! [`operation`] module. All state machines are expected to communicate with //! messages. They are defined in [`router`] module. 
+// You may get confused about the peer, or other structs like apply, in fsm and +// peer in raft module. The guideline is that if any field doesn't depend on +// the details of batch system, then it should be defined for peer in raft +// module. +// +// If we change to other concurrent programming solution, we can easily just +// change the peer in fsm. +// +// Any accessors should be defined in the file where the struct is defined. +// Functionalities like read, write, etc should be implemented in [`operation`] +// using a standalone modules. + #![allow(unused)] +#![feature(let_else)] mod batch; mod bootstrap; @@ -20,7 +33,7 @@ mod router; mod tablet; pub(crate) use batch::StoreContext; -pub use batch::{create_store_batch_system, StoreSystem}; +pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use raftstore::{Error, Result}; pub use router::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 8c427378da3..583053dd551 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -1,3 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
mod read; +mod ready; + +pub use ready::AsyncWriter; diff --git a/components/raftstore-v2/src/operation/read/local.rs b/components/raftstore-v2/src/operation/read/local.rs index 2e694f11ebc..6601477f8c3 100644 --- a/components/raftstore-v2/src/operation/read/local.rs +++ b/components/raftstore-v2/src/operation/read/local.rs @@ -190,11 +190,7 @@ mod tests { .prefix("test-local-reader") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path().to_str().unwrap(), - ops, - cf_opts, - )); + let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::::new()))); diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs new file mode 100644 index 00000000000..457df9307ba --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -0,0 +1,199 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::VecDeque, + sync::{atomic::AtomicUsize, Arc}, +}; + +use crossbeam::channel::Sender; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_serverpb::RaftMessage; +use raftstore::store::{ + local_metrics::RaftMetrics, Config, PersistedNotifier, WriteMsg, WriteRouter, + WriteRouterContext, WriteSenders, WriteTask, +}; +use slog::{warn, Logger}; + +use crate::{ + batch::{StoreContext, StoreRouter}, + PeerMsg, +}; + +#[derive(Debug)] +struct UnpersistedReady { + /// Number of ready. + number: u64, + /// Max number of following ready whose data to be persisted is empty. + max_empty_number: u64, + raft_msgs: Vec>, +} + +/// A writer that handles asynchronous writes. 
+pub struct AsyncWriter { + write_router: WriteRouter, + unpersisted_readies: VecDeque, + persisted_number: u64, +} + +impl AsyncWriter { + pub fn new(region_id: u64, peer_id: u64) -> Self { + let write_router = WriteRouter::new(format!("[region {}] {}", region_id, peer_id)); + Self { + write_router, + unpersisted_readies: VecDeque::new(), + persisted_number: 0, + } + } + + /// Execute the task. + /// + /// If the task takes some time to finish, `None` is returned. Otherwise, + pub fn write( + &mut self, + ctx: &mut impl WriteRouterContext, + task: WriteTask, + ) -> Option> { + if task.has_data() { + self.send(ctx, task); + None + } else { + self.merge(task) + } + } + + pub fn known_largest_number(&self) -> u64 { + self.unpersisted_readies + .back() + .map(|r| r.number) + .unwrap_or(self.persisted_number) + } + + fn send(&mut self, ctx: &mut impl WriteRouterContext, task: WriteTask) { + let ready_number = task.ready_number(); + self.write_router.send_write_msg( + ctx, + self.unpersisted_readies.back().map(|r| r.number), + raftstore::store::WriteMsg::WriteTask(task), + ); + self.unpersisted_readies.push_back(UnpersistedReady { + number: ready_number, + max_empty_number: ready_number, + raft_msgs: vec![], + }); + } + + fn merge(&mut self, task: WriteTask) -> Option> { + let ready_number = task.ready_number(); + if self.unpersisted_readies.is_empty() { + // If this ready don't need to be persisted and there is no previous unpersisted + // ready, we can safely consider it is persisted so the persisted msgs can be + // sent immediately. + self.persisted_number = task.ready_number(); + return Some(task); + } + + // Attach to the last unpersisted ready so that it can be considered to be + // persisted with the last ready at the same time. 
+ let last = self.unpersisted_readies.back_mut().unwrap(); + last.max_empty_number = task.ready_number(); + if !task.messages.is_empty() { + last.raft_msgs.push(task.messages); + } + None + } + + /// Called when an asynchronous write has finished. + pub fn on_persisted( + &mut self, + ctx: &mut impl WriteRouterContext, + ready_number: u64, + logger: &Logger, + ) -> Vec> { + if self.persisted_number >= ready_number { + return vec![]; + } + + let last_unpersisted = self.unpersisted_readies.back(); + if last_unpersisted.map_or(true, |u| u.number < ready_number) { + panic!( + "{:?} ready number is too large {:?} vs {}", + logger.list(), + last_unpersisted, + ready_number + ); + } + + let mut raft_messages = vec![]; + // There must be a match in `self.unpersisted_readies`. + loop { + let Some(v) = self.unpersisted_readies.pop_front() else { + panic!("{:?} ready number not found {}", logger.list(), ready_number); + }; + if v.number > ready_number { + panic!( + "{:?} ready number not matched {:?} vs {}", + logger.list(), + v, + ready_number + ); + } + if raft_messages.is_empty() { + raft_messages = v.raft_msgs; + } else { + raft_messages.extend(v.raft_msgs); + } + if v.number == ready_number { + self.persisted_number = v.max_empty_number; + break; + } + } + + self.write_router + .check_new_persisted(ctx, self.persisted_number); + + raft_messages + } + + pub fn persisted_number(&self) -> u64 { + self.persisted_number + } +} + +impl WriteRouterContext for StoreContext +where + EK: KvEngine, + ER: RaftEngine, +{ + fn write_senders(&self) -> &WriteSenders { + &self.write_senders + } + + fn config(&self) -> &Config { + &self.cfg + } + + fn raft_metrics(&self) -> &RaftMetrics { + &self.raft_metrics + } +} + +impl PersistedNotifier for StoreRouter { + fn notify(&self, region_id: u64, peer_id: u64, ready_number: u64) { + if let Err(e) = self.force_send( + region_id, + PeerMsg::Persisted { + peer_id, + ready_number, + }, + ) { + warn!( + self.logger(), + "failed to send noop to 
trigger persisted ready"; + "region_id" => region_id, + "peer_id" => peer_id, + "ready_number" => ready_number, + "error" => ?e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs new file mode 100644 index 00000000000..668453e708b --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -0,0 +1,304 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains the actions that will drive a raft state machine. +//! +//! # Raft Ready +//! +//! Every messages or ticks may have side affect. Handling all those side +//! affect immediately is not efficient. Instead, tikv uses `Ready` to batch up +//! all the side affects and handle them at once for throughput. +//! +//! As raft store is the critical path in the whole system, we avoid most +//! blocking IO. So a typical processing is divided into two steps: +//! +//! - Handle raft ready to process the side affect and send IO tasks to +//! background threads +//! - Receive IO tasks completion and update the raft state machine +//! +//! There two steps can be processed concurrently. + +mod async_writer; + +use engine_traits::{KvEngine, RaftEngine}; +use error_code::ErrorCodeExt; +use kvproto::raft_serverpb::RaftMessage; +use protobuf::Message as _; +use raft::{eraftpb, Ready}; +use raftstore::store::{FetchedLogs, Transport, WriteTask}; +use slog::{debug, error, trace, warn}; + +pub use self::async_writer::AsyncWriter; +use crate::{ + batch::StoreContext, + fsm::{PeerFsm, PeerFsmDelegate}, + raft::{Peer, Storage}, + PeerTick, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + /// Raft relies on periodic ticks to keep the state machine sync with other + /// peers. 
+ pub fn on_raft_tick(&mut self) { + if self.fsm.peer_mut().tick() { + self.fsm.peer_mut().set_has_ready(); + } + self.schedule_tick(PeerTick::Raft); + } +} + +impl Peer { + #[inline] + fn tick(&mut self) -> bool { + self.raft_group_mut().tick() + } + + /// Callback for fetching logs asynchronously. + pub fn on_fetched_logs(&mut self, fetched_logs: FetchedLogs) { + let FetchedLogs { context, logs } = fetched_logs; + let low = logs.low; + if !self.is_leader() { + self.entry_storage_mut().clean_async_fetch_res(low); + return; + } + if self.term() != logs.term { + self.entry_storage_mut().clean_async_fetch_res(low); + } else { + self.entry_storage_mut() + .update_async_fetch_res(low, Some(logs)); + } + self.raft_group_mut().on_entries_fetched(context); + // clean the async fetch result immediately if not used to free memory + self.entry_storage_mut().update_async_fetch_res(low, None); + self.set_has_ready(); + } + + /// Partially filled a raft message that will be sent to other peer. + fn prepare_raft_message(&mut self) -> RaftMessage { + let mut raft_msg = RaftMessage::new(); + raft_msg.set_region_id(self.region().id); + raft_msg.set_from_peer(self.peer().clone()); + // set current epoch + let epoch = self.storage().region().get_region_epoch(); + let msg_epoch = raft_msg.mut_region_epoch(); + msg_epoch.set_version(epoch.get_version()); + msg_epoch.set_conf_ver(epoch.get_conf_ver()); + raft_msg + } + + /// Transform a message from raft lib to a message that can be sent to other + /// peers. + /// + /// If the recipient can't be found, `None` is returned. 
+ #[inline] + fn build_raft_message( + &mut self, + ctx: &mut StoreContext, + msg: eraftpb::Message, + ) -> Option { + let to_peer = match self.get_peer_from_cache(msg.to) { + Some(p) => p, + None => { + warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to); + return None; + } + }; + + let mut raft_msg = self.prepare_raft_message(); + + raft_msg.set_to_peer(to_peer); + if msg.from != self.peer().id { + debug!( + self.logger, + "redirecting message"; + "msg_type" => ?msg.get_msg_type(), + "from" => msg.get_from(), + "to" => msg.get_to(), + ); + } + raft_msg.set_message(msg); + Some(raft_msg) + } + + /// Send a message. + /// + /// The message is pushed into the send buffer, it may not be sent out until + /// transport is flushed explicitly. + fn send_raft_message( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let msg_type = msg.get_message().get_msg_type(); + let to_peer_id = msg.get_to_peer().get_id(); + let to_store_id = msg.get_to_peer().get_store_id(); + + trace!( + self.logger, + "send raft msg"; + "msg_type" => ?msg_type, + "msg_size" => msg.get_message().compute_size(), + "to" => to_peer_id, + ); + + match ctx.trans.send(msg) { + Ok(()) => ctx.raft_metrics.send_message.add(msg_type, true), + Err(e) => { + // We use metrics to observe failure on production. + debug!( + self.logger, + "failed to send msg to other peer"; + "target_peer_id" => to_peer_id, + "target_store_id" => to_store_id, + "err" => ?e, + "error_code" => %e.error_code(), + ); + // unreachable store + self.raft_group_mut().report_unreachable(to_peer_id); + ctx.raft_metrics.send_message.add(msg_type, false); + } + } + } + + fn handle_raft_committed_entries( + &self, + _ctx: &mut crate::batch::StoreContext, + _take_committed_entries: Vec, + ) { + unimplemented!() + } + + /// Processing the ready of raft. A detail description of how it's handled + /// can be found at https://docs.rs/raft/latest/raft/#processing-the-ready-state. 
+ /// + /// It's should be called at the end of every round of processing. Any + /// writes will be handled asynchronously, and be notified once writes + /// are persisted. + #[inline] + pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { + let has_ready = self.reset_has_ready(); + if !has_ready { + return; + } + ctx.has_ready = true; + + if !self.raft_group().has_ready() { + return; + } + + debug!(self.logger, "handle raft ready"); + + let mut ready = self.raft_group_mut().ready(); + // Update it after unstable entries pagination is introduced. + debug_assert!(ready.entries().last().map_or_else( + || true, + |entry| entry.index == self.raft_group().raft.raft_log.last_index() + )); + + if !ready.messages().is_empty() { + debug_assert!(self.is_leader()); + for msg in ready.take_messages() { + if let Some(msg) = self.build_raft_message(ctx, msg) { + self.send_raft_message(ctx, msg); + } + } + } + + if !ready.committed_entries().is_empty() { + self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); + } + + let ready_number = ready.number(); + let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.storage_mut() + .handle_raft_ready(&mut ready, &mut write_task); + if !ready.persisted_messages().is_empty() { + write_task.messages = ready + .take_persisted_messages() + .into_iter() + .flat_map(|m| self.build_raft_message(ctx, m)) + .collect(); + } + // Ready number should increase monotonically. + assert!(self.async_writer.known_largest_number() < ready.number()); + if let Some(task) = self.async_writer.write(ctx, write_task) { + // So the task doesn't need to be process asynchronously, directly advance. 
+ let mut light_rd = self.raft_group_mut().advance_append(ready); + if !task.messages.is_empty() { + for m in task.messages { + self.send_raft_message(ctx, m); + } + } + if !light_rd.messages().is_empty() || light_rd.commit_index().is_some() { + panic!( + "{:?} unexpected messages [{}] commit index [{:?}]", + self.logger.list(), + light_rd.messages().len(), + light_rd.commit_index() + ); + } + if !light_rd.committed_entries().is_empty() { + self.handle_raft_committed_entries(ctx, light_rd.take_committed_entries()); + } + } else { + // The task will be written asynchronously. Once it's persisted, it will be + // notified by `on_persisted`. + self.raft_group_mut().advance_append_async(ready); + } + + ctx.raft_metrics.ready.has_ready_region += 1; + } + + /// Called when an asynchronously write finishes. + pub fn on_persisted( + &mut self, + ctx: &mut StoreContext, + peer_id: u64, + ready_number: u64, + ) { + if peer_id != self.peer_id() { + error!(self.logger, "peer id not matched"; "persisted_peer_id" => peer_id, "persisted_number" => ready_number); + return; + } + let persisted_message = self + .async_writer + .on_persisted(ctx, ready_number, &self.logger); + for msgs in persisted_message { + for msg in msgs { + self.send_raft_message(ctx, msg); + } + } + let persisted_number = self.async_writer.persisted_number(); + self.raft_group_mut().on_persist_ready(persisted_number); + let persisted_index = self.raft_group().raft.raft_log.persisted; + self.storage_mut() + .entry_storage_mut() + .update_cache_persisted(persisted_index); + // We may need to check if there is persisted committed logs. + self.set_has_ready(); + } +} + +impl Storage { + /// Apply the ready to the storage. If there is any states need to be + /// persisted, it will be written to `write_task`. 
+ fn handle_raft_ready( + &mut self, + ready: &mut Ready, + write_task: &mut WriteTask, + ) { + let prev_raft_state = self.entry_storage().raft_state().clone(); + + // TODO: handle snapshot + + let entry_storage = self.entry_storage_mut(); + if !ready.entries().is_empty() { + entry_storage.append(ready.take_entries(), write_task); + } + if let Some(hs) = ready.hs() { + entry_storage.raft_state_mut().set_hard_state(hs.clone()); + } + if prev_raft_state != *entry_storage.raft_state() { + write_task.raft_state = Some(entry_storage.raft_state().clone()); + } + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 0c7abf52b58..09646965bda 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -17,7 +17,7 @@ impl Apply { pub fn new(peer: &Peer) -> Self { Apply { tablet: peer.tablet().clone(), - logger: peer.logger().clone(), + logger: peer.logger.clone(), } } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 70dccd284fa..eb61d744774 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,16 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; +use std::{collections::VecDeque, mem, sync::Arc}; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; -use raft::{RawNode, INVALID_ID}; -use raftstore::store::{util::find_peer, Config}; +use raft::{RawNode, StateRole, INVALID_ID}; +use raftstore::store::{util::find_peer, Config, EntryStorage, RaftlogFetchTask, WriteRouter}; use slog::{o, Logger}; -use tikv_util::{box_err, config::ReadableSize}; +use tikv_util::{box_err, config::ReadableSize, worker::Scheduler}; use super::storage::Storage; use crate::{ + operation::AsyncWriter, tablet::{self, CachedTablet}, Result, }; @@ -19,7 +20,13 @@ use crate::{ pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, - logger: Logger, + /// We use a cache for looking up peers. Not all peers exist in region's + /// peer list, for example, an isolated peer may need to send/receive + /// messages with unknown peers after recovery. + peer_cache: Vec, + pub(crate) async_writer: AsyncWriter, + has_ready: bool, + pub(crate) logger: Logger, } impl Peer { @@ -32,9 +39,10 @@ impl Peer { store_id: u64, tablet_factory: &dyn TabletFactory, engine: ER, + scheduler: Scheduler, logger: &Logger, ) -> Result> { - let s = match Storage::new(region_id, store_id, engine, logger)? { + let s = match Storage::new(region_id, store_id, engine, scheduler, logger)? 
{ Some(s) => s, None => return Ok(None), }; @@ -83,18 +91,31 @@ impl Peer { Ok(Some(Peer { raft_group: RawNode::new(&raft_cfg, s, &logger)?, tablet: CachedTablet::new(tablet), + has_ready: false, + async_writer: AsyncWriter::new(region_id, peer_id), logger, + peer_cache: vec![], })) } + #[inline] + pub fn region(&self) -> &metapb::Region { + self.raft_group.store().region() + } + #[inline] pub fn region_id(&self) -> u64 { - self.raft_group.store().region_state().get_region().get_id() + self.region().get_id() + } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + self.raft_group.store().peer() } #[inline] pub fn peer_id(&self) -> u64 { - self.raft_group.store().peer().get_id() + self.peer().get_id() } #[inline] @@ -102,13 +123,96 @@ impl Peer { self.raft_group.store() } + #[inline] + pub fn storage_mut(&mut self) -> &mut Storage { + self.raft_group.mut_store() + } + + #[inline] + pub fn entry_storage(&self) -> &EntryStorage { + self.raft_group.store().entry_storage() + } + + #[inline] + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + self.raft_group.mut_store().entry_storage_mut() + } + #[inline] pub fn tablet(&self) -> &CachedTablet { &self.tablet } #[inline] - pub fn logger(&self) -> &Logger { - &self.logger + pub fn tablet_mut(&mut self) -> &mut CachedTablet { + &mut self.tablet + } + + #[inline] + pub fn raft_group(&self) -> &RawNode> { + &self.raft_group + } + + #[inline] + pub fn raft_group_mut(&mut self) -> &mut RawNode> { + &mut self.raft_group + } + + /// Mark the peer has a ready so it will be checked at the end of every + /// processing round. + #[inline] + pub fn set_has_ready(&mut self) { + self.has_ready = true; + } + + /// Mark the peer has no ready and return its previous state. 
+ #[inline] + pub fn reset_has_ready(&mut self) -> bool { + mem::take(&mut self.has_ready) + } + + #[inline] + pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { + for p in self.raft_group.store().region().get_peers() { + if p.get_id() == peer.get_id() { + return; + } + } + for p in &mut self.peer_cache { + if p.get_id() == peer.get_id() { + *p = peer; + return; + } + } + self.peer_cache.push(peer); + } + + #[inline] + pub fn clear_peer_cache(&mut self) { + self.peer_cache.clear(); + } + + #[inline] + pub fn get_peer_from_cache(&self, peer_id: u64) -> Option { + for p in self.raft_group.store().region().get_peers() { + if p.get_id() == peer_id { + return Some(p.clone()); + } + } + self.peer_cache + .iter() + .find(|p| p.get_id() == peer_id) + .cloned() + } + + #[inline] + pub fn is_leader(&self) -> bool { + self.raft_group.raft.state == StateRole::Leader + } + + /// Term of the state machine. + #[inline] + pub fn term(&self) -> u64 { + self.raft_group.raft.term } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index ff0bd64cd01..4f625b751ac 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -1,17 +1,22 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::fmt::{self, Debug, Formatter}; + use engine_traits::{RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, Region}, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ - eraftpb::{Entry, Snapshot}, + eraftpb::{ConfState, Entry, HardState, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{util::find_peer, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use raftstore::store::{ + util::{self, find_peer}, + EntryStorage, RaftlogFetchTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, +}; use slog::{o, Logger}; -use tikv_util::box_err; +use tikv_util::{box_err, worker::Scheduler}; use crate::{Error, Result}; @@ -45,16 +50,56 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul /// A storage for raft. /// /// It's similar to `PeerStorage` in v1. -#[derive(Debug)] pub struct Storage { - engine: ER, + entry_storage: EntryStorage, peer: metapb::Peer, region_state: RegionLocalState, - raft_state: RaftLocalState, - apply_state: RaftApplyState, logger: Logger, } +impl Debug for Storage { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "Storage of [region {}] {}", + self.region().get_id(), + self.peer.get_id() + ) + } +} + +impl Storage { + #[inline] + pub fn entry_storage(&self) -> &EntryStorage { + &self.entry_storage + } + + #[inline] + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + &mut self.entry_storage + } + + #[inline] + pub fn region_state(&self) -> &RegionLocalState { + &self.region_state + } + + #[inline] + pub fn region(&self) -> &metapb::Region { + self.region_state.get_region() + } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + &self.peer + } + + #[inline] + pub fn logger(&self) -> &Logger { + &self.logger + } +} + impl Storage { /// Creates a new storage. 
/// @@ -64,12 +109,17 @@ impl Storage { region_id: u64, store_id: u64, engine: ER, + log_fetch_scheduler: Scheduler, logger: &Logger, ) -> Result>> { - let region_state = match engine.get_region_state(region_id) { + let region_state: RegionLocalState = match engine.get_region_state(region_id) { Ok(Some(s)) => s, res => { - return Err(box_err!("failed to get region state: {:?}", res)); + return Err(box_err!( + "failed to get region state for region {}: {:?}", + region_id, + res + )); } }; @@ -101,53 +151,67 @@ impl Storage { } }; - let mut s = Storage { + let region = region_state.get_region(); + + let entry_storage = EntryStorage::new( + peer.get_id(), engine, - peer: peer.clone(), - region_state, raft_state, apply_state, - logger, - }; - s.validate_state()?; - Ok(Some(s)) - } - - fn validate_state(&mut self) -> Result<()> { - unimplemented!() - } + region, + log_fetch_scheduler, + )?; - #[inline] - pub fn region_state(&self) -> &RegionLocalState { - &self.region_state + Ok(Some(Storage { + entry_storage, + peer: peer.clone(), + region_state, + logger, + })) } #[inline] pub fn raft_state(&self) -> &RaftLocalState { - &self.raft_state + self.entry_storage.raft_state() } #[inline] pub fn apply_state(&self) -> &RaftApplyState { - &self.apply_state + self.entry_storage.apply_state() } #[inline] - pub fn peer(&self) -> &metapb::Peer { - &self.peer - } - - #[inline] - pub fn logger(&self) -> &Logger { - &self.logger + pub fn is_initialized(&self) -> bool { + self.region_state.get_tablet_index() != 0 } } impl raft::Storage for Storage { fn initial_state(&self) -> raft::Result { - unimplemented!() + let hard_state = self.raft_state().get_hard_state().clone(); + // We will persist hard state no matter if it's initialized or not in + // v2, So hard state may not be empty. But when it becomes initialized, + // commit must be changed. 
+ assert_eq!( + hard_state.commit == 0, + !self.is_initialized(), + "region state doesn't match raft state {:?} vs {:?}", + self.region_state(), + self.raft_state() + ); + + if hard_state.commit == 0 { + // If it's uninitialized, return empty state as we consider every + // states are empty at the very beginning. + return Ok(RaftState::new(hard_state, ConfState::default())); + } + Ok(RaftState::new( + hard_state, + util::conf_state_from_region(self.region()), + )) } + #[inline] fn entries( &self, low: u64, @@ -155,19 +219,23 @@ impl raft::Storage for Storage { max_size: impl Into>, context: GetEntriesContext, ) -> raft::Result> { - unimplemented!() + self.entry_storage + .entries(low, high, max_size.into().unwrap_or(u64::MAX), context) } + #[inline] fn term(&self, idx: u64) -> raft::Result { - unimplemented!() + self.entry_storage.term(idx) } + #[inline] fn first_index(&self) -> raft::Result { - unimplemented!() + Ok(self.entry_storage.first_index()) } + #[inline] fn last_index(&self) -> raft::Result { - unimplemented!() + Ok(self.entry_storage.last_index()) } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs new file mode 100644 index 00000000000..1288f14c3da --- /dev/null +++ b/components/raftstore-v2/src/router/imp.rs @@ -0,0 +1,12 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, RaftEngine}; +use raftstore::store::{FetchedLogs, LogFetchedNotifier}; + +use crate::{batch::StoreRouter, PeerMsg}; + +impl LogFetchedNotifier for StoreRouter { + fn notify(&self, region_id: u64, fetched: FetchedLogs) { + let _ = self.force_send(region_id, PeerMsg::FetchedLogs(fetched)); + } +} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 37b34bcb666..a71bdc89283 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -10,7 +10,8 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, }; use raftstore::store::{ - fsm::ApplyTaskRes, metrics::RaftEventDurationType, InspectedRaftMessage, RegionSnapshot, + fsm::ApplyTaskRes, metrics::RaftEventDurationType, FetchedLogs, InspectedRaftMessage, + RegionSnapshot, }; use tikv_util::{memory::HeapSize, time::Instant}; @@ -244,6 +245,7 @@ pub enum PeerMsg { ApplyRes { res: ApplyTaskRes, }, + FetchedLogs(FetchedLogs), /// Start the FSM. Start, /// A message only used to notify a peer. @@ -275,6 +277,7 @@ impl fmt::Debug for PeerMsg { "Persisted peer_id {}, ready_number {}", peer_id, ready_number ), + PeerMsg::FetchedLogs(fetched) => write!(fmt, "FetchedLogs {:?}", fetched), } } } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index a7c7672b835..11df3cbbabd 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +mod imp; mod internal_message; mod message; diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs new file mode 100644 index 00000000000..d93cd09fc62 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -0,0 +1,174 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +#![feature(test)] +#![feature(assert_matches)] +#![feature(custom_test_frameworks)] +#![test_runner(test_util::run_tests)] +// TODO: remove following when tests can be run. +#![allow(dead_code)] +#![allow(unused_imports)] + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{self, Receiver, Sender}; +use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactoryV2}, + raft::RaftTestEngine, +}; +use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; +use kvproto::{metapb::Store, raft_serverpb::RaftMessage}; +use pd_client::RpcClient; +use raftstore::store::{Config, Transport, RAFT_INIT_LOG_INDEX}; +use raftstore_v2::{create_store_batch_system, Bootstrap, StoreRouter, StoreSystem}; +use slog::{o, Logger}; +use tempfile::TempDir; +use test_pd::mocker::Service; +use tikv_util::config::VersionTrack; + +mod test_election; + +type TestRouter = StoreRouter; + +struct TestNode { + _pd_server: test_pd::Server, + _pd_client: RpcClient, + _path: TempDir, + store: Store, + raft_engine: Option, + factory: Option>, + system: Option>, + logger: Logger, +} + +impl TestNode { + fn new() -> TestNode { + let logger = slog_global::borrow_global().new(o!()); + let pd_server = test_pd::Server::new(1); + let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); + let path = TempDir::new().unwrap(); + + let cf_opts = ALL_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path(), + DbOptions::default(), + cf_opts, + )); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, &pd_client, logger.clone()); + let store_id = bootstrap.bootstrap_store().unwrap(); + let mut store = Store::default(); + store.set_id(store_id); + let region = bootstrap + .bootstrap_first_region(&store, store_id) + .unwrap() + 
.unwrap(); + if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { + factory + .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) + .unwrap(); + } + factory + .open_tablet( + region.get_id(), + Some(RAFT_INIT_LOG_INDEX), + OpenOptions::default().set_create_new(true), + ) + .unwrap(); + + TestNode { + _pd_server: pd_server, + _pd_client: pd_client, + _path: path, + store, + raft_engine: Some(raft_engine), + factory: Some(factory), + system: None, + logger, + } + } + + fn start( + &mut self, + cfg: &Arc>, + trans: impl Transport + 'static, + ) -> TestRouter { + let (router, mut system) = create_store_batch_system::( + &cfg.value(), + self.store.clone(), + self.logger.clone(), + ); + system + .start( + self.store.clone(), + cfg.clone(), + self.raft_engine.clone().unwrap(), + self.factory.clone().unwrap(), + trans, + &router, + ) + .unwrap(); + self.system = Some(system); + router + } + + fn stop(&mut self) { + if let Some(mut system) = self.system.take() { + system.shutdown(); + } + } +} + +impl Drop for TestNode { + fn drop(&mut self) { + self.stop(); + self.raft_engine.take(); + self.factory.take(); + } +} + +#[derive(Clone)] +pub struct TestTransport { + tx: Sender, + flush_cnt: Arc, +} + +fn new_test_transport() -> (TestTransport, Receiver) { + let (tx, rx) = channel::unbounded(); + let flush_cnt = Default::default(); + (TestTransport { tx, flush_cnt }, rx) +} + +impl Transport for TestTransport { + fn send(&mut self, msg: RaftMessage) -> raftstore_v2::Result<()> { + let _ = self.tx.send(msg); + Ok(()) + } + + fn set_store_allowlist(&mut self, _stores: Vec) {} + + fn need_flush(&self) -> bool { + !self.tx.is_empty() + } + + fn flush(&mut self) { + self.flush_cnt.fetch_add(1, Ordering::SeqCst); + } +} + +fn setup_default_cluster() -> (TestNode, Receiver, TestRouter) { + let mut node = TestNode::new(); + let cfg = Default::default(); + let (tx, rx) = new_test_transport(); + let router = node.start(&cfg, tx); + (node, rx, router) +} diff --git 
a/components/raftstore-v2/tests/integrations/test_election.rs b/components/raftstore-v2/tests/integrations/test_election.rs new file mode 100644 index 00000000000..cf3a0cc4906 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_election.rs @@ -0,0 +1,10 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use raftstore_v2::PeerMsg; + +// TODO: finish test case when callback is added. +#[test] +fn test_smoke() { + let (_node, _transport, router) = super::setup_default_cluster(); + router.send(2, PeerMsg::Noop).unwrap(); +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 72fd52ea4d4..a007d168474 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -33,6 +33,7 @@ use tikv_util::{ warn, }; +use super::write_router::WriteSenders; use crate::{ store::{ config::Config, @@ -854,8 +855,8 @@ where EK: KvEngine, ER: RaftEngine, { - pub fn senders(&self) -> &Vec>> { - &self.writers + pub fn senders(&self) -> WriteSenders { + WriteSenders::new(self.writers.clone()) } pub fn spawn( diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index 6b19212c164..6c1db6419cf 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -5,6 +5,7 @@ use std::{ mem, + ops::Index, sync::{ atomic::{AtomicUsize, Ordering}, Arc, @@ -28,8 +29,7 @@ where EK: KvEngine, ER: RaftEngine, { - fn write_senders(&self) -> &Vec>>; - fn io_reschedule_concurrent_count(&self) -> &Arc; + fn write_senders(&self) -> &WriteSenders; fn config(&self) -> &Config; fn raft_metrics(&self) -> &RaftMetrics; } @@ -39,14 +39,10 @@ where EK: KvEngine, ER: RaftEngine, { - fn write_senders(&self) -> &Vec>> { + fn write_senders(&self) -> &WriteSenders { &self.write_senders } - fn 
io_reschedule_concurrent_count(&self) -> &Arc { - &self.io_reschedule_concurrent_count - } - fn config(&self) -> &Config { &self.cfg } @@ -120,7 +116,8 @@ where // The peer must be destroyed after all previous write tasks have been finished. // So do not worry about a destroyed peer being counted in // `io_reschedule_concurrent_count`. - ctx.io_reschedule_concurrent_count() + ctx.write_senders() + .io_reschedule_concurrent_count .fetch_sub(1, Ordering::SeqCst); STORE_IO_RESCHEDULE_PEER_TOTAL_GAUGE.dec(); @@ -200,7 +197,8 @@ where // concurrent count of rescheduling peer fsm because rescheduling will // introduce performance penalty. let success = ctx - .io_reschedule_concurrent_count() + .write_senders() + .io_reschedule_concurrent_count .fetch_update(Ordering::SeqCst, Ordering::Relaxed, |c| { if c < ctx.config().io_reschedule_concurrent_max_count { Some(c + 1) @@ -245,6 +243,37 @@ where } } +/// Senders for asynchronous writes. There can be multiple senders, generally +/// you should use `WriteRouter` to decide which sender to be used. 
+#[derive(Clone)] +pub struct WriteSenders { + write_senders: Vec>>, + io_reschedule_concurrent_count: Arc, +} + +impl WriteSenders { + pub fn new(write_senders: Vec>>) -> Self { + WriteSenders { + write_senders, + io_reschedule_concurrent_count: Arc::default(), + } + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.write_senders.is_empty() + } +} + +impl Index for WriteSenders { + type Output = Sender>; + + #[inline] + fn index(&self, index: usize) -> &Sender> { + &self.write_senders[index] + } +} + #[cfg(test)] mod tests { use std::thread; @@ -257,8 +286,7 @@ mod tests { struct TestWriteRouter { receivers: Vec>>, - senders: Vec>>, - io_reschedule_concurrent_count: Arc, + senders: WriteSenders, config: Config, raft_metrics: RaftMetrics, } @@ -273,8 +301,7 @@ mod tests { } Self { receivers, - senders, - io_reschedule_concurrent_count: Arc::new(AtomicUsize::new(0)), + senders: WriteSenders::new(senders), config, raft_metrics: RaftMetrics::new(true), } @@ -293,7 +320,10 @@ mod tests { } fn must_same_reschedule_count(&self, count: usize) { - let cnt = self.io_reschedule_concurrent_count.load(Ordering::Relaxed); + let cnt = self + .senders + .io_reschedule_concurrent_count + .load(Ordering::Relaxed); if cnt != count { panic!("reschedule count not same, {} != {}", cnt, count); } @@ -301,14 +331,10 @@ mod tests { } impl WriteRouterContext for TestWriteRouter { - fn write_senders(&self) -> &Vec>> { + fn write_senders(&self) -> &WriteSenders { &self.senders } - fn io_reschedule_concurrent_count(&self) -> &Arc { - &self.io_reschedule_concurrent_count - } - fn config(&self) -> &Config { &self.config } @@ -407,7 +433,9 @@ mod tests { t.must_same_reschedule_count(0); thread::sleep(Duration::from_millis(10)); - t.io_reschedule_concurrent_count.store(4, Ordering::Relaxed); + t.senders + .io_reschedule_concurrent_count + .store(4, Ordering::Relaxed); // Should retry reschedule next time because the limitation of concurrent count. 
// However it's possible that it will not scheduled due to random // so using loop here. @@ -428,7 +456,9 @@ mod tests { thread::sleep(Duration::from_millis(10)); } - t.io_reschedule_concurrent_count.store(3, Ordering::Relaxed); + t.senders + .io_reschedule_concurrent_count + .store(3, Ordering::Relaxed); thread::sleep(Duration::from_millis(RETRY_SCHEDULE_MILLISECONS + 2)); // Should reschedule now r.send_write_msg(&mut t, Some(40), WriteMsg::Shutdown); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index aaaed69c555..1642c90d075 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -246,8 +246,8 @@ impl TestWriters { } } - fn write_sender(&self, id: usize) -> &Sender> { - &self.writers.senders()[id] + fn write_sender(&self, id: usize) -> Sender> { + self.writers.senders()[id].clone() } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e4707947fbb..2452f177cff 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1306,8 +1306,8 @@ where SignificantMsg::RaftLogGcFlushed => { self.on_raft_log_gc_flushed(); } - SignificantMsg::RaftlogFetched { context, res } => { - self.on_raft_log_fetched(context, res); + SignificantMsg::RaftlogFetched(fetched_logs) => { + self.on_raft_log_fetched(fetched_logs.context, fetched_logs.logs); } SignificantMsg::EnterForceLeaderState { syncer, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index ecdb8653147..3c4e77ff4b9 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -10,10 +10,7 @@ use std::{ }, mem, ops::{Deref, DerefMut}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, - }, + sync::{atomic::Ordering, Arc, Mutex}, time::{Duration, Instant}, u64, 
}; @@ -24,7 +21,7 @@ use batch_system::{ }; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; -use crossbeam::channel::{unbounded, Sender, TryRecvError, TrySendError}; +use crossbeam::channel::{unbounded, TryRecvError, TrySendError}; use engine_traits::{ CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, RaftLogBatch, Range, WriteBatch, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -71,7 +68,10 @@ use crate::{ RegionChangeReason, }, store::{ - async_io::write::{StoreWriters, Worker as WriteWorker, WriteMsg}, + async_io::{ + write::{StoreWriters, Worker as WriteWorker, WriteMsg}, + write_router::WriteSenders, + }, config::Config, fsm::{ create_apply_batch_system, @@ -518,9 +518,8 @@ where /// Disk usage for other stores. The store itself is not included. /// Only contains items which is not `DiskUsage::Normal`. pub store_disk_usages: HashMap, - pub write_senders: Vec>>, + pub write_senders: WriteSenders, pub sync_write_worker: Option, T>>, - pub io_reschedule_concurrent_count: Arc, pub pending_latency_inspect: Vec, } @@ -1072,8 +1071,7 @@ pub struct RaftPollerBuilder { pub engines: Engines, global_replication_state: Arc>, feature_gate: FeatureGate, - write_senders: Vec>>, - io_reschedule_concurrent_count: Arc, + write_senders: WriteSenders, } impl RaftPollerBuilder { @@ -1313,7 +1311,6 @@ where store_disk_usages: Default::default(), write_senders: self.write_senders.clone(), sync_write_worker, - io_reschedule_concurrent_count: self.io_reschedule_concurrent_count.clone(), pending_latency_inspect: vec![], }; ctx.update_ticks_timeout(); @@ -1364,7 +1361,6 @@ where global_replication_state: self.global_replication_state.clone(), feature_gate: self.feature_gate.clone(), write_senders: self.write_senders.clone(), - io_reschedule_concurrent_count: self.io_reschedule_concurrent_count.clone(), } } } @@ -1558,8 +1554,7 @@ impl RaftBatchSystem { store_meta, pending_create_peers: 
Arc::new(Mutex::new(HashMap::default())), feature_gate: pd_client.feature_gate().clone(), - write_senders: self.store_writers.senders().clone(), - io_reschedule_concurrent_count: Arc::new(AtomicUsize::new(0)), + write_senders: self.store_writers.senders(), }; let region_peers = builder.init()?; self.start_system::( diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index ad730206175..b5a35461728 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -30,7 +30,7 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, - write_router::{WriteRouter, WriteRouterContext}, + write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ bootstrap_store, clear_prepare_bootstrap_cluster, clear_prepare_bootstrap_key, @@ -68,10 +68,10 @@ pub use self::{ util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, - CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, - LocalReader, PdTask, QueryStats, RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, - ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, - SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, - TrackVer, WriteStats, + CheckLeaderTask, FetchedLogs, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LogFetchedNotifier, PdTask, QueryStats, RaftlogFetchRunner, + RaftlogFetchTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, + ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, + SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 
ce812d5ef24..947e9e074fd 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -19,12 +19,12 @@ use kvproto::{ }; #[cfg(any(test, feature = "testexport"))] use pd_client::BucketMeta; -use raft::{GetEntriesContext, SnapshotStatus}; +use raft::SnapshotStatus; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; -use super::{local_metrics::TimeTracker, AbstractPeer, RegionSnapshot}; +use super::{local_metrics::TimeTracker, worker::FetchedLogs, AbstractPeer, RegionSnapshot}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, @@ -34,7 +34,7 @@ use crate::store::{ }, util::{KeysInfoFormatter, LatencyInspector}, worker::{Bucket, BucketRange}, - RaftlogFetchResult, SnapKey, + SnapKey, }; #[derive(Debug)] @@ -357,10 +357,7 @@ where LeaderCallback(Callback), RaftLogGcFlushed, // Reports the result of asynchronous Raft logs fetching. 
- RaftlogFetched { - context: GetEntriesContext, - res: Box, - }, + RaftlogFetched(FetchedLogs), EnterForceLeaderState { syncer: UnsafeRecoveryForceLeaderSyncer, failed_stores: HashSet, diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 5ad6395dd33..129dac6dbb5 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1160,7 +1160,9 @@ pub mod tests { entry_storage::tests::validate_cache, fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, - worker::{RaftlogFetchRunner, RegionRunner, RegionTask}, + worker::{ + FetchedLogs, LogFetchedNotifier, RaftlogFetchRunner, RegionRunner, RegionTask, + }, }, }; @@ -1383,35 +1385,20 @@ pub mod tests { } } - use crate::{ - store::{SignificantMsg, SignificantRouter}, - Result as RaftStoreResult, - }; - - pub struct TestRouter { - ch: SyncSender>, + pub struct TestRouter { + ch: SyncSender, } - impl TestRouter { - pub fn new() -> (Self, Receiver>) { + impl TestRouter { + pub fn new() -> (Self, Receiver) { let (tx, rx) = sync_channel(1); (Self { ch: tx }, rx) } } - impl SignificantRouter for TestRouter - where - EK: KvEngine, - { - /// Sends a significant message. We should guarantee that the message - /// can't be dropped. 
- fn significant_send( - &self, - _: u64, - msg: SignificantMsg, - ) -> RaftStoreResult<()> { - self.ch.send(msg).unwrap(); - Ok(()) + impl LogFetchedNotifier for TestRouter { + fn notify(&self, _region_id: u64, fetched_logs: FetchedLogs) { + self.ch.send(fetched_logs).unwrap(); } } @@ -1486,24 +1473,16 @@ pub mod tests { let raftlog_fetch_scheduler = raftlog_fetch_worker.scheduler(); let mut store = new_storage_from_ents(region_scheduler, raftlog_fetch_scheduler, &td, &ents); - raftlog_fetch_worker.start(RaftlogFetchRunner::::new( - router, - store.engines.raft.clone(), - )); + raftlog_fetch_worker.start(RaftlogFetchRunner::new(router, store.engines.raft.clone())); store.compact_entry_cache(5); let mut e = store.entries(lo, hi, maxsize, GetEntriesContext::empty(true)); if e == Err(raft::Error::Store( raft::StorageError::LogTemporarilyUnavailable, )) { let res = rx.recv().unwrap(); - match res { - SignificantMsg::RaftlogFetched { res, context } => { - store.update_async_fetch_res(lo, Some(res)); - count += 1; - e = store.entries(lo, hi, maxsize, context); - } - _ => unreachable!(), - }; + store.update_async_fetch_res(lo, Some(res.logs)); + count += 1; + e = store.entries(lo, hi, maxsize, res.context); } if e != wentries { panic!("#{}: expect entries {:?}, got {:?}", i, wentries, e); diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index f64fbae037e..19b825ac20c 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -8,6 +8,7 @@ use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; use tikv_util::{error, warn}; +use super::worker::{FetchedLogs, LogFetchedNotifier}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, DiscardReason, Error, Result, @@ -171,3 +172,11 @@ where } } } + +impl LogFetchedNotifier for RaftRouter { + #[inline] + fn notify(&self, region_id: u64, fetched: 
FetchedLogs) { + // Ignore region not found as it may be removed. + let _ = self.significant_send(region_id, SignificantMsg::RaftlogFetched(fetched)); + } +} diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 2298710ad63..4910f3fdd2b 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -30,7 +30,9 @@ pub use self::{ Runner as PdRunner, Task as PdTask, }, query_stats::QueryStats, - raftlog_fetch::{Runner as RaftlogFetchRunner, Task as RaftlogFetchTask}, + raftlog_fetch::{ + FetchedLogs, LogFetchedNotifier, Runner as RaftlogFetchRunner, Task as RaftlogFetchTask, + }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ CachedReadDelegate, LocalReadContext, LocalReader, Progress as ReadProgress, ReadDelegate, diff --git a/components/raftstore/src/store/worker/raftlog_fetch.rs b/components/raftstore/src/store/worker/raftlog_fetch.rs index 63bccf6324a..b3de87f7715 100644 --- a/components/raftstore/src/store/worker/raftlog_fetch.rs +++ b/components/raftstore/src/store/worker/raftlog_fetch.rs @@ -2,12 +2,12 @@ use std::fmt; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::RaftEngine; use fail::fail_point; use raft::GetEntriesContext; use tikv_util::worker::Runnable; -use crate::store::{RaftlogFetchResult, SignificantMsg, SignificantRouter, MAX_INIT_ENTRY_COUNT}; +use crate::store::{RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}; pub enum Task { PeerStorage { @@ -42,32 +42,39 @@ impl fmt::Display for Task { } } -pub struct Runner +#[derive(Debug)] +pub struct FetchedLogs { + pub context: GetEntriesContext, + pub logs: Box, +} + +/// A router for receiving fetched result. 
+pub trait LogFetchedNotifier: Send { + fn notify(&self, region_id: u64, fetched: FetchedLogs); +} + +pub struct Runner where - EK: KvEngine, ER: RaftEngine, - R: SignificantRouter, + N: LogFetchedNotifier, { - router: R, + notifier: N, raft_engine: ER, - _phantom: std::marker::PhantomData, } -impl> Runner { - pub fn new(router: R, raft_engine: ER) -> Runner { +impl Runner { + pub fn new(notifier: N, raft_engine: ER) -> Runner { Runner { - router, + notifier, raft_engine, - _phantom: std::marker::PhantomData, } } } -impl Runnable for Runner +impl Runnable for Runner where - EK: KvEngine, ER: RaftEngine, - R: SignificantRouter, + N: LogFetchedNotifier, { type Task = Task; @@ -97,12 +104,11 @@ where .map(|c| (*c as u64) != high - low) .unwrap_or(false); fail_point!("worker_async_fetch_raft_log"); - // it may return a region not found error as the region could be merged. - let _ = self.router.significant_send( + self.notifier.notify( region_id, - SignificantMsg::RaftlogFetched { + FetchedLogs { context, - res: Box::new(RaftlogFetchResult { + logs: Box::new(RaftlogFetchResult { ents: res.map(|_| ents).map_err(|e| e.into()), low, max_size: max_size as u64, From 1ea26a2ac8761af356cc5c0825eb89a0b8fc9749 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 17 Aug 2022 02:08:51 -0700 Subject: [PATCH 159/676] raftstore-v2: add read write channel (#13245) ref tikv/tikv#12842 v2 uses channel instead of callbacks for proposals, so async/await has first class support. We can further reduce allocations by introducing channel pool. 
Signed-off-by: Jay Lee --- Cargo.lock | 2 +- components/raftstore-v2/Cargo.toml | 2 +- components/raftstore-v2/src/batch/store.rs | 4 +- components/raftstore-v2/src/fsm/peer.rs | 13 +- components/raftstore-v2/src/router/imp.rs | 3 +- .../src/router/internal_message.rs | 3 + components/raftstore-v2/src/router/message.rs | 201 +++----- components/raftstore-v2/src/router/mod.rs | 9 +- .../src/router/response_channel.rs | 477 ++++++++++++++++++ components/raftstore/src/lib.rs | 1 + components/raftstore/src/store/fsm/apply.rs | 141 +++--- components/raftstore/src/store/fsm/peer.rs | 30 +- components/raftstore/src/store/msg.rs | 125 ++++- components/raftstore/src/store/peer.rs | 51 +- components/raftstore/src/store/read_queue.rs | 88 ++-- 15 files changed, 778 insertions(+), 372 deletions(-) create mode 100644 components/raftstore-v2/src/router/response_channel.rs diff --git a/Cargo.lock b/Cargo.lock index 93ac7ddd600..3083e56ef23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4196,7 +4196,7 @@ dependencies = [ "engine_traits", "error_code", "fail", - "futures-util", + "futures 0.3.15", "keys", "kvproto", "log_wrappers", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 29e68517441..f526aeda9c4 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -35,7 +35,7 @@ crossbeam = "0.8" engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } fail = "0.5" -futures-util = { version = "0.3", features = ["compat"] } +futures = { version = "0.3", features = ["compat"] } keys = { path = "../keys", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } log_wrappers = { path = "../log_wrappers" } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 739240f84e0..d4cba3d9381 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -13,7 +13,7 @@ 
use batch_system::{ use collections::HashMap; use crossbeam::channel::Sender; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; -use futures_util::{compat::Future01CompatExt, FutureExt}; +use futures::{compat::Future01CompatExt, FutureExt}; use kvproto::{metapb::Store, raft_serverpb::PeerState}; use raft::INVALID_ID; use raftstore::store::{ @@ -89,7 +89,7 @@ struct StorePoller { cfg_tracker: Tracker, /// Buffers to hold in-coming messages. store_msg_buf: Vec, - peer_msg_buf: Vec>, + peer_msg_buf: Vec, /// These fields controls the timing of flushing messages generated by /// FSMs. last_flush_time: TiInstant, diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 696a1e5ddf4..307da362330 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -17,12 +17,12 @@ use tikv_util::{ use crate::{batch::StoreContext, raft::Peer, PeerMsg, PeerTick, Result}; -pub type SenderFsmPair = (LooseBoundedSender>, Box>); +pub type SenderFsmPair = (LooseBoundedSender, Box>); pub struct PeerFsm { peer: Peer, mailbox: Option>>, - receiver: Receiver>, + receiver: Receiver, /// A registry for all scheduled ticks. This can avoid scheduling ticks /// twice accidentally. tick_registry: u16, @@ -62,7 +62,7 @@ impl PeerFsm { /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. 
- pub fn recv(&mut self, peer_msg_buf: &mut Vec>) -> usize { + pub fn recv(&mut self, peer_msg_buf: &mut Vec) -> usize { let l = peer_msg_buf.len(); for i in l..peer_msg_buf.capacity() { match self.receiver.try_recv() { @@ -80,7 +80,7 @@ impl PeerFsm { } impl Fsm for PeerFsm { - type Message = PeerMsg; + type Message = PeerMsg; #[inline] fn is_stopped(&self) -> bool { @@ -183,13 +183,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } } - pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec>) { + pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in peer_msgs_buf.drain(..) { match msg { PeerMsg::RaftMessage(_) => unimplemented!(), + PeerMsg::RaftQuery(_) => unimplemented!(), PeerMsg::RaftCommand(_) => unimplemented!(), PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes { res } => unimplemented!(), + PeerMsg::ApplyRes(res) => unimplemented!(), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 1288f14c3da..401961dfdb1 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -3,7 +3,8 @@ use engine_traits::{KvEngine, RaftEngine}; use raftstore::store::{FetchedLogs, LogFetchedNotifier}; -use crate::{batch::StoreRouter, PeerMsg}; +use super::PeerMsg; +use crate::batch::StoreRouter; impl LogFetchedNotifier for StoreRouter { fn notify(&self, region_id: u64, fetched: FetchedLogs) { diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index f5ef72d8e30..05653e4fdcc 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,3 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
pub enum ApplyTask {} + +#[derive(Debug)] +pub enum ApplyRes {} diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a71bdc89283..3f0dadaed04 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -1,137 +1,23 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use std::{fmt, marker::PhantomData}; +use std::fmt; use engine_traits::{KvEngine, Snapshot}; use kvproto::{ - kvrpcpb::ExtraOp as TxnExtraOp, + cdcpb::Event, metapb, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, }; use raftstore::store::{ - fsm::ApplyTaskRes, metrics::RaftEventDurationType, FetchedLogs, InspectedRaftMessage, - RegionSnapshot, + metrics::RaftEventDurationType, FetchedLogs, InspectedRaftMessage, RegionSnapshot, }; -use tikv_util::{memory::HeapSize, time::Instant}; +use tikv_util::time::Instant; -pub struct WriteResponseChannel; - -impl WriteResponseChannel { - /// Called after a request is proposed to the raft group successfully. It's - /// used to notify the caller to move on early because it's very likely the - /// request will be applied to the raftstore. - pub fn notify_proposed(&self) {} - - /// Called after a request is committed and before it's being applied, and - /// it's guaranteed that the request will be successfully applied soon. - pub fn notify_committed(&self) {} - - pub fn notify_applied(&self, _res: Result<(), RaftCmdResponse>) {} -} - -pub struct ReadResponseChannel { - _snap: PhantomData, -} - -pub struct ReadResponse { - pub snapshot: RegionSnapshot, - // What is this? - pub txn_extra_op: TxnExtraOp, -} - -impl ReadResponseChannel { - pub fn notify_read(&self, _res: Result, RaftCmdResponse>) {} -} - -// This is only necessary because of seeming limitations in derive(Clone) w/r/t -// generics. If it can be deleted in the future in favor of derive, it should -// be. 
-impl Clone for ReadResponse -where - S: Snapshot, -{ - fn clone(&self) -> ReadResponse { - ReadResponse { - snapshot: self.snapshot.clone(), - txn_extra_op: self.txn_extra_op, - } - } -} - -/// Variants of channels for `Msg`. -/// - `Read`: a channel for read only requests including `StatusRequest`, -/// `GetRequest` and `SnapRequest` -/// - `Write`: a channel for write only requests including `AdminRequest` -/// `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. -/// Prefer channel rather than callback because: -/// 1. channel can be reused, hence reduce allocations. -/// 2. channel may not need dynamic dispatch. -/// 3. caller can use async fashion. -/// 4. there will be no callback leak. -pub enum ResponseChannel { - /// No callback. - None, - /// Read callback. - Read(ReadResponseChannel), - /// Write callback. - Write(WriteResponseChannel), -} - -impl HeapSize for ResponseChannel {} - -impl ResponseChannel -where - S: Snapshot, -{ - pub fn notify_applied(self, resp: RaftCmdResponse) { - match self { - ResponseChannel::None => (), - ResponseChannel::Read(read) => { - read.notify_read(Err(resp)); - } - ResponseChannel::Write(write) => { - write.notify_applied(Err(resp)); - } - } - } - - pub fn notify_proposed(&mut self) { - if let ResponseChannel::Write(write) = self { - write.notify_proposed(); - } - } - - pub fn notify_committed(&mut self) { - if let ResponseChannel::Write(write) = self { - write.notify_committed(); - } - } - - pub fn invoke_read(self, args: ReadResponse) { - match self { - ResponseChannel::Read(read) => read.notify_read(Ok(args)), - other => panic!("expect Callback::Read(..), got {:?}", other), - } - } - - pub fn is_none(&self) -> bool { - matches!(self, ResponseChannel::None) - } -} - -impl fmt::Debug for ResponseChannel -where - S: Snapshot, -{ - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - ResponseChannel::None => write!(fmt, "Callback::None"), - ResponseChannel::Read(_) => write!(fmt, 
"Callback::Read(..)"), - ResponseChannel::Write { .. } => write!(fmt, "Callback::Write(..)"), - } - } -} +use super::{ + response_channel::{CmdResChannel, QueryResChannel}, + ApplyRes, +}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -208,65 +94,96 @@ impl StoreTick { } } -/// Raft command is the command that is expected to be proposed by the -/// leader of the target raft group. -#[derive(Debug)] -pub struct RaftCommand { +/// Command that can be handled by raftstore. +pub struct RaftRequest { pub send_time: Instant, pub request: RaftCmdRequest, - pub ch: ResponseChannel, } -impl RaftCommand { - #[inline] - pub fn new(request: RaftCmdRequest, ch: ResponseChannel) -> RaftCommand { - RaftCommand { +impl RaftRequest { + pub fn new(request: RaftCmdRequest) -> Self { + RaftRequest { + send_time: Instant::now(), request, + } + } +} + +/// A query that won't change any state. So it doesn't have to be replicated to +/// all replicas. +pub struct RaftQuery { + pub req: RaftRequest, + pub ch: QueryResChannel, +} + +impl RaftQuery { + #[inline] + pub fn new(request: RaftCmdRequest, ch: QueryResChannel) -> Self { + Self { + req: RaftRequest::new(request), + ch, + } + } +} + +/// Commands that change the inernal states. It will be transformed into logs +/// and reach consensus in the raft group. +pub struct RaftCommand { + pub cmd: RaftRequest, + pub ch: CmdResChannel, +} + +impl RaftCommand { + #[inline] + pub fn new(request: RaftCmdRequest, ch: CmdResChannel) -> Self { + Self { + cmd: RaftRequest::new(request), ch, - send_time: Instant::now(), } } } /// Message that can be sent to a peer. -pub enum PeerMsg { +pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. RaftMessage(InspectedRaftMessage), - /// Raft command is the command that is expected to be proposed by the - /// leader of the target raft group. 
If it's failed to be sent, callback - /// usually needs to be called before dropping in case of resource leak. - RaftCommand(RaftCommand), + /// Read command only involves read operations, they are usually processed + /// using lease or read index. + RaftQuery(RaftQuery), + /// Proposal needs to be processed by all peers in a raft group. They will + /// be transformed into logs and be proposed by the leader peer. + RaftCommand(RaftCommand), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. - ApplyRes { - res: ApplyTaskRes, - }, + ApplyRes(ApplyRes), FetchedLogs(FetchedLogs), /// Start the FSM. Start, /// A message only used to notify a peer. Noop, + /// A message that indicates an asynchronous write has finished. Persisted { peer_id: u64, ready_number: u64, }, } -impl fmt::Debug for PeerMsg { +impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + PeerMsg::RaftQuery(_) => write!(fmt, "Raft Query"), PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), PeerMsg::Tick(tick) => write! 
{ fmt, "{:?}", tick }, - PeerMsg::ApplyRes { res } => write!(fmt, "ApplyRes {:?}", res), + PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 11df3cbbabd..4a1df09fa44 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -3,6 +3,11 @@ mod imp; mod internal_message; mod message; +mod response_channel; -pub(crate) use internal_message::ApplyTask; -pub use message::{PeerMsg, PeerTick, StoreMsg, StoreTick}; +pub(crate) use self::internal_message::ApplyTask; +pub use self::{ + internal_message::ApplyRes, + message::{PeerMsg, PeerTick, RaftCommand, RaftQuery, RaftRequest, StoreMsg, StoreTick}, + response_channel::{CmdResChannel, QueryResChannel, QueryResult}, +}; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs new file mode 100644 index 00000000000..fe84ae3c3ef --- /dev/null +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -0,0 +1,477 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! Variants of channels for `Msg`. +//! - `Read`: a channel for read only requests including `StatusRequest`, +//! `GetRequest` and `SnapRequest` +//! - `Write`: a channel for write only requests including `AdminRequest` +//! `PutRequest`, `DeleteRequest` and `DeleteRangeRequest`. +//! +//! Prefer channel over callback because: +//! 1. channel can be reused, hence reduce allocations (not yet implemented). +//! 2. channel may not need dynamic dispatch. +//! 3. caller can use async fashion. +//! 4. there will be no callback leak. 
+ +use std::{ + cell::UnsafeCell, + fmt, + future::Future, + mem::{self, ManuallyDrop}, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + task::{Context, Poll}, +}; + +use engine_traits::Snapshot; +use futures::task::AtomicWaker; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; +use raftstore::store::{ + local_metrics::TimeTracker, msg::ErrorCallback, ReadCallback, RegionSnapshot, WriteCallback, +}; +use smallvec::SmallVec; +use tikv_util::memory::HeapSize; + +/// A struct allows to watch and notify specific events. +/// +/// There are two different events: state and payload. Obviously, state events +/// have no payload. At most 30 states can be defined. There can be only one +/// type of payload. +struct EventCore { + /// Every event will have two bits. + /// - 0b00 means the event is not fired and not subscribed. + /// - 0b01 means the event is fired and not subscribed. + /// - 0b10 means the event is not fired and subscribed. + /// - 0b11 means the event is fired and subscribed. + /// Event 0 and Event 31 is reserved as payload and cancel respectively. + /// Other events should be defined within [1, 30]. + event: AtomicU64, + res: UnsafeCell>, + // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. + waker: AtomicWaker, +} + +unsafe impl Send for EventCore {} + +const PAYLOAD_EVENT: u64 = 0; +const CANCEL_EVENT: u64 = 31; + +#[inline] +const fn subscribed_bit_of(event: u64) -> u64 { + 1 << (event * 2) +} + +#[inline] +const fn fired_bit_of(event: u64) -> u64 { + 1 << (event * 2 + 1) +} + +impl EventCore { + #[inline] + fn notify_event(&self, event: u64) { + let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); + if previous & subscribed_bit_of(event) != 0 { + self.waker.wake() + } + } + + /// Set the result. + /// + /// After this call, no events should be notified. 
+ #[inline] + fn set_result(&self, result: Res) { + unsafe { + *self.res.get() = Some(result); + } + let previous = self.event.fetch_or( + fired_bit_of(PAYLOAD_EVENT) | fired_bit_of(CANCEL_EVENT), + Ordering::AcqRel, + ); + if previous & subscribed_bit_of(PAYLOAD_EVENT) != 0 { + self.waker.wake() + } + } + + /// Cancel all subscribers. + /// + /// After this call, no events should be notified and no result should be + /// set. + #[inline] + fn cancel(&self) { + let mut previous = self + .event + .fetch_or(fired_bit_of(CANCEL_EVENT), Ordering::AcqRel); + let subscribed_bit = subscribed_bit_of(0); + while previous != 0 { + // Not notified yet. + if previous & 0b11 == subscribed_bit { + self.waker.wake(); + return; + } + previous >>= 2; + } + } +} + +struct WaitEvent<'a, Res> { + event: u64, + core: &'a EventCore, +} + +#[inline] +fn check_bit(e: u64, fired_bit: u64) -> Option { + if e & fired_bit != 0 { + return Some(true); + } + let cancel_bit = fired_bit_of(CANCEL_EVENT); + if e & cancel_bit != 0 { + return Some(false); + } + None +} + +impl<'a, Res> Future for WaitEvent<'a, Res> { + type Output = bool; + + #[inline] + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let event = &self.core.event; + let mut e = event.load(Ordering::Relaxed); + let fired_bit = fired_bit_of(self.event); + if let Some(b) = check_bit(e, fired_bit) { + return Poll::Ready(b); + } + self.core.waker.register(cx.waker()); + let subscribed_bit = subscribed_bit_of(self.event); + loop { + match event.compare_exchange_weak( + e, + e | subscribed_bit, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => return Poll::Pending, + Err(v) => e = v, + }; + if let Some(b) = check_bit(e, fired_bit) { + return Poll::Ready(b); + } + } + } +} + +struct WaitResult<'a, Res> { + core: &'a EventCore, +} + +impl<'a, Res> Future for WaitResult<'a, Res> { + type Output = Option; + + #[inline] + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let event = &self.core.event; + let 
fired_bit = fired_bit_of(PAYLOAD_EVENT); + let mut e = event.load(Ordering::Relaxed); + if check_bit(e, fired_bit).is_some() { + unsafe { + return Poll::Ready((*self.core.res.get()).take()); + } + } + let subscribed_bit = subscribed_bit_of(PAYLOAD_EVENT); + self.core.waker.register(cx.waker()); + loop { + match event.compare_exchange_weak( + e, + e | subscribed_bit, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => return Poll::Pending, + Err(v) => e = v, + }; + if check_bit(e, fired_bit).is_some() { + unsafe { + return Poll::Ready((*self.core.res.get()).take()); + } + } + } + } +} + +pub struct CommandResultSubscriber { + core: Arc>, +} + +impl CommandResultSubscriber { + pub async fn wait_proposed(&mut self) -> bool { + WaitEvent { + event: CmdResChannel::PROPOSED_EVENT, + core: &self.core, + } + .await + } + + pub async fn wait_committed(&mut self) -> bool { + WaitEvent { + event: CmdResChannel::COMMITTED_EVENT, + core: &self.core, + } + .await + } + + pub async fn result(mut self) -> Option { + WaitResult { core: &self.core }.await + } +} + +unsafe impl Send for CommandResultSubscriber {} +unsafe impl Sync for CommandResultSubscriber {} + +pub struct CmdResChannel { + core: ManuallyDrop>>, +} + +impl CmdResChannel { + // Valid range is [1, 30] + const PROPOSED_EVENT: u64 = 1; + const COMMITTED_EVENT: u64 = 2; + + #[inline] + pub fn pair() -> (Self, CommandResultSubscriber) { + let core = Arc::new(EventCore { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + waker: AtomicWaker::new(), + }); + ( + Self { + core: ManuallyDrop::new(core.clone()), + }, + CommandResultSubscriber { core }, + ) + } +} + +impl ErrorCallback for CmdResChannel { + fn report_error(self, err: RaftCmdResponse) { + self.set_result(err); + } + + fn is_none(&self) -> bool { + false + } +} + +impl WriteCallback for CmdResChannel { + type Response = RaftCmdResponse; + + /// Called after a request is proposed to the raft group successfully. 
It's + /// used to notify the caller to move on early because it's very likely the + /// request will be applied to the raftstore. + #[inline] + fn notify_proposed(&mut self) { + self.core.notify_event(Self::PROPOSED_EVENT); + } + + /// Called after a request is committed and before it's being applied, and + /// it's guaranteed that the request will be successfully applied soon. + #[inline] + fn notify_committed(&mut self) { + self.core.notify_event(Self::COMMITTED_EVENT); + } + + fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { + None + } + + fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { + None + } + + // TODO: support executing hooks inside setting result. + #[inline] + fn set_result(mut self, res: RaftCmdResponse) { + self.core.set_result(res); + unsafe { + ManuallyDrop::drop(&mut self.core); + } + mem::forget(self); + } +} + +impl Drop for CmdResChannel { + #[inline] + fn drop(&mut self) { + self.core.cancel(); + unsafe { + ManuallyDrop::drop(&mut self.core); + } + } +} + +unsafe impl Send for CmdResChannel {} +unsafe impl Sync for CmdResChannel {} + +/// Response for Read. +/// +/// Unlike v1, snapshot are always taken in LocalReader, hence snapshot doesn't +/// need to be a field of the struct. +#[derive(Clone, PartialEq, Debug)] +pub struct ReadResponse { + pub txn_extra_op: TxnExtraOp, +} + +/// Possible result of a raft query. +#[derive(Clone, Debug, PartialEq)] +pub enum QueryResult { + /// If it's a read like get or snapshot, `ReadResponse` is returned on + /// success. + Read(ReadResponse), + /// If it's a status query, `RaftCmdResponse` is returned. If it's a read + /// like query, `RaftCmdResponse` is returned on error. 
+ Response(RaftCmdResponse), +} + +impl QueryResult { + pub fn read(&self) -> Option<&ReadResponse> { + match self { + QueryResult::Read(r) => Some(r), + _ => None, + } + } + + pub fn response(&self) -> Option<&RaftCmdResponse> { + match self { + QueryResult::Response(r) => Some(r), + _ => None, + } + } +} + +pub struct QueryResChannel { + core: ManuallyDrop>>, +} + +impl QueryResChannel { + pub fn pair() -> (Self, QueryResSubscriber) { + let core = Arc::new(EventCore { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + waker: AtomicWaker::new(), + }); + ( + Self { + core: ManuallyDrop::new(core.clone()), + }, + QueryResSubscriber { core }, + ) + } +} + +impl ErrorCallback for QueryResChannel { + #[inline] + fn report_error(self, err: RaftCmdResponse) { + self.set_result(QueryResult::Response(err)); + } + + #[inline] + fn is_none(&self) -> bool { + false + } +} + +impl ReadCallback for QueryResChannel { + type Response = QueryResult; + + #[inline] + fn set_result(mut self, res: QueryResult) { + self.core.set_result(res); + unsafe { + ManuallyDrop::drop(&mut self.core); + } + mem::forget(self); + } +} + +impl Drop for QueryResChannel { + #[inline] + fn drop(&mut self) { + self.core.cancel(); + unsafe { + ManuallyDrop::drop(&mut self.core); + } + } +} + +unsafe impl Send for QueryResChannel {} +unsafe impl Sync for QueryResChannel {} + +pub struct QueryResSubscriber { + core: Arc>, +} + +impl QueryResSubscriber { + pub async fn result(mut self) -> Option { + WaitResult { core: &self.core }.await + } +} + +unsafe impl Send for QueryResSubscriber {} +unsafe impl Sync for QueryResSubscriber {} + +#[cfg(test)] +mod tests { + use engine_test::kv::KvTestSnapshot; + use futures::executor::block_on; + + use super::*; + + #[test] + fn test_cancel() { + let (mut chan, mut sub) = CmdResChannel::pair(); + drop(chan); + assert!(!block_on(sub.wait_proposed())); + assert!(!block_on(sub.wait_committed())); + assert!(block_on(sub.result()).is_none()); + + let (mut chan, mut 
sub) = CmdResChannel::pair(); + chan.notify_proposed(); + let mut result = RaftCmdResponse::default(); + result.mut_header().set_current_term(4); + chan.set_result(result.clone()); + assert!(block_on(sub.wait_proposed())); + assert!(!block_on(sub.wait_committed())); + assert_eq!(block_on(sub.result()), Some(result)); + + let (mut chan, mut sub) = QueryResChannel::pair(); + drop(chan); + assert!(block_on(sub.result()).is_none()); + } + + #[test] + fn test_channel() { + let (mut chan, mut sub) = CmdResChannel::pair(); + chan.notify_proposed(); + chan.notify_committed(); + let mut result = RaftCmdResponse::default(); + result.mut_header().set_current_term(2); + chan.set_result(result.clone()); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + assert_eq!(block_on(sub.result()), Some(result.clone())); + + let (mut chan, mut sub) = QueryResChannel::pair(); + let resp = QueryResult::Response(result.clone()); + chan.set_result(resp.clone()); + assert_eq!(block_on(sub.result()).unwrap(), resp); + + let (mut chan, mut sub) = QueryResChannel::pair(); + let read = QueryResult::Read(ReadResponse { + txn_extra_op: TxnExtraOp::ReadOldValue, + }); + chan.set_result(read.clone()); + assert_eq!(block_on(sub.result()).unwrap(), read); + } +} diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index e5906719109..66fdbc00546 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,6 +6,7 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] +#![feature(let_else)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 3f841e699bb..d44cca3668b 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -83,7 +83,7 @@ use crate::{ local_metrics::{RaftMetrics, TimeTracker}, memory::*, metrics::*, - msg::{Callback, 
PeerMsg, ReadResponse, SignificantMsg}, + msg::{Callback, ErrorCallback, PeerMsg, ReadResponse, SignificantMsg}, peer::Peer, peer_storage::{write_initial_apply_state, write_peer_state}, util, @@ -91,7 +91,7 @@ use crate::{ admin_cmd_epoch_lookup, check_region_epoch, compare_region_epoch, is_learner, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, }, - Config, RegionSnapshot, RegionTask, + Config, RegionSnapshot, RegionTask, WriteCallback, }, Error, Result, }; @@ -101,20 +101,14 @@ const APPLY_WB_SHRINK_SIZE: usize = 1024 * 1024; const SHRINK_PENDING_CMD_QUEUE_CAP: usize = 64; const MAX_APPLY_BATCH_SIZE: usize = 64 * 1024 * 1024; -pub struct PendingCmd -where - S: Snapshot, -{ +pub struct PendingCmd { pub index: u64, pub term: u64, - pub cb: Option>, + pub cb: Option, } -impl PendingCmd -where - S: Snapshot, -{ - fn new(index: u64, term: u64, cb: Callback) -> PendingCmd { +impl PendingCmd { + fn new(index: u64, term: u64, cb: C) -> PendingCmd { PendingCmd { index, term, @@ -123,10 +117,7 @@ where } } -impl Drop for PendingCmd -where - S: Snapshot, -{ +impl Drop for PendingCmd { fn drop(&mut self) { if self.cb.is_some() { safe_panic!( @@ -138,10 +129,7 @@ where } } -impl Debug for PendingCmd -where - S: Snapshot, -{ +impl Debug for PendingCmd { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -153,30 +141,24 @@ where } } -impl HeapSize for PendingCmd {} +impl HeapSize for PendingCmd {} /// Commands waiting to be committed and applied. 
#[derive(Debug)] -pub struct PendingCmdQueue -where - S: Snapshot, -{ - normals: VecDeque>, - conf_change: Option>, +pub struct PendingCmdQueue { + normals: VecDeque>, + conf_change: Option>, } -impl PendingCmdQueue -where - S: Snapshot, -{ - fn new() -> PendingCmdQueue { +impl PendingCmdQueue { + fn new() -> PendingCmdQueue { PendingCmdQueue { normals: VecDeque::new(), conf_change: None, } } - fn pop_normal(&mut self, index: u64, term: u64) -> Option> { + fn pop_normal(&mut self, index: u64, term: u64) -> Option> { self.normals.pop_front().and_then(|cmd| { if self.normals.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP && self.normals.len() < SHRINK_PENDING_CMD_QUEUE_CAP @@ -191,18 +173,18 @@ where }) } - fn append_normal(&mut self, cmd: PendingCmd) { + fn append_normal(&mut self, cmd: PendingCmd) { self.normals.push_back(cmd); } - fn take_conf_change(&mut self) -> Option> { + fn take_conf_change(&mut self) -> Option> { // conf change will not be affected when changing between follower and leader, // so there is no need to check term. self.conf_change.take() } // TODO: seems we don't need to separate conf change from normal entries. - fn set_conf_change(&mut self, cmd: PendingCmd) { + fn set_conf_change(&mut self, cmd: PendingCmd) { self.conf_change = Some(cmd); } } @@ -547,7 +529,7 @@ where .applied_batch .cb_batch .iter() - .flat_map(|(cb, _)| cb.get_trackers()) + .flat_map(|(cb, _)| cb.trackers()) .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) .flatten() .collect(); @@ -586,7 +568,7 @@ where // Invoke callbacks let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) { - for tracker in cb.get_trackers().iter().flat_map(|v| *v) { + for tracker in cb.trackers().iter().flat_map(|v| *v) { tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); @@ -675,7 +657,7 @@ where } /// Calls the callback of `cmd` when the Region is removed. 
-fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { +fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { debug!( "region is removed, notify commands"; "region_id" => region_id, @@ -686,10 +668,10 @@ fn notify_region_removed(region_id: u64, peer_id: u64, mut cmd: PendingCmd) { +pub fn notify_req_region_removed(region_id: u64, cb: impl ErrorCallback) { let region_not_found = Error::RegionNotFound(region_id); let resp = cmd_resp::new_error(region_not_found); - cb.invoke_with_response(resp); + cb.report_error(resp); } /// Calls the callback of `cmd` when it can not be processed further. @@ -697,7 +679,7 @@ fn notify_stale_command( region_id: u64, peer_id: u64, term: u64, - mut cmd: PendingCmd, + mut cmd: PendingCmd, ) { info!( "command is stale, skip"; @@ -709,15 +691,15 @@ fn notify_stale_command( notify_stale_req(term, cmd.cb.take().unwrap()); } -pub fn notify_stale_req(term: u64, cb: Callback) { +pub fn notify_stale_req(term: u64, cb: impl ErrorCallback) { let resp = cmd_resp::err_resp(Error::StaleCommand, term); - cb.invoke_with_response(resp); + cb.report_error(resp); } -pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: Callback) { +pub fn notify_stale_req_with_msg(term: u64, msg: String, cb: impl ErrorCallback) { let mut resp = cmd_resp::err_resp(Error::StaleCommand, term); resp.mut_header().mut_error().set_message(msg); - cb.invoke_with_response(resp); + cb.report_error(resp); } /// Checks if a write is needed to be issued before handling the command. @@ -884,7 +866,7 @@ where pending_remove: bool, /// The commands waiting to be committed and applied - pending_cmds: PendingCmdQueue, + pending_cmds: PendingCmdQueue>, /// The counter of pending request snapshots. See more in `Peer`. 
pending_request_snapshot_count: Arc, @@ -2974,10 +2956,7 @@ pub fn compact_raft_log( Ok(()) } -pub struct Apply -where - S: Snapshot, -{ +pub struct Apply { pub peer_id: u64, pub region_id: u64, pub term: u64, @@ -2985,11 +2964,11 @@ where pub commit_term: u64, pub entries: SmallVec<[CachedEntries; 1]>, pub entries_size: usize, - pub cbs: Vec>, + pub cbs: Vec>, pub bucket_meta: Option>, } -impl Apply { +impl Apply { pub(crate) fn new( peer_id: u64, region_id: u64, @@ -2997,9 +2976,9 @@ impl Apply { commit_index: u64, commit_term: u64, entries: Vec, - cbs: Vec>, + cbs: Vec>, buckets: Option>, - ) -> Apply { + ) -> Apply { let mut entries_size = 0; for e in &entries { entries_size += bytes_capacity(&e.data) + bytes_capacity(&e.context); @@ -3021,7 +3000,7 @@ impl Apply { pub fn on_schedule(&mut self, metrics: &RaftMetrics) { let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Callback::Write { trackers, .. } = &mut cb.cb { + if let Some(trackers) = cb.cb.trackers_mut() { for tracker in trackers { tracker.observe(now, &metrics.store_time, |t| { t.metrics.write_instant = Some(now); @@ -3035,7 +3014,7 @@ impl Apply { } } - fn try_batch(&mut self, other: &mut Apply) -> bool { + fn try_batch(&mut self, other: &mut Apply) -> bool { assert_eq!(self.region_id, other.region_id); assert_eq!(self.peer_id, other.peer_id); if self.entries_size + other.entries_size <= MAX_APPLY_BATCH_SIZE { @@ -3089,21 +3068,18 @@ impl Registration { } #[derive(Debug)] -pub struct Proposal -where - S: Snapshot, -{ +pub struct Proposal { pub is_conf_change: bool, pub index: u64, pub term: u64, - pub cb: Callback, + pub cb: C, /// `propose_time` is set to the last time when a peer starts to renew /// lease. 
pub propose_time: Option, pub must_pass_epoch_check: bool, } -impl HeapSize for Proposal {} +impl HeapSize for Proposal {} pub struct Destroy { region_id: u64, @@ -3252,7 +3228,7 @@ where { Apply { start: Instant, - apply: Apply, + apply: Apply>, }, Registration(Registration), LogsUpToDate(CatchUpLogs), @@ -3273,7 +3249,7 @@ impl Msg where EK: KvEngine, { - pub fn apply(apply: Apply) -> Msg { + pub fn apply(apply: Apply>) -> Msg { Msg::Apply { start: Instant::now(), apply, @@ -3409,7 +3385,11 @@ where /// Handles apply tasks, and uses the apply delegate to handle the committed /// entries. - fn handle_apply(&mut self, apply_ctx: &mut ApplyContext, mut apply: Apply) { + fn handle_apply( + &mut self, + apply_ctx: &mut ApplyContext, + mut apply: Apply>, + ) { if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); } @@ -3483,12 +3463,12 @@ where } /// Handles proposals, and appends the commands to the apply delegate. - fn append_proposal(&mut self, props_drainer: Drain<'_, Proposal>) { + fn append_proposal(&mut self, props_drainer: Drain<'_, Proposal>>) { let (region_id, peer_id) = (self.delegate.region_id(), self.delegate.id()); let propose_num = props_drainer.len(); if self.delegate.stopped { for p in props_drainer { - let cmd = PendingCmd::::new(p.index, p.term, p.cb); + let cmd = PendingCmd::new(p.index, p.term, p.cb); notify_stale_command(region_id, peer_id, self.delegate.term, cmd); } return; @@ -3790,7 +3770,7 @@ where for tracker in apply .cbs .iter() - .flat_map(|p| p.cb.get_trackers()) + .flat_map(|p| p.cb.trackers()) .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) { GLOBAL_TRACKERS.with_tracker(tracker, |t| { @@ -4191,7 +4171,7 @@ where // So only shutdown needs to be checked here. if !tikv_util::thread_group::is_shutdown(!cfg!(test)) { for p in apply.cbs.drain(..) 
{ - let cmd = PendingCmd::::new(p.index, p.term, p.cb); + let cmd = PendingCmd::new(p.index, p.term, p.cb); notify_region_removed(apply.region_id, apply.peer_id, cmd); } } @@ -4323,14 +4303,11 @@ mod memtrace { pub merge_yield: usize, } - impl HeapSize for PendingCmdQueue - where - S: Snapshot, - { + impl HeapSize for PendingCmdQueue { fn heap_size(&self) -> usize { // Some fields of `PendingCmd` are on stack, but ignore them because they are // just some small boxed closures. - self.normals.capacity() * mem::size_of::>() + self.normals.capacity() * mem::size_of::>() } } @@ -4642,7 +4619,7 @@ mod tests { index: u64, term: u64, cb: Callback, - ) -> Proposal { + ) -> Proposal> { Proposal { is_conf_change, index, @@ -4653,13 +4630,13 @@ mod tests { } } - fn apply( + fn apply( peer_id: u64, region_id: u64, term: u64, entries: Vec, - cbs: Vec>, - ) -> Apply { + cbs: Vec>, + ) -> Apply { let (commit_index, commit_term) = entries .last() .map(|e| (e.get_index(), e.get_term())) @@ -4843,7 +4820,7 @@ mod tests { system.shutdown(); } - fn cb(idx: u64, term: u64, tx: Sender) -> Proposal { + fn cb(idx: u64, term: u64, tx: Sender) -> Proposal> { proposal( false, idx, @@ -6423,7 +6400,7 @@ mod tests { #[test] fn pending_cmd_leak() { let res = panic_hook::recover_safe(|| { - let _cmd = PendingCmd::::new(1, 1, Callback::None); + let _cmd = PendingCmd::new(1, 1, Callback::::None); }); res.unwrap_err(); } @@ -6431,7 +6408,7 @@ mod tests { #[test] fn pending_cmd_leak_dtor_not_abort() { let res = panic_hook::recover_safe(|| { - let _cmd = PendingCmd::::new(1, 1, Callback::None); + let _cmd = PendingCmd::new(1, 1, Callback::::None); panic!("Don't abort"); // It would abort and fail if there was a double-panic in PendingCmd // dtor. 
diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 2452f177cff..66ceeea7967 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -93,7 +93,7 @@ use crate::{ }, AbstractPeer, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, - SignificantMsg, SnapKey, StoreMsg, + SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, }; @@ -490,13 +490,7 @@ where let mut cbs = std::mem::take(&mut self.callbacks); let proposed_cbs: Vec = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { proposed_cb, .. } = cb { - proposed_cb.take() - } else { - None - } - }) + .filter_map(|cb| cb.take_proposed_cb()) .collect(); let proposed_cb: Option = if proposed_cbs.is_empty() { None @@ -509,13 +503,7 @@ where }; let committed_cbs: Vec<_> = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { committed_cb, .. } = cb { - committed_cb.take() - } else { - None - } - }) + .filter_map(|cb| cb.take_committed_cb()) .collect(); let committed_cb: Option = if committed_cbs.is_empty() { None @@ -529,13 +517,7 @@ where let tokens: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - .filter_map(|cb| { - if let Callback::Write { trackers, .. } = cb { - Some(trackers[0]) - } else { - None - } - }) + .filter_map(|cb| cb.trackers().map(|t| t[0])) .collect(); let mut cb = Callback::write_ext( @@ -550,7 +532,7 @@ where committed_cb, ); - if let Callback::Write { trackers, .. 
} = &mut cb { + if let Some(trackers) = cb.trackers_mut() { *trackers = tokens; } @@ -4829,7 +4811,7 @@ where if self.ctx.raft_metrics.waterfall_metrics { let now = Instant::now(); - for tracker in cb.get_trackers().iter().flat_map(|v| *v) { + for tracker in cb.trackers().iter().flat_map(|v| *v) { tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { &mut t.metrics.wf_batch_wait_nanos }); diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 947e9e074fd..619a18e3fb5 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -73,9 +73,10 @@ where } } -pub type ReadCallback = Box) + Send>; -pub type WriteCallback = Box; +pub type BoxReadCallback = Box) + Send>; +pub type BoxWriteCallback = Box; pub type ExtCallback = Box; + #[cfg(any(test, feature = "testexport"))] pub type TestCallback = Box; @@ -88,10 +89,10 @@ pub enum Callback { /// No callback. None, /// Read callback. - Read(ReadCallback), + Read(BoxReadCallback), /// Write callback. Write { - cb: WriteCallback, + cb: BoxWriteCallback, /// `proposed_cb` is called after a request is proposed to the raft /// group successfully. It's used to notify the caller to move on early /// because it's very likely the request will be applied to the @@ -101,6 +102,7 @@ pub enum Callback { /// it's being applied, and it's guaranteed that the request will be /// successfully applied soon. committed_cb: Option, + trackers: SmallVec<[TimeTracker; 4]>, }, #[cfg(any(test, feature = "testexport"))] @@ -114,12 +116,12 @@ impl Callback where S: Snapshot, { - pub fn write(cb: WriteCallback) -> Self { + pub fn write(cb: BoxWriteCallback) -> Self { Self::write_ext(cb, None, None) } pub fn write_ext( - cb: WriteCallback, + cb: BoxWriteCallback, proposed_cb: Option, committed_cb: Option, ) -> Self { @@ -142,13 +144,6 @@ where } } - pub fn get_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - match self { - Callback::Write { trackers, .. 
} => Some(trackers), - _ => None, - } - } - pub fn invoke_with_response(self, resp: RaftCmdResponse) { match self { Callback::None => (), @@ -169,27 +164,22 @@ where } } - pub fn has_proposed_cb(&mut self) -> bool { - if let Callback::Write { proposed_cb, .. } = self { - proposed_cb.is_some() - } else { - false - } + pub fn has_proposed_cb(&self) -> bool { + let Callback::Write { proposed_cb, .. } = self else { return false }; + proposed_cb.is_some() } pub fn invoke_proposed(&mut self) { - if let Callback::Write { proposed_cb, .. } = self { - if let Some(cb) = proposed_cb.take() { - cb() - } + let Callback::Write { proposed_cb, .. } = self else { return }; + if let Some(cb) = proposed_cb.take() { + cb(); } } pub fn invoke_committed(&mut self) { - if let Callback::Write { committed_cb, .. } = self { - if let Some(cb) = committed_cb.take() { - cb() - } + let Callback::Write { committed_cb, .. } = self else { return }; + if let Some(cb) = committed_cb.take() { + cb(); } } @@ -200,7 +190,86 @@ where } } - pub fn is_none(&self) -> bool { + pub fn take_proposed_cb(&mut self) -> Option { + let Callback::Write { proposed_cb, .. } = self else { return None }; + proposed_cb.take() + } + + pub fn take_committed_cb(&mut self) -> Option { + let Callback::Write { committed_cb, .. 
} = self else { return None }; + committed_cb.take() + } +} + +pub trait ReadCallback: ErrorCallback { + type Response; + + fn set_result(self, result: Self::Response); +} + +pub trait WriteCallback: ErrorCallback { + type Response; + + fn notify_proposed(&mut self); + fn notify_committed(&mut self); + fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; + fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>>; + fn set_result(self, result: Self::Response); +} + +pub trait ErrorCallback: Send { + fn report_error(self, err: RaftCmdResponse); + fn is_none(&self) -> bool; +} + +impl ReadCallback for Callback { + type Response = ReadResponse; + + #[inline] + fn set_result(self, result: Self::Response) { + self.invoke_read(result); + } +} + +impl WriteCallback for Callback { + type Response = RaftCmdResponse; + + #[inline] + fn notify_proposed(&mut self) { + self.invoke_proposed(); + } + + #[inline] + fn notify_committed(&mut self) { + self.invoke_committed(); + } + + #[inline] + fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { + let Callback::Write { trackers, .. } = self else { return None }; + Some(trackers) + } + + #[inline] + fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { + let Callback::Write { trackers, .. 
} = self else { return None }; + Some(trackers) + } + + #[inline] + fn set_result(self, result: Self::Response) { + self.invoke_with_response(result); + } +} + +impl ErrorCallback for Callback { + #[inline] + fn report_error(self, err: RaftCmdResponse) { + self.invoke_with_response(err); + } + + #[inline] + fn is_none(&self) -> bool { matches!(self, Callback::None) } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 17fe22926d1..b109d107c4f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -93,7 +93,7 @@ use crate::{ }, hibernate_state::GroupState, memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, - msg::{PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + msg::{ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, txn_ext::LocksStatus, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ @@ -101,7 +101,7 @@ use crate::{ ReadProgress, RegionTask, SplitCheckTask, }, Callback, Config, GlobalReplicationState, PdTask, ReadIndexContext, ReadResponse, TxnExt, - RAFT_INIT_LOG_INDEX, + WriteCallback, RAFT_INIT_LOG_INDEX, }, Error, Result, }; @@ -121,16 +121,13 @@ pub enum StaleState { } #[derive(Debug)] -struct ProposalQueue -where - S: Snapshot, -{ +pub struct ProposalQueue { tag: String, - queue: VecDeque>, + queue: VecDeque>, } -impl ProposalQueue { - fn new(tag: String) -> ProposalQueue { +impl ProposalQueue { + fn new(tag: String) -> ProposalQueue { ProposalQueue { tag, queue: VecDeque::new(), @@ -146,7 +143,7 @@ impl ProposalQueue { .and_then(|i| { self.queue[i] .cb - .get_trackers() + .trackers() .map(|ts| (self.queue[i].term, ts)) }) } @@ -159,7 +156,7 @@ impl ProposalQueue { } // Find proposal in front or at the given term and index - fn pop(&mut self, term: u64, index: u64) -> Option> { + fn pop(&mut self, term: u64, index: u64) -> Option> { self.queue.pop_front().and_then(|p| { // Comparing the term first then the index, because the term 
is // increasing among all log entries and the index is increasing @@ -174,7 +171,7 @@ impl ProposalQueue { /// Find proposal at the given term and index and notify stale proposals /// in front that term and index - fn find_proposal(&mut self, term: u64, index: u64, current_term: u64) -> Option> { + fn find_proposal(&mut self, term: u64, index: u64, current_term: u64) -> Option> { while let Some(p) = self.pop(term, index) { if p.term == term { if p.index == index { @@ -193,11 +190,11 @@ impl ProposalQueue { } #[inline] - fn oldest(&self) -> Option<&Proposal> { + fn oldest(&self) -> Option<&Proposal> { self.queue.front() } - fn push(&mut self, p: Proposal) { + fn push(&mut self, p: Proposal) { if let Some(f) = self.queue.back() { // The term must be increasing among all log entries and the index // must be increasing inside a given term @@ -217,7 +214,7 @@ impl ProposalQueue { } } - fn back(&self) -> Option<&Proposal> { + fn back(&self) -> Option<&Proposal> { self.queue.back() } } @@ -730,11 +727,11 @@ where /// Record the last instant of each peer's heartbeat response. pub peer_heartbeats: HashMap, - proposals: ProposalQueue, + proposals: ProposalQueue>, leader_missing_time: Option, #[getset(get = "pub")] leader_lease: Lease, - pending_reads: ReadIndexQueue, + pending_reads: ReadIndexQueue>, /// Threshold of long uncommitted proposals. /// /// Note that this is a dynamically changing value. 
Check the @@ -1646,7 +1643,7 @@ where { let proposal = &self.proposals.queue[idx]; if term == proposal.term { - for tracker in proposal.cb.get_trackers().iter().flat_map(|v| v.iter()) { + for tracker in proposal.cb.trackers().iter().flat_map(|v| v.iter()) { tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { &mut t.metrics.wf_send_proposal_nanos }); @@ -3054,7 +3051,7 @@ where fn response_read( &self, - read: &mut ReadIndexRequest, + read: &mut ReadIndexRequest>, ctx: &mut PollContext, replica_read: bool, ) { @@ -3505,7 +3502,7 @@ where fn post_propose( &mut self, poll_ctx: &mut PollContext, - mut p: Proposal, + mut p: Proposal>, ) { // Try to renew leader lease on every consistent read/write request. if poll_ctx.current_time.is_none() { @@ -3797,7 +3794,11 @@ where ); } - pub fn push_pending_read(&mut self, read: ReadIndexRequest, is_leader: bool) { + pub fn push_pending_read( + &mut self, + read: ReadIndexRequest>, + is_leader: bool, + ) { self.pending_reads.push_back(read, is_leader); } @@ -3822,7 +3823,7 @@ where ); poll_ctx.raft_metrics.propose.unsafe_read_index += 1; cmd_resp::bind_error(&mut err_resp, e); - cb.invoke_with_response(err_resp); + cb.report_error(err_resp); self.should_wake_up = true; return false; } @@ -3899,7 +3900,7 @@ where } self.should_wake_up = true; cmd_resp::bind_error(&mut err_resp, Error::NotLeader(self.region_id, None)); - cb.invoke_with_response(err_resp); + cb.report_error(err_resp); return false; } @@ -5795,7 +5796,7 @@ mod tests { #[test] fn test_propose_queue_find_proposal() { - let mut pq: ProposalQueue = + let mut pq: ProposalQueue> = ProposalQueue::new("tag".to_owned()); let gen_term = |index: u64| (index / 10) + 1; let push_proposal = |pq: &mut ProposalQueue<_>, index: u64| { @@ -5858,7 +5859,7 @@ mod tests { fn must_not_call() -> ExtCallback { Box::new(move || unreachable!()) } - let mut pq: ProposalQueue = + let mut pq: ProposalQueue> = ProposalQueue::new("tag".to_owned()); // (1, 4) and (1, 5) is not 
committed diff --git a/components/raftstore/src/store/read_queue.rs b/components/raftstore/src/store/read_queue.rs index d9261b9fde3..6af9c151810 100644 --- a/components/raftstore/src/store/read_queue.rs +++ b/components/raftstore/src/store/read_queue.rs @@ -4,7 +4,6 @@ use std::{cmp, collections::VecDeque, mem, u64, usize}; use collections::HashMap; -use engine_traits::Snapshot; use kvproto::{ kvrpcpb::LockInfo, raft_cmdpb::{self, RaftCmdRequest}, @@ -21,19 +20,17 @@ use tikv_util::{ use time::Timespec; use uuid::Uuid; +use super::msg::ErrorCallback; use crate::{ - store::{fsm::apply, metrics::*, Callback, Config}, + store::{fsm::apply, metrics::*, Config}, Result, }; const READ_QUEUE_SHRINK_SIZE: usize = 64; -pub struct ReadIndexRequest -where - S: Snapshot, -{ +pub struct ReadIndexRequest { pub id: Uuid, - cmds: MustConsumeVec<(RaftCmdRequest, Callback, Option)>, + cmds: MustConsumeVec<(RaftCmdRequest, C, Option)>, pub propose_time: Timespec, pub read_index: Option, pub addition_request: Option>, @@ -44,24 +41,16 @@ where cmds_heap_size: usize, } -impl ReadIndexRequest -where - S: Snapshot, -{ - const CMD_SIZE: usize = mem::size_of::<(RaftCmdRequest, Callback, Option)>(); +impl ReadIndexRequest { + const CMD_SIZE: usize = mem::size_of::<(RaftCmdRequest, C, Option)>(); - pub fn push_command(&mut self, req: RaftCmdRequest, cb: Callback, read_index: u64) { + pub fn push_command(&mut self, req: RaftCmdRequest, cb: C, read_index: u64) { RAFT_READ_INDEX_PENDING_COUNT.inc(); self.cmds_heap_size += req.heap_size(); self.cmds.push((req, cb, Some(read_index))); } - pub fn with_command( - id: Uuid, - req: RaftCmdRequest, - cb: Callback, - propose_time: Timespec, - ) -> Self { + pub fn with_command(id: Uuid, req: RaftCmdRequest, cb: C, propose_time: Timespec) -> Self { RAFT_READ_INDEX_PENDING_COUNT.inc(); // Ignore heap allocations for `Callback`. 
@@ -81,31 +70,25 @@ where } } - pub fn cmds(&self) -> &[(RaftCmdRequest, Callback, Option)] { + pub fn cmds(&self) -> &[(RaftCmdRequest, C, Option)] { &self.cmds } - pub fn take_cmds(&mut self) -> MustConsumeVec<(RaftCmdRequest, Callback, Option)> { + pub fn take_cmds(&mut self) -> MustConsumeVec<(RaftCmdRequest, C, Option)> { self.cmds_heap_size = 0; self.cmds.take() } } -impl Drop for ReadIndexRequest -where - S: Snapshot, -{ +impl Drop for ReadIndexRequest { fn drop(&mut self) { let dur = (monotonic_raw_now() - self.propose_time).to_std().unwrap(); RAFT_READ_INDEX_PENDING_DURATION.observe(duration_to_sec(dur)); } } -pub struct ReadIndexQueue -where - S: Snapshot, -{ - reads: VecDeque>, +pub struct ReadIndexQueue { + reads: VecDeque>, ready_cnt: usize, // How many requests are handled. handled_cnt: usize, @@ -115,11 +98,8 @@ where retry_countdown: usize, } -impl Default for ReadIndexQueue -where - S: Snapshot, -{ - fn default() -> ReadIndexQueue { +impl Default for ReadIndexQueue { + fn default() -> ReadIndexQueue { ReadIndexQueue { reads: VecDeque::new(), ready_cnt: 0, @@ -130,10 +110,7 @@ where } } -impl ReadIndexQueue -where - S: Snapshot, -{ +impl ReadIndexQueue { /// Check it's necessary to retry pending read requests or not. /// Return true if all such conditions are satisfied: /// 1. 
more than an election timeout elapsed from the last request push; @@ -196,7 +173,7 @@ where self.contexts.clear(); } - pub fn push_back(&mut self, mut read: ReadIndexRequest, is_leader: bool) { + pub fn push_back(&mut self, mut read: ReadIndexRequest, is_leader: bool) { if !is_leader { read.in_contexts = true; let offset = self.handled_cnt + self.reads.len(); @@ -206,15 +183,15 @@ where self.retry_countdown = usize::MAX; } - pub fn back_mut(&mut self) -> Option<&mut ReadIndexRequest> { + pub fn back_mut(&mut self) -> Option<&mut ReadIndexRequest> { self.reads.back_mut() } - pub fn back(&self) -> Option<&ReadIndexRequest> { + pub fn back(&self) -> Option<&ReadIndexRequest> { self.reads.back() } - pub fn last_ready(&self) -> Option<&ReadIndexRequest> { + pub fn last_ready(&self) -> Option<&ReadIndexRequest> { if self.ready_cnt > 0 { return Some(&self.reads[self.ready_cnt - 1]); } @@ -333,7 +310,7 @@ where } } - pub fn pop_front(&mut self) -> Option> { + pub fn pop_front(&mut self) -> Option> { if self.ready_cnt == 0 { return None; } @@ -352,7 +329,7 @@ where /// Raft could have not been ready to handle the poped task. So put it back /// into the queue. 
- pub fn push_front(&mut self, read: ReadIndexRequest) { + pub fn push_front(&mut self, read: ReadIndexRequest) { debug_assert!(read.read_index.is_some()); self.reads.push_front(read); self.ready_cnt += 1; @@ -444,10 +421,7 @@ mod memtrace { use super::*; - impl HeapSize for ReadIndexRequest - where - S: Snapshot, - { + impl HeapSize for ReadIndexRequest { fn heap_size(&self) -> usize { let mut size = self.cmds_heap_size + Self::CMD_SIZE * self.cmds.capacity(); if let Some(ref add) = self.addition_request { @@ -457,13 +431,10 @@ mod memtrace { } } - impl HeapSize for ReadIndexQueue - where - S: Snapshot, - { + impl HeapSize for ReadIndexQueue { #[inline] fn heap_size(&self) -> usize { - let mut size = self.reads.capacity() * mem::size_of::>() + let mut size = self.reads.capacity() * mem::size_of::>() // For one Uuid and one usize. + 24 * self.contexts.len(); for read in &self.reads { @@ -522,10 +493,11 @@ mod tests { use engine_test::kv::KvTestSnapshot; use super::*; + use crate::store::Callback; #[test] fn test_read_queue_fold() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 125, ..Default::default() }; @@ -584,7 +556,7 @@ mod tests { #[test] fn test_become_leader_then_become_follower() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; @@ -628,7 +600,7 @@ mod tests { #[test] fn test_retake_leadership() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; @@ -670,7 +642,7 @@ mod tests { #[test] fn test_advance_replica_reads_out_of_order() { - let mut queue = ReadIndexQueue:: { + let mut queue = ReadIndexQueue::> { handled_cnt: 100, ..Default::default() }; From 6c06f99c1b9dc10811e689cfc979a1b8d4287dd6 Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 19 Aug 2022 15:38:52 +0800 Subject: [PATCH 160/676] raftstore: Simplify raft local metrics (#13307) ref tikv/tikv#12876 Simplify raft local 
metrics by using local counter provided by rust-prometheus Signed-off-by: Connor1996 --- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 79 ++- components/raftstore/src/store/fsm/store.rs | 63 ++- .../raftstore/src/store/local_metrics.rs | 461 +++--------------- components/raftstore/src/store/metrics.rs | 220 ++++----- components/raftstore/src/store/peer.rs | 55 ++- .../raftstore/src/store/worker/metrics.rs | 2 +- 8 files changed, 321 insertions(+), 563 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 668453e708b..156ea55a414 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -245,7 +245,7 @@ impl Peer { self.raft_group_mut().advance_append_async(ready); } - ctx.raft_metrics.ready.has_ready_region += 1; + ctx.raft_metrics.ready.has_ready_region.inc(); } /// Called when an asynchronously write finishes. 
diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index a007d168474..ea796117e2c 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -564,7 +564,7 @@ where cfg_tracker, raft_write_size_limit: cfg.value().raft_write_size_limit.0 as usize, metrics: StoreWriteMetrics::new(cfg.value().waterfall_metrics), - message_metrics: Default::default(), + message_metrics: RaftSendMessageMetrics::default(), perf_context, pending_latency_inspect: vec![], } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 66ceeea7967..9b354fb0842 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -486,7 +486,7 @@ where let cb = self.callbacks.pop().unwrap(); return Some((req, cb)); } - metric.propose.batch += self.callbacks.len() - 1; + metric.propose.batch.inc_by(self.callbacks.len() as u64 - 1); let mut cbs = std::mem::take(&mut self.callbacks); let proposed_cbs: Vec = cbs .iter_mut() @@ -610,8 +610,7 @@ where PeerMsg::RaftCommand(cmd) => { self.ctx .raft_metrics - .propose - .request_wait_time + .propose_wait_time .observe(duration_to_sec(cmd.send_time.saturating_elapsed()) as f64); if let Some(Err(e)) = cmd.extra_opts.deadline.map(|deadline| deadline.check()) { @@ -662,7 +661,7 @@ where PeerMsg::Destroy(peer_id) => { if self.fsm.peer.peer_id() == peer_id { match self.fsm.peer.maybe_destroy(self.ctx) { - None => self.ctx.raft_metrics.message_dropped.applying_snap += 1, + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), Some(job) => { self.handle_destroy_peer(job); } @@ -1820,7 +1819,7 @@ where self.register_entry_cache_evict_tick(); } self.ctx.ready_count += 1; - self.ctx.raft_metrics.ready.has_ready_region += 1; + self.ctx.raft_metrics.ready.has_ready_region.inc(); if self.fsm.peer.leader_unreachable { 
self.fsm.reset_hibernate_state(GroupState::Chaos); @@ -2187,7 +2186,7 @@ where "peer_id" => self.fsm.peer_id(), "err" => ?e, ); - self.ctx.raft_metrics.propose.unsafe_read_index += 1; + self.ctx.raft_metrics.propose.unsafe_read_index.inc(); return; } @@ -2290,7 +2289,7 @@ where "skip {:?} because of disk full", msg_type; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() ); - self.ctx.raft_metrics.message_dropped.disk_full += 1; + self.ctx.raft_metrics.message_dropped.disk_full.inc(); return Ok(()); } @@ -2360,7 +2359,7 @@ where && (msg.get_message().get_from() == raft::INVALID_ID || msg.get_message().get_from() == self.fsm.peer_id()) { - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(()); } self.fsm.peer.step(self.ctx, msg.take_message()) @@ -2516,7 +2515,11 @@ where "to_store_id" => to.get_store_id(), "my_store_id" => self.store_id(), ); - self.ctx.raft_metrics.message_dropped.mismatch_store_id += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_store_id + .inc(); return false; } @@ -2525,7 +2528,11 @@ where "missing epoch in raft message, ignore it"; "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_region_epoch += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_region_epoch + .inc(); return false; } @@ -2577,7 +2584,7 @@ where "peer_id" => self.fsm.peer_id(), "target_peer" => ?target, ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); true } cmp::Ordering::Greater => { @@ -2605,7 +2612,7 @@ where } } } - None => self.ctx.raft_metrics.message_dropped.applying_snap += 1, + None => self.ctx.raft_metrics.message_dropped.applying_snap.inc(), } true } @@ -2710,7 +2717,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); 
return; } // TODO: ask pd to guarantee we are stale now. @@ -2780,7 +2787,7 @@ where "snap" => ?snap_region, "to_peer" => ?msg.get_to_peer(), ); - self.ctx.raft_metrics.message_dropped.region_no_peer += 1; + self.ctx.raft_metrics.message_dropped.region_no_peer.inc(); return Ok(Either::Left(key)); } @@ -2792,7 +2799,7 @@ where "region_id" => self.fsm.region_id(), "peer_id" => self.fsm.peer_id(), ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(Either::Left(key)); } else { panic!( @@ -2826,7 +2833,7 @@ where "region" => ?region, "snap" => ?snap_region, ); - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(Either::Left(key)); } } @@ -2889,7 +2896,7 @@ where } } if is_overlapped { - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(Either::Left(key)); } @@ -4687,7 +4694,11 @@ where ) -> Result> { // Check store_id, make sure that the msg is dispatched to the right place. if let Err(e) = util::check_store_id(msg, self.store_id()) { - self.ctx.raft_metrics.invalid_proposal.mismatch_store_id += 1; + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_store_id + .inc(); return Err(e); } if msg.has_status_request() { @@ -4730,7 +4741,7 @@ where && !allow_replica_read && !allow_stale_read { - self.ctx.raft_metrics.invalid_proposal.not_leader += 1; + self.ctx.raft_metrics.invalid_proposal.not_leader.inc(); let leader = self.fsm.peer.get_peer_from_cache(leader_id); self.fsm.reset_hibernate_state(GroupState::Chaos); self.register_raft_base_tick(); @@ -4738,7 +4749,11 @@ where } // peer_id must be the same as peer's. 
if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { - self.ctx.raft_metrics.invalid_proposal.mismatch_peer_id += 1; + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_peer_id + .inc(); return Err(e); } // check whether the peer is initialized. @@ -4746,13 +4761,18 @@ where self.ctx .raft_metrics .invalid_proposal - .region_not_initialized += 1; + .region_not_initialized + .inc(); return Err(Error::RegionNotInitialized(region_id)); } // If the peer is applying snapshot, it may drop some sending messages, that // could make clients wait for response until timeout. if self.fsm.peer.is_handling_snapshot() { - self.ctx.raft_metrics.invalid_proposal.is_applying_snapshot += 1; + self.ctx + .raft_metrics + .invalid_proposal + .is_applying_snapshot + .inc(); // TODO: replace to a more suitable error. return Err(Error::Other(box_err!( "{} peer is applying snapshot", @@ -4761,7 +4781,7 @@ where } // Check whether the term is stale. if let Err(e) = util::check_term(msg, self.fsm.peer.term()) { - self.ctx.raft_metrics.invalid_proposal.stale_command += 1; + self.ctx.raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } @@ -4773,7 +4793,7 @@ where // driver, the meta is updated. 
let requested_version = msg.get_header().get_region_epoch().version; self.collect_sibling_region(requested_version, &mut new_regions); - self.ctx.raft_metrics.invalid_proposal.epoch_not_match += 1; + self.ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); Err(Error::EpochNotMatch(m, new_regions)) } Err(e) => Err(e), @@ -5016,12 +5036,16 @@ where // [entries...][the entry at `compact_idx`][the last entry][new compaction entry] // |-------------------- entries will be left ----------------------| // ``` - self.ctx.raft_metrics.raft_log_gc_skipped.reserve_log += 1; + self.ctx.raft_metrics.raft_log_gc_skipped.reserve_log.inc(); return; } else if replicated_idx - first_idx < self.ctx.cfg.raft_log_gc_threshold && self.fsm.skip_gc_raft_log_ticks < self.ctx.cfg.raft_log_reserve_max_ticks { - self.ctx.raft_metrics.raft_log_gc_skipped.threshold_limit += 1; + self.ctx + .raft_metrics + .raft_log_gc_skipped + .threshold_limit + .inc(); // Logs will only be kept `max_ticks` * `raft_log_gc_tick_interval`. 
self.fsm.skip_gc_raft_log_ticks += 1; self.register_raft_gc_log_tick(); @@ -5037,7 +5061,8 @@ where self.ctx .raft_metrics .raft_log_gc_skipped - .compact_idx_too_small += 1; + .compact_idx_too_small + .inc(); return; } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3c4e77ff4b9..d6faf92ca85 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -82,7 +82,7 @@ use crate::{ ApplyBatchSystem, ApplyNotifier, ApplyPollerBuilder, ApplyRes, ApplyRouter, ApplyTaskRes, }, - local_metrics::{RaftMetrics, RaftReadyMetrics}, + local_metrics::RaftMetrics, memory::*, metrics::*, peer_storage, @@ -598,7 +598,7 @@ where "msg_type" => ?msg_type, ); - self.raft_metrics.message_dropped.stale_msg += 1; + self.raft_metrics.message_dropped.stale_msg.inc(); let mut gc_msg = RaftMessage::default(); gc_msg.set_region_id(region_id); @@ -765,7 +765,6 @@ pub struct RaftPoller>, peer_msg_buf: Vec>, - previous_metrics: RaftReadyMetrics, timer: TiInstant, poll_ctx: PollContext, messages_per_tick: usize, @@ -773,12 +772,17 @@ pub struct RaftPoller RaftPoller { fn flush_events(&mut self) { self.flush_ticks(); - self.poll_ctx.raft_metrics.flush(); + self.poll_ctx.raft_metrics.maybe_flush(); self.poll_ctx.store_stat.flush(); MEMTRACE_PEERS.trace(mem::take(&mut self.trace_event)); @@ -800,7 +804,9 @@ impl PollHandler, St for<'a> F: FnOnce(&'a BatchSystemConfig), { fail_point!("begin_raft_poller"); - self.previous_metrics = self.poll_ctx.raft_metrics.ready.clone(); + self.previous_append = self.poll_ctx.raft_metrics.ready.append.get(); + self.previous_message = self.poll_ctx.raft_metrics.ready.message.get(); + self.previous_snapshot = self.poll_ctx.raft_metrics.ready.snapshot.get(); self.poll_ctx.pending_count = 0; self.poll_ctx.ready_count = 0; self.poll_ctx.has_ready = false; @@ -1010,17 +1016,20 @@ impl PollHandler, St .raft_metrics .ready .append - 
.saturating_sub(self.previous_metrics.append), + .get() + .saturating_sub(self.previous_append), self.poll_ctx .raft_metrics .ready .message - .saturating_sub(self.previous_metrics.message), + .get() + .saturating_sub(self.previous_message), self.poll_ctx .raft_metrics .ready .snapshot - .saturating_sub(self.previous_metrics.snapshot), + .get() + .saturating_sub(self.previous_snapshot), ); } @@ -1319,7 +1328,6 @@ where tag: tag.clone(), store_msg_buf: Vec::with_capacity(ctx.cfg.messages_per_tick), peer_msg_buf: Vec::with_capacity(ctx.cfg.messages_per_tick), - previous_metrics: ctx.raft_metrics.ready.clone(), timer: TiInstant::now(), messages_per_tick: ctx.cfg.messages_per_tick, poll_ctx: ctx, @@ -1327,6 +1335,9 @@ where trace_event: TraceEvent::default(), last_flush_time: TiInstant::now(), need_flush_events: false, + previous_append: 0, + previous_message: 0, + previous_snapshot: 0, } } } @@ -1757,7 +1768,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER if local_state.get_state() != PeerState::Tombstone { // Maybe split, but not registered yet. 
if !util::is_first_message(msg.get_message()) { - self.ctx.raft_metrics.message_dropped.region_nonexistent += 1; + self.ctx + .raft_metrics + .message_dropped + .region_nonexistent + .inc(); return Err(box_err!( "[region {}] region not exist but not tombstone: {:?}", region_id, @@ -1810,7 +1825,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } // The region in this peer is already destroyed if util::is_epoch_stale(from_epoch, region_epoch) { - self.ctx.raft_metrics.message_dropped.region_tombstone_peer += 1; + self.ctx + .raft_metrics + .message_dropped + .region_tombstone_peer + .inc(); info!( "tombstone peer receives a stale message"; "region_id" => region_id, @@ -1859,7 +1878,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER util::find_peer(region, self.ctx.store_id()).map(|r| r.get_id()) { if to_peer_id <= local_peer_id { - self.ctx.raft_metrics.message_dropped.region_tombstone_peer += 1; + self.ctx + .raft_metrics + .message_dropped + .region_tombstone_peer + .inc(); info!( "tombstone peer receives a stale message, local_peer_id >= to_peer_id in msg"; "region_id" => region_id, @@ -1907,7 +1930,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "to_store_id" => msg.get_to_peer().get_store_id(), "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_store_id += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_store_id + .inc(); return Ok(()); } @@ -1916,7 +1943,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER "missing epoch in raft message, ignore it"; "region_id" => region_id, ); - self.ctx.raft_metrics.message_dropped.mismatch_region_epoch += 1; + self.ctx + .raft_metrics + .message_dropped + .mismatch_region_epoch + .inc(); return Ok(()); } if msg.get_is_tombstone() || msg.has_merge_target() { @@ -1986,7 +2017,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> 
StoreFsmDelegate<'a, EK, ER "region_id" => region_id, "msg_type" => ?msg_type, ); - self.ctx.raft_metrics.message_dropped.stale_msg += 1; + self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(false); } @@ -2129,7 +2160,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } if is_overlapped { - self.ctx.raft_metrics.message_dropped.region_overlap += 1; + self.ctx.raft_metrics.message_dropped.region_overlap.inc(); return Ok(false); } diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 923fb8ffc26..aa33ae49fea 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -6,112 +6,51 @@ use std::sync::{Arc, Mutex}; use collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; +use tikv_util::time::{Duration, Instant}; use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; use super::metrics::*; -/// The buffered metrics counters for raft ready handling. 
-#[derive(Debug, Default, Clone)] -pub struct RaftReadyMetrics { - pub message: u64, - pub commit: u64, - pub append: u64, - pub snapshot: u64, - pub pending_region: u64, - pub has_ready_region: u64, -} +const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -impl RaftReadyMetrics { - /// Flushes all metrics - fn flush(&mut self) { - // reset all buffered metrics once they have been added - if self.message > 0 { - STORE_RAFT_READY_COUNTER.message.inc_by(self.message); - self.message = 0; - } - if self.commit > 0 { - STORE_RAFT_READY_COUNTER.commit.inc_by(self.commit); - self.commit = 0; - } - if self.append > 0 { - STORE_RAFT_READY_COUNTER.append.inc_by(self.append); - self.append = 0; - } - if self.snapshot > 0 { - STORE_RAFT_READY_COUNTER.snapshot.inc_by(self.snapshot); - self.snapshot = 0; - } - if self.pending_region > 0 { - STORE_RAFT_READY_COUNTER - .pending_region - .inc_by(self.pending_region); - self.pending_region = 0; - } - if self.has_ready_region > 0 { - STORE_RAFT_READY_COUNTER - .has_ready_region - .inc_by(self.has_ready_region); - self.has_ready_region = 0; - } - } -} - -pub type SendStatus = [u64; 2]; - -macro_rules! flush_send_status { - ($metrics:ident, $self:ident) => {{ - if $self.$metrics[0] > 0 { - STORE_RAFT_SENT_MESSAGE_COUNTER - .$metrics - .drop - .inc_by($self.$metrics[0]); - $self.$metrics[0] = 0; - } - if $self.$metrics[1] > 0 { - STORE_RAFT_SENT_MESSAGE_COUNTER - .$metrics - .accept - .inc_by($self.$metrics[1]); - $self.$metrics[1] = 0; +macro_rules! set_send_status { + ($metrics:expr, $success:ident) => {{ + if $success { + $metrics.accept.inc(); + } else { + $metrics.drop.inc(); } }}; } -/// The buffered metrics counters for raft message. 
-#[derive(Debug, Default, Clone)] -pub struct RaftSendMessageMetrics { - pub append: SendStatus, - pub append_resp: SendStatus, - pub prevote: SendStatus, - pub prevote_resp: SendStatus, - pub vote: SendStatus, - pub vote_resp: SendStatus, - pub snapshot: SendStatus, - pub heartbeat: SendStatus, - pub heartbeat_resp: SendStatus, - pub transfer_leader: SendStatus, - pub timeout_now: SendStatus, - pub read_index: SendStatus, - pub read_index_resp: SendStatus, +pub struct RaftSendMessageMetrics(RaftSentMessageCounterVec); + +impl Default for RaftSendMessageMetrics { + fn default() -> Self { + Self(RaftSentMessageCounterVec::from( + &STORE_RAFT_SENT_MESSAGE_COUNTER_VEC, + )) + } } impl RaftSendMessageMetrics { pub fn add(&mut self, msg_type: MessageType, success: bool) { - let i = success as usize; match msg_type { - MessageType::MsgAppend => self.append[i] += 1, - MessageType::MsgAppendResponse => self.append_resp[i] += 1, - MessageType::MsgRequestPreVote => self.prevote[i] += 1, - MessageType::MsgRequestPreVoteResponse => self.prevote_resp[i] += 1, - MessageType::MsgRequestVote => self.vote[i] += 1, - MessageType::MsgRequestVoteResponse => self.vote_resp[i] += 1, - MessageType::MsgSnapshot => self.snapshot[i] += 1, - MessageType::MsgHeartbeat => self.heartbeat[i] += 1, - MessageType::MsgHeartbeatResponse => self.heartbeat_resp[i] += 1, - MessageType::MsgTransferLeader => self.transfer_leader[i] += 1, - MessageType::MsgReadIndex => self.read_index[i] += 1, - MessageType::MsgReadIndexResp => self.read_index_resp[i] += 1, - MessageType::MsgTimeoutNow => self.timeout_now[i] += 1, + MessageType::MsgAppend => set_send_status!(self.0.append, success), + MessageType::MsgAppendResponse => set_send_status!(self.0.append_resp, success), + MessageType::MsgRequestPreVote => set_send_status!(self.0.prevote, success), + MessageType::MsgRequestPreVoteResponse => { + set_send_status!(self.0.prevote_resp, success) + } + MessageType::MsgRequestVote => set_send_status!(self.0.vote, 
success), + MessageType::MsgRequestVoteResponse => set_send_status!(self.0.vote_resp, success), + MessageType::MsgSnapshot => set_send_status!(self.0.snapshot, success), + MessageType::MsgHeartbeat => set_send_status!(self.0.heartbeat, success), + MessageType::MsgHeartbeatResponse => set_send_status!(self.0.heartbeat_resp, success), + MessageType::MsgTransferLeader => set_send_status!(self.0.transfer_leader, success), + MessageType::MsgReadIndex => set_send_status!(self.0.read_index, success), + MessageType::MsgReadIndexResp => set_send_status!(self.0.read_index_resp, success), + MessageType::MsgTimeoutNow => set_send_status!(self.0.timeout_now, success), // We do not care about these message types for metrics. // Explicitly declare them so when we add new message types we are forced to // decide. @@ -123,293 +62,30 @@ impl RaftSendMessageMetrics { | MessageType::MsgCheckQuorum => {} } } - /// Flushes all metrics - pub fn flush(&mut self) { - // reset all buffered metrics once they have been added - flush_send_status!(append, self); - flush_send_status!(append_resp, self); - flush_send_status!(prevote, self); - flush_send_status!(prevote_resp, self); - flush_send_status!(vote, self); - flush_send_status!(vote_resp, self); - flush_send_status!(snapshot, self); - flush_send_status!(heartbeat, self); - flush_send_status!(heartbeat_resp, self); - flush_send_status!(transfer_leader, self); - flush_send_status!(timeout_now, self); - flush_send_status!(read_index, self); - flush_send_status!(read_index_resp, self); - } -} - -#[derive(Debug, Default, Clone)] -pub struct RaftMessageDropMetrics { - pub mismatch_store_id: u64, - pub mismatch_region_epoch: u64, - pub stale_msg: u64, - pub region_overlap: u64, - pub region_no_peer: u64, - pub region_tombstone_peer: u64, - pub region_nonexistent: u64, - pub applying_snap: u64, - pub disk_full: u64, -} - -impl RaftMessageDropMetrics { - fn flush(&mut self) { - if self.mismatch_store_id > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - 
.mismatch_store_id - .inc_by(self.mismatch_store_id); - self.mismatch_store_id = 0; - } - if self.mismatch_region_epoch > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .mismatch_region_epoch - .inc_by(self.mismatch_region_epoch); - self.mismatch_region_epoch = 0; - } - if self.stale_msg > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .stale_msg - .inc_by(self.stale_msg); - self.stale_msg = 0; - } - if self.region_overlap > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_overlap - .inc_by(self.region_overlap); - self.region_overlap = 0; - } - if self.region_no_peer > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_no_peer - .inc_by(self.region_no_peer); - self.region_no_peer = 0; - } - if self.region_tombstone_peer > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_tombstone_peer - .inc_by(self.region_tombstone_peer); - self.region_tombstone_peer = 0; - } - if self.region_nonexistent > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .region_nonexistent - .inc_by(self.region_nonexistent); - self.region_nonexistent = 0; - } - if self.applying_snap > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .applying_snap - .inc_by(self.applying_snap); - self.applying_snap = 0; - } - if self.disk_full > 0 { - STORE_RAFT_DROPPED_MESSAGE_COUNTER - .disk_full - .inc_by(self.disk_full); - self.disk_full = 0; - } - } -} - -/// The buffered metrics counters for raft propose. 
-#[derive(Clone)] -pub struct RaftProposeMetrics { - pub all: u64, - pub local_read: u64, - pub read_index: u64, - pub unsafe_read_index: u64, - pub dropped_read_index: u64, - pub normal: u64, - pub batch: usize, - pub transfer_leader: u64, - pub conf_change: u64, - pub request_wait_time: LocalHistogram, -} - -impl Default for RaftProposeMetrics { - fn default() -> RaftProposeMetrics { - RaftProposeMetrics { - all: 0, - local_read: 0, - read_index: 0, - unsafe_read_index: 0, - normal: 0, - transfer_leader: 0, - conf_change: 0, - batch: 0, - dropped_read_index: 0, - request_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), - } - } -} - -impl RaftProposeMetrics { - /// Flushes all metrics - fn flush(&mut self) { - // reset all buffered metrics once they have been added - if self.all > 0 { - PEER_PROPOSAL_COUNTER.all.inc_by(self.all); - self.all = 0; - } - if self.local_read > 0 { - PEER_PROPOSAL_COUNTER.local_read.inc_by(self.local_read); - self.local_read = 0; - } - if self.read_index > 0 { - PEER_PROPOSAL_COUNTER.read_index.inc_by(self.read_index); - self.read_index = 0; - } - if self.unsafe_read_index > 0 { - PEER_PROPOSAL_COUNTER - .unsafe_read_index - .inc_by(self.unsafe_read_index); - self.unsafe_read_index = 0; - } - if self.dropped_read_index > 0 { - PEER_PROPOSAL_COUNTER - .dropped_read_index - .inc_by(self.dropped_read_index); - self.dropped_read_index = 0; - } - if self.normal > 0 { - PEER_PROPOSAL_COUNTER.normal.inc_by(self.normal); - self.normal = 0; - } - if self.transfer_leader > 0 { - PEER_PROPOSAL_COUNTER - .transfer_leader - .inc_by(self.transfer_leader); - self.transfer_leader = 0; - } - if self.conf_change > 0 { - PEER_PROPOSAL_COUNTER.conf_change.inc_by(self.conf_change); - self.conf_change = 0; - } - if self.batch > 0 { - PEER_PROPOSAL_COUNTER.batch.inc_by(self.batch as u64); - self.batch = 0; - } - self.request_wait_time.flush(); - } -} -/// The buffered metrics counter for invalid propose -#[derive(Clone, Default)] -pub struct 
RaftInvalidProposeMetrics { - pub mismatch_store_id: u64, - pub region_not_found: u64, - pub not_leader: u64, - pub mismatch_peer_id: u64, - pub stale_command: u64, - pub epoch_not_match: u64, - pub read_index_no_leader: u64, - pub region_not_initialized: u64, - pub is_applying_snapshot: u64, -} - -impl RaftInvalidProposeMetrics { - fn flush(&mut self) { - if self.mismatch_store_id > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .mismatch_store_id - .inc_by(self.mismatch_store_id); - self.mismatch_store_id = 0; - } - if self.region_not_found > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .region_not_found - .inc_by(self.region_not_found); - self.region_not_found = 0; - } - if self.not_leader > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .not_leader - .inc_by(self.not_leader); - self.not_leader = 0; - } - if self.mismatch_peer_id > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .mismatch_peer_id - .inc_by(self.mismatch_peer_id); - self.mismatch_peer_id = 0; - } - if self.stale_command > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .stale_command - .inc_by(self.stale_command); - self.stale_command = 0; - } - if self.epoch_not_match > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .epoch_not_match - .inc_by(self.epoch_not_match); - self.epoch_not_match = 0; - } - if self.read_index_no_leader > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .read_index_no_leader - .inc_by(self.read_index_no_leader); - self.read_index_no_leader = 0; - } - if self.region_not_initialized > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .region_not_initialized - .inc_by(self.region_not_initialized); - self.region_not_initialized = 0; - } - if self.is_applying_snapshot > 0 { - RAFT_INVALID_PROPOSAL_COUNTER - .is_applying_snapshot - .inc_by(self.is_applying_snapshot); - self.is_applying_snapshot = 0; - } - } -} - -#[derive(Clone, Default)] -pub struct RaftLogGcSkippedMetrics { - pub reserve_log: u64, - pub threshold_limit: u64, - pub compact_idx_too_small: u64, -} - -impl RaftLogGcSkippedMetrics { - fn flush(&mut self) { - if self.reserve_log > 0 { - 
RAFT_LOG_GC_SKIPPED.reserve_log.inc_by(self.reserve_log); - self.reserve_log = 0; - } - if self.threshold_limit > 0 { - RAFT_LOG_GC_SKIPPED - .threshold_limit - .inc_by(self.threshold_limit); - self.threshold_limit = 0; - } - if self.compact_idx_too_small > 0 { - RAFT_LOG_GC_SKIPPED - .compact_idx_too_small - .inc_by(self.compact_idx_too_small); - self.compact_idx_too_small = 0; - } + pub fn flush(&mut self) { + self.0.flush(); } } /// The buffered metrics counters for raft. -#[derive(Clone)] pub struct RaftMetrics { - pub store_time: LocalHistogram, - pub ready: RaftReadyMetrics, + // local counter + pub ready: RaftReadyCounterVec, pub send_message: RaftSendMessageMetrics, - pub message_dropped: RaftMessageDropMetrics, - pub propose: RaftProposeMetrics, + pub message_dropped: RaftDroppedMessageCounterVec, + pub propose: RaftProposalCounterVec, + pub invalid_proposal: RaftInvalidProposalCounterVec, + pub raft_log_gc_skipped: RaftLogGcSkippedCounterVec, + + // local histogram + pub store_time: LocalHistogram, + pub propose_wait_time: LocalHistogram, pub process_ready: LocalHistogram, pub commit_log: LocalHistogram, - pub leader_missing: Arc>>, - pub invalid_proposal: RaftInvalidProposeMetrics, pub write_block_wait: LocalHistogram, + + // waterfall metrics pub waterfall_metrics: bool, pub wf_batch_wait: LocalHistogram, pub wf_send_to_queue: LocalHistogram, @@ -417,23 +93,31 @@ pub struct RaftMetrics { pub wf_persist_log: LocalHistogram, pub wf_commit_log: LocalHistogram, pub wf_commit_not_persist_log: LocalHistogram, - pub raft_log_gc_skipped: RaftLogGcSkippedMetrics, + + pub leader_missing: Arc>>, + + last_flush_time: Instant, } impl RaftMetrics { pub fn new(waterfall_metrics: bool) -> Self { Self { + ready: RaftReadyCounterVec::from(&STORE_RAFT_READY_COUNTER_VEC), + send_message: RaftSendMessageMetrics::default(), + message_dropped: RaftDroppedMessageCounterVec::from( + &STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC, + ), + propose: 
RaftProposalCounterVec::from(&PEER_PROPOSAL_COUNTER_VEC), + invalid_proposal: RaftInvalidProposalCounterVec::from( + &RAFT_INVALID_PROPOSAL_COUNTER_VEC, + ), + raft_log_gc_skipped: RaftLogGcSkippedCounterVec::from(&RAFT_LOG_GC_SKIPPED_VEC), store_time: STORE_TIME_HISTOGRAM.local(), - ready: Default::default(), - send_message: Default::default(), - message_dropped: Default::default(), - propose: Default::default(), + propose_wait_time: REQUEST_WAIT_TIME_HISTOGRAM.local(), process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), commit_log: PEER_COMMIT_LOG_HISTOGRAM.local(), - leader_missing: Arc::default(), - invalid_proposal: Default::default(), write_block_wait: STORE_WRITE_MSG_BLOCK_WAIT_DURATION_HISTOGRAM.local(), waterfall_metrics, wf_batch_wait: STORE_WF_BATCH_WAIT_DURATION_HISTOGRAM.local(), @@ -442,22 +126,32 @@ impl RaftMetrics { wf_persist_log: STORE_WF_PERSIST_LOG_DURATION_HISTOGRAM.local(), wf_commit_log: STORE_WF_COMMIT_LOG_DURATION_HISTOGRAM.local(), wf_commit_not_persist_log: STORE_WF_COMMIT_NOT_PERSIST_LOG_DURATION_HISTOGRAM.local(), - raft_log_gc_skipped: RaftLogGcSkippedMetrics::default(), + leader_missing: Arc::default(), + last_flush_time: Instant::now_coarse(), } } - /// Flushs all metrics - pub fn flush(&mut self) { - self.store_time.flush(); + /// Flushes all metrics + pub fn maybe_flush(&mut self) { + if self.last_flush_time.saturating_elapsed() < Duration::from_millis(METRICS_FLUSH_INTERVAL) + { + return; + } + self.last_flush_time = Instant::now_coarse(); + self.ready.flush(); self.send_message.flush(); + self.message_dropped.flush(); self.propose.flush(); + self.invalid_proposal.flush(); + self.raft_log_gc_skipped.flush(); + + self.store_time.flush(); + self.propose_wait_time.flush(); self.process_ready.flush(); self.commit_log.flush(); - self.message_dropped.flush(); - self.invalid_proposal.flush(); self.write_block_wait.flush(); - self.raft_log_gc_skipped.flush(); + if self.waterfall_metrics { 
self.wf_batch_wait.flush(); self.wf_send_to_queue.flush(); @@ -466,6 +160,7 @@ impl RaftMetrics { self.wf_commit_log.flush(); self.wf_commit_not_persist_log.flush(); } + let mut missing = self.leader_missing.lock().unwrap(); LEADER_MISSING.set(missing.len() as i64); missing.clear(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 587b9ad3af7..ad4ee7e7f98 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -15,17 +15,6 @@ make_auto_flush_static_metric! { write_thread_wait, db_mutex_lock_nanos, } - pub label_enum ProposalType { - all, - local_read, - read_index, - unsafe_read_index, - normal, - transfer_leader, - conf_change, - batch, - dropped_read_index, - } pub label_enum WriteCmdType { put, @@ -53,43 +42,6 @@ make_auto_flush_static_metric! { success, } - pub label_enum RaftReadyType { - message, - commit, - append, - snapshot, - pending_region, - has_ready_region, - } - - pub label_enum MessageCounterType { - append, - append_resp, - prevote, - prevote_resp, - vote, - vote_resp, - snapshot, - heartbeat, - heartbeat_resp, - transfer_leader, - timeout_now, - read_index, - read_index_resp, - } - - pub label_enum RaftDroppedMessage { - mismatch_store_id, - mismatch_region_epoch, - stale_msg, - region_overlap, - region_no_peer, - region_tombstone_peer, - region_nonexistent, - applying_snap, - disk_full, - } - pub label_enum SnapValidationType { stale, decode, @@ -126,17 +78,7 @@ make_auto_flush_static_metric! { fetch_unused, } - pub label_enum RaftInvalidProposal { - mismatch_store_id, - region_not_found, - not_leader, - mismatch_peer_id, - stale_command, - epoch_not_match, - read_index_no_leader, - region_not_initialized, - is_applying_snapshot, - } + pub label_enum RaftEventDurationType { compact_check, pd_store_heartbeat, @@ -154,23 +96,10 @@ make_auto_flush_static_metric! 
{ skip_partition, } - pub label_enum SendStatus { - accept, - drop, - } - - pub label_enum RaftLogGcSkippedReason { - reserve_log, - compact_idx_too_small, - threshold_limit, - } - pub struct RaftEventDuration : LocalHistogram { "type" => RaftEventDurationType } - pub struct RaftInvalidProposalCount : LocalIntCounter { - "type" => RaftInvalidProposal - } + pub struct RaftEntryFetches : LocalIntCounter { "type" => RaftEntryType } @@ -184,9 +113,6 @@ make_auto_flush_static_metric! { "type" => RegionHashType, "result" => RegionHashResult, } - pub struct ProposalVec: LocalIntCounter { - "type" => ProposalType, - } pub struct AdminCmdVec : LocalIntCounter { "type" => AdminCmdType, @@ -197,19 +123,6 @@ make_auto_flush_static_metric! { "type" => WriteCmdType, } - pub struct RaftReadyVec : LocalIntCounter { - "type" => RaftReadyType, - } - - pub struct MessageCounterVec : LocalIntCounter { - "type" => MessageCounterType, - "status" => SendStatus, - } - - pub struct RaftDropedVec : LocalIntCounter { - "type" => RaftDroppedMessage, - } - pub struct SnapValidVec : LocalIntCounter { "type" => SnapValidationType } @@ -221,18 +134,79 @@ make_auto_flush_static_metric! { "cf" => CfNames, "type" => CompactionGuardAction, } - - pub struct RaftLogGcSkippedVec: LocalIntCounter { - "reason" => RaftLogGcSkippedReason, - } } make_static_metric! 
{ - pub struct HibernatedPeerStateGauge: IntGauge { - "state" => { - awaken, - hibernated, - }, + pub label_enum RaftReadyType { + message, + commit, + append, + snapshot, + pending_region, + has_ready_region, + } + + pub label_enum RaftSentMessageCounterType { + append, + append_resp, + prevote, + prevote_resp, + vote, + vote_resp, + snapshot, + heartbeat, + heartbeat_resp, + transfer_leader, + timeout_now, + read_index, + read_index_resp, + } + + pub label_enum SendStatus { + accept, + drop, + } + + pub label_enum RaftDroppedMessage { + mismatch_store_id, + mismatch_region_epoch, + stale_msg, + region_overlap, + region_no_peer, + region_tombstone_peer, + region_nonexistent, + applying_snap, + disk_full, + } + + pub label_enum ProposalType { + all, + local_read, + read_index, + unsafe_read_index, + normal, + transfer_leader, + conf_change, + batch, + dropped_read_index, + } + + pub label_enum RaftInvalidProposal { + mismatch_store_id, + region_not_found, + not_leader, + mismatch_peer_id, + stale_command, + epoch_not_match, + read_index_no_leader, + region_not_initialized, + is_applying_snapshot, + } + + pub label_enum RaftLogGcSkippedReason { + reserve_log, + compact_idx_too_small, + threshold_limit, } pub label_enum LoadBaseSplitEventType { @@ -262,9 +236,42 @@ make_static_metric! 
{ unable_to_split_cpu_top, } + pub struct HibernatedPeerStateGauge: IntGauge { + "state" => { + awaken, + hibernated, + }, + } + + pub struct RaftReadyCounterVec : LocalIntCounter { + "type" => RaftReadyType, + } + + pub struct RaftSentMessageCounterVec : LocalIntCounter { + "type" => RaftSentMessageCounterType, + "status" => SendStatus, + } + + pub struct RaftDroppedMessageCounterVec : LocalIntCounter { + "type" => RaftDroppedMessage, + } + + pub struct RaftProposalCounterVec: LocalIntCounter { + "type" => ProposalType, + } + + pub struct RaftInvalidProposalCounterVec : LocalIntCounter { + "type" => RaftInvalidProposal + } + + pub struct RaftLogGcSkippedCounterVec: LocalIntCounter { + "reason" => RaftLogGcSkippedReason, + } + pub struct LoadBaseSplitEventCounterVec: IntCounter { "type" => LoadBaseSplitEventType, } + } lazy_static! { @@ -404,8 +411,6 @@ lazy_static! { "Total number of proposal made.", &["type"] ).unwrap(); - pub static ref PEER_PROPOSAL_COUNTER: ProposalVec = - auto_flush_from!(PEER_PROPOSAL_COUNTER_VEC, ProposalVec); pub static ref PEER_ADMIN_CMD_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -452,8 +457,6 @@ lazy_static! { "Total number of raft ready handled.", &["type"] ).unwrap(); - pub static ref STORE_RAFT_READY_COUNTER: RaftReadyVec = - auto_flush_from!(STORE_RAFT_READY_COUNTER_VEC, RaftReadyVec); pub static ref STORE_RAFT_SENT_MESSAGE_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -461,8 +464,6 @@ lazy_static! { "Total number of raft ready sent messages.", &["type", "status"] ).unwrap(); - pub static ref STORE_RAFT_SENT_MESSAGE_COUNTER: MessageCounterVec = - auto_flush_from!(STORE_RAFT_SENT_MESSAGE_COUNTER_VEC, MessageCounterVec); pub static ref STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( @@ -470,8 +471,6 @@ lazy_static! 
{ "Total number of raft dropped messages.", &["type"] ).unwrap(); - pub static ref STORE_RAFT_DROPPED_MESSAGE_COUNTER: RaftDropedVec = - auto_flush_from!(STORE_RAFT_DROPPED_MESSAGE_COUNTER_VEC, RaftDropedVec); pub static ref STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC: IntGaugeVec = register_int_gauge_vec!( @@ -632,8 +631,6 @@ lazy_static! { "Total number of raft invalid proposal.", &["type"] ).unwrap(); - pub static ref RAFT_INVALID_PROPOSAL_COUNTER: RaftInvalidProposalCount = - auto_flush_from!(RAFT_INVALID_PROPOSAL_COUNTER_VEC, RaftInvalidProposalCount); pub static ref RAFT_EVENT_DURATION_VEC: HistogramVec = register_histogram_vec!( @@ -714,11 +711,10 @@ lazy_static! { exponential_buckets(8.0, 2.0, 24).unwrap() ).unwrap(); - pub static ref RAFT_ENTRIES_CACHES_GAUGE: IntGauge = register_int_gauge!( "tikv_raft_entries_caches", "Total memory size of raft entries caches." - ).unwrap(); + ).unwrap(); pub static ref RAFT_ENTRIES_EVICT_BYTES: IntCounter = register_int_counter!( "tikv_raft_entries_evict_bytes", @@ -775,12 +771,10 @@ lazy_static! 
{ &["reason"] ) .unwrap(); - pub static ref RAFT_LOG_GC_SKIPPED: RaftLogGcSkippedVec = - auto_flush_from!(RAFT_LOG_GC_SKIPPED_VEC, RaftLogGcSkippedVec); pub static ref RAFT_APPLYING_SST_GAUGE: IntGaugeVec = register_int_gauge_vec!( "tikv_raft_applying_sst", "Sum of applying sst.", &["type"] - ).unwrap(); + ).unwrap(); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index b109d107c4f..89ed6eeef7d 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -70,7 +70,7 @@ use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, RaftReadyMetrics, TimeTracker}, + local_metrics::{RaftMetrics, TimeTracker}, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -1569,19 +1569,28 @@ where self.raft_group.snap() } - fn add_ready_metric(&self, ready: &Ready, metrics: &mut RaftReadyMetrics) { - metrics.message += ready.messages().len() as u64; - metrics.commit += ready.committed_entries().len() as u64; - metrics.append += ready.entries().len() as u64; + fn add_ready_metric(&self, ready: &Ready, metrics: &mut RaftMetrics) { + metrics.ready.message.inc_by(ready.messages().len() as u64); + metrics + .ready + .commit + .inc_by(ready.committed_entries().len() as u64); + metrics.ready.append.inc_by(ready.entries().len() as u64); if !ready.snapshot().is_empty() { - metrics.snapshot += 1; + metrics.ready.snapshot.inc(); } } - fn add_light_ready_metric(&self, light_ready: &LightReady, metrics: &mut RaftReadyMetrics) { - metrics.message += light_ready.messages().len() as u64; - metrics.commit += light_ready.committed_entries().len() as u64; + fn add_light_ready_metric(&self, light_ready: &LightReady, metrics: &mut RaftMetrics) { + metrics + .ready + .message + .inc_by(light_ready.messages().len() as u64); + metrics + .ready + .commit + .inc_by(light_ready.committed_entries().len() as u64); 
} #[inline] @@ -2490,7 +2499,7 @@ where let mut ready = self.raft_group.ready(); - self.add_ready_metric(&ready, &mut ctx.raft_metrics.ready); + self.add_ready_metric(&ready, &mut ctx.raft_metrics); // Update it after unstable entries pagination is introduced. debug_assert!(ready.entries().last().map_or_else( @@ -2642,7 +2651,7 @@ where // needs to be persisted. let mut light_rd = self.raft_group.advance_append(ready); - self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); + self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics); if let Some(idx) = light_rd.commit_index() { panic!( @@ -3012,7 +3021,7 @@ where } self.mut_store().update_cache_persisted(persist_index); - self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics.ready); + self.add_light_ready_metric(&light_rd, &mut ctx.raft_metrics); if let Some(commit_index) = light_rd.commit_index() { let pre_commit_index = self.get_store().commit_index(); @@ -3397,7 +3406,7 @@ where return false; } - ctx.raft_metrics.propose.all += 1; + ctx.raft_metrics.propose.all.inc(); let req_admin_cmd_type = if !req.has_admin_request() { None @@ -3730,7 +3739,7 @@ where req: RaftCmdRequest, cb: Callback, ) { - ctx.raft_metrics.propose.local_read += 1; + ctx.raft_metrics.propose.local_read.inc(); cb.invoke_read(self.handle_read(ctx, req, false, Some(self.get_store().commit_index()))) } @@ -3821,7 +3830,7 @@ where "peer_id" => self.peer.get_id(), "err" => ?e, ); - poll_ctx.raft_metrics.propose.unsafe_read_index += 1; + poll_ctx.raft_metrics.propose.unsafe_read_index.inc(); cmd_resp::bind_error(&mut err_resp, e); cb.report_error(err_resp); self.should_wake_up = true; @@ -3873,7 +3882,11 @@ where // which would cause a long time waiting for a read response. Then we // should return an error directly in this situation. 
if !self.is_leader() && self.leader_id() == INVALID_ID { - poll_ctx.raft_metrics.invalid_proposal.read_index_no_leader += 1; + poll_ctx + .raft_metrics + .invalid_proposal + .read_index_no_leader + .inc(); // The leader may be hibernated, send a message for trying to awaken the leader. if self.bcast_wake_up_time.is_none() || self @@ -3904,7 +3917,7 @@ where return false; } - poll_ctx.raft_metrics.propose.read_index += 1; + poll_ctx.raft_metrics.propose.read_index.inc(); self.bcast_wake_up_time = None; let request = req @@ -3916,7 +3929,7 @@ where if dropped && self.is_leader() { // The message gets dropped silently, can't be handled anymore. apply::notify_stale_req(self.term(), cb); - poll_ctx.raft_metrics.propose.dropped_read_index += 1; + poll_ctx.raft_metrics.propose.dropped_read_index.inc(); return false; } @@ -4264,7 +4277,7 @@ where return Err(Error::ProposalInMergingMode(self.region_id)); } - poll_ctx.raft_metrics.propose.normal += 1; + poll_ctx.raft_metrics.propose.normal.inc(); if self.has_applied_to_current_term() { // Only when applied index's term is equal to current leader's term, the @@ -4425,7 +4438,7 @@ where req: RaftCmdRequest, cb: Callback, ) -> bool { - ctx.raft_metrics.propose.transfer_leader += 1; + ctx.raft_metrics.propose.transfer_leader.inc(); let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); let prs = self.raft_group.raft.prs(); @@ -4544,7 +4557,7 @@ where self.check_conf_change(ctx, changes.as_ref(), &cc)?; - ctx.raft_metrics.propose.conf_change += 1; + ctx.raft_metrics.propose.conf_change.inc(); // TODO: use local histogram metrics PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); info!( diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index fa27ea340b8..a0732043d1b 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -78,7 +78,7 @@ thread_local! 
{ ); } -const METRICS_FLUSH_INTERVAL: u64 = 15_000; // 15s +const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s pub fn maybe_tls_local_read_metrics_flush() { TLS_LOCAL_READ_METRICS.with(|m| { From 5fe01e05a373b13f0bf41025df2c6da2da3835ee Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 19 Aug 2022 16:02:52 +0800 Subject: [PATCH 161/676] util: use local histogram to record schedule wait duration (#13285) closes tikv/tikv#13293 It is too expensive to record the schedule wait duration of thread pools with shared atomics. The better way is to use local metrics and flush them at intervals. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- components/tikv_util/src/yatp_pool/mod.rs | 18 ++++++++++++------ src/server/raft_client.rs | 1 + 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index e2e57c9fbce..6e246d6cddf 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -7,7 +7,7 @@ use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; -use prometheus::Histogram; +use prometheus::{local::LocalHistogram, Histogram}; use yatp::{ pool::{CloneRunnerBuilder, Local, Runner}, queue::{multilevel, QueueType, TaskCell as _}, @@ -45,13 +45,15 @@ impl TickerWrapper { } } - pub fn try_tick(&mut self) { + // Returns whether tick has been triggered. + pub fn try_tick(&mut self) -> bool { let now = Instant::now_coarse(); if now.saturating_duration_since(self.last_tick_time) < tick_interval() { - return; + return false; } self.last_tick_time = now; self.ticker.on_tick(); + true } pub fn on_tick(&mut self) { @@ -93,7 +95,7 @@ pub struct YatpPoolRunner { before_pause: Option>, // Statistics about the schedule wait duration. 
- schedule_wait_duration: Histogram, + schedule_wait_duration: LocalHistogram, } impl Runner for YatpPoolRunner { @@ -118,7 +120,9 @@ impl Runner for YatpPoolRunner { .observe(schedule_time.elapsed().as_secs_f64()); } let finished = self.inner.handle(local, task_cell); - self.ticker.try_tick(); + if self.ticker.try_tick() { + self.schedule_wait_duration.flush(); + } finished } @@ -160,7 +164,7 @@ impl YatpPoolRunner { after_start, before_stop, before_pause, - schedule_wait_duration, + schedule_wait_duration: schedule_wait_duration.local(), } } } @@ -334,6 +338,8 @@ mod tests { for _ in 0..3 { rx.recv().unwrap(); } + // Drop the pool so the local metrics are flushed. + drop(pool); let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); } diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index df1a18ab06d..bc0e8a59303 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1012,6 +1012,7 @@ where self.last_hash.1 as usize }; + #[allow(unused_mut)] let mut transport_on_send_store_fp = || { fail_point!( "transport_on_send_snapshot", From 58fa80e0de0d43d473dc456081a4d2b08939e0aa Mon Sep 17 00:00:00 2001 From: cosven Date: Fri, 19 Aug 2022 17:52:52 +0800 Subject: [PATCH 162/676] storage: precheck whether the peer is leader when acquiring latches failed (#13254) close tikv/tikv#12966, ref tikv/tikv#12966 When a tikv is isolated from other tikv instances, some requests will be blocked in raftstore and the corresponding latches are not released. Following requests which require the latches will receive ServerIsBusy error and keep retrying. However, In such case, peers on the tikv are not leader anymore. The client is supposed to receive NotLeader error immediately. This commit introduces fail fast mode to scheduler. When a request fails to acquire any latch, scheduler checks if the peer is still leader. 
If it still the leader, schedule the request as usual, fail fast otherwise. Signed-off-by: cosven Co-authored-by: Ti Chi Robot --- components/backup/src/endpoint.rs | 11 +- .../src/coprocessor/region_info_accessor.rs | 67 +++++-- components/server/src/server.rs | 1 + components/test_raftstore/src/server.rs | 18 +- components/tikv_kv/src/lib.rs | 5 + components/tikv_kv/src/rocksdb_engine.rs | 25 ++- src/server/raftkv.rs | 15 +- src/storage/metrics.rs | 2 + src/storage/txn/scheduler.rs | 164 ++++++++++++++++-- tests/benches/misc/raftkv/mod.rs | 15 +- tests/integrations/storage/test_raftkv.rs | 49 ++++++ 11 files changed, 313 insertions(+), 59 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 35a08c81a2d..e0ea9e3ae28 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -849,8 +849,8 @@ impl Endpoint { self.pool.borrow_mut().spawn(async move { loop { - // when get the guard, release it until we finish scanning a batch, - // because if we were suspended during scanning, + // when get the guard, release it until we finish scanning a batch, + // because if we were suspended during scanning, // the region info have higher possibility to change (then we must compensate that by the fine-grained backup). 
let guard = limit.guard().await; if let Err(e) = guard { @@ -1174,11 +1174,12 @@ pub mod tests { use std::{ fs, path::{Path, PathBuf}, - sync::Mutex, + sync::{Mutex, RwLock}, time::Duration, }; use api_version::{api_v2::RAW_KEY_PREFIX, dispatch_api_version, KvFormat, RawValue}; + use collections::HashSet; use engine_traits::MiscExt; use external_storage_export::{make_local_backend, make_noop_backend}; use file_system::{IoOp, IoRateLimiter, IoType}; @@ -1213,7 +1214,9 @@ pub mod tests { impl MockRegionInfoProvider { pub fn new(encode_key: bool) -> Self { MockRegionInfoProvider { - regions: Arc::new(Mutex::new(RegionCollector::new())), + regions: Arc::new(Mutex::new(RegionCollector::new(Arc::new(RwLock::new( + HashSet::default(), + ))))), cancel: None, need_encode_key: encode_key, } diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index fb6defbc375..8f9021c8e60 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -6,11 +6,11 @@ use std::{ Bound::{Excluded, Unbounded}, }, fmt::{Display, Formatter, Result as FmtResult}, - sync::{mpsc, Mutex}, + sync::{mpsc, Arc, Mutex, RwLock}, time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use engine_traits::KvEngine; use kvproto::metapb::Region; use raft::StateRole; @@ -219,11 +219,14 @@ pub struct RegionCollector { regions: RegionsMap, // BTreeMap: data_end_key -> region_id region_ranges: RegionRangesMap, + + region_leaders: Arc>>, } impl RegionCollector { - pub fn new() -> Self { + pub fn new(region_leaders: Arc>>) -> Self { Self { + region_leaders, regions: HashMap::default(), region_ranges: BTreeMap::default(), } @@ -337,11 +340,21 @@ impl RegionCollector { "region_id" => region.get_id(), ) } + self.region_leaders + .write() + .unwrap() + .remove(®ion.get_id()); } fn handle_role_change(&mut self, region: Region, 
new_role: StateRole) { let region_id = region.get_id(); + if new_role == StateRole::Leader { + self.region_leaders.write().unwrap().insert(region_id); + } else { + self.region_leaders.write().unwrap().remove(®ion_id); + } + if let Some(r) = self.regions.get_mut(®ion_id) { r.role = new_role; return; @@ -507,12 +520,6 @@ impl RegionCollector { } } -impl Default for RegionCollector { - fn default() -> Self { - Self::new() - } -} - impl Runnable for RegionCollector { type Task = RegionInfoQuery; @@ -585,6 +592,11 @@ pub struct RegionInfoAccessor { // https://github.com/tikv/tikv/issues/9044 worker: Worker, scheduler: Scheduler, + + /// Region leader ids set on the store. + /// + /// Others can access this info directly, such as RaftKV. + region_leaders: Arc>>, } impl RegionInfoAccessor { @@ -593,11 +605,24 @@ impl RegionInfoAccessor { /// once. If it's needed in different places, just clone it, and their /// contents are shared. pub fn new(host: &mut CoprocessorHost) -> Self { + let region_leaders = Arc::new(RwLock::new(HashSet::default())); let worker = WorkerBuilder::new("region-collector-worker").create(); - let scheduler = worker.start_with_timer("region-collector-worker", RegionCollector::new()); + let scheduler = worker.start_with_timer( + "region-collector-worker", + RegionCollector::new(region_leaders.clone()), + ); register_region_event_listener(host, scheduler.clone()); - Self { worker, scheduler } + Self { + worker, + scheduler, + region_leaders, + } + } + + /// Get a set of region leader ids. + pub fn region_leaders(&self) -> Arc>> { + self.region_leaders.clone() } /// Stops the `RegionInfoAccessor`. It should be stopped after raftstore. 
@@ -711,6 +736,10 @@ impl RegionInfoProvider for MockRegionInfoProvider { mod tests { use super::*; + fn new_region_collector() -> RegionCollector { + RegionCollector::new(Arc::new(RwLock::new(HashSet::default()))) + } + fn new_region(id: u64, start_key: &[u8], end_key: &[u8], version: u64) -> Region { let mut region = Region::default(); region.set_id(id); @@ -910,7 +939,7 @@ mod tests { #[test] fn test_ignore_invalid_version() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); c.handle_raftstore_event(RaftStoreEvent::CreateRegion { region: new_region(1, b"k1", b"k3", 0), @@ -939,7 +968,7 @@ mod tests { region_with_conf(6, b"k7", b"", 20, 10), ]; - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); must_load_regions(&mut c, regions); assert!(c.check_region_range(®ion_with_conf(1, b"", b"k1", 10, 10), false)); @@ -1002,7 +1031,7 @@ mod tests { new_region(6, b"k7", b"", 1), ]; - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); must_load_regions(&mut c, &init_regions); let mut regions: Vec<_> = init_regions .iter() @@ -1033,7 +1062,7 @@ mod tests { check_collection(&c, &[]); // Test that the region with the same id will be kept in the collection - c = RegionCollector::new(); + c = new_region_collector(); must_load_regions(&mut c, &init_regions); c.check_region_range(&new_region(3, b"k1", b"k7", 2), true); @@ -1052,7 +1081,7 @@ mod tests { #[test] fn test_basic_updating() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), @@ -1120,7 +1149,7 @@ mod tests { /// correct, no matter what the events' order to happen is. /// Values in `seq` and of `derive_index` start from 1. 
fn test_split_impl(derive_index: usize, seq: &[usize]) { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), @@ -1173,7 +1202,7 @@ mod tests { } fn test_merge_impl(to_left: bool, update_first: bool) { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ region_with_conf(1, b"", b"k1", 1, 1), region_with_conf(2, b"k1", b"k2", 1, 100), @@ -1217,7 +1246,7 @@ mod tests { #[test] fn test_extreme_cases() { - let mut c = RegionCollector::new(); + let mut c = new_region_collector(); let init_regions = &[ new_region(1, b"", b"k1", 1), new_region(2, b"k1", b"k9", 1), diff --git a/components/server/src/server.rs b/components/server/src/server.rs index f61d981a912..35a06d1321f 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -574,6 +574,7 @@ where ), ), engines.kv.clone(), + self.region_info_accessor.region_leaders(), ); self.engines = Some(TikvEngines { diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index f69ef253e5b..683de2e5a7d 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -289,15 +289,19 @@ impl ServerCluster { StoreMetaDelegate::new(store_meta.clone(), engines.kv.clone()), router.clone(), ); - let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader); - let sim_router = SimulateTransport::new(raft_router.clone()); - - let raft_engine = RaftKv::new(sim_router.clone(), engines.kv.clone()); // Create coprocessor. 
let mut coprocessor_host = CoprocessorHost::new(router.clone(), cfg.coprocessor.clone()); let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + let raft_router = ServerRaftStoreRouter::new(router.clone(), local_reader); + let sim_router = SimulateTransport::new(raft_router.clone()); + let raft_engine = RaftKv::new( + sim_router.clone(), + engines.kv.clone(), + region_info_accessor.region_leaders(), + ); + if let Some(hooks) = self.coprocessor_hooks.get(&node_id) { for hook in hooks { hook(&mut coprocessor_host); @@ -313,7 +317,11 @@ impl ServerCluster { raft_engine.clone(), )); - let mut engine = RaftKv::new(sim_router.clone(), engines.kv.clone()); + let mut engine = RaftKv::new( + sim_router.clone(), + engines.kv.clone(), + region_info_accessor.region_leaders(), + ); if let Some(scheduler) = self.txn_extra_schedulers.remove(&node_id) { engine.set_txn_extra_scheduler(scheduler); } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index dea3c0dc745..466bd973906 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -272,6 +272,11 @@ pub trait Engine: Send + Clone + 'static { fn async_snapshot(&self, ctx: SnapContext<'_>, cb: Callback) -> Result<()>; + /// Precheck request which has write with it's context. + fn precheck_write_with_ctx(&self, _ctx: &Context) -> Result<()> { + Ok(()) + } + fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()>; /// Writes data to the engine asynchronously with some extensions. 
diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 44d5e698f5c..031b182b9fe 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -129,6 +129,15 @@ impl RocksEngine { self.not_leader.store(true, Ordering::SeqCst); } + fn not_leader_error(&self) -> Error { + let not_leader = { + let mut header = kvproto::errorpb::Error::default(); + header.mut_not_leader().set_region_id(100); + header + }; + Error::from(ErrorInner::Request(not_leader)) + } + pub fn pause(&self, dur: Duration) { self.sched.schedule(Task::Pause(dur)).unwrap(); } @@ -209,6 +218,13 @@ impl Engine for RocksEngine { write_modifies(&self.engines.kv, modifies) } + fn precheck_write_with_ctx(&self, _ctx: &Context) -> Result<()> { + if self.not_leader.load(Ordering::SeqCst) { + return Err(self.not_leader_error()); + } + Ok(()) + } + fn async_write(&self, ctx: &Context, batch: WriteData, cb: Callback<()>) -> Result<()> { self.async_write_ext(ctx, batch, cb, None, None) } @@ -243,16 +259,11 @@ impl Engine for RocksEngine { fail_point!("rockskv_async_snapshot", |_| Err(box_err!( "snapshot failed" ))); - let not_leader = { - let mut header = kvproto::errorpb::Error::default(); - header.mut_not_leader().set_region_id(100); - header - }; fail_point!("rockskv_async_snapshot_not_leader", |_| { - Err(Error::from(ErrorInner::Request(not_leader.clone()))) + Err(self.not_leader_error()) }); if self.not_leader.load(Ordering::SeqCst) { - return Err(Error::from(ErrorInner::Request(not_leader))); + return Err(self.not_leader_error()); } box_try!(self.sched.schedule(Task::Snapshot(cb))); Ok(()) diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index de72a642837..a314315985c 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -8,10 +8,11 @@ use std::{ mem, num::NonZeroU64, result, - sync::Arc, + sync::{Arc, RwLock}, time::Duration, }; +use collections::HashSet; use concurrency_manager::ConcurrencyManager; 
use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; use kvproto::{ @@ -158,6 +159,7 @@ where router: S, engine: E, txn_extra_scheduler: Option>, + region_leaders: Arc>>, } impl RaftKv @@ -166,11 +168,12 @@ where S: RaftStoreRouter + LocalReadRouter + 'static, { /// Create a RaftKv using specified configuration. - pub fn new(router: S, engine: E) -> RaftKv { + pub fn new(router: S, engine: E, region_leaders: Arc>>) -> RaftKv { RaftKv { router, engine, txn_extra_scheduler: None, + region_leaders, } } @@ -353,6 +356,14 @@ where write_modifies(&self.engine, modifies) } + fn precheck_write_with_ctx(&self, ctx: &Context) -> kv::Result<()> { + let region_id = ctx.get_region_id(); + match self.region_leaders.read().unwrap().get(®ion_id) { + Some(_) => Ok(()), + None => Err(RaftServerError::NotLeader(region_id, None).into()), + } + } + fn async_write( &self, ctx: &Context, diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 07f1143bcb0..e58f7862b37 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -159,6 +159,8 @@ make_auto_flush_static_metric! 
{ new, snapshot, async_snapshot_err, + precheck_write_ok, + precheck_write_err, snapshot_ok, snapshot_err, read_finish, diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 66194cd08fa..2d9d3610432 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -48,7 +48,9 @@ use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData}; -use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; +use tikv_util::{ + deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, +}; use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; use txn_types::TimeStamp; @@ -261,6 +263,17 @@ impl SchedulerInner { tctx } + /// Try to own the corresponding task context and take the callback. + /// + /// If the task is been processing, it should be owned. + /// If it has been finished, then it is not in the slot. + /// In both cases, cb should be None. Otherwise, cb should be some. + fn try_own_and_take_cb(&self, cid: u64) -> Option { + self.get_task_slot(cid) + .get_mut(&cid) + .and_then(|tctx| if tctx.try_own() { tctx.cb.take() } else { None }) + } + fn take_task_cb_and_pr(&self, cid: u64) -> (Option, Option) { self.get_task_slot(cid) .get_mut(&cid) @@ -431,7 +444,7 @@ impl Scheduler { self.inner .new_task_context(Task::new(cid, tracker, cmd), callback) }); - let deadline = tctx.task.as_ref().unwrap().cmd.deadline(); + if self.inner.latches.acquire(&mut tctx.lock, cid) { fail_point!("txn_scheduler_acquire_success"); tctx.on_schedule(); @@ -440,30 +453,66 @@ impl Scheduler { self.execute(task); return; } - // Check deadline in background. 
+ let task = tctx.task.as_ref().unwrap(); + let deadline = task.cmd.deadline(); + let cmd_ctx = task.cmd.ctx().clone(); + self.fail_fast_or_check_deadline(cid, tag, cmd_ctx, deadline); + fail_point!("txn_scheduler_acquire_fail"); + } + + fn fail_fast_or_check_deadline( + &self, + cid: u64, + tag: CommandKind, + cmd_ctx: Context, + deadline: Deadline, + ) { let sched = self.clone(); self.inner .high_priority_pool .pool .spawn(async move { - GLOBAL_TIMER_HANDLE - .delay(deadline.to_std_instant()) - .compat() - .await - .unwrap(); - let cb = sched - .inner - .get_task_slot(cid) - .get_mut(&cid) - .and_then(|tctx| if tctx.try_own() { tctx.cb.take() } else { None }); - if let Some(cb) = cb { - cb.execute(ProcessResult::Failed { - err: StorageErrorInner::DeadlineExceeded.into(), - }) + match unsafe { + with_tls_engine(|engine: &E| engine.precheck_write_with_ctx(&cmd_ctx)) + } { + // Precheck failed, try to return err early. + Err(e) => { + let cb = sched.inner.try_own_and_take_cb(cid); + // The task is not processing or finished currently. It's safe + // to response early here. In the future, the task will be waked up + // and it will finished with DeadlineExceeded error. + // As the cb is taken here, it will not be executed anymore. + if let Some(cb) = cb { + let pr = ProcessResult::Failed { + err: StorageError::from(e), + }; + Self::early_response( + cid, + cb, + pr, + tag, + CommandStageKind::precheck_write_err, + ); + } + } + Ok(()) => { + SCHED_STAGE_COUNTER_VEC.get(tag).precheck_write_ok.inc(); + // Check deadline in background. + GLOBAL_TIMER_HANDLE + .delay(deadline.to_std_instant()) + .compat() + .await + .unwrap(); + let cb = sched.inner.try_own_and_take_cb(cid); + if let Some(cb) = cb { + cb.execute(ProcessResult::Failed { + err: StorageErrorInner::DeadlineExceeded.into(), + }) + } + } } }) .unwrap(); - fail_point!("txn_scheduler_acquire_fail"); } /// Tries to acquire all the necessary latches. 
If all the necessary latches @@ -1201,6 +1250,7 @@ mod tests { use super::*; use crate::storage::{ + kv::{Error as KvError, ErrorInner as KvErrorInner}, lock_manager::DummyLockManager, mvcc::{self, Mutation}, test_util::latest_feature_gate, @@ -1210,7 +1260,7 @@ mod tests { flow_controller::{EngineFlowController, FlowController}, latch::*, }, - TestEngineBuilder, TxnStatus, + RocksEngine, TestEngineBuilder, TxnStatus, }; #[derive(Clone)] @@ -1221,6 +1271,36 @@ mod tests { fn report_write_stats(&self, _write_stats: WriteStats) {} } + // TODO(cosven): use this in the following test cases to reduce duplicate code. + fn new_test_scheduler() -> (Scheduler, RocksEngine) { + let engine = TestEngineBuilder::new().build().unwrap(); + let config = Config { + scheduler_concurrency: 1024, + scheduler_worker_pool_size: 1, + scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), + enable_async_apply_prewrite: false, + ..Default::default() + }; + ( + Scheduler::new( + engine.clone(), + DummyLockManager, + ConcurrencyManager::new(1.into()), + &config, + DynamicConfigs { + pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), + in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + }, + Arc::new(FlowController::Singleton(EngineFlowController::empty())), + DummyReporter, + ResourceTagFactory::new_for_test(), + Arc::new(QuotaLimiter::default()), + latest_feature_gate(), + ), + engine, + ) + } + #[test] fn test_command_latches() { let mut temp_map = HashMap::default(); @@ -1395,6 +1475,52 @@ mod tests { block_on(f).unwrap().unwrap(); } + /// When all latches are acquired, the command should be executed directly. + /// When any latch is not acquired, the command should be prechecked. + #[test] + fn test_schedule_command_with_fail_fast_mode() { + let (scheduler, engine) = new_test_scheduler(); + + // req can acquire all latches, so it should be executed directly. 
+ let mut req = BatchRollbackRequest::default(); + req.mut_context().max_execution_duration_ms = 10000; + req.set_keys(vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec()].into()); + let cmd: TypedCommand<()> = req.into(); + let (cb, f) = paired_future_callback(); + scheduler.run_cmd(cmd.cmd, StorageCallback::Boolean(cb)); + // It must be executed (and succeed). + block_on(f).unwrap().unwrap(); + + // Acquire the latch, so that next command(req2) can't require all latches. + let mut lock = Lock::new(&[Key::from_raw(b"d")]); + let cid = scheduler.inner.gen_id(); + assert!(scheduler.inner.latches.acquire(&mut lock, cid)); + + engine.trigger_not_leader(); + + // req2 can't acquire all latches, req2 will be prechecked. + let mut req2 = BatchRollbackRequest::default(); + req2.mut_context().max_execution_duration_ms = 10000; + req2.set_keys(vec![b"a".to_vec(), b"b".to_vec(), b"d".to_vec()].into()); + let cmd2: TypedCommand<()> = req2.into(); + let (cb2, f2) = paired_future_callback(); + scheduler.run_cmd(cmd2.cmd, StorageCallback::Boolean(cb2)); + + // Precheck should return NotLeader error. + assert!(matches!( + block_on(f2).unwrap(), + Err(StorageError(box StorageErrorInner::Kv(KvError( + box KvErrorInner::Request(ref e), + )))) if e.has_not_leader(), + )); + // The task context should be owned, and it's cb should be taken. + let cid2 = cid + 1; // Hack: get the cid of req2. + let mut task_slot = scheduler.inner.get_task_slot(cid2); + let tctx = task_slot.get_mut(&cid2).unwrap(); + assert!(!tctx.try_own()); + assert!(tctx.cb.is_none()); + } + #[test] fn test_pool_available_deadline() { let engine = TestEngineBuilder::new().build().unwrap(); diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index c97bdd72fac..1143600920f 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -1,7 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; +use std::sync::{Arc, RwLock}; +use collections::HashSet; use crossbeam::channel::TrySendError; use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_traits::{KvEngine, ALL_CFS, CF_DEFAULT}; @@ -179,7 +180,11 @@ fn bench_async_snapshot(b: &mut test::Bencher) { region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(5); let (_tmp, db) = new_engine(); - let kv = RaftKv::new(SyncBenchRouter::new(region.clone(), db.clone()), db); + let kv = RaftKv::new( + SyncBenchRouter::new(region.clone(), db.clone()), + db, + Arc::new(RwLock::new(HashSet::default())), + ); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); @@ -208,7 +213,11 @@ fn bench_async_write(b: &mut test::Bencher) { region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(5); let (_tmp, db) = new_engine(); - let kv = RaftKv::new(SyncBenchRouter::new(region.clone(), db.clone()), db); + let kv = RaftKv::new( + SyncBenchRouter::new(region.clone(), db.clone()), + db, + Arc::new(RwLock::new(HashSet::default())), + ); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index f99d9348616..420f9bd7765 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -330,6 +330,55 @@ fn test_invalid_read_index_when_no_leader() { ); } +/// RaftKV precheck_write_with_ctx checks if the current role is leader. +/// When it is not, it should return NotLeader error during prechecking. +#[test] +fn test_raftkv_precheck_write_with_ctx() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + // make sure leader has been elected. 
+ assert_eq!(cluster.must_get(b"k1"), None); + + let region = cluster.get_region(b""); + let leader = cluster.leader_of_region(region.get_id()).unwrap(); + let follower = region + .get_peers() + .iter() + .find(|p| p.get_id() != leader.get_id()) + .unwrap(); + + let leader_storage = cluster.sim.rl().storages[&leader.get_id()].clone(); + let follower_storage = cluster.sim.rl().storages[&follower.get_id()].clone(); + + // Assume this is a write request. + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(region.get_peers()[0].clone()); + + // The (write) request can be sent to the leader. + leader_storage.precheck_write_with_ctx(&ctx).unwrap(); + // The (write) request should not be send to a follower. + follower_storage.precheck_write_with_ctx(&ctx).unwrap_err(); + + // Leader has network partition and it must be not leader any more. + let filter = Box::new(RegionPacketFilter::new( + region.get_id(), + leader.get_store_id(), + )); + cluster + .sim + .wl() + .add_recv_filter(leader.get_store_id(), filter.clone()); + cluster + .sim + .wl() + .add_send_filter(leader.get_store_id(), filter); + sleep_until_election_triggered(&cluster.cfg); + leader_storage.precheck_write_with_ctx(&ctx).unwrap_err(); +} + fn must_put(ctx: &Context, engine: &E, key: &[u8], value: &[u8]) { engine.put(ctx, Key::from_raw(key), value.to_vec()).unwrap(); } From 9d658db6f861653125e53c0709795c49a316e301 Mon Sep 17 00:00:00 2001 From: Potato Date: Mon, 22 Aug 2022 16:40:20 +0800 Subject: [PATCH 163/676] storage: record and return pessimistic_lock_wait time (#13309) ref pingcap/kvproto#965, ref tikv/tikv#12362 This commit record the pessimistic_lock_wait time for pessimistic transactions in the waitManager. 
Signed-off-by: OneSizeFitQuorum --- Cargo.lock | 2 +- components/pd_client/src/util.rs | 3 +- components/tracker/src/lib.rs | 2 ++ components/tracker/src/slab.rs | 6 ++++ src/server/lock_manager/mod.rs | 8 +++-- src/server/lock_manager/waiter_manager.rs | 22 +++++++++++--- src/server/service/kv.rs | 16 +++++----- src/storage/lock_manager.rs | 3 ++ src/storage/txn/scheduler.rs | 2 ++ tests/integrations/server/kv_service.rs | 37 +++++++++++++++++++++++ 10 files changed, 84 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3083e56ef23..b067e3337e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2627,7 +2627,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#d88fa382391ec305e879be7635e39beae6a19890" +source = "git+https://github.com/pingcap/kvproto.git#affce57868b9f8befac389559d372369b2cb616f" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index fec63383891..2aa74176627 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -848,11 +848,12 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::IncompatibleVersion => Err(Error::Incompatible), ErrorType::StoreTombstone => Err(Error::StoreTombstone(err.get_message().to_owned())), ErrorType::RegionNotFound => Err(Error::RegionNotFound(vec![])), - ErrorType::Unknown => Err(box_err!(err.get_message())), ErrorType::GlobalConfigNotFound => { Err(Error::GlobalConfigNotFound(err.get_message().to_owned())) } ErrorType::Ok => Ok(()), + ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), + ErrorType::Unknown => Err(box_err!(err.get_message())), } } diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index e0a9b9de24f..be099beadde 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -40,6 +40,7 @@ impl Tracker { } pub 
fn write_write_detail(&self, detail: &mut pb::WriteDetail) { + detail.set_pessimistic_lock_wait_nanos(self.metrics.pessimistic_lock_wait_nanos); detail.set_store_batch_wait_nanos(self.metrics.wf_batch_wait_nanos); detail.set_propose_send_wait_nanos( self.metrics @@ -123,6 +124,7 @@ pub struct RequestMetrics { pub block_read_nanos: u64, pub internal_key_skipped_count: u64, pub deleted_key_skipped_count: u64, + pub pessimistic_lock_wait_nanos: u64, // temp instant used in raftstore metrics, first be the instant when creating the write // callback, then reset when it is ready to apply pub write_instant: Option, diff --git a/components/tracker/src/slab.rs b/components/tracker/src/slab.rs index 9b4be50796b..c7b9efa9944 100644 --- a/components/tracker/src/slab.rs +++ b/components/tracker/src/slab.rs @@ -182,6 +182,12 @@ impl fmt::Debug for TrackerToken { } } +impl Default for TrackerToken { + fn default() -> Self { + INVALID_TRACKER_TOKEN + } +} + #[cfg(test)] mod tests { use std::{sync::Arc, thread}; diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 91e25a2edeb..e437cea2bf1 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -320,6 +320,7 @@ mod tests { use raftstore::coprocessor::RegionChangeEvent; use security::SecurityConfig; use tikv_util::config::ReadableDuration; + use tracker::{TrackerToken, INVALID_TRACKER_TOKEN}; use self::{deadlock::tests::*, metrics::*, waiter_manager::tests::*}; use super::*; @@ -361,10 +362,11 @@ mod tests { lock_mgr } - fn diag_ctx(key: &[u8], resource_group_tag: &[u8]) -> DiagnosticContext { + fn diag_ctx(key: &[u8], resource_group_tag: &[u8], tracker: TrackerToken) -> DiagnosticContext { DiagnosticContext { key: key.to_owned(), resource_group_tag: resource_group_tag.to_owned(), + tracker, } } @@ -428,7 +430,7 @@ mod tests { waiter1.lock, false, Some(WaitTimeout::Default), - diag_ctx(b"k1", b"tag1"), + diag_ctx(b"k1", b"tag1", INVALID_TRACKER_TOKEN), ); 
assert!(lock_mgr.has_waiter()); let (waiter2, lock_info2, f2) = new_test_waiter(20.into(), 10.into(), 10); @@ -439,7 +441,7 @@ mod tests { waiter2.lock, false, Some(WaitTimeout::Default), - diag_ctx(b"k2", b"tag2"), + diag_ctx(b"k2", b"tag2", INVALID_TRACKER_TOKEN), ); assert!(lock_mgr.has_waiter()); assert_elapsed( diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 8e5225bef76..b0e05091267 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -19,13 +19,14 @@ use futures::{ task::{Context, Poll}, }; use kvproto::deadlock::WaitForEntry; -use prometheus::HistogramTimer; use tikv_util::{ config::ReadableDuration, + time::{duration_to_sec, InstantExt}, timer::GLOBAL_TIMER_HANDLE, worker::{FutureRunnable, FutureScheduler, Stopped}, }; use tokio::task::spawn_local; +use tracker::GLOBAL_TRACKERS; use super::{config::Config, deadlock::Scheduler as DetectorScheduler, metrics::*}; use crate::storage::{ @@ -110,6 +111,7 @@ pub enum Task { lock: Lock, timeout: WaitTimeout, diag_ctx: DiagnosticContext, + start_waiting_time: Instant, }, WakeUp { // lock info @@ -181,7 +183,7 @@ pub(crate) struct Waiter { pub(crate) lock: Lock, pub diag_ctx: DiagnosticContext, delay: Delay, - _lifetime_timer: HistogramTimer, + start_waiting_time: Instant, } impl Waiter { @@ -192,6 +194,7 @@ impl Waiter { lock: Lock, deadline: Instant, diag_ctx: DiagnosticContext, + start_waiting_time: Instant, ) -> Self { Self { start_ts, @@ -200,7 +203,7 @@ impl Waiter { lock, delay: Delay::new(deadline), diag_ctx, - _lifetime_timer: WAITER_LIFETIME_HISTOGRAM.start_coarse_timer(), + start_waiting_time, } } @@ -224,6 +227,11 @@ impl Waiter { /// `Notify` consumes the `Waiter` to notify the corresponding transaction /// going on. 
fn notify(self) { + let elapsed = self.start_waiting_time.saturating_elapsed(); + GLOBAL_TRACKERS.with_tracker(self.diag_ctx.tracker, |tracker| { + tracker.metrics.pessimistic_lock_wait_nanos = elapsed.as_nanos() as u64; + }); + WAITER_LIFETIME_HISTOGRAM.observe(duration_to_sec(elapsed)); // Cancel the delay timer to prevent removing the same `Waiter` earlier. self.delay.cancel(); self.cb.execute(self.pr); @@ -424,6 +432,7 @@ impl Scheduler { lock, timeout, diag_ctx, + start_waiting_time: Instant::now(), }); } @@ -597,6 +606,7 @@ impl FutureRunnable for WaiterManager { lock, timeout, diag_ctx, + start_waiting_time, } => { let waiter = Waiter::new( start_ts, @@ -605,6 +615,7 @@ impl FutureRunnable for WaiterManager { lock, self.normalize_deadline(timeout), diag_ctx, + start_waiting_time, ); self.handle_wait_for(waiter); TASK_COUNTER_METRICS.wait_for.inc(); @@ -662,7 +673,7 @@ pub mod tests { lock: Lock { ts: lock_ts, hash }, diag_ctx: DiagnosticContext::default(), delay: Delay::new(Instant::now()), - _lifetime_timer: WAITER_LIFETIME_HISTOGRAM.start_coarse_timer(), + start_waiting_time: Instant::now(), } } @@ -764,6 +775,7 @@ pub mod tests { lock, Instant::now() + Duration::from_millis(3000), DiagnosticContext::default(), + Instant::now(), ); (waiter, info, f) } @@ -977,7 +989,7 @@ pub mod tests { .remove_waiter( Lock { ts: TimeStamp::zero(), - hash: 0 + hash: 0, }, TimeStamp::zero(), ) diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 8f0f9a23cae..fa743911b40 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -2080,20 +2080,22 @@ txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp } resp.set_errors(extract_key_errors(v.map(|v| v.locks)).into()); }}); -txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) { +txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) {{ 
match v { Ok(Ok(res)) => { let (values, not_founds) = res.into_values_and_not_founds(); resp.set_values(values.into()); resp.set_not_founds(not_founds); - GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); - tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); - }); }, - Err(e) | Ok(Err(e)) => resp.set_errors(vec![extract_key_error(&e)].into()), + Err(e) | Ok(Err(e)) => { + resp.set_errors(vec![extract_key_error(&e)].into()) + }, } -}); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); +}}); txn_command_future!(future_pessimistic_rollback, PessimisticRollbackRequest, PessimisticRollbackResponse, (v, resp) { resp.set_errors(extract_key_errors(v).into()) }); diff --git a/src/storage/lock_manager.rs b/src/storage/lock_manager.rs index def756c921e..79a9d0572f3 100644 --- a/src/storage/lock_manager.rs +++ b/src/storage/lock_manager.rs @@ -2,6 +2,7 @@ use std::time::Duration; +use tracker::TrackerToken; use txn_types::TimeStamp; use crate::{ @@ -24,6 +25,8 @@ pub struct DiagnosticContext { /// same statement) Currently it is the encoded SQL digest if the client /// is TiDB pub resource_group_tag: Vec, + /// The tracker is used to track and collect the lock wait details. + pub tracker: TrackerToken, } /// Time to wait for lock released when encountering locks. 
diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 2d9d3610432..382979b7815 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -827,6 +827,7 @@ impl Scheduler { let cid = task.cid; let priority = task.cmd.priority(); let ts = task.cmd.ts(); + let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); let mut sample = quota_limiter.new_sample(true); @@ -913,6 +914,7 @@ impl Scheduler { let diag_ctx = DiagnosticContext { key, resource_group_tag: ctx.get_resource_group_tag().into(), + tracker, }; scheduler.on_wait_for_lock(cid, ts, pr, lock, is_first_lock, wait_timeout, diag_ctx); return; diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 17b1e49f2e0..8095ebdf2ca 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -2148,3 +2148,40 @@ fn test_rpc_wall_time() { ); } } + +#[test] +fn test_pessimistic_lock_execution_tracking() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (k, v) = (b"k1".to_vec(), b"k2".to_vec()); + + // Add a prewrite lock. 
+ let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 10); + + let block_duration = Duration::from_millis(300); + let client_clone = client.clone(); + let ctx_clone = ctx.clone(); + let k_clone = k.clone(); + let handle = thread::spawn(move || { + thread::sleep(block_duration); + must_kv_commit(&client_clone, ctx_clone, vec![k_clone], 10, 30, 30); + }); + + let resp = kv_pessimistic_lock(&client, ctx, vec![k], 20, 20, false); + assert!( + resp.get_exec_details_v2() + .get_write_detail() + .get_pessimistic_lock_wait_nanos() + > 0, + "resp lock wait time={:?}, block_duration={:?}", + resp.get_exec_details_v2() + .get_write_detail() + .get_pessimistic_lock_wait_nanos(), + block_duration + ); + + handle.join().unwrap(); +} From f6159555995c156dcbfc741ccd93a59948d9d5de Mon Sep 17 00:00:00 2001 From: haojinming Date: Mon, 22 Aug 2022 22:06:21 +0800 Subject: [PATCH 164/676] rawkv: Reuse scheduler worker pool for raw modify command (#13286) ref tikv/tikv#13284 Signed-off-by: haojinming Co-authored-by: Ti Chi Robot --- src/storage/mod.rs | 284 ++++++++++++++++--------- tests/failpoints/cases/test_storage.rs | 33 +++ 2 files changed, 218 insertions(+), 99 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 6c4374f7c76..d974c731db0 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -87,6 +87,7 @@ use rand::prelude::*; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::SnapshotExt; use tikv_util::{ + deadline::Deadline, quota_limiter::QuotaLimiter, time::{duration_to_ms, Instant, ThreadReadId}, }; @@ -1446,6 +1447,29 @@ impl Storage { Ok(()) } + // Schedule raw modify commands, which reuse the scheduler worker pool. + // TODO: separate the txn and raw commands if needed in the future. 
+ fn sched_raw_command(&self, tag: CommandKind, future: T) -> Result<()> + where + T: Future + Send + 'static, + { + SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); + self.sched + .get_sched_pool(CommandPri::Normal) + .pool + .spawn(future) + .map_err(|_| Error::from(ErrorInner::SchedTooBusy)) + } + + fn get_deadline(ctx: &Context) -> Deadline { + let execution_duration_limit = if ctx.max_execution_duration_ms == 0 { + crate::storage::txn::scheduler::DEFAULT_EXECUTION_DURATION_LIMIT + } else { + ::std::time::Duration::from_millis(ctx.max_execution_duration_ms) + }; + Deadline::from_now(execution_duration_limit) + } + /// Delete all keys in the range [`start_key`, `end_key`). /// /// All keys in the range will be deleted permanently regardless of their @@ -1817,44 +1841,60 @@ impl Storage { if !F::IS_TTL_ENABLED && ttl != 0 { return Err(Error::from(ErrorInner::TtlNotEnabled)); } + let deadline = Self::get_deadline(&ctx); + let cf = Self::rawkv_cf(&cf, self.api_version)?; + let engine = self.engine.clone(); + self.sched_raw_command(CMD, async move { + if let Err(e) = deadline.check() { + return callback(Err(Error::from(e))); + } + let command_duration = tikv_util::time::Instant::now(); + let raw_value = RawValue { + user_value: value, + expire_ts: ttl_to_expire_ts(ttl), + is_delete: false, + }; + let m = Modify::Put( + cf, + F::encode_raw_key_owned(key, None), + F::encode_raw_value_owned(raw_value), + ); - let raw_value = RawValue { - user_value: value, - expire_ts: ttl_to_expire_ts(ttl), - is_delete: false, - }; - let m = Modify::Put( - Self::rawkv_cf(&cf, self.api_version)?, - F::encode_raw_key_owned(key, None), - F::encode_raw_value_owned(raw_value), - ); - - let mut batch = WriteData::from_modifies(vec![m]); - batch.set_allowed_on_disk_almost_full(); - - self.engine.async_write( - &ctx, - batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; - KV_COMMAND_COUNTER_VEC_STATIC.raw_put.inc(); - Ok(()) + let mut batch = WriteData::from_modifies(vec![m]); + 
batch.set_allowed_on_disk_almost_full(); + let (cb, f) = tikv_util::future::paired_future_callback(); + let async_ret = + engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); + let v: Result<()> = match async_ret { + Err(e) => Err(Error::from(e)), + Ok(_) => f.await.unwrap(), + }; + callback(v); + KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); + SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); + SCHED_HISTOGRAM_VEC_STATIC + .get(CMD) + .observe(command_duration.saturating_elapsed().as_secs_f64()); + }) } - fn raw_batch_put_requests_to_modifies( - cf: CfName, - pairs: Vec, - ttls: Vec, - ) -> Result> { + fn check_ttl_valid(key_cnt: usize, ttls: &Vec) -> Result<()> { if !F::IS_TTL_ENABLED { if ttls.iter().any(|&x| x != 0) { return Err(Error::from(ErrorInner::TtlNotEnabled)); } - } else if ttls.len() != pairs.len() { + } else if ttls.len() != key_cnt { return Err(Error::from(ErrorInner::TtlLenNotEqualsToPairs)); } + Ok(()) + } - let modifies = pairs + fn raw_batch_put_requests_to_modifies( + cf: CfName, + pairs: Vec, + ttls: Vec, + ) -> Vec { + pairs .into_iter() .zip(ttls) .map(|((k, v), ttl)| { @@ -1869,8 +1909,7 @@ impl Storage { F::encode_raw_value_owned(raw_value), ) }) - .collect(); - Ok(modifies) + .collect() } /// Write some keys to the storage in a batch. 
@@ -1882,10 +1921,11 @@ impl Storage { ttls: Vec, callback: Callback<()>, ) -> Result<()> { + const CMD: CommandKind = CommandKind::raw_batch_put; Self::check_api_version( self.api_version, ctx.api_version, - CommandKind::raw_batch_put, + CMD, pairs.iter().map(|(ref k, _)| k), )?; @@ -1896,18 +1936,32 @@ impl Storage { self.max_key_size, callback ); + Self::check_ttl_valid(pairs.len(), &ttls)?; - let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls)?; - let mut batch = WriteData::from_modifies(modifies); - batch.set_allowed_on_disk_almost_full(); - - self.engine.async_write( - &ctx, - batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; - KV_COMMAND_COUNTER_VEC_STATIC.raw_batch_put.inc(); - Ok(()) + let engine = self.engine.clone(); + let deadline = Self::get_deadline(&ctx); + self.sched_raw_command(CMD, async move { + if let Err(e) = deadline.check() { + return callback(Err(Error::from(e))); + } + let command_duration = tikv_util::time::Instant::now(); + let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls); + let mut batch = WriteData::from_modifies(modifies); + batch.set_allowed_on_disk_almost_full(); + let (cb, f) = tikv_util::future::paired_future_callback(); + let async_ret = + engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); + let v: Result<()> = match async_ret { + Err(e) => Err(Error::from(e)), + Ok(_) => f.await.unwrap(), + }; + callback(v); + KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); + SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); + SCHED_HISTOGRAM_VEC_STATIC + .get(CMD) + .observe(command_duration.saturating_elapsed().as_secs_f64()); + }) } fn raw_delete_request_to_modify(cf: CfName, key: Vec) -> Modify { @@ -1928,26 +1982,35 @@ impl Storage { key: Vec, callback: Callback<()>, ) -> Result<()> { - Self::check_api_version( - self.api_version, - ctx.api_version, - CommandKind::raw_delete, - [&key], - )?; + const CMD: CommandKind = CommandKind::raw_delete; + 
Self::check_api_version(self.api_version, ctx.api_version, CMD, [&key])?; check_key_size!(Some(&key).into_iter(), self.max_key_size, callback); - - let m = Self::raw_delete_request_to_modify(Self::rawkv_cf(&cf, self.api_version)?, key); - let mut batch = WriteData::from_modifies(vec![m]); - batch.set_allowed_on_disk_almost_full(); - - self.engine.async_write( - &ctx, - batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; - KV_COMMAND_COUNTER_VEC_STATIC.raw_delete.inc(); - Ok(()) + let cf = Self::rawkv_cf(&cf, self.api_version)?; + let engine = self.engine.clone(); + let deadline = Self::get_deadline(&ctx); + self.sched_raw_command(CMD, async move { + if let Err(e) = deadline.check() { + return callback(Err(Error::from(e))); + } + let command_duration = tikv_util::time::Instant::now(); + let m = Self::raw_delete_request_to_modify(cf, key); + let mut batch = WriteData::from_modifies(vec![m]); + batch.set_allowed_on_disk_almost_full(); + let (cb, f) = tikv_util::future::paired_future_callback(); + let async_ret = + engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); + let v: Result<()> = match async_ret { + Err(e) => Err(Error::from(e)), + Ok(_) => f.await.unwrap(), + }; + callback(v); + KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); + SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); + SCHED_HISTOGRAM_VEC_STATIC + .get(CMD) + .observe(command_duration.saturating_elapsed().as_secs_f64()); + }) } /// Delete all raw keys in [`start_key`, `end_key`). 
@@ -1962,31 +2025,45 @@ impl Storage { end_key: Vec, callback: Callback<()>, ) -> Result<()> { + const CMD: CommandKind = CommandKind::raw_delete_range; check_key_size!([&start_key, &end_key], self.max_key_size, callback); Self::check_api_version_ranges( self.api_version, ctx.api_version, - CommandKind::raw_delete_range, + CMD, [(Some(&start_key), Some(&end_key))], )?; let cf = Self::rawkv_cf(&cf, self.api_version)?; - let start_key = F::encode_raw_key_owned(start_key, None); - let end_key = F::encode_raw_key_owned(end_key, None); - - let mut batch = - WriteData::from_modifies(vec![Modify::DeleteRange(cf, start_key, end_key, false)]); - batch.set_allowed_on_disk_almost_full(); - - // TODO: special notification channel for API V2. - - self.engine.async_write( - &ctx, - batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; - KV_COMMAND_COUNTER_VEC_STATIC.raw_delete_range.inc(); - Ok(()) + let engine = self.engine.clone(); + let deadline = Self::get_deadline(&ctx); + self.sched_raw_command(CMD, async move { + if let Err(e) = deadline.check() { + return callback(Err(Error::from(e))); + } + let command_duration = tikv_util::time::Instant::now(); + let start_key = F::encode_raw_key_owned(start_key, None); + let end_key = F::encode_raw_key_owned(end_key, None); + + let mut batch = + WriteData::from_modifies(vec![Modify::DeleteRange(cf, start_key, end_key, false)]); + batch.set_allowed_on_disk_almost_full(); + + // TODO: special notification channel for API V2. 
+ let (cb, f) = tikv_util::future::paired_future_callback(); + let async_ret = + engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); + let v: Result<()> = match async_ret { + Err(e) => Err(Error::from(e)), + Ok(_) => f.await.unwrap(), + }; + callback(v); + KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); + SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); + SCHED_HISTOGRAM_VEC_STATIC + .get(CMD) + .observe(command_duration.saturating_elapsed().as_secs_f64()); + }) } /// Delete some raw keys in a batch. @@ -1999,30 +2076,38 @@ impl Storage { keys: Vec>, callback: Callback<()>, ) -> Result<()> { - Self::check_api_version( - self.api_version, - ctx.api_version, - CommandKind::raw_batch_delete, - &keys, - )?; + const CMD: CommandKind = CommandKind::raw_batch_delete; + Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; let cf = Self::rawkv_cf(&cf, self.api_version)?; check_key_size!(keys.iter(), self.max_key_size, callback); - - let modifies = keys - .into_iter() - .map(|k| Self::raw_delete_request_to_modify(cf, k)) - .collect(); - let mut batch = WriteData::from_modifies(modifies); - batch.set_allowed_on_disk_almost_full(); - - self.engine.async_write( - &ctx, - batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; - KV_COMMAND_COUNTER_VEC_STATIC.raw_batch_delete.inc(); - Ok(()) + let engine = self.engine.clone(); + let deadline = Self::get_deadline(&ctx); + self.sched_raw_command(CMD, async move { + if let Err(e) = deadline.check() { + return callback(Err(Error::from(e))); + } + let command_duration = tikv_util::time::Instant::now(); + let modifies = keys + .into_iter() + .map(|k| Self::raw_delete_request_to_modify(cf, k)) + .collect(); + let mut batch = WriteData::from_modifies(modifies); + batch.set_allowed_on_disk_almost_full(); + let (cb, f) = tikv_util::future::paired_future_callback(); + let async_ret = + engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); + let v: 
Result<()> = match async_ret { + Err(e) => Err(Error::from(e)), + Ok(_) => f.await.unwrap(), + }; + callback(v); + KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); + SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); + SCHED_HISTOGRAM_VEC_STATIC + .get(CMD) + .observe(command_duration.saturating_elapsed().as_secs_f64()); + }) } /// Scan raw keys in a range. @@ -2444,7 +2529,8 @@ impl Storage { )?; let cf = Self::rawkv_cf(&cf, self.api_version)?; - let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls)?; + Self::check_ttl_valid(pairs.len(), &ttls)?; + let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls); let cmd = RawAtomicStore::new(cf, modifies, ctx); self.sched_txn_command(cmd, callback) } diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 7502fe6be4e..40ba7297b7c 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -1425,3 +1425,36 @@ fn test_mvcc_concurrent_commit_and_rollback_at_shutdown() { ); assert_eq!(get_resp.value, v); } + +#[test] +fn test_raw_put_deadline() { + let deadline_fp = "deadline_check_fail"; + let mut cluster = new_server_cluster(0, 1); + cluster.run(); + let region = cluster.get_region(b""); + let leader = region.get_peers()[0].clone(); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(leader); + + let mut put_req = RawPutRequest::default(); + put_req.set_context(ctx); + put_req.key = b"k3".to_vec(); + put_req.value = b"v3".to_vec(); + fail::cfg(deadline_fp, "return()").unwrap(); + let put_resp = client.raw_put(&put_req).unwrap(); + assert!(put_resp.has_region_error(), "{:?}", put_resp); + must_get_none(&cluster.get_engine(1), 
b"k3"); + + fail::remove(deadline_fp); + let put_resp = client.raw_put(&put_req).unwrap(); + assert!(!put_resp.has_region_error(), "{:?}", put_resp); + must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); +} From 252b44288c7eaa1943b5b871018d80d63a7af88f Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 22 Aug 2022 18:54:20 -0700 Subject: [PATCH 165/676] make read quota limiter work for coprocessor as well (#13257) close tikv/tikv#13256 The coprocessor's read bytes are not calculated by foreground quota limiter. Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: 5kbpers --- components/test_coprocessor/src/fixture.rs | 20 ++++++++--- components/tidb_query_executors/src/runner.rs | 3 ++ components/tikv_util/src/quota_limiter.rs | 6 ++++ tests/failpoints/cases/test_coprocessor.rs | 4 +-- .../integrations/coprocessor/test_analyze.rs | 16 ++++----- .../integrations/coprocessor/test_checksum.rs | 2 +- tests/integrations/coprocessor/test_select.rs | 34 +++++++++++-------- 7 files changed, 55 insertions(+), 30 deletions(-) diff --git a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index 55a7f72a07f..23fc877a996 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -67,7 +67,7 @@ pub fn init_data_with_engine_and_commit( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], commit: bool, -) -> (Store, Endpoint) { +) -> (Store, Endpoint, Arc) { init_data_with_details(ctx, engine, tbl, vals, commit, &Config::default()) } @@ -78,7 +78,7 @@ pub fn init_data_with_details( vals: &[(i64, Option<&str>, i64)], commit: bool, cfg: &Config, -) -> (Store, Endpoint) { +) -> (Store, Endpoint, Arc) { let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) .build() .unwrap(); @@ -103,21 +103,22 @@ pub fn init_data_with_details( store.get_engine(), )); let cm = ConcurrencyManager::new(1.into()); + let limiter = Arc::new(QuotaLimiter::default()); let copr 
= Endpoint::new( cfg, pool.handle(), cm, ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), + limiter.clone(), ); - (store, copr) + (store, copr, limiter) } pub fn init_data_with_commit( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], commit: bool, -) -> (Store, Endpoint) { +) -> (Store, Endpoint, Arc) { let engine = TestEngineBuilder::new().build().unwrap(); init_data_with_engine_and_commit(Context::default(), engine, tbl, vals, commit) } @@ -128,5 +129,14 @@ pub fn init_with_data( tbl: &ProductTable, vals: &[(i64, Option<&str>, i64)], ) -> (Store, Endpoint) { + let (store, endpoint, _) = init_data_with_commit(tbl, vals, true); + (store, endpoint) +} + +// Same as init_with_data except returned values include Arc +pub fn init_with_data_ext( + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], +) -> (Store, Endpoint, Arc) { init_data_with_commit(tbl, vals, true) } diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 073fade4b29..9f32aaa180e 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -471,6 +471,9 @@ impl BatchExecutorsRunner { &mut ctx, )? }; + if chunk.has_rows_data() { + sample.add_read_bytes(chunk.get_rows_data().len()); + } let quota_delay = self.quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { diff --git a/components/tikv_util/src/quota_limiter.rs b/components/tikv_util/src/quota_limiter.rs index f382964c4d1..4d5ca82c7d9 100644 --- a/components/tikv_util/src/quota_limiter.rs +++ b/components/tikv_util/src/quota_limiter.rs @@ -233,6 +233,12 @@ impl QuotaLimiter { self.enable_auto_tune.load(Ordering::Relaxed) } + pub fn total_read_bytes_consumed(&self, is_foreground: bool) -> usize { + self.get_limiters(is_foreground) + .read_bandwidth_limiter + .total_bytes_consumed() + } + // To generate a sampler. 
pub fn new_sample(&self, is_foreground: bool) -> Sample { Sample { diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 4371e8999ce..481e533a879 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -60,7 +60,7 @@ fn test_deadline_3() { ]; let product = ProductTable::new(); - let (_, endpoint) = { + let (_, endpoint, _) = { let engine = tikv::storage::TestEngineBuilder::new().build().unwrap(); let cfg = tikv::server::Config { end_point_request_max_handle_duration: tikv_util::config::ReadableDuration::secs(1), @@ -174,7 +174,7 @@ fn test_region_error_in_scan() { let (_cluster, raft_engine, mut ctx) = new_raft_engine(1, ""); ctx.set_isolation_level(IsolationLevel::Si); - let (_, endpoint) = + let (_, endpoint, _) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); fail::cfg("region_snapshot_seek", "return()").unwrap(); diff --git a/tests/integrations/coprocessor/test_analyze.rs b/tests/integrations/coprocessor/test_analyze.rs index 04f10fa08f1..0ce4623ac15 100644 --- a/tests/integrations/coprocessor/test_analyze.rs +++ b/tests/integrations/coprocessor/test_analyze.rs @@ -114,7 +114,7 @@ fn test_analyze_column_with_lock() { let product = ProductTable::new(); for &iso_level in &[IsolationLevel::Si, IsolationLevel::Rc] { - let (_, endpoint) = init_data_with_commit(&product, &data, false); + let (_, endpoint, _) = init_data_with_commit(&product, &data, false); let mut req = new_analyze_column_req(&product, 3, 3, 3, 3, 4, 32); let mut ctx = Context::default(); @@ -149,7 +149,7 @@ fn test_analyze_column() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); let req = new_analyze_column_req(&product, 3, 3, 3, 3, 4, 32); let resp = handle_request(&endpoint, req); @@ -181,7 +181,7 @@ fn 
test_analyze_single_primary_column() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); let req = new_analyze_column_req(&product, 1, 3, 3, 3, 4, 32); let resp = handle_request(&endpoint, req); @@ -206,7 +206,7 @@ fn test_analyze_index_with_lock() { let product = ProductTable::new(); for &iso_level in &[IsolationLevel::Si, IsolationLevel::Rc] { - let (_, endpoint) = init_data_with_commit(&product, &data, false); + let (_, endpoint, _) = init_data_with_commit(&product, &data, false); let mut req = new_analyze_index_req(&product, 3, product["name"].index, 4, 32, 0, 1); let mut ctx = Context::default(); @@ -246,7 +246,7 @@ fn test_analyze_index() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); let req = new_analyze_index_req(&product, 3, product["name"].index, 4, 32, 2, 2); let resp = handle_request(&endpoint, req); @@ -288,7 +288,7 @@ fn test_analyze_sampling_reservoir() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); // Pass the 2nd column as a column group. let req = new_analyze_sampling_req(&product, 1, 5, 0.0); @@ -320,7 +320,7 @@ fn test_analyze_sampling_bernoulli() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); // Pass the 2nd column as a column group. 
let req = new_analyze_sampling_req(&product, 1, 0, 0.5); @@ -346,7 +346,7 @@ fn test_invalid_range() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, true); + let (_, endpoint, _) = init_data_with_commit(&product, &data, true); let mut req = new_analyze_index_req(&product, 3, product["name"].index, 4, 32, 0, 1); let mut key_range = KeyRange::default(); key_range.set_start(b"xxx".to_vec()); diff --git a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index db96393c860..2983414b9cc 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -46,7 +46,7 @@ fn test_checksum() { ]; let product = ProductTable::new(); - let (store, endpoint) = init_data_with_commit(&product, &data, true); + let (store, endpoint, _) = init_data_with_commit(&product, &data, true); for column in &[&product["id"], &product["name"], &product["count"]] { assert!(column.index >= 0); diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 660e88905e4..952516daf35 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -19,7 +19,7 @@ use tikv::{ server::Config, storage::TestEngineBuilder, }; -use tikv_util::codec::number::*; +use tikv_util::{codec::number::*, config::ReadableSize}; use tipb::{ AnalyzeColumnsReq, AnalyzeReq, AnalyzeType, ChecksumRequest, Chunk, Expr, ExprType, ScalarFuncSig, SelectResponse, @@ -61,10 +61,15 @@ fn test_select() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_with_data(&product, &data); + let (_, endpoint, limiter) = init_with_data_ext(&product, &data); + limiter.set_read_bandwidth_limit(ReadableSize::kb(1), true); // for dag selection let req = DagSelect::from(&product).build(); let mut resp = handle_select(&endpoint, req); + let mut total_chunk_size = 0; + for chunk in 
resp.get_chunks() { + total_chunk_size += chunk.get_rows_data().len(); + } let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); for (row, (id, name, cnt)) in spliter.zip(data) { let name_datum = name.map(|s| s.as_bytes()).into(); @@ -76,6 +81,7 @@ fn test_select() { let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); assert_eq!(result_encoded, &*expected_encoded); } + assert_eq!(limiter.total_read_bytes_consumed(true), total_chunk_size); // the consume_sample is called due to read bytes quota } #[test] @@ -89,7 +95,7 @@ fn test_batch_row_limit() { let batch_row_limit = 3; let chunk_datum_limit = batch_row_limit * 3; // we have 3 fields. let product = ProductTable::new(); - let (_, endpoint) = { + let (_, endpoint, _) = { let engine = TestEngineBuilder::new().build().unwrap(); let mut cfg = Config::default(); cfg.end_point_batch_row_limit = batch_row_limit; @@ -125,7 +131,7 @@ fn test_stream_batch_row_limit() { let product = ProductTable::new(); let stream_row_limit = 2; - let (_, endpoint) = { + let (_, endpoint, _) = { let engine = TestEngineBuilder::new().build().unwrap(); let mut cfg = Config::default(); cfg.end_point_stream_batch_row_limit = stream_row_limit; @@ -198,7 +204,7 @@ fn test_select_after_lease() { let product = ProductTable::new(); let (cluster, raft_engine, ctx) = new_raft_engine(1, ""); - let (_, endpoint) = + let (_, endpoint, _) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); // Sleep until the leader lease is expired. 
@@ -228,7 +234,7 @@ fn test_scan_detail() { ]; let product = ProductTable::new(); - let (_, endpoint) = { + let (_, endpoint, _) = { let engine = TestEngineBuilder::new().build().unwrap(); let mut cfg = Config::default(); cfg.end_point_batch_row_limit = 50; @@ -1605,7 +1611,7 @@ fn test_key_is_locked_for_primary() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, false); + let (_, endpoint, _) = init_data_with_commit(&product, &data, false); let req = DagSelect::from(&product).build(); let resp = handle_request(&endpoint, req); @@ -1623,7 +1629,7 @@ fn test_key_is_locked_for_index() { ]; let product = ProductTable::new(); - let (_, endpoint) = init_data_with_commit(&product, &data, false); + let (_, endpoint, _) = init_data_with_commit(&product, &data, false); let req = DagSelect::from_index(&product, &product["name"]).build(); let resp = handle_request(&endpoint, req); @@ -1700,7 +1706,7 @@ fn test_snapshot_failed() { let product = ProductTable::new(); let (_cluster, raft_engine, ctx) = new_raft_engine(1, ""); - let (_, endpoint) = init_data_with_engine_and_commit(ctx, raft_engine, &product, &[], true); + let (_, endpoint, _) = init_data_with_engine_and_commit(ctx, raft_engine, &product, &[], true); // Use an invalid context to make errors. 
let req = DagSelect::from(&product).build_with(Context::default(), &[0]); @@ -1721,7 +1727,7 @@ fn test_cache() { let product = ProductTable::new(); let (_cluster, raft_engine, ctx) = new_raft_engine(1, ""); - let (_, endpoint) = + let (_, endpoint, _) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); let req = DagSelect::from(&product).build_with(ctx, &[0]); @@ -1796,7 +1802,7 @@ fn test_copr_bypass_or_access_locks() { (8, Some("name:8"), 8), ]; // lock row 3, 4, 6 - let (mut store, endpoint) = init_data_with_engine_and_commit( + let (mut store, endpoint, _) = init_data_with_engine_and_commit( Default::default(), store.get_engine(), &product, @@ -1912,7 +1918,7 @@ fn test_rc_read() { ]; // uncommitted lock to be ignored - let (store, _) = init_data_with_engine_and_commit( + let (store, ..) = init_data_with_engine_and_commit( Default::default(), store.get_engine(), &product, @@ -1921,7 +1927,7 @@ fn test_rc_read() { ); // committed lock to be read - let (mut store, endpoint) = init_data_with_engine_and_commit( + let (mut store, endpoint, _) = init_data_with_engine_and_commit( Default::default(), store.get_engine(), &product, @@ -1970,7 +1976,7 @@ fn test_buckets() { let product = ProductTable::new(); let (mut cluster, raft_engine, ctx) = new_raft_engine(1, ""); - let (_, endpoint) = + let (_, endpoint, _) = init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &[], true); let req = DagSelect::from(&product).build_with(ctx, &[0]); From 2c2c005b2ed609e60717fab25e7da9b708dac265 Mon Sep 17 00:00:00 2001 From: YangKeao Date: Tue, 23 Aug 2022 02:40:21 -0400 Subject: [PATCH 166/676] copr: fix compatiblity of `json_extract` function (#13299) close tikv/tikv#13297 Signed-off-by: YangKeao --- .../src/codec/mysql/json/json_extract.rs | 196 ++++++++++++++++-- .../src/codec/mysql/json/mod.rs | 13 ++ 2 files changed, 186 insertions(+), 23 deletions(-) diff --git 
a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs index f7c1198c542..d40451fc9b5 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs @@ -1,5 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +use collections::HashSet; + use super::{ super::Result, path_expr::{PathExpression, PathLeg, PATH_EXPR_ARRAY_INDEX_ASTERISK, PATH_EXPR_ASTERISK}, @@ -14,21 +16,59 @@ impl<'a> JsonRef<'a> { /// /// See `Extract()` in TiDB `json.binary_function.go` pub fn extract(&self, path_expr_list: &[PathExpression]) -> Result> { + let mut could_return_multiple_matches = path_expr_list.len() > 1; + let mut elem_list = Vec::with_capacity(path_expr_list.len()); for path_expr in path_expr_list { + could_return_multiple_matches |= path_expr.contains_any_asterisk(); elem_list.append(&mut extract_json(*self, &path_expr.legs)?) } if elem_list.is_empty() { - return Ok(None); + Ok(None) + } else if could_return_multiple_matches { + Ok(Some(Json::from_array( + elem_list.drain(..).map(|j| j.to_owned()).collect(), + )?)) + } else { + Ok(Some(elem_list.remove(0).to_owned())) } - if path_expr_list.len() == 1 && elem_list.len() == 1 { - // If path_expr contains asterisks, elem_list.len() won't be 1 - // even if path_expr_list.len() equals to 1. 
- return Ok(Some(elem_list.remove(0).to_owned())); + } +} + +#[derive(Eq)] +struct RefEqualJsonWrapper<'a>(JsonRef<'a>); + +impl<'a> PartialEq for RefEqualJsonWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.0.ref_eq(&other.0) + } +} + +impl<'a> std::hash::Hash for RefEqualJsonWrapper<'a> { + fn hash(&self, state: &mut H) { + self.0.value.as_ptr().hash(state) + } +} + +// append the elem_list vector, if the referenced json object doesn't exist +// unlike the append in std, this function **doesn't** set the `other` length to +// 0 +// +// To use this function, you have to ensure both `elem_list` and `other` are +// unique. +fn append_if_ref_unique<'a>(elem_list: &mut Vec>, other: &Vec>) { + elem_list.reserve(other.len()); + + let mut unique_verifier = HashSet::>::with_hasher(Default::default()); + for elem in elem_list.iter() { + unique_verifier.insert(RefEqualJsonWrapper(*elem)); + } + + for elem in other { + let elem = RefEqualJsonWrapper(*elem); + if !unique_verifier.contains(&elem) { + elem_list.push(elem.0); } - Ok(Some(Json::from_array( - elem_list.drain(..).map(|j| j.to_owned()).collect(), - )?)) } } @@ -45,18 +85,21 @@ pub fn extract_json<'a>(j: JsonRef<'a>, path_legs: &[PathLeg]) -> Result { if i as usize == 0 { - ret.append(&mut extract_json(j, sub_path_legs)?) + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) } } }, @@ -65,27 +108,36 @@ pub fn extract_json<'a>(j: JsonRef<'a>, path_legs: &[PathLeg]) -> Result { - ret.append(&mut extract_json(j, sub_path_legs)?); + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?); match j.get_type() { JsonType::Array => { let elem_count = j.get_elem_count(); for k in 0..elem_count { - ret.append(&mut extract_json(j.array_get_elem(k)?, sub_path_legs)?) 
+ append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(k)?, path_legs)?, + ) } } JsonType::Object => { let elem_count = j.get_elem_count(); for i in 0..elem_count { - ret.append(&mut extract_json(j.object_get_val(i)?, sub_path_legs)?) + append_if_ref_unique( + &mut ret, + &extract_json(j.object_get_val(i)?, path_legs)?, + ) } } _ => {} @@ -257,7 +309,7 @@ mod tests { legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], - Some("false"), + Some("[false]"), ), ( r#"[{"a": "a1", "b": 20.08, "c": false}, true]"#, @@ -265,7 +317,101 @@ mod tests { legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], - Some("false"), + Some("[false]"), + ), + ( + r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, + vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6]"), + ), + ( + r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, + vec![ + PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }, + PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }, + ], + Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6, [0, 1], 0, 1, 2, 3, 4, 5, 6]"), + ), + ( + "[1]", + vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some("[1]"), + ), + ( + r#"{"a": 1}"#, + vec![PathExpression { + legs: vec![PathLeg::Key(String::from("a")), PathLeg::Index(0)], + flags: PathExpressionFlag::default(), + }], + Some("1"), + ), + ( + r#"{"a": 1}"#, + vec![PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some(r#"[{"a": 1}, 1]"#), + ), + ( + r#"{"a": 1}"#, + 
vec![PathExpression { + legs: vec![ + PathLeg::Index(0), + PathLeg::Index(0), + PathLeg::Index(0), + PathLeg::Key(String::from("a")), + ], + flags: PathExpressionFlag::default(), + }], + Some(r#"1"#), + ), + ( + r#"[1, [[{"x": [{"a":{"b":{"c":42}}}]}]]]"#, + vec![PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(String::from("a")), + PathLeg::Key(String::from("*")), + ], + flags: PATH_EXPRESSION_CONTAINS_ASTERISK + | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }], + Some(r#"[{"c": 42}]"#), + ), + ( + r#"[{"a": [3,4]}, {"b": 2 }]"#, + vec![ + PathExpression { + legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + flags: PathExpressionFlag::default(), + }, + PathExpression { + legs: vec![PathLeg::Index(1), PathLeg::Key(String::from("a"))], + flags: PathExpressionFlag::default(), + }, + ], + Some("[[3, 4]]"), + ), + ( + r#"[{"a": [1,1,1,1]}]"#, + vec![PathExpression { + legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + flags: PathExpressionFlag::default(), + }], + Some("[1, 1, 1, 1]"), ), ]; for (i, (js, exprs, expected)) in test_cases.drain(..).enumerate() { @@ -276,11 +422,15 @@ mod tests { Some(es) => { let e = Json::from_str(es); assert!(e.is_ok(), "#{} expect parse json ok but got {:?}", i, e); - Some(e.unwrap()) + Some(e.unwrap().to_string()) } None => None, }; - let got = j.as_ref().extract(&exprs[..]).unwrap(); + let got = j + .as_ref() + .extract(&exprs[..]) + .unwrap() + .map(|got| got.to_string()); assert_eq!( got, expected, "#{} expect {:?}, but got {:?}", diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 8967ab71eeb..480ac5db129 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -219,6 +219,19 @@ impl<'a> JsonRef<'a> { JsonType::String => false, } } + + // Returns whether the two JsonRef references to the same + // json object. 
+ // + // As the JsonRef exists and holds the reference to the Json + // , the `Vec` inside the Json cannot be changed, so comparing + // the pointer is enough to represent the reference equality. + // + // PartialEq and PartialCmp have been implemented for JsonRef + // to compare the value. + pub(crate) fn ref_eq(&self, other: &JsonRef<'a>) -> bool { + std::ptr::eq(self.value, other.value) + } } /// Json implements type json used in tikv by Binary Json. From 7861f56f6249ea6b4cc19a6b2ba7d7dbd2a63c25 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 22 Aug 2022 23:56:20 -0700 Subject: [PATCH 167/676] raftstore-v2: support status query (#13300) ref tikv/tikv#12842 And as an example to show how to setup test case. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 16 ++- components/raftstore-v2/src/fsm/peer.rs | 33 +++++- components/raftstore-v2/src/fsm/store.rs | 8 +- components/raftstore-v2/src/lib.rs | 3 +- components/raftstore-v2/src/operation/mod.rs | 2 +- .../src/operation/{read => query}/local.rs | 0 .../raftstore-v2/src/operation/query/mod.rs | 82 ++++++++++++++ .../raftstore-v2/src/operation/read/mod.rs | 9 -- .../src/operation/ready/async_writer.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 4 +- components/raftstore-v2/src/raft/peer.rs | 32 +++++- components/raftstore-v2/src/router/message.rs | 66 ++++------- components/raftstore-v2/src/router/mod.rs | 2 +- .../src/router/response_channel.rs | 12 +- .../raftstore-v2/tests/integrations/mod.rs | 105 ++++++++++++++++-- .../tests/integrations/test_election.rs | 10 -- .../tests/integrations/test_status.rs | 50 +++++++++ components/tikv_util/src/config.rs | 2 + scripts/check-license | 2 +- 19 files changed, 334 insertions(+), 106 deletions(-) rename components/raftstore-v2/src/operation/{read => query}/local.rs (100%) create mode 100644 components/raftstore-v2/src/operation/query/mod.rs delete mode 100644 components/raftstore-v2/src/operation/read/mod.rs 
delete mode 100644 components/raftstore-v2/tests/integrations/test_election.rs create mode 100644 components/raftstore-v2/tests/integrations/test_status.rs diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index d4cba3d9381..d30490f50d5 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -34,7 +34,8 @@ use super::apply::{create_apply_batch_system, ApplyPollerBuilder, ApplyRouter, A use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate}, raft::Peer, - Error, PeerMsg, PeerTick, Result, StoreMsg, + router::{PeerMsg, PeerTick, StoreMsg}, + Error, Result, }; /// A per-thread context shared by the [`StoreFsm`] and multiple [`PeerFsm`]s. @@ -139,6 +140,9 @@ impl PollHandler F: FnOnce(&'a batch_system::Config), { + if self.store_msg_buf.capacity() == 0 || self.peer_msg_buf.capacity() == 0 { + self.apply_buf_capacity(); + } // Apply configuration changes. if let Some(cfg) = self.cfg_tracker.any_new().map(|c| c.clone()) { let last_messages_per_tick = self.messages_per_tick(); @@ -152,8 +156,9 @@ impl PollHandler Option { debug_assert!(self.store_msg_buf.is_empty()); - let received_cnt = fsm.recv(&mut self.store_msg_buf); - let expected_msg_count = if received_cnt == self.messages_per_tick() { + let batch_size = self.messages_per_tick(); + let received_cnt = fsm.recv(&mut self.store_msg_buf, batch_size); + let expected_msg_count = if received_cnt == batch_size { None } else { Some(0) @@ -165,8 +170,9 @@ impl PollHandler>) -> HandleResult { debug_assert!(self.peer_msg_buf.is_empty()); - let received_cnt = fsm.recv(&mut self.peer_msg_buf); - let handle_result = if received_cnt == self.messages_per_tick() { + let batch_size = self.messages_per_tick(); + let received_cnt = fsm.recv(&mut self.peer_msg_buf, batch_size); + let handle_result = if received_cnt == batch_size { HandleResult::KeepProcessing } else { HandleResult::stop_at(0, false) 
diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 307da362330..886d8b2323a 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -13,9 +13,15 @@ use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver, Sender}, + time::{duration_to_sec, Instant}, }; -use crate::{batch::StoreContext, raft::Peer, PeerMsg, PeerTick, Result}; +use crate::{ + batch::StoreContext, + raft::Peer, + router::{PeerMsg, PeerTick}, + Result, +}; pub type SenderFsmPair = (LooseBoundedSender, Box>); @@ -62,9 +68,9 @@ impl PeerFsm { /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. - pub fn recv(&mut self, peer_msg_buf: &mut Vec) -> usize { + pub fn recv(&mut self, peer_msg_buf: &mut Vec, batch_size: usize) -> usize { let l = peer_msg_buf.len(); - for i in l..peer_msg_buf.capacity() { + for i in l..batch_size { match self.receiver.try_recv() { Ok(msg) => peer_msg_buf.push(msg), Err(e) => { @@ -75,7 +81,7 @@ impl PeerFsm { } } } - peer_msg_buf.capacity() - l + batch_size - l } } @@ -167,6 +173,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.schedule_tick(PeerTick::Raft); } + #[inline] + fn on_receive_command(&self, send_time: Instant) { + self.store_ctx + .raft_metrics + .propose_wait_time + .observe(duration_to_sec(send_time.saturating_elapsed()) as f64); + } + fn on_tick(&mut self, tick: PeerTick) { match tick { PeerTick::Raft => self.on_raft_tick(), @@ -187,8 +201,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, for msg in peer_msgs_buf.drain(..) 
{ match msg { PeerMsg::RaftMessage(_) => unimplemented!(), - PeerMsg::RaftQuery(_) => unimplemented!(), - PeerMsg::RaftCommand(_) => unimplemented!(), + PeerMsg::RaftQuery(cmd) => { + self.on_receive_command(cmd.send_time); + self.on_query(cmd.request, cmd.ch) + } + PeerMsg::RaftCommand(cmd) => { + self.on_receive_command(cmd.send_time); + // self.on_command(cmd.cmd.request, cmd.ch) + unimplemented!() + } PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => unimplemented!(), PeerMsg::Start => self.on_start(), diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index d80cd90d80b..61a3f76b138 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -8,7 +8,7 @@ use kvproto::metapb::Store; use raftstore::store::{Config, ReadDelegate}; use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; -use crate::{batch::StoreContext, tablet::CachedTablet, StoreMsg}; +use crate::{batch::StoreContext, router::StoreMsg, tablet::CachedTablet}; pub struct StoreMeta where @@ -53,15 +53,15 @@ impl StoreFsm { /// capacity is reached or there is no more pending messages. /// /// Returns how many messages are fetched. 
- pub fn recv(&self, store_msg_buf: &mut Vec) -> usize { + pub fn recv(&self, store_msg_buf: &mut Vec, batch_size: usize) -> usize { let l = store_msg_buf.len(); - for i in l..store_msg_buf.capacity() { + for i in l..batch_size { match self.receiver.try_recv() { Ok(msg) => store_msg_buf.push(msg), Err(_) => return i - l, } } - store_msg_buf.capacity() - l + batch_size - l } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 43998160638..0b890d4a177 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -29,11 +29,10 @@ mod bootstrap; mod fsm; mod operation; mod raft; -mod router; +pub mod router; mod tablet; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use raftstore::{Error, Result}; -pub use router::{PeerMsg, PeerTick, StoreMsg, StoreTick}; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 583053dd551..c352ffe0cc1 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -mod read; +mod query; mod ready; pub use ready::AsyncWriter; diff --git a/components/raftstore-v2/src/operation/read/local.rs b/components/raftstore-v2/src/operation/query/local.rs similarity index 100% rename from components/raftstore-v2/src/operation/read/local.rs rename to components/raftstore-v2/src/operation/query/local.rs diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs new file mode 100644 index 00000000000..ff03117419b --- /dev/null +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -0,0 +1,82 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! There are two types of Query: KV read and status query. +//! +//! 
KV Read is implemented in local module and lease module (not implemented +//! yet). Read will be executed in callee thread if in lease, which is +//! implemented in local module. If lease is expired, it will extend the lease +//! first. Lease maintainance is implemented in lease module. +//! +//! Status query is implemented in the root module directly. + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse, StatusCmdType}; +use raftstore::{ + store::{cmd_resp, util, ReadCallback}, + Error, Result, +}; +use tikv_util::box_err; + +use crate::{ + fsm::PeerFsmDelegate, + raft::Peer, + router::{QueryResChannel, QueryResult}, +}; + +mod local; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_query(&mut self, req: RaftCmdRequest, ch: QueryResChannel) { + if !req.has_status_request() { + unimplemented!(); + } else { + self.fsm.peer_mut().on_query_status(&req, ch); + } + } +} + +impl Peer { + /// Status command is used to query target region information. 
+ #[inline] + fn on_query_status(&mut self, req: &RaftCmdRequest, ch: QueryResChannel) { + let mut response = RaftCmdResponse::default(); + if let Err(e) = self.query_status(req, &mut response) { + cmd_resp::bind_error(&mut response, e); + } + ch.set_result(QueryResult::Response(response)); + } + + fn query_status(&mut self, req: &RaftCmdRequest, resp: &mut RaftCmdResponse) -> Result<()> { + util::check_store_id(req, self.peer().get_store_id())?; + let cmd_type = req.get_status_request().get_cmd_type(); + let status_resp = resp.mut_status_response(); + status_resp.set_cmd_type(cmd_type); + match cmd_type { + StatusCmdType::RegionLeader => { + if let Some(leader) = self.leader() { + status_resp.mut_region_leader().set_leader(leader); + } + } + StatusCmdType::RegionDetail => { + if !self.storage().is_initialized() { + let region_id = req.get_header().get_region_id(); + return Err(Error::RegionNotInitialized(region_id)); + } + status_resp + .mut_region_detail() + .set_region(self.region().clone()); + if let Some(leader) = self.leader() { + status_resp.mut_region_detail().set_leader(leader); + } + } + StatusCmdType::InvalidStatus => { + return Err(box_err!("{:?} invalid status command!", self.logger.list())); + } + } + + // Bind peer current term here. + cmd_resp::bind_term(resp, self.term()); + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/read/mod.rs b/components/raftstore-v2/src/operation/read/mod.rs deleted file mode 100644 index efbe6af1a5a..00000000000 --- a/components/raftstore-v2/src/operation/read/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -//! There are two types of read: -//! - If the ReadDelegate is in the leader lease status, the read is operated -//! locally and need not to go through the raft layer (namely local read). -//! - Otherwise, redirect the request to the raftstore and proposed as a -//! RaftCommand in the raft layer. 
- -mod local; diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index 457df9307ba..e0b2a1c4802 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -16,7 +16,7 @@ use slog::{warn, Logger}; use crate::{ batch::{StoreContext, StoreRouter}, - PeerMsg, + router::PeerMsg, }; #[derive(Debug)] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 156ea55a414..1be4b0ee546 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -32,7 +32,7 @@ use crate::{ batch::StoreContext, fsm::{PeerFsm, PeerFsmDelegate}, raft::{Peer, Storage}, - PeerTick, + router::PeerTick, }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -95,7 +95,7 @@ impl Peer { ctx: &mut StoreContext, msg: eraftpb::Message, ) -> Option { - let to_peer = match self.get_peer_from_cache(msg.to) { + let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index eb61d744774..6fd7b4b444c 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -88,14 +88,22 @@ impl Peer { None }; - Ok(Some(Peer { + let mut peer = Peer { raft_group: RawNode::new(&raft_cfg, s, &logger)?, tablet: CachedTablet::new(tablet), has_ready: false, async_writer: AsyncWriter::new(region_id, peer_id), logger, peer_cache: vec![], - })) + }; + + // If this region has only one peer and I am the one, campaign directly. 
+ let region = peer.region(); + if region.get_peers().len() == 1 && region.get_peers()[0].get_store_id() == store_id { + peer.raft_group.campaign()?; + } + + Ok(Some(peer)) } #[inline] @@ -193,7 +201,7 @@ impl Peer { } #[inline] - pub fn get_peer_from_cache(&self, peer_id: u64) -> Option { + pub fn peer_from_cache(&self, peer_id: u64) -> Option { for p in self.raft_group.store().region().get_peers() { if p.get_id() == peer_id { return Some(p.clone()); @@ -210,6 +218,24 @@ impl Peer { self.raft_group.raft.state == StateRole::Leader } + #[inline] + pub fn leader_id(&self) -> u64 { + self.raft_group.raft.leader_id + } + + /// Get the leader peer meta. + /// + /// `None` is returned if there is no leader or the meta can't be found. + #[inline] + pub fn leader(&self) -> Option { + let leader_id = self.leader_id(); + if leader_id != 0 { + self.peer_from_cache(leader_id) + } else { + None + } + } + /// Term of the state machine. #[inline] pub fn term(&self) -> u64 { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 3f0dadaed04..72e6149d7ad 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -15,7 +15,7 @@ use raftstore::store::{ use tikv_util::time::Instant; use super::{ - response_channel::{CmdResChannel, QueryResChannel}, + response_channel::{CmdResChannel, CmdResSubscriber, QueryResChannel, QueryResSubscriber}, ApplyRes, }; @@ -95,49 +95,17 @@ impl StoreTick { } /// Command that can be handled by raftstore. -pub struct RaftRequest { +pub struct RaftRequest { pub send_time: Instant, pub request: RaftCmdRequest, + pub ch: C, } -impl RaftRequest { - pub fn new(request: RaftCmdRequest) -> Self { +impl RaftRequest { + pub fn new(request: RaftCmdRequest, ch: C) -> Self { RaftRequest { send_time: Instant::now(), request, - } - } -} - -/// A query that won't change any state. So it doesn't have to be replicated to -/// all replicas. 
-pub struct RaftQuery { - pub req: RaftRequest, - pub ch: QueryResChannel, -} - -impl RaftQuery { - #[inline] - pub fn new(request: RaftCmdRequest, ch: QueryResChannel) -> Self { - Self { - req: RaftRequest::new(request), - ch, - } - } -} - -/// Commands that change the inernal states. It will be transformed into logs -/// and reach consensus in the raft group. -pub struct RaftCommand { - pub cmd: RaftRequest, - pub ch: CmdResChannel, -} - -impl RaftCommand { - #[inline] - pub fn new(request: RaftCmdRequest, ch: CmdResChannel) -> Self { - Self { - cmd: RaftRequest::new(request), ch, } } @@ -149,12 +117,12 @@ pub enum PeerMsg { /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. RaftMessage(InspectedRaftMessage), - /// Read command only involves read operations, they are usually processed - /// using lease or read index. - RaftQuery(RaftQuery), - /// Proposal needs to be processed by all peers in a raft group. They will - /// be transformed into logs and be proposed by the leader peer. - RaftCommand(RaftCommand), + /// Query won't change any state. A typical query is KV read. In most cases, + /// it will be processed using lease or read index. + RaftQuery(RaftRequest), + /// Command changes the inernal states. It will be transformed into logs and + /// applied on all replicas. + RaftCommand(RaftRequest), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. 
Tick(PeerTick), @@ -172,6 +140,18 @@ pub enum PeerMsg { }, } +impl PeerMsg { + pub fn raft_query(req: RaftCmdRequest) -> (Self, QueryResSubscriber) { + let (ch, sub) = QueryResChannel::pair(); + (PeerMsg::RaftQuery(RaftRequest::new(req, ch)), sub) + } + + pub fn raft_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + (PeerMsg::RaftCommand(RaftRequest::new(req, ch)), sub) + } +} + impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 4a1df09fa44..17250833168 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -8,6 +8,6 @@ mod response_channel; pub(crate) use self::internal_message::ApplyTask; pub use self::{ internal_message::ApplyRes, - message::{PeerMsg, PeerTick, RaftCommand, RaftQuery, RaftRequest, StoreMsg, StoreTick}, + message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{CmdResChannel, QueryResChannel, QueryResult}, }; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index fe84ae3c3ef..ae43bd07c25 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -200,11 +200,11 @@ impl<'a, Res> Future for WaitResult<'a, Res> { } } -pub struct CommandResultSubscriber { +pub struct CmdResSubscriber { core: Arc>, } -impl CommandResultSubscriber { +impl CmdResSubscriber { pub async fn wait_proposed(&mut self) -> bool { WaitEvent { event: CmdResChannel::PROPOSED_EVENT, @@ -226,8 +226,8 @@ impl CommandResultSubscriber { } } -unsafe impl Send for CommandResultSubscriber {} -unsafe impl Sync for CommandResultSubscriber {} +unsafe impl Send for CmdResSubscriber {} +unsafe impl Sync for CmdResSubscriber {} pub struct CmdResChannel { core: 
ManuallyDrop>>, @@ -239,7 +239,7 @@ impl CmdResChannel { const COMMITTED_EVENT: u64 = 2; #[inline] - pub fn pair() -> (Self, CommandResultSubscriber) { + pub fn pair() -> (Self, CmdResSubscriber) { let core = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), @@ -249,7 +249,7 @@ impl CmdResChannel { Self { core: ManuallyDrop::new(core.clone()), }, - CommandResultSubscriber { core }, + CmdResSubscriber { core }, ) } } diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index d93cd09fc62..d922020cbcb 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -8,9 +8,13 @@ #![allow(dead_code)] #![allow(unused_imports)] -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, +use std::{ + ops::{Deref, DerefMut}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, }; use crossbeam::channel::{self, Receiver, Sender}; @@ -20,18 +24,55 @@ use engine_test::{ raft::RaftTestEngine, }; use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; -use kvproto::{metapb::Store, raft_serverpb::RaftMessage}; +use futures::executor::block_on; +use kvproto::{ + metapb::Store, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; use pd_client::RpcClient; use raftstore::store::{Config, Transport, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::{create_store_batch_system, Bootstrap, StoreRouter, StoreSystem}; +use raftstore_v2::{ + create_store_batch_system, + router::{PeerMsg, QueryResult}, + Bootstrap, StoreRouter, StoreSystem, +}; use slog::{o, Logger}; use tempfile::TempDir; use test_pd::mocker::Service; -use tikv_util::config::VersionTrack; +use tikv_util::config::{ReadableDuration, VersionTrack}; + +mod test_status; + +struct TestRouter(StoreRouter); + +impl Deref for TestRouter { + type Target = StoreRouter; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for 
TestRouter { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} -mod test_election; +impl TestRouter { + fn query(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::raft_query(req); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } -type TestRouter = StoreRouter; + fn command(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::raft_command(req); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } +} struct TestNode { _pd_server: test_pd::Server, @@ -41,6 +82,7 @@ struct TestNode { raft_engine: Option, factory: Option>, system: Option>, + cfg: Option>>, logger: Logger, } @@ -93,13 +135,14 @@ impl TestNode { raft_engine: Some(raft_engine), factory: Some(factory), system: None, + cfg: None, logger, } } fn start( &mut self, - cfg: &Arc>, + cfg: Arc>, trans: impl Transport + 'static, ) -> TestRouter { let (router, mut system) = create_store_batch_system::( @@ -117,8 +160,13 @@ impl TestNode { &router, ) .unwrap(); + self.cfg = Some(cfg); self.system = Some(system); - router + TestRouter(router) + } + + fn config(&self) -> &Arc> { + self.cfg.as_ref().unwrap() } fn stop(&mut self) { @@ -165,10 +213,43 @@ impl Transport for TestTransport { } } +// TODO: remove following when we finally integrate it in tikv-server binary. +fn v2_default_config() -> Config { + let mut config = Config::default(); + config.store_io_pool_size = 1; + config +} + +/// Disable all ticks, so test case can schedule manually. 
+fn disable_all_auto_ticks(cfg: &mut Config) { + cfg.raft_base_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_gc_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_compact_sync_interval = ReadableDuration::ZERO; + cfg.raft_engine_purge_interval = ReadableDuration::ZERO; + cfg.split_region_check_tick_interval = ReadableDuration::ZERO; + cfg.region_compact_check_interval = ReadableDuration::ZERO; + cfg.pd_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.pd_store_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.snap_mgr_gc_tick_interval = ReadableDuration::ZERO; + cfg.lock_cf_compact_interval = ReadableDuration::ZERO; + cfg.peer_stale_state_check_interval = ReadableDuration::ZERO; + cfg.consistency_check_interval = ReadableDuration::ZERO; + cfg.report_region_flow_interval = ReadableDuration::ZERO; + cfg.check_leader_lease_interval = ReadableDuration::ZERO; + cfg.merge_check_tick_interval = ReadableDuration::ZERO; + cfg.cleanup_import_sst_interval = ReadableDuration::ZERO; + cfg.inspect_interval = ReadableDuration::ZERO; + cfg.report_min_resolved_ts_interval = ReadableDuration::ZERO; + cfg.reactive_memory_lock_tick_interval = ReadableDuration::ZERO; + cfg.report_region_buckets_tick_interval = ReadableDuration::ZERO; + cfg.check_long_uncommitted_interval = ReadableDuration::ZERO; +} + fn setup_default_cluster() -> (TestNode, Receiver, TestRouter) { let mut node = TestNode::new(); - let cfg = Default::default(); + let mut cfg = v2_default_config(); + disable_all_auto_ticks(&mut cfg); let (tx, rx) = new_test_transport(); - let router = node.start(&cfg, tx); + let router = node.start(Arc::new(VersionTrack::new(cfg)), tx); (node, rx, router) } diff --git a/components/raftstore-v2/tests/integrations/test_election.rs b/components/raftstore-v2/tests/integrations/test_election.rs deleted file mode 100644 index cf3a0cc4906..00000000000 --- a/components/raftstore-v2/tests/integrations/test_election.rs +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright 2022 
TiKV Project Authors. Licensed under Apache-2.0. - -use raftstore_v2::PeerMsg; - -// TODO: finish test case when callback is added. -#[test] -fn test_smoke() { - let (_node, _transport, router) = super::setup_default_cluster(); - router.send(2, PeerMsg::Noop).unwrap(); -} diff --git a/components/raftstore-v2/tests/integrations/test_status.rs b/components/raftstore-v2/tests/integrations/test_status.rs new file mode 100644 index 00000000000..7b0d71c9589 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_status.rs @@ -0,0 +1,50 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::assert_matches::assert_matches; + +use futures::executor::block_on; +use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; +use raftstore::store::util::new_peer; +use raftstore_v2::router::{PeerMsg, PeerTick, QueryResChannel, QueryResult, RaftRequest}; + +#[test] +fn test_status() { + let (_node, _transport, router) = super::setup_default_cluster(); + // When there is only one peer, it should campaign immediately. 
+ let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(2, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(1, 3) + ); + + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionDetail); + let res = router.query(2, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + let detail = status_resp.get_region_detail(); + assert_eq!(*detail.get_leader(), new_peer(1, 3)); + let region = detail.get_region(); + assert_eq!(region.get_id(), 2); + assert!(region.get_start_key().is_empty()); + assert!(region.get_end_key().is_empty()); + assert_eq!(*region.get_peers(), vec![new_peer(1, 3)]); + assert_eq!(region.get_region_epoch().get_version(), 1); + assert_eq!(region.get_region_epoch().get_conf_ver(), 1); + + // Invalid store id should return error. + req.mut_header().mut_peer().set_store_id(4); + let res = router.query(2, req).unwrap(); + let resp = res.response().unwrap(); + assert!( + resp.get_header().get_error().has_store_not_match(), + "{:?}", + resp + ); + + // TODO: add a peer then check for region change and leadership change. 
+} diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 7e9f22dcb01..e11a4799bc0 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -391,6 +391,8 @@ impl FromStr for ReadableDuration { } impl ReadableDuration { + pub const ZERO: ReadableDuration = ReadableDuration(Duration::ZERO); + pub const fn micros(micros: u64) -> ReadableDuration { ReadableDuration(Duration::from_micros(micros)) } diff --git a/scripts/check-license b/scripts/check-license index 0b35ef67177..c22e712780f 100755 --- a/scripts/check-license +++ b/scripts/check-license @@ -2,7 +2,7 @@ # Check all source files have a license header. set -euo pipefail -for i in $(git ls-files | grep "\.rs"); do +for i in $(git ls-files -o --exclude-standard | grep "\.rs"); do # first line -> match -> print line -> quit matches=$(sed -n "1{/Copyright [0-9]\{4\} TiKV Project Authors. Licensed under Apache-2.0./p;};q;" $i) if [ -z "${matches}" ]; then From 6dcd0b3d66945583a133e60838533a5d12b3a487 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Tue, 23 Aug 2022 15:28:20 +0800 Subject: [PATCH 168/676] raftstore: Implement coprocessor observer pre(post)_apply_snapshot (#12889) ref tikv/tikv#12849 Support new observers pre(post)_apply_snapshot. 
Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 97 +++++++++- components/raftstore/src/coprocessor/mod.rs | 29 +++ components/raftstore/src/store/mod.rs | 2 +- .../raftstore/src/store/peer_storage.rs | 1 + .../raftstore/src/store/worker/region.rs | 175 +++++++++++++++--- 5 files changed, 278 insertions(+), 26 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 7eea973997b..d2c4e14567a 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -506,6 +506,50 @@ impl CoprocessorHost { ); } + pub fn should_pre_apply_snapshot(&self) -> bool { + for observer in &self.registry.apply_snapshot_observers { + let observer = observer.observer.inner(); + if observer.should_pre_apply_snapshot() { + return true; + } + } + false + } + + pub fn pre_apply_snapshot( + &self, + region: &Region, + peer_id: u64, + snap_key: &crate::store::SnapKey, + snap: Option<&crate::store::Snapshot>, + ) { + loop_ob!( + region, + &self.registry.apply_snapshot_observers, + pre_apply_snapshot, + peer_id, + snap_key, + snap, + ); + } + + pub fn post_apply_snapshot( + &self, + region: &Region, + peer_id: u64, + snap_key: &crate::store::SnapKey, + snap: Option<&crate::store::Snapshot>, + ) { + loop_ob!( + region, + &self.registry.apply_snapshot_observers, + post_apply_snapshot, + peer_id, + snap_key, + snap, + ); + } + pub fn new_split_checker_host<'a>( &'a self, region: &Region, @@ -648,7 +692,10 @@ mod tests { }; use tikv_util::box_err; - use crate::coprocessor::*; + use crate::{ + coprocessor::*, + store::{SnapKey, Snapshot}, + }; #[derive(Clone, Default)] struct TestCoprocessor { @@ -675,6 +722,9 @@ mod tests { PostExecQuery = 17, PostExecAdmin = 18, OnComputeEngineSize = 19, + PreApplySnapshot = 20, + PostApplySnapshot = 21, + ShouldPreApplySnapshot = 22, } impl Coprocessor for TestCoprocessor {} 
@@ -842,6 +892,38 @@ mod tests { .fetch_add(ObserverIndex::ApplySst as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_apply_snapshot( + &self, + ctx: &mut ObserverContext<'_>, + _: u64, + _: &SnapKey, + _: Option<&Snapshot>, + ) { + self.called + .fetch_add(ObserverIndex::PreApplySnapshot as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn post_apply_snapshot( + &self, + ctx: &mut ObserverContext<'_>, + _: u64, + _: &crate::store::SnapKey, + _: Option<&Snapshot>, + ) { + self.called + .fetch_add(ObserverIndex::PostApplySnapshot as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + } + + fn should_pre_apply_snapshot(&self) -> bool { + self.called.fetch_add( + ObserverIndex::ShouldPreApplySnapshot as usize, + Ordering::SeqCst, + ); + false + } } impl CmdObserver for TestCoprocessor { @@ -986,6 +1068,19 @@ mod tests { host.post_exec(®ion, &cmd, &apply_state, ®ion_state, &mut info); index += ObserverIndex::PostExecQuery as usize; assert_all!([&ob.called], &[index]); + + let key = SnapKey::new(region.get_id(), 1, 1); + host.pre_apply_snapshot(®ion, 0, &key, None); + index += ObserverIndex::PreApplySnapshot as usize; + assert_all!([&ob.called], &[index]); + + host.post_apply_snapshot(®ion, 0, &key, None); + index += ObserverIndex::PostApplySnapshot as usize; + assert_all!([&ob.called], &[index]); + + host.should_pre_apply_snapshot(); + index += ObserverIndex::ShouldPreApplySnapshot as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index fcbfcfc98ff..cc6bfb91b06 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -179,6 +179,35 @@ pub trait ApplySnapshotObserver: Coprocessor { /// Hook to call after applying sst file. Currently the content of the /// snapshot can't be passed to the observer. 
fn apply_sst(&self, _: &mut ObserverContext<'_>, _: CfName, _path: &str) {} + + /// Hook when receiving Task::Apply. + /// Should pass valid snapshot, the option is only for testing. + /// Notice that we can call `pre_apply_snapshot` to multiple snapshots at + /// the same time. + fn pre_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + _peer_id: u64, + _: &crate::store::SnapKey, + _: Option<&crate::store::Snapshot>, + ) { + } + + /// Hook when the whole snapshot is applied. + /// Should pass valid snapshot, the option is only for testing. + fn post_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + _: u64, + _: &crate::store::SnapKey, + _snapshot: Option<&crate::store::Snapshot>, + ) { + } + + /// We call pre_apply_snapshot only when one of the observer returns true. + fn should_pre_apply_snapshot(&self) -> bool { + false + } } /// SplitChecker is invoked during a split check scan, and decides to use diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index b5a35461728..878c7c3b9f8 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -21,7 +21,7 @@ mod peer_storage; mod read_queue; mod region_snapshot; mod replication_mode; -mod snap; +pub mod snap; mod txn_ext; mod worker; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 129dac6dbb5..4a36f385648 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -827,6 +827,7 @@ where let task = RegionTask::Apply { region_id: self.get_region_id(), status, + peer_id: self.peer_id, }; // Don't schedule the snapshot to region worker. 
diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index d15e40e6f5e..f167a2c90bf 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -84,6 +84,7 @@ pub enum Task { Apply { region_id: u64, status: Arc, + peer_id: u64, }, /// Destroy data between [start_key, end_key). /// @@ -347,13 +348,9 @@ where .observe(start.saturating_elapsed_secs()); } - /// Applies snapshot data of the Region. - fn apply_snap(&mut self, region_id: u64, abort: Arc) -> Result<()> { - info!("begin apply snap data"; "region_id" => region_id); - fail_point!("region_apply_snap", |_| { Ok(()) }); - check_abort(&abort)?; + fn region_state(&self, region_id: u64) -> Result { let region_key = keys::region_state_key(region_id); - let mut region_state: RegionLocalState = + let region_state: RegionLocalState = match box_try!(self.engine.get_msg_cf(CF_RAFT, ®ion_key)) { Some(state) => state, None => { @@ -363,6 +360,31 @@ where )); } }; + Ok(region_state) + } + + fn apply_state(&self, region_id: u64) -> Result { + let state_key = keys::apply_state_key(region_id); + let apply_state: RaftApplyState = + match box_try!(self.engine.get_msg_cf(CF_RAFT, &state_key)) { + Some(state) => state, + None => { + return Err(box_err!( + "failed to get apply_state from {}", + log_wrappers::Value::key(&state_key) + )); + } + }; + Ok(apply_state) + } + + /// Applies snapshot data of the Region. + fn apply_snap(&mut self, region_id: u64, peer_id: u64, abort: Arc) -> Result<()> { + info!("begin apply snap data"; "region_id" => region_id, "peer_id" => peer_id); + fail_point!("region_apply_snap", |_| { Ok(()) }); + check_abort(&abort)?; + let region_key = keys::region_state_key(region_id); + let mut region_state = self.region_state(region_id)?; // clear up origin data. 
let region = region_state.get_region().clone(); @@ -382,17 +404,8 @@ where check_abort(&abort)?; fail_point!("apply_snap_cleanup_range"); - let state_key = keys::apply_state_key(region_id); - let apply_state: RaftApplyState = - match box_try!(self.engine.get_msg_cf(CF_RAFT, &state_key)) { - Some(state) => state, - None => { - return Err(box_err!( - "failed to get raftstate from {}", - log_wrappers::Value::key(&state_key) - )); - } - }; + let apply_state = self.apply_state(region_id)?; + let term = apply_state.get_truncated_state().get_term(); let idx = apply_state.get_truncated_state().get_index(); let snap_key = SnapKey::new(region_id, term, idx); @@ -408,12 +421,14 @@ where let timer = Instant::now(); let options = ApplyOptions { db: self.engine.clone(), - region, + region: region.clone(), abort: Arc::clone(&abort), write_batch_size: self.batch_size, coprocessor_host: self.coprocessor_host.clone(), }; s.apply(options)?; + self.coprocessor_host + .post_apply_snapshot(®ion, peer_id, &snap_key, Some(&s)); let mut wb = self.engine.write_batch(); region_state.set_state(PeerState::Normal); @@ -432,7 +447,7 @@ where /// Tries to apply the snapshot of the specified Region. It calls /// `apply_snap` to do the actual work. - fn handle_apply(&mut self, region_id: u64, status: Arc) { + fn handle_apply(&mut self, region_id: u64, peer_id: u64, status: Arc) { let _ = status.compare_exchange( JOB_STATUS_PENDING, JOB_STATUS_RUNNING, @@ -444,7 +459,7 @@ where // let timer = apply_histogram.start_coarse_timer(); let start = Instant::now(); - match self.apply_snap(region_id, Arc::clone(&status)) { + match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { Ok(()) => { status.swap(JOB_STATUS_FINISHED, Ordering::SeqCst); SNAP_COUNTER.apply.success.inc(); @@ -627,6 +642,46 @@ where Ok(()) } + + /// Calls observer `pre_apply_snapshot` for every task. + /// Multiple task can be `pre_apply_snapshot` at the same time. 
+ fn pre_apply_snapshot(&self, task: &Task) -> Result<()> { + let (region_id, abort, peer_id) = match task { + Task::Apply { + region_id, + status, + peer_id, + } => (region_id, status.clone(), peer_id), + _ => panic!("invalid apply snapshot task"), + }; + + let region_state = self.region_state(*region_id)?; + let apply_state = self.apply_state(*region_id)?; + + check_abort(&abort)?; + + let term = apply_state.get_truncated_state().get_term(); + let idx = apply_state.get_truncated_state().get_index(); + let snap_key = SnapKey::new(*region_id, term, idx); + let s = box_try!(self.mgr.get_snapshot_for_applying(&snap_key)); + if !s.exists() { + self.coprocessor_host.pre_apply_snapshot( + region_state.get_region(), + *peer_id, + &snap_key, + None, + ); + return Err(box_err!("missing snapshot file {}", s.path())); + } + check_abort(&abort)?; + self.coprocessor_host.pre_apply_snapshot( + region_state.get_region(), + *peer_id, + &snap_key, + Some(&s), + ); + Ok(()) + } } pub struct Runner @@ -692,8 +747,13 @@ where if self.ctx.ingest_maybe_stall() { break; } - if let Some(Task::Apply { region_id, status }) = self.pending_applies.pop_front() { - self.ctx.handle_apply(region_id, status); + if let Some(Task::Apply { + region_id, + status, + peer_id, + }) = self.pending_applies.pop_front() + { + self.ctx.handle_apply(region_id, peer_id, status); } } } @@ -763,6 +823,9 @@ where } task @ Task::Apply { .. } => { fail_point!("on_region_worker_apply", true, |_| {}); + if self.ctx.coprocessor_host.should_pre_apply_snapshot() { + let _ = self.ctx.pre_apply_snapshot(&task); + } // to makes sure applying snapshots in order. 
self.pending_applies.push_back(task); self.handle_pending_applies(); @@ -836,7 +899,10 @@ mod tests { use super::*; use crate::{ - coprocessor::CoprocessorHost, + coprocessor::{ + ApplySnapshotObserver, BoxApplySnapshotObserver, Coprocessor, CoprocessorHost, + ObserverContext, + }, store::{ peer_storage::JOB_STATUS_PENDING, snap::tests::get_test_db_for_regions, worker::RegionRunner, CasualMessage, SnapKey, SnapManager, @@ -989,6 +1055,10 @@ mod tests { .prefix("test_pending_applies") .tempdir() .unwrap(); + let obs = MockApplySnapshotObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(obs.clone())); let mut cf_opts = CfOptions::new(); cf_opts.set_level_zero_slowdown_writes_trigger(5); @@ -1043,7 +1113,7 @@ mod tests { 0, true, 2, - CoprocessorHost::::default(), + host, router, Option::>::None, ); @@ -1104,6 +1174,7 @@ mod tests { .schedule(Task::Apply { region_id: id, status, + peer_id: 1, }) .unwrap(); }; @@ -1170,6 +1241,12 @@ mod tests { ); wait_apply_finish(&[1]); + assert_eq!(obs.pre_apply_count.load(Ordering::SeqCst), 1); + assert_eq!(obs.post_apply_count.load(Ordering::SeqCst), 1); + assert_eq!( + obs.pre_apply_hash.load(Ordering::SeqCst), + obs.post_apply_hash.load(Ordering::SeqCst) + ); // the pending apply task should be finished and snapshots are ingested. 
// note that when ingest sst, it may flush memtable if overlap, @@ -1276,4 +1353,54 @@ mod tests { thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); assert!(!check_region_exist(6)); } + + #[derive(Clone, Default)] + struct MockApplySnapshotObserver { + pub pre_apply_count: Arc, + pub post_apply_count: Arc, + pub pre_apply_hash: Arc, + pub post_apply_hash: Arc, + } + + impl Coprocessor for MockApplySnapshotObserver {} + + impl ApplySnapshotObserver for MockApplySnapshotObserver { + fn pre_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + peer_id: u64, + key: &crate::store::SnapKey, + snapshot: Option<&crate::store::Snapshot>, + ) { + let code = snapshot.unwrap().total_size().unwrap() + + key.term + + key.region_id + + key.idx + + peer_id; + self.pre_apply_count.fetch_add(1, Ordering::SeqCst); + self.pre_apply_hash + .fetch_add(code as usize, Ordering::SeqCst); + } + + fn post_apply_snapshot( + &self, + _: &mut ObserverContext<'_>, + peer_id: u64, + key: &crate::store::SnapKey, + snapshot: Option<&crate::store::Snapshot>, + ) { + let code = snapshot.unwrap().total_size().unwrap() + + key.term + + key.region_id + + key.idx + + peer_id; + self.post_apply_count.fetch_add(1, Ordering::SeqCst); + self.post_apply_hash + .fetch_add(code as usize, Ordering::SeqCst); + } + + fn should_pre_apply_snapshot(&self) -> bool { + true + } + } } From 7cc8a39e316eae3e07276fc1f42b1a21fc8964de Mon Sep 17 00:00:00 2001 From: ystaticy Date: Wed, 24 Aug 2022 18:02:22 +0800 Subject: [PATCH 169/676] Divided Gc metrics in different labels by keymode (#12853) ref tikv/tikv#12852 Signed-off-by: ystaticy Co-authored-by: Ping Yu --- components/tikv_kv/src/metrics.rs | 8 + metrics/grafana/tikv_details.json | 48 +-- src/server/gc_worker/compaction_filter.rs | 92 +++-- src/server/gc_worker/gc_worker.rs | 241 +++++++----- src/server/gc_worker/mod.rs | 10 +- .../gc_worker/rawkv_compaction_filter.rs | 53 ++- src/server/metrics.rs | 2 +- src/storage/mvcc/metrics.rs | 6 +- 
src/storage/mvcc/txn.rs | 10 +- src/storage/txn/actions/gc.rs | 11 +- tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_gc_metrics.rs | 364 ++++++++++++++++++ 12 files changed, 667 insertions(+), 179 deletions(-) create mode 100644 tests/failpoints/cases/test_gc_metrics.rs diff --git a/components/tikv_kv/src/metrics.rs b/components/tikv_kv/src/metrics.rs index 3b63c4ab1a3..c57e4dcf496 100644 --- a/components/tikv_kv/src/metrics.rs +++ b/components/tikv_kv/src/metrics.rs @@ -3,6 +3,13 @@ use prometheus_static_metric::*; make_auto_flush_static_metric! { + pub label_enum GcKeyMode { + // The enum 'txn' contains both TiDB and TxnKV scenarios statistics, + // as they have the same storage format, and use the same GC procedures. + txn, + raw, + } + pub label_enum GcKeysCF { default, lock, @@ -25,6 +32,7 @@ make_auto_flush_static_metric! { } pub struct GcKeysCounterVec: LocalIntCounter { + "key_mode" => GcKeyMode, "cf" => GcKeysCF, "tag" => GcKeysDetail, } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index b47c226cb02..d871603c134 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -22194,10 +22194,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_storage_mvcc_gc_delete_versions_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_storage_mvcc_gc_delete_versions_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 2, - "legendFormat": "keys/s", + "legendFormat": "{{key_mode}}_keys/s", "refId": "E" } ], @@ -22555,70 +22555,70 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gc_compaction_filtered{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filtered{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by {key_mode}", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "filtered", + "legendFormat": "{{key_mode}}_filtered", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 }, { - "expr": "sum(rate(tikv_gc_compaction_filter_skip{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_skip{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "skipped", + "legendFormat": "{{key_mode}}_skipped", "refId": "B" }, { - "expr": "sum(rate(tikv_gc_compaction_mvcc_rollback{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_mvcc_rollback{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "mvcc-rollback/mvcc-lock", + "legendFormat": "{{key_mode}}_mvcc-rollback/mvcc-lock", "refId": "C" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_orphan_versions{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_orphan_versions{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "orphan-versions", + "legendFormat": "{{key_mode}}_orphan-versions", "refId": "D" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_perform{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_perform{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by 
(key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "performed-times", + "legendFormat": "{{key_mode}}_performed-times", "refId": "E" }, { - "expr": "sum(rate(tikv_gc_compaction_failure{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (type)", + "expr": "sum(rate(tikv_gc_compaction_failure{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode,type)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "failure-{{type}}", + "legendFormat": "{{key_mode}}_failure-{{type}}", "refId": "F" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_met{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_met{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "mvcc-deletion-met", + "legendFormat": "{{key_mode}}_mvcc-deletion-met", "refId": "G" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_handled{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_handled{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "mvcc-deletion-handled", + "legendFormat": "{{key_mode}}_mvcc-deletion-handled", "refId": "H" }, { - "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_wasted{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m]))", + "expr": "sum(rate(tikv_gc_compaction_filter_mvcc_deletion_wasted{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", 
"intervalFactor": 1, - "legendFormat": "mvcc-deletion-wasted", + "legendFormat": "{{key_mode}}_mvcc-deletion-wasted", "refId": "I" } ], @@ -22708,10 +22708,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"write\"}[1m])) by (tag)", + "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"write\"}[1m])) by (key_mode,tag)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{tag}}", + "legendFormat": "{{key_mode}}_{{tag}}", "refId": "A" } ], @@ -22801,10 +22801,10 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"default\"}[1m])) by (tag)", + "expr": "sum(rate(tikv_gcworker_gc_keys{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$instance\", cf=\"default\"}[1m])) by (key_mode,tag)", "format": "time_series", "intervalFactor": 1, - "legendFormat": "{{tag}}", + "legendFormat": "{{key_mode}}_{{tag}}", "refId": "A" } ], diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index ef190f4760e..23f007eb8be 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -34,7 +34,7 @@ use tikv_util::{ use txn_types::{Key, TimeStamp, WriteRef, WriteType}; use crate::{ - server::gc_worker::{GcConfig, GcTask, GcWorkerConfigManager}, + server::gc_worker::{GcConfig, GcTask, GcWorkerConfigManager, STAT_TXN_KEYMODE}, storage::mvcc::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}, }; @@ -69,62 +69,73 @@ lazy_static! { pub static ref GC_CONTEXT: Mutex> = Mutex::new(None); // Filtered keys in `WriteCompactionFilter::filter_v2`. 
- pub static ref GC_COMPACTION_FILTERED: IntCounter = register_int_counter!( + pub static ref GC_COMPACTION_FILTERED: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filtered", - "Filtered versions by compaction" + "Filtered versions by compaction", + &["key_mode"] ) .unwrap(); // A counter for errors met by `WriteCompactionFilter`. + //TODO: Add test case to check the correctness of GC_COMPACTION_FAILURE pub static ref GC_COMPACTION_FAILURE: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_failure", "Compaction filter meets failure", - &["type"] + &["key_mode", "type"] ) .unwrap(); // A counter for skip performing GC in compactions. - static ref GC_COMPACTION_FILTER_SKIP: IntCounter = register_int_counter!( + pub static ref GC_COMPACTION_FILTER_SKIP: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_skip", - "Skip to create compaction filter for GC because of table properties" + "Skip to create compaction filter for GC because of table properties", + &["key_mode"] ) .unwrap(); - static ref GC_COMPACTION_FILTER_PERFORM: IntCounter = register_int_counter!( + pub static ref GC_COMPACTION_FILTER_PERFORM: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_perform", - "perfrom GC in compaction filter" + "perfrom GC in compaction filter", + &["key_mode"] ) .unwrap(); // `WriteType::Rollback` and `WriteType::Lock` are handled in different ways. 
- pub static ref GC_COMPACTION_MVCC_ROLLBACK: IntCounter = register_int_counter!( + //TODO: Add test case to check the correctness of GC_COMPACTION_MVCC_ROLLBACK + pub static ref GC_COMPACTION_MVCC_ROLLBACK: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_mvcc_rollback", - "Compaction of mvcc rollbacks" + "Compaction of mvcc rollbacks", + &["key_mode"] ) .unwrap(); + //TODO: Add test case to check the correctness of GC_COMPACTION_FILTER_ORPHAN_VERSIONS pub static ref GC_COMPACTION_FILTER_ORPHAN_VERSIONS: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_orphan_versions", "Compaction filter orphan versions for default CF", - &["tag"] + &["key_mode", "tag"] ).unwrap(); /// Counter of mvcc deletions met in compaction filter. - pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_MET: IntCounter = register_int_counter!( + pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_MET: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_mvcc_deletion_met", - "MVCC deletion from compaction filter met" + "MVCC deletion from compaction filter met", + &["key_mode"] ).unwrap(); /// Counter of mvcc deletions handled in gc worker. - pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED: IntCounter = register_int_counter!( + pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_mvcc_deletion_handled", - "MVCC deletion from compaction filter handled" + "MVCC deletion from compaction filter handled", + &["key_mode"] ) .unwrap(); /// Mvcc deletions sent to gc worker can have already been cleared, in which case resources are /// wasted to seek them. 
- pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_WASTED: IntCounter = register_int_counter!( + //TODO: Add test case to check the correctness of GC_COMPACTION_FILTER_MVCC_DELETION_WASTED + pub static ref GC_COMPACTION_FILTER_MVCC_DELETION_WASTED: IntCounterVec = register_int_counter_vec!( "tikv_gc_compaction_filter_mvcc_deletion_wasted", - "MVCC deletion from compaction filter wasted" + "MVCC deletion from compaction filter wasted", + &["key_mode"] ).unwrap(); } @@ -236,11 +247,14 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { return std::ptr::null_mut(); } drop(gc_context_option); - - GC_COMPACTION_FILTER_PERFORM.inc(); + GC_COMPACTION_FILTER_PERFORM + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc(); if !check_need_gc(safe_point.into(), ratio_threshold, context) { debug!("skip gc in compaction filter because it's not necessary"); - GC_COMPACTION_FILTER_SKIP.inc(); + GC_COMPACTION_FILTER_SKIP + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc(); return std::ptr::null_mut(); } @@ -289,8 +303,8 @@ struct WriteCompactionFilter { total_filtered: usize, mvcc_rollback_and_locks: usize, orphan_versions: usize, - versions_hist: LocalHistogram, - filtered_hist: LocalHistogram, + versions_hist: LocalHistogramVec, + filtered_hist: LocalHistogramVec, #[cfg(any(test, feature = "failpoints"))] callbacks_on_drop: Vec>, @@ -351,10 +365,14 @@ impl WriteCompactionFilter { } match e { ScheduleError::Full(_) => { - GC_COMPACTION_FAILURE.with_label_values(&["full"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_TXN_KEYMODE, "full"]) + .inc(); } ScheduleError::Stopped(_) => { - GC_COMPACTION_FAILURE.with_label_values(&["stopped"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_TXN_KEYMODE, "stopped"]) + .inc(); } } } @@ -423,7 +441,9 @@ impl WriteCompactionFilter { self.remove_older = true; if self.is_bottommost_level { self.mvcc_deletion_overlaps = Some(0); - GC_COMPACTION_FILTER_MVCC_DELETION_MET.inc(); + 
GC_COMPACTION_FILTER_MVCC_DELETION_MET + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc(); } } } @@ -503,22 +523,30 @@ impl WriteCompactionFilter { fn switch_key_metrics(&mut self) { if self.versions != 0 { - self.versions_hist.observe(self.versions as f64); + self.versions_hist + .with_label_values(&[STAT_TXN_KEYMODE]) + .observe(self.versions as f64); self.total_versions += self.versions; self.versions = 0; } if self.filtered != 0 { - self.filtered_hist.observe(self.filtered as f64); + self.filtered_hist + .with_label_values(&[STAT_TXN_KEYMODE]) + .observe(self.filtered as f64); self.total_filtered += self.filtered; self.filtered = 0; } } fn flush_metrics(&self) { - GC_COMPACTION_FILTERED.inc_by(self.total_filtered as u64); - GC_COMPACTION_MVCC_ROLLBACK.inc_by(self.mvcc_rollback_and_locks as u64); + GC_COMPACTION_FILTERED + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc_by(self.total_filtered as u64); + GC_COMPACTION_MVCC_ROLLBACK + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc_by(self.mvcc_rollback_and_locks as u64); GC_COMPACTION_FILTER_ORPHAN_VERSIONS - .with_label_values(&["generated"]) + .with_label_values(&[STAT_TXN_KEYMODE, "generated"]) .inc_by(self.orphan_versions as u64); if let Some((versions, filtered)) = STATS.with(|stats| { stats.versions.update(|x| x + self.total_versions); @@ -609,7 +637,9 @@ impl CompactionFilter for WriteCompactionFilter { Ok(decision) => decision, Err(e) => { warn!("compaction filter meet error: {}", e); - GC_COMPACTION_FAILURE.with_label_values(&["filter"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_TXN_KEYMODE, "filter"]) + .inc(); self.encountered_errors = true; CompactionFilterDecision::Keep } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index eaa55c9c69c..eadd1d77fb2 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -13,6 +13,7 @@ use std::{ }; use api_version::{ApiV2, KvFormat}; +use collections::HashMap; use 
concurrency_manager::ConcurrencyManager; use engine_rocks::FlowInfo; use engine_traits::{ @@ -53,7 +54,7 @@ use super::{ use crate::{ server::metrics::*, storage::{ - kv::{Engine, ScanMode, Statistics}, + kv::{metrics::GcKeyMode, Engine, ScanMode, Statistics}, mvcc::{GcInfo, MvccReader, MvccTxn}, txn::{gc, Error as TxnError}, }, @@ -67,10 +68,12 @@ const GC_LOG_FOUND_VERSION_THRESHOLD: usize = 30; /// least this many versions are deleted. const GC_LOG_DELETED_VERSION_THRESHOLD: usize = 30; -pub const GC_MAX_EXECUTING_TASKS: usize = 10; const GC_TASK_SLOW_SECONDS: u64 = 30; const GC_MAX_PENDING_TASKS: usize = 4096; +pub const STAT_TXN_KEYMODE: &str = "txn"; +pub const STAT_RAW_KEYMODE: &str = "raw"; + /// Provides safe point. pub trait GcSafePointProvider: Send + 'static { fn get_safe_point(&self) -> Result; @@ -196,7 +199,7 @@ where } /// Used to perform GC operations on the engine. -struct GcRunner +pub struct GcRunner where E: Engine, RR: RaftStoreRouter, @@ -212,7 +215,7 @@ where cfg: GcConfig, cfg_tracker: Tracker, - stats: Statistics, + stats_map: HashMap, } pub const MAX_RAW_WRITE_SIZE: usize = 32 * 1024; @@ -304,7 +307,7 @@ where limiter, cfg, cfg_tracker, - stats: Statistics::default(), + stats_map: Default::default(), } } @@ -336,7 +339,7 @@ where gc_info.deleted_versions += next_gc_info.deleted_versions; gc_info.is_completed = next_gc_info.is_completed; let stats = mem::take(&mut reader.statistics); - self.stats.add(&stats); + self.mut_stats(GcKeyMode::txn).add(&stats); Ok(()) } @@ -383,7 +386,7 @@ where self.gc_keys(keys, safe_point, None)?; } - self.stats.add(&reader.statistics); + self.mut_stats(GcKeyMode::txn).add(&reader.statistics); debug!( "gc has finished"; "start_key" => log_wrappers::Value::key(start_key), @@ -393,7 +396,7 @@ where Ok(()) } - fn gc_keys( + pub fn gc_keys( &mut self, keys: Vec, safe_point: TimeStamp, @@ -527,7 +530,7 @@ where wasted_keys += 1; } - gc_info.report_metrics(); + gc_info.report_metrics(STAT_RAW_KEYMODE); next_gc_key = 
keys.next(); gc_info = GcInfo::default(); @@ -569,7 +572,8 @@ where } if raw_modifies.write_size >= MAX_RAW_WRITE_SIZE { - self.stats.data.add(&statistics); + let cf_stats = self.mut_stats(GcKeyMode::raw).mut_cf_statistics(CF_DEFAULT); + cf_stats.add(&statistics); return Ok(()); } @@ -589,7 +593,8 @@ where gc_info.is_completed = true; - self.stats.data.add(&statistics); + let cf_stats = self.mut_stats(GcKeyMode::raw).mut_cf_statistics(CF_DEFAULT); + cf_stats.add(&statistics); if let Some(to_del_key) = latest_version_key { self.delete_raws(to_del_key, raw_modifies, gc_info); @@ -598,6 +603,14 @@ where Ok(()) } + pub fn mut_stats(&mut self, key_mode: GcKeyMode) -> &mut Statistics { + let stats = self + .stats_map + .entry(key_mode) + .or_insert_with(Default::default); + stats + } + fn delete_raws(&mut self, key: Key, raw_modifies: &mut MvccRaw, gc_info: &mut GcInfo) { let write = Modify::Delete(CF_DEFAULT, key); raw_modifies.write_size += write.size(); @@ -732,15 +745,17 @@ where Ok(lock_infos) } - fn update_statistics_metrics(&mut self) { - let stats = mem::take(&mut self.stats); - - for (cf, details) in stats.details_enum().iter() { - for (tag, count) in details.iter() { - GC_KEYS_COUNTER_STATIC - .get(*cf) - .get(*tag) - .inc_by(*count as u64); + fn update_statistics_metrics(&mut self, key_mode: GcKeyMode) { + if let Some(mut_stats) = self.stats_map.get_mut(&key_mode) { + let stats = mem::take(mut_stats); + for (cf, cf_details) in stats.details_enum().iter() { + for (tag, count) in cf_details.iter() { + GC_KEYS_COUNTER_STATIC + .get(key_mode) + .get(*cf) + .get(*tag) + .inc_by(*count as u64); + } } } } @@ -797,7 +812,7 @@ where let res = self.gc(&start_key, &end_key, safe_point); update_metrics(res.is_err()); callback(res); - self.update_statistics_metrics(); + self.update_statistics_metrics(GcKeyMode::txn); slow_log!( T timer, "GC on range [{}, {}), safe_point {}", @@ -812,11 +827,15 @@ where store_id, region_info_provider, } => { - let old_seek_tombstone = 
self.stats.write.seek_tombstone; + let old_seek_tombstone = self.mut_stats(GcKeyMode::txn).write.seek_tombstone; match self.gc_keys(keys, safe_point, Some((store_id, region_info_provider))) { Ok((handled, wasted)) => { - GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.inc_by(handled as _); - GC_COMPACTION_FILTER_MVCC_DELETION_WASTED.inc_by(wasted as _); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc_by(handled as _); + GC_COMPACTION_FILTER_MVCC_DELETION_WASTED + .with_label_values(&[STAT_TXN_KEYMODE]) + .inc_by(wasted as _); update_metrics(false); } Err(e) => { @@ -824,10 +843,10 @@ where update_metrics(true); } } - let new_seek_tombstone = self.stats.write.seek_tombstone; + let new_seek_tombstone = self.mut_stats(GcKeyMode::txn).write.seek_tombstone; let seek_tombstone = new_seek_tombstone - old_seek_tombstone; slow_log!(T timer, "GC keys, seek_tombstone {}", seek_tombstone); - self.update_statistics_metrics(); + self.update_statistics_metrics(GcKeyMode::txn); } GcTask::RawGcKeys { keys, @@ -837,8 +856,12 @@ where } => { match self.raw_gc_keys(keys, safe_point, Some((store_id, region_info_provider))) { Ok((handled, wasted)) => { - GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.inc_by(handled as _); - GC_COMPACTION_FILTER_MVCC_DELETION_WASTED.inc_by(wasted as _); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED + .with_label_values(&[STAT_RAW_KEYMODE]) + .inc_by(handled as _); + GC_COMPACTION_FILTER_MVCC_DELETION_WASTED + .with_label_values(&[STAT_RAW_KEYMODE]) + .inc_by(wasted as _); update_metrics(false); } Err(e) => { @@ -846,7 +869,7 @@ where update_metrics(true); } } - self.update_statistics_metrics(); + self.update_statistics_metrics(GcKeyMode::raw); } GcTask::UnsafeDestroyRange { ctx, @@ -893,7 +916,7 @@ where } info!("write GcTask::OrphanVersions success"; "id" => id); GC_COMPACTION_FILTER_ORPHAN_VERSIONS - .with_label_values(&["cleaned"]) + .with_label_values(&[STAT_TXN_KEYMODE, "cleaned"]) .inc_by(wb.count() as u64); 
update_metrics(false); } @@ -1243,51 +1266,25 @@ where } } -#[cfg(test)] -mod tests { - - use std::{ - collections::BTreeMap, - sync::mpsc::{self, channel}, - thread, - time::Duration, - }; +#[cfg(any(test, feature = "testexport"))] +pub mod test_gc_worker { + use std::sync::Arc; - use api_version::{ApiV2, KvFormat, RawValue}; - use engine_rocks::{util::get_cf_handle, RocksEngine, RocksSnapshot}; + use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_traits::KvEngine; - use futures::executor::block_on; use kvproto::{ - kvrpcpb::{ApiVersion, Op}, - metapb::Peer, + kvrpcpb::Context, + metapb::{Peer, Region}, }; - use raft::StateRole; - use raftstore::{ - coprocessor::{region_info_accessor::RegionInfoAccessor, RegionChangeEvent}, - router::RaftStoreBlackHole, - store::RegionSnapshot, - }; - use tikv_kv::Snapshot; - use tikv_util::{codec::number::NumberEncoder, future::paired_future_callback}; - use txn_types::Mutation; + use raftstore::store::RegionSnapshot; + use tikv_kv::{write_modifies, Engine, Modify, SnapContext, WriteData}; + use txn_types::{Key, TimeStamp}; - use super::*; use crate::{ - config::DbConfig, + server::gc_worker::{GcSafePointProvider, Result as GcWorkerResult}, storage::{ - kv::{ - self, write_modifies, Callback as EngineCallback, Modify, Result as EngineResult, - SnapContext, TestEngineBuilder, WriteData, - }, - lock_manager::DummyLockManager, - mvcc::{tests::must_get_none, MAX_TXN_WRITE_SIZE}, - txn::{ - commands, - tests::{ - must_commit, must_gc, must_prewrite_delete, must_prewrite_put, must_rollback, - }, - }, - Engine, Storage, TestStorageBuilderApiV1, + kv, + kv::{Callback as EngineCallback, Result as EngineResult}, }, }; @@ -1298,7 +1295,7 @@ mod tests { /// they needs to know how data is actually represented in db. This /// wrapper allows test engines write 'z'-prefixed keys to db. 
#[derive(Clone)] - struct PrefixedEngine(kv::RocksEngine); + pub struct PrefixedEngine(pub kv::RocksEngine); impl Engine for PrefixedEngine { // Use RegionSnapshot which can remove the z prefix internally. @@ -1394,6 +1391,59 @@ mod tests { } } + pub struct MockSafePointProvider(pub u64); + + impl GcSafePointProvider for MockSafePointProvider { + fn get_safe_point(&self) -> GcWorkerResult { + Ok(self.0.into()) + } + } +} + +#[cfg(test)] +mod tests { + + use std::{ + collections::BTreeMap, + sync::mpsc::{self, channel}, + thread, + time::Duration, + }; + + use api_version::{ApiV2, KvFormat, RawValue}; + use engine_rocks::{util::get_cf_handle, RocksEngine}; + use futures::executor::block_on; + use kvproto::{ + kvrpcpb::{ApiVersion, Op}, + metapb::Peer, + }; + use raft::StateRole; + use raftstore::{ + coprocessor::{region_info_accessor::RegionInfoAccessor, RegionChangeEvent}, + router::RaftStoreBlackHole, + }; + use tikv_kv::Snapshot; + use tikv_util::{codec::number::NumberEncoder, future::paired_future_callback}; + use txn_types::Mutation; + + use super::*; + use crate::{ + config::DbConfig, + server::gc_worker::{MockSafePointProvider, PrefixedEngine}, + storage::{ + kv::{metrics::GcKeyMode, Modify, TestEngineBuilder, WriteData}, + lock_manager::DummyLockManager, + mvcc::{tests::must_get_none, MAX_TXN_WRITE_SIZE}, + txn::{ + commands, + tests::{ + must_commit, must_gc, must_prewrite_delete, must_prewrite_put, must_rollback, + }, + }, + Engine, Storage, TestStorageBuilderApiV1, + }, + }; + /// Assert the data in `storage` is the same as `expected_data`. Keys in /// `expected_data` should be encoded form without ts. 
fn check_data( @@ -1666,13 +1716,6 @@ mod tests { assert_eq!(res[..], expected_lock_info[3..9]); } - struct MockSafePointProvider(u64); - impl GcSafePointProvider for MockSafePointProvider { - fn get_safe_point(&self) -> Result { - Ok(self.0.into()) - } - } - #[test] fn test_gc_keys_with_region_info_provider() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -1808,13 +1851,13 @@ mod tests { } db.flush_cf(cf, true).unwrap(); - assert_eq!(runner.stats.write.seek, 0); - assert_eq!(runner.stats.write.next, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.next, 0); runner .gc_keys(keys, TimeStamp::new(200), Some((1, Arc::new(ri_provider)))) .unwrap(); - assert_eq!(runner.stats.write.seek, 1); - assert_eq!(runner.stats.write.next, 100 * 2); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek, 1); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.next, 100 * 2); } #[test] @@ -1906,8 +1949,8 @@ mod tests { .raw_gc_keys(to_gc_keys, TimeStamp::new(120), Some((1, ri_provider))) .unwrap(); - assert_eq!(7, runner.stats.data.next); - assert_eq!(2, runner.stats.data.seek); + assert_eq!(7, runner.mut_stats(GcKeyMode::raw).data.next); + assert_eq!(2, runner.mut_stats(GcKeyMode::raw).data.seek); let snapshot = prefixed_engine.snapshot_on_kv_engine(&[], &[]).unwrap(); @@ -1960,7 +2003,7 @@ mod tests { must_gc(&prefixed_engine, b"k2\x00", 30); // Test tombstone counter works - assert_eq!(runner.stats.write.seek_tombstone, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); runner .gc_keys( vec![Key::from_raw(b"k2\x00")], @@ -1968,11 +2011,14 @@ mod tests { Some((1, ri_provider.clone())), ) .unwrap(); - assert_eq!(runner.stats.write.seek_tombstone, 20); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 20); // gc_keys with single key - runner.stats.write.seek_tombstone = 0; - assert_eq!(runner.stats.write.seek_tombstone, 0); + runner + 
.mut_stats(GcKeyMode::txn) + .mut_cf_statistics(CF_WRITE) + .seek_tombstone = 0; + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); runner .gc_keys( vec![Key::from_raw(b"k2")], @@ -1980,11 +2026,14 @@ mod tests { Some((1, ri_provider.clone())), ) .unwrap(); - assert_eq!(runner.stats.write.seek_tombstone, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); // gc_keys with multiple key - runner.stats.write.seek_tombstone = 0; - assert_eq!(runner.stats.write.seek_tombstone, 0); + runner + .mut_stats(GcKeyMode::txn) + .mut_cf_statistics(CF_WRITE) + .seek_tombstone = 0; + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); runner .gc_keys( vec![Key::from_raw(b"k1"), Key::from_raw(b"k2")], @@ -1992,7 +2041,7 @@ mod tests { Some((1, ri_provider.clone())), ) .unwrap(); - assert_eq!(runner.stats.write.seek_tombstone, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); // Test rebuilding snapshot when GC write batch limit reached // (gc_info.is_completed == false). Build a key with versions that will @@ -2012,7 +2061,10 @@ mod tests { db.flush_cf(cf, true).unwrap(); let safepoint = versions as u64 * 2; - runner.stats.write.seek_tombstone = 0; + runner + .mut_stats(GcKeyMode::txn) + .mut_cf_statistics(CF_DEFAULT) + .seek_tombstone = 0; runner .gc_keys( vec![Key::from_raw(b"k2")], @@ -2023,9 +2075,12 @@ mod tests { // The first batch will leave tombstones that will be seen while processing the // second batch, but it will be seen in `next` after seeking the latest // unexpired version, therefore `seek_tombstone` is not affected. - assert_eq!(runner.stats.write.seek_tombstone, 0); + assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); // ... and next_tombstone indicates there's indeed more than one batches. 
- assert_eq!(runner.stats.write.next_tombstone, versions - 3); + assert_eq!( + runner.mut_stats(GcKeyMode::txn).write.next_tombstone, + versions - 3 + ); } #[test] diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index d6114a5875c..5b43b9b4be3 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -1,11 +1,11 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. mod applied_lock_collector; -mod compaction_filter; +pub mod compaction_filter; mod config; mod gc_manager; mod gc_worker; -mod rawkv_compaction_filter; +pub mod rawkv_compaction_filter; // TODO: Use separated error type for GcWorker instead. #[cfg(any(test, feature = "failpoints"))] @@ -14,7 +14,11 @@ pub use compaction_filter::WriteCompactionFilterFactory; pub use config::{GcConfig, GcWorkerConfigManager, DEFAULT_GC_BATCH_KEYS}; use engine_traits::MvccProperties; pub use gc_manager::AutoGcConfig; -pub use gc_worker::{sync_gc, GcSafePointProvider, GcTask, GcWorker, GC_MAX_EXECUTING_TASKS}; +#[cfg(any(test, feature = "testexport"))] +pub use gc_worker::test_gc_worker::{MockSafePointProvider, PrefixedEngine}; +pub use gc_worker::{ + sync_gc, GcSafePointProvider, GcTask, GcWorker, STAT_RAW_KEYMODE, STAT_TXN_KEYMODE, +}; pub use rawkv_compaction_filter::RawCompactionFilterFactory; use txn_types::TimeStamp; diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index 49758f5793b..e50e33c1b38 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -16,7 +16,7 @@ use engine_rocks::{ RocksEngine, }; use engine_traits::{raw_ttl::ttl_current_ts, MiscExt}; -use prometheus::local::LocalHistogram; +use prometheus::local::LocalHistogramVec; use raftstore::coprocessor::RegionInfoProvider; use tikv_util::worker::{ScheduleError, Scheduler}; use txn_types::Key; @@ -25,9 +25,10 @@ use crate::{ server::gc_worker::{ compaction_filter::{ 
CompactionFilterStats, DEFAULT_DELETE_BATCH_COUNT, GC_COMPACTION_FAILURE, - GC_COMPACTION_FILTERED, GC_COMPACTION_FILTER_ORPHAN_VERSIONS, GC_CONTEXT, + GC_COMPACTION_FILTERED, GC_COMPACTION_FILTER_MVCC_DELETION_MET, + GC_COMPACTION_FILTER_ORPHAN_VERSIONS, GC_CONTEXT, }, - GcTask, + GcTask, STAT_RAW_KEYMODE, }, storage::mvcc::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}, }; @@ -87,8 +88,8 @@ struct RawCompactionFilter { total_versions: usize, total_filtered: usize, orphan_versions: usize, - versions_hist: LocalHistogram, - filtered_hist: LocalHistogram, + versions_hist: LocalHistogramVec, + filtered_hist: LocalHistogramVec, encountered_errors: bool, } @@ -128,7 +129,9 @@ impl CompactionFilter for RawCompactionFilter { Ok(decision) => decision, Err(e) => { warn!("compaction filter meet error: {}", e); - GC_COMPACTION_FAILURE.with_label_values(&["filter"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_RAW_KEYMODE, "filter"]) + .inc(); self.encountered_errors = true; CompactionFilterDecision::Keep } @@ -203,6 +206,9 @@ impl RawCompactionFilter { // If it's the latest version, and it's deleted or expired, it needs to be sent // to GcWorker to be processed asynchronously. if !raw_value.is_valid(self.current_ts) { + GC_COMPACTION_FILTER_MVCC_DELETION_MET + .with_label_values(&[STAT_RAW_KEYMODE]) + .inc(); self.raw_handle_delete(); if self.mvcc_deletions.len() >= DEFAULT_DELETE_BATCH_COUNT { self.raw_gc_mvcc_deletions(); @@ -251,10 +257,14 @@ impl RawCompactionFilter { } match e { ScheduleError::Full(_) => { - GC_COMPACTION_FAILURE.with_label_values(&["full"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_RAW_KEYMODE, "full"]) + .inc(); } ScheduleError::Stopped(_) => { - GC_COMPACTION_FAILURE.with_label_values(&["stopped"]).inc(); + GC_COMPACTION_FAILURE + .with_label_values(&[STAT_RAW_KEYMODE, "stopped"]) + .inc(); } } } @@ -270,21 +280,27 @@ impl RawCompactionFilter { // TODO some refactor to avoid duplicated codes. 
fn switch_key_metrics(&mut self) { if self.versions != 0 { - self.versions_hist.observe(self.versions as f64); + self.versions_hist + .with_label_values(&[STAT_RAW_KEYMODE]) + .observe(self.versions as f64); self.total_versions += self.versions; self.versions = 0; } if self.filtered != 0 { - self.filtered_hist.observe(self.filtered as f64); + self.filtered_hist + .with_label_values(&[STAT_RAW_KEYMODE]) + .observe(self.filtered as f64); self.total_filtered += self.filtered; self.filtered = 0; } } fn flush_metrics(&self) { - GC_COMPACTION_FILTERED.inc_by(self.total_filtered as u64); + GC_COMPACTION_FILTERED + .with_label_values(&[STAT_RAW_KEYMODE]) + .inc_by(self.total_filtered as u64); GC_COMPACTION_FILTER_ORPHAN_VERSIONS - .with_label_values(&["generated"]) + .with_label_values(&[STAT_RAW_KEYMODE, "generated"]) .inc_by(self.orphan_versions as u64); if let Some((versions, filtered)) = STATS.with(|stats| { stats.versions.update(|x| x + self.total_versions); @@ -301,6 +317,13 @@ impl RawCompactionFilter { } } +#[cfg(any(test, feature = "testexport"))] +pub fn make_key(key: &[u8], ts: u64) -> Vec { + let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); + let res = keys::data_key(encode_key.as_encoded()); + res +} + #[cfg(test)] pub mod tests { @@ -317,12 +340,6 @@ pub mod tests { config::DbConfig, server::gc_worker::TestGcRunner, storage::kv::TestEngineBuilder, }; - pub fn make_key(key: &[u8], ts: u64) -> Vec { - let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); - let res = keys::data_key(encode_key.as_encoded()); - res - } - #[test] fn test_raw_compaction_filter() { let mut cfg = DbConfig::default(); diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 0d24c9f798b..86ca07f38b4 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -212,7 +212,7 @@ lazy_static! 
{ pub static ref GC_KEYS_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_gcworker_gc_keys", "Counter of keys affected during gc", - &["cf", "tag"] + &["key_mode", "cf", "tag"] ) .unwrap(); pub static ref GC_KEY_FAILURES: IntCounter = register_int_counter!( diff --git a/src/storage/mvcc/metrics.rs b/src/storage/mvcc/metrics.rs index 3fa98e8979a..ddfdc14f5ef 100644 --- a/src/storage/mvcc/metrics.rs +++ b/src/storage/mvcc/metrics.rs @@ -54,15 +54,17 @@ make_static_metric! { } lazy_static! { - pub static ref MVCC_VERSIONS_HISTOGRAM: Histogram = register_histogram!( + pub static ref MVCC_VERSIONS_HISTOGRAM: HistogramVec = register_histogram_vec!( "tikv_storage_mvcc_versions", "Histogram of versions for each key", + &["key_mode"], exponential_buckets(1.0, 2.0, 30).unwrap() ) .unwrap(); - pub static ref GC_DELETE_VERSIONS_HISTOGRAM: Histogram = register_histogram!( + pub static ref GC_DELETE_VERSIONS_HISTOGRAM: HistogramVec = register_histogram_vec!( "tikv_storage_mvcc_gc_delete_versions", "Histogram of versions deleted by gc for each key", + &["key_mode"], exponential_buckets(1.0, 2.0, 30).unwrap() ) .unwrap(); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 1517ad67c78..b0a64d83f22 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -20,10 +20,14 @@ pub struct GcInfo { } impl GcInfo { - pub fn report_metrics(&self) { - MVCC_VERSIONS_HISTOGRAM.observe(self.found_versions as f64); + pub fn report_metrics(&self, key_mode: &str) { + MVCC_VERSIONS_HISTOGRAM + .with_label_values(&[key_mode]) + .observe(self.found_versions as f64); if self.deleted_versions > 0 { - GC_DELETE_VERSIONS_HISTOGRAM.observe(self.deleted_versions as f64); + GC_DELETE_VERSIONS_HISTOGRAM + .with_label_values(&[key_mode]) + .observe(self.deleted_versions as f64); } } } diff --git a/src/storage/txn/actions/gc.rs b/src/storage/txn/actions/gc.rs index 07a95f4b06b..29264c7df90 100644 --- a/src/storage/txn/actions/gc.rs +++ b/src/storage/txn/actions/gc.rs 
@@ -2,9 +2,12 @@ use txn_types::{Key, TimeStamp, Write, WriteType}; -use crate::storage::{ - mvcc::{GcInfo, MvccReader, MvccTxn, Result as MvccResult, MAX_TXN_WRITE_SIZE}, - Snapshot, +use crate::{ + server::gc_worker::STAT_TXN_KEYMODE, + storage::{ + mvcc::{GcInfo, MvccReader, MvccTxn, Result as MvccResult, MAX_TXN_WRITE_SIZE}, + Snapshot, + }, }; pub fn gc<'a, S: Snapshot>( @@ -15,7 +18,7 @@ pub fn gc<'a, S: Snapshot>( ) -> MvccResult { let gc = Gc::new(txn, reader, key); let info = gc.run(safe_point)?; - info.report_metrics(); + info.report_metrics(STAT_TXN_KEYMODE); Ok(info) } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 33063777e01..1c38571e280 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -10,6 +10,7 @@ mod test_coprocessor; mod test_disk_full; mod test_early_apply; mod test_encryption; +mod test_gc_metrics; mod test_gc_worker; mod test_hibernate; mod test_import_service; diff --git a/tests/failpoints/cases/test_gc_metrics.rs b/tests/failpoints/cases/test_gc_metrics.rs new file mode 100644 index 00000000000..ede14988744 --- /dev/null +++ b/tests/failpoints/cases/test_gc_metrics.rs @@ -0,0 +1,364 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{atomic::AtomicU64, mpsc, Arc}, + thread, + time::Duration, +}; + +use api_version::{ApiV2, KvFormat, RawValue}; +use engine_rocks::{util::get_cf_handle, RocksEngine}; +use engine_traits::{CF_DEFAULT, CF_WRITE}; +use kvproto::{ + kvrpcpb::*, + metapb::{Peer, Region}, +}; +use pd_client::FeatureGate; +use raft::StateRole; +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent}, + router::RaftStoreBlackHole, + RegionInfoAccessor, +}; +use tikv::{ + config::DbConfig, + server::gc_worker::{ + compaction_filter::{ + GC_COMPACTION_FILTERED, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, + GC_COMPACTION_FILTER_MVCC_DELETION_MET, GC_COMPACTION_FILTER_PERFORM, + GC_COMPACTION_FILTER_SKIP, + }, + rawkv_compaction_filter::make_key, + AutoGcConfig, GcConfig, GcWorker, MockSafePointProvider, PrefixedEngine, TestGcRunner, + STAT_RAW_KEYMODE, STAT_TXN_KEYMODE, + }, + storage::{ + kv::{Modify, TestEngineBuilder, WriteData}, + mvcc::{tests::must_get, MVCC_VERSIONS_HISTOGRAM}, + txn::tests::{must_commit, must_prewrite_delete, must_prewrite_put}, + Engine, + }, +}; +use txn_types::{Key, TimeStamp}; + +#[test] +fn test_txn_create_compaction_filter() { + GC_COMPACTION_FILTER_PERFORM.reset(); + GC_COMPACTION_FILTER_SKIP.reset(); + + let mut cfg = DbConfig::default(); + cfg.writecf.disable_auto_compactions = true; + cfg.writecf.dynamic_level_bytes = false; + let dir = tempfile::TempDir::new().unwrap(); + let builder = TestEngineBuilder::new().path(dir.path()); + let engine = builder.build_with_cfg(&cfg).unwrap(); + let raw_engine = engine.get_rocksdb(); + + let mut gc_runner = TestGcRunner::new(0); + let value = vec![b'v'; 512]; + + must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); + must_commit(&engine, b"zkey", 100, 110); + + gc_runner + .safe_point(TimeStamp::new(1).into_inner()) + .gc(&raw_engine); + assert_eq!( + GC_COMPACTION_FILTER_PERFORM + .with_label_values(&[STAT_TXN_KEYMODE]) + .get(), + 1 + ); + assert_eq!( + 
GC_COMPACTION_FILTER_SKIP + .with_label_values(&[STAT_TXN_KEYMODE]) + .get(), + 1 + ); + + GC_COMPACTION_FILTER_PERFORM.reset(); + GC_COMPACTION_FILTER_SKIP.reset(); +} + +#[test] +fn test_txn_mvcc_filtered() { + MVCC_VERSIONS_HISTOGRAM.reset(); + GC_COMPACTION_FILTERED.reset(); + + let engine = TestEngineBuilder::new().build().unwrap(); + let raw_engine = engine.get_rocksdb(); + let value = vec![b'v'; 512]; + let mut gc_runner = TestGcRunner::new(0); + + // GC can't delete keys after the given safe point. + must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); + must_commit(&engine, b"zkey", 100, 110); + gc_runner.safe_point(50).gc(&raw_engine); + must_get(&engine, b"zkey", 110, &value); + + // GC can't delete keys before the safe ponit if they are latest versions. + gc_runner.safe_point(200).gc(&raw_engine); + must_get(&engine, b"zkey", 110, &value); + + must_prewrite_put(&engine, b"zkey", &value, b"zkey", 120); + must_commit(&engine, b"zkey", 120, 130); + + // GC can't delete the latest version before the safe ponit. + gc_runner.safe_point(115).gc(&raw_engine); + must_get(&engine, b"zkey", 110, &value); + + // GC a version will also delete the key on default CF. 
+ gc_runner.safe_point(200).gc(&raw_engine); + assert_eq!( + MVCC_VERSIONS_HISTOGRAM + .with_label_values(&[STAT_TXN_KEYMODE]) + .get_sample_sum(), + 4_f64 + ); + assert_eq!( + GC_COMPACTION_FILTERED + .with_label_values(&[STAT_TXN_KEYMODE]) + .get(), + 1 + ); + + MVCC_VERSIONS_HISTOGRAM.reset(); + GC_COMPACTION_FILTERED.reset(); +} + +#[test] +fn test_txn_gc_keys_handled() { + GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); + + let engine = TestEngineBuilder::new().build().unwrap(); + let prefixed_engine = PrefixedEngine(engine.clone()); + + let (tx, _rx) = mpsc::channel(); + let feature_gate = FeatureGate::default(); + feature_gate.set_version("5.0.0").unwrap(); + let mut gc_worker = GcWorker::new( + prefixed_engine.clone(), + RaftStoreBlackHole, + tx, + GcConfig::default(), + feature_gate, + ); + gc_worker.start().unwrap(); + + let mut r1 = Region::default(); + r1.set_id(1); + r1.mut_region_epoch().set_version(1); + r1.set_start_key(b"".to_vec()); + r1.set_end_key(b"".to_vec()); + r1.mut_peers().push(Peer::default()); + r1.mut_peers()[0].set_store_id(1); + + let sp_provider = MockSafePointProvider(200); + let mut host = CoprocessorHost::::default(); + let ri_provider = RegionInfoAccessor::new(&mut host); + let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, 1); + let safe_point = Arc::new(AtomicU64::new(500)); + gc_worker.start_auto_gc(auto_gc_cfg, safe_point).unwrap(); + host.on_region_changed(&r1, RegionChangeEvent::Create, StateRole::Leader); + + let db = engine.kv_engine().as_inner().clone(); + let cf = get_cf_handle(&db, CF_WRITE).unwrap(); + + for i in 0..3 { + let k = format!("k{:02}", i).into_bytes(); + must_prewrite_put(&prefixed_engine, &k, b"value", &k, 101); + must_commit(&prefixed_engine, &k, 101, 102); + must_prewrite_delete(&prefixed_engine, &k, &k, 151); + must_commit(&prefixed_engine, &k, 151, 152); + } + + db.flush_cf(cf, true).unwrap(); + + db.compact_range_cf(cf, None, None); + + 
// This compaction can schedule gc task + db.compact_range_cf(cf, None, None); + thread::sleep(Duration::from_millis(100)); + + assert_eq!( + GC_COMPACTION_FILTER_MVCC_DELETION_MET + .with_label_values(&[STAT_TXN_KEYMODE]) + .get(), + 6 + ); + + assert_eq!( + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED + .with_label_values(&[STAT_TXN_KEYMODE]) + .get(), + 3 + ); + + GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); +} + +#[test] +fn test_raw_mvcc_filtered() { + MVCC_VERSIONS_HISTOGRAM.reset(); + GC_COMPACTION_FILTERED.reset(); + + let mut cfg = DbConfig::default(); + cfg.defaultcf.disable_auto_compactions = true; + cfg.defaultcf.dynamic_level_bytes = false; + + let engine = TestEngineBuilder::new() + .api_version(ApiVersion::V2) + .build_with_cfg(&cfg) + .unwrap(); + let raw_engine = engine.get_rocksdb(); + let mut gc_runner = TestGcRunner::new(0); + + let user_key = b"r\0aaaaaaaaaaa"; + + let test_raws = vec![ + (user_key, 100, false), + (user_key, 90, false), + (user_key, 70, false), + ]; + + let modifies = test_raws + .into_iter() + .map(|(key, ts, is_delete)| { + ( + make_key(key, ts), + ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(TimeStamp::max().into_inner()), + is_delete, + }), + ) + }) + .map(|(k, v)| Modify::Put(CF_DEFAULT, Key::from_encoded_slice(k.as_slice()), v)) + .collect(); + + let ctx = Context { + api_version: ApiVersion::V2, + ..Default::default() + }; + let batch = WriteData::from_modifies(modifies); + + engine.write(&ctx, batch).unwrap(); + + gc_runner.safe_point(80).gc_raw(&raw_engine); + + assert_eq!( + MVCC_VERSIONS_HISTOGRAM + .with_label_values(&[STAT_RAW_KEYMODE]) + .get_sample_sum(), + 1_f64 + ); + assert_eq!( + GC_COMPACTION_FILTERED + .with_label_values(&[STAT_RAW_KEYMODE]) + .get(), + 1 + ); + + MVCC_VERSIONS_HISTOGRAM.reset(); + GC_COMPACTION_FILTERED.reset(); +} + +#[test] +fn test_raw_gc_keys_handled() { + 
GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); + + let engine = TestEngineBuilder::new() + .api_version(ApiVersion::V2) + .build() + .unwrap(); + let prefixed_engine = PrefixedEngine(engine.clone()); + + let (tx, _rx) = mpsc::channel(); + let feature_gate = FeatureGate::default(); + feature_gate.set_version("5.0.0").unwrap(); + let mut gc_worker = GcWorker::new( + prefixed_engine, + RaftStoreBlackHole, + tx, + GcConfig::default(), + feature_gate, + ); + gc_worker.start().unwrap(); + + let mut r1 = Region::default(); + r1.set_id(1); + r1.mut_region_epoch().set_version(1); + r1.set_start_key(b"".to_vec()); + r1.set_end_key(b"".to_vec()); + r1.mut_peers().push(Peer::default()); + r1.mut_peers()[0].set_store_id(1); + + let sp_provider = MockSafePointProvider(200); + let mut host = CoprocessorHost::::default(); + let ri_provider = RegionInfoAccessor::new(&mut host); + let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, 1); + let safe_point = Arc::new(AtomicU64::new(500)); + gc_worker.start_auto_gc(auto_gc_cfg, safe_point).unwrap(); + host.on_region_changed(&r1, RegionChangeEvent::Create, StateRole::Leader); + + let db = engine.kv_engine().as_inner().clone(); + + let user_key_del = b"r\0aaaaaaaaaaa"; + + // If it's deleted, it will call async scheduler GcTask. 
+ let test_raws = vec![ + (user_key_del, 9, true), + (user_key_del, 5, false), + (user_key_del, 1, false), + ]; + + let modifies = test_raws + .into_iter() + .map(|(key, ts, is_delete)| { + ( + make_key(key, ts), + ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(TimeStamp::max().into_inner()), + is_delete, + }), + ) + }) + .map(|(k, v)| Modify::Put(CF_DEFAULT, Key::from_encoded_slice(k.as_slice()), v)) + .collect(); + + let ctx = Context { + api_version: ApiVersion::V2, + ..Default::default() + }; + + let batch = WriteData::from_modifies(modifies); + + engine.write(&ctx, batch).unwrap(); + + let cf = get_cf_handle(&db, CF_DEFAULT).unwrap(); + db.flush_cf(cf, true).unwrap(); + + db.compact_range_cf(cf, None, None); + + thread::sleep(Duration::from_millis(100)); + + assert_eq!( + GC_COMPACTION_FILTER_MVCC_DELETION_MET + .with_label_values(&[STAT_RAW_KEYMODE]) + .get(), + 1 + ); + assert_eq!( + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED + .with_label_values(&[STAT_RAW_KEYMODE]) + .get(), + 1 + ); + + GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); + GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); +} From b5bc5078a1fd767390c8d696cf0d965623b962f7 Mon Sep 17 00:00:00 2001 From: hehechen Date: Wed, 24 Aug 2022 19:50:21 +0800 Subject: [PATCH 170/676] resolved_ts: fix check_leader to tiflash proxy (#13312) ref tikv/tikv#12092, close tikv/tikv#13310 For TiFlash proxy, should use store.peer_address instead of store.address. For TiKV, always use the same peer_address and address, so the change won't affect. 
Signed-off-by: hehechen Co-authored-by: Ti Chi Robot --- components/resolved_ts/src/advance.rs | 2 +- src/server/node.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 57bf20e7d0b..190c4474711 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -397,7 +397,7 @@ async fn get_tikv_client( CString::new("random id").unwrap(), CONN_ID.fetch_add(1, Ordering::SeqCst), ); - let channel = security_mgr.connect(cb, &store.address); + let channel = security_mgr.connect(cb, &store.peer_address); let cli = TikvClient::new(channel); clients.insert(store_id, cli.clone()); RTS_TIKV_CLIENT_INIT_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); diff --git a/src/server/node.rs b/src/server/node.rs index a282bcded37..d8bee9abfd7 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -121,8 +121,10 @@ where store.set_id(INVALID_ID); if cfg.advertise_addr.is_empty() { store.set_address(cfg.addr.clone()); + store.set_peer_address(cfg.addr.clone()); } else { - store.set_address(cfg.advertise_addr.clone()) + store.set_address(cfg.advertise_addr.clone()); + store.set_peer_address(cfg.advertise_addr.clone()); } if cfg.advertise_status_addr.is_empty() { store.set_status_address(cfg.status_addr.clone()); From afbacfc4a864080f958497ce9a387df854f62f0f Mon Sep 17 00:00:00 2001 From: ekexium Date: Thu, 25 Aug 2022 10:08:21 +0800 Subject: [PATCH 171/676] txn: deferred constraint check (#13121) close tikv/tikv#13128, ref pingcap/tidb#36579 Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/cdc/src/old_value.rs | 3 +- components/cdc/tests/mod.rs | 6 +- components/resolved_ts/src/cmd.rs | 4 +- components/resolved_ts/tests/mod.rs | 9 +- components/test_raftstore/src/util.rs | 6 +- src/storage/mod.rs | 51 +++- src/storage/mvcc/reader/point_getter.rs | 6 +- src/storage/mvcc/reader/reader.rs | 6 +- 
src/storage/mvcc/reader/scanner/forward.rs | 36 ++- src/storage/mvcc/txn.rs | 58 ++-- .../txn/actions/acquire_pessimistic_lock.rs | 62 ++-- src/storage/txn/actions/cleanup.rs | 4 +- src/storage/txn/actions/commit.rs | 4 +- src/storage/txn/actions/prewrite.rs | 197 ++++++++----- src/storage/txn/actions/tests.rs | 272 +++++++++++++++--- src/storage/txn/commands/check_txn_status.rs | 14 +- src/storage/txn/commands/mod.rs | 8 +- src/storage/txn/commands/prewrite.rs | 171 +++++++---- src/storage/txn/commands/rollback.rs | 4 +- src/storage/txn/store.rs | 4 +- tests/benches/hierarchy/mvcc/mod.rs | 14 +- tests/benches/hierarchy/txn/mod.rs | 14 +- tests/failpoints/cases/test_merge.rs | 4 +- tests/failpoints/cases/test_split_region.rs | 6 +- tests/failpoints/cases/test_storage.rs | 20 +- tests/failpoints/cases/test_transaction.rs | 9 +- tests/integrations/server/kv_service.rs | 4 +- 28 files changed, 714 insertions(+), 284 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b067e3337e5..52ad7912203 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2627,7 +2627,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#affce57868b9f8befac389559d372369b2cb616f" +source = "git+https://github.com/pingcap/kvproto.git#a0f02b6efcee6112bdc313988bf6c0ae3f83c07d" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 89f78f694c3..9d60474b952 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -293,6 +293,7 @@ mod tests { use engine_rocks::{ReadPerfInstant, RocksEngine}; use engine_traits::{KvEngine, MiscExt}; + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; use tikv::{ config::DbConfig, storage::{kv::TestEngineBuilder, txn::tests::*}, @@ -415,7 +416,7 @@ mod tests { must_commit(&engine, k, 7, 9); must_acquire_pessimistic_lock(&engine, k, k, 8, 10); - must_pessimistic_prewrite_put(&engine, k, b"v5", k, 8, 
10, true); + must_pessimistic_prewrite_put(&engine, k, b"v5", k, 8, 10, DoPessimisticCheck); must_get_eq(&kv_engine, &key, 10, Some(b"v4".to_vec())); must_commit(&engine, k, 8, 11); } diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 63c06551a80..89eebcceec7 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -11,7 +11,7 @@ use grpcio::{ }; use kvproto::{ cdcpb::{create_change_data, ChangeDataClient, ChangeDataEvent, ChangeDataRequest}, - kvrpcpb::*, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, tikvpb::TikvClient, }; use online_config::OnlineConfig; @@ -418,7 +418,9 @@ impl TestSuite { prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; prewrite_req.for_update_ts = for_update_ts.into_inner(); - prewrite_req.mut_is_pessimistic_lock().push(true); + prewrite_req + .mut_pessimistic_actions() + .push(DoPessimisticCheck); let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 277a31e2001..0bb22e0a21e 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -286,7 +286,7 @@ pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { #[cfg(test)] mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::AssertionLevel; + use kvproto::kvrpcpb::{AssertionLevel, PrewriteRequestPessimisticAction::*}; use tikv::storage::{ kv::{MockEngineBuilder, TestEngineBuilder}, lock_manager::DummyLockManager, @@ -405,7 +405,7 @@ mod tests { }, Mutation::make_put(k1.clone(), b"v4".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); one_pc_commit_ts(true, &mut txn, 10.into(), &DummyLockManager); diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 3d7fdb87569..0e6d8bbc9f8 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ 
-6,7 +6,10 @@ use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::{RocksEngine, RocksSnapshot}; use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment}; -use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; +use kvproto::{ + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, + tikvpb::TikvClient, +}; use online_config::ConfigValue; use raftstore::coprocessor::CoprocessorHost; use resolved_ts::{Observer, Task}; @@ -261,7 +264,9 @@ impl TestSuite { prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; prewrite_req.for_update_ts = for_update_ts.into_inner(); - prewrite_req.mut_is_pessimistic_lock().push(true); + prewrite_req + .mut_pessimistic_actions() + .push(DoPessimisticCheck); let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index eaeaf6a4e0f..8cac947dc57 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -24,7 +24,7 @@ use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ encryptionpb::EncryptionMethod, - kvrpcpb::*, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, metapb::{self, RegionEpoch}, pdpb::{ ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, @@ -894,7 +894,7 @@ pub fn must_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.is_pessimistic_lock = vec![true; muts.len()]; + prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; @@ -931,7 +931,7 @@ pub fn try_kv_prewrite_with( let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(ctx); if for_update_ts != 0 { - prewrite_req.is_pessimistic_lock = vec![true; muts.len()]; 
+ prewrite_req.pessimistic_actions = vec![DoPessimisticCheck; muts.len()]; } prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; diff --git a/src/storage/mod.rs b/src/storage/mod.rs index d974c731db0..3024a05381f 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3227,7 +3227,7 @@ mod tests { use error_code::ErrorCodeExt; use errors::extract_key_error; use futures::executor::block_on; - use kvproto::kvrpcpb::{AssertionLevel, CommandPri, Op}; + use kvproto::kvrpcpb::{AssertionLevel, CommandPri, Op, PrewriteRequestPessimisticAction::*}; use tikv_util::config::ReadableSize; use tracker::INVALID_TRACKER_TOKEN; use txn_types::{Mutation, PessimisticLock, WriteType}; @@ -7199,8 +7199,14 @@ mod tests { .sched_txn_command( commands::PrewritePessimistic::new( vec![ - (Mutation::make_put(key.clone(), val.clone()), true), - (Mutation::make_put(key2.clone(), val2.clone()), false), + ( + Mutation::make_put(key.clone(), val.clone()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(key2.clone(), val2.clone()), + SkipPessimisticCheck, + ), ], key.to_raw().unwrap(), 10.into(), @@ -8059,8 +8065,14 @@ mod tests { .sched_txn_command( commands::PrewritePessimistic::new( vec![ - (Mutation::make_put(Key::from_raw(b"d"), b"v".to_vec()), true), - (Mutation::make_put(Key::from_raw(b"e"), b"v".to_vec()), true), + ( + Mutation::make_put(Key::from_raw(b"d"), b"v".to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(b"e"), b"v".to_vec()), + DoPessimisticCheck, + ), ], b"d".to_vec(), 200.into(), @@ -8152,7 +8164,10 @@ mod tests { storage .sched_txn_command( commands::PrewritePessimistic::new( - vec![(Mutation::make_put(key2.clone(), value2.clone()), true)], + vec![( + Mutation::make_put(key2.clone(), value2.clone()), + DoPessimisticCheck, + )], k2.to_vec(), 10.into(), 0, @@ -8197,8 +8212,11 @@ mod tests { .sched_txn_command( commands::PrewritePessimistic::new( vec![ - (Mutation::make_put(key1.clone(), value1), true), - 
(Mutation::make_put(key2.clone(), value2), false), + (Mutation::make_put(key1.clone(), value1), DoPessimisticCheck), + ( + Mutation::make_put(key2.clone(), value2), + SkipPessimisticCheck, + ), ], k1.to_vec(), 1.into(), @@ -8435,23 +8453,23 @@ mod tests { vec![ ( Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), - true, + DoPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"k3"), b"v2".to_vec()), - true, + DoPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"k4"), b"v4".to_vec()), - true, + DoPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"k5"), b"v5".to_vec()), - true, + DoPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"k6"), b"v6".to_vec()), - true, + DoPessimisticCheck, ), ], b"k1".to_vec(), @@ -9023,7 +9041,10 @@ mod tests { storage .sched_txn_command( commands::PrewritePessimistic::new( - vec![(Mutation::make_put(k1.clone(), b"v".to_vec()), true)], + vec![( + Mutation::make_put(k1.clone(), b"v".to_vec()), + DoPessimisticCheck, + )], b"k1".to_vec(), 10.into(), 3000, @@ -9081,7 +9102,7 @@ mod tests { storage .sched_txn_command( commands::PrewritePessimistic::new( - vec![(Mutation::make_put(k1, b"v".to_vec()), true)], + vec![(Mutation::make_put(k1, b"v".to_vec()), DoPessimisticCheck)], b"k1".to_vec(), 10.into(), 3000, diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 7c521bb5952..2a231b42823 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -389,7 +389,7 @@ impl PointGetter { #[cfg(test)] mod tests { use engine_rocks::ReadPerfInstant; - use kvproto::kvrpcpb::{Assertion, AssertionLevel}; + use kvproto::kvrpcpb::{Assertion, AssertionLevel, PrewriteRequestPessimisticAction::*}; use txn_types::SHORT_VALUE_MAX_LEN; use super::*; @@ -929,7 +929,7 @@ mod tests { // // write.start_ts(10) < primary_lock.start_ts(15) < write.commit_ts(20) must_acquire_pessimistic_lock(&engine, key, key, 15, 50); - 
must_pessimistic_prewrite_delete(&engine, key, key, 15, 50, true); + must_pessimistic_prewrite_delete(&engine, key, key, 15, 50, DoPessimisticCheck); let mut getter = new_point_getter(&engine, TimeStamp::max()); must_get_value(&mut getter, key, val); } @@ -1017,7 +1017,7 @@ mod tests { key, &None, 80.into(), - false, + SkipPessimisticCheck, 100, 80.into(), 1, diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index c45fabe2540..f1ed7748a15 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -640,7 +640,7 @@ pub mod tests { CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ - kvrpcpb::{AssertionLevel, Context}, + kvrpcpb::{AssertionLevel, Context, PrewriteRequestPessimisticAction::*}, metapb::{Peer, Region}, }; use raftstore::store::RegionSnapshot; @@ -749,7 +749,7 @@ pub mod tests { &Self::txn_props(start_ts, pk, false), m, &None, - false, + SkipPessimisticCheck, ) .unwrap(); self.write(txn.into_modifies()); @@ -773,7 +773,7 @@ pub mod tests { &Self::txn_props(start_ts, pk, true), m, &None, - true, + DoPessimisticCheck, ) .unwrap(); self.write(txn.into_modifies()); diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index a7a839cf2e7..6bed0289053 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -2029,7 +2029,7 @@ mod latest_entry_tests { #[cfg(test)] mod delta_entry_tests { use engine_traits::{CF_LOCK, CF_WRITE}; - use kvproto::kvrpcpb::Context; + use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; use txn_types::{is_short_value, SHORT_VALUE_MAX_LEN}; use super::{super::ScannerBuilder, test_util::*, *}; @@ -2486,7 +2486,7 @@ mod delta_entry_tests { key, start_ts, commit_ts - 1, - true, + DoPessimisticCheck, ), WriteType::Delete => must_pessimistic_prewrite_delete( &engine, @@ -2494,7 +2494,7 @@ mod delta_entry_tests { key, start_ts, commit_ts - 1, - true, + 
DoPessimisticCheck, ), WriteType::Lock => must_pessimistic_prewrite_lock( &engine, @@ -2502,7 +2502,7 @@ mod delta_entry_tests { key, start_ts, commit_ts - 1, - true, + DoPessimisticCheck, ), WriteType::Rollback => must_rollback(&engine, key, start_ts, false), } @@ -2528,14 +2528,24 @@ mod delta_entry_tests { key, ts, for_update_ts, - true, + DoPessimisticCheck, + ), + LockType::Delete => must_pessimistic_prewrite_delete( + &engine, + key, + key, + ts, + for_update_ts, + DoPessimisticCheck, + ), + LockType::Lock => must_pessimistic_prewrite_lock( + &engine, + key, + key, + ts, + for_update_ts, + DoPessimisticCheck, ), - LockType::Delete => { - must_pessimistic_prewrite_delete(&engine, key, key, ts, for_update_ts, true) - } - LockType::Lock => { - must_pessimistic_prewrite_lock(&engine, key, key, ts, for_update_ts, true) - } LockType::Pessimistic => {} } } @@ -2631,12 +2641,12 @@ mod delta_entry_tests { // Generate put for [b] at 15. must_acquire_pessimistic_lock(&engine, b"b", b"b", 9, 15); - must_pessimistic_prewrite_put(&engine, b"b", b"b_15", b"b", 9, 15, true); + must_pessimistic_prewrite_put(&engine, b"b", b"b_15", b"b", 9, 15, DoPessimisticCheck); must_prewrite_put(&engine, b"c", b"c_4", b"c", 4); must_commit(&engine, b"c", 4, 6); must_acquire_pessimistic_lock(&engine, b"c", b"c", 5, 15); - must_pessimistic_prewrite_put(&engine, b"c", b"c_5", b"c", 5, 15, true); + must_pessimistic_prewrite_put(&engine, b"c", b"c_5", b"c", 5, 15, DoPessimisticCheck); must_cleanup(&engine, b"c", 20, 0); let entry_a_1 = EntryBuilder::default() diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index b0a64d83f22..a9032d1b463 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -274,7 +274,7 @@ pub(crate) fn make_txn_error( #[cfg(test)] pub(crate) mod tests { - use kvproto::kvrpcpb::{AssertionLevel, Context}; + use kvproto::kvrpcpb::{AssertionLevel, Context, PrewriteRequestPessimisticAction::*}; use txn_types::{TimeStamp, WriteType, 
SHORT_VALUE_MAX_LEN}; use super::*; @@ -341,7 +341,7 @@ pub(crate) mod tests { must_commit(&engine, k1, 25, 27); must_acquire_pessimistic_lock(&engine, k1, k1, 23, 29); must_get(&engine, k1, 30, v); - must_pessimistic_prewrite_delete(&engine, k1, k1, 23, 29, true); + must_pessimistic_prewrite_delete(&engine, k1, k1, 23, 29, DoPessimisticCheck); must_get_err(&engine, k1, 30); // should read the latest record when `ts == u64::max_value()` // even if lock.start_ts(23) < latest write.commit_ts(27) @@ -521,8 +521,8 @@ pub(crate) mod tests { must_acquire_pessimistic_lock(&engine, k1, k1, 15, 15); must_acquire_pessimistic_lock(&engine, k2, k1, 15, 17); - must_pessimistic_prewrite_put(&engine, k1, v, k1, 15, 17, true); - must_pessimistic_prewrite_put(&engine, k2, v, k1, 15, 17, true); + must_pessimistic_prewrite_put(&engine, k1, v, k1, 15, 17, DoPessimisticCheck); + must_pessimistic_prewrite_put(&engine, k2, v, k1, 15, 17, DoPessimisticCheck); must_rollback(&engine, k1, 15, false); must_rollback(&engine, k2, 15, false); // The rollback of the primary key should be protected @@ -758,7 +758,7 @@ pub(crate) mod tests { &txn_props(10.into(), pk, CommitKind::TwoPc, None, 0, false), Mutation::make_put(key.clone(), v.to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); assert!(txn.write_size() > 0); @@ -802,7 +802,7 @@ pub(crate) mod tests { &txn_props(5.into(), key, CommitKind::TwoPc, None, 0, false), Mutation::make_put(Key::from_raw(key), value.to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap_err(); @@ -815,7 +815,7 @@ pub(crate) mod tests { &txn_props(5.into(), key, CommitKind::TwoPc, None, 0, true), Mutation::make_put(Key::from_raw(key), value.to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); } @@ -961,7 +961,7 @@ pub(crate) mod tests { // original pessimisitic lock. 
must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 10, 10, 100); must_pessimistic_locked(&engine, k, 10, 10); - must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 10, 10, true, 110); + must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 10, 10, DoPessimisticCheck, 110); must_locked_with_ttl(&engine, k, 10, 110); must_rollback(&engine, k, 10, false); @@ -970,7 +970,7 @@ pub(crate) mod tests { // the prewrite request. must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 20, 20, 100); must_pessimistic_locked(&engine, k, 20, 20); - must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 20, 20, true, 90); + must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 20, 20, DoPessimisticCheck, 90); must_locked_with_ttl(&engine, k, 20, 100); } @@ -984,7 +984,7 @@ pub(crate) mod tests { must_prewrite_put(&engine, k, v, k, 10); must_commit(&engine, k, 10, 11); must_acquire_pessimistic_lock(&engine, k, k, 5, 12); - must_pessimistic_prewrite_lock(&engine, k, k, 5, 12, true); + must_pessimistic_prewrite_lock(&engine, k, k, 5, 12, DoPessimisticCheck); must_commit(&engine, k, 5, 15); // Now in write cf: @@ -1025,7 +1025,7 @@ pub(crate) mod tests { expected_lock_info.get_primary_lock(), &None, expected_lock_info.get_lock_version().into(), - false, + SkipPessimisticCheck, expected_lock_info.get_lock_ttl(), TimeStamp::zero(), expected_lock_info.get_txn_size(), @@ -1068,7 +1068,7 @@ pub(crate) mod tests { expected_lock_info.set_lock_ttl(0); assert_lock_info_eq( - must_pessimistic_prewrite_put_err(&engine, k, v, k, 40, 40, false), + must_pessimistic_prewrite_put_err(&engine, k, v, k, 40, 40, SkipPessimisticCheck), &expected_lock_info, ); @@ -1095,8 +1095,8 @@ pub(crate) mod tests { must_prewrite_put(&engine, k, v, k, 2); must_locked(&engine, k, 2); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 1, 1, false); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 3, 3, false); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 1, 1, SkipPessimisticCheck); + 
must_pessimistic_prewrite_put_err(&engine, k, v, k, 3, 3, SkipPessimisticCheck); } #[test] @@ -1117,19 +1117,19 @@ pub(crate) mod tests { must_acquire_pessimistic_lock_err(&engine, k3, k1, 10, 10); // Update for_update_ts to 20 due to write conflict must_acquire_pessimistic_lock(&engine, k3, k1, 10, 20); - must_pessimistic_prewrite_put(&engine, k1, v1, k1, 10, 20, true); - must_pessimistic_prewrite_put(&engine, k3, v3, k1, 10, 20, true); + must_pessimistic_prewrite_put(&engine, k1, v1, k1, 10, 20, DoPessimisticCheck); + must_pessimistic_prewrite_put(&engine, k3, v3, k1, 10, 20, DoPessimisticCheck); // Write a non-pessimistic lock with for_update_ts 20. - must_pessimistic_prewrite_put(&engine, k2, v2, k1, 10, 20, false); + must_pessimistic_prewrite_put(&engine, k2, v2, k1, 10, 20, SkipPessimisticCheck); // Roll back the primary key due to timeout, but the non-pessimistic lock is not // rolled back. must_rollback(&engine, k1, 10, false); // Txn-15 acquires pessimistic locks on k1. must_acquire_pessimistic_lock(&engine, k1, k1, 15, 15); - must_pessimistic_prewrite_put(&engine, k1, v1, k1, 15, 15, true); + must_pessimistic_prewrite_put(&engine, k1, v1, k1, 15, 15, DoPessimisticCheck); // There is a non-pessimistic lock conflict here. - match must_pessimistic_prewrite_put_err(&engine, k2, v2, k1, 15, 15, false) { + match must_pessimistic_prewrite_put_err(&engine, k2, v2, k1, 15, 15, SkipPessimisticCheck) { Error(box ErrorInner::KeyIsLocked(info)) => assert_eq!(info.get_lock_ttl(), 0), e => panic!("unexpected error: {}", e), }; @@ -1166,30 +1166,30 @@ pub(crate) mod tests { // Key not exist; should succeed. fail_to_write_pessimistic_lock(&engine, k, 10, 10); - must_pessimistic_prewrite_put(&engine, k, &v, k, 10, 10, true); + must_pessimistic_prewrite_put(&engine, k, &v, k, 10, 10, DoPessimisticCheck); must_commit(&engine, k, 10, 20); must_get(&engine, k, 20, &v); // for_update_ts(30) >= start_ts(30) > commit_ts(20); should succeed. 
v.push(0); fail_to_write_pessimistic_lock(&engine, k, 30, 30); - must_pessimistic_prewrite_put(&engine, k, &v, k, 30, 30, true); + must_pessimistic_prewrite_put(&engine, k, &v, k, 30, 30, DoPessimisticCheck); must_commit(&engine, k, 30, 40); must_get(&engine, k, 40, &v); // for_update_ts(40) >= commit_ts(40) > start_ts(35); should fail. fail_to_write_pessimistic_lock(&engine, k, 35, 40); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 35, 40, true); + must_pessimistic_prewrite_put_err(&engine, k, &v, k, 35, 40, DoPessimisticCheck); // KeyIsLocked; should fail. must_acquire_pessimistic_lock(&engine, k, k, 50, 50); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 60, 60, true); + must_pessimistic_prewrite_put_err(&engine, k, &v, k, 60, 60, DoPessimisticCheck); pessimistic_rollback::tests::must_success(&engine, k, 50, 50); // The txn has been rolled back; should fail. must_acquire_pessimistic_lock(&engine, k, k, 80, 80); must_cleanup(&engine, k, 80, TimeStamp::max()); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 80, 80, true); + must_pessimistic_prewrite_put_err(&engine, k, &v, k, 80, 80, DoPessimisticCheck); } #[test] @@ -1219,7 +1219,7 @@ pub(crate) mod tests { ), mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); let modifies = txn.into_modifies(); @@ -1277,7 +1277,7 @@ pub(crate) mod tests { ), mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), - true, + DoPessimisticCheck, ) .unwrap(); let modifies = txn.into_modifies(); @@ -1336,7 +1336,7 @@ pub(crate) mod tests { ), mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), - true, + DoPessimisticCheck, ) .unwrap(); assert_eq!(min_commit_ts.into_inner(), 100); @@ -1379,7 +1379,7 @@ pub(crate) mod tests { // Pessimistic transaction also works in the same case. 
must_acquire_pessimistic_lock(&engine, k, k, 50, 50); must_pessimistic_locked(&engine, k, 50, 50); - must_pessimistic_prewrite_put(&engine, k, v, k, 50, 50, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 50, 50, DoPessimisticCheck); must_commit(&engine, k, 50, 60); must_unlocked(&engine, k); must_written(&engine, k, 50, 60, WriteType::Put); @@ -1562,7 +1562,7 @@ pub(crate) mod tests { // T2, start_ts = 20 must_acquire_pessimistic_lock(&engine, k2, k2, 20, 25); - must_pessimistic_prewrite_put(&engine, k2, v2, k2, 20, 25, true); + must_pessimistic_prewrite_put(&engine, k2, v2, k2, 20, 25, DoPessimisticCheck); must_cleanup(&engine, k2, 20, 0); diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 792ed8fcb9a..9df4d9ebce9 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -252,6 +252,8 @@ pub fn acquire_pessimistic_lock( pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; + #[cfg(test)] + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; use txn_types::TimeStamp; use super::*; @@ -493,7 +495,7 @@ pub mod tests { // Normal must_succeed(&engine, k, k, 1, 1); must_pessimistic_locked(&engine, k, 1, 1); - must_pessimistic_prewrite_put(&engine, k, v, k, 1, 1, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 1, 1, DoPessimisticCheck); must_locked(&engine, k, 1); must_commit(&engine, k, 1, 2); must_unlocked(&engine, k); @@ -516,7 +518,7 @@ pub mod tests { must_prewrite_lock_err(&engine, k, k, 8); must_err(&engine, k, k, 8, 8); must_succeed(&engine, k, k, 8, 9); - must_pessimistic_prewrite_put(&engine, k, v, k, 8, 8, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 8, 8, DoPessimisticCheck); must_commit(&engine, k, 8, 10); must_unlocked(&engine, k); @@ -525,16 +527,16 @@ pub mod tests { must_pessimistic_locked(&engine, k, 11, 11); must_cleanup(&engine, k, 11, 
0); must_err(&engine, k, k, 11, 11); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 11, 11, true); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 11, 11, DoPessimisticCheck); must_prewrite_lock_err(&engine, k, k, 11); must_unlocked(&engine, k); must_succeed(&engine, k, k, 12, 12); - must_pessimistic_prewrite_put(&engine, k, v, k, 12, 12, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 12, 12, DoPessimisticCheck); must_locked(&engine, k, 12); must_cleanup(&engine, k, 12, 0); must_err(&engine, k, k, 12, 12); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 12, 12, true); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 12, 12, DoPessimisticCheck); must_prewrite_lock_err(&engine, k, k, 12); must_unlocked(&engine, k); @@ -543,9 +545,9 @@ pub mod tests { must_pessimistic_locked(&engine, k, 13, 13); must_succeed(&engine, k, k, 13, 13); must_pessimistic_locked(&engine, k, 13, 13); - must_pessimistic_prewrite_put(&engine, k, v, k, 13, 13, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 13, 13, DoPessimisticCheck); must_locked(&engine, k, 13); - must_pessimistic_prewrite_put(&engine, k, v, k, 13, 13, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 13, 13, DoPessimisticCheck); must_locked(&engine, k, 13); must_commit(&engine, k, 13, 14); must_unlocked(&engine, k); @@ -556,7 +558,7 @@ pub mod tests { must_succeed(&engine, k, k, 15, 15); must_pessimistic_locked(&engine, k, 15, 15); must_get(&engine, k, 16, v); - must_pessimistic_prewrite_delete(&engine, k, k, 15, 15, true); + must_pessimistic_prewrite_delete(&engine, k, k, 15, 15, DoPessimisticCheck); must_get_err(&engine, k, 16); must_commit(&engine, k, 15, 17); @@ -582,7 +584,7 @@ pub mod tests { // Acquire lock on a prewritten key should fail. 
must_succeed(&engine, k, k, 26, 26); must_pessimistic_locked(&engine, k, 26, 26); - must_pessimistic_prewrite_delete(&engine, k, k, 26, 26, true); + must_pessimistic_prewrite_delete(&engine, k, k, 26, 26, DoPessimisticCheck); must_locked(&engine, k, 26); must_err(&engine, k, k, 26, 26); must_locked(&engine, k, 26); @@ -595,7 +597,7 @@ pub mod tests { must_unlocked(&engine, k); must_get_none(&engine, k, 28); // Pessimistic prewrite on a committed key should fail. - must_pessimistic_prewrite_put_err(&engine, k, v, k, 26, 26, true); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 26, 26, DoPessimisticCheck); must_unlocked(&engine, k); must_get_none(&engine, k, 28); // Currently we cannot avoid this. @@ -604,7 +606,7 @@ pub mod tests { must_unlocked(&engine, k); // Non pessimistic key in pessimistic transaction. - must_pessimistic_prewrite_put(&engine, k, v, k, 30, 30, false); + must_pessimistic_prewrite_put(&engine, k, v, k, 30, 30, SkipPessimisticCheck); must_locked(&engine, k, 30); must_commit(&engine, k, 30, 31); must_unlocked(&engine, k); @@ -628,13 +630,13 @@ pub mod tests { must_pessimistic_locked(&engine, k, 35, 37); // Cannot prewrite when there is another transaction's pessimistic lock. - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 36, true); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, true); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 36, DoPessimisticCheck); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, DoPessimisticCheck); must_pessimistic_locked(&engine, k, 35, 37); // Cannot prewrite when there is another transaction's non-pessimistic lock. 
- must_pessimistic_prewrite_put(&engine, k, v, k, 35, 37, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 35, 37, DoPessimisticCheck); must_locked(&engine, k, 35); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, true); + must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, DoPessimisticCheck); must_locked(&engine, k, 35); // Commit pessimistic transaction's key but with smaller commit_ts than @@ -648,7 +650,7 @@ pub mod tests { // Currently not checked, so prewrite will success. must_succeed(&engine, k, k, 40, 40); must_pessimistic_locked(&engine, k, 40, 40); - must_pessimistic_prewrite_put(&engine, k, v, k, 40, 40, false); + must_pessimistic_prewrite_put(&engine, k, v, k, 40, 40, SkipPessimisticCheck); must_locked(&engine, k, 40); must_commit(&engine, k, 40, 41); must_unlocked(&engine, k); @@ -657,14 +659,14 @@ pub mod tests { // Currently not checked. must_succeed(&engine, k, k, 42, 45); must_pessimistic_locked(&engine, k, 42, 45); - must_pessimistic_prewrite_put(&engine, k, v, k, 42, 43, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 42, 43, DoPessimisticCheck); must_locked(&engine, k, 42); must_commit(&engine, k, 42, 45); must_unlocked(&engine, k); must_succeed(&engine, k, k, 46, 47); must_pessimistic_locked(&engine, k, 46, 47); - must_pessimistic_prewrite_put(&engine, k, v, k, 46, 48, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 46, 48, DoPessimisticCheck); must_locked(&engine, k, 46); must_commit(&engine, k, 46, 50); must_unlocked(&engine, k); @@ -674,7 +676,7 @@ pub mod tests { // Normally non-pessimistic keys in pessimistic transactions are used when we // are sure that there won't be conflicts. So this case is also not checked, and // prewrite will succeeed. 
- must_pessimistic_prewrite_put(&engine, k, v, k, 47, 48, false); + must_pessimistic_prewrite_put(&engine, k, v, k, 47, 48, SkipPessimisticCheck); must_locked(&engine, k, 47); must_cleanup(&engine, k, 47, 0); must_unlocked(&engine, k); @@ -682,7 +684,7 @@ pub mod tests { // The rollback of the primary key in a pessimistic transaction should be // protected from being collapsed. must_succeed(&engine, k, k, 49, 60); - must_pessimistic_prewrite_put(&engine, k, v, k, 49, 60, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 49, 60, DoPessimisticCheck); must_locked(&engine, k, 49); must_cleanup(&engine, k, 49, 0); must_get_rollback_protected(&engine, k, 49, true); @@ -694,7 +696,7 @@ pub mod tests { // to another write records' commit ts. Now there is a commit record with // commit_ts = 50. must_succeed(&engine, k, k, 50, 61); - must_pessimistic_prewrite_put(&engine, k, v, k, 50, 61, true); + must_pessimistic_prewrite_put(&engine, k, v, k, 50, 61, DoPessimisticCheck); must_locked(&engine, k, 50); must_cleanup(&engine, k, 50, 0); must_get_overlapped_rollback(&engine, k, 50, 46, WriteType::Put, Some(0)); @@ -704,7 +706,15 @@ pub mod tests { let for_update_ts = start_ts + 48; let commit_ts = start_ts + 50; must_succeed(&engine, k, k, *start_ts, for_update_ts); - must_pessimistic_prewrite_put(&engine, k, v, k, *start_ts, for_update_ts, true); + must_pessimistic_prewrite_put( + &engine, + k, + v, + k, + *start_ts, + for_update_ts, + DoPessimisticCheck, + ); must_commit(&engine, k, *start_ts, commit_ts); must_get(&engine, k, commit_ts + 1, v); } @@ -946,13 +956,13 @@ pub mod tests { // Put v1 @ start ts 1, commit ts 2 must_succeed(&engine, k, k, 1, 1); - must_pessimistic_prewrite_put(&engine, k, v1, k, 1, 1, true); + must_pessimistic_prewrite_put(&engine, k, v1, k, 1, 1, DoPessimisticCheck); must_commit(&engine, k, 1, 2); let v2 = b"v2"; // Put v2 @ start ts 10, commit ts 11 must_succeed(&engine, k, k, 10, 10); - must_pessimistic_prewrite_put(&engine, k, v2, k, 10, 10, 
true); + must_pessimistic_prewrite_put(&engine, k, v2, k, 10, 10, DoPessimisticCheck); must_commit(&engine, k, 10, 11); // Lock @ start ts 9, for update ts 12, commit ts 13 @@ -1079,7 +1089,7 @@ pub mod tests { // T1: start_ts = 3, commit_ts = 5, put key:value must_succeed(&engine, key, key, 3, 3); - must_pessimistic_prewrite_put(&engine, key, value, key, 3, 3, true); + must_pessimistic_prewrite_put(&engine, key, value, key, 3, 3, DoPessimisticCheck); must_commit(&engine, key, 3, 5); // T2: start_ts = 15, acquire pessimistic lock on k, with should_not_exist flag @@ -1114,7 +1124,7 @@ pub mod tests { // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on // k must_succeed(&engine, key, key, 8, 8); - must_pessimistic_prewrite_delete(&engine, key, key, 8, 8, true); + must_pessimistic_prewrite_delete(&engine, key, key, 8, 8, DoPessimisticCheck); must_commit(&engine, key, 8, cm.max_ts().into_inner() + 1); // T1: start_ts = 10, repeatedly acquire pessimistic lock on k, with diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index 461b8e2d432..19cb90f0a22 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -82,6 +82,8 @@ pub mod tests { use concurrency_manager::ConcurrencyManager; use engine_traits::CF_WRITE; use kvproto::kvrpcpb::Context; + #[cfg(test)] + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; use txn_types::TimeStamp; use super::*; @@ -233,7 +235,7 @@ pub mod tests { must_get_rollback_protected(&engine, k, ts(11, 1), true); must_acquire_pessimistic_lock(&engine, k, k, ts(13, 1), ts(14, 1)); - must_pessimistic_prewrite_put(&engine, k, v, k, ts(13, 1), ts(14, 1), true); + must_pessimistic_prewrite_put(&engine, k, v, k, ts(13, 1), ts(14, 1), DoPessimisticCheck); must_succeed(&engine, k, ts(13, 1), ts(120, 0)); must_get_rollback_protected(&engine, k, ts(13, 1), true); } diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs 
index 456757285e0..2351e0c3282 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -107,6 +107,8 @@ pub fn commit( pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; + #[cfg(test)] + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; use txn_types::TimeStamp; use super::*; @@ -275,7 +277,7 @@ pub mod tests { k, &None, ts(60, 0), - true, + DoPessimisticCheck, 50, ts(60, 0), 1, diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index e7ca85c8137..7b562af8b43 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -4,7 +4,10 @@ use std::cmp; use fail::fail_point; -use kvproto::kvrpcpb::{Assertion, AssertionLevel}; +use kvproto::kvrpcpb::{ + Assertion, AssertionLevel, + PrewriteRequestPessimisticAction::{self, *}, +}; use txn_types::{ is_short_value, Key, Mutation, MutationType, OldValue, TimeStamp, Value, Write, WriteType, }; @@ -28,10 +31,10 @@ pub fn prewrite( txn_props: &TransactionProperties<'_>, mutation: Mutation, secondary_keys: &Option>>, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) -> Result<(TimeStamp, OldValue)> { let mut mutation = - PrewriteMutation::from_mutation(mutation, secondary_keys, is_pessimistic_lock, txn_props)?; + PrewriteMutation::from_mutation(mutation, secondary_keys, pessimistic_action, txn_props)?; // Update max_ts for Insert operation to guarantee linearizability and snapshot // isolation @@ -56,8 +59,8 @@ pub fn prewrite( let mut lock_amended = false; let lock_status = match reader.load_lock(&mutation.key)? 
{ - Some(lock) => mutation.check_lock(lock, is_pessimistic_lock)?, - None if is_pessimistic_lock => { + Some(lock) => mutation.check_lock(lock, pessimistic_action)?, + None if matches!(pessimistic_action, DoPessimisticCheck) => { amend_pessimistic_lock(&mutation, reader)?; lock_amended = true; LockStatus::None @@ -228,7 +231,7 @@ struct PrewriteMutation<'a> { mutation_type: MutationType, secondary_keys: &'a Option>>, min_commit_ts: TimeStamp, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, lock_type: Option, lock_ttl: u64, @@ -243,7 +246,7 @@ impl<'a> PrewriteMutation<'a> { fn from_mutation( mutation: Mutation, secondary_keys: &'a Option>>, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, txn_props: &'a TransactionProperties<'a>, ) -> Result> { let should_not_write = mutation.should_not_write(); @@ -265,7 +268,7 @@ impl<'a> PrewriteMutation<'a> { mutation_type, secondary_keys, min_commit_ts: txn_props.min_commit_ts, - is_pessimistic_lock, + pessimistic_action, lock_type, lock_ttl: txn_props.lock_ttl, @@ -291,11 +294,15 @@ impl<'a> PrewriteMutation<'a> { } /// Check whether the current key is locked at any timestamp. - fn check_lock(&mut self, lock: Lock, is_pessimistic_lock: bool) -> Result { + fn check_lock( + &mut self, + lock: Lock, + pessimistic_action: PrewriteRequestPessimisticAction, + ) -> Result { if lock.ts != self.txn_props.start_ts { // Abort on lock belonging to other transaction if // prewrites a pessimistic lock. - if is_pessimistic_lock { + if matches!(pessimistic_action, DoPessimisticCheck) { warn!( "prewrite failed (pessimistic lock not found)"; "start_ts" => self.txn_props.start_ts, @@ -360,7 +367,12 @@ impl<'a> PrewriteMutation<'a> { // Note: PessimisticLockNotFound can happen on a non-pessimistically locked key, // if it is a retrying prewrite request. 
TransactionKind::Pessimistic(for_update_ts) => { - if commit_ts > for_update_ts { + if let DoConstraintCheck = self.pessimistic_action { + if commit_ts > self.txn_props.start_ts { + MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); + self.write_conflict_error(&write, commit_ts)?; + } + } else if commit_ts > for_update_ts { warn!("conflicting write was found, pessimistic lock must be lost for the corresponding row key"; "key" => %self.key, "start_ts" => self.txn_props.start_ts, @@ -570,10 +582,16 @@ impl<'a> PrewriteMutation<'a> { match &self.txn_props.kind { TransactionKind::Optimistic(s) => *s, TransactionKind::Pessimistic(_) => { - // For non-pessimistic-locked keys, do not skip constraint check when retrying. - // This intents to protect idempotency. - // Ref: https://github.com/tikv/tikv/issues/11187 - self.is_pessimistic_lock || !self.txn_props.is_retry_request + match self.pessimistic_action { + DoPessimisticCheck => true, + // For non-pessimistic-locked keys, do not skip constraint check when retrying. + // This intents to protect idempotency. + // Ref: https://github.com/tikv/tikv/issues/11187 + SkipPessimisticCheck => !self.txn_props.is_retry_request, + // For keys that postpones constraint check to prewrite, do not skip constraint + // check. + PrewriteRequestPessimisticAction::DoConstraintCheck => false, + } } } } @@ -782,7 +800,7 @@ pub mod tests { &props, Mutation::make_insert(Key::from_raw(key), value.to_vec()), &None, - false, + SkipPessimisticCheck, )?; // Insert must be None if the key is not lock, or be Unspecified if the // key is already locked. 
@@ -813,7 +831,7 @@ pub mod tests { &optimistic_txn_props(pk, ts), Mutation::make_check_not_exists(Key::from_raw(key)), &None, - true, + DoPessimisticCheck, )?; assert_eq!(old_value, OldValue::Unspecified); Ok(()) @@ -835,7 +853,7 @@ pub mod tests { &optimistic_async_props(b"k1", 10.into(), 50.into(), 2, false), Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -848,7 +866,7 @@ pub mod tests { &optimistic_async_props(b"k1", 10.into(), 50.into(), 1, false), Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &Some(vec![]), - false, + SkipPessimisticCheck, ) .unwrap_err(); assert!(matches!( @@ -883,7 +901,7 @@ pub mod tests { &props, Mutation::make_check_not_exists(Key::from_raw(b"k0")), &Some(vec![]), - false, + SkipPessimisticCheck, ) .unwrap(); assert!(min_ts > props.start_ts); @@ -903,7 +921,7 @@ pub mod tests { &props, Mutation::make_check_not_exists(Key::from_raw(b"k0")), &Some(vec![]), - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!(cm.max_ts(), props.start_ts); @@ -918,7 +936,7 @@ pub mod tests { &optimistic_async_props(b"k1", 10.into(), 50.into(), 2, false), Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); assert!(min_ts > 42.into()); @@ -941,7 +959,7 @@ pub mod tests { &optimistic_async_props(b"k3", 44.into(), 50.into(), 2, false), mutation.clone(), &Some(vec![b"k4".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); assert!(min_ts > 44.into()); @@ -963,7 +981,7 @@ pub mod tests { &props, mutation.clone(), &Some(vec![b"k6".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); assert!(min_ts > 45.into()); @@ -982,7 +1000,7 @@ pub mod tests { &props, mutation.clone(), &Some(vec![b"k8".to_vec()]), - false, + SkipPessimisticCheck, ) .unwrap(); assert!(min_ts >= 46.into()); @@ -1012,7 +1030,7 @@ pub mod tests { 
&optimistic_async_props(b"k1", 10.into(), 50.into(), 2, true), Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1025,7 +1043,7 @@ pub mod tests { &optimistic_async_props(b"k1", 10.into(), 50.into(), 1, true), Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap_err(); assert!(matches!( @@ -1071,7 +1089,7 @@ pub mod tests { }, Mutation::make_check_not_exists(Key::from_raw(key)), &None, - false, + SkipPessimisticCheck, )?; assert_eq!(old_value, OldValue::Unspecified); Ok(()) @@ -1108,7 +1126,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), - true, + DoPessimisticCheck, ) .unwrap(); // Pessimistic txn skips constraint check, does not read previous write. @@ -1122,7 +1140,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &Some(vec![]), - true, + DoPessimisticCheck, ) .unwrap_err(); } @@ -1158,7 +1176,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &None, - true, + DoPessimisticCheck, ) .unwrap(); // Pessimistic txn skips constraint check, does not read previous write. 
@@ -1172,7 +1190,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &None, - true, + DoPessimisticCheck, ) .unwrap_err(); } @@ -1278,7 +1296,7 @@ pub mod tests { &txn_props, Mutation::make_check_not_exists(Key::from_raw(key)), &None, - false, + SkipPessimisticCheck, ); if success { let res = res.unwrap(); @@ -1293,7 +1311,7 @@ pub mod tests { &txn_props, Mutation::make_insert(Key::from_raw(key), b"value".to_vec()), &None, - false, + SkipPessimisticCheck, ); if success { let res = res.unwrap(); @@ -1348,7 +1366,7 @@ pub mod tests { &txn_props, Mutation::make_put(key.clone(), b"value".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!(&old_value, expected_value, "key: {}", key); @@ -1368,7 +1386,7 @@ pub mod tests { &Some(vec![b"k2".to_vec()]), 10, 10, - true, + DoPessimisticCheck, 15, ); must_pessimistic_prewrite_put_async_commit( @@ -1379,7 +1397,7 @@ pub mod tests { &Some(vec![]), 10, 10, - false, + SkipPessimisticCheck, 15, ); @@ -1398,7 +1416,7 @@ pub mod tests { &Some(vec![]), 10, 10, - false, + SkipPessimisticCheck, 0, ); assert!(matches!( @@ -1429,7 +1447,7 @@ pub mod tests { &Some(vec![]), 10, 10, - false, + SkipPessimisticCheck, 0, ); assert!(matches!( @@ -1439,7 +1457,15 @@ pub mod tests { must_unlocked(&engine, b"k2"); let err = must_retry_pessimistic_prewrite_put_err( - &engine, b"k2", b"v2", b"k1", &None, 10, 10, false, 0, + &engine, + b"k2", + b"v2", + b"k1", + &None, + 10, + 10, + SkipPessimisticCheck, + 0, ); assert!(matches!( err, @@ -1451,7 +1477,15 @@ pub mod tests { // Try a different txn start ts (which haven't been successfully committed // before). 
let err = must_retry_pessimistic_prewrite_put_err( - &engine, b"k2", b"v2", b"k1", &None, 11, 11, false, 0, + &engine, + b"k2", + b"v2", + b"k1", + &None, + 11, + 11, + SkipPessimisticCheck, + 0, ); assert!(matches!( err, @@ -1467,7 +1501,7 @@ pub mod tests { b"k1", &None, 12.into(), - false, + SkipPessimisticCheck, 100, 12.into(), 1, @@ -1490,7 +1524,7 @@ pub mod tests { b"k1", &None, 13.into(), - false, + SkipPessimisticCheck, 100, 55.into(), 1, @@ -1545,7 +1579,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(b"k1"), b"value".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!( @@ -1599,7 +1633,7 @@ pub mod tests { &txn_props, Mutation::make_insert(Key::from_raw(b"k1"), b"v2".to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1736,7 +1770,7 @@ pub mod tests { &txn_props, Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), &None, - false, + SkipPessimisticCheck, )?; Ok(old_value) })], @@ -1772,7 +1806,7 @@ pub mod tests { &txn_props, Mutation::make_insert(Key::from_raw(key), b"v2".to_vec()), &None, - false, + SkipPessimisticCheck, )?; Ok(old_value) })], @@ -1786,7 +1820,7 @@ pub mod tests { let prewrite_put = |key: &'_ _, value, ts: u64, - is_pessimistic_lock, + pessimistic_action, for_update_ts: u64, assertion, assertion_level, @@ -1799,7 +1833,7 @@ pub mod tests { key, &None, ts.into(), - is_pessimistic_lock, + pessimistic_action, 100, for_update_ts.into(), 1, @@ -1818,7 +1852,7 @@ pub mod tests { &None, ts, for_update_ts, - is_pessimistic_lock, + pessimistic_action, 0, false, assertion, @@ -1843,7 +1877,7 @@ pub mod tests { &k1, b"v1", 10, - false, + SkipPessimisticCheck, 0, Assertion::NotExist, assertion_level, @@ -1855,7 +1889,7 @@ pub mod tests { &k1, b"v1", 20, - false, + SkipPessimisticCheck, 0, Assertion::Exist, assertion_level, @@ -1868,7 +1902,7 @@ pub mod tests { &k2, b"v2", 10, - true, + DoPessimisticCheck, 11, Assertion::NotExist, assertion_level, @@ 
-1880,7 +1914,7 @@ pub mod tests { &k2, b"v2", 20, - true, + DoPessimisticCheck, 21, Assertion::Exist, assertion_level, @@ -1894,7 +1928,7 @@ pub mod tests { &k1, b"v1", 30, - false, + SkipPessimisticCheck, 0, Assertion::NotExist, assertion_level, @@ -1904,7 +1938,7 @@ pub mod tests { &k3, b"v3", 30, - false, + SkipPessimisticCheck, 0, Assertion::Exist, assertion_level, @@ -1920,7 +1954,7 @@ pub mod tests { &k2, b"v2", 30, - true, + DoPessimisticCheck, 31, Assertion::NotExist, assertion_level, @@ -1930,7 +1964,7 @@ pub mod tests { &k4, b"v4", 30, - true, + DoPessimisticCheck, 31, Assertion::Exist, assertion_level, @@ -1939,14 +1973,14 @@ pub mod tests { must_rollback(&engine, &k2, 30, true); must_rollback(&engine, &k4, 30, true); - // Pessimistic transaction fail on strict level no matter whether - // `is_pessimistic_lock`. + // Pessimistic transaction fail on strict level no matter what + // `pessimistic_action` is. let pass = assertion_level != AssertionLevel::Strict; prewrite_put( &k1, b"v1", 40, - false, + SkipPessimisticCheck, 41, Assertion::NotExist, assertion_level, @@ -1956,7 +1990,7 @@ pub mod tests { &k3, b"v3", 40, - false, + SkipPessimisticCheck, 41, Assertion::Exist, assertion_level, @@ -1971,7 +2005,7 @@ pub mod tests { &k2, b"v2", 40, - true, + DoPessimisticCheck, 41, Assertion::NotExist, assertion_level, @@ -1981,7 +2015,7 @@ pub mod tests { &k4, b"v4", 40, - true, + DoPessimisticCheck, 41, Assertion::Exist, assertion_level, @@ -2027,4 +2061,39 @@ pub mod tests { test_all_levels(&prepare_delete); test_all_levels(&prepare_gc_fence); } + + #[test] + fn test_deferred_constraint_check() { + let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"key"; + let key2 = b"key2"; + let value = b"value"; + + // 1. 
write conflict + must_prewrite_put(&engine, key, value, key, 1); + must_commit(&engine, key, 1, 5); + must_pessimistic_prewrite_insert(&engine, key2, value, key, 3, 3, SkipPessimisticCheck); + let err = + must_pessimistic_prewrite_insert_err(&engine, key, value, key, 3, 3, DoConstraintCheck); + assert!(matches!(err, Error(box ErrorInner::WriteConflict { .. }))); + + // 2. unique constraint fail + must_prewrite_put(&engine, key, value, key, 11); + must_commit(&engine, key, 11, 12); + let err = must_pessimistic_prewrite_insert_err( + &engine, + key, + value, + key, + 13, + 13, + DoConstraintCheck, + ); + assert!(matches!(err, Error(box ErrorInner::AlreadyExist { .. }))); + + // 3. success + must_prewrite_delete(&engine, key, key, 21); + must_commit(&engine, key, 21, 22); + must_pessimistic_prewrite_insert(&engine, key, value, key, 23, 23, DoConstraintCheck); + } } diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index e5e4b57054c..523d4b9e8ac 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -3,7 +3,10 @@ //! 
This file contains tests and testing tools which affects multiple actions use concurrency_manager::ConcurrencyManager; -use kvproto::kvrpcpb::{Assertion, AssertionLevel, Context}; +use kvproto::kvrpcpb::{ + Assertion, AssertionLevel, Context, + PrewriteRequestPessimisticAction::{self, *}, +}; use prewrite::{prewrite, CommitKind, TransactionKind, TransactionProperties}; use super::*; @@ -20,7 +23,7 @@ pub fn must_prewrite_put_impl( pk: &[u8], secondary_keys: &Option>>, ts: TimeStamp, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, lock_ttl: u64, for_update_ts: TimeStamp, txn_size: u64, @@ -29,6 +32,81 @@ pub fn must_prewrite_put_impl( is_retry_request: bool, assertion: Assertion, assertion_level: AssertionLevel, +) { + must_prewrite_put_impl_with_should_not_exist( + engine, + key, + value, + pk, + secondary_keys, + ts, + pessimistic_action, + lock_ttl, + for_update_ts, + txn_size, + min_commit_ts, + max_commit_ts, + is_retry_request, + assertion, + assertion_level, + false, + ); +} + +pub fn must_prewrite_insert_impl( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + secondary_keys: &Option>>, + ts: TimeStamp, + pessimistic_action: PrewriteRequestPessimisticAction, + lock_ttl: u64, + for_update_ts: TimeStamp, + txn_size: u64, + min_commit_ts: TimeStamp, + max_commit_ts: TimeStamp, + is_retry_request: bool, + assertion: Assertion, + assertion_level: AssertionLevel, +) { + must_prewrite_put_impl_with_should_not_exist( + engine, + key, + value, + pk, + secondary_keys, + ts, + pessimistic_action, + lock_ttl, + for_update_ts, + txn_size, + min_commit_ts, + max_commit_ts, + is_retry_request, + assertion, + assertion_level, + true, + ); +} + +pub fn must_prewrite_put_impl_with_should_not_exist( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + secondary_keys: &Option>>, + ts: TimeStamp, + pessimistic_action: PrewriteRequestPessimisticAction, + lock_ttl: u64, + for_update_ts: TimeStamp, + txn_size: u64, + min_commit_ts: 
TimeStamp, + max_commit_ts: TimeStamp, + is_retry_request: bool, + assertion: Assertion, + assertion_level: AssertionLevel, + should_not_exist: bool, ) { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -36,7 +114,11 @@ pub fn must_prewrite_put_impl( let mut txn = MvccTxn::new(ts, cm); let mut reader = SnapshotReader::new(ts, snapshot, true); - let mutation = Mutation::Put((Key::from_raw(key), value.to_vec()), assertion); + let mutation = if should_not_exist { + Mutation::Insert((Key::from_raw(key), value.to_vec()), assertion) + } else { + Mutation::Put((Key::from_raw(key), value.to_vec()), assertion) + }; let txn_kind = if for_update_ts.is_zero() { TransactionKind::Optimistic(false) } else { @@ -64,7 +146,7 @@ pub fn must_prewrite_put_impl( }, mutation, secondary_keys, - is_pessimistic_lock, + pessimistic_action, ) .unwrap(); write(engine, &ctx, txn.into_modifies()); @@ -84,7 +166,7 @@ pub fn must_prewrite_put( pk, &None, ts.into(), - false, + SkipPessimisticCheck, 0, TimeStamp::default(), 0, @@ -103,7 +185,7 @@ pub fn must_pessimistic_prewrite_put( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) { must_prewrite_put_impl( engine, @@ -112,7 +194,35 @@ pub fn must_pessimistic_prewrite_put( pk, &None, ts.into(), - is_pessimistic_lock, + pessimistic_action, + 0, + for_update_ts.into(), + 0, + TimeStamp::default(), + TimeStamp::default(), + false, + Assertion::None, + AssertionLevel::Off, + ); +} + +pub fn must_pessimistic_prewrite_insert( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + ts: impl Into, + for_update_ts: impl Into, + pessimistic_action: PrewriteRequestPessimisticAction, +) { + must_prewrite_insert_impl( + engine, + key, + value, + pk, + &None, + ts.into(), + pessimistic_action, 0, for_update_ts.into(), 0, @@ -131,7 +241,7 @@ pub fn must_pessimistic_prewrite_put_with_ttl( pk: &[u8], ts: impl Into, for_update_ts: 
impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, lock_ttl: u64, ) { must_prewrite_put_impl( @@ -141,7 +251,7 @@ pub fn must_pessimistic_prewrite_put_with_ttl( pk, &None, ts.into(), - is_pessimistic_lock, + pessimistic_action, lock_ttl, for_update_ts.into(), 0, @@ -166,6 +276,11 @@ pub fn must_prewrite_put_for_large_txn( let ts = ts.into(); let min_commit_ts = (ts.into_inner() + 1).into(); let for_update_ts = for_update_ts.into(); + let pessimistic_action = if !for_update_ts.is_zero() { + DoPessimisticCheck + } else { + SkipPessimisticCheck + }; must_prewrite_put_impl( engine, key, @@ -173,7 +288,7 @@ pub fn must_prewrite_put_for_large_txn( pk, &None, ts, - !for_update_ts.is_zero(), + pessimistic_action, lock_ttl, for_update_ts, 0, @@ -202,7 +317,7 @@ pub fn must_prewrite_put_async_commit( pk, secondary_keys, ts.into(), - false, + SkipPessimisticCheck, 100, TimeStamp::default(), 0, @@ -222,7 +337,7 @@ pub fn must_pessimistic_prewrite_put_async_commit( secondary_keys: &Option>>, ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, min_commit_ts: impl Into, ) { assert!(secondary_keys.is_some()); @@ -233,7 +348,7 @@ pub fn must_pessimistic_prewrite_put_async_commit( pk, secondary_keys, ts.into(), - is_pessimistic_lock, + pessimistic_action, 100, for_update_ts.into(), 0, @@ -269,6 +384,7 @@ fn default_txn_props( assertion_level: AssertionLevel::Off, } } + pub fn must_prewrite_put_err_impl( engine: &E, key: &[u8], @@ -277,11 +393,74 @@ pub fn must_prewrite_put_err_impl( secondary_keys: &Option>>, ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, + max_commit_ts: impl Into, + is_retry_request: bool, + assertion: Assertion, + assertion_level: AssertionLevel, +) -> Error { + must_prewrite_put_err_impl_with_should_not_exist( + engine, + key, + value, + pk, + secondary_keys, + 
ts.into(), + for_update_ts.into(), + pessimistic_action, + max_commit_ts.into(), + is_retry_request, + assertion, + assertion_level, + false, + ) +} + +pub fn must_prewrite_insert_err_impl( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + secondary_keys: &Option>>, + ts: impl Into, + for_update_ts: impl Into, + pessimistic_action: PrewriteRequestPessimisticAction, + max_commit_ts: impl Into, + is_retry_request: bool, + assertion: Assertion, + assertion_level: AssertionLevel, +) -> Error { + must_prewrite_put_err_impl_with_should_not_exist( + engine, + key, + value, + pk, + secondary_keys, + ts.into(), + for_update_ts.into(), + pessimistic_action, + max_commit_ts.into(), + is_retry_request, + assertion, + assertion_level, + true, + ) +} + +pub fn must_prewrite_put_err_impl_with_should_not_exist( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + secondary_keys: &Option>>, + ts: impl Into, + for_update_ts: impl Into, + pessimistic_action: PrewriteRequestPessimisticAction, max_commit_ts: impl Into, is_retry_request: bool, assertion: Assertion, assertion_level: AssertionLevel, + should_not_exist: bool, ) -> Error { let snapshot = engine.snapshot(Default::default()).unwrap(); let for_update_ts = for_update_ts.into(); @@ -289,7 +468,11 @@ pub fn must_prewrite_put_err_impl( let ts = ts.into(); let mut txn = MvccTxn::new(ts, cm); let mut reader = SnapshotReader::new(ts, snapshot, true); - let mutation = Mutation::Put((Key::from_raw(key), value.to_vec()), assertion); + let mutation = if should_not_exist { + Mutation::Insert((Key::from_raw(key), value.to_vec()), assertion) + } else { + Mutation::Put((Key::from_raw(key), value.to_vec()), assertion) + }; let commit_kind = if secondary_keys.is_some() { CommitKind::Async(max_commit_ts.into()) } else { @@ -306,7 +489,7 @@ pub fn must_prewrite_put_err_impl( &props, mutation, &None, - is_pessimistic_lock, + pessimistic_action, ) .unwrap_err() } @@ -326,7 +509,7 @@ pub fn must_prewrite_put_err( &None, ts, 
TimeStamp::zero(), - false, + SkipPessimisticCheck, 0, false, Assertion::None, @@ -341,7 +524,7 @@ pub fn must_pessimistic_prewrite_put_err( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) -> Error { must_prewrite_put_err_impl( engine, @@ -351,7 +534,32 @@ pub fn must_pessimistic_prewrite_put_err( &None, ts, for_update_ts, - is_pessimistic_lock, + pessimistic_action, + 0, + false, + Assertion::None, + AssertionLevel::Off, + ) +} + +pub fn must_pessimistic_prewrite_insert_err( + engine: &E, + key: &[u8], + value: &[u8], + pk: &[u8], + ts: impl Into, + for_update_ts: impl Into, + pessimistic_action: PrewriteRequestPessimisticAction, +) -> Error { + must_prewrite_insert_err_impl( + engine, + key, + value, + pk, + &None, + ts, + for_update_ts, + pessimistic_action, 0, false, Assertion::None, @@ -367,7 +575,7 @@ pub fn must_retry_pessimistic_prewrite_put_err( secondary_keys: &Option>>, ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, max_commit_ts: impl Into, ) -> Error { must_prewrite_put_err_impl( @@ -378,7 +586,7 @@ pub fn must_retry_pessimistic_prewrite_put_err( secondary_keys, ts, for_update_ts, - is_pessimistic_lock, + pessimistic_action, max_commit_ts, true, Assertion::None, @@ -392,7 +600,7 @@ fn must_prewrite_delete_impl( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -409,7 +617,7 @@ fn must_prewrite_delete_impl( &default_txn_props(ts, pk, for_update_ts), mutation, &None, - is_pessimistic_lock, + pessimistic_action, ) .unwrap(); @@ -424,7 +632,7 @@ pub fn must_prewrite_delete( pk: &[u8], ts: impl Into, ) { - must_prewrite_delete_impl(engine, key, pk, ts, TimeStamp::zero(), false); + 
must_prewrite_delete_impl(engine, key, pk, ts, TimeStamp::zero(), SkipPessimisticCheck); } pub fn must_pessimistic_prewrite_delete( @@ -433,9 +641,9 @@ pub fn must_pessimistic_prewrite_delete( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) { - must_prewrite_delete_impl(engine, key, pk, ts, for_update_ts, is_pessimistic_lock); + must_prewrite_delete_impl(engine, key, pk, ts, for_update_ts, pessimistic_action); } fn must_prewrite_lock_impl( @@ -444,7 +652,7 @@ fn must_prewrite_lock_impl( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -461,7 +669,7 @@ fn must_prewrite_lock_impl( &default_txn_props(ts, pk, for_update_ts), mutation, &None, - is_pessimistic_lock, + pessimistic_action, ) .unwrap(); @@ -471,7 +679,7 @@ fn must_prewrite_lock_impl( } pub fn must_prewrite_lock(engine: &E, key: &[u8], pk: &[u8], ts: impl Into) { - must_prewrite_lock_impl(engine, key, pk, ts, TimeStamp::zero(), false); + must_prewrite_lock_impl(engine, key, pk, ts, TimeStamp::zero(), SkipPessimisticCheck); } pub fn must_prewrite_lock_err( @@ -492,7 +700,7 @@ pub fn must_prewrite_lock_err( &default_txn_props(ts, pk, TimeStamp::zero()), Mutation::make_lock(Key::from_raw(key)), &None, - false, + SkipPessimisticCheck, ) .unwrap_err(); } @@ -503,9 +711,9 @@ pub fn must_pessimistic_prewrite_lock( pk: &[u8], ts: impl Into, for_update_ts: impl Into, - is_pessimistic_lock: bool, + pessimistic_action: PrewriteRequestPessimisticAction, ) { - must_prewrite_lock_impl(engine, key, pk, ts, for_update_ts, is_pessimistic_lock); + must_prewrite_lock_impl(engine, key, pk, ts, for_update_ts, pessimistic_action); } pub fn must_rollback( diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs 
index 7fd4a45ff8a..24f69e9a237 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -147,7 +147,7 @@ impl WriteCommand for CheckTxnStatus { #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::Context; + use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; use tikv_util::deadline::Deadline; use txn_types::{Key, WriteType}; @@ -388,7 +388,7 @@ pub mod tests { &Some(vec![]), 15, 16, - true, + DoPessimisticCheck, 17, ); // All following check_txn_status should return the unchanged lock information @@ -491,7 +491,7 @@ pub mod tests { &Some(vec![]), 20, 25, - true, + DoPessimisticCheck, 28, ); // the client must call check_txn_status with caller_start_ts == current_ts == @@ -520,7 +520,7 @@ pub mod tests { &Some(vec![]), 30, 35, - true, + DoPessimisticCheck, 36, ); // the client must call check_txn_status with caller_start_ts == current_ts == @@ -791,7 +791,7 @@ pub mod tests { must_large_txn_locked(&engine, k, ts(4, 0), 200, ts(135, 1), true); // Commit the key. 
- must_pessimistic_prewrite_put(&engine, k, v, k, ts(4, 0), ts(130, 0), true); + must_pessimistic_prewrite_put(&engine, k, v, k, ts(4, 0), ts(130, 0), DoPessimisticCheck); must_commit(&engine, k, ts(4, 0), ts(140, 0)); must_unlocked(&engine, k); must_get_commit_ts(&engine, k, ts(4, 0), ts(140, 0)); @@ -940,7 +940,7 @@ pub mod tests { k, &None, ts(300, 0), - false, + SkipPessimisticCheck, 100, TimeStamp::zero(), 1, @@ -1069,7 +1069,7 @@ pub mod tests { k, &None, ts(30, 0), - false, + SkipPessimisticCheck, 10, TimeStamp::zero(), 1, diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 7f748c352f7..3dc1a37697e 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -162,12 +162,12 @@ impl From for TypedCommand { req.take_context(), ) } else { - let is_pessimistic_lock = req.take_is_pessimistic_lock(); + let pessimistic_actions = req.take_pessimistic_actions(); let mutations = req .take_mutations() .into_iter() .map(Into::into) - .zip(is_pessimistic_lock.into_iter()) + .zip(pessimistic_actions) .collect(); PrewritePessimistic::new( mutations, @@ -803,7 +803,7 @@ pub mod test_util { pub fn pessimistic_prewrite( engine: &E, statistics: &mut Statistics, - mutations: Vec<(Mutation, bool)>, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, primary: Vec, start_ts: u64, for_update_ts: u64, @@ -826,7 +826,7 @@ pub mod test_util { engine: &E, cm: ConcurrencyManager, statistics: &mut Statistics, - mutations: Vec<(Mutation, bool)>, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, primary: Vec, start_ts: u64, for_update_ts: u64, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index a6aa8af6f87..deca5733eb0 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -9,7 +9,10 @@ use std::mem; use engine_traits::CF_WRITE; -use kvproto::kvrpcpb::{AssertionLevel, ExtraOp}; +use kvproto::kvrpcpb::{ + AssertionLevel, 
ExtraOp, + PrewriteRequestPessimisticAction::{self, *}, +}; use tikv_kv::SnapshotExt; use txn_types::{Key, Mutation, OldValue, OldValues, TimeStamp, TxnExtra, Write, WriteType}; @@ -254,7 +257,7 @@ command! { cmd_ty => PrewriteResult, content => { /// The set of mutations to apply; the bool = is pessimistic lock. - mutations: Vec<(Mutation, bool)>, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, /// The primary lock. Secondary locks (from `mutations`) will refer to the primary lock. primary: Vec, /// The transaction timestamp. @@ -308,7 +311,7 @@ impl std::fmt::Debug for PrewritePessimistic { impl PrewritePessimistic { #[cfg(test)] pub fn with_defaults( - mutations: Vec<(Mutation, bool)>, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, primary: Vec, start_ts: TimeStamp, for_update_ts: TimeStamp, @@ -331,7 +334,7 @@ impl PrewritePessimistic { #[cfg(test)] pub fn with_1pc( - mutations: Vec<(Mutation, bool)>, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, primary: Vec, start_ts: TimeStamp, for_update_ts: TimeStamp, @@ -549,7 +552,7 @@ impl Prewriter { let mut assertion_failure = None; for m in mem::take(&mut self.mutations) { - let is_pessimistic_lock = m.is_pessimistic_lock(); + let pessimistic_action = m.pessimistic_action(); let m = m.into_mutation(); let key = m.key().clone(); let mutation_type = m.mutation_type(); @@ -560,8 +563,7 @@ impl Prewriter { } let need_min_commit_ts = secondaries.is_some() || self.try_one_pc; - let prewrite_result = - prewrite(txn, reader, &props, m, secondaries, is_pessimistic_lock); + let prewrite_result = prewrite(txn, reader, &props, m, secondaries, pessimistic_action); match prewrite_result { Ok((ts, old_value)) if !(need_min_commit_ts && ts.is_zero()) => { if need_min_commit_ts && final_min_commit_ts < ts { @@ -781,7 +783,7 @@ struct Pessimistic { } impl PrewriteKind for Pessimistic { - type Mutation = (Mutation, bool); + type Mutation = (Mutation, PrewriteRequestPessimisticAction); 
fn txn_kind(&self) -> TransactionKind { TransactionKind::Pessimistic(self.for_update_ts) @@ -791,16 +793,17 @@ impl PrewriteKind for Pessimistic { /// The type of mutation and, optionally, its extra information, differing for /// the optimistic and pessimistic transaction. /// For optimistic txns, this is `Mutation`. -/// For pessimistic txns, this is `(Mutation, bool)`, where the bool indicates -/// whether the mutation takes a pessimistic lock or not. +/// For pessimistic txns, this is `(Mutation, PessimisticAction)`, where the +/// action indicates what kind of operations(checks) need to be performed. +/// The action also implies the type of the lock status. trait MutationLock { - fn is_pessimistic_lock(&self) -> bool; + fn pessimistic_action(&self) -> PrewriteRequestPessimisticAction; fn into_mutation(self) -> Mutation; } impl MutationLock for Mutation { - fn is_pessimistic_lock(&self) -> bool { - false + fn pessimistic_action(&self) -> PrewriteRequestPessimisticAction { + SkipPessimisticCheck } fn into_mutation(self) -> Mutation { @@ -808,8 +811,8 @@ impl MutationLock for Mutation { } } -impl MutationLock for (Mutation, bool) { - fn is_pessimistic_lock(&self) -> bool { +impl MutationLock for (Mutation, PrewriteRequestPessimisticAction) { + fn pessimistic_action(&self) -> PrewriteRequestPessimisticAction { self.1 } @@ -1185,7 +1188,10 @@ mod tests { must_acquire_pessimistic_lock(&engine, key, key, 10, 10); - let mutations = vec![(Mutation::make_put(Key::from_raw(key), value.to_vec()), true)]; + let mutations = vec![( + Mutation::make_put(Key::from_raw(key), value.to_vec()), + DoPessimisticCheck, + )]; let mut statistics = Statistics::default(); pessimistic_prewrite_with_cm( &engine, @@ -1209,8 +1215,14 @@ mod tests { must_acquire_pessimistic_lock(&engine, k1, k1, 8, 12); let mutations = vec![ - (Mutation::make_put(Key::from_raw(k1), v1.to_vec()), true), - (Mutation::make_put(Key::from_raw(k2), v2.to_vec()), false), + ( + Mutation::make_put(Key::from_raw(k1), 
v1.to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(k2), v2.to_vec()), + SkipPessimisticCheck, + ), ]; statistics = Statistics::default(); pessimistic_prewrite_with_cm( @@ -1235,7 +1247,10 @@ mod tests { cm.update_max_ts(50.into()); must_acquire_pessimistic_lock(&engine, k1, k1, 20, 20); - let mutations = vec![(Mutation::make_put(Key::from_raw(k1), v1.to_vec()), true)]; + let mutations = vec![( + Mutation::make_put(Key::from_raw(k1), v1.to_vec()), + DoPessimisticCheck, + )]; statistics = Statistics::default(); let res = pessimistic_prewrite_with_cm( &engine, @@ -1272,8 +1287,14 @@ mod tests { .unwrap(); // Try 1PC on the two keys and it will fail on the second one. let mutations = vec![ - (Mutation::make_put(Key::from_raw(k1), v1.to_vec()), true), - (Mutation::make_put(Key::from_raw(k2), v2.to_vec()), false), + ( + Mutation::make_put(Key::from_raw(k1), v1.to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(k2), v2.to_vec()), + SkipPessimisticCheck, + ), ]; must_acquire_pessimistic_lock(&engine, k1, k1, 60, 60); pessimistic_prewrite_with_cm( @@ -1369,7 +1390,10 @@ mod tests { must_acquire_pessimistic_lock(&engine, key, key, 10, 10); - let mutations = vec![(Mutation::make_put(Key::from_raw(key), value.to_vec()), true)]; + let mutations = vec![( + Mutation::make_put(Key::from_raw(key), value.to_vec()), + DoPessimisticCheck, + )]; let mut statistics = Statistics::default(); let cmd = super::PrewritePessimistic::new( mutations, @@ -1400,8 +1424,14 @@ mod tests { must_acquire_pessimistic_lock(&engine, k2, k1, 20, 20); let mutations = vec![ - (Mutation::make_put(Key::from_raw(k1), v1.to_vec()), true), - (Mutation::make_put(Key::from_raw(k2), v2.to_vec()), true), + ( + Mutation::make_put(Key::from_raw(k1), v1.to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(k2), v2.to_vec()), + DoPessimisticCheck, + ), ]; let mut statistics = Statistics::default(); // calculated_ts > max_commit_ts @@ -1605,7 
+1635,10 @@ mod tests { }; let cmd = if case.pessimistic { PrewritePessimistic::new( - mutations.iter().map(|it| (it.clone(), false)).collect(), + mutations + .iter() + .map(|it| (it.clone(), SkipPessimisticCheck)) + .collect(), keys[0].to_vec(), start_ts, 0, @@ -1813,7 +1846,7 @@ mod tests { &Some(vec![]), 5, 5, - true, + DoPessimisticCheck, 10, ); must_commit(&engine, key, 5, 10); @@ -1821,7 +1854,10 @@ mod tests { // T2: start_ts = 15, commit_ts = 16, 1PC must_acquire_pessimistic_lock(&engine, key, key, 15, 15); let cmd = PrewritePessimistic::with_1pc( - vec![(Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), true)], + vec![( + Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), + DoPessimisticCheck, + )], key.to_vec(), 15.into(), 15.into(), @@ -1836,7 +1872,10 @@ mod tests { // Repeating the T1 prewrite request let cmd = PrewritePessimistic::new( - vec![(Mutation::make_put(Key::from_raw(key), b"v1".to_vec()), true)], + vec![( + Mutation::make_put(Key::from_raw(key), b"v1".to_vec()), + DoPessimisticCheck, + )], key.to_vec(), 5.into(), 200, @@ -1871,7 +1910,10 @@ mod tests { // Repeating the T2 prewrite request let cmd = PrewritePessimistic::with_1pc( - vec![(Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), true)], + vec![( + Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), + DoPessimisticCheck, + )], key.to_vec(), 15.into(), 15.into(), @@ -1909,11 +1951,11 @@ mod tests { let mutations = vec![ ( Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), - false, + SkipPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), - true, + DoPessimisticCheck, ), ]; let res = pessimistic_prewrite_with_cm( @@ -1960,13 +2002,13 @@ mod tests { pk: &[u8], secondary_keys, ts: u64, - is_pessimistic_lock, + pessimistic_action, is_retry_request| { let mutation = Mutation::make_put(Key::from_raw(key), value.to_vec()); let mut ctx = Context::default(); ctx.set_is_retry_request(is_retry_request); let cmd = PrewritePessimistic::new( - 
vec![(mutation, is_pessimistic_lock)], + vec![(mutation, pessimistic_action)], pk.to_vec(), ts.into(), 100, @@ -1991,7 +2033,7 @@ mod tests { &Some(vec![b"k2".to_vec()]), 10, 10, - true, + DoPessimisticCheck, 15, ); must_pessimistic_prewrite_put_async_commit( @@ -2002,7 +2044,7 @@ mod tests { &Some(vec![]), 10, 10, - false, + SkipPessimisticCheck, 15, ); @@ -2011,7 +2053,16 @@ mod tests { must_commit(&engine, b"k2", 10, 20); // This is a re-sent prewrite. - prewrite_with_retry_flag(b"k2", b"v2", b"k1", Some(vec![]), 10, false, true).unwrap(); + prewrite_with_retry_flag( + b"k2", + b"v2", + b"k1", + Some(vec![]), + 10, + SkipPessimisticCheck, + true, + ) + .unwrap(); // Commit repeatedly, these operations should have no effect. must_commit(&engine, b"k1", 10, 25); must_commit(&engine, b"k2", 10, 25); @@ -2029,16 +2080,28 @@ mod tests { // A retrying non-pessimistic-lock prewrite request should not skip constraint // checks. Here it should take no effect, even there's already a newer version // after it. (No matter if it's async commit). - prewrite_with_retry_flag(b"k2", b"v2", b"k1", Some(vec![]), 10, false, true).unwrap(); + prewrite_with_retry_flag( + b"k2", + b"v2", + b"k1", + Some(vec![]), + 10, + SkipPessimisticCheck, + true, + ) + .unwrap(); must_unlocked(&engine, b"k2"); - prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, false, true).unwrap(); + prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, SkipPessimisticCheck, true) + .unwrap(); must_unlocked(&engine, b"k2"); // Committing still does nothing. must_commit(&engine, b"k2", 10, 25); // Try a different txn start ts (which haven't been successfully committed // before). It should report a PessimisticLockNotFound. 
- let err = prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 11, false, true).unwrap_err(); + let err = + prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 11, SkipPessimisticCheck, true) + .unwrap_err(); assert!(matches!( err, Error(box ErrorInner::Mvcc(MvccError( @@ -2048,7 +2111,8 @@ mod tests { must_unlocked(&engine, b"k2"); // However conflict still won't be checked if there's a non-retry request // arriving. - prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, false, false).unwrap(); + prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, SkipPessimisticCheck, false) + .unwrap(); must_locked(&engine, b"k2", 10); } @@ -2096,7 +2160,10 @@ mod tests { must_rollback(&engine, k1, 10, true); must_acquire_pessimistic_lock(&engine, k1, v1, 15, 15); let prewrite_cmd = PrewritePessimistic::with_defaults( - vec![(Mutation::make_put(Key::from_raw(k1), v1.to_vec()), true)], + vec![( + Mutation::make_put(Key::from_raw(k1), v1.to_vec()), + DoPessimisticCheck, + )], k1.to_vec(), 10.into(), 10.into(), @@ -2149,7 +2216,7 @@ mod tests { b"row", &None, t2_start_ts, - true, + DoPessimisticCheck, 1000, t2_start_ts, 1, @@ -2166,7 +2233,7 @@ mod tests { b"row", &None, t2_start_ts, - false, + SkipPessimisticCheck, 1000, t2_start_ts, 1, @@ -2188,11 +2255,11 @@ mod tests { vec![ ( Mutation::make_put(Key::from_raw(b"row"), b"value".to_vec()), - true, + DoPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"index"), b"value".to_vec()), - false, + SkipPessimisticCheck, ), ], b"row".to_vec(), @@ -2211,11 +2278,11 @@ mod tests { vec![ ( Mutation::make_put(Key::from_raw(b"index"), b"value".to_vec()), - false, + SkipPessimisticCheck, ), ( Mutation::make_put(Key::from_raw(b"row"), b"value".to_vec()), - true, + DoPessimisticCheck, ), ], b"row".to_vec(), @@ -2240,7 +2307,7 @@ mod tests { &None, t1_start_ts, t1_start_ts, - true, + DoPessimisticCheck, 0, false, Assertion::NotExist, @@ -2258,7 +2325,7 @@ mod tests { &None, t1_start_ts, t1_start_ts, - false, + SkipPessimisticCheck, 
0, false, Assertion::NotExist, @@ -2335,7 +2402,7 @@ mod tests { &Some(vec![b"k2".to_vec()]), 5, 10, - true, + DoPessimisticCheck, 15, ); must_prewrite_put_impl( @@ -2345,7 +2412,7 @@ mod tests { b"k1", &Some(vec![]), 5.into(), - false, + SkipPessimisticCheck, 100, 10.into(), 1, @@ -2365,7 +2432,7 @@ mod tests { // (is_retry_request flag is not set, here we don't rely on it.) let mutation = Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()); let cmd = PrewritePessimistic::new( - vec![(mutation, false)], + vec![(mutation, SkipPessimisticCheck)], b"k1".to_vec(), 5.into(), 100, diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index ad22e966590..7e93e77dee6 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -75,6 +75,8 @@ impl WriteCommand for Rollback { #[cfg(test)] mod tests { + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; + use crate::storage::{txn::tests::*, TestEngineBuilder}; #[test] @@ -87,7 +89,7 @@ mod tests { must_rollback(&engine, k1, 10, false); must_rollback(&engine, k2, 10, false); - must_pessimistic_prewrite_put(&engine, k2, v, k1, 10, 10, false); + must_pessimistic_prewrite_put(&engine, k2, v, k1, 10, 10, SkipPessimisticCheck); must_rollback(&engine, k2, 10, false); } } diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 0cd6c5b173b..2af968c21be 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -636,7 +636,7 @@ mod tests { use concurrency_manager::ConcurrencyManager; use engine_traits::{CfName, IterOptions, ReadOptions}; - use kvproto::kvrpcpb::{AssertionLevel, Context}; + use kvproto::kvrpcpb::{AssertionLevel, Context, PrewriteRequestPessimisticAction::*}; use tikv_kv::DummySnapshotExt; use super::*; @@ -708,7 +708,7 @@ mod tests { }, Mutation::make_put(Key::from_raw(key), key.to_vec()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); } diff --git a/tests/benches/hierarchy/mvcc/mod.rs 
b/tests/benches/hierarchy/mvcc/mod.rs index e982465c621..f88533171c3 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -2,7 +2,7 @@ use concurrency_manager::ConcurrencyManager; use criterion::{black_box, BatchSize, Bencher, Criterion}; -use kvproto::kvrpcpb::{AssertionLevel, Context}; +use kvproto::kvrpcpb::{AssertionLevel, Context, PrewriteRequestPessimisticAction::*}; use test_util::KvGenerator; use tikv::storage::{ kv::{Engine, WriteData}, @@ -54,7 +54,7 @@ where &txn_props, Mutation::make_put(Key::from_raw(k), v.clone()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); } @@ -98,7 +98,15 @@ fn mvcc_prewrite>(b: &mut Bencher<'_>, config: &B is_retry_request: false, assertion_level: AssertionLevel::Off, }; - prewrite(&mut txn, &mut reader, &txn_props, mutation, &None, false).unwrap(); + prewrite( + &mut txn, + &mut reader, + &txn_props, + mutation, + &None, + SkipPessimisticCheck, + ) + .unwrap(); } }, BatchSize::SmallInput, diff --git a/tests/benches/hierarchy/txn/mod.rs b/tests/benches/hierarchy/txn/mod.rs index 723d0eb3745..840d4ac81fa 100644 --- a/tests/benches/hierarchy/txn/mod.rs +++ b/tests/benches/hierarchy/txn/mod.rs @@ -2,7 +2,7 @@ use concurrency_manager::ConcurrencyManager; use criterion::{black_box, BatchSize, Bencher, Criterion}; -use kvproto::kvrpcpb::{AssertionLevel, Context}; +use kvproto::kvrpcpb::{AssertionLevel, Context, PrewriteRequestPessimisticAction::*}; use test_util::KvGenerator; use tikv::storage::{ kv::{Engine, WriteData}, @@ -50,7 +50,7 @@ where &txn_props, Mutation::make_put(Key::from_raw(k), v.clone()), &None, - false, + SkipPessimisticCheck, ) .unwrap(); } @@ -91,7 +91,15 @@ fn txn_prewrite>(b: &mut Bencher<'_>, config: &Be is_retry_request: false, assertion_level: AssertionLevel::Off, }; - prewrite(&mut txn, &mut reader, &txn_props, mutation, &None, false).unwrap(); + prewrite( + &mut txn, + &mut reader, + &txn_props, + mutation, + &None, + SkipPessimisticCheck, + ) + .unwrap(); let 
write_data = WriteData::from_modifies(txn.into_modifies()); black_box(engine.write(&ctx, write_data)).unwrap(); } diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 32bd2f05228..c602fc6e4f7 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -12,7 +12,7 @@ use std::{ use engine_traits::{Peekable, CF_RAFT}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ - kvrpcpb::*, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, tikvpb::TikvClient, }; @@ -1450,7 +1450,7 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { let mut req = PrewriteRequest::default(); req.set_context(cluster.get_ctx(b"k0")); req.set_mutations(vec![mutation].into()); - req.set_is_pessimistic_lock(vec![true]); + req.set_pessimistic_actions(vec![DoPessimisticCheck]); req.set_start_version(10); req.set_for_update_ts(40); req.set_primary_lock(b"k0".to_vec()); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index bf23267a06a..9ed57b94091 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -13,7 +13,9 @@ use collections::HashMap; use engine_traits::CF_WRITE; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ - kvrpcpb::{Mutation, Op, PessimisticLockRequest, PrewriteRequest}, + kvrpcpb::{ + Mutation, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, + }, metapb::Region, raft_serverpb::RaftMessage, tikvpb::TikvClient, @@ -966,7 +968,7 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { let mut req = PrewriteRequest::default(); req.set_context(cluster.get_ctx(b"a")); req.set_mutations(vec![mutation].into()); - req.set_is_pessimistic_lock(vec![true]); + req.set_pessimistic_actions(vec![DoPessimisticCheck]); req.set_start_version(10); req.set_for_update_ts(commit_ts + 20); 
req.set_primary_lock(b"a".to_vec()); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 40ba7297b7c..7b92cc7065e 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -19,7 +19,7 @@ use grpcio::*; use kvproto::{ kvrpcpb::{ self, AssertionLevel, BatchRollbackRequest, CommandPri, CommitRequest, Context, GetRequest, - Op, PrewriteRequest, RawPutRequest, + Op, PrewriteRequest, PrewriteRequestPessimisticAction::*, RawPutRequest, }, tikvpb::TikvClient, }; @@ -398,7 +398,10 @@ fn test_pipelined_pessimistic_lock() { storage .sched_txn_command( commands::PrewritePessimistic::new( - vec![(Mutation::make_put(key.clone(), val.clone()), true)], + vec![( + Mutation::make_put(key.clone(), val.clone()), + DoPessimisticCheck, + )], key.to_raw().unwrap(), 10.into(), 3000, @@ -571,7 +574,7 @@ fn test_async_commit_prewrite_with_stale_max_ts() { commands::PrewritePessimistic::new( vec![( Mutation::make_put(Key::from_raw(b"k1"), b"v".to_vec()), - true, + DoPessimisticCheck, )], b"k1".to_vec(), 10.into(), @@ -705,7 +708,11 @@ fn test_async_apply_prewrite_impl( commands::PrewritePessimistic::new( vec![( Mutation::make_put(Key::from_raw(key), value.to_vec()), - need_lock, + if need_lock { + DoPessimisticCheck + } else { + SkipPessimisticCheck + }, )], key.to_vec(), start_ts, @@ -1036,7 +1043,10 @@ fn test_async_apply_prewrite_1pc_impl( storage .sched_txn_command( commands::PrewritePessimistic::new( - vec![(Mutation::make_put(Key::from_raw(key), value.to_vec()), true)], + vec![( + Mutation::make_put(Key::from_raw(key), value.to_vec()), + DoPessimisticCheck, + )], key.to_vec(), start_ts, 0, diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index de19d1a790c..cd5bec990c8 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -12,7 +12,10 @@ use std::{ use futures::executor::block_on; use 
grpcio::{ChannelBuilder, Environment}; use kvproto::{ - kvrpcpb::{self as pb, AssertionLevel, Context, Op, PessimisticLockRequest, PrewriteRequest}, + kvrpcpb::{ + self as pb, AssertionLevel, Context, Op, PessimisticLockRequest, PrewriteRequest, + PrewriteRequestPessimisticAction::*, + }, tikvpb::TikvClient, }; use raftstore::store::{util::new_peer, LocksStatus}; @@ -53,10 +56,10 @@ fn test_txn_failpoints() { let (k2, v2) = (b"k2", b"v2"); must_acquire_pessimistic_lock(&engine, k, k, 30, 30); fail::cfg("pessimistic_prewrite", "return()").unwrap(); - must_pessimistic_prewrite_put_err(&engine, k, v1, k, 30, 30, true); + must_pessimistic_prewrite_put_err(&engine, k, v1, k, 30, 30, DoPessimisticCheck); must_prewrite_put(&engine, k2, v2, k2, 31); fail::remove("pessimistic_prewrite"); - must_pessimistic_prewrite_put(&engine, k, v1, k, 30, 30, true); + must_pessimistic_prewrite_put(&engine, k, v1, k, 30, 30, DoPessimisticCheck); must_commit(&engine, k, 30, 40); must_commit(&engine, k2, 31, 41); must_get(&engine, k, 50, v1); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 8095ebdf2ca..9a946a806bc 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -19,7 +19,7 @@ use grpcio_health::{proto::HealthCheckRequest, *}; use kvproto::{ coprocessor::*, debugpb, - kvrpcpb::{self, *}, + kvrpcpb::{self, PrewriteRequestPessimisticAction::*, *}, metapb, raft_serverpb, raft_serverpb::*, tikvpb::*, @@ -2073,7 +2073,7 @@ fn test_commands_write_detail() { mutation.set_op(Op::Put); mutation.set_value(v); prewrite_req.set_mutations(vec![mutation].into()); - prewrite_req.set_is_pessimistic_lock(vec![true]); + prewrite_req.set_pessimistic_actions(vec![DoPessimisticCheck]); prewrite_req.set_context(ctx.clone()); prewrite_req.set_primary_lock(k.clone()); prewrite_req.set_start_version(20); From 0030aeb90a840140a935fbb0181b6a62b5e680b3 Mon Sep 17 00:00:00 2001 From: BornChanger 
<97348524+BornChanger@users.noreply.github.com> Date: Thu, 25 Aug 2022 22:50:21 +0800 Subject: [PATCH 172/676] *: support read quota limit for analyze (#13302) ref tikv/tikv#13257, close tikv/tikv#13301 Signed-off-by: BornChanger Co-authored-by: Ti Chi Robot --- src/coprocessor/statistics/analyze.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index e11558e73b3..8f7b8c57dde 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -392,6 +392,7 @@ impl RowSampleBuilder { } let mut sample = self.quota_limiter.new_sample(!self.is_auto_analyze); + let mut read_size: usize = 0; { let _guard = sample.observe_cpu(); let result = self.data.next_batch(BATCH_MAX_SIZE); @@ -431,6 +432,7 @@ impl RowSampleBuilder { } else { collation_key_vals.push(Vec::new()); } + read_size += val.len(); column_vals.push(val); } collector.mut_base().count += 1; @@ -444,6 +446,7 @@ impl RowSampleBuilder { } } + sample.add_read_bytes(read_size); // Don't let analyze bandwidth limit the quota limiter, this is already limited // in rate limiter. let quota_delay = { From 7415946640f817245ccedbd95397991ec0650877 Mon Sep 17 00:00:00 2001 From: Lintian Shi Date: Mon, 29 Aug 2022 13:18:23 +0800 Subject: [PATCH 173/676] raftstore: fix checking for snapshot last index (#13088) close tikv/tikv#12618 using commit instead of last_index to check gap between existing raft logs and snapshot when recovering from applying state. 
Signed-off-by: LintianShi Signed-off-by: Lintian Shi Co-authored-by: LintianShi Co-authored-by: Ti Chi Robot Co-authored-by: Jay --- .../raftstore/src/store/peer_storage.rs | 21 ++-- tests/failpoints/cases/test_snap.rs | 104 ++++++++++++++++++ 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 4a36f385648..cf70234c841 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -33,10 +33,7 @@ use tikv_util::{ box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, }; -use super::{ - entry_storage::last_index, metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, - SnapshotStatistics, -}; +use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, SnapshotStatistics}; use crate::{ store::{ async_io::write::WriteTask, entry_storage::EntryStorage, fsm::GenSnapTask, @@ -147,14 +144,14 @@ pub fn recover_from_applying_state( let raft_state = box_try!(engines.raft.get_raft_state(region_id)).unwrap_or_default(); - // if we recv append log when applying snapshot, last_index in raft_local_state - // will larger than snapshot_index. since raft_local_state is written to - // raft engine, and raft write_batch is written after kv write_batch, - // raft_local_state may wrong if restart happen between the two write. so we - // copy raft_local_state to kv engine (snapshot_raft_state), and set - // snapshot_raft_state.last_index = snapshot_index. after restart, we need - // check last_index. - if last_index(&snapshot_raft_state) > last_index(&raft_state) { + // since raft_local_state is written to raft engine, and + // raft write_batch is written after kv write_batch. raft_local_state may wrong + // if restart happen between the two write. 
so we copy raft_local_state to + // kv engine (snapshot_raft_state), and set + // snapshot_raft_state.hard_state.commit = snapshot_index. after restart, we + // need check commit. + if snapshot_raft_state.get_hard_state().get_commit() > raft_state.get_hard_state().get_commit() + { // There is a gap between existing raft logs and snapshot. Clean them up. engines .raft diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 3507fc268d4..93acfffc258 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -793,3 +793,107 @@ fn test_snapshot_recover_from_raft_write_failure() { cluster.must_put(format!("k1{}", i).as_bytes(), b"v1"); } } + +/// Test whether applying snapshot is resumed properly when last_index before +/// applying snapshot is larger than the snapshot index and applying is aborted +/// between kv write and raft write. +#[test] +fn test_snapshot_recover_from_raft_write_failure_with_uncommitted_log() { + let mut cluster = new_server_cluster(0, 3); + configure_for_snapshot(&mut cluster); + // Avoid triggering snapshot at final step. + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(10); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + // We use three peers([1, 2, 3]) for this test. + cluster.run(); + + sleep_ms(500); + + // Guarantee peer 1 is leader. + cluster.must_transfer_leader(1, new_peer(1, 1)); + + cluster.must_put(b"k1", b"v1"); + for i in 1..4 { + must_get_equal(&cluster.get_engine(i), b"k1", b"v1"); + } + + // Guarantee that peer 2 and 3 won't receive any entries, + // so these entries cannot be committed. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 1) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Send), + )); + + // Peer 1 appends entries which is never committed. 
+ for i in 1..20 { + let region = cluster.get_region(b""); + let reqs = vec![new_put_cmd(format!("k2{}", i).as_bytes(), b"v2")]; + let mut put = new_request( + region.get_id(), + region.get_region_epoch().clone(), + reqs, + false, + ); + put.mut_header().set_peer(new_peer(1, 1)); + let _ = cluster.call_command_on_node(1, put, Duration::from_secs(1)); + } + + for i in 1..4 { + must_get_none(&cluster.get_engine(i), b"k210"); + } + // Now peer 1 should have much longer log than peer 2 and 3. + + // Hack: down peer 1 in order to change leader to peer 3. + cluster.stop_node(1); + sleep_ms(100); + cluster.clear_send_filters(); + sleep_ms(100); + cluster.must_transfer_leader(1, new_peer(3, 3)); + + for i in 0..20 { + cluster.must_put(format!("k3{}", i).as_bytes(), b"v3"); + } + + // Peer 1 back to cluster + cluster.add_send_filter(IsolationFilterFactory::new(1)); + sleep_ms(100); + cluster.run_node(1).unwrap(); + sleep_ms(100); + must_get_none(&cluster.get_engine(1), b"k319"); + must_get_equal(&cluster.get_engine(2), b"k319", b"v3"); + must_get_equal(&cluster.get_engine(3), b"k319", b"v3"); + + // Raft writes are dropped. + let raft_before_save_on_store_1_fp = "raft_before_save_on_store_1"; + fail::cfg(raft_before_save_on_store_1_fp, "return").unwrap(); + // Skip applying snapshot into RocksDB to keep peer status in Applying. + let apply_snapshot_fp = "apply_pending_snapshot"; + fail::cfg(apply_snapshot_fp, "return()").unwrap(); + cluster.clear_send_filters(); + // Wait for leader send snapshot. + sleep_ms(100); + + cluster.stop_node(1); + fail::remove(raft_before_save_on_store_1_fp); + fail::remove(apply_snapshot_fp); + // Recover from applying state and validate states, + // may fail in this step due to invalid states. + cluster.run_node(1).unwrap(); + // Snapshot is applied. + must_get_equal(&cluster.get_engine(1), b"k319", b"v3"); + let mut ents = Vec::new(); + cluster + .get_raft_engine(1) + .get_all_entries_to(1, &mut ents) + .unwrap(); + // Raft logs are cleared. 
+ assert!(ents.is_empty()); + + // Final step: append some more entries to make sure raftdb is healthy. + for i in 20..25 { + cluster.must_put(format!("k1{}", i).as_bytes(), b"v1"); + } +} From 68f99ae034376f5629d80fa7712796a61dc5d50e Mon Sep 17 00:00:00 2001 From: Potato Date: Mon, 29 Aug 2022 19:28:23 +0800 Subject: [PATCH 174/676] storage: record and return asycn snapshot metric (#13358) ref pingcap/kvproto#974, ref pingcap/kvproto#978, ref tikv/tikv#12362 This commit records read_index_propose_wait_nanos, read_index_confirm_wait_nanos and read_pool_schedule_wait_nanos Signed-off-by: OneSizeFitQuorum --- Cargo.lock | 3 +- Cargo.toml | 2 +- components/backup-stream/src/event_loader.rs | 2 +- components/cdc/src/endpoint.rs | 2 +- components/cdc/src/initializer.rs | 2 +- components/raftstore-v2/Cargo.toml | 1 + .../src/router/response_channel.rs | 9 +- components/raftstore/src/store/fsm/apply.rs | 16 +-- components/raftstore/src/store/fsm/peer.rs | 31 +++-- components/raftstore/src/store/msg.rs | 51 +++++--- components/raftstore/src/store/peer.rs | 29 +++-- components/raftstore/src/store/worker/read.rs | 22 ++-- components/resolved_ts/src/scanner.rs | 2 +- components/test_raftstore/src/util.rs | 22 +++- components/tracker/src/lib.rs | 6 + src/coprocessor/endpoint.rs | 13 +- src/coprocessor/tracker.rs | 18 ++- src/import/sst_service.rs | 2 +- src/server/raftkv.rs | 2 +- src/server/service/debug.rs | 4 +- src/server/service/kv.rs | 2 +- src/storage/mod.rs | 8 ++ tests/benches/misc/raftkv/mod.rs | 4 +- tests/failpoints/cases/mod.rs | 1 + .../cases/test_cmd_epoch_checker.rs | 4 +- .../cases/test_read_execution_tracker.rs | 121 ++++++++++++++++++ tests/failpoints/cases/test_stale_read.rs | 2 +- .../integrations/raftstore/test_lease_read.rs | 2 +- tests/integrations/server/kv_service.rs | 2 +- 29 files changed, 293 insertions(+), 92 deletions(-) create mode 100644 tests/failpoints/cases/test_read_execution_tracker.rs diff --git a/Cargo.lock b/Cargo.lock index 
52ad7912203..a5c71cef10d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2627,7 +2627,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a0f02b6efcee6112bdc313988bf6c0ae3f83c07d" +source = "git+https://github.com/pingcap/kvproto.git#9cc5e1ddfda3aec6eddfc09de1d0072ebbd7bb21" dependencies = [ "futures 0.3.15", "grpcio", @@ -4213,6 +4213,7 @@ dependencies = [ "test_util", "tikv_util", "time", + "tracker", "txn_types", ] diff --git a/Cargo.toml b/Cargo.toml index 9bbea00262c..e1dad6c5fa3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -207,7 +207,7 @@ procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229 # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. # [patch.'https://github.com/pingcap/kvproto'] -# kvproto = {git = "https://github.com/your_github_id/kvproto", branch="your_branch"} +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } [workspace] # See https://github.com/rust-lang/rfcs/blob/master/text/2957-cargo-features2.md diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 5aade374249..0f83d4726e4 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -295,7 +295,7 @@ where SignificantMsg::CaptureChange { cmd, region_epoch: region.get_region_epoch().clone(), - callback: Callback::Read(Box::new(|snapshot| { + callback: Callback::read(Box::new(|snapshot| { if snapshot.response.get_header().has_error() { callback(Err(Error::RaftRequest( snapshot.response.get_header().get_error().clone(), diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index d9938006ca1..2e0253b23a9 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1242,7 +1242,7 @@ impl, E: KvEngine> Endpoint { let (tx, rx) = 
tokio::sync::oneshot::channel(); if let Err(e) = raft_router_clone.significant_send( region_id, - SignificantMsg::LeaderCallback(Callback::Read(Box::new(move |resp| { + SignificantMsg::LeaderCallback(Callback::read(Box::new(move |resp| { let resp = if resp.response.get_header().has_error() { None } else { diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index f6a2ce2885c..418e0c23a0a 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -144,7 +144,7 @@ impl Initializer { SignificantMsg::CaptureChange { cmd: change_cmd, region_epoch, - callback: Callback::Read(Box::new(move |resp| { + callback: Callback::read(Box::new(move |resp| { if let Err(e) = sched.schedule(Task::InitDownstream { region_id, downstream_id, diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index f526aeda9c4..c7d920e4011 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -48,6 +48,7 @@ slog = "2.3" smallvec = "1.4" tikv_util = { path = "../tikv_util", default-features = false } time = "0.1" +tracker = { path = "../tracker" } txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index ae43bd07c25..e87095215b8 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -33,6 +33,7 @@ use raftstore::store::{ }; use smallvec::SmallVec; use tikv_util::memory::HeapSize; +use tracker::TrackerToken; /// A struct allows to watch and notify specific events. 
/// @@ -282,11 +283,11 @@ impl WriteCallback for CmdResChannel { self.core.notify_event(Self::COMMITTED_EVENT); } - fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { + fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { None } - fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { + fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { None } @@ -393,6 +394,10 @@ impl ReadCallback for QueryResChannel { } mem::forget(self); } + + fn read_tracker(&self) -> Option<&TrackerToken> { + None + } } impl Drop for QueryResChannel { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index d44cca3668b..6d1d1881046 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -529,7 +529,7 @@ where .applied_batch .cb_batch .iter() - .flat_map(|(cb, _)| cb.trackers()) + .flat_map(|(cb, _)| cb.write_trackers()) .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) .flatten() .collect(); @@ -568,7 +568,7 @@ where // Invoke callbacks let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) 
{ - for tracker in cb.trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers().iter().flat_map(|v| *v) { tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); @@ -3000,7 +3000,7 @@ impl Apply { pub fn on_schedule(&mut self, metrics: &RaftMetrics) { let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Some(trackers) = cb.cb.trackers_mut() { + if let Some(trackers) = cb.cb.write_trackers_mut() { for tracker in trackers { tracker.observe(now, &metrics.store_time, |t| { t.metrics.write_instant = Some(now); @@ -3770,7 +3770,7 @@ where for tracker in apply .cbs .iter() - .flat_map(|p| p.cb.trackers()) + .flat_map(|p| p.cb.write_trackers()) .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) { GLOBAL_TRACKERS.with_tracker(tracker, |t| { @@ -5985,7 +5985,7 @@ mod tests { Msg::Change { region_epoch: region_epoch.clone(), cmd: ChangeObserver::from_cdc(1, observe_handle.clone()), - cb: Callback::Read(Box::new(|resp: ReadResponse| { + cb: Callback::read(Box::new(|resp: ReadResponse| { assert!(!resp.response.get_header().has_error()); assert!(resp.snapshot.is_some()); let snap = resp.snapshot.unwrap(); @@ -6054,7 +6054,7 @@ mod tests { Msg::Change { region_epoch, cmd: ChangeObserver::from_cdc(2, observe_handle), - cb: Callback::Read(Box::new(|resp: ReadResponse<_>| { + cb: Callback::read(Box::new(|resp: ReadResponse<_>| { assert!( resp.response .get_header() @@ -6226,7 +6226,7 @@ mod tests { Msg::Change { region_epoch: region_epoch.clone(), cmd: ChangeObserver::from_cdc(1, observe_handle.clone()), - cb: Callback::Read(Box::new(|resp: ReadResponse<_>| { + cb: Callback::read(Box::new(|resp: ReadResponse<_>| { assert!(!resp.response.get_header().has_error(), "{:?}", resp); assert!(resp.snapshot.is_some()); })), @@ -6381,7 +6381,7 @@ mod tests { Msg::Change { region_epoch, cmd: ChangeObserver::from_cdc(1, observe_handle), - cb: Callback::Read(Box::new(move |resp: ReadResponse<_>| { 
+ cb: Callback::read(Box::new(move |resp: ReadResponse<_>| { assert!( resp.response.get_header().get_error().has_epoch_not_match(), "{:?}", diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 9b354fb0842..5497d2ad1d9 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -52,11 +52,12 @@ use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, sys::{disk::DiskUsage, memory_usage_reaches_high_water}, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + time::{monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, }; +use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use self::memtrace::*; @@ -92,7 +93,7 @@ use crate::{ RegionTask, SplitCheckTask, }, AbstractPeer, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, - PeerTick, ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, + PeerTick, ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, @@ -517,7 +518,7 @@ where let tokens: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - .filter_map(|cb| cb.trackers().map(|t| t[0])) + .filter_map(|cb| cb.write_trackers().map(|t| t[0])) .collect(); let mut cb = Callback::write_ext( @@ -532,7 +533,7 @@ where committed_cb, ); - if let Some(trackers) = cb.trackers_mut() { + if let Some(trackers) = cb.write_trackers_mut() { *trackers = tokens; } @@ -608,10 +609,17 @@ where } } PeerMsg::RaftCommand(cmd) => { + let propose_time = cmd.send_time.saturating_elapsed(); self.ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(cmd.send_time.saturating_elapsed()) as f64); + .observe(propose_time.as_secs_f64()); + cmd.callback.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + 
t.metrics.read_index_propose_wait_nanos = + propose_time.as_nanos() as u64; + }) + }); if let Some(Err(e)) = cmd.extra_opts.deadline.map(|deadline| deadline.check()) { cmd.callback.invoke_with_response(new_error(e.into())); @@ -625,8 +633,8 @@ where // so that normal writes can be rejected when proposing if the // store's disk is full. && ((self.ctx.self_disk_usage == DiskUsage::Normal - && !self.fsm.peer.disk_full_peers.majority()) - || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) + && !self.fsm.peer.disk_full_peers.majority()) + || cmd.extra_opts.disk_full_opt == DiskFullOpt::NotAllowedOnFull) { self.fsm.batch_req_builder.add(cmd, req_size); if self.fsm.batch_req_builder.should_finish(&self.ctx.cfg) { @@ -1001,8 +1009,7 @@ where || util::is_epoch_stale( region.get_region_epoch(), self.fsm.peer.region().get_region_epoch(), - ) - { + ) { // Stale message return; } @@ -1205,7 +1212,7 @@ where let apply_router = self.ctx.apply_router.clone(); self.propose_raft_command_internal( msg, - Callback::Read(Box::new(move |resp| { + Callback::read(Box::new(move |resp| { // Return the error if resp.response.get_header().has_error() { cb.invoke_read(resp); @@ -2200,7 +2207,7 @@ where cmd.mut_header().set_read_quorum(true); self.propose_raft_command_internal( cmd, - Callback::Read(Box::new(|_| ())), + Callback::read(Box::new(|_| ())), DiskFullOpt::AllowedOnAlmostFull, ); } @@ -4831,7 +4838,7 @@ where if self.ctx.raft_metrics.waterfall_metrics { let now = Instant::now(); - for tracker in cb.trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers().iter().flat_map(|v| *v) { tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { &mut t.metrics.wf_batch_wait_nanos }); diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 619a18e3fb5..5b3221e8c19 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -22,7 +22,7 @@ use pd_client::BucketMeta; use 
raft::SnapshotStatus; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use tracker::{get_tls_tracker_token, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; use super::{local_metrics::TimeTracker, worker::FetchedLogs, AbstractPeer, RegionSnapshot}; use crate::store::{ @@ -89,7 +89,11 @@ pub enum Callback { /// No callback. None, /// Read callback. - Read(BoxReadCallback), + Read { + cb: BoxReadCallback, + + tracker: TrackerToken, + }, /// Write callback. Write { cb: BoxWriteCallback, @@ -116,6 +120,11 @@ impl Callback where S: Snapshot, { + pub fn read(cb: BoxReadCallback) -> Self { + let tracker = get_tls_tracker_token(); + Callback::Read { cb, tracker } + } + pub fn write(cb: BoxWriteCallback) -> Self { Self::write_ext(cb, None, None) } @@ -147,13 +156,13 @@ where pub fn invoke_with_response(self, resp: RaftCmdResponse) { match self { Callback::None => (), - Callback::Read(read) => { + Callback::Read { cb, .. } => { let resp = ReadResponse { response: resp, snapshot: None, txn_extra_op: TxnExtraOp::Noop, }; - read(resp); + cb(resp); } Callback::Write { cb, .. } => { let resp = WriteResponse { response: resp }; @@ -165,19 +174,19 @@ where } pub fn has_proposed_cb(&self) -> bool { - let Callback::Write { proposed_cb, .. } = self else { return false }; + let Callback::Write { proposed_cb, .. } = self else { return false; }; proposed_cb.is_some() } pub fn invoke_proposed(&mut self) { - let Callback::Write { proposed_cb, .. } = self else { return }; + let Callback::Write { proposed_cb, .. } = self else { return; }; if let Some(cb) = proposed_cb.take() { cb(); } } pub fn invoke_committed(&mut self) { - let Callback::Write { committed_cb, .. } = self else { return }; + let Callback::Write { committed_cb, .. 
} = self else { return; }; if let Some(cb) = committed_cb.take() { cb(); } @@ -185,18 +194,18 @@ where pub fn invoke_read(self, args: ReadResponse) { match self { - Callback::Read(read) => read(args), - other => panic!("expect Callback::Read(..), got {:?}", other), + Callback::Read { cb, .. } => cb(args), + other => panic!("expect Callback::read(..), got {:?}", other), } } pub fn take_proposed_cb(&mut self) -> Option { - let Callback::Write { proposed_cb, .. } = self else { return None }; + let Callback::Write { proposed_cb, .. } = self else { return None; }; proposed_cb.take() } pub fn take_committed_cb(&mut self) -> Option { - let Callback::Write { committed_cb, .. } = self else { return None }; + let Callback::Write { committed_cb, .. } = self else { return None; }; committed_cb.take() } } @@ -205,6 +214,7 @@ pub trait ReadCallback: ErrorCallback { type Response; fn set_result(self, result: Self::Response); + fn read_tracker(&self) -> Option<&TrackerToken>; } pub trait WriteCallback: ErrorCallback { @@ -212,8 +222,8 @@ pub trait WriteCallback: ErrorCallback { fn notify_proposed(&mut self); fn notify_committed(&mut self); - fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; - fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>>; + fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; + fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>>; fn set_result(self, result: Self::Response); } @@ -229,6 +239,11 @@ impl ReadCallback for Callback { fn set_result(self, result: Self::Response) { self.invoke_read(result); } + + fn read_tracker(&self) -> Option<&TrackerToken> { + let Callback::Read { tracker, .. } = self else { return None; }; + Some(tracker) + } } impl WriteCallback for Callback { @@ -245,14 +260,14 @@ impl WriteCallback for Callback { } #[inline] - fn trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. 
} = self else { return None }; + fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { + let Callback::Write { trackers, .. } = self else { return None; }; Some(trackers) } #[inline] - fn trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None }; + fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { + let Callback::Write { trackers, .. } = self else { return None; }; Some(trackers) } @@ -281,7 +296,7 @@ where fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Callback::None => write!(fmt, "Callback::None"), - Callback::Read(_) => write!(fmt, "Callback::Read(..)"), + Callback::Read { .. } => write!(fmt, "Callback::Read(..)"), Callback::Write { .. } => write!(fmt, "Callback::Write(..)"), #[cfg(any(test, feature = "testexport"))] Callback::Test { .. } => write!(fmt, "Callback::Test(..)"), diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 89ed6eeef7d..6b3ec4c3456 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -65,6 +65,7 @@ use tikv_util::{ Either, }; use time::Timespec; +use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use uuid::Uuid; @@ -100,14 +101,15 @@ use crate::{ HeartbeatTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadExecutor, ReadProgress, RegionTask, SplitCheckTask, }, - Callback, Config, GlobalReplicationState, PdTask, ReadIndexContext, ReadResponse, TxnExt, - WriteCallback, RAFT_INIT_LOG_INDEX, + Callback, Config, GlobalReplicationState, PdTask, ReadCallback, ReadIndexContext, + ReadResponse, TxnExt, WriteCallback, RAFT_INIT_LOG_INDEX, }, Error, Result, }; const SHRINK_CACHE_CAPACITY: usize = 64; -const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; // 1s +const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; +// 1s const REGION_READ_PROGRESS_CAP: usize = 128; #[doc(hidden)] pub const 
MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; @@ -143,7 +145,7 @@ impl ProposalQueue { .and_then(|i| { self.queue[i] .cb - .trackers() + .write_trackers() .map(|ts| (self.queue[i].term, ts)) }) } @@ -1652,7 +1654,7 @@ where { let proposal = &self.proposals.queue[idx]; if term == proposal.term { - for tracker in proposal.cb.trackers().iter().flat_map(|v| v.iter()) { + for tracker in proposal.cb.write_trackers().iter().flat_map(|v| v.iter()) { tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { &mut t.metrics.wf_send_proposal_nanos }); @@ -2504,7 +2506,7 @@ where // Update it after unstable entries pagination is introduced. debug_assert!(ready.entries().last().map_or_else( || true, - |entry| entry.index == self.raft_group.raft.raft_log.last_index() + |entry| entry.index == self.raft_group.raft.raft_log.last_index(), )); if self.memtrace_raft_entries != 0 { MEMTRACE_RAFT_ENTRIES.trace(TraceEvent::Sub(self.memtrace_raft_entries)); @@ -3071,7 +3073,14 @@ where "peer_id" => self.peer.get_id(), ); RAFT_READ_INDEX_PENDING_COUNT.sub(read.cmds().len() as i64); + let time = monotonic_raw_now(); for (req, cb, mut read_index) in read.take_cmds().drain(..) 
{ + cb.read_tracker().map(|tracker| { + GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + t.metrics.read_index_confirm_wait_nanos = + (time - read.propose_time).to_std().unwrap().as_nanos() as u64; + }) + }); // leader reports key is locked if let Some(locked) = read.locked.take() { let mut response = raft_cmdpb::Response::default(); @@ -3588,9 +3597,9 @@ where if peer.get_id() == self.peer_id() && (change_type == ConfChangeType::RemoveNode - // In Joint confchange, the leader is allowed to be DemotingVoter - || (kind == ConfChangeKind::Simple - && change_type == ConfChangeType::AddLearnerNode)) + // In Joint confchange, the leader is allowed to be DemotingVoter + || (kind == ConfChangeKind::Simple + && change_type == ConfChangeType::AddLearnerNode)) && !ctx.cfg.allow_remove_leader() { return Err(box_err!( @@ -5415,6 +5424,8 @@ pub trait RequestInspector { return Ok(RequestPolicy::ProposeNormal); } + fail_point!("perform_read_index", |_| Ok(RequestPolicy::ReadIndex)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(RequestPolicy::StaleRead); diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 3c5c05f4717..a3c3878cf68 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -1016,7 +1016,7 @@ mod tests { reader.propose_raft_command( None, cmd.clone(), - Callback::Read(Box::new(|resp| { + Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); })), ); @@ -1135,7 +1135,7 @@ mod tests { meta.readers.get_mut(&1).unwrap().update(pg); } let task = - RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); + RaftCommand::::new(cmd.clone(), Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), @@ -1145,7 +1145,7 @@ mod tests { 
// Let's read. let task = RaftCommand::::new( cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let snap = resp.snapshot.unwrap(); assert_eq!(snap.get_region(), ®ion1); })), @@ -1172,7 +1172,7 @@ mod tests { reader.propose_raft_command( None, cmd_store_id, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_store_not_match()); assert!(resp.snapshot.is_none()); @@ -1196,7 +1196,7 @@ mod tests { reader.propose_raft_command( None, cmd_peer_id, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { assert!( resp.response.get_header().has_error(), "{:?}", @@ -1221,7 +1221,7 @@ mod tests { reader.propose_raft_command( None, cmd_term, - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_stale_command(), "{:?}", resp); assert!(resp.snapshot.is_none()); @@ -1259,7 +1259,7 @@ mod tests { reader.propose_raft_command( None, cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_server_is_busy(), "{:?}", resp); assert!(resp.snapshot.is_none()); @@ -1291,7 +1291,7 @@ mod tests { reader.propose_raft_command( None, cmd9.clone(), - Callback::Read(Box::new(|resp| { + Callback::read(Box::new(|resp| { panic!("unexpected invoke, {:?}", resp); })), ); @@ -1320,7 +1320,7 @@ mod tests { meta.readers.get_mut(&1).unwrap().update(pg); } let task = - RaftCommand::::new(cmd.clone(), Callback::Read(Box::new(move |_| {}))); + RaftCommand::::new(cmd.clone(), Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| 
m.borrow().reject_reason.cache_miss.get()), @@ -1345,7 +1345,7 @@ mod tests { cmd.mut_header().set_flag_data(data.into()); let task = RaftCommand::::new( cmd.clone(), - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let err = resp.response.get_header().get_error(); assert!(err.has_data_is_not_ready()); assert!(resp.snapshot.is_none()); @@ -1359,7 +1359,7 @@ mod tests { read_progress.update_safe_ts(1, 2); assert_eq!(read_progress.safe_ts(), 2); - let task = RaftCommand::::new(cmd, Callback::Read(Box::new(move |_| {}))); + let task = RaftCommand::::new(cmd, Callback::read(Box::new(move |_| {}))); must_not_redirect(&mut reader, &rx, task); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 4266103933f..7877de718ba 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -206,7 +206,7 @@ impl, E: KvEngine> ScannerPool { SignificantMsg::CaptureChange { cmd: change_cmd, region_epoch: task.region.get_region_epoch().clone(), - callback: Callback::Read(Box::new(cb)), + callback: Callback::read(Box::new(cb)), }, )?; let mut resp = box_try!(fut.await); diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 8cac947dc57..9b653ac2096 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -40,6 +40,7 @@ use kvproto::{ tikvpb::TikvClient, }; use pd_client::PdClient; +use protobuf::RepeatedField; use raft::eraftpb::ConfChangeType; pub use raftstore::store::util::{find_peer, new_learner_peer, new_peer}; use raftstore::{ @@ -420,7 +421,7 @@ pub fn make_cb(cmd: &RaftCmdRequest) -> (Callback, mpsc::Receiver let (tx, rx) = mpsc::channel(); let mut detector = CallbackLeakDetector::default(); let cb = if is_read { - Callback::Read(Box::new(move |resp: ReadResponse| { + 
Callback::read(Box::new(move |resp: ReadResponse| { detector.called = true; // we don't care error actually. let _ = tx.send(resp.response); @@ -485,7 +486,7 @@ pub fn async_read_on_peer( request.mut_header().set_peer(peer); request.mut_header().set_replica_read(replica_read); let (tx, rx) = mpsc::sync_channel(1); - let cb = Callback::Read(Box::new(move |resp| drop(tx.send(resp.response)))); + let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); rx } @@ -508,7 +509,7 @@ pub fn batch_read_on_peer( ); request.mut_header().set_peer(peer.clone()); let t = tx.clone(); - let cb = Callback::Read(Box::new(move |resp| { + let cb = Callback::read(Box::new(move |resp| { t.send((len, resp)).unwrap(); })); cluster @@ -562,7 +563,7 @@ pub fn async_read_index_on_peer( ); request.mut_header().set_peer(peer); let (tx, rx) = mpsc::sync_channel(1); - let cb = Callback::Read(Box::new(move |resp| drop(tx.send(resp.response)))); + let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); rx } @@ -881,6 +882,19 @@ pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetR client.kv_get(&get_req).unwrap() } +pub fn kv_batch_read( + client: &TikvClient, + ctx: Context, + keys: Vec>, + ts: u64, +) -> BatchGetResponse { + let mut batch_get_req = BatchGetRequest::default(); + batch_get_req.set_context(ctx); + batch_get_req.set_keys(RepeatedField::from(keys)); + batch_get_req.set_version(ts); + client.kv_batch_get(&batch_get_req).unwrap() +} + pub fn must_kv_prewrite_with( client: &TikvClient, ctx: Context, diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index be099beadde..664dc1e6767 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -37,6 +37,9 @@ impl Tracker { detail_v2.set_rocksdb_key_skipped_count(self.metrics.internal_key_skipped_count); 
detail_v2.set_rocksdb_delete_skipped_count(self.metrics.deleted_key_skipped_count); detail_v2.set_get_snapshot_nanos(self.metrics.get_snapshot_nanos); + detail_v2.set_read_index_propose_wait_nanos(self.metrics.read_index_propose_wait_nanos); + detail_v2.set_read_index_confirm_wait_nanos(self.metrics.read_index_confirm_wait_nanos); + detail_v2.set_read_pool_schedule_wait_nanos(self.metrics.read_pool_schedule_wait_nanos); } pub fn write_write_detail(&self, detail: &mut pb::WriteDetail) { @@ -118,6 +121,9 @@ pub enum RequestType { #[derive(Debug, Default, Clone)] pub struct RequestMetrics { pub get_snapshot_nanos: u64, + pub read_index_propose_wait_nanos: u64, + pub read_index_confirm_wait_nanos: u64, + pub read_pool_schedule_wait_nanos: u64, pub block_cache_hit_count: u64, pub block_read_count: u64, pub block_read_byte: u64, diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 677490a4b31..5bd05bd29cd 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -501,9 +501,16 @@ impl Endpoint { async move { let res = match result_of_future { Err(e) => make_error_response(e).into(), - Ok(handle_fut) => handle_fut - .await - .unwrap_or_else(|e| make_error_response(e).into()), + Ok(handle_fut) => { + let mut response = handle_fut + .await + .unwrap_or_else(|e| make_error_response(e).into()); + let scan_detail_v2 = response.mut_exec_details_v2().mut_scan_detail_v2(); + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(scan_detail_v2); + }); + response + } }; GLOBAL_TRACKERS.remove(tracker); res diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index f9b908979b8..0547d2088f0 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -82,7 +82,7 @@ impl Tracker { /// factory context, because the future pool might be full and we need /// to wait it. This kind of wait time has to be recorded. 
pub fn new(req_ctx: ReqContext, slow_log_threshold: Duration) -> Self { - let now = Instant::now_coarse(); + let now = Instant::now(); Tracker { request_begin_at: now, current_stage: TrackerState::Initialized, @@ -106,14 +106,18 @@ impl Tracker { pub fn on_scheduled(&mut self) { assert_eq!(self.current_stage, TrackerState::Initialized); - let now = Instant::now_coarse(); + let now = Instant::now(); self.schedule_wait_time = now - self.request_begin_at; + with_tls_tracker(|tracker| { + tracker.metrics.read_pool_schedule_wait_nanos = + self.schedule_wait_time.as_nanos() as u64; + }); self.current_stage = TrackerState::Scheduled(now); } pub fn on_snapshot_finished(&mut self) { if let TrackerState::Scheduled(at) = self.current_stage { - let now = Instant::now_coarse(); + let now = Instant::now(); self.snapshot_wait_time = now - at; self.wait_time = now - self.request_begin_at; self.current_stage = TrackerState::SnapshotRetrieved(now); @@ -124,7 +128,7 @@ impl Tracker { pub fn on_begin_all_items(&mut self) { if let TrackerState::SnapshotRetrieved(at) = self.current_stage { - let now = Instant::now_coarse(); + let now = Instant::now(); self.handler_build_time = now - at; self.current_stage = TrackerState::AllItemsBegan; } else { @@ -133,7 +137,7 @@ impl Tracker { } pub fn on_begin_item(&mut self) { - let now = Instant::now_coarse(); + let now = Instant::now(); match self.current_stage { TrackerState::AllItemsBegan => {} TrackerState::ItemFinished(at) => { @@ -149,7 +153,7 @@ impl Tracker { pub fn on_finish_item(&mut self, some_storage_stats: Option) { if let TrackerState::ItemBegan(at) = self.current_stage { - let now = Instant::now_coarse(); + let now = Instant::now(); self.item_process_time = now - at; self.total_process_time += self.item_process_time; if let Some(storage_stats) = some_storage_stats { @@ -227,7 +231,7 @@ impl Tracker { _ => unreachable!(), } - self.req_lifetime = Instant::now_coarse() - self.request_begin_at; + self.req_lifetime = Instant::now() - 
self.request_begin_at; self.current_stage = TrackerState::AllItemFinished; self.track(); } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index fea333903a6..fff9c79cec2 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -127,7 +127,7 @@ where cmd.set_header(header); cmd.set_requests(vec![req].into()); let (cb, future) = paired_future_callback(); - if let Err(e) = router.send_command(cmd, Callback::Read(cb), RaftCmdExtraOpts::default()) { + if let Err(e) = router.send_command(cmd, Callback::read(cb), RaftCmdExtraOpts::default()) { return Err(e.into()); } let mut res = future.await.map_err(|_| { diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index a314315985c..0a3f2fdd742 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -216,7 +216,7 @@ where .read( ctx.read_id, cmd, - StoreCallback::Read(Box::new(move |resp| { + StoreCallback::read(Box::new(move |resp| { cb(on_read_result(resp).map_err(Error::into)); })), ) diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index e66bb3ec40c..30cc8342959 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -552,7 +552,7 @@ fn region_detail>( raft_cmd.set_status_request(status_request); let (tx, rx) = oneshot::channel(); - let cb = Callback::Read(Box::new(|resp| tx.send(resp).unwrap())); + let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); async move { raft_router @@ -592,7 +592,7 @@ fn consistency_check>( raft_cmd.set_admin_request(admin_request); let (tx, rx) = oneshot::channel(); - let cb = Callback::Read(Box::new(|resp| tx.send(resp).unwrap())); + let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); async move { raft_router diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index fa743911b40..79fbd9c6624 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -951,7 +951,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor // so just send it as an command. 
if let Err(e) = self .ch - .send_command(cmd, Callback::Read(cb), RaftCmdExtraOpts::default()) + .send_command(cmd, Callback::read(cb), RaftCmdExtraOpts::default()) { // Retrun region error instead a gRPC error. let mut resp = ReadIndexResponse::default(); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 3024a05381f..8dbb8a69361 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -696,6 +696,10 @@ impl Storage { wait_wall_time_ms: duration_to_ms(wait_wall_time), process_wall_time_ms: duration_to_ms(process_wall_time), }; + with_tls_tracker(|tracker| { + tracker.metrics.read_pool_schedule_wait_nanos = + schedule_wait_time.as_nanos() as u64; + }); Ok(( result?, KvGetStatistics { @@ -1041,6 +1045,10 @@ impl Storage { stage_snap_recv_ts.saturating_duration_since(stage_begin_ts); let process_wall_time = stage_finished_ts.saturating_duration_since(stage_snap_recv_ts); + with_tls_tracker(|tracker| { + tracker.metrics.read_pool_schedule_wait_nanos = + schedule_wait_time.as_nanos() as u64; + }); let latency_stats = StageLatencyStats { schedule_wait_time_ms: duration_to_ms(schedule_wait_time), snapshot_wait_time_ms: duration_to_ms(snapshot_wait_time), diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index 1143600920f..223b692d579 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -51,7 +51,7 @@ impl SyncBenchRouter { let mut response = RaftCmdResponse::default(); cmd_resp::bind_term(&mut response, 1); match cmd.callback { - Callback::Read(cb) => { + Callback::Read { cb, .. 
} => { let snapshot = self.db.snapshot(); let region = Arc::new(self.region.to_owned()); cb(ReadResponse { @@ -161,7 +161,7 @@ fn bench_async_snapshots_noop(b: &mut test::Bencher) { } }); let cb: Callback = - Callback::Read(Box::new(move |resp: ReadResponse| { + Callback::read(Box::new(move |resp: ReadResponse| { let res = CmdRes::Snap(resp.snapshot.unwrap()); cb2(Ok(res)); })); diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 1c38571e280..1ef0471152f 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -21,6 +21,7 @@ mod test_metrics_overflow; mod test_pd_client; mod test_pending_peers; mod test_rawkv; +mod test_read_execution_tracker; mod test_replica_read; mod test_replica_stale_read; mod test_server; diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index 9de8911754b..d96c467d487 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -101,7 +101,7 @@ fn test_reject_proposal_during_region_split() { // Try to split region. let (split_tx, split_rx) = mpsc::channel(); - let cb = Callback::Read(Box::new(move |resp: ReadResponse| { + let cb = Callback::read(Box::new(move |resp: ReadResponse| { split_tx.send(resp.response).unwrap() })); let r = cluster.get_region(b""); @@ -179,7 +179,7 @@ fn test_reject_proposal_during_region_merge() { fail::cfg(prepare_merge_fp, "pause").unwrap(); // Try to merge region. 
let (merge_tx, merge_rx) = mpsc::channel(); - let cb = Callback::Read(Box::new(move |resp: ReadResponse| { + let cb = Callback::read(Box::new(move |resp: ReadResponse| { merge_tx.send(resp.response).unwrap() })); let source = cluster.get_region(b""); diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs new file mode 100644 index 00000000000..4357d65af5f --- /dev/null +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -0,0 +1,121 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use kvproto::kvrpcpb::*; +use test_coprocessor::{init_with_data, DagSelect, ProductTable}; +use test_raftstore::{ + kv_batch_read, kv_read, must_kv_commit, must_kv_prewrite, must_new_cluster_and_kv_client, +}; + +#[test] +fn test_read_execution_tracking() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (k1, v1) = (b"k1".to_vec(), b"v1".to_vec()); + let (k2, v2) = (b"k2".to_vec(), b"v2".to_vec()); + + // write entries + let mut mutation1 = Mutation::default(); + mutation1.set_op(Op::Put); + mutation1.set_key(k1.clone()); + mutation1.set_value(v1); + + let mut mutation2 = Mutation::default(); + mutation2.set_op(Op::Put); + mutation2.set_key(k2.clone()); + mutation2.set_value(v2); + + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation1, mutation2], + k1.clone(), + 10, + ); + must_kv_commit( + &client, + ctx.clone(), + vec![k1.clone(), k2.clone()], + 10, + 30, + 30, + ); + + let lease_read_checker = |scan_detail: &ScanDetailV2| { + assert!( + scan_detail.get_read_index_propose_wait_nanos() == 0, + "resp lease read propose wait time={:?}", + scan_detail.get_read_index_propose_wait_nanos() + ); + + assert!( + scan_detail.get_read_index_confirm_wait_nanos() == 0, + "resp lease read confirm wait time={:?}", + scan_detail.get_read_index_confirm_wait_nanos() + ); + + assert!( + scan_detail.get_read_pool_schedule_wait_nanos() > 0, + "resp read pool scheduling wait 
time={:?}", + scan_detail.get_read_pool_schedule_wait_nanos() + ); + }; + + // should perform lease read + let resp = kv_read(&client, ctx.clone(), k1.clone(), 100); + + lease_read_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + // should perform lease read + let resp = kv_batch_read(&client, ctx.clone(), vec![k1.clone(), k2.clone()], 100); + + lease_read_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + let product = ProductTable::new(); + init_with_data(&product, &[(1, Some("name:0"), 2)]); + let mut coprocessor_request = DagSelect::from(&product).build(); + coprocessor_request.set_context(ctx.clone()); + coprocessor_request.set_start_ts(100); + + // should perform lease read + let resp = client.coprocessor(&coprocessor_request).unwrap(); + + lease_read_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + let read_index_checker = |scan_detail: &ScanDetailV2| { + assert!( + scan_detail.get_read_index_propose_wait_nanos() > 0, + "resp lease read propose wait time={:?}", + scan_detail.get_read_index_propose_wait_nanos() + ); + + assert!( + scan_detail.get_read_index_confirm_wait_nanos() > 0, + "resp lease read confirm wait time={:?}", + scan_detail.get_read_index_confirm_wait_nanos() + ); + + assert!( + scan_detail.get_read_pool_schedule_wait_nanos() > 0, + "resp read pool scheduling wait time={:?}", + scan_detail.get_read_pool_schedule_wait_nanos() + ); + }; + + fail::cfg("perform_read_index", "return()").unwrap(); + + // should perform read index + let resp = kv_read(&client, ctx.clone(), k1.clone(), 100); + + read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + // should perform read index + let resp = kv_batch_read(&client, ctx, vec![k1, k2], 100); + + read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + // should perform read index + let resp = client.coprocessor(&coprocessor_request).unwrap(); + + read_index_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + + 
fail::remove("perform_read_index"); +} diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 9a88a73508c..18ddb865fd9 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -362,7 +362,7 @@ fn test_read_index_when_transfer_leader_2() { sim.async_command_on_node( old_leader.get_id(), read_request, - Callback::Read(Box::new(move |resp| tx.send(resp.response).unwrap())), + Callback::read(Box::new(move |resp| tx.send(resp.response).unwrap())), ) .unwrap(); rx diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 4b69bd4129e..80b90d78045 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -526,7 +526,7 @@ fn test_read_index_stale_in_suspect_lease() { sim.async_command_on_node( old_leader.get_id(), read_request, - Callback::Read(Box::new(move |resp| tx.send(resp.response).unwrap())), + Callback::read(Box::new(move |resp| tx.send(resp.response).unwrap())), ) .unwrap(); rx diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 9a946a806bc..262060b4491 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -666,7 +666,7 @@ fn test_split_region_impl(is_raw_kv: bool) { .collect(); assert_eq!( result_split_keys, - vec![b"b", b"c", b"d", b"e",] + vec![b"b", b"c", b"d", b"e"] .into_iter() .map(|k| encode_key(&k[..])) .collect::>() From 40192af85a92ed0b6d29af0059c837620ef4eb6c Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 30 Aug 2022 11:36:23 +0800 Subject: [PATCH 175/676] storage: implement the FlashbackToVersion txn command (#13345) ref tikv/tikv#13303 Implement the `FlashbackToVersion` txn command, which contains two phases: `command::FlashbackToVersionReadPhase` and `command::FlashbackToVersion`. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/tracker/src/lib.rs | 1 + src/storage/metrics.rs | 1 + src/storage/mod.rs | 330 +++++++++++++++++- src/storage/mvcc/reader/reader.rs | 228 +++++++++++- .../txn/commands/flashback_to_version.rs | 134 +++++++ .../flashback_to_version_read_phase.rs | 118 +++++++ src/storage/txn/commands/mod.rs | 12 + 7 files changed, 816 insertions(+), 8 deletions(-) create mode 100644 src/storage/txn/commands/flashback_to_version.rs create mode 100644 src/storage/txn/commands/flashback_to_version_read_phase.rs diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 664dc1e6767..56ce2aa3280 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -113,6 +113,7 @@ pub enum RequestType { KvTxnHeartBeat, KvRollback, KvPessimisticRollback, + KvFlashbackToVersion, CoprocessorDag, CoprocessorAnalyze, CoprocessorChecksum, diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e58f7862b37..c1076dca604 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -140,6 +140,7 @@ make_auto_flush_static_metric! 
{ pause, key_mvcc, start_ts_mvcc, + flashback_to_version, raw_get, raw_batch_get, raw_scan, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 8dbb8a69361..162a58b4801 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3235,14 +3235,17 @@ mod tests { use error_code::ErrorCodeExt; use errors::extract_key_error; use futures::executor::block_on; - use kvproto::kvrpcpb::{AssertionLevel, CommandPri, Op, PrewriteRequestPessimisticAction::*}; + use kvproto::kvrpcpb::{ + Assertion, AssertionLevel, CommandPri, Op, PrewriteRequestPessimisticAction::*, + }; use tikv_util::config::ReadableSize; use tracker::INVALID_TRACKER_TOKEN; - use txn_types::{Mutation, PessimisticLock, WriteType}; + use txn_types::{Mutation, PessimisticLock, WriteType, SHORT_VALUE_MAX_LEN}; use super::{ mvcc::tests::{must_unlocked, must_written}, test_util::*, + txn::commands::FLASHBACK_BATCH_SIZE, *, }; use crate::{ @@ -4407,6 +4410,329 @@ mod tests { ); } + #[test] + fn test_flashback_to_version() { + let storage = TestStorageBuilderApiV1::new(DummyLockManager) + .build() + .unwrap(); + let writes = vec![ + // (Mutation, StartTS, CommitTS) + ( + Mutation::Put((Key::from_raw(b"k"), b"v@1".to_vec()), Assertion::None), + 1, + 2, + ), + ( + Mutation::Put((Key::from_raw(b"k"), b"v@3".to_vec()), Assertion::None), + 3, + 4, + ), + ( + Mutation::Put((Key::from_raw(b"k"), b"v@5".to_vec()), Assertion::None), + 5, + 6, + ), + ( + Mutation::Put((Key::from_raw(b"k"), b"v@7".to_vec()), Assertion::None), + 7, + 8, + ), + ( + Mutation::Delete(Key::from_raw(b"k"), Assertion::None), + 9, + 10, + ), + ( + Mutation::Put((Key::from_raw(b"k"), b"v@11".to_vec()), Assertion::None), + 11, + 12, + ), + // Non-short value + ( + Mutation::Put( + (Key::from_raw(b"k"), vec![b'v'; SHORT_VALUE_MAX_LEN + 1]), + Assertion::None, + ), + 13, + 14, + ), + ]; + let (tx, rx) = channel(); + // Prewrite and commit. 
+ for write in writes.iter() { + let (key, value) = write.0.clone().into_key_value(); + let start_ts = write.1.into(); + let commit_ts = write.2.into(); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![write.0.clone()], + key.clone().to_raw().unwrap(), + start_ts, + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![key.clone()], + start_ts, + commit_ts, + Context::default(), + ), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(commit_ts)), + ) + .unwrap(); + rx.recv().unwrap(); + if let Mutation::Put(..) = write.0 { + expect_value( + value.unwrap(), + block_on(storage.get(Context::default(), key.clone(), commit_ts)) + .unwrap() + .0, + ); + } else { + expect_none( + block_on(storage.get(Context::default(), key, commit_ts)) + .unwrap() + .0, + ); + } + } + // Flashback. + for idx in (0..writes.len()).rev() { + let write = &writes[idx]; + let key = write.0.key(); + let start_ts = write.1.into(); + let commit_ts = write.2.into(); + storage + .sched_txn_command( + commands::FlashbackToVersionReadPhase::new( + start_ts, + None, + Some(key.clone()), + Some(key.clone()), + Context::default(), + ), + expect_ok_callback(tx.clone(), 2), + ) + .unwrap(); + rx.recv().unwrap(); + if idx == 0 || matches!(writes[idx - 1].0, Mutation::Delete(..)) { + expect_none( + block_on(storage.get(Context::default(), key.clone(), commit_ts)) + .unwrap() + .0, + ); + } else { + let (_, old_value) = writes[idx - 1].0.clone().into_key_value(); + expect_value( + old_value.unwrap(), + block_on(storage.get(Context::default(), key.clone(), commit_ts)) + .unwrap() + .0, + ); + } + } + } + + #[test] + fn test_flashback_to_version_lock() { + let storage = TestStorageBuilderApiV1::new(DummyLockManager) + .build() + .unwrap(); + let (tx, rx) = channel(); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(Key::from_raw(b"k"), 
b"v@1".to_vec())], + b"k".to_vec(), + 1.into(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k")], + 1.into(), + 2.into(), + Context::default(), + ), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(2.into())), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + b"v@1".to_vec(), + block_on(storage.get(Context::default(), Key::from_raw(b"k"), 2.into())) + .unwrap() + .0, + ); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(Key::from_raw(b"k"), b"v@3".to_vec())], + b"k".to_vec(), + 3.into(), + ), + expect_ok_callback(tx.clone(), 2), + ) + .unwrap(); + rx.recv().unwrap(); + expect_error( + |e| match e { + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::KeyIsLocked { .. }, + ))))) => (), + e => panic!("unexpected error chain: {:?}", e), + }, + block_on(storage.get(Context::default(), Key::from_raw(b"k"), 3.into())), + ); + + storage + .sched_txn_command( + commands::FlashbackToVersionReadPhase::new( + 2.into(), + None, + Some(Key::from_raw(b"k")), + Some(Key::from_raw(b"k")), + Context::default(), + ), + expect_ok_callback(tx.clone(), 3), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + b"v@1".to_vec(), + block_on(storage.get(Context::default(), Key::from_raw(b"k"), 3.into())) + .unwrap() + .0, + ); + storage + .sched_txn_command( + commands::FlashbackToVersionReadPhase::new( + 1.into(), + None, + Some(Key::from_raw(b"k")), + Some(Key::from_raw(b"k")), + Context::default(), + ), + expect_ok_callback(tx, 3), + ) + .unwrap(); + rx.recv().unwrap(); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k"), 3.into())) + .unwrap() + .0, + ); + } + + #[test] + fn test_flashback_to_version_in_multi_batch() { + let storage = TestStorageBuilderApiV1::new(DummyLockManager) + .build() + .unwrap(); + let (tx, rx) = channel(); + // 
Add (FLASHBACK_BATCH_SIZE * 2) lock records. + for i in 1..=FLASHBACK_BATCH_SIZE * 2 { + let start_ts = (i as u64).into(); + let key = Key::from_raw(format!("k{}", i).as_bytes()); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put( + key.clone(), + format!("v@{}", i).as_bytes().to_vec(), + )], + key.to_raw().unwrap(), + start_ts, + ), + expect_ok_callback(tx.clone(), i as i32), + ) + .unwrap(); + rx.recv().unwrap(); + expect_error( + |e| match e { + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::KeyIsLocked { .. }, + ))))) => (), + e => panic!("unexpected error chain: {:?}", e), + }, + block_on(storage.get(Context::default(), key, start_ts)), + ); + } + // Add (FLASHBACK_BATCH_SIZE * 2) write records. + for i in FLASHBACK_BATCH_SIZE * 2 + 1..=FLASHBACK_BATCH_SIZE * 4 { + let start_ts = (i as u64).into(); + let commit_ts = ((i + 1) as u64).into(); + let key = Key::from_raw(format!("k{}", i).as_bytes()); + let value = format!("v@{}", i).as_bytes().to_vec(); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(key.clone(), value.clone())], + key.to_raw().unwrap(), + start_ts, + ), + expect_ok_callback(tx.clone(), i as i32), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![key.clone()], + start_ts, + commit_ts, + Context::default(), + ), + expect_value_callback(tx.clone(), i as i32, TxnStatus::committed(commit_ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + value, + block_on(storage.get(Context::default(), key, commit_ts)) + .unwrap() + .0, + ); + } + // Flashback all records. 
+ storage + .sched_txn_command( + commands::FlashbackToVersionReadPhase::new( + TimeStamp::zero(), + None, + Some(Key::from_raw(b"k")), + Some(Key::from_raw(b"k")), + Context::default(), + ), + expect_ok_callback(tx, 2), + ) + .unwrap(); + rx.recv().unwrap(); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k1"), 1.into())) + .unwrap() + .0, + ); + expect_none( + block_on(storage.get( + Context::default(), + Key::from_raw(format!("k{}", FLASHBACK_BATCH_SIZE * 4).as_bytes()), + ((FLASHBACK_BATCH_SIZE * 4 + 1) as u64).into(), + )) + .unwrap() + .0, + ); + } + #[test] fn test_high_priority_get_put() { let storage = TestStorageBuilderApiV1::new(DummyLockManager) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index f1ed7748a15..2a43ac24583 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -281,7 +281,7 @@ impl MvccReader { self.current_key = Some(key.clone()); self.write_cursor.take(); } - self.create_write_cursor()?; + self.create_write_cursor(None)?; let cursor = self.write_cursor.as_mut().unwrap(); // find a `ts` encoded key which is less than the `ts` encoded version of the // `key` @@ -427,13 +427,14 @@ impl MvccReader { Ok(()) } - fn create_write_cursor(&mut self) -> Result<()> { + fn create_write_cursor(&mut self, hint_min_ts: Option) -> Result<()> { if self.write_cursor.is_none() { let cursor = CursorBuilder::new(&self.snapshot, CF_WRITE) .fill_cache(self.fill_cache) // Only use prefix seek in non-scan mode. 
.prefix_seek(self.scan_mode.is_none()) .scan_mode(self.get_scan_mode(true)) + .hint_min_ts(hint_min_ts) .build()?; self.write_cursor = Some(cursor); } @@ -454,7 +455,7 @@ impl MvccReader { /// Return the first committed key for which `start_ts` equals to `ts` pub fn seek_ts(&mut self, ts: TimeStamp) -> Result> { assert!(self.scan_mode.is_some()); - self.create_write_cursor()?; + self.create_write_cursor(None)?; let cursor = self.write_cursor.as_mut().unwrap(); let mut ok = cursor.seek_to_first(&mut self.statistics.write); @@ -471,11 +472,11 @@ impl MvccReader { Ok(None) } - /// Scan locks that satisfies `filter(lock)` returns true, from the given - /// start key `start`. At most `limit` locks will be returned. If `limit` is + /// Scan locks that satisfies `filter(lock)` returns true in the key range + /// [start, end). At most `limit` locks will be returned. If `limit` is /// set to `0`, it means unlimited. /// - /// The return type is `(locks, is_remain)`. `is_remain` indicates whether + /// The return type is `(locks, has_remain)`. `has_remain` indicates whether /// there MAY be remaining locks that can be scanned. pub fn scan_locks( &mut self, @@ -520,6 +521,57 @@ impl MvccReader { Ok((locks, false)) } + /// Scan writes that satisfies `filter(key)` returns true in the key range + /// [start, end). At most `limit` locks will be returned. If `limit` is + /// set to `0`, it means unlimited. + /// + /// The return type is `(writes, has_remain)`. `has_remain` indicates + /// whether there MAY be remaining writes that can be scanned. 
+ pub fn scan_writes( + &mut self, + start: Option<&Key>, + end: Option<&Key>, + filter: F, + limit: usize, + hint_min_ts: Option, + ) -> Result<(Vec<(Key, Write)>, bool)> + where + F: Fn(&Key) -> bool, + { + self.create_write_cursor(hint_min_ts)?; + let cursor = self.write_cursor.as_mut().unwrap(); + let ok = match start { + Some(x) => cursor.seek(x, &mut self.statistics.write)?, + None => cursor.seek_to_first(&mut self.statistics.write), + }; + if !ok { + return Ok((vec![], false)); + } + let mut writes = Vec::with_capacity(limit); + while cursor.valid()? { + let key = Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); + if let Some(end) = end { + if key >= *end { + return Ok((writes, false)); + } + } + + if filter(&key) { + writes.push(( + key, + WriteRef::parse(cursor.value(&mut self.statistics.write))?.to_owned(), + )); + if limit > 0 && writes.len() == limit { + return Ok((writes, true)); + } + } + cursor.next(&mut self.statistics.lock); + } + self.statistics.write.processed_keys += writes.len(); + resource_metering::record_read_keys(writes.len() as u32); + Ok((writes, false)) + } + pub fn scan_keys( &mut self, mut start: Option, @@ -1604,6 +1656,170 @@ pub mod tests { ); } + #[test] + fn test_scan_writes() { + let path = tempfile::Builder::new() + .prefix("_test_storage_mvcc_reader_scan_writes") + .tempdir() + .unwrap(); + let path = path.path().to_str().unwrap(); + let region = make_region(1, vec![], vec![]); + let db = open_db(path, true); + let mut engine = RegionEngine::new(&db, ®ion); + + // Put some writes to the db. 
+ engine.prewrite( + Mutation::make_put(Key::from_raw(b"k1"), b"v1@1".to_vec()), + b"k1", + 1, + ); + engine.commit(b"k1", 1, 2); + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k1"), b"v1@3".to_vec()), + b"k1", + 3, + ); + engine.commit(b"k1", 3, 4); + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k1"), b"v1@5".to_vec()), + b"k1", + 5, + ); + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k2"), b"v2@1".to_vec()), + b"k2", + 1, + ); + engine.commit(b"k2", 1, 2); + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k2"), b"v2@3".to_vec()), + b"k2", + 3, + ); + engine.commit(b"k2", 3, 4); + + // Creates a reader and scan writes. + let check_scan_write = |start_key: Option, + end_key: Option, + filter: Box bool>, + limit, + expect_res: &[_], + expect_is_remain: bool| { + let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); + let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); + let res = reader + .scan_writes(start_key.as_ref(), end_key.as_ref(), filter, limit, None) + .unwrap(); + assert_eq!(res.0, expect_res); + assert_eq!(res.1, expect_is_remain); + }; + + check_scan_write( + None, + None, + Box::new(|key| key.decode_ts().unwrap() >= 1.into()), + 1, + &[( + Key::from_raw(b"k1").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec())), + )], + true, + ); + check_scan_write( + None, + None, + Box::new(|key| key.decode_ts().unwrap() >= 1.into()), + 5, + &[ + ( + Key::from_raw(b"k1").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec())), + ), + ( + Key::from_raw(b"k1").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec())), + ), + ( + Key::from_raw(b"k2").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec())), + ), + ( + Key::from_raw(b"k2").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec())), + ), + ], + false, + ); + check_scan_write( + Some(Key::from_raw(b"k2")), 
+ None, + Box::new(|key| key.decode_ts().unwrap() >= 1.into()), + 3, + &[ + ( + Key::from_raw(b"k2").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec())), + ), + ( + Key::from_raw(b"k2").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec())), + ), + ], + false, + ); + check_scan_write( + None, + Some(Key::from_raw(b"k2")), + Box::new(|key| key.decode_ts().unwrap() >= 1.into()), + 4, + &[ + ( + Key::from_raw(b"k1").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec())), + ), + ( + Key::from_raw(b"k1").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec())), + ), + ], + false, + ); + check_scan_write( + Some(Key::from_raw(b"k1")), + Some(Key::from_raw(b"k2")), + Box::new(|key| key.decode_ts().unwrap() >= 1.into()), + 4, + &[ + ( + Key::from_raw(b"k1").append_ts(4.into()), + Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec())), + ), + ( + Key::from_raw(b"k1").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec())), + ), + ], + false, + ); + check_scan_write( + None, + None, + Box::new(|key| key.decode_ts().unwrap() < 4.into()), + 4, + &[ + ( + Key::from_raw(b"k1").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec())), + ), + ( + Key::from_raw(b"k2").append_ts(2.into()), + Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec())), + ), + ], + false, + ); + } + #[test] fn test_load_data() { let path = tempfile::Builder::new() diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs new file mode 100644 index 00000000000..058758888d5 --- /dev/null +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -0,0 +1,134 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// #[PerformanceCriticalPath] +use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; + +use crate::storage::{ + kv::WriteData, + lock_manager::LockManager, + mvcc::{MvccTxn, MAX_TXN_WRITE_SIZE}, + txn::{ + commands::{ + Command, CommandExt, FlashbackToVersionReadPhase, ResponsePolicy, TypedCommand, + WriteCommand, WriteContext, WriteResult, + }, + latch, Result, + }, + ProcessResult, Snapshot, +}; + +command! { + FlashbackToVersion: + cmd_ty => (), + display => "kv::command::flashback_to_version @{} | {:?}", (version ,ctx), + content => { + version: TimeStamp, + end_key: Option, + next_lock_key: Option, + next_write_key: Option, + key_locks: Vec<(Key, Lock)>, + key_writes: Vec<(Key, Write)>, + } +} + +impl CommandExt for FlashbackToVersion { + ctx!(); + tag!(flashback_to_version); + request_type!(KvFlashbackToVersion); + + fn gen_lock(&self) -> latch::Lock { + latch::Lock::new( + self.key_locks + .iter() + .map(|(key, _)| key) + .chain(self.key_writes.iter().map(|(key, _)| key)), + ) + } + + fn write_bytes(&self) -> usize { + self.key_locks + .iter() + .map(|(key, _)| key.as_encoded().len()) + .chain( + self.key_writes + .iter() + .map(|(key, _)| key.as_encoded().len()), + ) + .sum() + } +} + +impl WriteCommand for FlashbackToVersion { + fn process_write(mut self, _snapshot: S, context: WriteContext<'_, L>) -> Result { + let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); + + let mut rows = 0; + let mut next_lock_key = self.next_lock_key.take(); + let mut next_write_key = self.next_write_key.take(); + // To flashback the `CF_LOCK`, we need to delete all locks records whose + // `start_ts` is greater than the specified version, and if it's not a + // short-value `LockType::Put`, we need to delete the actual data from + // `CF_DEFAULT` as well. + // TODO: `resolved_ts` should be taken into account. 
+ for (key, lock) in self.key_locks { + if txn.write_size() >= MAX_TXN_WRITE_SIZE { + next_lock_key = Some(key); + break; + } + txn.unlock_key(key.clone(), lock.is_pessimistic_txn()); + rows += 1; + // If the short value is none and it's a `LockType::Put`, we should delete the + // corresponding key from `CF_DEFAULT` as well. + if lock.short_value.is_none() && lock.lock_type == LockType::Put { + txn.delete_value(key, lock.ts); + rows += 1; + } + } + // To flashback the `CF_WRITE`, we need to delete all write records whose + // `commit_ts` is greater than the specified version, and if it's not a + // short-value `WriteType::Put`, we need to delete the actual data from + // `CF_DEFAULT` as well. + for (key, write) in self.key_writes { + if txn.write_size() >= MAX_TXN_WRITE_SIZE { + next_write_key = Some(key); + break; + } + let encoded_key = key.clone().truncate_ts()?; + let commit_ts = key.decode_ts()?; + txn.delete_write(encoded_key.clone(), commit_ts); + rows += 1; + // If the short value is none and it's a `WriteType::Put`, we should delete the + // corresponding key from `CF_DEFAULT` as well. 
+ if write.short_value.is_none() && write.write_type == WriteType::Put { + txn.delete_value(encoded_key, write.start_ts); + rows += 1; + } + } + + let mut write_data = WriteData::from_modifies(txn.into_modifies()); + write_data.set_allowed_on_disk_almost_full(); + Ok(WriteResult { + ctx: self.ctx.clone(), + to_be_write: write_data, + rows, + pr: if next_lock_key.is_none() && next_write_key.is_none() { + ProcessResult::Res + } else { + let next_cmd = FlashbackToVersionReadPhase { + ctx: self.ctx.clone(), + deadline: self.deadline, + version: self.version, + end_key: self.end_key, + next_lock_key, + next_write_key, + }; + ProcessResult::NextCommand { + cmd: Command::FlashbackToVersionReadPhase(next_cmd), + } + }, + lock_info: None, + lock_guards: vec![], + response_policy: ResponsePolicy::OnApplied, + }) + } +} diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs new file mode 100644 index 00000000000..5feedd80eb8 --- /dev/null +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -0,0 +1,118 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +// #[PerformanceCriticalPath] +use txn_types::{Key, TimeStamp}; + +use crate::storage::{ + mvcc::MvccReader, + txn::{ + commands::{ + Command, CommandExt, FlashbackToVersion, ProcessResult, ReadCommand, TypedCommand, + }, + sched_pool::tls_collect_keyread_histogram_vec, + Result, + }, + ScanMode, Snapshot, Statistics, +}; + +command! 
{ + FlashbackToVersionReadPhase: + cmd_ty => (), + display => "kv::command::flashback_to_version_read_phase | {:?}", (ctx), + content => { + version: TimeStamp, + end_key: Option, + next_lock_key: Option, + next_write_key: Option, + } +} + +impl CommandExt for FlashbackToVersionReadPhase { + ctx!(); + tag!(flashback_to_version); + request_type!(KvFlashbackToVersion); + property!(readonly); + gen_lock!(empty); + + fn write_bytes(&self) -> usize { + 0 + } +} + +pub const FLASHBACK_BATCH_SIZE: usize = 256; + +impl ReadCommand for FlashbackToVersionReadPhase { + fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); + // Scan the locks. + let mut key_locks = Vec::with_capacity(0); + let mut has_remain_locks = false; + if self.next_lock_key.is_some() { + let key_locks_result = reader.scan_locks( + self.next_lock_key.as_ref(), + self.end_key.as_ref(), + // To flashback `CF_LOCK`, we need to delete all locks. + |_| true, + FLASHBACK_BATCH_SIZE, + ); + statistics.add(&reader.statistics); + (key_locks, has_remain_locks) = key_locks_result?; + } + // Scan the writes. + let mut key_writes = Vec::with_capacity(0); + let mut has_remain_writes = false; + // The batch is not full, we can still read. + if self.next_write_key.is_some() && key_locks.len() < FLASHBACK_BATCH_SIZE { + let key_writes_result = reader.scan_writes( + self.next_write_key.as_ref(), + self.end_key.as_ref(), + // To flashback `CF_WRITE` and `CF_DEFAULT`, we need to delete all keys whose + // commit_ts is greater than the specified version. 
+ |key| key.decode_ts().unwrap() > self.version, + FLASHBACK_BATCH_SIZE - key_locks.len(), + Some(self.version), + ); + statistics.add(&reader.statistics); + (key_writes, has_remain_writes) = key_writes_result?; + } else if self.next_write_key.is_some() && key_locks.len() >= FLASHBACK_BATCH_SIZE { + // The batch is full, we need to read the writes in the next batch later. + has_remain_writes = true; + } + tls_collect_keyread_histogram_vec( + self.tag().get_str(), + (key_locks.len() + key_writes.len()) as f64, + ); + + if key_locks.is_empty() && key_writes.is_empty() { + Ok(ProcessResult::Res) + } else { + let next_lock_key = if has_remain_locks { + key_locks.last().map(|(key, _)| key.clone()) + } else { + None + }; + let next_write_key = if has_remain_writes && !key_writes.is_empty() { + key_writes.last().map(|(key, _)| key.clone()) + } else if has_remain_writes && key_writes.is_empty() { + // We haven't read any write yet, so we need to read the writes in the next + // batch later. + self.next_write_key + } else { + None + }; + let next_cmd = FlashbackToVersion { + ctx: self.ctx, + deadline: self.deadline, + version: self.version, + end_key: self.end_key, + key_locks, + key_writes, + next_lock_key, + next_write_key, + }; + Ok(ProcessResult::NextCommand { + cmd: Command::FlashbackToVersion(next_cmd), + }) + } + } +} diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 3dc1a37697e..f4794d6a0db 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -11,6 +11,8 @@ pub(crate) mod check_txn_status; pub(crate) mod cleanup; pub(crate) mod commit; pub(crate) mod compare_and_swap; +pub(crate) mod flashback_to_version; +pub(crate) mod flashback_to_version_read_phase; pub(crate) mod mvcc_by_key; pub(crate) mod mvcc_by_start_ts; pub(crate) mod pause; @@ -37,6 +39,8 @@ pub use cleanup::Cleanup; pub use commit::Commit; pub use compare_and_swap::RawCompareAndSwap; use concurrency_manager::{ConcurrencyManager, 
KeyHandleGuard}; +pub use flashback_to_version::FlashbackToVersion; +pub use flashback_to_version_read_phase::{FlashbackToVersionReadPhase, FLASHBACK_BATCH_SIZE}; use kvproto::kvrpcpb::*; pub use mvcc_by_key::MvccByKey; pub use mvcc_by_start_ts::MvccByStartTs; @@ -92,6 +96,8 @@ pub enum Command { MvccByStartTs(MvccByStartTs), RawCompareAndSwap(RawCompareAndSwap), RawAtomicStore(RawAtomicStore), + FlashbackToVersionReadPhase(FlashbackToVersionReadPhase), + FlashbackToVersion(FlashbackToVersion), } /// A `Command` with its return type, reified as the generic parameter `T`. @@ -567,6 +573,8 @@ impl Command { Command::MvccByStartTs(t) => t, Command::RawCompareAndSwap(t) => t, Command::RawAtomicStore(t) => t, + Command::FlashbackToVersionReadPhase(t) => t, + Command::FlashbackToVersion(t) => t, } } @@ -590,6 +598,8 @@ impl Command { Command::MvccByStartTs(t) => t, Command::RawCompareAndSwap(t) => t, Command::RawAtomicStore(t) => t, + Command::FlashbackToVersionReadPhase(t) => t, + Command::FlashbackToVersion(t) => t, } } @@ -602,6 +612,7 @@ impl Command { Command::ResolveLockReadPhase(t) => t.process_read(snapshot, statistics), Command::MvccByKey(t) => t.process_read(snapshot, statistics), Command::MvccByStartTs(t) => t.process_read(snapshot, statistics), + Command::FlashbackToVersionReadPhase(t) => t.process_read(snapshot, statistics), _ => panic!("unsupported read command"), } } @@ -627,6 +638,7 @@ impl Command { Command::Pause(t) => t.process_write(snapshot, context), Command::RawCompareAndSwap(t) => t.process_write(snapshot, context), Command::RawAtomicStore(t) => t.process_write(snapshot, context), + Command::FlashbackToVersion(t) => t.process_write(snapshot, context), _ => panic!("unsupported write command"), } } From f208e1b921333dd347195a086852419718c3bf2a Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 30 Aug 2022 13:02:24 +0800 Subject: [PATCH 176/676] dr-auto-sync: enable min-resolved-ts report by default (#13305) ref tikv/tikv#13219 Signed-off-by: lhy1024 
--- components/raftstore/src/store/config.rs | 2 +- components/raftstore/src/store/worker/pd.rs | 3 +++ components/resolved_ts/tests/failpoints/mod.rs | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 89b5cfc1ac9..34805e4c9ca 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -384,7 +384,7 @@ impl Default for Config { region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), inspect_interval: ReadableDuration::millis(500), - report_min_resolved_ts_interval: ReadableDuration::millis(0), + report_min_resolved_ts_interval: ReadableDuration::secs(1), check_leader_lease_interval: ReadableDuration::secs(0), renew_leader_lease_advance_duration: ReadableDuration::secs(0), report_region_buckets_tick_interval: ReadableDuration::secs(10), diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 4ac03e2578b..45a3827e8f5 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -448,6 +448,9 @@ fn config(interval: Duration) -> Duration { fail_point!("mock_min_resolved_ts_interval", |_| { Duration::from_millis(50) }); + fail_point!("mock_min_resolved_ts_interval_disable", |_| { + Duration::from_millis(0) + }); interval } diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index e734864471a..ab4e88f9d25 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -7,6 +7,7 @@ use kvproto::kvrpcpb::*; use pd_client::PdClient; use test_raftstore::{new_peer, sleep_ms}; pub use testsuite::*; +use tikv_util::config::ReadableDuration; use txn_types::TimeStamp; #[test] @@ -57,6 +58,16 @@ fn test_report_min_resolved_ts() { fail::cfg("mock_collect_tick_interval", 
"return(0)").unwrap(); fail::cfg("mock_min_resolved_ts_interval", "return(0)").unwrap(); let mut suite = TestSuite::new(1); + // default config is 1s + assert_eq!( + suite + .cluster + .cfg + .tikv + .raft_store + .report_min_resolved_ts_interval, + ReadableDuration::secs(1) + ); let region = suite.cluster.get_region(&[]); let ts1 = suite.cluster.pd_client.get_min_resolved_ts(); @@ -89,6 +100,7 @@ fn test_report_min_resolved_ts() { fn test_report_min_resolved_ts_disable() { fail::cfg("mock_tick_interval", "return(0)").unwrap(); fail::cfg("mock_collect_tick_interval", "return(0)").unwrap(); + fail::cfg("mock_min_resolved_ts_interval_disable", "return(0)").unwrap(); let mut suite = TestSuite::new(1); let region = suite.cluster.get_region(&[]); let ts1 = suite.cluster.pd_client.get_min_resolved_ts(); @@ -113,5 +125,6 @@ fn test_report_min_resolved_ts_disable() { assert!(ts3 == ts1); fail::remove("mock_tick_interval"); fail::remove("mock_collect_tick_interval"); + fail::remove("mock_min_resolved_ts_interval_disable"); suite.stop(); } From 564f6e3ae3da0022c4de78f4345516ced147b58b Mon Sep 17 00:00:00 2001 From: TonsnakeLin <87681388+TonsnakeLin@users.noreply.github.com> Date: Tue, 30 Aug 2022 15:00:23 +0800 Subject: [PATCH 177/676] PessmistincLock: lock the key if exists (#13211) close tikv/tikv#13210 Signed-off-by: Jay Lee Signed-off-by: TonsnakeLin Signed-off-by: CalvinNeo Signed-off-by: ystaticy Signed-off-by: hehechen Signed-off-by: ekexium Signed-off-by: BornChanger Signed-off-by: LintianShi Signed-off-by: Lintian Shi Signed-off-by: OneSizeFitQuorum Co-authored-by: Jay Co-authored-by: Calvin Neo Co-authored-by: ystaticy Co-authored-by: hehechen Co-authored-by: ekexium Co-authored-by: BornChanger <97348524+BornChanger@users.noreply.github.com> Co-authored-by: Lintian Shi Co-authored-by: Potato Co-authored-by: Ti Chi Robot Co-authored-by: Ping Yu Co-authored-by: Yilin Chen Co-authored-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: 
LintianShi --- components/error_code/src/storage.rs | 1 + src/storage/mod.rs | 6 + src/storage/mvcc/mod.rs | 13 ++ src/storage/mvcc/reader/reader.rs | 1 + src/storage/mvcc/txn.rs | 3 +- .../txn/actions/acquire_pessimistic_lock.rs | 156 ++++++++++++++++-- .../txn/commands/acquire_pessimistic_lock.rs | 6 +- src/storage/txn/commands/mod.rs | 1 + src/storage/txn/scheduler.rs | 1 + tests/failpoints/cases/test_storage.rs | 2 + 10 files changed, 172 insertions(+), 18 deletions(-) diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index 61b81215438..ff994032dea 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -40,6 +40,7 @@ define_error_codes!( COMMIT_TS_TOO_LARGE => ("CommitTsTooLarge", "", ""), ASSERTION_FAILED => ("AssertionFailed", "", ""), + LOCK_IF_EXISTS_FAILED => ("LockIfExistsFailed", "", ""), UNKNOWN => ("Unknown", "", "") ); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 162a58b4801..0a7801848b9 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3120,6 +3120,7 @@ pub mod test_util { for_update_ts.next(), OldValues::default(), check_existence, + false, Context::default(), ) } @@ -7775,6 +7776,7 @@ mod tests { 21.into(), OldValues::default(), false, + false, Context::default(), ), expect_ok_callback(tx, 0), @@ -8465,6 +8467,7 @@ mod tests { 0.into(), OldValues::default(), false, + false, Default::default(), ), expect_ok_callback(tx.clone(), 0), @@ -8487,6 +8490,7 @@ mod tests { 0.into(), OldValues::default(), false, + false, Default::default(), ), expect_ok_callback(tx.clone(), 0), @@ -8714,6 +8718,7 @@ mod tests { TimeStamp::new(12), OldValues::default(), false, + false, Context::default(), ), pipelined_pessimistic_lock: true, @@ -8739,6 +8744,7 @@ mod tests { TimeStamp::new(12), OldValues::default(), false, + false, Context::default(), ), pipelined_pessimistic_lock: false, diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 
1a554a4410b..f787014fd01 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -158,6 +158,12 @@ pub enum ErrorInner { existing_commit_ts: TimeStamp, }, + #[error( + "Lock_only_if_exists of a pessimistic lock request is set to true, but return_value is not, start_ts:{}, key:{}", + .start_ts, log_wrappers::Value::key(.key) + )] + LockIfExistsFailed { start_ts: TimeStamp, key: Vec }, + #[error("{0:?}")] Other(#[from] Box), } @@ -276,6 +282,12 @@ impl ErrorInner { existing_start_ts: *existing_start_ts, existing_commit_ts: *existing_commit_ts, }), + ErrorInner::LockIfExistsFailed { start_ts, key } => { + Some(ErrorInner::LockIfExistsFailed { + start_ts: *start_ts, + key: key.clone(), + }) + } ErrorInner::Io(_) | ErrorInner::Other(_) => None, } } @@ -375,6 +387,7 @@ impl ErrorCodeExt for Error { } ErrorInner::CommitTsTooLarge { .. } => error_code::storage::COMMIT_TS_TOO_LARGE, ErrorInner::AssertionFailed { .. } => error_code::storage::ASSERTION_FAILED, + ErrorInner::LockIfExistsFailed { .. 
} => error_code::storage::LOCK_IF_EXISTS_FAILED, ErrorInner::Other(_) => error_code::storage::UNKNOWN, } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 2a43ac24583..eb83af270a1 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -856,6 +856,7 @@ pub mod tests { false, TimeStamp::zero(), true, + false, ) .unwrap(); self.write(txn.into_modifies()); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a9032d1b463..c02d8ef97c8 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -1050,6 +1050,7 @@ pub(crate) mod tests { false, false, TimeStamp::zero(), + false, ); } @@ -1316,7 +1317,7 @@ pub(crate) mod tests { // Simulate that min_commit_ts is pushed forward larger than latest_ts must_acquire_pessimistic_lock_impl( - &engine, b"key", b"key", 2, false, 20000, 2, false, false, 100, + &engine, b"key", b"key", 2, false, 20000, 2, false, false, 100, false, ); let snapshot = engine.snapshot(Default::default()).unwrap(); diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 9df4d9ebce9..7e30dcdd37c 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -36,11 +36,23 @@ pub fn acquire_pessimistic_lock( need_check_existence: bool, min_commit_ts: TimeStamp, need_old_value: bool, + lock_only_if_exists: bool, ) -> MvccResult<(Option, OldValue)> { fail_point!("acquire_pessimistic_lock", |err| Err( crate::storage::mvcc::txn::make_txn_error(err, &key, reader.start_ts).into() )); - + if lock_only_if_exists && !need_value { + error!( + "lock_only_if_exists of a pessimistic lock request is set to true, but return_value is not"; + "start_ts" => reader.start_ts, + "key" => log_wrappers::Value::key(key.as_encoded()), + ); + return Err(ErrorInner::LockIfExistsFailed { + start_ts: reader.start_ts, + key: key.into_raw()?, + } + .into()); + 
} // Update max_ts for Insert operation to guarantee linearizability and snapshot // isolation if should_not_exist { @@ -243,7 +255,12 @@ pub fn acquire_pessimistic_lock( for_update_ts, min_commit_ts, }; - txn.put_pessimistic_lock(key, lock); + + // When lock_only_if_exists is false, always accquire pessimitic lock, otherwise + // do it when val exists + if !lock_only_if_exists || val.is_some() { + txn.put_pessimistic_lock(key, lock); + } // TODO don't we need to commit the modifies in txn? Ok((ret_val(need_value, need_check_existence, val), old_value)) @@ -284,6 +301,7 @@ pub mod tests { need_value: bool, need_check_existence: bool, min_commit_ts: impl Into, + lock_only_if_exists: bool, ) -> Option { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -304,6 +322,7 @@ pub mod tests { need_check_existence, min_commit_ts, false, + lock_only_if_exists, ) .unwrap(); let modifies = txn.into_modifies(); @@ -331,6 +350,7 @@ pub mod tests { pk: &[u8], start_ts: impl Into, for_update_ts: impl Into, + lock_only_if_exists: bool, ) -> Option { must_succeed_impl( engine, @@ -343,6 +363,7 @@ pub mod tests { true, false, TimeStamp::zero(), + lock_only_if_exists, ) } @@ -366,6 +387,7 @@ pub mod tests { false, false, TimeStamp::zero(), + false, ) .is_none() ); @@ -392,6 +414,7 @@ pub mod tests { false, false, min_commit_ts, + false, ); } @@ -412,6 +435,7 @@ pub mod tests { false, false, TimeStamp::zero(), + false, ) } @@ -421,6 +445,7 @@ pub mod tests { pk: &[u8], start_ts: impl Into, for_update_ts: impl Into, + lock_only_if_exists: bool, ) -> MvccError { must_err_impl( engine, @@ -432,6 +457,7 @@ pub mod tests { true, false, TimeStamp::zero(), + lock_only_if_exists, ) } @@ -445,6 +471,7 @@ pub mod tests { need_value: bool, need_check_existence: bool, min_commit_ts: impl Into, + lock_only_if_exists: bool, ) -> MvccError { let snapshot = engine.snapshot(Default::default()).unwrap(); let min_commit_ts = min_commit_ts.into(); @@ -464,6 +491,7 
@@ pub mod tests { need_check_existence, min_commit_ts, false, + lock_only_if_exists, ) .unwrap_err() } @@ -737,25 +765,28 @@ pub mod tests { let engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); - assert_eq!(must_succeed_return_value(&engine, k, k, 10, 10), None); + assert_eq!( + must_succeed_return_value(&engine, k, k, 10, 10, false), + None + ); must_pessimistic_locked(&engine, k, 10, 10); pessimistic_rollback::tests::must_success(&engine, k, 10, 10); // Put must_prewrite_put(&engine, k, v, k, 10); // KeyIsLocked - match must_err_return_value(&engine, k, k, 20, 20) { + match must_err_return_value(&engine, k, k, 20, 20, false) { MvccError(box ErrorInner::KeyIsLocked(_)) => (), e => panic!("unexpected error: {}", e), }; must_commit(&engine, k, 10, 20); // WriteConflict - match must_err_return_value(&engine, k, k, 15, 15) { + match must_err_return_value(&engine, k, k, 15, 15, false) { MvccError(box ErrorInner::WriteConflict { .. }) => (), e => panic!("unexpected error: {}", e), }; assert_eq!( - must_succeed_return_value(&engine, k, k, 25, 25), + must_succeed_return_value(&engine, k, k, 25, 25, false), Some(v.to_vec()) ); must_pessimistic_locked(&engine, k, 25, 25); @@ -765,7 +796,7 @@ pub mod tests { must_prewrite_lock(&engine, k, k, 30); must_commit(&engine, k, 30, 40); assert_eq!( - must_succeed_return_value(&engine, k, k, 45, 45), + must_succeed_return_value(&engine, k, k, 45, 45, false), Some(v.to_vec()) ); must_pessimistic_locked(&engine, k, 45, 45); @@ -774,7 +805,7 @@ pub mod tests { // Skip Write::Rollback must_rollback(&engine, k, 50, false); assert_eq!( - must_succeed_return_value(&engine, k, k, 55, 55), + must_succeed_return_value(&engine, k, k, 55, 55, false), Some(v.to_vec()) ); must_pessimistic_locked(&engine, k, 55, 55); @@ -783,17 +814,99 @@ pub mod tests { // Delete must_prewrite_delete(&engine, k, k, 60); must_commit(&engine, k, 60, 70); - assert_eq!(must_succeed_return_value(&engine, k, k, 75, 75), None); + 
assert_eq!( + must_succeed_return_value(&engine, k, k, 75, 75, false), + None + ); // Duplicated command - assert_eq!(must_succeed_return_value(&engine, k, k, 75, 75), None); assert_eq!( - must_succeed_return_value(&engine, k, k, 75, 55), + must_succeed_return_value(&engine, k, k, 75, 75, false), + None + ); + assert_eq!( + must_succeed_return_value(&engine, k, k, 75, 55, false), Some(v.to_vec()) ); must_pessimistic_locked(&engine, k, 75, 75); pessimistic_rollback::tests::must_success(&engine, k, 75, 75); } + #[test] + fn test_pessimistic_lock_only_if_exists() { + let engine = TestEngineBuilder::new().build().unwrap(); + let (k, v) = (b"k", b"v"); + + // The key doesn't exist, no pessimistic lock is generated + assert_eq!(must_succeed_return_value(&engine, k, k, 10, 10, true), None); + must_unlocked(&engine, k); + + match must_err_impl( + &engine, + k, + k, + 10, + false, + 10, + false, + false, + TimeStamp::zero(), + true, + ) { + MvccError(box ErrorInner::LockIfExistsFailed { + start_ts: _, + key: _, + }) => (), + e => panic!("unexpected error: {}", e), + }; + + // Put the value, writecf: k_20_put_v + must_prewrite_put(&engine, k, v, k, 10); + must_commit(&engine, k, 10, 20); + // Pessimistic lock generated + assert_eq!( + must_succeed_return_value(&engine, k, k, 25, 25, true), + Some(v.to_vec()) + ); + must_pessimistic_locked(&engine, k, 25, 25); + pessimistic_rollback::tests::must_success(&engine, k, 25, 25); + + // Skip Write::Lock, WriteRecord: k_20_put_v k_40_lock + must_prewrite_lock(&engine, k, k, 30); + must_commit(&engine, k, 30, 40); + assert_eq!( + must_succeed_return_value(&engine, k, k, 45, 45, true), + Some(v.to_vec()) + ); + must_pessimistic_locked(&engine, k, 45, 45); + pessimistic_rollback::tests::must_success(&engine, k, 45, 45); + + // Skip Write::Rollback WriteRecord: k_20_put_v k_40_lock k_50_R + must_rollback(&engine, k, 50, false); + assert_eq!( + must_succeed_return_value(&engine, k, k, 55, 55, true), + Some(v.to_vec()) + ); + 
must_pessimistic_locked(&engine, k, 55, 55); + pessimistic_rollback::tests::must_success(&engine, k, 55, 55); + + // Delete WriteRecord: k_20_put_v k_40_lock k_50_R k_70_delete + must_prewrite_delete(&engine, k, k, 60); + must_commit(&engine, k, 60, 70); + assert_eq!(must_succeed_return_value(&engine, k, k, 75, 75, true), None); + must_unlocked(&engine, k); + + // Duplicated command + assert_eq!( + must_succeed_return_value(&engine, k, k, 75, 75, false), + None + ); + must_pessimistic_locked(&engine, k, 75, 75); + assert_eq!(must_succeed_return_value(&engine, k, k, 75, 85, true), None); + must_pessimistic_locked(&engine, k, 75, 85); + pessimistic_rollback::tests::must_success(&engine, k, 75, 85); + must_unlocked(&engine, k); + } + #[test] fn test_overwrite_pessimistic_lock() { let engine = TestEngineBuilder::new().build().unwrap(); @@ -889,23 +1002,25 @@ pub mod tests { // Test constraint check with `should_not_exist`. if expected_value.is_none() { assert!( - must_succeed_impl(&engine, key, key, 50, true, 0, 50, false, false, 51) + must_succeed_impl(&engine, key, key, 50, true, 0, 50, false, false, 51, false) .is_none() ); must_pessimistic_rollback(&engine, key, 50, 51); } else { - must_err_impl(&engine, key, key, 50, true, 50, false, false, 51); + must_err_impl(&engine, key, key, 50, true, 50, false, false, 51, false); } must_unlocked(&engine, key); // Test getting value. - let res = must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51); + let res = + must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51, false); assert_eq!(res, expected_value.map(|v| v.to_vec())); must_pessimistic_rollback(&engine, key, 50, 51); // Test getting value when already locked. 
must_succeed(&engine, key, key, 50, 51); - let res2 = must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51); + let res2 = + must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51, false); assert_eq!(res2, expected_value.map(|v| v.to_vec())); must_pessimistic_rollback(&engine, key, 50, 51); } @@ -939,6 +1054,7 @@ pub mod tests { *need_check_existence, min_commit_ts, need_old_value, + false, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -989,6 +1105,7 @@ pub mod tests { need_check_existence, min_commit_ts, need_old_value, + false, ) .unwrap(); assert_eq!( @@ -1022,6 +1139,7 @@ pub mod tests { false, min_commit_ts, true, + false, ) .unwrap(); assert_eq!( @@ -1064,6 +1182,7 @@ pub mod tests { *need_check_existence, min_commit_ts, need_old_value, + false, )?; Ok(old_value) }); @@ -1116,6 +1235,7 @@ pub mod tests { need_check_existence, min_commit_ts, need_old_value, + false, ) .unwrap_err(); @@ -1149,6 +1269,7 @@ pub mod tests { check_existence, min_commit_ts, need_old_value, + false, ) .unwrap_err(); } @@ -1221,6 +1342,7 @@ pub mod tests { need_value, need_check_existence, 0, + false, ); assert_eq!(value1, None); must_pessimistic_rollback(&engine, b"k1", start_ts, 30); @@ -1236,6 +1358,7 @@ pub mod tests { need_value, need_check_existence, 0, + false, ); assert_eq!(value2, expected_value(Some(b"v2"))); must_pessimistic_rollback(&engine, b"k2", start_ts, 30); @@ -1251,6 +1374,7 @@ pub mod tests { need_value, need_check_existence, 0, + false, ); assert_eq!(value3, None); must_pessimistic_rollback(&engine, b"k3", start_ts, 30); @@ -1266,6 +1390,7 @@ pub mod tests { need_value, need_check_existence, 0, + false, ); assert_eq!(value4, expected_value(Some(b"v4"))); must_pessimistic_rollback(&engine, b"k4", start_ts, 30); @@ -1281,6 +1406,7 @@ pub mod tests { need_value, need_check_existence, 0, + false, ); assert_eq!(value5, None); must_pessimistic_rollback(&engine, b"k5", start_ts, 30); diff --git 
a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 1db991f70eb..3632d847e59 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -26,8 +26,8 @@ command! { /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. AcquirePessimisticLock: cmd_ty => StorageResult, - display => "kv::command::acquirepessimisticlock keys({:?}) @ {} {} {} {:?} {} {} | {:?}", - (keys, start_ts, lock_ttl, for_update_ts, wait_timeout, min_commit_ts, check_existence, ctx), + display => "kv::command::acquirepessimisticlock keys({:?}) @ {} {} {} {:?} {} {} {} | {:?}", + (keys, start_ts, lock_ttl, for_update_ts, wait_timeout, min_commit_ts, check_existence, lock_only_if_exists, ctx), content => { /// The set of keys to lock. keys: Vec<(Key, bool)>, @@ -47,6 +47,7 @@ command! { min_commit_ts: TimeStamp, old_values: OldValues, check_existence: bool, + lock_only_if_exists: bool, } } @@ -110,6 +111,7 @@ impl WriteCommand for AcquirePessimisticLock self.check_existence, self.min_commit_ts, need_old_value, + self.lock_only_if_exists, ) { Ok((val, old_value)) => { if self.return_values || self.check_existence { diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index f4794d6a0db..a204ab4f30f 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -219,6 +219,7 @@ impl From for TypedCommand( 0.into(), OldValues::default(), false, + false, ctx.clone(), ), Box::new(move |r| tx.send(r).unwrap()), @@ -1005,6 +1006,7 @@ fn test_async_apply_prewrite_1pc_impl( 0.into(), OldValues::default(), false, + false, ctx.clone(), ), Box::new(move |r| tx.send(r).unwrap()), From 66edf9c2f62a9741eba596c8b687801418ed589a Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 30 Aug 2022 15:44:23 +0800 Subject: [PATCH 178/676] raftstore: prettify snapshot build flow (#13377) ref tikv/tikv#12876 
prettify snapshot build flow Signed-off-by: Connor1996 --- .../raftstore/src/store/peer_storage.rs | 107 ++++---- components/raftstore/src/store/snap.rs | 239 +++++------------- .../raftstore/src/store/worker/region.rs | 38 ++- components/test_raftstore/src/node.rs | 6 +- src/server/snap.rs | 4 +- 5 files changed, 138 insertions(+), 256 deletions(-) diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index cf70234c841..c99b7644321 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -33,7 +33,7 @@ use tikv_util::{ box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, }; -use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager, SnapshotStatistics}; +use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager}; use crate::{ store::{ async_io::write::WriteTask, entry_storage::EntryStorage, fsm::GenSnapTask, @@ -439,7 +439,8 @@ where let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); - let (mut tried, mut last_canceled, mut snap) = (false, false, None); + let mut tried = false; + let mut last_canceled = false; if let SnapState::Generating { ref canceled, ref receiver, @@ -450,24 +451,19 @@ where last_canceled = canceled.load(Ordering::SeqCst); match receiver.try_recv() { Err(TryRecvError::Empty) => { - let e = raft::StorageError::SnapshotTemporarilyUnavailable; - return Err(raft::Error::Store(e)); + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); } - Ok(s) if !last_canceled => snap = Some(s), - Err(TryRecvError::Disconnected) | Ok(_) => {} - } - } - - if tried { - *snap_state = SnapState::Relax; - match snap { - Some(s) => { + Ok(s) if !last_canceled => { + *snap_state = SnapState::Relax; *tried_cnt = 0; if self.validate_snap(&s, request_index) { return Ok(s); } } - None => { + 
Err(TryRecvError::Disconnected) | Ok(_) => { + *snap_state = SnapState::Relax; warn!( "failed to try generating snapshot"; "region_id" => self.region.get_id(), @@ -491,6 +487,9 @@ where cnt ))); } + if !tried || !last_canceled { + *tried_cnt += 1; + } info!( "requesting snapshot"; @@ -500,10 +499,6 @@ where "request_peer" => to, ); - if !tried || !last_canceled { - *tried_cnt += 1; - } - let (sender, receiver) = mpsc::sync_channel(1); let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); @@ -512,11 +507,15 @@ where index: index.clone(), receiver, }; - let mut to_store_id = 0; - if let Some(peer) = self.region().get_peers().iter().find(|p| p.id == to) { - to_store_id = peer.store_id; - } - let task = GenSnapTask::new(self.region.get_id(), index, canceled, sender, to_store_id); + + let store_id = self + .region() + .get_peers() + .iter() + .find(|p| p.id == to) + .map(|p| p.store_id) + .unwrap_or(0); + let task = GenSnapTask::new(self.region.get_id(), index, canceled, sender, store_id); let mut gen_snap_task = self.gen_snap_task.borrow_mut(); assert!(gen_snap_task.is_none()); @@ -1000,18 +999,14 @@ where "region_id" => region_id, ); - let msg = kv_snap + let apply_state: RaftApplyState = kv_snap .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .map_err(into_other::<_, raft::Error>)?; - let apply_state: RaftApplyState = match msg { - None => { - return Err(storage_error(format!( - "could not load raft state of region {}", - region_id - ))); - } - Some(state) => state, - }; + .map_err(into_other::<_, raft::Error>) + .and_then(|v| { + v.ok_or_else(|| { + storage_error(format!("could not load raft state of region {}", region_id)) + }) + })?; assert_eq!(apply_state, last_applied_state); let key = SnapKey::new( @@ -1019,19 +1014,18 @@ where last_applied_term, apply_state.get_applied_index(), ); - mgr.register(key.clone(), SnapEntry::Generating); defer!(mgr.deregister(&key, &SnapEntry::Generating)); - let state: RegionLocalState 
= kv_snap + let region_state: RegionLocalState = kv_snap .get_msg_cf(CF_RAFT, &keys::region_state_key(key.region_id)) - .and_then(|res| match res { - None => Err(box_err!("region {} could not find region info", region_id)), - Some(state) => Ok(state), - }) - .map_err(into_other::<_, raft::Error>)?; - - if state.get_state() != PeerState::Normal { + .map_err(into_other::<_, raft::Error>) + .and_then(|v| { + v.ok_or_else(|| { + storage_error(format!("region {} could not find region info", region_id)) + }) + })?; + if region_state.get_state() != PeerState::Normal { return Err(storage_error(format!( "snap job for {} seems stale, skip.", region_id @@ -1039,33 +1033,22 @@ where } let mut snapshot = Snapshot::default(); - // Set snapshot metadata. snapshot.mut_metadata().set_index(key.idx); snapshot.mut_metadata().set_term(key.term); - - let conf_state = util::conf_state_from_region(state.get_region()); - snapshot.mut_metadata().set_conf_state(conf_state); - - let mut s = mgr.get_snapshot_for_building(&key)?; + snapshot + .mut_metadata() + .set_conf_state(util::conf_state_from_region(region_state.get_region())); // Set snapshot data. 
- let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(state.get_region().clone()); - let mut stat = SnapshotStatistics::new(); - s.build( + let mut s = mgr.get_snapshot_for_building(&key)?; + let snap_data = s.build( engine, &kv_snap, - state.get_region(), - &mut snap_data, - &mut stat, + region_state.get_region(), allow_multi_files_snapshot, + for_balance, )?; - snap_data.mut_meta().set_for_balance(for_balance); - let v = snap_data.write_to_bytes()?; - snapshot.set_data(v.into()); - - SNAPSHOT_KV_COUNT_HISTOGRAM.observe(stat.kv_count as f64); - SNAPSHOT_SIZE_HISTOGRAM.observe(stat.size as f64); + snapshot.set_data(snap_data.write_to_bytes()?.into()); Ok(snapshot) } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 74cfd5ab0d6..8b063e9e1f0 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -41,13 +41,7 @@ use tikv_util::{ use crate::{ coprocessor::CoprocessorHost, - store::{ - metrics::{ - CfNames, INGEST_SST_DURATION_SECONDS, SNAPSHOT_BUILD_TIME_HISTOGRAM, - SNAPSHOT_CF_KV_COUNT, SNAPSHOT_CF_SIZE, - }, - peer_storage::JOB_STATUS_CANCELLING, - }, + store::{metrics::*, peer_storage::JOB_STATUS_CANCELLING}, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -211,7 +205,7 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile]) -> RaftStoreResult { +fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -239,6 +233,7 @@ fn gen_snapshot_meta(cf_files: &[CfFile]) -> RaftStoreResult { } let mut snapshot_meta = SnapshotMeta::default(); snapshot_meta.set_cf_files(meta.into()); + snapshot_meta.set_for_balance(for_balance); Ok(snapshot_meta) } @@ -424,7 +419,7 @@ impl CfFile { #[derive(Default)] struct MetaFile { - pub 
meta: SnapshotMeta, + pub meta: Option, pub path: PathBuf, pub file: Option, @@ -735,7 +730,7 @@ impl Snapshot { } } } - self.meta_file.meta = snapshot_meta; + self.meta_file.meta = Some(snapshot_meta); Ok(()) } @@ -754,7 +749,7 @@ impl Snapshot { } pub fn load_snapshot_meta_if_necessary(&mut self) -> RaftStoreResult<()> { - if self.meta_file.meta.get_cf_files().is_empty() && file_exists(&self.meta_file.path) { + if self.meta_file.meta.is_none() && file_exists(&self.meta_file.path) { return self.load_snapshot_meta(); } Ok(()) @@ -817,7 +812,7 @@ impl Snapshot { // Only called in `do_build`. fn save_meta_file(&mut self) -> RaftStoreResult<()> { - let v = box_try!(self.meta_file.meta.write_to_bytes()); + let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot // exists so no temporary meta file is created, and this field is @@ -843,8 +838,8 @@ impl Snapshot { engine: &EK, kv_snap: &EK::Snapshot, region: &Region, - stat: &mut SnapshotStatistics, allow_multi_files_snapshot: bool, + for_balance: bool, ) -> RaftStoreResult<()> where EK: KvEngine, @@ -925,10 +920,8 @@ impl Snapshot { ); } - stat.kv_count = self.cf_files.iter().map(|cf| cf.kv_count as usize).sum(); // save snapshot meta to meta file - let snapshot_meta = gen_snapshot_meta(&self.cf_files[..])?; - self.meta_file.meta = snapshot_meta; + self.meta_file.meta = Some(gen_snapshot_meta(&self.cf_files[..], for_balance)?); self.save_meta_file()?; Ok(()) } @@ -1031,31 +1024,41 @@ impl Snapshot { engine: &EK, kv_snap: &EK::Snapshot, region: &Region, - snap_data: &mut RaftSnapshotData, - stat: &mut SnapshotStatistics, allow_multi_files_snapshot: bool, - ) -> RaftStoreResult<()> { + for_balance: bool, + ) -> RaftStoreResult { + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region.clone()); + let t = Instant::now(); - self.do_build::(engine, kv_snap, 
region, stat, allow_multi_files_snapshot)?; + self.do_build::( + engine, + kv_snap, + region, + allow_multi_files_snapshot, + for_balance, + )?; - let total_size = self.total_size()?; - stat.size = total_size; + let total_size = self.total_size(); + let total_count = self.total_count(); // set snapshot meta data snap_data.set_file_size(total_size); snap_data.set_version(SNAPSHOT_VERSION); - snap_data.set_meta(self.meta_file.meta.clone()); + snap_data.set_meta(self.meta_file.meta.as_ref().unwrap().clone()); SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed()) as f64); + SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_count as f64); + SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); info!( "scan snapshot"; "region_id" => region.get_id(), "snapshot" => self.path(), - "key_count" => stat.kv_count, + "key_count" => total_count, "size" => total_size, "takes" => ?t.saturating_elapsed(), ); - Ok(()) + Ok(snap_data) } pub fn apply(&mut self, options: ApplyOptions) -> Result<()> { @@ -1119,11 +1122,15 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } - pub fn total_size(&self) -> io::Result { - Ok(self - .cf_files + pub fn total_size(&self) -> u64 { + self.cf_files .iter() - .fold(0, |acc, x| acc + x.size.iter().sum::())) + .map(|cf| cf.size.iter().sum::()) + .sum() + } + + pub fn total_count(&self) -> u64 { + self.cf_files.iter().map(|cf| cf.kv_count).sum() } pub fn save(&mut self) -> io::Result<()> { @@ -1182,7 +1189,7 @@ impl Snapshot { sync_dir(&self.dir_path)?; // write meta file - let v = self.meta_file.meta.write_to_bytes()?; + let v = self.meta_file.meta.as_ref().unwrap().write_to_bytes()?; { let mut meta_file = self.meta_file.file.take().unwrap(); meta_file.write_all(&v[..])?; @@ -1560,19 +1567,17 @@ impl SnapManager { Ok(Box::new(s)) } - /// Get a `Snapshot` can be used for writting and then `save`. Concurrent + /// Get a `Snapshot` can be used for writing and then `save`. 
Concurrent /// calls are allowed because only one caller can lock temporary disk /// files. pub fn get_snapshot_for_receiving( &self, key: &SnapKey, - data: &[u8], + snapshot_meta: SnapshotMeta, ) -> RaftStoreResult> { let _lock = self.core.registry.rl(); - let mut snapshot_data = RaftSnapshotData::default(); - snapshot_data.merge_from_bytes(data)?; let base = &self.core.base; - let f = Snapshot::new_for_receiving(base, key, &self.core, snapshot_data.take_meta())?; + let f = Snapshot::new_for_receiving(base, key, &self.core, snapshot_meta)?; Ok(Box::new(f)) } @@ -1902,7 +1907,7 @@ pub mod tests { use kvproto::{ encryptionpb::EncryptionMethod, metapb::{Peer, Region}, - raft_serverpb::{RaftApplyState, RaftSnapshotData, RegionLocalState, SnapshotMeta}, + raft_serverpb::{RaftApplyState, RegionLocalState, SnapshotMeta}, }; use protobuf::Message; use raft::eraftpb::Entry; @@ -2024,7 +2029,7 @@ pub mod tests { Ok(Engines::new(kv, raft)) } - pub fn get_kv_count(snap: &impl EngineSnapshot) -> usize { + pub fn get_kv_count(snap: &impl EngineSnapshot) -> u64 { let mut kv_count = 0; for cf in SNAPSHOT_CFS { snap.scan( @@ -2139,7 +2144,7 @@ pub mod tests { }; cf_file.push(f); } - let meta = super::gen_snapshot_meta(&cf_file).unwrap(); + let meta = super::gen_snapshot_meta(&cf_file, false).unwrap(); let cf_files = meta.get_cf_files(); assert_eq!(cf_files.len(), super::SNAPSHOT_CFS.len() * 2); // each CF has two snapshot files; for (i, cf_file_meta) in meta.get_cf_files().iter().enumerate() { @@ -2221,28 +2226,14 @@ pub mod tests { assert!(!s1.exists()); assert_eq!(mgr_core.get_total_snap_size().unwrap(), 0); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); // Ensure that this snapshot file does exist after 
being built. assert!(s1.exists()); - let total_size = s1.total_size().unwrap(); + let size = s1.total_size(); // Ensure the `size_track` is modified correctly. - let size = mgr_core.get_total_snap_size().unwrap(); - assert_eq!(size, total_size); - assert_eq!(stat.size as u64, size); - assert_eq!(stat.kv_count, get_kv_count(&snapshot)); + assert_eq!(size, mgr_core.get_total_snap_size().unwrap()); + assert_eq!(s1.total_count(), get_kv_count(&snapshot)); // Ensure this snapshot could be read for sending. let mut s2 = Snapshot::new_for_sending(src_dir.path(), &key, &mgr_core).unwrap(); @@ -2335,34 +2326,13 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s1.exists()); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s2.exists()); } @@ -2505,19 +2475,7 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s1.exists()); corrupt_snapshot_size_in(dir.path()); @@ -2526,16 +2484,7 @@ pub mod tests { let mut s2 = Snapshot::new_for_building(dir.path(), &key, 
&mgr_core).unwrap(); assert!(!s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let snap_data = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2596,19 +2545,7 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s1.exists()); assert_eq!(1, corrupt_snapshot_meta_file(dir.path())); @@ -2617,16 +2554,7 @@ pub mod tests { let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); - Snapshot::build::( - &mut s2, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let mut snap_data = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2688,21 +2616,9 @@ pub mod tests { let mgr_core = create_manager_core(&path, u64::MAX); let mut s1 = Snapshot::new_for_building(&path, &key1, &mgr_core).unwrap(); let mut region = gen_test_region(1, 1, 1); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - Snapshot::build::( - &mut s1, - &db, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); let mut s = Snapshot::new_for_sending(&path, &key1, &mgr_core).unwrap(); - let expected_size = s.total_size().unwrap(); + let expected_size = s.total_size(); let mut s2 = Snapshot::new_for_receiving(&path, &key1, &mgr_core, snap_data.get_meta().clone()) 
.unwrap(); @@ -2772,19 +2688,14 @@ pub mod tests { // Ensure the snapshot being built will not be deleted on GC. src_mgr.register(key.clone(), SnapEntry::Generating); let mut s1 = src_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - s1.build(&db, &snapshot, ®ion, &mut snap_data, &mut stat, true) - .unwrap(); - let v = snap_data.write_to_bytes().unwrap(); + let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); check_registry_around_deregister(&src_mgr, &key, &SnapEntry::Generating); // Ensure the snapshot being sent will not be deleted on GC. src_mgr.register(key.clone(), SnapEntry::Sending); let mut s2 = src_mgr.get_snapshot_for_sending(&key).unwrap(); - let expected_size = s2.total_size().unwrap(); + let expected_size = s2.total_size(); let dst_temp_dir = Builder::new() .prefix("test-snap-deletion-on-registry-dst") @@ -2796,7 +2707,9 @@ pub mod tests { // Ensure the snapshot being received will not be deleted on GC. dst_mgr.register(key.clone(), SnapEntry::Receiving); - let mut s3 = dst_mgr.get_snapshot_for_receiving(&key, &v[..]).unwrap(); + let mut s3 = dst_mgr + .get_snapshot_for_receiving(&key, snap_data.take_meta()) + .unwrap(); let n = io::copy(&mut s2, &mut s3).unwrap(); assert_eq!(n, expected_size); s3.save().unwrap(); @@ -2850,20 +2763,16 @@ pub mod tests { // Add an oldest snapshot for receiving. 
let recv_key = SnapKey::new(100, 100, 100); - let recv_head = { - let mut stat = SnapshotStatistics::new(); - let mut snap_data = RaftSnapshotData::default(); + let mut recv_head = { let mut s = snap_mgr.get_snapshot_for_building(&recv_key).unwrap(); s.build( &engine.kv, &snapshot, &gen_test_region(100, 1, 1), - &mut snap_data, - &mut stat, true, + false, ) - .unwrap(); - snap_data.write_to_bytes().unwrap() + .unwrap() }; let recv_remain = { let mut data = Vec::with_capacity(1024); @@ -2873,7 +2782,7 @@ pub mod tests { data }; let mut s = snap_mgr - .get_snapshot_for_receiving(&recv_key, &recv_head) + .get_snapshot_for_receiving(&recv_key, recv_head.take_meta()) .unwrap(); s.write_all(&recv_remain).unwrap(); s.save().unwrap(); @@ -2884,17 +2793,9 @@ pub mod tests { let key = SnapKey::new(region_id, 1, 1); let region = gen_test_region(region_id, 1, 1); let mut s = snap_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - let mut stat = SnapshotStatistics::new(); - s.build( - &engine.kv, - &snapshot, - ®ion, - &mut snap_data, - &mut stat, - true, - ) - .unwrap(); + let _ = s + .build(&engine.kv, &snapshot, ®ion, true, false) + .unwrap(); // The first snap_size is for region 100. // That snapshot won't be deleted because it's not for generating. @@ -2963,11 +2864,7 @@ pub mod tests { // correctly. 
for _ in 0..2 { let mut s1 = snap_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = RaftSnapshotData::default(); - snap_data.set_region(region.clone()); - let mut stat = SnapshotStatistics::new(); - s1.build(&db, &snapshot, ®ion, &mut snap_data, &mut stat, true) - .unwrap(); + let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); assert!(snap_mgr.delete_snapshot(&key, &s1, false)); } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index f167a2c90bf..244ca514924 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -383,14 +383,14 @@ where info!("begin apply snap data"; "region_id" => region_id, "peer_id" => peer_id); fail_point!("region_apply_snap", |_| { Ok(()) }); check_abort(&abort)?; - let region_key = keys::region_state_key(region_id); - let mut region_state = self.region_state(region_id)?; - // clear up origin data. + let mut region_state = self.region_state(region_id)?; let region = region_state.get_region().clone(); let start_key = keys::enc_start_key(®ion); let end_key = keys::enc_end_key(®ion); check_abort(&abort)?; + + // clear up origin data. let overlap_ranges = self .pending_delete_ranges .drain_overlap_ranges(&start_key, &end_key); @@ -404,8 +404,8 @@ where check_abort(&abort)?; fail_point!("apply_snap_cleanup_range"); + // apply snapshot let apply_state = self.apply_state(region_id)?; - let term = apply_state.get_truncated_state().get_term(); let idx = apply_state.get_truncated_state().get_index(); let snap_key = SnapKey::new(region_id, term, idx); @@ -430,9 +430,10 @@ where self.coprocessor_host .post_apply_snapshot(®ion, peer_id, &snap_key, Some(&s)); + // delete snapshot state. 
let mut wb = self.engine.write_batch(); region_state.set_state(PeerState::Normal); - box_try!(wb.put_msg_cf(CF_RAFT, ®ion_key, ®ion_state)); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)); box_try!(wb.delete_cf(CF_RAFT, &keys::snapshot_raft_state_key(region_id))); wb.write().unwrap_or_else(|e| { panic!("{} failed to save apply_snap result: {:?}", region_id, e); @@ -455,8 +456,7 @@ where Ordering::SeqCst, ); SNAP_COUNTER.apply.all.inc(); - // let apply_histogram = SNAP_HISTOGRAM.with_label_values(&["apply"]); - // let timer = apply_histogram.start_coarse_timer(); + let start = Instant::now(); match self.apply_snap(region_id, peer_id, Arc::clone(&status)) { @@ -892,8 +892,9 @@ mod tests { RaftEngineReadOnly, SyncMutable, WriteBatch, WriteBatchExt, CF_DEFAULT, CF_WRITE, }; use keys::data_key; - use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; + use kvproto::raft_serverpb::{PeerState, RaftApplyState, RaftSnapshotData, RegionLocalState}; use pd_client::RpcClient; + use protobuf::Message; use tempfile::Builder; use tikv_util::worker::{LazyWorker, Worker}; @@ -1148,11 +1149,14 @@ mod tests { } msg => panic!("expected SnapshotGenerated, but got {:?}", msg), } - let data = s1.get_data(); + let mut data = RaftSnapshotData::default(); + data.merge_from_bytes(s1.get_data()).unwrap(); let key = SnapKey::from_snap(&s1).unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); let mut s2 = mgr.get_snapshot_for_sending(&key).unwrap(); - let mut s3 = mgr.get_snapshot_for_receiving(&key, data).unwrap(); + let mut s3 = mgr + .get_snapshot_for_receiving(&key, data.take_meta()) + .unwrap(); io::copy(&mut s2, &mut s3).unwrap(); s3.save().unwrap(); @@ -1372,11 +1376,8 @@ mod tests { key: &crate::store::SnapKey, snapshot: Option<&crate::store::Snapshot>, ) { - let code = snapshot.unwrap().total_size().unwrap() - + key.term - + key.region_id - + key.idx - + peer_id; + let code = + snapshot.unwrap().total_size() + 
key.term + key.region_id + key.idx + peer_id; self.pre_apply_count.fetch_add(1, Ordering::SeqCst); self.pre_apply_hash .fetch_add(code as usize, Ordering::SeqCst); @@ -1389,11 +1390,8 @@ mod tests { key: &crate::store::SnapKey, snapshot: Option<&crate::store::Snapshot>, ) { - let code = snapshot.unwrap().total_size().unwrap() - + key.term - + key.region_id - + key.idx - + peer_id; + let code = + snapshot.unwrap().total_size() + key.term + key.region_id + key.idx + peer_id; self.post_apply_count.fetch_add(1, Ordering::SeqCst); self.post_apply_hash .fetch_add(code as usize, Ordering::SeqCst); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index be361db3185..11a5dda87bd 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -17,6 +17,7 @@ use kvproto::{ raft_cmdpb::*, raft_serverpb::{self, RaftMessage}, }; +use protobuf::Message; use raft::{eraftpb::MessageType, SnapshotStatus}; use raftstore::{ coprocessor::{config::SplitCheckConfigManager, CoprocessorHost}, @@ -94,7 +95,10 @@ impl Transport for ChannelTransport { Some(p) => { p.0.register(key.clone(), SnapEntry::Receiving); let data = msg.get_message().get_snapshot().get_data(); - p.0.get_snapshot_for_receiving(&key, data).unwrap() + let mut snapshot_data = raft_serverpb::RaftSnapshotData::default(); + snapshot_data.merge_from_bytes(data).unwrap(); + p.0.get_snapshot_for_receiving(&key, snapshot_data.take_meta()) + .unwrap() } None => return Err(box_err!("missing temp dir for store {}", to_store)), }; diff --git a/src/server/snap.rs b/src/server/snap.rs index e88fbd21fc9..b651d2d0c82 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -150,7 +150,7 @@ pub fn send_snap( if !s.exists() { return Err(box_err!("missing snap file: {:?}", s.path())); } - let total_size = s.total_size()?; + let total_size = s.total_size(); let mut chunks = { let mut first_chunk = SnapshotChunk::default(); @@ -234,7 +234,7 @@ impl RecvSnapContext { 
let _with_io_type = WithIoType::new(io_type); let snap = { - let s = match snap_mgr.get_snapshot_for_receiving(&key, data) { + let s = match snap_mgr.get_snapshot_for_receiving(&key, snapshot.take_meta()) { Ok(s) => s, Err(e) => return Err(box_err!("{} failed to create snapshot file: {:?}", key, e)), }; From aed265824757fe74287a80e3b36e72da0ceee5ee Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 30 Aug 2022 07:50:23 -0700 Subject: [PATCH 179/676] *: move RegionMeta to raftstore (#13335) ref tikv/tikv#12842, ref tikv/tikv#13334 This PR moves `RegionMeta` to raftstore module, so that both v1 and v2 can use it for debugging. Several fields are re-arranged to be more concise and more informations. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/query/mod.rs | 16 +- components/raftstore-v2/src/router/mod.rs | 4 +- .../src/router/response_channel.rs | 162 ++++++++---------- components/raftstore/src/store/fsm/peer.rs | 61 ++++--- components/raftstore/src/store/mod.rs | 8 +- components/raftstore/src/store/msg.rs | 6 +- components/raftstore/src/store/peer.rs | 11 -- .../raftstore/src/store}/region_meta.rs | 70 +++++--- src/server/status_server/mod.rs | 5 +- tests/integrations/server/status_server.rs | 6 +- 10 files changed, 173 insertions(+), 176 deletions(-) rename {src/server/status_server => components/raftstore/src/store}/region_meta.rs (81%) diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index ff03117419b..bb8467fbc5c 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -12,7 +12,7 @@ use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse, StatusCmdType}; use raftstore::{ - store::{cmd_resp, util, ReadCallback}, + store::{cmd_resp, region_meta::RegionMeta, util, GroupState, ReadCallback}, Error, Result, }; use tikv_util::box_err; @@ -20,7 +20,7 @@ 
use tikv_util::box_err; use crate::{ fsm::PeerFsmDelegate, raft::Peer, - router::{QueryResChannel, QueryResult}, + router::{DebugInfoChannel, QueryResChannel, QueryResult}, }; mod local; @@ -79,4 +79,16 @@ impl Peer { cmd_resp::bind_term(resp, self.term()); Ok(()) } + + /// Query internal states for debugging purpose. + pub fn on_query_debug_info(&self, ch: DebugInfoChannel) { + let entry_storage = self.storage().entry_storage(); + let meta = RegionMeta::new( + self.storage().region_state(), + entry_storage.apply_state(), + GroupState::Ordered, + self.raft_group().status(), + ); + ch.set_result(meta); + } } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 17250833168..8c1ba338642 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -9,5 +9,7 @@ pub(crate) use self::internal_message::ApplyTask; pub use self::{ internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, - response_channel::{CmdResChannel, QueryResChannel, QueryResult}, + response_channel::{ + CmdResChannel, DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, + }, }; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index e87095215b8..2e0908aa7d0 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -16,7 +16,7 @@ use std::{ cell::UnsafeCell, fmt, future::Future, - mem::{self, ManuallyDrop}, + mem, pin::Pin, sync::{ atomic::{AtomicU64, Ordering}, @@ -29,7 +29,8 @@ use engine_traits::Snapshot; use futures::task::AtomicWaker; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{ - local_metrics::TimeTracker, msg::ErrorCallback, ReadCallback, RegionSnapshot, WriteCallback, + local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, 
ReadCallback, + RegionSnapshot, WriteCallback, }; use smallvec::SmallVec; use tikv_util::memory::HeapSize; @@ -69,6 +70,17 @@ const fn fired_bit_of(event: u64) -> u64 { 1 << (event * 2 + 1) } +impl Default for EventCore { + #[inline] + fn default() -> Self { + Self { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + waker: AtomicWaker::new(), + } + } +} + impl EventCore { #[inline] fn notify_event(&self, event: u64) { @@ -201,10 +213,54 @@ impl<'a, Res> Future for WaitResult<'a, Res> { } } -pub struct CmdResSubscriber { - core: Arc>, +/// A base subscriber that contains most common implementation of subscribers. +pub struct BaseSubscriber { + core: Arc>, } +impl BaseSubscriber { + /// Wait for the result. + #[inline] + pub async fn result(mut self) -> Option { + WaitResult { core: &self.core }.await + } +} + +unsafe impl Send for BaseSubscriber {} +unsafe impl Sync for BaseSubscriber {} + +/// A base channel that contains most common implementation of channels. +pub struct BaseChannel { + core: Arc>, +} + +impl BaseChannel { + /// Creates a pair of channel and subscriber. + #[inline] + pub fn pair() -> (Self, BaseSubscriber) { + let core: Arc> = Arc::default(); + (Self { core: core.clone() }, BaseSubscriber { core }) + } + + /// Sets the final result. 
+ #[inline] + pub fn set_result(mut self, res: Res) { + self.core.set_result(res); + } +} + +impl Drop for BaseChannel { + #[inline] + fn drop(&mut self) { + self.core.cancel(); + } +} + +unsafe impl Send for BaseChannel {} +unsafe impl Sync for BaseChannel {} + +pub type CmdResSubscriber = BaseSubscriber; + impl CmdResSubscriber { pub async fn wait_proposed(&mut self) -> bool { WaitEvent { @@ -221,38 +277,14 @@ impl CmdResSubscriber { } .await } - - pub async fn result(mut self) -> Option { - WaitResult { core: &self.core }.await - } } -unsafe impl Send for CmdResSubscriber {} -unsafe impl Sync for CmdResSubscriber {} - -pub struct CmdResChannel { - core: ManuallyDrop>>, -} +pub type CmdResChannel = BaseChannel; impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; const COMMITTED_EVENT: u64 = 2; - - #[inline] - pub fn pair() -> (Self, CmdResSubscriber) { - let core = Arc::new(EventCore { - event: AtomicU64::new(0), - res: UnsafeCell::new(None), - waker: AtomicWaker::new(), - }); - ( - Self { - core: ManuallyDrop::new(core.clone()), - }, - CmdResSubscriber { core }, - ) - } } impl ErrorCallback for CmdResChannel { @@ -294,27 +326,10 @@ impl WriteCallback for CmdResChannel { // TODO: support executing hooks inside setting result. #[inline] fn set_result(mut self, res: RaftCmdResponse) { - self.core.set_result(res); - unsafe { - ManuallyDrop::drop(&mut self.core); - } - mem::forget(self); - } -} - -impl Drop for CmdResChannel { - #[inline] - fn drop(&mut self) { - self.core.cancel(); - unsafe { - ManuallyDrop::drop(&mut self.core); - } + self.set_result(res); } } -unsafe impl Send for CmdResChannel {} -unsafe impl Sync for CmdResChannel {} - /// Response for Read. 
/// /// Unlike v1, snapshot are always taken in LocalReader, hence snapshot doesn't @@ -351,25 +366,7 @@ impl QueryResult { } } -pub struct QueryResChannel { - core: ManuallyDrop>>, -} - -impl QueryResChannel { - pub fn pair() -> (Self, QueryResSubscriber) { - let core = Arc::new(EventCore { - event: AtomicU64::new(0), - res: UnsafeCell::new(None), - waker: AtomicWaker::new(), - }); - ( - Self { - core: ManuallyDrop::new(core.clone()), - }, - QueryResSubscriber { core }, - ) - } -} +pub type QueryResChannel = BaseChannel; impl ErrorCallback for QueryResChannel { #[inline] @@ -388,11 +385,7 @@ impl ReadCallback for QueryResChannel { #[inline] fn set_result(mut self, res: QueryResult) { - self.core.set_result(res); - unsafe { - ManuallyDrop::drop(&mut self.core); - } - mem::forget(self); + self.set_result(res); } fn read_tracker(&self) -> Option<&TrackerToken> { @@ -400,31 +393,10 @@ impl ReadCallback for QueryResChannel { } } -impl Drop for QueryResChannel { - #[inline] - fn drop(&mut self) { - self.core.cancel(); - unsafe { - ManuallyDrop::drop(&mut self.core); - } - } -} - -unsafe impl Send for QueryResChannel {} -unsafe impl Sync for QueryResChannel {} - -pub struct QueryResSubscriber { - core: Arc>, -} - -impl QueryResSubscriber { - pub async fn result(mut self) -> Option { - WaitResult { core: &self.core }.await - } -} +pub type QueryResSubscriber = BaseSubscriber; -unsafe impl Send for QueryResSubscriber {} -unsafe impl Sync for QueryResSubscriber {} +pub type DebugInfoChannel = BaseChannel; +pub type DebugInfoSubscriber = BaseSubscriber; #[cfg(test)] mod tests { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 5497d2ad1d9..c587ea5f32c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -33,8 +33,8 @@ use kvproto::{ StatusCmdType, StatusResponse, }, raft_serverpb::{ - ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftApplyState, 
RaftMessage, - RaftSnapshotData, RaftTruncatedState, RegionLocalState, + ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftMessage, RaftSnapshotData, + RaftTruncatedState, RegionLocalState, }, replication_modepb::{DrAutoSyncState, ReplicationMode}, }; @@ -84,6 +84,7 @@ use crate::{ UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, + region_meta::RegionMeta, transport::Transport, util, util::{is_learner, KeysInfoFormatter, LeaseState}, @@ -92,8 +93,8 @@ use crate::{ GcSnapshotTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }, - AbstractPeer, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, - PeerTick, ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, + CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, + ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, @@ -1001,7 +1002,24 @@ where CasualMessage::ForceCompactRaftLogs => { self.on_raft_gc_log_tick(true); } - CasualMessage::AccessPeer(cb) => cb(self.fsm as &mut dyn AbstractPeer), + CasualMessage::AccessPeer(cb) => { + let peer = &self.fsm.peer; + let store = peer.get_store(); + let mut local_state = RegionLocalState::default(); + local_state.set_region(store.region().clone()); + if let Some(s) = &peer.pending_merge_state { + local_state.set_merge_state(s.clone()); + } + if store.is_applying_snapshot() { + local_state.set_state(PeerState::Applying); + } + cb(RegionMeta::new( + &local_state, + store.apply_state(), + self.fsm.hibernate_state.group_state(), + peer.raft_group.status(), + )) + } CasualMessage::QueryRegionLeaderResp { region, leader } => { // the leader already updated if self.fsm.peer.raft_group.raft.leader_id != raft::INVALID_ID @@ -3754,8 +3772,13 @@ where // New peer derive write flow from parent 
region, // this will be used by balance write flow. new_peer.peer.peer_stat = self.fsm.peer.peer_stat.clone(); - new_peer.peer.last_compacted_idx = - new_peer.apply_state().get_truncated_state().get_index() + 1; + new_peer.peer.last_compacted_idx = new_peer + .peer + .get_store() + .apply_state() + .get_truncated_state() + .get_index() + + 1; let campaigned = new_peer.peer.maybe_campaign(is_leader); new_peer.has_ready |= campaigned; @@ -6220,30 +6243,6 @@ where } } -impl AbstractPeer for PeerFsm { - fn meta_peer(&self) -> &metapb::Peer { - &self.peer.peer - } - fn group_state(&self) -> GroupState { - self.hibernate_state.group_state() - } - fn region(&self) -> &metapb::Region { - self.peer.raft_group.store().region() - } - fn apply_state(&self) -> &RaftApplyState { - self.peer.raft_group.store().apply_state() - } - fn raft_status(&self) -> raft::Status<'_> { - self.peer.raft_group.status() - } - fn raft_commit_index(&self) -> u64 { - self.peer.raft_group.store().commit_index() - } - fn pending_merge_state(&self) -> Option<&MergeState> { - self.peer.pending_merge_state.as_ref() - } -} - mod memtrace { use memory_trace_macros::MemoryTraceHelper; diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 878c7c3b9f8..cac2e36d5eb 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -7,6 +7,7 @@ pub mod fsm; pub mod memory; pub mod metrics; pub mod msg; +pub mod region_meta; pub mod transport; #[macro_use] pub mod util; @@ -29,7 +30,10 @@ mod worker; pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ - write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, + write::{ + ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, + WriteTask, + }, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ @@ -48,7 +52,7 @@ pub use self::{ PeerTick, RaftCmdExtraOpts, RaftCommand, ReadCallback, ReadResponse, 
SignificantMsg, StoreMsg, StoreTick, WriteCallback, WriteResponse, }, - peer::{AbstractPeer, Peer, PeerStat, ProposalContext, RequestInspector, RequestPolicy}, + peer::{Peer, PeerStat, ProposalContext, RequestInspector, RequestPolicy}, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, write_peer_state, PeerStorage, SnapState, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 5b3221e8c19..251094e6475 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -24,7 +24,9 @@ use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; -use super::{local_metrics::TimeTracker, worker::FetchedLogs, AbstractPeer, RegionSnapshot}; +use super::{ + local_metrics::TimeTracker, region_meta::RegionMeta, worker::FetchedLogs, RegionSnapshot, +}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, @@ -517,7 +519,7 @@ pub enum CasualMessage { ForceCompactRaftLogs, /// A message to access peer's internal state. - AccessPeer(Box), + AccessPeer(Box), /// Region info from PD QueryRegionLeaderResp { diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 6b3ec4c3456..91698be98e9 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -5566,17 +5566,6 @@ fn make_transfer_leader_response() -> RaftCmdResponse { // TransferLeader command. pub const TRANSFER_LEADER_COMMAND_REPLY_CTX: &[u8] = &[1]; -/// A poor version of `Peer` to avoid port generic variables everywhere. 
-pub trait AbstractPeer { - fn meta_peer(&self) -> &metapb::Peer; - fn group_state(&self) -> GroupState; - fn region(&self) -> &metapb::Region; - fn apply_state(&self) -> &RaftApplyState; - fn raft_status(&self) -> raft::Status<'_>; - fn raft_commit_index(&self) -> u64; - fn pending_merge_state(&self) -> Option<&MergeState>; -} - mod memtrace { use std::mem; diff --git a/src/server/status_server/region_meta.rs b/components/raftstore/src/store/region_meta.rs similarity index 81% rename from src/server/status_server/region_meta.rs rename to components/raftstore/src/store/region_meta.rs index cd78e7382c9..9af541cbfd9 100644 --- a/src/server/status_server/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -2,9 +2,11 @@ use std::collections::HashMap; -use kvproto::metapb::PeerRole; -use raft::{Progress, ProgressState, StateRole}; -use raftstore::store::{AbstractPeer, GroupState}; +use kvproto::{metapb::PeerRole, raft_serverpb}; +use raft::{Progress, ProgressState, StateRole, Status}; +use serde::{Deserialize, Serialize}; + +use super::GroupState; #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub enum RaftProgressState { @@ -179,22 +181,34 @@ pub struct RaftApplyState { } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RegionMeta { +pub struct RegionLocalState { pub id: u64, - pub group_state: GroupState, pub start_key: Vec, pub end_key: Vec, pub epoch: Epoch, pub peers: Vec, pub merge_state: Option, + pub tablet_index: u64, +} + +/// A serializeable struct that exposes the internal debug information of a +/// peer. TODO: make protobuf generated code derive serde directly. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RegionMeta { + pub group_state: GroupState, pub raft_status: RaftStatus, pub raft_apply: RaftApplyState, + pub region_state: RegionLocalState, } impl RegionMeta { - pub fn new(abstract_peer: &dyn AbstractPeer) -> Self { - let region = abstract_peer.region(); - let apply_state = abstract_peer.apply_state(); + pub fn new( + local_state: &raft_serverpb::RegionLocalState, + apply_state: &raft_serverpb::RaftApplyState, + group_state: GroupState, + raft_status: Status<'_>, + ) -> Self { + let region = local_state.get_region(); let epoch = region.get_region_epoch(); let start_key = region.get_start_key(); let end_key = region.get_end_key(); @@ -207,25 +221,15 @@ impl RegionMeta { role: peer.get_role().into(), }); } + let merge_state = if local_state.has_merge_state() { + Some(local_state.get_merge_state()) + } else { + None + }; Self { - id: region.get_id(), - group_state: abstract_peer.group_state(), - start_key: start_key.to_owned(), - end_key: end_key.to_owned(), - epoch: Epoch { - conf_ver: epoch.get_conf_ver(), - version: epoch.get_version(), - }, - peers, - merge_state: abstract_peer - .pending_merge_state() - .map(|state| RegionMergeState { - min_index: state.get_min_index(), - commit: state.get_commit(), - region_id: state.get_target().get_id(), - }), - raft_status: abstract_peer.raft_status().into(), + group_state, + raft_status: raft_status.into(), raft_apply: RaftApplyState { applied_index: apply_state.get_applied_index(), commit_index: apply_state.get_commit_index(), @@ -235,6 +239,22 @@ impl RegionMeta { term: apply_state.get_truncated_state().get_term(), }, }, + region_state: RegionLocalState { + id: region.get_id(), + start_key: start_key.to_owned(), + end_key: end_key.to_owned(), + epoch: Epoch { + conf_ver: epoch.get_conf_ver(), + version: epoch.get_version(), + }, + peers, + merge_state: merge_state.map(|state| RegionMergeState { + min_index: state.get_min_index(), + commit: state.get_commit(), 
+ region_id: state.get_target().get_id(), + }), + tablet_index: local_state.get_tablet_index(), + }, } } } diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 3df7bf212d9..7c001baec1e 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -1,7 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. mod profile; -pub mod region_meta; use std::{ error::Error as StdError, marker::PhantomData, @@ -454,8 +453,8 @@ where let (tx, rx) = oneshot::channel(); match router.send( id, - CasualMessage::AccessPeer(Box::new(move |peer| { - if let Err(meta) = tx.send(region_meta::RegionMeta::new(peer)) { + CasualMessage::AccessPeer(Box::new(move |meta| { + if let Err(meta) = tx.send(meta) { error!("receiver dropped, region meta: {:?}", meta) } })), diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index 84a4de39b25..455465d87cb 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -3,12 +3,10 @@ use std::{error::Error, net::SocketAddr, sync::Arc}; use hyper::{body, Client, StatusCode, Uri}; +use raftstore::store::region_meta::RegionMeta; use security::SecurityConfig; use test_raftstore::{new_server_cluster, Simulator}; -use tikv::{ - config::ConfigController, - server::status_server::{region_meta::RegionMeta, StatusServer}, -}; +use tikv::{config::ConfigController, server::status_server::StatusServer}; use tikv_util::HandyRwLock; async fn check(authority: SocketAddr, region_id: u64) -> Result<(), Box> { From d2cc9550d1e253499039c5fe6508d9cb6ca24f0c Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 31 Aug 2022 21:24:24 +0800 Subject: [PATCH 180/676] coprocessor: move task reschedule from runner to scanner (#13337) ref tikv/tikv#13313 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- Cargo.lock | 6 + components/test_backup/src/lib.rs | 4 +- components/tidb_query_common/Cargo.toml | 3 + 
.../tidb_query_common/src/execute_stats.rs | 2 +- .../tidb_query_common/src/storage/scanner.rs | 241 ++++++++++++------ components/tidb_query_executors/Cargo.toml | 1 + .../src/fast_hash_aggr_executor.rs | 43 ++-- .../src/index_scan_executor.rs | 35 +-- .../tidb_query_executors/src/interface.rs | 14 +- .../src/limit_executor.rs | 33 +-- .../src/projection_executor.rs | 33 +-- components/tidb_query_executors/src/runner.rs | 63 ++--- .../src/selection_executor.rs | 51 ++-- .../src/simple_aggr_executor.rs | 23 +- .../src/slow_hash_aggr_executor.rs | 13 +- .../src/stream_aggr_executor.rs | 19 +- .../src/table_scan_executor.rs | 37 +-- .../src/top_n_executor.rs | 65 ++--- .../src/util/aggr_executor.rs | 19 +- .../src/util/mock_executor.rs | 7 +- .../src/util/scan_executor.rs | 10 +- components/tikv_util/Cargo.toml | 1 + components/tikv_util/src/quota_limiter.rs | 49 +++- src/coprocessor/checksum.rs | 17 +- src/coprocessor/dag/mod.rs | 4 +- src/coprocessor/endpoint.rs | 8 +- src/coprocessor/mod.rs | 2 +- src/coprocessor/statistics/analyze.rs | 41 +-- tests/Cargo.toml | 1 + .../coprocessor_executors/index_scan/util.rs | 3 +- .../coprocessor_executors/table_scan/util.rs | 3 +- .../coprocessor_executors/util/bencher.rs | 4 +- .../coprocessor_executors/util/fixture.rs | 4 +- .../integrations/coprocessor/test_checksum.rs | 3 +- 34 files changed, 509 insertions(+), 353 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a5c71cef10d..7ed11da4cd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5760,6 +5760,7 @@ version = "0.0.1" dependencies = [ "api_version", "arrow", + "async-trait", "batch-system", "byteorder", "causal_ts", @@ -5906,9 +5907,11 @@ name = "tidb_query_common" version = "0.0.1" dependencies = [ "anyhow", + "async-trait", "byteorder", "derive_more", "error_code", + "futures 0.3.15", "kvproto", "lazy_static", "log_wrappers", @@ -5918,6 +5921,7 @@ dependencies = [ "thiserror", "tikv_util", "time", + "yatp", ] [[package]] @@ -5963,6 +5967,7 @@ name = 
"tidb_query_executors" version = "0.0.1" dependencies = [ "anyhow", + "async-trait", "codec", "collections", "fail", @@ -6324,6 +6329,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", + "pin-project", "procfs", "procinfo", "prometheus", diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index d7bed05eddd..5447e8f2b37 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -13,7 +13,7 @@ use backup::Task; use collections::HashMap; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; use external_storage_export::make_local_backend; -use futures::channel::mpsc as future_mpsc; +use futures::{channel::mpsc as future_mpsc, executor::block_on}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{brpb::*, kvrpcpb::*, tikvpb::TikvClient}; use rand::Rng; @@ -362,7 +362,7 @@ impl TestSuite { is_scanned_range_aware: false, }); let digest = crc64fast::Digest::new(); - while let Some((k, v)) = scanner.next().unwrap() { + while let Some((k, v)) = block_on(scanner.next()).unwrap() { checksum = checksum_crc64_xor(checksum, digest.clone(), &k, &v); total_kvs += 1; total_bytes += (k.len() + v.len()) as u64; diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 2f42c226327..0efadbd48e9 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -7,8 +7,10 @@ description = "Common utility of a query engine to run TiDB pushed down executor [dependencies] anyhow = "1.0" +async-trait = "0.1" derive_more = "0.99.3" error_code = { path = "../error_code", default-features = false } +futures = "0.3" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" log_wrappers = { path = "../log_wrappers" } @@ -18,6 +20,7 @@ serde_json = "1.0" thiserror = "1.0" tikv_util = { path = "../tikv_util", default-features = false } time = "0.1" +yatp = { git = 
"https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] byteorder = "1.2" diff --git a/components/tidb_query_common/src/execute_stats.rs b/components/tidb_query_common/src/execute_stats.rs index 55d31dfb8f5..122363eed98 100644 --- a/components/tidb_query_common/src/execute_stats.rs +++ b/components/tidb_query_common/src/execute_stats.rs @@ -18,7 +18,7 @@ pub struct ExecSummary { /// A trait for all execution summary collectors. pub trait ExecSummaryCollector: Send { - type DurationRecorder; + type DurationRecorder: Send; /// Creates a new instance with specified output slot index. fn new(output_index: usize) -> Self diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index 851220307b9..e12659f329b 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -1,9 +1,19 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::time::Duration; + +use tikv_util::time::Instant; +use yatp::task::future::reschedule; + use super::{range::*, ranges_iter::*, OwnedKvPair, Storage}; use crate::error::StorageError; const KEY_BUFFER_CAPACITY: usize = 64; +/// Batch executors are run in coroutines. `MAX_TIME_SLICE` is the maximum time +/// a coroutine can run without being yielded. +const MAX_TIME_SLICE: Duration = Duration::from_millis(1); +/// the number of scanned keys that should trigger a reschedule. +const CHECK_KEYS: usize = 32; /// A scanner that scans over multiple ranges. Each range can be a point range /// containing only one row, or an interval range containing multiple rows. @@ -23,6 +33,35 @@ pub struct RangesScanner { current_range: IntervalRange, working_range_begin_key: Vec, working_range_end_key: Vec, + rescheduler: RescheduleChecker, +} + +// TODO: maybe it's better to make it generic to avoid directly depending +// on yatp's rescheduler. 
+struct RescheduleChecker { + prev_start: Instant, + prev_key_count: usize, +} + +impl RescheduleChecker { + fn new() -> Self { + Self { + prev_start: Instant::now(), + prev_key_count: 0, + } + } + + #[inline(always)] + async fn check_reschedule(&mut self, force_check: bool) { + self.prev_key_count += 1; + if (force_check || self.prev_key_count % CHECK_KEYS == 0) + && self.prev_start.saturating_elapsed() > MAX_TIME_SLICE + { + reschedule().await; + self.prev_start = Instant::now(); + self.prev_key_count = 0; + } + } } pub struct RangesScannerOptions { @@ -58,24 +97,26 @@ impl RangesScanner { }, working_range_begin_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), working_range_end_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), + rescheduler: RescheduleChecker::new(), } } /// Fetches next row. // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. - pub fn next(&mut self) -> Result, StorageError> { - self.next_opt(true) + pub async fn next(&mut self) -> Result, StorageError> { + self.next_opt(true).await } /// Fetches next row. /// Note: `update_scanned_range` can control whether update the scanned /// range when `is_scanned_range_aware` is true. - pub fn next_opt( + pub async fn next_opt( &mut self, update_scanned_range: bool, ) -> Result, StorageError> { loop { + let mut force_check = true; let range = self.ranges_iter.next(); let some_row = match range { IterStatus::NewRange(Range::Point(r)) => { @@ -95,7 +136,10 @@ impl RangesScanner { .begin_scan(self.scan_backward_in_range, self.is_key_only, r)?; self.storage.scan_next()? } - IterStatus::Continue => self.storage.scan_next()?, + IterStatus::Continue => { + force_check = false; + self.storage.scan_next()? 
+ } IterStatus::Drained => { if self.is_scanned_range_aware { self.update_working_range_end_key(); @@ -111,6 +155,7 @@ impl RangesScanner { if let Some(r) = self.scanned_rows_per_range.last_mut() { *r += 1; } + self.rescheduler.check_reschedule(force_check).await; return Ok(some_row); } else { @@ -243,6 +288,8 @@ impl RangesScanner { #[cfg(test)] mod tests { + use futures::executor::block_on; + use super::*; use crate::storage::{test_fixture::FixtureStorage, IntervalRange, PointRange, Range}; @@ -276,26 +323,26 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo".to_vec(), b"1".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo_2".to_vec(), b"3".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo_3".to_vec(), b"5".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"bar".to_vec(), b"2".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"bar_2".to_vec(), b"4".to_vec())) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); // Backward in range let ranges: Vec = vec![ @@ -312,22 +359,22 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo_2".to_vec(), b"3".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo".to_vec(), b"1".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo_3".to_vec(), b"5".to_vec())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"bar".to_vec(), b"2".to_vec())) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); // Key only let ranges: Vec = vec![ @@ -342,21 
+389,27 @@ mod tests { is_key_only: true, is_scanned_range_aware: false, }); - assert_eq!(scanner.next().unwrap(), Some((b"bar".to_vec(), Vec::new()))); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), + Some((b"bar".to_vec(), Vec::new())) + ); + assert_eq!( + block_on(scanner.next()).unwrap(), Some((b"bar_2".to_vec(), Vec::new())) ); - assert_eq!(scanner.next().unwrap(), Some((b"foo".to_vec(), Vec::new()))); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), + Some((b"foo".to_vec(), Vec::new())) + ); + assert_eq!( + block_on(scanner.next()).unwrap(), Some((b"foo_2".to_vec(), Vec::new())) ); assert_eq!( - scanner.next().unwrap(), + block_on(scanner.next()).unwrap(), Some((b"foo_3".to_vec(), Vec::new())) ); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); } #[test] @@ -378,9 +431,9 @@ mod tests { }); let mut scanned_rows_per_range = Vec::new(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2, 0, 1]); @@ -390,28 +443,28 @@ mod tests { assert_eq!(scanned_rows_per_range, vec![0]); scanned_rows_per_range.clear(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0, 2]); scanned_rows_per_range.clear(); - 
assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![1]); scanned_rows_per_range.clear(); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2]); scanned_rows_per_range.clear(); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0]); @@ -436,7 +489,7 @@ mod tests { assert_eq!(&r.lower_inclusive, b""); assert_eq!(&r.upper_exclusive, b""); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b""); @@ -452,7 +505,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -468,7 +521,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -484,20 +537,20 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"foo_3\0"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_3\0"); @@ -522,31 +575,31 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo\0"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"bar\0"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar\0"); assert_eq!(&r.upper_exclusive, b"bar_2\0"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2\0"); @@ -571,7 +624,7 @@ mod tests { 
assert_eq!(&r.lower_inclusive, b""); assert_eq!(&r.upper_exclusive, b""); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b""); @@ -587,7 +640,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -603,7 +656,7 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"x"); @@ -619,20 +672,20 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_3"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2"); assert_eq!(&r.upper_exclusive, b"foo_8"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -655,26 +708,26 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2"); assert_eq!(&r.upper_exclusive, b"box"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"bar"); + 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar"); assert_eq!(&r.upper_exclusive, b"bar_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo_2"); - assert_eq!(&scanner.next().unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"bar"); - assert_eq!(scanner.next().unwrap(), None); + assert_eq!(block_on(scanner.next()).unwrap(), None); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -695,22 +748,31 @@ mod tests { }); // Only lower_inclusive is updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b""); // Upper_exclusive is updated. - assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + b"foo_2" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); // Upper_exclusive is not updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_3"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo_3" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); // Drained. - assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_8"); @@ -738,27 +800,39 @@ mod tests { }); // Only lower_inclusive is updated. 
- assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b""); // Upper_exclusive is updated. Updated by scanned row. - assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + b"foo_2" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); // Upper_exclusive is not updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"bar" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); // Upper_exclusive is not updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar_2"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"bar_2" + ); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"foo_2\0"); // Drain. - assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); assert_eq!(&scanner.working_range_begin_key, b"foo"); assert_eq!(&scanner.working_range_end_key, b"box"); @@ -781,22 +855,31 @@ mod tests { }); // Only lower_inclusive is updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_3"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo_3" + ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); assert_eq!(&scanner.working_range_end_key, b""); // Upper_exclusive is updated. 
- assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"foo_2"); + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + b"foo_2" + ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); assert_eq!(&scanner.working_range_end_key, b"foo_2"); // Upper_exclusive is not updated. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo" + ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); assert_eq!(&scanner.working_range_end_key, b"foo_2"); // Drained. - assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); assert_eq!(&scanner.working_range_end_key, b"foo"); @@ -822,27 +905,39 @@ mod tests { }); // Lower_inclusive is updated. Upper_exclusive is not update. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"bar_2"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"bar_2" + ); assert_eq!(&scanner.working_range_begin_key, b"box"); assert_eq!(&scanner.working_range_end_key, b""); // Upper_exclusive is updated. Updated by scanned row. - assert_eq!(&scanner.next_opt(true).unwrap().unwrap().0, b"bar"); + assert_eq!( + &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + b"bar" + ); assert_eq!(&scanner.working_range_begin_key, b"box"); assert_eq!(&scanner.working_range_end_key, b"bar"); // Upper_exclusive is not update. - assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo_2"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo_2" + ); assert_eq!(&scanner.working_range_begin_key, b"box"); assert_eq!(&scanner.working_range_end_key, b"bar"); // Upper_exclusive is not update. 
- assert_eq!(&scanner.next_opt(false).unwrap().unwrap().0, b"foo"); + assert_eq!( + &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + b"foo" + ); assert_eq!(&scanner.working_range_begin_key, b"box"); assert_eq!(&scanner.working_range_end_key, b"bar"); // Drain. - assert_eq!(scanner.next_opt(false).unwrap(), None); + assert_eq!(block_on(scanner.next_opt(false)).unwrap(), None); assert_eq!(&scanner.working_range_begin_key, b"box"); assert_eq!(&scanner.working_range_end_key, b"foo"); diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index 923696606ed..ada01c8aef0 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "A vector query engine to run TiDB pushed down executors" [dependencies] +async-trait = "0.1" codec = { path = "../codec", default-features = false } collections = { path = "../collections" } fail = "0.5" diff --git a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs index 942e61087d3..174912ca0b0 100644 --- a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs @@ -2,6 +2,7 @@ use std::{convert::TryFrom, hash::Hash, sync::Arc}; +use async_trait::async_trait; use collections::HashMap; use tidb_query_aggr::*; use tidb_query_common::{storage::IntervalRange, Result}; @@ -38,6 +39,7 @@ pub struct BatchFastHashAggregationExecutor( AggregationExecutor, ); +#[async_trait] impl BatchExecutor for BatchFastHashAggregationExecutor { type StorageStats = Src::StorageStats; @@ -47,8 +49,8 @@ impl BatchExecutor for BatchFastHashAggregationExecutor } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + 
self.0.next_batch(scan_rows).await } #[inline] @@ -464,6 +466,7 @@ where #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_datatype::{expr::EvalWarnings, FieldTypeTp}; use tidb_query_expr::{ impl_arithmetic::{arithmetic_fn_meta, RealPlus}, @@ -539,17 +542,17 @@ mod tests { let src_exec = make_src_executor_1(); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); // col_0 + col_1 can result in [NULL, 9.0, 6.0], thus there will be three // groups. assert_eq!(&r.logical_rows, &[0, 1, 2]); @@ -675,17 +678,17 @@ mod tests { let src_exec = make_src_executor_1(); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 5); // 4 result column, 1 group by column @@ -759,17 +762,17 @@ mod tests { let src_exec = make_src_executor_1(); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = 
exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); // col_4 can result in [NULL, "aa", "aaa"], thus there will be three groups. assert_eq!(&r.logical_rows, &[0, 1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); @@ -944,12 +947,12 @@ mod tests { ); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(r.is_drained.unwrap()); @@ -992,17 +995,17 @@ mod tests { let src_exec = make_src_executor_1(); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert_eq!(r.physical_columns.columns_len(), 1); // 0 result column, 1 group by column @@ -1063,17 +1066,17 @@ mod tests { let src_exec = make_src_executor_1(); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); 
assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 1); // 0 result column, 1 group by column @@ -1136,7 +1139,7 @@ mod tests { }], ); let mut exec = exec_builder(src_exec); - let r = exec.next_batch(4); + let r = block_on(exec.next_batch(4)); assert_eq!(r.physical_columns.rows_len(), 4); assert_eq!(r.physical_columns.columns_len(), 2); diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index 8492a928a8d..ae04ffe03e6 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use async_trait::async_trait; use codec::{number::NumberCodec, prelude::NumberDecoder}; use itertools::izip; use kvproto::coprocessor::KeyRange; @@ -152,6 +153,7 @@ impl BatchIndexScanExecutor { } } +#[async_trait] impl BatchExecutor for BatchIndexScanExecutor { type StorageStats = S::Statistics; @@ -161,8 +163,8 @@ impl BatchExecutor for BatchIndexScanExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + self.0.next_batch(scan_rows).await } #[inline] @@ -874,6 +876,7 @@ mod tests { use std::sync::Arc; use codec::prelude::NumberEncoder; + use futures::executor::block_on; use kvproto::coprocessor::KeyRange; use tidb_query_common::{storage::test_fixture::FixtureStorage, util::convert_to_prefix_next}; use tidb_query_datatype::{ @@ -984,7 +987,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); 
assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 3); @@ -1041,7 +1044,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 3); @@ -1101,7 +1104,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 3); @@ -1146,7 +1149,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 3); @@ -1198,7 +1201,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); @@ -1275,7 +1278,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); @@ -1332,7 +1335,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1442,7 +1445,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = 
block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1485,7 +1488,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1581,7 +1584,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1681,7 +1684,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1775,7 +1778,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1868,7 +1871,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1994,7 +1997,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert!(result.is_drained.as_ref().unwrap()); assert_eq!(result.physical_columns.columns_len(), 4); assert_eq!(result.physical_columns.rows_len(), 1); diff --git 
a/components/tidb_query_executors/src/interface.rs b/components/tidb_query_executors/src/interface.rs index 1ea5038a2d6..611516ab6bc 100644 --- a/components/tidb_query_executors/src/interface.rs +++ b/components/tidb_query_executors/src/interface.rs @@ -5,6 +5,7 @@ //! Batch executor common structures. +use async_trait::async_trait; pub use tidb_query_common::execute_stats::{ ExecSummaryCollector, ExecuteStats, WithSummaryCollector, }; @@ -16,6 +17,7 @@ use tipb::FieldType; /// The interface for pull-based executors. It is similar to the Volcano /// Iterator model, but pulls data in batch and stores data by column. +#[async_trait] pub trait BatchExecutor: Send { type StorageStats; @@ -26,7 +28,7 @@ pub trait BatchExecutor: Send { /// /// This function might return zero rows, which doesn't mean that there is /// no more result. See `is_drained` in `BatchExecuteResult`. - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult; + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult; /// Collects execution statistics (including but not limited to metrics and /// execution summaries) accumulated during execution and prepares for @@ -68,6 +70,7 @@ pub trait BatchExecutor: Send { } } +#[async_trait] impl BatchExecutor for Box { type StorageStats = T::StorageStats; @@ -75,8 +78,8 @@ impl BatchExecutor for Box { (**self).schema() } - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - (**self).next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + (**self).next_batch(scan_rows).await } fn collect_exec_stats(&mut self, dest: &mut ExecuteStats) { @@ -96,6 +99,7 @@ impl BatchExecutor for Box { } } +#[async_trait] impl BatchExecutor for WithSummaryCollector { @@ -105,9 +109,9 @@ impl BatchExecutor self.inner.schema() } - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { let timer = 
self.summary_collector.on_start_iterate(); - let result = self.inner.next_batch(scan_rows); + let result = self.inner.next_batch(scan_rows).await; self.summary_collector .on_finish_iterate(timer, result.logical_rows.len()); result diff --git a/components/tidb_query_executors/src/limit_executor.rs b/components/tidb_query_executors/src/limit_executor.rs index a1917e1b17b..a9cd2cae482 100644 --- a/components/tidb_query_executors/src/limit_executor.rs +++ b/components/tidb_query_executors/src/limit_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tipb::FieldType; @@ -23,6 +24,7 @@ impl BatchLimitExecutor { } } +#[async_trait] impl BatchExecutor for BatchLimitExecutor { type StorageStats = Src::StorageStats; @@ -32,13 +34,13 @@ impl BatchExecutor for BatchLimitExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { let real_scan_rows = if self.is_src_scan_executor { std::cmp::min(scan_rows, self.remaining_rows) } else { scan_rows }; - let mut result = self.src.next_batch(real_scan_rows); + let mut result = self.src.next_batch(real_scan_rows).await; if result.logical_rows.len() < self.remaining_rows { self.remaining_rows -= result.logical_rows.len(); } else { @@ -74,6 +76,7 @@ impl BatchExecutor for BatchLimitExecutor { #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_datatype::{ codec::{batch::LazyBatchColumnVec, data_type::VectorValue}, expr::EvalWarnings, @@ -99,7 +102,7 @@ mod tests { let mut exec = BatchLimitExecutor::new(src_exec, 0, false).unwrap(); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 3); assert!(r.is_drained.unwrap()); @@ -121,7 +124,7 @@ mod tests { let mut exec = 
BatchLimitExecutor::new(src_exec, 10, false).unwrap(); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); r.is_drained.unwrap_err(); @@ -153,12 +156,12 @@ mod tests { let mut exec = BatchLimitExecutor::new(src_exec, 10, false).unwrap(); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 3); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert!(r.is_drained.unwrap()); @@ -190,12 +193,12 @@ mod tests { let mut exec = BatchLimitExecutor::new(src_exec, 4, false).unwrap(); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert!(r.is_drained.unwrap()); // No errors @@ -233,17 +236,17 @@ mod tests { let mut exec = BatchLimitExecutor::new(src_exec, 4, false).unwrap(); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert!(r.is_drained.unwrap()); @@ -256,9 +259,9 @@ mod tests { let src_exec = MockScanExecutor::new(rows, schema); let mut exec = BatchLimitExecutor::new(src_exec, 5, 
true).unwrap(); - let r = exec.next_batch(100); + let r = block_on(exec.next_batch(100)); assert_eq!(r.logical_rows, &[0, 1, 2, 3, 4]); - let r = exec.next_batch(2); + let r = block_on(exec.next_batch(2)); assert_eq!(r.is_drained.unwrap(), true); let schema = vec![FieldTypeTp::LongLong.into()]; @@ -266,10 +269,10 @@ mod tests { let src_exec = MockScanExecutor::new(rows, schema); let mut exec = BatchLimitExecutor::new(src_exec, 1024, true).unwrap(); for _i in 0..1023 { - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.is_drained.unwrap(), false); } - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.is_drained.unwrap(), true); } } diff --git a/components/tidb_query_executors/src/projection_executor.rs b/components/tidb_query_executors/src/projection_executor.rs index 7304ed1b1e3..962cd8698e5 100644 --- a/components/tidb_query_executors/src/projection_executor.rs +++ b/components/tidb_query_executors/src/projection_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ codec::{batch::LazyBatchColumnVec, data_type::*}, @@ -75,6 +76,7 @@ impl BatchProjectionExecutor { } } +#[async_trait] impl BatchExecutor for BatchProjectionExecutor { type StorageStats = Src::StorageStats; @@ -84,8 +86,8 @@ impl BatchExecutor for BatchProjectionExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - let mut src_result = self.src.next_batch(scan_rows); + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + let mut src_result = self.src.next_batch(scan_rows).await; let child_schema = self.src.schema(); let mut eval_result = Vec::with_capacity(self.schema().len()); let BatchExecuteResult { @@ -159,6 +161,7 @@ impl BatchExecutor for BatchProjectionExecutor { #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_codegen::rpn_fn; use 
tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalWarnings, FieldTypeTp}; @@ -213,7 +216,7 @@ mod tests { // correctly. No errors should be generated and the expression functions // should not be called. - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); // The scan rows parameter has no effect for mock executor. We don't care. // FIXME: A compiler bug prevented us write: // | assert_eq!(r.logical_rows.as_slice(), &[]); @@ -221,11 +224,11 @@ mod tests { assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -289,7 +292,7 @@ mod tests { ]; let mut exec = BatchProjectionExecutor::new_for_test(src_exec, exprs); assert_eq!(exec.schema().len(), 1); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[2, 0]); assert_eq!(r.physical_columns.columns_len(), 1); assert_eq!( @@ -298,12 +301,12 @@ mod tests { ); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.columns_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); assert_eq!(r.physical_columns.columns_len(), 1); assert_eq!( @@ -326,7 +329,7 @@ mod tests { ]; let mut exec = BatchProjectionExecutor::new_for_test(src_exec, exprs); assert_eq!(exec.schema().len(), 2); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[2, 0]); assert_eq!(r.physical_columns.columns_len(), 2); assert_eq!( @@ -339,12 +342,12 @@ mod tests { ); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = 
block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.columns_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); assert_eq!(r.physical_columns.columns_len(), 2); assert_eq!( @@ -438,7 +441,7 @@ mod tests { .build_for_test(); let mut exec = BatchProjectionExecutor::new_for_test(src_exec, vec![expr1, expr2]); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[3, 4, 0, 2]); assert_eq!(r.physical_columns.columns_len(), 2); assert_eq!( @@ -451,11 +454,11 @@ mod tests { ); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.logical_rows, &[0]); assert_eq!(r.physical_columns[0].decoded().to_int_vec(), vec![None]); assert_eq!(r.physical_columns[1].decoded().to_int_vec(), vec![Some(1)]); @@ -521,7 +524,7 @@ mod tests { .collect(); let mut exec = BatchProjectionExecutor::new_for_test(src_exec, exprs); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); r.is_drained.unwrap_err(); } diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 9f32aaa180e..551c3da8a7e 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{convert::TryFrom, sync::Arc, time::Duration}; +use std::{convert::TryFrom, sync::Arc}; use fail::fail_point; use kvproto::coprocessor::KeyRange; @@ -19,13 +19,11 @@ use tikv_util::{ deadline::Deadline, metrics::{ThrottleType, NON_TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC}, quota_limiter::QuotaLimiter, - time::Instant, }; use tipb::{ self, Chunk, DagRequest, EncodeType, ExecType, ExecutorExecutionSummary, FieldType, SelectResponse, StreamResponse, }; -use yatp::task::future::reschedule; use super::{ interface::{BatchExecutor, ExecuteStats}, @@ -44,10 +42,6 @@ pub use tidb_query_expr::types::BATCH_MAX_SIZE; // TODO: Maybe there can be some better strategy. Needs benchmarks and tunes. const BATCH_GROW_FACTOR: usize = 2; -/// Batch executors are run in coroutines. `MAX_TIME_SLICE` is the maximum time -/// a coroutine can run without being yielded. -pub const MAX_TIME_SLICE: Duration = Duration::from_millis(1); - pub struct BatchExecutorsRunner { /// The deadline of this handler. For each check point (e.g. each iteration) /// we need to check whether or not the deadline is exceeded and break @@ -450,26 +444,21 @@ impl BatchExecutorsRunner { let mut ctx = EvalContext::new(self.config.clone()); let mut record_all = 0; - let mut time_slice_start = Instant::now(); loop { - // Check whether we should yield from the execution - if need_reschedule(time_slice_start) { - reschedule().await; - time_slice_start = Instant::now(); - } - let mut chunk = Chunk::default(); - let mut sample = self.quota_limiter.new_sample(true); let (drained, record_len) = { - let _guard = sample.observe_cpu(); - self.internal_handle_request( - false, - batch_size, - &mut chunk, - &mut warnings, - &mut ctx, - )? + let (cpu_time, res) = sample + .observe_cpu_async(self.internal_handle_request( + false, + batch_size, + &mut chunk, + &mut warnings, + &mut ctx, + )) + .await; + sample.add_cpu_time(cpu_time); + res? 
}; if chunk.has_rows_data() { sample.add_read_bytes(chunk.get_rows_data().len()); @@ -534,7 +523,7 @@ impl BatchExecutorsRunner { } } - pub fn handle_streaming_request( + pub async fn handle_streaming_request( &mut self, ) -> Result<(Option<(StreamResponse, IntervalRange)>, bool)> { let mut warnings = self.config.new_eval_warnings(); @@ -548,13 +537,15 @@ impl BatchExecutorsRunner { while record_len < self.stream_row_limit && !is_drained { let mut current_chunk = Chunk::default(); // TODO: Streaming coprocessor on TiKV is just not enabled in TiDB now. - let (drained, len) = self.internal_handle_request( - true, - batch_size.min(self.stream_row_limit - record_len), - &mut current_chunk, - &mut warnings, - &mut ctx, - )?; + let (drained, len) = self + .internal_handle_request( + true, + batch_size.min(self.stream_row_limit - record_len), + &mut current_chunk, + &mut warnings, + &mut ctx, + ) + .await?; chunk .mut_rows_data() .extend_from_slice(current_chunk.get_rows_data()); @@ -586,7 +577,7 @@ impl BatchExecutorsRunner { } } - fn internal_handle_request( + async fn internal_handle_request( &mut self, is_streaming: bool, batch_size: usize, @@ -598,7 +589,7 @@ impl BatchExecutorsRunner { self.deadline.check()?; - let mut result = self.out_most_executor.next_batch(batch_size); + let mut result = self.out_most_executor.next_batch(batch_size).await; let is_drained = result.is_drained?; @@ -690,9 +681,3 @@ fn grow_batch_size(batch_size: &mut usize) { } } } - -#[inline] -fn need_reschedule(time_slice_start: Instant) -> bool { - fail_point!("copr_reschedule", |_| true); - time_slice_start.saturating_elapsed() > MAX_TIME_SLICE -} diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index d3a2d97ef4b..60459229f4f 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use 
async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ codec::data_type::*, @@ -164,6 +165,7 @@ where err_result } +#[async_trait] impl BatchExecutor for BatchSelectionExecutor { type StorageStats = Src::StorageStats; @@ -174,8 +176,8 @@ impl BatchExecutor for BatchSelectionExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - let mut src_result = self.src.next_batch(scan_rows); + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + let mut src_result = self.src.next_batch(scan_rows).await; if let Err(e) = self.handle_src_result(&mut src_result) { // TODO: Rows before we meeting an evaluation error are innocent. @@ -213,6 +215,7 @@ impl BatchExecutor for BatchSelectionExecutor { #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_codegen::rpn_fn; use tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalWarnings, FieldTypeTp}; @@ -267,7 +270,7 @@ mod tests { // correctly. No errors should be generated and the predicate function // should not be called. - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); // The scan rows parameter has no effect for mock executor. We don't care. // FIXME: A compiler bug prevented us write: // | assert_eq!(r.logical_rows.as_slice(), &[]); @@ -275,11 +278,11 @@ mod tests { assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -359,15 +362,15 @@ mod tests { // The selection executor should return data as it is. 
- let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[2, 0]); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); assert!(r.is_drained.unwrap()); } @@ -385,15 +388,15 @@ mod tests { // The selection executor should always return empty rows. - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -479,15 +482,15 @@ mod tests { .build_for_test(); let mut exec = BatchSelectionExecutor::new_for_test(src_exec, vec![predicate]); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[3, 0]); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -504,15 +507,15 @@ mod tests { .build_for_test(); let mut exec = BatchSelectionExecutor::new_for_test(src_exec, vec![predicate]); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 2]); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); 
assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -542,15 +545,15 @@ mod tests { let src_exec = make_src_executor_using_fixture_2(); let mut exec = BatchSelectionExecutor::new_for_test(src_exec, predicates); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -577,15 +580,15 @@ mod tests { let src_exec = make_src_executor_using_fixture_2(); let mut exec = BatchSelectionExecutor::new_for_test(src_exec, predicates); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(r.is_drained.unwrap()); } @@ -653,7 +656,7 @@ mod tests { // TODO: A more precise result is that the first two rows are returned and error // starts from the third row. 
- let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); r.is_drained.unwrap_err(); } diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index d26d293a274..75790428187 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -5,6 +5,7 @@ use std::sync::Arc; +use async_trait::async_trait; use tidb_query_aggr::*; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ @@ -24,6 +25,7 @@ pub struct BatchSimpleAggregationExecutor( AggregationExecutor, ); +#[async_trait] impl BatchExecutor for BatchSimpleAggregationExecutor { type StorageStats = Src::StorageStats; @@ -33,8 +35,8 @@ impl BatchExecutor for BatchSimpleAggregationExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + self.0.next_batch(scan_rows).await } #[inline] @@ -232,6 +234,7 @@ impl AggregationExecutorImpl for SimpleAggregationImpl #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_codegen::AggrFunction; use tidb_query_datatype::{ expr::{EvalContext, EvalWarnings}, @@ -460,15 +463,15 @@ mod tests { BatchSimpleAggregationExecutor::new_for_test(src_exec, aggr_definitions, MyParser); // The scan rows parameter has no effect for mock executor. We don't care. 
- let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 12); @@ -548,15 +551,15 @@ mod tests { AllAggrDefinitionParser, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 10); @@ -665,12 +668,12 @@ mod tests { let mut exec = BatchSimpleAggregationExecutor::new_for_test(src_exec, vec![Expr::default()], MyParser); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(r.is_drained.unwrap()); diff --git a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs index 2502e28f570..ee076b652a7 100644 --- a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs @@ -7,6 +7,7 @@ use std::{ sync::Arc, }; +use async_trait::async_trait; use collections::{HashMap, HashMapEntry}; use tidb_query_aggr::*; use 
tidb_query_common::{storage::IntervalRange, Result}; @@ -32,6 +33,7 @@ pub struct BatchSlowHashAggregationExecutor( AggregationExecutor, ); +#[async_trait] impl BatchExecutor for BatchSlowHashAggregationExecutor { type StorageStats = Src::StorageStats; @@ -41,8 +43,8 @@ impl BatchExecutor for BatchSlowHashAggregationExecutor } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + self.0.next_batch(scan_rows).await } #[inline] @@ -511,6 +513,7 @@ impl Eq for GroupKeyRefUnsafe {} #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_datatype::{codec::data_type::*, FieldTypeTp}; use tidb_query_expr::{ impl_arithmetic::{arithmetic_fn_meta, RealPlus}, @@ -571,17 +574,17 @@ mod tests { AllAggrDefinitionParser, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let mut r = exec.next_batch(1); + let mut r = block_on(exec.next_batch(1)); // col_4 (sort_key), col_0 + 1 can result in: // NULL, NULL // aa, NULL diff --git a/components/tidb_query_executors/src/stream_aggr_executor.rs b/components/tidb_query_executors/src/stream_aggr_executor.rs index 4b768cd65fe..d8a0599bf87 100644 --- a/components/tidb_query_executors/src/stream_aggr_executor.rs +++ b/components/tidb_query_executors/src/stream_aggr_executor.rs @@ -2,6 +2,7 @@ use std::{cmp::Ordering, convert::TryFrom, sync::Arc}; +use async_trait::async_trait; use tidb_query_aggr::*; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ @@ -24,6 +25,7 @@ pub struct BatchStreamAggregationExecutor( AggregationExecutor, ); 
+#[async_trait] impl BatchExecutor for BatchStreamAggregationExecutor { type StorageStats = Src::StorageStats; @@ -33,8 +35,8 @@ impl BatchExecutor for BatchStreamAggregationExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + self.0.next_batch(scan_rows).await } #[inline] @@ -454,6 +456,7 @@ fn update_current_states( #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_datatype::{ builder::FieldTypeBuilder, expr::EvalWarnings, Collation, FieldTypeTp, }; @@ -511,7 +514,7 @@ mod tests { AllAggrDefinitionParser, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1]); assert_eq!(r.physical_columns.rows_len(), 2); assert_eq!(r.physical_columns.columns_len(), 5); @@ -542,12 +545,12 @@ mod tests { &[None, Real::new(3.5).ok()] ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 5); @@ -595,7 +598,7 @@ mod tests { AllAggrDefinitionParser, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1]); assert_eq!(r.physical_columns.rows_len(), 2); assert_eq!(r.physical_columns.columns_len(), 2); @@ -611,12 +614,12 @@ mod tests { &[None, Real::new(1.5).ok()] ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); 
assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 2); diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 3ddb20b3e4d..957a23ba8c0 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, sync::Arc}; +use async_trait::async_trait; use collections::HashMap; use kvproto::coprocessor::KeyRange; use smallvec::SmallVec; @@ -108,6 +109,7 @@ impl BatchTableScanExecutor { } } +#[async_trait] impl BatchExecutor for BatchTableScanExecutor { type StorageStats = S::Statistics; @@ -117,8 +119,8 @@ impl BatchExecutor for BatchTableScanExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { - self.0.next_batch(scan_rows) + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + self.0.next_batch(scan_rows).await } #[inline] @@ -438,6 +440,7 @@ impl ScanExecutorImpl for TableScanExecutorImpl { mod tests { use std::{iter, sync::Arc}; + use futures::executor::block_on; use kvproto::coprocessor::KeyRange; use tidb_query_common::{ execute_stats::*, storage::test_fixture::FixtureStorage, util::convert_to_prefix_next, @@ -716,7 +719,7 @@ mod tests { for expect_rows in batch_expect_rows { let expect_rows = *expect_rows; let expect_drained = start_row + expect_rows > total_rows; - let result = executor.next_batch(expect_rows); + let result = block_on(executor.next_batch(expect_rows)); assert_eq!(*result.is_drained.as_ref().unwrap(), expect_drained); if expect_drained { // all remaining rows are fetched @@ -796,8 +799,8 @@ mod tests { .unwrap() .collect_summary(1); - executor.next_batch(1); - executor.next_batch(2); + block_on(executor.next_batch(1)); + block_on(executor.next_batch(2)); let mut s = ExecuteStats::new(2); executor.collect_exec_stats(&mut s); @@ -825,7 +828,7 @@ mod 
tests { // Reset collected statistics so that now we will only collect statistics in // this round. s.clear(); - executor.next_batch(10); + block_on(executor.next_batch(10)); executor.collect_exec_stats(&mut s); assert_eq!(s.scanned_rows_per_range.len(), 1); @@ -938,7 +941,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); @@ -1045,7 +1048,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1093,7 +1096,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1135,7 +1138,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(1); + let mut result = block_on(executor.next_batch(1)); result.is_drained.unwrap(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 1); @@ -1153,7 +1156,7 @@ mod tests { &[Some(7)] ); - let result = executor.next_batch(1); + let result = block_on(executor.next_batch(1)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); @@ -1174,7 +1177,7 @@ mod tests { ) .unwrap(); - let result = executor.next_batch(10); + let result = block_on(executor.next_batch(10)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); @@ -1195,7 +1198,7 @@ mod tests { ) .unwrap(); - let mut result = 
executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); result.is_drained.unwrap(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 2); @@ -1229,7 +1232,7 @@ mod tests { ) .unwrap(); - let result = executor.next_batch(10); + let result = block_on(executor.next_batch(10)); result.is_drained.unwrap_err(); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 0); @@ -1279,7 +1282,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert_eq!(result.is_drained.unwrap(), true); assert_eq!(result.logical_rows.len(), 1); assert_eq!(result.physical_columns.columns_len(), columns_is_pk.len()); @@ -1387,7 +1390,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert_eq!(result.is_drained.unwrap(), true); assert_eq!(result.logical_rows.len(), 1); @@ -1568,7 +1571,7 @@ mod tests { ) .unwrap(); - let mut result = executor.next_batch(10); + let mut result = block_on(executor.next_batch(10)); assert_eq!(result.is_drained.unwrap(), true); if !columns_info.is_empty() { assert_eq!(result.logical_rows.len(), 1); diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 39f009784f0..06dc1ce956b 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -2,6 +2,7 @@ use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; +use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ codec::{ @@ -178,10 +179,10 @@ impl BatchTopNExecutor { } #[inline] - fn handle_next_batch(&mut self) -> Result> { + async fn handle_next_batch(&mut self) -> Result> { // Use max batch size from the beginning because top N 
// always needs to calculate over all data. - let src_result = self.src.next_batch(crate::runner::BATCH_MAX_SIZE); + let src_result = self.src.next_batch(crate::runner::BATCH_MAX_SIZE).await; self.context.warnings = src_result.warnings; @@ -319,6 +320,7 @@ impl BatchTopNExecutor { } } +#[async_trait] impl BatchExecutor for BatchTopNExecutor { type StorageStats = Src::StorageStats; @@ -328,7 +330,7 @@ impl BatchExecutor for BatchTopNExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { assert!(!self.is_ended); if self.n == 0 { @@ -343,11 +345,11 @@ impl BatchExecutor for BatchTopNExecutor { if let Some(paging_size) = self.context.cfg.paging_size { if self.n > paging_size as usize { - return self.src.next_batch(scan_rows); + return self.src.next_batch(scan_rows).await; } } - let result = self.handle_next_batch(); + let result = self.handle_next_batch().await; match result { Err(e) => { @@ -507,6 +509,7 @@ impl Eq for HeapItemUnsafe {} #[cfg(test)] mod tests { + use futures::executor::block_on; use tidb_query_datatype::{ builder::FieldTypeBuilder, expr::EvalWarnings, Collation, FieldTypeFlag, FieldTypeTp, }; @@ -540,7 +543,7 @@ mod tests { 0, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); assert!(r.is_drained.unwrap()); } @@ -578,11 +581,11 @@ mod tests { 10, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); assert!(r.is_drained.unwrap()); } @@ -699,17 +702,17 @@ mod tests { 100, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r 
= exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); assert_eq!(r.physical_columns.rows_len(), 7); assert_eq!(r.physical_columns.columns_len(), 3); @@ -769,17 +772,17 @@ mod tests { 7, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); assert_eq!(r.physical_columns.rows_len(), 7); assert_eq!(r.physical_columns.columns_len(), 3); @@ -852,17 +855,17 @@ mod tests { 5, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); @@ -1016,17 +1019,17 @@ mod tests { 5, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); 
assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); @@ -1097,17 +1100,17 @@ mod tests { 5, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); @@ -1258,17 +1261,17 @@ mod tests { 5, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); @@ -1372,17 +1375,17 @@ mod tests { 5, ); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); assert!(!r.is_drained.unwrap()); - let r = exec.next_batch(1); + let r = 
block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); @@ -1485,8 +1488,8 @@ mod tests { let mut exec2 = build_src_executor(); loop { - let r1 = exec.next_batch(1); - let r2 = exec2.next_batch(1); + let r1 = block_on(exec.next_batch(1)); + let r2 = block_on(exec2.next_batch(1)); assert_eq!(r1.logical_rows, r2.logical_rows); assert_eq!( r1.physical_columns.rows_len(), diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs b/components/tidb_query_executors/src/util/aggr_executor.rs index a40c0c9aec4..ceb9949f83b 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -29,6 +29,7 @@ use std::{convert::TryFrom, sync::Arc}; +use async_trait::async_trait; use tidb_query_aggr::*; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ @@ -202,10 +203,14 @@ impl> AggregationExecutor Result<(Option, bool)> { + async fn handle_next_batch(&mut self) -> Result<(Option, bool)> { // Use max batch size from the beginning because aggregation // always needs to calculate over all data. 
- let src_result = self.entities.src.next_batch(crate::runner::BATCH_MAX_SIZE); + let src_result = self + .entities + .src + .next_batch(crate::runner::BATCH_MAX_SIZE) + .await; self.entities.context.warnings = src_result.warnings; @@ -290,6 +295,7 @@ impl> AggregationExecutor> BatchExecutor for AggregationExecutor { @@ -301,10 +307,10 @@ impl> BatchExecutor } #[inline] - fn next_batch(&mut self, _scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, _scan_rows: usize) -> BatchExecuteResult { assert!(!self.is_ended); - let result = self.handle_next_batch(); + let result = self.handle_next_batch().await; match result { Err(e) => { @@ -581,6 +587,7 @@ pub mod tests { fn test_agg_paging() { use std::sync::Arc; + use futures::executor::block_on; use tidb_query_datatype::expr::EvalConfig; use tidb_query_expr::RpnExpressionBuilder; use tipb::ExprType; @@ -642,7 +649,7 @@ pub mod tests { let src_exec = make_src_executor_2(); let mut exec = exec_builder(src_exec, Some(paging_size)); for nth_call in 0..call_num { - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); if nth_call == call_num - 1 { assert!(r.is_drained.unwrap()); } else { @@ -672,7 +679,7 @@ pub mod tests { let row_num = &expect_row_num2[test_case]; let mut exec = exec_stream(make_src_executor_2(), Some(paging_size)); for nth_call in 0..call_num { - let r = exec.next_batch(1); + let r = block_on(exec.next_batch(1)); if nth_call == call_num - 1 { assert!(r.is_drained.unwrap()); } else { diff --git a/components/tidb_query_executors/src/util/mock_executor.rs b/components/tidb_query_executors/src/util/mock_executor.rs index ae20695033f..a6f11904b33 100644 --- a/components/tidb_query_executors/src/util/mock_executor.rs +++ b/components/tidb_query_executors/src/util/mock_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+use async_trait::async_trait; use tidb_query_common::storage::IntervalRange; use tidb_query_datatype::{ codec::{batch::LazyBatchColumnVec, data_type::VectorValue}, @@ -28,6 +29,7 @@ impl MockExecutor { } } +#[async_trait] impl BatchExecutor for MockExecutor { type StorageStats = (); @@ -35,7 +37,7 @@ impl BatchExecutor for MockExecutor { &self.schema } - fn next_batch(&mut self, _scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, _scan_rows: usize) -> BatchExecuteResult { self.results.next().unwrap() } @@ -73,6 +75,7 @@ impl MockScanExecutor { } } +#[async_trait] impl BatchExecutor for MockScanExecutor { type StorageStats = (); @@ -80,7 +83,7 @@ impl BatchExecutor for MockScanExecutor { &self.schema } - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { let real_scan_rows = std::cmp::min(scan_rows, self.rows.len()); // just one column let mut res_col = Vec::new(); diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index c9a88fb820e..935db5dd392 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use async_trait::async_trait; use kvproto::coprocessor::KeyRange; use tidb_query_common::{ storage::{ @@ -98,7 +99,7 @@ impl ScanExecutor { /// /// The columns are ensured to be regular even if there are errors during /// the process. 
- fn fill_column_vec( + async fn fill_column_vec( &mut self, scan_rows: usize, columns: &mut LazyBatchColumnVec, @@ -106,7 +107,7 @@ impl ScanExecutor { assert!(scan_rows > 0); for i in 0..scan_rows { - let some_row = self.scanner.next_opt(i == scan_rows - 1)?; + let some_row = self.scanner.next_opt(i == scan_rows - 1).await?; if let Some((key, value)) = some_row { // Retrieved one row from point range or non-point range. @@ -160,6 +161,7 @@ pub fn check_columns_info_supported(columns_info: &[ColumnInfo]) -> Result<()> { Ok(()) } +#[async_trait] impl BatchExecutor for ScanExecutor { type StorageStats = S::Statistics; @@ -169,12 +171,12 @@ impl BatchExecutor for ScanExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { assert!(!self.is_ended); assert!(scan_rows > 0); let mut logical_columns = self.imp.build_column_vec(scan_rows); - let is_drained = self.fill_column_vec(scan_rows, &mut logical_columns); + let is_drained = self.fill_column_vec(scan_rows, &mut logical_columns).await; logical_columns.assert_columns_equal_length(); let logical_rows = (0..logical_columns.rows_len()).collect(); diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index befe6559e32..d8964cf0301 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -36,6 +36,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { path = "../online_config" } openssl = "0.10" +pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = "2" diff --git a/components/tikv_util/src/quota_limiter.rs b/components/tikv_util/src/quota_limiter.rs index 4d5ca82c7d9..818ec0ea60c 100644 --- a/components/tikv_util/src/quota_limiter.rs +++ b/components/tikv_util/src/quota_limiter.rs @@ -1,16 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + future::Future, + pin::Pin, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, }, + task::{Context, Poll}, time::Duration, }; use cpu_time::ThreadTime; use futures::compat::Future01CompatExt; use online_config::{ConfigChange, ConfigManager}; +use pin_project::pin_project; use super::{ config::{ReadableDuration, ReadableSize}, @@ -110,7 +114,19 @@ impl<'a> Sample { } } - fn add_cpu_time(&mut self, time: Duration) { + /// Record thread cpu time in async manner. The function creates a future + /// that can track the cpu time used during the future's poll, caller + /// should explicitly call `add_cpu_time` after the future is ready. + pub fn observe_cpu_async(&self, f: F) -> CpuObserveFuture { + CpuObserveFuture { + enabled: self.enable_cpu_limit, + total_duration: Duration::ZERO, + timer: None, + delegate: f, + } + } + + pub fn add_cpu_time(&mut self, time: Duration) { self.cpu_time += time; } } @@ -128,6 +144,37 @@ impl<'a> Drop for CpuObserveGuard<'a> { } } +/// CpuObserveFuture is a future that used to track thread cpu time. +#[pin_project] +pub struct CpuObserveFuture { + enabled: bool, + total_duration: Duration, + timer: Option, + #[pin] + delegate: F, +} + +// `ThreadTime` is not Send, but is safe here because we only use it duration +// each poll. 
+unsafe impl Send for CpuObserveFuture {} + +impl Future for CpuObserveFuture { + type Output = (Duration, F::Output); + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + if *this.enabled { + *this.timer = Some(ThreadTime::now()); + } + let res = this.delegate.poll(cx); + if let Some(timer) = this.timer { + *this.total_duration += timer.elapsed(); + } + let dur = *this.total_duration; + res.map(|r| (dur, r)) + } +} + impl Default for QuotaLimiter { fn default() -> Self { let foreground_limiters = LimiterItems::default(); diff --git a/src/coprocessor/checksum.rs b/src/coprocessor/checksum.rs index f208b87ee0f..52bd0a60184 100644 --- a/src/coprocessor/checksum.rs +++ b/src/coprocessor/checksum.rs @@ -7,12 +7,8 @@ use tidb_query_common::storage::{ scanner::{RangesScanner, RangesScannerOptions}, Range, }; -use tidb_query_executors::runner::MAX_TIME_SLICE; -use tidb_query_expr::BATCH_MAX_SIZE; use tikv_alloc::trace::MemoryTraceGuard; -use tikv_util::time::Instant; use tipb::{ChecksumAlgorithm, ChecksumRequest, ChecksumResponse}; -use yatp::task::future::reschedule; use crate::{ coprocessor::{dag::TikvStorage, *}, @@ -77,18 +73,7 @@ impl RequestHandler for ChecksumContext { let mut prefix_digest = crc64fast::Digest::new(); prefix_digest.write(&old_prefix); - let mut row_count = 0; - let mut time_slice_start = Instant::now(); - while let Some((k, v)) = self.scanner.next()? { - row_count += 1; - if row_count >= BATCH_MAX_SIZE { - if time_slice_start.saturating_elapsed() > MAX_TIME_SLICE { - reschedule().await; - time_slice_start = Instant::now(); - } - row_count = 0; - } - + while let Some((k, v)) = self.scanner.next().await? 
{ if !k.starts_with(&new_prefix) { return Err(box_err!("Wrong prefix expect: {:?}", new_prefix)); } diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index 5b06638f244..ce575859e59 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -122,8 +122,8 @@ impl RequestHandler for BatchDagHandler { handle_qe_response(result, self.runner.can_be_cached(), self.data_version).map(|x| x.into()) } - fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { - handle_qe_stream_response(self.runner.handle_streaming_request()) + async fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { + handle_qe_stream_response(self.runner.handle_streaming_request().await) } fn collect_scan_statistics(&mut self, dest: &mut Statistics) { diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 5bd05bd29cd..8c2e6d571c0 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -558,7 +558,7 @@ impl Endpoint { let result = { tracker.on_begin_item(); - let result = handler.handle_streaming_request(); + let result = handler.handle_streaming_request().await; let mut storage_stats = Statistics::default(); handler.collect_scan_statistics(&mut storage_stats); @@ -803,8 +803,9 @@ mod tests { } } + #[async_trait] impl RequestHandler for StreamFixture { - fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { + async fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { let is_finished = if self.result_len == 0 { true } else { @@ -848,8 +849,9 @@ mod tests { } } + #[async_trait] impl RequestHandler for StreamFromClosure { - fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { + async fn handle_streaming_request(&mut self) -> Result<(Option, bool)> { let result = (self.result_generator)(self.nth); self.nth += 1; result diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 0cde193a606..8acd5325a1e 100644 --- a/src/coprocessor/mod.rs +++ 
b/src/coprocessor/mod.rs @@ -69,7 +69,7 @@ pub trait RequestHandler: Send { } /// Processes current request and produces streaming responses. - fn handle_streaming_request(&mut self) -> HandlerStreamStepResult { + async fn handle_streaming_request(&mut self) -> HandlerStreamStepResult { panic!("streaming request is not supported for this handler"); } diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 8f7b8c57dde..ade8a007383 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -22,18 +22,14 @@ use tidb_query_datatype::{ expr::{EvalConfig, EvalContext}, FieldTypeAccessor, }; -use tidb_query_executors::{ - interface::BatchExecutor, runner::MAX_TIME_SLICE, BatchTableScanExecutor, -}; +use tidb_query_executors::{interface::BatchExecutor, BatchTableScanExecutor}; use tidb_query_expr::BATCH_MAX_SIZE; use tikv_alloc::trace::{MemoryTraceGuard, TraceEvent}; use tikv_util::{ metrics::{ThrottleType, NON_TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC}, quota_limiter::QuotaLimiter, - time::Instant, }; use tipb::{self, AnalyzeColumnsReq, AnalyzeIndexReq, AnalyzeReq, AnalyzeType}; -use yatp::task::future::reschedule; use super::{cmsketch::CmSketch, fmsketch::FmSketch, histogram::Histogram}; use crate::{ @@ -135,8 +131,6 @@ impl AnalyzeContext { req.get_cmsketch_width() as usize, ); let mut fms = FmSketch::new(req.get_sketch_size() as usize); - let mut row_count = 0; - let mut time_slice_start = Instant::now(); let mut topn_heap = BinaryHeap::new(); // cur_val recording the current value's data and its counts when iterating // index's rows. Once we met a new value, the old value will be pushed @@ -148,15 +142,7 @@ impl AnalyzeContext { } else { ANALYZE_VERSION_V1 }; - while let Some((key, _)) = scanner.next()? 
{ - row_count += 1; - if row_count >= BATCH_MAX_SIZE { - if time_slice_start.saturating_elapsed() > MAX_TIME_SLICE { - reschedule().await; - time_slice_start = Instant::now(); - } - row_count = 0; - } + while let Some((key, _)) = scanner.next().await? { let mut key = &key[..]; if is_common_handle { table::check_record_key(key)?; @@ -382,20 +368,19 @@ impl RowSampleBuilder { use tidb_query_datatype::{codec::collation::Collator, match_template_collator}; let mut is_drained = false; - let mut time_slice_start = Instant::now(); let mut collector = self.new_collector(); while !is_drained { - let time_slice_elapsed = time_slice_start.saturating_elapsed(); - if time_slice_elapsed > MAX_TIME_SLICE { - reschedule().await; - time_slice_start = Instant::now(); - } - let mut sample = self.quota_limiter.new_sample(!self.is_auto_analyze); let mut read_size: usize = 0; { + let result = { + let (duration, res) = sample + .observe_cpu_async(self.data.next_batch(BATCH_MAX_SIZE)) + .await; + sample.add_cpu_time(duration); + res + }; let _guard = sample.observe_cpu(); - let result = self.data.next_batch(BATCH_MAX_SIZE); is_drained = result.is_drained?; let columns_slice = result.physical_columns.as_slice(); @@ -888,17 +873,11 @@ impl SampleBuilder { columns_without_handle_len ]; let mut is_drained = false; - let mut time_slice_start = Instant::now(); let mut common_handle_hist = Histogram::new(self.max_bucket_size); let mut common_handle_cms = CmSketch::new(self.cm_sketch_depth, self.cm_sketch_width); let mut common_handle_fms = FmSketch::new(self.max_fm_sketch_size); while !is_drained { - let time_slice_elapsed = time_slice_start.saturating_elapsed(); - if time_slice_elapsed > MAX_TIME_SLICE { - reschedule().await; - time_slice_start = Instant::now(); - } - let result = self.data.next_batch(BATCH_MAX_SIZE); + let result = self.data.next_batch(BATCH_MAX_SIZE).await; is_drained = result.is_drained?; let mut columns_slice = result.physical_columns.as_slice(); diff --git 
a/tests/Cargo.toml b/tests/Cargo.toml index 11dbfc09f2f..14bf818aaf0 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -74,6 +74,7 @@ portable = ["tikv/portable"] [dependencies] api_version = { path = "../components/api_version", default-features = false } +async-trait = "0.1" batch-system = { path = "../components/batch-system", default-features = false } cdc = { path = "../components/cdc", default-features = false } collections = { path = "../components/collections" } diff --git a/tests/benches/coprocessor_executors/index_scan/util.rs b/tests/benches/coprocessor_executors/index_scan/util.rs index 19c2be94195..7531fb68944 100644 --- a/tests/benches/coprocessor_executors/index_scan/util.rs +++ b/tests/benches/coprocessor_executors/index_scan/util.rs @@ -3,6 +3,7 @@ use std::{marker::PhantomData, sync::Arc}; use criterion::black_box; +use futures::executor::block_on; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; use tidb_query_datatype::expr::EvalConfig; @@ -48,7 +49,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan .unwrap(); // There is a step of building scanner in the first `next()` which cost time, // so we next() before hand. - executor.next_batch(1); + block_on(executor.next_batch(1)); Box::new(executor) as Box> } } diff --git a/tests/benches/coprocessor_executors/table_scan/util.rs b/tests/benches/coprocessor_executors/table_scan/util.rs index 7bcfe436d62..2fe7c4fc4c0 100644 --- a/tests/benches/coprocessor_executors/table_scan/util.rs +++ b/tests/benches/coprocessor_executors/table_scan/util.rs @@ -3,6 +3,7 @@ use std::{marker::PhantomData, sync::Arc}; use criterion::black_box; +use futures::executor::block_on; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; use tidb_query_datatype::expr::EvalConfig; @@ -48,7 +49,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchTableScan .unwrap(); // There is a step of building scanner in the first `next()` which cost time, // so we next() before hand. 
- executor.next_batch(1); + block_on(executor.next_batch(1)); Box::new(executor) as Box> } } diff --git a/tests/benches/coprocessor_executors/util/bencher.rs b/tests/benches/coprocessor_executors/util/bencher.rs index 64862582bd8..246510f991b 100644 --- a/tests/benches/coprocessor_executors/util/bencher.rs +++ b/tests/benches/coprocessor_executors/util/bencher.rs @@ -32,7 +32,7 @@ impl E> Bencher for BatchNext1024Bencher { |executor| { profiler::start("./BatchNext1024Bencher.profile"); let iter_times = black_box(1024); - let r = black_box(executor.next_batch(iter_times)); + let r = black_box(block_on(executor.next_batch(iter_times))); r.is_drained.unwrap(); profiler::stop(); }, @@ -62,7 +62,7 @@ impl E> Bencher for BatchNextAllBencher { |executor| { profiler::start("./BatchNextAllBencher.profile"); loop { - let r = executor.next_batch(1024); + let r = block_on(executor.next_batch(1024)); black_box(&r); if r.is_drained.unwrap() { break; diff --git a/tests/benches/coprocessor_executors/util/fixture.rs b/tests/benches/coprocessor_executors/util/fixture.rs index 5910ab4fc69..24062c7a2da 100644 --- a/tests/benches/coprocessor_executors/util/fixture.rs +++ b/tests/benches/coprocessor_executors/util/fixture.rs @@ -2,6 +2,7 @@ use std::str::FromStr; +use async_trait::async_trait; use criterion::measurement::Measurement; use rand::{seq::SliceRandom, Rng, SeedableRng}; use rand_xorshift::XorShiftRng; @@ -283,6 +284,7 @@ pub struct BatchFixtureExecutor { columns: Vec, } +#[async_trait] impl BatchExecutor for BatchFixtureExecutor { type StorageStats = Statistics; @@ -292,7 +294,7 @@ impl BatchExecutor for BatchFixtureExecutor { } #[inline] - fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { let mut columns = Vec::with_capacity(self.columns.len()); for col in &mut self.columns { let mut column = LazyBatchColumn::raw_with_capacity(scan_rows); diff --git 
a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index 2983414b9cc..66df6b2832c 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -2,6 +2,7 @@ use std::u64; +use futures::executor::block_on; use kvproto::{ coprocessor::{KeyRange, Request}, kvrpcpb::{Context, IsolationLevel}, @@ -88,7 +89,7 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> let mut checksum = 0; let digest = crc64fast::Digest::new(); - while let Some((k, v)) = scanner.next().unwrap() { + while let Some((k, v)) = block_on(scanner.next()).unwrap() { let mut digest = digest.clone(); digest.write(&k); digest.write(&v); From 90c4a0602040102fabf45ea8e8bfac33f4472d07 Mon Sep 17 00:00:00 2001 From: YangKeao Date: Thu, 1 Sep 2022 05:08:24 -0400 Subject: [PATCH 181/676] copr: add json opaque value and implement conversion, comparison... (#13342) close tikv/tikv#13340 Signed-off-by: YangKeao Co-authored-by: Liqi Geng --- Cargo.lock | 1 + components/tidb_query_datatype/Cargo.toml | 1 + .../tidb_query_datatype/src/codec/convert.rs | 4 +- .../src/codec/mysql/json/binary.rs | 8 + .../src/codec/mysql/json/comparison.rs | 10 ++ .../src/codec/mysql/json/jcodec.rs | 19 ++- .../src/codec/mysql/json/json_type.rs | 17 +++ .../src/codec/mysql/json/mod.rs | 30 +++- .../src/codec/mysql/json/modifier.rs | 3 +- .../src/codec/mysql/json/serde.rs | 11 ++ .../src/codec/row/v2/encoder_for_test.rs | 2 +- components/tidb_query_expr/src/impl_cast.rs | 141 ++++++++++++++++-- 12 files changed, 225 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7ed11da4cd7..e76166d88c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5928,6 +5928,7 @@ dependencies = [ name = "tidb_query_datatype" version = "0.0.1" dependencies = [ + "base64", "bitfield", "bitflags", "boolinator", diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index 
2e748d26d8d..7eb9a296ac2 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "Data type of a query engine to run TiDB pushed down executors" [dependencies] +base64 = "0.13" bitfield = "0.13.2" bitflags = "1.0.1" boolinator = "2.4.0" diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 67620510ef8..efd99f5317a 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -510,7 +510,7 @@ impl<'a> ToInt for JsonRef<'a> { // TiDB: 5 // MySQL: 4 let val = match self.get_type() { - JsonType::Object | JsonType::Array => Ok(ctx + JsonType::Object | JsonType::Array | JsonType::Opaque => Ok(ctx .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as i64)), @@ -526,7 +526,7 @@ impl<'a> ToInt for JsonRef<'a> { #[inline] fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let val = match self.get_type() { - JsonType::Object | JsonType::Array => Ok(ctx + JsonType::Object | JsonType::Array | JsonType::Opaque => Ok(ctx .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as u64)), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index af66980460e..9b8264ee3fb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -80,6 +80,14 @@ impl<'a> JsonRef<'a> { &self.value()[val_offset..val_offset + str_len as usize + len_len], ) } + JsonType::Opaque => { + let (opaque_bytes_len, len_len) = + NumberCodec::try_decode_var_u64(&self.value()[val_offset + 1..])?; + 
JsonRef::new( + val_type, + &self.value()[val_offset..val_offset + opaque_bytes_len as usize + len_len + 1], + ) + } _ => { let data_size = NumberCodec::decode_u32_le(&self.value()[val_offset + ELEMENT_COUNT_LEN..]) diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index fe8bb2c35d7..f948a172ef0 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -37,6 +37,7 @@ impl<'a> JsonRef<'a> { .map_or(PRECEDENCE_NULL, |_| PRECEDENCE_BOOLEAN), JsonType::I64 | JsonType::U64 | JsonType::Double => PRECEDENCE_NUMBER, JsonType::String => PRECEDENCE_STRING, + JsonType::Opaque => PRECEDENCE_OPAQUE, } } @@ -140,6 +141,15 @@ impl<'a> PartialOrd for JsonRef<'a> { } Some(left_count.cmp(&right_count)) } + JsonType::Opaque => { + if let (Ok(left), Ok(right)) = + (self.get_opaque_bytes(), right.get_opaque_bytes()) + { + left.partial_cmp(right) + } else { + return None; + } + } }; } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs index 4e4094f0ae3..51ca3ba0da0 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs @@ -5,7 +5,10 @@ use std::{collections::BTreeMap, convert::TryInto, f64, str}; use codec::{number::NumberCodec, prelude::*}; use super::{constants::*, Json, JsonRef, JsonType}; -use crate::codec::{Error, Result}; +use crate::{ + codec::{Error, Result}, + FieldTypeTp, +}; impl<'a> JsonRef<'a> { fn encoded_len(&self) -> usize { @@ -211,6 +214,14 @@ pub trait JsonEncoder: NumberEncoder { self.write_bytes(bytes)?; Ok(()) } + + fn write_json_opaque(&mut self, typ: FieldTypeTp, bytes: &[u8]) -> Result<()> { + self.write_u8(typ.to_u8().unwrap())?; + let bytes_len = bytes.len() as u64; + 
self.write_var_u64(bytes_len)?; + self.write_bytes(bytes)?; + Ok(()) + } } pub trait JsonDatumPayloadChunkEncoder: BufferWriter { @@ -243,6 +254,12 @@ pub trait JsonDecoder: NumberDecoder { } JsonType::I64 | JsonType::U64 | JsonType::Double => self.read_bytes(NUMBER_LEN)?, JsonType::Literal => self.read_bytes(LITERAL_LEN)?, + JsonType::Opaque => { + let value = self.bytes(); + // the first byte of opaque stores the MySQL type code + let (opaque_bytes_len, len_len) = NumberCodec::try_decode_var_u64(&value[1..])?; + self.read_bytes(opaque_bytes_len as usize + len_len + 1)? + } }; Ok(Json::new(tp, Vec::from(value))) } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs index c6fd25ec688..28c4d275471 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use super::{JsonRef, JsonType}; +use crate::FieldTypeTp; const JSON_TYPE_BOOLEAN: &[u8] = b"BOOLEAN"; const JSON_TYPE_NONE: &[u8] = b"NULL"; @@ -10,6 +11,9 @@ const JSON_TYPE_DOUBLE: &[u8] = b"DOUBLE"; const JSON_TYPE_STRING: &[u8] = b"STRING"; const JSON_TYPE_OBJECT: &[u8] = b"OBJECT"; const JSON_TYPE_ARRAY: &[u8] = b"ARRAY"; +const JSON_TYPE_BIT: &[u8] = b"BIT"; +const JSON_TYPE_BLOB: &[u8] = b"BLOB"; +const JSON_TYPE_OPAQUE: &[u8] = b"OPAQUE"; impl<'a> JsonRef<'a> { /// `json_type` is the implementation for @@ -26,6 +30,19 @@ impl<'a> JsonRef<'a> { Some(_) => JSON_TYPE_BOOLEAN, None => JSON_TYPE_NONE, }, + JsonType::Opaque => match self.get_opaque_type() { + Ok( + FieldTypeTp::TinyBlob + | FieldTypeTp::MediumBlob + | FieldTypeTp::LongBlob + | FieldTypeTp::Blob + | FieldTypeTp::String + | FieldTypeTp::VarString + | FieldTypeTp::VarChar, + ) => JSON_TYPE_BLOB, + Ok(FieldTypeTp::Bit) => JSON_TYPE_BIT, + _ => JSON_TYPE_OPAQUE, + }, } } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index 480ac5db129..c4e3a9ebf5c 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -90,11 +90,12 @@ use super::super::{datum::Datum, Error, Result}; use crate::{ codec::{ convert::ConvertTo, - data_type::{Decimal, Real}, + data_type::{BytesRef, Decimal, Real}, mysql, mysql::{Duration, Time, TimeType}, }, expr::EvalContext, + FieldTypeTp, }; const ERR_CONVERT_FAILED: &str = "Can not covert from "; @@ -109,6 +110,10 @@ pub enum JsonType { U64 = 0x0a, Double = 0x0b, String = 0x0c, + + // It's a special value for the compatibility with MySQL. + // It will store the raw buffer containing unexpected type (e.g. Binary). + Opaque = 0x0d, } impl TryFrom for JsonType { @@ -206,6 +211,20 @@ impl<'a> JsonRef<'a> { Ok(str::from_utf8(self.get_str_bytes()?)?) 
} + // Returns the opaque value in bytes + pub(crate) fn get_opaque_bytes(&self) -> Result<&'a [u8]> { + assert_eq!(self.type_code, JsonType::Opaque); + let val = self.value(); + let (str_len, len_len) = NumberCodec::try_decode_var_u64(&val[1..])?; + Ok(&val[(len_len + 1)..len_len + 1 + str_len as usize]) + } + + pub(crate) fn get_opaque_type(&self) -> Result { + assert_eq!(self.type_code, JsonType::Opaque); + let val = self.value(); + FieldTypeTp::from_u8(val[0]).ok_or(box_err!("invalid opaque type code")) + } + // Return whether the value is zero. // https://dev.mysql.com/doc/refman/8.0/en/json.html#Converting%20between%20JSON%20and%20non-JSON%20values pub(crate) fn is_zero(&self) -> bool { @@ -217,6 +236,7 @@ impl<'a> JsonRef<'a> { JsonType::U64 => self.get_u64() == 0, JsonType::Double => self.get_double() == 0f64, JsonType::String => false, + JsonType::Opaque => false, } } @@ -284,6 +304,12 @@ impl Json { Ok(Self::new(JsonType::String, value)) } + pub fn from_opaque(typ: FieldTypeTp, bytes: BytesRef<'_>) -> Result { + let mut value = vec![]; + value.write_json_opaque(typ, bytes)?; + Ok(Self::new(JsonType::Opaque, value)) + } + /// Creates a `literal` JSON from a `bool` pub fn from_bool(b: bool) -> Result { let mut value = vec![]; @@ -414,7 +440,7 @@ impl<'a> ConvertTo for JsonRef<'a> { #[inline] fn convert(&self, ctx: &mut EvalContext) -> Result { let d = match self.get_type() { - JsonType::Array | JsonType::Object => ctx + JsonType::Array | JsonType::Object | JsonType::Opaque => ctx .handle_truncate_err(Error::truncated_wrong_val("Float", self.to_string())) .map(|_| 0f64)?, JsonType::U64 => self.get_u64() as f64, diff --git a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs index 8d1b5c0d453..8c88153defc 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs @@ -232,7 +232,8 @@ impl<'a> 
BinaryModifier<'a> { | JsonType::I64 | JsonType::U64 | JsonType::Double - | JsonType::String => { + | JsonType::String + | JsonType::Opaque => { buf.extend_from_slice(self.old.value); } JsonType::Object | JsonType::Array => { diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index b2b2f421bcb..d15f728ed10 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -107,6 +107,17 @@ impl<'a> Serialize for JsonRef<'a> { } tup.end() } + JsonType::Opaque => { + let bytes = self + .get_opaque_bytes() + .map_err(|_| SerError::custom("invalid opaque value"))?; + let typ = self + .get_opaque_type() + .map_err(|_| SerError::custom("invalid opaque type code"))?; + + let str = format!("base64:type{}:{}", typ, base64::encode(bytes)); + serializer.serialize_str(&str) + } } } } diff --git a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs index 1ee5104b723..bedbc7324ce 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs @@ -6,7 +6,7 @@ //! According to //! //! The row format is: -//! ``` +//! ```ignore //! | version | flag | number_of_non_null_columns | number_of_null_columns | non_null_column_ids | null_column_ids | value_offsets | values | //! |---------| ---- | -------------------------- | ---------------------- | ------------------- | --------------- | ------------- | ------ | //! 
``` diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 7fb118dfbec..50ea93d0ade 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -28,7 +28,9 @@ use tidb_query_datatype::{ }; use tipb::{Expr, FieldType}; -use crate::{types::RpnExpressionBuilder, RpnExpressionNode, RpnFnCallExtra, RpnFnMeta}; +use crate::{ + types::RpnExpressionBuilder, RpnExpressionNode, RpnFnCallExtra, RpnFnMeta, RpnStackNode, +}; fn get_cast_fn_rpn_meta( is_from_constant: bool, @@ -1288,13 +1290,30 @@ fn cast_uint_as_json(val: Option<&Int>) -> Result> { } } -#[rpn_fn(nullable, capture = [extra])] +#[rpn_fn(nullable, capture = [args, extra])] #[inline] -fn cast_string_as_json(extra: &RpnFnCallExtra<'_>, val: Option) -> Result> { +fn cast_string_as_json( + args: &[RpnStackNode<'_>], + extra: &RpnFnCallExtra<'_>, + val: Option, +) -> Result> { match val { None => Ok(None), Some(val) => { - if extra + let typ = args[0].field_type(); + if typ.is_binary_string_like() { + let mut buf = val; + + let mut vec; + if typ.tp() == FieldTypeTp::String { + vec = (*val).to_owned(); + // the `flen` of string is always greater than zero + vec.resize(typ.flen().try_into().unwrap(), 0); + buf = &vec; + } + + Ok(Some(Json::from_opaque(typ.tp(), buf)?)) + } else if extra .ret_field_type .as_accessor() .flag() @@ -1467,12 +1486,16 @@ fn cast_enum_as_time( } } -#[rpn_fn(nullable, capture = [extra])] +#[rpn_fn(nullable, capture = [args, extra])] #[inline] -fn cast_enum_as_json(extra: &RpnFnCallExtra, val: Option) -> Result> { +fn cast_enum_as_json( + args: &[RpnStackNode<'_>], + extra: &RpnFnCallExtra, + val: Option, +) -> Result> { match val { None => Ok(None), - Some(val) => cast_string_as_json(extra, Some(val.name())), + Some(val) => cast_string_as_json(args, extra, Some(val.name())), } } @@ -1557,6 +1580,24 @@ mod tests { assert!(r.is_none()); } + fn test_none_with_args_and_extra(func: F) + where + F: 
Fn(&[RpnStackNode<'_>], &RpnFnCallExtra, Option) -> Result>, + { + let value = ScalarValue::Bytes(None); + let field_type = FieldType::default(); + let args: [RpnStackNode<'_>; 1] = [RpnStackNode::Scalar { + value: &value, + field_type: &field_type, + }]; + let ret_field_type: FieldType = FieldType::default(); + let extra = RpnFnCallExtra { + ret_field_type: &ret_field_type, + }; + let r = func(&args, &extra, None).unwrap(); + assert!(r.is_none()); + } + fn test_none_with_metadata(func: F) where F: Fn(&tipb::InUnionMetadata, Option) -> Result>, @@ -2028,7 +2069,7 @@ mod tests { #[test] fn test_enum_as_json() { - test_none_with_extra(cast_enum_as_json); + test_none_with_args_and_extra(cast_enum_as_json); let mut jo1: BTreeMap = BTreeMap::new(); jo1.insert( @@ -2107,13 +2148,20 @@ mod tests { ), ]; for (input, expect, parse_to_json) in cs { + let arg_type = FieldType::default(); + let arg_value = ScalarValue::Enum(Some(input.to_owned())); + let args = [RpnStackNode::Scalar { + value: &arg_value, + field_type: &arg_type, + }]; + let mut rft = FieldType::default(); if parse_to_json { let fta = rft.as_mut_accessor(); fta.set_flag(FieldTypeFlag::PARSE_TO_JSON); } let extra = make_extra(&rft); - let result = cast_enum_as_json(&extra, Some(input)); + let result = cast_enum_as_json(&args, &extra, Some(input)); let result_str = result.as_ref().map(|x| x.as_ref().map(|x| x.to_string())); let log = format!( "input: {}, parse_to_json: {}, expect: {:?}, result: {:?}", @@ -6647,7 +6695,7 @@ mod tests { #[test] fn test_string_as_json() { - test_none_with_extra(cast_string_as_json); + test_none_with_args_and_extra(cast_string_as_json); let mut jo1: BTreeMap = BTreeMap::new(); jo1.insert( @@ -6657,16 +6705,19 @@ mod tests { // HasParseToJSONFlag let cs = vec![ ( + FieldType::default(), "{\"a\": \"b\"}".to_string(), Json::from_object(jo1).unwrap(), true, ), ( + FieldType::default(), "{}".to_string(), Json::from_object(BTreeMap::new()).unwrap(), true, ), ( + FieldType::default(), "[1, 
2, 3]".to_string(), Json::from_array(vec![ Json::from_i64(1).unwrap(), @@ -6677,49 +6728,109 @@ mod tests { true, ), ( + FieldType::default(), "[]".to_string(), Json::from_array(Vec::new()).unwrap(), true, ), ( + FieldType::default(), "9223372036854775807".to_string(), Json::from_i64(9223372036854775807).unwrap(), true, ), ( + FieldType::default(), "-9223372036854775808".to_string(), Json::from_i64(-9223372036854775808).unwrap(), true, ), ( + FieldType::default(), "18446744073709551615".to_string(), Json::from_f64(18446744073709552000.0).unwrap(), true, ), // FIXME: f64::MAX.to_string() to json should success // (f64::MAX.to_string(), Json::from_f64(f64::MAX), true), - ("0.0".to_string(), Json::from_f64(0.0).unwrap(), true), ( + FieldType::default(), + "0.0".to_string(), + Json::from_f64(0.0).unwrap(), + true, + ), + ( + FieldType::default(), "\"abcde\"".to_string(), Json::from_string("abcde".to_string()).unwrap(), true, ), ( + FieldType::default(), "\"\"".to_string(), Json::from_string("".to_string()).unwrap(), true, ), - ("true".to_string(), Json::from_bool(true).unwrap(), true), - ("false".to_string(), Json::from_bool(false).unwrap(), true), + ( + FieldType::default(), + "true".to_string(), + Json::from_bool(true).unwrap(), + true, + ), + ( + FieldType::default(), + "false".to_string(), + Json::from_bool(false).unwrap(), + true, + ), + ( + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .flen(4) + .charset(CHARSET_BIN) + .collation(Collation::Binary) + .build(), + "a".to_string(), + Json::from_opaque(FieldTypeTp::String, &[97, 0, 0, 0]).unwrap(), + true, + ), + ( + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .flen(256) + .charset(CHARSET_BIN) + .collation(Collation::Binary) + .build(), + "".to_string(), + Json::from_opaque(FieldTypeTp::String, &[0; 256]).unwrap(), + true, + ), + ( + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .flen(256) + .charset(CHARSET_BIN) + .collation(Collation::Binary) + .build(), + "a".to_string(), + 
Json::from_opaque(FieldTypeTp::String, &[97]).unwrap(), + true, + ), ]; - for (input, expect, parse_to_json) in cs { + for (arg_type, input, expect, parse_to_json) in cs { + let arg_value = ScalarValue::Bytes(Some(input.clone().into_bytes())); + let args = [RpnStackNode::Scalar { + value: &arg_value, + field_type: &arg_type, + }]; + let mut rft = FieldType::default(); if parse_to_json { let fta = rft.as_mut_accessor(); fta.set_flag(FieldTypeFlag::PARSE_TO_JSON); } let extra = make_extra(&rft); - let result = cast_string_as_json(&extra, Some(&input.clone().into_bytes())); + let result = cast_string_as_json(&args, &extra, Some(&input.clone().into_bytes())); let result_str = result.as_ref().map(|x| x.as_ref().map(|x| x.to_string())); let log = format!( "input: {}, parse_to_json: {}, expect: {:?}, result: {:?}", From aaf124e24bfceff0839ffed127b8fbe70a18321f Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 2 Sep 2022 11:18:24 +0800 Subject: [PATCH 182/676] *: Block reads, writes and schedules before finishing flashback (#13348) ref tikv/tikv#13303 Add Msg and peer's flashback state field Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- components/error_code/src/raftstore.rs | 2 + components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/fsm/peer.rs | 56 +++- components/raftstore/src/store/metrics.rs | 3 +- components/raftstore/src/store/msg.rs | 3 + components/raftstore/src/store/peer.rs | 57 +++- components/test_raftstore/src/cluster.rs | 24 +- components/tikv_kv/src/lib.rs | 2 + components/txn_types/src/types.rs | 4 + src/server/raftkv.rs | 15 +- .../txn/commands/acquire_pessimistic_lock.rs | 1 + .../txn/commands/flashback_to_version.rs | 1 + src/storage/txn/commands/prewrite.rs | 1 + src/storage/txn/scheduler.rs | 5 +- tests/integrations/raftstore/mod.rs | 1 + .../integrations/raftstore/test_flashback.rs | 289 ++++++++++++++++++ 16 files changed, 462 insertions(+), 11 deletions(-) create mode 100644 tests/integrations/raftstore/test_flashback.rs 
diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 2926c69c21e..29c4c3c1849 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -30,6 +30,8 @@ define_error_codes!( DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), + // TODO: add FLASHBACK in errorpb + FLASHBACK_IN_PROGRESS => ("RecoveryInProgress", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 89648de7731..1adaef08c3f 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -58,6 +58,9 @@ pub enum Error { #[error("region {0} is in the recovery progress")] RecoveryInProgress(u64), + #[error("region {0} is in the flashback progress")] + FlashbackInProgress(u64), + #[error( "key {} is not in region key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -241,6 +244,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_recovery_in_progress(e); } + Error::FlashbackInProgress(region_id) => { + let mut e = errorpb::RecoveryInProgress::default(); + e.set_region_id(region_id); + errorpb.set_recovery_in_progress(e); + } _ => {} }; @@ -275,6 +283,7 @@ impl ErrorCodeExt for Error { Error::NotLeader(..) => error_code::raftstore::NOT_LEADER, Error::DiskFull(..) => error_code::raftstore::DISK_FULL, Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, + Error::FlashbackInProgress(..) => error_code::raftstore::FLASHBACK_IN_PROGRESS, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) 
=> error_code::raftstore::KEY_NOT_IN_REGION, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index c587ea5f32c..eb79965d617 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -21,6 +21,7 @@ use collections::{HashMap, HashSet}; use engine_traits::{Engines, KvEngine, RaftEngine, SstMetaInfo, WriteBatchExt, CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; +use futures::channel::oneshot::Sender; use keys::{self, enc_end_key, enc_start_key}; use kvproto::{ errorpb, @@ -79,8 +80,8 @@ use crate::{ metrics::*, msg::{Callback, ExtCallback, InspectedRaftMessage}, peer::{ - ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, StaleState, - UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, + ConsistencyState, FlashbackState, ForceLeaderState, Peer, PersistSnapshotResult, + StaleState, UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, @@ -923,6 +924,38 @@ where syncer.report_for_self(self_report); } + // Call msg PrepareFlashback to stop the scheduling and RW tasks. + // Once called, it will wait for the channel's notification in FlashbackState to + // finish. We place a flag in the request, which is checked when the + // pre_propose_raft_command is called. Stopping tasks is done by applying + // the flashback-only command in this way, But for RW local reads which need + // to be considered, we let the leader lease to None to ensure that local reads + // are not executed. 
+ fn on_prepare_flashback(&mut self, ch: Sender) { + info!( + "prepare flashback"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer_id(), + ); + if self.fsm.peer.flashback_state.is_some() { + ch.send(false).unwrap(); + return; + } + self.fsm.peer.flashback_state = Some(FlashbackState::new(ch)); + // Let the leader lease to None to ensure that local reads are not executed. + self.fsm.peer.leader_lease_mut().expire_remote_lease(); + self.fsm.peer.maybe_finish_flashback_wait_apply(); + } + + fn on_finish_flashback(&mut self) { + info!( + "finish flashback"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer_id(), + ); + self.fsm.peer.flashback_state.take(); + } + fn on_casual_msg(&mut self, msg: CasualMessage) { match msg { CasualMessage::SplitRegion { @@ -1335,6 +1368,8 @@ where SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { self.on_unsafe_recovery_fill_out_report(syncer) } + SignificantMsg::PrepareFlashback(ch) => self.on_prepare_flashback(ch), + SignificantMsg::FinishFlashback => self.on_finish_flashback(), } } @@ -2172,6 +2207,10 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); } + // TODO: combine recovery state and flashback state as a wait apply queue. + if self.fsm.peer.flashback_state.is_some() { + self.fsm.peer.maybe_finish_flashback_wait_apply(); + } } fn retry_pending_prepare_merge(&mut self, applied_index: u64) { @@ -4737,12 +4776,23 @@ where return Ok(Some(resp)); } - // Check whether the store has the right peer to handle the request. let region_id = self.region_id(); + // When in the flashback state, we should not allow any other request to be + // proposed. 
+ if self.fsm.peer.flashback_state.is_some() { + self.ctx.raft_metrics.invalid_proposal.flashback.inc(); + let flags = WriteBatchFlags::from_bits_truncate(msg.get_header().get_flags()); + if !flags.contains(WriteBatchFlags::FLASHBACK) { + return Err(Error::FlashbackInProgress(self.region_id())); + } + } + + // Check whether the store has the right peer to handle the request. let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); if self.fsm.peer.force_leader.is_some() { + self.ctx.raft_metrics.invalid_proposal.force_leader.inc(); // in force leader state, forbid requests to make the recovery progress less // error-prone if !(msg.has_admin_request() diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index ad4ee7e7f98..719a2d8c09a 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -201,6 +201,8 @@ make_static_metric! { read_index_no_leader, region_not_initialized, is_applying_snapshot, + force_leader, + flashback, } pub label_enum RaftLogGcSkippedReason { @@ -271,7 +273,6 @@ make_static_metric! { pub struct LoadBaseSplitEventCounterVec: IntCounter { "type" => LoadBaseSplitEventType, } - } lazy_static! 
{ diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 251094e6475..bb8c2c0bd89 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -7,6 +7,7 @@ use std::{borrow::Cow, fmt}; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; +use futures::channel::oneshot::Sender; use kvproto::{ import_sstpb::SstMeta, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, @@ -456,6 +457,8 @@ where UnsafeRecoveryDestroy(UnsafeRecoveryExecutePlanSyncer), UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), + PrepareFlashback(Sender), + FinishFlashback, } /// Message that will be sent to a peer. diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 91698be98e9..53747f082e4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -23,7 +23,8 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use getset::Getters; +use futures::channel::oneshot::Sender; +use getset::{Getters, MutGetters}; use kvproto::{ errorpb, kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}, @@ -706,7 +707,33 @@ pub enum UnsafeRecoveryState { Destroy(UnsafeRecoveryExecutePlanSyncer), } -#[derive(Getters)] +// This state is set by the peer fsm when invoke msg PrepareFlashback. Once set, +// it is checked every time this peer applies a new entry or a snapshot, +// if the latest committed index is met, the syncer will be called to notify the +// result. 
+#[derive(Debug)] +pub struct FlashbackState(Option>); + +impl FlashbackState { + pub fn new(ch: Sender) -> Self { + FlashbackState(Some(ch)) + } + + pub fn finish_wait_apply(&mut self) { + if self.0.is_none() { + return; + } + let ch = self.0.take().unwrap(); + match ch.send(true) { + Ok(_) => {} + Err(e) => { + error!("Fail to notify flashback state"; "err" => ?e); + } + } + } +} + +#[derive(Getters, MutGetters)] pub struct Peer where EK: KvEngine, @@ -731,7 +758,7 @@ where proposals: ProposalQueue>, leader_missing_time: Option, - #[getset(get = "pub")] + #[getset(get = "pub", get_mut = "pub")] leader_lease: Lease, pending_reads: ReadIndexQueue>, /// Threshold of long uncommitted proposals. @@ -887,6 +914,7 @@ where /// lead_transferee if the peer is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, + pub flashback_state: Option, } impl Peer @@ -1018,6 +1046,7 @@ where last_region_buckets: None, lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, + flashback_state: None, }; // If this region has only one peer and I am the one, campaign directly. @@ -2378,6 +2407,10 @@ where debug!("unsafe recovery finishes applying a snapshot"); self.unsafe_recovery_maybe_finish_wait_apply(/* force= */ false); } + if self.flashback_state.is_some() { + debug!("flashback finishes applying a snapshot"); + self.maybe_finish_flashback_wait_apply(); + } } // If `apply_snap_ctx` is none, it means this snapshot does not // come from the ready but comes from the unfinished snapshot task @@ -3352,6 +3385,13 @@ where "peer_id" => self.peer.get_id(), ); None + } else if self.flashback_state.is_some() { + debug!( + "prevents renew lease while in flashback state"; + "region_id" => self.region_id, + "peer_id" => self.peer.get_id(), + ); + None } else { self.leader_lease.renew(ts); let term = self.term(); @@ -4272,6 +4312,7 @@ where // In `pre_propose_raft_command`, it rejects all the requests expect conf-change // if in force leader state. 
if self.force_leader.is_some() { + poll_ctx.raft_metrics.invalid_proposal.force_leader.inc(); panic!( "{} propose normal in force leader state {:?}", self.tag, self.force_leader @@ -4945,6 +4986,16 @@ where } } } + + pub fn maybe_finish_flashback_wait_apply(&mut self) { + let finished = + self.raft_group.raft.raft_log.applied == self.raft_group.raft.raft_log.last_index(); + if finished { + if let Some(flashback_state) = self.flashback_state.as_mut() { + flashback_state.finish_wait_apply(); + } + } + } } #[derive(Default, Debug)] diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 097e74f157b..79f0b8ef709 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -19,7 +19,7 @@ use engine_traits::{ WriteBatchExt, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; -use futures::executor::block_on; +use futures::{self, channel::oneshot, executor::block_on}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::{ApiVersion, Context}, @@ -1411,6 +1411,28 @@ impl Cluster { .unwrap(); } + pub async fn call_and_wait_prepare_flashback(&mut self, region_id: u64, store_id: u64) { + let router = self.sim.rl().get_router(store_id).unwrap(); + let (tx, rx) = oneshot::channel(); + + router + .significant_send(region_id, SignificantMsg::PrepareFlashback(tx)) + .unwrap(); + + let prepared = rx.await.unwrap(); + if !prepared { + panic!("prepare flashback failed"); + } + } + + pub fn call_finish_flashback(&mut self, region_id: u64, store_id: u64) { + let router = self.sim.rl().get_router(store_id).unwrap(); + + router + .significant_send(region_id, SignificantMsg::FinishFlashback) + .unwrap(); + } + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { let mut try_cnt = 0; let split_count = self.pd_client.get_split_count(); diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 466bd973906..64a05a98622 100644 --- 
a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -255,6 +255,8 @@ pub struct SnapContext<'a> { // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. pub key_ranges: Vec, + // Marks that this read is a FlashbackToVersionReadPhase. + pub for_flashback: bool, } /// Engine defines the common behaviour for a storage engine type. diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 75df337f80c..9496994f38f 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -513,6 +513,8 @@ pub struct TxnExtra { // Marks that this transaction is a 1PC transaction. RaftKv should set this flag // in the raft command request. pub one_pc: bool, + // Marks that this transaction is a flashback transaction. + pub for_flashback: bool, } impl TxnExtra { @@ -537,6 +539,8 @@ bitflags! { /// Indicates this request is a transfer leader command that needs to be proposed /// like a normal command. const TRANSFER_LEADER_PROPOSAL = 0b00000100; + /// Indicates this request is a flashback transaction. 
+ const FLASHBACK = 0b00001000; } } diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index 0a3f2fdd742..9443ba26cd4 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -201,14 +201,20 @@ where cb: Callback>, ) -> Result<()> { let mut header = self.new_request_header(ctx.pb_ctx); + let mut flags = 0; if ctx.pb_ctx.get_stale_read() && !ctx.start_ts.is_zero() { let mut data = [0u8; 8]; (&mut data[..]) .encode_u64(ctx.start_ts.into_inner()) .unwrap(); - header.set_flags(WriteBatchFlags::STALE_READ.bits()); + flags |= WriteBatchFlags::STALE_READ.bits(); header.set_flag_data(data.into()); } + if ctx.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); cmd.set_requests(vec![req].into()); @@ -252,9 +258,14 @@ where let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); let txn_extra = batch.extra; let mut header = self.new_request_header(ctx); + let mut flags = 0; if txn_extra.one_pc { - header.set_flags(WriteBatchFlags::ONE_PC.bits()); + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if txn_extra.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); } + header.set_flags(flags); let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 3632d847e59..949b347f251 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -150,6 +150,7 @@ impl WriteCommand for AcquirePessimisticLock old_values: self.old_values, // One pc status is unkown AcquirePessimisticLock stage. 
one_pc: false, + for_flashback: false, }; let write_data = WriteData::new(txn.into_modifies(), extra); (pr, write_data, rows, ctx, None) diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 058758888d5..3bb6f3aa268 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -107,6 +107,7 @@ impl WriteCommand for FlashbackToVersion { let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); + write_data.extra.for_flashback = true; Ok(WriteResult { ctx: self.ctx.clone(), to_be_write: write_data, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index deca5733eb0..333d3eb1aca 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -670,6 +670,7 @@ impl Prewriter { old_values: self.old_values, // Set one_pc flag in TxnExtra to let CDC skip handling the resolver. one_pc: self.try_one_pc, + for_flashback: false, }; // Here the lock guards are taken and will be released after the write finishes. // If an error (KeyIsLocked or WriteConflict) occurs before, these lock guards diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 0bad0078821..a72bd671d0a 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -564,10 +564,13 @@ impl Scheduler { let tag = task.cmd.tag(); SCHED_STAGE_COUNTER_VEC.get(tag).snapshot.inc(); - let snap_ctx = SnapContext { + let mut snap_ctx = SnapContext { pb_ctx: task.cmd.ctx(), ..Default::default() }; + if let Command::FlashbackToVersionReadPhase { .. } = task.cmd { + snap_ctx.for_flashback = true; + } // The program is currently in scheduler worker threads. // Safety: `self.inner.worker_pool` should ensure that a TLS engine exists. 
match unsafe { with_tls_engine(|engine: &E| kv::snapshot(engine, snap_ctx)) }.await diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index efa118fb8f1..d34aae05e77 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -7,6 +7,7 @@ mod test_compact_lock_cf; mod test_compact_log; mod test_conf_change; mod test_early_apply; +mod test_flashback; mod test_hibernate; mod test_joint_consensus; mod test_lease_read; diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs new file mode 100644 index 00000000000..e4d0276f9e6 --- /dev/null +++ b/tests/integrations/raftstore/test_flashback.rs @@ -0,0 +1,289 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use futures::executor::block_on; +use kvproto::metapb; +use test_raftstore::*; +use txn_types::WriteBatchFlags; + +#[test] +fn test_flahsback_for_applied_index() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + // write for cluster. + let value = vec![1_u8; 8096]; + multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + + // prepare for flashback + let region = cluster.get_region(b"k1"); + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + + let last_index = cluster + .raft_local_state(region.get_id(), 1) + .get_last_index(); + let appied_index = cluster.apply_state(region.get_id(), 1).get_applied_index(); + + assert_eq!(last_index, appied_index); +} + +#[test] +fn test_flashback_for_schedule() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + cluster.must_transfer_leader(1, new_peer(2, 2)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + // prepare for flashback + let region = cluster.get_region(b"k1"); + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + + // verify the schedule is unabled. 
+ let mut region = cluster.get_region(b"k3"); + let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); + let mut transfer_leader = + new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); + transfer_leader.mut_header().set_peer(new_peer(1, 1)); + let resp = cluster + .call_command_on_leader(transfer_leader, Duration::from_secs(3)) + .unwrap(); + let e = resp.get_header().get_error(); + // reuse recovery_in_progress error code. + assert_eq!( + e.get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); + + // verify the schedule can be executed if add flashback flag in request's + // header. + let mut region = cluster.get_region(b"k3"); + let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); + let mut transfer_leader = + new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); + transfer_leader.mut_header().set_peer(new_peer(1, 1)); + transfer_leader + .mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let resp = cluster + .call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap(); + assert!(!resp.get_header().has_error()); + + cluster.call_finish_flashback(region.get_id(), 1); + // transfer leader to (1, 1) + cluster.must_transfer_leader(1, new_peer(1, 1)); +} + +#[test] +fn test_flahsback_for_write() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + // write for cluster + let value = vec![1_u8; 8096]; + multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + + // prepare for flashback + let region = cluster.get_region(b"k1"); + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + + // write will be blocked + let value = vec![1_u8; 8096]; + must_get_error_recovery_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); + + must_cmd_add_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cmd(b"k1", &value), + ); + + 
cluster.call_finish_flashback(region.get_id(), 1); + + multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); +} + +#[test] +fn test_flahsback_for_read() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + // write for cluster + let value = vec![1_u8; 8096]; + multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + // read for cluster + multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); + + // prepare for flashback + let region = cluster.get_region(b"k1"); + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + + // read will be blocked + must_get_error_recovery_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); + + // verify the read can be executed if add flashback flag in request's + // header. + must_cmd_add_flashback_flag( + &mut cluster, + &mut region.clone(), + new_get_cf_cmd("write", b"k1"), + ); + + cluster.call_finish_flashback(region.get_id(), 1); + + multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); +} + +// LocalReader will attempt to renew the lease. +// However, when flashback is enabled, it will make the lease None and prevent +// renew lease. +#[test] +fn test_flahsback_for_local_read() { + let mut cluster = new_node_cluster(0, 3); + let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); + + // Avoid triggering the log compaction in this test case. + cluster.cfg.raft_store.raft_log_gc_threshold = 100; + + let node_id = 3u64; + let store_id = 3u64; + let peer = new_peer(store_id, node_id); + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + let region = cluster.get_region(b"k1"); + cluster.must_transfer_leader(region.get_id(), peer.clone()); + + // check local read before prepare flashback + let state = cluster.raft_local_state(region.get_id(), store_id); + let last_index = state.get_last_index(); + // Make sure the leader transfer procedure timeouts. 
+ std::thread::sleep(election_timeout * 2); + must_read_on_peer(&mut cluster, peer.clone(), region.clone(), b"k1", b"v1"); + // Check the leader does a local read. + let state = cluster.raft_local_state(region.get_id(), store_id); + assert_eq!(state.get_last_index(), last_index); + + // prepare for flashback + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), store_id)); + + must_error_read_on_peer( + &mut cluster, + peer.clone(), + region.clone(), + b"k1", + Duration::from_secs(1), + ); + + // Wait for the leader's lease to expire to ensure that a renew lease interval + // has elapsed. + std::thread::sleep(election_timeout * 2); + must_error_read_on_peer( + &mut cluster, + peer.clone(), + region.clone(), + b"k1", + Duration::from_secs(1), + ); + + // Also check read by propose was blocked + let state = cluster.raft_local_state(region.get_id(), store_id); + assert_eq!(state.get_last_index(), last_index); + + cluster.call_finish_flashback(region.get_id(), store_id); + + // check local read after finish flashback + let state = cluster.raft_local_state(region.get_id(), store_id); + let last_index = state.get_last_index(); + // Make sure the leader transfer procedure timeouts. + std::thread::sleep(election_timeout * 2); + must_read_on_peer(&mut cluster, peer, region.clone(), b"k1", b"v1"); + + // Check the leader does a local read. 
+ let state = cluster.raft_local_state(region.get_id(), store_id); + assert_eq!(state.get_last_index(), last_index); +} + +#[test] +fn test_flahsback_for_status_cmd_as_region_detail() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + let region = cluster.get_region(b"k1"); + block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + + let leader = cluster.leader_of_region(1).unwrap(); + let region_detail = cluster.region_detail(1, 1); + assert!(region_detail.has_region()); + let region = region_detail.get_region(); + assert_eq!(region.get_id(), 1); + assert!(region.get_start_key().is_empty()); + assert!(region.get_end_key().is_empty()); + assert_eq!(region.get_peers().len(), 3); + let epoch = region.get_region_epoch(); + assert_eq!(epoch.get_conf_ver(), 1); + assert_eq!(epoch.get_version(), 1); + + assert!(region_detail.has_leader()); + assert_eq!(region_detail.get_leader(), &leader); +} + +fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb::Request) { + for _ in 0..100 { + let mut reqs = vec![]; + for _ in 0..100 { + reqs.push(cmd.clone()); + } + cluster.batch_put(b"k1", reqs).unwrap(); + } +} + +fn must_cmd_add_flashback_flag( + cluster: &mut Cluster, + region: &mut metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + // verify the read can be executed if add flashback flag in request's + // header. 
+ let mut req = new_request( + region.get_id(), + region.take_region_epoch(), + vec![cmd], + false, + ); + let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); + req.mut_header().set_peer(new_leader.unwrap()); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let resp = cluster.call_command(req, Duration::from_secs(5)).unwrap(); + assert!(!resp.get_header().has_error()); +} + +fn must_get_error_recovery_in_progress( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + for _ in 0..100 { + let mut reqs = vec![]; + for _ in 0..100 { + reqs.push(cmd.clone()); + } + match cluster.batch_put(b"k1", reqs) { + Ok(_) => {} + Err(e) => { + assert_eq!( + e.get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); + } + } + } +} From a1d7b93635c06608d5b00592a6627c8ab5ad8a58 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 2 Sep 2022 11:46:24 +0800 Subject: [PATCH 183/676] engine: default enable raft engine log recycling (#13372) ref tikv/tikv#13229 Signed-off-by: Lucasliang Co-authored-by: Xinye Tao --- Cargo.lock | 204 +++++++++++++++++++++------------------ etc/config-template.toml | 8 +- 2 files changed, 113 insertions(+), 99 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e76166d88c8..9463bbd717b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -209,7 +209,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" dependencies = [ "error-code", - "libc 0.2.125", + "libc 0.2.132", "wasm-bindgen", "winapi 0.3.9", ] @@ -240,15 +240,15 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] [[package]] name = "autocfg" -version = "1.0.0" +version = 
"1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws" @@ -383,7 +383,7 @@ dependencies = [ "addr2line", "cc", "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.132", "miniz_oxide 0.4.4", "object", "rustc-demangle", @@ -533,7 +533,7 @@ dependencies = [ "bcc-sys", "bitflags", "byteorder", - "libc 0.2.125", + "libc 0.2.132", "regex", "thiserror", ] @@ -665,7 +665,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.132", "pkg-config", ] @@ -691,7 +691,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7f788eaf239475a3c1e1acf89951255a46c4b9b46cf3e866fc4d0707b4b9e36" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "valgrind_request", ] @@ -861,7 +861,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1" dependencies = [ "glob", - "libc 0.2.125", + "libc 0.2.132", "libloading", ] @@ -946,7 +946,7 @@ dependencies = [ "byteorder", "bytes", "error_code", - "libc 0.2.125", + "libc 0.2.132", "panic_hook", "protobuf", "rand 0.8.3", @@ -1005,7 +1005,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -1020,7 +1020,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -1078,7 +1078,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "63aaaf47e457badbcb376c65a49d0f182c317ebd97dc6d1ced94c8e1d09c0f3a" dependencies = [ "criterion", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -1348,7 +1348,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "redox_users", "winapi 0.3.9", ] @@ -1601,7 +1601,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "str-buf", ] @@ -1703,10 +1703,10 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "libloading", "matches", - "nix", + "nix 0.24.1", "once_cell", "protobuf", "rust-ini", @@ -1759,7 +1759,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "fs2", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "maligned", "online_config", "openssl", @@ -1784,7 +1784,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "thiserror", "winapi 0.3.9", ] @@ -1796,7 +1796,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.132", "redox_syscall 0.2.11", "winapi 0.3.9", ] @@ -1809,7 +1809,7 @@ checksum = "d691fdb3f817632d259d09220d4cf0991dbb2c9e59e044a02a59194bf6e14484" dependencies = [ "cc", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -1837,7 +1837,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2adaffba6388640136149e18ed080b77a78611c1e1d6de75aedcdf78df5d4682" 
dependencies = [ "crc32fast", - "libc 0.2.125", + "libc 0.2.132", "libz-sys", "miniz_oxide 0.3.7", ] @@ -1878,7 +1878,7 @@ name = "fs2" version = "0.4.3" source = "git+https://github.com/tabokie/fs2-rs?branch=tikv#cd503764a19a99d74c1ab424dd13d6bcd093fcae" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -1904,7 +1904,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2139,7 +2139,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.125", + "libc 0.2.132", "wasi 0.7.0", ] @@ -2151,7 +2151,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", - "libc 0.2.125", + "libc 0.2.132", "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2200,7 +2200,7 @@ dependencies = [ "futures-executor", "futures-util", "grpcio-sys", - "libc 0.2.125", + "libc 0.2.132", "log", "parking_lot 0.11.1", "protobuf", @@ -2237,7 +2237,7 @@ dependencies = [ "bindgen 0.59.2", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.132", "libz-sys", "openssl-sys", "pkg-config", @@ -2305,7 +2305,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2507,7 +2507,7 @@ checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" dependencies = [ "bitflags", "inotify-sys", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2516,7 +2516,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2543,7 +2543,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2589,7 +2589,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b1d42ef453b30b7387e113da1c83ab1605d90c5b4e0eb8e96d016ed3b8c160" dependencies = [ "getrandom 0.1.12", - "libc 0.2.125", + "libc 0.2.132", "log", ] @@ -2720,9 +2720,9 @@ checksum = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122" [[package]] name = "libc" -version = "0.2.125" +version = "0.2.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" +checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" [[package]] name = "libfuzzer-sys" @@ -2762,7 +2762,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.132", "libtitan_sys", "libz-sys", "lz4-sys", @@ -2780,7 +2780,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.125", + "libc 0.2.132", "libz-sys", "lz4-sys", "snappy-sys", @@ -2794,7 +2794,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.132", "pkg-config", "vcpkg", ] @@ -2850,7 +2850,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2899,7 +2899,7 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2908,7 +2908,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -2918,7 +2918,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -2989,7 +2989,7 @@ dependencies = [ "fuchsia-zircon-sys", "iovec", "kernel32-sys", - "libc 0.2.125", + "libc 0.2.132", "log", "miow 0.2.2", "net2", @@ -3003,7 +3003,7 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "log", "miow 0.3.7", "ntapi", @@ -3101,7 +3101,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "log", "openssl", "openssl-probe", @@ -3119,7 +3119,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -3131,8 +3131,22 @@ checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" dependencies = [ "bitflags", "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.132", + "memoffset", +] + +[[package]] +name = "nix" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +dependencies = [ + 
"autocfg", + "bitflags", + "cfg-if 1.0.0", + "libc 0.2.132", "memoffset", + "pin-utils", ] [[package]] @@ -3189,7 +3203,7 @@ dependencies = [ "fsevent", "fsevent-sys", "inotify", - "libc 0.2.125", + "libc 0.2.132", "mio 0.6.23", "mio-extras", "walkdir", @@ -3342,7 +3356,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -3420,7 +3434,7 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", - "libc 0.2.125", + "libc 0.2.132", "once_cell", "openssl-macros", "openssl-sys", @@ -3460,7 +3474,7 @@ checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", - "libc 0.2.125", + "libc 0.2.132", "openssl-src", "pkg-config", "vcpkg", @@ -3490,7 +3504,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -3527,7 +3541,7 @@ checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" dependencies = [ "cfg-if 1.0.0", "instant", - "libc 0.2.125", + "libc 0.2.132", "redox_syscall 0.2.11", "smallvec", "winapi 0.3.9", @@ -3540,7 +3554,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.132", "redox_syscall 0.2.11", "smallvec", "windows-sys", @@ -3616,7 +3630,7 @@ checksum = "b8f94885300e262ef461aa9fd1afbf7df3caf9e84e271a74925d1c6c8b24830f" dependencies = [ "bitflags", "byteorder", - "libc 0.2.125", + "libc 0.2.132", "mmap", "nom 4.2.3", "phf", @@ -3759,7 +3773,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d27361d7578b410d0eb5fe815c2b2105b01ab770a7c738cb9a231457a809fcc7" dependencies = [ "ipnetwork", - "libc 0.2.125", + "libc 0.2.132", "pnet_base", "pnet_sys", "winapi 0.2.8", @@ -3771,7 +3785,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82f881a6d75ac98c5541db6144682d1773bb14c6fc50c6ebac7086c8f7f23c29" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.2.8", "ws2_32-sys", ] @@ -3785,9 +3799,9 @@ dependencies = [ "cfg-if 1.0.0", "findshlibs", "inferno", - "libc 0.2.125", + "libc 0.2.132", "log", - "nix", + "nix 0.24.1", "once_cell", "parking_lot 0.12.0", "protobuf", @@ -3859,7 +3873,7 @@ dependencies = [ "byteorder", "hex 0.4.2", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -3868,7 +3882,7 @@ version = "0.4.2" source = "git+https://github.com/tikv/procinfo-rs?rev=6599eb9dca74229b2c1fcc44118bef7eff127128#6599eb9dca74229b2c1fcc44118bef7eff127128" dependencies = [ "byteorder", - "libc 0.2.125", + "libc 0.2.132", "nom 2.2.1", "rustc_version 0.2.3", ] @@ -3893,7 +3907,7 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "memchr", "parking_lot 0.11.1", "protobuf", @@ -4042,7 +4056,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#6a6fe3bd2e0a1ca0b4fc643800ddc93abe74cd87" +source = "git+https://github.com/tikv/raft-engine.git#9751c6dd5c20a056570c9fbfe62bad6e0d585094" dependencies = [ "byteorder", "crc32fast", @@ -4053,11 +4067,11 @@ dependencies = [ "hex 0.4.2", "if_chain", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "log", "lz4-sys", "memmap2", - "nix", + "nix 0.25.0", "num-derive", "num-traits", "parking_lot 0.12.0", @@ -4076,7 +4090,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.2.0" -source = "git+https://github.com/tikv/raft-engine.git#6a6fe3bd2e0a1ca0b4fc643800ddc93abe74cd87" +source = 
"git+https://github.com/tikv/raft-engine.git#9751c6dd5c20a056570c9fbfe62bad6e0d585094" dependencies = [ "clap 3.1.6", "env_logger", @@ -4224,7 +4238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", - "libc 0.2.125", + "libc 0.2.132", "rand_core 0.3.1", "rdrand", "winapi 0.3.9", @@ -4237,7 +4251,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.12", - "libc 0.2.125", + "libc 0.2.132", "rand_chacha 0.2.1", "rand_core 0.5.1", "rand_hc 0.2.0", @@ -4249,7 +4263,7 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "rand_chacha 0.3.0", "rand_core 0.6.2", "rand_hc 0.3.0", @@ -4529,7 +4543,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "log", "online_config", "pdqselect", @@ -4592,7 +4606,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.132", "once_cell", "spin", "untrusted", @@ -4605,7 +4619,7 @@ name = "rocksdb" version = "0.3.0" source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "librocksdb_sys", ] @@ -4877,7 +4891,7 @@ dependencies = [ "bitflags", "core-foundation", "core-foundation-sys", - "libc 0.2.125", + "libc 0.2.132", "security-framework-sys", ] @@ -4888,7 +4902,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" dependencies = [ "core-foundation-sys", - 
"libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -5084,7 +5098,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.125", + "libc 0.2.132", "log", "log_wrappers", "pd_client", @@ -5142,7 +5156,7 @@ version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "signal-hook-registry", ] @@ -5152,7 +5166,7 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -5255,7 +5269,7 @@ version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", - "libc 0.2.125", + "libc 0.2.132", "pkg-config", ] @@ -5283,7 +5297,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "winapi 0.3.9", ] @@ -5487,7 +5501,7 @@ dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", "doc-comment", - "libc 0.2.125", + "libc 0.2.132", "ntapi", "once_cell", "rayon", @@ -5570,7 +5584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.125", + "libc 0.2.132", "rand 0.8.3", "redox_syscall 0.2.11", "remove_dir_all", @@ -5786,7 +5800,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.125", + "libc 0.2.132", "log_wrappers", "more-asserts", "online_config", @@ -6078,7 +6092,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "libloading", "log", "log_wrappers", @@ -6174,7 +6188,7 @@ dependencies = [ "hex 0.4.2", 
"keys", "kvproto", - "libc 0.2.125", + "libc 0.2.132", "log", "log_wrappers", "pd_client", @@ -6209,7 +6223,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "paste", "tikv-jemalloc-sys", ] @@ -6222,7 +6236,7 @@ checksum = "aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -6231,7 +6245,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "tikv-jemalloc-sys", ] @@ -6254,7 +6268,7 @@ version = "0.1.0" dependencies = [ "fxhash", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "mimalloc", "snmalloc-rs", "tcmalloc", @@ -6320,10 +6334,10 @@ dependencies = [ "http", "kvproto", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", "log", "log_wrappers", - "nix", + "nix 0.24.1", "num-traits", "num_cpus", "online_config", @@ -6367,7 +6381,7 @@ version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" dependencies = [ - "libc 0.2.125", + "libc 0.2.132", "redox_syscall 0.1.56", "winapi 0.3.9", ] @@ -6409,7 +6423,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" dependencies = [ "bytes", - "libc 0.2.125", + "libc 0.2.132", "memchr", "mio 0.8.0", "num_cpus", @@ -6800,7 +6814,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" dependencies = [ "kernel32-sys", - "libc 0.2.125", + "libc 0.2.132", "winapi 0.2.8", ] @@ -6983,7 +6997,7 @@ checksum = 
"2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" dependencies = [ "either", "lazy_static", - "libc 0.2.125", + "libc 0.2.132", ] [[package]] @@ -7155,5 +7169,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "libc 0.2.125", + "libc 0.2.132", ] diff --git a/etc/config-template.toml b/etc/config-template.toml index 558612151ec..674eaa1a149 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1092,16 +1092,16 @@ ## 1: Can be read by TiKV release 6.1 and above. ## 2: Can be read by TiKV release 6.3 and above. Supports log recycling. ## -## Default: 1. -# format-version = 1 +## Default: 2. +# format-version = 2 ## Whether to recycle stale log files in Raft Engine. ## If `true`, logically purged log files will be reserved for recycling. ## Only available for `format-version` >= 2. This option is only ## available when TiKV >= 6.3.x. ## -## Default: false. -# enable-log-recycle = false +## Default: true. +# enable-log-recycle = true [security] ## The path for TLS certificates. Empty string means disabling secure connections. From e8679338dcf63f152dbc525f4c7d44bbc0743df4 Mon Sep 17 00:00:00 2001 From: Ping Yu Date: Fri, 2 Sep 2022 14:32:25 +0800 Subject: [PATCH 184/676] causal_ts: add benchmark (#13389) ref tikv/tikv#12794, ref tikv/tikv#12970 1. Add benchmark for `causal_ts.BatchTsoProvider`. 2. Change implementation of `TestPdClient.batch_get_tso` to meet interface convention of real PD. 3. Remove "TODO" of making batch removal async for `causal_ts.TsoBatchList`. 
Signed-off-by: pingyu Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/causal_ts/Cargo.toml | 6 ++ components/causal_ts/benches/tso.rs | 123 ++++++++++++++++++++++++++ components/causal_ts/src/tso.rs | 13 +-- components/test_raftstore/src/pd.rs | 36 +++++++- components/txn_types/src/lib.rs | 2 +- components/txn_types/src/timestamp.rs | 2 +- 7 files changed, 173 insertions(+), 10 deletions(-) create mode 100644 components/causal_ts/benches/tso.rs diff --git a/Cargo.lock b/Cargo.lock index 9463bbd717b..bf5a40762e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -722,6 +722,7 @@ name = "causal_ts" version = "0.0.1" dependencies = [ "api_version", + "criterion", "engine_rocks", "engine_traits", "error_code", diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index b1ad4ed449a..7505a043a69 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -34,4 +34,10 @@ tokio = { version = "1", features = ["sync"] } txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] +criterion = "0.3" test_raftstore = { path = "../test_raftstore" } + +[[bench]] +name = "tso" +path = "benches/tso.rs" +harness = false diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs new file mode 100644 index 00000000000..86d7ed9b9ea --- /dev/null +++ b/components/causal_ts/benches/tso.rs @@ -0,0 +1,123 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use causal_ts::{BatchTsoProvider, CausalTsProvider, TsoBatchList}; +use criterion::*; +use futures::executor::block_on; +use test_raftstore::TestPdClient; +use txn_types::TimeStamp; + +fn bench_batch_tso_list_pop(c: &mut Criterion) { + const CAPACITY: u64 = 10_000; + let cases = vec![("100", 100), ("10k", 10_000)]; // (id, batch_size) + + let bench_func = |b: &mut Bencher<'_>, batch_size: u64| { + let batch_list = TsoBatchList::new(CAPACITY as u32); + b.iter_batched( + || { + batch_list.flush(); + for i in 0..CAPACITY { + batch_list + .push( + batch_size as u32, + TimeStamp::compose(i as u64, batch_size), + false, + ) + .unwrap(); + } + }, + |_| { + black_box(batch_list.pop(None).unwrap()); + }, + BatchSize::NumIterations(CAPACITY * batch_size), + ) + }; + + let mut group = c.benchmark_group("batch_tso_list_pop"); + for (id, batch_size) in cases { + group.bench_function(id, |b| { + bench_func(b, batch_size); + }); + } +} + +fn bench_batch_tso_list_push(c: &mut Criterion) { + const BATCH_SIZE: u64 = 8192; + let cases = vec![("50", 50), ("1024", 1024)]; // (id, capacity) + + let bench_func = |b: &mut Bencher<'_>, capacity: u64| { + let batch_list = TsoBatchList::new(capacity as u32); + let mut i = 0; + b.iter(|| { + i += 1; + black_box( + batch_list + .push( + BATCH_SIZE as u32, + TimeStamp::compose(i as u64, BATCH_SIZE), + false, + ) + .unwrap(), + ); + }) + }; + + let mut group = c.benchmark_group("batch_tso_list_push"); + for (id, capacity) in cases { + group.bench_function(id, |b| { + bench_func(b, capacity); + }); + } +} + +fn bench_batch_tso_provider_get_ts(c: &mut Criterion) { + let pd_cli = Arc::new(TestPdClient::new(1, false)); + + // Disable background renew by setting `renew_interval` to 0 to make test result + // stable. 
+ let provider = block_on(BatchTsoProvider::new_opt( + pd_cli, + Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 + 100, + 80000, + )) + .unwrap(); + + c.bench_function("bench_batch_tso_provider_get_ts", |b| { + b.iter(|| { + black_box(provider.get_ts().unwrap()); + }) + }); +} + +fn bench_batch_tso_provider_flush(c: &mut Criterion) { + let pd_cli = Arc::new(TestPdClient::new(1, false)); + + // Disable background renew by setting `renew_interval` to 0 to make test result + // stable. + let provider = block_on(BatchTsoProvider::new_opt( + pd_cli, + Duration::ZERO, + Duration::from_secs(1), // cache_multiplier = 10 + 100, + 80000, + )) + .unwrap(); + + c.bench_function("bench_batch_tso_provider_flush", |b| { + b.iter(|| { + black_box(provider.flush()).unwrap(); + }) + }); +} + +criterion_group!( + benches, + bench_batch_tso_list_pop, + bench_batch_tso_list_push, + bench_batch_tso_provider_get_ts, + bench_batch_tso_provider_flush, +); +criterion_main!(benches); diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 6eabf8bf351..3bb0034af8f 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -140,7 +140,7 @@ impl TsoBatch { /// the scenario of leader transfer). Other regions without the requirement can /// still use older TSO cache. #[derive(Default, Debug)] -struct TsoBatchList { +pub struct TsoBatchList { inner: RwLock, /// Number of remaining (available) TSO. @@ -191,7 +191,6 @@ impl TsoBatchList { usage } - // TODO: make it async fn remove_batch(&self, key: u64) { if let Some(batch) = self.inner.write().remove(&key) { self.tso_remain @@ -218,7 +217,9 @@ impl TsoBatchList { self.tso_usage.fetch_add(1, Ordering::Relaxed); self.tso_remain.fetch_sub(1, Ordering::Relaxed); if is_used_up { - // TODO: make it async + // Note: do NOT try to make it async. + // According to benchmark, `remove_batch` can be done in ~50ns, while async + // implemented by `Worker` costs ~1us. 
self.remove_batch(key); } return Some(ts); @@ -253,8 +254,10 @@ impl TsoBatchList { .fetch_add(batch_size as i32, Ordering::Relaxed); } - // remove items out of capacity limitation. - // TODO: make it async + // Remove items out of capacity limitation. + // Note: do NOT try to make it async. + // According to benchmark, `write().pop_first()` can be done in ~50ns, while + // async implemented by `Worker` costs ~1us. if self.inner.read().len() > self.capacity as usize { if let Some((_, batch)) = self.inner.write().pop_first() { self.tso_remain diff --git a/components/test_raftstore/src/pd.rs b/components/test_raftstore/src/pd.rs index 33241862e07..75ea189c312 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_raftstore/src/pd.rs @@ -46,7 +46,7 @@ use tikv_util::{ Either, HandyRwLock, }; use tokio_timer::timer::Handle; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, TSO_PHYSICAL_SHIFT_BITS}; use super::*; @@ -1698,8 +1698,38 @@ impl PdClient for TestPdClient { )), ))); } - let tso = self.tso.fetch_add(count as u64, Ordering::SeqCst); - Box::pin(ok(TimeStamp::new(tso + count as u64))) + + assert!(count > 0); + assert!(count < (1 << TSO_PHYSICAL_SHIFT_BITS)); + + let mut old_tso = self.tso.load(Ordering::SeqCst); + loop { + let ts: TimeStamp = old_tso.into(); + + // Add to logical part first. + let (mut physical, mut logical) = (ts.physical(), ts.logical() + count as u64); + + // When logical part is overflow, add to physical part. + // Moreover, logical part must not less than `count-1`, as the + // generated batch of TSO is treated as of the same physical time. 
+ // Refer to real PD's implementation: + // https://github.com/tikv/pd/blob/v6.2.0/server/tso/tso.go#L361 + if logical >= (1 << TSO_PHYSICAL_SHIFT_BITS) { + physical += 1; + logical = (count - 1) as u64; + } + + let new_tso = TimeStamp::compose(physical, logical); + match self.tso.compare_exchange_weak( + old_tso, + new_tso.into_inner(), + Ordering::SeqCst, + Ordering::SeqCst, + ) { + Ok(_) => return Box::pin(ok(new_tso)), + Err(x) => old_tso = x, + } + } } fn update_service_safe_point( diff --git a/components/txn_types/src/lib.rs b/components/txn_types/src/lib.rs index be99fcc30c4..2f018c23923 100644 --- a/components/txn_types/src/lib.rs +++ b/components/txn_types/src/lib.rs @@ -16,7 +16,7 @@ use std::io; use error_code::{self, ErrorCode, ErrorCodeExt}; pub use lock::{Lock, LockType, PessimisticLock}; use thiserror::Error; -pub use timestamp::{TimeStamp, TsSet}; +pub use timestamp::{TimeStamp, TsSet, TSO_PHYSICAL_SHIFT_BITS}; pub use types::{ is_short_value, Key, KvPair, Mutation, MutationType, OldValue, OldValues, TxnExtra, TxnExtraScheduler, Value, WriteBatchFlags, SHORT_VALUE_MAX_LEN, diff --git a/components/txn_types/src/timestamp.rs b/components/txn_types/src/timestamp.rs index 946ccfbbdcb..fb0cd900123 100644 --- a/components/txn_types/src/timestamp.rs +++ b/components/txn_types/src/timestamp.rs @@ -12,7 +12,7 @@ use collections::HashSet; #[repr(transparent)] pub struct TimeStamp(u64); -const TSO_PHYSICAL_SHIFT_BITS: u64 = 18; +pub const TSO_PHYSICAL_SHIFT_BITS: u64 = 18; impl TimeStamp { /// Create a time stamp from physical and logical components. From b8315adf8dbd0c594d40d30c726a80de4c55c100 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 2 Sep 2022 17:08:25 +0800 Subject: [PATCH 185/676] service: add the kv_flashback_to_version interface (#13378) ref tikv/tikv#13303 Implement the `kv_flashback_to_version` interface. 
Signed-off-by: JmPotato --- Cargo.lock | 2 +- components/test_raftstore/src/cluster.rs | 7 ++ components/test_raftstore/src/util.rs | 1 - src/server/metrics.rs | 1 + src/server/service/kv.rs | 96 ++++++++++++++- src/storage/txn/commands/mod.rs | 12 ++ src/storage/txn/scheduler.rs | 6 +- tests/integrations/server/kv_service.rs | 149 +++++++++++++++++++++++ 8 files changed, 269 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf5a40762e4..34795afc974 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2628,7 +2628,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#9cc5e1ddfda3aec6eddfc09de1d0072ebbd7bb21" +source = "git+https://github.com/pingcap/kvproto.git#f95ac338b3312e0a9bd7c33c9647a87a74314567" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 79f0b8ef709..9b5aa1a6646 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1334,6 +1334,13 @@ impl Cluster { } } + pub fn try_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) -> RaftCmdResponse { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + self.call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap() + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 9b653ac2096..882095c5a7d 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -931,7 +931,6 @@ pub fn must_kv_prewrite_with( ); } -// Disk full test interface. 
pub fn try_kv_prewrite_with( client: &TikvClient, ctx: Context, diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 86ca07f38b4..6df6f0e96a8 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -35,6 +35,7 @@ make_auto_flush_static_metric! { kv_resolve_lock, kv_gc, kv_delete_range, + kv_flashback_to_version, raw_get, raw_batch_get, raw_batch_get_command, diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 79fbd9c6624..ab2fc41c47c 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -6,6 +6,7 @@ use std::{mem, sync::Arc}; use api_version::KvFormat; use fail::fail_point; use futures::{ + channel::oneshot, compat::Future01CompatExt, future::{self, Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -31,7 +32,7 @@ use raftstore::{ store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, metrics::RAFT_ENTRIES_CACHES_GAUGE, - Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, + Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, SignificantMsg, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -400,6 +401,37 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ); } + fn kv_flashback_to_version( + &mut self, + ctx: RpcContext<'_>, + mut req: FlashbackToVersionRequest, + sink: UnarySink, + ) { + let begin_instant = Instant::now(); + + let source = req.mut_context().take_request_source(); + let resp = future_flashback_to_version(&self.storage, &self.ch, req); + let task = async move { + let resp = resp.await?; + let elapsed = begin_instant.saturating_elapsed(); + sink.success(resp).await?; + GRPC_MSG_HISTOGRAM_STATIC + .kv_flashback_to_version + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); + ServerResult::Ok(()) + } + .map_err(|e| { + log_net_error!(e, "kv rpc failed"; + "request" => stringify!($fn_name) + ); + GRPC_MSG_FAIL_COUNTER.kv_flashback_to_version.inc(); + }) + .map(|_| ()); + + ctx.spawn(task); 
+ } + fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let source = req.mut_context().take_request_source(); @@ -1026,6 +1058,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let copr_v2 = self.copr_v2.clone(); let pool_size = storage.get_normal_pool_size(); let batch_builder = BatcherBuilder::new(self.enable_req_batch, pool_size); + let ch = self.ch.clone(); let request_handler = stream.try_for_each(move |mut req| { let request_ids = req.take_request_ids(); let requests: Vec<_> = req.take_requests().into(); @@ -1042,6 +1075,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor id, req, &tx, + &ch, ); if let Some(batch) = batcher.as_mut() { batch.maybe_commit(&storage, &tx); @@ -1242,7 +1276,12 @@ fn response_batch_commands_request( poll_future_notify(task); } -fn handle_batch_commands_request( +fn handle_batch_commands_request< + T: RaftStoreRouter + 'static, + E: Engine, + L: LockManager, + F: KvFormat, +>( batcher: &mut Option, storage: &Storage, copr: &Endpoint, @@ -1251,6 +1290,7 @@ fn handle_batch_commands_request( id: u64, req: batch_commands_request::Request, tx: &Sender, + ch: &T, ) { // To simplify code and make the logic more clear. macro_rules! 
oneof { @@ -1353,6 +1393,7 @@ fn handle_batch_commands_request( ResolveLock, future_resolve_lock(storage), kv_resolve_lock; Gc, future_gc(), kv_gc; DeleteRange, future_delete_range(storage), kv_delete_range; + FlashbackToVersion, future_flashback_to_version(storage, ch), kv_flashback_to_version; RawBatchGet, future_raw_batch_get(storage), raw_batch_get; RawPut, future_raw_put(storage), raw_put; RawBatchPut, future_raw_batch_put(storage), raw_batch_put; @@ -1645,6 +1686,57 @@ fn future_delete_range( } } +fn future_flashback_to_version< + T: RaftStoreRouter + 'static, + E: Engine, + L: LockManager, + F: KvFormat, +>( + storage: &Storage, + raft_router: &T, + req: FlashbackToVersionRequest, +) -> impl Future> { + let storage_clone = storage.clone(); + let raft_router_clone = raft_router.clone(); + async move { + // Send a `SignificantMsg::PrepareFlashback` to prepare the raftstore for the + // later flashback. This will first block all scheduling, read and write + // operations and then wait for the latest Raft log to be applied before + // we start the flashback command. + let region_id = req.get_context().get_region_id(); + let (result_tx, result_rx) = oneshot::channel(); + raft_router_clone + .significant_send(region_id, SignificantMsg::PrepareFlashback(result_tx))?; + if !result_rx.await? { + return Err(Error::Other(box_err!( + "failed to prepare the region {} for flashback", + region_id + ))); + } + let (cb, f) = paired_future_callback(); + let res = storage_clone.sched_txn_command(req.into(), cb); + // Avoid crossing `.await` to bypass the `Send` constraint. + drop(storage_clone); + let v = match res { + Err(e) => Err(e), + Ok(_) => f.await?, + }; + fail_point!("skip_finish_flashback_to_version", |_| { + Ok(FlashbackToVersionResponse::default()) + }); + // Send a `SignificantMsg::FinishFlashback` to notify the raftstore that the + // flashback has been finished. 
+ raft_router_clone.significant_send(region_id, SignificantMsg::FinishFlashback)?; + let mut resp = FlashbackToVersionResponse::default(); + if let Some(err) = extract_region_error(&v) { + resp.set_region_error(err); + } else if let Err(e) = v { + resp.set_error(format!("{}", e)); + } + Ok(resp) + } +} + fn future_raw_get( storage: &Storage, mut req: RawGetRequest, diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index a204ab4f30f..c15b27deb66 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -351,6 +351,18 @@ impl From for TypedCommand> { } } +impl From for TypedCommand<()> { + fn from(mut req: FlashbackToVersionRequest) -> Self { + FlashbackToVersionReadPhase::new( + req.get_version().into(), + Some(Key::from_raw(req.get_end_key())), + Some(Key::from_raw(req.get_start_key())), + Some(Key::from_raw(req.get_start_key())), + req.take_context(), + ) + } +} + #[derive(Default)] pub(super) struct ReleasedLocks { start_ts: TimeStamp, diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index a72bd671d0a..a7c38e147ee 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -568,7 +568,11 @@ impl Scheduler { pb_ctx: task.cmd.ctx(), ..Default::default() }; - if let Command::FlashbackToVersionReadPhase { .. } = task.cmd { + if matches!( + task.cmd, + Command::FlashbackToVersionReadPhase { .. } + | Command::FlashbackToVersion { .. } + ) { snap_ctx.for_flashback = true; } // The program is currently in scheduler worker threads. 
diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 262060b4491..d60edf7bc97 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -596,6 +596,155 @@ fn test_mvcc_resolve_lock_gc_and_delete() { assert!(del_resp.error.is_empty()); } +#[test] +fn test_mvcc_flashback() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let mut ts = 0; + let k = b"key".to_vec(); + for i in 0..10 { + let v = format!("value@{}", i).into_bytes(); + // Prewrite + ts += 1; + let prewrite_start_version = ts; + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation], + k.clone(), + prewrite_start_version, + ); + // Commit + ts += 1; + let commit_version = ts; + must_kv_commit( + &client, + ctx.clone(), + vec![k.clone()], + prewrite_start_version, + commit_version, + commit_version, + ); + // Get + ts += 1; + must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), ts) + } + // Prewrite to leave a lock. 
+ ts += 1; + let prewrite_start_version = ts; + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(b"value@latest".to_vec()); + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation], + k.clone(), + prewrite_start_version, + ); + ts += 1; + let get_version = ts; + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k.clone(); + get_req.version = get_version; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(get_resp.get_error().has_locked()); + assert!(get_resp.value.is_empty()); + // Flashback + let mut flashback_to_version_req = FlashbackToVersionRequest::default(); + flashback_to_version_req.set_context(ctx.clone()); + flashback_to_version_req.version = 5; + flashback_to_version_req.start_key = b"a".to_vec(); + flashback_to_version_req.end_key = b"z".to_vec(); + let flashback_resp = client + .kv_flashback_to_version(&flashback_to_version_req) + .unwrap(); + assert!(!flashback_resp.has_region_error()); + assert!(flashback_resp.get_error().is_empty()); + // Should not meet the lock and can not get the latest data any more. + must_kv_read_equal(&client, ctx, k, b"value@1".to_vec(), ts); +} + +#[test] +#[cfg(feature = "failpoints")] +fn test_mvcc_flashback_block_rw() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); + // Flashback + let mut flashback_to_version_req = FlashbackToVersionRequest::default(); + flashback_to_version_req.set_context(ctx.clone()); + flashback_to_version_req.version = 0; + flashback_to_version_req.start_key = b"a".to_vec(); + flashback_to_version_req.end_key = b"z".to_vec(); + let flashback_resp = client + .kv_flashback_to_version(&flashback_to_version_req) + .unwrap(); + assert!(!flashback_resp.has_region_error()); + assert!(flashback_resp.get_error().is_empty()); + // Try to read. 
+ let (k, v) = (b"key".to_vec(), b"value".to_vec()); + // Get + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k.clone(); + get_req.version = 1; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(get_resp.get_region_error().has_recovery_in_progress()); + assert!(!get_resp.has_error()); + assert!(get_resp.value.is_empty()); + // Scan + let mut scan_req = ScanRequest::default(); + scan_req.set_context(ctx.clone()); + scan_req.start_key = k.clone(); + scan_req.limit = 1; + scan_req.version = 1; + let scan_resp = client.kv_scan(&scan_req).unwrap(); + assert!(scan_resp.get_region_error().has_recovery_in_progress()); + assert!(scan_resp.pairs.is_empty()); + // Try to write. + // Prewrite + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v); + let prewrite_resp = try_kv_prewrite(&client, ctx, vec![mutation], k, 1); + assert!(prewrite_resp.get_region_error().has_recovery_in_progress()); + fail::remove("skip_finish_flashback_to_version"); +} + +#[test] +#[cfg(feature = "failpoints")] +fn test_mvcc_flashback_block_scheduling() { + let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); + fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); + // Flashback + let mut flashback_to_version_req = FlashbackToVersionRequest::default(); + flashback_to_version_req.set_context(ctx); + flashback_to_version_req.version = 0; + flashback_to_version_req.start_key = b"a".to_vec(); + flashback_to_version_req.end_key = b"z".to_vec(); + let flashback_resp = client + .kv_flashback_to_version(&flashback_to_version_req) + .unwrap(); + assert!(!flashback_resp.has_region_error()); + assert!(flashback_resp.get_error().is_empty()); + // Try to transfer leader. 
+ let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); + assert!( + transfer_leader_resp + .get_header() + .get_error() + .has_recovery_in_progress() + ); + fail::remove("skip_finish_flashback_to_version"); +} + // raft related RPC is tested as parts of test_snapshot.rs, so skip here. #[test] From d471b933d6eaaa041b32ab0613cf844294567d2e Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Mon, 5 Sep 2022 14:52:55 +0800 Subject: [PATCH 186/676] engine: toggle purge worker with `need_manual_purge` (#13253) ref tikv/tikv#11119 None Signed-off-by: tabokie --- components/engine_panic/src/raft_engine.rs | 6 ++- components/engine_rocks/src/raft_engine.rs | 4 -- components/engine_traits/src/raft_engine.rs | 8 ++- components/raft_log_engine/src/engine.rs | 6 ++- components/raftstore/src/store/fsm/store.rs | 56 ++++++++++++--------- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 2fffb544fe3..bb501007a76 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -114,7 +114,11 @@ impl RaftEngine for PanicEngine { panic!() } - fn purge_expired_files(&self) -> Result> { + fn need_manual_purge(&self) -> bool { + panic!() + } + + fn manual_purge(&self) -> Result> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index f1e86903e9d..605ef4c5514 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -312,10 +312,6 @@ impl RaftEngine for RocksEngine { Ok(total) } - fn purge_expired_files(&self) -> Result> { - Ok(vec![]) - } - fn flush_metrics(&self, instance: &str) { KvEngine::flush_metrics(self, instance) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index a7bd66d3230..e64bbe18018 100644 --- a/components/engine_traits/src/raft_engine.rs 
+++ b/components/engine_traits/src/raft_engine.rs @@ -115,9 +115,15 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send Ok(total) } + fn need_manual_purge(&self) -> bool { + false + } + /// Purge expired logs files and return a set of Raft group ids /// which needs to be compacted ASAP. - fn purge_expired_files(&self) -> Result>; + fn manual_purge(&self) -> Result> { + unimplemented!() + } fn flush_metrics(&self, _instance: &str) {} fn flush_stats(&self) -> Option { diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 8991a6f6838..2cd27d89538 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -587,7 +587,11 @@ impl RaftEngine for RaftLogEngine { Ok(total as usize) } - fn purge_expired_files(&self) -> Result> { + fn need_manual_purge(&self) -> bool { + true + } + + fn manual_purge(&self) -> Result> { self.0.purge_expired_files().map_err(transfer_error) } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index d6faf92ca85..5743b0ec3a5 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1385,9 +1385,9 @@ struct Workers { // blocking operation, which can take an extensive amount of time. cleanup_worker: Worker, region_worker: Worker, - // Used for calling `purge_expired_files`, which can be time-consuming for certain - // engine implementations. - purge_worker: Worker, + // Used for calling `manual_purge` if the specific engine implementation requires it + // (`need_manual_purge`). 
+ purge_worker: Option, raftlog_fetch_worker: Worker, @@ -1452,12 +1452,36 @@ impl RaftBatchSystem { .registry .register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); + let purge_worker = if engines.raft.need_manual_purge() { + let worker = Worker::new("purge-worker"); + let raft_clone = engines.raft.clone(); + let router_clone = self.router(); + worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + match raft_clone.manual_purge() { + Ok(regions) => { + for region_id in regions { + let _ = router_clone.send( + region_id, + PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), + ); + } + } + Err(e) => { + warn!("purge expired files"; "err" => %e); + } + }; + }); + Some(worker) + } else { + None + }; + let workers = Workers { pd_worker, background_worker, cleanup_worker: Worker::new("cleanup-worker"), region_worker: Worker::new("region-worker"), - purge_worker: Worker::new("purge-worker"), + purge_worker, raftlog_fetch_worker: Worker::new("raftlog-fetch-worker"), coprocessor_host: coprocessor_host.clone(), refresh_config_worker: LazyWorker::new("refreash-config-worker"), @@ -1484,26 +1508,6 @@ impl RaftBatchSystem { let raftlog_gc_scheduler = workers .background_worker .start_with_timer("raft-gc-worker", raftlog_gc_runner); - let router_clone = self.router(); - let engines_clone = engines.clone(); - workers.purge_worker.spawn_interval_task( - cfg.value().raft_engine_purge_interval.0, - move || { - match engines_clone.raft.purge_expired_files() { - Ok(regions) => { - for region_id in regions { - let _ = router_clone.send( - region_id, - PeerMsg::CasualMessage(CasualMessage::ForceCompactRaftLogs), - ); - } - } - Err(e) => { - warn!("purge expired files"; "err" => %e); - } - }; - }, - ); let raftlog_fetch_scheduler = workers.raftlog_fetch_worker.start( "raftlog-fetch-worker", @@ -1711,7 +1715,9 @@ impl RaftBatchSystem { workers.cleanup_worker.stop(); workers.region_worker.stop(); workers.background_worker.stop(); - 
workers.purge_worker.stop(); + if let Some(w) = workers.purge_worker { + w.stop(); + } workers.refresh_config_worker.stop(); workers.raftlog_fetch_worker.stop(); } From 14a8a9c4522e5be7a5e571dbaffe0278ec87bbf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 5 Sep 2022 15:04:55 +0800 Subject: [PATCH 187/676] log-backup: fixed pessimistic lock in initial scanning (#13354) close tikv/tikv#13304 Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- components/backup-stream/src/event_loader.rs | 4 +- components/backup-stream/tests/mod.rs | 73 +++++++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 0f83d4726e4..fc34b65eead 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -152,7 +152,9 @@ impl EventLoader { ) })?; debug!("meet lock during initial scanning."; "key" => %utils::redact(&lock_at), "ts" => %lock.ts); - resolver.track_phase_one_lock(lock.ts, lock_at) + if utils::should_track_lock(&lock) { + resolver.track_phase_one_lock(lock.ts, lock_at); + } } TxnEntry::Commit { default, write, .. } => { result.push(ApplyEvent { diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index c5d3442fb84..4a437421dac 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -44,8 +44,12 @@ use txn_types::{Key, TimeStamp, WriteRef}; use walkdir::WalkDir; fn mutation(k: Vec, v: Vec) -> Mutation { + mutation_op(k, v, Op::Put) +} + +fn mutation_op(k: Vec, v: Vec, op: Op) -> Mutation { let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); + mutation.set_op(op); mutation.key = k; mutation.value = v; mutation @@ -419,6 +423,36 @@ impl Suite { // Copy & Paste from cdc::tests::TestSuite, maybe make it a mixin? 
impl Suite { + pub fn tso(&self) -> TimeStamp { + run_async_test(self.cluster.pd_client.get_tso()).unwrap() + } + + pub fn must_kv_pessimistic_lock( + &mut self, + region_id: u64, + keys: Vec>, + ts: TimeStamp, + pk: Vec, + ) { + let mut lock_req = PessimisticLockRequest::new(); + lock_req.set_context(self.get_context(region_id)); + let mut mutations = vec![]; + for key in keys { + mutations.push(mutation_op(key, vec![], Op::PessimisticLock)); + } + lock_req.set_mutations(mutations.into()); + lock_req.primary_lock = pk; + lock_req.start_version = ts.into_inner(); + lock_req.lock_ttl = ts.into_inner() + 1; + let resp = self + .get_tikv_client(region_id) + .kv_pessimistic_lock(&lock_req) + .unwrap(); + + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); + } + pub fn must_kv_prewrite( &mut self, region_id: u64, @@ -1020,4 +1054,41 @@ mod test { regions ); } + + /// This test case tests whether we correctly handle the pessimistic locks. + #[test] + fn pessimistic_lock() { + let mut suite = SuiteBuilder::new_named("pessimistic_lock").nodes(3).build(); + suite.must_kv_pessimistic_lock( + 1, + vec![make_record_key(1, 42)], + suite.tso(), + make_record_key(1, 42), + ); + suite.must_register_task(1, "pessimistic_lock"); + suite.must_kv_pessimistic_lock( + 1, + vec![make_record_key(1, 43)], + suite.tso(), + make_record_key(1, 43), + ); + let expected_tso = suite.tso().into_inner(); + suite.force_flush_files("pessimistic_lock"); + suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + let checkpoint = run_async_test( + suite + .get_meta_cli() + .global_progress_of_task("pessimistic_lock"), + ) + .unwrap(); + // The checkpoint should be advanced: because PiTR is "Read" operation, + // which shouldn't be blocked by pessimistic locks. 
+ assert!( + checkpoint > expected_tso, + "expected = {}; checkpoint = {}", + expected_tso, + checkpoint + ); + } } From 7d36f3490570444c944e4c7cafb7887642b695ea Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 5 Sep 2022 15:20:56 +0800 Subject: [PATCH 188/676] gc_worker: use async_snapshot instead of raw API in GC (#13322) close tikv/tikv#13319 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/backup-stream/Cargo.toml | 1 + components/backup-stream/src/event_loader.rs | 8 +- components/engine_panic/src/snapshot.rs | 1 + .../src/coprocessor/region_info_accessor.rs | 154 +++- components/raftstore/src/store/worker/read.rs | 1 - components/server/src/server.rs | 25 +- components/test_coprocessor/src/store.rs | 2 +- components/test_raftstore/src/server.rs | 3 +- components/test_storage/src/assert_storage.rs | 48 +- components/test_storage/src/sync_storage.rs | 34 +- components/test_storage/src/util.rs | 4 +- components/tikv_kv/src/btree_engine.rs | 8 +- components/tikv_kv/src/lib.rs | 9 +- components/tikv_kv/src/mock_engine.rs | 4 - components/tikv_kv/src/rocksdb_engine.rs | 4 - src/coprocessor/endpoint.rs | 2 +- src/lib.rs | 1 + src/server/gc_worker/compaction_filter.rs | 1 - src/server/gc_worker/gc_manager.rs | 43 +- src/server/gc_worker/gc_worker.rs | 728 ++++++++++++------ .../gc_worker/rawkv_compaction_filter.rs | 1 - src/server/raftkv.rs | 27 +- src/server/server.rs | 10 +- src/storage/mod.rs | 16 +- src/storage/mvcc/reader/reader.rs | 15 + src/storage/txn/store.rs | 6 + tests/benches/hierarchy/storage/mod.rs | 6 +- tests/benches/misc/storage/incremental_get.rs | 2 +- tests/benches/misc/storage/mvcc_reader.rs | 2 +- tests/benches/misc/storage/scan.rs | 2 +- tests/failpoints/cases/test_gc_metrics.rs | 30 +- tests/failpoints/cases/test_gc_worker.rs | 2 + .../integrations/config/dynamic/gc_worker.rs | 12 +- .../integrations/raftstore/test_lease_read.rs | 46 +- 
tests/integrations/server/gc_worker.rs | 34 +- tests/integrations/server/kv_service.rs | 3 +- .../integrations/storage/test_raft_storage.rs | 6 +- tests/integrations/storage/test_raftkv.rs | 2 +- .../storage/test_region_info_accessor.rs | 36 + tests/integrations/storage/test_storage.rs | 24 +- 41 files changed, 960 insertions(+), 404 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 34795afc974..0aa7586a608 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -485,6 +485,7 @@ dependencies = [ "tidb_query_datatype", "tikv", "tikv_alloc", + "tikv_kv", "tikv_util", "tokio", "tokio-stream", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index e5bb889420d..6090d929291 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -59,6 +59,7 @@ thiserror = "1" tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } tikv = { path = "../../", default-features = false } tikv_alloc = { path = "../tikv_alloc" } +tikv_kv = { path = "../tikv_kv" } tikv_util = { path = "../tikv_util" } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index fc34b65eead..fc84fab0635 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -489,8 +489,10 @@ where #[cfg(test)] mod tests { + use futures::executor::block_on; use kvproto::metapb::*; - use tikv::storage::{txn::tests::*, Engine, TestEngineBuilder}; + use tikv::storage::{txn::tests::*, TestEngineBuilder}; + use tikv_kv::SnapContext; use txn_types::TimeStamp; use super::EventLoader; @@ -517,7 +519,9 @@ mod tests { r.set_id(42); r.set_start_key(b"".to_vec()); r.set_end_key(b"".to_vec()); - let snap = engine.snapshot_on_kv_engine(b"", b"").unwrap(); + + let snap = + block_on(async { tikv_kv::snapshot(&engine, 
SnapContext::default()).await }).unwrap(); let mut loader = EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index cf651db4956..296d7ce617a 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -17,6 +17,7 @@ impl Peekable for PanicSnapshot { fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { panic!() } + fn get_value_cf_opt( &self, opts: &ReadOptions, diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index 8f9021c8e60..338cf3962c4 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -12,6 +12,7 @@ use std::{ use collections::{HashMap, HashSet}; use engine_traits::KvEngine; +use itertools::Itertools; use kvproto::metapb::Region; use raft::StateRole; use tikv_util::{ @@ -656,6 +657,10 @@ pub trait RegionInfoProvider: Send + Sync { unimplemented!() } + fn find_region_by_key(&self, _key: &[u8]) -> Result { + unimplemented!() + } + fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> Result> { unimplemented!() } @@ -686,6 +691,27 @@ impl RegionInfoProvider for RegionInfoAccessor { .map_err(|e| box_err!("failed to send request to region collector: {:?}", e)) } + fn find_region_by_key(&self, key: &[u8]) -> Result { + let key_in_vec = key.to_vec(); + let (tx, rx) = mpsc::channel(); + self.seek_region( + key, + Box::new(move |iter| { + if let Some(info) = iter.next() && info.region.get_start_key() <= key_in_vec.as_slice() { + if let Err(e) = tx.send(info.region.clone()) { + warn!("failed to send find_region_by_key result: {:?}", e); + } + } + }), + )?; + rx.recv().map_err(|e| { + box_err!( + "failed to receive find_region_by_key result from region collector: {:?}", + e + ) + }) + } + fn 
get_regions_in_range(&self, start_key: &[u8], end_key: &[u8]) -> Result> { let (tx, rx) = mpsc::channel(); let msg = RegionInfoQuery::GetRegionsInRange { @@ -712,28 +738,87 @@ impl RegionInfoProvider for RegionInfoAccessor { } // Use in tests only. -pub struct MockRegionInfoProvider(Mutex>); +// Note: The `StateRole` in RegionInfo here should not be used +pub struct MockRegionInfoProvider(Mutex>); impl MockRegionInfoProvider { pub fn new(regions: Vec) -> Self { - MockRegionInfoProvider(Mutex::new(regions)) + MockRegionInfoProvider(Mutex::new( + regions + .into_iter() + .map(|region| RegionInfo::new(region, StateRole::Leader)) + .collect_vec(), + )) } } impl Clone for MockRegionInfoProvider { fn clone(&self) -> Self { - MockRegionInfoProvider::new(self.0.lock().unwrap().clone()) + MockRegionInfoProvider::new( + self.0 + .lock() + .unwrap() + .iter() + .map(|region_info| region_info.region.clone()) + .collect_vec(), + ) } } impl RegionInfoProvider for MockRegionInfoProvider { - fn get_regions_in_range(&self, _start_key: &[u8], _end_key: &[u8]) -> Result> { - Ok(self.0.lock().unwrap().clone()) + fn get_regions_in_range(&self, start_key: &[u8], end_key: &[u8]) -> Result> { + let mut regions = Vec::new(); + let (tx, rx) = mpsc::channel(); + let end_key = RangeKey::from_end_key(end_key.to_vec()); + + self.seek_region( + start_key, + Box::new(move |iter| { + for region_info in iter { + if RangeKey::from_start_key(region_info.region.get_start_key().to_vec()) + > end_key + { + continue; + } + tx.send(region_info.region.clone()).unwrap(); + } + }), + )?; + + for region in rx { + regions.push(region); + } + Ok(regions) + } + + fn seek_region(&self, from: &[u8], callback: SeekRegionCallback) -> Result<()> { + let region_infos = self.0.lock().unwrap(); + let mut iter = region_infos.iter().filter(|®ion_info| { + RangeKey::from_end_key(region_info.region.get_end_key().to_vec()) + > RangeKey::from_start_key(from.to_vec()) + }); + callback(&mut iter); + Ok(()) + } + + fn 
find_region_by_key(&self, key: &[u8]) -> Result { + let region_infos = self.0.lock().unwrap(); + let key = RangeKey::from_start_key(key.to_vec()); + region_infos + .iter() + .find(|region_info| { + RangeKey::from_start_key(region_info.region.get_start_key().to_vec()) <= key + && key < RangeKey::from_end_key(region_info.region.get_end_key().to_vec()) + }) + .map(|region_info| region_info.region.clone()) + .ok_or(box_err!("Not found region containing {:?}", key)) } } #[cfg(test)] mod tests { + use txn_types::Key; + use super::*; fn new_region_collector() -> RegionCollector { @@ -1290,4 +1375,63 @@ mod tests { ], ); } + + #[test] + fn test_mock_region_info_provider() { + fn init_region(start_key: &[u8], end_key: &[u8], region_id: u64) -> Region { + let start_key = Key::from_encoded(start_key.to_vec()); + let end_key = Key::from_encoded(end_key.to_vec()); + let mut region = Region::default(); + region.set_start_key(start_key.as_encoded().clone()); + region.set_end_key(end_key.as_encoded().clone()); + region.id = region_id; + region + } + + let regions = vec![ + init_region(b"k01", b"k03", 1), + init_region(b"k05", b"k10", 2), + init_region(b"k10", b"k15", 3), + ]; + + let provider = MockRegionInfoProvider::new(regions); + + // Test ranges covering all regions + let regions = provider.get_regions_in_range(b"k01", b"k15").unwrap(); + assert!(regions.len() == 3); + assert!(regions[0].id == 1); + assert!(regions[1].id == 2); + assert!(regions[2].id == 3); + + // Test ranges covering partial regions + let regions = provider.get_regions_in_range(b"k04", b"k10").unwrap(); + assert!(regions.len() == 2); + assert!(regions[0].id == 2); + assert!(regions[1].id == 3); + + // Test seek for all regions + provider + .seek_region( + b"k02", + Box::new(|iter| { + assert!(iter.next().unwrap().region.id == 1); + assert!(iter.next().unwrap().region.id == 2); + assert!(iter.next().unwrap().region.id == 3); + assert!(iter.next().is_none()); + }), + ) + .unwrap(); + + // Test seek for 
partial regions + provider + .seek_region( + b"k04", + Box::new(|iter| { + assert!(iter.next().unwrap().region.id == 2); + assert!(iter.next().unwrap().region.id == 3); + assert!(iter.next().is_none()); + }), + ) + .unwrap(); + } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a3c3878cf68..9c5889f876e 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -782,7 +782,6 @@ where // Replica can serve stale read if and only if its `safe_ts` >= `read_ts` RequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - assert!(read_ts > 0); if let Err(resp) = delegate.check_stale_read_safe(read_ts) { cb.invoke_read(resp); return; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 35a06d1321f..ca95ddaf310 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -591,22 +591,14 @@ where RaftRouter, > { let engines = self.engines.as_ref().unwrap(); - let mut gc_worker = GcWorker::new( + let gc_worker = GcWorker::new( engines.engine.clone(), self.router.clone(), self.flow_info_sender.take().unwrap(), self.config.gc.clone(), self.pd_client.feature_gate().clone(), + Arc::new(self.region_info_accessor.clone()), ); - gc_worker - .start() - .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); - gc_worker - .start_observe_lock_apply( - self.coprocessor_host.as_mut().unwrap(), - self.concurrency_manager.clone(), - ) - .unwrap_or_else(|e| fatal!("gc worker failed to observe lock apply: {}", e)); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( @@ -623,7 +615,7 @@ where self.engines.as_ref().unwrap().engine.kv_engine(), self.flow_info_receiver.take().unwrap(), ))); - let gc_worker = self.init_gc_worker(); + let mut gc_worker = self.init_gc_worker(); let mut ttl_checker = 
Box::new(LazyWorker::new("ttl-checker")); let ttl_scheduler = ttl_checker.scheduler(); @@ -1040,7 +1032,16 @@ where self.region_info_accessor.clone(), node.id(), ); - if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { + gc_worker + .start(node.id()) + .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); + gc_worker + .start_observe_lock_apply( + self.coprocessor_host.as_mut().unwrap(), + self.concurrency_manager.clone(), + ) + .unwrap_or_else(|e| fatal!("gc worker failed to observe lock apply: {}", e)); + if let Err(e) = gc_worker.start_auto_gc(&engines.engines.kv, auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); } diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index a85f75c422e..f19b0a113bd 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -132,7 +132,7 @@ impl Default for Store { impl Store { pub fn from_storage(storage: StorageApiV1) -> Self { Self { - store: SyncTestStorageApiV1::from_storage(storage, GcConfig::default()).unwrap(), + store: SyncTestStorageApiV1::from_storage(0, storage, GcConfig::default()).unwrap(), current_ts: 1.into(), last_committed_ts: TimeStamp::zero(), handles: vec![], diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 683de2e5a7d..2c1798877d9 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -337,8 +337,9 @@ impl ServerCluster { tx, cfg.gc.clone(), Default::default(), + Arc::new(region_info_accessor.clone()), ); - gc_worker.start().unwrap(); + gc_worker.start(node_id).unwrap(); gc_worker .start_observe_lock_apply(&mut coprocessor_host, concurrency_manager.clone()) .unwrap(); diff --git a/components/test_storage/src/assert_storage.rs b/components/test_storage/src/assert_storage.rs index 5cb6e43d8cb..3a641a322a2 100644 --- 
a/components/test_storage/src/assert_storage.rs +++ b/components/test_storage/src/assert_storage.rs @@ -1,7 +1,10 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use api_version::{ApiV1, KvFormat}; -use kvproto::kvrpcpb::{Context, KeyRange, LockInfo}; +use kvproto::{ + kvrpcpb::{Context, KeyRange, LockInfo}, + metapb, +}; use test_raftstore::{Cluster, ServerCluster, SimulateEngine}; use tikv::storage::{ self, @@ -27,7 +30,7 @@ impl Default for AssertionStorage { fn default() -> Self { AssertionStorage { ctx: Context::default(), - store: SyncTestStorageBuilder::default().build().unwrap(), + store: SyncTestStorageBuilder::default().build(0).unwrap(), } } } @@ -36,7 +39,7 @@ impl AssertionStorage { pub fn new() -> Self { AssertionStorage { ctx: Context::default(), - store: SyncTestStorageBuilder::new().build().unwrap(), + store: SyncTestStorageBuilder::new().build(0).unwrap(), } } } @@ -51,19 +54,27 @@ impl AssertionStorage { (cluster, storage) } - pub fn update_with_key_byte(&mut self, cluster: &mut Cluster, key: &[u8]) { + pub fn update_with_key_byte( + &mut self, + cluster: &mut Cluster, + key: &[u8], + ) -> metapb::Region { // ensure the leader of range which contains current key has been elected cluster.must_get(key); let region = cluster.get_region(key); let leader = cluster.leader_of_region(region.get_id()).unwrap(); if leader.get_store_id() == self.ctx.get_peer().get_store_id() { - return; + return region; } + let store_id = leader.store_id; let engine = cluster.sim.rl().storages[&leader.get_id()].clone(); self.ctx.set_region_id(region.get_id()); self.ctx.set_region_epoch(region.get_region_epoch().clone()); self.ctx.set_peer(leader); - self.store = SyncTestStorageBuilder::from_engine(engine).build().unwrap(); + self.store = SyncTestStorageBuilder::from_engine(engine) + .build(store_id) + .unwrap(); + region } pub fn delete_ok_for_cluster( @@ -173,7 +184,7 @@ impl AssertionStorage { break; } 
self.expect_not_leader_or_stale_command(res.unwrap_err()); - self.update_with_key_byte(cluster, key) + self.update_with_key_byte(cluster, key); } assert!(success); @@ -188,7 +199,7 @@ impl AssertionStorage { break; } self.expect_not_leader_or_stale_command(res.unwrap_err()); - self.update_with_key_byte(cluster, key) + self.update_with_key_byte(cluster, key); } assert!(success); } @@ -197,16 +208,17 @@ impl AssertionStorage { &mut self, cluster: &mut Cluster, region_key: &[u8], + mut region: metapb::Region, safe_point: impl Into, ) { let safe_point = safe_point.into(); for _ in 0..3 { - let ret = self.store.gc(self.ctx.clone(), safe_point); + let ret = self.store.gc(region, self.ctx.clone(), safe_point); if ret.is_ok() { return; } self.expect_not_leader_or_stale_command(ret.unwrap_err()); - self.update_with_key_byte(cluster, region_key); + region = self.update_with_key_byte(cluster, region_key); } panic!("failed with 3 retry!"); } @@ -224,7 +236,9 @@ impl AssertionStorage { self.delete_ok_for_cluster(cluster, &key, 1000, 1050); self.get_none_from_cluster(cluster, &key, 2000); - self.gc_ok_for_cluster(cluster, &key, 2000); + + let region = cluster.get_region(&key); + self.gc_ok_for_cluster(cluster, &key, region, 2000); self.get_none_from_cluster(cluster, &key, 3000); } } @@ -793,8 +807,10 @@ impl AssertionStorage { self.expect_invalid_tso_err(resp, start_ts, commit_ts.unwrap()) } - pub fn gc_ok(&self, safe_point: impl Into) { - self.store.gc(self.ctx.clone(), safe_point.into()).unwrap(); + pub fn gc_ok(&self, region: metapb::Region, safe_point: impl Into) { + self.store + .gc(region, self.ctx.clone(), safe_point.into()) + .unwrap(); } pub fn delete_range_ok(&self, start_key: &[u8], end_key: &[u8]) { @@ -1069,11 +1085,11 @@ impl AssertionStorage { .unwrap_err(); } - pub fn test_txn_store_gc(&self, key: &str) { + pub fn test_txn_store_gc(&self, key: &str, region: metapb::Region) { let key_bytes = key.as_bytes(); self.put_ok(key_bytes, b"v1", 5, 10); 
self.put_ok(key_bytes, b"v2", 15, 20); - self.gc_ok(30); + self.gc_ok(region, 30); self.get_none(key_bytes, 15); self.get_ok(key_bytes, 25, b"v2"); } @@ -1086,7 +1102,7 @@ impl AssertionStorage { } self.delete_ok(&key, 1000, 1050); self.get_none(&key, 2000); - self.gc_ok(2000); + self.gc_ok(metapb::Region::default(), 2000); self.get_none(&key, 3000); } } diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index b32dbe08fd5..c0b47bab2cf 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -8,8 +8,14 @@ use std::{ use api_version::{ApiV1, KvFormat}; use collections::HashMap; use futures::executor::block_on; -use kvproto::kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}; -use raftstore::{coprocessor::RegionInfoProvider, router::RaftStoreBlackHole}; +use kvproto::{ + kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}, + metapb, +}; +use raftstore::{ + coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}, + router::RaftStoreBlackHole, +}; use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ @@ -78,7 +84,7 @@ impl SyncTestStorageBuilder { self } - pub fn build(mut self) -> Result> { + pub fn build(mut self, store_id: u64) -> Result> { let mut builder = TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr( self.engine.clone(), DummyLockManager, @@ -87,7 +93,11 @@ impl SyncTestStorageBuilder { builder = builder.config(config); } builder = builder.set_api_version(F::TAG); - SyncTestStorage::from_storage(builder.build()?, self.gc_config.unwrap_or_default()) + SyncTestStorage::from_storage( + store_id, + builder.build()?, + self.gc_config.unwrap_or_default(), + ) } } @@ -106,6 +116,7 @@ pub type SyncTestStorageApiV1 = SyncTestStorage; impl SyncTestStorage { pub fn from_storage( + store_id: u64, storage: Storage, config: 
GcConfig, ) -> Result { @@ -116,8 +127,9 @@ impl SyncTestStorage { tx, config, Default::default(), + Arc::new(MockRegionInfoProvider::new(Vec::new())), ); - gc_worker.start()?; + gc_worker.start(store_id)?; Ok(Self { gc_worker, store: storage, @@ -126,10 +138,11 @@ impl SyncTestStorage { pub fn start_auto_gc( &mut self, + kv_engine: &E::Local, cfg: AutoGcConfig, ) { self.gc_worker - .start_auto_gc(cfg, Arc::new(AtomicU64::new(0))) + .start_auto_gc(kv_engine, cfg, Arc::new(AtomicU64::new(0))) .unwrap(); } @@ -334,8 +347,13 @@ impl SyncTestStorage { .unwrap() } - pub fn gc(&self, _: Context, safe_point: impl Into) -> Result<()> { - wait_op!(|cb| self.gc_worker.gc(safe_point.into(), cb)).unwrap() + pub fn gc( + &self, + region: metapb::Region, + _: Context, + safe_point: impl Into, + ) -> Result<()> { + wait_op!(|cb| self.gc_worker.gc(region, safe_point.into(), cb)).unwrap() } pub fn delete_range( diff --git a/components/test_storage/src/util.rs b/components/test_storage/src/util.rs index 62b46ffd082..032fe24c60c 100644 --- a/components/test_storage/src/util.rs +++ b/components/test_storage/src/util.rs @@ -36,7 +36,9 @@ pub fn new_raft_storage_with_store_count( let (cluster, engine, ctx) = new_raft_engine(count, key); ( cluster, - SyncTestStorageBuilder::from_engine(engine).build().unwrap(), + SyncTestStorageBuilder::from_engine(engine) + .build(ctx.peer.as_ref().unwrap().store_id) + .unwrap(), ctx, ) } diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 757c3e2c378..5fa9b3bd7f0 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -81,10 +81,6 @@ impl Engine for BTreeEngine { unimplemented!(); } - fn snapshot_on_kv_engine(&self, _: &[u8], _: &[u8]) -> EngineResult { - unimplemented!(); - } - fn modify_on_kv_engine(&self, _: Vec) -> EngineResult<()> { unimplemented!(); } @@ -237,6 +233,7 @@ impl Snapshot for BTreeEngineSnapshot { fn get(&self, key: &Key) -> EngineResult> { 
self.get_cf(CF_DEFAULT, key) } + fn get_cf(&self, cf: CfName, key: &Key) -> EngineResult> { let tree_cf = self.inner_engine.get_cf(cf); let tree = tree_cf.read().unwrap(); @@ -246,14 +243,17 @@ impl Snapshot for BTreeEngineSnapshot { Some(v) => Ok(Some(v.clone())), } } + fn get_cf_opt(&self, _: ReadOptions, cf: CfName, key: &Key) -> EngineResult> { self.get_cf(cf, key) } + #[inline] fn iter(&self, cf: CfName, iter_opt: IterOptions) -> EngineResult { let tree = self.inner_engine.get_cf(cf); Ok(BTreeEngineIterator::new(tree, iter_opt)) } + fn ext(&self) -> DummySnapshotExt { DummySnapshotExt } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 64a05a98622..b72a2e487b0 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -251,7 +251,9 @@ impl WriteData { pub struct SnapContext<'a> { pub pb_ctx: &'a Context, pub read_id: Option, - pub start_ts: TimeStamp, + // When start_ts is None and `stale_read` is true, it means acquire a snapshot without any + // consistency guarantee. + pub start_ts: Option, // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. pub key_ranges: Vec, @@ -267,8 +269,6 @@ pub trait Engine: Send + Clone + 'static { /// Local storage engine. fn kv_engine(&self) -> Self::Local; - fn snapshot_on_kv_engine(&self, start_key: &[u8], end_key: &[u8]) -> Result; - /// Write modifications into internal local engine directly. fn modify_on_kv_engine(&self, modifies: Vec) -> Result<()>; @@ -365,12 +365,15 @@ pub trait Snapshot: Sync + Send + Clone { /// Get the value associated with `key` in `cf` column family, with Options /// in `opts` fn get_cf_opt(&self, opts: ReadOptions, cf: CfName, key: &Key) -> Result>; + fn iter(&self, cf: CfName, iter_opt: IterOptions) -> Result; + // The minimum key this snapshot can retrieve. 
#[inline] fn lower_bound(&self) -> Option<&[u8]> { None } + // The maximum key can be fetched from the snapshot should less than the upper // bound. #[inline] diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index bec883c1f71..71d424b1b0f 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -152,10 +152,6 @@ impl Engine for MockEngine { self.base.kv_engine() } - fn snapshot_on_kv_engine(&self, start_key: &[u8], end_key: &[u8]) -> Result { - self.base.snapshot_on_kv_engine(start_key, end_key) - } - fn modify_on_kv_engine(&self, modifies: Vec) -> Result<()> { self.base.modify_on_kv_engine(modifies) } diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 031b182b9fe..82ebfe0e1bd 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -210,10 +210,6 @@ impl Engine for RocksEngine { self.engines.kv.clone() } - fn snapshot_on_kv_engine(&self, _: &[u8], _: &[u8]) -> Result { - self.snapshot(Default::default()) - } - fn modify_on_kv_engine(&self, modifies: Vec) -> Result<()> { write_modifies(&self.engines.kv, modifies) } diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 8c2e6d571c0..5f1027e738a 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -355,7 +355,7 @@ impl Endpoint { ) -> impl std::future::Future> { let mut snap_ctx = SnapContext { pb_ctx: &ctx.context, - start_ts: ctx.txn_start_ts, + start_ts: Some(ctx.txn_start_ts), ..Default::default() }; // need to pass start_ts and ranges to check memory locks for replica read diff --git a/src/lib.rs b/src/lib.rs index 5b7bf6e2ac1..a961abc7d38 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(drain_filter)] #![feature(deadline_api)] #![feature(generic_associated_types)] +#![feature(let_else)] #[macro_use(fail_point)] extern crate fail; diff --git 
a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 23f007eb8be..7a5d62ee79d 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -392,7 +392,6 @@ impl WriteCompactionFilter { let task = GcTask::GcKeys { keys: mem::replace(&mut self.mvcc_deletions, empty), safe_point: self.safe_point.into(), - store_id: self.regions_provider.0, region_info_provider: self.regions_provider.1.clone(), }; self.schedule_gc_task(task, false); diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index b2a6a9d02dc..d4c1a8fd830 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -11,6 +11,7 @@ use std::{ }; use engine_traits::KvEngine; +use kvproto::metapb::Region; use pd_client::FeatureGate; use raftstore::{coprocessor::RegionInfoProvider, store::util::find_peer}; use tikv_util::{time::Instant, worker::Scheduler}; @@ -545,23 +546,14 @@ impl GcMan processed_regions: &mut usize, ) -> GcManagerResult> { // Get the information of the next region to do GC. 
- let (range, next_key) = self.get_next_gc_context(from_key); - let (region_id, start, end) = match range { - Some((r, s, e)) => (r, s, e), - None => return Ok(None), - }; + let (region, next_key) = self.get_next_gc_context(from_key); + let Some(region) = region else { return Ok(None) }; + + let hex_start = format!("{:?}", log_wrappers::Value::key(region.get_start_key())); + let hex_end = format!("{:?}", log_wrappers::Value::key(region.get_end_key())); + debug!("trying gc"; "region_id" => region.id, "start_key" => &hex_start, "end_key" => &hex_end); - let hex_start = format!("{:?}", log_wrappers::Value::key(&start)); - let hex_end = format!("{:?}", log_wrappers::Value::key(&end)); - debug!("trying gc"; "start_key" => &hex_start, "end_key" => &hex_end); - - if let Err(e) = sync_gc( - &self.worker_scheduler, - region_id, - start, - end, - self.curr_safe_point(), - ) { + if let Err(e) = sync_gc(&self.worker_scheduler, region, self.curr_safe_point()) { // Ignore the error and continue, since it's useless to retry this. // TODO: Find a better way to handle errors. Maybe we should retry. warn!("failed gc"; "start_key" => &hex_start, "end_key" => &hex_end, "err" => ?e); @@ -580,7 +572,7 @@ impl GcMan /// the first is the next region can be sent to GC worker; /// the second is the next key which can be passed into this method later. 
#[allow(clippy::type_complexity)] - fn get_next_gc_context(&mut self, key: Key) -> (Option<(u64, Vec, Vec)>, Option) { + fn get_next_gc_context(&mut self, key: Key) -> (Option, Option) { let (tx, rx) = mpsc::channel(); let store_id = self.cfg.self_store_id; @@ -612,15 +604,14 @@ impl GcMan }); match seek_region_res { - Ok(Some(mut region)) => { - let r = region.get_id(); - let (s, e) = (region.take_start_key(), region.take_end_key()); - let next_key = if e.is_empty() { + Ok(Some(region)) => { + let end_key = region.get_end_key(); + let next_key = if end_key.is_empty() { None } else { - Some(Key::from_encoded_slice(&e)) + Some(Key::from_encoded_slice(end_key)) }; - (Some((r, s, e)), next_key) + (Some(region), next_key) } Ok(None) => (None, None), Err(e) => { @@ -812,10 +803,8 @@ mod tests { .iter() .map(|task| match task { GcTask::Gc { - region_id, - safe_point, - .. - } => (*region_id, *safe_point), + region, safe_point, .. + } => (region.id, *safe_point), _ => unreachable!(), }) .collect(); diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index eadd1d77fb2..dfa0dec4ddc 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -32,11 +32,12 @@ use raftstore::{ router::RaftStoreRouter, store::{msg::StoreMsg, util::find_peer}, }; -use tikv_kv::{CfStatistics, CursorBuilder, Modify}; +use tikv_kv::{CfStatistics, CursorBuilder, Modify, SnapContext}; use tikv_util::{ config::{Tracker, VersionTrack}, time::{duration_to_sec, Instant, Limiter, SlowTimer}, worker::{Builder as WorkerBuilder, LazyWorker, Runnable, ScheduleError, Scheduler}, + Either, }; use txn_types::{Key, TimeStamp}; @@ -92,22 +93,18 @@ where E: KvEngine, { Gc { - region_id: u64, - start_key: Vec, - end_key: Vec, + region: Region, safe_point: TimeStamp, callback: Callback<()>, }, GcKeys { keys: Vec, safe_point: TimeStamp, - store_id: u64, region_info_provider: Arc, }, RawGcKeys { keys: Vec, safe_point: TimeStamp, - store_id: u64, 
region_info_provider: Arc, }, UnsafeDestroyRange { @@ -115,6 +112,7 @@ where start_key: Key, end_key: Key, callback: Callback<()>, + region_info_provider: Arc, }, PhysicalScanLock { ctx: Context, @@ -122,6 +120,7 @@ where start_key: Key, limit: usize, callback: Callback>, + region_info_provider: Arc, }, /// If GC in compaction filter is enabled, versions on default CF will be /// handled with `DB::delete` in write CF's compaction filter. However if @@ -164,14 +163,10 @@ where fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { GcTask::Gc { - start_key, - end_key, - safe_point, - .. + region, safe_point, .. } => f .debug_struct("Gc") - .field("start_key", &log_wrappers::Value::key(start_key)) - .field("end_key", &log_wrappers::Value::key(end_key)) + .field("region", region) .field("safe_point", safe_point) .finish(), GcTask::GcKeys { .. } => f.debug_struct("GcKeys").finish(), @@ -204,6 +199,7 @@ where E: Engine, RR: RaftStoreRouter, { + store_id: u64, engine: E, raft_store_router: RR, @@ -241,46 +237,66 @@ impl MvccRaw { } } -struct KeysInRegions> { - keys: Peekable>, - regions: Peekable, -} +// Return regions that keys are related to. 
+fn get_regions_for_gc( + store_id: u64, + keys: &[Key], + region_provider: Arc, +) -> Result> { + assert!(!keys.is_empty()); -impl> Iterator for KeysInRegions { - type Item = Key; - fn next(&mut self) -> Option { - loop { - let region = self.regions.peek()?; - let key = self.keys.peek()?.as_encoded().as_slice(); - if key < region.get_start_key() { - self.keys.next(); - } else if region.get_end_key().is_empty() || key < region.get_end_key() { - return self.keys.next(); - } else { - self.regions.next(); - } + if keys.len() >= 2 { + let start = keys.first().unwrap().as_encoded(); + let end = keys.last().unwrap().as_encoded(); + let regions = box_try!(region_provider.get_regions_in_range(start, end)) + .into_iter() + .filter(|r| find_peer(r, store_id).is_some()) + .peekable() + .collect(); + + Ok(regions) + } else { + // We only have one key. + let key = keys[0].as_encoded(); + let region = box_try!(region_provider.find_region_by_key(key)); + if find_peer(®ion, store_id).is_none() { + return Ok(Vec::new()); } + + Ok(vec![region]) } } -fn get_keys_in_regions( - keys: Vec, - regions_provider: Option<(u64, Arc)>, -) -> Result>> { - if keys.len() >= 2 { - if let Some((store_id, region_info_provider)) = regions_provider { - let start = keys.first().unwrap().as_encoded(); - let end = keys.last().unwrap().as_encoded(); - let regions = box_try!(region_info_provider.get_regions_in_range(start, end)) - .into_iter() - .filter(move |r| find_peer(r, store_id).is_some()) - .peekable(); +fn get_keys_in_region(keys: &mut Peekable>, region: &Region) -> Vec { + let mut keys_in_region = Vec::new(); + + loop { + let Some(key) = keys.peek() else {break}; + let key = key.as_encoded().as_slice(); - let keys = keys.into_iter().peekable(); - return Ok(Box::new(KeysInRegions { keys, regions })); + if key < region.get_start_key() { + keys.next(); + } else if region.get_end_key().is_empty() || key < region.get_end_key() { + keys_in_region.push(keys.next().unwrap()); + } else { + break; } } - 
Ok(Box::new(keys.into_iter())) + + keys_in_region +} + +fn init_snap_ctx(store_id: u64, region: &Region) -> Context { + let mut ctx = Context::default(); + ctx.region_id = region.id; + ctx.region_epoch = region.region_epoch.clone(); + ctx.stale_read = true; + + if let Some(peer) = region.peers.iter().find(|peer| peer.store_id == store_id) { + ctx.set_peer(peer.clone()); + } + + ctx } impl GcRunner @@ -289,6 +305,7 @@ where RR: RaftStoreRouter, { pub fn new( + store_id: u64, engine: E, raft_store_router: RR, flow_info_sender: Sender, @@ -301,6 +318,7 @@ where f64::INFINITY }); Self { + store_id, engine, raft_store_router, flow_info_sender, @@ -359,19 +377,19 @@ where Ok(()) } - fn gc(&mut self, start_key: &[u8], end_key: &[u8], safe_point: TimeStamp) -> Result<()> { - if !self.need_gc(start_key, end_key, safe_point) { + fn gc(&mut self, region: Region, safe_point: TimeStamp) -> Result<()> { + if !self.need_gc(region.get_start_key(), region.get_end_key(), safe_point) { GC_SKIPPED_COUNTER.inc(); return Ok(()); } let mut reader = MvccReader::new( - self.engine.snapshot_on_kv_engine(start_key, end_key)?, + self.get_snapshot(self.store_id, ®ion)?, Some(ScanMode::Forward), false, ); - let mut next_key = Some(Key::from_encoded_slice(start_key)); + let mut next_key = Some(Key::from_encoded_slice(region.get_start_key())); while next_key.is_some() { // Scans at most `GcConfig.batch_keys` keys. 
let (keys, updated_next_key) = reader @@ -383,14 +401,14 @@ where GC_EMPTY_RANGE_COUNTER.inc(); break; } - self.gc_keys(keys, safe_point, None)?; + self.gc_keys(keys, safe_point, Either::Left(region.clone()))?; } self.mut_stats(GcKeyMode::txn).add(&reader.statistics); debug!( "gc has finished"; - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key), + "start_key" => log_wrappers::Value::key(region.get_start_key()), + "end_key" => log_wrappers::Value::key(region.get_end_key()), "safe_point" => safe_point ); Ok(()) @@ -400,10 +418,11 @@ where &mut self, keys: Vec, safe_point: TimeStamp, - regions_provider: Option<(u64, Arc)>, + region_or_provider: Either>, ) -> Result<(usize, usize)> { + let store_id = self.store_id; let count = keys.len(); - let range_start_key = keys.first().unwrap().clone().into_encoded(); + let range_start_key = keys.first().unwrap().clone(); let range_end_key = { let mut k = keys .last() @@ -411,77 +430,110 @@ where .to_raw() .map_err(|e| EngineError::Codec(e))?; k.push(0); - Key::from_raw(&k).into_encoded() + Key::from_raw(&k) }; - let snapshot = self - .engine - .snapshot_on_kv_engine(&range_start_key, &range_end_key)?; - let mut keys = get_keys_in_regions(keys, regions_provider)?; - - let mut txn = Self::new_txn(); - let mut reader = if count <= 1 { - MvccReader::new(snapshot, None, false) - } else { - // keys are closing to each other in one batch of gc keys, so do not use - // prefix seek here to avoid too many seeks - MvccReader::new(snapshot, Some(ScanMode::Forward), false) + let (mut handled_keys, mut wasted_keys) = (0, 0); + let regions = match region_or_provider { + Either::Left(region) => vec![region], + Either::Right(region_provider) => get_regions_for_gc(store_id, &keys, region_provider)?, }; - let (mut handled_keys, mut wasted_keys) = (0, 0); - let mut gc_info = GcInfo::default(); - let mut next_gc_key = keys.next(); - while let Some(ref key) = next_gc_key { - if let Err(e) = 
self.gc_key(safe_point, key, &mut gc_info, &mut txn, &mut reader) { - GC_KEY_FAILURES.inc(); - error!(?e; "GC meets failure"; "key" => %key,); - // Switch to the next key if meets failure. - gc_info.is_completed = true; - } + // First item is fetched to initialize the reader and kv_engine + if regions.is_empty() { + return Ok((handled_keys, wasted_keys)); + } - if gc_info.is_completed { - if gc_info.found_versions >= GC_LOG_FOUND_VERSION_THRESHOLD { - debug!( - "GC found plenty versions for a key"; - "key" => %key, - "versions" => gc_info.found_versions, - ); - } - if gc_info.deleted_versions as usize >= GC_LOG_DELETED_VERSION_THRESHOLD { - debug!( - "GC deleted plenty versions for a key"; - "key" => %key, - "versions" => gc_info.deleted_versions, - ); + let mut txn = Self::new_txn(); + let mut gc_info = GcInfo::default(); + let mut keys = keys.into_iter().peekable(); + for region in regions { + let mut reader = self.create_reader( + count, + ®ion, + range_start_key.clone(), + range_end_key.clone(), + )?; + + let mut keys_in_region = get_keys_in_region(&mut keys, ®ion).into_iter(); + let mut next_gc_key = keys_in_region.next(); + while let Some(ref key) = next_gc_key { + if let Err(e) = self.gc_key(safe_point, key, &mut gc_info, &mut txn, &mut reader) { + GC_KEY_FAILURES.inc(); + error!(?e; "GC meets failure"; "key" => %key,); + // Switch to the next key if meets failure. 
+ gc_info.is_completed = true; } - if gc_info.found_versions > 0 { - handled_keys += 1; + if gc_info.is_completed { + if gc_info.found_versions >= GC_LOG_FOUND_VERSION_THRESHOLD { + debug!( + "GC found plenty versions for a key"; + "key" => %key, + "versions" => gc_info.found_versions, + ); + } + if gc_info.deleted_versions as usize >= GC_LOG_DELETED_VERSION_THRESHOLD { + debug!( + "GC deleted plenty versions for a key"; + "key" => %key, + "versions" => gc_info.deleted_versions, + ); + } + + if gc_info.found_versions > 0 { + handled_keys += 1; + } else { + wasted_keys += 1; + } + next_gc_key = keys_in_region.next(); + gc_info = GcInfo::default(); } else { - wasted_keys += 1; + Self::flush_txn(txn, &self.limiter, &self.engine)?; + reader = self.create_reader( + count, + ®ion, + range_start_key.clone(), + range_end_key.clone(), + )?; + txn = Self::new_txn(); } - next_gc_key = keys.next(); - gc_info = GcInfo::default(); - } else { - Self::flush_txn(txn, &self.limiter, &self.engine)?; - let snapshot = self - .engine - .snapshot_on_kv_engine(&range_start_key, &range_end_key)?; - txn = Self::new_txn(); - reader = MvccReader::new(snapshot, Some(ScanMode::Forward), false); } } + Self::flush_txn(txn, &self.limiter, &self.engine)?; Ok((handled_keys, wasted_keys)) } + fn create_reader( + &self, + key_count: usize, + region: &Region, + range_start_key: Key, + range_end_key: Key, + ) -> Result> { + let mut reader = { + let snapshot = self.get_snapshot(self.store_id, region)?; + + if key_count <= 1 { + MvccReader::new(snapshot, None, false) + } else { + // keys are closing to each other in one batch of gc keys, so do not use + // prefix seek here to avoid too many seeks + MvccReader::new(snapshot, Some(ScanMode::Forward), false) + } + }; + reader.set_range(Some(range_start_key), Some(range_end_key)); + Ok(reader) + } + fn raw_gc_keys( &mut self, keys: Vec, safe_point: TimeStamp, - regions_provider: Option<(u64, Arc)>, + regions_provider: Arc, ) -> Result<(usize, usize)> { - let 
range_start_key = keys.first().unwrap().clone().into_encoded(); + let range_start_key = keys.first().unwrap().clone(); let range_end_key = { let mut k = keys .last() @@ -489,56 +541,64 @@ where .to_raw() .map_err(|e| EngineError::Codec(e))?; k.push(0); - Key::from_raw(&k).into_encoded() + Key::from_raw(&k) }; - let mut snapshot = self - .engine - .snapshot_on_kv_engine(&range_start_key, &range_end_key)?; - let mut raw_modifies = MvccRaw::new(); - let mut keys = get_keys_in_regions(keys, regions_provider)?; - let (mut handled_keys, mut wasted_keys) = (0, 0); - let mut gc_info = GcInfo::default(); - let mut next_gc_key = keys.next(); - while let Some(ref key) = next_gc_key { - if let Err(e) = self.raw_gc_key( - safe_point, - key, - &mut raw_modifies, - &mut snapshot, - &mut gc_info, - ) { - GC_KEY_FAILURES.inc(); - error!(?e; "Raw GC meets failure"; "key" => %key,); - // Switch to the next key if meets failure. - gc_info.is_completed = true; - } + let regions = get_regions_for_gc(self.store_id, &keys, regions_provider)?; - if gc_info.is_completed { - if gc_info.found_versions >= GC_LOG_FOUND_VERSION_THRESHOLD { - debug!( - "RawKV GC found plenty versions for a key"; - "key" => %key, - "versions" => gc_info.found_versions, - ); - } - if gc_info.found_versions > 0 { - handled_keys += 1; - } else { - wasted_keys += 1; + if regions.is_empty() { + return Ok((handled_keys, wasted_keys)); + } + + let mut gc_info = GcInfo::default(); + let mut keys = keys.into_iter().peekable(); + for region in regions { + let mut snapshot = self.get_snapshot(self.store_id, ®ion)?; + + let mut keys_in_region = get_keys_in_region(&mut keys, ®ion).into_iter(); + let mut next_gc_key = keys_in_region.next(); + while let Some(ref key) = next_gc_key { + if let Err(e) = self.raw_gc_key( + safe_point, + key, + &range_start_key, + &range_end_key, + &mut raw_modifies, + &mut snapshot, + &mut gc_info, + ) { + GC_KEY_FAILURES.inc(); + error!(?e; "Raw GC meets failure"; "key" => %key,); + // Switch to 
the next key if meets failure. + gc_info.is_completed = true; } - gc_info.report_metrics(STAT_RAW_KEYMODE); + if gc_info.is_completed { + if gc_info.found_versions >= GC_LOG_FOUND_VERSION_THRESHOLD { + debug!( + "RawKV GC found plenty versions for a key"; + "key" => %key, + "versions" => gc_info.found_versions, + ); + } + if gc_info.found_versions > 0 { + handled_keys += 1; + } else { + wasted_keys += 1; + } - next_gc_key = keys.next(); - gc_info = GcInfo::default(); - } else { - // Flush writeBatch to engine. - Self::flush_raw_gc(raw_modifies, &self.limiter, &self.engine)?; - // After flush, reset raw_modifies. - raw_modifies = MvccRaw::new(); + gc_info.report_metrics(STAT_RAW_KEYMODE); + + next_gc_key = keys_in_region.next(); + gc_info = GcInfo::default(); + } else { + // Flush writeBatch to engine. + Self::flush_raw_gc(raw_modifies, &self.limiter, &self.engine)?; + // After flush, reset raw_modifies. + raw_modifies = MvccRaw::new(); + } } } @@ -551,12 +611,16 @@ where &mut self, safe_point: TimeStamp, key: &Key, + range_start_key: &Key, + range_end_key: &Key, raw_modifies: &mut MvccRaw, kv_snapshot: &mut ::Snap, gc_info: &mut GcInfo, ) -> Result<()> { let start_key = key.clone().append_ts(safe_point.prev()); - let mut cursor = CursorBuilder::new(kv_snapshot, CF_DEFAULT).build()?; + let mut cursor = CursorBuilder::new(kv_snapshot, CF_DEFAULT) + .range(Some(range_start_key.clone()), Some(range_end_key.clone())) + .build()?; let mut statistics = CfStatistics::default(); cursor.seek(&start_key, &mut statistics)?; @@ -629,7 +693,13 @@ where Ok(()) } - fn unsafe_destroy_range(&self, ctx: &Context, start_key: &Key, end_key: &Key) -> Result<()> { + fn unsafe_destroy_range( + &self, + ctx: &Context, + start_key: &Key, + end_key: &Key, + _regions_provider: Arc, + ) -> Result<()> { info!( "unsafe destroy range started"; "start_key" => %start_key, "end_key" => %end_key @@ -727,15 +797,31 @@ where max_ts: TimeStamp, start_key: &Key, limit: usize, + regions_provider: Arc, ) 
-> Result> { - let snap = self - .engine - .snapshot_on_kv_engine(start_key.as_encoded(), &[]) - .unwrap(); - let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); - let (locks, _) = reader - .scan_locks(Some(start_key), None, |l| l.ts <= max_ts, limit) - .map_err(TxnError::from_mvcc)?; + let regions = box_try!(regions_provider.get_regions_in_range(start_key.as_encoded(), &[])) + .into_iter() + .filter(move |r| find_peer(r, self.store_id).is_some()); + + let mut first_round = true; + let mut locks = Vec::new(); + for region in regions { + let start_key = { + if first_round { + first_round = false; + start_key.clone() + } else { + Key::from_raw(region.get_start_key()) + } + }; + let snap = self.get_snapshot(self.store_id, ®ion)?; + let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); + let (locks_this_region, _) = reader + .scan_locks(Some(&start_key), None, |l| l.ts <= max_ts, limit) + .map_err(TxnError::from_mvcc)?; + + locks.extend(locks_this_region); + } let mut lock_infos = Vec::with_capacity(locks.len()); for (key, lock) in locks { @@ -771,6 +857,18 @@ where self.cfg = incoming.clone(); } } + + fn get_snapshot(&self, store_id: u64, region: &Region) -> Result<::Snap> { + let ctx = init_snap_ctx(store_id, region); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + + Ok(block_on(async { + tikv_kv::snapshot(&self.engine, snap_ctx).await + })?) + } } impl Runnable for GcRunner @@ -803,32 +901,30 @@ where match task { GcTask::Gc { - start_key, - end_key, + region, safe_point, callback, - .. 
} => { - let res = self.gc(&start_key, &end_key, safe_point); + let res = self.gc(region.clone(), safe_point); update_metrics(res.is_err()); callback(res); self.update_statistics_metrics(GcKeyMode::txn); slow_log!( T timer, "GC on range [{}, {}), safe_point {}", - log_wrappers::Value::key(&start_key), - log_wrappers::Value::key(&end_key), + log_wrappers::Value::key(region.get_start_key()), + log_wrappers::Value::key(region.get_end_key()), safe_point ); } GcTask::GcKeys { keys, safe_point, - store_id, region_info_provider, } => { let old_seek_tombstone = self.mut_stats(GcKeyMode::txn).write.seek_tombstone; - match self.gc_keys(keys, safe_point, Some((store_id, region_info_provider))) { + + match self.gc_keys(keys, safe_point, Either::Right(region_info_provider)) { Ok((handled, wasted)) => { GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED .with_label_values(&[STAT_TXN_KEYMODE]) @@ -851,10 +947,9 @@ where GcTask::RawGcKeys { keys, safe_point, - store_id, region_info_provider, } => { - match self.raw_gc_keys(keys, safe_point, Some((store_id, region_info_provider))) { + match self.raw_gc_keys(keys, safe_point, region_info_provider) { Ok((handled, wasted)) => { GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED .with_label_values(&[STAT_RAW_KEYMODE]) @@ -876,8 +971,10 @@ where start_key, end_key, callback, + region_info_provider, } => { - let res = self.unsafe_destroy_range(&ctx, &start_key, &end_key); + let res = + self.unsafe_destroy_range(&ctx, &start_key, &end_key, region_info_provider); update_metrics(res.is_err()); callback(res); slow_log!( @@ -893,8 +990,15 @@ where start_key, limit, callback, + region_info_provider, } => { - let res = self.handle_physical_scan_lock(&ctx, max_ts, &start_key, limit); + let res = self.handle_physical_scan_lock( + &ctx, + max_ts, + &start_key, + limit, + region_info_provider, + ); update_metrics(res.is_err()); callback(res); slow_log!( @@ -952,17 +1056,13 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res /// Schedules a `GcTask` to 
the `GcRunner`. fn schedule_gc( scheduler: &Scheduler>, - region_id: u64, - start_key: Vec, - end_key: Vec, + region: Region, safe_point: TimeStamp, callback: Callback<()>, ) -> Result<()> { scheduler .schedule(GcTask::Gc { - region_id, - start_key, - end_key, + region, safe_point, callback, }) @@ -972,15 +1072,10 @@ fn schedule_gc( /// Does GC synchronously. pub fn sync_gc( scheduler: &Scheduler>, - region_id: u64, - start_key: Vec, - end_key: Vec, + region: Region, safe_point: TimeStamp, ) -> Result<()> { - wait_op!(|callback| schedule_gc( - scheduler, region_id, start_key, end_key, safe_point, callback - )) - .unwrap_or_else(|| { + wait_op!(|callback| schedule_gc(scheduler, region, safe_point, callback)).unwrap_or_else(|| { error!("failed to receive result of gc"); Err(box_err!("gc_worker: failed to receive result of gc")) }) @@ -999,6 +1094,7 @@ where raft_store_router: RR, /// Used to signal unsafe destroy range is executed. flow_info_sender: Option>, + region_info_provider: Arc, config_manager: GcWorkerConfigManager, @@ -1034,6 +1130,7 @@ where applied_lock_collector: self.applied_lock_collector.clone(), gc_manager_handle: self.gc_manager_handle.clone(), feature_gate: self.feature_gate.clone(), + region_info_provider: self.region_info_provider.clone(), } } } @@ -1069,6 +1166,7 @@ where flow_info_sender: Sender, cfg: GcConfig, feature_gate: FeatureGate, + region_info_provider: Arc, ) -> GcWorker { let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); let worker = worker_builder.create().lazy_build("gc-worker"); @@ -1084,11 +1182,13 @@ where applied_lock_collector: None, gc_manager_handle: Arc::new(Mutex::new(None)), feature_gate, + region_info_provider, } } pub fn start_auto_gc( &self, + kv_engine: &E::Local, cfg: AutoGcConfig, safe_point: Arc, // Store safe point here. 
) -> Result<()> { @@ -1098,7 +1198,7 @@ where ); info!("initialize compaction filter to perform GC when necessary"); - self.engine.kv_engine().init_compaction_filter( + kv_engine.init_compaction_filter( cfg.self_store_id, safe_point.clone(), self.config_manager.clone(), @@ -1122,8 +1222,9 @@ where Ok(()) } - pub fn start(&mut self) -> Result<()> { + pub fn start(&mut self, store_id: u64) -> Result<()> { let runner = GcRunner::new( + store_id, self.engine.clone(), self.raft_store_router.clone(), self.flow_info_sender.take().unwrap(), @@ -1163,14 +1264,10 @@ where } /// Only for tests. - pub fn gc(&self, safe_point: TimeStamp, callback: Callback<()>) -> Result<()> { - let start_key = vec![]; - let end_key = vec![]; + pub fn gc(&self, region: Region, safe_point: TimeStamp, callback: Callback<()>) -> Result<()> { self.worker_scheduler .schedule(GcTask::Gc { - region_id: 0, - start_key, - end_key, + region, safe_point, callback, }) @@ -1203,6 +1300,7 @@ where start_key, end_key, callback, + region_info_provider: self.region_info_provider.clone(), }) .or_else(handle_gc_task_schedule_error) } @@ -1228,6 +1326,7 @@ where start_key, limit, callback, + region_info_provider: self.region_info_provider.clone(), }) .or_else(handle_gc_task_schedule_error) } @@ -1271,20 +1370,22 @@ pub mod test_gc_worker { use std::sync::Arc; use engine_rocks::{RocksEngine, RocksSnapshot}; - use engine_traits::KvEngine; use kvproto::{ kvrpcpb::Context, metapb::{Peer, Region}, }; use raftstore::store::RegionSnapshot; - use tikv_kv::{write_modifies, Engine, Modify, SnapContext, WriteData}; + use tikv_kv::write_modifies; use txn_types::{Key, TimeStamp}; use crate::{ server::gc_worker::{GcSafePointProvider, Result as GcWorkerResult}, storage::{ - kv, - kv::{Callback as EngineCallback, Result as EngineResult}, + kv::{ + self, Callback as EngineCallback, Modify, Result as EngineResult, SnapContext, + WriteData, + }, + Engine, }, }; @@ -1306,22 +1407,6 @@ pub mod test_gc_worker { self.0.kv_engine() } - 
fn snapshot_on_kv_engine( - &self, - start_key: &[u8], - end_key: &[u8], - ) -> kv::Result { - let mut region = Region::default(); - region.set_start_key(start_key.to_owned()); - region.set_end_key(end_key.to_owned()); - // Use a fake peer to avoid panic. - region.mut_peers().push(Default::default()); - Ok(RegionSnapshot::from_snapshot( - Arc::new(self.kv_engine().snapshot()), - Arc::new(region), - )) - } - fn modify_on_kv_engine(&self, mut modifies: Vec) -> kv::Result<()> { for modify in &mut modifies { match modify { @@ -1419,8 +1504,12 @@ mod tests { }; use raft::StateRole; use raftstore::{ - coprocessor::{region_info_accessor::RegionInfoAccessor, RegionChangeEvent}, + coprocessor::{ + region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, + RegionChangeEvent, + }, router::RaftStoreBlackHole, + store::util::new_peer, }; use tikv_kv::Snapshot; use tikv_util::{codec::number::NumberEncoder, future::paired_future_callback}; @@ -1444,6 +1533,64 @@ mod tests { }, }; + #[test] + fn test_get_regions_for_gc() { + fn init_region( + start_key: &[u8], + end_key: &[u8], + region_id: u64, + store_id: Option, + ) -> Region { + let start_key = Key::from_encoded(start_key.to_vec()); + let end_key = Key::from_encoded(end_key.to_vec()); + let mut region = Region::default(); + region.set_start_key(start_key.as_encoded().clone()); + region.set_end_key(end_key.as_encoded().clone()); + region.id = region_id; + if let Some(store_id) = store_id { + region.mut_peers().push(Peer::default()); + region.mut_peers()[0].set_store_id(store_id); + } + region + } + + let store_id = 1; + + let r1 = init_region(b"", b"k10", 1, None); + let r2 = init_region(b"k20", b"k30", 2, Some(store_id)); + let r3 = init_region(b"k30", b"", 3, Some(store_id)); + + let ri_provider = Arc::new(MockRegionInfoProvider::new(vec![ + r1, + r2.clone(), + r3.clone(), + ])); + + let keys = vec![Key::from_encoded(b"k05".to_vec())]; + let regions = get_regions_for_gc(store_id, &keys, 
ri_provider.clone()).unwrap(); + // store id not match + assert!(regions.is_empty()); + + let keys = vec![ + Key::from_encoded(b"k05".to_vec()), + Key::from_encoded(b"k10".to_vec()), + Key::from_encoded(b"k25".to_vec()), + ]; + let regions = get_regions_for_gc(store_id, &keys, ri_provider.clone()).unwrap(); + let rs = vec![r2.clone()]; + assert_eq!(regions, rs); + + let keys = vec![ + Key::from_encoded(b"k05".to_vec()), + Key::from_encoded(b"k10".to_vec()), + Key::from_encoded(b"k25".to_vec()), + Key::from_encoded(b"k35".to_vec()), + ]; + let regions = get_regions_for_gc(store_id, &keys, ri_provider).unwrap(); + let rs = vec![r2, r3]; + assert_eq!(regions, rs); + } + /// Assert the data in `storage` is the same as `expected_data`. Keys in /// `expected_data` should be encoded form without ts. fn check_data( @@ -1476,8 +1623,10 @@ mod tests { commit_ts: impl Into, start_key: &[u8], end_key: &[u8], + split_key: &[u8], ) -> Result<()> { // Return Result from this function so we can use the `wait_op` macro here. + let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); let storage = @@ -1486,10 +1635,26 @@ mod tests { .unwrap(); let gate = FeatureGate::default(); gate.set_version("5.0.0").unwrap(); + let (tx, _rx) = mpsc::channel(); - let mut gc_worker = - GcWorker::new(engine, RaftStoreBlackHole, tx, GcConfig::default(), gate); - gc_worker.start().unwrap(); + + let mut region1 = Region::default(); + region1.mut_peers().push(new_peer(store_id, 1)); + region1.set_end_key(split_key.to_vec()); + + let mut region2 = Region::default(); + region2.mut_peers().push(new_peer(store_id, 2)); + region2.set_start_key(split_key.to_vec()); + + let mut gc_worker = GcWorker::new( + engine, + RaftStoreBlackHole, + tx, + GcConfig::default(), + gate, + Arc::new(MockRegionInfoProvider::new(vec![region1, region2])), + ); + gc_worker.start(store_id).unwrap(); // Convert keys to key value pairs, where the value is "value-{key}". 
let data: BTreeMap<_, _> = init_keys .iter() @@ -1567,6 +1732,7 @@ mod tests { 10, b"key2", b"key4", + b"key3", ) .unwrap(); @@ -1576,6 +1742,7 @@ mod tests { 10, b"key3", b"key7", + b"key5", ) .unwrap(); @@ -1591,6 +1758,7 @@ mod tests { 10, b"key1", b"key9", + b"key5", ) .unwrap(); @@ -1606,6 +1774,7 @@ mod tests { 10, b"key2\x00", b"key4", + b"key3", ) .unwrap(); @@ -1620,6 +1789,7 @@ mod tests { 10, b"key1\x00", b"key1\x00\x00", + b"key1", ) .unwrap(); @@ -1634,12 +1804,14 @@ mod tests { 10, b"key1\x00", b"key1\x00", + b"key1", ) .unwrap(); } #[test] fn test_physical_scan_lock() { + let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); let prefixed_engine = PrefixedEngine(engine); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( @@ -1649,14 +1821,17 @@ mod tests { .build() .unwrap(); let (tx, _rx) = mpsc::channel(); + let mut region = Region::default(); + region.mut_peers().push(new_peer(store_id, 0)); let mut gc_worker = GcWorker::new( prefixed_engine, RaftStoreBlackHole, tx, GcConfig::default(), FeatureGate::default(), + Arc::new(MockRegionInfoProvider::new(vec![region])), ); - gc_worker.start().unwrap(); + gc_worker.start(store_id).unwrap(); let physical_scan_lock = |max_ts: u64, start_key, limit| { let (cb, f) = paired_future_callback(); @@ -1718,20 +1893,27 @@ mod tests { #[test] fn test_gc_keys_with_region_info_provider() { + let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); let prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) = mpsc::channel(); let feature_gate = FeatureGate::default(); feature_gate.set_version("5.0.0").unwrap(); + + let sp_provider = MockSafePointProvider(200); + let mut host = CoprocessorHost::::default(); + let ri_provider = RegionInfoAccessor::new(&mut host); + let mut gc_worker = GcWorker::new( prefixed_engine.clone(), RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, + Arc::new(ri_provider.clone()), ); - gc_worker.start().unwrap(); + 
gc_worker.start(store_id).unwrap(); let mut r1 = Region::default(); r1.set_id(1); @@ -1745,7 +1927,7 @@ mod tests { r2.set_start_key(format!("k{:02}", 20).into_bytes()); r2.set_end_key(format!("k{:02}", 30).into_bytes()); r2.mut_peers().push(Peer::default()); - r2.mut_peers()[0].set_store_id(1); + r2.mut_peers()[0].set_store_id(store_id); let mut r3 = Region::default(); r3.set_id(3); @@ -1753,14 +1935,14 @@ mod tests { r3.set_start_key(format!("k{:02}", 30).into_bytes()); r3.set_end_key(b"".to_vec()); r3.mut_peers().push(Peer::default()); - r3.mut_peers()[0].set_store_id(1); + r3.mut_peers()[0].set_store_id(store_id); - let sp_provider = MockSafePointProvider(200); - let mut host = CoprocessorHost::::default(); - let ri_provider = RegionInfoAccessor::new(&mut host); let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, 1); let safe_point = Arc::new(AtomicU64::new(0)); - gc_worker.start_auto_gc(auto_gc_cfg, safe_point).unwrap(); + let kv_engine = engine.get_rocksdb(); + gc_worker + .start_auto_gc(&kv_engine, auto_gc_cfg, safe_point) + .unwrap(); host.on_region_changed(&r1, RegionChangeEvent::Create, StateRole::Leader); host.on_region_changed(&r2, RegionChangeEvent::Create, StateRole::Leader); host.on_region_changed(&r3, RegionChangeEvent::Create, StateRole::Leader); @@ -1811,12 +1993,14 @@ mod tests { #[test] fn test_gc_keys_statistics() { + let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); let prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); let mut runner = GcRunner::new( + store_id, prefixed_engine.clone(), RaftStoreBlackHole, tx, @@ -1832,7 +2016,7 @@ mod tests { r1.set_start_key(b"".to_vec()); r1.set_end_key(b"".to_vec()); r1.mut_peers().push(Peer::default()); - r1.mut_peers()[0].set_store_id(1); + r1.mut_peers()[0].set_store_id(store_id); let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); @@ -1854,7 +2038,11 @@ mod 
tests { assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek, 0); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.next, 0); runner - .gc_keys(keys, TimeStamp::new(200), Some((1, Arc::new(ri_provider)))) + .gc_keys( + keys, + TimeStamp::new(200), + Either::Right(Arc::new(ri_provider)), + ) .unwrap(); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek, 1); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.next, 100 * 2); @@ -1862,6 +2050,7 @@ mod tests { #[test] fn test_raw_gc_keys() { + let store_id = 1; // init engine and gc runner let mut cfg = DbConfig::default(); cfg.defaultcf.disable_auto_compactions = true; @@ -1874,6 +2063,7 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); let mut runner = GcRunner::new( + store_id, prefixed_engine.clone(), RaftStoreBlackHole, tx, @@ -1889,7 +2079,7 @@ mod tests { r1.set_start_key(b"".to_vec()); r1.set_end_key(b"".to_vec()); r1.mut_peers().push(Peer::default()); - r1.mut_peers()[0].set_store_id(1); + r1.mut_peers()[0].set_store_id(store_id); let mut host = CoprocessorHost::::default(); let ri_provider = Arc::new(RegionInfoAccessor::new(&mut host)); @@ -1946,13 +2136,15 @@ mod tests { .collect(); runner - .raw_gc_keys(to_gc_keys, TimeStamp::new(120), Some((1, ri_provider))) + .raw_gc_keys(to_gc_keys, TimeStamp::new(120), ri_provider) .unwrap(); assert_eq!(7, runner.mut_stats(GcKeyMode::raw).data.next); assert_eq!(2, runner.mut_stats(GcKeyMode::raw).data.seek); - let snapshot = prefixed_engine.snapshot_on_kv_engine(&[], &[]).unwrap(); + let snapshot = + block_on(async { tikv_kv::snapshot(&prefixed_engine, SnapContext::default()).await }) + .unwrap(); test_raws .clone() @@ -1972,6 +2164,7 @@ mod tests { let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); let mut runner = GcRunner::new( + 1, prefixed_engine.clone(), RaftStoreBlackHole, tx, @@ -2008,7 +2201,7 @@ mod tests { .gc_keys( vec![Key::from_raw(b"k2\x00")], TimeStamp::new(200), - Some((1, ri_provider.clone())), + 
Either::Right(ri_provider.clone()), ) .unwrap(); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 20); @@ -2023,7 +2216,7 @@ mod tests { .gc_keys( vec![Key::from_raw(b"k2")], TimeStamp::new(200), - Some((1, ri_provider.clone())), + Either::Right(ri_provider.clone()), ) .unwrap(); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); @@ -2038,7 +2231,7 @@ mod tests { .gc_keys( vec![Key::from_raw(b"k1"), Key::from_raw(b"k2")], TimeStamp::new(200), - Some((1, ri_provider.clone())), + Either::Right(ri_provider.clone()), ) .unwrap(); assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); @@ -2069,7 +2262,7 @@ mod tests { .gc_keys( vec![Key::from_raw(b"k2")], safepoint.into(), - Some((1, ri_provider)), + Either::Right(ri_provider), ) .unwrap(); // The first batch will leave tombstones that will be seen while processing the @@ -2085,6 +2278,7 @@ mod tests { #[test] fn delete_range_when_worker_is_full() { + let store_id = 1; let engine = PrefixedEngine(TestEngineBuilder::new().build().unwrap()); must_prewrite_put(&engine, b"key", b"value", b"key", 10); must_commit(&engine, b"key", 10, 20); @@ -2096,12 +2290,16 @@ mod tests { gate.set_version("5.0.0").unwrap(); let (tx, _rx) = mpsc::channel(); + let mut region = Region::default(); + region.mut_peers().push(new_peer(store_id, 1)); + let mut gc_worker = GcWorker::new( engine.clone(), RaftStoreBlackHole, tx, GcConfig::default(), gate, + Arc::new(MockRegionInfoProvider::new(vec![region.clone()])), ); // Before starting gc_worker, fill the scheduler to full. 
@@ -2109,9 +2307,7 @@ mod tests { gc_worker .scheduler() .schedule(GcTask::Gc { - region_id: 0, - start_key: vec![], - end_key: vec![], + region: region.clone(), safe_point: TimeStamp::from(100), callback: Box::new(|_res| {}), }) @@ -2121,6 +2317,7 @@ mod tests { let (tx, rx) = mpsc::channel(); gc_worker .gc( + Region::default(), TimeStamp::from(1), Box::new(move |res| { tx.send(res).unwrap(); @@ -2143,11 +2340,74 @@ mod tests { ) .unwrap(); - gc_worker.start().unwrap(); + gc_worker.start(store_id).unwrap(); // After the worker starts running, the destroy range task should run, // and the key in the range will be deleted. rx.recv_timeout(Duration::from_secs(10)).unwrap().unwrap(); must_get_none(&engine, b"key", 30); } + + #[test] + fn test_keys_in_regions_iteration() { + fn init_region(start_key: &[u8], end_key: &[u8]) -> Region { + let start_key = Key::from_raw(start_key); + let end_key = Key::from_raw(end_key); + let mut region = Region::default(); + region.set_start_key(start_key.as_encoded().clone()); + region.set_end_key(end_key.as_encoded().clone()); + region + } + + fn generate_keys(start: u64, end: u64) -> Vec { + (start..end) + .into_iter() + .map(|i| { + let key = format!("k{:02}", i); + Key::from_raw(key.as_bytes()) + }) + .collect::>() + } + + // One region cover all keys + let keys = generate_keys(1, 4); + let region = init_region(b"k01", b"k04"); + let mut iter = keys.clone().into_iter().peekable(); + let ks = get_keys_in_region(&mut iter, ®ion); + assert!(iter.peek().is_none()); + assert_eq!(ks, keys); + + // More than one regions cover all keys + let keys = generate_keys(1, 9); + let region1 = init_region(b"k01", b"k04"); + let region2 = init_region(b"k04", b"k06"); + let region3 = init_region(b"k06", b"k09"); + let mut iter = keys.into_iter().peekable(); + let ks = get_keys_in_region(&mut iter, ®ion1); + assert_eq!(ks, generate_keys(1, 4)); + let ks = get_keys_in_region(&mut iter, ®ion2); + assert_eq!(ks, generate_keys(4, 6)); + let ks = 
get_keys_in_region(&mut iter, ®ion3); + assert_eq!(ks, generate_keys(6, 9)); + assert!(iter.peek().is_none()); + + // Cover partial keys + let keys = generate_keys(1, 9); + let region1 = init_region(b"k01", b"k04"); + let region2 = init_region(b"k06", b"k09"); + let mut iter = keys.into_iter().peekable(); + let ks = get_keys_in_region(&mut iter, ®ion1); + assert_eq!(ks, generate_keys(1, 4)); + let ks = get_keys_in_region(&mut iter, ®ion2); + assert_eq!(ks, generate_keys(6, 9)); + assert!(iter.peek().is_none()); + + // No key + let keys = generate_keys(1, 9); + let region = init_region(b"k11", b"k20"); + let mut iter = keys.into_iter().peekable(); + let ks = get_keys_in_region(&mut iter, ®ion); + assert!(iter.peek().is_none()); + assert!(ks.is_empty()); + } } diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index e50e33c1b38..652b2cc54ac 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -239,7 +239,6 @@ impl RawCompactionFilter { let task = GcTask::RawGcKeys { keys: mem::replace(&mut self.mvcc_deletions, empty), safe_point: self.safe_point.into(), - store_id: self.regions_provider.0, region_info_provider: self.regions_provider.1.clone(), }; self.schedule_gc_task(task, false); diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index 9443ba26cd4..85aedb4d538 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -18,7 +18,6 @@ use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; use kvproto::{ errorpb, kvrpcpb::{Context, IsolationLevel}, - metapb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request, Response}, }; use raft::{ @@ -37,6 +36,7 @@ use raftstore::{ }, }; use thiserror::Error; +use tikv_kv::write_modifies; use tikv_util::{codec::number::NumberEncoder, time::Instant}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; @@ -44,8 +44,8 @@ use 
super::metrics::*; use crate::storage::{ self, kv, kv::{ - write_modifies, Callback, Engine, Error as KvError, ErrorInner as KvErrorInner, - ExtCallback, Modify, SnapContext, WriteData, + Callback, Engine, Error as KvError, ErrorInner as KvErrorInner, ExtCallback, Modify, + SnapContext, WriteData, }, }; @@ -202,10 +202,10 @@ where ) -> Result<()> { let mut header = self.new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && !ctx.start_ts.is_zero() { + if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { let mut data = [0u8; 8]; (&mut data[..]) - .encode_u64(ctx.start_ts.into_inner()) + .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) .unwrap(); flags |= WriteBatchFlags::STALE_READ.bits(); header.set_flag_data(data.into()); @@ -329,18 +329,6 @@ where self.engine.clone() } - fn snapshot_on_kv_engine(&self, start_key: &[u8], end_key: &[u8]) -> kv::Result { - let mut region = metapb::Region::default(); - region.set_start_key(start_key.to_owned()); - region.set_end_key(end_key.to_owned()); - // Use a fake peer to avoid panic. 
- region.mut_peers().push(Default::default()); - Ok(RegionSnapshot::::from_raw( - self.engine.clone(), - region, - )) - } - fn modify_on_kv_engine(&self, mut modifies: Vec) -> kv::Result<()> { for modify in &mut modifies { match modify { @@ -438,8 +426,9 @@ where let mut req = Request::default(); req.set_cmd_type(CmdType::Snap); - if !ctx.key_ranges.is_empty() && !ctx.start_ts.is_zero() { - req.mut_read_index().set_start_ts(ctx.start_ts.into_inner()); + if !ctx.key_ranges.is_empty() && ctx.start_ts.map_or(false, |ts| !ts.is_zero()) { + req.mut_read_index() + .set_start_ts(ctx.start_ts.as_ref().unwrap().into_inner()); req.mut_read_index() .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); } diff --git a/src/server/server.rs b/src/server/server.rs index f202e30e761..23c52793c5f 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -421,7 +421,10 @@ mod tests { use engine_rocks::RocksSnapshot; use grpcio::EnvBuilder; use kvproto::raft_serverpb::RaftMessage; - use raftstore::store::{transport::Transport, *}; + use raftstore::{ + coprocessor::region_info_accessor::MockRegionInfoProvider, + store::{transport::Transport, *}, + }; use resource_metering::ResourceTagFactory; use security::SecurityConfig; use tikv_util::quota_limiter::QuotaLimiter; @@ -481,6 +484,7 @@ mod tests { // 'https_proxy', and retry. 
#[test] fn test_peer_resolve() { + let mock_store_id = 5; let cfg = Config { addr: "127.0.0.1:0".to_owned(), ..Default::default() @@ -507,8 +511,9 @@ mod tests { tx, Default::default(), Default::default(), + Arc::new(MockRegionInfoProvider::new(Vec::new())), ); - gc_worker.start().unwrap(); + gc_worker.start(mock_store_id).unwrap(); let quick_fail = Arc::new(AtomicBool::new(false)); let cfg = Arc::new(VersionTrack::new(cfg)); @@ -535,7 +540,6 @@ mod tests { .build() .unwrap(), ); - let mock_store_id = 5; let addr = Arc::new(Mutex::new(None)); let (check_leader_scheduler, _) = tikv_util::worker::dummy_scheduler(); let mut server = Server::new( diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0a7801848b9..dd9a1a01c33 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1179,7 +1179,7 @@ impl Storage { let mut snap_ctx = SnapContext { pb_ctx: &ctx, - start_ts, + start_ts: Some(start_ts), ..Default::default() }; let mut key_range = KeyRange::default(); @@ -2715,7 +2715,7 @@ fn prepare_snap_ctx<'a>( let mut snap_ctx = SnapContext { pb_ctx, - start_ts, + start_ts: Some(start_ts), ..Default::default() }; if need_check_locks_in_replica_read(pb_ctx) { @@ -2791,18 +2791,6 @@ impl Engine for TxnTestEngine { self.engine.kv_engine() } - fn snapshot_on_kv_engine( - &self, - start_key: &[u8], - end_key: &[u8], - ) -> tikv_kv::Result { - let snapshot = self.engine.snapshot_on_kv_engine(start_key, end_key)?; - Ok(TxnTestSnapshot { - snapshot, - txn_ext: self.txn_ext.clone(), - }) - } - fn modify_on_kv_engine(&self, modifies: Vec) -> tikv_kv::Result<()> { self.engine.modify_on_kv_engine(modifies) } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index eb83af270a1..3f2771f0b59 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -124,6 +124,9 @@ pub struct MvccReader { lock_cursor: Option>, write_cursor: Option>, + lower_bound: Option, + upper_bound: Option, + /// None means following operations 
are performed on a single user key, /// i.e., different versions of the same key. It can use prefix seek to /// speed up reads from the write-cf. @@ -149,6 +152,8 @@ impl MvccReader { data_cursor: None, lock_cursor: None, write_cursor: None, + lower_bound: None, + upper_bound: None, scan_mode, current_key: None, fill_cache, @@ -164,6 +169,8 @@ impl MvccReader { data_cursor: None, lock_cursor: None, write_cursor: None, + lower_bound: None, + upper_bound: None, scan_mode, current_key: None, fill_cache: !ctx.get_not_fill_cache(), @@ -421,6 +428,7 @@ impl MvccReader { let cursor = CursorBuilder::new(&self.snapshot, CF_DEFAULT) .fill_cache(self.fill_cache) .scan_mode(self.get_scan_mode(true)) + .range(self.lower_bound.clone(), self.upper_bound.clone()) .build()?; self.data_cursor = Some(cursor); } @@ -434,6 +442,7 @@ impl MvccReader { // Only use prefix seek in non-scan mode. .prefix_seek(self.scan_mode.is_none()) .scan_mode(self.get_scan_mode(true)) + .range(self.lower_bound.clone(), self.upper_bound.clone()) .hint_min_ts(hint_min_ts) .build()?; self.write_cursor = Some(cursor); @@ -446,6 +455,7 @@ impl MvccReader { let cursor = CursorBuilder::new(&self.snapshot, CF_LOCK) .fill_cache(self.fill_cache) .scan_mode(self.get_scan_mode(true)) + .range(self.lower_bound.clone(), self.upper_bound.clone()) .build()?; self.lock_cursor = Some(cursor); } @@ -676,6 +686,11 @@ impl MvccReader { None => OldValue::None, }) } + + pub fn set_range(&mut self, lower: Option, upper: Option) { + self.lower_bound = lower; + self.upper_bound = upper; + } } #[cfg(test)] diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 2af968c21be..7300074bfde 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -809,21 +809,27 @@ mod tests { fn get(&self, _: &Key) -> EngineResult> { Ok(None) } + fn get_cf(&self, _: CfName, _: &Key) -> EngineResult> { Ok(None) } + fn get_cf_opt(&self, _: ReadOptions, _: CfName, _: &Key) -> EngineResult> { Ok(None) } + fn iter(&self, _: 
CfName, _: IterOptions) -> EngineResult { Ok(MockRangeSnapshotIter::default()) } + fn lower_bound(&self) -> Option<&[u8]> { Some(self.start.as_slice()) } + fn upper_bound(&self) -> Option<&[u8]> { Some(self.end.as_slice()) } + fn ext(&self) -> DummySnapshotExt { DummySnapshotExt } diff --git a/tests/benches/hierarchy/storage/mod.rs b/tests/benches/hierarchy/storage/mod.rs index 3b906f0fffc..15873e2e424 100644 --- a/tests/benches/hierarchy/storage/mod.rs +++ b/tests/benches/hierarchy/storage/mod.rs @@ -13,7 +13,7 @@ use super::{BenchConfig, EngineFactory, DEFAULT_ITERATIONS}; fn storage_raw_get>(b: &mut Bencher<'_>, config: &BenchConfig) { let engine = config.engine_factory.build(); let store = SyncTestStorageBuilderApiV1::from_engine(engine) - .build() + .build(0) .unwrap(); b.iter_batched( || { @@ -37,7 +37,7 @@ fn storage_raw_get>(b: &mut Bencher<'_>, config: fn storage_prewrite>(b: &mut Bencher<'_>, config: &BenchConfig) { let engine = config.engine_factory.build(); let store = SyncTestStorageBuilderApiV1::from_engine(engine) - .build() + .build(0) .unwrap(); b.iter_batched( || { @@ -68,7 +68,7 @@ fn storage_prewrite>(b: &mut Bencher<'_>, config: fn storage_commit>(b: &mut Bencher<'_>, config: &BenchConfig) { let engine = config.engine_factory.build(); let store = SyncTestStorageBuilderApiV1::from_engine(engine) - .build() + .build(0) .unwrap(); b.iter_batched( || { diff --git a/tests/benches/misc/storage/incremental_get.rs b/tests/benches/misc/storage/incremental_get.rs index eb65f55fd72..a57bd3c90d5 100644 --- a/tests/benches/misc/storage/incremental_get.rs +++ b/tests/benches/misc/storage/incremental_get.rs @@ -11,7 +11,7 @@ use tikv::storage::{Engine, SnapshotStore, Statistics, Store}; use txn_types::{Key, Mutation}; fn table_lookup_gen_data() -> (SnapshotStore>, Vec) { - let store = SyncTestStorageBuilder::default().build().unwrap(); + let store = SyncTestStorageBuilder::default().build(0).unwrap(); let mut mutations = Vec::new(); let mut keys = Vec::new(); 
for i in 0..30000 { diff --git a/tests/benches/misc/storage/mvcc_reader.rs b/tests/benches/misc/storage/mvcc_reader.rs index df0f1d662d3..3e784ef6b73 100644 --- a/tests/benches/misc/storage/mvcc_reader.rs +++ b/tests/benches/misc/storage/mvcc_reader.rs @@ -7,7 +7,7 @@ use tikv::storage::{kv::RocksEngine, mvcc::SnapshotReader, Engine}; use txn_types::{Key, Mutation}; fn prepare_mvcc_data(key: &Key, n: u64) -> SyncTestStorageApiV1 { - let store = SyncTestStorageBuilderApiV1::default().build().unwrap(); + let store = SyncTestStorageBuilderApiV1::default().build(0).unwrap(); for ts in 1..=n { let mutation = Mutation::make_put(key.clone(), b"value".to_vec()); store diff --git a/tests/benches/misc/storage/scan.rs b/tests/benches/misc/storage/scan.rs index f17f61e1195..088ac013545 100644 --- a/tests/benches/misc/storage/scan.rs +++ b/tests/benches/misc/storage/scan.rs @@ -11,7 +11,7 @@ use txn_types::{Key, Mutation}; #[ignore] #[bench] fn bench_tombstone_scan(b: &mut Bencher) { - let store = SyncTestStorageBuilder::default().build().unwrap(); + let store = SyncTestStorageBuilder::default().build(0).unwrap(); let mut ts_generator = 1..; let mut kvs = KvGenerator::new(100, 1000); diff --git a/tests/failpoints/cases/test_gc_metrics.rs b/tests/failpoints/cases/test_gc_metrics.rs index ede14988744..c0f0d990f11 100644 --- a/tests/failpoints/cases/test_gc_metrics.rs +++ b/tests/failpoints/cases/test_gc_metrics.rs @@ -16,7 +16,9 @@ use kvproto::{ use pd_client::FeatureGate; use raft::StateRole; use raftstore::{ - coprocessor::{CoprocessorHost, RegionChangeEvent}, + coprocessor::{ + region_info_accessor::MockRegionInfoProvider, CoprocessorHost, RegionChangeEvent, + }, router::RaftStoreBlackHole, RegionInfoAccessor, }; @@ -128,6 +130,7 @@ fn test_txn_mvcc_filtered() { #[test] fn test_txn_gc_keys_handled() { + let store_id = 1; GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); @@ -143,8 +146,9 @@ fn test_txn_gc_keys_handled() { tx, 
GcConfig::default(), feature_gate, + Arc::new(MockRegionInfoProvider::new(vec![])), ); - gc_worker.start().unwrap(); + gc_worker.start(store_id).unwrap(); let mut r1 = Region::default(); r1.set_id(1); @@ -152,14 +156,18 @@ fn test_txn_gc_keys_handled() { r1.set_start_key(b"".to_vec()); r1.set_end_key(b"".to_vec()); r1.mut_peers().push(Peer::default()); - r1.mut_peers()[0].set_store_id(1); + r1.mut_peers()[0].set_store_id(store_id); let sp_provider = MockSafePointProvider(200); let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, 1); let safe_point = Arc::new(AtomicU64::new(500)); - gc_worker.start_auto_gc(auto_gc_cfg, safe_point).unwrap(); + + let kv_engine = engine.get_rocksdb(); + gc_worker + .start_auto_gc(&kv_engine, auto_gc_cfg, safe_point) + .unwrap(); host.on_region_changed(&r1, RegionChangeEvent::Create, StateRole::Leader); let db = engine.kv_engine().as_inner().clone(); @@ -267,6 +275,7 @@ fn test_raw_mvcc_filtered() { #[test] fn test_raw_gc_keys_handled() { + let store_id = 1; GC_COMPACTION_FILTER_MVCC_DELETION_MET.reset(); GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); @@ -285,8 +294,9 @@ fn test_raw_gc_keys_handled() { tx, GcConfig::default(), feature_gate, + Arc::new(MockRegionInfoProvider::new(vec![])), ); - gc_worker.start().unwrap(); + gc_worker.start(store_id).unwrap(); let mut r1 = Region::default(); r1.set_id(1); @@ -294,14 +304,18 @@ fn test_raw_gc_keys_handled() { r1.set_start_key(b"".to_vec()); r1.set_end_key(b"".to_vec()); r1.mut_peers().push(Peer::default()); - r1.mut_peers()[0].set_store_id(1); + r1.mut_peers()[0].set_store_id(store_id); let sp_provider = MockSafePointProvider(200); let mut host = CoprocessorHost::::default(); let ri_provider = RegionInfoAccessor::new(&mut host); - let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, 1); + let auto_gc_cfg = AutoGcConfig::new(sp_provider, ri_provider, store_id); let 
safe_point = Arc::new(AtomicU64::new(500)); - gc_worker.start_auto_gc(auto_gc_cfg, safe_point).unwrap(); + + let kv_engine = engine.get_rocksdb(); + gc_worker + .start_auto_gc(&kv_engine, auto_gc_cfg, safe_point) + .unwrap(); host.on_region_changed(&r1, RegionChangeEvent::Create, StateRole::Leader); let db = engine.kv_engine().as_inner().clone(); diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 73031b10283..9f80d942cd8 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -438,8 +438,10 @@ fn init_compaction_filter(cluster: &Cluster, store_id: u64) { let sim = cluster.sim.rl(); let gc_worker = sim.get_gc_worker(store_id); + let kv_engine = cluster.get_engine(store_id); gc_worker .start_auto_gc( + &kv_engine, AutoGcConfig::new(MockSafePointProvider, MockRegionInfoProvider, 1), Arc::new(AtomicU64::new(0)), ) diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index 3014ebc3ba2..e8b437f941a 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -1,8 +1,13 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::mpsc::channel, time::Duration}; +use std::{ + sync::{mpsc::channel, Arc}, + time::Duration, +}; -use raftstore::router::RaftStoreBlackHole; +use raftstore::{ + coprocessor::region_info_accessor::MockRegionInfoProvider, router::RaftStoreBlackHole, +}; use tikv::{ config::{ConfigController, Module, TikvConfig}, server::gc_worker::{GcConfig, GcTask, GcWorker}, @@ -34,8 +39,9 @@ fn setup_cfg_controller( tx, cfg.gc.clone(), Default::default(), + Arc::new(MockRegionInfoProvider::new(Vec::new())), ); - gc_worker.start().unwrap(); + gc_worker.start(0).unwrap(); let cfg_controller = ConfigController::new(cfg); cfg_controller.register(Module::Gc, Box::new(gc_worker.get_config_manager())); diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 80b90d78045..855063bae98 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -10,7 +10,7 @@ use std::{ }; use engine_rocks::RocksSnapshot; -use kvproto::metapb; +use kvproto::{kvrpcpb::Op, metapb}; use more_asserts::assert_le; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; @@ -828,3 +828,47 @@ fn test_node_local_read_renew_lease() { thread::sleep(request_wait); } } + +#[test] +fn test_stale_read_with_ts0() { + let mut cluster = new_server_cluster(0, 3); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.cfg.resolved_ts.enable = true; + cluster.run(); + + let leader = new_peer(1, 1); + cluster.must_transfer_leader(1, leader.clone()); + let mut leader_client = PeerClient::new(&cluster, 1, leader); + + let mut follower_client2 = PeerClient::new(&cluster, 1, new_peer(2, 2)); + + // Set the `stale_read` flag + leader_client.ctx.set_stale_read(true); + follower_client2.ctx.set_stale_read(true); + + let commit_ts1 = leader_client.must_kv_write( + &pd_client, + vec![new_mutation(Op::Put, &b"key1"[..], &b"value1"[..])], + 
b"key1".to_vec(), + ); + + let commit_ts2 = leader_client.must_kv_write( + &pd_client, + vec![new_mutation(Op::Put, &b"key1"[..], &b"value2"[..])], + b"key1".to_vec(), + ); + + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), commit_ts2); + assert!( + follower_client2 + .kv_read(b"key1".to_vec(), 0) + .region_error + .into_option() + .unwrap() + .not_leader + .is_some() + ); + assert!(leader_client.kv_read(b"key1".to_vec(), 0).not_found); +} diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index 59dc776dcca..36f9eed9ca8 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -271,7 +271,7 @@ fn test_applied_lock_collector() { // `keys::DATA_PREFIX`. This case ensures it's performed correctly. #[test] fn test_gc_bypass_raft() { - let (cluster, leader, ctx) = must_new_cluster_mul(1); + let (cluster, leader, ctx) = must_new_cluster_mul(2); cluster.pd_client.disable_default_operator(); let env = Arc::new(Environment::new(1)); @@ -300,17 +300,25 @@ fn test_gc_bypass_raft() { assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_some()); } - let gc_sched = cluster.sim.rl().get_gc_worker(1).scheduler(); - sync_gc(&gc_sched, 0, b"k1".to_vec(), b"k2".to_vec(), 200.into()).unwrap(); - - for &start_ts in &[10, 20, 30] { - let commit_ts = start_ts + 5; - let key = Key::from_raw(b"k1").append_ts(start_ts.into()); - let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value(&key).unwrap().is_none()); - - let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); - let key = data_key(key.as_encoded()); - assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_none()); + let node_ids = cluster.get_node_ids(); + for store_id in node_ids { + let gc_sched = cluster.sim.rl().get_gc_worker(store_id).scheduler(); + + let mut region = cluster.get_region(b"a"); + 
region.set_start_key(b"k1".to_vec()); + region.set_end_key(b"k2".to_vec()); + sync_gc(&gc_sched, region, 200.into()).unwrap(); + + let engine = cluster.engines.get(&store_id).unwrap(); + for &start_ts in &[10, 20, 30] { + let commit_ts = start_ts + 5; + let key = Key::from_raw(b"k1").append_ts(start_ts.into()); + let key = data_key(key.as_encoded()); + assert!(engine.kv.get_value(&key).unwrap().is_none()); + + let key = Key::from_raw(b"k1").append_ts(commit_ts.into()); + let key = data_key(key.as_encoded()); + assert!(engine.kv.get_value_cf(CF_WRITE, &key).unwrap().is_none()); + } } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index d60edf7bc97..6b2e52b8fee 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -552,7 +552,8 @@ fn test_mvcc_resolve_lock_gc_and_delete() { ts += 1; let gc_safe_ponit = TimeStamp::from(ts); let gc_scheduler = cluster.sim.rl().get_gc_worker(1).scheduler(); - sync_gc(&gc_scheduler, 0, vec![], vec![], gc_safe_ponit).unwrap(); + let region = cluster.get_region(&k); + sync_gc(&gc_scheduler, region, gc_safe_ponit).unwrap(); // the `k` at the old ts should be none. 
let get_version2 = commit_version + 1; diff --git a/tests/integrations/storage/test_raft_storage.rs b/tests/integrations/storage/test_raft_storage.rs index 98e60386884..58488cb91cd 100644 --- a/tests/integrations/storage/test_raft_storage.rs +++ b/tests/integrations/storage/test_raft_storage.rs @@ -294,7 +294,7 @@ fn test_auto_gc() { config.ratio_threshold = 0.9; let storage = SyncTestStorageBuilderApiV1::from_engine(engine.clone()) .gc_config(config) - .build() + .build(*id) .unwrap(); (*id, storage) @@ -312,7 +312,9 @@ fn test_auto_gc() { *id, ); cfg.post_a_round_of_gc = Some(Box::new(move || tx.send(()).unwrap())); - storage.start_auto_gc(cfg); + + let kv_engine = cluster.get_engine(*id); + storage.start_auto_gc(&kv_engine, cfg); } assert_eq!(storages.len(), count); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 420f9bd7765..20a3e5ebeaf 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -259,7 +259,7 @@ fn test_read_on_replica_check_memory_locks() { range.set_start_key(encoded_key.as_encoded().to_vec()); let follower_snap_ctx = SnapContext { pb_ctx: &follower_ctx, - start_ts: 100.into(), + start_ts: Some(100.into()), key_ranges: vec![range], ..Default::default() }; diff --git a/tests/integrations/storage/test_region_info_accessor.rs b/tests/integrations/storage/test_region_info_accessor.rs index b42a0d4c15a..2df7238e1a9 100644 --- a/tests/integrations/storage/test_region_info_accessor.rs +++ b/tests/integrations/storage/test_region_info_accessor.rs @@ -176,3 +176,39 @@ fn test_region_collection_get_regions_in_range() { p.stop(); } } + +#[test] +fn test_region_collection_find_region_by_key() { + let mut cluster = new_node_cluster(0, 3); + + let (tx, rx) = channel(); + cluster + .sim + .wl() + .post_create_coprocessor_host(Box::new(move |id, host| { + let p = RegionInfoAccessor::new(host); + tx.send((id, p)).unwrap() + })); + + cluster.run(); + let 
region_info_providers: HashMap<_, _> = rx.try_iter().collect(); + assert_eq!(region_info_providers.len(), 3); + let regions = prepare_cluster(&mut cluster); + + for node_id in cluster.get_node_ids() { + let engine = ®ion_info_providers[&node_id]; + + let region = engine.find_region_by_key(b"").unwrap(); + assert_eq!(region, regions[0]); + + let region = engine.find_region_by_key(b"k2").unwrap(); + assert_eq!(region, regions[1]); + + let region = engine.find_region_by_key(b"k99").unwrap(); + assert_eq!(region, *regions.last().unwrap()); + } + + for (_, p) in region_info_providers { + p.stop(); + } +} diff --git a/tests/integrations/storage/test_storage.rs b/tests/integrations/storage/test_storage.rs index 21c9db6fe42..b0c60ae5aab 100644 --- a/tests/integrations/storage/test_storage.rs +++ b/tests/integrations/storage/test_storage.rs @@ -13,8 +13,12 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use engine_traits::{CF_DEFAULT, CF_LOCK}; -use kvproto::kvrpcpb::{ApiVersion, Context, KeyRange, LockInfo}; +use kvproto::{ + kvrpcpb::{ApiVersion, Context, KeyRange, LockInfo}, + metapb, +}; use rand::random; +use test_raftstore::new_peer; use test_storage::*; use tikv::{ coprocessor::checksum_crc64_xor, @@ -680,9 +684,11 @@ fn test_store_resolve_with_illegal_tso() { fn test_txn_store_gc() { let key = "k"; let store = AssertionStorage::default(); - let (_cluster, raft_store) = AssertionStorageApiV1::new_raft_storage_with_store_count(3, key); - store.test_txn_store_gc(key); - raft_store.test_txn_store_gc(key); + let (cluster, raft_store) = AssertionStorageApiV1::new_raft_storage_with_store_count(3, key); + + let region = cluster.get_region(key.as_bytes()); + store.test_txn_store_gc(key, region.clone()); + raft_store.test_txn_store_gc(key, region); } fn test_txn_store_gc_multiple_keys(key_prefix_len: usize, n: usize) { @@ -698,7 +704,11 @@ pub fn test_txn_store_gc_multiple_keys_single_storage(n: usize, prefix: String) store.put_ok(k.as_bytes(), b"v1", 5, 10); 
store.put_ok(k.as_bytes(), b"v2", 15, 20); } - store.gc_ok(30); + + let store_id = 1; + let mut region = metapb::Region::default(); + region.mut_peers().push(new_peer(store_id, 0)); + store.gc_ok(region, 30); for k in &keys { store.get_none(k.as_bytes(), 15); } @@ -714,12 +724,12 @@ pub fn test_txn_store_gc_multiple_keys_cluster_storage(n: usize, prefix: String) } let mut last_region = cluster.get_region(b""); - store.gc_ok_for_cluster(&mut cluster, b"", 30); + store.gc_ok_for_cluster(&mut cluster, b"", last_region.clone(), 30); for k in &keys { // clear data whose commit_ts < 30 let region = cluster.get_region(k.as_bytes()); if last_region != region { - store.gc_ok_for_cluster(&mut cluster, k.as_bytes(), 30); + store.gc_ok_for_cluster(&mut cluster, k.as_bytes(), region.clone(), 30); last_region = region; } } From 5ae75c8faf93b496030aaf7576f3ca7e2df28b19 Mon Sep 17 00:00:00 2001 From: YangKeao Date: Tue, 6 Sep 2022 02:44:55 -0400 Subject: [PATCH 189/676] copr: use manually written json path parser (#13317) close tikv/tikv#13316 Signed-off-by: YangKeao --- .../src/codec/mysql/json/path_expr.rs | 557 ++++++++++++++---- 1 file changed, 455 insertions(+), 102 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs index afb9cafff67..a760f748348 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs @@ -25,21 +25,13 @@ // select json_extract('{"a": "b", "c": [1, "2"]}', '$.*') -> ["b", [1, "2"]] // ``` -use std::ops::Index; - -use regex::Regex; +use std::{iter::Peekable, str::CharIndices}; use super::json_unquote::unquote_string; -use crate::codec::Result; +use crate::codec::{Error, Result}; pub const PATH_EXPR_ASTERISK: &str = "*"; -// [a-zA-Z_][a-zA-Z0-9_]* matches any identifier; -// "[^"\\]*(\\.[^"\\]*)*" matches any string literal which can carry escaped -// quotes. 
-const PATH_EXPR_LEG_RE_STR: &str = - r#"(\.\s*([a-zA-Z_][a-zA-Z0-9_]*|\*|"[^"\\]*(\\.[^"\\]*)*")|(\[\s*([0-9]+|\*)\s*\])|\*\*)"#; - #[derive(Clone, Debug, PartialEq)] pub enum PathLeg { /// `Key` indicates the path leg with '.key'. @@ -73,82 +65,334 @@ impl PathExpression { } } -/// Parses a JSON path expression. Returns a `PathExpression` -/// object which can be used in `JSON_EXTRACT`, `JSON_SET` and so on. -pub fn parse_json_path_expr(path_expr: &str) -> Result { - // Find the position of first '$'. If any no-blank characters in - // path_expr[0: dollarIndex], return an error. - let dollar_index = match path_expr.find('$') { - Some(i) => i, - None => return Err(box_err!("Invalid JSON path: {}", path_expr)), - }; - if path_expr - .index(0..dollar_index) - .char_indices() - .any(|(_, c)| !c.is_ascii_whitespace()) - { - return Err(box_err!("Invalid JSON path: {}", path_expr)); +/// `box_json_path_err` creates an error from the slice position +/// The position is added with 1, to count from 1 as start +macro_rules! box_json_path_err { + ($e:expr) => {{ + box_err!( + "Invalid JSON path expression. The error is around character position {}.", + ($e) + 1 + ) + }}; +} + +struct PathExpressionTokenizer<'a> { + input: &'a str, + + char_iterator: Peekable>, +} + +struct Position { + start: usize, + end: usize, +} + +/// PathExpressionToken represents a section in path expression and its position +enum PathExpressionToken { + Leg((PathLeg, Position)), + /// Represents the beginning "$" in the expression + Start(Position), +} + +impl<'a> Iterator for PathExpressionTokenizer<'a> { + type Item = Result; + + /// Next will try to parse the next path leg and return + /// If it returns None, it means the input is over. + /// If it returns Some(Err(..)), it means the format is error. + /// If it returns Some(Ok(..)), it represents the next token. 
+ fn next(&mut self) -> Option> { + self.trim_white_spaces(); + // Trim all spaces at first + if self.reached_end() { + return None; + }; + + let (start, ch) = *self.char_iterator.peek().unwrap(); + match ch { + '$' => { + self.char_iterator.next(); + Some(Ok(PathExpressionToken::Start(Position { + start, + end: self.current_index(), + }))) + } + '.' => Some(self.next_key()), + '[' => Some(self.next_index()), + '*' => Some(self.next_double_asterisk()), + _ => Some(Err(box_json_path_err!(self.current_index()))), + } } +} - let expr = path_expr.index(dollar_index + 1..).trim_start(); +impl<'a> PathExpressionTokenizer<'a> { + fn new(input: &'a str) -> PathExpressionTokenizer<'a> { + PathExpressionTokenizer { + input, + char_iterator: input.char_indices().peekable(), + } + } - lazy_static::lazy_static! { - static ref RE: Regex = Regex::new(PATH_EXPR_LEG_RE_STR).unwrap(); + /// Returns the current index on the slice + fn current_index(&mut self) -> usize { + match self.char_iterator.peek() { + Some((start, _)) => *start, + None => self.input.len(), + } } - let mut legs = vec![]; - let mut flags = PathExpressionFlag::default(); - let mut last_end = 0; - for m in RE.find_iter(expr) { - let (start, end) = (m.start(), m.end()); - // Check all characters between two legs are blank. 
- if expr - .index(last_end..start) - .char_indices() - .any(|(_, c)| !c.is_ascii_whitespace()) - { - return Err(box_err!("Invalid JSON path: {}", path_expr)); + + /// `trim_while_spaces` removes following spaces + fn trim_white_spaces(&mut self) { + while self + .char_iterator + .next_if(|(_, ch)| ch.is_whitespace()) + .is_some() + {} + } + + /// Returns whether the input has reached the end + fn reached_end(&mut self) -> bool { + return self.char_iterator.peek().is_none(); + } + + fn next_key(&mut self) -> Result { + let (start, _) = self.char_iterator.next().unwrap(); + + self.trim_white_spaces(); + if self.reached_end() { + return Err(box_json_path_err!(self.current_index())); } - last_end = end; - - let next_char = expr.index(start..).chars().next().unwrap(); - if next_char == '[' { - // The leg is an index of a JSON array. - let leg = expr[start + 1..end].trim(); - let index_str = leg[0..leg.len() - 1].trim(); - let index = if index_str == PATH_EXPR_ASTERISK { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK; - PATH_EXPR_ARRAY_INDEX_ASTERISK - } else { - box_try!(index_str.parse::()) - }; - legs.push(PathLeg::Index(index)) - } else if next_char == '.' { - // The leg is a key of a JSON object. - let mut key = expr[start + 1..end].trim().to_owned(); - if key == PATH_EXPR_ASTERISK { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK; - } else if key.starts_with('"') { - // We need to unquote the origin string. 
- key = unquote_string(&key[1..key.len() - 1])?; + + match *self.char_iterator.peek().unwrap() { + (_, '*') => { + self.char_iterator.next().unwrap(); + + Ok(PathExpressionToken::Leg(( + PathLeg::Key(PATH_EXPR_ASTERISK.to_string()), + Position { + start, + end: self.current_index(), + }, + ))) + } + (mut key_start, '"') => { + // Skip this '"' character + key_start += 1; + self.char_iterator.next().unwrap(); + + // Next until the next '"' character + while self.char_iterator.next_if(|(_, ch)| *ch != '"').is_some() {} + + // Now, it's a '"' or the end + if self.char_iterator.peek().is_none() { + return Err(box_json_path_err!(self.current_index())); + } + + // `key_end` is the index of '"' + let key_end = self.current_index(); + self.char_iterator.next().unwrap(); + + let key = unquote_string(unsafe { self.input.get_unchecked(key_start..key_end) })?; + for ch in key.chars() { + // According to JSON standard, a string cannot + // contain any ASCII control characters + if ch.is_control() { + // TODO: add the concrete error location + // after unquote, we lost the map between + // the character and input position. + return Err(box_json_path_err!(key_start)); + } + } + + Ok(PathExpressionToken::Leg(( + PathLeg::Key(key), + Position { + start, + end: self.current_index(), + }, + ))) + } + (key_start, _) => { + // We have to also check the current value + while self + .char_iterator + .next_if(|(_, ch)| { + !(ch.is_whitespace() || *ch == '.' 
|| *ch == '[' || *ch == '*') + }) + .is_some() + {} + + // Now it reaches the end or a whitespace/./[/* + let key_end = self.current_index(); + + // The start character is not available + if key_end == key_start { + return Err(box_json_path_err!(key_start)); + } + + let key = unsafe { self.input.get_unchecked(key_start..key_end) }.to_string(); + + // It's not quoted, we'll have to validate whether it's an available ECMEScript + // identifier + for (i, c) in key.char_indices() { + if i == 0 && c.is_ascii_digit() { + return Err(box_json_path_err!(key_start + i)); + } + if !c.is_ascii_alphanumeric() && c != '_' && c != '$' && c.is_ascii() { + return Err(box_json_path_err!(key_start + i)); + } + } + + Ok(PathExpressionToken::Leg(( + PathLeg::Key(key), + Position { + start, + end: key_end, + }, + ))) } - legs.push(PathLeg::Key(key)) - } else { - // The leg is '**'. - flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK; - legs.push(PathLeg::DoubleAsterisk); } } - // Check `!expr.is_empty()` here because "$" is a valid path to specify the - // current JSON. 
- if (last_end == 0) && (!expr.is_empty()) { - return Err(box_err!("Invalid JSON path: {}", path_expr)); + + fn next_index(&mut self) -> Result { + let (start, _) = self.char_iterator.next().unwrap(); + + self.trim_white_spaces(); + if self.reached_end() { + return Err(box_json_path_err!(self.current_index())); + } + + return match self.char_iterator.next().unwrap() { + (_, '*') => { + // Then it's a glob array index + self.trim_white_spaces(); + if self.reached_end() { + return Err(box_json_path_err!(self.current_index())); + } + + if self.char_iterator.next_if(|(_, ch)| *ch == ']').is_none() { + return Err(box_json_path_err!(self.current_index())); + } + + Ok(PathExpressionToken::Leg(( + PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK), + Position { + start, + end: self.current_index(), + }, + ))) + } + (number_start, '0'..='9') => { + // Then it's a number array index + while self + .char_iterator + .next_if(|(_, ch)| ch.is_ascii_digit()) + .is_some() + {} + let number_end = self.current_index(); + + self.trim_white_spaces(); + // now, it reaches the end of input, or reaches a non-digit character + match self.char_iterator.peek() { + Some((_, ']')) => {} + Some((pos, _)) => { + return Err(box_json_path_err!(pos)); + } + None => { + return Err(box_json_path_err!(self.current_index())); + } + } + self.char_iterator.next().unwrap(); + + let index = self.input[number_start..number_end] + .parse::() + .map_err(|_| -> Error { box_json_path_err!(number_end) })?; + Ok(PathExpressionToken::Leg(( + PathLeg::Index(index), + Position { + start, + end: self.current_index(), + }, + ))) + } + (pos, _) => Err(box_json_path_err!(pos)), + }; } - if !legs.is_empty() { - if let PathLeg::DoubleAsterisk = *legs.last().unwrap() { - // The last leg of a path expression cannot be '**'. 
- return Err(box_err!("Invalid JSON path: {}", path_expr)); + + fn next_double_asterisk(&mut self) -> Result { + let (start, _) = self.char_iterator.next().unwrap(); + + match self.char_iterator.next() { + Some((end, '*')) => { + // Three or more asterisks are not allowed + if let Some((pos, '*')) = self.char_iterator.peek() { + return Err(box_json_path_err!(pos)); + } + + Ok(PathExpressionToken::Leg(( + PathLeg::DoubleAsterisk, + Position { start, end }, + ))) + } + Some((pos, _)) => Err(box_json_path_err!(pos)), + None => Err(box_json_path_err!(self.current_index())), } } +} + +/// Parses a JSON path expression. Returns a `PathExpression` +/// object which can be used in `JSON_EXTRACT`, `JSON_SET` and so on. +pub fn parse_json_path_expr(path_expr: &str) -> Result { + let mut legs = Vec::new(); + let tokenizer = PathExpressionTokenizer::new(path_expr); + let mut flags = PathExpressionFlag::default(); + + let mut started = false; + let mut last_position = Position { start: 0, end: 0 }; + for (index, token) in tokenizer.enumerate() { + let token = token?; + + match token { + PathExpressionToken::Leg((leg, position)) => { + if !started { + return Err(box_json_path_err!(position.start)); + } + + match &leg { + PathLeg::Key(key) => { + if key == PATH_EXPR_ASTERISK { + flags |= PATH_EXPRESSION_CONTAINS_ASTERISK + } + } + PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK) => { + flags |= PATH_EXPRESSION_CONTAINS_ASTERISK + } + PathLeg::DoubleAsterisk => flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + _ => {} + } + + legs.push(leg.clone()); + last_position = position; + } + PathExpressionToken::Start(position) => { + started = true; + + if index != 0 { + return Err(box_json_path_err!(position.start)); + } + } + } + } + + // There is no available token + if !started { + return Err(box_json_path_err!(path_expr.len())); + } + // The last one cannot be the double asterisk + if !legs.is_empty() && legs.last().unwrap() == &PathLeg::DoubleAsterisk { + return 
Err(box_json_path_err!(last_position.end)); + } + Ok(PathExpression { legs, flags }) } @@ -175,7 +419,7 @@ mod tests { let mut test_cases = vec![ ( "$", - true, + None, Some(PathExpression { legs: vec![], flags: PathExpressionFlag::default(), @@ -183,23 +427,58 @@ mod tests { ), ( "$.a", - true, + None, Some(PathExpression { legs: vec![PathLeg::Key(String::from("a"))], flags: PathExpressionFlag::default(), }), ), + ( + "$ .a. $", + None, + Some(PathExpression { + legs: vec![ + PathLeg::Key(String::from("a")), + PathLeg::Key(String::from("$")), + ], + flags: PathExpressionFlag::default(), + }), + ), ( "$.\"hello world\"", - true, + None, Some(PathExpression { legs: vec![PathLeg::Key(String::from("hello world"))], flags: PathExpressionFlag::default(), }), ), ( - "$[0]", - true, + "$. \"你好 世界\" ", + None, + Some(PathExpression { + legs: vec![PathLeg::Key(String::from("你好 世界"))], + flags: PathExpressionFlag::default(), + }), + ), + ( + "$. ❤️ ", + None, + Some(PathExpression { + legs: vec![PathLeg::Key(String::from("❤️"))], + flags: PathExpressionFlag::default(), + }), + ), + ( + "$. 你好 ", + None, + Some(PathExpression { + legs: vec![PathLeg::Key(String::from("你好"))], + flags: PathExpressionFlag::default(), + }), + ), + ( + "$[ 0 ]", + None, Some(PathExpression { legs: vec![PathLeg::Index(0)], flags: PathExpressionFlag::default(), @@ -207,33 +486,107 @@ mod tests { ), ( "$**.a", - true, + None, Some(PathExpression { legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), + ( + " $ ** . a", + None, + Some(PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }), + ), + ( + " $ ** . 
$", + None, + Some(PathExpression { + legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("$"))], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }), + ), // invalid path expressions - (".a", false, None), - ("xx$[1]", false, None), - ("$.a xx .b", false, None), - ("$[a]", false, None), - ("$.\"\\u33\"", false, None), - ("$**", false, None), + ( + " $ ** . 5", + Some("Invalid JSON path expression. The error is around character position 13."), + None, + ), + ( + ".a", + Some("Invalid JSON path expression. The error is around character position 1."), + None, + ), + ( + "xx$[1]", + Some("Invalid JSON path expression. The error is around character position 1."), + None, + ), + ( + "$.a xx .b", + Some("Invalid JSON path expression. The error is around character position 5."), + None, + ), + ( + "$[a]", + Some("Invalid JSON path expression. The error is around character position 3."), + None, + ), + ( + "$.\"\\u33\"", + // TODO: pass the position in the unquote unicode error + Some("Invalid unicode, byte len too short"), + None, + ), + ( + "$**", + Some("Invalid JSON path expression. The error is around character position 3."), + None, + ), + ( + "$.\"a\\t\"", + Some("Invalid JSON path expression. The error is around character position 4."), + None, + ), + ( + "$ .a $", + Some("Invalid JSON path expression. The error is around character position 6."), + None, + ), + ( + "$ [ 2147483648 ]", + Some("Invalid JSON path expression. 
The error is around character position 15."), + None, + ), ]; - for (i, (path_expr, no_error, expected)) in test_cases.drain(..).enumerate() { + for (i, (path_expr, error_message, expected)) in test_cases.drain(..).enumerate() { let r = parse_json_path_expr(path_expr); - if no_error { - assert!(r.is_ok(), "#{} expect parse ok but got err {:?}", i, r); - let got = r.unwrap(); - let expected = expected.unwrap(); - assert_eq!( - got, expected, - "#{} expect {:?} but got {:?}", - i, expected, got - ); - } else { - assert!(r.is_err(), "#{} expect error but got {:?}", i, r); + + match error_message { + Some(error_message) => { + assert!(r.is_err(), "#{} expect error but got {:?}", i, r); + + let got = r.err().unwrap().to_string(); + assert!( + got.contains(error_message), + "#{} error message {} should contain {}", + i, + got, + error_message + ) + } + None => { + assert!(r.is_ok(), "#{} expect parse ok but got err {:?}", i, r); + let got = r.unwrap(); + let expected = expected.unwrap(); + assert_eq!( + got, expected, + "#{} expect {:?} but got {:?}", + i, expected, got + ); + } } } } @@ -241,10 +594,10 @@ mod tests { #[test] fn test_parse_json_path_expr_contains_any_asterisk() { let mut test_cases = vec![ - ("$.a[b]", false), + ("$.a[0]", false), ("$.a[*]", true), - ("$.*[b]", true), - ("$**.a[b]", true), + ("$.*[0]", true), + ("$**.a[0]", true), ]; for (i, (path_expr, expected)) in test_cases.drain(..).enumerate() { let r = parse_json_path_expr(path_expr); From b55ef7215572e675fd6e30ad10e97e7024c3c469 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 6 Sep 2022 15:18:55 +0800 Subject: [PATCH 190/676] *: introduce the non-retryable error FlashbackInProgress (#13398) close tikv/tikv#13397 Introduce the non-retryable error `FlashbackInProgress` for the region in the flashback progress to reject any read or write. 
Signed-off-by: JmPotato --- Cargo.lock | 2 +- components/error_code/src/raftstore.rs | 5 +- components/raftstore/src/errors.rs | 4 +- etc/error_code.toml | 160 ++++++++++-------- .../integrations/raftstore/test_flashback.rs | 15 +- tests/integrations/server/kv_service.rs | 8 +- 6 files changed, 107 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0aa7586a608..2f9c09fa164 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2629,7 +2629,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#f95ac338b3312e0a9bd7c33c9647a87a74314567" +source = "git+https://github.com/pingcap/kvproto.git#2e37953b2b435961ad5b4f0e36b32c53f4777b23" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 29c4c3c1849..2fd0d168a14 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -30,8 +30,7 @@ define_error_codes!( DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), - // TODO: add FLASHBACK in errorpb - FLASHBACK_IN_PROGRESS => ("RecoveryInProgress", "", ""), + FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -66,6 +65,8 @@ impl ErrorCodeExt for errorpb::Error { DATA_IS_NOT_READY } else if self.has_recovery_in_progress() { RECOVERY_IN_PROGRESS + } else if self.has_flashback_in_progress() { + FLASHBACK_IN_PROGRESS } else { UNKNOWN } diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 1adaef08c3f..878ad6c2825 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -245,9 +245,9 @@ impl From for errorpb::Error { errorpb.set_recovery_in_progress(e); } Error::FlashbackInProgress(region_id) => { - let mut 
e = errorpb::RecoveryInProgress::default(); + let mut e = errorpb::FlashbackInProgress::default(); e.set_region_id(region_id); - errorpb.set_recovery_in_progress(e); + errorpb.set_flashback_in_progress(e); } _ => {} }; diff --git a/etc/error_code.toml b/etc/error_code.toml index 9a42cc3769a..7a6b956449f 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -1,11 +1,11 @@ -["KV:Cloud:IO"] +["KV:Cloud:Io"] error = ''' -KV:Cloud:IO +KV:Cloud:Io ''' -["KV:Cloud:SSL"] +["KV:Cloud:Ssl"] error = ''' -KV:Cloud:SSL +KV:Cloud:Ssl ''' ["KV:Cloud:Proto"] @@ -143,9 +143,9 @@ error = ''' KV:Encryption:Rocks ''' -["KV:Encryption:IO"] +["KV:Encryption:Io"] error = ''' -KV:Encryption:IO +KV:Encryption:Io ''' ["KV:Encryption:Crypter"] @@ -193,9 +193,9 @@ error = ''' KV:Engine:Protobuf ''' -["KV:Engine:IO"] +["KV:Engine:Io"] error = ''' -KV:Engine:IO +KV:Engine:Io ''' ["KV:Engine:CfName"] @@ -218,49 +218,54 @@ error = ''' KV:Engine:DataCompacted ''' -["KV:PD:IO"] +["KV:Pd:Io"] error = ''' -KV:PD:IO +KV:Pd:Io ''' -["KV:PD:ClusterBootstraped"] +["KV:Pd:ClusterBootstraped"] error = ''' -KV:PD:ClusterBootstraped +KV:Pd:ClusterBootstraped ''' -["KV:PD:ClusterNotBootstraped"] +["KV:Pd:ClusterNotBootstraped"] error = ''' -KV:PD:ClusterNotBootstraped +KV:Pd:ClusterNotBootstraped ''' -["KV:PD:Imcompatible"] +["KV:Pd:Imcompatible"] error = ''' -KV:PD:Imcompatible +KV:Pd:Imcompatible ''' -["KV:PD:gRPC"] +["KV:Pd:Grpc"] error = ''' -KV:PD:gRPC +KV:Pd:Grpc ''' -["KV:PD:RegionNotFound"] +["KV:Pd:StreamDisconnect"] error = ''' -KV:PD:RegionNotFound +KV:Pd:StreamDisconnect ''' -["KV:PD:StoreTombstone"] +["KV:Pd:RegionNotFound"] error = ''' -KV:PD:StoreTombstone +KV:Pd:RegionNotFound ''' -["KV:PD:GlobalConfigNotFound"] +["KV:Pd:StoreTombstone"] error = ''' -KV:PD:GlobalConfigNotFound +KV:Pd:StoreTombstone ''' -["KV:PD:Unknown"] +["KV:Pd:GlobalConfigNotFound"] error = ''' -KV:PD:Unknown +KV:Pd:GlobalConfigNotFound +''' + +["KV:Pd:Unknown"] +error = ''' +KV:Pd:Unknown ''' ["KV:Raft:Io"] @@ 
-373,9 +378,9 @@ error = ''' KV:Raftstore:Coprocessor ''' -["KV:Raftstore:IO"] +["KV:Raftstore:Io"] error = ''' -KV:Raftstore:IO +KV:Raftstore:Io ''' ["KV:Raftstore:Protobuf"] @@ -428,6 +433,11 @@ error = ''' KV:Raftstore:RecoveryInProgress ''' +["KV:Raftstore:FlashbackInProgress"] +error = ''' +KV:Raftstore:FlashbackInProgress +''' + ["KV:Raftstore:SnapAbort"] error = ''' KV:Raftstore:SnapAbort @@ -443,94 +453,99 @@ error = ''' KV:Raftstore:SnapUnknown ''' -["KV:SSTImporter:Io"] +["KV:SstImporter:Io"] error = ''' -KV:SSTImporter:Io +KV:SstImporter:Io ''' -["KV:SSTImporter:gRPC"] +["KV:SstImporter:Grpc"] error = ''' -KV:SSTImporter:gRPC +KV:SstImporter:Grpc ''' -["KV:SSTImporter:Uuid"] +["KV:SstImporter:Uuid"] error = ''' -KV:SSTImporter:Uuid +KV:SstImporter:Uuid ''' -["KV:SSTImporter:Future"] +["KV:SstImporter:Future"] error = ''' -KV:SSTImporter:Future +KV:SstImporter:Future ''' -["KV:SSTImporter:RocksDb"] +["KV:SstImporter:RocksDb"] error = ''' -KV:SSTImporter:RocksDb +KV:SstImporter:RocksDb ''' -["KV:SSTImporter:ParseIntError"] +["KV:SstImporter:ParseIntError"] error = ''' -KV:SSTImporter:ParseIntError +KV:SstImporter:ParseIntError ''' -["KV:SSTImporter:FileExists"] +["KV:SstImporter:FileExists"] error = ''' -KV:SSTImporter:FileExists +KV:SstImporter:FileExists ''' -["KV:SSTImporter:FileCorrupted"] +["KV:SstImporter:FileCorrupted"] error = ''' -KV:SSTImporter:FileCorrupted +KV:SstImporter:FileCorrupted ''' -["KV:SSTImporter:InvalidSstPath"] +["KV:SstImporter:InvalidSstPath"] error = ''' -KV:SSTImporter:InvalidSstPath +KV:SstImporter:InvalidSstPath ''' -["KV:SSTImporter:InvalidChunk"] +["KV:SstImporter:InvalidChunk"] error = ''' -KV:SSTImporter:InvalidChunk +KV:SstImporter:InvalidChunk ''' -["KV:SSTImporter:Engine"] +["KV:SstImporter:Engine"] error = ''' -KV:SSTImporter:Engine +KV:SstImporter:Engine ''' -["KV:SSTImporter:CannotReadExternalStorage"] +["KV:SstImporter:CannotReadExternalStorage"] error = ''' -KV:SSTImporter:CannotReadExternalStorage 
+KV:SstImporter:CannotReadExternalStorage ''' -["KV:SSTImporter:WrongKeyPrefix"] +["KV:SstImporter:WrongKeyPrefix"] error = ''' -KV:SSTImporter:WrongKeyPrefix +KV:SstImporter:WrongKeyPrefix ''' -["KV:SSTImporter:BadFormat"] +["KV:SstImporter:BadFormat"] error = ''' -KV:SSTImporter:BadFormat +KV:SstImporter:BadFormat ''' -["KV:SSTImporter:FileConflict"] +["KV:SstImporter:FileConflict"] error = ''' -KV:SSTImporter:FileConflict +KV:SstImporter:FileConflict ''' -["KV:SSTImporter:TtlNotEnabled"] +["KV:SstImporter:TtlNotEnabled"] error = ''' -KV:SSTImporter:TtlNotEnabled +KV:SstImporter:TtlNotEnabled ''' -["KV:SSTImporter:TtlLenNotEqualsToPairs"] +["KV:SstImporter:TtlLenNotEqualsToPairs"] error = ''' -KV:SSTImporter:TtlLenNotEqualsToPairs +KV:SstImporter:TtlLenNotEqualsToPairs ''' -["KV:SSTImporter:IncompatibleApiVersion"] +["KV:SstImporter:IncompatibleApiVersion"] error = ''' -KV:SSTImporter:IncompatibleApiVersion +KV:SstImporter:IncompatibleApiVersion +''' + +["KV:SstImporter:InvalidKeyMode"] +error = ''' +KV:SstImporter:InvalidKeyMode ''' ["KV:Storage:Timeout"] @@ -568,14 +583,14 @@ error = ''' KV:Storage:KeyTooLarge ''' -["KV:Storage:InvalidCF"] +["KV:Storage:InvalidCf"] error = ''' -KV:Storage:InvalidCF +KV:Storage:InvalidCf ''' -["KV:Storage:CFDeprecated"] +["KV:Storage:CfDeprecated"] error = ''' -KV:Storage:CFDeprecated +KV:Storage:CfDeprecated ''' ["KV:Storage:TtlNotEnabled"] @@ -593,9 +608,9 @@ error = ''' KV:Storage:Protobuf ''' -["KV:Storage:INVALIDTXNTSO"] +["KV:Storage:InvalidTxnTso"] error = ''' -KV:Storage:INVALIDTXNTSO +KV:Storage:InvalidTxnTso ''' ["KV:Storage:InvalidReqRange"] @@ -708,6 +723,11 @@ error = ''' KV:Storage:AssertionFailed ''' +["KV:Storage:LockIfExistsFailed"] +error = ''' +KV:Storage:LockIfExistsFailed +''' + ["KV:Storage:Unknown"] error = ''' KV:Storage:Unknown diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index e4d0276f9e6..cf91873d385 100644 --- 
a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -50,10 +50,9 @@ fn test_flashback_for_schedule() { .call_command_on_leader(transfer_leader, Duration::from_secs(3)) .unwrap(); let e = resp.get_header().get_error(); - // reuse recovery_in_progress error code. assert_eq!( - e.get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + e.get_flashback_in_progress(), + &kvproto::errorpb::FlashbackInProgress { region_id: region.get_id(), ..Default::default() } @@ -94,7 +93,7 @@ fn test_flahsback_for_write() { // write will be blocked let value = vec![1_u8; 8096]; - must_get_error_recovery_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); + must_get_error_flashback_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); must_cmd_add_flashback_flag( &mut cluster, @@ -123,7 +122,7 @@ fn test_flahsback_for_read() { block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); // read will be blocked - must_get_error_recovery_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); + must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); // verify the read can be executed if add flashback flag in request's // header. 
@@ -263,7 +262,7 @@ fn must_cmd_add_flashback_flag( assert!(!resp.get_header().has_error()); } -fn must_get_error_recovery_in_progress( +fn must_get_error_flashback_in_progress( cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, @@ -277,8 +276,8 @@ fn must_get_error_recovery_in_progress( Ok(_) => {} Err(e) => { assert_eq!( - e.get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + e.get_flashback_in_progress(), + &kvproto::errorpb::FlashbackInProgress { region_id: region.get_id(), ..Default::default() } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 6b2e52b8fee..70c7f9bda4c 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -696,7 +696,7 @@ fn test_mvcc_flashback_block_rw() { get_req.key = k.clone(); get_req.version = 1; let get_resp = client.kv_get(&get_req).unwrap(); - assert!(get_resp.get_region_error().has_recovery_in_progress()); + assert!(get_resp.get_region_error().has_flashback_in_progress()); assert!(!get_resp.has_error()); assert!(get_resp.value.is_empty()); // Scan @@ -706,7 +706,7 @@ fn test_mvcc_flashback_block_rw() { scan_req.limit = 1; scan_req.version = 1; let scan_resp = client.kv_scan(&scan_req).unwrap(); - assert!(scan_resp.get_region_error().has_recovery_in_progress()); + assert!(scan_resp.get_region_error().has_flashback_in_progress()); assert!(scan_resp.pairs.is_empty()); // Try to write. 
// Prewrite @@ -715,7 +715,7 @@ fn test_mvcc_flashback_block_rw() { mutation.set_key(k.clone()); mutation.set_value(v); let prewrite_resp = try_kv_prewrite(&client, ctx, vec![mutation], k, 1); - assert!(prewrite_resp.get_region_error().has_recovery_in_progress()); + assert!(prewrite_resp.get_region_error().has_flashback_in_progress()); fail::remove("skip_finish_flashback_to_version"); } @@ -741,7 +741,7 @@ fn test_mvcc_flashback_block_scheduling() { transfer_leader_resp .get_header() .get_error() - .has_recovery_in_progress() + .has_flashback_in_progress() ); fail::remove("skip_finish_flashback_to_version"); } From bcaa663c614f044fcfb596c555cac0f152f35d3e Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 6 Sep 2022 15:42:55 +0800 Subject: [PATCH 191/676] cloud: add retry on web identity credentials (#13343) close tikv/tikv#13122 Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/cloud/aws/Cargo.toml | 1 + components/cloud/aws/src/s3.rs | 22 ++--- components/cloud/aws/src/util.rs | 134 +++++++++++++++++++++++++---- components/cloud/src/metrics.rs | 6 ++ components/tikv_util/src/stream.rs | 56 +++++++++++- 6 files changed, 191 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2f9c09fa164..f9dc0e6c418 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,6 +279,7 @@ dependencies = [ "tikv_util", "tokio", "url", + "uuid", ] [[package]] diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 299192e9ca3..314e2281425 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -36,6 +36,7 @@ url = "2.0" thiserror = "1.0" lazy_static = "1.3" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +uuid = "0.8" [dev-dependencies] futures = "0.3" diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 991ae154427..3e9c3665f58 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ 
-17,14 +17,10 @@ use rusoto_core::{request::DispatchSignedRequest, ByteStream, RusotoError}; use rusoto_credential::{ProvideAwsCredentials, StaticProvider}; use rusoto_s3::{util::AddressingStyle, *}; use thiserror::Error; -use tikv_util::{ - debug, - stream::{error_stream, retry}, - time::Instant, -}; +use tikv_util::{debug, stream::error_stream, time::Instant}; use tokio::time::{sleep, timeout}; -use crate::util; +use crate::util::{self, retry_and_count}; const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); pub const STORAGE_VENDOR_NAME_AWS: &str = "aws"; @@ -311,11 +307,11 @@ impl<'client> S3Uploader<'client> { // For short files, execute one put_object to upload the entire thing. let mut data = Vec::with_capacity(est_len as usize); reader.read_to_end(&mut data).await?; - retry(|| self.upload(&data)).await?; + retry_and_count(|| self.upload(&data), "upload_small_file").await?; Ok(()) } else { // Otherwise, use multipart upload to improve robustness. - self.upload_id = retry(|| self.begin()).await?; + self.upload_id = retry_and_count(|| self.begin(), "begin_upload").await?; let upload_res = async { let mut buf = vec![0; self.multi_part_size]; let mut part_number = 1; @@ -324,7 +320,11 @@ impl<'client> S3Uploader<'client> { if data_size == 0 { break; } - let part = retry(|| self.upload_part(part_number, &buf[..data_size])).await?; + let part = retry_and_count( + || self.upload_part(part_number, &buf[..data_size]), + "upload_part", + ) + .await?; self.parts.push(part); part_number += 1; } @@ -333,9 +333,9 @@ impl<'client> S3Uploader<'client> { .await; if upload_res.is_ok() { - retry(|| self.complete()).await?; + retry_and_count(|| self.complete(), "complete_upload").await?; } else { - let _ = retry(|| self.abort()).await; + let _ = retry_and_count(|| self.abort(), "abort_upload").await; } upload_res } diff --git a/components/cloud/aws/src/util.rs b/components/cloud/aws/src/util.rs index c4ff356f462..a2dc1ca8c76 100644 --- a/components/cloud/aws/src/util.rs 
+++ b/components/cloud/aws/src/util.rs @@ -3,6 +3,8 @@ use std::io::{self, Error, ErrorKind}; use async_trait::async_trait; +use cloud::metrics; +use futures::{future::TryFutureExt, Future}; use rusoto_core::{ region::Region, request::{HttpClient, HttpConfig}, @@ -11,10 +13,36 @@ use rusoto_credential::{ AutoRefreshingProvider, AwsCredentials, ChainProvider, CredentialsError, ProvideAwsCredentials, }; use rusoto_sts::WebIdentityProvider; +use tikv_util::{ + stream::{retry_ext, RetryError, RetryExt}, + warn, +}; #[allow(dead_code)] // This will be used soon, please remove the allow. const READ_BUF_SIZE: usize = 1024 * 1024 * 2; +const AWS_WEB_IDENTITY_TOKEN_FILE: &str = "AWS_WEB_IDENTITY_TOKEN_FILE"; +struct CredentialsErrorWrapper(CredentialsError); + +impl From for CredentialsError { + fn from(c: CredentialsErrorWrapper) -> CredentialsError { + c.0 + } +} + +impl std::fmt::Display for CredentialsErrorWrapper { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0.message)?; + Ok(()) + } +} + +impl RetryError for CredentialsErrorWrapper { + fn is_retryable(&self) -> bool { + true + } +} + pub fn new_http_client() -> io::Result { let mut http_config = HttpConfig::new(); // This can greatly improve performance dealing with payloads greater @@ -49,6 +77,22 @@ pub fn get_region(region: &str, endpoint: &str) -> io::Result { } } +pub async fn retry_and_count(action: G, name: &'static str) -> Result +where + G: FnMut() -> F, + F: Future>, + E: RetryError + std::fmt::Display, +{ + let id = uuid::Uuid::new_v4(); + retry_ext( + action, + RetryExt::default().with_fail_hook(move |err: &E| { + warn!("aws request meet error."; "err" => %err, "retry?" 
=> %err.is_retryable(), "context" => %name, "uuid" => %id); + metrics::CLOUD_ERROR_VEC.with_label_values(&["aws", name]).inc(); + }), + ).await +} + pub struct CredentialsProvider(AutoRefreshingProvider); impl CredentialsProvider { @@ -92,21 +136,81 @@ impl Default for DefaultCredentialsProvider { #[async_trait] impl ProvideAwsCredentials for DefaultCredentialsProvider { async fn credentials(&self) -> Result { - // Prefer the web identity provider first for the kubernetes environment. - // Search for both in parallel. - let web_creds = self.web_identity_provider.credentials(); - let def_creds = self.default_provider.credentials(); - let k8s_error = match web_creds.await { - res @ Ok(_) => return res, - Err(e) => e, - }; - let def_error = match def_creds.await { - res @ Ok(_) => return res, - Err(e) => e, + // use web identity provider first for the kubernetes environment. + let cred = if std::env::var(AWS_WEB_IDENTITY_TOKEN_FILE).is_ok() { + // we need invoke assume_role in web identity provider + // this API may failed sometimes. + // according to AWS experience, it's better to retry it with 10 times + // exponential backoff for every error, because we cannot + // distinguish the error type. + retry_and_count( + || { + #[cfg(test)] + fail::fail_point!("cred_err", |_| { + Box::pin(futures::future::err(CredentialsErrorWrapper( + CredentialsError::new("injected error"), + ))) + as std::pin::Pin + Send>> + }); + let res = self + .web_identity_provider + .credentials() + .map_err(|e| CredentialsErrorWrapper(e)); + #[cfg(test)] + return Box::pin(res); + #[cfg(not(test))] + res + }, + "get_cred_over_the_cloud", + ) + .await + .map_err(|e| e.0) + } else { + // Add exponential backoff for every error, because we cannot + // distinguish the error type. 
+ retry_and_count( + || { + self.default_provider + .credentials() + .map_err(|e| CredentialsErrorWrapper(e)) + }, + "get_cred_on_premise", + ) + .await + .map_err(|e| e.0) }; - Err(CredentialsError::new(format_args!( - "Couldn't find AWS credentials in default sources ({}) or k8s environment ({}).", - def_error.message, k8s_error.message, - ))) + + cred.map_err(|e| { + CredentialsError::new(format_args!( + "Couldn't find AWS credentials in sources ({}).", + e.message + )) + }) + } +} + +#[cfg(test)] +mod tests { + #[allow(unused_imports)] + use super::*; + + #[cfg(feature = "failpoints")] + #[tokio::test] + async fn test_default_provider() { + let default_provider = DefaultCredentialsProvider::default(); + std::env::set_var(AWS_WEB_IDENTITY_TOKEN_FILE, "tmp"); + // mock k8s env with web_identitiy_provider + fail::cfg("cred_err", "return").unwrap(); + fail::cfg("retry_count", "return(1)").unwrap(); + let res = default_provider.credentials().await; + assert_eq!(res.is_err(), true); + assert_eq!( + res.err().unwrap().message, + "Couldn't find AWS credentials in sources (injected error)." + ); + fail::remove("cred_err"); + fail::remove("retry_count"); + + std::env::remove_var(AWS_WEB_IDENTITY_TOKEN_FILE); } } diff --git a/components/cloud/src/metrics.rs b/components/cloud/src/metrics.rs index e115abe0853..58e267a56fa 100644 --- a/components/cloud/src/metrics.rs +++ b/components/cloud/src/metrics.rs @@ -10,4 +10,10 @@ lazy_static! 
{ &["cloud", "req"] ) .unwrap(); + pub static ref CLOUD_ERROR_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_cloud_error_count", + "Total number of credentail errors from EKS env", + &["cloud", "error"] + ) + .unwrap(); } diff --git a/components/tikv_util/src/stream.rs b/components/tikv_util/src/stream.rs index b7ba46c45bf..8f892659f68 100644 --- a/components/tikv_util/src/stream.rs +++ b/components/tikv_util/src/stream.rs @@ -96,19 +96,69 @@ pub trait RetryError { /// /// Since rusoto does not have transparent auto-retry /// (), we need to implement this manually. -pub async fn retry(mut action: G) -> Result +pub async fn retry(action: G) -> Result +where + G: FnMut() -> F, + F: Future>, + E: RetryError, +{ + retry_ext(action, RetryExt::default()).await +} + +/// The extra configuration for retry. +pub struct RetryExt { + // NOTE: we can move `MAX_RETRY_DELAY` and `MAX_RETRY_TIMES` + // to here, for making the retry more configurable. + // However those are constant for now and no place for configure them. + on_failure: Option>, +} + +impl RetryExt { + /// Attaches the failure hook to the ext. + pub fn with_fail_hook(mut self, f: F) -> Self + where + F: FnMut(&E) + Send + Sync + 'static, + { + self.on_failure = Some(Box::new(f)); + self + } +} + +// If we use the default derive macro, it would complain that `E` isn't +// `Default` :( +impl Default for RetryExt { + fn default() -> Self { + Self { + on_failure: Default::default(), + } + } +} + +/// Retires a future execution. Comparing to `retry`, this version allows more +/// configurations. 
+pub async fn retry_ext(mut action: G, mut ext: RetryExt) -> Result where G: FnMut() -> F, F: Future>, E: RetryError, { const MAX_RETRY_DELAY: Duration = Duration::from_secs(32); - const MAX_RETRY_TIMES: usize = 4; + const MAX_RETRY_TIMES: usize = 14; + let max_retry_times = (|| { + fail::fail_point!("retry_count", |t| t + .and_then(|v| v.parse::().ok()) + .unwrap_or(MAX_RETRY_TIMES)); + MAX_RETRY_TIMES + })(); + let mut retry_wait_dur = Duration::from_secs(1); let mut final_result = action().await; - for _ in 1..MAX_RETRY_TIMES { + for _ in 1..max_retry_times { if let Err(e) = &final_result { + if let Some(ref mut f) = ext.on_failure { + f(e); + } if e.is_retryable() { let backoff = thread_rng().gen_range(0..1000); sleep(retry_wait_dur + Duration::from_millis(backoff)).await; From cbf85c11fda31c808014786c0d67a436a8cb63ed Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 6 Sep 2022 17:48:55 +0800 Subject: [PATCH 192/676] raftstore: optimize region destroy (#13384) close tikv/tikv#12421 Optimize the performance of merging empty regions Signed-off-by: tabokie --- components/engine_panic/src/misc.rs | 4 - components/engine_rocks/src/misc.rs | 82 ++-- components/engine_traits/src/misc.rs | 23 +- components/raftstore/src/store/fsm/store.rs | 10 +- .../raftstore/src/store/worker/region.rs | 349 +++++++++--------- tests/integrations/storage/test_titan.rs | 8 +- 6 files changed, 230 insertions(+), 246 deletions(-) diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 5a78ea66e5a..82012b84ed6 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -34,10 +34,6 @@ impl MiscExt for PanicEngine { panic!() } - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()> { - panic!() - } - fn path(&self) -> &str { panic!() } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 7cf5d771486..e7c9ef547d8 100644 --- a/components/engine_rocks/src/misc.rs +++ 
b/components/engine_rocks/src/misc.rs @@ -4,7 +4,6 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, }; -use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ @@ -29,17 +28,6 @@ impl RocksEngine { ) -> Result<()> { let mut ranges = ranges.to_owned(); ranges.sort_by(|a, b| a.start_key.cmp(b.start_key)); - let max_end_key = ranges - .iter() - .fold(ranges[0].end_key, |x, y| std::cmp::max(x, y.end_key)); - let start = KeyBuilder::from_slice(ranges[0].start_key, 0, 0); - let end = KeyBuilder::from_slice(max_end_key, 0, 0); - let mut opts = IterOptions::new(Some(start), Some(end), false); - if self.is_titan() { - // Cause DeleteFilesInRange may expose old blob index keys, setting key only for - // Titan to avoid referring to missing blob files. - opts.set_key_only(true); - } let mut writer_wrapper: Option = None; let mut data: Vec> = vec![]; @@ -55,7 +43,17 @@ impl RocksEngine { } last_end_key = Some(r.end_key.to_owned()); - let mut it = self.iterator_opt(cf, opts.clone())?; + let mut opts = IterOptions::new( + Some(KeyBuilder::from_slice(r.start_key, 0, 0)), + Some(KeyBuilder::from_slice(r.end_key, 0, 0)), + false, + ); + if self.is_titan() { + // Cause DeleteFilesInRange may expose old blob index keys, setting key only for + // Titan to avoid referring to missing blob files. 
+ opts.set_key_only(true); + } + let mut it = self.iterator_opt(cf, opts)?; let mut it_valid = it.seek(r.start_key)?; while it_valid { if it.key() >= r.end_key { @@ -225,29 +223,6 @@ impl MiscExt for RocksEngine { Ok(used_size) } - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()> { - let db = self.as_inner(); - let mut delete_ranges = Vec::new(); - for &(ref start, ref end) in ranges { - if start == end { - continue; - } - assert!(start < end); - delete_ranges.push(RocksRange::new(start, end)); - } - if delete_ranges.is_empty() { - return Ok(()); - } - - for cf in db.cf_names() { - let handle = util::get_cf_handle(db, cf)?; - db.delete_files_in_ranges_cf(handle, &delete_ranges, /* include_end */ false) - .map_err(r2e)?; - } - - Ok(()) - } - fn path(&self) -> &str { self.as_inner().path() } @@ -363,13 +338,9 @@ mod tests { } } - fn test_delete_all_in_range( - strategy: DeleteStrategy, - origin_keys: &[Vec], - ranges: &[Range<'_>], - ) { + fn test_delete_ranges(strategy: DeleteStrategy, origin_keys: &[Vec], ranges: &[Range<'_>]) { let path = Builder::new() - .prefix("engine_delete_all_in_range") + .prefix("engine_delete_ranges") .tempdir() .unwrap(); let path_str = path.path().to_str().unwrap(); @@ -399,8 +370,7 @@ mod tests { wb.write().unwrap(); check_data(&db, ALL_CFS, kvs.as_slice()); - // Delete all in ranges. - db.delete_all_in_range(strategy, ranges).unwrap(); + db.delete_ranges_cfs(strategy, ranges).unwrap(); let mut kvs_left: Vec<_> = kvs; for r in ranges { @@ -419,25 +389,25 @@ mod tests { b"k4".to_vec(), ]; // Single range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k4")], ); // Two ranges without overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k0", b"k1"), Range::new(b"k3", b"k4")], ); // Two ranges with overlap. 
- test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k3"), Range::new(b"k2", b"k4")], ); // One range contains the other range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByRange, &data, &[Range::new(b"k1", b"k4"), Range::new(b"k2", b"k3")], @@ -454,25 +424,25 @@ mod tests { b"k4".to_vec(), ]; // Single range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k4")], ); // Two ranges without overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k0", b"k1"), Range::new(b"k3", b"k4")], ); // Two ranges with overlap. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k3"), Range::new(b"k2", b"k4")], ); // One range contains the other range. - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByKey, &data, &[Range::new(b"k1", b"k4"), Range::new(b"k2", b"k3")], @@ -491,7 +461,7 @@ mod tests { for i in 1000..5000 { data.push(i.to_string().as_bytes().to_vec()); } - test_delete_all_in_range( + test_delete_ranges( DeleteStrategy::DeleteByWriter { sst_path }, &data, &[ @@ -538,9 +508,9 @@ mod tests { } check_data(&db, ALL_CFS, kvs.as_slice()); - db.delete_all_in_range(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) + db.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[Range::new(b"k2", b"k4")]) .unwrap(); - db.delete_all_in_range(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) + db.delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &[Range::new(b"k2", b"k4")]) .unwrap(); check_data(&db, ALL_CFS, kvs_left.as_slice()); } @@ -585,7 +555,7 @@ mod tests { check_data(&db, &[cf], kvs.as_slice()); // Delete all in ["k2", "k4"). 
- db.delete_all_in_range( + db.delete_ranges_cfs( DeleteStrategy::DeleteByRange, &[Range::new(b"kabcdefg2", b"kabcdefg4")], ) diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index f0ba9d03c39..18991038ee8 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -14,6 +14,15 @@ pub enum DeleteStrategy { /// Delete the SST files that are fullly fit in range. However, the SST /// files that are partially overlapped with the range will not be /// touched. + /// + /// Note: + /// - After this operation, some keys in the range might still exist in + /// the database. + /// - After this operation, some keys in the range might be removed from + /// existing snapshot, so you shouldn't expect to be able to read data + /// from the range using existing snapshots any more. + /// + /// Ref: DeleteFiles, /// Delete the data stored in Titan. DeleteBlobs, @@ -33,7 +42,7 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn flush_cf(&self, cf: &str, wait: bool) -> Result<()>; - fn delete_all_in_range(&self, strategy: DeleteStrategy, ranges: &[Range<'_>]) -> Result<()> { + fn delete_ranges_cfs(&self, strategy: DeleteStrategy, ranges: &[Range<'_>]) -> Result<()> { for cf in self.cf_names() { self.delete_ranges_cf(cf, strategy.clone(), ranges)?; } @@ -59,18 +68,6 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { /// * total size (bytes) of all blob files. fn get_engine_used_size(&self) -> Result; - /// Roughly deletes files in multiple ranges. - /// - /// Note: - /// - After this operation, some keys in the range might still exist in - /// the database. - /// - After this operation, some keys in the range might be removed from - /// existing snapshot, so you shouldn't expect to be able to read data - /// from the range using existing snapshots any more. 
- /// - /// Ref: - fn roughly_cleanup_ranges(&self, ranges: &[(Vec, Vec)]) -> Result<()>; - /// The path to the directory on the filesystem where the database is stored fn path(&self) -> &str; diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 5743b0ec3a5..4ee3c5dc091 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1239,8 +1239,14 @@ impl RaftPollerBuilder { last_start_key = keys::enc_end_key(region); } ranges.push((last_start_key, keys::DATA_MAX_KEY.to_vec())); + let ranges: Vec<_> = ranges + .iter() + .map(|(start, end)| Range::new(start, end)) + .collect(); - self.engines.kv.roughly_cleanup_ranges(&ranges)?; + self.engines + .kv + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &ranges)?; info!( "cleans up garbage data"; @@ -2851,7 +2857,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER } drop(meta); - if let Err(e) = self.ctx.engines.kv.delete_all_in_range( + if let Err(e) = self.ctx.engines.kv.delete_ranges_cfs( DeleteStrategy::DeleteByKey, &[Range::new(&start_key, &end_key)], ) { diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 244ca514924..53b88d6ef16 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -50,10 +50,8 @@ use crate::{ // used to periodically check whether we should delete a stale peer's range in // region runner - #[cfg(test)] pub const STALE_PEER_CHECK_TICK: usize = 1; // 1000 milliseconds - #[cfg(not(test))] pub const STALE_PEER_CHECK_TICK: usize = 10; // 10000 milliseconds @@ -88,7 +86,8 @@ pub enum Task { }, /// Destroy data between [start_key, end_key). /// - /// The deletion may and may not succeed. + /// The actual deletion may be delayed if the engine is overloaded or a + /// reader is still referencing the data. 
Destroy { region_id: u64, start_key: Vec, @@ -133,8 +132,8 @@ struct StalePeerInfo { pub region_id: u64, pub end_key: Vec, // Once the oldest snapshot sequence exceeds this, it ensures that no one is - // reading on this peer anymore. So we can safely call `delete_files_in_range` - // , which may break the consistency of snapshot, of this peer range. + // reading on this peer anymore. So we can safely call `delete_files_in_range`, + // which may break the consistency of snapshot, of this peer range. pub stale_sequence: u64, } @@ -207,21 +206,27 @@ impl PendingDeleteRanges { /// /// Before an insert is called, it must call drain_overlap_ranges to clean /// the overlapping range. - fn insert(&mut self, region_id: u64, start_key: &[u8], end_key: &[u8], stale_sequence: u64) { - if !self.find_overlap_ranges(start_key, end_key).is_empty() { + fn insert( + &mut self, + region_id: u64, + start_key: Vec, + end_key: Vec, + stale_sequence: u64, + ) { + if !self.find_overlap_ranges(&start_key, &end_key).is_empty() { panic!( "[region {}] register deleting data in [{}, {}) failed due to overlap", region_id, - log_wrappers::Value::key(start_key), - log_wrappers::Value::key(end_key), + log_wrappers::Value::key(&start_key), + log_wrappers::Value::key(&end_key), ); } let info = StalePeerInfo { region_id, - end_key: end_key.to_owned(), + end_key, stale_sequence, }; - self.ranges.insert(start_key.to_owned(), info); + self.ranges.insert(start_key, info); } /// Gets all stale ranges info. 
@@ -243,21 +248,13 @@ impl PendingDeleteRanges { } } -#[derive(Clone)] -struct SnapContext -where - EK: KvEngine, -{ +struct SnapGenContext { engine: EK, - batch_size: usize, mgr: SnapManager, - use_delete_range: bool, - pending_delete_ranges: PendingDeleteRanges, - coprocessor_host: CoprocessorHost, router: R, } -impl SnapContext +impl SnapGenContext where EK: KvEngine, R: CasualRouter, @@ -347,6 +344,74 @@ where .generate .observe(start.saturating_elapsed_secs()); } +} + +pub struct Runner +where + EK: KvEngine, + T: PdClient + 'static, +{ + batch_size: usize, + use_delete_range: bool, + clean_stale_tick: usize, + clean_stale_check_interval: Duration, + + tiflash_stores: HashMap, + // we may delay some apply tasks if level 0 files to write stall threshold, + // pending_applies records all delayed apply task, and will check again later + pending_applies: VecDeque>, + // Ranges that have been logically destroyed at a specific sequence number. We can + // assume there will be no reader (engine snapshot) newer than that sequence number. Therefore, + // they can be physically deleted with `DeleteFiles` when we're sure there is no older + // reader as well. + // To protect this assumption, before a new snapshot is applied, the overlapping pending ranges + // must first be removed. + // The sole purpose of maintaining this list is to optimize deletion with `DeleteFiles` + // whenever we can. Errors while processing them can be ignored. 
+ pending_delete_ranges: PendingDeleteRanges, + + engine: EK, + mgr: SnapManager, + coprocessor_host: CoprocessorHost, + router: R, + pd_client: Option>, + pool: ThreadPool, +} + +impl Runner +where + EK: KvEngine, + R: CasualRouter, + T: PdClient + 'static, +{ + pub fn new( + engine: EK, + mgr: SnapManager, + batch_size: usize, + use_delete_range: bool, + snap_generator_pool_size: usize, + coprocessor_host: CoprocessorHost, + router: R, + pd_client: Option>, + ) -> Runner { + Runner { + batch_size, + use_delete_range, + clean_stale_tick: 0, + clean_stale_check_interval: Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL), + tiflash_stores: HashMap::default(), + pending_applies: VecDeque::new(), + pending_delete_ranges: PendingDeleteRanges::default(), + engine, + mgr, + coprocessor_host, + router, + pd_client, + pool: Builder::new(thd_name!("snap-generator")) + .max_thread_count(snap_generator_pool_size) + .build_future_pool(), + } + } fn region_state(&self, region_id: u64) -> Result { let region_key = keys::region_state_key(region_id); @@ -389,18 +454,7 @@ where let start_key = keys::enc_start_key(®ion); let end_key = keys::enc_end_key(®ion); check_abort(&abort)?; - - // clear up origin data. - let overlap_ranges = self - .pending_delete_ranges - .drain_overlap_ranges(&start_key, &end_key); - if !overlap_ranges.is_empty() { - CLEAN_COUNTER_VEC - .with_label_values(&["overlap-with-apply"]) - .inc(); - self.cleanup_overlap_regions(overlap_ranges)?; - } - self.delete_all_in_range(&[Range::new(&start_key, &end_key)])?; + self.clean_overlap_ranges(start_key, end_key)?; check_abort(&abort)?; fail_point!("apply_snap_cleanup_range"); @@ -485,80 +539,77 @@ where let _ = self.router.send(region_id, CasualMessage::SnapshotApplied); } - /// Cleans up the data within the range. 
- fn cleanup_range(&self, ranges: &[Range<'_>]) -> Result<()> { - self.engine - .delete_all_in_range(DeleteStrategy::DeleteFiles, ranges) - .unwrap_or_else(|e| { - error!("failed to delete files in range"; "err" => %e); - }); - self.delete_all_in_range(ranges)?; - self.engine - .delete_all_in_range(DeleteStrategy::DeleteBlobs, ranges) - .unwrap_or_else(|e| { - error!("failed to delete files in range"; "err" => %e); - }); - Ok(()) - } - - /// Gets the overlapping ranges and cleans them up. - fn cleanup_overlap_regions( + /// Tries to clean up files in pending ranges overlapping with the given + /// bounds. These pending ranges will be removed. Returns an updated range + /// that also includes these ranges. Caller must ensure the remaining keys + /// in the returning range will be deleted properly. + fn clean_overlap_ranges_roughly( &mut self, - overlap_ranges: Vec<(u64, Vec, Vec, u64)>, - ) -> Result<()> { + mut start_key: Vec, + mut end_key: Vec, + ) -> (Vec, Vec) { + let overlap_ranges = self + .pending_delete_ranges + .drain_overlap_ranges(&start_key, &end_key); + if overlap_ranges.is_empty() { + return (start_key, end_key); + } + CLEAN_COUNTER_VEC.with_label_values(&["overlap"]).inc(); let oldest_sequence = self .engine .get_oldest_snapshot_sequence_number() .unwrap_or(u64::MAX); - let mut ranges = Vec::with_capacity(overlap_ranges.len()); - let mut df_ranges = Vec::with_capacity(overlap_ranges.len()); - for (region_id, start_key, end_key, stale_sequence) in overlap_ranges.iter() { - // `DeleteFiles` may break current rocksdb snapshots consistency, - // so do not use it unless we can make sure there is no reader of the destroyed - // peer anymore. 
- if *stale_sequence < oldest_sequence { - df_ranges.push(Range::new(start_key, end_key)); - } else { - SNAP_COUNTER_VEC - .with_label_values(&["overlap", "not_delete_files"]) - .inc(); - } - info!("delete data in range because of overlap"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key)); - ranges.push(Range::new(start_key, end_key)); - } + let df_ranges: Vec<_> = overlap_ranges + .iter() + .filter_map(|(region_id, cur_start, cur_end, stale_sequence)| { + info!( + "delete data in range because of overlap"; "region_id" => region_id, + "start_key" => log_wrappers::Value::key(cur_start), + "end_key" => log_wrappers::Value::key(cur_end) + ); + if &start_key > cur_start { + start_key = cur_start.clone(); + } + if &end_key < cur_end { + end_key = cur_end.clone(); + } + if *stale_sequence < oldest_sequence { + Some(Range::new(cur_start, cur_end)) + } else { + SNAP_COUNTER_VEC + .with_label_values(&["overlap", "not_delete_files"]) + .inc(); + None + } + }) + .collect(); self.engine - .delete_all_in_range(DeleteStrategy::DeleteFiles, &df_ranges) + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &df_ranges) .unwrap_or_else(|e| { error!("failed to delete files in range"; "err" => %e); }); + (start_key, end_key) + } - self.delete_all_in_range(&ranges) + /// Cleans up data in the given range and all pending ranges overlapping + /// with it. + fn clean_overlap_ranges(&mut self, start_key: Vec, end_key: Vec) -> Result<()> { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); + self.delete_all_in_range(&[Range::new(&start_key, &end_key)]) } /// Inserts a new pending range, and it will be cleaned up with some delay. 
- fn insert_pending_delete_range(&mut self, region_id: u64, start_key: &[u8], end_key: &[u8]) { - let overlap_ranges = self - .pending_delete_ranges - .drain_overlap_ranges(start_key, end_key); - if !overlap_ranges.is_empty() { - CLEAN_COUNTER_VEC - .with_label_values(&["overlap-with-destroy"]) - .inc(); - if let Err(e) = self.cleanup_overlap_regions(overlap_ranges) { - warn!("cleanup_overlap_ranges failed"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key), - "err" => %e, - ); - } - } + fn insert_pending_delete_range( + &mut self, + region_id: u64, + start_key: Vec, + end_key: Vec, + ) { + let (start_key, end_key) = self.clean_overlap_ranges_roughly(start_key, end_key); info!("register deleting data in range"; "region_id" => region_id, - "start_key" => log_wrappers::Value::key(start_key), - "end_key" => log_wrappers::Value::key(end_key), + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), ); let seq = self.engine.get_latest_sequence_number(); self.pending_delete_ranges @@ -575,33 +626,43 @@ where .engine .get_oldest_snapshot_sequence_number() .unwrap_or(u64::MAX); - let mut cleanup_ranges: Vec<(u64, Vec, Vec)> = self + let mut region_ranges: Vec<(u64, Vec, Vec)> = self .pending_delete_ranges .stale_ranges(oldest_sequence) .map(|(region_id, s, e)| (region_id, s.to_vec(), e.to_vec())) .collect(); - if cleanup_ranges.is_empty() { + if region_ranges.is_empty() { return; } CLEAN_COUNTER_VEC.with_label_values(&["destroy"]).inc_by(1); - cleanup_ranges.sort_by(|a, b| a.1.cmp(&b.1)); - while cleanup_ranges.len() > CLEANUP_MAX_REGION_COUNT { - cleanup_ranges.pop(); - } - let ranges: Vec> = cleanup_ranges + region_ranges.sort_by(|a, b| a.1.cmp(&b.1)); + region_ranges.truncate(CLEANUP_MAX_REGION_COUNT); + let ranges: Vec<_> = region_ranges .iter() .map(|(region_id, start, end)| { info!("delete data in range because of stale"; "region_id" => 
region_id, - "start_key" => log_wrappers::Value::key(start), - "end_key" => log_wrappers::Value::key(end)); + "start_key" => log_wrappers::Value::key(start), + "end_key" => log_wrappers::Value::key(end)); Range::new(start, end) }) .collect(); - if let Err(e) = self.cleanup_range(&ranges) { + + self.engine + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &ranges) + .unwrap_or_else(|e| { + error!("failed to delete files in range"; "err" => %e); + }); + if let Err(e) = self.delete_all_in_range(&ranges) { error!("failed to cleanup stale range"; "err" => %e); return; } - for (_, key, _) in cleanup_ranges { + self.engine + .delete_ranges_cfs(DeleteStrategy::DeleteBlobs, &ranges) + .unwrap_or_else(|e| { + error!("failed to delete blobs in range"; "err" => %e); + }); + + for (_, key, _) in region_ranges { assert!( self.pending_delete_ranges.remove(&key).is_some(), "cleanup pending_delete_ranges {} should exist", @@ -682,60 +743,6 @@ where ); Ok(()) } -} - -pub struct Runner -where - EK: KvEngine, - T: PdClient + 'static, -{ - pool: ThreadPool, - ctx: SnapContext, - // we may delay some apply tasks if level 0 files to write stall threshold, - // pending_applies records all delayed apply task, and will check again later - pending_applies: VecDeque>, - clean_stale_tick: usize, - clean_stale_check_interval: Duration, - tiflash_stores: HashMap, - pd_client: Option>, -} - -impl Runner -where - EK: KvEngine, - R: CasualRouter, - T: PdClient + 'static, -{ - pub fn new( - engine: EK, - mgr: SnapManager, - batch_size: usize, - use_delete_range: bool, - snap_generator_pool_size: usize, - coprocessor_host: CoprocessorHost, - router: R, - pd_client: Option>, - ) -> Runner { - Runner { - pool: Builder::new(thd_name!("snap-generator")) - .max_thread_count(snap_generator_pool_size) - .build_future_pool(), - ctx: SnapContext { - engine, - mgr, - batch_size, - use_delete_range, - pending_delete_ranges: PendingDeleteRanges::default(), - coprocessor_host, - router, - }, - pending_applies: 
VecDeque::new(), - clean_stale_tick: 0, - clean_stale_check_interval: Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL), - tiflash_stores: HashMap::default(), - pd_client, - } - } /// Tries to apply pending tasks if there is some. fn handle_pending_applies(&mut self) { @@ -744,7 +751,7 @@ where // should not handle too many applies than the number of files that can be // ingested. check level 0 every time because we can not make sure // how does the number of level 0 files change. - if self.ctx.ingest_maybe_stall() { + if self.ingest_maybe_stall() { break; } if let Some(Task::Apply { @@ -753,7 +760,7 @@ where peer_id, }) = self.pending_applies.pop_front() { - self.ctx.handle_apply(region_id, peer_id, status); + self.handle_apply(region_id, peer_id, status); } } } @@ -781,7 +788,6 @@ where } => { // It is safe for now to handle generating and applying snapshot concurrently, // but it may not when merge is implemented. - let ctx = self.ctx.clone(); let mut allow_multi_files_snapshot = false; // if to_store_id is 0, it means the to_store_id cannot be found if to_store_id != 0 { @@ -806,6 +812,11 @@ where } } + let ctx = SnapGenContext { + engine: self.engine.clone(), + mgr: self.mgr.clone(), + router: self.router.clone(), + }; self.pool.spawn(async move { tikv_alloc::add_thread_memory_accessor(); ctx.handle_gen( @@ -823,8 +834,8 @@ where } task @ Task::Apply { .. } => { fail_point!("on_region_worker_apply", true, |_| {}); - if self.ctx.coprocessor_host.should_pre_apply_snapshot() { - let _ = self.ctx.pre_apply_snapshot(&task); + if self.coprocessor_host.should_pre_apply_snapshot() { + let _ = self.pre_apply_snapshot(&task); } // to makes sure applying snapshots in order. 
self.pending_applies.push_back(task); @@ -842,9 +853,8 @@ where fail_point!("on_region_worker_destroy", true, |_| {}); // try to delay the range deletion because // there might be a coprocessor request related to this range - self.ctx - .insert_pending_delete_range(region_id, &start_key, &end_key); - self.ctx.clean_stale_ranges(); + self.insert_pending_delete_range(region_id, start_key, end_key); + self.clean_stale_ranges(); } } } @@ -864,7 +874,7 @@ where self.handle_pending_applies(); self.clean_stale_tick += 1; if self.clean_stale_tick >= STALE_PEER_CHECK_TICK { - self.ctx.clean_stale_ranges(); + self.clean_stale_ranges(); self.clean_stale_tick = 0; } } @@ -917,7 +927,12 @@ mod tests { e: &str, stale_sequence: u64, ) { - pending_delete_ranges.insert(id, s.as_bytes(), e.as_bytes(), stale_sequence); + pending_delete_ranges.insert( + id, + s.as_bytes().to_owned(), + e.as_bytes().to_owned(), + stale_sequence, + ); } #[test] diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 5b957b88822..25a5bccf32b 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -298,11 +298,11 @@ fn test_delete_files_in_range_for_titan() { // blob4: (b_7, b_value) // `delete_files_in_range` may expose some old keys. - // For Titan it may encounter `missing blob file` in `delete_all_in_range`, + // For Titan it may encounter `missing blob file` in `delete_ranges_cfs`, // so we set key_only for Titan. 
engines .kv - .delete_all_in_range( + .delete_ranges_cfs( DeleteStrategy::DeleteFiles, &[Range::new( &data_key(Key::from_raw(b"a").as_encoded()), @@ -312,7 +312,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); engines .kv - .delete_all_in_range( + .delete_ranges_cfs( DeleteStrategy::DeleteByKey, &[Range::new( &data_key(Key::from_raw(b"a").as_encoded()), @@ -322,7 +322,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); engines .kv - .delete_all_in_range( + .delete_ranges_cfs( DeleteStrategy::DeleteBlobs, &[Range::new( &data_key(Key::from_raw(b"a").as_encoded()), From 7a33cb611bbc99a216fd25fca7fb8713ac1648c1 Mon Sep 17 00:00:00 2001 From: Potato Date: Thu, 8 Sep 2022 10:32:56 +0800 Subject: [PATCH 193/676] storage: add perform_read_local fail_point to stabilize the test (#13427) ref tikv/tikv#12362 This commit adds `perform_read_local` fail_point so we can force the lease read to be triggered, which allows the test `test_read_execution_tracker` to pass stably. Signed-off-by: OneSizeFitQuorum --- components/raftstore/src/store/peer.rs | 2 ++ components/raftstore/src/store/worker/read.rs | 2 ++ tests/failpoints/cases/test_read_execution_tracker.rs | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 53747f082e4..edf88a561ba 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -5477,6 +5477,8 @@ pub trait RequestInspector { fail_point!("perform_read_index", |_| Ok(RequestPolicy::ReadIndex)); + fail_point!("perform_read_local", |_| Ok(RequestPolicy::ReadLocal)); + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { return Ok(RequestPolicy::StaleRead); diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 9c5889f876e..5efb750b863 100644 --- a/components/raftstore/src/store/worker/read.rs 
+++ b/components/raftstore/src/store/worker/read.rs @@ -416,6 +416,8 @@ impl ReadDelegate { } pub fn is_in_leader_lease(&self, ts: Timespec) -> bool { + fail_point!("perform_read_local", |_| true); + if let Some(ref lease) = self.leader_lease { let term = lease.term(); if term == self.term { diff --git a/tests/failpoints/cases/test_read_execution_tracker.rs b/tests/failpoints/cases/test_read_execution_tracker.rs index 4357d65af5f..c5ff93a70c1 100644 --- a/tests/failpoints/cases/test_read_execution_tracker.rs +++ b/tests/failpoints/cases/test_read_execution_tracker.rs @@ -59,6 +59,8 @@ fn test_read_execution_tracking() { ); }; + fail::cfg("perform_read_local", "return()").unwrap(); + // should perform lease read let resp = kv_read(&client, ctx.clone(), k1.clone(), 100); @@ -80,6 +82,8 @@ fn test_read_execution_tracking() { lease_read_checker(resp.get_exec_details_v2().get_scan_detail_v2()); + fail::remove("perform_read_local"); + let read_index_checker = |scan_detail: &ScanDetailV2| { assert!( scan_detail.get_read_index_propose_wait_nanos() > 0, From b0a80d497ccf0224c3ff01ded744e93c6d4686cb Mon Sep 17 00:00:00 2001 From: YangKeao Date: Wed, 7 Sep 2022 22:46:57 -0400 Subject: [PATCH 194/676] copr: fix wrong json opaque serialization (#13392) close tikv/tikv#13391 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- .../src/codec/mysql/json/serde.rs | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs index d15f728ed10..6c1f065f8d6 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/serde.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/serde.rs @@ -115,7 +115,11 @@ impl<'a> Serialize for JsonRef<'a> { .get_opaque_type() .map_err(|_| SerError::custom("invalid opaque type code"))?; - let str = format!("base64:type{}:{}", typ, base64::encode(bytes)); + let str = format!( + 
"base64:type{}:{}", + typ.to_u8().unwrap(), + base64::encode(bytes) + ); serializer.serialize_str(&str) } } @@ -227,6 +231,7 @@ impl<'de> Deserialize<'de> for Json { #[cfg(test)] mod tests { use super::*; + use crate::FieldTypeTp; #[test] fn test_from_str_for_object() { @@ -276,4 +281,40 @@ mod tests { resp.unwrap_err(); } } + + #[test] + fn test_to_str() { + let legal_cases = vec![ + ( + Json::from_kv_pairs(vec![( + b"key", + Json::from_str_val("value").unwrap().as_ref(), + )]) + .unwrap(), + r#"{"key": "value"}"#, + ), + ( + Json::from_array(vec![ + Json::from_str_val("d1").unwrap(), + Json::from_str_val("d2").unwrap(), + ]) + .unwrap(), + r#"["d1", "d2"]"#, + ), + (Json::from_i64(-3).unwrap(), r#"-3"#), + (Json::from_i64(3).unwrap(), r#"3"#), + (Json::from_f64(3.0).unwrap(), r#"3.0"#), + (Json::none().unwrap(), r#"null"#), + (Json::from_bool(true).unwrap(), r#"true"#), + (Json::from_bool(false).unwrap(), r#"false"#), + ( + Json::from_opaque(FieldTypeTp::VarString, &[0xAB, 0xCD]).unwrap(), + r#""base64:type253:q80=""#, + ), + ]; + + for (json, json_str) in legal_cases { + assert_eq!(json.to_string(), json_str); + } + } } From a57bb584ff85326c65ed76894d7c4c6b6a8b068e Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 7 Sep 2022 20:00:56 -0700 Subject: [PATCH 195/676] components: introduce tirocks module (#13411) ref tikv/tikv#13058 Only make it in the codebase, will not compile it. It will replace engine_rocks once it's finished. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 9 +- Cargo.toml | 12 +++ components/engine_tirocks/Cargo.toml | 9 ++ components/engine_tirocks/src/lib.rs | 12 +++ components/engine_tirocks/src/status.rs | 123 ++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 5 deletions(-) create mode 100644 components/engine_tirocks/Cargo.toml create mode 100644 components/engine_tirocks/src/lib.rs create mode 100644 components/engine_tirocks/src/status.rs diff --git a/Cargo.lock b/Cargo.lock index f9dc0e6c418..aedc4328377 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -753,9 +753,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.69" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +checksum = "2fff2a6927b3bb87f9595d67196a70493f627687a71d87a0d692242c33f58c11" dependencies = [ "jobserver", ] @@ -935,9 +935,8 @@ dependencies = [ [[package]] name = "cmake" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb6210b637171dfba4cda12e579ac6dc73f5165ad56133e5d72ef3131f320855" +version = "0.1.48" +source = "git+https://github.com/rust-lang/cmake-rs#00e6b220342a8b0ec4548071928ade38fd5f691b" dependencies = [ "cc", ] diff --git a/Cargo.toml b/Cargo.toml index e1dad6c5fa3..531449ab1b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -198,9 +198,14 @@ rusoto_mock = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr rusoto_s3 = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } rusoto_sts = { git = "https://github.com/tikv/rusoto", branch = "gh1482-s3-addr-styles" } +snappy-sys = { git = "https://github.com/busyjay/rust-snappy.git", branch = "static-link" } + # remove this when https://github.com/danburkert/fs2-rs/pull/42 is merged. fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } +# Remove this when a new version is release. 
We need to solve rust-lang/cmake-rs#143. +cmake = { git = "https://github.com/rust-lang/cmake-rs" } + [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to @@ -236,6 +241,9 @@ members = [ "components/encryption", "components/encryption/export", "components/engine_rocks_helper", +# Only enable tirocks in local development, otherwise it can slow down compilation. +# TODO: always enable tirocks and remove engine_rocks. +# "components/engine_tirocks", "components/error_code", "components/external_storage", "components/external_storage/export", @@ -292,6 +300,10 @@ opt-level = 1 debug = false opt-level = 1 +[profile.dev.package.tirocks-sys] +debug = false +opt-level = 1 + [profile.dev.package.tests] debug = 1 opt-level = 1 diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml new file mode 100644 index 00000000000..31b3122d842 --- /dev/null +++ b/components/engine_tirocks/Cargo.toml @@ -0,0 +1,9 @@ +[package] +name = "engine_tirocks" +version = "0.1.0" +edition = "2021" + +[dependencies] +engine_traits = { path = "../engine_traits" } +tikv_alloc = { path = "../tikv_alloc" } +tirocks = { git = "https://github.com/busyjay/tirocks.git", branch = "dev" } diff --git a/components/engine_tirocks/src/lib.rs b/components/engine_tirocks/src/lib.rs new file mode 100644 index 00000000000..3257eb9f0ae --- /dev/null +++ b/components/engine_tirocks/src/lib.rs @@ -0,0 +1,12 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! A new implementation of engine_traits using tirocks. +//! +//! When all features of engine_rocks are implemented in this module, +//! engine_rocks will be removed and TiKV will switch to tirocks. 
+ +extern crate tikv_alloc as _; + +mod status; + +pub use status::*; diff --git a/components/engine_tirocks/src/status.rs b/components/engine_tirocks/src/status.rs new file mode 100644 index 00000000000..13ae730562f --- /dev/null +++ b/components/engine_tirocks/src/status.rs @@ -0,0 +1,123 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub fn to_engine_trait_status(s: tirocks::Status) -> engine_traits::Status { + let code = match s.code() { + tirocks::Code::kOk => engine_traits::Code::Ok, + tirocks::Code::kNotFound => engine_traits::Code::NotFound, + tirocks::Code::kCorruption => engine_traits::Code::Corruption, + tirocks::Code::kNotSupported => engine_traits::Code::NotSupported, + tirocks::Code::kInvalidArgument => engine_traits::Code::InvalidArgument, + tirocks::Code::kIOError => engine_traits::Code::IoError, + tirocks::Code::kMergeInProgress => engine_traits::Code::MergeInProgress, + tirocks::Code::kIncomplete => engine_traits::Code::Incomplete, + tirocks::Code::kShutdownInProgress => engine_traits::Code::ShutdownInProgress, + tirocks::Code::kTimedOut => engine_traits::Code::TimedOut, + tirocks::Code::kAborted => engine_traits::Code::Aborted, + tirocks::Code::kBusy => engine_traits::Code::Busy, + tirocks::Code::kExpired => engine_traits::Code::Expired, + tirocks::Code::kTryAgain => engine_traits::Code::TryAgain, + tirocks::Code::kCompactionTooLarge => engine_traits::Code::CompactionTooLarge, + tirocks::Code::kColumnFamilyDropped => engine_traits::Code::ColumnFamilyDropped, + tirocks::Code::kMaxCode => unreachable!(), + }; + let sev = match s.severity() { + tirocks::Severity::kNoError => engine_traits::Severity::NoError, + tirocks::Severity::kSoftError => engine_traits::Severity::SoftError, + tirocks::Severity::kHardError => engine_traits::Severity::HardError, + tirocks::Severity::kFatalError => engine_traits::Severity::FatalError, + tirocks::Severity::kUnrecoverableError => engine_traits::Severity::UnrecoverableError, + 
tirocks::Severity::kMaxSeverity => unreachable!(), + }; + let sub_code = match s.sub_code() { + tirocks::SubCode::kNone => engine_traits::SubCode::None, + tirocks::SubCode::kMutexTimeout => engine_traits::SubCode::MutexTimeout, + tirocks::SubCode::kLockTimeout => engine_traits::SubCode::LockTimeout, + tirocks::SubCode::kLockLimit => engine_traits::SubCode::LockLimit, + tirocks::SubCode::kNoSpace => engine_traits::SubCode::NoSpace, + tirocks::SubCode::kDeadlock => engine_traits::SubCode::Deadlock, + tirocks::SubCode::kStaleFile => engine_traits::SubCode::StaleFile, + tirocks::SubCode::kMemoryLimit => engine_traits::SubCode::MemoryLimit, + tirocks::SubCode::kSpaceLimit => engine_traits::SubCode::SpaceLimit, + tirocks::SubCode::kPathNotFound => engine_traits::SubCode::PathNotFound, + tirocks::SubCode::KMergeOperandsInsufficientCapacity => { + engine_traits::SubCode::MergeOperandsInsufficientCapacity + } + tirocks::SubCode::kManualCompactionPaused => engine_traits::SubCode::ManualCompactionPaused, + tirocks::SubCode::kOverwritten => engine_traits::SubCode::Overwritten, + tirocks::SubCode::kTxnNotPrepared => engine_traits::SubCode::TxnNotPrepared, + tirocks::SubCode::kIOFenced => engine_traits::SubCode::IoFenced, + tirocks::SubCode::kMaxSubCode => unreachable!(), + }; + let mut es = match s.state().map(|s| String::from_utf8_lossy(s).into_owned()) { + Some(msg) => engine_traits::Status::with_error(code, msg), + None => engine_traits::Status::with_code(code), + }; + es.set_severity(sev).set_sub_code(sub_code); + es +} + +/// A function that will transform a rocksdb error to engine trait error. +/// +/// r stands for rocksdb, e stands for engine_trait. +pub fn r2e(s: tirocks::Status) -> engine_traits::Error { + engine_traits::Error::Engine(to_engine_trait_status(s)) +} + +/// A function that will transform a engine trait error to rocksdb error. +/// +/// r stands for rocksdb, e stands for engine_trait. 
+pub fn e2r(s: engine_traits::Error) -> tirocks::Status { + let s = match s { + engine_traits::Error::Engine(s) => s, + // Any better options than IOError? + _ => return tirocks::Status::with_error(tirocks::Code::kIOError, format!("{}", s)), + }; + let code = match s.code() { + engine_traits::Code::Ok => tirocks::Code::kOk, + engine_traits::Code::NotFound => tirocks::Code::kNotFound, + engine_traits::Code::Corruption => tirocks::Code::kCorruption, + engine_traits::Code::NotSupported => tirocks::Code::kNotSupported, + engine_traits::Code::InvalidArgument => tirocks::Code::kInvalidArgument, + engine_traits::Code::IoError => tirocks::Code::kIOError, + engine_traits::Code::MergeInProgress => tirocks::Code::kMergeInProgress, + engine_traits::Code::Incomplete => tirocks::Code::kIncomplete, + engine_traits::Code::ShutdownInProgress => tirocks::Code::kShutdownInProgress, + engine_traits::Code::TimedOut => tirocks::Code::kTimedOut, + engine_traits::Code::Aborted => tirocks::Code::kAborted, + engine_traits::Code::Busy => tirocks::Code::kBusy, + engine_traits::Code::Expired => tirocks::Code::kExpired, + engine_traits::Code::TryAgain => tirocks::Code::kTryAgain, + engine_traits::Code::CompactionTooLarge => tirocks::Code::kCompactionTooLarge, + engine_traits::Code::ColumnFamilyDropped => tirocks::Code::kColumnFamilyDropped, + }; + let sev = match s.severity() { + engine_traits::Severity::NoError => tirocks::Severity::kNoError, + engine_traits::Severity::SoftError => tirocks::Severity::kSoftError, + engine_traits::Severity::HardError => tirocks::Severity::kHardError, + engine_traits::Severity::FatalError => tirocks::Severity::kFatalError, + engine_traits::Severity::UnrecoverableError => tirocks::Severity::kUnrecoverableError, + }; + let sub_code = match s.sub_code() { + engine_traits::SubCode::None => tirocks::SubCode::kNone, + engine_traits::SubCode::MutexTimeout => tirocks::SubCode::kMutexTimeout, + engine_traits::SubCode::LockTimeout => tirocks::SubCode::kLockTimeout, + 
engine_traits::SubCode::LockLimit => tirocks::SubCode::kLockLimit, + engine_traits::SubCode::NoSpace => tirocks::SubCode::kNoSpace, + engine_traits::SubCode::Deadlock => tirocks::SubCode::kDeadlock, + engine_traits::SubCode::StaleFile => tirocks::SubCode::kStaleFile, + engine_traits::SubCode::MemoryLimit => tirocks::SubCode::kMemoryLimit, + engine_traits::SubCode::SpaceLimit => tirocks::SubCode::kSpaceLimit, + engine_traits::SubCode::PathNotFound => tirocks::SubCode::kPathNotFound, + engine_traits::SubCode::MergeOperandsInsufficientCapacity => { + tirocks::SubCode::KMergeOperandsInsufficientCapacity + } + engine_traits::SubCode::ManualCompactionPaused => tirocks::SubCode::kManualCompactionPaused, + engine_traits::SubCode::Overwritten => tirocks::SubCode::kOverwritten, + engine_traits::SubCode::TxnNotPrepared => tirocks::SubCode::kTxnNotPrepared, + engine_traits::SubCode::IoFenced => tirocks::SubCode::kIOFenced, + }; + let mut ts = tirocks::Status::with_error(code, s.state()); + ts.set_severity(sev); + ts.set_sub_code(sub_code); + ts +} From 76f4a4e7ca6b15a1cc8e65c54049b816c2a1c45d Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 7 Sep 2022 21:06:56 -0700 Subject: [PATCH 196/676] raftstore-v2: support peer create and destroy (#13334) ref tikv/tikv#7475, ref tikv/tikv#12842 Compared to v1, there are few differences: - peer create is forced to go through store fsm, - destroy is fully asynchronous, - there is no wait for log gc as all writes go to raft io worker now. - uninitialized peer is always persisted, so problems like #7475 will not exist. - ranges are allowed to be conflict, it simplifies code a lot. This PR also adds a debug message to verify memory states easily. We still need to make leader to trace and gc removed peers. This will be implemented in next PR. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/batch-system/src/router.rs | 16 + components/engine_rocks/src/raft_engine.rs | 2 + components/engine_traits/src/engine.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 115 +++---- components/raftstore-v2/src/fsm/mod.rs | 2 +- components/raftstore-v2/src/fsm/peer.rs | 14 +- components/raftstore-v2/src/fsm/store.rs | 71 ++++- components/raftstore-v2/src/operation/life.rs | 284 ++++++++++++++++++ components/raftstore-v2/src/operation/mod.rs | 2 + .../src/operation/ready/async_writer.rs | 4 + .../raftstore-v2/src/operation/ready/mod.rs | 93 +++++- components/raftstore-v2/src/raft/peer.rs | 53 ++-- components/raftstore-v2/src/raft/storage.rs | 85 +++++- components/raftstore-v2/src/router/message.rs | 19 +- .../raftstore-v2/tests/integrations/mod.rs | 180 ++++++----- .../tests/integrations/test_life.rs | 194 ++++++++++++ .../raftstore/src/store/async_io/write.rs | 55 ++-- 17 files changed, 993 insertions(+), 200 deletions(-) create mode 100644 components/raftstore-v2/src/operation/life.rs create mode 100644 components/raftstore-v2/tests/integrations/test_life.rs diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 8b0936a9faa..660ab014939 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -174,6 +174,22 @@ where .store(normals.map.len(), Ordering::Relaxed); } + /// Same as send a message and then register the mailbox. + /// + /// The mailbox will not be registered if the message can't be sent. 
+ pub fn send_and_register( + &self, + addr: u64, + mailbox: BasicMailbox, + msg: N::Message, + ) -> Result<(), (BasicMailbox, N::Message)> { + if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { + return Err((mailbox, m)); + } + self.register(addr, mailbox); + Ok(()) + } + pub fn register_all(&self, mailboxes: Vec<(u64, BasicMailbox)>) { let mut normals = self.normals.lock().unwrap(); normals.map.reserve(mailboxes.len()); diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 605ef4c5514..9e70f7158a7 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -252,6 +252,8 @@ impl RaftEngine for RocksEngine { batch: &mut Self::LogBatch, ) -> Result<()> { batch.delete(&keys::raft_state_key(raft_group_id))?; + batch.delete(&keys::region_state_key(raft_group_id))?; + batch.delete(&keys::apply_state_key(raft_group_id))?; if first_index == 0 { let seek_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index e59d9104e56..e3e767f0ed2 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -219,7 +219,7 @@ impl OpenOptions { /// A factory trait to create new engine. // It should be named as `EngineFactory` for consistency, but we are about to // rename engine to tablet, so always use tablet for new traits/types. -pub trait TabletFactory: TabletAccessor { +pub trait TabletFactory: TabletAccessor + Send + Sync { /// Open the tablet with id and suffix according to the OpenOptions. 
/// /// The id is likely the region Id, the suffix could be the current raft log @@ -287,7 +287,7 @@ where impl TabletFactory for DummyFactory where - EK: CfOptionsExt + Clone + Send + 'static, + EK: CfOptionsExt + Clone + Send + Sync + 'static, { fn create_shared_db(&self) -> Result { Ok(self.engine.as_ref().unwrap().clone()) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index d30490f50d5..1a2d9b3750e 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -11,10 +11,13 @@ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, }; use collections::HashMap; -use crossbeam::channel::Sender; +use crossbeam::channel::{Sender, TrySendError}; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::{metapb::Store, raft_serverpb::PeerState}; +use kvproto::{ + metapb::Store, + raft_serverpb::{PeerState, RaftMessage}, +}; use raft::INVALID_ID; use raftstore::store::{ fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, RaftlogFetchRunner, @@ -28,12 +31,13 @@ use tikv_util::{ time::Instant as TiInstant, timer::SteadyTimer, worker::{Scheduler, Worker}, + Either, }; use super::apply::{create_apply_batch_system, ApplyPollerBuilder, ApplyRouter, ApplySystem}; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate}, - raft::Peer, + raft::{Peer, Storage}, router::{PeerMsg, PeerTick, StoreMsg}, Error, Result, }; @@ -55,28 +59,9 @@ pub struct StoreContext { /// The precise timer for scheduling tick. 
pub timer: SteadyTimer, pub write_senders: WriteSenders, -} - -impl StoreContext { - fn new( - cfg: Config, - trans: T, - router: StoreRouter, - write_senders: WriteSenders, - logger: Logger, - ) -> Self { - Self { - logger, - trans, - has_ready: false, - raft_metrics: RaftMetrics::new(cfg.waterfall_metrics), - cfg, - router, - tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], - timer: SteadyTimer::default(), - write_senders, - } - } + pub engine: ER, + pub tablet_factory: Arc>, + pub log_fetch_scheduler: Scheduler, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. @@ -260,19 +245,17 @@ impl StorePollerBuilder { self.engine .for_each_raft_group::(&mut |region_id| { assert_ne!(region_id, INVALID_ID); - let peer = match Peer::new( - &cfg, + let storage = match Storage::new( region_id, self.store_id, - self.tablet_factory.as_ref(), self.engine.clone(), self.log_fetch_scheduler.clone(), &self.logger, )? { - Some(peer) => peer, + Some(p) => p, None => return Ok(()), }; - let pair = PeerFsm::new(&cfg, peer)?; + let pair = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; let prev = regions.insert(region_id, pair); if let Some((_, p)) = prev { return Err(box_err!( @@ -303,13 +286,21 @@ where type Handler = StorePoller; fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { - let poll_ctx = StoreContext::new( - self.cfg.value().clone(), - self.trans.clone(), - self.router.clone(), - self.write_senders.clone(), - self.logger.clone(), - ); + let cfg = self.cfg.value().clone(); + let poll_ctx = StoreContext { + logger: self.logger.clone(), + trans: self.trans.clone(), + has_ready: false, + raft_metrics: RaftMetrics::new(cfg.waterfall_metrics), + cfg, + router: self.router.clone(), + tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], + timer: SteadyTimer::default(), + write_senders: self.write_senders.clone(), + engine: self.engine.clone(), + tablet_factory: self.tablet_factory.clone(), + 
log_fetch_scheduler: self.log_fetch_scheduler.clone(), + }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) } @@ -344,7 +335,7 @@ pub struct StoreSystem { impl StoreSystem { pub fn start( &mut self, - store: Store, + store_id: u64, cfg: Arc>, raft_engine: ER, tablet_factory: Arc>, @@ -355,14 +346,9 @@ impl StoreSystem { T: Transport + 'static, { let mut workers = Workers::default(); - workers.store_writers.spawn( - store.get_id(), - raft_engine.clone(), - None, - router, - &trans, - &cfg, - )?; + workers + .store_writers + .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; let log_fetch_scheduler = workers.log_fetch_worker.start( "raftlog-fetch-worker", RaftlogFetchRunner::new(router.clone(), raft_engine.clone()), @@ -370,7 +356,7 @@ impl StoreSystem { let mut builder = StorePollerBuilder::new( cfg.clone(), - store.get_id(), + store_id, raft_engine, tablet_factory, trans, @@ -385,7 +371,7 @@ impl StoreSystem { .schedule_all(peers.values().map(|pair| pair.1.peer())); // Choose a different name so we know what version is actually used. rs stands // for raft store. 
- let tag = format!("rs-{}", store.get_id()); + let tag = format!("rs-{}", store_id); self.system.spawn(tag, builder); let mut mailboxes = Vec::with_capacity(peers.len()); @@ -403,7 +389,7 @@ impl StoreSystem { for addr in address { router.force_send(addr, PeerMsg::Start).unwrap(); } - router.send_control(StoreMsg::Start { store }).unwrap(); + router.send_control(StoreMsg::Start).unwrap(); let apply_poller_builder = ApplyPollerBuilder::new(cfg); self.apply_system @@ -436,6 +422,33 @@ impl StoreRouter { pub fn logger(&self) -> &Logger { &self.logger } + + pub fn send_raft_message( + &self, + msg: Box, + ) -> std::result::Result<(), TrySendError>> { + let id = msg.get_region_id(); + let peer_msg = PeerMsg::RaftMessage(msg); + let store_msg = match self.try_send(id, peer_msg) { + Either::Left(Ok(())) => return Ok(()), + Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { + return Err(TrySendError::Full(m)); + } + Either::Left(Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m)))) => { + return Err(TrySendError::Disconnected(m)); + } + Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), + _ => unreachable!(), + }; + match self.send_control(store_msg) { + Ok(()) => Ok(()), + Err(TrySendError::Full(StoreMsg::RaftMessage(m))) => Err(TrySendError::Full(m)), + Err(TrySendError::Disconnected(StoreMsg::RaftMessage(m))) => { + Err(TrySendError::Disconnected(m)) + } + _ => unreachable!(), + } + } } impl Deref for StoreRouter { @@ -457,14 +470,14 @@ impl DerefMut for StoreRouter { /// Creates the batch system for polling raft activities. 
pub fn create_store_batch_system( cfg: &Config, - store: Store, + store_id: u64, logger: Logger, ) -> (StoreRouter, StoreSystem) where EK: KvEngine, ER: RaftEngine, { - let (store_tx, store_fsm) = StoreFsm::new(cfg, store); + let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); let (router, system) = batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); let (apply_router, apply_system) = create_apply_batch_system(cfg); diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 8126c8a868a..191f629900a 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -11,4 +11,4 @@ mod store; pub use apply::{ApplyFsm, ApplyFsmDelegate}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; -pub use store::{StoreFsm, StoreFsmDelegate, StoreMeta}; +pub use store::{Store, StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 886d8b2323a..5e3c2674fe5 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -6,7 +6,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, TabletFactory}; use kvproto::metapb; use raftstore::store::{Config, Transport}; use slog::{debug, error, info, trace, Logger}; @@ -18,7 +18,7 @@ use tikv_util::{ use crate::{ batch::StoreContext, - raft::Peer, + raft::{Peer, Storage}, router::{PeerMsg, PeerTick}, Result, }; @@ -36,7 +36,12 @@ pub struct PeerFsm { } impl PeerFsm { - pub fn new(cfg: &Config, peer: Peer) -> Result> { + pub fn new( + cfg: &Config, + tablet_factory: &dyn TabletFactory, + storage: Storage, + ) -> Result> { + let peer = Peer::new(cfg, tablet_factory, storage)?; info!(peer.logger, "create peer"); let (tx, rx) = 
mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { @@ -200,7 +205,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(_) => unimplemented!(), + PeerMsg::RaftMessage(msg) => self.fsm.peer.on_raft_message(self.store_ctx, msg), PeerMsg::RaftQuery(cmd) => { self.on_receive_command(cmd.send_time); self.on_query(cmd.request, cmd.ch) @@ -224,6 +229,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::FetchedLogs(fetched_logs) => { self.fsm.peer_mut().on_fetched_logs(fetched_logs) } + PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), } } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 61a3f76b138..0f607e5a1de 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,14 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::time::SystemTime; + use batch_system::Fsm; use collections::HashMap; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::metapb::Store; use raftstore::store::{Config, ReadDelegate}; +use slog::{o, Logger}; use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; -use crate::{batch::StoreContext, router::StoreMsg, tablet::CachedTablet}; +use crate::{ + batch::StoreContext, + router::{StoreMsg, StoreTick}, + tablet::CachedTablet, +}; pub struct StoreMeta where @@ -34,16 +40,49 @@ where } } +pub struct Store { + id: u64, + // Unix time when it's started. 
+ start_time: Option, + logger: Logger, +} + +impl Store { + pub fn new(id: u64, logger: Logger) -> Store { + Store { + id, + start_time: None, + logger: logger.new(o!("store_id" => id)), + } + } + + pub fn store_id(&self) -> u64 { + self.id + } + + pub fn start_time(&self) -> Option { + self.start_time + } + + pub fn logger(&self) -> &Logger { + &self.logger + } +} + pub struct StoreFsm { store: Store, receiver: Receiver, } impl StoreFsm { - pub fn new(cfg: &Config, store: Store) -> (LooseBoundedSender, Box) { + pub fn new( + cfg: &Config, + store_id: u64, + logger: Logger, + ) -> (LooseBoundedSender, Box) { let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(StoreFsm { - store, + store: Store::new(store_id, logger), receiver: rx, }); (tx, fsm) @@ -84,9 +123,29 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { Self { fsm, store_ctx } } - pub fn handle_msgs(&self, store_msg_buf: &mut Vec) { + fn on_start(&mut self) { + if self.fsm.store.start_time.is_some() { + panic!("{:?} unable to start again", self.fsm.store.logger.list(),); + } + + self.fsm.store.start_time = Some( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_secs()), + ); + } + + fn on_tick(&mut self, tick: StoreTick) { + unimplemented!() + } + + pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) { for msg in store_msg_buf.drain(..) { - // TODO: handle the messages. + match msg { + StoreMsg::Start => self.on_start(), + StoreMsg::Tick(tick) => self.on_tick(tick), + StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), + } } } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs new file mode 100644 index 00000000000..59e9057b846 --- /dev/null +++ b/components/raftstore-v2/src/operation/life.rs @@ -0,0 +1,284 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! 
This module implements the creation and destruction of peer. +//! +//! A peer can only be created by either: +//! - bootstrapping a cluster, it's coverred in crate::bootstrap; +//! - receiving a RaftMessage. +//! +//! In v1, it can also be created by split. In v2, it's required to create by +//! sending a message to store fsm first, and then using split to initialized +//! the peer. + +use std::cmp; + +use batch_system::BasicMailbox; +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb::Region, + raft_serverpb::{PeerState, RaftMessage}, +}; +use raftstore::store::{util, ExtraStates, WriteTask}; +use slog::{debug, error, info}; + +use crate::{ + batch::StoreContext, + fsm::{PeerFsm, Store, StoreFsmDelegate}, + raft::{Peer, Storage}, + router::PeerMsg, +}; + +/// When a peer is about to destroy, it becomes `WaitReady` first. If there is +/// no pending asynchronous apply, it becomes `Destroying` and then start +/// destroying asynchronously during handling ready. After the asynchronously +/// destroying is finished, it becomes `Destroyed`. +pub enum DestroyProgress { + /// Alive means destroy is not triggered at all. It's the same as None for + /// `Option`. Not using Option to avoid unwrap everywhere. + None, + /// If the destroy is triggered by message, then the message will be used + /// for creating new peer immediately. 
+ WaitReady(Option>), + Destroying(Option>), + Destroyed, +} + +impl DestroyProgress { + #[inline] + pub fn started(&self) -> bool { + matches!( + self, + DestroyProgress::Destroying(_) | DestroyProgress::Destroyed + ) + } + + #[inline] + pub fn waiting(&self) -> bool { + matches!(self, DestroyProgress::WaitReady(_)) + } + + #[inline] + fn start(&mut self) { + match self { + DestroyProgress::WaitReady(msg) => *self = DestroyProgress::Destroying(msg.take()), + _ => panic!("must wait ready first to start destroying"), + } + } + + #[inline] + fn wait_with(&mut self, triggered_msg: Option>) { + match self { + DestroyProgress::None => *self = DestroyProgress::WaitReady(triggered_msg), + _ => panic!("must be alive to wait"), + } + } + + #[inline] + fn finish(&mut self) -> Option> { + match self { + DestroyProgress::Destroying(msg) => { + let msg = msg.take(); + *self = DestroyProgress::Destroyed; + msg + } + _ => panic!("must be destroying to finish"), + } + } +} + +impl Store { + /// When a message's recipient doesn't exist, it will be redirected to + /// store. Store is responsible for checking if it's neccessary to create + /// a peer to handle the message. + #[inline] + pub fn on_raft_message( + &mut self, + ctx: &mut StoreContext, + msg: Box, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let region_id = msg.get_region_id(); + // The message can be sent when the peer is being created, so try send it first. + let msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = + ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) + { + m + } else { + return; + }; + let msg_type = msg.get_message().get_msg_type(); + let from_peer = msg.get_from_peer(); + let to_peer = msg.get_to_peer(); + // Now the peer should not exist. 
+ debug!( + self.logger(), + "handle raft message"; + "from_peer_id" => from_peer.id, + "to_peer_id" => to_peer.id, + "region_id" => region_id, + "msg_type" => %util::MsgType(&msg) + ); + if to_peer.store_id != self.store_id() { + ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); + return; + } + if !msg.has_region_epoch() { + ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); + return; + } + // TODO: maybe we need to ack the message to confirm the peer is destroyed. + if msg.get_is_tombstone() || msg.has_merge_target() { + // Target tombstone peer doesn't exist, so ignore it. + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + let from_epoch = msg.get_region_epoch(); + let local_state = match ctx.engine.get_region_state(region_id) { + Ok(s) => s, + Err(e) => { + error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); + return; + } + }; + if let Some(local_state) = local_state { + // Split will not create peer in v2, so the state must be Tombstone. + if local_state.get_state() != PeerState::Tombstone { + panic!( + "[region {}] {} peer doesn't exist but has valid local state {:?}", + region_id, to_peer.id, local_state + ); + } + // Compared to v1, we rely on leader to confirm destroy actively, so here + // skip handling gc for simplicity. + let local_epoch = local_state.get_region().get_region_epoch(); + // The region in this peer is already destroyed + if util::is_epoch_stale(from_epoch, local_epoch) { + ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + return; + } + if let Some(local_peer) = util::find_peer(local_state.get_region(), self.store_id()) { + if to_peer.id <= local_peer.get_id() { + ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + return; + } + } + } + + // So the peer must need to be created. We don't need to synchronous with split + // as split won't create peer in v2. 
And we don't check for range + // conflict as v2 depends on tablet, which allows conflict ranges. + let mut region = Region::default(); + region.set_id(region_id); + region.set_region_epoch(from_epoch.clone()); + // Peer list doesn't have to be complete, as it's uninitialized. + region.mut_peers().push(from_peer.clone()); + region.mut_peers().push(to_peer.clone()); + // We don't set the region range here as we allow range conflict. + let (tx, fsm) = match Storage::uninit( + self.store_id(), + region, + ctx.engine.clone(), + ctx.log_fetch_scheduler.clone(), + &ctx.logger, + ) + .and_then(|s| PeerFsm::new(&ctx.cfg, &*ctx.tablet_factory, s)) + { + Ok(p) => p, + res => { + error!(self.logger(), "failed to create peer"; "region_id" => region_id, "peer_id" => to_peer.id, "err" => ?res.err()); + return; + } + }; + let mailbox = BasicMailbox::new(tx, fsm, ctx.router.state_cnt().clone()); + if let Err((p, _)) = ctx + .router + .send_and_register(region_id, mailbox, PeerMsg::Start) + { + panic!( + "[region {}] {} failed to register peer", + region_id, to_peer.id + ); + } + // Only forward valid message. Split may use a message without sender to trigger + // creating a peer. + if from_peer.id != raft::INVALID_ID { + // For now the peer only exists in memory. It will persist its states when + // handling its first readiness. + let _ = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)); + } + } +} + +impl Peer { + /// A peer can be destroyed in three cases: + /// 1. Received a gc message; + /// 2. Received a message whose target peer's ID is larger than this; + /// 3. Applied a conf remove self command. + /// In all cases, the peer will be destroyed asynchronousely in next + /// handle_raft_ready. + /// `triggered_msg` will be sent to store fsm after destroy is finished. + /// Should set the message only when the target peer is supposed to be + /// created afterward. 
+ pub fn mark_for_destroy(&mut self, triggered_msg: Option>) { + if self.serving() { + self.destroy_progress_mut().wait_with(triggered_msg); + self.set_has_ready(); + } + } + + /// In v2, it's possible to destroy the peer without waiting for apply. But + /// we better wait till all previous entries are applied in case there + /// are split. It's a waste to use snapshot to restore newly split + /// tablet. + #[inline] + pub fn postpond_destroy(&self) -> bool { + let entry_storage = self.storage().entry_storage(); + // TODO: check actual split index instead of commit index. + entry_storage.applied_index() != entry_storage.commit_index() + } + + /// Start the destroy progress. It will write `Tombstone` state + /// asynchronously. + /// + /// After destroy is finished, `finish_destroy` should be called to clean up + /// memory states. + pub fn start_destroy(&mut self, write_task: &mut WriteTask) { + let entry_storage = self.storage().entry_storage(); + if self.postpond_destroy() { + return; + } + let first_index = entry_storage.first_index(); + let last_index = entry_storage.last_index(); + if first_index <= last_index { + write_task.cut_logs = match write_task.cut_logs { + None => Some((first_index, last_index)), + Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), + }; + } + let mut extra_states = ExtraStates::new(entry_storage.apply_state().clone()); + let mut region_state = self.storage().region_state().clone(); + // Write worker will do the clean up when meeting tombstone state. + region_state.set_state(PeerState::Tombstone); + extra_states.set_region_state(region_state); + extra_states.set_raft_state(entry_storage.raft_state().clone()); + write_task.extra_write.set_v2(extra_states); + self.destroy_progress_mut().start(); + } + + /// Do clean up for destroy. The peer is permanently destroyed when + /// Tombstone state is persisted. This method is only for cleaning up + /// memory states. 
+ pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { + info!(self.logger, "peer destroyed"); + ctx.router.close(self.region_id()); + if let Some(msg) = self.destroy_progress_mut().finish() { + // The message will be dispatched to store fsm, which will create a + // new peer. Ignore error as it's just a best effort. + let _ = ctx.router.send_raft_message(msg); + } + // TODO: close apply mailbox. + } +} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index c352ffe0cc1..b840194b7e0 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -1,6 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +mod life; mod query; mod ready; +pub use life::DestroyProgress; pub use ready::AsyncWriter; diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index e0b2a1c4802..3db4426ebf7 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -157,6 +157,10 @@ impl AsyncWriter { pub fn persisted_number(&self) -> u64 { self.persisted_number } + + pub fn all_ready_persisted(&self) -> bool { + self.unpersisted_readies.is_empty() + } } impl WriteRouterContext for StoreContext diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 1be4b0ee546..aab6cc5d4c5 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -19,18 +19,21 @@ mod async_writer; +use std::cmp; + use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::raft_serverpb::RaftMessage; use protobuf::Message as _; use raft::{eraftpb, Ready}; -use raftstore::store::{FetchedLogs, Transport, WriteTask}; +use raftstore::store::{util, ExtraStates, FetchedLogs, Transport, 
WriteTask}; use slog::{debug, error, trace, warn}; pub use self::async_writer::AsyncWriter; use crate::{ batch::StoreContext, fsm::{PeerFsm, PeerFsmDelegate}, + operation::DestroyProgress, raft::{Peer, Storage}, router::PeerTick, }; @@ -52,6 +55,66 @@ impl Peer { self.raft_group_mut().tick() } + pub fn on_raft_message( + &mut self, + ctx: &mut StoreContext, + mut msg: Box, + ) { + debug!( + self.logger, + "handle raft message"; + "message_type" => %util::MsgType(&msg), + "from_peer_id" => msg.get_from_peer().get_id(), + "to_peer_id" => msg.get_to_peer().get_id(), + ); + if !self.serving() { + return; + } + if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { + ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); + return; + } + if !msg.has_region_epoch() { + ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); + return; + } + if msg.get_is_tombstone() { + self.mark_for_destroy(None); + return; + } + if msg.has_merge_target() { + unimplemented!(); + return; + } + // We don't handle stale message like v1, as we rely on leader to actively + // cleanup stale peers. + let to_peer = msg.get_to_peer(); + // Check if the message is sent to the right peer. + match to_peer.get_id().cmp(&self.peer_id()) { + cmp::Ordering::Equal => (), + cmp::Ordering::Less => { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + cmp::Ordering::Greater => { + // We need to create the target peer. + self.mark_for_destroy(Some(msg)); + return; + } + } + if msg.has_extra_msg() { + unimplemented!(); + return; + } + // TODO: drop all msg append when the peer is uninitialized and has conflict + // ranges with other peers. + self.insert_peer_cache(msg.take_from_peer()); + if let Err(e) = self.raft_group_mut().step(msg.take_message()) { + error!(self.logger, "raft step error"; "err" => ?e); + } + self.set_has_ready(); + } + /// Callback for fetching logs asynchronously. 
pub fn on_fetched_logs(&mut self, fetched_logs: FetchedLogs) { let FetchedLogs { context, logs } = fetched_logs; @@ -176,15 +239,17 @@ impl Peer { #[inline] pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { let has_ready = self.reset_has_ready(); - if !has_ready { + if !has_ready || self.destroy_progress().started() { return; } ctx.has_ready = true; - if !self.raft_group().has_ready() { + if !self.raft_group().has_ready() && (self.serving() || self.postpond_destroy()) { return; } + // Note even the group has no ready, we can still get an empty ready. + debug!(self.logger, "handle raft ready"); let mut ready = self.raft_group_mut().ready(); @@ -218,6 +283,9 @@ impl Peer { .flat_map(|m| self.build_raft_message(ctx, m)) .collect(); } + if !self.serving() { + self.start_destroy(&mut write_task); + } // Ready number should increase monotonically. assert!(self.async_writer.known_largest_number() < ready.number()); if let Some(task) = self.async_writer.write(ctx, write_task) { @@ -273,8 +341,14 @@ impl Peer { self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); - // We may need to check if there is persisted committed logs. - self.set_has_ready(); + if !self.destroy_progress().started() { + // We may need to check if there is persisted committed logs. + self.set_has_ready(); + } else if self.async_writer.all_ready_persisted() { + // Destroy ready is the last ready. All readies are persisted means destroy + // is persisted. 
+ self.finish_destroy(ctx); + } } } @@ -287,6 +361,7 @@ impl Storage { write_task: &mut WriteTask, ) { let prev_raft_state = self.entry_storage().raft_state().clone(); + let ever_persisted = self.ever_persisted(); // TODO: handle snapshot @@ -297,8 +372,14 @@ impl Storage { if let Some(hs) = ready.hs() { entry_storage.raft_state_mut().set_hard_state(hs.clone()); } - if prev_raft_state != *entry_storage.raft_state() { + if !ever_persisted || prev_raft_state != *entry_storage.raft_state() { write_task.raft_state = Some(entry_storage.raft_state().clone()); } + if !ever_persisted { + let mut extra_states = ExtraStates::new(self.apply_state().clone()); + extra_states.set_region_state(self.region_state().clone()); + write_task.extra_write.set_v2(extra_states); + self.set_ever_persisted(); + } } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 6fd7b4b444c..a84dd36f224 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -11,7 +11,7 @@ use tikv_util::{box_err, config::ReadableSize, worker::Scheduler}; use super::storage::Storage; use crate::{ - operation::AsyncWriter, + operation::{AsyncWriter, DestroyProgress}, tablet::{self, CachedTablet}, Result, }; @@ -25,6 +25,7 @@ pub struct Peer { /// messages with unknown peers after recovery. peer_cache: Vec, pub(crate) async_writer: AsyncWriter, + destroy_progress: DestroyProgress, has_ready: bool, pub(crate) logger: Logger, } @@ -35,21 +36,13 @@ impl Peer { /// If peer is destroyed, `None` is returned. pub fn new( cfg: &Config, - region_id: u64, - store_id: u64, tablet_factory: &dyn TabletFactory, - engine: ER, - scheduler: Scheduler, - logger: &Logger, - ) -> Result> { - let s = match Storage::new(region_id, store_id, engine, scheduler, logger)? 
{ - Some(s) => s, - None => return Ok(None), - }; - let logger = s.logger().clone(); + storage: Storage, + ) -> Result { + let logger = storage.logger().clone(); - let applied_index = s.apply_state().get_applied_index(); - let peer_id = s.peer().get_id(); + let applied_index = storage.apply_state().get_applied_index(); + let peer_id = storage.peer().get_id(); let raft_cfg = raft::Config { id: peer_id, @@ -67,7 +60,8 @@ impl Peer { ..Default::default() }; - let tablet_index = s.region_state().get_tablet_index(); + let region_id = storage.region().get_id(); + let tablet_index = storage.region_state().get_tablet_index(); // Another option is always create tablet even if tablet index is 0. But this // can introduce race when gc old tablet and create new peer. let tablet = if tablet_index != 0 { @@ -89,21 +83,25 @@ impl Peer { }; let mut peer = Peer { - raft_group: RawNode::new(&raft_cfg, s, &logger)?, + raft_group: RawNode::new(&raft_cfg, storage, &logger)?, tablet: CachedTablet::new(tablet), - has_ready: false, + peer_cache: vec![], async_writer: AsyncWriter::new(region_id, peer_id), + has_ready: false, + destroy_progress: DestroyProgress::None, logger, - peer_cache: vec![], }; // If this region has only one peer and I am the one, campaign directly. 
let region = peer.region(); - if region.get_peers().len() == 1 && region.get_peers()[0].get_store_id() == store_id { + if region.get_peers().len() == 1 + && region.get_peers()[0] == *peer.peer() + && tablet_index != 0 + { peer.raft_group.campaign()?; } - Ok(Some(peer)) + Ok(peer) } #[inline] @@ -241,4 +239,19 @@ impl Peer { pub fn term(&self) -> u64 { self.raft_group.raft.term } + + #[inline] + pub fn serving(&self) -> bool { + matches!(self.destroy_progress, DestroyProgress::None) + } + + #[inline] + pub fn destroy_progress(&self) -> &DestroyProgress { + &self.destroy_progress + } + + #[inline] + pub fn destroy_progress_mut(&mut self) -> &mut DestroyProgress { + &mut self.destroy_progress + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 4f625b751ac..fe0a9b5913e 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -54,6 +54,10 @@ pub struct Storage { entry_storage: EntryStorage, peer: metapb::Peer, region_state: RegionLocalState, + /// Whether states has been persisted before. If a peer is just created by + /// by messages, it has not persisted any states, we need to persist them + /// at least once dispite whether the state changes since create. + ever_persisted: bool, logger: Logger, } @@ -101,6 +105,30 @@ impl Storage { } impl Storage { + /// Creates a new storage with uninit states. + /// + /// This should only be used for creating new peer from raft message. + pub fn uninit( + store_id: u64, + region: Region, + engine: ER, + log_fetch_scheduler: Scheduler, + logger: &Logger, + ) -> Result { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region); + Self::create( + store_id, + region_state, + RaftLocalState::default(), + RaftApplyState::default(), + engine, + log_fetch_scheduler, + false, + logger, + ) + } + /// Creates a new storage. /// /// All metadata should be initialized before calling this method. 
If the @@ -112,7 +140,7 @@ impl Storage { log_fetch_scheduler: Scheduler, logger: &Logger, ) -> Result>> { - let region_state: RegionLocalState = match engine.get_region_state(region_id) { + let region_state = match engine.get_region_state(region_id) { Ok(Some(s)) => s, res => { return Err(box_err!( @@ -127,16 +155,6 @@ impl Storage { return Ok(None); } - let peer = find_peer(region_state.get_region(), store_id); - let peer = match peer { - Some(p) if p.get_id() != INVALID_ID => p, - _ => { - return Err(box_err!("no valid peer found in {:?}", region_state)); - } - }; - - let logger = logger.new(o!("region_id" => region_id, "peer_id" => peer.get_id())); - let raft_state = match engine.get_raft_state(region_id) { Ok(Some(s)) => s, res => { @@ -151,8 +169,38 @@ impl Storage { } }; - let region = region_state.get_region(); + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + log_fetch_scheduler, + true, + logger, + ) + .map(Some) + } + fn create( + store_id: u64, + region_state: RegionLocalState, + raft_state: RaftLocalState, + apply_state: RaftApplyState, + engine: ER, + log_fetch_scheduler: Scheduler, + persisted: bool, + logger: &Logger, + ) -> Result { + let peer = find_peer(region_state.get_region(), store_id); + let peer = match peer { + Some(p) if p.get_id() != INVALID_ID => p, + _ => { + return Err(box_err!("no valid peer found in {:?}", region_state)); + } + }; + let region = region_state.get_region(); + let logger = logger.new(o!("region_id" => region.id, "peer_id" => peer.get_id())); let entry_storage = EntryStorage::new( peer.get_id(), engine, @@ -162,12 +210,13 @@ impl Storage { log_fetch_scheduler, )?; - Ok(Some(Storage { + Ok(Storage { entry_storage, peer: peer.clone(), region_state, + ever_persisted: persisted, logger, - })) + }) } #[inline] @@ -184,6 +233,14 @@ impl Storage { pub fn is_initialized(&self) -> bool { self.region_state.get_tablet_index() != 0 } + + pub fn ever_persisted(&self) -> bool { + self.ever_persisted 
+ } + + pub fn set_ever_persisted(&mut self) { + self.ever_persisted = true; + } } impl raft::Storage for Storage { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 72e6149d7ad..7be1be95554 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -8,14 +8,15 @@ use kvproto::{ cdcpb::Event, metapb, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, }; -use raftstore::store::{ - metrics::RaftEventDurationType, FetchedLogs, InspectedRaftMessage, RegionSnapshot, -}; +use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, RegionSnapshot}; use tikv_util::time::Instant; use super::{ - response_channel::{CmdResChannel, CmdResSubscriber, QueryResChannel, QueryResSubscriber}, + response_channel::{ + CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, QueryResSubscriber, + }, ApplyRes, }; @@ -116,7 +117,7 @@ pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target /// peer doesn't exist. - RaftMessage(InspectedRaftMessage), + RaftMessage(Box), /// Query won't change any state. A typical query is KV read. In most cases, /// it will be processed using lease or read index. 
RaftQuery(RaftRequest), @@ -138,6 +139,7 @@ pub enum PeerMsg { peer_id: u64, ready_number: u64, }, + QueryDebugInfo(DebugInfoChannel), } impl PeerMsg { @@ -175,14 +177,15 @@ impl fmt::Debug for PeerMsg { peer_id, ready_number ), PeerMsg::FetchedLogs(fetched) => write!(fmt, "FetchedLogs {:?}", fetched), + PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), } } } pub enum StoreMsg { - RaftMessage(InspectedRaftMessage), + RaftMessage(Box), Tick(StoreTick), - Start { store: metapb::Store }, + Start, } impl fmt::Debug for StoreMsg { @@ -190,7 +193,7 @@ impl fmt::Debug for StoreMsg { match *self { StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), - StoreMsg::Start { ref store } => write!(fmt, "Start store {:?}", store), + StoreMsg::Start => write!(fmt, "Start store"), } } } diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index d922020cbcb..5582921ce4d 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -10,11 +10,13 @@ use std::{ ops::{Deref, DerefMut}, + path::Path, sync::{ atomic::{AtomicUsize, Ordering}, Arc, }, - time::Duration, + thread, + time::{Duration, Instant}, }; use crossbeam::channel::{self, Receiver, Sender}; @@ -31,10 +33,10 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use pd_client::RpcClient; -use raftstore::store::{Config, Transport, RAFT_INIT_LOG_INDEX}; +use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; use raftstore_v2::{ create_store_batch_system, - router::{PeerMsg, QueryResult}, + router::{DebugInfoChannel, PeerMsg, QueryResult}, Bootstrap, StoreRouter, StoreSystem, }; use slog::{o, Logger}; @@ -42,6 +44,7 @@ use tempfile::TempDir; use test_pd::mocker::Service; use tikv_util::config::{ReadableDuration, VersionTrack}; +mod test_life; mod test_status; struct TestRouter(StoreRouter); @@ -67,6 
+70,20 @@ impl TestRouter { block_on(sub.result()) } + fn must_query_debug_info(&self, region_id: u64, timeout: Duration) -> Option { + let timer = Instant::now(); + while timer.elapsed() < timeout { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + if self.send(region_id, msg).is_err() { + thread::sleep(Duration::from_millis(10)); + continue; + } + return block_on(sub.result()); + } + None + } + fn command(&self, region_id: u64, req: RaftCmdRequest) -> Option { let (msg, sub) = PeerMsg::raft_command(req); self.send(region_id, msg).unwrap(); @@ -74,113 +91,138 @@ impl TestRouter { } } -struct TestNode { - _pd_server: test_pd::Server, - _pd_client: RpcClient, - _path: TempDir, - store: Store, - raft_engine: Option, - factory: Option>, - system: Option>, - cfg: Option>>, - logger: Logger, +struct RunningState { + raft_engine: RaftTestEngine, + factory: Arc, + system: StoreSystem, + cfg: Arc>, + transport: TestTransport, } -impl TestNode { - fn new() -> TestNode { - let logger = slog_global::borrow_global().new(o!()); - let pd_server = test_pd::Server::new(1); - let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); - let path = TempDir::new().unwrap(); - +impl RunningState { + fn new( + pd_client: &RpcClient, + path: &Path, + cfg: Arc>, + transport: TestTransport, + logger: &Logger, + ) -> (TestRouter, Self) { let cf_opts = ALL_CFS .iter() .copied() .map(|cf| (cf, CfOptions::default())) .collect(); let factory = Arc::new(TestTabletFactoryV2::new( - path.path(), + path, DbOptions::default(), cf_opts, )); let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); - let mut bootstrap = Bootstrap::new(&raft_engine, 0, &pd_client, logger.clone()); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client, logger.clone()); let store_id = 
bootstrap.bootstrap_store().unwrap(); let mut store = Store::default(); store.set_id(store_id); - let region = bootstrap - .bootstrap_first_region(&store, store_id) - .unwrap() - .unwrap(); - if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { + if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { + if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { + factory + .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) + .unwrap(); + } factory - .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) + .open_tablet( + region.get_id(), + Some(RAFT_INIT_LOG_INDEX), + OpenOptions::default().set_create_new(true), + ) .unwrap(); } - factory - .open_tablet( - region.get_id(), - Some(RAFT_INIT_LOG_INDEX), - OpenOptions::default().set_create_new(true), - ) - .unwrap(); - TestNode { - _pd_server: pd_server, - _pd_client: pd_client, - _path: path, - store, - raft_engine: Some(raft_engine), - factory: Some(factory), - system: None, - cfg: None, - logger, - } - } - - fn start( - &mut self, - cfg: Arc>, - trans: impl Transport + 'static, - ) -> TestRouter { let (router, mut system) = create_store_batch_system::( &cfg.value(), - self.store.clone(), - self.logger.clone(), + store_id, + logger.clone(), ); system .start( - self.store.clone(), + store_id, cfg.clone(), - self.raft_engine.clone().unwrap(), - self.factory.clone().unwrap(), - trans, + raft_engine.clone(), + factory.clone(), + transport.clone(), &router, ) .unwrap(); - self.cfg = Some(cfg); - self.system = Some(system); - TestRouter(router) + + let state = Self { + raft_engine, + factory, + system, + cfg, + transport, + }; + (TestRouter(router), state) + } +} + +impl Drop for RunningState { + fn drop(&mut self) { + self.system.shutdown(); + } +} + +struct TestNode { + _pd_server: test_pd::Server, + pd_client: RpcClient, + path: TempDir, + running_state: Option, + logger: Logger, +} + +impl TestNode { + fn new() -> TestNode { + let logger = slog_global::borrow_global().new(o!()); + let 
pd_server = test_pd::Server::new(1); + let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); + let path = TempDir::new().unwrap(); + + TestNode { + _pd_server: pd_server, + pd_client, + path, + running_state: None, + logger, + } + } + + fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { + let (router, state) = + RunningState::new(&self.pd_client, self.path.path(), cfg, trans, &self.logger); + self.running_state = Some(state); + router } fn config(&self) -> &Arc> { - self.cfg.as_ref().unwrap() + &self.running_state.as_ref().unwrap().cfg } fn stop(&mut self) { - if let Some(mut system) = self.system.take() { - system.shutdown(); - } + self.running_state.take(); + } + + fn restart(&mut self) -> TestRouter { + let state = self.running_state.as_ref().unwrap(); + let prev_transport = state.transport.clone(); + let cfg = state.cfg.clone(); + self.stop(); + self.start(cfg, prev_transport) } } impl Drop for TestNode { fn drop(&mut self) { self.stop(); - self.raft_engine.take(); - self.factory.take(); } } diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs new file mode 100644 index 00000000000..c03c7fe10c4 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -0,0 +1,194 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + assert_matches::assert_matches, + thread, + time::{Duration, Instant}, +}; + +use crossbeam::channel::TrySendError; +use engine_traits::{RaftEngine, RaftEngineReadOnly}; +use futures::executor::block_on; +use kvproto::{ + metapb, + raft_cmdpb::{RaftCmdRequest, StatusCmdType}, + raft_serverpb::{PeerState, RaftMessage}, +}; +use raftstore::store::util::new_peer; +use raftstore_v2::router::{DebugInfoChannel, PeerMsg}; + +use crate::TestRouter; + +fn assert_peer_not_exist(region_id: u64, peer_id: u64, router: &TestRouter) { + let timer = Instant::now(); + loop { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + match router.send(region_id, msg) { + Err(TrySendError::Disconnected(_)) => return, + Ok(()) => { + if let Some(m) = block_on(sub.result()) { + if m.raft_status.id != peer_id { + return; + } + } + } + Err(_) => (), + } + if timer.elapsed() < Duration::from_secs(3) { + thread::sleep(Duration::from_millis(10)); + } else { + panic!("peer of {} still exists", region_id); + } + } +} + +// TODO: make raft engine support more suitable way to verify range is empty. +/// Verify all states in raft engine are cleared. +fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb::Peer) { + let mut buf = vec![]; + raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); + assert!(buf.is_empty(), "{:?}", buf); + assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); + assert_matches!(raft_engine.get_apply_state(region_id), Ok(None)); + let region_state = raft_engine.get_region_state(region_id).unwrap().unwrap(); + assert_matches!(region_state.get_state(), PeerState::Tombstone); + assert!( + region_state.get_region().get_peers().contains(peer), + "{:?}", + region_state + ); +} + +/// Test a peer can be created by general raft message and destroyed tombstone +/// message. 
+#[test] +fn test_life_by_message() { + let (mut node, _transport, router) = super::setup_default_cluster(); + let test_region_id = 4; + let test_peer_id = 5; + let test_leader_id = 6; + assert_peer_not_exist(test_region_id, test_peer_id, &router); + + // Build a correct message. + let mut msg = Box::new(RaftMessage::default()); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, test_leader_id)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_from(6); + raft_message.set_term(5); + + let assert_wrong = |f: &dyn Fn(&mut RaftMessage)| { + let mut wrong_msg = msg.clone(); + f(&mut wrong_msg); + router.send_raft_message(wrong_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, &router); + }; + + // Check mismatch store id. + assert_wrong(&|msg| msg.mut_to_peer().set_store_id(4)); + + // Check missing region epoch. + assert_wrong(&|msg| { + msg.take_region_epoch(); + }); + + // Check tombstone. + assert_wrong(&|msg| msg.set_is_tombstone(true)); + + // Correct message will create a peer, but the peer will not be initialized. + router.send_raft_message(msg.clone()).unwrap(); + let timeout = Duration::from_secs(3); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.region_state.id, test_region_id); + assert_eq!(meta.raft_status.id, test_peer_id); + assert_eq!(meta.region_state.tablet_index, 0); + // But leader should be set. + assert_eq!(meta.raft_status.soft_state.leader_id, test_leader_id); + + // The peer should survive restart. 
+ let router = node.restart(); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + let raft_engine = &node.running_state.as_ref().unwrap().raft_engine; + raft_engine.get_raft_state(test_region_id).unwrap().unwrap(); + raft_engine + .get_apply_state(test_region_id) + .unwrap() + .unwrap(); + + // The peer should be destroyed by tombstone message. + let mut tombstone_msg = msg.clone(); + tombstone_msg.set_is_tombstone(true); + router.send_raft_message(tombstone_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); + + // Restart should not recreate tombstoned peer. + let router = node.restart(); + assert_peer_not_exist(test_region_id, test_peer_id, &router); + let raft_engine = &node.running_state.as_ref().unwrap().raft_engine; + assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); +} + +#[test] +fn test_destroy_by_larger_id() { + let (mut node, _transport, router) = super::setup_default_cluster(); + let test_region_id = 4; + let test_peer_id = 6; + let init_term = 5; + let mut msg = Box::new(RaftMessage::default()); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, 8)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_from(6); + raft_message.set_term(init_term); + // Create the peer. + router.send_raft_message(msg.clone()).unwrap(); + + let timeout = Duration::from_secs(3); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + + // Smaller ID should be ignored. 
+ let mut smaller_id_msg = msg; + smaller_id_msg.set_to_peer(new_peer(1, test_peer_id - 1)); + smaller_id_msg.mut_message().set_term(init_term + 1); + router.send_raft_message(smaller_id_msg.clone()).unwrap(); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id); + assert_eq!(meta.raft_status.hard_state.term, init_term); + + // Larger ID should trigger destroy. + let mut larger_id_msg = smaller_id_msg; + larger_id_msg.set_to_peer(new_peer(1, test_peer_id + 1)); + router.send_raft_message(larger_id_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, &router); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id + 1); + assert_eq!(meta.raft_status.hard_state.term, init_term + 1); + + // New peer should survive restart. + let router = node.restart(); + let meta = router + .must_query_debug_info(test_region_id, timeout) + .unwrap(); + assert_eq!(meta.raft_status.id, test_peer_id + 1); + assert_eq!(meta.raft_status.hard_state.term, init_term + 1); +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index ea796117e2c..e534a17fad1 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -20,7 +20,9 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use kvproto::raft_serverpb::{RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState}; +use kvproto::raft_serverpb::{ + PeerState, RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState, +}; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{ @@ -37,6 +39,7 @@ use super::write_router::WriteSenders; use crate::{ store::{ config::Config, + entry_storage::first_index, fsm::RaftRouter, local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, @@ -218,18 
+221,6 @@ where self.ready_number )); } - if let Some(last_index) = self.entries.last().map(|e| e.get_index()) { - if let Some((from, _)) = self.cut_logs { - if from != last_index + 1 { - // Entries are put and deleted in the same writebatch. - return Err(box_err!( - "invalid cut logs, last_index {}, cut_logs {:?}", - last_index, - self.cut_logs - )); - } - } - } Ok(()) } @@ -272,6 +263,8 @@ where pub struct ExtraStates { apply_state: RaftApplyState, region_state: Option, + // Set only want to destroy the raft group in write worker. + raft_state: Option, } impl ExtraStates { @@ -280,6 +273,7 @@ impl ExtraStates { Self { apply_state, region_state: None, + raft_state: None, } } @@ -287,6 +281,11 @@ impl ExtraStates { pub fn set_region_state(&mut self, region_state: RegionLocalState) { self.region_state = Some(region_state); } + + #[inline] + pub fn set_raft_state(&mut self, raft_state: RaftLocalState) { + self.raft_state = Some(raft_state); + } } pub enum ExtraBatchWrite { @@ -331,6 +330,9 @@ impl ExtraBatchWrite { if let Some(region_state) = extra_states.region_state { slot.get_mut().region_state = Some(region_state); } + if let Some(raft_state) = extra_states.raft_state { + slot.get_mut().raft_state = Some(raft_state); + } } collections::HashMapEntry::Vacant(slot) => { slot.insert(extra_states); @@ -452,21 +454,35 @@ where self.state_size + self.raft_wb.persist_size() } - fn before_write_to_db(&mut self, metrics: &StoreWriteMetrics) { + fn before_write_to_db(&mut self, engine: &ER, metrics: &StoreWriteMetrics) { // Put raft state to raft writebatch for (region_id, state) in self.raft_states.drain() { self.raft_wb.put_raft_state(region_id, &state).unwrap(); } if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { for (region_id, state) in extra_states_map.drain() { - self.raft_wb - .put_apply_state(region_id, &state.apply_state) - .unwrap(); + let mut tombstone = false; if let Some(region_state) = state.region_state { + if 
region_state.get_state() == PeerState::Tombstone { + tombstone = true; + engine + .clean( + region_id, + first_index(&state.apply_state), + state.raft_state.as_ref().unwrap(), + &mut self.raft_wb, + ) + .unwrap(); + } self.raft_wb .put_region_state(region_id, ®ion_state) .unwrap(); } + if !tombstone { + self.raft_wb + .put_apply_state(region_id, &state.apply_state) + .unwrap(); + } } } self.state_size = 0; @@ -656,7 +672,8 @@ where let timer = Instant::now(); - self.batch.before_write_to_db(&self.metrics); + self.batch + .before_write_to_db(&self.raft_engine, &self.metrics); fail_point!("raft_before_save"); @@ -915,7 +932,7 @@ pub fn write_to_db_for_test( { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(task); - batch.before_write_to_db(&StoreWriteMetrics::new(false)); + batch.before_write_to_db(&engines.raft, &StoreWriteMetrics::new(false)); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); From 0e7bc82824dfbde4d5b01b6848a5e5366512d56c Mon Sep 17 00:00:00 2001 From: ekexium Date: Thu, 8 Sep 2022 16:16:57 +0800 Subject: [PATCH 197/676] txn: distinguish different types of write conflicts (#13424) close tikv/tikv#13423 Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- Cargo.toml | 2 +- components/txn_types/src/lib.rs | 8 +++- components/txn_types/src/lock.rs | 3 +- src/server/lock_manager/waiter_manager.rs | 4 +- src/storage/errors.rs | 7 +++- src/storage/mvcc/mod.rs | 11 ++++-- src/storage/mvcc/reader/point_getter.rs | 3 +- src/storage/mvcc/reader/scanner/backward.rs | 3 +- src/storage/mvcc/reader/scanner/forward.rs | 3 +- src/storage/mvcc/txn.rs | 3 ++ .../txn/actions/acquire_pessimistic_lock.rs | 2 + src/storage/txn/actions/prewrite.rs | 37 ++++++++++++++++--- 13 files changed, 69 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aedc4328377..8a0356d6611 100644 --- a/Cargo.lock 
+++ b/Cargo.lock @@ -2629,7 +2629,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#2e37953b2b435961ad5b4f0e36b32c53f4777b23" +source = "git+https://github.com/pingcap/kvproto.git#7c004f4daf21e0677b0ceca50a723377a3968022" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/Cargo.toml b/Cargo.toml index 531449ab1b1..1b622f0d61b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -211,7 +211,7 @@ procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229 # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. -# [patch.'https://github.com/pingcap/kvproto'] +[patch.'https://github.com/pingcap/kvproto'] # kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } [workspace] diff --git a/components/txn_types/src/lib.rs b/components/txn_types/src/lib.rs index 2f018c23923..edd89256d2b 100644 --- a/components/txn_types/src/lib.rs +++ b/components/txn_types/src/lib.rs @@ -14,6 +14,7 @@ mod write; use std::io; use error_code::{self, ErrorCode, ErrorCodeExt}; +use kvproto::kvrpcpb; pub use lock::{Lock, LockType, PessimisticLock}; use thiserror::Error; pub use timestamp::{TimeStamp, TsSet, TSO_PHYSICAL_SHIFT_BITS}; @@ -36,9 +37,9 @@ pub enum ErrorInner { #[error("key is locked (backoff or cleanup) {0:?}")] KeyIsLocked(kvproto::kvrpcpb::LockInfo), #[error( - "write conflict, start_ts: {}, conflict_start_ts: {}, conflict_commit_ts: {}, key: {}, primary: {}", + "write conflict, start_ts: {}, conflict_start_ts: {}, conflict_commit_ts: {}, key: {}, primary: {}, reason: {:?}", .start_ts, .conflict_start_ts, .conflict_commit_ts, - log_wrappers::Value::key(.key), log_wrappers::Value::key(.primary) + log_wrappers::Value::key(.key), log_wrappers::Value::key(.primary), .reason )] 
WriteConflict { start_ts: TimeStamp, @@ -46,6 +47,7 @@ pub enum ErrorInner { conflict_commit_ts: TimeStamp, key: Vec, primary: Vec, + reason: kvrpcpb::WriteConflictReason, }, } @@ -63,12 +65,14 @@ impl ErrorInner { conflict_commit_ts, key, primary, + reason, } => Some(ErrorInner::WriteConflict { start_ts: *start_ts, conflict_start_ts: *conflict_start_ts, conflict_commit_ts: *conflict_commit_ts, key: key.to_owned(), primary: primary.to_owned(), + reason: reason.to_owned(), }), } } diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 4c784e31318..96c96828bcb 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, mem::size_of}; use byteorder::ReadBytesExt; -use kvproto::kvrpcpb::{IsolationLevel, LockInfo, Op}; +use kvproto::kvrpcpb::{IsolationLevel, LockInfo, Op, WriteConflictReason}; use tikv_util::codec::{ bytes::{self, BytesEncoder}, number::{self, NumberEncoder, MAX_VAR_I64_LEN, MAX_VAR_U64_LEN}, @@ -395,6 +395,7 @@ impl Lock { conflict_commit_ts: Default::default(), key: key.to_raw()?, primary: lock.primary.to_vec(), + reason: WriteConflictReason::RcCheckTs, })) } diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index b0e05091267..2ba2b583de9 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -18,7 +18,7 @@ use futures::{ future::Future, task::{Context, Poll}, }; -use kvproto::deadlock::WaitForEntry; +use kvproto::{deadlock::WaitForEntry, kvrpcpb::WriteConflictReason}; use tikv_util::{ config::ReadableDuration, time::{duration_to_sec, InstantExt}, @@ -247,6 +247,7 @@ impl Waiter { conflict_commit_ts: commit_ts, key, primary, + reason: WriteConflictReason::PessimisticRetry, }); self.pr = ProcessResult::Failed { err: StorageError::from(TxnError::from(mvcc_err)), @@ -822,6 +823,7 @@ pub mod tests { conflict_commit_ts, key, primary, + .. 
}), ))))) => { assert_eq!(start_ts, waiter_ts); diff --git a/src/storage/errors.rs b/src/storage/errors.rs index dae61653f07..faf12f34003 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -314,7 +314,7 @@ pub fn extract_key_error(err: &Error) -> kvrpcpb::KeyError { conflict_commit_ts, key, primary, - .. + reason, }, ))))) => { let mut write_conflict = kvrpcpb::WriteConflict::default(); @@ -323,6 +323,7 @@ pub fn extract_key_error(err: &Error) -> kvrpcpb::KeyError { write_conflict.set_conflict_commit_ts(conflict_commit_ts.into_inner()); write_conflict.set_key(key.to_owned()); write_conflict.set_primary(primary.to_owned()); + write_conflict.set_reason(reason.to_owned()); key_error.set_conflict(write_conflict); // for compatibility with older versions. key_error.set_retryable(format!("{:?}", err)); @@ -457,6 +458,8 @@ pub fn extract_key_errors(res: Result>>) -> Vec, primary: Vec, + reason: kvrpcpb::WriteConflictReason, }, #[error( @@ -203,12 +204,14 @@ impl ErrorInner { conflict_commit_ts, key, primary, + reason, } => Some(ErrorInner::WriteConflict { start_ts: *start_ts, conflict_start_ts: *conflict_start_ts, conflict_commit_ts: *conflict_commit_ts, key: key.to_owned(), primary: primary.to_owned(), + reason: reason.to_owned(), }), ErrorInner::Deadlock { start_ts, @@ -348,12 +351,14 @@ impl From for ErrorInner { conflict_commit_ts, key, primary, + reason, }) => ErrorInner::WriteConflict { start_ts, conflict_start_ts, conflict_commit_ts, key, primary, + reason, }, } } diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 2a231b42823..2758460a526 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use kvproto::kvrpcpb::IsolationLevel; +use kvproto::kvrpcpb::{IsolationLevel, WriteConflictReason}; use txn_types::{Key, Lock, LockType, TimeStamp, TsSet, Value, WriteRef, 
WriteType}; use crate::storage::{ @@ -254,6 +254,7 @@ impl PointGetter { conflict_commit_ts: key_commit_ts, key: cursor_key.into(), primary: vec![], + reason: WriteConflictReason::RcCheckTs, } .into()); } diff --git a/src/storage/mvcc/reader/scanner/backward.rs b/src/storage/mvcc/reader/scanner/backward.rs index 6ade614e848..11ed487cd56 100644 --- a/src/storage/mvcc/reader/scanner/backward.rs +++ b/src/storage/mvcc/reader/scanner/backward.rs @@ -4,7 +4,7 @@ use std::{borrow::Cow, cmp::Ordering}; use engine_traits::CF_DEFAULT; -use kvproto::kvrpcpb::IsolationLevel; +use kvproto::kvrpcpb::{IsolationLevel, WriteConflictReason}; use txn_types::{Key, Lock, TimeStamp, Value, Write, WriteRef, WriteType}; use super::ScannerConfig; @@ -274,6 +274,7 @@ impl BackwardKvScanner { conflict_commit_ts: last_checked_commit_ts, key: current_key.into(), primary: vec![], + reason: WriteConflictReason::RcCheckTs, } .into()); } diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 6bed0289053..aee185e307f 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -4,7 +4,7 @@ use std::{borrow::Cow, cmp::Ordering}; use engine_traits::CF_DEFAULT; -use kvproto::kvrpcpb::{ExtraOp, IsolationLevel}; +use kvproto::kvrpcpb::{ExtraOp, IsolationLevel, WriteConflictReason}; use txn_types::{Key, Lock, LockType, OldValue, TimeStamp, Value, WriteRef, WriteType}; use super::ScannerConfig; @@ -350,6 +350,7 @@ impl> ForwardScanner { conflict_commit_ts: key_commit_ts, key: current_key.into(), primary: vec![], + reason: WriteConflictReason::RcCheckTs, } .into()); } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index c02d8ef97c8..b456b359b8f 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -205,6 +205,8 @@ pub(crate) fn make_txn_error( key: &Key, start_ts: TimeStamp, ) -> crate::storage::mvcc::ErrorInner { + use kvproto::kvrpcpb::WriteConflictReason; + use 
crate::storage::mvcc::ErrorInner; if let Some(s) = s { match s.to_ascii_lowercase().as_str() { @@ -244,6 +246,7 @@ pub(crate) fn make_txn_error( conflict_commit_ts: TimeStamp::zero(), key: key.to_raw().unwrap(), primary: vec![], + reason: WriteConflictReason::Optimistic, }, "deadlock" => ErrorInner::Deadlock { start_ts, diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 7e30dcdd37c..699002f0126 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -1,5 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +use kvproto::kvrpcpb::WriteConflictReason; // #[PerformanceCriticalPath] use txn_types::{Key, LockType, OldValue, PessimisticLock, TimeStamp, Value, Write, WriteType}; @@ -173,6 +174,7 @@ pub fn acquire_pessimistic_lock( conflict_commit_ts: commit_ts, key: key.into_raw()?, primary: primary.to_vec(), + reason: WriteConflictReason::PessimisticRetry, } .into()); } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 7b562af8b43..85c1a6f8ccc 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -5,8 +5,9 @@ use std::cmp; use fail::fail_point; use kvproto::kvrpcpb::{ - Assertion, AssertionLevel, + self, Assertion, AssertionLevel, PrewriteRequestPessimisticAction::{self, *}, + WriteConflictReason, }; use txn_types::{ is_short_value, Key, Mutation, MutationType, OldValue, TimeStamp, Value, Write, WriteType, @@ -361,7 +362,11 @@ impl<'a> PrewriteMutation<'a> { TransactionKind::Optimistic(_) => { if commit_ts > self.txn_props.start_ts { MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); - self.write_conflict_error(&write, commit_ts)?; + self.write_conflict_error( + &write, + commit_ts, + WriteConflictReason::Optimistic, + )?; } } // Note: PessimisticLockNotFound can happen on a non-pessimistically locked key, @@ -370,7 +375,11 
@@ impl<'a> PrewriteMutation<'a> { if let DoConstraintCheck = self.pessimistic_action { if commit_ts > self.txn_props.start_ts { MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); - self.write_conflict_error(&write, commit_ts)?; + self.write_conflict_error( + &write, + commit_ts, + WriteConflictReason::LazyUniquenessCheck, + )?; } } else if commit_ts > for_update_ts { warn!("conflicting write was found, pessimistic lock must be lost for the corresponding row key"; @@ -395,7 +404,11 @@ impl<'a> PrewriteMutation<'a> { { MVCC_CONFLICT_COUNTER.rolled_back.inc(); // TODO: Maybe we need to add a new error for the rolled back case. - self.write_conflict_error(&write, commit_ts)?; + self.write_conflict_error( + &write, + commit_ts, + WriteConflictReason::SelfRolledBack, + )?; } // Should check it when no lock exists, otherwise it can report error when there // is a lock belonging to a committed transaction which deletes the key. @@ -465,13 +478,19 @@ impl<'a> PrewriteMutation<'a> { final_min_commit_ts } - fn write_conflict_error(&self, write: &Write, commit_ts: TimeStamp) -> Result<()> { + fn write_conflict_error( + &self, + write: &Write, + commit_ts: TimeStamp, + reason: kvrpcpb::WriteConflictReason, + ) -> Result<()> { Err(ErrorInner::WriteConflict { start_ts: self.txn_props.start_ts, conflict_start_ts: write.start_ts, conflict_commit_ts: commit_ts, key: self.key.to_raw()?, primary: self.txn_props.primary.to_vec(), + reason, } .into()) } @@ -2075,7 +2094,13 @@ pub mod tests { must_pessimistic_prewrite_insert(&engine, key2, value, key, 3, 3, SkipPessimisticCheck); let err = must_pessimistic_prewrite_insert_err(&engine, key, value, key, 3, 3, DoConstraintCheck); - assert!(matches!(err, Error(box ErrorInner::WriteConflict { .. }))); + assert!(matches!( + err, + Error(box ErrorInner::WriteConflict { + reason: WriteConflictReason::LazyUniquenessCheck, + .. + }) + )); // 2. 
unique constraint fail must_prewrite_put(&engine, key, value, key, 11); From aebdada5f5473295e13dec25da9eade03774ac37 Mon Sep 17 00:00:00 2001 From: 5kbpers Date: Thu, 8 Sep 2022 17:48:55 +0800 Subject: [PATCH 198/676] raft_engine: set recover_state (#13272) ref tikv/tikv#12901 Add `put_recover_from_raft_db` and `recover_from_raft_db` to raft engine for checking if store should be recovered from states in raftdb. Signed-off-by: 5kbpers --- components/engine_panic/src/raft_engine.rs | 12 +++++++++++- components/engine_rocks/src/raft_engine.rs | 12 +++++++++++- components/engine_traits/src/raft_engine.rs | 11 ++++++++++- components/keys/src/lib.rs | 1 + components/raft_log_engine/src/engine.rs | 21 ++++++++++++++++++++- 5 files changed, 53 insertions(+), 4 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index bb501007a76..75e0e68269d 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -3,7 +3,9 @@ use engine_traits::{Error, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, Result}; use kvproto::{ metapb::Region, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; use raft::eraftpb::Entry; @@ -52,6 +54,10 @@ impl RaftEngineReadOnly for PanicEngine { fn get_apply_state(&self, raft_group_id: u64) -> Result> { panic!() } + + fn get_recover_state(&self) -> Result> { + panic!() + } } impl RaftEngineDebug for PanicEngine { @@ -149,6 +155,10 @@ impl RaftEngine for PanicEngine { { panic!() } + + fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { + panic!() + } } impl RaftLogBatch for PanicWriteBatch { diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 9e70f7158a7..b66a56caadf 100644 --- a/components/engine_rocks/src/raft_engine.rs 
+++ b/components/engine_rocks/src/raft_engine.rs @@ -8,7 +8,9 @@ use engine_traits::{ }; use kvproto::{ metapb::Region, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; use protobuf::Message; use raft::eraftpb::Entry; @@ -151,6 +153,10 @@ impl RaftEngineReadOnly for RocksEngine { let key = keys::apply_state_key(raft_group_id); self.get_msg_cf(CF_DEFAULT, &key) } + + fn get_recover_state(&self) -> Result> { + self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) + } } impl RaftEngineDebug for RocksEngine { @@ -364,6 +370,10 @@ impl RaftEngine for RocksEngine { Some(e) => Err(e), } } + + fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) + } } impl RaftLogBatch for RocksWriteBatchVec { diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index e64bbe18018..b7a3f50699c 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -2,7 +2,9 @@ use kvproto::{ metapb::Region, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; use raft::eraftpb::Entry; @@ -19,6 +21,7 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { fn get_raft_state(&self, raft_group_id: u64) -> Result>; fn get_region_state(&self, raft_group_id: u64) -> Result>; fn get_apply_state(&self, raft_group_id: u64) -> Result>; + fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -144,6 +147,12 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send where F: FnMut(u64) -> std::result::Result<(), E>, E: From; + + /// Indicate whether region states should be recovered from raftdb and + /// 
replay raft logs. + /// When kvdb's write-ahead-log is disabled, the sequence number of the last + /// boot time is saved. + fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()>; } pub trait RaftLogBatch: Send { diff --git a/components/keys/src/lib.rs b/components/keys/src/lib.rs index f62ffc6f8ab..304e13f1e66 100644 --- a/components/keys/src/lib.rs +++ b/components/keys/src/lib.rs @@ -33,6 +33,7 @@ pub const DATA_MAX_KEY: &[u8] = &[DATA_PREFIX + 1]; // Following keys are all local keys, so the first byte must be 0x01. pub const STORE_IDENT_KEY: &[u8] = &[LOCAL_PREFIX, 0x01]; pub const PREPARE_BOOTSTRAP_KEY: &[u8] = &[LOCAL_PREFIX, 0x02]; +pub const RECOVER_STATE_KEY: &[u8] = &[LOCAL_PREFIX, 0x03]; // We save two types region data in DB, for raft and other meta data. // When the store starts, we should iterate all region meta data to // construct peer, no need to travel large raft data, so we separate them diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 2cd27d89538..07c7bb47bca 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -16,7 +16,9 @@ use engine_traits::{ use file_system::{IoOp, IoRateLimiter, IoType}; use kvproto::{ metapb::Region, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, }; use raft::eraftpb::Entry; use raft_engine::{ @@ -344,6 +346,7 @@ const STORE_IDENT_KEY: &[u8] = &[0x01]; const PREPARE_BOOTSTRAP_REGION_KEY: &[u8] = &[0x02]; const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; +const RECOVER_STATE_KEY: &[u8] = &[0x05]; impl RaftLogBatchTrait for RaftLogBatch { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { @@ -472,6 +475,12 @@ impl RaftEngineReadOnly for RaftLogEngine { .get_message(raft_group_id, APPLY_STATE_KEY) 
.map_err(transfer_error) } + + fn get_recover_state(&self) -> Result> { + self.0 + .get_message(STORE_STATE_ID, RECOVER_STATE_KEY) + .map_err(transfer_error) + } } impl RaftEngineDebug for RaftLogEngine { @@ -621,6 +630,16 @@ impl RaftEngine for RaftLogEngine { } Ok(()) } + + fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { + let mut batch = Self::LogBatch::default(); + batch + .0 + .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) + .map_err(transfer_error)?; + self.0.write(&mut batch.0, true).map_err(transfer_error)?; + Ok(()) + } } fn transfer_error(e: RaftEngineError) -> engine_traits::Error { From 079a06914256a0cbc4e6f3f85fd51f38267c06c7 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 8 Sep 2022 20:28:55 +0800 Subject: [PATCH 199/676] raftstore: Implement engine trait can_apply_snapshot (#12924) ref tikv/tikv#12849 Support new engine trait can_apply_snapshot Signed-off-by: CalvinNeo --- components/engine_rocks/src/engine.rs | 4 + components/engine_rocks/src/lib.rs | 2 +- components/engine_traits/src/engine.rs | 8 ++ .../raftstore/src/coprocessor/dispatcher.rs | 13 +- components/raftstore/src/store/config.rs | 17 +++ components/raftstore/src/store/fsm/store.rs | 4 +- .../raftstore/src/store/peer_storage.rs | 18 ++- components/raftstore/src/store/snap.rs | 4 + components/raftstore/src/store/worker/mod.rs | 2 + .../raftstore/src/store/worker/region.rs | 128 ++++++++++++------ components/tikv_util/src/timer.rs | 7 +- tests/integrations/config/mod.rs | 2 + 12 files changed, 143 insertions(+), 66 deletions(-) diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 13ae38b6afb..9e3bba56bad 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -66,6 +66,10 @@ impl RocksEngine { self.shared_block_cache = enable; } + pub fn shared_block_cache(&self) -> bool { + self.shared_block_cache + } + pub fn support_multi_batch_write(&self) -> bool { 
self.support_multi_batch_write } diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index b0e7012bad7..774fe9cb37b 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -68,7 +68,7 @@ mod perf_context_metrics; mod engine_iterator; pub use crate::engine_iterator::*; -mod options; +pub mod options; pub mod util; mod compact_listener; diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index e3e767f0ed2..5ad9a13b86f 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -64,6 +64,14 @@ pub trait KvEngine: /// This only exists as a temporary hack during refactoring. /// It cannot be used forever. fn bad_downcast(&self) -> &T; + + /// Returns false if KvEngine can't apply snapshot for this region now. + /// Some KvEngines need to do some transforms before apply data from + /// snapshot. These procedures can be batched in background if there are + /// more than one incoming snapshots, thus not blocking applying thread. 
+ fn can_apply_snapshot(&self, _is_timeout: bool, _new_batch: bool, _region_id: u64) -> bool { + true + } } /// TabletAccessor is the trait to access all the tablets with provided accessor diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index d2c4e14567a..ed348950050 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -540,14 +540,11 @@ impl CoprocessorHost { snap_key: &crate::store::SnapKey, snap: Option<&crate::store::Snapshot>, ) { - loop_ob!( - region, - &self.registry.apply_snapshot_observers, - post_apply_snapshot, - peer_id, - snap_key, - snap, - ); + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.apply_snapshot_observers { + let observer = observer.observer.inner(); + observer.post_apply_snapshot(&mut ctx, peer_id, snap_key, snap); + } } pub fn new_split_checker_host<'a>( diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 34805e4c9ca..8052a58dea8 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -137,6 +137,17 @@ pub struct Config { #[online_config(skip)] pub snap_apply_batch_size: ReadableSize, + // used to periodically check whether schedule pending applies in region runner + #[doc(hidden)] + #[online_config(skip)] + pub region_worker_tick_interval: ReadableDuration, + + // used to periodically check whether we should delete a stale peer's range in + // region runner + #[doc(hidden)] + #[online_config(skip)] + pub clean_stale_ranges_tick: usize, + // Interval (ms) to check region whether the data is consistent. 
pub consistency_check_interval: ReadableDuration, @@ -335,6 +346,12 @@ impl Default for Config { peer_stale_state_check_interval: ReadableDuration::minutes(5), leader_transfer_max_log_lag: 128, snap_apply_batch_size: ReadableSize::mb(10), + region_worker_tick_interval: if cfg!(feature = "test") { + ReadableDuration::millis(200) + } else { + ReadableDuration::millis(1000) + }, + clean_stale_ranges_tick: if cfg!(feature = "test") { 1 } else { 10 }, lock_cf_compact_interval: ReadableDuration::minutes(10), lock_cf_compact_bytes_threshold: ReadableSize::mb(256), // Disable consistency check by default as it will hurt performance. diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 4ee3c5dc091..930062f2e0c 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1496,9 +1496,7 @@ impl RaftBatchSystem { let region_runner = RegionRunner::new( engines.kv.clone(), mgr.clone(), - cfg.value().snap_apply_batch_size.0 as usize, - cfg.value().use_delete_range, - cfg.value().snap_generator_pool_size, + cfg.clone(), workers.coprocessor_host.clone(), self.router(), Some(Arc::clone(&pd_client)), diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index c99b7644321..7f4b6778860 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1142,7 +1142,8 @@ pub mod tests { fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, worker::{ - FetchedLogs, LogFetchedNotifier, RaftlogFetchRunner, RegionRunner, RegionTask, + make_region_worker_raftstore_cfg, FetchedLogs, LogFetchedNotifier, + RaftlogFetchRunner, RegionRunner, RegionTask, }, }, }; @@ -1553,12 +1554,11 @@ pub mod tests { let (dummy_scheduler, _) = dummy_scheduler(); let mut s = new_storage_from_ents(sched.clone(), dummy_scheduler, &td, &ents); let (router, _) = mpsc::sync_channel(100); + let 
cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, @@ -1701,12 +1701,11 @@ pub mod tests { let store = new_store(1, labels); pd_client.add_store(store); let pd_mock = Arc::new(pd_client); + let cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Some(pd_mock), @@ -1767,12 +1766,11 @@ pub mod tests { let (dummy_scheduler, _) = dummy_scheduler(); let s1 = new_storage_from_ents(sched.clone(), dummy_scheduler.clone(), &td1, &ents); let (router, _) = mpsc::sync_channel(100); + let cfg = make_region_worker_raftstore_cfg(true); let runner = RegionRunner::new( s1.engines.kv.clone(), mgr, - 0, - true, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 8b063e9e1f0..d25fb5f11b8 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1200,6 +1200,10 @@ impl Snapshot { self.hold_tmp_files = false; Ok(()) } + + pub fn cf_files(&self) -> &[CfFile] { + &self.cf_files + } } // To check whether a procedure about apply snapshot aborts or not. 
diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 4910f3fdd2b..600a7a1ae6c 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -18,6 +18,8 @@ mod split_check; mod split_config; mod split_controller; +#[cfg(test)] +pub use self::region::tests::make_raftstore_cfg as make_region_worker_raftstore_cfg; pub use self::{ check_leader::{Runner as CheckLeaderRunner, Task as CheckLeaderTask}, cleanup::{Runner as CleanupRunner, Task as CleanupTask}, diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 53b88d6ef16..ad17779e42b 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -23,7 +23,9 @@ use kvproto::raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}; use pd_client::PdClient; use raft::eraftpb::Snapshot as RaftSnapshot; use tikv_util::{ - box_err, box_try, defer, error, info, thd_name, + box_err, box_try, + config::VersionTrack, + defer, error, info, thd_name, time::Instant, warn, worker::{Runnable, RunnableWithTimer}, @@ -44,23 +46,10 @@ use crate::{ }, snap::{plain_file_used, Error, Result, SNAPSHOT_CFS}, transport::CasualRouter, - ApplyOptions, CasualMessage, SnapEntry, SnapKey, SnapManager, + ApplyOptions, CasualMessage, Config, SnapEntry, SnapKey, SnapManager, }, }; -// used to periodically check whether we should delete a stale peer's range in -// region runner -#[cfg(test)] -pub const STALE_PEER_CHECK_TICK: usize = 1; // 1000 milliseconds -#[cfg(not(test))] -pub const STALE_PEER_CHECK_TICK: usize = 10; // 10000 milliseconds - -// used to periodically check whether schedule pending applies in region runner -#[cfg(not(test))] -pub const PENDING_APPLY_CHECK_INTERVAL: u64 = 1_000; // 1000 milliseconds -#[cfg(test)] -pub const PENDING_APPLY_CHECK_INTERVAL: u64 = 200; // 200 milliseconds - const 
CLEANUP_MAX_REGION_COUNT: usize = 64; const TIFLASH: &str = "tiflash"; @@ -355,6 +344,7 @@ where use_delete_range: bool, clean_stale_tick: usize, clean_stale_check_interval: Duration, + clean_stale_ranges_tick: usize, tiflash_stores: HashMap, // we may delay some apply tasks if level 0 files to write stall threshold, @@ -387,18 +377,19 @@ where pub fn new( engine: EK, mgr: SnapManager, - batch_size: usize, - use_delete_range: bool, - snap_generator_pool_size: usize, + cfg: Arc>, coprocessor_host: CoprocessorHost, router: R, pd_client: Option>, ) -> Runner { Runner { - batch_size, - use_delete_range, + batch_size: cfg.value().snap_apply_batch_size.0 as usize, + use_delete_range: cfg.value().use_delete_range, clean_stale_tick: 0, - clean_stale_check_interval: Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL), + clean_stale_check_interval: Duration::from_millis( + cfg.value().region_worker_tick_interval.as_millis(), + ), + clean_stale_ranges_tick: cfg.value().clean_stale_ranges_tick, tiflash_stores: HashMap::default(), pending_applies: VecDeque::new(), pending_delete_ranges: PendingDeleteRanges::default(), @@ -408,7 +399,7 @@ where router, pd_client, pool: Builder::new(thd_name!("snap-generator")) - .max_thread_count(snap_generator_pool_size) + .max_thread_count(cfg.value().snap_generator_pool_size) .build_future_pool(), } } @@ -745,8 +736,9 @@ where } /// Tries to apply pending tasks if there is some. - fn handle_pending_applies(&mut self) { + fn handle_pending_applies(&mut self, is_timeout: bool) { fail_point!("apply_pending_snapshot", |_| {}); + let mut new_batch = true; while !self.pending_applies.is_empty() { // should not handle too many applies than the number of files that can be // ingested. 
check level 0 every time because we can not make sure @@ -754,13 +746,24 @@ where if self.ingest_maybe_stall() { break; } - if let Some(Task::Apply { - region_id, - status, - peer_id, - }) = self.pending_applies.pop_front() - { - self.handle_apply(region_id, peer_id, status); + if let Some(Task::Apply { region_id, .. }) = self.pending_applies.front() { + fail_point!("handle_new_pending_applies", |_| {}); + if !self + .engine + .can_apply_snapshot(is_timeout, new_batch, *region_id) + { + // KvEngine can't apply snapshot for other reasons. + break; + } + if let Some(Task::Apply { + region_id, + status, + peer_id, + }) = self.pending_applies.pop_front() + { + new_batch = false; + self.handle_apply(region_id, peer_id, status); + } } } } @@ -839,7 +842,7 @@ where } // to makes sure applying snapshots in order. self.pending_applies.push_back(task); - self.handle_pending_applies(); + self.handle_pending_applies(false); if !self.pending_applies.is_empty() { // delay the apply and retry later SNAP_COUNTER.apply.delay.inc() @@ -871,9 +874,9 @@ where T: PdClient + 'static, { fn on_timeout(&mut self) { - self.handle_pending_applies(); + self.handle_pending_applies(true); self.clean_stale_tick += 1; - if self.clean_stale_tick >= STALE_PEER_CHECK_TICK { + if self.clean_stale_tick >= self.clean_stale_ranges_tick { self.clean_stale_ranges(); self.clean_stale_tick = 0; } @@ -885,7 +888,7 @@ where } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::{ io, sync::{atomic::AtomicUsize, mpsc, Arc}, @@ -906,7 +909,10 @@ mod tests { use pd_client::RpcClient; use protobuf::Message; use tempfile::Builder; - use tikv_util::worker::{LazyWorker, Worker}; + use tikv_util::{ + config::{ReadableDuration, ReadableSize}, + worker::{LazyWorker, Worker}, + }; use super::*; use crate::{ @@ -920,6 +926,20 @@ mod tests { }, }; + const PENDING_APPLY_CHECK_INTERVAL: u64 = 200; + const STALE_PEER_CHECK_TICK: usize = 1; + + pub fn make_raftstore_cfg(use_delete_range: bool) -> Arc> { + let mut 
store_cfg = Config::default(); + store_cfg.snap_apply_batch_size = ReadableSize(0); + store_cfg.region_worker_tick_interval = + ReadableDuration::millis(PENDING_APPLY_CHECK_INTERVAL); + store_cfg.clean_stale_ranges_tick = STALE_PEER_CHECK_TICK; + store_cfg.use_delete_range = use_delete_range; + store_cfg.snap_generator_pool_size = 2; + Arc::new(VersionTrack::new(store_cfg)) + } + fn insert_range( pending_delete_ranges: &mut PendingDeleteRanges, id: u64, @@ -1015,12 +1035,11 @@ mod tests { let mut worker: LazyWorker> = bg_worker.lazy_build("region-worker"); let sched = worker.scheduler(); let (router, _) = mpsc::sync_channel(11); + let cfg = make_raftstore_cfg(false); let mut runner = RegionRunner::new( engine.kv.clone(), mgr, - 0, - false, - 2, + cfg, CoprocessorHost::::default(), router, Option::>::None, @@ -1123,12 +1142,11 @@ mod tests { let mut worker = bg_worker.lazy_build("snap-manager"); let sched = worker.scheduler(); let (router, receiver) = mpsc::sync_channel(1); + let cfg = make_raftstore_cfg(true); let runner = RegionRunner::new( engine.kv.clone(), mgr, - 0, - true, - 2, + cfg, host, router, Option::>::None, @@ -1237,6 +1255,22 @@ mod tests { } }; + #[allow(dead_code)] + let must_not_finish = |ids: &[u64]| { + for id in ids { + let region_key = keys::region_state_key(*id); + assert_eq!( + engine + .kv + .get_msg_cf::(CF_RAFT, ®ion_key) + .unwrap() + .unwrap() + .get_state(), + PeerState::Applying + ) + } + }; + // snapshot will not ingest cause already write stall gen_and_apply_snap(1); assert_eq!( @@ -1371,6 +1405,18 @@ mod tests { ); thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); assert!(!check_region_exist(6)); + + #[cfg(feature = "failpoints")] + { + engine.kv.compact_files_in_range(None, None, None).unwrap(); + fail::cfg("handle_new_pending_applies", "return").unwrap(); + gen_and_apply_snap(7); + thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + must_not_finish(&[7]); + 
fail::remove("handle_new_pending_applies"); + thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + wait_apply_finish(&[7]); + } } #[derive(Clone, Default)] diff --git a/components/tikv_util/src/timer.rs b/components/tikv_util/src/timer.rs index f47cdaf21e9..30445780ac8 100644 --- a/components/tikv_util/src/timer.rs +++ b/components/tikv_util/src/timer.rs @@ -93,14 +93,15 @@ impl Ord for TimeoutTask { } lazy_static! { - pub static ref GLOBAL_TIMER_HANDLE: Handle = start_global_timer(); + pub static ref GLOBAL_TIMER_HANDLE: Handle = start_global_timer("timer"); } -fn start_global_timer() -> Handle { +/// Create a global timer with specific thread name. +pub fn start_global_timer(name: &str) -> Handle { let (tx, rx) = mpsc::channel(); let props = crate::thread_group::current_properties(); Builder::new() - .name(thd_name!("timer")) + .name(thd_name!(name)) .spawn_wrapper(move || { crate::thread_group::set_properties(props); tikv_alloc::add_thread_memory_accessor(); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 247b06834b0..1e87b5f7aa1 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -204,6 +204,8 @@ fn test_serde_custom_tikv_config() { peer_stale_state_check_interval: ReadableDuration::hours(2), leader_transfer_max_log_lag: 123, snap_apply_batch_size: ReadableSize::mb(12), + region_worker_tick_interval: ReadableDuration::millis(1000), + clean_stale_ranges_tick: 10, lock_cf_compact_interval: ReadableDuration::minutes(12), lock_cf_compact_bytes_threshold: ReadableSize::mb(123), consistency_check_interval: ReadableDuration::secs(12), From 3e863071dcd3ff6a56e772f5b97493b48998f432 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Fri, 9 Sep 2022 11:18:56 +0800 Subject: [PATCH 200/676] *: move ioload to tikv_util (#13421) ref tikv/tikv#13433 Signed-off-by: Ryan Leung Co-authored-by: Ti Chi Robot --- components/tikv_util/src/quota_limiter.rs | 2 +- .../diagnostics => 
components/tikv_util/src/sys}/ioload.rs | 0 components/tikv_util/src/sys/mod.rs | 1 + src/server/service/diagnostics/mod.rs | 6 ++++-- src/server/service/diagnostics/sys.rs | 4 ++-- 5 files changed, 8 insertions(+), 5 deletions(-) rename {src/server/service/diagnostics => components/tikv_util/src/sys}/ioload.rs (100%) diff --git a/components/tikv_util/src/quota_limiter.rs b/components/tikv_util/src/quota_limiter.rs index 818ec0ea60c..ae2e52d40d9 100644 --- a/components/tikv_util/src/quota_limiter.rs +++ b/components/tikv_util/src/quota_limiter.rs @@ -26,7 +26,7 @@ use super::{ // It's better to use a universal approach. const CPU_LIMITER_REFILL_DURATION: Duration = Duration::from_millis(100); -// Limter can be issued to cpu, write and read bandwidth +// Limiter can be issued to cpu, write and read bandwidth #[derive(Debug)] pub struct LimiterItems { cputime_limiter: Limiter, diff --git a/src/server/service/diagnostics/ioload.rs b/components/tikv_util/src/sys/ioload.rs similarity index 100% rename from src/server/service/diagnostics/ioload.rs rename to components/tikv_util/src/sys/ioload.rs diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 8dd7aefa77c..d17c821e995 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -5,6 +5,7 @@ mod cgroup; pub mod cpu_time; pub mod disk; pub mod inspector; +pub mod ioload; pub mod thread; // re-export some traits for ease of use diff --git a/src/server/service/diagnostics/mod.rs b/src/server/service/diagnostics/mod.rs index 438f618ff19..60df07aa167 100644 --- a/src/server/service/diagnostics/mod.rs +++ b/src/server/service/diagnostics/mod.rs @@ -19,12 +19,14 @@ use kvproto::diagnosticspb::{ Diagnostics, SearchLogRequest, SearchLogRequestTarget, SearchLogResponse, ServerInfoRequest, ServerInfoResponse, ServerInfoType, }; -use tikv_util::{sys::SystemExt, timer::GLOBAL_TIMER_HANDLE}; +use tikv_util::{ + sys::{ioload, SystemExt}, + 
timer::GLOBAL_TIMER_HANDLE, +}; use tokio::runtime::Handle; use crate::server::Error; -mod ioload; mod log; mod sys; diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index f39da646ad1..e62028e66e6 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -5,11 +5,11 @@ use std::{collections::HashMap, string::ToString}; use kvproto::diagnosticspb::{ServerInfoItem, ServerInfoPair}; use tikv_util::{ config::KIB, - sys::{cpu_time::LinuxStyleCpuTime, SysQuota, *}, + sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}, }; use walkdir::WalkDir; -use crate::server::service::diagnostics::{ioload, SYS_INFO}; +use crate::server::service::diagnostics::SYS_INFO; type CpuTimeSnapshot = Option; From 3f5acade42d6fb61ea55577fa4bffb54e16c5dc6 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 9 Sep 2022 13:58:56 +0800 Subject: [PATCH 201/676] storage: skip Rollback when checking newer version for non-pessimisitc keys (#13426) close tikv/tikv#13425, ref pingcap/tidb#35525 Don't treat newer Rollback records as write conflicts for non-pessimistic keys in pessimistic transactions. They can cause false positive errors because they can be written even if the pessimistic lock of the corresponding row key exists. Rollback records are only used to prevent retried prewrite from succeeding. Even if the Rollback record of the current transaction is collapsed by a newer record, it is safe to prewrite this non-pessimistic key because either the primary key is rolled back or it's protected because it's written by CheckSecondaryLocks. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/txn/actions/prewrite.rs | 151 ++++++++++++++++++---------- 1 file changed, 99 insertions(+), 52 deletions(-) diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 85c1a6f8ccc..5883fc4b983 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -353,71 +353,78 @@ impl<'a> PrewriteMutation<'a> { &self, reader: &mut SnapshotReader, ) -> Result> { - match reader.seek_write(&self.key, TimeStamp::max())? { - Some((commit_ts, write)) => { - // Abort on writes after our start/for_update timestamp ... - // If exists a commit version whose commit timestamp is larger than current - // start/for_update timestamp, we should abort current prewrite. - match self.txn_props.kind { - TransactionKind::Optimistic(_) => { + let mut seek_ts = TimeStamp::max(); + while let Some((commit_ts, write)) = reader.seek_write(&self.key, seek_ts)? { + // If there's a write record whose commit_ts equals to our start ts, the current + // transaction is ok to continue, unless the record means that the current + // transaction has been rolled back. + if commit_ts == self.txn_props.start_ts + && (write.write_type == WriteType::Rollback || write.has_overlapped_rollback) + { + MVCC_CONFLICT_COUNTER.rolled_back.inc(); + // TODO: Maybe we need to add a new error for the rolled back case. + self.write_conflict_error(&write, commit_ts, WriteConflictReason::SelfRolledBack)?; + } + match self.txn_props.kind { + TransactionKind::Optimistic(_) => { + if commit_ts > self.txn_props.start_ts { + MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); + self.write_conflict_error( + &write, + commit_ts, + WriteConflictReason::Optimistic, + )?; + } + } + // Note: PessimisticLockNotFound can happen on a non-pessimistically locked key, + // if it is a retrying prewrite request. 
+ TransactionKind::Pessimistic(for_update_ts) => { + if let DoConstraintCheck = self.pessimistic_action { + // Do the same as optimistic transactions if constraint checks are needed. if commit_ts > self.txn_props.start_ts { MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); self.write_conflict_error( &write, commit_ts, - WriteConflictReason::Optimistic, + WriteConflictReason::LazyUniquenessCheck, )?; } } - // Note: PessimisticLockNotFound can happen on a non-pessimistically locked key, - // if it is a retrying prewrite request. - TransactionKind::Pessimistic(for_update_ts) => { - if let DoConstraintCheck = self.pessimistic_action { - if commit_ts > self.txn_props.start_ts { - MVCC_CONFLICT_COUNTER.prewrite_write_conflict.inc(); - self.write_conflict_error( - &write, - commit_ts, - WriteConflictReason::LazyUniquenessCheck, - )?; - } - } else if commit_ts > for_update_ts { - warn!("conflicting write was found, pessimistic lock must be lost for the corresponding row key"; - "key" => %self.key, - "start_ts" => self.txn_props.start_ts, - "for_update_ts" => for_update_ts, - "conflicting start_ts" => write.start_ts, - "conflicting commit_ts" => commit_ts); - return Err(ErrorInner::PessimisticLockNotFound { - start_ts: self.txn_props.start_ts, - key: self.key.clone().into_raw()?, - } - .into()); + if commit_ts > for_update_ts { + // Don't treat newer Rollback records as write conflicts. They can cause + // false positive errors because they can be written even if the pessimistic + // lock of the corresponding row key exists. + // Rollback records are only used to prevent retried prewrite from + // succeeding. Even if the Rollback record of the current transaction is + // collapsed by a newer record, it is safe to prewrite this non-pessimistic + // key because either the primary key is rolled back or it's protected + // because it's written by CheckSecondaryLocks. 
+ if write.write_type == WriteType::Rollback { + seek_ts = commit_ts.prev(); + continue; + } + + warn!("conflicting write was found, pessimistic lock must be lost for the corresponding row key"; + "key" => %self.key, + "start_ts" => self.txn_props.start_ts, + "for_update_ts" => for_update_ts, + "conflicting start_ts" => write.start_ts, + "conflicting commit_ts" => commit_ts); + return Err(ErrorInner::PessimisticLockNotFound { + start_ts: self.txn_props.start_ts, + key: self.key.clone().into_raw()?, } + .into()); } } - // If there's a write record whose commit_ts equals to our start ts, the current - // transaction is ok to continue, unless the record means that the current - // transaction has been rolled back. - if commit_ts == self.txn_props.start_ts - && (write.write_type == WriteType::Rollback || write.has_overlapped_rollback) - { - MVCC_CONFLICT_COUNTER.rolled_back.inc(); - // TODO: Maybe we need to add a new error for the rolled back case. - self.write_conflict_error( - &write, - commit_ts, - WriteConflictReason::SelfRolledBack, - )?; - } - // Should check it when no lock exists, otherwise it can report error when there - // is a lock belonging to a committed transaction which deletes the key. - check_data_constraint(reader, self.should_not_exist, &write, commit_ts, &self.key)?; - - Ok(Some((write, commit_ts))) } - None => Ok(None), + // Should check it when no lock exists, otherwise it can report error when there + // is a lock belonging to a committed transaction which deletes the key. + check_data_constraint(reader, self.should_not_exist, &write, commit_ts, &self.key)?; + + return Ok(Some((write, commit_ts))); } + Ok(None) } fn write_lock(self, lock_status: LockStatus, txn: &mut MvccTxn) -> Result { @@ -1554,6 +1561,46 @@ pub mod tests { kvproto::kvrpcpb::AssertionLevel::Off, ); must_locked(&engine, b"k2", 13); + must_rollback(&engine, b"k2", 13, false); + + // Write a Rollback at 50 first. 
A retried prewrite at the same ts should + // report WriteConflict. + must_rollback(&engine, b"k2", 50, false); + let err = must_retry_pessimistic_prewrite_put_err( + &engine, + b"k2", + b"v2", + b"k1", + &None, + 50, + 50, + SkipPessimisticCheck, + 0, + ); + assert!( + matches!(err, Error(box ErrorInner::WriteConflict { .. })), + "{:?}", + err + ); + // But prewriting at 48 can succeed because a newer rollback is allowed. + must_prewrite_put_impl( + &engine, + b"k2", + b"v2", + b"k1", + &None, + 48.into(), + SkipPessimisticCheck, + 100, + 48.into(), + 1, + 49.into(), + TimeStamp::default(), + true, + kvproto::kvrpcpb::Assertion::None, + kvproto::kvrpcpb::AssertionLevel::Off, + ); + must_locked(&engine, b"k2", 48); } #[test] From 4cd28ba026b846792769b2e75857adba19cc564c Mon Sep 17 00:00:00 2001 From: hehechen Date: Fri, 9 Sep 2022 14:10:56 +0800 Subject: [PATCH 202/676] hook after update safe ts (#13432) close tikv/tikv#13435 Add hook to observe the update of safe ts to calculate the TiFlash synchronization progress, including leader safe ts from check_leader RPC, and the update of self safe ts when receive check_leader RPC or apply. Signed-off-by: hehechen Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 36 +++++++++++- components/raftstore/src/coprocessor/mod.rs | 7 ++- components/raftstore/src/store/fsm/peer.rs | 9 +-- components/raftstore/src/store/peer.rs | 6 +- components/raftstore/src/store/util.rs | 55 +++++++++++++++---- .../src/store/worker/check_leader.rs | 33 ++++++++--- components/server/src/server.rs | 5 +- components/test_raftstore/src/server.rs | 3 +- 8 files changed, 127 insertions(+), 27 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index ed348950050..3cddc21e8cb 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -133,6 +133,11 @@ macro_rules! 
impl_box_observer_g { impl_box_observer!(BoxAdminObserver, AdminObserver, WrappedAdminObserver); impl_box_observer!(BoxQueryObserver, QueryObserver, WrappedQueryObserver); +impl_box_observer!( + BoxUpdateSafeTsObserver, + UpdateSafeTsObserver, + WrappedUpdateSafeTsObserver +); impl_box_observer!( BoxApplySnapshotObserver, ApplySnapshotObserver, @@ -178,6 +183,7 @@ where cmd_observers: Vec>>, read_index_observers: Vec>, pd_task_observers: Vec>, + update_safe_ts_observers: Vec>, // TODO: add endpoint } @@ -194,6 +200,7 @@ impl Default for Registry { cmd_observers: Default::default(), read_index_observers: Default::default(), pd_task_observers: Default::default(), + update_safe_ts_observers: Default::default(), } } } @@ -259,6 +266,9 @@ impl Registry { pub fn register_read_index_observer(&mut self, priority: u32, rio: BoxReadIndexObserver) { push!(priority, rio, self.read_index_observers); } + pub fn register_update_safe_ts_observer(&mut self, priority: u32, qo: BoxUpdateSafeTsObserver) { + push!(priority, qo, self.update_safe_ts_observers); + } } /// A macro that loops over all observers and returns early when error is found @@ -662,6 +672,16 @@ impl CoprocessorHost { } } + pub fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { + if self.registry.query_observers.is_empty() { + return; + } + for observer in &self.registry.update_safe_ts_observers { + let observer = observer.observer.inner(); + observer.on_update_safe_ts(region_id, self_safe_ts, leader_safe_ts) + } + } + pub fn shutdown(&self) { for entry in &self.registry.admin_observers { entry.observer.inner().stop(); @@ -690,7 +710,7 @@ mod tests { use tikv_util::box_err; use crate::{ - coprocessor::*, + coprocessor::{dispatcher::BoxUpdateSafeTsObserver, *}, store::{SnapKey, Snapshot}, }; @@ -722,6 +742,7 @@ mod tests { PreApplySnapshot = 20, PostApplySnapshot = 21, ShouldPreApplySnapshot = 22, + OnUpdateSafeTs = 23, } impl Coprocessor for TestCoprocessor {} @@ -938,6 +959,13 @@ 
mod tests { fn on_applied_current_term(&self, _: StateRole, _: &Region) {} } + impl UpdateSafeTsObserver for TestCoprocessor { + fn on_update_safe_ts(&self, _: u64, _: u64, _: u64) { + self.called + .fetch_add(ObserverIndex::OnUpdateSafeTs as usize, Ordering::SeqCst); + } + } + macro_rules! assert_all { ($target:expr, $expect:expr) => {{ for (c, e) in ($target).iter().zip($expect) { @@ -972,6 +1000,8 @@ mod tests { .register_region_change_observer(1, BoxRegionChangeObserver::new(ob.clone())); host.registry .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); + host.registry + .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); let mut index: usize = 0; let region = Region::default(); @@ -1078,6 +1108,10 @@ mod tests { host.should_pre_apply_snapshot(); index += ObserverIndex::ShouldPreApplySnapshot as usize; assert_all!([&ob.called], &[index]); + + host.on_update_safe_ts(1, 1, 1); + index += ObserverIndex::OnUpdateSafeTs as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index cc6bfb91b06..8a309dc4734 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -33,7 +33,7 @@ pub use self::{ dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, CoprocessorHost, Registry, + BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, }, error::{Error, Result}, region_info_accessor::{ @@ -531,6 +531,11 @@ pub trait ReadIndexObserver: Coprocessor { fn on_step(&self, _msg: &mut eraftpb::Message, _role: StateRole) {} } +pub trait UpdateSafeTsObserver: Coprocessor { + /// Hook after update self safe_ts and received leader safe_ts. 
+ fn on_update_safe_ts(&self, _: u64, _: u64, _: u64) {} +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index eb79965d617..1f709c6dce9 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4304,10 +4304,11 @@ where // After the region commit merged, the region's key range is extended and the // region's `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)` let source_read_progress = meta.region_read_progress.remove(&source.get_id()).unwrap(); - self.fsm - .peer - .read_progress - .merge_safe_ts(source_read_progress.safe_ts(), merge_index); + self.fsm.peer.read_progress.merge_safe_ts( + source_read_progress.safe_ts(), + merge_index, + &self.ctx.coprocessor_host, + ); // If a follower merges into a leader, a more recent read may happen // on the leader of the follower. So max ts should be updated after diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index edf88a561ba..0d7932a6169 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2422,7 +2422,8 @@ where // Resume `read_progress` self.read_progress.resume(); // Update apply index to `last_applying_idx` - self.read_progress.update_applied(self.last_applying_idx); + self.read_progress + .update_applied(self.last_applying_idx, &ctx.coprocessor_host); } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. @@ -3318,7 +3319,8 @@ where } self.pending_reads.gc(); - self.read_progress.update_applied(applied_index); + self.read_progress + .update_applied(applied_index, &ctx.coprocessor_host); // Only leaders need to update applied_term. 
if progress_to_be_updated && self.is_leader() { diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index db62674e6a5..922ba70a2c8 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -14,6 +14,7 @@ use std::{ u64, }; +use engine_traits::KvEngine; use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, @@ -28,9 +29,12 @@ use raft::{ use raft_proto::ConfChangeI; use tikv_util::{box_err, debug, info, time::monotonic_raw_now, Either}; use time::{Duration, Timespec}; +use txn_types::TimeStamp; use super::peer_storage; -use crate::{Error, Result}; +use crate::{coprocessor::CoprocessorHost, Error, Result}; + +const INVALID_TIMESTAMP: u64 = u64::MAX; pub fn find_peer(region: &metapb::Region, store_id: u64) -> Option<&metapb::Peer> { region @@ -929,13 +933,17 @@ impl RegionReadProgressRegistry { // Update `safe_ts` with the provided `LeaderInfo` and return the regions that // have the same `LeaderInfo` - pub fn handle_check_leaders(&self, leaders: Vec) -> Vec { + pub fn handle_check_leaders( + &self, + leaders: Vec, + coprocessor: &CoprocessorHost, + ) -> Vec { let mut regions = Vec::with_capacity(leaders.len()); let registry = self.registry.lock().unwrap(); for leader_info in leaders { let region_id = leader_info.get_region_id(); if let Some(rp) = registry.get(®ion_id) { - if rp.consume_leader_info(leader_info) { + if rp.consume_leader_info(leader_info, coprocessor) { regions.push(region_id); } } @@ -1012,11 +1020,17 @@ impl RegionReadProgress { } } - pub fn update_applied(&self, applied: u64) { + pub fn update_applied(&self, applied: u64, coprocessor: &CoprocessorHost) { let mut core = self.core.lock().unwrap(); if let Some(ts) = core.update_applied(applied) { if !core.pause { self.safe_ts.store(ts, AtomicOrdering::Release); + // No need to update leader safe ts here. 
+ coprocessor.on_update_safe_ts( + core.region_id, + TimeStamp::new(ts).physical(), + INVALID_TIMESTAMP, + ) } } } @@ -1036,18 +1050,34 @@ impl RegionReadProgress { } } - pub fn merge_safe_ts(&self, source_safe_ts: u64, merge_index: u64) { + pub fn merge_safe_ts( + &self, + source_safe_ts: u64, + merge_index: u64, + coprocessor: &CoprocessorHost, + ) { let mut core = self.core.lock().unwrap(); if let Some(ts) = core.merge_safe_ts(source_safe_ts, merge_index) { if !core.pause { self.safe_ts.store(ts, AtomicOrdering::Release); + // After region merge, self safe ts may decrease, so leader safe ts should be + // reset. + coprocessor.on_update_safe_ts( + core.region_id, + TimeStamp::new(ts).physical(), + TimeStamp::new(ts).physical(), + ) } } } // Consume the provided `LeaderInfo` to update `safe_ts` and return whether the // provided `LeaderInfo` is same as ours - pub fn consume_leader_info(&self, mut leader_info: LeaderInfo) -> bool { + pub fn consume_leader_info( + &self, + mut leader_info: LeaderInfo, + coprocessor: &CoprocessorHost, + ) -> bool { let mut core = self.core.lock().unwrap(); if leader_info.has_read_state() { // It is okay to update `safe_ts` without checking the `LeaderInfo`, the @@ -1061,6 +1091,9 @@ impl RegionReadProgress { } } } + let self_phy_ts = TimeStamp::new(self.safe_ts()).physical(); + let leader_phy_ts = TimeStamp::new(rs.get_safe_ts()).physical(); + coprocessor.on_update_safe_ts(leader_info.region_id, self_phy_ts, leader_phy_ts) } // whether the provided `LeaderInfo` is same as ours core.leader_info.leader_term == leader_info.term @@ -1357,6 +1390,7 @@ impl LatencyInspector { mod tests { use std::thread; + use engine_test::kv::KvTestEngine; use kvproto::{ metapb::{self, RegionEpoch}, raft_cmdpb::AdminRequest, @@ -1978,7 +2012,8 @@ mod tests { assert_eq!(rrp.safe_ts(), 10); assert_eq!(pending_items_num(&rrp), 10); - rrp.update_applied(20); + let coprocessor_host = CoprocessorHost::::default(); + rrp.update_applied(20, &coprocessor_host); 
assert_eq!(rrp.safe_ts(), 20); assert_eq!(pending_items_num(&rrp), 0); @@ -1990,7 +2025,7 @@ mod tests { assert!(pending_items_num(&rrp) <= cap); // `applied_index` large than all pending items will clear all pending items - rrp.update_applied(200); + rrp.update_applied(200, &coprocessor_host); assert_eq!(rrp.safe_ts(), 199); assert_eq!(pending_items_num(&rrp), 0); @@ -2004,9 +2039,9 @@ mod tests { rrp.update_safe_ts(301, 600); assert_eq!(pending_items_num(&rrp), 2); // `safe_ts` will update to 500 instead of 300 - rrp.update_applied(300); + rrp.update_applied(300, &coprocessor_host); assert_eq!(rrp.safe_ts(), 500); - rrp.update_applied(301); + rrp.update_applied(301, &coprocessor_host); assert_eq!(rrp.safe_ts(), 600); assert_eq!(pending_items_num(&rrp), 0); diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index 8821bb6118d..696caab7d69 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -6,16 +6,24 @@ use std::{ sync::{Arc, Mutex}, }; +use engine_traits::KvEngine; use fail::fail_point; use keys::{data_end_key, data_key, enc_start_key}; use kvproto::kvrpcpb::{KeyRange, LeaderInfo}; use tikv_util::worker::Runnable; -use crate::store::{fsm::store::StoreMeta, util::RegionReadProgressRegistry}; +use crate::{ + coprocessor::CoprocessorHost, + store::{fsm::store::StoreMeta, util::RegionReadProgressRegistry}, +}; -pub struct Runner { +pub struct Runner +where + E: KvEngine, +{ store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, + coprocessor: CoprocessorHost, } pub enum Task { @@ -47,12 +55,16 @@ impl fmt::Display for Task { } } -impl Runner { - pub fn new(store_meta: Arc>) -> Runner { +impl Runner +where + E: KvEngine, +{ + pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Runner { let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); Runner { region_read_progress, 
store_meta, + coprocessor, } } @@ -96,7 +108,10 @@ impl Runner { } } -impl Runnable for Runner { +impl Runnable for Runner +where + E: KvEngine, +{ type Task = Task; fn run(&mut self, task: Task) { match task { @@ -111,7 +126,9 @@ impl Runnable for Runner { self.store_meta.lock().unwrap().store_id == Some(3), |_| {} ); - let regions = self.region_read_progress.handle_check_leaders(leaders); + let regions = self + .region_read_progress + .handle_check_leaders(leaders, &self.coprocessor); cb(regions); } Task::GetStoreTs { key_range, cb } => { @@ -124,6 +141,7 @@ impl Runnable for Runner { #[cfg(test)] mod tests { + use engine_test::kv::KvTestEngine; use keys::enc_end_key; use kvproto::metapb::Region; @@ -155,7 +173,8 @@ mod tests { } let meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let runner = Runner::new(meta.clone()); + let coprocessor_host = CoprocessorHost::::default(); + let runner = Runner::new(meta.clone(), coprocessor_host); assert_eq!(0, runner.get_range_safe_ts(key_range(b"", b""))); add_region(&meta, 1, key_range(b"", b"k1"), 100); assert_eq!(100, runner.get_range_safe_ts(key_range(b"", b""))); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ca95ddaf310..ba4c515557e 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -837,7 +837,10 @@ where causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); }; - let check_leader_runner = CheckLeaderRunner::new(engines.store_meta.clone()); + let check_leader_runner = CheckLeaderRunner::new( + engines.store_meta.clone(), + self.coprocessor_host.clone().unwrap(), + ); let check_leader_scheduler = self .background_worker .start("check-leader", check_leader_runner); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 2c1798877d9..f1626b9f2c9 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -390,7 +390,8 @@ impl ServerCluster { let 
(res_tag_factory, collector_reg_handle, rsmeter_cleanup) = self.init_resource_metering(&cfg.resource_metering); - let check_leader_runner = CheckLeaderRunner::new(store_meta.clone()); + let check_leader_runner = + CheckLeaderRunner::new(store_meta.clone(), coprocessor_host.clone()); let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); let mut lock_mgr = LockManager::new(&cfg.pessimistic_txn); From cc127a069496dea5f05b7e2a3816c66f9c3c7713 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 9 Sep 2022 19:50:56 +0800 Subject: [PATCH 203/676] engine: update rust-rocksdb (#13393) close tikv/tikv#13095 Update rust-rocksdb Signed-off-by: tabokie --- Cargo.lock | 6 +++--- components/engine_rocks/src/write_batch.rs | 1 + tests/integrations/server/kv_service.rs | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a0356d6611..4da587d6d4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2758,7 +2758,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" +source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2777,7 +2777,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" +source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" dependencies = [ "bzip2-sys", "cc", @@ -4619,7 +4619,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#827a5df22cd59dc708c4c6a87dd8735a2312773d" +source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" dependencies = [ "libc 0.2.132", "librocksdb_sys", diff --git 
a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index f617608119b..6b92a285c76 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -105,6 +105,7 @@ impl engine_traits::WriteBatch for RocksWriteBatchVec { self.get_db() .multi_batch_write(self.as_inner(), &opt.into_raw()) .map_err(r2e) + .map(|_| ()) } else { self.get_db() .write_opt(&self.wbs[0], &opt.into_raw()) diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 70c7f9bda4c..6aca801b275 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -2191,7 +2191,9 @@ fn test_commands_write_detail() { assert!(wd.get_commit_log_nanos() > 0); assert!(wd.get_apply_batch_wait_nanos() > 0); assert!(wd.get_apply_log_nanos() > 0); - assert!(wd.get_apply_mutex_lock_nanos() > 0); + // Mutex has been removed from write path. + // Ref https://github.com/facebook/rocksdb/pull/7516 + // assert!(wd.get_apply_mutex_lock_nanos() > 0); assert!(wd.get_apply_write_wal_nanos() > 0); assert!(wd.get_apply_write_memtable_nanos() > 0); }; From 49223a70dc1578559a9effb162f8b20dbeede92f Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 13 Sep 2022 13:22:58 +0800 Subject: [PATCH 204/676] cloud: support backup to s3 when object lock enabled. 
(#13350) close tikv/tikv#13442 Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 ++ components/cloud/aws/Cargo.toml | 2 ++ components/cloud/aws/src/s3.rs | 27 +++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 4da587d6d4e..45e589819c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -255,6 +255,7 @@ name = "aws" version = "0.0.1" dependencies = [ "async-trait", + "base64", "bytes", "cloud", "fail", @@ -266,6 +267,7 @@ dependencies = [ "hyper-tls", "kvproto", "lazy_static", + "md5", "prometheus", "rusoto_core", "rusoto_credential", diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 314e2281425..293509709db 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -37,6 +37,8 @@ thiserror = "1.0" lazy_static = "1.3" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } uuid = "0.8" +md5 = "0.7.0" +base64 = "0.13.0" [dev-dependencies] futures = "0.3" diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 3e9c3665f58..05f418e4c3a 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -50,6 +50,7 @@ pub struct Config { sse_kms_key_id: Option, storage_class: Option, multi_part_size: usize, + object_lock_enabled: bool, } impl Config { @@ -64,6 +65,7 @@ impl Config { sse_kms_key_id: None, storage_class: None, multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: false, } } @@ -96,6 +98,7 @@ impl Config { force_path_style, sse_kms_key_id: StringNonEmpty::opt(attrs.get("sse_kms_key_id").unwrap_or(def).clone()), multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: false, }) } @@ -128,6 +131,7 @@ impl Config { force_path_style: input.force_path_style, sse_kms_key_id: StringNonEmpty::opt(input.sse_kms_key_id), multi_part_size: MINIMUM_PART_SIZE, + object_lock_enabled: input.object_lock_enabled, }) } } @@ -232,6 +236,7 @@ struct S3Uploader<'client> { 
sse_kms_key_id: Option, storage_class: Option, multi_part_size: usize, + object_lock_enabled: bool, upload_id: String, parts: Vec, @@ -275,6 +280,13 @@ async fn try_read_exact( } } +fn get_content_md5(object_lock_enabled: bool, content: &[u8]) -> Option { + object_lock_enabled.then(|| { + let digest = md5::compute(content); + base64::encode(digest.0) + }) +} + /// Specifies the minimum size to use multi-part upload. /// AWS S3 requires each part to be at least 5 MiB. const MINIMUM_PART_SIZE: usize = 5 * 1024 * 1024; @@ -292,6 +304,7 @@ impl<'client> S3Uploader<'client> { sse_kms_key_id: config.sse_kms_key_id.as_ref().cloned(), storage_class: config.storage_class.as_ref().cloned(), multi_part_size: config.multi_part_size, + object_lock_enabled: config.object_lock_enabled, upload_id: "".to_owned(), parts: Vec::new(), } @@ -432,6 +445,7 @@ impl<'client> S3Uploader<'client> { upload_id: self.upload_id.clone(), part_number, content_length: Some(data.len() as i64), + content_md5: get_content_md5(self.object_lock_enabled, data), body: Some(data.to_vec().into()), ..Default::default() }) @@ -492,6 +506,7 @@ impl<'client> S3Uploader<'client> { ssekms_key_id: self.sse_kms_key_id.as_ref().map(|s| s.to_string()), storage_class: self.storage_class.as_ref().map(|s| s.to_string()), content_length: Some(data.len() as i64), + content_md5: get_content_md5(self.object_lock_enabled, data), body: Some(data.to_vec().into()), ..Default::default() }) @@ -590,6 +605,18 @@ mod tests { use super::*; + #[test] + fn test_s3_get_content_md5() { + // base64 encode md5sum "helloworld" + let code = "helloworld".to_string(); + let expect = "/F4DjTilcDIIVEHn/nAQsA==".to_string(); + let actual = get_content_md5(true, code.as_bytes()).unwrap(); + assert_eq!(actual, expect); + + let actual = get_content_md5(false, b"xxx"); + assert!(actual.is_none()) + } + #[test] fn test_s3_config() { let bucket_name = StringNonEmpty::required("mybucket".to_string()).unwrap(); From 
63465fabcb255ecb82e431f51a166e699cfc4aa3 Mon Sep 17 00:00:00 2001 From: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Date: Tue, 13 Sep 2022 15:00:58 +0800 Subject: [PATCH 205/676] log-backup: merge small files in each flush (#13233) close tikv/tikv#13232 Signed-off-by: Leavrth Co-authored-by: Ti Chi Robot --- Cargo.lock | 37 ++ components/backup-stream/Cargo.toml | 2 + components/backup-stream/src/endpoint.rs | 2 +- components/backup-stream/src/router.rs | 438 ++++++++++++------ components/backup-stream/src/utils.rs | 80 +++- components/backup-stream/tests/mod.rs | 123 +++-- components/cloud/aws/src/s3.rs | 64 +-- components/cloud/azure/src/azblob.rs | 48 +- components/cloud/gcp/src/gcs.rs | 83 ++-- components/cloud/src/blob.rs | 7 + components/external_storage/Cargo.toml | 1 + components/external_storage/export/Cargo.toml | 1 + .../external_storage/export/src/export.rs | 17 +- components/external_storage/src/hdfs.rs | 9 + components/external_storage/src/lib.rs | 21 +- components/external_storage/src/local.rs | 19 +- components/external_storage/src/noop.rs | 4 + components/sst_importer/src/sst_importer.rs | 28 +- 18 files changed, 725 insertions(+), 259 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 45e589819c2..3f64d59eed9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,6 +148,21 @@ dependencies = [ "futures-core", ] +[[package]] +name = "async-compression" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345fd392ab01f746c717b1357165b76f0b67a60192007b234058c9045fdcf695" +dependencies = [ + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "zstd", + "zstd-safe", +] + [[package]] name = "async-speed-limit" version = "0.4.0" @@ -445,6 +460,7 @@ dependencies = [ name = "backup-stream" version = "0.1.0" dependencies = [ + "async-compression", "async-trait", "bytes", "chrono", @@ -1653,6 +1669,7 @@ dependencies = [ name = "external_storage" version = "0.0.1" dependencies 
= [ + "async-compression", "async-trait", "bytes", "encryption", @@ -1690,6 +1707,7 @@ dependencies = [ name = "external_storage_export" version = "0.0.1" dependencies = [ + "async-compression", "async-trait", "aws", "azure", @@ -7166,6 +7184,25 @@ dependencies = [ "rand 0.7.3", ] +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc 0.2.125", + "zstd-sys", +] + [[package]] name = "zstd-sys" version = "2.0.1+zstd.1.5.2" diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 6090d929291..8e6e43c8203 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -19,6 +19,7 @@ test = true harness = true [dependencies] +async-compression = { version = "0.3.14", features = ["tokio", "zstd"] } async-trait = { version = "0.1" } bytes = "1" chrono = "0.4" @@ -74,6 +75,7 @@ async-trait = "0.1" engine_panic = { path = "../engine_panic" } grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } hex = "0.4" +protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8.0" tempdir = "0.3" tempfile = "3.0" diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 81374484463..d463964558a 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -615,7 +615,7 @@ where }) .collect::>(); range_router - .register_task(task.clone(), ranges.clone()) + .register_task(task.clone(), ranges.clone(), self.config.file_size_limit.0) .await?; for (start_key, end_key) 
in ranges { diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index f1280103e89..fd63cd1841e 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -14,12 +14,13 @@ use std::{ time::Duration, }; +use async_compression::{tokio::write::ZstdEncoder, Level}; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_WRITE}; use external_storage::{BackendConfig, UnpinReader}; use external_storage_export::{create_storage, ExternalStorage}; use futures::io::Cursor; use kvproto::{ - brpb::{DataFileInfo, FileType, Metadata, StreamBackupTaskInfo}, + brpb::{DataFileGroup, DataFileInfo, FileType, MetaVersion, Metadata, StreamBackupTaskInfo}, raft_cmdpb::CmdType, }; use openssl::hash::{Hasher, MessageDigest}; @@ -53,17 +54,11 @@ use crate::{ metrics::{HANDLE_KV_HISTOGRAM, SKIP_KV_COUNTER}, subscription_track::TwoPhaseResolver, try_send, - utils::{self, SegmentMap, Slot, SlotMap, StopWatch}, + utils::{self, FilesReader, SegmentMap, SlotMap, StopWatch}, }; const FLUSH_FAILURE_BECOME_FATAL_THRESHOLD: usize = 30; -/// FLUSH_LOG_CONCURRENT_BATCH_COUNT specifies the concurrent count to write to -/// storage. 'Log backup' will produce a large mount of small files during flush -/// interval, and storage could take mistaken if writing all of these files to -/// storage concurrently. 
-const FLUSH_LOG_CONCURRENT_BATCH_COUNT: usize = 128; - #[derive(Clone, Debug)] pub enum TaskSelector { ByName(String), @@ -422,13 +417,20 @@ impl RouterInner { &self, mut task: StreamTask, ranges: Vec<(Vec, Vec)>, + merged_file_size_limit: u64, ) -> Result<()> { let task_name = task.info.take_name(); // register task info let prefix_path = self.prefix.join(&task_name); - let stream_task = - StreamTaskInfo::new(prefix_path, task, self.max_flush_interval, ranges.clone()).await?; + let stream_task = StreamTaskInfo::new( + prefix_path, + task, + self.max_flush_interval, + ranges.clone(), + merged_file_size_limit, + ) + .await?; self.tasks .lock() .await @@ -694,31 +696,34 @@ impl TempFileKey { } } - /// path_to_log_file specifies the path of record log. + /// path_to_log_file specifies the path of record log for v2. /// ```text - /// v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log + /// V1: v1/${date}/${hour}/${store_id}/t00000071/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log + /// V2: v1/${date}/${hour}/${store_id}/434098800931373064-f0251bd5-1441-499a-8f53-adc0d1057a73.log /// ``` - fn path_to_log_file(&self, store_id: u64, min_ts: u64, max_ts: u64) -> String { + /// For v2, we merged the small files (partition by table_id) into one file. + fn path_to_log_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/{}/{}/t{:08}/{:012}-{}.log", + "v1/{}/{}/{}/{}-{}.log", // We may delete a range of files, so using the max_ts for preventing remove some // records wrong. Self::format_date_time(max_ts, FormatType::Date), Self::format_date_time(max_ts, FormatType::Hour), store_id, - self.table_id, min_ts, uuid::Uuid::new_v4() ) } - /// path_to_schema_file specifies the path of schema log. + /// path_to_schema_file specifies the path of schema log for v2. 
/// ```text - /// v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log + /// V1: v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log + /// V2: v1/${date}/${hour}/${store_id}/schema-meta/434055683656384515-cc3cb7a3-e03b-4434-ab6c-907656fddf67.log /// ``` + /// For v2, we merged the small files (partition by table_id) into one file. fn path_to_schema_file(store_id: u64, min_ts: u64, max_ts: u64) -> String { format!( - "v1/{}/{}/{}/schema-meta/{:012}-{}.log", + "v1/{}/{}/{}/schema-meta/{}-{}.log", Self::format_date_time(max_ts, FormatType::Date), Self::format_date_time(max_ts, FormatType::Hour), store_id, @@ -727,11 +732,11 @@ impl TempFileKey { ) } - fn file_name(&self, store_id: u64, min_ts: TimeStamp, max_ts: TimeStamp) -> String { - if self.is_meta { - Self::path_to_schema_file(store_id, min_ts.into_inner(), max_ts.into_inner()) + fn file_name(store_id: u64, min_ts: u64, max_ts: u64, is_meta: bool) -> String { + if is_meta { + Self::path_to_schema_file(store_id, min_ts, max_ts) } else { - self.path_to_log_file(store_id, min_ts.into_inner(), max_ts.into_inner()) + Self::path_to_log_file(store_id, min_ts, max_ts) } } } @@ -748,7 +753,9 @@ pub struct StreamTaskInfo { /// prefixed keys). files: SlotMap, /// flushing_files contains files pending flush. - flushing_files: RwLock, DataFileInfo)>>, + flushing_files: RwLock>, + /// flushing_meta_files contains meta files pending flush. + flushing_meta_files: RwLock>, /// last_flush_ts represents last time this task flushed to storage. last_flush_time: AtomicPtr, /// flush_interval represents the tick interval of flush, setting by users. @@ -768,6 +775,8 @@ pub struct StreamTaskInfo { flush_fail_count: AtomicUsize, /// global checkpoint ts for this task. global_checkpoint_ts: AtomicU64, + /// The size limit of the merged file for this task. 
+ merged_file_size_limit: u64, } impl Drop for StreamTaskInfo { @@ -776,12 +785,19 @@ impl Drop for StreamTaskInfo { .flushing_files .get_mut() .drain(..) - .map(|(a, b, _)| (a, b)) - .chain(self.files.get_mut().drain()) + .chain(self.flushing_meta_files.get_mut().drain(..)) + .map(|(_, f, _)| f.local_path) + .map(std::fs::remove_file) + .partition(|r| r.is_ok()); + info!("stream task info dropped[1/2], removing flushing_temp files"; "success" => %success.len(), "failure" => %failed.len()); + let (success, failed): (Vec<_>, Vec<_>) = self + .files + .get_mut() + .drain() .map(|(_, f)| f.into_inner().local_path) .map(std::fs::remove_file) .partition(|r| r.is_ok()); - info!("stream task info dropped, removing temp files"; "success" => %success.len(), "failure" => %failed.len()) + info!("stream task info dropped[2/2], removing temp files"; "success" => %success.len(), "failure" => %failed.len()); } } @@ -804,6 +820,7 @@ impl StreamTaskInfo { task: StreamTask, flush_interval: Duration, ranges: Vec<(Vec, Vec)>, + merged_file_size_limit: u64, ) -> Result { tokio::fs::create_dir_all(&temp_dir).await?; let storage = Arc::from(create_storage( @@ -819,12 +836,14 @@ impl StreamTaskInfo { min_resolved_ts: TimeStamp::max(), files: SlotMap::default(), flushing_files: RwLock::default(), + flushing_meta_files: RwLock::default(), last_flush_time: AtomicPtr::new(Box::into_raw(Box::new(Instant::now()))), flush_interval, total_size: AtomicUsize::new(0), flushing: AtomicBool::new(false), flush_fail_count: AtomicUsize::new(0), global_checkpoint_ts: AtomicU64::new(start_ts), + merged_file_size_limit, }) } @@ -882,24 +901,29 @@ impl StreamTaskInfo { /// Flush all template files and generate corresponding metadata. pub async fn generate_metadata(&self, store_id: u64) -> Result { - let w = self.flushing_files.read().await; + let mut w = self.flushing_files.write().await; + let mut wm = self.flushing_meta_files.write().await; // Let's flush all files first... 
- futures::future::join_all(w.iter().map(|(_, f, _)| async move { - let file = &mut f.lock().await.inner; - file.flush().await?; - file.get_ref().sync_all().await?; - Result::Ok(()) - })) + futures::future::join_all( + w.iter_mut() + .chain(wm.iter_mut()) + .map(|(_, f, _)| async move { + let encoder = &mut f.inner; + encoder.shutdown().await?; + let file = encoder.get_mut(); + file.flush().await?; + file.get_ref().sync_all().await?; + Result::Ok(()) + }), + ) .await .into_iter() .map(|r| r.map_err(Error::from)) .fold(Ok(()), Result::and)?; - let mut metadata = MetadataInfo::with_capacity(w.len()); + let mut metadata = MetadataInfo::with_capacity(w.len() + wm.len()); metadata.set_store_id(store_id); - for (_, _, file_meta) in w.iter() { - metadata.push(file_meta.to_owned()) - } + // delay push files until log files are flushed Ok(metadata) } @@ -933,7 +957,7 @@ impl StreamTaskInfo { } /// move need-flushing files to flushing_files. - pub async fn move_to_flushing_files(&self, store_id: u64) -> Result<&Self> { + pub async fn move_to_flushing_files(&self) -> Result<&Self> { // if flushing_files is not empty, which represents this flush is a retry // operation. if !self.flushing_files.read().await.is_empty() { @@ -942,20 +966,25 @@ impl StreamTaskInfo { let mut w = self.files.write().await; let mut fw = self.flushing_files.write().await; + let mut fw_meta = self.flushing_meta_files.write().await; for (k, v) in w.drain() { // we should generate file metadata(calculate sha256) when moving file. // because sha256 calculation is a unsafe move operation. // we cannot re-calculate it in retry. 
// TODO refactor move_to_flushing_files and generate_metadata - let file_meta = v.lock().await.generate_metadata(&k, store_id)?; - fw.push((k, v, file_meta)); + let mut v = v.into_inner(); + let file_meta = v.generate_metadata(&k)?; + if file_meta.is_meta { + fw_meta.push((k, v, file_meta)); + } else { + fw.push((k, v, file_meta)); + } } Ok(self) } pub async fn clear_flushing_files(&self) { - for (_, v, _) in self.flushing_files.write().await.drain(..) { - let data_file = v.lock().await; + for (_, data_file, _) in self.flushing_files.write().await.drain(..) { debug!("removing data file"; "size" => %data_file.file_size, "name" => %data_file.local_path.display()); self.total_size .fetch_sub(data_file.file_size, Ordering::SeqCst); @@ -964,69 +993,161 @@ impl StreamTaskInfo { info!("remove template file"; "err" => ?e); } } + for (_, data_file, _) in self.flushing_meta_files.write().await.drain(..) { + debug!("removing meta data file"; "size" => %data_file.file_size, "name" => %data_file.local_path.display()); + self.total_size + .fetch_sub(data_file.file_size, Ordering::SeqCst); + if let Err(e) = data_file.remove_temp_file().await { + // if remove template failed, just skip it. + info!("remove template file"; "err" => ?e); + } + } } - async fn flush_log_file_to( + async fn merge_and_flush_log_files_to( storage: Arc, - file: &Mutex, + files: &[(TempFileKey, DataFile, DataFileInfo)], + metadata: &mut MetadataInfo, + is_meta: bool, ) -> Result<()> { - let data_file = file.lock().await; + let mut data_files_open = Vec::new(); + let mut data_file_infos = Vec::new(); + let mut merged_file_info = DataFileGroup::new(); + let mut stat_length = 0; + let mut max_ts: Option = None; + let mut min_ts: Option = None; + let mut min_resolved_ts: Option = None; + for (_, data_file, file_info) in files { + let mut file_info_clone = file_info.to_owned(); + // Update offset of file_info(DataFileInfo) + // and push it into merged_file_info(DataFileGroup). 
+ file_info_clone.set_offset(stat_length); + data_files_open.push({ + let file = File::open(data_file.local_path.clone()).await?; + let compress_length = file.metadata().await?.len(); + stat_length += compress_length; + file_info_clone.set_compress_length(compress_length); + file + }); + data_file_infos.push(file_info_clone); + + let rts = file_info.resolved_ts; + min_resolved_ts = min_resolved_ts.map_or(Some(rts), |r| Some(r.min(rts))); + min_ts = min_ts.map_or(Some(file_info.min_ts), |ts| Some(ts.min(file_info.min_ts))); + max_ts = max_ts.map_or(Some(file_info.max_ts), |ts| Some(ts.max(file_info.max_ts))); + } + let min_ts = min_ts.unwrap_or_default(); + let max_ts = max_ts.unwrap_or_default(); + merged_file_info.set_path(TempFileKey::file_name( + metadata.store_id, + min_ts, + max_ts, + is_meta, + )); + merged_file_info.set_data_files_info(data_file_infos.into()); + merged_file_info.set_length(stat_length); + merged_file_info.set_max_ts(max_ts); + merged_file_info.set_min_ts(min_ts); + merged_file_info.set_min_resolved_ts(min_resolved_ts.unwrap_or_default()); + // to do: limiter to storage let limiter = Limiter::builder(std::f64::INFINITY).build(); - let reader = File::open(data_file.local_path.clone()).await?; - let stat = reader.metadata().await?; - let reader = UnpinReader(Box::new(limiter.limit(reader.compat()))); - let filepath = &data_file.storage_path; - let est_len = stat.len(); - let ret = storage.write(filepath, reader, est_len).await; + let files_reader = FilesReader::new(data_files_open); + + let reader = UnpinReader(Box::new(limiter.limit(files_reader.compat()))); + let filepath = &merged_file_info.path; + + let ret = storage.write(filepath, reader, stat_length).await; + match ret { Ok(_) => { debug!( "backup stream flush success"; - "tmp file" => ?data_file.local_path, "storage file" => ?filepath, + "est_len" => ?stat_length, ); } Err(e) => { warn!("backup stream flush failed"; - "file" => ?data_file.local_path, - "est_len" => ?est_len, + "est_len" 
=> ?stat_length, "err" => ?e, ); return Err(Error::Io(e)); } } + + // push merged file into metadata + metadata.push(merged_file_info); Ok(()) } - pub async fn flush_log(&self) -> Result<()> { - // if failed to write storage, we should retry write flushing_files. + pub async fn flush_log(&self, metadata: &mut MetadataInfo) -> Result<()> { let storage = self.storage.clone(); - let files = self.flushing_files.write().await; - - for batch_files in files.chunks(FLUSH_LOG_CONCURRENT_BATCH_COUNT) { - let futs = batch_files - .iter() - .map(|(_, v, _)| Self::flush_log_file_to(storage.clone(), v)); - futures::future::try_join_all(futs).await?; - } + self.merge_log(metadata, storage.clone(), &self.flushing_files, false) + .await?; + self.merge_log(metadata, storage.clone(), &self.flushing_meta_files, true) + .await?; Ok(()) } - pub async fn flush_meta(&self, metadata_info: MetadataInfo) -> Result<()> { - let meta_path = metadata_info.path_to_meta(); - let meta_buff = metadata_info.marshal_to()?; - let buflen = meta_buff.len(); + async fn merge_log( + &self, + metadata: &mut MetadataInfo, + storage: Arc, + files_lock: &RwLock>, + is_meta: bool, + ) -> Result<()> { + let files = files_lock.write().await; + let mut batch_size = 0; + // file[batch_begin_index, i) is a batch + let mut batch_begin_index = 0; + // TODO: upload the merged file concurrently, + // then collect merged_file_infos and push them into `metadata`. 
+ for (i, (_, _, info)) in files.iter().enumerate() { + if batch_size >= self.merged_file_size_limit { + Self::merge_and_flush_log_files_to( + storage.clone(), + &files[batch_begin_index..i], + metadata, + is_meta, + ) + .await?; - self.storage - .write( - &meta_path, - UnpinReader(Box::new(Cursor::new(meta_buff))), - buflen as _, + batch_begin_index = i; + batch_size = 0; + } + + batch_size += info.length; + } + if batch_begin_index < files.len() { + Self::merge_and_flush_log_files_to( + storage.clone(), + &files[batch_begin_index..], + metadata, + is_meta, ) .await?; + } + + Ok(()) + } + + pub async fn flush_meta(&self, metadata_info: MetadataInfo) -> Result<()> { + if !metadata_info.file_groups.is_empty() { + let meta_path = metadata_info.path_to_meta(); + let meta_buff = metadata_info.marshal_to()?; + let buflen = meta_buff.len(); + + self.storage + .write( + &meta_path, + UnpinReader(Box::new(Cursor::new(meta_buff))), + buflen as _, + ) + .await?; + } Ok(()) } @@ -1055,25 +1176,29 @@ impl StreamTaskInfo { // generate meta data and prepare to flush to storage let mut metadata_info = self - .move_to_flushing_files(store_id) + .move_to_flushing_files() .await? .generate_metadata(store_id) .await?; - metadata_info.min_resolved_ts = metadata_info - .min_resolved_ts - .max(Some(resolved_ts_provided.into_inner())); - let rts = metadata_info.min_resolved_ts; crate::metrics::FLUSH_DURATION .with_label_values(&["generate_metadata"]) .observe(sw.lap().as_secs_f64()); // flush log file to storage. - self.flush_log().await?; + self.flush_log(&mut metadata_info).await?; + + // the field `min_resolved_ts` of metadata will be updated + // only after flush is done. 
+ metadata_info.min_resolved_ts = metadata_info + .min_resolved_ts + .max(Some(resolved_ts_provided.into_inner())); + let rts = metadata_info.min_resolved_ts; + // compress length let file_size_vec = metadata_info - .files + .file_groups .iter() - .map(|d| d.length) + .map(|d| (d.length, d.data_files_info.len())) .collect::>(); // flush meta file to storage. self.flush_meta(metadata_info).await?; @@ -1088,10 +1213,11 @@ impl StreamTaskInfo { .observe(sw.lap().as_secs_f64()); file_size_vec .iter() - .for_each(|size| crate::metrics::FLUSH_FILE_SIZE.observe(*size as _)); + .for_each(|(size, _)| crate::metrics::FLUSH_FILE_SIZE.observe(*size as _)); info!("log backup flush done"; - "files" => %file_size_vec.len(), - "total_size" => %file_size_vec.iter().sum::(), + "merged_files" => %file_size_vec.len(), // the number of the merged files + "files" => %file_size_vec.iter().map(|(_, v)| v).sum::(), + "total_size" => %file_size_vec.iter().map(|(v, _)| v).sum::(), // the size of the merged files after compressed "take" => ?begin.saturating_elapsed(), ); Ok(rts) @@ -1152,18 +1278,20 @@ struct DataFile { resolved_ts: TimeStamp, min_begin_ts: Option, sha256: Hasher, - inner: BufWriter, + // TODO: use lz4 with async feature + inner: ZstdEncoder>, start_key: Vec, end_key: Vec, number_of_entries: usize, file_size: usize, local_path: PathBuf, - storage_path: String, } #[derive(Debug)] pub struct MetadataInfo { - pub files: Vec, + // the field files is deprecated in v6.3.0 + // pub files: Vec, + pub file_groups: Vec, pub min_resolved_ts: Option, pub min_ts: Option, pub max_ts: Option, @@ -1173,7 +1301,7 @@ pub struct MetadataInfo { impl MetadataInfo { fn with_capacity(cap: usize) -> Self { Self { - files: Vec::with_capacity(cap), + file_groups: Vec::with_capacity(cap), min_resolved_ts: None, min_ts: None, max_ts: None, @@ -1185,8 +1313,8 @@ impl MetadataInfo { self.store_id = store_id; } - fn push(&mut self, file: DataFileInfo) { - let rts = file.resolved_ts; + fn push(&mut self, 
file: DataFileGroup) { + let rts = file.min_resolved_ts; self.min_resolved_ts = self.min_resolved_ts.map_or(Some(rts), |r| Some(r.min(rts))); self.min_ts = self .min_ts @@ -1194,16 +1322,17 @@ impl MetadataInfo { self.max_ts = self .max_ts .map_or(Some(file.max_ts), |ts| Some(ts.max(file.max_ts))); - self.files.push(file); + self.file_groups.push(file); } fn marshal_to(self) -> Result> { let mut metadata = Metadata::new(); - metadata.set_files(self.files.into()); + metadata.set_file_groups(self.file_groups.into()); metadata.set_store_id(self.store_id as _); metadata.set_resolved_ts(self.min_resolved_ts.unwrap_or_default()); metadata.set_min_ts(self.min_ts.unwrap_or(0)); metadata.set_max_ts(self.max_ts.unwrap_or(0)); + metadata.set_meta_version(MetaVersion::V2); metadata .write_to_bytes() @@ -1212,7 +1341,7 @@ impl MetadataInfo { fn path_to_meta(&self) -> String { format!( - "v1/backupmeta/{:012}-{}.meta", + "v1/backupmeta/{}-{}.meta", self.min_resolved_ts.unwrap_or_default(), uuid::Uuid::new_v4() ) @@ -1225,19 +1354,19 @@ impl DataFile { async fn new(local_path: impl AsRef) -> Result { let sha256 = Hasher::new(MessageDigest::sha256()) .map_err(|err| Error::Other(box_err!("openssl hasher failed to init: {}", err)))?; + let inner = BufWriter::with_capacity(128 * 1024, File::create(local_path.as_ref()).await?); Ok(Self { min_ts: TimeStamp::max(), max_ts: TimeStamp::zero(), resolved_ts: TimeStamp::zero(), min_begin_ts: None, - inner: BufWriter::with_capacity(128 * 1024, File::create(local_path.as_ref()).await?), + inner: ZstdEncoder::with_quality(inner, Level::Fastest), sha256, number_of_entries: 0, file_size: 0, start_key: vec![], end_key: vec![], local_path: local_path.as_ref().to_owned(), - storage_path: String::default(), }) } @@ -1313,15 +1442,11 @@ impl DataFile { } } - /// generage path for log file before flushing to Storage - fn set_storage_path(&mut self, path: String) { - self.storage_path = path; - } - - /// generate the metadata in protocol buffer of the 
file. - fn generate_metadata(&mut self, file_key: &TempFileKey, store_id: u64) -> Result { - self.set_storage_path(file_key.file_name(store_id, self.min_ts, self.max_ts)); - + /// generate the metadata v2 where each file becomes a part of the merged + /// file. + fn generate_metadata(&mut self, file_key: &TempFileKey) -> Result { + // Note: the field `storage_path` is empty!!! It will be stored in the upper + // layer `DataFileGroup`. let mut meta = DataFileInfo::new(); meta.set_sha256( self.sha256 @@ -1329,7 +1454,6 @@ impl DataFile { .map(|bytes| bytes.to_vec()) .map_err(|err| Error::Other(box_err!("openssl hasher failed to init: {}", err)))?, ); - meta.set_path(self.storage_path.clone()); meta.set_number_of_entries(self.number_of_entries as _); meta.set_max_ts(self.max_ts.into_inner() as _); meta.set_min_ts(self.min_ts.into_inner() as _); @@ -1385,7 +1509,7 @@ mod tests { codec::number::NumberEncoder, worker::{dummy_scheduler, ReceiverWrapper}, }; - use tokio::{fs::File, sync::Mutex}; + use tokio::fs::File; use txn_types::{Write, WriteType}; use super::*; @@ -1550,6 +1674,7 @@ mod tests { utils::wrap_key(make_table_key(table_id, b"")), utils::wrap_key(make_table_key(table_id + 1, b"")), )], + 0x100000, ) .await .expect("failed to register task") @@ -1593,43 +1718,56 @@ mod tests { let end_ts = TimeStamp::physical_now(); let files = router.tasks.lock().await.get("dummy").unwrap().clone(); - let meta = files - .move_to_flushing_files(1) + let mut meta = files + .move_to_flushing_files() .await? 
.generate_metadata(1) .await?; - assert_eq!(meta.files.len(), 3, "test file len = {}", meta.files.len()); + assert!( - meta.files.iter().all(|item| { - TimeStamp::new(item.min_ts as _).physical() >= start_ts - && TimeStamp::new(item.max_ts as _).physical() <= end_ts - && item.min_ts <= item.max_ts - }), + meta.file_groups + .iter() + .all(|group| group.data_files_info.iter().all(|item| { + TimeStamp::new(item.min_ts as _).physical() >= start_ts + && TimeStamp::new(item.max_ts as _).physical() <= end_ts + && item.min_ts <= item.max_ts + })), "meta = {:#?}; start ts = {}, end ts = {}", - meta.files, + meta.file_groups, start_ts, end_ts ); // in some case when flush failed to write files to storage. // we may run `generate_metadata` again with same files. - let another_meta = files - .move_to_flushing_files(1) + let mut another_meta = files + .move_to_flushing_files() .await? .generate_metadata(1) .await?; - assert_eq!(meta.files.len(), another_meta.files.len()); - for i in 0..meta.files.len() { - let file1 = meta.files.get(i).unwrap(); - let file2 = another_meta.files.get(i).unwrap(); + files.flush_log(&mut meta).await?; + files.flush_log(&mut another_meta).await?; + // meta updated + let files_num = meta + .file_groups + .iter() + .map(|v| v.data_files_info.len()) + .sum::(); + assert_eq!(files_num, 3, "test file len = {}", files_num); + for i in 0..meta.file_groups.len() { + let file_groups1 = meta.file_groups.get(i).unwrap(); + let file_groups2 = another_meta.file_groups.get(i).unwrap(); // we have to make sure two times sha256 of file must be the same. 
- assert_eq!(file1.sha256, file2.sha256); - assert_eq!(file1.start_key, file2.start_key); - assert_eq!(file1.end_key, file2.end_key); + for j in 0..file_groups1.data_files_info.len() { + let file1 = file_groups1.data_files_info.get(j).unwrap(); + let file2 = file_groups2.data_files_info.get(j).unwrap(); + assert_eq!(file1.sha256, file2.sha256); + assert_eq!(file1.start_key, file2.start_key); + assert_eq!(file1.end_key, file2.end_key); + } } - files.flush_log().await?; files.flush_meta(meta).await?; files.clear_flushing_files().await; @@ -1662,13 +1800,18 @@ mod tests { } assert_eq!(meta_count, 1); - assert_eq!(log_count, 3); + assert_eq!(log_count, 2); // flush twice Ok(()) } - fn mock_build_kv_events(table_id: i64, region_id: u64, resolved_ts: u64) -> ApplyEvents { + fn mock_build_large_kv_events(table_id: i64, region_id: u64, resolved_ts: u64) -> ApplyEvents { let mut events_builder = KvEventsBuilder::new(region_id, resolved_ts); - events_builder.put_table("default", table_id, b"hello", b"world"); + events_builder.put_table( + "default", + table_id, + b"hello", + "world".repeat(1024).as_bytes(), + ); events_builder.finish() } @@ -1682,19 +1825,21 @@ mod tests { info: task_info, is_paused: false, }; + let merged_file_size_limit = 0x10000; let task = StreamTaskInfo::new( tmp_dir.path().to_path_buf(), stream_task, Duration::from_secs(300), vec![(vec![], vec![])], + merged_file_size_limit, ) .await .unwrap(); // on_event - let region_count = FLUSH_LOG_CONCURRENT_BATCH_COUNT + 5; + let region_count = merged_file_size_limit / (4 * 1024); // 2 merged log files for i in 1..=region_count { - let kv_events = mock_build_kv_events(i as _, i as _, i as _); + let kv_events = mock_build_large_kv_events(i as _, i as _, i as _); task.on_events(kv_events).await.unwrap(); } // do_flush @@ -1716,7 +1861,7 @@ mod tests { } } assert_eq!(meta_count, 1); - assert_eq!(log_count, region_count); + assert_eq!(log_count, 2); } struct ErrorStorage { @@ -1779,6 +1924,15 @@ mod tests { fn 
read(&self, name: &str) -> Box { self.inner.read(name) } + + fn read_part( + &self, + name: &str, + off: u64, + len: u64, + ) -> Box { + self.inner.read_part(name, off, len) + } } fn build_kv_event(base: i32, count: i32) -> ApplyEvents { @@ -1845,6 +1999,7 @@ mod tests { is_paused: false, }, vec![], + 0x100000, ) .await .unwrap(); @@ -1871,7 +2026,7 @@ mod tests { router .get_task_info("cleanup_test") .await? - .move_to_flushing_files(1) + .move_to_flushing_files() .await?; write_simple_data(&router).await; let mut w = walkdir::WalkDir::new(&tmp).into_iter(); @@ -2039,6 +2194,7 @@ mod tests { stream_task, Duration::from_secs(300), vec![(vec![], vec![])], + 0x100000, ) .await .unwrap(); @@ -2115,6 +2271,10 @@ mod tests { fn read(&self, name: &str) -> Box { self.s.read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + self.s.read_part(name, off, len) + } } #[tokio::test] @@ -2126,7 +2286,19 @@ mod tests { f.write_all("test-data".as_bytes()).await?; let data_file = DataFile::new(file_path).await.unwrap(); - let result = StreamTaskInfo::flush_log_file_to(Arc::new(ms), &Mutex::new(data_file)).await; + let info = DataFileInfo::new(); + + let mut meta = MetadataInfo::with_capacity(1); + let kv_event = build_kv_event(1, 1); + let tmp_key = TempFileKey::of(&kv_event.events[0], 1); + let files = vec![(tmp_key, data_file, info)]; + let result = StreamTaskInfo::merge_and_flush_log_files_to( + Arc::new(ms), + &files[0..], + &mut meta, + false, + ) + .await; assert_eq!(result.is_ok(), true); Ok(()) } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index ac1b3dec168..22163eccf5f 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use core::pin::Pin; use std::{ borrow::Borrow, collections::{hash_map::RandomState, BTreeMap, HashMap}, @@ -13,7 +14,7 @@ use std::{ use engine_rocks::ReadPerfInstant; use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; -use futures::{channel::mpsc, executor::block_on, FutureExt, StreamExt}; +use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; use kvproto::raft_cmdpb::{CmdType, Request}; use raft::StateRole; use raftstore::{coprocessor::RegionInfoProvider, RegionInfo}; @@ -28,7 +29,11 @@ use tikv_util::{ worker::Scheduler, Either, }; -use tokio::sync::{oneshot, Mutex, RwLock}; +use tokio::{ + fs::File, + io::AsyncRead, + sync::{oneshot, Mutex, RwLock}, +}; use txn_types::{Key, Lock, LockType}; use crate::{ @@ -589,6 +594,39 @@ pub fn is_overlapping(range: (&[u8], &[u8]), range2: (&[u8], &[u8])) -> bool { } } +pub struct FilesReader { + files: Vec, + index: usize, +} + +impl FilesReader { + pub fn new(files: Vec) -> Self { + FilesReader { files, index: 0 } + } +} + +impl AsyncRead for FilesReader { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll> { + let me = self.get_mut(); + + while me.index < me.files.len() { + let rem = buf.remaining(); + ready!(Pin::new(&mut me.files[me.index]).poll_read(cx, buf))?; + if buf.remaining() == rem { + me.index += 1; + } else { + return Poll::Ready(Ok(())); + } + } + + Poll::Ready(Ok(())) + } +} + #[cfg(test)] mod test { use std::{ @@ -601,6 +639,7 @@ mod test { use engine_traits::WriteOptions; use futures::executor::block_on; + use tokio::io::AsyncWriteExt; use crate::utils::{is_in_range, CallbackWaitGroup, SegmentMap}; @@ -788,4 +827,41 @@ mod test { items_size ); } + + #[tokio::test] + async fn test_files_reader() { + use tempdir::TempDir; + use tokio::{fs::File, io::AsyncReadExt}; + + use super::FilesReader; + + let dir = TempDir::new("test_files").unwrap(); + let files_num = 5; + let mut files_path 
= Vec::new(); + let mut expect_content = String::new(); + for i in 0..files_num { + let path = dir.path().join(format!("f{}", i)); + let mut file = File::create(&path).await.unwrap(); + let content = format!("{i}_{i}_{i}_{i}_{i}\n{i}{i}{i}{i}\n").repeat(10); + file.write_all(content.as_bytes()).await.unwrap(); + file.sync_all().await.unwrap(); + + files_path.push(path); + expect_content.push_str(&content); + } + + let mut files = Vec::new(); + for i in 0..files_num { + let file = File::open(&files_path[i]).await.unwrap(); + files.push(file); + } + + let mut files_reader = FilesReader::new(files); + let mut read_content = String::new(); + files_reader + .read_to_string(&mut read_content) + .await + .unwrap(); + assert_eq!(expect_content, read_content); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 4a437421dac..de9b9893567 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -9,6 +9,7 @@ use std::{ time::Duration, }; +use async_compression::futures::write::ZstdDecoder; use backup_stream::{ errors::Result, metadata::{ @@ -19,14 +20,15 @@ use backup_stream::{ router::Router, Endpoint, Task, }; -use futures::{executor::block_on, Future}; +use futures::{executor::block_on, AsyncWriteExt, Future}; use grpcio::ChannelBuilder; use kvproto::{ - brpb::{Local, StorageBackend}, + brpb::{Local, Metadata, StorageBackend}, kvrpcpb::*, tikvpb::*, }; use pd_client::PdClient; +use protobuf::parse_from_bytes; use tempdir::TempDir; use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use test_util::retry; @@ -361,7 +363,37 @@ impl Suite { } } - fn check_for_write_records<'a>( + fn load_metadata_for_write_records(&self, path: &Path) -> HashMap> { + let mut meta_map: HashMap> = HashMap::new(); + for entry in WalkDir::new(path) { + let entry = entry.unwrap(); + if entry.file_type().is_file() + && entry + .file_name() + .to_str() + .map_or(false, |s| s.ends_with(".meta")) + { 
+ let content = std::fs::read(entry.path()).unwrap(); + let meta = parse_from_bytes::(content.as_ref()).unwrap(); + for g in meta.file_groups.into_iter() { + let path = g.path.split('/').last().unwrap(); + for f in g.data_files_info.into_iter() { + let file_info = meta_map.get_mut(path); + if let Some(v) = file_info { + v.push((f.offset as usize, (f.offset + f.compress_length) as usize)); + } else { + let v = + vec![(f.offset as usize, (f.offset + f.compress_length) as usize)]; + meta_map.insert(String::from(path), v); + } + } + } + } + } + meta_map + } + + async fn check_for_write_records<'a>( &self, path: &Path, key_set: impl std::iter::Iterator, @@ -370,6 +402,7 @@ impl Suite { let n = remain_keys.len(); let mut extra_key = 0; let mut extra_len = 0; + let meta_map = self.load_metadata_for_write_records(path); for entry in WalkDir::new(path) { let entry = entry.unwrap(); println!("checking: {:?}", entry); @@ -379,21 +412,31 @@ impl Suite { .to_str() .map_or(false, |s| s.ends_with(".log")) { - let content = std::fs::read(entry.path()).unwrap(); - let mut iter = EventIterator::new(content); - loop { - if !iter.valid() { - break; - } - iter.next().unwrap(); - if !remain_keys.remove(iter.key()) { - extra_key += 1; - extra_len += iter.key().len() + iter.value().len(); + let buf = std::fs::read(entry.path()).unwrap(); + let file_infos = meta_map.get(entry.file_name().to_str().unwrap()).unwrap(); + for &file_info in file_infos { + let mut decoder = ZstdDecoder::new(Vec::new()); + let pbuf: &[u8] = &buf[file_info.0..file_info.1]; + decoder.write_all(pbuf).await.unwrap(); + decoder.flush().await.unwrap(); + decoder.close().await.unwrap(); + let content = decoder.into_inner(); + + let mut iter = EventIterator::new(content); + loop { + if !iter.valid() { + break; + } + iter.next().unwrap(); + if !remain_keys.remove(iter.key()) { + extra_key += 1; + extra_len += iter.key().len() + iter.value().len(); + } + + let value = iter.value(); + let wf = 
WriteRef::parse(value).unwrap(); + assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); } - - let value = iter.value(); - let wf = WriteRef::parse(value).unwrap(); - assert_eq!(wf.short_value, Some(b"hello, world" as &[u8])); } } } @@ -671,10 +714,12 @@ mod test { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_basic"); suite.wait_for_flush(); - suite.check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ); + suite + .check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ) + .await; }); suite.cluster.shutdown(); } @@ -691,10 +736,12 @@ mod test { let round2 = suite.write_records(256, 128, 1).await; suite.force_flush_files("test_with_split"); suite.wait_for_flush(); - suite.check_for_write_records( - suite.flushed_files.path(), - round1.union(&round2).map(Vec::as_slice), - ); + suite + .check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + ) + .await; }); suite.cluster.shutdown(); } @@ -743,10 +790,12 @@ mod test { .into_encoded() }) .collect::>(); - suite.check_for_write_records( - suite.flushed_files.path(), - keys_encoded.iter().map(Vec::as_slice), - ); + suite + .check_for_write_records( + suite.flushed_files.path(), + keys_encoded.iter().map(Vec::as_slice), + ) + .await; }); suite.cluster.shutdown(); } @@ -765,10 +814,10 @@ mod test { let round2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("test_leader_down"); suite.wait_for_flush(); - suite.check_for_write_records( + run_async_test(suite.check_for_write_records( suite.flushed_files.path(), round1.union(&round2).map(Vec::as_slice), - ); + )); suite.cluster.shutdown(); } @@ -944,10 +993,10 @@ mod test { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("region_failure"); suite.wait_for_flush(); - suite.check_for_write_records( + 
run_async_test(suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - ); + )); } #[test] @@ -969,10 +1018,10 @@ mod test { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("initial_scan_failure"); suite.wait_for_flush(); - suite.check_for_write_records( + run_async_test(suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - ); + )); } #[test] @@ -1029,10 +1078,10 @@ mod test { let keys2 = run_async_test(suite.write_records(256, 128, 1)); suite.force_flush_files("fail_to_refresh_region"); suite.wait_for_flush(); - suite.check_for_write_records( + run_async_test(suite.check_for_write_records( suite.flushed_files.path(), keys.union(&keys2).map(|s| s.as_slice()), - ); + )); let leader = suite.cluster.leader_of_region(1).unwrap().store_id; let (tx, rx) = std::sync::mpsc::channel(); suite.endpoints[&leader] diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 05f418e4c3a..469cac97d6c 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -221,6 +221,37 @@ impl S3Storage { } key.to_owned() } + + fn get_range(&self, name: &str, range: Option) -> Box { + let key = self.maybe_prefix_key(name); + let bucket = self.config.bucket.bucket.clone(); + debug!("read file from s3 storage"; "key" => %key); + let req = GetObjectRequest { + key, + bucket: (*bucket).clone(), + range, + ..Default::default() + }; + Box::new( + self.client + .get_object(req) + .map(move |future| match future { + Ok(out) => out.body.unwrap(), + Err(RusotoError::Service(GetObjectError::NoSuchKey(key))) => { + ByteStream::new(error_stream(io::Error::new( + io::ErrorKind::NotFound, + format!("no key {} at bucket {}", key, *bucket), + ))) + } + Err(e) => ByteStream::new(error_stream(io::Error::new( + io::ErrorKind::Other, + format!("failed to get object {}", e), + ))), + }) + .flatten_stream() + .into_async_read(), + ) + } } 
/// A helper for uploading a large files to S3 storage. @@ -565,33 +596,12 @@ impl BlobStorage for S3Storage { } fn get(&self, name: &str) -> Box { - let key = self.maybe_prefix_key(name); - let bucket = self.config.bucket.bucket.clone(); - debug!("read file from s3 storage"; "key" => %key); - let req = GetObjectRequest { - key, - bucket: (*bucket).clone(), - ..Default::default() - }; - Box::new( - self.client - .get_object(req) - .map(move |future| match future { - Ok(out) => out.body.unwrap(), - Err(RusotoError::Service(GetObjectError::NoSuchKey(key))) => { - ByteStream::new(error_stream(io::Error::new( - io::ErrorKind::NotFound, - format!("no key {} at bucket {}", key, *bucket), - ))) - } - Err(e) => ByteStream::new(error_stream(io::Error::new( - io::ErrorKind::Other, - format!("failed to get object {}", e), - ))), - }) - .flatten_stream() - .into_async_read(), - ) + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + // inclusive, bytes=0-499 -> [0, 499] + self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 2d7f2566509..5bf02696de7 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -553,6 +553,33 @@ impl AzureStorage { } key.to_owned() } + + fn get_range( + &self, + name: &str, + range: Option>, + ) -> Box { + let name = self.maybe_prefix_key(name); + debug!("read file from Azure storage"; "key" => %name); + let t = async move { + let blob_client = self.client_builder.get_client().await?.as_blob_client(name); + + let builder = if let Some(r) = range { + blob_client.get().range(r) + } else { + blob_client.get() + }; + + builder + .execute() + .await + .map(|res| res.data) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))) + }; + let k = stream::once(t); + let t = k.boxed().into_async_read(); + Box::new(t) + } } #[async_trait] @@ 
-576,22 +603,11 @@ impl BlobStorage for AzureStorage { } fn get(&self, name: &str) -> Box { - let name = self.maybe_prefix_key(name); - debug!("read file from Azure storage"; "key" => %name); - let t = async move { - self.client_builder - .get_client() - .await? - .as_blob_client(name) - .get() - .execute() - .await - .map(|res| res.data) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))) - }; - let k = stream::once(t); - let t = k.boxed().into_async_read(); - Box::new(t) + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + self.get_range(name, Some(off..off + len)) } } diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index 799d1b02ee9..e8e8ad20ee9 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -10,6 +10,7 @@ use futures_util::{ io::{AsyncRead, AsyncReadExt, Cursor}, stream::{StreamExt, TryStreamExt}, }; +use http::HeaderValue; use hyper::{client::HttpConnector, Body, Client, Request, Response, StatusCode}; use hyper_tls::HttpsConnector; pub use kvproto::brpb::{Bucket as InputBucket, CloudDynamic, Gcs as InputConfig}; @@ -345,6 +346,49 @@ impl GcsStorage { { Box::new(error_stream(io::Error::new(kind, e)).into_async_read()) } + + fn get_range(&self, name: &str, range: Option) -> Box { + let bucket = self.config.bucket.bucket.to_string(); + let name = self.maybe_prefix_key(name); + debug!("read file from GCS storage"; "key" => %name); + let oid = match ObjectId::new(bucket, name) { + Ok(oid) => oid, + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), + }; + let mut request = match Object::download(&oid, None /* optional */) { + Ok(request) => request.map(|_: io::Empty| Body::empty()), + Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::Other, e), + }; + if let Some(r) = range { + let header_value = match HeaderValue::from_str(&r) { + Ok(v) => v, + Err(e) => return 
GcsStorage::error_to_async_read(io::ErrorKind::Other, e), + }; + request.headers_mut().insert("Range", header_value); + } + Box::new( + self.make_request(request, tame_gcs::Scopes::ReadOnly) + .and_then(|response| async { + if response.status().is_success() { + Ok(response.into_body().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("download from GCS error: {}", e), + ) + })) + } else { + Err(status_code_error( + response.status(), + "bucket read".to_string(), + )) + } + }) + .err_into::() + .try_flatten_stream() + .boxed() // this `.boxed()` pin the stream. + .into_async_read(), + ) + } } fn change_host(host: &StringNonEmpty, url: &str) -> Option { @@ -449,39 +493,12 @@ impl BlobStorage for GcsStorage { } fn get(&self, name: &str) -> Box { - let bucket = self.config.bucket.bucket.to_string(); - let name = self.maybe_prefix_key(name); - debug!("read file from GCS storage"; "key" => %name); - let oid = match ObjectId::new(bucket, name) { - Ok(oid) => oid, - Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::InvalidInput, e), - }; - let request = match Object::download(&oid, None /* optional */) { - Ok(request) => request.map(|_: io::Empty| Body::empty()), - Err(e) => return GcsStorage::error_to_async_read(io::ErrorKind::Other, e), - }; - Box::new( - self.make_request(request, tame_gcs::Scopes::ReadOnly) - .and_then(|response| async { - if response.status().is_success() { - Ok(response.into_body().map_err(|e| { - io::Error::new( - io::ErrorKind::Other, - format!("download from GCS error: {}", e), - ) - })) - } else { - Err(status_code_error( - response.status(), - "bucket read".to_string(), - )) - } - }) - .err_into::() - .try_flatten_stream() - .boxed() // this `.boxed()` pin the stream. 
- .into_async_read(), - ) + self.get_range(name, None) + } + + fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + // inclusive, bytes=0-499 -> [0, 499] + self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } } diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index 2e38097e385..d80d3a47a28 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -46,6 +46,9 @@ pub trait BlobStorage: 'static + Send + Sync { /// Read all contents of the given path. fn get(&self, name: &str) -> Box; + + /// Read part of contents of the given path. + fn get_part(&self, name: &str, off: u64, len: u64) -> Box; } impl BlobConfig for dyn BlobStorage { @@ -72,6 +75,10 @@ impl BlobStorage for Box { fn get(&self, name: &str) -> Box { (**self).get(name) } + + fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + (**self).get_part(name, off, len) + } } #[derive(Clone, Debug, PartialEq)] diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 049f8ab2e43..b74af6ff39d 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -16,6 +16,7 @@ cloud-storage-grpc = [ failpoints = ["fail/failpoints"] [dependencies] +async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } async-trait = "0.1" bytes = "1.0" encryption = { path = "../encryption" } diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index 1f75af2734a..82ff01c2afb 100644 --- a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -74,6 +74,7 @@ tokio = { version = "1.5", features = ["time", "rt", "net"], optional = true } tokio-util = { version = "0.7", features = ["compat"], optional = true } url = "2.0" async-trait = "0.1" +async-compression = { version = "0.3.14", features = ["futures-io", "zstd"]} [dev-dependencies] matches = "0.1.8" diff --git 
a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index 0fb24ef48ce..6ce16334aef 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -9,6 +9,7 @@ use std::{ sync::Arc, }; +use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; #[cfg(feature = "cloud-aws")] pub use aws::{Config as S3Config, S3Storage}; @@ -28,6 +29,7 @@ pub use external_storage::{ read_external_storage_into_file, ExternalStorage, LocalStorage, NoopStorage, UnpinReader, }; use futures_io::AsyncRead; +use futures_util::io::BufReader; #[cfg(feature = "cloud-gcp")] pub use gcp::{Config as GcsConfig, GcsStorage}; pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; @@ -324,16 +326,25 @@ impl ExternalStorage for EncryptedExternalStorage { fn read(&self, name: &str) -> Box { self.storage.read(name) } + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + self.storage.read_part(name, off, len) + } fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, + compressed_range: Option<(u64, u64)>, expected_length: u64, expected_sha256: Option>, speed_limiter: &Limiter, file_crypter: Option, ) -> io::Result<()> { - let reader = self.read(storage_name); + let reader = if let Some((off, len)) = compressed_range { + let r = self.read_part(storage_name, off, len); + Box::new(ZstdDecoder::new(BufReader::new(r))) + } else { + self.read(storage_name) + }; let file_writer: &mut dyn Write = &mut self.key_manager.create_file_for_write(&restore_name)?; let min_read_speed: usize = 8192; @@ -367,4 +378,8 @@ impl ExternalStorage for BlobStore { fn read(&self, name: &str) -> Box { (**self).get(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + (**self).get_part(name, off, len) + } } diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index 175104d06cb..53574633c73 100644 --- 
a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -134,6 +134,15 @@ impl ExternalStorage for HdfsStorage { fn read(&self, _name: &str) -> Box { unimplemented!("currently only HDFS export is implemented") } + + fn read_part( + &self, + _name: &str, + _off: u64, + _len: u64, + ) -> Box { + unimplemented!("currently only HDFS export is implemented") + } } #[cfg(test)] diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index afae433e54a..97f0f83ddbc 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -15,10 +15,12 @@ use std::{ time::Duration, }; +use async_compression::futures::bufread::ZstdDecoder; use async_trait::async_trait; use encryption::{from_engine_encryption_method, DecrypterReader, Iv}; use engine_traits::FileEncryptionInfo; use file_system::File; +use futures::io::BufReader; use futures_io::AsyncRead; use futures_util::AsyncReadExt; use openssl::hash::{Hasher, MessageDigest}; @@ -75,17 +77,26 @@ pub trait ExternalStorage: 'static + Send + Sync { /// Read all contents of the given path. fn read(&self, name: &str) -> Box; + /// Read part of contents of the given path. + fn read_part(&self, name: &str, off: u64, len: u64) -> Box; + /// Read from external storage and restore to the given path fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, + compressed_range: Option<(u64, u64)>, expected_length: u64, expected_sha256: Option>, speed_limiter: &Limiter, file_crypter: Option, ) -> io::Result<()> { - let reader = self.read(storage_name); + let reader: Box = if let Some((off, len)) = compressed_range { + let r = self.read_part(storage_name, off, len); + Box::new(ZstdDecoder::new(BufReader::new(r))) + } else { + self.read(storage_name) + }; let output: &mut dyn Write = &mut File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. 
// if reading speed is slower than this rate, we will stop with @@ -122,6 +133,10 @@ impl ExternalStorage for Arc { fn read(&self, name: &str) -> Box { (**self).read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + (**self).read_part(name, off, len) + } } #[async_trait] @@ -141,6 +156,10 @@ impl ExternalStorage for Box { fn read(&self, name: &str) -> Box { self.as_ref().read(name) } + + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + self.as_ref().read_part(name, off, len) + } } /// Wrap the reader with file_crypter. diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index 80c22929525..4b22de96a6a 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -2,7 +2,7 @@ use std::{ fs::File as StdFile, - io, + io::{self, BufReader, Read, Seek}, marker::Unpin, path::{Path, PathBuf}, sync::Arc, @@ -130,6 +130,23 @@ impl ExternalStorage for LocalStorage { Err(e) => Box::new(error_stream(e).into_async_read()) as _, } } + + fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + debug!("read part of file from local storage"; + "name" => %name, "off" => %off, "len" => %len, "base" => %self.base.display()); + + let mut file = match StdFile::open(self.base.join(name)) { + Ok(file) => file, + Err(e) => return Box::new(error_stream(e).into_async_read()) as _, + }; + match file.seek(std::io::SeekFrom::Start(off)) { + Ok(_) => (), + Err(e) => return Box::new(error_stream(e).into_async_read()) as _, + }; + let reader = BufReader::new(file); + let take = reader.take(len); + Box::new(AllowStdIo::new(take)) as _ + } } #[cfg(test)] diff --git a/components/external_storage/src/noop.rs b/components/external_storage/src/noop.rs index cb590ca6e44..42746742624 100644 --- a/components/external_storage/src/noop.rs +++ b/components/external_storage/src/noop.rs @@ -47,6 +47,10 @@ impl ExternalStorage for NoopStorage { fn read(&self, _name: &str) -> Box { 
Box::new(io::empty().compat()) } + + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> Box { + Box::new(io::empty().compat()) + } } #[cfg(test)] diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 806066bd202..405991b1efe 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -225,6 +225,7 @@ impl SstImporter { fn download_file_from_external_storage( &self, + compressed_range: Option<(u64, u64)>, file_length: u64, src_file_name: &str, dst_file: std::path::PathBuf, @@ -265,6 +266,7 @@ impl SstImporter { let result = ext_storage.restore( src_file_name, dst_file.clone(), + compressed_range, file_length, expect_sha256, speed_limiter, @@ -300,8 +302,10 @@ impl SstImporter { backend: &StorageBackend, speed_limiter: &Limiter, ) -> Result { - let name = meta.get_name(); - let path = self.dir.get_import_path(name)?; + let offset = meta.get_offset(); + let src_name = meta.get_name(); + let dst_name = format!("{}_{}", src_name, offset); + let path = self.dir.get_import_path(&dst_name)?; let start = Instant::now(); let sha256 = meta.get_sha256().to_vec(); let expected_sha256 = if !sha256.is_empty() { @@ -313,16 +317,22 @@ impl SstImporter { return Ok(path.save); } - let lock = self.file_locks.entry(name.to_string()).or_default(); + let lock = self.file_locks.entry(dst_name.to_string()).or_default(); if path.save.exists() { return Ok(path.save); } + let length = meta.get_compress_length(); + let compressed_range = if length == 0 { + None + } else { + Some((offset, length)) + }; self.download_file_from_external_storage( - // don't check file length after download file for now. 
+ compressed_range, meta.get_length(), - name, + src_name, path.temp.clone(), backend, expected_sha256, @@ -335,7 +345,7 @@ impl SstImporter { None, speed_limiter, )?; - info!("download file finished {}", name); + info!("download file finished {}, offset {}", src_name, offset); if let Some(p) = path.save.parent() { // we have v1 prefix in file name. @@ -347,10 +357,11 @@ impl SstImporter { } })?; } + file_system::rename(path.temp, path.save.clone())?; drop(lock); - self.file_locks.remove(name); + self.file_locks.remove(&dst_name); IMPORTER_APPLY_DURATION .with_label_values(&["download"]) @@ -494,6 +505,7 @@ impl SstImporter { }); self.download_file_from_external_storage( + None, meta.length, name, path.temp.clone(), @@ -1251,6 +1263,7 @@ mod tests { let path = importer.dir.get_import_path(file_name).unwrap(); importer .download_file_from_external_storage( + None, meta.get_length(), file_name, path.temp.clone(), @@ -1285,6 +1298,7 @@ mod tests { let path = importer.dir.get_import_path(kv_meta.get_name()).unwrap(); importer .download_file_from_external_storage( + None, kv_meta.get_length(), kv_meta.get_name(), path.temp.clone(), From 8c93b91fea7396b9261551297d1bfa037c82e605 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Tue, 13 Sep 2022 15:31:00 +0800 Subject: [PATCH 206/676] server: support customized addr/status_addr (#13234) ref tikv/tikv#12849 Support self-defined addr/status_addr Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/server/src/server.rs | 2 +- components/test_raftstore/src/server.rs | 2 +- src/server/node.rs | 53 +++++++++++++++++-------- src/server/status_server/mod.rs | 9 +++-- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ba4c515557e..8b49becc8e3 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -728,7 +728,7 @@ where storage_read_pools.handle() }; - let storage = create_raft_storage::<_, _, _, F>( + 
let storage = create_raft_storage::<_, _, _, F, _>( engines.engine.clone(), &self.config.storage, storage_read_pool_handle, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index f1626b9f2c9..72282f02dc0 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -405,7 +405,7 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); - let store = create_raft_storage::<_, _, _, F>( + let store = create_raft_storage::<_, _, _, F, _>( engine, &cfg.storage, storage_read_pool.handle(), diff --git a/src/server/node.rs b/src/server/node.rs index d8bee9abfd7..0916ebc8b9c 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -35,9 +35,9 @@ use super::{RaftKv, Result}; use crate::{ import::SstImporter, read_pool::ReadPoolHandle, - server::{lock_manager::LockManager, Config as ServerConfig}, + server::Config as ServerConfig, storage::{ - config::Config as StorageConfig, kv::FlowStatsReporter, + config::Config as StorageConfig, kv::FlowStatsReporter, lock_manager, txn::flow_controller::FlowController, DynamicConfigs as StorageDynamicConfigs, Storage, }, }; @@ -47,11 +47,17 @@ const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs( /// Creates a new storage engine which is backed by the Raft consensus /// protocol. 
-pub fn create_raft_storage( +pub fn create_raft_storage< + S, + EK, + R: FlowStatsReporter, + F: KvFormat, + LM: lock_manager::LockManager, +>( engine: RaftKv, cfg: &StorageConfig, read_pool: ReadPoolHandle, - lock_mgr: LockManager, + lock_mgr: LM, concurrency_manager: ConcurrencyManager, dynamic_configs: StorageDynamicConfigs, flow_controller: Arc, @@ -59,7 +65,7 @@ pub fn create_raft_storage( resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, feature_gate: FeatureGate, -) -> Result, LockManager, F>> +) -> Result, LM, F>> where S: RaftStoreRouter + LocalReadRouter + 'static, EK: KvEngine, @@ -119,19 +125,27 @@ where Some(s) => s, }; store.set_id(INVALID_ID); - if cfg.advertise_addr.is_empty() { - store.set_address(cfg.addr.clone()); - store.set_peer_address(cfg.addr.clone()); - } else { - store.set_address(cfg.advertise_addr.clone()); - store.set_peer_address(cfg.advertise_addr.clone()); + if store.get_address().is_empty() { + if cfg.advertise_addr.is_empty() { + store.set_address(cfg.addr.clone()); + if store.get_peer_address().is_empty() { + store.set_peer_address(cfg.addr.clone()); + } + } else { + store.set_address(cfg.advertise_addr.clone()); + if store.get_peer_address().is_empty() { + store.set_peer_address(cfg.advertise_addr.clone()); + } + } } - if cfg.advertise_status_addr.is_empty() { - store.set_status_address(cfg.status_addr.clone()); - } else { - store.set_status_address(cfg.advertise_status_addr.clone()) + if store.get_status_address().is_empty() { + if cfg.advertise_status_addr.is_empty() { + store.set_status_address(cfg.status_addr.clone()); + } else { + store.set_status_address(cfg.advertise_status_addr.clone()) + } } - if store.get_version() == "" { + if store.get_version().is_empty() { store.set_version(env!("CARGO_PKG_VERSION").to_string()); } @@ -142,7 +156,7 @@ where }; store.set_start_timestamp(chrono::Local::now().timestamp()); - if store.get_git_hash() == "" { + if store.get_git_hash().is_empty() { store.set_git_hash( 
option_env!("TIKV_BUILD_GIT_HASH") .unwrap_or("Unknown git hash") @@ -251,6 +265,11 @@ where self.store.get_id() } + /// Gets a copy of Store which is registered to Pd. + pub fn store(&self) -> metapb::Store { + self.store.clone() + } + /// Gets the Scheduler of RaftstoreConfigTask, it must be called after /// start. pub fn refresh_config_scheduler(&mut self) -> Scheduler { diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 7c001baec1e..78302550fd5 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -1,5 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +/// Provides profilers for TiKV. mod profile; use std::{ error::Error as StdError, @@ -39,6 +40,10 @@ use openssl::{ x509::X509, }; use pin_project::pin_project; +pub use profile::{ + activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles, + read_file, start_one_cpu_profile, start_one_heap_profile, +}; use prometheus::TEXT_FORMAT; use raftstore::store::{transport::CasualRouter, CasualMessage}; use regex::Regex; @@ -56,10 +61,6 @@ use tokio::{ }; use tokio_openssl::SslStream; -use self::profile::{ - activate_heap_profile, deactivate_heap_profile, jeprof_heap_profile, list_heap_profiles, - read_file, start_one_cpu_profile, start_one_heap_profile, -}; use crate::{ config::{ConfigController, LogLevel}, server::Result, From 2563311bca1083a803e6b360df03e91dc129e0fa Mon Sep 17 00:00:00 2001 From: haojinming Date: Wed, 14 Sep 2022 20:02:59 +0800 Subject: [PATCH 207/676] test: Separate TestPdClient from test_raftstore component (#13453) close tikv/tikv#13452 Signed-off-by: haojinming --- Cargo.lock | 27 +++++++- Cargo.toml | 1 + components/causal_ts/Cargo.toml | 2 +- components/causal_ts/benches/tso.rs | 2 +- components/causal_ts/src/observer.rs | 2 +- components/causal_ts/src/tso.rs | 2 +- components/cdc/Cargo.toml | 1 + components/cdc/src/endpoint.rs | 3 +- components/test_pd_client/Cargo.toml | 23 
+++++++ components/test_pd_client/src/lib.rs | 8 +++ .../src/pd.rs | 64 ++++++++++++++++++- components/test_raftstore/Cargo.toml | 1 + components/test_raftstore/src/cluster.rs | 1 + components/test_raftstore/src/lib.rs | 4 +- components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs | 1 + components/test_raftstore/src/util.rs | 60 +---------------- tests/Cargo.toml | 1 + tests/failpoints/cases/test_bootstrap.rs | 1 + .../cases/test_replica_stale_read.rs | 1 + tests/failpoints/cases/test_sst_recovery.rs | 1 + .../config/dynamic/pessimistic_txn.rs | 2 +- .../integrations/config/dynamic/raftstore.rs | 2 +- .../integrations/raftstore/test_bootstrap.rs | 1 + .../raftstore/test_conf_change.rs | 1 + 25 files changed, 142 insertions(+), 71 deletions(-) create mode 100644 components/test_pd_client/Cargo.toml create mode 100644 components/test_pd_client/src/lib.rs rename components/{test_raftstore => test_pd_client}/src/pd.rs (96%) diff --git a/Cargo.lock b/Cargo.lock index 3f64d59eed9..3a1ad699087 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -761,7 +761,7 @@ dependencies = [ "serde_derive", "slog", "slog-global", - "test_raftstore", + "test_pd_client", "thiserror", "tikv_alloc", "tikv_util", @@ -813,6 +813,7 @@ dependencies = [ "slog", "slog-global", "tempfile", + "test_pd_client", "test_raftstore", "test_util", "thiserror", @@ -5697,6 +5698,28 @@ dependencies = [ "tikv_util", ] +[[package]] +name = "test_pd_client" +version = "0.0.1" +dependencies = [ + "collections", + "fail", + "futures 0.3.15", + "grpcio", + "keys", + "kvproto", + "log_wrappers", + "pd_client", + "raft", + "raftstore", + "slog", + "slog-global", + "tikv_util", + "tokio", + "tokio-timer", + "txn_types", +] + [[package]] name = "test_raftstore" version = "0.0.1" @@ -5733,6 +5756,7 @@ dependencies = [ "slog", "slog-global", "tempfile", + "test_pd_client", "test_util", "tikv", "tikv_util", @@ -5848,6 +5872,7 @@ dependencies = [ "test_backup", "test_coprocessor", "test_pd", + 
"test_pd_client", "test_raftstore", "test_sst_importer", "test_storage", diff --git a/Cargo.toml b/Cargo.toml index 1b622f0d61b..2ce23dddd3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -265,6 +265,7 @@ members = [ "components/test_coprocessor", "components/test_coprocessor_plugin/example_plugin", "components/test_pd", + "components/test_pd_client", "components/test_raftstore", "components/test_sst_importer", "components/test_storage", diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index 7505a043a69..335cd2528b6 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -35,7 +35,7 @@ txn_types = { path = "../txn_types", default-features = false } [dev-dependencies] criterion = "0.3" -test_raftstore = { path = "../test_raftstore" } +test_pd_client = { path = "../test_pd_client" } [[bench]] name = "tso" diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs index 86d7ed9b9ea..66d950a52b5 100644 --- a/components/causal_ts/benches/tso.rs +++ b/components/causal_ts/benches/tso.rs @@ -5,7 +5,7 @@ use std::{sync::Arc, time::Duration}; use causal_ts::{BatchTsoProvider, CausalTsProvider, TsoBatchList}; use criterion::*; use futures::executor::block_on; -use test_raftstore::TestPdClient; +use test_pd_client::TestPdClient; use txn_types::TimeStamp; fn bench_batch_tso_list_pop(c: &mut Criterion) { diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs index f648d8cba08..c07624e2781 100644 --- a/components/causal_ts/src/observer.rs +++ b/components/causal_ts/src/observer.rs @@ -166,7 +166,7 @@ pub mod tests { metapb::Region, raft_cmdpb::{RaftCmdRequest, Request as RaftRequest}, }; - use test_raftstore::TestPdClient; + use test_pd_client::TestPdClient; use txn_types::{Key, TimeStamp}; use super::*; diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 3bb0034af8f..86efd73198a 100644 --- a/components/causal_ts/src/tso.rs +++ 
b/components/causal_ts/src/tso.rs @@ -633,7 +633,7 @@ impl CausalTsProvider for SimpleTsoProvider { #[cfg(test)] pub mod tests { - use test_raftstore::TestPdClient; + use test_pd_client::TestPdClient; use super::*; diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 255ef552c73..dbefc7df82c 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -70,6 +70,7 @@ engine_rocks = { path = "../engine_rocks", default-features = false } engine_traits = { path = "../engine_traits", default-features = false } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" +test_pd_client = { path = "../test_pd_client" } test_raftstore = { path = "../test_raftstore", default-features = false } test_util = { path = "../test_util", default-features = false } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 2e0253b23a9..10251f2a257 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1433,7 +1433,8 @@ mod tests { errors::{DiscardReason, Error as RaftStoreError}, store::{msg::CasualMessage, PeerMsg, ReadDelegate}, }; - use test_raftstore::{MockRaftStoreRouter, TestPdClient}; + use test_pd_client::TestPdClient; + use test_raftstore::MockRaftStoreRouter; use tikv::{ server::DEFAULT_CLUSTER_ID, storage::{kv::Engine, TestEngineBuilder}, diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml new file mode 100644 index 00000000000..909da59d2ae --- /dev/null +++ b/components/test_pd_client/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "test_pd_client" +version = "0.0.1" +edition = "2018" +publish = false + +[dependencies] +collections = { path = "../collections" } +fail = "0.5" +futures = "0.3" +grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +keys = { path = "../keys", default-features = false } +kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } +log_wrappers = { path = "../log_wrappers" } +pd_client = { path = "../pd_client", default-features = false } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +raftstore = { path = "../raftstore", default-features = false, features = ["testexport"] } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tikv_util = { path = "../tikv_util", default-features = false } +tokio = { version = "1.5", features = ["rt-multi-thread"] } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +txn_types = { path = "../txn_types", default-features = false } diff --git a/components/test_pd_client/src/lib.rs b/components/test_pd_client/src/lib.rs new file mode 100644 index 00000000000..9ea837e335e --- /dev/null +++ b/components/test_pd_client/src/lib.rs @@ -0,0 +1,8 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +#[macro_use] +extern crate tikv_util; + +mod pd; + +pub use crate::pd::*; diff --git a/components/test_raftstore/src/pd.rs b/components/test_pd_client/src/pd.rs similarity index 96% rename from components/test_raftstore/src/pd.rs rename to components/test_pd_client/src/pd.rs index 75ea189c312..69cd1a30d03 100644 --- a/components/test_raftstore/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -26,7 +26,10 @@ use futures::{ use keys::{self, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, PeerRole}, - pdpb, + pdpb::{ + self, ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, + TransferLeader, + }, replication_modepb::{ DrAutoSyncState, RegionReplicationStatus, ReplicationMode, ReplicationStatus, StoreDrAutoSyncStatus, @@ -37,7 +40,7 @@ use pd_client::{ }; use raft::eraftpb::ConfChangeType; use raftstore::store::{ - util::{check_key_in_region, find_peer, is_learner}, + util::{check_key_in_region, find_peer, is_learner, new_peer}, QueryStats, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, }; use tikv_util::{ @@ -134,6 +137,10 @@ enum Operator { }, } +pub fn sleep_ms(ms: u64) { + std::thread::sleep(Duration::from_millis(ms)); +} + fn change_peer(change_type: ConfChangeType, peer: metapb::Peer) -> pdpb::ChangePeer { let mut cp = pdpb::ChangePeer::default(); cp.set_change_type(change_type); @@ -141,6 +148,59 @@ fn change_peer(change_type: ConfChangeType, peer: metapb::Peer) -> pdpb::ChangeP cp } +pub fn new_pd_change_peer( + change_type: ConfChangeType, + peer: metapb::Peer, +) -> RegionHeartbeatResponse { + let mut change_peer = ChangePeer::default(); + change_peer.set_change_type(change_type); + change_peer.set_peer(peer); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_change_peer(change_peer); + resp +} + +pub fn new_pd_change_peer_v2(changes: Vec) -> RegionHeartbeatResponse { + let mut change_peer = ChangePeerV2::default(); + change_peer.set_changes(changes.into()); + + let mut resp = 
RegionHeartbeatResponse::default(); + resp.set_change_peer_v2(change_peer); + resp +} + +pub fn new_split_region(policy: CheckPolicy, keys: Vec>) -> RegionHeartbeatResponse { + let mut split_region = SplitRegion::default(); + split_region.set_policy(policy); + split_region.set_keys(keys.into()); + let mut resp = RegionHeartbeatResponse::default(); + resp.set_split_region(split_region); + resp +} + +pub fn new_pd_transfer_leader( + peer: metapb::Peer, + peers: Vec, +) -> RegionHeartbeatResponse { + let mut transfer_leader = TransferLeader::default(); + transfer_leader.set_peer(peer); + transfer_leader.set_peers(peers.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_transfer_leader(transfer_leader); + resp +} + +pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResponse { + let mut merge = Merge::default(); + merge.set_target(target_region); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_merge(merge); + resp +} + impl Operator { fn make_region_heartbeat_response( &self, diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index cd9df2e3c05..8c19c78a0f6 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -56,6 +56,7 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" +test_pd_client = { path = "../test_pd_client" } test_util = { path = "../test_util", default-features = false } tikv = { path = "../../", default-features = false } tikv_util = { path = "../tikv_util", default-features = false } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 9b5aa1a6646..9a69c7110b4 100644 --- a/components/test_raftstore/src/cluster.rs +++ 
b/components/test_raftstore/src/cluster.rs @@ -46,6 +46,7 @@ use raftstore::{ Error, Result, }; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; use tikv_util::{ thread_group::GroupProperties, diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs index 82695be12ba..8893d8a7ca4 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -8,13 +8,11 @@ extern crate tikv_util; mod cluster; mod config; mod node; -mod pd; mod router; mod server; mod transport_simulate; mod util; pub use crate::{ - cluster::*, config::Config, node::*, pd::*, router::*, server::*, transport_simulate::*, - util::*, + cluster::*, config::Config, node::*, router::*, server::*, transport_simulate::*, util::*, }; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 11a5dda87bd..1616504c820 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -32,6 +32,7 @@ use raftstore::{ }; use resource_metering::CollectorRegHandle; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::{ config::{ConfigController, Module}, import::SstImporter, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 72282f02dc0..6895915d466 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -45,6 +45,7 @@ use raftstore::{ use resource_metering::{CollectorRegHandle, ResourceTagFactory}; use security::SecurityManager; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::{ config::ConfigController, coprocessor, coprocessor_v2, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 882095c5a7d..117ca6d44df 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -26,10 +26,6 @@ use kvproto::{ 
encryptionpb::EncryptionMethod, kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, metapb::{self, RegionEpoch}, - pdpb::{ - ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, - TransferLeader, - }, raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, CmdType, RaftCmdRequest, RaftCmdResponse, Request, StatusCmdType, StatusRequest, @@ -50,11 +46,12 @@ use raftstore::{ use rand::RngCore; use server::server::ConfiguredRaftEngine; use tempfile::TempDir; +use test_pd_client::TestPdClient; use tikv::{config::*, server::KvEngineFactoryBuilder, storage::point_key_range}; use tikv_util::{config::*, escape, time::ThreadReadId, worker::LazyWorker, HandyRwLock}; use txn_types::Key; -use crate::{Cluster, Config, ServerCluster, Simulator, TestPdClient}; +use crate::{Cluster, Config, ServerCluster, Simulator}; pub fn must_get(engine: &RocksEngine, cf: &str, key: &[u8], value: Option<&[u8]>) { for _ in 1..300 { @@ -334,59 +331,6 @@ pub fn is_error_response(resp: &RaftCmdResponse) -> bool { resp.get_header().has_error() } -pub fn new_pd_change_peer( - change_type: ConfChangeType, - peer: metapb::Peer, -) -> RegionHeartbeatResponse { - let mut change_peer = ChangePeer::default(); - change_peer.set_change_type(change_type); - change_peer.set_peer(peer); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_change_peer(change_peer); - resp -} - -pub fn new_pd_change_peer_v2(changes: Vec) -> RegionHeartbeatResponse { - let mut change_peer = ChangePeerV2::default(); - change_peer.set_changes(changes.into()); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_change_peer_v2(change_peer); - resp -} - -pub fn new_split_region(policy: CheckPolicy, keys: Vec>) -> RegionHeartbeatResponse { - let mut split_region = SplitRegion::default(); - split_region.set_policy(policy); - split_region.set_keys(keys.into()); - let mut resp = RegionHeartbeatResponse::default(); - resp.set_split_region(split_region); - resp 
-} - -pub fn new_pd_transfer_leader( - peer: metapb::Peer, - peers: Vec, -) -> RegionHeartbeatResponse { - let mut transfer_leader = TransferLeader::default(); - transfer_leader.set_peer(peer); - transfer_leader.set_peers(peers.into()); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_transfer_leader(transfer_leader); - resp -} - -pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResponse { - let mut merge = Merge::default(); - merge.set_target(target_region); - - let mut resp = RegionHeartbeatResponse::default(); - resp.set_merge(merge); - resp -} - #[derive(Default)] struct CallbackLeakDetector { called: bool, diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 14bf818aaf0..2cc30338f83 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -144,6 +144,7 @@ sst_importer = { path = "../components/sst_importer", default-features = false } test_backup = { path = "../components/test_backup", default-features = false } test_coprocessor = { path = "../components/test_coprocessor", default-features = false } test_pd = { path = "../components/test_pd", default-features = false } +test_pd_client = { path = "../components/test_pd_client" } test_raftstore = { path = "../components/test_raftstore", default-features = false } test_sst_importer = { path = "../components/test_sst_importer", default-features = false } test_storage = { path = "../components/test_storage", default-features = false } diff --git a/tests/failpoints/cases/test_bootstrap.rs b/tests/failpoints/cases/test_bootstrap.rs index 3923f4e77f2..8dc2eb8b371 100644 --- a/tests/failpoints/cases/test_bootstrap.rs +++ b/tests/failpoints/cases/test_bootstrap.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, RwLock}; use engine_traits::Peekable; use kvproto::{kvrpcpb::ApiVersion, metapb, raft_serverpb}; +use test_pd_client::TestPdClient; use test_raftstore::*; fn test_bootstrap_half_way_failure(fp: &str) { diff --git a/tests/failpoints/cases/test_replica_stale_read.rs 
b/tests/failpoints/cases/test_replica_stale_read.rs index a8aaa030bfc..7748ed73b96 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -5,6 +5,7 @@ use std::{sync::Arc, time::Duration}; use kvproto::{kvrpcpb::Op, metapb::Peer}; use pd_client::PdClient; use raft::eraftpb::MessageType; +use test_pd_client::TestPdClient; use test_raftstore::*; fn prepare_for_stale_read(leader: Peer) -> (Cluster, Arc, PeerClient) { diff --git a/tests/failpoints/cases/test_sst_recovery.rs b/tests/failpoints/cases/test_sst_recovery.rs index f5dadc4205a..a4c1f10b5ae 100644 --- a/tests/failpoints/cases/test_sst_recovery.rs +++ b/tests/failpoints/cases/test_sst_recovery.rs @@ -5,6 +5,7 @@ use std::{fmt::Debug, io::Write, path::Path, sync::Arc, time::Duration}; use engine_rocks::RocksEngine; use engine_rocks_helper::sst_recovery::*; use engine_traits::{CompactExt, Peekable, CF_DEFAULT}; +use test_pd_client::TestPdClient; use test_raftstore::*; const CHECK_DURATION: Duration = Duration::from_millis(50); diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index 49bedd38c73..caad8a64f9b 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -6,7 +6,7 @@ use std::{ }; use security::SecurityManager; -use test_raftstore::TestPdClient; +use test_pd_client::TestPdClient; use tikv::{ config::*, server::{ diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 35d5fe23e49..55cf75d2b75 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -21,7 +21,7 @@ use raftstore::{ }; use resource_metering::CollectorRegHandle; use tempfile::TempDir; -use test_raftstore::TestPdClient; +use test_pd_client::TestPdClient; use tikv::{ config::{ConfigController, Module, TikvConfig}, 
import::SstImporter, diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 92e4422c57f..cc5b6ca1ee0 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -13,6 +13,7 @@ use raftstore::{ }; use resource_metering::CollectorRegHandle; use tempfile::Builder; +use test_pd_client::{bootstrap_with_first_region, TestPdClient}; use test_raftstore::*; use tikv::{import::SstImporter, server::Node}; use tikv_util::{ diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index b37b207ac11..9f888b828be 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -19,6 +19,7 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; use raftstore::{store::util::is_learner, Result}; +use test_pd_client::TestPdClient; use test_raftstore::*; use tikv_util::{config::ReadableDuration, time::Instant, HandyRwLock}; From b6b96382b99c4c0e995e31cf227f1a6c747c577a Mon Sep 17 00:00:00 2001 From: hehechen Date: Wed, 14 Sep 2022 20:44:59 +0800 Subject: [PATCH 208/676] resolved_ts: track ingest sst (#13454) ref tikv/tikv#13353 Signed-off-by: hehechen Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/file_system/src/rate_limiter.rs | 4 +- components/raftstore/src/store/util.rs | 8 ++ components/resolved_ts/Cargo.toml | 1 + components/resolved_ts/src/cmd.rs | 42 ++++++++--- components/resolved_ts/src/endpoint.rs | 4 + .../resolved_ts/tests/integrations/mod.rs | 31 +++++++- components/resolved_ts/tests/mod.rs | 75 ++++++++++++++++++- 8 files changed, 151 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a1ad699087..5f1ced440c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4547,6 +4547,7 @@ dependencies = [ "slog-global", "tempfile", "test_raftstore", + "test_sst_importer", "test_util", 
"thiserror", "tikv", diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index f3ec05a4314..feffb6dcf14 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -569,8 +569,8 @@ mod tests { macro_rules! approximate_eq { ($left:expr, $right:expr) => { - assert!(($left) >= ($right) * 0.85); - assert!(($right) >= ($left) * 0.85); + assert!(($left) >= ($right) * 0.75); + assert!(($right) >= ($left) * 0.75); }; } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 922ba70a2c8..3d566d41416 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -931,6 +931,14 @@ impl RegionReadProgressRegistry { .map(|rp| rp.safe_ts()) } + pub fn get_tracked_index(&self, region_id: &u64) -> Option { + self.registry + .lock() + .unwrap() + .get(region_id) + .map(|rp| rp.core.lock().unwrap().applied_index) + } + // Update `safe_ts` with the provided `LeaderInfo` and return the regions that // have the same `LeaderInfo` pub fn handle_check_leaders( diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index e781fbc1f75..6309440202b 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -55,6 +55,7 @@ panic_hook = { path = "../panic_hook" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" test_raftstore = { path = "../test_raftstore", default-features = false } +test_sst_importer = { path = "../test_sst_importer" } test_util = { path = "../test_util", default-features = false } tikv_kv = { path = "../tikv_kv" } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 0bb22e0a21e..8d3eb3bb48d 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -33,6 +33,7 @@ pub enum ChangeRow { commit_ts: TimeStamp, value: Option, }, + 
IngestSsT, } #[allow(clippy::large_enum_variant)] @@ -58,8 +59,11 @@ impl ChangeLog { let flags = WriteBatchFlags::from_bits_truncate(request.get_header().get_flags()); let is_one_pc = flags.contains(WriteBatchFlags::ONE_PC); - let changes = group_row_changes(request.requests.into()); - let rows = Self::encode_rows(changes, is_one_pc); + let (changes, has_ingest_sst) = group_row_changes(request.requests.into()); + let mut rows = Self::encode_rows(changes, is_one_pc); + if has_ingest_sst { + rows.push(ChangeRow::IngestSsT); + } ChangeLog::Rows { index, rows } } else { ChangeLog::Admin(request.take_admin_request().get_cmd_type()) @@ -190,13 +194,17 @@ struct RowChange { default: Option, } -fn group_row_changes(requests: Vec) -> HashMap { +fn group_row_changes(requests: Vec) -> (HashMap, bool) { let mut changes: HashMap = HashMap::default(); // The changes about default cf was recorded here and need to be matched with a // `write` or a `lock`. let mut unmatched_default = HashMap::default(); + let mut has_ingest_sst = false; for mut req in requests { match req.get_cmd_type() { + CmdType::IngestSst => { + has_ingest_sst = true; + } CmdType::Put => { let mut put = req.take_put(); let key = Key::from_encoded(put.take_key()); @@ -253,7 +261,7 @@ fn group_row_changes(requests: Vec) -> HashMap { row.default = Some(default); } } - changes + (changes, has_ingest_sst) } /// Filter non-lock related data (i.e `default_cf` data), the implement is @@ -274,7 +282,7 @@ pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { CmdType::Delete => req.get_delete().cf.as_str(), _ => "", }; - cf == CF_LOCK || cf == CF_WRITE + cf == CF_LOCK || cf == CF_WRITE || req.get_cmd_type() == CmdType::IngestSst }); cmd.request.set_requests(requests.into()); } @@ -286,7 +294,10 @@ pub fn lock_only_filter(mut cmd_batch: CmdBatch) -> Option { #[cfg(test)] mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::{AssertionLevel, PrewriteRequestPessimisticAction::*}; + use 
kvproto::{ + kvrpcpb::{AssertionLevel, PrewriteRequestPessimisticAction::*}, + raft_cmdpb::{CmdType, Request}, + }; use tikv::storage::{ kv::{MockEngineBuilder, TestEngineBuilder}, lock_manager::DummyLockManager, @@ -307,8 +318,13 @@ mod tests { let rocks_engine = TestEngineBuilder::new().build().unwrap(); let engine = MockEngineBuilder::from_rocks_engine(rocks_engine).build(); - let reqs = vec![Modify::Put("default", Key::from_raw(b"k1"), b"v1".to_vec()).into()]; - assert!(ChangeLog::encode_rows(group_row_changes(reqs), false).is_empty()); + let mut reqs = vec![Modify::Put("default", Key::from_raw(b"k1"), b"v1".to_vec()).into()]; + let mut req = Request::default(); + req.set_cmd_type(CmdType::IngestSst); + reqs.push(req); + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, true); + assert!(ChangeLog::encode_rows(changes, false).is_empty()); must_prewrite_put(&engine, b"k1", b"v1", b"k1", 1); must_commit(&engine, b"k1", 1, 2); @@ -327,8 +343,10 @@ mod tests { .take_last_modifies() .into_iter() .flat_map(|m| { - let reqs = m.into_iter().map(Into::into).collect(); - ChangeLog::encode_rows(group_row_changes(reqs), false) + let reqs: Vec = m.into_iter().map(Into::into).collect(); + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, false); + ChangeLog::encode_rows(changes, false) }) .collect(); @@ -415,7 +433,9 @@ mod tests { .into_iter() .flat_map(|m| { let reqs = m.into_iter().map(Into::into).collect(); - ChangeLog::encode_rows(group_row_changes(reqs), true) + let (changes, has_ingest_sst) = group_row_changes(reqs); + assert_eq!(has_ingest_sst, false); + ChangeLog::encode_rows(changes, true) }) .last() .unwrap(); diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index f2920e2af69..a4e5f6e3864 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -134,6 +134,7 @@ impl ObserveRegion { }), // One pc command do 
not contains any lock, so just skip it ChangeRow::OnePc { .. } => {} + ChangeRow::IngestSsT => {} }); assert!( *tracked_index < *index, @@ -191,6 +192,9 @@ impl ObserveRegion { .untrack_lock(&key.to_raw().unwrap(), Some(*index)), // One pc command do not contains any lock, so just skip it ChangeRow::OnePc { .. } => {} + ChangeRow::IngestSsT => { + self.resolver.update_tracked_index(*index); + } }); } } diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index 7916d03d8d2..a8acab00625 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -5,9 +5,11 @@ mod testsuite; use std::time::Duration; use futures::executor::block_on; -use kvproto::kvrpcpb::*; +use kvproto::{kvrpcpb::*, metapb::RegionEpoch}; use pd_client::PdClient; +use tempfile::Builder; use test_raftstore::sleep_ms; +use test_sst_importer::*; pub use testsuite::*; #[test] @@ -52,6 +54,33 @@ fn test_resolved_ts_basic() { let current_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.must_get_rts_ge(r1.id, current_ts); + // ingest sst + let temp_dir = Builder::new().prefix("test_resolved_ts").tempdir().unwrap(); + let sst_path = temp_dir.path().join("test.sst"); + let sst_range = (0, 100); + + let mut sst_epoch = RegionEpoch::default(); + sst_epoch.set_conf_ver(1); + sst_epoch.set_version(4); + + let (mut meta, data) = gen_sst_file(&sst_path, sst_range); + meta.set_region_id(r1.id); + meta.set_region_epoch(sst_epoch); + + suite.upload_sst(r1.id, &meta, &data).unwrap(); + + let tracked_index_before = suite.region_tracked_index(r1.id); + suite.must_ingest_sst(r1.id, meta); + let mut tracked_index_after = suite.region_tracked_index(r1.id); + for _ in 0..10 { + if tracked_index_after > tracked_index_before { + break; + } + tracked_index_after = suite.region_tracked_index(r1.id); + sleep_ms(200) + } + assert!(tracked_index_after > tracked_index_before); + suite.stop(); } diff 
--git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 0e6d8bbc9f8..812f9057e6b 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -5,8 +5,11 @@ use std::{sync::*, time::Duration}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::{RocksEngine, RocksSnapshot}; -use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment}; +use futures::{executor::block_on, stream, SinkExt}; +use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment, Result, WriteFlags}; use kvproto::{ + import_sstpb::{IngestRequest, SstMeta, UploadRequest, UploadResponse}, + import_sstpb_grpc::ImportSstClient, kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, tikvpb::TikvClient, }; @@ -28,6 +31,7 @@ pub struct TestSuite { pub endpoints: HashMap>>, pub obs: HashMap>, tikv_cli: HashMap, + import_cli: HashMap, concurrency_managers: HashMap, env: Arc, @@ -97,6 +101,7 @@ impl TestSuite { concurrency_managers, env: Arc::new(Environment::new(1)), tikv_cli: HashMap::default(), + import_cli: HashMap::default(), } } @@ -323,6 +328,19 @@ impl TestSuite { }) } + pub fn get_import_client(&mut self, region_id: u64) -> &ImportSstClient { + let leader = self.cluster.leader_of_region(region_id).unwrap(); + let store_id = leader.get_store_id(); + let addr = self.cluster.sim.rl().get_addr(store_id); + let env = self.env.clone(); + self.import_cli + .entry(leader.get_store_id()) + .or_insert_with(|| { + let channel = ChannelBuilder::new(env).connect(&addr); + ImportSstClient::new(channel) + }) + } + pub fn get_txn_concurrency_manager(&self, store_id: u64) -> Option { self.concurrency_managers.get(&store_id).cloned() } @@ -342,6 +360,20 @@ impl TestSuite { ) } + pub fn region_tracked_index(&mut self, region_id: u64) -> u64 { + for _ in 0..50 { + if let Some(leader) = self.cluster.leader_of_region(region_id) { + let meta = self.cluster.store_metas[&leader.store_id].lock().unwrap(); + if let 
Some(tracked_index) = meta.region_read_progress.get_tracked_index(®ion_id) + { + return tracked_index; + } + } + sleep_ms(100) + } + panic!("fail to get region tracked index after 50 trys"); + } + pub fn must_get_rts(&mut self, region_id: u64, rts: TimeStamp) { for _ in 0..50 { if let Some(ts) = self.region_resolved_ts(region_id) { @@ -365,4 +397,45 @@ impl TestSuite { } panic!("fail to get greater ts after 50 trys"); } + + pub fn upload_sst( + &mut self, + region_id: u64, + meta: &SstMeta, + data: &[u8], + ) -> Result { + let import = self.get_import_client(region_id); + let mut r1 = UploadRequest::default(); + r1.set_meta(meta.clone()); + let mut r2 = UploadRequest::default(); + r2.set_data(data.to_vec()); + let reqs: Vec<_> = vec![r1, r2] + .into_iter() + .map(|r| Result::Ok((r, WriteFlags::default()))) + .collect(); + let (mut tx, rx) = import.upload().unwrap(); + let mut stream = stream::iter(reqs); + block_on(async move { + tx.send_all(&mut stream).await?; + tx.close().await?; + rx.await + }) + } + + pub fn must_ingest_sst(&mut self, region_id: u64, meta: SstMeta) { + let mut ingest_request = IngestRequest::default(); + ingest_request.set_context(self.get_context(region_id)); + ingest_request.set_sst(meta); + + let ingest_sst_resp = self + .get_import_client(region_id) + .ingest(&ingest_request) + .unwrap(); + + assert!( + !ingest_sst_resp.has_error(), + "{:?}", + ingest_sst_resp.get_error() + ); + } } From 592d423d76f0762e75179285119c1965c9cd4b76 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 15 Sep 2022 10:58:59 +0800 Subject: [PATCH 209/676] raftstore: Implement coprocessor observer pre_persist (#12957) ref tikv/tikv#12849 Support coprocessor observer pre_commit Signed-off-by: CalvinNeo --- .../raftstore/src/coprocessor/dispatcher.rs | 20 +++++++ components/raftstore/src/coprocessor/mod.rs | 11 ++++ components/raftstore/src/store/fsm/apply.rs | 58 +++++++++++++++++-- components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs 
| 1 + 5 files changed, 85 insertions(+), 6 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 3cddc21e8cb..df7794c3701 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -635,6 +635,26 @@ impl CoprocessorHost { ); } + /// `pre_persist` is called we we want to persist data or meta for a region. + /// For example, in `finish_for` and `commit`, + /// we will separately call `pre_persist` with is_finished = true/false. + /// By returning false, we reject this persistence. + pub fn pre_persist( + &self, + region: &Region, + is_finished: bool, + cmd: Option<&RaftCmdRequest>, + ) -> bool { + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if !observer.pre_persist(&mut ctx, is_finished, cmd) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 8a309dc4734..35330701a95 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -310,6 +310,17 @@ pub enum RegionChangeEvent { pub trait RegionChangeObserver: Coprocessor { /// Hook to call when a region changed on this TiKV fn on_region_changed(&self, _: &mut ObserverContext<'_>, _: RegionChangeEvent, _: StateRole) {} + + /// Should be called everytime before we write a WriteBatch into + /// KvEngine. Returns false if we can't commit at this time. 
+ fn pre_persist( + &self, + _: &mut ObserverContext<'_>, + _is_finished: bool, + _cmd: Option<&RaftCmdRequest>, + ) -> bool { + true + } } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 6d1d1881046..e23ba64eb7b 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -584,10 +584,17 @@ where delegate: &mut ApplyDelegate, results: VecDeque>, ) { - if !delegate.pending_remove { - delegate.write_apply_state(self.kv_wb_mut()); + if self.host.pre_persist(&delegate.region, true, None) { + if !delegate.pending_remove { + delegate.write_apply_state(self.kv_wb_mut()); + } + self.commit_opt(delegate, false); + } else { + debug!("do not persist when finish_for"; + "region" => ?delegate.region, + "tag" => &delegate.tag, + ); } - self.commit_opt(delegate, false); self.apply_res.push(ApplyRes { region_id: delegate.region_id(), apply_state: delegate.apply_state.clone(), @@ -1073,8 +1080,9 @@ where } let mut has_unflushed_data = self.last_flush_applied_index != self.apply_state.get_applied_index(); - if has_unflushed_data && should_write_to_engine(&cmd) - || apply_ctx.kv_wb().should_write_to_engine() + if (has_unflushed_data && should_write_to_engine(&cmd) + || apply_ctx.kv_wb().should_write_to_engine()) + && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) { apply_ctx.commit(self); if let Some(start) = self.handle_start.as_ref() { @@ -4972,6 +4980,7 @@ mod tests { cmd_sink: Option>>>, filter_compact_log: Arc, filter_consistency_check: Arc, + skip_persist_when_pre_commit: Arc, delay_remove_ssts: Arc, last_delete_sst_count: Arc, last_pending_delete_sst_count: Arc, @@ -5095,6 +5104,17 @@ mod tests { fn on_applied_current_term(&self, _: raft::StateRole, _: &Region) {} } + impl RegionChangeObserver for ApplyObserver { + fn pre_persist( + &self, + _: &mut ObserverContext<'_>, + _is_finished: bool, + _cmd: 
Option<&RaftCmdRequest>, + ) -> bool { + !self.skip_persist_when_pre_commit.load(Ordering::SeqCst) + } + } + #[test] fn test_handle_raft_committed_entries() { let (_path, engine) = create_tmp_engine("test-delegate"); @@ -5716,6 +5736,8 @@ mod tests { let obs = ApplyObserver::default(); host.registry .register_admin_observer(1, BoxAdminObserver::new(obs.clone())); + host.registry + .register_region_change_observer(1, BoxRegionChangeObserver::new(obs.clone())); host.registry .register_query_observer(1, BoxQueryObserver::new(obs.clone())); @@ -5751,6 +5773,8 @@ mod tests { reg.region.mut_region_epoch().set_version(3); router.schedule_task(1, Msg::Registration(reg)); + obs.skip_persist_when_pre_commit + .store(true, Ordering::SeqCst); let mut index_id = 1; let put_entry = EntryBuilder::new(index_id, 1) .put(b"k1", b"v1") @@ -5759,7 +5783,19 @@ mod tests { .epoch(1, 3) .build(); router.schedule_task(1, Msg::apply(apply(peer_id, 1, 1, vec![put_entry], vec![]))); - fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + + // We don't persist at `finish_for`, since we disabled `pre_persist`. + let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!( + apply_res.apply_state.get_applied_index(), + state.get_applied_index() + 1 + ); + obs.skip_persist_when_pre_commit + .store(false, Ordering::SeqCst); // Phase 1: we test if pre_exec will filter execution of commands correctly. index_id += 1; @@ -5781,6 +5817,16 @@ mod tests { assert_eq!(apply_res.exec_res.len(), 0); assert_eq!(apply_res.apply_state.get_truncated_state().get_index(), 0); + // We persist at `finish_for`, since we enabled `pre_persist`. 
+ let state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(1)) + .unwrap() + .unwrap_or_default(); + assert_eq!( + apply_res.apply_state.get_applied_index(), + state.get_applied_index() + ); + index_id += 1; // Don't filter CompactLog obs.filter_compact_log.store(false, Ordering::SeqCst); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 1616504c820..f604ce7dff7 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -270,6 +270,7 @@ impl Simulator for NodeCluster { .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) + .enable_multi_snapshot_files(true) .build(tmp.path().to_str().unwrap()); (snap_mgr, Some(tmp)) } else { diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 6895915d466..51092007bff 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -455,6 +455,7 @@ impl ServerCluster { .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) + .enable_multi_snapshot_files(true) .build(tmp_str); self.snap_mgrs.insert(node_id, snap_mgr.clone()); let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); From d762bd89a6c25eac1f07f40319f69c9f95db0131 Mon Sep 17 00:00:00 2001 From: YangKeao Date: Thu, 15 Sep 2022 00:54:59 -0400 Subject: [PATCH 210/676] copr: implement several mysql time related types json (#13418) close tikv/tikv#13417 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- .../tidb_query_datatype/src/codec/convert.rs | 12 +- .../src/codec/mysql/json/binary.rs | 249 +++++++++++++++- .../src/codec/mysql/json/comparison.rs | 144 +++++++++ .../src/codec/mysql/json/constants.rs | 2 + .../src/codec/mysql/json/jcodec.rs | 4 + 
.../src/codec/mysql/json/json_type.rs | 7 + .../src/codec/mysql/json/json_unquote.rs | 42 +++ .../src/codec/mysql/json/mod.rs | 98 +++++-- .../src/codec/mysql/json/modifier.rs | 11 +- .../src/codec/mysql/json/serde.rs | 18 ++ components/tidb_query_expr/src/impl_cast.rs | 275 ++++++++++++++++-- 11 files changed, 793 insertions(+), 69 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index efd99f5317a..26ae799c4ff 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -510,14 +510,14 @@ impl<'a> ToInt for JsonRef<'a> { // TiDB: 5 // MySQL: 4 let val = match self.get_type() { - JsonType::Object | JsonType::Array | JsonType::Opaque => Ok(ctx - .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) - .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as i64)), JsonType::I64 => Ok(self.get_i64()), JsonType::U64 => Ok(self.get_u64() as i64), JsonType::Double => self.get_double().to_int(ctx, tp), JsonType::String => self.get_str_bytes()?.to_int(ctx, tp), + _ => Ok(ctx + .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) + .map(|_| 0)?), }?; val.to_int(ctx, tp) } @@ -526,14 +526,14 @@ impl<'a> ToInt for JsonRef<'a> { #[inline] fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let val = match self.get_type() { - JsonType::Object | JsonType::Array | JsonType::Opaque => Ok(ctx - .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) - .map(|_| 0)?), JsonType::Literal => Ok(self.get_literal().map_or(0, |x| x as u64)), JsonType::I64 => Ok(self.get_i64() as u64), JsonType::U64 => Ok(self.get_u64()), JsonType::Double => self.get_double().to_uint(ctx, tp), JsonType::String => self.get_str_bytes()?.to_uint(ctx, tp), + _ => Ok(ctx + .handle_truncate_err(Error::truncated_wrong_val("Integer", self.to_string())) + .map(|_| 
0)?), }?; val.to_uint(ctx, tp) } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index 9b8264ee3fb..12f8fbd5129 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -88,6 +88,13 @@ impl<'a> JsonRef<'a> { &self.value()[val_offset..val_offset + opaque_bytes_len as usize + len_len + 1], ) } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + JsonRef::new(val_type, &self.value()[val_offset..val_offset + TIME_LEN]) + } + JsonType::Time => JsonRef::new( + val_type, + &self.value()[val_offset..val_offset + DURATION_LEN], + ), _ => { let data_size = NumberCodec::decode_u32_le(&self.value()[val_offset + ELEMENT_COUNT_LEN..]) @@ -122,7 +129,16 @@ impl<'a> JsonRef<'a> { #[cfg(test)] mod tests { - use super::{super::Json, *}; + use std::collections::BTreeMap; + + use super::*; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Json, Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_type() { @@ -143,4 +159,235 @@ mod tests { assert_eq!(json.as_ref().get_type(), tp, "{:?}", json_str); } } + + #[test] + fn test_array_get_elem() { + let mut ctx = EvalContext::default(); + + let time = Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(); + let duration = Duration::parse(&mut ctx, "12:13:14", 0).unwrap(); + let array = vec![ + Json::from_u64(1).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + ]; + let object = BTreeMap::from([ + ("key1".to_string(), Json::from_u64(1).unwrap()), + ("key2".to_string(), Json::from_str_val("abcdefg").unwrap()), + ]); + + let json_array = Json::from_array(vec![ + Json::from_u64(1).unwrap(), + Json::from_time(time).unwrap(), + Json::from_duration(duration).unwrap(), + Json::from_array(array).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + Json::from_bool(false).unwrap(), + 
Json::from_object(object).unwrap(), + ]) + .unwrap(); + let json_array_ref = json_array.as_ref(); + + assert_eq!(json_array_ref.array_get_elem(0).unwrap().get_u64(), 1); + assert_eq!( + json_array_ref + .array_get_elem(1) + .unwrap() + .get_time() + .unwrap(), + time + ); + assert_eq!( + json_array_ref + .array_get_elem(2) + .unwrap() + .get_duration() + .unwrap(), + duration + ); + assert_eq!( + json_array_ref + .array_get_elem(3) + .unwrap() + .array_get_elem(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_array_ref + .array_get_elem(3) + .unwrap() + .array_get_elem(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_array_ref.array_get_elem(4).unwrap().get_str().unwrap(), + "abcdefg" + ); + assert_eq!( + json_array_ref + .array_get_elem(5) + .unwrap() + .get_literal() + .unwrap(), + false + ); + assert_eq!( + json_array_ref.array_get_elem(6).unwrap().object_get_key(0), + b"key1" + ); + assert_eq!( + json_array_ref.array_get_elem(6).unwrap().object_get_key(1), + b"key2" + ); + assert_eq!( + json_array_ref + .array_get_elem(6) + .unwrap() + .object_get_val(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_array_ref + .array_get_elem(6) + .unwrap() + .object_get_val(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + } + + #[test] + fn test_object_get_val() { + let mut ctx = EvalContext::default(); + + let time = Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(); + let duration = Duration::parse(&mut ctx, "12:13:14", 0).unwrap(); + let array = vec![ + Json::from_u64(1).unwrap(), + Json::from_str_val("abcdefg").unwrap(), + ]; + let object = BTreeMap::from([ + ("key1".to_string(), Json::from_u64(1).unwrap()), + ("key2".to_string(), Json::from_str_val("abcdefg").unwrap()), + ]); + + let json_object = Json::from_object(BTreeMap::from([ + ("0".to_string(), Json::from_u64(1).unwrap()), + ("1".to_string(), Json::from_time(time).unwrap()), + ("2".to_string(), 
Json::from_duration(duration).unwrap()), + ("3".to_string(), Json::from_array(array).unwrap()), + ("4".to_string(), Json::from_str_val("abcdefg").unwrap()), + ("5".to_string(), Json::from_bool(false).unwrap()), + ("6".to_string(), Json::from_object(object).unwrap()), + ])) + .unwrap(); + let json_object_ref = json_object.as_ref(); + + assert_eq!(json_object_ref.object_get_key(0), b"0"); + assert_eq!(json_object_ref.object_get_key(1), b"1"); + assert_eq!(json_object_ref.object_get_key(2), b"2"); + assert_eq!(json_object_ref.object_get_key(3), b"3"); + + assert_eq!(json_object_ref.object_get_val(0).unwrap().get_u64(), 1); + assert_eq!( + json_object_ref + .object_get_val(1) + .unwrap() + .get_time() + .unwrap(), + time + ); + assert_eq!( + json_object_ref + .object_get_val(2) + .unwrap() + .get_duration() + .unwrap(), + duration + ); + assert_eq!( + json_object_ref + .object_get_val(3) + .unwrap() + .array_get_elem(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_object_ref + .object_get_val(3) + .unwrap() + .array_get_elem(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_object_ref + .object_get_val(4) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + assert_eq!( + json_object_ref + .object_get_val(5) + .unwrap() + .get_literal() + .unwrap(), + false + ); + assert_eq!( + json_object_ref.object_get_val(6).unwrap().object_get_key(0), + b"key1" + ); + assert_eq!( + json_object_ref.object_get_val(6).unwrap().object_get_key(1), + b"key2" + ); + assert_eq!( + json_object_ref + .object_get_val(6) + .unwrap() + .object_get_val(0) + .unwrap() + .get_u64(), + 1 + ); + assert_eq!( + json_object_ref + .object_get_val(6) + .unwrap() + .object_get_val(1) + .unwrap() + .get_str() + .unwrap(), + "abcdefg" + ); + } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs index f948a172ef0..757ccdfc6bf 100644 --- 
a/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/comparison.rs @@ -38,6 +38,10 @@ impl<'a> JsonRef<'a> { JsonType::I64 | JsonType::U64 | JsonType::Double => PRECEDENCE_NUMBER, JsonType::String => PRECEDENCE_STRING, JsonType::Opaque => PRECEDENCE_OPAQUE, + JsonType::Date => PRECEDENCE_DATE, + JsonType::Datetime => PRECEDENCE_DATETIME, + JsonType::Timestamp => PRECEDENCE_DATETIME, + JsonType::Time => PRECEDENCE_TIME, } } @@ -150,6 +154,23 @@ impl<'a> PartialOrd for JsonRef<'a> { return None; } } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + // The jsonTypePrecedences guarantees that the DATE is only comparable with the + // DATE, and the DATETIME and TIMESTAMP will compare with + // each other + if let (Ok(left), Ok(right)) = (self.get_time(), right.get_time()) { + left.partial_cmp(&right) + } else { + return None; + } + } + JsonType::Time => { + if let (Ok(left), Ok(right)) = (self.get_duration(), right.get_duration()) { + left.partial_cmp(&right) + } else { + return None; + } + } }; } @@ -191,6 +212,13 @@ impl PartialOrd for Json { #[cfg(test)] mod tests { use super::*; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_cmp_json_numberic_type() { @@ -295,4 +323,120 @@ mod tests { assert_eq!(Json::from_i64(2).unwrap(), Json::from_bool(false).unwrap()); } + + #[test] + fn test_cmp_json_between_json_type() { + let mut ctx = EvalContext::default(); + + let cmp = [ + ( + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-14 13:14:15", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Less, + ), + ( + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + 
.unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-12 13:14:15", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + // DateTime is always greater than Date + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Json::from_time( + Time::parse(&mut ctx, "1998-06-14", TimeType::Date, 0, false).unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(), + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Ordering::Less, + ), + ( + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(), + Ordering::Greater, + ), + ( + // Time is always greater than Date + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_time( + Time::parse(&mut ctx, "1998-06-12", TimeType::Date, 0, false).unwrap(), + ) + .unwrap(), + Ordering::Greater, + ), + ( + // Time is always less than DateTime + Json::from_duration(Duration::parse(&mut ctx, "12:13:16", 0).unwrap()).unwrap(), + Json::from_time( + Time::parse( + &mut ctx, + "1998-06-12 11:11:11", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(), + Ordering::Less, + ), + ]; + + for (l, r, result) in cmp { + assert_eq!(l.cmp(&r), result) + } + } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/constants.rs b/components/tidb_query_datatype/src/codec/mysql/json/constants.rs index 57927b4b99c..7dec22a6c0b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/constants.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/constants.rs @@ -11,6 +11,8 @@ pub const LITERAL_LEN: usize = 1; pub const U16_LEN: usize = 2; pub const U32_LEN: usize = 4; pub const NUMBER_LEN: usize = 8; +pub 
const TIME_LEN: usize = NUMBER_LEN; +pub const DURATION_LEN: usize = NUMBER_LEN + U32_LEN; pub const HEADER_LEN: usize = ELEMENT_COUNT_LEN + SIZE_LEN; // element size + data size pub const KEY_OFFSET_LEN: usize = U32_LEN; pub const KEY_LEN_LEN: usize = U16_LEN; diff --git a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs index 51ca3ba0da0..867d8ec2c20 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/jcodec.rs @@ -260,6 +260,10 @@ pub trait JsonDecoder: NumberDecoder { let (opaque_bytes_len, len_len) = NumberCodec::try_decode_var_u64(&value[1..])?; self.read_bytes(opaque_bytes_len as usize + len_len + 1)? } + JsonType::Date | JsonType::Datetime | JsonType::Timestamp => { + self.read_bytes(TIME_LEN)? + } + JsonType::Time => self.read_bytes(DURATION_LEN)?, }; Ok(Json::new(tp, Vec::from(value))) } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs index 28c4d275471..70321080ef7 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_type.rs @@ -14,6 +14,9 @@ const JSON_TYPE_ARRAY: &[u8] = b"ARRAY"; const JSON_TYPE_BIT: &[u8] = b"BIT"; const JSON_TYPE_BLOB: &[u8] = b"BLOB"; const JSON_TYPE_OPAQUE: &[u8] = b"OPAQUE"; +const JSON_TYPE_DATE: &[u8] = b"DATE"; +const JSON_TYPE_DATETIME: &[u8] = b"DATETIME"; +const JSON_TYPE_TIME: &[u8] = b"TIME"; impl<'a> JsonRef<'a> { /// `json_type` is the implementation for @@ -43,6 +46,10 @@ impl<'a> JsonRef<'a> { Ok(FieldTypeTp::Bit) => JSON_TYPE_BIT, _ => JSON_TYPE_OPAQUE, }, + JsonType::Date => JSON_TYPE_DATE, + JsonType::Datetime => JSON_TYPE_DATETIME, + JsonType::Timestamp => JSON_TYPE_DATETIME, + JsonType::Time => JSON_TYPE_TIME, } } } diff --git 
a/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs index 5cfc8bc908d..f95c08cf958 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_unquote.rs @@ -24,6 +24,16 @@ impl<'a> JsonRef<'a> { let s = self.get_str()?; unquote_string(s) } + JsonType::Date + | JsonType::Datetime + | JsonType::Timestamp + | JsonType::Time + | JsonType::Opaque => { + let s = self.to_string(); + // Remove the quotes of output + assert!(s.len() > 2); + Ok(s[1..s.len() - 1].to_string()) + } _ => Ok(self.to_string()), } } @@ -83,6 +93,13 @@ mod tests { use std::collections::BTreeMap; use super::{super::Json, *}; + use crate::{ + codec::{ + data_type::Duration, + mysql::{Time, TimeType}, + }, + expr::EvalContext, + }; #[test] fn test_decode_escaped_unicode() { @@ -161,4 +178,29 @@ mod tests { ); } } + + #[test] + fn test_json_unquote_time_duration() { + let mut ctx = EvalContext::default(); + + let time = Json::from_time( + Time::parse( + &mut ctx, + "1998-06-13 12:13:14", + TimeType::DateTime, + 0, + false, + ) + .unwrap(), + ) + .unwrap(); + assert_eq!( + time.as_ref().unquote().unwrap(), + "1998-06-13 12:13:14.000000" + ); + + let duration = + Json::from_duration(Duration::parse(&mut ctx, "12:13:14", 0).unwrap()).unwrap(); + assert_eq!(duration.as_ref().unquote().unwrap(), "12:13:14.000000"); + } } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index c4e3a9ebf5c..f21f789c0d0 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -75,7 +75,11 @@ mod json_remove; mod json_type; pub mod json_unquote; -use std::{collections::BTreeMap, convert::TryFrom, str}; +use std::{ + collections::BTreeMap, + convert::{TryFrom, TryInto}, + str, +}; use 
codec::number::{NumberCodec, F64_SIZE, I64_SIZE}; use constants::{JSON_LITERAL_FALSE, JSON_LITERAL_NIL, JSON_LITERAL_TRUE}; @@ -91,7 +95,6 @@ use crate::{ codec::{ convert::ConvertTo, data_type::{BytesRef, Decimal, Real}, - mysql, mysql::{Duration, Time, TimeType}, }, expr::EvalContext, @@ -114,6 +117,10 @@ pub enum JsonType { // It's a special value for the compatibility with MySQL. // It will store the raw buffer containing unexpected type (e.g. Binary). Opaque = 0x0d, + Date = 0x0e, + Datetime = 0x0f, + Timestamp = 0x10, + Time = 0x11, } impl TryFrom for JsonType { @@ -225,19 +232,43 @@ impl<'a> JsonRef<'a> { FieldTypeTp::from_u8(val[0]).ok_or(box_err!("invalid opaque type code")) } + pub fn get_time(&self) -> Result

{ + pub fn new() -> Self { + Self { + inner: P::capture(), + _phantom: PhantomData, + } + } + + pub fn delta(&self) -> P { + P::capture() - self.inner + } +} + +impl Default for PerfStatisticsInstant

{ + fn default() -> Self { + Self::new() + } +} + +impl slog::KV for PerfStatisticsInstant

{ + fn serialize( + &self, + record: &::slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + slog::KV::serialize(&self.inner, record, serializer) + } +} + +impl PerfContextFields for ReadPerfContext { + fn capture() -> Self { + let perf_context = PerfContext::get(); + ReadPerfContext { + user_key_comparison_count: perf_context.user_key_comparison_count(), + block_cache_hit_count: perf_context.block_cache_hit_count(), + block_read_count: perf_context.block_read_count(), + block_read_byte: perf_context.block_read_byte(), + block_read_time: perf_context.block_read_time(), + block_cache_index_hit_count: perf_context.block_cache_index_hit_count(), + index_block_read_count: perf_context.index_block_read_count(), + block_cache_filter_hit_count: perf_context.block_cache_filter_hit_count(), + filter_block_read_count: perf_context.filter_block_read_count(), + block_checksum_time: perf_context.block_checksum_time(), + block_decompress_time: perf_context.block_decompress_time(), + get_read_bytes: perf_context.get_read_bytes(), + iter_read_bytes: perf_context.iter_read_bytes(), + internal_key_skipped_count: perf_context.internal_key_skipped_count(), + internal_delete_skipped_count: perf_context.internal_delete_skipped_count(), + internal_recent_skipped_count: perf_context.internal_recent_skipped_count(), + get_snapshot_time: perf_context.get_snapshot_time(), + get_from_memtable_time: perf_context.get_from_memtable_time(), + get_from_memtable_count: perf_context.get_from_memtable_count(), + get_post_process_time: perf_context.get_post_process_time(), + get_from_output_files_time: perf_context.get_from_output_files_time(), + seek_on_memtable_time: perf_context.seek_on_memtable_time(), + seek_on_memtable_count: perf_context.seek_on_memtable_count(), + next_on_memtable_count: perf_context.next_on_memtable_count(), + prev_on_memtable_count: perf_context.prev_on_memtable_count(), + seek_child_seek_time: perf_context.seek_child_seek_time(), + 
seek_child_seek_count: perf_context.seek_child_seek_count(), + seek_min_heap_time: perf_context.seek_min_heap_time(), + seek_max_heap_time: perf_context.seek_max_heap_time(), + seek_internal_seek_time: perf_context.seek_internal_seek_time(), + db_mutex_lock_nanos: perf_context.db_mutex_lock_nanos(), + db_condition_wait_nanos: perf_context.db_condition_wait_nanos(), + read_index_block_nanos: perf_context.read_index_block_nanos(), + read_filter_block_nanos: perf_context.read_filter_block_nanos(), + new_table_block_iter_nanos: perf_context.new_table_block_iter_nanos(), + new_table_iterator_nanos: perf_context.new_table_iterator_nanos(), + block_seek_nanos: perf_context.block_seek_nanos(), + find_table_nanos: perf_context.find_table_nanos(), + bloom_memtable_hit_count: perf_context.bloom_memtable_hit_count(), + bloom_memtable_miss_count: perf_context.bloom_memtable_miss_count(), + bloom_sst_hit_count: perf_context.bloom_sst_hit_count(), + bloom_sst_miss_count: perf_context.bloom_sst_miss_count(), + get_cpu_nanos: perf_context.get_cpu_nanos(), + iter_next_cpu_nanos: perf_context.iter_next_cpu_nanos(), + iter_prev_cpu_nanos: perf_context.iter_prev_cpu_nanos(), + iter_seek_cpu_nanos: perf_context.iter_seek_cpu_nanos(), + encrypt_data_nanos: perf_context.encrypt_data_nanos(), + decrypt_data_nanos: perf_context.decrypt_data_nanos(), + } + } +} + +impl PerfContextFields for WritePerfContext { + fn capture() -> Self { + let perf_context = PerfContext::get(); + WritePerfContext { + write_wal_time: perf_context.write_wal_time(), + pre_and_post_process: perf_context.write_pre_and_post_process_time(), + write_memtable_time: perf_context.write_memtable_time(), + write_thread_wait: perf_context.write_thread_wait_nanos(), + db_mutex_lock_nanos: perf_context.db_mutex_lock_nanos(), + write_scheduling_flushes_compactions_time: perf_context + .write_scheduling_flushes_compactions_time(), + db_condition_wait_nanos: perf_context.db_condition_wait_nanos(), + write_delay_time: 
perf_context.write_delay_time(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_field_operations() { + let f1 = ReadPerfContext { + internal_key_skipped_count: 1, + internal_delete_skipped_count: 2, + block_cache_hit_count: 3, + block_read_count: 4, + block_read_byte: 5, + ..Default::default() + }; + let f2 = ReadPerfContext { + internal_key_skipped_count: 2, + internal_delete_skipped_count: 3, + block_cache_hit_count: 5, + block_read_count: 7, + block_read_byte: 11, + ..Default::default() + }; + let f3 = f1 + f2; + assert_eq!(f3.internal_key_skipped_count, 3); + assert_eq!(f3.block_cache_hit_count, 8); + assert_eq!(f3.block_read_byte, 16); + + let mut f3 = f1; + f3 += f2; + assert_eq!(f3.internal_key_skipped_count, 3); + assert_eq!(f3.block_cache_hit_count, 8); + assert_eq!(f3.block_read_byte, 16); + + let f3 = f2 - f1; + assert_eq!(f3.internal_key_skipped_count, 1); + assert_eq!(f3.block_cache_hit_count, 2); + assert_eq!(f3.block_read_byte, 6); + + let mut f3 = f2; + f3 -= f1; + assert_eq!(f3.internal_key_skipped_count, 1); + assert_eq!(f3.block_cache_hit_count, 2); + assert_eq!(f3.block_read_byte, 6); + } + + #[test] + fn test_deref() { + let mut stats = ReadPerfContext { + internal_key_skipped_count: 1, + internal_delete_skipped_count: 2, + block_cache_hit_count: 3, + block_read_count: 4, + block_read_byte: 5, + ..Default::default() + }; + assert_eq!(stats.block_cache_hit_count, 3); + stats.block_cache_hit_count = 6; + assert_eq!(stats.block_cache_hit_count, 6); + } +} diff --git a/components/engine_tirocks/src/write_batch.rs b/components/engine_tirocks/src/write_batch.rs new file mode 100644 index 00000000000..1671e686917 --- /dev/null +++ b/components/engine_tirocks/src/write_batch.rs @@ -0,0 +1,383 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{Result, WriteBatchExt as _}; +use tirocks::{option::WriteOptions, WriteBatch}; + +use crate::{r2e, RocksEngine}; + +const WRITE_BATCH_MAX_BATCH_NUM: usize = 16; +const WRITE_BATCH_MAX_KEY_NUM: usize = 16; + +impl engine_traits::WriteBatchExt for RocksEngine { + type WriteBatch = RocksWriteBatchVec; + + const WRITE_BATCH_MAX_KEYS: usize = 256; + + #[inline] + fn write_batch(&self) -> RocksWriteBatchVec { + self.write_batch_with_cap(1) + } + + #[inline] + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity(self, cap) + } +} + +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. +pub struct RocksWriteBatchVec { + engine: RocksEngine, + wbs: Vec, + save_points: Vec, + index: usize, +} + +impl RocksWriteBatchVec { + pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + let wb = WriteBatch::with_capacity(cap); + RocksWriteBatchVec { + engine: engine.clone(), + wbs: vec![wb], + save_points: vec![], + index: 0, + } + } + + /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. 
+ #[inline(always)] + fn check_switch_batch(&mut self) { + if self.engine.multi_batch_write() + && self.wbs[self.index].count() >= WRITE_BATCH_MAX_KEY_NUM + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(WriteBatch::default()); + } + } + } +} + +/// Converts engine_traits options to tirocks write options. +pub fn to_tirocks_opt(opt: &engine_traits::WriteOptions) -> WriteOptions { + let mut r = WriteOptions::default(); + r.set_sync(opt.sync()) + .set_no_slowdown(opt.no_slowdown()) + .set_disable_wal(opt.disable_wal()) + + // TODO: enable it. + .set_memtable_insert_hint_per_batch(false); + r +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &engine_traits::WriteOptions) -> Result { + let opts = to_tirocks_opt(opts); + if self.engine.multi_batch_write() { + self.engine + .as_inner() + .write_multi(&opts, &mut self.wbs[..=self.index]) + .map_err(r2e) + } else { + self.engine + .as_inner() + .write(&opts, &mut self.wbs[0]) + .map_err(r2e) + } + } + + fn data_size(&self) -> usize { + let mut size = 0; + for w in &self.wbs[..=self.index] { + size += w.as_bytes().len(); + } + size + } + + fn count(&self) -> usize { + let mut size = 0; + for w in &self.wbs[..=self.index] { + size += w.count(); + } + size + } + + fn is_empty(&self) -> bool { + self.wbs[0].as_bytes().is_empty() + } + + #[inline] + fn should_write_to_engine(&self) -> bool { + if self.engine.multi_batch_write() { + self.index >= WRITE_BATCH_MAX_BATCH_NUM + } else { + self.wbs[0].count() > RocksEngine::WRITE_BATCH_MAX_KEYS + } + } + + fn clear(&mut self) { + for i in 0..=self.index { + self.wbs[i].clear(); + } + self.save_points.clear(); + // Avoid making the wbs too big at one time, then the memory will be kept + // after reusing + if self.index > WRITE_BATCH_MAX_BATCH_NUM { + self.wbs.shrink_to(WRITE_BATCH_MAX_BATCH_NUM); + } + self.index = 0; + } + + fn set_save_point(&mut self) { + self.wbs[self.index].set_save_point(); + 
self.save_points.push(self.index); + } + + fn pop_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(r2e); + } + Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "no save point", + ), + )) + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(r2e); + } + Err(engine_traits::Error::Engine( + engine_traits::Status::with_error( + engine_traits::Code::InvalidArgument, + "no save point", + ), + )) + } + + fn merge(&mut self, mut other: Self) -> Result<()> { + if !self.engine.multi_batch_write() { + let self_wb = &mut self.wbs[0]; + for wb in &other.wbs[..=other.index] { + self_wb.append(wb).map_err(r2e)?; + } + return Ok(()); + } + let self_wb = &mut self.wbs[self.index]; + let mut other_start = 0; + if self_wb.count() < WRITE_BATCH_MAX_KEY_NUM { + self_wb.append(&other.wbs[0]).map_err(r2e)?; + other_start = 1; + } + // From this point, either of following statements is true: + // - self_wb.count() >= WRITE_BATCH_MAX_KEY_NUM + // - other.index == 0 + if other.index >= other_start { + for wb in other.wbs.drain(other_start..=other.index) { + self.index += 1; + if self.wbs.len() == self.index { + self.wbs.push(wb); + } else { + self.wbs[self.index] = wb; + } + } + } + Ok(()) + } +} + +impl engine_traits::Mutable for RocksWriteBatchVec { + fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index].put(handle, key, value).map_err(r2e) + } + + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index].put(handle, key, value).map_err(r2e) + } + + fn 
delete(&mut self, key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index].delete(handle, key).map_err(r2e) + } + + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index].delete(handle, key).map_err(r2e) + } + + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.as_inner().default_cf(); + self.wbs[self.index] + .delete_range(handle, begin_key, end_key) + .map_err(r2e) + } + + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + self.check_switch_batch(); + let handle = self.engine.cf(cf)?; + self.wbs[self.index] + .delete_range(handle, begin_key, end_key) + .map_err(r2e) + } +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use engine_traits::{Mutable, Peekable, WriteBatch, WriteBatchExt, CF_DEFAULT}; + use tempfile::Builder; + + use super::*; + use crate::{ + cf_options::RocksCfOptions, db_options::RocksDbOptions, new_engine_opt, RocksEngine, + }; + + fn new_engine(path: &Path, multi_batch_write: bool) -> RocksEngine { + let mut db_opt = RocksDbOptions::default(); + db_opt + .set_unordered_write(false) + .set_enable_pipelined_write(!multi_batch_write) + .set_multi_batch_write(multi_batch_write); + let engine = new_engine_opt( + &path.join("db"), + db_opt, + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert_eq!( + engine.as_inner().db_options().multi_batch_write(), + multi_batch_write + ); + engine + } + + #[test] + fn test_should_write_to_engine_with_pipeline_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let engine = new_engine(path.path(), false); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + 
assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + + let v = engine.get_value(b"aaa").unwrap(); + + assert!(v.is_some()); + assert_eq!(v.unwrap(), b"bbb"); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let engine = new_engine(path.path(), true); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _ in 0..WRITE_BATCH_MAX_BATCH_NUM * WRITE_BATCH_MAX_KEY_NUM { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_write_batch_merge() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + for multi_batch_write in &[false, true] { + let engine = new_engine(path.path(), *multi_batch_write); + let mut wb = engine.write_batch(); + for _ in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb.count(), RocksEngine::WRITE_BATCH_MAX_KEYS); + + let mut wb2 = engine.write_batch(); + for _ in 0..WRITE_BATCH_MAX_KEY_NUM / 2 { + wb2.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb2.count(), WRITE_BATCH_MAX_KEY_NUM / 
2); + // The only batch should be moved directly. + wb.merge(wb2).unwrap(); + assert_eq!( + wb.count(), + RocksEngine::WRITE_BATCH_MAX_KEYS + WRITE_BATCH_MAX_KEY_NUM / 2 + ); + if *multi_batch_write { + assert_eq!( + wb.wbs.len(), + RocksEngine::WRITE_BATCH_MAX_KEYS / WRITE_BATCH_MAX_KEY_NUM + 1 + ); + } + + let mut wb3 = engine.write_batch(); + for _ in 0..WRITE_BATCH_MAX_KEY_NUM / 2 * 3 { + wb3.put(b"aaa", b"bbb").unwrap(); + } + assert_eq!(wb3.count(), WRITE_BATCH_MAX_KEY_NUM / 2 * 3); + // The half batch should be merged together, and then move the left one. + wb.merge(wb3).unwrap(); + assert_eq!( + wb.count(), + RocksEngine::WRITE_BATCH_MAX_KEYS + WRITE_BATCH_MAX_KEY_NUM * 2 + ); + if *multi_batch_write { + assert_eq!( + wb.wbs.len(), + RocksEngine::WRITE_BATCH_MAX_KEYS / WRITE_BATCH_MAX_KEY_NUM + 2 + ); + } + } + } +} diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index c53e06792da..32a23cd070e 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -656,28 +656,6 @@ lazy_static! { "Pending read index count." 
).unwrap(); - pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = - register_histogram_vec!( - "tikv_raftstore_apply_perf_context_time_duration_secs", - "Bucketed histogram of request wait time duration.", - &["type"], - exponential_buckets(0.00001, 2.0, 26).unwrap() - ).unwrap(); - - pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM: HistogramVec = - register_histogram_vec!( - "tikv_raftstore_store_perf_context_time_duration_secs", - "Bucketed histogram of request wait time duration.", - &["type"], - exponential_buckets(0.00001, 2.0, 26).unwrap() - ).unwrap(); - - pub static ref APPLY_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration= - auto_flush_from!(APPLY_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); - - pub static ref STORE_PERF_CONTEXT_TIME_HISTOGRAM_STATIC: PerfContextTimeDuration= - auto_flush_from!(STORE_PERF_CONTEXT_TIME_HISTOGRAM, PerfContextTimeDuration); - pub static ref READ_QPS_TOPN: GaugeVec = register_gauge_vec!( "tikv_read_qps_topn", From 585763a39348b12e2b5c54430cc9b3ea916f65e6 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 26 Sep 2022 14:29:45 +0800 Subject: [PATCH 244/676] commands: use ReaderWithStats in flashback_to_version command (#13525) ref tikv/tikv#13303 Use `ReaderWithStats` in `flashback_to_version` command to collect the statistics info as much as possible. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- src/storage/txn/actions/cleanup.rs | 4 +--- src/storage/txn/actions/flashback_to_version.rs | 4 ++-- src/storage/txn/commands/flashback_to_version.rs | 9 ++++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index 19cb90f0a22..c72905c8910 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -39,14 +39,12 @@ pub fn cleanup( ErrorInner::KeyIsLocked(lock.clone().into_lock_info(key.into_raw()?)).into(), ); } - - let is_pessimistic_txn = !lock.for_update_ts.is_zero(); rollback_lock( txn, reader, key, lock, - is_pessimistic_txn, + lock.is_pessimistic_txn(), !protect_rollback, ) } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 8047d5dd304..0b9f0461297 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -75,9 +75,9 @@ pub fn flashback_to_version_read_write( Ok((key_old_writes, has_remain_writes)) } -pub fn flashback_to_version( +pub fn flashback_to_version( txn: &mut MvccTxn, - reader: &mut SnapshotReader, + reader: &mut SnapshotReader, next_lock_key: &mut Option, next_write_key: &mut Option, key_locks: Vec<(Key, Lock)>, diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 61086020b09..b4255138eeb 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -9,8 +9,8 @@ use crate::storage::{ mvcc::{MvccTxn, SnapshotReader}, txn::{ commands::{ - Command, CommandExt, FlashbackToVersionReadPhase, ResponsePolicy, TypedCommand, - WriteCommand, WriteContext, WriteResult, + Command, CommandExt, FlashbackToVersionReadPhase, ReaderWithStats, ResponsePolicy, + TypedCommand, WriteCommand, WriteContext, WriteResult, }, flashback_to_version, 
latch, Result, }, @@ -62,7 +62,10 @@ impl CommandExt for FlashbackToVersion { impl WriteCommand for FlashbackToVersion { fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { - let mut reader = SnapshotReader::new_with_ctx(self.version, snapshot, &self.ctx); + let mut reader = ReaderWithStats::new( + SnapshotReader::new_with_ctx(self.version, snapshot, &self.ctx), + context.statistics, + ); let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); let mut next_lock_key = self.next_lock_key.take(); From 975c0543238a28c9d8a2aaf63009c429e6c2218d Mon Sep 17 00:00:00 2001 From: Hangjie Mo Date: Mon, 26 Sep 2022 17:29:44 +0800 Subject: [PATCH 245/676] metrics: fix grafana expr for `tikv_gc_compaction_filtered` (#13536) close tikv/tikv#13537 fix error grafana expr for `tikv_gc_compaction_filtered ` Signed-off-by: Jason Mo Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index b48aa216a93..9d64207c214 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -22555,7 +22555,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_gc_compaction_filtered{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by {key_mode}", + "expr": "sum(rate(tikv_gc_compaction_filtered{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (key_mode)", "format": "time_series", "instant": false, "interval": "", From a68a44e09dd4ef7de59db22b11593e7abd4e94df Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 27 Sep 2022 17:35:44 +0800 Subject: [PATCH 246/676] *: strip the RefCell off the LocalReader in ServerRaftStoreRouter (#13542) close tikv/tikv#13546 Signed-off-by: SpadeA-Tang --- components/backup-stream/src/event_loader.rs | 
10 +- components/backup/src/endpoint.rs | 22 +- components/backup/src/service.rs | 6 +- components/cdc/src/initializer.rs | 32 +- components/cdc/src/old_value.rs | 140 +-- components/raftstore/src/router.rs | 26 +- components/resolved_ts/src/cmd.rs | 18 +- components/test_backup/src/lib.rs | 4 +- components/test_raftstore/src/cluster.rs | 8 +- components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 6 +- .../test_raftstore/src/transport_simulate.rs | 4 +- components/tikv_kv/src/btree_engine.rs | 16 +- components/tikv_kv/src/lib.rs | 42 +- components/tikv_kv/src/mock_engine.rs | 2 +- components/tikv_kv/src/rocksdb_engine.rs | 2 +- src/coprocessor/endpoint.rs | 2 +- src/import/duplicate_detect.rs | 6 +- src/server/gc_worker/compaction_filter.rs | 82 +- src/server/gc_worker/gc_worker.rs | 103 +- src/server/raftkv.rs | 10 +- src/storage/kv/test_engine_builder.rs | 26 +- src/storage/metrics.rs | 2 +- src/storage/mod.rs | 24 +- src/storage/mvcc/consistency_check.rs | 24 +- src/storage/mvcc/mod.rs | 65 +- src/storage/mvcc/reader/point_getter.rs | 379 +++---- src/storage/mvcc/reader/reader.rs | 4 +- src/storage/mvcc/reader/scanner/backward.rs | 140 +-- src/storage/mvcc/reader/scanner/forward.rs | 254 ++--- src/storage/mvcc/reader/scanner/mod.rs | 157 +-- src/storage/mvcc/txn.rs | 958 ++++++++++-------- src/storage/raw/raw_mvcc.rs | 2 +- .../txn/actions/acquire_pessimistic_lock.rs | 697 +++++++------ .../txn/actions/check_data_constraint.rs | 2 +- src/storage/txn/actions/cleanup.rs | 68 +- src/storage/txn/actions/commit.rs | 82 +- .../txn/actions/flashback_to_version.rs | 78 +- src/storage/txn/actions/gc.rs | 52 +- src/storage/txn/actions/prewrite.rs | 654 ++++++------ src/storage/txn/actions/tests.rs | 61 +- src/storage/txn/commands/atomic_store.rs | 2 +- .../txn/commands/check_secondary_locks.rs | 43 +- src/storage/txn/commands/check_txn_status.rs | 285 +++--- src/storage/txn/commands/compare_and_swap.rs | 12 +- 
src/storage/txn/commands/mod.rs | 14 +- .../txn/commands/pessimistic_rollback.rs | 76 +- src/storage/txn/commands/prewrite.rs | 413 ++++---- src/storage/txn/commands/rollback.rs | 12 +- src/storage/txn/commands/txn_heart_beat.rs | 52 +- src/storage/txn/scheduler.rs | 9 +- src/storage/txn/store.rs | 2 +- tests/benches/hierarchy/engine/mod.rs | 6 +- tests/benches/hierarchy/mvcc/mod.rs | 22 +- tests/benches/hierarchy/txn/mod.rs | 21 +- tests/benches/misc/raftkv/mod.rs | 6 +- tests/benches/misc/storage/incremental_get.rs | 2 +- tests/failpoints/cases/test_gc_metrics.rs | 32 +- tests/failpoints/cases/test_gc_worker.rs | 18 +- tests/failpoints/cases/test_storage.rs | 2 +- tests/failpoints/cases/test_transaction.rs | 28 +- tests/failpoints/cases/test_ttl.rs | 4 +- tests/integrations/raftstore/test_merge.rs | 4 +- .../raftstore/test_transfer_leader.rs | 4 +- tests/integrations/storage/test_raftkv.rs | 60 +- 65 files changed, 2833 insertions(+), 2568 deletions(-) diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index fc84fab0635..90a330cf446 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -503,14 +503,14 @@ mod tests { #[test] fn test_disk_read() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); for i in 0..100 { let owned_key = format!("{:06}", i); let key = owned_key.as_bytes(); let owned_value = [i as u8; 512]; let value = owned_value.as_slice(); - must_prewrite_put(&engine, key, value, key, i * 2); - must_commit(&engine, key, i * 2, i * 2 + 1); + must_prewrite_put(&mut engine, key, value, key, i * 2); + must_commit(&mut engine, key, i * 2, i * 2 + 1); } // let compact the memtable to disk so we can see the disk read. 
engine.get_rocksdb().as_inner().compact_range(None, None); @@ -520,8 +520,8 @@ mod tests { r.set_start_key(b"".to_vec()); r.set_end_key(b"".to_vec()); - let snap = - block_on(async { tikv_kv::snapshot(&engine, SnapContext::default()).await }).unwrap(); + let snap = block_on(async { tikv_kv::snapshot(&mut engine, SnapContext::default()).await }) + .unwrap(); let mut loader = EventLoader::load_from(snap, TimeStamp::zero(), TimeStamp::max(), &r).unwrap(); diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index b24c61f4efd..1d4f9bbfdd9 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -290,7 +290,7 @@ impl BackupRange { async fn backup( &self, writer_builder: BackupWriterBuilder, - engine: E, + mut engine: E, concurrency_manager: ConcurrencyManager, backup_ts: TimeStamp, begin_ts: TimeStamp, @@ -507,7 +507,7 @@ impl BackupRange { async fn backup_raw_kv_to_file( &self, - engine: E, + mut engine: E, db: RocksEngine, limiter: &Limiter, file_name: String, @@ -1515,7 +1515,7 @@ pub mod tests { let limiter = Arc::new(IoRateLimiter::new_for_test()); let stats = limiter.statistics().unwrap(); let (tmp, endpoint) = new_endpoint_with_limiter(Some(limiter), ApiVersion::V1, false, None); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1531,13 +1531,13 @@ pub mod tests { let commit = alloc_ts(); let key = format!("{}", i); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), &vec![i; *len], key.as_bytes(), start, ); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); backup_tss.push((alloc_ts(), len)); } } @@ -1851,7 +1851,7 @@ pub mod tests { #[test] fn test_scan_error() { let (tmp, endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1862,7 +1862,7 @@ pub mod tests { let start = alloc_ts(); let 
key = format!("{}", start); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), @@ -1890,7 +1890,7 @@ pub mod tests { // Commit the perwrite. let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); // Test whether it can correctly convert not leader to region error. engine.trigger_not_leader(); @@ -1916,7 +1916,7 @@ pub mod tests { #[test] fn test_cancel() { let (temp, mut endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint .region_info @@ -1927,7 +1927,7 @@ pub mod tests { let start = alloc_ts(); let key = format!("{}", start); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), @@ -1935,7 +1935,7 @@ pub mod tests { ); // Commit the perwrite. let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); let now = alloc_ts(); let mut req = BackupRequest::default(); diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index 1281f12ad79..dd3355b1e92 100644 --- a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -160,7 +160,7 @@ mod tests { let (_server, client, mut rx) = new_rpc_suite(); let (tmp, endpoint) = new_endpoint(); - let engine = endpoint.engine.clone(); + let mut engine = endpoint.engine.clone(); endpoint.region_info.set_regions(vec![ (b"".to_vec(), b"2".to_vec(), 1), (b"2".to_vec(), b"5".to_vec(), 2), @@ -172,14 +172,14 @@ mod tests { let start = alloc_ts(); let key = format!("{}", i); must_prewrite_put( - &engine, + &mut engine, key.as_bytes(), key.as_bytes(), key.as_bytes(), start, ); let commit = alloc_ts(); - must_commit(&engine, key.as_bytes(), start, commit); + must_commit(&mut engine, key.as_bytes(), start, commit); } let now = alloc_ts(); diff --git a/components/cdc/src/initializer.rs 
b/components/cdc/src/initializer.rs index 6be880af84c..36c1636a7e8 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -652,7 +652,7 @@ mod tests { #[test] fn test_initializer_build_resolver() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let mut expected_locks = BTreeMap::>>::new(); @@ -662,7 +662,7 @@ mod tests { let k = &[b'k', i]; total_bytes += k.len(); let ts = TimeStamp::new(i as _); - must_acquire_pessimistic_lock(&engine, k, k, ts, ts); + must_acquire_pessimistic_lock(&mut engine, k, k, ts, ts); } for i in 10..100 { @@ -670,7 +670,7 @@ mod tests { total_bytes += k.len(); total_bytes += v.len(); let ts = TimeStamp::new(i as _); - must_prewrite_put(&engine, k, v, k, ts); + must_prewrite_put(&mut engine, k, v, k, ts); expected_locks .entry(ts) .or_default() @@ -760,7 +760,7 @@ mod tests { // handling `OldValue::SeekWrite` with `OldValueReader`. #[test] fn test_incremental_scanner_with_hint_min_ts() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let v_suffix = |suffix: usize| -> Vec { let suffix = suffix.to_string().into_bytes(); @@ -770,7 +770,11 @@ mod tests { v }; - let check_handling_old_value_seek_write = || { + fn check_handling_old_value_seek_write(engine: &mut E, v_suffix: F) + where + E: Engine, + F: Fn(usize) -> Vec, + { // Do incremental scan with different `hint_min_ts` values. 
for checkpoint_ts in [200, 100, 150] { let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( @@ -807,29 +811,29 @@ mod tests { block_on(th).unwrap(); worker.stop(); } - }; + } // Create the initial data with CF_WRITE L0: |zkey_110, zkey1_160| - must_prewrite_put(&engine, b"zkey", &v_suffix(100), b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); - must_prewrite_put(&engine, b"zzzz", &v_suffix(150), b"zzzz", 150); - must_commit(&engine, b"zzzz", 150, 160); + must_prewrite_put(&mut engine, b"zkey", &v_suffix(100), b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zzzz", &v_suffix(150), b"zzzz", 150); + must_commit(&mut engine, b"zzzz", 150, 160); engine .kv_engine() .unwrap() .flush_cf(CF_WRITE, true) .unwrap(); - must_prewrite_delete(&engine, b"zkey", b"zkey", 200); - check_handling_old_value_seek_write(); // For TxnEntry::Prewrite. + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 200); + check_handling_old_value_seek_write(&mut engine, v_suffix); // For TxnEntry::Prewrite. // CF_WRITE L0: |zkey_110, zkey1_160|, |zkey_210| - must_commit(&engine, b"zkey", 200, 210); + must_commit(&mut engine, b"zkey", 200, 210); engine .kv_engine() .unwrap() .flush_cf(CF_WRITE, false) .unwrap(); - check_handling_old_value_seek_write(); // For TxnEntry::Commit. + check_handling_old_value_seek_write(&mut engine, v_suffix); // For TxnEntry::Commit. 
} #[test] diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 9d60474b952..1149d8ce3e0 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -381,120 +381,120 @@ mod tests { #[test] fn test_old_value_reader() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); let k = b"k"; let key = Key::from_raw(k); - must_prewrite_put(&engine, k, b"v1", k, 1); + must_prewrite_put(&mut engine, k, b"v1", k, 1); must_get_eq(&kv_engine, &key, 2, None); must_get_eq(&kv_engine, &key, 1, None); - must_commit(&engine, k, 1, 1); + must_commit(&mut engine, k, 1, 1); must_get_eq(&kv_engine, &key, 1, Some(b"v1".to_vec())); - must_prewrite_put(&engine, k, b"v2", k, 2); + must_prewrite_put(&mut engine, k, b"v2", k, 2); must_get_eq(&kv_engine, &key, 2, Some(b"v1".to_vec())); - must_rollback(&engine, k, 2, false); + must_rollback(&mut engine, k, 2, false); - must_prewrite_put(&engine, k, b"v3", k, 3); + must_prewrite_put(&mut engine, k, b"v3", k, 3); must_get_eq(&kv_engine, &key, 3, Some(b"v1".to_vec())); - must_commit(&engine, k, 3, 3); + must_commit(&mut engine, k, 3, 3); - must_prewrite_delete(&engine, k, k, 4); + must_prewrite_delete(&mut engine, k, k, 4); must_get_eq(&kv_engine, &key, 4, Some(b"v3".to_vec())); - must_commit(&engine, k, 4, 4); + must_commit(&mut engine, k, 4, 4); - must_prewrite_put(&engine, k, vec![b'v'; 5120].as_slice(), k, 5); + must_prewrite_put(&mut engine, k, vec![b'v'; 5120].as_slice(), k, 5); must_get_eq(&kv_engine, &key, 5, None); - must_commit(&engine, k, 5, 5); + must_commit(&mut engine, k, 5, 5); - must_prewrite_delete(&engine, k, k, 6); + must_prewrite_delete(&mut engine, k, k, 6); must_get_eq(&kv_engine, &key, 6, Some(vec![b'v'; 5120])); - must_rollback(&engine, k, 6, false); + must_rollback(&mut engine, k, 6, false); - must_prewrite_put(&engine, k, b"v4", k, 7); - must_commit(&engine, k, 
7, 9); + must_prewrite_put(&mut engine, k, b"v4", k, 7); + must_commit(&mut engine, k, 7, 9); - must_acquire_pessimistic_lock(&engine, k, k, 8, 10); - must_pessimistic_prewrite_put(&engine, k, b"v5", k, 8, 10, DoPessimisticCheck); + must_acquire_pessimistic_lock(&mut engine, k, k, 8, 10); + must_pessimistic_prewrite_put(&mut engine, k, b"v5", k, 8, 10, DoPessimisticCheck); must_get_eq(&kv_engine, &key, 10, Some(b"v4".to_vec())); - must_commit(&engine, k, 8, 11); + must_commit(&mut engine, k, 8, 11); } #[test] fn test_old_value_reader_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); // PUT, Read // `--------------^ - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 10); - must_commit(&engine, b"k1", 10, 20); - must_cleanup_with_gc_fence(&engine, b"k1", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 20); + must_cleanup_with_gc_fence(&mut engine, b"k1", 20, 0, 50, true); // PUT, Read // `---------^ - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 11); - must_commit(&engine, b"k2", 11, 20); - must_cleanup_with_gc_fence(&engine, b"k2", 20, 0, 40, true); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 11); + must_commit(&mut engine, b"k2", 11, 20); + must_cleanup_with_gc_fence(&mut engine, b"k2", 20, 0, 40, true); // PUT, Read // `-----^ - must_prewrite_put(&engine, b"k3", b"v3", b"k3", 12); - must_commit(&engine, b"k3", 12, 20); - must_cleanup_with_gc_fence(&engine, b"k3", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 12); + must_commit(&mut engine, b"k3", 12, 20); + must_cleanup_with_gc_fence(&mut engine, b"k3", 20, 0, 30, true); // PUT, PUT, Read // `-----^ `----^ - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 13); - must_commit(&engine, b"k4", 13, 14); - must_prewrite_put(&engine, b"k4", b"v4x", b"k4", 15); - must_commit(&engine, b"k4", 15, 
20); - must_cleanup_with_gc_fence(&engine, b"k4", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k4", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 13); + must_commit(&mut engine, b"k4", 13, 14); + must_prewrite_put(&mut engine, b"k4", b"v4x", b"k4", 15); + must_commit(&mut engine, b"k4", 15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k4", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k4", 20, 0, 30, true); // PUT, DEL, Read // `-----^ `----^ - must_prewrite_put(&engine, b"k5", b"v5", b"k5", 13); - must_commit(&engine, b"k5", 13, 14); - must_prewrite_delete(&engine, b"k5", b"v5", 15); - must_commit(&engine, b"k5", 15, 20); - must_cleanup_with_gc_fence(&engine, b"k5", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k5", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 13); + must_commit(&mut engine, b"k5", 13, 14); + must_prewrite_delete(&mut engine, b"k5", b"v5", 15); + must_commit(&mut engine, b"k5", 15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k5", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k5", 20, 0, 30, true); // PUT, LOCK, LOCK, Read // `------------------------^ - must_prewrite_put(&engine, b"k6", b"v6", b"k6", 16); - must_commit(&engine, b"k6", 16, 20); - must_prewrite_lock(&engine, b"k6", b"k6", 25); - must_commit(&engine, b"k6", 25, 26); - must_prewrite_lock(&engine, b"k6", b"k6", 28); - must_commit(&engine, b"k6", 28, 29); - must_cleanup_with_gc_fence(&engine, b"k6", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k6", 16); + must_commit(&mut engine, b"k6", 16, 20); + must_prewrite_lock(&mut engine, b"k6", b"k6", 25); + must_commit(&mut engine, b"k6", 25, 26); + must_prewrite_lock(&mut engine, b"k6", b"k6", 28); + must_commit(&mut engine, b"k6", 28, 29); + must_cleanup_with_gc_fence(&mut engine, b"k6", 20, 0, 50, true); // PUT, LOCK, LOCK, Read // `---------^ - must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); - 
must_commit(&engine, b"k7", 16, 20); - must_prewrite_lock(&engine, b"k7", b"k7", 25); - must_commit(&engine, b"k7", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k7", 20, 0, 27, true); - must_prewrite_lock(&engine, b"k7", b"k7", 28); - must_commit(&engine, b"k7", 28, 29); + must_prewrite_put(&mut engine, b"k7", b"v7", b"k7", 16); + must_commit(&mut engine, b"k7", 16, 20); + must_prewrite_lock(&mut engine, b"k7", b"k7", 25); + must_commit(&mut engine, b"k7", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k7", 20, 0, 27, true); + must_prewrite_lock(&mut engine, b"k7", b"k7", 28); + must_commit(&mut engine, b"k7", 28, 29); // PUT, Read // * (GC fence ts is 0) - must_prewrite_put(&engine, b"k8", b"v8", b"k8", 17); - must_commit(&engine, b"k8", 17, 30); - must_cleanup_with_gc_fence(&engine, b"k8", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k8", b"v8", b"k8", 17); + must_commit(&mut engine, b"k8", 17, 30); + must_cleanup_with_gc_fence(&mut engine, b"k8", 30, 0, 0, true); // PUT, LOCK, Read // `-----------^ - must_prewrite_put(&engine, b"k9", b"v9", b"k9", 18); - must_commit(&engine, b"k9", 18, 20); - must_prewrite_lock(&engine, b"k9", b"k9", 25); - must_commit(&engine, b"k9", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k9", 20, 0, 27, true); + must_prewrite_put(&mut engine, b"k9", b"v9", b"k9", 18); + must_commit(&mut engine, b"k9", 18, 20); + must_prewrite_lock(&mut engine, b"k9", b"k9", 25); + must_commit(&mut engine, b"k9", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k9", 20, 0, 27, true); let expected_results = vec![ (b"k1", Some(b"v1")), @@ -515,16 +515,16 @@ mod tests { #[test] fn test_old_value_reuse_cursor() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let kv_engine = engine.get_rocksdb(); let value = || vec![b'v'; 1024]; for i in 0..100 { let key = format!("key-{:0>3}", i).into_bytes(); - must_prewrite_put(&engine, &key, &value(), &key, 100); - 
must_commit(&engine, &key, 100, 101); - must_prewrite_put(&engine, &key, &value(), &key, 200); - must_commit(&engine, &key, 200, 201); + must_prewrite_put(&mut engine, &key, &value(), &key, 100); + must_commit(&mut engine, &key, 100, 101); + must_prewrite_put(&mut engine, &key, &value(), &key, 200); + must_commit(&mut engine, &key, 200, 201); } let snapshot = Arc::new(kv_engine.snapshot()); @@ -586,14 +586,14 @@ mod tests { let mut cfg = DbConfig::default(); cfg.writecf.disable_auto_compactions = true; cfg.writecf.pin_l0_filter_and_index_blocks = false; - let engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); + let mut engine = TestEngineBuilder::new().build_with_cfg(&cfg).unwrap(); let kv_engine = engine.get_rocksdb(); // Key must start with `z` to pass `TsFilter`'s check. for i in 0..4 { let key = format!("zkey-{:0>3}", i).into_bytes(); - must_prewrite_put(&engine, &key, b"value", &key, 100); - must_commit(&engine, &key, 100, 101); + must_prewrite_put(&mut engine, &key, b"value", &key, 100); + must_commit(&mut engine, &key, 100, 101); kv_engine.flush_cf(CF_WRITE, true).unwrap(); } diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index f52687c311f..90cc41f2bd8 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,8 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use std::cell::RefCell; - use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; @@ -115,13 +113,13 @@ where EK: KvEngine, { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()>; - fn release_snapshot_cache(&self); + fn release_snapshot_cache(&mut self); } #[derive(Clone)] @@ -174,7 +172,7 @@ where { router: RaftRouter, local_reader: - RefCell, EK, CachedReadDelegate, StoreMetaDelegate>>, + LocalReader, EK, CachedReadDelegate, StoreMetaDelegate>, } impl Clone for ServerRaftStoreRouter @@ -194,9 +192,13 @@ impl ServerRaftStoreRouter { /// Creates a new router. pub fn new( router: RaftRouter, - reader: LocalReader, EK, CachedReadDelegate, StoreMetaDelegate>, + local_reader: LocalReader< + RaftRouter, + EK, + CachedReadDelegate, + StoreMetaDelegate, + >, ) -> ServerRaftStoreRouter { - let local_reader = RefCell::new(reader); ServerRaftStoreRouter { router, local_reader, @@ -247,19 +249,17 @@ impl RaftStoreRouter for ServerRaftStoreRouter impl LocalReadRouter for ServerRaftStoreRouter { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, ) -> RaftStoreResult<()> { - let mut local_reader = self.local_reader.borrow_mut(); - local_reader.read(read_id, req, cb); + self.local_reader.read(read_id, req, cb); Ok(()) } - fn release_snapshot_cache(&self) { - let mut local_reader = self.local_reader.borrow_mut(); - local_reader.release_snapshot_cache(); + fn release_snapshot_cache(&mut self) { + self.local_reader.release_snapshot_cache(); } } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 8d3eb3bb48d..89d7167cc26 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -316,7 +316,7 @@ mod tests { #[test] fn test_cmd_encode() { let rocks_engine = 
TestEngineBuilder::new().build().unwrap(); - let engine = MockEngineBuilder::from_rocks_engine(rocks_engine).build(); + let mut engine = MockEngineBuilder::from_rocks_engine(rocks_engine).build(); let mut reqs = vec![Modify::Put("default", Key::from_raw(b"k1"), b"v1".to_vec()).into()]; let mut req = Request::default(); @@ -326,17 +326,17 @@ mod tests { assert_eq!(has_ingest_sst, true); assert!(ChangeLog::encode_rows(changes, false).is_empty()); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 1); - must_commit(&engine, b"k1", 1, 2); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 1); + must_commit(&mut engine, b"k1", 1, 2); - must_prewrite_put(&engine, b"k1", b"v2", b"k1", 3); - must_rollback(&engine, b"k1", 3, false); + must_prewrite_put(&mut engine, b"k1", b"v2", b"k1", 3); + must_rollback(&mut engine, b"k1", 3, false); - must_prewrite_put(&engine, b"k1", &[b'v'; 512], b"k1", 4); - must_commit(&engine, b"k1", 4, 5); + must_prewrite_put(&mut engine, b"k1", &[b'v'; 512], b"k1", 4); + must_commit(&mut engine, b"k1", 4, 5); - must_prewrite_put(&engine, b"k1", b"v3", b"k1", 5); - must_rollback(&engine, b"k1", 5, false); + must_prewrite_put(&mut engine, b"k1", b"v3", b"k1", 5); + must_rollback(&mut engine, b"k1", 5, false); let k1 = Key::from_raw(b"k1"); let rows: Vec<_> = engine diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 5447e8f2b37..a45a3f52462 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -339,7 +339,7 @@ impl TestSuite { let mut total_kvs = 0; let mut total_bytes = 0; let sim = self.cluster.sim.rl(); - let engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); + let mut engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); let snap_ctx = SnapContext { pb_ctx: &self.context, ..Default::default() @@ -382,7 +382,7 @@ impl TestSuite { let mut total_bytes = 0; let sim = self.cluster.sim.rl(); - let engine = 
sim.storages[&self.context.get_peer().get_store_id()].clone(); + let mut engine = sim.storages[&self.context.get_peer().get_store_id()].clone(); let snap_ctx = SnapContext { pb_ctx: &self.context, ..Default::default() diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 9a69c7110b4..ef0f2246b7d 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -111,7 +111,7 @@ pub trait Simulator { } fn read( - &self, + &mut self, batch_id: Option, request: RaftCmdRequest, timeout: Duration, @@ -124,7 +124,7 @@ pub trait Simulator { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, @@ -415,7 +415,7 @@ impl Cluster { request: RaftCmdRequest, timeout: Duration, ) -> Result { - match self.sim.rl().read(batch_id, request.clone(), timeout) { + match self.sim.wl().read(batch_id, request.clone(), timeout) { Err(e) => { warn!("failed to read {:?}: {:?}", request, e); Err(e) @@ -439,7 +439,7 @@ impl Cluster { } } let ret = if is_read { - self.sim.rl().read(None, request.clone(), timeout) + self.sim.wl().read(None, request.clone(), timeout) } else { self.sim.rl().call_command(request.clone(), timeout) }; diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index f604ce7dff7..d6aa1eaefc8 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -444,7 +444,7 @@ impl Simulator for NodeCluster { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 859477ee5b0..a3a9455fb20 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -708,13 +708,13 @@ impl Simulator for ServerCluster { } fn async_read( - &self, + &mut self, node_id: u64, batch_id: Option, request: RaftCmdRequest, cb: 
Callback, ) { - match self.metas.get(&node_id) { + match self.metas.get_mut(&node_id) { None => { let e: RaftError = box_err!("missing sender for store {}", node_id); let mut resp = RaftCmdResponse::default(); @@ -781,7 +781,7 @@ impl Cluster { ctx.set_peer(leader); ctx.set_region_epoch(epoch); - let storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); + let mut storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 0aa778d01b0..00c12073511 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -251,7 +251,7 @@ impl> RaftStoreRouter for SimulateT impl> LocalReadRouter for SimulateTransport { fn read( - &self, + &mut self, read_id: Option, req: RaftCmdRequest, cb: Callback, @@ -259,7 +259,7 @@ impl> LocalReadRouter for SimulateT self.ch.read(read_id, req, cb) } - fn release_snapshot_cache(&self) { + fn release_snapshot_cache(&mut self) { self.ch.release_snapshot_cache() } } diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index b75c5d6851a..473b993bf39 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -103,7 +103,7 @@ impl Engine for BTreeEngine { /// warning: It returns a fake snapshot whose content will be affected by /// the later modifies! 
fn async_snapshot( - &self, + &mut self, _ctx: SnapContext<'_>, cb: EngineCallback, ) -> EngineResult<()> { @@ -307,25 +307,25 @@ pub mod tests { #[test] fn test_btree_engine() { - let engine = BTreeEngine::new(TEST_ENGINE_CFS); - test_base_curd_options(&engine) + let mut engine = BTreeEngine::new(TEST_ENGINE_CFS); + test_base_curd_options(&mut engine) } #[test] fn test_linear_of_btree_engine() { - let engine = BTreeEngine::default(); - test_linear(&engine); + let mut engine = BTreeEngine::default(); + test_linear(&mut engine); } #[test] fn test_statistic_of_btree_engine() { - let engine = BTreeEngine::default(); - test_cfs_statistics(&engine); + let mut engine = BTreeEngine::default(); + test_cfs_statistics(&mut engine); } #[test] fn test_bounds_of_btree_engine() { - let engine = BTreeEngine::default(); + let mut engine = BTreeEngine::default(); let test_data = vec![ (b"a1".to_vec(), b"v1".to_vec()), (b"a3".to_vec(), b"v3".to_vec()), diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index eec49db506c..77f9a00efcb 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -278,7 +278,7 @@ pub trait Engine: Send + Clone + 'static { /// region_modifies records each region's modifications. fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()>; - fn async_snapshot(&self, ctx: SnapContext<'_>, cb: Callback) -> Result<()>; + fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()>; /// Precheck request which has write with it's context. 
fn precheck_write_with_ctx(&self, _ctx: &Context) -> Result<()> { @@ -308,9 +308,9 @@ pub trait Engine: Send + Clone + 'static { .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) } - fn release_snapshot(&self) {} + fn release_snapshot(&mut self) {} - fn snapshot(&self, ctx: SnapContext<'_>) -> Result { + fn snapshot(&mut self, ctx: SnapContext<'_>) -> Result { let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS); wait_op!(|cb| self.async_snapshot(ctx, cb), timeout) .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) @@ -538,10 +538,10 @@ thread_local! { /// Precondition: `TLS_ENGINE_ANY` is non-null. pub unsafe fn with_tls_engine(f: F) -> R where - F: FnOnce(&E) -> R, + F: FnOnce(&mut E) -> R, { TLS_ENGINE_ANY.with(|e| { - let engine = &*(*e.get() as *const E); + let engine = &mut *(*e.get() as *mut E); f(engine) }) } @@ -583,7 +583,7 @@ pub unsafe fn destroy_tls_engine() { /// Get a snapshot of `engine`. pub fn snapshot( - engine: &E, + engine: &mut E, ctx: SnapContext<'_>, ) -> impl std::future::Future> { let begin = Instant::now(); @@ -697,12 +697,12 @@ pub mod tests { .unwrap(); } - pub fn assert_has(engine: &E, key: &[u8], value: &[u8]) { + pub fn assert_has(engine: &mut E, key: &[u8], value: &[u8]) { let snapshot = engine.snapshot(Default::default()).unwrap(); assert_eq!(snapshot.get(&Key::from_raw(key)).unwrap().unwrap(), value); } - pub fn assert_has_cf(engine: &E, cf: CfName, key: &[u8], value: &[u8]) { + pub fn assert_has_cf(engine: &mut E, cf: CfName, key: &[u8], value: &[u8]) { let snapshot = engine.snapshot(Default::default()).unwrap(); assert_eq!( snapshot.get_cf(cf, &Key::from_raw(key)).unwrap().unwrap(), @@ -710,17 +710,17 @@ pub mod tests { ); } - pub fn assert_none(engine: &E, key: &[u8]) { + pub fn assert_none(engine: &mut E, key: &[u8]) { let snapshot = engine.snapshot(Default::default()).unwrap(); assert_eq!(snapshot.get(&Key::from_raw(key)).unwrap(), None); } - pub fn assert_none_cf(engine: &E, cf: CfName, 
key: &[u8]) { + pub fn assert_none_cf(engine: &mut E, cf: CfName, key: &[u8]) { let snapshot = engine.snapshot(Default::default()).unwrap(); assert_eq!(snapshot.get_cf(cf, &Key::from_raw(key)).unwrap(), None); } - fn assert_seek(engine: &E, key: &[u8], pair: (&[u8], &[u8])) { + fn assert_seek(engine: &mut E, key: &[u8], pair: (&[u8], &[u8])) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), @@ -733,7 +733,7 @@ pub mod tests { assert_eq!(cursor.value(&mut statistics), pair.1); } - fn assert_reverse_seek(engine: &E, key: &[u8], pair: (&[u8], &[u8])) { + fn assert_reverse_seek(engine: &mut E, key: &[u8], pair: (&[u8], &[u8])) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), @@ -778,7 +778,7 @@ pub mod tests { assert_eq!(cursor.value(&mut statistics), pair.1); } - pub fn test_base_curd_options(engine: &E) { + pub fn test_base_curd_options(engine: &mut E) { test_get_put(engine); test_batch(engine); test_empty_seek(engine); @@ -788,7 +788,7 @@ pub mod tests { test_empty_write(engine); } - fn test_get_put(engine: &E) { + fn test_get_put(engine: &mut E) { assert_none(engine, b"x"); must_put(engine, b"x", b"1"); assert_has(engine, b"x", b"1"); @@ -796,7 +796,7 @@ pub mod tests { assert_has(engine, b"x", b"2"); } - fn test_batch(engine: &E) { + fn test_batch(engine: &mut E) { engine .write( &Context::default(), @@ -822,7 +822,7 @@ pub mod tests { assert_none(engine, b"y"); } - fn test_seek(engine: &E) { + fn test_seek(engine: &mut E) { must_put(engine, b"x", b"1"); assert_seek(engine, b"x", (b"x", b"1")); assert_seek(engine, b"a", (b"x", b"1")); @@ -853,7 +853,7 @@ pub mod tests { must_delete(engine, b"z"); } - fn test_near_seek(engine: &E) { + fn test_near_seek(engine: &mut E) { must_put(engine, b"x", b"1"); must_put(engine, b"z", b"2"); let snapshot = 
engine.snapshot(Default::default()).unwrap(); @@ -897,7 +897,7 @@ pub mod tests { } } - fn test_empty_seek(engine: &E) { + fn test_empty_seek(engine: &mut E) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut cursor = Cursor::new( snapshot.iter(CF_DEFAULT, IterOptions::default()).unwrap(), @@ -1042,7 +1042,7 @@ pub mod tests { } } - pub fn test_linear(engine: &E) { + pub fn test_linear(engine: &mut E) { for i in 50..50 + SEEK_BOUND * 10 { let key = format!("key_{}", i * 2); let value = format!("value_{}", i); @@ -1090,7 +1090,7 @@ pub mod tests { } } - fn test_cf(engine: &E) { + fn test_cf(engine: &mut E) { assert_none_cf(engine, "cf", b"key"); must_put_cf(engine, "cf", b"key", b"value"); assert_has_cf(engine, "cf", b"key", b"value"); @@ -1104,7 +1104,7 @@ pub mod tests { .unwrap_err(); } - pub fn test_cfs_statistics(engine: &E) { + pub fn test_cfs_statistics(engine: &mut E) { must_put(engine, b"foo", b"bar1"); must_put(engine, b"foo2", b"bar2"); must_put(engine, b"foo3", b"bar3"); // deleted diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index 3f9d0e1a098..84605a04084 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -157,7 +157,7 @@ impl Engine for MockEngine { self.base.modify_on_kv_engine(region_modifies) } - fn async_snapshot(&self, ctx: SnapContext<'_>, cb: Callback) -> Result<()> { + fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()> { self.base.async_snapshot(ctx, cb) } diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index a1e98326fe2..0ef9b5b274c 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -253,7 +253,7 @@ impl Engine for RocksEngine { Ok(()) } - fn async_snapshot(&self, _: SnapContext<'_>, cb: Callback) -> Result<()> { + fn async_snapshot(&mut self, _: SnapContext<'_>, cb: Callback) -> Result<()> { 
fail_point!("rockskv_async_snapshot", |_| Err(box_err!( "snapshot failed" ))); diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 5f1027e738a..1b7d42a8575 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -350,7 +350,7 @@ impl Endpoint { #[inline] fn async_snapshot( - engine: &E, + engine: &mut E, ctx: &ReqContext, ) -> impl std::future::Future> { let mut snap_ctx = SnapContext { diff --git a/src/import/duplicate_detect.rs b/src/import/duplicate_detect.rs index c5429315938..dbd819efbbf 100644 --- a/src/import/duplicate_detect.rs +++ b/src/import/duplicate_detect.rs @@ -350,7 +350,7 @@ mod tests { #[test] fn test_duplicate_detect() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) .build() .unwrap(); let mut data = vec![]; @@ -408,7 +408,7 @@ mod tests { // (108,18) is not repeated with (108,10). #[test] fn test_duplicate_detect_incremental() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) .build() .unwrap(); for &start in &[100, 104, 108, 112] { @@ -469,7 +469,7 @@ mod tests { #[test] fn test_duplicate_detect_rollback_and_delete() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) .build() .unwrap(); let data = vec![ diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index eef7739f979..bd5896296bb 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -750,7 +750,7 @@ pub mod test_utils { use crate::storage::kv::RocksEngine as StorageRocksEngine; /// Do a global GC with the given safe point. 
- pub fn gc_by_compact(engine: &StorageRocksEngine, _: &[u8], safe_point: u64) { + pub fn gc_by_compact(engine: &mut StorageRocksEngine, _: &[u8], safe_point: u64) { let engine = engine.get_rocksdb(); // Put a new key-value pair to ensure compaction can be triggered correctly. engine.delete_cf("write", b"znot-exists-key").unwrap(); @@ -942,31 +942,31 @@ pub mod tests { // Test compaction filter won't break basic GC rules. #[test] fn test_compaction_filter_basic() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); let value = vec![b'v'; 512]; let mut gc_runner = TestGcRunner::new(0); // GC can't delete keys after the given safe point. - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); gc_runner.safe_point(50).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); // GC can't delete keys before the safe ponit if they are latest versions. gc_runner.safe_point(200).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 120); - must_commit(&engine, b"zkey", 120, 130); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 120); + must_commit(&mut engine, b"zkey", 120, 130); // GC can't delete the latest version before the safe ponit. gc_runner.safe_point(115).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); // GC a version will also delete the key on default CF. 
gc_runner.safe_point(200).gc(&raw_engine); - must_get_none(&engine, b"zkey", 110); + must_get_none(&mut engine, b"zkey", 110); let default_key = Key::from_encoded_slice(b"zkey").append_ts(100.into()); let default_key = default_key.into_encoded(); assert!(raw_engine.get_value(&default_key).unwrap().is_none()); @@ -976,7 +976,7 @@ pub mod tests { #[test] fn test_compaction_filter_handle_deleting() { let value = vec![b'v'; 512]; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); let mut gc_runner = TestGcRunner::new(0); @@ -1001,10 +1001,10 @@ pub mod tests { }; // No key switch after the deletion mark. - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); - must_prewrite_delete(&engine, b"zkey", b"zkey", 120); - must_commit(&engine, b"zkey", 120, 130); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 120); + must_commit(&mut engine, b"zkey", 120, 130); // No GC task should be emit because the mvcc-deletion mark covers some older // versions. @@ -1022,12 +1022,12 @@ pub mod tests { .unwrap(); // Key switch after the deletion mark. 
- must_prewrite_put(&engine, b"zkey1", &value, b"zkey1", 200); - must_commit(&engine, b"zkey1", 200, 210); - must_prewrite_delete(&engine, b"zkey1", b"zkey1", 220); - must_commit(&engine, b"zkey1", 220, 230); - must_prewrite_put(&engine, b"zkey2", &value, b"zkey2", 220); - must_commit(&engine, b"zkey2", 220, 230); + must_prewrite_put(&mut engine, b"zkey1", &value, b"zkey1", 200); + must_commit(&mut engine, b"zkey1", 200, 210); + must_prewrite_delete(&mut engine, b"zkey1", b"zkey1", 220); + must_commit(&mut engine, b"zkey1", 220, 230); + must_prewrite_put(&mut engine, b"zkey2", &value, b"zkey2", 220); + must_commit(&mut engine, b"zkey2", 220, 230); // No GC task should be emit because the mvcc-deletion mark covers some older // versions. @@ -1045,17 +1045,17 @@ pub mod tests { cfg.writecf.dynamic_level_bytes = false; let dir = tempfile::TempDir::new().unwrap(); let builder = TestEngineBuilder::new().path(dir.path()); - let engine = builder.build_with_cfg(&cfg).unwrap(); + let mut engine = builder.build_with_cfg(&cfg).unwrap(); let raw_engine = engine.get_rocksdb(); let value = vec![b'v'; 512]; let mut gc_runner = TestGcRunner::new(0); for start_ts in &[100, 110, 120, 130] { - must_prewrite_put(&engine, b"zkey", &value, b"zkey", *start_ts); - must_commit(&engine, b"zkey", *start_ts, *start_ts + 5); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", *start_ts); + must_commit(&mut engine, b"zkey", *start_ts, *start_ts + 5); } - must_prewrite_delete(&engine, b"zkey", b"zkey", 140); - must_commit(&engine, b"zkey", 140, 145); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 140); + must_commit(&mut engine, b"zkey", 140, 145); // Can't perform GC because the min timestamp is greater than safe point. 
gc_runner @@ -1072,18 +1072,18 @@ pub mod tests { gc_runner.target_level = Some(6); gc_runner.safe_point(140).gc(&raw_engine); for commit_ts in &[105, 115, 125] { - must_get_none(&engine, b"zkey", commit_ts); + must_get_none(&mut engine, b"zkey", commit_ts); } // Put an extra key to make the memtable overlap with the bottommost one. - must_prewrite_put(&engine, b"zkey1", &value, b"zkey1", 200); - must_commit(&engine, b"zkey1", 200, 205); + must_prewrite_put(&mut engine, b"zkey1", &value, b"zkey1", 200); + must_commit(&mut engine, b"zkey1", 200, 205); for start_ts in &[200, 210, 220, 230] { - must_prewrite_put(&engine, b"zkey", &value, b"zkey", *start_ts); - must_commit(&engine, b"zkey", *start_ts, *start_ts + 5); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", *start_ts); + must_commit(&mut engine, b"zkey", *start_ts, *start_ts + 5); } - must_prewrite_delete(&engine, b"zkey", b"zkey", 240); - must_commit(&engine, b"zkey", 240, 245); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 240); + must_commit(&mut engine, b"zkey", 240, 245); raw_engine.flush_cf(CF_WRITE, true).unwrap(); // At internal levels can't perform GC because the threshold is not reached. 
@@ -1096,7 +1096,7 @@ pub mod tests { .safe_point(300) .gc_on_files(&raw_engine, files, CF_WRITE); for commit_ts in &[205, 215, 225, 235] { - must_get(&engine, b"zkey", commit_ts, &value); + must_get(&mut engine, b"zkey", commit_ts, &value); } } @@ -1114,14 +1114,14 @@ pub mod tests { let dir = tempfile::TempDir::new().unwrap(); let builder = TestEngineBuilder::new().path(dir.path()); - let engine = builder.build_with_cfg(&cfg).unwrap(); + let mut engine = builder.build_with_cfg(&cfg).unwrap(); let raw_engine = engine.get_rocksdb(); let mut gc_runner = TestGcRunner::new(0); // So the construction of SST files will be: // L6: |key_110| - must_prewrite_put(&engine, b"zkey", b"zvalue", b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zkey", b"zvalue", b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); gc_runner.target_level = Some(6); gc_runner.safe_point(50).gc(&raw_engine); assert_eq!(rocksdb_level_file_counts(&raw_engine, CF_WRITE)[6], 1); @@ -1129,8 +1129,8 @@ pub mod tests { // So the construction of SST files will be: // L0: |key_130, key_110| // L6: |key_110| - must_prewrite_delete(&engine, b"zkey", b"zkey", 120); - must_commit(&engine, b"zkey", 120, 130); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 120); + must_commit(&mut engine, b"zkey", 120, 130); let k_110 = Key::from_raw(b"zkey").append_ts(110.into()).into_encoded(); raw_engine.delete_cf(CF_WRITE, &k_110).unwrap(); raw_engine.flush_cf(CF_WRITE, true).unwrap(); @@ -1147,11 +1147,11 @@ pub mod tests { .gc_on_files(&raw_engine, files, CF_WRITE); assert_eq!(rocksdb_level_file_counts(&raw_engine, CF_WRITE)[5], 1); assert_eq!(rocksdb_level_file_counts(&raw_engine, CF_WRITE)[6], 1); - must_get_none(&engine, b"zkey", 200); + must_get_none(&mut engine, b"zkey", 200); // Compact the mvcc deletion mark to L6, the stale version shouldn't be exposed. 
gc_runner.target_level = Some(6); gc_runner.safe_point(200).gc(&raw_engine); - must_get_none(&engine, b"zkey", 200); + must_get_none(&mut engine, b"zkey", 200); } } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 7b03d0fb5e8..82496068b99 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -530,7 +530,7 @@ where } fn create_reader( - &self, + &mut self, key_count: usize, region: &Region, range_start_key: Key, @@ -871,16 +871,17 @@ where } fn handle_physical_scan_lock( - &self, + &mut self, _: &Context, max_ts: TimeStamp, start_key: &Key, limit: usize, regions_provider: Arc, ) -> Result> { + let store_id = self.store_id; let regions = box_try!(regions_provider.get_regions_in_range(start_key.as_encoded(), &[])) .into_iter() - .filter(move |r| find_peer(r, self.store_id).is_some()); + .filter(move |r| find_peer(r, store_id).is_some()); let mut first_round = true; let mut locks = Vec::new(); @@ -893,7 +894,7 @@ where Key::from_raw(region.get_start_key()) } }; - let snap = self.get_snapshot(self.store_id, ®ion)?; + let snap = self.get_snapshot(store_id, ®ion)?; let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); let (locks_this_region, _) = reader .scan_locks(Some(&start_key), None, |l| l.ts <= max_ts, limit) @@ -937,7 +938,7 @@ where } } - fn get_snapshot(&self, store_id: u64, region: &Region) -> Result<::Snap> { + fn get_snapshot(&mut self, store_id: u64, region: &Region) -> Result<::Snap> { let ctx = init_snap_ctx(store_id, region); let snap_ctx = SnapContext { pb_ctx: &ctx, @@ -945,7 +946,7 @@ where }; Ok(block_on(async { - tikv_kv::snapshot(&self.engine, snap_ctx).await + tikv_kv::snapshot(&mut self.engine, snap_ctx).await })?) 
} } @@ -1543,7 +1544,7 @@ pub mod test_gc_worker { } fn async_snapshot( - &self, + &mut self, ctx: SnapContext<'_>, callback: EngineCallback, ) -> EngineResult<()> { @@ -1659,7 +1660,7 @@ pub mod test_gc_worker { } fn async_snapshot( - &self, + &mut self, ctx: SnapContext<'_>, callback: EngineCallback, ) -> EngineResult<()> { @@ -2099,7 +2100,7 @@ mod tests { fn test_gc_keys_with_region_info_provider() { let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine.clone()); + let mut prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) = mpsc::channel(); let feature_gate = FeatureGate::default(); @@ -2153,10 +2154,10 @@ mod tests { for i in 0..100 { let k = format!("k{:02}", i).into_bytes(); - must_prewrite_put(&prefixed_engine, &k, b"value", &k, 101); - must_commit(&prefixed_engine, &k, 101, 102); - must_prewrite_delete(&prefixed_engine, &k, &k, 151); - must_commit(&prefixed_engine, &k, 151, 152); + must_prewrite_put(&mut prefixed_engine, &k, b"value", &k, 101); + must_commit(&mut prefixed_engine, &k, 101, 102); + must_prewrite_delete(&mut prefixed_engine, &k, &k, 151); + must_commit(&mut prefixed_engine, &k, 151, 152); } db.flush_cf(cf, true).unwrap(); @@ -2165,7 +2166,7 @@ mod tests { let k = format!("k{:02}", i).into_bytes(); // Stale MVCC-PUTs will be cleaned in write CF's compaction filter. - must_get_none(&prefixed_engine, &k, 150); + must_get_none(&mut prefixed_engine, &k, 150); // However, MVCC-DELETIONs will be kept. 
let mut raw_k = vec![b'z']; @@ -2196,7 +2197,7 @@ mod tests { fn test_gc_keys_statistics() { let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine.clone()); + let mut prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); @@ -2228,10 +2229,10 @@ mod tests { let mut keys = vec![]; for i in 0..100 { let k = format!("k{:02}", i).into_bytes(); - must_prewrite_put(&prefixed_engine, &k, b"value", &k, 101); - must_commit(&prefixed_engine, &k, 101, 102); - must_prewrite_delete(&prefixed_engine, &k, &k, 151); - must_commit(&prefixed_engine, &k, 151, 152); + must_prewrite_put(&mut prefixed_engine, &k, b"value", &k, 101); + must_commit(&mut prefixed_engine, &k, 101, 102); + must_prewrite_delete(&mut prefixed_engine, &k, &k, 151); + must_commit(&mut prefixed_engine, &k, 151, 152); keys.push(Key::from_raw(&k)); } db.flush_cf(cf, true).unwrap(); @@ -2259,7 +2260,7 @@ mod tests { let dir = tempfile::TempDir::new().unwrap(); let builder = TestEngineBuilder::new().path(dir.path()); let engine = builder.build_with_cfg(&cfg).unwrap(); - let prefixed_engine = PrefixedEngine(engine); + let mut prefixed_engine = PrefixedEngine(engine); let (tx, _rx) = mpsc::channel(); let cfg = GcConfig::default(); @@ -2343,9 +2344,10 @@ mod tests { assert_eq!(7, runner.mut_stats(GcKeyMode::raw).data.next); assert_eq!(2, runner.mut_stats(GcKeyMode::raw).data.seek); - let snapshot = - block_on(async { tikv_kv::snapshot(&prefixed_engine, SnapContext::default()).await }) - .unwrap(); + let snapshot = block_on(async { + tikv_kv::snapshot(&mut prefixed_engine, SnapContext::default()).await + }) + .unwrap(); test_raws .clone() @@ -2360,7 +2362,7 @@ mod tests { #[test] fn test_gc_keys_scan_range_limit() { let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine.clone()); + let mut prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) 
= mpsc::channel(); let cfg = GcConfig::default(); @@ -2391,10 +2393,10 @@ mod tests { let cf = get_cf_handle(&db, CF_WRITE).unwrap(); // Generate some tombstone for i in 10u64..30 { - must_rollback(&prefixed_engine, b"k2\x00", i, true); + must_rollback(&mut prefixed_engine, b"k2\x00", i, true); } db.flush_cf(cf, true).unwrap(); - must_gc(&prefixed_engine, b"k2\x00", 30); + must_gc(&mut prefixed_engine, b"k2\x00", 30); // Test tombstone counter works assert_eq!(runner.mut_stats(GcKeyMode::txn).write.seek_tombstone, 0); @@ -2449,8 +2451,8 @@ mod tests { let versions = (MAX_TXN_WRITE_SIZE - 1) / key_size + 4; for start_ts in (1..versions).map(|x| x as u64 * 2) { let commit_ts = start_ts + 1; - must_prewrite_put(&prefixed_engine, b"k2", b"v2", b"k2", start_ts); - must_commit(&prefixed_engine, b"k2", start_ts, commit_ts); + must_prewrite_put(&mut prefixed_engine, b"k2", b"v2", b"k2", start_ts); + must_commit(&mut prefixed_engine, b"k2", start_ts, commit_ts); } db.flush_cf(cf, true).unwrap(); let safepoint = versions as u64 * 2; @@ -2480,9 +2482,9 @@ mod tests { #[test] fn delete_range_when_worker_is_full() { let store_id = 1; - let engine = PrefixedEngine(TestEngineBuilder::new().build().unwrap()); - must_prewrite_put(&engine, b"key", b"value", b"key", 10); - must_commit(&engine, b"key", 10, 20); + let mut engine = PrefixedEngine(TestEngineBuilder::new().build().unwrap()); + must_prewrite_put(&mut engine, b"key", b"value", b"key", 10); + must_commit(&mut engine, b"key", 10, 20); let db = engine.kv_engine().unwrap().as_inner().clone(); let cf = get_cf_handle(&db, CF_WRITE).unwrap(); db.flush_cf(cf, true).unwrap(); @@ -2546,7 +2548,7 @@ mod tests { // After the worker starts running, the destroy range task should run, // and the key in the range will be deleted. 
rx.recv_timeout(Duration::from_secs(10)).unwrap().unwrap(); - must_get_none(&engine, b"key", 30); + must_get_none(&mut engine, b"key", 30); } #[test] @@ -2655,7 +2657,7 @@ mod tests { region_info.insert(1, r1.clone()); region_info.insert(2, r2.clone()); region_info.insert(3, r3.clone()); - let engine = MultiRocksEngine { + let mut engine = MultiRocksEngine { factory: factory.clone(), region_info, }; @@ -2689,11 +2691,17 @@ mod tests { } let k = format!("k{:02}", i).into_bytes(); let v = format!("value-{:02}", i).into_bytes(); - must_prewrite_put_on_region(&engine, region_id, &k, &v, &k, put_start_ts); - must_commit_on_region(&engine, region_id, &k, put_start_ts, put_start_ts + 1); + must_prewrite_put_on_region(&mut engine, region_id, &k, &v, &k, put_start_ts); + must_commit_on_region(&mut engine, region_id, &k, put_start_ts, put_start_ts + 1); if need_deletion { - must_prewrite_delete_on_region(&engine, region_id, &k, &k, delete_start_ts); - must_commit_on_region(&engine, region_id, &k, delete_start_ts, delete_start_ts + 1); + must_prewrite_delete_on_region(&mut engine, region_id, &k, &k, delete_start_ts); + must_commit_on_region( + &mut engine, + region_id, + &k, + delete_start_ts, + delete_start_ts + 1, + ); } } @@ -2713,7 +2721,7 @@ mod tests { let put_start_ts = 100; let delete_start_ts = 150; - let (factory, engine, _ri_provider, mut gc_runner, regions, _) = + let (factory, mut engine, _ri_provider, mut gc_runner, regions, _) = multi_gc_engine_setup(store_id, put_start_ts, delete_start_ts, true); gc_runner.gc(regions[0].clone(), 200.into()).unwrap(); @@ -2731,7 +2739,7 @@ mod tests { let k = format!("k{:02}", i).into_bytes(); // Stale MVCC-PUTs will be cleaned in write CF's compaction filter. 
- must_get_none_on_region(&engine, region_id, &k, delete_start_ts - 1); + must_get_none_on_region(&mut engine, region_id, &k, delete_start_ts - 1); // MVCC-DELETIONs is cleaned let mut raw_k = vec![b'z']; @@ -2748,7 +2756,7 @@ mod tests { let put_start_ts = 100; let delete_start_ts = 150; - let (factory, engine, ri_provider, mut gc_runner, ..) = + let (factory, mut engine, ri_provider, mut gc_runner, ..) = multi_gc_engine_setup(store_id, put_start_ts, delete_start_ts, true); let mut keys = Vec::new(); @@ -2782,10 +2790,10 @@ mod tests { if i % 2 == 0 { assert!(db.get_cf(cf, &raw_k).unwrap().is_some()); - must_get_on_region(&engine, region_id, &k, delete_start_ts - 1, &val); + must_get_on_region(&mut engine, region_id, &k, delete_start_ts - 1, &val); } else { assert!(db.get_cf(cf, &raw_k).unwrap().is_none()); - must_get_none_on_region(&engine, region_id, &k, delete_start_ts - 1); + must_get_none_on_region(&mut engine, region_id, &k, delete_start_ts - 1); } } } @@ -2817,7 +2825,7 @@ mod tests { let mut region_info = HashMap::default(); region_info.insert(1, r1.clone()); region_info.insert(2, r2.clone()); - let engine = MultiRocksEngine { + let mut engine = MultiRocksEngine { factory, region_info, }; @@ -2915,7 +2923,8 @@ mod tests { pb_ctx: &ctx, ..Default::default() }; - let snapshot = block_on(async { tikv_kv::snapshot(&engine, snap_ctx).await }).unwrap(); + let snapshot = + block_on(async { tikv_kv::snapshot(&mut engine, snap_ctx).await }).unwrap(); test_raws_region .clone() @@ -2938,7 +2947,7 @@ mod tests { ) { let store_id = 1; let put_start_ts = 100; - let (factory, engine, ri_provider, gc_runner, _, _rx) = + let (factory, mut engine, ri_provider, gc_runner, _, _rx) = multi_gc_engine_setup(store_id, put_start_ts, 0, false); let start_key = Key::from_raw(start_key); @@ -2970,10 +2979,10 @@ mod tests { if start_key <= key && key < end_key { regions.insert(region_id); assert!(db.get_cf(cf, &raw_k).unwrap().is_none()); - must_get_none_on_region(&engine, region_id, 
&k, put_start_ts + 10); + must_get_none_on_region(&mut engine, region_id, &k, put_start_ts + 10); } else { assert!(db.get_cf(cf, &raw_k).unwrap().is_some()); - must_get_on_region(&engine, region_id, &k, put_start_ts + 10, &val); + must_get_on_region(&mut engine, region_id, &k, put_start_ts + 10, &val); } } } diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index 085e8381943..eaa13995650 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -195,7 +195,7 @@ where } fn exec_snapshot( - &self, + &mut self, ctx: SnapContext<'_>, req: Request, cb: Callback>, @@ -428,7 +428,11 @@ where }) } - fn async_snapshot(&self, mut ctx: SnapContext<'_>, cb: Callback) -> kv::Result<()> { + fn async_snapshot( + &mut self, + mut ctx: SnapContext<'_>, + cb: Callback, + ) -> kv::Result<()> { fail_point!("raftkv_async_snapshot_err", |_| Err(box_err!( "injected error for async_snapshot" ))); @@ -481,7 +485,7 @@ where }) } - fn release_snapshot(&self) { + fn release_snapshot(&mut self) { self.router.release_snapshot_cache(); } diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index d42b29cfe47..f0192372e4b 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -138,29 +138,29 @@ mod tests { #[test] fn test_rocksdb() { - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .cfs(TEST_ENGINE_CFS) .build() .unwrap(); - test_base_curd_options(&engine) + test_base_curd_options(&mut engine) } #[test] fn test_rocksdb_linear() { - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .cfs(TEST_ENGINE_CFS) .build() .unwrap(); - test_linear(&engine); + test_linear(&mut engine); } #[test] fn test_rocksdb_statistic() { - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .cfs(TEST_ENGINE_CFS) .build() .unwrap(); - test_cfs_statistics(&engine); + test_cfs_statistics(&mut engine); } #[test] @@ -178,27 +178,27 @@ mod 
tests { must_put_cf(&engine, "cf", b"k", b"v1"); } { - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .path(dir.path()) .cfs(TEST_ENGINE_CFS) .build() .unwrap(); - assert_has_cf(&engine, "cf", b"k", b"v1"); + assert_has_cf(&mut engine, "cf", b"k", b"v1"); } } #[test] fn test_rocksdb_perf_statistics() { - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .cfs(TEST_ENGINE_CFS) .build() .unwrap(); - test_perf_statistics(&engine); + test_perf_statistics(&mut engine); } #[test] fn test_max_skippable_internal_keys_error() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); must_put(&engine, b"foo", b"bar"); must_delete(&engine, b"foo"); must_put(&engine, b"foo1", b"bar1"); @@ -224,7 +224,7 @@ mod tests { ); } - fn test_perf_statistics(engine: &E) { + fn test_perf_statistics(engine: &mut E) { must_put(engine, b"foo", b"bar1"); must_put(engine, b"foo2", b"bar2"); must_put(engine, b"foo3", b"bar3"); // deleted @@ -268,7 +268,7 @@ mod tests { #[test] fn test_prefix_seek_skip_tombstone() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); engine .put_cf( &Context::default(), diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 9a5f37011aa..3dd5fc2e10a 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -347,7 +347,7 @@ where tls_cell.with(|c| { let mut c = c.borrow_mut(); let perf_context = c.get_or_insert_with(|| { - with_tls_engine(|engine: &E| { + with_tls_engine(|engine: &mut E| { Box::new(engine.kv_engine().unwrap().get_perf_context( PerfLevel::Uninitialized, PerfContextKind::Storage(cmd.get_str()), diff --git a/src/storage/mod.rs b/src/storage/mod.rs index e9d1f06e524..abdfcd333ac 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -315,7 +315,7 @@ impl Storage { /// Get a snapshot of `engine`. 
fn snapshot( - engine: &E, + engine: &mut E, ctx: SnapContext<'_>, ) -> impl std::future::Future> { kv::snapshot(engine, ctx) @@ -324,11 +324,11 @@ impl Storage { } #[cfg(test)] - pub fn get_snapshot(&self) -> E::Snap { + pub fn get_snapshot(&mut self) -> E::Snap { self.engine.snapshot(Default::default()).unwrap() } - pub fn release_snapshot(&self) { + pub fn release_snapshot(&mut self) { self.engine.release_snapshot(); } @@ -349,7 +349,7 @@ impl Storage { } #[inline] - fn with_tls_engine(f: impl FnOnce(&E) -> R) -> R { + fn with_tls_engine(f: impl FnOnce(&mut E) -> R) -> R { // Safety: the read pools ensure that a TLS engine exists. unsafe { with_tls_engine(f) } } @@ -2971,7 +2971,7 @@ impl Engine for TxnTestEngine { } fn async_snapshot( - &self, + &mut self, ctx: SnapContext<'_>, cb: tikv_kv::Callback, ) -> tikv_kv::Result<()> { @@ -3444,7 +3444,7 @@ mod tests { #[test] fn test_prewrite_blocks_read() { use kvproto::kvrpcpb::ExtraOp; - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) .build() .unwrap(); @@ -8682,7 +8682,7 @@ mod tests { // they should not have overlapped ts, which is an expected property. 
#[test] fn test_overlapped_ts_rollback_before_prewrite() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) .build() @@ -8787,8 +8787,8 @@ mod tests { .unwrap(); rx.recv().unwrap(); - must_unlocked(&engine, k2); - must_written(&engine, k2, 10, 10, WriteType::Rollback); + must_unlocked(&mut engine, k2); + must_written(&mut engine, k2, 10, 10, WriteType::Rollback); // T1 prewrites, start_ts = 1, for_update_ts = 3 storage @@ -9003,7 +9003,7 @@ mod tests { #[test] fn test_resolve_commit_pessimistic_locks() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) .build() .unwrap(); let (tx, rx) = channel(); @@ -9083,7 +9083,7 @@ mod tests { // Pessimistically rollback the k2 lock. // Non lite lock resolve on k1 and k2, there should no errors as lock on k2 is // pessimistic type. - must_rollback(&storage.engine, b"k2", 10, false); + must_rollback(&mut storage.engine, b"k2", 10, false); let mut temp_map = HashMap::default(); temp_map.insert(10.into(), 20.into()); storage @@ -9169,7 +9169,7 @@ mod tests { // Unlock the k6 first. // Non lite lock resolve on k5 and k6, error should be reported. 
- must_rollback(&storage.engine, b"k6", 10, true); + must_rollback(&mut storage.engine, b"k6", 10, true); storage .sched_txn_command( commands::ResolveLock::new( diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index fba4f207054..487ae61d5e8 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -448,18 +448,18 @@ mod tests { #[test] fn test_mvcc_checksum() { - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, b"zAAAAA", b"value", b"PRIMARY", 100); - must_commit(&engine, b"zAAAAA", 100, 101); - must_prewrite_put(&engine, b"zCCCCC", b"value", b"PRIMARY", 110); - must_commit(&engine, b"zCCCCC", 110, 111); - - must_prewrite_put(&engine, b"zBBBBB", b"value", b"PRIMARY", 200); - must_commit(&engine, b"zBBBBB", 200, 201); - must_prewrite_put(&engine, b"zDDDDD", b"value", b"PRIMARY", 200); - must_rollback(&engine, b"zDDDDD", 200, false); - must_prewrite_put(&engine, b"zFFFFF", b"value", b"PRIMARY", 200); - must_prewrite_delete(&engine, b"zGGGGG", b"PRIMARY", 200); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, b"zAAAAA", b"value", b"PRIMARY", 100); + must_commit(&mut engine, b"zAAAAA", 100, 101); + must_prewrite_put(&mut engine, b"zCCCCC", b"value", b"PRIMARY", 110); + must_commit(&mut engine, b"zCCCCC", 110, 111); + + must_prewrite_put(&mut engine, b"zBBBBB", b"value", b"PRIMARY", 200); + must_commit(&mut engine, b"zBBBBB", 200, 201); + must_prewrite_put(&mut engine, b"zDDDDD", b"value", b"PRIMARY", 200); + must_rollback(&mut engine, b"zDDDDD", 200, false); + must_prewrite_put(&mut engine, b"zFFFFF", b"value", b"PRIMARY", 200); + must_prewrite_delete(&mut engine, b"zGGGGG", b"PRIMARY", 200); let mut checksums = Vec::with_capacity(3); for &safe_point in &[150, 160, 100] { diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 3e6678e760d..6191c2ad46d 100644 --- a/src/storage/mvcc/mod.rs +++ 
b/src/storage/mvcc/mod.rs @@ -439,12 +439,17 @@ pub mod tests { } } - pub fn must_get(engine: &E, key: &[u8], ts: impl Into, expect: &[u8]) { + pub fn must_get( + engine: &mut E, + key: &[u8], + ts: impl Into, + expect: &[u8], + ) { must_get_impl(engine, None, key, ts, expect); } pub fn must_get_on_region( - engine: &E, + engine: &mut E, region_id: u64, key: &[u8], ts: impl Into, @@ -454,7 +459,7 @@ pub mod tests { } fn must_get_impl( - engine: &E, + engine: &mut E, region_id: Option, key: &[u8], ts: impl Into, @@ -478,7 +483,7 @@ pub mod tests { } pub fn must_get_no_lock_check( - engine: &E, + engine: &mut E, key: &[u8], ts: impl Into, expect: &[u8], @@ -514,12 +519,12 @@ pub mod tests { Ok(()) } - pub fn must_get_none(engine: &E, key: &[u8], ts: impl Into) { + pub fn must_get_none(engine: &mut E, key: &[u8], ts: impl Into) { must_get_none_impl(engine, key, ts, None); } pub fn must_get_none_on_region( - engine: &E, + engine: &mut E, region_id: u64, key: &[u8], ts: impl Into, @@ -528,7 +533,7 @@ pub mod tests { } fn must_get_none_impl( - engine: &E, + engine: &mut E, key: &[u8], ts: impl Into, region_id: Option, @@ -549,7 +554,7 @@ pub mod tests { assert!(reader.get(key, ts).unwrap().is_none()); } - pub fn must_get_err(engine: &E, key: &[u8], ts: impl Into) { + pub fn must_get_err(engine: &mut E, key: &[u8], ts: impl Into) { let ts = ts.into(); let ctx = SnapContext::default(); let snapshot = engine.snapshot(ctx).unwrap(); @@ -561,7 +566,11 @@ pub mod tests { reader.get(key, ts).unwrap_err(); } - pub fn must_locked(engine: &E, key: &[u8], start_ts: impl Into) -> Lock { + pub fn must_locked( + engine: &mut E, + key: &[u8], + start_ts: impl Into, + ) -> Lock { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); let lock = reader.load_lock(&Key::from_raw(key)).unwrap().unwrap(); @@ -571,7 +580,7 @@ pub mod tests { } pub fn must_locked_with_ttl( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl 
Into, ttl: u64, @@ -585,7 +594,7 @@ pub mod tests { } pub fn must_large_txn_locked( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, ttl: u64, @@ -605,14 +614,14 @@ pub mod tests { } } - pub fn must_unlocked(engine: &E, key: &[u8]) { + pub fn must_unlocked(engine: &mut E, key: &[u8]) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); assert!(reader.load_lock(&Key::from_raw(key)).unwrap().is_none()); } pub fn must_written( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -628,7 +637,7 @@ pub mod tests { } pub fn must_have_write( - engine: &E, + engine: &mut E, key: &[u8], commit_ts: impl Into, ) -> Write { @@ -639,14 +648,18 @@ pub mod tests { write.to_owned() } - pub fn must_not_have_write(engine: &E, key: &[u8], commit_ts: impl Into) { + pub fn must_not_have_write( + engine: &mut E, + key: &[u8], + commit_ts: impl Into, + ) { let snapshot = engine.snapshot(Default::default()).unwrap(); let k = Key::from_raw(key).append_ts(commit_ts.into()); let v = snapshot.get_cf(CF_WRITE, &k).unwrap(); assert!(v.is_none()); } - pub fn must_seek_write_none(engine: &E, key: &[u8], ts: impl Into) { + pub fn must_seek_write_none(engine: &mut E, key: &[u8], ts: impl Into) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); assert!( @@ -658,7 +671,7 @@ pub mod tests { } pub fn must_seek_write( - engine: &E, + engine: &mut E, key: &[u8], ts: impl Into, start_ts: impl Into, @@ -677,7 +690,7 @@ pub mod tests { } pub fn must_get_commit_ts( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -694,7 +707,7 @@ pub mod tests { } pub fn must_get_commit_ts_none( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, ) { @@ -710,7 +723,11 @@ pub mod tests { } } - pub fn must_get_rollback_ts(engine: &E, key: &[u8], start_ts: impl Into) { + pub fn 
must_get_rollback_ts( + engine: &mut E, + key: &[u8], + start_ts: impl Into, + ) { let start_ts = start_ts.into(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = SnapshotReader::new(start_ts, snapshot, true); @@ -725,7 +742,7 @@ pub mod tests { } pub fn must_get_rollback_ts_none( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, ) { @@ -740,7 +757,7 @@ pub mod tests { } pub fn must_get_rollback_protected( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, protected: bool, @@ -759,7 +776,7 @@ pub mod tests { } pub fn must_get_overlapped_rollback>( - engine: &E, + engine: &mut E, key: &[u8], start_ts: T, overlapped_start_ts: T, @@ -783,7 +800,7 @@ pub mod tests { } pub fn must_scan_keys( - engine: &E, + engine: &mut E, start: Option<&[u8]>, limit: usize, keys: Vec<&[u8]>, diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 2758460a526..1e26d9bf21b 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -403,12 +403,12 @@ mod tests { }, }; - fn new_point_getter(engine: &E, ts: TimeStamp) -> PointGetter { + fn new_point_getter(engine: &mut E, ts: TimeStamp) -> PointGetter { new_point_getter_with_iso(engine, ts, IsolationLevel::Si) } fn new_point_getter_with_iso( - engine: &E, + engine: &mut E, ts: TimeStamp, iso_level: IsolationLevel, ) -> PointGetter { @@ -429,7 +429,7 @@ mod tests { } fn must_met_newer_ts_data( - engine: &E, + engine: &mut E, getter_ts: impl Into, key: &[u8], value: &[u8], @@ -502,59 +502,59 @@ mod tests { /// PUT zz -> zvzv.... 
(commit at 103) fn new_sample_engine() -> RocksEngine { let suffix = "v".repeat(SHORT_VALUE_MAX_LEN + 1); - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); must_prewrite_put( - &engine, + &mut engine, b"foo1", &format!("foo1{}", suffix).into_bytes(), b"foo1", 2, ); - must_commit(&engine, b"foo1", 2, 3); + must_commit(&mut engine, b"foo1", 2, 3); must_prewrite_put( - &engine, + &mut engine, b"foo2", &format!("foo2{}", suffix).into_bytes(), b"foo2", 4, ); must_prewrite_put( - &engine, + &mut engine, b"bar", &format!("bar{}", suffix).into_bytes(), b"foo2", 4, ); - must_commit(&engine, b"foo2", 4, 5); - must_commit(&engine, b"bar", 4, 5); - must_prewrite_delete(&engine, b"xxx", b"xxx", 6); - must_commit(&engine, b"xxx", 6, 7); + must_commit(&mut engine, b"foo2", 4, 5); + must_commit(&mut engine, b"bar", 4, 5); + must_prewrite_delete(&mut engine, b"xxx", b"xxx", 6); + must_commit(&mut engine, b"xxx", 6, 7); must_prewrite_put( - &engine, + &mut engine, b"box", &format!("box{}", suffix).into_bytes(), b"box", 8, ); - must_prewrite_delete(&engine, b"foo1", b"box", 8); - must_commit(&engine, b"box", 8, 9); - must_commit(&engine, b"foo1", 8, 9); - must_prewrite_lock(&engine, b"bar", b"bar", 10); - must_commit(&engine, b"bar", 10, 11); + must_prewrite_delete(&mut engine, b"foo1", b"box", 8); + must_commit(&mut engine, b"box", 8, 9); + must_commit(&mut engine, b"foo1", 8, 9); + must_prewrite_lock(&mut engine, b"bar", b"bar", 10); + must_commit(&mut engine, b"bar", 10, 11); for i in 20..100 { if i % 2 == 0 { - must_prewrite_lock(&engine, b"foo2", b"foo2", i); - must_commit(&engine, b"foo2", i, i + 1); + must_prewrite_lock(&mut engine, b"foo2", b"foo2", i); + must_commit(&mut engine, b"foo2", i, i + 1); } } must_prewrite_put( - &engine, + &mut engine, b"zz", &format!("zz{}", suffix).into_bytes(), b"zz", 102, ); - must_commit(&engine, b"zz", 102, 103); + must_commit(&mut engine, b"zz", 102, 103); engine } @@ 
-566,35 +566,35 @@ mod tests { /// PUT foo2 -> foo2vv... (start at 4) fn new_sample_engine_2() -> RocksEngine { let suffix = "v".repeat(SHORT_VALUE_MAX_LEN + 1); - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); must_prewrite_put( - &engine, + &mut engine, b"foo1", &format!("foo1{}", suffix).into_bytes(), b"foo1", 2, ); - must_prewrite_put(&engine, b"bar", b"barval", b"foo1", 2); - must_commit(&engine, b"foo1", 2, 3); - must_commit(&engine, b"bar", 2, 3); + must_prewrite_put(&mut engine, b"bar", b"barval", b"foo1", 2); + must_commit(&mut engine, b"foo1", 2, 3); + must_commit(&mut engine, b"bar", 2, 3); must_prewrite_put( - &engine, + &mut engine, b"foo2", &format!("foo2{}", suffix).into_bytes(), b"foo2", 4, ); - must_prewrite_delete(&engine, b"bar", b"foo2", 4); + must_prewrite_delete(&mut engine, b"bar", b"foo2", 4); engine } /// No ts larger than get ts #[test] fn test_basic_1() { - let engine = new_sample_engine(); + let mut engine = new_sample_engine(); - let mut getter = new_point_getter(&engine, 200.into()); + let mut getter = new_point_getter(&mut engine, 200.into()); // Get a deleted key must_get_none(&mut getter, b"foo1"); @@ -661,42 +661,42 @@ mod tests { #[test] fn test_use_prefix_seek() { - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, b"foo1", b"bar1", b"foo1", 10); - must_commit(&engine, b"foo1", 10, 20); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, b"foo1", b"bar1", b"foo1", 10); + must_commit(&mut engine, b"foo1", 10, 20); // Mustn't get the next user key even if point getter doesn't compare user key. 
- let mut getter = new_point_getter(&engine, 30.into()); + let mut getter = new_point_getter(&mut engine, 30.into()); must_get_none(&mut getter, b"foo0"); - let mut getter = new_point_getter(&engine, 30.into()); + let mut getter = new_point_getter(&mut engine, 30.into()); must_get_none(&mut getter, b"foo"); must_get_none(&mut getter, b"foo0"); } #[test] fn test_tombstone() { - let engine = TestEngineBuilder::new().build().unwrap(); - - must_prewrite_put(&engine, b"foo", b"bar", b"foo", 10); - must_prewrite_put(&engine, b"foo1", b"bar1", b"foo", 10); - must_prewrite_put(&engine, b"foo2", b"bar2", b"foo", 10); - must_prewrite_put(&engine, b"foo3", b"bar3", b"foo", 10); - must_commit(&engine, b"foo", 10, 20); - must_commit(&engine, b"foo1", 10, 20); - must_commit(&engine, b"foo2", 10, 20); - must_commit(&engine, b"foo3", 10, 20); - must_prewrite_delete(&engine, b"foo1", b"foo1", 30); - must_prewrite_delete(&engine, b"foo2", b"foo1", 30); - must_commit(&engine, b"foo1", 30, 40); - must_commit(&engine, b"foo2", 30, 40); - - must_gc(&engine, b"foo", 50); - must_gc(&engine, b"foo1", 50); - must_gc(&engine, b"foo2", 50); - must_gc(&engine, b"foo3", 50); - - let mut getter = new_point_getter(&engine, TimeStamp::max()); + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"foo", b"bar", b"foo", 10); + must_prewrite_put(&mut engine, b"foo1", b"bar1", b"foo", 10); + must_prewrite_put(&mut engine, b"foo2", b"bar2", b"foo", 10); + must_prewrite_put(&mut engine, b"foo3", b"bar3", b"foo", 10); + must_commit(&mut engine, b"foo", 10, 20); + must_commit(&mut engine, b"foo1", 10, 20); + must_commit(&mut engine, b"foo2", 10, 20); + must_commit(&mut engine, b"foo3", 10, 20); + must_prewrite_delete(&mut engine, b"foo1", b"foo1", 30); + must_prewrite_delete(&mut engine, b"foo2", b"foo1", 30); + must_commit(&mut engine, b"foo1", 30, 40); + must_commit(&mut engine, b"foo2", 30, 40); + + must_gc(&mut engine, b"foo", 50); + must_gc(&mut engine, 
b"foo1", 50); + must_gc(&mut engine, b"foo2", 50); + must_gc(&mut engine, b"foo3", 50); + + let mut getter = new_point_getter(&mut engine, TimeStamp::max()); let perf_statistics = ReadPerfInstant::new(); must_get_value(&mut getter, b"foo", b"bar"); assert_eq!(perf_statistics.delta().internal_delete_skipped_count, 0); @@ -716,9 +716,9 @@ mod tests { #[test] fn test_with_iter_lower_bound() { - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, b"foo", b"bar", b"foo", 10); - must_commit(&engine, b"foo", 10, 20); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, b"foo", b"bar", b"foo", 10); + must_commit(&mut engine, b"foo", 10, 20); let snapshot = engine.snapshot(Default::default()).unwrap(); let write_cursor = CursorBuilder::new(&snapshot, CF_WRITE) @@ -747,9 +747,9 @@ mod tests { /// Some ts larger than get ts #[test] fn test_basic_2() { - let engine = new_sample_engine(); + let mut engine = new_sample_engine(); - let mut getter = new_point_getter(&engine, 5.into()); + let mut getter = new_point_getter(&mut engine, 5.into()); must_get_value(&mut getter, b"bar", b"barv"); let s = getter.take_statistics(); @@ -814,9 +814,9 @@ mod tests { /// All ts larger than get ts #[test] fn test_basic_3() { - let engine = new_sample_engine(); + let mut engine = new_sample_engine(); - let mut getter = new_point_getter(&engine, 2.into()); + let mut getter = new_point_getter(&mut engine, 2.into()); must_get_none(&mut getter, b"foo1"); let s = getter.take_statistics(); @@ -838,9 +838,9 @@ mod tests { /// There are some locks in the Lock CF. 
#[test] fn test_locked() { - let engine = new_sample_engine_2(); + let mut engine = new_sample_engine_2(); - let mut getter = new_point_getter(&engine, 1.into()); + let mut getter = new_point_getter(&mut engine, 1.into()); must_get_none(&mut getter, b"a"); must_get_none(&mut getter, b"bar"); must_get_none(&mut getter, b"foo1"); @@ -849,7 +849,7 @@ mod tests { assert_seek_next_prev(&s.write, 4, 0, 0); assert_eq!(s.processed_size, 0); - let mut getter = new_point_getter(&engine, 3.into()); + let mut getter = new_point_getter(&mut engine, 3.into()); must_get_none(&mut getter, b"a"); must_get_value(&mut getter, b"bar", b"barv"); must_get_value(&mut getter, b"bar", b"barv"); @@ -868,7 +868,7 @@ mod tests { * 2 ); - let mut getter = new_point_getter(&engine, 4.into()); + let mut getter = new_point_getter(&mut engine, 4.into()); must_get_none(&mut getter, b"a"); must_get_err(&mut getter, b"bar"); must_get_err(&mut getter, b"bar"); @@ -887,7 +887,7 @@ mod tests { #[test] fn test_omit_value() { - let engine = new_sample_engine_2(); + let mut engine = new_sample_engine_2(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -904,46 +904,46 @@ mod tests { #[test] fn test_get_latest_value() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, val) = (b"foo", b"bar"); - must_prewrite_put(&engine, key, val, key, 10); - must_commit(&engine, key, 10, 20); + must_prewrite_put(&mut engine, key, val, key, 10); + must_commit(&mut engine, key, 10, 20); - let mut getter = new_point_getter(&engine, TimeStamp::max()); + let mut getter = new_point_getter(&mut engine, TimeStamp::max()); must_get_value(&mut getter, key, val); // Ignore the primary lock if read with max ts. 
- must_prewrite_delete(&engine, key, key, 30); - let mut getter = new_point_getter(&engine, TimeStamp::max()); + must_prewrite_delete(&mut engine, key, key, 30); + let mut getter = new_point_getter(&mut engine, TimeStamp::max()); must_get_value(&mut getter, key, val); - must_rollback(&engine, key, 30, false); + must_rollback(&mut engine, key, 30, false); // Should not ignore the secondary lock even though reading the latest version - must_prewrite_delete(&engine, key, b"bar", 40); - let mut getter = new_point_getter(&engine, TimeStamp::max()); + must_prewrite_delete(&mut engine, key, b"bar", 40); + let mut getter = new_point_getter(&mut engine, TimeStamp::max()); must_get_err(&mut getter, key); - must_rollback(&engine, key, 40, false); + must_rollback(&mut engine, key, 40, false); // Should get the latest committed value if there is a primary lock with a ts // less than the latest Write's commit_ts. // // write.start_ts(10) < primary_lock.start_ts(15) < write.commit_ts(20) - must_acquire_pessimistic_lock(&engine, key, key, 15, 50); - must_pessimistic_prewrite_delete(&engine, key, key, 15, 50, DoPessimisticCheck); - let mut getter = new_point_getter(&engine, TimeStamp::max()); + must_acquire_pessimistic_lock(&mut engine, key, key, 15, 50); + must_pessimistic_prewrite_delete(&mut engine, key, key, 15, 50, DoPessimisticCheck); + let mut getter = new_point_getter(&mut engine, TimeStamp::max()); must_get_value(&mut getter, key, val); } #[test] fn test_get_bypass_locks() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, val) = (b"foo", b"bar"); - must_prewrite_put(&engine, key, val, key, 10); - must_commit(&engine, key, 10, 20); + must_prewrite_put(&mut engine, key, val, key, 10); + must_commit(&mut engine, key, 10, 20); - must_prewrite_delete(&engine, key, key, 30); + must_prewrite_delete(&mut engine, key, key, 30); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut 
getter = PointGetterBuilder::new(snapshot, 60.into()) @@ -964,9 +964,10 @@ mod tests { #[test] fn test_get_access_locks() { - let engine = TestEngineBuilder::new().build().unwrap(); - let build_getter = |ts: u64, bypass_locks, access_locks| { - let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut engine_clone = engine.clone(); + let mut build_getter = |ts: u64, bypass_locks, access_locks| { + let snapshot = engine_clone.snapshot(Default::default()).unwrap(); PointGetterBuilder::new(snapshot, ts.into()) .isolation_level(IsolationLevel::Si) .bypass_locks(TsSet::from_u64s(bypass_locks)) @@ -977,42 +978,42 @@ mod tests { // short value let (key, val) = (b"foo", b"bar"); - must_prewrite_put(&engine, key, val, key, 10); + must_prewrite_put(&mut engine, key, val, key, 10); must_get_value(&mut build_getter(20, vec![], vec![10]), key, val); - must_commit(&engine, key, 10, 15); + must_commit(&mut engine, key, 10, 15); must_get_value(&mut build_getter(20, vec![], vec![]), key, val); // load value from default cf. 
let val = "v".repeat(SHORT_VALUE_MAX_LEN + 1); let val = val.as_bytes(); - must_prewrite_put(&engine, key, val, key, 20); + must_prewrite_put(&mut engine, key, val, key, 20); must_get_value(&mut build_getter(30, vec![], vec![20]), key, val); - must_commit(&engine, key, 20, 25); + must_commit(&mut engine, key, 20, 25); must_get_value(&mut build_getter(30, vec![], vec![]), key, val); // delete - must_prewrite_delete(&engine, key, key, 30); + must_prewrite_delete(&mut engine, key, key, 30); must_get_none(&mut build_getter(40, vec![], vec![30]), key); - must_commit(&engine, key, 30, 35); + must_commit(&mut engine, key, 30, 35); must_get_none(&mut build_getter(40, vec![], vec![]), key); // ignore locks not blocking read let (key, val) = (b"foo", b"bar"); // lock's ts > read's ts - must_prewrite_put(&engine, key, val, key, 50); + must_prewrite_put(&mut engine, key, val, key, 50); must_get_none(&mut build_getter(45, vec![], vec![50]), key); - must_commit(&engine, key, 50, 55); + must_commit(&mut engine, key, 50, 55); // LockType::Lock - must_prewrite_lock(&engine, key, key, 60); + must_prewrite_lock(&mut engine, key, key, 60); must_get_value(&mut build_getter(65, vec![], vec![60]), key, val); - must_commit(&engine, key, 60, 65); + must_commit(&mut engine, key, 60, 65); // LockType::Pessimistic - must_acquire_pessimistic_lock(&engine, key, key, 70, 70); + must_acquire_pessimistic_lock(&mut engine, key, key, 70, 70); must_get_value(&mut build_getter(75, vec![], vec![70]), key, val); - must_rollback(&engine, key, 70, false); + must_rollback(&mut engine, key, 70, false); // lock's min_commit_ts > read's ts must_prewrite_put_impl( - &engine, + &mut engine, key, &val[..1], key, @@ -1029,117 +1030,117 @@ mod tests { AssertionLevel::Off, ); must_get_value(&mut build_getter(85, vec![], vec![80]), key, val); - must_rollback(&engine, key, 80, false); + must_rollback(&mut engine, key, 80, false); // read'ts == max && lock is a primary lock. 
- must_prewrite_put(&engine, key, &val[..1], key, 90); + must_prewrite_put(&mut engine, key, &val[..1], key, 90); must_get_value( &mut build_getter(TimeStamp::max().into_inner(), vec![], vec![90]), key, val, ); - must_rollback(&engine, key, 90, false); + must_rollback(&mut engine, key, 90, false); // lock in resolve_keys(it can't happen). - must_prewrite_put(&engine, key, &val[..1], key, 100); + must_prewrite_put(&mut engine, key, &val[..1], key, 100); must_get_value(&mut build_getter(105, vec![100], vec![100]), key, val); - must_rollback(&engine, key, 100, false); + must_rollback(&mut engine, key, 100, false); } #[test] fn test_met_newer_ts_data() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, val1) = (b"foo", b"bar1"); - must_prewrite_put(&engine, key, val1, key, 10); - must_commit(&engine, key, 10, 20); + must_prewrite_put(&mut engine, key, val1, key, 10); + must_commit(&mut engine, key, 10, 20); let (key, val2) = (b"foo", b"bar2"); - must_prewrite_put(&engine, key, val2, key, 30); - must_commit(&engine, key, 30, 40); + must_prewrite_put(&mut engine, key, val2, key, 30); + must_commit(&mut engine, key, 30, 40); - must_met_newer_ts_data(&engine, 20, key, val1, true); - must_met_newer_ts_data(&engine, 30, key, val1, true); - must_met_newer_ts_data(&engine, 40, key, val2, false); - must_met_newer_ts_data(&engine, 50, key, val2, false); + must_met_newer_ts_data(&mut engine, 20, key, val1, true); + must_met_newer_ts_data(&mut engine, 30, key, val1, true); + must_met_newer_ts_data(&mut engine, 40, key, val2, false); + must_met_newer_ts_data(&mut engine, 50, key, val2, false); - must_prewrite_lock(&engine, key, key, 60); + must_prewrite_lock(&mut engine, key, key, 60); - must_met_newer_ts_data(&engine, 50, key, val2, true); - must_met_newer_ts_data(&engine, 60, key, val2, true); + must_met_newer_ts_data(&mut engine, 50, key, val2, true); + must_met_newer_ts_data(&mut engine, 60, key, 
val2, true); } #[test] fn test_point_get_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // PUT, Read // `--------------^ - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 10); - must_commit(&engine, b"k1", 10, 20); - must_cleanup_with_gc_fence(&engine, b"k1", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 20); + must_cleanup_with_gc_fence(&mut engine, b"k1", 20, 0, 50, true); // PUT, Read // `---------^ - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 11); - must_commit(&engine, b"k2", 11, 20); - must_cleanup_with_gc_fence(&engine, b"k2", 20, 0, 40, true); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 11); + must_commit(&mut engine, b"k2", 11, 20); + must_cleanup_with_gc_fence(&mut engine, b"k2", 20, 0, 40, true); // PUT, Read // `-----^ - must_prewrite_put(&engine, b"k3", b"v3", b"k3", 12); - must_commit(&engine, b"k3", 12, 20); - must_cleanup_with_gc_fence(&engine, b"k3", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 12); + must_commit(&mut engine, b"k3", 12, 20); + must_cleanup_with_gc_fence(&mut engine, b"k3", 20, 0, 30, true); // PUT, PUT, Read // `-----^ `----^ - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 13); - must_commit(&engine, b"k4", 13, 14); - must_prewrite_put(&engine, b"k4", b"v4x", b"k4", 15); - must_commit(&engine, b"k4", 15, 20); - must_cleanup_with_gc_fence(&engine, b"k4", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k4", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 13); + must_commit(&mut engine, b"k4", 13, 14); + must_prewrite_put(&mut engine, b"k4", b"v4x", b"k4", 15); + must_commit(&mut engine, b"k4", 15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k4", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k4", 20, 0, 30, true); // PUT, DEL, Read // `-----^ `----^ - must_prewrite_put(&engine, 
b"k5", b"v5", b"k5", 13); - must_commit(&engine, b"k5", 13, 14); - must_prewrite_delete(&engine, b"k5", b"v5", 15); - must_commit(&engine, b"k5", 15, 20); - must_cleanup_with_gc_fence(&engine, b"k5", 14, 0, 20, false); - must_cleanup_with_gc_fence(&engine, b"k5", 20, 0, 30, true); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 13); + must_commit(&mut engine, b"k5", 13, 14); + must_prewrite_delete(&mut engine, b"k5", b"v5", 15); + must_commit(&mut engine, b"k5", 15, 20); + must_cleanup_with_gc_fence(&mut engine, b"k5", 14, 0, 20, false); + must_cleanup_with_gc_fence(&mut engine, b"k5", 20, 0, 30, true); // PUT, LOCK, LOCK, Read // `------------------------^ - must_prewrite_put(&engine, b"k6", b"v6", b"k6", 16); - must_commit(&engine, b"k6", 16, 20); - must_prewrite_lock(&engine, b"k6", b"k6", 25); - must_commit(&engine, b"k6", 25, 26); - must_prewrite_lock(&engine, b"k6", b"k6", 28); - must_commit(&engine, b"k6", 28, 29); - must_cleanup_with_gc_fence(&engine, b"k6", 20, 0, 50, true); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k6", 16); + must_commit(&mut engine, b"k6", 16, 20); + must_prewrite_lock(&mut engine, b"k6", b"k6", 25); + must_commit(&mut engine, b"k6", 25, 26); + must_prewrite_lock(&mut engine, b"k6", b"k6", 28); + must_commit(&mut engine, b"k6", 28, 29); + must_cleanup_with_gc_fence(&mut engine, b"k6", 20, 0, 50, true); // PUT, LOCK, LOCK, Read // `---------^ - must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); - must_commit(&engine, b"k7", 16, 20); - must_prewrite_lock(&engine, b"k7", b"k7", 25); - must_commit(&engine, b"k7", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k7", 20, 0, 27, true); - must_prewrite_lock(&engine, b"k7", b"k7", 28); - must_commit(&engine, b"k7", 28, 29); + must_prewrite_put(&mut engine, b"k7", b"v7", b"k7", 16); + must_commit(&mut engine, b"k7", 16, 20); + must_prewrite_lock(&mut engine, b"k7", b"k7", 25); + must_commit(&mut engine, b"k7", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k7", 20, 0, 27, 
true); + must_prewrite_lock(&mut engine, b"k7", b"k7", 28); + must_commit(&mut engine, b"k7", 28, 29); // PUT, Read // * (GC fence ts is 0) - must_prewrite_put(&engine, b"k8", b"v8", b"k8", 17); - must_commit(&engine, b"k8", 17, 30); - must_cleanup_with_gc_fence(&engine, b"k8", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k8", b"v8", b"k8", 17); + must_commit(&mut engine, b"k8", 17, 30); + must_cleanup_with_gc_fence(&mut engine, b"k8", 30, 0, 0, true); // PUT, LOCK, Read // `-----------^ - must_prewrite_put(&engine, b"k9", b"v9", b"k9", 18); - must_commit(&engine, b"k9", 18, 20); - must_prewrite_lock(&engine, b"k9", b"k9", 25); - must_commit(&engine, b"k9", 25, 26); - must_cleanup_with_gc_fence(&engine, b"k9", 20, 0, 27, true); + must_prewrite_put(&mut engine, b"k9", b"v9", b"k9", 18); + must_commit(&mut engine, b"k9", 18, 20); + must_prewrite_lock(&mut engine, b"k9", b"k9", 25); + must_commit(&mut engine, b"k9", 25, 26); + must_cleanup_with_gc_fence(&mut engine, b"k9", 20, 0, 27, true); let expected_results = vec![ (b"k1", Some(b"v1")), @@ -1154,12 +1155,12 @@ mod tests { ]; for (k, v) in expected_results.iter().copied() { - let mut single_getter = new_point_getter(&engine, 40.into()); + let mut single_getter = new_point_getter(&mut engine, 40.into()); let value = single_getter.get(&Key::from_raw(k)).unwrap(); assert_eq!(value, v.map(|v| v.to_vec())); } - let mut getter = new_point_getter(&engine, 40.into()); + let mut getter = new_point_getter(&mut engine, 40.into()); for (k, v) in expected_results { let value = getter.get(&Key::from_raw(k)).unwrap(); assert_eq!(value, v.map(|v| v.to_vec())); @@ -1168,68 +1169,68 @@ mod tests { #[test] fn test_point_get_check_rc_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key0, val0) = (b"k0", b"v0"); - must_prewrite_put(&engine, key0, val0, key0, 1); - must_commit(&engine, key0, 1, 5); + must_prewrite_put(&mut engine, key0, val0, key0, 
1); + must_commit(&mut engine, key0, 1, 5); let (key1, val1) = (b"k1", b"v1"); - must_prewrite_put(&engine, key1, val1, key1, 10); - must_commit(&engine, key1, 10, 20); + must_prewrite_put(&mut engine, key1, val1, key1, 10); + must_commit(&mut engine, key1, 10, 20); let (key2, val2, val22) = (b"k2", b"v2", b"v22"); - must_prewrite_put(&engine, key2, val2, key2, 30); - must_commit(&engine, key2, 30, 40); - must_prewrite_put(&engine, key2, val22, key2, 41); - must_commit(&engine, key2, 41, 42); + must_prewrite_put(&mut engine, key2, val2, key2, 30); + must_commit(&mut engine, key2, 30, 40); + must_prewrite_put(&mut engine, key2, val22, key2, 41); + must_commit(&mut engine, key2, 41, 42); let (key3, val3) = (b"k3", b"v3"); - must_prewrite_put(&engine, key3, val3, key3, 50); + must_prewrite_put(&mut engine, key3, val3, key3, 50); let (key4, val4) = (b"k4", b"val4"); - must_prewrite_put(&engine, key4, val4, key4, 55); - must_commit(&engine, key4, 55, 56); - must_prewrite_lock(&engine, key4, key4, 60); + must_prewrite_put(&mut engine, key4, val4, key4, 55); + must_commit(&mut engine, key4, 55, 56); + must_prewrite_lock(&mut engine, key4, key4, 60); let (key5, val5) = (b"k5", b"val5"); - must_prewrite_put(&engine, key5, val5, key5, 57); - must_commit(&engine, key5, 57, 58); - must_acquire_pessimistic_lock(&engine, key5, key5, 65, 65); + must_prewrite_put(&mut engine, key5, val5, key5, 57); + must_commit(&mut engine, key5, 57, 58); + must_acquire_pessimistic_lock(&mut engine, key5, key5, 65, 65); // No more recent version. let mut getter_with_ts_ok = - new_point_getter_with_iso(&engine, 25.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 25.into(), IsolationLevel::RcCheckTs); must_get_value(&mut getter_with_ts_ok, key1, val1); // The read_ts is stale error should be reported. 
let mut getter_not_ok = - new_point_getter_with_iso(&engine, 35.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 35.into(), IsolationLevel::RcCheckTs); must_get_err(&mut getter_not_ok, key2); // Though lock.ts > read_ts error should still be reported. let mut getter_not_ok = - new_point_getter_with_iso(&engine, 45.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 45.into(), IsolationLevel::RcCheckTs); must_get_err(&mut getter_not_ok, key3); // Error should not be reported if the lock type is rollback or lock. let mut getter_ok = - new_point_getter_with_iso(&engine, 70.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 70.into(), IsolationLevel::RcCheckTs); must_get_value(&mut getter_ok, key4, val4); let mut getter_ok = - new_point_getter_with_iso(&engine, 70.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 70.into(), IsolationLevel::RcCheckTs); must_get_value(&mut getter_ok, key5, val5); // Test batch point get. Report error if more recent version is met. let mut batch_getter = - new_point_getter_with_iso(&engine, 35.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 35.into(), IsolationLevel::RcCheckTs); must_get_value(&mut batch_getter, key0, val0); must_get_value(&mut batch_getter, key1, val1); must_get_err(&mut batch_getter, key2); // Test batch point get. Report error if lock is met though lock.ts > read_ts. let mut batch_getter = - new_point_getter_with_iso(&engine, 45.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 45.into(), IsolationLevel::RcCheckTs); must_get_value(&mut batch_getter, key0, val0); must_get_value(&mut batch_getter, key1, val1); must_get_value(&mut batch_getter, key2, val22); @@ -1238,7 +1239,7 @@ mod tests { // Test batch point get. Error should not be reported if the lock type is // rollback or lock. 
let mut batch_getter_ok = - new_point_getter_with_iso(&engine, 70.into(), IsolationLevel::RcCheckTs); + new_point_getter_with_iso(&mut engine, 70.into(), IsolationLevel::RcCheckTs); must_get_value(&mut batch_getter_ok, key4, val4); must_get_value(&mut batch_getter_ok, key5, val5); } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 1aff262186c..0f6eb5a390e 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -2428,7 +2428,7 @@ pub mod tests { }, ]; for (i, case) in cases.into_iter().enumerate() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); let mut txn = MvccTxn::new(TimeStamp::new(10), cm.clone()); for (write_record, put_ts) in case.written.iter() { @@ -2461,7 +2461,7 @@ pub mod tests { // Must return Oldvalue::None when prev_write_loaded is true and prev_write is // None. - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); let prev_write_loaded = true; diff --git a/src/storage/mvcc/reader/scanner/backward.rs b/src/storage/mvcc/reader/scanner/backward.rs index 11ed487cd56..ee1780b76b4 100644 --- a/src/storage/mvcc/reader/scanner/backward.rs +++ b/src/storage/mvcc/reader/scanner/backward.rs @@ -506,30 +506,30 @@ mod tests { #[test] fn test_basic() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate REVERSE_SEEK_BOUND / 2 Put for key [10]. 
let k = &[10_u8]; for ts in 0..REVERSE_SEEK_BOUND / 2 { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); + must_commit(&mut engine, k, ts, ts); } // Generate REVERSE_SEEK_BOUND + 1 Put for key [9]. let k = &[9_u8]; for ts in 0..=REVERSE_SEEK_BOUND { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); + must_commit(&mut engine, k, ts, ts); } // Generate REVERSE_SEEK_BOUND / 2 Put and REVERSE_SEEK_BOUND / 2 + 1 Rollback // for key [8]. let k = &[8_u8]; for ts in 0..=REVERSE_SEEK_BOUND { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); if ts < REVERSE_SEEK_BOUND / 2 { - must_commit(&engine, k, ts, ts); + must_commit(&mut engine, k, ts, ts); } else { let modifies = vec![ // ts is rather small, so it is ok to `as u8` @@ -548,16 +548,16 @@ mod tests { // Rollback for key [7]. let k = &[7_u8]; for ts in 0..REVERSE_SEEK_BOUND / 2 { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); + must_commit(&mut engine, k, ts, ts); } { let ts = REVERSE_SEEK_BOUND / 2; - must_prewrite_delete(&engine, k, k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_delete(&mut engine, k, k, ts); + must_commit(&mut engine, k, ts, ts); } for ts in REVERSE_SEEK_BOUND / 2 + 1..=REVERSE_SEEK_BOUND { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); let modifies = vec![ // ts is rather small, so it is ok to `as u8` Modify::Put( @@ -573,14 +573,14 @@ mod tests { // Generate 1 PUT for key [6]. 
let k = &[6_u8]; for ts in 0..1 { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); + must_commit(&mut engine, k, ts, ts); } // Generate REVERSE_SEEK_BOUND + 1 Rollback for key [5]. let k = &[5_u8]; for ts in 0..=REVERSE_SEEK_BOUND { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); let modifies = vec![ // ts is rather small, so it is ok to `as u8` Modify::Put( @@ -597,8 +597,8 @@ mod tests { // with ts = REVERSE_SEEK_BOUND + 1 for key [4]. let k = &[4_u8]; for ts in REVERSE_SEEK_BOUND..REVERSE_SEEK_BOUND + 2 { - must_prewrite_put(&engine, k, &[ts as u8], k, ts); - must_commit(&engine, k, ts, ts); + must_prewrite_put(&mut engine, k, &[ts as u8], k, ts); + must_commit(&mut engine, k, ts, ts); } // Assume REVERSE_SEEK_BOUND == 4, we have keys: @@ -806,7 +806,7 @@ mod tests { /// Case 1. prev out of bound, next_version is None. #[test] fn test_reverse_get_out_of_bound_1() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate N/2 rollback for [b]. for ts in 0..REVERSE_SEEK_BOUND / 2 { @@ -823,9 +823,9 @@ mod tests { } // Generate 1 put for [c]. - must_prewrite_put(&engine, b"c", b"value", b"c", REVERSE_SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"c", b"value", b"c", REVERSE_SEEK_BOUND * 2); must_commit( - &engine, + &mut engine, b"c", REVERSE_SEEK_BOUND * 2, REVERSE_SEEK_BOUND * 2, @@ -890,11 +890,11 @@ mod tests { /// Case 2. prev out of bound, next_version is Some. #[test] fn test_reverse_get_out_of_bound_2() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put and N/2 rollback for [b]. 
- must_prewrite_put(&engine, b"b", b"value_b", b"b", 0); - must_commit(&engine, b"b", 0, 0); + must_prewrite_put(&mut engine, b"b", b"value_b", b"b", 0); + must_commit(&mut engine, b"b", 0, 0); for ts in 1..=REVERSE_SEEK_BOUND / 2 { let modifies = vec![ // ts is rather small, so it is ok to `as u8` @@ -909,9 +909,9 @@ mod tests { } // Generate 1 put for [c]. - must_prewrite_put(&engine, b"c", b"value_c", b"c", REVERSE_SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"c", b"value_c", b"c", REVERSE_SEEK_BOUND * 2); must_commit( - &engine, + &mut engine, b"c", REVERSE_SEEK_BOUND * 2, REVERSE_SEEK_BOUND * 2, @@ -983,16 +983,16 @@ mod tests { /// Case 1. prev() out of bound #[test] fn test_move_prev_user_key_out_of_bound_1() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [c]. - must_prewrite_put(&engine, b"c", b"value", b"c", 1); - must_commit(&engine, b"c", 1, 1); + must_prewrite_put(&mut engine, b"c", b"value", b"c", 1); + must_commit(&mut engine, b"c", 1, 1); // Generate N/2 put for [b] . for ts in 1..=SEEK_BOUND / 2 { - must_prewrite_put(&engine, b"b", &[ts as u8], b"b", ts); - must_commit(&engine, b"b", ts, ts); + must_prewrite_put(&mut engine, b"b", &[ts as u8], b"b", ts); + must_commit(&mut engine, b"b", ts, ts); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1065,16 +1065,16 @@ mod tests { /// Case 2. seek_for_prev() out of bound #[test] fn test_move_prev_user_key_out_of_bound_2() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [c]. - must_prewrite_put(&engine, b"c", b"value", b"c", 1); - must_commit(&engine, b"c", 1, 1); + must_prewrite_put(&mut engine, b"c", b"value", b"c", 1); + must_commit(&mut engine, b"c", 1, 1); // Generate N+1 put for [b] . 
for ts in 1..SEEK_BOUND + 2 { - must_prewrite_put(&engine, b"b", &[ts as u8], b"b", ts); - must_commit(&engine, b"b", ts, ts); + must_prewrite_put(&mut engine, b"b", &[ts as u8], b"b", ts); + must_commit(&mut engine, b"b", ts, ts); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1153,18 +1153,18 @@ mod tests { /// Case 3. a more complicated case #[test] fn test_move_prev_user_key_out_of_bound_3() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // N denotes for SEEK_BOUND, M denotes for REVERSE_SEEK_BOUND // Generate 1 put for [c]. - must_prewrite_put(&engine, b"c", b"value", b"c", 1); - must_commit(&engine, b"c", 1, 1); + must_prewrite_put(&mut engine, b"c", b"value", b"c", 1); + must_commit(&mut engine, b"c", 1, 1); // Generate N+M+1 put for [b] . for ts in 1..SEEK_BOUND + REVERSE_SEEK_BOUND + 2 { - must_prewrite_put(&engine, b"b", &[ts as u8], b"b", ts); - must_commit(&engine, b"b", ts, ts); + must_prewrite_put(&mut engine, b"b", &[ts as u8], b"b", ts); + must_commit(&mut engine, b"b", ts, ts); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1246,21 +1246,21 @@ mod tests { /// Range is left open right closed. #[test] fn test_range() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [1], [2] ... [6]. 
for i in 1..7 { // ts = 1: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 1); - must_commit(&engine, &[i], 1, 1); + must_prewrite_put(&mut engine, &[i], &[], &[i], 1); + must_commit(&mut engine, &[i], 1, 1); // ts = 7: value = [ts] - must_prewrite_put(&engine, &[i], &[i], &[i], 7); - must_commit(&engine, &[i], 7, 7); + must_prewrite_put(&mut engine, &[i], &[i], &[i], 7); + must_commit(&mut engine, &[i], 7, 7); // ts = 14: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 14); - must_commit(&engine, &[i], 14, 14); + must_prewrite_put(&mut engine, &[i], &[], &[i], 14); + must_commit(&mut engine, &[i], 14, 14); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1376,7 +1376,7 @@ mod tests { #[test] fn test_many_tombstones() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate RocksDB tombstones in write cf. let start_ts = 1; @@ -1384,11 +1384,11 @@ mod tests { for i in 0..16 { for y in 0..16 { let pk = &[i as u8, y as u8]; - must_prewrite_put(&engine, pk, b"", pk, start_ts); - must_rollback(&engine, pk, start_ts, false); + must_prewrite_put(&mut engine, pk, b"", pk, start_ts); + must_rollback(&mut engine, pk, start_ts, false); // Generate 254 RocksDB tombstones between [0,0] and [15,15]. 
if !((i == 0 && y == 0) || (i == 15 && y == 15)) { - must_gc(&engine, pk, safe_point); + must_gc(&mut engine, pk, safe_point); } } } @@ -1397,7 +1397,7 @@ mod tests { let start_ts = 3; for i in 0..16 { let pk = &[i as u8]; - must_prewrite_put(&engine, pk, b"", pk, start_ts); + must_prewrite_put(&mut engine, pk, b"", pk, start_ts); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1420,9 +1420,9 @@ mod tests { #[test] fn test_backward_scanner_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&engine); + let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&mut engine); let expected_result: Vec<_> = expected_result .into_iter() .filter_map(|(key, value)| value.map(|v| (key, v))) @@ -1446,34 +1446,34 @@ mod tests { #[test] fn test_rc_read_check_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key0, val0) = (b"k0", b"v0"); - must_prewrite_put(&engine, key0, val0, key0, 60); + must_prewrite_put(&mut engine, key0, val0, key0, 60); let (key1, val1) = (b"k1", b"v1"); - must_prewrite_put(&engine, key1, val1, key1, 25); - must_commit(&engine, key1, 25, 30); + must_prewrite_put(&mut engine, key1, val1, key1, 25); + must_commit(&mut engine, key1, 25, 30); let (key2, val2, val22) = (b"k2", b"v2", b"v22"); - must_prewrite_put(&engine, key2, val2, key2, 6); - must_commit(&engine, key2, 6, 9); - must_prewrite_put(&engine, key2, val22, key2, 10); - must_commit(&engine, key2, 10, 20); + must_prewrite_put(&mut engine, key2, val2, key2, 6); + must_commit(&mut engine, key2, 6, 9); + must_prewrite_put(&mut engine, key2, val22, key2, 10); + must_commit(&mut engine, key2, 10, 20); let (key3, val3) = (b"k3", b"v3"); - must_prewrite_put(&engine, key3, val3, key3, 5); - must_commit(&engine, key3, 5, 6); + 
must_prewrite_put(&mut engine, key3, val3, key3, 5); + must_commit(&mut engine, key3, 5, 6); let (key4, val4) = (b"k4", b"val4"); - must_prewrite_put(&engine, key4, val4, key4, 3); - must_commit(&engine, key4, 3, 4); - must_prewrite_lock(&engine, key4, key4, 5); + must_prewrite_put(&mut engine, key4, val4, key4, 3); + must_commit(&mut engine, key4, 3, 4); + must_prewrite_lock(&mut engine, key4, key4, 5); let (key5, val5) = (b"k5", b"val5"); - must_prewrite_put(&engine, key5, val5, key5, 1); - must_commit(&engine, key5, 1, 2); - must_acquire_pessimistic_lock(&engine, key5, key5, 3, 3); + must_prewrite_put(&mut engine, key5, val5, key5, 1); + must_commit(&mut engine, key5, 1, 2); + must_acquire_pessimistic_lock(&mut engine, key5, key5, 3, 3); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, 29.into()) diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index aee185e307f..c59c20fbe05 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -1005,7 +1005,7 @@ pub mod test_util { #[allow(clippy::type_complexity)] pub fn prepare_test_data_for_check_gc_fence( - engine: &impl Engine, + engine: &mut impl Engine, ) -> (TimeStamp, Vec<(Vec, Option>)>) { // Generates test data that is consistent after timestamp 40. @@ -1120,12 +1120,12 @@ mod latest_kv_tests { /// goes out of bound. #[test] fn test_get_out_of_bound() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"value", b"a", 7); - must_commit(&engine, b"a", 7, 7); + must_prewrite_put(&mut engine, b"a", b"value", b"a", 7); + must_commit(&mut engine, b"a", 7, 7); // Generate 5 rollback for [b]. for ts in 0..5 { @@ -1189,11 +1189,11 @@ mod latest_kv_tests { /// Case 1. 
next() out of bound #[test] fn test_move_next_user_key_out_of_bound_1() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND / 2 rollback and 1 put for [b] . for ts in 0..SEEK_BOUND / 2 { @@ -1208,8 +1208,8 @@ mod latest_kv_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); - must_commit(&engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); + must_commit(&mut engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -1271,12 +1271,12 @@ mod latest_kv_tests { /// Case 2. seek() out of bound #[test] fn test_move_next_user_key_out_of_bound_2() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND-1 rollback and 1 put for [b] . 
for ts in 1..SEEK_BOUND { @@ -1291,8 +1291,8 @@ mod latest_kv_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND); - must_commit(&engine, b"b", SEEK_BOUND, SEEK_BOUND); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND); + must_commit(&mut engine, b"b", SEEK_BOUND, SEEK_BOUND); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -1353,21 +1353,21 @@ mod latest_kv_tests { /// Range is left open right closed. #[test] fn test_range() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [1], [2] ... [6]. for i in 1..7 { // ts = 1: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 1); - must_commit(&engine, &[i], 1, 1); + must_prewrite_put(&mut engine, &[i], &[], &[i], 1); + must_commit(&mut engine, &[i], 1, 1); // ts = 7: value = [ts] - must_prewrite_put(&engine, &[i], &[i], &[i], 7); - must_commit(&engine, &[i], 7, 7); + must_prewrite_put(&mut engine, &[i], &[i], &[i], 7); + must_commit(&mut engine, &[i], 7, 7); // ts = 14: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 14); - must_commit(&engine, &[i], 14, 14); + must_prewrite_put(&mut engine, &[i], &[], &[i], 14); + must_commit(&mut engine, &[i], 14, 14); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1478,9 +1478,9 @@ mod latest_kv_tests { #[test] fn test_latest_kv_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&engine); + let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&mut engine); let expected_result: Vec<_> = expected_result .into_iter() .filter_map(|(key, value)| value.map(|v| (key, v))) @@ -1502,38 +1502,38 @@ mod latest_kv_tests { #[test] fn 
test_rc_read_check_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key0, val0) = (b"k0", b"v0"); - must_prewrite_put(&engine, key0, val0, key0, 1); - must_commit(&engine, key0, 1, 5); + must_prewrite_put(&mut engine, key0, val0, key0, 1); + must_commit(&mut engine, key0, 1, 5); let (key1, val1) = (b"k1", b"v1"); - must_prewrite_put(&engine, key1, val1, key1, 10); - must_commit(&engine, key1, 10, 20); + must_prewrite_put(&mut engine, key1, val1, key1, 10); + must_commit(&mut engine, key1, 10, 20); let (key2, val2, val22) = (b"k2", b"v2", b"v22"); - must_prewrite_put(&engine, key2, val2, key2, 30); - must_commit(&engine, key2, 30, 40); - must_prewrite_put(&engine, key2, val22, key2, 41); - must_commit(&engine, key2, 41, 42); + must_prewrite_put(&mut engine, key2, val2, key2, 30); + must_commit(&mut engine, key2, 30, 40); + must_prewrite_put(&mut engine, key2, val22, key2, 41); + must_commit(&mut engine, key2, 41, 42); let (key3, val3) = (b"k3", b"v3"); - must_prewrite_put(&engine, key3, val3, key3, 50); - must_commit(&engine, key3, 50, 51); + must_prewrite_put(&mut engine, key3, val3, key3, 50); + must_commit(&mut engine, key3, 50, 51); let (key4, val4) = (b"k4", b"val4"); - must_prewrite_put(&engine, key4, val4, key4, 55); - must_commit(&engine, key4, 55, 56); - must_prewrite_lock(&engine, key4, key4, 60); + must_prewrite_put(&mut engine, key4, val4, key4, 55); + must_commit(&mut engine, key4, 55, 56); + must_prewrite_lock(&mut engine, key4, key4, 60); let (key5, val5) = (b"k5", b"val5"); - must_prewrite_put(&engine, key5, val5, key5, 57); - must_commit(&engine, key5, 57, 58); - must_acquire_pessimistic_lock(&engine, key5, key5, 65, 65); + must_prewrite_put(&mut engine, key5, val5, key5, 57); + must_commit(&mut engine, key5, 57, 58); + must_acquire_pessimistic_lock(&mut engine, key5, key5, 65, 65); let (key6, val6) = (b"k6", b"v6"); - must_prewrite_put(&engine, key6, val6, key6, 75); 
+ must_prewrite_put(&mut engine, key6, val6, key6, 75); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, 35.into()) @@ -1607,12 +1607,12 @@ mod latest_entry_tests { /// out of bound. #[test] fn test_get_out_of_bound() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"value", b"a", 7); - must_commit(&engine, b"a", 7, 7); + must_prewrite_put(&mut engine, b"a", b"value", b"a", 7); + must_commit(&mut engine, b"a", 7, 7); // Generate 5 rollback for [b]. for ts in 0..5 { @@ -1678,12 +1678,12 @@ mod latest_entry_tests { /// Case 1. next() out of bound #[test] fn test_move_next_user_key_out_of_bound_1() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND / 2 rollback and 1 put for [b] . for ts in 0..SEEK_BOUND / 2 { @@ -1698,8 +1698,8 @@ mod latest_entry_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); - must_commit(&engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); + must_commit(&mut engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -1762,12 +1762,12 @@ mod latest_entry_tests { /// Case 2. 
seek() out of bound #[test] fn test_move_next_user_key_out_of_bound_2() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND-1 rollback and 1 put for [b] . for ts in 1..SEEK_BOUND { @@ -1782,8 +1782,8 @@ mod latest_entry_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND); - must_commit(&engine, b"b", SEEK_BOUND, SEEK_BOUND); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND); + must_commit(&mut engine, b"b", SEEK_BOUND, SEEK_BOUND); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -1846,21 +1846,21 @@ mod latest_entry_tests { /// Range is left open right closed. #[test] fn test_range() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [1], [2] ... [6]. 
for i in 1..7 { // ts = 1: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 1); - must_commit(&engine, &[i], 1, 1); + must_prewrite_put(&mut engine, &[i], &[], &[i], 1); + must_commit(&mut engine, &[i], 1, 1); // ts = 7: value = [ts] - must_prewrite_put(&engine, &[i], &[i], &[i], 7); - must_commit(&engine, &[i], 7, 7); + must_prewrite_put(&mut engine, &[i], &[i], &[i], 7); + must_commit(&mut engine, &[i], 7, 7); // ts = 14: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 14); - must_commit(&engine, &[i], 14, 14); + must_prewrite_put(&mut engine, &[i], &[], &[i], 14); + must_commit(&mut engine, &[i], 14, 14); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1918,20 +1918,20 @@ mod latest_entry_tests { #[test] fn test_output_delete_and_after_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate put for [a] at 3. - must_prewrite_put(&engine, b"a", b"a_3", b"a", 3); - must_commit(&engine, b"a", 3, 3); + must_prewrite_put(&mut engine, b"a", b"a_3", b"a", 3); + must_commit(&mut engine, b"a", 3, 3); // Generate put for [a] at 7. - must_prewrite_put(&engine, b"a", b"a_7", b"a", 7); - must_commit(&engine, b"a", 7, 7); + must_prewrite_put(&mut engine, b"a", b"a_7", b"a", 7); + must_commit(&mut engine, b"a", 7, 7); // Generate put for [b] at 1. - must_prewrite_put(&engine, b"b", b"b_1", b"b", 1); - must_commit(&engine, b"b", 1, 1); + must_prewrite_put(&mut engine, b"b", b"b_1", b"b", 1); + must_commit(&mut engine, b"b", 1, 1); // Generate rollbacks for [b] at 2, 3, 4. for ts in 2..5 { @@ -1948,8 +1948,8 @@ mod latest_entry_tests { } // Generate delete for [b] at 10. 
- must_prewrite_delete(&engine, b"b", b"b", 10); - must_commit(&engine, b"b", 10, 10); + must_prewrite_delete(&mut engine, b"b", b"b", 10); + must_commit(&mut engine, b"b", 10, 10); let entry_a_3 = EntryBuilder::default() .key(b"a") @@ -1975,7 +1975,7 @@ mod latest_entry_tests { .commit_ts(10.into()) .build_commit(WriteType::Delete, true); - let check = |ts: u64, after_ts: u64, output_delete, expected: Vec<&TxnEntry>| { + let mut check = |ts: u64, after_ts: u64, output_delete, expected: Vec<&TxnEntry>| { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, ts.into()) .range(None, None) @@ -2003,9 +2003,9 @@ mod latest_entry_tests { #[test] fn test_latest_entry_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&engine); + let (read_ts, expected_result) = prepare_test_data_for_check_gc_fence(&mut engine); let expected_result: Vec<_> = expected_result .into_iter() .filter_map(|(key, value)| value.map(|v| (key, v))) @@ -2039,12 +2039,12 @@ mod delta_entry_tests { /// bound. #[test] fn test_get_out_of_bound() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"value", b"a", 7); - must_commit(&engine, b"a", 7, 7); + must_prewrite_put(&mut engine, b"a", b"value", b"a", 7); + must_commit(&mut engine, b"a", 7, 7); // Generate 5 rollback for [b]. for ts in 0..5 { @@ -2110,11 +2110,11 @@ mod delta_entry_tests { /// Case 1. next() out of bound #[test] fn test_move_next_user_key_out_of_bound_1() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. 
- must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND / 2 rollback and 1 put for [b] . for ts in 0..SEEK_BOUND / 2 { @@ -2129,8 +2129,8 @@ mod delta_entry_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); - must_commit(&engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND / 2); + must_commit(&mut engine, b"b", SEEK_BOUND / 2, SEEK_BOUND / 2); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -2193,12 +2193,12 @@ mod delta_entry_tests { /// Case 2. seek() out of bound #[test] fn test_move_next_user_key_out_of_bound_2() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate 1 put for [a]. - must_prewrite_put(&engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); - must_commit(&engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); + must_prewrite_put(&mut engine, b"a", b"a_value", b"a", SEEK_BOUND * 2); + must_commit(&mut engine, b"a", SEEK_BOUND * 2, SEEK_BOUND * 2); // Generate SEEK_BOUND rollback and 1 put for [b] . 
// It differs from EntryScanner that this will try to fetch multiple versions of @@ -2215,8 +2215,8 @@ mod delta_entry_tests { ]; write(&engine, &ctx, modifies); } - must_prewrite_put(&engine, b"b", b"b_value", b"a", SEEK_BOUND + 1); - must_commit(&engine, b"b", SEEK_BOUND + 1, SEEK_BOUND + 1); + must_prewrite_put(&mut engine, b"b", b"b_value", b"a", SEEK_BOUND + 1); + must_commit(&mut engine, b"b", SEEK_BOUND + 1, SEEK_BOUND + 1); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, (SEEK_BOUND * 2).into()) @@ -2279,21 +2279,21 @@ mod delta_entry_tests { /// Range is left open right closed. #[test] fn test_range() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Generate 1 put for [1], [2] ... [6]. for i in 1..7 { // ts = 1: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 1); - must_commit(&engine, &[i], 1, 1); + must_prewrite_put(&mut engine, &[i], &[], &[i], 1); + must_commit(&mut engine, &[i], 1, 1); // ts = 7: value = [ts] - must_prewrite_put(&engine, &[i], &[i], &[i], 7); - must_commit(&engine, &[i], 7, 7); + must_prewrite_put(&mut engine, &[i], &[i], &[i], 7); + must_commit(&mut engine, &[i], 7, 7); // ts = 14: value = [] - must_prewrite_put(&engine, &[i], &[], &[i], 14); - must_commit(&engine, &[i], 14, 14); + must_prewrite_put(&mut engine, &[i], &[], &[i], 14); + must_commit(&mut engine, &[i], 14, 14); } let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -2472,16 +2472,16 @@ mod delta_entry_tests { .collect::>() }; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); for (key, lock, writes) in &test_data { for (start_ts, commit_ts, write_type, value) in writes { let value = make_value(value); if *write_type != WriteType::Rollback { - must_acquire_pessimistic_lock(&engine, key, key, start_ts, commit_ts - 1); + 
must_acquire_pessimistic_lock(&mut engine, key, key, start_ts, commit_ts - 1); } match write_type { WriteType::Put => must_pessimistic_prewrite_put( - &engine, + &mut engine, key, &value, key, @@ -2490,7 +2490,7 @@ mod delta_entry_tests { DoPessimisticCheck, ), WriteType::Delete => must_pessimistic_prewrite_delete( - &engine, + &mut engine, key, key, start_ts, @@ -2498,17 +2498,17 @@ mod delta_entry_tests { DoPessimisticCheck, ), WriteType::Lock => must_pessimistic_prewrite_lock( - &engine, + &mut engine, key, key, start_ts, commit_ts - 1, DoPessimisticCheck, ), - WriteType::Rollback => must_rollback(&engine, key, start_ts, false), + WriteType::Rollback => must_rollback(&mut engine, key, start_ts, false), } if *write_type != WriteType::Rollback { - must_commit(&engine, key, start_ts, commit_ts); + must_commit(&mut engine, key, start_ts, commit_ts); } } @@ -2520,10 +2520,10 @@ mod delta_entry_tests { .map(|(_, commit_ts, ..)| commit_ts) .unwrap_or(0); let for_update_ts = std::cmp::max(*ts, max_commit_ts + 1); - must_acquire_pessimistic_lock(&engine, key, key, *ts, for_update_ts); + must_acquire_pessimistic_lock(&mut engine, key, key, *ts, for_update_ts); match lock_type { LockType::Put => must_pessimistic_prewrite_put( - &engine, + &mut engine, key, &value, key, @@ -2532,7 +2532,7 @@ mod delta_entry_tests { DoPessimisticCheck, ), LockType::Delete => must_pessimistic_prewrite_delete( - &engine, + &mut engine, key, key, ts, @@ -2540,7 +2540,7 @@ mod delta_entry_tests { DoPessimisticCheck, ), LockType::Lock => must_pessimistic_prewrite_lock( - &engine, + &mut engine, key, key, ts, @@ -2552,7 +2552,7 @@ mod delta_entry_tests { } } - let check = |from_key, to_key, from_ts, to_ts| { + let mut check = |from_key, to_key, from_ts, to_ts| { let expected = expected_entries(from_key, to_key, from_ts, to_ts); let from_key = if from_key.is_empty() { @@ -2604,23 +2604,23 @@ mod delta_entry_tests { #[test] fn test_output_old_value() { - let engine = 
TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); // Generate put for [a] at 1. - must_prewrite_put(&engine, b"a", b"a_1", b"a", 1); - must_commit(&engine, b"a", 1, 1); + must_prewrite_put(&mut engine, b"a", b"a_1", b"a", 1); + must_commit(&mut engine, b"a", 1, 1); // Generate put for [a] at 3. - must_prewrite_put(&engine, b"a", b"a_3", b"a", 3); - must_commit(&engine, b"a", 3, 3); + must_prewrite_put(&mut engine, b"a", b"a_3", b"a", 3); + must_commit(&mut engine, b"a", 3, 3); // Generate delete for [a] at 5. - must_prewrite_delete(&engine, b"a", b"a", 5); + must_prewrite_delete(&mut engine, b"a", b"a", 5); // Generate put for [b] at 2. - must_prewrite_put(&engine, b"b", b"b_2", b"b", 2); - must_commit(&engine, b"b", 2, 2); + must_prewrite_put(&mut engine, b"b", b"b_2", b"b", 2); + must_commit(&mut engine, b"b", 2, 2); // Generate rollbacks for [b] at 6, 7, 8. for ts in 6..9 { @@ -2637,18 +2637,18 @@ mod delta_entry_tests { } // Generate delete for [b] at 10. - must_prewrite_delete(&engine, b"b", b"b", 10); - must_commit(&engine, b"b", 10, 10); + must_prewrite_delete(&mut engine, b"b", b"b", 10); + must_commit(&mut engine, b"b", 10, 10); // Generate put for [b] at 15. 
- must_acquire_pessimistic_lock(&engine, b"b", b"b", 9, 15); - must_pessimistic_prewrite_put(&engine, b"b", b"b_15", b"b", 9, 15, DoPessimisticCheck); + must_acquire_pessimistic_lock(&mut engine, b"b", b"b", 9, 15); + must_pessimistic_prewrite_put(&mut engine, b"b", b"b_15", b"b", 9, 15, DoPessimisticCheck); - must_prewrite_put(&engine, b"c", b"c_4", b"c", 4); - must_commit(&engine, b"c", 4, 6); - must_acquire_pessimistic_lock(&engine, b"c", b"c", 5, 15); - must_pessimistic_prewrite_put(&engine, b"c", b"c_5", b"c", 5, 15, DoPessimisticCheck); - must_cleanup(&engine, b"c", 20, 0); + must_prewrite_put(&mut engine, b"c", b"c_4", b"c", 4); + must_commit(&mut engine, b"c", 4, 6); + must_acquire_pessimistic_lock(&mut engine, b"c", b"c", 5, 15); + must_pessimistic_prewrite_put(&mut engine, b"c", b"c_5", b"c", 5, 15, DoPessimisticCheck); + must_cleanup(&mut engine, b"c", 20, 0); let entry_a_1 = EntryBuilder::default() .key(b"a") @@ -2703,7 +2703,7 @@ mod delta_entry_tests { .old_value(b"c_4") .build_prewrite(LockType::Put, true); - let check = |after_ts: u64, expected: Vec<&TxnEntry>| { + let mut check = |after_ts: u64, expected: Vec<&TxnEntry>| { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, TimeStamp::max()) .range(None, None) @@ -2739,8 +2739,8 @@ mod delta_entry_tests { #[test] fn test_old_value_check_gc_fence() { - let engine = TestEngineBuilder::new().build().unwrap(); - prepare_test_data_for_check_gc_fence(&engine); + let mut engine = TestEngineBuilder::new().build().unwrap(); + prepare_test_data_for_check_gc_fence(&mut engine); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, TimeStamp::max()) @@ -2772,7 +2772,7 @@ mod delta_entry_tests { for i in b'1'..=b'8' { let key = &[b'k', i]; let value = &[b'v', i, b'x', b'x']; - must_prewrite_put(&engine, key, value, b"k1", 55); + must_prewrite_put(&mut engine, key, value, b"k1", 55); } let snapshot = 
engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, TimeStamp::max()) @@ -2808,7 +2808,7 @@ mod delta_entry_tests { // Commit all the locks and check again. for i in b'1'..=b'8' { let key = &[b'k', i]; - must_commit(&engine, key, 55, 56); + must_commit(&mut engine, key, 55, 56); } let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, TimeStamp::max()) diff --git a/src/storage/mvcc/reader/scanner/mod.rs b/src/storage/mvcc/reader/scanner/mod.rs index 1f45390a21e..7b799a3f456 100644 --- a/src/storage/mvcc/reader/scanner/mod.rs +++ b/src/storage/mvcc/reader/scanner/mod.rs @@ -627,18 +627,18 @@ mod tests { const POST_TS: TimeStamp = TimeStamp::new(5); let new_engine = || TestEngineBuilder::new().build().unwrap(); - let add_write_at_ts = |commit_ts, engine, key, value| { + let add_write_at_ts = |commit_ts, engine: &mut _, key, value| { must_prewrite_put(engine, key, value, key, commit_ts); must_commit(engine, key, commit_ts, commit_ts); }; - let add_lock_at_ts = |lock_ts, engine, key| { + let add_lock_at_ts = |lock_ts, engine: &mut _, key| { must_prewrite_put(engine, key, b"lock", key, lock_ts); must_locked(engine, key, lock_ts); }; let test_scanner_result = - move |engine: &RocksEngine, expected_result: Vec<(Vec, Option>)>| { + move |engine: &mut RocksEngine, expected_result: Vec<(Vec, Option>)>| { let snapshot = engine.snapshot(Default::default()).unwrap(); let scanner = ScannerBuilder::new(snapshot, SCAN_TS) @@ -657,68 +657,68 @@ mod tests { }; // Lock after write - let engine = new_engine(); + let mut engine = new_engine(); - add_write_at_ts(POST_TS, &engine, b"a", b"a_value"); - add_lock_at_ts(PREV_TS, &engine, b"b"); + add_write_at_ts(POST_TS, &mut engine, b"a", b"a_value"); + add_lock_at_ts(PREV_TS, &mut engine, b"b"); let expected_result = desc_map(vec![ (b"a".to_vec(), Some(b"a_value".to_vec())), (b"b".to_vec(), None), ]); - test_scanner_result(&engine, 
expected_result); + test_scanner_result(&mut engine, expected_result); // Lock before write for same key - let engine = new_engine(); - add_write_at_ts(PREV_TS, &engine, b"a", b"a_value"); - add_lock_at_ts(POST_TS, &engine, b"a"); + let mut engine = new_engine(); + add_write_at_ts(PREV_TS, &mut engine, b"a", b"a_value"); + add_lock_at_ts(POST_TS, &mut engine, b"a"); let expected_result = vec![(b"a".to_vec(), None)]; - test_scanner_result(&engine, expected_result); + test_scanner_result(&mut engine, expected_result); // Lock before write in different keys - let engine = new_engine(); - add_lock_at_ts(POST_TS, &engine, b"a"); - add_write_at_ts(PREV_TS, &engine, b"b", b"b_value"); + let mut engine = new_engine(); + add_lock_at_ts(POST_TS, &mut engine, b"a"); + add_write_at_ts(PREV_TS, &mut engine, b"b", b"b_value"); let expected_result = desc_map(vec![ (b"a".to_vec(), None), (b"b".to_vec(), Some(b"b_value".to_vec())), ]); - test_scanner_result(&engine, expected_result); + test_scanner_result(&mut engine, expected_result); // Only a lock here - let engine = new_engine(); - add_lock_at_ts(PREV_TS, &engine, b"a"); + let mut engine = new_engine(); + add_lock_at_ts(PREV_TS, &mut engine, b"a"); let expected_result = desc_map(vec![(b"a".to_vec(), None)]); - test_scanner_result(&engine, expected_result); + test_scanner_result(&mut engine, expected_result); // Write Only - let engine = new_engine(); - add_write_at_ts(PREV_TS, &engine, b"a", b"a_value"); + let mut engine = new_engine(); + add_write_at_ts(PREV_TS, &mut engine, b"a", b"a_value"); let expected_result = desc_map(vec![(b"a".to_vec(), Some(b"a_value".to_vec()))]); - test_scanner_result(&engine, expected_result); + test_scanner_result(&mut engine, expected_result); } fn test_scan_with_lock_impl(desc: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); for i in 0..5 { - must_prewrite_put(&engine, &[i], &[b'v', i], &[i], 1); - 
must_commit(&engine, &[i], 1, 2); - must_prewrite_put(&engine, &[i], &[b'v', i], &[i], 10); - must_commit(&engine, &[i], 10, 100); + must_prewrite_put(&mut engine, &[i], &[b'v', i], &[i], 1); + must_commit(&mut engine, &[i], 1, 2); + must_prewrite_put(&mut engine, &[i], &[b'v', i], &[i], 10); + must_commit(&mut engine, &[i], 10, 100); } - must_acquire_pessimistic_lock(&engine, &[1], &[1], 20, 110); - must_acquire_pessimistic_lock(&engine, &[2], &[2], 50, 110); - must_acquire_pessimistic_lock(&engine, &[3], &[3], 105, 110); - must_prewrite_put(&engine, &[4], b"a", &[4], 105); + must_acquire_pessimistic_lock(&mut engine, &[1], &[1], 20, 110); + must_acquire_pessimistic_lock(&mut engine, &[2], &[2], 50, 110); + must_acquire_pessimistic_lock(&mut engine, &[3], &[3], 105, 110); + must_prewrite_put(&mut engine, &[4], b"a", &[4], 105); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -778,16 +778,16 @@ mod tests { } fn test_scan_bypass_locks_impl(desc: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); for i in 0..5 { - must_prewrite_put(&engine, &[i], &[b'v', i], &[i], 10); - must_commit(&engine, &[i], 10, 20); + must_prewrite_put(&mut engine, &[i], &[b'v', i], &[i], 10); + must_commit(&mut engine, &[i], 10, 20); } // Locks are: 30, 40, 50, 60, 70 for i in 0..5 { - must_prewrite_put(&engine, &[i], &[b'v', i], &[i], 30 + u64::from(i) * 10); + must_prewrite_put(&mut engine, &[i], &[b'v', i], &[i], 30 + u64::from(i) * 10); } let bypass_locks = TsSet::from_u64s(vec![30, 41, 50]); @@ -821,28 +821,28 @@ mod tests { } fn test_scan_access_locks_impl(desc: bool, delete_bound: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); for i in 0..=8 { - must_prewrite_put(&engine, &[i], &[b'v', i], &[i], 10); - must_commit(&engine, &[i], 10, 20); + must_prewrite_put(&mut engine, &[i], &[b'v', i], &[i], 10); + 
must_commit(&mut engine, &[i], 10, 20); } if delete_bound { - must_prewrite_delete(&engine, &[0], &[0], 30); // access delete + must_prewrite_delete(&mut engine, &[0], &[0], 30); // access delete } else { - must_prewrite_put(&engine, &[0], &[b'v', 0, 0], &[0], 30); // access put + must_prewrite_put(&mut engine, &[0], &[b'v', 0, 0], &[0], 30); // access put } - must_prewrite_put(&engine, &[1], &[b'v', 1, 1], &[1], 40); // access put - must_prewrite_delete(&engine, &[2], &[2], 50); // access delete - must_prewrite_lock(&engine, &[3], &[3], 60); // access lock(actually ignored) - must_prewrite_put(&engine, &[4], &[b'v', 4, 4], &[4], 70); // locked - must_prewrite_put(&engine, &[5], &[b'v', 5, 5], &[5], 80); // bypass - must_prewrite_put(&engine, &[6], &[b'v', 6, 6], &[6], 100); // locked with larger ts + must_prewrite_put(&mut engine, &[1], &[b'v', 1, 1], &[1], 40); // access put + must_prewrite_delete(&mut engine, &[2], &[2], 50); // access delete + must_prewrite_lock(&mut engine, &[3], &[3], 60); // access lock(actually ignored) + must_prewrite_put(&mut engine, &[4], &[b'v', 4, 4], &[4], 70); // locked + must_prewrite_put(&mut engine, &[5], &[b'v', 5, 5], &[5], 80); // bypass + must_prewrite_put(&mut engine, &[6], &[b'v', 6, 6], &[6], 100); // locked with larger ts if delete_bound { - must_prewrite_delete(&engine, &[8], &[8], 90); // access delete + must_prewrite_delete(&mut engine, &[8], &[8], 90); // access delete } else { - must_prewrite_put(&engine, &[8], &[b'v', 8, 8], &[8], 90); // access put + must_prewrite_put(&mut engine, &[8], &[b'v', 8, 8], &[8], 90); // access put } let bypass_locks = TsSet::from_u64s(vec![80]); @@ -887,7 +887,7 @@ mod tests { } fn must_met_newer_ts_data( - engine: &E, + engine: &mut E, scanner_ts: impl Into, key: &[u8], value: Option<&[u8]>, @@ -922,39 +922,39 @@ mod tests { } fn test_met_newer_ts_data_impl(deep_write_seek: bool, desc: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = 
TestEngineBuilder::new().build().unwrap(); let (key, val1) = (b"foo", b"bar1"); if deep_write_seek { for i in 0..SEEK_BOUND { - must_prewrite_put(&engine, key, val1, key, i); - must_commit(&engine, key, i, i); + must_prewrite_put(&mut engine, key, val1, key, i); + must_commit(&mut engine, key, i, i); } } - must_prewrite_put(&engine, key, val1, key, 100); - must_commit(&engine, key, 100, 200); + must_prewrite_put(&mut engine, key, val1, key, 100); + must_commit(&mut engine, key, 100, 200); let (key, val2) = (b"foo", b"bar2"); - must_prewrite_put(&engine, key, val2, key, 300); - must_commit(&engine, key, 300, 400); + must_prewrite_put(&mut engine, key, val2, key, 300); + must_commit(&mut engine, key, 300, 400); must_met_newer_ts_data( - &engine, + &mut engine, 100, key, if deep_write_seek { Some(val1) } else { None }, desc, true, ); - must_met_newer_ts_data(&engine, 200, key, Some(val1), desc, true); - must_met_newer_ts_data(&engine, 300, key, Some(val1), desc, true); - must_met_newer_ts_data(&engine, 400, key, Some(val2), desc, false); - must_met_newer_ts_data(&engine, 500, key, Some(val2), desc, false); + must_met_newer_ts_data(&mut engine, 200, key, Some(val1), desc, true); + must_met_newer_ts_data(&mut engine, 300, key, Some(val1), desc, true); + must_met_newer_ts_data(&mut engine, 400, key, Some(val2), desc, false); + must_met_newer_ts_data(&mut engine, 500, key, Some(val2), desc, false); - must_prewrite_lock(&engine, key, key, 600); + must_prewrite_lock(&mut engine, key, key, 600); - must_met_newer_ts_data(&engine, 500, key, Some(val2), desc, true); - must_met_newer_ts_data(&engine, 600, key, Some(val2), desc, true); + must_met_newer_ts_data(&mut engine, 500, key, Some(val2), desc, true); + must_met_newer_ts_data(&mut engine, 600, key, Some(val2), desc, true); } #[test] @@ -967,9 +967,10 @@ mod tests { #[test] fn test_old_value_with_hint_min_ts() { - let engine = TestEngineBuilder::new().build_without_cache().unwrap(); - let create_scanner = |from_ts: u64| { - 
let snap = engine.snapshot(Default::default()).unwrap(); + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); + let mut engine_clone = engine.clone(); + let mut create_scanner = |from_ts: u64| { + let snap = engine_clone.snapshot(Default::default()).unwrap(); ScannerBuilder::new(snap, TimeStamp::max()) .fill_cache(false) .hint_min_ts(Some(from_ts.into())) @@ -981,10 +982,10 @@ mod tests { (0..128).for_each(|_| value.extend_from_slice(b"long-val")); // Create the initial data with CF_WRITE L0: |zkey_110, zkey1_160| - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); - must_prewrite_put(&engine, b"zkey1", &value, b"zkey1", 150); - must_commit(&engine, b"zkey1", 150, 160); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zkey1", &value, b"zkey1", 150); + must_commit(&mut engine, b"zkey1", 150, 160); engine .kv_engine() .unwrap() @@ -995,7 +996,7 @@ mod tests { .unwrap() .flush_cf(CF_DEFAULT, true) .unwrap(); - must_prewrite_delete(&engine, b"zkey", b"zkey", 200); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 200); let tests = vec![ // `zkey_110` is filtered, so no old value and block reads is 0. 
@@ -1018,7 +1019,7 @@ mod tests { } // CF_WRITE L0: |zkey_110, zkey1_160|, |zkey_210| - must_commit(&engine, b"zkey", 200, 210); + must_commit(&mut engine, b"zkey", 200, 210); engine .kv_engine() .unwrap() @@ -1058,7 +1059,7 @@ mod tests { } fn test_rc_scan_skip_lock_impl(desc: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key1, val1, val12) = (b"foo1", b"bar1", b"bar12"); let (key2, val2) = (b"foo2", b"bar2"); let mut expected = vec![(key1, val1), (key2, val2)]; @@ -1066,13 +1067,13 @@ mod tests { expected.reverse(); } - must_prewrite_put(&engine, key1, val1, key1, 10); - must_commit(&engine, key1, 10, 20); + must_prewrite_put(&mut engine, key1, val1, key1, 10); + must_commit(&mut engine, key1, 10, 20); - must_prewrite_put(&engine, key2, val2, key2, 30); - must_commit(&engine, key2, 30, 40); + must_prewrite_put(&mut engine, key2, val2, key2, 30); + must_commit(&mut engine, key2, 30, 40); - must_prewrite_put(&engine, key1, val12, key1, 50); + must_prewrite_put(&mut engine, key1, val12, key1, 50); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, 60.into()) diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index b456b359b8f..7171417d060 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -292,66 +292,66 @@ pub(crate) mod tests { }; fn test_mvcc_txn_read_imp(k1: &[u8], k2: &[u8], v: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - must_get_none(&engine, k1, 1); + must_get_none(&mut engine, k1, 1); - must_prewrite_put(&engine, k1, v, k1, 2); - must_rollback(&engine, k1, 2, false); + must_prewrite_put(&mut engine, k1, v, k1, 2); + must_rollback(&mut engine, k1, 2, false); // should ignore rollback - must_get_none(&engine, k1, 3); + must_get_none(&mut engine, k1, 3); - must_prewrite_lock(&engine, k1, k1, 3); - 
must_commit(&engine, k1, 3, 4); + must_prewrite_lock(&mut engine, k1, k1, 3); + must_commit(&mut engine, k1, 3, 4); // should ignore read lock - must_get_none(&engine, k1, 5); + must_get_none(&mut engine, k1, 5); - must_prewrite_put(&engine, k1, v, k1, 5); - must_prewrite_put(&engine, k2, v, k1, 5); + must_prewrite_put(&mut engine, k1, v, k1, 5); + must_prewrite_put(&mut engine, k2, v, k1, 5); // should not be affected by later locks - must_get_none(&engine, k1, 4); + must_get_none(&mut engine, k1, 4); // should read pending locks - must_get_err(&engine, k1, 7); + must_get_err(&mut engine, k1, 7); // should ignore the primary lock and get none when reading the latest record - must_get_none(&engine, k1, u64::max_value()); + must_get_none(&mut engine, k1, u64::max_value()); // should read secondary locks even when reading the latest record - must_get_err(&engine, k2, u64::max_value()); + must_get_err(&mut engine, k2, u64::max_value()); - must_commit(&engine, k1, 5, 10); - must_commit(&engine, k2, 5, 10); - must_get_none(&engine, k1, 3); + must_commit(&mut engine, k1, 5, 10); + must_commit(&mut engine, k2, 5, 10); + must_get_none(&mut engine, k1, 3); // should not read with ts < commit_ts - must_get_none(&engine, k1, 7); + must_get_none(&mut engine, k1, 7); // should read with ts > commit_ts - must_get(&engine, k1, 13, v); + must_get(&mut engine, k1, 13, v); // should read the latest record if `ts == u64::max_value()` - must_get(&engine, k1, u64::max_value(), v); + must_get(&mut engine, k1, u64::max_value(), v); - must_prewrite_delete(&engine, k1, k1, 15); + must_prewrite_delete(&mut engine, k1, k1, 15); // should ignore the lock and get previous record when reading the latest record - must_get(&engine, k1, u64::max_value(), v); - must_commit(&engine, k1, 15, 20); - must_get_none(&engine, k1, 3); - must_get_none(&engine, k1, 7); - must_get(&engine, k1, 13, v); - must_get(&engine, k1, 17, v); - must_get_none(&engine, k1, 23); + must_get(&mut engine, k1, 
u64::max_value(), v); + must_commit(&mut engine, k1, 15, 20); + must_get_none(&mut engine, k1, 3); + must_get_none(&mut engine, k1, 7); + must_get(&mut engine, k1, 13, v); + must_get(&mut engine, k1, 17, v); + must_get_none(&mut engine, k1, 23); // intersecting timestamps with pessimistic txn // T1: start_ts = 25, commit_ts = 27 // T2: start_ts = 23, commit_ts = 31 - must_prewrite_put(&engine, k1, v, k1, 25); - must_commit(&engine, k1, 25, 27); - must_acquire_pessimistic_lock(&engine, k1, k1, 23, 29); - must_get(&engine, k1, 30, v); - must_pessimistic_prewrite_delete(&engine, k1, k1, 23, 29, DoPessimisticCheck); - must_get_err(&engine, k1, 30); + must_prewrite_put(&mut engine, k1, v, k1, 25); + must_commit(&mut engine, k1, 25, 27); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 23, 29); + must_get(&mut engine, k1, 30, v); + must_pessimistic_prewrite_delete(&mut engine, k1, k1, 23, 29, DoPessimisticCheck); + must_get_err(&mut engine, k1, 30); // should read the latest record when `ts == u64::max_value()` // even if lock.start_ts(23) < latest write.commit_ts(27) - must_get(&engine, k1, u64::max_value(), v); - must_commit(&engine, k1, 23, 31); - must_get(&engine, k1, 30, v); - must_get_none(&engine, k1, 32); + must_get(&mut engine, k1, u64::max_value(), v); + must_commit(&mut engine, k1, 23, 31); + must_get(&mut engine, k1, 30, v); + must_get_none(&mut engine, k1, 32); } #[test] @@ -363,217 +363,217 @@ pub(crate) mod tests { } fn test_mvcc_txn_prewrite_imp(k: &[u8], v: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, k, v, k, 5); + must_prewrite_put(&mut engine, k, v, k, 5); // Key is locked. - must_locked(&engine, k, 5); + must_locked(&mut engine, k, 5); // Retry prewrite. - must_prewrite_put(&engine, k, v, k, 5); + must_prewrite_put(&mut engine, k, v, k, 5); // Conflict. 
- must_prewrite_lock_err(&engine, k, k, 6); + must_prewrite_lock_err(&mut engine, k, k, 6); - must_commit(&engine, k, 5, 10); - must_written(&engine, k, 5, 10, WriteType::Put); + must_commit(&mut engine, k, 5, 10); + must_written(&mut engine, k, 5, 10, WriteType::Put); // Delayed prewrite request after committing should do nothing. - must_prewrite_put_err(&engine, k, v, k, 5); - must_unlocked(&engine, k); + must_prewrite_put_err(&mut engine, k, v, k, 5); + must_unlocked(&mut engine, k); // Write conflict. - must_prewrite_lock_err(&engine, k, k, 6); - must_unlocked(&engine, k); + must_prewrite_lock_err(&mut engine, k, k, 6); + must_unlocked(&mut engine, k); // Not conflict. - must_prewrite_lock(&engine, k, k, 12); - must_locked(&engine, k, 12); - must_rollback(&engine, k, 12, false); - must_unlocked(&engine, k); - must_written(&engine, k, 12, 12, WriteType::Rollback); + must_prewrite_lock(&mut engine, k, k, 12); + must_locked(&mut engine, k, 12); + must_rollback(&mut engine, k, 12, false); + must_unlocked(&mut engine, k); + must_written(&mut engine, k, 12, 12, WriteType::Rollback); // Cannot retry Prewrite after rollback. - must_prewrite_lock_err(&engine, k, k, 12); + must_prewrite_lock_err(&mut engine, k, k, 12); // Can prewrite after rollback. - must_prewrite_delete(&engine, k, k, 13); - must_rollback(&engine, k, 13, false); - must_unlocked(&engine, k); + must_prewrite_delete(&mut engine, k, k, 13); + must_rollback(&mut engine, k, 13, false); + must_unlocked(&mut engine, k); } #[test] fn test_mvcc_txn_prewrite_insert() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, v1, v2, v3) = (b"k1", b"v1", b"v2", b"v3"); - must_prewrite_put(&engine, k1, v1, k1, 1); - must_commit(&engine, k1, 1, 2); + must_prewrite_put(&mut engine, k1, v1, k1, 1); + must_commit(&mut engine, k1, 1, 2); // "k1" already exist, returns AlreadyExist error. 
assert!(matches!( - try_prewrite_insert(&engine, k1, v2, k1, 3), + try_prewrite_insert(&mut engine, k1, v2, k1, 3), Err(Error(box ErrorInner::AlreadyExist { .. })) )); // Delete "k1" - must_prewrite_delete(&engine, k1, k1, 4); + must_prewrite_delete(&mut engine, k1, k1, 4); // There is a lock, returns KeyIsLocked error. assert!(matches!( - try_prewrite_insert(&engine, k1, v2, k1, 6), + try_prewrite_insert(&mut engine, k1, v2, k1, 6), Err(Error(box ErrorInner::KeyIsLocked(_))) )); - must_commit(&engine, k1, 4, 5); + must_commit(&mut engine, k1, 4, 5); // After delete "k1", insert returns ok. - try_prewrite_insert(&engine, k1, v2, k1, 6).unwrap(); - must_commit(&engine, k1, 6, 7); + try_prewrite_insert(&mut engine, k1, v2, k1, 6).unwrap(); + must_commit(&mut engine, k1, 6, 7); // Rollback - must_prewrite_put(&engine, k1, v3, k1, 8); - must_rollback(&engine, k1, 8, false); + must_prewrite_put(&mut engine, k1, v3, k1, 8); + must_rollback(&mut engine, k1, 8, false); assert!(matches!( - try_prewrite_insert(&engine, k1, v3, k1, 9), + try_prewrite_insert(&mut engine, k1, v3, k1, 9), Err(Error(box ErrorInner::AlreadyExist { .. })) )); // Delete "k1" again - must_prewrite_delete(&engine, k1, k1, 10); - must_commit(&engine, k1, 10, 11); + must_prewrite_delete(&mut engine, k1, k1, 10); + must_commit(&mut engine, k1, 10, 11); // Rollback again - must_prewrite_put(&engine, k1, v3, k1, 12); - must_rollback(&engine, k1, 12, false); + must_prewrite_put(&mut engine, k1, v3, k1, 12); + must_rollback(&mut engine, k1, 12, false); // After delete "k1", insert returns ok. 
- try_prewrite_insert(&engine, k1, v2, k1, 13).unwrap(); - must_commit(&engine, k1, 13, 14); + try_prewrite_insert(&mut engine, k1, v2, k1, 13).unwrap(); + must_commit(&mut engine, k1, 13, 14); } #[test] fn test_mvcc_txn_prewrite_check_not_exist() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, v1, v2, v3) = (b"k1", b"v1", b"v2", b"v3"); - must_prewrite_put(&engine, k1, v1, k1, 1); - must_commit(&engine, k1, 1, 2); + must_prewrite_put(&mut engine, k1, v1, k1, 1); + must_commit(&mut engine, k1, 1, 2); // "k1" already exist, returns AlreadyExist error. - try_prewrite_check_not_exists(&engine, k1, k1, 3).unwrap_err(); + try_prewrite_check_not_exists(&mut engine, k1, k1, 3).unwrap_err(); // Delete "k1" - must_prewrite_delete(&engine, k1, k1, 4); - must_commit(&engine, k1, 4, 5); + must_prewrite_delete(&mut engine, k1, k1, 4); + must_commit(&mut engine, k1, 4, 5); // After delete "k1", check_not_exists returns ok. 
- try_prewrite_check_not_exists(&engine, k1, k1, 6).unwrap(); + try_prewrite_check_not_exists(&mut engine, k1, k1, 6).unwrap(); - try_prewrite_insert(&engine, k1, v2, k1, 7).unwrap(); - must_commit(&engine, k1, 7, 8); + try_prewrite_insert(&mut engine, k1, v2, k1, 7).unwrap(); + must_commit(&mut engine, k1, 7, 8); // Rollback - must_prewrite_put(&engine, k1, v3, k1, 9); - must_rollback(&engine, k1, 9, false); - try_prewrite_check_not_exists(&engine, k1, k1, 10).unwrap_err(); + must_prewrite_put(&mut engine, k1, v3, k1, 9); + must_rollback(&mut engine, k1, 9, false); + try_prewrite_check_not_exists(&mut engine, k1, k1, 10).unwrap_err(); // Delete "k1" again - must_prewrite_delete(&engine, k1, k1, 11); - must_commit(&engine, k1, 11, 12); + must_prewrite_delete(&mut engine, k1, k1, 11); + must_commit(&mut engine, k1, 11, 12); // Rollback again - must_prewrite_put(&engine, k1, v3, k1, 13); - must_rollback(&engine, k1, 13, false); + must_prewrite_put(&mut engine, k1, v3, k1, 13); + must_rollback(&mut engine, k1, 13, false); // After delete "k1", check_not_exists returns ok. 
- try_prewrite_check_not_exists(&engine, k1, k1, 14).unwrap(); + try_prewrite_check_not_exists(&mut engine, k1, k1, 14).unwrap(); } #[test] fn test_mvcc_txn_pessmistic_prewrite_check_not_exist() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; - try_pessimistic_prewrite_check_not_exists(&engine, k, k, 3).unwrap_err(); + try_pessimistic_prewrite_check_not_exists(&mut engine, k, k, 3).unwrap_err(); } #[test] fn test_rollback_lock_optimistic() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); - must_prewrite_put(&engine, k, v, k, 5); - must_commit(&engine, k, 5, 10); + must_prewrite_put(&mut engine, k, v, k, 5); + must_commit(&mut engine, k, 5, 10); // Lock - must_prewrite_lock(&engine, k, k, 15); - must_locked(&engine, k, 15); + must_prewrite_lock(&mut engine, k, k, 15); + must_locked(&mut engine, k, 15); // Rollback lock - must_rollback(&engine, k, 15, false); + must_rollback(&mut engine, k, 15, false); // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&engine, k, 15, false); + must_get_rollback_protected(&mut engine, k, 15, false); } #[test] fn test_rollback_lock_pessimistic() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, k2, v) = (b"k1", b"k2", b"v1"); - must_acquire_pessimistic_lock(&engine, k1, k1, 5, 5); - must_acquire_pessimistic_lock(&engine, k2, k1, 5, 7); - must_rollback(&engine, k1, 5, false); - must_rollback(&engine, k2, 5, false); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 5, 5); + must_acquire_pessimistic_lock(&mut engine, k2, k1, 5, 7); + must_rollback(&mut engine, k1, 5, false); + must_rollback(&mut engine, k2, 5, false); // The rollback of the primary key should be protected - must_get_rollback_protected(&engine, k1, 5, true); + 
must_get_rollback_protected(&mut engine, k1, 5, true); // The rollback of the secondary key needn't be protected - must_get_rollback_protected(&engine, k2, 5, false); - - must_acquire_pessimistic_lock(&engine, k1, k1, 15, 15); - must_acquire_pessimistic_lock(&engine, k2, k1, 15, 17); - must_pessimistic_prewrite_put(&engine, k1, v, k1, 15, 17, DoPessimisticCheck); - must_pessimistic_prewrite_put(&engine, k2, v, k1, 15, 17, DoPessimisticCheck); - must_rollback(&engine, k1, 15, false); - must_rollback(&engine, k2, 15, false); + must_get_rollback_protected(&mut engine, k2, 5, false); + + must_acquire_pessimistic_lock(&mut engine, k1, k1, 15, 15); + must_acquire_pessimistic_lock(&mut engine, k2, k1, 15, 17); + must_pessimistic_prewrite_put(&mut engine, k1, v, k1, 15, 17, DoPessimisticCheck); + must_pessimistic_prewrite_put(&mut engine, k2, v, k1, 15, 17, DoPessimisticCheck); + must_rollback(&mut engine, k1, 15, false); + must_rollback(&mut engine, k2, 15, false); // The rollback of the primary key should be protected - must_get_rollback_protected(&engine, k1, 15, true); + must_get_rollback_protected(&mut engine, k1, 15, true); // The rollback of the secondary key needn't be protected - must_get_rollback_protected(&engine, k2, 15, false); + must_get_rollback_protected(&mut engine, k2, 15, false); } #[test] fn test_rollback_del() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); - must_prewrite_put(&engine, k, v, k, 5); - must_commit(&engine, k, 5, 10); + must_prewrite_put(&mut engine, k, v, k, 5); + must_commit(&mut engine, k, 5, 10); // Prewrite delete - must_prewrite_delete(&engine, k, k, 15); - must_locked(&engine, k, 15); + must_prewrite_delete(&mut engine, k, k, 15); + must_locked(&mut engine, k, 15); // Rollback delete - must_rollback(&engine, k, 15, false); + must_rollback(&mut engine, k, 15, false); } #[test] fn test_rollback_overlapped() { - let engine = 
TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, v1) = (b"key1", b"v1"); let (k2, v2) = (b"key2", b"v2"); - must_prewrite_put(&engine, k1, v1, k1, 10); - must_prewrite_put(&engine, k2, v2, k2, 11); - must_commit(&engine, k1, 10, 20); - must_commit(&engine, k2, 11, 20); - let w1 = must_written(&engine, k1, 10, 20, WriteType::Put); - let w2 = must_written(&engine, k2, 11, 20, WriteType::Put); + must_prewrite_put(&mut engine, k1, v1, k1, 10); + must_prewrite_put(&mut engine, k2, v2, k2, 11); + must_commit(&mut engine, k1, 10, 20); + must_commit(&mut engine, k2, 11, 20); + let w1 = must_written(&mut engine, k1, 10, 20, WriteType::Put); + let w2 = must_written(&mut engine, k2, 11, 20, WriteType::Put); assert!(!w1.has_overlapped_rollback); assert!(!w2.has_overlapped_rollback); - must_cleanup(&engine, k1, 20, 0); - must_rollback(&engine, k2, 20, false); + must_cleanup(&mut engine, k1, 20, 0); + must_rollback(&mut engine, k2, 20, false); - let w1r = must_written(&engine, k1, 10, 20, WriteType::Put); + let w1r = must_written(&mut engine, k1, 10, 20, WriteType::Put); assert!(w1r.has_overlapped_rollback); // The only difference between w1r and w1 is the overlapped_rollback flag. assert_eq!(w1r.set_overlapped_rollback(false, None), w1); - let w2r = must_written(&engine, k2, 11, 20, WriteType::Put); + let w2r = must_written(&mut engine, k2, 11, 20, WriteType::Put); // Rollback is invoked on secondaries, so the rollback is not protected and // overlapped_rollback won't be set. 
assert_eq!(w2r, w2); @@ -589,7 +589,7 @@ pub(crate) mod tests { #[test] fn test_mvcc_txn_rollback_after_commit() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; let v = b"v"; @@ -598,49 +598,49 @@ pub(crate) mod tests { let t3 = 20; let t4 = 30; - must_prewrite_put(&engine, k, v, k, t1); + must_prewrite_put(&mut engine, k, v, k, t1); - must_rollback(&engine, k, t2, false); - must_rollback(&engine, k, t2, false); - must_rollback(&engine, k, t4, false); + must_rollback(&mut engine, k, t2, false); + must_rollback(&mut engine, k, t2, false); + must_rollback(&mut engine, k, t4, false); - must_commit(&engine, k, t1, t3); + must_commit(&mut engine, k, t1, t3); // The rollback should be failed since the transaction // was committed before. - must_rollback_err(&engine, k, t1); - must_get(&engine, k, t4, v); + must_rollback_err(&mut engine, k, t1); + must_get(&mut engine, k, t4, v); } fn test_mvcc_txn_rollback_imp(k: &[u8], v: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, k, v, k, 5); - must_rollback(&engine, k, 5, false); + must_prewrite_put(&mut engine, k, v, k, 5); + must_rollback(&mut engine, k, 5, false); // Rollback should be idempotent - must_rollback(&engine, k, 5, false); + must_rollback(&mut engine, k, 5, false); // Lock should be released after rollback - must_unlocked(&engine, k); - must_prewrite_lock(&engine, k, k, 10); - must_rollback(&engine, k, 10, false); + must_unlocked(&mut engine, k); + must_prewrite_lock(&mut engine, k, k, 10); + must_rollback(&mut engine, k, 10, false); // data should be dropped after rollback - must_get_none(&engine, k, 20); + must_get_none(&mut engine, k, 20); // Can't rollback committed transaction. 
- must_prewrite_put(&engine, k, v, k, 25); - must_commit(&engine, k, 25, 30); - must_rollback_err(&engine, k, 25); - must_rollback_err(&engine, k, 25); + must_prewrite_put(&mut engine, k, v, k, 25); + must_commit(&mut engine, k, 25, 30); + must_rollback_err(&mut engine, k, 25); + must_rollback_err(&mut engine, k, 25); // Can't rollback other transaction's lock - must_prewrite_delete(&engine, k, k, 35); - must_rollback(&engine, k, 34, true); - must_rollback(&engine, k, 36, true); - must_written(&engine, k, 34, 34, WriteType::Rollback); - must_written(&engine, k, 36, 36, WriteType::Rollback); - must_locked(&engine, k, 35); - must_commit(&engine, k, 35, 40); - must_get(&engine, k, 39, v); - must_get_none(&engine, k, 41); + must_prewrite_delete(&mut engine, k, k, 35); + must_rollback(&mut engine, k, 34, true); + must_rollback(&mut engine, k, 36, true); + must_written(&mut engine, k, 34, 34, WriteType::Rollback); + must_written(&mut engine, k, 36, 36, WriteType::Rollback); + must_locked(&mut engine, k, 35); + must_commit(&mut engine, k, 35, 40); + must_get(&mut engine, k, 39, v); + must_get_none(&mut engine, k, 41); } #[test] @@ -653,33 +653,40 @@ pub(crate) mod tests { #[test] fn test_mvcc_txn_rollback_before_prewrite() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let key = b"key"; - must_rollback(&engine, key, 5, false); - must_prewrite_lock_err(&engine, key, key, 5); + must_rollback(&mut engine, key, 5, false); + must_prewrite_lock_err(&mut engine, key, key, 5); } fn test_write_imp(k: &[u8], v: &[u8], k2: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); - - must_prewrite_put(&engine, k, v, k, 5); - must_seek_write_none(&engine, k, 5); - - must_commit(&engine, k, 5, 10); - must_seek_write(&engine, k, TimeStamp::max(), 5, 10, WriteType::Put); - must_seek_write_none(&engine, k2, TimeStamp::max()); - must_get_commit_ts(&engine, k, 5, 10); - - must_prewrite_delete(&engine, k, k, 
15); - must_rollback(&engine, k, 15, false); - must_seek_write(&engine, k, TimeStamp::max(), 15, 15, WriteType::Rollback); - must_get_commit_ts(&engine, k, 5, 10); - must_get_commit_ts_none(&engine, k, 15); - - must_prewrite_lock(&engine, k, k, 25); - must_commit(&engine, k, 25, 30); - must_seek_write(&engine, k, TimeStamp::max(), 25, 30, WriteType::Lock); - must_get_commit_ts(&engine, k, 25, 30); + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, k, v, k, 5); + must_seek_write_none(&mut engine, k, 5); + + must_commit(&mut engine, k, 5, 10); + must_seek_write(&mut engine, k, TimeStamp::max(), 5, 10, WriteType::Put); + must_seek_write_none(&mut engine, k2, TimeStamp::max()); + must_get_commit_ts(&mut engine, k, 5, 10); + + must_prewrite_delete(&mut engine, k, k, 15); + must_rollback(&mut engine, k, 15, false); + must_seek_write( + &mut engine, + k, + TimeStamp::max(), + 15, + 15, + WriteType::Rollback, + ); + must_get_commit_ts(&mut engine, k, 5, 10); + must_get_commit_ts_none(&mut engine, k, 15); + + must_prewrite_lock(&mut engine, k, k, 25); + must_commit(&mut engine, k, 25, 30); + must_seek_write(&mut engine, k, TimeStamp::max(), 25, 30, WriteType::Lock); + must_get_commit_ts(&mut engine, k, 25, 30); } #[test] @@ -691,21 +698,27 @@ pub(crate) mod tests { } fn test_scan_keys_imp(keys: Vec<&[u8]>, values: Vec<&[u8]>) { - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, keys[0], values[0], keys[0], 1); - must_commit(&engine, keys[0], 1, 10); - must_prewrite_lock(&engine, keys[1], keys[1], 1); - must_commit(&engine, keys[1], 1, 5); - must_prewrite_delete(&engine, keys[2], keys[2], 1); - must_commit(&engine, keys[2], 1, 20); - must_prewrite_put(&engine, keys[3], values[1], keys[3], 1); - must_prewrite_lock(&engine, keys[4], keys[4], 10); - must_prewrite_delete(&engine, keys[5], keys[5], 5); - - must_scan_keys(&engine, None, 100, vec![keys[0], keys[1], keys[2]], None); - 
must_scan_keys(&engine, None, 3, vec![keys[0], keys[1], keys[2]], None); - must_scan_keys(&engine, None, 2, vec![keys[0], keys[1]], Some(keys[1])); - must_scan_keys(&engine, Some(keys[1]), 1, vec![keys[1]], Some(keys[1])); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, keys[0], values[0], keys[0], 1); + must_commit(&mut engine, keys[0], 1, 10); + must_prewrite_lock(&mut engine, keys[1], keys[1], 1); + must_commit(&mut engine, keys[1], 1, 5); + must_prewrite_delete(&mut engine, keys[2], keys[2], 1); + must_commit(&mut engine, keys[2], 1, 20); + must_prewrite_put(&mut engine, keys[3], values[1], keys[3], 1); + must_prewrite_lock(&mut engine, keys[4], keys[4], 10); + must_prewrite_delete(&mut engine, keys[5], keys[5], 5); + + must_scan_keys( + &mut engine, + None, + 100, + vec![keys[0], keys[1], keys[2]], + None, + ); + must_scan_keys(&mut engine, None, 3, vec![keys[0], keys[1], keys[2]], None); + must_scan_keys(&mut engine, None, 2, vec![keys[0], keys[1]], Some(keys[1])); + must_scan_keys(&mut engine, Some(keys[1]), 1, vec![keys[1]], Some(keys[1])); } #[test] @@ -746,7 +759,7 @@ pub(crate) mod tests { } fn test_write_size_imp(k: &[u8], v: &[u8], pk: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let cm = ConcurrencyManager::new(10.into()); @@ -789,11 +802,11 @@ pub(crate) mod tests { #[test] fn test_skip_constraint_check() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, value) = (b"key", b"value"); - must_prewrite_put(&engine, key, value, key, 5); - must_commit(&engine, key, 5, 10); + must_prewrite_put(&mut engine, key, value, key, 5); + must_commit(&mut engine, key, 5, 10); let snapshot = engine.snapshot(Default::default()).unwrap(); let cm = 
ConcurrencyManager::new(10.into()); @@ -825,82 +838,82 @@ pub(crate) mod tests { #[test] fn test_read_commit() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, v1, v2) = (b"key", b"v1", b"v2"); - must_prewrite_put(&engine, key, v1, key, 5); - must_commit(&engine, key, 5, 10); - must_prewrite_put(&engine, key, v2, key, 15); - must_get_err(&engine, key, 20); - must_get_no_lock_check(&engine, key, 12, v1); - must_get_no_lock_check(&engine, key, 20, v1); + must_prewrite_put(&mut engine, key, v1, key, 5); + must_commit(&mut engine, key, 5, 10); + must_prewrite_put(&mut engine, key, v2, key, 15); + must_get_err(&mut engine, key, 20); + must_get_no_lock_check(&mut engine, key, 12, v1); + must_get_no_lock_check(&mut engine, key, 20, v1); } #[test] fn test_collapse_prev_rollback() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, value) = (b"key", b"value"); // Add a Rollback whose start ts is 1. - must_prewrite_put(&engine, key, value, key, 1); - must_rollback(&engine, key, 1, false); - must_get_rollback_ts(&engine, key, 1); + must_prewrite_put(&mut engine, key, value, key, 1); + must_rollback(&mut engine, key, 1, false); + must_get_rollback_ts(&mut engine, key, 1); // Add a Rollback whose start ts is 2, the previous Rollback whose // start ts is 1 will be collapsed. - must_prewrite_put(&engine, key, value, key, 2); - must_rollback(&engine, key, 2, false); - must_get_none(&engine, key, 2); - must_get_rollback_ts(&engine, key, 2); - must_get_rollback_ts_none(&engine, key, 1); + must_prewrite_put(&mut engine, key, value, key, 2); + must_rollback(&mut engine, key, 2, false); + must_get_none(&mut engine, key, 2); + must_get_rollback_ts(&mut engine, key, 2); + must_get_rollback_ts_none(&mut engine, key, 1); // Rollback arrive before Prewrite, it will collapse the // previous rollback whose start ts is 2. 
- must_rollback(&engine, key, 3, false); - must_get_none(&engine, key, 3); - must_get_rollback_ts(&engine, key, 3); - must_get_rollback_ts_none(&engine, key, 2); + must_rollback(&mut engine, key, 3, false); + must_get_none(&mut engine, key, 3); + must_get_rollback_ts(&mut engine, key, 3); + must_get_rollback_ts_none(&mut engine, key, 2); } #[test] fn test_scan_values_in_default() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); must_prewrite_put( - &engine, + &mut engine, &[2], "v".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[2], 3, ); - must_commit(&engine, &[2], 3, 3); + must_commit(&mut engine, &[2], 3, 3); must_prewrite_put( - &engine, + &mut engine, &[3], "a".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[3], 3, ); - must_commit(&engine, &[3], 3, 4); + must_commit(&mut engine, &[3], 3, 4); must_prewrite_put( - &engine, + &mut engine, &[3], "b".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[3], 5, ); - must_commit(&engine, &[3], 5, 5); + must_commit(&mut engine, &[3], 5, 5); must_prewrite_put( - &engine, + &mut engine, &[6], "x".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[6], 3, ); - must_commit(&engine, &[6], 3, 6); + must_commit(&mut engine, &[6], 3, 6); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, Some(ScanMode::Forward), true); @@ -919,31 +932,31 @@ pub(crate) mod tests { #[test] fn test_seek_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, &[2], b"vv", &[2], 3); - must_commit(&engine, &[2], 3, 3); + must_prewrite_put(&mut engine, &[2], b"vv", &[2], 3); + must_commit(&mut engine, &[2], 3, 3); must_prewrite_put( - &engine, + &mut engine, &[3], "a".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[3], 4, ); - must_commit(&engine, &[3], 4, 4); + must_commit(&mut engine, &[3], 4, 4); must_prewrite_put( - &engine, + &mut engine, 
&[5], "b".repeat(SHORT_VALUE_MAX_LEN + 1).as_bytes(), &[5], 2, ); - must_commit(&engine, &[5], 2, 5); + must_commit(&mut engine, &[5], 2, 5); - must_prewrite_put(&engine, &[6], b"xxx", &[6], 3); - must_commit(&engine, &[6], 3, 6); + must_prewrite_put(&mut engine, &[6], b"xxx", &[6], 3); + must_commit(&mut engine, &[6], 3, 6); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, Some(ScanMode::Forward), true); @@ -956,53 +969,71 @@ pub(crate) mod tests { #[test] fn test_pessimistic_txn_ttl() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); // Pessimistic prewrite keeps the larger TTL of the prewrite request and the // original pessimisitic lock. - must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 10, 10, 100); - must_pessimistic_locked(&engine, k, 10, 10); - must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 10, 10, DoPessimisticCheck, 110); - must_locked_with_ttl(&engine, k, 10, 110); + must_acquire_pessimistic_lock_with_ttl(&mut engine, k, k, 10, 10, 100); + must_pessimistic_locked(&mut engine, k, 10, 10); + must_pessimistic_prewrite_put_with_ttl( + &mut engine, + k, + v, + k, + 10, + 10, + DoPessimisticCheck, + 110, + ); + must_locked_with_ttl(&mut engine, k, 10, 110); - must_rollback(&engine, k, 10, false); + must_rollback(&mut engine, k, 10, false); // TTL not changed if the pessimistic lock's TTL is larger than that provided in // the prewrite request. 
- must_acquire_pessimistic_lock_with_ttl(&engine, k, k, 20, 20, 100); - must_pessimistic_locked(&engine, k, 20, 20); - must_pessimistic_prewrite_put_with_ttl(&engine, k, v, k, 20, 20, DoPessimisticCheck, 90); - must_locked_with_ttl(&engine, k, 20, 100); + must_acquire_pessimistic_lock_with_ttl(&mut engine, k, k, 20, 20, 100); + must_pessimistic_locked(&mut engine, k, 20, 20); + must_pessimistic_prewrite_put_with_ttl( + &mut engine, + k, + v, + k, + 20, + 20, + DoPessimisticCheck, + 90, + ); + must_locked_with_ttl(&mut engine, k, 20, 100); } #[test] fn test_constraint_check_with_overlapping_txn() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v = b"v1"; - must_prewrite_put(&engine, k, v, k, 10); - must_commit(&engine, k, 10, 11); - must_acquire_pessimistic_lock(&engine, k, k, 5, 12); - must_pessimistic_prewrite_lock(&engine, k, k, 5, 12, DoPessimisticCheck); - must_commit(&engine, k, 5, 15); + must_prewrite_put(&mut engine, k, v, k, 10); + must_commit(&mut engine, k, 10, 11); + must_acquire_pessimistic_lock(&mut engine, k, k, 5, 12); + must_pessimistic_prewrite_lock(&mut engine, k, k, 5, 12, DoPessimisticCheck); + must_commit(&mut engine, k, 5, 15); // Now in write cf: // start_ts = 10, commit_ts = 11, Put("v1") // start_ts = 5, commit_ts = 15, Lock - must_get(&engine, k, 19, v); - try_prewrite_insert(&engine, k, v, k, 20).unwrap_err(); + must_get(&mut engine, k, 19, v); + try_prewrite_insert(&mut engine, k, v, k, 20).unwrap_err(); } #[test] fn test_lock_info_validation() { use kvproto::kvrpcpb::{LockInfo, Op}; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; let v = b"v"; @@ -1022,7 +1053,7 @@ pub(crate) mod tests { expected_lock_info.set_lock_type(Op::Put); // Write an optimistic lock. 
must_prewrite_put_impl( - &engine, + &mut engine, expected_lock_info.get_key(), v, expected_lock_info.get_primary_lock(), @@ -1043,7 +1074,7 @@ pub(crate) mod tests { expected_lock_info.set_lock_for_update_ts(10); // Write a pessimistic lock. must_acquire_pessimistic_lock_impl( - &engine, + &mut engine, expected_lock_info.get_key(), expected_lock_info.get_primary_lock(), expected_lock_info.get_lock_version(), @@ -1058,30 +1089,38 @@ pub(crate) mod tests { } assert_lock_info_eq( - must_prewrite_put_err(&engine, k, v, k, 20), + must_prewrite_put_err(&mut engine, k, v, k, 20), &expected_lock_info, ); assert_lock_info_eq( - must_acquire_pessimistic_lock_err(&engine, k, k, 30, 30), + must_acquire_pessimistic_lock_err(&mut engine, k, k, 30, 30), &expected_lock_info, ); // If the lock is not expired, cleanup will return the lock info. - assert_lock_info_eq(must_cleanup_err(&engine, k, 10, 1), &expected_lock_info); + assert_lock_info_eq(must_cleanup_err(&mut engine, k, 10, 1), &expected_lock_info); expected_lock_info.set_lock_ttl(0); assert_lock_info_eq( - must_pessimistic_prewrite_put_err(&engine, k, v, k, 40, 40, SkipPessimisticCheck), + must_pessimistic_prewrite_put_err( + &mut engine, + k, + v, + k, + 40, + 40, + SkipPessimisticCheck, + ), &expected_lock_info, ); // Delete the lock if *is_optimistic { - must_rollback(&engine, k, expected_lock_info.get_lock_version(), false); + must_rollback(&mut engine, k, expected_lock_info.get_lock_version(), false); } else { pessimistic_rollback::tests::must_success( - &engine, + &mut engine, k, expected_lock_info.get_lock_version(), expected_lock_info.get_lock_for_update_ts(), @@ -1092,20 +1131,20 @@ pub(crate) mod tests { #[test] fn test_non_pessimistic_lock_conflict_with_optimistic_txn() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v = b"v1"; - must_prewrite_put(&engine, k, v, k, 2); - must_locked(&engine, k, 2); - 
must_pessimistic_prewrite_put_err(&engine, k, v, k, 1, 1, SkipPessimisticCheck); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 3, 3, SkipPessimisticCheck); + must_prewrite_put(&mut engine, k, v, k, 2); + must_locked(&mut engine, k, 2); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 1, 1, SkipPessimisticCheck); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 3, 3, SkipPessimisticCheck); } #[test] fn test_non_pessimistic_lock_conflict_with_pessismitic_txn() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // k1 is a row key, k2 is the corresponding index key. let (k1, v1) = (b"k1", b"v1"); @@ -1113,27 +1152,35 @@ pub(crate) mod tests { let (k3, v3) = (b"k3", b"v3"); // Commit k3 at 20. - must_prewrite_put(&engine, k3, v3, k3, 1); - must_commit(&engine, k3, 1, 20); + must_prewrite_put(&mut engine, k3, v3, k3, 1); + must_commit(&mut engine, k3, 1, 20); // Txn-10 acquires pessimistic locks on k1 and k3. - must_acquire_pessimistic_lock(&engine, k1, k1, 10, 10); - must_acquire_pessimistic_lock_err(&engine, k3, k1, 10, 10); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 10, 10); + must_acquire_pessimistic_lock_err(&mut engine, k3, k1, 10, 10); // Update for_update_ts to 20 due to write conflict - must_acquire_pessimistic_lock(&engine, k3, k1, 10, 20); - must_pessimistic_prewrite_put(&engine, k1, v1, k1, 10, 20, DoPessimisticCheck); - must_pessimistic_prewrite_put(&engine, k3, v3, k1, 10, 20, DoPessimisticCheck); + must_acquire_pessimistic_lock(&mut engine, k3, k1, 10, 20); + must_pessimistic_prewrite_put(&mut engine, k1, v1, k1, 10, 20, DoPessimisticCheck); + must_pessimistic_prewrite_put(&mut engine, k3, v3, k1, 10, 20, DoPessimisticCheck); // Write a non-pessimistic lock with for_update_ts 20. 
- must_pessimistic_prewrite_put(&engine, k2, v2, k1, 10, 20, SkipPessimisticCheck); + must_pessimistic_prewrite_put(&mut engine, k2, v2, k1, 10, 20, SkipPessimisticCheck); // Roll back the primary key due to timeout, but the non-pessimistic lock is not // rolled back. - must_rollback(&engine, k1, 10, false); + must_rollback(&mut engine, k1, 10, false); // Txn-15 acquires pessimistic locks on k1. - must_acquire_pessimistic_lock(&engine, k1, k1, 15, 15); - must_pessimistic_prewrite_put(&engine, k1, v1, k1, 15, 15, DoPessimisticCheck); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 15, 15); + must_pessimistic_prewrite_put(&mut engine, k1, v1, k1, 15, 15, DoPessimisticCheck); // There is a non-pessimistic lock conflict here. - match must_pessimistic_prewrite_put_err(&engine, k2, v2, k1, 15, 15, SkipPessimisticCheck) { + match must_pessimistic_prewrite_put_err( + &mut engine, + k2, + v2, + k1, + 15, + 15, + SkipPessimisticCheck, + ) { Error(box ErrorInner::KeyIsLocked(info)) => assert_eq!(info.get_lock_ttl(), 0), e => panic!("unexpected error: {}", e), }; @@ -1141,19 +1188,19 @@ pub(crate) mod tests { #[test] fn test_commit_pessimistic_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; - must_acquire_pessimistic_lock(&engine, k, k, 10, 10); - must_commit_err(&engine, k, 20, 30); - must_commit(&engine, k, 10, 20); - must_seek_write(&engine, k, 30, 10, 20, WriteType::Lock); + must_acquire_pessimistic_lock(&mut engine, k, k, 10, 10); + must_commit_err(&mut engine, k, 20, 30); + must_commit(&mut engine, k, 10, 20); + must_seek_write(&mut engine, k, 30, 10, 20, WriteType::Lock); } #[test] fn test_amend_pessimistic_lock() { fn fail_to_write_pessimistic_lock( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, for_update_ts: impl Into, @@ -1165,35 +1212,35 @@ pub(crate) mod tests { pessimistic_rollback::tests::must_success(engine, key, start_ts, for_update_ts); } - 
let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, mut v) = (b"k", b"v".to_vec()); // Key not exist; should succeed. - fail_to_write_pessimistic_lock(&engine, k, 10, 10); - must_pessimistic_prewrite_put(&engine, k, &v, k, 10, 10, DoPessimisticCheck); - must_commit(&engine, k, 10, 20); - must_get(&engine, k, 20, &v); + fail_to_write_pessimistic_lock(&mut engine, k, 10, 10); + must_pessimistic_prewrite_put(&mut engine, k, &v, k, 10, 10, DoPessimisticCheck); + must_commit(&mut engine, k, 10, 20); + must_get(&mut engine, k, 20, &v); // for_update_ts(30) >= start_ts(30) > commit_ts(20); should succeed. v.push(0); - fail_to_write_pessimistic_lock(&engine, k, 30, 30); - must_pessimistic_prewrite_put(&engine, k, &v, k, 30, 30, DoPessimisticCheck); - must_commit(&engine, k, 30, 40); - must_get(&engine, k, 40, &v); + fail_to_write_pessimistic_lock(&mut engine, k, 30, 30); + must_pessimistic_prewrite_put(&mut engine, k, &v, k, 30, 30, DoPessimisticCheck); + must_commit(&mut engine, k, 30, 40); + must_get(&mut engine, k, 40, &v); // for_update_ts(40) >= commit_ts(40) > start_ts(35); should fail. - fail_to_write_pessimistic_lock(&engine, k, 35, 40); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 35, 40, DoPessimisticCheck); + fail_to_write_pessimistic_lock(&mut engine, k, 35, 40); + must_pessimistic_prewrite_put_err(&mut engine, k, &v, k, 35, 40, DoPessimisticCheck); // KeyIsLocked; should fail. - must_acquire_pessimistic_lock(&engine, k, k, 50, 50); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 60, 60, DoPessimisticCheck); - pessimistic_rollback::tests::must_success(&engine, k, 50, 50); + must_acquire_pessimistic_lock(&mut engine, k, k, 50, 50); + must_pessimistic_prewrite_put_err(&mut engine, k, &v, k, 60, 60, DoPessimisticCheck); + pessimistic_rollback::tests::must_success(&mut engine, k, 50, 50); // The txn has been rolled back; should fail. 
- must_acquire_pessimistic_lock(&engine, k, k, 80, 80); - must_cleanup(&engine, k, 80, TimeStamp::max()); - must_pessimistic_prewrite_put_err(&engine, k, &v, k, 80, 80, DoPessimisticCheck); + must_acquire_pessimistic_lock(&mut engine, k, k, 80, 80); + must_cleanup(&mut engine, k, 80, TimeStamp::max()); + must_pessimistic_prewrite_put_err(&mut engine, k, &v, k, 80, 80, DoPessimisticCheck); } #[test] @@ -1201,12 +1248,13 @@ pub(crate) mod tests { // copy must_prewrite_put_impl, check that the key is written with the correct // secondaries and the right timestamp - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut engine_clone = engine.clone(); let ctx = Context::default(); let cm = ConcurrencyManager::new(42.into()); - let do_prewrite = || { - let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut do_prewrite = || { + let snapshot = engine_clone.snapshot(Default::default()).unwrap(); let mut txn = MvccTxn::new(TimeStamp::new(2), cm.clone()); let mut reader = SnapshotReader::new(TimeStamp::new(2), snapshot, true); let mutation = Mutation::make_put(Key::from_raw(b"key"), b"value".to_vec()); @@ -1228,7 +1276,7 @@ pub(crate) mod tests { .unwrap(); let modifies = txn.into_modifies(); if !modifies.is_empty() { - engine + engine_clone .write(&ctx, WriteData::from_modifies(modifies)) .unwrap(); } @@ -1257,13 +1305,13 @@ pub(crate) mod tests { #[test] fn test_async_pessimistic_prewrite_primary() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let ctx = Context::default(); let cm = ConcurrencyManager::new(42.into()); - must_acquire_pessimistic_lock(&engine, b"key", b"key", 2, 2); + must_acquire_pessimistic_lock(&mut engine, b"key", b"key", 2, 2); - let do_pessimistic_prewrite = || { + let do_pessimistic_prewrite = |engine: &mut RocksEngine| { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut 
txn = MvccTxn::new(TimeStamp::new(2), cm.clone()); let mut reader = SnapshotReader::new(TimeStamp::new(2), snapshot, true); @@ -1293,7 +1341,7 @@ pub(crate) mod tests { min_commit_ts }; - assert_eq!(do_pessimistic_prewrite(), 43.into()); + assert_eq!(do_pessimistic_prewrite(&mut engine), 43.into()); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); @@ -1310,17 +1358,27 @@ pub(crate) mod tests { // A duplicate prewrite request should return the min_commit_ts in the primary // key - assert_eq!(do_pessimistic_prewrite(), 43.into()); + assert_eq!(do_pessimistic_prewrite(&mut engine), 43.into()); } #[test] fn test_async_commit_pushed_min_commit_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); // Simulate that min_commit_ts is pushed forward larger than latest_ts must_acquire_pessimistic_lock_impl( - &engine, b"key", b"key", 2, false, 20000, 2, false, false, 100, false, + &mut engine, + b"key", + b"key", + 2, + false, + 20000, + 2, + false, + false, + 100, + false, ); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1348,125 +1406,125 @@ pub(crate) mod tests { #[test] fn test_txn_timestamp_overlapping() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); // Prepare a committed transaction. 
- must_prewrite_put(&engine, k, v, k, 10); - must_locked(&engine, k, 10); - must_commit(&engine, k, 10, 20); - must_unlocked(&engine, k); - must_written(&engine, k, 10, 20, WriteType::Put); + must_prewrite_put(&mut engine, k, v, k, 10); + must_locked(&mut engine, k, 10); + must_commit(&mut engine, k, 10, 20); + must_unlocked(&mut engine, k); + must_written(&mut engine, k, 10, 20, WriteType::Put); // Optimistic transaction allows the start_ts equals to another transaction's // commit_ts on the same key. - must_prewrite_put(&engine, k, v, k, 20); - must_locked(&engine, k, 20); - must_commit(&engine, k, 20, 30); - must_unlocked(&engine, k); + must_prewrite_put(&mut engine, k, v, k, 20); + must_locked(&mut engine, k, 20); + must_commit(&mut engine, k, 20, 30); + must_unlocked(&mut engine, k); // ...but it can be rejected by overlapped rollback flag. - must_cleanup(&engine, k, 30, 0); - let w = must_written(&engine, k, 20, 30, WriteType::Put); + must_cleanup(&mut engine, k, 30, 0); + let w = must_written(&mut engine, k, 20, 30, WriteType::Put); assert!(w.has_overlapped_rollback); - must_unlocked(&engine, k); - must_prewrite_put_err(&engine, k, v, k, 30); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); + must_prewrite_put_err(&mut engine, k, v, k, 30); + must_unlocked(&mut engine, k); // Prepare a committed transaction. - must_prewrite_put(&engine, k, v, k, 40); - must_locked(&engine, k, 40); - must_commit(&engine, k, 40, 50); - must_unlocked(&engine, k); - must_written(&engine, k, 40, 50, WriteType::Put); + must_prewrite_put(&mut engine, k, v, k, 40); + must_locked(&mut engine, k, 40); + must_commit(&mut engine, k, 40, 50); + must_unlocked(&mut engine, k); + must_written(&mut engine, k, 40, 50, WriteType::Put); // Pessimistic transaction also works in the same case. 
- must_acquire_pessimistic_lock(&engine, k, k, 50, 50); - must_pessimistic_locked(&engine, k, 50, 50); - must_pessimistic_prewrite_put(&engine, k, v, k, 50, 50, DoPessimisticCheck); - must_commit(&engine, k, 50, 60); - must_unlocked(&engine, k); - must_written(&engine, k, 50, 60, WriteType::Put); + must_acquire_pessimistic_lock(&mut engine, k, k, 50, 50); + must_pessimistic_locked(&mut engine, k, 50, 50); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 50, 50, DoPessimisticCheck); + must_commit(&mut engine, k, 50, 60); + must_unlocked(&mut engine, k); + must_written(&mut engine, k, 50, 60, WriteType::Put); // .. and it can also be rejected by overlapped rollback flag. - must_cleanup(&engine, k, 60, 0); - let w = must_written(&engine, k, 50, 60, WriteType::Put); + must_cleanup(&mut engine, k, 60, 0); + let w = must_written(&mut engine, k, 50, 60, WriteType::Put); assert!(w.has_overlapped_rollback); - must_unlocked(&engine, k); - must_acquire_pessimistic_lock_err(&engine, k, k, 60, 60); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); + must_acquire_pessimistic_lock_err(&mut engine, k, k, 60, 60); + must_unlocked(&mut engine, k); } #[test] fn test_rollback_while_other_transaction_running() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 10, 0); - must_cleanup(&engine, k, 15, 0); - must_commit(&engine, k, 10, 15); - let w = must_written(&engine, k, 10, 15, WriteType::Put); + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 10, 0); + must_cleanup(&mut engine, k, 15, 0); + must_commit(&mut engine, k, 10, 15); + let w = must_written(&mut engine, k, 10, 15, WriteType::Put); assert!(w.has_overlapped_rollback); // GC fence shouldn't be set in this case. 
assert!(w.gc_fence.is_none()); - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 20, 0); - check_txn_status::tests::must_success(&engine, k, 25, 0, 0, true, false, false, |s| { + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 20, 0); + check_txn_status::tests::must_success(&mut engine, k, 25, 0, 0, true, false, false, |s| { s == TxnStatus::LockNotExist }); - must_commit(&engine, k, 20, 25); - let w = must_written(&engine, k, 20, 25, WriteType::Put); + must_commit(&mut engine, k, 20, 25); + let w = must_written(&mut engine, k, 20, 25, WriteType::Put); assert!(w.has_overlapped_rollback); assert!(w.gc_fence.is_none()); - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 30, 0); + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 30, 0); check_secondary_locks::tests::must_success( - &engine, + &mut engine, k, 35, SecondaryLocksStatus::RolledBack, ); - must_commit(&engine, k, 30, 35); - let w = must_written(&engine, k, 30, 35, WriteType::Put); + must_commit(&mut engine, k, 30, 35); + let w = must_written(&mut engine, k, 30, 35, WriteType::Put); assert!(w.has_overlapped_rollback); assert!(w.gc_fence.is_none()); // Do not commit with overlapped_rollback if the rollback ts doesn't equal to // commit_ts. - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 40, 0); - must_cleanup(&engine, k, 44, 0); - must_commit(&engine, k, 40, 45); - let w = must_written(&engine, k, 40, 45, WriteType::Put); + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 40, 0); + must_cleanup(&mut engine, k, 44, 0); + must_commit(&mut engine, k, 40, 45); + let w = must_written(&mut engine, k, 40, 45, WriteType::Put); assert!(!w.has_overlapped_rollback); // Do not put rollback mark to the lock if the lock is not async commit or if // lock.ts is before start_ts or min_commit_ts. 
- must_prewrite_put(&engine, k, v, k, 50); - must_cleanup(&engine, k, 55, 0); - let l = must_locked(&engine, k, 50); + must_prewrite_put(&mut engine, k, v, k, 50); + must_cleanup(&mut engine, k, 55, 0); + let l = must_locked(&mut engine, k, 50); assert!(l.rollback_ts.is_empty()); - must_commit(&engine, k, 50, 56); + must_commit(&mut engine, k, 50, 56); - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 60, 0); - must_cleanup(&engine, k, 59, 0); - let l = must_locked(&engine, k, 60); + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 60, 0); + must_cleanup(&mut engine, k, 59, 0); + let l = must_locked(&mut engine, k, 60); assert!(l.rollback_ts.is_empty()); - must_commit(&engine, k, 60, 65); + must_commit(&mut engine, k, 60, 65); - must_prewrite_put_async_commit(&engine, k, v, k, &Some(vec![]), 70, 75); - must_cleanup(&engine, k, 74, 0); - must_cleanup(&engine, k, 75, 0); - let l = must_locked(&engine, k, 70); + must_prewrite_put_async_commit(&mut engine, k, v, k, &Some(vec![]), 70, 75); + must_cleanup(&mut engine, k, 74, 0); + must_cleanup(&mut engine, k, 75, 0); + let l = must_locked(&mut engine, k, 70); assert_eq!(l.min_commit_ts, 75.into()); assert_eq!(l.rollback_ts, vec![75.into()]); } #[test] fn test_gc_fence() { - let rollback = |engine: &RocksEngine, k: &[u8], start_ts: u64| { + let rollback = |engine: &mut RocksEngine, k: &[u8], start_ts: u64| { must_cleanup(engine, k, start_ts, 0); }; - let check_status = |engine: &RocksEngine, k: &[u8], start_ts: u64| { + let check_status = |engine: &mut RocksEngine, k: &[u8], start_ts: u64| { check_txn_status::tests::must_success( engine, k, @@ -1479,7 +1537,7 @@ pub(crate) mod tests { |_| true, ); }; - let check_secondary = |engine: &RocksEngine, k: &[u8], start_ts: u64| { + let check_secondary = |engine: &mut RocksEngine, k: &[u8], start_ts: u64| { check_secondary_locks::tests::must_success( engine, k, @@ -1489,115 +1547,115 @@ pub(crate) mod tests { }; for &rollback in &[rollback, 
check_status, check_secondary] { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Get gc fence without any newer versions. - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 101); - must_commit(&engine, b"k1", 101, 102); - rollback(&engine, b"k1", 102); - must_get_overlapped_rollback(&engine, b"k1", 102, 101, WriteType::Put, Some(0)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 101); + must_commit(&mut engine, b"k1", 101, 102); + rollback(&mut engine, b"k1", 102); + must_get_overlapped_rollback(&mut engine, b"k1", 102, 101, WriteType::Put, Some(0)); // Get gc fence with a newer put. - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 103); - must_commit(&engine, b"k1", 103, 104); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 105); - must_commit(&engine, b"k1", 105, 106); - rollback(&engine, b"k1", 104); - must_get_overlapped_rollback(&engine, b"k1", 104, 103, WriteType::Put, Some(106)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 103); + must_commit(&mut engine, b"k1", 103, 104); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 105); + must_commit(&mut engine, b"k1", 105, 106); + rollback(&mut engine, b"k1", 104); + must_get_overlapped_rollback(&mut engine, b"k1", 104, 103, WriteType::Put, Some(106)); // Get gc fence with a newer delete. 
- must_prewrite_put(&engine, b"k1", b"v1", b"k1", 107); - must_commit(&engine, b"k1", 107, 108); - must_prewrite_delete(&engine, b"k1", b"k1", 109); - must_commit(&engine, b"k1", 109, 110); - rollback(&engine, b"k1", 108); - must_get_overlapped_rollback(&engine, b"k1", 108, 107, WriteType::Put, Some(110)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 107); + must_commit(&mut engine, b"k1", 107, 108); + must_prewrite_delete(&mut engine, b"k1", b"k1", 109); + must_commit(&mut engine, b"k1", 109, 110); + rollback(&mut engine, b"k1", 108); + must_get_overlapped_rollback(&mut engine, b"k1", 108, 107, WriteType::Put, Some(110)); // Get gc fence with a newer rollback and lock. - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 111); - must_commit(&engine, b"k1", 111, 112); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 113); - must_rollback(&engine, b"k1", 113, false); - must_prewrite_lock(&engine, b"k1", b"k1", 115); - must_commit(&engine, b"k1", 115, 116); - rollback(&engine, b"k1", 112); - must_get_overlapped_rollback(&engine, b"k1", 112, 111, WriteType::Put, Some(0)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 111); + must_commit(&mut engine, b"k1", 111, 112); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 113); + must_rollback(&mut engine, b"k1", 113, false); + must_prewrite_lock(&mut engine, b"k1", b"k1", 115); + must_commit(&mut engine, b"k1", 115, 116); + rollback(&mut engine, b"k1", 112); + must_get_overlapped_rollback(&mut engine, b"k1", 112, 111, WriteType::Put, Some(0)); // Get gc fence with a newer put after some rollbacks and locks. 
- must_prewrite_put(&engine, b"k1", b"v1", b"k1", 121); - must_commit(&engine, b"k1", 121, 122); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 123); - must_rollback(&engine, b"k1", 123, false); - must_prewrite_lock(&engine, b"k1", b"k1", 125); - must_commit(&engine, b"k1", 125, 126); - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 127); - must_commit(&engine, b"k1", 127, 128); - rollback(&engine, b"k1", 122); - must_get_overlapped_rollback(&engine, b"k1", 122, 121, WriteType::Put, Some(128)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 121); + must_commit(&mut engine, b"k1", 121, 122); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 123); + must_rollback(&mut engine, b"k1", 123, false); + must_prewrite_lock(&mut engine, b"k1", b"k1", 125); + must_commit(&mut engine, b"k1", 125, 126); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 127); + must_commit(&mut engine, b"k1", 127, 128); + rollback(&mut engine, b"k1", 122); + must_get_overlapped_rollback(&mut engine, b"k1", 122, 121, WriteType::Put, Some(128)); // A key's gc fence won't be another MVCC key. 
- must_prewrite_put(&engine, b"k1", b"v1", b"k1", 131); - must_commit(&engine, b"k1", 131, 132); - must_prewrite_put(&engine, b"k0", b"v1", b"k0", 133); - must_commit(&engine, b"k0", 133, 134); - must_prewrite_put(&engine, b"k2", b"v1", b"k2", 133); - must_commit(&engine, b"k2", 133, 134); - rollback(&engine, b"k1", 132); - must_get_overlapped_rollback(&engine, b"k1", 132, 131, WriteType::Put, Some(0)); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 131); + must_commit(&mut engine, b"k1", 131, 132); + must_prewrite_put(&mut engine, b"k0", b"v1", b"k0", 133); + must_commit(&mut engine, b"k0", 133, 134); + must_prewrite_put(&mut engine, b"k2", b"v1", b"k2", 133); + must_commit(&mut engine, b"k2", 133, 134); + rollback(&mut engine, b"k1", 132); + must_get_overlapped_rollback(&mut engine, b"k1", 132, 131, WriteType::Put, Some(0)); } } #[test] fn test_overlapped_ts_commit_before_rollback() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, v1) = (b"key1", b"v1"); let (k2, v2) = (b"key2", b"v2"); let key2 = k2.to_vec(); let secondaries = Some(vec![key2]); // T1, start_ts = 10, commit_ts = 20; write k1, k2 - must_prewrite_put_async_commit(&engine, k1, v1, k1, &secondaries, 10, 0); - must_prewrite_put_async_commit(&engine, k2, v2, k1, &secondaries, 10, 0); - must_commit(&engine, k1, 10, 20); - must_commit(&engine, k2, 10, 20); + must_prewrite_put_async_commit(&mut engine, k1, v1, k1, &secondaries, 10, 0); + must_prewrite_put_async_commit(&mut engine, k2, v2, k1, &secondaries, 10, 0); + must_commit(&mut engine, k1, 10, 20); + must_commit(&mut engine, k2, 10, 20); - let w = must_written(&engine, k1, 10, 20, WriteType::Put); + let w = must_written(&mut engine, k1, 10, 20, WriteType::Put); assert!(!w.has_overlapped_rollback); // T2, start_ts = 20 - must_acquire_pessimistic_lock(&engine, k2, k2, 20, 25); - must_pessimistic_prewrite_put(&engine, k2, v2, k2, 20, 25, DoPessimisticCheck); + 
must_acquire_pessimistic_lock(&mut engine, k2, k2, 20, 25); + must_pessimistic_prewrite_put(&mut engine, k2, v2, k2, 20, 25, DoPessimisticCheck); - must_cleanup(&engine, k2, 20, 0); + must_cleanup(&mut engine, k2, 20, 0); - let w = must_written(&engine, k2, 10, 20, WriteType::Put); + let w = must_written(&mut engine, k2, 10, 20, WriteType::Put); assert!(w.has_overlapped_rollback); - must_get(&engine, k2, 30, v2); - must_acquire_pessimistic_lock_err(&engine, k2, k2, 20, 25); + must_get(&mut engine, k2, 30, v2); + must_acquire_pessimistic_lock_err(&mut engine, k2, k2, 20, 25); } #[test] fn test_overlapped_ts_prewrite_before_rollback() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, v1) = (b"key1", b"v1"); let (k2, v2) = (b"key2", b"v2"); let key2 = k2.to_vec(); let secondaries = Some(vec![key2]); // T1, start_ts = 10 - must_prewrite_put_async_commit(&engine, k1, v1, k1, &secondaries, 10, 0); - must_prewrite_put_async_commit(&engine, k2, v2, k1, &secondaries, 10, 0); + must_prewrite_put_async_commit(&mut engine, k1, v1, k1, &secondaries, 10, 0); + must_prewrite_put_async_commit(&mut engine, k2, v2, k1, &secondaries, 10, 0); // T2, start_ts = 20 - must_prewrite_put_err(&engine, k2, v2, k2, 20); - must_cleanup(&engine, k2, 20, 0); + must_prewrite_put_err(&mut engine, k2, v2, k2, 20); + must_cleanup(&mut engine, k2, 20, 0); // commit T1 - must_commit(&engine, k1, 10, 20); - must_commit(&engine, k2, 10, 20); + must_commit(&mut engine, k1, 10, 20); + must_commit(&mut engine, k2, 10, 20); - let w = must_written(&engine, k2, 10, 20, WriteType::Put); + let w = must_written(&mut engine, k2, 10, 20, WriteType::Put); assert!(w.has_overlapped_rollback); - must_prewrite_put_err(&engine, k2, v2, k2, 20); + must_prewrite_put_err(&mut engine, k2, v2, k2, 20); } } diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 59dd5e8f13d..6d86203e8f2 100644 --- 
a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -257,7 +257,7 @@ mod tests { fn test_raw_mvcc_snapshot() { // Use `Engine` to be independent to `Storage`. // Do not set "api version" to use `Engine` as a raw RocksDB. - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (tx, rx) = channel(); let ctx = Context::default(); diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 699002f0126..7c2f41d3e1b 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -293,7 +293,7 @@ pub mod tests { }; pub fn must_succeed_impl( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -337,7 +337,7 @@ pub mod tests { } pub fn must_succeed( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -347,7 +347,7 @@ pub mod tests { } pub fn must_succeed_return_value( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -370,7 +370,7 @@ pub mod tests { } pub fn must_succeed_with_ttl( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -396,7 +396,7 @@ pub mod tests { } pub fn must_succeed_for_large_txn( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -421,7 +421,7 @@ pub mod tests { } pub fn must_err( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -442,7 +442,7 @@ pub mod tests { } pub fn must_err_return_value( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -464,7 +464,7 @@ pub mod tests { } fn must_err_impl( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], start_ts: impl Into, @@ -499,7 +499,7 @@ pub mod tests { } pub fn must_pessimistic_locked( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, for_update_ts: impl Into, @@ -514,7 +514,7 @@ pub mod 
tests { #[test] fn test_pessimistic_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v = b"v1"; @@ -523,221 +523,221 @@ pub mod tests { // important, we should consider whether they are better to be fixed. // Normal - must_succeed(&engine, k, k, 1, 1); - must_pessimistic_locked(&engine, k, 1, 1); - must_pessimistic_prewrite_put(&engine, k, v, k, 1, 1, DoPessimisticCheck); - must_locked(&engine, k, 1); - must_commit(&engine, k, 1, 2); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 1, 1); + must_pessimistic_locked(&mut engine, k, 1, 1); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 1, 1, DoPessimisticCheck); + must_locked(&mut engine, k, 1); + must_commit(&mut engine, k, 1, 2); + must_unlocked(&mut engine, k); // Lock conflict - must_prewrite_put(&engine, k, v, k, 3); - must_err(&engine, k, k, 4, 4); - must_cleanup(&engine, k, 3, 0); - must_unlocked(&engine, k); - must_succeed(&engine, k, k, 5, 5); - must_prewrite_lock_err(&engine, k, k, 6); - must_err(&engine, k, k, 6, 6); - must_cleanup(&engine, k, 5, 0); - must_unlocked(&engine, k); + must_prewrite_put(&mut engine, k, v, k, 3); + must_err(&mut engine, k, k, 4, 4); + must_cleanup(&mut engine, k, 3, 0); + must_unlocked(&mut engine, k); + must_succeed(&mut engine, k, k, 5, 5); + must_prewrite_lock_err(&mut engine, k, k, 6); + must_err(&mut engine, k, k, 6, 6); + must_cleanup(&mut engine, k, 5, 0); + must_unlocked(&mut engine, k); // Data conflict - must_prewrite_put(&engine, k, v, k, 7); - must_commit(&engine, k, 7, 9); - must_unlocked(&engine, k); - must_prewrite_lock_err(&engine, k, k, 8); - must_err(&engine, k, k, 8, 8); - must_succeed(&engine, k, k, 8, 9); - must_pessimistic_prewrite_put(&engine, k, v, k, 8, 8, DoPessimisticCheck); - must_commit(&engine, k, 8, 10); - must_unlocked(&engine, k); + must_prewrite_put(&mut engine, k, v, k, 7); + must_commit(&mut engine, k, 7, 9); + 
must_unlocked(&mut engine, k); + must_prewrite_lock_err(&mut engine, k, k, 8); + must_err(&mut engine, k, k, 8, 8); + must_succeed(&mut engine, k, k, 8, 9); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 8, 8, DoPessimisticCheck); + must_commit(&mut engine, k, 8, 10); + must_unlocked(&mut engine, k); // Rollback - must_succeed(&engine, k, k, 11, 11); - must_pessimistic_locked(&engine, k, 11, 11); - must_cleanup(&engine, k, 11, 0); - must_err(&engine, k, k, 11, 11); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 11, 11, DoPessimisticCheck); - must_prewrite_lock_err(&engine, k, k, 11); - must_unlocked(&engine, k); - - must_succeed(&engine, k, k, 12, 12); - must_pessimistic_prewrite_put(&engine, k, v, k, 12, 12, DoPessimisticCheck); - must_locked(&engine, k, 12); - must_cleanup(&engine, k, 12, 0); - must_err(&engine, k, k, 12, 12); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 12, 12, DoPessimisticCheck); - must_prewrite_lock_err(&engine, k, k, 12); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 11, 11); + must_pessimistic_locked(&mut engine, k, 11, 11); + must_cleanup(&mut engine, k, 11, 0); + must_err(&mut engine, k, k, 11, 11); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 11, 11, DoPessimisticCheck); + must_prewrite_lock_err(&mut engine, k, k, 11); + must_unlocked(&mut engine, k); + + must_succeed(&mut engine, k, k, 12, 12); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 12, 12, DoPessimisticCheck); + must_locked(&mut engine, k, 12); + must_cleanup(&mut engine, k, 12, 0); + must_err(&mut engine, k, k, 12, 12); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 12, 12, DoPessimisticCheck); + must_prewrite_lock_err(&mut engine, k, k, 12); + must_unlocked(&mut engine, k); // Duplicated - must_succeed(&engine, k, k, 13, 13); - must_pessimistic_locked(&engine, k, 13, 13); - must_succeed(&engine, k, k, 13, 13); - must_pessimistic_locked(&engine, k, 13, 13); - must_pessimistic_prewrite_put(&engine, k, v, k, 
13, 13, DoPessimisticCheck); - must_locked(&engine, k, 13); - must_pessimistic_prewrite_put(&engine, k, v, k, 13, 13, DoPessimisticCheck); - must_locked(&engine, k, 13); - must_commit(&engine, k, 13, 14); - must_unlocked(&engine, k); - must_commit(&engine, k, 13, 14); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 13, 13); + must_pessimistic_locked(&mut engine, k, 13, 13); + must_succeed(&mut engine, k, k, 13, 13); + must_pessimistic_locked(&mut engine, k, 13, 13); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 13, 13, DoPessimisticCheck); + must_locked(&mut engine, k, 13); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 13, 13, DoPessimisticCheck); + must_locked(&mut engine, k, 13); + must_commit(&mut engine, k, 13, 14); + must_unlocked(&mut engine, k); + must_commit(&mut engine, k, 13, 14); + must_unlocked(&mut engine, k); // Pessimistic lock doesn't block reads. - must_succeed(&engine, k, k, 15, 15); - must_pessimistic_locked(&engine, k, 15, 15); - must_get(&engine, k, 16, v); - must_pessimistic_prewrite_delete(&engine, k, k, 15, 15, DoPessimisticCheck); - must_get_err(&engine, k, 16); - must_commit(&engine, k, 15, 17); + must_succeed(&mut engine, k, k, 15, 15); + must_pessimistic_locked(&mut engine, k, 15, 15); + must_get(&mut engine, k, 16, v); + must_pessimistic_prewrite_delete(&mut engine, k, k, 15, 15, DoPessimisticCheck); + must_get_err(&mut engine, k, 16); + must_commit(&mut engine, k, 15, 17); // Rollback - must_succeed(&engine, k, k, 18, 18); - must_rollback(&engine, k, 18, false); - must_unlocked(&engine, k); - must_prewrite_put(&engine, k, v, k, 19); - must_commit(&engine, k, 19, 20); - must_err(&engine, k, k, 18, 21); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 18, 18); + must_rollback(&mut engine, k, 18, false); + must_unlocked(&mut engine, k); + must_prewrite_put(&mut engine, k, v, k, 19); + must_commit(&mut engine, k, 19, 20); + must_err(&mut engine, k, k, 18, 21); + must_unlocked(&mut engine, k); // 
LockTypeNotMatch - must_prewrite_put(&engine, k, v, k, 23); - must_locked(&engine, k, 23); - must_err(&engine, k, k, 23, 23); - must_cleanup(&engine, k, 23, 0); - must_succeed(&engine, k, k, 24, 24); - must_pessimistic_locked(&engine, k, 24, 24); - must_prewrite_put_err(&engine, k, v, k, 24); - must_rollback(&engine, k, 24, false); + must_prewrite_put(&mut engine, k, v, k, 23); + must_locked(&mut engine, k, 23); + must_err(&mut engine, k, k, 23, 23); + must_cleanup(&mut engine, k, 23, 0); + must_succeed(&mut engine, k, k, 24, 24); + must_pessimistic_locked(&mut engine, k, 24, 24); + must_prewrite_put_err(&mut engine, k, v, k, 24); + must_rollback(&mut engine, k, 24, false); // Acquire lock on a prewritten key should fail. - must_succeed(&engine, k, k, 26, 26); - must_pessimistic_locked(&engine, k, 26, 26); - must_pessimistic_prewrite_delete(&engine, k, k, 26, 26, DoPessimisticCheck); - must_locked(&engine, k, 26); - must_err(&engine, k, k, 26, 26); - must_locked(&engine, k, 26); + must_succeed(&mut engine, k, k, 26, 26); + must_pessimistic_locked(&mut engine, k, 26, 26); + must_pessimistic_prewrite_delete(&mut engine, k, k, 26, 26, DoPessimisticCheck); + must_locked(&mut engine, k, 26); + must_err(&mut engine, k, k, 26, 26); + must_locked(&mut engine, k, 26); // Acquire lock on a committed key should fail. - must_commit(&engine, k, 26, 27); - must_unlocked(&engine, k); - must_get_none(&engine, k, 28); - must_err(&engine, k, k, 26, 26); - must_unlocked(&engine, k); - must_get_none(&engine, k, 28); + must_commit(&mut engine, k, 26, 27); + must_unlocked(&mut engine, k); + must_get_none(&mut engine, k, 28); + must_err(&mut engine, k, k, 26, 26); + must_unlocked(&mut engine, k); + must_get_none(&mut engine, k, 28); // Pessimistic prewrite on a committed key should fail. 
- must_pessimistic_prewrite_put_err(&engine, k, v, k, 26, 26, DoPessimisticCheck); - must_unlocked(&engine, k); - must_get_none(&engine, k, 28); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 26, 26, DoPessimisticCheck); + must_unlocked(&mut engine, k); + must_get_none(&mut engine, k, 28); // Currently we cannot avoid this. - must_succeed(&engine, k, k, 26, 29); - pessimistic_rollback::tests::must_success(&engine, k, 26, 29); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 26, 29); + pessimistic_rollback::tests::must_success(&mut engine, k, 26, 29); + must_unlocked(&mut engine, k); // Non pessimistic key in pessimistic transaction. - must_pessimistic_prewrite_put(&engine, k, v, k, 30, 30, SkipPessimisticCheck); - must_locked(&engine, k, 30); - must_commit(&engine, k, 30, 31); - must_unlocked(&engine, k); - must_get_commit_ts(&engine, k, 30, 31); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 30, 30, SkipPessimisticCheck); + must_locked(&mut engine, k, 30); + must_commit(&mut engine, k, 30, 31); + must_unlocked(&mut engine, k); + must_get_commit_ts(&mut engine, k, 30, 31); // Rollback collapsed. - must_rollback(&engine, k, 32, false); - must_rollback(&engine, k, 33, false); - must_err(&engine, k, k, 32, 32); + must_rollback(&mut engine, k, 32, false); + must_rollback(&mut engine, k, 33, false); + must_err(&mut engine, k, k, 32, 32); // Currently we cannot avoid this. - must_succeed(&engine, k, k, 32, 34); - pessimistic_rollback::tests::must_success(&engine, k, 32, 34); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 32, 34); + pessimistic_rollback::tests::must_success(&mut engine, k, 32, 34); + must_unlocked(&mut engine, k); // Acquire lock when there is lock with different for_update_ts. 
- must_succeed(&engine, k, k, 35, 36); - must_pessimistic_locked(&engine, k, 35, 36); - must_succeed(&engine, k, k, 35, 35); - must_pessimistic_locked(&engine, k, 35, 36); - must_succeed(&engine, k, k, 35, 37); - must_pessimistic_locked(&engine, k, 35, 37); + must_succeed(&mut engine, k, k, 35, 36); + must_pessimistic_locked(&mut engine, k, 35, 36); + must_succeed(&mut engine, k, k, 35, 35); + must_pessimistic_locked(&mut engine, k, 35, 36); + must_succeed(&mut engine, k, k, 35, 37); + must_pessimistic_locked(&mut engine, k, 35, 37); // Cannot prewrite when there is another transaction's pessimistic lock. - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 36, DoPessimisticCheck); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, DoPessimisticCheck); - must_pessimistic_locked(&engine, k, 35, 37); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 36, 36, DoPessimisticCheck); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 36, 38, DoPessimisticCheck); + must_pessimistic_locked(&mut engine, k, 35, 37); // Cannot prewrite when there is another transaction's non-pessimistic lock. - must_pessimistic_prewrite_put(&engine, k, v, k, 35, 37, DoPessimisticCheck); - must_locked(&engine, k, 35); - must_pessimistic_prewrite_put_err(&engine, k, v, k, 36, 38, DoPessimisticCheck); - must_locked(&engine, k, 35); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 35, 37, DoPessimisticCheck); + must_locked(&mut engine, k, 35); + must_pessimistic_prewrite_put_err(&mut engine, k, v, k, 36, 38, DoPessimisticCheck); + must_locked(&mut engine, k, 35); // Commit pessimistic transaction's key but with smaller commit_ts than // for_update_ts. Currently not checked, so in this case it will // actually be successfully committed. 
- must_commit(&engine, k, 35, 36); - must_unlocked(&engine, k); - must_get_commit_ts(&engine, k, 35, 36); + must_commit(&mut engine, k, 35, 36); + must_unlocked(&mut engine, k); + must_get_commit_ts(&mut engine, k, 35, 36); // Prewrite meets pessimistic lock on a non-pessimistic key. // Currently not checked, so prewrite will success. - must_succeed(&engine, k, k, 40, 40); - must_pessimistic_locked(&engine, k, 40, 40); - must_pessimistic_prewrite_put(&engine, k, v, k, 40, 40, SkipPessimisticCheck); - must_locked(&engine, k, 40); - must_commit(&engine, k, 40, 41); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 40, 40); + must_pessimistic_locked(&mut engine, k, 40, 40); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 40, 40, SkipPessimisticCheck); + must_locked(&mut engine, k, 40); + must_commit(&mut engine, k, 40, 41); + must_unlocked(&mut engine, k); // Prewrite with different for_update_ts. // Currently not checked. - must_succeed(&engine, k, k, 42, 45); - must_pessimistic_locked(&engine, k, 42, 45); - must_pessimistic_prewrite_put(&engine, k, v, k, 42, 43, DoPessimisticCheck); - must_locked(&engine, k, 42); - must_commit(&engine, k, 42, 45); - must_unlocked(&engine, k); - - must_succeed(&engine, k, k, 46, 47); - must_pessimistic_locked(&engine, k, 46, 47); - must_pessimistic_prewrite_put(&engine, k, v, k, 46, 48, DoPessimisticCheck); - must_locked(&engine, k, 46); - must_commit(&engine, k, 46, 50); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, k, 42, 45); + must_pessimistic_locked(&mut engine, k, 42, 45); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 42, 43, DoPessimisticCheck); + must_locked(&mut engine, k, 42); + must_commit(&mut engine, k, 42, 45); + must_unlocked(&mut engine, k); + + must_succeed(&mut engine, k, k, 46, 47); + must_pessimistic_locked(&mut engine, k, 46, 47); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 46, 48, DoPessimisticCheck); + must_locked(&mut engine, k, 46); + must_commit(&mut engine, 
k, 46, 50); + must_unlocked(&mut engine, k); // Prewrite on non-pessimistic key meets write with larger commit_ts than // current for_update_ts (non-pessimistic data conflict). // Normally non-pessimistic keys in pessimistic transactions are used when we // are sure that there won't be conflicts. So this case is also not checked, and // prewrite will succeeed. - must_pessimistic_prewrite_put(&engine, k, v, k, 47, 48, SkipPessimisticCheck); - must_locked(&engine, k, 47); - must_cleanup(&engine, k, 47, 0); - must_unlocked(&engine, k); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 47, 48, SkipPessimisticCheck); + must_locked(&mut engine, k, 47); + must_cleanup(&mut engine, k, 47, 0); + must_unlocked(&mut engine, k); // The rollback of the primary key in a pessimistic transaction should be // protected from being collapsed. - must_succeed(&engine, k, k, 49, 60); - must_pessimistic_prewrite_put(&engine, k, v, k, 49, 60, DoPessimisticCheck); - must_locked(&engine, k, 49); - must_cleanup(&engine, k, 49, 0); - must_get_rollback_protected(&engine, k, 49, true); - must_prewrite_put(&engine, k, v, k, 51); - must_rollback(&engine, k, 51, false); - must_err(&engine, k, k, 49, 60); + must_succeed(&mut engine, k, k, 49, 60); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 49, 60, DoPessimisticCheck); + must_locked(&mut engine, k, 49); + must_cleanup(&mut engine, k, 49, 0); + must_get_rollback_protected(&mut engine, k, 49, true); + must_prewrite_put(&mut engine, k, v, k, 51); + must_rollback(&mut engine, k, 51, false); + must_err(&mut engine, k, k, 49, 60); // Overlapped rollback record will be written when the current start_ts equals // to another write records' commit ts. Now there is a commit record with // commit_ts = 50. 
- must_succeed(&engine, k, k, 50, 61); - must_pessimistic_prewrite_put(&engine, k, v, k, 50, 61, DoPessimisticCheck); - must_locked(&engine, k, 50); - must_cleanup(&engine, k, 50, 0); - must_get_overlapped_rollback(&engine, k, 50, 46, WriteType::Put, Some(0)); + must_succeed(&mut engine, k, k, 50, 61); + must_pessimistic_prewrite_put(&mut engine, k, v, k, 50, 61, DoPessimisticCheck); + must_locked(&mut engine, k, 50); + must_cleanup(&mut engine, k, 50, 0); + must_get_overlapped_rollback(&mut engine, k, 50, 46, WriteType::Put, Some(0)); // start_ts and commit_ts interlacing for start_ts in &[140, 150, 160] { let for_update_ts = start_ts + 48; let commit_ts = start_ts + 50; - must_succeed(&engine, k, k, *start_ts, for_update_ts); + must_succeed(&mut engine, k, k, *start_ts, for_update_ts); must_pessimistic_prewrite_put( - &engine, + &mut engine, k, v, k, @@ -745,105 +745,108 @@ pub mod tests { for_update_ts, DoPessimisticCheck, ); - must_commit(&engine, k, *start_ts, commit_ts); - must_get(&engine, k, commit_ts + 1, v); + must_commit(&mut engine, k, *start_ts, commit_ts); + must_get(&mut engine, k, commit_ts + 1, v); } - must_rollback(&engine, k, 170, false); + must_rollback(&mut engine, k, 170, false); // Now the data should be like: (start_ts -> commit_ts) // 140 -> 190 // 150 -> 200 // 160 -> 210 // 170 -> rollback - must_get_commit_ts(&engine, k, 140, 190); - must_get_commit_ts(&engine, k, 150, 200); - must_get_commit_ts(&engine, k, 160, 210); - must_get_rollback_ts(&engine, k, 170); + must_get_commit_ts(&mut engine, k, 140, 190); + must_get_commit_ts(&mut engine, k, 150, 200); + must_get_commit_ts(&mut engine, k, 160, 210); + must_get_rollback_ts(&mut engine, k, 170); } #[test] fn test_pessimistic_lock_return_value() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); assert_eq!( - must_succeed_return_value(&engine, k, k, 10, 10, false), + 
must_succeed_return_value(&mut engine, k, k, 10, 10, false), None ); - must_pessimistic_locked(&engine, k, 10, 10); - pessimistic_rollback::tests::must_success(&engine, k, 10, 10); + must_pessimistic_locked(&mut engine, k, 10, 10); + pessimistic_rollback::tests::must_success(&mut engine, k, 10, 10); // Put - must_prewrite_put(&engine, k, v, k, 10); + must_prewrite_put(&mut engine, k, v, k, 10); // KeyIsLocked - match must_err_return_value(&engine, k, k, 20, 20, false) { + match must_err_return_value(&mut engine, k, k, 20, 20, false) { MvccError(box ErrorInner::KeyIsLocked(_)) => (), e => panic!("unexpected error: {}", e), }; - must_commit(&engine, k, 10, 20); + must_commit(&mut engine, k, 10, 20); // WriteConflict - match must_err_return_value(&engine, k, k, 15, 15, false) { + match must_err_return_value(&mut engine, k, k, 15, 15, false) { MvccError(box ErrorInner::WriteConflict { .. }) => (), e => panic!("unexpected error: {}", e), }; assert_eq!( - must_succeed_return_value(&engine, k, k, 25, 25, false), + must_succeed_return_value(&mut engine, k, k, 25, 25, false), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 25, 25); - pessimistic_rollback::tests::must_success(&engine, k, 25, 25); + must_pessimistic_locked(&mut engine, k, 25, 25); + pessimistic_rollback::tests::must_success(&mut engine, k, 25, 25); // Skip Write::Lock - must_prewrite_lock(&engine, k, k, 30); - must_commit(&engine, k, 30, 40); + must_prewrite_lock(&mut engine, k, k, 30); + must_commit(&mut engine, k, 30, 40); assert_eq!( - must_succeed_return_value(&engine, k, k, 45, 45, false), + must_succeed_return_value(&mut engine, k, k, 45, 45, false), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 45, 45); - pessimistic_rollback::tests::must_success(&engine, k, 45, 45); + must_pessimistic_locked(&mut engine, k, 45, 45); + pessimistic_rollback::tests::must_success(&mut engine, k, 45, 45); // Skip Write::Rollback - must_rollback(&engine, k, 50, false); + must_rollback(&mut engine, k, 
50, false); assert_eq!( - must_succeed_return_value(&engine, k, k, 55, 55, false), + must_succeed_return_value(&mut engine, k, k, 55, 55, false), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 55, 55); - pessimistic_rollback::tests::must_success(&engine, k, 55, 55); + must_pessimistic_locked(&mut engine, k, 55, 55); + pessimistic_rollback::tests::must_success(&mut engine, k, 55, 55); // Delete - must_prewrite_delete(&engine, k, k, 60); - must_commit(&engine, k, 60, 70); + must_prewrite_delete(&mut engine, k, k, 60); + must_commit(&mut engine, k, 60, 70); assert_eq!( - must_succeed_return_value(&engine, k, k, 75, 75, false), + must_succeed_return_value(&mut engine, k, k, 75, 75, false), None ); // Duplicated command assert_eq!( - must_succeed_return_value(&engine, k, k, 75, 75, false), + must_succeed_return_value(&mut engine, k, k, 75, 75, false), None ); assert_eq!( - must_succeed_return_value(&engine, k, k, 75, 55, false), + must_succeed_return_value(&mut engine, k, k, 75, 55, false), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 75, 75); - pessimistic_rollback::tests::must_success(&engine, k, 75, 75); + must_pessimistic_locked(&mut engine, k, 75, 75); + pessimistic_rollback::tests::must_success(&mut engine, k, 75, 75); } #[test] fn test_pessimistic_lock_only_if_exists() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); // The key doesn't exist, no pessimistic lock is generated - assert_eq!(must_succeed_return_value(&engine, k, k, 10, 10, true), None); - must_unlocked(&engine, k); + assert_eq!( + must_succeed_return_value(&mut engine, k, k, 10, 10, true), + None + ); + must_unlocked(&mut engine, k); match must_err_impl( - &engine, + &mut engine, k, k, 10, @@ -862,133 +865,139 @@ pub mod tests { }; // Put the value, writecf: k_20_put_v - must_prewrite_put(&engine, k, v, k, 10); - must_commit(&engine, k, 10, 20); + must_prewrite_put(&mut engine, 
k, v, k, 10); + must_commit(&mut engine, k, 10, 20); // Pessimistic lock generated assert_eq!( - must_succeed_return_value(&engine, k, k, 25, 25, true), + must_succeed_return_value(&mut engine, k, k, 25, 25, true), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 25, 25); - pessimistic_rollback::tests::must_success(&engine, k, 25, 25); + must_pessimistic_locked(&mut engine, k, 25, 25); + pessimistic_rollback::tests::must_success(&mut engine, k, 25, 25); // Skip Write::Lock, WriteRecord: k_20_put_v k_40_lock - must_prewrite_lock(&engine, k, k, 30); - must_commit(&engine, k, 30, 40); + must_prewrite_lock(&mut engine, k, k, 30); + must_commit(&mut engine, k, 30, 40); assert_eq!( - must_succeed_return_value(&engine, k, k, 45, 45, true), + must_succeed_return_value(&mut engine, k, k, 45, 45, true), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 45, 45); - pessimistic_rollback::tests::must_success(&engine, k, 45, 45); + must_pessimistic_locked(&mut engine, k, 45, 45); + pessimistic_rollback::tests::must_success(&mut engine, k, 45, 45); // Skip Write::Rollback WriteRecord: k_20_put_v k_40_lock k_50_R - must_rollback(&engine, k, 50, false); + must_rollback(&mut engine, k, 50, false); assert_eq!( - must_succeed_return_value(&engine, k, k, 55, 55, true), + must_succeed_return_value(&mut engine, k, k, 55, 55, true), Some(v.to_vec()) ); - must_pessimistic_locked(&engine, k, 55, 55); - pessimistic_rollback::tests::must_success(&engine, k, 55, 55); + must_pessimistic_locked(&mut engine, k, 55, 55); + pessimistic_rollback::tests::must_success(&mut engine, k, 55, 55); // Delete WriteRecord: k_20_put_v k_40_lock k_50_R k_70_delete - must_prewrite_delete(&engine, k, k, 60); - must_commit(&engine, k, 60, 70); - assert_eq!(must_succeed_return_value(&engine, k, k, 75, 75, true), None); - must_unlocked(&engine, k); + must_prewrite_delete(&mut engine, k, k, 60); + must_commit(&mut engine, k, 60, 70); + assert_eq!( + must_succeed_return_value(&mut engine, k, k, 75, 75, 
true), + None + ); + must_unlocked(&mut engine, k); // Duplicated command assert_eq!( - must_succeed_return_value(&engine, k, k, 75, 75, false), + must_succeed_return_value(&mut engine, k, k, 75, 75, false), None ); - must_pessimistic_locked(&engine, k, 75, 75); - assert_eq!(must_succeed_return_value(&engine, k, k, 75, 85, true), None); - must_pessimistic_locked(&engine, k, 75, 85); - pessimistic_rollback::tests::must_success(&engine, k, 75, 85); - must_unlocked(&engine, k); + must_pessimistic_locked(&mut engine, k, 75, 75); + assert_eq!( + must_succeed_return_value(&mut engine, k, k, 75, 85, true), + None + ); + must_pessimistic_locked(&mut engine, k, 75, 85); + pessimistic_rollback::tests::must_success(&mut engine, k, 75, 85); + must_unlocked(&mut engine, k); } #[test] fn test_overwrite_pessimistic_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; - must_succeed(&engine, k, k, 1, 2); - must_pessimistic_locked(&engine, k, 1, 2); - must_succeed(&engine, k, k, 1, 1); - must_pessimistic_locked(&engine, k, 1, 2); - must_succeed(&engine, k, k, 1, 3); - must_pessimistic_locked(&engine, k, 1, 3); + must_succeed(&mut engine, k, k, 1, 2); + must_pessimistic_locked(&mut engine, k, 1, 2); + must_succeed(&mut engine, k, k, 1, 1); + must_pessimistic_locked(&mut engine, k, 1, 2); + must_succeed(&mut engine, k, k, 1, 3); + must_pessimistic_locked(&mut engine, k, 1, 3); } #[test] fn test_pessimistic_lock_check_gc_fence() { use pessimistic_rollback::tests::must_success as must_pessimistic_rollback; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // PUT, Read // `------^ - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 10); - must_commit(&engine, b"k1", 10, 30); - must_cleanup_with_gc_fence(&engine, b"k1", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 
10, 30); + must_cleanup_with_gc_fence(&mut engine, b"k1", 30, 0, 40, true); // PUT, Read // * (GC fence ts = 0) - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 11); - must_commit(&engine, b"k2", 11, 30); - must_cleanup_with_gc_fence(&engine, b"k2", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 11); + must_commit(&mut engine, b"k2", 11, 30); + must_cleanup_with_gc_fence(&mut engine, b"k2", 30, 0, 0, true); // PUT, LOCK, LOCK, Read // `---------^ - must_prewrite_put(&engine, b"k3", b"v3", b"k3", 12); - must_commit(&engine, b"k3", 12, 30); - must_prewrite_lock(&engine, b"k3", b"k3", 37); - must_commit(&engine, b"k3", 37, 38); - must_cleanup_with_gc_fence(&engine, b"k3", 30, 0, 40, true); - must_prewrite_lock(&engine, b"k3", b"k3", 42); - must_commit(&engine, b"k3", 42, 43); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 12); + must_commit(&mut engine, b"k3", 12, 30); + must_prewrite_lock(&mut engine, b"k3", b"k3", 37); + must_commit(&mut engine, b"k3", 37, 38); + must_cleanup_with_gc_fence(&mut engine, b"k3", 30, 0, 40, true); + must_prewrite_lock(&mut engine, b"k3", b"k3", 42); + must_commit(&mut engine, b"k3", 42, 43); // PUT, LOCK, LOCK, Read // * - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 13); - must_commit(&engine, b"k4", 13, 30); - must_prewrite_lock(&engine, b"k4", b"k4", 37); - must_commit(&engine, b"k4", 37, 38); - must_prewrite_lock(&engine, b"k4", b"k4", 42); - must_commit(&engine, b"k4", 42, 43); - must_cleanup_with_gc_fence(&engine, b"k4", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 13); + must_commit(&mut engine, b"k4", 13, 30); + must_prewrite_lock(&mut engine, b"k4", b"k4", 37); + must_commit(&mut engine, b"k4", 37, 38); + must_prewrite_lock(&mut engine, b"k4", b"k4", 42); + must_commit(&mut engine, b"k4", 42, 43); + must_cleanup_with_gc_fence(&mut engine, b"k4", 30, 0, 0, true); // PUT, PUT, READ // `-----^ `------^ - must_prewrite_put(&engine, b"k5", b"v5", b"k5", 14); - 
must_commit(&engine, b"k5", 14, 20); - must_prewrite_put(&engine, b"k5", b"v5x", b"k5", 21); - must_commit(&engine, b"k5", 21, 30); - must_cleanup_with_gc_fence(&engine, b"k5", 20, 0, 30, false); - must_cleanup_with_gc_fence(&engine, b"k5", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 14); + must_commit(&mut engine, b"k5", 14, 20); + must_prewrite_put(&mut engine, b"k5", b"v5x", b"k5", 21); + must_commit(&mut engine, b"k5", 21, 30); + must_cleanup_with_gc_fence(&mut engine, b"k5", 20, 0, 30, false); + must_cleanup_with_gc_fence(&mut engine, b"k5", 30, 0, 40, true); // PUT, PUT, READ // `-----^ * - must_prewrite_put(&engine, b"k6", b"v6", b"k6", 15); - must_commit(&engine, b"k6", 15, 20); - must_prewrite_put(&engine, b"k6", b"v6x", b"k6", 22); - must_commit(&engine, b"k6", 22, 30); - must_cleanup_with_gc_fence(&engine, b"k6", 20, 0, 30, false); - must_cleanup_with_gc_fence(&engine, b"k6", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k6", 15); + must_commit(&mut engine, b"k6", 15, 20); + must_prewrite_put(&mut engine, b"k6", b"v6x", b"k6", 22); + must_commit(&mut engine, b"k6", 22, 30); + must_cleanup_with_gc_fence(&mut engine, b"k6", 20, 0, 30, false); + must_cleanup_with_gc_fence(&mut engine, b"k6", 30, 0, 0, true); // PUT, LOCK, READ // `----------^ // Note that this case is special because usually the `LOCK` is the first write // already got during prewrite/acquire_pessimistic_lock and will continue // searching an older version from the `LOCK` record. 
- must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); - must_commit(&engine, b"k7", 16, 30); - must_prewrite_lock(&engine, b"k7", b"k7", 37); - must_commit(&engine, b"k7", 37, 38); - must_cleanup_with_gc_fence(&engine, b"k7", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k7", b"v7", b"k7", 16); + must_commit(&mut engine, b"k7", 16, 30); + must_prewrite_lock(&mut engine, b"k7", b"k7", 37); + must_commit(&mut engine, b"k7", 37, 38); + must_cleanup_with_gc_fence(&mut engine, b"k7", 30, 0, 40, true); let cases = vec![ (b"k1" as &[u8], None), @@ -1004,34 +1013,68 @@ pub mod tests { // Test constraint check with `should_not_exist`. if expected_value.is_none() { assert!( - must_succeed_impl(&engine, key, key, 50, true, 0, 50, false, false, 51, false) - .is_none() + must_succeed_impl( + &mut engine, + key, + key, + 50, + true, + 0, + 50, + false, + false, + 51, + false + ) + .is_none() ); - must_pessimistic_rollback(&engine, key, 50, 51); + must_pessimistic_rollback(&mut engine, key, 50, 51); } else { - must_err_impl(&engine, key, key, 50, true, 50, false, false, 51, false); + must_err_impl(&mut engine, key, key, 50, true, 50, false, false, 51, false); } - must_unlocked(&engine, key); + must_unlocked(&mut engine, key); // Test getting value. - let res = - must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51, false); + let res = must_succeed_impl( + &mut engine, + key, + key, + 50, + false, + 0, + 50, + true, + false, + 51, + false, + ); assert_eq!(res, expected_value.map(|v| v.to_vec())); - must_pessimistic_rollback(&engine, key, 50, 51); + must_pessimistic_rollback(&mut engine, key, 50, 51); // Test getting value when already locked. 
- must_succeed(&engine, key, key, 50, 51); - let res2 = - must_succeed_impl(&engine, key, key, 50, false, 0, 50, true, false, 51, false); + must_succeed(&mut engine, key, key, 50, 51); + let res2 = must_succeed_impl( + &mut engine, + key, + key, + 50, + false, + 0, + 50, + true, + false, + 51, + false, + ); assert_eq!(res2, expected_value.map(|v| v.to_vec())); - must_pessimistic_rollback(&engine, key, 50, 51); + must_pessimistic_rollback(&mut engine, key, 50, 51); } } #[test] fn test_old_value_put_delete_lock_insert() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - let start_ts = old_value_put_delete_lock_insert(&engine, b"k1"); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let start_ts = old_value_put_delete_lock_insert(&mut engine, b"k1"); let key = Key::from_raw(b"k1"); for should_not_exist in &[true, false] { for need_value in &[true, false] { @@ -1067,21 +1110,21 @@ pub mod tests { #[test] fn test_old_value_for_update_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v1 = b"v1"; // Put v1 @ start ts 1, commit ts 2 - must_succeed(&engine, k, k, 1, 1); - must_pessimistic_prewrite_put(&engine, k, v1, k, 1, 1, DoPessimisticCheck); - must_commit(&engine, k, 1, 2); + must_succeed(&mut engine, k, k, 1, 1); + must_pessimistic_prewrite_put(&mut engine, k, v1, k, 1, 1, DoPessimisticCheck); + must_commit(&mut engine, k, 1, 2); let v2 = b"v2"; // Put v2 @ start ts 10, commit ts 11 - must_succeed(&engine, k, k, 10, 10); - must_pessimistic_prewrite_put(&engine, k, v2, k, 10, 10, DoPessimisticCheck); - must_commit(&engine, k, 10, 11); + must_succeed(&mut engine, k, k, 10, 10); + must_pessimistic_prewrite_put(&mut engine, k, v2, k, 10, 10, DoPessimisticCheck); + must_commit(&mut engine, k, 10, 11); // Lock @ start ts 9, for update ts 12, commit ts 13 let snapshot = engine.snapshot(Default::default()).unwrap(); @@ 
-1204,14 +1247,14 @@ pub mod tests { #[test] fn test_acquire_pessimistic_lock_should_not_exist() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (key, value) = (b"k", b"val"); // T1: start_ts = 3, commit_ts = 5, put key:value - must_succeed(&engine, key, key, 3, 3); - must_pessimistic_prewrite_put(&engine, key, value, key, 3, 3, DoPessimisticCheck); - must_commit(&engine, key, 3, 5); + must_succeed(&mut engine, key, key, 3, 3); + must_pessimistic_prewrite_put(&mut engine, key, value, key, 3, 3, DoPessimisticCheck); + must_commit(&mut engine, key, 3, 5); // T2: start_ts = 15, acquire pessimistic lock on k, with should_not_exist flag // set. @@ -1245,9 +1288,9 @@ pub mod tests { // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on // k - must_succeed(&engine, key, key, 8, 8); - must_pessimistic_prewrite_delete(&engine, key, key, 8, 8, DoPessimisticCheck); - must_commit(&engine, key, 8, cm.max_ts().into_inner() + 1); + must_succeed(&mut engine, key, key, 8, 8); + must_pessimistic_prewrite_delete(&mut engine, key, key, 8, 8, DoPessimisticCheck); + must_commit(&mut engine, key, 8, cm.max_ts().into_inner() + 1); // T1: start_ts = 10, repeatedly acquire pessimistic lock on k, with // should_not_exist flag set @@ -1279,35 +1322,35 @@ pub mod tests { #[test] fn test_check_existence() { use pessimistic_rollback::tests::must_success as must_pessimistic_rollback; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // k1: Not exists // k2: Exists - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 5); - must_commit(&engine, b"k2", 5, 20); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 5); + must_commit(&mut engine, b"k2", 5, 20); // k3: Delete - must_prewrite_put(&engine, b"k3", b"v3", b"k3", 5); - must_commit(&engine, b"k3", 5, 6); - must_prewrite_delete(&engine, b"k3", b"k3", 7); - 
must_commit(&engine, b"k3", 7, 20); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 5); + must_commit(&mut engine, b"k3", 5, 6); + must_prewrite_delete(&mut engine, b"k3", b"k3", 7); + must_commit(&mut engine, b"k3", 7, 20); // k4: Exist + Lock + Rollback - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 5); - must_commit(&engine, b"k4", 5, 15); - must_prewrite_lock(&engine, b"k4", b"k4", 16); - must_commit(&engine, b"k4", 16, 17); - must_rollback(&engine, b"k4", 20, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 5); + must_commit(&mut engine, b"k4", 5, 15); + must_prewrite_lock(&mut engine, b"k4", b"k4", 16); + must_commit(&mut engine, b"k4", 16, 17); + must_rollback(&mut engine, b"k4", 20, true); // k5: GC fence invalid - must_prewrite_put(&engine, b"k5", b"v5", b"k5", 5); - must_commit(&engine, b"k5", 5, 6); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 5); + must_commit(&mut engine, b"k5", 5, 6); // A invalid gc fence is assumed never pointing to a ts greater than GC // safepoint, and a read operation's ts is assumed never less than the // GC safepoint. Therefore since we will read at ts=10 later, we can't // put a version greater than 10 in this case. 
- must_cleanup_with_gc_fence(&engine, b"k5", 6, 0, 8, true); + must_cleanup_with_gc_fence(&mut engine, b"k5", 6, 0, 8, true); for &need_value in &[false, true] { for &need_check_existence in &[false, true] { @@ -1319,7 +1362,7 @@ pub mod tests { ); if repeated_request { for &k in &[b"k1", b"k2", b"k3", b"k4", b"k5"] { - must_succeed(&engine, k, k, start_ts, 30); + must_succeed(&mut engine, k, k, start_ts, 30); } } @@ -1334,7 +1377,7 @@ pub mod tests { }; let value1 = must_succeed_impl( - &engine, + &mut engine, b"k1", b"k1", start_ts, @@ -1347,10 +1390,10 @@ pub mod tests { false, ); assert_eq!(value1, None); - must_pessimistic_rollback(&engine, b"k1", start_ts, 30); + must_pessimistic_rollback(&mut engine, b"k1", start_ts, 30); let value2 = must_succeed_impl( - &engine, + &mut engine, b"k2", b"k2", start_ts, @@ -1363,10 +1406,10 @@ pub mod tests { false, ); assert_eq!(value2, expected_value(Some(b"v2"))); - must_pessimistic_rollback(&engine, b"k2", start_ts, 30); + must_pessimistic_rollback(&mut engine, b"k2", start_ts, 30); let value3 = must_succeed_impl( - &engine, + &mut engine, b"k3", b"k3", start_ts, @@ -1379,10 +1422,10 @@ pub mod tests { false, ); assert_eq!(value3, None); - must_pessimistic_rollback(&engine, b"k3", start_ts, 30); + must_pessimistic_rollback(&mut engine, b"k3", start_ts, 30); let value4 = must_succeed_impl( - &engine, + &mut engine, b"k4", b"k4", start_ts, @@ -1395,10 +1438,10 @@ pub mod tests { false, ); assert_eq!(value4, expected_value(Some(b"v4"))); - must_pessimistic_rollback(&engine, b"k4", start_ts, 30); + must_pessimistic_rollback(&mut engine, b"k4", start_ts, 30); let value5 = must_succeed_impl( - &engine, + &mut engine, b"k5", b"k5", start_ts, @@ -1411,7 +1454,7 @@ pub mod tests { false, ); assert_eq!(value5, None); - must_pessimistic_rollback(&engine, b"k5", start_ts, 30); + must_pessimistic_rollback(&mut engine, b"k5", start_ts, 30); } } } diff --git a/src/storage/txn/actions/check_data_constraint.rs 
b/src/storage/txn/actions/check_data_constraint.rs index 35999ee6cb2..d90a95a24ab 100644 --- a/src/storage/txn/actions/check_data_constraint.rs +++ b/src/storage/txn/actions/check_data_constraint.rs @@ -50,7 +50,7 @@ mod tests { #[test] fn test_check_data_constraint() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); let mut txn = MvccTxn::new(TimeStamp::new(2), cm); txn.put_write( diff --git a/src/storage/txn/actions/cleanup.rs b/src/storage/txn/actions/cleanup.rs index c72905c8910..5ed77d4fab3 100644 --- a/src/storage/txn/actions/cleanup.rs +++ b/src/storage/txn/actions/cleanup.rs @@ -105,7 +105,7 @@ pub mod tests { }; pub fn must_succeed( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, current_ts: impl Into, @@ -122,7 +122,7 @@ pub mod tests { } pub fn must_err( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, current_ts: impl Into, @@ -137,7 +137,7 @@ pub mod tests { } pub fn must_cleanup_with_gc_fence( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, current_ts: impl Into, @@ -183,11 +183,11 @@ pub mod tests { #[test] fn test_must_cleanup_with_gc_fence() { // Tests the test util - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, b"k", b"v", b"k", 10); - must_commit(&engine, b"k", 10, 20); - must_cleanup_with_gc_fence(&engine, b"k", 20, 0, 30, true); - let w = must_written(&engine, b"k", 10, 20, WriteType::Put); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, b"k", b"v", b"k", 10); + must_commit(&mut engine, b"k", 10, 20); + must_cleanup_with_gc_fence(&mut engine, b"k", 20, 0, 30, true); + let w = must_written(&mut engine, b"k", 10, 20, WriteType::Put); assert!(w.has_overlapped_rollback); assert_eq!(w.gc_fence.unwrap(), 30.into()); } @@ -196,45 +196,53 @@ pub mod tests { fn test_cleanup() { // Cleanup's logic is 
mostly similar to rollback, except the TTL check. Tests // that not related to TTL check should be covered by other test cases. - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Shorthand for composing ts. let ts = TimeStamp::compose; let (k, v) = (b"k", b"v"); - must_prewrite_put(&engine, k, v, k, ts(10, 0)); - must_locked(&engine, k, ts(10, 0)); - txn_heart_beat::tests::must_success(&engine, k, ts(10, 0), 100, 100); + must_prewrite_put(&mut engine, k, v, k, ts(10, 0)); + must_locked(&mut engine, k, ts(10, 0)); + txn_heart_beat::tests::must_success(&mut engine, k, ts(10, 0), 100, 100); // Check the last txn_heart_beat has set the lock's TTL to 100. - txn_heart_beat::tests::must_success(&engine, k, ts(10, 0), 90, 100); + txn_heart_beat::tests::must_success(&mut engine, k, ts(10, 0), 90, 100); // TTL not expired. Do nothing but returns an error. - must_err(&engine, k, ts(10, 0), ts(20, 0)); - must_locked(&engine, k, ts(10, 0)); + must_err(&mut engine, k, ts(10, 0), ts(20, 0)); + must_locked(&mut engine, k, ts(10, 0)); // Try to cleanup another transaction's lock. Does nothing. - must_succeed(&engine, k, ts(10, 1), ts(120, 0)); + must_succeed(&mut engine, k, ts(10, 1), ts(120, 0)); // If there is no existing lock when cleanup, it may be a pessimistic // transaction, so the rollback should be protected. - must_get_rollback_protected(&engine, k, ts(10, 1), true); - must_locked(&engine, k, ts(10, 0)); + must_get_rollback_protected(&mut engine, k, ts(10, 1), true); + must_locked(&mut engine, k, ts(10, 0)); // TTL expired. The lock should be removed. 
- must_succeed(&engine, k, ts(10, 0), ts(120, 0)); - must_unlocked(&engine, k); + must_succeed(&mut engine, k, ts(10, 0), ts(120, 0)); + must_unlocked(&mut engine, k); // Rollbacks of optimistic transactions needn't be protected - must_get_rollback_protected(&engine, k, ts(10, 0), false); - must_get_rollback_ts(&engine, k, ts(10, 0)); + must_get_rollback_protected(&mut engine, k, ts(10, 0), false); + must_get_rollback_ts(&mut engine, k, ts(10, 0)); // Rollbacks of primary keys in pessimistic transactions should be protected - must_acquire_pessimistic_lock(&engine, k, k, ts(11, 1), ts(12, 1)); - must_succeed(&engine, k, ts(11, 1), ts(120, 0)); - must_get_rollback_protected(&engine, k, ts(11, 1), true); - - must_acquire_pessimistic_lock(&engine, k, k, ts(13, 1), ts(14, 1)); - must_pessimistic_prewrite_put(&engine, k, v, k, ts(13, 1), ts(14, 1), DoPessimisticCheck); - must_succeed(&engine, k, ts(13, 1), ts(120, 0)); - must_get_rollback_protected(&engine, k, ts(13, 1), true); + must_acquire_pessimistic_lock(&mut engine, k, k, ts(11, 1), ts(12, 1)); + must_succeed(&mut engine, k, ts(11, 1), ts(120, 0)); + must_get_rollback_protected(&mut engine, k, ts(11, 1), true); + + must_acquire_pessimistic_lock(&mut engine, k, k, ts(13, 1), ts(14, 1)); + must_pessimistic_prewrite_put( + &mut engine, + k, + v, + k, + ts(13, 1), + ts(14, 1), + DoPessimisticCheck, + ); + must_succeed(&mut engine, k, ts(13, 1), ts(120, 0)); + must_get_rollback_protected(&mut engine, k, ts(13, 1), true); } } diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index e0a4257de26..6fd925b536e 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -128,7 +128,7 @@ pub mod tests { }; pub fn must_succeed( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -137,7 +137,7 @@ pub mod tests { } pub fn must_succeed_on_region( - engine: &E, + engine: &mut E, region_id: u64, key: &[u8], start_ts: impl Into, @@ 
-147,7 +147,7 @@ pub mod tests { } fn must_succeed_impl( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -171,7 +171,7 @@ pub mod tests { } pub fn must_err( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, commit_ts: impl Into, @@ -186,23 +186,23 @@ pub mod tests { #[cfg(test)] fn test_commit_ok_imp(k1: &[u8], v1: &[u8], k2: &[u8], k3: &[u8]) { - let engine = TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine, k1, v1, k1, 10); - must_prewrite_lock(&engine, k2, k1, 10); - must_prewrite_delete(&engine, k3, k1, 10); - must_locked(&engine, k1, 10); - must_locked(&engine, k2, 10); - must_locked(&engine, k3, 10); - must_succeed(&engine, k1, 10, 15); - must_succeed(&engine, k2, 10, 15); - must_succeed(&engine, k3, 10, 15); - must_written(&engine, k1, 10, 15, WriteType::Put); - must_written(&engine, k2, 10, 15, WriteType::Lock); - must_written(&engine, k3, 10, 15, WriteType::Delete); + let mut engine = TestEngineBuilder::new().build().unwrap(); + must_prewrite_put(&mut engine, k1, v1, k1, 10); + must_prewrite_lock(&mut engine, k2, k1, 10); + must_prewrite_delete(&mut engine, k3, k1, 10); + must_locked(&mut engine, k1, 10); + must_locked(&mut engine, k2, 10); + must_locked(&mut engine, k3, 10); + must_succeed(&mut engine, k1, 10, 15); + must_succeed(&mut engine, k2, 10, 15); + must_succeed(&mut engine, k3, 10, 15); + must_written(&mut engine, k1, 10, 15, WriteType::Put); + must_written(&mut engine, k2, 10, 15, WriteType::Lock); + must_written(&mut engine, k3, 10, 15, WriteType::Delete); // commit should be idempotent - must_succeed(&engine, k1, 10, 15); - must_succeed(&engine, k2, 10, 15); - must_succeed(&engine, k3, 10, 15); + must_succeed(&mut engine, k1, 10, 15); + must_succeed(&mut engine, k2, 10, 15); + must_succeed(&mut engine, k3, 10, 15); } #[test] @@ -215,16 +215,16 @@ pub mod tests { #[cfg(test)] fn test_commit_err_imp(k: &[u8], v: &[u8]) { - let engine = 
TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // Not prewrite yet - must_err(&engine, k, 1, 2); - must_prewrite_put(&engine, k, v, k, 5); + must_err(&mut engine, k, 1, 2); + must_prewrite_put(&mut engine, k, v, k, 5); // start_ts not match - must_err(&engine, k, 4, 5); - must_rollback(&engine, k, 5, false); + must_err(&mut engine, k, 4, 5); + must_rollback(&mut engine, k, 5, false); // commit after rollback - must_err(&engine, k, 5, 6); + must_err(&mut engine, k, 5, 6); } #[test] @@ -237,7 +237,7 @@ pub mod tests { #[test] fn test_min_commit_ts() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); @@ -253,9 +253,9 @@ pub mod tests { } }; - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(10, 0), 100, 0); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(10, 0), 100, 0); check_txn_status::tests::must_success( - &engine, + &mut engine, k, ts(10, 0), ts(20, 0), @@ -266,13 +266,13 @@ pub mod tests { uncommitted(100, ts(20, 1)), ); // The min_commit_ts should be ts(20, 1) - must_err(&engine, k, ts(10, 0), ts(15, 0)); - must_err(&engine, k, ts(10, 0), ts(20, 0)); - must_succeed(&engine, k, ts(10, 0), ts(20, 1)); + must_err(&mut engine, k, ts(10, 0), ts(15, 0)); + must_err(&mut engine, k, ts(10, 0), ts(20, 0)); + must_succeed(&mut engine, k, ts(10, 0), ts(20, 1)); - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(30, 0), 100, 0); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(30, 0), 100, 0); check_txn_status::tests::must_success( - &engine, + &mut engine, k, ts(30, 0), ts(40, 0), @@ -282,13 +282,13 @@ pub mod tests { false, uncommitted(100, ts(40, 1)), ); - must_succeed(&engine, k, ts(30, 0), ts(50, 0)); + must_succeed(&mut engine, k, ts(30, 0), ts(50, 0)); // If the min_commit_ts of the pessimistic lock is greater than prewrite's, use // it. 
- must_acquire_pessimistic_lock_for_large_txn(&engine, k, k, ts(60, 0), ts(60, 0), 100); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(60, 0), ts(60, 0), 100); check_txn_status::tests::must_success( - &engine, + &mut engine, k, ts(60, 0), ts(70, 0), @@ -299,7 +299,7 @@ pub mod tests { uncommitted(100, ts(70, 1)), ); must_prewrite_put_impl( - &engine, + &mut engine, k, v, k, @@ -316,8 +316,8 @@ pub mod tests { kvproto::kvrpcpb::AssertionLevel::Off, ); // The min_commit_ts is ts(70, 0) other than ts(60, 1) in prewrite request. - must_large_txn_locked(&engine, k, ts(60, 0), 100, ts(70, 1), false); - must_err(&engine, k, ts(60, 0), ts(65, 0)); - must_succeed(&engine, k, ts(60, 0), ts(80, 0)); + must_large_txn_locked(&mut engine, k, ts(60, 0), 100, ts(70, 1), false); + must_err(&mut engine, k, ts(60, 0), ts(65, 0)); + must_succeed(&mut engine, k, ts(60, 0), ts(80, 0)); } } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 0b9f0461297..5fcf0327c37 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -164,7 +164,7 @@ pub mod tests { }; fn must_flashback_write( - engine: &E, + engine: &mut E, key: &[u8], version: impl Into, start_ts: impl Into, @@ -209,91 +209,91 @@ pub mod tests { #[test] fn test_flashback_to_version() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let k = b"k"; // Prewrite and commit Put(k -> v1) with stat_ts = 1, commit_ts = 2. let v1 = b"v1"; - must_prewrite_put(&engine, k, v1, k, *ts.incr()); - must_commit(&engine, k, ts, *ts.incr()); - must_get(&engine, k, *ts.incr(), v1); + must_prewrite_put(&mut engine, k, v1, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, *ts.incr(), v1); // Prewrite and rollback Put(k -> v2) with stat_ts = 4. 
let v2 = b"v2"; - must_prewrite_put(&engine, k, v2, k, *ts.incr()); - must_rollback(&engine, k, ts, false); - must_get(&engine, k, *ts.incr(), v1); + must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); + must_rollback(&mut engine, k, ts, false); + must_get(&mut engine, k, *ts.incr(), v1); // Prewrite and rollback Delete(k) with stat_ts = 6. - must_prewrite_delete(&engine, k, k, *ts.incr()); - must_rollback(&engine, k, ts, false); - must_get(&engine, k, *ts.incr(), v1); + must_prewrite_delete(&mut engine, k, k, *ts.incr()); + must_rollback(&mut engine, k, ts, false); + must_get(&mut engine, k, *ts.incr(), v1); // Prewrite and commit Delete(k) with stat_ts = 8, commit_ts = 9. - must_prewrite_delete(&engine, k, k, *ts.incr()); - must_commit(&engine, k, ts, *ts.incr()); - must_get_none(&engine, k, *ts.incr()); + must_prewrite_delete(&mut engine, k, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get_none(&mut engine, k, *ts.incr()); // Prewrite and commit Put(k -> v2) with stat_ts = 11, commit_ts = 12. - must_prewrite_put(&engine, k, v2, k, *ts.incr()); - must_commit(&engine, k, ts, *ts.incr()); - must_get(&engine, k, *ts.incr(), v2); + must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 1 with start_ts = 14, commit_ts = 15. assert_eq!( - must_flashback_write(&engine, k, 1, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 1, *ts.incr(), *ts.incr()), 1 ); - must_get_none(&engine, k, *ts.incr()); + must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 2 with start_ts = 17, commit_ts = 18. assert_eq!( - must_flashback_write(&engine, k, 2, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 2, *ts.incr(), *ts.incr()), 1 ); - must_get(&engine, k, *ts.incr(), v1); + must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 5 with start_ts = 20, commit_ts = 21. 
assert_eq!( - must_flashback_write(&engine, k, 5, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 5, *ts.incr(), *ts.incr()), 1 ); - must_get(&engine, k, *ts.incr(), v1); + must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 7 with start_ts = 23, commit_ts = 24. assert_eq!( - must_flashback_write(&engine, k, 7, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 7, *ts.incr(), *ts.incr()), 1 ); - must_get(&engine, k, *ts.incr(), v1); + must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 10 with start_ts = 26, commit_ts = 27. assert_eq!( - must_flashback_write(&engine, k, 10, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 10, *ts.incr(), *ts.incr()), 1 ); - must_get_none(&engine, k, *ts.incr()); + must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 13 with start_ts = 29, commit_ts = 30. assert_eq!( - must_flashback_write(&engine, k, 13, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 13, *ts.incr(), *ts.incr()), 1 ); - must_get(&engine, k, *ts.incr(), v2); + must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 27 with start_ts = 32, commit_ts = 33. 
assert_eq!( - must_flashback_write(&engine, k, 27, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, 27, *ts.incr(), *ts.incr()), 1 ); - must_get_none(&engine, k, *ts.incr()); + must_get_none(&mut engine, k, *ts.incr()); } #[test] fn test_flashback_to_version_deleted() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", b"v"); - must_prewrite_put(&engine, k, v, k, *ts.incr()); - must_commit(&engine, k, ts, *ts.incr()); - must_get(&engine, k, ts, v); - must_prewrite_delete(&engine, k, k, *ts.incr()); - must_commit(&engine, k, ts, *ts.incr()); + must_prewrite_put(&mut engine, k, v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, v); + must_prewrite_delete(&mut engine, k, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); // Since the key has been deleted, flashback to version 1 should not do // anything. 
assert_eq!( - must_flashback_write(&engine, k, ts, *ts.incr(), *ts.incr()), + must_flashback_write(&mut engine, k, ts, *ts.incr(), *ts.incr()), 0 ); - must_get_none(&engine, k, ts); + must_get_none(&mut engine, k, ts); } } diff --git a/src/storage/txn/actions/gc.rs b/src/storage/txn/actions/gc.rs index 29264c7df90..8c24baf7d5b 100644 --- a/src/storage/txn/actions/gc.rs +++ b/src/storage/txn/actions/gc.rs @@ -137,7 +137,7 @@ pub mod tests { RocksEngine, TestEngineBuilder, }; - pub fn must_succeed(engine: &E, key: &[u8], safe_point: impl Into) { + pub fn must_succeed(engine: &mut E, key: &[u8], safe_point: impl Into) { let ctx = SnapContext::default(); let snapshot = engine.snapshot(ctx).unwrap(); let cm = ConcurrencyManager::new(1.into()); @@ -150,22 +150,22 @@ pub mod tests { #[cfg(test)] fn test_gc_imp(k: &[u8], v1: &[u8], v2: &[u8], v3: &[u8], v4: &[u8], gc: F) where - F: Fn(&RocksEngine, &[u8], u64), + F: Fn(&mut RocksEngine, &[u8], u64), { - let engine = TestEngineBuilder::new().build().unwrap(); - - must_prewrite_put(&engine, k, v1, k, 5); - must_commit(&engine, k, 5, 10); - must_prewrite_put(&engine, k, v2, k, 15); - must_commit(&engine, k, 15, 20); - must_prewrite_delete(&engine, k, k, 25); - must_commit(&engine, k, 25, 30); - must_prewrite_put(&engine, k, v3, k, 35); - must_commit(&engine, k, 35, 40); - must_prewrite_lock(&engine, k, k, 45); - must_commit(&engine, k, 45, 50); - must_prewrite_put(&engine, k, v4, k, 55); - must_rollback(&engine, k, 55, false); + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, k, v1, k, 5); + must_commit(&mut engine, k, 5, 10); + must_prewrite_put(&mut engine, k, v2, k, 15); + must_commit(&mut engine, k, 15, 20); + must_prewrite_delete(&mut engine, k, k, 25); + must_commit(&mut engine, k, 25, 30); + must_prewrite_put(&mut engine, k, v3, k, 35); + must_commit(&mut engine, k, 35, 40); + must_prewrite_lock(&mut engine, k, k, 45); + must_commit(&mut engine, k, 45, 50); + 
must_prewrite_put(&mut engine, k, v4, k, 55); + must_rollback(&mut engine, k, 55, false); // Transactions: // startTS commitTS Command @@ -192,19 +192,19 @@ pub mod tests { // 10 Commit(PUT,5) // 5 x5 - gc(&engine, k, 12); - must_get(&engine, k, 12, v1); + gc(&mut engine, k, 12); + must_get(&mut engine, k, 12, v1); - gc(&engine, k, 22); - must_get(&engine, k, 22, v2); - must_get_none(&engine, k, 12); + gc(&mut engine, k, 22); + must_get(&mut engine, k, 22, v2); + must_get_none(&mut engine, k, 12); - gc(&engine, k, 32); - must_get_none(&engine, k, 22); - must_get_none(&engine, k, 35); + gc(&mut engine, k, 32); + must_get_none(&mut engine, k, 22); + must_get_none(&mut engine, k, 35); - gc(&engine, k, 60); - must_get(&engine, k, 62, v3); + gc(&mut engine, k, 60); + must_get(&mut engine, k, 62, v3); } #[test] diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 5883fc4b983..a8a33799686 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -752,6 +752,8 @@ pub mod tests { #[cfg(test)] use rand::{Rng, SeedableRng}; #[cfg(test)] + use tikv_kv::RocksEngine; + #[cfg(test)] use txn_types::OldValue; use super::*; @@ -805,7 +807,7 @@ pub mod tests { // Insert has a constraint that key should not exist pub fn try_prewrite_insert( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -840,7 +842,7 @@ pub mod tests { } pub fn try_prewrite_check_not_exists( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -865,7 +867,7 @@ pub mod tests { #[test] fn test_async_commit_prewrite_check_max_commit_ts() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -903,14 +905,14 @@ pub mod tests { let modifies = txn.into_modifies(); assert_eq!(modifies.len(), 2); // the 
mutation that meets CommitTsTooLarge still exists write(&engine, &Default::default(), modifies); - assert!(must_locked(&engine, b"k1", 10).use_async_commit); + assert!(must_locked(&mut engine, b"k1", 10).use_async_commit); // The written lock should not have use_async_commit flag. - assert!(!must_locked(&engine, b"k2", 10).use_async_commit); + assert!(!must_locked(&mut engine, b"k2", 10).use_async_commit); } #[test] fn test_async_commit_prewrite_min_commit_ts() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(41.into()); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1042,7 +1044,7 @@ pub mod tests { #[test] fn test_1pc_check_max_commit_ts() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1082,12 +1084,12 @@ pub mod tests { assert_eq!(modifies.len(), 2); // the mutation that meets CommitTsTooLarge still exists write(&engine, &Default::default(), modifies); // success 1pc prewrite needs to be transformed to locks - assert!(!must_locked(&engine, b"k1", 10).use_async_commit); - assert!(!must_locked(&engine, b"k2", 10).use_async_commit); + assert!(!must_locked(&mut engine, b"k1", 10).use_async_commit); + assert!(!must_locked(&mut engine, b"k2", 10).use_async_commit); } pub fn try_pessimistic_prewrite_check_not_exists( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -1123,11 +1125,11 @@ pub mod tests { #[test] fn test_async_commit_pessimistic_prewrite_check_max_commit_ts() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); - 
must_acquire_pessimistic_lock(&engine, b"k1", b"k1", 10, 10); - must_acquire_pessimistic_lock(&engine, b"k2", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k2", b"k1", 10, 10); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1173,11 +1175,11 @@ pub mod tests { #[test] fn test_1pc_pessimistic_prewrite_check_max_commit_ts() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(42.into()); - must_acquire_pessimistic_lock(&engine, b"k1", b"k1", 10, 10); - must_acquire_pessimistic_lock(&engine, b"k2", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k2", b"k1", 10, 10); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1223,69 +1225,69 @@ pub mod tests { #[test] fn test_prewrite_check_gc_fence() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); // PUT, Read // `------^ - must_prewrite_put(&engine, b"k1", b"v1", b"k1", 10); - must_commit(&engine, b"k1", 10, 30); - must_cleanup_with_gc_fence(&engine, b"k1", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 30); + must_cleanup_with_gc_fence(&mut engine, b"k1", 30, 0, 40, true); // PUT, Read // * (GC fence ts = 0) - must_prewrite_put(&engine, b"k2", b"v2", b"k2", 11); - must_commit(&engine, b"k2", 11, 30); - must_cleanup_with_gc_fence(&engine, b"k2", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 11); + must_commit(&mut engine, b"k2", 11, 30); + must_cleanup_with_gc_fence(&mut engine, b"k2", 30, 0, 0, true); // PUT, LOCK, LOCK, Read // `---------^ - 
must_prewrite_put(&engine, b"k3", b"v3", b"k3", 12); - must_commit(&engine, b"k3", 12, 30); - must_prewrite_lock(&engine, b"k3", b"k3", 37); - must_commit(&engine, b"k3", 37, 38); - must_cleanup_with_gc_fence(&engine, b"k3", 30, 0, 40, true); - must_prewrite_lock(&engine, b"k3", b"k3", 42); - must_commit(&engine, b"k3", 42, 43); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 12); + must_commit(&mut engine, b"k3", 12, 30); + must_prewrite_lock(&mut engine, b"k3", b"k3", 37); + must_commit(&mut engine, b"k3", 37, 38); + must_cleanup_with_gc_fence(&mut engine, b"k3", 30, 0, 40, true); + must_prewrite_lock(&mut engine, b"k3", b"k3", 42); + must_commit(&mut engine, b"k3", 42, 43); // PUT, LOCK, LOCK, Read // * - must_prewrite_put(&engine, b"k4", b"v4", b"k4", 13); - must_commit(&engine, b"k4", 13, 30); - must_prewrite_lock(&engine, b"k4", b"k4", 37); - must_commit(&engine, b"k4", 37, 38); - must_prewrite_lock(&engine, b"k4", b"k4", 42); - must_commit(&engine, b"k4", 42, 43); - must_cleanup_with_gc_fence(&engine, b"k4", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 13); + must_commit(&mut engine, b"k4", 13, 30); + must_prewrite_lock(&mut engine, b"k4", b"k4", 37); + must_commit(&mut engine, b"k4", 37, 38); + must_prewrite_lock(&mut engine, b"k4", b"k4", 42); + must_commit(&mut engine, b"k4", 42, 43); + must_cleanup_with_gc_fence(&mut engine, b"k4", 30, 0, 0, true); // PUT, PUT, READ // `-----^ `------^ - must_prewrite_put(&engine, b"k5", b"v5", b"k5", 14); - must_commit(&engine, b"k5", 14, 20); - must_prewrite_put(&engine, b"k5", b"v5x", b"k5", 21); - must_commit(&engine, b"k5", 21, 30); - must_cleanup_with_gc_fence(&engine, b"k5", 20, 0, 30, false); - must_cleanup_with_gc_fence(&engine, b"k5", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k5", b"v5", b"k5", 14); + must_commit(&mut engine, b"k5", 14, 20); + must_prewrite_put(&mut engine, b"k5", b"v5x", b"k5", 21); + must_commit(&mut engine, b"k5", 21, 30); + 
must_cleanup_with_gc_fence(&mut engine, b"k5", 20, 0, 30, false); + must_cleanup_with_gc_fence(&mut engine, b"k5", 30, 0, 40, true); // PUT, PUT, READ // `-----^ * - must_prewrite_put(&engine, b"k6", b"v6", b"k6", 15); - must_commit(&engine, b"k6", 15, 20); - must_prewrite_put(&engine, b"k6", b"v6x", b"k6", 22); - must_commit(&engine, b"k6", 22, 30); - must_cleanup_with_gc_fence(&engine, b"k6", 20, 0, 30, false); - must_cleanup_with_gc_fence(&engine, b"k6", 30, 0, 0, true); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k6", 15); + must_commit(&mut engine, b"k6", 15, 20); + must_prewrite_put(&mut engine, b"k6", b"v6x", b"k6", 22); + must_commit(&mut engine, b"k6", 22, 30); + must_cleanup_with_gc_fence(&mut engine, b"k6", 20, 0, 30, false); + must_cleanup_with_gc_fence(&mut engine, b"k6", 30, 0, 0, true); // PUT, LOCK, READ // `----------^ // Note that this case is special because usually the `LOCK` is the first write // already got during prewrite/acquire_pessimistic_lock and will continue // searching an older version from the `LOCK` record. - must_prewrite_put(&engine, b"k7", b"v7", b"k7", 16); - must_commit(&engine, b"k7", 16, 30); - must_prewrite_lock(&engine, b"k7", b"k7", 37); - must_commit(&engine, b"k7", 37, 38); - must_cleanup_with_gc_fence(&engine, b"k7", 30, 0, 40, true); + must_prewrite_put(&mut engine, b"k7", b"v7", b"k7", 16); + must_commit(&mut engine, b"k7", 16, 30); + must_prewrite_lock(&mut engine, b"k7", b"k7", 37); + must_commit(&mut engine, b"k7", 37, 38); + must_cleanup_with_gc_fence(&mut engine, b"k7", 30, 0, 40, true); // 1. Check GC fence when doing constraint check with the older version. 
let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -1401,11 +1403,11 @@ pub mod tests { #[test] fn test_resend_prewrite_non_pessimistic_lock() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - must_acquire_pessimistic_lock(&engine, b"k1", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k1", 10, 10); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k1", b"v1", b"k1", @@ -1416,7 +1418,7 @@ pub mod tests { 15, ); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1428,14 +1430,14 @@ pub mod tests { ); // The transaction may be committed by another reader. - must_commit(&engine, b"k1", 10, 20); - must_commit(&engine, b"k2", 10, 20); + must_commit(&mut engine, b"k1", 10, 20); + must_commit(&mut engine, b"k2", 10, 20); // This is a re-sent prewrite. It should report a PessimisticLockNotFound. In // production, the caller will need to check if the current transaction is // already committed before, in order to provide the idempotency. let err = must_retry_pessimistic_prewrite_put_err( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1450,23 +1452,23 @@ pub mod tests { Error(box ErrorInner::PessimisticLockNotFound { .. }) )); // Commit repeatedly, these operations should have no effect. - must_commit(&engine, b"k1", 10, 25); - must_commit(&engine, b"k2", 10, 25); + must_commit(&mut engine, b"k1", 10, 25); + must_commit(&mut engine, b"k2", 10, 25); // Seek from 30, we should read commit_ts = 20 instead of 25. - must_seek_write(&engine, b"k1", 30, 10, 20, WriteType::Put); - must_seek_write(&engine, b"k2", 30, 10, 20, WriteType::Put); + must_seek_write(&mut engine, b"k1", 30, 10, 20, WriteType::Put); + must_seek_write(&mut engine, b"k2", 30, 10, 20, WriteType::Put); // Write another version to the keys. 
- must_prewrite_put(&engine, b"k1", b"v11", b"k1", 35); - must_prewrite_put(&engine, b"k2", b"v22", b"k1", 35); - must_commit(&engine, b"k1", 35, 40); - must_commit(&engine, b"k2", 35, 40); + must_prewrite_put(&mut engine, b"k1", b"v11", b"k1", 35); + must_prewrite_put(&mut engine, b"k2", b"v22", b"k1", 35); + must_commit(&mut engine, b"k1", 35, 40); + must_commit(&mut engine, b"k2", 35, 40); // A retrying non-pessimistic-lock prewrite request should not skip constraint // checks. It reports a PessimisticLockNotFound. let err = must_retry_pessimistic_prewrite_put_err( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1480,10 +1482,10 @@ pub mod tests { err, Error(box ErrorInner::PessimisticLockNotFound { .. }) )); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); let err = must_retry_pessimistic_prewrite_put_err( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1497,13 +1499,13 @@ pub mod tests { err, Error(box ErrorInner::PessimisticLockNotFound { .. }) )); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); // Committing still does nothing. - must_commit(&engine, b"k2", 10, 25); + must_commit(&mut engine, b"k2", 10, 25); // Try a different txn start ts (which haven't been successfully committed // before). let err = must_retry_pessimistic_prewrite_put_err( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1517,11 +1519,11 @@ pub mod tests { err, Error(box ErrorInner::PessimisticLockNotFound { .. }) )); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); // However conflict still won't be checked if there's a non-retry request // arriving. 
must_prewrite_put_impl( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1537,14 +1539,14 @@ pub mod tests { kvproto::kvrpcpb::Assertion::None, kvproto::kvrpcpb::AssertionLevel::Off, ); - must_locked(&engine, b"k2", 12); - must_rollback(&engine, b"k2", 12, false); + must_locked(&mut engine, b"k2", 12); + must_rollback(&mut engine, b"k2", 12, false); // And conflict check is according to the for_update_ts for pessimistic // prewrite. So, it will not report error if for_update_ts is large // enough. must_prewrite_put_impl( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1560,14 +1562,14 @@ pub mod tests { kvproto::kvrpcpb::Assertion::None, kvproto::kvrpcpb::AssertionLevel::Off, ); - must_locked(&engine, b"k2", 13); - must_rollback(&engine, b"k2", 13, false); + must_locked(&mut engine, b"k2", 13); + must_rollback(&mut engine, b"k2", 13, false); // Write a Rollback at 50 first. A retried prewrite at the same ts should // report WriteConflict. - must_rollback(&engine, b"k2", 50, false); + must_rollback(&mut engine, b"k2", 50, false); let err = must_retry_pessimistic_prewrite_put_err( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1584,7 +1586,7 @@ pub mod tests { ); // But prewriting at 48 can succeed because a newer rollback is allowed. 
must_prewrite_put_impl( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -1600,28 +1602,28 @@ pub mod tests { kvproto::kvrpcpb::Assertion::None, kvproto::kvrpcpb::AssertionLevel::Off, ); - must_locked(&engine, b"k2", 48); + must_locked(&mut engine, b"k2", 48); } #[test] fn test_old_value_rollback_and_lock() { - let engine_rollback = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine_rollback = crate::storage::TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine_rollback, b"k1", b"v1", b"k1", 10); - must_commit(&engine_rollback, b"k1", 10, 30); + must_prewrite_put(&mut engine_rollback, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine_rollback, b"k1", 10, 30); - must_prewrite_put(&engine_rollback, b"k1", b"v2", b"k1", 40); - must_rollback(&engine_rollback, b"k1", 40, false); + must_prewrite_put(&mut engine_rollback, b"k1", b"v2", b"k1", 40); + must_rollback(&mut engine_rollback, b"k1", 40, false); - let engine_lock = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine_lock = crate::storage::TestEngineBuilder::new().build().unwrap(); - must_prewrite_put(&engine_lock, b"k1", b"v1", b"k1", 10); - must_commit(&engine_lock, b"k1", 10, 30); + must_prewrite_put(&mut engine_lock, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine_lock, b"k1", 10, 30); - must_prewrite_lock(&engine_lock, b"k1", b"k1", 40); - must_commit(&engine_lock, b"k1", 40, 45); + must_prewrite_lock(&mut engine_lock, b"k1", b"k1", 40); + must_commit(&mut engine_lock, b"k1", 40, 45); - for engine in &[engine_rollback, engine_lock] { + for engine in &mut [engine_rollback, engine_lock] { let start_ts = TimeStamp::from(50); let txn_props = TransactionProperties { start_ts, @@ -1660,7 +1662,7 @@ pub mod tests { // Prepares a test case that put, delete and lock a key and returns // a timestamp for testing the case. 
#[cfg(test)] - pub fn old_value_put_delete_lock_insert(engine: &E, key: &[u8]) -> TimeStamp { + pub fn old_value_put_delete_lock_insert(engine: &mut E, key: &[u8]) -> TimeStamp { must_prewrite_put(engine, key, b"v1", key, 10); must_commit(engine, key, 10, 20); @@ -1675,8 +1677,8 @@ pub mod tests { #[test] fn test_old_value_put_delete_lock_insert() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - let start_ts = old_value_put_delete_lock_insert(&engine, b"k1"); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let start_ts = old_value_put_delete_lock_insert(&mut engine, b"k1"); let txn_props = TransactionProperties { start_ts, kind: TransactionKind::Optimistic(false), @@ -1729,7 +1731,7 @@ pub mod tests { let mut rg = rand::rngs::StdRng::seed_from_u64(seed); // Generate 1000 random cases; - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let cases = 1000; for _ in 0..cases { // At most 12 ops per-case. 
@@ -1753,20 +1755,20 @@ pub mod tests { match op { 0 => { - must_prewrite_put(&engine, key, &[i as u8], key, start_ts); - must_commit(&engine, key, start_ts, commit_ts); + must_prewrite_put(&mut engine, key, &[i as u8], key, start_ts); + must_commit(&mut engine, key, start_ts, commit_ts); } 1 => { - must_prewrite_delete(&engine, key, key, start_ts); - must_commit(&engine, key, start_ts, commit_ts); + must_prewrite_delete(&mut engine, key, key, start_ts); + must_commit(&mut engine, key, start_ts, commit_ts); } 2 => { - must_prewrite_lock(&engine, key, key, start_ts); - must_commit(&engine, key, start_ts, commit_ts); + must_prewrite_lock(&mut engine, key, key, start_ts); + must_commit(&mut engine, key, start_ts, commit_ts); } 3 => { - must_prewrite_put(&engine, key, &[i as u8], key, start_ts); - must_rollback(&engine, key, start_ts, false); + must_prewrite_put(&mut engine, key, &[i as u8], key, start_ts); + must_rollback(&mut engine, key, start_ts, false); } _ => unreachable!(), } @@ -1881,19 +1883,22 @@ pub mod tests { #[test] fn test_prewrite_with_assertion() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - - let prewrite_put = |key: &'_ _, - value, - ts: u64, - pessimistic_action, - for_update_ts: u64, - assertion, - assertion_level, - expect_success| { + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + + fn prewrite_put( + engine: &mut E, + key: &[u8], + value: &[u8], + ts: u64, + pessimistic_action: PrewriteRequestPessimisticAction, + for_update_ts: u64, + assertion: Assertion, + assertion_level: AssertionLevel, + expect_success: bool, + ) { if expect_success { must_prewrite_put_impl( - &engine, + engine, key, value, key, @@ -1911,7 +1916,7 @@ pub mod tests { ); } else { let err = must_prewrite_put_err_impl( - &engine, + engine, key, value, key, @@ -1926,193 +1931,209 @@ pub mod tests { ); assert!(matches!(err, Error(box ErrorInner::AssertionFailed { .. 
}))); } - }; - - let test = |key_prefix: &[u8], assertion_level, prepare: &dyn for<'a> Fn(&'a [u8])| { - let k1 = [key_prefix, b"k1"].concat(); - let k2 = [key_prefix, b"k2"].concat(); - let k3 = [key_prefix, b"k3"].concat(); - let k4 = [key_prefix, b"k4"].concat(); + } - for k in &[&k1, &k2, &k3, &k4] { - prepare(k.as_slice()); - } + let mut test = + |key_prefix: &[u8], + assertion_level, + prepare: &mut dyn for<'a> FnMut(&mut RocksEngine, &'a [u8])| { + let k1 = [key_prefix, b"k1"].concat(); + let k2 = [key_prefix, b"k2"].concat(); + let k3 = [key_prefix, b"k3"].concat(); + let k4 = [key_prefix, b"k4"].concat(); + + for k in &[&k1, &k2, &k3, &k4] { + prepare(&mut engine, k.as_slice()); + } - // Assertion passes (optimistic). - prewrite_put( - &k1, - b"v1", - 10, - SkipPessimisticCheck, - 0, - Assertion::NotExist, - assertion_level, - true, - ); - must_commit(&engine, &k1, 10, 15); + // Assertion passes (optimistic). + prewrite_put( + &mut engine, + &k1, + b"v1", + 10, + SkipPessimisticCheck, + 0, + Assertion::NotExist, + assertion_level, + true, + ); + must_commit(&mut engine, &k1, 10, 15); - prewrite_put( - &k1, - b"v1", - 20, - SkipPessimisticCheck, - 0, - Assertion::Exist, - assertion_level, - true, - ); - must_commit(&engine, &k1, 20, 25); - - // Assertion passes (pessimistic). - prewrite_put( - &k2, - b"v2", - 10, - DoPessimisticCheck, - 11, - Assertion::NotExist, - assertion_level, - true, - ); - must_commit(&engine, &k2, 10, 15); - - prewrite_put( - &k2, - b"v2", - 20, - DoPessimisticCheck, - 21, - Assertion::Exist, - assertion_level, - true, - ); - must_commit(&engine, &k2, 20, 25); - - // Optimistic transaction assertion fail on fast/strict level. 
- let pass = assertion_level == AssertionLevel::Off; - prewrite_put( - &k1, - b"v1", - 30, - SkipPessimisticCheck, - 0, - Assertion::NotExist, - assertion_level, - pass, - ); - prewrite_put( - &k3, - b"v3", - 30, - SkipPessimisticCheck, - 0, - Assertion::Exist, - assertion_level, - pass, - ); - must_rollback(&engine, &k1, 30, true); - must_rollback(&engine, &k3, 30, true); - - // Pessimistic transaction assertion fail on fast/strict level if assertion - // happens during amending pessimistic lock. - let pass = assertion_level == AssertionLevel::Off; - prewrite_put( - &k2, - b"v2", - 30, - DoPessimisticCheck, - 31, - Assertion::NotExist, - assertion_level, - pass, - ); - prewrite_put( - &k4, - b"v4", - 30, - DoPessimisticCheck, - 31, - Assertion::Exist, - assertion_level, - pass, - ); - must_rollback(&engine, &k2, 30, true); - must_rollback(&engine, &k4, 30, true); - - // Pessimistic transaction fail on strict level no matter what - // `pessimistic_action` is. - let pass = assertion_level != AssertionLevel::Strict; - prewrite_put( - &k1, - b"v1", - 40, - SkipPessimisticCheck, - 41, - Assertion::NotExist, - assertion_level, - pass, - ); - prewrite_put( - &k3, - b"v3", - 40, - SkipPessimisticCheck, - 41, - Assertion::Exist, - assertion_level, - pass, - ); - must_rollback(&engine, &k1, 40, true); - must_rollback(&engine, &k3, 40, true); - - must_acquire_pessimistic_lock(&engine, &k2, &k2, 40, 41); - must_acquire_pessimistic_lock(&engine, &k4, &k4, 40, 41); - prewrite_put( - &k2, - b"v2", - 40, - DoPessimisticCheck, - 41, - Assertion::NotExist, - assertion_level, - pass, - ); - prewrite_put( - &k4, - b"v4", - 40, - DoPessimisticCheck, - 41, - Assertion::Exist, - assertion_level, - pass, - ); - must_rollback(&engine, &k1, 40, true); - must_rollback(&engine, &k3, 40, true); - }; + prewrite_put( + &mut engine, + &k1, + b"v1", + 20, + SkipPessimisticCheck, + 0, + Assertion::Exist, + assertion_level, + true, + ); + must_commit(&mut engine, &k1, 20, 25); + + // Assertion 
passes (pessimistic). + prewrite_put( + &mut engine, + &k2, + b"v2", + 10, + DoPessimisticCheck, + 11, + Assertion::NotExist, + assertion_level, + true, + ); + must_commit(&mut engine, &k2, 10, 15); + + prewrite_put( + &mut engine, + &k2, + b"v2", + 20, + DoPessimisticCheck, + 21, + Assertion::Exist, + assertion_level, + true, + ); + must_commit(&mut engine, &k2, 20, 25); + + // Optimistic transaction assertion fail on fast/strict level. + let pass = assertion_level == AssertionLevel::Off; + prewrite_put( + &mut engine, + &k1, + b"v1", + 30, + SkipPessimisticCheck, + 0, + Assertion::NotExist, + assertion_level, + pass, + ); + prewrite_put( + &mut engine, + &k3, + b"v3", + 30, + SkipPessimisticCheck, + 0, + Assertion::Exist, + assertion_level, + pass, + ); + must_rollback(&mut engine, &k1, 30, true); + must_rollback(&mut engine, &k3, 30, true); + + // Pessimistic transaction assertion fail on fast/strict level if assertion + // happens during amending pessimistic lock. + let pass = assertion_level == AssertionLevel::Off; + prewrite_put( + &mut engine, + &k2, + b"v2", + 30, + DoPessimisticCheck, + 31, + Assertion::NotExist, + assertion_level, + pass, + ); + prewrite_put( + &mut engine, + &k4, + b"v4", + 30, + DoPessimisticCheck, + 31, + Assertion::Exist, + assertion_level, + pass, + ); + must_rollback(&mut engine, &k2, 30, true); + must_rollback(&mut engine, &k4, 30, true); + + // Pessimistic transaction fail on strict level no matter what + // `pessimistic_action` is. 
+ let pass = assertion_level != AssertionLevel::Strict; + prewrite_put( + &mut engine, + &k1, + b"v1", + 40, + SkipPessimisticCheck, + 41, + Assertion::NotExist, + assertion_level, + pass, + ); + prewrite_put( + &mut engine, + &k3, + b"v3", + 40, + SkipPessimisticCheck, + 41, + Assertion::Exist, + assertion_level, + pass, + ); + must_rollback(&mut engine, &k1, 40, true); + must_rollback(&mut engine, &k3, 40, true); + + must_acquire_pessimistic_lock(&mut engine, &k2, &k2, 40, 41); + must_acquire_pessimistic_lock(&mut engine, &k4, &k4, 40, 41); + prewrite_put( + &mut engine, + &k2, + b"v2", + 40, + DoPessimisticCheck, + 41, + Assertion::NotExist, + assertion_level, + pass, + ); + prewrite_put( + &mut engine, + &k4, + b"v4", + 40, + DoPessimisticCheck, + 41, + Assertion::Exist, + assertion_level, + pass, + ); + must_rollback(&mut engine, &k1, 40, true); + must_rollback(&mut engine, &k3, 40, true); + }; - let prepare_rollback = |k: &'_ _| must_rollback(&engine, k, 3, true); - let prepare_lock_record = |k: &'_ _| { - must_prewrite_lock(&engine, k, k, 3); - must_commit(&engine, k, 3, 5); + let mut prepare_rollback = + |engine: &mut RocksEngine, k: &'_ _| must_rollback(engine, k, 3, true); + let mut prepare_lock_record = |engine: &mut RocksEngine, k: &'_ _| { + must_prewrite_lock(engine, k, k, 3); + must_commit(engine, k, 3, 5); }; - let prepare_delete = |k: &'_ _| { - must_prewrite_put(&engine, k, b"deleted-value", k, 3); - must_commit(&engine, k, 3, 5); - must_prewrite_delete(&engine, k, k, 7); - must_commit(&engine, k, 7, 9); + let mut prepare_delete = |engine: &mut RocksEngine, k: &'_ _| { + must_prewrite_put(engine, k, b"deleted-value", k, 3); + must_commit(engine, k, 3, 5); + must_prewrite_delete(engine, k, k, 7); + must_commit(engine, k, 7, 9); }; - let prepare_gc_fence = |k: &'_ _| { - must_prewrite_put(&engine, k, b"deleted-value", k, 3); - must_commit(&engine, k, 3, 5); - must_cleanup_with_gc_fence(&engine, k, 5, 0, 7, true); + let mut prepare_gc_fence = 
|engine: &mut RocksEngine, k: &'_ _| { + must_prewrite_put(engine, k, b"deleted-value", k, 3); + must_commit(engine, k, 3, 5); + must_cleanup_with_gc_fence(engine, k, 5, 0, 7, true); }; // Test multiple cases without recreating the engine. So use a increasing key // prefix to avoid each case interfering each other. let mut key_prefix = b'a'; - let mut test_all_levels = |prepare| { + let mut test_all_levels = |prepare: &mut dyn for<'a> FnMut(&mut RocksEngine, &'a [u8])| { test(&[key_prefix], AssertionLevel::Off, prepare); key_prefix += 1; test(&[key_prefix], AssertionLevel::Fast, prepare); @@ -2121,26 +2142,33 @@ pub mod tests { key_prefix += 1; }; - test_all_levels(&|_| ()); - test_all_levels(&prepare_rollback); - test_all_levels(&prepare_lock_record); - test_all_levels(&prepare_delete); - test_all_levels(&prepare_gc_fence); + test_all_levels(&mut |_, _| ()); + test_all_levels(&mut prepare_rollback); + test_all_levels(&mut prepare_lock_record); + test_all_levels(&mut prepare_delete); + test_all_levels(&mut prepare_gc_fence); } #[test] fn test_deferred_constraint_check() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let key = b"key"; let key2 = b"key2"; let value = b"value"; // 1. 
write conflict - must_prewrite_put(&engine, key, value, key, 1); - must_commit(&engine, key, 1, 5); - must_pessimistic_prewrite_insert(&engine, key2, value, key, 3, 3, SkipPessimisticCheck); - let err = - must_pessimistic_prewrite_insert_err(&engine, key, value, key, 3, 3, DoConstraintCheck); + must_prewrite_put(&mut engine, key, value, key, 1); + must_commit(&mut engine, key, 1, 5); + must_pessimistic_prewrite_insert(&mut engine, key2, value, key, 3, 3, SkipPessimisticCheck); + let err = must_pessimistic_prewrite_insert_err( + &mut engine, + key, + value, + key, + 3, + 3, + DoConstraintCheck, + ); assert!(matches!( err, Error(box ErrorInner::WriteConflict { @@ -2150,10 +2178,10 @@ pub mod tests { )); // 2. unique constraint fail - must_prewrite_put(&engine, key, value, key, 11); - must_commit(&engine, key, 11, 12); + must_prewrite_put(&mut engine, key, value, key, 11); + must_commit(&mut engine, key, 11, 12); let err = must_pessimistic_prewrite_insert_err( - &engine, + &mut engine, key, value, key, @@ -2164,8 +2192,8 @@ pub mod tests { assert!(matches!(err, Error(box ErrorInner::AlreadyExist { .. }))); // 3. 
success - must_prewrite_delete(&engine, key, key, 21); - must_commit(&engine, key, 21, 22); - must_pessimistic_prewrite_insert(&engine, key, value, key, 23, 23, DoConstraintCheck); + must_prewrite_delete(&mut engine, key, key, 21); + must_commit(&mut engine, key, 21, 22); + must_pessimistic_prewrite_insert(&mut engine, key, value, key, 23, 23, DoConstraintCheck); } } diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index 65eafa157ce..fdf060d950d 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -18,7 +18,7 @@ use crate::storage::{ }; pub fn must_prewrite_put_impl( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -56,7 +56,7 @@ pub fn must_prewrite_put_impl( } pub fn must_prewrite_insert_impl( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -94,7 +94,7 @@ pub fn must_prewrite_insert_impl( } pub fn must_prewrite_put_impl_with_should_not_exist( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -164,7 +164,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( } pub fn must_prewrite_put( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -190,7 +190,7 @@ pub fn must_prewrite_put( } pub fn must_prewrite_put_on_region( - engine: &E, + engine: &mut E, region_id: u64, key: &[u8], value: &[u8], @@ -219,7 +219,7 @@ pub fn must_prewrite_put_on_region( } pub fn must_pessimistic_prewrite_put( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -247,7 +247,7 @@ pub fn must_pessimistic_prewrite_put( } pub fn must_pessimistic_prewrite_insert( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -275,7 +275,7 @@ pub fn must_pessimistic_prewrite_insert( } pub fn must_pessimistic_prewrite_put_with_ttl( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -304,7 +304,7 @@ pub fn must_pessimistic_prewrite_put_with_ttl( } pub fn must_prewrite_put_for_large_txn( - 
engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -341,7 +341,7 @@ pub fn must_prewrite_put_for_large_txn( } pub fn must_prewrite_put_async_commit( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -370,7 +370,7 @@ pub fn must_prewrite_put_async_commit( } pub fn must_pessimistic_prewrite_put_async_commit( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -426,7 +426,7 @@ fn default_txn_props( } pub fn must_prewrite_put_err_impl( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -457,7 +457,7 @@ pub fn must_prewrite_put_err_impl( } pub fn must_prewrite_insert_err_impl( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -488,7 +488,7 @@ pub fn must_prewrite_insert_err_impl( } pub fn must_prewrite_put_err_impl_with_should_not_exist( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -535,7 +535,7 @@ pub fn must_prewrite_put_err_impl_with_should_not_exist( } pub fn must_prewrite_put_err( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -558,7 +558,7 @@ pub fn must_prewrite_put_err( } pub fn must_pessimistic_prewrite_put_err( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -583,7 +583,7 @@ pub fn must_pessimistic_prewrite_put_err( } pub fn must_pessimistic_prewrite_insert_err( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -608,7 +608,7 @@ pub fn must_pessimistic_prewrite_insert_err( } pub fn must_retry_pessimistic_prewrite_put_err( - engine: &E, + engine: &mut E, key: &[u8], value: &[u8], pk: &[u8], @@ -635,7 +635,7 @@ pub fn must_retry_pessimistic_prewrite_put_err( } fn must_prewrite_delete_impl( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -675,7 +675,7 @@ fn must_prewrite_delete_impl( } pub fn must_prewrite_delete( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -692,7 +692,7 @@ pub fn must_prewrite_delete( } pub fn 
must_prewrite_delete_on_region( - engine: &E, + engine: &mut E, region_id: u64, key: &[u8], pk: &[u8], @@ -710,7 +710,7 @@ pub fn must_prewrite_delete_on_region( } pub fn must_pessimistic_prewrite_delete( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -721,7 +721,7 @@ pub fn must_pessimistic_prewrite_delete( } fn must_prewrite_lock_impl( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -752,12 +752,17 @@ fn must_prewrite_lock_impl( .unwrap(); } -pub fn must_prewrite_lock(engine: &E, key: &[u8], pk: &[u8], ts: impl Into) { +pub fn must_prewrite_lock( + engine: &mut E, + key: &[u8], + pk: &[u8], + ts: impl Into, +) { must_prewrite_lock_impl(engine, key, pk, ts, TimeStamp::zero(), SkipPessimisticCheck); } pub fn must_prewrite_lock_err( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -780,7 +785,7 @@ pub fn must_prewrite_lock_err( } pub fn must_pessimistic_prewrite_lock( - engine: &E, + engine: &mut E, key: &[u8], pk: &[u8], ts: impl Into, @@ -791,7 +796,7 @@ pub fn must_pessimistic_prewrite_lock( } pub fn must_rollback( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, protect_rollback: bool, @@ -813,7 +818,7 @@ pub fn must_rollback( write(engine, &ctx, txn.into_modifies()); } -pub fn must_rollback_err(engine: &E, key: &[u8], start_ts: impl Into) { +pub fn must_rollback_err(engine: &mut E, key: &[u8], start_ts: impl Into) { let snapshot = engine.snapshot(Default::default()).unwrap(); let start_ts = start_ts.into(); let cm = ConcurrencyManager::new(start_ts); diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index bdb4bca2110..4b780f5bf2d 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -81,7 +81,7 @@ mod tests { } fn test_atomic_process_write_impl() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = 
concurrency_manager::ConcurrencyManager::new(1.into()); let raw_keys = vec![b"ra", b"rz"]; let raw_values = vec![b"valuea", b"valuez"]; diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 7f6f4879a3d..2678effbf7b 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -178,7 +178,7 @@ pub mod tests { }; pub fn must_success( - engine: &E, + engine: &mut E, key: &[u8], lock_ts: impl Into, expect_status: SecondaryLocksStatus, @@ -215,12 +215,13 @@ pub mod tests { #[test] fn test_check_async_commit_secondary_locks() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut engine_clone = engine.clone(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); - let check_secondary = |key, ts| { - let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut check_secondary = |key, ts| { + let snapshot = engine_clone.snapshot(Default::default()).unwrap(); let key = Key::from_raw(key); let ts = TimeStamp::new(ts); let command = crate::storage::txn::commands::CheckSecondaryLocks { @@ -242,7 +243,7 @@ pub mod tests { ) .unwrap(); if !result.to_be_write.modifies.is_empty() { - engine.write(&ctx, result.to_be_write).unwrap(); + engine_clone.write(&ctx, result.to_be_write).unwrap(); } if let ProcessResult::SecondaryLocksStatus { status } = result.pr { status @@ -251,11 +252,11 @@ pub mod tests { } }; - must_prewrite_lock(&engine, b"k1", b"key", 1); - must_commit(&engine, b"k1", 1, 3); - must_rollback(&engine, b"k1", 5, false); - must_prewrite_lock(&engine, b"k1", b"key", 7); - must_commit(&engine, b"k1", 7, 9); + must_prewrite_lock(&mut engine, b"k1", b"key", 1); + must_commit(&mut engine, b"k1", 1, 3); + must_rollback(&mut engine, b"k1", 5, false); + must_prewrite_lock(&mut engine, b"k1", b"key", 7); + must_commit(&mut engine, b"k1", 7, 9); // 
Lock CF has no lock // @@ -269,20 +270,20 @@ pub mod tests { check_secondary(b"k1", 7), SecondaryLocksStatus::Committed(9.into()) ); - must_get_commit_ts(&engine, b"k1", 7, 9); + must_get_commit_ts(&mut engine, b"k1", 7, 9); assert_eq!(check_secondary(b"k1", 5), SecondaryLocksStatus::RolledBack); - must_get_rollback_ts(&engine, b"k1", 5); + must_get_rollback_ts(&mut engine, b"k1", 5); assert_eq!( check_secondary(b"k1", 1), SecondaryLocksStatus::Committed(3.into()) ); - must_get_commit_ts(&engine, b"k1", 1, 3); + must_get_commit_ts(&mut engine, b"k1", 1, 3); assert_eq!(check_secondary(b"k1", 6), SecondaryLocksStatus::RolledBack); - must_get_rollback_protected(&engine, b"k1", 6, true); + must_get_rollback_protected(&mut engine, b"k1", 6, true); // ---------------------------- - must_acquire_pessimistic_lock(&engine, b"k1", b"key", 11, 11); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"key", 11, 11); // Lock CF has a pessimistic lock // @@ -294,11 +295,11 @@ pub mod tests { let status = check_secondary(b"k1", 11); assert_eq!(status, SecondaryLocksStatus::RolledBack); - must_get_rollback_protected(&engine, b"k1", 11, true); + must_get_rollback_protected(&mut engine, b"k1", 11, true); // ---------------------------- - must_prewrite_lock(&engine, b"k1", b"key", 13); + must_prewrite_lock(&mut engine, b"k1", b"key", 13); // Lock CF has an optimistic lock // @@ -313,11 +314,11 @@ pub mod tests { SecondaryLocksStatus::Locked(_) => {} res => panic!("unexpected lock status: {:?}", res), } - must_locked(&engine, b"k1", 13); + must_locked(&mut engine, b"k1", 13); // ---------------------------- - must_commit(&engine, b"k1", 13, 15); + must_commit(&mut engine, b"k1", 13, 15); // Lock CF has an optimistic lock // @@ -333,12 +334,12 @@ pub mod tests { SecondaryLocksStatus::RolledBack => {} res => panic!("unexpected lock status: {:?}", res), } - must_get_rollback_protected(&engine, b"k1", 14, true); + must_get_rollback_protected(&mut engine, b"k1", 14, true); match 
check_secondary(b"k1", 15) { SecondaryLocksStatus::RolledBack => {} res => panic!("unexpected lock status: {:?}", res), } - must_get_overlapped_rollback(&engine, b"k1", 15, 13, WriteType::Lock, Some(0)); + must_get_overlapped_rollback(&mut engine, b"k1", 15, 13, WriteType::Lock, Some(0)); } } diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 24f69e9a237..ef323cf206b 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -166,7 +166,7 @@ pub mod tests { }; pub fn must_success( - engine: &E, + engine: &mut E, primary_key: &[u8], lock_ts: impl Into, caller_start_ts: impl Into, @@ -213,7 +213,7 @@ pub mod tests { } pub fn must_err( - engine: &E, + engine: &mut E, primary_key: &[u8], lock_ts: impl Into, caller_start_ts: impl Into, @@ -285,15 +285,15 @@ pub mod tests { #[test] fn test_check_async_commit_txn_status() { let do_test = |rollback_if_not_exist: bool| { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let r = rollback_if_not_exist; // case 1: primary is prewritten (optimistic) - must_prewrite_put_async_commit(&engine, b"k1", b"v", b"k1", &Some(vec![]), 1, 2); + must_prewrite_put_async_commit(&mut engine, b"k1", b"v", b"k1", &Some(vec![]), 1, 2); // All following check_txn_status should return the unchanged lock information // caller_start_ts == current_ts == 0 must_success( - &engine, + &mut engine, b"k1", 1, 0, @@ -305,7 +305,7 @@ pub mod tests { ); // caller_start_ts != 0 must_success( - &engine, + &mut engine, b"k1", 1, 5, @@ -317,7 +317,7 @@ pub mod tests { ); // current_ts != 0 must_success( - &engine, + &mut engine, b"k1", 1, 0, @@ -329,7 +329,7 @@ pub mod tests { ); // caller_start_ts != 0 && current_ts != 0 must_success( - &engine, + &mut engine, b"k1", 1, 10, @@ -341,7 +341,7 @@ pub mod tests { ); // caller_start_ts == u64::MAX must_success( - &engine, + &mut engine, 
b"k1", 1, TimeStamp::max(), @@ -353,7 +353,7 @@ pub mod tests { ); // current_ts == u64::MAX must_success( - &engine, + &mut engine, b"k1", 1, 12, @@ -365,7 +365,7 @@ pub mod tests { ); // force_sync_commit = true must_success( - &engine, + &mut engine, b"k1", 1, 12, @@ -375,13 +375,13 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, b"k1"); - must_get_rollback_protected(&engine, b"k1", 1, false); + must_unlocked(&mut engine, b"k1"); + must_get_rollback_protected(&mut engine, b"k1", 1, false); // case 2: primary is prewritten (pessimistic) - must_acquire_pessimistic_lock(&engine, b"k2", b"k2", 15, 15); + must_acquire_pessimistic_lock(&mut engine, b"k2", b"k2", 15, 15); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k2", b"v", b"k2", @@ -394,7 +394,7 @@ pub mod tests { // All following check_txn_status should return the unchanged lock information // caller_start_ts == current_ts == 0 must_success( - &engine, + &mut engine, b"k2", 15, 0, @@ -406,7 +406,7 @@ pub mod tests { ); // caller_start_ts != 0 must_success( - &engine, + &mut engine, b"k2", 15, 18, @@ -418,7 +418,7 @@ pub mod tests { ); // current_ts != 0 must_success( - &engine, + &mut engine, b"k2", 15, 0, @@ -430,7 +430,7 @@ pub mod tests { ); // caller_start_ts != 0 && current_ts != 0 must_success( - &engine, + &mut engine, b"k2", 15, 19, @@ -442,7 +442,7 @@ pub mod tests { ); // caller_start_ts == u64::MAX must_success( - &engine, + &mut engine, b"k2", 15, TimeStamp::max(), @@ -454,7 +454,7 @@ pub mod tests { ); // current_ts == u64::MAX must_success( - &engine, + &mut engine, b"k2", 15, 20, @@ -466,7 +466,7 @@ pub mod tests { ); // force_sync_commit = true must_success( - &engine, + &mut engine, b"k2", 15, 20, @@ -476,15 +476,15 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, b"k2"); - must_get_rollback_protected(&engine, b"k2", 15, true); + must_unlocked(&mut engine, b"k2"); + must_get_rollback_protected(&mut engine, b"k2", 15, 
true); // case 3: pessimistic transaction with two keys (large txn), secondary is // prewritten first - must_acquire_pessimistic_lock_for_large_txn(&engine, b"k3", b"k3", 20, 20, 100); - must_acquire_pessimistic_lock_for_large_txn(&engine, b"k4", b"k3", 20, 25, 100); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, b"k3", b"k3", 20, 20, 100); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, b"k4", b"k3", 20, 25, 100); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k4", b"v", b"k3", @@ -497,7 +497,7 @@ pub mod tests { // the client must call check_txn_status with caller_start_ts == current_ts == // 0, should not push must_success( - &engine, + &mut engine, b"k3", 20, 0, @@ -510,10 +510,10 @@ pub mod tests { // case 4: pessimistic transaction with two keys (not large txn), secondary is // prewritten first - must_acquire_pessimistic_lock_with_ttl(&engine, b"k5", b"k5", 30, 30, 100); - must_acquire_pessimistic_lock_with_ttl(&engine, b"k6", b"k5", 30, 35, 100); + must_acquire_pessimistic_lock_with_ttl(&mut engine, b"k5", b"k5", 30, 30, 100); + must_acquire_pessimistic_lock_with_ttl(&mut engine, b"k6", b"k5", 30, 35, 100); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k6", b"v", b"k5", @@ -526,7 +526,7 @@ pub mod tests { // the client must call check_txn_status with caller_start_ts == current_ts == // 0, should not push must_success( - &engine, + &mut engine, b"k5", 30, 0, @@ -543,7 +543,7 @@ pub mod tests { } fn test_check_txn_status_impl(rollback_if_not_exist: bool) { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); @@ -554,7 +554,7 @@ pub mod tests { // Try to check a not exist thing. if r { must_success( - &engine, + &mut engine, k, ts(3, 0), ts(3, 1), @@ -565,20 +565,29 @@ pub mod tests { |s| s == LockNotExist, ); // A protected rollback record will be written. 
- must_get_rollback_protected(&engine, k, ts(3, 0), true); + must_get_rollback_protected(&mut engine, k, ts(3, 0), true); } else { - must_err(&engine, k, ts(3, 0), ts(3, 1), ts(3, 2), r, false, false); + must_err( + &mut engine, + k, + ts(3, 0), + ts(3, 1), + ts(3, 2), + r, + false, + false, + ); } // Lock the key with TTL=100. - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(5, 0), 100, 0); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(5, 0), 100, 0); // The initial min_commit_ts is start_ts + 1. - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(5, 1), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(5, 1), false); // CheckTxnStatus with caller_start_ts = 0 and current_ts = 0 should just return // the information of the lock without changing it. must_success( - &engine, + &mut engine, k, ts(5, 0), 0, @@ -591,7 +600,7 @@ pub mod tests { // Update min_commit_ts to current_ts. must_success( - &engine, + &mut engine, k, ts(5, 0), ts(6, 0), @@ -601,12 +610,12 @@ pub mod tests { false, uncommitted(100, ts(7, 0), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(7, 0), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(7, 0), false); // Update min_commit_ts to caller_start_ts + 1 if current_ts < caller_start_ts. // This case should be impossible. But if it happens, we prevents it. must_success( - &engine, + &mut engine, k, ts(5, 0), ts(9, 0), @@ -616,13 +625,13 @@ pub mod tests { false, uncommitted(100, ts(9, 1), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(9, 1), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(9, 1), false); // caller_start_ts < lock.min_commit_ts < current_ts // When caller_start_ts < lock.min_commit_ts, no need to update it, but pushed // should be true. 
must_success( - &engine, + &mut engine, k, ts(5, 0), ts(8, 0), @@ -632,11 +641,11 @@ pub mod tests { false, uncommitted(100, ts(9, 1), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(9, 1), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(9, 1), false); // current_ts < lock.min_commit_ts < caller_start_ts must_success( - &engine, + &mut engine, k, ts(5, 0), ts(11, 0), @@ -646,12 +655,12 @@ pub mod tests { false, uncommitted(100, ts(11, 1), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(11, 1), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(11, 1), false); // For same caller_start_ts and current_ts, update min_commit_ts to // caller_start_ts + 1 must_success( - &engine, + &mut engine, k, ts(5, 0), ts(12, 0), @@ -661,11 +670,11 @@ pub mod tests { false, uncommitted(100, ts(12, 1), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(12, 1), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(12, 1), false); // Logical time is also considered in the comparing must_success( - &engine, + &mut engine, k, ts(5, 0), ts(13, 1), @@ -675,14 +684,14 @@ pub mod tests { false, uncommitted(100, ts(13, 3), true), ); - must_large_txn_locked(&engine, k, ts(5, 0), 100, ts(13, 3), false); + must_large_txn_locked(&mut engine, k, ts(5, 0), 100, ts(13, 3), false); - must_commit(&engine, k, ts(5, 0), ts(15, 0)); - must_unlocked(&engine, k); + must_commit(&mut engine, k, ts(5, 0), ts(15, 0)); + must_unlocked(&mut engine, k); // Check committed key will get the commit ts. must_success( - &engine, + &mut engine, k, ts(5, 0), ts(12, 0), @@ -692,14 +701,14 @@ pub mod tests { false, committed(ts(15, 0)), ); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(20, 0), 100, 0); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(20, 0), 100, 0); // Check a committed transaction when there is another lock. 
Expect getting the // commit ts. must_success( - &engine, + &mut engine, k, ts(5, 0), ts(12, 0), @@ -714,7 +723,7 @@ pub mod tests { // `rollback_if_not_exist` is set. if r { must_success( - &engine, + &mut engine, k, ts(6, 0), ts(12, 0), @@ -726,7 +735,7 @@ pub mod tests { ); // And a rollback record will be written. must_seek_write( - &engine, + &mut engine, k, ts(6, 0), ts(6, 0), @@ -734,13 +743,22 @@ pub mod tests { WriteType::Rollback, ); } else { - must_err(&engine, k, ts(6, 0), ts(12, 0), ts(12, 0), r, false, false); + must_err( + &mut engine, + k, + ts(6, 0), + ts(12, 0), + ts(12, 0), + r, + false, + false, + ); } // TTL check is based on physical time (in ms). When logical time's difference // is larger than TTL, the lock won't be resolved. must_success( - &engine, + &mut engine, k, ts(20, 0), ts(21, 105), @@ -750,11 +768,11 @@ pub mod tests { false, uncommitted(100, ts(21, 106), true), ); - must_large_txn_locked(&engine, k, ts(20, 0), 100, ts(21, 106), false); + must_large_txn_locked(&mut engine, k, ts(20, 0), 100, ts(21, 106), false); // If physical time's difference exceeds TTL, lock will be resolved. must_success( - &engine, + &mut engine, k, ts(20, 0), ts(121, 0), @@ -764,9 +782,9 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); must_seek_write( - &engine, + &mut engine, k, TimeStamp::max(), ts(20, 0), @@ -775,10 +793,10 @@ pub mod tests { ); // Push the min_commit_ts of pessimistic locks. 
- must_acquire_pessimistic_lock_for_large_txn(&engine, k, k, ts(4, 0), ts(130, 0), 200); - must_large_txn_locked(&engine, k, ts(4, 0), 200, ts(130, 1), true); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(4, 0), ts(130, 0), 200); + must_large_txn_locked(&mut engine, k, ts(4, 0), 200, ts(130, 1), true); must_success( - &engine, + &mut engine, k, ts(4, 0), ts(135, 0), @@ -788,20 +806,28 @@ pub mod tests { false, uncommitted(200, ts(135, 1), true), ); - must_large_txn_locked(&engine, k, ts(4, 0), 200, ts(135, 1), true); + must_large_txn_locked(&mut engine, k, ts(4, 0), 200, ts(135, 1), true); // Commit the key. - must_pessimistic_prewrite_put(&engine, k, v, k, ts(4, 0), ts(130, 0), DoPessimisticCheck); - must_commit(&engine, k, ts(4, 0), ts(140, 0)); - must_unlocked(&engine, k); - must_get_commit_ts(&engine, k, ts(4, 0), ts(140, 0)); + must_pessimistic_prewrite_put( + &mut engine, + k, + v, + k, + ts(4, 0), + ts(130, 0), + DoPessimisticCheck, + ); + must_commit(&mut engine, k, ts(4, 0), ts(140, 0)); + must_unlocked(&mut engine, k); + must_get_commit_ts(&mut engine, k, ts(4, 0), ts(140, 0)); // Now the transactions are intersecting: // T1: start_ts = 5, commit_ts = 15 // T2: start_ts = 20, rollback // T3: start_ts = 4, commit_ts = 140 must_success( - &engine, + &mut engine, k, ts(4, 0), ts(10, 0), @@ -812,7 +838,7 @@ pub mod tests { committed(ts(140, 0)), ); must_success( - &engine, + &mut engine, k, ts(5, 0), ts(10, 0), @@ -823,7 +849,7 @@ pub mod tests { committed(ts(15, 0)), ); must_success( - &engine, + &mut engine, k, ts(20, 0), ts(10, 0), @@ -835,9 +861,9 @@ pub mod tests { ); // Rollback expired pessimistic lock. 
- must_acquire_pessimistic_lock_for_large_txn(&engine, k, k, ts(150, 0), ts(150, 0), 100); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(150, 0), ts(150, 0), 100); must_success( - &engine, + &mut engine, k, ts(150, 0), ts(160, 0), @@ -847,9 +873,9 @@ pub mod tests { false, uncommitted(100, ts(160, 1), true), ); - must_large_txn_locked(&engine, k, ts(150, 0), 100, ts(160, 1), true); + must_large_txn_locked(&mut engine, k, ts(150, 0), 100, ts(160, 1), true); must_success( - &engine, + &mut engine, k, ts(150, 0), ts(160, 0), @@ -859,10 +885,10 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); // Rolling back a pessimistic lock should leave Rollback mark. must_seek_write( - &engine, + &mut engine, k, TimeStamp::max(), ts(150, 0), @@ -871,10 +897,10 @@ pub mod tests { ); // Rollback when current_ts is u64::max_value() - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(270, 0), 100, 0); - must_large_txn_locked(&engine, k, ts(270, 0), 100, ts(270, 1), false); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(270, 0), 100, 0); + must_large_txn_locked(&mut engine, k, ts(270, 0), 100, ts(270, 1), false); must_success( - &engine, + &mut engine, k, ts(270, 0), ts(271, 0), @@ -884,9 +910,9 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); must_seek_write( - &engine, + &mut engine, k, TimeStamp::max(), ts(270, 0), @@ -894,10 +920,10 @@ pub mod tests { WriteType::Rollback, ); - must_acquire_pessimistic_lock_for_large_txn(&engine, k, k, ts(280, 0), ts(280, 0), 100); - must_large_txn_locked(&engine, k, ts(280, 0), 100, ts(280, 1), true); + must_acquire_pessimistic_lock_for_large_txn(&mut engine, k, k, ts(280, 0), ts(280, 0), 100); + must_large_txn_locked(&mut engine, k, ts(280, 0), 100, ts(280, 1), true); must_success( - &engine, + &mut engine, k, ts(280, 0), ts(281, 0), @@ -907,9 +933,9 @@ pub mod tests { false, |s| s == 
TtlExpire, ); - must_unlocked(&engine, k); + must_unlocked(&mut engine, k); must_seek_write( - &engine, + &mut engine, k, TimeStamp::max(), ts(280, 0), @@ -918,9 +944,9 @@ pub mod tests { ); // Don't push forward the min_commit_ts if the min_commit_ts of the lock is 0. - must_acquire_pessimistic_lock_with_ttl(&engine, k, k, ts(290, 0), ts(290, 0), 100); + must_acquire_pessimistic_lock_with_ttl(&mut engine, k, k, ts(290, 0), ts(290, 0), 100); must_success( - &engine, + &mut engine, k, ts(290, 0), ts(300, 0), @@ -930,11 +956,11 @@ pub mod tests { false, uncommitted(100, TimeStamp::zero(), false), ); - must_large_txn_locked(&engine, k, ts(290, 0), 100, TimeStamp::zero(), true); - pessimistic_rollback::tests::must_success(&engine, k, ts(290, 0), ts(290, 0)); + must_large_txn_locked(&mut engine, k, ts(290, 0), 100, TimeStamp::zero(), true); + pessimistic_rollback::tests::must_success(&mut engine, k, ts(290, 0), ts(290, 0)); must_prewrite_put_impl( - &engine, + &mut engine, k, v, k, @@ -953,7 +979,7 @@ pub mod tests { kvproto::kvrpcpb::AssertionLevel::Off, ); must_success( - &engine, + &mut engine, k, ts(300, 0), ts(310, 0), @@ -963,15 +989,15 @@ pub mod tests { false, uncommitted(100, TimeStamp::zero(), false), ); - must_large_txn_locked(&engine, k, ts(300, 0), 100, TimeStamp::zero(), false); - must_rollback(&engine, k, ts(300, 0), false); + must_large_txn_locked(&mut engine, k, ts(300, 0), 100, TimeStamp::zero(), false); + must_rollback(&mut engine, k, ts(300, 0), false); - must_prewrite_put_for_large_txn(&engine, k, v, k, ts(310, 0), 100, 0); - must_large_txn_locked(&engine, k, ts(310, 0), 100, ts(310, 1), false); + must_prewrite_put_for_large_txn(&mut engine, k, v, k, ts(310, 0), 100, 0); + must_large_txn_locked(&mut engine, k, ts(310, 0), 100, ts(310, 1), false); // Don't push forward the min_commit_ts if caller_start_ts is max, but pushed // should be true. 
must_success( - &engine, + &mut engine, k, ts(310, 0), TimeStamp::max(), @@ -981,9 +1007,9 @@ pub mod tests { false, uncommitted(100, ts(310, 1), true), ); - must_commit(&engine, k, ts(310, 0), ts(315, 0)); + must_commit(&mut engine, k, ts(310, 0), ts(315, 0)); must_success( - &engine, + &mut engine, k, ts(310, 0), TimeStamp::max(), @@ -1003,7 +1029,7 @@ pub mod tests { #[test] fn test_check_txn_status_resolving_pessimistic_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v = b"v1"; let ts = TimeStamp::compose; @@ -1012,7 +1038,7 @@ pub mod tests { // Path: there is no commit or rollback record, no rollback record should be // written. must_success( - &engine, + &mut engine, k, ts(3, 0), ts(3, 0), @@ -1022,17 +1048,26 @@ pub mod tests { true, |s| s == LockNotExistDoNothing, ); - must_get_rollback_ts_none(&engine, k, ts(5, 0)); + must_get_rollback_ts_none(&mut engine, k, ts(5, 0)); // Path: there is no commit or rollback record, error should be reported if // rollback_if_not_exist is set to false. - must_err(&engine, k, ts(3, 0), ts(5, 0), ts(5, 0), false, false, true); + must_err( + &mut engine, + k, + ts(3, 0), + ts(5, 0), + ts(5, 0), + false, + false, + true, + ); // Path: the pessimistic primary key lock does exist, and it's not expired yet. - must_acquire_pessimistic_lock_with_ttl(&engine, k, k, ts(10, 0), ts(10, 0), 10); - must_pessimistic_locked(&engine, k, ts(10, 0), ts(10, 0)); + must_acquire_pessimistic_lock_with_ttl(&mut engine, k, k, ts(10, 0), ts(10, 0), 10); + must_pessimistic_locked(&mut engine, k, ts(10, 0), ts(10, 0)); must_success( - &engine, + &mut engine, k, ts(10, 0), ts(11, 0), @@ -1047,7 +1082,7 @@ pub mod tests { // primary lock will be pessimistically rolled back but there will not // be a rollback record. 
must_success( - &engine, + &mut engine, k, ts(10, 0), ts(21, 0), @@ -1057,13 +1092,13 @@ pub mod tests { true, |s| s == PessimisticRollBack, ); - must_unlocked(&engine, k); - must_get_rollback_ts_none(&engine, k, ts(22, 0)); + must_unlocked(&mut engine, k); + must_get_rollback_ts_none(&mut engine, k, ts(22, 0)); // Path: the prewrite primary key lock does exist, and it's not expired yet. // Should return locked status. must_prewrite_put_impl( - &engine, + &mut engine, k, v, k, @@ -1082,7 +1117,7 @@ pub mod tests { kvproto::kvrpcpb::AssertionLevel::Off, ); must_success( - &engine, + &mut engine, k, ts(30, 0), ts(31, 0), @@ -1097,7 +1132,7 @@ pub mod tests { // lock, rollback record should be written and the transaction status is // certain. must_success( - &engine, + &mut engine, k, ts(30, 0), ts(41, 0), @@ -1107,16 +1142,16 @@ pub mod tests { true, |s| s == TtlExpire, ); - must_unlocked(&engine, k); - must_get_rollback_ts(&engine, k, ts(30, 0)); + must_unlocked(&mut engine, k); + must_get_rollback_ts(&mut engine, k, ts(30, 0)); // Path: the resolving_pessimistic_lock is false and the primary key lock is // pessimistic lock, the transaction is in commit phase and the rollback // record should be written. 
- must_acquire_pessimistic_lock_with_ttl(&engine, k, k, ts(50, 0), ts(50, 0), 10); - must_pessimistic_locked(&engine, k, ts(50, 0), ts(50, 0)); + must_acquire_pessimistic_lock_with_ttl(&mut engine, k, k, ts(50, 0), ts(50, 0), 10); + must_pessimistic_locked(&mut engine, k, ts(50, 0), ts(50, 0)); must_success( - &engine, + &mut engine, k, ts(50, 0), ts(61, 0), @@ -1127,7 +1162,7 @@ pub mod tests { false, |s| s == TtlExpire, ); - must_unlocked(&engine, k); - must_get_rollback_ts(&engine, k, ts(50, 0)); + must_unlocked(&mut engine, k); + must_get_rollback_ts(&mut engine, k, ts(50, 0)); } } diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index b3aa7088dc6..34d9114f48a 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -125,7 +125,7 @@ mod tests { /// to key. The full test of `RawCompareAndSwap` is in /// `src/storage/mod.rs`. fn test_cas_basic_impl() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"rk"; @@ -146,7 +146,7 @@ mod tests { ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&engine, cm.clone(), cmd).unwrap(); + let (prev_val, succeed) = sched_command(&mut engine, cm.clone(), cmd).unwrap(); assert!(prev_val.is_none()); assert!(succeed); @@ -161,7 +161,7 @@ mod tests { ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&engine, cm.clone(), cmd).unwrap(); + let (prev_val, succeed) = sched_command(&mut engine, cm.clone(), cmd).unwrap(); assert_eq!(prev_val, Some(b"v1".to_vec())); assert!(!succeed); @@ -176,13 +176,13 @@ mod tests { ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&engine, cm, cmd).unwrap(); + let (prev_val, succeed) = sched_command(&mut engine, cm, cmd).unwrap(); assert_eq!(prev_val, Some(b"v1".to_vec())); assert!(succeed); } 
pub fn sched_command( - engine: &E, + engine: &mut E, cm: ConcurrencyManager, cmd: TypedCommand<(Option, bool)>, ) -> Result<(Option, bool)> { @@ -218,7 +218,7 @@ mod tests { } fn test_cas_process_write_impl() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let raw_key = b"rk"; let raw_value = b"valuek"; diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index d06218338da..2f2d123e9bb 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -752,7 +752,7 @@ pub mod test_util { // Some utils for tests that may be used in multiple source code files. pub fn prewrite_command( - engine: &E, + engine: &mut E, cm: ConcurrencyManager, statistics: &mut Statistics, cmd: TypedCommand, @@ -786,7 +786,7 @@ pub mod test_util { } pub fn prewrite( - engine: &E, + engine: &mut E, statistics: &mut Statistics, mutations: Vec, primary: Vec, @@ -806,7 +806,7 @@ pub mod test_util { } pub fn prewrite_with_cm( - engine: &E, + engine: &mut E, cm: ConcurrencyManager, statistics: &mut Statistics, mutations: Vec, @@ -828,7 +828,7 @@ pub mod test_util { } pub fn pessimistic_prewrite( - engine: &E, + engine: &mut E, statistics: &mut Statistics, mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, primary: Vec, @@ -850,7 +850,7 @@ pub mod test_util { } pub fn pessimistic_prewrite_with_cm( - engine: &E, + engine: &mut E, cm: ConcurrencyManager, statistics: &mut Statistics, mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, @@ -879,7 +879,7 @@ pub mod test_util { } pub fn commit( - engine: &E, + engine: &mut E, statistics: &mut Statistics, keys: Vec, lock_ts: u64, @@ -910,7 +910,7 @@ pub mod test_util { } pub fn rollback( - engine: &E, + engine: &mut E, statistics: &mut Statistics, keys: Vec, start_ts: u64, diff --git a/src/storage/txn/commands/pessimistic_rollback.rs 
b/src/storage/txn/commands/pessimistic_rollback.rs index 010238426ee..837d077153e 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -119,7 +119,7 @@ pub mod tests { }; pub fn must_success( - engine: &E, + engine: &mut E, key: &[u8], start_ts: impl Into, for_update_ts: impl Into, @@ -150,60 +150,60 @@ pub mod tests { #[test] fn test_pessimistic_rollback() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k1"; let v = b"v1"; // Normal - must_acquire_pessimistic_lock(&engine, k, k, 1, 1); - must_pessimistic_locked(&engine, k, 1, 1); - must_success(&engine, k, 1, 1); - must_unlocked(&engine, k); - must_get_commit_ts_none(&engine, k, 1); + must_acquire_pessimistic_lock(&mut engine, k, k, 1, 1); + must_pessimistic_locked(&mut engine, k, 1, 1); + must_success(&mut engine, k, 1, 1); + must_unlocked(&mut engine, k); + must_get_commit_ts_none(&mut engine, k, 1); // Pessimistic rollback is idempotent - must_success(&engine, k, 1, 1); - must_unlocked(&engine, k); - must_get_commit_ts_none(&engine, k, 1); + must_success(&mut engine, k, 1, 1); + must_unlocked(&mut engine, k); + must_get_commit_ts_none(&mut engine, k, 1); // Succeed if the lock doesn't exist. 
- must_success(&engine, k, 2, 2); + must_success(&mut engine, k, 2, 2); // Do nothing if meets other transaction's pessimistic lock - must_acquire_pessimistic_lock(&engine, k, k, 2, 3); - must_success(&engine, k, 1, 1); - must_success(&engine, k, 1, 2); - must_success(&engine, k, 1, 3); - must_success(&engine, k, 1, 4); - must_success(&engine, k, 3, 3); - must_success(&engine, k, 4, 4); + must_acquire_pessimistic_lock(&mut engine, k, k, 2, 3); + must_success(&mut engine, k, 1, 1); + must_success(&mut engine, k, 1, 2); + must_success(&mut engine, k, 1, 3); + must_success(&mut engine, k, 1, 4); + must_success(&mut engine, k, 3, 3); + must_success(&mut engine, k, 4, 4); // Succeed if for_update_ts is larger; do nothing if for_update_ts is smaller. - must_pessimistic_locked(&engine, k, 2, 3); - must_success(&engine, k, 2, 2); - must_pessimistic_locked(&engine, k, 2, 3); - must_success(&engine, k, 2, 4); - must_unlocked(&engine, k); + must_pessimistic_locked(&mut engine, k, 2, 3); + must_success(&mut engine, k, 2, 2); + must_pessimistic_locked(&mut engine, k, 2, 3); + must_success(&mut engine, k, 2, 4); + must_unlocked(&mut engine, k); // Do nothing if rollbacks a non-pessimistic lock. 
- must_prewrite_put(&engine, k, v, k, 3); - must_locked(&engine, k, 3); - must_success(&engine, k, 3, 3); - must_locked(&engine, k, 3); + must_prewrite_put(&mut engine, k, v, k, 3); + must_locked(&mut engine, k, 3); + must_success(&mut engine, k, 3, 3); + must_locked(&mut engine, k, 3); // Do nothing if meets other transaction's optimistic lock - must_success(&engine, k, 2, 2); - must_success(&engine, k, 2, 3); - must_success(&engine, k, 2, 4); - must_success(&engine, k, 4, 4); - must_locked(&engine, k, 3); + must_success(&mut engine, k, 2, 2); + must_success(&mut engine, k, 2, 3); + must_success(&mut engine, k, 2, 4); + must_success(&mut engine, k, 4, 4); + must_locked(&mut engine, k, 3); // Do nothing if committed - must_commit(&engine, k, 3, 4); - must_unlocked(&engine, k); - must_get_commit_ts(&engine, k, 3, 4); - must_success(&engine, k, 3, 3); - must_success(&engine, k, 3, 4); - must_success(&engine, k, 3, 5); + must_commit(&mut engine, k, 3, 4); + must_unlocked(&mut engine, k); + must_get_commit_ts(&mut engine, k, 3, 4); + must_success(&mut engine, k, 3, 3); + must_success(&mut engine, k, 3, 4); + must_success(&mut engine, k, 3, 5); } } diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 333d3eb1aca..be47e22e42b 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -918,9 +918,9 @@ mod tests { )); } let mut statistic = Statistics::default(); - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); prewrite( - &engine, + &mut engine, &mut statistic, vec![Mutation::make_put( Key::from_raw(&[pri_key_number]), @@ -933,7 +933,7 @@ mod tests { .unwrap(); assert_eq!(1, statistic.write.seek); let e = prewrite( - &engine, + &mut engine, &mut statistic, mutations.clone(), pri_key.to_vec(), @@ -948,7 +948,7 @@ mod tests { _ => panic!("error type not match"), } commit( - &engine, + &mut engine, &mut statistic, 
vec![Key::from_raw(&[pri_key_number])], 99, @@ -957,7 +957,7 @@ mod tests { .unwrap(); assert_eq!(3, statistic.write.seek); let e = prewrite( - &engine, + &mut engine, &mut statistic, mutations.clone(), pri_key.to_vec(), @@ -973,7 +973,7 @@ mod tests { _ => panic!("error type not match"), } let e = prewrite( - &engine, + &mut engine, &mut statistic, mutations.clone(), pri_key.to_vec(), @@ -997,7 +997,7 @@ mod tests { ) .unwrap(); prewrite( - &engine, + &mut engine, &mut statistic, mutations.clone(), pri_key.to_vec(), @@ -1008,7 +1008,7 @@ mod tests { // All keys are prewritten successful with only one seek operations. assert_eq!(1, statistic.write.seek); let keys: Vec = mutations.iter().map(|m| m.key().clone()).collect(); - commit(&engine, &mut statistic, keys.clone(), 104, 105).unwrap(); + commit(&mut engine, &mut statistic, keys.clone(), 104, 105).unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); for k in keys { let v = snap.get_cf(CF_WRITE, &k.append_ts(105.into())).unwrap(); @@ -1040,11 +1040,11 @@ mod tests { b"100".to_vec(), )); } - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let keys: Vec = mutations.iter().map(|m| m.key().clone()).collect(); let mut statistic = Statistics::default(); prewrite( - &engine, + &mut engine, &mut statistic, mutations.clone(), pri_key.to_vec(), @@ -1053,10 +1053,10 @@ mod tests { ) .unwrap(); // Rollback to make tombstones in lock-cf. - rollback(&engine, &mut statistic, keys, 100).unwrap(); + rollback(&mut engine, &mut statistic, keys, 100).unwrap(); // Gc rollback flags store in write-cf to make sure the next prewrite operation // will skip seek write cf. 
- gc_by_compact(&engine, pri_key, 101); + gc_by_compact(&mut engine, pri_key, 101); set_perf_level(PerfLevel::EnableTimeExceptForMutex); let perf = ReadPerfInstant::new(); let mut statistic = Statistics::default(); @@ -1064,7 +1064,7 @@ mod tests { mutations.pop(); } prewrite( - &engine, + &mut engine, &mut statistic, mutations, pri_key.to_vec(), @@ -1081,7 +1081,7 @@ mod tests { fn test_prewrite_1pc() { use crate::storage::mvcc::tests::{must_get, must_get_commit_ts, must_unlocked}; - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"k"; @@ -1090,7 +1090,7 @@ mod tests { let mut statistics = Statistics::default(); prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations, @@ -1099,9 +1099,9 @@ mod tests { Some(15), ) .unwrap(); - must_unlocked(&engine, key); - must_get(&engine, key, 12, value); - must_get_commit_ts(&engine, key, 10, 11); + must_unlocked(&mut engine, key); + must_get(&mut engine, key, 12, value); + must_get_commit_ts(&mut engine, key, 10, 11); cm.update_max_ts(50.into()); @@ -1111,7 +1111,7 @@ mod tests { // Test the idempotency of prewrite when falling back to 2PC. for _ in 0..2 { let res = prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations.clone(), @@ -1122,17 +1122,17 @@ mod tests { .unwrap(); assert!(res.min_commit_ts.is_zero()); assert!(res.one_pc_commit_ts.is_zero()); - must_locked(&engine, key, 20); + must_locked(&mut engine, key, 20); } - must_rollback(&engine, key, 20, false); + must_rollback(&mut engine, key, 20, false); let mutations = vec![ Mutation::make_put(Key::from_raw(key), value.to_vec()), Mutation::make_check_not_exists(Key::from_raw(b"non_exist")), ]; let mut statistics = Statistics::default(); prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations, @@ -1150,7 +1150,7 @@ mod tests { // Lock k2. 
let mut statistics = Statistics::default(); prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, vec![Mutation::make_put(Key::from_raw(k2), v2.to_vec())], @@ -1165,7 +1165,7 @@ mod tests { Mutation::make_put(Key::from_raw(k2), v2.to_vec()), ]; prewrite_with_cm( - &engine, + &mut engine, cm, &mut statistics, mutations, @@ -1174,20 +1174,20 @@ mod tests { Some(70), ) .unwrap_err(); - must_unlocked(&engine, k1); - must_locked(&engine, k2, 50); - must_get_commit_ts_none(&engine, k1, 60); - must_get_commit_ts_none(&engine, k2, 60); + must_unlocked(&mut engine, k1); + must_locked(&mut engine, k2, 50); + must_get_commit_ts_none(&mut engine, k1, 60); + must_get_commit_ts_none(&mut engine, k2, 60); } #[test] fn test_prewrite_pessimsitic_1pc() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"k"; let value = b"v"; - must_acquire_pessimistic_lock(&engine, key, key, 10, 10); + must_acquire_pessimistic_lock(&mut engine, key, key, 10, 10); let mutations = vec![( Mutation::make_put(Key::from_raw(key), value.to_vec()), @@ -1195,7 +1195,7 @@ mod tests { )]; let mut statistics = Statistics::default(); pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations, @@ -1206,14 +1206,14 @@ mod tests { ) .unwrap(); - must_unlocked(&engine, key); - must_get(&engine, key, 12, value); - must_get_commit_ts(&engine, key, 10, 11); + must_unlocked(&mut engine, key); + must_get(&mut engine, key, 12, value); + must_get_commit_ts(&mut engine, key, 10, 11); let (k1, v1) = (b"k", b"v"); let (k2, v2) = (b"k2", b"v2"); - must_acquire_pessimistic_lock(&engine, k1, k1, 8, 12); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 8, 12); let mutations = vec![ ( @@ -1227,7 +1227,7 @@ mod tests { ]; statistics = Statistics::default(); pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut 
statistics, mutations, @@ -1238,15 +1238,15 @@ mod tests { ) .unwrap(); - must_unlocked(&engine, k1); - must_unlocked(&engine, k2); - must_get(&engine, k1, 16, v1); - must_get(&engine, k2, 16, v2); - must_get_commit_ts(&engine, k1, 8, 13); - must_get_commit_ts(&engine, k2, 8, 13); + must_unlocked(&mut engine, k1); + must_unlocked(&mut engine, k2); + must_get(&mut engine, k1, 16, v1); + must_get(&mut engine, k2, 16, v2); + must_get_commit_ts(&mut engine, k1, 8, 13); + must_get_commit_ts(&mut engine, k2, 8, 13); cm.update_max_ts(50.into()); - must_acquire_pessimistic_lock(&engine, k1, k1, 20, 20); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 20, 20); let mutations = vec![( Mutation::make_put(Key::from_raw(k1), v1.to_vec()), @@ -1254,7 +1254,7 @@ mod tests { )]; statistics = Statistics::default(); let res = pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations, @@ -1266,9 +1266,9 @@ mod tests { .unwrap(); assert!(res.min_commit_ts.is_zero()); assert!(res.one_pc_commit_ts.is_zero()); - must_locked(&engine, k1, 20); + must_locked(&mut engine, k1, 20); - must_rollback(&engine, k1, 20, true); + must_rollback(&mut engine, k1, 20, true); // Test a 1PC request should not be partially written when encounters error on // the halfway. If some of the keys are successfully written as committed state, @@ -1277,7 +1277,7 @@ mod tests { // Lock k2 with a optimistic lock. 
let mut statistics = Statistics::default(); prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, vec![Mutation::make_put(Key::from_raw(k2), v2.to_vec())], @@ -1297,9 +1297,9 @@ mod tests { SkipPessimisticCheck, ), ]; - must_acquire_pessimistic_lock(&engine, k1, k1, 60, 60); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 60, 60); pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm, &mut statistics, mutations, @@ -1309,15 +1309,15 @@ mod tests { Some(70), ) .unwrap_err(); - must_pessimistic_locked(&engine, k1, 60, 60); - must_locked(&engine, k2, 50); - must_get_commit_ts_none(&engine, k1, 60); - must_get_commit_ts_none(&engine, k2, 60); + must_pessimistic_locked(&mut engine, k1, 60, 60); + must_locked(&mut engine, k2, 50); + must_get_commit_ts_none(&mut engine, k1, 60); + must_get_commit_ts_none(&mut engine, k2, 60); } #[test] fn test_prewrite_async_commit() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"k"; @@ -1340,10 +1340,10 @@ mod tests { Context::default(), ); - let res = prewrite_command(&engine, cm.clone(), &mut statistics, cmd).unwrap(); + let res = prewrite_command(&mut engine, cm.clone(), &mut statistics, cmd).unwrap(); assert!(!res.min_commit_ts.is_zero()); assert_eq!(res.one_pc_commit_ts, TimeStamp::zero()); - must_locked(&engine, key, 10); + must_locked(&mut engine, key, 10); cm.update_max_ts(50.into()); @@ -1373,23 +1373,23 @@ mod tests { Context::default(), ); - let res = prewrite_command(&engine, cm.clone(), &mut statistics, cmd).unwrap(); + let res = prewrite_command(&mut engine, cm.clone(), &mut statistics, cmd).unwrap(); assert!(res.min_commit_ts.is_zero()); assert!(res.one_pc_commit_ts.is_zero()); - assert!(!must_locked(&engine, k1, 20).use_async_commit); - assert!(!must_locked(&engine, k2, 20).use_async_commit); + assert!(!must_locked(&mut engine, k1, 
20).use_async_commit); + assert!(!must_locked(&mut engine, k2, 20).use_async_commit); } } #[test] fn test_prewrite_pessimsitic_async_commit() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"k"; let value = b"v"; - must_acquire_pessimistic_lock(&engine, key, key, 10, 10); + must_acquire_pessimistic_lock(&mut engine, key, key, 10, 10); let mutations = vec![( Mutation::make_put(Key::from_raw(key), value.to_vec()), @@ -1411,18 +1411,18 @@ mod tests { Context::default(), ); - let res = prewrite_command(&engine, cm.clone(), &mut statistics, cmd).unwrap(); + let res = prewrite_command(&mut engine, cm.clone(), &mut statistics, cmd).unwrap(); assert!(!res.min_commit_ts.is_zero()); assert_eq!(res.one_pc_commit_ts, TimeStamp::zero()); - must_locked(&engine, key, 10); + must_locked(&mut engine, key, 10); cm.update_max_ts(50.into()); let (k1, v1) = (b"k1", b"v1"); let (k2, v2) = (b"k2", b"v2"); - must_acquire_pessimistic_lock(&engine, k1, k1, 20, 20); - must_acquire_pessimistic_lock(&engine, k2, k1, 20, 20); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 20, 20); + must_acquire_pessimistic_lock(&mut engine, k2, k1, 20, 20); let mutations = vec![ ( @@ -1451,11 +1451,11 @@ mod tests { Context::default(), ); - let res = prewrite_command(&engine, cm, &mut statistics, cmd).unwrap(); + let res = prewrite_command(&mut engine, cm, &mut statistics, cmd).unwrap(); assert!(res.min_commit_ts.is_zero()); assert!(res.one_pc_commit_ts.is_zero()); - assert!(!must_locked(&engine, k1, 20).use_async_commit); - assert!(!must_locked(&engine, k2, 20).use_async_commit); + assert!(!must_locked(&mut engine, k1, 20).use_async_commit); + assert!(!must_locked(&mut engine, k2, 20).use_async_commit); } #[test] @@ -1675,7 +1675,7 @@ mod tests { statistics: &mut statistics, async_apply_prewrite: case.async_apply_prewrite, }; - let engine = 
TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); assert_eq!(result.response_policy, case.expected); @@ -1685,7 +1685,7 @@ mod tests { // this test for prewrite with should_not_exist flag #[test] fn test_prewrite_should_not_exist() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); // concurency_manager.max_tx = 5 let cm = ConcurrencyManager::new(5.into()); let mut statistics = Statistics::default(); @@ -1693,12 +1693,12 @@ mod tests { let (key, value) = (b"k", b"val"); // T1: start_ts = 3, commit_ts = 5, put key:value - must_prewrite_put(&engine, key, value, key, 3); - must_commit(&engine, key, 3, 5); + must_prewrite_put(&mut engine, key, value, key, 3); + must_commit(&mut engine, key, 3, 5); // T2: start_ts = 15, prewrite on k, with should_not_exist flag set. 
let res = prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, vec![Mutation::make_check_not_exists(Key::from_raw(key))], @@ -1718,12 +1718,12 @@ mod tests { // T3: start_ts = 8, commit_ts = max_ts + 1 = 16, prewrite a DELETE operation on // k - must_prewrite_delete(&engine, key, key, 8); - must_commit(&engine, key, 8, cm.max_ts().into_inner() + 1); + must_prewrite_delete(&mut engine, key, key, 8); + must_commit(&mut engine, key, 8, cm.max_ts().into_inner() + 1); // T1: start_ts = 10, repeatedly prewrite on k, with should_not_exist flag set let res = prewrite_with_cm( - &engine, + &mut engine, cm, &mut statistics, vec![Mutation::make_check_not_exists(Key::from_raw(key))], @@ -1742,15 +1742,15 @@ mod tests { #[test] fn test_optimistic_prewrite_committed_transaction() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); let key = b"k"; // T1: start_ts = 5, commit_ts = 10, async commit - must_prewrite_put_async_commit(&engine, key, b"v1", key, &Some(vec![]), 5, 10); - must_commit(&engine, key, 5, 10); + must_prewrite_put_async_commit(&mut engine, key, b"v1", key, &Some(vec![]), 5, 10); + must_commit(&mut engine, key, 5, 10); // T2: start_ts = 15, commit_ts = 16, 1PC let cmd = Prewrite::with_1pc( @@ -1759,12 +1759,12 @@ mod tests { 15.into(), TimeStamp::default(), ); - let result = prewrite_command(&engine, cm.clone(), &mut statistics, cmd).unwrap(); + let result = prewrite_command(&mut engine, cm.clone(), &mut statistics, cmd).unwrap(); let one_pc_commit_ts = result.one_pc_commit_ts; // T3 is after T1 and T2 - must_prewrite_put(&engine, key, b"v3", key, 20); - must_commit(&engine, key, 20, 25); + must_prewrite_put(&mut engine, key, b"v3", key, 20); + must_commit(&mut engine, key, 20, 25); // Repeating the T1 prewrite request let cmd = Prewrite::new( @@ -1831,16 +1831,16 @@ mod tests { #[test] 
fn test_pessimistic_prewrite_committed_transaction() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); let key = b"k"; // T1: start_ts = 5, commit_ts = 10, async commit - must_acquire_pessimistic_lock(&engine, key, key, 5, 5); + must_acquire_pessimistic_lock(&mut engine, key, key, 5, 5); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, key, b"v1", key, @@ -1850,10 +1850,10 @@ mod tests { DoPessimisticCheck, 10, ); - must_commit(&engine, key, 5, 10); + must_commit(&mut engine, key, 5, 10); // T2: start_ts = 15, commit_ts = 16, 1PC - must_acquire_pessimistic_lock(&engine, key, key, 15, 15); + must_acquire_pessimistic_lock(&mut engine, key, key, 15, 15); let cmd = PrewritePessimistic::with_1pc( vec![( Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), @@ -1864,12 +1864,12 @@ mod tests { 15.into(), TimeStamp::default(), ); - let result = prewrite_command(&engine, cm.clone(), &mut statistics, cmd).unwrap(); + let result = prewrite_command(&mut engine, cm.clone(), &mut statistics, cmd).unwrap(); let one_pc_commit_ts = result.one_pc_commit_ts; // T3 is after T1 and T2 - must_prewrite_put(&engine, key, b"v3", key, 20); - must_commit(&engine, key, 20, 25); + must_prewrite_put(&mut engine, key, b"v3", key, 20); + must_commit(&mut engine, key, 20, 25); // Repeating the T1 prewrite request let cmd = PrewritePessimistic::new( @@ -1943,11 +1943,11 @@ mod tests { #[test] fn test_repeated_pessimistic_prewrite_1pc() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); - must_acquire_pessimistic_lock(&engine, b"k2", b"k2", 5, 5); + must_acquire_pessimistic_lock(&mut engine, b"k2", b"k2", 5, 5); // The second key needs a pessimistic lock let 
mutations = vec![ ( @@ -1960,7 +1960,7 @@ mod tests { ), ]; let res = pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm.clone(), &mut statistics, mutations.clone(), @@ -1974,7 +1974,7 @@ mod tests { cm.update_max_ts(commit_ts.next()); // repeate the prewrite let res = pessimistic_prewrite_with_cm( - &engine, + &mut engine, cm, &mut statistics, mutations, @@ -1986,48 +1986,52 @@ mod tests { .unwrap(); // The new commit ts should be same as before. assert_eq!(res.one_pc_commit_ts, commit_ts); - must_seek_write(&engine, b"k1", 100, 5, commit_ts, WriteType::Put); - must_seek_write(&engine, b"k2", 100, 5, commit_ts, WriteType::Put); + must_seek_write(&mut engine, b"k1", 100, 5, commit_ts, WriteType::Put); + must_seek_write(&mut engine, b"k2", 100, 5, commit_ts, WriteType::Put); } #[test] fn test_repeated_prewrite_non_pessimistic_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); let cm = &cm; - let mut prewrite_with_retry_flag = - |key: &[u8], - value: &[u8], - pk: &[u8], - secondary_keys, - ts: u64, - pessimistic_action, - is_retry_request| { - let mutation = Mutation::make_put(Key::from_raw(key), value.to_vec()); - let mut ctx = Context::default(); - ctx.set_is_retry_request(is_retry_request); - let cmd = PrewritePessimistic::new( - vec![(mutation, pessimistic_action)], - pk.to_vec(), - ts.into(), - 100, - ts.into(), - 1, - (ts + 1).into(), - 0.into(), - secondary_keys, - false, - AssertionLevel::Off, - ctx, - ); - prewrite_command(&engine, cm.clone(), &mut statistics, cmd) - }; + fn prewrite_with_retry_flag( + key: &[u8], + value: &[u8], + pk: &[u8], + secondary_keys: Option>>, + ts: u64, + pessimistic_action: PrewriteRequestPessimisticAction, + is_retry_request: bool, + engine: &mut E, + cm: &ConcurrencyManager, + statistics: &mut Statistics, + ) -> Result { + let mutation = 
Mutation::make_put(Key::from_raw(key), value.to_vec()); + let mut ctx = Context::default(); + ctx.set_is_retry_request(is_retry_request); + let cmd = PrewritePessimistic::new( + vec![(mutation, pessimistic_action)], + pk.to_vec(), + ts.into(), + 100, + ts.into(), + 1, + (ts + 1).into(), + 0.into(), + secondary_keys, + false, + AssertionLevel::Off, + ctx, + ); + prewrite_command(engine, cm.clone(), statistics, cmd) + } - must_acquire_pessimistic_lock(&engine, b"k1", b"k1", 10, 10); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k1", 10, 10); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k1", b"v1", b"k1", @@ -2038,7 +2042,7 @@ mod tests { 15, ); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -2050,8 +2054,8 @@ mod tests { ); // The transaction may be committed by another reader. - must_commit(&engine, b"k1", 10, 20); - must_commit(&engine, b"k2", 10, 20); + must_commit(&mut engine, b"k1", 10, 20); + must_commit(&mut engine, b"k2", 10, 20); // This is a re-sent prewrite. prewrite_with_retry_flag( @@ -2062,21 +2066,24 @@ mod tests { 10, SkipPessimisticCheck, true, + &mut engine, + cm, + &mut statistics, ) .unwrap(); // Commit repeatedly, these operations should have no effect. - must_commit(&engine, b"k1", 10, 25); - must_commit(&engine, b"k2", 10, 25); + must_commit(&mut engine, b"k1", 10, 25); + must_commit(&mut engine, b"k2", 10, 25); // Seek from 30, we should read commit_ts = 20 instead of 25. - must_seek_write(&engine, b"k1", 30, 10, 20, WriteType::Put); - must_seek_write(&engine, b"k2", 30, 10, 20, WriteType::Put); + must_seek_write(&mut engine, b"k1", 30, 10, 20, WriteType::Put); + must_seek_write(&mut engine, b"k2", 30, 10, 20, WriteType::Put); // Write another version to the keys. 
- must_prewrite_put(&engine, b"k1", b"v11", b"k1", 35); - must_prewrite_put(&engine, b"k2", b"v22", b"k1", 35); - must_commit(&engine, b"k1", 35, 40); - must_commit(&engine, b"k2", 35, 40); + must_prewrite_put(&mut engine, b"k1", b"v11", b"k1", 35); + must_prewrite_put(&mut engine, b"k2", b"v22", b"k1", 35); + must_commit(&mut engine, b"k1", 35, 40); + must_commit(&mut engine, b"k2", 35, 40); // A retrying non-pessimistic-lock prewrite request should not skip constraint // checks. Here it should take no effect, even there's already a newer version @@ -2089,37 +2096,72 @@ mod tests { 10, SkipPessimisticCheck, true, + &mut engine, + cm, + &mut statistics, ) .unwrap(); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); - prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, SkipPessimisticCheck, true) - .unwrap(); - must_unlocked(&engine, b"k2"); + prewrite_with_retry_flag( + b"k2", + b"v2", + b"k1", + None, + 10, + SkipPessimisticCheck, + true, + &mut engine, + cm, + &mut statistics, + ) + .unwrap(); + must_unlocked(&mut engine, b"k2"); // Committing still does nothing. - must_commit(&engine, b"k2", 10, 25); + must_commit(&mut engine, b"k2", 10, 25); // Try a different txn start ts (which haven't been successfully committed // before). It should report a PessimisticLockNotFound. - let err = - prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 11, SkipPessimisticCheck, true) - .unwrap_err(); + let err = prewrite_with_retry_flag( + b"k2", + b"v2", + b"k1", + None, + 11, + SkipPessimisticCheck, + true, + &mut engine, + cm, + &mut statistics, + ) + .unwrap_err(); assert!(matches!( err, Error(box ErrorInner::Mvcc(MvccError( box MvccErrorInner::PessimisticLockNotFound { .. } ))) )); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); // However conflict still won't be checked if there's a non-retry request // arriving. 
- prewrite_with_retry_flag(b"k2", b"v2", b"k1", None, 10, SkipPessimisticCheck, false) - .unwrap(); - must_locked(&engine, b"k2", 10); + prewrite_with_retry_flag( + b"k2", + b"v2", + b"k1", + None, + 10, + SkipPessimisticCheck, + false, + &mut engine, + cm, + &mut statistics, + ) + .unwrap(); + must_locked(&mut engine, b"k2", 10); } #[test] fn test_prewrite_rolledback_transaction() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); @@ -2128,10 +2170,10 @@ mod tests { let v2 = b"v2"; // Test the write conflict path. - must_acquire_pessimistic_lock(&engine, k1, v1, 1, 1); - must_rollback(&engine, k1, 1, true); - must_prewrite_put(&engine, k1, v2, k1, 5); - must_commit(&engine, k1, 5, 6); + must_acquire_pessimistic_lock(&mut engine, k1, v1, 1, 1); + must_rollback(&mut engine, k1, 1, true); + must_prewrite_put(&mut engine, k1, v2, k1, 5); + must_commit(&mut engine, k1, 5, 6); let prewrite_cmd = Prewrite::new( vec![Mutation::make_put(Key::from_raw(k1), v1.to_vec())], k1.to_vec(), @@ -2157,9 +2199,9 @@ mod tests { assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); // Test the pessimistic lock is not found path. 
- must_acquire_pessimistic_lock(&engine, k1, v1, 10, 10); - must_rollback(&engine, k1, 10, true); - must_acquire_pessimistic_lock(&engine, k1, v1, 15, 15); + must_acquire_pessimistic_lock(&mut engine, k1, v1, 10, 10); + must_rollback(&mut engine, k1, 10, true); + must_acquire_pessimistic_lock(&mut engine, k1, v1, 15, 15); let prewrite_cmd = PrewritePessimistic::with_defaults( vec![( Mutation::make_put(Key::from_raw(k1), v1.to_vec()), @@ -2182,7 +2224,7 @@ mod tests { #[test] fn test_assertion_fail_on_conflicting_index_key() { - let engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); // Simulate two transactions that tries to insert the same row with a secondary // index, and the second one canceled the first one (by rolling back its lock). @@ -2192,13 +2234,18 @@ mod tests { let t2_commit_ts = TimeStamp::compose(3, 0); // txn1 acquires lock on the row key. - must_acquire_pessimistic_lock(&engine, b"row", b"row", t1_start_ts, t1_start_ts); + must_acquire_pessimistic_lock(&mut engine, b"row", b"row", t1_start_ts, t1_start_ts); // txn2 rolls it back. 
- let err = - must_acquire_pessimistic_lock_err(&engine, b"row", b"row", t2_start_ts, t2_start_ts); + let err = must_acquire_pessimistic_lock_err( + &mut engine, + b"row", + b"row", + t2_start_ts, + t2_start_ts, + ); assert!(matches!(err, MvccError(box MvccErrorInner::KeyIsLocked(_)))); must_check_txn_status( - &engine, + &mut engine, b"row", t1_start_ts, t2_start_ts, @@ -2209,9 +2256,9 @@ mod tests { |status| status == TxnStatus::PessimisticRollBack, ); // And then txn2 acquire continues and finally commits - must_acquire_pessimistic_lock(&engine, b"row", b"row", t2_start_ts, t2_start_ts); + must_acquire_pessimistic_lock(&mut engine, b"row", b"row", t2_start_ts, t2_start_ts); must_prewrite_put_impl( - &engine, + &mut engine, b"row", b"value", b"row", @@ -2228,7 +2275,7 @@ mod tests { AssertionLevel::Strict, ); must_prewrite_put_impl( - &engine, + &mut engine, b"index", b"value", b"row", @@ -2244,8 +2291,8 @@ mod tests { Assertion::NotExist, AssertionLevel::Strict, ); - must_commit(&engine, b"row", t2_start_ts, t2_commit_ts); - must_commit(&engine, b"index", t2_start_ts, t2_commit_ts); + must_commit(&mut engine, b"row", t2_start_ts, t2_commit_ts); + must_commit(&mut engine, b"index", t2_start_ts, t2_commit_ts); // Txn1 continues. If the two keys are sent in the single prewrite request, the // AssertionFailed error won't be returned since there are other error. 
@@ -2267,7 +2314,7 @@ mod tests { t1_start_ts, t2_start_ts, ); - let err = prewrite_command(&engine, cm.clone(), &mut stat, cmd).unwrap_err(); + let err = prewrite_command(&mut engine, cm.clone(), &mut stat, cmd).unwrap_err(); assert!(matches!( err, Error(box ErrorInner::Mvcc(MvccError( @@ -2290,7 +2337,7 @@ mod tests { t1_start_ts, t2_start_ts, ); - let err = prewrite_command(&engine, cm, &mut stat, cmd).unwrap_err(); + let err = prewrite_command(&mut engine, cm, &mut stat, cmd).unwrap_err(); assert!(matches!( err, Error(box ErrorInner::Mvcc(MvccError( @@ -2301,7 +2348,7 @@ mod tests { // If the two keys are sent in different requests, it would be the client's duty // to ignore the assertion error. let err = must_prewrite_put_err_impl( - &engine, + &mut engine, b"row", b"value", b"row", @@ -2319,7 +2366,7 @@ mod tests { MvccError(box MvccErrorInner::PessimisticLockNotFound { .. }) )); let err = must_prewrite_put_err_impl( - &engine, + &mut engine, b"index", b"value", b"row", @@ -2340,19 +2387,19 @@ mod tests { #[test] fn test_prewrite_committed_encounter_newer_lock() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let mut statistics = Statistics::default(); let k1 = b"k1"; let v1 = b"v1"; let v2 = b"v2"; - must_prewrite_put_async_commit(&engine, k1, v1, k1, &Some(vec![]), 5, 10); + must_prewrite_put_async_commit(&mut engine, k1, v1, k1, &Some(vec![]), 5, 10); // This commit may actually come from a ResolveLock command - must_commit(&engine, k1, 5, 15); + must_commit(&mut engine, k1, 5, 15); // Another transaction prewrites - must_prewrite_put(&engine, k1, v2, k1, 20); + must_prewrite_put(&mut engine, k1, v2, k1, 20); // A retried prewrite of the first transaction should be idempotent. 
let prewrite_cmd = Prewrite::new( @@ -2389,14 +2436,14 @@ mod tests { #[test] fn test_repeated_prewrite_commit_ts_too_large() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = ConcurrencyManager::new(1.into()); let mut statistics = Statistics::default(); // First, prewrite and commit normally. - must_acquire_pessimistic_lock(&engine, b"k1", b"k1", 5, 10); + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k1", 5, 10); must_pessimistic_prewrite_put_async_commit( - &engine, + &mut engine, b"k1", b"v1", b"k1", @@ -2407,7 +2454,7 @@ mod tests { 15, ); must_prewrite_put_impl( - &engine, + &mut engine, b"k2", b"v2", b"k1", @@ -2423,8 +2470,8 @@ mod tests { Assertion::None, AssertionLevel::Off, ); - must_commit(&engine, b"k1", 5, 18); - must_commit(&engine, b"k2", 5, 18); + must_commit(&mut engine, b"k1", 5, 18); + must_commit(&mut engine, b"k2", 5, 18); // Update max_ts to be larger than the max_commit_ts. cm.update_max_ts(50.into()); @@ -2446,9 +2493,9 @@ mod tests { AssertionLevel::Off, Context::default(), ); - let res = prewrite_command(&engine, cm, &mut statistics, cmd).unwrap(); + let res = prewrite_command(&mut engine, cm, &mut statistics, cmd).unwrap(); // It should return the real commit TS as the min_commit_ts in the result. 
assert_eq!(res.min_commit_ts, 18.into(), "{:?}", res); - must_unlocked(&engine, b"k2"); + must_unlocked(&mut engine, b"k2"); } } diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 7e93e77dee6..fc3846931f3 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -81,15 +81,15 @@ mod tests { #[test] fn rollback_lock_with_existing_rollback() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k1, k2) = (b"k1", b"k2"); let v = b"v"; - must_acquire_pessimistic_lock(&engine, k1, k1, 10, 10); - must_rollback(&engine, k1, 10, false); - must_rollback(&engine, k2, 10, false); + must_acquire_pessimistic_lock(&mut engine, k1, k1, 10, 10); + must_rollback(&mut engine, k1, 10, false); + must_rollback(&mut engine, k2, 10, false); - must_pessimistic_prewrite_put(&engine, k2, v, k1, 10, 10, SkipPessimisticCheck); - must_rollback(&engine, k2, 10, false); + must_pessimistic_prewrite_put(&mut engine, k2, v, k1, 10, 10, SkipPessimisticCheck); + must_rollback(&mut engine, k2, 10, false); } } diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 2149d5571da..70c13a20c26 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -113,7 +113,7 @@ pub mod tests { }; pub fn must_success( - engine: &E, + engine: &mut E, primary_key: &[u8], start_ts: impl Into, advise_ttl: u64, @@ -154,7 +154,7 @@ pub mod tests { } pub fn must_err( - engine: &E, + engine: &mut E, primary_key: &[u8], start_ts: impl Into, advise_ttl: u64, @@ -188,50 +188,50 @@ pub mod tests { #[test] fn test_txn_heart_beat() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k1", b"v1"); - let test = |ts| { + fn test(ts: u64, k: &[u8], engine: &mut impl Engine) { // Do nothing if 
advise_ttl is less smaller than current TTL. - must_success(&engine, k, ts, 90, 100); + must_success(engine, k, ts, 90, 100); // Return the new TTL if the TTL when the TTL is updated. - must_success(&engine, k, ts, 110, 110); + must_success(engine, k, ts, 110, 110); // The lock's TTL is updated and persisted into the db. - must_success(&engine, k, ts, 90, 110); + must_success(engine, k, ts, 90, 110); // Heart beat another transaction's lock will lead to an error. - must_err(&engine, k, ts - 1, 150); - must_err(&engine, k, ts + 1, 150); + must_err(engine, k, ts - 1, 150); + must_err(engine, k, ts + 1, 150); // The existing lock is not changed. - must_success(&engine, k, ts, 90, 110); - }; + must_success(engine, k, ts, 90, 110); + } // No lock. - must_err(&engine, k, 5, 100); + must_err(&mut engine, k, 5, 100); // Create a lock with TTL=100. // The initial TTL will be set to 0 after calling must_prewrite_put. Update it // first. - must_prewrite_put(&engine, k, v, k, 5); - must_locked(&engine, k, 5); - must_success(&engine, k, 5, 100, 100); + must_prewrite_put(&mut engine, k, v, k, 5); + must_locked(&mut engine, k, 5); + must_success(&mut engine, k, 5, 100, 100); - test(5); + test(5, k, &mut engine); - must_locked(&engine, k, 5); - must_commit(&engine, k, 5, 10); - must_unlocked(&engine, k); + must_locked(&mut engine, k, 5); + must_commit(&mut engine, k, 5, 10); + must_unlocked(&mut engine, k); // No lock. 
- must_err(&engine, k, 5, 100); - must_err(&engine, k, 10, 100); + must_err(&mut engine, k, 5, 100); + must_err(&mut engine, k, 10, 100); - must_acquire_pessimistic_lock(&engine, k, k, 8, 15); - must_pessimistic_locked(&engine, k, 8, 15); - must_success(&engine, k, 8, 100, 100); + must_acquire_pessimistic_lock(&mut engine, k, k, 8, 15); + must_pessimistic_locked(&mut engine, k, 8, 15); + must_success(&mut engine, k, 8, 100, 100); - test(8); + test(8, k, &mut engine); - must_pessimistic_locked(&engine, k, 8, 15); + must_pessimistic_locked(&mut engine, k, 8, 15); } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index a7c38e147ee..c3967820b34 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -473,7 +473,7 @@ impl Scheduler { .pool .spawn(async move { match unsafe { - with_tls_engine(|engine: &E| engine.precheck_write_with_ctx(&cmd_ctx)) + with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&cmd_ctx)) } { // Precheck failed, try to return err early. Err(e) => { @@ -577,7 +577,8 @@ impl Scheduler { } // The program is currently in scheduler worker threads. // Safety: `self.inner.worker_pool` should ensure that a TLS engine exists. - match unsafe { with_tls_engine(|engine: &E| kv::snapshot(engine, snap_ctx)) }.await + match unsafe { with_tls_engine(|engine: &mut E| kv::snapshot(engine, snap_ctx)) } + .await { Ok(snapshot) => { SCHED_STAGE_COUNTER_VEC.get(tag).snapshot_ok.inc(); @@ -943,7 +944,7 @@ impl Scheduler { { // Safety: `self.sched_pool` ensures a TLS engine exists. unsafe { - with_tls_engine(|engine: &E| { + with_tls_engine(|engine: &mut E| { // We skip writing the raftstore, but to improve CDC old value hit rate, // we should send the old values to the CDC scheduler. engine.schedule_txn_extra(to_be_write.extra); @@ -1152,7 +1153,7 @@ impl Scheduler { // Safety: `self.sched_pool` ensures a TLS engine exists. 
unsafe { - with_tls_engine(|engine: &E| { + with_tls_engine(|engine: &mut E| { if let Err(e) = engine.async_write_ext(&ctx, to_be_write, engine_cb, proposed_cb, committed_cb) { diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 7300074bfde..b2f25cff640 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -663,7 +663,7 @@ mod tests { impl TestStore { fn new(key_num: u64) -> TestStore { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let keys: Vec = (START_ID..START_ID + key_num) .map(|i| format!("{}{}", KEY_PREFIX, i)) .collect(); diff --git a/tests/benches/hierarchy/engine/mod.rs b/tests/benches/hierarchy/engine/mod.rs index 85e6ce77e33..e089ef013ec 100644 --- a/tests/benches/hierarchy/engine/mod.rs +++ b/tests/benches/hierarchy/engine/mod.rs @@ -40,9 +40,9 @@ fn bench_engine_snapshot>( bencher: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); bencher.iter(|| { - black_box(&engine) + black_box(&mut engine) .snapshot(black_box(Default::default())) .unwrap() }); @@ -53,7 +53,7 @@ fn bench_engine_get>( bencher: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let test_kvs: Vec = KvGenerator::with_seed( config.key_length, config.value_length, diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index f88533171c3..f57946a11cf 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -14,7 +14,7 @@ use txn_types::{Key, Mutation, TimeStamp}; use super::{BenchConfig, EngineFactory, DEFAULT_ITERATIONS, DEFAULT_KV_GENERATOR_SEED}; fn setup_prewrite( - engine: &E, + engine: &mut E, config: &BenchConfig, start_ts: impl Into, ) -> (E::Snap, Vec) @@ -66,7 +66,7 @@ where } fn mvcc_prewrite>(b: &mut Bencher<'_>, config: 
&BenchConfig) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( || { @@ -114,10 +114,10 @@ fn mvcc_prewrite>(b: &mut Bencher<'_>, config: &B } fn mvcc_commit>(b: &mut Bencher<'_>, config: &BenchConfig) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 1), + || setup_prewrite(&mut engine, config, 1), |(snapshot, keys)| { for key in keys { let mut txn = mvcc::MvccTxn::new(1.into(), cm.clone()); @@ -133,10 +133,10 @@ fn mvcc_rollback_prewrote>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 1), + || setup_prewrite(&mut engine, config, 1), |(snapshot, keys)| { for key in keys { let mut txn = mvcc::MvccTxn::new(1.into(), cm.clone()); @@ -159,10 +159,10 @@ fn mvcc_rollback_conflict>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 2), + || setup_prewrite(&mut engine, config, 2), |(snapshot, keys)| { for key in keys { let mut txn = mvcc::MvccTxn::new(1.into(), cm.clone()); @@ -185,7 +185,7 @@ fn mvcc_rollback_non_prewrote>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( || { @@ -221,7 +221,7 @@ fn mvcc_reader_load_lock>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let test_keys: Vec = 
KvGenerator::with_seed( config.key_length, config.value_length, @@ -251,7 +251,7 @@ fn mvcc_reader_seek_write>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); b.iter_batched( || { let snapshot = engine.snapshot(Default::default()).unwrap(); diff --git a/tests/benches/hierarchy/txn/mod.rs b/tests/benches/hierarchy/txn/mod.rs index 840d4ac81fa..0bdb7ae8870 100644 --- a/tests/benches/hierarchy/txn/mod.rs +++ b/tests/benches/hierarchy/txn/mod.rs @@ -14,7 +14,7 @@ use txn_types::{Key, Mutation, TimeStamp}; use super::{BenchConfig, EngineFactory, DEFAULT_ITERATIONS}; fn setup_prewrite( - engine: &E, + engine: &mut E, config: &BenchConfig, start_ts: impl Into, ) -> Vec @@ -61,7 +61,7 @@ where } fn txn_prewrite>(b: &mut Bencher<'_>, config: &BenchConfig) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( @@ -109,11 +109,12 @@ fn txn_prewrite>(b: &mut Bencher<'_>, config: &Be } fn txn_commit>(b: &mut Bencher<'_>, config: &BenchConfig) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); + let mut engine_clone = engine.clone(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 1), + || setup_prewrite(&mut engine_clone, config, 1), |keys| { for key in keys { let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -132,11 +133,12 @@ fn txn_rollback_prewrote>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); + let mut engine_clone = engine.clone(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 1), + || setup_prewrite(&mut engine_clone, config, 1), 
|keys| { for key in keys { let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -155,11 +157,12 @@ fn txn_rollback_conflict>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); + let mut engine_clone = engine.clone(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( - || setup_prewrite(&engine, config, 2), + || setup_prewrite(&mut engine_clone, config, 2), |keys| { for key in keys { let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -178,7 +181,7 @@ fn txn_rollback_non_prewrote>( b: &mut Bencher<'_>, config: &BenchConfig, ) { - let engine = config.engine_factory.build(); + let mut engine = config.engine_factory.build(); let ctx = Context::default(); let cm = ConcurrencyManager::new(1.into()); b.iter_batched( diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index 7802a90beac..a949570ebe1 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -119,7 +119,7 @@ impl RaftStoreRouter for SyncBenchRouter { impl LocalReadRouter for SyncBenchRouter { fn read( - &self, + &mut self, _: Option, req: RaftCmdRequest, cb: Callback, @@ -127,7 +127,7 @@ impl LocalReadRouter for SyncBenchRouter { self.send_command(req, cb, RaftCmdExtraOpts::default()) } - fn release_snapshot_cache(&self) {} + fn release_snapshot_cache(&mut self) {} } fn new_engine() -> (TempDir, RocksEngine) { @@ -180,7 +180,7 @@ fn bench_async_snapshot(b: &mut test::Bencher) { region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(5); let (_tmp, db) = new_engine(); - let kv = RaftKv::new( + let mut kv = RaftKv::new( SyncBenchRouter::new(region.clone(), db.clone()), db, Arc::new(RwLock::new(HashSet::default())), diff --git a/tests/benches/misc/storage/incremental_get.rs b/tests/benches/misc/storage/incremental_get.rs index a57bd3c90d5..336f99cd35e 100644 --- 
a/tests/benches/misc/storage/incremental_get.rs +++ b/tests/benches/misc/storage/incremental_get.rs @@ -30,7 +30,7 @@ fn table_lookup_gen_data() -> (SnapshotStore>, Vec) { .unwrap(); store.commit(Context::default(), keys, 1, 2).unwrap(); - let engine = store.get_engine(); + let mut engine = store.get_engine(); let db = engine.get_rocksdb().get_sync_db(); db.compact_range_cf(db.cf_handle("write").unwrap(), None, None); db.compact_range_cf(db.cf_handle("default").unwrap(), None, None); diff --git a/tests/failpoints/cases/test_gc_metrics.rs b/tests/failpoints/cases/test_gc_metrics.rs index f96c03fe9f9..e698031f0bc 100644 --- a/tests/failpoints/cases/test_gc_metrics.rs +++ b/tests/failpoints/cases/test_gc_metrics.rs @@ -53,14 +53,14 @@ fn test_txn_create_compaction_filter() { cfg.writecf.dynamic_level_bytes = false; let dir = tempfile::TempDir::new().unwrap(); let builder = TestEngineBuilder::new().path(dir.path()); - let engine = builder.build_with_cfg(&cfg).unwrap(); + let mut engine = builder.build_with_cfg(&cfg).unwrap(); let raw_engine = engine.get_rocksdb(); let mut gc_runner = TestGcRunner::new(0); let value = vec![b'v'; 512]; - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); gc_runner .safe_point(TimeStamp::new(1).into_inner()) @@ -87,27 +87,27 @@ fn test_txn_mvcc_filtered() { MVCC_VERSIONS_HISTOGRAM.reset(); GC_COMPACTION_FILTERED.reset(); - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); let value = vec![b'v'; 512]; let mut gc_runner = TestGcRunner::new(0); // GC can't delete keys after the given safe point. 
- must_prewrite_put(&engine, b"zkey", &value, b"zkey", 100); - must_commit(&engine, b"zkey", 100, 110); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 100); + must_commit(&mut engine, b"zkey", 100, 110); gc_runner.safe_point(50).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); // GC can't delete keys before the safe ponit if they are latest versions. gc_runner.safe_point(200).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); - must_prewrite_put(&engine, b"zkey", &value, b"zkey", 120); - must_commit(&engine, b"zkey", 120, 130); + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 120); + must_commit(&mut engine, b"zkey", 120, 130); // GC can't delete the latest version before the safe ponit. gc_runner.safe_point(115).gc(&raw_engine); - must_get(&engine, b"zkey", 110, &value); + must_get(&mut engine, b"zkey", 110, &value); // GC a version will also delete the key on default CF. 
gc_runner.safe_point(200).gc(&raw_engine); @@ -135,7 +135,7 @@ fn test_txn_gc_keys_handled() { GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED.reset(); let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine.clone()); + let mut prefixed_engine = PrefixedEngine(engine.clone()); let (tx, _rx) = mpsc::channel(); let feature_gate = FeatureGate::default(); @@ -172,10 +172,10 @@ fn test_txn_gc_keys_handled() { for i in 0..3 { let k = format!("k{:02}", i).into_bytes(); - must_prewrite_put(&prefixed_engine, &k, b"value", &k, 101); - must_commit(&prefixed_engine, &k, 101, 102); - must_prewrite_delete(&prefixed_engine, &k, &k, 151); - must_commit(&prefixed_engine, &k, 151, 152); + must_prewrite_put(&mut prefixed_engine, &k, b"value", &k, 101); + must_commit(&mut prefixed_engine, &k, 101, 102); + must_prewrite_delete(&mut prefixed_engine, &k, &k, 151); + must_commit(&mut prefixed_engine, &k, 151, 152); } db.flush_cf(cf, true).unwrap(); diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 73031b10283..5845d4d4eb7 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -315,16 +315,16 @@ fn test_collect_applying_locks() { // correctly. 
#[test] fn test_error_in_compaction_filter() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let raw_engine = engine.get_rocksdb(); let large_value = vec![b'x'; 300]; - must_prewrite_put(&engine, b"zkey", &large_value, b"zkey", 101); - must_commit(&engine, b"zkey", 101, 102); - must_prewrite_put(&engine, b"zkey", &large_value, b"zkey", 103); - must_commit(&engine, b"zkey", 103, 104); - must_prewrite_delete(&engine, b"zkey", b"zkey", 105); - must_commit(&engine, b"zkey", 105, 106); + must_prewrite_put(&mut engine, b"zkey", &large_value, b"zkey", 101); + must_commit(&mut engine, b"zkey", 101, 102); + must_prewrite_put(&mut engine, b"zkey", &large_value, b"zkey", 103); + must_commit(&mut engine, b"zkey", 103, 104); + must_prewrite_delete(&mut engine, b"zkey", b"zkey", 105); + must_commit(&mut engine, b"zkey", 105, 106); let fp = "write_compaction_filter_flush_write_batch"; fail::cfg(fp, "return").unwrap(); @@ -339,8 +339,8 @@ fn test_error_in_compaction_filter() { } // Although versions on default CF is not cleaned, write CF is GCed correctly. 
- must_get_none(&engine, b"zkey", 102); - must_get_none(&engine, b"zkey", 104); + must_get_none(&mut engine, b"zkey", 102); + must_get_none(&mut engine, b"zkey", 104); fail::remove(fp); } diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 933fce2add0..101cf30d446 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -512,7 +512,7 @@ fn test_async_commit_prewrite_with_stale_max_ts() { let mut cluster = new_server_cluster(0, 2); cluster.run(); - let engine = cluster + let mut engine = cluster .sim .read() .unwrap() diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index d3e9e08500f..1a6f2da9b87 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -42,28 +42,28 @@ use txn_types::{Key, Mutation, PessimisticLock, TimeStamp}; #[test] fn test_txn_failpoints() { - let engine = TestEngineBuilder::new().build().unwrap(); + let mut engine = TestEngineBuilder::new().build().unwrap(); let (k, v) = (b"k", b"v"); fail::cfg("prewrite", "return(WriteConflict)").unwrap(); - must_prewrite_put_err(&engine, k, v, k, 10); + must_prewrite_put_err(&mut engine, k, v, k, 10); fail::remove("prewrite"); - must_prewrite_put(&engine, k, v, k, 10); + must_prewrite_put(&mut engine, k, v, k, 10); fail::cfg("commit", "delay(100)").unwrap(); - must_commit(&engine, k, 10, 20); + must_commit(&mut engine, k, 10, 20); fail::remove("commit"); let v1 = b"v1"; let (k2, v2) = (b"k2", b"v2"); - must_acquire_pessimistic_lock(&engine, k, k, 30, 30); + must_acquire_pessimistic_lock(&mut engine, k, k, 30, 30); fail::cfg("pessimistic_prewrite", "return()").unwrap(); - must_pessimistic_prewrite_put_err(&engine, k, v1, k, 30, 30, DoPessimisticCheck); - must_prewrite_put(&engine, k2, v2, k2, 31); + must_pessimistic_prewrite_put_err(&mut engine, k, v1, k, 30, 30, DoPessimisticCheck); + must_prewrite_put(&mut engine, k2, 
v2, k2, 31); fail::remove("pessimistic_prewrite"); - must_pessimistic_prewrite_put(&engine, k, v1, k, 30, 30, DoPessimisticCheck); - must_commit(&engine, k, 30, 40); - must_commit(&engine, k2, 31, 41); - must_get(&engine, k, 50, v1); - must_get(&engine, k2, 50, v2); + must_pessimistic_prewrite_put(&mut engine, k, v1, k, 30, 30, DoPessimisticCheck); + must_commit(&mut engine, k, 30, 40); + must_commit(&mut engine, k2, 31, 41); + must_get(&mut engine, k, 50, v1); + must_get(&mut engine, k2, 50, v2); } #[test] @@ -338,8 +338,8 @@ fn test_max_commit_ts_error() { cm.read_range_check(None, None, |_, _| Err(())).unwrap(); // Two locks should be written, the second one does not async commit. - let l1 = must_locked(&storage.get_engine(), b"k1", 10); - let l2 = must_locked(&storage.get_engine(), b"k2", 10); + let l1 = must_locked(&mut storage.get_engine(), b"k1", 10); + let l2 = must_locked(&mut storage.get_engine(), b"k2", 10); assert!(l1.use_async_commit); assert!(!l2.use_async_commit); } diff --git a/tests/failpoints/cases/test_ttl.rs b/tests/failpoints/cases/test_ttl.rs index 25ffcf6ff4c..12449752285 100644 --- a/tests/failpoints/cases/test_ttl.rs +++ b/tests/failpoints/cases/test_ttl.rs @@ -176,7 +176,7 @@ fn test_ttl_snapshot() { fn test_ttl_snapshot_impl() { fail::cfg("ttl_current_ts", "return(100)").unwrap(); let dir = tempfile::TempDir::new().unwrap(); - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .path(dir.path()) .api_version(F::TAG) .build() @@ -273,7 +273,7 @@ fn test_ttl_iterator() { fn test_ttl_iterator_impl() { fail::cfg("ttl_current_ts", "return(100)").unwrap(); let dir = tempfile::TempDir::new().unwrap(); - let engine = TestEngineBuilder::new() + let mut engine = TestEngineBuilder::new() .path(dir.path()) .api_version(F::TAG) .build() diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index f44b2f99642..6bc7e2fb7b8 100644 --- a/tests/integrations/raftstore/test_merge.rs 
+++ b/tests/integrations/raftstore/test_merge.rs @@ -1161,7 +1161,7 @@ fn test_sync_max_ts_after_region_merge() { let right = cluster.get_region(b"k3"); let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); - let storage = cluster + let mut storage = cluster .sim .read() .unwrap() @@ -1169,7 +1169,7 @@ fn test_sync_max_ts_after_region_merge() { .get(&1) .unwrap() .clone(); - let wait_for_synced = |cluster: &mut Cluster| { + let mut wait_for_synced = |cluster: &mut Cluster| { let region_id = right.get_id(); let leader = cluster.leader_of_region(region_id).unwrap(); let epoch = cluster.get_region_epoch(region_id); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index b360bd3da58..130290e01b8 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -232,7 +232,7 @@ fn test_sync_max_ts_after_leader_transfer() { cluster.run(); let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); - let storage = cluster + let mut storage = cluster .sim .read() .unwrap() @@ -240,7 +240,7 @@ fn test_sync_max_ts_after_leader_transfer() { .get(&1) .unwrap() .clone(); - let wait_for_synced = |cluster: &mut Cluster| { + let mut wait_for_synced = |cluster: &mut Cluster| { let region_id = 1; let leader = cluster.leader_of_region(region_id).unwrap(); let epoch = cluster.get_region_epoch(region_id); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 20a3e5ebeaf..01993fb89cd 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -24,7 +24,7 @@ fn test_raftkv() { let region = cluster.get_region(b""); let leader_id = cluster.leader_of_region(region.get_id()).unwrap(); - let storage = cluster.sim.rl().storages[&leader_id.get_id()].clone(); + let mut storage = cluster.sim.rl().storages[&leader_id.get_id()].clone(); let mut ctx = 
Context::default(); ctx.set_region_id(region.get_id()); @@ -35,11 +35,11 @@ fn test_raftkv() { ..Default::default() }; - get_put(snap_ctx.clone(), &storage); - batch(snap_ctx.clone(), &storage); - seek(snap_ctx.clone(), &storage); - near_seek(snap_ctx.clone(), &storage); - cf(snap_ctx, &storage); + get_put(snap_ctx.clone(), &mut storage); + batch(snap_ctx.clone(), &mut storage); + seek(snap_ctx.clone(), &mut storage); + near_seek(snap_ctx.clone(), &mut storage); + cf(snap_ctx, &mut storage); empty_write(&ctx, &storage); wrong_context(&ctx, &storage); // TODO: test multiple node @@ -59,7 +59,7 @@ fn test_read_leader_in_lease() { let region = cluster.get_region(b""); let leader = cluster.leader_of_region(region.get_id()).unwrap(); - let storage = cluster.sim.rl().storages[&leader.get_id()].clone(); + let mut storage = cluster.sim.rl().storages[&leader.get_id()].clone(); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); @@ -71,14 +71,14 @@ fn test_read_leader_in_lease() { }; // write some data - assert_none(snap_ctx.clone(), &storage, k2); + assert_none(snap_ctx.clone(), &mut storage, k2); must_put(&ctx, &storage, k2, v2); // isolate leader cluster.add_send_filter(IsolationFilterFactory::new(leader.get_store_id())); // leader still in lease, check if can read on leader - assert_eq!(can_read(snap_ctx, &storage, k2, v2), true); + assert_eq!(can_read(snap_ctx, &mut storage, k2, v2), true); } #[test] @@ -95,7 +95,7 @@ fn test_read_index_on_replica() { let region = cluster.get_region(b""); let leader = cluster.leader_of_region(region.get_id()).unwrap(); - let storage = cluster.sim.rl().storages[&leader.get_id()].clone(); + let mut storage = cluster.sim.rl().storages[&leader.get_id()].clone(); let mut ctx = Context::default(); ctx.set_region_id(region.get_id()); @@ -108,7 +108,7 @@ fn test_read_index_on_replica() { // write some data let peers = region.get_peers(); - assert_none(snap_ctx, &storage, k2); + assert_none(snap_ctx, &mut storage, k2); 
must_put(&ctx, &storage, k2, v2); // read on follower @@ -155,7 +155,7 @@ fn test_read_on_replica() { let region = cluster.get_region(b""); let leader = cluster.leader_of_region(region.get_id()).unwrap(); - let leader_storage = cluster.sim.rl().storages[&leader.get_id()].clone(); + let mut leader_storage = cluster.sim.rl().storages[&leader.get_id()].clone(); let mut leader_ctx = Context::default(); leader_ctx.set_region_id(region.get_id()); @@ -168,7 +168,7 @@ fn test_read_on_replica() { // write some data let peers = region.get_peers(); - assert_none(leader_snap_ctx, &leader_storage, k2); + assert_none(leader_snap_ctx, &mut leader_storage, k2); must_put(&leader_ctx, &leader_storage, k2, v2); // read on follower @@ -192,19 +192,19 @@ fn test_read_on_replica() { pb_ctx: &follower_ctx, ..Default::default() }; - let follower_storage = cluster.sim.rl().storages[&follower_id].clone(); - assert_has(follower_snap_ctx.clone(), &follower_storage, k2, v2); + let mut follower_storage = cluster.sim.rl().storages[&follower_id].clone(); + assert_has(follower_snap_ctx.clone(), &mut follower_storage, k2, v2); must_put(&leader_ctx, &leader_storage, k3, v3); - assert_has(follower_snap_ctx.clone(), &follower_storage, k3, v3); + assert_has(follower_snap_ctx.clone(), &mut follower_storage, k3, v3); cluster.stop_node(follower_id); must_put(&leader_ctx, &leader_storage, k4, v4); cluster.run_node(follower_id).unwrap(); - let follower_storage = cluster.sim.rl().storages[&follower_id].clone(); + let mut follower_storage = cluster.sim.rl().storages[&follower_id].clone(); // sleep to ensure the follower has received a heartbeat from the leader thread::sleep(time::Duration::from_millis(300)); - assert_has(follower_snap_ctx, &follower_storage, k4, v4); + assert_has(follower_snap_ctx, &mut follower_storage, k4, v4); } #[test] @@ -263,7 +263,7 @@ fn test_read_on_replica_check_memory_locks() { key_ranges: vec![range], ..Default::default() }; - let follower_storage = 
cluster.sim.rl().storages[&follower_id].clone(); + let mut follower_storage = cluster.sim.rl().storages[&follower_id].clone(); match follower_storage.snapshot(follower_snap_ctx) { Err(Error(box ErrorInner::KeyIsLocked(lock_info))) => { assert_eq!(lock_info, lock.into_lock_info(raw_key.to_vec())) @@ -397,12 +397,12 @@ fn must_delete_cf(ctx: &Context, engine: &E, cf: CfName, key: &[u8]) engine.delete_cf(ctx, cf, Key::from_raw(key)).unwrap(); } -fn assert_has(ctx: SnapContext<'_>, engine: &E, key: &[u8], value: &[u8]) { +fn assert_has(ctx: SnapContext<'_>, engine: &mut E, key: &[u8], value: &[u8]) { let snapshot = engine.snapshot(ctx).unwrap(); assert_eq!(snapshot.get(&Key::from_raw(key)).unwrap().unwrap(), value); } -fn can_read(ctx: SnapContext<'_>, engine: &E, key: &[u8], value: &[u8]) -> bool { +fn can_read(ctx: SnapContext<'_>, engine: &mut E, key: &[u8], value: &[u8]) -> bool { if let Ok(s) = engine.snapshot(ctx) { assert_eq!(s.get(&Key::from_raw(key)).unwrap().unwrap(), value); return true; @@ -412,7 +412,7 @@ fn can_read(ctx: SnapContext<'_>, engine: &E, key: &[u8], value: &[u8 fn assert_has_cf( ctx: SnapContext<'_>, - engine: &E, + engine: &mut E, cf: CfName, key: &[u8], value: &[u8], @@ -424,19 +424,19 @@ fn assert_has_cf( ); } -fn assert_none(ctx: SnapContext<'_>, engine: &E, key: &[u8]) { +fn assert_none(ctx: SnapContext<'_>, engine: &mut E, key: &[u8]) { let snapshot = engine.snapshot(ctx).unwrap(); assert_eq!(snapshot.get(&Key::from_raw(key)).unwrap(), None); } -fn assert_none_cf(ctx: SnapContext<'_>, engine: &E, cf: CfName, key: &[u8]) { +fn assert_none_cf(ctx: SnapContext<'_>, engine: &mut E, cf: CfName, key: &[u8]) { let snapshot = engine.snapshot(ctx).unwrap(); assert_eq!(snapshot.get_cf(cf, &Key::from_raw(key)).unwrap(), None); } fn assert_seek( ctx: SnapContext<'_>, - engine: &E, + engine: &mut E, cf: CfName, key: &[u8], pair: (&[u8], &[u8]), @@ -479,7 +479,7 @@ fn assert_near_reverse_seek(cursor: &mut Cursor, key: &[u8], pai 
assert_eq!(cursor.value(&mut statistics), pair.1); } -fn get_put(ctx: SnapContext<'_>, engine: &E) { +fn get_put(ctx: SnapContext<'_>, engine: &mut E) { assert_none(ctx.clone(), engine, b"x"); must_put(ctx.pb_ctx, engine, b"x", b"1"); assert_has(ctx.clone(), engine, b"x", b"1"); @@ -487,7 +487,7 @@ fn get_put(ctx: SnapContext<'_>, engine: &E) { assert_has(ctx, engine, b"x", b"2"); } -fn batch(ctx: SnapContext<'_>, engine: &E) { +fn batch(ctx: SnapContext<'_>, engine: &mut E) { engine .write( ctx.pb_ctx, @@ -513,7 +513,7 @@ fn batch(ctx: SnapContext<'_>, engine: &E) { assert_none(ctx, engine, b"y"); } -fn seek(ctx: SnapContext<'_>, engine: &E) { +fn seek(ctx: SnapContext<'_>, engine: &mut E) { must_put(ctx.pb_ctx, engine, b"x", b"1"); assert_seek(ctx.clone(), engine, CF_DEFAULT, b"x", (b"x", b"1")); assert_seek(ctx.clone(), engine, CF_DEFAULT, b"a", (b"x", b"1")); @@ -536,7 +536,7 @@ fn seek(ctx: SnapContext<'_>, engine: &E) { must_delete(ctx.pb_ctx, engine, b"z"); } -fn near_seek(ctx: SnapContext<'_>, engine: &E) { +fn near_seek(ctx: SnapContext<'_>, engine: &mut E) { must_put(ctx.pb_ctx, engine, b"x", b"1"); must_put(ctx.pb_ctx, engine, b"z", b"2"); let snapshot = engine.snapshot(ctx.clone()).unwrap(); @@ -562,7 +562,7 @@ fn near_seek(ctx: SnapContext<'_>, engine: &E) { } // TODO: remove following as the code path of cf is the same. -fn cf(ctx: SnapContext<'_>, engine: &E) { +fn cf(ctx: SnapContext<'_>, engine: &mut E) { assert_none_cf(ctx.clone(), engine, "default", b"key"); must_put_cf(ctx.pb_ctx, engine, "default", b"key", b"value"); assert_has_cf(ctx.clone(), engine, "default", b"key", b"value"); From a7e8153fc6f557eef4fda0662c2c024f0fa15a0d Mon Sep 17 00:00:00 2001 From: Lei Zhao Date: Wed, 28 Sep 2022 16:25:44 +0800 Subject: [PATCH 247/676] tikv_util: make LruCache sound (#13552) close tikv/tikv#13551 It's error-prone to use reference/`Box` and raw pointer/`NonNull` at the same time. This PR replaces `Box` by `NonNull` and always use it as parameter. 
Signed-off-by: youjiali1995 Co-authored-by: Ti Chi Robot --- components/tikv_util/src/lru.rs | 83 +++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/components/tikv_util/src/lru.rs b/components/tikv_util/src/lru.rs index d8d2385fc34..2488fe7ef36 100644 --- a/components/tikv_util/src/lru.rs +++ b/components/tikv_util/src/lru.rs @@ -20,37 +20,39 @@ struct ValueEntry { } struct Trace { - head: Box>, - tail: Box>, + head: NonNull>, + tail: NonNull>, tick: usize, sample_mask: usize, } #[inline] -unsafe fn suture(leading: &mut Record, following: &mut Record) { - leading.next = NonNull::new_unchecked(following); - following.prev = NonNull::new_unchecked(leading); +unsafe fn suture(mut leading: NonNull>, mut following: NonNull>) { + leading.as_mut().next = following; + following.as_mut().prev = leading; } #[inline] -unsafe fn cut_out(record: &mut Record) { - suture(record.prev.as_mut(), record.next.as_mut()) +unsafe fn cut_out(record: NonNull>) { + suture(record.as_ref().prev, record.as_ref().next) } impl Trace { fn new(sample_mask: usize) -> Trace { unsafe { - let mut head = Box::new(Record { - prev: NonNull::new_unchecked(1usize as _), - next: NonNull::new_unchecked(1usize as _), + let head = Box::leak(Box::new(Record { + prev: NonNull::dangling(), + next: NonNull::dangling(), key: MaybeUninit::uninit(), - }); - let mut tail = Box::new(Record { - prev: NonNull::new_unchecked(1usize as _), - next: NonNull::new_unchecked(1usize as _), + })) + .into(); + let tail = Box::leak(Box::new(Record { + prev: NonNull::dangling(), + next: NonNull::dangling(), key: MaybeUninit::uninit(), - }); - suture(&mut head, &mut tail); + })) + .into(); + suture(head, tail); Trace { head, @@ -69,17 +71,17 @@ impl Trace { } } - fn promote(&mut self, mut record: NonNull>) { + fn promote(&mut self, record: NonNull>) { unsafe { - cut_out(record.as_mut()); - suture(record.as_mut(), self.head.next.as_mut()); - suture(&mut self.head, record.as_mut()); + 
cut_out(record); + suture(record, self.head.as_ref().next); + suture(self.head, record); } } - fn delete(&mut self, mut record: NonNull>) { + fn delete(&mut self, record: NonNull>) { unsafe { - cut_out(record.as_mut()); + cut_out(record); ptr::drop_in_place(Box::from_raw(record.as_ptr()).key.as_mut_ptr()); } @@ -87,24 +89,24 @@ impl Trace { fn create(&mut self, key: K) -> NonNull> { let record = Box::leak(Box::new(Record { - prev: unsafe { NonNull::new_unchecked(&mut *self.head) }, - next: self.head.next, + prev: self.head, + next: unsafe { self.head.as_ref().next }, key: MaybeUninit::new(key), })) .into(); unsafe { - self.head.next.as_mut().prev = record; - self.head.next = record; + self.head.as_mut().next.as_mut().prev = record; + self.head.as_mut().next = record; } record } fn reuse_tail(&mut self, key: K) -> (K, NonNull>) { unsafe { - let mut record = self.tail.prev; - cut_out(record.as_mut()); - suture(record.as_mut(), self.head.next.as_mut()); - suture(&mut self.head, record.as_mut()); + let mut record = self.tail.as_ref().prev; + cut_out(record); + suture(record, self.head.as_ref().next); + suture(self.head, record); let old_key = record.as_mut().key.as_ptr().read(); record.as_mut().key = MaybeUninit::new(key); @@ -113,21 +115,21 @@ impl Trace { } fn clear(&mut self) { - let mut cur = self.head.next; unsafe { - while cur.as_ptr() != &mut *self.tail { - let tmp = cur.as_mut().next; + let mut cur = self.head.as_ref().next; + while cur != self.tail { + let tmp = cur.as_ref().next; ptr::drop_in_place(Box::from_raw(cur.as_ptr()).key.as_mut_ptr()); cur = tmp; } - suture(&mut self.head, &mut self.tail); + suture(self.head, self.tail); } } fn remove_tail(&mut self) -> K { unsafe { - let mut record = self.tail.prev; - cut_out(record.as_mut()); + let record = self.tail.as_ref().prev; + cut_out(record); let r = Box::from_raw(record.as_ptr()); r.key.as_ptr().read() @@ -135,6 +137,15 @@ impl Trace { } } +impl Drop for Trace { + fn drop(&mut self) { + unsafe { + 
drop(Box::from_raw(self.head.as_ptr())); + drop(Box::from_raw(self.tail.as_ptr())); + } + } +} + pub trait SizePolicy { fn current(&self) -> usize; fn on_insert(&mut self, key: &K, value: &V); From c6f4f1cbfd82a349996c3a2fce386879d1537149 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 28 Sep 2022 17:01:45 +0800 Subject: [PATCH 248/676] engine: fix performance issue of deleting files in ranges (#13540) close tikv/tikv#13534 Signed-off-by: tabokie --- Cargo.lock | 6 ++-- components/engine_rocks/src/misc.rs | 45 ++++++++++++++++++++--------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12f1271156d..bb3b33463cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2793,7 +2793,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" +source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2812,7 +2812,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" +source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" dependencies = [ "bzip2-sys", "cc", @@ -4655,7 +4655,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#4c859a208355bc15ceb7dc1f05303f68acfb4791" +source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" dependencies = [ "libc 0.2.132", "librocksdb_sys", diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index e7c9ef547d8..482686ffd1a 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -4,6 +4,7 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, 
IterOptions, Iterable, Iterator, MiscExt, Mutable, Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, }; +use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ @@ -150,26 +151,42 @@ impl MiscExt for RocksEngine { match strategy { DeleteStrategy::DeleteFiles => { let handle = util::get_cf_handle(self.as_inner(), cf)?; - for r in ranges { - if r.start_key >= r.end_key { - continue; - } - self.as_inner() - .delete_files_in_range_cf(handle, r.start_key, r.end_key, false) - .map_err(r2e)?; + let rocks_ranges: Vec<_> = ranges + .iter() + .filter_map(|r| { + if r.start_key >= r.end_key { + None + } else { + Some(RocksRange::new(r.start_key, r.end_key)) + } + }) + .collect(); + if rocks_ranges.is_empty() { + return Ok(()); } + self.as_inner() + .delete_files_in_ranges_cf(handle, &rocks_ranges, false) + .map_err(r2e)?; } DeleteStrategy::DeleteBlobs => { let handle = util::get_cf_handle(self.as_inner(), cf)?; if self.is_titan() { - for r in ranges { - if r.start_key >= r.end_key { - continue; - } - self.as_inner() - .delete_blob_files_in_range_cf(handle, r.start_key, r.end_key, false) - .map_err(r2e)?; + let rocks_ranges: Vec<_> = ranges + .iter() + .filter_map(|r| { + if r.start_key >= r.end_key { + None + } else { + Some(RocksRange::new(r.start_key, r.end_key)) + } + }) + .collect(); + if rocks_ranges.is_empty() { + return Ok(()); } + self.as_inner() + .delete_blob_files_in_ranges_cf(handle, &rocks_ranges, false) + .map_err(r2e)?; } } DeleteStrategy::DeleteByRange => { From d96b8d100b6982f79dadd61a449c743889e21a88 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 29 Sep 2022 11:35:45 +0800 Subject: [PATCH 249/676] storage: Rollback all locks to prevent from losing the pessimistic lock in flashback (#13521) close tikv/tikv#13493 Rollback all locks to prevent from losing the pessimistic lock in flashback Signed-off-by: husharp --- .../txn/actions/flashback_to_version.rs | 89 +++++++++++++------ 
.../integrations/raftstore/test_flashback.rs | 10 +-- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 5fcf0327c37..e160a4a43b9 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -1,10 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; +use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; use crate::storage::{ mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, - txn::{Error, ErrorInner, Result as TxnResult}, + txn::{actions::check_txn_status::rollback_lock, Error, ErrorInner, Result as TxnResult}, Snapshot, Statistics, }; @@ -85,7 +85,6 @@ pub fn flashback_to_version( start_ts: TimeStamp, commit_ts: TimeStamp, ) -> TxnResult { - let mut rows = 0; // To flashback the `CF_LOCK`, we need to delete all locks records whose // `start_ts` is greater than the specified version, and if it's not a // short-value `LockType::Put`, we need to delete the actual data from @@ -96,14 +95,16 @@ pub fn flashback_to_version( *next_lock_key = Some(key); break; } - txn.unlock_key(key.clone(), lock.is_pessimistic_txn()); - rows += 1; - // If the short value is none and it's a `LockType::Put`, we should delete the - // corresponding key from `CF_DEFAULT` as well. 
- if lock.short_value.is_none() && lock.lock_type == LockType::Put { - txn.delete_value(key, lock.ts); - rows += 1; - } + // To guarantee rollback with start ts of the locks + reader.start_ts = lock.ts; + rollback_lock( + txn, + reader, + key.clone(), + &lock, + lock.is_pessimistic_txn(), + true, + )?; } // To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC // record for each key in `self.keys` with its old value at `self.version`, @@ -127,7 +128,6 @@ pub fn flashback_to_version( start_ts, reader.load_data(&key, old_write.clone())?, ); - rows += 1; } Write::new(old_write.write_type, start_ts, old_write.short_value) } else { @@ -141,9 +141,8 @@ pub fn flashback_to_version( Write::new(WriteType::Delete, start_ts, None) }; txn.put_write(key.clone(), commit_ts, new_write.as_ref().to_bytes()); - rows += 1; } - Ok(rows) + Ok(txn.modifies.len()) } #[cfg(test)] @@ -156,14 +155,18 @@ pub mod tests { use super::*; use crate::storage::{ mvcc::tests::{must_get, must_get_none, write}, - txn::actions::{ - commit::tests::must_succeed as must_commit, - tests::{must_prewrite_delete, must_prewrite_put, must_rollback}, + txn::{ + actions::{ + acquire_pessimistic_lock::tests::must_pessimistic_locked, + commit::tests::must_succeed as must_commit, + tests::{must_prewrite_delete, must_prewrite_put, must_rollback}, + }, + tests::{must_acquire_pessimistic_lock, must_pessimistic_prewrite_put_err}, }, Engine, TestEngineBuilder, }; - fn must_flashback_write( + fn must_flashback_to_version( engine: &mut E, key: &[u8], version: impl Into, @@ -176,6 +179,10 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); let mut statistics = Statistics::default(); + let (key_locks, has_remain_locks) = + flashback_to_version_read_lock(&mut reader, &Some(key.clone()), &None, &mut statistics) + .unwrap(); + assert!(!has_remain_locks); let (key_old_writes, has_remain_writes) = 
flashback_to_version_read_write( &mut reader, 0, @@ -197,7 +204,7 @@ pub mod tests { &mut reader, &mut None, &mut Some(key), - vec![], + key_locks, key_old_writes, start_ts, commit_ts, @@ -236,43 +243,43 @@ pub mod tests { must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 1 with start_ts = 14, commit_ts = 15. assert_eq!( - must_flashback_write(&mut engine, k, 1, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 2 with start_ts = 17, commit_ts = 18. assert_eq!( - must_flashback_write(&mut engine, k, 2, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 2, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 5 with start_ts = 20, commit_ts = 21. assert_eq!( - must_flashback_write(&mut engine, k, 5, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 5, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 7 with start_ts = 23, commit_ts = 24. assert_eq!( - must_flashback_write(&mut engine, k, 7, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 7, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 10 with start_ts = 26, commit_ts = 27. assert_eq!( - must_flashback_write(&mut engine, k, 10, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 10, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 13 with start_ts = 29, commit_ts = 30. assert_eq!( - must_flashback_write(&mut engine, k, 13, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 13, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 27 with start_ts = 32, commit_ts = 33. 
assert_eq!( - must_flashback_write(&mut engine, k, 27, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 27, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); @@ -291,9 +298,37 @@ pub mod tests { // Since the key has been deleted, flashback to version 1 should not do // anything. assert_eq!( - must_flashback_write(&mut engine, k, ts, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, ts, *ts.incr(), *ts.incr()), 0 ); must_get_none(&mut engine, k, ts); } + + #[test] + fn test_flashback_to_version_pessimistic() { + use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k"; + let (v1, v2, v3) = (b"v1", b"v2", b"v3"); + // Prewrite and commit Put(k -> v1) with stat_ts = 10, commit_ts = 15. + must_prewrite_put(&mut engine, k, v1, k, 10); + must_commit(&mut engine, k, 10, 15); + // Prewrite and commit Put(k -> v2) with stat_ts = 20, commit_ts = 25. + must_prewrite_put(&mut engine, k, v2, k, 20); + must_commit(&mut engine, k, 20, 25); + + must_acquire_pessimistic_lock(&mut engine, k, k, 30, 30); + must_pessimistic_locked(&mut engine, k, 30, 30); + + // Flashback to version 17 with start_ts = 35, commit_ts = 40. + // Distinguish from pessimistic start_ts 30 to make sure rollback ts is by lock + // ts. + assert_eq!(must_flashback_to_version(&mut engine, k, 17, 35, 40), 3); + + // Pessimistic Prewrite Put(k -> v3) with stat_ts = 30 will be error with + // Rollback. 
+ must_pessimistic_prewrite_put_err(&mut engine, k, v3, k, 30, 30, DoPessimisticCheck); + must_get(&mut engine, k, 45, v1); + } } diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index cf91873d385..064edebf88a 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -8,7 +8,7 @@ use test_raftstore::*; use txn_types::WriteBatchFlags; #[test] -fn test_flahsback_for_applied_index() { +fn test_flashback_for_applied_index() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -79,7 +79,7 @@ fn test_flashback_for_schedule() { } #[test] -fn test_flahsback_for_write() { +fn test_flashback_for_write() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -107,7 +107,7 @@ fn test_flahsback_for_write() { } #[test] -fn test_flahsback_for_read() { +fn test_flashback_for_read() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -141,7 +141,7 @@ fn test_flahsback_for_read() { // However, when flashback is enabled, it will make the lease None and prevent // renew lease. #[test] -fn test_flahsback_for_local_read() { +fn test_flashback_for_local_read() { let mut cluster = new_node_cluster(0, 3); let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); @@ -208,7 +208,7 @@ fn test_flahsback_for_local_read() { } #[test] -fn test_flahsback_for_status_cmd_as_region_detail() { +fn test_flashback_for_status_cmd_as_region_detail() { let mut cluster = new_node_cluster(0, 3); cluster.run(); From 5bc1fa7183886c2b0098c47e80df721037b193aa Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 29 Sep 2022 13:49:45 +0800 Subject: [PATCH 250/676] storage: remove histogram about locking key in prewrite (#13527) close tikv/tikv#13526 It is impossible to get blocked when acquiring a lock from the concurrency manager when generating async-commit timestamp because the latch already ensures there can't be race on the same key. 
So, the histogram recording the duration is useless and is removed in this commit. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/mvcc/metrics.rs | 9 ++------- src/storage/txn/actions/prewrite.rs | 8 +++----- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/src/storage/mvcc/metrics.rs b/src/storage/mvcc/metrics.rs index ddfdc14f5ef..3c4bda63f7e 100644 --- a/src/storage/mvcc/metrics.rs +++ b/src/storage/mvcc/metrics.rs @@ -68,12 +68,6 @@ lazy_static! { exponential_buckets(1.0, 2.0, 30).unwrap() ) .unwrap(); - pub static ref CONCURRENCY_MANAGER_LOCK_DURATION_HISTOGRAM: Histogram = register_histogram!( - "tikv_concurrency_manager_lock_duration", - "Histogram of the duration of lock key in the concurrency manager", - exponential_buckets(1e-7, 2.0, 20).unwrap() // 100ns ~ 100ms - ) - .unwrap(); pub static ref MVCC_CONFLICT_COUNTER: MvccConflictCounterVec = { register_static_int_counter_vec!( MvccConflictCounterVec, @@ -107,6 +101,7 @@ lazy_static! { "tikv_storage_mvcc_prewrite_assertion_perf", "Counter of assertion operations in transactions", &["type"] - ).unwrap() + ) + .unwrap() }; } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index a8a33799686..40709032d61 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -16,8 +16,8 @@ use txn_types::{ use crate::storage::{ mvcc::{ metrics::{ - CONCURRENCY_MANAGER_LOCK_DURATION_HISTOGRAM, MVCC_CONFLICT_COUNTER, - MVCC_DUPLICATE_CMD_COUNTER_VEC, MVCC_PREWRITE_ASSERTION_PERF_COUNTER_VEC, + MVCC_CONFLICT_COUNTER, MVCC_DUPLICATE_CMD_COUNTER_VEC, + MVCC_PREWRITE_ASSERTION_PERF_COUNTER_VEC, }, Error, ErrorInner, Lock, LockType, MvccTxn, Result, SnapshotReader, }, @@ -646,9 +646,7 @@ fn async_commit_timestamps( ) -> Result { // This operation should not block because the latch makes sure only one thread // is operating on this key. 
- let key_guard = CONCURRENCY_MANAGER_LOCK_DURATION_HISTOGRAM.observe_closure_duration(|| { - ::futures_executor::block_on(txn.concurrency_manager.lock_key(key)) - }); + let key_guard = ::futures_executor::block_on(txn.concurrency_manager.lock_key(key)); let final_min_commit_ts = key_guard.with_lock(|l| { let max_ts = txn.concurrency_manager.max_ts(); From 9321040f2eea6617e486c58b959d40abe527aa82 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Thu, 29 Sep 2022 14:15:45 +0800 Subject: [PATCH 251/676] storage: Add new implementation of lock waiting queue (#13486) ref tikv/tikv#13298 Implements a new version of lock waiting queue, which is important for supporting resumable acquire_pessimistic_lock requests. (ref #13298) * Make `storage::lock_manager` an directory * Add new implementation of the lock waiting queue and other related stuff. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- Cargo.lock | 9 + Cargo.toml | 3 + src/storage/errors.rs | 31 + src/storage/lock_manager/lock_wait_context.rs | 270 +++++ .../lock_manager/lock_waiting_queue.rs | 977 ++++++++++++++++++ .../{lock_manager.rs => lock_manager/mod.rs} | 15 + src/storage/types.rs | 26 + 7 files changed, 1331 insertions(+) create mode 100644 src/storage/lock_manager/lock_wait_context.rs create mode 100644 src/storage/lock_manager/lock_waiting_queue.rs rename src/storage/{lock_manager.rs => lock_manager/mod.rs} (90%) diff --git a/Cargo.lock b/Cargo.lock index bb3b33463cf..821e15edc18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5561,6 +5561,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "sync_wrapper" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" + [[package]] name = "sysinfo" version = "0.16.4" @@ -6157,6 +6163,7 @@ dependencies = [ "crc32fast", "crc64fast", "crossbeam", + "dashmap", "encryption_export", "engine_panic", 
"engine_rocks", @@ -6228,8 +6235,10 @@ dependencies = [ "serde_json", "slog", "slog-global", + "smallvec", "sst_importer", "strum 0.20.0", + "sync_wrapper", "sysinfo", "tempfile", "test_sst_importer", diff --git a/Cargo.toml b/Cargo.toml index 13479b2a8fb..f51e2ddd303 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,7 @@ coprocessor_plugin_api = { path = "components/coprocessor_plugin_api" } crc32fast = "1.2" crc64fast = "0.1" crossbeam = "0.8" +dashmap = "5" encryption_export = { path = "components/encryption/export", default-features = false } engine_panic = { path = "components/engine_panic", default-features = false } engine_rocks = { path = "components/engine_rocks", default-features = false } @@ -146,8 +147,10 @@ serde_ignored = "0.1" serde_json = { version = "1.0", features = ["preserve_order"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +smallvec = "1.4" sst_importer = { path = "components/sst_importer", default-features = false } strum = { version = "0.20", features = ["derive"] } +sync_wrapper = "0.1.1" sysinfo = "0.16" tempfile = "3.0" thiserror = "1.0" diff --git a/src/storage/errors.rs b/src/storage/errors.rs index faf12f34003..7ce5d925dfa 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -2,9 +2,11 @@ //! Types for storage related errors and associated helper methods. use std::{ + convert::TryFrom, error::Error as StdError, fmt::{self, Debug, Display, Formatter}, io::Error as IoError, + sync::Arc, }; use error_code::{self, ErrorCode, ErrorCodeExt}; @@ -456,6 +458,35 @@ pub fn extract_key_errors(res: Result>>) -> Vec); + +impl From for SharedError { + fn from(e: ErrorInner) -> Self { + Self(Arc::new(e)) + } +} + +impl From for SharedError { + fn from(e: Error) -> Self { + Self(Arc::from(e.0)) + } +} + +/// Tries to convert the shared error to owned one. 
It can success only when +/// it's the only reference to the error. +impl TryFrom for Error { + type Error = (); + + fn try_from(e: SharedError) -> std::result::Result { + Arc::try_unwrap(e.0).map(Into::into).map_err(|_| ()) + } +} + #[cfg(test)] mod test { use kvproto::kvrpcpb::WriteConflictReason; diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs new file mode 100644 index 00000000000..97ff49f965b --- /dev/null +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -0,0 +1,270 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! Holds the state of a lock-waiting `AcquirePessimisticLock` request. +//! +//! When an `AcquirePessimisticLock` request meets a lock and enters +//! lock-waiting state, it then may be either woken up by popping from the +//! [`LockWaitingQueue`](super::lock_waiting_queue::LockWaitQueues), +//! or cancelled by the +//! [`WaiterManager`](crate::server::lock_manager::WaiterManager) due to +//! timeout. [`LockWaitContext`] is therefore used to share the necessary state +//! of a single `AcquirePessimisticLock` request, and ensuring the internal +//! callback for returning response through RPC is called at most only once. +//! +//! Note: The corresponding implementation in `WaiterManager` is not yet +//! implemented, and this mod is currently not used yet. + +use std::{ + convert::TryInto, + result::Result, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use parking_lot::Mutex; +use txn_types::TimeStamp; + +use crate::storage::{ + errors::SharedError, + lock_manager::{lock_waiting_queue::PessimisticLockKeyCallback, LockManager, LockWaitToken}, + Error as StorageError, PessimisticLockRes, ProcessResult, StorageCallback, +}; + +pub struct LockWaitContextInner { + /// The callback for finishing the current AcquirePessimisticLock request. 
+ /// Usually, requests are accepted from RPC, and in this case calling + /// the callback means returning the response to the client via RPC. + cb: StorageCallback, + + /// The token of the corresponding waiter in `LockManager`. + #[allow(dead_code)] + lock_wait_token: LockWaitToken, +} + +/// The content of the `LockWaitContext` that needs to be shared among all +/// clones. +/// +/// When a AcquirePessimisticLock request meets lock and enters lock waiting +/// state, a `LockWaitContext` will be created, and the +/// `LockWaitContextSharedState` will be shared in these places: +/// * Callbacks created from the `lockWaitContext` and distributed to the lock +/// waiting queue and the `LockManager`. When one of the callbacks is called +/// and the request is going to be finished, they need to take the +/// [`LockWaitContextInner`] to call the callback. +/// * The [`LockWaitEntry`](crate::storage::lock_manager::lock_waiting_queue::LockWaitEntry), for +/// checking whether the request is already finished (cancelled). +pub struct LockWaitContextSharedState { + ctx_inner: Mutex>, + pub finished: AtomicBool, +} + +impl LockWaitContextSharedState { + /// Checks whether the lock-waiting request is already finished. 
+ pub fn is_finished(&self) -> bool { + self.finished.load(Ordering::Acquire) + } +} + +#[derive(Clone)] +pub struct LockWaitContext { + shared_states: Arc, + #[allow(dead_code)] + lock_manager: L, + allow_lock_with_conflict: bool, + + // Fields for logging: + start_ts: TimeStamp, + for_update_ts: TimeStamp, +} + +impl LockWaitContext { + pub fn new( + lock_manager: L, + lock_wait_token: LockWaitToken, + start_ts: TimeStamp, + for_update_ts: TimeStamp, + cb: StorageCallback, + allow_lock_with_conflict: bool, + ) -> Self { + let inner = LockWaitContextInner { + cb, + lock_wait_token, + }; + Self { + shared_states: Arc::new(LockWaitContextSharedState { + ctx_inner: Mutex::new(Some(inner)), + finished: AtomicBool::new(false), + }), + lock_manager, + allow_lock_with_conflict, + start_ts, + for_update_ts, + } + } + + pub fn get_shared_states(&self) -> &Arc { + &self.shared_states + } + + /// Get the callback that should be invoked when finishes executing the + /// scheduler command that issued the lock-waiting. + /// + /// When we support partially finishing a pessimistic lock request (i.e. + /// when acquiring lock multiple keys in one single request, allowing + /// some keys to be locked successfully while the others are blocked or + /// failed), this will be useful for handling the result of the first + /// write batch. But currently, the first write batch of a lock-waiting + /// request is always empty, so the callback is just noop. + pub fn get_callback_for_first_write_batch(&self) -> StorageCallback { + StorageCallback::Boolean(Box::new(|res| { + res.unwrap(); + })) + } + + /// Get the callback that should be called when the request is woken up on a + /// key. + pub fn get_callback_for_blocked_key(&self) -> PessimisticLockKeyCallback { + let ctx = self.clone(); + Box::new(move |res| { + ctx.finish_request(res); + }) + } + + /// Get the callback that's used to cancel a lock-waiting request. 
Usually + /// called by + /// [`WaiterManager`](crate::server::lock_manager::WaiterManager) due to + /// timeout. + pub fn get_callback_for_cancellation(&self) -> impl FnOnce(StorageError) { + let ctx = self.clone(); + move |e| { + ctx.finish_request(Err(e.into())); + } + } + + fn finish_request(&self, result: Result) { + let ctx_inner = if let Some(inner) = self.shared_states.ctx_inner.lock().take() { + inner + } else { + debug!("double invoking of finish_request of LockWaitContext"; + "start_ts" => self.start_ts, + "for_update_ts" => self.for_update_ts + ); + return; + }; + + self.shared_states.finished.store(true, Ordering::Release); + + // TODO: Uncomment this after the corresponding change of `LockManager` is done. + // self.lock_manager + // .remove_lock_wait(ctx_inner.lock_wait_token); + + if !self.allow_lock_with_conflict { + // The result must be an owned error. + let err = result.unwrap_err().try_into().unwrap(); + ctx_inner.cb.execute(ProcessResult::Failed { err }); + return; + } + + // The following code is only valid after implementing the new lock-waiting + // model. + unreachable!(); + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::mpsc::{channel, Receiver}, + time::Duration, + }; + + use super::*; + use crate::storage::{ + lock_manager::DummyLockManager, + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, + txn::{Error as TxnError, ErrorInner as TxnErrorInner}, + ErrorInner as StorageErrorInner, Result as StorageResult, + }; + + fn create_storage_cb() -> ( + StorageCallback, + Receiver>>, + ) { + let (tx, rx) = channel(); + let cb = StorageCallback::PessimisticLock(Box::new(move |r| tx.send(r).unwrap())); + (cb, rx) + } + + fn create_test_lock_wait_ctx() -> ( + LockWaitContext, + Receiver>>, + ) { + // TODO: Use `ProxyLockMgr` to check the correctness of the `remove_lock_wait` + // invocation. 
+ let lock_mgr = DummyLockManager {}; + let (cb, rx) = create_storage_cb(); + let ctx = LockWaitContext::new( + lock_mgr, + super::super::LockWaitToken(Some(1)), + 1.into(), + 1.into(), + cb, + false, + ); + (ctx, rx) + } + + #[test] + fn test_lock_wait_context() { + let write_conflict = || { + StorageErrorInner::Txn(TxnError::from(TxnErrorInner::Mvcc(MvccError::from( + MvccErrorInner::WriteConflict { + start_ts: 1.into(), + conflict_start_ts: 2.into(), + conflict_commit_ts: 2.into(), + key: b"k1".to_vec(), + primary: b"k1".to_vec(), + reason: kvproto::kvrpcpb::WriteConflictReason::PessimisticRetry, + }, + )))) + }; + let key_is_locked = || { + StorageErrorInner::Txn(TxnError::from(TxnErrorInner::Mvcc(MvccError::from( + MvccErrorInner::KeyIsLocked(kvproto::kvrpcpb::LockInfo::default()), + )))) + }; + + let (ctx, rx) = create_test_lock_wait_ctx(); + // Nothing happens currently. + (ctx.get_callback_for_first_write_batch()).execute(ProcessResult::Res); + rx.recv_timeout(Duration::from_millis(20)).unwrap_err(); + (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict()))); + let res = rx.recv().unwrap().unwrap_err(); + assert!(matches!( + &res, + StorageError(box StorageErrorInner::Txn(TxnError( + box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::WriteConflict { .. })) + ))) + )); + // The tx should be dropped. + rx.recv().unwrap_err(); + // Nothing happens if the callback is double-called. + (ctx.get_callback_for_cancellation())(StorageError::from(key_is_locked())); + + let (ctx, rx) = create_test_lock_wait_ctx(); + (ctx.get_callback_for_cancellation())(StorageError::from(key_is_locked())); + let res = rx.recv().unwrap().unwrap_err(); + assert!(matches!( + &res, + StorageError(box StorageErrorInner::Txn(TxnError( + box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::KeyIsLocked(_))) + ))) + )); + // Nothing happens if the callback is double-called. 
+ (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict()))); + // The tx should be dropped. + rx.recv().unwrap_err(); + } +} diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs new file mode 100644 index 00000000000..a3312a4fdb2 --- /dev/null +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -0,0 +1,977 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This mod contains the [`LockWaitQueues`] for managing waiting and waking up +//! of `AcquirePessimisticLock` requests in lock-contention scenarios, and other +//! related accessories, including: +//! +//! - [`SharedError`]: A wrapper type to [`crate::storage::Error`] to allow the +//! error being shared in multiple places +//! - Related type aliases +//! - [`LockWaitEntry`]: which is used to represent lock-waiting requests in the +//! queue +//! - [`Box`]: The comparable wrapper of [`LockWaitEntry`] which +//! defines the priority ordering among lock-waiting requests +//! +//! Each key may have its own lock-waiting queue, which is a priority queue that +//! orders the entries with the order defined by +//! [`Box`]. +//! +//! There are be two kinds of `AcquirePessimisticLock` requests: +//! +//! * Requests in legacy mode: indicated by `allow_lock_with_conflict = false`. +//! A legacy request is woken up, it should return a `WriteConflict` +//! immediately to the client to tell the client to retry. Then, the remaining +//! lock-waiting entries should be woken up after delaying for +//! `wake-up-delay-duration` which is a configurable value. +//! * Resumable requests: indicated by `allow_lock_with_conflict = true`. This +//! kind of requests are allowed to lock even if there is write conflict, When +//! it's woken up after waiting for another lock, it can then resume execution +//! and try to acquire the lock again. No delayed waking up is necessary. +//! 
**Note that though the `LockWaitQueues` is designed to accept it, this +//! kind of requests are currently not implemented yet.** +//! +//! ## Details about delayed waking up +//! +//! The delayed waking-up is needed after waking up a request in legacy mode. +//! The reasons are: +//! +//! * The head of the queue (let's denote its belonging transaction by `T1`) is +//! woken up after the current lock being released, then the request will +//! return a `WriteConflict` error immediately, and the key is left unlocked. +//! It's possible that `T1` won't lock the key again. However, the other +//! waiting requests need releasing-lock event to be woken up. In this case, +//! we should not let them wait forever until timeout. +//! * When many transactions are blocked on the same key, and a transaction is +//! granted the lock after another releasing the lock, the transaction that's +//! blocking other transactions is changed. After cancelling the other +//! transactions and let them retry the `AcquirePessimisticLock` request, they +//! will be able to re-detect deadlocks with the latest information. +//! +//! To achieve this, after delaying for `wake-up-delay-duration` since the +//! latest waking-up event on the key, a call to +//! [`LockWaitQueues::delayed_notify_all`] will be made. However, since the +//! [`LockWaitQueues`] do not have its own thread pool, the user may receive a +//! future after calling some of the functions, and the user will be responsible +//! for executing the future in a suitable place. 
+ +use std::{ + collections::BinaryHeap, + future::Future, + pin::Pin, + result::Result, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::{Duration, Instant}, +}; + +use dashmap; +use futures_util::compat::Future01CompatExt; +use kvproto::kvrpcpb; +use smallvec::SmallVec; +use sync_wrapper::SyncWrapper; +use tikv_util::{time::InstantExt, timer::GLOBAL_TIMER_HANDLE}; +use txn_types::{Key, TimeStamp}; + +use crate::storage::{ + errors::SharedError, + lock_manager::{lock_wait_context::LockWaitContextSharedState, LockManager, LockWaitToken}, + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, + txn::Error as TxnError, + types::{PessimisticLockParameters, PessimisticLockRes}, + Error as StorageError, +}; + +pub type CallbackWithSharedError = Box) + Send + 'static>; +pub type PessimisticLockKeyCallback = CallbackWithSharedError; + +/// Represents an `AcquirePessimisticLock` request that's waiting for a lock, +/// and contains the request's parameters. +pub struct LockWaitEntry { + pub key: Key, + pub lock_hash: u64, + // TODO: Use term to filter out stale entries in the queue. + // pub term: Option, + pub parameters: PessimisticLockParameters, + pub lock_wait_token: LockWaitToken, + pub req_states: Option>, + pub legacy_wake_up_index: Option, + pub key_cb: Option>, +} + +impl PartialEq for LockWaitEntry { + fn eq(&self, other: &Self) -> bool { + self.parameters.start_ts == other.parameters.start_ts + } +} + +impl Eq for LockWaitEntry {} + +impl PartialOrd for LockWaitEntry { + fn partial_cmp(&self, other: &Self) -> Option { + // Reverse it since the std BinaryHeap is max heap and we want to pop the + // minimal. + other + .parameters + .start_ts + .partial_cmp(&self.parameters.start_ts) + } +} + +impl Ord for LockWaitEntry { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Reverse it since the std BinaryHeap is max heap and we want to pop the + // minimal. 
+ other.parameters.start_ts.cmp(&self.parameters.start_ts) + } +} + +pub struct KeyLockWaitState { + #[allow(dead_code)] + current_lock: kvrpcpb::LockInfo, + + /// The counter of wake up events of legacy pessimistic lock requests + /// (`allow_lock_with_conflict == false`). When an lock wait entry is + /// pushed to the queue, it records the current counter. The purpose + /// is to mark the entries that needs to be woken up after delaying. + /// + /// Here is an example showing how it works (note that requests in + /// the example are all in legacy mode): + /// + /// Let's denote a lock-wait entry by `(start_ts, + /// legacy_wake_up_index)`. Consider there are three requests with + /// start_ts 20, 30, 40 respectively, and they are pushed to the + /// queue when the `KeyLockWaitState::legacy_wake_up_index` is 0. Then the + /// `KeyLockWaitState` is: + /// + /// ```text + /// legacy_wake_up_index: 0, queue: [(20, 0), (30, 0), (40, 0)] + /// ``` + /// + /// Then the lock on the key is released. We pops the first entry in the + /// queue to wake it up, and then schedule a call to + /// [`LockWaitQueues::delayed_notify_all`] after delaying for + /// `wake_up_delay_duration`. The current state becomes: + /// + /// ```text + /// legacy_wake_up_index: 1, queue: [(30, 0), (40, 0)] + /// ```` + /// + /// Here, if some other request arrives, one of them may successfully + /// acquire the lock and others are pushed to the queue. the state + /// becomes: + /// + /// ```text + /// legacy_wake_up_index: 1, queue: [(30, 0), (40, 0), (50, 1), (60, 1)] + /// ``` + /// + /// Then `wake_up_delay_duration` is elapsed since the previous waking up. + /// Here, we expect that the lock wait entries 30 and 40 can be woken + /// up, since they exists when the previous waking up occurs. But we + /// don't want to wake up later-arrived entries (50 and 60) since it + /// introduces useless pessimistic retries to transaction 50 and 60 when + /// they don't need to. 
The solution is, only wake up the entries that + /// has `entry.legacy_wake_up_index < + /// key_lock_wait_state.legacy_wake_up_index`. Therefore, we only wakes up + /// entries 30 and 40 who has `legacy_wake_up_index < 1`, while 50 + /// and 60 will be left untouched. + /// + /// When waking up resumable requests, the mechanism above won't take + /// effect. If a legacy request is woken up and triggered the mechanism, + /// and there is a resumable request in the queue, `delayed_notify_all` + /// will stop at the first resumable request it meets, pop it out, and + /// return it from a [`DelayedNotifyAllFuture`]. See + /// [`LockWaitQueues::pop_for_waking_up`]. + legacy_wake_up_index: usize, + queue: BinaryHeap>, + + /// The start_ts of the most recent waking up event. + last_conflict_start_ts: TimeStamp, + /// The commit_ts of the most recent waking up event. + last_conflict_commit_ts: TimeStamp, + + /// `(id, start_time, delay_duration)` + delayed_notify_all_state: Option<(u64, Instant, Arc)>, +} + +impl KeyLockWaitState { + fn new() -> Self { + Self { + current_lock: kvrpcpb::LockInfo::default(), + legacy_wake_up_index: 0, + queue: BinaryHeap::new(), + last_conflict_start_ts: TimeStamp::zero(), + last_conflict_commit_ts: TimeStamp::zero(), + delayed_notify_all_state: None, + } + } +} + +pub type DelayedNotifyAllFuture = Pin>> + Send>>; + +pub struct LockWaitQueueInner { + queue_map: dashmap::DashMap, + id_allocated: AtomicU64, +} + +#[derive(Clone)] +pub struct LockWaitQueues { + inner: Arc, + #[allow(dead_code)] + lock_mgr: L, +} + +impl LockWaitQueues { + pub fn new(lock_mgr: L) -> Self { + Self { + inner: Arc::new(LockWaitQueueInner { + queue_map: dashmap::DashMap::new(), + id_allocated: AtomicU64::new(1), + }), + lock_mgr, + } + } + + /// Enqueues a lock wait entry. The key is indicated by the `key` field of + /// the `lock_wait_entry`. The caller also needs to provide the + /// information of the current-holding lock. 
+ pub fn push_lock_wait( + &self, + mut lock_wait_entry: Box, + current_lock: kvrpcpb::LockInfo, + ) { + let mut key_state = self + .inner + .queue_map + .entry(lock_wait_entry.key.clone()) + .or_insert_with(|| KeyLockWaitState::new()); + key_state.current_lock = current_lock; + + if lock_wait_entry.legacy_wake_up_index.is_none() { + lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); + } + key_state.value_mut().queue.push(lock_wait_entry); + } + + /// Dequeues the head of the lock waiting queue of the specified key, + /// assuming the popped entry will be woken up. + /// + /// If it's waking up a legacy request and the queue is not empty, a future + /// will be returned and the caller will be responsible for executing it. + /// The future waits until `wake_up_delay_duration` is elapsed since the + /// most recent waking-up, and then wakes up all lock waiting entries that + /// exists at the time when the latest waking-up happens. The future + /// will return a `LockWaitEntry` if a resumable entry is popped out + /// from the queue while executing, and in this case the caller will be + /// responsible to wake it up. + pub fn pop_for_waking_up( + &self, + key: &Key, + conflicting_start_ts: TimeStamp, + conflicting_commit_ts: TimeStamp, + wake_up_delay_duration_ms: u64, + ) -> Option<(Box, Option)> { + self.pop_for_waking_up_impl( + key, + conflicting_start_ts, + conflicting_commit_ts, + Some(wake_up_delay_duration_ms), + ) + } + + fn pop_for_waking_up_impl( + &self, + key: &Key, + conflicting_start_ts: TimeStamp, + conflicting_commit_ts: TimeStamp, + wake_up_delay_duration_ms: Option, + ) -> Option<(Box, Option)> { + let mut result = None; + + // We don't want other threads insert any more entries between finding the + // queue is empty and removing the queue from the map. Wrap the logic + // within a call to `remove_if_mut` to avoid releasing lock during the + // procedure. 
+ self.inner.queue_map.remove_if_mut(key, |_, v| { + v.last_conflict_start_ts = conflicting_start_ts; + v.last_conflict_commit_ts = conflicting_commit_ts; + + while let Some(lock_wait_entry) = v.queue.pop() { + if lock_wait_entry.req_states.as_ref().unwrap().is_finished() { + // Skip already cancelled entries. + continue; + } + + if !lock_wait_entry.parameters.allow_lock_with_conflict { + // If a pessimistic lock request in legacy mode is woken up, increase the + // counter. + v.legacy_wake_up_index += 1; + let notify_all_future = match wake_up_delay_duration_ms { + Some(delay) if !v.queue.is_empty() => { + self.handle_delayed_wake_up(v, key, delay) + } + _ => None, + }; + result = Some((lock_wait_entry, notify_all_future)); + } else { + result = Some((lock_wait_entry, None)); + } + break; + } + + // Remove the queue if it's emptied. + v.queue.is_empty() + }); + + result + } + + /// Schedule delayed waking up on the specified key. + /// + /// Returns a future if it's needed to spawn a new async task to do the + /// delayed waking up. The caller should be responsible for executing + /// it. + fn handle_delayed_wake_up( + &self, + key_lock_wait_state: &mut KeyLockWaitState, + key: &Key, + wake_up_delay_duration_ms: u64, + ) -> Option { + if let Some((_, start_time, delay_duration)) = + &mut key_lock_wait_state.delayed_notify_all_state + { + // There's already an async task spawned for handling delayed waking up on this + // key. Update its state to extend its delaying duration (until now + // + wake_up_delay_duration). + let new_delay_duration = + (start_time.saturating_elapsed().as_millis() as u64) + wake_up_delay_duration_ms; + delay_duration.store(new_delay_duration, Ordering::Release); + None + } else { + // It's needed to spawn a new async task for performing delayed waking up on + // this key. Return a future to let the caller execute it in a + // proper thread pool. 
+ let notify_id = self.allocate_internal_id(); + let start_time = Instant::now(); + let delay_ms = Arc::new(AtomicU64::new(wake_up_delay_duration_ms)); + + key_lock_wait_state.delayed_notify_all_state = + Some((notify_id, start_time, delay_ms.clone())); + Some(Box::pin(self.clone().async_delayed_notify_all( + key.clone(), + start_time, + delay_ms, + notify_id, + ))) + } + } + + fn allocate_internal_id(&self) -> u64 { + self.inner.id_allocated.fetch_add(1, Ordering::SeqCst) + } + + async fn async_delayed_notify_all( + self, + key: Key, + start_time: Instant, + delay_ms: Arc, + notify_id: u64, + ) -> Option> { + let mut prev_delay_ms = 0; + // The delay duration may be extended by later waking-up events, by updating the + // value of `delay_ms`. So we loop until we find that the elapsed + // duration is larger than `delay_ms`. + loop { + let current_delay_ms = delay_ms.load(Ordering::Acquire); + if current_delay_ms == 0 { + // Cancelled. + return None; + } + + if current_delay_ms <= prev_delay_ms + || (start_time.saturating_elapsed().as_millis() as u64) >= current_delay_ms + { + // Timed out. + break; + } + + let deadline = start_time + Duration::from_millis(current_delay_ms); + + GLOBAL_TIMER_HANDLE.delay(deadline).compat().await.unwrap(); + + prev_delay_ms = current_delay_ms; + } + + self.delayed_notify_all(&key, notify_id) + } + + fn delayed_notify_all(&self, key: &Key, notify_id: u64) -> Option> { + let mut popped_lock_wait_entries = SmallVec::<[_; 4]>::new(); + + let mut woken_up_resumable_entry = None; + let mut conflicting_start_ts = TimeStamp::zero(); + let mut conflicting_commit_ts = TimeStamp::zero(); + + // We don't want other threads insert any more entries between finding the + // queue is empty and removing the queue from the map. Wrap the logic + // within a call to `remove_if_mut` to avoid releasing lock during the + // procedure. 
+ self.inner.queue_map.remove_if_mut(key, |_, v| { + // The KeyLockWaitState of the key might have been removed from the map and then + // recreated. Skip. + if v.delayed_notify_all_state + .as_ref() + .map_or(true, |(id, ..)| *id != notify_id) + { + return false; + } + + // Clear the state which indicates the scheduled `delayed_notify_all` has + // finished. + v.delayed_notify_all_state = None; + + conflicting_start_ts = v.last_conflict_start_ts; + conflicting_commit_ts = v.last_conflict_commit_ts; + + let legacy_wake_up_index = v.legacy_wake_up_index; + + while let Some(front) = v.queue.peek() { + if front.req_states.as_ref().unwrap().is_finished() { + // Skip already cancelled entries. + v.queue.pop(); + continue; + } + if front + .legacy_wake_up_index + .map_or(false, |idx| idx >= legacy_wake_up_index) + { + // This entry is added after the legacy-wakeup that issued the current + // delayed_notify_all operation. Keep it and other remaining items in the queue. + break; + } + let lock_wait_entry = v.queue.pop().unwrap(); + if lock_wait_entry.parameters.allow_lock_with_conflict { + woken_up_resumable_entry = Some(lock_wait_entry); + break; + } + popped_lock_wait_entries.push(lock_wait_entry); + } + + // If the queue is empty, remove it from the map. + v.queue.is_empty() + }); + + // Call callbacks to cancel these entries here. + // TODO: Perhaps we'd better make it concurrent with scheduling the new command + // (if `woken_up_resumable_entry` is some) if there are too many. 
+ for lock_wait_entry in popped_lock_wait_entries { + let lock_wait_entry = *lock_wait_entry; + let cb = lock_wait_entry.key_cb.unwrap().into_inner(); + let e = StorageError::from(TxnError::from(MvccError::from( + MvccErrorInner::WriteConflict { + start_ts: lock_wait_entry.parameters.start_ts, + conflict_start_ts: conflicting_start_ts, + conflict_commit_ts: conflicting_commit_ts, + key: lock_wait_entry.key.into_raw().unwrap(), + primary: lock_wait_entry.parameters.primary, + reason: kvrpcpb::WriteConflictReason::PessimisticRetry, + }, + ))); + cb(Err(e.into())); + } + + // Return the item to be woken up in resumable way. + woken_up_resumable_entry + } +} + +#[cfg(test)] +mod tests { + use std::{ + sync::mpsc::{channel, Receiver, RecvTimeoutError}, + time::Duration, + }; + + use super::*; + use crate::storage::{ + lock_manager::{lock_wait_context::LockWaitContext, DummyLockManager, WaitTimeout}, + txn::ErrorInner as TxnErrorInner, + ErrorInner as StorageErrorInner, StorageCallback, + }; + + struct TestLockWaitEntryHandle { + wake_up_rx: Receiver>, + cancel_cb: Box, + } + + impl TestLockWaitEntryHandle { + fn wait_for_result_timeout( + &self, + timeout: Duration, + ) -> Option> { + match self.wake_up_rx.recv_timeout(timeout) { + Ok(res) => Some(res), + Err(RecvTimeoutError::Timeout) => None, + Err(e) => panic!( + "unexpected error when receiving result of a LockWaitEntry: {:?}", + e + ), + } + } + + fn wait_for_result(self) -> Result { + self.wake_up_rx + .recv_timeout(Duration::from_secs(10)) + .unwrap() + } + + fn cancel(self) { + (self.cancel_cb)(); + } + } + + // Additionally add some helper functions to the LockWaitQueues for simplifying + // test code. 
+ impl LockWaitQueues { + fn make_lock_info_pb(&self, key: &[u8], ts: impl Into) -> kvrpcpb::LockInfo { + let ts = ts.into(); + let mut lock_info = kvrpcpb::LockInfo::default(); + lock_info.set_lock_version(ts.into_inner()); + lock_info.set_lock_for_update_ts(ts.into_inner()); + lock_info.set_key(key.to_owned()); + lock_info.set_primary_lock(key.to_owned()); + lock_info + } + + fn make_mock_lock_wait_entry( + &self, + key: &[u8], + start_ts: impl Into, + lock_info_pb: kvrpcpb::LockInfo, + ) -> (Box, TestLockWaitEntryHandle) { + let start_ts = start_ts.into(); + let token = super::super::LockWaitToken(Some(1)); + let dummy_request_cb = StorageCallback::PessimisticLock(Box::new(|_| ())); + let dummy_ctx = LockWaitContext::new( + self.lock_mgr.clone(), + token, + start_ts, + start_ts, + dummy_request_cb, + false, + ); + + let parameters = PessimisticLockParameters { + pb_ctx: Default::default(), + primary: key.to_owned(), + start_ts, + lock_ttl: 1000, + for_update_ts: start_ts, + wait_timeout: Some(WaitTimeout::Default), + return_values: false, + min_commit_ts: 0.into(), + check_existence: false, + is_first_lock: false, + allow_lock_with_conflict: false, + }; + + let key = Key::from_raw(key); + let lock_hash = key.gen_hash(); + let (tx, rx) = channel(); + let lock_wait_entry = Box::new(LockWaitEntry { + key, + lock_hash, + parameters, + lock_wait_token: token, + req_states: Some(dummy_ctx.get_shared_states().clone()), + legacy_wake_up_index: None, + key_cb: Some(SyncWrapper::new(Box::new(move |res| tx.send(res).unwrap()))), + }); + + let cancel_callback = dummy_ctx.get_callback_for_cancellation(); + let cancel = move || { + cancel_callback(StorageError::from(TxnError::from(MvccError::from( + MvccErrorInner::KeyIsLocked(lock_info_pb), + )))) + }; + + ( + lock_wait_entry, + TestLockWaitEntryHandle { + wake_up_rx: rx, + cancel_cb: Box::new(cancel), + }, + ) + } + + fn mock_lock_wait( + &self, + key: &[u8], + start_ts: impl Into, + encountered_lock_ts: impl Into, + 
resumable: bool, + ) -> TestLockWaitEntryHandle { + let lock_info_pb = self.make_lock_info_pb(key, encountered_lock_ts); + let (mut entry, handle) = + self.make_mock_lock_wait_entry(key, start_ts, lock_info_pb.clone()); + entry.parameters.allow_lock_with_conflict = resumable; + self.push_lock_wait(entry, lock_info_pb); + handle + } + + /// Pop an entry from the queue of the specified key, but do not create + /// the future for delayed wake up. Used in tests that do not + /// care about the delayed wake up. + fn must_pop( + &self, + key: &[u8], + conflicting_start_ts: impl Into, + conflicting_commit_ts: impl Into, + ) -> Box { + let (entry, f) = self + .pop_for_waking_up_impl( + &Key::from_raw(key), + conflicting_start_ts.into(), + conflicting_commit_ts.into(), + None, + ) + .unwrap(); + assert!(f.is_none()); + entry + } + + fn must_pop_none( + &self, + key: &[u8], + conflicting_start_ts: impl Into, + conflicting_commit_ts: impl Into, + ) { + let res = self.pop_for_waking_up_impl( + &Key::from_raw(key), + conflicting_start_ts.into(), + conflicting_commit_ts.into(), + Some(1), + ); + assert!(res.is_none()); + } + + fn must_pop_with_delayed_notify( + &self, + key: &[u8], + conflicting_start_ts: impl Into, + conflicting_commit_ts: impl Into, + ) -> (Box, DelayedNotifyAllFuture) { + let (res, f) = self + .pop_for_waking_up_impl( + &Key::from_raw(key), + conflicting_start_ts.into(), + conflicting_commit_ts.into(), + Some(50), + ) + .unwrap(); + (res, f.unwrap()) + } + + fn must_pop_with_no_delayed_notify( + &self, + key: &[u8], + conflicting_start_ts: impl Into, + conflicting_commit_ts: impl Into, + ) -> Box { + let (res, f) = self + .pop_for_waking_up_impl( + &Key::from_raw(key), + conflicting_start_ts.into(), + conflicting_commit_ts.into(), + Some(50), + ) + .unwrap(); + assert!(f.is_none()); + res + } + + fn must_not_contain_key(&self, key: &[u8]) { + assert!(self.inner.queue_map.get(&Key::from_raw(key)).is_none()); + } + + fn must_have_next_entry(&self, key: &[u8], 
start_ts: impl Into) { + assert_eq!( + self.inner + .queue_map + .get(&Key::from_raw(key)) + .unwrap() + .queue + .peek() + .unwrap() + .parameters + .start_ts, + start_ts.into() + ); + } + + fn get_delayed_notify_id(&self, key: &[u8]) -> Option { + self.inner + .queue_map + .get(&Key::from_raw(key)) + .unwrap() + .delayed_notify_all_state + .as_ref() + .map(|(id, ..)| *id) + } + } + + impl LockWaitEntry { + fn check_key(&self, expected_key: &[u8]) -> &Self { + assert_eq!(self.key, Key::from_raw(expected_key)); + self + } + + fn check_start_ts(&self, expected_start_ts: impl Into) -> &Self { + assert_eq!(self.parameters.start_ts, expected_start_ts.into()); + self + } + } + + fn expect_write_conflict( + err: &StorageErrorInner, + expect_conflict_start_ts: impl Into, + expect_conflict_commit_ts: impl Into, + ) { + match err { + StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + box MvccErrorInner::WriteConflict { + conflict_start_ts, + conflict_commit_ts, + .. + }, + )))) => { + assert_eq!(*conflict_start_ts, expect_conflict_start_ts.into()); + assert_eq!(*conflict_commit_ts, expect_conflict_commit_ts.into()); + } + e => panic!("unexpected error: {:?}", e), + } + } + + #[test] + fn test_simple_push_pop() { + let queues = LockWaitQueues::new(DummyLockManager {}); + + queues.mock_lock_wait(b"k1", 10, 5, false); + queues.mock_lock_wait(b"k2", 11, 5, false); + + queues + .must_pop(b"k1", 5, 6) + .check_key(b"k1") + .check_start_ts(10); + queues.must_pop_none(b"k1", 5, 6); + queues.must_not_contain_key(b"k1"); + + queues + .must_pop(b"k2", 5, 6) + .check_key(b"k2") + .check_start_ts(11); + queues.must_pop_none(b"k2", 5, 6); + queues.must_not_contain_key(b"k2"); + } + + #[test] + fn test_popping_priority() { + let queues = LockWaitQueues::new(DummyLockManager {}); + + queues.mock_lock_wait(b"k1", 10, 5, false); + queues.mock_lock_wait(b"k1", 20, 5, false); + queues.mock_lock_wait(b"k1", 12, 5, false); + queues.mock_lock_wait(b"k1", 13, 5, false); + // 
Duplication is possible considering network issues and RPC retrying. + queues.mock_lock_wait(b"k1", 12, 5, false); + + // Ordered by start_ts + for &expected_start_ts in &[10u64, 12, 12, 13, 20] { + queues + .must_pop(b"k1", 5, 6) + .check_key(b"k1") + .check_start_ts(expected_start_ts); + } + + queues.must_not_contain_key(b"k1"); + } + + #[test] + fn test_dropping_cancelled_entries() { + let queues = LockWaitQueues::new(DummyLockManager {}); + + let h10 = queues.mock_lock_wait(b"k1", 10, 5, false); + let h11 = queues.mock_lock_wait(b"k1", 11, 5, false); + queues.mock_lock_wait(b"k1", 12, 5, false); + let h13 = queues.mock_lock_wait(b"k1", 13, 5, false); + queues.mock_lock_wait(b"k1", 14, 5, false); + + h10.cancel(); + h11.cancel(); + h13.cancel(); + + for &expected_start_ts in &[12u64, 14] { + queues + .must_pop(b"k1", 5, 6) + .check_start_ts(expected_start_ts); + } + queues.must_not_contain_key(b"k1"); + } + + #[tokio::test] + async fn test_delayed_notify_all() { + let queues = LockWaitQueues::new(DummyLockManager {}); + + queues.mock_lock_wait(b"k1", 8, 5, false); + + let handles1 = vec![ + queues.mock_lock_wait(b"k1", 11, 5, false), + queues.mock_lock_wait(b"k1", 12, 5, false), + queues.mock_lock_wait(b"k1", 13, 5, false), + ]; + + // Current queue: [8, 11, 12, 13] + + let (entry, delay_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 5, 6); + entry.check_key(b"k1").check_start_ts(8); + + // Current queue: [11*, 12*, 13*] (Items marked with * means it has + // legacy_wake_up_index less than that in KeyLockWaitState, so it might + // be woken up when calling delayed_notify_all). + + let handles2 = vec![ + queues.mock_lock_wait(b"k1", 14, 5, false), + queues.mock_lock_wait(b"k1", 15, 5, true), + queues.mock_lock_wait(b"k1", 16, 5, false), + ]; + + // Current queue: [11*, 12*, 13*, 14, 15, 16] + + assert!( + handles1[0] + .wait_for_result_timeout(Duration::from_millis(100)) + .is_none() + ); + + // Wakes up transaction 11 to 13, and cancels them. 
+ assert!(delay_wake_up_future.await.is_none()); + assert!(queues.get_delayed_notify_id(b"k1").is_none()); + handles1 + .into_iter() + .for_each(|h| expect_write_conflict(&h.wait_for_result().unwrap_err().0, 5, 6)); + // 14 is not woken up. + assert!( + handles2[0] + .wait_for_result_timeout(Duration::from_millis(100)) + .is_none() + ); + + // Current queue: [14, 15, 16] + + queues.mock_lock_wait(b"k1", 9, 5, false); + // Current queue: [9, 14, 15, 16] + + // 9 will be woken up and delayed wake up should be scheduled. After delaying, + // 14 to 16 should be all woken up later if they are all not resumable. + // However since 15 is resumable, it will only wake up 14 and return 15 + // through the result of the `delay_wake_up_future`. + let (entry, delay_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 7, 8); + entry.check_key(b"k1").check_start_ts(9); + + // Current queue: [14*, 15*, 16*] + + queues.mock_lock_wait(b"k1", 17, 5, false); + let handle18 = queues.mock_lock_wait(b"k1", 18, 5, false); + + // Current queue: [14*, 15*, 16*, 17, 18] + + // Wakes up 14, and stops at 15 which is resumable. Then, 15 should be returned + // and the caller should be responsible for waking it up. + let entry15 = delay_wake_up_future.await.unwrap(); + entry15.check_key(b"k1").check_start_ts(15); + + // Current queue: [16*, 17, 18] + + let mut it = handles2.into_iter(); + // Receive 14. + expect_write_conflict(&it.next().unwrap().wait_for_result().unwrap_err().0, 7, 8); + // 15 is not woken up. + assert!( + it.next() + .unwrap() + .wait_for_result_timeout(Duration::from_millis(100)) + .is_none() + ); + // Neither did 16. + let handle16 = it.next().unwrap(); + assert!( + handle16 + .wait_for_result_timeout(Duration::from_millis(100)) + .is_none() + ); + + queues.must_have_next_entry(b"k1", 16); + + // Call delayed_notify_all when the key does not have + // `delayed_notify_all_state`. This case may happen when the key is + // removed and recreated in the map. 
Nothing would happen. + assert!(queues.get_delayed_notify_id(b"k1").is_none()); + assert!( + queues + .delayed_notify_all(&Key::from_raw(b"k1"), 1) + .is_none() + ); + queues.must_have_next_entry(b"k1", 16); + assert!( + handle16 + .wait_for_result_timeout(Duration::from_millis(100)) + .is_none() + ); + + // Current queue: [16*, 17, 18] + + let (entry, delayed_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 7, 8); + entry.check_key(b"k1").check_start_ts(16); + queues.must_have_next_entry(b"k1", 17); + let notify_id = queues.get_delayed_notify_id(b"k1").unwrap(); + // Call `delayed_notify_all` with a different ID. Nothing happens. + assert!( + queues + .delayed_notify_all(&Key::from_raw(b"k1"), notify_id - 1) + .is_none() + ); + queues.must_have_next_entry(b"k1", 17); + + // Current queue: [17*, 18*] + + // Don't need to create new future if there already exists one for the key. + let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); + entry.check_key(b"k1").check_start_ts(17); + queues.must_have_next_entry(b"k1", 18); + + // Current queue: [18*] + + queues.mock_lock_wait(b"k1", 19, 5, false); + // Current queue: [18*, 19] + assert!(delayed_wake_up_future.await.is_none()); + // 18 will be cancelled with ts of the latest wake-up event. + expect_write_conflict(&handle18.wait_for_result().unwrap_err().0, 9, 10); + // Current queue: [19] + + // Don't need to create new future if the queue is cleared. + let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); + entry.check_key(b"k1").check_start_ts(19); + // Current queue: empty + queues.must_not_contain_key(b"k1"); + + // Calls delayed_notify_all on keys that not exists (maybe deleted due to + // completely waking up). Nothing would happen. 
+ assert!( + queues + .delayed_notify_all(&Key::from_raw(b"k1"), 1) + .is_none() + ); + queues.must_not_contain_key(b"k1"); + } +} diff --git a/src/storage/lock_manager.rs b/src/storage/lock_manager/mod.rs similarity index 90% rename from src/storage/lock_manager.rs rename to src/storage/lock_manager/mod.rs index 79a9d0572f3..235a31c3710 100644 --- a/src/storage/lock_manager.rs +++ b/src/storage/lock_manager/mod.rs @@ -10,6 +10,9 @@ use crate::{ storage::{txn::ProcessResult, types::StorageCallback}, }; +pub mod lock_wait_context; +pub mod lock_waiting_queue; + #[derive(Clone, Copy, PartialEq, Debug, Default)] pub struct Lock { pub ts: TimeStamp, @@ -64,6 +67,18 @@ impl From for WaitTimeout { } } +/// Uniquely identifies a lock-waiting request in a `LockManager`. +/// +/// Not used yet, but necessary for implementing `LockWaitQueues`. +#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)] +pub struct LockWaitToken(pub Option); + +impl LockWaitToken { + pub fn is_valid(&self) -> bool { + self.0.is_some() + } +} + /// `LockManager` manages transactions waiting for locks held by other /// transactions. It has responsibility to handle deadlocks between /// transactions. 
diff --git a/src/storage/types.rs b/src/storage/types.rs index 70cd7d2d991..c8303787a41 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -8,6 +8,7 @@ use kvproto::kvrpcpb; use txn_types::{Key, Value}; use crate::storage::{ + lock_manager::WaitTimeout, mvcc::{Lock, LockType, TimeStamp, Write, WriteType}, txn::ProcessResult, Callback, Result, @@ -121,6 +122,31 @@ pub struct PrewriteResult { pub one_pc_commit_ts: TimeStamp, } +#[cfg_attr(test, derive(Default))] +pub struct PessimisticLockParameters { + pub pb_ctx: kvrpcpb::Context, + pub primary: Vec, + pub start_ts: TimeStamp, + pub lock_ttl: u64, + pub for_update_ts: TimeStamp, + pub wait_timeout: Option, + pub return_values: bool, + pub min_commit_ts: TimeStamp, + pub check_existence: bool, + pub is_first_lock: bool, + + /// Whether it's allowed for an pessimistic lock request to acquire the lock + /// even there is write conflict (i.e. the latest version's `commit_ts` is + /// greater than the current request's `for_update_ts`. + /// + /// When this is true, it's also inferred that the request is resumable, + /// which means, if such a request encounters a lock of another + /// transaction and it waits for the lock, it can resume executing and + /// continue trying to acquire the lock when it's woken up. 
Also see: + /// [`super::lock_manager::lock_waiting_queue`] + pub allow_lock_with_conflict: bool, +} + #[derive(Clone, Debug, PartialEq)] pub enum PessimisticLockRes { /// The previous value is loaded while handling the `AcquirePessimisticLock` From c48b0cfa8630d949a6e32a14a74fad7fdbc78c02 Mon Sep 17 00:00:00 2001 From: lijie Date: Thu, 29 Sep 2022 18:59:44 +0800 Subject: [PATCH 252/676] bump master version to v6.4.0-alpha (#13499) close tikv/tikv#13563 bump master version to v6.4.0-alpha Signed-off-by: lijie Co-authored-by: Yilin Chen --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 821e15edc18..25897f0a8a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6144,7 +6144,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.2.0-alpha" +version = "6.4.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index f51e2ddd303..545ee9380a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.2.0-alpha" +version = "6.4.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 1f0b9bf70f2b91f85deb4a298ee74d06d1da0e9d Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 29 Sep 2022 19:49:45 -0700 Subject: [PATCH 253/676] raftstore-v2: add basic apply (#13495) ref tikv/tikv#12842 This PR adds the implementation of apply for v2. In v2, we don't need to batch writes across regions, so there is no need to use batch system. Instead, future pool is used to simplify implementations. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/batch-system/src/lib.rs | 2 +- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/batch/apply.rs | 184 --------- components/raftstore-v2/src/batch/mod.rs | 2 - components/raftstore-v2/src/batch/store.rs | 32 +- components/raftstore-v2/src/fsm/apply.rs | 130 ++++--- components/raftstore-v2/src/fsm/mod.rs | 2 +- components/raftstore-v2/src/fsm/peer.rs | 7 +- .../raftstore-v2/src/operation/command/mod.rs | 249 +++++++++++- .../src/operation/command/write/mod.rs | 81 +++- components/raftstore-v2/src/operation/mod.rs | 2 +- .../raftstore-v2/src/operation/query/mod.rs | 15 +- components/raftstore-v2/src/raft/apply.rs | 101 ++++- components/raftstore-v2/src/raft/peer.rs | 25 +- .../src/router/internal_message.rs | 15 +- .../raftstore-v2/tests/failpoints/mod.rs | 4 + .../tests/failpoints/test_basic_write.rs | 105 ++++++ .../tests/integrations/cluster.rs | 356 ++++++++++++++++++ .../raftstore-v2/tests/integrations/mod.rs | 330 +--------------- .../tests/integrations/test_basic_write.rs | 63 +++- .../tests/integrations/test_life.rs | 7 +- .../tests/integrations/test_read.rs | 12 +- .../tests/integrations/test_status.rs | 6 +- components/raftstore/src/store/fsm/apply.rs | 9 +- 25 files changed, 1086 insertions(+), 655 deletions(-) delete mode 100644 components/raftstore-v2/src/batch/apply.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_basic_write.rs create mode 100644 components/raftstore-v2/tests/integrations/cluster.rs diff --git a/Cargo.lock b/Cargo.lock index 25897f0a8a8..2091ea3d4f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4247,6 +4247,7 @@ dependencies = [ "engine_traits", "error_code", "fail", + "file_system", "futures 0.3.15", "keys", "kvproto", diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index 9ca2953972d..9a307a534ac 100644 --- a/components/batch-system/src/lib.rs +++ 
b/components/batch-system/src/lib.rs @@ -16,7 +16,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, Priority}, + fsm::{Fsm, FsmScheduler, Priority}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index c7d920e4011..09fa707c408 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -35,6 +35,7 @@ crossbeam = "0.8" engine_traits = { path = "../engine_traits" } error_code = { path = "../error_code" } fail = "0.5" +file_system = { path = "../file_system" } futures = { version = "0.3", features = ["compat"] } keys = { path = "../keys", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } diff --git a/components/raftstore-v2/src/batch/apply.rs b/components/raftstore-v2/src/batch/apply.rs deleted file mode 100644 index ebc7696aa64..00000000000 --- a/components/raftstore-v2/src/batch/apply.rs +++ /dev/null @@ -1,184 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -//! This module contains all structs related to apply batch system. -//! -//! After being started, each thread will have its own `ApplyPoller` and poll -//! using `ApplyContext`. For more information, see the documentation of -//! batch-system. 
- -use std::{ - ops::{Deref, DerefMut}, - sync::Arc, -}; - -use batch_system::{ - BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, -}; -use engine_traits::{KvEngine, RaftEngine}; -use raftstore::store::{ - fsm::{ - apply::{ControlFsm, ControlMsg}, - ApplyNotifier, - }, - util::LatencyInspector, - Config, -}; -use slog::Logger; -use tikv_util::config::{Tracker, VersionTrack}; - -use crate::{ - fsm::{ApplyFsm, ApplyFsmDelegate}, - raft::{Apply, Peer}, - router::ApplyTask, -}; - -pub struct ApplyContext { - cfg: Config, -} - -impl ApplyContext { - pub fn new(cfg: Config) -> Self { - ApplyContext { cfg } - } -} - -pub struct ApplyPoller { - apply_task_buf: Vec, - pending_latency_inspect: Vec, - apply_ctx: ApplyContext, - cfg_tracker: Tracker, -} - -impl ApplyPoller { - pub fn new(apply_ctx: ApplyContext, cfg_tracker: Tracker) -> ApplyPoller { - Self { - apply_task_buf: Vec::new(), - pending_latency_inspect: Vec::new(), - apply_ctx, - cfg_tracker, - } - } - - /// Updates the internal buffer to match the latest configuration. 
- fn apply_buf_capacity(&mut self) { - let new_cap = self.messages_per_tick(); - tikv_util::set_vec_capacity(&mut self.apply_task_buf, new_cap); - } - - #[inline] - fn messages_per_tick(&self) -> usize { - self.apply_ctx.cfg.messages_per_tick - } -} - -impl PollHandler, ControlFsm> for ApplyPoller -where - EK: KvEngine, -{ - fn begin(&mut self, _batch_size: usize, update_cfg: F) - where - for<'a> F: FnOnce(&'a batch_system::Config), - { - let cfg = self.cfg_tracker.any_new().map(|c| c.clone()); - if let Some(cfg) = cfg { - let last_messages_per_tick = self.messages_per_tick(); - self.apply_ctx.cfg = cfg; - if self.apply_ctx.cfg.messages_per_tick != last_messages_per_tick { - self.apply_buf_capacity(); - } - update_cfg(&self.apply_ctx.cfg.apply_batch_system); - } - } - - fn handle_control(&mut self, control: &mut ControlFsm) -> Option { - control.handle_messages(&mut self.pending_latency_inspect); - for inspector in self.pending_latency_inspect.drain(..) { - // TODO: support apply duration. 
- inspector.finish(); - } - Some(0) - } - - fn handle_normal( - &mut self, - normal: &mut impl DerefMut>, - ) -> batch_system::HandleResult { - let received_cnt = normal.recv(&mut self.apply_task_buf); - let handle_result = if received_cnt == self.messages_per_tick() { - HandleResult::KeepProcessing - } else { - HandleResult::stop_at(0, false) - }; - let mut delegate = ApplyFsmDelegate::new(normal, &mut self.apply_ctx); - delegate.handle_msgs(&mut self.apply_task_buf); - handle_result - } - - fn end(&mut self, batch: &mut [Option>>]) { - // TODO: support memory trace - } -} - -pub struct ApplyPollerBuilder { - cfg: Arc>, -} - -impl ApplyPollerBuilder { - pub fn new(cfg: Arc>) -> Self { - Self { cfg } - } -} - -impl HandlerBuilder, ControlFsm> for ApplyPollerBuilder { - type Handler = ApplyPoller; - - fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { - let apply_ctx = ApplyContext::new(self.cfg.value().clone()); - let cfg_tracker = self.cfg.clone().tracker("apply".to_string()); - ApplyPoller::new(apply_ctx, cfg_tracker) - } -} - -/// Batch system for applying logs pipeline. 
-pub struct ApplySystem { - system: BatchSystem, ControlFsm>, -} - -impl Deref for ApplySystem { - type Target = BatchSystem, ControlFsm>; - - fn deref(&self) -> &BatchSystem, ControlFsm> { - &self.system - } -} - -impl DerefMut for ApplySystem { - fn deref_mut(&mut self) -> &mut BatchSystem, ControlFsm> { - &mut self.system - } -} - -impl ApplySystem { - pub fn schedule_all<'a, ER: RaftEngine>(&self, peers: impl Iterator>) { - let mut mailboxes = Vec::with_capacity(peers.size_hint().0); - for peer in peers { - let apply = Apply::new(peer); - let (tx, fsm) = ApplyFsm::new(apply); - mailboxes.push(( - peer.region_id(), - BasicMailbox::new(tx, fsm, self.router().state_cnt().clone()), - )); - } - self.router().register_all(mailboxes); - } -} - -pub type ApplyRouter = BatchRouter, ControlFsm>; - -pub fn create_apply_batch_system(cfg: &Config) -> (ApplyRouter, ApplySystem) { - let (control_tx, control_fsm) = ControlFsm::new(); - let (router, system) = - batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); - let system = ApplySystem { system }; - (router, system) -} diff --git a/components/raftstore-v2/src/batch/mod.rs b/components/raftstore-v2/src/batch/mod.rs index 7e00932d1e1..7daeebaa8f0 100644 --- a/components/raftstore-v2/src/batch/mod.rs +++ b/components/raftstore-v2/src/batch/mod.rs @@ -5,8 +5,6 @@ //! StoreSystem is used for polling raft state machines, ApplySystem is used for //! applying logs. 
-mod apply; mod store; -pub(crate) use apply::ApplyContext; pub use store::{create_store_batch_system, StoreContext, StoreRouter, StoreSystem}; diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1bb17ff2c85..bd777477bf0 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -14,6 +14,7 @@ use batch_system::{ use collections::HashMap; use crossbeam::channel::{Sender, TrySendError}; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; +use file_system::{set_io_type, IoType}; use futures::{compat::Future01CompatExt, FutureExt}; use kvproto::{ metapb::Store, @@ -30,14 +31,15 @@ use tikv_util::{ config::{Tracker, VersionTrack}, defer, future::poll_future_notify, + sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, worker::{Scheduler, Worker}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, }; use time::Timespec; -use super::apply::{create_apply_batch_system, ApplyPollerBuilder, ApplyRouter, ApplySystem}; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::{Peer, Storage}, @@ -67,6 +69,7 @@ pub struct StoreContext { pub store_meta: Arc>>, pub engine: ER, pub tablet_factory: Arc>, + pub apply_pool: FuturePool, pub log_fetch_scheduler: Scheduler, } @@ -216,6 +219,7 @@ struct StorePollerBuilder { router: StoreRouter, log_fetch_scheduler: Scheduler, write_senders: WriteSenders, + apply_pool: FuturePool, logger: Logger, store_meta: Arc>>, } @@ -233,6 +237,16 @@ impl StorePollerBuilder { logger: Logger, store_meta: Arc>>, ) -> Self { + let pool_size = cfg.value().apply_batch_system.pool_size; + let max_pool_size = std::cmp::max( + pool_size, + std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), + ); + let apply_pool = YatpPoolBuilder::new(DefaultTicker::default()) + .thread_count(1, pool_size, max_pool_size) + .after_start(move || set_io_type(IoType::ForegroundWrite)) 
+ .name_prefix("apply") + .build_future_pool(); StorePollerBuilder { cfg, store_id, @@ -241,6 +255,7 @@ impl StorePollerBuilder { trans, router, log_fetch_scheduler, + apply_pool, logger, write_senders: store_writers.senders(), store_meta, @@ -310,6 +325,7 @@ where store_meta: self.store_meta.clone(), engine: self.engine.clone(), tablet_factory: self.tablet_factory.clone(), + apply_pool: self.apply_pool.clone(), log_fetch_scheduler: self.log_fetch_scheduler.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); @@ -337,8 +353,6 @@ impl Default for Workers { /// The system used for polling Raft activities. pub struct StoreSystem { system: BatchSystem, StoreFsm>, - apply_router: ApplyRouter, - apply_system: ApplySystem, workers: Option>, logger: Logger, } @@ -380,8 +394,6 @@ impl StoreSystem { ); self.workers = Some(workers); let peers = builder.init()?; - self.apply_system - .schedule_all(peers.values().map(|pair| pair.1.peer())); // Choose a different name so we know what version is actually used. rs stands // for raft store. 
let tag = format!("rs-{}", store_id); @@ -403,10 +415,6 @@ impl StoreSystem { router.force_send(addr, PeerMsg::Start).unwrap(); } router.send_control(StoreMsg::Start).unwrap(); - - let apply_poller_builder = ApplyPollerBuilder::new(cfg); - self.apply_system - .spawn("apply".to_owned(), apply_poller_builder); Ok(()) } @@ -416,7 +424,8 @@ impl StoreSystem { } let mut workers = self.workers.take().unwrap(); - self.apply_system.shutdown(); + // TODO: gracefully shutdown future pool + self.system.shutdown(); workers.store_writers.shutdown(); @@ -493,11 +502,8 @@ where let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); let (router, system) = batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); - let (apply_router, apply_system) = create_apply_batch_system(cfg); let system = StoreSystem { system, - apply_router, - apply_system, workers: None, logger: logger.clone(), }; diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 21646be4738..b37d0b33518 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -1,74 +1,102 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use batch_system::Fsm; +use std::{ + pin::Pin, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, + task::{Context, Poll}, +}; + +use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; use engine_traits::KvEngine; -use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; +use futures::{Future, StreamExt}; +use kvproto::raft_serverpb::RegionLocalState; +use slog::Logger; +use tikv_util::mpsc::future::{self, Receiver, Sender, WakePolicy}; -use crate::{batch::ApplyContext, raft::Apply, router::ApplyTask}; +use crate::{ + raft::Apply, + router::{ApplyRes, ApplyTask, PeerMsg}, + tablet::CachedTablet, +}; -pub struct ApplyFsm { - apply: Apply, - receiver: Receiver, - is_stopped: bool, +/// A trait for reporting apply result. +/// +/// Using a trait to make signiture simpler. +pub trait ApplyResReporter { + fn report(&self, apply_res: ApplyRes); } -impl ApplyFsm { - pub fn new(apply: Apply) -> (LooseBoundedSender, Box) { - let (tx, rx) = mpsc::loose_bounded(usize::MAX); - ( - tx, - Box::new(Self { - apply, - receiver: rx, - is_stopped: false, - }), - ) - } - - /// Fetches messages to `apply_task_buf`. It will stop when the buffer - /// capacity is reached or there is no more pending messages. - /// - /// Returns how many messages are fetched. - pub fn recv(&mut self, apply_task_buf: &mut Vec) -> usize { - let l = apply_task_buf.len(); - for i in l..apply_task_buf.capacity() { - match self.receiver.try_recv() { - Ok(msg) => apply_task_buf.push(msg), - Err(e) => { - if let TryRecvError::Disconnected = e { - self.is_stopped = true; - } - return i - l; - } - } - } - apply_task_buf.capacity() - l +impl, S: FsmScheduler> ApplyResReporter for Mailbox { + fn report(&self, apply_res: ApplyRes) { + // TODO: check shutdown. + self.force_send(PeerMsg::ApplyRes(apply_res)).unwrap(); } } -impl Fsm for ApplyFsm { - type Message = ApplyTask; +/// Schedule task to `ApplyFsm`. 
+pub struct ApplyScheduler { + sender: Sender, +} +impl ApplyScheduler { #[inline] - fn is_stopped(&self) -> bool { - self.is_stopped + pub fn send(&self, task: ApplyTask) { + // TODO: ignore error when shutting down. + self.sender.send(task).unwrap(); } } -pub struct ApplyFsmDelegate<'a, EK: KvEngine> { - fsm: &'a mut ApplyFsm, - apply_ctx: &'a mut ApplyContext, +pub struct ApplyFsm { + apply: Apply, + receiver: Receiver, } -impl<'a, EK: KvEngine> ApplyFsmDelegate<'a, EK> { - pub fn new(fsm: &'a mut ApplyFsm, apply_ctx: &'a mut ApplyContext) -> Self { - Self { fsm, apply_ctx } +impl ApplyFsm { + pub fn new( + region_state: RegionLocalState, + res_reporter: R, + remote_tablet: CachedTablet, + logger: Logger, + ) -> (ApplyScheduler, Self) { + let (tx, rx) = future::unbounded(WakePolicy::Immediately); + let apply = Apply::new(region_state, res_reporter, remote_tablet, logger); + ( + ApplyScheduler { sender: tx }, + Self { + apply, + receiver: rx, + }, + ) } +} - pub fn handle_msgs(&self, apply_task_buf: &mut Vec) { - for task in apply_task_buf.drain(..) { - // TODO: handle the tasks. +impl ApplyFsm { + pub async fn handle_all_tasks(&mut self) { + loop { + let mut task = match self.receiver.next().await { + Some(t) => t, + None => return, + }; + loop { + match task { + // TODO: flush by buffer size. + ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, + } + + // TODO: yield after some time. + + // Perhaps spin sometime? 
+ match self.receiver.try_recv() { + Ok(t) => task = t, + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return, + } + } + self.apply.flush(); } } } diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 191f629900a..b3d0e0483ba 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -9,6 +9,6 @@ mod apply; mod peer; mod store; -pub use apply::{ApplyFsm, ApplyFsmDelegate}; +pub use apply::{ApplyFsm, ApplyResReporter, ApplyScheduler}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; pub use store::{Store, StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 8443ef265a8..cd8775359fc 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -14,8 +14,10 @@ use tikv_util::{ is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver, Sender}, time::{duration_to_sec, Instant}, + yatp_pool::FuturePool, }; +use super::ApplyFsm; use crate::{ batch::StoreContext, raft::{Peer, Storage}, @@ -176,6 +178,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn on_start(&mut self) { self.schedule_tick(PeerTick::Raft); + if self.fsm.peer.storage().is_initialized() { + self.fsm.peer.schedule_apply_fsm(self.store_ctx); + } } #[inline] @@ -215,7 +220,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.on_command(cmd.request, cmd.ch) } PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes(res) => unimplemented!(), + PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(res), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index fa3c89dce74..bef599d5239 100644 --- 
a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -4,17 +4,39 @@ //! all replicas and executed in the same order. Typical commands include: //! - normal writes like put, delete, etc. //! - admin commands like split, compact, etc. +//! +//! General proceessing is: +//! - Propose a command to the leader via PeerMsg::Command, +//! - The leader batch up commands and replicates them to followers, +//! - Once they are replicated to majority, leader considers it committed and +//! send to another thread for execution via +//! `schedule_apply_committed_entries`, +//! - The apply thread executes the commands in buffer, and write to LSM tree +//! via `flush`, +//! - Applied result are sent back to peer fsm, and update memory state in +//! `on_apply_res`. use std::cmp; -use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest}; +use batch_system::{Fsm, FsmScheduler, Mailbox}; +use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; +use kvproto::{ + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader}, + raft_serverpb::RegionLocalState, +}; use protobuf::Message; use raft::eraftpb::Entry; use raftstore::{ store::{ - fsm::Proposal, local_metrics::RaftMetrics, metrics::*, msg::ErrorCallback, util, - WriteCallback, + cmd_resp, + fsm::{ + apply::{APPLY_WB_SHRINK_SIZE, DEFAULT_APPLY_WB_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, + Proposal, + }, + local_metrics::RaftMetrics, + metrics::*, + msg::ErrorCallback, + util, WriteCallback, }, Error, Result, }; @@ -23,15 +45,33 @@ use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, - fsm::{PeerFsm, PeerFsmDelegate}, - raft::Peer, - router::CmdResChannel, + fsm::{ApplyFsm, ApplyResReporter, PeerFsmDelegate}, + raft::{Apply, Peer}, + router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, }; mod write; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; +use 
self::write::SimpleWrite; + +#[derive(Debug)] +pub struct CommittedEntries { + /// Entries need to be applied. Note some entries may not be included for + /// flow control. + entry_and_proposals: Vec<(Entry, Vec)>, +} + +fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { + let mut resp = RaftCmdResponse::default(); + if !header.get_uuid().is_empty() { + let uuid = header.get_uuid().to_vec(); + resp.mut_header().set_uuid(uuid); + } + resp +} + impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { #[inline] pub fn on_command(&mut self, req: RaftCmdRequest, ch: CmdResChannel) { @@ -48,6 +88,25 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { } impl Peer { + /// Schedule an apply fsm to apply logs in the background. + /// + /// Everytime a snapshot is applied or peer is just started, it will + /// schedule a new apply fsm. The old fsm will stopped automatically + /// when the old apply scheduler is dropped. + #[inline] + pub fn schedule_apply_fsm(&mut self, store_ctx: &mut StoreContext) { + let region_state = self.storage().region_state().clone(); + let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + let tablet = self.tablet().clone(); + let logger = self.logger.clone(); + let (apply_scheduler, mut apply_fsm) = ApplyFsm::new(region_state, mailbox, tablet, logger); + store_ctx + .apply_pool + .spawn(async move { apply_fsm.handle_all_tasks().await }) + .unwrap(); + self.set_apply_scheduler(apply_scheduler); + } + #[inline] fn validate_command(&self, req: &RaftCmdRequest, metrics: &mut RaftMetrics) -> Result<()> { if let Err(e) = util::check_store_id(req, self.peer().get_store_id()) { @@ -147,19 +206,171 @@ impl Peer { // on_committed callback if necessary. 
p.cb.notify_committed(); } - entry_and_proposals.push((e, proposal)); + entry_and_proposals.push((e, proposal.map_or_else(Vec::new, |p| p.cb))); } } else { - entry_and_proposals = committed_entries.into_iter().map(|e| (e, None)).collect(); - } - // Note that the `commit_index` and `commit_term` here may be used to - // forward the commit index after being restarted. So it must be less - // than or equal to persisted index. - let commit_index = cmp::min( - self.raft_group().raft.raft_log.committed, - self.raft_group().raft.raft_log.persisted, - ); - let commit_term = self.raft_group().raft.raft_log.term(commit_index).unwrap(); - // TODO: schedule apply task + entry_and_proposals = committed_entries.into_iter().map(|e| (e, vec![])).collect(); + } + // Unlike v1, v2 doesn't need to persist commit index and commit term. The + // point of persist commit index/term of raft apply state is to recover commit + // index when the writes to raft engine is lost but writes to kv engine is + // persisted. But in v2, writes to raft engine must be persisted before + // memtables in kv engine is flushed. + let apply = CommittedEntries { + entry_and_proposals, + }; + self.apply_scheduler() + .send(ApplyTask::CommittedEntries(apply)); + } + + pub fn on_apply_res(&mut self, apply_res: ApplyRes) { + if !self.serving() { + return; + } + // It must just applied a snapshot. + if apply_res.applied_index < self.entry_storage().first_index() { + // TODO: handle admin side effects like split/merge. + return; + } + self.raft_group_mut() + .advance_apply_to(apply_res.applied_index); + let is_leader = self.is_leader(); + let entry_storage = self.entry_storage_mut(); + entry_storage + .apply_state_mut() + .set_applied_index(apply_res.applied_index); + entry_storage.set_applied_term(apply_res.applied_term); + if !is_leader { + entry_storage.compact_entry_cache(apply_res.applied_index + 1); + // TODO: handle read. + } else { + // TODO: handle read. 
+ } + } +} + +impl Apply { + #[inline] + pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { + fail::fail_point!("APPLY_COMMITTED_ENTRIES"); + for (e, ch) in ce.entry_and_proposals { + if !e.get_data().is_empty() { + let mut set_save_point = false; + if let Some(wb) = self.write_batch_mut() { + wb.set_save_point(); + set_save_point = true; + } + let resp = match self.apply_entry(&e).await { + Ok(resp) => resp, + Err(e) => { + if let Some(wb) = self.write_batch_mut() { + if set_save_point { + wb.rollback_to_save_point().unwrap(); + } else { + wb.clear(); + } + } + cmd_resp::new_error(e) + } + }; + self.callbacks_mut().push((ch, resp)); + } else { + assert!(ch.is_empty()); + } + // Flush may be triggerred in the middle, so always update the index and term. + self.set_apply_progress(e.index, e.term); + } + } + + #[inline] + async fn apply_entry(&mut self, entry: &Entry) -> Result { + match SimpleWriteDecoder::new(entry.get_data()) { + Ok(decoder) => { + util::compare_region_epoch( + decoder.header().get_region_epoch(), + self.region_state().get_region(), + false, + true, + true, + )?; + let res = Ok(new_response(decoder.header())); + for req in decoder { + match req { + SimpleWrite::Put(put) => self.apply_put(put.cf, put.key, put.value)?, + SimpleWrite::Delete(delete) => self.apply_delete(delete.cf, delete.key)?, + SimpleWrite::DeleteRange(dr) => self.apply_delete_range( + dr.cf, + dr.start_key, + dr.end_key, + dr.notify_only, + )?, + } + } + res + } + Err(req) => { + util::check_region_epoch(&req, self.region_state().get_region(), true)?; + if req.has_admin_request() { + // TODO: implement admin request. + } else { + for r in req.get_requests() { + match r.get_cmd_type() { + // These three writes should all use the new codec. Keep them here for + // backward compatibility. 
+ CmdType::Put => { + let put = r.get_put(); + self.apply_put(put.get_cf(), put.get_key(), put.get_value())?; + } + CmdType::Delete => { + let delete = r.get_delete(); + self.apply_delete(delete.get_cf(), delete.get_key())?; + } + CmdType::DeleteRange => { + let dr = r.get_delete_range(); + self.apply_delete_range( + dr.get_cf(), + dr.get_start_key(), + dr.get_end_key(), + dr.get_notify_only(), + )?; + } + _ => unimplemented!(), + } + } + } + Ok(new_response(req.get_header())) + } + } + } + + #[inline] + pub fn flush(&mut self) { + if let Some(wb) = self.write_batch_mut() && !wb.is_empty() { + let mut write_opt = WriteOptions::default(); + write_opt.set_disable_wal(true); + if let Err(e) = wb.write_opt(&write_opt) { + panic!("failed to write data: {:?}", self.logger.list()); + } + if wb.data_size() <= APPLY_WB_SHRINK_SIZE { + wb.clear(); + } else { + self.write_batch_mut().take(); + } + } + let callbacks = self.callbacks_mut(); + for (ch, resp) in callbacks.drain(..) { + ch.set_result(resp); + } + if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { + callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + } + let mut apply_res = ApplyRes::default(); + let (index, term) = self.apply_progress(); + apply_res.applied_index = index; + apply_res.applied_term = term; + if self.reset_state_changed() { + apply_res.region_state = Some(self.region_state().clone()); + } + self.res_reporter().report(apply_res); } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 30dfb3bb753..76692b6af0a 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,24 +1,31 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::RaftCmdRequest; +use engine_traits::{KvEngine, Mutable, RaftEngine, CF_DEFAULT}; +use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; use raftstore::{ store::{ cmd_resp, fsm::{apply, Proposal, MAX_PROPOSAL_SIZE_RATIO}, msg::ErrorCallback, - WriteCallback, + util, WriteCallback, }, Result, }; use tikv_util::Either; -use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; +use crate::{ + batch::StoreContext, + raft::{Apply, Peer}, + router::CmdResChannel, +}; mod simple_write; pub use simple_write::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use self::simple_write::SimpleWrite; +use super::CommittedEntries; + impl Peer { #[inline] pub fn on_write_command( @@ -31,7 +38,7 @@ impl Peer { apply::notify_req_region_removed(self.region_id(), ch); return; } - if let Some(encoder) = self.raw_write_encoder_mut() { + if let Some(encoder) = self.simple_write_encoder_mut() { match encoder.amend(req) { Ok(()) => { encoder.add_response_channel(ch); @@ -55,7 +62,7 @@ impl Peer { Ok(mut encoder) => { encoder.add_response_channel(ch); self.set_has_ready(); - self.raw_write_encoder_mut().replace(encoder); + self.simple_write_encoder_mut().replace(encoder); } Err(req) => { let res = self.propose_command(ctx, req); @@ -83,10 +90,70 @@ impl Peer { } pub fn propose_pending_command(&mut self, ctx: &mut StoreContext) { - if let Some(encoder) = self.raw_write_encoder_mut().take() { + if let Some(encoder) = self.simple_write_encoder_mut().take() { let (data, chs) = encoder.encode(); let res = self.propose(ctx, data); self.post_propose_write(ctx, res, chs); } } } + +impl Apply { + #[inline] + pub fn apply_put(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + util::check_key_in_region(key, self.region_state().get_region())?; + let res = if cf.is_empty() || cf == CF_DEFAULT { + // TODO: use write_vector + self.write_batch_or_default().put(key, value) + } else { + 
self.write_batch_or_default().put_cf(cf, key, value) + }; + res.unwrap_or_else(|e| { + panic!( + "{:?} failed to write ({}, {}) {}: {:?}", + self.logger.list(), + log_wrappers::Value::key(key), + log_wrappers::Value::value(value), + cf, + e + ); + }); + fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( + "aborted by failpoint".into() + ))); + Ok(()) + } + + #[inline] + pub fn apply_delete(&mut self, cf: &str, key: &[u8]) -> Result<()> { + util::check_key_in_region(key, self.region_state().get_region())?; + let res = if cf.is_empty() || cf == CF_DEFAULT { + // TODO: use write_vector + self.write_batch_or_default().delete(key) + } else { + self.write_batch_or_default().delete_cf(cf, key) + }; + res.unwrap_or_else(|e| { + panic!( + "{:?} failed to delete {} {}: {:?}", + self.logger.list(), + log_wrappers::Value::key(key), + cf, + e + ); + }); + Ok(()) + } + + #[inline] + pub fn apply_delete_range( + &mut self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + notify_only: bool, + ) -> Result<()> { + /// TODO: reuse the same delete as split/merge. + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index fd1a4c79600..ebef0cf0595 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,6 +5,6 @@ mod life; mod query; mod ready; -pub use command::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use command::{CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; pub use life::DestroyProgress; pub use ready::AsyncWriter; diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index dbee230a7a5..14cedc7b212 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -11,6 +11,8 @@ //! Follower's read index and replica read is implemenented replica module. //! 
Leader's read index and lease renew is implemented in lease module. +use std::{cmp, sync::Arc}; + use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ @@ -354,12 +356,23 @@ impl Peer { /// Query internal states for debugging purpose. pub fn on_query_debug_info(&self, ch: DebugInfoChannel) { let entry_storage = self.storage().entry_storage(); - let meta = RegionMeta::new( + let mut meta = RegionMeta::new( self.storage().region_state(), entry_storage.apply_state(), GroupState::Ordered, self.raft_group().status(), ); + // V2 doesn't persist commit index and term, fill them with in-memory values. + meta.raft_apply.commit_index = cmp::min( + self.raft_group().raft.raft_log.committed, + self.raft_group().raft.raft_log.persisted, + ); + meta.raft_apply.commit_term = self + .raft_group() + .raft + .raft_log + .term(meta.raft_apply.commit_index) + .unwrap(); ch.set_result(meta); } } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 09646965bda..b210890ac40 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,23 +1,108 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; +use raftstore::store::fsm::apply::DEFAULT_APPLY_WB_SIZE; use slog::Logger; use super::Peer; -use crate::tablet::CachedTablet; +use crate::{ + fsm::ApplyResReporter, + router::{ApplyRes, CmdResChannel}, + tablet::CachedTablet, +}; /// Apply applies all the committed commands to kv db. 
-pub struct Apply { - tablet: CachedTablet, - logger: Logger, +pub struct Apply { + remote_tablet: CachedTablet, + tablet: EK, + write_batch: Option, + + callbacks: Vec<(Vec, RaftCmdResponse)>, + + applied_index: u64, + applied_term: u64, + + region_state: RegionLocalState, + state_changed: bool, + + res_reporter: R, + pub(crate) logger: Logger, } -impl Apply { +impl Apply { #[inline] - pub fn new(peer: &Peer) -> Self { + pub fn new( + region_state: RegionLocalState, + res_reporter: R, + mut remote_tablet: CachedTablet, + logger: Logger, + ) -> Self { Apply { - tablet: peer.tablet().clone(), - logger: peer.logger.clone(), + tablet: remote_tablet.latest().unwrap().clone(), + remote_tablet, + write_batch: None, + callbacks: vec![], + applied_index: 0, + applied_term: 0, + region_state, + state_changed: false, + res_reporter, + logger, + } + } + + #[inline] + pub fn res_reporter(&self) -> &R { + &self.res_reporter + } + + #[inline] + pub fn callbacks_mut(&mut self) -> &mut Vec<(Vec, RaftCmdResponse)> { + &mut self.callbacks + } + + #[inline] + pub fn write_batch_mut(&mut self) -> &mut Option { + &mut self.write_batch + } + + #[inline] + pub fn write_batch_or_default(&mut self) -> &mut EK::WriteBatch { + if self.write_batch.is_none() { + self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); } + self.write_batch.as_mut().unwrap() + } + + #[inline] + pub fn set_apply_progress(&mut self, index: u64, term: u64) { + self.applied_index = index; + self.applied_term = term; + } + + #[inline] + pub fn apply_progress(&self) -> (u64, u64) { + (self.applied_index, self.applied_term) + } + + #[inline] + pub fn region_state(&self) -> &RegionLocalState { + &self.region_state + } + + #[inline] + pub fn reset_state_changed(&mut self) -> bool { + std::mem::take(&mut self.state_changed) + } + + /// Publish the tablet so that it can be used by read worker. + /// + /// Note, during split/merge, lease is expired explicitly and read is + /// forbidden. 
So publishing it immediately is OK. + #[inline] + pub fn publish_tablet(&mut self, tablet: EK) { + self.remote_tablet.set(tablet.clone()); + self.tablet = tablet; } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 57e14bad02d..8b69a52f623 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -31,9 +31,9 @@ use tikv_util::{ Either, }; -use super::storage::Storage; +use super::{storage::Storage, Apply}; use crate::{ - batch::StoreContext, + fsm::{ApplyFsm, ApplyScheduler}, operation::{AsyncWriter, DestroyProgress, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, tablet::{self, CachedTablet}, @@ -55,6 +55,7 @@ pub struct Peer { /// than protobuf. raw_write_encoder: Option, proposals: ProposalQueue>, + apply_scheduler: Option, /// Set to true if any side effect needs to be handled. has_ready: bool, @@ -121,15 +122,18 @@ impl Peer { None }; + let tablet = CachedTablet::new(tablet); + let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { - tablet: CachedTablet::new(tablet), + tablet, peer_cache: vec![], raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), + apply_scheduler: None, has_ready: false, destroy_progress: DestroyProgress::None, raft_group, @@ -364,12 +368,13 @@ impl Peer { self.entry_storage().applied_term() == self.term() } - pub fn raw_write_encoder_mut(&mut self) -> &mut Option { + #[inline] + pub fn simple_write_encoder_mut(&mut self) -> &mut Option { &mut self.raw_write_encoder } #[inline] - pub fn raw_write_encoder(&self) -> &Option { + pub fn simple_write_encoder(&self) -> &Option { &self.raw_write_encoder } @@ -382,4 +387,14 @@ impl Peer { pub fn proposals_mut(&mut self) -> &mut ProposalQueue> 
{ &mut self.proposals } + + #[inline] + pub fn apply_scheduler(&self) -> &ApplyScheduler { + self.apply_scheduler.as_ref().unwrap() + } + + #[inline] + pub fn set_apply_scheduler(&mut self, apply_scheduler: ApplyScheduler) { + self.apply_scheduler = Some(apply_scheduler); + } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 05653e4fdcc..28a93e897af 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,6 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -pub enum ApplyTask {} +use kvproto::raft_serverpb::RegionLocalState; + +use crate::operation::CommittedEntries; #[derive(Debug)] -pub enum ApplyRes {} +pub enum ApplyTask { + CommittedEntries(CommittedEntries), +} + +#[derive(Debug, Default)] +pub struct ApplyRes { + pub applied_index: u64, + pub applied_term: u64, + pub region_state: Option, +} diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index 88dfd0a81aa..26403f2f0a3 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -5,4 +5,8 @@ #![feature(custom_test_frameworks)] #![test_runner(test_util::run_failpoint_tests)] +#[allow(dead_code)] +#[path = "../integrations/cluster.rs"] +mod cluster; +mod test_basic_write; mod test_bootstrap; diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs new file mode 100644 index 00000000000..5014e0efd3e --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -0,0 +1,105 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{assert_matches::assert_matches, time::Duration}; + +use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use futures::executor::block_on; +use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; +use raftstore_v2::router::PeerMsg; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +/// Check if write batch is correctly maintained during apply. +#[test] +fn test_write_batch_rollback() { + let cluster = Cluster::default(); + let router = cluster.router(0); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(2); + let epoch = req.mut_header().mut_region_epoch(); + epoch.set_version(INIT_EPOCH_VER); + epoch.set_conf_ver(INIT_EPOCH_CONF_VER); + req.mut_header().set_peer(new_peer(1, 3)); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(b"key".to_vec()); + put_req.mut_put().set_value(b"value".to_vec()); + req.mut_requests().push(put_req.clone()); + + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + // Make several entries to batch in apply thread. + fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); + + let tablet_factory = cluster.node(0).tablet_factory(); + let tablet = tablet_factory + .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); + + // Good proposal should be committed. + let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub0.wait_proposed())); + assert!(block_on(sub0.wait_committed())); + + // If the write batch is correctly initialized, next write should not contain + // last result. 
+ req.mut_requests()[0].mut_put().set_key(b"key1".to_vec()); + let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub1.wait_proposed())); + assert!(block_on(sub1.wait_committed())); + + fail::cfg("APPLY_PUT", "1*return()").unwrap(); + // Wake up and sleep in next committed entry. + fail::remove("APPLY_COMMITTED_ENTRIES"); + // First apply will fail due to aborted. If write batch is initialized + // correctly, correct response can be returned. + let resp = block_on(sub0.result()).unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("aborted"), + "{:?}", + resp + ); + let resp = block_on(sub1.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_matches!(tablet.get_value(b"key"), Ok(None)); + assert_eq!(tablet.get_value(b"key1").unwrap().unwrap(), b"value"); + + fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); + + // Trigger error again, so an initialized write batch should be rolled back. + req.mut_requests()[0].mut_put().set_key(b"key2".to_vec()); + let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub0.wait_proposed())); + assert!(block_on(sub0.wait_committed())); + + // If the write batch is correctly rollbacked, next write should not contain + // last result. 
+ req.mut_requests()[0].mut_put().set_key(b"key3".to_vec()); + let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub1.wait_proposed())); + assert!(block_on(sub1.wait_committed())); + + fail::cfg("APPLY_PUT", "1*return()").unwrap(); + fail::remove("APPLY_COMMITTED_ENTRIES"); + let resp = block_on(sub0.result()).unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("aborted"), + "{:?}", + resp + ); + let resp = block_on(sub1.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_matches!(tablet.get_value(b"key2"), Ok(None)); + assert_eq!(tablet.get_value(b"key3").unwrap().unwrap(), b"value"); +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs new file mode 100644 index 00000000000..caaa5120325 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -0,0 +1,356 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + ops::{Deref, DerefMut}, + path::Path, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Mutex, + }, + thread, + time::{Duration, Instant}, +}; + +use crossbeam::channel::{self, Receiver, Sender}; +use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactoryV2}, + raft::RaftTestEngine, +}; +use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; +use futures::executor::block_on; +use kvproto::{ + metapb::Store, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use pd_client::RpcClient; +use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; +use raftstore_v2::{ + create_store_batch_system, + router::{DebugInfoChannel, PeerMsg, QueryResult}, + Bootstrap, StoreMeta, StoreRouter, StoreSystem, +}; +use slog::{o, Logger}; +use tempfile::TempDir; +use test_pd::mocker::Service; +use tikv_util::config::{ReadableDuration, VersionTrack}; + +#[derive(Clone)] +pub struct TestRouter(StoreRouter); + +impl Deref for TestRouter { + type Target = StoreRouter; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TestRouter { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl TestRouter { + pub fn query(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::raft_query(req); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn must_query_debug_info(&self, region_id: u64, timeout: Duration) -> Option { + let timer = Instant::now(); + while timer.elapsed() < timeout { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + let res = self.send(region_id, msg); + if res.is_err() { + thread::sleep(Duration::from_millis(10)); + continue; + } + return block_on(sub.result()); + } + None + } + + pub fn command(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::raft_command(req); + self.send(region_id, 
msg).unwrap(); + block_on(sub.result()) + } + + pub fn wait_applied_to_current_term(&self, region_id: u64, timeout: Duration) { + let mut now = Instant::now(); + let deadline = now + timeout; + let mut res = None; + while now < deadline { + res = self.must_query_debug_info(region_id, deadline - now); + if let Some(info) = &res { + // If term matches and apply to commit index, then it must apply to current + // term. + if info.raft_apply.applied_index == info.raft_apply.commit_index + && info.raft_apply.commit_term == info.raft_status.hard_state.term + { + return; + } + } + thread::sleep(Duration::from_millis(10)); + now = Instant::now(); + } + panic!( + "region {} is not applied to current term, {:?}", + region_id, res + ); + } +} + +pub struct RunningState { + pub raft_engine: RaftTestEngine, + pub factory: Arc, + pub system: StoreSystem, + pub cfg: Arc>, + pub transport: TestTransport, +} + +impl RunningState { + fn new( + pd_client: &RpcClient, + path: &Path, + cfg: Arc>, + transport: TestTransport, + logger: &Logger, + ) -> (TestRouter, Self) { + let cf_opts = ALL_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path, + DbOptions::default(), + cf_opts, + )); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) + .unwrap(); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client, logger.clone()); + let store_id = bootstrap.bootstrap_store().unwrap(); + let mut store = Store::default(); + store.set_id(store_id); + if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { + if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { + factory + .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) + .unwrap(); + } + factory + .open_tablet( + region.get_id(), + Some(RAFT_INIT_LOG_INDEX), + OpenOptions::default().set_create_new(true), + ) + .unwrap(); + } + + let (router, mut system) = 
create_store_batch_system::( + &cfg.value(), + store_id, + logger.clone(), + ); + + let store_meta = Arc::new(Mutex::new(StoreMeta::::new())); + system + .start( + store_id, + cfg.clone(), + raft_engine.clone(), + factory.clone(), + transport.clone(), + &router, + store_meta, + ) + .unwrap(); + + let state = Self { + raft_engine, + factory, + system, + cfg, + transport, + }; + (TestRouter(router), state) + } +} + +impl Drop for RunningState { + fn drop(&mut self) { + self.system.shutdown(); + } +} + +pub struct TestNode { + pd_client: RpcClient, + path: TempDir, + running_state: Option, + logger: Logger, +} + +impl TestNode { + fn with_pd(pd_server: &test_pd::Server) -> TestNode { + let logger = slog_global::borrow_global().new(o!()); + let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); + let path = TempDir::new().unwrap(); + + TestNode { + pd_client, + path, + running_state: None, + logger, + } + } + + fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { + let (router, state) = + RunningState::new(&self.pd_client, self.path.path(), cfg, trans, &self.logger); + self.running_state = Some(state); + router + } + + pub fn tablet_factory(&self) -> &Arc { + &self.running_state().unwrap().factory + } + + fn stop(&mut self) { + self.running_state.take(); + } + + fn restart(&mut self) -> TestRouter { + let state = self.running_state().unwrap(); + let prev_transport = state.transport.clone(); + let cfg = state.cfg.clone(); + self.stop(); + self.start(cfg, prev_transport) + } + + pub fn running_state(&self) -> Option<&RunningState> { + self.running_state.as_ref() + } +} + +impl Drop for TestNode { + fn drop(&mut self) { + self.stop(); + } +} + +#[derive(Clone)] +pub struct TestTransport { + tx: Sender, + flush_cnt: Arc, +} + +pub fn new_test_transport() -> (TestTransport, Receiver) { + let (tx, rx) = channel::unbounded(); + let flush_cnt = Default::default(); + (TestTransport { tx, flush_cnt }, rx) +} + +impl Transport for TestTransport { 
+ fn send(&mut self, msg: RaftMessage) -> raftstore_v2::Result<()> { + let _ = self.tx.send(msg); + Ok(()) + } + + fn set_store_allowlist(&mut self, _stores: Vec) {} + + fn need_flush(&self) -> bool { + !self.tx.is_empty() + } + + fn flush(&mut self) { + self.flush_cnt.fetch_add(1, Ordering::SeqCst); + } +} + +// TODO: remove following when we finally integrate it in tikv-server binary. +pub fn v2_default_config() -> Config { + let mut config = Config::default(); + config.store_io_pool_size = 1; + config +} + +/// Disable all ticks, so test case can schedule manually. +pub fn disable_all_auto_ticks(cfg: &mut Config) { + cfg.raft_base_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_gc_tick_interval = ReadableDuration::ZERO; + cfg.raft_log_compact_sync_interval = ReadableDuration::ZERO; + cfg.raft_engine_purge_interval = ReadableDuration::ZERO; + cfg.split_region_check_tick_interval = ReadableDuration::ZERO; + cfg.region_compact_check_interval = ReadableDuration::ZERO; + cfg.pd_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.pd_store_heartbeat_tick_interval = ReadableDuration::ZERO; + cfg.snap_mgr_gc_tick_interval = ReadableDuration::ZERO; + cfg.lock_cf_compact_interval = ReadableDuration::ZERO; + cfg.peer_stale_state_check_interval = ReadableDuration::ZERO; + cfg.consistency_check_interval = ReadableDuration::ZERO; + cfg.report_region_flow_interval = ReadableDuration::ZERO; + cfg.check_leader_lease_interval = ReadableDuration::ZERO; + cfg.merge_check_tick_interval = ReadableDuration::ZERO; + cfg.cleanup_import_sst_interval = ReadableDuration::ZERO; + cfg.inspect_interval = ReadableDuration::ZERO; + cfg.report_min_resolved_ts_interval = ReadableDuration::ZERO; + cfg.reactive_memory_lock_tick_interval = ReadableDuration::ZERO; + cfg.report_region_buckets_tick_interval = ReadableDuration::ZERO; + cfg.check_long_uncommitted_interval = ReadableDuration::ZERO; +} + +pub struct Cluster { + pd_server: test_pd::Server, + nodes: Vec, + receivers: Vec>, + 
routers: Vec, +} + +impl Default for Cluster { + fn default() -> Cluster { + Cluster::with_node_count(1) + } +} + +impl Cluster { + pub fn with_node_count(count: usize) -> Self { + let pd_server = test_pd::Server::new(1); + let mut cluster = Cluster { + pd_server, + nodes: vec![], + receivers: vec![], + routers: vec![], + }; + let mut cfg = v2_default_config(); + disable_all_auto_ticks(&mut cfg); + for _ in 1..=count { + let mut node = TestNode::with_pd(&cluster.pd_server); + let (tx, rx) = new_test_transport(); + let router = node.start(Arc::new(VersionTrack::new(cfg.clone())), tx); + cluster.nodes.push(node); + cluster.receivers.push(rx); + cluster.routers.push(router); + } + cluster + } + + pub fn restart(&mut self, offset: usize) { + let router = self.nodes[offset].restart(); + self.routers[offset] = router; + } + + pub fn node(&self, offset: usize) -> &TestNode { + &self.nodes[offset] + } + + pub fn router(&self, offset: usize) -> TestRouter { + self.routers[offset].clone() + } +} diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 5c5fc02b489..db37c7cbf64 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -4,337 +4,9 @@ #![feature(assert_matches)] #![feature(custom_test_frameworks)] #![test_runner(test_util::run_tests)] -// TODO: remove following when tests can be run. 
-#![allow(dead_code)] -#![allow(unused_imports)] - -use std::{ - ops::{Deref, DerefMut}, - path::Path, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Mutex, - }, - thread, - time::{Duration, Instant}, -}; - -use crossbeam::channel::{self, Receiver, Sender}; -use engine_test::{ - ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, - raft::RaftTestEngine, -}; -use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; -use futures::executor::block_on; -use kvproto::{ - metapb::Store, - raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, - raft_serverpb::RaftMessage, -}; -use pd_client::RpcClient; -use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::{ - create_store_batch_system, - router::{DebugInfoChannel, PeerMsg, QueryResult}, - Bootstrap, StoreMeta, StoreRouter, StoreSystem, -}; -use slog::{o, Logger}; -use tempfile::TempDir; -use test_pd::mocker::Service; -use tikv_util::config::{ReadableDuration, VersionTrack}; +mod cluster; mod test_basic_write; mod test_life; mod test_read; mod test_status; - -#[derive(Clone)] -struct TestRouter(StoreRouter); - -impl Deref for TestRouter { - type Target = StoreRouter; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl DerefMut for TestRouter { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} - -impl TestRouter { - fn query(&self, region_id: u64, req: RaftCmdRequest) -> Option { - let (msg, sub) = PeerMsg::raft_query(req); - self.send(region_id, msg).unwrap(); - block_on(sub.result()) - } - - fn must_query_debug_info(&self, region_id: u64, timeout: Duration) -> Option { - let timer = Instant::now(); - while timer.elapsed() < timeout { - let (ch, sub) = DebugInfoChannel::pair(); - let msg = PeerMsg::QueryDebugInfo(ch); - if self.send(region_id, msg).is_err() { - thread::sleep(Duration::from_millis(10)); - continue; - } - return block_on(sub.result()); - } - None - } - - fn command(&self, region_id: u64, req: 
RaftCmdRequest) -> Option { - let (msg, sub) = PeerMsg::raft_command(req); - self.send(region_id, msg).unwrap(); - block_on(sub.result()) - } -} - -struct RunningState { - raft_engine: RaftTestEngine, - factory: Arc, - system: StoreSystem, - cfg: Arc>, - transport: TestTransport, -} - -impl RunningState { - fn new( - pd_client: &RpcClient, - path: &Path, - cfg: Arc>, - transport: TestTransport, - logger: &Logger, - ) -> (TestRouter, Self) { - let cf_opts = ALL_CFS - .iter() - .copied() - .map(|cf| (cf, CfOptions::default())) - .collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path, - DbOptions::default(), - cf_opts, - )); - let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) - .unwrap(); - let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client, logger.clone()); - let store_id = bootstrap.bootstrap_store().unwrap(); - let mut store = Store::default(); - store.set_id(store_id); - if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { - if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { - factory - .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) - .unwrap(); - } - factory - .open_tablet( - region.get_id(), - Some(RAFT_INIT_LOG_INDEX), - OpenOptions::default().set_create_new(true), - ) - .unwrap(); - } - - let (router, mut system) = create_store_batch_system::( - &cfg.value(), - store_id, - logger.clone(), - ); - - let store_meta = Arc::new(Mutex::new(StoreMeta::::new())); - system - .start( - store_id, - cfg.clone(), - raft_engine.clone(), - factory.clone(), - transport.clone(), - &router, - store_meta, - ) - .unwrap(); - - let state = Self { - raft_engine, - factory, - system, - cfg, - transport, - }; - (TestRouter(router), state) - } -} - -impl Drop for RunningState { - fn drop(&mut self) { - self.system.shutdown(); - } -} - -struct TestNode { - pd_client: RpcClient, - path: TempDir, - running_state: Option, - logger: Logger, -} - -impl TestNode { - fn 
with_pd(pd_server: &test_pd::Server) -> TestNode { - let logger = slog_global::borrow_global().new(o!()); - let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); - let path = TempDir::new().unwrap(); - - TestNode { - pd_client, - path, - running_state: None, - logger, - } - } - - fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, state) = - RunningState::new(&self.pd_client, self.path.path(), cfg, trans, &self.logger); - self.running_state = Some(state); - router - } - - fn config(&self) -> &Arc> { - &self.running_state.as_ref().unwrap().cfg - } - - fn stop(&mut self) { - self.running_state.take(); - } - - fn restart(&mut self) -> TestRouter { - let state = self.running_state.as_ref().unwrap(); - let prev_transport = state.transport.clone(); - let cfg = state.cfg.clone(); - self.stop(); - self.start(cfg, prev_transport) - } -} - -impl Drop for TestNode { - fn drop(&mut self) { - self.stop(); - } -} - -#[derive(Clone)] -pub struct TestTransport { - tx: Sender, - flush_cnt: Arc, -} - -fn new_test_transport() -> (TestTransport, Receiver) { - let (tx, rx) = channel::unbounded(); - let flush_cnt = Default::default(); - (TestTransport { tx, flush_cnt }, rx) -} - -impl Transport for TestTransport { - fn send(&mut self, msg: RaftMessage) -> raftstore_v2::Result<()> { - let _ = self.tx.send(msg); - Ok(()) - } - - fn set_store_allowlist(&mut self, _stores: Vec) {} - - fn need_flush(&self) -> bool { - !self.tx.is_empty() - } - - fn flush(&mut self) { - self.flush_cnt.fetch_add(1, Ordering::SeqCst); - } -} - -// TODO: remove following when we finally integrate it in tikv-server binary. -fn v2_default_config() -> Config { - let mut config = Config::default(); - config.store_io_pool_size = 1; - config -} - -/// Disable all ticks, so test case can schedule manually. 
-fn disable_all_auto_ticks(cfg: &mut Config) { - cfg.raft_base_tick_interval = ReadableDuration::ZERO; - cfg.raft_log_gc_tick_interval = ReadableDuration::ZERO; - cfg.raft_log_compact_sync_interval = ReadableDuration::ZERO; - cfg.raft_engine_purge_interval = ReadableDuration::ZERO; - cfg.split_region_check_tick_interval = ReadableDuration::ZERO; - cfg.region_compact_check_interval = ReadableDuration::ZERO; - cfg.pd_heartbeat_tick_interval = ReadableDuration::ZERO; - cfg.pd_store_heartbeat_tick_interval = ReadableDuration::ZERO; - cfg.snap_mgr_gc_tick_interval = ReadableDuration::ZERO; - cfg.lock_cf_compact_interval = ReadableDuration::ZERO; - cfg.peer_stale_state_check_interval = ReadableDuration::ZERO; - cfg.consistency_check_interval = ReadableDuration::ZERO; - cfg.report_region_flow_interval = ReadableDuration::ZERO; - cfg.check_leader_lease_interval = ReadableDuration::ZERO; - cfg.merge_check_tick_interval = ReadableDuration::ZERO; - cfg.cleanup_import_sst_interval = ReadableDuration::ZERO; - cfg.inspect_interval = ReadableDuration::ZERO; - cfg.report_min_resolved_ts_interval = ReadableDuration::ZERO; - cfg.reactive_memory_lock_tick_interval = ReadableDuration::ZERO; - cfg.report_region_buckets_tick_interval = ReadableDuration::ZERO; - cfg.check_long_uncommitted_interval = ReadableDuration::ZERO; -} - -struct Cluster { - pd_server: test_pd::Server, - nodes: Vec, - receivers: Vec>, - routers: Vec, -} - -impl Default for Cluster { - fn default() -> Cluster { - Cluster::with_node_count(1) - } -} - -impl Cluster { - fn with_node_count(count: usize) -> Self { - let pd_server = test_pd::Server::new(1); - let mut cluster = Cluster { - pd_server, - nodes: vec![], - receivers: vec![], - routers: vec![], - }; - let mut cfg = v2_default_config(); - disable_all_auto_ticks(&mut cfg); - for _ in 1..=count { - let mut node = TestNode::with_pd(&cluster.pd_server); - let (tx, rx) = new_test_transport(); - let router = node.start(Arc::new(VersionTrack::new(cfg.clone())), tx); - 
cluster.nodes.push(node); - cluster.receivers.push(rx); - cluster.routers.push(router); - } - cluster - } - - fn restart(&mut self, offset: usize) { - let router = self.nodes[offset].restart(); - self.routers[offset] = router; - } - - fn node(&self, offset: usize) -> &TestNode { - &self.nodes[offset] - } - - fn router(&self, offset: usize) -> TestRouter { - self.routers[offset].clone() - } -} diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index 7627d85c4e1..ce775982686 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -1,5 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::{assert_matches::assert_matches, time::Duration}; + +use engine_traits::{OpenOptions, Peekable, TabletFactory}; use futures::executor::block_on; use kvproto::{ raft_cmdpb::{CmdType, RaftCmdRequest, Request}, @@ -9,7 +12,7 @@ use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; use raftstore_v2::router::PeerMsg; use tikv_util::store::new_peer; -use crate::Cluster; +use crate::cluster::Cluster; /// Test basic write flow. #[test] @@ -28,16 +31,15 @@ fn test_basic_write() { put_req.mut_put().set_value(b"value".to_vec()); req.mut_requests().push(put_req); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + // Good proposal should be committed. let (msg, mut sub) = PeerMsg::raft_command(req.clone()); router.send(2, msg).unwrap(); - // TODO: check proposed event is triggered. It won't work for now as there is no - // apply yet. - // assert!(block_on(sub.wait_proposed())); - // Epoch checker is not introduced yet, so committed won't be triggerred. - // Instead, it will be cancelled. - assert!(!block_on(sub.wait_committed())); - // TODO: verify it's applied. 
+ assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); // Store id should be checked. let mut invalid_req = req.clone(); @@ -112,3 +114,48 @@ fn test_basic_write() { let resp = router.command(2, req).unwrap(); assert!(resp.get_header().get_error().has_not_leader(), "{:?}", resp); } + +#[test] +fn test_put_delete() { + let cluster = Cluster::default(); + let router = cluster.router(0); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(2); + let epoch = req.mut_header().mut_region_epoch(); + epoch.set_version(INIT_EPOCH_VER); + epoch.set_conf_ver(INIT_EPOCH_CONF_VER); + req.mut_header().set_peer(new_peer(1, 3)); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(b"key".to_vec()); + put_req.mut_put().set_value(b"value".to_vec()); + req.mut_requests().push(put_req); + + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + let tablet_factory = cluster.node(0).tablet_factory(); + let tablet = tablet_factory + .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); + assert!(tablet.get_value(b"key").unwrap().is_none()); + let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_eq!(tablet.get_value(b"key").unwrap().unwrap(), b"value"); + + let mut delete_req = Request::default(); + delete_req.set_cmd_type(CmdType::Delete); + delete_req.mut_delete().set_key(b"key".to_vec()); + req.clear_requests(); + req.mut_requests().push(delete_req); + let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + 
assert!(block_on(sub.wait_committed())); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_matches!(tablet.get_value(b"key"), Ok(None)); +} diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index aebd0ee70bf..e905e7e4ac2 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -11,13 +11,12 @@ use engine_traits::{RaftEngine, RaftEngineReadOnly}; use futures::executor::block_on; use kvproto::{ metapb, - raft_cmdpb::{RaftCmdRequest, StatusCmdType}, raft_serverpb::{PeerState, RaftMessage}, }; use raftstore_v2::router::{DebugInfoChannel, PeerMsg}; use tikv_util::store::new_peer; -use crate::{Cluster, TestRouter}; +use crate::cluster::{Cluster, TestRouter}; fn assert_peer_not_exist(region_id: u64, peer_id: u64, router: &TestRouter) { let timer = Instant::now(); @@ -119,7 +118,7 @@ fn test_life_by_message() { .must_query_debug_info(test_region_id, timeout) .unwrap(); assert_eq!(meta.raft_status.id, test_peer_id); - let raft_engine = &cluster.node(0).running_state.as_ref().unwrap().raft_engine; + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; raft_engine.get_raft_state(test_region_id).unwrap().unwrap(); raft_engine .get_apply_state(test_region_id) @@ -137,7 +136,7 @@ fn test_life_by_message() { cluster.restart(0); let router = cluster.router(0); assert_peer_not_exist(test_region_id, test_peer_id, &router); - let raft_engine = &cluster.node(0).running_state.as_ref().unwrap().raft_engine; + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); } diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 90a6cf671c6..8e2c3eeb04f 100644 --- 
a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -1,16 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::assert_matches::assert_matches; - -use futures::executor::block_on; -use kvproto::{ - kvrpcpb::Context, - raft_cmdpb::{CmdType, GetRequest, RaftCmdRequest, ReadIndexRequest, Request, StatusCmdType}, -}; -use tikv_util::{codec::number::NumberEncoder, store::new_peer}; +use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, ReadIndexRequest, Request, StatusCmdType}; +use tikv_util::store::new_peer; use txn_types::WriteBatchFlags; -use crate::Cluster; +use crate::cluster::Cluster; #[test] fn test_read_index() { diff --git a/components/raftstore-v2/tests/integrations/test_status.rs b/components/raftstore-v2/tests/integrations/test_status.rs index bb7071ab16d..1f7415d9da3 100644 --- a/components/raftstore-v2/tests/integrations/test_status.rs +++ b/components/raftstore-v2/tests/integrations/test_status.rs @@ -1,13 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::assert_matches::assert_matches; - -use futures::executor::block_on; use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; -use raftstore_v2::router::{PeerMsg, PeerTick, QueryResChannel, QueryResult, RaftRequest}; use tikv_util::store::new_peer; -use crate::Cluster; +use crate::cluster::Cluster; #[test] fn test_status() { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 510196f9ce1..a84a60183b6 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -97,10 +97,11 @@ use crate::{ Error, Result, }; -const DEFAULT_APPLY_WB_SIZE: usize = 4 * 1024; -const APPLY_WB_SHRINK_SIZE: usize = 1024 * 1024; -const SHRINK_PENDING_CMD_QUEUE_CAP: usize = 64; -const MAX_APPLY_BATCH_SIZE: usize = 64 * 1024 * 1024; +// These consts are shared in both v1 and v2. 
+pub const DEFAULT_APPLY_WB_SIZE: usize = 4 * 1024; +pub const APPLY_WB_SHRINK_SIZE: usize = 1024 * 1024; +pub const SHRINK_PENDING_CMD_QUEUE_CAP: usize = 64; +pub const MAX_APPLY_BATCH_SIZE: usize = 64 * 1024 * 1024; pub struct PendingCmd { pub index: u64, From 6a78b01181cfe8de4346d7505b3a84b32d5ed421 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 30 Sep 2022 11:41:45 +0800 Subject: [PATCH 254/676] *: Refine interfaces for TiFlash when using TiKV as a component (#13487) close tikv/tikv#12849 Export necessary functions for TiFlash when using TiKV as a component Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/encryption/src/manager/mod.rs | 5 +++++ src/config.rs | 3 ++- src/server/service/diagnostics/mod.rs | 3 ++- src/server/service/mod.rs | 2 +- src/storage/config_manager.rs | 18 +++++++++--------- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index fb6b2312027..0f78e794629 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -732,6 +732,11 @@ impl DataKeyManager { }; Ok(Some(encrypted_file)) } + + /// Return which method this manager is using. 
+ pub fn encryption_method(&self) -> engine_traits::EncryptionMethod { + crypter::to_engine_encryption_method(self.method) + } } impl Drop for DataKeyManager { diff --git a/src/config.rs b/src/config.rs index 265bcce4071..f4fbf17a38f 100644 --- a/src/config.rs +++ b/src/config.rs @@ -100,7 +100,8 @@ const LOCKCF_MIN_MEM: usize = 256 * MIB as usize; const LOCKCF_MAX_MEM: usize = GIB as usize; const RAFT_MIN_MEM: usize = 256 * MIB as usize; const RAFT_MAX_MEM: usize = 2 * GIB as usize; -const LAST_CONFIG_FILE: &str = "last_tikv.toml"; +/// Configs that actually took effect in the last run +pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; const TMP_CONFIG_FILE: &str = "tmp_tikv.toml"; const MAX_BLOCK_SIZE: usize = 32 * MIB as usize; diff --git a/src/server/service/diagnostics/mod.rs b/src/server/service/diagnostics/mod.rs index 60df07aa167..abede000858 100644 --- a/src/server/service/diagnostics/mod.rs +++ b/src/server/service/diagnostics/mod.rs @@ -28,7 +28,8 @@ use tokio::runtime::Handle; use crate::server::Error; mod log; -mod sys; +/// Information about the current hardware and operating system. +pub mod sys; lazy_static! 
{ pub static ref SYS_INFO: Mutex = Mutex::new(sysinfo::System::new()); diff --git a/src/server/service/mod.rs b/src/server/service/mod.rs index d80c2f6806c..1576e7db41c 100644 --- a/src/server/service/mod.rs +++ b/src/server/service/mod.rs @@ -2,7 +2,7 @@ mod batch; mod debug; -mod diagnostics; +pub mod diagnostics; mod kv; pub use self::{ diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index 8bc92a7f697..de3b13408f0 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -4,7 +4,7 @@ use std::{convert::TryInto, sync::Arc}; -use engine_traits::{CfNamesExt, CfOptionsExt, TabletFactory, CF_DEFAULT}; +use engine_traits::{KvEngine, TabletFactory, CF_DEFAULT}; use file_system::{get_io_rate_limiter, IoPriority, IoType}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use strum::IntoEnumIterator; @@ -19,20 +19,20 @@ use crate::{ storage::{lock_manager::LockManager, txn::flow_controller::FlowController, TxnScheduler}, }; -pub struct StorageConfigManger { - tablet_factory: Arc + Send + Sync>, +pub struct StorageConfigManger { + tablet_factory: Arc + Send + Sync>, shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, } -unsafe impl Send for StorageConfigManger {} -unsafe impl Sync for StorageConfigManger {} +unsafe impl Send for StorageConfigManger {} +unsafe impl Sync for StorageConfigManger {} -impl StorageConfigManger { +impl StorageConfigManger { pub fn new( - tablet_factory: Arc + Send + Sync>, + tablet_factory: Arc + Send + Sync>, shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, @@ -48,7 +48,7 @@ impl StorageConfigManger { } } -impl ConfigManager for StorageConfigManger { +impl ConfigManager for StorageConfigManger { fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { if let Some(ConfigValue::Module(mut block_cache)) = change.remove("block_cache") { if !self.shared_block_cache { @@ 
-74,7 +74,7 @@ impl ConfigManager for StorageConfigManger { let enable: bool = v.into(); let enable_str = if enable { "true" } else { "false" }; self.tablet_factory.for_each_opened_tablet( - &mut |_region_id, _suffix, tablet: &EK::Local| { + &mut |_region_id, _suffix, tablet: &K| { for cf in tablet.cf_names() { tablet .set_options_cf(cf, &[("disable_write_stall", enable_str)]) From e412adfb32747a10339a7937fed019a1295fdea9 Mon Sep 17 00:00:00 2001 From: zzm Date: Fri, 30 Sep 2022 16:17:45 +0800 Subject: [PATCH 255/676] storage, raftstore, causal-ts: Get snapshot before raw put(delete) to ensure that causal ts provider flushes correctly (#13520) close tikv/tikv#13502, ref tikv/tikv#13550 1. Move causal ts provider's `flush` from coprocessor observer to raftstore pd worker. When implementing asynchronous refresh, `storage` can check whether the causal ts provider has completed the refresh through the flag `max_ts_sync_status`. 2. To check `max_ts_sync_status` in `storage`, we need to get a snapshot. 
Signed-off-by: zeminzhou Signed-off-by: zzm Co-authored-by: Ping Yu --- Cargo.lock | 2 +- components/backup/src/endpoint.rs | 12 +- components/causal_ts/Cargo.toml | 1 - components/causal_ts/benches/tso.rs | 4 +- components/causal_ts/src/lib.rs | 27 +-- components/causal_ts/src/observer.rs | 105 --------- components/causal_ts/src/tso.rs | 44 ++-- components/cdc/src/endpoint.rs | 7 +- .../cdc/tests/failpoints/test_endpoint.rs | 2 +- components/cdc/tests/mod.rs | 17 +- components/raftstore/Cargo.toml | 1 + components/raftstore/src/store/fsm/store.rs | 5 + components/raftstore/src/store/worker/pd.rs | 27 ++- components/server/src/server.rs | 7 +- components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs | 8 +- src/server/node.rs | 4 + src/storage/mod.rs | 217 +++++++++--------- src/storage/txn/commands/atomic_store.rs | 47 ++-- .../txn/commands/check_secondary_locks.rs | 2 + src/storage/txn/commands/check_txn_status.rs | 2 + src/storage/txn/commands/compare_and_swap.rs | 105 +++++---- src/storage/txn/commands/mod.rs | 23 ++ .../txn/commands/pessimistic_rollback.rs | 1 + src/storage/txn/commands/prewrite.rs | 9 + src/storage/txn/commands/txn_heart_beat.rs | 2 + src/storage/txn/scheduler.rs | 78 ++++++- tests/failpoints/cases/test_rawkv.rs | 68 ++++-- tests/failpoints/cases/test_storage.rs | 26 ++- .../integrations/config/dynamic/raftstore.rs | 1 + .../integrations/raftstore/test_bootstrap.rs | 1 + tests/integrations/raftstore/test_merge.rs | 7 +- .../raftstore/test_transfer_leader.rs | 7 +- tests/integrations/server/kv_service.rs | 1 + 34 files changed, 497 insertions(+), 374 deletions(-) delete mode 100644 components/causal_ts/src/observer.rs diff --git a/Cargo.lock b/Cargo.lock index 2091ea3d4f0..14620ebb6d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -758,7 +758,6 @@ dependencies = [ "prometheus", "prometheus-static-metric", "raft", - "raftstore", "serde", "serde_derive", "slog", @@ -4175,6 +4174,7 @@ dependencies = [ "bitflags", 
"byteorder", "bytes", + "causal_ts", "collections", "concurrency_manager", "crc32fast", diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 1d4f9bbfdd9..92131381017 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -15,7 +15,7 @@ use engine_rocks::RocksEngine; use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, SstCompressionType}; use external_storage::{BackendConfig, HdfsConfig}; use external_storage_export::{create_storage, ExternalStorage}; -use futures::channel::mpsc::*; +use futures::{channel::mpsc::*, executor::block_on}; use kvproto::{ brpb::*, encryptionpb::EncryptionMethod, @@ -982,7 +982,9 @@ impl Endpoint { if let Err(e) = self .causal_ts_provider .as_ref() - .map_or(Ok(()), |provider| provider.flush()) + .map_or(Ok(TimeStamp::new(0)), |provider| { + block_on(provider.async_flush()) + }) { error!("backup flush causal timestamp failed"; "err" => ?e); let mut response = BackupResponse::default(); @@ -1826,7 +1828,7 @@ pub mod tests { let limiter = Arc::new(IoRateLimiter::new_for_test()); let ts_provider: Arc = Arc::new(causal_ts::tests::TestProvider::default().into()); - let start_ts = ts_provider.get_ts().unwrap(); + let start_ts = block_on(ts_provider.async_get_ts()).unwrap(); let (tmp, endpoint) = new_endpoint_with_limiter( Some(limiter), ApiVersion::V2, @@ -1844,8 +1846,8 @@ pub mod tests { req.set_dst_api_version(ApiVersion::V2); let (task, _) = Task::new(req, tx).unwrap(); endpoint.handle_backup_task(task); - let end_ts = ts_provider.get_ts().unwrap(); - assert_eq!(end_ts.into_inner(), start_ts.next().into_inner() + 100); + let end_ts = block_on(ts_provider.async_get_ts()).unwrap(); + assert_eq!(end_ts.into_inner(), start_ts.next().into_inner() + 101); } #[test] diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index d4a7d95d4ea..beaf5575c80 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -24,7 
+24,6 @@ pd_client = { path = "../pd_client", default-features = false } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs index 66d950a52b5..72d381a4be7 100644 --- a/components/causal_ts/benches/tso.rs +++ b/components/causal_ts/benches/tso.rs @@ -87,7 +87,7 @@ fn bench_batch_tso_provider_get_ts(c: &mut Criterion) { c.bench_function("bench_batch_tso_provider_get_ts", |b| { b.iter(|| { - black_box(provider.get_ts().unwrap()); + black_box(block_on(provider.async_get_ts()).unwrap()); }) }); } @@ -108,7 +108,7 @@ fn bench_batch_tso_provider_flush(c: &mut Criterion) { c.bench_function("bench_batch_tso_provider_flush", |b| { b.iter(|| { - black_box(provider.flush()).unwrap(); + black_box(block_on(provider.async_flush())).unwrap(); }) }); } diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index b32e33540f6..3eb59f35c36 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -13,12 +13,11 @@ pub use errors::*; mod tso; pub use tso::*; mod metrics; -pub use metrics::*; -mod observer; use async_trait::async_trait; use enum_dispatch::enum_dispatch; -use futures::executor::block_on; -pub use observer::*; +pub use metrics::*; +#[cfg(any(test, feature = "testexport"))] +use test_pd_client::TestPdClient; use txn_types::TimeStamp; pub use crate::errors::Result; @@ -27,26 +26,18 @@ pub use crate::errors::Result; #[enum_dispatch] pub trait CausalTsProvider: Send + Sync { /// Get a new timestamp. 
- fn get_ts(&self) -> Result { - block_on(self.async_get_ts()) - } - - /// Flush (cached) timestamps to keep causality on some events, such as - /// "leader transfer". - fn flush(&self) -> Result<()> { - block_on(self.async_flush()) - } - async fn async_get_ts(&self) -> Result; - async fn async_flush(&self) -> Result<()>; + /// Flush (cached) timestamps and return first timestamp to keep causality + /// on some events, such as "leader transfer". + async fn async_flush(&self) -> Result; } #[enum_dispatch(CausalTsProvider)] pub enum CausalTsProviderImpl { BatchTsoProvider(BatchTsoProvider), #[cfg(any(test, feature = "testexport"))] - BatchTsoProviderTest(BatchTsoProvider), + BatchTsoProviderTest(BatchTsoProvider), TestProvider(tests::TestProvider), } @@ -81,9 +72,9 @@ pub mod tests { // This is used for unit test. Add 100 from current. // Do not modify this value as several test cases depend on it. - async fn async_flush(&self) -> Result<()> { + async fn async_flush(&self) -> Result { self.ts.fetch_add(100, Ordering::Relaxed); - Ok(()) + self.async_get_ts().await } } } diff --git a/components/causal_ts/src/observer.rs b/components/causal_ts/src/observer.rs deleted file mode 100644 index 4b101c01b14..00000000000 --- a/components/causal_ts/src/observer.rs +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::Arc; - -use engine_traits::KvEngine; -use kvproto::metapb::Region; -use raft::StateRole; -use raftstore::coprocessor::{ - BoxRegionChangeObserver, BoxRoleObserver, Coprocessor, CoprocessorHost, ObserverContext, - RegionChangeEvent, RegionChangeObserver, RegionChangeReason, RoleChange, RoleObserver, -}; - -use crate::CausalTsProvider; - -/// CausalObserver appends timestamp for RawKV V2 data, and invoke -/// causal_ts_provider.flush() on specified event, e.g. leader -/// transfer, snapshot apply. -/// Should be used ONLY when API v2 is enabled. 
-pub struct CausalObserver { - causal_ts_provider: Arc, -} - -impl Clone for CausalObserver { - fn clone(&self) -> Self { - Self { - causal_ts_provider: self.causal_ts_provider.clone(), - } - } -} - -// Causal observer's priority should be higher than all other observers, to -// avoid being bypassed. -const CAUSAL_OBSERVER_PRIORITY: u32 = 0; -impl CausalObserver { - pub fn new(causal_ts_provider: Arc) -> Self { - Self { causal_ts_provider } - } - - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { - coprocessor_host - .registry - .register_role_observer(CAUSAL_OBSERVER_PRIORITY, BoxRoleObserver::new(self.clone())); - coprocessor_host.registry.register_region_change_observer( - CAUSAL_OBSERVER_PRIORITY, - BoxRegionChangeObserver::new(self.clone()), - ); - } -} - -const REASON_LEADER_TRANSFER: &str = "leader_transfer"; -const REASON_REGION_MERGE: &str = "region_merge"; - -impl CausalObserver { - fn flush_timestamp(&self, region: &Region, reason: &'static str) { - fail::fail_point!("causal_observer_flush_timestamp", |_| ()); - - if let Err(err) = self.causal_ts_provider.flush() { - warn!("CausalObserver::flush_timestamp error"; "error" => ?err, "region_id" => region.get_id(), "region" => ?region, "reason" => reason); - } else { - debug!("CausalObserver::flush_timestamp succeed"; "region_id" => region.get_id(), "region" => ?region, "reason" => reason); - } - } -} - -impl Coprocessor for CausalObserver {} - -impl RoleObserver for CausalObserver { - /// Observe becoming leader, to flush CausalTsProvider. - fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { - // In scenario of frequent leader transfer, the observing of change from - // follower to leader by `on_role_change` would be later than the real role - // change in raft state and adjacent write commands. - // This would lead to the late of flush, and violate causality. See issue - // #12498. So we observe role change to Candidate to fix this issue. 
- // Also note that when there is only one peer, it would become leader directly. - if role_change.state == StateRole::Candidate - || (ctx.region().peers.len() == 1 && role_change.state == StateRole::Leader) - { - self.flush_timestamp(ctx.region(), REASON_LEADER_TRANSFER); - } - } -} - -impl RegionChangeObserver for CausalObserver { - fn on_region_changed( - &self, - ctx: &mut ObserverContext<'_>, - event: RegionChangeEvent, - role: StateRole, - ) { - if role != StateRole::Leader { - return; - } - - // In the scenario of region merge, the target region would merge some entries - // from source region with larger timestamps (when leader of source region is in - // another store with larger TSO batch than the store of target region's - // leader). So we need a flush after commit merge. See issue #12680. - // TODO: do not need flush if leaders of source & target region are in the same - // store. - if let RegionChangeEvent::Update(RegionChangeReason::CommitMerge) = event { - self.flush_timestamp(ctx.region(), REASON_REGION_MERGE); - } - } -} diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 70aa692dd15..5a9d119f6d5 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -31,6 +31,8 @@ use std::{ }; use async_trait::async_trait; +#[cfg(test)] +use futures::executor::block_on; use parking_lot::RwLock; use pd_client::PdClient; use tikv_util::{ @@ -560,6 +562,16 @@ impl BatchTsoProvider { pub fn tso_usage(&self) -> u32 { self.batch_list.usage() } + + #[cfg(test)] + pub fn get_ts(&self) -> Result { + block_on(self.async_get_ts()) + } + + #[cfg(test)] + pub fn flush(&self) -> Result { + block_on(self.async_flush()) + } } const GET_TS_MAX_RETRY: u32 = 3; @@ -609,8 +621,14 @@ impl CausalTsProvider for BatchTsoProvider { Err(Error::TsoBatchUsedUp(last_batch_size)) } - async fn async_flush(&self) -> Result<()> { - self.renew_tso_batch(true, TsoBatchRenewReason::flush).await + async fn async_flush(&self) -> Result 
{ + fail::fail_point!("causal_ts_provider_flush", |_| Err(box_err!( + "async_flush err(failpoints)" + ))); + self.renew_tso_batch(true, TsoBatchRenewReason::flush) + .await?; + // TODO: Return the first tso by renew_tso_batch instead of async_get_ts + self.async_get_ts().await } } @@ -634,8 +652,8 @@ impl CausalTsProvider for SimpleTsoProvider { Ok(ts) } - async fn async_flush(&self) -> Result<()> { - Ok(()) + async fn async_flush(&self) -> Result { + self.async_get_ts().await } } @@ -858,7 +876,7 @@ pub mod tests { let provider = SimpleTsoProvider::new(pd_cli.clone()); pd_cli.set_tso(100.into()); - let ts = provider.get_ts().unwrap(); + let ts = block_on(provider.async_get_ts()).unwrap(); assert_eq!(ts, 101.into(), "ts: {:?}", ts); } @@ -886,12 +904,12 @@ pub mod tests { assert_eq!(provider.tso_remain(), 90); assert_eq!(provider.tso_usage(), 10); - provider.flush().unwrap(); // allocated: [1101, 1200] - assert_eq!(provider.tso_remain(), 100); - assert_eq!(provider.tso_usage(), 0); + assert_eq!(provider.flush().unwrap(), TimeStamp::from(1101)); // allocated: [1101, 1200] + assert_eq!(provider.tso_remain(), 99); + assert_eq!(provider.tso_usage(), 1); // used up pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1101..=1200u64 { + for ts in 1102..=1200u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } assert_eq!(provider.tso_remain(), 0); @@ -900,8 +918,8 @@ pub mod tests { assert_eq!(provider.tso_remain(), 0); assert_eq!(provider.tso_usage(), 100); - provider.flush().unwrap(); // allocated: [1201, 2200] - for ts in 1201..=1260u64 { + assert_eq!(provider.flush().unwrap(), TimeStamp::from(1201)); // allocated: [1201, 2200] + for ts in 1202..=1260u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } assert_eq!(provider.tso_remain(), 940); @@ -979,9 +997,9 @@ pub mod tests { pd_cli.trigger_tso_failure(); provider.flush().unwrap_err(); - provider.flush().unwrap(); // allocated: [1301, 3300] + 
assert_eq!(provider.flush().unwrap(), TimeStamp::from(1301)); // allocated: [1301, 3300] pd_cli.trigger_tso_failure(); // make renew fail to verify used-up - for ts in 1301..=3300u64 { + for ts in 1302..=3300u64 { assert_eq!(TimeStamp::from(ts), provider.get_ts().unwrap()) } provider.get_ts().unwrap_err(); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index fccf0ec0cad..26c0a11371e 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1025,7 +1025,7 @@ impl, E: KvEngine> Endpoint { // RawKV write requests will get larger TSO after this point. // RawKV CDC's resolved_ts is guaranteed by ConcurrencyManager::global_min_lock_ts, // which lock flying keys's ts in raw put and delete interfaces in `Storage`. - Some(provider) => provider.get_ts().unwrap_or_default(), + Some(provider) => provider.async_get_ts().await.unwrap_or_default(), None => pd_client.get_tso().await.unwrap_or_default(), }; let mut min_ts = min_ts_pd; @@ -1285,6 +1285,7 @@ mod tests { use std::ops::{Deref, DerefMut}; use engine_rocks::RocksEngine; + use futures::executor::block_on; use kvproto::{ cdcpb::{ChangeDataRequestKvApi, Header}, errorpb::Error as ErrorHeader, @@ -1893,7 +1894,7 @@ mod tests { }; let ts_provider: Arc = Arc::new(causal_ts::tests::TestProvider::default().into()); - let start_ts = ts_provider.get_ts().unwrap(); + let start_ts = block_on(ts_provider.async_get_ts()).unwrap(); let mut suite = mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, Some(ts_provider.clone())); suite.run(Task::RegisterMinTsEvent); @@ -1902,7 +1903,7 @@ mod tests { .recv_timeout(Duration::from_millis(1500)) .unwrap() .unwrap(); - let end_ts = ts_provider.get_ts().unwrap(); + let end_ts = block_on(ts_provider.async_get_ts()).unwrap(); assert!(end_ts.into_inner() > start_ts.next().into_inner()); // may trigger more than once. 
} diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 19e24926d5b..31c302c3c14 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -493,7 +493,7 @@ fn test_cdc_rawkv_resolved_ts() { let pause_write_fp = "raftkv_async_write"; fail::cfg(pause_write_fp, "pause").unwrap(); - let ts = ts_provider.get_ts().unwrap(); + let ts = block_on(ts_provider.async_get_ts()).unwrap(); let handle = thread::spawn(move || { let _ = client.raw_put(&put_req).unwrap(); }); diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index a14ebd14c80..c14a91de99a 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -7,6 +7,7 @@ use cdc::{recv_timeout, CdcObserver, FeatureGate, MemoryQuota, Task}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; +use futures::executor::block_on; use grpcio::{ ChannelBuilder, ClientDuplexReceiver, ClientDuplexSender, ClientUnaryReceiver, Environment, }; @@ -512,12 +513,14 @@ impl TestSuite { pub fn flush_causal_timestamp_for_region(&mut self, region_id: u64) { let leader = self.cluster.leader_of_region(region_id).unwrap(); - self.cluster - .sim - .rl() - .get_causal_ts_provider(leader.get_store_id()) - .unwrap() - .flush() - .unwrap(); + block_on( + self.cluster + .sim + .rl() + .get_causal_ts_provider(leader.get_store_id()) + .unwrap() + .async_flush(), + ) + .unwrap(); } } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 3b47ca08ec5..4c41b19c828 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -32,6 +32,7 @@ batch-system = { path = "../batch-system", default-features = false } bitflags = "1.0.1" byteorder = "1.2" bytes = "1.0" +causal_ts = { path = "../causal_ts" } collections = { path = "../collections" } concurrency_manager = { path = "../concurrency_manager", 
default-features = false } crc32fast = "1.2" diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 160a63a394a..d53270c2ef0 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -19,6 +19,7 @@ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, HandlerBuilder, PollHandler, Priority, }; +use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{unbounded, TryRecvError, TrySendError}; @@ -1461,6 +1462,7 @@ impl RaftBatchSystem { concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, health_service: Option, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. @@ -1599,6 +1601,7 @@ impl RaftBatchSystem { collector_reg_handle, region_read_progress, health_service, + causal_ts_provider, )?; Ok(()) } @@ -1615,6 +1618,7 @@ impl RaftBatchSystem { collector_reg_handle: CollectorRegHandle, region_read_progress: RegionReadProgressRegistry, health_service: Option, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> { let cfg = builder.cfg.value().clone(); let store = builder.store.clone(); @@ -1696,6 +1700,7 @@ impl RaftBatchSystem { region_read_progress, health_service, coprocessor_host, + causal_ts_provider, ); assert!(workers.pd_worker.start_with_timer(pd_runner)); diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 75393d486f9..f3518f4f674 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -14,6 +14,7 @@ use std::{ time::{Duration, Instant}, }; +use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use collections::{HashMap, HashSet}; use 
concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine}; @@ -47,6 +48,7 @@ use tikv_util::{ warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, }; +use txn_types::TimeStamp; use yatp::Remote; use crate::{ @@ -895,6 +897,7 @@ where health_service: Option, curr_health_status: ServingStatus, coprocessor_host: CoprocessorHost, + causal_ts_provider: Option>, // used for rawkv apiv2 } impl Runner @@ -920,6 +923,7 @@ where region_read_progress: RegionReadProgressRegistry, health_service: Option, coprocessor_host: CoprocessorHost, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Runner { // Register the region CPU records collector. let mut region_cpu_records_collector = None; @@ -964,6 +968,7 @@ where health_service, curr_health_status: ServingStatus::Serving, coprocessor_host, + causal_ts_provider, } } @@ -1600,10 +1605,30 @@ where ) { let pd_client = self.pd_client.clone(); let concurrency_manager = self.concurrency_manager.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + let f = async move { let mut success = false; while txn_ext.max_ts_sync_status.load(Ordering::SeqCst) == initial_status { - match pd_client.get_tso().await { + // On leader transfer / region merge, RawKV API v2 need to invoke + // causal_ts_provider.flush() to renew cached TSO, to ensure that + // the next TSO returned by causal_ts_provider.get_ts() on current + // store must be larger than the store where the leader is on before. + // + // And it won't break correctness of transaction commands, as + // causal_ts_provider.flush() is implemented as pd_client.get_tso() + renew TSO + // cached. 
+ let res: crate::Result = + if let Some(causal_ts_provider) = &causal_ts_provider { + causal_ts_provider + .async_flush() + .await + .map_err(|e| box_err!(e)) + } else { + pd_client.get_tso().await.map_err(Into::into) + }; + + match res { Ok(ts) => { concurrency_manager.update_max_ts(ts); // Set the least significant bit to 1 to mark it as synced. diff --git a/components/server/src/server.rs b/components/server/src/server.rs index bafc61ea077..247bc6ccb58 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -862,12 +862,6 @@ where None }; - // Register causal observer for RawKV API V2 - if let Some(provider) = self.causal_ts_provider.clone() { - let causal_ob = causal_ts::CausalObserver::new(provider); - causal_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - }; - let check_leader_runner = CheckLeaderRunner::new( engines.store_meta.clone(), self.coprocessor_host.clone().unwrap(), @@ -1055,6 +1049,7 @@ where auto_split_controller, self.concurrency_manager.clone(), collector_reg_handle, + self.causal_ts_provider.clone(), ) .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index d6aa1eaefc8..78d98e5a5d3 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -325,6 +325,7 @@ impl Simulator for NodeCluster { AutoSplitController::default(), cm, CollectorRegHandle::new_for_test(), + None, )?; assert!( engines diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index a3a9455fb20..67eb3a22db6 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -381,10 +381,7 @@ impl ServerCluster { .unwrap() .into(), ); - self.causal_ts_providers - .insert(node_id, causal_ts_provider.clone()); - let causal_ob = causal_ts::CausalObserver::new(causal_ts_provider); - causal_ob.register_to(&mut coprocessor_host); + 
self.causal_ts_providers.insert(node_id, causal_ts_provider); } // Start resource metering. @@ -583,6 +580,8 @@ impl ServerCluster { max_unified_read_pool_thread_count, None, ); + + let causal_ts_provider = self.get_causal_ts_provider(node_id); node.start( engines, simulate_trans.clone(), @@ -595,6 +594,7 @@ impl ServerCluster { auto_split_controller, concurrency_manager.clone(), collector_reg_handle, + causal_ts_provider, )?; assert!(node_id == 0 || node_id == node.id()); let node_id = node.id(); diff --git a/src/server/node.rs b/src/server/node.rs index f8c10673e1a..65dd592b490 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -222,6 +222,7 @@ where auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> where T: Transport + 'static, @@ -258,6 +259,7 @@ where auto_split_controller, concurrency_manager, collector_reg_handle, + causal_ts_provider, )?; Ok(()) @@ -504,6 +506,7 @@ where auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, collector_reg_handle: CollectorRegHandle, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> where T: Transport + 'static, @@ -536,6 +539,7 @@ where concurrency_manager, collector_reg_handle, self.health_service.clone(), + causal_ts_provider, )?; Ok(()) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index abdfcd333ac..e2192573dea 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -123,7 +123,7 @@ use crate::{ commands::{RawAtomicStore, RawCompareAndSwap, TypedCommand}, flow_controller::{EngineFlowController, FlowController}, scheduler::Scheduler as TxnScheduler, - Command, + Command, ErrorInner as TxnError, }, types::StorageCallbackType, }, @@ -273,6 +273,7 @@ impl Storage { config, dynamic_switches, flow_controller, + causal_ts_provider.clone(), reporter, resource_tag_factory.clone(), Arc::clone("a_limiter), @@ 
-1847,47 +1848,42 @@ impl Storage { } } - fn get_causal_ts(ts_provider: &Option>) -> Result> { - if let Some(p) = ts_provider { - match p.get_ts() { - Ok(ts) => Ok(Some(ts)), - Err(e) => Err(box_err!("Fail to get ts: {}", e)), + async fn check_causal_ts_flushed(ctx: &mut Context, tag: CommandKind) -> Result<()> { + if F::TAG == ApiVersion::V2 { + let snap_ctx = SnapContext { + pb_ctx: ctx, + ..Default::default() + }; + match Self::with_tls_engine(|engine| Self::snapshot(engine, snap_ctx)).await { + Ok(snapshot) => { + SCHED_STAGE_COUNTER_VEC.get(tag).snapshot_ok.inc(); + if !snapshot.ext().is_max_ts_synced() { + return Err(Error::from(txn::Error::from( + TxnError::MaxTimestampNotSynced { + region_id: ctx.get_region_id(), + start_ts: TimeStamp::zero(), + }, + ))); + } + let term = snapshot.ext().get_term(); + if let Some(term) = term { + ctx.set_term(term.get()); + } + } + Err(err) => { + SCHED_STAGE_COUNTER_VEC.get(tag).snapshot_err.inc(); + info!("get snapshot failed"; "tag" => ?tag, "err" => ?err); + return Err(err); + } } - } else { - Ok(None) - } - } - - async fn get_raw_key_guard( - ts_provider: &Option>, - concurrency_manager: ConcurrencyManager, - ) -> Result> { - // NOTE: the ts cannot be reused as timestamp of data key. - // There is a little chance that CDC will acquired a timestamp for resolved-ts - // just between the Self::get_causal_ts & concurrency_manager.lock_key, - // which violate the constraint that resolve-ts should not be larger - // than timestamp of captured data. - let ts = Self::get_causal_ts(ts_provider)?; - if let Some(ts) = ts { - let raw_key = vec![api_version::api_v2::RAW_KEY_PREFIX]; - // Make keys for locking by RAW_KEY_PREFIX & ts. RAW_KEY_PREFIX to avoid - // conflict with TiDB & TxnKV keys, and ts to avoid collision with - // other raw write requests. 
Ts in lock value to used by CDC which - // get maximum resolved-ts from concurrency_manager.global_min_lock_ts - let encode_key = ApiV2::encode_raw_key(&raw_key, Some(ts)); - let key_guard = concurrency_manager.lock_key(&encode_key).await; - let lock = Lock::new(LockType::Put, raw_key, ts, 0, None, 0.into(), 1, ts); - key_guard.with_lock(|l| *l = Some(lock)); - Ok(Some(key_guard)) - } else { - Ok(None) } + Ok(()) } /// Write a raw key to the storage. pub fn raw_put( &self, - ctx: Context, + mut ctx: Context, cf: String, key: Vec, value: Vec, @@ -1914,11 +1910,16 @@ impl Storage { return callback(Err(Error::from(e))); } let command_duration = tikv_util::time::Instant::now(); - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; + + if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { + return callback(Err(e)); + } + + let key_guard = get_raw_key_guard(&provider, concurrency_manager).await; if let Err(e) = key_guard { return callback(Err(e)); } - let ts = Self::get_causal_ts(&provider); + let ts = get_causal_ts(&provider).await; if let Err(e) = ts { return callback(Err(e)); } @@ -1989,7 +1990,7 @@ impl Storage { /// Write some keys to the storage in a batch. pub fn raw_batch_put( &self, - ctx: Context, + mut ctx: Context, cf: String, pairs: Vec, ttls: Vec, @@ -2022,11 +2023,15 @@ impl Storage { } let command_duration = tikv_util::time::Instant::now(); - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; + if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { + return callback(Err(e)); + } + + let key_guard = get_raw_key_guard(&provider, concurrency_manager).await; if let Err(e) = key_guard { return callback(Err(e)); } - let ts = Self::get_causal_ts(&provider); + let ts = get_causal_ts(&provider).await; if let Err(e) = ts { return callback(Err(e)); } @@ -2063,7 +2068,7 @@ impl Storage { /// operations. 
pub fn raw_delete( &self, - ctx: Context, + mut ctx: Context, cf: String, key: Vec, callback: Callback<()>, @@ -2083,11 +2088,15 @@ impl Storage { } let command_duration = tikv_util::time::Instant::now(); - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; + if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { + return callback(Err(e)); + } + + let key_guard = get_raw_key_guard(&provider, concurrency_manager).await; if let Err(e) = key_guard { return callback(Err(e)); } - let ts = Self::get_causal_ts(&provider); + let ts = get_causal_ts(&provider).await; if let Err(e) = ts { return callback(Err(e)); } @@ -2168,7 +2177,7 @@ impl Storage { /// operations. pub fn raw_batch_delete( &self, - ctx: Context, + mut ctx: Context, cf: String, keys: Vec>, callback: Callback<()>, @@ -2188,11 +2197,15 @@ impl Storage { } let command_duration = tikv_util::time::Instant::now(); - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; + if let Err(e) = Self::check_causal_ts_flushed(&mut ctx, CMD).await { + return callback(Err(e)); + } + + let key_guard = get_raw_key_guard(&provider, concurrency_manager).await; if let Err(e) = key_guard { return callback(Err(e)); } - let ts = Self::get_causal_ts(&provider); + let ts = get_causal_ts(&provider).await; if let Err(e) = ts { return callback(Err(e)); } @@ -2602,7 +2615,7 @@ impl Storage { previous_value: Option>, value: Vec, ttl: u64, - cb: Callback<(Option, bool)>, + callback: Callback<(Option, bool)>, ) -> Result<()> { const CMD: CommandKind = CommandKind::raw_compare_and_swap; let api_version = self.api_version; @@ -2612,43 +2625,14 @@ impl Storage { if !F::IS_TTL_ENABLED && ttl != 0 { return Err(Error::from(ErrorInner::TtlNotEnabled)); } - let provider = self.causal_ts_provider.clone(); let sched = self.get_scheduler(); - let concurrency_manager = self.get_concurrency_manager(); self.sched_raw_command(CMD, async move { - // Raw atomic cmd has two locks, one is 
concurrency_manager and the other is txn - // latch. Now, concurrency_manager lock key with ts encoded, it aims - // to "lock" resolved-ts to be less than its timestamp, rather than - // to "lock" other concurrent requests. TODO: Merge the two locks - // into one to simplify the process. Same to other raw atomic - // commands. - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; - if let Err(e) = key_guard { - return cb(Err(e)); - } - let ts = Self::get_causal_ts(&provider); - if let Err(e) = ts { - return cb(Err(e)); - } - // Do NOT encode ts here as RawCompareAndSwap use key to gen lock. let key = F::encode_raw_key_owned(key, None); - let cmd = RawCompareAndSwap::new( - cf, - key, - previous_value, - value, - ttl, - api_version, - ts.unwrap(), - ctx, - ); + let cmd = RawCompareAndSwap::new(cf, key, previous_value, value, ttl, api_version, ctx); Self::sched_raw_atomic_command( sched, cmd, - Box::new(|res| { - cb(res.map_err(Error::from)); - drop(key_guard) - }), + Box::new(|res| callback(res.map_err(Error::from))), ); }) } @@ -2672,28 +2656,14 @@ impl Storage { let cf = Self::rawkv_cf(&cf, self.api_version)?; Self::check_ttl_valid(pairs.len(), &ttls)?; - let provider = self.causal_ts_provider.clone(); let sched = self.get_scheduler(); - let concurrency_manager = self.get_concurrency_manager(); self.sched_raw_command(CMD, async move { - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; - if let Err(e) = key_guard { - return callback(Err(e)); - } - let ts = Self::get_causal_ts(&provider); - if let Err(e) = ts { - return callback(Err(e)); - } - // Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, None); - let cmd = RawAtomicStore::new(cf, modifies, ts.unwrap(), ctx); + let cmd = RawAtomicStore::new(cf, modifies, ctx); Self::sched_raw_atomic_command( sched, cmd, - Box::new(|res| { - callback(res.map_err(Error::from)); - 
drop(key_guard) - }), + Box::new(|res| callback(res.map_err(Error::from))), ); }) } @@ -2706,34 +2676,21 @@ impl Storage { callback: Callback<()>, ) -> Result<()> { const CMD: CommandKind = CommandKind::raw_atomic_store; - Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; + Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; let cf = Self::rawkv_cf(&cf, self.api_version)?; - let provider = self.causal_ts_provider.clone(); let sched = self.get_scheduler(); - let concurrency_manager = self.get_concurrency_manager(); self.sched_raw_command(CMD, async move { - let key_guard = Self::get_raw_key_guard(&provider, concurrency_manager).await; - if let Err(e) = key_guard { - return callback(Err(e)); - } - let ts = Self::get_causal_ts(&provider); - if let Err(e) = ts { - return callback(Err(e)); - } // Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = keys .into_iter() .map(|k| Self::raw_delete_request_to_modify(cf, k, None)) .collect(); - let cmd = RawAtomicStore::new(cf, modifies, ts.unwrap(), ctx); + let cmd = RawAtomicStore::new(cf, modifies, ctx); Self::sched_raw_atomic_command( sched, cmd, - Box::new(|res| { - callback(res.map_err(Error::from)); - drop(key_guard) - }), + Box::new(|res| callback(res.map_err(Error::from))), ); }) } @@ -2829,6 +2786,45 @@ impl Storage { } } +pub async fn get_raw_key_guard( + ts_provider: &Option>, + concurrency_manager: ConcurrencyManager, +) -> Result> { + // NOTE: the ts cannot be reused as timestamp of data key. + // There is a little chance that CDC will acquired a timestamp for resolved-ts + // just between the get_causal_ts & concurrency_manager.lock_key, + // which violate the constraint that resolve-ts should not be larger + // than timestamp of captured data. + let ts = get_causal_ts(ts_provider).await?; + if let Some(ts) = ts { + let raw_key = vec![api_version::api_v2::RAW_KEY_PREFIX]; + // Make keys for locking by RAW_KEY_PREFIX & ts. 
RAW_KEY_PREFIX to avoid + // conflict with TiDB & TxnKV keys, and ts to avoid collision with + // other raw write requests. Ts in lock value to used by CDC which + // get maximum resolved-ts from concurrency_manager.global_min_lock_ts + let encode_key = ApiV2::encode_raw_key(&raw_key, Some(ts)); + let key_guard = concurrency_manager.lock_key(&encode_key).await; + let lock = Lock::new(LockType::Put, raw_key, ts, 0, None, 0.into(), 1, ts); + key_guard.with_lock(|l| *l = Some(lock)); + Ok(Some(key_guard)) + } else { + Ok(None) + } +} + +pub async fn get_causal_ts( + ts_provider: &Option>, +) -> Result> { + if let Some(p) = ts_provider { + match p.async_get_ts().await { + Ok(ts) => Ok(Some(ts)), + Err(e) => Err(box_err!("Fail to get ts: {}", e)), + } + } else { + Ok(None) + } +} + pub struct DynamicConfigs { pub pipelined_pessimistic_lock: Arc, pub in_memory_pessimistic_lock: Arc, @@ -3465,6 +3461,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut Statistics::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .unwrap(); diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 4b780f5bf2d..150b065e5db 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -2,7 +2,6 @@ // #[PerformanceCriticalPath] use engine_traits::CfName; -use txn_types::TimeStamp; use crate::storage::{ kv::{Modify, WriteData}, @@ -26,7 +25,6 @@ command! { /// The set of mutations to apply. 
cf: CfName, mutations: Vec, - data_ts: Option, } } @@ -41,16 +39,18 @@ impl CommandExt for RawAtomicStore { } impl WriteCommand for RawAtomicStore { - fn process_write(self, _: S, _: WriteContext<'_, L>) -> Result { + fn process_write(self, _: S, wctx: WriteContext<'_, L>) -> Result { let rows = self.mutations.len(); - let (mut mutations, ctx) = (self.mutations, self.ctx); - if let Some(ts) = self.data_ts { + let (mut mutations, ctx, raw_ext) = (self.mutations, self.ctx, wctx.raw_ext); + + if let Some(ref raw_ext) = raw_ext { for mutation in &mut mutations { if let Modify::Put(_, ref mut key, _) = mutation { - key.append_ts_inplace(ts); + key.append_ts_inplace(raw_ext.ts); } } }; + let mut to_be_write = WriteData::from_modifies(mutations); to_be_write.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -59,7 +59,7 @@ impl WriteCommand for RawAtomicStore { rows, pr: ProcessResult::Res, lock_info: None, - lock_guards: vec![], + lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, }) } @@ -67,13 +67,16 @@ impl WriteCommand for RawAtomicStore { #[cfg(test)] mod tests { - use api_version::{test_kv_format_impl, KvFormat, RawValue}; + use api_version::{test_kv_format_impl, ApiV2, KvFormat, RawValue}; use engine_traits::CF_DEFAULT; - use kvproto::kvrpcpb::Context; + use futures::executor::block_on; + use kvproto::kvrpcpb::{ApiVersion, Context}; use tikv_kv::Engine; use super::*; - use crate::storage::{lock_manager::DummyLockManager, Statistics, TestEngineBuilder}; + use crate::storage::{ + lock_manager::DummyLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, + }; #[test] fn test_atomic_process_write() { @@ -85,11 +88,8 @@ mod tests { let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let raw_keys = vec![b"ra", b"rz"]; let raw_values = vec![b"valuea", b"valuez"]; - let encode_ts = if F::TAG == kvproto::kvrpcpb::ApiVersion::V2 { - Some(TimeStamp::from(100)) - } else { - None - }; + 
let ts_provider = super::super::test_util::gen_ts_provider(F::TAG); + let mut modifies = vec![]; for i in 0..raw_keys.len() { let raw_value = RawValue { @@ -103,15 +103,17 @@ mod tests { F::encode_raw_value_owned(raw_value), )); } - let cmd = RawAtomicStore::new(CF_DEFAULT, modifies, encode_ts, Context::default()); + let cmd = RawAtomicStore::new(CF_DEFAULT, modifies, Context::default()); let mut statistic = Statistics::default(); let snap = engine.snapshot(Default::default()).unwrap(); + let raw_ext = block_on(get_raw_ext(ts_provider, cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { lock_mgr: &DummyLockManager {}, concurrency_manager: cm, extra_op: kvproto::kvrpcpb::ExtraOp::Noop, statistics: &mut statistic, async_apply_prewrite: false, + raw_ext, }; let cmd: Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); @@ -124,10 +126,19 @@ mod tests { }; modifies_with_ts.push(Modify::Put( CF_DEFAULT, - F::encode_raw_key_owned(raw_keys[i].to_vec(), encode_ts), + F::encode_raw_key_owned(raw_keys[i].to_vec(), Some(101.into())), F::encode_raw_value_owned(raw_value), )); } - assert_eq!(write_result.to_be_write.modifies, modifies_with_ts) + assert_eq!(write_result.to_be_write.modifies, modifies_with_ts); + if F::TAG == ApiVersion::V2 { + assert_eq!(write_result.lock_guards.len(), 1); + let raw_key = vec![api_version::api_v2::RAW_KEY_PREFIX]; + let encoded_key = ApiV2::encode_raw_key(&raw_key, Some(100.into())); + assert_eq!( + write_result.lock_guards.first().unwrap().key(), + &encoded_key + ); + } } } diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 2678effbf7b..56138a09a50 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -202,6 +202,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .unwrap(); @@ 
-239,6 +240,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .unwrap(); diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index ef323cf206b..73079e00f5d 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -201,6 +201,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .unwrap(); @@ -248,6 +249,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .is_err() diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 34d9114f48a..4dbd51e70e0 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -6,7 +6,7 @@ use engine_traits::{raw_ttl::ttl_to_expire_ts, CfName}; use kvproto::kvrpcpb::ApiVersion; use raw::RawStore; use tikv_kv::Statistics; -use txn_types::{Key, TimeStamp, Value}; +use txn_types::{Key, Value}; use crate::storage::{ kv::{Modify, WriteData}, @@ -37,7 +37,6 @@ command! 
{ value: Value, ttl: u64, api_version: ApiVersion, - data_ts: Option, } } @@ -52,9 +51,16 @@ impl CommandExt for RawCompareAndSwap { } impl WriteCommand for RawCompareAndSwap { - fn process_write(self, snapshot: S, _: WriteContext<'_, L>) -> Result { - let (cf, mut key, value, previous_value, ctx) = - (self.cf, self.key, self.value, self.previous_value, self.ctx); + fn process_write(self, snapshot: S, wctx: WriteContext<'_, L>) -> Result { + let (cf, mut key, value, previous_value, ctx, raw_ext) = ( + self.cf, + self.key, + self.value, + self.previous_value, + self.ctx, + wctx.raw_ext, + ); + let mut data = vec![]; let old_value = RawStore::new(snapshot, self.api_version).raw_get_key_value( cf, @@ -62,7 +68,7 @@ impl WriteCommand for RawCompareAndSwap { &mut Statistics::default(), )?; - let pr = if old_value == previous_value { + let (pr, lock_guards) = if old_value == previous_value { let raw_value = RawValue { user_value: value, expire_ts: ttl_to_expire_ts(self.ttl), @@ -74,20 +80,28 @@ impl WriteCommand for RawCompareAndSwap { ApiVersion::API => API::encode_raw_value_owned(raw_value), } ); - if let Some(ts) = self.data_ts { - key = key.append_ts(ts); + + if let Some(ref raw_ext) = raw_ext { + key = key.append_ts(raw_ext.ts); } + let m = Modify::Put(cf, key, encoded_raw_value); data.push(m); - ProcessResult::RawCompareAndSwapRes { - previous_value: old_value, - succeed: true, - } + ( + ProcessResult::RawCompareAndSwapRes { + previous_value: old_value, + succeed: true, + }, + raw_ext.into_iter().map(|r| r.key_guard).collect(), + ) } else { - ProcessResult::RawCompareAndSwapRes { - previous_value: old_value, - succeed: false, - } + ( + ProcessResult::RawCompareAndSwapRes { + previous_value: old_value, + succeed: false, + }, + vec![], + ) }; fail_point!("txn_commands_compare_and_swap"); let rows = data.len(); @@ -99,7 +113,7 @@ impl WriteCommand for RawCompareAndSwap { rows, pr, lock_info: None, - lock_guards: vec![], + lock_guards, response_policy: 
ResponsePolicy::OnApplied, }) } @@ -107,13 +121,20 @@ impl WriteCommand for RawCompareAndSwap { #[cfg(test)] mod tests { - use api_version::test_kv_format_impl; + use std::sync::Arc; + + use api_version::{test_kv_format_impl, ApiV2}; + use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; use engine_traits::CF_DEFAULT; + use futures::executor::block_on; use kvproto::kvrpcpb::Context; use super::*; - use crate::storage::{lock_manager::DummyLockManager, Engine, Statistics, TestEngineBuilder}; + use crate::storage::{ + lock_manager::DummyLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, + TestEngineBuilder, + }; #[test] fn test_cas_basic() { @@ -126,15 +147,11 @@ mod tests { /// `src/storage/mod.rs`. fn test_cas_basic_impl() { let mut engine = TestEngineBuilder::new().build().unwrap(); + let ts_provider = super::super::test_util::gen_ts_provider(F::TAG); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let key = b"rk"; let encoded_key = F::encode_raw_key(key, None); - let mut ts = if F::TAG == kvproto::kvrpcpb::ApiVersion::V2 { - Some(TimeStamp::from(100)) - } else { - None - }; let cmd = RawCompareAndSwap::new( CF_DEFAULT, @@ -143,14 +160,13 @@ mod tests { b"v1".to_vec(), 0, F::TAG, - ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&mut engine, cm.clone(), cmd).unwrap(); + let (prev_val, succeed) = + sched_command(&mut engine, cm.clone(), cmd, ts_provider.clone()).unwrap(); assert!(prev_val.is_none()); assert!(succeed); - ts = ts.map(|t| t.next()); let cmd = RawCompareAndSwap::new( CF_DEFAULT, encoded_key.clone(), @@ -158,14 +174,13 @@ mod tests { b"v2".to_vec(), 1, F::TAG, - ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&mut engine, cm.clone(), cmd).unwrap(); + let (prev_val, succeed) = + sched_command(&mut engine, cm.clone(), cmd, ts_provider.clone()).unwrap(); assert_eq!(prev_val, Some(b"v1".to_vec())); assert!(!succeed); - ts = ts.map(|t| t.next()); let cmd = 
RawCompareAndSwap::new( CF_DEFAULT, encoded_key, @@ -173,10 +188,9 @@ mod tests { b"v3".to_vec(), 2, F::TAG, - ts, Context::default(), ); - let (prev_val, succeed) = sched_command(&mut engine, cm, cmd).unwrap(); + let (prev_val, succeed) = sched_command(&mut engine, cm, cmd, ts_provider).unwrap(); assert_eq!(prev_val, Some(b"v1".to_vec())); assert!(succeed); } @@ -185,16 +199,20 @@ mod tests { engine: &mut E, cm: ConcurrencyManager, cmd: TypedCommand<(Option, bool)>, + ts_provider: Option>, ) -> Result<(Option, bool)> { let snap = engine.snapshot(Default::default())?; use kvproto::kvrpcpb::ExtraOp; let mut statistic = Statistics::default(); + + let raw_ext = block_on(get_raw_ext(ts_provider, cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { lock_mgr: &DummyLockManager {}, concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics: &mut statistic, async_apply_prewrite: false, + raw_ext, }; let ret = cmd.cmd.process_write(snap, context)?; match ret.pr { @@ -219,14 +237,11 @@ mod tests { fn test_cas_process_write_impl() { let mut engine = TestEngineBuilder::new().build().unwrap(); + let ts_provider = super::super::test_util::gen_ts_provider(F::TAG); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); let raw_key = b"rk"; let raw_value = b"valuek"; - let encode_ts = if F::TAG == kvproto::kvrpcpb::ApiVersion::V2 { - Some(TimeStamp::from(100)) - } else { - None - }; let ttl = 30; let encode_value = RawValue { user_value: raw_value.to_vec(), @@ -240,25 +255,35 @@ mod tests { raw_value.to_vec(), ttl, F::TAG, - encode_ts, Context::default(), ); let mut statistic = Statistics::default(); let snap = engine.snapshot(Default::default()).unwrap(); + let raw_ext = block_on(get_raw_ext(ts_provider, cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { lock_mgr: &DummyLockManager {}, concurrency_manager: cm, extra_op: kvproto::kvrpcpb::ExtraOp::Noop, statistics: &mut statistic, async_apply_prewrite: false, + raw_ext, }; let cmd: 
Command = cmd.into(); let write_result = cmd.process_write(snap, context).unwrap(); let modifies_with_ts = vec![Modify::Put( CF_DEFAULT, - F::encode_raw_key(raw_key, encode_ts), + F::encode_raw_key(raw_key, Some(101.into())), F::encode_raw_value_owned(encode_value), )]; - assert_eq!(write_result.to_be_write.modifies, modifies_with_ts) + assert_eq!(write_result.to_be_write.modifies, modifies_with_ts); + if F::TAG == ApiVersion::V2 { + assert_eq!(write_result.lock_guards.len(), 1); + let raw_key = vec![api_version::api_v2::RAW_KEY_PREFIX]; + let encoded_key = ApiV2::encode_raw_key(&raw_key, Some(100.into())); + assert_eq!( + write_result.lock_guards.first().unwrap().key(), + &encoded_key + ); + } } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 2f2d123e9bb..7c2c945d4e2 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -526,12 +526,18 @@ pub trait CommandExt: Display { fn gen_lock(&self) -> latch::Lock; } +pub struct RawExt { + pub ts: TimeStamp, + pub key_guard: KeyHandleGuard, +} + pub struct WriteContext<'a, L: LockManager> { pub lock_mgr: &'a L, pub concurrency_manager: ConcurrencyManager, pub extra_op: ExtraOp, pub statistics: &'a mut Statistics, pub async_apply_prewrite: bool, + pub raw_ext: Option, // use for apiv2 } pub struct ReaderWithStats<'a, S: Snapshot> { @@ -740,6 +746,10 @@ pub trait WriteCommand: CommandExt { #[cfg(test)] pub mod test_util { + use std::sync::Arc; + + use causal_ts::CausalTsProviderImpl; + use kvproto::kvrpcpb::ApiVersion; use txn_types::Mutation; use super::*; @@ -764,6 +774,7 @@ pub mod test_util { extra_op: ExtraOp::Noop, statistics, async_apply_prewrite: false, + raw_ext: None, }; let ret = cmd.cmd.process_write(snap, context)?; let res = match ret.pr { @@ -901,6 +912,7 @@ pub mod test_util { extra_op: ExtraOp::Noop, statistics, async_apply_prewrite: false, + raw_ext: None, }; let ret = cmd.cmd.process_write(snap, context)?; @@ -925,6 +937,7 @@ pub mod 
test_util { extra_op: ExtraOp::Noop, statistics, async_apply_prewrite: false, + raw_ext: None, }; let ret = cmd.cmd.process_write(snap, context)?; @@ -932,4 +945,14 @@ pub mod test_util { engine.write(&ctx, ret.to_be_write).unwrap(); Ok(()) } + + pub fn gen_ts_provider(api_version: ApiVersion) -> Option> { + if api_version == ApiVersion::V2 { + let test_provider: causal_ts::CausalTsProviderImpl = + causal_ts::tests::TestProvider::default().into(); + Some(Arc::new(test_provider)) + } else { + None + } + } } diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index 837d077153e..f7394cf32aa 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -143,6 +143,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }; let result = command.process_write(snapshot, write_context).unwrap(); write(engine, &ctx, result.to_be_write.modifies); diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index be47e22e42b..be57873b68c 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1505,6 +1505,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut Statistics::default(), async_apply_prewrite: false, + raw_ext: None, } }; } @@ -1674,6 +1675,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: case.async_apply_prewrite, + raw_ext: None, }; let mut engine = TestEngineBuilder::new().build().unwrap(); let snap = engine.snapshot(Default::default()).unwrap(); @@ -1787,6 +1789,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1814,6 +1817,7 @@ mod tests { extra_op: ExtraOp::Noop, 
statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1895,6 +1899,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -1926,6 +1931,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); let result = cmd.cmd.process_write(snap, context).unwrap(); @@ -2194,6 +2200,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2217,6 +2224,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); assert!(prewrite_cmd.cmd.process_write(snap, context).is_err()); @@ -2422,6 +2430,7 @@ mod tests { extra_op: ExtraOp::Noop, statistics: &mut statistics, async_apply_prewrite: false, + raw_ext: None, }; let snap = engine.snapshot(Default::default()).unwrap(); let res = prewrite_cmd.cmd.process_write(snap, context).unwrap(); diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 70c13a20c26..7ec773b99dc 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -139,6 +139,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), async_apply_prewrite: false, + raw_ext: None, }, ) .unwrap(); @@ -180,6 +181,7 @@ pub mod tests { extra_op: Default::default(), statistics: &mut Default::default(), 
async_apply_prewrite: false, + raw_ext: None, }, ) .is_err() diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index c3967820b34..b65445b8c24 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -34,6 +34,7 @@ use std::{ u64, }; +use causal_ts::CausalTsProviderImpl; use collections::HashMap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use crossbeam::utils::CachePadded; @@ -58,7 +59,7 @@ use crate::{ server::lock_manager::waiter_manager, storage::{ config::Config, - get_priority_tag, + get_causal_ts, get_priority_tag, get_raw_key_guard, kv::{ self, with_tls_engine, Engine, ExtCallback, FlowStatsReporter, Result as EngineResult, SnapContext, Statistics, @@ -66,11 +67,13 @@ use crate::{ lock_manager::{self, DiagnosticContext, LockManager, WaitTimeout}, metrics::*, txn::{ - commands::{Command, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo}, + commands::{ + Command, RawExt, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo, + }, flow_controller::FlowController, latch::{Latches, Lock}, sched_pool::{tls_collect_query, tls_collect_scan_details, SchedPool}, - Error, ProcessResult, + Error, ErrorInner, ProcessResult, }, types::StorageCallback, DynamicConfigs, Error as StorageError, ErrorInner as StorageErrorInner, @@ -205,6 +208,9 @@ struct SchedulerInner { flow_controller: Arc, + // used for apiv2 + causal_ts_provider: Option>, + control_mutex: Arc>, lock_mgr: L, @@ -349,6 +355,7 @@ impl Scheduler { config: &Config, dynamic_configs: DynamicConfigs, flow_controller: Arc, + causal_ts_provider: Option>, reporter: R, resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, @@ -385,6 +392,7 @@ impl Scheduler { in_memory_pessimistic_lock: dynamic_configs.in_memory_pessimistic_lock, enable_async_apply_prewrite: config.enable_async_apply_prewrite, flow_controller, + causal_ts_provider, resource_tag_factory, quota_limiter, feature_gate, @@ -843,16 +851,34 @@ impl Scheduler { let 
pipelined = task.cmd.can_be_pipelined() && pessimistic_lock_mode == PessimisticLockMode::Pipelined; let txn_ext = snapshot.ext().get_txn_ext().cloned(); + let max_ts_synced = snapshot.ext().is_max_ts_synced(); + let causal_ts_provider = self.inner.causal_ts_provider.clone(); + let concurrency_manager = self.inner.concurrency_manager.clone(); + + let raw_ext = get_raw_ext( + causal_ts_provider, + concurrency_manager.clone(), + max_ts_synced, + &task.cmd, + ) + .await; + if let Err(err) = raw_ext { + info!("get_raw_ext failed"; "cid" => cid, "err" => ?err); + scheduler.finish_with_err(cid, err); + return; + } + let raw_ext = raw_ext.unwrap(); let deadline = task.cmd.deadline(); let write_result = { let _guard = sample.observe_cpu(); let context = WriteContext { lock_mgr: &self.inner.lock_mgr, - concurrency_manager: self.inner.concurrency_manager.clone(), + concurrency_manager, extra_op: task.extra_op, statistics, async_apply_prewrite: self.inner.enable_async_apply_prewrite, + raw_ext, }; let begin_instant = Instant::now(); let res = unsafe { @@ -1238,6 +1264,44 @@ impl Scheduler { } } +pub async fn get_raw_ext( + causal_ts_provider: Option>, + concurrency_manager: ConcurrencyManager, + max_ts_synced: bool, + cmd: &Command, +) -> Result, Error> { + if causal_ts_provider.is_some() { + match cmd { + Command::RawCompareAndSwap(_) | Command::RawAtomicStore(_) => { + if !max_ts_synced { + return Err(ErrorInner::MaxTimestampNotSynced { + region_id: cmd.ctx().get_region_id(), + start_ts: TimeStamp::zero(), + } + .into()); + } + let key_guard = get_raw_key_guard(&causal_ts_provider, concurrency_manager) + .await + .map_err(|err: StorageError| { + ErrorInner::Other(box_err!("failed to key guard: {:?}", err)) + })?; + let ts = + get_causal_ts(&causal_ts_provider) + .await + .map_err(|err: StorageError| { + ErrorInner::Other(box_err!("failed to get casual ts: {:?}", err)) + })?; + return Ok(Some(RawExt { + ts: ts.unwrap(), + key_guard: key_guard.unwrap(), + })); + } + _ => {} + 
} + } + Ok(None) +} + #[derive(Debug, PartialEq)] enum PessimisticLockMode { // Return success only if the pessimistic lock is persisted. @@ -1302,6 +1366,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1448,6 +1513,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1552,6 +1618,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1610,6 +1677,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1676,6 +1744,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), @@ -1737,6 +1806,7 @@ mod tests { in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), + None, DummyReporter, ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index c50450c9dc4..fd56bd87992 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -3,6 +3,7 @@ use std::{sync::Arc, time::Duration}; use 
causal_ts::CausalTsProvider; +use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::*, @@ -64,6 +65,25 @@ impl TestSuite { must_raw_put(&client, ctx, key.to_vec(), value.to_vec()) } + pub fn raw_put_err_by_timestamp_not_synced(&mut self, key: &[u8], value: &[u8]) { + let region_id = self.cluster.get_region_id(key); + let client = self.get_client(region_id); + let ctx = self.get_context(region_id); + + let mut put_req = RawPutRequest::default(); + put_req.set_context(ctx); + put_req.key = key.to_vec(); + put_req.value = value.to_vec(); + + let put_resp = client.raw_put(&put_req).unwrap(); + assert!(put_resp.get_region_error().has_max_timestamp_not_synced()); + assert!( + put_resp.get_error().is_empty(), + "{:?}", + put_resp.get_error() + ); + } + pub fn must_raw_get(&mut self, key: &[u8]) -> Option> { let region_id = self.cluster.get_region_id(key); let client = self.get_client(region_id); @@ -72,13 +92,15 @@ impl TestSuite { } pub fn flush_timestamp(&mut self, node_id: u64) { - self.cluster - .sim - .rl() - .get_causal_ts_provider(node_id) - .unwrap() - .flush() - .unwrap(); + block_on( + self.cluster + .sim + .rl() + .get_causal_ts_provider(node_id) + .unwrap() + .async_flush(), + ) + .unwrap(); } pub fn must_merge_region_by_key(&mut self, source_key: &[u8], target_key: &[u8]) { @@ -92,7 +114,7 @@ impl TestSuite { let mut merged; let timer = Instant::now(); loop { - if timer.saturating_elapsed() > Duration::from_secs(5) { + if timer.saturating_elapsed() > Duration::from_secs(10) { panic!("region merge failed"); } merged = self.cluster.get_region(source_key); @@ -119,7 +141,7 @@ impl TestSuite { } } -const FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP: &str = "causal_observer_flush_timestamp"; +const FP_CAUSAL_TS_PROVIDER_FLUSH: &str = "causal_ts_provider_flush"; /// Verify correctness on leader transfer. // TODO: simulate and test for the scenario of issue #12498. 
@@ -129,9 +151,6 @@ fn test_leader_transfer() { let key1 = b"rk1"; let region = suite.cluster.get_region(key1); - // Disable CausalObserver::flush_timestamp to produce causality issue. - fail::cfg(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP, "return").unwrap(); - // Transfer leader and write to store 1. { suite.must_transfer_leader(®ion, 1); @@ -145,15 +164,18 @@ fn test_leader_transfer() { assert_eq!(suite.must_raw_get(key1), Some(b"v4".to_vec())); } + // Disable CausalObserver::flush_timestamp to produce causality issue. + fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "return").unwrap(); + // Transfer leader and write to store 2. { suite.must_transfer_leader(®ion, 2); suite.must_leader_on_store(key1, 2); // Store 2 has a TSO batch smaller than store 1. - suite.must_raw_put(key1, b"v5"); + suite.raw_put_err_by_timestamp_not_synced(key1, b"v5"); assert_eq!(suite.must_raw_get(key1), Some(b"v4".to_vec())); - suite.must_raw_put(key1, b"v6"); + suite.raw_put_err_by_timestamp_not_synced(key1, b"v6"); assert_eq!(suite.must_raw_get(key1), Some(b"v4".to_vec())); } @@ -161,7 +183,7 @@ fn test_leader_transfer() { suite.must_transfer_leader(®ion, 1); suite.must_leader_on_store(key1, 1); // Enable CausalObserver::flush_timestamp. - fail::cfg(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP, "off").unwrap(); + fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "off").unwrap(); // Transfer leader and write to store 2 again. { suite.must_transfer_leader(®ion, 2); @@ -173,7 +195,7 @@ fn test_leader_transfer() { assert_eq!(suite.must_raw_get(key1), Some(b"v8".to_vec())); } - fail::remove(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP); + fail::remove(FP_CAUSAL_TS_PROVIDER_FLUSH); suite.stop(); } @@ -199,9 +221,6 @@ fn test_region_merge() { assert_eq!(region1.get_end_key(), region3.get_start_key()); assert_eq!(region3.get_end_key(), region5.get_start_key()); - // Disable CausalObserver::flush_timestamp to produce causality issue. 
- fail::cfg(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP, "return").unwrap(); - // Transfer leaders: region 1 -> store 1, region 3 -> store 2, region 5 -> store // 3. suite.must_transfer_leader(®ion1, 1); @@ -219,20 +238,23 @@ fn test_region_merge() { assert_eq!(suite.must_raw_get(keys[1]), Some(b"v4".to_vec())); } + // Disable CausalObserver::flush_timestamp to produce causality issue. + fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "return").unwrap(); + // Merge region 1 to 3. { suite.must_merge_region_by_key(keys[1], keys[3]); suite.must_leader_on_store(keys[1], 2); // Write to store 2. Store 2 has a TSO batch smaller than store 1. - suite.must_raw_put(keys[1], b"v5"); + suite.raw_put_err_by_timestamp_not_synced(keys[1], b"v5"); assert_eq!(suite.must_raw_get(keys[1]), Some(b"v4".to_vec())); - suite.must_raw_put(keys[1], b"v6"); + suite.raw_put_err_by_timestamp_not_synced(keys[1], b"v6"); assert_eq!(suite.must_raw_get(keys[1]), Some(b"v4".to_vec())); } // Enable CausalObserver::flush_timestamp. - fail::cfg(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP, "off").unwrap(); + fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "off").unwrap(); // Merge region 3 to 5. 
{ @@ -246,6 +268,6 @@ fn test_region_merge() { assert_eq!(suite.must_raw_get(keys[1]), Some(b"v8".to_vec())); } - fail::remove(FP_CAUSAL_OBSERVER_FLUSH_TIMESTAMP); + fail::remove(FP_CAUSAL_TS_PROVIDER_FLUSH); suite.stop(); } diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 101cf30d446..45f5e16675c 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -10,7 +10,7 @@ use std::{ time::Duration, }; -use api_version::KvFormat; +use api_version::{ApiV1, ApiV2, KvFormat}; use causal_ts::CausalTsProvider; use collections::HashMap; use engine_traits::DummyFactory; @@ -509,7 +509,12 @@ fn test_pipelined_pessimistic_lock() { #[test] fn test_async_commit_prewrite_with_stale_max_ts() { - let mut cluster = new_server_cluster(0, 2); + test_async_commit_prewrite_with_stale_max_ts_impl::(); + test_async_commit_prewrite_with_stale_max_ts_impl::(); +} + +fn test_async_commit_prewrite_with_stale_max_ts_impl() { + let mut cluster = new_server_cluster_with_api_ver(0, 2, F::TAG); cluster.run(); let mut engine = cluster @@ -521,7 +526,7 @@ fn test_async_commit_prewrite_with_stale_max_ts() { .unwrap() .clone(); let storage = - TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) + TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) .build() .unwrap(); @@ -532,6 +537,7 @@ fn test_async_commit_prewrite_with_stale_max_ts() { let mut ctx = Context::default(); ctx.set_region_id(1); + ctx.set_api_version(F::TAG); ctx.set_region_epoch(cluster.get_region_epoch(1)); ctx.set_peer(cluster.leader_of_region(1).unwrap()); @@ -541,15 +547,15 @@ fn test_async_commit_prewrite_with_stale_max_ts() { storage .sched_txn_command( commands::Prewrite::new( - vec![Mutation::make_put(Key::from_raw(b"k1"), b"v".to_vec())], - b"k1".to_vec(), + vec![Mutation::make_put(Key::from_raw(b"xk1"), b"v".to_vec())], + b"xk1".to_vec(), 10.into(), 100, false, 2, 
TimeStamp::default(), TimeStamp::default(), - Some(vec![b"k2".to_vec()]), + Some(vec![b"xk2".to_vec()]), false, AssertionLevel::Off, ctx.clone(), @@ -574,17 +580,17 @@ fn test_async_commit_prewrite_with_stale_max_ts() { .sched_txn_command( commands::PrewritePessimistic::new( vec![( - Mutation::make_put(Key::from_raw(b"k1"), b"v".to_vec()), + Mutation::make_put(Key::from_raw(b"xk1"), b"v".to_vec()), DoPessimisticCheck, )], - b"k1".to_vec(), + b"xk1".to_vec(), 10.into(), 100, 20.into(), 2, TimeStamp::default(), TimeStamp::default(), - Some(vec![b"k2".to_vec()]), + Some(vec![b"xk2".to_vec()]), false, AssertionLevel::Off, ctx.clone(), @@ -1485,7 +1491,7 @@ fn test_raw_put_key_guard() { let node_id = leader.get_id(); let leader_cm = cluster.sim.rl().get_concurrency_manager(node_id); let ts_provider = cluster.sim.rl().get_causal_ts_provider(node_id).unwrap(); - let ts = ts_provider.get_ts().unwrap(); + let ts = block_on(ts_provider.async_get_ts()).unwrap(); let env = Arc::new(Environment::new(1)); let channel = diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 55cf75d2b75..38fdf5c175c 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -111,6 +111,7 @@ fn start_raftstore( ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), None, + None, ) .unwrap(); diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index cc5b6ca1ee0..8ede13bd0f4 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -112,6 +112,7 @@ fn test_node_bootstrap_with_prepared_data() { AutoSplitController::default(), ConcurrencyManager::new(1.into()), CollectorRegHandle::new_for_test(), + None, ) .unwrap(); assert!( diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 6bc7e2fb7b8..48adb2eb84c 
100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -2,6 +2,7 @@ use std::{iter::*, sync::*, thread, time::*}; +use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::{Peekable, CF_LOCK, CF_RAFT, CF_WRITE}; use kvproto::{ kvrpcpb::Context, @@ -1145,7 +1146,11 @@ fn test_merge_remove_target_peer_isolated() { #[test] fn test_sync_max_ts_after_region_merge() { - let mut cluster = new_server_cluster(0, 3); + test_kv_format_impl!(test_sync_max_ts_after_region_merge_impl); +} + +fn test_sync_max_ts_after_region_merge_impl() { + let mut cluster = new_server_cluster_with_api_ver(0, 3, F::TAG); configure_for_merge(&mut cluster); cluster.run(); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index 130290e01b8..9f2e564341f 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -2,6 +2,7 @@ use std::{sync::Arc, thread, time::Duration}; +use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::CF_LOCK; use kvproto::kvrpcpb::Context; use raft::eraftpb::MessageType; @@ -227,7 +228,11 @@ fn test_server_transfer_leader_during_snapshot() { #[test] fn test_sync_max_ts_after_leader_transfer() { - let mut cluster = new_server_cluster(0, 3); + test_kv_format_impl!(test_sync_max_ts_after_leader_transfer_impl); +} + +fn test_sync_max_ts_after_leader_transfer_impl() { + let mut cluster = new_server_cluster_with_api_ver(0, 3, F::TAG); cluster.cfg.raft_store.raft_heartbeat_ticks = 20; cluster.run(); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index ee23f2fc179..253d1e0c067 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1143,6 +1143,7 @@ fn test_double_run_node() { AutoSplitController::default(), ConcurrencyManager::new(1.into()), 
CollectorRegHandle::new_for_test(), + None, ) .unwrap_err(); assert!(format!("{:?}", e).contains("already started"), "{:?}", e); From 47d8c9e483db762cefe8725abac9f5110e97ae63 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Fri, 30 Sep 2022 17:39:45 +0800 Subject: [PATCH 256/676] storage/lock_manager: Add metrics to the new lock waiting queue (#13560) ref tikv/tikv#13298 Add metrics to the new lock waiting queue, including: * The number of keys on which there is lock waiting * The number of requests that are waiting in the queue * The histogram of the queue length observed when enqueueing new requests Signed-off-by: MyonKeminta Signed-off-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Co-authored-by: Ti Chi Robot Co-authored-by: TonsnakeLin <87681388+TonsnakeLin@users.noreply.github.com> --- .../lock_manager/lock_waiting_queue.rs | 45 +++++++++++++++++-- src/storage/metrics.rs | 24 ++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index a3312a4fdb2..c1f2e800834 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -77,6 +77,7 @@ use txn_types::{Key, TimeStamp}; use crate::storage::{ errors::SharedError, lock_manager::{lock_wait_context::LockWaitContextSharedState, LockManager, LockWaitToken}, + metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::Error as TxnError, types::{PessimisticLockParameters, PessimisticLockRes}, @@ -241,17 +242,29 @@ impl LockWaitQueues { mut lock_wait_entry: Box, current_lock: kvrpcpb::LockInfo, ) { + let mut new_key = false; let mut key_state = self .inner .queue_map .entry(lock_wait_entry.key.clone()) - .or_insert_with(|| KeyLockWaitState::new()); + .or_insert_with(|| { + new_key = true; + KeyLockWaitState::new() + }); key_state.current_lock = current_lock; if 
lock_wait_entry.legacy_wake_up_index.is_none() { lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); } key_state.value_mut().queue.push(lock_wait_entry); + + let len = key_state.value_mut().queue.len(); + drop(key_state); + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.waiters.inc(); + LOCK_WAIT_QUEUE_LENGTH_HISTOGRAM.observe(len as f64); + if new_key { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.inc() + } } /// Dequeues the head of the lock waiting queue of the specified key, @@ -288,16 +301,20 @@ impl LockWaitQueues { wake_up_delay_duration_ms: Option, ) -> Option<(Box, Option)> { let mut result = None; + // For statistics. + let mut removed_waiters = 0; // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. Wrap the logic // within a call to `remove_if_mut` to avoid releasing lock during the // procedure. - self.inner.queue_map.remove_if_mut(key, |_, v| { + let removed_key = self.inner.queue_map.remove_if_mut(key, |_, v| { v.last_conflict_start_ts = conflicting_start_ts; v.last_conflict_commit_ts = conflicting_commit_ts; while let Some(lock_wait_entry) = v.queue.pop() { + removed_waiters += 1; + if lock_wait_entry.req_states.as_ref().unwrap().is_finished() { // Skip already cancelled entries. continue; @@ -324,6 +341,15 @@ impl LockWaitQueues { v.queue.is_empty() }); + if removed_waiters != 0 { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC + .waiters + .sub(removed_waiters); + } + if removed_key.is_some() { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); + } + result } @@ -413,11 +439,13 @@ impl LockWaitQueues { let mut conflicting_start_ts = TimeStamp::zero(); let mut conflicting_commit_ts = TimeStamp::zero(); + let mut removed_waiters = 0; + // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. Wrap the logic // within a call to `remove_if_mut` to avoid releasing lock during the // procedure. 
- self.inner.queue_map.remove_if_mut(key, |_, v| { + let removed_key = self.inner.queue_map.remove_if_mut(key, |_, v| { // The KeyLockWaitState of the key might have been removed from the map and then // recreated. Skip. if v.delayed_notify_all_state @@ -440,6 +468,7 @@ impl LockWaitQueues { if front.req_states.as_ref().unwrap().is_finished() { // Skip already cancelled entries. v.queue.pop(); + removed_waiters += 1; continue; } if front @@ -451,6 +480,7 @@ impl LockWaitQueues { break; } let lock_wait_entry = v.queue.pop().unwrap(); + removed_waiters += 1; if lock_wait_entry.parameters.allow_lock_with_conflict { woken_up_resumable_entry = Some(lock_wait_entry); break; @@ -462,6 +492,15 @@ impl LockWaitQueues { v.queue.is_empty() }); + if removed_waiters != 0 { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC + .waiters + .sub(removed_waiters); + } + if removed_key.is_some() { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); + } + // Call callbacks to cancel these entries here. // TODO: Perhaps we'd better make it concurrent with scheduling the new command // (if `woken_up_resumable_entry` is some) if there are too many. diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 3dd5fc2e10a..b74c5b7d51f 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -361,6 +361,15 @@ where }) } +make_static_metric! { + pub struct LockWaitQueueEntriesGauge: IntGauge { + "type" => { + waiters, + keys, + }, + } +} + lazy_static! { pub static ref KV_COMMAND_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_storage_command_total", @@ -575,4 +584,19 @@ lazy_static! 
{ .unwrap(); pub static ref IN_MEMORY_PESSIMISTIC_LOCKING_COUNTER_STATIC: InMemoryPessimisticLockingCounter = auto_flush_from!(IN_MEMORY_PESSIMISTIC_LOCKING_COUNTER, InMemoryPessimisticLockingCounter); + + pub static ref LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC: LockWaitQueueEntriesGauge = register_static_int_gauge_vec!( + LockWaitQueueEntriesGauge, + "tikv_lock_wait_queue_entries_gauge_vec", + "Statistics of the lock wait queue's state", + &["type"] + ) + .unwrap(); + + pub static ref LOCK_WAIT_QUEUE_LENGTH_HISTOGRAM: Histogram = register_histogram!( + "tikv_lock_wait_queue_length", + "Statistics of length of queues counted when enqueueing", + exponential_buckets(1.0, 2.0, 16).unwrap() + ) + .unwrap(); } From 956610725039835557e7516828b069a44073c36d Mon Sep 17 00:00:00 2001 From: Liqi Geng Date: Thu, 6 Oct 2022 09:57:47 +0800 Subject: [PATCH 257/676] copr: fix wrong sql mode constants (#13567) close tikv/tikv#13566 Signed-off-by: gengliqi Co-authored-by: Ti Chi Robot --- components/tidb_query_datatype/src/expr/ctx.rs | 12 ++++++------ components/tidb_query_expr/src/impl_regexp.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index ffaf63a9774..758f7b13736 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -11,12 +11,12 @@ use crate::codec::mysql::Tz; bitflags! { /// Please refer to SQLMode in `mysql/const.go` in repo `pingcap/parser` for details. 
pub struct SqlMode: u64 { - const STRICT_TRANS_TABLES = 1 << 22; - const STRICT_ALL_TABLES = 1 << 23; - const NO_ZERO_IN_DATE = 1 << 24; - const NO_ZERO_DATE = 1 << 25; - const INVALID_DATES = 1 << 26; - const ERROR_FOR_DIVISION_BY_ZERO = 1 << 27; + const STRICT_TRANS_TABLES = 1 << 21; + const STRICT_ALL_TABLES = 1 << 22; + const NO_ZERO_IN_DATE = 1 << 23; + const NO_ZERO_DATE = 1 << 24; + const INVALID_DATES = 1 << 25; + const ERROR_FOR_DIVISION_BY_ZERO = 1 << 26; } } diff --git a/components/tidb_query_expr/src/impl_regexp.rs b/components/tidb_query_expr/src/impl_regexp.rs index 253b376c2f2..2e5830740ee 100644 --- a/components/tidb_query_expr/src/impl_regexp.rs +++ b/components/tidb_query_expr/src/impl_regexp.rs @@ -88,7 +88,7 @@ fn build_regexp_from_args( b"" }; - build_regexp::(pattern, match_type).map(|reg| Some(reg)) + build_regexp::(pattern, match_type).map(Some) } fn init_regexp_data(expr: &mut Expr) -> Result> { @@ -111,7 +111,7 @@ fn init_regexp_data(expr: &mut Expr) -> Result(pattern, match_type).map(|reg| Some(reg)) + build_regexp::(pattern, match_type).map(Some) } /// Currently, TiDB only supports regular expressions for utf-8 strings. From 1a9446f334c2d29417f656fa8ef4ec6a1dda95f7 Mon Sep 17 00:00:00 2001 From: Jay Date: Sun, 9 Oct 2022 01:05:48 -0700 Subject: [PATCH 258/676] engine_tirocks: add properties (#13558) ref tikv/tikv#13058 Significant differences are: - This PR uses codec components instead of tikv_util::codec. - Files are re-arranged to make properties related logic better organized. - An extra allocation is reduced by passing `UserCollectedProperties` directly instead of creating a new HashMap. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/codec/src/error.rs | 3 + components/engine_rocks/src/properties.rs | 6 - components/engine_tirocks/Cargo.toml | 7 + components/engine_tirocks/src/engine.rs | 24 + components/engine_tirocks/src/lib.rs | 7 + .../engine_tirocks/src/properties/mod.rs | 164 ++++ .../engine_tirocks/src/properties/mvcc.rs | 364 ++++++++ .../engine_tirocks/src/properties/range.rs | 803 ++++++++++++++++++ .../engine_tirocks/src/properties/table.rs | 96 +++ .../engine_tirocks/src/properties/ttl.rs | 225 +++++ 10 files changed, 1693 insertions(+), 6 deletions(-) create mode 100644 components/engine_tirocks/src/properties/mod.rs create mode 100644 components/engine_tirocks/src/properties/mvcc.rs create mode 100644 components/engine_tirocks/src/properties/range.rs create mode 100644 components/engine_tirocks/src/properties/table.rs create mode 100644 components/engine_tirocks/src/properties/ttl.rs diff --git a/components/codec/src/error.rs b/components/codec/src/error.rs index b85d8dd078d..09118824c6b 100644 --- a/components/codec/src/error.rs +++ b/components/codec/src/error.rs @@ -13,6 +13,8 @@ pub enum ErrorInner { #[error("Data padding is incorrect")] BadPadding, + #[error("key not found")] + KeyNotFound, } impl ErrorInner { @@ -56,6 +58,7 @@ impl ErrorCodeExt for Error { match self.0.as_ref() { ErrorInner::Io(_) => error_code::codec::IO, ErrorInner::BadPadding => error_code::codec::BAD_PADDING, + ErrorInner::KeyNotFound => error_code::codec::KEY_NOT_FOUND, } } } diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index d468fb2d523..a95a9aecf7b 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -131,12 +131,6 @@ impl<'a> DecodeProperties for UserCollectedPropertiesDecoder<'a> { } } -#[derive(Debug, Clone, PartialEq, Copy)] -pub enum RangeOffsetKind { - Size, - Keys, -} - #[derive(Debug, Default, Clone, Copy)] pub struct 
RangeOffsets { pub size: u64, diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml index 469a659567e..5ffa4428dd2 100644 --- a/components/engine_tirocks/Cargo.toml +++ b/components/engine_tirocks/Cargo.toml @@ -4,9 +4,14 @@ version = "0.1.0" edition = "2021" [dependencies] +api_version = { path = "../api_version" } +codec = { path = "../codec" } +collections = { path = "../collections" } derive_more = "0.99.3" engine_traits = { path = "../engine_traits" } +keys = { path = "../keys" } lazy_static = "1.4.0" +log_wrappers = { path = "../log_wrappers" } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } @@ -16,7 +21,9 @@ tikv_alloc = { path = "../tikv_alloc" } tikv_util = { path = "../tikv_util" } tirocks = { git = "https://github.com/busyjay/tirocks.git", branch = "dev" } tracker = { path = "../tracker" } +txn_types = { path = "../txn_types" } [dev-dependencies] kvproto = { git = "https://github.com/pingcap/kvproto.git" } +rand = "0.8" tempfile = "3.0" diff --git a/components/engine_tirocks/src/engine.rs b/components/engine_tirocks/src/engine.rs index 87ae0efeb79..c3f99cafcc6 100644 --- a/components/engine_tirocks/src/engine.rs +++ b/components/engine_tirocks/src/engine.rs @@ -87,6 +87,30 @@ impl RocksEngine { pub(crate) fn multi_batch_write(&self) -> bool { self.multi_batch_write } + + #[inline] + pub(crate) fn approximate_memtable_stats( + &self, + cf: &str, + start: &[u8], + end: &[u8], + ) -> Result<(u64, u64)> { + let handle = self.cf(cf)?; + Ok(self + .as_inner() + .approximate_mem_table_stats(handle, start, end)) + } + + // TODO: move this function when MiscExt is implemented. 
+ #[cfg(test)] + pub(crate) fn flush(&self, cf: &str, wait: bool) -> Result<()> { + use tirocks::option::FlushOptions; + + let write_handle = self.cf(cf)?; + self.as_inner() + .flush(FlushOptions::default().set_wait(wait), write_handle) + .map_err(r2e) + } } impl engine_traits::Iterable for RocksEngine { diff --git a/components/engine_tirocks/src/lib.rs b/components/engine_tirocks/src/lib.rs index da56cfabb6c..ecf7035b8c4 100644 --- a/components/engine_tirocks/src/lib.rs +++ b/components/engine_tirocks/src/lib.rs @@ -5,8 +5,13 @@ //! When all features of engine_rocks are implemented in this module, //! engine_rocks will be removed and TiKV will switch to tirocks. +#![cfg_attr(test, feature(test))] + extern crate tikv_alloc as _; +#[cfg(test)] +extern crate test; + mod cf_options; mod db_options; mod db_vector; @@ -14,6 +19,7 @@ mod engine; mod engine_iterator; mod logger; mod perf_context; +mod properties; mod snapshot; mod status; mod util; @@ -23,6 +29,7 @@ pub use engine::*; pub use engine_iterator::*; pub use logger::*; pub use perf_context::*; +pub use properties::*; pub use snapshot::RocksSnapshot; pub use status::*; pub use util::*; diff --git a/components/engine_tirocks/src/properties/mod.rs b/components/engine_tirocks/src/properties/mod.rs new file mode 100644 index 00000000000..967273aae3a --- /dev/null +++ b/components/engine_tirocks/src/properties/mod.rs @@ -0,0 +1,164 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod mvcc; +mod range; +mod table; +mod ttl; + +use std::{ + cmp, + collections::BTreeMap, + io::Read, + ops::{Deref, DerefMut}, +}; + +use codec::{ + number::NumberCodec, + prelude::{NumberDecoder, NumberEncoder}, +}; +use collections::HashMap; +use tirocks::properties::table::user::UserCollectedProperties; + +pub use self::{ + mvcc::MvccPropertiesCollectorFactory, + range::{RangeProperties, RangePropertiesCollectorFactory}, + table::{RocksTablePropertiesCollection, RocksUserCollectedProperties}, + ttl::TtlPropertiesCollectorFactory, +}; + +/// A struct to help collect properties. +/// +/// The properties of a file can be collected by ranges. Every range will be +/// referenced by a `PropIndex`. +#[derive(Clone, Debug, Default)] +pub struct PropIndex { + /// The properties calculated from the range. The range starts from + /// `offset` of previous `PropIndex` to this `offset`. How large the range + /// is depends on the implementation. + pub prop: u64, + /// The offset in the file. Offsets are not necessary the size of file. It + /// only makes sense to the implementations. 
+ pub offset: u64, +} + +#[derive(Debug, Default)] +pub struct PropIndexes(BTreeMap, PropIndex>); + +impl Deref for PropIndexes { + type Target = BTreeMap, PropIndex>; + fn deref(&self) -> &BTreeMap, PropIndex> { + &self.0 + } +} + +impl DerefMut for PropIndexes { + fn deref_mut(&mut self) -> &mut BTreeMap, PropIndex> { + &mut self.0 + } +} + +impl PropIndexes { + pub fn new() -> PropIndexes { + PropIndexes(BTreeMap::new()) + } + + pub fn into_map(self) -> BTreeMap, PropIndex> { + self.0 + } + + pub fn add(&mut self, key: Vec, index: PropIndex) { + self.0.insert(key, index); + } + + // Format: | klen | k | v.size | v.offset | + pub fn encode(&self) -> Vec { + let cap = cmp::min((8 * 3 + 24) * self.0.len(), 1024); + let mut buf = Vec::with_capacity(cap); + for (k, v) in &self.0 { + buf.write_u64(k.len() as u64).unwrap(); + buf.extend(k); + buf.write_u64(v.prop).unwrap(); + buf.write_u64(v.offset).unwrap(); + } + buf + } + + pub fn decode(mut buf: &[u8]) -> codec::Result { + let mut res = BTreeMap::new(); + while !buf.is_empty() { + let klen = buf.read_u64()?; + let mut k = vec![0; klen as usize]; + buf.read_exact(&mut k)?; + let v = PropIndex { + prop: buf.read_u64()?, + offset: buf.read_u64()?, + }; + res.insert(k, v); + } + Ok(PropIndexes(res)) + } +} + +trait EncodeProperties { + fn encode(&mut self, name: &str, value: &[u8]); + + #[inline] + fn encode_u64(&mut self, name: &str, value: u64) { + let mut buf = [0; 8]; + NumberCodec::encode_u64(&mut buf, value); + self.encode(name, &buf); + } + + #[inline] + fn encode_indexes(&mut self, name: &str, indexes: &PropIndexes) { + self.encode(name, &indexes.encode()); + } +} + +impl EncodeProperties for UserCollectedProperties { + #[inline] + fn encode(&mut self, name: &str, value: &[u8]) { + self.add(name.as_bytes(), value); + } +} + +impl EncodeProperties for HashMap, Vec> { + #[inline] + fn encode(&mut self, name: &str, value: &[u8]) { + self.insert(name.as_bytes().to_owned(), value.to_owned()); + } +} + +trait 
DecodeProperties { + fn decode(&self, k: &str) -> codec::Result<&[u8]>; + + #[inline] + fn decode_u64(&self, k: &str) -> codec::Result { + let mut buf = self.decode(k)?; + buf.read_u64() + } + + #[inline] + fn decode_indexes(&self, k: &str) -> codec::Result { + let buf = self.decode(k)?; + PropIndexes::decode(buf) + } +} + +impl DecodeProperties for UserCollectedProperties { + #[inline] + fn decode(&self, k: &str) -> codec::Result<&[u8]> { + self.get(k.as_bytes()) + .ok_or_else(|| codec::ErrorInner::KeyNotFound.into()) + } +} + +impl DecodeProperties for HashMap, Vec> { + #[inline] + fn decode(&self, k: &str) -> codec::Result<&[u8]> { + match self.get(k.as_bytes()) { + Some(v) => Ok(v.as_slice()), + None => Err(codec::ErrorInner::KeyNotFound.into()), + } + } +} diff --git a/components/engine_tirocks/src/properties/mvcc.rs b/components/engine_tirocks/src/properties/mvcc.rs new file mode 100644 index 00000000000..1ca170f33d5 --- /dev/null +++ b/components/engine_tirocks/src/properties/mvcc.rs @@ -0,0 +1,364 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cmp, ffi::CStr}; + +use api_version::{ApiV2, KeyMode, KvFormat}; +use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties}; +use tirocks::properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, TablePropertiesCollectorFactory, + UserCollectedProperties, +}; +use txn_types::{Key, TimeStamp, Write, WriteType}; + +use super::{DecodeProperties, EncodeProperties, PropIndex, PropIndexes}; +use crate::RocksEngine; + +pub const PROP_NUM_ERRORS: &str = "tikv.num_errors"; +pub const PROP_MIN_TS: &str = "tikv.min_ts"; +pub const PROP_MAX_TS: &str = "tikv.max_ts"; +pub const PROP_NUM_ROWS: &str = "tikv.num_rows"; +pub const PROP_NUM_PUTS: &str = "tikv.num_puts"; +pub const PROP_NUM_DELETES: &str = "tikv.num_deletes"; +pub const PROP_NUM_VERSIONS: &str = "tikv.num_versions"; +pub const PROP_MAX_ROW_VERSIONS: &str = "tikv.max_row_versions"; +pub const PROP_ROWS_INDEX: &str = "tikv.rows_index"; +pub const PROP_ROWS_INDEX_DISTANCE: u64 = 10000; + +/// Can be used for write CF in TiDB & TxnKV scenario, or be used for default CF +/// in RawKV scenario. +pub struct MvccPropertiesCollector { + name: &'static CStr, + props: MvccProperties, + last_row: Vec, + num_errors: u64, + row_versions: u64, + cur_prop_index: PropIndex, + row_prop_indexes: PropIndexes, + key_mode: KeyMode, // Use KeyMode::Txn for both TiDB & TxnKV, KeyMode::Raw for RawKV. + current_ts: u64, +} + +impl MvccPropertiesCollector { + fn new(name: &'static CStr, key_mode: KeyMode) -> MvccPropertiesCollector { + MvccPropertiesCollector { + name, + props: MvccProperties::new(), + last_row: Vec::new(), + num_errors: 0, + row_versions: 0, + cur_prop_index: PropIndex::default(), + row_prop_indexes: PropIndexes::new(), + key_mode, + current_ts: ttl_current_ts(), + } + } + + fn finish(&mut self, properties: &mut impl EncodeProperties) { + // Insert last handle. 
+ if self.cur_prop_index.prop > 0 { + self.row_prop_indexes + .insert(self.last_row.clone(), self.cur_prop_index.clone()); + } + encode_mvcc(&self.props, properties); + properties.encode_u64(PROP_NUM_ERRORS, self.num_errors); + properties.encode_indexes(PROP_ROWS_INDEX, &self.row_prop_indexes); + } +} + +impl TablePropertiesCollector for MvccPropertiesCollector { + fn name(&self) -> &CStr { + self.name + } + + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + // TsFilter filters sst based on max_ts and min_ts during iterating. + // To prevent seeing outdated (GC) records, we should consider + // RocksDB delete entry type. + if entry_type != EntryType::kEntryPut && entry_type != EntryType::kEntryDelete { + return Ok(()); + } + + if !keys::validate_data_key(key) { + self.num_errors += 1; + return Ok(()); + } + + let (k, ts) = match Key::split_on_ts_for(key) { + Ok((k, ts)) => (k, ts), + Err(_) => { + self.num_errors += 1; + return Ok(()); + } + }; + + self.props.min_ts = cmp::min(self.props.min_ts, ts); + self.props.max_ts = cmp::max(self.props.max_ts, ts); + if entry_type == EntryType::kEntryDelete { + // Empty value for delete entry type, skip following properties. 
+ return Ok(()); + } + + self.props.num_versions += 1; + + if k != self.last_row.as_slice() { + self.props.num_rows += 1; + self.row_versions = 1; + self.last_row.clear(); + self.last_row.extend(k); + } else { + self.row_versions += 1; + } + if self.row_versions > self.props.max_row_versions { + self.props.max_row_versions = self.row_versions; + } + + if self.key_mode == KeyMode::Raw { + let decode_raw_value = ApiV2::decode_raw_value(value); + match decode_raw_value { + Ok(raw_value) => { + if raw_value.is_valid(self.current_ts) { + self.props.num_puts += 1; + } else { + self.props.num_deletes += 1; + } + } + Err(_) => { + self.num_errors += 1; + } + } + } else { + let write_type = match Write::parse_type(value) { + Ok(v) => v, + Err(_) => { + self.num_errors += 1; + return Ok(()); + } + }; + + match write_type { + WriteType::Put => self.props.num_puts += 1, + WriteType::Delete => self.props.num_deletes += 1, + _ => {} + } + } + + // Add new row. + if self.row_versions == 1 { + self.cur_prop_index.prop += 1; + self.cur_prop_index.offset += 1; + if self.cur_prop_index.offset == 1 + || self.cur_prop_index.prop >= PROP_ROWS_INDEX_DISTANCE + { + self.row_prop_indexes + .insert(self.last_row.clone(), self.cur_prop_index.clone()); + self.cur_prop_index.prop = 0; + } + } + Ok(()) + } + + fn finish(&mut self, properties: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(properties); + Ok(()) + } +} + +/// Can be used for write CF of TiDB/TxnKV, default CF of RawKV. 
+pub struct MvccPropertiesCollectorFactory { + name: &'static CStr, + key_mode: KeyMode, +} + +impl Default for MvccPropertiesCollectorFactory { + fn default() -> Self { + Self { + name: CStr::from_bytes_with_nul(b"tikv.mvcc-properties-collector\0").unwrap(), + key_mode: KeyMode::Txn, + } + } +} + +impl MvccPropertiesCollectorFactory { + pub fn rawkv() -> Self { + Self { + name: CStr::from_bytes_with_nul(b"tikv.rawkv-mvcc-properties-collector\0").unwrap(), + key_mode: KeyMode::Raw, + } + } +} + +impl TablePropertiesCollectorFactory for MvccPropertiesCollectorFactory { + type Collector = MvccPropertiesCollector; + + fn name(&self) -> &CStr { + self.name + } + + fn create_table_properties_collector(&self, _: Context) -> Self::Collector { + MvccPropertiesCollector::new(self.name, self.key_mode) + } +} + +fn encode_mvcc(mvcc_props: &MvccProperties, props: &mut impl EncodeProperties) { + props.encode_u64(PROP_MIN_TS, mvcc_props.min_ts.into_inner()); + props.encode_u64(PROP_MAX_TS, mvcc_props.max_ts.into_inner()); + props.encode_u64(PROP_NUM_ROWS, mvcc_props.num_rows); + props.encode_u64(PROP_NUM_PUTS, mvcc_props.num_puts); + props.encode_u64(PROP_NUM_DELETES, mvcc_props.num_deletes); + props.encode_u64(PROP_NUM_VERSIONS, mvcc_props.num_versions); + props.encode_u64(PROP_MAX_ROW_VERSIONS, mvcc_props.max_row_versions); +} + +pub(super) fn decode_mvcc(props: &impl DecodeProperties) -> codec::Result { + let mut res = MvccProperties::new(); + res.min_ts = props.decode_u64(PROP_MIN_TS)?.into(); + res.max_ts = props.decode_u64(PROP_MAX_TS)?.into(); + res.num_rows = props.decode_u64(PROP_NUM_ROWS)?; + res.num_puts = props.decode_u64(PROP_NUM_PUTS)?; + res.num_versions = props.decode_u64(PROP_NUM_VERSIONS)?; + // To be compatible with old versions. 
+ res.num_deletes = props + .decode_u64(PROP_NUM_DELETES) + .unwrap_or(res.num_versions - res.num_puts); + res.max_row_versions = props.decode_u64(PROP_MAX_ROW_VERSIONS)?; + Ok(res) +} + +impl engine_traits::MvccPropertiesExt for RocksEngine { + fn get_mvcc_properties_cf( + &self, + cf: &str, + safe_point: TimeStamp, + start_key: &[u8], + end_key: &[u8], + ) -> Option { + let collection = match self.range_properties(cf, start_key, end_key) { + Ok(c) if !c.is_empty() => c, + _ => return None, + }; + let mut props = MvccProperties::new(); + for (_, v) in &*collection { + let mvcc = match decode_mvcc(v.user_collected_properties()) { + Ok(m) => m, + Err(_) => return None, + }; + // Filter out properties after safe_point. + if mvcc.min_ts > safe_point { + continue; + } + props.add(&mvcc); + } + Some(props) + } +} + +#[cfg(test)] +mod tests { + use api_version::RawValue; + use collections::HashMap; + use test::Bencher; + use txn_types::{Key, Write, WriteType}; + + use super::*; + + #[test] + fn test_mvcc_properties() { + let cases = [ + ("ab", 2, WriteType::Put, EntryType::kEntryPut), + ("ab", 1, WriteType::Delete, EntryType::kEntryPut), + ("ab", 1, WriteType::Delete, EntryType::kEntryDelete), + ("cd", 5, WriteType::Delete, EntryType::kEntryPut), + ("cd", 4, WriteType::Put, EntryType::kEntryPut), + ("cd", 3, WriteType::Put, EntryType::kEntryPut), + ("ef", 6, WriteType::Put, EntryType::kEntryPut), + ("ef", 6, WriteType::Put, EntryType::kEntryDelete), + ("gh", 7, WriteType::Delete, EntryType::kEntryPut), + ]; + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); + for &(key, ts, write_type, entry_type) in &cases { + let ts = ts.into(); + let k = Key::from_raw(key.as_bytes()).append_ts(ts); + let k = keys::data_key(k.as_encoded()); + let v = Write::new(write_type, ts, None).as_ref().to_bytes(); + collector.add(&k, &v, entry_type, 0, 0).unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut 
result); + + let props = decode_mvcc(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + } + + #[test] + fn test_mvcc_properties_rawkv_mode() { + let test_raws = vec![ + (b"r\0a", 1, false, u64::MAX), + (b"r\0a", 5, false, u64::MAX), + (b"r\0a", 7, false, u64::MAX), + (b"r\0b", 1, false, u64::MAX), + (b"r\0b", 1, true, u64::MAX), + (b"r\0c", 1, true, 10), + (b"r\0d", 1, true, 10), + ]; + + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Raw); + for &(key, ts, is_delete, expire_ts) in &test_raws { + let encode_key = ApiV2::encode_raw_key(key, Some(ts.into())); + let k = keys::data_key(encode_key.as_encoded()); + let v = ApiV2::encode_raw_value(RawValue { + user_value: &[0; 10][..], + expire_ts: Some(expire_ts), + is_delete, + }); + collector.add(&k, &v, EntryType::kEntryPut, 0, 0).unwrap(); + } + + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = decode_mvcc(&result).unwrap(); + assert_eq!(props.min_ts, 1.into()); + assert_eq!(props.max_ts, 7.into()); + assert_eq!(props.num_rows, 4); + assert_eq!(props.num_deletes, 3); + assert_eq!(props.num_puts, 4); + assert_eq!(props.num_versions, 7); + assert_eq!(props.max_row_versions, 3); + } + + #[bench] + fn bench_mvcc_properties(b: &mut Bencher) { + let ts = 1.into(); + let num_entries = 100; + let mut entries = Vec::new(); + for i in 0..num_entries { + let s = format!("{:032}", i); + let k = Key::from_raw(s.as_bytes()).append_ts(ts); + let k = keys::data_key(k.as_encoded()); + let w = Write::new(WriteType::Put, ts, Some(s.as_bytes().to_owned())); + entries.push((k, w.as_ref().to_bytes())); + } + + let mut collector = + MvccPropertiesCollector::new(CStr::from_bytes_with_nul(b"\0").unwrap(), KeyMode::Txn); + b.iter(|| { + for &(ref k, ref v) in 
&entries { + collector.add(k, v, EntryType::kEntryPut, 0, 0).unwrap(); + } + }); + } +} diff --git a/components/engine_tirocks/src/properties/range.rs b/components/engine_tirocks/src/properties/range.rs new file mode 100644 index 00000000000..59b9e68a6bb --- /dev/null +++ b/components/engine_tirocks/src/properties/range.rs @@ -0,0 +1,803 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ffi::CStr, io::Read, path::Path}; + +use codec::prelude::{NumberDecoder, NumberEncoder}; +use engine_traits::{MvccProperties, Range, Result, CF_DEFAULT, CF_LOCK, CF_WRITE, LARGE_CFS}; +use tikv_util::{box_err, box_try, debug, info}; +use tirocks::{ + properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, + TablePropertiesCollectorFactory, UserCollectedProperties, + }, + titan::TitanBlobIndex, +}; + +use super::{mvcc::decode_mvcc, DecodeProperties, EncodeProperties, PropIndexes}; +use crate::RocksEngine; + +const PROP_TOTAL_SIZE: &str = "tikv.total_size"; +const PROP_SIZE_INDEX: &str = "tikv.size_index"; +const PROP_RANGE_INDEX: &str = "tikv.range_index"; +pub const DEFAULT_PROP_SIZE_INDEX_DISTANCE: u64 = 4 * 1024 * 1024; +pub const DEFAULT_PROP_KEYS_INDEX_DISTANCE: u64 = 40 * 1024; + +// Deprecated. Only for compatible issue from v2.0 or older version. 
+#[derive(Debug, Default)] +pub struct SizeProperties { + pub total_size: u64, + pub prop_indexes: PropIndexes, +} + +impl SizeProperties { + fn decode(props: &impl DecodeProperties) -> codec::Result { + Ok(SizeProperties { + total_size: props.decode_u64(PROP_TOTAL_SIZE)?, + prop_indexes: props.decode_indexes(PROP_SIZE_INDEX)?, + }) + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct RangeOffsets { + pub size: u64, + pub keys: u64, +} + +#[derive(Debug, Default)] +pub struct RangeProperties { + pub offsets: Vec<(Vec, RangeOffsets)>, +} + +impl RangeProperties { + pub fn get(&self, key: &[u8]) -> &RangeOffsets { + let idx = self + .offsets + .binary_search_by_key(&key, |&(ref k, _)| k) + .unwrap(); + &self.offsets[idx].1 + } + + fn encode(&self, props: &mut impl EncodeProperties) { + let mut buf = Vec::with_capacity(1024); + for (k, offsets) in &self.offsets { + buf.write_u64(k.len() as u64).unwrap(); + buf.extend(k); + buf.write_u64(offsets.size).unwrap(); + buf.write_u64(offsets.keys).unwrap(); + } + props.encode(PROP_RANGE_INDEX, &buf); + } + + pub(super) fn decode(props: &impl DecodeProperties) -> codec::Result { + match RangeProperties::decode_from_range_properties(props) { + Ok(res) => return Ok(res), + Err(e) => info!( + "decode to RangeProperties failed with err: {:?}, try to decode to SizeProperties, maybe upgrade from v2.0 or older version?", + e + ), + } + SizeProperties::decode(props).map(|res| res.into()) + } + + fn decode_from_range_properties( + props: &impl DecodeProperties, + ) -> codec::Result { + let mut res = RangeProperties::default(); + let mut buf = props.decode(PROP_RANGE_INDEX)?; + while !buf.is_empty() { + let klen = buf.read_u64()?; + let mut k = vec![0; klen as usize]; + buf.read_exact(&mut k)?; + let offsets = RangeOffsets { + size: buf.read_u64()?, + keys: buf.read_u64()?, + }; + res.offsets.push((k, offsets)); + } + Ok(res) + } + + pub fn get_approximate_size_in_range(&self, start: &[u8], end: &[u8]) -> u64 { + 
self.get_approximate_distance_in_range(start, end).0 + } + + pub fn get_approximate_keys_in_range(&self, start: &[u8], end: &[u8]) -> u64 { + self.get_approximate_distance_in_range(start, end).1 + } + + /// Returns `size` and `keys`. + pub fn get_approximate_distance_in_range(&self, start: &[u8], end: &[u8]) -> (u64, u64) { + assert!(start <= end); + if start == end { + return (0, 0); + } + let start_offset = match self.offsets.binary_search_by_key(&start, |&(ref k, _)| k) { + Ok(idx) => Some(idx), + Err(next_idx) => next_idx.checked_sub(1), + }; + let end_offset = match self.offsets.binary_search_by_key(&end, |&(ref k, _)| k) { + Ok(idx) => Some(idx), + Err(next_idx) => next_idx.checked_sub(1), + }; + let start = start_offset.map_or_else(|| Default::default(), |x| self.offsets[x].1); + let end = end_offset.map_or_else(|| Default::default(), |x| self.offsets[x].1); + assert!(end.size >= start.size && end.keys >= start.keys); + (end.size - start.size, end.keys - start.keys) + } + + // equivalent to range(Excluded(start_key), Excluded(end_key)) + pub fn take_excluded_range( + mut self, + start_key: &[u8], + end_key: &[u8], + ) -> Vec<(Vec, RangeOffsets)> { + let start_offset = match self + .offsets + .binary_search_by_key(&start_key, |&(ref k, _)| k) + { + Ok(idx) => { + if idx == self.offsets.len() - 1 { + return vec![]; + } else { + idx + 1 + } + } + Err(next_idx) => next_idx, + }; + + let end_offset = match self.offsets.binary_search_by_key(&end_key, |&(ref k, _)| k) { + Ok(idx) => { + if idx == 0 { + return vec![]; + } else { + idx - 1 + } + } + Err(next_idx) => { + if next_idx == 0 { + return vec![]; + } else { + next_idx - 1 + } + } + }; + + if start_offset > end_offset { + return vec![]; + } + + self.offsets.drain(start_offset..=end_offset).collect() + } + + pub fn smallest_key(&self) -> Option> { + self.offsets.first().map(|(k, _)| k.to_owned()) + } + + pub fn largest_key(&self) -> Option> { + self.offsets.last().map(|(k, _)| k.to_owned()) + } +} + +impl From 
for RangeProperties { + fn from(p: SizeProperties) -> RangeProperties { + let mut res = RangeProperties::default(); + for (key, size_index) in p.prop_indexes.into_map() { + let range = RangeOffsets { + // For SizeProperties, the offset is accumulation of the size. + size: size_index.offset, + ..Default::default() + }; + res.offsets.push((key, range)); + } + res + } +} + +fn range_properties_collector_name() -> &'static CStr { + CStr::from_bytes_with_nul(b"tikv.range-properties-collector\0").unwrap() +} + +pub struct RangePropertiesCollector { + props: RangeProperties, + last_offsets: RangeOffsets, + last_key: Vec, + cur_offsets: RangeOffsets, + prop_size_index_distance: u64, + prop_keys_index_distance: u64, +} + +impl Default for RangePropertiesCollector { + fn default() -> Self { + RangePropertiesCollector { + props: RangeProperties::default(), + last_offsets: RangeOffsets::default(), + last_key: vec![], + cur_offsets: RangeOffsets::default(), + prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, + prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, + } + } +} + +impl RangePropertiesCollector { + pub fn new(prop_size_index_distance: u64, prop_keys_index_distance: u64) -> Self { + RangePropertiesCollector { + prop_size_index_distance, + prop_keys_index_distance, + ..Default::default() + } + } + + #[inline] + fn size_in_last_range(&self) -> u64 { + self.cur_offsets.size - self.last_offsets.size + } + + #[inline] + fn keys_in_last_range(&self) -> u64 { + self.cur_offsets.keys - self.last_offsets.keys + } + + #[inline] + fn insert_new_point(&mut self, key: Vec) { + self.last_offsets = self.cur_offsets; + self.props.offsets.push((key, self.cur_offsets)); + } + + #[inline] + fn finish(&mut self, props: &mut impl EncodeProperties) { + if self.size_in_last_range() > 0 || self.keys_in_last_range() > 0 { + let key = self.last_key.clone(); + self.insert_new_point(key); + } + self.props.encode(props); + } +} + +impl TablePropertiesCollector for 
RangePropertiesCollector { + #[inline] + fn name(&self) -> &CStr { + range_properties_collector_name() + } + + #[inline] + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + // size + let entry_size = match entry_type { + EntryType::kEntryPut => value.len() as u64, + EntryType::kEntryBlobIndex => match TitanBlobIndex::decode(value) { + Ok(index) => index.blob_size + value.len() as u64, + // Perhaps should panic? + Err(_) => return Ok(()), + }, + _ => return Ok(()), + }; + self.cur_offsets.size += entry_size + key.len() as u64; + // keys + self.cur_offsets.keys += 1; + // Add the start key for convenience. + if self.last_key.is_empty() + || self.size_in_last_range() >= self.prop_size_index_distance + || self.keys_in_last_range() >= self.prop_keys_index_distance + { + self.insert_new_point(key.to_owned()); + } + self.last_key.clear(); + self.last_key.extend_from_slice(key); + Ok(()) + } + + #[inline] + fn finish(&mut self, prop: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(prop); + Ok(()) + } +} + +pub struct RangePropertiesCollectorFactory { + pub prop_size_index_distance: u64, + pub prop_keys_index_distance: u64, +} + +impl Default for RangePropertiesCollectorFactory { + #[inline] + fn default() -> Self { + RangePropertiesCollectorFactory { + prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, + prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, + } + } +} + +impl TablePropertiesCollectorFactory for RangePropertiesCollectorFactory { + type Collector = RangePropertiesCollector; + + #[inline] + fn name(&self) -> &CStr { + range_properties_collector_name() + } + + #[inline] + fn create_table_properties_collector(&self, _: Context) -> RangePropertiesCollector { + RangePropertiesCollector::new(self.prop_size_index_distance, self.prop_keys_index_distance) + } +} + +fn get_range_entries_and_versions( + engine: &crate::RocksEngine, + cf: &str, + 
start: &[u8], + end: &[u8], +) -> Option<(u64, u64)> { + let collection = match engine.properties_of_tables_in_range(cf, &[(start, end)]) { + Ok(v) => v, + Err(_) => return None, + }; + + if collection.is_empty() { + return None; + } + + // Aggregate total MVCC properties and total number entries. + let mut props = MvccProperties::new(); + let mut num_entries = 0; + for (_, v) in &*collection { + let mvcc = match decode_mvcc(v.user_collected_properties()) { + Ok(v) => v, + Err(_) => return None, + }; + num_entries += v.num_entries(); + props.add(&mvcc); + } + + Some((num_entries, props.num_versions)) +} + +impl engine_traits::RangePropertiesExt for RocksEngine { + fn get_range_approximate_keys(&self, range: Range<'_>, large_threshold: u64) -> Result { + // try to get from RangeProperties first. + match self.get_range_approximate_keys_cf(CF_WRITE, range, large_threshold) { + Ok(v) => { + return Ok(v); + } + Err(e) => debug!( + "failed to get keys from RangeProperties"; + "err" => ?e, + ), + } + + let start = &range.start_key; + let end = &range.end_key; + let (_, keys) = + get_range_entries_and_versions(self, CF_WRITE, start, end).unwrap_or_default(); + Ok(keys) + } + + fn get_range_approximate_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + let start_key = &range.start_key; + let end_key = &range.end_key; + let mut total_keys = 0; + let (mem_keys, _) = + self.approximate_memtable_stats(cfname, range.start_key, range.end_key)?; + total_keys += mem_keys; + + let collection = box_try!(self.range_properties(cfname, start_key, end_key)); + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + total_keys += props.get_approximate_keys_in_range(start_key, end_key); + } + + if large_threshold != 0 && total_keys > large_threshold { + let ssts = collection + .into_iter() + .map(|(k, v)| { + let props = RangeProperties::decode(v.user_collected_properties()).unwrap(); + let 
keys = props.get_approximate_keys_in_range(start_key, end_key); + let p = std::str::from_utf8(k).unwrap(); + format!( + "{}:{}", + Path::new(p) + .file_name() + .map(|f| f.to_str().unwrap()) + .unwrap_or(p), + keys + ) + }) + .collect::>() + .join(", "); + info!( + "range contains too many keys"; + "start" => log_wrappers::Value::key(range.start_key), + "end" => log_wrappers::Value::key(range.end_key), + "total_keys" => total_keys, + "memtable" => mem_keys, + "ssts_keys" => ssts, + "cf" => cfname, + ) + } + Ok(total_keys) + } + + fn get_range_approximate_size(&self, range: Range<'_>, large_threshold: u64) -> Result { + let mut size = 0; + for cf in LARGE_CFS { + size += self + .get_range_approximate_size_cf(cf, range, large_threshold) + // CF_LOCK doesn't have RangeProperties until v4.0, so we swallow the error for + // backward compatibility. + .or_else(|e| if cf == &CF_LOCK { Ok(0) } else { Err(e) })?; + } + Ok(size) + } + + fn get_range_approximate_size_cf( + &self, + cf: &str, + range: Range<'_>, + large_threshold: u64, + ) -> Result { + let start_key = &range.start_key; + let end_key = &range.end_key; + let mut total_size = 0; + let (_, mem_size) = self.approximate_memtable_stats(cf, range.start_key, range.end_key)?; + total_size += mem_size; + + let collection = box_try!(self.range_properties(cf, start_key, end_key)); + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + total_size += props.get_approximate_size_in_range(start_key, end_key); + } + + if large_threshold != 0 && total_size > large_threshold { + let ssts = collection + .into_iter() + .map(|(k, v)| { + let props = RangeProperties::decode(v.user_collected_properties()).unwrap(); + let size = props.get_approximate_size_in_range(start_key, end_key); + let p = std::str::from_utf8(k).unwrap(); + format!( + "{}:{}", + Path::new(p) + .file_name() + .map(|f| f.to_str().unwrap()) + .unwrap_or(p), + size + ) + }) + .collect::>() + .join(", "); + 
info!( + "range size is too large"; + "start" => log_wrappers::Value::key(range.start_key), + "end" => log_wrappers::Value::key(range.end_key), + "total_size" => total_size, + "memtable" => mem_size, + "ssts_size" => ssts, + "cf" => cf, + ) + } + Ok(total_size) + } + + fn get_range_approximate_split_keys( + &self, + range: Range<'_>, + key_count: usize, + ) -> Result>> { + let get_cf_size = |cf: &str| self.get_range_approximate_size_cf(cf, range, 0); + let cfs = [ + (CF_DEFAULT, box_try!(get_cf_size(CF_DEFAULT))), + (CF_WRITE, box_try!(get_cf_size(CF_WRITE))), + // CF_LOCK doesn't have RangeProperties until v4.0, so we swallow the error for + // backward compatibility. + (CF_LOCK, get_cf_size(CF_LOCK).unwrap_or(0)), + ]; + + let total_size: u64 = cfs.iter().map(|(_, s)| s).sum(); + if total_size == 0 { + return Err(box_err!("all CFs are empty")); + } + + let (cf, _) = cfs.iter().max_by_key(|(_, s)| s).unwrap(); + + self.get_range_approximate_split_keys_cf(cf, range, key_count) + } + + fn get_range_approximate_split_keys_cf( + &self, + cfname: &str, + range: Range<'_>, + key_count: usize, + ) -> Result>> { + let start_key = &range.start_key; + let end_key = &range.end_key; + let collection = box_try!(self.range_properties(cfname, start_key, end_key)); + + let mut keys = vec![]; + for (_, v) in &*collection { + let props = box_try!(RangeProperties::decode(v.user_collected_properties())); + keys.extend( + props + .take_excluded_range(start_key, end_key) + .into_iter() + .map(|(k, _)| k), + ); + } + + if keys.is_empty() { + return Ok(vec![]); + } + + const SAMPLING_THRESHOLD: usize = 20000; + const SAMPLE_RATIO: usize = 1000; + // If there are too many keys, reduce its amount before sorting, or it may take + // too much time to sort the keys. + if keys.len() > SAMPLING_THRESHOLD { + let len = keys.len(); + keys = keys.into_iter().step_by(len / SAMPLE_RATIO).collect(); + } + keys.sort(); + + // If the keys are too few, return them directly. 
+ if keys.len() <= key_count { + return Ok(keys); + } + + // Find `key_count` keys which divides the whole range into `parts` parts + // evenly. + let mut res = Vec::with_capacity(key_count); + let section_len = (keys.len() as f64) / ((key_count + 1) as f64); + for i in 1..=key_count { + res.push(keys[(section_len * (i as f64)) as usize].clone()) + } + res.dedup(); + Ok(res) + } +} + +#[cfg(test)] +mod tests { + use collections::HashMap; + use engine_traits::{SyncMutable, CF_WRITE, LARGE_CFS}; + use rand::Rng; + use tempfile::Builder; + use tirocks::properties::table::user::SysTablePropertiesCollectorFactory; + use txn_types::Key; + + use super::*; + use crate::{ + cf_options::RocksCfOptions, db_options::RocksDbOptions, + properties::mvcc::MvccPropertiesCollectorFactory, + }; + + #[allow(clippy::many_single_char_names)] + #[test] + fn test_range_properties() { + let cases = [ + ("a", 0, 1), + // handle "a": size(size = 1, offset = 1),keys(1,1) + ("b", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + ("c", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + ("d", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("e", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + // handle "e": size(size = DISTANCE + 4, offset = DISTANCE + 5),keys(4,5) + ("f", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8, 1), + ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4, 1), + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) + ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2, 1), + // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) + ("l", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), + ("m", 0, DEFAULT_PROP_KEYS_INDEX_DISTANCE / 2), + // handle "m": keys = DEFAULT_PROP_KEYS_INDEX_DISTANCE,offset = + // 11+DEFAULT_PROP_KEYS_INDEX_DISTANCE + ("n", 1, DEFAULT_PROP_KEYS_INDEX_DISTANCE), + // handle "n": keys = 
DEFAULT_PROP_KEYS_INDEX_DISTANCE, offset = + // 11+2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + ("o", 1, 1), + // handle "o": keys = 1, offset = 12 + 2*DEFAULT_PROP_KEYS_INDEX_DISTANCE + ]; + + let mut collector = RangePropertiesCollector::default(); + for &(k, vlen, count) in &cases { + let v = vec![0; vlen as usize]; + for _ in 0..count { + collector + .add(k.as_bytes(), &v, EntryType::kEntryPut, 0, 0) + .unwrap(); + } + } + for &(k, vlen, _) in &cases { + let v = vec![0; vlen as usize]; + collector + .add(k.as_bytes(), &v, EntryType::kEntryOther, 0, 0) + .unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = RangeProperties::decode(&result).unwrap(); + assert_eq!(props.smallest_key().unwrap(), cases[0].0.as_bytes()); + assert_eq!( + props.largest_key().unwrap(), + cases[cases.len() - 1].0.as_bytes() + ); + assert_eq!( + props.get_approximate_size_in_range(b"", b"k"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11 + ); + assert_eq!(props.get_approximate_keys_in_range(b"", b"k"), 11_u64); + + assert_eq!(props.offsets.len(), 7); + let a = props.get(b"a"); + assert_eq!(a.size, 1); + let e = props.get(b"e"); + assert_eq!(e.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE + 5); + let i = props.get(b"i"); + assert_eq!(i.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 17 + 9); + let k = props.get(b"k"); + assert_eq!(k.size, DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11); + let m = props.get(b"m"); + assert_eq!(m.keys, 11 + DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let n = props.get(b"n"); + assert_eq!(n.keys, 11 + 2 * DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let o = props.get(b"o"); + assert_eq!(o.keys, 12 + 2 * DEFAULT_PROP_KEYS_INDEX_DISTANCE); + let empty = RangeOffsets::default(); + let cases = [ + (" ", "k", k, &empty, 3), + (" ", " ", &empty, &empty, 0), + ("k", "k", k, k, 0), + ("a", "k", k, a, 2), + ("a", "i", i, a, 1), + ("e", "h", e, e, 0), + ("b", "h", e, a, 1), + ("g", "g", i, i, 0), + ]; + for &(start, end, end_idx, start_idx, count) in 
&cases { + let props = RangeProperties::decode(&result).unwrap(); + let size = end_idx.size - start_idx.size; + assert_eq!( + props.get_approximate_size_in_range(start.as_bytes(), end.as_bytes()), + size + ); + let keys = end_idx.keys - start_idx.keys; + assert_eq!( + props.get_approximate_keys_in_range(start.as_bytes(), end.as_bytes()), + keys + ); + assert_eq!( + props + .take_excluded_range(start.as_bytes(), end.as_bytes()) + .len(), + count + ); + } + } + + #[test] + fn test_range_properties_with_blob_index() { + let cases = [ + ("a", 0), + // handle "a": size(size = 1, offset = 1),keys(1,1) + ("b", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + ("c", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + ("d", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("e", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + // handle "e": size(size = DISTANCE + 4, offset = DISTANCE + 5),keys(4,5) + ("f", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + ("g", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("h", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8), + ("i", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 4), + // handle "i": size(size = DISTANCE / 8 * 9 + 4, offset = DISTANCE / 8 * 17 + + // 9),keys(4,5) + ("j", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + ("k", DEFAULT_PROP_SIZE_INDEX_DISTANCE / 2), + // handle "k": size(size = DISTANCE + 2, offset = DISTANCE / 8 * 25 + 11),keys(2,11) + ]; + + let handles = ["a", "e", "i", "k"]; + + let mut rng = rand::thread_rng(); + let mut collector = RangePropertiesCollector::default(); + let mut extra_value_size: u64 = 0; + for &(k, vlen) in &cases { + if handles.contains(&k) || rng.gen_range(0..2) == 0 { + let v = vec![0; vlen as usize - extra_value_size as usize]; + extra_value_size = 0; + collector + .add(k.as_bytes(), &v, EntryType::kEntryPut, 0, 0) + .unwrap(); + } else { + let blob_index = TitanBlobIndex::new(0, vlen - extra_value_size, 0); + let v = blob_index.encode(); + extra_value_size = v.len() as u64; + collector + .add(k.as_bytes(), &v, EntryType::kEntryBlobIndex, 0, 0) + .unwrap(); + } 
+ } + let mut result = HashMap::default(); + collector.finish(&mut result); + + let props = RangeProperties::decode(&result).unwrap(); + assert_eq!(props.smallest_key().unwrap(), cases[0].0.as_bytes()); + assert_eq!( + props.largest_key().unwrap(), + cases[cases.len() - 1].0.as_bytes() + ); + assert_eq!( + props.get_approximate_size_in_range(b"e", b"i"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 9 + 4 + ); + assert_eq!( + props.get_approximate_size_in_range(b"", b"k"), + DEFAULT_PROP_SIZE_INDEX_DISTANCE / 8 * 25 + 11 + ); + } + + #[test] + fn test_get_range_entries_and_versions() { + let path = Builder::new() + .prefix("_test_get_range_entries_and_versions") + .tempdir() + .unwrap(); + let db_opts = RocksDbOptions::default(); + let cfs_opts = LARGE_CFS + .iter() + .map(|cf| { + let mut cf_opts = RocksCfOptions::default(); + cf_opts + .set_level0_file_num_compaction_trigger(10) + .add_table_properties_collector_factory( + &SysTablePropertiesCollectorFactory::new( + MvccPropertiesCollectorFactory::default(), + ), + ); + (*cf, cf_opts) + }) + .collect(); + let db = crate::util::new_engine_opt(path.path(), db_opts, cfs_opts).unwrap(); + + let cases = ["a", "b", "c"]; + for &key in &cases { + let k1 = keys::data_key( + Key::from_raw(key.as_bytes()) + .append_ts(2.into()) + .as_encoded(), + ); + db.put_cf(CF_WRITE, &k1, b"v1").unwrap(); + db.delete_cf(CF_WRITE, &k1).unwrap(); + let key = keys::data_key( + Key::from_raw(key.as_bytes()) + .append_ts(3.into()) + .as_encoded(), + ); + db.put_cf(CF_WRITE, &key, b"v2").unwrap(); + db.flush(CF_WRITE, true).unwrap(); + } + + let start_keys = keys::data_key(&[]); + let end_keys = keys::data_end_key(&[]); + let (entries, versions) = + get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); + assert_eq!(entries, (cases.len() * 2) as u64); + assert_eq!(versions, cases.len() as u64); + } +} diff --git a/components/engine_tirocks/src/properties/table.rs b/components/engine_tirocks/src/properties/table.rs new 
file mode 100644 index 00000000000..84998bbeb88 --- /dev/null +++ b/components/engine_tirocks/src/properties/table.rs @@ -0,0 +1,96 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::mem; + +use engine_traits::{Range, Result}; +use tirocks::properties::table::{ + builtin::OwnedTablePropertiesCollection, user::UserCollectedProperties, +}; + +use super::range::RangeProperties; +use crate::{r2e, RocksEngine}; + +#[repr(transparent)] +pub struct RocksUserCollectedProperties(UserCollectedProperties); + +impl RocksUserCollectedProperties { + #[inline] + fn from_rocks(v: &UserCollectedProperties) -> &Self { + unsafe { mem::transmute(v) } + } +} + +impl engine_traits::UserCollectedProperties for RocksUserCollectedProperties { + #[inline] + fn get(&self, index: &[u8]) -> Option<&[u8]> { + self.0.get(index) + } + + #[inline] + fn approximate_size_and_keys(&self, start: &[u8], end: &[u8]) -> Option<(usize, usize)> { + let rp = RangeProperties::decode(&self.0).ok()?; + let x = rp.get_approximate_distance_in_range(start, end); + Some((x.0 as usize, x.1 as usize)) + } +} + +#[repr(transparent)] +pub struct RocksTablePropertiesCollection(OwnedTablePropertiesCollection); + +impl engine_traits::TablePropertiesCollection for RocksTablePropertiesCollection { + type UserCollectedProperties = RocksUserCollectedProperties; + + #[inline] + fn iter_user_collected_properties(&self, mut f: F) + where + F: FnMut(&Self::UserCollectedProperties) -> bool, + { + for (_, props) in &*self.0 { + let props = props.user_collected_properties(); + if !f(RocksUserCollectedProperties::from_rocks(props)) { + break; + } + } + } +} + +impl engine_traits::TablePropertiesExt for RocksEngine { + type TablePropertiesCollection = RocksTablePropertiesCollection; + + fn table_properties_collection( + &self, + cf: &str, + ranges: &[Range<'_>], + ) -> Result { + // FIXME: extra allocation + let ranges: Vec<_> = ranges.iter().map(|r| (r.start_key, r.end_key)).collect(); + let 
collection = self.properties_of_tables_in_range(cf, &ranges)?; + Ok(RocksTablePropertiesCollection(collection)) + } +} + +impl RocksEngine { + #[inline] + pub(crate) fn properties_of_tables_in_range( + &self, + cf: &str, + ranges: &[(&[u8], &[u8])], + ) -> Result { + let handle = self.cf(cf)?; + let mut c = OwnedTablePropertiesCollection::default(); + self.as_inner() + .properties_of_tables_in_range(handle, ranges, &mut c) + .map_err(r2e)?; + Ok(c) + } + + #[inline] + pub fn range_properties( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> Result { + self.properties_of_tables_in_range(cf, &[(start_key, end_key)]) + } +} diff --git a/components/engine_tirocks/src/properties/ttl.rs b/components/engine_tirocks/src/properties/ttl.rs new file mode 100644 index 00000000000..c4190fe59bd --- /dev/null +++ b/components/engine_tirocks/src/properties/ttl.rs @@ -0,0 +1,225 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ffi::CStr, marker::PhantomData}; + +use api_version::{KeyMode, KvFormat, RawValue}; +use engine_traits::{Result, TtlProperties, TtlPropertiesExt}; +use tikv_util::error; +use tirocks::properties::table::user::{ + Context, EntryType, SequenceNumber, TablePropertiesCollector, TablePropertiesCollectorFactory, + UserCollectedProperties, +}; + +use super::{DecodeProperties, EncodeProperties}; +use crate::RocksEngine; + +const PROP_MAX_EXPIRE_TS: &str = "tikv.max_expire_ts"; +const PROP_MIN_EXPIRE_TS: &str = "tikv.min_expire_ts"; + +fn encode_ttl(ttl_props: &TtlProperties, props: &mut impl EncodeProperties) { + props.encode_u64(PROP_MAX_EXPIRE_TS, ttl_props.max_expire_ts); + props.encode_u64(PROP_MIN_EXPIRE_TS, ttl_props.min_expire_ts); +} + +pub(super) fn decode_ttl(props: &impl DecodeProperties) -> codec::Result { + let res = TtlProperties { + max_expire_ts: props.decode_u64(PROP_MAX_EXPIRE_TS)?, + min_expire_ts: props.decode_u64(PROP_MIN_EXPIRE_TS)?, + }; + Ok(res) +} + +impl TtlPropertiesExt for RocksEngine { 
+ fn get_range_ttl_properties_cf( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> Result> { + let collection = self.properties_of_tables_in_range(cf, &[(start_key, end_key)])?; + if collection.is_empty() { + return Ok(vec![]); + } + + let mut res = Vec::new(); + for (file_name, v) in &*collection { + let prop = match decode_ttl(v.user_collected_properties()) { + Ok(v) => v, + Err(_) => continue, + }; + res.push((std::str::from_utf8(file_name).unwrap().to_string(), prop)); + } + Ok(res) + } +} + +/// Can only be used for default CF. +pub struct TtlPropertiesCollector { + prop: TtlProperties, + _phantom: PhantomData, +} + +impl TtlPropertiesCollector { + fn finish(&mut self, properties: &mut impl EncodeProperties) { + if self.prop.max_expire_ts == 0 && self.prop.min_expire_ts == 0 { + return; + } + encode_ttl(&self.prop, properties); + } +} + +impl TablePropertiesCollector for TtlPropertiesCollector { + fn name(&self) -> &CStr { + ttl_properties_collector_name() + } + + fn add( + &mut self, + key: &[u8], + value: &[u8], + entry_type: EntryType, + _: SequenceNumber, + _: u64, + ) -> tirocks::Result<()> { + if entry_type != EntryType::kEntryPut { + return Ok(()); + } + // Only consider data keys. + if !key.starts_with(keys::DATA_PREFIX_KEY) { + return Ok(()); + } + // Only consider raw keys. + if F::parse_key_mode(&key[keys::DATA_PREFIX_KEY.len()..]) != KeyMode::Raw { + return Ok(()); + } + + match F::decode_raw_value(value) { + Ok(RawValue { + expire_ts: Some(expire_ts), + .. 
+ }) => { + self.prop.max_expire_ts = std::cmp::max(self.prop.max_expire_ts, expire_ts); + if self.prop.min_expire_ts == 0 { + self.prop.min_expire_ts = expire_ts; + } else { + self.prop.min_expire_ts = std::cmp::min(self.prop.min_expire_ts, expire_ts); + } + } + Err(err) => { + error!( + "failed to get expire ts"; + "key" => log_wrappers::Value::key(key), + "value" => log_wrappers::Value::value(value), + "err" => %err, + ); + } + _ => {} + } + Ok(()) + } + + fn finish(&mut self, properties: &mut UserCollectedProperties) -> tirocks::Result<()> { + self.finish(properties); + Ok(()) + } +} + +fn ttl_properties_collector_name() -> &'static CStr { + CStr::from_bytes_with_nul(b"tikv.ttl-properties-collector\0").unwrap() +} + +#[derive(Default)] +pub struct TtlPropertiesCollectorFactory { + _phantom: PhantomData, +} + +impl TablePropertiesCollectorFactory for TtlPropertiesCollectorFactory { + type Collector = TtlPropertiesCollector; + + fn name(&self) -> &CStr { + ttl_properties_collector_name() + } + + fn create_table_properties_collector(&self, _: Context) -> TtlPropertiesCollector { + TtlPropertiesCollector { + prop: Default::default(), + _phantom: PhantomData, + } + } +} + +#[cfg(test)] +mod tests { + use api_version::test_kv_format_impl; + use collections::HashMap; + use kvproto::kvrpcpb::ApiVersion; + use tikv_util::time::UnixSecs; + + use super::*; + + #[test] + fn test_ttl_properties() { + test_kv_format_impl!(test_ttl_properties_impl); + } + + fn test_ttl_properties_impl() { + let get_properties = |case: &[(&'static str, u64)]| -> codec::Result { + let mut collector = TtlPropertiesCollector:: { + prop: Default::default(), + _phantom: PhantomData, + }; + for &(k, ts) in case { + let v = RawValue { + user_value: &[0; 10][..], + expire_ts: Some(ts), + is_delete: false, + }; + collector + .add( + k.as_bytes(), + &F::encode_raw_value(v), + EntryType::kEntryPut, + 0, + 0, + ) + .unwrap(); + } + for &(k, _) in case { + let v = vec![0; 10]; + collector + 
.add(k.as_bytes(), &v, EntryType::kEntryOther, 0, 0) + .unwrap(); + } + let mut result = HashMap::default(); + collector.finish(&mut result); + decode_ttl(&result) + }; + + let case1 = [ + ("zr\0a", 0), + ("zr\0b", UnixSecs::now().into_inner()), + ("zr\0c", 1), + ("zr\0d", u64::MAX), + ("zr\0e", 0), + ]; + let props = get_properties(&case1).unwrap(); + assert_eq!(props.max_expire_ts, u64::MAX); + match F::TAG { + ApiVersion::V1 => unreachable!(), + ApiVersion::V1ttl => assert_eq!(props.min_expire_ts, 1), + // expire_ts = 0 is no longer a special case in API V2 + ApiVersion::V2 => assert_eq!(props.min_expire_ts, 0), + } + + let case2 = [("zr\0a", 0)]; + get_properties(&case2).unwrap_err(); + + let case3 = []; + get_properties(&case3).unwrap_err(); + + let case4 = [("zr\0a", 1)]; + let props = get_properties(&case4).unwrap(); + assert_eq!(props.max_expire_ts, 1); + assert_eq!(props.min_expire_ts, 1); + } +} From 6d71c2bf9101db808d63eb1a3ae94d1194e208ac Mon Sep 17 00:00:00 2001 From: Ping Yu Date: Tue, 11 Oct 2022 11:39:50 +0800 Subject: [PATCH 259/676] rawkv: Fix unstable test-region-merge (#13580) close tikv/tikv#13582 Fix unstable test `test_rawkv::test_region_merge`. 
Signed-off-by: pingyu --- components/causal_ts/src/tso.rs | 3 --- tests/failpoints/cases/test_rawkv.rs | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 5a9d119f6d5..ad9f3ec1fc6 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -622,9 +622,6 @@ impl CausalTsProvider for BatchTsoProvider { } async fn async_flush(&self) -> Result { - fail::fail_point!("causal_ts_provider_flush", |_| Err(box_err!( - "async_flush err(failpoints)" - ))); self.renew_tso_batch(true, TsoBatchRenewReason::flush) .await?; // TODO: Return the first tso by renew_tso_batch instead of async_get_ts diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index fd56bd87992..547b6144c7c 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -141,7 +141,7 @@ impl TestSuite { } } -const FP_CAUSAL_TS_PROVIDER_FLUSH: &str = "causal_ts_provider_flush"; +const FP_GET_TSO: &str = "test_raftstore_get_tso"; /// Verify correctness on leader transfer. // TODO: simulate and test for the scenario of issue #12498. @@ -164,8 +164,8 @@ fn test_leader_transfer() { assert_eq!(suite.must_raw_get(key1), Some(b"v4".to_vec())); } - // Disable CausalObserver::flush_timestamp to produce causality issue. - fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "return").unwrap(); + // Make causal_ts_provider.async_flush() & handle_update_max_timestamp fail. + fail::cfg(FP_GET_TSO, "return(50)").unwrap(); // Transfer leader and write to store 2. { @@ -182,8 +182,8 @@ fn test_leader_transfer() { // Transfer leader back. suite.must_transfer_leader(®ion, 1); suite.must_leader_on_store(key1, 1); - // Enable CausalObserver::flush_timestamp. - fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "off").unwrap(); + // Make handle_update_max_timestamp succeed. 
+ fail::cfg(FP_GET_TSO, "off").unwrap(); // Transfer leader and write to store 2 again. { suite.must_transfer_leader(®ion, 2); @@ -195,7 +195,7 @@ fn test_leader_transfer() { assert_eq!(suite.must_raw_get(key1), Some(b"v8".to_vec())); } - fail::remove(FP_CAUSAL_TS_PROVIDER_FLUSH); + fail::remove(FP_GET_TSO); suite.stop(); } @@ -238,8 +238,8 @@ fn test_region_merge() { assert_eq!(suite.must_raw_get(keys[1]), Some(b"v4".to_vec())); } - // Disable CausalObserver::flush_timestamp to produce causality issue. - fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "return").unwrap(); + // Make causal_ts_provider.async_flush() & handle_update_max_timestamp fail. + fail::cfg(FP_GET_TSO, "return(50)").unwrap(); // Merge region 1 to 3. { @@ -253,8 +253,8 @@ fn test_region_merge() { assert_eq!(suite.must_raw_get(keys[1]), Some(b"v4".to_vec())); } - // Enable CausalObserver::flush_timestamp. - fail::cfg(FP_CAUSAL_TS_PROVIDER_FLUSH, "off").unwrap(); + // Make handle_update_max_timestamp succeed. + fail::cfg(FP_GET_TSO, "off").unwrap(); // Merge region 3 to 5. 
{ @@ -268,6 +268,6 @@ fn test_region_merge() { assert_eq!(suite.must_raw_get(keys[1]), Some(b"v8".to_vec())); } - fail::remove(FP_CAUSAL_TS_PROVIDER_FLUSH); + fail::remove(FP_GET_TSO); suite.stop(); } From 94c8aa5c3c5335ea99281bea7c6c79f61cb201de Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 11 Oct 2022 16:47:50 +0800 Subject: [PATCH 260/676] raftstore-v2: implement local read for raftstore-v2 (#13375) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 32 +- components/raftstore-v2/src/bootstrap.rs | 2 +- components/raftstore-v2/src/fsm/peer.rs | 3 +- components/raftstore-v2/src/fsm/store.rs | 1 - .../src/operation/command/write/mod.rs | 1 - .../operation/command/write/simple_write.rs | 3 - components/raftstore-v2/src/operation/life.rs | 2 +- components/raftstore-v2/src/operation/mod.rs | 2 + .../raftstore-v2/src/operation/query/lease.rs | 27 +- .../raftstore-v2/src/operation/query/local.rs | 641 ++++++++++++++++-- .../raftstore-v2/src/operation/query/mod.rs | 4 +- .../src/operation/query/replica.rs | 14 +- .../src/operation/ready/async_writer.rs | 10 +- .../raftstore-v2/src/operation/ready/mod.rs | 7 +- components/raftstore-v2/src/raft/peer.rs | 60 +- components/raftstore-v2/src/raft/storage.rs | 4 +- components/raftstore-v2/src/router/imp.rs | 75 +- components/raftstore-v2/src/router/message.rs | 10 +- components/raftstore-v2/src/router/mod.rs | 1 + .../src/router/response_channel.rs | 25 +- .../tests/integrations/cluster.rs | 24 +- .../tests/integrations/test_read.rs | 43 ++ components/raftstore/src/router.rs | 14 +- components/raftstore/src/store/mod.rs | 9 +- components/raftstore/src/store/peer.rs | 8 +- components/raftstore/src/store/worker/mod.rs | 6 +- components/raftstore/src/store/worker/read.rs | 512 +++++++------- 27 files changed, 1120 insertions(+), 420 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs 
b/components/raftstore-v2/src/batch/store.rs index bd777477bf0..b387300b40e 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,10 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - cell::Cell, - mem, ops::{Deref, DerefMut}, - sync::{atomic::AtomicUsize, Arc, Mutex}, + sync::{Arc, Mutex}, time::Duration, }; @@ -23,7 +21,7 @@ use kvproto::{ use raft::INVALID_ID; use raftstore::store::{ fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, RaftlogFetchRunner, - RaftlogFetchTask, StoreWriters, Transport, WriteMsg, WriteSenders, + RaftlogFetchTask, StoreWriters, Transport, WriteSenders, }; use slog::Logger; use tikv_util::{ @@ -42,8 +40,8 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, - raft::{Peer, Storage}, - router::{PeerMsg, PeerTick, QueryResChannel, StoreMsg}, + raft::Storage, + router::{PeerMsg, PeerTick, StoreMsg}, Error, Result, }; @@ -390,7 +388,7 @@ impl StoreSystem { log_fetch_scheduler, &mut workers.store_writers, self.logger.clone(), - store_meta, + store_meta.clone(), ); self.workers = Some(workers); let peers = builder.init()?; @@ -401,12 +399,20 @@ impl StoreSystem { let mut mailboxes = Vec::with_capacity(peers.len()); let mut address = Vec::with_capacity(peers.len()); - for (region_id, (tx, fsm)) in peers { - address.push(region_id); - mailboxes.push(( - region_id, - BasicMailbox::new(tx, fsm, router.state_cnt().clone()), - )); + { + let mut meta = store_meta.as_ref().lock().unwrap(); + for (region_id, (tx, fsm)) in peers { + meta.readers + .insert(region_id, fsm.peer().generate_read_delegate()); + meta.tablet_caches + .insert(region_id, fsm.peer().tablet().clone()); + + address.push(region_id); + mailboxes.push(( + region_id, + BasicMailbox::new(tx, fsm, router.state_cnt().clone()), + )); + } } router.register_all(mailboxes); diff --git 
a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs index c3e2d2de6f7..6700db4d45f 100644 --- a/components/raftstore-v2/src/bootstrap.rs +++ b/components/raftstore-v2/src/bootstrap.rs @@ -7,7 +7,7 @@ use error_code::ErrorCodeExt; use fail::fail_point; use kvproto::{ metapb::{Region, Store}, - raft_serverpb::{RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{RaftLocalState, StoreIdent}, }; use pd_client::PdClient; use raft::INVALID_ID; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index cd8775359fc..389e59f0ee4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,12 +7,11 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletFactory}; -use kvproto::metapb; use raftstore::store::{Config, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, - mpsc::{self, LooseBoundedSender, Receiver, Sender}, + mpsc::{self, LooseBoundedSender, Receiver}, time::{duration_to_sec, Instant}, yatp_pool::FuturePool, }; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index d55d132679f..3be571bdfbc 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -4,7 +4,6 @@ use std::time::SystemTime; use batch_system::Fsm; use collections::HashMap; -use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine}; use raftstore::store::{Config, ReadDelegate}; use slog::{o, Logger}; diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 76692b6af0a..798e1b45631 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -11,7 +11,6 @@ use 
raftstore::{ }, Result, }; -use tikv_util::Either; use crate::{ batch::StoreContext, diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 82628a40385..46544be1a32 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -1,11 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::borrow::Cow; - use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; use protobuf::{CodedInputStream, Message, SingularPtrField}; -use tikv_util::Either; use crate::router::CmdResChannel; diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 2cf7594b9a7..678cf6ece4b 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -25,7 +25,7 @@ use tikv_util::store::find_peer; use crate::{ batch::StoreContext, - fsm::{PeerFsm, Store, StoreFsmDelegate}, + fsm::{PeerFsm, Store}, raft::{Peer, Storage}, router::PeerMsg, }; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index ebef0cf0595..7b31473f784 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -8,3 +8,5 @@ mod ready; pub use command::{CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; pub use life::DestroyProgress; pub use ready::AsyncWriter; + +pub(crate) use self::query::LocalReader; diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index fe25fdab454..00a485c8460 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -3,24 +3,14 @@ use std::sync::{Arc, 
Mutex}; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{ - kvrpcpb::ExtraOp as TxnExtraOp, - raft_cmdpb::{self, RaftCmdRequest, RaftCmdResponse}, +use kvproto::raft_cmdpb::RaftCmdRequest; +use raftstore::store::{ + can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, + ReadIndexRequest, ReadProgress, TrackVer, Transport, }; -use raftstore::{ - store::{ - can_amend_read, cmd_resp, - fsm::{apply::notify_stale_req, Proposal}, - metrics::RAFT_READ_INDEX_PENDING_COUNT, - msg::{ErrorCallback, ReadCallback}, - propose_read_index, should_renew_lease, - util::{check_region_epoch, LeaseState}, - ReadDelegate, ReadIndexRequest, ReadProgress, TrackVer, Transport, - }, - Error, -}; -use slog::{debug, error, info, o, Logger}; -use tikv_util::{box_err, time::monotonic_raw_now, Either}; +use slog::debug; +use tikv_util::time::monotonic_raw_now; use time::Timespec; use tracker::GLOBAL_TRACKERS; @@ -28,8 +18,7 @@ use crate::{ batch::StoreContext, fsm::StoreMeta, raft::Peer, - router::{CmdResChannel, QueryResChannel, QueryResult, ReadResponse}, - Result, + router::{QueryResChannel, QueryResult, ReadResponse}, }; impl Peer { diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index a0535643eb3..bdf829dc4f5 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -2,44 +2,251 @@ // #[PerformanceCriticalPath] use std::{ - cell::Cell, - collections::HashMap, - fmt::{self, Display, Formatter}, - marker::PhantomData, ops::Deref, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, Mutex, - }, - time::Duration, + sync::{atomic, Arc, Mutex}, }; -use crossbeam::atomic::AtomicCell; -use engine_traits::{KvEngine, RaftEngine, Snapshot, TabletFactory}; -use fail::fail_point; +use batch_system::Router; +use 
crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, ReadIndexResponse, Request, Response}, + errorpb, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, }; -use pd_client::BucketMeta; use raftstore::{ + errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, - util::{self, LeaseState, RegionReadProgress, RemoteLease}, - ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, ReadResponse, - RegionSnapshot, RequestInspector, RequestPolicy, + cmd_resp, util::LeaseState, LocalReaderCore, ReadDelegate, ReadExecutor, + ReadExecutorProvider, RegionSnapshot, RequestInspector, RequestPolicy, + TLS_LOCAL_READ_METRICS, }, Error, Result, }; -use slog::{debug, error, info, o, warn, Logger}; +use slog::{debug, Logger}; use tikv_util::{ + box_err, codec::number::decode_u64, - lru::LruCache, - time::{monotonic_raw_now, Instant, ThreadReadId}, + time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; +use txn_types::WriteBatchFlags; + +use crate::{ + fsm::StoreMeta, + router::{PeerMsg, QueryResult}, + tablet::CachedTablet, + StoreRouter, +}; + +pub trait MsgRouter: Send { + fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError>; +} + +impl MsgRouter for StoreRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError> { + Router::send(self, addr, msg) + } +} + +#[derive(Clone)] +pub struct LocalReader +where + E: KvEngine, + C: MsgRouter, +{ + local_reader: LocalReaderCore, StoreMetaDelegate>, + router: C, + + logger: Logger, +} + +impl LocalReader +where + E: KvEngine, + C: MsgRouter, +{ + pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { + Self { + local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), + router, + logger, + } + } + + pub fn store_meta(&self) -> &Arc>> { + self.local_reader.store_meta() + } + + pub fn 
pre_propose_raft_command( + &mut self, + req: &RaftCmdRequest, + ) -> Result, RequestPolicy)>> { + if let Some(delegate) = self.local_reader.validate_request(req)? { + let mut inspector = SnapRequestInspector { + delegate: &delegate, + logger: &self.logger, + }; + match inspector.inspect(req) { + Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), + Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), + // It can not handle other policies. + Ok(_) => Ok(None), + Err(e) => Err(e), + } + } else { + Err(Error::RegionNotFound(req.get_header().get_region_id())) + } + } + + fn try_get_snapshot( + &mut self, + req: RaftCmdRequest, + ) -> std::result::Result>, RaftCmdResponse> { + match self.pre_propose_raft_command(&req) { + Ok(Some((mut delegate, policy))) => match policy { + RequestPolicy::ReadLocal => { + let region = Arc::clone(&delegate.region); + let snap = RegionSnapshot::from_snapshot( + delegate.get_snapshot(None, &mut None), + region, + ); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + let snapshot_ts = monotonic_raw_now(); + + if !delegate.is_in_leader_lease(snapshot_ts) { + return Ok(None); + } + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + + // Try renew lease in advance + self.maybe_renew_lease_in_advance(&delegate, &req, snapshot_ts); + Ok(Some(snap)) + } + RequestPolicy::StaleRead => { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; + + let region = Arc::clone(&delegate.region); + let snap = RegionSnapshot::from_snapshot( + delegate.get_snapshot(None, &mut None), + region, + ); + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + + delegate.check_stale_read_safe(read_ts)?; -use crate::{fsm::StoreMeta, tablet::CachedTablet}; + TLS_LOCAL_READ_METRICS + .with(|m| 
m.borrow_mut().local_executed_stale_read_requests.inc()); + Ok(Some(snap)) + } + _ => unreachable!(), + }, + Ok(None) => Ok(None), + Err(e) => { + let mut response = cmd_resp::new_error(e); + if let Some(delegate) = self + .local_reader + .delegates + .get(&req.get_header().get_region_id()) + { + cmd_resp::bind_term(&mut response, delegate.term); + } + Err(response) + } + } + } + + pub async fn snapshot( + &mut self, + mut req: RaftCmdRequest, + ) -> std::result::Result, RaftCmdResponse> { + let region_id = req.header.get_ref().region_id; + if let Some(snap) = self.try_get_snapshot(req.clone())? { + return Ok(snap); + } + + if let Some(query_res) = self.try_to_renew_lease(region_id, &req).await? { + // If query successful, try again. + if query_res.read().is_some() { + req.mut_header().set_read_quorum(false); + if let Some(snap) = self.try_get_snapshot(req)? { + return Ok(snap); + } + } + } + + let mut err = errorpb::Error::default(); + err.set_message(format!( + "Fail to get snapshot from LocalReader for region {}. 
Maybe due to `not leader` or `not applied to the current term`", + region_id + )); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + Err(resp) + } + + // try to renew the lease by sending read query where the reading process may + // renew the lease + async fn try_to_renew_lease( + &self, + region_id: u64, + req: &RaftCmdRequest, + ) -> std::result::Result, RaftCmdResponse> { + let (msg, sub) = PeerMsg::raft_query(req.clone()); + let mut err = errorpb::Error::default(); + match MsgRouter::send(&self.router, region_id, msg) { + Ok(()) => return Ok(sub.result().await), + Err(TrySendError::Full(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + err.set_message(RAFTSTORE_IS_BUSY.to_owned()); + err.mut_server_is_busy() + .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + } + Err(TrySendError::Disconnected(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + err.set_message(format!("region {} is missing", region_id)); + err.mut_region_not_found().set_region_id(region_id); + } + } + + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + Err(resp) + } + + // If the remote lease will be expired in near future send message + // to `raftstore` to renew it + fn maybe_renew_lease_in_advance( + &self, + delegate: &ReadDelegate, + req: &RaftCmdRequest, + ts: Timespec, + ) { + if !delegate.need_renew_lease(ts) { + return; + } + + let region_id = req.header.get_ref().region_id; + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); + // Send a read query which may renew the lease + let (msg, sub) = PeerMsg::raft_query(req.clone()); + if let Err(e) = MsgRouter::send(&self.router, region_id, msg) { + debug!( + self.logger, + "failed to send query for trying to renew lease"; + "region" => region_id, + "error" => ?e + ) + } + } +} /// CachedReadDelegate is a wrapper the ReadDelegate and CachedTablet. 
/// CachedTablet can fetch the latest tablet of this ReadDelegate's region. The @@ -78,10 +285,12 @@ where } } -impl ReadExecutor for CachedReadDelegate +impl ReadExecutor for CachedReadDelegate where E: KvEngine, { + type Tablet = E; + fn get_tablet(&mut self) -> &E { self.cached_tablet.latest().unwrap() } @@ -112,11 +321,12 @@ where } } -impl ReadExecutorProvider for StoreMetaDelegate +impl ReadExecutorProvider for StoreMetaDelegate where E: KvEngine, { type Executor = CachedReadDelegate; + type StoreMeta = Arc>>; fn store_id(&self) -> Option { self.store_meta.as_ref().lock().unwrap().store_id @@ -140,41 +350,375 @@ where } (meta.readers.len(), None) } + + fn store_meta(&self) -> &Self::StoreMeta { + &self.store_meta + } +} + +struct SnapRequestInspector<'r> { + delegate: &'r ReadDelegate, + logger: &'r Logger, +} + +impl<'r> SnapRequestInspector<'r> { + fn inspect(&mut self, req: &RaftCmdRequest) -> Result { + assert!(!req.has_admin_request()); + if req.get_requests().len() != 1 + || req.get_requests().first().unwrap().get_cmd_type() != CmdType::Snap + { + return Err(box_err!( + "LocalReader can only serve for exactly one Snap request" + )); + } + + let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); + if flags.contains(WriteBatchFlags::STALE_READ) { + return Ok(RequestPolicy::StaleRead); + } + + if req.get_header().get_read_quorum() { + return Ok(RequestPolicy::ReadIndex); + } + + // If applied index's term is differ from current raft's term, leader transfer + // must happened, if read locally, we may read old value. + if !self.has_applied_to_current_term() { + return Ok(RequestPolicy::ReadIndex); + } + + // Local read should be performed, if and only if leader is in lease. + // None for now. + match self.inspect_lease() { + LeaseState::Valid => Ok(RequestPolicy::ReadLocal), + LeaseState::Expired | LeaseState::Suspect => { + // Perform a consistent read to Raft quorum and try to renew the leader lease. 
+ Ok(RequestPolicy::ReadIndex) + } + } + } + + fn has_applied_to_current_term(&mut self) -> bool { + if self.delegate.applied_term == self.delegate.term { + true + } else { + debug!( + self.logger, + "rejected by term check"; + "tag" => &self.delegate.tag, + "applied_term" => self.delegate.applied_term, + "delegate_term" => ?self.delegate.term, + ); + + // only for metric. + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.applied_term.inc()); + false + } + } + + fn inspect_lease(&mut self) -> LeaseState { + // TODO: disable localreader if we did not enable raft's check_quorum. + if self.delegate.leader_lease.is_some() { + // We skip lease check, because it is postponed until `handle_read`. + LeaseState::Valid + } else { + debug!(self.logger, "rejected by leader lease"; "tag" => &self.delegate.tag); + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_lease.inc()); + LeaseState::Expired + } + } } #[cfg(test)] mod tests { - use std::{borrow::Borrow, sync::mpsc::*, thread}; + use std::{ + cell::Cell, + sync::mpsc::*, + thread::{self, JoinHandle}, + }; + use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, KvTestSnapshot, TestTabletFactoryV2}, + kv::{KvTestEngine, TestTabletFactoryV2}, }; - use engine_traits::{OpenOptions, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; - use kvproto::{metapb::Region, raft_cmdpb::*}; + use engine_traits::{OpenOptions, Peekable, SyncMutable, TabletFactory, ALL_CFS}; + use futures::executor::block_on; + use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use raftstore::store::{ - util::Lease, Callback, CasualMessage, CasualRouter, LocalReader, ProposalRouter, - RaftCommand, + util::Lease, ReadCallback, ReadProgress, RegionReadProgress, TrackVer, TxnExt, + TLS_LOCAL_READ_METRICS, }; - use tempfile::{Builder, TempDir}; + use slog::o; + use tempfile::Builder; use tikv_util::{codec::number::NumberEncoder, 
time::monotonic_raw_now}; use time::Duration; - use txn_types::{Key, Lock, LockType, WriteBatchFlags}; + use txn_types::WriteBatchFlags; use super::*; + use crate::router::{QueryResult, ReadResponse}; + + struct MockRouter { + p_router: SyncSender<(u64, PeerMsg)>, + } + + impl MockRouter { + fn new() -> (MockRouter, Receiver<(u64, PeerMsg)>) { + let (p_ch, p_rx) = sync_channel(1); + (MockRouter { p_router: p_ch }, p_rx) + } + } + + impl MsgRouter for MockRouter { + fn send(&self, addr: u64, cmd: PeerMsg) -> std::result::Result<(), TrySendError> { + self.p_router.send((addr, cmd)).unwrap(); + Ok(()) + } + } + + #[allow(clippy::type_complexity)] + fn new_reader( + store_id: u64, + store_meta: Arc>>, + ) -> ( + LocalReader, + Receiver<(u64, PeerMsg)>, + ) { + let (ch, rx) = MockRouter::new(); + let mut reader = LocalReader::new( + store_meta, + ch, + Logger::root(slog::Discard, o!("key1" => "value1")), + ); + reader.local_reader.store_id = Cell::new(Some(store_id)); + (reader, rx) + } + + fn new_peers(store_id: u64, pr_ids: Vec) -> Vec { + pr_ids + .into_iter() + .map(|id| { + let mut pr = metapb::Peer::default(); + pr.set_store_id(store_id); + pr.set_id(id); + pr + }) + .collect() + } + + #[test] + fn test_read() { + // It mocks that local reader communications with raftstore. 
+ // rx receives msgs like raftstore, then call f() to do something (such as renew + // lease or something), then send the result back to the local reader through ch + fn handle_msg( + f: F, + rx: Receiver<(u64, PeerMsg)>, + ch_tx: SyncSender>, + ) -> JoinHandle<()> { + thread::spawn(move || { + // Msg for query will be sent + let (_, msg) = rx.recv().unwrap(); + + f(); + match msg { + PeerMsg::RaftQuery(query) => ReadCallback::set_result( + query.ch, + QueryResult::Read(ReadResponse { + read_index: 0, + txn_extra_op: Default::default(), + }), + ), + _ => unreachable!(), + } + ch_tx.send(rx).unwrap(); + }) + } + + let store_id = 1; + + // Building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let path = Builder::new() + .prefix("test-local-reader") + .tempdir() + .unwrap(); + let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + + let store_meta = Arc::new(Mutex::new(StoreMeta::new())); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone()); - fn new_read_delegate( - region: &Region, - peer_id: u64, - term: u64, - applied_index_term: u64, - ) -> ReadDelegate { - let mut read_delegate_core = ReadDelegate::mock(region.id); - read_delegate_core.peer_id = peer_id; - read_delegate_core.term = term; - read_delegate_core.applied_term = applied_index_term; - read_delegate_core.region = Arc::new(region.clone()); - read_delegate_core + let mut region1 = metapb::Region::default(); + region1.set_id(1); + let prs = new_peers(store_id, vec![1, 2, 3]); + region1.set_peers(prs.clone().into()); + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let leader2 = prs[0].clone(); + region1.set_region_epoch(epoch13.clone()); + let term6 = 6; + let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 
"".to_owned())); + + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader2); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // The region is not register yet. + let res = block_on(reader.snapshot(cmd.clone())).unwrap_err(); + assert!( + res.header + .as_ref() + .unwrap() + .get_error() + .has_region_not_found() + ); + // No msg will ben sent + rx.try_recv().unwrap_err(); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), + 1 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 1 + ); + assert!(reader.local_reader.delegates.get(&1).is_none()); + + // Register region 1 + lease.renew(monotonic_raw_now()); + let remote = lease.maybe_new_remote_lease(term6).unwrap(); + { + let mut meta = store_meta.as_ref().lock().unwrap(); + + // Create read_delegate with region id 1 + let read_delegate = ReadDelegate { + tag: String::new(), + region: Arc::new(region1.clone()), + peer_id: 1, + term: term6, + applied_term: term6 - 1, + leader_lease: Some(remote), + last_valid_ts: Timespec::new(0, 0), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), + txn_ext: Arc::new(TxnExt::default()), + read_progress: read_progress.clone(), + pending_remove: false, + track_ver: TrackVer::new(), + bucket_meta: None, + }; + meta.readers.insert(1, read_delegate); + // create tablet with region_id 1 and prepare some data + let tablet1 = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + let cache = CachedTablet::new(Some(tablet1)); + meta.tablet_caches.insert(1, cache); + } + + let (ch_tx, ch_rx) = sync_channel(1); + + // Case: Applied term not match + let store_meta_clone = store_meta.clone(); + let handler = 
handle_msg( + move || { + let mut meta = store_meta_clone.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .update(ReadProgress::applied_term(term6)); + }, + rx, + ch_tx.clone(), + ); + // The first try will be rejected due to unmatched applied term but after update + // the applied term by the above thread, the snapshot will be acquired by + // retrying. + let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert_eq!(*snap.get_region(), region1); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 3 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.applied_term.get()), + 1 + ); + handler.join().unwrap(); + rx = ch_rx.recv().unwrap(); + + // Case: Expire lease to make the local reader lease check fail. + lease.expire_remote_lease(); + let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let handler = handle_msg( + move || { + let mut meta = store_meta.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .update(ReadProgress::leader_lease(remote)); + }, + rx, + ch_tx.clone(), + ); + let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + // Updating lease makes cache miss. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 4 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + handler.join().unwrap(); + rx = ch_rx.recv().unwrap(); + + // Case: Read quorum. 
+ let mut cmd_read_quorum = cmd.clone(); + cmd_read_quorum.mut_header().set_read_quorum(true); + let handler = handle_msg(|| {}, rx, ch_tx); + let _ = block_on(reader.snapshot(cmd_read_quorum.clone())).unwrap(); + handler.join().unwrap(); + ch_rx.recv().unwrap(); + + // Case: Stale read + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 0 + ); + read_progress.update_safe_ts(1, 1); + assert_eq!(read_progress.safe_ts(), 1); + let data = { + let mut d = [0u8; 8]; + (&mut d[..]).encode_u64(2).unwrap(); + d + }; + cmd.mut_header() + .set_flags(WriteBatchFlags::STALE_READ.bits()); + cmd.mut_header().set_flag_data(data.into()); + let res = block_on(reader.snapshot(cmd.clone())).unwrap_err(); + assert!(res.get_header().get_error().has_data_is_not_ready()); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.safe_ts.get()), + 1 + ); + read_progress.update_safe_ts(1, 2); + assert_eq!(read_progress.safe_ts(), 2); + let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert_eq!(*snap.get_region(), region1); } #[test] @@ -197,7 +741,7 @@ mod tests { let mut meta = store_meta.store_meta.as_ref().lock().unwrap(); // Create read_delegate with region id 1 - let mut read_delegate = ReadDelegate::mock(1); + let read_delegate = ReadDelegate::mock(1); meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data @@ -208,9 +752,8 @@ mod tests { let cache = CachedTablet::new(Some(tablet1.clone())); meta.tablet_caches.insert(1, cache); - // Create read_delegate with region id 1 - let mut read_delegate = ReadDelegate::mock(2); - let cache = CachedTablet::new(Some(read_delegate.clone())); + // Create read_delegate with region id 2 + let read_delegate = ReadDelegate::mock(2); meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 
14cedc7b212..b592b4819a5 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -25,7 +25,7 @@ use raftstore::{ store::{ cmd_resp, local_metrics::RaftMetrics, metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ErrorCallback, region_meta::RegionMeta, util, util::LeaseState, GroupState, - ReadCallback, ReadIndexContext, RequestPolicy, Transport, + ReadIndexContext, RequestPolicy, Transport, }, Error, Result, }; @@ -46,6 +46,8 @@ mod lease; mod local; mod replica; +pub(crate) use self::local::LocalReader; + impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> PeerFsmDelegate<'a, EK, ER, T> { diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index 5a56a23663e..9433cd10c52 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::{self, CmdType, RaftCmdRequest, RaftCmdResponse}; +use kvproto::raft_cmdpb::{self, RaftCmdRequest, RaftCmdResponse}; use pd_client::INVALID_ID; use raftstore::{ store::{ @@ -9,22 +9,18 @@ use raftstore::{ fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::{ErrorCallback, ReadCallback}, - propose_read_index, - util::check_region_epoch, - ReadIndexRequest, Transport, + propose_read_index, ReadIndexRequest, Transport, }, Error, }; -use slog::{debug, error, info, o, Logger}; -use tikv_util::{box_err, time::monotonic_raw_now}; -use time::Timespec; +use slog::debug; +use tikv_util::time::monotonic_raw_now; use tracker::GLOBAL_TRACKERS; use crate::{ batch::StoreContext, raft::Peer, - router::{message::RaftRequest, QueryResChannel, QueryResult, ReadResponse}, - Result, + router::{QueryResChannel, QueryResult, ReadResponse}, }; impl Peer { /// read index on follower diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index 3db4426ebf7..3ebc1f20da7 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -1,16 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - collections::VecDeque, - sync::{atomic::AtomicUsize, Arc}, -}; +use std::collections::VecDeque; -use crossbeam::channel::Sender; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_serverpb::RaftMessage; use raftstore::store::{ - local_metrics::RaftMetrics, Config, PersistedNotifier, WriteMsg, WriteRouter, - WriteRouterContext, WriteSenders, WriteTask, + local_metrics::RaftMetrics, Config, PersistedNotifier, WriteRouter, WriteRouterContext, + WriteSenders, WriteTask, }; use slog::{warn, Logger}; diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 2580b4bb79a..e20192394a6 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -32,8 +32,7 @@ use slog::{debug, error, trace, warn}; pub use self::async_writer::AsyncWriter; use crate::{ batch::StoreContext, - fsm::{PeerFsm, PeerFsmDelegate}, - operation::DestroyProgress, + fsm::PeerFsmDelegate, raft::{Peer, Storage}, router::PeerTick, }; @@ -83,7 +82,7 @@ impl Peer { } if msg.has_merge_target() { unimplemented!(); - return; + // return; } // We don't handle stale message like v1, as we rely on leader to actively // cleanup stale peers. @@ -103,7 +102,7 @@ impl Peer { } if msg.has_extra_msg() { unimplemented!(); - return; + // return; } // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. 
diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 8b69a52f623..e7ee6e7465a 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -4,39 +4,23 @@ use std::{mem, sync::Arc}; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; -use fail::fail_point; -use kvproto::{ - metapb, - raft_cmdpb::{self, RaftCmdRequest}, - raft_serverpb::RegionLocalState, -}; -use protobuf::Message; -use raft::{RawNode, StateRole, INVALID_ID}; -use raftstore::{ - store::{ - fsm::Proposal, - metrics::PEER_PROPOSE_LOG_SIZE_HISTOGRAM, - util::{Lease, RegionReadProgress}, - Config, EntryStorage, ProposalQueue, RaftlogFetchTask, ReadIndexQueue, ReadIndexRequest, - Transport, WriteRouter, - }, - Error, -}; -use slog::{debug, error, info, o, warn, Logger}; -use tikv_util::{ - box_err, - config::ReadableSize, - time::{monotonic_raw_now, Instant as TiInstant}, - worker::Scheduler, - Either, +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb}; +use pd_client::BucketStat; +use raft::{RawNode, StateRole}; +use raftstore::store::{ + util::{Lease, RegionReadProgress}, + Config, EntryStorage, ProposalQueue, ReadDelegate, ReadIndexQueue, TrackVer, TxnExt, }; +use slog::Logger; +use tikv_util::{box_err, config::ReadableSize}; +use time::Timespec; use super::{storage::Storage, Apply}; use crate::{ fsm::{ApplyFsm, ApplyScheduler}, operation::{AsyncWriter, DestroyProgress, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, - tablet::{self, CachedTablet}, + tablet::CachedTablet, Result, }; @@ -68,6 +52,12 @@ pub struct Peer { pending_reads: ReadIndexQueue, read_progress: Arc, leader_lease: Lease, + + /// region buckets. + region_buckets: Option, + /// Transaction extensions related to this peer. 
+ txn_ext: Arc, + txn_extra_op: Arc>, } impl Peer { @@ -149,6 +139,9 @@ impl Peer { cfg.raft_store_max_leader_lease(), cfg.renew_leader_lease_advance_duration(), ), + region_buckets: None, + txn_ext: Arc::default(), + txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), }; // If this region has only one peer and I am the one, campaign directly. @@ -397,4 +390,19 @@ impl Peer { pub fn set_apply_scheduler(&mut self, apply_scheduler: ApplyScheduler) { self.apply_scheduler = Some(apply_scheduler); } + + pub fn generate_read_delegate(&self) -> ReadDelegate { + let peer_id = self.peer().get_id(); + + ReadDelegate::new( + peer_id, + self.term(), + self.region().clone(), + self.storage().entry_storage().applied_term(), + self.txn_extra_op.clone(), + self.txn_ext.clone(), + self.read_progress().clone(), + self.region_buckets.as_ref().map(|b| b.meta.clone()), + ) + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 2ada737c620..1615255ab23 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -8,7 +8,7 @@ use kvproto::{ raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ - eraftpb::{ConfState, Entry, HardState, Snapshot}, + eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; use raftstore::store::{ @@ -17,7 +17,7 @@ use raftstore::store::{ use slog::{o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; -use crate::{Error, Result}; +use crate::Result; pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { let region_id = region.get_id(); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 401961dfdb1..78abef13247 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -1,13 +1,84 @@ // Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. +use std::sync::{Arc, Mutex}; + +use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; -use raftstore::store::{FetchedLogs, LogFetchedNotifier}; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raftstore::store::{FetchedLogs, LogFetchedNotifier, RegionSnapshot}; +use slog::Logger; use super::PeerMsg; -use crate::batch::StoreRouter; +use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; impl LogFetchedNotifier for StoreRouter { fn notify(&self, region_id: u64, fetched: FetchedLogs) { let _ = self.force_send(region_id, PeerMsg::FetchedLogs(fetched)); } } + +/// A router that routes messages to the raftstore +pub struct RaftRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + router: StoreRouter, + local_reader: LocalReader>, +} + +impl Clone for RaftRouter +where + EK: KvEngine, + ER: RaftEngine, +{ + fn clone(&self) -> Self { + RaftRouter { + router: self.router.clone(), + local_reader: self.local_reader.clone(), + } + } +} + +impl RaftRouter { + pub fn new(store_id: u64, router: StoreRouter) -> Self { + let mut store_meta = StoreMeta::new(); + store_meta.store_id = Some(store_id); + let store_meta = Arc::new(Mutex::new(store_meta)); + + let logger = router.logger().clone(); + RaftRouter { + router: router.clone(), + local_reader: LocalReader::new(store_meta, router, logger), + } + } + + pub fn store_router(&self) -> &StoreRouter { + &self.router + } + + pub fn send(&self, addr: u64, msg: PeerMsg) -> Result<(), TrySendError> { + self.router.send(addr, msg) + } + + pub fn store_meta(&self) -> &Arc>> { + self.local_reader.store_meta() + } + + pub fn send_raft_message( + &self, + msg: Box, + ) -> std::result::Result<(), TrySendError>> { + self.router.send_raft_message(msg) + } + + pub async fn get_snapshot( + &mut self, + req: RaftCmdRequest, + ) -> std::result::Result, RaftCmdResponse> { + self.local_reader.snapshot(req).await + } +} diff --git 
a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 7be1be95554..fb323dca9d4 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,14 +3,8 @@ // #[PerformanceCriticalPath] use std::fmt; -use engine_traits::{KvEngine, Snapshot}; -use kvproto::{ - cdcpb::Event, - metapb, - raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, - raft_serverpb::RaftMessage, -}; -use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, RegionSnapshot}; +use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs}; use tikv_util::time::Instant; use super::{ diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 1ee580a12d2..e9e7cf6cfc8 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -7,6 +7,7 @@ mod response_channel; pub(crate) use self::internal_message::ApplyTask; pub use self::{ + imp::RaftRouter, internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 9478eb52339..55219540c2f 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -16,7 +16,6 @@ use std::{ cell::UnsafeCell, fmt::{self, Debug, Formatter}, future::Future, - mem, pin::Pin, sync::{ atomic::{AtomicU64, Ordering}, @@ -25,18 +24,13 @@ use std::{ task::{Context, Poll}, }; -use engine_traits::Snapshot; use futures::task::AtomicWaker; -use kvproto::{ - kvrpcpb::ExtraOp as TxnExtraOp, - raft_cmdpb::{RaftCmdResponse, Response}, -}; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{ local_metrics::TimeTracker, 
msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, - RegionSnapshot, WriteCallback, + WriteCallback, }; use smallvec::SmallVec; -use tikv_util::memory::HeapSize; use tracker::TrackerToken; /// A struct allows to watch and notify specific events. @@ -224,7 +218,7 @@ pub struct BaseSubscriber { impl BaseSubscriber { /// Wait for the result. #[inline] - pub async fn result(mut self) -> Option { + pub async fn result(self) -> Option { WaitResult { core: &self.core }.await } } @@ -247,7 +241,7 @@ impl BaseChannel { /// Sets the final result. #[inline] - pub fn set_result(mut self, res: Res) { + pub fn set_result(self, res: Res) { self.core.set_result(res); } } @@ -334,7 +328,7 @@ impl WriteCallback for CmdResChannel { // TODO: support executing hooks inside setting result. #[inline] - fn set_result(mut self, res: RaftCmdResponse) { + fn set_result(self, res: RaftCmdResponse) { self.set_result(res); } } @@ -425,14 +419,13 @@ pub type DebugInfoSubscriber = BaseSubscriber; #[cfg(test)] mod tests { - use engine_test::kv::KvTestSnapshot; use futures::executor::block_on; use super::*; #[test] fn test_cancel() { - let (mut chan, mut sub) = CmdResChannel::pair(); + let (chan, mut sub) = CmdResChannel::pair(); drop(chan); assert!(!block_on(sub.wait_proposed())); assert!(!block_on(sub.wait_committed())); @@ -447,7 +440,7 @@ mod tests { assert!(!block_on(sub.wait_committed())); assert_eq!(block_on(sub.result()), Some(result)); - let (mut chan, mut sub) = QueryResChannel::pair(); + let (chan, sub) = QueryResChannel::pair(); drop(chan); assert!(block_on(sub.result()).is_none()); } @@ -464,12 +457,12 @@ mod tests { assert!(block_on(sub.wait_committed())); assert_eq!(block_on(sub.result()), Some(result.clone())); - let (mut chan, mut sub) = QueryResChannel::pair(); + let (chan, sub) = QueryResChannel::pair(); let resp = QueryResult::Response(result.clone()); chan.set_result(resp.clone()); assert_eq!(block_on(sub.result()).unwrap(), resp); - let (mut chan, mut sub) = 
QueryResChannel::pair(); + let (chan, sub) = QueryResChannel::pair(); let read = QueryResult::Read(ReadResponse { read_index: 0, txn_extra_op: TxnExtraOp::ReadOldValue, diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index caaa5120325..d46ff09f2b1 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -28,8 +28,8 @@ use pd_client::RpcClient; use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; use raftstore_v2::{ create_store_batch_system, - router::{DebugInfoChannel, PeerMsg, QueryResult}, - Bootstrap, StoreMeta, StoreRouter, StoreSystem, + router::{DebugInfoChannel, PeerMsg, QueryResult, RaftRouter}, + Bootstrap, StoreMeta, StoreSystem, }; use slog::{o, Logger}; use tempfile::TempDir; @@ -37,10 +37,10 @@ use test_pd::mocker::Service; use tikv_util::config::{ReadableDuration, VersionTrack}; #[derive(Clone)] -pub struct TestRouter(StoreRouter); +pub struct TestRouter(RaftRouter); impl Deref for TestRouter { - type Target = StoreRouter; + type Target = RaftRouter; fn deref(&self) -> &Self::Target { &self.0 @@ -112,6 +112,8 @@ pub struct RunningState { pub system: StoreSystem, pub cfg: Arc>, pub transport: TestTransport, + // We need this to clear the ref counts of CachedTablet when shutdown + store_meta: Arc>>, } impl RunningState { @@ -160,7 +162,9 @@ impl RunningState { logger.clone(), ); - let store_meta = Arc::new(Mutex::new(StoreMeta::::new())); + let router = RaftRouter::new(store_id, router); + let store_meta = router.store_meta().clone(); + system .start( store_id, @@ -168,8 +172,8 @@ impl RunningState { raft_engine.clone(), factory.clone(), transport.clone(), - &router, - store_meta, + router.store_router(), + store_meta.clone(), ) .unwrap(); @@ -179,6 +183,7 @@ impl RunningState { system, cfg, transport, + store_meta, }; (TestRouter(router), state) } @@ -223,7 +228,10 @@ 
impl TestNode { } fn stop(&mut self) { - self.running_state.take(); + if let Some(state) = std::mem::take(&mut self.running_state) { + let mut meta = state.store_meta.lock().unwrap(); + meta.tablet_caches.clear(); + } } fn restart(&mut self) -> TestRouter { diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 8e2c3eeb04f..bb7156c6af7 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use futures::executor::block_on; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, ReadIndexRequest, Request, StatusCmdType}; use tikv_util::store::new_peer; use txn_types::WriteBatchFlags; @@ -190,3 +191,45 @@ fn test_snap_with_invalid_parameter() { let error_resp = res.response().unwrap(); assert!(error_resp.get_header().has_error()); } + +#[test] +fn test_local_read() { + let cluster = Cluster::default(); + let mut router = cluster.router(0); + std::thread::sleep(std::time::Duration::from_millis(200)); + let region_id = 2; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionDetail); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + let detail = status_resp.get_region_detail(); + let mut region = detail.get_region().clone(); + + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_header().set_term(6); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.take_region_epoch()); + let mut request_inner = Request::default(); + request_inner.set_cmd_type(CmdType::Snap); + req.mut_requests().push(request_inner); + + // FIXME: Get snapshot from local reader, but it will fail as the leader has not 
+ // applied in the current term (due to unimplementation of ApplyRes). + let resp = block_on(async { router.get_snapshot(req.clone()).await.unwrap_err() }); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("Fail to get snapshot ") + ); + + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // The read index will be 0 as the retry process in the `get_snapshot` will + // renew the lease. + assert_eq!(resp.read_index, 0); +} diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 90cc41f2bd8..1ded8be3886 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -11,8 +11,8 @@ use crate::{ store::{ fsm::RaftRouter, transport::{CasualRouter, ProposalRouter, SignificantRouter}, - CachedReadDelegate, Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, - RaftCommand, SignificantMsg, StoreMetaDelegate, StoreMsg, StoreRouter, + Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, RaftCommand, + SignificantMsg, StoreMsg, StoreRouter, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -171,8 +171,7 @@ where ER: RaftEngine, { router: RaftRouter, - local_reader: - LocalReader, EK, CachedReadDelegate, StoreMetaDelegate>, + local_reader: LocalReader>, } impl Clone for ServerRaftStoreRouter @@ -192,12 +191,7 @@ impl ServerRaftStoreRouter { /// Creates a new router. 
pub fn new( router: RaftRouter, - local_reader: LocalReader< - RaftRouter, - EK, - CachedReadDelegate, - StoreMetaDelegate, - >, + local_reader: LocalReader>, ) -> ServerRaftStoreRouter { ServerRaftStoreRouter { router, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index ed722fd2475..a60eb087562 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -77,9 +77,10 @@ pub use self::{ worker::{ AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FetchedLogs, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LogFetchedNotifier, PdTask, RaftlogFetchRunner, - RaftlogFetchTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, - ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, - SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, + LocalReadContext, LocalReader, LocalReaderCore, LogFetchedNotifier, PdTask, + RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, + ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, + SplitConfig, SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, + TLS_LOCAL_READ_METRICS, }, }; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 10996fcbae0..aca4db04fd5 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -613,7 +613,7 @@ pub fn can_amend_read( now: Timespec, ) -> bool { match lease_state { - // Here combine the new read request with the previous one even if the lease expired + // Here, combining the new read request with the previous one even if the lease expired // is ok because in this case, the previous read index must be sent out with a valid // lease instead of a suspect lease. 
So there must no pending transfer-leader // proposals before or after the previous read index, and the lease can be renewed @@ -4716,7 +4716,7 @@ where Ok(propose_index) } - fn handle_read>( + fn handle_read>( &self, reader: &mut E, req: RaftCmdRequest, @@ -5609,11 +5609,13 @@ where } } -impl ReadExecutor for PollContext +impl ReadExecutor for PollContext where EK: KvEngine, ER: RaftEngine, { + type Tablet = EK; + fn get_tablet(&mut self) -> &EK { &self.engines.kv } diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 99adcecc04a..4335369c3cb 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -26,6 +26,7 @@ pub use self::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, + metrics::TLS_LOCAL_READ_METRICS, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, @@ -35,8 +36,9 @@ pub use self::{ }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ - CachedReadDelegate, LocalReadContext, LocalReader, Progress as ReadProgress, ReadDelegate, - ReadExecutor, ReadExecutorProvider, StoreMetaDelegate, TrackVer, + CachedReadDelegate, LocalReadContext, LocalReader, LocalReaderCore, + Progress as ReadProgress, ReadDelegate, ReadExecutor, ReadExecutorProvider, + StoreMetaDelegate, TrackVer, }, refresh_config::{ BatchComponent as RaftStoreBatchComponent, Runner as RefreshConfigRunner, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5efb750b863..5801083f1bc 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -12,12 +12,12 @@ use std::{ }; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; 
-use engine_traits::{KvEngine, RaftEngine, Snapshot}; +use engine_traits::{KvEngine, Peekable, RaftEngine}; use fail::fail_point; use kvproto::{ errorpb, kvrpcpb::ExtraOp as TxnExtraOp, - metapb, + metapb::{self, Region}, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, ReadIndexResponse, Request, Response}, }; use pd_client::BucketMeta; @@ -36,20 +36,22 @@ use crate::{ cmd_resp, fsm::store::StoreMeta, util::{self, LeaseState, RegionReadProgress, RemoteLease}, - Callback, CasualMessage, CasualRouter, Peer, ProposalRouter, RaftCommand, ReadResponse, - RegionSnapshot, RequestInspector, RequestPolicy, TxnExt, + Callback, CasualMessage, CasualRouter, Peer, ProposalRouter, RaftCommand, ReadCallback, + ReadResponse, RegionSnapshot, RequestInspector, RequestPolicy, TxnExt, }, Error, Result, }; /// #[RaftstoreCommon] -pub trait ReadExecutor { - fn get_tablet(&mut self) -> &E; +pub trait ReadExecutor { + type Tablet: KvEngine; + + fn get_tablet(&mut self) -> &Self::Tablet; fn get_snapshot( &mut self, ts: Option, - read_context: &mut Option>, - ) -> Arc; + read_context: &mut Option>, + ) -> Arc<::Snapshot>; fn get_value(&mut self, req: &Request, region: &metapb::Region) -> Result { let key = req.get_get().get_key(); @@ -94,8 +96,8 @@ pub trait ReadExecutor { region: &Arc, read_index: Option, mut ts: Option, - mut read_context: Option>, - ) -> ReadResponse { + mut read_context: Option>, + ) -> ReadResponse<::Snapshot> { let requests = msg.get_requests(); let mut response = ReadResponse { response: RaftCmdResponse::default(), @@ -151,28 +153,6 @@ pub trait ReadExecutor { } } -/// #[RaftstoreCommon]: A read only delegate of `Peer`. 
-#[derive(Clone, Debug)] -pub struct ReadDelegate { - pub region: Arc, - pub peer_id: u64, - pub term: u64, - pub applied_term: u64, - pub leader_lease: Option, - pub last_valid_ts: Timespec, - - pub tag: String, - pub bucket_meta: Option>, - pub txn_extra_op: Arc>, - pub txn_ext: Arc, - pub read_progress: Arc, - pub pending_remove: bool, - - // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` - // up-to-date with the global `ReadDelegate` stored at `StoreMeta` - pub track_ver: TrackVer, -} - /// CachedReadDelegate is a wrapper the ReadDelegate and kv_engine. LocalReader /// dispatch local read requests to ReadDeleage according to the region_id where /// ReadDelegate needs kv_engine to read data or fetch snapshot. @@ -224,17 +204,17 @@ impl Drop for ReadDelegate { } /// #[RaftstoreCommon] -pub trait ReadExecutorProvider: Send + Clone + 'static -where - E: KvEngine, -{ - type Executor: ReadExecutor; +pub trait ReadExecutorProvider: Send + Clone + 'static { + type Executor: ReadExecutor; + type StoreMeta; fn store_id(&self) -> Option; /// get the ReadDelegate with region_id and the number of delegates in the /// StoreMeta fn get_executor_and_len(&self, region_id: u64) -> (usize, Option); + + fn store_meta(&self) -> &Self::StoreMeta; } #[derive(Clone)] @@ -258,11 +238,12 @@ where } } -impl ReadExecutorProvider for StoreMetaDelegate +impl ReadExecutorProvider for StoreMetaDelegate where E: KvEngine, { type Executor = CachedReadDelegate; + type StoreMeta = Arc>; fn store_id(&self) -> Option { self.store_meta.as_ref().lock().unwrap().store_id @@ -284,6 +265,10 @@ where } (meta.readers.len(), None) } + + fn store_meta(&self) -> &Self::StoreMeta { + &self.store_meta + } } /// #[RaftstoreCommon] @@ -336,8 +321,30 @@ impl Clone for TrackVer { } } +/// #[RaftstoreCommon]: A read only delegate of `Peer`. 
+#[derive(Clone, Debug)] +pub struct ReadDelegate { + pub region: Arc, + pub peer_id: u64, + pub term: u64, + pub applied_term: u64, + pub leader_lease: Option, + pub last_valid_ts: Timespec, + + pub tag: String, + pub bucket_meta: Option>, + pub txn_extra_op: Arc>, + pub txn_ext: Arc, + pub read_progress: Arc, + pub pending_remove: bool, + + // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` + // up-to-date with the global `ReadDelegate` stored at `StoreMeta` + pub track_ver: TrackVer, +} + impl ReadDelegate { - pub fn from_peer(peer: &Peer) -> ReadDelegate { + pub fn from_peer(peer: &Peer) -> Self { let region = peer.region().clone(); let region_id = region.get_id(); let peer_id = peer.peer.get_id(); @@ -358,6 +365,34 @@ impl ReadDelegate { } } + pub fn new( + peer_id: u64, + term: u64, + region: Region, + applied_term: u64, + txn_extra_op: Arc>, + txn_ext: Arc, + read_progress: Arc, + bucket_meta: Option>, + ) -> Self { + let region_id = region.id; + ReadDelegate { + region: Arc::new(region), + peer_id, + term, + applied_term, + leader_lease: None, + last_valid_ts: Timespec::new(0, 0), + tag: format!("[region {}] {}", region_id, peer_id), + txn_extra_op, + txn_ext, + read_progress, + pending_remove: false, + bucket_meta, + track_ver: TrackVer::new(), + } + } + pub fn fresh_valid_ts(&mut self) { self.last_valid_ts = monotonic_raw_now(); } @@ -389,21 +424,24 @@ impl ReadDelegate { } } + pub fn need_renew_lease(&self, ts: Timespec) -> bool { + self.leader_lease + .as_ref() + .map(|lease| lease.need_renew(ts)) + .unwrap_or(false) + } + // If the remote lease will be expired in near future send message - // to `raftstore` renew it + // to `raftstore` to renew it pub fn maybe_renew_lease_advance( &self, router: &dyn CasualRouter, ts: Timespec, ) { - if !self - .leader_lease - .as_ref() - .map(|lease| lease.need_renew(ts)) - .unwrap_or(false) - { + if !self.need_renew_lease(ts) { return; } + TLS_LOCAL_READ_METRICS.with(|m| 
m.borrow_mut().renew_lease_advance.inc()); let region_id = self.region.get_id(); if let Err(e) = router.send(region_id, CasualMessage::RenewLease) { @@ -437,10 +475,7 @@ impl ReadDelegate { false } - pub fn check_stale_read_safe( - &self, - read_ts: u64, - ) -> std::result::Result<(), ReadResponse> { + pub fn check_stale_read_safe(&self, read_ts: u64) -> std::result::Result<(), RaftCmdResponse> { let safe_ts = self.read_progress.safe_ts(); if safe_ts >= read_ts { return Ok(()); @@ -458,11 +493,7 @@ impl ReadDelegate { safe_ts, }); cmd_resp::bind_term(&mut response, self.term); - Err(ReadResponse { - response, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }) + Err(response) } /// Used in some external tests. @@ -538,107 +569,31 @@ impl Progress { /// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the /// relevant regions by LocalReader so that these requests can be handled by the /// relevant ReadDelegate respectively. -pub struct LocalReader +pub struct LocalReaderCore where - C: ProposalRouter + CasualRouter, - E: KvEngine, - D: ReadExecutor + Deref, - S: ReadExecutorProvider, + D: ReadExecutor + Deref, + S: ReadExecutorProvider, { pub store_id: Cell>, store_meta: S, - kv_engine: E, - // region id -> ReadDelegate - // The use of `Arc` here is a workaround, see the comment at `get_delegate` pub delegates: LruCache, - snap_cache: Box>>, - cache_read_id: ThreadReadId, - // A channel to raftstore. 
- router: C, } -impl ReadExecutor for CachedReadDelegate +impl LocalReaderCore where - E: KvEngine, + D: ReadExecutor + Deref + Clone, + S: ReadExecutorProvider, { - fn get_tablet(&mut self) -> &E { - &self.kv_engine - } - - fn get_snapshot( - &mut self, - create_time: Option, - read_context: &mut Option>, - ) -> Arc { - let ctx = read_context.as_mut().unwrap(); - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); - if let Some(ts) = create_time { - if ts == *ctx.read_id { - if let Some(snap) = ctx.snap_cache.as_ref().as_ref() { - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); - return snap.clone(); - } - } - let snap = Arc::new(self.kv_engine.snapshot()); - *ctx.read_id = ts; - *ctx.snap_cache = Box::new(Some(snap.clone())); - return snap; - } - Arc::new(self.kv_engine.snapshot()) - } -} - -impl LocalReader -where - C: ProposalRouter + CasualRouter, - E: KvEngine, - D: ReadExecutor + Deref + Clone, - S: ReadExecutorProvider, -{ - pub fn new(kv_engine: E, store_meta: S, router: C) -> Self { - let cache_read_id = ThreadReadId::new(); - LocalReader { + pub fn new(store_meta: S) -> Self { + LocalReaderCore { store_meta, - kv_engine, - router, - snap_cache: Box::new(None), - cache_read_id, store_id: Cell::new(None), delegates: LruCache::with_capacity_and_sample(0, 7), } } - fn redirect(&mut self, mut cmd: RaftCommand) { - debug!("localreader redirects command"; "command" => ?cmd); - let region_id = cmd.request.get_header().get_region_id(); - let mut err = errorpb::Error::default(); - match ProposalRouter::send(&self.router, cmd) { - Ok(()) => return, - Err(TrySendError::Full(c)) => { - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); - err.set_message(RAFTSTORE_IS_BUSY.to_owned()); - err.mut_server_is_busy() - .set_reason(RAFTSTORE_IS_BUSY.to_owned()); - cmd = c; - } - Err(TrySendError::Disconnected(c)) => { - TLS_LOCAL_READ_METRICS.with(|m| 
m.borrow_mut().reject_reason.no_region.inc()); - err.set_message(format!("region {} is missing", region_id)); - err.mut_region_not_found().set_region_id(region_id); - cmd = c; - } - } - - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - let read_resp = ReadResponse { - response: resp, - snapshot: None, - txn_extra_op: TxnExtraOp::Noop, - }; - - cmd.callback.invoke_read(read_resp); + pub fn store_meta(&self) -> &S::StoreMeta { + self.store_meta.store_meta() } // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the @@ -673,10 +628,7 @@ where rd.filter(|r| !r.pending_remove) } - pub fn pre_propose_raft_command( - &mut self, - req: &RaftCmdRequest, - ) -> Result> { + pub fn validate_request(&mut self, req: &RaftCmdRequest) -> Result> { // Check store id. if self.store_id.get().is_none() { let store_id = self.store_meta.store_id(); @@ -728,17 +680,113 @@ where return Ok(None); } - let mut inspector = Inspector { - delegate: &delegate, - }; - match inspector.inspect(req) { - Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), - Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), - // It can not handle other policies. - Ok(_) => Ok(None), - Err(e) => Err(e), + Ok(Some(delegate)) + } +} + +impl Clone for LocalReaderCore +where + D: ReadExecutor + Deref, + S: ReadExecutorProvider, +{ + fn clone(&self) -> Self { + LocalReaderCore { + store_meta: self.store_meta.clone(), + store_id: self.store_id.clone(), + delegates: LruCache::with_capacity_and_sample(0, 7), } } +} + +pub struct LocalReader +where + E: KvEngine, + C: ProposalRouter + CasualRouter, +{ + local_reader: LocalReaderCore, StoreMetaDelegate>, + kv_engine: E, + + snap_cache: Box>>, + cache_read_id: ThreadReadId, + + // A channel to raftstore. 
+ router: C, +} + +impl LocalReader +where + E: KvEngine, + C: ProposalRouter + CasualRouter, +{ + pub fn new(kv_engine: E, store_meta: StoreMetaDelegate, router: C) -> Self { + let cache_read_id = ThreadReadId::new(); + Self { + local_reader: LocalReaderCore::new(store_meta), + kv_engine, + snap_cache: Box::new(None), + cache_read_id, + router, + } + } + + fn local_read_context(&mut self) -> LocalReadContext<'_, E> { + LocalReadContext { + snap_cache: &mut self.snap_cache, + read_id: &mut self.cache_read_id, + } + } + + pub fn pre_propose_raft_command( + &mut self, + req: &RaftCmdRequest, + ) -> Result, RequestPolicy)>> { + if let Some(delegate) = self.local_reader.validate_request(req)? { + let mut inspector = Inspector { + delegate: &delegate, + }; + match inspector.inspect(req) { + Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), + Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), + // It can not handle other policies. + Ok(_) => Ok(None), + Err(e) => Err(e), + } + } else { + Ok(None) + } + } + + fn redirect(&mut self, mut cmd: RaftCommand) { + debug!("localreader redirects command"; "command" => ?cmd); + let region_id = cmd.request.get_header().get_region_id(); + let mut err = errorpb::Error::default(); + match ProposalRouter::send(&self.router, cmd) { + Ok(()) => return, + Err(TrySendError::Full(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + err.set_message(RAFTSTORE_IS_BUSY.to_owned()); + err.mut_server_is_busy() + .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + cmd = c; + } + Err(TrySendError::Disconnected(c)) => { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + err.set_message(format!("region {} is missing", region_id)); + err.mut_region_not_found().set_region_id(region_id); + cmd = c; + } + } + + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + let read_resp = ReadResponse { + 
response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }; + + cmd.callback.set_result(read_resp); + } pub fn propose_raft_command( &mut self, @@ -748,7 +796,6 @@ where ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { - let delegate_ext: LocalReadContext<'_, E>; let mut response = match policy { // Leader can read local if and only if it is in lease. RequestPolicy::ReadLocal => { @@ -769,14 +816,11 @@ where return; } - delegate_ext = LocalReadContext { - snap_cache: &mut self.snap_cache, - read_id: &mut self.cache_read_id, - }; + let read_ctx = self.local_read_context(); let region = Arc::clone(&delegate.region); let response = - delegate.execute(&req, ®ion, None, read_id, Some(delegate_ext)); + delegate.execute(&req, ®ion, None, read_id, Some(read_ctx)); // Try renew lease in advance delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); response @@ -785,24 +829,29 @@ where RequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.invoke_read(resp); + cb.set_result(ReadResponse { + response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); return; } - delegate_ext = LocalReadContext { - snap_cache: &mut self.snap_cache, - read_id: &mut self.cache_read_id, - }; + let read_ctx = self.local_read_context(); let region = Arc::clone(&delegate.region); // Getting the snapshot let response = - delegate.execute(&req, ®ion, None, read_id, Some(delegate_ext)); + delegate.execute(&req, ®ion, None, read_id, Some(read_ctx)); // Double check in case `safe_ts` change after the first check and before // getting snapshot if let Err(resp) = delegate.check_stale_read_safe(read_ts) { - cb.invoke_read(resp); + cb.set_result(ReadResponse { + response: resp, + snapshot: None, + txn_extra_op: TxnExtraOp::Noop, + }); return; } TLS_LOCAL_READ_METRICS @@ -817,16 +866,20 @@ where snap.bucket_meta = 
delegate.bucket_meta.clone(); } response.txn_extra_op = delegate.txn_extra_op.load(); - cb.invoke_read(response); + cb.set_result(response); } // Forward to raftstore. Ok(None) => self.redirect(RaftCommand::new(req, cb)), Err(e) => { let mut response = cmd_resp::new_error(e); - if let Some(delegate) = self.delegates.get(&req.get_header().get_region_id()) { + if let Some(delegate) = self + .local_reader + .delegates + .get(&req.get_header().get_region_id()) + { cmd_resp::bind_term(&mut response, delegate.term); } - cb.invoke_read(ReadResponse { + cb.set_result(ReadResponse { response, snapshot: None, txn_extra_op: TxnExtraOp::Noop, @@ -857,23 +910,53 @@ where } } -impl Clone for LocalReader +impl Clone for LocalReader where - C: ProposalRouter + CasualRouter + Clone, E: KvEngine, - D: ReadExecutor + Deref, - S: ReadExecutorProvider, + C: ProposalRouter + CasualRouter + Clone, { fn clone(&self) -> Self { - LocalReader { - store_meta: self.store_meta.clone(), + Self { + local_reader: self.local_reader.clone(), kv_engine: self.kv_engine.clone(), - router: self.router.clone(), - store_id: self.store_id.clone(), - delegates: LruCache::with_capacity_and_sample(0, 7), snap_cache: self.snap_cache.clone(), cache_read_id: self.cache_read_id.clone(), + router: self.router.clone(), + } + } +} + +impl ReadExecutor for CachedReadDelegate +where + E: KvEngine, +{ + type Tablet = E; + + fn get_tablet(&mut self) -> &E { + &self.kv_engine + } + + fn get_snapshot( + &mut self, + create_time: Option, + read_context: &mut Option>, + ) -> Arc { + let ctx = read_context.as_mut().unwrap(); + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + if let Some(ts) = create_time { + if ts == *ctx.read_id { + if let Some(snap) = ctx.snap_cache.as_ref().as_ref() { + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); + return snap.clone(); + } + } + let snap = Arc::new(self.kv_engine.snapshot()); + *ctx.read_id = ts; + 
*ctx.snap_cache = Box::new(Some(snap.clone())); + return snap; } + Arc::new(self.kv_engine.snapshot()) } } @@ -976,19 +1059,14 @@ mod tests { store_meta: Arc>, ) -> ( TempDir, - LocalReader< - MockRouter, - KvTestEngine, - CachedReadDelegate, - StoreMetaDelegate, - >, + LocalReader, Receiver>, ) { let path = Builder::new().prefix(path).tempdir().unwrap(); let db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); let (ch, rx, _) = MockRouter::new(); let mut reader = LocalReader::new(db.clone(), StoreMetaDelegate::new(store_meta, db), ch); - reader.store_id = Cell::new(Some(store_id)); + reader.local_reader.store_id = Cell::new(Some(store_id)); (path, reader, rx) } @@ -1005,12 +1083,7 @@ mod tests { } fn must_redirect( - reader: &mut LocalReader< - MockRouter, - KvTestEngine, - CachedReadDelegate, - StoreMetaDelegate, - >, + reader: &mut LocalReader, rx: &Receiver>, cmd: RaftCmdRequest, ) { @@ -1030,12 +1103,7 @@ mod tests { } fn must_not_redirect( - reader: &mut LocalReader< - MockRouter, - KvTestEngine, - CachedReadDelegate, - StoreMetaDelegate, - >, + reader: &mut LocalReader, rx: &Receiver>, task: RaftCommand, ) { @@ -1092,7 +1160,7 @@ mod tests { TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), 1 ); - assert!(reader.delegates.get(&1).is_none()); + assert!(reader.local_reader.delegates.get(&1).is_none()); // Register region 1 lease.renew(monotonic_raw_now()); @@ -1369,16 +1437,16 @@ mod tests { // Remove invalid delegate let reader_clone = store_meta.lock().unwrap().readers.get(&1).unwrap().clone(); - assert!(reader.get_delegate(1).is_some()); + assert!(reader.local_reader.get_delegate(1).is_some()); // dropping the non-source `reader` will not make other readers invalid drop(reader_clone); - assert!(reader.get_delegate(1).is_some()); + assert!(reader.local_reader.get_delegate(1).is_some()); // drop the source `reader` store_meta.lock().unwrap().readers.remove(&1).unwrap(); // the invalid delegate should be 
removed - assert!(reader.get_delegate(1).is_none()); + assert!(reader.local_reader.get_delegate(1).is_none()); } #[test] @@ -1408,7 +1476,7 @@ mod tests { meta.readers.insert(1, read_delegate); } - let d = reader.get_delegate(1).unwrap(); + let d = reader.local_reader.get_delegate(1).unwrap(); assert_eq!(&*d.region, ®ion); assert_eq!(d.term, 1); assert_eq!(d.applied_term, 1); @@ -1423,13 +1491,16 @@ mod tests { .unwrap() .update(Progress::region(region.clone())); } - assert_eq!(&*reader.get_delegate(1).unwrap().region, ®ion); + assert_eq!( + &*reader.local_reader.get_delegate(1).unwrap().region, + ®ion + ); { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(Progress::term(2)); } - assert_eq!(reader.get_delegate(1).unwrap().term, 2); + assert_eq!(reader.local_reader.get_delegate(1).unwrap().term, 2); { let mut meta = store_meta.lock().unwrap(); @@ -1438,7 +1509,7 @@ mod tests { .unwrap() .update(Progress::applied_term(2)); } - assert_eq!(reader.get_delegate(1).unwrap().applied_term, 2); + assert_eq!(reader.local_reader.get_delegate(1).unwrap().applied_term, 2); { let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
@@ -1447,7 +1518,7 @@ mod tests { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); } - let d = reader.get_delegate(1).unwrap(); + let d = reader.local_reader.get_delegate(1).unwrap(); assert_eq!(d.leader_lease.clone().unwrap().term(), 3); } @@ -1545,14 +1616,11 @@ mod tests { meta.readers.insert(1, read_delegate); } - let mut delegate = reader.get_delegate(region1.id).unwrap(); + let mut delegate = reader.local_reader.get_delegate(region1.id).unwrap(); let read_id = Some(ThreadReadId::new()); { - let mut read_context = Some(LocalReadContext { - snap_cache: &mut reader.snap_cache, - read_id: &mut reader.cache_read_id, - }); + let mut read_context = Some(reader.local_read_context()); for _ in 0..10 { // Different region id should reuse the cache @@ -1568,10 +1636,7 @@ mod tests { let read_id = Some(ThreadReadId::new()); { - let read_context = LocalReadContext { - snap_cache: &mut reader.snap_cache, - read_id: &mut reader.cache_read_id, - }; + let read_context = reader.local_read_context(); let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); } @@ -1582,10 +1647,7 @@ mod tests { ); { - let read_context = LocalReadContext { - snap_cache: &mut reader.snap_cache, - read_id: &mut reader.cache_read_id, - }; + let read_context = reader.local_read_context(); let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); // We can hit it again. assert_eq!( @@ -1596,10 +1658,7 @@ mod tests { reader.release_snapshot_cache(); { - let read_context = LocalReadContext { - snap_cache: &mut reader.snap_cache, - read_id: &mut reader.cache_read_id, - }; + let read_context = reader.local_read_context(); let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); } // After release, we will mss the cache even with the prevsiou read_id. 
@@ -1609,10 +1668,7 @@ mod tests { ); { - let read_context = LocalReadContext { - snap_cache: &mut reader.snap_cache, - read_id: &mut reader.cache_read_id, - }; + let read_context = reader.local_read_context(); let _ = delegate.get_snapshot(read_id, &mut Some(read_context)); } // We can hit it again. From be44dbabf13be6a037e58163cf291d3e6ffa9808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 12 Oct 2022 01:49:50 +0800 Subject: [PATCH 261/676] log-backup: remove checkpoint V2 from codebase (#13197) ref tikv/tikv#13196 Now, all integration test cases uses checkpoint V3. Removed `test_inflight_message` because it is invalid in V3. Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 103 +----------- components/backup-stream/src/endpoint.rs | 62 ++----- .../backup-stream/src/metadata/client.rs | 37 +---- components/backup-stream/src/metadata/keys.rs | 4 + components/backup-stream/src/metadata/test.rs | 76 +-------- .../backup-stream/src/subscription_manager.rs | 5 +- .../backup-stream/src/subscription_track.rs | 111 +------------ components/backup-stream/tests/mod.rs | 152 +++++++----------- src/config.rs | 4 - 9 files changed, 92 insertions(+), 462 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 2874d548c5a..4b80eb44a2f 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -11,11 +11,9 @@ use tikv_util::{info, worker::Scheduler}; use txn_types::TimeStamp; use crate::{ - errors::{ContextualResultExt, Error, Result}, + errors::{Error, Result}, metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, - metrics, - subscription_track::SubscriptionTracer, - try_send, RegionCheckpointOperation, Task, + metrics, try_send, RegionCheckpointOperation, Task, }; /// A manager for 
maintaining the last flush ts. @@ -221,119 +219,25 @@ impl FlushObserver for BasicFlushObserver { } } -pub struct CheckpointV2FlushObserver { - resolvers: SubscriptionTracer, - meta_cli: MetadataClient, - - fresh_regions: Vec, - checkpoints: Vec<(Region, TimeStamp)>, - can_advance: Option, - base: O, -} - -impl CheckpointV2FlushObserver { - pub fn new( - meta_cli: MetadataClient, - can_advance: F, - resolvers: SubscriptionTracer, - base: O, - ) -> Self { - Self { - resolvers, - meta_cli, - fresh_regions: vec![], - checkpoints: vec![], - can_advance: Some(can_advance), - base, - } - } -} - -#[async_trait::async_trait] -impl FlushObserver for CheckpointV2FlushObserver -where - S: MetaStore + 'static, - F: FnOnce() -> bool + Send + 'static, - O: FlushObserver, -{ - async fn before(&mut self, _checkpoints: Vec<(Region, TimeStamp)>) { - let fresh_regions = self.resolvers.collect_fresh_subs(); - let removal = self.resolvers.collect_removal_subs(); - let checkpoints = removal - .into_iter() - .map(|sub| (sub.meta, sub.resolver.resolved_ts())) - .collect::>(); - self.checkpoints = checkpoints; - self.fresh_regions = fresh_regions; - } - - async fn after(&mut self, task: &str, rts: u64) -> Result<()> { - if !self.can_advance.take().map(|f| f()).unwrap_or(true) { - let cp_now = self - .meta_cli - .get_local_task_checkpoint(task) - .await - .context(format_args!( - "during checking whether we should skip advancing ts to {}.", - rts - ))?; - // if we need to roll back checkpoint ts, don't prevent it. - if rts >= cp_now.into_inner() { - info!("skipping advance checkpoint."; "rts" => %rts, "old_rts" => %cp_now); - return Ok(()); - } - } - // Optionally upload the region checkpoint. - // Unless in some extreme condition, skipping upload the region checkpoint won't - // lead to data loss. 
- if let Err(err) = self - .meta_cli - .upload_region_checkpoint(task, &self.checkpoints) - .await - { - err.report("failed to upload region checkpoint"); - } - // we can advance the progress at next time. - // return early so we won't be mislead by the metrics. - self.meta_cli - .set_local_task_checkpoint(task, rts) - .await - .context(format_args!("on flushing task {}", task))?; - self.base.after(task, rts).await?; - self.meta_cli - .clear_region_checkpoint(task, &self.fresh_regions) - .await - .context(format_args!("on clearing the checkpoint for task {}", task))?; - Ok(()) - } -} - pub struct CheckpointV3FlushObserver { /// We should modify the rts (the local rts isn't right.) /// This should be a BasicFlushObserver or something likewise. baseline: O, sched: Scheduler, meta_cli: MetadataClient, - subs: SubscriptionTracer, checkpoints: Vec<(Region, TimeStamp)>, global_checkpoint_cache: HashMap, } impl CheckpointV3FlushObserver { - pub fn new( - sched: Scheduler, - meta_cli: MetadataClient, - subs: SubscriptionTracer, - baseline: O, - ) -> Self { + pub fn new(sched: Scheduler, meta_cli: MetadataClient, baseline: O) -> Self { Self { sched, meta_cli, checkpoints: vec![], // We almost always have only one entry. global_checkpoint_cache: HashMap::with_capacity(1), - subs, baseline, } } @@ -369,7 +273,6 @@ where } async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { - self.subs.update_status_for_v3(); let t = Task::RegionCheckpointsOp(RegionCheckpointOperation::Update(std::mem::take( &mut self.checkpoints, ))); diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index d463964558a..22a415ca6bb 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1,12 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::HashSet, - fmt, - marker::PhantomData, - path::PathBuf, - sync::{atomic::Ordering, Arc}, - time::Duration, + collections::HashSet, fmt, marker::PhantomData, path::PathBuf, sync::Arc, time::Duration, }; use concurrency_manager::ConcurrencyManager; @@ -46,8 +41,8 @@ use super::metrics::HANDLE_EVENT_DURATION_HISTOGRAM; use crate::{ annotate, checkpoint_manager::{ - BasicFlushObserver, CheckpointManager, CheckpointV2FlushObserver, - CheckpointV3FlushObserver, FlushObserver, GetCheckpointResult, RegionIdWithVersion, + BasicFlushObserver, CheckpointManager, CheckpointV3FlushObserver, FlushObserver, + GetCheckpointResult, RegionIdWithVersion, }, errors::{Error, Result}, event_loader::{InitialDataLoader, PendingMemoryQuota}, @@ -92,6 +87,9 @@ pub struct Endpoint { initial_scan_throughput_quota: Limiter, region_operator: RegionSubscriptionManager, failover_time: Option, + // We holds the config before, even it is useless for now, + // however probably it would be useful in the future. 
+ #[allow(dead_code)] config: BackupStreamConfig, checkpoint_mgr: CheckpointManager, } @@ -226,7 +224,7 @@ where let safepoint = meta_cli.global_progress_of_task(&task).await?; pdc.update_service_safe_point( safepoint_name, - TimeStamp::new(safepoint - 1), + TimeStamp::new(safepoint.saturating_sub(1)), safepoint_ttl, ) .await?; @@ -402,23 +400,9 @@ where } } - fn flush_observer(&self) -> Box { + fn flush_observer(&self) -> impl FlushObserver { let basic = BasicFlushObserver::new(self.pd_client.clone(), self.store_id); - if self.config.use_checkpoint_v3 { - Box::new(CheckpointV3FlushObserver::new( - self.scheduler.clone(), - self.meta_client.clone(), - self.subs.clone(), - basic, - )) - } else { - Box::new(CheckpointV2FlushObserver::new( - self.meta_client.clone(), - self.make_flush_guard(), - self.subs.clone(), - basic, - )) - } + CheckpointV3FlushObserver::new(self.scheduler.clone(), self.meta_client.clone(), basic) } /// Convert a batch of events to the cmd batch, and update the resolver @@ -574,7 +558,6 @@ where let cli = self.meta_client.clone(); let init = self.make_initial_loader(); let range_router = self.range_router.clone(); - let use_v3 = self.config.use_checkpoint_v3; info!( "register backup stream task"; @@ -598,9 +581,7 @@ where let task_clone = task.clone(); let run = async move { let task_name = task.info.get_name(); - if !use_v3 { - cli.init_task(&task.info).await?; - } + cli.init_task(&task.info).await?; let ranges = cli.ranges_of_task(task_name).await?; info!( "register backup stream ranges"; @@ -704,29 +685,6 @@ where self.pool.block_on(router.unregister_task(task)) } - /// Make a guard for checking whether we can flush the checkpoint ts. 
- fn make_flush_guard(&self) -> impl FnOnce() -> bool + Send { - let failover = self.failover_time; - let flush_duration = self.config.max_flush_interval; - move || { - if failover - .as_ref() - .map(|failover_t| failover_t.saturating_elapsed() < flush_duration.0 * 2) - .unwrap_or(false) - { - warn!("during failover, skipping advancing resolved ts"; - "failover_time_ago" => ?failover.map(|failover_t| failover_t.saturating_elapsed())); - return false; - } - let in_flight = crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.load(Ordering::SeqCst); - if in_flight > 0 { - warn!("inflight leader detected, skipping advancing resolved ts"; "in_flight" => %in_flight); - return false; - } - true - } - } - fn prepare_min_ts(&self) -> future![TimeStamp] { let pd_cli = self.pd_client.clone(); let cm = self.concurrency_manager.clone(); diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index e92addd2992..2ebf553e1cb 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path, time::Duration}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path}; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, @@ -13,8 +13,8 @@ use txn_types::TimeStamp; use super::{ keys::{self, KeyValue, MetaKey}, store::{ - CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, PutOption, - Snapshot, Subscription, Transaction, WithRevision, + CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, + Subscription, Transaction, WithRevision, }, }; use crate::{ @@ -671,37 +671,6 @@ impl MetadataClient { .await } - /// upload a region-level checkpoint. 
- pub async fn upload_region_checkpoint( - &self, - task_name: &str, - checkpoints: &[(Region, TimeStamp)], - ) -> Result<()> { - let txn = checkpoints - .iter() - .fold(Transaction::default(), |txn, (region, cp)| { - txn.put_opt( - KeyValue( - MetaKey::next_bakcup_ts_of_region(task_name, region), - (*cp).into_inner().to_be_bytes().to_vec(), - ), - PutOption { - ttl: Duration::from_secs(600), - }, - ) - }); - self.meta_store.txn(txn).await - } - - pub async fn clear_region_checkpoint(&self, task_name: &str, regions: &[Region]) -> Result<()> { - let txn = regions.iter().fold(Transaction::default(), |txn, region| { - txn.delete(Keys::Key(MetaKey::next_bakcup_ts_of_region( - task_name, region, - ))) - }); - self.meta_store.txn(txn).await - } - pub async fn global_checkpoint_of(&self, task: &str) -> Result> { let cps = self.checkpoints_of(task).await?; let mut min_checkpoint = None; diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index 32962ec36b0..f7a2c960ec4 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -162,6 +162,10 @@ impl MetaKey { Self(format!("{}{}/{}/{}", PREFIX, PATH_LAST_ERROR, name, store).into_bytes()) } + pub fn central_global_checkpoint_of(name: &str) -> Self { + Self(format!("{}/checkpoint/{}/central_global", PREFIX, name).into_bytes()) + } + /// return the key that keeps the range [self, self.next()) contains only /// `self`. 
pub fn next(&self) -> Self { diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index b9fb965033a..ec2a30efbf3 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -7,21 +7,13 @@ use std::{ iter::FromIterator, }; -use kvproto::{ - brpb::{Noop, StorageBackend}, - metapb::Region, -}; +use kvproto::brpb::{Noop, StorageBackend}; use tokio_stream::StreamExt; -use txn_types::TimeStamp; use super::{keys::MetaKey, MetadataClient, StreamTask}; use crate::{ errors::Result, - metadata::{ - client::{Checkpoint, CheckpointProvider}, - store::SlashEtcStore, - MetadataEvent, - }, + metadata::{store::SlashEtcStore, MetadataEvent}, }; fn test_meta_cli() -> MetadataClient { @@ -99,12 +91,6 @@ fn task_matches(expected: &[StreamTask], real: &[StreamTask]) { ); } -fn fake_region(id: u64) -> Region { - let mut r = Region::new(); - r.set_id(id); - r -} - #[tokio::test] async fn test_watch() -> Result<()> { let cli = test_meta_cli(); @@ -197,61 +183,3 @@ async fn test_init() -> Result<()> { Ok(()) } - -#[tokio::test] -async fn test_region_checkpoint() -> Result<()> { - let cli = test_meta_cli(); - let task = simple_task("simple_2"); - cli.insert_task_with_range(&task, &[]).await?; - - let cps = [ - (fake_region(1), TimeStamp::new(42)), - (fake_region(2), TimeStamp::new(64)), - ]; - cli.upload_region_checkpoint("simple_2", &cps).await?; - cli.set_local_task_checkpoint("simple_2", 50).await?; - - let rcp = cli - .get_region_checkpoint("simple_2", &fake_region(1)) - .await?; - assert_eq!( - rcp, - Checkpoint { - provider: CheckpointProvider::Region { id: 1, version: 0 }, - ts: TimeStamp::new(42) - } - ); - let gcp = cli - .get_region_checkpoint("simple_2", &fake_region(3)) - .await?; - assert_eq!( - gcp, - Checkpoint { - provider: CheckpointProvider::Store(42), - ts: TimeStamp::new(50) - } - ); - cli.clear_region_checkpoint("simple_2", &[fake_region(1)]) - .await?; - let rcp = 
cli - .get_region_checkpoint("simple_2", &fake_region(2)) - .await?; - assert_eq!( - rcp, - Checkpoint { - provider: CheckpointProvider::Region { id: 2, version: 0 }, - ts: TimeStamp::new(64) - } - ); - let gcp = cli - .get_region_checkpoint("simple_2", &fake_region(1)) - .await?; - assert_eq!( - gcp, - Checkpoint { - provider: CheckpointProvider::Store(42), - ts: TimeStamp::new(50) - } - ); - Ok(()) -} diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 751f41ee587..d47974bcd42 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -403,7 +403,7 @@ where self.subs.deregister_region_if(region, |_, _| true); } ObserveOp::Destroy { ref region } => { - let stopped = self.subs.deregister_region_if(region, |old, new| { + self.subs.deregister_region_if(region, |old, new| { raftstore::store::util::compare_region_epoch( old.meta.get_region_epoch(), new, @@ -414,9 +414,6 @@ where .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) .is_ok() }); - if stopped { - self.subs.destroy_stopped_region(region.get_id()); - } } ObserveOp::RefreshResolver { ref region } => self.refresh_resolver(region).await, ObserveOp::NotifyFailToStartObserve { diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 4120a71e4ee..50c3c6c1143 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -15,22 +15,10 @@ use crate::{debug, metrics::TRACK_REGION, utils}; #[derive(Clone, Default, Debug)] pub struct SubscriptionTracer(Arc>); -#[derive(Debug, PartialEq, Clone, Copy)] -pub enum SubscriptionState { - /// When it is newly added (maybe after split or leader transfered from - /// other store), without any flush. - Fresh, - /// It has been flushed, and running normally. 
- Normal, - /// It has been moved to other store. - Removal, -} - pub struct RegionSubscription { pub meta: Region, pub(crate) handle: ObserveHandle, pub(crate) resolver: TwoPhaseResolver, - state: SubscriptionState, } impl std::fmt::Debug for RegionSubscription { @@ -43,32 +31,17 @@ impl std::fmt::Debug for RegionSubscription { } impl RegionSubscription { - /// move self out. - fn take(&mut self) -> Self { - Self { - meta: self.meta.clone(), - handle: self.handle.clone(), - resolver: std::mem::replace(&mut self.resolver, TwoPhaseResolver::new(0, None)), - state: self.state, - } - } - pub fn new(region: Region, handle: ObserveHandle, start_ts: Option) -> Self { let resolver = TwoPhaseResolver::new(region.get_id(), start_ts); Self { handle, meta: region, resolver, - state: SubscriptionState::Fresh, } } pub fn stop(&mut self) { - if self.state == SubscriptionState::Removal { - return; - } self.handle.stop_observing(); - self.state = SubscriptionState::Removal; } pub fn is_observing(&self) -> bool { @@ -111,10 +84,7 @@ impl SubscriptionTracer { region.get_id(), RegionSubscription::new(region.clone(), handle, start_ts), ) { - if o.state != SubscriptionState::Removal { - TRACK_REGION.dec(); - warn!("register region which is already registered"; "region_id" => %region.get_id()); - } + TRACK_REGION.dec(); o.stop(); } } @@ -125,7 +95,6 @@ impl SubscriptionTracer { self.0 .iter_mut() // Don't advance the checkpoint ts of removed region. - .filter(|s| s.state != SubscriptionState::Removal) .map(|mut s| (s.meta.clone(), s.resolver.resolve(min_ts))) .collect() } @@ -150,12 +119,6 @@ impl SubscriptionTracer { } } - /// destroy subscription if the subscription is stopped. - pub fn destroy_stopped_region(&self, region_id: u64) { - self.0 - .remove_if(®ion_id, |_, sub| sub.state == SubscriptionState::Removal); - } - /// try to mark a region no longer be tracked by this observer. /// returns whether success (it failed if the region hasn't been observed /// when calling this.) 
@@ -165,27 +128,13 @@ impl SubscriptionTracer { if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool, ) -> bool { let region_id = region.get_id(); - let remove_result = self.0.get_mut(®ion_id); + let remove_result = self.0.remove(®ion_id); match remove_result { - Some(mut o) => { - // If the state is 'removal', we should act as if the region subscription - // has been removed: the callback should not be called because somebody may - // use this method to check whether a key exists: - // ``` - // let mut present = false; - // deregister_region_if(42, |..| { - // present = true; - // }); - // ``` - // At that time, if we call the callback with stale value, the called may get - // false positive. - if o.state == SubscriptionState::Removal { - return false; - } - if if_cond(o.value(), region) { + Some((_, mut v)) => { + if if_cond(&v, region) { TRACK_REGION.dec(); - o.value_mut().stop(); - info!("stop listen stream from store"; "observer" => ?o.value(), "region_id"=> %region_id); + v.stop(); + info!("stop listen stream from store"; "observer" => ?v, "region_id"=> %region_id); return true; } false @@ -224,54 +173,11 @@ impl SubscriptionTracer { false } - /// Remove and collect the subscriptions have been marked as removed. - pub fn collect_removal_subs(&self) -> Vec { - let mut result = vec![]; - self.0.retain(|_k, v| { - if v.state == SubscriptionState::Removal { - result.push(v.take()); - false - } else { - true - } - }); - result - } - - /// Collect the fresh subscriptions, and mark them as Normal. - pub fn collect_fresh_subs(&self) -> Vec { - self.0 - .iter_mut() - .filter_map(|mut s| { - let v = s.value_mut(); - if v.state == SubscriptionState::Fresh { - v.state = SubscriptionState::Normal; - Some(v.meta.clone()) - } else { - None - } - }) - .collect() - } - - /// Remove all "Removal" entries. - /// Set all "Fresh" entries to "Normal". 
- pub fn update_status_for_v3(&self) { - self.0.retain(|_k, v| match v.state { - SubscriptionState::Fresh => { - v.state = SubscriptionState::Normal; - true - } - SubscriptionState::Normal => true, - SubscriptionState::Removal => false, - }) - } - /// check whether the region_id should be observed by this observer. pub fn is_observing(&self, region_id: u64) -> bool { let sub = self.0.get_mut(®ion_id); match sub { - Some(mut sub) if !sub.is_observing() || sub.state == SubscriptionState::Removal => { + Some(mut sub) if !sub.is_observing() => { sub.value_mut().stop(); false } @@ -538,8 +444,5 @@ mod test { (region(4, 8, 1), TimeStamp::new(128)), ] ); - let removal = subs.collect_removal_subs(); - assert_eq!(removal.len(), 1); - assert_eq!(removal[0].meta.get_id(), 5); } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 5b53c040582..6e902fb1e08 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -13,12 +13,13 @@ use async_compression::futures::write::ZstdDecoder; use backup_stream::{ errors::Result, metadata::{ + keys::{KeyValue, MetaKey}, store::{MetaStore, SlashEtcStore}, MetadataClient, StreamTask, }, observer::BackupStreamObserver, router::Router, - Endpoint, Task, + Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; use futures::{executor::block_on, AsyncWriteExt, Future}; use grpcio::ChannelBuilder; @@ -98,7 +99,6 @@ struct ErrorStore { pub struct SuiteBuilder { name: String, nodes: usize, - use_v3: bool, metastore_error: Box Result<()> + Send + Sync>, } @@ -107,16 +107,10 @@ impl SuiteBuilder { Self { name: s.to_owned(), nodes: 4, - use_v3: false, metastore_error: Box::new(|_| Ok(())), } } - pub fn use_v3(mut self) -> Self { - self.use_v3 = true; - self - } - pub fn nodes(mut self, n: usize) -> Self { self.nodes = n; self @@ -134,7 +128,6 @@ impl SuiteBuilder { let Self { name: case, nodes: n, - use_v3, metastore_error, } = self; @@ -162,7 
+155,7 @@ impl SuiteBuilder { } suite.cluster.run(); for id in 1..=(n as u64) { - suite.start_endpoint(id, use_v3); + suite.start_endpoint(id); } // We must wait until the endpoints get ready to watching the metastore, or some // modifies may be lost. Either make Endpoint::with_client wait until watch did @@ -254,7 +247,7 @@ impl Suite { worker } - fn start_endpoint(&mut self, id: u64, use_v3: bool) { + fn start_endpoint(&mut self, id: u64) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); let sim = cluster.sim.wl(); @@ -263,7 +256,6 @@ impl Suite { let regions = sim.region_info_accessors.get(&id).unwrap().clone(); let mut cfg = BackupStreamConfig::default(); cfg.enable = true; - cfg.use_checkpoint_v3 = use_v3; cfg.temp_path = format!("/{}/{}", self.temp_files.path().display(), id); let ob = self.obs.get(&id).unwrap().clone(); let endpoint = Endpoint::new( @@ -303,6 +295,44 @@ impl Suite { self.wait_with(move |r| block_on(r.get_task_info(&name)).is_ok()) } + /// This function tries to calculate the global checkpoint from the flush + /// status of nodes. + /// + /// NOTE: this won't check the region consistency for now, the checkpoint + /// may be weaker than expected. + fn global_checkpoint(&self) -> u64 { + let (tx, rx) = std::sync::mpsc::channel(); + self.run(|| { + let tx = tx.clone(); + Task::RegionCheckpointsOp(RegionCheckpointOperation::Get( + RegionSet::Universal, + Box::new(move |rs| rs.into_iter().for_each(|x| tx.send(x).unwrap())), + )) + }); + drop(tx); + + rx.into_iter() + .map(|r| match r { + GetCheckpointResult::Ok { checkpoint, .. } => checkpoint.into_inner(), + GetCheckpointResult::NotFound { .. } + | GetCheckpointResult::EpochNotMatch { .. 
} => { + unreachable!() + } + }) + .min() + .unwrap_or(0) + } + + async fn advance_global_checkpoint(&self, task: &str) -> Result<()> { + let cp = self.global_checkpoint(); + self.meta_store + .set(KeyValue( + MetaKey::central_global_checkpoint_of(task), + cp.to_be_bytes().to_vec(), + )) + .await + } + async fn write_records(&mut self, from: usize, n: usize, for_table: i64) -> HashSet> { let mut inserted = HashSet::default(); for ts in (from..(from + n)).map(|x| x * 2) { @@ -696,8 +726,8 @@ mod test { use std::time::{Duration, Instant}; use backup_stream::{ - errors::Error, metadata::MetadataClient, router::TaskSelector, GetCheckpointResult, - RegionCheckpointOperation, RegionSet, Task, + errors::Error, router::TaskSelector, GetCheckpointResult, RegionCheckpointOperation, + RegionSet, Task, }; use pd_client::PdClient; use tikv_util::{box_err, defer, info, HandyRwLock}; @@ -709,7 +739,7 @@ mod test { #[test] fn basic() { - let mut suite = super::SuiteBuilder::new_named("basic").use_v3().build(); + let mut suite = super::SuiteBuilder::new_named("basic").build(); fail::cfg("try_start_observe", "1*return").unwrap(); run_async_test(async { @@ -732,9 +762,7 @@ mod test { #[test] fn with_split() { - let mut suite = super::SuiteBuilder::new_named("with_split") - .use_v3() - .build(); + let mut suite = super::SuiteBuilder::new_named("with_split").build(); run_async_test(async { let round1 = suite.write_records(0, 128, 1).await; suite.must_split(&make_split_key_at_record(1, 42)); @@ -769,7 +797,7 @@ mod test { /// scanning get the snapshot. 
#[test] fn with_split_txn() { - let mut suite = super::SuiteBuilder::new_named("split_txn").use_v3().build(); + let mut suite = super::SuiteBuilder::new_named("split_txn").build(); run_async_test(async { let start_ts = suite.cluster.pd_client.get_tso().await.unwrap(); let keys = (1..1960).map(|i| make_record_key(1, i)).collect::>(); @@ -809,9 +837,7 @@ mod test { #[test] /// This case tests whether the backup can continue when the leader failes. fn leader_down() { - let mut suite = super::SuiteBuilder::new_named("leader_down") - .use_v3() - .build(); + let mut suite = super::SuiteBuilder::new_named("leader_down").build(); suite.must_register_task(1, "test_leader_down"); suite.sync(); let round1 = run_async_test(suite.write_records(0, 128, 1)); @@ -842,20 +868,11 @@ mod test { suite.write_records(258, 128, 1).await; suite.force_flush_files("test_async_commit"); std::thread::sleep(Duration::from_secs(4)); - let cli = MetadataClient::new(suite.meta_store.clone(), 1); - assert_eq!( - cli.global_progress_of_task("test_async_commit") - .await - .unwrap(), - 256 - ); + assert_eq!(suite.global_checkpoint(), 256); suite.just_commit_a_key(make_record_key(1, 256), TimeStamp::new(256), ts); suite.force_flush_files("test_async_commit"); suite.wait_for_flush(); - let cp = cli - .global_progress_of_task("test_async_commit") - .await - .unwrap(); + let cp = suite.global_checkpoint(); assert!(cp > 256, "it is {:?}", cp); }); suite.cluster.shutdown(); @@ -871,6 +888,7 @@ mod test { run_async_test(suite.write_records(0, 1, 1)); suite.force_flush_files("test_fatal_error"); suite.wait_for_flush(); + run_async_test(suite.advance_global_checkpoint("test_fatal_error")).unwrap(); let (victim, endpoint) = suite.endpoints.iter().next().unwrap(); endpoint .scheduler() @@ -879,24 +897,23 @@ mod test { Box::new(Error::Other(box_err!("everything is alright"))), )) .unwrap(); - let meta_cli = suite.get_meta_cli(); suite.sync(); - let err = 
run_async_test(meta_cli.get_last_error("test_fatal_error", *victim)) - .unwrap() - .unwrap(); + let err = run_async_test( + suite + .get_meta_cli() + .get_last_error("test_fatal_error", *victim), + ) + .unwrap() + .unwrap(); info!("err"; "err" => ?err); assert_eq!(err.error_code, error_code::backup_stream::OTHER.code); assert!(err.error_message.contains("everything is alright")); assert_eq!(err.store_id, *victim); - let paused = run_async_test(meta_cli.check_task_paused("test_fatal_error")).unwrap(); + let paused = + run_async_test(suite.get_meta_cli().check_task_paused("test_fatal_error")).unwrap(); assert!(paused); let safepoints = suite.cluster.pd_client.gc_safepoints.rl(); - let checkpoint = run_async_test( - suite - .get_meta_cli() - .global_progress_of_task("test_fatal_error"), - ) - .unwrap(); + let checkpoint = suite.global_checkpoint(); assert!( safepoints.iter().any(|sp| { @@ -909,55 +926,10 @@ mod test { ); } - #[test] - fn inflight_messages() { - // We should remove the failpoints when paniked or we may get stucked. - defer! {{ - fail::remove("delay_on_start_observe"); - fail::remove("delay_on_flush"); - }} - let mut suite = super::SuiteBuilder::new_named("inflight_message") - .nodes(3) - .build(); - suite.must_register_task(1, "inflight_message"); - run_async_test(suite.write_records(0, 128, 1)); - fail::cfg("delay_on_flush", "pause").unwrap(); - suite.force_flush_files("inflight_message"); - fail::cfg("delay_on_start_observe", "pause").unwrap(); - suite.must_shuffle_leader(1); - // Handling the `StartObserve` message and doing flush are executed - // asynchronously. Make a delay of unblocking flush thread for make sure - // we have handled the `StartObserve`. 
- std::thread::sleep(Duration::from_secs(1)); - fail::cfg("delay_on_flush", "off").unwrap(); - suite.wait_for_flush(); - let checkpoint = run_async_test( - suite - .get_meta_cli() - .global_progress_of_task("inflight_message"), - ); - fail::cfg("delay_on_start_observe", "off").unwrap(); - // The checkpoint should not advance if there are inflight messages. - assert_eq!(checkpoint.unwrap(), 0); - run_async_test(suite.write_records(256, 128, 1)); - suite.force_flush_files("inflight_message"); - suite.wait_for_flush(); - let checkpoint = run_async_test( - suite - .get_meta_cli() - .global_progress_of_task("inflight_message"), - ) - .unwrap(); - // The checkpoint should be advanced as expected when the inflight message has - // been consumed. - assert!(checkpoint > 512, "checkpoint = {}", checkpoint); - } - #[test] fn region_checkpoint_info() { let mut suite = super::SuiteBuilder::new_named("checkpoint_info") .nodes(1) - .use_v3() .build(); suite.must_register_task(1, "checkpoint_info"); suite.must_split(&make_split_key_at_record(1, 42)); @@ -1070,7 +1042,6 @@ mod test { let mut suite = SuiteBuilder::new_named("fail_to_refresh_region") .nodes(1) - .use_v3() .build(); suite.must_register_task(1, "fail_to_refresh_region"); @@ -1131,6 +1102,7 @@ mod test { suite.force_flush_files("pessimistic_lock"); suite.wait_for_flush(); std::thread::sleep(Duration::from_secs(1)); + run_async_test(suite.advance_global_checkpoint("pessimistic_lock")).unwrap(); let checkpoint = run_async_test( suite .get_meta_cli() diff --git a/src/config.rs b/src/config.rs index f4fbf17a38f..68193fe0ba9 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2497,9 +2497,6 @@ pub struct BackupStreamConfig { pub initial_scan_pending_memory_quota: ReadableSize, #[online_config(skip)] pub initial_scan_rate_limit: ReadableSize, - #[serde(skip)] - #[online_config(skip)] - pub use_checkpoint_v3: bool, } impl BackupStreamConfig { @@ -2532,7 +2529,6 @@ impl Default for BackupStreamConfig { file_size_limit: 
ReadableSize::mb(256), initial_scan_pending_memory_quota: ReadableSize(quota_size as _), initial_scan_rate_limit: ReadableSize::mb(60), - use_checkpoint_v3: true, } } } From f702db210210f852962f2a96087839b4fab01a04 Mon Sep 17 00:00:00 2001 From: Zwb Date: Wed, 12 Oct 2022 12:59:50 +0800 Subject: [PATCH 262/676] trace peers' availability info on leader side (#13209) ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/config.rs | 8 +++ components/raftstore/src/store/fsm/peer.rs | 65 +++++++++++++++++++++ components/raftstore/src/store/fsm/store.rs | 2 + components/raftstore/src/store/msg.rs | 3 + components/raftstore/src/store/peer.rs | 37 ++++++++++++ components/raftstore/src/store/worker/pd.rs | 1 + src/server/service/kv.rs | 1 + tests/integrations/config/mod.rs | 1 + 9 files changed, 119 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 14620ebb6d1..7de5b6975f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2663,7 +2663,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#f6d05706948aa296cff4db060e0962d3720f32eb" +source = "git+https://github.com/pingcap/kvproto.git#43b4391f08e72aa7c86e9a86ab62d084f3633cc0" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 8052a58dea8..a5e84aa8501 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -303,6 +303,12 @@ pub struct Config { pub max_snapshot_file_raw_size: ReadableSize, pub unreachable_backoff: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check peers availability info. 
+ pub check_peers_availability_interval: ReadableDuration, } impl Default for Config { @@ -407,6 +413,8 @@ impl Default for Config { report_region_buckets_tick_interval: ReadableDuration::secs(10), max_snapshot_file_raw_size: ReadableSize::mb(100), unreachable_backoff: ReadableDuration::secs(10), + // TODO: make its value reasonable + check_peers_availability_interval: ReadableDuration::secs(30), } } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e9a50d54db2..311258e72ff 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1217,6 +1217,7 @@ where PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), + PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), } } @@ -2627,6 +2628,42 @@ where self.fsm.hibernate_state.count_vote(from.get_id()); } + fn on_availability_response(&mut self, from: &metapb::Peer, msg: &ExtraMessage) { + if !self.fsm.peer.is_leader() { + return; + } + if !msg.wait_data { + self.fsm + .peer + .wait_data_peers + .retain(|id| *id != from.get_id()); + debug!( + "receive peer ready info"; + "peer_id" => self.fsm.peer.peer.get_id(), + ); + return; + } + self.register_check_peers_availability_tick(); + } + + fn on_availability_request(&mut self, from: &metapb::Peer) { + if self.fsm.peer.is_leader() { + return; + } + let mut resp = ExtraMessage::default(); + resp.set_type(ExtraMessageType::MsgAvailabilityResponse); + resp.wait_data = self.fsm.peer.wait_data; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "peer responses availability info to leader"; + "region_id" => self.region().get_id(), + "peer_id" => self.fsm.peer.peer.get_id(), + "leader_id" => from.id, + ); + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { 
match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { @@ -2660,6 +2697,12 @@ where ExtraMessageType::MsgRejectRaftLogCausedByMemoryUsage => { unimplemented!() } + ExtraMessageType::MsgAvailabilityRequest => { + self.on_availability_request(msg.get_from_peer()); + } + ExtraMessageType::MsgAvailabilityResponse => { + self.on_availability_response(msg.get_from_peer(), msg.get_extra_msg()); + } } } @@ -3209,6 +3252,7 @@ where ); } else { self.fsm.peer.transfer_leader(&from); + self.fsm.peer.wait_data_peers.clear(); } } } @@ -3660,6 +3704,7 @@ where .peer .peers_start_pending_time .retain(|&(p, _)| p != peer_id); + self.fsm.peer.wait_data_peers.retain(|id| *id != peer_id); } self.fsm.peer.remove_peer_from_cache(peer_id); // We only care remove itself now. @@ -5858,6 +5903,26 @@ where self.schedule_tick(PeerTick::PdHeartbeat) } + fn register_check_peers_availability_tick(&mut self) { + fail_point!("ignore schedule check peers availability tick", |_| {}); + self.schedule_tick(PeerTick::CheckPeersAvailability) + } + + fn on_check_peers_availability(&mut self) { + for peer_id in self.fsm.peer.wait_data_peers.iter() { + let peer = self.fsm.peer.get_peer_from_cache(*peer_id).unwrap(); + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityRequest); + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &peer); + debug!( + "check peer availability"; + "target peer id" => *peer_id, + ); + } + } + fn on_check_peer_stale_state_tick(&mut self) { if self.fsm.peer.pending_remove { return; diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index d53270c2ef0..c83309011ac 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -558,6 +558,8 @@ where self.cfg.report_region_buckets_tick_interval.0; self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = 
self.cfg.check_long_uncommitted_interval.0; + self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = + self.cfg.check_peers_availability_interval.0; } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 991a89e7147..93c691fb241 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -376,6 +376,7 @@ pub enum PeerTick { ReactivateMemoryLock = 8, ReportBuckets = 9, CheckLongUncommitted = 10, + CheckPeersAvailability = 11, } impl PeerTick { @@ -395,6 +396,7 @@ impl PeerTick { PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", + PeerTick::CheckPeersAvailability => "check_peers_availability", } } @@ -411,6 +413,7 @@ impl PeerTick { PeerTick::ReactivateMemoryLock, PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, + PeerTick::CheckPeersAvailability, ]; TICKS } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index aca4db04fd5..2d3fea79378 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -888,6 +888,8 @@ where peer_cache: RefCell>, /// Record the last instant of each peer's heartbeat response. pub peer_heartbeats: HashMap, + /// Record the waiting data status of each follower or learner peer. + pub wait_data_peers: Vec, proposals: ProposalQueue>, leader_missing_time: Option, @@ -910,6 +912,13 @@ where /// target peer. /// - all read requests must be rejected. pub pending_remove: bool, + /// Currently it's used to indicate whether the witness -> non-witess + /// convertion operation is complete. 
The meaning of completion is that + /// this peer must contain the applied data, then PD can consider that + /// the conversion operation is complete, and can continue to schedule + /// other operators to prevent the existence of multiple witnesses in + /// the same time period. + pub wait_data: bool, /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader @@ -1112,6 +1121,7 @@ where long_uncommitted_threshold: cfg.long_uncommitted_base_threshold.0, peer_cache: RefCell::new(HashMap::default()), peer_heartbeats: HashMap::default(), + wait_data_peers: Vec::default(), peers_start_pending_time: vec![], down_peer_ids: vec![], size_diff_hint: 0, @@ -1122,6 +1132,7 @@ where compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, + wait_data: false, should_wake_up: false, force_leader: None, pending_merge_state: None, @@ -2005,6 +2016,7 @@ where if !self.is_leader() { self.peer_heartbeats.clear(); self.peers_start_pending_time.clear(); + self.wait_data_peers.clear(); return; } @@ -2564,6 +2576,7 @@ where // Update apply index to `last_applying_idx` self.read_progress .update_applied(self.last_applying_idx, &ctx.coprocessor_host); + self.notify_leader_the_peer_is_available(ctx); } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. 
@@ -2580,6 +2593,29 @@ where true } + fn notify_leader_the_peer_is_available( + &mut self, + ctx: &mut PollContext, + ) { + if self.wait_data { + self.wait_data = false; + fail_point!("ignore notify leader the peer is available", |_| {}); + let leader_id = self.leader_id(); + let leader = self.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityResponse); + msg.wait_data = false; + self.send_extra_message(msg, &mut ctx.trans, &leader); + info!( + "notify leader the peer is available"; + "region id" => self.region().get_id(), + "peer id" => self.peer.id + ); + } + } + } + pub fn handle_raft_ready_append( &mut self, ctx: &mut PollContext, @@ -5221,6 +5257,7 @@ where approximate_size: self.approximate_size, approximate_keys: self.approximate_keys, replication_status: self.region_replication_status(), + wait_data_peers: self.wait_data_peers.clone(), }); if let Err(e) = ctx.pd_scheduler.schedule(task) { error!( diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index f3518f4f674..ec06d756fe9 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -120,6 +120,7 @@ pub struct HeartbeatTask { pub approximate_size: Option, pub approximate_keys: Option, pub replication_status: Option, + pub wait_data_peers: Vec, } /// Uses an asynchronous thread to tell PD something.
diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 35deb7e4107..1beab4f0dc6 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1375,6 +1375,7 @@ fn handle_batch_commands_request< response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name, source); })* Some(batch_commands_request::request::Cmd::Import(_)) => unimplemented!(), + Some(batch_commands_request::request::Cmd::PrepareFlashbackToVersion(_)) => unimplemented!(), } } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index d16fe3b39f6..d0eac27e3b1 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -250,6 +250,7 @@ fn test_serde_custom_tikv_config() { long_uncommitted_base_threshold: ReadableDuration::secs(1), max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), + check_peers_availability_interval: ReadableDuration::secs(30), }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { From c286b8a0a62115beba99f13ccd3db2529debcd18 Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 12 Oct 2022 16:31:52 +0800 Subject: [PATCH 263/676] *: Add a Raft admin command to put the region into a locking flashback state. (#13541) ref tikv/tikv#13303, ref tikv/tikv#13519 Add a Raft admin command to put the region into a lock state to prevent any reading, writing, and scheduling and persist the state in the RegionLocalState. 
Signed-off-by: husharp --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 151 +++++++++- components/raftstore/src/store/fsm/peer.rs | 62 +--- components/raftstore/src/store/msg.rs | 4 +- components/raftstore/src/store/peer.rs | 51 +--- components/raftstore/src/store/util.rs | 24 +- components/test_raftstore/src/cluster.rs | 59 ++-- src/server/metrics.rs | 1 + src/server/service/kv.rs | 159 ++++++++-- .../integrations/raftstore/test_flashback.rs | 275 ++++++++++++++---- 10 files changed, 586 insertions(+), 202 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7de5b6975f3..cee27c1494d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2663,7 +2663,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#43b4391f08e72aa7c86e9a86ab62d084f3633cc0" +source = "git+https://github.com/pingcap/kvproto.git#4c6f1502851ed55b3ed023d180b6b10766446630" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a84a60183b6..5fb5754b116 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -87,10 +87,9 @@ use crate::{ msg::{Callback, ErrorCallback, PeerMsg, ReadResponse, SignificantMsg}, peer::Peer, peer_storage::{write_initial_apply_state, write_peer_state}, - util, util::{ - admin_cmd_epoch_lookup, check_region_epoch, compare_region_epoch, ChangePeerI, - ConfChangeKind, KeysInfoFormatter, LatencyInspector, + self, admin_cmd_epoch_lookup, check_flashback_state, check_region_epoch, + compare_region_epoch, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, }, Config, RegionSnapshot, RegionTask, WriteCallback, }, @@ -277,6 +276,9 @@ pub enum ExecResult { TransferLeader { term: u64, }, + SetFlashbackState { + region: Region, + }, } /// The possible returned value when applying logs. 
@@ -1342,6 +1344,12 @@ where "peer_id" => self.id(), "err" => ?e ), + Error::FlashbackInProgress(..) => debug!( + "flashback is in process"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), _ => error!(?e; "execute raft command"; "region_id" => self.region_id(), @@ -1368,6 +1376,7 @@ where ExecResult::CommitMerge { ref region, .. } => (Some(region.clone()), None), ExecResult::RollbackMerge { ref region, .. } => (Some(region.clone()), None), ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), + ExecResult::SetFlashbackState { region } => (Some(region.clone()), None), _ => (None, None), }, _ => (None, None), @@ -1432,6 +1441,9 @@ where self.region = region.clone(); self.is_merging = false; } + ExecResult::SetFlashbackState { ref region } => { + self.region = region.clone(); + } } } if let Some(epoch) = origin_epoch { @@ -1510,6 +1522,7 @@ where let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_region_epoch(req, &self.region, include_region)?; + check_flashback_state(req, &self.region)?; if req.has_admin_request() { self.exec_admin_cmd(ctx, req) } else { @@ -1548,6 +1561,9 @@ where AdminCmdType::PrepareMerge => self.exec_prepare_merge(ctx, request), AdminCmdType::CommitMerge => self.exec_commit_merge(ctx, request), AdminCmdType::RollbackMerge => self.exec_rollback_merge(ctx, request), + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + self.exec_flashback(ctx, request) + } AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); @@ -2792,6 +2808,41 @@ where )) } + fn exec_flashback( + &self, + ctx: &mut ApplyContext, + req: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + let region_id = self.region_id(); + let region_state_key = keys::region_state_key(region_id); + let mut old_state = match ctx + .engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + { + Ok(Some(s)) => s, + _ => { 
+ return Err(box_err!("failed to get region state of {}", region_id)); + } + }; + let is_in_flashback = req.get_cmd_type() == AdminCmdType::PrepareFlashback; + old_state.mut_region().set_is_in_flashback(is_in_flashback); + let mut region = self.region.clone(); + region.set_is_in_flashback(is_in_flashback); + ctx.kv_wb_mut() + .put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &old_state) + .unwrap_or_else(|e| { + error!( + "{} failed to change flashback state to {:?} for region {}: {:?}", + self.tag, req, region_id, e + ) + }); + + Ok(( + AdminResponse::default(), + ApplyResult::Res(ExecResult::SetFlashbackState { region }), + )) + } + fn exec_compact_log( &mut self, req: &AdminRequest, @@ -4439,7 +4490,7 @@ mod tests { use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; - use engine_traits::{Peekable as PeekableTrait, WriteBatchExt}; + use engine_traits::{Peekable as PeekableTrait, SyncMutable, WriteBatchExt}; use kvproto::{ kvrpcpb::ApiVersion, metapb::{self, RegionEpoch}, @@ -4454,6 +4505,7 @@ mod tests { store::{new_learner_peer, new_peer}, worker::dummy_scheduler, }; + use txn_types::WriteBatchFlags; use uuid::Uuid; use super::*; @@ -5110,6 +5162,7 @@ mod tests { true } AdminCmdType::BatchSplit => true, + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => true, _ => false, } } @@ -6516,4 +6569,94 @@ mod tests { }); res.unwrap_err(); } + + #[test] + fn flashback_need_to_be_applied() { + let (_path, engine) = create_tmp_engine("flashback_need_to_be_applied"); + let (_, importer) = create_tmp_importer("flashback_need_to_be_applied"); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(ApplyObserver::default())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = 
create_apply_batch_system(&cfg.value()); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "flashback_need_to_be_applied".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine: engine.clone(), + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("flashback_need_to_be_applied".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + reg.region.set_is_in_flashback(true); + router.schedule_task(1, Msg::Registration(reg)); + + let (capture_tx, capture_rx) = mpsc::channel(); + let mut region_state = RegionLocalState::default(); + region_state.mut_region().set_is_in_flashback(false); + let region_state_key = keys::region_state_key(1); + engine + .put_msg_cf(CF_RAFT, ®ion_state_key, ®ion_state) + .unwrap(); + // Check for not flashback request. + let mut cmd = AdminRequest::default(); + cmd.set_cmd_type(AdminCmdType::TransferLeader); + let mut flashback_req = EntryBuilder::new(1, 1).epoch(1, 3); + flashback_req.req.set_admin_request(cmd.clone()); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![flashback_req.build()], + vec![cb(1, 1, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_flashback_in_progress()); + // Check for flashback request. 
+ cmd.set_cmd_type(AdminCmdType::PrepareFlashback); + region_state.mut_region().set_is_in_flashback(false); + let mut flashback_req = EntryBuilder::new(2, 2).epoch(1, 3); + flashback_req.req.set_admin_request(cmd.clone()); + flashback_req + .req + .mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![flashback_req.build()], + vec![cb(2, 2, capture_tx)], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + rx.recv_timeout(Duration::from_millis(500)).unwrap(); + system.shutdown(); + } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 311258e72ff..d4a31561c63 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -21,7 +21,7 @@ use collections::{HashMap, HashSet}; use engine_traits::{Engines, KvEngine, RaftEngine, SstMetaInfo, WriteBatchExt, CF_LOCK, CF_RAFT}; use error_code::ErrorCodeExt; use fail::fail_point; -use futures::channel::{mpsc::UnboundedSender, oneshot::Sender}; +use futures::channel::mpsc::UnboundedSender; use keys::{self, enc_end_key, enc_start_key}; use kvproto::{ brpb::CheckAdminResponse, @@ -82,11 +82,10 @@ use crate::{ metrics::*, msg::{Callback, ExtCallback, InspectedRaftMessage}, peer::{ - ConsistencyState, FlashbackState, ForceLeaderState, Peer, PersistSnapshotResult, - SnapshotRecoveryState, SnapshotRecoveryWaitApplySyncer, StaleState, - UnsafeRecoveryExecutePlanSyncer, UnsafeRecoveryFillOutReportSyncer, - UnsafeRecoveryForceLeaderSyncer, UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, - TRANSFER_LEADER_COMMAND_REPLY_CTX, + ConsistencyState, ForceLeaderState, Peer, PersistSnapshotResult, SnapshotRecoveryState, + SnapshotRecoveryWaitApplySyncer, StaleState, UnsafeRecoveryExecutePlanSyncer, + UnsafeRecoveryFillOutReportSyncer, UnsafeRecoveryForceLeaderSyncer, + 
UnsafeRecoveryState, UnsafeRecoveryWaitApplySyncer, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, region_meta::RegionMeta, transport::Transport, @@ -987,38 +986,6 @@ where syncer.report_for_self(self_report); } - // Call msg PrepareFlashback to stop the scheduling and RW tasks. - // Once called, it will wait for the channel's notification in FlashbackState to - // finish. We place a flag in the request, which is checked when the - // pre_propose_raft_command is called. Stopping tasks is done by applying - // the flashback-only command in this way, But for RW local reads which need - // to be considered, we let the leader lease to None to ensure that local reads - // are not executed. - fn on_prepare_flashback(&mut self, ch: Sender) { - info!( - "prepare flashback"; - "region_id" => self.region().get_id(), - "peer_id" => self.fsm.peer.peer_id(), - ); - if self.fsm.peer.flashback_state.is_some() { - ch.send(false).unwrap(); - return; - } - self.fsm.peer.flashback_state = Some(FlashbackState::new(ch)); - // Let the leader lease to None to ensure that local reads are not executed. - self.fsm.peer.leader_lease_mut().expire_remote_lease(); - self.fsm.peer.maybe_finish_flashback_wait_apply(); - } - - fn on_finish_flashback(&mut self) { - info!( - "finish flashback"; - "region_id" => self.region().get_id(), - "peer_id" => self.fsm.peer.peer_id(), - ); - self.fsm.peer.flashback_state.take(); - } - fn on_check_pending_admin(&mut self, ch: UnboundedSender) { if !self.fsm.peer.is_leader() { // no need to check non-leader pending conf change. 
@@ -1464,9 +1431,6 @@ where SignificantMsg::UnsafeRecoveryFillOutReport(syncer) => { self.on_unsafe_recovery_fill_out_report(syncer) } - - SignificantMsg::PrepareFlashback(ch) => self.on_prepare_flashback(ch), - SignificantMsg::FinishFlashback => self.on_finish_flashback(), // for snapshot recovery (safe recovery) SignificantMsg::SnapshotRecoveryWaitApply(syncer) => { self.on_snapshot_recovery_wait_apply(syncer) @@ -2309,10 +2273,6 @@ where if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); } - // TODO: combine recovery state and flashback state as a wait apply queue. - if self.fsm.peer.flashback_state.is_some() { - self.fsm.peer.maybe_finish_flashback_wait_apply(); - } if self.fsm.peer.snapshot_recovery_state.is_some() { self.fsm @@ -4831,6 +4791,9 @@ where } ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), + ExecResult::SetFlashbackState { region } => { + self.on_set_flashback_state(region.get_is_in_flashback()) + } } } @@ -4938,7 +4901,7 @@ where let region_id = self.region_id(); // When in the flashback state, we should not allow any other request to be // proposed. - if self.fsm.peer.flashback_state.is_some() { + if self.fsm.peer.is_in_flashback { self.ctx.raft_metrics.invalid_proposal.flashback.inc(); let flags = WriteBatchFlags::from_bits_truncate(msg.get_header().get_flags()); if !flags.contains(WriteBatchFlags::FLASHBACK) { @@ -6193,6 +6156,13 @@ where self.fsm.has_ready = true; } + fn on_set_flashback_state(&mut self, is_in_flashback: bool) { + // Set flashback memory + self.fsm.peer.is_in_flashback = is_in_flashback; + // Let the leader lease to None to ensure that local reads are not executed. + self.fsm.peer.leader_lease_mut().expire_remote_lease(); + } + /// Verify and store the hash to state. return true means the hash has been /// stored successfully. // TODO: Consider context in the function. 
diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 93c691fb241..6851ebd30d8 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -7,7 +7,7 @@ use std::{borrow::Cow, fmt}; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; -use futures::channel::{mpsc::UnboundedSender, oneshot::Sender}; +use futures::channel::mpsc::UnboundedSender; use kvproto::{ brpb::CheckAdminResponse, import_sstpb::SstMeta, @@ -516,8 +516,6 @@ where UnsafeRecoveryWaitApply(UnsafeRecoveryWaitApplySyncer), UnsafeRecoveryFillOutReport(UnsafeRecoveryFillOutReportSyncer), SnapshotRecoveryWaitApply(SnapshotRecoveryWaitApplySyncer), - PrepareFlashback(Sender), - FinishFlashback, CheckPendingAdmin(UnboundedSender), } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 2d3fea79378..c95dda17c2c 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -24,7 +24,6 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use futures::channel::oneshot::Sender; use getset::{Getters, MutGetters}; use kvproto::{ errorpb, @@ -840,32 +839,6 @@ pub enum UnsafeRecoveryState { Destroy(UnsafeRecoveryExecutePlanSyncer), } -// This state is set by the peer fsm when invoke msg PrepareFlashback. Once set, -// it is checked every time this peer applies a new entry or a snapshot, -// if the latest committed index is met, the syncer will be called to notify the -// result. 
-#[derive(Debug)] -pub struct FlashbackState(Option>); - -impl FlashbackState { - pub fn new(ch: Sender) -> Self { - FlashbackState(Some(ch)) - } - - pub fn finish_wait_apply(&mut self) { - if self.0.is_none() { - return; - } - let ch = self.0.take().unwrap(); - match ch.send(true) { - Ok(_) => {} - Err(e) => { - error!("Fail to notify flashback state"; "err" => ?e); - } - } - } -} - #[derive(Getters, MutGetters)] pub struct Peer where @@ -1056,7 +1029,8 @@ where /// lead_transferee if the peer is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, - pub flashback_state: Option, + // Used as the memory state for Flashback to reject RW/Schedule before proposing. + pub is_in_flashback: bool, pub snapshot_recovery_state: Option, } @@ -1089,7 +1063,6 @@ where peer.get_id(), tag.clone(), )?; - let applied_index = ps.applied_index(); let raft_cfg = raft::Config { @@ -1192,7 +1165,7 @@ where last_region_buckets: None, lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, - flashback_state: None, + is_in_flashback: region.get_is_in_flashback(), snapshot_recovery_state: None, }; @@ -2555,10 +2528,6 @@ where debug!("unsafe recovery finishes applying a snapshot"); self.unsafe_recovery_maybe_finish_wait_apply(/* force= */ false); } - if self.flashback_state.is_some() { - debug!("flashback finishes applying a snapshot"); - self.maybe_finish_flashback_wait_apply(); - } if self.snapshot_recovery_state.is_some() { debug!("snapshot recovery finishes applying a snapshot"); self.snapshot_recovery_maybe_finish_wait_apply(false); @@ -3541,7 +3510,7 @@ where self.force_leader.is_some(), ) { None - } else if self.flashback_state.is_some() { + } else if self.is_in_flashback { debug!( "prevents renew lease while in flashback state"; "region_id" => self.region_id, @@ -5131,16 +5100,6 @@ where } } } - - pub fn maybe_finish_flashback_wait_apply(&mut self) { - let finished = - self.raft_group.raft.raft_log.applied == 
self.raft_group.raft.raft_log.last_index(); - if finished { - if let Some(flashback_state) = self.flashback_state.as_mut() { - flashback_state.finish_wait_apply(); - } - } - } } #[derive(Default, Debug)] @@ -5690,6 +5649,8 @@ pub fn get_sync_log_from_request(msg: &RaftCmdRequest) -> bool { | AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge | AdminCmdType::RollbackMerge + | AdminCmdType::PrepareFlashback + | AdminCmdType::FinishFlashback ); } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 067bb6f727e..42276c79ab6 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -29,7 +29,7 @@ use raft::{ use raft_proto::ConfChangeI; use tikv_util::{box_err, debug, info, store::region, time::monotonic_raw_now, Either}; use time::{Duration, Timespec}; -use txn_types::TimeStamp; +use txn_types::{TimeStamp, WriteBatchFlags}; use super::peer_storage; use crate::{coprocessor::CoprocessorHost, Error, Result}; @@ -192,6 +192,9 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::RollbackMerge => AdminCmdEpochState::new(true, true, true, false), // Transfer leader AdminCmdType::TransferLeader => AdminCmdEpochState::new(true, true, false, false), + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + AdminCmdEpochState::new(false, false, false, false) + } } } @@ -277,6 +280,25 @@ pub fn compare_region_epoch( Ok(()) } +pub fn check_flashback_state(req: &RaftCmdRequest, region: &metapb::Region) -> Result<()> { + // If admin flashback has not been applied but the region is already in a + // flashback state, the request is rejected + if region.get_is_in_flashback() { + let flags = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()); + if flags.contains(WriteBatchFlags::FLASHBACK) { + return Ok(()); + } + if req.has_admin_request() + && (req.get_admin_request().get_cmd_type() == AdminCmdType::PrepareFlashback + || 
req.get_admin_request().get_cmd_type() == AdminCmdType::FinishFlashback) + { + return Ok(()); + } + return Err(Error::FlashbackInProgress(region.get_id())); + } + Ok(()) +} + pub fn is_region_epoch_equal( from_epoch: &metapb::RegionEpoch, current_epoch: &metapb::RegionEpoch, diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index ef0f2246b7d..9b1f19bf21a 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -34,6 +34,7 @@ use kvproto::{ use pd_client::{BucketStat, PdClient}; use raft::eraftpb::ConfChangeType; use raftstore::{ + router::RaftStoreRouter, store::{ fsm::{ create_raft_batch_system, @@ -54,6 +55,7 @@ use tikv_util::{ worker::LazyWorker, HandyRwLock, }; +use txn_types::WriteBatchFlags; use super::*; use crate::Config; @@ -1419,26 +1421,49 @@ impl Cluster { .unwrap(); } - pub async fn call_and_wait_prepare_flashback(&mut self, region_id: u64, store_id: u64) { - let router = self.sim.rl().get_router(store_id).unwrap(); - let (tx, rx) = oneshot::channel(); - - router - .significant_send(region_id, SignificantMsg::PrepareFlashback(tx)) - .unwrap(); - - let prepared = rx.await.unwrap(); - if !prepared { - panic!("prepare flashback failed"); - } - } + pub async fn send_flashback_msg( + &mut self, + region_id: u64, + store_id: u64, + cmd_type: AdminCmdType, + epoch: metapb::RegionEpoch, + peer: metapb::Peer, + ) { + let (result_tx, result_rx) = oneshot::channel(); + let cb = Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx.send(false).unwrap(); + error!("send flashback msg failed"; "region_id" => region_id); + return; + } + result_tx.send(true).unwrap(); + })); + + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(epoch); + req.mut_header().set_peer(peer); + 
req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); - pub fn call_finish_flashback(&mut self, region_id: u64, store_id: u64) { let router = self.sim.rl().get_router(store_id).unwrap(); + if let Err(e) = router.send_command( + req, + cb, + RaftCmdExtraOpts { + deadline: None, + disk_full_opt: kvproto::kvrpcpb::DiskFullOpt::AllowedOnAlmostFull, + }, + ) { + panic!("router send failed, error{}", e); + } - router - .significant_send(region_id, SignificantMsg::FinishFlashback) - .unwrap(); + if !result_rx.await.unwrap() { + panic!("Flashback call msg failed"); + } } pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { diff --git a/src/server/metrics.rs b/src/server/metrics.rs index a73e79ec59b..3e07a75899f 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -35,6 +35,7 @@ make_auto_flush_static_metric! { kv_resolve_lock, kv_gc, kv_delete_range, + kv_prepare_flashback_to_version, kv_flashback_to_version, raw_get, raw_batch_get, diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 1beab4f0dc6..924236529d9 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -21,7 +21,10 @@ use kvproto::{ errorpb::{Error as RegionError, *}, kvrpcpb::*, mpp::*, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request as RaftRequest}, + raft_cmdpb::{ + AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftRequestHeader, + Request as RaftRequest, + }, raft_serverpb::*, tikvpb::*, }; @@ -32,7 +35,7 @@ use raftstore::{ store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, metrics::RAFT_ENTRIES_CACHES_GAUGE, - Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, SignificantMsg, + Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -44,8 +47,9 @@ use tikv_util::{ time::{duration_to_ms, duration_to_sec, Instant}, worker::Scheduler, }; +use 
tokio::sync::Mutex; use tracker::{set_tls_tracker_token, RequestInfo, RequestType, Tracker, GLOBAL_TRACKERS}; -use txn_types::{self, Key}; +use txn_types::{self, Key, WriteBatchFlags}; use super::batch::{BatcherBuilder, ReqBatcher}; use crate::{ @@ -401,6 +405,37 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ); } + fn kv_prepare_flashback_to_version( + &mut self, + ctx: RpcContext<'_>, + mut req: PrepareFlashbackToVersionRequest, + sink: UnarySink, + ) { + let begin_instant = Instant::now(); + + let source = req.mut_context().take_request_source(); + let resp = future_prepare_flashback_to_version(&self.storage, &self.ch, req); + let task = async move { + let resp = resp.await?; + let elapsed = begin_instant.saturating_elapsed(); + sink.success(resp).await?; + GRPC_MSG_HISTOGRAM_STATIC + .kv_prepare_flashback_to_version + .observe(elapsed.as_secs_f64()); + record_request_source_metrics(source, elapsed); + ServerResult::Ok(()) + } + .map_err(|e| { + log_net_error!(e, "kv rpc failed"; + "request" => stringify!($fn_name) + ); + GRPC_MSG_FAIL_COUNTER.kv_prepare_flashback_to_version.inc(); + }) + .map(|_| ()); + + ctx.spawn(task); + } + fn kv_flashback_to_version( &mut self, ctx: RpcContext<'_>, @@ -1375,7 +1410,6 @@ fn handle_batch_commands_request< response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::$metric_name, source); })* Some(batch_commands_request::request::Cmd::Import(_)) => unimplemented!(), - Some(batch_commands_request::request::Cmd::PrepareFlashbackToVersion(_)) => unimplemented!(), } } } @@ -1394,6 +1428,7 @@ fn handle_batch_commands_request< ResolveLock, future_resolve_lock(storage), kv_resolve_lock; Gc, future_gc(), kv_gc; DeleteRange, future_delete_range(storage), kv_delete_range; + PrepareFlashbackToVersion, future_prepare_flashback_to_version(storage, ch), kv_prepare_flashback_to_version; FlashbackToVersion, future_flashback_to_version(storage, ch), kv_flashback_to_version; RawBatchGet, 
future_raw_batch_get(storage), raw_batch_get; RawPut, future_raw_put(storage), raw_put; @@ -1687,6 +1722,27 @@ fn future_delete_range( } } +// Preparing the flashback for a region/key range will "lock" the region so that +// there is no any read, write or schedule operation could be proposed before +// the actual flashback operation. +fn future_prepare_flashback_to_version< + E: Engine, + L: LockManager, + F: KvFormat, + T: RaftStoreRouter + 'static, +>( + // Keep this param to hint the type of E for the compiler. + _storage: &Storage, + _raft_router: &T, + _req: PrepareFlashbackToVersionRequest, +) -> impl Future> { + // TODO: implement this. + async move { unimplemented!() } +} + +// Flashback the region to a specific point with the given `version`, please +// make sure the region is "locked" by `PrepareFlashbackToVersion` first, +// otherwise this request will fail. fn future_flashback_to_version< T: RaftStoreRouter + 'static, E: Engine, @@ -1698,24 +1754,26 @@ fn future_flashback_to_version< req: FlashbackToVersionRequest, ) -> impl Future> { let storage_clone = storage.clone(); - let raft_router_clone = raft_router.clone(); + let raft_router = Mutex::new(raft_router.clone()); async move { - // Send a `SignificantMsg::PrepareFlashback` to prepare the raftstore for the + // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the // later flashback. This will first block all scheduling, read and write - // operations and then wait for the latest Raft log to be applied before - // we start the flashback command. - let region_id = req.get_context().get_region_id(); - let (result_tx, result_rx) = oneshot::channel(); - raft_router_clone - .significant_send(region_id, SignificantMsg::PrepareFlashback(result_tx))?; - if !result_rx.await? 
{ - return Err(Error::Other(box_err!( - "failed to prepare the region {} for flashback", - region_id - ))); - } + // operations, then wait for the latest Raft log to be applied before we start + // the flashback command. Once invoked, we update the persistence state + // in `RegionLocalState` and region's meta, and when that + // admin cmd is applied, the `PrepareFlashback` command will update the memory + // state of the flashback, rejecting all read and write operations at + // propose and applied. We make FlashbackToVersion a two-stage request + // and lock the region in the first stage. + send_flashback_msg::( + &raft_router, + req.get_context(), + AdminCmdType::PrepareFlashback, + ) + .await?; + let (cb, f) = paired_future_callback(); - let res = storage_clone.sched_txn_command(req.into(), cb); + let res = storage_clone.sched_txn_command(req.clone().into(), cb); // Avoid crossing `.await` to bypass the `Send` constraint. drop(storage_clone); let v = match res { @@ -1725,9 +1783,17 @@ fn future_flashback_to_version< fail_point!("skip_finish_flashback_to_version", |_| { Ok(FlashbackToVersionResponse::default()) }); - // Send a `SignificantMsg::FinishFlashback` to notify the raftstore that the - // flashback has been finished. 
- raft_router_clone.significant_send(region_id, SignificantMsg::FinishFlashback)?; + // Send an `AdminCmdType::FinishFlashback` to unset the persistence state + // in `RegionLocalState` and region's meta, and when that + // admin cmd is applied, will update the memory + // state of the flashback + send_flashback_msg::( + &raft_router, + req.get_context(), + AdminCmdType::FinishFlashback, + ) + .await?; + let mut resp = FlashbackToVersionResponse::default(); if let Some(err) = extract_region_error(&v) { resp.set_region_error(err); @@ -2401,6 +2467,55 @@ fn needs_reject_raft_append(reject_messages_on_memory_ratio: f64) -> bool { false } +async fn send_flashback_msg + 'static, E: Engine>( + raft_router: &Mutex, + ctx: &Context, + cmd_type: AdminCmdType, +) -> ServerResult<()> { + let (result_tx, result_rx) = oneshot::channel(); + let cb = Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx.send(false).unwrap(); + error!("send flashback msg failed"; "error" => ?resp.response.get_header().get_error()); + return; + } + result_tx.send(true).unwrap(); + })); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(ctx.get_region_id()); + req.mut_header() + .set_region_epoch(ctx.get_region_epoch().clone()); + req.mut_header().set_peer(ctx.get_peer().clone()); + req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + // call admin request directly + let raft_router = raft_router.lock().await; + if let Err(e) = raft_router.send_command( + req, + cb, + RaftCmdExtraOpts { + deadline: None, + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + }, + ) { + return Err(Error::Other(box_err!( + "flashback router send failed, error {:?}", + e + ))); + } + if !result_rx.await? 
{ + return Err(Error::Other(box_err!( + "send flashback msg {:?} to region {} failed", + cmd_type, + ctx.get_region_id() + ))); + } + Ok(()) +} + #[cfg(test)] mod tests { use std::thread; diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 064edebf88a..5709cd22804 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -1,33 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::time::{Duration, Instant}; use futures::executor::block_on; use kvproto::metapb; use test_raftstore::*; +use tikv_util::time::InstantExt; use txn_types::WriteBatchFlags; -#[test] -fn test_flashback_for_applied_index() { - let mut cluster = new_node_cluster(0, 3); - cluster.run(); - - // write for cluster. - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - - // prepare for flashback - let region = cluster.get_region(b"k1"); - block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); - - let last_index = cluster - .raft_local_state(region.get_id(), 1) - .get_last_index(); - let appied_index = cluster.apply_state(region.get_id(), 1).get_applied_index(); - - assert_eq!(last_index, appied_index); -} - #[test] fn test_flashback_for_schedule() { let mut cluster = new_node_cluster(0, 3); @@ -36,16 +16,21 @@ fn test_flashback_for_schedule() { cluster.must_transfer_leader(1, new_peer(2, 2)); cluster.must_transfer_leader(1, new_peer(1, 1)); - // prepare for flashback + // Prepare for flashback let region = cluster.get_region(b"k1"); - block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + block_on(cluster.send_flashback_msg( + region.get_id(), + 1, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); - // verify the schedule is unabled. + // Verify the schedule is disabled.
let mut region = cluster.get_region(b"k3"); let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); - let mut transfer_leader = + let transfer_leader = new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); - transfer_leader.mut_header().set_peer(new_peer(1, 1)); let resp = cluster .call_command_on_leader(transfer_leader, Duration::from_secs(3)) .unwrap(); @@ -58,23 +43,17 @@ fn test_flashback_for_schedule() { } ); - // verify the schedule can be executed if add flashback flag in request's + // Verify the schedule can be executed if add flashback flag in request's // header. - let mut region = cluster.get_region(b"k3"); - let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); - let mut transfer_leader = - new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); - transfer_leader.mut_header().set_peer(new_peer(1, 1)); - transfer_leader - .mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster - .call_command_on_leader(transfer_leader, Duration::from_secs(5)) - .unwrap(); - assert!(!resp.get_header().has_error()); - - cluster.call_finish_flashback(region.get_id(), 1); - // transfer leader to (1, 1) + must_transfer_leader(&mut cluster, region.get_id(), new_peer(2, 2)); + block_on(cluster.send_flashback_msg( + region.get_id(), + 2, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + new_peer(2, 2), + )); + // Transfer leader to (1, 1) cluster.must_transfer_leader(1, new_peer(1, 1)); } @@ -82,16 +61,23 @@ fn test_flashback_for_schedule() { fn test_flashback_for_write() { let mut cluster = new_node_cluster(0, 3); cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); - // write for cluster + // Write for cluster let value = vec![1_u8; 8096]; multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - // prepare for flashback + // Prepare for flashback let region = cluster.get_region(b"k1"); - 
block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + block_on(cluster.send_flashback_msg( + region.get_id(), + 1, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); - // write will be blocked + // Write will be blocked let value = vec![1_u8; 8096]; must_get_error_flashback_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); @@ -101,7 +87,13 @@ fn test_flashback_for_write() { new_put_cmd(b"k1", &value), ); - cluster.call_finish_flashback(region.get_id(), 1); + block_on(cluster.send_flashback_msg( + region.get_id(), + 1, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); } @@ -110,21 +102,28 @@ fn test_flashback_for_write() { fn test_flashback_for_read() { let mut cluster = new_node_cluster(0, 3); cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); - // write for cluster + // Write for cluster let value = vec![1_u8; 8096]; multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); // read for cluster multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); - // prepare for flashback + // Prepare for flashback let region = cluster.get_region(b"k1"); - block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + block_on(cluster.send_flashback_msg( + region.get_id(), + 1, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); // read will be blocked must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); - // verify the read can be executed if add flashback flag in request's + // Verify the read can be executed if add flashback flag in request's // header. 
must_cmd_add_flashback_flag( &mut cluster, @@ -132,7 +131,13 @@ fn test_flashback_for_read() { new_get_cf_cmd("write", b"k1"), ); - cluster.call_finish_flashback(region.get_id(), 1); + block_on(cluster.send_flashback_msg( + region.get_id(), + 1, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); } @@ -157,7 +162,7 @@ fn test_flashback_for_local_read() { let region = cluster.get_region(b"k1"); cluster.must_transfer_leader(region.get_id(), peer.clone()); - // check local read before prepare flashback + // Check local read before prepare flashback let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. @@ -167,8 +172,20 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // prepare for flashback - block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), store_id)); + // Prepare for flashback + block_on(cluster.send_flashback_msg( + region.get_id(), + store_id, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(store_id, store_id), + )); + + // Check the leader does a local read. + let state = cluster.raft_local_state(region.get_id(), store_id); + assert_eq!(state.get_last_index(), last_index + 1); + // Wait for apply_res to set leader lease . 
+ sleep_ms(500); must_error_read_on_peer( &mut cluster, @@ -191,11 +208,20 @@ fn test_flashback_for_local_read() { // Also check read by propose was blocked let state = cluster.raft_local_state(region.get_id(), store_id); - assert_eq!(state.get_last_index(), last_index); + assert_eq!(state.get_last_index(), last_index + 1); + + block_on(cluster.send_flashback_msg( + region.get_id(), + store_id, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + new_peer(store_id, store_id), + )); - cluster.call_finish_flashback(region.get_id(), store_id); + let state = cluster.raft_local_state(region.get_id(), store_id); + assert_eq!(state.get_last_index(), last_index + 2); - // check local read after finish flashback + // Check local read after finish flashback let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. @@ -212,11 +238,17 @@ fn test_flashback_for_status_cmd_as_region_detail() { let mut cluster = new_node_cluster(0, 3); cluster.run(); + let leader = cluster.leader_of_region(1).unwrap(); let region = cluster.get_region(b"k1"); - block_on(cluster.call_and_wait_prepare_flashback(region.get_id(), 1)); + block_on(cluster.send_flashback_msg( + region.get_id(), + leader.get_store_id(), + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(leader.get_store_id(), leader.get_store_id()), + )); - let leader = cluster.leader_of_region(1).unwrap(); - let region_detail = cluster.region_detail(1, 1); + let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); assert!(region_detail.has_region()); let region = region_detail.get_region(); assert_eq!(region.get_id(), 1); @@ -231,6 +263,123 @@ fn test_flashback_for_status_cmd_as_region_detail() { assert_eq!(region_detail.get_leader(), &leader); } +#[test] +fn test_flashback_for_check_is_in_persist() { + let mut cluster = 
new_node_cluster(0, 3); + cluster.run(); + + let leader_peer = new_peer(2, 2); + cluster.must_transfer_leader(1, leader_peer.clone()); + + let local_state = cluster.region_local_state(1, 2); + assert!(!local_state.get_region().get_is_in_flashback()); + + // Prepare for flashback + block_on(cluster.send_flashback_msg( + 1, + 2, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + leader_peer.clone(), + )); + let local_state = cluster.region_local_state(1, 2); + assert!(local_state.get_region().get_is_in_flashback()); + + block_on(cluster.send_flashback_msg( + 1, + 2, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + leader_peer, + )); + let local_state = cluster.region_local_state(1, 2); + assert!(!local_state.get_region().get_is_in_flashback()); +} + +#[test] +fn test_flashback_for_apply_snapshot() { + let mut cluster = new_node_cluster(0, 5); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + // Make node3 isolationed + cluster.add_send_filter(IsolationFilterFactory::new(5)); + + let local_state = cluster.region_local_state(1, 1); + assert!(!local_state.get_region().get_is_in_flashback()); + + // Write for cluster + let value = vec![1_u8; 8096]; + multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + + // Prepare for flashback + block_on(cluster.send_flashback_msg( + 1, + 1, + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + cluster.get_region_epoch(1), + new_peer(1, 1), + )); + let local_state = cluster.region_local_state(1, 1); + assert!(local_state.get_region().get_is_in_flashback()); + + // Add node 3 back. 
+ cluster.clear_send_filters(); + // Wait for snapshot + sleep_ms(500); + + must_transfer_leader(&mut cluster, 1, new_peer(5, 5)); + let local_state = cluster.region_local_state(1, 5); + assert!(local_state.get_region().get_is_in_flashback()); + + block_on(cluster.send_flashback_msg( + 1, + 5, + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + cluster.get_region_epoch(1), + new_peer(5, 5), + )); +} + +fn transfer_leader(cluster: &mut Cluster, region_id: u64, leader: metapb::Peer) { + let epoch = cluster.get_region_epoch(region_id); + let admin_req = new_transfer_leader_cmd(leader); + let mut transfer_leader = new_admin_request(region_id, &epoch, admin_req); + transfer_leader + .mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let resp = cluster + .call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap(); + assert!(!resp.get_header().has_error()); +} + +fn must_transfer_leader( + cluster: &mut Cluster, + region_id: u64, + leader: metapb::Peer, +) { + let timer = Instant::now(); + loop { + cluster.reset_leader_of_region(region_id); + let cur_leader = cluster.leader_of_region(region_id); + if let Some(ref cur_leader) = cur_leader { + if cur_leader.get_id() == leader.get_id() + && cur_leader.get_store_id() == leader.get_store_id() + { + return; + } + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "failed to transfer leader to [{}] {:?}, current leader: {:?}", + region_id, leader, cur_leader + ); + } + transfer_leader(cluster, region_id, leader.clone()); + } +} + fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb::Request) { for _ in 0..100 { let mut reqs = vec![]; @@ -246,7 +395,7 @@ fn must_cmd_add_flashback_flag( region: &mut metapb::Region, cmd: kvproto::raft_cmdpb::Request, ) { - // verify the read can be executed if add flashback flag in request's + // Verify the read can be executed if add flashback flag in request's // header. 
let mut req = new_request( region.get_id(), From 066a4222da1689e94b7cf78ed3f3131166c9a524 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 12 Oct 2022 01:49:50 -0700 Subject: [PATCH 264/676] channel: early break when fetch nothing (#13516) close tikv/tikv#13394 Otherwise it will waste CPU on loop. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/tikv_util/src/mpsc/future.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index c38dc8c1492..1e9f94c2f2d 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -213,6 +213,8 @@ where for _ in 1..ctx.max_batch_size { if let Poll::Ready(Some(m)) = ctx.rx.poll_next_unpin(cx) { (ctx.collector)(&mut collector, m); + } else { + break; } } Poll::Ready(Some(collector)) From 2a9888381bdf5384f4b097cd670bb1192496c2b4 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Thu, 13 Oct 2022 12:15:51 +0800 Subject: [PATCH 265/676] storage/lock_manager: Avoid stale entries in the new lock waiting queue (#13584) ref tikv/tikv#13298 Avoid stale entries in the new lock waiting queue. This is done by making use of another implementation of the priority queue (instead of the std BinaryHeap) that supports efficiently removing element by key. So that when a lock-waiting request is canceled, the entry can be removed from the queue immediately, instead of waiting for the lazy-cleaning-up. 
Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot Co-authored-by: Yilin Chen --- Cargo.lock | 11 ++ Cargo.toml | 2 + src/storage/lock_manager/lock_wait_context.rs | 159 ++++++++------- .../lock_manager/lock_waiting_queue.rs | 183 +++++++++++++----- src/storage/metrics.rs | 2 +- 5 files changed, 232 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cee27c1494d..bf3536544fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2648,6 +2648,15 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "keyed_priority_queue" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d63b6407b66fc81fc539dccf3ddecb669f393c5101b6a2be3976c95099a06e8" +dependencies = [ + "indexmap", +] + [[package]] name = "keys" version = "0.1.0" @@ -6180,6 +6189,7 @@ dependencies = [ "futures-executor", "futures-timer", "futures-util", + "fxhash", "getset", "grpcio", "grpcio-health", @@ -6190,6 +6200,7 @@ dependencies = [ "hyper-tls", "into_other", "itertools", + "keyed_priority_queue", "keys", "kvproto", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 545ee9380a7..c38b98631c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -95,6 +95,7 @@ futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } +fxhash = "0.2.1" getset = "0.1" grpcio = { version = "0.10.3", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } @@ -104,6 +105,7 @@ hyper = { version = "0.14", features = ["full"] } hyper-tls = "0.5" into_other = { path = "components/into_other", default-features = false } itertools = "0.10" +keyed_priority_queue = "0.4" keys = { path = "components/keys", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } 
lazy_static = "1.3" diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 97ff49f965b..46ed24fde70 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -14,21 +14,17 @@ //! Note: The corresponding implementation in `WaiterManager` is not yet //! implemented, and this mod is currently not used yet. -use std::{ - convert::TryInto, - result::Result, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, -}; +use std::{convert::TryInto, result::Result, sync::Arc}; use parking_lot::Mutex; -use txn_types::TimeStamp; +use txn_types::Key; use crate::storage::{ errors::SharedError, - lock_manager::{lock_waiting_queue::PessimisticLockKeyCallback, LockManager, LockWaitToken}, + lock_manager::{ + lock_waiting_queue::{LockWaitQueues, PessimisticLockKeyCallback}, + LockManager, LockWaitToken, + }, Error as StorageError, PessimisticLockRes, ProcessResult, StorageCallback, }; @@ -37,10 +33,6 @@ pub struct LockWaitContextInner { /// Usually, requests are accepted from RPC, and in this case calling /// the callback means returning the response to the client via RPC. cb: StorageCallback, - - /// The token of the corresponding waiter in `LockManager`. - #[allow(dead_code)] - lock_wait_token: LockWaitToken, } /// The content of the `LockWaitContext` that needs to be shared among all @@ -54,53 +46,41 @@ pub struct LockWaitContextInner { /// and the request is going to be finished, they need to take the /// [`LockWaitContextInner`] to call the callback. /// * The [`LockWaitEntry`](crate::storage::lock_manager::lock_waiting_queue::LockWaitEntry), for -/// checking whether the request is already finished (cancelled). +/// providing information pub struct LockWaitContextSharedState { ctx_inner: Mutex>, - pub finished: AtomicBool, -} -impl LockWaitContextSharedState { - /// Checks whether the lock-waiting request is already finished. 
- pub fn is_finished(&self) -> bool { - self.finished.load(Ordering::Acquire) - } + /// The token to identify the waiter. + lock_wait_token: LockWaitToken, + + /// The key on which lock waiting occurs. + key: Key, } #[derive(Clone)] pub struct LockWaitContext { shared_states: Arc, - #[allow(dead_code)] - lock_manager: L, + lock_wait_queues: LockWaitQueues, allow_lock_with_conflict: bool, - - // Fields for logging: - start_ts: TimeStamp, - for_update_ts: TimeStamp, } impl LockWaitContext { pub fn new( - lock_manager: L, + key: Key, + lock_wait_queues: LockWaitQueues, lock_wait_token: LockWaitToken, - start_ts: TimeStamp, - for_update_ts: TimeStamp, cb: StorageCallback, allow_lock_with_conflict: bool, ) -> Self { - let inner = LockWaitContextInner { - cb, - lock_wait_token, - }; + let inner = LockWaitContextInner { cb }; Self { shared_states: Arc::new(LockWaitContextSharedState { ctx_inner: Mutex::new(Some(inner)), - finished: AtomicBool::new(false), + key, + lock_wait_token, }), - lock_manager, + lock_wait_queues, allow_lock_with_conflict, - start_ts, - for_update_ts, } } @@ -128,7 +108,7 @@ impl LockWaitContext { pub fn get_callback_for_blocked_key(&self) -> PessimisticLockKeyCallback { let ctx = self.clone(); Box::new(move |res| { - ctx.finish_request(res); + ctx.finish_request(res, false); }) } @@ -136,29 +116,38 @@ impl LockWaitContext { /// called by /// [`WaiterManager`](crate::server::lock_manager::WaiterManager) due to /// timeout. + /// + /// This function is assumed to be called when the lock-waiting request is + /// queueing but canceled outside, so it includes an operation to actively + /// remove the entry from the lock waiting queue. 
pub fn get_callback_for_cancellation(&self) -> impl FnOnce(StorageError) { let ctx = self.clone(); move |e| { - ctx.finish_request(Err(e.into())); + ctx.finish_request(Err(e.into()), true); } } - fn finish_request(&self, result: Result) { - let ctx_inner = if let Some(inner) = self.shared_states.ctx_inner.lock().take() { - inner + fn finish_request(&self, result: Result, is_canceling: bool) { + if is_canceling { + let entry = self + .lock_wait_queues + .remove_by_token(&self.shared_states.key, self.shared_states.lock_wait_token); + if entry.is_none() { + // Already popped out from the queue so that it will be woken up normally. Do + // nothing. + return; + } } else { - debug!("double invoking of finish_request of LockWaitContext"; - "start_ts" => self.start_ts, - "for_update_ts" => self.for_update_ts - ); - return; - }; - - self.shared_states.finished.store(true, Ordering::Release); + // TODO: Uncomment this after the corresponding change of + // `LockManager` is done. self.lock_wait_queues. + // get_lock_mgr() .remove_lock_wait(ctx_inner. + // lock_wait_token); + } - // TODO: Uncomment this after the corresponding change of `LockManager` is done. - // self.lock_manager - // .remove_lock_wait(ctx_inner.lock_wait_token); + // When this is executed, the waiter is either woken up from the queue or + // canceled and removed from the queue. There should be no chance to try + // to take the `ctx_inner` more than once. + let ctx_inner = self.shared_states.ctx_inner.lock().take().unwrap(); if !self.allow_lock_with_conflict { // The result must be an owned error. 
@@ -176,15 +165,17 @@ impl LockWaitContext { #[cfg(test)] mod tests { use std::{ + default::Default, sync::mpsc::{channel, Receiver}, time::Duration, }; use super::*; use crate::storage::{ - lock_manager::DummyLockManager, + lock_manager::{lock_waiting_queue::LockWaitEntry, DummyLockManager}, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, + types::PessimisticLockParameters, ErrorInner as StorageErrorInner, Result as StorageResult, }; @@ -197,23 +188,18 @@ mod tests { (cb, rx) } - fn create_test_lock_wait_ctx() -> ( + fn create_test_lock_wait_ctx( + key: &Key, + lock_wait_queues: &LockWaitQueues, + ) -> ( + LockWaitToken, LockWaitContext, Receiver>>, ) { - // TODO: Use `ProxyLockMgr` to check the correctness of the `remove_lock_wait` - // invocation. - let lock_mgr = DummyLockManager {}; let (cb, rx) = create_storage_cb(); - let ctx = LockWaitContext::new( - lock_mgr, - super::super::LockWaitToken(Some(1)), - 1.into(), - 1.into(), - cb, - false, - ); - (ctx, rx) + let token = LockWaitToken(Some(1)); + let ctx = LockWaitContext::new(key.clone(), lock_wait_queues.clone(), token, cb, false); + (token, ctx, rx) } #[test] @@ -236,7 +222,13 @@ mod tests { )))) }; - let (ctx, rx) = create_test_lock_wait_ctx(); + let key = Key::from_raw(b"k"); + + // TODO: Use `ProxyLockMgr` to check the correctness of the `remove_lock_wait` + // invocation. + let lock_wait_queues = LockWaitQueues::new(DummyLockManager {}); + + let (_, ctx, rx) = create_test_lock_wait_ctx(&key, &lock_wait_queues); // Nothing happens currently. (ctx.get_callback_for_first_write_batch()).execute(ProcessResult::Res); rx.recv_timeout(Duration::from_millis(20)).unwrap_err(); @@ -253,8 +245,27 @@ mod tests { // Nothing happens if the callback is double-called. 
(ctx.get_callback_for_cancellation())(StorageError::from(key_is_locked())); - let (ctx, rx) = create_test_lock_wait_ctx(); + let (token, ctx, rx) = create_test_lock_wait_ctx(&key, &lock_wait_queues); + // Add a corresponding entry to the lock waiting queue to test actively removing + // the entry from the queue. + lock_wait_queues.push_lock_wait( + Box::new(LockWaitEntry { + key: key.clone(), + lock_hash: key.gen_hash(), + parameters: PessimisticLockParameters { + start_ts: 1.into(), + for_update_ts: 1.into(), + ..Default::default() + }, + lock_wait_token: token, + legacy_wake_up_index: None, + key_cb: None, + }), + kvproto::kvrpcpb::LockInfo::default(), + ); + lock_wait_queues.must_have_next_entry(b"k", 1); (ctx.get_callback_for_cancellation())(StorageError::from(key_is_locked())); + lock_wait_queues.must_not_contain_key(b"k"); let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, @@ -262,8 +273,10 @@ mod tests { box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::KeyIsLocked(_))) ))) )); - // Nothing happens if the callback is double-called. - (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict()))); + // Since the cancellation callback can fully execute only when it's successfully + // removed from the lock waiting queues, it's impossible that `finish_request` + // is called again after that. + // The tx should be dropped. rx.recv().unwrap_err(); } diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index c1f2e800834..3651ce21c1c 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -55,7 +55,6 @@ //! for executing the future in a suitable place. 
use std::{ - collections::BinaryHeap, future::Future, pin::Pin, result::Result, @@ -68,6 +67,7 @@ use std::{ use dashmap; use futures_util::compat::Future01CompatExt; +use keyed_priority_queue::KeyedPriorityQueue; use kvproto::kvrpcpb; use smallvec::SmallVec; use sync_wrapper::SyncWrapper; @@ -76,7 +76,7 @@ use txn_types::{Key, TimeStamp}; use crate::storage::{ errors::SharedError, - lock_manager::{lock_wait_context::LockWaitContextSharedState, LockManager, LockWaitToken}, + lock_manager::{LockManager, LockWaitToken}, metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::Error as TxnError, @@ -92,11 +92,8 @@ pub type PessimisticLockKeyCallback = CallbackWithSharedError, pub parameters: PessimisticLockParameters, pub lock_wait_token: LockWaitToken, - pub req_states: Option>, pub legacy_wake_up_index: Option, pub key_cb: Option>, } @@ -111,7 +108,7 @@ impl Eq for LockWaitEntry {} impl PartialOrd for LockWaitEntry { fn partial_cmp(&self, other: &Self) -> Option { - // Reverse it since the std BinaryHeap is max heap and we want to pop the + // Reverse it since the priority queue is a max heap and we want to pop the // minimal. other .parameters @@ -122,7 +119,7 @@ impl PartialOrd for LockWaitEntry { impl Ord for LockWaitEntry { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - // Reverse it since the std BinaryHeap is max heap and we want to pop the + // Reverse it since the priority queue is a max heap and we want to pop the // minimal. other.parameters.start_ts.cmp(&self.parameters.start_ts) } @@ -185,7 +182,11 @@ pub struct KeyLockWaitState { /// return it from a [`DelayedNotifyAllFuture`]. See /// [`LockWaitQueues::pop_for_waking_up`]. legacy_wake_up_index: usize, - queue: BinaryHeap>, + queue: KeyedPriorityQueue< + LockWaitToken, + Box, + std::hash::BuildHasherDefault, + >, /// The start_ts of the most recent waking up event. 
last_conflict_start_ts: TimeStamp, @@ -201,7 +202,7 @@ impl KeyLockWaitState { Self { current_lock: kvrpcpb::LockInfo::default(), legacy_wake_up_index: 0, - queue: BinaryHeap::new(), + queue: KeyedPriorityQueue::default(), last_conflict_start_ts: TimeStamp::zero(), last_conflict_commit_ts: TimeStamp::zero(), delayed_notify_all_state: None, @@ -219,7 +220,6 @@ pub struct LockWaitQueueInner { #[derive(Clone)] pub struct LockWaitQueues { inner: Arc, - #[allow(dead_code)] lock_mgr: L, } @@ -256,7 +256,10 @@ impl LockWaitQueues { if lock_wait_entry.legacy_wake_up_index.is_none() { lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); } - key_state.value_mut().queue.push(lock_wait_entry); + key_state + .value_mut() + .queue + .push(lock_wait_entry.lock_wait_token, lock_wait_entry); let len = key_state.value_mut().queue.len(); drop(key_state); @@ -312,14 +315,9 @@ impl LockWaitQueues { v.last_conflict_start_ts = conflicting_start_ts; v.last_conflict_commit_ts = conflicting_commit_ts; - while let Some(lock_wait_entry) = v.queue.pop() { + if let Some((_, lock_wait_entry)) = v.queue.pop() { removed_waiters += 1; - if lock_wait_entry.req_states.as_ref().unwrap().is_finished() { - // Skip already cancelled entries. - continue; - } - if !lock_wait_entry.parameters.allow_lock_with_conflict { // If a pessimistic lock request in legacy mode is woken up, increase the // counter. @@ -334,7 +332,6 @@ impl LockWaitQueues { } else { result = Some((lock_wait_entry, None)); } - break; } // Remove the queue if it's emptied. @@ -464,13 +461,7 @@ impl LockWaitQueues { let legacy_wake_up_index = v.legacy_wake_up_index; - while let Some(front) = v.queue.peek() { - if front.req_states.as_ref().unwrap().is_finished() { - // Skip already cancelled entries. 
- v.queue.pop(); - removed_waiters += 1; - continue; - } + while let Some((_, front)) = v.queue.peek() { if front .legacy_wake_up_index .map_or(false, |idx| idx >= legacy_wake_up_index) @@ -479,7 +470,7 @@ impl LockWaitQueues { // delayed_notify_all operation. Keep it and other remaining items in the queue. break; } - let lock_wait_entry = v.queue.pop().unwrap(); + let (_, lock_wait_entry) = v.queue.pop().unwrap(); removed_waiters += 1; if lock_wait_entry.parameters.allow_lock_with_conflict { woken_up_resumable_entry = Some(lock_wait_entry); @@ -523,6 +514,63 @@ impl LockWaitQueues { // Return the item to be woken up in resumable way. woken_up_resumable_entry } + + /// Finds a specific LockWaitEntry by key and token, and removes it from the + /// queue. No extra operation will be performed on the removed entry. + /// The caller is responsible for finishing or cancelling the request to + /// let it return the response to the client. + pub fn remove_by_token( + &self, + key: &Key, + lock_wait_token: LockWaitToken, + ) -> Option> { + let mut result = None; + + // We don't want other threads insert any more entries between finding the + // queue is empty and removing the queue from the map. Wrap the logic + // within a call to `remove_if_mut` to avoid releasing lock during the + // procedure. 
+ let removed_key = self.inner.queue_map.remove_if_mut(key, |_, v| { + if let Some(res) = v.queue.remove(&lock_wait_token) { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.waiters.dec(); + result = Some(res); + } + v.queue.is_empty() + }); + + if removed_key.is_some() { + LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); + } + + result + } + + #[allow(dead_code)] + pub(super) fn get_lock_mgr(&self) -> &L { + &self.lock_mgr + } + + #[cfg(test)] + pub fn must_not_contain_key(&self, key: &[u8]) { + assert!(self.inner.queue_map.get(&Key::from_raw(key)).is_none()); + } + + #[cfg(test)] + pub fn must_have_next_entry(&self, key: &[u8], start_ts: impl Into) { + assert_eq!( + self.inner + .queue_map + .get(&Key::from_raw(key)) + .unwrap() + .queue + .peek() + .unwrap() + .1 + .parameters + .start_ts, + start_ts.into() + ); + } } #[cfg(test)] @@ -540,6 +588,7 @@ mod tests { }; struct TestLockWaitEntryHandle { + token: LockWaitToken, wake_up_rx: Receiver>, cancel_cb: Box, } @@ -573,7 +622,7 @@ mod tests { // Additionally add some helper functions to the LockWaitQueues for simplifying // test code. 
impl LockWaitQueues { - fn make_lock_info_pb(&self, key: &[u8], ts: impl Into) -> kvrpcpb::LockInfo { + pub fn make_lock_info_pb(&self, key: &[u8], ts: impl Into) -> kvrpcpb::LockInfo { let ts = ts.into(); let mut lock_info = kvrpcpb::LockInfo::default(); lock_info.set_lock_version(ts.into_inner()); @@ -590,13 +639,12 @@ mod tests { lock_info_pb: kvrpcpb::LockInfo, ) -> (Box, TestLockWaitEntryHandle) { let start_ts = start_ts.into(); - let token = super::super::LockWaitToken(Some(1)); + let token = LockWaitToken(Some(self.allocate_internal_id())); let dummy_request_cb = StorageCallback::PessimisticLock(Box::new(|_| ())); let dummy_ctx = LockWaitContext::new( - self.lock_mgr.clone(), + Key::from_raw(key), + self.clone(), token, - start_ts, - start_ts, dummy_request_cb, false, ); @@ -623,7 +671,6 @@ mod tests { lock_hash, parameters, lock_wait_token: token, - req_states: Some(dummy_ctx.get_shared_states().clone()), legacy_wake_up_index: None, key_cb: Some(SyncWrapper::new(Box::new(move |res| tx.send(res).unwrap()))), }); @@ -638,6 +685,7 @@ mod tests { ( lock_wait_entry, TestLockWaitEntryHandle { + token, wake_up_rx: rx, cancel_cb: Box::new(cancel), }, @@ -730,25 +778,6 @@ mod tests { res } - fn must_not_contain_key(&self, key: &[u8]) { - assert!(self.inner.queue_map.get(&Key::from_raw(key)).is_none()); - } - - fn must_have_next_entry(&self, key: &[u8], start_ts: impl Into) { - assert_eq!( - self.inner - .queue_map - .get(&Key::from_raw(key)) - .unwrap() - .queue - .peek() - .unwrap() - .parameters - .start_ts, - start_ts.into() - ); - } - fn get_delayed_notify_id(&self, key: &[u8]) -> Option { self.inner .queue_map @@ -758,6 +787,13 @@ mod tests { .as_ref() .map(|(id, ..)| *id) } + + fn get_queue_length_of_key(&self, key: &[u8]) -> usize { + self.inner + .queue_map + .get(&Key::from_raw(key)) + .map_or(0, |v| v.queue.len()) + } } impl LockWaitEntry { @@ -836,6 +872,47 @@ mod tests { queues.must_not_contain_key(b"k1"); } + #[test] + fn test_removing_by_token() { + 
let queues = LockWaitQueues::new(DummyLockManager {}); + + queues.mock_lock_wait(b"k1", 10, 5, false); + let token11 = queues.mock_lock_wait(b"k1", 11, 5, false).token; + queues.mock_lock_wait(b"k1", 12, 5, false); + let token13 = queues.mock_lock_wait(b"k1", 13, 5, false).token; + queues.mock_lock_wait(b"k1", 14, 5, false); + assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + + queues + .remove_by_token(&Key::from_raw(b"k1"), token11) + .unwrap() + .check_key(b"k1") + .check_start_ts(11); + queues + .remove_by_token(&Key::from_raw(b"k1"), token13) + .unwrap() + .check_key(b"k1") + .check_start_ts(13); + assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + + // Removing not-existing entry takes no effect. + assert!( + queues + .remove_by_token(&Key::from_raw(b"k1"), token11) + .is_none() + ); + assert!( + queues + .remove_by_token(&Key::from_raw(b"k2"), token11) + .is_none() + ); + assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + + queues.must_pop(b"k1", 5, 6).check_start_ts(10); + queues.must_pop(b"k1", 5, 6).check_start_ts(12); + queues.must_pop(b"k1", 5, 6).check_start_ts(14); + } + #[test] fn test_dropping_cancelled_entries() { let queues = LockWaitQueues::new(DummyLockManager {}); @@ -846,10 +923,14 @@ mod tests { let h13 = queues.mock_lock_wait(b"k1", 13, 5, false); queues.mock_lock_wait(b"k1", 14, 5, false); + assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + h10.cancel(); h11.cancel(); h13.cancel(); + assert_eq!(queues.get_queue_length_of_key(b"k1"), 2); + for &expected_start_ts in &[12u64, 14] { queues .must_pop(b"k1", 5, 6) diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index b74c5b7d51f..2bbe4b7b762 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -595,7 +595,7 @@ lazy_static! 
{ pub static ref LOCK_WAIT_QUEUE_LENGTH_HISTOGRAM: Histogram = register_histogram!( "tikv_lock_wait_queue_length", - "Statistics of length of queues counted when enqueueing", + "Statistics of length of queues counted when enqueueing", exponential_buckets(1.0, 2.0, 16).unwrap() ) .unwrap(); From efc84fc2e346fffef3f21d5e15696e217068abcd Mon Sep 17 00:00:00 2001 From: lizhenhuan <1916038084@qq.com> Date: Thu, 13 Oct 2022 13:19:50 +0800 Subject: [PATCH 266/676] Json contains push tikv (#13469) close tikv/tikv#13468 Signed-off-by: lizhenhuan <1916038084@qq.com> Signed-off-by: 3pointer Signed-off-by: Leavrth Signed-off-by: CalvinNeo Co-authored-by: 3pointer Co-authored-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Co-authored-by: Calvin Neo Co-authored-by: Yilin Chen Co-authored-by: Ti Chi Robot --- .../src/codec/mysql/json/binary.rs | 4 +- .../src/codec/mysql/json/json_contains.rs | 106 +++++ .../src/codec/mysql/json/mod.rs | 1 + components/tidb_query_expr/src/impl_json.rs | 383 ++++++++++++++++++ components/tidb_query_expr/src/lib.rs | 1 + 5 files changed, 493 insertions(+), 2 deletions(-) create mode 100644 components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index 12f8fbd5129..daeae751fb5 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -17,7 +17,7 @@ impl<'a> JsonRef<'a> { /// Return the `i`th key in current Object json /// - /// See `arrayGetElem()` in TiDB `json/binary.go` + /// See `objectGetKey()` in TiDB `types/json_binary.go` pub fn object_get_key(&self, i: usize) -> &'a [u8] { let key_off_start = HEADER_LEN + i * KEY_ENTRY_LEN; let key_off = NumberCodec::decode_u32_le(&self.value()[key_off_start..]) as usize; @@ -28,7 +28,7 @@ impl<'a> JsonRef<'a> { /// Returns the JsonRef of `i`th value in 
current Object json /// - /// See `arrayGetElem()` in TiDB `json/binary.go` + /// See `objectGetVal()` in TiDB `types/json_binary.go` pub fn object_get_val(&self, i: usize) -> Result> { let ele_count = self.get_elem_count(); let val_entry_off = HEADER_LEN + ele_count * KEY_ENTRY_LEN + i * VALUE_ENTRY_LEN; diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs new file mode 100644 index 00000000000..46de1af9e0b --- /dev/null +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_contains.rs @@ -0,0 +1,106 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::cmp::Ordering; + +use super::{super::Result, JsonRef, JsonType}; + +impl<'a> JsonRef<'a> { + /// `json_contains` is the implementation for JSON_CONTAINS in mysql + /// + /// See `ContainsBinaryJSON()` in TiDB `types/json_binary_functions.go` + pub fn json_contains(&self, target: JsonRef<'_>) -> Result { + match self.type_code { + JsonType::Object => { + if target.type_code == JsonType::Object { + let elem_count = target.get_elem_count(); + for i in 0..elem_count { + let key = target.object_get_key(i); + let val = target.object_get_val(i)?; + let idx = self.object_search_key(key); + match idx { + None => { + return Ok(false); + } + Some(idx) => { + let exp = self.object_get_val(idx)?; + if !(exp.json_contains(val)?) { + return Ok(false); + } + } + } + } + return Ok(true); + } + } + JsonType::Array => { + if target.type_code == JsonType::Array { + let elem_count = target.get_elem_count(); + for i in 0..elem_count { + if !(self.json_contains(target.array_get_elem(i)?)?) { + return Ok(false); + } + } + return Ok(true); + } + let elem_count = self.get_elem_count(); + for i in 0..elem_count { + if self.array_get_elem(i)?.json_contains(target)? 
{ + return Ok(true); + } + } + } + _ => { + return match self.partial_cmp(&target).unwrap() { + Ordering::Equal => Ok(true), + _ => Ok(false), + }; + } + }; + Ok(false) + } +} + +#[cfg(test)] +mod tests { + use super::super::Json; + #[test] + fn test_json_contains() { + let mut test_cases = vec![ + (r#"{"a":{"a":1},"b":2}"#, r#"{"b":2}"#, true), + (r#"{}"#, r#"{}"#, true), + (r#"{"a":1}"#, r#"{}"#, true), + (r#"{"a":1}"#, r#"1"#, false), + (r#"{"a":[1]}"#, r#"[1]"#, false), + (r#"{"b":2, "c":3}"#, r#"{"c":3}"#, true), + (r#"1"#, r#"1"#, true), + (r#"[1]"#, r#"1"#, true), + (r#"[1,2]"#, r#"[1]"#, true), + (r#"[1,2]"#, r#"[1,3]"#, false), + (r#"[1,2]"#, r#"["1"]"#, false), + (r#"[1,2,[1,3]]"#, r#"[1,3]"#, true), + (r#"[1,2,[1,[5,[3]]]]"#, r#"[1,3]"#, true), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), + (r#"[{"a":1}]"#, r#"{"a":1}"#, true), + (r#"[{"a":1,"b":2}]"#, r#"{"a":1}"#, true), + (r#"[{"a":{"a":1},"b":2}]"#, r#"{"a":1}"#, false), + (r#"{"a":{"a":1},"b":2}"#, r#"{"b":3}"#, false), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[1,{"a":[3]}]"#, true), + (r#"[1,2,[1,[5,{"a":[2,3]}]]]"#, r#"[10,{"a":[3]}]"#, false), + ]; + for (i, (js, value, expected)) in test_cases.drain(..).enumerate() { + let j = js.parse(); + assert!(j.is_ok(), "#{} expect parse ok but got {:?}", i, j); + let j: Json = j.unwrap(); + let value = value.parse(); + assert!(value.is_ok(), "#{} expect parse ok but got {:?}", i, j); + let value: Json = value.unwrap(); + + let got = j.as_ref().json_contains(value.as_ref()).unwrap(); + assert_eq!( + got, expected, + "#{} expect {:?}, but got {:?}", + i, expected, got + ); + } + } +} diff --git a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs index f21f789c0d0..0cd382f6d65 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/mod.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/mod.rs @@ -65,6 +65,7 @@ mod modifier; mod path_expr; mod serde; // json 
functions +mod json_contains; mod json_depth; mod json_extract; mod json_keys; diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 60f784dc604..1926cc648e0 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -290,6 +290,53 @@ fn json_length(args: &[ScalarValueRef]) -> Result> { }) } +// Args should be like `(Option , Option, +// &[Option])`. or `(Option , Option)` +fn json_contains_validator(expr: &tipb::Expr) -> Result<()> { + assert!(expr.get_children().len() == 2 || expr.get_children().len() == 3); + let children = expr.get_children(); + super::function::validate_expr_return_type(&children[0], EvalType::Json)?; + super::function::validate_expr_return_type(&children[1], EvalType::Json)?; + if expr.get_children().len() == 3 { + super::function::validate_expr_return_type(&children[2], EvalType::Bytes)?; + } + Ok(()) +} + +#[rpn_fn(nullable, raw_varg,min_args= 2, max_args = 3, extra_validator = json_contains_validator)] +#[inline] +fn json_contains(args: &[ScalarValueRef]) -> Result> { + assert!(args.len() == 2 || args.len() == 3); + let j: Option = args[0].as_json(); + let mut j = match j { + None => return Ok(None), + Some(j) => j.to_owned(), + }; + let target: Option = args[1].as_json(); + let target = match target { + None => return Ok(None), + Some(target) => target, + }; + + if args.len() == 3 { + match parse_json_path_list(&args[2..])? { + Some(path_expr_list) => { + if path_expr_list.len() == 1 && path_expr_list[0].contains_any_asterisk() { + return Ok(None); + } + match j.as_ref().extract(&path_expr_list)? { + Some(json) => { + j = json; + } + _ => return Ok(None), + } + } + None => return Ok(None), + }; + } + Ok(Some(j.as_ref().json_contains(target)? 
as i64)) +} + #[rpn_fn(nullable, raw_varg, min_args = 2, extra_validator = json_with_paths_validator)] #[inline] fn json_remove(args: &[ScalarValueRef]) -> Result> { @@ -779,6 +826,342 @@ mod tests { } } + #[test] + fn test_json_contains() { + let cases: Vec<(Vec, Option)> = vec![ + ( + vec![ + Some(Json::from_str(r#"{"a":{"a":1},"b":2}"#).unwrap()).into(), + Some(Json::from_str(r#"2"#).unwrap()).into(), + Some(b"$.b".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":{"a":1},"b":2}"#).unwrap()).into(), + Some(Json::from_str(r#"3"#).unwrap()).into(), + Some(b"$.b".to_vec()).into(), + ], + Some(0), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":{"a":1},"b":2}"#).unwrap()).into(), + Some(Json::from_str(r#"{"b":3}"#).unwrap()).into(), + ], + Some(0), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":{"a":1},"b":2}"#).unwrap()).into(), + Some(Json::from_str(r#"{"b":2}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":{"a":1},"b":2}"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + Some(b"$.a".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"optUid": 10, "value": "admin"}]"#).unwrap()).into(), + Some(Json::from_str(r#"10"#).unwrap()).into(), + Some(b"$[0].optUid".to_vec()).into(), + ], + Some(1), + ), + // copy from tidb Tests None arguments + (vec![None::.into(), None::.into()], None), + ( + vec![ + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + None::.into(), + ], + None, + ), + ( + vec![ + None::.into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + ], + None, + ), + ( + vec![ + None::.into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$.c".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + None::.into(), + Some(b"$.a[3]".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + 
Some(Json::from_str(r#"1"#).unwrap()).into(), + None::.into(), + ], + None, + ), + // Tests with path expression + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,[5,[3]]]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,3]"#).unwrap()).into(), + Some(b"$[2]".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,[5,{"a":[2,3]}]]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,{"a":[3]}]"#).unwrap()).into(), + Some(b"$[2]".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"a":1}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + Some(b"$".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"a":1,"b":2}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1,"b":2}"#).unwrap()).into(), + Some(b"$".to_vec()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"a":{"a":1},"b":2}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + Some(b"$.a".to_vec()).into(), + ], + None, + ), + // Tests without path expression + // {[]interface{}{`{}`, `{}`}, 1, nil}, + // {[]interface{}{`{"a":1}`, `{}`}, 1, nil}, + // {[]interface{}{`{"a":1}`, `1`}, 0, nil}, + ( + vec![ + Some(Json::from_str(r#"{}"#).unwrap()).into(), + Some(Json::from_str(r#"{}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + Some(Json::from_str(r#"{}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + ], + Some(0), + ), + // {[]interface{}{`{"a":[1]}`, `[1]`}, 0, nil}, + // {[]interface{}{`{"b":2, "c":3}`, `{"c":3}`}, 1, nil}, + // {[]interface{}{`1`, `1`}, 1, nil}, + // {[]interface{}{`[1]`, `1`}, 1, nil}, + ( + vec![ + Some(Json::from_str(r#"{"a":[1]}"#).unwrap()).into(), + Some(Json::from_str(r#"[1]"#).unwrap()).into(), + ], + Some(0), + ), + ( + vec![ + Some(Json::from_str(r#"{"b":2, 
"c":3}"#).unwrap()).into(), + Some(Json::from_str(r#"{"c":3}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[1]"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + ], + Some(1), + ), + // {[]interface{}{`[1,2]`, `[1]`}, 1, nil}, + // {[]interface{}{`[1,2]`, `[1,3]`}, 0, nil}, + // {[]interface{}{`[1,2]`, `["1"]`}, 0, nil}, + // {[]interface{}{`[1,2,[1,3]]`, `[1,3]`}, 1, nil}, + ( + vec![ + Some(Json::from_str(r#"[1,2]"#).unwrap()).into(), + Some(Json::from_str(r#"[1]"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,3]"#).unwrap()).into(), + ], + Some(0), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2]"#).unwrap()).into(), + Some(Json::from_str(r#"["1"]"#).unwrap()).into(), + ], + Some(0), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,3]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,3]"#).unwrap()).into(), + ], + Some(1), + ), + // {[]interface{}{`[1,2,[1,3]]`, `[1, 3]`}, 1, nil}, + // {[]interface{}{`[1,2,[1,[5,[3]]]]`, `[1,3]`}, 1, nil}, + // {[]interface{}{`[1,2,[1,[5,{"a":[2,3]}]]]`, `[1,{"a":[3]}]`}, 1, nil}, + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,3]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1, 3]"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,[5,[3]]]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,3]"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[1,2,[1,[5,{"a":[2,3]}]]]"#).unwrap()).into(), + Some(Json::from_str(r#"[1,{"a":[3]}]"#).unwrap()).into(), + ], + Some(1), + ), + // {[]interface{}{`[{"a":1}]`, `{"a":1}`}, 1, nil}, + // {[]interface{}{`[{"a":1,"b":2}]`, `{"a":1}`}, 1, nil}, + // {[]interface{}{`[{"a":{"a":1},"b":2}]`, `{"a":1}`}, 0, nil}, + ( + vec![ + 
Some(Json::from_str(r#"[{"a":1}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"a":1,"b":2}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + ], + Some(1), + ), + ( + vec![ + Some(Json::from_str(r#"[{"a":{"a":1},"b":2}]"#).unwrap()).into(), + Some(Json::from_str(r#"{"a":1}"#).unwrap()).into(), + ], + Some(0), + ), + // Tests path expression contains any asterisk + // {[]interface{}{`{"a": [1, 2, {"aa": "xx"}]}`, `1`, "$.*"}, nil, + // json.ErrInvalidJSONPathWildcard}, {[]interface{}{`{"a": [1, 2, {"aa": + // "xx"}]}`, `1`, "$[*]"}, nil, json.ErrInvalidJSONPathWildcard}, + // {[]interface{}{`{"a": [1, 2, {"aa": "xx"}]}`, `1`, "$**.a"}, nil, + // json.ErrInvalidJSONPathWildcard}, + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$.*".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$[*]".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$**.a".to_vec()).into(), + ], + None, + ), + // Tests path expression does not identify a section of the target document + // {[]interface{}{`{"a": [1, 2, {"aa": "xx"}]}`, `1`, "$.c"}, nil, nil}, + // {[]interface{}{`{"a": [1, 2, {"aa": "xx"}]}`, `1`, "$.a[3]"}, nil, nil}, + // {[]interface{}{`{"a": [1, 2, {"aa": "xx"}]}`, `1`, "$.a[2].b"}, nil, nil}, + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$.c".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + 
Some(b"$.a[3]".to_vec()).into(), + ], + None, + ), + ( + vec![ + Some(Json::from_str(r#"{"a": [1, 2, {"aa": "xx"}]}"#).unwrap()).into(), + Some(Json::from_str(r#"1"#).unwrap()).into(), + Some(b"$.a[2].b".to_vec()).into(), + ], + None, + ), + ]; + + for (vargs, expected) in cases { + let output = RpnFnScalarEvaluator::new() + .push_params(vargs.clone()) + .evaluate(ScalarFuncSig::JsonContainsSig) + .unwrap(); + assert_eq!(output, expected, "{:?}", vargs); + } + } + #[test] fn test_json_keys() { let cases: Vec<(Vec, Option, bool)> = vec![ diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index ab6e788ae2e..8bb1cc05480 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -590,6 +590,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::JsonUnquoteSig => json_unquote_fn_meta(), ScalarFuncSig::JsonExtractSig => json_extract_fn_meta(), ScalarFuncSig::JsonLengthSig => json_length_fn_meta(), + ScalarFuncSig::JsonContainsSig => json_contains_fn_meta(), ScalarFuncSig::JsonRemoveSig => json_remove_fn_meta(), ScalarFuncSig::JsonKeysSig => json_keys_fn_meta(), ScalarFuncSig::JsonKeys2ArgsSig => json_keys_fn_meta(), From 11a340c3ec54fdbe1582a4b6eb669f43aa924ba2 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 13 Oct 2022 14:43:51 +0800 Subject: [PATCH 267/676] local_reader: solve the race condition when acquiring the snapshot (#13568) close tikv/tikv#13553 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/query/local.rs | 24 +- .../raftstore-v2/src/operation/query/mod.rs | 4 +- components/raftstore/src/store/peer.rs | 14 +- components/raftstore/src/store/worker/read.rs | 530 ++++++++++++------ tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_local_read.rs | 81 +++ 6 files changed, 453 insertions(+), 201 deletions(-) create mode 100644 
tests/failpoints/cases/test_local_read.rs diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index bdf829dc4f5..78cc9976dab 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -16,7 +16,7 @@ use kvproto::{ use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, util::LeaseState, LocalReaderCore, ReadDelegate, ReadExecutor, + cmd_resp, util::LeaseState, LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, ReadExecutorProvider, RegionSnapshot, RequestInspector, RequestPolicy, TLS_LOCAL_READ_METRICS, }, @@ -110,10 +110,7 @@ where Ok(Some((mut delegate, policy))) => match policy { RequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot( - delegate.get_snapshot(None, &mut None), - region, - ); + let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); let snapshot_ts = monotonic_raw_now(); @@ -133,10 +130,7 @@ where delegate.check_stale_read_safe(read_ts)?; let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot( - delegate.get_snapshot(None, &mut None), - region, - ); + let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); @@ -295,11 +289,7 @@ where self.cached_tablet.latest().unwrap() } - fn get_snapshot( - &mut self, - _: Option, - _: &mut Option>, - ) -> Arc { + fn get_snapshot(&mut self, _: &Option>) -> Arc { Arc::new(self.cached_tablet.latest().unwrap().snapshot()) } } @@ -381,7 +371,7 @@ impl<'r> SnapRequestInspector<'r> { return Ok(RequestPolicy::ReadIndex); } - // If applied index's term is differ from current raft's term, leader transfer + // If applied index's term differs from 
current raft's term, leader transfer // must happened, if read locally, we may read old value. if !self.has_applied_to_current_term() { return Ok(RequestPolicy::ReadIndex); @@ -769,7 +759,7 @@ mod tests { let mut delegate = delegate.unwrap(); let tablet = delegate.get_tablet(); assert_eq!(tablet1.as_inner().path(), tablet.as_inner().path()); - let snapshot = delegate.get_snapshot(None, &mut None); + let snapshot = delegate.get_snapshot(&None); assert_eq!( b"val1".to_vec(), *snapshot.get_value(b"a1").unwrap().unwrap() @@ -779,7 +769,7 @@ mod tests { let mut delegate = delegate.unwrap(); let tablet = delegate.get_tablet(); assert_eq!(tablet2.as_inner().path(), tablet.as_inner().path()); - let snapshot = delegate.get_snapshot(None, &mut None); + let snapshot = delegate.get_snapshot(&None); assert_eq!( b"val2".to_vec(), *snapshot.get_value(b"a2").unwrap().unwrap() diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index b592b4819a5..0b10e0679a5 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -56,8 +56,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> return Ok(RequestPolicy::ReadIndex); } - // If applied index's term is differ from current raft's term, leader transfer - // must happened, if read locally, we may read old value. + // If applied index's term differs from current raft's term, leader + // transfer must happened, if read locally, we may read old value. // TODO: to add the block back when apply is implemented. 
// if !self.fsm.peer().has_applied_to_current_term() { // return Ok(RequestPolicy::ReadIndex); diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index c95dda17c2c..37c2fd5a99a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -60,7 +60,7 @@ use tikv_util::{ codec::number::decode_u64, debug, error, info, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, InstantExt, ThreadReadId}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant, InstantExt}, warn, worker::Scheduler, Either, @@ -4766,7 +4766,7 @@ where } } - let mut resp = reader.execute(&req, &Arc::new(region), read_index, None, None); + let mut resp = reader.execute(&req, &Arc::new(region), read_index, None); if let Some(snap) = resp.snapshot.as_mut() { snap.txn_ext = Some(self.txn_ext.clone()); snap.bucket_meta = self.region_buckets.as_ref().map(|b| b.meta.clone()); @@ -5558,8 +5558,8 @@ pub trait RequestInspector { return Ok(RequestPolicy::ReadIndex); } - // If applied index's term is differ from current raft's term, leader transfer - // must happened, if read locally, we may read old value. + // If applied index's term differs from current raft's term, leader + // transfer must happened, if read locally, we may read old value. 
if !self.has_applied_to_current_term() { return Ok(RequestPolicy::ReadIndex); } @@ -5616,11 +5616,7 @@ where &self.engines.kv } - fn get_snapshot( - &mut self, - _: Option, - _: &mut Option>, - ) -> Arc { + fn get_snapshot(&mut self, _: &Option>) -> Arc { Arc::new(self.engines.kv.snapshot()) } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5801083f1bc..1f6d7c4bab7 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -6,7 +6,7 @@ use std::{ fmt::{self, Display, Formatter}, ops::Deref, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{self, AtomicU64, Ordering}, Arc, Mutex, }, }; @@ -47,22 +47,31 @@ pub trait ReadExecutor { type Tablet: KvEngine; fn get_tablet(&mut self) -> &Self::Tablet; + + /// Get the snapshot fo the tablet. + /// + /// If the tablet is not ready, `None` is returned. + /// Currently, only multi-rocksdb version may return `None`. fn get_snapshot( &mut self, - ts: Option, - read_context: &mut Option>, + read_context: &Option>, ) -> Arc<::Snapshot>; - fn get_value(&mut self, req: &Request, region: &metapb::Region) -> Result { + fn get_value( + &mut self, + req: &Request, + region: &metapb::Region, + read_context: &Option>, + ) -> Result { let key = req.get_get().get_key(); // region key range has no data prefix, so we must use origin key to check. 
util::check_key_in_region(key, region)?; - let engine = self.get_tablet(); let mut resp = Response::default(); + let snapshot = self.get_snapshot(read_context); let res = if !req.get_get().get_cf().is_empty() { let cf = req.get_get().get_cf(); - engine + snapshot .get_value_cf(cf, &keys::data_key(key)) .unwrap_or_else(|e| { panic!( @@ -74,14 +83,16 @@ pub trait ReadExecutor { ) }) } else { - engine.get_value(&keys::data_key(key)).unwrap_or_else(|e| { - panic!( - "[region {}] failed to get {}: {:?}", - region.get_id(), - log_wrappers::Value::key(key), - e - ) - }) + snapshot + .get_value(&keys::data_key(key)) + .unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {}: {:?}", + region.get_id(), + log_wrappers::Value::key(key), + e + ) + }) }; if let Some(res) = res { resp.mut_get().set_value(res.to_vec()); @@ -95,8 +106,7 @@ pub trait ReadExecutor { msg: &RaftCmdRequest, region: &Arc, read_index: Option, - mut ts: Option, - mut read_context: Option>, + local_read_ctx: Option>, ) -> ReadResponse<::Snapshot> { let requests = msg.get_requests(); let mut response = ReadResponse { @@ -108,7 +118,7 @@ pub trait ReadExecutor { for req in requests { let cmd_type = req.get_cmd_type(); let mut resp = match cmd_type { - CmdType::Get => match self.get_value(req, region.as_ref()) { + CmdType::Get => match self.get_value(req, region.as_ref(), &local_read_ctx) { Ok(resp) => resp, Err(e) => { error!(?e; @@ -121,7 +131,7 @@ pub trait ReadExecutor { }, CmdType::Snap => { let snapshot = RegionSnapshot::from_snapshot( - self.get_snapshot(ts.take(), &mut read_context), + self.get_snapshot(&local_read_ctx), region.clone(), ); response.snapshot = Some(snapshot); @@ -187,13 +197,61 @@ where } } -/// #[RaftstoreCommon]: LocalReadContext combines some LocalReader's fields for temporary usage. 
pub struct LocalReadContext<'a, E> where E: KvEngine, { - read_id: &'a mut ThreadReadId, - snap_cache: &'a mut Box>>, + read_id: Option, + snap_cache: &'a mut SnapCache, +} + +impl<'a, E> LocalReadContext<'a, E> +where + E: KvEngine, +{ + fn new(snap_cache: &'a mut SnapCache, read_id: Option) -> Self { + Self { + snap_cache, + read_id, + } + } + + /// Update the snapshot in the `snap_cache` if the read_id is None or does + /// not match. + fn maybe_update_snapshot(&mut self, engine: &E, delegate_last_valid_ts: Timespec) -> bool { + // When the read_id is None, it means the `snap_cache` has been cleared + // before and the `cached_read_id` of it is None because only a consecutive + // requests will have the same cache and the cache will be cleared after the + // last request of the batch. + if self.read_id.is_some() { + if self.snap_cache.cached_read_id == self.read_id + && self.read_id.as_ref().unwrap().create_time >= delegate_last_valid_ts + { + // Cache hit + return false; + } + + self.snap_cache.cached_read_id = self.read_id.clone(); + } + + self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); + + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + self.snap_cache.cached_snapshot_ts = monotonic_raw_now(); + + true + } + + // Note: must be called after `maybe_update_snapshot` + fn snapshot_ts(&self) -> Timespec { + self.snap_cache.cached_snapshot_ts + } + + // Note: must be called after `maybe_update_snapshot` + fn snapshot(&self) -> Option> { + self.snap_cache.snapshot.clone() + } } impl Drop for ReadDelegate { @@ -460,6 +518,7 @@ impl ReadDelegate { let term = lease.term(); if term == self.term { if lease.inspect(Some(ts)) == LeaseState::Valid { + fail_point!("after_pass_lease_check"); return true; } else { TLS_LOCAL_READ_METRICS @@ -566,6 +625,46 @@ impl Progress { } } +struct SnapCache +where + E: KvEngine, +{ + cached_read_id: Option, + snapshot: Option>, + cached_snapshot_ts: Timespec, +} + 
+impl SnapCache +where + E: KvEngine, +{ + fn new() -> Self { + SnapCache { + cached_read_id: None, + snapshot: None, + cached_snapshot_ts: Timespec::new(0, 0), + } + } + + fn clear(&mut self) { + self.cached_read_id.take(); + self.snapshot.take(); + } +} + +impl Clone for SnapCache +where + E: KvEngine, +{ + fn clone(&self) -> Self { + Self { + cached_read_id: self.cached_read_id.clone(), + snapshot: self.snapshot.clone(), + cached_snapshot_ts: self.cached_snapshot_ts, + } + } +} + /// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the /// relevant regions by LocalReader so that these requests can be handled by the /// relevant ReadDelegate respectively. @@ -705,10 +804,7 @@ where { local_reader: LocalReaderCore, StoreMetaDelegate>, kv_engine: E, - - snap_cache: Box>>, - cache_read_id: ThreadReadId, - + snap_cache: SnapCache, // A channel to raftstore. router: C, } @@ -719,23 +815,14 @@ where C: ProposalRouter + CasualRouter, { pub fn new(kv_engine: E, store_meta: StoreMetaDelegate, router: C) -> Self { - let cache_read_id = ThreadReadId::new(); Self { local_reader: LocalReaderCore::new(store_meta), kv_engine, - snap_cache: Box::new(None), - cache_read_id, + snap_cache: SnapCache::new(), router, } } - fn local_read_context(&mut self) -> LocalReadContext<'_, E> { - LocalReadContext { - snap_cache: &mut self.snap_cache, - read_id: &mut self.cache_read_id, - } - } - pub fn pre_propose_raft_command( &mut self, req: &RaftCmdRequest, @@ -790,37 +877,34 @@ where pub fn propose_raft_command( &mut self, - mut read_id: Option, + read_id: Option, req: RaftCmdRequest, cb: Callback, ) { match self.pre_propose_raft_command(&req) { Ok(Some((mut delegate, policy))) => { + let snap_updated; + let last_valid_ts = delegate.last_valid_ts; let mut response = match policy { // Leader can read local if and only if it is in lease. 
RequestPolicy::ReadLocal => { - let snapshot_ts = match read_id.as_mut() { - // If this peer became Leader not long ago and just after the cached - // snapshot was created, this snapshot can not see all data of the peer. - Some(id) => { - if id.create_time <= delegate.last_valid_ts { - id.create_time = monotonic_raw_now(); - } - id.create_time - } - None => monotonic_raw_now(), - }; + let mut local_read_ctx = + LocalReadContext::new(&mut self.snap_cache, read_id); + + snap_updated = local_read_ctx + .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); + + let snapshot_ts = local_read_ctx.snapshot_ts(); if !delegate.is_in_leader_lease(snapshot_ts) { + fail_point!("localreader_before_redirect", |_| {}); // Forward to raftstore. self.redirect(RaftCommand::new(req, cb)); return; } - let read_ctx = self.local_read_context(); - let region = Arc::clone(&delegate.region); - let response = - delegate.execute(&req, ®ion, None, read_id, Some(read_ctx)); + let response = delegate.execute(&req, ®ion, None, Some(local_read_ctx)); + // Try renew lease in advance delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); response @@ -837,12 +921,14 @@ where return; } - let read_ctx = self.local_read_context(); + let mut local_read_ctx = + LocalReadContext::new(&mut self.snap_cache, read_id); + snap_updated = local_read_ctx + .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); let region = Arc::clone(&delegate.region); // Getting the snapshot - let response = - delegate.execute(&req, ®ion, None, read_id, Some(read_ctx)); + let response = delegate.execute(&req, ®ion, None, Some(local_read_ctx)); // Double check in case `safe_ts` change after the first check and before // getting snapshot @@ -860,6 +946,13 @@ where } _ => unreachable!(), }; + + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + if !snap_updated { + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); + } + 
cmd_resp::bind_term(&mut response.response, delegate.term); if let Some(snap) = response.snapshot.as_mut() { snap.txn_ext = Some(delegate.txn_ext.clone()); @@ -906,7 +999,7 @@ where } pub fn release_snapshot_cache(&mut self) { - self.snap_cache.as_mut().take(); + self.snap_cache.clear(); } } @@ -920,7 +1013,6 @@ where local_reader: self.local_reader.clone(), kv_engine: self.kv_engine.clone(), snap_cache: self.snap_cache.clone(), - cache_read_id: self.cache_read_id.clone(), router: self.router.clone(), } } @@ -936,27 +1028,8 @@ where &self.kv_engine } - fn get_snapshot( - &mut self, - create_time: Option, - read_context: &mut Option>, - ) -> Arc { - let ctx = read_context.as_mut().unwrap(); - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); - if let Some(ts) = create_time { - if ts == *ctx.read_id { - if let Some(snap) = ctx.snap_cache.as_ref().as_ref() { - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_snapshot_cache_hit.inc()); - return snap.clone(); - } - } - let snap = Arc::new(self.kv_engine.snapshot()); - *ctx.read_id = ts; - *ctx.snap_cache = Box::new(Some(snap.clone())); - return snap; - } - Arc::new(self.kv_engine.snapshot()) + fn get_snapshot(&mut self, read_context: &Option>) -> Arc { + read_context.as_ref().unwrap().snapshot().unwrap() } } @@ -998,12 +1071,12 @@ impl<'r> RequestInspector for Inspector<'r> { #[cfg(test)] mod tests { - use std::{sync::mpsc::*, thread}; + use std::{ops::Add, sync::mpsc::*, thread}; use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; use engine_traits::{Peekable, SyncMutable, ALL_CFS}; - use kvproto::raft_cmdpb::*; + use kvproto::{metapb::RegionEpoch, raft_cmdpb::*}; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; use time::Duration; @@ -1107,7 +1180,16 @@ mod tests { rx: &Receiver>, task: RaftCommand, ) { - reader.propose_raft_command(None, task.request, task.callback); + 
must_not_redirect_with_read_id(reader, rx, task, None); + } + + fn must_not_redirect_with_read_id( + reader: &mut LocalReader, + rx: &Receiver>, + task: RaftCommand, + read_id: Option, + ) { + reader.propose_raft_command(read_id, task.request, task.callback); assert_eq!(rx.try_recv().unwrap_err(), TryRecvError::Empty); } @@ -1523,14 +1605,13 @@ mod tests { } #[test] - fn test_read_delegate() { + fn test_read_executor_provider() { let path = Builder::new() .prefix("test-local-reader") .tempdir() .unwrap(); let kv_engine = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); - kv_engine.put(b"a1", b"val1").unwrap(); let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(0))), kv_engine.clone()); @@ -1546,135 +1627,238 @@ mod tests { meta.readers.insert(2, read_delegate); } - let mut read_id = ThreadReadId::new(); - let mut snap_cache = Box::new(None); - - let read_id_copy = Some(read_id.clone()); - - let mut read_context = Some(LocalReadContext { - read_id: &mut read_id, - snap_cache: &mut snap_cache, - }); - - let (_, delegate) = store_meta.get_executor_and_len(1); + let (len, delegate) = store_meta.get_executor_and_len(1); + assert_eq!(2, len); let mut delegate = delegate.unwrap(); + assert_eq!(1, delegate.region.id); let tablet = delegate.get_tablet(); assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); - let snapshot = delegate.get_snapshot(read_id_copy.clone(), &mut read_context); - assert_eq!( - b"val1".to_vec(), - *snapshot.get_value(b"a1").unwrap().unwrap() - ); - let (_, delegate) = store_meta.get_executor_and_len(2); + let (len, delegate) = store_meta.get_executor_and_len(2); + assert_eq!(2, len); let mut delegate = delegate.unwrap(); + assert_eq!(2, delegate.region.id); let tablet = delegate.get_tablet(); assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); - let snapshot = delegate.get_snapshot(read_id_copy, &mut read_context); - assert_eq!( - b"val1".to_vec(), - 
*snapshot.get_value(b"a1").unwrap().unwrap() - ); - - assert!(snap_cache.as_ref().is_some()); - assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_requests.get()), - 2 - ); - assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 1 - ); } - #[test] - fn test_snap_cache_hit() { - let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); - let (_tmp, mut reader, _) = new_reader("test-local-reader", 1, store_meta.clone()); + fn prepare_read_delegate( + store_id: u64, + region_id: u64, + term: u64, + pr_ids: Vec, + region_epoch: RegionEpoch, + store_meta: Arc>, + ) { + let mut region = metapb::Region::default(); + region.set_id(region_id); + let prs = new_peers(store_id, pr_ids); + region.set_peers(prs.clone().into()); - let mut region1 = metapb::Region::default(); - region1.set_id(1); + let leader = prs[0].clone(); + region.set_region_epoch(region_epoch); + let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. + let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, "".to_owned())); - // Register region 1 + // Register region + lease.renew(monotonic_raw_now()); + let remote = lease.maybe_new_remote_lease(term).unwrap(); + // But the applied_term is stale. 
{ let mut meta = store_meta.lock().unwrap(); let read_delegate = ReadDelegate { tag: String::new(), - region: Arc::new(region1.clone()), - peer_id: 1, - term: 1, - applied_term: 1, - leader_lease: None, + region: Arc::new(region.clone()), + peer_id: leader.get_id(), + term, + applied_term: term, + leader_lease: Some(remote), last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), txn_ext: Arc::new(TxnExt::default()), - read_progress: Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())), + read_progress, pending_remove: false, track_ver: TrackVer::new(), bucket_meta: None, }; - meta.readers.insert(1, read_delegate); + meta.readers.insert(region_id, read_delegate); } + } - let mut delegate = reader.local_reader.get_delegate(region1.id).unwrap(); - let read_id = Some(ThreadReadId::new()); + #[test] + fn test_snap_across_regions() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); - { - let mut read_context = Some(reader.local_read_context()); + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; - for _ in 0..10 { - // Different region id should reuse the cache - let _ = delegate.get_snapshot(read_id.clone(), &mut read_context); - } - } - // We should hit cache 9 times - assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 9 + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate( + store_id, + 1, + term6, + pr_ids1, + epoch13.clone(), + store_meta.clone(), + ); + let leader1 = prs1[0].clone(); + + // Register region2 + let pr_ids2 = vec![22, 33, 44]; + let prs2 = new_peers(store_id, pr_ids2.clone()); + prepare_read_delegate(store_id, 2, term6, pr_ids2, epoch13.clone(), store_meta); + let leader2 
= prs2[0].clone(); + + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13.clone()); + header.set_term(term6); + cmd.set_header(header); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), ); + // First request will not hit cache let read_id = Some(ThreadReadId::new()); + must_not_redirect_with_read_id(&mut reader, &rx, task, read_id.clone()); + let snap1 = snap_rx.recv().unwrap(); - { - let read_context = reader.local_read_context(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(2); + header.set_peer(leader2); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), + ); + must_not_redirect_with_read_id(&mut reader, &rx, task, read_id); + let snap2 = snap_rx.recv().unwrap(); + assert!(std::ptr::eq(snap1.get_snapshot(), snap2.get_snapshot())); - let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); - } - // This time, we will miss the cache - assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 9 + // If we use a new read id, the cache will be miss and a new snapshot will be + // generated + let read_id = Some(ThreadReadId::new()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp.snapshot.unwrap()).unwrap(); + })), ); + must_not_redirect_with_read_id(&mut reader, &rx, 
task, read_id); + let snap2 = snap_rx.recv().unwrap(); + assert!(!std::ptr::eq(snap1.get_snapshot(), snap2.get_snapshot())); + } - { - let read_context = reader.local_read_context(); - let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); - // We can hit it again. - assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 10 - ); - } + fn create_engine(path: &str) -> KvTestEngine { + let path = Builder::new().prefix(path).tempdir().unwrap(); + engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap() + } - reader.release_snapshot_cache(); - { - let read_context = reader.local_read_context(); - let _ = delegate.get_snapshot(read_id.clone(), &mut Some(read_context)); - } - // After release, we will mss the cache even with the prevsiou read_id. + #[test] + fn test_snap_cache_context() { + let db = create_engine("test_snap_cache_context"); + let mut snap_cache = SnapCache::new(); + let mut read_context = LocalReadContext::new(&mut snap_cache, None); + + // Have not inited the snap cache + assert!(read_context.snapshot().is_none()); + + db.put(b"a1", b"val1").unwrap(); + + let compare_ts = monotonic_raw_now(); + // Case 1: snap_cache_context.read_id is None + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts() > compare_ts); assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 10 + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" ); - { - let read_context = reader.local_read_context(); - let _ = delegate.get_snapshot(read_id, &mut Some(read_context)); - } - // We can hit it again. 
+ // snap_cache_context is *not* created with read_id, so calling + // `maybe_update_snapshot` again will update the snapshot + let compare_ts = monotonic_raw_now(); + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts() > compare_ts); + + let read_id = ThreadReadId::new(); + let read_id_clone = read_id.clone(); + let mut read_context = LocalReadContext::new(&mut snap_cache, Some(read_id)); + + let compare_ts = monotonic_raw_now(); + // Case 2: snap_cache_context.read_id is not None but not equals to the + // snap_cache.cached_read_id + assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); + assert!(read_context.snapshot_ts() > compare_ts); + let snap_ts = read_context.snapshot_ts(); + assert_eq!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" + ); + + let db2 = create_engine("test_snap_cache_context2"); + // snap_cache_context is created with read_id, so calling + // `maybe_update_snapshot` again will *not* update the snapshot + // Case 3: snap_cache_context.read_id is not None and equals to the + // snap_cache.cached_read_id + assert!(!read_context.maybe_update_snapshot(&db2, Timespec::new(0, 0))); + assert_eq!(read_context.snapshot_ts(), snap_ts); assert_eq!( - TLS_LOCAL_READ_METRICS.with(|m| m.borrow().local_executed_snapshot_cache_hit.get()), - 11 + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .unwrap(), + b"val1" + ); + + // Case 4: delegate.last_valid_ts is larger than create_time of read_id + let mut last_valid_ts = read_id_clone.create_time; + last_valid_ts = last_valid_ts.add(Duration::nanoseconds(1)); + assert!(read_context.maybe_update_snapshot(&db2, last_valid_ts)); + assert!(read_context.snapshot_ts() > snap_ts); + assert!( + read_context + .snapshot() + .unwrap() + .get_value(b"a1") + .unwrap() + .is_none(), ); } } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 
3fda1ca0a80..b291e86b88c 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -15,6 +15,7 @@ mod test_gc_worker; mod test_hibernate; mod test_import_service; mod test_kv_service; +mod test_local_read; mod test_memory_usage_limit; mod test_merge; mod test_metrics_overflow; diff --git a/tests/failpoints/cases/test_local_read.rs b/tests/failpoints/cases/test_local_read.rs new file mode 100644 index 00000000000..06365fb36fb --- /dev/null +++ b/tests/failpoints/cases/test_local_read.rs @@ -0,0 +1,81 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{sync::Arc, thread, time::Duration}; + +use grpcio::{ChannelBuilder, Environment}; +use kvproto::{ + kvrpcpb::{Context, RawGetRequest}, + tikvpb_grpc::TikvClient, +}; +use test_raftstore::{ + must_get_equal, must_get_none, must_raw_get, must_raw_put, new_peer, new_server_cluster, +}; +use tikv_util::HandyRwLock; + +// The test mocks the situation that just after passing the lease check, even +// when lease expires, we can read the correct value. +#[test] +fn test_consistency_after_lease_pass() { + let mut cluster = new_server_cluster(0, 3); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + let leader = new_peer(1, 1); + cluster.must_transfer_leader(1, leader); + + // Create clients. 
+ let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(1)); + let client = TikvClient::new(channel); + + let region = cluster.get_region(&b"key1"[..]); + let region_id = region.id; + let leader = cluster.leader_of_region(region_id).unwrap(); + + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader.clone()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + + must_raw_put(&client, ctx.clone(), b"key1".to_vec(), b"value1".to_vec()); + must_get_equal(&cluster.get_engine(1), b"key1", b"value1"); + + // Ensure the request is executed by the local reader + fail::cfg("localreader_before_redirect", "panic").unwrap(); + + // Lease read works correctly + assert_eq!( + must_raw_get(&client, ctx.clone(), b"key1".to_vec()).unwrap(), + b"value1".to_vec() + ); + + // we pause just after pass the lease check, and then remove the peer. We can + // still read the relevant value as we should have already got the snapshot when + // passing the lease check. + fail::cfg("after_pass_lease_check", "pause").unwrap(); + + let mut get_req = RawGetRequest::default(); + get_req.set_context(ctx); + get_req.key = b"key1".to_vec(); + let mut receiver = client.raw_get_async(&get_req).unwrap(); + + thread::sleep(Duration::from_millis(200)); + + let mut peer = leader.clone(); + cluster.must_transfer_leader(1, new_peer(2, 2)); + pd_client.must_remove_peer(region_id, leader); + peer.id = 1000; + // After we pass the lease check, we should have got the snapshot, so the data + // that the region contains cannot be deleted. + // So we need to add the new peer for this region and stop before applying the + // snapshot so that the old data will be deleted and the snapshot data has not + // been written. 
+ fail::cfg("apply_snap_cleanup_range", "pause").unwrap(); + pd_client.must_add_peer(region_id, peer); + + // Wait for data to be cleaned + must_get_none(&cluster.get_engine(1), b"key1"); + fail::cfg("after_pass_lease_check", "off").unwrap(); + + assert_eq!(b"value1", receiver.receive_sync().unwrap().1.get_value()); +} From b448214b8f2c0a6a9ba2381a1983ce20e6514218 Mon Sep 17 00:00:00 2001 From: Ping Yu Date: Fri, 14 Oct 2022 12:59:51 +0800 Subject: [PATCH 268/676] causal-ts: rename `available-interval` to `alloc-ahead-buffer` (#13597) ref tikv/tikv#13596, close tikv/tikv#13596 Rename `causal-ts.available-interval` to `causal-ts.alloc-ahead-buffer` for more clear meaning. Signed-off-by: pingyu --- components/causal_ts/src/config.rs | 23 ++++++++++++----------- components/causal_ts/src/tso.rs | 20 ++++++++++---------- components/server/src/server.rs | 2 +- components/test_raftstore/src/server.rs | 2 +- tests/integrations/config/mod.rs | 2 +- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/components/causal_ts/src/config.rs b/components/causal_ts/src/config.rs index 0b08fecc7d6..17994344924 100644 --- a/components/causal_ts/src/config.rs +++ b/components/causal_ts/src/config.rs @@ -28,28 +28,29 @@ pub struct Config { /// interval. The 50ms limitation can not be broken through now (see /// `tso-update-physical-interval`). pub renew_batch_max_size: u32, - /// The available interval of BatchTsoProvider. + /// The size (in duration) of TSO buffer allocated ahead for + /// BatchTsoProvider. /// /// Default is 3s. - /// The longer of the value can provide better "high-availability" against - /// PD failure, but more overhead of `TsoBatchList` & pressure to TSO + /// The longer of the value will help to improve tolerance against PD + /// failure, but more overhead of `TsoBatchList` & pressure to TSO /// service. 
- pub available_interval: ReadableDuration, + pub alloc_ahead_buffer: ReadableDuration, } impl Config { pub fn validate(&self) -> Result<(), Box> { if self.renew_interval.is_zero() { - return Err("causal-ts.renew_interval can't be zero".into()); + return Err("causal-ts.renew-interval can't be zero".into()); } if self.renew_batch_min_size == 0 { - return Err("causal-ts.renew_batch_min_size should be greater than 0".into()); + return Err("causal-ts.renew-batch-min-size should be greater than 0".into()); } if self.renew_batch_max_size == 0 { - return Err("causal-ts.renew_batch_max_size should be greater than 0".into()); + return Err("causal-ts.renew-batch-max-size should be greater than 0".into()); } - if self.available_interval.is_zero() { - return Err("causal-ts.available-interval can't be zero".into()); + if self.alloc_ahead_buffer.is_zero() { + return Err("causal-ts.alloc-ahead-buffer can't be zero".into()); } Ok(()) } @@ -63,8 +64,8 @@ impl Default for Config { ), renew_batch_min_size: crate::tso::DEFAULT_TSO_BATCH_MIN_SIZE, renew_batch_max_size: crate::tso::DEFAULT_TSO_BATCH_MAX_SIZE, - available_interval: ReadableDuration::millis( - crate::tso::DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS, + alloc_ahead_buffer: ReadableDuration::millis( + crate::tso::DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS, ), } } diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index ad9f3ec1fc6..5056cfe2ebd 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -2,12 +2,12 @@ //! ## The algorithm to make the TSO cache tolerate failure of TSO service //! -//! 1. The scale of High-Available is specified by config item -//! `causal-ts.available-interval`. +//! 1. The expected total size (in duration) of TSO cache is specified by +//! config item `causal-ts.alloc-ahead-buffer`. //! //! 2. Count usage of TSO on every renew interval. //! -//! 3. Calculate `cache_multiplier` by `causal-ts.available-interval / +//! 3. 
Calculate `cache_multiplier` by `causal-ts.alloc-ahead-buffer / //! causal-ts.renew-interval`. //! //! 4. Then `tso_usage x cache_multiplier` is the expected number of TSO should @@ -67,9 +67,9 @@ pub(crate) const DEFAULT_TSO_BATCH_MAX_SIZE: u32 = 8192; /// of PD. The longer of the value can provide better "High-Availability" /// against PD failure, but more overhead of `TsoBatchList` & pressure to TSO /// service. -pub(crate) const DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS: u64 = 3000; +pub(crate) const DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS: u64 = 3000; /// Just a limitation for safety, in case user specify a too big -/// `available_interval`. +/// `alloc_ahead_buffer`. const MAX_TSO_BATCH_LIST_CAPACITY: u32 = 1024; /// TSO range: [(physical, logical_start), (physical, logical_end)) @@ -326,7 +326,7 @@ impl BatchTsoProvider { Self::new_opt( pd_client, Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS), - Duration::from_millis(DEFAULT_TSO_BATCH_AVAILABLE_INTERVAL_MS), + Duration::from_millis(DEFAULT_TSO_BATCH_ALLOC_AHEAD_BUFFER_MS), DEFAULT_TSO_BATCH_MIN_SIZE, DEFAULT_TSO_BATCH_MAX_SIZE, ) @@ -334,23 +334,23 @@ impl BatchTsoProvider { } #[allow(unused_mut)] - fn calc_cache_multiplier(mut renew_interval: Duration, available_interval: Duration) -> u32 { + fn calc_cache_multiplier(mut renew_interval: Duration, alloc_ahead: Duration) -> u32 { #[cfg(any(test, feature = "testexport"))] if renew_interval.is_zero() { // Should happen in test only. 
renew_interval = Duration::from_millis(DEFAULT_TSO_BATCH_RENEW_INTERVAL_MS); } - available_interval.div_duration_f64(renew_interval).ceil() as u32 + alloc_ahead.div_duration_f64(renew_interval).ceil() as u32 } pub async fn new_opt( pd_client: Arc, renew_interval: Duration, - available_interval: Duration, + alloc_ahead: Duration, batch_min_size: u32, batch_max_size: u32, ) -> Result { - let cache_multiplier = Self::calc_cache_multiplier(renew_interval, available_interval); + let cache_multiplier = Self::calc_cache_multiplier(renew_interval, alloc_ahead); let renew_parameter = RenewParameter { batch_min_size, batch_max_size, diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 247bc6ccb58..2320d1156f4 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -349,7 +349,7 @@ where let tso = block_on(causal_ts::BatchTsoProvider::new_opt( pd_client.clone(), config.causal_ts.renew_interval.0, - config.causal_ts.available_interval.0, + config.causal_ts.alloc_ahead_buffer.0, config.causal_ts.renew_batch_min_size, config.causal_ts.renew_batch_max_size, )); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 67eb3a22db6..4c0bbce3fd1 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -374,7 +374,7 @@ impl ServerCluster { block_on(causal_ts::BatchTsoProvider::new_opt( self.pd_client.clone(), cfg.causal_ts.renew_interval.0, - cfg.causal_ts.available_interval.0, + cfg.causal_ts.alloc_ahead_buffer.0, cfg.causal_ts.renew_batch_min_size, cfg.causal_ts.renew_batch_max_size, )) diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index d0eac27e3b1..e2d5ef06b6e 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -798,7 +798,7 @@ fn test_serde_custom_tikv_config() { renew_interval: ReadableDuration::millis(100), renew_batch_min_size: 100, renew_batch_max_size: 
8192, - available_interval: ReadableDuration::millis(3000), + alloc_ahead_buffer: ReadableDuration::millis(3000), }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); From 26b6c3cfccb386af50b1592ae4583dfb0003d7c1 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 14 Oct 2022 18:21:53 +0800 Subject: [PATCH 269/676] local_reader: release snapshot properly (#13605) close tikv/tikv#13553 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/worker/read.rs | 153 +++++++++++++++--- 1 file changed, 133 insertions(+), 20 deletions(-) diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 1f6d7c4bab7..fd6c7552f5d 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -203,6 +203,11 @@ where { read_id: Option, snap_cache: &'a mut SnapCache, + + // Used when read_id is not set, duplicated definition to avoid cache invalidation in case + // stale read and local read are mixed in one batch. 
+ snapshot: Option>, + snapshot_ts: Option, } impl<'a, E> LocalReadContext<'a, E> @@ -213,6 +218,8 @@ where Self { snap_cache, read_id, + snapshot: None, + snapshot_ts: None, } } @@ -232,25 +239,40 @@ where } self.snap_cache.cached_read_id = self.read_id.clone(); - } + self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); - self.snap_cache.snapshot = Some(Arc::new(engine.snapshot())); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + self.snap_cache.cached_snapshot_ts = monotonic_raw_now(); + } else { + // read_id being None means the snapshot acquired will only be used in this + // request + self.snapshot = Some(Arc::new(engine.snapshot())); - // Ensures the snapshot is acquired before getting the time - atomic::fence(atomic::Ordering::Release); - self.snap_cache.cached_snapshot_ts = monotonic_raw_now(); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + self.snapshot_ts = Some(monotonic_raw_now()); + } true } - // Note: must be called after `maybe_update_snapshot` - fn snapshot_ts(&self) -> Timespec { - self.snap_cache.cached_snapshot_ts + fn snapshot_ts(&self) -> Option { + if self.read_id.is_some() { + Some(self.snap_cache.cached_snapshot_ts) + } else { + self.snapshot_ts + } } // Note: must be called after `maybe_update_snapshot` fn snapshot(&self) -> Option> { - self.snap_cache.snapshot.clone() + // read_id being some means we go through cache + if self.read_id.is_some() { + self.snap_cache.snapshot.clone() + } else { + self.snapshot.clone() + } } } @@ -894,7 +916,7 @@ where snap_updated = local_read_ctx .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); - let snapshot_ts = local_read_ctx.snapshot_ts(); + let snapshot_ts = local_read_ctx.snapshot_ts().unwrap(); if !delegate.is_in_leader_lease(snapshot_ts) { fail_point!("localreader_before_redirect", |_| {}); // Forward to raftstore. 
@@ -921,8 +943,8 @@ where return; } - let mut local_read_ctx = - LocalReadContext::new(&mut self.snap_cache, read_id); + // Stale read does not use cache, so we pass None for read_id + let mut local_read_ctx = LocalReadContext::new(&mut self.snap_cache, None); snap_updated = local_read_ctx .maybe_update_snapshot(delegate.get_tablet(), last_valid_ts); @@ -1075,7 +1097,7 @@ mod tests { use crossbeam::channel::TrySendError; use engine_test::kv::{KvTestEngine, KvTestSnapshot}; - use engine_traits::{Peekable, SyncMutable, ALL_CFS}; + use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; use kvproto::{metapb::RegionEpoch, raft_cmdpb::*}; use tempfile::{Builder, TempDir}; use tikv_util::{codec::number::NumberEncoder, time::monotonic_raw_now}; @@ -1785,15 +1807,15 @@ mod tests { let mut snap_cache = SnapCache::new(); let mut read_context = LocalReadContext::new(&mut snap_cache, None); - // Have not inited the snap cache assert!(read_context.snapshot().is_none()); + assert!(read_context.snapshot_ts().is_none()); db.put(b"a1", b"val1").unwrap(); let compare_ts = monotonic_raw_now(); // Case 1: snap_cache_context.read_id is None assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); - assert!(read_context.snapshot_ts() > compare_ts); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); assert_eq!( read_context .snapshot() @@ -1808,7 +1830,7 @@ mod tests { // `maybe_update_snapshot` again will update the snapshot let compare_ts = monotonic_raw_now(); assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); - assert!(read_context.snapshot_ts() > compare_ts); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); let read_id = ThreadReadId::new(); let read_id_clone = read_id.clone(); @@ -1818,8 +1840,8 @@ mod tests { // Case 2: snap_cache_context.read_id is not None but not equals to the // snap_cache.cached_read_id assert!(read_context.maybe_update_snapshot(&db, Timespec::new(0, 0))); - assert!(read_context.snapshot_ts() > 
compare_ts); - let snap_ts = read_context.snapshot_ts(); + assert!(read_context.snapshot_ts().unwrap() > compare_ts); + let snap_ts = read_context.snapshot_ts().unwrap(); assert_eq!( read_context .snapshot() @@ -1836,7 +1858,7 @@ mod tests { // Case 3: snap_cache_context.read_id is not None and equals to the // snap_cache.cached_read_id assert!(!read_context.maybe_update_snapshot(&db2, Timespec::new(0, 0))); - assert_eq!(read_context.snapshot_ts(), snap_ts); + assert_eq!(read_context.snapshot_ts().unwrap(), snap_ts); assert_eq!( read_context .snapshot() @@ -1851,7 +1873,7 @@ mod tests { let mut last_valid_ts = read_id_clone.create_time; last_valid_ts = last_valid_ts.add(Duration::nanoseconds(1)); assert!(read_context.maybe_update_snapshot(&db2, last_valid_ts)); - assert!(read_context.snapshot_ts() > snap_ts); + assert!(read_context.snapshot_ts().unwrap() > snap_ts); assert!( read_context .snapshot() @@ -1861,4 +1883,95 @@ mod tests { .is_none(), ); } + + #[test] + fn test_snap_release_for_not_using_cache() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate(store_id, 1, term6, pr_ids1, epoch13.clone(), store_meta); + let leader1 = prs1[0].clone(); + + // Local read + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + header.set_region_epoch(epoch13); + header.set_term(term6); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // using cache 
and release + let read_id = ThreadReadId::new(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + must_not_redirect_with_read_id(&mut reader, &rx, task, Some(read_id)); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_some() + ); + reader.release_snapshot_cache(); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); + + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + + // not use cache + must_not_redirect_with_read_id(&mut reader, &rx, task, None); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); + + // Stale read + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(0).unwrap(); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + header.set_flag_data(data.into()); + + cmd.set_header(header); + let task = RaftCommand::::new( + cmd, + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + let read_id = ThreadReadId::new(); + must_not_redirect_with_read_id(&mut reader, &rx, task, Some(read_id)); + // Stale read will not use snap cache + assert!(reader.snap_cache.snapshot.is_none()); + assert!( + reader + .kv_engine + .get_oldest_snapshot_sequence_number() + .is_none() + ); + } } From b37c2f606d48d3a0fdb2227900746b7180b96724 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Oct 2022 11:39:52 +0800 Subject: [PATCH 270/676] *: introduce the two-phase kv_flashback_to_version (#13557) close tikv/tikv#13519, ref tikv/tikv#13519, ref tikv/tikv#13541 Make `FlashbackToVersion` become a two-phase request as described in #13519. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/error_code/src/raftstore.rs | 3 + components/raftstore/src/errors.rs | 9 ++ components/raftstore/src/store/fsm/apply.rs | 8 +- components/raftstore/src/store/fsm/peer.rs | 23 +++-- components/raftstore/src/store/metrics.rs | 3 +- components/raftstore/src/store/util.rs | 40 +++++--- components/test_raftstore/src/cluster.rs | 94 +++++++++++------- components/test_raftstore/src/util.rs | 22 +++++ src/server/service/kv.rs | 60 +++++------ .../txn/actions/flashback_to_version.rs | 35 ++++++- .../integrations/raftstore/test_flashback.rs | 99 +++++++++++++------ tests/integrations/server/kv_service.rs | 62 ++++++------ 13 files changed, 308 insertions(+), 152 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bf3536544fc..97c6209b2d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2672,7 +2672,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#4c6f1502851ed55b3ed023d180b6b10766446630" +source = "git+https://github.com/pingcap/kvproto.git#26e28e6a281abb927f91ef992eb8f93b39698ffa" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 2fd0d168a14..1b6a85493cf 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -31,6 +31,7 @@ define_error_codes!( PENDING_PREPARE_MERGE => ("PendingPrepareMerge", "", ""), RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), + FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -67,6 +68,8 @@ impl ErrorCodeExt for errorpb::Error { RECOVERY_IN_PROGRESS } else if self.has_flashback_in_progress() { FLASHBACK_IN_PROGRESS + } else if self.has_flashback_not_prepared() { + FLASHBACK_NOT_PREPARED } else { UNKNOWN 
} diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 70e342da96a..3c415c65af6 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -61,6 +61,9 @@ pub enum Error { #[error("region {0} is in the flashback progress")] FlashbackInProgress(u64), + #[error("region {0} not prepared the flashback")] + FlashbackNotPrepared(u64), + #[error( "key {} is not in region key range [{}, {}) for region {}", log_wrappers::Value::key(.0), @@ -255,6 +258,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_flashback_in_progress(e); } + Error::FlashbackNotPrepared(region_id) => { + let mut e = errorpb::FlashbackNotPrepared::default(); + e.set_region_id(region_id); + errorpb.set_flashback_not_prepared(e); + } _ => {} }; @@ -290,6 +298,7 @@ impl ErrorCodeExt for Error { Error::DiskFull(..) => error_code::raftstore::DISK_FULL, Error::RecoveryInProgress(..) => error_code::raftstore::RECOVERY_IN_PROGRESS, Error::FlashbackInProgress(..) => error_code::raftstore::FLASHBACK_IN_PROGRESS, + Error::FlashbackNotPrepared(..) => error_code::raftstore::FLASHBACK_NOT_PREPARED, Error::StaleCommand => error_code::raftstore::STALE_COMMAND, Error::RegionNotInitialized(_) => error_code::raftstore::REGION_NOT_INITIALIZED, Error::KeyNotInRegion(..) => error_code::raftstore::KEY_NOT_IN_REGION, diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 5fb5754b116..dae732797b1 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1350,6 +1350,12 @@ where "peer_id" => self.id(), "err" => ?e ), + Error::FlashbackNotPrepared(..) 
=> debug!( + "flashback is not prepared"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "err" => ?e + ), _ => error!(?e; "execute raft command"; "region_id" => self.region_id(), @@ -1522,7 +1528,7 @@ where let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_region_epoch(req, &self.region, include_region)?; - check_flashback_state(req, &self.region)?; + check_flashback_state(self.region.get_is_in_flashback(), req, self.region_id())?; if req.has_admin_request() { self.exec_admin_cmd(ctx, req) } else { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d4a31561c63..30877f57263 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4899,14 +4899,23 @@ where } let region_id = self.region_id(); - // When in the flashback state, we should not allow any other request to be - // proposed. - if self.fsm.peer.is_in_flashback { - self.ctx.raft_metrics.invalid_proposal.flashback.inc(); - let flags = WriteBatchFlags::from_bits_truncate(msg.get_header().get_flags()); - if !flags.contains(WriteBatchFlags::FLASHBACK) { - return Err(Error::FlashbackInProgress(self.region_id())); + if let Err(e) = util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id) { + match e { + Error::FlashbackInProgress(_) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_in_progress + .inc(), + Error::FlashbackNotPrepared(_) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_not_prepared + .inc(), + _ => unreachable!(), } + return Err(e); } // Check whether the store has the right peer to handle the request. diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 32a23cd070e..af877e14b46 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -202,7 +202,8 @@ make_static_metric! 
{ region_not_initialized, is_applying_snapshot, force_leader, - flashback, + flashback_in_progress, + flashback_not_prepared } pub label_enum RaftLogGcSkippedReason { diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 42276c79ab6..a21eb7756e2 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -280,21 +280,31 @@ pub fn compare_region_epoch( Ok(()) } -pub fn check_flashback_state(req: &RaftCmdRequest, region: &metapb::Region) -> Result<()> { - // If admin flashback has not been applied but the region is already in a - // flashback state, the request is rejected - if region.get_is_in_flashback() { - let flags = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()); - if flags.contains(WriteBatchFlags::FLASHBACK) { - return Ok(()); - } - if req.has_admin_request() - && (req.get_admin_request().get_cmd_type() == AdminCmdType::PrepareFlashback - || req.get_admin_request().get_cmd_type() == AdminCmdType::FinishFlashback) - { - return Ok(()); - } - return Err(Error::FlashbackInProgress(region.get_id())); +// Check if the request could be proposed/applied under the current state of the +// flashback. +pub fn check_flashback_state( + is_in_flashback: bool, + req: &RaftCmdRequest, + region_id: u64, +) -> Result<()> { + // The admin flashback cmd could be proposed/applied under any state. + if req.has_admin_request() + && (req.get_admin_request().get_cmd_type() == AdminCmdType::PrepareFlashback + || req.get_admin_request().get_cmd_type() == AdminCmdType::FinishFlashback) + { + return Ok(()); + } + let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::FLASHBACK); + // If the region is in the flashback state, the only allowed request is the + // flashback request itself. 
+ if is_in_flashback && !is_flashback_request { + return Err(Error::FlashbackInProgress(region_id)); + } + // If the region is not in the flashback state, the flashback request itself + // should be rejected. + if !is_in_flashback && is_flashback_request { + return Err(Error::FlashbackNotPrepared(region_id)); } Ok(()) } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 9b1f19bf21a..a5ce174c6d2 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1421,7 +1421,7 @@ impl Cluster { .unwrap(); } - pub async fn send_flashback_msg( + pub fn block_send_flashback_msg( &mut self, region_id: u64, store_id: u64, @@ -1429,41 +1429,67 @@ impl Cluster { epoch: metapb::RegionEpoch, peer: metapb::Peer, ) { - let (result_tx, result_rx) = oneshot::channel(); - let cb = Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx.send(false).unwrap(); - error!("send flashback msg failed"; "region_id" => region_id); - return; + self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + block_on(async move { + let (result_tx, result_rx) = oneshot::channel(); + let cb = Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx + .send(Some(resp.response.get_header().get_error().clone())) + .unwrap(); + return; + } + result_tx.send(None).unwrap(); + })); + + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(epoch); + req.mut_header().set_peer(peer); + req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let router = self.sim.rl().get_router(store_id).unwrap(); + if let Err(e) = router.send_command( + req, + cb, + RaftCmdExtraOpts { + deadline: None, + disk_full_opt: 
kvproto::kvrpcpb::DiskFullOpt::AllowedOnAlmostFull, + }, + ) { + panic!( + "router send flashback msg {:?} failed, error: {}", + cmd_type, e + ); } - result_tx.send(true).unwrap(); - })); - - let mut admin = AdminRequest::default(); - admin.set_cmd_type(cmd_type); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header().set_region_epoch(epoch); - req.mut_header().set_peer(peer); - req.set_admin_request(admin); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - - let router = self.sim.rl().get_router(store_id).unwrap(); - if let Err(e) = router.send_command( - req, - cb, - RaftCmdExtraOpts { - deadline: None, - disk_full_opt: kvproto::kvrpcpb::DiskFullOpt::AllowedOnAlmostFull, - }, - ) { - panic!("router send failed, error{}", e); - } - - if !result_rx.await.unwrap() { - panic!("Flashback call msg failed"); + if let Some(e) = result_rx.await.unwrap() { + panic!("call flashback msg {:?} failed, error: {:?}", cmd_type, e); + } + }); + } + + fn wait_applied_to_current_term(&mut self, region_id: u64, timeout: Duration) { + let mut now = Instant::now(); + let deadline = now + timeout; + while now < deadline { + if let Some(leader) = self.leader_of_region(region_id) { + let raft_apply_state = self.apply_state(region_id, leader.get_store_id()); + let raft_local_state = self.raft_local_state(region_id, leader.get_store_id()); + // If term matches and apply to commit index, then it must apply to current + // term. 
+ if raft_apply_state.applied_index == raft_apply_state.commit_index + && raft_apply_state.commit_term == raft_local_state.get_hard_state().get_term() + { + return; + } + } + thread::sleep(Duration::from_millis(10)); + now = Instant::now(); } + panic!("region {} is not applied to current term", region_id,); } pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 87269ac5e02..3718dbce906 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1218,6 +1218,28 @@ pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option FlashbackToVersionResponse { + let mut prepare_req = PrepareFlashbackToVersionRequest::default(); + prepare_req.set_context(ctx.clone()); + client + .kv_prepare_flashback_to_version(&prepare_req) + .unwrap(); + let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx); + req.set_start_ts(start_ts); + req.set_commit_ts(commit_ts); + req.version = version; + req.start_key = b"a".to_vec(); + req.end_key = b"z".to_vec(); + client.kv_flashback_to_version(&req).unwrap() +} + // A helpful wrapper to make the test logic clear pub struct PeerClient { pub cli: TikvClient, diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 924236529d9..84015ddab57 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1722,8 +1722,8 @@ fn future_delete_range( } } -// Preparing the flashback for a region/key range will "lock" the region so that -// there is no any read, write or schedule operation could be proposed before +// Preparing the flashback for a region will "lock" the region so that +// there is no any read, write or scheduling operation could be proposed before // the actual flashback operation. 
fn future_prepare_flashback_to_version< E: Engine, @@ -1733,11 +1733,24 @@ fn future_prepare_flashback_to_version< >( // Keep this param to hint the type of E for the compiler. _storage: &Storage, - _raft_router: &T, - _req: PrepareFlashbackToVersionRequest, + raft_router: &T, + req: PrepareFlashbackToVersionRequest, ) -> impl Future> { - // TODO: implement this. - async move { unimplemented!() } + let raft_router = Mutex::new(raft_router.clone()); + async move { + // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the + // later flashback. Once invoked, we will update the persistent region meta and + // the memory state of the flashback in Peer FSM to reject all read, write + // and scheduling operations for this region when propose/apply before we + // start the actual data flashback transaction command in the next phase. + send_flashback_msg::( + &raft_router, + req.get_context(), + AdminCmdType::PrepareFlashback, + ) + .await?; + Ok(PrepareFlashbackToVersionResponse::default()) + } } // Flashback the region to a specific point with the given `version`, please @@ -1756,22 +1769,8 @@ fn future_flashback_to_version< let storage_clone = storage.clone(); let raft_router = Mutex::new(raft_router.clone()); async move { - // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the - // later flashback. This will first block all scheduling, read and write - // operations, then wait for the latest Raft log to be applied before we start - // the flashback command. Once invoked, we update the persistence state - // in `RegionLocalState` and region's meta, and when that - // admin cmd is applied, the `PrepareFlashback` command will update the memory - // state of the flashback, rejecting all read and write operations at - // propose and applied. We make FlashbackToVersion a two-stage request - // and lock the region in the first stage. 
- send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::PrepareFlashback, - ) - .await?; - + // Perform the data flashback transaction command. We will check if the region + // is in the flashback state when proposing the flashback modification. let (cb, f) = paired_future_callback(); let res = storage_clone.sched_txn_command(req.clone().into(), cb); // Avoid crossing `.await` to bypass the `Send` constraint. @@ -1793,7 +1792,6 @@ fn future_flashback_to_version< AdminCmdType::FinishFlashback, ) .await?; - let mut resp = FlashbackToVersionResponse::default(); if let Some(err) = extract_region_error(&v) { resp.set_region_error(err); @@ -2472,11 +2470,15 @@ async fn send_flashback_msg + 'static, E: Engine>( ctx: &Context, cmd_type: AdminCmdType, ) -> ServerResult<()> { + let region_id = ctx.get_region_id(); let (result_tx, result_rx) = oneshot::channel(); let cb = Callback::write(Box::new(move |resp| { if resp.response.get_header().has_error() { result_tx.send(false).unwrap(); - error!("send flashback msg failed"; "error" => ?resp.response.get_header().get_error()); + error!("exec flashback msg failed"; + "region_id" => region_id, + "type" => ?cmd_type, + "error" => ?resp.response.get_header().get_error()); return; } result_tx.send(true).unwrap(); @@ -2484,7 +2486,7 @@ async fn send_flashback_msg + 'static, E: Engine>( let mut admin = AdminRequest::default(); admin.set_cmd_type(cmd_type); let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(ctx.get_region_id()); + req.mut_header().set_region_id(region_id); req.mut_header() .set_region_epoch(ctx.get_region_epoch().clone()); req.mut_header().set_peer(ctx.get_peer().clone()); @@ -2502,15 +2504,17 @@ async fn send_flashback_msg + 'static, E: Engine>( }, ) { return Err(Error::Other(box_err!( - "flashback router send failed, error {:?}", + "send flashback msg {:?} failed for region {}, error {:?}", + cmd_type, + region_id, e ))); } if !result_rx.await? 
{ return Err(Error::Other(box_err!( - "send flashback msg {:?} to region {} failed", + "wait flashback msg {:?} result failed for region {} failed", cmd_type, - ctx.get_region_id() + region_id ))); } Ok(()) diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index e160a4a43b9..96f80b9389c 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -10,7 +10,6 @@ use crate::storage::{ pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for multiple batches */; -// TODO: we should resolve all locks before starting a flashback. pub fn flashback_to_version_read_lock( reader: &mut MvccReader, next_lock_key: &Option, @@ -64,12 +63,19 @@ pub fn flashback_to_version_read_write( // Check the latest commit ts to make sure there is no commit change during the // flashback, otherwise, we need to abort the flashback. for (key, commit_ts, old_write) in key_ts_old_writes { - if commit_ts >= flashback_commit_ts { + if commit_ts > flashback_commit_ts { return Err(Error::from(ErrorInner::InvalidTxnTso { start_ts: flashback_start_ts, commit_ts: flashback_commit_ts, })); } + // Since the first flashback preparation phase make sure there will be no writes + // other than flashback after it, so we need to check if there is already a + // successful flashback result, and if so, just finish the flashback ASAP. + if commit_ts == flashback_commit_ts { + key_old_writes.clear(); + return Ok((key_old_writes, false)); + } key_old_writes.push((key, old_write)); } Ok((key_old_writes, has_remain_writes)) @@ -298,7 +304,7 @@ pub mod tests { // Since the key has been deleted, flashback to version 1 should not do // anything. 
assert_eq!( - must_flashback_to_version(&mut engine, k, ts, *ts.incr(), *ts.incr()), + must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), 0 ); must_get_none(&mut engine, k, ts); @@ -331,4 +337,27 @@ pub mod tests { must_pessimistic_prewrite_put_err(&mut engine, k, v3, k, 30, 30, DoPessimisticCheck); must_get(&mut engine, k, 45, v1); } + + #[test] + fn test_duplicated_flashback_to_version() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut ts = TimeStamp::zero(); + let (k, v) = (b"k", b"v"); + must_prewrite_put(&mut engine, k, v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, v); + let start_ts = *ts.incr(); + let commit_ts = *ts.incr(); + assert_eq!( + must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + 1 + ); + must_get_none(&mut engine, k, ts); + // Flashback again with the same `start_ts` and `commit_ts` should not do + // anything. + assert_eq!( + must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + 0 + ); + } } diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 5709cd22804..be70e176f01 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -2,12 +2,39 @@ use std::time::{Duration, Instant}; -use futures::executor::block_on; -use kvproto::metapb; +use kvproto::{ + metapb, + raft_cmdpb::{CmdType, Request}, +}; use test_raftstore::*; use tikv_util::time::InstantExt; use txn_types::WriteBatchFlags; +#[test] +fn test_flashback_unprepared() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + cluster.must_transfer_leader(1, new_peer(2, 2)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let mut region = cluster.get_region(b"k1"); + let mut cmd = Request::default(); + cmd.set_cmd_type(CmdType::Put); + let mut req = new_request( + region.get_id(), + region.take_region_epoch(), + vec![cmd], + false, + ); 
+ let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); + req.mut_header().set_peer(new_leader.unwrap()); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let resp = cluster.call_command(req, Duration::from_secs(5)).unwrap(); + assert!(resp.get_header().get_error().has_flashback_not_prepared()); +} + #[test] fn test_flashback_for_schedule() { let mut cluster = new_node_cluster(0, 3); @@ -18,15 +45,15 @@ fn test_flashback_for_schedule() { // Prepare for flashback let region = cluster.get_region(b"k1"); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 1, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); - // Verify the schedule is unabled. + // Verify the schedule is disabled. let mut region = cluster.get_region(b"k3"); let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); let transfer_leader = @@ -46,13 +73,13 @@ fn test_flashback_for_schedule() { // Verify the schedule can be executed if add flashback flag in request's // header. 
must_transfer_leader(&mut cluster, region.get_id(), new_peer(2, 2)); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 2, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, cluster.get_region_epoch(1), new_peer(2, 2), - )); + ); // Transfer leader to (1, 1) cluster.must_transfer_leader(1, new_peer(1, 1)); } @@ -69,13 +96,13 @@ fn test_flashback_for_write() { // Prepare for flashback let region = cluster.get_region(b"k1"); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 1, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); // Write will be blocked let value = vec![1_u8; 8096]; @@ -87,13 +114,13 @@ fn test_flashback_for_write() { new_put_cmd(b"k1", &value), ); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 1, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); } @@ -112,13 +139,13 @@ fn test_flashback_for_read() { // Prepare for flashback let region = cluster.get_region(b"k1"); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 1, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); // read will be blocked must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); @@ -131,13 +158,13 @@ fn test_flashback_for_read() { new_get_cf_cmd("write", b"k1"), ); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), 1, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); } @@ -173,13 +200,13 @@ fn test_flashback_for_local_read() { assert_eq!(state.get_last_index(), last_index); // Prepare for 
flashback - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), store_id, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(store_id, store_id), - )); + ); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); @@ -210,13 +237,13 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), store_id, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, cluster.get_region_epoch(1), new_peer(store_id, store_id), - )); + ); let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 2); @@ -240,13 +267,13 @@ fn test_flashback_for_status_cmd_as_region_detail() { let leader = cluster.leader_of_region(1).unwrap(); let region = cluster.get_region(b"k1"); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( region.get_id(), leader.get_store_id(), kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(leader.get_store_id(), leader.get_store_id()), - )); + ); let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); assert!(region_detail.has_region()); @@ -275,23 +302,23 @@ fn test_flashback_for_check_is_in_persist() { assert!(!local_state.get_region().get_is_in_flashback()); // Prepare for flashback - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( 1, 2, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), leader_peer.clone(), - )); + ); let local_state = cluster.region_local_state(1, 2); assert!(local_state.get_region().get_is_in_flashback()); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( 1, 2, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, 
cluster.get_region_epoch(1), leader_peer, - )); + ); let local_state = cluster.region_local_state(1, 2); assert!(!local_state.get_region().get_is_in_flashback()); } @@ -302,26 +329,30 @@ fn test_flashback_for_apply_snapshot() { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Make node3 isolationed + // Make node3 isolated cluster.add_send_filter(IsolationFilterFactory::new(5)); let local_state = cluster.region_local_state(1, 1); assert!(!local_state.get_region().get_is_in_flashback()); + let local_state = cluster.region_local_state(1, 5); + assert!(!local_state.get_region().get_is_in_flashback()); // Write for cluster let value = vec![1_u8; 8096]; multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); // Prepare for flashback - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( 1, 1, kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, cluster.get_region_epoch(1), new_peer(1, 1), - )); + ); let local_state = cluster.region_local_state(1, 1); assert!(local_state.get_region().get_is_in_flashback()); + let local_state = cluster.region_local_state(1, 5); + assert!(!local_state.get_region().get_is_in_flashback()); // Add node 3 back. 
cluster.clear_send_filters(); @@ -332,13 +363,21 @@ fn test_flashback_for_apply_snapshot() { let local_state = cluster.region_local_state(1, 5); assert!(local_state.get_region().get_is_in_flashback()); - block_on(cluster.send_flashback_msg( + cluster.block_send_flashback_msg( 1, 5, kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, cluster.get_region_epoch(1), new_peer(5, 5), - )); + ); + + // Wait for applying + sleep_ms(500); + + let local_state = cluster.region_local_state(1, 5); + assert!(!local_state.get_region().get_is_in_flashback()); + let local_state = cluster.region_local_state(1, 1); + assert!(!local_state.get_region().get_is_in_flashback()); } fn transfer_leader(cluster: &mut Cluster, region_id: u64, leader: metapb::Peer) { diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 253d1e0c067..f3e3bda8a24 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -658,18 +658,8 @@ fn test_mvcc_flashback() { assert!(get_resp.get_error().has_locked()); assert!(get_resp.value.is_empty()); // Flashback - let mut flashback_to_version_req = FlashbackToVersionRequest::default(); - flashback_to_version_req.set_context(ctx.clone()); - ts += 1; - flashback_to_version_req.set_start_ts(ts); - ts += 1; - flashback_to_version_req.set_commit_ts(ts); - flashback_to_version_req.version = 5; - flashback_to_version_req.start_key = b"a".to_vec(); - flashback_to_version_req.end_key = b"z".to_vec(); - let flashback_resp = client - .kv_flashback_to_version(&flashback_to_version_req) - .unwrap(); + let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 5, ts + 1, ts + 2); + ts += 2; assert!(!flashback_resp.has_region_error()); assert!(flashback_resp.get_error().is_empty()); // Should not meet the lock and can not get the latest data any more. 
@@ -682,16 +672,7 @@ fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let mut flashback_to_version_req = FlashbackToVersionRequest::default(); - flashback_to_version_req.set_context(ctx.clone()); - flashback_to_version_req.set_start_ts(1); - flashback_to_version_req.set_commit_ts(2); - flashback_to_version_req.version = 0; - flashback_to_version_req.start_key = b"a".to_vec(); - flashback_to_version_req.end_key = b"z".to_vec(); - let flashback_resp = client - .kv_flashback_to_version(&flashback_to_version_req) - .unwrap(); + let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); assert!(!flashback_resp.has_region_error()); assert!(flashback_resp.get_error().is_empty()); // Try to read. @@ -731,16 +712,7 @@ fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let mut flashback_to_version_req = FlashbackToVersionRequest::default(); - flashback_to_version_req.set_context(ctx); - flashback_to_version_req.set_start_ts(1); - flashback_to_version_req.set_commit_ts(2); - flashback_to_version_req.version = 0; - flashback_to_version_req.start_key = b"a".to_vec(); - flashback_to_version_req.end_key = b"z".to_vec(); - let flashback_resp = client - .kv_flashback_to_version(&flashback_to_version_req) - .unwrap(); + let flashback_resp = must_flashback_to_version(&client, ctx, 0, 1, 2); assert!(!flashback_resp.has_region_error()); assert!(flashback_resp.get_error().is_empty()); // Try to transfer leader. 
@@ -754,6 +726,32 @@ fn test_mvcc_flashback_block_scheduling() { fail::remove("skip_finish_flashback_to_version"); } +#[test] +fn test_mvcc_flashback_unprepared() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + // Prewrite + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 1); + // Commit + must_kv_commit(&client, ctx.clone(), vec![k.clone()], 1, 2, 2); + must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), 3); + // Try to flashback without preparing first. + let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx.clone()); + req.set_start_ts(4); + req.set_commit_ts(5); + req.version = 0; + req.start_key = b"a".to_vec(); + req.end_key = b"z".to_vec(); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(resp.get_region_error().has_flashback_not_prepared()); + must_kv_read_equal(&client, ctx, k, v, 6); +} + // raft related RPC is tested as parts of test_snapshot.rs, so skip here. #[test] From 39961b106722d17fe7b52c67d9b623f2168812a0 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 17 Oct 2022 18:43:53 +0800 Subject: [PATCH 271/676] raftstore: move check_flashback_state to after check_region_epoch (#13618) ref tikv/tikv#13303, ref pingcap/tidb#38475 Move `check_flashback_state` to after `check_region_epoch` to make sure the Region Cache on the client-side could be refreshed ASAP. 
Signed-off-by: JmPotato --- components/raftstore/src/store/fsm/peer.rs | 49 ++++++++++++---------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 30877f57263..57f5fe158f5 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4898,26 +4898,6 @@ where return Ok(Some(resp)); } - let region_id = self.region_id(); - if let Err(e) = util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id) { - match e { - Error::FlashbackInProgress(_) => self - .ctx - .raft_metrics - .invalid_proposal - .flashback_in_progress - .inc(), - Error::FlashbackNotPrepared(_) => self - .ctx - .raft_metrics - .invalid_proposal - .flashback_not_prepared - .inc(), - _ => unreachable!(), - } - return Err(e); - } - // Check whether the store has the right peer to handle the request. let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); @@ -4944,6 +4924,7 @@ where _ => read_only = false, } } + let region_id = self.region_id(); let allow_replica_read = read_only && msg.get_header().get_replica_read(); let flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); let allow_stale_read = read_only && flags.contains(WriteBatchFlags::STALE_READ); @@ -5005,11 +4986,33 @@ where let requested_version = msg.get_header().get_region_epoch().version; self.collect_sibling_region(requested_version, &mut new_regions); self.ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); - Err(Error::EpochNotMatch(m, new_regions)) + return Err(Error::EpochNotMatch(m, new_regions)); + } + Err(e) => return Err(e), + _ => {} + }; + // Check whether the region is in the flashback state and the request could be + // proposed. 
+ if let Err(e) = util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id) { + match e { + Error::FlashbackInProgress(_) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_in_progress + .inc(), + Error::FlashbackNotPrepared(_) => self + .ctx + .raft_metrics + .invalid_proposal + .flashback_not_prepared + .inc(), + _ => unreachable!(), } - Err(e) => Err(e), - Ok(()) => Ok(None), + return Err(e); } + + Ok(None) } /// Proposes pending batch raft commands (if any), then proposes the From 571b5a263c7e84c2ab8aeb5feaebc8d50cae48cb Mon Sep 17 00:00:00 2001 From: haojinming Date: Mon, 17 Oct 2022 19:03:53 +0800 Subject: [PATCH 272/676] test: Fix incorrect rawkv case test_raw_put_key_guard (#13600) close tikv/tikv#13599 The logic in `test_raw_put_key_guard` is incorrect, fix it. Signed-off-by: haojinming Co-authored-by: Ping Yu Co-authored-by: Ti Chi Robot --- tests/failpoints/cases/test_rawkv.rs | 50 ++++++++++++++++++++++++- tests/failpoints/cases/test_storage.rs | 51 +------------------------- 2 files changed, 50 insertions(+), 51 deletions(-) diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index 547b6144c7c..274a458958e 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, time::Duration}; +use std::{sync::Arc, thread, time::Duration}; -use causal_ts::CausalTsProvider; +use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -103,6 +103,10 @@ impl TestSuite { .unwrap(); } + pub fn get_causal_ts_provider(&mut self, node_id: u64) -> Option> { + self.cluster.sim.rl().get_causal_ts_provider(node_id) + } + pub fn must_merge_region_by_key(&mut self, source_key: &[u8], target_key: &[u8]) { let source = self.cluster.get_region(source_key); let target = self.cluster.get_region(target_key); @@ -271,3 +275,45 @@ fn test_region_merge() { fail::remove(FP_GET_TSO); suite.stop(); } + +// Verify the raw key guard correctness in apiv2 +#[test] +fn test_raw_put_key_guard() { + let mut suite = TestSuite::new(3, ApiVersion::V2); + let pause_write_fp = "raftkv_async_write"; + + let test_key = b"rk3".to_vec(); + let test_value = b"v3".to_vec(); + + let region = suite.cluster.get_region(&test_key); + let region_id = region.get_id(); + let client = suite.get_client(region_id); + let ctx = suite.get_context(region_id); + let node_id = region.get_peers()[0].get_id(); + let leader_cm = suite.cluster.sim.rl().get_concurrency_manager(node_id); + let ts_provider = suite.get_causal_ts_provider(node_id).unwrap(); + let ts = block_on(ts_provider.async_get_ts()).unwrap(); + + let copy_test_key = test_key.clone(); + let copy_test_value = test_value.clone(); + let apply_wait_timeout = 2000; // ms, assume send request and apply can be finished in 2s. + fail::cfg(pause_write_fp, "pause").unwrap(); + let handle = thread::spawn(move || { + must_raw_put(&client, ctx, copy_test_key, copy_test_value); + }); + thread::sleep(Duration::from_millis(apply_wait_timeout)); + + // Before raw_put finish, min_ts should be the ts of "key guard" of the raw_put + // request. 
+ assert_eq!(suite.must_raw_get(&test_key), None); + let min_ts = leader_cm.global_min_lock_ts(); + assert_eq!(min_ts.unwrap(), ts.next()); + + fail::remove(pause_write_fp); + handle.join().unwrap(); + + // After raw_put is finished, "key guard" is released. + assert_eq!(suite.must_raw_get(&test_key), Some(test_value)); + let min_ts = leader_cm.global_min_lock_ts(); + assert!(min_ts.is_none()); +} diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 45f5e16675c..ec38958ad57 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -11,7 +11,6 @@ use std::{ }; use api_version::{ApiV1, ApiV2, KvFormat}; -use causal_ts::CausalTsProvider; use collections::HashMap; use engine_traits::DummyFactory; use errors::{extract_key_error, extract_region_error}; @@ -19,8 +18,8 @@ use futures::executor::block_on; use grpcio::*; use kvproto::{ kvrpcpb::{ - self, ApiVersion, AssertionLevel, BatchRollbackRequest, CommandPri, CommitRequest, Context, - GetRequest, Op, PrewriteRequest, PrewriteRequestPessimisticAction::*, RawPutRequest, + self, AssertionLevel, BatchRollbackRequest, CommandPri, CommitRequest, Context, GetRequest, + Op, PrewriteRequest, PrewriteRequestPessimisticAction::*, RawPutRequest, }, tikvpb::TikvClient, }; @@ -1479,49 +1478,3 @@ fn test_raw_put_deadline() { assert!(!put_resp.has_region_error(), "{:?}", put_resp); must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); } - -#[test] -fn test_raw_put_key_guard() { - let api_version = ApiVersion::V2; - let pause_write_fp = "raftkv_async_write"; - let mut cluster = new_server_cluster_with_api_ver(0, 1, api_version); - cluster.run(); - let region = cluster.get_region(b""); - let leader = region.get_peers()[0].clone(); - let node_id = leader.get_id(); - let leader_cm = cluster.sim.rl().get_concurrency_manager(node_id); - let ts_provider = cluster.sim.rl().get_causal_ts_provider(node_id).unwrap(); - let ts = 
block_on(ts_provider.async_get_ts()).unwrap(); - - let env = Arc::new(Environment::new(1)); - let channel = - ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); - let client = TikvClient::new(channel); - - let mut ctx = Context::default(); - ctx.set_region_id(region.get_id()); - ctx.set_region_epoch(region.get_region_epoch().clone()); - ctx.set_peer(leader); - ctx.set_api_version(api_version); - let mut put_req = RawPutRequest::default(); - put_req.set_context(ctx); - put_req.key = b"rk3".to_vec(); - put_req.value = b"v3".to_vec(); - - fail::cfg(pause_write_fp, "pause").unwrap(); - let handle = thread::spawn(move || { - let _ = client.raw_put(&put_req).unwrap(); - }); - - thread::sleep(Duration::from_millis(100)); - must_get_none(&cluster.get_engine(1), b"rk3"); - let min_ts = leader_cm.global_min_lock_ts(); - assert_eq!(min_ts.unwrap(), ts.next()); - - fail::remove(pause_write_fp); - handle.join().unwrap(); - thread::sleep(Duration::from_millis(100)); - must_get_none(&cluster.get_engine(1), b"rk3"); - let min_ts = leader_cm.global_min_lock_ts(); - assert!(min_ts.is_none()); -} From 13f58a9b05500375b537ab9da58768051fa6fdfa Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 18 Oct 2022 14:27:53 +0800 Subject: [PATCH 273/676] tests: refine the flashback raftstore test (#13615) ref tikv/tikv#13303 Refine the flashback raftstore test. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/test_raftstore/src/cluster.rs | 16 +- etc/error_code.toml | 5 + .../integrations/raftstore/test_flashback.rs | 259 +++++------------- 3 files changed, 86 insertions(+), 194 deletions(-) diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index a5ce174c6d2..7a932d324f0 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1421,15 +1421,11 @@ impl Cluster { .unwrap(); } - pub fn block_send_flashback_msg( - &mut self, - region_id: u64, - store_id: u64, - cmd_type: AdminCmdType, - epoch: metapb::RegionEpoch, - peer: metapb::Peer, - ) { + pub fn must_send_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + let leader = self.leader_of_region(region_id).unwrap(); + let store_id = leader.get_store_id(); + let region_epoch = self.get_region_epoch(region_id); block_on(async move { let (result_tx, result_rx) = oneshot::channel(); let cb = Callback::write(Box::new(move |resp| { @@ -1446,8 +1442,8 @@ impl Cluster { admin.set_cmd_type(cmd_type); let mut req = RaftCmdRequest::default(); req.mut_header().set_region_id(region_id); - req.mut_header().set_region_epoch(epoch); - req.mut_header().set_peer(peer); + req.mut_header().set_region_epoch(region_epoch); + req.mut_header().set_peer(leader); req.set_admin_request(admin); req.mut_header() .set_flags(WriteBatchFlags::FLASHBACK.bits()); diff --git a/etc/error_code.toml b/etc/error_code.toml index 7a6b956449f..5cdd770f8d2 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -438,6 +438,11 @@ error = ''' KV:Raftstore:FlashbackInProgress ''' +["KV:Raftstore:FlashbackNotPrepared"] +error = ''' +KV:Raftstore:FlashbackNotPrepared +''' + ["KV:Raftstore:SnapAbort"] error = ''' KV:Raftstore:SnapAbort diff --git a/tests/integrations/raftstore/test_flashback.rs 
b/tests/integrations/raftstore/test_flashback.rs index be70e176f01..810da9d840f 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -1,13 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::{Duration, Instant}; +use std::{ + thread::sleep, + time::{Duration, Instant}, +}; use kvproto::{ metapb, - raft_cmdpb::{CmdType, Request}, + raft_cmdpb::{AdminCmdType, CmdType, Request}, }; use test_raftstore::*; -use tikv_util::time::InstantExt; use txn_types::WriteBatchFlags; #[test] @@ -31,7 +33,7 @@ fn test_flashback_unprepared() { req.mut_header().set_peer(new_leader.unwrap()); req.mut_header() .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(5)).unwrap(); + let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); assert!(resp.get_header().get_error().has_flashback_not_prepared()); } @@ -45,13 +47,7 @@ fn test_flashback_for_schedule() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.block_send_flashback_msg( - region.get_id(), - 1, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Verify the schedule is disabled. let mut region = cluster.get_region(b"k3"); @@ -70,18 +66,9 @@ fn test_flashback_for_schedule() { } ); - // Verify the schedule can be executed if add flashback flag in request's - // header. 
- must_transfer_leader(&mut cluster, region.get_id(), new_peer(2, 2)); - cluster.block_send_flashback_msg( - region.get_id(), - 2, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - new_peer(2, 2), - ); - // Transfer leader to (1, 1) - cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); + // Transfer leader to (2, 2) should succeed. + cluster.must_transfer_leader(1, new_peer(2, 2)); } #[test] @@ -96,31 +83,19 @@ fn test_flashback_for_write() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.block_send_flashback_msg( - region.get_id(), - 1, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Write will be blocked let value = vec![1_u8; 8096]; must_get_error_flashback_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); - - must_cmd_add_flashback_flag( + // Write with flashback flag will succeed + must_do_cmd_with_flashback_flag( &mut cluster, &mut region.clone(), new_put_cmd(b"k1", &value), ); - cluster.block_send_flashback_msg( - region.get_id(), - 1, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); } @@ -139,32 +114,20 @@ fn test_flashback_for_read() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.block_send_flashback_msg( - region.get_id(), - 1, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // read will be blocked must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", 
b"k1")); // Verify the read can be executed if add flashback flag in request's // header. - must_cmd_add_flashback_flag( + must_do_cmd_with_flashback_flag( &mut cluster, &mut region.clone(), new_get_cf_cmd("write", b"k1"), ); - cluster.block_send_flashback_msg( - region.get_id(), - 1, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); } @@ -180,9 +143,8 @@ fn test_flashback_for_local_read() { // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; - let node_id = 3u64; - let store_id = 3u64; - let peer = new_peer(store_id, node_id); + let store_id = 3; + let peer = new_peer(store_id, 3); cluster.run(); cluster.must_put(b"k1", b"v1"); @@ -193,25 +155,19 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. - std::thread::sleep(election_timeout * 2); + sleep(election_timeout * 2); must_read_on_peer(&mut cluster, peer.clone(), region.clone(), b"k1", b"v1"); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); // Prepare for flashback - cluster.block_send_flashback_msg( - region.get_id(), - store_id, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(store_id, store_id), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - // Wait for apply_res to set leader lease . + // Wait for apply_res to set leader lease. 
sleep_ms(500); must_error_read_on_peer( @@ -224,7 +180,7 @@ fn test_flashback_for_local_read() { // Wait for the leader's lease to expire to ensure that a renew lease interval // has elapsed. - std::thread::sleep(election_timeout * 2); + sleep(election_timeout * 2); must_error_read_on_peer( &mut cluster, peer.clone(), @@ -237,13 +193,7 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - cluster.block_send_flashback_msg( - region.get_id(), - store_id, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - new_peer(store_id, store_id), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 2); @@ -252,7 +202,7 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. - std::thread::sleep(election_timeout * 2); + sleep(election_timeout * 2); must_read_on_peer(&mut cluster, peer, region.clone(), b"k1", b"v1"); // Check the leader does a local read. 
@@ -267,13 +217,7 @@ fn test_flashback_for_status_cmd_as_region_detail() { let leader = cluster.leader_of_region(1).unwrap(); let region = cluster.get_region(b"k1"); - cluster.block_send_flashback_msg( - region.get_id(), - leader.get_store_id(), - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(leader.get_store_id(), leader.get_store_id()), - ); + cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); assert!(region_detail.has_region()); @@ -295,128 +239,75 @@ fn test_flashback_for_check_is_in_persist() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - let leader_peer = new_peer(2, 2); - cluster.must_transfer_leader(1, leader_peer.clone()); - - let local_state = cluster.region_local_state(1, 2); - assert!(!local_state.get_region().get_is_in_flashback()); + cluster.must_transfer_leader(1, new_peer(2, 2)); + must_check_flashback_state(&mut cluster, 1, 2, false); // Prepare for flashback - cluster.block_send_flashback_msg( - 1, - 2, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - leader_peer.clone(), - ); - let local_state = cluster.region_local_state(1, 2); - assert!(local_state.get_region().get_is_in_flashback()); - - cluster.block_send_flashback_msg( - 1, - 2, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - leader_peer, - ); - let local_state = cluster.region_local_state(1, 2); - assert!(!local_state.get_region().get_is_in_flashback()); + cluster.must_send_flashback_msg(1, AdminCmdType::PrepareFlashback); + must_check_flashback_state(&mut cluster, 1, 2, true); + + cluster.must_send_flashback_msg(1, AdminCmdType::FinishFlashback); + must_check_flashback_state(&mut cluster, 1, 2, false); } #[test] fn test_flashback_for_apply_snapshot() { - let mut cluster = new_node_cluster(0, 5); + let mut cluster = new_node_cluster(0, 3); 
+ configure_for_snapshot(&mut cluster); cluster.run(); + + cluster.must_transfer_leader(1, new_peer(3, 3)); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Make node3 isolated - cluster.add_send_filter(IsolationFilterFactory::new(5)); + must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, false); - let local_state = cluster.region_local_state(1, 1); - assert!(!local_state.get_region().get_is_in_flashback()); - let local_state = cluster.region_local_state(1, 5); - assert!(!local_state.get_region().get_is_in_flashback()); + // Make store 3 isolated. + cluster.add_send_filter(IsolationFilterFactory::new(3)); - // Write for cluster - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + // Write some data to trigger snapshot. + for i in 100..110 { + let key = format!("k{}", i); + let value = format!("v{}", i); + cluster.must_put_cf("write", key.as_bytes(), value.as_bytes()); + } // Prepare for flashback - cluster.block_send_flashback_msg( - 1, - 1, - kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, - cluster.get_region_epoch(1), - new_peer(1, 1), - ); - let local_state = cluster.region_local_state(1, 1); - assert!(local_state.get_region().get_is_in_flashback()); - let local_state = cluster.region_local_state(1, 5); - assert!(!local_state.get_region().get_is_in_flashback()); + cluster.must_send_flashback_msg(1, AdminCmdType::PrepareFlashback); + must_check_flashback_state(&mut cluster, 1, 1, true); + must_check_flashback_state(&mut cluster, 1, 3, false); - // Add node 3 back. + // Add store 3 back. 
cluster.clear_send_filters(); - // Wait for snapshot - sleep_ms(500); - - must_transfer_leader(&mut cluster, 1, new_peer(5, 5)); - let local_state = cluster.region_local_state(1, 5); - assert!(local_state.get_region().get_is_in_flashback()); - - cluster.block_send_flashback_msg( - 1, - 5, - kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, - cluster.get_region_epoch(1), - new_peer(5, 5), - ); + must_check_flashback_state(&mut cluster, 1, 1, true); + must_check_flashback_state(&mut cluster, 1, 3, true); - // Wait for applying - sleep_ms(500); - - let local_state = cluster.region_local_state(1, 5); - assert!(!local_state.get_region().get_is_in_flashback()); - let local_state = cluster.region_local_state(1, 1); - assert!(!local_state.get_region().get_is_in_flashback()); -} - -fn transfer_leader(cluster: &mut Cluster, region_id: u64, leader: metapb::Peer) { - let epoch = cluster.get_region_epoch(region_id); - let admin_req = new_transfer_leader_cmd(leader); - let mut transfer_leader = new_admin_request(region_id, &epoch, admin_req); - transfer_leader - .mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster - .call_command_on_leader(transfer_leader, Duration::from_secs(5)) - .unwrap(); - assert!(!resp.get_header().has_error()); + cluster.must_send_flashback_msg(1, AdminCmdType::FinishFlashback); + must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, false); } -fn must_transfer_leader( - cluster: &mut Cluster, +fn must_check_flashback_state( + cluster: &mut Cluster, region_id: u64, - leader: metapb::Peer, + store_id: u64, + is_in_flashback: bool, ) { - let timer = Instant::now(); - loop { - cluster.reset_leader_of_region(region_id); - let cur_leader = cluster.leader_of_region(region_id); - if let Some(ref cur_leader) = cur_leader { - if cur_leader.get_id() == leader.get_id() - && cur_leader.get_store_id() == leader.get_store_id() - { - return; - } - } - if timer.saturating_elapsed() > 
Duration::from_secs(5) { - panic!( - "failed to transfer leader to [{}] {:?}, current leader: {:?}", - region_id, leader, cur_leader - ); + let mut now = Instant::now(); + let timeout = Duration::from_secs(3); + let deadline = now + timeout; + while now < deadline { + let local_state = cluster.region_local_state(region_id, store_id); + if local_state.get_region().get_is_in_flashback() == is_in_flashback { + return; } - transfer_leader(cluster, region_id, leader.clone()); + sleep(Duration::from_millis(10)); + now = Instant::now(); } + panic!( + "region {} on store {} flashback state unmatched, want: {}", + region_id, store_id, is_in_flashback, + ); } fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb::Request) { @@ -429,7 +320,7 @@ fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb } } -fn must_cmd_add_flashback_flag( +fn must_do_cmd_with_flashback_flag( cluster: &mut Cluster, region: &mut metapb::Region, cmd: kvproto::raft_cmdpb::Request, @@ -446,7 +337,7 @@ fn must_cmd_add_flashback_flag( req.mut_header().set_peer(new_leader.unwrap()); req.mut_header() .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(5)).unwrap(); + let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); assert!(!resp.get_header().has_error()); } From ed64ed21cfa1c734191549a0db66986dab04f4bd Mon Sep 17 00:00:00 2001 From: hehechen Date: Tue, 18 Oct 2022 16:47:53 +0800 Subject: [PATCH 274/676] resolved_ts: track 1PC (#13579) close tikv/tikv#13353 Signed-off-by: hehechen Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/util.rs | 2 +- components/resolved_ts/src/endpoint.rs | 4 +- .../resolved_ts/tests/failpoints/mod.rs | 6 +-- .../resolved_ts/tests/integrations/mod.rs | 25 +++++++++- components/resolved_ts/tests/mod.rs | 5 ++ .../cases/test_replica_stale_read.rs | 49 +++++++++++++++++++ 6 files changed, 84 insertions(+), 7 deletions(-) diff --git a/components/raftstore/src/store/util.rs 
b/components/raftstore/src/store/util.rs index a21eb7756e2..5f2c6615527 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -903,7 +903,7 @@ impl RegionReadProgressRegistry { .lock() .unwrap() .get(region_id) - .map(|rp| rp.core.lock().unwrap().applied_index) + .map(|rp| rp.core.lock().unwrap().read_state.idx) } // NOTICE: this function is an alias of `get_safe_ts` to distinguish the diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 76202240a10..a79ff66e384 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -191,7 +191,9 @@ impl ObserveRegion { .resolver .untrack_lock(&key.to_raw().unwrap(), Some(*index)), // One pc command do not contains any lock, so just skip it - ChangeRow::OnePc { .. } => {} + ChangeRow::OnePc { .. } => { + self.resolver.update_tracked_index(*index); + } ChangeRow::IngestSsT => { self.resolver.update_tracked_index(*index); } diff --git a/components/resolved_ts/tests/failpoints/mod.rs b/components/resolved_ts/tests/failpoints/mod.rs index ab4e88f9d25..808f5ed62ff 100644 --- a/components/resolved_ts/tests/failpoints/mod.rs +++ b/components/resolved_ts/tests/failpoints/mod.rs @@ -22,7 +22,7 @@ fn test_check_leader_timeout() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); suite .cluster .must_transfer_leader(region.id, new_peer(1, 1)); @@ -78,7 +78,7 @@ fn test_report_min_resolved_ts() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // Commit let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); @@ -112,7 
+112,7 @@ fn test_report_min_resolved_ts_disable() { mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // Commit let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index a8acab00625..da28758a5d2 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -19,12 +19,12 @@ fn test_resolved_ts_basic() { // Prewrite let (k, v) = (b"k1", b"v"); - let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.key = k.to_vec(); mutation.value = v.to_vec(); - suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts); + suite.must_kv_prewrite(region.id, vec![mutation], k.to_vec(), start_ts, false); // The `resolved-ts` won't be updated due to there is lock on the region, // the `resolved-ts` may not be the `start_ts` of the lock if the `resolved-ts` @@ -81,6 +81,27 @@ fn test_resolved_ts_basic() { } assert!(tracked_index_after > tracked_index_before); + // 1PC + let tracked_index_before = suite.region_tracked_index(r1.id); + + start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let (k, v) = (b"k2", b"v"); + let mut mutation_1pc = Mutation::default(); + mutation_1pc.set_op(Op::Put); + mutation_1pc.key = k.to_vec(); + mutation_1pc.value = v.to_vec(); + suite.must_kv_prewrite(r1.id, vec![mutation_1pc], k.to_vec(), start_ts, true); + + tracked_index_after = suite.region_tracked_index(r1.id); + for _ in 0..10 { + if tracked_index_after > tracked_index_before { + break; + } + tracked_index_after = 
suite.region_tracked_index(r1.id); + sleep_ms(200) + } + assert!(tracked_index_after > tracked_index_before); + suite.stop(); } diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index cd95b1e911d..376aa216224 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -131,6 +131,7 @@ impl TestSuite { muts: Vec, pk: Vec, ts: TimeStamp, + try_one_pc: bool, ) { let mut prewrite_req = PrewriteRequest::default(); prewrite_req.set_context(self.get_context(region_id)); @@ -138,6 +139,7 @@ impl TestSuite { prewrite_req.primary_lock = pk; prewrite_req.start_version = ts.into_inner(); prewrite_req.lock_ttl = prewrite_req.start_version + 1; + prewrite_req.try_one_pc = try_one_pc; let prewrite_resp = self .get_tikv_client(region_id) .kv_prewrite(&prewrite_req) @@ -152,6 +154,9 @@ impl TestSuite { "{:?}", prewrite_resp.get_errors() ); + if try_one_pc { + assert_ne!(prewrite_resp.get_one_pc_commit_ts(), 0); + } } pub fn must_kv_commit( diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index 7748ed73b96..3dc7223ae41 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -84,6 +84,55 @@ fn test_stale_read_basic_flow_replicate() { follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), get_tso(&pd_client)); } +// Similar to test_stale_read_basic_flow_replicate, but we use 1pc to update. 
+#[test] +fn test_stale_read_1pc_flow_replicate() { + let (mut cluster, pd_client, mut leader_client) = prepare_for_stale_read(new_peer(1, 1)); + let mut follower_client2 = PeerClient::new(&cluster, 1, new_peer(2, 2)); + // Set the `stale_read` flag + leader_client.ctx.set_stale_read(true); + follower_client2.ctx.set_stale_read(true); + + let commit_ts1 = leader_client.must_kv_write( + &pd_client, + vec![new_mutation(Op::Put, &b"key1"[..], &b"value1"[..])], + b"key1".to_vec(), + ); + + // Can read `value1` with the newest ts + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), get_tso(&pd_client)); + + // Stop replicate data to follower 2 + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppend), + )); + // Update `key1` + leader_client.must_kv_prewrite_one_pc( + vec![new_mutation(Op::Put, &b"key1"[..], &b"value2"[..])], + b"key1".to_vec(), + get_tso(&pd_client), + ); + let read_ts = get_tso(&pd_client); + // wait for advance_resolved_ts. 
+ sleep_ms(200); + // Follower 2 can still read `value1`, but can not read `value2` due + // to it don't have enough data + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); + let resp1 = follower_client2.kv_read(b"key1".to_vec(), read_ts); + assert!(resp1.get_region_error().has_data_is_not_ready()); + + // Leader have up to date data so it can read `value2` + leader_client.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), get_tso(&pd_client)); + + // clear the `MsgAppend` filter + cluster.clear_send_filters(); + + // Now we can read `value2` with the newest ts + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), get_tso(&pd_client)); +} + // Testing how mvcc locks could effect stale read service #[test] fn test_stale_read_basic_flow_lock() { From a63944c4c6f51bebaee2ce4be99cfc1f7d3947c9 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 18 Oct 2022 22:03:53 +0800 Subject: [PATCH 275/676] *: fix build with panic engines (#13585) ref tikv/tikv#13131 None Signed-off-by: tabokie --- Cargo.lock | 8 +- Cargo.toml | 159 ++++++--- Makefile | 2 +- cmd/tikv-ctl/Cargo.toml | 42 +-- cmd/tikv-server/Cargo.toml | 4 +- components/api_version/Cargo.toml | 12 +- components/backup-stream/Cargo.toml | 46 +-- components/backup/Cargo.toml | 48 +-- components/batch-system/Cargo.toml | 10 +- components/causal_ts/Cargo.toml | 20 +- components/cdc/Cargo.toml | 46 +-- components/cloud/Cargo.toml | 4 +- components/cloud/aws/Cargo.toml | 20 +- components/cloud/azure/Cargo.toml | 6 +- components/cloud/gcp/Cargo.toml | 6 +- components/codec/Cargo.toml | 6 +- components/collections/Cargo.toml | 2 +- components/concurrency_manager/Cargo.toml | 6 +- components/encryption/Cargo.toml | 14 +- components/encryption/export/Cargo.toml | 14 +- components/encryption/export/examples/ecli.rs | 2 +- components/engine_panic/Cargo.toml | 10 +- components/engine_rocks/Cargo.toml | 26 +- components/engine_rocks/src/engine.rs | 14 +- 
components/engine_rocks_helper/Cargo.toml | 16 +- components/engine_test/Cargo.toml | 18 +- components/engine_test/src/lib.rs | 20 +- components/engine_tirocks/Cargo.toml | 20 +- components/engine_traits/Cargo.toml | 16 +- components/engine_traits_tests/Cargo.toml | 8 +- components/error_code/Cargo.toml | 4 +- components/external_storage/Cargo.toml | 12 +- components/external_storage/export/Cargo.toml | 28 +- .../external_storage/export/examples/scli.rs | 25 +- components/file_system/Cargo.toml | 8 +- components/into_other/Cargo.toml | 2 +- components/keys/Cargo.toml | 6 +- components/log_wrappers/Cargo.toml | 2 +- components/pd_client/Cargo.toml | 16 +- components/profiler/Cargo.toml | 2 +- components/raft_log_engine/Cargo.toml | 12 +- components/raftstore-v2/Cargo.toml | 30 +- .../raftstore-v2/src/operation/query/local.rs | 6 +- components/raftstore/Cargo.toml | 56 ++-- components/raftstore/src/store/worker/read.rs | 4 +- components/resolved_ts/Cargo.toml | 36 +-- components/resource_metering/Cargo.toml | 8 +- components/security/Cargo.toml | 8 +- components/server/Cargo.toml | 56 ++-- components/server/src/lib.rs | 3 + components/server/src/server.rs | 17 +- components/snap_recovery/Cargo.toml | 26 +- components/sst_importer/Cargo.toml | 30 +- components/test_backup/Cargo.toml | 26 +- components/test_coprocessor/Cargo.toml | 22 +- .../example_plugin/Cargo.toml | 4 +- components/test_pd/Cargo.toml | 10 +- components/test_pd_client/Cargo.toml | 14 +- components/test_raftstore/Cargo.toml | 48 +-- components/test_sst_importer/Cargo.toml | 6 +- components/test_storage/Cargo.toml | 18 +- components/test_util/Cargo.toml | 10 +- components/tidb_query_aggr/Cargo.toml | 14 +- components/tidb_query_common/Cargo.toml | 6 +- components/tidb_query_datatype/Cargo.toml | 14 +- components/tidb_query_executors/Cargo.toml | 20 +- components/tidb_query_expr/Cargo.toml | 20 +- components/tikv_kv/Cargo.toml | 32 +- components/tikv_kv/src/cursor.rs | 20 +- 
components/tikv_util/Cargo.toml | 18 +- components/tipb_helper/Cargo.toml | 4 +- components/tracker/Cargo.toml | 2 +- components/txn_types/Cargo.toml | 14 +- fuzz/fuzzer-afl/Cargo.toml | 2 +- fuzz/fuzzer-honggfuzz/Cargo.toml | 2 +- fuzz/fuzzer-libfuzzer/Cargo.toml | 2 +- fuzz/targets/Cargo.toml | 4 +- scripts/check-bins.py | 2 +- scripts/clippy | 2 +- scripts/clippy-all | 2 +- src/config.rs | 2 +- src/server/engine_factory_v2.rs | 3 +- src/server/gc_worker/gc_worker.rs | 302 ++++++++---------- src/storage/txn/commands/prewrite.rs | 3 + tests/Cargo.toml | 90 +++--- 85 files changed, 915 insertions(+), 845 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97c6209b2d7..14951b8e253 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1673,7 +1673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59" [[package]] -name = "example_plugin" +name = "example_coprocessor_plugin" version = "0.1.0" dependencies = [ "coprocessor_plugin_api", @@ -4101,7 +4101,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#5f718cfe50a28f7fee0282c0959670de5962eec8" +source = "git+https://github.com/tikv/raft-engine.git#a0d29980f1448565a6d03f911ebb103c4266f1f4" dependencies = [ "byteorder", "crc32fast", @@ -4135,7 +4135,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#5f718cfe50a28f7fee0282c0959670de5962eec8" +source = "git+https://github.com/tikv/raft-engine.git#a0d29980f1448565a6d03f911ebb103c4266f1f4" dependencies = [ "clap 3.1.6", "env_logger", @@ -6181,7 +6181,7 @@ dependencies = [ "engine_traits", "engine_traits_tests", "error_code", - "example_plugin", + "example_coprocessor_plugin", "fail", "file_system", "flate2", diff --git a/Cargo.toml b/Cargo.toml index c38b98631c8..786b229df3b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,32 +64,32 
@@ name = "tikv" [dependencies] anyhow = "1.0" -api_version = { path = "components/api_version", default-features = false } +api_version = { workspace = true } async-stream = "0.2" async-trait = "0.1" backtrace = "0.3" -batch-system = { path = "components/batch-system", default-features = false } +batch-system = { workspace = true } byteorder = "1.2" -case_macros = { path = "components/case_macros" } -causal_ts = { path = "components/causal_ts" } +case_macros = { workspace = true } +causal_ts = { workspace = true } chrono = "0.4" -codec = { path = "components/codec", default-features = false } -collections = { path = "components/collections" } -concurrency_manager = { path = "components/concurrency_manager", default-features = false } -coprocessor_plugin_api = { path = "components/coprocessor_plugin_api" } +codec = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } +coprocessor_plugin_api = { workspace = true } crc32fast = "1.2" crc64fast = "0.1" crossbeam = "0.8" dashmap = "5" -encryption_export = { path = "components/encryption/export", default-features = false } -engine_panic = { path = "components/engine_panic", default-features = false } -engine_rocks = { path = "components/engine_rocks", default-features = false } -engine_test = { path = "components/engine_test", default-features = false } -engine_traits = { path = "components/engine_traits", default-features = false } -engine_traits_tests = { path = "components/engine_traits_tests", default-features = false } -error_code = { path = "components/error_code", default-features = false } +encryption_export = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +engine_traits_tests = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "components/file_system", default-features = false } +file_system = { 
workspace = true } flate2 = { version = "1.0", default-features = false, features = ["zlib"] } futures = { version = "0.3", features = ["thread-pool", "compat"] } futures-executor = "0.3.1" @@ -103,18 +103,18 @@ hex = "0.4" http = "0" hyper = { version = "0.14", features = ["full"] } hyper-tls = "0.5" -into_other = { path = "components/into_other", default-features = false } +into_other = { workspace = true } itertools = "0.10" keyed_priority_queue = "0.4" -keys = { path = "components/keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" libc = "0.2" libloading = "0.7" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "components/log_wrappers" } +log_wrappers = { workspace = true } match-template = "0.0.1" -memory_trace_macros = { path = "components/memory_trace_macros" } +memory_trace_macros = { workspace = true } mime = "0.3.13" more-asserts = "0.2" murmur3 = "0.5.1" @@ -122,11 +122,11 @@ nom = { version = "5.1.0", default-features = false, features = ["std"] } notify = "4" num-traits = "0.2.14" num_cpus = "1" -online_config = { path = "components/online_config" } +online_config = { workspace = true } openssl = "0.10" parking_lot = "0.12" paste = "1.0" -pd_client = { path = "components/pd_client", default-features = false } +pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" pprof = { git = "https://github.com/tikv/pprof-rs.git", rev = "3fed55af8fc6cf69dbd954a0321c799c5a111e4e", default-features = false, features = ["flamegraph", "protobuf-codec"] } @@ -134,14 +134,14 @@ prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raft_log_engine = { path = "components/raft_log_engine", default-features = false } -raftstore 
= { path = "components/raftstore", default-features = false, features = ["engine_rocks"] } +raft_log_engine = { workspace = true } +raftstore = { workspace = true, features = ["engine_rocks"] } rand = "0.7.3" regex = "1.3" -resource_metering = { path = "components/resource_metering" } +resource_metering = { workspace = true } rev_lines = "0.2.1" seahash = "4.1.0" -security = { path = "components/security", default-features = false } +security = { workspace = true } semver = "0.11" serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" @@ -150,41 +150,41 @@ serde_json = { version = "1.0", features = ["preserve_order"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } smallvec = "1.4" -sst_importer = { path = "components/sst_importer", default-features = false } +sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } sync_wrapper = "0.1.1" sysinfo = "0.16" tempfile = "3.0" thiserror = "1.0" -tidb_query_aggr = { path = "components/tidb_query_aggr", default-features = false } -tidb_query_common = { path = "components/tidb_query_common", default-features = false } -tidb_query_datatype = { path = "components/tidb_query_datatype", default-features = false } -tidb_query_executors = { path = "components/tidb_query_executors", default-features = false } -tidb_query_expr = { path = "components/tidb_query_expr", default-features = false } -tikv_alloc = { path = "components/tikv_alloc" } -tikv_kv = { path = "components/tikv_kv", default-features = false } -tikv_util = { path = "components/tikv_util", default-features = false } +tidb_query_aggr = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_executors = { workspace = true } +tidb_query_expr = { workspace = true } +tikv_alloc = { 
workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } time = "0.1" tipb = { git = "https://github.com/pingcap/tipb.git" } tokio = { version = "1.17", features = ["full"] } tokio-openssl = "0.6" tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } toml = "0.5" -tracker = { path = "components/tracker" } -txn_types = { path = "components/txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } url = "2" uuid = { version = "0.8.1", features = ["serde", "v4"] } walkdir = "2" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] -api_version = { path = "components/api_version", features = ["testexport"] } -example_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } # should be a binary dependency +api_version = { workspace = true, features = ["testexport"] } +example_coprocessor_plugin = { workspace = true } # should be a binary dependency hyper-openssl = "0.9" -panic_hook = { path = "components/panic_hook" } +panic_hook = { workspace = true } reqwest = { version = "0.11", features = ["blocking"] } -test_sst_importer = { path = "components/test_sst_importer", default-features = false } -test_util = { path = "components/test_util", default-features = false } +test_sst_importer = { workspace = true } +test_util = { workspace = true } tokio = { version = "1.17", features = ["macros", "rt-multi-thread", "time"] } zipf = "6.1.0" @@ -241,7 +241,6 @@ members = [ "components/codec", "components/collections", "components/concurrency_manager", - "components/concurrency_manager", "components/coprocessor_plugin_api", "components/encryption", "components/encryption/export", @@ -259,11 +258,12 @@ members = [ "components/online_config", "components/panic_hook", "components/pd_client", + "components/profiler", "components/raftstore", "components/raftstore-v2", "components/resolved_ts", "components/resource_metering", - 
"components/server", + "components/security", "components/server", "components/snap_recovery", "components/sst_importer", @@ -283,6 +283,7 @@ members = [ "components/tidb_query_executors", "components/tidb_query_expr", "components/tikv_alloc", + "components/tikv_kv", "components/tikv_util", "components/tipb_helper", "components/tracker", @@ -295,6 +296,76 @@ members = [ ] default-members = ["cmd/tikv-server", "cmd/tikv-ctl"] +[workspace.dependencies] +api_version = { path = "components/api_version" } +aws = { path = "components/cloud/aws" } +azure = { path = "components/cloud/azure" } +backup = { path = "components/backup", default-features = false } +backup-stream = { path = "components/backup-stream", default-features = false } +batch-system = { path = "components/batch-system" } +case_macros = { path = "components/case_macros" } +causal_ts = { path = "components/causal_ts" } +cdc = { path = "components/cdc", default-features = false } +cloud = { path = "components/cloud" } +codec = { path = "components/codec" } +collections = { path = "components/collections" } +concurrency_manager = { path = "components/concurrency_manager" } +coprocessor_plugin_api = { path = "components/coprocessor_plugin_api" } +encryption = { path = "components/encryption" } +encryption_export = { path = "components/encryption/export" } +engine_panic = { path = "components/engine_panic" } +engine_rocks = { path = "components/engine_rocks" } +engine_rocks_helper = { path = "components/engine_rocks_helper" } +engine_test = { path = "components/engine_test", default-features = false } +engine_traits = { path = "components/engine_traits" } +engine_traits_tests = { path = "components/engine_traits_tests", default-features = false } +error_code = { path = "components/error_code" } +external_storage = { path = "components/external_storage" } +external_storage_export = { path = "components/external_storage/export" } +file_system = { path = "components/file_system" } +gcp = { path = 
"components/cloud/gcp" } +into_other = { path = "components/into_other" } +keys = { path = "components/keys" } +log_wrappers = { path = "components/log_wrappers" } +memory_trace_macros = { path = "components/memory_trace_macros" } +online_config = { path = "components/online_config" } +panic_hook = { path = "components/panic_hook" } +pd_client = { path = "components/pd_client" } +profiler = { path = "components/profiler" } +raft_log_engine = { path = "components/raft_log_engine" } +raftstore = { path = "components/raftstore", default-features = false } +raftstore_v2 = { path = "components/raftstore-v2", default-features = false } +resolved_ts = { path = "components/resolved_ts" } +resource_metering = { path = "components/resource_metering" } +security = { path = "components/security" } +server = { path = "components/server" } +snap_recovery = { path = "components/snap_recovery" } +sst_importer = { path = "components/sst_importer" } +test_backup = { path = "components/test_backup" } +test_coprocessor = { path = "components/test_coprocessor", default-features = false } +example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/example_plugin" } +test_pd = { path = "components/test_pd" } +test_pd_client = { path = "components/test_pd_client" } +test_raftstore = { path = "components/test_raftstore", default-features = false } +test_sst_importer = { path = "components/test_sst_importer" } +test_storage = { path = "components/test_storage", default-features = false } +test_util = { path = "components/test_util" } +tidb_query_aggr = { path = "components/tidb_query_aggr" } +tidb_query_codegen = { path = "components/tidb_query_codegen" } +tidb_query_common = { path = "components/tidb_query_common" } +tidb_query_datatype = { path = "components/tidb_query_datatype" } +tidb_query_executors = { path = "components/tidb_query_executors" } +tidb_query_expr = { path = "components/tidb_query_expr" } +tikv = { path = ".", default-features = false } +tikv_alloc = { 
path = "components/tikv_alloc" } +tikv_kv = { path = "components/tikv_kv", default-features = false } +tikv_util = { path = "components/tikv_util" } +tipb_helper = { path = "components/tipb_helper" } +tracker = { path = "components/tracker" } +txn_types = { path = "components/txn_types" } +# External libs +grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } + [profile.dev.package.grpcio-sys] debug = false opt-level = 1 diff --git a/Makefile b/Makefile index 3229a307e7f..f60fb16bcb0 100644 --- a/Makefile +++ b/Makefile @@ -334,7 +334,7 @@ pre-format: unset-override format: pre-format @cargo fmt - @cargo sort -w ./Cargo.toml ./*/Cargo.toml components/*/Cargo.toml cmd/*/Cargo.toml >/dev/null + @cargo sort -w >/dev/null doc: @cargo doc --workspace --document-private-items \ diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 13d8b351e21..3b2d1dd2f75 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -45,49 +45,49 @@ test-engines-panic = [ nortcheck = ["engine_rocks/nortcheck"] [dependencies] -backup = { path = "../../components/backup", default-features = false } -cdc = { path = "../../components/cdc", default-features = false } +backup = { workspace = true } +cdc = { workspace = true } chrono = "0.4" clap = "2.32" -collections = { path = "../../components/collections" } -concurrency_manager = { path = "../../components/concurrency_manager", default-features = false } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption_export = { path = "../../components/encryption/export", default-features = false } -engine_rocks = { path = "../../components/engine_rocks", default-features = false } -engine_traits = { path = "../../components/engine_traits", default-features = false } -error_code = { path = "../../components/error_code", default-features = false } -file_system = { path = "../../components/file_system", default-features = 
false } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +file_system = { workspace = true } futures = "0.3" gag = "1.0" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } hex = "0.4" -keys = { path = "../../components/keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../../components/log_wrappers" } -pd_client = { path = "../../components/pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-engine-ctl = { git = "https://github.com/tikv/raft-engine.git" } -raft_log_engine = { path = "../../components/raft_log_engine", default-features = false } -raftstore = { path = "../../components/raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true } rand = "0.8" regex = "1" -security = { path = "../../components/security", default-features = false } +security = { workspace = true } serde_json = "1.0" -server = { path = "../../components/server" } +server = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } structopt = "0.3" tempfile = "3.0" -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../../components/tikv_alloc" } -tikv_util = { path = 
"../../components/tikv_util", default-features = false } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } toml = "0.5" -txn_types = { path = "../../components/txn_types", default-features = false } +txn_types = { workspace = true } [build-dependencies] cc = "1.0" diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index 9b1aa869037..c5b5cb6403c 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -33,8 +33,8 @@ pprof-fp = ["tikv/pprof-fp"] [dependencies] clap = "2.32" serde_json = { version = "1.0", features = ["preserve_order"] } -server = { path = "../../components/server", default-features = false } -tikv = { path = "../../", default-features = false } +server = { workspace = true } +tikv = { workspace = true } toml = "0.5" [build-dependencies] diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index e2d4beaacbf..421c01a1514 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -9,14 +9,14 @@ testexport = [] [dependencies] bitflags = "1.0.1" -codec = { path = "../codec", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +codec = { workspace = true } +engine_traits = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } match-template = "0.0.1" thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 8e6e43c8203..0f3b97461bb 100644 --- 
a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -23,63 +23,63 @@ async-compression = { version = "0.3.14", features = ["tokio", "zstd"] } async-trait = { version = "0.1" } bytes = "1" chrono = "0.4" -concurrency_manager = { path = "../concurrency_manager" } +concurrency_manager = { workspace = true } crossbeam = "0.8" crossbeam-channel = "0.5" dashmap = "5" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code" } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } -external_storage = { path = "../external_storage", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } +external_storage = { workspace = true } +external_storage_export = { workspace = true } fail = "0.5" -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = "0.3" futures-io = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } hex = "0.4" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.4" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } +log_wrappers = { workspace = true } +online_config = { workspace = true } openssl = "0.10" -pd_client = { path = "../pd_client" } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = 
{ version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } +raftstore = { workspace = true } regex = "1" -resolved_ts = { path = "../resolved_ts" } +resolved_ts = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1" -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_kv = { path = "../tikv_kv" } -tikv_util = { path = "../tikv_util" } +tidb_query_datatype = { workspace = true } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["compat"] } tonic = "0.5" -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } uuid = "0.8" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] async-trait = "0.1" -engine_panic = { path = "../engine_panic" } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +engine_panic = { workspace = true } +grpcio = { workspace = true } hex = "0.4" protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8.0" tempdir = "0.3" tempfile = "3.0" -test_raftstore = { path = "../test_raftstore", default-features = false } -test_util = { path = "../test_util", default-features = false } +test_raftstore = { workspace = true } +test_util = { workspace = true } url = "2" walkdir = "2" diff --git a/components/backup/Cargo.toml 
b/components/backup/Cargo.toml index a59f8949b77..17439a0f615 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -33,47 +33,47 @@ mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] [dependencies] -api_version = { path = "../api_version", default-features = false } +api_version = { workspace = true } async-channel = "1.4" -aws = { path = "../cloud/aws" } -causal_ts = { path = "../causal_ts" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +aws = { workspace = true } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crc64fast = "0.1" -encryption = { path = "../encryption", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -external_storage = { path = "../external_storage", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +external_storage = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } hex = "0.4" -keys = { path = "../keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { 
path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -security = { path = "../security", default-features = false } +raftstore = { workspace = true } +security = { workspace = true } serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_common = { workspace = true } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-stream = "0.1" -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 03aabafe3ae..7fe5798f833 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -8,20 +8,20 @@ default = ["test-runner"] test-runner = ["derive_more"] [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail 
= "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } lazy_static = "1.3" -online_config = { path = "../online_config" } +online_config = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_alloc = { path = "../tikv_alloc", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] criterion = "0.3" diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index beaf5575c80..d05e9b66ddd 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -8,19 +8,19 @@ publish = false testexport = [] [dependencies] -api_version = { path = "../api_version", default-features = false } +api_version = { workspace = true } async-trait = { version = "0.1" } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } enum_dispatch = "0.3.8" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } fail = "0.5" futures = { version = "0.3" } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } parking_lot = "0.12" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" raft 
= { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } @@ -28,12 +28,12 @@ serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -test_pd_client = { path = "../test_pd_client" } +test_pd_client = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] criterion = "0.3" diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index dbefc7df82c..27ce81c57b4 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -28,51 +28,51 @@ mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] [dependencies] -api_version = { path = "../api_version" } +api_version = { workspace = true } bitflags = "1.0" -causal_ts = { path = "../causal_ts" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" futures-timer = "3.0" getset = "0.1" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../keys" } +grpcio = { workspace = true } +keys = { workspace = true } kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -resolved_ts = { path = "../resolved_ts", default-features = false } -security = { path = "../security", default-features = false } +raftstore = { workspace = true } +resolved_ts = { workspace = true } +security = { workspace = true } semver = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv = { path = "../..", default-features = false } -tikv_kv = { path = "../tikv_kv", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_kv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] criterion = "0.3" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" -test_pd_client = { path = "../test_pd_client" } 
-test_raftstore = { path = "../test_raftstore", default-features = false } -test_util = { path = "../test_util", default-features = false } +test_pd_client = { workspace = true } +test_raftstore = { workspace = true } +test_util = { workspace = true } [[test]] name = "integrations" diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml index 5752f84e43c..45ae2b40b23 100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -7,7 +7,7 @@ publish = false [dependencies] async-trait = "0.1" derive_more = "0.99.3" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } futures-io = "0.3" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" @@ -16,7 +16,7 @@ prometheus = { version = "0.13", default-features = false, features = ["nightly" protobuf = { version = "2.8", features = ["bytes"] } rusoto_core = "0.46.0" thiserror = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } url = "2.0" [dev-dependencies] diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 293509709db..964048121d6 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -9,36 +9,36 @@ failpoints = ["fail/failpoints"] [dependencies] async-trait = "0.1" +base64 = "0.13.0" bytes = "1.0" -cloud = { path = "../", default-features = false } +cloud = { workspace = true } fail = "0.5" futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } # This is only a dependency to vendor openssl for rusoto. It's not clear exactly # how openssl is built for tikv, but it seems to be controlled by grpcio. This # makes `cargo test -p aws` link correctly. 
-grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1.3" +md5 = "0.7.0" +prometheus = { version = "0.13", default-features = false, features = ["nightly"] } rusoto_core = "0.46.0" rusoto_credential = "0.46.0" rusoto_kms = { version = "0.46.0", features = ["serialize_structs"] } -rusoto_sts = "0.46.0" rusoto_s3 = { version = "0.46.0", features = ["serialize_structs"] } +rusoto_sts = "0.46.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +thiserror = "1.0" +tikv_util = { workspace = true } # better to not use slog-global, but pass in the logger tokio = { version = "1.5", features = ["time"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } url = "2.0" -thiserror = "1.0" -lazy_static = "1.3" -prometheus = { version = "0.13", default-features = false, features = ["nightly"] } uuid = "0.8" -md5 = "0.7.0" -base64 = "0.13.0" [dev-dependencies] futures = "0.3" diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 042898c31d5..3d8b01e893b 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -6,18 +6,18 @@ publish = false [dependencies] async-trait = "0.1" -azure_core = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust"} +azure_core = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust" } azure_identity = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust" } azure_storage = { version = "0.1.0", git = 
"https://github.com/Azure/azure-sdk-for-rust", default-features = false, features = ["account", "blob"] } base64 = "0.13" chrono = "0.4" -cloud = { path = "../", default-features = false } +cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } kvproto = { git = "https://github.com/pingcap/kvproto.git" } oauth2 = { version = "4.0.0", default-features = false } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time"] } url = "2.0" diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index a9045d6f27c..f184377c0af 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -5,8 +5,9 @@ edition = "2018" publish = false [dependencies] -futures-util = { version = "0.3", default-features = false, features = ["io"] } async-trait = "0.1" +cloud = { workspace = true } +futures-util = { version = "0.3", default-features = false, features = ["io"] } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" @@ -16,8 +17,7 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tame-gcs = { version = "0.10", features = ["async-multipart"] } tame-oauth = "0.4.7" -cloud = { path = "../", default-features = false } -tikv_util = { path = "../../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time"] } url = "2.0" diff --git a/components/codec/Cargo.toml b/components/codec/Cargo.toml index 93e91209d66..8b00f077863 100644 --- 
a/components/codec/Cargo.toml +++ b/components/codec/Cargo.toml @@ -6,14 +6,14 @@ publish = false [dependencies] byteorder = "1.2" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } libc = "0.2" static_assertions = { version = "1.0", features = ["nightly"] } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } [dev-dependencies] bytes = "1.0" -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } protobuf = "2" rand = "0.8" diff --git a/components/collections/Cargo.toml b/components/collections/Cargo.toml index a94cb0216cf..dca0afbc2c8 100644 --- a/components/collections/Cargo.toml +++ b/components/collections/Cargo.toml @@ -6,4 +6,4 @@ publish = false [dependencies] fxhash = "0.2.1" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index b6e382d7f14..2d008cf49f1 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -8,9 +8,9 @@ version = "0.0.1" fail = "0.5" kvproto = { git = "https://github.com/pingcap/kvproto.git" } parking_lot = "0.12" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } # FIXME: switch to the crates.io version after crossbeam-skiplist is released [dependencies.crossbeam-skiplist] @@ -22,7 +22,7 @@ package = "crossbeam-skiplist" criterion = "0.3" futures = "0.3" rand = "0.8.3" -tikv_alloc = { path = "../tikv_alloc", features = ["jemalloc"] } +tikv_alloc = { workspace = true, features = ["jemalloc"] } [[bench]] name = "lock_table" diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 80ad86b3b75..b66ef2aa147 100644 --- 
a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -14,16 +14,16 @@ bytes = "1.0" crc32fast = "1.2" crossbeam = "0.8" derive_more = "0.99.3" -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["std", "io"] } hex = "0.4.2" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -online_config = { path = "../online_config" } +online_config = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } @@ -34,12 +34,12 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt"] } [dev-dependencies] matches = "0.1.8" tempfile = "3.1" -test_util = { path = "../test_util", default-features = false } +test_util = { workspace = true } toml = "0.5" diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index 2fe0b0cb55a..f76c2b8f03c 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -12,20 +12,20 @@ cloud-azure = [] [dependencies] async-trait = "0.1" -aws = { path = "../../cloud/aws", optional = true, default-features = false } 
-cloud = { path = "../../cloud/", default-features = false } +aws = { workspace = true, optional = true } +cloud = { workspace = true } derive_more = "0.99.3" -encryption = { path = "../", default-features = false } -error_code = { path = "../../error_code", default-features = false } -file_system = { path = "../../file_system", default-features = false } +encryption = { workspace = true } +error_code = { workspace = true } +file_system = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } openssl = "0.10" protobuf = { version = "2.8", features = ["bytes"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util", default-features = false } +tikv_util = { workspace = true } [dev-dependencies] rust-ini = "0.14.0" -structopt = "0.3" \ No newline at end of file +structopt = "0.3" diff --git a/components/encryption/export/examples/ecli.rs b/components/encryption/export/examples/ecli.rs index d9d2bcb8098..ed2247cc77c 100644 --- a/components/encryption/export/examples/ecli.rs +++ b/components/encryption/export/examples/ecli.rs @@ -3,7 +3,7 @@ use std::io::{Read, Write}; pub use cloud::kms::Config as CloudConfig; -#[cfg(feature = "aws")] +#[cfg(feature = "cloud-aws")] use encryption_export::{create_cloud_backend, KmsConfig}; use encryption_export::{Backend, Error, Result}; use file_system::{File, OpenOptions}; diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index b00180c98d2..c5703994c73 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -6,11 +6,11 @@ edition = "2018" publish = false [dependencies] -engine_traits = { path = "../engine_traits", default-features = false } +engine_traits = { workspace = true 
} kvproto = { git = "https://github.com/pingcap/kvproto.git" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface -tikv_util = { path = "../tikv_util", default-features = false } -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index e35438c4fe1..44dd708271d 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -23,20 +23,20 @@ failpoints = ["fail/failpoints"] nortcheck = [] [dependencies] -api_version = { path = "../api_version", default-features = false } -case_macros = { path = "../case_macros" } -collections = { path = "../collections", default-features = false } +api_version = { workspace = true } +case_macros = { workspace = true } +collections = { workspace = true } derive_more = "0.99.3" -encryption = { path = "../encryption", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +encryption = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } -keys = { path = "../keys", default-features = false } +file_system = { workspace = true } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.4.0" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } num_cpus = "1" -online_config = { path = "../online_config" } +online_config = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = "2" @@ -48,11 +48,11 @@ slog = { version = "2.3", features = 
["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } slog_derive = "0.2" tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } time = "0.1" -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } [dependencies.rocksdb] git = "https://github.com/tikv/rust-rocksdb.git" diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 9e3bba56bad..41066c85756 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{any::Any, fs, path::Path, sync::Arc}; +use std::{any::Any, sync::Arc}; use engine_traits::{ IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, TabletAccessor, @@ -50,18 +50,6 @@ impl RocksEngine { self.db.clone() } - pub fn exists(path: &str) -> bool { - let path = Path::new(path); - if !path.exists() || !path.is_dir() { - return false; - } - - // If path is not an empty directory, we say db exists. If path is not an empty - // directory but db has not been created, `DB::list_column_families` fails and - // we can clean up the directory by this indication. 
- fs::read_dir(&path).unwrap().next().is_some() - } - pub fn set_shared_block_cache(&mut self, enable: bool) { self.shared_block_cache = enable; } diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 77133f09cbd..16e79a3b007 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -8,21 +8,21 @@ publish = false failpoints = ["fail/failpoints"] [dependencies] -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits" } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" -keys = { path = "../keys", default-features = false } +keys = { workspace = true } lazy_static = "1.4.0" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = "2.8" -raftstore = { path = "../raftstore", default-features = false } +raftstore = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } [dev-dependencies] -engine_test = { path = "../engine_test" } -kvproto = { git = "https://github.com/pingcap/kvproto.git", default-features = false } +engine_test = { workspace = true } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } tempfile = "3.0" diff --git a/components/engine_test/Cargo.toml b/components/engine_test/Cargo.toml index a9bfbfd41d3..16e538acc51 100644 --- a/components/engine_test/Cargo.toml +++ b/components/engine_test/Cargo.toml @@ -24,14 +24,14 @@ test-engines-panic = [ ] [dependencies] -collections = { path = "../collections", default-features = false } -encryption = { 
path = "../encryption", default-features = false } -engine_panic = { path = "../engine_panic", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -file_system = { path = "../file_system", default-features = false } -raft_log_engine = { path = "../raft_log_engine", default-features = false } +collections = { workspace = true } +encryption = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +file_system = { workspace = true } +raft_log_engine = { workspace = true } tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index a1b9e156ce1..b2a574422fb 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -91,7 +91,8 @@ pub mod kv { RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; use engine_traits::{ - CfOptions, CfOptionsExt, OpenOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + CfOptions, CfOptionsExt, MiscExt, OpenOptions, Result, TabletAccessor, TabletFactory, + CF_DEFAULT, }; use tikv_util::box_err; @@ -160,10 +161,7 @@ pub mod kv { ) -> Result { if let Some(db) = self.root_db.lock().unwrap().as_ref() { if options.create_new() { - return Err(box_err!( - "root tablet {} already exists", - db.as_inner().path() - )); + return Err(box_err!("root tablet {} already exists", db.path())); } return Ok(db.clone()); } @@ -273,11 +271,7 @@ pub mod kv { // Target tablet exist in the cache if options.create_new() { - return Err(box_err!( - "region {} {} already exists", - id, - tablet.as_inner().path() - )); + return 
Err(box_err!("region {} {} already exists", id, tablet.path())); } return Ok(tablet.clone()); } else if !options.cache_only() { @@ -382,11 +376,7 @@ pub mod kv { { let reg = self.registry.lock().unwrap(); if let Some(db) = reg.get(&(id, suffix)) { - return Err(box_err!( - "region {} {} already exists", - id, - db.as_inner().path() - )); + return Err(box_err!("region {} {} already exists", id, db.path())); } } diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml index 5ffa4428dd2..8ecce112579 100644 --- a/components/engine_tirocks/Cargo.toml +++ b/components/engine_tirocks/Cargo.toml @@ -4,24 +4,24 @@ version = "0.1.0" edition = "2021" [dependencies] -api_version = { path = "../api_version" } -codec = { path = "../codec" } -collections = { path = "../collections" } +api_version = { workspace = true } +codec = { workspace = true } +collections = { workspace = true } derive_more = "0.99.3" -engine_traits = { path = "../engine_traits" } -keys = { path = "../keys" } +engine_traits = { workspace = true } +keys = { workspace = true } lazy_static = "1.4.0" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } slog_derive = "0.2" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util" } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tirocks = { git = "https://github.com/busyjay/tirocks.git", branch = "dev" } -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types" } +tracker = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] kvproto = { git = "https://github.com/pingcap/kvproto.git" } diff --git 
a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index fb4bb69e5bc..c2e9d729868 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -8,22 +8,22 @@ publish = false failpoints = ["fail/failpoints"] [dependencies] -case_macros = { path = "../case_macros" } -error_code = { path = "../error_code", default-features = false } +case_macros = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] serde_derive = "1.0" diff --git a/components/engine_traits_tests/Cargo.toml b/components/engine_traits_tests/Cargo.toml index a011b1cc281..301a7ee5d76 100644 --- a/components/engine_traits_tests/Cargo.toml +++ b/components/engine_traits_tests/Cargo.toml @@ -25,8 +25,8 @@ test-engines-panic = [ ] [dependencies] -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -panic_hook = { path = "../panic_hook" } +engine_test = { workspace = true } +engine_traits = { workspace 
= true } +panic_hook = { workspace = true } tempfile = "3.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/error_code/Cargo.toml b/components/error_code/Cargo.toml index 3b7284faa63..484f8d24ad3 100644 --- a/components/error_code/Cargo.toml +++ b/components/error_code/Cargo.toml @@ -13,9 +13,9 @@ name = "error_code_gen" path = "bin.rs" [dependencies] -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = { version = "1.0", features = ["derive"] } -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index b74af6ff39d..8c92b79583e 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -19,16 +19,16 @@ failpoints = ["fail/failpoints"] async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } async-trait = "0.1" bytes = "1.0" -encryption = { path = "../encryption" } -engine_traits = { path = "../engine_traits" } +encryption = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" ffi-support = { optional = true, version = "0.4.2" } -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" futures-io = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", optional = true, default-features = false, features = ["openssl-vendored"] } +grpcio = { workspace = true, optional = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" libloading = { optional = true, version = "0.7.0" } @@ -40,8 +40,8 @@ rusoto_core = 
"0.46.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "fs", "process"] } tokio-util = { version = "0.7", features = ["compat"] } url = "2.0" diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index 82ff01c2afb..076bdd9d0dd 100644 --- a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -49,36 +49,36 @@ cloud-storage-grpc = [ ] [dependencies] -aws = { optional = true, path = "../../cloud/aws", default-features = false } -azure = { optional = true, path = "../../cloud/azure", default-features = false } -cloud = { path = "../../cloud", default_features = false } -lazy_static = { optional = true, version = "1.3" } -gcp = { optional = true, path = "../../cloud/gcp", default-features = false } -grpcio = { version = "0.10", optional = true, default-features = false, features = ["openssl-vendored"] } -encryption = { path = "../../encryption", default-features = false } -external_storage = { path = "../", default-features = false } -engine_traits = { path = "../../engine_traits", default-features = false } +async-compression = { version = "0.3.14", features = ["futures-io", "zstd"] } +async-trait = "0.1" +aws = { optional = true, workspace = true } +azure = { optional = true, workspace = true } +cloud = { workspace = true } +encryption = { workspace = true } +engine_traits = { workspace = true } +external_storage = { workspace = true } ffi-support = { optional = true, version = "0.4.2" } -file_system = { optional = true, 
path = "../../file_system" } +file_system = { workspace = true, optional = true } futures = { optional = true, version = "0.3" } futures-executor = { optional = true, version = "0.3" } futures-io = { version = "0.3" } futures-util = { version = "0.3", default-features = false, features = ["io"] } +gcp = { optional = true, workspace = true } +grpcio = { workspace = true, optional = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = { optional = true, version = "1.3" } libloading = { optional = true, version = "0.7.0" } once_cell = { optional = true, version = "1.3.1" } protobuf = { optional = true, version = "2" } slog-global = { optional = true, version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../../tikv_util" } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt", "net"], optional = true } tokio-util = { version = "0.7", features = ["compat"], optional = true } url = "2.0" -async-trait = "0.1" -async-compression = { version = "0.3.14", features = ["futures-io", "zstd"]} [dev-dependencies] -matches = "0.1.8" futures-util = { version = "0.3", default-features = false, features = ["io"] } +matches = "0.1.8" rust-ini = "0.14.0" structopt = "0.3" tempfile = "3.1" diff --git a/components/external_storage/export/examples/scli.rs b/components/external_storage/export/examples/scli.rs index e98e24ab452..0ab54721b29 100644 --- a/components/external_storage/export/examples/scli.rs +++ b/components/external_storage/export/examples/scli.rs @@ -6,9 +6,15 @@ use std::{ path::Path, }; +#[cfg(feature = "cloud-azure")] +use external_storage_export::make_azblob_backend; +#[cfg(feature = "cloud-gcp")] +use external_storage_export::make_gcs_backend; +#[cfg(feature = "cloud-aws")] +use external_storage_export::make_s3_backend; use external_storage_export::{ - create_storage, make_azblob_backend, make_cloud_backend, 
make_gcs_backend, make_hdfs_backend, - make_local_backend, make_noop_backend, make_s3_backend, ExternalStorage, UnpinReader, + create_storage, make_cloud_backend, make_hdfs_backend, make_local_backend, make_noop_backend, + ExternalStorage, UnpinReader, }; use futures_util::io::{copy, AllowStdIo}; use ini::ini::Ini; @@ -144,7 +150,10 @@ fn create_s3_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_s3_backend(config)) + #[cfg(feature = "cloud-aws")] + return Ok(make_s3_backend(config)); + #[cfg(not(feature = "cloud-aws"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn create_gcs_storage(opt: &Opt) -> Result { @@ -164,7 +173,10 @@ fn create_gcs_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_gcs_backend(config)) + #[cfg(feature = "cloud-gcp")] + return Ok(make_gcs_backend(config)); + #[cfg(not(feature = "cloud-gcp"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn create_azure_storage(opt: &Opt) -> Result { @@ -200,7 +212,10 @@ fn create_azure_storage(opt: &Opt) -> Result { if let Some(prefix) = &opt.prefix { config.prefix = prefix.to_string(); } - Ok(make_azblob_backend(config)) + #[cfg(feature = "cloud-azure")] + return Ok(make_azblob_backend(config)); + #[cfg(not(feature = "cloud-azure"))] + return Err(Error::new(ErrorKind::Other, "missing feature")); } fn process() -> Result<()> { diff --git a/components/file_system/Cargo.toml b/components/file_system/Cargo.toml index e3924c0fc25..033d31681c1 100644 --- a/components/file_system/Cargo.toml +++ b/components/file_system/Cargo.toml @@ -8,13 +8,13 @@ publish = false bcc-iosnoop = ["bcc"] [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crc32fast = "1.2" crossbeam-utils = "0.8.0" fs2 = "0.4" lazy_static = "1.3" libc = "0.2" -online_config = { path = "../online_config" } +online_config = { workspace = 
true } openssl = "0.10" parking_lot = "0.12" prometheus = { version = "0.13", features = ["nightly"] } @@ -24,8 +24,8 @@ serde = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } strum = { version = "0.20", features = ["derive"] } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time"] } [dev-dependencies] diff --git a/components/into_other/Cargo.toml b/components/into_other/Cargo.toml index be278cdc764..39989a4bf75 100644 --- a/components/into_other/Cargo.toml +++ b/components/into_other/Cargo.toml @@ -5,6 +5,6 @@ edition = "2018" publish = false [dependencies] -engine_traits = { path = "../engine_traits", default-features = false } +engine_traits = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index a9bd4ddbf18..f8318237b20 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -7,9 +7,9 @@ publish = false [dependencies] byteorder = "1.2" kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } diff --git a/components/log_wrappers/Cargo.toml b/components/log_wrappers/Cargo.toml index e8e9a3cc52f..4c9e62b6876 100644 --- a/components/log_wrappers/Cargo.toml +++ b/components/log_wrappers/Cargo.toml @@ -9,4 +9,4 @@ hex = "0.4" protobuf = { version = 
"2.8", features = ["bytes"] } slog = "2.3" slog-term = "2.4" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index 44f09485705..c2ee9982bcd 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -8,26 +8,26 @@ publish = false failpoints = ["fail/failpoints"] [dependencies] -collections = { path = "../collections" } -error_code = { path = "../error_code", default-features = false } +collections = { workspace = true } +error_code = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } -security = { path = "../security", default-features = false } +security = { workspace = true } semver = "0.10" serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/profiler/Cargo.toml 
b/components/profiler/Cargo.toml index f0879722b1b..b0c456b209f 100644 --- a/components/profiler/Cargo.toml +++ b/components/profiler/Cargo.toml @@ -8,7 +8,7 @@ publish = false profiling = ["lazy_static", "gperftools", "callgrind", "valgrind_request"] [dependencies] -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } [target.'cfg(unix)'.dependencies] lazy_static = { version = "1.3.0", optional = true } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index d13e9ea4a0b..2b9d2de73ff 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -5,13 +5,13 @@ publish = false edition = "2018" [dependencies] -encryption = { path = "../encryption" } -engine_traits = { path = "../engine_traits", default-features = false } -file_system = { path = "../file_system" } +encryption = { workspace = true } +engine_traits = { workspace = true } +file_system = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.4.0" num_cpus = "1" -online_config = { path = "../online_config" } +online_config = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-engine = { git = "https://github.com/tikv/raft-engine.git", features = ["swap"] } @@ -19,6 +19,6 @@ serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" -tracker = { path = "../tracker" } +tracker = { workspace = true } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 09fa707c408..9adaf0c13e2 100644 --- a/components/raftstore-v2/Cargo.toml +++ 
b/components/raftstore-v2/Cargo.toml @@ -29,35 +29,35 @@ cloud-gcp = ["raftstore/cloud-gcp"] cloud-azure = ["raftstore/cloud-azure"] [dependencies] -batch-system = { path = "../batch-system", default-features = false } -collections = { path = "../collections" } +batch-system = { workspace = true } +collections = { workspace = true } crossbeam = "0.8" -engine_traits = { path = "../engine_traits" } -error_code = { path = "../error_code" } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = { version = "0.3", features = ["compat"] } -keys = { path = "../keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client" } +log_wrappers = { workspace = true } +pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } -raftstore = { path = "../raftstore" } +raftstore = { workspace = true } slog = "2.3" smallvec = "1.4" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] -engine_test = { path = "../engine_test", default-features = false } +engine_test = { workspace = true } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" -test_pd = { path = "../test_pd" } -test_util = { path = "../test_util" } +test_pd = { workspace = true } +test_util = { workspace = true } [[test]] name = "raftstore-v2-failpoints" diff --git 
a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 78cc9976dab..5986d3e4596 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -432,7 +432,7 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactoryV2}, }; - use engine_traits::{OpenOptions, Peekable, SyncMutable, TabletFactory, ALL_CFS}; + use engine_traits::{MiscExt, OpenOptions, Peekable, SyncMutable, TabletFactory, ALL_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use raftstore::store::{ @@ -758,7 +758,7 @@ mod tests { let (_, delegate) = store_meta.get_executor_and_len(1); let mut delegate = delegate.unwrap(); let tablet = delegate.get_tablet(); - assert_eq!(tablet1.as_inner().path(), tablet.as_inner().path()); + assert_eq!(tablet1.path(), tablet.path()); let snapshot = delegate.get_snapshot(&None); assert_eq!( b"val1".to_vec(), @@ -768,7 +768,7 @@ mod tests { let (_, delegate) = store_meta.get_executor_and_len(2); let mut delegate = delegate.unwrap(); let tablet = delegate.get_tablet(); - assert_eq!(tablet2.as_inner().path(), tablet.as_inner().path()); + assert_eq!(tablet2.path(), tablet.path()); let snapshot = delegate.get_snapshot(&None); assert_eq!( b"val2".to_vec(), diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 4c41b19c828..54eb07e8161 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -28,73 +28,73 @@ cloud-gcp = ["sst_importer/cloud-gcp"] cloud-azure = ["sst_importer/cloud-azure"] [dependencies] -batch-system = { path = "../batch-system", default-features = false } +batch-system = { workspace = true } bitflags = "1.0.1" byteorder = "1.2" bytes = "1.0" -causal_ts = { path = "../causal_ts" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = 
false } +causal_ts = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crc32fast = "1.2" crossbeam = "0.8" derivative = "2" -encryption = { path = "../encryption", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false, optional = true } +encryption = { workspace = true } +engine_rocks = { workspace = true, optional = true } # Should be [dev-dependencies] but we need to control the features # https://github.com/rust-lang/cargo/issues/6915 -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +engine_test = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system", default-features = false } +file_system = { workspace = true } fs2 = "0.4" futures = "0.3" futures-util = { version = "0.3.1", default-features = false, features = ["io"] } getset = "0.1" grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } -into_other = { path = "../into_other", default-features = false } +into_other = { workspace = true } itertools = "0.10" -keys = { path = "../keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } -memory_trace_macros = { path = "../memory_trace_macros" } -online_config = { path = "../online_config" } +log_wrappers = { workspace = true } +memory_trace_macros = { workspace = true } +online_config = { workspace = true } openssl = "0.10" ordered-float = "2.6" parking_lot = "0.12" -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } 
prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0", default-features = false } rand = "0.8.3" -resource_metering = { path = "../resource_metering" } +resource_metering = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_with = "1.4" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } smallvec = "1.4" -sst_importer = { path = "../sst_importer", default-features = false } +sst_importer = { workspace = true } tempfile = "3.0" thiserror = "1.0" -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_datatype = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tracker = { workspace = true } +txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] -encryption_export = { path = "../encryption/export", default-features = false } -engine_panic = { path = "../engine_panic", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -panic_hook = { path = "../panic_hook" } -test_sst_importer = { path = "../test_sst_importer", default-features = false } +encryption_export = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } 
+panic_hook = { workspace = true } +test_sst_importer = { workspace = true } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index fd6c7552f5d..d62f2f6c1db 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -1654,14 +1654,14 @@ mod tests { let mut delegate = delegate.unwrap(); assert_eq!(1, delegate.region.id); let tablet = delegate.get_tablet(); - assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); + assert_eq!(kv_engine.path(), tablet.path()); let (len, delegate) = store_meta.get_executor_and_len(2); assert_eq!(2, len); let mut delegate = delegate.unwrap(); assert_eq!(2, delegate.region.id); let tablet = delegate.get_tablet(); - assert_eq!(kv_engine.as_inner().path(), tablet.as_inner().path()); + assert_eq!(kv_engine.path(), tablet.path()); } fn prepare_read_delegate( diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index 6309440202b..d4a7e3d1ca2 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -23,41 +23,41 @@ test-engines-rocksdb = ["tikv/test-engines-rocksdb"] test-engines-panic = ["tikv/test-engines-panic"] [dependencies] -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -engine_traits = { path = "../engine_traits", default-features = false } +engine_traits = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored"] } +grpcio = { workspace = true } hex = "0.4" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -online_config = { path = "../online_config" } -pd_client = { path = "../pd_client", default-features = false } 
+log_wrappers = { workspace = true } +online_config = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false } -security = { path = "../security", default-features = false } +raftstore = { workspace = true } +security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "time"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } [dev-dependencies] -engine_rocks = { path = "../engine_rocks", default-features = false } -panic_hook = { path = "../panic_hook" } +engine_rocks = { workspace = true } +panic_hook = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tempfile = "3.0" -test_raftstore = { path = "../test_raftstore", default-features = false } -test_sst_importer = { path = "../test_sst_importer" } -test_util = { path = "../test_util", default-features = false } -tikv_kv = { path = "../tikv_kv" } +test_raftstore = { workspace = true } +test_sst_importer = { workspace = true } +test_util = { workspace = true } +tikv_kv = { workspace = true } [[test]] name = "integrations" diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index 72a0c0dc339..acb2dff89d3 100644 --- 
a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -4,15 +4,15 @@ version = "0.0.1" edition = "2018" [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } crossbeam = "0.8" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -online_config = { path = "../online_config" } +online_config = { workspace = true } pdqselect = "0.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } @@ -20,7 +20,7 @@ serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util" } +tikv_util = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 8257d04f51f..4599b1df43e 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -8,13 +8,13 @@ publish = false tonic = ["dep:tonic"] [dependencies] -collections = { path = "../collections" } -encryption = { path = "../encryption", default-features = false } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +collections = { workspace = true } +encryption = { workspace = true } +grpcio = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_json = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace 
= true } tonic = { version = "0.5", features = ["tls"], optional = true } [dev-dependencies] diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 6f54f37ba0b..1f4d98b2847 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -33,54 +33,54 @@ nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] [dependencies] -api_version = { path = "../api_version" } -backup = { path = "../backup", default-features = false } -backup-stream = { path = "../backup-stream", default-features = false } -causal_ts = { path = "../causal_ts" } -cdc = { path = "../cdc", default-features = false } +api_version = { workspace = true } +backup = { workspace = true } +backup-stream = { workspace = true } +causal_ts = { workspace = true } +cdc = { workspace = true } chrono = "0.4" clap = "2.32" -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption = { path = "../encryption", default-features = false } -encryption_export = { path = "../encryption/export", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_rocks_helper = { path = "../engine_rocks_helper" } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_rocks_helper = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +file_system = { workspace = true } fs2 = "0.4" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored"] } +grpcio = { workspace = true } 
grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" -keys = { path = "../keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raft_log_engine = { path = "../raft_log_engine", default-features = false } -raftstore = { path = "../raftstore", default-features = false, features = ["engine_rocks"] } +raft_log_engine = { workspace = true } +raftstore = { workspace = true, features = ["engine_rocks"] } rand = "0.8" -resolved_ts = { path = "../../components/resolved_ts", default-features = false } -resource_metering = { path = "../resource_metering" } -security = { path = "../security", default-features = false, features = ["tonic"] } +resolved_ts = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true, features = ["tonic"] } serde_json = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -snap_recovery = { path = "../snap_recovery", default-features = false } +snap_recovery = { workspace = true } tempfile = "3.0" -tikv = { path = "../..", default-features = false } -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } 
tokio = { version = "1.5", features = ["rt-multi-thread"] } toml = "0.5" -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [target.'cfg(unix)'.dependencies] diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 8a46f601a75..57793792289 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -1,5 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +#![allow(incomplete_features)] +#![feature(specialization)] + #[macro_use] extern crate tikv_util; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 2320d1156f4..2295839a806 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1595,10 +1595,23 @@ pub trait ConfiguredRaftEngine: RaftEngine { _: &Option>, _: &Option, ) -> Self; - fn as_rocks_engine(&self) -> Option<&RocksEngine> { + fn as_rocks_engine(&self) -> Option<&RocksEngine>; + fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool); +} + +impl ConfiguredRaftEngine for T { + default fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Option, + ) -> Self { + unimplemented!() + } + default fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } - fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} + default fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} } impl ConfiguredRaftEngine for RocksEngine { diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index d82601f577a..1b69d8ba150 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -7,26 +7,26 @@ publish = false [dependencies] chrono = "0.4" -encryption = { path = "../../components/encryption", default-features = false } -encryption_export = { path = 
"../../components/encryption/export", default-features = false } -engine_rocks = { path = "../../components/engine_rocks", default-features = false } -engine_traits = { path = "../../components/engine_traits", default-features = false } +encryption = { workspace = true } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } futures = { version = "0.3", features = ["executor"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../../components/keys", default-features = false } +grpcio = { workspace = true } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto" } log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -pd_client = { path = "../pd_client", default-features = false } +pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } -raft_log_engine = { path = "../raft_log_engine", default-features = false } -raftstore = { path = "../../components/raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } structopt = "0.3" tempfile = "3.0" thiserror = "1.0" -tikv = { path = "../.." 
} -tikv_alloc = { path = "../../components/tikv_alloc" } -tikv_util = { path = "../../components/tikv_util", default-features = false } +tikv = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } toml = "0.5" -txn_types = { path = "../../components/txn_types", default-features = false } +txn_types = { workspace = true } diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 887c9df6655..6b5fbd9127f 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -13,22 +13,22 @@ cloud-storage-grpc = ["external_storage_export/cloud-storage-grpc"] cloud-storage-dylib = ["external_storage_export/cloud-storage-dylib"] [dependencies] -api_version = { path = "../api_version", default-features = false } +api_version = { workspace = true } crc32fast = "1.2" dashmap = "5" -encryption = { path = "../encryption", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false } +encryption = { workspace = true } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../keys", default-features = false } +grpcio = { workspace = true } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers 
= { path = "../log_wrappers" } +log_wrappers = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } serde = "1.0" @@ -36,13 +36,13 @@ serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt-multi-thread", "macros"] } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } [dev-dependencies] tempfile = "3.0" -test_sst_importer = { path = "../test_sst_importer", default-features = false } -test_util = { path = "../test_util", default-features = false } +test_sst_importer = { workspace = true } +test_util = { workspace = true } diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index ea85e329202..902e57d5eed 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -11,24 +11,24 @@ cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] [dependencies] -api_version = { path = "../api_version" } -backup = { path = "../backup" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager" } +api_version = { workspace = true } +backup = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } crc64fast = "0.1" -engine_traits = { path = "../engine_traits" } -external_storage_export = { path = "../external_storage/export", default-features = false } -file_system = { path = "../file_system", default-features = false 
} +engine_traits = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } futures = "0.3" futures-executor = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } protobuf = "2" rand = "0.8" tempfile = "3.0" -test_raftstore = { path = "../test_raftstore" } -tidb_query_common = { path = "../tidb_query_common" } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +test_raftstore = { workspace = true } +tidb_query_common = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_coprocessor/Cargo.toml b/components/test_coprocessor/Cargo.toml index 6a12f16138f..a3bb3f8e476 100644 --- a/components/test_coprocessor/Cargo.toml +++ b/components/test_coprocessor/Cargo.toml @@ -20,18 +20,18 @@ test-engines-panic = [ ] [dependencies] -api_version = { path = "../api_version" } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } +api_version = { workspace = true } +collections = { workspace = true } +concurrency_manager = { workspace = true } +engine_rocks = { workspace = true } futures = "0.3" kvproto = { git = "https://github.com/pingcap/kvproto.git" } protobuf = "2" -resource_metering = { path = "../resource_metering" } -test_storage = { path = "../test_storage", default-features = false } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", 
default-features = false } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +resource_metering = { workspace = true } +test_storage = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } tipb = { git = "https://github.com/pingcap/tipb.git" } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } diff --git a/components/test_coprocessor_plugin/example_plugin/Cargo.toml b/components/test_coprocessor_plugin/example_plugin/Cargo.toml index cda1f2fa0c7..6bbc8b25012 100644 --- a/components/test_coprocessor_plugin/example_plugin/Cargo.toml +++ b/components/test_coprocessor_plugin/example_plugin/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "example_plugin" +name = "example_coprocessor_plugin" version = "0.1.0" edition = "2018" publish = false @@ -8,4 +8,4 @@ publish = false crate-type = ["dylib"] [dependencies] -coprocessor_plugin_api = { path = "../../coprocessor_plugin_api" } +coprocessor_plugin_api = { workspace = true } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index efdc1a5a23c..d9163706895 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -5,13 +5,13 @@ edition = "2018" publish = false [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -pd_client = { path = "../pd_client", default-features = false } -security = { path = "../security", default-features = false } +pd_client = { workspace = true } +security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] 
} slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml index 8894ce4a8e4..ad2b20de5a0 100644 --- a/components/test_pd_client/Cargo.toml +++ b/components/test_pd_client/Cargo.toml @@ -5,18 +5,18 @@ edition = "2018" publish = false [dependencies] -collections = { path = "../collections" } +collections = { workspace = true } fail = "0.5" futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } -keys = { path = "../keys", default-features = false } +grpcio = { workspace = true } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index c442ab71137..fb627dccb11 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -23,43 +23,43 @@ test-engines-panic = 
[ ] [dependencies] -api_version = { path = "../api_version" } +api_version = { workspace = true } backtrace = "0.3" -causal_ts = { path = "../causal_ts", features = ["testexport"] } -collections = { path = "../collections" } -concurrency_manager = { path = "../concurrency_manager", default-features = false } +causal_ts = { workspace = true, features = ["testexport"] } +collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" -encryption_export = { path = "../encryption/export", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_rocks_helper = { path = "../engine_rocks_helper" } -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_rocks_helper = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } fail = "0.5" -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } -keys = { path = "../keys", default-features = false } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client", default-features = false } +log_wrappers = { workspace = true } +pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raftstore = { path = "../raftstore", default-features = false, features = ["testexport"] } +raftstore = { workspace = true, features = ["testexport"] } 
rand = "0.8" -resolved_ts = { path = "../resolved_ts" } -resource_metering = { path = "../resource_metering" } -security = { path = "../security", default-features = false } -server = { path = "../server" } +resolved_ts = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true } +server = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" -test_pd_client = { path = "../test_pd_client" } -test_util = { path = "../test_util", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +test_pd_client = { workspace = true } +test_util = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -txn_types = { path = "../txn_types", default-features = false } +txn_types = { workspace = true } diff --git a/components/test_sst_importer/Cargo.toml b/components/test_sst_importer/Cargo.toml index 71b8a69cf75..b0c3e96ef5a 100644 --- a/components/test_sst_importer/Cargo.toml +++ b/components/test_sst_importer/Cargo.toml @@ -10,8 +10,8 @@ test = false [dependencies] crc32fast = "1.2" -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -keys = { path = "../keys", default-features = false } +engine_rocks = { workspace = true } +engine_traits = { workspace = true } +keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } uuid = { version = "0.8.1", features = ["serde", "v4"] } diff --git 
a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 65aa08cd101..04adc4e6de4 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -21,14 +21,14 @@ test-engines-panic = [ ] [dependencies] -api_version = { path = "../api_version" } -collections = { path = "../collections" } +api_version = { workspace = true } +collections = { workspace = true } futures = "0.3" kvproto = { git = "https://github.com/pingcap/kvproto.git" } -pd_client = { path = "../pd_client", default-features = false } -raftstore = { path = "../raftstore", default-features = false } -test_raftstore = { path = "../test_raftstore", default-features = false } -tikv = { path = "../../", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } -tracker = { path = "../tracker", default-features = false } -txn_types = { path = "../txn_types", default-features = false } +pd_client = { workspace = true } +raftstore = { workspace = true } +test_raftstore = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } diff --git a/components/test_util/Cargo.toml b/components/test_util/Cargo.toml index c5dc5dfd1d2..8aca28b092b 100644 --- a/components/test_util/Cargo.toml +++ b/components/test_util/Cargo.toml @@ -12,16 +12,16 @@ cloud-azure = ["encryption_export/cloud-azure"] [dependencies] backtrace = "0.3" -collections = { path = "../collections" } -encryption_export = { path = "../encryption/export", default-features = false } +collections = { workspace = true } +encryption_export = { workspace = true } fail = "0.5" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } rand = "0.8" rand_isaac = "0.3" -security = { path = "../security", default-features = false } +security = { 
workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" diff --git a/components/tidb_query_aggr/Cargo.toml b/components/tidb_query_aggr/Cargo.toml index e1642fb6f31..db8d9d64faf 100644 --- a/components/tidb_query_aggr/Cargo.toml +++ b/components/tidb_query_aggr/Cargo.toml @@ -7,13 +7,13 @@ description = "Vector aggr functions of query engine to run TiDB pushed down exe [dependencies] match-template = "0.0.1" -tidb_query_codegen = { path = "../tidb_query_codegen" } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tidb_query_expr = { path = "../tidb_query_expr", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_codegen = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_expr = { workspace = true } +tikv_util = { workspace = true } tipb = { git = "https://github.com/pingcap/tipb.git" } [dev-dependencies] -panic_hook = { path = "../panic_hook" } -tipb_helper = { path = "../tipb_helper", default-features = false } +panic_hook = { workspace = true } +tipb_helper = { workspace = true } diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 0efadbd48e9..05133b130e7 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -9,16 +9,16 @@ description = "Common utility of a query engine to run TiDB pushed down executor anyhow = "1.0" async-trait = "0.1" derive_more = "0.99.3" -error_code = { path = "../error_code", default-features = false } 
+error_code = { workspace = true } futures = "0.3" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" serde_json = "1.0" thiserror = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } +tikv_util = { workspace = true } time = "0.1" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index 7eb9a296ac2..de8f0b41110 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -13,14 +13,14 @@ boolinator = "2.4.0" bstr = "0.2.8" chrono = "0.4" chrono-tz = "0.5.1" -codec = { path = "../codec", default-features = false } -collections = { path = "../collections" } +codec = { workspace = true } +collections = { workspace = true } encoding_rs = { git = "https://github.com/xiongjiwei/encoding_rs.git", rev = "68e0bc5a72a37a78228d80cd98047326559cf43c" } -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } hex = "0.4" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } match-template = "0.0.1" nom = { version = "5.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } @@ -35,7 +35,7 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } static_assertions = { version = "1.0", features = ["nightly"] } thiserror = "1.0" -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tikv_alloc = { path = 
"../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_common = { workspace = true } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } tipb = { git = "https://github.com/pingcap/tipb.git" } diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index ada01c8aef0..e448340eddf 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -7,27 +7,27 @@ description = "A vector query engine to run TiDB pushed down executors" [dependencies] async-trait = "0.1" -codec = { path = "../codec", default-features = false } -collections = { path = "../collections" } +codec = { workspace = true } +collections = { workspace = true } fail = "0.5" futures = { version = "0.3", features = ["compat"] } itertools = "0.10" kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } match-template = "0.0.1" protobuf = { version = "2.8", features = ["bytes"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } smallvec = "1.4" -tidb_query_aggr = { path = "../tidb_query_aggr", default-features = false } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tidb_query_expr = { path = "../tidb_query_expr", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_aggr = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_expr = { workspace = true } +tikv_util = { workspace = true } tipb = { git = "https://github.com/pingcap/tipb.git" } yatp = { git = 
"https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] anyhow = "1.0" -tidb_query_codegen = { path = "../tidb_query_codegen", default-features = false } -tipb_helper = { path = "../tipb_helper", default-features = false } +tidb_query_codegen = { workspace = true } +tipb_helper = { workspace = true } diff --git a/components/tidb_query_expr/Cargo.toml b/components/tidb_query_expr/Cargo.toml index a04553b5b6d..1ca4a46b6dd 100644 --- a/components/tidb_query_expr/Cargo.toml +++ b/components/tidb_query_expr/Cargo.toml @@ -9,11 +9,11 @@ description = "Vector expressions of query engine to run TiDB pushed down execut base64 = "0.13" bstr = "0.2.8" byteorder = "1.2" -codec = { path = "../codec", default-features = false } -file_system = { path = "../file_system", default-features = false } +codec = { workspace = true } +file_system = { workspace = true } flate2 = { version = "=1.0.11", default-features = false, features = ["zlib"] } hex = "0.4" -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } match-template = "0.0.1" num = { version = "0.3", default-features = false } num-traits = "0.2" @@ -25,10 +25,10 @@ safemem = { version = "0.3", default-features = false } serde = "1.0" serde_json = "1.0" static_assertions = { version = "1.0", features = ["nightly"] } -tidb_query_codegen = { path = "../tidb_query_codegen" } -tidb_query_common = { path = "../tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } -tikv_util = { path = "../tikv_util", default-features = false } +tidb_query_codegen = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tikv_util = { workspace = true } time = "0.1" tipb = { git = "https://github.com/pingcap/tipb.git" } twoway = "0.2.0" @@ -37,6 +37,6 @@ uuid = { version = "0.8.1", features = ["v4"] } [dev-dependencies] bstr = "0.2.8" chrono = "0.4" -panic_hook = { path = 
"../panic_hook" } -profiler = { path = "../profiler" } -tipb_helper = { path = "../tipb_helper", default-features = false } +panic_hook = { workspace = true } +profiler = { workspace = true } +tipb_helper = { workspace = true } diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 768f67626c2..6ee74371674 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -26,31 +26,31 @@ test-engines-panic = [ [dependencies] backtrace = "0.3" -collections = { path = "../collections" } -engine_panic = { path = "../engine_panic", default-features = false } -engine_rocks = { path = "../engine_rocks", default-features = false } -engine_test = { path = "../engine_test", default-features = false } -engine_traits = { path = "../engine_traits", default-features = false } -error_code = { path = "../error_code", default-features = false } +collections = { workspace = true } +engine_panic = { workspace = true } +engine_rocks = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../file_system" } +file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool", "compat"] } -into_other = { path = "../into_other", default-features = false } +into_other = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } -pd_client = { path = "../pd_client" } +log_wrappers = { workspace = true } +pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" -raftstore = { path = "../raftstore", default-features = false } +raftstore = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } 
slog_derive = "0.2" tempfile = "3.0" thiserror = "1.0" -tikv_util = { path = "../tikv_util", default-features = false } -tracker = { path = "../tracker" } -txn_types = { path = "../txn_types", default-features = false } +tikv_util = { workspace = true } +tracker = { workspace = true } +txn_types = { workspace = true } [dev-dependencies] -keys = { path = "../keys", default-features = false } -panic_hook = { path = "../panic_hook" } +keys = { workspace = true } +panic_hook = { workspace = true } diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index e0eaad4c0ce..2c9a071fbbb 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -579,7 +579,6 @@ mod tests { util::{new_engine_opt, FixedPrefixSliceTransform}, RocksCfOptions, RocksDbOptions, RocksEngine, RocksSnapshot, }; - use engine_test::new_temp_engine; use engine_traits::{IterOptions, SyncMutable, CF_DEFAULT}; use keys::data_key; use kvproto::metapb::{Peer, Region}; @@ -666,11 +665,20 @@ mod tests { #[test] fn test_reverse_iterate() { - let path = Builder::new().prefix("test-cursor").tempdir().unwrap(); - let engines = new_temp_engine(&path); - let (region, test_data) = load_default_dataset(engines.kv.clone()); + let path = Builder::new() + .prefix("test_reverse_iterate") + .tempdir() + .unwrap(); + let cf_opts = RocksCfOptions::default(); + let engine = new_engine_opt( + path.path().to_str().unwrap(), + RocksDbOptions::default(), + vec![(CF_DEFAULT, cf_opts)], + ) + .unwrap(); + let (region, test_data) = load_default_dataset(engine.clone()); - let snap = RegionSnapshot::::from_raw(engines.kv.clone(), region); + let snap = RegionSnapshot::::from_raw(engine.clone(), region); let mut statistics = CfStatistics::default(); let it = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); let mut iter = Cursor::new(it, ScanMode::Mixed, false); @@ -725,7 +733,7 @@ mod tests { // test last region let mut region = Region::default(); 
region.mut_peers().push(Peer::default()); - let snap = RegionSnapshot::::from_raw(engines.kv, region); + let snap = RegionSnapshot::::from_raw(engine, region); let it = snap.iter(CF_DEFAULT, IterOptions::default()).unwrap(); let mut iter = Cursor::new(it, ScanMode::Mixed, false); assert!( diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 5b508a4a4d4..5ff65b33df3 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -14,27 +14,27 @@ backtrace = "0.3.9" byteorder = "1.2" bytes = "1.0" chrono = "0.4" -codec = { path = "../codec", default-features = false } -collections = { path = "../collections" } +codec = { workspace = true } +collections = { workspace = true } cpu-time = "1.0.0" crc32fast = "1.2" crossbeam = "0.8" derive_more = "0.99.3" -error_code = { path = "../error_code", default-features = false } +error_code = { workspace = true } fail = "0.5" futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } http = "0.2.0" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } nix = "0.24" num-traits = "0.2" num_cpus = "1" -online_config = { path = "../online_config" } +online_config = { workspace = true } openssl = "0.10" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } @@ -51,12 +51,12 @@ slog-json = "2.3" slog-term = "2.4" sysinfo = "0.16" thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } +tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-executor = "0.1" tokio-timer = { 
git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } -tracker = { path = "../tracker" } +tracker = { workspace = true } url = "2" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } @@ -67,7 +67,7 @@ procfs = { version = "0.12", default-features = false } [dev-dependencies] gag = "1.0" -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } protobuf = "2" regex = "1.0" tempfile = "3.0" diff --git a/components/tipb_helper/Cargo.toml b/components/tipb_helper/Cargo.toml index 1e7f30c4c9f..31d2c290fdc 100644 --- a/components/tipb_helper/Cargo.toml +++ b/components/tipb_helper/Cargo.toml @@ -5,6 +5,6 @@ edition = "2018" publish = false [dependencies] -codec = { path = "../codec", default-features = false } -tidb_query_datatype = { path = "../tidb_query_datatype", default-features = false } +codec = { workspace = true } +tidb_query_datatype = { workspace = true } tipb = { git = "https://github.com/pingcap/tipb.git" } diff --git a/components/tracker/Cargo.toml b/components/tracker/Cargo.toml index f9b97010bd8..b369fab9628 100644 --- a/components/tracker/Cargo.toml +++ b/components/tracker/Cargo.toml @@ -5,7 +5,7 @@ edition = "2018" publish = false [dependencies] -collections = { path = "../../components/collections" } +collections = { workspace = true } crossbeam-utils = "0.8" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1" diff --git a/components/txn_types/Cargo.toml b/components/txn_types/Cargo.toml index 18127d81254..9ccfe0bb323 100644 --- a/components/txn_types/Cargo.toml +++ b/components/txn_types/Cargo.toml @@ -7,17 +7,17 @@ publish = false [dependencies] bitflags = "1.0.1" byteorder = "1.2" -codec = { path = "../codec", default-features = false } -collections = { path = "../collections" } -error_code = { path = "../error_code", default-features = false } +codec = { workspace = true } +collections = { workspace = true } +error_code = { workspace = true } farmhash = "1.1.5" 
kvproto = { git = "https://github.com/pingcap/kvproto.git" } -log_wrappers = { path = "../log_wrappers" } +log_wrappers = { workspace = true } slog = "2.3" thiserror = "1.0" -tikv_alloc = { path = "../tikv_alloc" } -tikv_util = { path = "../tikv_util", default-features = false } +tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] -panic_hook = { path = "../panic_hook" } +panic_hook = { workspace = true } rand = "0.8" diff --git a/fuzz/fuzzer-afl/Cargo.toml b/fuzz/fuzzer-afl/Cargo.toml index 4987508c80b..6c97305a253 100644 --- a/fuzz/fuzzer-afl/Cargo.toml +++ b/fuzz/fuzzer-afl/Cargo.toml @@ -4,7 +4,7 @@ version = "0.0.1" publish = false [dependencies] -fuzz-targets = { path = "../targets", default-features = false } +fuzz-targets = { path = "../targets" } # AFL only works for x86 targets [target.'cfg(all(not(target_os = "windows"), target_arch = "x86_64"))'.dependencies] diff --git a/fuzz/fuzzer-honggfuzz/Cargo.toml b/fuzz/fuzzer-honggfuzz/Cargo.toml index 51b6fa0e975..500f7466af3 100644 --- a/fuzz/fuzzer-honggfuzz/Cargo.toml +++ b/fuzz/fuzzer-honggfuzz/Cargo.toml @@ -4,7 +4,7 @@ version = "0.0.1" publish = false [dependencies] -fuzz-targets = { path = "../targets", default-features = false } +fuzz-targets = { path = "../targets" } [target.'cfg(not(target_os = "windows"))'.dependencies] honggfuzz = "0.5.47" diff --git a/fuzz/fuzzer-libfuzzer/Cargo.toml b/fuzz/fuzzer-libfuzzer/Cargo.toml index 97e27b015d8..db508147afa 100644 --- a/fuzz/fuzzer-libfuzzer/Cargo.toml +++ b/fuzz/fuzzer-libfuzzer/Cargo.toml @@ -4,5 +4,5 @@ version = "0.0.1" publish = false [dependencies] -fuzz-targets = { path = "../targets", default-features = false } +fuzz-targets = { path = "../targets" } libfuzzer-sys = "0.3.1" diff --git a/fuzz/targets/Cargo.toml b/fuzz/targets/Cargo.toml index 35de6e02f58..878ce33aea9 100644 --- a/fuzz/targets/Cargo.toml +++ b/fuzz/targets/Cargo.toml @@ -10,5 +10,5 @@ path = "mod.rs" [dependencies] anyhow = "1.0" byteorder = "1" 
-tidb_query_datatype = { path = "../../components/tidb_query_datatype", default-features = false } -tikv_util = { path = "../../components/tikv_util", default-features = false } +tidb_query_datatype = { workspace = true } +tikv_util = { workspace = true } diff --git a/scripts/check-bins.py b/scripts/check-bins.py index e8c7bf03791..aaa13e6b9de 100644 --- a/scripts/check-bins.py +++ b/scripts/check-bins.py @@ -13,7 +13,7 @@ WHITE_LIST = { "online_config", "online_config_derive", "tidb_query_codegen", "panic_hook", "fuzz", "fuzzer_afl", "fuzzer_honggfuzz", "fuzzer_libfuzzer", - "coprocessor_plugin_api", "example_plugin", "memory_trace_macros", "case_macros", + "coprocessor_plugin_api", "example_coprocessor_plugin", "memory_trace_macros", "case_macros", "tracker" } diff --git a/scripts/clippy b/scripts/clippy index e03ea2bfa8f..c5999ad670c 100755 --- a/scripts/clippy +++ b/scripts/clippy @@ -48,4 +48,4 @@ CLIPPY_LINTS=( cargo clippy --workspace \ --exclude fuzz-targets --exclude fuzzer-honggfuzz --exclude fuzzer-afl --exclude fuzzer-libfuzzer \ - --features "${TIKV_ENABLE_FEATURES}" "$@" -- "${CLIPPY_LINTS[@]}" + --no-default-features --features "${TIKV_ENABLE_FEATURES}" "$@" -- "${CLIPPY_LINTS[@]}" diff --git a/scripts/clippy-all b/scripts/clippy-all index 44b0663e106..e9257cf0c35 100755 --- a/scripts/clippy-all +++ b/scripts/clippy-all @@ -15,7 +15,7 @@ if [[ -n "$SHELL_DEBUG" ]] ; then set -x fi -./scripts/clippy --all-targets +./scripts/clippy --all-targets --features "testexport failpoints" # for pkg in "components/cdc" "components/backup" "cmd" "tests"; do # cd $pkg diff --git a/src/config.rs b/src/config.rs index 68193fe0ba9..9dcf17d17d5 100644 --- a/src/config.rs +++ b/src/config.rs @@ -37,7 +37,7 @@ use engine_rocks::{ DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt, TabletAccessor, + CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt, MiscExt, TabletAccessor, TabletErrorCollector, 
TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::IoRateLimiter; diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 73331580990..b4a7688ef68 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -8,7 +8,8 @@ use std::{ use collections::HashMap; use engine_rocks::RocksEngine; use engine_traits::{ - CfOptions, CfOptionsExt, OpenOptions, Result, TabletAccessor, TabletFactory, CF_DEFAULT, + CfOptions, CfOptionsExt, MiscExt, OpenOptions, Result, TabletAccessor, TabletFactory, + CF_DEFAULT, }; use crate::server::engine_factory::KvEngineFactory; diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 82496068b99..5b638a01f48 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -1446,12 +1446,10 @@ where #[cfg(any(test, feature = "testexport"))] pub mod test_gc_worker { - use std::sync::Arc; + use std::sync::{Arc, Mutex}; use collections::HashMap; use engine_rocks::{RocksEngine, RocksSnapshot}; - use engine_test::kv::TestTabletFactoryV2; - use engine_traits::{KvEngine, OpenOptions, TabletFactory}; use kvproto::{ kvrpcpb::Context, metapb::{Peer, Region}, @@ -1570,17 +1568,15 @@ pub mod test_gc_worker { } } - #[derive(Clone)] + #[derive(Clone, Default)] pub struct MultiRocksEngine { - // Factory is not a normal way to fetch tablet and is just used in test to ease the test. 
- // Note: at most one tablet is allowed to exist for each region in the cache of factory - pub factory: Arc, + pub engines: Arc>>, pub region_info: HashMap, } impl Engine for MultiRocksEngine { - type Snap = RegionSnapshot; - type Local = RocksEngine; + type Snap = ::Snap; + type Local = ::Local; fn kv_engine(&self) -> Option { None @@ -1590,36 +1586,10 @@ pub mod test_gc_worker { &self, region_modifies: HashMap>, ) -> kv::Result<()> { - for (region_id, mut modifies) in region_modifies { - for modify in &mut modifies { - match modify { - Modify::Delete(_, ref mut key) => { - let bytes = keys::data_key(key.as_encoded()); - *key = Key::from_encoded(bytes); - } - Modify::Put(_, ref mut key, _) => { - let bytes = keys::data_key(key.as_encoded()); - *key = Key::from_encoded(bytes); - } - Modify::PessimisticLock(ref mut key, _) => { - let bytes = keys::data_key(key.as_encoded()); - *key = Key::from_encoded(bytes); - } - Modify::DeleteRange(_, ref mut key1, ref mut key2, _) => { - let bytes = keys::data_key(key1.as_encoded()); - *key1 = Key::from_encoded(bytes); - let bytes = keys::data_end_key(key2.as_encoded()); - *key2 = Key::from_encoded(bytes); - } - } - } - - let tablet = self - .factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - - write_modifies(&tablet, modifies)?; + for (region_id, modifies) in region_modifies { + let mut map = HashMap::default(); + map.insert(region_id, modifies); + self.engines.lock().unwrap()[®ion_id].modify_on_kv_engine(map)?; } Ok(()) @@ -1628,35 +1598,10 @@ pub mod test_gc_worker { fn async_write( &self, ctx: &Context, - mut batch: WriteData, + batch: WriteData, callback: EngineCallback<()>, ) -> EngineResult<()> { - batch.modifies.iter_mut().for_each(|modify| match modify { - Modify::Delete(_, ref mut key) => { - *key = Key::from_encoded(keys::data_key(key.as_encoded())); - } - Modify::Put(_, ref mut key, _) => { - *key = Key::from_encoded(keys::data_key(key.as_encoded())); - } - 
Modify::PessimisticLock(ref mut key, _) => { - *key = Key::from_encoded(keys::data_key(key.as_encoded())); - } - Modify::DeleteRange(_, ref mut start_key, ref mut end_key, _) => { - *start_key = Key::from_encoded(keys::data_key(start_key.as_encoded())); - *end_key = Key::from_encoded(keys::data_end_key(end_key.as_encoded())); - } - }); - let tablet = self - .factory - .open_tablet( - ctx.region_id, - None, - OpenOptions::default().set_cache_only(true), - ) - .unwrap(); - - callback(write_modifies(&tablet, batch.modifies)); - Ok(()) + self.engines.lock().unwrap()[&ctx.region_id].async_write(ctx, batch, callback) } fn async_snapshot( @@ -1665,15 +1610,12 @@ pub mod test_gc_worker { callback: EngineCallback, ) -> EngineResult<()> { let region_id = ctx.pb_ctx.region_id; - let tablet = self - .factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - callback(Ok(RegionSnapshot::from_snapshot( - Arc::new(tablet.snapshot()), - Arc::new(self.region_info.get(®ion_id).unwrap().clone()), - ))); - Ok(()) + self.engines + .lock() + .unwrap() + .get_mut(®ion_id) + .unwrap() + .async_snapshot(ctx, callback) } } } @@ -1683,6 +1625,7 @@ mod tests { use std::{ collections::{BTreeMap, BTreeSet}, + path::Path, sync::mpsc::{self, channel}, thread, time::Duration, @@ -1690,11 +1633,7 @@ mod tests { use api_version::{ApiV2, KvFormat, RawValue}; use engine_rocks::{util::get_cf_handle, RocksEngine}; - use engine_test::{ - ctor::{CfOptions, DbOptions}, - kv::TestTabletFactoryV2, - }; - use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; + use engine_traits::Peekable as _; use futures::executor::block_on; use kvproto::{ kvrpcpb::{ApiVersion, Op}, @@ -2620,47 +2559,55 @@ mod tests { // region 2: includes ("k10", "value-10") to ("k19", "value-19") // region 3: includes ("k20", "value-20") to ("k29", "value-29") fn multi_gc_engine_setup( + path: &Path, store_id: u64, put_start_ts: u64, delete_start_ts: u64, need_deletion: bool, ) -> ( - Arc, 
MultiRocksEngine, Arc, GcRunner, Vec, mpsc::Receiver, ) { - // Building a tablet factory - let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let path = Builder::new().prefix("multi-rocks-gc").tempdir().unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let mut engine = MultiRocksEngine::default(); // Note: as the tablet split is not supported yet, we artificially divide the // region to: 1 ["", "k10"], 2 ["k10", "k20"], 3["k20", "30"] let r1 = init_region(b"", b"k10", 1, Some(store_id)); + engine.region_info.insert(1, r1.clone()); + engine.engines.lock().unwrap().insert( + 1, + PrefixedEngine( + TestEngineBuilder::new() + .path(path.join("1")) + .build() + .unwrap(), + ), + ); let r2 = init_region(b"k10", b"k20", 2, Some(store_id)); + engine.region_info.insert(2, r2.clone()); + engine.engines.lock().unwrap().insert( + 2, + PrefixedEngine( + TestEngineBuilder::new() + .path(path.join("2")) + .build() + .unwrap(), + ), + ); let r3 = init_region(b"k20", b"", 3, Some(store_id)); - let _ = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let _ = factory - .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let _ = factory - .open_tablet(3, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - - let mut region_info = HashMap::default(); - region_info.insert(1, r1.clone()); - region_info.insert(2, r2.clone()); - region_info.insert(3, r3.clone()); - let mut engine = MultiRocksEngine { - factory: factory.clone(), - region_info, - }; + engine.region_info.insert(3, r3.clone()); + engine.engines.lock().unwrap().insert( + 3, + PrefixedEngine( + TestEngineBuilder::new() + .path(path.join("3")) + .build() + .unwrap(), + ), + ); let (tx, rx) = mpsc::channel(); let feature_gate = FeatureGate::default(); @@ -2705,36 +2652,29 @@ mod tests { } } - ( - factory, - engine, - ri_provider, - 
gc_runner, - vec![r1, r2, r3], - rx, - ) + (engine, ri_provider, gc_runner, vec![r1, r2, r3], rx) } #[test] fn test_gc_for_multi_rocksdb() { + let dir = Builder::new() + .prefix("test_gc_for_multi_rocksdb") + .tempdir() + .unwrap(); let store_id = 1; let put_start_ts = 100; let delete_start_ts = 150; - let (factory, mut engine, _ri_provider, mut gc_runner, regions, _) = - multi_gc_engine_setup(store_id, put_start_ts, delete_start_ts, true); + let (mut engine, _ri_provider, mut gc_runner, regions, _) = + multi_gc_engine_setup(dir.path(), store_id, put_start_ts, delete_start_ts, true); gc_runner.gc(regions[0].clone(), 200.into()).unwrap(); gc_runner.gc(regions[1].clone(), 200.into()).unwrap(); gc_runner.gc(regions[2].clone(), 200.into()).unwrap(); for region_id in 1..=3 { - let db = factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap() - .as_inner() - .clone(); - let cf = get_cf_handle(&db, CF_WRITE).unwrap(); + let region_engine = engine.engines.lock().unwrap()[®ion_id].clone(); + for i in 10 * (region_id - 1)..10 * region_id { let k = format!("k{:02}", i).into_bytes(); @@ -2745,19 +2685,30 @@ mod tests { let mut raw_k = vec![b'z']; let suffix = Key::from_raw(&k).append_ts((delete_start_ts + 1).into()); raw_k.extend_from_slice(suffix.as_encoded()); - assert!(db.get_cf(cf, &raw_k).unwrap().is_none()); + assert!( + region_engine + .kv_engine() + .unwrap() + .get_value_cf(CF_WRITE, &raw_k) + .unwrap() + .is_none() + ); } } } #[test] fn test_gc_keys_for_multi_rocksdb() { + let dir = Builder::new() + .prefix("test_gc_keys_for_multi_rocksdb") + .tempdir() + .unwrap(); let store_id = 1; let put_start_ts = 100; let delete_start_ts = 150; - let (factory, mut engine, ri_provider, mut gc_runner, ..) = - multi_gc_engine_setup(store_id, put_start_ts, delete_start_ts, true); + let (mut engine, ri_provider, mut gc_runner, ..) 
= + multi_gc_engine_setup(dir.path(), store_id, put_start_ts, delete_start_ts, true); let mut keys = Vec::new(); for i in 0..30 { @@ -2774,12 +2725,8 @@ mod tests { .unwrap(); for region_id in 1..=3 { - let db = factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap() - .as_inner() - .clone(); - let cf = get_cf_handle(&db, CF_WRITE).unwrap(); + let region_engine = engine.engines.lock().unwrap()[®ion_id].clone(); + for i in 10 * (region_id - 1)..10 * region_id { let k = format!("k{:02}", i).into_bytes(); let val = format!("value-{:02}", i).into_bytes(); @@ -2789,10 +2736,24 @@ mod tests { raw_k.extend_from_slice(suffix.as_encoded()); if i % 2 == 0 { - assert!(db.get_cf(cf, &raw_k).unwrap().is_some()); + assert!( + region_engine + .kv_engine() + .unwrap() + .get_value_cf(CF_WRITE, &raw_k) + .unwrap() + .is_some() + ); must_get_on_region(&mut engine, region_id, &k, delete_start_ts - 1, &val); } else { - assert!(db.get_cf(cf, &raw_k).unwrap().is_none()); + assert!( + region_engine + .kv_engine() + .unwrap() + .get_value_cf(CF_WRITE, &raw_k) + .unwrap() + .is_none() + ); must_get_none_on_region(&mut engine, region_id, &k, delete_start_ts - 1); } } @@ -2801,34 +2762,38 @@ mod tests { #[test] fn test_raw_gc_keys_for_multi_rocksdb() { - let store_id = 1; - // Building a tablet factory - let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let path = Builder::new() - .prefix("multi-rocks-raw-gc") + let dir = Builder::new() + .prefix("test_raw_gc_keys_for_multi_rocksdb") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let store_id = 1; + + let mut engine = MultiRocksEngine::default(); // Note: as the tablet split is not supported yet, we artificially divide the // region to: 1 ["", "k10"], 2 ["k10", ""] let r1 = init_region(b"", b"k10", 1, Some(store_id)); + engine.region_info.insert(1, r1.clone()); + 
engine.engines.lock().unwrap().insert( + 1, + PrefixedEngine( + TestEngineBuilder::new() + .path(dir.path().join("1")) + .build() + .unwrap(), + ), + ); let r2 = init_region(b"k10", b"", 2, Some(store_id)); - let _ = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let _ = factory - .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - - let mut region_info = HashMap::default(); - region_info.insert(1, r1.clone()); - region_info.insert(2, r2.clone()); - let mut engine = MultiRocksEngine { - factory, - region_info, - }; + engine.region_info.insert(2, r2.clone()); + engine.engines.lock().unwrap().insert( + 2, + PrefixedEngine( + TestEngineBuilder::new() + .path(dir.path().join("2")) + .build() + .unwrap(), + ), + ); let (tx, _rx) = mpsc::channel(); let ri_provider = Arc::new(MockRegionInfoProvider::new(vec![r1, r2])); @@ -2945,10 +2910,14 @@ mod tests { end_key: &[u8], exected_regions: Vec, ) { + let dir = Builder::new() + .prefix("test_destroy_range_for_multi_rocksdb_impl") + .tempdir() + .unwrap(); let store_id = 1; let put_start_ts = 100; - let (factory, mut engine, ri_provider, gc_runner, _, _rx) = - multi_gc_engine_setup(store_id, put_start_ts, 0, false); + let (mut engine, ri_provider, gc_runner, _, _rx) = + multi_gc_engine_setup(dir.path(), store_id, put_start_ts, 0, false); let start_key = Key::from_raw(start_key); let end_key = Key::from_raw(end_key); @@ -2960,12 +2929,7 @@ mod tests { let mut regions = BTreeSet::new(); for region_id in 1..=3 { - let db = factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap() - .as_inner() - .clone(); - let cf = get_cf_handle(&db, CF_WRITE).unwrap(); + let region_engine = engine.engines.lock().unwrap()[®ion_id].clone(); for i in 10 * (region_id - 1)..10 * region_id { let k = format!("k{:02}", i).into_bytes(); @@ -2978,10 +2942,24 @@ mod tests { if start_key <= key && key < end_key { 
regions.insert(region_id); - assert!(db.get_cf(cf, &raw_k).unwrap().is_none()); + assert!( + region_engine + .kv_engine() + .unwrap() + .get_value_cf(CF_WRITE, &raw_k) + .unwrap() + .is_none() + ); must_get_none_on_region(&mut engine, region_id, &k, put_start_ts + 10); } else { - assert!(db.get_cf(cf, &raw_k).unwrap().is_some()); + assert!( + region_engine + .kv_engine() + .unwrap() + .get_value_cf(CF_WRITE, &raw_k) + .unwrap() + .is_some() + ); must_get_on_region(&mut engine, region_id, &k, put_start_ts + 10, &val); } } diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index be57873b68c..e8b85d37d66 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -1459,6 +1459,9 @@ mod tests { } #[test] + // FIXME: Either implement storage::kv traits for all engine types, or avoid using raw engines + // in this test. + #[cfg(feature = "test-engine-kv-rocksdb")] fn test_out_of_sync_max_ts() { use engine_test::kv::KvTestEngineIterator; use engine_traits::{IterOptions, ReadOptions}; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 5c573b6e809..043e3ad2d23 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -68,47 +68,47 @@ sse = ["tikv/sse"] portable = ["tikv/portable"] [dependencies] -api_version = { path = "../components/api_version", default-features = false } +api_version = { workspace = true } async-trait = "0.1" -batch-system = { path = "../components/batch-system", default-features = false } -cdc = { path = "../components/cdc", default-features = false } -collections = { path = "../components/collections" } +batch-system = { workspace = true } +cdc = { workspace = true } +collections = { workspace = true } crc64fast = "0.1" crossbeam = "0.8" -encryption = { path = "../components/encryption", default-features = false } -engine_rocks_helper = { path = "../components/engine_rocks_helper" } -error_code = { path = "../components/error_code", default-features = false } +encryption = { 
workspace = true } +engine_rocks_helper = { workspace = true } +error_code = { workspace = true } fail = "0.5" -file_system = { path = "../components/file_system" } +file_system = { workspace = true } futures = "0.3" -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git" } libc = "0.2" -log_wrappers = { path = "../components/log_wrappers" } +log_wrappers = { workspace = true } more-asserts = "0.2" -online_config = { path = "../components/online_config", default-features = false } +online_config = { workspace = true } paste = "1.0" -pd_client = { path = "../components/pd_client", default-features = false } +pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -raft_log_engine = { path = "../components/raft_log_engine", default-features = false } -raftstore = { path = "../components/raftstore", default-features = false } +raft_log_engine = { workspace = true } +raftstore = { workspace = true } rand = "0.8.3" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" -tidb_query_aggr = { path = "../components/tidb_query_aggr", default-features = false } -tidb_query_common = { path = "../components/tidb_query_common", default-features = false } -tidb_query_datatype = { path = "../components/tidb_query_datatype", default-features = false } -tidb_query_executors = { path = "../components/tidb_query_executors", default-features = false } -tidb_query_expr = { path = "../components/tidb_query_expr", default-features = false } -tikv = { path = "../", default-features = false } 
-tikv_util = { path = "../components/tikv_util", default-features = false } +tidb_query_aggr = { workspace = true } +tidb_query_common = { workspace = true } +tidb_query_datatype = { workspace = true } +tidb_query_executors = { workspace = true } +tidb_query_expr = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } time = "0.1" tipb = { git = "https://github.com/pingcap/tipb.git" } toml = "0.5" -txn_types = { path = "../components/txn_types", default-features = false } +txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } [target.'cfg(target_os = "linux")'.dependencies] @@ -119,34 +119,34 @@ arrow = "13.0" byteorder = "1.2" # See https://bheisler.github.io/criterion.rs/book/user_guide/known_limitations.html for the usage # of `real_blackbox` feature. -causal_ts = { path = "../components/causal_ts" } -concurrency_manager = { path = "../components/concurrency_manager", default-features = false } +causal_ts = { workspace = true } +concurrency_manager = { workspace = true } criterion = "0.3" criterion-cpu-time = "0.1" -engine_rocks = { path = "../components/engine_rocks", default-features = false } -engine_test = { path = "../components/engine_test", default-features = false } -engine_traits = { path = "../components/engine_traits", default-features = false } -external_storage_export = { path = "../components/external_storage/export", default-features = false } -file_system = { path = "../components/file_system" } +engine_rocks = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +external_storage_export = { workspace = true } +file_system = { workspace = true } hyper = { version = "0.14", default-features = false, features = ["runtime"] } -keys = { path = "../components/keys", default-features = false } -panic_hook = { path = "../components/panic_hook" } -profiler = { path = "../components/profiler" } +keys = { workspace = true } +panic_hook = { workspace 
= true } +profiler = { workspace = true } rand_xorshift = "0.3" -resource_metering = { path = "../components/resource_metering" } -security = { path = "../components/security", default-features = false } +resource_metering = { workspace = true } +security = { workspace = true } serde_json = "1.0" -sst_importer = { path = "../components/sst_importer", default-features = false } -test_backup = { path = "../components/test_backup", default-features = false } -test_coprocessor = { path = "../components/test_coprocessor", default-features = false } -test_pd = { path = "../components/test_pd", default-features = false } -test_pd_client = { path = "../components/test_pd_client" } -test_raftstore = { path = "../components/test_raftstore", default-features = false } -test_sst_importer = { path = "../components/test_sst_importer", default-features = false } -test_storage = { path = "../components/test_storage", default-features = false } -test_util = { path = "../components/test_util", default-features = false } -tidb_query_datatype = { path = "../components/tidb_query_datatype", default-features = false } -tipb_helper = { path = "../components/tipb_helper", default-features = false } +sst_importer = { workspace = true } +test_backup = { workspace = true } +test_coprocessor = { workspace = true } +test_pd = { workspace = true } +test_pd_client = { workspace = true } +test_raftstore = { workspace = true } +test_sst_importer = { workspace = true } +test_storage = { workspace = true } +test_util = { workspace = true } +tidb_query_datatype = { workspace = true } +tipb_helper = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } [target.'cfg(all(target_os = "linux", target_arch = "x86_64"))'.dev-dependencies] From 08b5a4a6e5197143da1f592476da3203748a01b1 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 19 Oct 2022 14:51:54 +0800 Subject: [PATCH 276/676] raftstore-v2: test_read is not stable (#13626) 
close tikv/tikv#13625 Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/query/local.rs | 104 ++++++++++-------- 1 file changed, 61 insertions(+), 43 deletions(-) diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 5986d3e4596..12df1e7926f 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -178,7 +178,8 @@ where let mut err = errorpb::Error::default(); err.set_message(format!( - "Fail to get snapshot from LocalReader for region {}. Maybe due to `not leader` or `not applied to the current term`", + "Fail to get snapshot from LocalReader for region {}. \ + Maybe due to `not leader`, `region not found` or `not applied to the current term`", region_id )); let mut resp = RaftCmdResponse::default(); @@ -496,22 +497,28 @@ mod tests { .collect() } - #[test] - fn test_read() { - // It mocks that local reader communications with raftstore. - // rx receives msgs like raftstore, then call f() to do something (such as renew - // lease or something), then send the result back to the local reader through ch - fn handle_msg( - f: F, - rx: Receiver<(u64, PeerMsg)>, - ch_tx: SyncSender>, - ) -> JoinHandle<()> { - thread::spawn(move || { - // Msg for query will be sent + // It mocks that local reader communications with raftstore. + // mix_rx receives a closure, msg receiver, and sender of the msg receiver + // - closure: do some update such as renew lease or something which we could do + // in real raftstore + // - msg receiver: receives the msg from local reader + // - sender of the msg receiver: send the msg receiver out of the thread so that + // we can use it again. 
+ fn mock_raftstore( + mix_rx: Receiver<( + Box, + Receiver<(u64, PeerMsg)>, + SyncSender>, + )>, + ) -> JoinHandle<()> { + thread::spawn(move || { + while let Ok((f, rx, ch_tx)) = mix_rx.recv() { + // Receives msg from local reader let (_, msg) = rx.recv().unwrap(); - f(); + match msg { + // send the result back to local reader PeerMsg::RaftQuery(query) => ReadCallback::set_result( query.ch, QueryResult::Read(ReadResponse { @@ -522,9 +529,12 @@ mod tests { _ => unreachable!(), } ch_tx.send(rx).unwrap(); - }) - } + } + }) + } + #[test] + fn test_read() { let store_id = 1; // Building a tablet factory @@ -538,6 +548,8 @@ mod tests { let store_meta = Arc::new(Mutex::new(StoreMeta::new())); let (mut reader, mut rx) = new_reader(store_id, store_meta.clone()); + let (mix_tx, mix_rx) = sync_channel(1); + let handler = mock_raftstore(mix_rx); let mut region1 = metapb::Region::default(); region1.set_id(1); @@ -552,7 +564,7 @@ mod tests { let leader2 = prs[0].clone(); region1.set_region_epoch(epoch13.clone()); let term6 = 6; - let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); + let mut lease = Lease::new(Duration::seconds(10), Duration::milliseconds(2500)); let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); let mut cmd = RaftCmdRequest::default(); @@ -622,17 +634,20 @@ mod tests { // Case: Applied term not match let store_meta_clone = store_meta.clone(); - let handler = handle_msg( - move || { - let mut meta = store_meta_clone.lock().unwrap(); - meta.readers - .get_mut(&1) - .unwrap() - .update(ReadProgress::applied_term(term6)); - }, - rx, - ch_tx.clone(), - ); + // Send what we want to do to mock raftstore + mix_tx + .send(( + Box::new(move || { + let mut meta = store_meta_clone.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .update(ReadProgress::applied_term(term6)); + }), + rx, + ch_tx.clone(), + )) + .unwrap(); // The first try will be rejected due to unmatched applied term but after update // 
the applied term by the above thread, the snapshot will be acquired by // retrying. @@ -646,23 +661,25 @@ mod tests { TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.applied_term.get()), 1 ); - handler.join().unwrap(); rx = ch_rx.recv().unwrap(); // Case: Expire lease to make the local reader lease check fail. lease.expire_remote_lease(); let remote = lease.maybe_new_remote_lease(term6).unwrap(); - let handler = handle_msg( - move || { - let mut meta = store_meta.lock().unwrap(); - meta.readers - .get_mut(&1) - .unwrap() - .update(ReadProgress::leader_lease(remote)); - }, - rx, - ch_tx.clone(), - ); + // Send what we want to do to mock raftstore + mix_tx + .send(( + Box::new(move || { + let mut meta = store_meta.lock().unwrap(); + meta.readers + .get_mut(&1) + .unwrap() + .update(ReadProgress::leader_lease(remote)); + }), + rx, + ch_tx.clone(), + )) + .unwrap(); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); // Updating lease makes cache miss. assert_eq!( @@ -673,15 +690,13 @@ mod tests { TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), 1 ); - handler.join().unwrap(); rx = ch_rx.recv().unwrap(); // Case: Read quorum. 
let mut cmd_read_quorum = cmd.clone(); cmd_read_quorum.mut_header().set_read_quorum(true); - let handler = handle_msg(|| {}, rx, ch_tx); + mix_tx.send((Box::new(move || {}), rx, ch_tx)).unwrap(); let _ = block_on(reader.snapshot(cmd_read_quorum.clone())).unwrap(); - handler.join().unwrap(); ch_rx.recv().unwrap(); // Case: Stale read @@ -709,6 +724,9 @@ mod tests { assert_eq!(read_progress.safe_ts(), 2); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert_eq!(*snap.get_region(), region1); + + drop(mix_tx); + handler.join().unwrap(); } #[test] From da4877567c0ff11a863d6e39923e8c9819064a57 Mon Sep 17 00:00:00 2001 From: haojinming Date: Thu, 20 Oct 2022 11:53:54 +0800 Subject: [PATCH 277/676] test: Add rawkv empty key/value test (#13636) close tikv/tikv#13635 Signed-off-by: haojinming --- src/storage/mod.rs | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index e2192573dea..84d52b6990a 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -5244,7 +5244,13 @@ mod tests { fn test_raw_v2_multi_versions() { // Test update on the same key to verify multi-versions implementation of RawKV // V2. 
- let test_data = vec![Some(b"v1"), Some(b"v2"), None, Some(b"v3")]; + let test_data = vec![ + Some(b"v1".to_vec()), + Some(b"v2".to_vec()), + None, + Some(b"".to_vec()), + Some(b"v3".to_vec()), + ]; let k = b"r\0k".to_vec(); let storage = TestStorageBuilder::<_, _, ApiV2>::new(DummyLockManager) @@ -5256,7 +5262,11 @@ mod tests { ..Default::default() }; - let last_data = test_data.last().unwrap().map(|x| (k.clone(), x.to_vec())); + let last_data = test_data + .last() + .unwrap() + .as_ref() + .map(|x| (k.clone(), x.clone())); for v in test_data { if let Some(v) = v { storage @@ -5264,7 +5274,7 @@ mod tests { ctx.clone(), "".to_string(), k.clone(), - v.to_vec(), + v.clone(), 0, expect_ok_callback(tx.clone(), 0), ) @@ -5272,7 +5282,7 @@ mod tests { rx.recv().unwrap(); expect_value( - v.to_vec(), + v.clone(), block_on(storage.raw_get(ctx.clone(), "".to_string(), k.clone())).unwrap(), ); } else { @@ -5543,12 +5553,19 @@ mod tests { ..Default::default() }; + let empty_key = if F::TAG == ApiVersion::V2 { + b"r".to_vec() + } else { + b"".to_vec() + }; let test_data = vec![ + (empty_key.clone(), b"ff".to_vec(), 10), // empty key (b"r\0a".to_vec(), b"aa".to_vec(), 10), (b"r\0b".to_vec(), b"bb".to_vec(), 20), (b"r\0c".to_vec(), b"cc".to_vec(), 30), (b"r\0d".to_vec(), b"dd".to_vec(), 0), (b"r\0e".to_vec(), b"ee".to_vec(), 40), + (b"r\0g".to_vec(), b"".to_vec(), 50), // empty value ]; let kvpairs = test_data @@ -5601,7 +5618,7 @@ mod tests { block_on(storage.raw_scan( ctx, "".to_string(), - b"r".to_vec(), + empty_key, Some(b"rz".to_vec()), 20, false, From 262c1b840773b468ee7857725c305c16c6d5d047 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Thu, 20 Oct 2022 23:57:54 -0700 Subject: [PATCH 278/676] raftstore-v2: add read handling after apply (#13565) ref tikv/tikv#12842 Renew lease if possible after write proposal committed. Respond to read index if any pending reads after apply. 
Signed-off-by: qi.xu Signed-off-by: tonyxuqqi Co-authored-by: qi.xu --- components/raftstore-v2/src/fsm/peer.rs | 2 +- .../raftstore-v2/src/operation/command/mod.rs | 7 +- .../raftstore-v2/src/operation/query/lease.rs | 64 +++++++++++-------- .../raftstore-v2/src/operation/query/mod.rs | 64 +++++++++++++++---- .../raftstore-v2/src/operation/ready/mod.rs | 20 +++++- components/raftstore-v2/src/raft/peer.rs | 27 ++++++++ .../tests/integrations/cluster.rs | 14 +++- .../tests/integrations/test_read.rs | 57 ++++++++++++----- .../src/coprocessor/split_check/size.rs | 16 ++--- components/raftstore/src/store/peer.rs | 2 +- components/raftstore/src/store/util.rs | 10 +++ 11 files changed, 208 insertions(+), 75 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 389e59f0ee4..6b9cccc8b84 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -219,7 +219,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.on_command(cmd.request, cmd.ch) } PeerMsg::Tick(tick) => self.on_tick(tick), - PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(res), + PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index bef599d5239..fcfeb29fbe2 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -223,7 +223,7 @@ impl Peer { .send(ApplyTask::CommittedEntries(apply)); } - pub fn on_apply_res(&mut self, apply_res: ApplyRes) { + pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { if !self.serving() { return; } @@ -235,6 +235,7 @@ impl Peer { self.raft_group_mut() .advance_apply_to(apply_res.applied_index); let is_leader = 
self.is_leader(); + let progress_to_be_updated = self.entry_storage().applied_term() != apply_res.applied_term; let entry_storage = self.entry_storage_mut(); entry_storage .apply_state_mut() @@ -242,10 +243,8 @@ impl Peer { entry_storage.set_applied_term(apply_res.applied_term); if !is_leader { entry_storage.compact_entry_cache(apply_res.applied_index + 1); - // TODO: handle read. - } else { - // TODO: handle read. } + self.handle_read_on_apply(ctx, apply_res, progress_to_be_updated); } } diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 00a485c8460..1ae4aecd1cc 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -175,38 +175,50 @@ impl Peer { self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { - let mut meta = store_meta.lock().unwrap(); - // TODO: remove this block of code when snapshot is done; add the logic into - // on_persist_snapshot. 
- let reader = meta.readers.get_mut(&self.region_id()); - if reader.is_none() { - let region = self.region().clone(); - let region_id = region.get_id(); - let peer_id = self.peer_id(); - let delegate = ReadDelegate { - region: Arc::new(region), - peer_id, - term: self.term(), - applied_term: self.entry_storage().applied_term(), - leader_lease: None, - last_valid_ts: Timespec::new(0, 0), - tag: format!("[region {}] {}", region_id, peer_id), - read_progress: self.read_progress().clone(), - pending_remove: false, - bucket_meta: None, - txn_extra_op: Default::default(), - txn_ext: Default::default(), - track_ver: TrackVer::new(), - }; - meta.readers.insert(self.region_id(), delegate); - } + // TODO: remove it + self.add_reader_if_necessary(store_meta); + let mut meta = store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); } } - fn maybe_update_read_progress(&self, reader: &mut ReadDelegate, progress: ReadProgress) { + // TODO: remove this block of code when snapshot is done; add the logic into + // on_persist_snapshot. + pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &mut Arc>>) { + let mut meta = store_meta.lock().unwrap(); + // TODO: remove this block of code when snapshot is done; add the logic into + // on_persist_snapshot. 
+ let reader = meta.readers.get_mut(&self.region_id()); + if reader.is_none() { + let region = self.region().clone(); + let region_id = region.get_id(); + let peer_id = self.peer_id(); + let delegate = ReadDelegate { + region: Arc::new(region), + peer_id, + term: self.term(), + applied_term: self.entry_storage().applied_term(), + leader_lease: None, + last_valid_ts: Timespec::new(0, 0), + tag: format!("[region {}] {}", region_id, peer_id), + read_progress: self.read_progress().clone(), + pending_remove: false, + bucket_meta: None, + txn_extra_op: Default::default(), + txn_ext: Default::default(), + track_ver: TrackVer::new(), + }; + meta.readers.insert(self.region_id(), delegate); + } + } + + pub(crate) fn maybe_update_read_progress( + &self, + reader: &mut ReadDelegate, + progress: ReadProgress, + ) { debug!( self.logger, "update read progress"; diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 0b10e0679a5..960e667c7d9 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -18,14 +18,16 @@ use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, StatusCmdType}, + raft_serverpb::RaftApplyState, }; use raft::Ready; use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, local_metrics::RaftMetrics, metrics::RAFT_READ_INDEX_PENDING_COUNT, - msg::ErrorCallback, region_meta::RegionMeta, util, util::LeaseState, GroupState, - ReadIndexContext, RequestPolicy, Transport, + cmd_resp, fsm::ApplyMetrics, local_metrics::RaftMetrics, + metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ErrorCallback, region_meta::RegionMeta, util, + util::LeaseState, GroupState, ReadCallback, ReadIndexContext, ReadProgress, RequestPolicy, + Transport, }, Error, Result, }; @@ -38,7 +40,8 @@ use crate::{ fsm::PeerFsmDelegate, raft::Peer, router::{ - message::RaftRequest, DebugInfoChannel, 
PeerMsg, QueryResChannel, QueryResult, ReadResponse, + message::RaftRequest, ApplyRes, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, + ReadResponse, }, }; @@ -56,12 +59,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> return Ok(RequestPolicy::ReadIndex); } - // If applied index's term differs from current raft's term, leader - // transfer must happened, if read locally, we may read old value. - // TODO: to add the block back when apply is implemented. - // if !self.fsm.peer().has_applied_to_current_term() { - // return Ok(RequestPolicy::ReadIndex); - // } + // If applied index's term is differ from current raft's term, leader transfer + // must happened, if read locally, we may read old value. + if !self.fsm.peer().applied_to_current_term() { + return Ok(RequestPolicy::ReadIndex); + } match self.fsm.peer_mut().inspect_lease() { LeaseState::Valid => Ok(RequestPolicy::ReadLocal), @@ -218,9 +220,10 @@ impl Peer { } } - // TODO: add ready_to_handle_read for splitting and merging - while let Some(mut read) = self.pending_reads_mut().pop_front() { - self.respond_read_index(&mut read, ctx); + if self.ready_to_handle_read() { + while let Some(mut read) = self.pending_reads_mut().pop_front() { + self.respond_read_index(&mut read, ctx); + } } } @@ -377,4 +380,39 @@ impl Peer { .unwrap(); ch.set_result(meta); } + + // the v1's post_apply + // As the logic is mostly for read, rename it to handle_read_after_apply + pub fn handle_read_on_apply( + &mut self, + ctx: &mut StoreContext, + apply_res: ApplyRes, + progress_to_be_updated: bool, + ) { + // TODO: add is_handling_snapshot check + // it could update has_ready + + // TODO: add peer_stat(for PD hotspot scheduling) and deleted_keys_hint + if !self.is_leader() { + self.post_pending_read_index_on_replica(ctx) + } else if self.ready_to_handle_read() { + while let Some(mut read) = self.pending_reads_mut().pop_front() { + self.respond_read_index(&mut read, ctx); + } + } + 
self.pending_reads_mut().gc(); + self.read_progress_mut() + .update_applied_core(apply_res.applied_index); + + // Only leaders need to update applied_term. + if progress_to_be_updated && self.is_leader() { + // TODO: add coprocessor_host hook + let progress = ReadProgress::applied_term(apply_res.applied_term); + // TODO: remove it + self.add_reader_if_necessary(&mut ctx.store_meta); + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + self.maybe_update_read_progress(reader, progress); + } + } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index e20192394a6..3129dcfb832 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -28,6 +28,7 @@ use protobuf::Message as _; use raft::{eraftpb, Ready}; use raftstore::store::{util, ExtraStates, FetchedLogs, Transport, WriteTask}; use slog::{debug, error, trace, warn}; +use tikv_util::time::{duration_to_sec, monotonic_raw_now}; pub use self::async_writer::AsyncWriter; use crate::{ @@ -228,7 +229,24 @@ impl Peer { // TODO: skip handling committed entries if a snapshot is being applied // asynchronously. if self.is_leader() { - // TODO: Update lease + for entry in committed_entries.iter().rev() { + // TODO: handle raft_log_size_hint + let propose_time = self + .proposals() + .find_propose_time(entry.get_term(), entry.get_index()); + if let Some(propose_time) = propose_time { + // We must renew current_time because this value may be created a long time ago. + // If we do not renew it, this time may be smaller than propose_time of a + // command, which was proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
+ ctx.current_time.replace(monotonic_raw_now()); + ctx.raft_metrics.commit_log.observe(duration_to_sec( + (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), + )); + self.maybe_renew_leader_lease(propose_time, &mut ctx.store_meta, None); + break; + } + } } self.schedule_apply_committed_entries(ctx, committed_entries); } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index e7ee6e7465a..650c410cef9 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -187,6 +187,11 @@ impl Peer { &self.read_progress } + #[inline] + pub fn read_progress_mut(&mut self) -> &mut Arc { + &mut self.read_progress + } + #[inline] pub fn leader_lease(&self) -> &Lease { &self.leader_lease @@ -382,6 +387,28 @@ impl Peer { } #[inline] + pub fn proposals(&self) -> &ProposalQueue> { + &self.proposals + } + + #[inline] + pub fn ready_to_handle_read(&self) -> bool { + // TODO: It may cause read index to wait a long time. + + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. + self.applied_to_current_term() + // There may be stale read if the old leader splits really slow, + // the new region may already elected a new leader while + // the old leader still think it owns the split range. + && !self.is_splitting() + // There may be stale read if a target leader is in another store and + // applied commit merge, written new values, but the sibling peer in + // this store does not apply commit merge, so the leader is not ready + // to read, until the merge is rollbacked. 
+ && !self.is_merging() + } + pub fn apply_scheduler(&self) -> &ApplyScheduler { self.apply_scheduler.as_ref().unwrap() } diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index d46ff09f2b1..554db96acbf 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -323,12 +323,16 @@ pub struct Cluster { impl Default for Cluster { fn default() -> Cluster { - Cluster::with_node_count(1) + Cluster::with_node_count(1, None) } } impl Cluster { - pub fn with_node_count(count: usize) -> Self { + pub fn with_config(config: Config) -> Cluster { + Cluster::with_node_count(1, Some(config)) + } + + pub fn with_node_count(count: usize, config: Option) -> Self { let pd_server = test_pd::Server::new(1); let mut cluster = Cluster { pd_server, @@ -336,7 +340,11 @@ impl Cluster { receivers: vec![], routers: vec![], }; - let mut cfg = v2_default_config(); + let mut cfg = if let Some(config) = config { + config + } else { + v2_default_config() + }; disable_all_auto_ticks(&mut cfg); for _ in 1..=count { let mut node = TestNode::with_pd(&cluster.pd_server); diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index bb7156c6af7..9f3c5c2c03a 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -2,14 +2,17 @@ use futures::executor::block_on; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, ReadIndexRequest, Request, StatusCmdType}; -use tikv_util::store::new_peer; +use raftstore_v2::router::PeerMsg; +use tikv_util::{config::ReadableDuration, store::new_peer}; use txn_types::WriteBatchFlags; -use crate::cluster::Cluster; +use crate::cluster::{v2_default_config, Cluster}; #[test] fn test_read_index() { - let cluster = Cluster::default(); + let mut config = v2_default_config(); + 
config.raft_store_max_leader_lease = ReadableDuration::millis(150); + let cluster = Cluster::with_config(config); let router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; @@ -20,7 +23,7 @@ fn test_read_index() { let res = router.query(region_id, req.clone()).unwrap(); let status_resp = res.response().unwrap().get_status_response(); let detail = status_resp.get_region_detail(); - let mut region = detail.get_region().clone(); + let region = detail.get_region().clone(); let read_index_req = ReadIndexRequest::default(); let mut req = RaftCmdRequest::default(); @@ -28,7 +31,7 @@ fn test_read_index() { req.mut_header().set_term(7); req.mut_header().set_region_id(region_id); req.mut_header() - .set_region_epoch(region.take_region_epoch()); + .set_region_epoch(region.get_region_epoch().clone()); let mut request_inner = Request::default(); request_inner.set_cmd_type(CmdType::Snap); request_inner.set_read_index(read_index_req); @@ -37,7 +40,38 @@ fn test_read_index() { let resp = res.read().unwrap(); assert_eq!(resp.read_index, 6); // single node commited index should be 6. - // TODO: add more test when write is implemented. + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + // Since it's still with the lease, read index will be skipped. 
+ assert_eq!(resp.read_index, 0); + + std::thread::sleep(std::time::Duration::from_millis(200)); + // the read lease should be expired + let res = router.query(region_id, req.clone()).unwrap(); + let resp = res.read().unwrap(); + assert_eq!(resp.read_index, 6); + + std::thread::sleep(std::time::Duration::from_millis(200)); + let read_req = req.clone(); + // the read lease should be expired and renewed by write + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(b"key".to_vec()); + put_req.mut_put().set_value(b"value".to_vec()); + req.mut_requests().push(put_req); + + let (msg, sub) = PeerMsg::raft_command(req.clone()); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + let res = router.query(region_id, read_req).unwrap(); + let resp = res.read().unwrap(); + assert_eq!(resp.read_index, 0); } #[test] @@ -217,16 +251,7 @@ fn test_local_read() { request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); - // FIXME: Get snapshot from local reader, but it will fail as the leader has not - // applied in the current term (due to unimplementation of ApplyRes). 
- let resp = block_on(async { router.get_snapshot(req.clone()).await.unwrap_err() }); - assert!( - resp.get_header() - .get_error() - .get_message() - .contains("Fail to get snapshot ") - ); - + block_on(async { router.get_snapshot(req.clone()).await.unwrap() }); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); // The read index will be 0 as the retry process in the `get_snapshot` will diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 44318a27b60..bdcf817365c 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -615,7 +615,7 @@ pub mod tests { }; let cop_host = CoprocessorHost::new(tx.clone(), cfg); let mut runnable = SplitCheckRunner::new(engine.clone(), tx, cop_host.clone()); - for i in 0..2000 { + for i in 0..1000 { // if not mvcc, kv size is (6+1)*2 = 14, given bucket size is 3000, expect each // bucket has about 210 keys if mvcc, kv size is about 18*2 = 36, expect each // bucket has about 80 keys @@ -646,7 +646,7 @@ pub mod tests { let end = format!("{:04}", 20).into_bytes(); // insert keys into 0000 ~ 0020 with 000000 ~ 002000 - for i in 0..2000 { + for i in 0..1000 { // kv size is (6+1)*2 = 14, given bucket size is 3000, expect each bucket has // about 210 keys if mvcc, kv size is about 18*2 = 36, expect each bucket has // about 80 keys @@ -667,9 +667,9 @@ pub mod tests { assert_eq!(host.policy(), CheckPolicy::Approximate); if !mvcc { - must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 150, 450, mvcc); + must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 75, 225, mvcc); } else { - must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 70, 150, mvcc); + must_generate_buckets_approximate(&rx, Some(BucketRange(start, end)), 35, 85, mvcc); } drop(rx); } @@ -685,16 +685,12 @@ pub mod tests { #[test] fn 
test_generate_bucket_by_approximate() { - for cf in LARGE_CFS { - test_generate_bucket_impl(LARGE_CFS, cf, false); - } + test_generate_bucket_impl(LARGE_CFS, CF_WRITE, false); } #[test] fn test_generate_bucket_mvcc_by_approximate() { - for cf in LARGE_CFS { - test_generate_bucket_impl(LARGE_CFS, cf, true); - } + test_generate_bucket_impl(LARGE_CFS, CF_DEFAULT, true); } #[test] diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 37c2fd5a99a..522b68e2f09 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -153,7 +153,7 @@ impl ProposalQueue { }) } - fn find_propose_time(&self, term: u64, index: u64) -> Option { + pub fn find_propose_time(&self, term: u64, index: u64) -> Option { self.queue .binary_search_by_key(&(term, index), |p: &Proposal<_>| (p.term, p.index)) .ok() diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 5f2c6615527..a4b48e4ba37 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1033,6 +1033,16 @@ impl RegionReadProgress { } } + // TODO: remove it when coprocessor hook is implemented in v2. 
+ pub fn update_applied_core(&self, applied: u64) { + let mut core = self.core.lock().unwrap(); + if let Some(ts) = core.update_applied(applied) { + if !core.pause { + self.safe_ts.store(ts, AtomicOrdering::Release); + } + } + } + pub fn update_safe_ts(&self, apply_index: u64, ts: u64) { if apply_index == 0 || ts == 0 { return; From 92aaf95e0dc83f855cf6d9bfd046c2ea0dfc9f3d Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 21 Oct 2022 16:53:55 +0800 Subject: [PATCH 279/676] cdc: observe the slowest resolved ts lag (#13647) close tikv/tikv#13646 cdc: observe the slowest resolved ts lag Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/delegate.rs | 2 -- components/cdc/src/endpoint.rs | 8 +++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index da12c1521d6..de38a7b1fc8 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -423,8 +423,6 @@ impl Delegate { let resolved_ts = resolver.resolve(min_ts); debug!("cdc resolved ts updated"; "region_id" => self.region_id, "resolved_ts" => resolved_ts); - CDC_RESOLVED_TS_GAP_HISTOGRAM - .observe((min_ts.physical() - resolved_ts.physical()) as f64 / 1000f64); Some(resolved_ts) } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 26c0a11371e..7542bb1bfc8 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -65,7 +65,7 @@ use crate::{ }; const FEATURE_RESOLVED_TS_STORE: Feature = Feature::require(5, 0, 0); -const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s +const METRICS_FLUSH_INTERVAL: u64 = 1_000; // 1s // 10 minutes, it's the default gc life time of TiDB // and is long enough for most transactions. 
const WARN_RESOLVED_TS_LAG_THRESHOLD: Duration = Duration::from_secs(600); @@ -1247,6 +1247,12 @@ impl, E: KvEngine> RunnableWithTimer for Endpoin .physical() .saturating_sub(self.min_resolved_ts.physical()) as i64, ); + CDC_RESOLVED_TS_GAP_HISTOGRAM.observe( + self.current_ts + .physical() + .saturating_sub(self.min_resolved_ts.physical()) as f64 + / 1000f64, + ); } self.min_resolved_ts = TimeStamp::max(); self.current_ts = TimeStamp::max(); From 71af2905ad8fdd6e3aade27eb66fac06e77d6cec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 24 Oct 2022 17:03:56 +0800 Subject: [PATCH 280/676] log-backup: Fix Initial Scan Racing (#13628) close tikv/tikv#13616 Checking the resolver version by observer ID. Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/event_loader.rs | 27 ++++++-- .../backup-stream/src/subscription_manager.rs | 5 +- components/backup-stream/tests/mod.rs | 66 +++++++++++++++++-- 3 files changed, 85 insertions(+), 13 deletions(-) diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 90a330cf446..27c05b5b875 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -10,7 +10,7 @@ use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ - coprocessor::RegionInfoProvider, + coprocessor::{ObserveHandle, RegionInfoProvider}, router::RaftStoreRouter, store::{fsm::ChangeObserver, Callback, SignificantMsg}, }; @@ -335,17 +335,19 @@ where Ok(snap) } - pub fn with_resolver( + fn with_resolver( &self, region: &Region, + handle: &ObserveHandle, f: impl FnOnce(&mut TwoPhaseResolver) -> Result, ) -> Result { - Self::with_resolver_by(&self.tracing, region, f) + Self::with_resolver_by(&self.tracing, region, handle, f) } - pub fn with_resolver_by( 
+ fn with_resolver_by( tracing: &SubscriptionTracer, region: &Region, + handle: &ObserveHandle, f: impl FnOnce(&mut TwoPhaseResolver) -> Result, ) -> Result { let region_id = region.get_id(); @@ -353,6 +355,8 @@ where .get_subscription_of(region_id) .ok_or_else(|| Error::Other(box_err!("observer for region {} canceled", region_id))) .and_then(|v| { + // NOTE: once we have compared the observer handle, perhaps we can remove this + // check because epoch version changed implies observer handle changed. raftstore::store::util::compare_region_epoch( region.get_region_epoch(), &v.value().meta, @@ -362,6 +366,10 @@ where true, false, )?; + if v.value().handle().id != handle.id { + return Err(box_err!("stale observe handle {:?}, should be {:?}, perhaps new initial scanning starts", + handle.id, v.value().handle().id)); + } Ok(v) }) .map_err(|err| Error::Contextual { @@ -379,6 +387,7 @@ where fn scan_and_async_send( &self, region: &Region, + handle: &ObserveHandle, mut event_loader: EventLoader, join_handles: &mut Vec>, ) -> Result { @@ -401,7 +410,9 @@ where // and we would exit after the first run of loop :( let no_progress = event_loader.entry_batch.is_empty(); let stat = stat?; - self.with_resolver(region, |r| event_loader.emit_entries_to(&mut events, r))?; + self.with_resolver(region, handle, |r| { + event_loader.emit_entries_to(&mut events, r) + })?; if no_progress { metrics::INITIAL_SCAN_DURATION.observe(start.saturating_elapsed_secs()); return Ok(stats.stat); @@ -429,6 +440,8 @@ where pub fn do_initial_scan( &self, region: &Region, + // We are using this handle for checking whether the initial scan is stale. + handle: ObserveHandle, start_ts: TimeStamp, snap: impl Snapshot, ) -> Result { @@ -440,13 +453,13 @@ where // It is ok to sink more data than needed. So scan to +inf TS for convenance. 
let event_loader = EventLoader::load_from(snap, start_ts, TimeStamp::max(), region)?; - let stats = self.scan_and_async_send(region, event_loader, &mut join_handles)?; + let stats = self.scan_and_async_send(region, &handle, event_loader, &mut join_handles)?; Handle::current() .block_on(futures::future::try_join_all(join_handles)) .map_err(|err| annotate!(err, "tokio runtime failed to join consuming threads"))?; - Self::with_resolver_by(&tr, region, |r| { + Self::with_resolver_by(&tr, region, &handle, |r| { r.phase_one_done(); Ok(()) }) diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index d47974bcd42..83181829b43 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -128,12 +128,15 @@ where handle: ObserveHandle, ) -> Result { let region_id = region.get_id(); + let h = handle.clone(); // Note: we have external retry at `ScanCmd::exec_by_with_retry`, should we keep // retrying here? 
let snap = self.observe_over_with_retry(region, move || { ChangeObserver::from_pitr(region_id, handle.clone()) })?; - let stat = self.do_initial_scan(region, start_ts, snap)?; + #[cfg(feature = "failpoints")] + fail::fail_point!("scan_after_get_snapshot"); + let stat = self.do_initial_scan(region, h, start_ts, snap)?; Ok(stat) } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 6e902fb1e08..284f1605c30 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -100,6 +100,7 @@ pub struct SuiteBuilder { name: String, nodes: usize, metastore_error: Box Result<()> + Send + Sync>, + cfg: Box, } impl SuiteBuilder { @@ -108,6 +109,9 @@ impl SuiteBuilder { name: s.to_owned(), nodes: 4, metastore_error: Box::new(|_| Ok(())), + cfg: Box::new(|cfg| { + cfg.enable = true; + }), } } @@ -124,11 +128,21 @@ impl SuiteBuilder { self } + pub fn cfg(mut self, f: impl FnOnce(&mut BackupStreamConfig) + 'static) -> Self { + let old_f = self.cfg; + self.cfg = Box::new(move |cfg| { + old_f(cfg); + f(cfg); + }); + self + } + pub fn build(self) -> Suite { let Self { name: case, nodes: n, metastore_error, + cfg: cfg_f, } = self; info!("start test"; "case" => %case, "nodes" => %n); @@ -154,8 +168,10 @@ impl SuiteBuilder { suite.endpoints.insert(id, worker); } suite.cluster.run(); + let mut cfg = BackupStreamConfig::default(); + cfg_f(&mut cfg); for id in 1..=(n as u64) { - suite.start_endpoint(id); + suite.start_endpoint(id, cfg.clone()); } // We must wait until the endpoints get ready to watching the metastore, or some // modifies may be lost. 
Either make Endpoint::with_client wait until watch did @@ -247,17 +263,16 @@ impl Suite { worker } - fn start_endpoint(&mut self, id: u64) { + fn start_endpoint(&mut self, id: u64, mut cfg: BackupStreamConfig) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); let sim = cluster.sim.wl(); let raft_router = sim.get_server_router(id); let cm = sim.get_concurrency_manager(id); let regions = sim.region_info_accessors.get(&id).unwrap().clone(); - let mut cfg = BackupStreamConfig::default(); + let ob = self.obs.get(&id).unwrap().clone(); cfg.enable = true; cfg.temp_path = format!("/{}/{}", self.temp_files.path().display(), id); - let ob = self.obs.get(&id).unwrap().clone(); let endpoint = Endpoint::new( id, self.meta_store.clone(), @@ -313,7 +328,10 @@ impl Suite { rx.into_iter() .map(|r| match r { - GetCheckpointResult::Ok { checkpoint, .. } => checkpoint.into_inner(), + GetCheckpointResult::Ok { checkpoint, region } => { + info!("getting checkpoint"; "checkpoint" => %checkpoint, "region" => ?region); + checkpoint.into_inner() + } GetCheckpointResult::NotFound { .. } | GetCheckpointResult::EpochNotMatch { .. 
} => { unreachable!() @@ -834,6 +852,44 @@ mod test { suite.cluster.shutdown(); } + #[test] + fn frequent_initial_scan() { + let mut suite = super::SuiteBuilder::new_named("frequent_initial_scan") + .cfg(|c| c.num_threads = 1) + .build(); + let keys = (1..1024).map(|i| make_record_key(1, i)).collect::>(); + let start_ts = suite.tso(); + suite.must_kv_prewrite( + 1, + keys.clone() + .into_iter() + .map(|k| mutation(k, b"hello, world".to_vec())) + .collect(), + make_record_key(1, 886), + start_ts, + ); + fail::cfg("scan_after_get_snapshot", "pause").unwrap(); + suite.must_register_task(1, "frequent_initial_scan"); + let commit_ts = suite.tso(); + suite.commit_keys(keys, start_ts, commit_ts); + suite.run(|| { + Task::ModifyObserve(backup_stream::ObserveOp::Stop { + region: suite.cluster.get_region(&make_record_key(1, 886)), + }) + }); + suite.run(|| { + Task::ModifyObserve(backup_stream::ObserveOp::Start { + region: suite.cluster.get_region(&make_record_key(1, 886)), + }) + }); + fail::cfg("scan_after_get_snapshot", "off").unwrap(); + suite.force_flush_files("frequent_initial_scan"); + suite.wait_for_flush(); + std::thread::sleep(Duration::from_secs(1)); + let c = suite.global_checkpoint(); + assert!(c > commit_ts.into_inner(), "{} vs {}", c, commit_ts); + } + #[test] /// This case tests whether the backup can continue when the leader failes. 
fn leader_down() { From f0c33168fbdece62837ca40d80d570855d4871d0 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Mon, 24 Oct 2022 18:51:56 +0800 Subject: [PATCH 281/676] tikv_util: fix panic when there are multiple cgroup2 mountinfos (#13661) close tikv/tikv#13660 fix panic when there are multiple cgroup2 mountinfos Signed-off-by: tabokie --- components/tikv_util/src/sys/cgroup.rs | 47 +++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index 59830748382..df15a2dac76 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -255,11 +255,18 @@ fn cgroup_mountinfos_v2() -> HashMap { } fn parse_mountinfos_v2(infos: Vec) -> HashMap { - let mut ret = HashMap::new(); - let mut cg_infos = infos.into_iter().filter(|x| x.fs_type == "cgroup2"); - if let Some(cg_info) = cg_infos.next() { - assert!(cg_infos.next().is_none()); // Only one item for cgroup-2. - ret.insert("".to_string(), (cg_info.root, cg_info.mount_point)); + let mut ret: HashMap = HashMap::new(); + let cg_infos = infos.into_iter().filter(|x| x.fs_type == "cgroup2"); + for info in cg_infos { + // Should only be one item for cgroup-2. + if let Some((root, mount_point)) = ret.insert("".to_string(), (info.root, info.mount_point)) + { + warn!( + "Found multiple cgroup2 mountinfos, dropping {} {}", + root, + mount_point.display() + ); + } } ret } @@ -450,6 +457,36 @@ mod tests { assert_eq!(cgroup_sys.memory_limit_in_bytes(), None); } + #[test] + fn test_conflicting_mountinfo() { + let temp = tempfile::TempDir::new().unwrap(); + let dir = temp.path().to_str().unwrap(); + std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + + let mut f = OpenOptions::new() + .create(true) + .write(true) + .open(&format!("{}/mountinfo", dir)) + .unwrap(); + f.write_all(b"1663 1661 0:27 /../../../../../.. 
/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw + 1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw").unwrap(); + + let cgroups = parse_proc_cgroup_v2("0::/\n"); + let mount_points = { + let infos = Process::new_with_root(PathBuf::from(dir)) + .and_then(|x| x.mountinfo()) + .unwrap(); + parse_mountinfos_v2(infos) + }; + let cgroup_sys = CGroupSys { + cgroups, + mount_points, + is_v2: true, + }; + + assert_eq!(cgroup_sys.memory_limit_in_bytes(), None); + } + #[test] fn test_cgroup_without_mountinfo() { let temp = tempfile::TempDir::new().unwrap(); From 7692be47d181f0ceb96d904fe7bc485f274e0c1f Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Tue, 25 Oct 2022 10:51:56 +0800 Subject: [PATCH 282/676] mvcc: log the mvcc key if default not found error happens (#13659) close tikv/tikv#13655 Log the mvcc key if the default not error happens. Usually, the next step is to locate the key region and do an unsafe recovery, so mvcc key format logging is more convenient. 
Signed-off-by: cfzjywxk Co-authored-by: Ti Chi Robot --- src/storage/mvcc/reader/point_getter.rs | 2 +- src/storage/mvcc/reader/reader.rs | 17 +++++++++++++---- src/storage/mvcc/reader/scanner/mod.rs | 4 ++-- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 1e26d9bf21b..012189201c5 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -347,7 +347,7 @@ impl PointGetter { Ok(value) } else { Err(default_not_found_error( - user_key.to_raw()?, + user_key.clone().append_ts(write_start_ts).into_encoded(), "load_data_from_default_cf", )) } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 0f6eb5a390e..6bf712050ac 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -204,7 +204,7 @@ impl MvccReader { self.statistics.data.processed_keys += 1; Ok(val) } - None => Err(default_not_found_error(key.to_raw()?, "get")), + None => Err(default_not_found_error(k.into_encoded(), "get")), } } @@ -2163,7 +2163,10 @@ pub mod tests { }, Case { // write has no short_value, the reader has a cursor, got nothing - expected: Err(default_not_found_error(k.to_vec(), "get")), + expected: Err(default_not_found_error( + Key::from_raw(k).append_ts(TimeStamp::new(3)).into_encoded(), + "get", + )), modifies: vec![Modify::Put( CF_WRITE, Key::from_raw(k).append_ts(TimeStamp::new(1)), @@ -2189,7 +2192,10 @@ pub mod tests { }, Case { // write has no short_value, the reader has no cursor, got nothing - expected: Err(default_not_found_error(k.to_vec(), "get")), + expected: Err(default_not_found_error( + Key::from_raw(k).append_ts(TimeStamp::new(5)).into_encoded(), + "get", + )), modifies: vec![], scan_mode: None, key: Key::from_raw(k), @@ -2248,7 +2254,10 @@ pub mod tests { // some write for `key` at `ts` exists, load data return Err // todo: "some write for `key` at `ts` exists" should 
be checked by `test_get_write` // "load data return Err" is checked by test_load_data - expected: Err(default_not_found_error(k.to_vec(), "get")), + expected: Err(default_not_found_error( + Key::from_raw(k).append_ts(TimeStamp::new(2)).into_encoded(), + "get", + )), modifies: vec![Modify::Put( CF_WRITE, Key::from_raw(k).append_ts(TimeStamp::new(2)), diff --git a/src/storage/mvcc/reader/scanner/mod.rs b/src/storage/mvcc/reader/scanner/mod.rs index 7b799a3f456..664a4fed99e 100644 --- a/src/storage/mvcc/reader/scanner/mod.rs +++ b/src/storage/mvcc/reader/scanner/mod.rs @@ -366,7 +366,7 @@ where || default_cursor.key(&mut statistics.data) != seek_key.as_encoded().as_slice() { return Err(default_not_found_error( - user_key.to_raw()?, + user_key.clone().append_ts(write_start_ts).into_encoded(), "near_load_data_by_write", )); } @@ -391,7 +391,7 @@ where || default_cursor.key(&mut statistics.data) != seek_key.as_encoded().as_slice() { return Err(default_not_found_error( - user_key.to_raw()?, + user_key.clone().append_ts(write_start_ts).into_encoded(), "near_reverse_load_data_by_write", )); } From 5d2e706b258d339479cac0f0906d0dfb282b7c44 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 25 Oct 2022 01:53:56 -0700 Subject: [PATCH 283/676] raftstore-v2: support conf change (#13631) ref tikv/tikv#12842 Most workflow is the same as v1 except if apply failed, v2 will not return apply result back to raftstore. This behavior is a legacy behavior that only works for conf change before joint consensus and can be removed now. 
Signed-off-by: Jay Lee --- components/raftstore-v2/Cargo.toml | 5 +- components/raftstore-v2/src/fsm/apply.rs | 5 +- components/raftstore-v2/src/fsm/peer.rs | 2 + .../operation/command/admin/conf_change.rs | 509 ++++++++++++++++++ .../src/operation/command/admin/mod.rs | 87 +++ .../raftstore-v2/src/operation/command/mod.rs | 225 +++++--- .../src/operation/command/write/mod.rs | 3 +- .../operation/command/write/simple_write.rs | 37 +- components/raftstore-v2/src/operation/mod.rs | 2 +- .../raftstore-v2/src/operation/query/mod.rs | 8 +- .../src/operation/ready/async_writer.rs | 32 ++ .../raftstore-v2/src/operation/ready/mod.rs | 11 + components/raftstore-v2/src/raft/apply.rs | 50 +- components/raftstore-v2/src/raft/storage.rs | 15 +- .../src/router/internal_message.rs | 5 +- components/raftstore-v2/src/router/message.rs | 5 + components/raftstore-v2/src/router/mod.rs | 4 + .../src/router/response_channel.rs | 5 + .../tests/failpoints/test_basic_write.rs | 11 +- .../tests/integrations/cluster.rs | 101 +++- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../tests/integrations/test_basic_write.rs | 16 +- .../tests/integrations/test_conf_change.rs | 69 +++ .../tests/integrations/test_read.rs | 97 +--- components/raftstore/src/store/peer.rs | 155 +----- components/raftstore/src/store/region_meta.rs | 32 +- components/raftstore/src/store/util.rs | 119 +++- .../raftstore/test_replication_mode.rs | 2 +- 28 files changed, 1264 insertions(+), 349 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/admin/conf_change.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/mod.rs create mode 100644 components/raftstore-v2/tests/integrations/test_conf_change.rs diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 9adaf0c13e2..8bb91b40bb9 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -4,7 +4,7 @@ version = "0.1.0" edition = "2021" [features] -default = 
["test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] +default = ["testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] failpoints = ["raftstore/failpoints"] testexport = ["raftstore/testexport"] test-engine-kv-rocksdb = [ @@ -62,8 +62,9 @@ test_util = { workspace = true } [[test]] name = "raftstore-v2-failpoints" path = "tests/failpoints/mod.rs" -required-features = ["failpoints"] +required-features = ["failpoints", "testexport"] [[test]] name = "raftstore-v2-integrations" path = "tests/integrations/mod.rs" +required-features = ["testexport"] diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index b37d0b33518..4a1e05b8f75 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -13,7 +13,7 @@ use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; use engine_traits::KvEngine; use futures::{Future, StreamExt}; -use kvproto::raft_serverpb::RegionLocalState; +use kvproto::{metapb, raft_serverpb::RegionLocalState}; use slog::Logger; use tikv_util::mpsc::future::{self, Receiver, Sender, WakePolicy}; @@ -57,13 +57,14 @@ pub struct ApplyFsm { impl ApplyFsm { pub fn new( + peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, remote_tablet: CachedTablet, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); - let apply = Apply::new(region_state, res_reporter, remote_tablet, logger); + let apply = Apply::new(peer, region_state, res_reporter, remote_tablet, logger); ( ApplyScheduler { sender: tx }, Self { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 6b9cccc8b84..a1beedef968 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -233,6 +233,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, 
self.fsm.peer_mut().on_fetched_logs(fetched_logs) } PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), + #[cfg(feature = "testexport")] + PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } } // TODO: instead of propose pending commands immediately, we should use timeout. diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs new file mode 100644 index 00000000000..03d0690fe25 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -0,0 +1,509 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the configuration change command. +//! +//! The command will go through the following steps: +//! - Propose conf change +//! - Apply after conf change is committed +//! - Update raft state using the result of conf change + +use collections::HashSet; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb::{self, PeerRole}, + raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use protobuf::Message; +use raft::prelude::*; +use raft_proto::ConfChangeI; +use raftstore::{ + store::{ + metrics::{PEER_ADMIN_CMD_COUNTER_VEC, PEER_PROPOSE_LOG_SIZE_HISTOGRAM}, + util::{self, ChangePeerI, ConfChangeKind}, + ProposalContext, + }, + Error, Result, +}; +use slog::{error, info, warn}; +use tikv_util::box_err; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + raft::{Apply, Peer}, + router::ApplyRes, +}; + +/// The apply result of conf change. 
+#[derive(Default, Debug)] +pub struct ConfChangeResult { + pub index: u64, + // The proposed ConfChangeV2 or (legacy) ConfChange + // ConfChange (if it is) will convert to ConfChangeV2 + pub conf_change: ConfChangeV2, + // The change peer requests come along with ConfChangeV2 + // or (legacy) ConfChange, for ConfChange, it only contains + // one element + pub changes: Vec, + pub region_state: RegionLocalState, +} + +impl Peer { + #[inline] + pub fn propose_conf_change( + &mut self, + ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ) -> Result { + if self.raft_group().raft.has_pending_conf() { + info!( + self.logger, + "there is a pending conf change, try later"; + ); + return Err(box_err!("there is a pending conf change, try later")); + } + let data = req.write_to_bytes()?; + let admin = req.get_admin_request(); + let leader_role = self.peer().get_role(); + if admin.has_change_peer() { + self.propose_conf_change_imp(ctx, admin.get_change_peer(), data) + } else if admin.has_change_peer_v2() { + self.propose_conf_change_imp(ctx, admin.get_change_peer_v2(), data) + } else { + unreachable!() + } + } + + /// Fails in following cases: + /// + /// 1. A pending conf change has not been applied yet; + /// 2. Removing the leader is not allowed in the configuration; + /// 3. The conf change makes the raft group not healthy; + /// 4. The conf change is dropped by raft group internally. + /// 5. There is a same peer on the same store in history record (TODO). + fn propose_conf_change_imp( + &mut self, + ctx: &mut StoreContext, + change_peer: impl ChangePeerI, + data: Vec, + ) -> Result { + let data_size = data.len(); + let cc = change_peer.to_confchange(data); + let changes = change_peer.get_change_peers(); + + util::check_conf_change( + &ctx.cfg, + self.raft_group(), + self.peer(), + changes.as_ref(), + &cc, + false, + )?; + + // TODO: check if the new peer is already in history record. 
+ + ctx.raft_metrics.propose.conf_change.inc(); + // TODO: use local histogram metrics + PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); + info!( + self.logger, + "propose conf change peer"; + "changes" => ?changes.as_ref(), + "kind" => ?ConfChangeKind::confchange_kind(changes.as_ref().len()), + ); + + let last_index = self.raft_group().raft.raft_log.last_index(); + self.raft_group_mut() + .propose_conf_change(ProposalContext::SYNC_LOG.to_vec(), cc)?; + let proposal_index = self.raft_group().raft.raft_log.last_index(); + if proposal_index == last_index { + // The message is dropped silently, this usually due to leader absence + // or transferring leader. Both cases can be considered as NotLeader error. + return Err(Error::NotLeader(self.region_id(), None)); + } + + Ok(proposal_index) + } + + pub fn on_apply_res_conf_change(&mut self, conf_change: ConfChangeResult) { + // TODO: cancel generating snapshot. + + // Snapshot is applied in memory without waiting for all entries being + // applied. So it's possible conf_change.index < first_index. + if conf_change.index >= self.raft_group().raft.raft_log.first_index() { + match self.raft_group_mut().apply_conf_change(&conf_change.conf_change) { + Ok(_) + // PD could dispatch redundant conf changes. + | Err(raft::Error::NotExists { .. }) | Err(raft::Error::Exists { .. 
}) => (), + _ => unreachable!(), + } + } + + let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; + self.storage_mut() + .set_region_state(conf_change.region_state); + if self.is_leader() { + info!( + self.logger, + "notify pd with change peer region"; + "region" => ?self.region(), + ); + let demote_self = tikv_util::store::is_learner(self.peer()); + if remove_self || demote_self { + warn!(self.logger, "removing or demoting leader"; "remove" => remove_self, "demote" => demote_self); + let term = self.term(); + self.raft_group_mut() + .raft + .become_follower(term, raft::INVALID_ID); + } else if conf_change.changes.iter().any(|c| { + matches!( + c.get_change_type(), + ConfChangeType::AddNode | ConfChangeType::AddLearnerNode + ) + }) { + // Speed up snapshot instead of waiting another heartbeat. + self.raft_group_mut().ping(); + self.set_has_ready(); + } + } + if remove_self { + self.mark_for_destroy(None); + } + } +} + +impl Apply { + #[inline] + pub fn apply_conf_change( + &mut self, + index: u64, + req: &AdminRequest, + cc: ConfChangeV2, + ) -> Result<(AdminResponse, AdminCmdResult)> { + assert!(req.has_change_peer()); + self.apply_conf_change_imp(index, std::slice::from_ref(req.get_change_peer()), cc, true) + } + + #[inline] + pub fn apply_conf_change_v2( + &mut self, + index: u64, + req: &AdminRequest, + cc: ConfChangeV2, + ) -> Result<(AdminResponse, AdminCmdResult)> { + assert!(req.has_change_peer_v2()); + self.apply_conf_change_imp( + index, + req.get_change_peer_v2().get_change_peers(), + cc, + false, + ) + } + + #[inline] + fn apply_conf_change_imp( + &mut self, + index: u64, + changes: &[ChangePeerRequest], + cc: ConfChangeV2, + legacy: bool, + ) -> Result<(AdminResponse, AdminCmdResult)> { + let region = self.region_state().get_region(); + let peer_id = self.peer().get_id(); + let change_kind = ConfChangeKind::confchange_kind(changes.len()); + info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, 
"epoch" => ?region.get_region_epoch()); + let mut new_region = region.clone(); + match change_kind { + ConfChangeKind::LeaveJoint => self.apply_leave_joint(&mut new_region), + kind => { + debug_assert!(!legacy || kind == ConfChangeKind::Simple, "{:?}", kind); + debug_assert!( + kind != ConfChangeKind::Simple || changes.len() == 1, + "{:?}", + changes + ); + for cp in changes { + let res = if legacy { + self.apply_single_change_legacy(cp, &mut new_region) + } else { + self.apply_single_change(kind, cp, &mut new_region) + }; + if let Err(e) = res { + error!(self.logger, "failed to apply conf change"; + "changes" => ?changes, + "legacy" => legacy, + "original region" => ?region, "err" => ?e); + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + changes.len() as u64; + new_region.mut_region_epoch().set_conf_ver(conf_ver); + } + }; + + info!( + self.logger, + "conf change successfully"; + "changes" => ?changes, + "legacy" => legacy, + "original region" => ?region, + "current region" => ?new_region, + ); + let my_id = self.peer().get_id(); + let state = self.region_state_mut(); + state.set_region(new_region.clone()); + let new_peer = new_region + .get_peers() + .iter() + .find(|p| p.get_id() == my_id) + .cloned(); + if new_peer.is_none() { + // A peer will reject any snapshot that doesn't include itself in the + // configuration. So if it disappear from the configuration, it must + // be removed by conf change. 
+ state.set_state(PeerState::Tombstone); + } + let mut resp = AdminResponse::default(); + resp.mut_change_peer().set_region(new_region); + let mut conf_change = ConfChangeResult { + index, + conf_change: cc, + changes: changes.to_vec(), + region_state: state.clone(), + }; + if state.get_state() == PeerState::Tombstone { + self.mark_tombstone(); + } + if let Some(peer) = new_peer { + self.set_peer(peer); + } + Ok((resp, AdminCmdResult::ConfChange(conf_change))) + } + + #[inline] + fn apply_leave_joint(&self, region: &mut metapb::Region) { + let mut change_num = 0; + for peer in region.mut_peers().iter_mut() { + match peer.get_role() { + PeerRole::IncomingVoter => peer.set_role(PeerRole::Voter), + PeerRole::DemotingVoter => peer.set_role(PeerRole::Learner), + _ => continue, + } + change_num += 1; + } + if change_num == 0 { + panic!( + "{:?} can't leave a non-joint config, region: {:?}", + self.logger.list(), + self.region_state() + ); + } + let conf_ver = region.get_region_epoch().get_conf_ver() + change_num; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!(self.logger, "leave joint state successfully"; "region" => ?region); + } + + /// This is used for conf change v1. Use a standalone function to avoid + /// future refactor breaks consistency accidentally. 
+ #[inline] + fn apply_single_change_legacy( + &self, + cp: &ChangePeerRequest, + region: &mut metapb::Region, + ) -> Result<()> { + let peer = cp.get_peer(); + let store_id = peer.get_store_id(); + let change_type = cp.get_change_type(); + + match change_type { + ConfChangeType::AddNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_peer", "all"]) + .inc(); + + let mut exists = false; + if let Some(p) = tikv_util::store::find_peer_mut(region, store_id) { + exists = true; + if !tikv_util::store::is_learner(p) || p.get_id() != peer.get_id() { + return Err(box_err!( + "can't add duplicated peer {:?} to region {:?}", + peer, + self.region_state() + )); + } else { + p.set_role(PeerRole::Voter); + } + } + if !exists { + // TODO: Do we allow adding peer in same node? + region.mut_peers().push(peer.clone()); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_peer", "success"]) + .inc(); + } + ConfChangeType::RemoveNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["remove_peer", "all"]) + .inc(); + + if let Some(p) = tikv_util::store::remove_peer(region, store_id) { + // Considering `is_learner` flag in `Peer` here is by design. 
+ if &p != peer { + return Err(box_err!( + "remove unmatched peer: expect: {:?}, get {:?}, ignore", + peer, + p + )); + } + } else { + return Err(box_err!( + "remove missing peer {:?} from region {:?}", + peer, + self.region_state() + )); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["remove_peer", "success"]) + .inc(); + } + ConfChangeType::AddLearnerNode => { + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_learner", "all"]) + .inc(); + + if tikv_util::store::find_peer(region, store_id).is_some() { + return Err(box_err!( + "can't add duplicated learner {:?} to region {:?}", + peer, + self.region_state() + )); + } + region.mut_peers().push(peer.clone()); + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["add_learner", "success"]) + .inc(); + } + } + Ok(()) + } + + #[inline] + fn apply_single_change( + &self, + kind: ConfChangeKind, + cp: &ChangePeerRequest, + region: &mut metapb::Region, + ) -> Result<()> { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + let store_id = peer.get_store_id(); + + let metric = match change_type { + ConfChangeType::AddNode => "add_peer", + ConfChangeType::RemoveNode => "remove_peer", + ConfChangeType::AddLearnerNode => "add_learner", + }; + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&[metric, "all"]) + .inc(); + + if let Some(exist_peer) = tikv_util::store::find_peer(region, store_id) { + let r = exist_peer.get_role(); + if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { + panic!( + "{:?} can't apply confchange because configuration is still in joint state, confchange: {:?}, region: {:?}", + self.logger.list(), + cp, + self.region_state() + ); + } + } + match ( + tikv_util::store::find_peer_mut(region, store_id), + change_type, + ) { + (None, ConfChangeType::AddNode) => { + let mut peer = peer.clone(); + match kind { + ConfChangeKind::Simple => peer.set_role(PeerRole::Voter), + ConfChangeKind::EnterJoint => peer.set_role(PeerRole::IncomingVoter), + _ => unreachable!(), + 
} + region.mut_peers().push(peer); + } + (None, ConfChangeType::AddLearnerNode) => { + let mut peer = peer.clone(); + peer.set_role(PeerRole::Learner); + region.mut_peers().push(peer); + } + (None, ConfChangeType::RemoveNode) => { + return Err(box_err!( + "remove missing peer {:?} from region {:?}", + peer, + self.region_state() + )); + } + // Add node + (Some(exist_peer), ConfChangeType::AddNode) + | (Some(exist_peer), ConfChangeType::AddLearnerNode) => { + let (role, exist_id, incoming_id) = + (exist_peer.get_role(), exist_peer.get_id(), peer.get_id()); + + if exist_id != incoming_id // Add peer with different id to the same store + // The peer is already the requested role + || (role, change_type) == (PeerRole::Voter, ConfChangeType::AddNode) + || (role, change_type) == (PeerRole::Learner, ConfChangeType::AddLearnerNode) + { + return Err(box_err!( + "can't add duplicated peer {:?} to region {:?}, duplicated with exist peer {:?}", + peer, + self.region_state(), + exist_peer + )); + } + match (role, change_type) { + (PeerRole::Voter, ConfChangeType::AddLearnerNode) => match kind { + ConfChangeKind::Simple => exist_peer.set_role(PeerRole::Learner), + ConfChangeKind::EnterJoint => exist_peer.set_role(PeerRole::DemotingVoter), + _ => unreachable!(), + }, + (PeerRole::Learner, ConfChangeType::AddNode) => match kind { + ConfChangeKind::Simple => exist_peer.set_role(PeerRole::Voter), + ConfChangeKind::EnterJoint => exist_peer.set_role(PeerRole::IncomingVoter), + _ => unreachable!(), + }, + _ => unreachable!(), + } + } + // Remove node + (Some(exist_peer), ConfChangeType::RemoveNode) => { + if kind == ConfChangeKind::EnterJoint && exist_peer.get_role() == PeerRole::Voter { + return Err(box_err!( + "can not remove voter {:?} directly from region {:?}", + peer, + self.region_state() + )); + } + match tikv_util::store::remove_peer(region, store_id) { + Some(p) => { + if &p != peer { + return Err(box_err!( + "remove unmatched peer: expect: {:?}, get {:?}, ignore", + peer, + 
p + )); + } + } + None => unreachable!(), + } + } + } + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&[metric, "success"]) + .inc(); + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs new file mode 100644 index 00000000000..396e3ede98f --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -0,0 +1,87 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod conf_change; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + raft_cmdpb::{AdminRequest, RaftCmdRequest}, + raft_serverpb::PeerState, +}; +use protobuf::Message; +use raft::prelude::ConfChangeV2; +use raftstore::{ + store::{ + self, cmd_resp, + fsm::apply, + msg::ErrorCallback, + util::{ChangePeerI, ConfChangeKind}, + }, + Result, +}; +use slog::info; +use tikv_util::box_err; + +use self::conf_change::ConfChangeResult; +use crate::{ + batch::StoreContext, + raft::{Apply, Peer}, + router::CmdResChannel, +}; + +#[derive(Debug)] +pub enum AdminCmdResult { + ConfChange(ConfChangeResult), +} + +impl Peer { + #[inline] + pub fn on_admin_command( + &mut self, + ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ch: CmdResChannel, + ) { + if !self.serving() { + apply::notify_req_region_removed(self.region_id(), ch); + return; + } + if let Err(e) = self.validate_command(&req, &mut ctx.raft_metrics) { + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + + // The admin request is rejected because it may need to update epoch checker + // which introduces an uncertainty and may breaks the correctness of epoch + // checker. 
+ if !self.applied_to_current_term() { + let e = box_err!( + "{:?} peer has not applied to current term, applied_term {}, current_term {}", + self.logger.list(), + self.storage().entry_storage().applied_term(), + self.term() + ); + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + // To maintain propose order, we need to make pending proposal first. + self.propose_pending_command(ctx); + let cmd_type = req.get_admin_request().get_cmd_type(); + let res = if apply::is_conf_change_cmd(&req) { + self.propose_conf_change(ctx, req) + } else { + // propose other admin command. + unimplemented!() + }; + if let Err(e) = &res { + info!( + self.logger, + "failed to propose admin command"; + "cmd_type" => ?cmd_type, + "error" => ?e, + ); + } + self.post_propose_write(ctx, res, vec![ch]); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index fcfeb29fbe2..fe863a74b8a 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -21,16 +21,19 @@ use std::cmp; use batch_system::{Fsm, FsmScheduler, Mailbox}; use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; use kvproto::{ - raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader}, + raft_cmdpb::{AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader}, raft_serverpb::RegionLocalState, }; use protobuf::Message; -use raft::eraftpb::Entry; +use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; +use raft_proto::ConfChangeI; use raftstore::{ store::{ cmd_resp, fsm::{ - apply::{APPLY_WB_SHRINK_SIZE, DEFAULT_APPLY_WB_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, + apply::{ + self, APPLY_WB_SHRINK_SIZE, DEFAULT_APPLY_WB_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP, + }, Proposal, }, local_metrics::RaftMetrics, @@ -50,12 +53,28 @@ use crate::{ router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, }; +mod admin; mod write; 
+pub use admin::AdminCmdResult; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; use self::write::SimpleWrite; +fn parse_at(logger: &slog::Logger, buf: &[u8], index: u64, term: u64) -> M { + let mut m = M::default(); + match m.merge_from_bytes(buf) { + Ok(()) => m, + Err(e) => panic!( + "{:?} data is corrupted at [{}] {}: {:?}", + logger.list(), + term, + index, + e + ), + } +} + #[derive(Debug)] pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for @@ -80,7 +99,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { .peer_mut() .on_write_command(self.store_ctx, req, ch) } else if req.has_admin_request() { - // self.on_admin_request(req, ch) + self.fsm + .peer_mut() + .on_admin_command(self.store_ctx, req, ch) } else if req.has_status_request() { error!(self.fsm.logger(), "status command should be sent by Query"); } @@ -99,7 +120,8 @@ impl Peer { let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); let tablet = self.tablet().clone(); let logger = self.logger.clone(); - let (apply_scheduler, mut apply_fsm) = ApplyFsm::new(region_state, mailbox, tablet, logger); + let (apply_scheduler, mut apply_fsm) = + ApplyFsm::new(self.peer().clone(), region_state, mailbox, tablet, logger); store_ctx .apply_pool .spawn(async move { apply_fsm.handle_all_tasks().await }) @@ -229,9 +251,17 @@ impl Peer { } // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { - // TODO: handle admin side effects like split/merge. + // Ignore admin command side effects, otherwise it may split incomplete + // region. 
return; } + for admin_res in apply_res.admin_result { + match admin_res { + AdminCmdResult::ConfChange(conf_change) => { + self.on_apply_res_conf_change(conf_change) + } + } + } self.raft_group_mut() .advance_apply_to(apply_res.applied_index); let is_leader = self.is_leader(); @@ -244,7 +274,12 @@ impl Peer { if !is_leader { entry_storage.compact_entry_cache(apply_res.applied_index + 1); } - self.handle_read_on_apply(ctx, apply_res, progress_to_be_updated); + self.handle_read_on_apply( + ctx, + apply_res.applied_term, + apply_res.applied_index, + progress_to_be_updated, + ); } } @@ -253,6 +288,10 @@ impl Apply { pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); for (e, ch) in ce.entry_and_proposals { + if self.tombstone() { + apply::notify_req_region_removed(self.region_state().get_region().get_id(), ch); + continue; + } if !e.get_data().is_empty() { let mut set_save_point = false; if let Some(wb) = self.write_batch_mut() { @@ -283,62 +322,130 @@ impl Apply { #[inline] async fn apply_entry(&mut self, entry: &Entry) -> Result { - match SimpleWriteDecoder::new(entry.get_data()) { - Ok(decoder) => { - util::compare_region_epoch( - decoder.header().get_region_epoch(), - self.region_state().get_region(), - false, - true, - true, - )?; - let res = Ok(new_response(decoder.header())); - for req in decoder { - match req { - SimpleWrite::Put(put) => self.apply_put(put.cf, put.key, put.value)?, - SimpleWrite::Delete(delete) => self.apply_delete(delete.cf, delete.key)?, - SimpleWrite::DeleteRange(dr) => self.apply_delete_range( - dr.cf, - dr.start_key, - dr.end_key, - dr.notify_only, - )?, + let mut conf_change = None; + let req = match entry.get_entry_type() { + EntryType::EntryNormal => match SimpleWriteDecoder::new( + &self.logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ) { + Ok(decoder) => { + util::compare_region_epoch( + decoder.header().get_region_epoch(), + 
self.region_state().get_region(), + false, + true, + true, + )?; + let res = Ok(new_response(decoder.header())); + for req in decoder { + match req { + SimpleWrite::Put(put) => self.apply_put(put.cf, put.key, put.value)?, + SimpleWrite::Delete(delete) => { + self.apply_delete(delete.cf, delete.key)? + } + SimpleWrite::DeleteRange(dr) => self.apply_delete_range( + dr.cf, + dr.start_key, + dr.end_key, + dr.notify_only, + )?, + } } + return res; } - res + Err(req) => req, + }, + EntryType::EntryConfChange => { + let cc: ConfChange = parse_at( + &self.logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ); + let req: RaftCmdRequest = parse_at( + &self.logger, + cc.get_context(), + entry.get_index(), + entry.get_term(), + ); + conf_change = Some(cc.into_v2()); + req } - Err(req) => { - util::check_region_epoch(&req, self.region_state().get_region(), true)?; - if req.has_admin_request() { - // TODO: implement admin request. - } else { - for r in req.get_requests() { - match r.get_cmd_type() { - // These three writes should all use the new codec. Keep them here for - // backward compatibility. 
- CmdType::Put => { - let put = r.get_put(); - self.apply_put(put.get_cf(), put.get_key(), put.get_value())?; - } - CmdType::Delete => { - let delete = r.get_delete(); - self.apply_delete(delete.get_cf(), delete.get_key())?; - } - CmdType::DeleteRange => { - let dr = r.get_delete_range(); - self.apply_delete_range( - dr.get_cf(), - dr.get_start_key(), - dr.get_end_key(), - dr.get_notify_only(), - )?; - } - _ => unimplemented!(), - } + EntryType::EntryConfChangeV2 => { + let cc: ConfChangeV2 = parse_at( + &self.logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ); + let req: RaftCmdRequest = parse_at( + &self.logger, + cc.get_context(), + entry.get_index(), + entry.get_term(), + ); + conf_change = Some(cc); + req + } + }; + + util::check_region_epoch(&req, self.region_state().get_region(), true)?; + if req.has_admin_request() { + let admin_req = req.get_admin_request(); + let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { + AdminCmdType::CompactLog => unimplemented!(), + AdminCmdType::Split => unimplemented!(), + AdminCmdType::BatchSplit => unimplemented!(), + AdminCmdType::PrepareMerge => unimplemented!(), + AdminCmdType::CommitMerge => unimplemented!(), + AdminCmdType::RollbackMerge => unimplemented!(), + AdminCmdType::TransferLeader => unreachable!(), + AdminCmdType::ChangePeer => { + self.apply_conf_change(entry.get_index(), admin_req, conf_change.unwrap())? + } + AdminCmdType::ChangePeerV2 => { + self.apply_conf_change_v2(entry.get_index(), admin_req, conf_change.unwrap())? 
+ } + AdminCmdType::ComputeHash => unimplemented!(), + AdminCmdType::VerifyHash => unimplemented!(), + AdminCmdType::PrepareFlashback => unimplemented!(), + AdminCmdType::FinishFlashback => unimplemented!(), + AdminCmdType::InvalidAdmin => { + return Err(box_err!("invalid admin command type")); + } + }; + self.push_admin_result(admin_result); + let mut resp = new_response(req.get_header()); + resp.set_admin_response(admin_resp); + Ok(resp) + } else { + for r in req.get_requests() { + match r.get_cmd_type() { + // These three writes should all use the new codec. Keep them here for + // backward compatibility. + CmdType::Put => { + let put = r.get_put(); + self.apply_put(put.get_cf(), put.get_key(), put.get_value())?; } + CmdType::Delete => { + let delete = r.get_delete(); + self.apply_delete(delete.get_cf(), delete.get_key())?; + } + CmdType::DeleteRange => { + let dr = r.get_delete_range(); + self.apply_delete_range( + dr.get_cf(), + dr.get_start_key(), + dr.get_end_key(), + dr.get_notify_only(), + )?; + } + _ => unimplemented!(), } - Ok(new_response(req.get_header())) } + Ok(new_response(req.get_header())) } } @@ -367,9 +474,7 @@ impl Apply { let (index, term) = self.apply_progress(); apply_res.applied_index = index; apply_res.applied_term = term; - if self.reset_state_changed() { - apply_res.region_state = Some(self.region_state().clone()); - } + apply_res.admin_result = self.take_admin_result(); self.res_reporter().report(apply_res); } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 798e1b45631..a760a5acfb2 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -70,7 +70,8 @@ impl Peer { } } - fn post_propose_write( + #[inline] + pub fn post_propose_write( &mut self, ctx: &mut StoreContext, res: Result, diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs 
b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 46544be1a32..364e2741868 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -3,8 +3,9 @@ use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; use protobuf::{CodedInputStream, Message, SingularPtrField}; +use slog::Logger; -use crate::router::CmdResChannel; +use crate::{operation::command::parse_at, router::CmdResChannel}; // MAGIC number to hint simple write codec is used. If it's a protobuf message, // the first one or several bytes are for field tag, which can't be zero. @@ -135,22 +136,32 @@ pub struct SimpleWriteDecoder<'a> { } impl<'a> SimpleWriteDecoder<'a> { - pub fn new(buf: &'a [u8]) -> Result, RaftCmdRequest> { + pub fn new( + logger: &Logger, + buf: &'a [u8], + index: u64, + term: u64, + ) -> Result, RaftCmdRequest> { match buf.first().cloned() { Some(MAGIC_PREFIX) => { let mut is = CodedInputStream::from_bytes(&buf[1..]); - let header = is.read_message().unwrap(); + let header = match is.read_message() { + Ok(h) => h, + Err(e) => panic!( + "{:?} data corrupted at [{}] {}: {:?}", + logger.list(), + term, + index, + e + ), + }; let read = is.pos(); Ok(SimpleWriteDecoder { header, buf: &buf[1 + read as usize..], }) } - _ => { - let mut req = RaftCmdRequest::new(); - req.merge_from_bytes(buf).unwrap(); - Err(req) - } + _ => Err(parse_at(logger, buf, index, term)), } } @@ -346,6 +357,8 @@ fn decode<'a>(buf: &mut &'a [u8]) -> Option> { #[cfg(test)] mod tests { + use slog::o; + use super::*; #[test] @@ -392,7 +405,8 @@ mod tests { encoder.amend(cmd.clone()).unwrap(); let (bytes, _) = encoder.encode(); - let mut decoder = SimpleWriteDecoder::new(&bytes).unwrap(); + let logger = slog_global::borrow_global().new(o!()); + let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); 
assert_eq!(decoder.header(), cmd.get_header()); let write = decoder.next().unwrap(); let SimpleWrite::Put(put) = write else { panic!("should be put") }; @@ -459,7 +473,8 @@ mod tests { invalid_cmd.mut_requests().push(req); let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX).unwrap_err(); let bytes = fallback.write_to_bytes().unwrap(); - let decoded = SimpleWriteDecoder::new(&bytes).unwrap_err(); + let logger = slog_global::borrow_global().new(o!()); + let decoded = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); assert_eq!(decoded, invalid_cmd); let mut valid_cmd = RaftCmdRequest::default(); @@ -480,7 +495,7 @@ mod tests { encoder.amend(valid_cmd2).unwrap_err(); let (bytes, _) = encoder.encode(); - let mut decoder = SimpleWriteDecoder::new(&bytes).unwrap(); + let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); assert_eq!(decoder.header(), valid_cmd.get_header()); let req = decoder.next().unwrap(); let SimpleWrite::Put(put) = req else { panic!("should be put") }; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 7b31473f784..1eaeb21ec18 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,7 +5,7 @@ mod life; mod query; mod ready; -pub use command::{CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; +pub use command::{AdminCmdResult, CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; pub use life::DestroyProgress; pub use ready::AsyncWriter; diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 960e667c7d9..8b84b0788ce 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -386,7 +386,8 @@ impl Peer { pub fn handle_read_on_apply( &mut self, ctx: &mut StoreContext, - apply_res: ApplyRes, + applied_term: u64, + applied_index: u64, 
progress_to_be_updated: bool, ) { // TODO: add is_handling_snapshot check @@ -401,13 +402,12 @@ impl Peer { } } self.pending_reads_mut().gc(); - self.read_progress_mut() - .update_applied_core(apply_res.applied_index); + self.read_progress_mut().update_applied_core(applied_index); // Only leaders need to update applied_term. if progress_to_be_updated && self.is_leader() { // TODO: add coprocessor_host hook - let progress = ReadProgress::applied_term(apply_res.applied_term); + let progress = ReadProgress::applied_term(applied_term); // TODO: remove it self.add_reader_if_necessary(&mut ctx.store_meta); let mut meta = ctx.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index 3ebc1f20da7..d5673d76a40 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -29,6 +29,8 @@ pub struct AsyncWriter { write_router: WriteRouter, unpersisted_readies: VecDeque, persisted_number: u64, + #[cfg(feature = "testexport")] + flush_subscribers: VecDeque<(u64, crate::router::FlushChannel)>, } impl AsyncWriter { @@ -38,6 +40,8 @@ impl AsyncWriter { write_router, unpersisted_readies: VecDeque::new(), persisted_number: 0, + #[cfg(feature = "testexport")] + flush_subscribers: VecDeque::new(), } } @@ -159,6 +163,34 @@ impl AsyncWriter { } } +#[cfg(feature = "testexport")] +impl AsyncWriter { + pub fn subscirbe_flush(&mut self, ch: crate::router::FlushChannel) { + self.flush_subscribers + .push_back((self.known_largest_number(), ch)); + } + + pub fn notify_flush(&mut self) { + if self.flush_subscribers.is_empty() { + return; + } + if self.all_ready_persisted() { + for (_, ch) in self.flush_subscribers.drain(..) { + ch.set_result(()); + } + } + while let Some((number, ch)) = self.flush_subscribers.pop_front() { + // A channel is registered without ready, so persisted_number should be larger. 
+ if self.persisted_number > number { + ch.set_result(()); + } else { + self.flush_subscribers.push_front((number, ch)); + break; + } + } + } +} + impl WriteRouterContext for StoreContext where EK: KvEngine, diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3129dcfb832..cfc3d086163 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -261,11 +261,15 @@ impl Peer { pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { let has_ready = self.reset_has_ready(); if !has_ready || self.destroy_progress().started() { + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); return; } ctx.has_ready = true; if !self.raft_group().has_ready() && (self.serving() || self.postpond_destroy()) { + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); return; } @@ -336,6 +340,8 @@ impl Peer { } ctx.raft_metrics.ready.has_ready_region.inc(); + #[cfg(feature = "testexport")] + self.async_writer.notify_flush(); } /// Called when an asynchronously write finishes. @@ -372,6 +378,11 @@ impl Peer { self.finish_destroy(ctx); } } + + #[cfg(feature = "testexport")] + pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { + self.async_writer.subscirbe_flush(ch); + } } impl Storage { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index b210890ac40..068e5124c0c 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,30 +1,37 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::mem; + use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; +use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::fsm::apply::DEFAULT_APPLY_WB_SIZE; use slog::Logger; use super::Peer; use crate::{ fsm::ApplyResReporter, + operation::AdminCmdResult, router::{ApplyRes, CmdResChannel}, tablet::CachedTablet, }; /// Apply applies all the committed commands to kv db. pub struct Apply { + peer: metapb::Peer, remote_tablet: CachedTablet, tablet: EK, write_batch: Option, callbacks: Vec<(Vec, RaftCmdResponse)>, + /// A flag indicates whether the peer is destroyed by applying admin + /// command. + tombstone: bool, applied_index: u64, applied_term: u64, + admin_cmd_result: Vec, region_state: RegionLocalState, - state_changed: bool, res_reporter: R, pub(crate) logger: Logger, @@ -33,20 +40,23 @@ pub struct Apply { impl Apply { #[inline] pub fn new( + peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, mut remote_tablet: CachedTablet, logger: Logger, ) -> Self { Apply { + peer, tablet: remote_tablet.latest().unwrap().clone(), remote_tablet, write_batch: None, callbacks: vec![], + tombstone: false, applied_index: 0, applied_term: 0, + admin_cmd_result: vec![], region_state, - state_changed: false, res_reporter, logger, } @@ -92,8 +102,8 @@ impl Apply { } #[inline] - pub fn reset_state_changed(&mut self) -> bool { - std::mem::take(&mut self.state_changed) + pub fn region_state_mut(&mut self) -> &mut RegionLocalState { + &mut self.region_state } /// Publish the tablet so that it can be used by read worker. 
@@ -105,4 +115,34 @@ impl Apply { self.remote_tablet.set(tablet.clone()); self.tablet = tablet; } + + #[inline] + pub fn peer(&self) -> &metapb::Peer { + &self.peer + } + + #[inline] + pub fn set_peer(&mut self, peer: metapb::Peer) { + self.peer = peer; + } + + #[inline] + pub fn mark_tombstone(&mut self) { + self.tombstone = true; + } + + #[inline] + pub fn tombstone(&self) -> bool { + self.tombstone + } + + #[inline] + pub fn push_admin_result(&mut self, admin_result: AdminCmdResult) { + self.admin_cmd_result.push(admin_result); + } + + #[inline] + pub fn take_admin_result(&mut self) -> Vec { + mem::take(&mut self.admin_cmd_result) + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 1615255ab23..b08624b1185 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -240,6 +240,17 @@ impl Storage { pub fn set_ever_persisted(&mut self) { self.ever_persisted = true; } + + #[inline] + pub fn set_region_state(&mut self, state: RegionLocalState) { + self.region_state = state; + for peer in self.region_state.get_region().get_peers() { + if peer.get_id() == self.peer.get_id() { + self.peer = peer.clone(); + break; + } + } + } } impl raft::Storage for Storage { @@ -295,7 +306,9 @@ impl raft::Storage for Storage { } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { - unimplemented!() + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 28a93e897af..e9893bad968 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,8 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use kvproto::raft_serverpb::RegionLocalState; +use raftstore::store::fsm::ChangePeer; -use crate::operation::CommittedEntries; +use crate::operation::{AdminCmdResult, CommittedEntries}; #[derive(Debug)] pub enum ApplyTask { @@ -13,5 +14,5 @@ pub enum ApplyTask { pub struct ApplyRes { pub applied_index: u64, pub applied_term: u64, - pub region_state: Option, + pub admin_result: Vec, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index fb323dca9d4..c607e389135 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -134,6 +134,9 @@ pub enum PeerMsg { ready_number: u64, }, QueryDebugInfo(DebugInfoChannel), + /// A message that used to check if a flush is happened. + #[cfg(feature = "testexport")] + WaitFlush(super::FlushChannel), } impl PeerMsg { @@ -172,6 +175,8 @@ impl fmt::Debug for PeerMsg { ), PeerMsg::FetchedLogs(fetched) => write!(fmt, "FetchedLogs {:?}", fetched), PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), + #[cfg(feature = "testexport")] + PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), } } } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index e9e7cf6cfc8..a09b0593b80 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -6,6 +6,10 @@ pub mod message; mod response_channel; pub(crate) use self::internal_message::ApplyTask; +#[cfg(feature = "testexport")] +pub use self::response_channel::FlushChannel; +#[cfg(feature = "testexport")] +pub use self::response_channel::FlushSubscriber; pub use self::{ imp::RaftRouter, internal_message::ApplyRes, diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 55219540c2f..d68c414ca5f 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ 
b/components/raftstore-v2/src/router/response_channel.rs @@ -417,6 +417,11 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +#[cfg(feature = "testexport")] +pub type FlushChannel = BaseChannel<()>; +#[cfg(feature = "testexport")] +pub type FlushSubscriber = BaseSubscriber<()>; + #[cfg(test)] mod tests { use futures::executor::block_on; diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs index 5014e0efd3e..4bf4201f67c 100644 --- a/components/raftstore-v2/tests/failpoints/test_basic_write.rs +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -4,10 +4,8 @@ use std::{assert_matches::assert_matches, time::Duration}; use engine_traits::{OpenOptions, Peekable, TabletFactory}; use futures::executor::block_on; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; -use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; +use kvproto::raft_cmdpb::{CmdType, Request}; use raftstore_v2::router::PeerMsg; -use tikv_util::store::new_peer; use crate::cluster::Cluster; @@ -16,12 +14,7 @@ use crate::cluster::Cluster; fn test_write_batch_rollback() { let cluster = Cluster::default(); let router = cluster.router(0); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(2); - let epoch = req.mut_header().mut_region_epoch(); - epoch.set_version(INIT_EPOCH_VER); - epoch.set_conf_ver(INIT_EPOCH_CONF_VER); - req.mut_header().set_peer(new_peer(1, 3)); + let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); put_req.mut_put().set_key(b"key".to_vec()); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 554db96acbf..1d458d7a73e 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ 
b/components/raftstore-v2/tests/integrations/cluster.rs @@ -11,7 +11,8 @@ use std::{ time::{Duration, Instant}, }; -use crossbeam::channel::{self, Receiver, Sender}; +use collections::HashSet; +use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactoryV2}, @@ -28,13 +29,16 @@ use pd_client::RpcClient; use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; use raftstore_v2::{ create_store_batch_system, - router::{DebugInfoChannel, PeerMsg, QueryResult, RaftRouter}, + router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, Bootstrap, StoreMeta, StoreSystem, }; -use slog::{o, Logger}; +use slog::{debug, o, Logger}; use tempfile::TempDir; use test_pd::mocker::Service; -use tikv_util::config::{ReadableDuration, VersionTrack}; +use tikv_util::{ + config::{ReadableDuration, VersionTrack}, + store::new_peer, +}; #[derive(Clone)] pub struct TestRouter(RaftRouter); @@ -81,6 +85,20 @@ impl TestRouter { block_on(sub.result()) } + pub fn wait_flush(&self, region_id: u64, timeout: Duration) -> bool { + let timer = Instant::now(); + while timer.elapsed() < timeout { + let (ch, sub) = FlushChannel::pair(); + let res = self.send(region_id, PeerMsg::WaitFlush(ch)); + match res { + Ok(_) => return block_on(sub.result()).is_some(), + Err(TrySendError::Disconnected(_)) => return false, + Err(TrySendError::Full(_)) => thread::sleep(Duration::from_millis(10)), + } + } + panic!("unable to flush {}", region_id); + } + pub fn wait_applied_to_current_term(&self, region_id: u64, timeout: Duration) { let mut now = Instant::now(); let deadline = now + timeout; @@ -104,9 +122,33 @@ impl TestRouter { region_id, res ); } + + pub fn new_request_for(&self, region_id: u64) -> RaftCmdRequest { + let meta = self + .must_query_debug_info(region_id, Duration::from_secs(1)) + .unwrap(); + let mut req = RaftCmdRequest::default(); + 
req.mut_header().set_region_id(region_id); + let epoch = req.mut_header().mut_region_epoch(); + let epoch_meta = &meta.region_state.epoch; + epoch.set_version(epoch_meta.version); + epoch.set_conf_ver(epoch_meta.conf_ver); + let target_peer = *meta + .region_state + .peers + .iter() + .find(|p| p.id == meta.raft_status.id) + .unwrap(); + let mut peer = new_peer(target_peer.store_id, target_peer.id); + peer.role = target_peer.role.into(); + req.mut_header().set_peer(peer); + req.mut_header().set_term(meta.raft_status.hard_state.term); + req + } } pub struct RunningState { + store_id: u64, pub raft_engine: RaftTestEngine, pub factory: Arc, pub system: StoreSystem, @@ -178,6 +220,7 @@ impl RunningState { .unwrap(); let state = Self { + store_id, raft_engine, factory, system, @@ -203,8 +246,7 @@ pub struct TestNode { } impl TestNode { - fn with_pd(pd_server: &test_pd::Server) -> TestNode { - let logger = slog_global::borrow_global().new(o!()); + fn with_pd(pd_server: &test_pd::Server, logger: Logger) -> TestNode { let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); let path = TempDir::new().unwrap(); @@ -245,6 +287,10 @@ impl TestNode { pub fn running_state(&self) -> Option<&RunningState> { self.running_state.as_ref() } + + pub fn id(&self) -> u64 { + self.running_state().unwrap().store_id + } } impl Drop for TestNode { @@ -319,6 +365,7 @@ pub struct Cluster { nodes: Vec, receivers: Vec>, routers: Vec, + logger: Logger, } impl Default for Cluster { @@ -334,11 +381,13 @@ impl Cluster { pub fn with_node_count(count: usize, config: Option) -> Self { let pd_server = test_pd::Server::new(1); + let logger = slog_global::borrow_global().new(o!()); let mut cluster = Cluster { pd_server, nodes: vec![], receivers: vec![], routers: vec![], + logger, }; let mut cfg = if let Some(config) = config { config @@ -347,7 +396,7 @@ impl Cluster { }; disable_all_auto_ticks(&mut cfg); for _ in 1..=count { - let mut node = TestNode::with_pd(&cluster.pd_server); + let mut 
node = TestNode::with_pd(&cluster.pd_server, cluster.logger.clone()); let (tx, rx) = new_test_transport(); let router = node.start(Arc::new(VersionTrack::new(cfg.clone())), tx); cluster.nodes.push(node); @@ -369,4 +418,42 @@ impl Cluster { pub fn router(&self, offset: usize) -> TestRouter { self.routers[offset].clone() } + + /// Send messages and wait for side effects are all handled. + #[allow(clippy::vec_box)] + pub fn dispatch(&self, region_id: u64, mut msgs: Vec>) { + let mut regions = HashSet::default(); + regions.insert(region_id); + loop { + for msg in msgs.drain(..) { + let offset = match self + .nodes + .iter() + .position(|n| n.id() == msg.get_to_peer().get_store_id()) + { + Some(offset) => offset, + None => { + debug!(self.logger, "failed to find node"; "message" => ?msg); + continue; + } + }; + regions.insert(msg.get_region_id()); + if let Err(e) = self.routers[offset].send_raft_message(msg) { + debug!(self.logger, "failed to send raft message"; "err" => ?e); + } + } + for (router, rx) in self.routers.iter().zip(&self.receivers) { + for region_id in ®ions { + router.wait_flush(*region_id, Duration::from_secs(3)); + } + while let Ok(msg) = rx.try_recv() { + msgs.push(Box::new(msg)); + } + } + regions.clear(); + if msgs.is_empty() { + return; + } + } + } } diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index db37c7cbf64..50fb5c4e16a 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -7,6 +7,7 @@ mod cluster; mod test_basic_write; +mod test_conf_change; mod test_life; mod test_read; mod test_status; diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index ce775982686..7c8bdb369a1 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -5,7 
+5,7 @@ use std::{assert_matches::assert_matches, time::Duration}; use engine_traits::{OpenOptions, Peekable, TabletFactory}; use futures::executor::block_on; use kvproto::{ - raft_cmdpb::{CmdType, RaftCmdRequest, Request}, + raft_cmdpb::{CmdType, Request}, raft_serverpb::RaftMessage, }; use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; @@ -19,12 +19,7 @@ use crate::cluster::Cluster; fn test_basic_write() { let cluster = Cluster::default(); let router = cluster.router(0); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(2); - let epoch = req.mut_header().mut_region_epoch(); - epoch.set_version(INIT_EPOCH_VER); - epoch.set_conf_ver(INIT_EPOCH_CONF_VER); - req.mut_header().set_peer(new_peer(1, 3)); + let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); put_req.mut_put().set_key(b"key".to_vec()); @@ -119,12 +114,7 @@ fn test_basic_write() { fn test_put_delete() { let cluster = Cluster::default(); let router = cluster.router(0); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(2); - let epoch = req.mut_header().mut_region_epoch(); - epoch.set_version(INIT_EPOCH_VER); - epoch.set_conf_ver(INIT_EPOCH_CONF_VER); - req.mut_header().set_peer(new_peer(1, 3)); + let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); put_req.mut_put().set_key(b"key".to_vec()); diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs new file mode 100644 index 00000000000..f9479786a7b --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -0,0 +1,69 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use kvproto::raft_cmdpb::AdminCmdType; +use raft::prelude::ConfChangeType; +use tikv_util::store::new_learner_peer; + +use crate::cluster::Cluster; + +#[test] +fn test_simple_change() { + let cluster = Cluster::with_node_count(2, None); + let router0 = cluster.router(0); + let mut req = router0.new_request_for(2); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer.clone()); + let resp = router0.command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = router0 + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); + + // So heartbeat will create a learner. 
+ cluster.dispatch(2, vec![]); + let router1 = cluster.router(1); + let meta = router1 + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, 10, "{:?}", meta); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!( + meta.raft_status.soft_state.leader_id, + req.get_header().get_peer().get_id() + ); + + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let resp = router0.command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = router0 + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer]); + // TODO: check if the peer is removed once life trace is implemented or + // snapshot is implemented. +} diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 9f3c5c2c03a..4f49757085f 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use futures::executor::block_on; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, ReadIndexRequest, Request, StatusCmdType}; +use kvproto::raft_cmdpb::{CmdType, Request}; use raftstore_v2::router::PeerMsg; use tikv_util::{config::ReadableDuration, store::new_peer}; use txn_types::WriteBatchFlags; @@ -16,25 +16,10 @@ fn test_read_index() { let router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_status_request() - .set_cmd_type(StatusCmdType::RegionDetail); - let res = router.query(region_id, req.clone()).unwrap(); - let status_resp = res.response().unwrap().get_status_response(); - let detail = status_resp.get_region_detail(); - let region = detail.get_region().clone(); - - let read_index_req = ReadIndexRequest::default(); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_term(7); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.get_region_epoch().clone()); + let mut req = router.new_request_for(region_id); let mut request_inner = Request::default(); request_inner.set_cmd_type(CmdType::Snap); - request_inner.set_read_index(read_index_req); + request_inner.mut_read_index(); req.mut_requests().push(request_inner); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); @@ -54,11 +39,7 @@ fn test_read_index() { std::thread::sleep(std::time::Duration::from_millis(200)); let read_req = req.clone(); // the read lease should be expired and renewed by write - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.get_region_epoch().clone()); + let mut req = router.new_request_for(region_id); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); 
put_req.mut_put().set_key(b"key".to_vec()); @@ -80,21 +61,7 @@ fn test_snap_without_read_index() { let router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_status_request() - .set_cmd_type(StatusCmdType::RegionDetail); - let res = router.query(region_id, req.clone()).unwrap(); - let status_resp = res.response().unwrap().get_status_response(); - let detail = status_resp.get_region_detail(); - let mut region = detail.get_region().clone(); - - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_term(6); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.take_region_epoch()); + let mut req = router.new_request_for(region_id); let mut request_inner = Request::default(); request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); @@ -126,21 +93,7 @@ fn test_query_with_write_cmd() { let router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_status_request() - .set_cmd_type(StatusCmdType::RegionDetail); - let res = router.query(region_id, req.clone()).unwrap(); - let status_resp = res.response().unwrap().get_status_response(); - let detail = status_resp.get_region_detail(); - let mut region = detail.get_region().clone(); - - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_term(6); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.take_region_epoch()); + let mut req = router.new_request_for(2); for write_cmd in [ CmdType::Prewrite, @@ -157,6 +110,7 @@ fn test_query_with_write_cmd() { assert!(resp.is_none()); let error_resp = res.response().unwrap(); 
assert!(error_resp.get_header().has_error()); + req.clear_requests(); } } @@ -166,21 +120,7 @@ fn test_snap_with_invalid_parameter() { let router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_status_request() - .set_cmd_type(StatusCmdType::RegionDetail); - let res = router.query(region_id, req.clone()).unwrap(); - let status_resp = res.response().unwrap().get_status_response(); - let detail = status_resp.get_region_detail(); - let mut region = detail.get_region().clone(); - let mut region_epoch = region.take_region_epoch(); - - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_term(6); - req.mut_header().set_region_id(region_id); - req.mut_header().set_region_epoch(region_epoch.clone()); + let mut req = router.new_request_for(region_id); let mut request_inner = Request::default(); request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); @@ -217,10 +157,11 @@ fn test_snap_with_invalid_parameter() { // run again with invalid region_epoch let mut invalid_req = req.clone(); - region_epoch.set_version(region_epoch.get_version() + 1); + let invalid_ver = req.get_header().get_region_epoch().get_version() + 1; invalid_req .mut_header() - .set_region_epoch(region_epoch.clone()); + .mut_region_epoch() + .set_version(invalid_ver); let res = router.query(region_id, invalid_req).unwrap(); let error_resp = res.response().unwrap(); assert!(error_resp.get_header().has_error()); @@ -232,21 +173,7 @@ fn test_local_read() { let mut router = cluster.router(0); std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_status_request() - .set_cmd_type(StatusCmdType::RegionDetail); - let res = router.query(region_id, req.clone()).unwrap(); 
- let status_resp = res.response().unwrap().get_status_response(); - let detail = status_resp.get_region_detail(); - let mut region = detail.get_region().clone(); - - let mut req = RaftCmdRequest::default(); - req.mut_header().set_peer(new_peer(1, 3)); - req.mut_header().set_term(6); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.take_region_epoch()); + let mut req = router.new_request_for(region_id); let mut request_inner = Request::default(); request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 522b68e2f09..f67c3a28800 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -31,9 +31,8 @@ use kvproto::{ metapb::{self, PeerRole}, pdpb::{self, PeerStats}, raft_cmdpb::{ - self, AdminCmdType, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - PutRequest, RaftCmdRequest, RaftCmdResponse, Request, TransferLeaderRequest, - TransferLeaderResponse, + self, AdminCmdType, AdminResponse, CmdType, CommitMergeRequest, PutRequest, RaftCmdRequest, + RaftCmdResponse, Request, TransferLeaderRequest, TransferLeaderResponse, }, raft_serverpb::{ ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftApplyState, RaftMessage, @@ -47,11 +46,10 @@ use pd_client::{BucketStat, INVALID_ID}; use protobuf::Message; use raft::{ self, - eraftpb::{self, ConfChangeType, Entry, EntryType, MessageType}, - Changer, GetEntriesContext, LightReady, ProgressState, ProgressTracker, RawNode, Ready, - SnapshotStatus, StateRole, INVALID_INDEX, NO_LIMIT, + eraftpb::{self, Entry, EntryType, MessageType}, + GetEntriesContext, LightReady, ProgressState, RawNode, Ready, SnapshotStatus, StateRole, + INVALID_INDEX, NO_LIMIT, }; -use raft_proto::ConfChangeI; use rand::seq::SliceRandom; use smallvec::SmallVec; use tikv_alloc::trace::TraceEvent; @@ -3696,138 +3694,6 @@ where 
self.proposals.push(p); } - // TODO: set higher election priority of voter/incoming voter than demoting - // voter - /// Validate the `ConfChange` requests and check whether it's safe to - /// propose these conf change requests. - /// It's safe iff at least the quorum of the Raft group is still healthy - /// right after all conf change is applied. - /// If 'allow_remove_leader' is false then the peer to be removed should - /// not be the leader. - fn check_conf_change( - &mut self, - ctx: &mut PollContext, - change_peers: &[ChangePeerRequest], - cc: &impl ConfChangeI, - ) -> Result<()> { - // Check whether current joint state can handle this request - let mut after_progress = self.check_joint_state(cc)?; - let current_progress = self.raft_group.status().progress.unwrap().clone(); - let kind = ConfChangeKind::confchange_kind(change_peers.len()); - - if kind == ConfChangeKind::LeaveJoint { - if self.peer.get_role() == PeerRole::DemotingVoter && !self.is_force_leader() { - return Err(box_err!( - "{} ignore leave joint command that demoting leader", - self.tag - )); - } - // Leaving joint state, skip check - return Ok(()); - } - - // Check whether this request is valid - let mut check_dup = HashSet::default(); - let mut only_learner_change = true; - let current_voter = current_progress.conf().voters().ids(); - for cp in change_peers.iter() { - let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); - match (change_type, peer.get_role()) { - (ConfChangeType::RemoveNode, PeerRole::Voter) if kind != ConfChangeKind::Simple => { - return Err(box_err!( - "{} invalid conf change request: {:?}, can not remove voter directly", - self.tag, - cp - )); - } - (ConfChangeType::RemoveNode, _) - | (ConfChangeType::AddNode, PeerRole::Voter) - | (ConfChangeType::AddLearnerNode, PeerRole::Learner) => {} - _ => { - return Err(box_err!( - "{} invalid conf change request: {:?}", - self.tag, - cp - )); - } - } - - if !check_dup.insert(peer.get_id()) { - return Err(box_err!( - "{} 
invalid conf change request, have multiple commands for the same peer {}", - self.tag, - peer.get_id() - )); - } - - if peer.get_id() == self.peer_id() - && (change_type == ConfChangeType::RemoveNode - // In Joint confchange, the leader is allowed to be DemotingVoter - || (kind == ConfChangeKind::Simple - && change_type == ConfChangeType::AddLearnerNode)) - && !ctx.cfg.allow_remove_leader() - { - return Err(box_err!( - "{} ignore remove leader or demote leader", - self.tag - )); - } - - if current_voter.contains(peer.get_id()) || change_type == ConfChangeType::AddNode { - only_learner_change = false; - } - } - - // Multiple changes that only effect learner will not product `IncommingVoter` - // or `DemotingVoter` after apply, but raftstore layer and PD rely on these - // roles to detect joint state - if kind != ConfChangeKind::Simple && only_learner_change { - return Err(box_err!( - "{} invalid conf change request, multiple changes that only effect learner", - self.tag - )); - } - - let promoted_commit_index = after_progress.maximal_committed_index().0; - if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. - || promoted_commit_index >= self.get_store().truncated_index() || self.force_leader.is_some() - { - return Ok(()); - } - - PEER_ADMIN_CMD_COUNTER_VEC - .with_label_values(&["conf_change", "reject_unsafe"]) - .inc(); - - // Waking it up to replicate logs to candidate. 
- self.should_wake_up = true; - Err(box_err!( - "{} unsafe to perform conf change {:?}, before: {:?}, after: {:?}, truncated index {}, promoted commit index {}", - self.tag, - change_peers, - current_progress.conf().to_conf_state(), - after_progress.conf().to_conf_state(), - self.get_store().truncated_index(), - promoted_commit_index - )) - } - - /// Check if current joint state can handle this confchange - fn check_joint_state(&mut self, cc: &impl ConfChangeI) -> Result { - let cc = &cc.as_v2(); - let mut prs = self.raft_group.status().progress.unwrap().clone(); - let mut changer = Changer::new(&prs); - let (cfg, changes) = if cc.leave_joint() { - changer.leave_joint()? - } else if let Some(auto_leave) = cc.enter_joint() { - changer.enter_joint(auto_leave, &cc.changes)? - } else { - changer.simple(&cc.changes)? - }; - prs.apply_conf(cfg, changes, self.raft_group.raft.raft_log.last_index()); - Ok(prs) - } - pub fn transfer_leader(&mut self, peer: &metapb::Peer) { info!( "transfer leader"; @@ -4696,7 +4562,16 @@ where let cc = change_peer.to_confchange(data); let changes = change_peer.get_change_peers(); - self.check_conf_change(ctx, changes.as_ref(), &cc)?; + // Because the group is always woken up when there is log gap, so no need + // to wake it up again when command is aborted by log gap. 
+ util::check_conf_change( + &ctx.cfg, + &self.raft_group, + &self.peer, + changes.as_ref(), + &cc, + self.is_force_leader(), + )?; ctx.raft_metrics.propose.conf_change.inc(); // TODO: use local histogram metrics diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 9af541cbfd9..0370c7604ec 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -2,7 +2,10 @@ use std::collections::HashMap; -use kvproto::{metapb::PeerRole, raft_serverpb}; +use kvproto::{ + metapb::{self, PeerRole}, + raft_serverpb, +}; use raft::{Progress, ProgressState, StateRole, Status}; use serde::{Deserialize, Serialize}; @@ -127,7 +130,7 @@ impl<'a> From> for RaftStatus { } } -#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Serialize, Deserialize)] pub enum RaftPeerRole { Voter, Learner, @@ -146,6 +149,24 @@ impl From for RaftPeerRole { } } +impl From for PeerRole { + fn from(role: RaftPeerRole) -> Self { + match role { + RaftPeerRole::Voter => PeerRole::Voter, + RaftPeerRole::Learner => PeerRole::Learner, + RaftPeerRole::IncomingVoter => PeerRole::IncomingVoter, + RaftPeerRole::DemotingVoter => PeerRole::DemotingVoter, + } + } +} + +impl PartialEq for RaftPeerRole { + fn eq(&self, other: &PeerRole) -> bool { + let r: RaftPeerRole = (*other).into(); + *self == r + } +} + #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct Epoch { pub conf_ver: u64, @@ -159,6 +180,13 @@ pub struct RegionPeer { pub role: RaftPeerRole, } +impl PartialEq for RegionPeer { + #[inline] + fn eq(&self, other: &metapb::Peer) -> bool { + self.id == other.id && self.store_id == other.store_id && self.role == other.role + } +} + #[derive(Debug, Copy, Clone, Serialize, Deserialize)] pub struct RegionMergeState { pub min_index: u64, diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 
a4b48e4ba37..a49d4707eb3 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -14,6 +14,7 @@ use std::{ u64, }; +use collections::HashSet; use engine_traits::KvEngine; use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, @@ -24,14 +25,14 @@ use kvproto::{ use protobuf::{self, Message}; use raft::{ eraftpb::{self, ConfChangeType, ConfState, MessageType}, - INVALID_INDEX, + Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; use tikv_util::{box_err, debug, info, store::region, time::monotonic_raw_now, Either}; use time::{Duration, Timespec}; use txn_types::{TimeStamp, WriteBatchFlags}; -use super::peer_storage; +use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; use crate::{coprocessor::CoprocessorHost, Error, Result}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -765,7 +766,7 @@ impl< } } -#[derive(PartialEq, Debug)] +#[derive(PartialEq, Debug, Clone, Copy)] pub enum ConfChangeKind { // Only contains one configuration change Simple, @@ -847,6 +848,118 @@ impl<'a> ChangePeerI for &'a ChangePeerV2Request { } } +/// Check if the conf change request is valid. +/// +/// The function will try to keep operation safe. In some edge cases (or +/// tests), we may not care about safety. In this case, `ignore_safety` +/// can be set to true. +/// +/// Make sure the peer can serve read and write when ignore safety, otherwise +/// it may produce stale result or cause unavailability. +pub fn check_conf_change( + cfg: &Config, + node: &RawNode, + leader: &metapb::Peer, + change_peers: &[ChangePeerRequest], + cc: &impl ConfChangeI, + ignore_safety: bool, +) -> Result<()> { + let current_progress = node.status().progress.unwrap().clone(); + let mut after_progress = current_progress.clone(); + let cc_v2 = cc.as_v2(); + let mut changer = Changer::new(&after_progress); + let (conf, changes) = if cc_v2.leave_joint() { + changer.leave_joint()? 
+ } else if let Some(auto_leave) = cc_v2.enter_joint() { + changer.enter_joint(auto_leave, &cc_v2.changes)? + } else { + changer.simple(&cc_v2.changes)? + }; + after_progress.apply_conf(conf, changes, node.raft.raft_log.last_index()); + + // Because the conf change can be applied successfully above, so the current + // raft group state must matches the command. For example, won't call leave + // joint on a non joint state. + let kind = ConfChangeKind::confchange_kind(change_peers.len()); + if kind == ConfChangeKind::LeaveJoint { + if ignore_safety || leader.get_role() != PeerRole::DemotingVoter { + return Ok(()); + } + return Err(box_err!("ignore leave joint command that demoting leader")); + } + + let mut check_dup = HashSet::default(); + let mut only_learner_change = true; + let current_voter = current_progress.conf().voters().ids(); + for cp in change_peers { + let (change_type, peer) = (cp.get_change_type(), cp.get_peer()); + match (change_type, peer.get_role()) { + (ConfChangeType::RemoveNode, PeerRole::Voter) if kind != ConfChangeKind::Simple => { + return Err(box_err!("{:?}: can not remove voter directly", cp)); + } + (ConfChangeType::RemoveNode, _) + | (ConfChangeType::AddNode, PeerRole::Voter) + | (ConfChangeType::AddLearnerNode, PeerRole::Learner) => {} + _ => { + return Err(box_err!("{:?}: op not match role", cp)); + } + } + + if !check_dup.insert(peer.get_id()) { + return Err(box_err!( + "have multiple commands for the same peer {}", + peer.get_id() + )); + } + + if peer.get_id() == leader.get_id() + && (change_type == ConfChangeType::RemoveNode + // In Joint confchange, the leader is allowed to be DemotingVoter + || (kind == ConfChangeKind::Simple + && change_type == ConfChangeType::AddLearnerNode)) + && !cfg.allow_remove_leader() + { + return Err(box_err!("ignore remove leader or demote leader")); + } + + if current_voter.contains(peer.get_id()) || change_type == ConfChangeType::AddNode { + only_learner_change = false; + } + } + + // Multiple changes 
that only effect learner will not product `IncommingVoter` + // or `DemotingVoter` after apply, but raftstore layer and PD rely on these + // roles to detect joint state + if kind != ConfChangeKind::Simple && only_learner_change { + return Err(box_err!("multiple changes that only effect learner")); + } + + if !ignore_safety { + let promoted_commit_index = after_progress.maximal_committed_index().0; + let first_index = node.raft.raft_log.first_index(); + if current_progress.is_singleton() // It's always safe if there is only one node in the cluster. + || promoted_commit_index + 1 >= first_index + { + return Ok(()); + } + + PEER_ADMIN_CMD_COUNTER_VEC + .with_label_values(&["conf_change", "reject_unsafe"]) + .inc(); + + Err(box_err!( + "{:?}: before: {:?}, after: {:?}, first index {}, promoted commit index {}", + change_peers, + current_progress.conf().to_conf_state(), + after_progress.conf().to_conf_state(), + first_index, + promoted_commit_index + )) + } else { + Ok(()) + } +} + pub struct MsgType<'a>(pub &'a RaftMessage); impl Display for MsgType<'_> { diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index 3eddc7ce40d..d20249bc53f 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -189,7 +189,7 @@ fn test_check_conf_change() { res.get_header() .get_error() .get_message() - .contains("unsafe to perform conf change"), + .contains("promoted commit index"), "{:?}", res ); From 2c083a41e1cae9b9222b8cd3f57675c7c37fffdc Mon Sep 17 00:00:00 2001 From: cosven Date: Tue, 25 Oct 2022 17:15:56 +0800 Subject: [PATCH 284/676] tests: fix one flaky testcase (#13602) close tikv/tikv#13603 fix one flaky testcase Signed-off-by: cosven Co-authored-by: qupeng --- tests/integrations/raftstore/test_snap_recovery.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integrations/raftstore/test_snap_recovery.rs 
b/tests/integrations/raftstore/test_snap_recovery.rs index 2db42d68e3f..70f9ae8d97c 100644 --- a/tests/integrations/raftstore/test_snap_recovery.rs +++ b/tests/integrations/raftstore/test_snap_recovery.rs @@ -16,6 +16,8 @@ fn test_check_pending_admin() { cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + // write a key to let leader stuck. cluster.must_put(b"k", b"v"); must_get_equal(&cluster.get_engine(1), b"k", b"v"); From 7fbbcdcee6f7381a9c048030c8f95fac055c42bb Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 25 Oct 2022 17:29:56 +0800 Subject: [PATCH 285/676] cdc, resolved_ts: add leadership resolver (#13657) close tikv/tikv#13656 cdc, resolved_ts: add leadership resolver Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/Cargo.toml | 1 + components/cdc/src/endpoint.rs | 186 +++---- .../cdc/tests/failpoints/test_endpoint.rs | 15 +- components/raftstore/src/store/util.rs | 78 +-- components/resolved_ts/src/advance.rs | 485 ++++++++++-------- components/resolved_ts/src/endpoint.rs | 92 ++-- components/test_raftstore/src/server.rs | 2 + 7 files changed, 465 insertions(+), 394 deletions(-) diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 27ce81c57b4..62ef4cc29f5 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -26,6 +26,7 @@ portable = ["tikv/portable"] sse = ["tikv/sse"] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] +pprof-fp = ["tikv/pprof-fp"] [dependencies] api_version = { workspace = true } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 7542bb1bfc8..614e282a5d9 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -24,33 +24,30 @@ use kvproto::{ }, kvrpcpb::ApiVersion, metapb::Region, - tikvpb::TikvClient, }; use online_config::{ConfigChange, OnlineConfig}; use pd_client::{Feature, PdClient}; use raftstore::{ coprocessor::{CmdBatch, ObserveId}, router::RaftStoreRouter, - store::{ 
- fsm::{ChangeObserver, StoreMeta}, - msg::{Callback, SignificantMsg}, - RegionReadProgressRegistry, - }, + store::fsm::{ChangeObserver, StoreMeta}, }; -use resolved_ts::Resolver; +use resolved_ts::{LeadershipResolver, Resolver}; use security::SecurityManager; use tikv::{config::CdcConfig, storage::Statistics}; use tikv_util::{ - debug, error, impl_display_as_debug, info, + debug, defer, error, impl_display_as_debug, info, + mpsc::bounded, + slow_log, sys::thread::ThreadBuildWrapper, - time::Limiter, + time::{Limiter, SlowTimer}, timer::SteadyTimer, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, }; use tokio::{ runtime::{Builder, Runtime}, - sync::{Mutex, Semaphore}, + sync::Semaphore, }; use txn_types::{TimeStamp, TxnExtra, TxnExtraScheduler}; @@ -155,7 +152,9 @@ pub enum Task { region: Region, resolver: Resolver, }, - RegisterMinTsEvent, + RegisterMinTsEvent { + leader_resolver: LeadershipResolver, + }, // The result of ChangeCmd should be returned from CDC Endpoint to ensure // the downstream switches to Normal after the previous commands was sunk. InitDownstream { @@ -223,7 +222,7 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("region_id", ®ion.get_id()) .finish(), - Task::RegisterMinTsEvent => de.field("type", &"register_min_ts").finish(), + Task::RegisterMinTsEvent { .. } => de.field("type", &"register_min_ts").finish(), Task::InitDownstream { ref region_id, ref downstream_id, @@ -348,12 +347,6 @@ pub struct Endpoint { old_value_cache: OldValueCache, resolved_region_heap: ResolvedRegionHeap, - // Check leader - // store_id -> client - tikv_clients: Arc>>, - env: Arc, - security_mgr: Arc, - region_read_progress: RegionReadProgressRegistry, causal_ts_provider: Option>, // Metrics and logging. 
@@ -416,10 +409,17 @@ impl, E: KvEngine> Endpoint { let max_scan_batch_size = 1024; let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - let ep = Endpoint { - cluster_id, + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + store_meta.lock().unwrap().store_id.unwrap(), + pd_client.clone(), env, security_mgr, + region_read_progress, + store_resolver_gc_interval, + ); + let ep = Endpoint { + cluster_id, capture_regions: HashMap::default(), connections: HashMap::default(), scheduler, @@ -447,14 +447,13 @@ impl, E: KvEngine> Endpoint { resolved_region_count: 0, unresolved_region_count: 0, sink_memory_quota, - tikv_clients: Arc::new(Mutex::new(HashMap::default())), - region_read_progress, + // store_resolver, // Log the first resolved ts warning. warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, current_ts: TimeStamp::zero(), causal_ts_provider, }; - ep.register_min_ts_event(); + ep.register_min_ts_event(leader_resolver); ep } @@ -997,24 +996,21 @@ impl, E: KvEngine> Endpoint { let _ = downstream.sink_event(resolved_ts_event, force_send); } - fn register_min_ts_event(&self) { + fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver) { let timeout = self.timer.delay(self.config.min_ts_interval.0); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let raft_router = self.raft_router.clone(); - let regions: Vec<(u64, ObserveId)> = self + let regions: Vec = self .capture_regions .iter() - .map(|(region_id, delegate)| (*region_id, delegate.handle.id)) + .map(|(region_id, _)| *region_id) .collect(); let cm: ConcurrencyManager = self.concurrency_manager.clone(); - let env = self.env.clone(); - let security_mgr = self.security_mgr.clone(); - let store_meta = self.store_meta.clone(); - let tikv_clients = self.tikv_clients.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; - let 
region_read_progress = self.region_read_progress.clone(); let causal_ts_provider = self.causal_ts_provider.clone(); + // We use channel to deliver leader_resolver in async block. + let (leader_resolver_tx, leader_resolver_rx) = bounded(1); let fut = async move { let _ = timeout.compat().await; @@ -1043,37 +1039,37 @@ impl, E: KvEngine> Endpoint { min_ts_min_lock = min_mem_lock_ts; } - match scheduler.schedule(Task::RegisterMinTsEvent) { - Ok(_) | Err(ScheduleError::Stopped(_)) => (), - // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not - // advance normally. - Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), - } + let slow_timer = SlowTimer::default(); + defer!({ + slow_log!(T slow_timer, "cdc resolve region leadership"); + if let Ok(leader_resolver) = leader_resolver_rx.try_recv() { + match scheduler.schedule(Task::RegisterMinTsEvent { leader_resolver }) { + Ok(_) | Err(ScheduleError::Stopped(_)) => (), + // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not + // advance normally. + Err(err) => panic!("failed to regiester min ts event, error: {:?}", err), + } + } else { + // During shutdown, tso runtime drops future immediately, + // leader_resolver may be lost when this future drops before + // delivering leader_resolver. + warn!("cdc leader resolver is lost, are we shutdown?"); + } + }); + // Check region peer leadership, make sure they are leaders. 
let gate = pd_client.feature_gate(); - let regions = if hibernate_regions_compatible && gate.can_enable(FEATURE_RESOLVED_TS_STORE) { CDC_RESOLVED_TS_ADVANCE_METHOD.set(1); - let regions = regions - .into_iter() - .map(|(region_id, _)| region_id) - .collect(); - resolved_ts::region_resolved_ts_store( - regions, - store_meta, - region_read_progress, - pd_client, - security_mgr, - env, - tikv_clients, - min_ts, - ) - .await + leader_resolver.resolve(regions, min_ts).await } else { CDC_RESOLVED_TS_ADVANCE_METHOD.set(0); - Self::region_resolved_ts_raft(regions, &scheduler, raft_router, min_ts).await + leader_resolver + .resolve_by_raft(regions, min_ts, raft_router) + .await }; + leader_resolver_tx.send(leader_resolver).unwrap(); if !regions.is_empty() { match scheduler.schedule(Task::MinTs { @@ -1082,7 +1078,7 @@ impl, E: KvEngine> Endpoint { current_ts: min_ts_pd, }) { Ok(_) | Err(ScheduleError::Stopped(_)) => (), - // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not + // Must schedule `MinTS` event otherwise resolved ts can not // advance normally. Err(err) => panic!("failed to schedule min ts event, error: {:?}", err), } @@ -1098,54 +1094,6 @@ impl, E: KvEngine> Endpoint { self.tso_worker.spawn(fut); } - async fn region_resolved_ts_raft( - regions: Vec<(u64, ObserveId)>, - scheduler: &Scheduler, - raft_router: T, - min_ts: TimeStamp, - ) -> Vec { - // TODO: send a message to raftstore would consume too much cpu time, - // try to handle it outside raftstore. 
- let regions: Vec<_> = regions - .iter() - .copied() - .map(|(region_id, observe_id)| { - let scheduler_clone = scheduler.clone(); - let raft_router_clone = raft_router.clone(); - async move { - let (tx, rx) = tokio::sync::oneshot::channel(); - if let Err(e) = raft_router_clone.significant_send( - region_id, - SignificantMsg::LeaderCallback(Callback::read(Box::new(move |resp| { - let resp = if resp.response.get_header().has_error() { - None - } else { - Some(region_id) - }; - if tx.send(resp).is_err() { - error!("cdc send tso response failed"; "region_id" => region_id); - } - }))), - ) { - warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); - let deregister = Deregister::Delegate { - observe_id, - region_id, - err: Error::request(e.into()), - }; - if let Err(e) = scheduler_clone.schedule(Task::Deregister(deregister)) { - error!("cdc schedule cdc task failed"; "error" => ?e); - } - return None; - } - rx.await.unwrap_or(None) - } - }) - .collect(); - let resps = futures::future::join_all(regions).await; - resps.into_iter().flatten().collect::>() - } - fn on_open_conn(&mut self, conn: Conn) { self.connections.insert(conn.get_id(), conn); } @@ -1180,7 +1128,9 @@ impl, E: KvEngine> Runnable for Endpoint { old_value_cb, } => self.on_multi_batch(multi, old_value_cb), Task::OpenConn { conn } => self.on_open_conn(conn), - Task::RegisterMinTsEvent => self.register_min_ts_event(), + Task::RegisterMinTsEvent { + leader_resolver: store_resolver, + } => self.register_min_ts_event(store_resolver), Task::InitDownstream { region_id, downstream_id, @@ -1320,6 +1270,7 @@ mod tests { raft_router: MockRaftStoreRouter, task_rx: ReceiverWrapper, raft_rxs: HashMap>>, + leader_resolver: Option, } impl TestEndpointSuite { @@ -1384,11 +1335,26 @@ mod tests { ) -> TestEndpointSuite { let (task_sched, task_rx) = dummy_scheduler(); let raft_router = MockRaftStoreRouter::new(); + let mut store_meta = StoreMeta::new(0); + store_meta.store_id = Some(1); + let 
region_read_progress = store_meta.region_read_progress.clone(); + let pd_client = Arc::new(TestPdClient::new(0, true)); + let env = Arc::new(Environment::new(1)); + let security_mgr = Arc::new(SecurityManager::default()); + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + 1, + pd_client.clone(), + env.clone(), + security_mgr.clone(), + region_read_progress, + store_resolver_gc_interval, + ); let ep = Endpoint::new( DEFAULT_CLUSTER_ID, cfg, api_version, - Arc::new(TestPdClient::new(0, true)), + pd_client, task_sched.clone(), raft_router.clone(), engine.unwrap_or_else(|| { @@ -1399,10 +1365,10 @@ mod tests { .unwrap() }), CdcObserver::new(task_sched), - Arc::new(StdMutex::new(StoreMeta::new(0))), + Arc::new(StdMutex::new(store_meta)), ConcurrencyManager::new(1.into()), - Arc::new(Environment::new(1)), - Arc::new(SecurityManager::default()), + env, + security_mgr, MemoryQuota::new(usize::MAX), causal_ts_provider, ); @@ -1412,6 +1378,7 @@ mod tests { raft_router, task_rx, raft_rxs: HashMap::default(), + leader_resolver: Some(leader_resolver), } } @@ -1903,7 +1870,8 @@ mod tests { let start_ts = block_on(ts_provider.async_get_ts()).unwrap(); let mut suite = mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, Some(ts_provider.clone())); - suite.run(Task::RegisterMinTsEvent); + let leader_resolver = suite.leader_resolver.take().unwrap(); + suite.run(Task::RegisterMinTsEvent { leader_resolver }); suite .task_rx .recv_timeout(Duration::from_millis(1500)) diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 31c302c3c14..6e208ccac90 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -501,7 +501,20 @@ fn test_cdc_rawkv_resolved_ts() { sleep_ms(100); let event = receive_event(true).resolved_ts.unwrap(); - assert_eq!(ts.next(), TimeStamp::from(event.ts)); + assert!( + ts.next() >= 
TimeStamp::from(event.ts), + "{} {}", + ts, + TimeStamp::from(event.ts) + ); + // Receive again to make sure resolved ts <= ongoing request's ts. + let event = receive_event(true).resolved_ts.unwrap(); + assert!( + ts.next() >= TimeStamp::from(event.ts), + "{} {}", + ts, + TimeStamp::from(event.ts) + ); fail::remove(pause_write_fp); handle.join().unwrap(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index a49d4707eb3..61da5805727 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -9,7 +9,7 @@ use std::{ option::Option, sync::{ atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}, - Arc, Mutex, + Arc, Mutex, MutexGuard, }, u64, }; @@ -1051,7 +1051,7 @@ impl RegionReadProgressRegistry { ) -> Vec { let mut regions = Vec::with_capacity(leaders.len()); let registry = self.registry.lock().unwrap(); - for leader_info in leaders { + for leader_info in &leaders { let region_id = leader_info.get_region_id(); if let Some(rp) = registry.get(®ion_id) { if rp.consume_leader_info(leader_info, coprocessor) { @@ -1062,18 +1062,6 @@ impl RegionReadProgressRegistry { regions } - // Get the `LeaderInfo` of the requested regions - pub fn dump_leader_infos(&self, regions: &[u64]) -> HashMap, LeaderInfo)> { - let registry = self.registry.lock().unwrap(); - let mut info_map = HashMap::with_capacity(regions.len()); - for region_id in regions { - if let Some(rrp) = registry.get(region_id) { - info_map.insert(*region_id, rrp.dump_leader_info()); - } - } - info_map - } - /// Invoke the provided callback with the registry, an internal lock will /// hold while invoking the callback so it is important that *not* try /// to acquiring any lock inside the callback to avoid dead lock @@ -1196,14 +1184,14 @@ impl RegionReadProgress { // provided `LeaderInfo` is same as ours pub fn consume_leader_info( &self, - mut leader_info: LeaderInfo, + leader_info: &LeaderInfo, coprocessor: &CoprocessorHost, 
) -> bool { let mut core = self.core.lock().unwrap(); if leader_info.has_read_state() { // It is okay to update `safe_ts` without checking the `LeaderInfo`, the // `read_state` is guaranteed to be valid when it is published by the leader - let rs = leader_info.take_read_state(); + let rs = leader_info.get_read_state(); let (apply_index, ts) = (rs.get_applied_index(), rs.get_safe_ts()); if apply_index != 0 && ts != 0 && !core.discard { if let Some(ts) = core.update_safe_ts(apply_index, ts) { @@ -1224,23 +1212,11 @@ impl RegionReadProgress { // Dump the `LeaderInfo` and the peer list pub fn dump_leader_info(&self) -> (Vec, LeaderInfo) { - let mut leader_info = LeaderInfo::default(); let core = self.core.lock().unwrap(); - let read_state = { - // Get the latest `read_state` - let ReadState { idx, ts } = core.pending_items.back().unwrap_or(&core.read_state); - let mut rs = kvrpcpb::ReadState::default(); - rs.set_applied_index(*idx); - rs.set_safe_ts(*ts); - rs - }; - let li = &core.leader_info; - leader_info.set_peer_id(li.leader_id); - leader_info.set_term(li.leader_term); - leader_info.set_region_id(core.region_id); - leader_info.set_region_epoch(li.epoch.clone()); - leader_info.set_read_state(read_state); - (li.peers.clone(), leader_info) + ( + core.get_local_leader_info().peers.clone(), + core.get_leader_info(), + ) } pub fn update_leader_info(&self, peer_id: u64, term: u64, region: &Region) { @@ -1286,10 +1262,15 @@ impl RegionReadProgress { pub fn resolved_ts(&self) -> u64 { self.safe_ts() } + + // Dump the `LeaderInfo` and the peer list + pub fn get_core(&self) -> MutexGuard<'_, RegionReadProgressCore> { + self.core.lock().unwrap() + } } #[derive(Debug)] -struct RegionReadProgressCore { +pub struct RegionReadProgressCore { tag: String, region_id: u64, applied_index: u64, @@ -1336,6 +1317,14 @@ impl LocalLeaderInfo { peers: region.get_peers().to_vec(), } } + + pub fn get_peers(&self) -> &[Peer] { + &self.peers + } + + pub fn get_leader_id(&self) -> u64 { + 
self.leader_id + } } impl RegionReadProgressCore { @@ -1449,6 +1438,29 @@ impl RegionReadProgressCore { } self.pending_items.push_back(item); } + + pub fn get_leader_info(&self) -> LeaderInfo { + let mut leader_info = LeaderInfo::default(); + let read_state = { + // Get the latest `read_state` + let ReadState { idx, ts } = self.pending_items.back().unwrap_or(&self.read_state); + let mut rs = kvrpcpb::ReadState::default(); + rs.set_applied_index(*idx); + rs.set_safe_ts(*ts); + rs + }; + let li = &self.leader_info; + leader_info.set_peer_id(li.leader_id); + leader_info.set_term(li.leader_term); + leader_info.set_region_id(self.region_id); + leader_info.set_region_epoch(li.epoch.clone()); + leader_info.set_read_state(read_state); + leader_info + } + + pub fn get_local_leader_info(&self) -> &LocalLeaderInfo { + &self.leader_info + } } /// Represent the duration of all stages of raftstore recorded by one diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 190c4474711..35426f4861d 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -4,7 +4,7 @@ use std::{ ffi::CString, sync::{ atomic::{AtomicI32, Ordering}, - Arc, Mutex as StdMutex, + Arc, }, time::Duration, }; @@ -14,7 +14,7 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use fail::fail_point; use futures::{compat::Future01CompatExt, future::select_all, FutureExt, TryFutureExt}; -use grpcio::{ChannelBuilder, Environment, Error as GrpcError, RpcStatusCode}; +use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::{CheckLeaderRequest, LeaderInfo}, metapb::{Peer, PeerRole}, @@ -22,14 +22,24 @@ use kvproto::{ }; use pd_client::PdClient; use protobuf::Message; -use raftstore::store::{fsm::StoreMeta, util::RegionReadProgressRegistry}; +use raftstore::{ + router::RaftStoreRouter, + store::{ + msg::{Callback, SignificantMsg}, + util::RegionReadProgressRegistry, + }, +}; use 
security::SecurityManager; use tikv_util::{ - info, sys::thread::ThreadBuildWrapper, time::Instant, timer::SteadyTimer, worker::Scheduler, + info, + sys::thread::ThreadBuildWrapper, + time::{Instant, SlowTimer}, + timer::SteadyTimer, + worker::Scheduler, }; use tokio::{ runtime::{Builder, Runtime}, - sync::Mutex, + sync::{Mutex, Notify}, }; use txn_types::TimeStamp; @@ -38,8 +48,6 @@ use crate::{endpoint::Task, metrics::*, util}; const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s pub struct AdvanceTsWorker { - store_meta: Arc>, - region_read_progress: RegionReadProgressRegistry, pd_client: Arc, timer: SteadyTimer, worker: Runtime, @@ -47,21 +55,13 @@ pub struct AdvanceTsWorker { /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, - // store_id -> client - tikv_clients: Arc>>, - env: Arc, - security_mgr: Arc, } impl AdvanceTsWorker { pub fn new( pd_client: Arc, scheduler: Scheduler>, - store_meta: Arc>, - region_read_progress: RegionReadProgressRegistry, concurrency_manager: ConcurrencyManager, - env: Arc, - security_mgr: Arc, ) -> Self { let worker = Builder::new_multi_thread() .thread_name("advance-ts") @@ -72,33 +72,28 @@ impl AdvanceTsWorker { .build() .unwrap(); Self { - env, - security_mgr, scheduler, pd_client, worker, timer: SteadyTimer::default(), - store_meta, - region_read_progress, concurrency_manager, - tikv_clients: Arc::new(Mutex::new(HashMap::default())), } } } impl AdvanceTsWorker { - pub fn advance_ts_for_regions(&self, regions: Vec) { - if regions.is_empty() { - return; - } + // Advance ts asynchronously and register RegisterAdvanceEvent when its done. 
+ pub fn advance_ts_for_regions( + &self, + regions: Vec, + mut leader_resolver: LeadershipResolver, + advance_ts_interval: Duration, + cfg_update_notify: Arc, + ) { + let cm = self.concurrency_manager.clone(); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); - let cm: ConcurrencyManager = self.concurrency_manager.clone(); - let env = self.env.clone(); - let security_mgr = self.security_mgr.clone(); - let store_meta = self.store_meta.clone(); - let tikv_clients = self.tikv_clients.clone(); - let region_read_progress = self.region_read_progress.clone(); + let timeout = self.timer.delay(advance_ts_interval); let fut = async move { // Ignore get tso errors since we will retry every `advance_ts_interval`. @@ -115,211 +110,291 @@ impl AdvanceTsWorker { } } - let regions = region_resolved_ts_store( - regions, - store_meta, - region_read_progress, - pd_client, - security_mgr, - env, - tikv_clients, - min_ts, - ) - .await; - + let regions = leader_resolver.resolve(regions, min_ts).await; if !regions.is_empty() { - if let Err(e) = scheduler.schedule(Task::AdvanceResolvedTs { + if let Err(e) = scheduler.schedule(Task::ResolvedTsAdvanced { regions, ts: min_ts, }) { info!("failed to schedule advance event"; "err" => ?e); } } - }; - self.worker.spawn(fut); - } - pub fn register_next_event(&self, advance_ts_interval: Duration, cfg_version: usize) { - let scheduler = self.scheduler.clone(); - let timeout = self.timer.delay(advance_ts_interval); - let fut = async move { - let _ = timeout.compat().await; - if let Err(e) = scheduler.schedule(Task::RegisterAdvanceEvent { cfg_version }) { - info!("failed to schedule register advance event"; "err" => ?e); + futures::select! { + _ = timeout.compat().fuse() => (), + // Skip wait timeout if cfg is updated. + _ = cfg_update_notify.notified().fuse() => (), + }; + // NB: We must schedule the leader resolver even if there is no region, + // otherwise we can not advance resolved ts next time. 
+ if let Err(e) = scheduler.schedule(Task::AdvanceResolvedTs { leader_resolver }) { + error!("failed to schedule register advance event"; "err" => ?e); } }; self.worker.spawn(fut); } } -// Confirms leadership of region peer before trying to advance resolved ts. -// This function broadcasts a special message to all stores, gets the leader id -// of them to confirm whether current peer has a quorum which accepts its -// leadership. -pub async fn region_resolved_ts_store( - regions: Vec, - store_meta: Arc>, - region_read_progress: RegionReadProgressRegistry, +pub struct LeadershipResolver { + tikv_clients: Mutex>, pd_client: Arc, - security_mgr: Arc, env: Arc, - tikv_clients: Arc>>, - min_ts: TimeStamp, -) -> Vec { - PENDING_RTS_COUNT.inc(); - defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| regions.clone()); - - let store_id = match store_meta.lock().unwrap().store_id { - Some(id) => id, - None => return vec![], - }; + security_mgr: Arc, + region_read_progress: RegionReadProgressRegistry, + store_id: u64, // store_id -> leaders info, record the request to each stores - let mut store_map: HashMap> = HashMap::default(); + store_map: HashMap>, // region_id -> region, cache the information of regions - let mut region_map: HashMap> = HashMap::default(); + region_map: HashMap>, // region_id -> peers id, record the responses - let mut resp_map: HashMap> = HashMap::default(); - // region_id -> `(Vec, LeaderInfo)` - let info_map = region_read_progress.dump_leader_infos(®ions); - let mut valid_regions = HashSet::default(); - - for (region_id, (peer_list, leader_info)) in info_map { - let leader_id = leader_info.get_peer_id(); - // Check if the leader in this store - if util::find_store_id(&peer_list, leader_id) != Some(store_id) { - continue; + resp_map: HashMap>, + valid_regions: HashSet, + + gc_interval: Duration, + last_gc_time: Instant, +} + +impl LeadershipResolver { + pub fn new( + store_id: u64, + pd_client: Arc, + env: Arc, + 
security_mgr: Arc, + region_read_progress: RegionReadProgressRegistry, + gc_interval: Duration, + ) -> LeadershipResolver { + LeadershipResolver { + tikv_clients: Mutex::default(), + store_id, + pd_client, + env, + security_mgr, + region_read_progress, + + store_map: HashMap::default(), + region_map: HashMap::default(), + resp_map: HashMap::default(), + valid_regions: HashSet::default(), + last_gc_time: Instant::now_coarse(), + gc_interval, } - let mut unvotes = 0; - for peer in &peer_list { - if peer.store_id == store_id && peer.id == leader_id { - resp_map.entry(region_id).or_default().push(store_id); - } else { - // It's still necessary to check leader on learners even if they don't vote - // because performing stale read on learners require it. - store_map - .entry(peer.store_id) - .or_default() - .push(leader_info.clone()); - if peer.get_role() != PeerRole::Learner { - unvotes += 1; - } - } + } + + fn gc(&mut self) { + let now = Instant::now_coarse(); + if now - self.last_gc_time > self.gc_interval { + self.store_map = HashMap::default(); + self.region_map = HashMap::default(); + self.resp_map = HashMap::default(); + self.valid_regions = HashSet::default(); + self.last_gc_time = now; } - // Check `region_has_quorum` here because `store_map` can be empty, - // in which case `region_has_quorum` won't be called any more. 
- if unvotes == 0 && region_has_quorum(&peer_list, &resp_map[®ion_id]) { - valid_regions.insert(region_id); - } else { - region_map.insert(region_id, peer_list); + } + + fn clear(&mut self) { + self.store_map.clear(); + self.region_map.clear(); + self.resp_map.clear(); + self.valid_regions.clear(); + } + + pub async fn resolve_by_raft( + &self, + regions: Vec, + min_ts: TimeStamp, + raft_router: T, + ) -> Vec + where + T: 'static + RaftStoreRouter, + E: KvEngine, + { + let mut reqs = Vec::with_capacity(regions.len()); + for region_id in regions { + let raft_router_clone = raft_router.clone(); + let req = async move { + let (tx, rx) = tokio::sync::oneshot::channel(); + let msg = SignificantMsg::LeaderCallback(Callback::read(Box::new(move |resp| { + let resp = if resp.response.get_header().has_error() { + None + } else { + Some(region_id) + }; + if tx.send(resp).is_err() { + error!("cdc send tso response failed"; "region_id" => region_id); + } + }))); + if let Err(e) = raft_router_clone.significant_send(region_id, msg) { + warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); + return None; + } + rx.await.unwrap_or(None) + }; + reqs.push(req); } + + let resps = futures::future::join_all(reqs).await; + resps.into_iter().flatten().collect::>() } - // Approximate `LeaderInfo` size - let leader_info_size = store_map - .values() - .next() - .map_or(0, |regions| regions[0].compute_size()); - let store_count = store_map.len(); - let mut stores: Vec<_> = store_map - .into_iter() - .map(|(to_store, regions)| { - let tikv_clients = tikv_clients.clone(); - let env = env.clone(); - let pd_client = pd_client.clone(); - let security_mgr = security_mgr.clone(); - let region_num = regions.len() as u32; - CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); - CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); - - // Check leadership for `regions` on `to_store`. 
- async move { - PENDING_CHECK_LEADER_REQ_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = - get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients.clone()) - .await - .map_err(|e| { - (to_store, e.retryable(), format!("[get tikv client] {}", e)) - })?; - - let mut req = CheckLeaderRequest::default(); - req.set_regions(regions.into()); - req.set_ts(min_ts.into_inner()); - let start = Instant::now_coarse(); - defer!({ - let elapsed = start.saturating_elapsed(); - slow_log!( - elapsed, - "check leader rpc costs too long, to_store: {}", - to_store - ); - RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC - .with_label_values(&["rpc"]) - .observe(elapsed.as_secs_f64()); - }); - - let rpc = match client.check_leader_async(&req) { - Ok(rpc) => rpc, - Err(GrpcError::RpcFailure(status)) - if status.code() == RpcStatusCode::UNIMPLEMENTED => - { - // Some stores like TiFlash don't implement it. - return Ok((to_store, vec![])); + + // Confirms leadership of region peer before trying to advance resolved ts. + // This function broadcasts a special message to all stores, gets the leader id + // of them to confirm whether current peer has a quorum which accepts its + // leadership. + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + // Clear previous result before resolving. + self.clear(); + // GC when necessary to prevent memory leak. 
+ self.gc(); + + PENDING_RTS_COUNT.inc(); + defer!(PENDING_RTS_COUNT.dec()); + fail_point!("before_sync_replica_read_state", |_| regions.clone()); + + let store_id = self.store_id; + let valid_regions = &mut self.valid_regions; + let region_map = &mut self.region_map; + let resp_map = &mut self.resp_map; + let store_map = &mut self.store_map; + self.region_read_progress.with(|registry| { + for (region_id, read_progress) in registry { + let core = read_progress.get_core(); + let local_leader_info = core.get_local_leader_info(); + let leader_id = local_leader_info.get_leader_id(); + let peer_list = local_leader_info.get_peers(); + // Check if the leader in this store + if util::find_store_id(peer_list, leader_id) != Some(store_id) { + continue; + } + let leader_info = core.get_leader_info(); + + let mut unvotes = 0; + for peer in peer_list { + if peer.store_id == store_id && peer.id == leader_id { + resp_map.entry(*region_id).or_default().push(store_id); + } else { + // It's still necessary to check leader on learners even if they don't vote + // because performing stale read on learners require it. + store_map + .entry(peer.store_id) + .or_default() + .push(leader_info.clone()); + if peer.get_role() != PeerRole::Learner { + unvotes += 1; + } } - Err(e) => return Err((to_store, true, format!("[rpc create failed]{}", e))), - }; - - PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); - let regions = tokio::time::timeout(timeout, rpc) - .map_err(|e| (to_store, true, format!("[timeout] {}", e))) - .await? - .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))? - .take_regions(); - Ok((to_store, regions)) + } + // Check `region_has_quorum` here because `store_map` can be empty, + // in which case `region_has_quorum` won't be called any more. 
+ if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { + valid_regions.insert(*region_id); + } else { + region_map.insert(*region_id, peer_list.to_vec()); + } } - .boxed() - }) - .collect(); - let start = Instant::now_coarse(); + }); - defer!({ - RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC - .with_label_values(&["all"]) - .observe(start.saturating_elapsed_secs()); - }); - for _ in 0..store_count { - // Use `select_all` to avoid the process getting blocked when some TiKVs were - // down. - let (res, _, remains) = select_all(stores).await; - stores = remains; - match res { - Ok((to_store, regions)) => regions.into_iter().for_each(|region_id| { - if let Some(r) = region_map.get(®ion_id) { - let resps = resp_map.entry(region_id).or_default(); - resps.push(to_store); - if region_has_quorum(r, resps) { - valid_regions.insert(region_id); + let env = &self.env; + let pd_client = &self.pd_client; + let security_mgr = &self.security_mgr; + let tikv_clients = &self.tikv_clients; + // Approximate `LeaderInfo` size + let leader_info_size = store_map + .values() + .next() + .map_or(0, |regions| regions[0].compute_size()); + let store_count = store_map.len(); + let mut stores: Vec<_> = store_map + .drain() + .map(|(to_store, regions)| { + let env = env.clone(); + let region_num = regions.len() as u32; + CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); + CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); + + // Check leadership for `regions` on `to_store`. 
+ async move { + PENDING_CHECK_LEADER_REQ_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); + let client = + get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) + .await + .map_err(|e| { + (to_store, e.retryable(), format!("[get tikv client] {}", e)) + })?; + + let mut req = CheckLeaderRequest::default(); + req.set_regions(regions.into()); + req.set_ts(min_ts.into_inner()); + let slow_timer = SlowTimer::default(); + defer!({ + slow_log!( + T + slow_timer, + "check leader rpc costs too long, to_store: {}", + to_store + ); + let elapsed = slow_timer.saturating_elapsed(); + RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC + .with_label_values(&["rpc"]) + .observe(elapsed.as_secs_f64()); + }); + + let rpc = client + .check_leader_async(&req) + .map_err(|e| (to_store, true, format!("[rpc create failed]{}", e)))?; + PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); + let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let resp = tokio::time::timeout(timeout, rpc) + .map_err(|e| (to_store, true, format!("[timeout] {}", e))) + .await? + .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; + Ok((to_store, resp)) + } + .boxed() + }) + .collect(); + let start = Instant::now_coarse(); + + defer!({ + RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC + .with_label_values(&["all"]) + .observe(start.saturating_elapsed_secs()); + }); + for _ in 0..store_count { + // Use `select_all` to avoid the process getting blocked when some + // TiKVs were down. 
+ let (res, _, remains) = select_all(stores).await; + stores = remains; + match res { + Ok((to_store, resp)) => { + for region_id in resp.regions { + if let Some(r) = region_map.get(®ion_id) { + let resps = resp_map.entry(region_id).or_default(); + resps.push(to_store); + if region_has_quorum(r, resps) { + valid_regions.insert(region_id); + } + } } } - }), - Err((to_store, reconnect, err)) => { - info!("check leader failed"; "error" => ?err, "to_store" => to_store); - if reconnect { - tikv_clients.lock().await.remove(&to_store); + Err((to_store, reconnect, err)) => { + info!("check leader failed"; "error" => ?err, "to_store" => to_store); + if reconnect { + self.tikv_clients.lock().await.remove(&to_store); + } } } + // Return early if all regions had already got quorum. + if valid_regions.len() == regions.len() { + // break here because all regions have quorum, + // so there is no need waiting for other stores to respond. + break; + } } - // Return early if all regions had already got quorum. - if valid_regions.len() == regions.len() { - // break here because all regions have quorum, - // so there is no need waiting for other stores to respond. 
- break; - } + self.valid_regions.drain().collect() } - valid_regions.into_iter().collect() } fn region_has_quorum(peers: &[Peer], stores: &[u64]) -> bool { @@ -374,10 +449,10 @@ static CONN_ID: AtomicI32 = AtomicI32::new(0); async fn get_tikv_client( store_id: u64, - pd_client: Arc, - security_mgr: Arc, + pd_client: &Arc, + security_mgr: &SecurityManager, env: Arc, - tikv_clients: Arc>>, + tikv_clients: &Mutex>, ) -> pd_client::Result { { let clients = tikv_clients.lock().await; diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index a79ff66e384..480c0ee6896 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -28,11 +28,15 @@ use raftstore::{ }; use security::SecurityManager; use tikv::config::ResolvedTsConfig; -use tikv_util::worker::{Runnable, RunnableWithTimer, Scheduler}; +use tikv_util::{ + warn, + worker::{Runnable, RunnableWithTimer, Scheduler}, +}; +use tokio::sync::Notify; use txn_types::{Key, TimeStamp}; use crate::{ - advance::AdvanceTsWorker, + advance::{AdvanceTsWorker, LeadershipResolver}, cmd::{ChangeLog, ChangeRow}, metrics::*, resolver::Resolver, @@ -263,7 +267,7 @@ impl ObserveRegion { pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, - cfg_version: usize, + cfg_update_notify: Arc, store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, @@ -295,20 +299,22 @@ where let meta = store_meta.lock().unwrap(); (meta.region_read_progress.clone(), meta.store_id) }; - let advance_worker = AdvanceTsWorker::new( - pd_client, - scheduler.clone(), - store_meta.clone(), - region_read_progress.clone(), - concurrency_manager, + let advance_worker = + AdvanceTsWorker::new(pd_client.clone(), scheduler.clone(), concurrency_manager); + let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, raft_router); + let store_resolver_gc_interval = Duration::from_secs(60); + let leader_resolver = LeadershipResolver::new( + store_id.unwrap(), 
+ pd_client.clone(), env, security_mgr, + region_read_progress.clone(), + store_resolver_gc_interval, ); - let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, raft_router); let ep = Self { store_id, cfg: cfg.clone(), - cfg_version: 0, + cfg_update_notify: Arc::new(Notify::new()), scheduler, store_meta, region_read_progress, @@ -318,7 +324,7 @@ where regions: HashMap::default(), _phantom: PhantomData::default(), }; - ep.register_advance_event(ep.cfg_version); + ep.handle_advance_resolved_ts(leader_resolver); ep } @@ -490,9 +496,9 @@ where } } - // Try to advance resolved ts. + // Update advanced resolved ts. // Must ensure all regions are leaders at the point of ts. - fn advance_resolved_ts(&mut self, regions: Vec, ts: TimeStamp) { + fn handle_resolved_ts_advanced(&mut self, regions: Vec, ts: TimeStamp) { if regions.is_empty() { return; } @@ -576,36 +582,28 @@ where } } - fn register_advance_event(&self, cfg_version: usize) { - // Ignore advance event that registered with previous `advance_ts_interval` - // config - if self.cfg_version != cfg_version { - return; - } + fn handle_advance_resolved_ts(&self, leader_resolver: LeadershipResolver) { let regions = self.regions.keys().into_iter().copied().collect(); - self.advance_worker.advance_ts_for_regions(regions); - self.advance_worker - .register_next_event(self.cfg.advance_ts_interval.0, self.cfg_version); + self.advance_worker.advance_ts_for_regions( + regions, + leader_resolver, + self.cfg.advance_ts_interval.0, + self.cfg_update_notify.clone(), + ); } fn handle_change_config(&mut self, change: ConfigChange) { let prev = format!("{:?}", self.cfg); - let prev_advance_ts_interval = self.cfg.advance_ts_interval; if let Err(e) = self.cfg.update(change) { - error!("update resolved-ts config unexpectly failed"; "err" => ?e); - return; - } - if self.cfg.advance_ts_interval != prev_advance_ts_interval { - // Increase the `cfg_version` to reject advance event that registered before - self.cfg_version += 1; - // 
Advance `resolved-ts` immediately after `advance_ts_interval` changed - self.register_advance_event(self.cfg_version); + warn!("resolved-ts config fails"; "error" => ?e); + } else { + self.cfg_update_notify.notify_waiters(); + info!( + "resolved-ts config changed"; + "prev" => prev, + "current" => ?self.cfg, + ); } - info!( - "resolved-ts config changed"; - "prev" => prev, - "current" => ?self.cfg, - ); } fn get_or_init_store_id(&mut self) -> Option { @@ -631,10 +629,10 @@ pub enum Task { observe_id: ObserveId, cause: String, }, - RegisterAdvanceEvent { - cfg_version: usize, - }, AdvanceResolvedTs { + leader_resolver: LeadershipResolver, + }, + ResolvedTsAdvanced { regions: Vec, ts: TimeStamp, }, @@ -683,7 +681,7 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("cause", &cause) .finish(), - Task::AdvanceResolvedTs { + Task::ResolvedTsAdvanced { ref regions, ref ts, } => de @@ -703,9 +701,7 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("apply_index", &apply_index) .finish(), - Task::RegisterAdvanceEvent { .. } => { - de.field("name", &"register_advance_event").finish() - } + Task::AdvanceResolvedTs { .. 
} => de.field("name", &"advance_resolved_ts").finish(), Task::ChangeConfig { ref change } => de .field("name", &"change_config") .field("change", &change) @@ -740,7 +736,12 @@ where observe_id, cause, } => self.re_register_region(region_id, observe_id, cause), - Task::AdvanceResolvedTs { regions, ts } => self.advance_resolved_ts(regions, ts), + Task::AdvanceResolvedTs { leader_resolver } => { + self.handle_advance_resolved_ts(leader_resolver) + } + Task::ResolvedTsAdvanced { regions, ts } => { + self.handle_resolved_ts_advanced(regions, ts) + } Task::ChangeLog { cmd_batch, snapshot, @@ -751,7 +752,6 @@ where entries, apply_index, } => self.handle_scan_locks(region_id, observe_id, entries, apply_index), - Task::RegisterAdvanceEvent { cfg_version } => self.register_advance_event(cfg_version), Task::ChangeConfig { change } => self.handle_change_config(change), } } diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 4c0bbce3fd1..5ae1b1a13a6 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -350,6 +350,8 @@ impl ServerCluster { let mut rts_worker = LazyWorker::new("resolved-ts"); let rts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); rts_ob.register_to(&mut coprocessor_host); + // resolved ts endpoint needs store id. + store_meta.lock().unwrap().store_id = Some(node_id); // Resolved ts endpoint let rts_endpoint = resolved_ts::Endpoint::new( &cfg.resolved_ts, From 469eab2cfd0ffe1f6ddb6ab261322488f9880bf8 Mon Sep 17 00:00:00 2001 From: cosven Date: Tue, 25 Oct 2022 22:17:56 +0800 Subject: [PATCH 286/676] raftstore: warm up entry cache before leadership transfer (#13556) ref tikv/tikv#13060 Warm up the entry cache before becoming leader to avoid QPS spike. 
Signed-off-by: cosven Signed-off-by: cosven Co-authored-by: Xinye Tao --- components/backup-stream/tests/mod.rs | 2 +- components/raftstore/src/store/config.rs | 10 + .../raftstore/src/store/entry_storage.rs | 276 +++++++++++++++++- components/raftstore/src/store/fsm/peer.rs | 55 +++- components/raftstore/src/store/metrics.rs | 19 ++ components/raftstore/src/store/peer.rs | 105 ++++++- components/test_raftstore/src/cluster.rs | 17 ++ .../test_raftstore/src/common-test.toml | 1 + .../failpoints/cases/test_transfer_leader.rs | 260 ++++++++++++++++- tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + .../raftstore/test_transfer_leader.rs | 7 +- 12 files changed, 721 insertions(+), 33 deletions(-) diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 284f1605c30..2cc6016aeb1 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -722,7 +722,7 @@ impl Suite { let leader = self.cluster.leader_of_region(region_id); for peer in region.get_peers() { if leader.as_ref().map(|p| p.id != peer.id).unwrap_or(true) { - self.cluster.transfer_leader(region_id, peer.clone()); + self.cluster.must_transfer_leader(region_id, peer.clone()); self.cluster.reset_leader_of_region(region_id); return; } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index a5e84aa8501..4d9cd73d207 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -299,6 +299,10 @@ pub struct Config { #[doc(hidden)] pub long_uncommitted_base_threshold: ReadableDuration, + /// Max duration for the entry cache to be warmed up. + /// Set it to 0 to disable warmup. + pub max_entry_cache_warmup_duration: ReadableDuration, + #[doc(hidden)] pub max_snapshot_file_raw_size: ReadableSize, @@ -401,6 +405,7 @@ impl Default for Config { /// the log commit duration is less than 1s. 
Feel free to adjust /// this config :) long_uncommitted_base_threshold: ReadableDuration::secs(20), + max_entry_cache_warmup_duration: ReadableDuration::secs(1), // They are preserved for compatibility check. region_max_size: ReadableSize(0), @@ -452,6 +457,11 @@ impl Config { self.raft_log_gc_size_limit.unwrap() } + #[inline] + pub fn warmup_entry_cache_enabled(&self) -> bool { + self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) + } + pub fn region_split_check_diff(&self) -> ReadableSize { self.region_split_check_diff.unwrap() } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index e5c617ec91b..a0828d12332 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -11,6 +11,7 @@ use std::{ mem, ops::Range, sync::{Arc, Mutex}, + time::Duration, }; use collections::HashMap; @@ -21,9 +22,9 @@ use kvproto::{ raft_serverpb::{RaftApplyState, RaftLocalState}, }; use protobuf::Message; -use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError}; +use raft::{prelude::*, util::limit_size, GetEntriesContext, StorageError, INVALID_INDEX}; use tikv_alloc::TraceEvent; -use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; +use tikv_util::{box_err, debug, error, info, time::Instant, warn, worker::Scheduler}; use super::{ metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, @@ -35,6 +36,7 @@ const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; const ENTRY_MEM_SIZE: usize = mem::size_of::(); +pub const MAX_WARMED_UP_CACHE_KEEP_TIME: Duration = Duration::from_secs(10); pub const MAX_INIT_ENTRY_COUNT: usize = 1024; #[inline] @@ -147,6 +149,25 @@ impl EntryCache { } } + /// Push entries to the left of the cache. + /// + /// When cache is not empty, the index of the last entry in entries + /// should be equal to `cache first index - 1`. 
When cache is + /// empty, it should be equal to the store's last index. Otherwise, + /// append new entries may fail due to unexpected hole. + fn prepend(&mut self, entries: Vec) { + let mut mem_size_change = 0; + let old_capacity = self.cache.capacity(); + for e in entries.into_iter().rev() { + mem_size_change += (bytes_capacity(&e.data) + bytes_capacity(&e.context)) as i64; + self.cache.push_front(e); + } + let new_capacity = self.cache.capacity(); + mem_size_change += Self::cache_vec_mem_size_change(new_capacity, old_capacity); + mem_size_change += self.shrink_if_necessary(); + self.flush_mem_size_change(mem_size_change); + } + fn append_impl(&mut self, region_id: u64, peer_id: u64, entries: &[Entry]) -> i64 { let mut mem_size_change = 0; @@ -530,6 +551,76 @@ pub fn init_applied_term( } } +/// When a peer(follower) receives a TransferLeaderMsg, it enters the +/// CacheWarmupState. When the peer becomes leader or it doesn't +/// become leader before a deadline, it exits the state. +#[derive(Clone, Debug)] +pub struct CacheWarmupState { + range: (u64, u64), + is_task_timeout: bool, + is_stale: bool, + started_at: Instant, +} + +impl CacheWarmupState { + pub fn new() -> Self { + CacheWarmupState::new_with_range(INVALID_INDEX, INVALID_INDEX) + } + + pub fn new_with_range(low: u64, high: u64) -> Self { + CacheWarmupState { + range: (low, high), + is_task_timeout: false, + is_stale: false, + started_at: Instant::now(), + } + } + + pub fn range(&self) -> (u64, u64) { + self.range + } + + /// How long has it been in this state. + pub fn elapsed(&self) -> Duration { + self.started_at.saturating_elapsed() + } + + /// Whether the warmup task is already timeout. + pub fn is_task_timeout(&self) -> bool { + self.is_task_timeout + } + + /// Check whether the task is timeout. 
+ pub fn check_task_timeout(&mut self, duration: Duration) -> bool { + if self.is_task_timeout { + return true; + } + if self.elapsed() > duration { + WARM_UP_ENTRY_CACHE_COUNTER.timeout.inc(); + self.is_task_timeout = true; + } + self.is_task_timeout + } + + /// Check whether this state is stale. + pub fn check_stale(&mut self, duration: Duration) -> bool { + fail_point!("entry_cache_warmed_up_state_is_stale", |_| true); + if self.is_stale { + return true; + } + if self.elapsed() > duration { + self.is_stale = true; + } + self.is_stale + } +} + +impl Default for CacheWarmupState { + fn default() -> Self { + Self::new() + } +} + /// A subset of `PeerStorage` that focus on accessing log entries. pub struct EntryStorage { region_id: u64, @@ -543,6 +634,7 @@ pub struct EntryStorage { raftlog_fetch_scheduler: Scheduler, raftlog_fetch_stats: AsyncFetchStats, async_fetch_results: RefCell>, + cache_warmup_state: Option, } impl EntryStorage { @@ -576,6 +668,7 @@ impl EntryStorage { raftlog_fetch_scheduler, raftlog_fetch_stats: AsyncFetchStats::default(), async_fetch_results: RefCell::new(HashMap::default()), + cache_warmup_state: None, }) } @@ -980,8 +1073,129 @@ impl EntryStorage { self.last_term = last_term; } + pub fn entry_cache_warmup_state(&self) -> &Option { + &self.cache_warmup_state + } + + pub fn entry_cache_warmup_state_mut(&mut self) -> &mut Option { + &mut self.cache_warmup_state + } + + pub fn clear_entry_cache_warmup_state(&mut self) { + self.cache_warmup_state = None; + } + + /// Trigger a task to warm up the entry cache. + /// + /// This will ensure the range [low..=last_index] are loaded into + /// cache. Return the high index of the warmup range if a task is + /// successfully triggered. + pub fn async_warm_up_entry_cache(&mut self, low: u64) -> Option { + let high = if let Some(first_index) = self.entry_cache_first_index() { + if low >= first_index { + // Already warmed up. 
+ self.cache_warmup_state = Some(CacheWarmupState::new()); + return None; + } + // Partially warmed up. + first_index + } else { + self.last_index() + 1 + }; + + // Fetch entries [low, high) to trigger an async fetch task in background. + self.cache_warmup_state = Some(CacheWarmupState::new_with_range(low, high)); + match self.entries(low, high, u64::MAX, GetEntriesContext::empty(true)) { + Ok(_) => { + // This should not happen, but it's OK :) + debug_assert!(false, "entries should not have been fetched"); + error!("entries are fetched unexpectedly during warming up"); + None + } + Err(raft::Error::Store(raft::StorageError::LogTemporarilyUnavailable)) => { + WARM_UP_ENTRY_CACHE_COUNTER.started.inc(); + Some(high) + } + Err(e) => { + error!( + "fetching entries met unexpected error during warming up"; + "err" => ?e, + ); + None + } + } + } + + /// Warm up entry cache if the result is valid. + /// + /// Return true when the warmup operation succeed within the timeout. + pub fn maybe_warm_up_entry_cache(&mut self, res: RaftlogFetchResult) -> bool { + let low = res.low; + // Warm up the entry cache if the low and high index are + // exactly the same as the warmup range. + let state = self.entry_cache_warmup_state().as_ref().unwrap(); + let range = state.range(); + let is_task_timeout = state.is_task_timeout(); + + if range.0 != low { + return false; + } + + match res.ents { + Ok(mut entries) => { + let last_entry_index = entries.last().map(|e| e.index); + if let Some(index) = last_entry_index { + // Generally speaking, when the res.low is the same as the warmup + // range start, the fetch result is exactly used for warmup. + // As the low index of each async_fetch task is different. + // There should exist only one exception. A async fetch task + // with same low index is triggered before the warmup task. 
+ if index + 1 >= range.1 { + let is_valid = if let Some(first_index) = self.entry_cache_first_index() { + range.1 == first_index + } else { + range.1 == self.last_index() + 1 + }; + assert!(is_valid, "the warmup range should still be valid"); + entries.truncate((range.1 - range.0) as usize); + self.cache.prepend(entries); + WARM_UP_ENTRY_CACHE_COUNTER.finished.inc(); + fail_point!("on_entry_cache_warmed_up"); + return !is_task_timeout; + } + } + warn!( + "warm up the entry cache failed"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "last_entry_index" => last_entry_index.unwrap_or(0), + "expected_high" => range.1, + ); + } + Err(e) => { + warn!( + "warm up the entry cache failed"; + "region_id" => self.region_id, + "peer_id" => self.peer_id, + "err" => ?e, + ); + } + } + false + } + pub fn compact_entry_cache(&mut self, idx: u64) { - self.cache.compact_to(idx); + let mut can_compact = true; + if let Some(state) = self.entry_cache_warmup_state_mut() { + if state.check_stale(MAX_WARMED_UP_CACHE_KEEP_TIME) { + self.clear_entry_cache_warmup_state(); + } else { + can_compact = false; + } + } + if can_compact { + self.cache.compact_to(idx); + } } #[inline] @@ -1091,6 +1305,12 @@ pub mod tests { ); assert_eq!(rx.try_recv().unwrap(), 3); + cache.prepend(vec![new_padded_entry(100, 1, 1)]); + assert_eq!(rx.try_recv().unwrap(), 1); + cache.persisted = 100; + cache.compact_to(101); + assert_eq!(rx.try_recv().unwrap(), -1); + // Test size change for one overlapped entry. cache.append(0, 0, &[new_padded_entry(102, 2, 3)]); assert_eq!(rx.try_recv().unwrap(), 1); @@ -1522,6 +1742,7 @@ pub mod tests { entries = vec![new_entry(6, 6), new_entry(7, 6)]; append_ents(&mut store, &entries); validate_cache(&store, &entries); + store.cache.prepend(vec![new_entry(6, 5)]); // rewrite old entry entries = vec![new_entry(5, 6), new_entry(6, 6)]; @@ -1564,4 +1785,53 @@ pub mod tests { // invalid compaction should be ignored. 
store.compact_entry_cache(6); } + + #[test] + fn test_async_warm_up_entry_cache() { + let ents = vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + store.cache.compact_to(6); + assert_eq!(store.entry_cache_first_index().unwrap(), 6); + + // The return value should be None when it is already warmed up. + assert!(store.async_warm_up_entry_cache(6).is_none()); + + // The high index should be equal to the entry_cache_first_index. + assert_eq!(store.async_warm_up_entry_cache(5).unwrap(), 6); + + store.cache.compact_to(7); // Clean cache. + // The high index should be equal to the last_index + 1. + assert_eq!(store.async_warm_up_entry_cache(5).unwrap(), 7); + } + + #[test] + fn test_warmup_entry_cache() { + let ents = vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]; + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let region_worker = Worker::new("snap-manager").lazy_build("snap-manager"); + let region_scheduler = region_worker.scheduler(); + let (dummy_scheduler, _rx) = dummy_scheduler(); + let mut store = new_storage_from_ents(region_scheduler, dummy_scheduler, &td, &ents); + store.cache.compact_to(6); + store.cache_warmup_state = Some(CacheWarmupState::new_with_range(5, 6)); + + let res = RaftlogFetchResult { + ents: Ok(ents[1..3].to_vec()), + low: 5, + max_size: u64::MAX, + hit_size_limit: false, + tried_cnt: MAX_ASYNC_FETCH_TRY_CNT, + term: 1, + }; + store.maybe_warm_up_entry_cache(res); + // Cache should be warmed up. 
+ assert_eq!(store.entry_cache_first_index().unwrap(), 5); + } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 57f5fe158f5..63761321405 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -70,6 +70,7 @@ use crate::{ coprocessor::{RegionChangeEvent, RegionChangeReason}, store::{ cmd_resp::{bind_term, new_error}, + entry_storage::MAX_WARMED_UP_CACHE_KEEP_TIME, fsm::{ apply, store::{PollContext, StoreMeta}, @@ -1801,8 +1802,17 @@ where fn on_raft_log_fetched(&mut self, context: GetEntriesContext, res: Box) { let low = res.low; - // if the peer is not the leader anymore or being destroyed, ignore the result. - if !self.fsm.peer.is_leader() || self.fsm.peer.pending_remove { + // If the peer is not the leader anymore and it's not in entry cache warmup + // state, or it is being destroyed, ignore the result. + if !self.fsm.peer.is_leader() + && self + .fsm + .peer + .get_store() + .entry_cache_warmup_state() + .is_none() + || self.fsm.peer.pending_remove + { self.fsm.peer.mut_store().clean_async_fetch_res(low); return; } @@ -1810,6 +1820,19 @@ where if self.fsm.peer.term() != res.term { // term has changed, the result may be not correct. 
self.fsm.peer.mut_store().clean_async_fetch_res(low); + } else if self + .fsm + .peer + .get_store() + .entry_cache_warmup_state() + .is_some() + { + if self.fsm.peer.mut_store().maybe_warm_up_entry_cache(*res) { + self.fsm.peer.ack_transfer_leader_msg(false); + self.fsm.has_ready = true; + } + self.fsm.peer.mut_store().clean_async_fetch_res(low); + return; } else { self.fsm .peer @@ -3216,10 +3239,13 @@ where } } } - } else { - self.fsm - .peer - .execute_transfer_leader(self.ctx, msg.get_from(), peer_disk_usage, false); + } else if !self + .fsm + .peer + .maybe_reject_transfer_leader_msg(self.ctx, msg, peer_disk_usage) + && self.fsm.peer.pre_ack_transfer_leader_msg(self.ctx, msg) + { + self.fsm.peer.ack_transfer_leader_msg(false); } } @@ -3757,6 +3783,14 @@ where } fn on_ready_compact_log(&mut self, first_index: u64, state: RaftTruncatedState) { + // Since this peer may be warming up the entry cache, log compaction should be + // temporarily skipped. Otherwise, the warmup task may fail. + if let Some(state) = self.fsm.peer.mut_store().entry_cache_warmup_state_mut() { + if !state.check_stale(MAX_WARMED_UP_CACHE_KEEP_TIME) { + return; + } + } + let total_cnt = self.fsm.peer.last_applying_idx - first_index; // the size of current CompactLog command can be ignored. let remain_cnt = self.fsm.peer.last_applying_idx - state.get_index() - 1; @@ -6157,14 +6191,7 @@ where if term != self.fsm.peer.term() { return; } - // As the leader can propose the TransferLeader request successfully, the disk - // of the leader is probably not full. 
- self.fsm.peer.execute_transfer_leader( - self.ctx, - self.fsm.peer.leader_id(), - DiskUsage::Normal, - true, - ); + self.fsm.peer.ack_transfer_leader_msg(true); self.fsm.has_ready = true; } diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index af877e14b46..14d8d7e97cc 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -78,6 +78,11 @@ make_auto_flush_static_metric! { fetch_unused, } + pub label_enum WarmUpEntryCacheType { + started, + timeout, + finished, + } pub label_enum RaftEventDurationType { compact_check, @@ -103,6 +108,11 @@ make_auto_flush_static_metric! { pub struct RaftEntryFetches : LocalIntCounter { "type" => RaftEntryType } + + pub struct WarmUpEntryCacheCounter : LocalIntCounter { + "type" => WarmUpEntryCacheType + } + pub struct SnapCf : LocalHistogram { "type" => CfNames, } @@ -614,6 +624,15 @@ lazy_static! { exponential_buckets(0.0005, 2.0, 21).unwrap() // 500us ~ 8.7m ).unwrap(); + pub static ref WARM_UP_ENTRY_CACHE_COUNTER_VEC: IntCounterVec = + register_int_counter_vec!( + "tikv_raftstore_prefill_entry_cache_total", + "Total number of prefill entry cache.", + &["type"] + ).unwrap(); + pub static ref WARM_UP_ENTRY_CACHE_COUNTER: WarmUpEntryCacheCounter = + auto_flush_from!(WARM_UP_ENTRY_CACHE_COUNTER_VEC, WarmUpEntryCacheCounter); + pub static ref LEADER_MISSING: IntGauge = register_int_gauge!( "tikv_raftstore_leader_missing", diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index f67c3a28800..b06eb5c0c3f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1024,7 +1024,7 @@ where /// region buckets. pub region_buckets: Option, pub last_region_buckets: Option, - /// lead_transferee if the peer is in a leadership transferring. + /// lead_transferee if this peer(leader) is in a leadership transferring. 
pub lead_transferee: u64, pub unsafe_recovery_state: Option, // Used as the memory state for Flashback to reject RW/Schedule before proposing. @@ -2249,6 +2249,8 @@ where self.require_updating_max_ts(&ctx.pd_scheduler); // Init the in-memory pessimistic lock table when the peer becomes leader. self.activate_in_memory_pessimistic_locks(); + // Exit entry cache warmup state when the peer becomes leader. + self.mut_store().clear_entry_cache_warmup_state(); if !ctx.store_disk_usages.is_empty() { self.refill_disk_full_peers(ctx); @@ -3721,10 +3723,12 @@ where // Broadcast heartbeat to make sure followers commit the entries immediately. // It's only necessary to ping the target peer, but ping all for simplicity. self.raft_group.ping(); + let mut msg = eraftpb::Message::new(); msg.set_to(peer.get_id()); msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); msg.set_from(self.peer_id()); + msg.set_index(self.get_store().entry_cache_first_index().unwrap_or(0)); // log term here represents the term of last log. For leader, the term of last // log is always its current term. Not just set term because raft library // forbids setting it for MsgTransferLeader messages. @@ -4384,33 +4388,95 @@ where Ok(Either::Left(propose_index)) } - pub fn execute_transfer_leader( + pub fn maybe_reject_transfer_leader_msg( &mut self, ctx: &mut PollContext, - from: u64, + msg: &eraftpb::Message, peer_disk_usage: DiskUsage, - reply_cmd: bool, // whether it is a reply to a TransferLeader command - ) { + ) -> bool { let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); if pending_snapshot - || from != self.leader_id() + || msg.get_from() != self.leader_id() // Transfer leader to node with disk full will lead to write availablity downback. // But if the current leader is disk full, and send such request, we should allow it, // because it may be a read leader balance request. 
|| (!matches!(ctx.self_disk_usage, DiskUsage::Normal) && - matches!(peer_disk_usage,DiskUsage::Normal)) + matches!(peer_disk_usage, DiskUsage::Normal)) { info!( "reject transferring leader"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), - "from" => from, + "from" => msg.get_from(), "pending_snapshot" => pending_snapshot, "disk_usage" => ?ctx.self_disk_usage, ); - return; + return true; + } + false + } + + /// Before ack the transfer leader message sent by the leader. + /// Currently, it only warms up the entry cache in this stage. + /// + /// This return whether the msg should be acked. When cache is warmed up + /// or the warmup operation is timeout, it is true. + pub fn pre_ack_transfer_leader_msg( + &mut self, + ctx: &mut PollContext, + msg: &eraftpb::Message, + ) -> bool { + if !ctx.cfg.warmup_entry_cache_enabled() { + return true; } + // The start index of warmup range. It is leader's entry_cache_first_index, + // which in general is equal to the lowest matched index. + let mut low = msg.get_index(); + let last_index = self.get_store().last_index(); + let mut should_ack_now = false; + + // Need not to warm up when the index is 0. + // There are two cases where index can be 0: + // 1. During rolling upgrade, old instances may not support warmup. + // 2. The leader's entry cache is empty. + if low == 0 || low > last_index { + // There is little possibility that the warmup_range_start + // is larger than the last index. Check the test case + // `test_when_warmup_range_start_is_larger_than_last_index` + // for details. + should_ack_now = true; + } else { + if low < self.last_compacted_idx { + low = self.last_compacted_idx + }; + // Check if the entry cache is already warmed up. 
+ if let Some(first_index) = self.get_store().entry_cache_first_index() { + if low >= first_index { + fail_point!("entry_cache_already_warmed_up"); + should_ack_now = true; + } + } + } + + if should_ack_now { + return true; + } + + // Check if the warmup operation is timeout if warmup is already started. + if let Some(state) = self.mut_store().entry_cache_warmup_state_mut() { + // If it is timeout, this peer should ack the message so that + // the leadership transfer process can continue. + state.check_task_timeout(ctx.cfg.max_entry_cache_warmup_duration.0) + } else { + self.mut_store().async_warm_up_entry_cache(low).is_none() + } + } + + pub fn ack_transfer_leader_msg( + &mut self, + reply_cmd: bool, // whether it is a reply to a TransferLeader command + ) { let mut msg = eraftpb::Message::new(); msg.set_from(self.peer_id()); msg.set_to(self.leader_id()); @@ -4431,10 +4497,23 @@ where /// /// 1. pre_transfer_leader on leader: /// Leader will send a MsgTransferLeader to follower. - /// 2. execute_transfer_leader on follower - /// If follower passes all necessary checks, it will reply an - /// ACK with type MsgTransferLeader and its promised persistent index. - /// 3. ready_to_transfer_leader on leader: + /// 2. pre_ack_transfer_leader_msg on follower: + /// If follower passes all necessary checks, it will try to warmup + /// the entry cache. + /// 3. ack_transfer_leader_msg on follower: + /// When the entry cache has been warmed up or the operator is timeout, + /// the follower reply an ACK with type MsgTransferLeader and + /// its promised persistent index. + /// + /// Additional steps when there are remaining pessimistic + /// locks to propose (detected in function on_transfer_leader_msg). + /// 1. Leader firstly proposes pessimistic locks and then proposes a + /// TransferLeader command. + /// 2. 
ack_transfer_leader_msg on follower again: + /// The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// + /// 4. ready_to_transfer_leader on leader: /// Leader checks if it's appropriate to transfer leadership. If it /// does, it calls raft transfer_leader API to do the remaining work. /// diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 7a932d324f0..c097b22222d 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1139,6 +1139,23 @@ impl Cluster { } } + pub fn wait_applied_index(&mut self, region_id: u64, store_id: u64, index: u64) { + let timer = Instant::now(); + loop { + let applied_index = self.apply_state(region_id, store_id).applied_index; + if applied_index >= index { + return; + } + if timer.saturating_elapsed() >= Duration::from_secs(5) { + panic!( + "[region {}] log is still not applied to {}: {} on store {}", + region_id, index, applied_index, store_id, + ); + } + thread::sleep(Duration::from_millis(10)); + } + } + pub fn wait_tombstone(&self, region_id: u64, peer: metapb::Peer, check_exist: bool) { let timer = Instant::now(); let mut state; diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index 6b179081def..50e62f67d28 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -65,6 +65,7 @@ raft-store-max-leader-lease = "240ms" allow-remove-leader = true merge-check-tick-interval = "100ms" pd-heartbeat-tick-interval = "20ms" +max-entry-cache-warmup-duration = "0ms" dev-assert = true hibernate-regions = true store-io-pool-size = 0 diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index 556549b8141..cc6b043f0e5 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ 
b/tests/failpoints/cases/test_transfer_leader.rs @@ -6,14 +6,19 @@ use std::{ time::Duration, }; +use crossbeam::channel; use engine_traits::CF_LOCK; use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use pd_client::PdClient; +use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::storage::Snapshot; -use tikv_util::HandyRwLock; +use tikv_util::{ + config::{ReadableDuration, ReadableSize}, + HandyRwLock, +}; use txn_types::{Key, PessimisticLock}; /// When a follower applies log slowly, leader should not transfer leader @@ -332,3 +337,256 @@ fn test_read_lock_after_become_follower() { // PessimisticLockNotFound. assert!(resp.get_region_error().has_stale_command()); } + +/// This function does the following things +/// +/// 0. Transfer the region's(id=1) leader to store 1. +/// 1. Inserted 5 entries and make all stores commit and apply them. +/// 2. Prevent the store 3 from append following logs. +/// 3. Insert another 20 entries. +/// 4. Wait for some time so that part of the entry cache are compacted +/// on the leader(store 1). +fn run_cluster_for_test_warmup_entry_cache(cluster: &mut Cluster) { + // Let the leader compact the entry cache. + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + cluster.run(); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + + for i in 1..5u32 { + let k = i.to_string().into_bytes(); + let v = k.clone(); + cluster.must_put(&k, &v); + must_get_equal(&cluster.get_engine(3), &k, &v); + } + + // Let store 3 fall behind. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 3).direction(Direction::Recv), + )); + + for i in 1..20u32 { + let k = i.to_string().into_bytes(); + let v = k.clone(); + cluster.must_put(&k, &v); + must_get_equal(&cluster.get_engine(2), &k, &v); + } + + // Wait until part of the leader's entry cache is compacted. 
+ sleep_ms(cluster.cfg.raft_store.raft_log_gc_tick_interval.as_millis() * 2); +} + +fn prevent_from_gc_raft_log(cluster: &mut Cluster) { + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100000); + cluster.cfg.raft_store.raft_log_gc_threshold = 1000; + cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + cluster.cfg.raft_store.raft_log_reserve_max_ticks = 20; +} + +fn run_cluster_and_warm_up_cache_for_store2() -> Cluster { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.max_entry_cache_warmup_duration = ReadableDuration::secs(1000); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + + let (sx, rx) = channel::unbounded(); + let recv_filter = Box::new( + RegionPacketFilter::new(1, 1) + .direction(Direction::Recv) + .msg_type(MessageType::MsgTransferLeader) + .set_msg_callback(Arc::new(move |m| { + sx.send(m.get_message().get_from()).unwrap(); + })), + ); + cluster.sim.wl().add_recv_filter(1, recv_filter); + + let (sx2, rx2) = channel::unbounded(); + fail::cfg_callback("on_entry_cache_warmed_up", move || sx2.send(true).unwrap()).unwrap(); + cluster.transfer_leader(1, new_peer(2, 2)); + + // Cache should be warmed up. + assert!(rx2.recv_timeout(Duration::from_millis(500)).unwrap()); + // It should ack the message just after cache is warmed up. + assert_eq!(rx.recv_timeout(Duration::from_millis(500)).unwrap(), 2); + cluster.sim.wl().clear_recv_filters(1); + cluster +} + +/// Leader should carry a correct index in TransferLeaderMsg so that +/// the follower can warm up the entry cache with this index. 
+#[test] +fn test_transfer_leader_msg_index() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_entry_cache_life_time = ReadableDuration::secs(1000); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + + let (sx, rx) = channel::unbounded(); + let recv_filter = Box::new( + RegionPacketFilter::new(1, 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgTransferLeader) + .set_msg_callback(Arc::new(move |m| { + sx.send(m.get_message().get_index()).unwrap(); + })), + ); + cluster.sim.wl().add_recv_filter(2, recv_filter); + + // TransferLeaderMsg.index should be equal to the store3's replicated_index. + cluster.transfer_leader(1, new_peer(2, 2)); + let replicated_index = cluster.raft_local_state(1, 3).last_index; + assert_eq!( + rx.recv_timeout(Duration::from_secs(2)).unwrap(), + replicated_index, + ); +} + +/// The store should ack the transfer leader msg immediately +/// when the warmup range start is larger than it's last index. +#[test] +fn test_when_warmup_range_start_is_larger_than_last_index() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_entry_cache_life_time = ReadableDuration::secs(1000); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + cluster.pd_client.disable_default_operator(); + + let s4 = cluster.add_new_engine(); + + // Prevent peer 4 from appending logs, so it's last index should + // be really small. 
+ let recv_filter_s4 = Box::new( + RegionPacketFilter::new(1, s4) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppend), + ); + cluster.sim.wl().add_recv_filter(s4, recv_filter_s4); + + let (sx, rx) = channel::unbounded(); + let recv_filter_1 = Box::new( + RegionPacketFilter::new(1, 1) + .direction(Direction::Recv) + .msg_type(MessageType::MsgTransferLeader) + .set_msg_callback(Arc::new(move |m| { + sx.send(m.get_message().get_from()).unwrap(); + })), + ); + cluster.sim.wl().add_recv_filter(1, recv_filter_1); + + cluster.pd_client.must_add_peer(1, new_peer(s4, s4)); + cluster.transfer_leader(1, new_peer(s4, s4)); + // Store(s4) should ack the transfer leader msg immediately. + assert_eq!(rx.recv_timeout(Duration::from_millis(500)).unwrap(), s4); +} + +/// When the start index of warmup range is compacted, the follower should +/// still warm up and use the compacted_idx as the start index. +#[test] +fn test_when_warmup_range_start_is_compacted() { + let mut cluster = new_node_cluster(0, 3); + // GC raft log aggressively. + cluster.cfg.raft_store.merge_max_log_gap = 1; + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(5); + cluster.cfg.raft_store.max_entry_cache_warmup_duration = ReadableDuration::secs(1000); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + cluster.pd_client.disable_default_operator(); + + // Case `test_transfer_leader_msg_index` already proves that + // the warmup_range_start is equal to the replicated_index. + let warmup_range_start = cluster.raft_local_state(1, 3).last_index; + cluster.wait_log_truncated(1, 2, warmup_range_start + 10); + let s2_truncated_index = cluster.truncated_state(1, 2).get_index(); + let s2_last_index = cluster.raft_local_state(1, 2).last_index; + assert!(warmup_range_start < s2_truncated_index); + assert!(s2_truncated_index + 5 <= s2_last_index); + + // Cache should be warmed up successfully. 
+ let (sx, rx) = channel::unbounded(); + fail::cfg_callback("on_entry_cache_warmed_up", move || sx.send(true).unwrap()).unwrap(); + cluster.transfer_leader(1, new_peer(2, 2)); + rx.recv_timeout(Duration::from_millis(500)).unwrap(); +} + +/// Transfer leader should work as normal when disable warming up entry cache. +#[test] +fn test_turnoff_warmup_entry_cache() { + let mut cluster = new_node_cluster(0, 3); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + cluster.cfg.raft_store.max_entry_cache_warmup_duration = ReadableDuration::secs(0); + fail::cfg("worker_async_fetch_raft_log", "pause").unwrap(); + cluster.must_transfer_leader(1, new_peer(2, 2)); +} + +/// When the follower has not warmed up the entry cache and the timeout of +/// warmup is very long, then the leadership transfer can never succeed. +#[test] +fn test_when_warmup_fail_and_its_timeout_is_too_long() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.max_entry_cache_warmup_duration = ReadableDuration::secs(1000); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + + fail::cfg("worker_async_fetch_raft_log", "pause").unwrap(); + cluster.transfer_leader(1, new_peer(2, 2)); + // Theoretically, the leader transfer can't succeed unless it sleeps + // max_entry_cache_warmup_duration. + sleep_ms(50); + let leader = cluster.leader_of_region(1).unwrap(); + assert_eq!(leader.get_id(), 1); +} + +/// When the follower has not warmed up the entry cache and the timeout of +/// warmup is pretty short, then the leadership transfer should succeed quickly. 
+#[test] +fn test_when_warmup_fail_and_its_timeout_is_short() { + let mut cluster = new_node_cluster(0, 3); + cluster.cfg.raft_store.max_entry_cache_warmup_duration = ReadableDuration::millis(10); + prevent_from_gc_raft_log(&mut cluster); + run_cluster_for_test_warmup_entry_cache(&mut cluster); + + fail::cfg("worker_async_fetch_raft_log", "pause").unwrap(); + cluster.must_transfer_leader(1, new_peer(2, 2)); +} + +/// The follower should ack the msg when the cache is warmed up. +/// Besides, the cache should be kept for a period of time. +#[test] +fn test_when_warmup_succeed_and_become_leader() { + let mut cluster = run_cluster_and_warm_up_cache_for_store2(); + + // Generally, the cache will be compacted during post_apply. + // However, if the cache is warmed up recently, the cache should be kept. + let applied_index = cluster.apply_state(1, 2).applied_index; + cluster.must_put(b"kk1", b"vv1"); + cluster.wait_applied_index(1, 2, applied_index + 1); + + // It should ack the message when cache is already warmed up. + // It needs not to fetch raft log anymore. + fail::cfg("worker_async_fetch_raft_log", "pause").unwrap(); + cluster.sim.wl().clear_recv_filters(1); + cluster.must_transfer_leader(1, new_peer(2, 2)); +} + +/// The follower should exit warmup state if it does not become leader +/// in a period of time. +#[test] +fn test_when_warmup_succeed_and_not_become_leader() { + let mut cluster = run_cluster_and_warm_up_cache_for_store2(); + + let (sx, rx) = channel::unbounded(); + fail::cfg_callback("worker_async_fetch_raft_log", move || { + sx.send(true).unwrap() + }) + .unwrap(); + fail::cfg("entry_cache_warmed_up_state_is_stale", "return").unwrap(); + + // Since the warmup state is stale, the peer should exit warmup state, + // and the entry cache should be compacted during post_apply. 
+ let applied_index = cluster.apply_state(1, 2).applied_index; + cluster.must_put(b"kk1", b"vv1"); + cluster.wait_applied_index(1, 2, applied_index + 1); + // The peer should warm up cache again when it receives a new TransferLeaderMsg. + cluster.transfer_leader(1, new_peer(2, 2)); + assert!(rx.recv_timeout(Duration::from_millis(500)).unwrap()); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index e2d5ef06b6e..90524079bfa 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -248,6 +248,7 @@ fn test_serde_custom_tikv_config() { report_region_buckets_tick_interval: ReadableDuration::secs(1234), check_long_uncommitted_interval: ReadableDuration::secs(1), long_uncommitted_base_threshold: ReadableDuration::secs(1), + max_entry_cache_warmup_duration: ReadableDuration::secs(2), max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), check_peers_availability_interval: ReadableDuration::secs(30), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 097ebd612cd..17f82f9eb87 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -222,6 +222,7 @@ report-min-resolved-ts-interval = "233ms" report-region-buckets-tick-interval = "1234s" max-snapshot-file-raw-size = "10GB" unreachable-backoff = "111s" +max-entry-cache-warmup-duration = "2s" [coprocessor] split-region-on-table = false diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index 9f2e564341f..b0fade84d8b 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -19,7 +19,12 @@ fn test_basic_transfer_leader(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_heartbeat_ticks = 20; let reserved_time = Duration::from_millis( 
cluster.cfg.raft_store.raft_base_tick_interval.as_millis() - * cluster.cfg.raft_store.raft_heartbeat_ticks as u64, + * cluster.cfg.raft_store.raft_heartbeat_ticks as u64 + + cluster + .cfg + .raft_store + .max_entry_cache_warmup_duration + .as_millis(), ); cluster.run(); From c74c8ca907632263a0cfcd84b8413e9edc5c19e5 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 26 Oct 2022 10:13:56 +0800 Subject: [PATCH 287/676] raftstore: introduce an observer to control write apply state (#13609) close tikv/tikv#12849 introduce an observer to control write apply state Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- .../raftstore/src/coprocessor/dispatcher.rs | 39 +++++++++++++++++++ components/raftstore/src/coprocessor/mod.rs | 7 ++++ components/raftstore/src/store/fsm/apply.rs | 16 ++++++-- tests/integrations/raftstore/test_snap.rs | 1 + 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index df7794c3701..99228aef44c 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -655,6 +655,20 @@ impl CoprocessorHost { true } + /// Should be called everytime before we want to write apply state when + /// applying. Return a bool which indicates whether we can actually do + /// this write. 
+ pub fn pre_write_apply_state(&self, region: &Region) -> bool { + let mut ctx = ObserverContext::new(region); + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if !observer.pre_write_apply_state(&mut ctx) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -763,6 +777,8 @@ mod tests { PostApplySnapshot = 21, ShouldPreApplySnapshot = 22, OnUpdateSafeTs = 23, + PrePersist = 24, + PreWriteApplyState = 25, } impl Coprocessor for TestCoprocessor {} @@ -911,6 +927,25 @@ mod tests { .fetch_add(ObserverIndex::OnRegionChanged as usize, Ordering::SeqCst); ctx.bypass = self.bypass.load(Ordering::SeqCst); } + + fn pre_persist( + &self, + ctx: &mut ObserverContext<'_>, + _: bool, + _: Option<&RaftCmdRequest>, + ) -> bool { + self.called + .fetch_add(ObserverIndex::PrePersist as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + true + } + + fn pre_write_apply_state(&self, ctx: &mut ObserverContext<'_>) -> bool { + self.called + .fetch_add(ObserverIndex::PreWriteApplyState as usize, Ordering::SeqCst); + ctx.bypass = self.bypass.load(Ordering::SeqCst); + true + } } impl ApplySnapshotObserver for TestCoprocessor { @@ -1132,6 +1167,10 @@ mod tests { host.on_update_safe_ts(1, 1, 1); index += ObserverIndex::OnUpdateSafeTs as usize; assert_all!([&ob.called], &[index]); + + host.pre_write_apply_state(®ion); + index += ObserverIndex::PreWriteApplyState as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 35330701a95..7ac783c0d6d 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -321,6 +321,13 @@ pub trait RegionChangeObserver: Coprocessor { ) -> bool { true } + + /// Should be called everytime before we want to write apply state when + /// applying. 
Return a bool which indicates whether we can actually do + /// this write. + fn pre_write_apply_state(&self, _: &mut ObserverContext<'_>) -> bool { + true + } } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index dae732797b1..a9124dc2faf 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -500,7 +500,7 @@ where /// `finish_for`. pub fn commit(&mut self, delegate: &mut ApplyDelegate) { if delegate.last_flush_applied_index < delegate.apply_state.get_applied_index() { - delegate.write_apply_state(self.kv_wb_mut()); + delegate.maybe_write_apply_state(self); } self.commit_opt(delegate, true); } @@ -621,7 +621,7 @@ where ) { if self.host.pre_persist(&delegate.region, true, None) { if !delegate.pending_remove { - delegate.write_apply_state(self.kv_wb_mut()); + delegate.maybe_write_apply_state(self); } self.commit_opt(delegate, false); } else { @@ -1101,6 +1101,13 @@ where }); } + fn maybe_write_apply_state(&self, apply_ctx: &mut ApplyContext) { + let can_write = apply_ctx.host.pre_write_apply_state(&self.region); + if can_write { + self.write_apply_state(apply_ctx.kv_wb_mut()); + } + } + fn handle_raft_entry_normal( &mut self, apply_ctx: &mut ApplyContext, @@ -1285,6 +1292,9 @@ where .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); if should_write { + // An observer shall prevent a write_apply_state here by not return true + // when `post_exec`. 
+ self.write_apply_state(apply_ctx.kv_wb_mut()); apply_ctx.commit(self); } exec_result @@ -3741,7 +3751,7 @@ where if apply_ctx.timer.is_none() { apply_ctx.timer = Some(Instant::now_coarse()); } - self.delegate.write_apply_state(apply_ctx.kv_wb_mut()); + self.delegate.maybe_write_apply_state(apply_ctx); fail_point!( "apply_on_handle_snapshot_1_1", self.delegate.id == 1 && self.delegate.region_id() == 1, diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 2bc05726bfc..8d3212ad4a6 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -514,6 +514,7 @@ fn test_inspected_snapshot() { assert_ne!(stats.fetch(IoType::Replication, IoOp::Write), 0); pd_client.must_remove_peer(1, new_peer(2, 2)); + must_get_none(&cluster.get_engine(2), b"k2"); assert_eq!(stats.fetch(IoType::LoadBalance, IoOp::Read), 0); assert_eq!(stats.fetch(IoType::LoadBalance, IoOp::Write), 0); pd_client.must_add_peer(1, new_peer(2, 2)); From a4dc37b0c07ee1b5bc7c60a5d8360666500ecdb4 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Wed, 26 Oct 2022 11:33:57 +0800 Subject: [PATCH 288/676] storage, lock_manager: Use the new lock waiting queue instead of WaiterManager to handle pessimistic lock waking up (#13447) ref tikv/tikv#13298 Updates the write path of acquiring lock and releasing lock to make use of the new `LockWaitQueue`. Some important points are: 1. `WriteResultLockInfo` (returned by `AcquirePessimisticLock::process_write`) carries parameters, which can be used for resuming the request in the future. 2. `WriteResultLockInfo` will be converted into `LockWaitContext` and `LockWaitEntry`, and then send to both `LockManager` and the new `LockWaitQueues`. 3. 
When a storage command releases some locks, will return the released locks to `Scheduler::process_write`, which will then call `on_release_locks` to pop lock waiting entries from the queues and wake up them asynchronously (to avoid increasing too much latency of the current command). 4. The `LockManager` (and its inner module `WaiterManager`) no longer has the responsibility for waking up waiters, but keeps its functionality of handling timeout and performing deadlock detection. Instead, it has a new `remove_lock_wait` method to remove a waiter from it. 5. Waiters in `WaiterManager` can now be uniquely identified by a `LockWaitToken`, and the data structure in `WaiterManager` is therefore changed. Accessing by lock hash and transaction ts is still necessary to handle the result of deadlock detection. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot Co-authored-by: Yilin Chen --- Cargo.lock | 8 +- components/resolved_ts/src/cmd.rs | 5 +- components/test_coprocessor/src/fixture.rs | 4 +- components/test_coprocessor/src/store.rs | 6 +- components/test_storage/src/sync_storage.rs | 10 +- metrics/grafana/tikv_details.json | 9 +- src/config.rs | 6 +- src/coprocessor_v2/raw_storage_impl.rs | 6 +- src/import/duplicate_detect.rs | 8 +- src/server/gc_worker/gc_worker.rs | 16 +- src/server/lock_manager/config.rs | 31 +- src/server/lock_manager/deadlock.rs | 129 +- src/server/lock_manager/metrics.rs | 8 +- src/server/lock_manager/mod.rs | 269 +++-- src/server/lock_manager/waiter_manager.rs | 1044 +++++++---------- src/server/server.rs | 4 +- src/storage/lock_manager/lock_wait_context.rs | 16 +- .../lock_manager/lock_waiting_queue.rs | 24 +- src/storage/lock_manager/mod.rs | 145 ++- src/storage/mod.rs | 561 ++++----- src/storage/mvcc/txn.rs | 24 +- src/storage/txn/actions/check_txn_status.rs | 6 +- src/storage/txn/actions/commit.rs | 2 +- .../txn/commands/acquire_pessimistic_lock.rs | 49 +- src/storage/txn/commands/atomic_store.rs | 9 +- 
.../txn/commands/check_secondary_locks.rs | 13 +- src/storage/txn/commands/check_txn_status.rs | 14 +- src/storage/txn/commands/cleanup.rs | 4 +- src/storage/txn/commands/commit.rs | 4 +- src/storage/txn/commands/compare_and_swap.rs | 11 +- .../txn/commands/flashback_to_version.rs | 5 +- src/storage/txn/commands/mod.rs | 78 +- src/storage/txn/commands/pause.rs | 5 +- .../txn/commands/pessimistic_rollback.rs | 10 +- src/storage/txn/commands/prewrite.rs | 55 +- src/storage/txn/commands/resolve_lock.rs | 13 +- src/storage/txn/commands/resolve_lock_lite.rs | 4 +- src/storage/txn/commands/rollback.rs | 4 +- src/storage/txn/commands/txn_heart_beat.rs | 11 +- src/storage/txn/scheduler.rs | 239 +++- tests/failpoints/cases/test_storage.rs | 43 +- tests/failpoints/cases/test_transaction.rs | 14 +- tests/failpoints/cases/test_ttl.rs | 4 +- .../config/dynamic/pessimistic_txn.rs | 82 +- .../resource_metering/test_suite/mod.rs | 15 +- 45 files changed, 1552 insertions(+), 1475 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 14951b8e253..a6b25808098 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1305,9 +1305,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" +checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" dependencies = [ "cfg-if 1.0.0", "num_cpus", @@ -5297,9 +5297,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" +checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" [[package]] name = "smartstring" diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index 89d7167cc26..a1468e15bab 100644 --- a/components/resolved_ts/src/cmd.rs 
+++ b/components/resolved_ts/src/cmd.rs @@ -300,10 +300,9 @@ mod tests { }; use tikv::storage::{ kv::{MockEngineBuilder, TestEngineBuilder}, - lock_manager::DummyLockManager, mvcc::{tests::write, Mutation, MvccTxn, SnapshotReader}, txn::{ - commands::one_pc_commit_ts, prewrite, tests::*, CommitKind, TransactionKind, + commands::one_pc_commit, prewrite, tests::*, CommitKind, TransactionKind, TransactionProperties, }, Engine, @@ -426,7 +425,7 @@ mod tests { SkipPessimisticCheck, ) .unwrap(); - one_pc_commit_ts(true, &mut txn, 10.into(), &DummyLockManager); + one_pc_commit(true, &mut txn, 10.into()); write(&engine, &Default::default(), txn.into_modifies()); let one_pc_row = engine .take_last_modifies() diff --git a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index 23fc877a996..a53ba4500bc 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -12,7 +12,7 @@ use tikv::{ read_pool::ReadPool, server::Config, storage::{ - kv::RocksEngine, lock_manager::DummyLockManager, Engine, TestEngineBuilder, + kv::RocksEngine, lock_manager::MockLockManager, Engine, TestEngineBuilder, TestStorageBuilderApiV1, }, }; @@ -79,7 +79,7 @@ pub fn init_data_with_details( commit: bool, cfg: &Config, ) -> (Store, Endpoint, Arc) { - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); let mut store = Store::from_storage(storage); diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index f19b0a113bd..278e210bc98 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -13,7 +13,7 @@ use tikv::{ server::gc_worker::GcConfig, storage::{ kv::{Engine, RocksEngine}, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, txn::FixtureStore, SnapshotStore, 
StorageApiV1, TestStorageBuilderApiV1, }, @@ -116,7 +116,7 @@ pub struct Store { impl Store { pub fn new() -> Self { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); Self::from_storage(storage) @@ -130,7 +130,7 @@ impl Default for Store { } impl Store { - pub fn from_storage(storage: StorageApiV1) -> Self { + pub fn from_storage(storage: StorageApiV1) -> Self { Self { store: SyncTestStorageApiV1::from_storage(0, storage, GcConfig::default()).unwrap(), current_ts: 1.into(), diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index faa84944eca..fa53688ea75 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -19,7 +19,7 @@ use raftstore::{ use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ - config::Config, kv::RocksEngine, lock_manager::DummyLockManager, test_util::GetConsumer, + config::Config, kv::RocksEngine, lock_manager::MockLockManager, test_util::GetConsumer, txn::commands, Engine, KvGetStatistics, PrewriteResult, Result, Storage, TestEngineBuilder, TestStorageBuilder, TxnStatus, }, @@ -87,7 +87,7 @@ impl SyncTestStorageBuilder { pub fn build(mut self, store_id: u64) -> Result> { let mut builder = TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr( self.engine.clone(), - DummyLockManager, + MockLockManager::new(), ); if let Some(config) = self.config.take() { builder = builder.config(config); @@ -107,7 +107,7 @@ impl SyncTestStorageBuilder { #[derive(Clone)] pub struct SyncTestStorage { gc_worker: GcWorker, - store: Storage, + store: Storage, } /// SyncTestStorage for Api V1 @@ -117,7 +117,7 @@ pub type SyncTestStorageApiV1 = SyncTestStorage; impl SyncTestStorage { pub fn from_storage( store_id: u64, - storage: Storage, + storage: Storage, config: GcConfig, ) -> Result { let (tx, _rx) = 
std::sync::mpsc::channel(); @@ -145,7 +145,7 @@ impl SyncTestStorage { .unwrap(); } - pub fn get_storage(&self) -> Storage { + pub fn get_storage(&self) -> Storage { self.store.clone() } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 9d64207c214..45a657cc4bb 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -35349,13 +35349,20 @@ "legendFormat": "{{type}}", "refId": "A", "step": 4 + }, + { + "expr": "sum(max_over_time(tikv_lock_wait_queue_entries_gauge_vec{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[15s])) by (type)", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Wait table", + "title": "Lock Waiting Queue", "tooltip": { "msResolution": false, "shared": true, diff --git a/src/config.rs b/src/config.rs index 9dcf17d17d5..c978b1bf90a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -4104,7 +4104,7 @@ mod tests { server::{config::ServerConfigManager, ttl::TtlCheckerTask}, storage::{ config_manager::StorageConfigManger, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, txn::flow_controller::{EngineFlowController, FlowController}, Storage, TestStorageBuilder, }, @@ -4494,7 +4494,7 @@ mod tests { fn new_engines( cfg: TikvConfig, ) -> ( - Storage, + Storage, ConfigController, ReceiverWrapper, Arc, @@ -4513,7 +4513,7 @@ mod tests { ) .unwrap(); let storage = - TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr(engine, DummyLockManager) + TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr(engine, MockLockManager::new()) .config(cfg.storage.clone()) .build() .unwrap(); diff --git a/src/coprocessor_v2/raw_storage_impl.rs b/src/coprocessor_v2/raw_storage_impl.rs index fc505c50312..9a57b4c8624 100644 --- a/src/coprocessor_v2/raw_storage_impl.rs +++ b/src/coprocessor_v2/raw_storage_impl.rs @@ -215,11 
+215,11 @@ mod test { use kvproto::kvrpcpb::{ApiVersion, Context}; use super::*; - use crate::storage::{lock_manager::DummyLockManager, TestStorageBuilder}; + use crate::storage::{lock_manager::MockLockManager, TestStorageBuilder}; #[tokio::test] async fn test_storage_api() { - let storage = TestStorageBuilder::<_, _, ApiV2>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, ApiV2>::new(MockLockManager::new()) .build() .unwrap(); let ctx = Context { @@ -255,7 +255,7 @@ mod test { #[tokio::test] async fn test_storage_api_batch() { - let storage = TestStorageBuilder::<_, _, ApiV2>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, ApiV2>::new(MockLockManager::new()) .build() .unwrap(); let ctx = Context { diff --git a/src/import/duplicate_detect.rs b/src/import/duplicate_detect.rs index dbd819efbbf..b1eaecab881 100644 --- a/src/import/duplicate_detect.rs +++ b/src/import/duplicate_detect.rs @@ -239,7 +239,7 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::{DummyLockManager, LockManager}, + lock_manager::{LockManager, MockLockManager}, txn::commands, Storage, TestStorageBuilderApiV1, }; @@ -350,7 +350,7 @@ mod tests { #[test] fn test_duplicate_detect() { - let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let mut data = vec![]; @@ -408,7 +408,7 @@ mod tests { // (108,18) is not repeated with (108,10). 
#[test] fn test_duplicate_detect_incremental() { - let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); for &start in &[100, 104, 108, 112] { @@ -469,7 +469,7 @@ mod tests { #[test] fn test_duplicate_detect_rollback_and_delete() { - let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let data = vec![ diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 5b638a01f48..8e345f0909b 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -1660,7 +1660,7 @@ mod tests { server::gc_worker::{MockSafePointProvider, PrefixedEngine}, storage::{ kv::{metrics::GcKeyMode, Modify, TestEngineBuilder, WriteData}, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::{ tests::{must_get_none, must_get_none_on_region, must_get_on_region}, MAX_TXN_WRITE_SIZE, @@ -1738,7 +1738,7 @@ mod tests { /// Assert the data in `storage` is the same as `expected_data`. Keys in /// `expected_data` should be encoded form without ts. 
fn check_data( - storage: &Storage, + storage: &Storage, expected_data: &BTreeMap, Vec>, ) { let scan_res = block_on(storage.scan( @@ -1773,10 +1773,12 @@ mod tests { let store_id = 1; let engine = TestEngineBuilder::new().build().unwrap(); - let storage = - TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) - .build() - .unwrap(); + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( + engine.clone(), + MockLockManager::new(), + ) + .build() + .unwrap(); let gate = FeatureGate::default(); gate.set_version("5.0.0").unwrap(); @@ -1960,7 +1962,7 @@ mod tests { let prefixed_engine = PrefixedEngine(engine); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( prefixed_engine.clone(), - DummyLockManager, + MockLockManager::new(), ) .build() .unwrap(); diff --git a/src/server/lock_manager/config.rs b/src/server/lock_manager/config.rs index aba08f3d2e7..6464c3cb1cd 100644 --- a/src/server/lock_manager/config.rs +++ b/src/server/lock_manager/config.rs @@ -3,7 +3,7 @@ use std::{ error::Error, sync::{ - atomic::{AtomicBool, Ordering}, + atomic::{AtomicBool, AtomicU64, Ordering}, Arc, }, }; @@ -80,6 +80,7 @@ pub struct LockManagerConfigManager { pub detector_scheduler: DeadlockScheduler, pub pipelined: Arc, pub in_memory: Arc, + pub wake_up_delay_duration_ms: Arc, } impl LockManagerConfigManager { @@ -88,29 +89,35 @@ impl LockManagerConfigManager { detector_scheduler: DeadlockScheduler, pipelined: Arc, in_memory: Arc, + wake_up_delay_duration_ms: Arc, ) -> Self { LockManagerConfigManager { waiter_mgr_scheduler, detector_scheduler, pipelined, in_memory, + wake_up_delay_duration_ms, } } } impl ConfigManager for LockManagerConfigManager { fn dispatch(&mut self, mut change: ConfigChange) -> Result<(), Box> { - match ( - change.remove("wait_for_lock_timeout").map(Into::into), - change.remove("wake_up_delay_duration").map(Into::into), - ) { - (timeout @ Some(_), delay) => { - self.waiter_mgr_scheduler.change_config(timeout, 
delay); - self.detector_scheduler.change_ttl(timeout.unwrap().into()); - } - (None, delay @ Some(_)) => self.waiter_mgr_scheduler.change_config(None, delay), - (None, None) => {} - }; + if let Some(p) = change.remove("wait_for_lock_timeout").map(Into::into) { + self.waiter_mgr_scheduler.change_config(Some(p)); + self.detector_scheduler.change_ttl(p.into()); + } + if let Some(p) = change + .remove("wake_up_delay_duration") + .map(ReadableDuration::from) + { + info!( + "Waiter manager config changed"; + "wake_up_delay_duration" => %p, + ); + self.wake_up_delay_duration_ms + .store(p.as_millis(), Ordering::Relaxed); + } if let Some(p) = change.remove("pipelined").map(Into::into) { self.pipelined.store(p, Ordering::Relaxed); } diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 4fee40138c1..9583df80dd6 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -46,7 +46,7 @@ use super::{ }; use crate::{ server::resolve::StoreAddrResolver, - storage::lock_manager::{DiagnosticContext, Lock}, + storage::lock_manager::{DiagnosticContext, KeyLockWaitInfo, LockDigest}, }; /// `Locks` is a set of locks belonging to one transaction. @@ -308,11 +308,11 @@ impl DetectTable { } /// Removes the corresponding wait_for_entry. - fn clean_up_wait_for(&mut self, txn_ts: TimeStamp, lock_ts: TimeStamp, lock_hash: u64) { + fn clean_up_wait_for(&mut self, txn_ts: TimeStamp, lock_digest: LockDigest) { if let Some(wait_for) = self.wait_for_map.get_mut(&txn_ts) { - if let Some(locks) = wait_for.get_mut(&lock_ts) { - if locks.remove(lock_hash) { - wait_for.remove(&lock_ts); + if let Some(locks) = wait_for.get_mut(&lock_digest.ts) { + if locks.remove(lock_digest.hash) { + wait_for.remove(&lock_digest.ts); if wait_for.is_empty() { self.wait_for_map.remove(&txn_ts); } @@ -396,7 +396,7 @@ pub enum Task { Detect { tp: DetectType, txn_ts: TimeStamp, - lock: Lock, + wait_info: Option, // Only valid when `tp == Detect`. 
diag_ctx: DiagnosticContext, }, @@ -424,11 +424,14 @@ impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { Task::Detect { - tp, txn_ts, lock, .. + tp, + txn_ts, + wait_info, + .. } => write!( f, - "Detect {{ tp: {:?}, txn_ts: {}, lock: {:?} }}", - tp, txn_ts, lock + "Detect {{ tp: {:?}, txn_ts: {}, wait_info: {:?} }}", + tp, txn_ts, wait_info ), Task::DetectRpc { .. } => write!(f, "Detect Rpc"), Task::ChangeRole(role) => write!(f, "ChangeRole {{ role: {:?} }}", role), @@ -459,20 +462,26 @@ impl Scheduler { } } - pub fn detect(&self, txn_ts: TimeStamp, lock: Lock, diag_ctx: DiagnosticContext) { + pub fn detect( + &self, + txn_ts: TimeStamp, + wait_info: KeyLockWaitInfo, + diag_ctx: DiagnosticContext, + ) { + // TODO: Support detect many keys in a batch self.notify_scheduler(Task::Detect { tp: DetectType::Detect, txn_ts, - lock, + wait_info: Some(wait_info), diag_ctx, }); } - pub fn clean_up_wait_for(&self, txn_ts: TimeStamp, lock: Lock) { + pub fn clean_up_wait_for(&self, start_ts: TimeStamp, wait_info: KeyLockWaitInfo) { self.notify_scheduler(Task::Detect { tp: DetectType::CleanUpWaitFor, - txn_ts, - lock, + txn_ts: start_ts, + wait_info: Some(wait_info), diag_ctx: DiagnosticContext::default(), }); } @@ -481,7 +490,7 @@ impl Scheduler { self.notify_scheduler(Task::Detect { tp: DetectType::CleanUp, txn_ts, - lock: Lock::default(), + wait_info: None, diag_ctx: DiagnosticContext::default(), }); } @@ -785,13 +794,14 @@ where let (send, recv) = leader_client.register_detect_handler(Box::new(move |mut resp| { let entry = resp.take_entry(); let txn = entry.txn.into(); - let lock = Lock { + let lock = LockDigest { ts: entry.wait_for_txn.into(), hash: entry.key_hash, }; let mut wait_chain: Vec<_> = resp.take_wait_chain().into(); + let key = entry.get_key().to_vec(); wait_chain.push(entry); - waiter_mgr_scheduler.deadlock(txn, lock, resp.get_deadlock_key_hash(), wait_chain) + waiter_mgr_scheduler.deadlock(txn, key, lock, 
resp.get_deadlock_key_hash(), wait_chain) })); spawn_local(send.map_err(|e| error!("leader client failed"; "err" => ?e))); // No need to log it again. @@ -810,7 +820,7 @@ where &mut self, tp: DetectType, txn_ts: TimeStamp, - lock: Lock, + wait_info: &Option, diag_ctx: DiagnosticContext, ) -> bool { assert!(!self.is_leader() && self.leader_info.is_some()); @@ -826,8 +836,10 @@ where }; let mut entry = WaitForEntry::default(); entry.set_txn(txn_ts.into_inner()); - entry.set_wait_for_txn(lock.ts.into_inner()); - entry.set_key_hash(lock.hash); + if let Some(wait_info) = wait_info.as_ref() { + entry.set_wait_for_txn(wait_info.lock_digest.ts.into_inner()); + entry.set_key_hash(wait_info.lock_digest.hash); + } entry.set_key(diag_ctx.key); entry.set_resource_group_tag(diag_ctx.resource_group_tag); let mut req = DeadlockRequest::default(); @@ -846,32 +858,38 @@ where &self, tp: DetectType, txn_ts: TimeStamp, - lock: Lock, + wait_info: Option, diag_ctx: DiagnosticContext, ) { let detect_table = &mut self.inner.borrow_mut().detect_table; match tp { DetectType::Detect => { + let wait_info = wait_info.unwrap(); if let Some((deadlock_key_hash, mut wait_chain)) = detect_table.detect( txn_ts, - lock.ts, - lock.hash, + wait_info.lock_digest.ts, + wait_info.lock_digest.hash, &diag_ctx.key, &diag_ctx.resource_group_tag, ) { let mut last_entry = WaitForEntry::default(); last_entry.set_txn(txn_ts.into_inner()); - last_entry.set_wait_for_txn(lock.ts.into_inner()); - last_entry.set_key_hash(lock.hash); - last_entry.set_key(diag_ctx.key); + last_entry.set_wait_for_txn(wait_info.lock_digest.ts.into_inner()); + last_entry.set_key_hash(wait_info.lock_digest.hash); + last_entry.set_key(diag_ctx.key.clone()); last_entry.set_resource_group_tag(diag_ctx.resource_group_tag); wait_chain.push(last_entry); - self.waiter_mgr_scheduler - .deadlock(txn_ts, lock, deadlock_key_hash, wait_chain); + self.waiter_mgr_scheduler.deadlock( + txn_ts, + diag_ctx.key.clone(), + wait_info.lock_digest, + 
deadlock_key_hash, + wait_chain, + ); } } DetectType::CleanUpWaitFor => { - detect_table.clean_up_wait_for(txn_ts, lock.ts, lock.hash) + detect_table.clean_up_wait_for(txn_ts, wait_info.unwrap().lock_digest) } DetectType::CleanUp => detect_table.clean_up(txn_ts), } @@ -882,11 +900,11 @@ where &mut self, tp: DetectType, txn_ts: TimeStamp, - lock: Lock, + wait_info: Option, diag_ctx: DiagnosticContext, ) { if self.is_leader() { - self.handle_detect_locally(tp, txn_ts, lock, diag_ctx); + self.handle_detect_locally(tp, txn_ts, wait_info, diag_ctx); } else { for _ in 0..2 { // TODO: If the leader hasn't been elected, it requests Pd for @@ -896,7 +914,7 @@ where if self.leader_client.is_none() && !self.refresh_leader_info() { break; } - if self.send_request_to_leader(tp, txn_ts, lock, diag_ctx.clone()) { + if self.send_request_to_leader(tp, txn_ts, &wait_info, diag_ctx.clone()) { return; } // Because the client is asynchronous, it won't be closed until @@ -906,7 +924,7 @@ where // If a request which causes deadlock is dropped, it leads to the waiter // timeout. TiDB will retry to acquire the lock and detect deadlock // again. - warn!("detect request dropped"; "tp" => ?tp, "txn_ts" => txn_ts, "lock" => ?lock); + warn!("detect request dropped"; "tp" => ?tp, "txn_ts" => txn_ts, "wait_info" => ?wait_info); ERROR_COUNTER_METRICS.dropped.inc(); } } @@ -917,6 +935,7 @@ where stream: RequestStream, sink: DuplexSink, ) { + // TODO: Support batch checking. 
if !self.is_leader() { let status = RpcStatus::with_message( RpcStatusCode::FAILED_PRECONDITION, @@ -963,7 +982,13 @@ where } } DeadlockRequestType::CleanUpWaitFor => { - detect_table.clean_up_wait_for(txn.into(), wait_for_txn.into(), *key_hash); + detect_table.clean_up_wait_for( + txn.into(), + LockDigest { + ts: wait_for_txn.into(), + hash: *key_hash, + }, + ); None } DeadlockRequestType::CleanUp => { @@ -1005,10 +1030,10 @@ where Task::Detect { tp, txn_ts, - lock, + wait_info, diag_ctx, } => { - self.handle_detect(tp, txn_ts, lock, diag_ctx); + self.handle_detect(tp, txn_ts, wait_info, diag_ctx); } Task::DetectRpc { stream, sink } => { self.handle_detect_rpc(stream, sink); @@ -1180,7 +1205,13 @@ pub mod tests { ); // Clean up entries shrinking the map. - detect_table.clean_up_wait_for(3.into(), 1.into(), 1); + detect_table.clean_up_wait_for( + 3.into(), + LockDigest { + ts: 1.into(), + hash: 1, + }, + ); assert_eq!( detect_table .wait_for_map @@ -1192,14 +1223,32 @@ pub mod tests { .len(), 1 ); - detect_table.clean_up_wait_for(3.into(), 1.into(), 2); + detect_table.clean_up_wait_for( + 3.into(), + LockDigest { + ts: 1.into(), + hash: 2, + }, + ); assert_eq!(detect_table.wait_for_map.get(&3.into()).unwrap().len(), 1); - detect_table.clean_up_wait_for(3.into(), 2.into(), 2); + detect_table.clean_up_wait_for( + 3.into(), + LockDigest { + ts: 2.into(), + hash: 2, + }, + ); assert_eq!(detect_table.wait_for_map.contains_key(&3.into()), false); // Clean up non-exist entry detect_table.clean_up(3.into()); - detect_table.clean_up_wait_for(3.into(), 1.into(), 1); + detect_table.clean_up_wait_for( + 3.into(), + LockDigest { + ts: 1.into(), + hash: 1, + }, + ); } #[test] diff --git a/src/server/lock_manager/metrics.rs b/src/server/lock_manager/metrics.rs index f400652966b..d6ff48bcf80 100644 --- a/src/server/lock_manager/metrics.rs +++ b/src/server/lock_manager/metrics.rs @@ -12,6 +12,7 @@ make_auto_flush_static_metric! 
{ detect, clean_up_wait_for, clean_up, + update_wait_for, }, } @@ -60,13 +61,6 @@ lazy_static! { exponential_buckets(0.0001, 2.0, 20).unwrap() // 0.1ms ~ 104s ) .unwrap(); - pub static ref WAIT_TABLE_STATUS_GAUGE: WaitTableStatusGauge = register_static_int_gauge_vec!( - WaitTableStatusGauge, - "tikv_lock_manager_wait_table_status", - "Status of the wait table", - &["type"] - ) - .unwrap(); pub static ref DETECTOR_LEADER_GAUGE: IntGauge = register_int_gauge!( "tikv_lock_manager_detector_leader_heartbeat", "Heartbeat of the leader of the deadlock detector" diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index e437cea2bf1..ae60467124b 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -7,19 +7,15 @@ mod metrics; pub mod waiter_manager; use std::{ - collections::hash_map::DefaultHasher, - hash::{Hash, Hasher}, sync::{ - atomic::{AtomicBool, AtomicUsize, Ordering}, + atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, Arc, }, thread::JoinHandle, }; -use collections::HashSet; -use crossbeam::utils::CachePadded; use engine_traits::KvEngine; -use parking_lot::Mutex; +use kvproto::metapb::RegionEpoch; use pd_client::PdClient; use raftstore::coprocessor::CoprocessorHost; use security::SecurityManager; @@ -32,26 +28,22 @@ pub use self::{ waiter_manager::Scheduler as WaiterMgrScheduler, }; use self::{ - deadlock::{Detector, RoleChangeNotifier}, - waiter_manager::WaiterManager, + deadlock::Detector, + waiter_manager::{Waiter, WaiterManager}, }; use crate::{ - server::{resolve::StoreAddrResolver, Error, Result}, + server::{ + lock_manager::deadlock::RoleChangeNotifier, resolve::StoreAddrResolver, Error, Result, + }, storage::{ - lock_manager::{DiagnosticContext, Lock, LockManager as LockManagerTrait, WaitTimeout}, - DynamicConfigs as StorageDynamicConfigs, ProcessResult, StorageCallback, + lock_manager::{ + DiagnosticContext, KeyLockWaitInfo, LockManager as LockManagerTrait, LockWaitToken, + UpdateWaitForEvent, 
WaitTimeout, + }, + DynamicConfigs as StorageDynamicConfigs, Error as StorageError, }, }; -const DETECTED_SLOTS_NUM: usize = 128; - -#[inline] -fn detected_slot_idx(txn_ts: TimeStamp) -> usize { - let mut s = DefaultHasher::new(); - txn_ts.hash(&mut s); - (s.finish() as usize) & (DETECTED_SLOTS_NUM - 1) -} - /// `LockManager` has two components working in two threads: /// * One is the `WaiterManager` which manages transactions waiting for locks. /// * The other one is the `Detector` which detects deadlocks between @@ -65,12 +57,13 @@ pub struct LockManager { waiter_count: Arc, - /// Record transactions which have sent requests to detect deadlock. - detected: Arc<[CachePadded>>]>, + token_allocator: Arc, pipelined: Arc, in_memory: Arc, + + wake_up_delay_duration_ms: Arc, } impl Clone for LockManager { @@ -81,9 +74,10 @@ impl Clone for LockManager { waiter_mgr_scheduler: self.waiter_mgr_scheduler.clone(), detector_scheduler: self.detector_scheduler.clone(), waiter_count: self.waiter_count.clone(), - detected: self.detected.clone(), + token_allocator: self.token_allocator.clone(), pipelined: self.pipelined.clone(), in_memory: self.in_memory.clone(), + wake_up_delay_duration_ms: self.wake_up_delay_duration_ms.clone(), } } } @@ -92,8 +86,6 @@ impl LockManager { pub fn new(cfg: &Config) -> Self { let waiter_mgr_worker = FutureWorker::new("waiter-manager"); let detector_worker = FutureWorker::new("deadlock-detector"); - let mut detected = Vec::with_capacity(DETECTED_SLOTS_NUM); - detected.resize_with(DETECTED_SLOTS_NUM, || Mutex::new(HashSet::default()).into()); Self { waiter_mgr_scheduler: WaiterMgrScheduler::new(waiter_mgr_worker.scheduler()), @@ -101,9 +93,12 @@ impl LockManager { detector_scheduler: DetectorScheduler::new(detector_worker.scheduler()), detector_worker: Some(detector_worker), waiter_count: Arc::new(AtomicUsize::new(0)), - detected: detected.into(), + token_allocator: Arc::new(AtomicU64::new(0)), pipelined: Arc::new(AtomicBool::new(cfg.pipelined)), 
in_memory: Arc::new(AtomicBool::new(cfg.in_memory)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new( + cfg.wake_up_delay_duration.as_millis(), + )), } } @@ -225,6 +220,7 @@ impl LockManager { self.detector_scheduler.clone(), self.pipelined.clone(), self.in_memory.clone(), + self.wake_up_delay_duration_ms.clone(), ) } @@ -232,35 +228,33 @@ impl LockManager { StorageDynamicConfigs { pipelined_pessimistic_lock: self.pipelined.clone(), in_memory_pessimistic_lock: self.in_memory.clone(), + wake_up_delay_duration_ms: self.wake_up_delay_duration_ms.clone(), } } - - fn add_to_detected(&self, txn_ts: TimeStamp) { - let mut detected = self.detected[detected_slot_idx(txn_ts)].lock(); - detected.insert(txn_ts); - } - - fn remove_from_detected(&self, txn_ts: TimeStamp) -> bool { - let mut detected = self.detected[detected_slot_idx(txn_ts)].lock(); - detected.remove(&txn_ts) - } } impl LockManagerTrait for LockManager { + fn allocate_token(&self) -> LockWaitToken { + LockWaitToken(Some(self.token_allocator.fetch_add(1, Ordering::Relaxed))) + } + fn wait_for( &self, + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, + cancel_callback: Box, diag_ctx: DiagnosticContext, ) { let timeout = match timeout { Some(t) => t, None => { - cb.execute(pr); + Waiter::cancel_no_timeout(wait_info, cancel_callback); return; } }; @@ -268,35 +262,34 @@ impl LockManagerTrait for LockManager { // Increase `waiter_count` here to prevent there is an on-the-fly WaitFor msg // but the waiter_mgr haven't processed it, subsequent WakeUp msgs may be lost. 
self.waiter_count.fetch_add(1, Ordering::SeqCst); - self.waiter_mgr_scheduler - .wait_for(start_ts, cb, pr, lock, timeout, diag_ctx.clone()); + + self.waiter_mgr_scheduler.wait_for( + token, + region_id, + region_epoch, + term, + start_ts, + wait_info.clone(), + timeout, + cancel_callback, + diag_ctx.clone(), + ); // If it is the first lock the transaction tries to lock, it won't cause // deadlock. if !is_first_lock { - self.add_to_detected(start_ts); - self.detector_scheduler.detect(start_ts, lock, diag_ctx); + self.detector_scheduler + .detect(start_ts, wait_info, diag_ctx); } } - fn wake_up( - &self, - lock_ts: TimeStamp, - hashes: Vec, - commit_ts: TimeStamp, - is_pessimistic_txn: bool, - ) { - // If `hashes` is some, there may be some waiters waiting for these locks. - // Try to wake up them. - if !hashes.is_empty() && self.has_waiter() { - self.waiter_mgr_scheduler - .wake_up(lock_ts, hashes, commit_ts); - } - // If a pessimistic transaction is committed or rolled back and it once sent - // requests to detect deadlock, clean up its wait-for entries in the - // deadlock detector. 
- if is_pessimistic_txn && self.remove_from_detected(lock_ts) { - self.detector_scheduler.clean_up(lock_ts); + fn update_wait_for(&self, updated_items: Vec) { + self.waiter_mgr_scheduler.update_wait_for(updated_items); + } + + fn remove_lock_wait(&self, token: LockWaitToken) { + if self.has_waiter() { + self.waiter_mgr_scheduler.remove_lock_wait(token); } } @@ -321,9 +314,11 @@ mod tests { use security::SecurityConfig; use tikv_util::config::ReadableDuration; use tracker::{TrackerToken, INVALID_TRACKER_TOKEN}; + use txn_types::Key; use self::{deadlock::tests::*, metrics::*, waiter_manager::tests::*}; use super::*; + use crate::storage::lock_manager::LockDigest; fn start_lock_manager() -> LockManager { let mut coprocessor_host = CoprocessorHost::::default(); @@ -378,69 +373,88 @@ mod tests { assert!(!lock_mgr.has_waiter()); let (waiter, lock_info, f) = new_test_waiter(10.into(), 20.into(), 20); lock_mgr.wait_for( + lock_mgr.allocate_token(), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, true, Some(WaitTimeout::Default), + waiter.cancel_callback, DiagnosticContext::default(), ); assert!(lock_mgr.has_waiter()); assert_elapsed( - || expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info), + || expect_key_is_locked(block_on(f).unwrap(), lock_info), 2500, 3500, ); assert!(!lock_mgr.has_waiter()); - // Wake up + // Removal let (waiter_ts, lock) = ( 10.into(), - Lock { + LockDigest { ts: 20.into(), hash: 20, }, ); - let (waiter, lock_info, f) = new_test_waiter(waiter_ts, lock.ts, lock.hash); + let (waiter, _lock_info, f) = new_test_waiter(waiter_ts, lock.ts, lock.hash); + let token = lock_mgr.allocate_token(); lock_mgr.wait_for( + token, + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, true, Some(WaitTimeout::Default), + waiter.cancel_callback, DiagnosticContext::default(), ); assert!(lock_mgr.has_waiter()); - lock_mgr.wake_up(lock.ts, 
vec![lock.hash], 30.into(), false); + lock_mgr.remove_lock_wait(token); + // The waiter will be directly dropped. + // In normal cases, when `remove_lock_wait` is invoked, the request's callback + // must be called somewhere else. assert_elapsed( - || expect_write_conflict(block_on(f).unwrap(), waiter_ts, lock_info, 30.into()), + || { + block_on(f).unwrap_err(); + }, 0, 500, ); assert!(!lock_mgr.has_waiter()); // Deadlock - let (waiter1, lock_info1, f1) = new_test_waiter(10.into(), 20.into(), 20); + let (waiter1, _lock_info1, f1) = new_test_waiter_with_key(10.into(), 20.into(), b"k1"); + let token1 = lock_mgr.allocate_token(); lock_mgr.wait_for( + token1, + 1, + RegionEpoch::default(), + 1, waiter1.start_ts, - waiter1.cb, - waiter1.pr, - waiter1.lock, + waiter1.wait_info, false, Some(WaitTimeout::Default), + waiter1.cancel_callback, diag_ctx(b"k1", b"tag1", INVALID_TRACKER_TOKEN), ); assert!(lock_mgr.has_waiter()); - let (waiter2, lock_info2, f2) = new_test_waiter(20.into(), 10.into(), 10); + let (waiter2, lock_info2, f2) = new_test_waiter_with_key(20.into(), 10.into(), b"k2"); lock_mgr.wait_for( + lock_mgr.allocate_token(), + 1, + RegionEpoch::default(), + 1, waiter2.start_ts, - waiter2.cb, - waiter2.pr, - waiter2.lock, + waiter2.wait_info, false, Some(WaitTimeout::Default), + waiter2.cancel_callback, diag_ctx(b"k2", b"tag2", INVALID_TRACKER_TOKEN), ); assert!(lock_mgr.has_waiter()); @@ -450,17 +464,19 @@ mod tests { block_on(f2).unwrap(), 20.into(), lock_info2, - 20, + Key::from_raw(b"k1").gen_hash(), &[(10, 20, b"k1", b"tag1"), (20, 10, b"k2", b"tag2")], ) }, 0, 500, ); - // Waiter2 releases its lock. 
- lock_mgr.wake_up(20.into(), vec![20], 20.into(), true); + // Simulating waiter2 releases its lock so that waiter1 is removed + lock_mgr.remove_lock_wait(token1); assert_elapsed( - || expect_write_conflict(block_on(f1).unwrap(), 10.into(), lock_info1, 20.into()), + || { + block_on(f1).unwrap_err(); + }, 0, 500, ); @@ -468,53 +484,80 @@ mod tests { // If it's the first lock, no detect. // If it's not, detect deadlock. + // Note that if txn 30 is writing its first lock, there should never be another + // transaction waiting for txn 30's lock. We added this waiter (40 + // waiting for 30) just for checking whether the lock manager does the + // detection internally. + let (waiter1, _, f1) = new_test_waiter_with_key(40.into(), 30.into(), b"k1"); + let token1 = lock_mgr.allocate_token(); + lock_mgr.wait_for( + token1, + 1, + RegionEpoch::default(), + 1, + waiter1.start_ts, + waiter1.wait_info, + false, + Some(WaitTimeout::Default), + waiter1.cancel_callback, + diag_ctx(b"k1", b"tag1", INVALID_TRACKER_TOKEN), + ); for is_first_lock in &[true, false] { - let (waiter, _, f) = new_test_waiter(30.into(), 40.into(), 40); + let (waiter, lock_info2, f2) = new_test_waiter_with_key(30.into(), 40.into(), b"k2"); + let token2 = lock_mgr.allocate_token(); lock_mgr.wait_for( + token2, + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, *is_first_lock, Some(WaitTimeout::Default), - DiagnosticContext::default(), + waiter.cancel_callback, + diag_ctx(b"k2", b"tag2", INVALID_TRACKER_TOKEN), ); assert!(lock_mgr.has_waiter()); - assert_eq!(lock_mgr.remove_from_detected(30.into()), !is_first_lock); - lock_mgr.wake_up(40.into(), vec![40], 40.into(), false); - block_on(f).unwrap().unwrap_err(); + if *is_first_lock { + lock_mgr.remove_lock_wait(token2); + block_on(f2).unwrap_err(); + } else { + assert_elapsed( + || { + expect_deadlock( + block_on(f2).unwrap(), + 30.into(), + lock_info2, + Key::from_raw(b"k1").gen_hash(), + &[(40, 
30, b"k1", b"tag1"), (30, 40, b"k2", b"tag2")], + ) + }, + 0, + 500, + ); + } } + lock_mgr.remove_lock_wait(token1); + block_on(f1).unwrap_err(); assert!(!lock_mgr.has_waiter()); - // If key_hashes is empty, no wake up. - let prev_wake_up = TASK_COUNTER_METRICS.wake_up.get(); - lock_mgr.wake_up(10.into(), vec![], 10.into(), false); - assert_eq!(TASK_COUNTER_METRICS.wake_up.get(), prev_wake_up); - - // If it's non-pessimistic-txn, no clean up. - let prev_clean_up = TASK_COUNTER_METRICS.clean_up.get(); - lock_mgr.wake_up(10.into(), vec![], 10.into(), false); - assert_eq!(TASK_COUNTER_METRICS.clean_up.get(), prev_clean_up); - - // If the txn doesn't wait for locks, no clean up. - let prev_clean_up = TASK_COUNTER_METRICS.clean_up.get(); - lock_mgr.wake_up(10.into(), vec![], 10.into(), true); - assert_eq!(TASK_COUNTER_METRICS.clean_up.get(), prev_clean_up); - // If timeout is none, no wait for. let (waiter, lock_info, f) = new_test_waiter(10.into(), 20.into(), 20); let prev_wait_for = TASK_COUNTER_METRICS.wait_for.get(); lock_mgr.wait_for( + lock_mgr.allocate_token(), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, false, None, + waiter.cancel_callback, DiagnosticContext::default(), ); assert_elapsed( - || expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info), + || expect_key_is_locked(block_on(f).unwrap(), lock_info), 0, 500, ); diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 2ba2b583de9..33164833fba 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -9,7 +9,7 @@ use std::{ atomic::{AtomicUsize, Ordering}, Arc, }, - time::{Duration, Instant}, + time::Instant, }; use collections::HashMap; @@ -18,7 +18,7 @@ use futures::{ future::Future, task::{Context, Poll}, }; -use kvproto::{deadlock::WaitForEntry, kvrpcpb::WriteConflictReason}; +use kvproto::{deadlock::WaitForEntry, 
metapb::RegionEpoch}; use tikv_util::{ config::ReadableDuration, time::{duration_to_sec, InstantExt}, @@ -27,13 +27,17 @@ use tikv_util::{ }; use tokio::task::spawn_local; use tracker::GLOBAL_TRACKERS; +use txn_types::Key; use super::{config::Config, deadlock::Scheduler as DetectorScheduler, metrics::*}; use crate::storage::{ - lock_manager::{DiagnosticContext, Lock, WaitTimeout}, + lock_manager::{ + DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, + WaitTimeout, + }, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, TimeStamp}, - txn::{Error as TxnError, ErrorInner as TxnErrorInner}, - Error as StorageError, ErrorInner as StorageErrorInner, ProcessResult, StorageCallback, + txn::Error as TxnError, + Error as StorageError, ErrorInner as StorageErrorInner, }; struct DelayInner { @@ -103,21 +107,27 @@ pub type Callback = Box) + Send>; #[allow(clippy::large_enum_variant)] pub enum Task { + SetKeyWakeUpDelayCallback { + cb: Box, + }, WaitFor { + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, // which txn waits for the lock start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, timeout: WaitTimeout, + cancel_callback: Box, diag_ctx: DiagnosticContext, start_waiting_time: Instant, }, - WakeUp { - // lock info - lock_ts: TimeStamp, - hashes: Vec, - commit_ts: TimeStamp, + RemoveLockWait { + token: LockWaitToken, + }, + UpdateWaitFor { + events: Vec, }, Dump { cb: Callback, @@ -125,16 +135,16 @@ pub enum Task { Deadlock { // Which txn causes deadlock start_ts: TimeStamp, - lock: Lock, + key: Vec, + lock: LockDigest, deadlock_key_hash: u64, wait_chain: Vec, }, ChangeConfig { timeout: Option, - delay: Option, }, #[cfg(any(test, feature = "testexport"))] - Validate(Box), + Validate(Box), } /// Debug for task. 
@@ -148,16 +158,33 @@ impl Debug for Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Task::WaitFor { start_ts, lock, .. } => { - write!(f, "txn:{} waiting for {}:{}", start_ts, lock.ts, lock.hash) + Task::SetKeyWakeUpDelayCallback { .. } => { + write!(f, "setting key wake up delay callback") + } + Task::WaitFor { + token, + start_ts, + wait_info, + .. + } => { + write!( + f, + "txn:{} waiting for {}:{}, token {:?}", + start_ts, wait_info.lock_digest.ts, wait_info.lock_digest.hash, token + ) + } + Task::RemoveLockWait { token } => { + write!(f, "waking up txns waiting for token {:?}", token) + } + Task::UpdateWaitFor { events } => { + write!(f, "updating wait info {:?}", events) } - Task::WakeUp { lock_ts, .. } => write!(f, "waking up txns waiting for {}", lock_ts), Task::Dump { .. } => write!(f, "dump"), Task::Deadlock { start_ts, .. } => write!(f, "txn:{} deadlock", start_ts), - Task::ChangeConfig { timeout, delay } => write!( + Task::ChangeConfig { timeout } => write!( f, - "change config to default_wait_for_lock_timeout: {:?}, wake_up_delay_duration: {:?}", - timeout, delay + "change config to default_wait_for_lock_timeout: {:?}", + timeout ), #[cfg(any(test, feature = "testexport"))] Task::Validate(_) => write!(f, "validate waiter manager config"), @@ -172,15 +199,14 @@ impl Display for Task { /// has a timeout. Transaction will be notified when the lock is released /// or the corresponding waiter times out. pub(crate) struct Waiter { + // These field will be needed for supporting region-level waking up when region errors + // happens. + // region_id: u64, + // region_epoch: RegionEpoch, + // term: u64, pub(crate) start_ts: TimeStamp, - pub(crate) cb: StorageCallback, - /// The result of `Command::AcquirePessimisticLock`. - /// - /// It contains a `KeyIsLocked` error at the beginning. It will be changed - /// to `WriteConflict` error if the lock is released or `Deadlock` error if - /// it causes deadlock. 
- pub(crate) pr: ProcessResult, - pub(crate) lock: Lock, + pub(crate) wait_info: KeyLockWaitInfo, + pub(crate) cancel_callback: Box, pub diag_ctx: DiagnosticContext, delay: Delay, start_waiting_time: Instant, @@ -188,19 +214,20 @@ pub(crate) struct Waiter { impl Waiter { fn new( + _region_id: u64, + _region_epoch: RegionEpoch, + _term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, + cancel_callback: Box, deadline: Instant, diag_ctx: DiagnosticContext, start_waiting_time: Instant, ) -> Self { Self { start_ts, - cb, - pr, - lock, + wait_info, + cancel_callback, delay: Delay::new(deadline), diag_ctx, start_waiting_time, @@ -220,13 +247,13 @@ impl Waiter { } } + #[allow(dead_code)] fn reset_timeout(&self, deadline: Instant) { self.delay.reset(deadline); } - /// `Notify` consumes the `Waiter` to notify the corresponding transaction - /// going on. - fn notify(self) { + /// Consumes the `Waiter` to notify the corresponding transaction going on. + fn cancel(self, error: Option) -> KeyLockWaitInfo { let elapsed = self.start_waiting_time.saturating_elapsed(); GLOBAL_TRACKERS.with_tracker(self.diag_ctx.tracker, |tracker| { tracker.metrics.pessimistic_lock_wait_nanos = elapsed.as_nanos() as u64; @@ -234,165 +261,138 @@ impl Waiter { WAITER_LIFETIME_HISTOGRAM.observe(duration_to_sec(elapsed)); // Cancel the delay timer to prevent removing the same `Waiter` earlier. self.delay.cancel(); - self.cb.execute(self.pr); + if let Some(error) = error { + (self.cancel_callback)(error); + } + self.wait_info } - /// Changes the `ProcessResult` to `WriteConflict`. - /// It may be invoked more than once. 
- fn conflict_with(&mut self, lock_ts: TimeStamp, commit_ts: TimeStamp) { - let (key, primary) = self.extract_key_info(); - let mvcc_err = MvccError::from(MvccErrorInner::WriteConflict { - start_ts: self.start_ts, - conflict_start_ts: lock_ts, - conflict_commit_ts: commit_ts, - key, - primary, - reason: WriteConflictReason::PessimisticRetry, - }); - self.pr = ProcessResult::Failed { - err: StorageError::from(TxnError::from(mvcc_err)), - }; + fn cancel_for_finished(self) -> KeyLockWaitInfo { + self.cancel(None) + } + + fn cancel_for_timeout(self, _skip_resolving_lock: bool) -> KeyLockWaitInfo { + let lock_info = self.wait_info.lock_info.clone(); + // lock_info.set_skip_resolving_lock(skip_resolving_lock); + let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); + self.cancel(Some(StorageError::from(TxnError::from(error)))) } - /// Changes the `ProcessResult` to `Deadlock`. - fn deadlock_with(&mut self, deadlock_key_hash: u64, wait_chain: Vec) { - let (key, _) = self.extract_key_info(); - let mvcc_err = MvccError::from(MvccErrorInner::Deadlock { + pub(super) fn cancel_no_timeout( + wait_info: KeyLockWaitInfo, + cancel_callback: Box, + ) { + let lock_info = wait_info.lock_info; + let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); + cancel_callback(StorageError::from(TxnError::from(error))) + } + + fn cancel_for_deadlock( + self, + lock_digest: LockDigest, + key: Vec, + deadlock_key_hash: u64, + wait_chain: Vec, + ) -> KeyLockWaitInfo { + let e = MvccError::from(MvccErrorInner::Deadlock { start_ts: self.start_ts, - lock_ts: self.lock.ts, + lock_ts: lock_digest.ts, lock_key: key, deadlock_key_hash, wait_chain, }); - self.pr = ProcessResult::Failed { - err: StorageError::from(TxnError::from(mvcc_err)), - }; - } - - /// Extracts key and primary key from `ProcessResult`. 
- fn extract_key_info(&mut self) -> (Vec, Vec) { - match &mut self.pr { - ProcessResult::PessimisticLockRes { res } => match res { - Err(StorageError(box StorageErrorInner::Txn(TxnError( - box TxnErrorInner::Mvcc(MvccError(box MvccErrorInner::KeyIsLocked(info))), - )))) => (info.take_key(), info.take_primary_lock()), - _ => panic!("unexpected mvcc error"), - }, - ProcessResult::Failed { err } => match err { - StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( - MvccError(box MvccErrorInner::WriteConflict { - ref mut key, - ref mut primary, - .. - }), - )))) => (std::mem::take(key), std::mem::take(primary)), - _ => panic!("unexpected mvcc error"), - }, - _ => panic!("unexpected progress result"), - } + self.cancel(Some(StorageError::from(TxnError::from(e)))) } } -// NOTE: Now we assume `Waiters` is not very long. -// Maybe needs to use `BinaryHeap` or sorted `VecDeque` instead. -type Waiters = Vec; - struct WaitTable { - // Map lock hash to waiters. - wait_table: HashMap, + // Map lock hash and ts to waiters. + // For compatibility. + wait_table: HashMap<(u64, TimeStamp), LockWaitToken>, + waiter_pool: HashMap, waiter_count: Arc, + + wake_up_key_delay_callback: Option>, } impl WaitTable { fn new(waiter_count: Arc) -> Self { Self { wait_table: HashMap::default(), + waiter_pool: HashMap::default(), waiter_count, + wake_up_key_delay_callback: None, } } + fn set_wake_up_key_delay_callback( + &mut self, + cb: Option>, + ) { + self.wake_up_key_delay_callback = cb; + } + #[cfg(test)] fn count(&self) -> usize { - self.wait_table.iter().map(|(_, v)| v.len()).sum() + self.waiter_pool.len() } fn is_empty(&self) -> bool { - self.wait_table.is_empty() + self.waiter_pool.is_empty() } - /// Returns the duplicated `Waiter` if there is. 
- fn add_waiter(&mut self, waiter: Waiter) -> Option { - let waiters = self.wait_table.entry(waiter.lock.hash).or_insert_with(|| { - WAIT_TABLE_STATUS_GAUGE.locks.inc(); - Waiters::default() - }); - let old_idx = waiters.iter().position(|w| w.start_ts == waiter.start_ts); - waiters.push(waiter); - if let Some(old_idx) = old_idx { - let old = waiters.swap_remove(old_idx); - self.waiter_count.fetch_sub(1, Ordering::SeqCst); - Some(old) - } else { - WAIT_TABLE_STATUS_GAUGE.txns.inc(); - None - } - // Here we don't increase waiter_count because it's already updated in - // LockManager::wait_for() + /// Adds a waiter identified by given token. The caller must guarantee that + /// the `token` is unique and doesn't exist in waiter manager currently. + fn add_waiter(&mut self, token: LockWaitToken, waiter: Waiter) { + self.wait_table + .insert((waiter.wait_info.lock_digest.hash, waiter.start_ts), token); + assert!(self.waiter_pool.insert(token, waiter).is_none()); } - /// Removes all waiters waiting for the lock. 
- fn remove(&mut self, lock: Lock) { - self.wait_table.remove(&lock.hash); - WAIT_TABLE_STATUS_GAUGE.locks.dec(); + fn take_waiter(&mut self, token: LockWaitToken) -> Option { + let waiter = self.waiter_pool.remove(&token)?; + self.waiter_count.fetch_sub(1, Ordering::SeqCst); + self.wait_table + .remove(&(waiter.wait_info.lock_digest.hash, waiter.start_ts)); + Some(waiter) } - fn remove_waiter(&mut self, lock: Lock, waiter_ts: TimeStamp) -> Option { - let waiters = self.wait_table.get_mut(&lock.hash)?; - let idx = waiters - .iter() - .position(|waiter| waiter.start_ts == waiter_ts)?; - let waiter = waiters.swap_remove(idx); - self.waiter_count.fetch_sub(1, Ordering::SeqCst); - WAIT_TABLE_STATUS_GAUGE.txns.dec(); - if waiters.is_empty() { - self.remove(lock); + fn update_waiter(&mut self, update_event: &UpdateWaitForEvent) -> Option { + let waiter = self.waiter_pool.get_mut(&update_event.token)?; + + assert_eq!(waiter.wait_info.key, update_event.wait_info.key); + + if waiter.wait_info.lock_digest.ts == update_event.wait_info.lock_digest.ts { + // Unchanged. + return None; } - Some(waiter) + + let result = std::mem::replace(&mut waiter.wait_info, update_event.wait_info.clone()); + waiter.diag_ctx = update_event.diag_ctx.clone(); + + Some(result) } - /// Removes the `Waiter` with the smallest start ts and returns it with - /// remaining waiters. - /// - /// NOTE: Due to the borrow checker, it doesn't remove the entry in the - /// `WaitTable` even if there is no remaining waiter. 
- fn remove_oldest_waiter(&mut self, lock: Lock) -> Option<(Waiter, &mut Waiters)> { - let waiters = self.wait_table.get_mut(&lock.hash)?; - let oldest_idx = waiters - .iter() - .enumerate() - .min_by_key(|(_, w)| w.start_ts) - .unwrap() - .0; - let oldest = waiters.swap_remove(oldest_idx); - self.waiter_count.fetch_sub(1, Ordering::SeqCst); - WAIT_TABLE_STATUS_GAUGE.txns.dec(); - Some((oldest, waiters)) + fn take_waiter_by_lock_digest( + &mut self, + lock: LockDigest, + waiter_ts: TimeStamp, + ) -> Option { + let token = *self.wait_table.get(&(lock.hash, waiter_ts))?; + self.take_waiter(token) } fn to_wait_for_entries(&self) -> Vec { - self.wait_table + self.waiter_pool .iter() - .flat_map(|(_, waiters)| { - waiters.iter().map(|waiter| { - let mut wait_for_entry = WaitForEntry::default(); - wait_for_entry.set_txn(waiter.start_ts.into_inner()); - wait_for_entry.set_wait_for_txn(waiter.lock.ts.into_inner()); - wait_for_entry.set_key_hash(waiter.lock.hash); - wait_for_entry.set_key(waiter.diag_ctx.key.clone()); - wait_for_entry - .set_resource_group_tag(waiter.diag_ctx.resource_group_tag.clone()); - wait_for_entry - }) + .map(|(_, waiter)| { + let mut wait_for_entry = WaitForEntry::default(); + wait_for_entry.set_txn(waiter.start_ts.into_inner()); + wait_for_entry.set_wait_for_txn(waiter.wait_info.lock_digest.ts.into_inner()); + wait_for_entry.set_key_hash(waiter.wait_info.lock_digest.hash); + wait_for_entry.set_key(waiter.wait_info.key.to_raw().unwrap()); + wait_for_entry.set_resource_group_tag(waiter.diag_ctx.resource_group_tag.clone()); + wait_for_entry }) .collect() } @@ -409,8 +409,12 @@ impl Scheduler { fn notify_scheduler(&self, task: Task) -> bool { if let Err(Stopped(task)) = self.0.schedule(task) { error!("failed to send task to waiter_manager"; "task" => %task); - if let Task::WaitFor { cb, pr, .. } = task { - cb.execute(pr); + if let Task::WaitFor { + cancel_callback, .. + } = task + { + // TODO: Pass proper error for the scheduling error. 
+ cancel_callback(StorageError(Box::new(StorageErrorInner::SchedTooBusy))); } return false; } @@ -419,30 +423,43 @@ impl Scheduler { pub fn wait_for( &self, + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, timeout: WaitTimeout, + cancel_callback: Box, diag_ctx: DiagnosticContext, ) { self.notify_scheduler(Task::WaitFor { + token, + region_id, + region_epoch, + term, start_ts, - cb, - pr, - lock, + wait_info, timeout, + cancel_callback, diag_ctx, start_waiting_time: Instant::now(), }); } - pub fn wake_up(&self, lock_ts: TimeStamp, hashes: Vec, commit_ts: TimeStamp) { - self.notify_scheduler(Task::WakeUp { - lock_ts, - hashes, - commit_ts, - }); + pub fn set_key_wake_up_delay_callback( + &self, + cb: Box, + ) { + self.notify_scheduler(Task::SetKeyWakeUpDelayCallback { cb }); + } + + pub fn remove_lock_wait(&self, token: LockWaitToken) { + self.notify_scheduler(Task::RemoveLockWait { token }); + } + + pub fn update_wait_for(&self, events: Vec) { + self.notify_scheduler(Task::UpdateWaitFor { events }); } pub fn dump_wait_table(&self, cb: Callback) -> bool { @@ -452,43 +469,38 @@ impl Scheduler { pub fn deadlock( &self, txn_ts: TimeStamp, - lock: Lock, + key: Vec, + lock: LockDigest, deadlock_key_hash: u64, wait_chain: Vec, ) { self.notify_scheduler(Task::Deadlock { start_ts: txn_ts, + key, lock, deadlock_key_hash, wait_chain, }); } - pub fn change_config( - &self, - timeout: Option, - delay: Option, - ) { - self.notify_scheduler(Task::ChangeConfig { timeout, delay }); + pub fn change_config(&self, timeout: Option) { + self.notify_scheduler(Task::ChangeConfig { timeout }); } #[cfg(any(test, feature = "testexport"))] - pub fn validate(&self, f: Box) { + pub fn validate(&self, f: Box) { self.notify_scheduler(Task::Validate(f)); } } -/// WaiterManager handles waiting and wake-up of pessimistic lock +/// WaiterManager handles lock 
waiting, cancels waiters when needed (due to +/// timeout or deadlock detected), and provide lock waiting information for +/// diagnosing. pub struct WaiterManager { wait_table: Rc>, detector_scheduler: DetectorScheduler, /// It is the default and maximum timeout of waiter. default_wait_for_lock_timeout: ReadableDuration, - /// If more than one waiters are waiting for the same lock, only the - /// oldest one will be waked up immediately when the lock is released. - /// Others will be waked up after `wake_up_delay_duration` to reduce - /// contention and make the oldest one more likely acquires the lock. - wake_up_delay_duration: ReadableDuration, } unsafe impl Send for WaiterManager {} @@ -499,11 +511,12 @@ impl WaiterManager { detector_scheduler: DetectorScheduler, cfg: &Config, ) -> Self { + let wait_table = WaitTable::new(waiter_count); + Self { - wait_table: Rc::new(RefCell::new(WaitTable::new(waiter_count))), + wait_table: Rc::new(RefCell::new(wait_table)), detector_scheduler, default_wait_for_lock_timeout: cfg.wait_for_lock_timeout, - wake_up_delay_duration: cfg.wake_up_delay_duration, } } @@ -512,51 +525,52 @@ impl WaiterManager { + timeout.into_duration_with_ceiling(self.default_wait_for_lock_timeout.as_millis()) } - fn handle_wait_for(&mut self, waiter: Waiter) { - let (waiter_ts, lock) = (waiter.start_ts, waiter.lock); + fn handle_wait_for(&mut self, token: LockWaitToken, waiter: Waiter) { let wait_table = self.wait_table.clone(); let detector_scheduler = self.detector_scheduler.clone(); // Remove the waiter from wait table when it times out. 
let f = waiter.on_timeout(move || { - if let Some(waiter) = wait_table.borrow_mut().remove_waiter(lock, waiter_ts) { - detector_scheduler.clean_up_wait_for(waiter.start_ts, waiter.lock); - waiter.notify(); + let mut wait_table = wait_table.borrow_mut(); + if let Some(waiter) = wait_table.take_waiter(token) { + let start_ts = waiter.start_ts; + let wait_info = waiter.cancel_for_timeout(false); + detector_scheduler.clean_up_wait_for(start_ts, wait_info); } }); - if let Some(old) = self.wait_table.borrow_mut().add_waiter(waiter) { - old.notify(); - }; + self.wait_table.borrow_mut().add_waiter(token, waiter); spawn_local(f); } - fn handle_wake_up(&mut self, lock_ts: TimeStamp, hashes: Vec, commit_ts: TimeStamp) { + fn handle_remove_lock_wait(&mut self, token: LockWaitToken) { let mut wait_table = self.wait_table.borrow_mut(); if wait_table.is_empty() { return; } - let duration: Duration = self.wake_up_delay_duration.into(); - let new_timeout = Instant::now() + duration; - for hash in hashes { - let lock = Lock { ts: lock_ts, hash }; - if let Some((mut oldest, others)) = wait_table.remove_oldest_waiter(lock) { - // Notify the oldest one immediately. + let waiter = if let Some(w) = wait_table.take_waiter(token) { + w + } else { + return; + }; + let start_ts = waiter.start_ts; + let wait_info = waiter.cancel_for_finished(); + self.detector_scheduler + .clean_up_wait_for(start_ts, wait_info); + } + + fn handle_update_wait_for(&mut self, events: Vec) { + let mut wait_table = self.wait_table.borrow_mut(); + for event in events { + let previous_wait_info = wait_table.update_waiter(&event); + + if event.is_first_lock { + continue; + } + + if let Some(previous_wait_info) = previous_wait_info { self.detector_scheduler - .clean_up_wait_for(oldest.start_ts, oldest.lock); - oldest.conflict_with(lock_ts, commit_ts); - oldest.notify(); - // Others will be waked up after `wake_up_delay_duration`. - // - // NOTE: Actually these waiters are waiting for an unknown transaction. 
- // If there is a deadlock between them, it will be detected after timeout. - if others.is_empty() { - // Remove the empty entry here. - wait_table.remove(lock); - } else { - others.iter_mut().for_each(|waiter| { - waiter.conflict_with(lock_ts, commit_ts); - waiter.reset_timeout(new_timeout); - }); - } + .clean_up_wait_for(event.start_ts, previous_wait_info); + self.detector_scheduler + .detect(event.start_ts, event.wait_info, event.diag_ctx); } } } @@ -568,31 +582,27 @@ impl WaiterManager { fn handle_deadlock( &mut self, waiter_ts: TimeStamp, - lock: Lock, + key: Vec, + lock: LockDigest, deadlock_key_hash: u64, wait_chain: Vec, ) { - if let Some(mut waiter) = self.wait_table.borrow_mut().remove_waiter(lock, waiter_ts) { - waiter.deadlock_with(deadlock_key_hash, wait_chain); - waiter.notify(); + let waiter = self + .wait_table + .borrow_mut() + .take_waiter_by_lock_digest(lock, waiter_ts); + if let Some(waiter) = waiter { + waiter.cancel_for_deadlock(lock, key, deadlock_key_hash, wait_chain); } } - fn handle_config_change( - &mut self, - timeout: Option, - delay: Option, - ) { + fn handle_config_change(&mut self, timeout: Option) { if let Some(timeout) = timeout { self.default_wait_for_lock_timeout = timeout; } - if let Some(delay) = delay { - self.wake_up_delay_duration = delay; - } info!( "Waiter manager config changed"; "default_wait_for_lock_timeout" => self.default_wait_for_lock_timeout.to_string(), - "wake_up_delay_duration" => self.wake_up_delay_duration.to_string() ); } } @@ -600,52 +610,63 @@ impl WaiterManager { impl FutureRunnable for WaiterManager { fn run(&mut self, task: Task) { match task { + Task::SetKeyWakeUpDelayCallback { cb } => { + self.wait_table + .borrow_mut() + .set_wake_up_key_delay_callback(Some(cb)); + } Task::WaitFor { + token, + region_id, + region_epoch, + term, start_ts, - cb, - pr, - lock, + wait_info, timeout, + cancel_callback, diag_ctx, start_waiting_time, } => { let waiter = Waiter::new( + region_id, + region_epoch, + term, 
start_ts, - cb, - pr, - lock, + wait_info, + cancel_callback, self.normalize_deadline(timeout), diag_ctx, start_waiting_time, ); - self.handle_wait_for(waiter); + self.handle_wait_for(token, waiter); TASK_COUNTER_METRICS.wait_for.inc(); } - Task::WakeUp { - lock_ts, - hashes, - commit_ts, - } => { - self.handle_wake_up(lock_ts, hashes, commit_ts); + Task::RemoveLockWait { token } => { + self.handle_remove_lock_wait(token); TASK_COUNTER_METRICS.wake_up.inc(); } + Task::UpdateWaitFor { events } => { + self.handle_update_wait_for(events); + TASK_COUNTER_METRICS.update_wait_for.inc(); + } Task::Dump { cb } => { self.handle_dump(cb); TASK_COUNTER_METRICS.dump.inc(); } Task::Deadlock { start_ts, + key, lock, deadlock_key_hash, wait_chain, } => { - self.handle_deadlock(start_ts, lock, deadlock_key_hash, wait_chain); + self.handle_deadlock(start_ts, key, lock, deadlock_key_hash, wait_chain); } - Task::ChangeConfig { timeout, delay } => self.handle_config_change(timeout, delay), + Task::ChangeConfig { timeout } => self.handle_config_change(timeout), #[cfg(any(test, feature = "testexport"))] Task::Validate(f) => f( self.default_wait_for_lock_timeout, - self.wake_up_delay_duration, + // self.wake_up_delay_duration, ), } } @@ -662,16 +683,20 @@ pub mod tests { config::ReadableDuration, future::paired_future_callback, time::InstantExt, worker::FutureWorker, }; + use txn_types::Key; use super::*; - use crate::storage::PessimisticLockRes; + use crate::storage::txn::ErrorInner as TxnErrorInner; fn dummy_waiter(start_ts: TimeStamp, lock_ts: TimeStamp, hash: u64) -> Waiter { Waiter { start_ts, - cb: StorageCallback::Boolean(Box::new(|_| ())), - pr: ProcessResult::Res, - lock: Lock { ts: lock_ts, hash }, + wait_info: KeyLockWaitInfo { + key: Key::from_raw(b""), + lock_digest: LockDigest { ts: lock_ts, hash }, + lock_info: Default::default(), + }, + cancel_callback: Box::new(|_| ()), diag_ctx: DiagnosticContext::default(), delay: Delay::new(Instant::now()), start_waiting_time: 
Instant::now(), @@ -741,9 +766,7 @@ pub mod tests { pub(crate) type WaiterCtx = ( Waiter, LockInfo, - futures::channel::oneshot::Receiver< - Result, StorageError>, - >, + futures::channel::oneshot::Receiver, ); pub(crate) fn new_test_waiter( @@ -751,29 +774,48 @@ pub mod tests { lock_ts: TimeStamp, lock_hash: u64, ) -> WaiterCtx { - let raw_key = b"foo".to_vec(); + new_test_waiter_impl(waiter_ts, lock_ts, None, Some(lock_hash)) + } + + pub(crate) fn new_test_waiter_with_key( + waiter_ts: TimeStamp, + lock_ts: TimeStamp, + key: &[u8], + ) -> WaiterCtx { + new_test_waiter_impl(waiter_ts, lock_ts, Some(key), None) + } + + fn new_test_waiter_impl( + waiter_ts: TimeStamp, + lock_ts: TimeStamp, + key: Option<&[u8]>, + lock_hash: Option, + ) -> WaiterCtx { + let raw_key = key.unwrap_or(b"foo").to_vec(); + let lock_hash = lock_hash.unwrap_or_else(|| Key::from_raw(&raw_key).gen_hash()); let primary = b"bar".to_vec(); let mut info = LockInfo::default(); - info.set_key(raw_key); + info.set_key(raw_key.clone()); info.set_lock_version(lock_ts.into_inner()); info.set_primary_lock(primary); info.set_lock_ttl(3000); info.set_txn_size(16); - let pr = ProcessResult::PessimisticLockRes { - res: Err(StorageError::from(TxnError::from(MvccError::from( - MvccErrorInner::KeyIsLocked(info.clone()), - )))), - }; - let lock = Lock { + let lock = LockDigest { ts: lock_ts, hash: lock_hash, }; let (cb, f) = paired_future_callback(); let waiter = Waiter::new( + 1, + Default::default(), + 1, waiter_ts, - StorageCallback::PessimisticLock(cb), - pr, - lock, + KeyLockWaitInfo { + key: Key::from_raw(&raw_key), + lock_digest: lock, + lock_info: info.clone(), + }, + cb, Instant::now() + Duration::from_millis(3000), DiagnosticContext::default(), Instant::now(), @@ -781,71 +823,25 @@ pub mod tests { (waiter, info, f) } - #[test] - fn test_waiter_extract_key_info() { - let (mut waiter, mut lock_info, _) = new_test_waiter(10.into(), 20.into(), 20); - assert_eq!( - waiter.extract_key_info(), - 
(lock_info.take_key(), lock_info.take_primary_lock()) - ); - - let (mut waiter, mut lock_info, _) = new_test_waiter(10.into(), 20.into(), 20); - waiter.conflict_with(20.into(), 30.into()); - assert_eq!( - waiter.extract_key_info(), - (lock_info.take_key(), lock_info.take_primary_lock()) - ); - } - - pub(crate) fn expect_key_is_locked( - res: Result, - lock_info: LockInfo, - ) { - match res { - Err(StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + pub(crate) fn expect_key_is_locked(error: StorageError, lock_info: LockInfo) { + match error { + StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( MvccError(box MvccErrorInner::KeyIsLocked(res)), - ))))) => assert_eq!(res, lock_info), + )))) => assert_eq!(res, lock_info), e => panic!("unexpected error: {:?}", e), } } - pub(crate) fn expect_write_conflict( - res: Result, - waiter_ts: TimeStamp, - mut lock_info: LockInfo, - commit_ts: TimeStamp, - ) { - match res { - Err(StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( - MvccError(box MvccErrorInner::WriteConflict { - start_ts, - conflict_start_ts, - conflict_commit_ts, - key, - primary, - .. 
- }), - ))))) => { - assert_eq!(start_ts, waiter_ts); - assert_eq!(conflict_start_ts, lock_info.get_lock_version().into()); - assert_eq!(conflict_commit_ts, commit_ts); - assert_eq!(key, lock_info.take_key()); - assert_eq!(primary, lock_info.take_primary_lock()); - } - e => panic!("unexpected error: {:?}", e), - } - } - - pub(crate) fn expect_deadlock( - res: Result, + pub(crate) fn expect_deadlock( + error: StorageError, waiter_ts: TimeStamp, mut lock_info: LockInfo, deadlock_hash: u64, expect_wait_chain: &[(u64, u64, &[u8], &[u8])], /* (waiter_ts, wait_for_ts, key, * resource_group_tag) */ ) { - match res { - Err(StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + match error { + StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( MvccError(box MvccErrorInner::Deadlock { start_ts, lock_ts, @@ -853,7 +849,7 @@ pub mod tests { deadlock_key_hash, wait_chain, }), - ))))) => { + )))) => { assert_eq!(start_ts, waiter_ts); assert_eq!(lock_ts, lock_info.get_lock_version().into()); assert_eq!(lock_key, lock_info.take_key()); @@ -900,41 +896,21 @@ pub mod tests { #[test] fn test_waiter_notify() { let (waiter, lock_info, f) = new_test_waiter(10.into(), 20.into(), 20); - waiter.notify(); - expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info); - - // A waiter can conflict with other transactions more than once. 
- for conflict_times in 1..=3 { - let waiter_ts = TimeStamp::new(10); - let mut lock_ts = TimeStamp::new(20); - let (mut waiter, mut lock_info, f) = new_test_waiter(waiter_ts, lock_ts, 20); - let mut conflict_commit_ts = TimeStamp::new(30); - for _ in 0..conflict_times { - waiter.conflict_with(*lock_ts.incr(), *conflict_commit_ts.incr()); - lock_info.set_lock_version(lock_ts.into_inner()); - } - waiter.notify(); - expect_write_conflict( - block_on(f).unwrap(), - waiter_ts, - lock_info, - conflict_commit_ts, - ); - } + waiter.cancel_for_timeout(false); + expect_key_is_locked(block_on(f).unwrap(), lock_info); // Deadlock let waiter_ts = TimeStamp::new(10); - let (mut waiter, lock_info, f) = new_test_waiter(waiter_ts, 20.into(), 20); - waiter.deadlock_with(111, vec![]); - waiter.notify(); - expect_deadlock(block_on(f).unwrap(), waiter_ts, lock_info, 111, &[]); - - // Conflict then deadlock. - let waiter_ts = TimeStamp::new(10); - let (mut waiter, lock_info, f) = new_test_waiter(waiter_ts, 20.into(), 20); - waiter.conflict_with(20.into(), 30.into()); - waiter.deadlock_with(111, vec![]); - waiter.notify(); + let (waiter, lock_info, f) = new_test_waiter(waiter_ts, 20.into(), 20); + waiter.cancel_for_deadlock( + LockDigest { + ts: 20.into(), + hash: 20, + }, + b"foo".to_vec(), + 111, + vec![], + ); expect_deadlock(block_on(f).unwrap(), waiter_ts, lock_info, 111, &[]); } @@ -953,7 +929,7 @@ pub mod tests { waiter.reset_timeout(Instant::now() + Duration::from_millis(100)); let (tx, rx) = mpsc::sync_channel(1); let f = waiter.on_timeout(move || tx.send(1).unwrap()); - waiter.notify(); + waiter.cancel_for_timeout(false); assert_elapsed(|| block_on(f), 0, 200); rx.try_recv().unwrap_err(); } @@ -963,33 +939,33 @@ pub mod tests { let mut wait_table = WaitTable::new(Arc::new(AtomicUsize::new(0))); let mut waiter_info = Vec::new(); let mut rng = rand::thread_rng(); - for _ in 0..20 { + for i in 0..20 { let waiter_ts = rng.gen::().into(); - let lock = Lock { + let lock = LockDigest 
{ ts: rng.gen::().into(), hash: rng.gen(), }; - // Avoid adding duplicated waiter. - if wait_table - .add_waiter(dummy_waiter(waiter_ts, lock.ts, lock.hash)) - .is_none() - { - waiter_info.push((waiter_ts, lock)); - } + wait_table.add_waiter( + LockWaitToken(Some(i)), + dummy_waiter(waiter_ts, lock.ts, lock.hash), + ); + waiter_info.push((waiter_ts, lock)); } assert_eq!(wait_table.count(), waiter_info.len()); for (waiter_ts, lock) in waiter_info { - let waiter = wait_table.remove_waiter(lock, waiter_ts).unwrap(); + let waiter = wait_table + .take_waiter_by_lock_digest(lock, waiter_ts) + .unwrap(); assert_eq!(waiter.start_ts, waiter_ts); - assert_eq!(waiter.lock, lock); + assert_eq!(waiter.wait_info.lock_digest, lock); } assert_eq!(wait_table.count(), 0); assert!(wait_table.wait_table.is_empty()); assert!( wait_table - .remove_waiter( - Lock { + .take_waiter_by_lock_digest( + LockDigest { ts: TimeStamp::zero(), hash: 0, }, @@ -999,87 +975,53 @@ pub mod tests { ); } - #[test] - fn test_wait_table_add_duplicated_waiter() { - let mut wait_table = WaitTable::new(Arc::new(AtomicUsize::new(0))); - let waiter_ts = 10.into(); - let lock = Lock { - ts: 20.into(), - hash: 20, - }; - assert!( - wait_table - .add_waiter(dummy_waiter(waiter_ts, lock.ts, lock.hash)) - .is_none() - ); - let waiter = wait_table - .add_waiter(dummy_waiter(waiter_ts, lock.ts, lock.hash)) - .unwrap(); - assert_eq!(waiter.start_ts, waiter_ts); - assert_eq!(waiter.lock, lock); - } - - #[test] - fn test_wait_table_remove_oldest_waiter() { - let mut wait_table = WaitTable::new(Arc::new(AtomicUsize::new(0))); - let lock = Lock { - ts: 10.into(), - hash: 10, - }; - let waiter_count = 10; - let mut waiters_ts: Vec = (0..waiter_count).map(TimeStamp::from).collect(); - waiters_ts.shuffle(&mut rand::thread_rng()); - for ts in waiters_ts.iter() { - wait_table.add_waiter(dummy_waiter(*ts, lock.ts, lock.hash)); - } - assert_eq!(wait_table.count(), waiters_ts.len()); - waiters_ts.sort(); - for (i, ts) in 
waiters_ts.into_iter().enumerate() { - let (oldest, others) = wait_table.remove_oldest_waiter(lock).unwrap(); - assert_eq!(oldest.start_ts, ts); - assert_eq!(others.len(), waiter_count as usize - i - 1); - } - // There is no waiter in the wait table but there is an entry in it. - assert_eq!(wait_table.count(), 0); - assert_eq!(wait_table.wait_table.len(), 1); - wait_table.remove(lock); - assert!(wait_table.wait_table.is_empty()); - } - #[test] fn test_wait_table_is_empty() { let waiter_count = Arc::new(AtomicUsize::new(0)); let mut wait_table = WaitTable::new(Arc::clone(&waiter_count)); - let lock = Lock { + let lock = LockDigest { ts: 2.into(), hash: 2, }; - wait_table.add_waiter(dummy_waiter(1.into(), lock.ts, lock.hash)); + wait_table.add_waiter( + LockWaitToken(Some(1)), + dummy_waiter(1.into(), lock.ts, lock.hash), + ); // Increase waiter_count manually and assert the previous value is zero assert_eq!(waiter_count.fetch_add(1, Ordering::SeqCst), 0); - // Adding a duplicated waiter shouldn't increase waiter count. - waiter_count.fetch_add(1, Ordering::SeqCst); - wait_table.add_waiter(dummy_waiter(1.into(), lock.ts, lock.hash)); - assert_eq!(waiter_count.load(Ordering::SeqCst), 1); - // Remove the waiter. - wait_table.remove_waiter(lock, 1.into()).unwrap(); + wait_table + .take_waiter_by_lock_digest(lock, 1.into()) + .unwrap(); assert_eq!(waiter_count.load(Ordering::SeqCst), 0); // Removing a non-existed waiter shouldn't decrease waiter count. 
- assert!(wait_table.remove_waiter(lock, 1.into()).is_none()); + assert!( + wait_table + .take_waiter_by_lock_digest(lock, 1.into()) + .is_none() + ); assert_eq!(waiter_count.load(Ordering::SeqCst), 0); - wait_table.add_waiter(dummy_waiter(1.into(), lock.ts, lock.hash)); - wait_table.add_waiter(dummy_waiter(2.into(), lock.ts, lock.hash)); + wait_table.add_waiter( + LockWaitToken(Some(2)), + dummy_waiter(1.into(), lock.ts, lock.hash), + ); + wait_table.add_waiter( + LockWaitToken(Some(3)), + dummy_waiter(2.into(), lock.ts, lock.hash), + ); waiter_count.fetch_add(2, Ordering::SeqCst); - wait_table.remove_oldest_waiter(lock).unwrap(); + wait_table.take_waiter(LockWaitToken(Some(3))).unwrap(); assert_eq!(waiter_count.load(Ordering::SeqCst), 1); - wait_table.remove_oldest_waiter(lock).unwrap(); + wait_table.take_waiter(LockWaitToken(Some(2))).unwrap(); assert_eq!(waiter_count.load(Ordering::SeqCst), 0); - wait_table.remove(lock); // Removing a non-existed waiter shouldn't decrease waiter count. 
- assert!(wait_table.remove_oldest_waiter(lock).is_none()); + assert!( + wait_table + .take_waiter_by_lock_digest(lock, 1.into()) + .is_none() + ); assert_eq!(waiter_count.load(Ordering::SeqCst), 0); } @@ -1090,7 +1032,10 @@ pub mod tests { for i in 1..5 { for j in 0..i { - wait_table.add_waiter(dummy_waiter((i * 10 + j).into(), i.into(), j)); + wait_table.add_waiter( + LockWaitToken(Some(i * 10 + j)), + dummy_waiter((i * 10 + j).into(), i.into(), j), + ); } } @@ -1135,15 +1080,18 @@ pub mod tests { // Default timeout let (waiter, lock_info, f) = new_test_waiter(10.into(), 20.into(), 20); scheduler.wait_for( + LockWaitToken(Some(1)), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, WaitTimeout::Millis(1000), + waiter.cancel_callback, DiagnosticContext::default(), ); assert_elapsed( - || expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info), + || expect_key_is_locked(block_on(f).unwrap(), lock_info), 900, 1200, ); @@ -1151,15 +1099,18 @@ pub mod tests { // Custom timeout let (waiter, lock_info, f) = new_test_waiter(20.into(), 30.into(), 30); scheduler.wait_for( + LockWaitToken(Some(2)), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, WaitTimeout::Millis(100), + waiter.cancel_callback, DiagnosticContext::default(), ); assert_elapsed( - || expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info), + || expect_key_is_locked(block_on(f).unwrap(), lock_info), 50, 300, ); @@ -1167,15 +1118,18 @@ pub mod tests { // Timeout can't exceed wait_for_lock_timeout let (waiter, lock_info, f) = new_test_waiter(30.into(), 40.into(), 40); scheduler.wait_for( + LockWaitToken(Some(3)), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, WaitTimeout::Millis(3000), + waiter.cancel_callback, DiagnosticContext::default(), ); assert_elapsed( - || 
expect_key_is_locked(block_on(f).unwrap().unwrap(), lock_info), + || expect_key_is_locked(block_on(f).unwrap(), lock_info), 900, 1200, ); @@ -1183,160 +1137,29 @@ pub mod tests { worker.stop().unwrap(); } - #[test] - fn test_waiter_manager_wake_up() { - let (wait_for_lock_timeout, wake_up_delay_duration) = (1000, 100); - let (mut worker, scheduler) = - start_waiter_manager(wait_for_lock_timeout, wake_up_delay_duration); - - // Waiters waiting for different locks should be waked up immediately. - let lock_ts = 10.into(); - let lock_hashes = vec![10, 11, 12]; - let waiters_ts = vec![20.into(), 30.into(), 40.into()]; - let mut waiters_info = vec![]; - for (&lock_hash, &waiter_ts) in lock_hashes.iter().zip(waiters_ts.iter()) { - let (waiter, lock_info, f) = new_test_waiter(waiter_ts, lock_ts, lock_hash); - scheduler.wait_for( - waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, - WaitTimeout::Millis(wait_for_lock_timeout), - DiagnosticContext::default(), - ); - waiters_info.push((waiter_ts, lock_info, f)); - } - let commit_ts = 15.into(); - scheduler.wake_up(lock_ts, lock_hashes, commit_ts); - for (waiter_ts, lock_info, f) in waiters_info { - assert_elapsed( - || expect_write_conflict(block_on(f).unwrap(), waiter_ts, lock_info, commit_ts), - 0, - 200, - ); - } - - // Multiple waiters are waiting for one lock. - let mut lock = Lock { - ts: 10.into(), - hash: 10, - }; - let mut waiters_ts: Vec = (20..25).map(TimeStamp::from).collect(); - // Waiters are added in arbitrary order. 
- waiters_ts.shuffle(&mut rand::thread_rng()); - let mut waiters_info = vec![]; - for waiter_ts in waiters_ts { - let (waiter, lock_info, f) = new_test_waiter(waiter_ts, lock.ts, lock.hash); - scheduler.wait_for( - waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, - WaitTimeout::Millis(wait_for_lock_timeout), - DiagnosticContext::default(), - ); - waiters_info.push((waiter_ts, lock_info, f)); - } - waiters_info.sort_by_key(|(ts, ..)| *ts); - let mut commit_ts = 30.into(); - // Each waiter should be waked up immediately in order. - for (waiter_ts, mut lock_info, f) in waiters_info.drain(..waiters_info.len() - 1) { - scheduler.wake_up(lock.ts, vec![lock.hash], commit_ts); - lock_info.set_lock_version(lock.ts.into_inner()); - assert_elapsed( - || expect_write_conflict(block_on(f).unwrap(), waiter_ts, lock_info, commit_ts), - 0, - 200, - ); - // Now the lock is held by the waked up transaction. - lock.ts = waiter_ts; - commit_ts.incr(); - } - // Last waiter isn't waked up by other transactions. It will be waked up after - // wake_up_delay_duration. - let (waiter_ts, mut lock_info, f) = waiters_info.pop().unwrap(); - // It conflicts with the last transaction. - lock_info.set_lock_version(lock.ts.into_inner() - 1); - assert_elapsed( - || { - expect_write_conflict( - block_on(f).unwrap(), - waiter_ts, - lock_info, - *commit_ts.decr(), - ) - }, - wake_up_delay_duration - 50, - wake_up_delay_duration + 200, - ); - - // The max lifetime of waiter is its timeout. - let lock = Lock { - ts: 10.into(), - hash: 10, - }; - let (waiter1, lock_info1, f1) = new_test_waiter(20.into(), lock.ts, lock.hash); - scheduler.wait_for( - waiter1.start_ts, - waiter1.cb, - waiter1.pr, - waiter1.lock, - WaitTimeout::Millis(wait_for_lock_timeout), - DiagnosticContext::default(), - ); - let (waiter2, lock_info2, f2) = new_test_waiter(30.into(), lock.ts, lock.hash); - // Waiter2's timeout is 50ms which is less than wake_up_delay_duration. 
- scheduler.wait_for( - waiter2.start_ts, - waiter2.cb, - waiter2.pr, - waiter2.lock, - WaitTimeout::Millis(50), - DiagnosticContext::default(), - ); - let commit_ts = 15.into(); - let (tx, rx) = mpsc::sync_channel(1); - std::thread::spawn(move || { - // Waiters2's lifetime can't exceed it timeout. - assert_elapsed( - || expect_write_conflict(block_on(f2).unwrap(), 30.into(), lock_info2, 15.into()), - 30, - 100, - ); - tx.send(()).unwrap(); - }); - // It will increase waiter2's timeout to wake_up_delay_duration. - scheduler.wake_up(lock.ts, vec![lock.hash], commit_ts); - assert_elapsed( - || expect_write_conflict(block_on(f1).unwrap(), 20.into(), lock_info1, commit_ts), - 0, - 200, - ); - rx.recv().unwrap(); - - worker.stop().unwrap(); - } - #[test] fn test_waiter_manager_deadlock() { let (mut worker, scheduler) = start_waiter_manager(1000, 100); let (waiter_ts, lock) = ( 10.into(), - Lock { + LockDigest { ts: 20.into(), hash: 20, }, ); let (waiter, lock_info, f) = new_test_waiter(waiter_ts, lock.ts, lock.hash); scheduler.wait_for( + LockWaitToken(Some(1)), + 1, + RegionEpoch::default(), + 1, waiter.start_ts, - waiter.cb, - waiter.pr, - waiter.lock, + waiter.wait_info, WaitTimeout::Millis(1000), + waiter.cancel_callback, DiagnosticContext::default(), ); - scheduler.deadlock(waiter_ts, lock, 30, vec![]); + scheduler.deadlock(waiter_ts, b"foo".to_vec(), lock, 30, vec![]); assert_elapsed( || expect_deadlock(block_on(f).unwrap(), waiter_ts, lock_info, 30, &[]), 0, @@ -1344,67 +1167,4 @@ pub mod tests { ); worker.stop().unwrap(); } - - #[test] - fn test_waiter_manager_with_duplicated_waiters() { - let (mut worker, scheduler) = start_waiter_manager(1000, 100); - let (waiter_ts, lock) = ( - 10.into(), - Lock { - ts: 20.into(), - hash: 20, - }, - ); - let (waiter1, lock_info1, f1) = new_test_waiter(waiter_ts, lock.ts, lock.hash); - scheduler.wait_for( - waiter1.start_ts, - waiter1.cb, - waiter1.pr, - waiter1.lock, - WaitTimeout::Millis(1000), - 
DiagnosticContext::default(), - ); - let (waiter2, lock_info2, f2) = new_test_waiter(waiter_ts, lock.ts, lock.hash); - scheduler.wait_for( - waiter2.start_ts, - waiter2.cb, - waiter2.pr, - waiter2.lock, - WaitTimeout::Millis(1000), - DiagnosticContext::default(), - ); - // Should notify duplicated waiter immediately. - assert_elapsed( - || expect_key_is_locked(block_on(f1).unwrap().unwrap(), lock_info1), - 0, - 200, - ); - // The new waiter will be wake up after timeout. - assert_elapsed( - || expect_key_is_locked(block_on(f2).unwrap().unwrap(), lock_info2), - 900, - 1200, - ); - - worker.stop().unwrap(); - } - - #[bench] - fn bench_wake_up_small_table_against_big_hashes(b: &mut test::Bencher) { - let detect_worker = FutureWorker::new("dummy-deadlock"); - let detector_scheduler = DetectorScheduler::new(detect_worker.scheduler()); - let mut waiter_mgr = WaiterManager::new( - Arc::new(AtomicUsize::new(0)), - detector_scheduler, - &Config::default(), - ); - waiter_mgr - .wait_table - .borrow_mut() - .add_waiter(dummy_waiter(10.into(), 20.into(), 10000)); - let hashes: Vec = (0..1000).collect(); - b.iter(|| { - waiter_mgr.handle_wake_up(20.into(), hashes.clone(), 30.into()); - }); - } } diff --git a/src/server/server.rs b/src/server/server.rs index 23c52793c5f..992b5cf6fa0 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -441,7 +441,7 @@ mod tests { config::CoprReadPoolConfig, coprocessor::{self, readpool_impl}, server::TestRaftStoreRouter, - storage::{lock_manager::DummyLockManager, TestStorageBuilderApiV1}, + storage::{lock_manager::MockLockManager, TestStorageBuilderApiV1}, }; #[derive(Clone)] @@ -490,7 +490,7 @@ mod tests { ..Default::default() }; - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 46ed24fde70..24a61876f44 100644 --- 
a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -10,9 +10,6 @@ //! timeout. [`LockWaitContext`] is therefore used to share the necessary state //! of a single `AcquirePessimisticLock` request, and ensuring the internal //! callback for returning response through RPC is called at most only once. -//! -//! Note: The corresponding implementation in `WaiterManager` is not yet -//! implemented, and this mod is currently not used yet. use std::{convert::TryInto, result::Result, sync::Arc}; @@ -138,10 +135,9 @@ impl LockWaitContext { return; } } else { - // TODO: Uncomment this after the corresponding change of - // `LockManager` is done. self.lock_wait_queues. - // get_lock_mgr() .remove_lock_wait(ctx_inner. - // lock_wait_token); + self.lock_wait_queues + .get_lock_mgr() + .remove_lock_wait(self.shared_states.lock_wait_token); } // When this is executed, the waiter is either woken up from the queue or @@ -172,7 +168,7 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::{lock_waiting_queue::LockWaitEntry, DummyLockManager}, + lock_manager::{lock_waiting_queue::LockWaitEntry, MockLockManager}, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, types::PessimisticLockParameters, @@ -197,7 +193,7 @@ mod tests { Receiver>>, ) { let (cb, rx) = create_storage_cb(); - let token = LockWaitToken(Some(1)); + let token = lock_wait_queues.get_lock_mgr().allocate_token(); let ctx = LockWaitContext::new(key.clone(), lock_wait_queues.clone(), token, cb, false); (token, ctx, rx) } @@ -226,7 +222,7 @@ mod tests { // TODO: Use `ProxyLockMgr` to check the correctness of the `remove_lock_wait` // invocation. - let lock_wait_queues = LockWaitQueues::new(DummyLockManager {}); + let lock_wait_queues = LockWaitQueues::new(MockLockManager::new()); let (_, ctx, rx) = create_test_lock_wait_ctx(&key, &lock_wait_queues); // Nothing happens currently. 
diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 3651ce21c1c..16b3787bd7e 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -212,15 +212,15 @@ impl KeyLockWaitState { pub type DelayedNotifyAllFuture = Pin>> + Send>>; -pub struct LockWaitQueueInner { +pub struct LockWaitQueueInner { queue_map: dashmap::DashMap, id_allocated: AtomicU64, + lock_mgr: L, } #[derive(Clone)] pub struct LockWaitQueues { - inner: Arc, - lock_mgr: L, + inner: Arc>, } impl LockWaitQueues { @@ -229,8 +229,8 @@ impl LockWaitQueues { inner: Arc::new(LockWaitQueueInner { queue_map: dashmap::DashMap::new(), id_allocated: AtomicU64::new(1), + lock_mgr, }), - lock_mgr, } } @@ -547,7 +547,7 @@ impl LockWaitQueues { #[allow(dead_code)] pub(super) fn get_lock_mgr(&self) -> &L { - &self.lock_mgr + &self.inner.lock_mgr } #[cfg(test)] @@ -582,7 +582,7 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::{lock_wait_context::LockWaitContext, DummyLockManager, WaitTimeout}, + lock_manager::{lock_wait_context::LockWaitContext, MockLockManager, WaitTimeout}, txn::ErrorInner as TxnErrorInner, ErrorInner as StorageErrorInner, StorageCallback, }; @@ -639,7 +639,7 @@ mod tests { lock_info_pb: kvrpcpb::LockInfo, ) -> (Box, TestLockWaitEntryHandle) { let start_ts = start_ts.into(); - let token = LockWaitToken(Some(self.allocate_internal_id())); + let token = self.inner.lock_mgr.allocate_token(); let dummy_request_cb = StorageCallback::PessimisticLock(Box::new(|_| ())); let dummy_ctx = LockWaitContext::new( Key::from_raw(key), @@ -830,7 +830,7 @@ mod tests { #[test] fn test_simple_push_pop() { - let queues = LockWaitQueues::new(DummyLockManager {}); + let queues = LockWaitQueues::new(MockLockManager::new()); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k2", 11, 5, false); @@ -852,7 +852,7 @@ mod tests { #[test] fn test_popping_priority() { - let 
queues = LockWaitQueues::new(DummyLockManager {}); + let queues = LockWaitQueues::new(MockLockManager::new()); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k1", 20, 5, false); @@ -874,7 +874,7 @@ mod tests { #[test] fn test_removing_by_token() { - let queues = LockWaitQueues::new(DummyLockManager {}); + let queues = LockWaitQueues::new(MockLockManager::new()); queues.mock_lock_wait(b"k1", 10, 5, false); let token11 = queues.mock_lock_wait(b"k1", 11, 5, false).token; @@ -915,7 +915,7 @@ mod tests { #[test] fn test_dropping_cancelled_entries() { - let queues = LockWaitQueues::new(DummyLockManager {}); + let queues = LockWaitQueues::new(MockLockManager::new()); let h10 = queues.mock_lock_wait(b"k1", 10, 5, false); let h11 = queues.mock_lock_wait(b"k1", 11, 5, false); @@ -941,7 +941,7 @@ mod tests { #[tokio::test] async fn test_delayed_notify_all() { - let queues = LockWaitQueues::new(DummyLockManager {}); + let queues = LockWaitQueues::new(MockLockManager::new()); queues.mock_lock_wait(b"k1", 8, 5, false); diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 235a31c3710..3ba9c7f7905 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -1,20 +1,34 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{ + fmt::{Debug, Formatter}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, +}; +use collections::HashMap; +use kvproto::{kvrpcpb::LockInfo, metapb::RegionEpoch}; +use parking_lot::Mutex; use tracker::TrackerToken; -use txn_types::TimeStamp; +use txn_types::{Key, TimeStamp}; use crate::{ server::lock_manager::{waiter_manager, waiter_manager::Callback}, - storage::{txn::ProcessResult, types::StorageCallback}, + storage::{ + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, + txn::Error as TxnError, + Error as StorageError, + }, }; pub mod lock_wait_context; pub mod lock_waiting_queue; #[derive(Clone, Copy, PartialEq, Debug, Default)] -pub struct Lock { +pub struct LockDigest { pub ts: TimeStamp, pub hash: u64, } @@ -32,6 +46,16 @@ pub struct DiagnosticContext { pub tracker: TrackerToken, } +impl Debug for DiagnosticContext { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DiagnosticContext") + .field("key", &log_wrappers::Value::key(&self.key)) + // TODO: Perhaps the resource group tag don't need to be a secret + .field("resource_group_tag", &log_wrappers::Value::key(&self.resource_group_tag)) + .finish() + } +} + /// Time to wait for lock released when encountering locks. #[derive(Clone, Copy, PartialEq, Debug)] pub enum WaitTimeout { @@ -67,9 +91,14 @@ impl From for WaitTimeout { } } +#[derive(Debug, Clone)] +pub struct KeyLockWaitInfo { + pub key: Key, + pub lock_digest: LockDigest, + pub lock_info: LockInfo, +} + /// Uniquely identifies a lock-waiting request in a `LockManager`. -/// -/// Not used yet, but necessary for implementing `LockWaitQueues`. 
#[derive(Clone, Copy, Hash, PartialEq, Eq, Debug)] pub struct LockWaitToken(pub Option); @@ -79,10 +108,27 @@ impl LockWaitToken { } } +#[derive(Debug)] +pub struct UpdateWaitForEvent { + pub token: LockWaitToken, + pub start_ts: TimeStamp, + pub is_first_lock: bool, + pub wait_info: KeyLockWaitInfo, + pub diag_ctx: DiagnosticContext, +} + /// `LockManager` manages transactions waiting for locks held by other /// transactions. It has responsibility to handle deadlocks between /// transactions. -pub trait LockManager: Clone + Send + 'static { +pub trait LockManager: Clone + Send + Sync + 'static { + /// Allocates a token for identifying a specific lock-waiting relationship. + /// Use this to allocate a token before invoking `wait_for`. + /// + /// Since some information required by `wait_for` need to be initialized by + /// the token, allocating token is therefore separated to a single + /// function instead of internally allocated in `wait_for`. + fn allocate_token(&self) -> LockWaitToken; + /// Transaction with `start_ts` waits for `lock` released. /// /// If the lock is released or waiting times out or deadlock occurs, the @@ -93,24 +139,22 @@ pub trait LockManager: Clone + Send + 'static { /// in deadlock. fn wait_for( &self, + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, + cancel_callback: Box, diag_ctx: DiagnosticContext, ); - /// The locks with `lock_ts` and `hashes` are released, tries to wake up - /// transactions. - fn wake_up( - &self, - lock_ts: TimeStamp, - hashes: Vec, - commit_ts: TimeStamp, - is_pessimistic_txn: bool, - ); + fn update_wait_for(&self, updated_items: Vec); + + /// Remove a waiter specified by token. + fn remove_lock_wait(&self, token: LockWaitToken); /// Returns true if there are waiters in the `LockManager`. 
/// @@ -124,31 +168,66 @@ pub trait LockManager: Clone + Send + 'static { // For test #[derive(Clone)] -pub struct DummyLockManager; +pub struct MockLockManager { + allocated_token: Arc, + waiters: + Arc)>>>, +} + +impl MockLockManager { + pub fn new() -> Self { + Self { + allocated_token: Arc::new(AtomicU64::new(1)), + waiters: Arc::new(Mutex::new(HashMap::default())), + } + } +} + +// Make the linter happy. +impl Default for MockLockManager { + fn default() -> Self { + Self::new() + } +} + +impl LockManager for MockLockManager { + fn allocate_token(&self) -> LockWaitToken { + LockWaitToken(Some(self.allocated_token.fetch_add(1, Ordering::Relaxed))) + } -impl LockManager for DummyLockManager { fn wait_for( &self, + token: LockWaitToken, + _region_id: u64, + _region_epoch: RegionEpoch, + _term: u64, _start_ts: TimeStamp, - _cb: StorageCallback, - _pr: ProcessResult, - _lock: Lock, + wait_info: KeyLockWaitInfo, _is_first_lock: bool, - _wait_timeout: Option, + _timeout: Option, + cancel_callback: Box, _diag_ctx: DiagnosticContext, ) { + self.waiters + .lock() + .insert(token, (wait_info, cancel_callback)); } - fn wake_up( - &self, - _lock_ts: TimeStamp, - _hashes: Vec, - _commit_ts: TimeStamp, - _is_pessimistic_txn: bool, - ) { - } + fn update_wait_for(&self, _updated_items: Vec) {} + + fn remove_lock_wait(&self, _token: LockWaitToken) {} fn dump_wait_for_entries(&self, cb: Callback) { cb(vec![]) } } + +impl MockLockManager { + pub fn simulate_timeout_all(&self) { + let mut map = self.waiters.lock(); + for (_, (wait_info, cancel_callback)) in map.drain() { + let error = MvccError::from(MvccErrorInner::KeyIsLocked(wait_info.lock_info)); + cancel_callback(StorageError::from(TxnError::from(error))); + } + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 84d52b6990a..33d1c4ddf97 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -65,7 +65,7 @@ use std::{ iter, marker::PhantomData, sync::{ - atomic::{self, AtomicBool}, + atomic::{self, AtomicBool, 
AtomicU64}, Arc, }, }; @@ -116,7 +116,7 @@ use crate::{ storage::{ config::Config, kv::{with_tls_engine, Modify, WriteData}, - lock_manager::{DummyLockManager, LockManager}, + lock_manager::{LockManager, MockLockManager}, metrics::{CommandKind, *}, mvcc::{MvccReader, PointGetterBuilder}, txn::{ @@ -2828,6 +2828,7 @@ pub async fn get_causal_ts( pub struct DynamicConfigs { pub pipelined_pessimistic_lock: Arc, pub in_memory_pessimistic_lock: Arc, + pub wake_up_delay_duration_ms: Arc, } fn get_priority_tag(priority: CommandPri) -> CommandPriority { @@ -2923,6 +2924,7 @@ pub struct TestStorageBuilder { config: Config, pipelined_pessimistic_lock: Arc, in_memory_pessimistic_lock: Arc, + wake_up_delay_duration_ms: Arc, lock_mgr: L, resource_tag_factory: ResourceTagFactory, _phantom: PhantomData, @@ -2932,9 +2934,9 @@ pub struct TestStorageBuilder { /// To be convenience for test cases unrelated to RawKV. pub type TestStorageBuilderApiV1 = TestStorageBuilder; -impl TestStorageBuilder { +impl TestStorageBuilder { /// Build `Storage`. - pub fn new(lock_mgr: DummyLockManager) -> Self { + pub fn new(lock_mgr: MockLockManager) -> Self { let engine = TestEngineBuilder::new() .api_version(F::TAG) .build() @@ -3055,6 +3057,8 @@ impl TestStorageBuilder { config, pipelined_pessimistic_lock: Arc::new(AtomicBool::new(false)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + // Make it very large to avoid tests being affected by the delayed-waking-up behavior. 
+ wake_up_delay_duration_ms: Arc::new(AtomicU64::new(100000)), lock_mgr, resource_tag_factory: ResourceTagFactory::new_for_test(), _phantom: PhantomData, @@ -3119,6 +3123,7 @@ impl TestStorageBuilder { DynamicConfigs { pipelined_pessimistic_lock: self.pipelined_pessimistic_lock, in_memory_pessimistic_lock: self.in_memory_pessimistic_lock, + wake_up_delay_duration_ms: self.wake_up_delay_duration_ms, }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, @@ -3148,6 +3153,7 @@ impl TestStorageBuilder { DynamicConfigs { pipelined_pessimistic_lock: self.pipelined_pessimistic_lock, in_memory_pessimistic_lock: self.in_memory_pessimistic_lock, + wake_up_delay_duration_ms: self.wake_up_delay_duration_ms, }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), DummyReporter, @@ -3179,7 +3185,7 @@ pub mod test_util { }; use super::*; - use crate::storage::txn::commands; + use crate::storage::{lock_manager::WaitTimeout, txn::commands}; pub fn expect_none(x: Option) { assert_eq!(x, None); @@ -3283,7 +3289,7 @@ pub mod test_util { 3000, false, for_update_ts, - None, + Some(WaitTimeout::Default), return_values, for_update_ts.next(), OldValues::default(), @@ -3405,9 +3411,11 @@ mod tests { use error_code::ErrorCodeExt; use errors::extract_key_error; use futures::executor::block_on; - use kvproto::kvrpcpb::{ - Assertion, AssertionLevel, CommandPri, Op, PrewriteRequestPessimisticAction::*, + use kvproto::{ + kvrpcpb::{Assertion, AssertionLevel, CommandPri, Op, PrewriteRequestPessimisticAction::*}, + metapb::RegionEpoch, }; + use parking_lot::Mutex; use tikv_util::config::ReadableSize; use tracker::INVALID_TRACKER_TOKEN; use txn_types::{Mutation, PessimisticLock, WriteType, SHORT_VALUE_MAX_LEN}; @@ -3426,8 +3434,11 @@ mod tests { kv::{ Error as KvError, ErrorInner as EngineErrorInner, ExpectedWrite, MockEngineBuilder, }, - lock_manager::{DiagnosticContext, Lock, WaitTimeout}, - mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, 
LockType}, + lock_manager::{ + DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, + WaitTimeout, + }, + mvcc::LockType, txn::{ commands, commands::{AcquirePessimisticLock, Prewrite}, @@ -3440,7 +3451,7 @@ mod tests { #[test] fn test_prewrite_blocks_read() { use kvproto::kvrpcpb::ExtraOp; - let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); @@ -3456,7 +3467,7 @@ mod tests { .process_write( snapshot, commands::WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: storage.concurrency_manager.clone(), extra_op: ExtraOp::Noop, statistics: &mut Statistics::default(), @@ -3478,7 +3489,7 @@ mod tests { #[test] fn test_get_put() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -3539,9 +3550,10 @@ mod tests { .cfs([CF_DEFAULT, "foo"]) .build() .unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) - .build() - .unwrap(); + let storage = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .build() + .unwrap(); let (tx, rx) = channel(); storage .sched_txn_command( @@ -3628,7 +3640,7 @@ mod tests { #[test] fn test_scan() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -3966,9 +3978,10 @@ mod tests { ) } .unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) - .build() - .unwrap(); + let storage = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .build() + .unwrap(); let (tx, rx) = channel(); storage .sched_txn_command( @@ -4195,7 +4208,7 @@ mod tests { #[test] 
fn test_batch_get() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4270,7 +4283,7 @@ mod tests { #[test] fn test_batch_get_command() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4358,7 +4371,7 @@ mod tests { #[test] fn test_txn() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4444,7 +4457,7 @@ mod tests { scheduler_pending_write_threshold: ReadableSize(1), ..Default::default() }; - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .config(config) .build() .unwrap(); @@ -4487,7 +4500,7 @@ mod tests { #[test] fn test_cleanup() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let cm = storage.concurrency_manager.clone(); @@ -4525,7 +4538,7 @@ mod tests { #[test] fn test_cleanup_check_ttl() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4583,7 +4596,7 @@ mod tests { #[test] fn test_flashback_to_version() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let mut ts = TimeStamp::zero(); @@ -4714,7 +4727,7 @@ mod tests { #[test] fn test_flashback_to_version_lock() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ 
-4818,7 +4831,7 @@ mod tests { #[test] fn test_flashback_to_version_in_multi_batch() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4915,7 +4928,7 @@ mod tests { #[test] fn test_high_priority_get_put() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -4972,7 +4985,7 @@ mod tests { scheduler_worker_pool_size: 1, ..Default::default() }; - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .config(config) .build() .unwrap(); @@ -5026,7 +5039,7 @@ mod tests { #[test] fn test_delete_range() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5132,7 +5145,7 @@ mod tests { } fn test_raw_get_put_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5186,7 +5199,7 @@ mod tests { } fn test_raw_checksum_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5253,7 +5266,7 @@ mod tests { ]; let k = b"r\0k".to_vec(); - let storage = TestStorageBuilder::<_, _, ApiV2>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, ApiV2>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5323,7 +5336,7 @@ mod tests { } fn test_raw_delete_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, 
F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5417,7 +5430,7 @@ mod tests { } fn test_raw_delete_range_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5530,7 +5543,7 @@ mod tests { fn run_raw_batch_put( for_cas: bool, - storage: &Storage, + storage: &Storage, ctx: Context, kvpairs: Vec, ttls: Vec, @@ -5544,7 +5557,7 @@ mod tests { } fn test_raw_batch_put_impl(for_cas: bool) { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5634,7 +5647,7 @@ mod tests { } fn test_raw_batch_get_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5681,7 +5694,7 @@ mod tests { } fn test_raw_batch_get_command_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5745,7 +5758,7 @@ mod tests { fn run_raw_batch_delete( for_cas: bool, - storage: &Storage, + storage: &Storage, ctx: Context, keys: Vec>, cb: Callback<()>, @@ -5758,7 +5771,7 @@ mod tests { } fn test_raw_batch_delete_impl(for_cas: bool) { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -5872,7 +5885,7 @@ mod tests { (None, None) }; - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -6174,7 +6187,7 @@ mod tests { ]); // 
TODO: refactor to use `Api` parameter. assert_eq!( - >::check_key_ranges(&ranges, false,), + >::check_key_ranges(&ranges, false,), true ); @@ -6184,7 +6197,7 @@ mod tests { (b"c".to_vec(), vec![]), ]); assert_eq!( - >::check_key_ranges(&ranges, false,), + >::check_key_ranges(&ranges, false,), true ); @@ -6194,7 +6207,7 @@ mod tests { (b"c3".to_vec(), b"c".to_vec()), ]); assert_eq!( - >::check_key_ranges(&ranges, false,), + >::check_key_ranges(&ranges, false,), false ); @@ -6206,7 +6219,7 @@ mod tests { (b"a".to_vec(), vec![]), ]); assert_eq!( - >::check_key_ranges(&ranges, false,), + >::check_key_ranges(&ranges, false,), false ); @@ -6216,7 +6229,7 @@ mod tests { (b"c3".to_vec(), b"c".to_vec()), ]); assert_eq!( - >::check_key_ranges(&ranges, true,), + >::check_key_ranges(&ranges, true,), true ); @@ -6226,7 +6239,7 @@ mod tests { (b"a3".to_vec(), vec![]), ]); assert_eq!( - >::check_key_ranges(&ranges, true,), + >::check_key_ranges(&ranges, true,), true ); @@ -6236,7 +6249,7 @@ mod tests { (b"c".to_vec(), b"c3".to_vec()), ]); assert_eq!( - >::check_key_ranges(&ranges, true,), + >::check_key_ranges(&ranges, true,), false ); @@ -6246,7 +6259,7 @@ mod tests { (b"c3".to_vec(), vec![]), ]); assert_eq!( - >::check_key_ranges(&ranges, true,), + >::check_key_ranges(&ranges, true,), false ); } @@ -6271,7 +6284,7 @@ mod tests { .collect() }; - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -6512,7 +6525,7 @@ mod tests { } fn test_raw_get_key_ttl_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -6571,7 +6584,7 @@ mod tests { } fn test_raw_compare_and_swap_impl() { - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, 
F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -6757,7 +6770,7 @@ mod tests { #[test] fn test_scan_lock() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -7058,7 +7071,7 @@ mod tests { fn test_resolve_lock_impl() { use crate::storage::txn::RESOLVE_LOCK_BATCH_SIZE; - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -7169,7 +7182,7 @@ mod tests { #[test] fn test_resolve_lock_lite() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -7277,7 +7290,7 @@ mod tests { #[test] fn test_txn_heart_beat() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -7364,7 +7377,7 @@ mod tests { #[test] fn test_check_txn_status() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let cm = storage.concurrency_manager.clone(); @@ -7571,7 +7584,7 @@ mod tests { #[test] fn test_check_secondary_locks() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let cm = storage.concurrency_manager.clone(); @@ -7689,7 +7702,8 @@ mod tests { } fn test_pessimistic_lock_impl(pipelined_pessimistic_lock: bool) { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let lock_mgr = MockLockManager::new(); + let storage = TestStorageBuilderApiV1::new(lock_mgr.clone()) .pipelined_pessimistic_lock(pipelined_pessimistic_lock) .build() 
.unwrap(); @@ -7781,8 +7795,11 @@ mod tests { }), ) .unwrap(); - // The DummyLockManager consumes the Msg::WaitForLock. + // The request enters lock waiting state. rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + lock_mgr.simulate_timeout_all(); + // The lock-waiting request is cancelled. + rx.recv().unwrap(); } // Needn't update max_ts when failing to read value @@ -7904,20 +7921,19 @@ mod tests { #[allow(clippy::large_enum_variant)] pub enum Msg { WaitFor { + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, + cancel_callback: Box, diag_ctx: DiagnosticContext, }, - - WakeUp { - lock_ts: TimeStamp, - hashes: Vec, - commit_ts: TimeStamp, - is_pessimistic_txn: bool, + RemoveLockWait { + token: LockWaitToken, }, } @@ -7925,62 +7941,58 @@ mod tests { // It's used to check whether we send right messages to lock manager. 
#[derive(Clone)] pub struct ProxyLockMgr { - tx: Sender, + tx: Arc>>, has_waiter: Arc, } impl ProxyLockMgr { pub fn new(tx: Sender) -> Self { Self { - tx, + tx: Arc::new(Mutex::new(tx)), has_waiter: Arc::new(AtomicBool::new(false)), } } - - pub fn set_has_waiter(&mut self, has_waiter: bool) { - self.has_waiter.store(has_waiter, Ordering::Relaxed); - } } impl LockManager for ProxyLockMgr { + fn allocate_token(&self) -> LockWaitToken { + LockWaitToken(Some(1)) + } + fn wait_for( &self, + token: LockWaitToken, + region_id: u64, + region_epoch: RegionEpoch, + term: u64, start_ts: TimeStamp, - cb: StorageCallback, - pr: ProcessResult, - lock: Lock, + wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, + cancel_callback: Box, diag_ctx: DiagnosticContext, ) { self.tx + .lock() .send(Msg::WaitFor { + token, + region_id, + region_epoch, + term, start_ts, - cb, - pr, - lock, + wait_info, is_first_lock, timeout, + cancel_callback, diag_ctx, }) .unwrap(); } - fn wake_up( - &self, - lock_ts: TimeStamp, - hashes: Vec, - commit_ts: TimeStamp, - is_pessimistic_txn: bool, - ) { - self.tx - .send(Msg::WakeUp { - lock_ts, - hashes, - commit_ts, - is_pessimistic_txn, - }) - .unwrap(); + fn update_wait_for(&self, _updated_items: Vec) {} + + fn remove_lock_wait(&self, token: LockWaitToken) { + self.tx.lock().send(Msg::RemoveLockWait { token }).unwrap(); } fn has_waiter(&self) -> bool { @@ -8049,77 +8061,114 @@ mod tests { match msg { Msg::WaitFor { start_ts, - pr, - lock, + wait_info, is_first_lock, timeout, .. 
} => { assert_eq!(start_ts, TimeStamp::new(20)); assert_eq!( - lock, - Lock { + wait_info.lock_digest, + LockDigest { ts: 10.into(), hash: Key::from_raw(&k).gen_hash(), } ); assert_eq!(is_first_lock, true); assert_eq!(timeout, Some(WaitTimeout::Millis(100))); - match pr { - ProcessResult::PessimisticLockRes { res } => match res { - Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( - MvccError(box MvccErrorInner::KeyIsLocked(info)), - ))))) => { - assert_eq!(info.get_key(), k.as_slice()); - assert_eq!(info.get_primary_lock(), k.as_slice()); - assert_eq!(info.get_lock_version(), 10); - } - _ => panic!("unexpected error"), - }, - _ => panic!("unexpected process result"), - }; } _ => panic!("unexpected msg"), } } - // Test whether `Storage` sends right wake-up msgs to `LockManager` + // Test whether `Storage` correctly wakes up lock-waiting requests #[test] - fn validate_wake_up_msg() { - fn assert_wake_up_msg_eq( - msg: Msg, - expected_lock_ts: TimeStamp, - expected_hashes: Vec, - expected_commit_ts: TimeStamp, - expected_is_pessimistic_txn: bool, - ) { - match msg { - Msg::WakeUp { - lock_ts, - hashes, - commit_ts, - is_pessimistic_txn, - } => { - assert_eq!(lock_ts, expected_lock_ts); - assert_eq!(hashes, expected_hashes); - assert_eq!(commit_ts, expected_commit_ts); - assert_eq!(is_pessimistic_txn, expected_is_pessimistic_txn); + fn test_wake_up() { + struct BlockedLockRequestHandle { + remaining: usize, + rx: std::sync::mpsc::Receiver, + } + + impl BlockedLockRequestHandle { + fn assert_blocked(&mut self) { + while self.remaining > 0 { + match self.rx.recv_timeout(Duration::from_millis(50)) { + Ok(_) => self.remaining -= 1, + Err(std::sync::mpsc::RecvTimeoutError::Timeout) => return, + Err(e) => panic!("unexpected error: {:?}", e), + } + } + panic!("pessimistic lock requests expected to be blocked finished unexpectedly") + } + + fn assert_woken_up(mut self) { + while self.remaining > 0 { + match self.rx.recv_timeout(Duration::from_millis(200)) { + 
Ok(_) => self.remaining -= 1, + Err(e) => panic!("unexpected error: {:?}", e), + } } - _ => panic!("unexpected msg"), } } - let (msg_tx, msg_rx) = channel(); - let mut lock_mgr = ProxyLockMgr::new(msg_tx); - lock_mgr.set_has_waiter(true); let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( TestEngineBuilder::new().build().unwrap(), - lock_mgr, + MockLockManager::new(), ) .build() .unwrap(); + let lock_blocked = |keys: &[Key], + lock_ts: u64, + expected_conflicting_start_ts: u64, + expected_conflicting_commit_ts: u64| { + let (tx, rx) = channel(); + for k in keys { + storage + .sched_txn_command( + commands::AcquirePessimisticLock::new( + vec![(k.clone(), false)], + k.to_raw().unwrap(), + lock_ts.into(), + 3000, + false, + lock_ts.into(), + Some(WaitTimeout::Millis(5000)), + false, + (lock_ts + 1).into(), + OldValues::default(), + false, + false, + Context::default(), + ), + expect_fail_callback(tx.clone(), 6, move |e| match e { + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + mvcc::Error(box mvcc::ErrorInner::WriteConflict { + conflict_start_ts, + conflict_commit_ts, + .. + }), + )))) => { + assert_eq!(conflict_start_ts, expected_conflicting_start_ts.into()); + assert_eq!( + conflict_commit_ts, + expected_conflicting_commit_ts.into() + ); + } + e => panic!("unexpected error chain: {:?}", e), + }), + ) + .unwrap(); + } + let mut h = BlockedLockRequestHandle { + remaining: keys.len(), + rx, + }; + h.assert_blocked(); + h + }; + let (tx, rx) = channel(); let prewrite_locks = |keys: &[Key], ts: TimeStamp| { storage @@ -8157,28 +8206,19 @@ mod tests { Key::from_raw(b"b"), Key::from_raw(b"c"), ]; - let key_hashes: Vec = keys.iter().map(|k| k.gen_hash()).collect(); // Commit prewrite_locks(&keys, 10.into()); - // If locks don't exsit, hashes of released locks should be empty. 
- for empty_hashes in &[false, true] { - storage - .sched_txn_command( - commands::Commit::new(keys.clone(), 10.into(), 20.into(), Context::default()), - expect_ok_callback(tx.clone(), 0), - ) - .unwrap(); - rx.recv().unwrap(); + let h = lock_blocked(&keys, 15, 10, 20); + storage + .sched_txn_command( + commands::Commit::new(keys.clone(), 10.into(), 20.into(), Context::default()), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); - let msg = msg_rx.recv().unwrap(); - let hashes = if *empty_hashes { - Vec::new() - } else { - key_hashes.clone() - }; - assert_wake_up_msg_eq(msg, 10.into(), hashes, 20.into(), false); - } + h.assert_woken_up(); // Cleanup for pessimistic in &[false, true] { @@ -8189,28 +8229,21 @@ mod tests { } else { prewrite_locks(&keys[..1], ts); } - for empty_hashes in &[false, true] { - storage - .sched_txn_command( - commands::Cleanup::new( - keys[0].clone(), - ts, - TimeStamp::max(), - Context::default(), - ), - expect_ok_callback(tx.clone(), 0), - ) - .unwrap(); - rx.recv().unwrap(); + let h = lock_blocked(&keys[..1], 35, ts.into_inner(), 0); + storage + .sched_txn_command( + commands::Cleanup::new( + keys[0].clone(), + ts, + TimeStamp::max(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); - let msg = msg_rx.recv().unwrap(); - let (hashes, pessimistic) = if *empty_hashes { - (Vec::new(), false) - } else { - (key_hashes[..1].to_vec(), *pessimistic) - }; - assert_wake_up_msg_eq(msg, ts, hashes, 0.into(), pessimistic); - } + h.assert_woken_up(); } // Rollback @@ -8222,50 +8255,36 @@ mod tests { } else { prewrite_locks(&keys, ts); } - for empty_hashes in &[false, true] { - storage - .sched_txn_command( - commands::Rollback::new(keys.clone(), ts, Context::default()), - expect_ok_callback(tx.clone(), 0), - ) - .unwrap(); - rx.recv().unwrap(); - - let msg = msg_rx.recv().unwrap(); - let (hashes, pessimistic) = if *empty_hashes { - (Vec::new(), false) - } else { - 
(key_hashes.clone(), *pessimistic) - }; - assert_wake_up_msg_eq(msg, ts, hashes, 0.into(), pessimistic); - } - } - - // PessimisticRollback - acquire_pessimistic_locks(&keys, 50.into()); - for empty_hashes in &[false, true] { + let h = lock_blocked(&keys, 45, ts.into_inner(), 0); storage .sched_txn_command( - commands::PessimisticRollback::new( - keys.clone(), - 50.into(), - 50.into(), - Context::default(), - ), + commands::Rollback::new(keys.clone(), ts, Context::default()), expect_ok_callback(tx.clone(), 0), ) .unwrap(); rx.recv().unwrap(); - let msg = msg_rx.recv().unwrap(); - let (hashes, pessimistic) = if *empty_hashes { - (Vec::new(), false) - } else { - (key_hashes.clone(), true) - }; - assert_wake_up_msg_eq(msg, 50.into(), hashes, 0.into(), pessimistic); + h.assert_woken_up(); } + // PessimisticRollback + acquire_pessimistic_locks(&keys, 50.into()); + let h = lock_blocked(&keys, 55, 50, 0); + storage + .sched_txn_command( + commands::PessimisticRollback::new( + keys.clone(), + 50.into(), + 50.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + h.assert_woken_up(); + // ResolveLockLite for commit in &[false, true] { let mut start_ts = TimeStamp::new(60); @@ -8276,28 +8295,21 @@ mod tests { TimeStamp::zero() }; prewrite_locks(&keys, start_ts); - for empty_hashes in &[false, true] { - storage - .sched_txn_command( - commands::ResolveLockLite::new( - start_ts, - commit_ts, - keys.clone(), - Context::default(), - ), - expect_ok_callback(tx.clone(), 0), - ) - .unwrap(); - rx.recv().unwrap(); + let h = lock_blocked(&keys, 65, start_ts.into_inner(), commit_ts.into_inner()); + storage + .sched_txn_command( + commands::ResolveLockLite::new( + start_ts, + commit_ts, + keys.clone(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); - let msg = msg_rx.recv().unwrap(); - let hashes = if *empty_hashes { - Vec::new() - } else { - key_hashes.clone() - }; - 
assert_wake_up_msg_eq(msg, start_ts, hashes, commit_ts, false); - } + h.assert_woken_up(); } // ResolveLock @@ -8310,10 +8322,10 @@ mod tests { Key::from_raw(b"e"), Key::from_raw(b"f"), ]; - let committed_key_hashes: Vec = committed_keys.iter().map(|k| k.gen_hash()).collect(); - // Commit start_ts=75 prewrite_locks(&committed_keys, 75.into()); txn_status.insert(TimeStamp::new(75), TimeStamp::new(76)); + let h_rolled_back = lock_blocked(&keys, 76, 70, 0); + let h_committed = lock_blocked(&committed_keys, 76, 75, 76); storage .sched_txn_command( commands::ResolveLockReadPhase::new(txn_status, None, Context::default()), @@ -8321,20 +8333,8 @@ mod tests { ) .unwrap(); rx.recv().unwrap(); - - let mut msg1 = msg_rx.recv().unwrap(); - let mut msg2 = msg_rx.recv().unwrap(); - match msg1 { - Msg::WakeUp { lock_ts, .. } => { - if lock_ts != TimeStamp::new(70) { - // Let msg1 be the msg of rolled back transaction. - std::mem::swap(&mut msg1, &mut msg2); - } - assert_wake_up_msg_eq(msg1, 70.into(), key_hashes, 0.into(), true); - assert_wake_up_msg_eq(msg2, 75.into(), committed_key_hashes, 76.into(), false); - } - _ => panic!("unexpect msg"), - } + h_rolled_back.assert_woken_up(); + h_committed.assert_woken_up(); // CheckTxnStatus let key = Key::from_raw(b"k"); @@ -8352,6 +8352,8 @@ mod tests { .unwrap(); rx.recv().unwrap(); + let mut h = lock_blocked(&[key.clone()], 105, start_ts.into_inner(), 0); + // Not expire storage .sched_txn_command( @@ -8385,14 +8387,14 @@ mod tests { ) .unwrap(); rx.recv().unwrap(); - // No msg - assert!(msg_rx.try_recv().is_err()); + // Not woken up + h.assert_blocked(); // Expired storage .sched_txn_command( commands::CheckTxnStatus::new( - key.clone(), + key, start_ts, TimeStamp::compose(110, 0), TimeStamp::compose(201, 0), @@ -8405,18 +8407,12 @@ mod tests { ) .unwrap(); rx.recv().unwrap(); - assert_wake_up_msg_eq( - msg_rx.recv().unwrap(), - start_ts, - vec![key.gen_hash()], - 0.into(), - false, - ); + h.assert_woken_up(); } #[test] fn 
test_check_memory_locks() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let cm = storage.get_concurrency_manager(); @@ -8516,7 +8512,7 @@ mod tests { #[test] fn test_read_access_locks() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); @@ -8601,7 +8597,7 @@ mod tests { #[test] fn test_async_commit_prewrite() { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let cm = storage.concurrency_manager.clone(); @@ -8697,10 +8693,12 @@ mod tests { #[test] fn test_overlapped_ts_rollback_before_prewrite() { let mut engine = TestEngineBuilder::new().build().unwrap(); - let storage = - TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) - .build() - .unwrap(); + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( + engine.clone(), + MockLockManager::new(), + ) + .build() + .unwrap(); let (k1, v1) = (b"key1", b"v1"); let (k2, v2) = (b"key2", b"v2"); @@ -8874,8 +8872,10 @@ mod tests { builder = builder.add_expected_write(expected_write) } let engine = builder.build(); - let mut builder = - TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager); + let mut builder = TestStorageBuilderApiV1::from_engine_and_lock_mgr( + engine, + MockLockManager::new(), + ); builder.config.enable_async_apply_prewrite = true; if self.pipelined_pessimistic_lock { builder @@ -9017,7 +9017,7 @@ mod tests { #[test] fn test_resolve_commit_pessimistic_locks() { - let mut storage = TestStorageBuilderApiV1::new(DummyLockManager) + let mut storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); @@ -9354,7 +9354,7 @@ mod tests { test_data.into_iter().enumerate() { // TODO: refactor to 
use `Api` parameter. - let res = StorageApiV1::::check_api_version( + let res = StorageApiV1::::check_api_version( storage_api_version, req_api_version, cmd, @@ -9410,7 +9410,7 @@ mod tests { range: &[(Option<&[u8]>, Option<&[u8]>)], err| { // TODO: refactor to use `Api` parameter. - let res = StorageApiV1::::check_api_version_ranges( + let res = StorageApiV1::::check_api_version_ranges( storage_api_version, req_api_version, cmd, @@ -9575,7 +9575,8 @@ mod tests { #[test] fn test_write_in_memory_pessimistic_locks() { let txn_ext = Arc::new(TxnExt::default()); - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let lock_mgr = MockLockManager::new(); + let storage = TestStorageBuilderApiV1::new(lock_mgr.clone()) .pipelined_pessimistic_lock(true) .in_memory_pessimistic_lock(true) .build_for_txn(txn_ext.clone()) @@ -9632,9 +9633,11 @@ mod tests { }), ) .unwrap(); - // DummyLockManager just drops the callback, so it will fail to receive - // anything. - rx.recv().unwrap_err(); + // The request enters lock waiting state. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + lock_mgr.simulate_timeout_all(); + // The lock-waiting request is cancelled. + rx.recv().unwrap().unwrap_err(); let (tx, rx) = channel(); storage @@ -9672,7 +9675,7 @@ mod tests { #[test] fn test_disable_in_memory_pessimistic_locks() { let txn_ext = Arc::new(TxnExt::default()); - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .pipelined_pessimistic_lock(true) .in_memory_pessimistic_lock(false) .build_for_txn(txn_ext.clone()) diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 7171417d060..a73f8b99027 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -37,16 +37,19 @@ impl GcInfo { /// waiting for locks. #[derive(Debug, PartialEq)] pub struct ReleasedLock { - /// The hash value of the lock. 
- pub hash: u64, + pub start_ts: TimeStamp, + pub commit_ts: TimeStamp, + pub key: Key, /// Whether it is a pessimistic lock. pub pessimistic: bool, } impl ReleasedLock { - fn new(key: &Key, pessimistic: bool) -> Self { + pub fn new(start_ts: TimeStamp, commit_ts: TimeStamp, key: Key, pessimistic: bool) -> Self { Self { - hash: key.gen_hash(), + start_ts, + commit_ts, + key, pessimistic, } } @@ -114,8 +117,17 @@ impl MvccTxn { self.modifies.push(Modify::PessimisticLock(key, lock)) } - pub(crate) fn unlock_key(&mut self, key: Key, pessimistic: bool) -> Option { - let released = ReleasedLock::new(&key, pessimistic); + /// Append a modify that unlocks the key. If the lock is removed due to + /// committing, a non-zero `commit_ts` needs to be provided; otherwise if + /// the lock is removed due to rolling back, `commit_ts` must be set to + /// zero. + pub(crate) fn unlock_key( + &mut self, + key: Key, + pessimistic: bool, + commit_ts: TimeStamp, + ) -> Option { + let released = ReleasedLock::new(self.start_ts, commit_ts, key.clone(), pessimistic); let write = Modify::Delete(CF_LOCK, key); self.write_size += write.size(); self.modifies.push(write); diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index f80e61f93ad..4c900e5a438 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -45,7 +45,7 @@ pub fn check_txn_status_lock_exists( // If the resolving and primary key lock are both pessimistic locks, just unlock // the primary pessimistic lock and do not write rollback records. 
return if resolving_pessimistic_lock && lock.lock_type == LockType::Pessimistic { - let released = txn.unlock_key(primary_key, is_pessimistic_txn); + let released = txn.unlock_key(primary_key, is_pessimistic_txn, TimeStamp::zero()); MVCC_CHECK_TXN_STATUS_COUNTER_VEC.pessimistic_rollback.inc(); Ok((TxnStatus::PessimisticRollBack, released)) } else { @@ -157,7 +157,7 @@ pub fn rollback_lock( TxnCommitRecord::SingleRecord { write, .. } if write.write_type != WriteType::Rollback => { panic!("txn record found but not expected: {:?}", txn) } - _ => return Ok(txn.unlock_key(key, is_pessimistic_txn)), + _ => return Ok(txn.unlock_key(key, is_pessimistic_txn, TimeStamp::zero())), }; // If prewrite type is DEL or LOCK or PESSIMISTIC, it is no need to delete @@ -176,7 +176,7 @@ pub fn rollback_lock( collapse_prev_rollback(txn, reader, &key)?; } - Ok(txn.unlock_key(key, is_pessimistic_txn)) + Ok(txn.unlock_key(key, is_pessimistic_txn, TimeStamp::zero())) } pub fn collapse_prev_rollback( diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index 6fd925b536e..eb798090ba2 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -101,7 +101,7 @@ pub fn commit( } txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); - Ok(txn.unlock_key(key, lock.is_pessimistic_txn())) + Ok(txn.unlock_key(key, lock.is_pessimistic_txn(), commit_ts)) } pub mod tests { diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 949b347f251..359f0abacd8 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -11,11 +11,12 @@ use crate::storage::{ txn::{ acquire_pessimistic_lock, commands::{ - Command, CommandExt, ReaderWithStats, ResponsePolicy, TypedCommand, WriteCommand, - WriteContext, WriteResult, WriteResultLockInfo, + Command, CommandExt, ReaderWithStats, ReleasedLocks, ResponsePolicy, 
TypedCommand, + WriteCommand, WriteContext, WriteResult, WriteResultLockInfo, }, Error, ErrorInner, Result, }, + types::PessimisticLockParameters, Error as StorageError, ErrorInner as StorageErrorInner, PessimisticLockRes, ProcessResult, Result as StorageResult, Snapshot, }; @@ -155,12 +156,21 @@ impl WriteCommand for AcquirePessimisticLock let write_data = WriteData::new(txn.into_modifies(), extra); (pr, write_data, rows, ctx, None) } else { + let request_parameters = PessimisticLockParameters { + pb_ctx: ctx.clone(), + primary: self.primary.clone(), + start_ts: self.start_ts, + lock_ttl: self.lock_ttl, + for_update_ts: self.for_update_ts, + wait_timeout: self.wait_timeout, + return_values: self.return_values, + min_commit_ts: self.min_commit_ts, + check_existence: self.check_existence, + is_first_lock: self.is_first_lock, + allow_lock_with_conflict: false, + }; let lock_info_pb = extract_lock_info_from_result(&res); - let lock_info = WriteResultLockInfo::from_lock_info_pb( - lock_info_pb, - self.is_first_lock, - self.wait_timeout, - ); + let lock_info = WriteResultLockInfo::new(lock_info_pb.clone(), request_parameters); let pr = ProcessResult::PessimisticLockRes { res }; // Wait for lock released (pr, WriteData::default(), 0, ctx, Some(lock_info)) @@ -171,6 +181,7 @@ impl WriteCommand for AcquirePessimisticLock rows, pr, lock_info, + released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, }) @@ -194,17 +205,21 @@ mod tests { info.set_lock_version(ts); info.set_lock_ttl(100); let case = StorageError::from(StorageErrorInner::Txn(Error::from(ErrorInner::Mvcc( - MvccError::from(MvccErrorInner::KeyIsLocked(info)), + MvccError::from(MvccErrorInner::KeyIsLocked(info.clone())), )))); - let lock_info = WriteResultLockInfo::from_lock_info_pb( - extract_lock_info_from_result::<()>(&Err(case)), - is_first_lock, - wait_timeout, + let lock_info = WriteResultLockInfo::new( + extract_lock_info_from_result::<()>(&Err(case)).clone(), 
+ PessimisticLockParameters { + is_first_lock, + wait_timeout, + ..Default::default() + }, ); - assert_eq!(lock_info.lock.ts, ts.into()); - assert_eq!(lock_info.lock.hash, key.gen_hash()); - assert_eq!(lock_info.key, raw_key); - assert_eq!(lock_info.is_first_lock, is_first_lock); - assert_eq!(lock_info.wait_timeout, wait_timeout); + assert_eq!(lock_info.lock_digest.ts, ts.into()); + assert_eq!(lock_info.lock_digest.hash, key.gen_hash()); + assert_eq!(lock_info.key.into_raw().unwrap(), raw_key); + assert_eq!(lock_info.parameters.is_first_lock, is_first_lock); + assert_eq!(lock_info.parameters.wait_timeout, wait_timeout); + assert_eq!(lock_info.lock_info_pb, info); } } diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 150b065e5db..b935d991eea 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -8,8 +8,8 @@ use crate::storage::{ lock_manager::LockManager, txn::{ commands::{ - Command, CommandExt, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, - WriteResult, + Command, CommandExt, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, + WriteContext, WriteResult, }, Result, }, @@ -59,6 +59,7 @@ impl WriteCommand for RawAtomicStore { rows, pr: ProcessResult::Res, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, }) @@ -75,7 +76,7 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::DummyLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, + lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Statistics, TestEngineBuilder, }; #[test] @@ -108,7 +109,7 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let raw_ext = block_on(get_raw_ext(ts_provider, cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: 
&MockLockManager::new(), concurrency_manager: cm, extra_op: kvproto::kvrpcpb::ExtraOp::Noop, statistics: &mut statistic, diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 56138a09a50..1a4b547b6d7 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -65,7 +65,7 @@ impl WriteCommand for CheckSecondaryLocks { SnapshotReader::new_with_ctx(self.start_ts, snapshot, &self.ctx), context.statistics, ); - let mut released_locks = ReleasedLocks::new(self.start_ts, TimeStamp::zero()); + let mut released_locks = ReleasedLocks::new(); let mut result = SecondaryLocksStatus::Locked(Vec::new()); for key in self.keys { @@ -76,7 +76,7 @@ impl WriteCommand for CheckSecondaryLocks { // The lock exists, the lock information is returned. Some(lock) if lock.ts == self.start_ts => { if lock.lock_type == LockType::Pessimistic { - released_lock = txn.unlock_key(key.clone(), true); + released_lock = txn.unlock_key(key.clone(), true, TimeStamp::zero()); let overlapped_write = reader.get_txn_commit_record(&key)?.unwrap_none(); (SecondaryLockStatus::RolledBack, true, overlapped_write) } else { @@ -142,8 +142,6 @@ impl WriteCommand for CheckSecondaryLocks { let mut rows = 0; if let SecondaryLocksStatus::RolledBack = &result { - // Lock is only released when result is `RolledBack`. - released_locks.wake_up(context.lock_mgr); // One row is mutated only when a secondary lock is rolled back. 
rows = 1; } @@ -156,6 +154,7 @@ impl WriteCommand for CheckSecondaryLocks { rows, pr, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) @@ -171,7 +170,7 @@ pub mod tests { use super::*; use crate::storage::{ kv::TestEngineBuilder, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::tests::*, txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, Engine, @@ -197,7 +196,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: Default::default(), statistics: &mut Default::default(), @@ -235,7 +234,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm.clone(), extra_op: Default::default(), statistics: &mut Default::default(), diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 73079e00f5d..58f7f557448 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -121,13 +121,8 @@ impl WriteCommand for CheckTxnStatus { ), }; - let mut released_locks = ReleasedLocks::new(self.lock_ts, TimeStamp::zero()); + let mut released_locks = ReleasedLocks::new(); released_locks.push(released); - // The lock is released here only when the `check_txn_status` returns - // `TtlExpire`. 
- if let TxnStatus::TtlExpire = txn_status { - released_locks.wake_up(context.lock_mgr); - } let pr = ProcessResult::TxnStatus { txn_status }; let mut write_data = WriteData::from_modifies(txn.into_modifies()); @@ -138,6 +133,7 @@ impl WriteCommand for CheckTxnStatus { rows: 1, pr, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) @@ -154,7 +150,7 @@ pub mod tests { use super::{TxnStatus::*, *}; use crate::storage::{ kv::Engine, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::tests::*, txn::{ commands::{pessimistic_rollback, WriteCommand, WriteContext}, @@ -196,7 +192,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: Default::default(), statistics: &mut Default::default(), @@ -244,7 +240,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: Default::default(), statistics: &mut Default::default(), diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index c810c749bd6..0b82432e3cd 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -56,7 +56,7 @@ impl WriteCommand for Cleanup { context.statistics, ); - let mut released_locks = ReleasedLocks::new(self.start_ts, TimeStamp::zero()); + let mut released_locks = ReleasedLocks::new(); // The rollback must be protected, see more on // [issue #7364](https://github.com/tikv/tikv/issues/7364) released_locks.push(cleanup( @@ -66,7 +66,6 @@ impl WriteCommand for Cleanup { self.current_ts, true, )?); - released_locks.wake_up(context.lock_mgr); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -76,6 +75,7 @@ impl WriteCommand for Cleanup { rows: 1, pr: ProcessResult::Res, lock_info: None, + 
released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index f89d4fc09af..86e1f541306 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -59,11 +59,10 @@ impl WriteCommand for Commit { let rows = self.keys.len(); // Pessimistic txn needs key_hashes to wake up waiters - let mut released_locks = ReleasedLocks::new(self.lock_ts, self.commit_ts); + let mut released_locks = ReleasedLocks::new(); for k in self.keys { released_locks.push(commit(&mut txn, &mut reader, k, self.commit_ts)?); } - released_locks.wake_up(context.lock_mgr); let pr = ProcessResult::TxnStatus { txn_status: TxnStatus::committed(self.commit_ts), @@ -76,6 +75,7 @@ impl WriteCommand for Commit { rows, pr, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 4dbd51e70e0..2fff0620b27 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -14,8 +14,8 @@ use crate::storage::{ raw, txn::{ commands::{ - Command, CommandExt, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, - WriteResult, + Command, CommandExt, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, + WriteContext, WriteResult, }, Result, }, @@ -113,6 +113,7 @@ impl WriteCommand for RawCompareAndSwap { rows, pr, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards, response_policy: ResponsePolicy::OnApplied, }) @@ -132,7 +133,7 @@ mod tests { use super::*; use crate::storage::{ - lock_manager::DummyLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, + lock_manager::MockLockManager, txn::scheduler::get_raw_ext, Engine, Statistics, TestEngineBuilder, }; @@ -207,7 +208,7 @@ mod tests { let raw_ext = block_on(get_raw_ext(ts_provider, 
cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics: &mut statistic, @@ -261,7 +262,7 @@ mod tests { let snap = engine.snapshot(Default::default()).unwrap(); let raw_ext = block_on(get_raw_ext(ts_provider, cm.clone(), true, &cmd.cmd)).unwrap(); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: kvproto::kvrpcpb::ExtraOp::Noop, statistics: &mut statistic, diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index b4255138eeb..9b198724e3b 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -9,8 +9,8 @@ use crate::storage::{ mvcc::{MvccTxn, SnapshotReader}, txn::{ commands::{ - Command, CommandExt, FlashbackToVersionReadPhase, ReaderWithStats, ResponsePolicy, - TypedCommand, WriteCommand, WriteContext, WriteResult, + Command, CommandExt, FlashbackToVersionReadPhase, ReaderWithStats, ReleasedLocks, + ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, }, flashback_to_version, latch, Result, }, @@ -104,6 +104,7 @@ impl WriteCommand for FlashbackToVersion { } }, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 7c2c945d4e2..f5331087ac1 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -46,7 +46,7 @@ pub use mvcc_by_key::MvccByKey; pub use mvcc_by_start_ts::MvccByStartTs; pub use pause::Pause; pub use pessimistic_rollback::PessimisticRollback; -pub use prewrite::{one_pc_commit_ts, Prewrite, PrewritePessimistic}; +pub use prewrite::{one_pc_commit, Prewrite, PrewritePessimistic}; pub use 
resolve_lock::{ResolveLock, RESOLVE_LOCK_BATCH_SIZE}; pub use resolve_lock_lite::ResolveLockLite; pub use resolve_lock_readphase::ResolveLockReadPhase; @@ -63,8 +63,8 @@ use crate::storage::{ mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, txn::{latch, ProcessResult, Result}, types::{ - MvccInfo, PessimisticLockRes, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, - TxnStatus, + MvccInfo, PessimisticLockParameters, PessimisticLockRes, PrewriteResult, + SecondaryLocksStatus, StorageCallbackType, TxnStatus, }, Result as StorageResult, Snapshot, Statistics, }; @@ -365,14 +365,6 @@ impl From for TypedCommand<()> { } } -#[derive(Default)] -pub(super) struct ReleasedLocks { - start_ts: TimeStamp, - commit_ts: TimeStamp, - hashes: Vec, - pessimistic: bool, -} - /// Represents for a scheduler command, when should the response sent to the /// client. For most cases, the response should be sent after the result being /// successfully applied to the storage (if needed). 
But in some special cases, @@ -398,62 +390,58 @@ pub struct WriteResult { pub rows: usize, pub pr: ProcessResult, pub lock_info: Option, + pub released_locks: ReleasedLocks, pub lock_guards: Vec, pub response_policy: ResponsePolicy, } pub struct WriteResultLockInfo { - pub lock: lock_manager::Lock, - pub key: Vec, - pub is_first_lock: bool, - pub wait_timeout: Option, + pub lock_digest: lock_manager::LockDigest, + pub key: Key, + pub lock_info_pb: LockInfo, + pub parameters: PessimisticLockParameters, } impl WriteResultLockInfo { - pub fn from_lock_info_pb( - lock_info: &LockInfo, - is_first_lock: bool, - wait_timeout: Option, - ) -> Self { - let lock = lock_manager::Lock { - ts: lock_info.get_lock_version().into(), - hash: Key::from_raw(lock_info.get_key()).gen_hash(), + pub fn new(lock_info_pb: LockInfo, parameters: PessimisticLockParameters) -> Self { + let lock = lock_manager::LockDigest { + ts: lock_info_pb.get_lock_version().into(), + hash: Key::from_raw(lock_info_pb.get_key()).gen_hash(), }; - let key = lock_info.get_key().to_owned(); + let key = Key::from_raw(lock_info_pb.get_key()); Self { - lock, + lock_digest: lock, key, - is_first_lock, - wait_timeout, + lock_info_pb, + parameters, } } } +#[derive(Default)] +pub struct ReleasedLocks(Vec); + impl ReleasedLocks { - pub fn new(start_ts: TimeStamp, commit_ts: TimeStamp) -> Self { - Self { - start_ts, - commit_ts, - ..Default::default() - } + pub fn new() -> Self { + Self::default() } pub fn push(&mut self, lock: Option) { if let Some(lock) = lock { - self.hashes.push(lock.hash); - if !self.pessimistic { - self.pessimistic = lock.pessimistic; - } + self.0.push(lock); } } pub fn is_empty(&self) -> bool { - self.hashes.is_empty() + self.0.is_empty() + } + + pub fn clear(&mut self) { + self.0.clear() } - // Wake up pessimistic transactions that waiting for these locks. 
- pub fn wake_up(self, lock_mgr: &L) { - lock_mgr.wake_up(self.start_ts, self.hashes, self.commit_ts, self.pessimistic); + pub fn into_iter(self) -> impl Iterator { + self.0.into_iter() } } @@ -756,7 +744,7 @@ pub mod test_util { use crate::storage::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error, ErrorInner, Result}, - DummyLockManager, Engine, + Engine, MockLockManager, }; // Some utils for tests that may be used in multiple source code files. @@ -769,7 +757,7 @@ pub mod test_util { ) -> Result { let snap = engine.snapshot(Default::default())?; let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics, @@ -907,7 +895,7 @@ pub mod test_util { ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager, extra_op: ExtraOp::Noop, statistics, @@ -932,7 +920,7 @@ pub mod test_util { let concurrency_manager = ConcurrencyManager::new(start_ts.into()); let cmd = Rollback::new(keys, TimeStamp::from(start_ts), ctx); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager, extra_op: ExtraOp::Noop, statistics, diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 684804f990d..05bbb508bdc 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -10,8 +10,8 @@ use crate::storage::{ lock_manager::LockManager, txn::{ commands::{ - Command, CommandExt, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, - WriteResult, + Command, CommandExt, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, + WriteContext, WriteResult, }, Result, }, @@ -49,6 +49,7 @@ impl WriteCommand for Pause { rows: 0, pr: ProcessResult::Res, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git 
a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index f7394cf32aa..b575787208a 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -58,7 +58,7 @@ impl WriteCommand for PessimisticRollback { let keys = mem::take(&mut self.keys); let rows = keys.len(); - let mut released_locks = ReleasedLocks::new(self.start_ts, TimeStamp::zero()); + let mut released_locks = ReleasedLocks::new(); for key in keys { fail_point!("pessimistic_rollback", |err| Err( crate::storage::mvcc::Error::from(crate::storage::mvcc::txn::make_txn_error( @@ -73,7 +73,7 @@ impl WriteCommand for PessimisticRollback { && lock.ts == self.start_ts && lock.for_update_ts <= self.for_update_ts { - Ok(txn.unlock_key(key, true)) + Ok(txn.unlock_key(key, true, TimeStamp::zero())) } else { Ok(None) } @@ -82,7 +82,6 @@ impl WriteCommand for PessimisticRollback { }; released_locks.push(released_lock?); } - released_locks.wake_up(context.lock_mgr); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -92,6 +91,7 @@ impl WriteCommand for PessimisticRollback { rows, pr: ProcessResult::MultiRes { results: vec![] }, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) @@ -108,7 +108,7 @@ pub mod tests { use super::*; use crate::storage::{ kv::Engine, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::tests::*, txn::{ commands::{WriteCommand, WriteContext}, @@ -136,7 +136,7 @@ pub mod tests { for_update_ts, deadline: Deadline::from_now(DEFAULT_EXECUTION_DURATION_LIMIT), }; - let lock_mgr = DummyLockManager; + let lock_mgr = MockLockManager::new(); let write_context = WriteContext { lock_mgr: &lock_mgr, concurrency_manager: cm, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index e8b85d37d66..2b0915a5fdc 100644 --- 
a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -461,7 +461,6 @@ impl Prewriter { final_min_commit_ts, rows, context.async_apply_prewrite, - context.lock_mgr, )) } @@ -645,7 +644,6 @@ impl Prewriter { final_min_commit_ts: TimeStamp, rows: usize, async_apply_prewrite: bool, - lock_manager: &impl LockManager, ) -> WriteResult { let async_commit_ts = if self.secondary_keys.is_some() { final_min_commit_ts @@ -654,16 +652,14 @@ impl Prewriter { }; let mut result = if locks.is_empty() { + let (one_pc_commit_ts, released_locks) = + one_pc_commit(self.try_one_pc, &mut txn, final_min_commit_ts); + let pr = ProcessResult::PrewriteResult { result: PrewriteResult { locks: vec![], min_commit_ts: async_commit_ts, - one_pc_commit_ts: one_pc_commit_ts( - self.try_one_pc, - &mut txn, - final_min_commit_ts, - lock_manager, - ), + one_pc_commit_ts, }, }; let extra = TxnExtra { @@ -685,6 +681,7 @@ impl Prewriter { rows, pr, lock_info: None, + released_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, } @@ -703,6 +700,7 @@ impl Prewriter { rows, pr, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, } @@ -822,31 +820,28 @@ impl MutationLock for (Mutation, PrewriteRequestPessimisticAction) { } } -/// Compute the commit ts of a 1pc transaction. -pub fn one_pc_commit_ts( +/// Commits a 1pc transaction if possible, returns the commit ts and released +/// locks on success. +pub fn one_pc_commit( try_one_pc: bool, txn: &mut MvccTxn, final_min_commit_ts: TimeStamp, - lock_manager: &impl LockManager, -) -> TimeStamp { +) -> (TimeStamp, ReleasedLocks) { if try_one_pc { assert_ne!(final_min_commit_ts, TimeStamp::zero()); // All keys can be successfully locked and `try_one_pc` is set. Try to directly // commit them. 
let released_locks = handle_1pc_locks(txn, final_min_commit_ts); - if !released_locks.is_empty() { - released_locks.wake_up(lock_manager); - } - final_min_commit_ts + (final_min_commit_ts, released_locks) } else { assert!(txn.locks_for_1pc.is_empty()); - TimeStamp::zero() + (TimeStamp::zero(), ReleasedLocks::new()) } } /// Commit and delete all 1pc locks in txn. fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { - let mut released_locks = ReleasedLocks::new(txn.start_ts, commit_ts); + let mut released_locks = ReleasedLocks::new(); for (key, lock, delete_pessimistic_lock) in std::mem::take(&mut txn.locks_for_1pc) { let write = Write::new( @@ -858,7 +853,7 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { // records. txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); if delete_pessimistic_lock { - released_locks.push(txn.unlock_key(key, true)); + released_locks.push(txn.unlock_key(key, true, commit_ts)); } } @@ -905,7 +900,7 @@ mod tests { Error, ErrorInner, }, types::TxnStatus, - DummyLockManager, Engine, Snapshot, Statistics, TestEngineBuilder, + Engine, MockLockManager, Snapshot, Statistics, TestEngineBuilder, }; fn inner_test_prewrite_skip_constraint_check(pri_key_number: u8, write_num: usize) { @@ -1467,7 +1462,7 @@ mod tests { use engine_traits::{IterOptions, ReadOptions}; use kvproto::kvrpcpb::ExtraOp; - use crate::storage::{kv::Result, CfName, ConcurrencyManager, DummyLockManager, Value}; + use crate::storage::{kv::Result, CfName, ConcurrencyManager, MockLockManager, Value}; #[derive(Clone)] struct MockSnapshot; @@ -1503,7 +1498,7 @@ mod tests { macro_rules! 
context { () => { WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: ConcurrencyManager::new(10.into()), extra_op: ExtraOp::Noop, statistics: &mut Statistics::default(), @@ -1673,7 +1668,7 @@ mod tests { ) }; let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm.clone(), extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -1787,7 +1782,7 @@ mod tests { Context::default(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm.clone(), extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -1815,7 +1810,7 @@ mod tests { TimeStamp::default(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -1897,7 +1892,7 @@ mod tests { Context::default(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm.clone(), extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -1929,7 +1924,7 @@ mod tests { TimeStamp::default(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -2198,7 +2193,7 @@ mod tests { Context::default(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm.clone(), extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -2222,7 +2217,7 @@ mod tests { 10.into(), ); let context = WriteContext { - lock_mgr: &DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: ExtraOp::Noop, statistics: &mut statistics, @@ -2428,7 +2423,7 @@ mod tests { Context::default(), ); let context = WriteContext { - lock_mgr: 
&DummyLockManager {}, + lock_mgr: &MockLockManager::new(), concurrency_manager: ConcurrencyManager::new(20.into()), extra_op: ExtraOp::Noop, statistics: &mut statistics, diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index 1d2bfbf49d8..b89e91593f9 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -82,8 +82,7 @@ impl WriteCommand for ResolveLock { let mut scan_key = self.scan_key.take(); let rows = key_locks.len(); - // Map txn's start_ts to ReleasedLocks - let mut released_locks = HashMap::default(); + let mut released_locks = ReleasedLocks::new(); for (current_key, current_lock) in key_locks { txn.start_ts = current_lock.ts; reader.start_ts = current_lock.ts; @@ -118,20 +117,13 @@ impl WriteCommand for ResolveLock { commit_ts, })); }; - released_locks - .entry(current_lock.ts) - .or_insert_with(|| ReleasedLocks::new(current_lock.ts, commit_ts)) - .push(released); + released_locks.push(released); if txn.write_size() >= MAX_TXN_WRITE_SIZE { scan_key = Some(current_key); break; } } - let lock_mgr = context.lock_mgr; - released_locks - .into_iter() - .for_each(|(_, released_locks)| released_locks.wake_up(lock_mgr)); let pr = if scan_key.is_none() { ProcessResult::Res @@ -154,6 +146,7 @@ impl WriteCommand for ResolveLock { rows, pr, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index 5a0f636d2f6..a31211c564e 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -54,7 +54,7 @@ impl WriteCommand for ResolveLockLite { let rows = self.resolve_keys.len(); // ti-client guarantees the size of resolve_keys will not too large, so no // necessary to control the write_size as ResolveLock. 
- let mut released_locks = ReleasedLocks::new(self.start_ts, self.commit_ts); + let mut released_locks = ReleasedLocks::new(); for key in self.resolve_keys { released_locks.push(if !self.commit_ts.is_zero() { commit(&mut txn, &mut reader, key, self.commit_ts)? @@ -62,7 +62,6 @@ impl WriteCommand for ResolveLockLite { cleanup(&mut txn, &mut reader, key, TimeStamp::zero(), false)? }); } - released_locks.wake_up(context.lock_mgr); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -72,6 +71,7 @@ impl WriteCommand for ResolveLockLite { rows, pr: ProcessResult::Res, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index fc3846931f3..479f29cb276 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -50,14 +50,13 @@ impl WriteCommand for Rollback { ); let rows = self.keys.len(); - let mut released_locks = ReleasedLocks::new(self.start_ts, TimeStamp::zero()); + let mut released_locks = ReleasedLocks::new(); for k in self.keys { // Rollback is called only if the transaction is known to fail. Under the // circumstances, the rollback record needn't be protected. 
let released_lock = cleanup(&mut txn, &mut reader, k, TimeStamp::zero(), false)?; released_locks.push(released_lock); } - released_locks.wake_up(context.lock_mgr); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); @@ -67,6 +66,7 @@ impl WriteCommand for Rollback { rows, pr: ProcessResult::Res, lock_info: None, + released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 7ec773b99dc..9bfbda5c748 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -9,8 +9,8 @@ use crate::storage::{ mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, SnapshotReader}, txn::{ commands::{ - Command, CommandExt, ReaderWithStats, ResponsePolicy, TypedCommand, WriteCommand, - WriteContext, WriteResult, + Command, CommandExt, ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, + WriteCommand, WriteContext, WriteResult, }, Result, }, @@ -91,6 +91,7 @@ impl WriteCommand for TxnHeartBeat { rows: 1, pr, lock_info: None, + released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) @@ -106,7 +107,7 @@ pub mod tests { use super::*; use crate::storage::{ kv::TestEngineBuilder, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::tests::*, txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, Engine, @@ -134,7 +135,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: Default::default(), statistics: &mut Default::default(), @@ -176,7 +177,7 @@ pub mod tests { .process_write( snapshot, WriteContext { - lock_mgr: &DummyLockManager, + lock_mgr: &MockLockManager::new(), concurrency_manager: cm, extra_op: Default::default(), 
statistics: &mut Default::default(), diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index b65445b8c24..4ccc868f30d 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -41,7 +41,7 @@ use crossbeam::utils::CachePadded; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use futures::compat::Future01CompatExt; use kvproto::{ - kvrpcpb::{CommandPri, Context, DiskFullOpt, ExtraOp}, + kvrpcpb::{self, CommandPri, Context, DiskFullOpt, ExtraOp}, pdpb::QueryKind, }; use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; @@ -64,11 +64,18 @@ use crate::{ self, with_tls_engine, Engine, ExtCallback, FlowStatsReporter, Result as EngineResult, SnapContext, Statistics, }, - lock_manager::{self, DiagnosticContext, LockManager, WaitTimeout}, + lock_manager::{ + self, + lock_wait_context::LockWaitContext, + lock_waiting_queue::{DelayedNotifyAllFuture, LockWaitEntry, LockWaitQueues}, + DiagnosticContext, LockManager, LockWaitToken, + }, metrics::*, + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, ReleasedLock}, txn::{ commands::{ - Command, RawExt, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo, + Command, RawExt, ReleasedLocks, ResponsePolicy, WriteContext, WriteResult, + WriteResultLockInfo, }, flow_controller::FlowController, latch::{Latches, Lock}, @@ -223,8 +230,12 @@ struct SchedulerInner { enable_async_apply_prewrite: bool, + pessimistic_lock_wake_up_delay_duration_ms: Arc, + resource_tag_factory: ResourceTagFactory, + lock_wait_queues: LockWaitQueues, + quota_limiter: Arc, feature_gate: FeatureGate, } @@ -367,6 +378,8 @@ impl Scheduler { task_slots.push(Mutex::new(Default::default()).into()); } + let lock_wait_queues = LockWaitQueues::new(lock_mgr.clone()); + let inner = Arc::new(SchedulerInner { task_slots, id_alloc: AtomicU64::new(0).into(), @@ -391,9 +404,11 @@ impl Scheduler { pipelined_pessimistic_lock: dynamic_configs.pipelined_pessimistic_lock, in_memory_pessimistic_lock: 
dynamic_configs.in_memory_pessimistic_lock, enable_async_apply_prewrite: config.enable_async_apply_prewrite, + pessimistic_lock_wake_up_delay_duration_ms: dynamic_configs.wake_up_delay_duration_ms, flow_controller, causal_ts_provider, resource_tag_factory, + lock_wait_queues, quota_limiter, feature_gate, }); @@ -722,27 +737,128 @@ impl Scheduler { /// Event handler for the request of waiting for lock fn on_wait_for_lock( &self, + ctx: &Context, cid: u64, - start_ts: TimeStamp, - pr: ProcessResult, - lock: lock_manager::Lock, - is_first_lock: bool, - wait_timeout: Option, - diag_ctx: DiagnosticContext, + lock_info: WriteResultLockInfo, + tracker: TrackerToken, ) { - debug!("command waits for lock released"; "cid" => cid); - let tctx = self.inner.dequeue_task_context(cid); - SCHED_STAGE_COUNTER_VEC.get(tctx.tag).lock_wait.inc(); + let key = lock_info.key.clone(); + let lock_digest = lock_info.lock_digest; + let start_ts = lock_info.parameters.start_ts; + let is_first_lock = lock_info.parameters.is_first_lock; + let wait_timeout = lock_info.parameters.wait_timeout; + + let diag_ctx = DiagnosticContext { + key: lock_info.key.to_raw().unwrap(), + resource_group_tag: ctx.get_resource_group_tag().into(), + tracker, + }; + let wait_token = self.inner.lock_mgr.allocate_token(); + + let (lock_req_ctx, lock_wait_entry, lock_info_pb) = + self.make_lock_waiting(cid, wait_token, lock_info); + + // The entry must be pushed to the lock waiting queue before sending to + // `lock_mgr`. When the request is canceled in anywhere outside the lock + // waiting queue (including `lock_mgr`), it first tries to remove the + // entry from the lock waiting queue. If the entry doesn't exist + // in the queue, it will be regarded as already popped out from the queue and + // therefore will woken up, thus the canceling operation will be + // skipped. So pushing the entry to the queue must be done before any + // possible cancellation. 
+ self.inner + .lock_wait_queues + .push_lock_wait(lock_wait_entry, lock_info_pb.clone()); + + let wait_info = lock_manager::KeyLockWaitInfo { + key, + lock_digest, + lock_info: lock_info_pb, + }; self.inner.lock_mgr.wait_for( + wait_token, + ctx.get_region_id(), + ctx.get_region_epoch().clone(), + ctx.get_term(), start_ts, - tctx.cb.unwrap(), - pr, - lock, + wait_info, is_first_lock, wait_timeout, + Box::new(lock_req_ctx.get_callback_for_cancellation()), diag_ctx, ); - self.release_lock(&tctx.lock, cid); + } + + fn on_release_locks(&self, released_locks: ReleasedLocks) { + let mut legacy_wake_up_list = vec![]; + let mut delay_wake_up_futures = vec![]; + let wake_up_delay_duration_ms = self + .inner + .pessimistic_lock_wake_up_delay_duration_ms + .load(Ordering::Relaxed); + + released_locks.into_iter().for_each(|released_lock| { + let (lock_wait_entry, delay_wake_up_future) = + match self.inner.lock_wait_queues.pop_for_waking_up( + &released_lock.key, + released_lock.start_ts, + released_lock.commit_ts, + wake_up_delay_duration_ms, + ) { + Some(e) => e, + None => return, + }; + + // TODO: Currently there are only legacy requests. When resumable requests are + // supported, do not put them to the `legacy_wake_up_list`. 
+ legacy_wake_up_list.push((lock_wait_entry, released_lock)); + if let Some(f) = delay_wake_up_future { + delay_wake_up_futures.push(f); + } + }); + + self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + } + + fn wake_up_legacy_pessimistic_locks( + &self, + legacy_wake_up_list: Vec<(Box, ReleasedLock)>, + delayed_wake_up_futures: Vec, + ) { + let self1 = self.clone(); + self.get_sched_pool(CommandPri::High) + .pool + .spawn(async move { + for (lock_info, released_lock) in legacy_wake_up_list { + let cb = lock_info.key_cb.unwrap().into_inner(); + let e = StorageError::from(Error::from(MvccError::from( + MvccErrorInner::WriteConflict { + start_ts: lock_info.parameters.start_ts, + conflict_start_ts: released_lock.start_ts, + conflict_commit_ts: released_lock.commit_ts, + key: released_lock.key.into_raw().unwrap(), + primary: lock_info.parameters.primary, + reason: kvrpcpb::WriteConflictReason::PessimisticRetry, + }, + ))); + cb(Err(e.into())); + } + + for f in delayed_wake_up_futures { + self1 + .get_sched_pool(CommandPri::High) + .pool + .spawn(async move { + let res = f.await; + // It returns only None currently. + // TODO: Handle not-none case when supporting resumable pessimistic lock + // requests. + assert!(res.is_none()); + }) + .unwrap(); + } + }) + .unwrap(); } fn early_response( @@ -842,7 +958,6 @@ impl Scheduler { let tag = task.cmd.tag(); let cid = task.cid; let priority = task.cmd.priority(); - let ts = task.cmd.ts(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -916,6 +1031,7 @@ impl Scheduler { rows, pr, lock_info, + released_locks, lock_guards, response_policy, } = match deadline @@ -938,23 +1054,27 @@ impl Scheduler { let region_id = ctx.get_region_id(); SCHED_STAGE_COUNTER_VEC.get(tag).write.inc(); + let mut pr = Some(pr); + + // TODO: Lock wait handling here. 
if let Some(lock_info) = lock_info { - let WriteResultLockInfo { - lock, - key, - is_first_lock, - wait_timeout, - } = lock_info; - let diag_ctx = DiagnosticContext { - key, - resource_group_tag: ctx.get_resource_group_tag().into(), - tracker, - }; - scheduler.on_wait_for_lock(cid, ts, pr, lock, is_first_lock, wait_timeout, diag_ctx); - return; + // Only handle lock waiting if `wait_timeout` is set. Otherwise it indicates + // that it's a lock-no-wait request and we need to report error + // immediately. + if lock_info.parameters.wait_timeout.is_some() { + assert_eq!(to_be_write.size(), 0); + pr = Some(ProcessResult::Res); + // allow_lock_with_conflict is not supported yet in this version. + assert!(!lock_info.parameters.allow_lock_with_conflict); + + scheduler.on_wait_for_lock(&ctx, cid, lock_info, tracker); + } + } + + if !released_locks.is_empty() { + scheduler.on_release_locks(released_locks); } - let mut pr = Some(pr); if to_be_write.modifies.is_empty() { scheduler.on_write_finished(cid, pr, Ok(()), lock_guards, false, false, tag); return; @@ -1262,6 +1382,39 @@ impl Scheduler { PessimisticLockMode::Sync } } + + fn make_lock_waiting( + &self, + cid: u64, + lock_wait_token: LockWaitToken, + lock_info: WriteResultLockInfo, + ) -> (LockWaitContext, Box, kvrpcpb::LockInfo) { + let mut slot = self.inner.get_task_slot(cid); + let task_ctx = slot.get_mut(&cid).unwrap(); + let cb = task_ctx.cb.take().unwrap(); + + let ctx = LockWaitContext::new( + lock_info.key.clone(), + self.inner.lock_wait_queues.clone(), + lock_wait_token, + cb, + lock_info.parameters.allow_lock_with_conflict, + ); + let first_batch_cb = ctx.get_callback_for_first_write_batch(); + task_ctx.cb = Some(first_batch_cb); + drop(slot); + + let lock_wait_entry = Box::new(LockWaitEntry { + key: lock_info.key, + lock_hash: lock_info.lock_digest.hash, + parameters: lock_info.parameters, + lock_wait_token, + legacy_wake_up_index: None, + key_cb: Some(ctx.get_callback_for_blocked_key().into()), + }); + + 
(ctx, lock_wait_entry, lock_info.lock_info_pb) + } } pub async fn get_raw_ext( @@ -1320,12 +1473,12 @@ mod tests { use kvproto::kvrpcpb::{BatchRollbackRequest, CheckTxnStatusRequest, Context}; use raftstore::store::{ReadStats, WriteStats}; use tikv_util::{config::ReadableSize, future::paired_future_callback}; - use txn_types::{Key, OldValues}; + use txn_types::{Key, OldValues, TimeStamp}; use super::*; use crate::storage::{ kv::{Error as KvError, ErrorInner as KvErrorInner}, - lock_manager::DummyLockManager, + lock_manager::{MockLockManager, WaitTimeout}, mvcc::{self, Mutation}, test_util::latest_feature_gate, txn::{ @@ -1346,7 +1499,7 @@ mod tests { } // TODO(cosven): use this in the following test cases to reduce duplicate code. - fn new_test_scheduler() -> (Scheduler, RocksEngine) { + fn new_test_scheduler() -> (Scheduler, RocksEngine) { let engine = TestEngineBuilder::new().build().unwrap(); let config = Config { scheduler_concurrency: 1024, @@ -1358,12 +1511,13 @@ mod tests { ( Scheduler::new( engine.clone(), - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), &config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, @@ -1505,12 +1659,13 @@ mod tests { }; let scheduler = Scheduler::new( engine, - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), &config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, @@ -1610,12 +1765,13 @@ mod tests { }; let scheduler = Scheduler::new( engine, - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), 
&config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, @@ -1669,12 +1825,13 @@ mod tests { }; let scheduler = Scheduler::new( engine, - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), &config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, @@ -1736,12 +1893,13 @@ mod tests { }; let scheduler = Scheduler::new( engine, - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), &config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, @@ -1798,12 +1956,13 @@ mod tests { let scheduler = Scheduler::new( engine, - DummyLockManager, + MockLockManager::new(), ConcurrencyManager::new(1.into()), &config, DynamicConfigs { pipelined_pessimistic_lock: Arc::new(AtomicBool::new(false)), in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), + wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), }, Arc::new(FlowController::Singleton(EngineFlowController::empty())), None, diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index ec38958ad57..e0f68b721b5 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -30,7 +30,7 @@ use tikv::{ self, config_manager::StorageConfigManger, kv::{Error as KvError, ErrorInner as KvErrorInner, SnapContext, SnapshotExt}, - 
lock_manager::DummyLockManager, + lock_manager::MockLockManager, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, test_util::*, txn::{ @@ -53,9 +53,10 @@ fn test_scheduler_leader_change_twice() { let peers = region0.get_peers(); cluster.must_transfer_leader(region0.get_id(), peers[0].clone()); let engine0 = cluster.sim.rl().storages[&peers[0].get_id()].clone(); - let storage0 = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine0, DummyLockManager) - .build() - .unwrap(); + let storage0 = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine0, MockLockManager::new()) + .build() + .unwrap(); let mut ctx0 = Context::default(); ctx0.set_region_id(region0.get_id()); @@ -247,7 +248,7 @@ fn test_scale_scheduler_pool() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .config(cluster.cfg.tikv.storage.clone()) .build() .unwrap(); @@ -344,7 +345,7 @@ fn test_pipelined_pessimistic_lock() { let before_pipelined_write_finish_fp = "before_pipelined_write_finish"; { - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .pipelined_pessimistic_lock(false) .build() .unwrap(); @@ -371,7 +372,7 @@ fn test_pipelined_pessimistic_lock() { fail::remove(rockskv_write_modifies_fp); } - let storage = TestStorageBuilderApiV1::new(DummyLockManager) + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) .pipelined_pessimistic_lock(true) .build() .unwrap(); @@ -524,10 +525,12 @@ fn test_async_commit_prewrite_with_stale_max_ts_impl() { .get(&1) .unwrap() .clone(); - let storage = - TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr(engine.clone(), DummyLockManager) - .build() - .unwrap(); + let storage = TestStorageBuilder::<_, _, F>::from_engine_and_lock_mgr( + engine.clone(), + MockLockManager::new(), + ) + 
.build() + .unwrap(); // Fail to get timestamp from PD at first fail::cfg("test_raftstore_get_tso", "pause").unwrap(); @@ -641,7 +644,7 @@ fn expect_locked(err: tikv::storage::Error, key: &[u8], lock_ts: TimeStamp) { } fn test_async_apply_prewrite_impl( - storage: &Storage, + storage: &Storage, ctx: Context, key: &[u8], value: &[u8], @@ -825,7 +828,7 @@ fn test_async_apply_prewrite() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .async_apply_prewrite(true) .build() .unwrap(); @@ -923,7 +926,7 @@ fn test_async_apply_prewrite_fallback() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .async_apply_prewrite(true) .build() .unwrap(); @@ -985,7 +988,7 @@ fn test_async_apply_prewrite_fallback() { } fn test_async_apply_prewrite_1pc_impl( - storage: &Storage, + storage: &Storage, ctx: Context, key: &[u8], value: &[u8], @@ -1112,7 +1115,7 @@ fn test_async_apply_prewrite_1pc() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .async_apply_prewrite(true) .build() .unwrap(); @@ -1139,7 +1142,7 @@ fn test_atomic_cas_lock_by_latch() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -1227,7 +1230,7 @@ fn test_before_async_write_deadline() { .get(&1) .unwrap() .clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = 
TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -1259,7 +1262,7 @@ fn test_before_propose_deadline() { cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -1292,7 +1295,7 @@ fn test_resolve_lock_deadline() { cluster.run(); let engine = cluster.sim.read().unwrap().storages[&1].clone(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 1a6f2da9b87..e42a44047a4 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -30,7 +30,7 @@ use test_raftstore::new_server_cluster; use tikv::storage::{ self, kv::SnapshotExt, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, txn::tests::{ must_acquire_pessimistic_lock, must_commit, must_pessimistic_prewrite_put, must_pessimistic_prewrite_put_err, must_prewrite_put, must_prewrite_put_err, @@ -69,7 +69,7 @@ fn test_txn_failpoints() { #[test] fn test_atomic_getting_max_ts_and_storing_memory_lock() { let engine = TestEngineBuilder::new().build().unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -120,7 +120,7 @@ fn test_atomic_getting_max_ts_and_storing_memory_lock() { #[test] fn test_snapshot_must_be_later_than_updating_max_ts() { let engine = TestEngineBuilder::new().build().unwrap(); - let storage = 
TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -163,7 +163,7 @@ fn test_snapshot_must_be_later_than_updating_max_ts() { #[test] fn test_update_max_ts_before_scan_memory_locks() { let engine = TestEngineBuilder::new().build().unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -217,7 +217,7 @@ macro_rules! lock_release_test { fn $test_name() { let engine = TestEngineBuilder::new().build().unwrap(); let storage = - TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); @@ -294,7 +294,7 @@ lock_release_test!( #[test] fn test_max_commit_ts_error() { let engine = TestEngineBuilder::new().build().unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); let cm = storage.get_concurrency_manager(); @@ -347,7 +347,7 @@ fn test_max_commit_ts_error() { #[test] fn test_exceed_max_commit_ts_in_the_middle_of_prewrite() { let engine = TestEngineBuilder::new().build().unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() .unwrap(); let cm = storage.get_concurrency_manager(); diff --git a/tests/failpoints/cases/test_ttl.rs b/tests/failpoints/cases/test_ttl.rs index 12449752285..026a21136ab 100644 --- a/tests/failpoints/cases/test_ttl.rs +++ b/tests/failpoints/cases/test_ttl.rs @@ -12,7 +12,7 @@ use tikv::{ 
server::ttl::check_ttl_and_compact_files, storage::{ kv::{SnapContext, TestEngineBuilder}, - lock_manager::DummyLockManager, + lock_manager::MockLockManager, raw::encoded::RawEncodeSnapshot, test_util::{expect_ok_callback, expect_value}, Engine, Iterator, Snapshot, Statistics, TestStorageBuilder, @@ -394,7 +394,7 @@ fn test_stoarge_raw_batch_put_ttl() { fn test_stoarge_raw_batch_put_ttl_impl() { fail::cfg("ttl_current_ts", "return(100)").unwrap(); - let storage = TestStorageBuilder::<_, _, F>::new(DummyLockManager) + let storage = TestStorageBuilder::<_, _, F>::new(MockLockManager::new()) .build() .unwrap(); let (tx, rx) = channel(); diff --git a/tests/integrations/config/dynamic/pessimistic_txn.rs b/tests/integrations/config/dynamic/pessimistic_txn.rs index caad8a64f9b..7af5455a199 100644 --- a/tests/integrations/config/dynamic/pessimistic_txn.rs +++ b/tests/integrations/config/dynamic/pessimistic_txn.rs @@ -69,11 +69,11 @@ fn setup( fn validate_waiter(router: &WaiterMgrScheduler, f: F) where - F: FnOnce(ReadableDuration, ReadableDuration) + Send + 'static, + F: FnOnce(ReadableDuration) + Send + 'static, { let (tx, rx) = mpsc::channel(); - router.validate(Box::new(move |v1, v2| { - f(v1, v2); + router.validate(Box::new(move |v1| { + f(v1); tx.send(()).unwrap(); })); rx.recv_timeout(Duration::from_secs(3)).unwrap(); @@ -107,30 +107,10 @@ fn test_lock_manager_cfg_update() { cfg_controller .update_config("raftstore.raft-log-gc-threshold", "2000") .unwrap(); - validate_waiter( - &waiter, - move |timeout: ReadableDuration, delay: ReadableDuration| { - assert_eq!(timeout.as_millis(), DEFAULT_TIMEOUT); - assert_eq!(delay.as_millis(), DEFAULT_DELAY); - }, - ); - validate_dead_lock(&deadlock, move |ttl: u64| { - assert_eq!(ttl, DEFAULT_TIMEOUT); + validate_waiter(&waiter, move |timeout: ReadableDuration| { + assert_eq!(timeout.as_millis(), DEFAULT_TIMEOUT); }); - - // only update wake_up_delay_duration - cfg_controller - 
.update_config("pessimistic-txn.wake-up-delay-duration", "500ms") - .unwrap(); - validate_waiter( - &waiter, - move |timeout: ReadableDuration, delay: ReadableDuration| { - assert_eq!(timeout.as_millis(), DEFAULT_TIMEOUT); - assert_eq!(delay.as_millis(), 500); - }, - ); validate_dead_lock(&deadlock, move |ttl: u64| { - // dead lock ttl should not change assert_eq!(ttl, DEFAULT_TIMEOUT); }); @@ -138,38 +118,11 @@ fn test_lock_manager_cfg_update() { cfg_controller .update_config("pessimistic-txn.wait-for-lock-timeout", "4000ms") .unwrap(); - validate_waiter( - &waiter, - move |timeout: ReadableDuration, delay: ReadableDuration| { - assert_eq!(timeout.as_millis(), 4000); - // wake_up_delay_duration should be the same as last update - assert_eq!(delay.as_millis(), 500); - }, - ); - validate_dead_lock(&deadlock, move |ttl: u64| { - assert_eq!(ttl, 4000); + validate_waiter(&waiter, move |timeout: ReadableDuration| { + assert_eq!(timeout.as_millis(), 4000); }); - - // update both config - let mut m = std::collections::HashMap::new(); - m.insert( - "pessimistic-txn.wait-for-lock-timeout".to_owned(), - "4321ms".to_owned(), - ); - m.insert( - "pessimistic-txn.wake-up-delay-duration".to_owned(), - "123ms".to_owned(), - ); - cfg_controller.update(m).unwrap(); - validate_waiter( - &waiter, - move |timeout: ReadableDuration, delay: ReadableDuration| { - assert_eq!(timeout.as_millis(), 4321); - assert_eq!(delay.as_millis(), 123); - }, - ); validate_dead_lock(&deadlock, move |ttl: u64| { - assert_eq!(ttl, 4321); + assert_eq!(ttl, 4000); }); // update pipelined @@ -206,5 +159,24 @@ fn test_lock_manager_cfg_update() { .load(Ordering::SeqCst) ); + // update wake-up-delay-duration + assert_eq!( + lock_mgr + .get_storage_dynamic_configs() + .wake_up_delay_duration_ms + .load(Ordering::SeqCst), + DEFAULT_DELAY + ); + cfg_controller + .update_config("pessimistic-txn.wake-up-delay-duration", "500ms") + .unwrap(); + assert_eq!( + lock_mgr + .get_storage_dynamic_configs() + 
.wake_up_delay_duration_ms + .load(Ordering::SeqCst), + 500 + ); + lock_mgr.stop(); } diff --git a/tests/integrations/resource_metering/test_suite/mod.rs b/tests/integrations/resource_metering/test_suite/mod.rs index 667c86d230a..7dc6eceb0d5 100644 --- a/tests/integrations/resource_metering/test_suite/mod.rs +++ b/tests/integrations/resource_metering/test_suite/mod.rs @@ -21,7 +21,7 @@ use test_util::alloc_port; use tikv::{ config::{ConfigController, TikvConfig}, storage::{ - lock_manager::DummyLockManager, RocksEngine, StorageApiV1, TestEngineBuilder, + lock_manager::MockLockManager, RocksEngine, StorageApiV1, TestEngineBuilder, TestStorageBuilderApiV1, }, }; @@ -32,7 +32,7 @@ pub struct TestSuite { pubsub_server_port: u16, receiver_server: Option, - storage: StorageApiV1, + storage: StorageApiV1, cfg_controller: ConfigController, resource_tag_factory: ResourceTagFactory, @@ -84,10 +84,11 @@ impl TestSuite { ); let engine = TestEngineBuilder::new().build().unwrap(); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, DummyLockManager) - .set_resource_tag_factory(resource_tag_factory.clone()) - .build() - .unwrap(); + let storage = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .set_resource_tag_factory(resource_tag_factory.clone()) + .build() + .unwrap(); let (tx, rx) = unbounded(); @@ -118,7 +119,7 @@ impl TestSuite { } } - pub fn get_storage(&self) -> StorageApiV1 { + pub fn get_storage(&self) -> StorageApiV1 { self.storage.clone() } From 0f5058ebcc489bc28f997b93765862d79312b5c0 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 27 Oct 2022 14:59:57 +0800 Subject: [PATCH 289/676] raftstore: make sure PrepareFlashback will get the latest region meta (#13664) close tikv/tikv#13643 * Use `self.region` while executing `PrepareFlashback` command to ensure it gets the latest region meta. * Check the epoch before executing the `PrepareFlashback`. 
Signed-off-by: JmPotato --- components/raftstore/src/store/fsm/apply.rs | 39 +++-- components/raftstore/src/store/metrics.rs | 4 +- components/raftstore/src/store/util.rs | 6 +- components/test_raftstore/src/cluster.rs | 81 +++++----- .../integrations/raftstore/test_flashback.rs | 146 +++++++++++++++--- 5 files changed, 199 insertions(+), 77 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a9124dc2faf..d3eb7f86461 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1392,7 +1392,7 @@ where ExecResult::CommitMerge { ref region, .. } => (Some(region.clone()), None), ExecResult::RollbackMerge { ref region, .. } => (Some(region.clone()), None), ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), - ExecResult::SetFlashbackState { region } => (Some(region.clone()), None), + ExecResult::SetFlashbackState { ref region } => (Some(region.clone()), None), _ => (None, None), }, _ => (None, None), @@ -2829,30 +2829,27 @@ where ctx: &mut ApplyContext, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { - let region_id = self.region_id(); - let region_state_key = keys::region_state_key(region_id); - let mut old_state = match ctx - .engine - .get_msg_cf::(CF_RAFT, ®ion_state_key) - { - Ok(Some(s)) => s, - _ => { - return Err(box_err!("failed to get region state of {}", region_id)); - } - }; let is_in_flashback = req.get_cmd_type() == AdminCmdType::PrepareFlashback; - old_state.mut_region().set_is_in_flashback(is_in_flashback); + // Modify the region meta in memory. let mut region = self.region.clone(); region.set_is_in_flashback(is_in_flashback); - ctx.kv_wb_mut() - .put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &old_state) - .unwrap_or_else(|e| { - error!( - "{} failed to change flashback state to {:?} for region {}: {:?}", - self.tag, req, region_id, e - ) - }); + // Modify the `RegionLocalState` persisted in disk. 
+ write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { + panic!( + "{} failed to change the flashback state to {} for region {:?}: {:?}", + self.tag, is_in_flashback, region, e + ) + }); + match req.get_cmd_type() { + AdminCmdType::PrepareFlashback => { + PEER_ADMIN_CMD_COUNTER.prepare_flashback.success.inc(); + } + AdminCmdType::FinishFlashback => { + PEER_ADMIN_CMD_COUNTER.finish_flashback.success.inc(); + } + _ => unreachable!(), + } Ok(( AdminResponse::default(), ApplyResult::Res(ExecResult::SetFlashbackState { region }), diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 14d8d7e97cc..7ab47cc90c6 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -33,7 +33,9 @@ make_auto_flush_static_metric! { commit_merge, rollback_merge, compact, - transfer_leader + transfer_leader, + prepare_flashback, + finish_flashback } pub label_enum AdminCmdStatus { diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 61da5805727..9f49730e1d0 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -193,8 +193,12 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::RollbackMerge => AdminCmdEpochState::new(true, true, true, false), // Transfer leader AdminCmdType::TransferLeader => AdminCmdEpochState::new(true, true, false, false), + // PrepareFlashback could be committed successfully before a split being applied, so we need + // to check the epoch to make sure it's sent to a correct key range. + // NOTICE: FinishFlashback will never meet the epoch not match error since any scheduling + // before it's forbidden. 
AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { - AdminCmdEpochState::new(false, false, false, false) + AdminCmdEpochState::new(true, true, false, false) } } } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index c097b22222d..c4ac98180a6 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -22,7 +22,7 @@ use file_system::IoRateLimiter; use futures::{self, channel::oneshot, executor::block_on}; use kvproto::{ errorpb::Error as PbError, - kvrpcpb::{ApiVersion, Context}, + kvrpcpb::{ApiVersion, Context, DiskFullOpt}, metapb::{self, Buckets, PeerRole, RegionEpoch, StoreLabel}, pdpb::{self, CheckPolicy, StoreReport}, raft_cmdpb::*, @@ -1438,14 +1438,47 @@ impl Cluster { .unwrap(); } - pub fn must_send_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { - self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + pub fn must_send_flashback_msg( + &mut self, + region_id: u64, + cmd_type: AdminCmdType, + cb: Callback, + ) { let leader = self.leader_of_region(region_id).unwrap(); let store_id = leader.get_store_id(); let region_epoch = self.get_region_epoch(region_id); - block_on(async move { - let (result_tx, result_rx) = oneshot::channel(); - let cb = Callback::write(Box::new(move |resp| { + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(region_epoch); + req.mut_header().set_peer(leader); + req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let router = self.sim.rl().get_router(store_id).unwrap(); + if let Err(e) = router.send_command( + req, + cb, + RaftCmdExtraOpts { + deadline: None, + disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, + }, + ) { + panic!( + "router send flashback msg {:?} failed, error: {}", + cmd_type, e + ); + 
} + } + + pub fn must_send_wait_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { + self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + let (result_tx, result_rx) = oneshot::channel(); + self.must_send_flashback_msg( + region_id, + cmd_type, + Callback::write(Box::new(move |resp| { if resp.response.get_header().has_error() { result_tx .send(Some(resp.response.get_header().get_error().clone())) @@ -1453,38 +1486,14 @@ impl Cluster { return; } result_tx.send(None).unwrap(); - })); - - let mut admin = AdminRequest::default(); - admin.set_cmd_type(cmd_type); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header().set_region_epoch(region_epoch); - req.mut_header().set_peer(leader); - req.set_admin_request(admin); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let router = self.sim.rl().get_router(store_id).unwrap(); - if let Err(e) = router.send_command( - req, - cb, - RaftCmdExtraOpts { - deadline: None, - disk_full_opt: kvproto::kvrpcpb::DiskFullOpt::AllowedOnAlmostFull, - }, - ) { - panic!( - "router send flashback msg {:?} failed, error: {}", - cmd_type, e - ); - } - if let Some(e) = result_rx.await.unwrap() { - panic!("call flashback msg {:?} failed, error: {:?}", cmd_type, e); - } - }); + })), + ); + if let Some(e) = block_on(result_rx).unwrap() { + panic!("call flashback msg {:?} failed, error: {:?}", cmd_type, e); + } } - fn wait_applied_to_current_term(&mut self, region_id: u64, timeout: Duration) { + pub fn wait_applied_to_current_term(&mut self, region_id: u64, timeout: Duration) { let mut now = Instant::now(); let deadline = now + timeout; while now < deadline { diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 810da9d840f..5227e7ea6bc 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -5,13 +5,123 @@ use std::{ 
time::{Duration, Instant}, }; +use futures::{channel::oneshot, executor::block_on}; use kvproto::{ + errorpb::FlashbackInProgress, metapb, raft_cmdpb::{AdminCmdType, CmdType, Request}, }; +use raftstore::store::Callback; use test_raftstore::*; use txn_types::WriteBatchFlags; +#[test] +fn test_prepare_flashback_after_split() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let old_region = cluster.get_region(b"a"); + cluster.wait_applied_to_current_term(old_region.get_id(), Duration::from_secs(3)); + // Pause the apply to make sure the split cmd and prepare flashback cmd are in + // the same batch. + let on_handle_apply_fp = "on_handle_apply"; + fail::cfg(on_handle_apply_fp, "pause").unwrap(); + // Send the split msg. + cluster.split_region( + &old_region, + b"b", + Callback::write(Box::new(|resp| { + if resp.response.get_header().has_error() { + panic!("split failed: {:?}", resp.response.get_header().get_error()); + } + })), + ); + // Make sure the admin split cmd is ready. + sleep(Duration::from_millis(100)); + // Send the prepare flashback msg. + let (result_tx, result_rx) = oneshot::channel(); + cluster.must_send_flashback_msg( + old_region.get_id(), + AdminCmdType::PrepareFlashback, + Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx + .send(Some(resp.response.get_header().get_error().clone())) + .unwrap(); + return; + } + result_tx.send(None).unwrap(); + })), + ); + // Remove the pause to make these two commands are in the same batch to apply. + fail::remove(on_handle_apply_fp); + let prepare_flashback_err = block_on(result_rx).unwrap().unwrap(); + assert!( + prepare_flashback_err.has_epoch_not_match(), + "prepare flashback should fail with epoch not match, but got {:?}", + prepare_flashback_err + ); + // Check the region meta. 
+ let left_region = cluster.get_region(b"a"); + let right_region = cluster.get_region(b"b"); + assert!(left_region.get_id() != old_region.get_id()); + assert!(left_region.get_end_key() == right_region.get_start_key()); + assert!( + left_region.get_region_epoch().get_version() + == right_region.get_region_epoch().get_version() + ); + must_check_flashback_state(&mut cluster, left_region.get_id(), 1, false); + must_check_flashback_state(&mut cluster, right_region.get_id(), 1, false); +} + +#[test] +fn test_prepare_flashback_after_conf_change() { + let mut cluster = new_node_cluster(0, 3); + // Disable default max peer count check. + cluster.pd_client.disable_default_operator(); + + let region_id = cluster.run_conf_change(); + cluster.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + // Pause the apply to make sure the conf change cmd and prepare flashback cmd + // are in the same batch. + let on_handle_apply_fp = "on_handle_apply"; + fail::cfg(on_handle_apply_fp, "pause").unwrap(); + // Send the conf change msg. + cluster.async_add_peer(region_id, new_peer(2, 2)).unwrap(); + // Make sure the conf change cmd is ready. + sleep(Duration::from_millis(100)); + // Send the prepare flashback msg. + let (result_tx, result_rx) = oneshot::channel(); + cluster.must_send_flashback_msg( + region_id, + AdminCmdType::PrepareFlashback, + Callback::write(Box::new(move |resp| { + if resp.response.get_header().has_error() { + result_tx + .send(Some(resp.response.get_header().get_error().clone())) + .unwrap(); + return; + } + result_tx.send(None).unwrap(); + })), + ); + // Remove the pause to make these two commands are in the same batch to apply. + fail::remove(on_handle_apply_fp); + let prepare_flashback_err = block_on(result_rx).unwrap().unwrap(); + assert!( + prepare_flashback_err.has_epoch_not_match(), + "prepare flashback should fail with epoch not match, but got {:?}", + prepare_flashback_err + ); + // Check the region meta. 
+ let region = cluster.get_region(b"a"); + assert!(region.get_id() == region_id); + assert!(region.get_peers().len() == 2); + must_check_flashback_state(&mut cluster, region_id, 1, false); +} + #[test] fn test_flashback_unprepared() { let mut cluster = new_node_cluster(0, 3); @@ -47,7 +157,7 @@ fn test_flashback_for_schedule() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Verify the schedule is disabled. let mut region = cluster.get_region(b"k3"); @@ -60,13 +170,13 @@ fn test_flashback_for_schedule() { let e = resp.get_header().get_error(); assert_eq!( e.get_flashback_in_progress(), - &kvproto::errorpb::FlashbackInProgress { + &FlashbackInProgress { region_id: region.get_id(), ..Default::default() } ); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); // Transfer leader to (2, 2) should succeed. 
cluster.must_transfer_leader(1, new_peer(2, 2)); } @@ -83,7 +193,7 @@ fn test_flashback_for_write() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Write will be blocked let value = vec![1_u8; 8096]; @@ -95,7 +205,7 @@ fn test_flashback_for_write() { new_put_cmd(b"k1", &value), ); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); } @@ -114,7 +224,7 @@ fn test_flashback_for_read() { // Prepare for flashback let region = cluster.get_region(b"k1"); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // read will be blocked must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); @@ -127,7 +237,7 @@ fn test_flashback_for_read() { new_get_cf_cmd("write", b"k1"), ); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); } @@ -162,7 +272,7 @@ fn test_flashback_for_local_read() { assert_eq!(state.get_last_index(), last_index); // Prepare for flashback - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); // Check the leader does a local read. 
let state = cluster.raft_local_state(region.get_id(), store_id); @@ -193,7 +303,7 @@ fn test_flashback_for_local_read() { let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 2); @@ -217,7 +327,7 @@ fn test_flashback_for_status_cmd_as_region_detail() { let leader = cluster.leader_of_region(1).unwrap(); let region = cluster.get_region(b"k1"); - cluster.must_send_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); assert!(region_detail.has_region()); @@ -243,10 +353,10 @@ fn test_flashback_for_check_is_in_persist() { must_check_flashback_state(&mut cluster, 1, 2, false); // Prepare for flashback - cluster.must_send_flashback_msg(1, AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); must_check_flashback_state(&mut cluster, 1, 2, true); - cluster.must_send_flashback_msg(1, AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); must_check_flashback_state(&mut cluster, 1, 2, false); } @@ -273,7 +383,7 @@ fn test_flashback_for_apply_snapshot() { } // Prepare for flashback - cluster.must_send_flashback_msg(1, AdminCmdType::PrepareFlashback); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); must_check_flashback_state(&mut cluster, 1, 1, true); must_check_flashback_state(&mut cluster, 1, 3, false); @@ -282,7 +392,7 @@ fn test_flashback_for_apply_snapshot() { must_check_flashback_state(&mut cluster, 1, 1, true); 
must_check_flashback_state(&mut cluster, 1, 3, true); - cluster.must_send_flashback_msg(1, AdminCmdType::FinishFlashback); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); must_check_flashback_state(&mut cluster, 1, 1, false); must_check_flashback_state(&mut cluster, 1, 3, false); } @@ -310,7 +420,7 @@ fn must_check_flashback_state( ); } -fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb::Request) { +fn multi_do_cmd(cluster: &mut Cluster, cmd: Request) { for _ in 0..100 { let mut reqs = vec![]; for _ in 0..100 { @@ -323,7 +433,7 @@ fn multi_do_cmd(cluster: &mut Cluster, cmd: kvproto::raft_cmdpb fn must_do_cmd_with_flashback_flag( cluster: &mut Cluster, region: &mut metapb::Region, - cmd: kvproto::raft_cmdpb::Request, + cmd: Request, ) { // Verify the read can be executed if add flashback flag in request's // header. @@ -344,7 +454,7 @@ fn must_do_cmd_with_flashback_flag( fn must_get_error_flashback_in_progress( cluster: &mut Cluster, region: &metapb::Region, - cmd: kvproto::raft_cmdpb::Request, + cmd: Request, ) { for _ in 0..100 { let mut reqs = vec![]; @@ -356,7 +466,7 @@ fn must_get_error_flashback_in_progress( Err(e) => { assert_eq!( e.get_flashback_in_progress(), - &kvproto::errorpb::FlashbackInProgress { + &FlashbackInProgress { region_id: region.get_id(), ..Default::default() } From b7880cdd1447e92bca7e6e68f0ab41e2fefcce45 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 28 Oct 2022 14:33:58 +0800 Subject: [PATCH 290/676] cdc, resolved_ts: reduce allocation in leadership resolver (#13666) close tikv/tikv#13665 cdc, resolved_ts: reduce allocation in leadership resolver Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/Cargo.toml | 1 - components/cdc/src/endpoint.rs | 38 +++-- components/raftstore/src/store/util.rs | 37 +++-- components/resolved_ts/src/advance.rs | 214 ++++++++++++++----------- components/resolved_ts/src/endpoint.rs | 3 +- components/resolved_ts/src/lib.rs | 1 - 
components/resolved_ts/src/util.rs | 12 -- 7 files changed, 181 insertions(+), 125 deletions(-) delete mode 100644 components/resolved_ts/src/util.rs diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 62ef4cc29f5..27ce81c57b4 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -26,7 +26,6 @@ portable = ["tikv/portable"] sse = ["tikv/sse"] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] -pprof-fp = ["tikv/pprof-fp"] [dependencies] api_version = { workspace = true } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 614e282a5d9..8aa6aad3c29 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -40,7 +40,7 @@ use tikv_util::{ mpsc::bounded, slow_log, sys::thread::ThreadBuildWrapper, - time::{Limiter, SlowTimer}, + time::{Instant, Limiter, SlowTimer}, timer::SteadyTimer, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, @@ -154,6 +154,8 @@ pub enum Task { }, RegisterMinTsEvent { leader_resolver: LeadershipResolver, + // The time at which the event actually occurred. + event_time: Instant, }, // The result of ChangeCmd should be returned from CDC Endpoint to ensure // the downstream switches to Normal after the previous commands was sunk. @@ -222,7 +224,9 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("region_id", ®ion.get_id()) .finish(), - Task::RegisterMinTsEvent { .. } => de.field("type", &"register_min_ts").finish(), + Task::RegisterMinTsEvent { ref event_time, .. } => { + de.field("event_time", &event_time).finish() + } Task::InitDownstream { ref region_id, ref downstream_id, @@ -447,13 +451,12 @@ impl, E: KvEngine> Endpoint { resolved_region_count: 0, unresolved_region_count: 0, sink_memory_quota, - // store_resolver, // Log the first resolved ts warning. 
warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, current_ts: TimeStamp::zero(), causal_ts_provider, }; - ep.register_min_ts_event(leader_resolver); + ep.register_min_ts_event(leader_resolver, Instant::now()); ep } @@ -996,8 +999,16 @@ impl, E: KvEngine> Endpoint { let _ = downstream.sink_event(resolved_ts_event, force_send); } - fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver) { - let timeout = self.timer.delay(self.config.min_ts_interval.0); + fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver, event_time: Instant) { + // Try to keep advance resolved ts every `min_ts_interval`, thus + // the actual wait interval = `min_ts_interval` - the last register min_ts event + // time. + let interval = self + .config + .min_ts_interval + .0 + .checked_sub(event_time.saturating_elapsed()); + let timeout = self.timer.delay(interval.unwrap_or_default()); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let raft_router = self.raft_router.clone(); @@ -1043,7 +1054,10 @@ impl, E: KvEngine> Endpoint { defer!({ slow_log!(T slow_timer, "cdc resolve region leadership"); if let Ok(leader_resolver) = leader_resolver_rx.try_recv() { - match scheduler.schedule(Task::RegisterMinTsEvent { leader_resolver }) { + match scheduler.schedule(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }) { Ok(_) | Err(ScheduleError::Stopped(_)) => (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. 
@@ -1129,8 +1143,9 @@ impl, E: KvEngine> Runnable for Endpoint { } => self.on_multi_batch(multi, old_value_cb), Task::OpenConn { conn } => self.on_open_conn(conn), Task::RegisterMinTsEvent { - leader_resolver: store_resolver, - } => self.register_min_ts_event(store_resolver), + leader_resolver, + event_time, + } => self.register_min_ts_event(leader_resolver, event_time), Task::InitDownstream { region_id, downstream_id, @@ -1871,7 +1886,10 @@ mod tests { let mut suite = mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, Some(ts_provider.clone())); let leader_resolver = suite.leader_resolver.take().unwrap(); - suite.run(Task::RegisterMinTsEvent { leader_resolver }); + suite.run(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }); suite .task_rx .recv_timeout(Duration::from_millis(1500)) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 9f49730e1d0..1e571296e1a 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1215,11 +1215,11 @@ impl RegionReadProgress { } // Dump the `LeaderInfo` and the peer list - pub fn dump_leader_info(&self) -> (Vec, LeaderInfo) { + pub fn dump_leader_info(&self) -> (LeaderInfo, Option) { let core = self.core.lock().unwrap(); ( - core.get_local_leader_info().peers.clone(), core.get_leader_info(), + core.get_local_leader_info().leader_store_id, ) } @@ -1231,6 +1231,8 @@ impl RegionReadProgress { core.leader_info.epoch = region.get_region_epoch().clone(); core.leader_info.peers = region.get_peers().to_vec(); } + core.leader_info.leader_store_id = + find_store_id(&core.leader_info.peers, core.leader_info.leader_id) } /// Reset `safe_ts` to 0 and stop updating it @@ -1308,6 +1310,7 @@ pub struct ReadState { pub struct LocalLeaderInfo { leader_id: u64, leader_term: u64, + leader_store_id: Option, epoch: RegionEpoch, peers: Vec, } @@ -1317,6 +1320,7 @@ impl LocalLeaderInfo { LocalLeaderInfo { leader_id: 
raft::INVALID_ID, leader_term: 0, + leader_store_id: None, epoch: region.get_region_epoch().clone(), peers: region.get_peers().to_vec(), } @@ -1329,6 +1333,19 @@ impl LocalLeaderInfo { pub fn get_leader_id(&self) -> u64 { self.leader_id } + + pub fn get_leader_store_id(&self) -> Option { + self.leader_store_id + } +} + +fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { + for peer in peer_list { + if peer.id == peer_id { + return Some(peer.store_id); + } + } + None } impl RegionReadProgressCore { @@ -1444,7 +1461,6 @@ impl RegionReadProgressCore { } pub fn get_leader_info(&self) -> LeaderInfo { - let mut leader_info = LeaderInfo::default(); let read_state = { // Get the latest `read_state` let ReadState { idx, ts } = self.pending_items.back().unwrap_or(&self.read_state); @@ -1454,12 +1470,15 @@ impl RegionReadProgressCore { rs }; let li = &self.leader_info; - leader_info.set_peer_id(li.leader_id); - leader_info.set_term(li.leader_term); - leader_info.set_region_id(self.region_id); - leader_info.set_region_epoch(li.epoch.clone()); - leader_info.set_read_state(read_state); - leader_info + LeaderInfo { + peer_id: li.leader_id, + region_id: self.region_id, + term: li.leader_term, + region_epoch: protobuf::SingularPtrField::some(li.epoch.clone()), + read_state: protobuf::SingularPtrField::some(read_state), + unknown_fields: protobuf::UnknownFields::default(), + cached_size: protobuf::CachedSize::default(), + } } pub fn get_local_leader_info(&self) -> &LocalLeaderInfo { diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 35426f4861d..496c5c8fab8 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -14,9 +14,9 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use fail::fail_point; use futures::{compat::Future01CompatExt, future::select_all, FutureExt, TryFutureExt}; -use grpcio::{ChannelBuilder, Environment}; +use grpcio::{ChannelBuilder, 
Environment, Error as GrpcError, RpcStatusCode}; use kvproto::{ - kvrpcpb::{CheckLeaderRequest, LeaderInfo}, + kvrpcpb::{CheckLeaderRequest, CheckLeaderResponse}, metapb::{Peer, PeerRole}, tikvpb::TikvClient, }; @@ -43,7 +43,7 @@ use tokio::{ }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*, util}; +use crate::{endpoint::Task, metrics::*}; const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s @@ -143,11 +143,11 @@ pub struct LeadershipResolver { region_read_progress: RegionReadProgressRegistry, store_id: u64, - // store_id -> leaders info, record the request to each stores - store_map: HashMap>, - // region_id -> region, cache the information of regions + // store_id -> check leader request, record the request to each stores. + store_req_map: HashMap, + // region_id -> region, cache the information of regions. region_map: HashMap>, - // region_id -> peers id, record the responses + // region_id -> peers id, record the responses. resp_map: HashMap>, valid_regions: HashSet, @@ -172,7 +172,7 @@ impl LeadershipResolver { security_mgr, region_read_progress, - store_map: HashMap::default(), + store_req_map: HashMap::default(), region_map: HashMap::default(), resp_map: HashMap::default(), valid_regions: HashSet::default(), @@ -184,7 +184,7 @@ impl LeadershipResolver { fn gc(&mut self) { let now = Instant::now_coarse(); if now - self.last_gc_time > self.gc_interval { - self.store_map = HashMap::default(); + self.store_req_map = HashMap::default(); self.region_map = HashMap::default(); self.resp_map = HashMap::default(); self.valid_regions = HashSet::default(); @@ -193,9 +193,16 @@ impl LeadershipResolver { } fn clear(&mut self) { - self.store_map.clear(); - self.region_map.clear(); - self.resp_map.clear(); + for v in self.store_req_map.values_mut() { + v.regions.clear(); + v.ts = 0; + } + for v in self.region_map.values_mut() { + v.clear(); + } + for v in self.resp_map.values_mut() { + v.clear(); + } self.valid_regions.clear(); } @@ -241,7 
+248,7 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its // leadership. - pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve(&mut self, _regions: Vec, min_ts: TimeStamp) -> Vec { // Clear previous result before resolving. self.clear(); // GC when necessary to prevent memory leak. @@ -249,21 +256,22 @@ impl LeadershipResolver { PENDING_RTS_COUNT.inc(); defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| regions.clone()); + fail_point!("before_sync_replica_read_state", |_| _regions.clone()); let store_id = self.store_id; let valid_regions = &mut self.valid_regions; let region_map = &mut self.region_map; let resp_map = &mut self.resp_map; - let store_map = &mut self.store_map; + let store_req_map = &mut self.store_req_map; self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { let core = read_progress.get_core(); let local_leader_info = core.get_local_leader_info(); let leader_id = local_leader_info.get_leader_id(); + let leader_store_id = local_leader_info.get_leader_store_id(); let peer_list = local_leader_info.get_peers(); // Check if the leader in this store - if util::find_store_id(peer_list, leader_id) != Some(store_id) { + if leader_store_id != Some(store_id) { continue; } let leader_info = core.get_leader_info(); @@ -271,13 +279,21 @@ impl LeadershipResolver { let mut unvotes = 0; for peer in peer_list { if peer.store_id == store_id && peer.id == leader_id { - resp_map.entry(*region_id).or_default().push(store_id); + resp_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .push(store_id); } else { // It's still necessary to check leader on learners even if they don't vote // because performing stale read on learners require it. 
- store_map + store_req_map .entry(peer.store_id) - .or_default() + .or_insert_with(|| { + let mut req = CheckLeaderRequest::default(); + req.regions = Vec::with_capacity(registry.len()).into(); + req + }) + .regions .push(leader_info.clone()); if peer.get_role() != PeerRole::Learner { unvotes += 1; @@ -289,7 +305,10 @@ impl LeadershipResolver { if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { valid_regions.insert(*region_id); } else { - region_map.insert(*region_id, peer_list.to_vec()); + region_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .extend_from_slice(peer_list); } } }); @@ -299,62 +318,69 @@ impl LeadershipResolver { let security_mgr = &self.security_mgr; let tikv_clients = &self.tikv_clients; // Approximate `LeaderInfo` size - let leader_info_size = store_map + let leader_info_size = store_req_map .values() - .next() - .map_or(0, |regions| regions[0].compute_size()); - let store_count = store_map.len(); - let mut stores: Vec<_> = store_map - .drain() - .map(|(to_store, regions)| { - let env = env.clone(); - let region_num = regions.len() as u32; - CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); - CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); - - // Check leadership for `regions` on `to_store`. 
- async move { - PENDING_CHECK_LEADER_REQ_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = - get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) - .await - .map_err(|e| { - (to_store, e.retryable(), format!("[get tikv client] {}", e)) - })?; - - let mut req = CheckLeaderRequest::default(); - req.set_regions(regions.into()); - req.set_ts(min_ts.into_inner()); - let slow_timer = SlowTimer::default(); - defer!({ - slow_log!( - T - slow_timer, - "check leader rpc costs too long, to_store: {}", - to_store - ); - let elapsed = slow_timer.saturating_elapsed(); - RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC - .with_label_values(&["rpc"]) - .observe(elapsed.as_secs_f64()); - }); - - let rpc = client - .check_leader_async(&req) - .map_err(|e| (to_store, true, format!("[rpc create failed]{}", e)))?; - PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); - let resp = tokio::time::timeout(timeout, rpc) - .map_err(|e| (to_store, true, format!("[timeout] {}", e))) - .await? - .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; - Ok((to_store, resp)) - } - .boxed() - }) - .collect(); + .find(|req| !req.regions.is_empty()) + .map_or(0, |req| req.regions[0].compute_size()); + let store_count = store_req_map.len(); + let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); + for (store_id, req) in store_req_map { + if req.regions.is_empty() { + continue; + } + let env = env.clone(); + let to_store = *store_id; + let region_num = req.regions.len() as u32; + CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); + CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); + + // Check leadership for `regions` on `to_store`. 
+ let rpc = async move { + PENDING_CHECK_LEADER_REQ_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); + let client = get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) + .await + .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; + + // Set min_ts in the request. + req.set_ts(min_ts.into_inner()); + let slow_timer = SlowTimer::default(); + defer!({ + slow_log!( + T + slow_timer, + "check leader rpc costs too long, to_store: {}", + to_store + ); + let elapsed = slow_timer.saturating_elapsed(); + RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC + .with_label_values(&["rpc"]) + .observe(elapsed.as_secs_f64()); + }); + + let rpc = match client.check_leader_async(req) { + Ok(rpc) => rpc, + Err(GrpcError::RpcFailure(status)) + if status.code() == RpcStatusCode::UNIMPLEMENTED => + { + // Some stores like TiFlash don't implement it. + return Ok((to_store, CheckLeaderResponse::default())); + } + Err(e) => return Err((to_store, true, format!("[rpc create failed]{}", e))), + }; + + PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); + let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let resp = tokio::time::timeout(timeout, rpc) + .map_err(|e| (to_store, true, format!("[timeout] {}", e))) + .await? + .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; + Ok((to_store, resp)) + } + .boxed(); + check_leader_rpcs.push(rpc); + } let start = Instant::now_coarse(); defer!({ @@ -362,21 +388,19 @@ impl LeadershipResolver { .with_label_values(&["all"]) .observe(start.saturating_elapsed_secs()); }); - for _ in 0..store_count { + let rpc_count = check_leader_rpcs.len(); + for _ in 0..rpc_count { // Use `select_all` to avoid the process getting blocked when some // TiKVs were down. 
- let (res, _, remains) = select_all(stores).await; - stores = remains; + let (res, _, remains) = select_all(check_leader_rpcs).await; + check_leader_rpcs = remains; match res { Ok((to_store, resp)) => { for region_id in resp.regions { - if let Some(r) = region_map.get(®ion_id) { - let resps = resp_map.entry(region_id).or_default(); - resps.push(to_store); - if region_has_quorum(r, resps) { - valid_regions.insert(region_id); - } - } + resp_map + .entry(region_id) + .or_insert_with(|| Vec::with_capacity(store_count)) + .push(to_store); } } Err((to_store, reconnect, err)) => { @@ -386,11 +410,21 @@ impl LeadershipResolver { } } } - // Return early if all regions had already got quorum. - if valid_regions.len() == regions.len() { - // break here because all regions have quorum, - // so there is no need waiting for other stores to respond. - break; + } + for (region_id, prs) in region_map { + if prs.is_empty() { + // The peer had the leadership before, but now it's no longer + // the case. Skip checking the region. + continue; + } + if let Some(resp) = resp_map.get(region_id) { + if resp.is_empty() { + // No response, maybe the peer lost leadership. 
+ continue; + } + if region_has_quorum(prs, resp) { + valid_regions.insert(*region_id); + } } } self.valid_regions.drain().collect() diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 480c0ee6896..4f957e8266d 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -788,8 +788,7 @@ where let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { - let (peers, leader_info) = read_progress.dump_leader_info(); - let leader_store_id = crate::util::find_store_id(&peers, leader_info.peer_id); + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); let ts = leader_info.get_read_state().get_safe_ts(); if ts == 0 { zero_ts_count += 1; diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index 5ad2941dde2..5d4e233808d 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -37,4 +37,3 @@ mod scanner; pub use scanner::*; mod metrics; pub use metrics::*; -mod util; diff --git a/components/resolved_ts/src/util.rs b/components/resolved_ts/src/util.rs deleted file mode 100644 index 11bc1c547a0..00000000000 --- a/components/resolved_ts/src/util.rs +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -use kvproto::metapb::Peer; - -pub fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { - for peer in peer_list { - if peer.id == peer_id { - return Some(peer.store_id); - } - } - None -} From d03290473d8116d4c676c42fd6780b198d8ce7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Fri, 28 Oct 2022 15:19:58 +0800 Subject: [PATCH 291/676] log-backup: added `Debug` output of error to report log (#13686) close tikv/tikv#13685 `report` would now print the `Debug` version of error. 
Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/errors.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index 493cf28babc..b34e7126360 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -132,14 +132,14 @@ macro_rules! annotate { impl Error { pub fn report(&self, context: impl Display) { - warn!("backup stream meet error"; "context" => %context, "err" => %self); + warn!("backup stream meet error"; "context" => %context, "err" => %self, "verbose_err" => ?self); metrics::STREAM_ERROR .with_label_values(&[self.kind()]) .inc() } pub fn report_fatal(&self) { - error!(%self; "backup stream meet fatal error"); + error!(%self; "backup stream meet fatal error"; "verbose" => ?self, ); metrics::STREAM_FATAL_ERROR .with_label_values(&[self.kind()]) .inc() From fe61a03438d91bb1739e8e95db66c9927254d62f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:05:58 +0800 Subject: [PATCH 292/676] log-backup: fix slow flush in GCP (#13674) close tikv/tikv#13688, close pingcap/tidb#38642 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/cloud/gcp/Cargo.toml | 2 + components/cloud/gcp/src/gcs.rs | 157 ++++++++++++++++++++++++++------ components/cloud/gcp/src/lib.rs | 19 ++++ 4 files changed, 152 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6b25808098..49e6184ab5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2149,6 +2149,7 @@ dependencies = [ "hyper-tls", "kvproto", "matches", + "pin-project", "slog", "slog-global", "tame-gcs", diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index f184377c0af..f0446fa096d 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -23,3 +23,5 @@ url = "2.0" 
[dev-dependencies] matches = "0.1.8" +pin-project = "1" +tokio = { version = "1.5", features = ["rt"] } diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index e8e8ad20ee9..01f69a6d245 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -2,12 +2,13 @@ use std::{convert::TryInto, fmt::Display, io, sync::Arc}; use async_trait::async_trait; -use cloud::blob::{ - none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, +use cloud::{ + blob::{none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty}, + metrics, }; use futures_util::{ future::TryFutureExt, - io::{AsyncRead, AsyncReadExt, Cursor}, + io::{self as async_io, AsyncRead, Cursor}, stream::{StreamExt, TryStreamExt}, }; use http::HeaderValue; @@ -20,7 +21,12 @@ use tame_gcs::{ types::{BucketName, ObjectId}, }; use tame_oauth::gcp::{ServiceAccountAccess, ServiceAccountInfo, TokenOrRequest}; -use tikv_util::stream::{error_stream, retry, AsyncReadAsSyncStreamOfBytes, RetryError}; +use tikv_util::{ + stream::{error_stream, AsyncReadAsSyncStreamOfBytes, RetryError}, + time::Instant, +}; + +use crate::utils::retry; const GOOGLE_APIS: &str = "https://www.googleapis.com"; const HARDCODED_ENDPOINTS_SUFFIX: &[&str] = &["upload/storage/v1/", "storage/v1/"]; @@ -156,6 +162,7 @@ impl ResultExt for Result { } } +#[derive(Debug)] enum RequestError { Hyper(hyper::Error, String), OAuth(tame_oauth::Error, String), @@ -433,6 +440,14 @@ fn parse_predefined_acl(acl: &str) -> Result, &str> { })) } +/// Like AsyncReadExt::read_to_end, but only try to initialize the buffer once. +/// Check https://github.com/rust-lang/futures-rs/issues/2658 for the reason we cannot +/// directly use it. 
+async fn read_to_end(r: R, v: &mut Vec) -> std::io::Result { + let mut c = Cursor::new(v); + async_io::copy(r, &mut c).await +} + const STORAGE_NAME: &str = "gcs"; #[async_trait] @@ -441,12 +456,7 @@ impl BlobStorage for GcsStorage { Box::new(self.config.clone()) as Box } - async fn put( - &self, - name: &str, - mut reader: PutResource, - content_length: u64, - ) -> io::Result<()> { + async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()> { if content_length == 0 { // It is probably better to just write the empty file // However, currently going forward results in a body write aborted error @@ -470,25 +480,36 @@ impl BlobStorage for GcsStorage { // FIXME: Switch to upload() API so we don't need to read the entire data into // memory in order to retry. + let begin = Instant::now_coarse(); let mut data = Vec::with_capacity(content_length as usize); - reader.read_to_end(&mut data).await?; - retry(|| async { - let data = Cursor::new(data.clone()); - let req = Object::insert_multipart( - &bucket, - data, - content_length, - &metadata, - Some(InsertObjectOptional { - predefined_acl: self.config.predefined_acl, - ..Default::default() - }), - ) - .map_err(RequestError::Gcs)? - .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); - self.make_request(req, tame_gcs::Scopes::ReadWrite).await - }) + read_to_end(reader, &mut data).await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "read_local"]) + .observe(begin.saturating_elapsed_secs()); + let begin = Instant::now_coarse(); + retry( + || async { + let data = Cursor::new(data.clone()); + let req = Object::insert_multipart( + &bucket, + data, + content_length, + &metadata, + Some(InsertObjectOptional { + predefined_acl: self.config.predefined_acl, + ..Default::default() + }), + ) + .map_err(RequestError::Gcs)? 
+ .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); + self.make_request(req, tame_gcs::Scopes::ReadWrite).await + }, + "insert_multipart", + ) .await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "insert_multipart"]) + .observe(begin.saturating_elapsed_secs()); Ok::<_, io::Error>(()) } @@ -504,6 +525,10 @@ impl BlobStorage for GcsStorage { #[cfg(test)] mod tests { + extern crate test; + use std::task::Poll; + + use futures_util::AsyncReadExt; use matches::assert_matches; use super::*; @@ -605,6 +630,84 @@ mod tests { assert_eq!(c1.bucket.prefix, c2.bucket.prefix); } + enum ThrottleReadState { + Spawning, + Emitting, + } + /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each + /// `read` call. This is copy & paste from the implmentation from s3.rs. + #[pin_project::pin_project] + struct ThrottleRead { + #[pin] + inner: R, + state: ThrottleReadState, + } + impl AsyncRead for ThrottleRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let this = self.project(); + match this.state { + ThrottleReadState::Spawning => { + *this.state = ThrottleReadState::Emitting; + cx.waker().wake_by_ref(); + Poll::Pending + } + ThrottleReadState::Emitting => { + *this.state = ThrottleReadState::Spawning; + this.inner.poll_read(cx, &mut buf[..2]) + } + } + } + } + impl ThrottleRead { + fn new(r: R) -> Self { + Self { + inner: r, + state: ThrottleReadState::Spawning, + } + } + } + + const BENCH_READ_SIZE: usize = 128 * 1024; + + // 255,120,895 ns/iter (+/- 73,332,249) (futures-util 0.3.15) + #[bench] + fn bench_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + b.iter(|| { + let mut r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(r.read_to_end(&mut dst)).unwrap(); 
+ assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + + // 5,850,042 ns/iter (+/- 3,787,438) + #[bench] + fn bench_manual_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + b.iter(|| { + let r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(read_to_end(r, &mut dst)).unwrap(); + assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + fn cloud_dynamic_from_input(mut gcs: InputConfig) -> CloudDynamic { let mut bucket = InputBucket::default(); if !gcs.endpoint.is_empty() { diff --git a/components/cloud/gcp/src/lib.rs b/components/cloud/gcp/src/lib.rs index 4652bbf5b74..9ad97793988 100644 --- a/components/cloud/gcp/src/lib.rs +++ b/components/cloud/gcp/src/lib.rs @@ -1,7 +1,26 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(test)] #[macro_use] extern crate slog_global; mod gcs; pub use gcs::{Config, GcsStorage}; + +pub mod utils { + use std::future::Future; + + use cloud::metrics; + use tikv_util::stream::{retry_ext, RetryError, RetryExt}; + pub async fn retry(action: G, name: &'static str) -> Result + where + G: FnMut() -> F, + F: Future>, + E: RetryError + std::fmt::Debug, + { + retry_ext(action, RetryExt::default().with_fail_hook(move |err: &E| { + warn!("gcp request meet error."; "err" => ?err, "retry?" 
=> %err.is_retryable(), "context" => %name); + metrics::CLOUD_ERROR_VEC.with_label_values(&["gcp", name]).inc(); + })).await + } +} From dea7b050fbca2266a5e81883bb396b9652fba646 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 28 Oct 2022 18:41:58 +0800 Subject: [PATCH 293/676] readpool: update yatp and add new metrics for unified-read-pool (#13489) ref tikv/tikv#13313 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- Cargo.lock | 33 ++- Cargo.toml | 4 + components/server/src/server.rs | 3 + metrics/grafana/tikv_details.json | 360 ++++++++++++++++++++++++++++++ 4 files changed, 394 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 49e6184ab5b..bc757a3ecdf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1149,13 +1149,12 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +version = "0.8.2" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.8", - "crossbeam-utils 0.8.8", + "crossbeam-epoch 0.9.10", + "crossbeam-utils 0.8.11", ] [[package]] @@ -1184,6 +1183,19 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.10" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" +dependencies = [ + "autocfg", + "cfg-if 1.0.0", + "crossbeam-utils 0.8.11", + "memoffset", + "once_cell", + "scopeguard", +] + [[package]] name = "crossbeam-queue" version = "0.3.5" @@ -1236,6 +1248,15 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.11" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" 
+dependencies = [ + "cfg-if 1.0.0", + "once_cell", +] + [[package]] name = "crypto-mac" version = "0.10.0" @@ -7251,7 +7272,7 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#2f5f6e47ba6fce8d55e7a57b7ee39a93bc0e8194" +source = "git+https://github.com/tikv/yatp.git?branch=master#39cb495953d40a7e846363c06090755c2eac65fa" dependencies = [ "crossbeam-deque", "dashmap", diff --git a/Cargo.toml b/Cargo.toml index 786b229df3b..d95dd1c67c1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -211,6 +211,10 @@ fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } # Remove this when a new version is release. We need to solve rust-lang/cmake-rs#143. cmake = { git = "https://github.com/rust-lang/cmake-rs" } +# TODO: remove this after crossbeam-deque is updated to the next release version. +# This is a workaround for cargo can't resolving the this patch in yatp. 
+crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "41ed3d948720f26149b2ebeaf58fe8a193134056" } + [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 2295839a806..82973946d96 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -564,6 +564,9 @@ where yatp::metrics::set_namespace(Some("tikv")); prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); } fn init_encryption(&mut self) { diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 45a657cc4bb..471bf4bea2e 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -17938,6 +17938,366 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Unified read pool task execution time during one schedule.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 4199, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of One Time Slice", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + 
"logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Unified read pool task total execution duration.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 4202, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + 
"legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Execute Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Task schedule number of times.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 4204, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Schedule Times", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Unified Read Pool", From 53470735e263343a0a53acc596fe1e82fe1c5b65 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 31 Oct 2022 16:59:59 
+0800 Subject: [PATCH 294/676] Raftstore-v2: tablet factory supports temporary split path (#13678) close tikv/tikv#13679 Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- components/engine_test/src/lib.rs | 91 ++++++++--------- components/engine_traits/src/engine.rs | 15 ++- src/server/engine_factory.rs | 2 +- src/server/engine_factory_v2.rs | 133 +++++++++++++++---------- 4 files changed, 142 insertions(+), 99 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index b2a574422fb..b460e97d4ce 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -189,8 +189,8 @@ pub mod kv { } #[inline] - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { - Path::new(&self.root_path).join(format!("tablets/{}_{}", id, suffix)) + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { + self.root_path.join("db") } #[inline] @@ -226,7 +226,8 @@ pub mod kv { #[derive(Clone)] pub struct TestTabletFactoryV2 { inner: TestTabletFactory, - registry: Arc>>, + // region_id -> (tablet, tablet_suffix) + registry: Arc>>, } impl TestTabletFactoryV2 { @@ -242,17 +243,6 @@ pub mod kv { } } - // Extract tablet id and tablet suffix from the path. 
- fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } - (tablet_id, tablet_suffix) - } - impl TabletFactory for TestTabletFactoryV2 { /// See the comment above the same name method in KvEngineFactoryV2 fn open_tablet( @@ -261,33 +251,34 @@ pub mod kv { suffix: Option, mut options: OpenOptions, ) -> Result { + if options.create_new() && suffix.is_none() { + return Err(box_err!( + "suffix should be provided when creating new tablet" + )); + } + if options.create_new() || options.create() { options = options.set_cache_only(false); } let mut reg = self.registry.lock().unwrap(); if let Some(suffix) = suffix { - if let Some(tablet) = reg.get(&(id, suffix)) { + if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { // Target tablet exist in the cache - if options.create_new() { - return Err(box_err!("region {} {} already exists", id, tablet.path())); + return Err(box_err!("region {} {} already exists", id, cached_tablet.path())); } - return Ok(tablet.clone()); + return Ok(cached_tablet.clone()); } else if !options.cache_only() { let tablet_path = self.tablet_path(id, suffix); let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; if !options.skip_cache() { - reg.insert((id, suffix), tablet.clone()); + reg.insert(id, (tablet.clone(), suffix)); } return Ok(tablet); } - } else if options.cache_only() { - // This branch reads an arbitrary tablet with region id `id` - - if let Some(k) = reg.keys().find(|k| k.0 == id) { - return Ok(reg.get(k).unwrap().clone()); - } + } else if let Some((tablet, _)) = reg.get(&id) { + return Ok(tablet.clone()); } Err(box_err!( @@ -343,17 +334,24 @@ pub mod kv { } #[inline] - fn 
tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { self.inner .root_path - .join(format!("tablets/{}_{}", id, suffix)) + .join(format!("tablets/{}{}_{}", prefix, id, suffix)) } #[inline] fn mark_tombstone(&self, region_id: u64, suffix: u64) { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - std::fs::File::create(&path).unwrap(); - self.registry.lock().unwrap().remove(&(region_id, suffix)); + // When the full directory path does not exsit, create will return error and in + // this case, we just ignore it. + let _ = std::fs::File::create(&path); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } } #[inline] @@ -364,37 +362,40 @@ pub mod kv { } #[inline] - fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(id, suffix); - self.registry.lock().unwrap().remove(&(id, suffix)); + fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(region_id, suffix); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } let _ = std::fs::remove_dir_all(path); Ok(()) } #[inline] - fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { { let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Err(box_err!("region {} {} already exists", id, db.path())); + if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { + return Err(box_err!("region {} {} already exists", region_id, db.path())); } } - let db_path = 
self.tablet_path(id, suffix); + let db_path = self.tablet_path(region_id, suffix); std::fs::rename(path, &db_path)?; - let new_engine = - self.open_tablet(id, Some(suffix), OpenOptions::default().set_create(true)); - if new_engine.is_ok() { - let (old_id, old_suffix) = get_id_and_suffix_from_path(path); - self.registry.lock().unwrap().remove(&(old_id, old_suffix)); - } - new_engine + self.open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_create(true), + ) } fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity - if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity)?; } @@ -406,7 +407,7 @@ pub mod kv { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { let reg = self.registry.lock().unwrap(); - for ((id, suffix), tablet) in &*reg { + for (id, (tablet, suffix)) in &*reg { f(*id, *suffix, tablet) } } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 5ad9a13b86f..34c8d67c3d3 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -224,6 +224,9 @@ impl OpenOptions { } } +pub const SPLIT_PREFIX: &str = "split_"; +pub const MERGE_PREFIX: &str = "merge_"; + /// A factory trait to create new engine. // It should be named as `EngineFactory` for consistency, but we are about to // rename engine to tablet, so always use tablet for new traits/types. 
@@ -261,7 +264,15 @@ pub trait TabletFactory: TabletAccessor + Send + Sync { fn exists_raw(&self, path: &Path) -> bool; /// Get the tablet path by id and suffix - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf; + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + self.tablet_path_with_prefix("", id, suffix) + } + + /// Get the tablet path by id and suffix + /// + /// Used in special situations + /// Ex: split/merge. + fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf; /// Tablets root path fn tablets_path(&self) -> PathBuf; @@ -323,7 +334,7 @@ where true } - fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { PathBuf::from(&self.root_path) } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index d8492dae5ce..7e8a1457500 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -272,7 +272,7 @@ impl TabletFactory for KvEngineFactory { false } - fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { self.kv_engine_path() } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index b4a7688ef68..323f414c05c 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -19,7 +19,8 @@ const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; #[derive(Clone)] pub struct KvEngineFactoryV2 { inner: KvEngineFactory, - registry: Arc>>, + // region_id -> (tablet, tablet_suffix) + registry: Arc>>, } impl KvEngineFactoryV2 { @@ -31,23 +32,11 @@ impl KvEngineFactoryV2 { } } -// Extract tablet id and tablet suffix from the path. 
-fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } - (tablet_id, tablet_suffix) -} - impl TabletFactory for KvEngineFactoryV2 { /// open a tablet according to the OpenOptions. /// /// If options.cache_only is true, only open the relevant tablet from - /// `registry`, and if suffix is None, return an arbitrary tablet with the - /// target region id if there are any. + /// `registry`. /// /// If options.create_new is true, create a tablet by id and suffix. If the /// tablet exists, it will fail. @@ -55,6 +44,8 @@ impl TabletFactory for KvEngineFactoryV2 { /// If options.create is true, open the tablet with id and suffix if it /// exists or create it otherwise. /// + /// If options.skip_cache is true, cache will not be updated. 
+ /// /// Note: options.cache_only and options.create and/or options.create_new /// cannot be true simultaneously fn open_tablet( @@ -63,39 +54,39 @@ impl TabletFactory for KvEngineFactoryV2 { suffix: Option, mut options: OpenOptions, ) -> Result { + if options.create_new() && suffix.is_none() { + return Err(box_err!( + "suffix should be provided when creating new tablet" + )); + } + if options.create() || options.create_new() { options = options.set_cache_only(false); } let mut reg = self.registry.lock().unwrap(); if let Some(suffix) = suffix { - if let Some(tablet) = reg.get(&(id, suffix)) { + if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { // Target tablet exist in the cache - if options.create_new() { return Err(box_err!( "region {} {} already exists", id, - tablet.as_inner().path() + cached_tablet.as_inner().path() )); } - return Ok(tablet.clone()); + return Ok(cached_tablet.clone()); } else if !options.cache_only() { let tablet_path = self.tablet_path(id, suffix); let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; if !options.skip_cache() { debug!("Insert a tablet"; "key" => ?(id, suffix)); - reg.insert((id, suffix), tablet.clone()); + reg.insert(id, (tablet.clone(), suffix)); } return Ok(tablet); } - } else if options.cache_only() { - // This branch reads an arbitrary tablet with region id `id` - - if let Some(k) = reg.keys().find(|k| k.0 == id) { - debug!("choose a random tablet"; "key" => ?k); - return Ok(reg.get(k).unwrap().clone()); - } + } else if let Some((tablet, _)) = reg.get(&id) { + return Ok(tablet.clone()); } Err(box_err!( @@ -154,18 +145,25 @@ impl TabletFactory for KvEngineFactoryV2 { } #[inline] - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { self.inner .store_path() - .join(format!("tablets/{}_{}", id, suffix)) + .join(format!("tablets/{}{}_{}", prefix, id, suffix)) } #[inline] fn 
mark_tombstone(&self, region_id: u64, suffix: u64) { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - std::fs::File::create(&path).unwrap(); + // When the full directory path does not exsit, create will return error and in + // this case, we just ignore it. + let _ = std::fs::File::create(&path); debug!("tombstone tablet"; "region_id" => region_id, "suffix" => suffix); - self.registry.lock().unwrap().remove(&(region_id, suffix)); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } } #[inline] @@ -176,42 +174,45 @@ impl TabletFactory for KvEngineFactoryV2 { } #[inline] - fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(id, suffix); - self.registry.lock().unwrap().remove(&(id, suffix)); + fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(region_id, suffix); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } self.inner.destroy_tablet(&path)?; - self.inner.on_tablet_destroy(id, suffix); + self.inner.on_tablet_destroy(region_id, suffix); Ok(()) } #[inline] - fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { { let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { + if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { return Err(box_err!( "region {} {} already exists", - id, + region_id, db.as_inner().path() )); } } - let db_path = self.tablet_path(id, suffix); + let db_path = self.tablet_path(region_id, suffix); std::fs::rename(path, &db_path)?; - let new_engine = 
- self.open_tablet(id, Some(suffix), OpenOptions::default().set_create(true)); - if new_engine.is_ok() { - let (old_id, old_suffix) = get_id_and_suffix_from_path(path); - self.registry.lock().unwrap().remove(&(old_id, old_suffix)); - } - new_engine + self.open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_create(true), + ) } fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity - if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity)?; } @@ -223,7 +224,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { let reg = self.registry.lock().unwrap(); - for ((id, suffix), tablet) in &*reg { + for (id, (tablet, suffix)) in &*reg { f(*id, *suffix, tablet) } } @@ -236,7 +237,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[cfg(test)] mod tests { - use engine_traits::{OpenOptions, TabletFactory, CF_WRITE}; + use engine_traits::{OpenOptions, TabletFactory, CF_WRITE, SPLIT_PREFIX}; use super::*; use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; @@ -373,6 +374,11 @@ mod tests { .unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + // Only both region id and suffix match can get the tablet from the cache. 
+ factory + .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + let tablet_path = factory.tablet_path(1, 10); let result = factory.open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)); result.unwrap_err(); @@ -400,14 +406,39 @@ mod tests { .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) .unwrap(); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_create_new(true)) + .unwrap(); + // After open a tablet with the same id but higher suffix, we cannot get the old + // one from cache. + factory + .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + // Destroy/mark tombstone the old tablet will not unregister the new tablet in + // the cache factory.mark_tombstone(1, 20); - assert!(factory.is_tombstoned(1, 20)); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) + .unwrap(); factory.destroy_tablet(1, 20).unwrap(); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) + .unwrap(); - let result = factory.open_tablet(1, Some(20), OpenOptions::default()); + factory.mark_tombstone(1, 30); + assert!(factory.is_tombstoned(1, 30)); + factory.destroy_tablet(1, 30).unwrap(); + + let result = factory.open_tablet(1, Some(30), OpenOptions::default()); result.unwrap_err(); assert!(!factory.is_single_engine()); + + assert!( + factory + .tablet_path_with_prefix(SPLIT_PREFIX, 1, 10) + .ends_with("split_1_10") + ); } #[test] @@ -428,7 +459,7 @@ mod tests { .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) .unwrap(); drop(tablet); - let tablet = factory.registry.lock().unwrap().remove(&(1, 10)).unwrap(); + let (tablet, _) = factory.registry.lock().unwrap().remove(&1).unwrap(); drop(tablet); factory .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) From e0885803a17dfcd26964e74029ce5af9a32cd797 Mon Sep 17 00:00:00 2001 From: Spade A 
<71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 31 Oct 2022 17:31:59 +0800 Subject: [PATCH 295/676] *: prepare for raftstore-v2's split (#13693) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- components/engine_panic/src/checkpoint.rs | 29 +++++++ components/engine_panic/src/lib.rs | 1 + components/engine_rocks/src/checkpoint.rs | 55 +++++++++++++ components/engine_rocks/src/lib.rs | 2 + components/engine_traits/src/checkpoint.rs | 20 +++++ components/engine_traits/src/engine.rs | 1 + components/engine_traits/src/lib.rs | 2 + components/raftstore/src/store/fsm/apply.rs | 85 +++++++++++++-------- 8 files changed, 163 insertions(+), 32 deletions(-) create mode 100644 components/engine_panic/src/checkpoint.rs create mode 100644 components/engine_rocks/src/checkpoint.rs create mode 100644 components/engine_traits/src/checkpoint.rs diff --git a/components/engine_panic/src/checkpoint.rs b/components/engine_panic/src/checkpoint.rs new file mode 100644 index 00000000000..6743810eb90 --- /dev/null +++ b/components/engine_panic/src/checkpoint.rs @@ -0,0 +1,29 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use core::panic; +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::PanicEngine; + +pub struct PanicCheckpointer {} + +impl Checkpointable for PanicEngine { + type Checkpointer = PanicCheckpointer; + + fn new_checkpointer(&self) -> Result { + panic!() + } +} + +impl Checkpointer for PanicCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + panic!() + } +} diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 0573c936135..70c7f00ece8 100644 --- a/components/engine_panic/src/lib.rs +++ b/components/engine_panic/src/lib.rs @@ -46,5 +46,6 @@ pub mod flow_control_factors; pub use crate::flow_control_factors::*; pub mod table_properties; pub use crate::table_properties::*; +pub mod checkpoint; mod raft_engine; diff --git a/components/engine_rocks/src/checkpoint.rs b/components/engine_rocks/src/checkpoint.rs new file mode 100644 index 00000000000..8b82043a392 --- /dev/null +++ b/components/engine_rocks/src/checkpoint.rs @@ -0,0 +1,55 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::{r2e, RocksEngine}; + +impl Checkpointable for RocksEngine { + type Checkpointer = RocksEngineCheckpointer; + + fn new_checkpointer(&self) -> Result { + match self.as_inner().new_checkpointer() { + Ok(pointer) => Ok(RocksEngineCheckpointer(pointer)), + Err(e) => Err(r2e(e)), + } + } +} + +pub struct RocksEngineCheckpointer(rocksdb::Checkpointer); + +impl Checkpointer for RocksEngineCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + self.0 + .create_at(db_out_dir, titan_out_dir, log_size_for_flush) + .map_err(|e| r2e(e)) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Checkpointable, Checkpointer, Peekable, SyncMutable, ALL_CFS}; + use tempfile::tempdir; + + use crate::util::new_engine; + + #[test] + fn test_checkpoint() { + let dir = tempdir().unwrap(); + let path = dir.path().join("origin"); + let engine = new_engine(path.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + engine.put(b"key", b"value").unwrap(); + + let mut check_pointer = engine.new_checkpointer().unwrap(); + let path2 = dir.path().join("checkpoint"); + check_pointer.create_at(path2.as_path(), None, 0).unwrap(); + let engine2 = new_engine(path2.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + assert_eq!(engine2.get_value(b"key").unwrap().unwrap(), b"value"); + } +} diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 774fe9cb37b..c1e23dac4a6 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -28,6 +28,8 @@ mod cf_names; pub use crate::cf_names::*; mod cf_options; pub use crate::cf_options::*; +mod checkpoint; +pub use crate::checkpoint::*; mod compact; pub use crate::compact::*; mod db_options; diff --git a/components/engine_traits/src/checkpoint.rs b/components/engine_traits/src/checkpoint.rs new file mode 100644 index 
00000000000..6ea3556938f --- /dev/null +++ b/components/engine_traits/src/checkpoint.rs @@ -0,0 +1,20 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use crate::Result; + +pub trait Checkpointable { + type Checkpointer: Checkpointer; + + fn new_checkpointer(&self) -> Result; +} + +pub trait Checkpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()>; +} diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 34c8d67c3d3..55ab5d63caa 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -40,6 +40,7 @@ pub trait KvEngine: + Clone + Debug + Unpin + + Checkpointable + 'static { /// A consistent read-only snapshot of the database diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 47fe16b4768..29351636694 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -303,6 +303,8 @@ mod flow_control_factors; pub use crate::flow_control_factors::*; mod table_properties; pub use crate::table_properties::*; +mod checkpoint; +pub use crate::checkpoint::*; // These modules contain more general traits, some of which may be implemented // by multiple types. 
diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index d3eb7f86461..c8fee703e63 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -38,8 +38,8 @@ use kvproto::{ kvrpcpb::ExtraOp as TxnExtraOp, metapb::{PeerRole, Region, RegionEpoch}, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, + AdminCmdType, AdminRequest, AdminResponse, BatchSplitRequest, ChangePeerRequest, CmdType, + CommitMergeRequest, RaftCmdRequest, RaftCmdResponse, Request, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; @@ -1899,6 +1899,42 @@ mod confchange_cmd_metric { } } +// Validate the request and the split keys +pub fn extract_split_keys( + split_reqs: &BatchSplitRequest, + region_to_split: &Region, +) -> Result>> { + if split_reqs.get_requests().is_empty() { + return Err(box_err!("missing split requests")); + } + let mut keys: VecDeque> = VecDeque::with_capacity(split_reqs.get_requests().len() + 1); + for req in split_reqs.get_requests() { + let split_key = req.get_split_key(); + if split_key.is_empty() { + return Err(box_err!("missing split key")); + } + if split_key + <= keys + .back() + .map_or_else(|| region_to_split.get_start_key(), Vec::as_slice) + { + return Err(box_err!("invalid split request: {:?}", split_reqs)); + } + if req.get_new_peer_ids().len() != region_to_split.get_peers().len() { + return Err(box_err!( + "invalid new peer id count, need {:?}, but got {:?}", + region_to_split.get_peers(), + req.get_new_peer_ids() + )); + } + keys.push_back(split_key.to_vec()); + } + + util::check_key_in_region_exclusive(keys.back().unwrap(), region_to_split)?; + + Ok(keys) +} + // Admin commands related. 
impl ApplyDelegate where @@ -2368,37 +2404,8 @@ where PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); let split_reqs = req.get_splits(); - let right_derive = split_reqs.get_right_derive(); - if split_reqs.get_requests().is_empty() { - return Err(box_err!("missing split requests")); - } + let mut keys = extract_split_keys(split_reqs, &self.region)?; let mut derived = self.region.clone(); - let new_region_cnt = split_reqs.get_requests().len(); - let mut regions = Vec::with_capacity(new_region_cnt + 1); - let mut keys: VecDeque> = VecDeque::with_capacity(new_region_cnt + 1); - for req in split_reqs.get_requests() { - let split_key = req.get_split_key(); - if split_key.is_empty() { - return Err(box_err!("missing split key")); - } - if split_key - <= keys - .back() - .map_or_else(|| derived.get_start_key(), Vec::as_slice) - { - return Err(box_err!("invalid split request: {:?}", split_reqs)); - } - if req.get_new_peer_ids().len() != derived.get_peers().len() { - return Err(box_err!( - "invalid new peer id count, need {:?}, but got {:?}", - derived.get_peers(), - req.get_new_peer_ids() - )); - } - keys.push_back(split_key.to_vec()); - } - - util::check_key_in_region(keys.back().unwrap(), &self.region)?; info!( "split region"; @@ -2407,8 +2414,13 @@ where "region" => ?derived, "keys" => %KeysInfoFormatter(keys.iter()), ); + + let new_region_cnt = split_reqs.get_requests().len(); let new_version = derived.get_region_epoch().get_version() + new_region_cnt as u64; derived.mut_region_epoch().set_version(new_version); + + let right_derive = split_reqs.get_right_derive(); + let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. 
if right_derive { @@ -2423,6 +2435,7 @@ where regions.push(derived.clone()); } + // Init split regions' meta info let mut new_split_regions: HashMap = HashMap::default(); for req in split_reqs.get_requests() { let mut new_region = Region::default(); @@ -2453,6 +2466,11 @@ where regions.push(derived.clone()); } + // Generally, a peer is created in pending_create_peers when it is + // created by raft_message (or by split here) and removed from + // pending_create_peers when it has applied the snapshot. So, if the + // peer of the split region is already created by raft_message in + // pending_create_peers ,we decide to replace it. let mut replace_regions = HashSet::default(); { let mut pending_create_peers = ctx.pending_create_peers.lock().unwrap(); @@ -2498,6 +2516,9 @@ where self.tag, region_id, new_split_peer.peer_id, state ) } + // If the peer's state is already persisted, add some info in + // new_split_peer.result so that we will skip this region in later + // executions. already_exist_regions.push((*region_id, new_split_peer.peer_id)); new_split_peer.result = Some(format!("state {:?} exist in kv engine", state)); } From d9fe2ffd78f4db5ec2141c98e019d14ba980f121 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Tue, 1 Nov 2022 12:23:59 +0800 Subject: [PATCH 296/676] log-backup: set `checkpoint-ts -1` as sevice-safe-point when checkpoint advances (#13559) close tikv/tikv#13532 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 61 +++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 4b80eb44a2f..3a13acd2f4c 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -199,7 +199,7 @@ impl FlushObserver for BasicFlushObserver { .pd_cli .update_service_safe_point( 
format!("backup-stream-{}-{}", task, self.store_id), - TimeStamp::new(rts), + TimeStamp::new(rts - 1), // Add a service safe point for 30 mins (6x the default flush interval). // It would probably be safe. Duration::from_secs(1800), @@ -299,12 +299,19 @@ where #[cfg(test)] mod tests { - use std::assert_matches; - + use std::{ + assert_matches, + collections::HashMap, + sync::{Arc, RwLock}, + time::Duration, + }; + + use futures::future::ok; use kvproto::metapb::*; + use pd_client::{PdClient, PdFuture}; use txn_types::TimeStamp; - use super::RegionIdWithVersion; + use super::{BasicFlushObserver, FlushObserver, RegionIdWithVersion}; use crate::GetCheckpointResult; fn region(id: u64, version: u64, conf_version: u64) -> Region { @@ -342,4 +349,50 @@ mod tests { let r = mgr.get_from_region(RegionIdWithVersion::new(1, 33)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } + + struct MockPdClient { + safepoint: RwLock>, + } + + impl PdClient for MockPdClient { + fn update_service_safe_point( + &self, + name: String, + safepoint: TimeStamp, + _ttl: Duration, + ) -> PdFuture<()> { + // let _ = self.safepoint.insert(name, safepoint); + self.safepoint.write().unwrap().insert(name, safepoint); + + Box::pin(ok(())) + } + } + + impl MockPdClient { + fn new() -> Self { + Self { + safepoint: RwLock::new(HashMap::default()), + } + } + + fn get_service_safe_point(&self, name: String) -> Option { + self.safepoint.read().unwrap().get(&name).copied() + } + } + + #[tokio::test] + async fn test_after() { + let store_id = 1; + let pd_cli = Arc::new(MockPdClient::new()); + let mut flush_observer = BasicFlushObserver::new(pd_cli.clone(), store_id); + let task = String::from("test"); + let rts = 12345; + + let r = flush_observer.after(&task, rts).await; + assert_eq!(r.is_ok(), true); + + let serivce_id = format!("backup-stream-{}-{}", task, store_id); + let r = pd_cli.get_service_safe_point(serivce_id).unwrap(); + 
assert_eq!(r.into_inner(), rts - 1); + } } From 6cf4100bd71a0bde491e4a05acca0587ae0b8232 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 1 Nov 2022 17:04:00 +0800 Subject: [PATCH 297/676] coprocessor_v2: fix incorrect shared library name (#13707) ref tikv/tikv#13585, close tikv/tikv#13708 This commit fixes test `registry_unload_plugin`. The test has been failing since the example crate renamed its name in #13585. Signed-off-by: Yilin Chen --- src/coprocessor_v2/plugin_registry.rs | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/coprocessor_v2/plugin_registry.rs b/src/coprocessor_v2/plugin_registry.rs index c02a652fc88..cbcba39995d 100644 --- a/src/coprocessor_v2/plugin_registry.rs +++ b/src/coprocessor_v2/plugin_registry.rs @@ -481,7 +481,7 @@ mod tests { fn initialize_library() -> PathBuf { let mut path = std::env::current_exe().unwrap(); - path.set_file_name(pkgname_to_libname("example-plugin")); + path.set_file_name(pkgname_to_libname("example-coprocessor-plugin")); path } @@ -491,7 +491,7 @@ mod tests { let loaded_plugin = unsafe { LoadedPlugin::new(&library_path).unwrap() }; - assert_eq!(loaded_plugin.name(), "example_plugin"); + assert_eq!(loaded_plugin.name(), "example_coprocessor_plugin"); assert_eq!(loaded_plugin.version(), &Version::parse("0.1.0").unwrap()); } @@ -504,10 +504,15 @@ mod tests { let plugin = registry.get_plugin(&plugin_name).unwrap(); - assert_eq!(plugin.name(), "example_plugin"); - assert_eq!(registry.loaded_plugin_names(), vec!["example_plugin"]); + assert_eq!(plugin.name(), "example_coprocessor_plugin"); assert_eq!( - registry.get_path_for_plugin("example_plugin").unwrap(), + registry.loaded_plugin_names(), + vec!["example_coprocessor_plugin"] + ); + assert_eq!( + registry + .get_path_for_plugin("example_coprocessor_plugin") + .unwrap(), library_path.as_os_str() ); } @@ -519,7 +524,7 @@ mod tests { let library_path_2 = library_path .parent() .unwrap() - 
.join(pkgname_to_libname("example-plugin-2")); + .join(pkgname_to_libname("example-coprocessor-plugin-2")); let registry = PluginRegistry::new(); let plugin_name = registry.load_plugin(&library_path).unwrap(); @@ -558,9 +563,10 @@ mod tests { let original_library_path = initialize_library(); let coprocessor_dir = std::env::temp_dir().join("coprocessors"); - let library_path = coprocessor_dir.join(pkgname_to_libname("example-plugin")); - let library_path_2 = coprocessor_dir.join(pkgname_to_libname("example-plugin-2")); - let plugin_name = "example_plugin"; + let library_path = coprocessor_dir.join(pkgname_to_libname("example-coprocessor-plugin")); + let library_path_2 = + coprocessor_dir.join(pkgname_to_libname("example-coprocessor-plugin-2")); + let plugin_name = "example_coprocessor_plugin"; // Make the coprocessor directory is empty. std::fs::create_dir_all(&coprocessor_dir).unwrap(); From 986bfde546aef4fb2acb0d18fe41c4cec6b5500e Mon Sep 17 00:00:00 2001 From: zhangguangchao <1614439+zgcbj@users.noreply.github.com> Date: Tue, 1 Nov 2022 17:24:00 +0800 Subject: [PATCH 298/676] raftstore fix typo (#13681) close tikv/tikv#13682 Signed-off-by: zhangguangchao <1614439+zgcbj@users.noreply.github.com> Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/peer_storage.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 7f4b6778860..081149a6889 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -434,7 +434,7 @@ where } /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no - /// unavailable snapshot. + /// available snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); From 26830eb6a371654035fca7ae7a723c45cbe30764 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Tue, 1 Nov 2022 17:42:00 +0800 Subject: [PATCH 299/676] scheduler: Optimize CPU usage of waking up (#13697) close tikv/tikv#13692, ref tikv/tikv#13692 Optimize CPU usage of waking up. 1. When popped entries are empty in `on_release_locks`, do not do anything in the high priority pool. This seems to be the reason of the performance regression mentioned in #13692 . 2. Skip waking up if the `LockWaitQueues` is empty. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- .../lock_manager/lock_waiting_queue.rs | 74 +++++++++++++++++-- src/storage/txn/scheduler.rs | 24 +++++- 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 16b3787bd7e..da8f2e2d289 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -59,7 +59,7 @@ use std::{ pin::Pin, result::Result, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::{Duration, Instant}, @@ -215,6 +215,7 @@ pub type DelayedNotifyAllFuture = Pin { queue_map: dashmap::DashMap, id_allocated: AtomicU64, + entries_count: AtomicUsize, lock_mgr: L, } @@ -229,6 +230,7 @@ impl LockWaitQueues { inner: Arc::new(LockWaitQueueInner { queue_map: dashmap::DashMap::new(), id_allocated: AtomicU64::new(1), + entries_count: AtomicUsize::new(0), lock_mgr, }), } @@ -256,10 +258,12 @@ impl LockWaitQueues { if lock_wait_entry.legacy_wake_up_index.is_none() { lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); } + key_state .value_mut() .queue .push(lock_wait_entry.lock_wait_token, 
lock_wait_entry); + self.inner.entries_count.fetch_add(1, Ordering::SeqCst); let len = key_state.value_mut().queue.len(); drop(key_state); @@ -305,7 +309,7 @@ impl LockWaitQueues { ) -> Option<(Box, Option)> { let mut result = None; // For statistics. - let mut removed_waiters = 0; + let mut removed_waiters = 0usize; // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. Wrap the logic @@ -334,6 +338,10 @@ impl LockWaitQueues { } } + self.inner + .entries_count + .fetch_sub(removed_waiters, Ordering::SeqCst); + // Remove the queue if it's emptied. v.queue.is_empty() }); @@ -341,7 +349,7 @@ impl LockWaitQueues { if removed_waiters != 0 { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC .waiters - .sub(removed_waiters); + .sub(removed_waiters as i64); } if removed_key.is_some() { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); @@ -436,7 +444,7 @@ impl LockWaitQueues { let mut conflicting_start_ts = TimeStamp::zero(); let mut conflicting_commit_ts = TimeStamp::zero(); - let mut removed_waiters = 0; + let mut removed_waiters = 0usize; // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. Wrap the logic @@ -479,6 +487,10 @@ impl LockWaitQueues { popped_lock_wait_entries.push(lock_wait_entry); } + self.inner + .entries_count + .fetch_sub(removed_waiters, Ordering::SeqCst); + // If the queue is empty, remove it from the map. v.queue.is_empty() }); @@ -486,7 +498,7 @@ impl LockWaitQueues { if removed_waiters != 0 { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC .waiters - .sub(removed_waiters); + .sub(removed_waiters as i64); } if removed_key.is_some() { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); @@ -532,6 +544,7 @@ impl LockWaitQueues { // procedure. 
let removed_key = self.inner.queue_map.remove_if_mut(key, |_, v| { if let Some(res) = v.queue.remove(&lock_wait_token) { + self.inner.entries_count.fetch_sub(1, Ordering::SeqCst); LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.waiters.dec(); result = Some(res); } @@ -545,6 +558,20 @@ impl LockWaitQueues { result } + /// Gets the count of entries currently waiting in queues. + /// + /// Mind that the contents of the queues may be changed concurrently. + pub fn entry_count(&self) -> usize { + self.inner.entries_count.load(Ordering::SeqCst) + } + + /// Checks whether there's nothing at all waiting in queue. + /// + /// Mind that the contents of the queues may be changed concurrently. + pub fn is_empty(&self) -> bool { + self.entry_count() == 0 + } + #[allow(dead_code)] pub(super) fn get_lock_mgr(&self) -> &L { &self.inner.lock_mgr @@ -831,9 +858,13 @@ mod tests { #[test] fn test_simple_push_pop() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); + assert_eq!(queues.is_empty(), true); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k2", 11, 5, false); + assert_eq!(queues.entry_count(), 2); + assert_eq!(queues.is_empty(), false); queues .must_pop(b"k1", 5, 6) @@ -841,6 +872,8 @@ mod tests { .check_start_ts(10); queues.must_pop_none(b"k1", 5, 6); queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 1); + assert_eq!(queues.is_empty(), false); queues .must_pop(b"k2", 5, 6) @@ -848,11 +881,14 @@ mod tests { .check_start_ts(11); queues.must_pop_none(b"k2", 5, 6); queues.must_not_contain_key(b"k2"); + assert_eq!(queues.entry_count(), 0); + assert_eq!(queues.is_empty(), true); } #[test] fn test_popping_priority() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k1", 20, 5, false); @@ -860,6 +896,7 @@ mod tests { queues.mock_lock_wait(b"k1", 13, 5, false); // Duplication is possible 
considering network issues and RPC retrying. queues.mock_lock_wait(b"k1", 12, 5, false); + assert_eq!(queues.entry_count(), 5); // Ordered by start_ts for &expected_start_ts in &[10u64, 12, 12, 13, 20] { @@ -870,11 +907,13 @@ mod tests { } queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[test] fn test_removing_by_token() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 10, 5, false); let token11 = queues.mock_lock_wait(b"k1", 11, 5, false).token; @@ -882,6 +921,7 @@ mod tests { let token13 = queues.mock_lock_wait(b"k1", 13, 5, false).token; queues.mock_lock_wait(b"k1", 14, 5, false); assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + assert_eq!(queues.entry_count(), 5); queues .remove_by_token(&Key::from_raw(b"k1"), token11) @@ -894,6 +934,7 @@ mod tests { .check_key(b"k1") .check_start_ts(13); assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + assert_eq!(queues.entry_count(), 3); // Removing not-existing entry takes no effect. 
assert!( @@ -907,15 +948,19 @@ mod tests { .is_none() ); assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + assert_eq!(queues.entry_count(), 3); queues.must_pop(b"k1", 5, 6).check_start_ts(10); queues.must_pop(b"k1", 5, 6).check_start_ts(12); queues.must_pop(b"k1", 5, 6).check_start_ts(14); + queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[test] fn test_dropping_cancelled_entries() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); let h10 = queues.mock_lock_wait(b"k1", 10, 5, false); let h11 = queues.mock_lock_wait(b"k1", 11, 5, false); @@ -924,12 +969,14 @@ mod tests { queues.mock_lock_wait(b"k1", 14, 5, false); assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + assert_eq!(queues.entry_count(), 5); h10.cancel(); h11.cancel(); h13.cancel(); assert_eq!(queues.get_queue_length_of_key(b"k1"), 2); + assert_eq!(queues.entry_count(), 2); for &expected_start_ts in &[12u64, 14] { queues @@ -937,11 +984,13 @@ mod tests { .check_start_ts(expected_start_ts); } queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[tokio::test] async fn test_delayed_notify_all() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 8, 5, false); @@ -952,6 +1001,7 @@ mod tests { ]; // Current queue: [8, 11, 12, 13] + assert_eq!(queues.entry_count(), 4); let (entry, delay_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 5, 6); entry.check_key(b"k1").check_start_ts(8); @@ -959,6 +1009,7 @@ mod tests { // Current queue: [11*, 12*, 13*] (Items marked with * means it has // legacy_wake_up_index less than that in KeyLockWaitState, so it might // be woken up when calling delayed_notify_all). 
+ assert_eq!(queues.entry_count(), 3); let handles2 = vec![ queues.mock_lock_wait(b"k1", 14, 5, false), @@ -967,6 +1018,7 @@ mod tests { ]; // Current queue: [11*, 12*, 13*, 14, 15, 16] + assert_eq!(queues.entry_count(), 6); assert!( handles1[0] @@ -988,9 +1040,11 @@ mod tests { ); // Current queue: [14, 15, 16] + assert_eq!(queues.entry_count(), 3); queues.mock_lock_wait(b"k1", 9, 5, false); // Current queue: [9, 14, 15, 16] + assert_eq!(queues.entry_count(), 4); // 9 will be woken up and delayed wake up should be scheduled. After delaying, // 14 to 16 should be all woken up later if they are all not resumable. @@ -1000,11 +1054,13 @@ mod tests { entry.check_key(b"k1").check_start_ts(9); // Current queue: [14*, 15*, 16*] + assert_eq!(queues.entry_count(), 3); queues.mock_lock_wait(b"k1", 17, 5, false); let handle18 = queues.mock_lock_wait(b"k1", 18, 5, false); // Current queue: [14*, 15*, 16*, 17, 18] + assert_eq!(queues.entry_count(), 5); // Wakes up 14, and stops at 15 which is resumable. Then, 15 should be returned // and the caller should be responsible for waking it up. @@ -1012,6 +1068,7 @@ mod tests { entry15.check_key(b"k1").check_start_ts(15); // Current queue: [16*, 17, 18] + assert_eq!(queues.entry_count(), 3); let mut it = handles2.into_iter(); // Receive 14. @@ -1050,6 +1107,7 @@ mod tests { ); // Current queue: [16*, 17, 18] + assert_eq!(queues.entry_count(), 3); let (entry, delayed_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 7, 8); entry.check_key(b"k1").check_start_ts(16); @@ -1064,6 +1122,7 @@ mod tests { queues.must_have_next_entry(b"k1", 17); // Current queue: [17*, 18*] + assert_eq!(queues.entry_count(), 2); // Don't need to create new future if there already exists one for the key. 
let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); @@ -1071,18 +1130,22 @@ mod tests { queues.must_have_next_entry(b"k1", 18); // Current queue: [18*] + assert_eq!(queues.entry_count(), 1); queues.mock_lock_wait(b"k1", 19, 5, false); // Current queue: [18*, 19] + assert_eq!(queues.entry_count(), 2); assert!(delayed_wake_up_future.await.is_none()); // 18 will be cancelled with ts of the latest wake-up event. expect_write_conflict(&handle18.wait_for_result().unwrap_err().0, 9, 10); // Current queue: [19] + assert_eq!(queues.entry_count(), 1); // Don't need to create new future if the queue is cleared. let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); entry.check_key(b"k1").check_start_ts(19); // Current queue: empty + assert_eq!(queues.entry_count(), 0); queues.must_not_contain_key(b"k1"); // Calls delayed_notify_all on keys that not exists (maybe deleted due to @@ -1093,5 +1156,6 @@ mod tests { .is_none() ); queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 4ccc868f30d..917c9fbaffc 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -48,6 +48,7 @@ use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; use resource_metering::{FutureExt, ResourceTagFactory}; +use smallvec::SmallVec; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData}; use tikv_util::{ deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, @@ -790,8 +791,17 @@ impl Scheduler { } fn on_release_locks(&self, released_locks: ReleasedLocks) { - let mut legacy_wake_up_list = vec![]; - let mut delay_wake_up_futures = vec![]; + // This function is always called when holding the latch of the involved keys. 
+ // So if we found the lock waiting queues are empty, there's no chance + // that other threads/commands adds new lock-wait entries to the keys + // concurrently. Therefore it's safe to skip waking up when we found the + // lock waiting queues are empty. + if self.inner.lock_wait_queues.is_empty() { + return; + } + + let mut legacy_wake_up_list = SmallVec::<[_; 4]>::new(); + let mut delay_wake_up_futures = SmallVec::<[_; 4]>::new(); let wake_up_delay_duration_ms = self .inner .pessimistic_lock_wake_up_delay_duration_ms @@ -817,13 +827,19 @@ impl Scheduler { } }); + if legacy_wake_up_list.is_empty() && delay_wake_up_futures.is_empty() { + return; + } + self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); } fn wake_up_legacy_pessimistic_locks( &self, - legacy_wake_up_list: Vec<(Box, ReleasedLock)>, - delayed_wake_up_futures: Vec, + legacy_wake_up_list: impl IntoIterator, ReleasedLock)> + + Send + + 'static, + delayed_wake_up_futures: impl IntoIterator + Send + 'static, ) { let self1 = self.clone(); self.get_sched_pool(CommandPri::High) From de4cd130d0fba0ce47505205b41801f2d7cefa39 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 1 Nov 2022 17:58:00 +0800 Subject: [PATCH 300/676] raftstore-v2: add snapshot basic logic in peer storage (#13555) ref tikv/tikv#12842 this commit is part of support snapshot in raftstore v2 - add snapshot basic implementation in peer storage Signed-off-by: nolouch Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 30 +- components/raftstore-v2/src/fsm/apply.rs | 17 +- components/raftstore-v2/src/fsm/peer.rs | 9 +- .../raftstore-v2/src/operation/command/mod.rs | 13 +- components/raftstore-v2/src/operation/life.rs | 2 +- components/raftstore-v2/src/operation/mod.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 26 +- .../src/operation/ready/snapshot.rs | 286 ++++++++++++++++++ components/raftstore-v2/src/raft/apply.rs | 16 +- components/raftstore-v2/src/raft/peer.rs | 16 +- 
components/raftstore-v2/src/raft/storage.rs | 219 ++++++++++++-- components/raftstore-v2/src/router/imp.rs | 13 +- .../src/router/internal_message.rs | 4 +- components/raftstore-v2/src/router/message.rs | 8 +- .../raftstore/src/store/async_io/mod.rs | 1 + .../raftlog_fetch.rs => async_io/read.rs} | 71 +++-- .../raftstore/src/store/entry_storage.rs | 26 +- components/raftstore/src/store/fsm/peer.rs | 9 +- components/raftstore/src/store/fsm/store.rs | 13 +- components/raftstore/src/store/metrics.rs | 1 + components/raftstore/src/store/mod.rs | 6 +- components/raftstore/src/store/msg.rs | 4 +- components/raftstore/src/store/peer.rs | 8 +- .../raftstore/src/store/peer_storage.rs | 35 ++- components/raftstore/src/store/transport.rs | 12 +- components/raftstore/src/store/worker/mod.rs | 4 - 26 files changed, 695 insertions(+), 156 deletions(-) create mode 100644 components/raftstore-v2/src/operation/ready/snapshot.rs rename components/raftstore/src/store/{worker/raftlog_fetch.rs => async_io/read.rs} (58%) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index b387300b40e..76d4fd16bea 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -20,8 +20,8 @@ use kvproto::{ }; use raft::INVALID_ID; use raftstore::store::{ - fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, RaftlogFetchRunner, - RaftlogFetchTask, StoreWriters, Transport, WriteSenders, + fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, + StoreWriters, Transport, WriteSenders, }; use slog::Logger; use tikv_util::{ @@ -68,7 +68,7 @@ pub struct StoreContext { pub engine: ER, pub tablet_factory: Arc>, pub apply_pool: FuturePool, - pub log_fetch_scheduler: Scheduler, + pub read_scheduler: Scheduler>, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. 
@@ -215,7 +215,7 @@ struct StorePollerBuilder { tablet_factory: Arc>, trans: T, router: StoreRouter, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, write_senders: WriteSenders, apply_pool: FuturePool, logger: Logger, @@ -230,7 +230,7 @@ impl StorePollerBuilder { tablet_factory: Arc>, trans: T, router: StoreRouter, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, store_writers: &mut StoreWriters, logger: Logger, store_meta: Arc>>, @@ -252,7 +252,7 @@ impl StorePollerBuilder { tablet_factory, trans, router, - log_fetch_scheduler, + read_scheduler, apply_pool, logger, write_senders: store_writers.senders(), @@ -271,7 +271,7 @@ impl StorePollerBuilder { region_id, self.store_id, self.engine.clone(), - self.log_fetch_scheduler.clone(), + self.read_scheduler.clone(), &self.logger, )? { Some(p) => p, @@ -324,7 +324,7 @@ where engine: self.engine.clone(), tablet_factory: self.tablet_factory.clone(), apply_pool: self.apply_pool.clone(), - log_fetch_scheduler: self.log_fetch_scheduler.clone(), + read_scheduler: self.read_scheduler.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) @@ -335,14 +335,14 @@ where /// raftstore. 
struct Workers { /// Worker for fetching raft logs asynchronously - log_fetch_worker: Worker, + async_read_worker: Worker, store_writers: StoreWriters, } impl Default for Workers { fn default() -> Self { Self { - log_fetch_worker: Worker::new("raftlog-fetch-worker"), + async_read_worker: Worker::new("async-read-worker"), store_writers: StoreWriters::default(), } } @@ -373,9 +373,9 @@ impl StoreSystem { workers .store_writers .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; - let log_fetch_scheduler = workers.log_fetch_worker.start( - "raftlog-fetch-worker", - RaftlogFetchRunner::new(router.clone(), raft_engine.clone()), + let read_scheduler = workers.async_read_worker.start( + "async-read-worker", + ReadRunner::new(router.clone(), raft_engine.clone()), ); let mut builder = StorePollerBuilder::new( @@ -385,7 +385,7 @@ impl StoreSystem { tablet_factory, trans, router.clone(), - log_fetch_scheduler, + read_scheduler, &mut workers.store_writers, self.logger.clone(), store_meta.clone(), @@ -435,7 +435,7 @@ impl StoreSystem { self.system.shutdown(); workers.store_writers.shutdown(); - workers.log_fetch_worker.stop(); + workers.async_read_worker.stop(); } } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 4a1e05b8f75..c4eb03f350d 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -14,8 +14,12 @@ use crossbeam::channel::TryRecvError; use engine_traits::KvEngine; use futures::{Future, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; +use raftstore::store::ReadTask; use slog::Logger; -use tikv_util::mpsc::future::{self, Receiver, Sender, WakePolicy}; +use tikv_util::{ + mpsc::future::{self, Receiver, Sender, WakePolicy}, + worker::Scheduler, +}; use crate::{ raft::Apply, @@ -61,10 +65,18 @@ impl ApplyFsm { region_state: RegionLocalState, res_reporter: R, remote_tablet: CachedTablet, + read_scheduler: Scheduler>, logger: Logger, ) -> 
(ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); - let apply = Apply::new(peer, region_state, res_reporter, remote_tablet, logger); + let apply = Apply::new( + peer, + region_state, + res_reporter, + remote_tablet, + read_scheduler, + logger, + ); ( ApplyScheduler { sender: tx }, Self { @@ -86,6 +98,7 @@ impl ApplyFsm { match task { // TODO: flush by buffer size. ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, + ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), } // TODO: yield after some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index a1beedef968..7083a9e529c 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -40,7 +40,7 @@ impl PeerFsm { pub fn new( cfg: &Config, tablet_factory: &dyn TabletFactory, - storage: Storage, + storage: Storage, ) -> Result> { let peer = Peer::new(cfg, tablet_factory, storage)?; info!(peer.logger, "create peer"); @@ -229,8 +229,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .fsm .peer_mut() .on_persisted(self.store_ctx, peer_id, ready_number), - PeerMsg::FetchedLogs(fetched_logs) => { - self.fsm.peer_mut().on_fetched_logs(fetched_logs) + PeerMsg::LogsFetched(fetched_logs) => { + self.fsm.peer_mut().on_logs_fetched(fetched_logs) + } + PeerMsg::SnapshotGenerated(snap_res) => { + self.fsm.peer_mut().on_snapshot_generated(snap_res) } PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), #[cfg(feature = "testexport")] diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index fe863a74b8a..21122e5559f 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -49,6 +49,7 @@ use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ 
batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter, PeerFsmDelegate}, + operation::GenSnapTask, raft::{Apply, Peer}, router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, }; @@ -120,8 +121,16 @@ impl Peer { let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); let tablet = self.tablet().clone(); let logger = self.logger.clone(); - let (apply_scheduler, mut apply_fsm) = - ApplyFsm::new(self.peer().clone(), region_state, mailbox, tablet, logger); + let read_scheduler = self.storage().read_scheduler(); + let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + self.peer().clone(), + region_state, + mailbox, + tablet, + read_scheduler, + logger, + ); + store_ctx .apply_pool .spawn(async move { apply_fsm.handle_all_tasks().await }) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 678cf6ece4b..7be70a9afe7 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -182,7 +182,7 @@ impl Store { self.store_id(), region, ctx.engine.clone(), - ctx.log_fetch_scheduler.clone(), + ctx.read_scheduler.clone(), &ctx.logger, ) .and_then(|s| PeerFsm::new(&ctx.cfg, &*ctx.tablet_factory, s)) diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 1eaeb21ec18..5b19db91b71 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,6 +7,6 @@ mod ready; pub use command::{AdminCmdResult, CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; pub use life::DestroyProgress; -pub use ready::AsyncWriter; +pub use ready::{AsyncWriter, GenSnapTask, SnapState}; pub(crate) use self::query::LocalReader; diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index cfc3d086163..62cb42ef253 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ 
b/components/raftstore-v2/src/operation/ready/mod.rs @@ -18,6 +18,7 @@ //! There two steps can be processed concurrently. mod async_writer; +mod snapshot; use std::cmp; @@ -30,12 +31,15 @@ use raftstore::store::{util, ExtraStates, FetchedLogs, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; -pub use self::async_writer::AsyncWriter; +pub use self::{ + async_writer::AsyncWriter, + snapshot::{GenSnapTask, SnapState}, +}; use crate::{ batch::StoreContext, fsm::PeerFsmDelegate, raft::{Peer, Storage}, - router::PeerTick, + router::{ApplyTask, PeerTick}, }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { /// Raft relies on periodic ticks to keep the state machine sync with other @@ -115,7 +119,7 @@ impl Peer { } /// Callback for fetching logs asynchronously. - pub fn on_fetched_logs(&mut self, fetched_logs: FetchedLogs) { + pub fn on_logs_fetched(&mut self, fetched_logs: FetchedLogs) { let FetchedLogs { context, logs } = fetched_logs; let low = logs.low; if !self.is_leader() { @@ -298,6 +302,14 @@ impl Peer { self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); } + // Check whether there is a pending generate snapshot task, the task + // needs to be sent to the apply system. + // Always sending snapshot task after apply task, so it gets latest + // snapshot. + if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler().send(ApplyTask::Snapshot(gen_task)); + } + let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); self.storage_mut() @@ -385,14 +397,10 @@ impl Peer { } } -impl Storage { +impl Storage { /// Apply the ready to the storage. If there is any states need to be /// persisted, it will be written to `write_task`. 
- fn handle_raft_ready( - &mut self, - ready: &mut Ready, - write_task: &mut WriteTask, - ) { + fn handle_raft_ready(&mut self, ready: &mut Ready, write_task: &mut WriteTask) { let prev_raft_state = self.entry_storage().raft_state().clone(); let ever_persisted = self.ever_persisted(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs new file mode 100644 index 00000000000..6f4b63630a9 --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -0,0 +1,286 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains snapshot relative processing logic. +//! +//! # Snapshot State +//! +//! generator and apply snapshot works asynchronously. the snap_sate indicates +//! the curren snapshot state. +//! +//! # Process Overview +//! +//! generate snapshot: +//! - Raft call `snapshot` interface to acquire a snapshot, then storage setup +//! the gen_snap_task. +//! - handle ready will send the gen_snap_task to the apply work +//! - apply worker schedule a gen tablet snapshot task to async read worker with +//! region state and apply state. +//! - async read worker generates the tablet snapshot and sends the result to +//! peer fsm, then Raft will get the snapshot. 
+ +use std::{ + borrow::BorrowMut, + fmt::{self, Debug}, + mem, + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + mpsc, Arc, + }, +}; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_serverpb::{RaftSnapshotData, RegionLocalState}; +use protobuf::Message; +use raft::eraftpb::Snapshot; +use raftstore::store::{metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, ReadTask}; +use slog::{error, info}; +use tikv_util::{box_try, worker::Scheduler}; + +use crate::{ + fsm::ApplyResReporter, + raft::{Apply, Peer, Storage}, + router::{ApplyTask, PeerTick}, + Result, +}; + +#[derive(Debug)] +pub enum SnapState { + Relax, + Generating { + canceled: Arc, + index: Arc, + }, + Generated(Box), +} + +impl PartialEq for SnapState { + fn eq(&self, other: &SnapState) -> bool { + match (self, other) { + (&SnapState::Relax, &SnapState::Relax) + | (&SnapState::Generating { .. }, &SnapState::Generating { .. }) => true, + (&SnapState::Generated(ref snap1), &SnapState::Generated(ref snap2)) => { + *snap1 == *snap2 + } + _ => false, + } + } +} + +pub struct GenSnapTask { + region_id: u64, + // Fill it when you are going to generate the snapshot. + // index used to check if the gen task should be canceled. + index: Arc, + // Set it to true to cancel the task if necessary. 
+ canceled: Arc, + // indicates whether the snapshot is triggered due to load balance + for_balance: bool, +} + +impl GenSnapTask { + pub fn new(region_id: u64, index: Arc, canceled: Arc) -> GenSnapTask { + GenSnapTask { + region_id, + index, + canceled, + for_balance: false, + } + } + + pub fn set_for_balance(&mut self) { + self.for_balance = true; + } +} + +impl Debug for GenSnapTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GenSnapTask") + .field("region_id", &self.region_id) + .finish() + } +} + +impl Peer { + pub fn on_snapshot_generated(&mut self, snapshot: Box) { + if self.storage_mut().on_snapshot_generated(snapshot) { + self.raft_group_mut().ping(); + self.set_has_ready(); + } + } +} + +impl Apply { + /// Handle snapshot. + /// + /// Will schedule a task to read worker and then generate a snapshot + /// asynchronously. + pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + // Flush before do snapshot. + if snap_task.canceled.load(Ordering::SeqCst) { + return; + } + self.flush(); + + // Send generate snapshot task to region worker. + let (last_applied_index, last_applied_term) = self.apply_progress(); + snap_task.index.store(last_applied_index, Ordering::SeqCst); + let gen_tablet_sanp_task = ReadTask::GenTabletSnapshot { + region_id: snap_task.region_id, + tablet: self.tablet().clone(), + region_state: self.region_state().clone(), + last_applied_term, + last_applied_index, + for_balance: snap_task.for_balance, + canceled: snap_task.canceled.clone(), + }; + if let Err(e) = self.read_scheduler().schedule(gen_tablet_sanp_task) { + error!( + self.logger, + "schedule snapshot failed"; + "error" => ?e, + ); + snap_task.canceled.store(true, Ordering::SeqCst); + } + } +} + +impl Storage { + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no + /// unavailable snapshot. 
+ pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + let mut snap_state = self.snap_state_mut(); + match *snap_state { + SnapState::Generating { ref canceled, .. } => { + if canceled.load(Ordering::SeqCst) { + self.cancel_generating_snap(None); + } else { + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + } + SnapState::Generated(ref s) => { + let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; + if self.validate_snap(&snap, request_index) { + return Ok(*snap); + } + } + _ => {} + } + + if SnapState::Relax != *snap_state { + panic!( + "{:?} unexpected state: {:?}", + self.logger().list(), + *snap_state + ); + } + + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + let canceled = Arc::new(AtomicBool::new(false)); + let index = Arc::new(AtomicU64::new(0)); + *snap_state = SnapState::Generating { + canceled: canceled.clone(), + index: index.clone(), + }; + + let task = GenSnapTask::new(self.region().get_id(), index, canceled); + let mut gen_snap_task = self.gen_snap_task_mut(); + assert!(gen_snap_task.is_none()); + *gen_snap_task = Box::new(Some(task)); + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + } + + /// Validate the snapshot. Returns true if it's valid. + fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { + let idx = snap.get_metadata().get_index(); + // TODO(nolouch): check tuncated index + if idx < request_index { + // stale snapshot, should generate again. 
+ info!( + self.logger(), + "snapshot is stale, generate again"; + "snap_index" => idx, + "request_index" => request_index, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.stale.inc(); + return false; + } + + let mut snap_data = RaftSnapshotData::default(); + if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { + error!( + self.logger(), + "failed to decode snapshot, it may be corrupted"; + "err" => ?e, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.decode.inc(); + return false; + } + let snap_epoch = snap_data.get_region().get_region_epoch(); + let latest_epoch = self.region().get_region_epoch(); + if snap_epoch.get_conf_ver() < latest_epoch.get_conf_ver() { + info!( + self.logger(), + "snapshot epoch is stale"; + "snap_epoch" => ?snap_epoch, + "latest_epoch" => ?latest_epoch, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.epoch.inc(); + return false; + } + + true + } + + /// Cancel generating snapshot. + pub fn cancel_generating_snap(&self, compact_to: Option) { + let mut snap_state = self.snap_state_mut(); + let SnapState::Generating { + ref canceled, + ref index, + } = *snap_state else { return }; + + if let Some(idx) = compact_to { + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || idx <= snap_index + 1 { + return; + } + } + canceled.store(true, Ordering::SeqCst); + *snap_state = SnapState::Relax; + self.gen_snap_task_mut().take(); + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + } + + /// Try to switch snap state to generated. only `Generating` can switch to + /// `Generated`. + /// TODO: make the snap state more clearer, the snapshot must be consumed. 
+ pub fn on_snapshot_generated(&self, snap: Box) -> bool { + let mut snap_state = self.snap_state_mut(); + let SnapState::Generating { + ref canceled, + ref index, + } = *snap_state else { return false }; + + if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { + return false; + } + // Should changed `SnapState::Generated` to `SnapState::Relax` when the + // snap is consumed or canceled. Such as leader changed, the state of generated + // should be reset. + *snap_state = SnapState::Generated(snap); + true + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 068e5124c0c..ff29b3ba029 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -4,8 +4,9 @@ use std::mem; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; -use raftstore::store::fsm::apply::DEFAULT_APPLY_WB_SIZE; +use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; +use tikv_util::worker::Scheduler; use super::Peer; use crate::{ @@ -34,6 +35,7 @@ pub struct Apply { region_state: RegionLocalState, res_reporter: R, + read_scheduler: Scheduler>, pub(crate) logger: Logger, } @@ -44,6 +46,7 @@ impl Apply { region_state: RegionLocalState, res_reporter: R, mut remote_tablet: CachedTablet, + read_scheduler: Scheduler>, logger: Logger, ) -> Self { Apply { @@ -57,6 +60,7 @@ impl Apply { applied_term: 0, admin_cmd_result: vec![], region_state, + read_scheduler, res_reporter, logger, } @@ -96,6 +100,11 @@ impl Apply { (self.applied_index, self.applied_term) } + #[inline] + pub fn read_scheduler(&self) -> &Scheduler> { + &self.read_scheduler + } + #[inline] pub fn region_state(&self) -> &RegionLocalState { &self.region_state @@ -116,6 +125,11 @@ impl Apply { self.tablet = tablet; } + #[inline] + pub fn tablet(&self) -> &EK { + &self.tablet + } + #[inline] pub fn peer(&self) -> 
&metapb::Peer { &self.peer diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 650c410cef9..8619b8cf2d4 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -28,7 +28,7 @@ const REGION_READ_PROGRESS_CAP: usize = 128; /// A peer that delegates commands between state machine and raft. pub struct Peer { - raft_group: RawNode>, + raft_group: RawNode>, tablet: CachedTablet, /// We use a cache for looking up peers. Not all peers exist in region's /// peer list, for example, an isolated peer may need to send/receive @@ -67,7 +67,7 @@ impl Peer { pub fn new( cfg: &Config, tablet_factory: &dyn TabletFactory, - storage: Storage, + storage: Storage, ) -> Result { let logger = storage.logger().clone(); @@ -178,7 +178,7 @@ impl Peer { } #[inline] - pub fn storage(&self) -> &Storage { + pub fn storage(&self) -> &Storage { self.raft_group.store() } @@ -203,7 +203,7 @@ impl Peer { } #[inline] - pub fn storage_mut(&mut self) -> &mut Storage { + pub fn storage_mut(&mut self) -> &mut Storage { self.raft_group.mut_store() } @@ -218,12 +218,12 @@ impl Peer { } #[inline] - pub fn entry_storage(&self) -> &EntryStorage { + pub fn entry_storage(&self) -> &EntryStorage { self.raft_group.store().entry_storage() } #[inline] - pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { self.raft_group.mut_store().entry_storage_mut() } @@ -238,12 +238,12 @@ impl Peer { } #[inline] - pub fn raft_group(&self) -> &RawNode> { + pub fn raft_group(&self) -> &RawNode> { &self.raft_group } #[inline] - pub fn raft_group_mut(&mut self) -> &mut RawNode> { + pub fn raft_group_mut(&mut self) -> &mut RawNode> { &mut self.raft_group } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b08624b1185..19a52d4c5a2 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ 
b/components/raftstore-v2/src/raft/storage.rs @@ -1,8 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::fmt::{self, Debug, Formatter}; +use std::{ + cell::{RefCell, RefMut}, + fmt::{self, Debug, Formatter}, + sync::{mpsc::Receiver, Arc}, +}; -use engine_traits::{RaftEngine, RaftLogBatch}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, Region}, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, @@ -11,13 +15,14 @@ use raft::{ eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{ - util, EntryStorage, RaftlogFetchTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, -}; -use slog::{o, Logger}; +use raftstore::store::{util, EntryStorage, ReadTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use slog::{info, o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; -use crate::Result; +use crate::{ + operation::{GenSnapTask, SnapState}, + Result, +}; pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { let region_id = region.get_id(); @@ -49,8 +54,8 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul /// A storage for raft. /// /// It's similar to `PeerStorage` in v1. -pub struct Storage { - entry_storage: EntryStorage, +pub struct Storage { + entry_storage: EntryStorage, peer: metapb::Peer, region_state: RegionLocalState, /// Whether states has been persisted before. If a peer is just created by @@ -58,9 +63,13 @@ pub struct Storage { /// at least once dispite whether the state changes since create. ever_persisted: bool, logger: Logger, + + /// Snapshot part. 
+ snap_state: RefCell, + gen_snap_task: RefCell>>, } -impl Debug for Storage { +impl Debug for Storage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -71,14 +80,14 @@ impl Debug for Storage { } } -impl Storage { +impl Storage { #[inline] - pub fn entry_storage(&self) -> &EntryStorage { + pub fn entry_storage(&self) -> &EntryStorage { &self.entry_storage } #[inline] - pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { &mut self.entry_storage } @@ -101,9 +110,19 @@ impl Storage { pub fn logger(&self) -> &Logger { &self.logger } + + #[inline] + pub fn snap_state_mut(&self) -> RefMut<'_, SnapState> { + self.snap_state.borrow_mut() + } + + #[inline] + pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { + self.gen_snap_task.borrow_mut() + } } -impl Storage { +impl Storage { /// Creates a new storage with uninit states. /// /// This should only be used for creating new peer from raft message. @@ -111,7 +130,7 @@ impl Storage { store_id: u64, region: Region, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, logger: &Logger, ) -> Result { let mut region_state = RegionLocalState::default(); @@ -122,7 +141,7 @@ impl Storage { RaftLocalState::default(), RaftApplyState::default(), engine, - log_fetch_scheduler, + read_scheduler, false, logger, ) @@ -136,9 +155,9 @@ impl Storage { region_id: u64, store_id: u64, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, logger: &Logger, - ) -> Result>> { + ) -> Result>> { let region_state = match engine.get_region_state(region_id) { Ok(Some(s)) => s, res => { @@ -174,7 +193,7 @@ impl Storage { raft_state, apply_state, engine, - log_fetch_scheduler, + read_scheduler, true, logger, ) @@ -187,7 +206,7 @@ impl Storage { raft_state: RaftLocalState, apply_state: RaftApplyState, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, persisted: bool, logger: &Logger, ) -> Result { 
@@ -206,7 +225,7 @@ impl Storage { raft_state, apply_state, region, - log_fetch_scheduler, + read_scheduler, )?; Ok(Storage { @@ -215,6 +234,8 @@ impl Storage { region_state, ever_persisted: persisted, logger, + snap_state: RefCell::new(SnapState::Relax), + gen_snap_task: RefCell::new(Box::new(None)), }) } @@ -223,6 +244,11 @@ impl Storage { self.entry_storage.raft_state() } + #[inline] + pub fn read_scheduler(&self) -> Scheduler> { + self.entry_storage.read_scheduler() + } + #[inline] pub fn apply_state(&self) -> &RaftApplyState { self.entry_storage.apply_state() @@ -241,6 +267,19 @@ impl Storage { self.ever_persisted = true; } + #[inline] + pub fn take_gen_snap_task(&mut self) -> Option { + self.gen_snap_task.get_mut().take() + } + + #[inline] + pub fn tablet_index(&self) -> u64 { + match self.region_state.get_state() { + PeerState::Tombstone | PeerState::Applying => 0, + _ => self.region_state.get_tablet_index(), + } + } + #[inline] pub fn set_region_state(&mut self, state: RegionLocalState) { self.region_state = state; @@ -253,7 +292,7 @@ impl Storage { } } -impl raft::Storage for Storage { +impl raft::Storage for Storage { fn initial_state(&self) -> raft::Result { let hard_state = self.raft_state().get_hard_state().clone(); // We will persist hard state no matter if it's initialized or not in @@ -306,24 +345,68 @@ impl raft::Storage for Storage { } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { - Err(raft::Error::Store( - raft::StorageError::SnapshotTemporarilyUnavailable, - )) + self.snapshot(request_index, to) } } #[cfg(test)] mod tests { - use engine_traits::{RaftEngine, RaftEngineReadOnly, RaftLogBatch}; + use std::{ + sync::mpsc::{sync_channel, SyncSender}, + time::Duration, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactoryV2}, + raft::RaftTestEngine, + }; + use engine_traits::{ + KvEngine, OpenOptions, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletFactory, ALL_CFS, + }; use 
kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, }; - use raftstore::store::{RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; + use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; + use raftstore::store::{ + AsyncReadNotifier, FetchedLogs, ReadRunner, ReadTask, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, + }; + use slog::o; use tempfile::TempDir; + use tikv_util::worker::{Runnable, Worker}; - #[test] - fn test_write_initial_states() { + use super::*; + use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes, tablet::CachedTablet}; + + #[derive(Clone)] + pub struct TestRouter { + ch: SyncSender>, + } + + impl TestRouter { + pub fn new() -> (Self, Receiver>) { + let (tx, rx) = sync_channel(1); + (Self { ch: tx }, rx) + } + } + + impl AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, _fetched_logs: FetchedLogs) { + unreachable!(); + } + + fn notify_snapshot_generated(&self, _region_id: u64, snapshot: Box) { + self.ch.send(snapshot).unwrap(); + } + } + + impl ApplyResReporter for TestRouter { + fn report(&self, _res: ApplyRes) {} + } + + fn new_region() -> Region { let mut region = Region::default(); region.set_id(4); let mut p = Peer::default(); @@ -332,12 +415,17 @@ mod tests { region.mut_peers().push(p); region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(4); + region + } + #[test] + fn test_write_initial_states() { + let region = new_region(); let path = TempDir::new().unwrap(); let engine = engine_test::new_temp_engine(&path); let raft_engine = &engine.raft; let mut wb = raft_engine.log_batch(10); - super::write_initial_states(&mut wb, region.clone()).unwrap(); + write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); @@ -358,4 +446,75 @@ mod tests { assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); } + + #[test] + fn 
test_storage_create_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path().join("tablet").as_path(), + ops, + cf_opts, + )); + // create tablet with region_id 1 + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + // setup read runner worker and peer storage + let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let mut s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) + .unwrap() + .unwrap(); + let (router, rx) = TestRouter::new(); + worker.start(ReadRunner::new(router.clone(), raft_engine)); + // setup peer applyer + let mut apply = Apply::new( + region.get_peers()[0].clone(), + RegionLocalState::default(), + router, + CachedTablet::new(Some(tablet)), + sched, + logger, + ); + + // test get snapshot + let snap = s.snapshot(0, 0); + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.schedule_gen_snapshot(gen_task); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.on_snapshot_generated(res); + let snap = match *s.snap_state.borrow() { + SnapState::Generated(ref snap) => *snap.clone(), + ref s => panic!("unexpected state: {:?}", s), + }; + assert_eq!(snap.get_metadata().get_index(), 0); 
+ assert_eq!(snap.get_metadata().get_term(), 0); + assert!(snap.get_data().is_empty()); + + // test cancel snapshot + let snap = s.snapshot(0, 0); + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.schedule_gen_snapshot(gen_task); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + assert_eq!(*s.snap_state.borrow(), SnapState::Relax); + + // TODO: add test get twice snapshot and cancel once + } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 78abef13247..7c02ee10243 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -8,15 +8,20 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raftstore::store::{FetchedLogs, LogFetchedNotifier, RegionSnapshot}; +use raft::eraftpb::Snapshot as RaftSnapshot; +use raftstore::store::{AsyncReadNotifier, FetchedLogs, RegionSnapshot}; use slog::Logger; use super::PeerMsg; use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; -impl LogFetchedNotifier for StoreRouter { - fn notify(&self, region_id: u64, fetched: FetchedLogs) { - let _ = self.force_send(region_id, PeerMsg::FetchedLogs(fetched)); +impl AsyncReadNotifier for StoreRouter { + fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { + let _ = self.force_send(region_id, PeerMsg::LogsFetched(fetched_logs)); + } + + fn notify_snapshot_generated(&self, region_id: u64, snapshot: Box) { + let _ = self.force_send(region_id, PeerMsg::SnapshotGenerated(snapshot)); } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index e9893bad968..1507d404297 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ 
b/components/raftstore-v2/src/router/internal_message.rs @@ -1,13 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use kvproto::raft_serverpb::RegionLocalState; use raftstore::store::fsm::ChangePeer; -use crate::operation::{AdminCmdResult, CommittedEntries}; +use crate::operation::{AdminCmdResult, CommittedEntries, GenSnapTask}; #[derive(Debug)] pub enum ApplyTask { CommittedEntries(CommittedEntries), + Snapshot(GenSnapTask), } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index c607e389135..64af4d41d71 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,7 +3,9 @@ // #[PerformanceCriticalPath] use std::fmt; +use engine_traits::Snapshot; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use raft::eraftpb::Snapshot as RaftSnapshot; use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs}; use tikv_util::time::Instant; @@ -123,7 +125,8 @@ pub enum PeerMsg { Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. ApplyRes(ApplyRes), - FetchedLogs(FetchedLogs), + LogsFetched(FetchedLogs), + SnapshotGenerated(Box), /// Start the FSM. Start, /// A message only used to notify a peer. 
@@ -173,7 +176,8 @@ impl fmt::Debug for PeerMsg { "Persisted peer_id {}, ready_number {}", peer_id, ready_number ), - PeerMsg::FetchedLogs(fetched) => write!(fmt, "FetchedLogs {:?}", fetched), + PeerMsg::LogsFetched(fetched) => write!(fmt, "LogsFetched {:?}", fetched), + PeerMsg::SnapshotGenerated(_) => write!(fmt, "SnapshotGenerated"), PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), diff --git a/components/raftstore/src/store/async_io/mod.rs b/components/raftstore/src/store/async_io/mod.rs index c9b2fad532f..56cc2d576e1 100644 --- a/components/raftstore/src/store/async_io/mod.rs +++ b/components/raftstore/src/store/async_io/mod.rs @@ -1,4 +1,5 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +pub mod read; pub mod write; pub mod write_router; diff --git a/components/raftstore/src/store/worker/raftlog_fetch.rs b/components/raftstore/src/store/async_io/read.rs similarity index 58% rename from components/raftstore/src/store/worker/raftlog_fetch.rs rename to components/raftstore/src/store/async_io/read.rs index b3de87f7715..30ce2102040 100644 --- a/components/raftstore/src/store/worker/raftlog_fetch.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -1,16 +1,21 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::fmt; +use std::{ + fmt, + marker::PhantomData, + sync::{atomic::AtomicBool, Arc}, +}; -use engine_traits::RaftEngine; +use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; -use raft::GetEntriesContext; +use kvproto::raft_serverpb::RegionLocalState; +use raft::{eraftpb::Snapshot as RaftSnapshot, GetEntriesContext}; use tikv_util::worker::Runnable; use crate::store::{RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}; -pub enum Task { - PeerStorage { +pub enum ReadTask { + FetchLogs { region_id: u64, context: GetEntriesContext, low: u64, @@ -19,13 +24,23 @@ pub enum Task { tried_cnt: usize, term: u64, }, - // More to support, suck as fetch entries ayschronously when apply and schedule merge + + // GenTabletSnapshot is used to generate tablet snapshot. + GenTabletSnapshot { + region_id: u64, + tablet: EK, + region_state: RegionLocalState, + last_applied_term: u64, + last_applied_index: u64, + canceled: Arc, + for_balance: bool, + }, } -impl fmt::Display for Task { +impl fmt::Display for ReadTask { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Task::PeerStorage { + ReadTask::FetchLogs { region_id, context, low, @@ -38,6 +53,9 @@ impl fmt::Display for Task { "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", region_id, low, high, max_size, context, tried_cnt, term, ), + ReadTask::GenTabletSnapshot { region_id, .. } => { + write!(f, "Snapshot gen for {}", region_id) + } } } } @@ -49,38 +67,42 @@ pub struct FetchedLogs { } /// A router for receiving fetched result. 
-pub trait LogFetchedNotifier: Send { - fn notify(&self, region_id: u64, fetched: FetchedLogs); +pub trait AsyncReadNotifier: Send { + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); + fn notify_snapshot_generated(&self, region_id: u64, snapshot: Box); } -pub struct Runner +pub struct ReadRunner where + EK: KvEngine, ER: RaftEngine, - N: LogFetchedNotifier, + N: AsyncReadNotifier, { notifier: N, raft_engine: ER, + _phantom: PhantomData, } -impl Runner { - pub fn new(notifier: N, raft_engine: ER) -> Runner { - Runner { +impl ReadRunner { + pub fn new(notifier: N, raft_engine: ER) -> ReadRunner { + ReadRunner { notifier, raft_engine, + _phantom: PhantomData, } } } -impl Runnable for Runner +impl Runnable for ReadRunner where + EK: KvEngine, ER: RaftEngine, - N: LogFetchedNotifier, + N: AsyncReadNotifier, { - type Task = Task; - - fn run(&mut self, task: Task) { + type Task = ReadTask; + fn run(&mut self, task: ReadTask) { match task { - Task::PeerStorage { + ReadTask::FetchLogs { region_id, low, high, @@ -104,7 +126,7 @@ where .map(|c| (*c as u64) != high - low) .unwrap_or(false); fail_point!("worker_async_fetch_raft_log"); - self.notifier.notify( + self.notifier.notify_logs_fetched( region_id, FetchedLogs { context, @@ -119,6 +141,11 @@ where }, ); } + ReadTask::GenTabletSnapshot { region_id, .. 
} => { + // TODO: implement generate tablet snapshot for raftstore v2 + self.notifier + .notify_snapshot_generated(region_id, Box::new(RaftSnapshot::default())); + } } } } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index a0828d12332..fcc3d535aa2 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -30,7 +30,7 @@ use super::{ metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use crate::{bytes_capacity, store::worker::RaftlogFetchTask, Result}; +use crate::{bytes_capacity, store::ReadTask, Result}; const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; @@ -622,7 +622,7 @@ impl Default for CacheWarmupState { } /// A subset of `PeerStorage` that focus on accessing log entries. -pub struct EntryStorage { +pub struct EntryStorage { region_id: u64, peer_id: u64, raft_engine: ER, @@ -631,20 +631,20 @@ pub struct EntryStorage { apply_state: RaftApplyState, last_term: u64, applied_term: u64, - raftlog_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, raftlog_fetch_stats: AsyncFetchStats, async_fetch_results: RefCell>, cache_warmup_state: Option, } -impl EntryStorage { +impl EntryStorage { pub fn new( peer_id: u64, raft_engine: ER, mut raft_state: RaftLocalState, apply_state: RaftApplyState, region: &metapb::Region, - raftlog_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, ) -> Result { if let Err(e) = validate_states(region.id, &raft_engine, &mut raft_state, &apply_state) { return Err(box_err!( @@ -665,7 +665,7 @@ impl EntryStorage { apply_state, last_term, applied_term, - raftlog_fetch_scheduler, + read_scheduler, raftlog_fetch_stats: AsyncFetchStats::default(), async_fetch_results: RefCell::new(HashMap::default()), cache_warmup_state: None, @@ -862,8 +862,8 @@ impl EntryStorage { self.async_fetch_results .borrow_mut() 
.insert(low, RaftlogFetchState::Fetching(Instant::now_coarse())); - self.raftlog_fetch_scheduler - .schedule(RaftlogFetchTask::PeerStorage { + self.read_scheduler + .schedule(ReadTask::FetchLogs { region_id, context, low, @@ -1046,7 +1046,7 @@ impl EntryStorage { // Append the given entries to the raft log using previous last index or // self.last_index. - pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { + pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { if entries.is_empty() { return; } @@ -1242,13 +1242,17 @@ impl EntryStorage { pub fn clear(&mut self) { self.cache = EntryCache::default(); } + + pub fn read_scheduler(&self) -> Scheduler> { + self.read_scheduler.clone() + } } #[cfg(test)] pub mod tests { use std::sync::mpsc; - use engine_test::raft::RaftTestEngine; + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use engine_traits::RaftEngineReadOnly; use protobuf::Message; use raft::{GetEntriesContext, StorageError}; @@ -1273,7 +1277,7 @@ pub mod tests { } } - pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { + pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { assert_eq!(store.cache.cache, exp_ents); for e in exp_ents { let entry = store diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 63761321405..a800832ba82 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -94,11 +94,10 @@ use crate::{ util::{KeysInfoFormatter, LeaseState}, worker::{ new_change_peer_v2_request, Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, - GcSnapshotTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, - RegionTask, SplitCheckTask, + GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, - ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, + 
ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, ReadTask, SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, @@ -245,7 +244,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, ) -> Result> { @@ -304,7 +303,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region_id: u64, peer: metapb::Peer, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c83309011ac..1179a535c7d 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -72,6 +72,7 @@ use crate::{ }, store::{ async_io::{ + read::{ReadRunner, ReadTask}, write::{StoreWriters, Worker as WriteWorker, WriteMsg}, write_router::WriteSenders, }, @@ -95,9 +96,9 @@ use crate::{ worker::{ AutoSplitController, CleanupRunner, CleanupSstRunner, CleanupSstTask, CleanupTask, CompactRunner, CompactTask, ConsistencyCheckRunner, ConsistencyCheckTask, - GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogFetchRunner, RaftlogFetchTask, - RaftlogGcRunner, RaftlogGcTask, ReadDelegate, RefreshConfigRunner, RefreshConfigTask, - RegionRunner, RegionTask, SplitCheckTask, + GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogGcRunner, RaftlogGcTask, + ReadDelegate, RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, + SplitCheckTask, }, Callback, CasualMessage, GlobalReplicationState, InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, StoreMsg, StoreTick, @@ -474,7 +475,7 @@ where // handle Compact, CleanupSst task pub cleanup_scheduler: Scheduler, pub raftlog_gc_scheduler: Scheduler, - pub raftlog_fetch_scheduler: Scheduler, + pub raftlog_fetch_scheduler: 
Scheduler>, pub region_scheduler: Scheduler>, pub apply_router: ApplyRouter, pub router: RaftRouter, @@ -1081,7 +1082,7 @@ pub struct RaftPollerBuilder { split_check_scheduler: Scheduler, cleanup_scheduler: Scheduler, raftlog_gc_scheduler: Scheduler, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, @@ -1531,7 +1532,7 @@ impl RaftBatchSystem { let raftlog_fetch_scheduler = workers.raftlog_fetch_worker.start( "raftlog-fetch-worker", - RaftlogFetchRunner::new(self.router.clone(), engines.raft.clone()), + ReadRunner::new(self.router.clone(), engines.raft.clone()), ); let compact_runner = CompactRunner::new(engines.kv.clone()); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 7ab47cc90c6..ea8ccc3219f 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -48,6 +48,7 @@ make_auto_flush_static_metric! 
{ stale, decode, epoch, + cancel, } pub label_enum RegionHashType { diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index a60eb087562..2078ccabafc 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -30,6 +30,7 @@ mod worker; pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ + read::{AsyncReadNotifier, FetchedLogs, ReadRunner, ReadTask}, write::{ ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask, @@ -76,9 +77,8 @@ pub use self::{ util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, - CheckLeaderTask, FetchedLogs, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LocalReaderCore, LogFetchedNotifier, PdTask, - RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, + CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, + LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, TLS_LOCAL_READ_METRICS, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 6851ebd30d8..b86700af8e6 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -26,9 +26,7 @@ use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; -use super::{ - local_metrics::TimeTracker, region_meta::RegionMeta, worker::FetchedLogs, RegionSnapshot, -}; +use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ 
fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index b06eb5c0c3f..b9cf76889b4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -85,7 +85,7 @@ use crate::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason, RoleChange}, errors::RAFTSTORE_IS_BUSY, store::{ - async_io::{write::WriteMsg, write_router::WriteRouter}, + async_io::{read::ReadTask, write::WriteMsg, write_router::WriteRouter}, fsm::{ apply::{self, CatchUpLogs}, store::{PollContext, RaftRouter}, @@ -97,8 +97,8 @@ use crate::{ txn_ext::LocksStatus, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ - HeartbeatTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadExecutor, - ReadProgress, RegionTask, SplitCheckTask, + HeartbeatTask, RaftlogGcTask, ReadDelegate, ReadExecutor, ReadProgress, RegionTask, + SplitCheckTask, }, Callback, Config, GlobalReplicationState, PdTask, ReadCallback, ReadIndexContext, ReadResponse, TxnExt, WriteCallback, RAFT_INIT_LOG_INDEX, @@ -1041,7 +1041,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, peer: metapb::Peer, diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 081149a6889..a53ca1e9258 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -36,8 +36,11 @@ use tikv_util::{ use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager}; use crate::{ store::{ - async_io::write::WriteTask, entry_storage::EntryStorage, fsm::GenSnapTask, - peer::PersistSnapshotResult, util, worker::RaftlogFetchTask, + async_io::{read::ReadTask, write::WriteTask}, + entry_storage::EntryStorage, + 
fsm::GenSnapTask, + peer::PersistSnapshotResult, + util, }, Error, Result, }; @@ -218,13 +221,13 @@ where region_scheduler: Scheduler>, snap_tried_cnt: RefCell, - entry_storage: EntryStorage, + entry_storage: EntryStorage, pub tag: String, } impl Deref for PeerStorage { - type Target = EntryStorage; + type Target = EntryStorage; #[inline] fn deref(&self) -> &Self::Target { @@ -286,7 +289,7 @@ where engines: Engines, region: &metapb::Region, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, peer_id: u64, tag: String, ) -> Result> { @@ -1136,21 +1139,19 @@ pub mod tests { use crate::{ coprocessor::CoprocessorHost, store::{ - async_io::write::write_to_db_for_test, + async_io::{read::ReadRunner, write::write_to_db_for_test}, bootstrap_store, entry_storage::tests::validate_cache, fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, - worker::{ - make_region_worker_raftstore_cfg, FetchedLogs, LogFetchedNotifier, - RaftlogFetchRunner, RegionRunner, RegionTask, - }, + worker::{make_region_worker_raftstore_cfg, RegionRunner, RegionTask}, + AsyncReadNotifier, FetchedLogs, }, }; fn new_storage( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ) -> PeerStorage { let kv_db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); @@ -1183,7 +1184,7 @@ pub mod tests { pub fn new_storage_from_ents( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ents: &[Entry], ) -> PeerStorage { @@ -1378,10 +1379,14 @@ pub mod tests { } } - impl LogFetchedNotifier for TestRouter { - fn notify(&self, _region_id: u64, fetched_logs: FetchedLogs) { + impl AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, fetched_logs: FetchedLogs) { self.ch.send(fetched_logs).unwrap(); } + + fn notify_snapshot_generated(&self, _region_id: 
u64, _snapshot: Box) { + unreachable!(); + } } #[test] @@ -1455,7 +1460,7 @@ pub mod tests { let raftlog_fetch_scheduler = raftlog_fetch_worker.scheduler(); let mut store = new_storage_from_ents(region_scheduler, raftlog_fetch_scheduler, &td, &ents); - raftlog_fetch_worker.start(RaftlogFetchRunner::new(router, store.engines.raft.clone())); + raftlog_fetch_worker.start(ReadRunner::new(router, store.engines.raft.clone())); store.compact_entry_cache(5); let mut e = store.entries(lo, hi, maxsize, GetEntriesContext::empty(true)); if e == Err(raft::Error::Store( diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index 19b825ac20c..d2bbe921eea 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -6,9 +6,10 @@ use std::sync::mpsc; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; +use raft::eraftpb::Snapshot as RaftSnapshot; use tikv_util::{error, warn}; -use super::worker::{FetchedLogs, LogFetchedNotifier}; +use super::{AsyncReadNotifier, FetchedLogs}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, DiscardReason, Error, Result, @@ -173,10 +174,15 @@ where } } -impl LogFetchedNotifier for RaftRouter { +impl AsyncReadNotifier for RaftRouter { #[inline] - fn notify(&self, region_id: u64, fetched: FetchedLogs) { + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs) { // Ignore region not found as it may be removed. 
let _ = self.significant_send(region_id, SignificantMsg::RaftlogFetched(fetched)); } + + #[inline] + fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: Box) { + unreachable!() + } } diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 4335369c3cb..cd7680ebc4a 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -8,7 +8,6 @@ mod compact; mod consistency_check; mod metrics; mod pd; -mod raftlog_fetch; mod raftlog_gc; mod read; mod refresh_config; @@ -31,9 +30,6 @@ pub use self::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, }, - raftlog_fetch::{ - FetchedLogs, LogFetchedNotifier, Runner as RaftlogFetchRunner, Task as RaftlogFetchTask, - }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ CachedReadDelegate, LocalReadContext, LocalReader, LocalReaderCore, From dd4299c6956f5f3472330e6fc8cc8fc16c4bd791 Mon Sep 17 00:00:00 2001 From: goldwind-ting <63939636+goldwind-ting@users.noreply.github.com> Date: Tue, 1 Nov 2022 18:20:00 +0800 Subject: [PATCH 301/676] fix typo (#13699) close tikv/tikv#13706 fix-typo: replace `threahold` with `threshold `. Signed-off-by: goldwind-ting <63939636+goldwind-ting@users.noreply.github.com> Co-authored-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/read_pool.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/read_pool.rs b/src/read_pool.rs index deb7336975c..4d9f7fd9264 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -32,10 +32,10 @@ use crate::{ // the duration to check auto-scale unified-thread-pool's thread const READ_POOL_THREAD_CHECK_DURATION: Duration = Duration::from_secs(10); // consider scale out read pool size if the average thread cpu usage is higher -// than this threahold. +// than this threshold. 
const READ_POOL_THREAD_HIGH_THRESHOLD: f64 = 0.8; // consider scale in read pool size if the average thread cpu usage is lower -// than this threahold. +// than this threshold. const READ_POOL_THREAD_LOW_THRESHOLD: f64 = 0.7; // avg running tasks per-thread that indicates read-pool is busy const RUNNING_TASKS_PER_THREAD_THRESHOLD: i64 = 3; From 497ae1b0a1f05dacdbe2f59d5b92ee99172e3a49 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 1 Nov 2022 19:24:00 +0800 Subject: [PATCH 302/676] raft_client: Report store unreachable once until being connected again (#13677) close tikv/tikv#13676 Avoid reporting store unreachable again and again as broadcasting is time-consuming and blocks raftstore. Only send store unreachable when the store is ever connected. Signed-off-by: Connor1996 Co-authored-by: Xinye Tao --- components/batch-system/src/metrics.rs | 7 + components/batch-system/src/router.rs | 11 +- components/raftstore/src/store/fsm/peer.rs | 24 ++- components/raftstore/src/store/fsm/store.rs | 14 +- .../raftstore/src/store/local_metrics.rs | 6 + components/raftstore/src/store/metrics.rs | 39 +++-- .../test_raftstore/src/common-test.toml | 3 +- metrics/grafana/tikv_details.json | 95 ++++++++++- src/server/config.rs | 20 ++- src/server/raft_client.rs | 158 +++++++++++------- src/server/server.rs | 41 +++-- tests/integrations/config/mod.rs | 3 +- tests/integrations/config/test-custom.toml | 1 + tests/integrations/server/raft_client.rs | 62 ++++++- 14 files changed, 362 insertions(+), 122 deletions(-) diff --git a/components/batch-system/src/metrics.rs b/components/batch-system/src/metrics.rs index 9edcd656bf4..a4728f32ad7 100644 --- a/components/batch-system/src/metrics.rs +++ b/components/batch-system/src/metrics.rs @@ -10,4 +10,11 @@ lazy_static! 
{ &["type"] ) .unwrap(); + + pub static ref BROADCAST_NORMAL_DURATION: Histogram = + register_histogram!( + "tikv_broadcast_normal_duration_seconds", + "Duration of broadcasting normals.", + exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s + ).unwrap(); } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 660ab014939..d96e65e1e99 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,17 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, Either}; +use tikv_util::{ + debug, info, + lru::LruCache, + time::{duration_to_sec, Instant}, + Either, +}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, mailbox::{BasicMailbox, Mailbox}, - metrics::CHANNEL_FULL_COUNTER_VEC, + metrics::*, }; /// A struct that traces the approximate memory usage of router. @@ -306,10 +311,12 @@ where /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { + let timer = Instant::now_coarse(); let mailboxes = self.normals.lock().unwrap(); for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } + BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed()) as f64); } /// Try to notify all FSMs that the cluster is being shutdown. 
diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index a800832ba82..b7f7b005137 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -55,7 +55,7 @@ use tikv_util::{ mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, is_learner, region_on_same_stores}, sys::{disk::DiskUsage, memory_usage_reaches_high_water}, - time::{monotonic_raw_now, Instant as TiInstant}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -605,6 +605,8 @@ where } pub fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); + let count = msgs.len(); for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { @@ -687,6 +689,12 @@ where } } self.on_loop_finished(); + self.ctx.raft_metrics.peer_msg_len.observe(count as f64); + self.ctx + .raft_metrics + .event_time + .peer_msg + .observe(duration_to_sec(timer.saturating_elapsed()) as f64); } #[inline] @@ -1382,7 +1390,7 @@ where SignificantMsg::CatchUpLogs(catch_up_logs) => { self.on_catch_up_logs_for_merge(catch_up_logs); } - SignificantMsg::StoreResolved { group_id, .. 
} => { + SignificantMsg::StoreResolved { group_id, store_id } => { let state = self.ctx.global_replication_state.lock().unwrap(); if state.status().get_mode() != ReplicationMode::DrAutoSync { return; @@ -1391,11 +1399,13 @@ where return; } drop(state); - self.fsm - .peer - .raft_group - .raft - .assign_commit_groups(&[(self.fsm.peer_id(), group_id)]); + if let Some(peer_id) = find_peer(self.region(), store_id).map(|p| p.get_id()) { + self.fsm + .peer + .raft_group + .raft + .assign_commit_groups(&[(peer_id, group_id)]); + } } SignificantMsg::CaptureChange { cmd, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 1179a535c7d..2bb2ea636e1 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -685,7 +685,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_tick(&mut self, tick: StoreTick) { - let t = TiInstant::now_coarse(); + let timer = TiInstant::now_coarse(); match tick { StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat_tick(), StoreTick::SnapGc => self.on_snap_mgr_gc(), @@ -694,8 +694,10 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreTick::ConsistencyCheck => self.on_consistency_check_tick(), StoreTick::CleanupImportSst => self.on_cleanup_import_sst_tick(), } - let elapsed = t.saturating_elapsed(); - RAFT_EVENT_DURATION + let elapsed = timer.saturating_elapsed(); + self.ctx + .raft_metrics + .event_time .get(tick.tag()) .observe(duration_to_sec(elapsed) as f64); slow_log!( @@ -707,6 +709,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); for m in msgs.drain(..) 
{ match m { StoreMsg::Tick(tick) => self.on_tick(tick), @@ -757,6 +760,11 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreMsg::GcSnapshotFinish => self.register_snap_mgr_gc_tick(), } } + self.ctx + .raft_metrics + .event_time + .store_msg + .observe(duration_to_sec(timer.saturating_elapsed()) as f64); } fn start(&mut self, store: metapb::Store) { diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index aa33ae49fea..1648bd345ca 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -82,6 +82,8 @@ pub struct RaftMetrics { pub store_time: LocalHistogram, pub propose_wait_time: LocalHistogram, pub process_ready: LocalHistogram, + pub event_time: RaftEventDurationVec, + pub peer_msg_len: LocalHistogram, pub commit_log: LocalHistogram, pub write_block_wait: LocalHistogram, @@ -117,6 +119,8 @@ impl RaftMetrics { process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), + event_time: RaftEventDurationVec::from(&RAFT_EVENT_DURATION_VEC), + peer_msg_len: PEER_MSG_LEN.local(), commit_log: PEER_COMMIT_LOG_HISTOGRAM.local(), write_block_wait: STORE_WRITE_MSG_BLOCK_WAIT_DURATION_HISTOGRAM.local(), waterfall_metrics, @@ -149,6 +153,8 @@ impl RaftMetrics { self.store_time.flush(); self.propose_wait_time.flush(); self.process_ready.flush(); + self.event_time.flush(); + self.peer_msg_len.flush(); self.commit_log.flush(); self.write_block_wait.flush(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index ea8ccc3219f..2fe6fce580e 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -87,16 +87,6 @@ make_auto_flush_static_metric! 
{ finished, } - pub label_enum RaftEventDurationType { - compact_check, - pd_store_heartbeat, - snap_gc, - compact_lock_cf, - consistency_check, - cleanup_import_sst, - raft_engine_purge, - } - pub label_enum CompactionGuardAction { init, init_failure, @@ -104,10 +94,6 @@ make_auto_flush_static_metric! { skip_partition, } - pub struct RaftEventDuration : LocalHistogram { - "type" => RaftEventDurationType - } - pub struct RaftEntryFetches : LocalIntCounter { "type" => RaftEntryType } @@ -219,6 +205,18 @@ make_static_metric! { flashback_not_prepared } + pub label_enum RaftEventDurationType { + compact_check, + pd_store_heartbeat, + snap_gc, + compact_lock_cf, + consistency_check, + cleanup_import_sst, + raft_engine_purge, + peer_msg, + store_msg, + } + pub label_enum RaftLogGcSkippedReason { reserve_log, compact_idx_too_small, @@ -280,6 +278,10 @@ make_static_metric! { "type" => RaftInvalidProposal } + pub struct RaftEventDurationVec : LocalHistogram { + "type" => RaftEventDurationType + } + pub struct RaftLogGcSkippedCounterVec: LocalIntCounter { "reason" => RaftLogGcSkippedReason, } @@ -663,8 +665,13 @@ lazy_static! { &["type"], exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s ).unwrap(); - pub static ref RAFT_EVENT_DURATION: RaftEventDuration = - auto_flush_from!(RAFT_EVENT_DURATION_VEC, RaftEventDuration); + + pub static ref PEER_MSG_LEN: Histogram = + register_histogram!( + "tikv_raftstore_peer_msg_len", + "Length of peer msg.", + exponential_buckets(1.0, 2.0, 20).unwrap() // max 1000s + ).unwrap(); pub static ref RAFT_READ_INDEX_PENDING_DURATION: Histogram = register_histogram!( diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index 50e62f67d28..a121a6c1e0e 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -24,7 +24,8 @@ grpc-raft-conn-num = 1 # Disable stats concurrency. 
procinfo performs too bad without optimization, # disable it to save CPU for real tests. stats-concurrency = 0 -raft-client-backoff-step = "5ms" +raft-client-max-backoff = "100ms" +raft-client-initial-reconnect-backoff = "100ms" [server.labels] diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 471bf4bea2e..ccac776b508 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -14426,7 +14426,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The time consumed by raftstore events (P99).99", + "description": "The max time consumed by raftstore events", "editable": true, "error": false, "fieldConfig": { @@ -14466,7 +14466,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -14476,12 +14476,25 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_event_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "exemplar": true, + "expr": "histogram_quantile(1.0, sum(rate(tikv_raftstore_event_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", + "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "C", "step": 4 + }, + { + "exemplar": true, + "expr": "histogram_quantile(1.0, sum(rate(tikv_broadcast_normal_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "broadcast_normal", + "refId": "A", + "step": 4 } ], "thresholds": [ @@ -14496,7 +14509,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "0.99 Duration of raft store 
events", + "title": "Max duration of raft store events", "tooltip": { "msResolution": false, "shared": true, @@ -14557,7 +14570,7 @@ "h": 8, "w": 12, "x": 0, - "y": 20 + "y": 21 }, "heatmap": {}, "hideZeroBuckets": true, @@ -14603,6 +14616,78 @@ "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The length of peer msgs for each round handling", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572958, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_peer_msg_len_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "C", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Peer msg length distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null } ], "repeat": null, diff --git a/src/server/config.rs b/src/server/config.rs index 1959b77df00..ae5c70abe1d 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -90,9 +90,18 @@ pub struct Config { // When merge raft messages into a batch 
message, leave a buffer. #[online_config(skip)] pub raft_client_grpc_send_msg_buffer: usize, - #[online_config(skip)] pub raft_client_queue_size: usize, + // Test only + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(skip)] + pub raft_client_max_backoff: ReadableDuration, + // Test only + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(skip)] + pub raft_client_initial_reconnect_backoff: ReadableDuration, pub raft_msg_max_batch_size: usize, @@ -156,12 +165,6 @@ pub struct Config { #[online_config(skip)] pub forward_max_connections_per_address: usize, - // Test only. - #[doc(hidden)] - #[serde(skip_serializing)] - #[online_config(skip)] - pub raft_client_backoff_step: ReadableDuration, - #[doc(hidden)] #[online_config(skip)] /// When TiKV memory usage reaches `memory_usage_high_water` it will try to @@ -218,6 +221,8 @@ impl Default for Config { max_grpc_send_msg_len: DEFAULT_MAX_GRPC_SEND_MSG_LEN, raft_client_grpc_send_msg_buffer: 512 * 1024, raft_client_queue_size: 8192, + raft_client_max_backoff: ReadableDuration::secs(5), + raft_client_initial_reconnect_backoff: ReadableDuration::secs(1), raft_msg_max_batch_size: 128, grpc_compression_type: GrpcCompressionType::None, grpc_gzip_compression_level: DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL, @@ -254,7 +259,6 @@ impl Default for Config { heavy_load_threshold: 75, heavy_load_wait_duration: None, enable_request_batch: true, - raft_client_backoff_step: ReadableDuration::secs(1), reject_messages_on_memory_ratio: 0.2, background_thread_count, end_point_slow_log_threshold: ReadableDuration::secs(1), diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index bc0e8a59303..7b29976f218 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1,7 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - cmp, collections::VecDeque, ffi::CString, marker::{PhantomData, Unpin}, @@ -27,8 +26,8 @@ use futures::{ }; use futures_timer::Delay; use grpcio::{ - ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, RpcStatusCode, - WriteFlags, + Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, + RpcStatusCode, WriteFlags, }; use kvproto::{ raft_serverpb::{Done, RaftMessage}, @@ -550,10 +549,18 @@ where } } +#[derive(PartialEq)] +enum RaftCallRes { + // the call is not supported, probably due to visiting to older version TiKV + Fallback, + // the connection is aborted or closed + Disconnected, +} + struct RaftCall { sender: AsyncRaftSender, receiver: ClientCStreamReceiver, - lifetime: Option>, + lifetime: Option>, store_id: u64, } @@ -563,29 +570,31 @@ where B: Buffer + Unpin, E: KvEngine, { - fn clean_up(&mut self, sink_err: Option, recv_err: Option) { - error!("connection aborted"; "store_id" => self.store_id, "sink_error" => ?sink_err, "receiver_err" => ?recv_err, "addr" => %self.sender.addr); + async fn poll(&mut self) { + let res = futures::join!(&mut self.sender, &mut self.receiver); + if let (Ok(()), Ok(Done { .. })) = res { + info!("connection close"; "store_id" => self.store_id, "addr" => %self.sender.addr); + if let Some(tx) = self.lifetime.take() { + let _ = tx.send(RaftCallRes::Disconnected); + } + return; + } + let (sink_err, recv_err) = (res.0.err(), res.1.err()); + error!("connection aborted"; "store_id" => self.store_id, "sink_error" => ?sink_err, "receiver_err" => ?recv_err, "addr" => %self.sender.addr); if let Some(tx) = self.lifetime.take() { let should_fallback = [sink_err, recv_err] .iter() .any(|e| e.as_ref().map_or(false, grpc_error_is_unimplemented)); - if should_fallback { - // Asks backend to fallback. 
- let _ = tx.send(()); - return; - } - } - self.sender.router.broadcast_unreachable(self.store_id); - } - async fn poll(&mut self) { - let res = futures::join!(&mut self.sender, &mut self.receiver); - if let (Ok(()), Ok(Done { .. })) = res { - info!("connection close"; "store_id" => self.store_id, "addr" => %self.sender.addr); - return; + let res = if should_fallback { + // Asks backend to fallback. + RaftCallRes::Fallback + } else { + RaftCallRes::Disconnected + }; + let _ = tx.send(res); } - self.clean_up(res.0.err(), res.1.err()); } } @@ -686,7 +695,7 @@ where .inc_by(len as u64); } - fn connect(&self, addr: &str) -> TikvClient { + fn connect(&self, addr: &str) -> Channel { info!("server: new connection with tikv endpoint"; "addr" => addr, "store_id" => self.store_id); let cfg = self.builder.cfg.value(); @@ -697,16 +706,17 @@ where .default_compression_algorithm(cfg.grpc_compression_algorithm()) .default_gzip_compression_level(cfg.grpc_gzip_compression_level) .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress) + .max_reconnect_backoff(cfg.raft_client_max_backoff.0) + .initial_reconnect_backoff(cfg.raft_client_initial_reconnect_backoff.0) // hack: so it's different args, grpc will always create a new connection. 
.raw_cfg_int( CString::new("random id").unwrap(), CONN_ID.fetch_add(1, Ordering::SeqCst), ); - let channel = self.builder.security_mgr.connect(cb, addr); - TikvClient::new(channel) + self.builder.security_mgr.connect(cb, addr) } - fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver<()> { + fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { let (batch_sink, batch_stream) = client.batch_raft().unwrap(); let (tx, rx) = oneshot::channel(); let mut call = RaftCall { @@ -731,7 +741,7 @@ where rx } - fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver<()> { + fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { let (sink, stream) = client.raft().unwrap(); let (tx, rx) = oneshot::channel(); let mut call = RaftCall { @@ -756,22 +766,23 @@ where } } -async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Instant, retry_times: &mut u32) { - if *retry_times == 0 { - return; - } - let timeout = backoff * cmp::min(*retry_times, 5); +async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Option) { let now = Instant::now(); - if *last_wake_time + timeout < now { - // We have spent long enough time in last retry, no need to backoff again. - *last_wake_time = now; - *retry_times = 0; + if let Some(last) = *last_wake_time { + if last + backoff < now { + // We have spent long enough time in last retry, no need to backoff again. + *last_wake_time = Some(now); + return; + } + } else { + *last_wake_time = Some(now); return; } - if let Err(e) = GLOBAL_TIMER_HANDLE.delay(now + timeout).compat().await { + + if let Err(e) = GLOBAL_TIMER_HANDLE.delay(now + backoff).compat().await { error_unknown!(?e; "failed to backoff"); } - *last_wake_time = Instant::now(); + *last_wake_time = Some(Instant::now()); } /// A future that drives the life cycle of a connection. 
@@ -793,12 +804,12 @@ async fn start( R: RaftStoreRouter + Unpin + Send + 'static, E: KvEngine, { - let mut last_wake_time = Instant::now(); - let mut retry_times = 0; - let backoff_duration = back_end.builder.cfg.value().raft_client_backoff_step.0; + let mut last_wake_time = None; + let mut first_time = true; + let backoff_duration = back_end.builder.cfg.value().raft_client_max_backoff.0; + let mut addr_channel = None; loop { - maybe_backoff(backoff_duration, &mut last_wake_time, &mut retry_times).await; - retry_times += 1; + maybe_backoff(backoff_duration, &mut last_wake_time).await; let f = back_end.resolve(); let addr = match f.await { Ok(addr) => { @@ -822,36 +833,65 @@ async fn start( continue; } }; - let client = back_end.connect(&addr); + + // reuse channel if the address is the same. + if addr_channel + .as_ref() + .map_or(true, |(_, prev_addr)| prev_addr != &addr) + { + addr_channel = Some((back_end.connect(&addr), addr.clone())); + } + let channel = addr_channel.as_ref().unwrap().0.clone(); + + debug!("connecting to store"; "store_id" => back_end.store_id, "addr" => %addr); + if !channel.wait_for_connected(backoff_duration).await { + error!("wait connect timeout"; "store_id" => back_end.store_id, "addr" => addr); + + // Clears pending messages to avoid consuming high memory when one node is + // shutdown. + back_end.clear_pending_message("unreachable"); + + // broadcast is time consuming operation which would blocks raftstore, so report + // unreachable only once until being connected again. + if first_time { + first_time = false; + back_end + .builder + .router + .broadcast_unreachable(back_end.store_id); + } + continue; + } else { + debug!("connection established"; "store_id" => back_end.store_id, "addr" => %addr); + } + + let client = TikvClient::new(channel); let f = back_end.batch_call(&client, addr.clone()); - let mut res = f.await; - if res == Ok(()) { - // If the call is setup successfully, it will never finish. 
Returning `Ok(())` - // means the batch_call is not supported, we are probably connect to - // an old version of TiKV. So we need to fallback to use legacy API. + let mut res = f.await; // block here until the stream call is closed or aborted. + if res == Ok(RaftCallRes::Fallback) { + // If the call is setup successfully, it will never finish. Returning + // `UnImplemented` means the batch_call is not supported, we are probably + // connect to an old version of TiKV. So we need to fallback to use + // legacy API. let f = back_end.call(&client, addr.clone()); res = f.await; } match res { - Ok(()) => { + Ok(RaftCallRes::Fallback) => { error!("connection fail"; "store_id" => back_end.store_id, "addr" => addr, "err" => "require fallback even with legacy API"); } - Err(_) => { + // Err(_) should be tx is dropped + Ok(RaftCallRes::Disconnected) | Err(_) => { error!("connection abort"; "store_id" => back_end.store_id, "addr" => addr); - if retry_times > 1 { - // Clears pending messages to avoid consuming high memory when one node is - // shutdown. - back_end.clear_pending_message("unreachable"); - } else { - // At least report failure in metrics. 
- REPORT_FAILURE_MSG_COUNTER - .with_label_values(&["unreachable", &back_end.store_id.to_string()]) - .inc_by(1); - } + REPORT_FAILURE_MSG_COUNTER + .with_label_values(&["unreachable", &back_end.store_id.to_string()]) + .inc_by(1); back_end .builder .router .broadcast_unreachable(back_end.store_id); + addr_channel = None; + first_time = false; } } } diff --git a/src/server/server.rs b/src/server/server.rs index 992b5cf6fa0..a4d82f1e347 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -340,7 +340,6 @@ pub mod test_router { use std::sync::mpsc::*; use engine_rocks::{RocksEngine, RocksSnapshot}; - use engine_traits::{KvEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; use raftstore::{store::*, Result as RaftStoreResult}; @@ -348,13 +347,13 @@ pub mod test_router { #[derive(Clone)] pub struct TestRaftStoreRouter { - tx: Sender, + tx: Sender, StoreMsg>>, significant_msg_sender: Sender>, } impl TestRaftStoreRouter { pub fn new( - tx: Sender, + tx: Sender, StoreMsg>>, significant_msg_sender: Sender>, ) -> TestRaftStoreRouter { TestRaftStoreRouter { @@ -365,25 +364,26 @@ pub mod test_router { } impl StoreRouter for TestRaftStoreRouter { - fn send(&self, _: StoreMsg) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + fn send(&self, msg: StoreMsg) -> RaftStoreResult<()> { + let _ = self.tx.send(Either::Right(msg)); Ok(()) } } - impl ProposalRouter for TestRaftStoreRouter { + impl ProposalRouter for TestRaftStoreRouter { fn send( &self, - _: RaftCommand, - ) -> std::result::Result<(), crossbeam::channel::TrySendError>> { - let _ = self.tx.send(1); + cmd: RaftCommand, + ) -> std::result::Result<(), crossbeam::channel::TrySendError>> + { + let _ = self.tx.send(Either::Left(PeerMsg::RaftCommand(cmd))); Ok(()) } } - impl CasualRouter for TestRaftStoreRouter { - fn send(&self, _: u64, _: CasualMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + impl CasualRouter for TestRaftStoreRouter { + fn send(&self, _: u64, msg: CasualMessage) -> 
RaftStoreResult<()> { + let _ = self.tx.send(Either::Left(PeerMsg::CasualMessage(msg))); Ok(()) } } @@ -400,13 +400,18 @@ pub mod test_router { } impl RaftStoreRouter for TestRaftStoreRouter { - fn send_raft_msg(&self, _: RaftMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + let _ = self + .tx + .send(Either::Left(PeerMsg::RaftMessage(InspectedRaftMessage { + heap_size: 0, + msg, + }))); Ok(()) } - fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) { - let _ = self.tx.send(1); + fn broadcast_normal(&self, mut f: impl FnMut() -> PeerMsg) { + let _ = self.tx.send(Either::Left(f())); } } } @@ -427,7 +432,7 @@ mod tests { }; use resource_metering::ResourceTagFactory; use security::SecurityConfig; - use tikv_util::quota_limiter::QuotaLimiter; + use tikv_util::{config::ReadableDuration, quota_limiter::QuotaLimiter}; use tokio::runtime::Builder as TokioBuilder; use super::{ @@ -487,6 +492,8 @@ mod tests { let mock_store_id = 5; let cfg = Config { addr: "127.0.0.1:0".to_owned(), + raft_client_max_backoff: ReadableDuration::millis(100), + raft_client_initial_reconnect_backoff: ReadableDuration::millis(100), ..Default::default() }; diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 90524079bfa..9bb2f7b88da 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -88,6 +88,8 @@ fn test_serde_custom_tikv_config() { max_grpc_send_msg_len: 6 * (1 << 20), raft_client_grpc_send_msg_buffer: 1234 * 1024, raft_client_queue_size: 1234, + raft_client_max_backoff: ReadableDuration::secs(5), + raft_client_initial_reconnect_backoff: ReadableDuration::secs(1), raft_msg_max_batch_size: 123, concurrent_send_snap_limit: 4, concurrent_recv_snap_limit: 4, @@ -117,7 +119,6 @@ fn test_serde_custom_tikv_config() { heavy_load_wait_duration: Some(ReadableDuration::millis(2)), enable_request_batch: false, background_thread_count: 999, - 
raft_client_backoff_step: ReadableDuration::secs(1), end_point_slow_log_threshold: ReadableDuration::secs(1), forward_max_connections_per_address: 5, reject_messages_on_memory_ratio: 0.8, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 17f82f9eb87..9c1837c1fbd 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -54,6 +54,7 @@ status-thread-pool-size = 1 max-grpc-send-msg-len = 6291456 raft-client-grpc-send-msg-buffer = 1263616 raft-client-queue-size = 1234 +raft-client-max-backoff = "5s" raft-msg-max-batch-size = 123 grpc-compression-type = "gzip" grpc-concurrency = 123 diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index 7ee38a72c87..edf4d0f1c65 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -23,14 +23,16 @@ use raft::eraftpb::Entry; use raftstore::{ errors::DiscardReason, router::{RaftStoreBlackHole, RaftStoreRouter}, + store::StoreMsg, }; use tikv::server::{ self, load_statistics::ThreadLoadPool, resolve, resolve::Callback, Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; use tikv_util::{ - config::VersionTrack, + config::{ReadableDuration, VersionTrack}, worker::{Builder as WorkerBuilder, LazyWorker}, + Either, }; use super::*; @@ -59,7 +61,10 @@ where T: StoreAddrResolver + 'static, { let env = Arc::new(Environment::new(2)); - let cfg = Arc::new(VersionTrack::new(Config::default())); + let mut config = Config::default(); + config.raft_client_max_backoff = ReadableDuration::millis(100); + config.raft_client_initial_reconnect_backoff = ReadableDuration::millis(100); + let cfg = Arc::new(VersionTrack::new(config)); let security_mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let worker = LazyWorker::new("test-raftclient"); let loads = Arc::new(ThreadLoadPool::with_threshold(1000)); @@ 
-194,7 +199,6 @@ fn test_raft_client_reconnect() { raft_client.send(RaftMessage::default()).unwrap(); } raft_client.flush(); - rx.recv_timeout(Duration::from_secs(3)).unwrap(); // `send` should success after the mock server restarted. let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); @@ -207,6 +211,58 @@ fn test_raft_client_reconnect() { drop(mock_server); } +#[test] +// Test raft_client reports store unreachable only once until being connected +// again +fn test_raft_client_report_unreachable() { + let msg_count = Arc::new(AtomicUsize::new(0)); + let batch_msg_count = Arc::new(AtomicUsize::new(0)); + let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); + let (mut mock_server, port) = create_mock_server(service, 60100, 60200).unwrap(); + + let (tx, rx) = mpsc::channel(); + let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); + let router = TestRaftStoreRouter::new(tx, significant_msg_sender); + let mut raft_client = get_raft_client(router, StaticResolver::new(port)); + + // server is disconnected + mock_server.shutdown(); + drop(mock_server); + + raft_client.send(RaftMessage::default()).unwrap(); + let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); + if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { + assert_eq!(store_id, 0); + } else { + panic!("expect StoreUnreachable"); + } + // no more unreachable message is sent until it's connected again. + rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); + + // restart the mock server. 
+ let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); + let mut mock_server = create_mock_server_on(service, port); + + // make sure the connection is connected, otherwise the following sent messages + // may be dropped + std::thread::sleep(Duration::from_millis(200)); + (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); + raft_client.flush(); + check_msg_count(500, &msg_count, 50); + + // server is disconnected + mock_server.take().unwrap().shutdown(); + + let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); + if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { + assert_eq!(store_id, 0); + } else { + panic!("expect StoreUnreachable"); + } + // no more unreachable message is sent until it's connected again. + rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); +} + #[test] fn test_batch_size_limit() { let msg_count = Arc::new(AtomicUsize::new(0)); From e1ca10e4735d7e3cce29c1b1a9be895d31b051cc Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Wed, 2 Nov 2022 11:24:00 +0800 Subject: [PATCH 303/676] txn_types: add info about last change to Lock and Write (#13698) ref tikv/tikv#13694 This commit adds support for serializing and parsing the infomation about the last change stored in the lock and write CF. 
Signed-off-by: Yilin Chen --- Cargo.lock | 2 +- Cargo.toml | 2 +- components/pd_client/src/util.rs | 1 + components/raftstore/src/store/txn_ext.rs | 8 +- components/tikv_kv/src/lib.rs | 4 + components/txn_types/src/lock.rs | 91 +++++++++++++++++-- components/txn_types/src/write.rs | 51 +++++++++++ src/storage/mod.rs | 2 + .../txn/actions/acquire_pessimistic_lock.rs | 5 + src/storage/types.rs | 8 ++ tests/failpoints/cases/test_merge.rs | 8 ++ tests/failpoints/cases/test_split_region.rs | 4 + tests/failpoints/cases/test_transaction.rs | 2 + .../failpoints/cases/test_transfer_leader.rs | 6 ++ tests/integrations/raftstore/test_merge.rs | 6 ++ tests/integrations/raftstore/test_multi.rs | 2 + .../raftstore/test_split_region.rs | 4 + .../raftstore/test_transfer_leader.rs | 4 + 18 files changed, 196 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc757a3ecdf..f1152b2002e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2694,7 +2694,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#26e28e6a281abb927f91ef992eb8f93b39698ffa" +source = "git+https://github.com/pingcap/kvproto.git#65d0ae8fa853c1e41b43f329afbf60616bdd4d18" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/Cargo.toml b/Cargo.toml index d95dd1c67c1..756f36a0c50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -221,7 +221,7 @@ procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229 # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. 
[patch.'https://github.com/pingcap/kvproto'] -# kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch = "your_branch" } [workspace] # See https://github.com/rust-lang/rfcs/blob/master/text/2957-cargo-features2.md diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 2aa74176627..da77783c167 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -854,6 +854,7 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::Ok => Ok(()), ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), ErrorType::Unknown => Err(box_err!(err.get_message())), + ErrorType::InvalidValue => Err(box_err!(err.get_message())), } } diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 1270ae104c9..ccc4027e9d1 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -322,8 +322,10 @@ mod tests { primary: primary.to_vec().into_boxed_slice(), start_ts: 100.into(), ttl: 3000, - for_update_ts: 100.into(), - min_commit_ts: Default::default(), + for_update_ts: 110.into(), + min_commit_ts: 110.into(), + last_change_ts: 105.into(), + versions_to_last_change: 2, } } @@ -424,6 +426,8 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 2, }, deleted, ), diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 77f9a00efcb..9d4eb4a8370 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -1183,6 +1183,8 @@ mod unit_tests { ttl: 200, for_update_ts: 101.into(), min_commit_ts: 102.into(), + last_change_ts: 80.into(), + versions_to_last_change: 2, }, ), Modify::DeleteRange( @@ -1225,6 +1227,8 @@ mod unit_tests { ttl: 200, for_update_ts: 101.into(), 
min_commit_ts: 102.into(), + last_change_ts: 80.into(), + versions_to_last_change: 2, } .into_lock() .to_bytes(), diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 96c96828bcb..3e666c29e40 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -33,6 +33,7 @@ const TXN_SIZE_PREFIX: u8 = b't'; const MIN_COMMIT_TS_PREFIX: u8 = b'c'; const ASYNC_COMMIT_PREFIX: u8 = b'a'; const ROLLBACK_TS_PREFIX: u8 = b'r'; +const LAST_CHANGE_PREFIX: u8 = b'l'; impl LockType { pub fn from_mutation(mutation: &Mutation) -> Option { @@ -85,6 +86,12 @@ pub struct Lock { // while committing is relatively expensive. So the solution is putting the ts of the rollback // to the lock. pub rollback_ts: Vec, + + /// The commit TS of the latest PUT/DELETE record + pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from the latest version to + /// find the latest PUT/DELETE record + pub versions_to_last_change: u64, } impl std::fmt::Debug for Lock { @@ -108,6 +115,8 @@ impl std::fmt::Debug for Lock { .field("use_async_commit", &self.use_async_commit) .field("secondaries", &secondary_keys) .field("rollback_ts", &self.rollback_ts) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) .finish() } } @@ -135,6 +144,8 @@ impl Lock { use_async_commit: false, secondaries: Vec::default(), rollback_ts: Vec::default(), + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, } } @@ -151,6 +162,17 @@ impl Lock { self } + #[must_use] + pub fn set_last_change( + mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } + pub fn to_bytes(&self) -> Vec { let mut b = Vec::with_capacity(self.pre_allocate_size()); b.push(self.lock_type.to_u8()); @@ -188,6 +210,11 @@ impl Lock { b.encode_u64(ts.into_inner()).unwrap(); 
} } + if !self.last_change_ts.is_zero() { + b.push(LAST_CHANGE_PREFIX); + b.encode_u64(self.last_change_ts.into_inner()).unwrap(); + b.encode_var_u64(self.versions_to_last_change).unwrap(); + } b } @@ -217,6 +244,9 @@ impl Lock { if !self.rollback_ts.is_empty() { size += 1 + MAX_VAR_U64_LEN + size_of::() * self.rollback_ts.len(); } + if !self.last_change_ts.is_zero() { + size += 1 + size_of::() + MAX_VAR_U64_LEN; + } size } @@ -253,6 +283,8 @@ impl Lock { let mut use_async_commit = false; let mut secondaries = Vec::new(); let mut rollback_ts = Vec::new(); + let mut last_change_ts = TimeStamp::zero(); + let mut versions_to_last_change = 0; while !b.is_empty() { match b.read_u8()? { SHORT_VALUE_PREFIX => { @@ -286,6 +318,10 @@ impl Lock { rollback_ts.push(number::decode_u64(&mut b)?.into()); } } + LAST_CHANGE_PREFIX => { + last_change_ts = number::decode_u64(&mut b)?.into(); + versions_to_last_change = number::decode_var_u64(&mut b)?; + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. @@ -302,7 +338,8 @@ impl Lock { for_update_ts, txn_size, min_commit_ts, - ); + ) + .set_last_change(last_change_ts, versions_to_last_change); if use_async_commit { lock = lock.use_async_commit(secondaries); } @@ -328,6 +365,7 @@ impl Lock { info.set_use_async_commit(self.use_async_commit); info.set_min_commit_ts(self.min_commit_ts.into_inner()); info.set_secondaries(self.secondaries.into()); + // The client does not care about last_change_ts and versions_to_last_version. info } @@ -434,6 +472,9 @@ pub struct PessimisticLock { pub ttl: u64, pub for_update_ts: TimeStamp, pub min_commit_ts: TimeStamp, + + pub last_change_ts: TimeStamp, + pub versions_to_last_change: u64, } impl PessimisticLock { @@ -448,6 +489,7 @@ impl PessimisticLock { 0, self.min_commit_ts, ) + .set_last_change(self.last_change_ts, self.versions_to_last_change) } // Same with `to_lock` but does not copy the primary key. 
@@ -462,6 +504,7 @@ impl PessimisticLock { 0, self.min_commit_ts, ) + .set_last_change(self.last_change_ts, self.versions_to_last_change) } pub fn memory_size(&self) -> usize { @@ -477,6 +520,8 @@ impl std::fmt::Debug for PessimisticLock { .field("ttl", &self.ttl) .field("for_update_ts", &self.for_update_ts) .field("min_commit_ts", &self.min_commit_ts) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) .finish() } } @@ -687,6 +732,17 @@ mod tests { 555.into(), ) .with_rollback_ts(vec![12.into(), 24.into(), 13.into()]), + Lock::new( + LockType::Lock, + b"pk".to_vec(), + 1.into(), + 10, + None, + 6.into(), + 16, + 8.into(), + ) + .set_last_change(4.into(), 2), ]; for (i, lock) in locks.drain(..).enumerate() { let v = lock.to_bytes(); @@ -931,7 +987,8 @@ mod tests { b"secondary_kkkkk2".to_vec(), b"secondary_k3k3k3k3k3k3".to_vec(), b"secondary_k4".to_vec(), - ]); + ]) + .set_last_change(80.into(), 4); assert_eq!( format!("{:?}", lock), @@ -939,7 +996,8 @@ mod tests { short_value: 73686F72745F76616C7565, for_update_ts: TimeStamp(101), txn_size: 10, \ min_commit_ts: TimeStamp(127), use_async_commit: true, \ secondaries: [7365636F6E646172795F6B31, 7365636F6E646172795F6B6B6B6B6B32, \ - 7365636F6E646172795F6B336B336B336B336B336B33, 7365636F6E646172795F6B34], rollback_ts: [] }" + 7365636F6E646172795F6B336B336B336B336B336B33, 7365636F6E646172795F6B34], rollback_ts: [], \ + last_change_ts: TimeStamp(80), versions_to_last_change: 4 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -948,7 +1006,8 @@ mod tests { redact_result, "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, \ short_value: ?, for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [?, ?, ?, ?], rollback_ts: [] }" + use_async_commit: true, secondaries: [?, ?, ?, ?], rollback_ts: [], \ + last_change_ts: TimeStamp(80), 
versions_to_last_change: 4 }" ); lock.short_value = None; @@ -957,7 +1016,8 @@ mod tests { format!("{:?}", lock), "Lock { lock_type: Put, primary_key: 706B, start_ts: TimeStamp(100), ttl: 3, short_value: , \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [], rollback_ts: [] }" + use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ + versions_to_last_change: 4 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -966,7 +1026,8 @@ mod tests { redact_result, "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, short_value: ?, \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [], rollback_ts: [] }" + use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ + versions_to_last_change: 4 }" ); } @@ -978,6 +1039,8 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; let expected_lock = Lock { lock_type: LockType::Pessimistic, @@ -991,6 +1054,8 @@ mod tests { use_async_commit: false, secondaries: vec![], rollback_ts: vec![], + last_change_ts: 8.into(), + versions_to_last_change: 2, }; assert_eq!(pessimistic_lock.to_lock(), expected_lock); assert_eq!(pessimistic_lock.into_lock(), expected_lock); @@ -1004,11 +1069,14 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; assert_eq!( format!("{:?}", pessimistic_lock), "PessimisticLock { primary_key: 7072696D617279, start_ts: TimeStamp(5), ttl: 1000, \ - for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20) }" + for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20), last_change_ts: TimeStamp(8), \ + versions_to_last_change: 2 }" ); log_wrappers::set_redact_info_log(true); let 
redact_result = format!("{:?}", pessimistic_lock); @@ -1016,7 +1084,8 @@ mod tests { assert_eq!( redact_result, "PessimisticLock { primary_key: ?, start_ts: TimeStamp(5), ttl: 1000, \ - for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20) }" + for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20), last_change_ts: TimeStamp(8), \ + versions_to_last_change: 2 }" ); } @@ -1028,8 +1097,10 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; - // 7 bytes for primary key, 16 bytes for Box<[u8]>, and 4 8-byte integers. - assert_eq!(lock.memory_size(), 7 + 16 + 4 * 8); + // 7 bytes for primary key, 16 bytes for Box<[u8]>, and 6 8-byte integers. + assert_eq!(lock.memory_size(), 7 + 16 + 6 * 8); } } diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 755207ed3f3..411295de9ee 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -28,6 +28,7 @@ const FLAG_ROLLBACK: u8 = b'R'; const FLAG_OVERLAPPED_ROLLBACK: u8 = b'R'; const GC_FENCE_PREFIX: u8 = b'F'; +const LAST_CHANGE_PREFIX: u8 = b'l'; /// The short value for rollback records which are protected from being /// collapsed. 
@@ -150,6 +151,12 @@ pub struct Write { /// * `Some(ts)`: A commit record that has been rewritten due to overlapping /// rollback, and it's next version's `commit_ts` is `ts` pub gc_fence: Option, + + /// The commit TS of the latest PUT/DELETE record + pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from this record + /// to find the latest PUT/DELETE record + pub versions_to_last_change: u64, } impl std::fmt::Debug for Write { @@ -169,6 +176,8 @@ impl std::fmt::Debug for Write { ) .field("has_overlapped_rollback", &self.has_overlapped_rollback) .field("gc_fence", &self.gc_fence) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) .finish() } } @@ -183,6 +192,8 @@ impl Write { short_value, has_overlapped_rollback: false, gc_fence: None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, } } @@ -200,6 +211,8 @@ impl Write { short_value, has_overlapped_rollback: false, gc_fence: None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, } } @@ -215,6 +228,17 @@ impl Write { self } + #[must_use] + pub fn set_last_change( + mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } + #[inline] pub fn parse_type(mut b: &[u8]) -> Result { let write_type_bytes = b @@ -231,6 +255,8 @@ impl Write { short_value: self.short_value.as_deref(), has_overlapped_rollback: self.has_overlapped_rollback, gc_fence: self.gc_fence, + last_change_ts: self.last_change_ts, + versions_to_last_change: self.versions_to_last_change, } } } @@ -255,6 +281,13 @@ pub struct WriteRef<'a> { /// /// See [`Write::gc_fence`] for more detail. pub gc_fence: Option, + + /// The commit TS of the last PUT/DELETE record before this write record. + /// It only exists if this is a LOCK/ROLLBACK record. 
+ pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from this record + /// to find the latest PUT/DELETE record + pub versions_to_last_change: u64, } impl WriteRef<'_> { @@ -272,6 +305,8 @@ impl WriteRef<'_> { let mut short_value = None; let mut has_overlapped_rollback = false; let mut gc_fence = None; + let mut last_change_ts = TimeStamp::zero(); + let mut versions_to_last_change = 0; while !b.is_empty() { match b @@ -296,6 +331,10 @@ impl WriteRef<'_> { has_overlapped_rollback = true; } GC_FENCE_PREFIX => gc_fence = Some(number::decode_u64(&mut b)?.into()), + LAST_CHANGE_PREFIX => { + last_change_ts = number::decode_u64(&mut b)?.into(); + versions_to_last_change = number::decode_var_u64(&mut b)?; + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. @@ -310,6 +349,8 @@ impl WriteRef<'_> { short_value, has_overlapped_rollback, gc_fence, + last_change_ts, + versions_to_last_change, }) } @@ -329,6 +370,11 @@ impl WriteRef<'_> { b.push(GC_FENCE_PREFIX); b.encode_u64(ts.into_inner()).unwrap(); } + if !self.last_change_ts.is_zero() { + b.push(LAST_CHANGE_PREFIX); + b.encode_u64(self.last_change_ts.into_inner()).unwrap(); + b.encode_var_u64(self.versions_to_last_change).unwrap(); + } b } @@ -341,6 +387,9 @@ impl WriteRef<'_> { if self.gc_fence.is_some() { size += 1 + size_of::(); } + if !self.last_change_ts.is_zero() { + size += 1 + size_of::() + MAX_VAR_U64_LEN; + } size } @@ -389,6 +438,7 @@ impl WriteRef<'_> { self.short_value.map(|v| v.to_owned()), ) .set_overlapped_rollback(self.has_overlapped_rollback, self.gc_fence) + .set_last_change(self.last_change_ts, self.versions_to_last_change) } } @@ -447,6 +497,7 @@ mod tests { .set_overlapped_rollback(true, Some(2345678.into())), Write::new(WriteType::Put, 456.into(), Some(b"short_value".to_vec())) .set_overlapped_rollback(true, Some(421397468076048385.into())), + Write::new(WriteType::Lock, 456.into(), 
None).set_last_change(345.into(), 11), ]; for (i, write) in writes.drain(..).enumerate() { let v = write.as_ref().to_bytes(); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 33d1c4ddf97..3ce45689c49 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -9610,6 +9610,8 @@ mod tests { ttl: 3000, for_update_ts: 10.into(), min_commit_ts: 11.into(), + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, }, false ) diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 7c2f41d3e1b..e77e8b7ff59 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -142,6 +142,8 @@ pub fn acquire_pessimistic_lock( ttl: lock_ttl, for_update_ts, min_commit_ts, + last_change_ts: lock.last_change_ts, + versions_to_last_change: lock.versions_to_last_change, }; txn.put_pessimistic_lock(key, lock); } else { @@ -256,6 +258,9 @@ pub fn acquire_pessimistic_lock( ttl: lock_ttl, for_update_ts, min_commit_ts, + // TODO: calculate the two fields below from the latest write record + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, }; // When lock_only_if_exists is false, always accquire pessimitic lock, otherwise diff --git a/src/storage/types.rs b/src/storage/types.rs index c8303787a41..07219435800 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -52,6 +52,10 @@ impl MvccInfo { write_info.set_start_ts(write.start_ts.into_inner()); write_info.set_commit_ts(commit_ts.into_inner()); write_info.set_short_value(write.short_value.unwrap_or_default()); + if !write.last_change_ts.is_zero() { + write_info.set_last_change_ts(write.last_change_ts.into_inner()); + write_info.set_versions_to_last_change(write.versions_to_last_change); + } write_info }) .collect() @@ -70,6 +74,10 @@ impl MvccInfo { lock_info.set_start_ts(lock.ts.into_inner()); lock_info.set_primary(lock.primary); 
lock_info.set_short_value(lock.short_value.unwrap_or_default()); + if !lock.last_change_ts.is_zero() { + lock_info.set_last_change_ts(lock.last_change_ts.into_inner()); + lock_info.set_versions_to_last_change(lock.versions_to_last_change); + } mvcc_info.set_lock(lock_info); } let vv = extract_2pc_values(self.values); diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index c602fc6e4f7..fa4f6e9cb42 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1346,6 +1346,8 @@ fn test_merge_with_concurrent_pessimistic_locking() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -1433,6 +1435,8 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1512,6 +1516,8 @@ fn test_retry_pending_prepare_merge_fail() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1586,6 +1592,8 @@ fn test_merge_pessimistic_locks_propose_fail() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 9ed57b94091..416116c833b 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -943,6 +943,8 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: (commit_ts + 10).into(), min_commit_ts: (commit_ts + 10).into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; let lock_c = PessimisticLock { primary: b"c".to_vec().into_boxed_slice(), @@ -950,6 +952,8 @@ fn 
test_split_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: (commit_ts + 10).into(), min_commit_ts: (commit_ts + 10).into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; { let mut locks = txn_ext.pessimistic_locks.write(); diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index e42a44047a4..564b5f393ec 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -566,6 +566,8 @@ fn test_concurrent_write_after_transfer_leader_invalidates_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index cc6b043f0e5..ed4a8501188 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -134,6 +134,8 @@ fn test_delete_lock_proposed_after_proposing_locks_impl(transfer_msg_count: usiz ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -211,6 +213,8 @@ fn test_delete_lock_proposed_before_proposing_locks() { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -293,6 +297,8 @@ fn test_read_lock_after_become_follower() { ttl: 1000, for_update_ts, min_commit_ts: for_update_ts, + last_change_ts: start_ts.prev(), + versions_to_last_change: 1, }, )]) .unwrap(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 48adb2eb84c..c72ba5ac595 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1298,6 +1298,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), 
min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1314,6 +1316,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1421,6 +1425,8 @@ fn test_merge_pessimistic_locks_repeated_merge() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index 2cda3b8a0b8..ef368bbe0cb 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -833,6 +833,8 @@ fn test_leader_drop_with_pessimistic_lock() { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 10.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 6ac72f668db..10771c57863 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -963,6 +963,8 @@ fn test_split_with_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; let lock_c = PessimisticLock { primary: b"c".to_vec().into_boxed_slice(), @@ -970,6 +972,8 @@ fn test_split_with_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; { let mut locks = txn_ext.pessimistic_locks.write(); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index b0fade84d8b..b4f8c33d54d 100644 --- 
a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -304,6 +304,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; // Write a pessimistic lock to the in-memory pessimistic lock table. { @@ -344,6 +346,8 @@ fn test_memory_pessimistic_locks_status_after_transfer_leader_failure() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; // Write a pessimistic lock to the in-memory pessimistic lock table. txn_ext From de73806c165f31d728b323af8cd5c707500478b3 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 2 Nov 2022 14:15:59 +0800 Subject: [PATCH 304/676] raftstore: restrict the total write size of each apply round (#13594) ref tikv/tikv#13313 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/config.rs | 12 ++ components/raftstore/src/store/fsm/apply.rs | 106 ++++++++++++++++-- .../integrations/config/dynamic/raftstore.rs | 2 + tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 5 files changed, 115 insertions(+), 7 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 4d9cd73d207..cbd83d0b85d 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -218,6 +218,14 @@ pub struct Config { pub dev_assert: bool, #[online_config(hidden)] pub apply_yield_duration: ReadableDuration, + /// yield the fsm when apply flushed data size exceeds this threshold. + /// the yield is check after commit, so the actual handled messages can be + /// bigger than the configed value. 
+ // NOTE: the default value is much smaller than the default max raft batch msg size(0.2 + // * raft_entry_max_size), this is intentional because in the common case, a raft entry + // is unlikely to exceed this threshold, but in case when raftstore is the bottleneck, + // we still allow big raft batch for better throughput. + pub apply_yield_write_size: ReadableSize, #[serde(with = "perf_level_serde")] #[online_config(skip)] @@ -386,6 +394,7 @@ impl Default for Config { hibernate_regions: true, dev_assert: false, apply_yield_duration: ReadableDuration::millis(500), + apply_yield_write_size: ReadableSize::kb(32), perf_level: PerfLevel::Uninitialized, evict_cache_on_memory_ratio: 0.0, cmd_batch: true, @@ -898,6 +907,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["local_read_batch_size"]) .set(self.local_read_batch_size as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["apply_yield_write_size"]) + .set(self.apply_yield_write_size.0 as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["apply_max_batch_size"]) .set(self.apply_batch_system.max_batch_size() as f64); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index c8fee703e63..a5da7b9c9f1 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -378,6 +378,7 @@ where perf_context: EK::PerfContext, yield_duration: Duration, + yield_msg_size: u64, store_id: u64, /// region_id -> (peer_id, is_splitting) @@ -467,6 +468,7 @@ where use_delete_range: cfg.use_delete_range, perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, + yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], pending_delete_ssts: vec![], store_id, @@ -635,7 +637,7 @@ where apply_state: delegate.apply_state.clone(), write_seqno: mem::take(&mut delegate.unfinished_write_seqno), exec_res: results, - metrics: delegate.metrics.clone(), + 
metrics: mem::take(&mut delegate.metrics), applied_term: delegate.applied_term, bucket_stat: delegate.buckets.clone().map(Box::new), }); @@ -1136,10 +1138,14 @@ where && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) { apply_ctx.commit(self); - if let Some(start) = self.handle_start.as_ref() { - if start.saturating_elapsed() >= apply_ctx.yield_duration { - return ApplyResult::Yield; - } + if self.metrics.written_bytes >= apply_ctx.yield_msg_size + || self + .handle_start + .as_ref() + .map_or(Duration::ZERO, Instant::saturating_elapsed) + >= apply_ctx.yield_duration + { + return ApplyResult::Yield; } has_unflushed_data = false; } @@ -3576,7 +3582,6 @@ where RAFT_ENTRIES_CACHES_GAUGE.sub(dangle_size as i64); } - self.delegate.metrics = ApplyMetrics::default(); self.delegate.term = apply.term; if let Some(meta) = apply.bucket_meta.clone() { let buckets = self @@ -4096,6 +4101,7 @@ where } _ => {} } + self.apply_ctx.yield_msg_size = incoming.apply_yield_write_size.0; update_cfg(&incoming.apply_batch_system); } } @@ -4535,7 +4541,7 @@ mod tests { use tempfile::{Builder, TempDir}; use test_sst_importer::*; use tikv_util::{ - config::VersionTrack, + config::{ReadableSize, VersionTrack}, store::{new_learner_peer, new_peer}, worker::dummy_scheduler, }; @@ -5596,6 +5602,92 @@ mod tests { system.shutdown(); } + #[test] + fn test_apply_yield_with_msg_size() { + let (_path, engine) = create_tmp_engine("test-apply-yield"); + let (_import_dir, importer) = create_tmp_importer("test-apply-yield"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs)); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value()); + let pending_create_peers = 
Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg: cfg.clone(), + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-handle-raft".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let schedule_apply = |idx: u64, count: usize, size: usize| { + let mut entries = Vec::with_capacity(count); + for i in 0..count { + let put_entry = EntryBuilder::new(idx + i as u64, 3) + .put(format!("k{:03}", i).as_ref(), &vec![0; size - 4]) + .epoch(1, 3) + .build(); + entries.push(put_entry); + } + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 3, entries, vec![]))); + }; + + fn approximate_eq(a: u64, b: u64, delta: u64) { + assert!( + a >= b - delta && a <= b + delta, + "left: {}, right: {}, delta: {}", + a, + b, + delta + ); + } + + // schedule a batch with 512 keys and 64k total size will trigger 2 flush and + // yield. 
+ schedule_apply(1, 512, 128); + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); + // the second part, note that resume apply not clean up the metrics + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); + + // update apply yeild size to 64kb + _ = cfg.update(|c| { + c.apply_yield_write_size = ReadableSize::kb(64); + Ok::<(), ()>(()) + }); + // only trigger one time of + schedule_apply(513, 512, 128); + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 65536, 4096); + approximate_eq(apply_res.metrics.written_keys, 512, 20); + } + #[test] fn test_handle_ingest_sst() { let (_path, engine) = create_tmp_engine("test-ingest"); diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 38fdf5c175c..03bc7ba46c1 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -162,6 +162,7 @@ fn test_update_raftstore_config() { ("raftstore.apply-max-batch-size", "1234"), ("raftstore.store-max-batch-size", "4321"), ("raftstore.raft-entry-max-size", "32MiB"), + ("raftstore.apply-yield-write-size", "10KiB"), ]); cfg_controller.update(change).unwrap(); @@ -169,6 +170,7 @@ fn test_update_raftstore_config() { // config should be updated let mut raft_store = config.raft_store; raft_store.messages_per_tick = 12345; + raft_store.apply_yield_write_size = ReadableSize::kb(10); raft_store.raft_log_gc_threshold = 54321; raft_store.apply_batch_system.max_batch_size = Some(1234); raft_store.store_batch_system.max_batch_size = Some(4321); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 9bb2f7b88da..a61b66e1436 100644 --- a/tests/integrations/config/mod.rs +++ 
b/tests/integrations/config/mod.rs @@ -231,6 +231,7 @@ fn test_serde_custom_tikv_config() { hibernate_regions: false, dev_assert: true, apply_yield_duration: ReadableDuration::millis(333), + apply_yield_write_size: ReadableSize(12345), perf_level: PerfLevel::Disable, evict_cache_on_memory_ratio: 0.8, cmd_batch: false, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 9c1837c1fbd..a041b696158 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -190,6 +190,7 @@ merge-check-tick-interval = "11s" use-delete-range = true cleanup-import-sst-interval = "12m" local-read-batch-size = 33 +apply-yield-write-size = "12345B" apply-max-batch-size = 22 apply-pool-size = 4 apply-reschedule-duration = "3s" From 9f707fd941819f0bd22b2b03ea02f3e98d84a024 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Wed, 2 Nov 2022 15:44:00 +0800 Subject: [PATCH 305/676] storage: calculate last_change_ts in acquire_pessimistic_lock (#13717) ref tikv/tikv#13694 Information about the last change is calculated and stored in the pessimistic lock. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- components/txn_types/src/write.rs | 19 +++ src/storage/mvcc/reader/scanner/forward.rs | 34 +++-- .../txn/actions/acquire_pessimistic_lock.rs | 118 +++++++++++++++++- 3 files changed, 159 insertions(+), 12 deletions(-) diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 411295de9ee..0c0994640d2 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -259,6 +259,25 @@ impl Write { versions_to_last_change: self.versions_to_last_change, } } + + /// Returns the new `last_change_ts` and `versions_to_last_change` according + /// to this write record. 
+ pub fn next_last_change_info(&self, commit_ts: TimeStamp) -> (TimeStamp, u64) { + match self.write_type { + WriteType::Put | WriteType::Delete => (commit_ts, 1), + WriteType::Lock | WriteType::Rollback => { + // If `last_change_ts` is zero, do not set `last_change_ts` to indicate we don't + // know where is the last change. + // This should not happen if data is written in new version TiKV. If we hope to + // support data from old TiKV, consider iterating to the last change to find it. + if !self.last_change_ts.is_zero() { + (self.last_change_ts, self.versions_to_last_change + 1) + } else { + (TimeStamp::zero(), 0) + } + } + } + } } #[derive(PartialEq, Clone)] diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index c59c20fbe05..5d9d1b9bb83 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -886,6 +886,8 @@ pub mod test_util { pub commit_ts: TimeStamp, pub for_update_ts: TimeStamp, pub old_value: OldValue, + pub last_change_ts: TimeStamp, + pub versions_to_last_change: u64, } impl Default for EntryBuilder { @@ -898,6 +900,8 @@ pub mod test_util { commit_ts: 0.into(), for_update_ts: 0.into(), old_value: OldValue::None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, } } } @@ -931,6 +935,15 @@ pub mod test_util { self.old_value = OldValue::value(old_value.to_owned()); self } + pub fn last_change( + &mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> &mut Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } pub fn build_commit(&self, wt: WriteType, is_short_value: bool) -> TxnEntry { let write_key = Key::from_raw(&self.key).append_ts(self.commit_ts); let (key, value, short) = if is_short_value { @@ -949,7 +962,8 @@ pub mod test_util { None, ) }; - let write_value = Write::new(wt, self.start_ts, short); + let write_value = Write::new(wt, self.start_ts, 
short) + .set_last_change(self.last_change_ts, self.versions_to_last_change); TxnEntry::Commit { default: (key, value), write: (write_key.into_encoded(), write_value.as_ref().to_bytes()), @@ -984,7 +998,8 @@ pub mod test_util { self.for_update_ts, 0, 0.into(), - ); + ) + .set_last_change(self.last_change_ts, self.versions_to_last_change); TxnEntry::Prewrite { default: (key, value), lock: (lock_key.into_encoded(), lock_value.to_bytes()), @@ -2426,11 +2441,15 @@ mod delta_entry_tests { let mut entries_of_key = vec![]; if let Some((ts, lock_type, value)) = lock { - let max_commit_ts = writes - .last() - .cloned() - .map(|(_, commit_ts, ..)| commit_ts) - .unwrap_or(0); + let last_write = writes.last(); + let max_commit_ts = + last_write.map(|(_, commit_ts, ..)| *commit_ts).unwrap_or(0); + let (mut last_change_ts, mut versions_to_last_change) = (0,0); + // TODO: Remove `*lock_type == LockType::Pessimistic` after calculating last_change_ts for prewrite. + if *lock_type == LockType::Pessimistic && + let Some((_, commit_ts, WriteType::Put | WriteType::Delete, _)) = last_write { + (last_change_ts, versions_to_last_change) = (*commit_ts, 1); + } let for_update_ts = std::cmp::max(*ts, max_commit_ts + 1); if *ts <= to_ts { @@ -2441,6 +2460,7 @@ mod delta_entry_tests { .for_update_ts(for_update_ts.into()) .primary(key) .value(&value) + .last_change(last_change_ts.into(), versions_to_last_change) .build_prewrite(*lock_type, is_short_value(&value)); entries_of_key.push(entry); } diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index e77e8b7ff59..656b75bfbde 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -156,6 +156,8 @@ pub fn acquire_pessimistic_lock( // Following seek_write read the previous write. 
let (prev_write_loaded, mut prev_write) = (true, None); + let mut last_change_ts = TimeStamp::zero(); + let mut versions_to_last_change = 0; if let Some((commit_ts, write)) = reader.seek_write(&key, TimeStamp::max())? { // Find a previous write. if need_old_value { @@ -216,6 +218,8 @@ pub fn acquire_pessimistic_lock( // Check data constraint when acquiring pessimistic lock. check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); + if need_value || need_check_existence { val = match write.write_type { // If it's a valid Write, no need to read again. @@ -258,9 +262,8 @@ pub fn acquire_pessimistic_lock( ttl: lock_ttl, for_update_ts, min_commit_ts, - // TODO: calculate the two fields below from the latest write record - last_change_ts: TimeStamp::zero(), - versions_to_last_change: 0, + last_change_ts, + versions_to_last_change, }; // When lock_only_if_exists is false, always accquire pessimitic lock, otherwise @@ -278,7 +281,7 @@ pub mod tests { use kvproto::kvrpcpb::Context; #[cfg(test)] use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; - use txn_types::TimeStamp; + use txn_types::{Lock, TimeStamp}; use super::*; use crate::storage::{ @@ -508,13 +511,14 @@ pub mod tests { key: &[u8], start_ts: impl Into, for_update_ts: impl Into, - ) { + ) -> Lock { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); let lock = reader.load_lock(&Key::from_raw(key)).unwrap().unwrap(); assert_eq!(lock.ts, start_ts.into()); assert_eq!(lock.for_update_ts, for_update_ts.into()); assert_eq!(lock.lock_type, LockType::Pessimistic); + lock } #[test] @@ -1465,4 +1469,108 @@ pub mod tests { } } } + + #[test] + fn test_calculate_last_change_ts() { + use engine_traits::CF_WRITE; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + + // Latest version is a PUT + let write = 
Write::new(WriteType::Put, 15.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(20.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 10, 30); + let lock = must_pessimistic_locked(&mut engine, key, 10, 30); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 1); + pessimistic_rollback::tests::must_success(&mut engine, key, 10, 30); + + // Latest version is a DELETE + let write = Write::new(WriteType::Delete, 40.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(50.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 60, 70); + let lock = must_pessimistic_locked(&mut engine, key, 60, 70); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 1); + pessimistic_rollback::tests::must_success(&mut engine, key, 60, 70); + + // Latest version is a LOCK without last_change_ts + let write = Write::new(WriteType::Lock, 70.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(75.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 80, 80); + let lock = must_pessimistic_locked(&mut engine, key, 80, 80); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + pessimistic_rollback::tests::must_success(&mut engine, key, 80, 80); + + // Latest version is a ROLLBACK without last_change_ts + let write = Write::new(WriteType::Lock, 90.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(90.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 95, 95); + let lock = must_pessimistic_locked(&mut engine, key, 95, 95); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + 
pessimistic_rollback::tests::must_success(&mut engine, key, 95, 95); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 100.into(), None).set_last_change(40.into(), 4); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(110.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 120, 130); + let lock = must_pessimistic_locked(&mut engine, key, 120, 130); + assert_eq!(lock.last_change_ts, 40.into()); + assert_eq!(lock.versions_to_last_change, 5); + pessimistic_rollback::tests::must_success(&mut engine, key, 120, 130); + + // Latest version is a ROLLBACK with last_change_ts + let write = Write::new(WriteType::Rollback, 120.into(), None).set_last_change(40.into(), 5); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(120.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 140, 140); + let lock = must_pessimistic_locked(&mut engine, key, 140, 140); + assert_eq!(lock.last_change_ts, 40.into()); + assert_eq!(lock.versions_to_last_change, 6); + pessimistic_rollback::tests::must_success(&mut engine, key, 140, 140); + } } From 97ab36eb7147cde02c1654595f99104155ac0c21 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Wed, 2 Nov 2022 17:06:00 +0800 Subject: [PATCH 306/676] txn: Add lock-with-conflict support to acquire_pessimistic_lock (#13680) ref tikv/tikv#13298 Add `lock_with_conflict` support to `acquire_pessimistic_lock`, and it's currently always disabled. Side changes: Updated the type for holding the results of `acquire_pessimistic_lock` (removed `PessimisticLockRes` and added new `PessimisticLockResults` and `PessimisticLockKeyResult`), and other necessary changes to adapt the new type. 
Signed-off-by: MyonKeminta --- src/server/service/kv.rs | 2 +- src/storage/errors.rs | 2 +- src/storage/lock_manager/lock_wait_context.rs | 15 +- .../lock_manager/lock_waiting_queue.rs | 10 +- src/storage/mod.rs | 102 +++++- src/storage/mvcc/reader/reader.rs | 1 + .../txn/actions/acquire_pessimistic_lock.rs | 309 +++++++++++++++--- .../txn/commands/acquire_pessimistic_lock.rs | 37 +-- src/storage/txn/commands/mod.rs | 4 +- src/storage/txn/mod.rs | 4 +- src/storage/types.rs | 195 +++++++++-- tests/failpoints/cases/test_storage.rs | 13 +- 12 files changed, 559 insertions(+), 135 deletions(-) diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 84015ddab57..8ac91031c33 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -2240,7 +2240,7 @@ txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) {{ match v { Ok(Ok(res)) => { - let (values, not_founds) = res.into_values_and_not_founds(); + let (values, not_founds) = res.into_legacy_values_and_not_founds(); resp.set_values(values.into()); resp.set_not_founds(not_founds); }, diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 7ce5d925dfa..b5498e807f0 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -461,7 +461,7 @@ pub fn extract_key_errors(res: Result>>) -> Vec); diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 24a61876f44..7749ee983cb 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -22,7 +22,8 @@ use crate::storage::{ lock_waiting_queue::{LockWaitQueues, PessimisticLockKeyCallback}, LockManager, LockWaitToken, }, - Error as StorageError, PessimisticLockRes, ProcessResult, StorageCallback, + types::PessimisticLockKeyResult, + Error as StorageError, ProcessResult, StorageCallback, }; 
pub struct LockWaitContextInner { @@ -124,7 +125,11 @@ impl LockWaitContext { } } - fn finish_request(&self, result: Result, is_canceling: bool) { + fn finish_request( + &self, + result: Result, + is_canceling: bool, + ) { if is_canceling { let entry = self .lock_wait_queues @@ -171,13 +176,13 @@ mod tests { lock_manager::{lock_waiting_queue::LockWaitEntry, MockLockManager}, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, - types::PessimisticLockParameters, + types::{PessimisticLockParameters, PessimisticLockResults}, ErrorInner as StorageErrorInner, Result as StorageResult, }; fn create_storage_cb() -> ( StorageCallback, - Receiver>>, + Receiver>>, ) { let (tx, rx) = channel(); let cb = StorageCallback::PessimisticLock(Box::new(move |r| tx.send(r).unwrap())); @@ -190,7 +195,7 @@ mod tests { ) -> ( LockWaitToken, LockWaitContext, - Receiver>>, + Receiver>>, ) { let (cb, rx) = create_storage_cb(); let token = lock_wait_queues.get_lock_mgr().allocate_token(); diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index da8f2e2d289..d3fb58b2a94 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -80,12 +80,12 @@ use crate::storage::{ metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::Error as TxnError, - types::{PessimisticLockParameters, PessimisticLockRes}, + types::{PessimisticLockKeyResult, PessimisticLockParameters}, Error as StorageError, }; pub type CallbackWithSharedError = Box) + Send + 'static>; -pub type PessimisticLockKeyCallback = CallbackWithSharedError; +pub type PessimisticLockKeyCallback = CallbackWithSharedError; /// Represents an `AcquirePessimisticLock` request that's waiting for a lock, /// and contains the request's parameters. 
@@ -616,7 +616,7 @@ mod tests { struct TestLockWaitEntryHandle { token: LockWaitToken, - wake_up_rx: Receiver>, + wake_up_rx: Receiver>, cancel_cb: Box, } @@ -624,7 +624,7 @@ mod tests { fn wait_for_result_timeout( &self, timeout: Duration, - ) -> Option> { + ) -> Option> { match self.wake_up_rx.recv_timeout(timeout) { Ok(res) => Some(res), Err(RecvTimeoutError::Timeout) => None, @@ -635,7 +635,7 @@ mod tests { } } - fn wait_for_result(self) -> Result { + fn wait_for_result(self) -> Result { self.wake_up_rx .recv_timeout(Duration::from_secs(10)) .unwrap() diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 3ce45689c49..16043a348ce 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -107,7 +107,10 @@ pub use self::{ raw::RawStore, read_pool::{build_read_pool, build_read_pool_for_test}, txn::{Latches, Lock as LatchLock, ProcessResult, Scanner, SnapshotStore, Store}, - types::{PessimisticLockRes, PrewriteResult, SecondaryLocksStatus, StorageCallback, TxnStatus}, + types::{ + PessimisticLockKeyResult, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, + StorageCallback, TxnStatus, + }, }; use self::{kv::SnapContext, test_util::latest_feature_gate}; use crate::{ @@ -3185,7 +3188,11 @@ pub mod test_util { }; use super::*; - use crate::storage::{lock_manager::WaitTimeout, txn::commands}; + use crate::storage::{ + lock_manager::WaitTimeout, + txn::commands, + types::{PessimisticLockKeyResult, PessimisticLockResults}, + }; pub fn expect_none(x: Option) { assert_eq!(x, None); @@ -3253,10 +3260,52 @@ pub mod test_util { pub fn expect_pessimistic_lock_res_callback( done: Sender, - pessimistic_lock_res: PessimisticLockRes, - ) -> Callback> { - Box::new(move |res: Result>| { - assert_eq!(res.unwrap().unwrap(), pessimistic_lock_res); + pessimistic_lock_res: PessimisticLockResults, + ) -> Callback> { + fn key_res_matches_ignoring_error_content( + lhs: &PessimisticLockKeyResult, + rhs: &PessimisticLockKeyResult, + ) -> bool { + match (lhs, rhs) { + 
(PessimisticLockKeyResult::Empty, PessimisticLockKeyResult::Empty) => true, + (PessimisticLockKeyResult::Value(l), PessimisticLockKeyResult::Value(r)) => l == r, + ( + PessimisticLockKeyResult::Existence(l), + PessimisticLockKeyResult::Existence(r), + ) => l == r, + ( + PessimisticLockKeyResult::LockedWithConflict { + value: value1, + conflict_ts: ts1, + }, + PessimisticLockKeyResult::LockedWithConflict { + value: value2, + conflict_ts: ts2, + }, + ) => value1 == value2 && ts1 == ts2, + (PessimisticLockKeyResult::Waiting, PessimisticLockKeyResult::Waiting) => true, + (PessimisticLockKeyResult::Failed(_), PessimisticLockKeyResult::Failed(_)) => false, + _ => false, + } + } + + Box::new(move |res: Result>| { + let res = res.unwrap().unwrap(); + assert_eq!( + res.0.len(), + pessimistic_lock_res.0.len(), + "pessimistic lock result length not match, expected: {:?}, got: {:?}", + pessimistic_lock_res, + res + ); + for (expected, got) in pessimistic_lock_res.0.iter().zip(res.0.iter()) { + assert!( + key_res_matches_ignoring_error_content(expected, got), + "pessimistic lock result not match, expected: {:?}, got: {:?}", + pessimistic_lock_res, + res + ); + } done.send(0).unwrap(); }) } @@ -3271,7 +3320,7 @@ pub mod test_util { }) } - type PessimisticLockCommand = TypedCommand>; + type PessimisticLockCommand = TypedCommand>; pub fn new_acquire_pessimistic_lock_command( keys: Vec<(Key, bool)>, @@ -3445,6 +3494,7 @@ mod tests { tests::must_rollback, Error as TxnError, ErrorInner as TxnErrorInner, }, + types::{PessimisticLockKeyResult, PessimisticLockResults}, }, }; @@ -7712,16 +7762,33 @@ mod tests { let (key, val) = (Key::from_raw(b"key"), b"val".to_vec()); let (key2, val2) = (Key::from_raw(b"key2"), b"val2".to_vec()); + let results_values = |res: Vec>| { + PessimisticLockResults( + res.into_iter() + .map(|v| PessimisticLockKeyResult::Value(v)) + .collect::>(), + ) + }; + let results_existence = |res: Vec| { + PessimisticLockResults( + res.into_iter() + .map(|v| 
PessimisticLockKeyResult::Existence(v)) + .collect::>(), + ) + }; + let results_empty = + |len| PessimisticLockResults(vec![PessimisticLockKeyResult::Empty; len]); + // Key not exist for &(return_values, check_existence) in &[(false, false), (false, true), (true, false), (true, true)] { let pessimistic_lock_res = if return_values { - PessimisticLockRes::Values(vec![None]) + results_values(vec![None]) } else if check_existence { - PessimisticLockRes::Existence(vec![false]) + results_existence(vec![false]) } else { - PessimisticLockRes::Empty + results_empty(1) }; storage @@ -7769,7 +7836,7 @@ mod tests { false, false, ), - expect_pessimistic_lock_res_callback(tx.clone(), PessimisticLockRes::Empty), + expect_pessimistic_lock_res_callback(tx.clone(), results_empty(1)), ) .unwrap(); rx.recv().unwrap(); @@ -7802,8 +7869,8 @@ mod tests { rx.recv().unwrap(); } - // Needn't update max_ts when failing to read value - assert_eq!(cm.max_ts(), 10.into()); + // Always update max_ts when trying to read. + assert_eq!(cm.max_ts(), 20.into()); // Put key and key2. 
storage @@ -7872,19 +7939,18 @@ mod tests { rx.recv().unwrap(); } - // Needn't update max_ts when failing to read value - assert_eq!(cm.max_ts(), 10.into()); + assert_eq!(cm.max_ts(), 20.into()); // Return multiple values for &(return_values, check_existence) in &[(false, false), (false, true), (true, false), (true, true)] { let pessimistic_lock_res = if return_values { - PessimisticLockRes::Values(vec![Some(val.clone()), Some(val2.clone()), None]) + results_values(vec![Some(val.clone()), Some(val2.clone()), None]) } else if check_existence { - PessimisticLockRes::Existence(vec![true, true, false]) + results_existence(vec![true, true, false]) } else { - PessimisticLockRes::Empty + results_empty(3) }; storage .sched_txn_command( diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 6bf712050ac..321cc21427f 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -924,6 +924,7 @@ pub mod tests { TimeStamp::zero(), true, false, + false, ) .unwrap(); self.write(txn.into_modifies()); diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 656b75bfbde..9f645e389be 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -10,18 +10,26 @@ use crate::storage::{ ErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, }, txn::actions::check_data_constraint::check_data_constraint, + types::PessimisticLockKeyResult, Snapshot, }; /// Acquires pessimistic lock on a single key. Optionally reads the previous /// value by the way. /// -/// When `need_value` is set, the first return value will be the previous value -/// of the key (possibly `None`). When `need_value` is not set but -/// `need_check_existence` is set, the first return value will be an empty value -/// (`Some(vec![])`) if the key exists before or `None` if not. 
If neither -/// `need_value` nor `need_check_existence` is set, the first return value is -/// always `None`. +/// When `need_value` is set, the first return value will be +/// `PessimisticLockKeyResult::Value`. When `need_value` is not set but +/// `need_check_existence` is set, the first return value will be +/// `PessimisticLockKeyResult::Existence`. If neither `need_value` nor +/// `need_check_existence` is set, the first return value will be +/// `PessimisticLockKeyResult::Empty`. +/// +/// If `allow_lock_with_conflict` is set, and the lock is acquired successfully +/// ignoring a write conflict, the first return value will be +/// `PessimisticLockKeyResult::LockedWithConflict` no matter how `need_value` +/// and `need_check_existence` are set, and the `for_update_ts` in +/// the actually-written lock will be equal to the `commit_ts` of the latest +/// Write record found on the key. /// /// The second return value will also contains the previous value of the key if /// `need_old_value` is set, or `OldValue::Unspecified` otherwise. 
@@ -32,13 +40,14 @@ pub fn acquire_pessimistic_lock( primary: &[u8], should_not_exist: bool, lock_ttl: u64, - for_update_ts: TimeStamp, + mut for_update_ts: TimeStamp, need_value: bool, need_check_existence: bool, min_commit_ts: TimeStamp, need_old_value: bool, lock_only_if_exists: bool, -) -> MvccResult<(Option, OldValue)> { + allow_lock_with_conflict: bool, +) -> MvccResult<(PessimisticLockKeyResult, OldValue)> { fail_point!("acquire_pessimistic_lock", |err| Err( crate::storage::mvcc::txn::make_txn_error(err, &key, reader.start_ts).into() )); @@ -54,9 +63,10 @@ pub fn acquire_pessimistic_lock( } .into()); } - // Update max_ts for Insert operation to guarantee linearizability and snapshot - // isolation - if should_not_exist { + // If any of `should_not_exist`, `need_value`, `need_check_existence` is set, + // it infers a read to the value, in which case max_ts need to be updated to + // guarantee the linearizability and snapshot isolation. + if should_not_exist || need_value || need_check_existence { txn.concurrency_manager.update_max_ts(for_update_ts); } @@ -64,7 +74,7 @@ pub fn acquire_pessimistic_lock( // `need_check_existence` and `need_old_value` are both set, we also load // the value even if `need_value` is false, so that it avoids // `load_old_value` doing repeated work. - let need_load_value = need_value || (need_check_existence && need_old_value); + let mut need_load_value = need_value || (need_check_existence && need_old_value); fn load_old_value( need_old_value: bool, @@ -90,19 +100,6 @@ pub fn acquire_pessimistic_lock( } } - /// Returns proper result according to the loaded value (if any) the - /// specified settings. - #[inline] - fn ret_val(need_value: bool, need_check_existence: bool, val: Option) -> Option { - if need_value { - val - } else if need_check_existence { - val.map(|_| vec![]) - } else { - None - } - } - let mut val = None; if let Some(lock) = reader.load_lock(&key)? 
{ if lock.ts != reader.start_ts { @@ -116,6 +113,32 @@ pub fn acquire_pessimistic_lock( } .into()); } + + let locked_with_conflict_ts = + if allow_lock_with_conflict && for_update_ts < lock.for_update_ts { + // If the key is already locked by the same transaction with larger + // for_update_ts, and the current request has + // `allow_lock_with_conflict` set, we must consider + // these possibilities: + // * If a previous request successfully locked the key with conflict, but the + // response is lost due to some errors such as RPC failures. In this case, we + // return like the current request's result is locked_with_conflict, for + // idempotency concern. + // * The key is locked by a newer request with larger for_update_ts, and the + // current request is stale. We can't distinguish this case with the above + // one, but we don't need to handle this case since no one would need the + // current request's result anymore. + + // Load value if locked_with_conflict, so that when the client (TiDB) need to + // read the value during statement retry, it will be possible to read the value + // from cache instead of RPC. + need_load_value = true; + for_update_ts = lock.for_update_ts; + Some(lock.for_update_ts) + } else { + None + }; + if need_load_value { val = reader.get(&key, for_update_ts)?; } else if need_check_existence { @@ -151,9 +174,19 @@ pub fn acquire_pessimistic_lock( .acquire_pessimistic_lock .inc(); } - return Ok((ret_val(need_value, need_check_existence, val), old_value)); + return Ok(( + PessimisticLockKeyResult::new_success( + need_value, + need_check_existence, + locked_with_conflict_ts, + val, + ), + old_value, + )); } + let mut locked_with_conflict_ts = None; + // Following seek_write read the previous write. 
let (prev_write_loaded, mut prev_write) = (true, None); let mut last_change_ts = TimeStamp::zero(); @@ -172,15 +205,22 @@ pub fn acquire_pessimistic_lock( MVCC_CONFLICT_COUNTER .acquire_pessimistic_lock_conflict .inc(); - return Err(ErrorInner::WriteConflict { - start_ts: reader.start_ts, - conflict_start_ts: write.start_ts, - conflict_commit_ts: commit_ts, - key: key.into_raw()?, - primary: primary.to_vec(), - reason: WriteConflictReason::PessimisticRetry, + if allow_lock_with_conflict { + // TODO: New metrics. + locked_with_conflict_ts = Some(commit_ts); + for_update_ts = commit_ts; + need_load_value = true; + } else { + return Err(ErrorInner::WriteConflict { + start_ts: reader.start_ts, + conflict_start_ts: write.start_ts, + conflict_commit_ts: commit_ts, + key: key.into_raw()?, + primary: primary.to_vec(), + reason: WriteConflictReason::PessimisticRetry, + } + .into()); } - .into()); } // Handle rollback. @@ -215,12 +255,19 @@ pub fn acquire_pessimistic_lock( } } - // Check data constraint when acquiring pessimistic lock. - check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + // Check data constraint when acquiring pessimistic lock. But in case we are + // going to lock it with write conflict, we do not check it since the + // statement will then retry. + if locked_with_conflict_ts.is_none() { + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + } (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); - if need_value || need_check_existence { + // Load value if locked_with_conflict, so that when the client (TiDB) need to + // read the value during statement retry, it will be possible to read the value + // from cache instead of RPC. + if need_value || need_check_existence || locked_with_conflict_ts.is_some() { val = match write.write_type { // If it's a valid Write, no need to read again. 
WriteType::Put @@ -266,14 +313,22 @@ pub fn acquire_pessimistic_lock( versions_to_last_change, }; - // When lock_only_if_exists is false, always accquire pessimitic lock, otherwise + // When lock_only_if_exists is false, always acquire pessimistic lock, otherwise // do it when val exists if !lock_only_if_exists || val.is_some() { txn.put_pessimistic_lock(key, lock); } // TODO don't we need to commit the modifies in txn? - Ok((ret_val(need_value, need_check_existence, val), old_value)) + Ok(( + PessimisticLockKeyResult::new_success( + need_value, + need_check_existence, + locked_with_conflict_ts, + val, + ), + old_value, + )) } pub mod tests { @@ -300,6 +355,70 @@ pub mod tests { TestEngineBuilder, }; + #[cfg(test)] + pub fn acquire_pessimistic_lock_allow_lock_with_conflict( + engine: &mut E, + key: &[u8], + pk: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + need_value: bool, + need_check_existence: bool, + ) -> MvccResult { + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let cm = ConcurrencyManager::new(0.into()); + let start_ts = start_ts.into(); + let mut txn = MvccTxn::new(start_ts, cm); + let mut reader = SnapshotReader::new(start_ts, snapshot, true); + let res = acquire_pessimistic_lock( + &mut txn, + &mut reader, + Key::from_raw(key), + pk, + false, + 1, + for_update_ts.into(), + need_value, + need_check_existence, + 0.into(), + false, + false, + true, + ); + if res.is_ok() { + let modifies = txn.into_modifies(); + if !modifies.is_empty() { + engine + .write(&ctx, WriteData::from_modifies(modifies)) + .unwrap(); + } + } + res.map(|r| r.0) + } + + #[cfg(test)] + pub fn must_succeed_allow_lock_with_conflict( + engine: &mut E, + key: &[u8], + pk: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + need_value: bool, + need_check_existence: bool, + ) -> PessimisticLockKeyResult { + acquire_pessimistic_lock_allow_lock_with_conflict( + engine, + key, + pk, + start_ts, + for_update_ts, + 
need_value, + need_check_existence, + ) + .unwrap() + } + pub fn must_succeed_impl( engine: &mut E, key: &[u8], @@ -333,6 +452,7 @@ pub mod tests { min_commit_ts, false, lock_only_if_exists, + false, ) .unwrap(); let modifies = txn.into_modifies(); @@ -341,7 +461,19 @@ pub mod tests { .write(&ctx, WriteData::from_modifies(modifies)) .unwrap(); } - res.0 + // TODO: Adapt to new interface + match res.0 { + PessimisticLockKeyResult::Value(v) => v, + PessimisticLockKeyResult::Existence(e) => { + if e { + Some(vec![]) + } else { + None + } + } + PessimisticLockKeyResult::Empty => None, + res => panic!("unexpected result: {:?}", res), + } } pub fn must_succeed( @@ -502,6 +634,7 @@ pub mod tests { min_commit_ts, false, lock_only_if_exists, + false, ) .unwrap_err() } @@ -1109,6 +1242,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1160,6 +1294,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap(); assert_eq!( @@ -1194,6 +1329,7 @@ pub mod tests { min_commit_ts, true, false, + false, ) .unwrap(); assert_eq!( @@ -1237,6 +1373,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, )?; Ok(old_value) }); @@ -1290,6 +1427,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap_err(); @@ -1324,6 +1462,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap_err(); } @@ -1573,4 +1712,94 @@ pub mod tests { assert_eq!(lock.versions_to_last_change, 6); pessimistic_rollback::tests::must_success(&mut engine, key, 140, 140); } + + #[test] + fn test_lock_with_conflict() { + use pessimistic_rollback::tests::must_success as must_pessimistic_rollback; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 20); + + // Normal cases. 
+ must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, false, false) + .assert_empty(); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, false, true) + .assert_existence(true); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, true, false) + .assert_value(Some(b"v1")); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, true, true) + .assert_value(Some(b"v1")); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + // Conflicting cases. + for &(need_value, need_check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + must_succeed_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 15, + need_value, + need_check_existence, + ) + .assert_locked_with_conflict(Some(b"v1"), 20); + must_pessimistic_locked(&mut engine, b"k1", 10, 20); + must_pessimistic_rollback(&mut engine, b"k1", 10, 20); + must_unlocked(&mut engine, b"k1"); + } + + // Idempotency + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 50, false, false) + .assert_empty(); + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 40, false, false) + .assert_locked_with_conflict(Some(b"v1"), 50); + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 15, false, false) + .assert_locked_with_conflict(Some(b"v1"), 50); + must_pessimistic_locked(&mut engine, b"k1", 10, 50); + must_pessimistic_rollback(&mut engine, b"k1", 10, 50); + must_unlocked(&mut engine, b"k1"); + + // Lock waiting. 
+ must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 50, false, false) + .assert_empty(); + let err = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 11, + 55, + false, + false, + ) + .unwrap_err(); + assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); + let err = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 9, + 9, + false, + false, + ) + .unwrap_err(); + assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); + must_pessimistic_locked(&mut engine, b"k1", 10, 50); + must_pessimistic_rollback(&mut engine, b"k1", 10, 50); + must_unlocked(&mut engine, b"k1"); + } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 359f0abacd8..69a5179ab84 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -16,9 +16,9 @@ use crate::storage::{ }, Error, ErrorInner, Result, }, - types::PessimisticLockParameters, - Error as StorageError, ErrorInner as StorageErrorInner, PessimisticLockRes, ProcessResult, - Result as StorageResult, Snapshot, + types::{PessimisticLockParameters, PessimisticLockResults}, + Error as StorageError, ErrorInner as StorageErrorInner, ProcessResult, Result as StorageResult, + Snapshot, }; command! { @@ -26,7 +26,7 @@ command! { /// /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. 
AcquirePessimisticLock: - cmd_ty => StorageResult, + cmd_ty => StorageResult, display => "kv::command::acquirepessimisticlock keys({:?}) @ {} {} {} {:?} {} {} {} | {:?}", (keys, start_ts, lock_ttl, for_update_ts, wait_timeout, min_commit_ts, check_existence, lock_only_if_exists, ctx), content => { @@ -88,16 +88,7 @@ impl WriteCommand for AcquirePessimisticLock ); let rows = keys.len(); - let mut res = if self.return_values { - Ok(PessimisticLockRes::Values(vec![])) - } else if self.check_existence { - // If return_value is set, the existence status is implicitly included in the - // result. So check_existence only need to be explicitly handled if - // `return_values` is not set. - Ok(PessimisticLockRes::Existence(vec![])) - } else { - Ok(PessimisticLockRes::Empty) - }; + let mut res = Ok(PessimisticLockResults::with_capacity(rows)); let need_old_value = context.extra_op == ExtraOp::ReadOldValue; for (k, should_not_exist) in keys { match acquire_pessimistic_lock( @@ -113,11 +104,10 @@ impl WriteCommand for AcquirePessimisticLock self.min_commit_ts, need_old_value, self.lock_only_if_exists, + false, ) { - Ok((val, old_value)) => { - if self.return_values || self.check_existence { - res.as_mut().unwrap().push(val); - } + Ok((key_res, old_value)) => { + res.as_mut().unwrap().push(key_res); if old_value.resolved() { let key = k.append_ts(txn.start_ts); // MutationType is unknown in AcquirePessimisticLock stage. 
@@ -133,17 +123,6 @@ impl WriteCommand for AcquirePessimisticLock } } - // Some values are read, update max_ts - match &res { - Ok(PessimisticLockRes::Values(values)) if !values.is_empty() => { - txn.concurrency_manager.update_max_ts(self.for_update_ts); - } - Ok(PessimisticLockRes::Existence(values)) if !values.is_empty() => { - txn.concurrency_manager.update_max_ts(self.for_update_ts); - } - _ => (), - } - // no conflict let (pr, to_be_write, rows, ctx, lock_info) = if res.is_ok() { let pr = ProcessResult::PessimisticLockRes { res }; diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index f5331087ac1..4213eeb6b68 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -63,7 +63,7 @@ use crate::storage::{ mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, txn::{latch, ProcessResult, Result}, types::{ - MvccInfo, PessimisticLockParameters, PessimisticLockRes, PrewriteResult, + MvccInfo, PessimisticLockParameters, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, TxnStatus, }, Result as StorageResult, Snapshot, Statistics, @@ -193,7 +193,7 @@ impl From for TypedCommand { } } -impl From for TypedCommand> { +impl From for TypedCommand> { fn from(mut req: PessimisticLockRequest) -> Self { let keys = req .take_mutations() diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 5b71d60e3bf..1af3c9d63e6 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -40,7 +40,7 @@ pub use self::{ }; use crate::storage::{ mvcc::Error as MvccError, - types::{MvccInfo, PessimisticLockRes, PrewriteResult, SecondaryLocksStatus, TxnStatus}, + types::{MvccInfo, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, TxnStatus}, Error as StorageError, Result as StorageResult, }; @@ -73,7 +73,7 @@ pub enum ProcessResult { err: StorageError, }, PessimisticLockRes { - res: StorageResult, + res: StorageResult, }, SecondaryLocksStatus { status: 
SecondaryLocksStatus, diff --git a/src/storage/types.rs b/src/storage/types.rs index 07219435800..6ad4c8e26ef 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -8,6 +8,7 @@ use kvproto::kvrpcpb; use txn_types::{Key, Value}; use crate::storage::{ + errors::SharedError, lock_manager::WaitTimeout, mvcc::{Lock, LockType, TimeStamp, Write, WriteType}, txn::ProcessResult, @@ -155,42 +156,180 @@ pub struct PessimisticLockParameters { pub allow_lock_with_conflict: bool, } -#[derive(Clone, Debug, PartialEq)] -pub enum PessimisticLockRes { - /// The previous value is loaded while handling the `AcquirePessimisticLock` - /// command. The i-th item is the value of the i-th key in the - /// `AcquirePessimisticLock` command. - Values(Vec>), - /// Checked whether the key exists while handling the - /// `AcquirePessimisticLock` command. The i-th item is true if the i-th key - /// in the `AcquirePessimisticLock` command exists. - Existence(Vec), +/// Represents the result of pessimistic lock on a single key. +#[derive(Debug, Clone)] +pub enum PessimisticLockKeyResult { + /// The lock is acquired successfully, returning no additional information. Empty, + /// The lock is acquired successfully, and the previous value is read and + /// returned. + Value(Option), + /// The lock is acquired successfully, and also checked if the key exists + /// previously. + Existence(bool), + /// There is a write conflict, but the lock is acquired ignoring the write + /// conflict. + LockedWithConflict { + /// The previous value of the key. + value: Option, + /// The `commit_ts` of the latest Write record found on this key. This + /// is also the actual `for_update_ts` written to the lock. + conflict_ts: TimeStamp, + }, + /// The key is already locked and lock-waiting is needed. + Waiting, + /// Failed to acquire the lock due to some error. 
+ Failed(SharedError), } -impl PessimisticLockRes { - pub fn push(&mut self, value: Option) { +impl PessimisticLockKeyResult { + pub fn new_success( + need_value: bool, + need_check_existence: bool, + locked_with_conflict_ts: Option, + value: Option, + ) -> Self { + if let Some(conflict_ts) = locked_with_conflict_ts { + Self::LockedWithConflict { value, conflict_ts } + } else if need_value { + Self::Value(value) + } else if need_check_existence { + Self::Existence(value.is_some()) + } else { + Self::Empty + } + } + + pub fn unwrap_value(self) -> Option { match self { - PessimisticLockRes::Values(v) => v.push(value), - PessimisticLockRes::Existence(v) => v.push(value.is_some()), - _ => panic!("unexpected PessimisticLockRes"), + Self::Value(v) => v, + x => panic!( + "pessimistic lock key result expected to be a value, got {:?}", + x + ), } } - pub fn into_values_and_not_founds(self) -> (Vec, Vec) { + pub fn unwrap_existence(self) -> bool { match self { - PessimisticLockRes::Values(vals) => vals - .into_iter() - .map(|v| { - let is_not_found = v.is_none(); - (v.unwrap_or_default(), is_not_found) - }) - .unzip(), - PessimisticLockRes::Existence(mut vals) => { - vals.iter_mut().for_each(|x| *x = !*x); - (vec![], vals) + Self::Existence(e) => e, + x => panic!( + "pessimistic lock key result expected to be existence, got {:?}", + x + ), + } + } + + pub fn assert_empty(&self) { + assert!(matches!(self, Self::Empty)); + } + + #[cfg(test)] + pub fn assert_value(&self, expected_value: Option<&[u8]>) { + match self { + Self::Value(v) if v.as_ref().map(|v| v.as_slice()) == expected_value => (), + x => panic!( + "pessimistic lock key result not match, expected Value({:?}), got {:?}", + expected_value, x + ), + } + } + + #[cfg(test)] + pub fn assert_existence(&self, expected_existence: bool) { + match self { + Self::Existence(e) if *e == expected_existence => (), + x => panic!( + "pessimistic lock key result not match, expected Existence({:?}), got {:?}", + expected_existence, x 
+ ), + } + } + + #[cfg(test)] + pub fn assert_locked_with_conflict( + &self, + expected_value: Option<&[u8]>, + expected_conflict_ts: impl Into, + ) { + let expected_conflict_ts = expected_conflict_ts.into(); + match self { + Self::LockedWithConflict { value, conflict_ts } + if value.as_ref().map(|v| v.as_slice()) == expected_value + && *conflict_ts == expected_conflict_ts => {} + x => panic!( + "pessimistic lock key result not match, expected LockedWithConflict{{ value: {:?}, conflict_ts: {} }}, got {:?}", + expected_value, expected_conflict_ts, x + ), + } + } + + #[cfg(test)] + pub fn assert_waiting(&self) { + assert!(matches!(self, Self::Waiting)); + } + + #[cfg(test)] + pub fn unwrap_err(&self) -> SharedError { + match self { + Self::Failed(e) => e.clone(), + x => panic!( + "pessimistic lock key result not match expected Failed, got {:?}", + x, + ), + } + } +} + +#[derive(Clone, Debug, Default)] +pub struct PessimisticLockResults(pub Vec); + +impl PessimisticLockResults { + pub fn new() -> Self { + Self(vec![]) + } + + pub fn with_capacity(capacity: usize) -> Self { + Self(Vec::with_capacity(capacity)) + } + + pub fn push(&mut self, key_res: PessimisticLockKeyResult) { + self.0.push(key_res); + } + + pub fn into_legacy_values_and_not_founds(self) -> (Vec, Vec) { + if self.0.is_empty() { + return (vec![], vec![]); + } + + match &self.0[0] { + PessimisticLockKeyResult::Empty => { + self.0.into_iter().for_each(|res| res.assert_empty()); + (vec![], vec![]) + } + PessimisticLockKeyResult::Existence(_) => { + let not_founds = self.0.into_iter().map(|x| !x.unwrap_existence()).collect(); + (vec![], not_founds) + } + PessimisticLockKeyResult::Value(_) => { + let mut not_founds = Vec::with_capacity(self.0.len()); + let mut values = Vec::with_capacity(self.0.len()); + self.0.into_iter().for_each(|x| { + let v = x.unwrap_value(); + match v { + Some(v) => { + not_founds.push(false); + values.push(v); + } + None => { + not_founds.push(true); + values.push(vec![]); + } + } + 
}); + (values, not_founds) } - PessimisticLockRes::Empty => (vec![], vec![]), + _ => unreachable!(), } } } @@ -246,7 +385,7 @@ storage_callback! { Locks(Vec) ProcessResult::Locks { locks } => locks, TxnStatus(TxnStatus) ProcessResult::TxnStatus { txn_status } => txn_status, Prewrite(PrewriteResult) ProcessResult::PrewriteResult { result } => result, - PessimisticLock(Result) ProcessResult::PessimisticLockRes { res } => res, + PessimisticLock(Result) ProcessResult::PessimisticLockRes { res } => res, SecondaryLocksStatus(SecondaryLocksStatus) ProcessResult::SecondaryLocksStatus { status } => status, RawCompareAndSwap((Option, bool)) ProcessResult::RawCompareAndSwapRes { previous_value, succeed } => (previous_value, succeed), } diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index e0f68b721b5..43f1b504f25 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -389,7 +389,7 @@ fn test_pipelined_pessimistic_lock() { new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 10, 10, true, false), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![None]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(None)]), ), ) .unwrap(); @@ -452,7 +452,9 @@ fn test_pipelined_pessimistic_lock() { ), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![Some(val.clone())]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(Some( + val.clone(), + ))]), ), ) .unwrap(); @@ -475,7 +477,7 @@ fn test_pipelined_pessimistic_lock() { new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 50, 50, true, false), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![Some(val.clone())]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(Some(val.clone()))]), ), ) .unwrap(); @@ -498,7 +500,10 @@ fn test_pipelined_pessimistic_lock() { ), 
expect_pessimistic_lock_res_callback( tx, - PessimisticLockRes::Values(vec![Some(val), None]), + PessimisticLockResults(vec![ + PessimisticLockKeyResult::Value(Some(val)), + PessimisticLockKeyResult::Value(None), + ]), ), ) .unwrap(); From 7fd31d3fc529cddd190b38276ab050bc390c9b6b Mon Sep 17 00:00:00 2001 From: YangKeao Date: Thu, 3 Nov 2022 00:08:00 -0400 Subject: [PATCH 307/676] copr, json: support json path range selection (#13645) close tikv/tikv#13644 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/tidb_query_datatype/Cargo.toml | 2 +- .../src/codec/mysql/json/binary.rs | 22 +- .../src/codec/mysql/json/json_extract.rs | 319 +++++++-- .../src/codec/mysql/json/json_modify.rs | 2 +- .../src/codec/mysql/json/json_remove.rs | 7 +- .../src/codec/mysql/json/modifier.rs | 21 +- .../src/codec/mysql/json/path_expr.rs | 671 +++++++++--------- 8 files changed, 638 insertions(+), 408 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1152b2002e..25a68864586 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6089,7 +6089,7 @@ dependencies = [ "lazy_static", "log_wrappers", "match-template", - "nom 5.1.0", + "nom 7.1.0", "num 0.3.0", "num-derive", "num-traits", diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index de8f0b41110..af7e7e08b9d 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -22,7 +22,7 @@ kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" log_wrappers = { workspace = true } match-template = "0.0.1" -nom = { version = "5.1.0", default-features = false, features = ["std"] } +nom = { version = "7.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } num-derive = "0.3" num-traits = "0.2" diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index 
daeae751fb5..734ec1d4115 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -5,9 +5,29 @@ use std::convert::TryInto; use codec::number::NumberCodec; use super::{constants::*, JsonRef, JsonType, ERR_CONVERT_FAILED}; -use crate::codec::Result; +use crate::codec::{mysql::json::path_expr::ArrayIndex, Result}; impl<'a> JsonRef<'a> { + /// Gets the index from the ArrayIndex + /// + /// If the idx is greater than the count and is from right, it will return + /// `None` + /// + /// See `jsonPathArrayIndex.getIndexFromStart()` in TiDB + /// `types/json_path_expr.go` + pub fn array_get_index(&self, idx: ArrayIndex) -> Option { + match idx { + ArrayIndex::Left(idx) => Some(idx as usize), + ArrayIndex::Right(idx) => { + if self.get_elem_count() < 1 + (idx as usize) { + None + } else { + Some(self.get_elem_count() - 1 - (idx as usize)) + } + } + } + } + /// Gets the ith element in JsonRef /// /// See `arrayGetElem()` in TiDB `json/binary.go` diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs index d40451fc9b5..7e619e74c32 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs @@ -4,9 +4,10 @@ use collections::HashSet; use super::{ super::Result, - path_expr::{PathExpression, PathLeg, PATH_EXPR_ARRAY_INDEX_ASTERISK, PATH_EXPR_ASTERISK}, + path_expr::{PathExpression, PathLeg}, Json, JsonRef, JsonType, }; +use crate::codec::mysql::json::path_expr::{ArrayIndex, ArraySelection, KeySelection}; impl<'a> JsonRef<'a> { /// `extract` receives several path expressions as arguments, matches them @@ -21,8 +22,11 @@ impl<'a> JsonRef<'a> { let mut elem_list = Vec::with_capacity(path_expr_list.len()); for path_expr in path_expr_list { could_return_multiple_matches |= path_expr.contains_any_asterisk(); + 
could_return_multiple_matches |= path_expr.contains_any_range(); + elem_list.append(&mut extract_json(*self, &path_expr.legs)?) } + if elem_list.is_empty() { Ok(None) } else if could_return_multiple_matches { @@ -79,43 +83,86 @@ pub fn extract_json<'a>(j: JsonRef<'a>, path_legs: &[PathLeg]) -> Result match j.get_type() { + match current_leg { + PathLeg::ArraySelection(selection) => match j.get_type() { JsonType::Array => { let elem_count = j.get_elem_count(); - if i == PATH_EXPR_ARRAY_INDEX_ASTERISK { - for k in 0..elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.array_get_elem(k)?, sub_path_legs)?, - ) + match selection { + ArraySelection::Asterisk => { + for k in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(k)?, sub_path_legs)?, + ) + } + } + ArraySelection::Index(index) => { + if let Some(index) = j.array_get_index(*index) { + if index < elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(index)?, sub_path_legs)?, + ) + } + } + } + ArraySelection::Range(start, end) => { + if let (Some(start), Some(mut end)) = + (j.array_get_index(*start), j.array_get_index(*end)) + { + if end >= elem_count { + end = elem_count - 1 + } + if start <= end { + for i in start..=end { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(i)?, sub_path_legs)?, + ) + } + } + } } - } else if (i as usize) < elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.array_get_elem(i as usize)?, sub_path_legs)?, - ) } } _ => { - if i as usize == 0 { - append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + // If the current object is not an array, still append them if the selection + // includes 0. But for asterisk, it still returns NULL. + // + // as the element is not array, don't use `array_get_index` + match selection { + ArraySelection::Index(ArrayIndex::Left(0)) => { + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) 
+ } + ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0) | ArrayIndex::Left(_), + ) => { + // for [0 to Non-negative Number] and [0 to last], it extracts itself + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + } + _ => {} } } }, - PathLeg::Key(ref key) => { + PathLeg::Key(key) => { if j.get_type() == JsonType::Object { - if key == PATH_EXPR_ASTERISK { - let elem_count = j.get_elem_count(); - for i in 0..elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.object_get_val(i)?, sub_path_legs)?, - ) + match key { + KeySelection::Asterisk => { + let elem_count = j.get_elem_count(); + for i in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.object_get_val(i)?, sub_path_legs)?, + ) + } + } + KeySelection::Key(key) => { + if let Some(idx) = j.object_search_key(key.as_bytes()) { + let val = j.object_get_val(idx)?; + append_if_ref_unique(&mut ret, &extract_json(val, sub_path_legs)?) + } } - } else if let Some(idx) = j.object_search_key(key.as_bytes()) { - let val = j.object_get_val(idx)?; - append_if_ref_unique(&mut ret, &extract_json(val, sub_path_legs)?) 
} } } @@ -154,10 +201,15 @@ mod tests { use super::{ super::path_expr::{ PathExpressionFlag, PATH_EXPRESSION_CONTAINS_ASTERISK, - PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, PATH_EXPR_ARRAY_INDEX_ASTERISK, + PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, *, }; + use crate::codec::mysql::json::path_expr::{ArrayIndex, PATH_EXPRESSION_CONTAINS_RANGE}; + + fn select_from_left(index: usize) -> PathLeg { + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Left(index as u32))) + } #[test] fn test_json_extract() { @@ -168,7 +220,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -176,7 +228,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some("[true, 2017]"), @@ -184,7 +236,7 @@ mod tests { ( "[true, 2107]", vec![PathExpression { - legs: vec![PathLeg::Index(2)], + legs: vec![select_from_left(2)], flags: PathExpressionFlag::default(), }], None, @@ -192,7 +244,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6.18"), @@ -200,7 +252,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -208,7 +260,7 @@ mod tests { ( "true", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -216,7 +268,7 @@ mod tests { ( "true", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: 
PathExpressionFlag::default(), }], None, @@ -224,7 +276,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6"), @@ -232,7 +284,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -240,7 +292,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("-6"), @@ -248,7 +300,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -256,7 +308,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -264,7 +316,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some(r#"{"a": [1, 2, {"aa": "xx"}]}"#), @@ -273,7 +325,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("c"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("c")))], flags: PathExpressionFlag::default(), }], Some("false"), @@ -281,7 +333,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from(PATH_EXPR_ASTERISK))], + legs: vec![PathLeg::Key(KeySelection::Asterisk)], flags: PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some(r#"["a1", 20.08, false]"#), @@ -289,7 +341,7 @@ mod tests { ( r#"{"a": "a1", "b": 
20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("d"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("d")))], flags: PathExpressionFlag::default(), }], None, @@ -298,7 +350,10 @@ mod tests { ( "21", vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], None, @@ -306,7 +361,10 @@ mod tests { ( r#"{"g": {"a": "a1", "b": 20.08, "c": false}}"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[false]"), @@ -314,7 +372,10 @@ mod tests { ( r#"[{"a": "a1", "b": 20.08, "c": false}, true]"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[false]"), @@ -322,7 +383,7 @@ mod tests { ( r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6]"), @@ -331,11 +392,11 @@ mod tests { r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, vec![ PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, ], @@ -344,7 +405,7 @@ mod tests { ( "[1]", 
vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[1]"), @@ -352,7 +413,10 @@ mod tests { ( r#"{"a": 1}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("a")), PathLeg::Index(0)], + legs: vec![ + PathLeg::Key(KeySelection::Key(String::from("a"))), + select_from_left(0), + ], flags: PathExpressionFlag::default(), }], Some("1"), @@ -360,7 +424,7 @@ mod tests { ( r#"{"a": 1}"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some(r#"[{"a": 1}, 1]"#), @@ -369,10 +433,10 @@ mod tests { r#"{"a": 1}"#, vec![PathExpression { legs: vec![ - PathLeg::Index(0), - PathLeg::Index(0), - PathLeg::Index(0), - PathLeg::Key(String::from("a")), + select_from_left(0), + select_from_left(0), + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), ], flags: PathExpressionFlag::default(), }], @@ -383,8 +447,8 @@ mod tests { vec![PathExpression { legs: vec![ PathLeg::DoubleAsterisk, - PathLeg::Key(String::from("a")), - PathLeg::Key(String::from("*")), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::Key(KeySelection::Asterisk), ], flags: PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, @@ -395,11 +459,17 @@ mod tests { r#"[{"a": [3,4]}, {"b": 2 }]"#, vec![ PathExpression { - legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }, PathExpression { - legs: vec![PathLeg::Index(1), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(1), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }, ], @@ -408,11 +478,136 @@ mod 
tests { ( r#"[{"a": [1,1,1,1]}]"#, vec![PathExpression { - legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }], Some("[1, 1, 1, 1]"), ), + ( + r#"[1,2,3,4]"#, + vec![PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(2), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(0))), + ], + flags: PathExpressionFlag::default(), + }], + Some("4"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(1))), + ], + flags: PathExpressionFlag::default(), + }], + Some("3"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(100))), + ], + flags: PathExpressionFlag::default(), + }], + None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + 
None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Left(2), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3]"), + ), ]; for (i, (js, exprs, expected)) in test_cases.drain(..).enumerate() { let j = js.parse(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs index e8c709e9571..b359158d06b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs @@ -33,7 +33,7 @@ impl<'a> JsonRef<'a> { )); } for expr in path_expr_list { - if expr.contains_any_asterisk() { + if expr.contains_any_asterisk() || expr.contains_any_range() { return Err(box_err!( "Invalid path expression: expected no asterisk, found {:?}", expr diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs index a350df91b06..bcb6fd01716 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs +++ 
b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs @@ -7,10 +7,9 @@ impl<'a> JsonRef<'a> { /// All path expressions cannot contain * or ** wildcard. /// If any error occurs, the input won't be changed. pub fn remove(&self, path_expr_list: &[PathExpression]) -> Result { - if path_expr_list - .iter() - .any(|expr| expr.legs.is_empty() || expr.contains_any_asterisk()) - { + if path_expr_list.iter().any(|expr| { + expr.legs.is_empty() || expr.contains_any_asterisk() || expr.contains_any_range() + }) { return Err(box_err!("Invalid path expression")); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs index 0836eae9d5b..58fe8fbbbcb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs @@ -11,6 +11,7 @@ use super::{ path_expr::{PathExpression, PathLeg}, Json, JsonRef, JsonType, }; +use crate::codec::mysql::json::path_expr::{ArraySelection, KeySelection}; /// A helper struct that derives a new JSON by combining and manipulating /// the encoded bytes directly. 
Only used by `json_replace`, `json_set`, @@ -88,7 +89,7 @@ impl<'a> BinaryModifier<'a> { } let parent_node = &result[0]; match last_leg { - PathLeg::Index(_) => { + PathLeg::ArraySelection(ArraySelection::Index(_)) => { // Record the parent node value offset, as it's actually relative to `old` self.to_be_modified_ptr = parent_node.as_ptr(); match parent_node.get_type() { @@ -109,7 +110,7 @@ impl<'a> BinaryModifier<'a> { } } } - PathLeg::Key(insert_key) => { + PathLeg::Key(KeySelection::Key(insert_key)) => { // Ignore constant if parent_node.get_type() != JsonType::Object { return Ok(()); @@ -168,21 +169,23 @@ impl<'a> BinaryModifier<'a> { } let parent_node = &result[0]; match last_leg { - PathLeg::Index(remove_idx) => { + PathLeg::ArraySelection(ArraySelection::Index(remove_idx)) => { if parent_node.get_type() == JsonType::Array { self.to_be_modified_ptr = parent_node.as_ptr(); let elems_count = parent_node.get_elem_count(); let mut elems = Vec::with_capacity(elems_count - 1); - let remove_idx = *remove_idx as usize; - for i in 0..elems_count { - if i != remove_idx { - elems.push(parent_node.array_get_elem(i)?); + if let Some(remove_idx) = parent_node.array_get_index(*remove_idx) { + for i in 0..elems_count { + if i != remove_idx { + elems.push(parent_node.array_get_elem(i)?); + } } + + self.new_value = Some(Json::from_ref_array(elems)?); } - self.new_value = Some(Json::from_ref_array(elems)?); } } - PathLeg::Key(remove_key) => { + PathLeg::Key(KeySelection::Key(remove_key)) => { // Ignore constant if parent_node.get_type() == JsonType::Object { self.to_be_modified_ptr = parent_node.as_ptr(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs index a760f748348..fb707887885 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs @@ -25,375 +25,286 @@ // select json_extract('{"a": 
"b", "c": [1, "2"]}', '$.*') -> ["b", [1, "2"]] // ``` -use std::{iter::Peekable, str::CharIndices}; +use nom::{ + branch::alt, + bytes::complete::tag, + character::{ + complete, + complete::{char, none_of, satisfy, space0, space1}, + }, + combinator::{map, map_opt}, + multi::{many0, many1}, + sequence::{delimited, pair, tuple}, + IResult, +}; use super::json_unquote::unquote_string; -use crate::codec::{Error, Result}; +use crate::codec::Result; -pub const PATH_EXPR_ASTERISK: &str = "*"; - -#[derive(Clone, Debug, PartialEq)] -pub enum PathLeg { - /// `Key` indicates the path leg with '.key'. - Key(String), - /// `Index` indicates the path leg with form 'number'. - Index(i32), - /// `DoubleAsterisk` indicates the path leg with form '**'. - DoubleAsterisk, +fn lift_error_to_failure(err: nom::Err) -> nom::Err { + if let nom::Err::Error(err) = err { + nom::Err::Failure(err) + } else { + err + } } -// ArrayIndexAsterisk is for parsing '*' into a number. -// we need this number represent "all". 
-pub const PATH_EXPR_ARRAY_INDEX_ASTERISK: i32 = -1; - -pub type PathExpressionFlag = u8; - -pub const PATH_EXPRESSION_CONTAINS_ASTERISK: PathExpressionFlag = 0x01; -pub const PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK: PathExpressionFlag = 0x02; - -#[derive(Clone, Default, Debug, PartialEq)] -pub struct PathExpression { - pub legs: Vec, - pub flags: PathExpressionFlag, +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum ArrayIndex { + Left(u32), // `Left` represents an array index start from left + Right(u32), // `Right` represents an array index start from right } -impl PathExpression { - pub fn contains_any_asterisk(&self) -> bool { - (self.flags - & (PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK)) - != 0 - } +fn array_index_left(input: &str) -> IResult<&str, ArrayIndex> { + let (input, index) = complete::u32(input)?; + Ok((input, ArrayIndex::Left(index))) } -/// `box_json_path_err` creates an error from the slice position -/// The position is added with 1, to count from 1 as start -macro_rules! box_json_path_err { - ($e:expr) => {{ - box_err!( - "Invalid JSON path expression. 
The error is around character position {}.", - ($e) + 1 - ) - }}; +fn array_index_last(input: &str) -> IResult<&str, ArrayIndex> { + let (input, _) = tag("last")(input)?; + + Ok((input, ArrayIndex::Right(0))) } -struct PathExpressionTokenizer<'a> { - input: &'a str, +fn array_index_right(input: &str) -> IResult<&str, ArrayIndex> { + let (input, _) = tag("last")(input)?; + let (input, _) = space0(input)?; + let (input, _) = char('-')(input)?; + let (input, _) = space0(input)?; - char_iterator: Peekable>, + let (input, index) = complete::u32(input)?; + Ok((input, ArrayIndex::Right(index))) } -struct Position { - start: usize, - end: usize, +fn array_index(input: &str) -> IResult<&str, ArraySelection> { + map( + alt((array_index_left, array_index_right, array_index_last)), + |index| ArraySelection::Index(index), + )(input) } -/// PathExpressionToken represents a section in path expression and its position -enum PathExpressionToken { - Leg((PathLeg, Position)), - /// Represents the beginning "$" in the expression - Start(Position), +fn array_asterisk(input: &str) -> IResult<&str, ArraySelection> { + map(char('*'), |_| ArraySelection::Asterisk)(input) } -impl<'a> Iterator for PathExpressionTokenizer<'a> { - type Item = Result; - - /// Next will try to parse the next path leg and return - /// If it returns None, it means the input is over. - /// If it returns Some(Err(..)), it means the format is error. - /// If it returns Some(Ok(..)), it represents the next token. 
- fn next(&mut self) -> Option> { - self.trim_white_spaces(); - // Trim all spaces at first - if self.reached_end() { - return None; - }; - - let (start, ch) = *self.char_iterator.peek().unwrap(); - match ch { - '$' => { - self.char_iterator.next(); - Some(Ok(PathExpressionToken::Start(Position { - start, - end: self.current_index(), - }))) +fn array_range(input: &str) -> IResult<&str, ArraySelection> { + let (input, start) = array_index(input)?; + let (input, _) = space1(input)?; + let (input, _) = tag("to")(input)?; + let (before_last_index, _) = space1(input)?; + let (input, end) = array_index(before_last_index)?; + + match (start, end) { + (ArraySelection::Index(start), ArraySelection::Index(end)) => { + // specially check the position + let allowed = match (start, end) { + (ArrayIndex::Left(start), ArrayIndex::Left(end)) => start <= end, + (ArrayIndex::Right(start), ArrayIndex::Right(end)) => start >= end, + (..) => true, + }; + if !allowed { + // TODO: use a customized error kind, as the ErrorKind::Verify is designed + // to be used in `verify` combinator + return Err(nom::Err::Failure(nom::error::make_error( + before_last_index, + nom::error::ErrorKind::Verify, + ))); } - '.' 
=> Some(self.next_key()), - '[' => Some(self.next_index()), - '*' => Some(self.next_double_asterisk()), - _ => Some(Err(box_json_path_err!(self.current_index()))), + Ok((input, ArraySelection::Range(start, end))) } + _ => unreachable!(), } } -impl<'a> PathExpressionTokenizer<'a> { - fn new(input: &'a str) -> PathExpressionTokenizer<'a> { - PathExpressionTokenizer { - input, - char_iterator: input.char_indices().peekable(), - } - } - - /// Returns the current index on the slice - fn current_index(&mut self) -> usize { - match self.char_iterator.peek() { - Some((start, _)) => *start, - None => self.input.len(), - } - } - - /// `trim_while_spaces` removes following spaces - fn trim_white_spaces(&mut self) { - while self - .char_iterator - .next_if(|(_, ch)| ch.is_whitespace()) - .is_some() - {} - } - - /// Returns whether the input has reached the end - fn reached_end(&mut self) -> bool { - return self.char_iterator.peek().is_none(); - } - - fn next_key(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); +#[derive(Clone, Debug, PartialEq)] +pub enum ArraySelection { + Asterisk, // `Asterisk` select all element from array. + Index(ArrayIndex), // `Index` select one element from array. + Range(ArrayIndex, ArrayIndex), // `Range` selects a closed-interval from array. 
+} - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +fn path_leg_array_selection(input: &str) -> IResult<&str, PathLeg> { + let (input, _) = char('[')(input)?; + let (input, _) = space0(input)?; + let (input, leg) = map( + alt((array_asterisk, array_range, array_index)), + |array_selection| PathLeg::ArraySelection(array_selection), + )(input) + .map_err(lift_error_to_failure)?; + let (input, _) = space0(input)?; + let (input, _) = char(']')(input).map_err(lift_error_to_failure)?; + + Ok((input, leg)) +} - match *self.char_iterator.peek().unwrap() { - (_, '*') => { - self.char_iterator.next().unwrap(); - - Ok(PathExpressionToken::Leg(( - PathLeg::Key(PATH_EXPR_ASTERISK.to_string()), - Position { - start, - end: self.current_index(), - }, - ))) - } - (mut key_start, '"') => { - // Skip this '"' character - key_start += 1; - self.char_iterator.next().unwrap(); +#[derive(Clone, Debug, PartialEq)] +pub enum KeySelection { + Asterisk, + Key(String), +} - // Next until the next '"' character - while self.char_iterator.next_if(|(_, ch)| *ch != '"').is_some() {} +fn key_selection_asterisk(input: &str) -> IResult<&str, KeySelection> { + map(char('*'), |_| KeySelection::Asterisk)(input) +} - // Now, it's a '"' or the end - if self.char_iterator.peek().is_none() { - return Err(box_json_path_err!(self.current_index())); +fn key_selection_key(input: &str) -> IResult<&str, KeySelection> { + let key_with_quote = map_opt( + delimited(char('"'), many1(none_of("\"")), char('"')), + |key: Vec<_>| { + let key: String = key.into_iter().collect(); + let key = unquote_string(&key).ok()?; + for ch in key.chars() { + if ch.is_control() { + return None; } + } + Some(KeySelection::Key(key)) + }, + ); + + let take_key_until_end = many1(satisfy(|ch| { + !(ch.is_whitespace() || ch == '.' 
|| ch == '[' || ch == '*') + })); + let key_without_quote = map_opt(take_key_until_end, |key: Vec<_>| { + for (i, c) in key.iter().enumerate() { + if i == 0 && c.is_ascii_digit() { + return None; + } + if !c.is_ascii_alphanumeric() && *c != '_' && *c != '$' && c.is_ascii() { + return None; + } + } - // `key_end` is the index of '"' - let key_end = self.current_index(); - self.char_iterator.next().unwrap(); - - let key = unquote_string(unsafe { self.input.get_unchecked(key_start..key_end) })?; - for ch in key.chars() { - // According to JSON standard, a string cannot - // contain any ASCII control characters - if ch.is_control() { - // TODO: add the concrete error location - // after unquote, we lost the map between - // the character and input position. - return Err(box_json_path_err!(key_start)); - } - } + Some(KeySelection::Key(key.into_iter().collect())) + }); - Ok(PathExpressionToken::Leg(( - PathLeg::Key(key), - Position { - start, - end: self.current_index(), - }, - ))) - } - (key_start, _) => { - // We have to also check the current value - while self - .char_iterator - .next_if(|(_, ch)| { - !(ch.is_whitespace() || *ch == '.' 
|| *ch == '[' || *ch == '*') - }) - .is_some() - {} - - // Now it reaches the end or a whitespace/./[/* - let key_end = self.current_index(); - - // The start character is not available - if key_end == key_start { - return Err(box_json_path_err!(key_start)); - } + alt((key_with_quote, key_without_quote))(input) +} - let key = unsafe { self.input.get_unchecked(key_start..key_end) }.to_string(); - - // It's not quoted, we'll have to validate whether it's an available ECMEScript - // identifier - for (i, c) in key.char_indices() { - if i == 0 && c.is_ascii_digit() { - return Err(box_json_path_err!(key_start + i)); - } - if !c.is_ascii_alphanumeric() && c != '_' && c != '$' && c.is_ascii() { - return Err(box_json_path_err!(key_start + i)); - } - } +fn path_leg_key(input: &str) -> IResult<&str, PathLeg> { + let (input, _) = char('.')(input)?; + let (input, _) = space0(input)?; - Ok(PathExpressionToken::Leg(( - PathLeg::Key(key), - Position { - start, - end: key_end, - }, - ))) - } - } - } + map( + alt((key_selection_key, key_selection_asterisk)), + |key_selection| PathLeg::Key(key_selection), + )(input) + .map_err(lift_error_to_failure) +} - fn next_index(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); +fn path_leg_double_asterisk(input: &str) -> IResult<&str, PathLeg> { + map(pair(char('*'), char('*')), |_| PathLeg::DoubleAsterisk)(input) +} - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +#[derive(Clone, Debug, PartialEq)] +pub enum PathLeg { + /// `Key` indicates the path leg with '.key'. + Key(KeySelection), + /// `ArraySelection` indicates the path leg with form '[...]'. + ArraySelection(ArraySelection), + /// `DoubleAsterisk` indicates the path leg with form '**'. 
+ DoubleAsterisk, +} - return match self.char_iterator.next().unwrap() { - (_, '*') => { - // Then it's a glob array index - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +pub type PathExpressionFlag = u8; - if self.char_iterator.next_if(|(_, ch)| *ch == ']').is_none() { - return Err(box_json_path_err!(self.current_index())); - } +pub const PATH_EXPRESSION_CONTAINS_ASTERISK: PathExpressionFlag = 0x01; +pub const PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK: PathExpressionFlag = 0x02; +pub const PATH_EXPRESSION_CONTAINS_RANGE: PathExpressionFlag = 0x04; - Ok(PathExpressionToken::Leg(( - PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK), - Position { - start, - end: self.current_index(), - }, - ))) +fn path_expression(input: &str) -> IResult<&str, PathExpression> { + let mut flags = PathExpressionFlag::default(); + let (input, (_, _, legs)) = tuple(( + space0, + char('$'), + many0(delimited( + space0, + alt(( + path_leg_key, + path_leg_array_selection, + path_leg_double_asterisk, + )), + space0, + )), + ))(input)?; + + for leg in legs.iter() { + match leg { + PathLeg::DoubleAsterisk => flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + PathLeg::Key(KeySelection::Asterisk) => flags |= PATH_EXPRESSION_CONTAINS_ASTERISK, + PathLeg::ArraySelection(ArraySelection::Asterisk) => { + flags |= PATH_EXPRESSION_CONTAINS_ASTERISK } - (number_start, '0'..='9') => { - // Then it's a number array index - while self - .char_iterator - .next_if(|(_, ch)| ch.is_ascii_digit()) - .is_some() - {} - let number_end = self.current_index(); - - self.trim_white_spaces(); - // now, it reaches the end of input, or reaches a non-digit character - match self.char_iterator.peek() { - Some((_, ']')) => {} - Some((pos, _)) => { - return Err(box_json_path_err!(pos)); - } - None => { - return Err(box_json_path_err!(self.current_index())); - } - } - self.char_iterator.next().unwrap(); - - let index = self.input[number_start..number_end] - 
.parse::() - .map_err(|_| -> Error { box_json_path_err!(number_end) })?; - Ok(PathExpressionToken::Leg(( - PathLeg::Index(index), - Position { - start, - end: self.current_index(), - }, - ))) + PathLeg::ArraySelection(ArraySelection::Range(..)) => { + flags |= PATH_EXPRESSION_CONTAINS_RANGE } - (pos, _) => Err(box_json_path_err!(pos)), - }; + _ => {} + } } - fn next_double_asterisk(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); + Ok((input, PathExpression { legs, flags })) +} - match self.char_iterator.next() { - Some((end, '*')) => { - // Three or more asterisks are not allowed - if let Some((pos, '*')) = self.char_iterator.peek() { - return Err(box_json_path_err!(pos)); - } +#[derive(Clone, Default, Debug, PartialEq)] +pub struct PathExpression { + pub legs: Vec, + pub flags: PathExpressionFlag, +} - Ok(PathExpressionToken::Leg(( - PathLeg::DoubleAsterisk, - Position { start, end }, - ))) - } - Some((pos, _)) => Err(box_json_path_err!(pos)), - None => Err(box_json_path_err!(self.current_index())), - } +impl PathExpression { + pub fn contains_any_asterisk(&self) -> bool { + (self.flags + & (PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK)) + != 0 + } + + pub fn contains_any_range(&self) -> bool { + (self.flags & PATH_EXPRESSION_CONTAINS_RANGE) != 0 } } +/// `box_json_path_err` creates an error from the slice position +/// The position is added with 1, to count from 1 as start +macro_rules! box_json_path_err { + ($e:expr) => {{ + box_err!( + "Invalid JSON path expression. The error is around character position {}.", + ($e) + 1 + ) + }}; +} + /// Parses a JSON path expression. Returns a `PathExpression` /// object which can be used in `JSON_EXTRACT`, `JSON_SET` and so on. 
-pub fn parse_json_path_expr(path_expr: &str) -> Result { - let mut legs = Vec::new(); - let tokenizer = PathExpressionTokenizer::new(path_expr); - let mut flags = PathExpressionFlag::default(); - - let mut started = false; - let mut last_position = Position { start: 0, end: 0 }; - for (index, token) in tokenizer.enumerate() { - let token = token?; - - match token { - PathExpressionToken::Leg((leg, position)) => { - if !started { - return Err(box_json_path_err!(position.start)); +/// +/// See `parseJSONPathExpr` in TiDB `types/json_path_expr.go`. +pub fn parse_json_path_expr(path_expr_input: &str) -> Result { + let (left_input, path_expr) = match path_expression(path_expr_input) { + Ok(ret) => ret, + Err(err) => { + let input = match err { + nom::Err::Error(err) => err.input, + nom::Err::Failure(err) => err.input, + _ => { + unreachable!() } + }; - match &leg { - PathLeg::Key(key) => { - if key == PATH_EXPR_ASTERISK { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK - } - } - PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK) => { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK - } - PathLeg::DoubleAsterisk => flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, - _ => {} - } - - legs.push(leg.clone()); - last_position = position; - } - PathExpressionToken::Start(position) => { - started = true; - - if index != 0 { - return Err(box_json_path_err!(position.start)); - } - } + return Err(box_json_path_err!(path_expr_input.len() - input.len())); } - } + }; - // There is no available token - if !started { - return Err(box_json_path_err!(path_expr.len())); + // Some extra input is left + if !left_input.is_empty() { + return Err(box_json_path_err!(path_expr_input.len() - left_input.len())); } + // The last one cannot be the double asterisk - if !legs.is_empty() && legs.last().unwrap() == &PathLeg::DoubleAsterisk { - return Err(box_json_path_err!(last_position.end)); + if !path_expr.legs.is_empty() && path_expr.legs.last().unwrap() == &PathLeg::DoubleAsterisk { + return 
Err(box_json_path_err!(path_expr_input.len() - 1)); } - Ok(PathExpression { legs, flags }) + Ok(path_expr) } #[cfg(test)] @@ -429,7 +340,7 @@ mod tests { "$.a", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("a"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("a")))], flags: PathExpressionFlag::default(), }), ), @@ -438,8 +349,8 @@ mod tests { None, Some(PathExpression { legs: vec![ - PathLeg::Key(String::from("a")), - PathLeg::Key(String::from("$")), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::Key(KeySelection::Key(String::from("$"))), ], flags: PathExpressionFlag::default(), }), @@ -448,7 +359,7 @@ mod tests { "$.\"hello world\"", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("hello world"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("hello world")))], flags: PathExpressionFlag::default(), }), ), @@ -456,7 +367,7 @@ mod tests { "$. \"你好 世界\" ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("你好 世界"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("你好 世界")))], flags: PathExpressionFlag::default(), }), ), @@ -464,7 +375,7 @@ mod tests { "$. ❤️ ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("❤️"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("❤️")))], flags: PathExpressionFlag::default(), }), ), @@ -472,7 +383,7 @@ mod tests { "$. 
你好 ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("你好"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("你好")))], flags: PathExpressionFlag::default(), }), ), @@ -480,7 +391,9 @@ mod tests { "$[ 0 ]", None, Some(PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Index( + ArrayIndex::Left(0), + ))], flags: PathExpressionFlag::default(), }), ), @@ -488,7 +401,10 @@ mod tests { "$**.a", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -496,7 +412,10 @@ mod tests { " $ ** . a", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -504,7 +423,69 @@ mod tests { " $ ** . 
$", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("$"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("$"))), + ], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }), + ), + ( + " $ [ 1 to 10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ [ 1 to last - 10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ [ 1 to last-10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ ** [ 1 to last ]", + None, + Some(PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK + | PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ ** [ last ]", + None, + Some(PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(0))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -536,8 +517,7 @@ mod tests { ), ( "$.\"\\u33\"", - // TODO: pass the position in the unquote unicode error - Some("Invalid unicode, byte len too short"), + Some("Invalid JSON path expression. The error is around character position 3."), None, ), ( @@ -547,7 +527,7 @@ mod tests { ), ( "$.\"a\\t\"", - Some("Invalid JSON path expression. The error is around character position 4."), + Some("Invalid JSON path expression. 
The error is around character position 3."), None, ), ( @@ -556,8 +536,23 @@ mod tests { None, ), ( - "$ [ 2147483648 ]", - Some("Invalid JSON path expression. The error is around character position 15."), + "$ [ 4294967296 ]", + Some("Invalid JSON path expression. The error is around character position 5."), + None, + ), + ( + "$ [ 1to2 ]", + Some("Invalid JSON path expression. The error is around character position 6."), + None, + ), + ( + "$ [ 2 to 1 ]", + Some("Invalid JSON path expression. The error is around character position 10."), + None, + ), + ( + "$ [ last - 10 to last - 20 ]", + Some("Invalid JSON path expression. The error is around character position 18."), None, ), ]; @@ -607,4 +602,22 @@ mod tests { assert_eq!(b, expected, "#{} expect {:?} but got {:?}", i, expected, b); } } + + #[test] + fn test_parse_json_path_expr_contains_any_range() { + let mut test_cases = vec![ + ("$.a[0]", false), + ("$.a[*]", false), + ("$**.a[0]", false), + ("$.a[1 to 2]", true), + ("$.a[1 to last - 2]", true), + ]; + for (i, (path_expr, expected)) in test_cases.drain(..).enumerate() { + let r = parse_json_path_expr(path_expr); + assert!(r.is_ok(), "#{} expect parse ok but got err {:?}", i, r); + let e = r.unwrap(); + let b = e.contains_any_range(); + assert_eq!(b, expected, "#{} expect {:?} but got {:?}", i, expected, b); + } + } } From e57dc6d7d6dc70d6b212c57772fc7da9f2c9f007 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 3 Nov 2022 14:18:01 +0800 Subject: [PATCH 308/676] apply: avoid unnecessary clone (#13727) close tikv/tikv#13726 avoid unnecessary clone Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/apply.rs | 33 ++++++++++----------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a5da7b9c9f1..6fce91114a7 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ 
b/components/raftstore/src/store/fsm/apply.rs @@ -1263,7 +1263,7 @@ where apply_ctx: &mut ApplyContext, index: u64, term: u64, - cmd: RaftCmdRequest, + req: RaftCmdRequest, ) -> ApplyResult { if index == 0 { panic!( @@ -1273,11 +1273,10 @@ where } // Set sync log hint if the cmd requires so. - apply_ctx.sync_log_hint |= should_sync_log(&cmd); + apply_ctx.sync_log_hint |= should_sync_log(&req); - apply_ctx.host.pre_apply(&self.region, &cmd); - let (mut resp, exec_result, should_write) = - self.apply_raft_cmd(apply_ctx, index, term, &cmd); + apply_ctx.host.pre_apply(&self.region, &req); + let (mut cmd, exec_result, should_write) = self.apply_raft_cmd(apply_ctx, index, term, req); if let ApplyResult::WaitMergeSource(_) = exec_result { return exec_result; } @@ -1291,9 +1290,8 @@ where // TODO: if we have exec_result, maybe we should return this callback too. Outer // store will call it after handing exec result. - cmd_resp::bind_term(&mut resp, self.term); - let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd)); - let cmd = Cmd::new(index, term, cmd, resp); + cmd_resp::bind_term(&mut cmd.response, self.term); + let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd.request)); apply_ctx .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); @@ -1321,8 +1319,8 @@ where ctx: &mut ApplyContext, index: u64, term: u64, - req: &RaftCmdRequest, - ) -> (RaftCmdResponse, ApplyResult, bool) { + req: RaftCmdRequest, + ) -> (Cmd, ApplyResult, bool) { // if pending remove, apply should be aborted already. assert!(!self.pending_remove); @@ -1330,7 +1328,7 @@ where // E.g. `RaftApplyState` must not be changed. let mut origin_epoch = None; - let (resp, exec_result) = if ctx.host.pre_exec(&self.region, req, index, term) { + let (resp, exec_result) = if ctx.host.pre_exec(&self.region, &req, index, term) { // One of the observers want to filter execution of the command. 
let mut resp = RaftCmdResponse::default(); if !req.get_header().get_uuid().is_empty() { @@ -1342,7 +1340,7 @@ where ctx.exec_log_index = index; ctx.exec_log_term = term; ctx.kv_wb_mut().set_save_point(); - let (resp, exec_result) = match self.exec_raft_cmd(ctx, req) { + let (resp, exec_result) = match self.exec_raft_cmd(ctx, &req) { Ok(a) => { ctx.kv_wb_mut().pop_save_point().unwrap(); if req.has_admin_request() { @@ -1383,14 +1381,15 @@ where }; (resp, exec_result) }; + + let cmd = Cmd::new(index, term, req, resp); if let ApplyResult::WaitMergeSource(_) = exec_result { - return (resp, exec_result, false); + return (cmd, exec_result, false); } self.apply_state.set_applied_index(index); self.applied_term = term; - let cmd = Cmd::new(index, term, req.clone(), resp.clone()); let (modified_region, mut pending_handle_ssts) = match exec_result { ApplyResult::Res(ref e) => match e { ExecResult::SplitRegion { ref derived, .. } => (Some(derived.clone()), None), @@ -1469,7 +1468,7 @@ where } } if let Some(epoch) = origin_epoch { - let cmd_type = req.get_admin_request().get_cmd_type(); + let cmd_type = cmd.request.get_admin_request().get_cmd_type(); let epoch_state = admin_cmd_epoch_lookup(cmd_type); // The change-epoch behavior **MUST BE** equal to the settings in // `admin_cmd_epoch_lookup` @@ -1481,7 +1480,7 @@ where panic!( "{} apply admin cmd {:?} but epoch change is not expected, epoch state {:?}, before {:?}, after {:?}", self.tag, - req, + cmd.request, epoch_state, epoch, self.region.get_region_epoch() @@ -1489,7 +1488,7 @@ where } } - (resp, exec_result, should_write) + (cmd, exec_result, should_write) } fn destroy(&mut self, apply_ctx: &mut ApplyContext) { From e1ba8a278fe370b4ffb0ca38e6789f37cabcda05 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 3 Nov 2022 17:42:00 +0800 Subject: [PATCH 309/676] txn: set last_change_ts only on 6.5+ versions (#13728) ref tikv/tikv#13694 Some old versions or components (TiKV < 5.0, TiFlash) cannot handle unknown fields in Lock 
and Write. To avoid causing unexpected problems, we add a feature gate to the new field. We are not going to release this feature in TiKV 6.4, so I directly set the minimal version to 6.5. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/mvcc/reader/scanner/forward.rs | 10 +++++++ .../txn/actions/acquire_pessimistic_lock.rs | 26 ++++++++++++++++--- src/storage/txn/sched_pool.rs | 17 +++++++++++- src/storage/txn/scheduler.rs | 3 +++ 4 files changed, 52 insertions(+), 4 deletions(-) diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 5d9d1b9bb83..709dc5803d1 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -2366,6 +2366,16 @@ mod delta_entry_tests { #[test] fn test_mess() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + // Set version to 6.5.0 to enable last_change_ts. + // TODO: Remove this after TiKV version reaches 6.5 + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + // TODO: non-pessimistic lock should be returned enven if its ts < from_ts. 
// (key, lock, [commit1, commit2, ...]) // Values ends with 'L' will be made larger than `SHORT_VALUE_MAX_LEN` so it diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 9f645e389be..db4c2485d09 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -9,7 +9,10 @@ use crate::storage::{ metrics::{MVCC_CONFLICT_COUNTER, MVCC_DUPLICATE_CMD_COUNTER_VEC}, ErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, }, - txn::actions::check_data_constraint::check_data_constraint, + txn::{ + actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, + scheduler::LAST_CHANGE_TS, + }, types::PessimisticLockKeyResult, Snapshot, }; @@ -262,7 +265,9 @@ pub fn acquire_pessimistic_lock( check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; } - (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); + if tls_can_enable(LAST_CHANGE_TS) { + (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); + } // Load value if locked_with_conflict, so that when the client (TiDB) need to // read the value during statement retry, it will be possible to read the value @@ -1612,11 +1617,19 @@ pub mod tests { #[test] fn test_calculate_last_change_ts() { use engine_traits::CF_WRITE; + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; let mut engine = TestEngineBuilder::new().build().unwrap(); let key = b"k"; - // Latest version is a PUT + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate.clone()); + + // Latest version is a PUT, but last_change_ts is enabled with cluster version + // higher than 6.5.0. 
let write = Write::new(WriteType::Put, 15.into(), Some(b"value".to_vec())); engine .put_cf( @@ -1628,6 +1641,13 @@ pub mod tests { .unwrap(); must_succeed(&mut engine, key, key, 10, 30); let lock = must_pessimistic_locked(&mut engine, key, 10, 30); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + pessimistic_rollback::tests::must_success(&mut engine, key, 10, 30); + // Set cluster version to 6.5.0, last_change_ts should work now. + feature_gate.set_version("6.5.0").unwrap(); + must_succeed(&mut engine, key, key, 10, 30); + let lock = must_pessimistic_locked(&mut engine, key, 10, 30); assert_eq!(lock.last_change_ts, 20.into()); assert_eq!(lock.versions_to_last_change, 1); pessimistic_rollback::tests::must_success(&mut engine, key, 10, 30); diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 78a891b650e..c7c69b5bbf4 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -9,6 +9,7 @@ use std::{ use collections::HashMap; use file_system::{set_io_type, IoType}; use kvproto::pdpb::QueryKind; +use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; use tikv_util::{ @@ -19,6 +20,7 @@ use tikv_util::{ use crate::storage::{ kv::{destroy_tls_engine, set_tls_engine, Engine, FlowStatsReporter, Statistics}, metrics::*, + test_util::latest_feature_gate, }; pub struct SchedLocalMetrics { @@ -28,13 +30,15 @@ pub struct SchedLocalMetrics { } thread_local! 
{ - static TLS_SCHED_METRICS: RefCell = RefCell::new( + static TLS_SCHED_METRICS: RefCell = RefCell::new( SchedLocalMetrics { local_scan_details: HashMap::default(), command_keyread_histogram_vec: KV_COMMAND_KEYREAD_HISTOGRAM_VEC.local(), local_write_stats:WriteStats::default(), } ); + + static TLS_FEATURE_GATE: RefCell = RefCell::new(latest_feature_gate()); } #[derive(Clone)] @@ -58,6 +62,7 @@ impl SchedPool { engine: E, pool_size: usize, reporter: R, + feature_gate: FeatureGate, name_prefix: &str, ) -> Self { let engine = Arc::new(Mutex::new(engine)); @@ -75,6 +80,7 @@ impl SchedPool { .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); set_io_type(IoType::ForegroundWrite); + TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); }) .before_stop(move || unsafe { // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. @@ -134,3 +140,12 @@ pub fn tls_collect_keyread_histogram_vec(cmd: &str, count: f64) { .observe(count); }); } + +pub fn tls_can_enable(feature: Feature) -> bool { + TLS_FEATURE_GATE.with(|feature_gate| feature_gate.borrow().can_enable(feature)) +} + +#[cfg(test)] +pub fn set_tls_feature_gate(feature_gate: FeatureGate) { + TLS_FEATURE_GATE.with(|f| *f.borrow_mut() = feature_gate); +} diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 917c9fbaffc..9966e14812e 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -95,6 +95,7 @@ const TASKS_SLOTS_NUM: usize = 1 << 12; // 4096 slots. pub const DEFAULT_EXECUTION_DURATION_LIMIT: Duration = Duration::from_secs(24 * 60 * 60); const IN_MEMORY_PESSIMISTIC_LOCK: Feature = Feature::require(6, 0, 0); +pub const LAST_CHANGE_TS: Feature = Feature::require(6, 5, 0); /// Task is a running command. 
pub(super) struct Task { @@ -391,12 +392,14 @@ impl Scheduler { engine.clone(), config.scheduler_worker_pool_size, reporter.clone(), + feature_gate.clone(), "sched-worker-pool", ), high_priority_pool: SchedPool::new( engine, std::cmp::max(1, config.scheduler_worker_pool_size / 2), reporter, + feature_gate.clone(), "sched-high-pri-pool", ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), From 133769217ef897dbed04478de71d2c345973d867 Mon Sep 17 00:00:00 2001 From: Yexiang Zhang Date: Thu, 3 Nov 2022 23:46:00 +0800 Subject: [PATCH 310/676] *: upgrade pprof-rs to v0.11 (#13733) close tikv/tikv#13732, ref tikv/tikv#13732 Signed-off-by: mornyx Co-authored-by: Ti Chi Robot --- Cargo.lock | 49 ++++++++++++++++++++++++++++--------------------- Cargo.toml | 2 +- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 25a68864586..d49c13ae18c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -296,7 +296,7 @@ dependencies = [ "tikv_util", "tokio", "url", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -344,7 +344,7 @@ dependencies = [ "serde_json", "thiserror", "url", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -389,7 +389,7 @@ dependencies = [ "serde_json", "thiserror", "url", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -512,7 +512,7 @@ dependencies = [ "tonic", "txn_types", "url", - "uuid", + "uuid 0.8.2", "walkdir", "yatp", ] @@ -1337,11 +1337,11 @@ dependencies = [ [[package]] name = "debugid" -version = "0.7.2" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91cf5a8c2f2097e2a32627123508635d47ce10563d999ec1a95addf08b502ba" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" dependencies = [ - "uuid", + "uuid 1.2.1", ] [[package]] @@ -3859,8 +3859,9 @@ dependencies = [ [[package]] name = "pprof" -version = "0.9.1" -source = "git+https://github.com/tikv/pprof-rs.git?rev=3fed55af8fc6cf69dbd954a0321c799c5a111e4e#3fed55af8fc6cf69dbd954a0321c799c5a111e4e" +version = 
"0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e20150f965e0e4c925982b9356da71c84bcd56cb66ef4e894825837cbcf6613e" dependencies = [ "backtrace", "cfg-if 1.0.0", @@ -4263,7 +4264,7 @@ dependencies = [ "tokio", "tracker", "txn_types", - "uuid", + "uuid 0.8.2", "yatp", ] @@ -5442,7 +5443,7 @@ dependencies = [ "tikv_util", "tokio", "txn_types", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -5562,21 +5563,21 @@ checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd" [[package]] name = "symbolic-common" -version = "8.0.0" +version = "10.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0caab39ce6f074031b8fd3dd297bfda70a2d1f33c6e7cc1b737ac401f856448d" +checksum = "ac457d054f793cedfde6f32d21d692b8351cfec9084fefd0470c0373f6d799bc" dependencies = [ "debugid", - "memmap", + "memmap2", "stable_deref_trait", - "uuid", + "uuid 1.2.1", ] [[package]] name = "symbolic-demangle" -version = "8.0.0" +version = "10.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b77ecb5460a87faa37ed53521eed8f073c8339b7a5788c1f93efc09ce74e1b68" +checksum = "48808b846eef84e0ac06365dc620f028ae632355e5dcffc007bf1b2bf5eab17b" dependencies = [ "rustc-demangle", "symbolic-common", @@ -5857,7 +5858,7 @@ dependencies = [ "engine_traits", "keys", "kvproto", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -5974,7 +5975,7 @@ dependencies = [ "tokio", "toml", "txn_types", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -6171,7 +6172,7 @@ dependencies = [ "tipb", "tipb_helper", "twoway", - "uuid", + "uuid 0.8.2", ] [[package]] @@ -6295,7 +6296,7 @@ dependencies = [ "tracker", "txn_types", "url", - "uuid", + "uuid 0.8.2", "walkdir", "yatp", "zipf", @@ -6965,6 +6966,12 @@ dependencies = [ "serde", ] +[[package]] +name = "uuid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" + [[package]] 
name = "valgrind_request" version = "1.1.0" diff --git a/Cargo.toml b/Cargo.toml index 756f36a0c50..4ccf0a2ad93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -129,7 +129,7 @@ paste = "1.0" pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" -pprof = { git = "https://github.com/tikv/pprof-rs.git", rev = "3fed55af8fc6cf69dbd954a0321c799c5a111e4e", default-features = false, features = ["flamegraph", "protobuf-codec"] } +pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec"] } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } From be76c441d714e85a381f3af8eb15caa40b7fb007 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 4 Nov 2022 10:00:01 +0800 Subject: [PATCH 311/676] Optimize slow-score mechanism to reduce the time-cost on recovery when IO hung. (#13654) close tikv/tikv#13648 1. Make the reporting of Slow-Score more timely. Optimize the reporting strategy of slow-score. Making it more timely, even if store-heartbeat is delayed because of IO delay in one TiKV node. 2. Awaken hibernated regions on healthy nodes timely when IO hang. If one TiKV node is hung by abnormal IO, we can awaken related hibernated regions in time, to trigger self-revoting in these raft-groups for recovery. 
Signed-off-by: Lucasliang --- components/raftstore/src/store/fsm/peer.rs | 12 +- components/raftstore/src/store/fsm/store.rs | 50 +++++++- components/raftstore/src/store/msg.rs | 5 + components/raftstore/src/store/worker/pd.rs | 121 ++++++++++++++++++-- components/tikv_util/src/store/mod.rs | 74 +++++++++++- components/tikv_util/src/store/peer.rs | 34 ------ components/tikv_util/src/store/region.rs | 15 +++ 7 files changed, 261 insertions(+), 50 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index b7f7b005137..69215ecaf70 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2660,7 +2660,17 @@ where match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { if self.fsm.hibernate_state.group_state() == GroupState::Idle { - self.reset_raft_tick(GroupState::Ordered); + if msg.get_extra_msg().forcely_awaken { + // Forcely awaken this region by manually setting this GroupState + // into Chaos to trigger a new voting in this RaftGroup. 
+ self.reset_raft_tick(if !self.fsm.peer.is_leader() { + GroupState::Chaos + } else { + GroupState::Ordered + }); + } else { + self.reset_raft_tick(GroupState::Ordered); + } } if msg.get_extra_msg().get_type() == ExtraMessageType::MsgRegionWakeUp && self.fsm.peer.is_leader() diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2bb2ea636e1..0f172b6c70f 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -36,7 +36,7 @@ use kvproto::{ metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, - raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; use pd_client::{Feature, FeatureGate, PdClient}; @@ -53,7 +53,7 @@ use tikv_util::{ info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, slow_log, - store::find_peer, + store::{find_peer, region_on_stores}, sys as sys_util, sys::disk::{get_disk_status, DiskUsage}, time::{duration_to_sec, Instant as TiInstant}, @@ -758,6 +758,9 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> drop(syncer); } StoreMsg::GcSnapshotFinish => self.register_snap_mgr_gc_tick(), + StoreMsg::AwakenRegions { abnormal_stores } => { + self.on_wake_up_regions(abnormal_stores); + } } } self.ctx @@ -2447,11 +2450,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ); stats.set_query_stats(query_stats); - let store_info = StoreInfo { + let store_info = Some(StoreInfo { kv_engine: self.ctx.engines.kv.clone(), raft_engine: self.ctx.engines.raft.clone(), capacity: self.ctx.cfg.capacity.0, - }; + }); let task = PdTask::StoreHeartbeat { stats, @@ -2534,6 +2537,45 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER 
self.register_compact_lock_cf_tick(); } + fn on_wake_up_regions(&self, abnormal_stores: Vec) { + info!("try to wake up all hibernated regions in this store"; + "to_all" => abnormal_stores.is_empty()); + let meta = self.ctx.store_meta.lock().unwrap(); + for region_id in meta.regions.keys() { + let region = &meta.regions[region_id]; + // Check whether the current region is not found on abnormal stores. If so, + // this region is not the target to be awaken. + if !region_on_stores(region, &abnormal_stores) { + continue; + } + let peer = { + match find_peer(region, self.ctx.store_id()) { + None => continue, + Some(p) => p.clone(), + } + }; + { + // Send MsgRegionWakeUp to Peer for awakening hibernated regions. + let mut message = RaftMessage::default(); + message.set_region_id(*region_id); + message.set_from_peer(peer.clone()); + message.set_to_peer(peer); + message.set_region_epoch(region.get_region_epoch().clone()); + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgRegionWakeUp); + msg.forcely_awaken = true; + message.set_extra_msg(msg); + if let Err(e) = self.ctx.router.send_raft_message(message) { + error!( + "send awaken region message failed"; + "region_id" => region_id, + "err" => ?e + ); + } + } + } + } + fn register_pd_store_heartbeat_tick(&self) { self.ctx.schedule_store_tick( StoreTick::PdStoreHeartbeat, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index b86700af8e6..262f9fd64c5 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -845,6 +845,10 @@ where }, GcSnapshotFinish, + + AwakenRegions { + abnormal_stores: Vec, + }, } impl fmt::Debug for StoreMsg @@ -878,6 +882,7 @@ where write!(fmt, "UnsafeRecoveryCreatePeer") } StoreMsg::GcSnapshotFinish => write!(fmt, "GcSnapshotFinish"), + StoreMsg::AwakenRegions { .. 
} => write!(fmt, "AwakenRegions"), } } } diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index ec06d756fe9..fdfa1b44c85 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -53,6 +53,7 @@ use yatp::Remote; use crate::{ coprocessor::CoprocessorHost, + router::RaftStoreRouter, store::{ cmd_resp::new_error, metrics::*, @@ -151,7 +152,7 @@ where Heartbeat(HeartbeatTask), StoreHeartbeat { stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, report: Option, dr_autosync_status: Option, }, @@ -204,6 +205,9 @@ pub struct StoreStat { pub engine_last_total_bytes_read: u64, pub engine_last_total_keys_read: u64, pub engine_last_query_num: QueryStats, + pub engine_last_capacity_size: u64, + pub engine_last_used_size: u64, + pub engine_last_available_size: u64, pub last_report_ts: UnixSecs, pub region_bytes_read: LocalHistogram, @@ -229,6 +233,9 @@ impl Default for StoreStat { engine_total_keys_read: 0, engine_last_total_bytes_read: 0, engine_last_total_keys_read: 0, + engine_last_capacity_size: 0, + engine_last_used_size: 0, + engine_last_available_size: 0, engine_total_query_num: QueryStats::default(), engine_last_query_num: QueryStats::default(), @@ -733,6 +740,9 @@ fn hotspot_query_num_report_threshold() -> u64 { HOTSPOT_QUERY_RATE_THRESHOLD * 10 } +/// Max limitation of delayed store_heartbeat. +const STORE_HEARTBEAT_DELAY_LIMIT: u64 = 5 * 60; + // Slow score is a value that represents the speed of a store and ranges in [1, // 100]. It is maintained in the AIMD way. 
// If there are some inspecting requests timeout during a round, by default the @@ -829,6 +839,10 @@ impl SlowScore { self.last_update_time = Instant::now(); self.value } + + fn should_force_report_slow_store(&self) -> bool { + self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + } } // RegionCpuMeteringCollector is used to collect the region-related CPU info. @@ -883,6 +897,7 @@ where // calls Runner's run() on Task received. scheduler: Scheduler>, stats_monitor: StatsMonitor, + store_heartbeat_interval: Duration, collector_reg_handle: CollectorRegHandle, region_cpu_records_collector: Option, @@ -958,6 +973,7 @@ where store_stat: StoreStat::default(), start_ts: UnixSecs::now(), scheduler, + store_heartbeat_interval, stats_monitor, collector_reg_handle, region_cpu_records_collector, @@ -1176,7 +1192,7 @@ where fn handle_store_heartbeat( &mut self, mut stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, store_report: Option, dr_autosync_status: Option, ) { @@ -1207,13 +1223,27 @@ where } stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); - let (capacity, used_size, available) = match collect_engine_size( - &self.coprocessor_host, - Some(&store_info), - self.snap_mgr.get_total_snap_size().unwrap(), - ) { - Some((capacity, used_size, available)) => (capacity, used_size, available), - None => return, + let (capacity, used_size, available) = if store_info.is_some() { + match collect_engine_size( + &self.coprocessor_host, + store_info.as_ref(), + self.snap_mgr.get_total_snap_size().unwrap(), + ) { + Some((capacity, used_size, available)) => { + // Update last reported infos on engine_size. 
+ self.store_stat.engine_last_capacity_size = capacity; + self.store_stat.engine_last_used_size = used_size; + self.store_stat.engine_last_available_size = available; + (capacity, used_size, available) + } + None => return, + } + } else { + ( + self.store_stat.engine_last_capacity_size, + self.store_stat.engine_last_used_size, + self.store_stat.engine_last_available_size, + ) }; stats.set_capacity(capacity); @@ -1251,7 +1281,14 @@ where self.store_stat .engine_last_query_num .fill_query_stats(&self.store_stat.engine_total_query_num); - self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.last_report_ts = if store_info.is_some() { + UnixSecs::now() + } else { + // If `store_info` is None, the given Task::StoreHeartbeat should be a fake + // heartbeat to PD, we won't update the last_report_ts to avoid incorrectly + // marking current TiKV node in normal state. + self.store_stat.last_report_ts + }; self.store_stat.region_bytes_written.flush(); self.store_stat.region_keys_written.flush(); self.store_stat.region_bytes_read.flush(); @@ -1338,6 +1375,14 @@ where } } } + // Forcely awaken all hibernated regions if there existed slow stores in this + // cluster. + if let Some(awaken_regions) = resp.awaken_regions.take() { + info!("forcely awaken hibernated regions in this store"); + let _ = router.send_store_msg(StoreMsg::AwakenRegions { + abnormal_stores: awaken_regions.get_abnormal_stores().to_vec(), + }); + } } Err(e) => { error!("store heartbeat failed"; "err" => ?e); @@ -1786,6 +1831,55 @@ where health_service.set_serving_status("", status); } } + + /// Force to send a special heartbeat to pd when current store is hung on + /// some special circumstances, i.e. disk busy, handler busy and others. 
+ fn handle_fake_store_heartbeat(&mut self) { + let mut stats = pdpb::StoreStats::default(); + stats.set_store_id(self.store_id); + stats.set_region_count(self.region_peers.len() as u32); + + let snap_stats = self.snap_mgr.stats(); + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["sending"]) + .set(snap_stats.sending_count as i64); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["receiving"]) + .set(snap_stats.receiving_count as i64); + + stats.set_start_time(self.start_ts.into_inner() as u32); + + // This calling means that the current node cannot report heartbeat in normaly + // scheduler. That is, the current node must in `busy` state. + stats.set_is_busy(true); + + // We do not need to report store_info, so we just set `None` here. + let task = Task::StoreHeartbeat { + stats, + store_info: None, + report: None, + dr_autosync_status: None, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!("force report store heartbeat failed"; + "store_id" => self.store_id, + "err" => ?e + ); + } else { + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); + } + } + + fn is_store_heartbeat_delayed(&self) -> bool { + let now = UnixSecs::now(); + let interval_second = now.into_inner() - self.store_stat.last_report_ts.into_inner(); + (interval_second >= self.store_heartbeat_interval.as_secs()) + && (interval_second <= STORE_HEARTBEAT_DELAY_LIMIT) + } } fn calculate_region_cpu_records( @@ -2065,6 +2159,13 @@ where } if !self.slow_score.last_tick_finished { self.slow_score.record_timeout(); + // If the last slow_score already reached abnormal state and was delayed for + // reporting by `store-heartbeat` to PD, we should report it here manually as + // a FAKE `store-heartbeat`. 
+ if self.slow_score.should_force_report_slow_store() && self.is_store_heartbeat_delayed() + { + self.handle_fake_store_heartbeat(); + } } let scheduler = self.scheduler.clone(); let id = self.slow_score.last_tick_id + 1; diff --git a/components/tikv_util/src/store/mod.rs b/components/tikv_util/src/store/mod.rs index 81afff2975a..f4bfea93519 100644 --- a/components/tikv_util/src/store/mod.rs +++ b/components/tikv_util/src/store/mod.rs @@ -9,6 +9,78 @@ pub use self::{ query_stats::{is_read_query, QueryStats}, region::{ check_key_in_region, check_key_in_region_exclusive, check_key_in_region_inclusive, - region_on_same_stores, + region_on_same_stores, region_on_stores, }, }; + +#[cfg(test)] +mod tests { + use kvproto::metapb::Region; + + use super::*; + + #[test] + fn test_on_same_store() { + let cases = vec![ + (vec![2, 3, 4], vec![], vec![1, 2, 3], vec![], false), + (vec![2, 3, 1], vec![], vec![1, 2, 3], vec![], true), + (vec![2, 3, 4], vec![], vec![1, 2], vec![], false), + (vec![1, 2, 3], vec![], vec![1, 2, 3], vec![], true), + (vec![1, 3], vec![2, 4], vec![1, 2], vec![3, 4], false), + (vec![1, 3], vec![2, 4], vec![1, 3], vec![], false), + (vec![1, 3], vec![2, 4], vec![], vec![2, 4], false), + (vec![1, 3], vec![2, 4], vec![3, 1], vec![4, 2], true), + ]; + + for (s1, s2, s3, s4, exp) in cases { + let mut r1 = Region::default(); + for (store_id, peer_id) in s1.into_iter().zip(0..) { + r1.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s2.into_iter().zip(0..) { + r1.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + + let mut r2 = Region::default(); + for (store_id, peer_id) in s3.into_iter().zip(10..) { + r2.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s4.into_iter().zip(10..) 
{ + r2.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + let res = region_on_same_stores(&r1, &r2); + assert_eq!(res, exp, "{:?} vs {:?}", r1, r2); + } + } + + #[test] + fn test_check_region_on_store() { + let cases = vec![ + (vec![1, 2, 3], vec![], vec![], true), + (vec![2, 3, 1], vec![], vec![1], true), + (vec![1, 3, 2], vec![], vec![2, 3], true), + (vec![3, 2, 1], vec![], vec![4], false), + (vec![1, 2, 3], vec![], vec![2, 4], true), + (vec![1, 3], vec![2, 4], vec![2], true), + (vec![1, 3], vec![2, 4], vec![2, 3], true), + (vec![1, 3], vec![2], vec![4], false), + ]; + + for (s1, s2, target_stores, exp) in cases { + let mut region = Region::default(); + for (store_id, peer_id) in s1.into_iter().zip(0..) { + region.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s2.into_iter().zip(0..) { + region.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + + let res = region_on_stores(®ion, &target_stores); + assert_eq!( + res, exp, + "region {:?} exists on {:?}", + region, target_stores + ); + } + } +} diff --git a/components/tikv_util/src/store/peer.rs b/components/tikv_util/src/store/peer.rs index 1a9184134f0..59844bc957a 100644 --- a/components/tikv_util/src/store/peer.rs +++ b/components/tikv_util/src/store/peer.rs @@ -63,38 +63,4 @@ mod tests { assert!(remove_peer(&mut region, 1).is_none()); assert!(find_peer(®ion, 1).is_none()); } - - #[test] - fn test_on_same_store() { - let cases = vec![ - (vec![2, 3, 4], vec![], vec![1, 2, 3], vec![], false), - (vec![2, 3, 1], vec![], vec![1, 2, 3], vec![], true), - (vec![2, 3, 4], vec![], vec![1, 2], vec![], false), - (vec![1, 2, 3], vec![], vec![1, 2, 3], vec![], true), - (vec![1, 3], vec![2, 4], vec![1, 2], vec![3, 4], false), - (vec![1, 3], vec![2, 4], vec![1, 3], vec![], false), - (vec![1, 3], vec![2, 4], vec![], vec![2, 4], false), - (vec![1, 3], vec![2, 4], vec![3, 1], vec![4, 2], true), - ]; - - for (s1, s2, s3, s4, exp) in cases { - let mut r1 = Region::default(); - for 
(store_id, peer_id) in s1.into_iter().zip(0..) { - r1.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s2.into_iter().zip(0..) { - r1.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - - let mut r2 = Region::default(); - for (store_id, peer_id) in s3.into_iter().zip(10..) { - r2.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s4.into_iter().zip(10..) { - r2.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - let res = super::super::region_on_same_stores(&r1, &r2); - assert_eq!(res, exp, "{:?} vs {:?}", r1, r2); - } - } } diff --git a/components/tikv_util/src/store/region.rs b/components/tikv_util/src/store/region.rs index 17c3209e7d4..580d940ebeb 100644 --- a/components/tikv_util/src/store/region.rs +++ b/components/tikv_util/src/store/region.rs @@ -38,6 +38,21 @@ pub fn region_on_same_stores(lhs: &Region, rhs: &Region) -> bool { }) } +/// Check if the given region exists on stores, by checking whether any one of +/// the peers belonging to this region exist on the given stores. +pub fn region_on_stores(region: &Region, store_ids: &Vec) -> bool { + if store_ids.is_empty() { + return true; + } + // If one of peers in this region exists on any on in `store_ids`, it shows that + // the region exists on the given stores. + region.get_peers().iter().any(|p| { + store_ids + .iter() + .any(|store_id| *store_id == p.get_store_id()) + }) +} + #[cfg(test)] mod tests { use super::*; From c627407cbe265cab4965b4c8b5f324cee7e844bd Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 4 Nov 2022 10:16:01 +0800 Subject: [PATCH 312/676] *: resolve when flashback meets error in the first batch (#13695) close tikv/tikv#13672, close tikv/tikv#13704, close tikv/tikv#13723 resolve when flashback meets error in the first batch. 
Signed-off-by: husharp Signed-off-by: JmPotato Co-authored-by: JmPotato Co-authored-by: Ti Chi Robot --- components/test_raftstore/src/util.rs | 35 ++- src/server/service/kv.rs | 28 +-- src/storage/mod.rs | 73 +++---- src/storage/mvcc/reader/reader.rs | 2 +- .../txn/actions/flashback_to_version.rs | 199 ++++++++++-------- .../txn/commands/flashback_to_version.rs | 109 ++++++---- .../flashback_to_version_read_phase.rs | 182 ++++++++++------ src/storage/txn/commands/mod.rs | 11 +- src/storage/txn/mod.rs | 4 +- tests/integrations/server/kv_service.rs | 133 ++++++++++-- 10 files changed, 517 insertions(+), 259 deletions(-) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 3718dbce906..1e35dc0cf13 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -818,6 +818,35 @@ pub fn must_kv_read_equal(client: &TikvClient, ctx: Context, key: Vec, val: assert_eq!(get_resp.take_value(), val); } +// TODO: replace the redundant code +pub fn complete_data_commit(client: &TikvClient, ctx: &Context, ts: u64, k: Vec, v: Vec) { + // Prewrite + let prewrite_start_version = ts + 1; + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite( + client, + ctx.clone(), + vec![mutation], + k.clone(), + prewrite_start_version, + ); + // Commit + let commit_version = ts + 2; + must_kv_commit( + client, + ctx.clone(), + vec![k.clone()], + prewrite_start_version, + commit_version, + commit_version, + ); + // Get + must_kv_read_equal(client, ctx.clone(), k, v, ts + 3); +} + pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetResponse { let mut get_req = GetRequest::default(); get_req.set_context(ctx); @@ -1224,7 +1253,7 @@ pub fn must_flashback_to_version( version: u64, start_ts: u64, commit_ts: u64, -) -> FlashbackToVersionResponse { +) { let mut prepare_req = 
PrepareFlashbackToVersionRequest::default(); prepare_req.set_context(ctx.clone()); client @@ -1237,7 +1266,9 @@ pub fn must_flashback_to_version( req.version = version; req.start_key = b"a".to_vec(); req.end_key = b"z".to_vec(); - client.kv_flashback_to_version(&req).unwrap() + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); } // A helpful wrapper to make the test logic clear diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 8ac91031c33..7fc5bb77f31 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1779,24 +1779,26 @@ fn future_flashback_to_version< Err(e) => Err(e), Ok(_) => f.await?, }; - fail_point!("skip_finish_flashback_to_version", |_| { - Ok(FlashbackToVersionResponse::default()) - }); - // Send an `AdminCmdType::FinishFlashback` to unset the persistence state - // in `RegionLocalState` and region's meta, and when that - // admin cmd is applied, will update the memory - // state of the flashback - send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::FinishFlashback, - ) - .await?; let mut resp = FlashbackToVersionResponse::default(); if let Some(err) = extract_region_error(&v) { resp.set_region_error(err); } else if let Err(e) = v { resp.set_error(format!("{}", e)); + } else { + // Only finish flashback when Flashback executed successfully. 
+ fail_point!("skip_finish_flashback_to_version", |_| { + Ok(FlashbackToVersionResponse::default()) + }); + // Send an `AdminCmdType::FinishFlashback` to unset the persistence state + // in `RegionLocalState` and region's meta, and when that + // admin cmd is applied, will update the memory + // state of the flashback + send_flashback_msg::( + &raft_router, + req.get_context(), + AdminCmdType::FinishFlashback, + ) + .await?; } Ok(resp) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 16043a348ce..1c2688dd8a8 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3472,7 +3472,7 @@ mod tests { use super::{ mvcc::tests::{must_unlocked, must_written}, test_util::*, - txn::FLASHBACK_BATCH_SIZE, + txn::{commands::new_flashback_to_version_read_phase_cmd, FLASHBACK_BATCH_SIZE}, *, }; use crate::{ @@ -4745,13 +4745,12 @@ mod tests { let version = write.2; storage .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( + new_flashback_to_version_read_phase_cmd( start_ts, commit_ts, version, - None, - Some(key.clone()), - Some(key.clone()), + key.clone(), + Key::from_raw(b"z"), Context::default(), ), expect_ok_callback(tx.clone(), 2), @@ -4836,13 +4835,12 @@ mod tests { let commit_ts = *ts.incr(); storage .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( + new_flashback_to_version_read_phase_cmd( start_ts, commit_ts, 2.into(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), + Key::from_raw(b"k"), + Key::from_raw(b"z"), Context::default(), ), expect_ok_callback(tx.clone(), 3), @@ -4859,13 +4857,12 @@ mod tests { let commit_ts = *ts.incr(); storage .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( + new_flashback_to_version_read_phase_cmd( start_ts, commit_ts, 1.into(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), + Key::from_raw(b"k"), + Key::from_raw(b"z"), Context::default(), ), expect_ok_callback(tx, 4), @@ -4950,29 +4947,33 @@ mod tests { .0, ); } - // Flashback all records. 
- storage - .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( - *ts.incr(), - *ts.incr(), - TimeStamp::zero(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), - Context::default(), - ), - expect_ok_callback(tx, 2), - ) - .unwrap(); - rx.recv().unwrap(); - for i in 1..=FLASHBACK_BATCH_SIZE * 4 { - let key = Key::from_raw(format!("k{}", i).as_bytes()); - expect_none( - block_on(storage.get(Context::default(), key, *ts.incr())) - .unwrap() - .0, - ); + // Flashback all records multiple times to make sure the flashback operation is + // idempotent. + let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + for _ in 0..10 { + storage + .sched_txn_command( + new_flashback_to_version_read_phase_cmd( + flashback_start_ts, + flashback_commit_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + Context::default(), + ), + expect_ok_callback(tx.clone(), 2), + ) + .unwrap(); + rx.recv().unwrap(); + for i in 1..=FLASHBACK_BATCH_SIZE * 4 { + let key = Key::from_raw(format!("k{}", i).as_bytes()); + expect_none( + block_on(storage.get(Context::default(), key, *ts.incr())) + .unwrap() + .0, + ); + } } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 321cc21427f..d4767f3bb1a 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -2006,7 +2006,7 @@ pub mod tests { ], expect_is_remain: true, }, - // k1 and k2 have old version writes at version 8. + // k1 and k2 have old version writes at version 3. 
Case { start_key: None, end_key: None, diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 96f80b9389c..5a86a6caa7d 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -12,16 +12,13 @@ pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for mul pub fn flashback_to_version_read_lock( reader: &mut MvccReader, - next_lock_key: &Option, - end_key: &Option, + next_lock_key: Key, + end_key: &Key, statistics: &mut Statistics, ) -> TxnResult<(Vec<(Key, Lock)>, bool)> { - if next_lock_key.is_none() { - return Ok((vec![], false)); - } let key_locks_result = reader.scan_locks( - next_lock_key.as_ref(), - end_key.as_ref(), + Some(&next_lock_key), + Some(end_key), // To flashback `CF_LOCK`, we need to delete all locks. |_| true, FLASHBACK_BATCH_SIZE, @@ -32,74 +29,82 @@ pub fn flashback_to_version_read_lock( pub fn flashback_to_version_read_write( reader: &mut MvccReader, - key_locks_len: usize, - next_write_key: &Option, - end_key: &Option, + next_write_key: Key, + end_key: &Key, flashback_version: TimeStamp, flashback_start_ts: TimeStamp, flashback_commit_ts: TimeStamp, statistics: &mut Statistics, -) -> TxnResult<(Vec<(Key, Option)>, bool)> { - if next_write_key.is_none() { - return Ok((vec![], false)); - } else if key_locks_len >= FLASHBACK_BATCH_SIZE { - // The batch is full, we need to read the writes in the next batch later. - return Ok((vec![], true)); - } +) -> TxnResult)>> { // To flashback the data, we need to get all the latest keys first by scanning // every unique key in `CF_WRITE` and to get its corresponding old MVCC write // record if exists. - let (key_ts_old_writes, has_remain_writes) = reader.scan_writes( - next_write_key.as_ref(), - end_key.as_ref(), - Some(flashback_version), - // No need to find an old version for the key if its latest `commit_ts` is smaller - // than or equal to the version. 
- |key| key.decode_ts().unwrap_or(TimeStamp::zero()) > flashback_version, - FLASHBACK_BATCH_SIZE - key_locks_len, - )?; - statistics.add(&reader.statistics); - let mut key_old_writes = Vec::with_capacity(FLASHBACK_BATCH_SIZE - key_locks_len); - // Check the latest commit ts to make sure there is no commit change during the - // flashback, otherwise, we need to abort the flashback. - for (key, commit_ts, old_write) in key_ts_old_writes { - if commit_ts > flashback_commit_ts { - return Err(Error::from(ErrorInner::InvalidTxnTso { - start_ts: flashback_start_ts, - commit_ts: flashback_commit_ts, - })); + let mut key_old_writes = Vec::with_capacity(FLASHBACK_BATCH_SIZE); + let mut has_remain_writes = true; + let mut next_write_key = next_write_key; + // Try to read as many writes as possible in one batch. + while key_old_writes.len() < FLASHBACK_BATCH_SIZE && has_remain_writes { + let key_ts_old_writes; + (key_ts_old_writes, has_remain_writes) = reader.scan_writes( + Some(&next_write_key), + Some(end_key), + Some(flashback_version), + // No need to find an old version for the key if its latest `commit_ts` is smaller + // than or equal to the version. + |key| key.decode_ts().unwrap_or(TimeStamp::zero()) > flashback_version, + FLASHBACK_BATCH_SIZE - key_old_writes.len(), + )?; + statistics.add(&reader.statistics); + // If `has_remain_writes` is true, it means that the batch is full and we may + // need to read another round, so we have to update the `next_write_key` here. + if has_remain_writes { + next_write_key = key_ts_old_writes + .last() + .map(|(key, ..)| key.clone()) + .unwrap(); } - // Since the first flashback preparation phase make sure there will be no writes - // other than flashback after it, so we need to check if there is already a - // successful flashback result, and if so, just finish the flashback ASAP. 
- if commit_ts == flashback_commit_ts { - key_old_writes.clear(); - return Ok((key_old_writes, false)); + // Check the latest commit ts to make sure there is no commit change during the + // flashback, otherwise, we need to abort the flashback. + for (key, commit_ts, old_write) in key_ts_old_writes.into_iter() { + if commit_ts > flashback_commit_ts { + return Err(Error::from(ErrorInner::InvalidTxnTso { + start_ts: flashback_start_ts, + commit_ts: flashback_commit_ts, + })); + } + // Although the first flashback preparation phase makes sure there will be no + // writes other than flashback after it, we CAN NOT return directly here. + // Suppose the second phase procedure contains two batches to flashback. After + // the first batch is committed, if the region is down, the client will retry + // the flashback from the very first beginning, because the data in the + // first batch has been written the flashbacked data with the same + // `commit_ts`, So we need to skip it to ensure the following data will + // be flashbacked continuously. + // And some large key modifications will exceed the max txn size limit + // through the execution, the write will forcibly finish the batch of data. + // So it may happen that part of the keys in a batch may be flashbacked. + if commit_ts == flashback_commit_ts { + continue; + } + key_old_writes.push((key, old_write)); } - key_old_writes.push((key, old_write)); } - Ok((key_old_writes, has_remain_writes)) + Ok(key_old_writes) } -pub fn flashback_to_version( +// To flashback the `CF_LOCK`, we need to delete all locks records whose +// `start_ts` is greater than the specified version, and if it's not a +// short-value `LockType::Put`, we need to delete the actual data from +// `CF_DEFAULT` as well. +// TODO: `resolved_ts` should be taken into account. 
+pub fn flashback_to_version_lock( txn: &mut MvccTxn, reader: &mut SnapshotReader, - next_lock_key: &mut Option, - next_write_key: &mut Option, key_locks: Vec<(Key, Lock)>, - key_old_writes: Vec<(Key, Option)>, - start_ts: TimeStamp, - commit_ts: TimeStamp, -) -> TxnResult { - // To flashback the `CF_LOCK`, we need to delete all locks records whose - // `start_ts` is greater than the specified version, and if it's not a - // short-value `LockType::Put`, we need to delete the actual data from - // `CF_DEFAULT` as well. - // TODO: `resolved_ts` should be taken into account. +) -> TxnResult> { for (key, lock) in key_locks { if txn.write_size() >= MAX_TXN_WRITE_SIZE { - *next_lock_key = Some(key); - break; + return Ok(Some(key)); } // To guarantee rollback with start ts of the locks reader.start_ts = lock.ts; @@ -112,18 +117,37 @@ pub fn flashback_to_version( true, )?; } - // To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC - // record for each key in `self.keys` with its old value at `self.version`, - // specifically, the flashback will have the following behavior: - // - If a key doesn't exist at `self.version`, it will be put a - // `WriteType::Delete`. - // - If a key exists at `self.version`, it will be put the exact same record - // in `CF_WRITE` and `CF_DEFAULT` if needed with `self.commit_ts` and - // `self.start_ts`. + Ok(None) +} + +// To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC +// record for each key in `self.keys` with its old value at `self.version`, +// specifically, the flashback will have the following behavior: +// - If a key doesn't exist at `self.version`, it will be put a +// `WriteType::Delete`. +// - If a key exists at `self.version`, it will be put the exact same record +// in `CF_WRITE` and `CF_DEFAULT` if needed with `self.commit_ts` and +// `self.start_ts`. 
+pub fn flashback_to_version_write( + txn: &mut MvccTxn, + reader: &mut SnapshotReader, + key_old_writes: Vec<(Key, Option)>, + start_ts: TimeStamp, + commit_ts: TimeStamp, +) -> TxnResult> { for (key, old_write) in key_old_writes { + #[cfg(feature = "failpoints")] + { + let should_skip = || { + fail::fail_point!("flashback_skip_1_key_in_write", |_| true); + false + }; + if should_skip() { + continue; + } + } if txn.write_size() >= MAX_TXN_WRITE_SIZE { - *next_write_key = Some(key); - break; + return Ok(Some(key.clone())); } let new_write = if let Some(old_write) = old_write { // If it's not a short value and it's a `WriteType::Put`, we should put the old @@ -135,7 +159,11 @@ pub fn flashback_to_version( reader.load_data(&key, old_write.clone())?, ); } - Write::new(old_write.write_type, start_ts, old_write.short_value) + Write::new( + old_write.write_type, + start_ts, + old_write.short_value.clone(), + ) } else { // If the old write doesn't exist, we should put a `WriteType::Delete` record to // delete the current key when needed. @@ -148,7 +176,7 @@ pub fn flashback_to_version( }; txn.put_write(key.clone(), commit_ts, new_write.as_ref().to_bytes()); } - Ok(txn.modifies.len()) + Ok(None) } #[cfg(test)] @@ -179,43 +207,48 @@ pub mod tests { start_ts: impl Into, commit_ts: impl Into, ) -> usize { + let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); let mut statistics = Statistics::default(); + // Flashback the locks. 
let (key_locks, has_remain_locks) = - flashback_to_version_read_lock(&mut reader, &Some(key.clone()), &None, &mut statistics) + flashback_to_version_read_lock(&mut reader, key.clone(), &next_key, &mut statistics) .unwrap(); assert!(!has_remain_locks); - let (key_old_writes, has_remain_writes) = flashback_to_version_read_write( + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts, cm.clone()); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); + flashback_to_version_lock(&mut txn, &mut snap_reader, key_locks).unwrap(); + let mut rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + // Flashback the writes. + let key_old_writes = flashback_to_version_read_write( &mut reader, - 0, - &Some(key.clone()), - &None, + key, + &next_key, version, start_ts, commit_ts, &mut statistics, ) .unwrap(); - assert!(!has_remain_writes); - let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); - let mut reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - let rows = flashback_to_version( + let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); + flashback_to_version_write( &mut txn, - &mut reader, - &mut None, - &mut Some(key), - key_locks, + &mut snap_reader, key_old_writes, start_ts, commit_ts, ) .unwrap(); + rows += txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows } diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 9b198724e3b..f20fd957ed7 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -1,18 +1,22 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use txn_types::{Key, Lock, TimeStamp, Write}; +use std::mem; + +use txn_types::{Key, TimeStamp}; use crate::storage::{ kv::WriteData, lock_manager::LockManager, mvcc::{MvccTxn, SnapshotReader}, txn::{ + actions::flashback_to_version::{flashback_to_version_lock, flashback_to_version_write}, commands::{ - Command, CommandExt, FlashbackToVersionReadPhase, ReaderWithStats, ReleasedLocks, - ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, + Command, CommandExt, FlashbackToVersionReadPhase, FlashbackToVersionState, + ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, + WriteContext, WriteResult, }, - flashback_to_version, latch, Result, + latch, Result, }, ProcessResult, Snapshot, }; @@ -25,11 +29,9 @@ command! { start_ts: TimeStamp, commit_ts: TimeStamp, version: TimeStamp, - end_key: Option, - next_lock_key: Option, - next_write_key: Option, - key_locks: Vec<(Key, Lock)>, - key_old_writes: Vec<(Key, Option)>, + start_key: Key, + end_key: Key, + state: FlashbackToVersionState, } } @@ -39,24 +41,27 @@ impl CommandExt for FlashbackToVersion { request_type!(KvFlashbackToVersion); fn gen_lock(&self) -> latch::Lock { - latch::Lock::new( - self.key_locks - .iter() - .map(|(key, _)| key) - .chain(self.key_old_writes.iter().map(|(key, _)| key)), - ) + match &self.state { + FlashbackToVersionState::ScanLock { key_locks, .. } => { + latch::Lock::new(key_locks.iter().map(|(key, _)| key)) + } + FlashbackToVersionState::ScanWrite { key_old_writes, .. } => { + latch::Lock::new(key_old_writes.iter().map(|(key, _)| key)) + } + } } fn write_bytes(&self) -> usize { - self.key_locks - .iter() - .map(|(key, _)| key.as_encoded().len()) - .chain( - self.key_old_writes - .iter() - .map(|(key, _)| key.as_encoded().len()), - ) - .sum() + match &self.state { + FlashbackToVersionState::ScanLock { key_locks, .. 
} => key_locks + .iter() + .map(|(key, _)| key.as_encoded().len()) + .sum(), + FlashbackToVersionState::ScanWrite { key_old_writes, .. } => key_old_writes + .iter() + .map(|(key, _)| key.as_encoded().len()) + .sum(), + } } } @@ -67,42 +72,58 @@ impl WriteCommand for FlashbackToVersion { context.statistics, ); let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); - - let mut next_lock_key = self.next_lock_key.take(); - let mut next_write_key = self.next_write_key.take(); - let rows = flashback_to_version( - &mut txn, - &mut reader, - &mut next_lock_key, - &mut next_write_key, - self.key_locks, - self.key_old_writes, - self.start_ts, - self.commit_ts, - )?; + // The state must be `ScanLock` or `ScanWrite` here. + match self.state { + FlashbackToVersionState::ScanLock { + ref mut next_lock_key, + ref mut key_locks, + } => { + if let Some(new_next_lock_key) = + flashback_to_version_lock(&mut txn, &mut reader, mem::take(key_locks))? + { + *next_lock_key = new_next_lock_key; + } + } + FlashbackToVersionState::ScanWrite { + ref mut next_write_key, + ref mut key_old_writes, + } => { + if let Some(new_next_write_key) = flashback_to_version_write( + &mut txn, + &mut reader, + mem::take(key_old_writes), + self.start_ts, + self.commit_ts, + )? 
{ + *next_write_key = new_next_write_key; + } + } + } + let rows = txn.modifies.len(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.extra.for_flashback = true; Ok(WriteResult { ctx: self.ctx.clone(), to_be_write: write_data, rows, - pr: if next_lock_key.is_none() && next_write_key.is_none() { - ProcessResult::Res - } else { + pr: (move || { + fail_point!("flashback_failed_after_first_batch", |_| { + ProcessResult::Res + }); let next_cmd = FlashbackToVersionReadPhase { - ctx: self.ctx.clone(), + ctx: self.ctx, deadline: self.deadline, start_ts: self.start_ts, commit_ts: self.commit_ts, version: self.version, + start_key: self.start_key, end_key: self.end_key, - next_lock_key, - next_write_key, + state: self.state, }; ProcessResult::NextCommand { cmd: Command::FlashbackToVersionReadPhase(next_cmd), } - }, + })(), lock_info: None, released_locks: ReleasedLocks::new(), lock_guards: vec![], diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 47348c8e188..d74c6f8d708 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use txn_types::{Key, TimeStamp}; +use txn_types::{Key, Lock, TimeStamp, Write}; use crate::storage::{ mvcc::MvccReader, @@ -13,9 +13,43 @@ use crate::storage::{ sched_pool::tls_collect_keyread_histogram_vec, Error, ErrorInner, Result, }, - ScanMode, Snapshot, Statistics, + Context, ScanMode, Snapshot, Statistics, }; +#[derive(Debug)] +pub enum FlashbackToVersionState { + ScanLock { + next_lock_key: Key, + key_locks: Vec<(Key, Lock)>, + }, + ScanWrite { + next_write_key: Key, + key_old_writes: Vec<(Key, Option)>, + }, +} + +pub fn new_flashback_to_version_read_phase_cmd( + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ctx: Context, +) -> TypedCommand<()> { + FlashbackToVersionReadPhase::new( + start_ts, + commit_ts, + version, + start_key.clone(), + end_key, + FlashbackToVersionState::ScanLock { + next_lock_key: start_key, + key_locks: Vec::new(), + }, + ctx, + ) +} + command! { FlashbackToVersionReadPhase: cmd_ty => (), @@ -24,9 +58,9 @@ command! { start_ts: TimeStamp, commit_ts: TimeStamp, version: TimeStamp, - end_key: Option, - next_lock_key: Option, - next_write_key: Option, + start_key: Key, + end_key: Key, + state: FlashbackToVersionState, } } @@ -58,62 +92,90 @@ impl ReadCommand for FlashbackToVersionReadPhase { commit_ts: self.commit_ts, })); } + let tag = self.tag().get_str(); + let mut read_again = false; let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); - // Scan the locks. - let (key_locks, has_remain_locks) = flashback_to_version_read_lock( - &mut reader, - &self.next_lock_key, - &self.end_key, - statistics, - )?; - // Scan the writes. 
- let (mut key_old_writes, has_remain_writes) = flashback_to_version_read_write( - &mut reader, - key_locks.len(), - &self.next_write_key, - &self.end_key, - self.version, - self.start_ts, - self.commit_ts, - statistics, - )?; - tls_collect_keyread_histogram_vec( - self.tag().get_str(), - (key_locks.len() + key_old_writes.len()) as f64, - ); - - if key_locks.is_empty() && key_old_writes.is_empty() { - Ok(ProcessResult::Res) - } else { - let next_lock_key = if has_remain_locks { - key_locks.last().map(|(key, _)| key.clone()) - } else { - None - }; - let next_write_key = if has_remain_writes && !key_old_writes.is_empty() { - key_old_writes.pop().map(|(key, _)| key) - } else if has_remain_writes && key_old_writes.is_empty() { - // We haven't read any write yet, so we need to read the writes in the next - // batch later. - self.next_write_key + // Separate the lock and write flashback to prevent from putting two writes for + // the same key in a single batch to make the TiCDC panic. + let next_state = match self.state { + FlashbackToVersionState::ScanLock { next_lock_key, .. } => { + let (mut key_locks, has_remain_locks) = flashback_to_version_read_lock( + &mut reader, + next_lock_key, + &self.end_key, + statistics, + )?; + if key_locks.is_empty() && !has_remain_locks { + // No more locks to flashback, continue to scan the writes. + read_again = true; + FlashbackToVersionState::ScanWrite { + next_write_key: self.start_key.clone(), + key_old_writes: Vec::new(), + } + } else { + assert!(!key_locks.is_empty()); + tls_collect_keyread_histogram_vec(tag, key_locks.len() as f64); + FlashbackToVersionState::ScanLock { + // DO NOT pop the last key as the next key when it's the only key to prevent + // from making flashback fall into an dead loop. 
+ next_lock_key: if key_locks.len() > 1 { + key_locks.pop().map(|(key, _)| key).unwrap() + } else { + key_locks.last().map(|(key, _)| key.clone()).unwrap() + }, + key_locks, + } + } + } + FlashbackToVersionState::ScanWrite { next_write_key, .. } => { + let mut key_old_writes = flashback_to_version_read_write( + &mut reader, + next_write_key, + &self.end_key, + self.version, + self.start_ts, + self.commit_ts, + statistics, + )?; + if key_old_writes.is_empty() { + // No more writes to flashback, just return. + return Ok(ProcessResult::Res); + } + tls_collect_keyread_histogram_vec(tag, key_old_writes.len() as f64); + FlashbackToVersionState::ScanWrite { + next_write_key: if key_old_writes.len() > 1 { + key_old_writes.pop().map(|(key, _)| key).unwrap() + } else { + key_old_writes.last().map(|(key, _)| key.clone()).unwrap() + }, + key_old_writes, + } + } + }; + Ok(ProcessResult::NextCommand { + cmd: if read_again { + Command::FlashbackToVersionReadPhase(FlashbackToVersionReadPhase { + ctx: self.ctx, + deadline: self.deadline, + start_ts: self.start_ts, + commit_ts: self.commit_ts, + version: self.version, + start_key: self.start_key, + end_key: self.end_key, + state: next_state, + }) } else { - None - }; - let next_cmd = FlashbackToVersion { - ctx: self.ctx, - deadline: self.deadline, - start_ts: self.start_ts, - commit_ts: self.commit_ts, - version: self.version, - end_key: self.end_key, - key_locks, - key_old_writes, - next_lock_key, - next_write_key, - }; - Ok(ProcessResult::NextCommand { - cmd: Command::FlashbackToVersion(next_cmd), - }) - } + Command::FlashbackToVersion(FlashbackToVersion { + ctx: self.ctx, + deadline: self.deadline, + start_ts: self.start_ts, + commit_ts: self.commit_ts, + version: self.version, + start_key: self.start_key, + end_key: self.end_key, + state: next_state, + }) + }, + }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 4213eeb6b68..fc044a9fa78 100644 --- a/src/storage/txn/commands/mod.rs +++ 
b/src/storage/txn/commands/mod.rs @@ -40,7 +40,9 @@ pub use commit::Commit; pub use compare_and_swap::RawCompareAndSwap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; pub use flashback_to_version::FlashbackToVersion; -pub use flashback_to_version_read_phase::FlashbackToVersionReadPhase; +pub use flashback_to_version_read_phase::{ + new_flashback_to_version_read_phase_cmd, FlashbackToVersionReadPhase, FlashbackToVersionState, +}; use kvproto::kvrpcpb::*; pub use mvcc_by_key::MvccByKey; pub use mvcc_by_start_ts::MvccByStartTs; @@ -353,13 +355,12 @@ impl From for TypedCommand> { impl From for TypedCommand<()> { fn from(mut req: FlashbackToVersionRequest) -> Self { - FlashbackToVersionReadPhase::new( + new_flashback_to_version_read_phase_cmd( req.get_start_ts().into(), req.get_commit_ts().into(), req.get_version().into(), - Some(Key::from_raw(req.get_end_key())), - Some(Key::from_raw(req.get_start_key())), - Some(Key::from_raw(req.get_start_key())), + Key::from_raw(req.get_start_key()), + Key::from_raw(req.get_end_key()), req.take_context(), ) } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 1af3c9d63e6..615ab98cb8c 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -24,8 +24,8 @@ pub use self::{ cleanup::cleanup, commit::commit, flashback_to_version::{ - flashback_to_version, flashback_to_version_read_lock, flashback_to_version_read_write, - FLASHBACK_BATCH_SIZE, + flashback_to_version_lock, flashback_to_version_read_lock, + flashback_to_version_read_write, flashback_to_version_write, FLASHBACK_BATCH_SIZE, }, gc::gc, prewrite::{prewrite, CommitKind, TransactionKind, TransactionProperties}, diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index f3e3bda8a24..cfbe6ff504e 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + char::from_u32, path::Path, sync::*, thread, @@ -42,6 +43,7 @@ use tikv::{ gc_worker::sync_gc, service::{batch_commands_request, batch_commands_response}, }, + storage::txn::FLASHBACK_BATCH_SIZE, }; use tikv_util::{ config::ReadableSize, @@ -597,13 +599,123 @@ fn test_mvcc_resolve_lock_gc_and_delete() { assert!(del_resp.error.is_empty()); } +#[test] +#[cfg(feature = "failpoints")] +fn test_mvcc_flashback_failed_after_first_batch() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let mut ts = 0; + for i in 0..FLASHBACK_BATCH_SIZE * 2 { + // Meet the constraints of the alphabetical order for test + let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); + complete_data_commit(&client, &ctx, ts, k.clone(), b"value@0".to_vec()); + } + ts += 3; + let check_ts = ts; + for i in 0..FLASHBACK_BATCH_SIZE * 2 { + let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); + complete_data_commit(&client, &ctx, ts, k.clone(), b"value@1".to_vec()); + } + ts += 3; + // Flashback + fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); + fail::cfg("flashback_skip_1_key_in_write", "1*return").unwrap(); + must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); + fail::remove("flashback_skip_1_key_in_write"); + fail::remove("flashback_failed_after_first_batch"); + // skip for key@0 + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(0_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@1".to_vec(), + ts + 2, + ); + // The first batch of writes are flashbacked. + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(1_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts + 2, + ); + // Subsequent batches of writes are not flashbacked. 
+ must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32 - 1).unwrap()) + .as_bytes() + .to_vec(), + b"value@1".to_vec(), + ts + 2, + ); + // Flashback batch 2. + fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); + must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); + fail::remove("flashback_failed_after_first_batch"); + // key@0 must be flahsbacked in the second batch firstly. + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(0_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts + 2, + ); + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32 - 1).unwrap()) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts + 2, + ); + // 2 * (FLASHBACK_BATCH_SIZE - 1) - 1 keys are flashbacked. + must_kv_read_equal( + &client, + ctx.clone(), + format!( + "key@{}", + from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 3).unwrap() + ) + .as_bytes() + .to_vec(), + b"value@1".to_vec(), + ts + 2, + ); + // Flashback needs to be continued. + must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); + // Flashback again to check if any error occurs :) + must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); + ts += 2; + // Subsequent batches of writes are flashbacked. + must_kv_read_equal( + &client, + ctx, + format!( + "key@{}", + from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 3).unwrap() + ) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts, + ); +} + #[test] fn test_mvcc_flashback() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let mut ts = 0; - let k = b"key".to_vec(); - for i in 0..10 { + // Need to write many batches. 
+ for i in 0..2000 { let v = format!("value@{}", i).into_bytes(); + let k = format!("key@{}", i % 1000).into_bytes(); // Prewrite ts += 1; let prewrite_start_version = ts; @@ -634,6 +746,7 @@ fn test_mvcc_flashback() { must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), ts) } // Prewrite to leave a lock. + let k = b"key@1".to_vec(); ts += 1; let prewrite_start_version = ts; let mut mutation = Mutation::default(); @@ -651,19 +764,17 @@ fn test_mvcc_flashback() { let get_version = ts; let mut get_req = GetRequest::default(); get_req.set_context(ctx.clone()); - get_req.key = k.clone(); + get_req.key = k; get_req.version = get_version; let get_resp = client.kv_get(&get_req).unwrap(); assert!(!get_resp.has_region_error()); assert!(get_resp.get_error().has_locked()); assert!(get_resp.value.is_empty()); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 5, ts + 1, ts + 2); + must_flashback_to_version(&client, ctx.clone(), 5, ts + 1, ts + 2); ts += 2; - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); // Should not meet the lock and can not get the latest data any more. - must_kv_read_equal(&client, ctx, k, b"value@1".to_vec(), ts); + must_kv_read_equal(&client, ctx, b"key@1".to_vec(), b"value@1".to_vec(), ts); } #[test] @@ -672,9 +783,7 @@ fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); + must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); // Try to read. 
let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get @@ -712,9 +821,7 @@ fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx, 0, 1, 2); - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); + must_flashback_to_version(&client, ctx, 0, 1, 2); // Try to transfer leader. let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); assert!( From 16d1e2a2c3e84634b7da13f3ee640ca6e5c08adc Mon Sep 17 00:00:00 2001 From: Yasuo Honda Date: Fri, 4 Nov 2022 12:42:01 +0900 Subject: [PATCH 313/676] *: Fix link to TiDB Release Notes (#13718) close tikv/tikv#13720 Signed-off-by: Yasuo Honda --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb19c34a583..26fd52f2bd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # TiKV Change Log All notable changes to this project are documented in this file. -See also [TiDB Changelog](https://github.com/pingcap/tidb/blob/master/CHANGELOG.md) and [PD Changelog](https://github.com/pingcap/pd/blob/master/CHANGELOG.md). +See also [TiDB Release Notes](https://github.com/pingcap/docs/blob/master/releases/release-notes.md) and [PD Changelog](https://github.com/pingcap/pd/blob/master/CHANGELOG.md). 
## [5.3.0] - 2021-11-29 From 64fe6ce808ba1d3847ed02591e9dea3e022276db Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 4 Nov 2022 17:24:01 +0800 Subject: [PATCH 314/676] raftstore-v2: partially support tablet split (#13689) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- components/raftstore-v2/src/fsm/apply.rs | 4 +- components/raftstore-v2/src/lib.rs | 1 + .../src/operation/command/admin/mod.rs | 13 +- .../src/operation/command/admin/split.rs | 623 ++++++++++++++++++ .../raftstore-v2/src/operation/command/mod.rs | 29 +- .../src/operation/ready/snapshot.rs | 1 + components/raftstore-v2/src/raft/apply.rs | 14 +- components/raftstore-v2/src/raft/storage.rs | 1 + components/raftstore/src/store/fsm/apply.rs | 148 ++++- components/raftstore/src/store/util.rs | 22 +- 10 files changed, 813 insertions(+), 43 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/admin/split.rs diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index c4eb03f350d..b8faf589760 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -11,7 +11,7 @@ use std::{ use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::KvEngine; +use engine_traits::{KvEngine, TabletFactory}; use futures::{Future, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; @@ -65,6 +65,7 @@ impl ApplyFsm { region_state: RegionLocalState, res_reporter: R, remote_tablet: CachedTablet, + tablet_factory: Arc>, read_scheduler: Scheduler>, logger: Logger, ) -> (ApplyScheduler, Self) { @@ -74,6 +75,7 @@ impl ApplyFsm { region_state, res_reporter, remote_tablet, + tablet_factory, read_scheduler, logger, ); diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 0c1a460298d..2f30ee9873d 100644 --- 
a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -23,6 +23,7 @@ #![allow(unused)] #![feature(let_else)] +#![feature(array_windows)] mod batch; mod bootstrap; diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 396e3ede98f..afaefeb9b7e 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -1,10 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. mod conf_change; +mod split; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - raft_cmdpb::{AdminRequest, RaftCmdRequest}, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}, raft_serverpb::PeerState, }; use protobuf::Message; @@ -19,6 +20,7 @@ use raftstore::{ Result, }; use slog::info; +pub use split::SplitResult; use tikv_util::box_err; use self::conf_change::ConfChangeResult; @@ -30,6 +32,7 @@ use crate::{ #[derive(Debug)] pub enum AdminCmdResult { + SplitRegion(SplitResult), ConfChange(ConfChangeResult), } @@ -72,7 +75,13 @@ impl Peer { self.propose_conf_change(ctx, req) } else { // propose other admin command. - unimplemented!() + match cmd_type { + AdminCmdType::Split => Err(box_err!( + "Split is deprecated. Please use BatchSplit instead." + )), + AdminCmdType::BatchSplit => self.propose_split(ctx, req), + _ => unimplemented!(), + } }; if let Err(e) = &res { info!( diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs new file mode 100644 index 00000000000..c0d8998c4ad --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -0,0 +1,623 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains batch split related processing logic. +//! +//! Process Overview +//! +//! Propose: +//! 
- Nothing special except for validating batch split requests (ex: split keys +//! are in ascending order). +//! +//! Execution: +//! - exec_batch_split: Create and initialize metapb::region for split regions +//! and derived regions. Then, create checkpoints of the current talbet for +//! split regions and derived region to make tablet physical isolated. Update +//! the parent region's region state without persistency. Send the new regions +//! (including derived region) back to raftstore. +//! +//! Result apply: +//! - todo +//! +//! Split peer creation and initlization: +//! - todo +//! +//! Split finish: +//! - todo + +use std::collections::VecDeque; + +use engine_traits::{ + Checkpointer, KvEngine, OpenOptions, RaftEngine, TabletFactory, CF_DEFAULT, SPLIT_PREFIX, +}; +use kvproto::{ + metapb::Region, + raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, + raft_serverpb::RegionLocalState, +}; +use protobuf::Message; +use raftstore::{ + coprocessor::split_observer::{is_valid_split_key, strip_timestamp_if_exists}, + store::{ + fsm::apply::validate_batch_split, + metrics::PEER_ADMIN_CMD_COUNTER, + util::{self, KeysInfoFormatter}, + PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, + }, + Result, +}; +use slog::{info, warn, Logger}; +use tikv_util::box_err; + +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::AdminCmdResult, + raft::{Apply, Peer}, + router::ApplyRes, +}; + +#[derive(Debug)] +pub struct SplitResult { + pub regions: Vec, + // The index of the derived region in `regions` + pub derived_index: usize, + pub tablet_index: u64, +} + +impl Peer { + pub fn propose_split( + &mut self, + store_ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ) -> Result { + validate_batch_split(req.mut_admin_request(), self.region())?; + let mut proposal_ctx = ProposalContext::empty(); + proposal_ctx.insert(ProposalContext::SYNC_LOG); + proposal_ctx.insert(ProposalContext::SPLIT); + + let data = req.write_to_bytes().unwrap(); + 
self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + } +} + +impl Apply { + pub fn apply_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + info!( + self.logger, + "split is deprecated, redirect to use batch split"; + ); + let split = req.get_split().to_owned(); + let mut admin_req = AdminRequest::default(); + admin_req + .mut_splits() + .set_right_derive(split.get_right_derive()); + admin_req.mut_splits().mut_requests().push(split); + // This method is executed only when there are unapplied entries after being + // restarted. So there will be no callback, it's OK to return a response + // that does not matched with its request. + self.apply_batch_split(req, log_index) + } + + pub fn apply_batch_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); + + let region = self.region_state().get_region(); + let region_id = region.get_id(); + validate_batch_split(req, self.region_state().get_region())?; + + let mut boundaries: Vec<&[u8]> = Vec::default(); + boundaries.push(self.region_state().get_region().get_start_key()); + for req in req.get_splits().get_requests() { + boundaries.push(req.get_split_key()); + } + boundaries.push(self.region_state().get_region().get_end_key()); + + info!( + self.logger, + "split region"; + "region" => ?region, + "boundaries" => %KeysInfoFormatter(boundaries.iter()), + ); + + let split_reqs = req.get_splits(); + let new_region_cnt = split_reqs.get_requests().len(); + let new_version = region.get_region_epoch().get_version() + new_region_cnt as u64; + + let mut derived_req = SplitRequest::default(); + derived_req.new_region_id = region.id; + let derived_req = &[derived_req]; + + let right_derive = split_reqs.get_right_derive(); + let reqs = if right_derive { + split_reqs.get_requests().iter().chain(derived_req) + } else { + 
derived_req.iter().chain(split_reqs.get_requests()) + }; + + let regions: Vec<_> = boundaries + .array_windows::<2>() + .zip(reqs) + .map(|([start_key, end_key], req)| { + let mut new_region = Region::default(); + new_region.set_id(req.get_new_region_id()); + new_region.set_region_epoch(region.get_region_epoch().to_owned()); + new_region.mut_region_epoch().set_version(new_version); + new_region.set_start_key(start_key.to_vec()); + new_region.set_end_key(end_key.to_vec()); + new_region.set_peers(region.get_peers().to_vec().into()); + // If the `req` is the `derived_req`, the peers are already set correctly and + // the following loop will not be executed due to the empty `new_peer_ids` in + // the `derived_req` + for (peer, peer_id) in new_region + .mut_peers() + .iter_mut() + .zip(req.get_new_peer_ids()) + { + peer.set_id(*peer_id); + } + new_region + }) + .collect(); + + let derived_index = if right_derive { regions.len() - 1 } else { 0 }; + + // We will create checkpoint of the current tablet for both derived region and + // split regions. Before the creation, we should flush the writes and remove the + // write batch + self.flush(); + + // todo(SpadeA): Here: we use a temporary solution that we use checkpoint API to + // clone new tablets. It may cause large jitter as we need to flush the + // memtable. And more what is more important is that after removing WAL, the API + // will never flush. + // We will freeze the memtable rather than flush it in the following PR. 
+ let tablet = self.tablet().clone(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint object: {:?}", + self.logger.list(), + e + ) + }); + + for new_region in ®ions { + let new_region_id = new_region.id; + if new_region_id == region_id { + continue; + } + + let split_temp_path = self.tablet_factory().tablet_path_with_prefix( + SPLIT_PREFIX, + new_region_id, + RAFT_INIT_LOG_INDEX, + ); + checkpointer + .create_at(&split_temp_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + split_temp_path, + e + ) + }); + } + + let derived_path = self.tablet_factory().tablet_path(region_id, log_index); + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + derived_path, + e + ) + }); + let tablet = self + .tablet_factory() + .open_tablet(region_id, Some(log_index), OpenOptions::default()) + .unwrap(); + // Remove the old write batch. 
+ self.write_batch_mut().take(); + self.publish_tablet(tablet); + + self.region_state_mut() + .set_region(regions[derived_index].clone()); + self.region_state_mut().set_tablet_index(log_index); + + let mut resp = AdminResponse::default(); + resp.mut_splits().set_regions(regions.clone().into()); + PEER_ADMIN_CMD_COUNTER.batch_split.success.inc(); + + Ok(( + resp, + AdminCmdResult::SplitRegion(SplitResult { + regions, + derived_index, + tablet_index: log_index, + }), + )) + } +} + +#[cfg(test)] +mod test { + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }; + + use collections::HashMap; + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::TestTabletFactoryV2, + raft, + }; + use engine_traits::{CfOptionsExt, Peekable, WriteBatch, ALL_CFS}; + use futures::channel::mpsc::unbounded; + use kvproto::{ + metapb::RegionEpoch, + raft_cmdpb::{AdminCmdType, BatchSplitRequest, PutRequest, RaftCmdResponse, SplitRequest}, + raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}, + }; + use raftstore::store::{cmd_resp::new_error, Config, ReadRunner}; + use slog::o; + use tempfile::TempDir; + use tikv_util::{ + codec::bytes::encode_bytes, + config::VersionTrack, + store::{new_learner_peer, new_peer}, + worker::{dummy_future_scheduler, dummy_scheduler, FutureScheduler, Scheduler, Worker}, + }; + + use super::*; + use crate::{ + fsm::{ApplyFsm, ApplyResReporter}, + raft::Apply, + tablet::CachedTablet, + }; + + struct MockReporter { + sender: Sender, + } + + impl MockReporter { + fn new() -> (Self, Receiver) { + let (tx, rx) = channel(); + (MockReporter { sender: tx }, rx) + } + } + + impl ApplyResReporter for MockReporter { + fn report(&self, apply_res: ApplyRes) { + let _ = self.sender.send(apply_res); + } + } + + fn new_split_req(key: &[u8], id: u64, children: Vec) -> SplitRequest { + let mut req = SplitRequest::default(); + req.set_split_key(key.to_vec()); + req.set_new_region_id(id); + req.set_new_peer_ids(children); + req + } + + fn assert_split( + 
apply: &mut Apply, + factory: &Arc, + parent_id: u64, + right_derived: bool, + new_region_ids: Vec, + split_keys: Vec>, + children_peers: Vec>, + log_index: u64, + region_boundries: Vec<(Vec, Vec)>, + expected_region_epoch: RegionEpoch, + expected_derived_index: usize, + ) { + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(right_derived); + + for ((new_region_id, children), split_key) in new_region_ids + .into_iter() + .zip(children_peers.clone()) + .zip(split_keys) + { + splits + .mut_requests() + .push(new_split_req(&split_key, new_region_id, children)); + } + + let mut req = AdminRequest::default(); + req.set_splits(splits); + + // Exec batch split + let (resp, apply_res) = apply.apply_batch_split(&req, log_index).unwrap(); + + let regions = resp.get_splits().get_regions(); + assert!(regions.len() == region_boundries.len()); + + let mut child_idx = 0; + for (i, region) in regions.iter().enumerate() { + assert_eq!(region.get_start_key().to_vec(), region_boundries[i].0); + assert_eq!(region.get_end_key().to_vec(), region_boundries[i].1); + assert_eq!(*region.get_region_epoch(), expected_region_epoch); + + if region.id == parent_id { + let state = apply.region_state(); + assert_eq!(state.tablet_index, log_index); + assert_eq!(state.get_region(), region); + let tablet_path = factory.tablet_path(region.id, log_index); + assert!(factory.exists_raw(&tablet_path)); + + match apply_res { + AdminCmdResult::SplitRegion(SplitResult { + derived_index, + tablet_index, + .. + }) => { + assert_eq!(expected_derived_index, derived_index); + assert_eq!(tablet_index, log_index); + } + _ => panic!(), + } + } else { + assert_eq! 
{ + region.get_peers().iter().map(|peer| peer.id).collect::>(), + children_peers[child_idx] + } + child_idx += 1; + + let tablet_path = + factory.tablet_path_with_prefix(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); + assert!(factory.exists_raw(&tablet_path)); + } + } + } + + #[test] + fn test_split() { + let store_id = 2; + + let mut region = Region::default(); + region.set_id(1); + region.set_end_key(b"k10".to_vec()); + region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + region.set_peers(peers.into()); + + let logger = slog_global::borrow_global().new(o!()); + let path = TempDir::new().unwrap(); + let cf_opts = ALL_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path(), + DbOptions::default(), + cf_opts, + )); + + let tablet = factory + .open_tablet( + region.id, + Some(5), + OpenOptions::default().set_create_new(true), + ) + .unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region.clone()); + region_state.set_tablet_index(5); + + let (read_scheduler, rx) = dummy_scheduler(); + let (reporter, _) = MockReporter::new(); + let mut apply = Apply::new( + region + .get_peers() + .iter() + .find(|p| p.store_id == store_id) + .unwrap() + .clone(), + region_state, + reporter, + CachedTablet::new(Some(tablet)), + factory.clone(), + read_scheduler, + logger.clone(), + ); + + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(true); + splits.mut_requests().push(new_split_req(b"k1", 1, vec![])); + let mut req = AdminRequest::default(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // 3 followers are required. 
+ assert!(err.to_string().contains("invalid new peer id count")); + + splits.mut_requests().clear(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // Empty requests should be rejected. + assert!(err.to_string().contains("missing split requests")); + + splits + .mut_requests() + .push(new_split_req(b"k11", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let resp = new_error(apply.apply_batch_split(&req, 0).unwrap_err()); + // Out of range keys should be rejected. + assert!( + resp.get_header().get_error().has_key_not_in_region(), + "{:?}", + resp + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // Empty key will not in any region exclusively. + assert!(err.to_string().contains("missing split key"), "{:?}", err); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // keys should be in ascend order. + assert!( + err.to_string().contains("invalid split request"), + "{:?}", + err + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // All requests should be checked. 
+ assert!(err.to_string().contains("id count"), "{:?}", err); + + let cases = vec![ + // region 1["", "k10"] + // After split: region 1 ["", "k09"], + // region 10 ["k09", "k10"] + ( + 1, + false, + vec![10], + vec![b"k09".to_vec()], + vec![vec![11, 12, 13]], + 10, + vec![ + (b"".to_vec(), b"k09".to_vec()), + (b"k09".to_vec(), b"k10".to_vec()), + ], + 4, + 0, + ), + // region 1 ["", "k09"] + // After split: region 20 ["", "k01"], + // region 1 ["k01", "k09"] + ( + 1, + true, + vec![20], + vec![b"k01".to_vec()], + vec![vec![21, 22, 23]], + 20, + vec![ + (b"".to_vec(), b"k01".to_vec()), + (b"k01".to_vec(), b"k09".to_vec()), + ], + 5, + 1, + ), + // region 1 ["k01", "k09"] + // After split: region 30 ["k01", "k02"], + // region 40 ["k02", "k03"], + // region 1 ["k03", "k09"] + ( + 1, + true, + vec![30, 40], + vec![b"k02".to_vec(), b"k03".to_vec()], + vec![vec![31, 32, 33], vec![41, 42, 43]], + 30, + vec![ + (b"k01".to_vec(), b"k02".to_vec()), + (b"k02".to_vec(), b"k03".to_vec()), + (b"k03".to_vec(), b"k09".to_vec()), + ], + 7, + 2, + ), + // region 1 ["k03", "k09"] + // After split: region 1 ["k03", "k07"], + // region 50 ["k07", "k08"], + // region 60 ["k08", "k09"] + ( + 1, + false, + vec![50, 60], + vec![b"k07".to_vec(), b"k08".to_vec()], + vec![vec![51, 52, 53], vec![61, 62, 63]], + 40, + vec![ + (b"k03".to_vec(), b"k07".to_vec()), + (b"k07".to_vec(), b"k08".to_vec()), + (b"k08".to_vec(), b"k09".to_vec()), + ], + 9, + 0, + ), + ]; + + for ( + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + version, + expected_derived_index, + ) in cases + { + let mut expected_epoch = RegionEpoch::new(); + expected_epoch.set_version(version); + + assert_split( + &mut apply, + &factory, + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + expected_epoch, + expected_derived_index, + ); + } + + // Split will create checkpoint tablet, so if there are some writes 
before + // split, they should be flushed immediately. + apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); + assert!(!apply.write_batch_mut().as_ref().unwrap().is_empty()); + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k05", 70, vec![71, 72, 73])); + req.set_splits(splits); + apply.apply_batch_split(&req, 50).unwrap(); + assert!(apply.write_batch_mut().is_none()); + assert_eq!(apply.tablet().get_value(b"k04").unwrap().unwrap(), b"v4"); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 21122e5559f..d39788ac611 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -127,6 +127,7 @@ impl Peer { region_state, mailbox, tablet, + store_ctx.tablet_factory.clone(), read_scheduler, logger, ); @@ -182,17 +183,31 @@ impl Peer { } #[inline] - fn propose(&mut self, ctx: &mut StoreContext, data: Vec) -> Result { - ctx.raft_metrics.propose.normal.inc(); + fn propose( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + ) -> Result { + self.propose_with_ctx(store_ctx, data, vec![]) + } + + #[inline] + fn propose_with_ctx( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + proposal_ctx: Vec, + ) -> Result { + store_ctx.raft_metrics.propose.normal.inc(); PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); - if data.len() as u64 > ctx.cfg.raft_entry_max_size.0 { + if data.len() as u64 > store_ctx.cfg.raft_entry_max_size.0 { return Err(Error::RaftEntryTooLarge { region_id: self.region_id(), entry_size: data.len() as u64, }); } let last_index = self.raft_group().raft.raft_log.last_index(); - self.raft_group_mut().propose(vec![], data)?; + self.raft_group_mut().propose(proposal_ctx, data)?; if self.raft_group().raft.raft_log.last_index() == last_index { // The message is dropped silently, this usually due to leader absence // or transferring leader. 
Both cases can be considered as NotLeader error. @@ -269,6 +284,7 @@ impl Peer { AdminCmdResult::ConfChange(conf_change) => { self.on_apply_res_conf_change(conf_change) } + AdminCmdResult::SplitRegion(_) => unimplemented!(), } } self.raft_group_mut() @@ -405,8 +421,8 @@ impl Apply { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { AdminCmdType::CompactLog => unimplemented!(), - AdminCmdType::Split => unimplemented!(), - AdminCmdType::BatchSplit => unimplemented!(), + AdminCmdType::Split => self.apply_split(admin_req, entry.index)?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, entry.index)?, AdminCmdType::PrepareMerge => unimplemented!(), AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), @@ -425,6 +441,7 @@ impl Apply { return Err(box_err!("invalid admin command type")); } }; + self.push_admin_result(admin_result); let mut resp = new_response(req.get_header()); resp.set_admin_response(admin_resp); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 6f4b63630a9..e0f4e5653de 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -1,4 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + //! This module contains snapshot relative processing logic. //! //! # Snapshot State diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index ff29b3ba029..06101da8d83 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::mem; +use std::{mem, sync::Arc}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, TabletFactory}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; @@ -19,10 +19,13 @@ use crate::{ /// Apply applies all the committed commands to kv db. pub struct Apply { peer: metapb::Peer, + /// publish the update of the tablet remote_tablet: CachedTablet, tablet: EK, write_batch: Option, + tablet_factory: Arc>, + callbacks: Vec<(Vec, RaftCmdResponse)>, /// A flag indicates whether the peer is destroyed by applying admin @@ -46,6 +49,7 @@ impl Apply { region_state: RegionLocalState, res_reporter: R, mut remote_tablet: CachedTablet, + tablet_factory: Arc>, read_scheduler: Scheduler>, logger: Logger, ) -> Self { @@ -60,12 +64,18 @@ impl Apply { applied_term: 0, admin_cmd_result: vec![], region_state, + tablet_factory, read_scheduler, res_reporter, logger, } } + #[inline] + pub fn tablet_factory(&self) -> &Arc> { + &self.tablet_factory + } + #[inline] pub fn res_reporter(&self) -> &R { &self.res_reporter diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 19a52d4c5a2..aa642f5967f 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -485,6 +485,7 @@ mod tests { RegionLocalState::default(), router, CachedTablet::new(Some(tablet)), + factory, sched, logger, ); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 6fce91114a7..f5702092622 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -38,8 +38,8 @@ use kvproto::{ kvrpcpb::ExtraOp as TxnExtraOp, metapb::{PeerRole, Region, RegionEpoch}, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, BatchSplitRequest, ChangePeerRequest, CmdType, - 
CommitMergeRequest, RaftCmdRequest, RaftCmdResponse, Request, + AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; @@ -1904,40 +1904,37 @@ mod confchange_cmd_metric { } } -// Validate the request and the split keys -pub fn extract_split_keys( - split_reqs: &BatchSplitRequest, - region_to_split: &Region, -) -> Result>> { - if split_reqs.get_requests().is_empty() { +pub fn validate_batch_split(req: &AdminRequest, region: &Region) -> Result<()> { + if req.get_splits().get_requests().is_empty() { return Err(box_err!("missing split requests")); } - let mut keys: VecDeque> = VecDeque::with_capacity(split_reqs.get_requests().len() + 1); - for req in split_reqs.get_requests() { + + let split_reqs: &[SplitRequest] = req.get_splits().get_requests(); + let mut last_key = region.get_start_key(); + for req in split_reqs { let split_key = req.get_split_key(); if split_key.is_empty() { return Err(box_err!("missing split key")); } - if split_key - <= keys - .back() - .map_or_else(|| region_to_split.get_start_key(), Vec::as_slice) - { + + if split_key <= last_key { return Err(box_err!("invalid split request: {:?}", split_reqs)); } - if req.get_new_peer_ids().len() != region_to_split.get_peers().len() { + + if req.get_new_peer_ids().len() != region.get_peers().len() { return Err(box_err!( "invalid new peer id count, need {:?}, but got {:?}", - region_to_split.get_peers(), + region.get_peers(), req.get_new_peer_ids() )); } - keys.push_back(split_key.to_vec()); + + last_key = req.get_split_key(); } - util::check_key_in_region_exclusive(keys.back().unwrap(), region_to_split)?; + util::check_key_in_region_exclusive(last_key, region)?; - Ok(keys) + Ok(()) } // Admin commands related. 
@@ -2408,9 +2405,15 @@ where PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); - let split_reqs = req.get_splits(); - let mut keys = extract_split_keys(split_reqs, &self.region)?; let mut derived = self.region.clone(); + validate_batch_split(req, &derived)?; + + let split_reqs = req.get_splits(); + let mut keys: VecDeque<_> = split_reqs + .get_requests() + .iter() + .map(|req| req.get_split_key().to_vec()) + .collect(); info!( "split region"; @@ -6560,12 +6563,13 @@ mod tests { resp ); + splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"", 8, vec![9, 10, 11])); let resp = exec_split(&router, splits.clone()); - // Empty key should be rejected. - assert!(error_msg(&resp).contains("missing"), "{:?}", resp); + // Empty key will not in any region exclusively. + assert!(error_msg(&resp).contains("missing split key"), "{:?}", resp); splits.mut_requests().clear(); splits @@ -6784,4 +6788,100 @@ mod tests { rx.recv_timeout(Duration::from_millis(500)).unwrap(); system.shutdown(); } + + fn new_batch_split_request(keys: Vec>) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + for key in keys { + let mut split_req = SplitRequest::default(); + split_req.set_split_key(key); + split_req.set_new_peer_ids(vec![1]); + req.mut_splits().mut_requests().push(split_req); + } + req + } + + #[test] + fn test_validate_batch_split() { + let mut region = Region::default(); + region.set_start_key(b"k05".to_vec()); + region.set_end_key(b"k10".to_vec()); + region.set_peers(vec![new_peer(1, 2)].into()); + + let missing_error = "missing split requests"; + let invalid_error = "invalid split request"; + let not_in_region_error = "not in region"; + let empty_error = "missing split key"; + let peer_id_error = "invalid new peer id count"; + + // case: split is deprecated + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::Split); + let mut split_req = SplitRequest::default(); + 
split_req.set_split_key(b"k06".to_vec()); + req.set_split(split_req); + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(missing_error) + ); + + // case: missing peer ids + let mut req = new_batch_split_request(vec![b"k07".to_vec()]); + req.mut_splits() + .mut_requests() + .get_mut(0) + .unwrap() + .new_peer_ids + .clear(); + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(peer_id_error) + ); + + let fail_cases = vec![ + // case: default admin request should be rejected + (vec![], missing_error), + // case: empty split key + (vec![vec![]], empty_error), + // case: out of order split keys + ( + vec![b"k07".to_vec(), b"k08".to_vec(), b"k06".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + vec![b"k04".to_vec(), b"k07".to_vec(), b"k08".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + vec![b"k06".to_vec(), b"k07".to_vec(), b"k11".to_vec()], + not_in_region_error, + ), + // case: duplicated split keys + (vec![b"k06".to_vec(), b"k06".to_vec()], invalid_error), + ]; + + for (split_keys, fail_str) in fail_cases { + let req = if split_keys.is_empty() { + AdminRequest::default() + } else { + new_batch_split_request(split_keys) + }; + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(fail_str) + ); + } + + // case: pass the validation + let req = new_batch_split_request(vec![b"k06".to_vec(), b"k07".to_vec(), b"k08".to_vec()]); + validate_batch_split(&req, ®ion).unwrap(); + } } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 1e571296e1a..2980f9931a5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -742,29 +742,35 @@ pub fn conf_state_from_region(region: &metapb::Region) -> ConfState { pub struct KeysInfoFormatter< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a 
+ AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, >(pub I); impl< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a + AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, -> fmt::Display for KeysInfoFormatter<'a, I> +> fmt::Display for KeysInfoFormatter<'a, T, I> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut it = self.0.clone(); match it.len() { 0 => write!(f, "(no key)"), - 1 => write!(f, "key {}", log_wrappers::Value::key(it.next().unwrap())), + 1 => write!( + f, + "key {}", + log_wrappers::Value::key(it.next().unwrap().as_ref()) + ), _ => write!( f, "{} keys range from {} to {}", it.len(), - log_wrappers::Value::key(it.next().unwrap()), - log_wrappers::Value::key(it.next_back().unwrap()) + log_wrappers::Value::key(it.next().unwrap().as_ref()), + log_wrappers::Value::key(it.next_back().unwrap().as_ref()) ), } } From dff77165fb3f906ac74891fdf0cd1333ac02bf9c Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Mon, 7 Nov 2022 12:11:49 +0800 Subject: [PATCH 315/676] cmd: fix raft engine ctl (#13108) ref tikv/tikv#11119 None Signed-off-by: tabokie --- cmd/tikv-ctl/src/cmd.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 2fec7ea9cef..eed2d7e8283 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -558,7 +558,11 @@ pub enum Cmd { version: u64, }, /// Control for Raft Engine - RaftEngineCtl { args: Vec }, + /// Usage: tikv-ctl raft-engine-ctl -- --help + RaftEngineCtl { + #[structopt(last = true)] + args: Vec, + }, #[structopt(external_subcommand)] External(Vec), } From 9e2305c2355d73e65fe9d5aa697b2454168c8f10 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 7 Nov 2022 15:09:50 +0800 Subject: [PATCH 316/676] storage: fix the flashback dead loop caused by a deleted key (#13744) close tikv/tikv#13743 The original thought is that when a key's 
latest MVCC record type is DELETE and the corresponding flashback operation is also DELETE, we skip it to avoid duplicated writing. However, this will cause the flashback to fall into a dead loop since the key doesn't have the written record with the flashback `commit_ts` and the flashback will always try to write it forever. Signed-off-by: JmPotato --- src/storage/mod.rs | 81 +++++++++++++++++++ .../txn/actions/flashback_to_version.rs | 14 +--- .../flashback_to_version_read_phase.rs | 2 +- 3 files changed, 86 insertions(+), 11 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 1c2688dd8a8..8b835bcfafd 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4977,6 +4977,87 @@ mod tests { } } + #[test] + fn test_flashback_to_version_deleted_key() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let (tx, rx) = channel(); + let mut ts = TimeStamp::zero(); + let (k, v) = (Key::from_raw(b"k"), b"v".to_vec()); + // Write a key. + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(k.clone(), v.clone())], + k.as_encoded().to_vec(), + *ts.incr(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new(vec![k.clone()], ts, *ts.incr(), Context::default()), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + v, + block_on(storage.get(Context::default(), k.clone(), ts)) + .unwrap() + .0, + ); + // Delete the key. 
+ storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_delete(k.clone())], + k.as_encoded().to_vec(), + *ts.incr(), + ), + expect_ok_callback(tx.clone(), 2), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new(vec![k.clone()], ts, *ts.incr(), Context::default()), + expect_value_callback(tx.clone(), 3, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k"), ts)) + .unwrap() + .0, + ); + // Flashback the key. + let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + storage + .sched_txn_command( + new_flashback_to_version_read_phase_cmd( + flashback_start_ts, + flashback_commit_ts, + 1.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + Context::default(), + ), + expect_ok_callback(tx, 4), + ) + .unwrap(); + rx.recv().unwrap(); + expect_none( + block_on(storage.get(Context::default(), k, flashback_commit_ts)) + .unwrap() + .0, + ); + } + #[test] fn test_high_priority_get_put() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 5a86a6caa7d..02095d4b46d 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -126,8 +126,7 @@ pub fn flashback_to_version_lock( // - If a key doesn't exist at `self.version`, it will be put a // `WriteType::Delete`. // - If a key exists at `self.version`, it will be put the exact same record -// in `CF_WRITE` and `CF_DEFAULT` if needed with `self.commit_ts` and -// `self.start_ts`. +// in `CF_WRITE` and `CF_DEFAULT` with `self.commit_ts` and `self.start_ts`. 
pub fn flashback_to_version_write( txn: &mut MvccTxn, reader: &mut SnapshotReader, @@ -167,11 +166,6 @@ pub fn flashback_to_version_write( } else { // If the old write doesn't exist, we should put a `WriteType::Delete` record to // delete the current key when needed. - if let Some((_, latest_write)) = reader.seek_write(&key, commit_ts)? { - if latest_write.write_type == WriteType::Delete { - continue; - } - } Write::new(WriteType::Delete, start_ts, None) }; txn.put_write(key.clone(), commit_ts, new_write.as_ref().to_bytes()); @@ -334,11 +328,11 @@ pub mod tests { must_get(&mut engine, k, ts, v); must_prewrite_delete(&mut engine, k, k, *ts.incr()); must_commit(&mut engine, k, ts, *ts.incr()); - // Since the key has been deleted, flashback to version 1 should not do - // anything. + // Though the key has been deleted, flashback to version 1 still needs to write + // a new `WriteType::Delete` with the flashback `commit_ts`. assert_eq!( must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), - 0 + 1 ); must_get_none(&mut engine, k, ts); } diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index d74c6f8d708..cfc6856da9c 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -117,7 +117,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { tls_collect_keyread_histogram_vec(tag, key_locks.len() as f64); FlashbackToVersionState::ScanLock { // DO NOT pop the last key as the next key when it's the only key to prevent - // from making flashback fall into an dead loop. + // from making flashback fall into a dead loop. 
next_lock_key: if key_locks.len() > 1 { key_locks.pop().map(|(key, _)| key).unwrap() } else { From 9d6332398427cc5563731be7810454b2f20c2fd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Eeden?= Date: Mon, 7 Nov 2022 09:31:50 +0100 Subject: [PATCH 317/676] tidb_query_datatype: Fix Geometry FieldType support (#13652) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#13651 Signed-off-by: Daniël van Eeden --- components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs index 79c08ec5404..8d0e34dfdf7 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs @@ -73,6 +73,7 @@ pub trait V1CompatibleEncoder: DatumFlagAndPayloadEncoder { FieldTypeTp::VarChar | FieldTypeTp::VarString | FieldTypeTp::String + | FieldTypeTp::Geometry | FieldTypeTp::TinyBlob | FieldTypeTp::MediumBlob | FieldTypeTp::LongBlob From 7598dd26168b317f50ddc528600b7d85e97f9011 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 8 Nov 2022 11:43:50 +0800 Subject: [PATCH 318/676] storage: calculate last_change_ts in prewrite (#13721) ref tikv/tikv#13694 This commit implements the different cases when last_change_ts is calculated in prewrite: 1. Inherit from the pessimistic lock 2. Calculate it when checking the new version 3. 
Amend the pessimistic lock Signed-off-by: Yilin Chen --- src/storage/mvcc/reader/scanner/forward.rs | 13 +- src/storage/txn/actions/prewrite.rs | 295 ++++++++++++++++++++- src/storage/txn/commands/prewrite.rs | 194 +++++++++++++- src/storage/txn/store.rs | 23 +- 4 files changed, 508 insertions(+), 17 deletions(-) diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 709dc5803d1..03f44deed7c 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -2454,12 +2454,6 @@ mod delta_entry_tests { let last_write = writes.last(); let max_commit_ts = last_write.map(|(_, commit_ts, ..)| *commit_ts).unwrap_or(0); - let (mut last_change_ts, mut versions_to_last_change) = (0,0); - // TODO: Remove `*lock_type == LockType::Pessimistic` after calculating last_change_ts for prewrite. - if *lock_type == LockType::Pessimistic && - let Some((_, commit_ts, WriteType::Put | WriteType::Delete, _)) = last_write { - (last_change_ts, versions_to_last_change) = (*commit_ts, 1); - } let for_update_ts = std::cmp::max(*ts, max_commit_ts + 1); if *ts <= to_ts { @@ -2470,7 +2464,6 @@ mod delta_entry_tests { .for_update_ts(for_update_ts.into()) .primary(key) .value(&value) - .last_change(last_change_ts.into(), versions_to_last_change) .build_prewrite(*lock_type, is_short_value(&value)); entries_of_key.push(entry); } @@ -2610,10 +2603,12 @@ mod delta_entry_tests { // Do assertions one by one so that if it fails it won't print too long panic // message. for i in 0..std::cmp::max(actual.len(), expected.len()) { + // We don't care about last_change_ts here. Use a trick to ignore them. 
+ let actual_erased = actual[i].erasing_last_change_ts(); assert_eq!( - actual[i], expected[i], + actual_erased, expected[i], "item {} not match: expected {:?}, but got {:?}", - i, &expected[i], &actual[i] + i, &expected[i], &actual_erased ); } }; diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 40709032d61..8abaf1428e4 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -21,7 +21,10 @@ use crate::storage::{ }, Error, ErrorInner, Lock, LockType, MvccTxn, Result, SnapshotReader, }, - txn::{actions::check_data_constraint::check_data_constraint, LockInfo}, + txn::{ + actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, + scheduler::LAST_CHANGE_TS, LockInfo, + }, Snapshot, }; @@ -62,7 +65,7 @@ pub fn prewrite( let lock_status = match reader.load_lock(&mutation.key)? { Some(lock) => mutation.check_lock(lock, pessimistic_action)?, None if matches!(pessimistic_action, DoPessimisticCheck) => { - amend_pessimistic_lock(&mutation, reader)?; + amend_pessimistic_lock(&mut mutation, reader)?; lock_amended = true; LockStatus::None } @@ -236,6 +239,8 @@ struct PrewriteMutation<'a> { lock_type: Option, lock_ttl: u64, + last_change_ts: TimeStamp, + versions_to_last_change: u64, should_not_exist: bool, should_not_write: bool, @@ -273,6 +278,8 @@ impl<'a> PrewriteMutation<'a> { lock_type, lock_ttl: txn_props.lock_ttl, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, should_not_exist, should_not_write, @@ -320,6 +327,9 @@ impl<'a> PrewriteMutation<'a> { return Err(ErrorInner::KeyIsLocked(self.lock_info(lock)?).into()); } + self.last_change_ts = lock.last_change_ts; + self.versions_to_last_change = lock.versions_to_last_change; + if lock.lock_type == LockType::Pessimistic { // TODO: remove it in future if !self.txn_props.is_pessimistic() { @@ -350,7 +360,7 @@ impl<'a> PrewriteMutation<'a> { } fn check_for_newer_version( - &self, + &mut self, reader: &mut 
SnapshotReader, ) -> Result> { let mut seek_ts = TimeStamp::max(); @@ -365,6 +375,10 @@ impl<'a> PrewriteMutation<'a> { // TODO: Maybe we need to add a new error for the rolled back case. self.write_conflict_error(&write, commit_ts, WriteConflictReason::SelfRolledBack)?; } + if seek_ts == TimeStamp::max() { + (self.last_change_ts, self.versions_to_last_change) = + write.next_last_change_info(commit_ts); + } match self.txn_props.kind { TransactionKind::Optimistic(_) => { if commit_ts > self.txn_props.start_ts { @@ -440,6 +454,11 @@ impl<'a> PrewriteMutation<'a> { self.txn_props.txn_size, self.min_commit_ts, ); + // Only Lock needs to record `last_change_ts` in its write record, Put or Delete + // records themselves are effective changes. + if tls_can_enable(LAST_CHANGE_TS) && self.lock_type == Some(LockType::Lock) { + lock = lock.set_last_change(self.last_change_ts, self.versions_to_last_change); + } if let Some(value) = self.value { if is_short_value(&value) { @@ -503,7 +522,7 @@ impl<'a> PrewriteMutation<'a> { } fn check_assertion( - &self, + &mut self, reader: &mut SnapshotReader, write: &Option<(Write, TimeStamp)>, write_loaded: bool, @@ -694,11 +713,11 @@ fn async_commit_timestamps( // If the data is not changed after acquiring the lock, we can still prewrite // the key. fn amend_pessimistic_lock( - mutation: &PrewriteMutation<'_>, + mutation: &mut PrewriteMutation<'_>, reader: &mut SnapshotReader, ) -> Result<()> { let write = reader.seek_write(&mutation.key, TimeStamp::max())?; - if let Some((commit_ts, _)) = write.as_ref() { + if let Some((commit_ts, write)) = write.as_ref() { // The invariants of pessimistic locks are: // 1. lock's for_update_ts >= key's latest commit_ts // 2. 
lock's for_update_ts >= txn's start_ts @@ -727,6 +746,8 @@ fn amend_pessimistic_lock( } .into()); } + (mutation.last_change_ts, mutation.versions_to_last_change) = + write.next_last_change_info(*commit_ts); } // Used pipelined pessimistic lock acquiring in this txn but failed // Luckily no other txn modified this lock, amend it by treat it as optimistic @@ -2194,4 +2215,266 @@ pub mod tests { must_commit(&mut engine, key, 21, 22); must_pessimistic_prewrite_insert(&mut engine, key, value, key, 23, 23, DoConstraintCheck); } + + #[cfg(test)] + fn test_calculate_last_change_ts_from_latest_write_impl( + prewrite_func: impl Fn(&mut RocksEngine, LockType, /* start_ts */ u64), + ) { + use engine_traits::CF_WRITE; + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + + // Latest change ts should not be enabled on TiKV 6.4 + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + let write = Write::new(WriteType::Put, 5.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(8.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 10); + let lock = must_locked(&mut engine, key, 10); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 10, false); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + // Latest version is a PUT. But as we are prewriting a PUT, no need to record + // `last_change_ts`. 
+ let write = Write::new(WriteType::Put, 15.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(20.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Put, 25); + let lock = must_locked(&mut engine, key, 25); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 25, false); + + // Latest version is a PUT + let write = Write::new(WriteType::Put, 30.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(35.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 40); + let lock = must_locked(&mut engine, key, 40); + assert_eq!(lock.last_change_ts, 35.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 40, false); + + // Latest version is a DELETE + let write = Write::new(WriteType::Delete, 45.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(50.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 55); + let lock = must_locked(&mut engine, key, 55); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 55, false); + + // Latest version is a LOCK without last_change_ts. Set the last_change_ts of + // the new record to zero. 
+ let write = Write::new(WriteType::Lock, 60.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(65.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 70); + let lock = must_locked(&mut engine, key, 70); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 70, false); + + // Latest version is a ROLLBACK without last_change_ts. Set the last_change_ts + // of the new record to zero. + let write = Write::new(WriteType::Rollback, 75.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(80.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 85); + let lock = must_locked(&mut engine, key, 85); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 85, false); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 90.into(), None).set_last_change(20.into(), 6); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(95.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 100); + let lock = must_locked(&mut engine, key, 100); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 7); + must_rollback(&mut engine, key, 100, false); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 105.into(), None).set_last_change(20.into(), 8); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(110.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 120); + let lock = must_locked(&mut engine, key, 120); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 9); + 
must_rollback(&mut engine, key, 120, false); + } + + #[test] + fn test_optimistic_txn_calculate_last_change_ts() { + test_calculate_last_change_ts_from_latest_write_impl(|engine, tp, start_ts| match tp { + LockType::Put => must_prewrite_put(engine, b"k", b"value", b"k", start_ts), + LockType::Delete => must_prewrite_delete(engine, b"k", b"k", start_ts), + LockType::Lock => must_prewrite_lock(engine, b"k", b"k", start_ts), + _ => unreachable!(), + }); + } + + #[test] + fn test_pessimistic_amend_txn_calculate_last_change_ts() { + test_calculate_last_change_ts_from_latest_write_impl(|engine, tp, start_ts| match tp { + LockType::Put => must_pessimistic_prewrite_put( + engine, + b"k", + b"value", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + LockType::Delete => must_pessimistic_prewrite_delete( + engine, + b"k", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + LockType::Lock => must_pessimistic_prewrite_lock( + engine, + b"k", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + _ => unreachable!(), + }); + } + + #[test] + fn test_inherit_last_change_ts_from_pessimistic_lock() { + use engine_traits::CF_LOCK; + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + let put_lock = + |engine: &mut RocksEngine, ts: u64, last_change_ts: u64, versions_to_last_change| { + let lock = Lock::new( + LockType::Pessimistic, + key.to_vec(), + ts.into(), + 100, + None, + ts.into(), + 5, + ts.into(), + ) + .set_last_change(last_change_ts.into(), versions_to_last_change); + engine + .put_cf( + Default::default(), + CF_LOCK, + Key::from_raw(key), + lock.to_bytes(), + ) + .unwrap(); + }; + + // Prewrite LOCK from pessimistic lock without `last_change_ts` + put_lock(&mut engine, 10, 0, 0); + 
must_pessimistic_prewrite_lock(&mut engine, key, key, 10, 10, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 10); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 10, false); + + // Prewrite LOCK from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 20, 15, 3); + must_pessimistic_prewrite_lock(&mut engine, key, key, 20, 20, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 20); + assert_eq!(lock.last_change_ts, 15.into()); + assert_eq!(lock.versions_to_last_change, 3); + must_rollback(&mut engine, key, 20, false); + + // Prewrite PUT from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 30, 15, 5); + must_pessimistic_prewrite_put(&mut engine, key, b"value", key, 30, 30, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 30); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 30, false); + + // Prewrite DELETE from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 40, 15, 5); + must_pessimistic_prewrite_delete(&mut engine, key, key, 40, 30, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 40); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 40, false); + } } diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 2b0915a5fdc..06f9cd1f818 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -848,7 +848,8 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { WriteType::from_lock_type(lock.lock_type).unwrap(), txn.start_ts, lock.short_value, - ); + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change); // Transactions committed with 1PC should be impossible to overwrite rollback // records. 
txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); @@ -2505,4 +2506,195 @@ mod tests { assert_eq!(res.min_commit_ts, 18.into(), "{:?}", res); must_unlocked(&mut engine, b"k2"); } + + #[test] + fn test_1pc_calculate_last_change_ts() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + let key = b"k"; + let value = b"v"; + must_prewrite_put(&mut engine, key, value, key, 10); + must_commit(&mut engine, key, 10, 20); + + // 1PC write a new LOCK + let mutations = vec![Mutation::make_lock(Key::from_raw(key))]; + let mut statistics = Statistics::default(); + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations.clone(), + key.to_vec(), + 30, + Some(40), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 30, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 1); + + // 1PC write another LOCK + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 50, + Some(60), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 50, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 2); + + // 1PC write a PUT + let mutations = vec![Mutation::make_put(Key::from_raw(key), b"v2".to_vec())]; + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 70, + Some(80), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 70, res.one_pc_commit_ts, 
WriteType::Put); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + + // TiKV 6.4 should not have last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + let mutations = vec![Mutation::make_lock(Key::from_raw(key))]; + let res = prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + mutations, + key.to_vec(), + 80, + Some(90), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 80, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + } + + #[test] + fn test_pessimistic_1pc_calculate_last_change_ts() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + let key = b"k"; + let value = b"v"; + must_prewrite_put(&mut engine, key, value, key, 10); + must_commit(&mut engine, key, 10, 20); + + // Pessimistic 1PC write a new LOCK + must_acquire_pessimistic_lock(&mut engine, key, key, 30, 30); + let mutations = vec![(Mutation::make_lock(Key::from_raw(key)), DoPessimisticCheck)]; + let mut statistics = Statistics::default(); + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations.clone(), + key.to_vec(), + 30, + 30, + Some(40), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 30, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 1); + + // Pessimistic 1PC write another LOCK + must_acquire_pessimistic_lock(&mut engine, 
key, key, 50, 50); + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 50, + 50, + Some(60), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 50, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 2); + + // Pessimistic 1PC write a PUT + must_acquire_pessimistic_lock(&mut engine, key, key, 70, 70); + let mutations = vec![( + Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), + DoPessimisticCheck, + )]; + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 70, + 70, + Some(80), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 70, res.one_pc_commit_ts, WriteType::Put); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + + // TiKV 6.4 should not have last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + must_acquire_pessimistic_lock(&mut engine, key, key, 80, 80); + let mutations = vec![(Mutation::make_lock(Key::from_raw(key)), DoPessimisticCheck)]; + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + mutations, + key.to_vec(), + 80, + 80, + Some(90), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 80, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + } } diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index b2f25cff640..9a38979c71b 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
use kvproto::kvrpcpb::IsolationLevel; -use txn_types::{Key, KvPair, OldValue, TimeStamp, TsSet, Value, WriteRef}; +use txn_types::{Key, KvPair, Lock, OldValue, TimeStamp, TsSet, Value, WriteRef}; use super::{Error, ErrorInner, Result}; use crate::storage::{ @@ -159,6 +159,27 @@ impl TxnEntry { } => old_value, } } + + pub fn erasing_last_change_ts(&self) -> TxnEntry { + let mut e = self.clone(); + match &mut e { + TxnEntry::Prewrite { + lock: (_, value), .. + } => { + let l = Lock::parse(value).unwrap(); + *value = l.set_last_change(TimeStamp::zero(), 0).to_bytes(); + } + TxnEntry::Commit { + write: (_, value), .. + } => { + let mut w = WriteRef::parse(value).unwrap(); + w.last_change_ts = TimeStamp::zero(); + w.versions_to_last_change = 0; + *value = w.to_bytes(); + } + } + e + } } impl TxnEntry { From 8eaa805dc440dc6c056547c5ebb3989c56dde0c3 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 8 Nov 2022 20:27:50 +0800 Subject: [PATCH 319/676] backup: udpate rusoto to support backup to ap-southeast-3 (#13750) close tikv/tikv#13751 Signed-off-by: 3pointer --- Cargo.lock | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d49c13ae18c..a1b238d0148 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4697,7 +4697,7 @@ dependencies = [ [[package]] name = "rusoto_core" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "base64", @@ -4721,7 +4721,7 @@ dependencies = [ [[package]] name = "rusoto_credential" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", 
"chrono", @@ -4738,7 +4738,7 @@ dependencies = [ [[package]] name = "rusoto_kms" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -4751,7 +4751,7 @@ dependencies = [ [[package]] name = "rusoto_mock" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "chrono", @@ -4765,7 +4765,7 @@ dependencies = [ [[package]] name = "rusoto_s3" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -4779,7 +4779,7 @@ dependencies = [ [[package]] name = "rusoto_signature" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "base64", "bytes", @@ -4804,7 +4804,7 @@ dependencies = [ [[package]] name = "rusoto_sts" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", From 2a61f0777da4dfb635316d612f663932d0874b0c Mon Sep 17 00:00:00 2001 From: guoxiang1996 Date: Wed, 9 Nov 2022 13:11:51 +0800 Subject: [PATCH 320/676] debugger: 
fix compacting raftdb with tikv-ctl (#13742) close tikv/tikv#13515 Signed-off-by: kuiper Co-authored-by: Xinye Tao --- src/server/debug.rs | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/src/server/debug.rs b/src/server/debug.rs index 7f85aabcf50..6ee676ad1c4 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -120,6 +120,10 @@ impl From for debugpb::BottommostLevelCompaction { } } +trait InnerRocksEngineExtractor { + fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine>; +} + #[derive(Clone)] pub struct Debugger { engines: Engines, @@ -127,6 +131,26 @@ pub struct Debugger { cfg_controller: ConfigController, } +impl InnerRocksEngineExtractor for Debugger { + default fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { + match db { + DbType::Kv => Ok(&self.engines.kv), + DbType::Raft => Err(box_err!("Get raft db is not allowed")), + _ => Err(box_err!("invalid DB type")), + } + } +} + +impl InnerRocksEngineExtractor for Debugger { + fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { + match db { + DbType::Kv => Ok(&self.engines.kv), + DbType::Raft => Ok(&self.engines.raft), + _ => Err(box_err!("invalid DB type")), + } + } +} + impl Debugger { pub fn new( engines: Engines, @@ -163,14 +187,6 @@ impl Debugger { Ok(regions) } - fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { - match db { - DbType::Kv => Ok(&self.engines.kv), - DbType::Raft => Err(box_err!("Get raft db is not allowed")), - _ => Err(box_err!("invalid DB type")), - } - } - pub fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { validate_db_and_cf(db, cf)?; let db = self.get_db_from_type(db)?; @@ -2272,4 +2288,14 @@ mod tests { .get_api_version() ) } + + #[test] + fn test_compact() { + let debugger = new_debugger(); + let compact = |db, cf| debugger.compact(db, cf, &[0], &[0xFF], 1, Some("skip").into()); + compact(DbType::Kv, CF_DEFAULT).unwrap(); + compact(DbType::Kv, 
CF_LOCK).unwrap(); + compact(DbType::Kv, CF_WRITE).unwrap(); + compact(DbType::Raft, CF_DEFAULT).unwrap(); + } } From 8831c7d6032e46337c7c9e2970fa0942d6787403 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 9 Nov 2022 16:21:51 +0800 Subject: [PATCH 321/676] raftstore: Introduce witness peer (#12972) ref tikv/tikv#12876 Introduce witness peer Signed-off-by: Connor1996 --- Cargo.toml | 4 +- .../operation/command/admin/conf_change.rs | 1 + .../raftstore-v2/src/operation/command/mod.rs | 5 +- .../raftstore-v2/src/operation/query/local.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 4 +- .../raftstore/src/store/entry_storage.rs | 2 +- components/raftstore/src/store/fsm/apply.rs | 248 +++++--- components/raftstore/src/store/fsm/peer.rs | 88 ++- .../raftstore/src/store/local_metrics.rs | 3 + components/raftstore/src/store/metrics.rs | 2 + components/raftstore/src/store/peer.rs | 50 +- .../raftstore/src/store/peer_storage.rs | 205 +++++-- components/raftstore/src/store/util.rs | 83 ++- .../src/store/worker/check_leader.rs | 2 +- .../raftstore/src/store/worker/metrics.rs | 1 + components/raftstore/src/store/worker/read.rs | 26 +- components/test_pd_client/src/pd.rs | 4 +- components/test_raftstore/src/util.rs | 1 - components/tikv_util/src/store/mod.rs | 5 +- components/tikv_util/src/store/peer.rs | 13 + components/tikv_util/src/store/region.rs | 8 +- src/server/raft_client.rs | 17 +- tests/integrations/raftstore/mod.rs | 1 + .../raftstore/test_unsafe_recovery.rs | 1 - tests/integrations/raftstore/test_witness.rs | 537 ++++++++++++++++++ 25 files changed, 1121 insertions(+), 192 deletions(-) create mode 100644 tests/integrations/raftstore/test_witness.rs diff --git a/Cargo.toml b/Cargo.toml index 4ccf0a2ad93..a408e4a84ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -220,8 +220,8 @@ procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229 # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to 
# kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. -[patch.'https://github.com/pingcap/kvproto'] -# kvproto = { git = "https://github.com/your_github_id/kvproto", branch = "your_branch" } +# [patch.'https://github.com/pingcap/kvproto'] +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } [workspace] # See https://github.com/rust-lang/rfcs/blob/master/text/2957-cargo-features2.md diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 03d0690fe25..8b4b7fe293f 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -95,6 +95,7 @@ impl Peer { util::check_conf_change( &ctx.cfg, self.raft_group(), + self.region(), self.peer(), changes.as_ref(), &cc, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index d39788ac611..2d89c3494d3 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -199,7 +199,10 @@ impl Peer { proposal_ctx: Vec, ) -> Result { store_ctx.raft_metrics.propose.normal.inc(); - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); + store_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); if data.len() as u64 > store_ctx.cfg.raft_entry_max_size.0 { return Err(Error::RaftEntryTooLarge { region_id: self.region_id(), diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 12df1e7926f..0736bc13fd8 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -565,7 +565,7 @@ mod tests { region1.set_region_epoch(epoch13.clone()); let term6 = 6; let mut 
lease = Lease::new(Duration::seconds(10), Duration::milliseconds(2500)); - let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); let mut cmd = RaftCmdRequest::default(); let mut header = RaftRequestHeader::default(); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 8619b8cf2d4..eb98851b3bb 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -128,12 +128,12 @@ impl Peer { destroy_progress: DestroyProgress::None, raft_group, logger, - pending_reads: ReadIndexQueue::new(tag.clone()), + pending_reads: ReadIndexQueue::new(tag), read_progress: Arc::new(RegionReadProgress::new( ®ion, applied_index, REGION_READ_PROGRESS_CAP, - tag, + peer_id, )), leader_lease: Lease::new( cfg.raft_store_max_leader_lease(), diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index fcc3d535aa2..705e2a776fa 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1004,7 +1004,7 @@ impl EntryStorage { } #[inline] - pub fn set_applied_state(&mut self, apply_state: RaftApplyState) { + pub fn set_apply_state(&mut self, apply_state: RaftApplyState) { self.apply_state = apply_state; } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index f5702092622..8cb7f58baca 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -9,6 +9,7 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering}, collections::VecDeque, fmt::{self, Debug, Formatter}, + io::BufRead, mem, ops::{Deref, DerefMut, Range as StdRange}, sync::{ @@ -36,7 +37,7 @@ use fail::fail_point; use kvproto::{ import_sstpb::SstMeta, kvrpcpb::ExtraOp as TxnExtraOp, - metapb::{PeerRole, Region, RegionEpoch}, + metapb::{self, 
PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, @@ -45,6 +46,7 @@ use kvproto::{ }; use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; +use protobuf::{wire_format::WireType, CodedInputStream}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; @@ -59,7 +61,7 @@ use tikv_util::{ memory::HeapSize, mpsc::{loose_bounded, LooseBoundedSender, Receiver}, safe_panic, slow_log, - store::{find_peer, find_peer_mut, is_learner, remove_peer}, + store::{find_peer, find_peer_by_id, find_peer_mut, is_learner, remove_peer}, time::{duration_to_sec, Instant}, warn, worker::Scheduler, @@ -816,6 +818,43 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { false } +fn can_witness_skip(entry: &Entry) -> bool { + // need to handle ConfChange entry type + if entry.get_entry_type() != EntryType::EntryNormal { + return false; + } + + // HACK: check admin request field in serialized data from `RaftCmdRequest` + // without deserializing all. It's done by checking the existence of the + // field number of `admin_request`. + // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in + // `raft_cmdpb.rs` for reference. + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return true; + } + let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); + // Header field is of number 1 + if field_number == 1 { + if wire_type != WireType::WireTypeLengthDelimited { + panic!("unexpected wire type"); + } + let len = is.read_raw_varint32().unwrap(); + // skip parsing the content of `Header` + is.consume(len as usize); + // read next field number + (field_number, _) = is.read_tag_unpack().unwrap(); + } + + // `Requests` field is of number 2 and `AdminRequest` field is of number 3. 
+ // - If the next field is 2, there must be no admin request as in one + // `RaftCmdRequest`, either requests or admin_request is filled. + // - If the next field is 3, it's exactly an admin request. + // - If the next field is others, neither requests nor admin_request is filled, + // so there is no admin request. + field_number != 3 +} + /// A struct that stores the state related to Merge. /// /// When executing a `CommitMerge`, the source peer may have not applied @@ -895,12 +934,12 @@ pub struct ApplyDelegate where EK: KvEngine, { - /// The ID of the peer. - id: u64, /// The term of the Region. term: u64, /// The Region information of the peer. region: Region, + /// The Peer information. + peer: metapb::Peer, /// Peer_tag, "[region region_id] peer_id". tag: String, @@ -973,8 +1012,8 @@ where { fn from_registration(reg: Registration) -> ApplyDelegate { ApplyDelegate { - id: reg.id, tag: format!("[region {}] {}", reg.region.get_id(), reg.id), + peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, last_flush_applied_index: reg.apply_state.get_applied_index(), @@ -1006,7 +1045,7 @@ where } pub fn id(&self) -> u64 { - self.id + self.peer.get_id() } /// Handles all the committed_entries, namely, applies the committed @@ -1126,58 +1165,60 @@ where let data = entry.get_data(); if !data.is_empty() { - let cmd = util::parse_data_at(data, index, &self.tag); - - if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { - self.priority = Priority::Low; - } - let mut has_unflushed_data = - self.last_flush_applied_index != self.apply_state.get_applied_index(); - if (has_unflushed_data && should_write_to_engine(&cmd) - || apply_ctx.kv_wb().should_write_to_engine()) - && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) - { - apply_ctx.commit(self); - if self.metrics.written_bytes >= apply_ctx.yield_msg_size - || self - .handle_start - .as_ref() - .map_or(Duration::ZERO, 
Instant::saturating_elapsed) - >= apply_ctx.yield_duration + if !self.peer.is_witness || !can_witness_skip(entry) { + let cmd = util::parse_data_at(data, index, &self.tag); + if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { + self.priority = Priority::Low; + } + let mut has_unflushed_data = + self.last_flush_applied_index != self.apply_state.get_applied_index(); + if (has_unflushed_data && should_write_to_engine(&cmd) + || apply_ctx.kv_wb().should_write_to_engine()) + && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) { + apply_ctx.commit(self); + if self.metrics.written_bytes >= apply_ctx.yield_msg_size + || self + .handle_start + .as_ref() + .map_or(Duration::ZERO, Instant::saturating_elapsed) + >= apply_ctx.yield_duration + { + return ApplyResult::Yield; + } + has_unflushed_data = false; + } + if self.priority != apply_ctx.priority { + if has_unflushed_data { + apply_ctx.commit(self); + } return ApplyResult::Yield; } - has_unflushed_data = false; + + return self.process_raft_cmd(apply_ctx, index, term, cmd); } - if self.priority != apply_ctx.priority { - if has_unflushed_data { - apply_ctx.commit(self); + } else { + // we should observe empty cmd, aka leader change, + // read index during confchange, or other situations. + apply_ctx.host.on_empty_cmd(&self.region, index, term); + + // 1. When a peer become leader, it will send an empty entry. + // 2. When a leader tries to read index during transferring leader, + // it will also propose an empty entry. But that entry will not contain + // any associated callback. So no need to clear callback. 
+ while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { + if let Some(cb) = cmd.cb.take() { + apply_ctx + .applied_batch + .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); } - return ApplyResult::Yield; } - - return self.process_raft_cmd(apply_ctx, index, term, cmd); } - // we should observe empty cmd, aka leader change, - // read index during confchange, or other situations. - apply_ctx.host.on_empty_cmd(&self.region, index, term); - self.apply_state.set_applied_index(index); self.applied_term = term; assert!(term > 0); - // 1. When a peer become leader, it will send an empty entry. - // 2. When a leader tries to read index during transferring leader, - // it will also propose an empty entry. But that entry will not contain - // any associated callback. So no need to clear callback. - while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { - if let Some(cb) = cmd.cb.take() { - apply_ctx - .applied_batch - .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); - } - } ApplyResult::None } @@ -1438,6 +1479,9 @@ where match *exec_result { ExecResult::ChangePeer(ref cp) => { self.region = cp.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } } ExecResult::ComputeHash { .. } | ExecResult::VerifyHash { .. } @@ -1494,11 +1538,12 @@ where fn destroy(&mut self, apply_ctx: &mut ApplyContext) { self.stopped = true; apply_ctx.router.close(self.region_id()); + let id = self.id(); for cmd in self.pending_cmds.normals.drain(..) 
{ - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); } if let Some(cmd) = self.pending_cmds.conf_change.take() { - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); } self.yield_state = None; @@ -1578,7 +1623,6 @@ where AdminCmdType::TransferLeader => self.exec_transfer_leader(request, ctx.exec_log_term), AdminCmdType::ComputeHash => self.exec_compute_hash(ctx, request), AdminCmdType::VerifyHash => self.exec_verify_hash(ctx, request), - // TODO: is it backward compatible to add new cmd_type? AdminCmdType::PrepareMerge => self.exec_prepare_merge(ctx, request), AdminCmdType::CommitMerge => self.exec_commit_merge(ctx, request), AdminCmdType::RollbackMerge => self.exec_rollback_merge(ctx, request), @@ -1884,23 +1928,22 @@ where mod confchange_cmd_metric { use super::*; - fn write_metric(cct: ConfChangeType, kind: &str) { - let metric = match cct { - ConfChangeType::AddNode => "add_peer", - ConfChangeType::RemoveNode => "remove_peer", - ConfChangeType::AddLearnerNode => "add_learner", - }; - PEER_ADMIN_CMD_COUNTER_VEC - .with_label_values(&[metric, kind]) - .inc(); - } - pub fn inc_all(cct: ConfChangeType) { - write_metric(cct, "all") + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, + }; + metrics.all.inc(); } pub fn inc_success(cct: ConfChangeType) { - write_metric(cct, "success") + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, + }; + metrics.success.inc(); } } @@ -1942,6 +1985,8 @@ impl ApplyDelegate where EK: KvEngine, { + // Legacy code for compatibility. 
All new conf changes are dispatched by + // ChangePeerV2 now. fn exec_change_peer( &mut self, ctx: &mut ApplyContext, @@ -1956,12 +2001,12 @@ where fail_point!( "apply_on_conf_change_1_3_1", - (self.id == 1 || self.id == 3) && self.region_id() == 1, + (self.id() == 1 || self.id() == 3) && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( "apply_on_conf_change_3_1", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( @@ -1986,7 +2031,7 @@ where let add_ndoe_fp = || { fail_point!( "apply_on_add_node_1_2", - self.id == 2 && self.region_id() == 1, + self.id() == 2 && self.region_id() == 1, |_| {} ) }; @@ -2053,7 +2098,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. self.stopped = true; @@ -2246,6 +2291,7 @@ where // The peer is already the requested role || (role, change_type) == (PeerRole::Voter, ConfChangeType::AddNode) || (role, change_type) == (PeerRole::Learner, ConfChangeType::AddLearnerNode) + || exist_peer.get_is_witness() != peer.get_is_witness() { error!( "can't add duplicated peer"; @@ -2253,7 +2299,7 @@ where "peer_id" => self.id(), "peer" => ?peer, "exist peer" => ?exist_peer, - "confchnage type" => ?change_type, + "confchange type" => ?change_type, "region" => ?&self.region ); return Err(box_err!( @@ -2307,7 +2353,7 @@ where "region_id" => self.region_id(), "peer_id" => self.id(), "expect_peer" => ?peer, - "get_peeer" => ?p + "get_peer" => ?p ); return Err(box_err!( "remove unmatched peer: expect: {:?}, get {:?}, ignore", @@ -2315,7 +2361,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. 
self.stopped = true; @@ -2399,7 +2445,7 @@ where fail_point!("apply_before_split"); fail_point!( "apply_before_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); @@ -2582,7 +2628,7 @@ where fail_point!( "apply_after_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); @@ -2686,7 +2732,7 @@ where let apply_before_commit_merge = || { fail_point!( "apply_before_commit_merge_except_1_4", - self.region_id() == 1 && self.id != 4, + self.region_id() == 1 && self.id() != 4, |_| {} ); }; @@ -2958,7 +3004,7 @@ where let peer = req.get_transfer_leader().get_peer(); // Only execute TransferLeader if the expected new leader is self. - if peer.get_id() == self.id { + if peer.get_id() == self.id() { Ok((resp, ApplyResult::Res(ExecResult::TransferLeader { term }))) } else { Ok((resp, ApplyResult::None)) @@ -3534,7 +3580,7 @@ where "peer_id" => self.delegate.id(), "term" => reg.term ); - assert_eq!(self.delegate.id, reg.id); + assert_eq!(self.delegate.id(), reg.id); self.delegate.term = reg.term; self.delegate.clear_all_commands_as_stale(); self.delegate = ApplyDelegate::from_registration(reg); @@ -3681,7 +3727,7 @@ where PeerMsg::ApplyRes { res: TaskRes::Destroy { region_id: self.delegate.region_id(), - peer_id: self.delegate.id, + peer_id: self.delegate.id(), merge_from_snapshot: d.merge_from_snapshot, }, }, @@ -3762,6 +3808,10 @@ where if self.delegate.pending_remove || self.delegate.stopped { return; } + if self.delegate.peer.is_witness { + // witness shouldn't generate snapshot. 
+ return; + } let applied_index = self.delegate.apply_state.get_applied_index(); let need_sync = apply_ctx .apply_res @@ -3779,7 +3829,7 @@ where self.delegate.maybe_write_apply_state(apply_ctx); fail_point!( "apply_on_handle_snapshot_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); @@ -3805,7 +3855,7 @@ where .fetch_sub(1, Ordering::SeqCst); fail_point!( "apply_on_handle_snapshot_finish_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); } @@ -4530,6 +4580,7 @@ mod tests { time::*, }; + use bytes::Bytes; use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; use engine_traits::{Peekable as PeekableTrait, SyncMutable, WriteBatchExt}; @@ -4539,6 +4590,7 @@ mod tests { raft_cmdpb::*, }; use protobuf::Message; + use raft::eraftpb::{ConfChange, ConfChangeV2}; use sst_importer::Config as ImportConfig; use tempfile::{Builder, TempDir}; use test_sst_importer::*; @@ -4644,6 +4696,42 @@ mod tests { } } + #[test] + fn test_can_witness_skip() { + let mut entry = Entry::new(); + let mut req = RaftCmdRequest::default(); + entry.set_entry_type(EntryType::EntryNormal); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + req.mut_admin_request() + .set_cmd_type(AdminCmdType::CompactLog); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + let mut req = RaftCmdRequest::default(); + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + req.set_requests(vec![request].into()); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChange); + let 
conf_change = ConfChange::new(); + let data = conf_change.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChangeV2); + let conf_change_v2 = ConfChangeV2::new(); + let data = conf_change_v2.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + } + #[test] fn test_should_sync_log() { // Admin command @@ -4841,10 +4929,14 @@ mod tests { ..Default::default() }; reg.region.set_id(2); + let mut peer = metapb::Peer::default(); + peer.set_id(1); + reg.region.mut_peers().push(peer.clone()); reg.apply_state.set_applied_index(3); router.schedule_task(2, Msg::Registration(reg.dup())); validate(&router, 2, move |delegate| { - assert_eq!(delegate.id, 1); + assert_eq!(delegate.id(), 1); + assert_eq!(delegate.peer, peer); assert_eq!(delegate.tag, "[region 2] 1"); assert_eq!(delegate.region, reg.region); assert!(!delegate.pending_remove); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 69215ecaf70..8c7ef17cfa6 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2489,12 +2489,14 @@ where // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { - // If the snapshot file is not used again, then it's OK to - // delete them here. If the snapshot file will be reused when - // receiving, then it will fail to pass the check again, so - // missing snapshot files should not be noticed. - let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + if let Some(key) = key { + // If the snapshot file is not used again, then it's OK to + // delete them here. 
If the snapshot file will be reused when + // receiving, then it will fail to pass the check again, so + // missing snapshot files should not be noticed. + let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + } return Ok(()); } Either::Right(v) => v, @@ -2956,16 +2958,55 @@ where // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) // if the `msg` doesn't contain a snapshot or this snapshot doesn't conflict // with any other snapshots or regions. Otherwise a `SnapKey` is returned. - fn check_snapshot(&mut self, msg: &RaftMessage) -> Result>> { + fn check_snapshot( + &mut self, + msg: &RaftMessage, + ) -> Result, Vec<(u64, bool)>>> { if !msg.get_message().has_snapshot() { return Ok(Either::Right(vec![])); } let region_id = msg.get_region_id(); let snap = msg.get_message().get_snapshot(); - let key = SnapKey::from_region_snap(region_id, snap); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; + + let key = if !snap_data.get_meta().get_for_witness() { + // Check if snapshot file exists. + // No need to get snapshot for witness, as witness's empty snapshot bypass + // snapshot manager. + let key = SnapKey::from_region_snap(region_id, snap); + self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + Some(key) + } else { + None + }; + + // If the index of snapshot is not newer than peer's apply index, it + // is possibly because there is witness -> non-witness switch, and the peer + // requests snapshot from leader but leader doesn't applies the switch yet. + // In that case, the snapshot is a witness snapshot whereas non-witness snapshot + // is expected. 
+ if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() + && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() + { + info!( + "mismatch witness snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "for_witness" => snap_data.get_meta().get_for_witness(), + "is_witness" => self.fsm.peer.is_witness(), + "index" => snap.get_metadata().get_index(), + "applied_index" => self.fsm.peer.get_store().applied_index(), + ); + self.ctx + .raft_metrics + .message_dropped + .mismatch_witness_snapshot + .inc(); + return Ok(Either::Left(key)); + } + let snap_region = snap_data.take_region(); let peer_id = msg.get_to_peer().get_id(); let snap_enc_start_key = enc_start_key(&snap_region); @@ -3116,9 +3157,6 @@ where return Ok(Either::Left(key)); } - // Check if snapshot file exists. - self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - // WARNING: The checking code must be above this line. // Now all checking passed. @@ -4955,6 +4993,16 @@ where let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + // peer_id must be the same as peer's. + if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_peer_id + .inc(); + return Err(e); + } + if self.fsm.peer.force_leader.is_some() { self.ctx.raft_metrics.invalid_proposal.force_leader.inc(); // in force leader state, forbid requests to make the recovery progress less @@ -4992,15 +5040,17 @@ where self.register_raft_base_tick(); return Err(Error::NotLeader(region_id, leader)); } - // peer_id must be the same as peer's. 
- if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { - self.ctx - .raft_metrics - .invalid_proposal - .mismatch_peer_id - .inc(); - return Err(e); + + // Forbid requests when it's a witness unless it's transfer leader + if self.fsm.peer.is_witness() + && !(msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + // TODO: use a dedicated error type + return Err(Error::RecoveryInProgress(self.region_id())); } + // check whether the peer is initialized. if !self.fsm.peer.is_initialized() { self.ctx diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 1648bd345ca..5cfbb645612 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -86,6 +86,7 @@ pub struct RaftMetrics { pub peer_msg_len: LocalHistogram, pub commit_log: LocalHistogram, pub write_block_wait: LocalHistogram, + pub propose_log_size: LocalHistogram, // waterfall metrics pub waterfall_metrics: bool, @@ -123,6 +124,7 @@ impl RaftMetrics { peer_msg_len: PEER_MSG_LEN.local(), commit_log: PEER_COMMIT_LOG_HISTOGRAM.local(), write_block_wait: STORE_WRITE_MSG_BLOCK_WAIT_DURATION_HISTOGRAM.local(), + propose_log_size: PEER_PROPOSE_LOG_SIZE_HISTOGRAM.local(), waterfall_metrics, wf_batch_wait: STORE_WF_BATCH_WAIT_DURATION_HISTOGRAM.local(), wf_send_to_queue: STORE_WF_SEND_TO_QUEUE_DURATION_HISTOGRAM.local(), @@ -157,6 +159,7 @@ impl RaftMetrics { self.peer_msg_len.flush(); self.commit_log.flush(); self.write_block_wait.flush(); + self.propose_log_size.flush(); if self.waterfall_metrics { self.wf_batch_wait.flush(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 2fe6fce580e..b0f44c30c0f 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -169,6 +169,7 @@ 
make_static_metric! { pub label_enum RaftDroppedMessage { mismatch_store_id, mismatch_region_epoch, + mismatch_witness_snapshot, stale_msg, region_overlap, region_no_peer, @@ -201,6 +202,7 @@ make_static_metric! { region_not_initialized, is_applying_snapshot, force_leader, + witness, flashback_in_progress, flashback_not_prepared } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index b9cf76889b4..ff55597b30e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -84,6 +84,7 @@ use super::{ use crate::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason, RoleChange}, errors::RAFTSTORE_IS_BUSY, + router::RaftStoreRouter, store::{ async_io::{read::ReadTask, write::WriteMsg, write_router::WriteRouter}, fsm::{ @@ -93,7 +94,8 @@ use crate::{ }, hibernate_state::GroupState, memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, - msg::{ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + msg::{CasualMessage, ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + peer_storage::HandleSnapshotResult, txn_ext::LocksStatus, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ @@ -466,6 +468,7 @@ pub struct PersistSnapshotResult { pub prev_region: metapb::Region, pub region: metapb::Region, pub destroy_regions: Vec, + pub for_witness: bool, } #[derive(Debug)] @@ -1076,6 +1079,7 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, + // TODO: if peer.is_witness { 0 } else { 1 }, ..Default::default() }; @@ -1150,7 +1154,7 @@ where region, applied_index, REGION_READ_PROGRESS_CAP, - tag.clone(), + peer_id, )), memtrace_raft_entries: 0, write_router: WriteRouter::new(tag), @@ -1684,6 +1688,11 @@ where self.raft_group.raft.state == StateRole::Leader } + #[inline] + pub fn is_witness(&self) -> bool { + self.peer.is_witness + } + #[inline] pub fn get_role(&self) -> StateRole { 
self.raft_group.raft.state @@ -2013,7 +2022,6 @@ where if p.get_id() == self.peer.get_id() { continue; } - // TODO if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = instant.saturating_elapsed(); if elapsed >= max_duration { @@ -2856,13 +2864,20 @@ where } } - if let HandleReadyResult::Snapshot { + if let HandleReadyResult::Snapshot(box HandleSnapshotResult { msgs, snap_region, destroy_regions, last_first_index, - } = res + for_witness, + }) = res { + if for_witness { + // inform next round to check apply status + ctx.router + .send_casual_msg(snap_region.get_id(), CasualMessage::SnapshotApplied) + .unwrap(); + } // When applying snapshot, there is no log applied and not compacted yet. self.raft_log_size_hint = 0; @@ -2874,6 +2889,7 @@ where prev_region: self.region().clone(), region: snap_region, destroy_regions, + for_witness, }), }); if self.last_compacted_idx == 0 && last_first_index >= RAFT_INIT_LOG_INDEX { @@ -2977,6 +2993,7 @@ where } else { vec![] }; + // Note that the `commit_index` and `commit_term` here may be used to // forward the commit index. So it must be less than or equal to persist // index. 
@@ -2985,6 +3002,7 @@ where self.raft_group.raft.raft_log.persisted, ); let commit_term = self.get_store().term(commit_index).unwrap(); + let mut apply = Apply::new( self.peer_id(), self.region_id, @@ -3094,6 +3112,9 @@ where "after" => ?peer, ); self.peer = peer; + // TODO: set priority for witness + // self.raft_group + // .set_priority(if self.peer.is_witness { 0 } else { 1 }); }; self.activate(ctx); @@ -3443,7 +3464,7 @@ where } let progress_to_be_updated = self.mut_store().applied_term() != applied_term; - self.mut_store().set_applied_state(apply_state); + self.mut_store().set_apply_state(apply_state); self.mut_store().set_applied_term(applied_term); self.peer_stat.written_keys += apply_metrics.written_keys; @@ -4331,9 +4352,10 @@ where }; let data = req.write_to_bytes()?; - - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); + poll_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); if data.len() as u64 > poll_ctx.cfg.raft_entry_max_size.0 { error!( @@ -4394,6 +4416,11 @@ where msg: &eraftpb::Message, peer_disk_usage: DiskUsage, ) -> bool { + if self.is_witness() { + // shouldn't transfer leader to witness peer + return true; + } + let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); if pending_snapshot || msg.get_from() != self.leader_id() @@ -4646,6 +4673,7 @@ where util::check_conf_change( &ctx.cfg, &self.raft_group, + self.region(), &self.peer, changes.as_ref(), &cc, @@ -4653,8 +4681,7 @@ where )?; ctx.raft_metrics.propose.conf_change.inc(); - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); + ctx.raft_metrics.propose_log_size.observe(data_size as f64); info!( "propose conf change peer"; "region_id" => self.region_id, @@ -5011,6 +5038,7 @@ where Some(ForceLeaderState::ForceLeader { .. 
}) ) } + pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { if let Some(UnsafeRecoveryState::WaitApply { target_index, .. }) = &self.unsafe_recovery_state diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index a53ca1e9258..56b80c94dcc 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -30,7 +30,8 @@ use raft::{ Error as RaftError, GetEntriesContext, RaftState, Ready, Storage, StorageError, }; use tikv_util::{ - box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, + box_err, box_try, debug, defer, error, info, store::find_peer_by_id, time::Instant, warn, + worker::Scheduler, }; use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager}; @@ -113,17 +114,21 @@ impl From for RaftError { } } +#[derive(PartialEq, Debug)] +pub struct HandleSnapshotResult { + pub msgs: Vec, + pub snap_region: metapb::Region, + /// The regions whose range are overlapped with this region + pub destroy_regions: Vec, + /// The first index before applying the snapshot. + pub last_first_index: u64, + pub for_witness: bool, +} + #[derive(PartialEq, Debug)] pub enum HandleReadyResult { SendIoTask, - Snapshot { - msgs: Vec, - snap_region: metapb::Region, - /// The regions whose range are overlapped with this region - destroy_regions: Vec, - /// The first index before applying the snapshot. - last_first_index: u64, - }, + Snapshot(Box), // use boxing to reduce total size of the enum NoIoTask, } @@ -214,6 +219,7 @@ where pub engines: Engines, peer_id: u64, + peer: Option, // when uninitialized the peer info is unknown. 
region: metapb::Region, snap_state: RefCell, @@ -314,6 +320,7 @@ where Ok(PeerStorage { engines, peer_id, + peer: find_peer_by_id(region, peer_id).cloned(), region: region.clone(), snap_state: RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(None), @@ -354,6 +361,7 @@ where #[inline] pub fn set_region(&mut self, region: metapb::Region) { + self.peer = find_peer_by_id(®ion, self.peer_id).cloned(); self.region = region; } @@ -439,16 +447,31 @@ where /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// available snapshot. pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + if self.peer.as_ref().unwrap().is_witness { + // witness could be the leader for a while, do not generate snapshot now + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + + if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // generate an empty snapshot for witness directly + return Ok(util::new_empty_snapshot( + self.region.clone(), + self.applied_index(), + self.applied_term(), + true, // for witness + )); + } + let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); let mut tried = false; let mut last_canceled = false; if let SnapState::Generating { - ref canceled, - ref receiver, - .. - } = *snap_state + canceled, receiver, .. 
+ } = &*snap_state { tried = true; last_canceled = canceled.load(Ordering::SeqCst); @@ -551,7 +574,7 @@ where snap: &Snapshot, task: &mut WriteTask, destroy_regions: &[metapb::Region], - ) -> Result { + ) -> Result<(metapb::Region, bool)> { info!( "begin to apply snapshot"; "region_id" => self.region.get_id(), @@ -561,8 +584,9 @@ where let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; - let region_id = self.get_region_id(); + let for_witness = snap_data.get_meta().get_for_witness(); + let region_id = self.get_region_id(); let region = snap_data.take_region(); if region.get_id() != region_id { return Err(box_err!( @@ -597,24 +621,32 @@ where for r in destroy_regions { write_peer_state(kv_wb, r, PeerState::Tombstone, None)?; } - write_peer_state(kv_wb, ®ion, PeerState::Applying, None)?; - let last_index = snap.get_metadata().get_index(); + // Witness snapshot is applied atomically as no async applying operation to + // region worker, so no need to set the peer state to `Applying` + let state = if for_witness { + PeerState::Normal + } else { + PeerState::Applying + }; + write_peer_state(kv_wb, ®ion, state, None)?; + + let snap_index = snap.get_metadata().get_index(); + let snap_term = snap.get_metadata().get_term(); - self.raft_state_mut().set_last_index(last_index); - self.set_last_term(snap.get_metadata().get_term()); - self.apply_state_mut().set_applied_index(last_index); - let last_term = self.last_term(); - self.set_applied_term(last_term); + self.raft_state_mut().set_last_index(snap_index); + self.set_last_term(snap_term); + self.apply_state_mut().set_applied_index(snap_index); + self.set_applied_term(snap_term); // The snapshot only contains log which index > applied index, so // here the truncate state's (index, term) is in snapshot metadata. 
self.apply_state_mut() .mut_truncated_state() - .set_index(last_index); + .set_index(snap_index); self.apply_state_mut() .mut_truncated_state() - .set_term(snap.get_metadata().get_term()); + .set_term(snap_term); // `region` will be updated after persisting. // Although there is an interval that other metadata are updated while `region` @@ -634,7 +666,7 @@ where "state" => ?self.apply_state(), ); - Ok(region) + Ok((region, for_witness)) } /// Delete all meta belong to the region. Results are stored in `wb`. @@ -856,20 +888,23 @@ where let mut write_task = WriteTask::new(region_id, self.peer_id, ready.number()); - let mut res = HandleReadyResult::SendIoTask; - if !ready.snapshot().is_empty() { + let mut res = if ready.snapshot().is_empty() { + HandleReadyResult::SendIoTask + } else { fail_point!("raft_before_apply_snap"); let last_first_index = self.first_index().unwrap(); - let snap_region = + let (snap_region, for_witness) = self.apply_snapshot(ready.snapshot(), &mut write_task, &destroy_regions)?; - res = HandleReadyResult::Snapshot { + let res = HandleReadyResult::Snapshot(Box::new(HandleSnapshotResult { msgs: ready.take_persisted_messages(), snap_region, destroy_regions, last_first_index, - }; + for_witness, + })); fail_point!("raft_after_apply_snap"); + res }; if !ready.entries().is_empty() { @@ -930,7 +965,7 @@ where // - After `PrepareMerge` log is committed, the source region leader's lease // will be suspected immediately which makes the local reader not serve read // request. - // - No read request can be responsed in peer fsm during merging. These + // - No read request can be responded in peer fsm during merging. These // conditions are used to prevent reading **stale** data in the past. At // present, they are also used to prevent reading **corrupt** data. 
for r in &res.destroy_regions { @@ -942,7 +977,14 @@ where } } - self.schedule_applying_snapshot(); + if !res.for_witness { + self.schedule_applying_snapshot(); + } else { + // Bypass apply snapshot process for witness as the snapshot is empty, so mark + // status as finished directly here + let status = Arc::new(AtomicUsize::new(JOB_STATUS_FINISHED)); + self.set_snap_state(SnapState::Applying(Arc::clone(&status))); + } // The `region` is updated after persisting in order to stay consistent with the // one in `StoreMeta::regions` (will be updated soon). @@ -1133,7 +1175,10 @@ pub mod tests { Error as RaftError, GetEntriesContext, StorageError, }; use tempfile::{Builder, TempDir}; - use tikv_util::worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}; + use tikv_util::{ + store::{new_peer, new_witness_peer}, + worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}, + }; use super::*; use crate::{ @@ -1569,7 +1614,7 @@ pub mod tests { Option::>::None, ); worker.start_with_timer(runner); - let snap = s.snapshot(0, 0); + let snap = s.snapshot(0, 1); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); @@ -1593,11 +1638,11 @@ pub mod tests { let (tx, rx) = channel(); s.set_snap_state(gen_snap_for_test(rx)); // Empty channel should cause snapshot call to wait. - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); tx.send(snap.clone()).unwrap(); - assert_eq!(s.snapshot(0, 0), Ok(snap.clone())); + assert_eq!(s.snapshot(0, 1), Ok(snap.clone())); assert_eq!(*s.snap_tried_cnt.borrow(), 0); let (tx, rx) = channel(); @@ -1638,7 +1683,7 @@ pub mod tests { s.set_snap_state(gen_snap_for_test(rx)); *s.snap_tried_cnt.borrow_mut() = 1; // stale snapshot should be abandoned, snapshot index < truncated index. 
- assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); @@ -1655,7 +1700,7 @@ pub mod tests { ref s => panic!("unexpected state {:?}", s), } // Disconnected channel should trigger another try. - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); assert_eq!(*s.snap_tried_cnt.borrow(), 2); @@ -1670,13 +1715,13 @@ pub mod tests { } // Scheduled job failed should trigger . - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); } // When retry too many times, it should report a different error. 
- match s.snapshot(0, 0) { + match s.snapshot(0, 1) { Err(RaftError::Store(StorageError::Other(_))) => {} res => panic!("unexpected res: {:?}", res), } @@ -1752,6 +1797,80 @@ pub mod tests { test_storage_create_snapshot_for_role("tikv", 5); } + #[test] + fn test_storage_create_snapshot_for_witness() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut cs = ConfState::default(); + cs.set_voters(vec![1, 2, 3]); + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); + let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + let mut worker = Worker::new("region-worker").lazy_build("region-worker"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut s = new_storage_from_ents(sched.clone(), dummy_scheduler, &td, &ents); + let cfg = make_region_worker_raftstore_cfg(true); + let (router, _) = mpsc::sync_channel(100); + let runner = RegionRunner::new( + s.engines.kv.clone(), + mgr, + cfg, + CoprocessorHost::::default(), + router, + Option::>::None, + ); + worker.start_with_timer(runner); + + let mut r = s.region().clone(); + r.mut_peers().push(new_peer(2, 2)); + r.mut_peers().push(new_witness_peer(3, 3)); + + let mut kv_wb = s.engines.kv.write_batch(); + write_peer_state(&mut kv_wb, &r, PeerState::Normal, None).unwrap(); + kv_wb.write().unwrap(); + s.set_region(r); + + let wait_snapshot = |snap: raft::Result| -> Snapshot { + if let Ok(s) = snap { + return s; + } + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + assert_eq!(*s.snap_tried_cnt.borrow(), 1); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap(); + let snap = match *s.snap_state.borrow() { + SnapState::Generating { ref receiver, .. 
} => { + receiver.recv_timeout(Duration::from_secs(3)).unwrap() + } + ref s => panic!("unexpected state: {:?}", s), + }; + snap + }; + + // generate snapshot for peer + let snap = wait_snapshot(s.snapshot(0, 2)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); + + // generate snapshot for witness peer + let snap = wait_snapshot(s.snapshot(0, 3)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); + + let mut data = RaftSnapshotData::default(); + protobuf::Message::merge_from_bytes(&mut data, snap.get_data()).unwrap(); + assert_eq!(data.get_region().get_id(), 1); + assert_eq!(data.get_region().get_peers().len(), 3); + let files = data.get_meta().get_cf_files(); + for file in files { + assert_eq!(file.get_size(), 0); + } + } + #[test] fn test_storage_apply_snapshot() { let ents = vec![ @@ -1781,7 +1900,7 @@ pub mod tests { Option::>::None, ); worker.start(runner); - s1.snapshot(0, 0).unwrap_err(); + s1.snapshot(0, 1).unwrap_err(); let gen_task = s1.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s1.engines, &sched).unwrap(); @@ -1799,7 +1918,7 @@ pub mod tests { let mut s2 = new_storage(sched.clone(), dummy_scheduler.clone(), &td2); assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); let mut write_task = WriteTask::new(s2.get_region_id(), s2.peer_id, 1); - let snap_region = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); @@ -1816,7 +1935,7 @@ pub mod tests { let mut s3 = new_storage_from_ents(sched, dummy_scheduler, &td3, ents); validate_cache(&s3, &ents[1..]); let mut write_task = 
WriteTask::new(s3.get_region_id(), s3.peer_id, 1); - let snap_region = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2980f9931a5..b2180a8420d 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -20,20 +20,25 @@ use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, raft_cmdpb::{AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest}, - raft_serverpb::RaftMessage, + raft_serverpb::{RaftMessage, RaftSnapshotData}, }; use protobuf::{self, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType}, + eraftpb::{self, ConfChangeType, ConfState, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; -use tikv_util::{box_err, debug, info, store::region, time::monotonic_raw_now, Either}; +use tikv_util::{ + box_err, debug, info, + store::{find_peer_by_id, region}, + time::monotonic_raw_now, + Either, +}; use time::{Duration, Timespec}; use txn_types::{TimeStamp, WriteBatchFlags}; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; -use crate::{coprocessor::CoprocessorHost, Error, Result}; +use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -125,6 +130,27 @@ pub fn is_initial_msg(msg: &eraftpb::Message) -> bool { || (msg_type == MessageType::MsgHeartbeat && msg.get_commit() == INVALID_INDEX) } +pub fn new_empty_snapshot( + region: Region, + applied_index: u64, + applied_term: u64, + for_witness: bool, +) -> Snapshot { + let mut snapshot = Snapshot::default(); + 
snapshot.mut_metadata().set_index(applied_index); + snapshot.mut_metadata().set_term(applied_term); + snapshot + .mut_metadata() + .set_conf_state(conf_state_from_region(®ion)); + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region); + snap_data.set_file_size(0); + snap_data.set_version(SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_witness(for_witness); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; @@ -869,6 +895,7 @@ impl<'a> ChangePeerI for &'a ChangePeerV2Request { pub fn check_conf_change( cfg: &Config, node: &RawNode, + region: &metapb::Region, leader: &metapb::Peer, change_peers: &[ChangePeerRequest], cc: &impl ConfChangeI, @@ -915,6 +942,18 @@ pub fn check_conf_change( } } + if region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| p.get_is_witness() != peer.get_is_witness()) + { + return Err(box_err!( + "invalid conf change request: {:?}, can not switch witness in conf change", + cp + )); + } + if !check_dup.insert(peer.get_id()) { return Err(box_err!( "have multiple commands for the same peer {}", @@ -1122,9 +1161,19 @@ pub struct RegionReadProgress { } impl RegionReadProgress { - pub fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgress { + pub fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgress { RegionReadProgress { - core: Mutex::new(RegionReadProgressCore::new(region, applied_index, cap, tag)), + core: Mutex::new(RegionReadProgressCore::new( + region, + applied_index, + cap, + peer_id, + )), safe_ts: AtomicU64::from(0), } } @@ -1283,7 +1332,7 @@ impl RegionReadProgress { #[derive(Debug)] pub struct RegionReadProgressCore { - tag: String, + peer_id: u64, region_id: u64, applied_index: u64, // A 
wrapper of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current @@ -1355,17 +1404,24 @@ fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { } impl RegionReadProgressCore { - fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgressCore { + fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgressCore { + // forbids stale read for witness + let is_witness = find_peer_by_id(region, peer_id).map_or(false, |p| p.is_witness); RegionReadProgressCore { - tag, + peer_id, region_id: region.get_id(), applied_index, read_state: ReadState::default(), leader_info: LocalLeaderInfo::new(region), pending_items: VecDeque::with_capacity(cap), last_merge_index: 0, - pause: false, - discard: false, + pause: is_witness, + discard: is_witness, } } @@ -1380,10 +1436,11 @@ impl RegionReadProgressCore { self.read_state.ts = cmp::min(source_safe_ts, target_safe_ts); info!( "reset safe_ts due to merge"; - "tag" => &self.tag, "source_safe_ts" => source_safe_ts, "target_safe_ts" => target_safe_ts, "safe_ts" => self.read_state.ts, + "region_id" => self.region_id, + "peer_id" => self.peer_id, ); if self.read_state.ts != target_safe_ts { Some(self.read_state.ts) @@ -2096,7 +2153,7 @@ mod tests { } let cap = 10; - let rrp = RegionReadProgress::new(&Default::default(), 10, cap, "".to_owned()); + let rrp = RegionReadProgress::new(&Default::default(), 10, cap, 1); for i in 1..=20 { rrp.update_safe_ts(i, i); } diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index 696caab7d69..ab83752d8c3 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -157,7 +157,7 @@ mod tests { region.set_start_key(kr.get_start_key().to_vec()); region.set_end_key(kr.get_end_key().to_vec()); region.set_peers(vec![kvproto::metapb::Peer::default()].into()); - let rrp 
= RegionReadProgress::new(®ion, 1, 1, "".to_owned()); + let rrp = RegionReadProgress::new(®ion, 1, 1, 1); rrp.update_safe_ts(1, safe_ts); assert_eq!(rrp.safe_ts(), safe_ts); meta.region_ranges.insert(enc_end_key(®ion), id); diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 7a680e4d7a6..0d396eae575 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -56,6 +56,7 @@ make_static_metric! { channel_full, cache_miss, safe_ts, + witness, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index d62f2f6c1db..2c92923fc4e 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -25,6 +25,7 @@ use tikv_util::{ codec::number::decode_u64, debug, error, lru::LruCache, + store::find_peer_by_id, time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; @@ -563,9 +564,11 @@ impl ReadDelegate { } debug!( "reject stale read by safe ts"; - "tag" => &self.tag, - "safe ts" => safe_ts, - "read ts" => read_ts + "safe_ts" => safe_ts, + "read_ts" => read_ts, + + "region_id" => self.region.get_id(), + "peer_id" => self.peer_id, ); TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.safe_ts.inc()); let mut response = cmd_resp::new_error(Error::DataIsNotReady { @@ -581,7 +584,7 @@ impl ReadDelegate { pub fn mock(region_id: u64) -> Self { let mut region: metapb::Region = Default::default(); region.set_id(region_id); - let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, "mock".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)); ReadDelegate { region: Arc::new(region), peer_id: 1, @@ -782,6 +785,15 @@ where return Err(e); } + // Check witness + if find_peer_by_id(&delegate.region, delegate.peer_id) + .unwrap() + .is_witness + { + 
TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::RecoveryInProgress(region_id)); + } + // Check term. if let Err(e) = util::check_term(req, delegate.term) { debug!( @@ -1241,7 +1253,7 @@ mod tests { region1.set_region_epoch(epoch13.clone()); let term6 = 6; let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. - let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); let mut cmd = RaftCmdRequest::default(); let mut header = RaftRequestHeader::default(); @@ -1573,7 +1585,7 @@ mod tests { txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), txn_ext: Arc::new(TxnExt::default()), track_ver: TrackVer::new(), - read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, "".to_owned())), + read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, bucket_meta: None, }; @@ -1680,7 +1692,7 @@ mod tests { let leader = prs[0].clone(); region.set_region_epoch(region_epoch); let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. - let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); // Register region lease.renew(monotonic_raw_now()); diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index f23bc7e3b12..513d08643a7 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -215,13 +215,13 @@ impl Operator { } else { ConfChangeType::AddNode }; - new_pd_change_peer(conf_change_type, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(conf_change_type, peer.clone())]) } else { pdpb::RegionHeartbeatResponse::default() } } Operator::RemovePeer { ref peer, .. 
} => { - new_pd_change_peer(ConfChangeType::RemoveNode, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(ConfChangeType::RemoveNode, peer.clone())]) } Operator::TransferLeader { ref peer, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 1e35dc0cf13..eb8ab3fe885 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -301,7 +301,6 @@ pub fn new_transfer_leader_cmd(peer: metapb::Peer) -> AdminRequest { cmd } -#[allow(dead_code)] pub fn new_prepare_merge(target_region: metapb::Region) -> AdminRequest { let mut cmd = AdminRequest::default(); cmd.set_cmd_type(AdminCmdType::PrepareMerge); diff --git a/components/tikv_util/src/store/mod.rs b/components/tikv_util/src/store/mod.rs index f4bfea93519..9a36961c202 100644 --- a/components/tikv_util/src/store/mod.rs +++ b/components/tikv_util/src/store/mod.rs @@ -5,7 +5,10 @@ pub mod query_stats; pub mod region; pub use self::{ - peer::{find_peer, find_peer_mut, is_learner, new_learner_peer, new_peer, remove_peer}, + peer::{ + find_peer, find_peer_by_id, find_peer_mut, is_learner, new_learner_peer, new_peer, + new_witness_peer, remove_peer, + }, query_stats::{is_read_query, QueryStats}, region::{ check_key_in_region, check_key_in_region_exclusive, check_key_in_region_inclusive, diff --git a/components/tikv_util/src/store/peer.rs b/components/tikv_util/src/store/peer.rs index 59844bc957a..bbc96bb786f 100644 --- a/components/tikv_util/src/store/peer.rs +++ b/components/tikv_util/src/store/peer.rs @@ -16,6 +16,10 @@ pub fn find_peer_mut(region: &mut Region, store_id: u64) -> Option<&mut Peer> { .find(|p| p.get_store_id() == store_id) } +pub fn find_peer_by_id(region: &Region, peer_id: u64) -> Option<&Peer> { + region.get_peers().iter().find(|&p| p.get_id() == peer_id) +} + pub fn remove_peer(region: &mut Region, store_id: u64) -> Option { region .get_peers() @@ -45,6 +49,15 @@ pub fn is_learner(peer: &Peer) -> bool { peer.get_role() == 
PeerRole::Learner } +pub fn new_witness_peer(store_id: u64, peer_id: u64) -> Peer { + let mut peer = Peer::default(); + peer.set_store_id(store_id); + peer.set_id(peer_id); + peer.set_role(PeerRole::Voter); + peer.set_is_witness(true); + peer +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/tikv_util/src/store/region.rs b/components/tikv_util/src/store/region.rs index 580d940ebeb..58af4e9fdfa 100644 --- a/components/tikv_util/src/store/region.rs +++ b/components/tikv_util/src/store/region.rs @@ -32,9 +32,11 @@ pub fn region_on_same_stores(lhs: &Region, rhs: &Region) -> bool { // Because every store can only have one replica for the same region, // so just one round check is enough. lhs.get_peers().iter().all(|lp| { - rhs.get_peers() - .iter() - .any(|rp| rp.get_store_id() == lp.get_store_id() && rp.get_role() == lp.get_role()) + rhs.get_peers().iter().any(|rp| { + rp.get_store_id() == lp.get_store_id() + && rp.get_role() == lp.get_role() + && rp.get_is_witness() == lp.get_is_witness() + }) }) } diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 7b29976f218..0230174fb42 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -30,9 +30,10 @@ use grpcio::{ RpcStatusCode, WriteFlags, }; use kvproto::{ - raft_serverpb::{Done, RaftMessage}, + raft_serverpb::{Done, RaftMessage, RaftSnapshotData}, tikvpb::{BatchRaftMessage, TikvClient}, }; +use protobuf::Message; use raft::SnapshotStatus; use raftstore::{errors::DiscardReason, router::RaftStoreRouter}; use security::SecurityManager; @@ -483,11 +484,17 @@ where None => return, }; if msg.get_message().has_snapshot() { - self.send_snapshot_sock(msg); - continue; - } else { - self.buffer.push(msg); + let mut snapshot = RaftSnapshotData::default(); + snapshot + .merge_from_bytes(msg.get_message().get_snapshot().get_data()) + .unwrap(); + // Witness's snapshot must be empty, no need to send snapshot files + if !snapshot.get_meta().get_for_witness() { + 
self.send_snapshot_sock(msg); + continue; + } } + self.buffer.push(msg); } } } diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index 9d648c06c8c..08657f7e75a 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -32,3 +32,4 @@ mod test_transfer_leader; mod test_transport; mod test_unsafe_recovery; mod test_update_region_size; +mod test_witness; diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index 505bd3bd0e4..a2c2ea75c64 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -677,7 +677,6 @@ fn test_force_leader_on_hibernated_leader() { // previous follower. #[test] fn test_force_leader_on_hibernated_follower() { - test_util::init_log_for_test(); let mut cluster = new_node_cluster(0, 5); cluster.pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs new file mode 100644 index 00000000000..8e36510753e --- /dev/null +++ b/tests/integrations/raftstore/test_witness.rs @@ -0,0 +1,537 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{iter::FromIterator, sync::Arc, time::Duration}; + +use futures::executor::block_on; +use kvproto::{metapb, raft_cmdpb::ChangePeerRequest, raft_serverpb::PeerState}; +use pd_client::PdClient; +use raft::eraftpb::ConfChangeType; +use test_raftstore::*; +use tikv_util::store::find_peer; + +fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(true); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +fn become_non_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(false); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +// Test the case that region split or merge with witness peer +#[test] +fn test_witness_split_merge() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + let before = cluster + .apply_state(region.get_id(), nodes[2]) + .get_applied_index(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k2", 
b"v2"); + cluster.must_split(®ion, b"k2"); + must_get_none(&cluster.get_engine(3), b"k1"); + must_get_none(&cluster.get_engine(3), b"k2"); + // applied index of witness is updated + let after = cluster + .apply_state(region.get_id(), nodes[2]) + .get_applied_index(); + assert!(after - before >= 3); + + // the newly split peer should be witness as well + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k2"); + assert_ne!(left.get_id(), right.get_id()); + assert!(find_peer(&left, nodes[2]).unwrap().is_witness); + assert!(find_peer(&right, nodes[2]).unwrap().is_witness); + + // merge + pd_client.must_merge(left.get_id(), right.get_id()); + let after_merge = cluster.get_region(b"k1"); + assert!(find_peer(&after_merge, nodes[2]).unwrap().is_witness); + must_get_none(&cluster.get_engine(3), b"k1"); + must_get_none(&cluster.get_engine(3), b"k2"); + // epoch of witness is updated + assert_eq!( + cluster + .region_local_state(after_merge.get_id(), nodes[2]) + .get_region() + .get_region_epoch(), + after_merge.get_region_epoch() + ); + + // split again + cluster.must_split(&after_merge, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k2"); + assert!(find_peer(&left, nodes[2]).unwrap().is_witness); + assert!(find_peer(&right, nodes[2]).unwrap().is_witness); + + // can't merge with different witness location + let mut peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); + become_non_witness(&cluster, left.get_id(), &mut peer_on_store3); + let left = cluster.get_region(b"k1"); + let req = new_admin_request( + left.get_id(), + left.get_region_epoch(), + new_prepare_merge(right), + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("peers doesn't match") + ); +} + +// Test flow of witness conf change +#[test] +fn test_witness_conf_change() { + let mut cluster = new_server_cluster(0, 3); + 
cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // can't switch witness by conf change + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + let mut peer = peer_on_store3.clone(); + peer.set_is_witness(true); + let mut cp = ChangePeerRequest::default(); + cp.set_change_type(ConfChangeType::AddLearnerNode); + cp.set_peer(peer); + let req = new_admin_request( + region.get_id(), + region.get_region_epoch(), + new_change_peer_v2_request(vec![cp]), + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert!(resp.get_header().has_error()); + + // add a new witness peer + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + peer_on_store3.set_is_witness(true); + let applied_index = cluster.apply_state(1, 2).applied_index; + cluster + .pd_client + .must_add_peer(region.get_id(), peer_on_store3.clone()); + must_get_none(&cluster.get_engine(3), b"k1"); + let region = cluster.get_region(b"k1"); + cluster.wait_applied_index(region.get_id(), nodes[2], applied_index + 1); + assert_eq!( + cluster + .region_local_state(region.get_id(), nodes[2]) + .get_region(), + ®ion + ); + + // remove a witness peer + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3); + + assert_eq!( + cluster + .region_local_state(region.get_id(), nodes[2]) + .get_state(), + PeerState::Tombstone + ); +} + +// #[test] +// // Test flow of switch witness +// fn test_witness_switch_witness() { +// let mut cluster = new_server_cluster(0, 3); +// 
cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k1", b"v1"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + +// std::thread::sleep(Duration::from_millis(100)); +// must_get_none(&cluster.get_engine(3), b"k1"); + +// // witness -> nonwitness +// peer_on_store3.set_role(metapb::PeerRole::Learner); +// cluster +// .pd_client +// .must_add_peer(region.get_id(), peer_on_store3.clone()); +// cluster +// .pd_client +// .must_remove_peer(region.get_id(), peer_on_store3.clone()); +// peer_on_store3.set_is_witness(false); +// cluster +// .pd_client +// .must_add_peer(region.get_id(), peer_on_store3.clone()); +// std::thread::sleep(Duration::from_millis(100)); +// must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +// } + +// TODO: add back when switch witness is supported +// // Test the case that leader is forbidden to become witness +// #[test] +// fn test_witness_leader() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k1", b"v1"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let mut peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + +// // can't make leader to witness +// peer_on_store1.set_is_witness(true); +// 
cluster +// .pd_client +// .add_peer(region.get_id(), peer_on_store1.clone()); + +// std::thread::sleep(Duration::from_millis(100)); +// assert_eq!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// 1 +// ); +// // leader changes to witness failed, so still can get the value +// must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); + +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// // can't transfer leader to witness +// cluster.transfer_leader(region.get_id(), &mut peer_on_store3); +// assert_eq!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// nodes[0], +// ); +// } + +// TODO: add back when election priority is supported +// // Test the case that witness can't be elected as leader based on election +// // priority when there is no log gap +// #[test] +// fn test_witness_election_priority() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); +// cluster.must_put(b"k0", b"v0"); + +// // make sure logs are replicated to the witness +// std::thread::sleep(Duration::from_millis(100)); + +// for i in 1..10 { +// let node = +// cluster.leader_of_region(region.get_id()).unwrap().store_id; cluster. 
+// stop_node(node); let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = v.as_bytes(); +// cluster.must_put(key, value); +// // the witness can't be elected as the leader when there is no log +// gap assert_ne!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// nodes[2], +// ); +// cluster.run_node(node).unwrap(); +// } +// } + +// TODO: add back when raft log gc logic is updated for witness +// // Test the case that truncated index won't advance when there is a witness +// even // if the gap gap exceeds the gc count limit +// #[test] +// fn test_witness_raftlog_gc_lagged_follower() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k0", b"v0"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(200)); +// let mut before_states = HashMap::default(); +// for (&id, engines) in &cluster.engines { +// let mut state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); before_states.insert(id, +// state.take_truncated_state()); } + +// // one follower is down +// cluster.stop_node(nodes[1]); + +// // write some data to make log gap exceeds the gc limit +// for i in 1..1000 { +// let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = 
v.as_bytes(); +// cluster.must_put(key, value); +// } + +// // the truncated index is not advanced +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); assert!(state.get_truncated_state(). +// get_index() - before_states[&id].get_index() < 10); } + +// // the follower is back online +// cluster.run_node(nodes[1]).unwrap(); +// cluster.must_put(b"k00", b"v00"); +// must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(300)); + +// // the truncated index is advanced now, as all the peers has replicated +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); assert_ge!( +// state.get_truncated_state().get_index() - +// before_states[&id].get_index(), 900 +// ); +// } +// } + +// TODO: add back when raft log gc logic is updated for witness +// // Test the case that truncated index is advance when there is a lagged +// witness #[test] +// fn test_witness_raftlog_gc_lagged_witness() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); +// cluster.must_put(b"k0", b"v0"); + +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(200)); +// 
let mut before_states = HashMap::default(); +// for (&id, engines) in &cluster.engines { +// let mut state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); before_states.insert(id, +// state.take_truncated_state()); } + +// // the witness is down +// cluster.stop_node(nodes[2]); + +// // write some data to make log gap exceeds the gc limit +// for i in 1..1000 { +// let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = v.as_bytes(); +// cluster.must_put(key, value); +// } + +// // the witness is back online +// cluster.run_node(nodes[2]).unwrap(); + +// cluster.must_put(b"k00", b"v00"); +// std::thread::sleep(Duration::from_millis(200)); + +// // the truncated index is advanced +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); println!("{} {}", id, +// state.get_truncated_state().get_index()); assert_ge!( +// state.get_truncated_state().get_index() - +// before_states[&id].get_index(), 900 +// ); +// } +// } + +// Test the case replica read can't be performed on witness peer. 
+#[test] +fn test_witness_replica_read() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .call_command_on_node(nodes[2], request, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); +} + +fn must_get_error_recovery_in_progress( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + true, + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + }, + "{:?}", + resp + ); +} + +// Test the case that witness replicate logs to lagging behind follower when +// leader is down +#[test] +fn test_witness_leader_down() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = 
Vec::from_iter(cluster.get_node_ids()); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + + let mut peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + // nonwitness -> witness + become_witness(&cluster, region.get_id(), &mut peer_on_store2); + + // the other follower is isolated + cluster.add_send_filter(IsolationFilterFactory::new(3)); + for i in 1..10 { + cluster.must_put(format!("k{}", i).as_bytes(), format!("v{}", i).as_bytes()); + } + // the leader is down + cluster.stop_node(1); + + // witness would help to replicate the logs + cluster.clear_send_filters(); + + // forbid writes + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + // forbid reads + let get = new_get_cmd(b"k1"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, get); + // forbid read index + let read_index = new_read_index_cmd(); + must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store3); + cluster.must_put(b"k1", b"v1"); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + assert_eq!(cluster.must_get(b"k9"), Some(b"v9".to_vec())); +} From f2e89a4e80e2b99d7ac2abe74d38d4f6eac9ceb6 Mon Sep 17 00:00:00 2001 From: lijie Date: Wed, 9 Nov 2022 17:51:51 +0800 Subject: [PATCH 322/676] bump version to 6.5.0-alpha (#13740) close tikv/tikv#13763 Signed-off-by: lijie Co-authored-by: Yilin Chen Co-authored-by: Lifu Wu --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 
a1b238d0148..9aa43209906 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6177,7 +6177,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.4.0-alpha" +version = "6.5.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index a408e4a84ca..f75a4a6511f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.4.0-alpha" +version = "6.5.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From e87d16d0b17e507e4fea0923c8438ddfd0296a0e Mon Sep 17 00:00:00 2001 From: haojinming Date: Thu, 10 Nov 2022 10:37:53 +0800 Subject: [PATCH 323/676] cdc: remove deprecated config (#13762) close tikv/tikv#13761 raw-min-ts-outlier-threshold is marked as deprecated from v6.4. It should be removed from v6.5. https://docs.pingcap.com/zh/tidb/dev/tikv-configuration-file#raw-min-ts-outlier-threshold-%E4%BB%8E-v620-%E7%89%88%E6%9C%AC%E5%BC%80%E5%A7%8B%E5%BC%95%E5%85%A5 Signed-off-by: haojinming Co-authored-by: Ti Chi Robot --- src/config.rs | 13 ------------- tests/integrations/config/mod.rs | 1 - 2 files changed, 14 deletions(-) diff --git a/src/config.rs b/src/config.rs index c978b1bf90a..e9c4c2bb85b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2566,9 +2566,6 @@ pub struct CdcConfig { // Deprecated! preserved for compatibility check. #[online_config(skip)] #[doc(hidden)] - pub raw_min_ts_outlier_threshold: ReadableDuration, - #[online_config(skip)] - #[doc(hidden)] #[serde(skip_serializing)] pub old_value_cache_size: usize, } @@ -2591,8 +2588,6 @@ impl Default for CdcConfig { sink_memory_quota: ReadableSize::mb(512), // 512MB memory for old value cache. old_value_cache_memory_quota: ReadableSize::mb(512), - // Trigger raw region outlier judgement if resolved_ts's lag is over 60s. - raw_min_ts_outlier_threshold: ReadableDuration::secs(60), // Deprecated! preserved for compatibility check. 
old_value_cache_size: 0, } @@ -2634,14 +2629,6 @@ impl CdcConfig { ); self.incremental_scan_ts_filter_ratio = default_cfg.incremental_scan_ts_filter_ratio; } - if self.raw_min_ts_outlier_threshold.is_zero() { - warn!( - "cdc.raw_min_ts_outlier_threshold should be larger than 0, - change it to {}", - default_cfg.raw_min_ts_outlier_threshold - ); - self.raw_min_ts_outlier_threshold = default_cfg.raw_min_ts_outlier_threshold; - } Ok(()) } } diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a61b66e1436..5cb8c837fb1 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -790,7 +790,6 @@ fn test_serde_custom_tikv_config() { tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), - raw_min_ts_outlier_threshold: ReadableDuration::secs(60), }; value.resolved_ts = ResolvedTsConfig { enable: true, From 0339d5188d6b9e35bc7250e1a2f3cdb008f238da Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 10 Nov 2022 15:35:53 +0800 Subject: [PATCH 324/676] storage: calculate last_change_ts in rollback (#13749) ref tikv/tikv#13694 This commit supports calculating last_change_ts when writing a new Rollback record. To get the correct last_change_ts, we always call seek_write to find the last write record before start_ts. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/txn/actions/check_txn_status.rs | 41 +++++- .../txn/commands/check_secondary_locks.rs | 129 +++++++++++++++++- src/storage/txn/commands/check_txn_status.rs | 111 ++++++++++++++- 3 files changed, 274 insertions(+), 7 deletions(-) diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 4c900e5a438..126c34ade92 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -8,6 +8,7 @@ use crate::storage::{ metrics::MVCC_CHECK_TXN_STATUS_COUNTER_VEC, reader::OverlappedWrite, ErrorInner, LockType, MvccTxn, ReleasedLock, Result, SnapshotReader, TxnCommitRecord, }, + txn::{sched_pool::tls_can_enable, scheduler::LAST_CHANGE_TS}, Snapshot, TxnStatus, }; @@ -134,7 +135,8 @@ pub fn check_txn_status_missing_lock( // Insert a Rollback to Write CF in case that a stale prewrite // command is received after a cleanup command. - if let Some(write) = action.construct_write(ts, overlapped_write) { + if let Some(mut write) = action.construct_write(ts, overlapped_write) { + update_last_change_for_rollback(reader, &mut write, &primary_key, ts)?; txn.put_write(primary_key, ts, write.as_ref().to_bytes()); } MVCC_CHECK_TXN_STATUS_COUNTER_VEC.rollback.inc(); @@ -168,7 +170,8 @@ pub fn rollback_lock( // Only the primary key of a pessimistic transaction needs to be protected. let protected: bool = is_pessimistic_txn && key.is_encoded_from(&lock.primary); - if let Some(write) = make_rollback(reader.start_ts, protected, overlapped_write) { + if let Some(mut write) = make_rollback(reader.start_ts, protected, overlapped_write) { + update_last_change_for_rollback(reader, &mut write, &key, lock.ts)?; txn.put_write(key.clone(), reader.start_ts, write.as_ref().to_bytes()); } @@ -192,6 +195,40 @@ pub fn collapse_prev_rollback( Ok(()) } +/// Updates the last_change_ts of a new Rollback record. 
+/// +/// When writing a new Rollback record, we don't always know about the +/// information about the last change. So, we will call `seek_write` again to +/// calculate the last_change_ts. +/// +/// The `seek_write` here is usually cheap because this functions is typically +/// called after `get_txn_commit_record` and `get_txn_commit_record` should have +/// moved the cursor around the record we want. +pub fn update_last_change_for_rollback( + reader: &mut SnapshotReader, + write: &mut Write, + key: &Key, + ts: TimeStamp, +) -> Result<()> { + // Also update the last_change_ts if we are writing an overlapped rollback to a + // LOCK record. Actually, because overlapped rollbacks are rare, it does not + // solve the inaccuracy caused by inserted rollback (and we don't intend it + // because it's uncommon). Just do it when it happens. + if tls_can_enable(LAST_CHANGE_TS) + && (write.write_type == WriteType::Lock || write.write_type == WriteType::Rollback) + { + if let Some((commit_ts, w)) = reader.seek_write(key, ts)? { + // Even with collapsed rollback, the deleted rollbacks will become tombstones + // which we probably need to skip them one by one. That's why we always use + // `next_last_change_info` here to calculate and count them in + // `versions_to_last_change`. + (write.last_change_ts, write.versions_to_last_change) = + w.next_last_change_info(commit_ts); + } + } + Ok(()) +} + /// Generate the Write record that should be written that means to perform a /// specified rollback operation. 
pub fn make_rollback( diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 1a4b547b6d7..bd494e91edc 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -8,7 +8,9 @@ use crate::storage::{ lock_manager::LockManager, mvcc::{LockType, MvccTxn, SnapshotReader, TimeStamp, TxnCommitRecord}, txn::{ - actions::check_txn_status::{collapse_prev_rollback, make_rollback}, + actions::check_txn_status::{ + collapse_prev_rollback, make_rollback, update_last_change_for_rollback, + }, commands::{ Command, CommandExt, ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, @@ -119,7 +121,10 @@ impl WriteCommand for CheckSecondaryLocks { } // We must protect this rollback in case this rollback is collapsed and a stale // acquire_pessimistic_lock and prewrite succeed again. - if let Some(write) = make_rollback(self.start_ts, true, rollback_overlapped_write) { + if let Some(mut write) = + make_rollback(self.start_ts, true, rollback_overlapped_write) + { + update_last_change_for_rollback(&mut reader, &mut write, &key, self.start_ts)?; txn.put_write(key.clone(), self.start_ts, write.as_ref().to_bytes()); collapse_prev_rollback(&mut txn, &mut reader, &key)?; } @@ -165,14 +170,20 @@ impl WriteCommand for CheckSecondaryLocks { pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; + use tikv_kv::Statistics; use tikv_util::deadline::Deadline; + use txn_types::Mutation; use super::*; use crate::storage::{ kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, + txn::{ + commands::{test_util::prewrite_with_cm, WriteCommand}, + scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, + tests::*, + }, Engine, }; @@ -343,4 +354,116 @@ pub mod tests { } must_get_overlapped_rollback(&mut 
engine, b"k1", 15, 13, WriteType::Lock, Some(0)); } + + // The main logic is almost identical to + // test_rollback_calculate_last_change_info of check_txn_status. But the small + // differences about handling lock CF make it difficult to reuse code. + #[test] + fn test_rollback_calculate_last_change_info() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let cm = ConcurrencyManager::new(1.into()); + let k = b"k"; + let mut statistics = Statistics::default(); + + must_prewrite_put(&mut engine, k, b"v1", k, 5); + must_commit(&mut engine, k, 5, 6); + must_prewrite_put(&mut engine, k, b"v2", k, 7); + must_commit(&mut engine, k, 7, 8); + must_prewrite_put(&mut engine, k, b"v3", k, 30); + must_commit(&mut engine, k, 30, 35); + + // TiKV 6.4 should not write last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + must_success(&mut engine, k, 40, SecondaryLocksStatus::RolledBack); + let rollback = must_written(&mut engine, k, 40, 40, WriteType::Rollback); + assert!(rollback.last_change_ts.is_zero()); + assert_eq!(rollback.versions_to_last_change, 0); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + must_prewrite_put(&mut engine, k, b"v4", k, 45); + must_commit(&mut engine, k, 45, 50); + + // Rollback when there is no lock; prev writes: + // - 50: PUT + must_success(&mut engine, k, 55, SecondaryLocksStatus::RolledBack); + let rollback = must_written(&mut engine, k, 55, 55, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + assert_eq!(rollback.versions_to_last_change, 1); + + // Write a LOCK; prev writes: + // - 55: ROLLBACK + // - 50: PUT + let res = prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + 
vec![Mutation::make_lock(Key::from_raw(k))], + k.to_vec(), + 60, + Some(70), + ) + .unwrap(); + assert!(!res.one_pc_commit_ts.is_zero()); + let lock_commit_ts = res.one_pc_commit_ts; + let lock = must_written(&mut engine, k, 60, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 2); + + // Write another ROLLBACK by rolling back a pessimistic lock; prev writes: + // - 61: LOCK + // - 55: ROLLBACK + // - 50: PUT + must_acquire_pessimistic_lock(&mut engine, k, b"pk", 70, 75); + must_success(&mut engine, k, 70, SecondaryLocksStatus::RolledBack); + let rollback = must_written(&mut engine, k, 70, 70, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + assert_eq!(rollback.versions_to_last_change, 3); + + // last_change_ts should point to the latest record before start_ts; prev + // writes: + // - 8: PUT + must_acquire_pessimistic_lock(&mut engine, k, k, 10, 75); + must_success(&mut engine, k, 10, SecondaryLocksStatus::RolledBack); + must_unlocked(&mut engine, k); + let rollback = must_written(&mut engine, k, 10, 10, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 8.into()); + assert_eq!(rollback.versions_to_last_change, 1); + + // Overlapped rollback should not update the last_change_ts of PUT; prev writes: + // - 8: PUT <- rollback overlaps + // - 6: PUT + must_success(&mut engine, k, 8, SecondaryLocksStatus::RolledBack); + let put = must_written(&mut engine, k, 7, 8, WriteType::Put); + assert!(put.last_change_ts.is_zero()); + assert_eq!(put.versions_to_last_change, 0); + assert!(put.has_overlapped_rollback); + + // Overlapped rollback can update the last_change_ts of LOCK; writes: + // - 61: PUT <- rollback overlaps + // - 57: ROLLBACK (inserted later) + // - 55: ROLLBACK + // - 50: PUT + must_rollback(&mut engine, k, 57, true); + let rollback = must_written(&mut engine, k, 57, 57, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + 
assert_eq!(rollback.versions_to_last_change, 2); + must_success( + &mut engine, + k, + lock_commit_ts, + SecondaryLocksStatus::RolledBack, + ); + let lock = must_written(&mut engine, k, 60, lock_commit_ts, WriteType::Lock); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 3); + } } diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 58f7f557448..b74e7d5cb7c 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -144,8 +144,9 @@ impl WriteCommand for CheckTxnStatus { pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; + use tikv_kv::Statistics; use tikv_util::deadline::Deadline; - use txn_types::{Key, WriteType}; + use txn_types::{Key, Mutation, WriteType}; use super::{TxnStatus::*, *}; use crate::storage::{ @@ -153,7 +154,9 @@ pub mod tests { lock_manager::MockLockManager, mvcc::tests::*, txn::{ - commands::{pessimistic_rollback, WriteCommand, WriteContext}, + commands::{ + pessimistic_rollback, test_util::prewrite_with_cm, WriteCommand, WriteContext, + }, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, }, @@ -1163,4 +1166,108 @@ pub mod tests { must_unlocked(&mut engine, k); must_get_rollback_ts(&mut engine, k, ts(50, 0)); } + + #[test] + fn test_rollback_calculate_last_change_info() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let cm = ConcurrencyManager::new(1.into()); + let k = b"k"; + let mut statistics = Statistics::default(); + + must_prewrite_put(&mut engine, k, b"v1", k, 5); + must_commit(&mut engine, k, 5, 6); + must_prewrite_put(&mut engine, k, b"v2", k, 7); + must_commit(&mut engine, k, 7, 8); + must_prewrite_put(&mut engine, k, b"v3", k, 30); + must_commit(&mut engine, k, 30, 35); + + 
// TiKV 6.4 should not write last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + must_rollback(&mut engine, k, 40, true); + let rollback = must_written(&mut engine, k, 40, 40, WriteType::Rollback); + assert!(rollback.last_change_ts.is_zero()); + assert_eq!(rollback.versions_to_last_change, 0); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + must_prewrite_put(&mut engine, k, b"v4", k, 45); + must_commit(&mut engine, k, 45, 50); + + // Rollback when there is no lock; prev writes: + // - 50: PUT + must_rollback(&mut engine, k, 55, true); + let rollback = must_written(&mut engine, k, 55, 55, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + assert_eq!(rollback.versions_to_last_change, 1); + + // Write a LOCK; prev writes: + // - 55: ROLLBACK + // - 50: PUT + let res = prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + vec![Mutation::make_lock(Key::from_raw(k))], + k.to_vec(), + 60, + Some(70), + ) + .unwrap(); + assert!(!res.one_pc_commit_ts.is_zero()); + let lock_commit_ts = res.one_pc_commit_ts; + let lock = must_written(&mut engine, k, 60, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 2); + + // Write another ROLLBACK; prev writes: + // - 61: LOCK + // - 55: ROLLBACK + // - 50: PUT + must_rollback(&mut engine, k, 70, true); + let rollback = must_written(&mut engine, k, 70, 70, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + assert_eq!(rollback.versions_to_last_change, 3); + + // last_change_ts should point to the latest record before start_ts; prev + // writes: + // - 8: PUT + must_acquire_pessimistic_lock(&mut engine, k, k, 10, 75); + must_pessimistic_prewrite_put(&mut engine, k, b"v5", k, 10, 75, DoPessimisticCheck); + must_rollback(&mut engine, 
k, 10, true); + must_unlocked(&mut engine, k); + let rollback = must_written(&mut engine, k, 10, 10, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 8.into()); + assert_eq!(rollback.versions_to_last_change, 1); + + // Overlapped rollback should not update the last_change_ts of PUT; prev writes: + // - 8: PUT <- rollback overlaps + // - 6: PUT + must_rollback(&mut engine, k, 8, true); + let put = must_written(&mut engine, k, 7, 8, WriteType::Put); + assert!(put.last_change_ts.is_zero()); + assert_eq!(put.versions_to_last_change, 0); + assert!(put.has_overlapped_rollback); + + // Overlapped rollback can update the last_change_ts of LOCK; writes: + // - 61: PUT <- rollback overlaps + // - 57: ROLLBACK (inserted later) + // - 55: ROLLBACK + // - 50: PUT + must_rollback(&mut engine, k, 57, true); + let rollback = must_written(&mut engine, k, 57, 57, WriteType::Rollback); + assert_eq!(rollback.last_change_ts, 50.into()); + assert_eq!(rollback.versions_to_last_change, 2); + must_rollback(&mut engine, k, lock_commit_ts, true); + let lock = must_written(&mut engine, k, 60, lock_commit_ts, WriteType::Lock); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 3); + } } From fe997db4db8a5a096f8a45c0db3eb3c2e5879262 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 10 Nov 2022 15:51:52 +0800 Subject: [PATCH 325/676] txn: save last change info in write records (#13759) ref tikv/tikv#13694 In this commit, the commit action will save the last_change_ts and versions_to_last_change in the lock to the write record. It is unncessary to check the write type because it is checked during prewrite. So, among the committable locks, only those of Lock type will have last_change_ts and versions_to_last_change. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/mvcc/reader/reader.rs | 3 +- src/storage/mvcc/reader/scanner/forward.rs | 10 ------- src/storage/txn/actions/commit.rs | 32 +++++++++++++++++++++- src/storage/txn/actions/prewrite.rs | 7 ----- src/storage/txn/commands/prewrite.rs | 8 ------ 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index d4767f3bb1a..8e35e00936e 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1395,7 +1395,8 @@ pub mod tests { let (commit_ts, write) = reader.seek_write(&k, 20.into()).unwrap().unwrap(); assert_eq!(commit_ts, 20.into()); - assert_eq!(write, Write::new(WriteType::Lock, 10.into(), None)); + assert_eq!(write.write_type, WriteType::Lock); + assert_eq!(write.start_ts, 10.into()); assert_eq!(reader.statistics.write.seek, 1); assert_eq!(reader.statistics.write.next, 1); diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 03f44deed7c..32898f1bfe7 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -2366,16 +2366,6 @@ mod delta_entry_tests { #[test] fn test_mess() { - use pd_client::FeatureGate; - - use crate::storage::txn::sched_pool::set_tls_feature_gate; - - // Set version to 6.5.0 to enable last_change_ts. - // TODO: Remove this after TiKV version reaches 6.5 - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); - // TODO: non-pessimistic lock should be returned enven if its ts < from_ts. 
// (key, lock, [commit1, commit2, ...]) // Values ends with 'L' will be made larger than `SHORT_VALUE_MAX_LEN` so it diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index eb798090ba2..2ba4f527d0e 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -91,7 +91,8 @@ pub fn commit( WriteType::from_lock_type(lock.lock_type).unwrap(), reader.start_ts, lock.short_value.take(), - ); + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change); for ts in &lock.rollback_ts { if *ts == commit_ts { @@ -320,4 +321,33 @@ pub mod tests { must_err(&mut engine, k, ts(60, 0), ts(65, 0)); must_succeed(&mut engine, k, ts(60, 0), ts(80, 0)); } + + #[test] + fn test_inherit_last_change_info_from_lock() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k = b"k"; + must_prewrite_put(&mut engine, k, b"v1", k, 5); + must_succeed(&mut engine, k, 5, 10); + + // WriteType is Lock + must_prewrite_lock(&mut engine, k, k, 15); + let lock = must_locked(&mut engine, k, 15); + assert_eq!(lock.last_change_ts, 10.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_succeed(&mut engine, k, 15, 20); + let write = must_written(&mut engine, k, 15, 20, WriteType::Lock); + assert_eq!(write.last_change_ts, 10.into()); + assert_eq!(write.versions_to_last_change, 1); + + // WriteType is Put + must_prewrite_put(&mut engine, k, b"v2", k, 25); + let lock = must_locked(&mut engine, k, 25); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_succeed(&mut engine, k, 25, 30); + let write = must_written(&mut engine, k, 25, 30, WriteType::Put); + assert!(write.last_change_ts.is_zero()); + assert_eq!(write.versions_to_last_change, 0); + } } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 8abaf1428e4..4c13a9d244b 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -2412,13 
+2412,6 @@ pub mod tests { #[test] fn test_inherit_last_change_ts_from_pessimistic_lock() { use engine_traits::CF_LOCK; - use pd_client::FeatureGate; - - use crate::storage::txn::sched_pool::set_tls_feature_gate; - - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let key = b"k"; diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 06f9cd1f818..6b54a1f88db 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -2516,10 +2516,6 @@ mod tests { let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); - let key = b"k"; let value = b"v"; must_prewrite_put(&mut engine, key, value, key, 10); @@ -2606,10 +2602,6 @@ mod tests { let mut engine = TestEngineBuilder::new().build().unwrap(); let cm = concurrency_manager::ConcurrencyManager::new(1.into()); - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); - let key = b"k"; let value = b"v"; must_prewrite_put(&mut engine, key, value, key, 10); From 60ded8941095c4ae307979cedb4ef28f7e9fcf40 Mon Sep 17 00:00:00 2001 From: qupeng Date: Fri, 11 Nov 2022 12:57:53 +0800 Subject: [PATCH 326/676] cdc: set min-ts-interval to 200ms to reduce latency (#12839) close tikv/tikv#12840 cdc: set min-ts-interval to 200ms to reduce latency Signed-off-by: qupeng Co-authored-by: Neil Shen --- components/cdc/src/endpoint.rs | 7 +++++-- src/config.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 8aa6aad3c29..4086c8623b5 100644 --- 
a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1399,7 +1399,10 @@ mod tests { #[test] fn test_api_version_check() { - let cfg = CdcConfig::default(); + let mut cfg = CdcConfig::default(); + // To make the case more stable. + cfg.min_ts_interval = ReadableDuration(Duration::from_secs(1)); + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = crate::channel::MemoryQuota::new(usize::MAX); @@ -1539,7 +1542,7 @@ mod tests { } let diff = cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.min_ts_interval, ReadableDuration::secs(1)); + assert_eq!(ep.config.min_ts_interval, ReadableDuration::millis(200)); assert_eq!(ep.config.hibernate_regions_compatible, true); { diff --git a/src/config.rs b/src/config.rs index e9c4c2bb85b..c33c8e8b63c 100644 --- a/src/config.rs +++ b/src/config.rs @@ -2573,7 +2573,7 @@ pub struct CdcConfig { impl Default for CdcConfig { fn default() -> Self { Self { - min_ts_interval: ReadableDuration::secs(1), + min_ts_interval: ReadableDuration::millis(200), hibernate_regions_compatible: true, // 4 threads for incremental scan. incremental_scan_threads: 4, From d23bea3fa69ae0060e94e9066ec883769f49bf87 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 11 Nov 2022 14:55:53 +0800 Subject: [PATCH 327/676] raftstore-v2: add proposal conflict checker (#13737) close tikv/tikv#12842 The implementation is a bit simplified and adjusted for v2, including: - Not require term for every methods, because when leader's term is changed, it must become follower. In this case, the checker won't be accessed until it becomes leader again. So pass term in every methods is useless. - Use the checker to detect if it's splitting and merging. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/peer.rs | 2 +- .../src/operation/command/admin/mod.rs | 29 +- .../src/operation/command/admin/split.rs | 8 +- .../src/operation/command/control.rs | 428 ++++++++++++++++++ .../raftstore-v2/src/operation/command/mod.rs | 23 +- .../src/operation/command/write/mod.rs | 61 ++- .../operation/command/write/simple_write.rs | 31 +- components/raftstore-v2/src/operation/mod.rs | 4 +- .../raftstore-v2/src/operation/query/lease.rs | 8 +- .../raftstore-v2/src/operation/query/mod.rs | 22 +- .../raftstore-v2/src/operation/ready/mod.rs | 107 ++++- components/raftstore-v2/src/raft/peer.rs | 56 +-- .../src/router/response_channel.rs | 7 + .../raftstore-v2/tests/integrations/mod.rs | 2 + .../tests/integrations/test_read.rs | 10 +- 15 files changed, 698 insertions(+), 100 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/control.rs diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 7083a9e529c..b74f8b46b1c 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -241,6 +241,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } } // TODO: instead of propose pending commands immediately, we should use timeout. - self.fsm.peer.propose_pending_command(self.store_ctx); + self.fsm.peer.propose_pending_writes(self.store_ctx); } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index afaefeb9b7e..c3fe2cceded 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -68,9 +68,13 @@ impl Peer { ch.report_error(resp); return; } - // To maintain propose order, we need to make pending proposal first. 
- self.propose_pending_command(ctx); let cmd_type = req.get_admin_request().get_cmd_type(); + if let Some(conflict) = self.proposal_control_mut().check_conflict(Some(cmd_type)) { + conflict.delay_channel(ch); + return; + } + // To maintain propose order, we need to make pending proposal first. + self.propose_pending_writes(ctx); let res = if apply::is_conf_change_cmd(&req) { self.propose_conf_change(ctx, req) } else { @@ -83,14 +87,19 @@ impl Peer { _ => unimplemented!(), } }; - if let Err(e) = &res { - info!( - self.logger, - "failed to propose admin command"; - "cmd_type" => ?cmd_type, - "error" => ?e, - ); + match &res { + Ok(index) => self + .proposal_control_mut() + .record_proposed_admin(cmd_type, *index), + Err(e) => { + info!( + self.logger, + "failed to propose admin command"; + "cmd_type" => ?cmd_type, + "error" => ?e, + ); + } } - self.post_propose_write(ctx, res, vec![ch]); + self.post_propose_command(ctx, res, vec![ch], true); } } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index c0d8998c4ad..198819cfd7b 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -71,12 +71,10 @@ impl Peer { mut req: RaftCmdRequest, ) -> Result { validate_batch_split(req.mut_admin_request(), self.region())?; - let mut proposal_ctx = ProposalContext::empty(); - proposal_ctx.insert(ProposalContext::SYNC_LOG); - proposal_ctx.insert(ProposalContext::SPLIT); - + // We rely on ConflictChecker to detect conflicts, so no need to set proposal + // context. 
let data = req.write_to_bytes().unwrap(); - self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + self.propose(store_ctx, data) } } diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs new file mode 100644 index 00000000000..5fb25b4e20d --- /dev/null +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -0,0 +1,428 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{collections::LinkedList, mem, num::NonZeroU64}; + +use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, +}; +use raftstore::{ + store::{ + cmd_resp, + fsm::apply, + msg::ErrorCallback, + util::{ + admin_cmd_epoch_lookup, AdminCmdEpochState, NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + }, + }, + Error, +}; + +use crate::router::CmdResChannel; + +#[derive(Debug)] +pub struct ProposedAdminCmd { + cmd_type: AdminCmdType, + committed: bool, + epoch_state: AdminCmdEpochState, + index: u64, + /// Callbacks of commands that are conflict with on going admin command. + /// + /// Callbacks are delayed to avoid making client retry with arbitrary + /// backoff. + delayed_chs: Vec, +} + +impl ProposedAdminCmd { + fn new( + cmd_type: AdminCmdType, + epoch_state: AdminCmdEpochState, + index: u64, + ) -> ProposedAdminCmd { + ProposedAdminCmd { + cmd_type, + committed: false, + epoch_state, + index, + delayed_chs: Vec::new(), + } + } + + pub fn cmd_type(&self) -> AdminCmdType { + self.cmd_type + } + + /// Delay responding to channel until the command is applied so client won't + /// retry with arbitrary timeout. + pub fn delay_channel(&mut self, ch: CmdResChannel) { + self.delayed_chs.push(ch); + } + + /// Same as `delay_channel`, but accepts a batch. 
+ pub fn delay_channels(&mut self, chs: Vec) { + if self.delayed_chs.is_empty() { + self.delayed_chs = chs; + } else { + self.delayed_chs.extend(chs); + } + } +} + +/// `ProposalControl` is a rewrite of `CmdEpochChecker` from v1. +/// +/// Admin command may change the epoch of a region. If a proposal is proposed +/// after the admin command is proposed but before the command is applied, the +/// proposal is probably to fail because of epoch not match. `ProposalControl` +/// aims to detect the failure early. With `ProposalControl`, users can assume +/// once a command is proposed, it's likely to succeed in the end. +/// +/// Compared to `CmdEpochChecker`, `ProposalControl` also traces the whole +/// lifetime of prepare merge. +pub struct ProposalControl { + // Use `LinkedList` to reduce memory footprint. In most cases, the list + // should be empty or 1 element. And access speed is not a concern. + proposed_admin_cmd: LinkedList, + pending_merge_index: u64, + term: u64, +} + +impl ProposalControl { + pub fn new(term: u64) -> ProposalControl { + ProposalControl { + proposed_admin_cmd: LinkedList::new(), + pending_merge_index: 0, + term, + } + } + + /// Clears all queued conflict callbacks if term changed. + /// + /// If term is changed, leader is probably changed. Clear all callbacks to + /// notify clients to retry with new leader. + #[inline] + pub fn maybe_update_term(&mut self, term: u64) { + match term.cmp(&self.term) { + std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Greater => { + for cmd in mem::take(&mut self.proposed_admin_cmd) { + for cb in cmd.delayed_chs { + apply::notify_stale_req(term, cb); + } + } + self.term = term; + } + std::cmp::Ordering::Less => { + panic!("term should not decrease, old {}, new {}", self.term, term) + } + } + } + + /// Check if a proposal is conflict with proposed admin commands in current + /// term. If the proposal is an admin command, then its type should be + /// passed, otherwise just provide `None`. 
+ /// + /// Returns None if passing the epoch check, otherwise returns the last + /// conflict conflict proposal meta. + pub fn check_conflict( + &mut self, + cmd_type: Option, + ) -> Option<&mut ProposedAdminCmd> { + let (check_ver, check_conf_ver) = match cmd_type { + None => (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER), + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } + }; + self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (check_ver && cmd.epoch_state.change_ver) + || (check_conf_ver && cmd.epoch_state.change_conf_ver) + }) + } + + /// Record an admin proposal. + /// + /// Further requests that is conflict with the admin proposal will be + /// rejected in `check_proposal_conflict`. + pub fn record_proposed_admin(&mut self, cmd_type: AdminCmdType, index: u64) { + let epoch_state = admin_cmd_epoch_lookup(cmd_type); + if !epoch_state.change_conf_ver && !epoch_state.change_ver { + return; + } + + let conflict_cmd = self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (epoch_state.check_ver && cmd.epoch_state.change_ver) + || (epoch_state.check_conf_ver && cmd.epoch_state.change_conf_ver) + }); + assert!(conflict_cmd.is_none(), "{:?}", conflict_cmd); + + if let Some(cmd) = self.proposed_admin_cmd.back() { + assert!(cmd.index < index, "{:?} {}", cmd, index); + } + self.proposed_admin_cmd + .push_back(ProposedAdminCmd::new(cmd_type, epoch_state, index)); + } + + /// Commit the admin commands. 
+ #[inline] + pub fn commit_to(&mut self, index: u64, mut on_commit: impl FnMut(&ProposedAdminCmd)) { + if self.proposed_admin_cmd.is_empty() { + return; + } + + for cmd in &mut self.proposed_admin_cmd { + if cmd.committed { + continue; + } + if cmd.index <= index { + cmd.committed = true; + on_commit(cmd); + continue; + } + return; + } + } + + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { + while !self.proposed_admin_cmd.is_empty() { + let cmd = self.proposed_admin_cmd.front_mut().unwrap(); + if cmd.index <= index { + for ch in cmd.delayed_chs.drain(..) { + let mut resp = cmd_resp::new_error(Error::EpochNotMatch( + format!( + "current epoch of region {} is {:?}", + region.get_id(), + region.get_region_epoch(), + ), + vec![region.to_owned()], + )); + cmd_resp::bind_term(&mut resp, term); + ch.report_error(resp); + } + } else { + break; + } + self.proposed_admin_cmd.pop_front(); + } + } + + #[inline] + pub fn enter_prepare_merge(&mut self, prepare_merge_index: u64) { + self.pending_merge_index = prepare_merge_index; + } + + #[inline] + pub fn leave_prepare_merge(&mut self, prepare_merge_index: u64) { + if self.pending_merge_index != 0 { + assert_eq!(self.pending_merge_index, prepare_merge_index); + self.pending_merge_index = 0; + } + } + + /// Check if there is an on-going split command on current term. + /// + /// The answer is reliable only when the peer is leader. + #[inline] + pub fn is_splitting(&self) -> bool { + if self.proposed_admin_cmd.is_empty() { + return false; + } + // Split is deprecated in v2, only needs to check `BatchSplit`. + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::BatchSplit && c.committed) + } + + /// Check if there the current peer is waiting for being merged. + /// + /// The answer is reliable only when the peer is leader or `PrepareMerge` is + /// applied. 
+ #[inline] + pub fn is_merging(&self) -> bool { + if self.proposed_admin_cmd.is_empty() { + return self.pending_merge_index != 0; + } + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::PrepareMerge && c.committed) + } +} + +impl Drop for ProposalControl { + fn drop(&mut self) { + for state in mem::take(&mut self.proposed_admin_cmd) { + for ch in state.delayed_chs { + apply::notify_stale_req(self.term, ch); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn new_admin_request(cmd_type: AdminCmdType) -> RaftCmdRequest { + let mut request = RaftCmdRequest::default(); + request.mut_admin_request().set_cmd_type(cmd_type); + request + } + + #[test] + fn test_proposal_control() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(10); + assert_eq!(control.term, 10); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::BatchSplit, 5); + assert_eq!(control.proposed_admin_cmd.len(), 1); + + // Both conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + assert_eq!(conflict.cmd_type, AdminCmdType::BatchSplit); + let conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 5); + + assert!( + control + .check_conflict(Some(AdminCmdType::ChangePeerV2)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::ChangePeerV2, 6); + assert_eq!(control.proposed_admin_cmd.len(), 2); + + assert!(!control.is_splitting()); + assert!(!control.is_merging()); + + // Conflict with the change peer admin cmd + let conflict = control + .check_conflict(Some(AdminCmdType::ChangePeerV2)) + .unwrap(); + assert_eq!(conflict.index, 6); + // Conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + // Conflict with the change peer admin cmd + let 
conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 6); + + let mut commit_split = false; + control.commit_to(4, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(!commit_split); + assert!(!control.is_splitting()); + control.commit_to(5, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(commit_split); + assert!(control.is_splitting()); + + control.advance_apply(4, 10, ®ion); + // Have no effect on `proposed_admin_cmd` + assert_eq!(control.proposed_admin_cmd.len(), 2); + assert!(control.is_splitting()); + + control.advance_apply(5, 10, ®ion); + // Left one change peer admin cmd + assert_eq!(control.proposed_admin_cmd.len(), 1); + assert!(!control.is_splitting()); + + assert!(control.check_conflict(None).is_none()); + let conflict = control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .unwrap(); + assert_eq!(conflict.index, 6); + + // Change term to 11 + control.maybe_update_term(11); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + assert_eq!(control.term, 11); + // Should be empty + assert_eq!(control.proposed_admin_cmd.len(), 0); + + // Test attaching multiple callbacks. + control.record_proposed_admin(AdminCmdType::BatchSplit, 7); + let mut subs = vec![]; + for _ in 0..3 { + let conflict = control.check_conflict(None).unwrap(); + let (ch, sub) = CmdResChannel::pair(); + conflict.delay_channel(ch); + subs.push(sub); + } + // Delayed channel should not be notified immediately. + for sub in &subs { + assert!(!sub.has_result()); + } + control.advance_apply(7, 12, ®ion); + for sub in subs { + assert!(sub.has_result()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_epoch_not_match(), + "{:?}", + res + ); + } + + // Should invoke callbacks when term is increased. 
+ control.record_proposed_admin(AdminCmdType::BatchSplit, 8); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + control.maybe_update_term(13); + assert!(control.check_conflict(None).is_none()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + + // Should invoke callbacks when it's dropped. + control.record_proposed_admin(AdminCmdType::BatchSplit, 9); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + drop(control); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + } + + #[test] + fn test_proposal_control_merge() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(5); + assert!(!control.is_merging()); + control.record_proposed_admin(AdminCmdType::PrepareMerge, 5); + assert!(!control.is_merging()); + control.commit_to(5, |_| ()); + assert!(control.is_merging()); + control.advance_apply(5, 5, ®ion); + assert!(!control.is_merging()); + + control.record_proposed_admin(AdminCmdType::PrepareMerge, 6); + assert!(!control.is_merging()); + control.commit_to(6, |_| ()); + assert!(control.is_merging()); + control.enter_prepare_merge(6); + control.advance_apply(6, 5, ®ion); + assert!(control.is_merging()); + control.leave_prepare_merge(6); + assert!(!control.is_merging()); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 2d89c3494d3..75575e9a19f 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -55,9 +55,11 @@ use crate::{ }; mod admin; +mod control; mod write; pub use admin::AdminCmdResult; +pub use control::ProposalControl; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; use 
self::write::SimpleWrite; @@ -220,18 +222,28 @@ impl Peer { } #[inline] - fn enqueue_pending_proposal( + pub fn post_propose_command( &mut self, ctx: &mut StoreContext, - mut proposal: Proposal>, + res: Result, + ch: Vec, + call_proposed_on_success: bool, ) { - let applied_to_current_term = self.applied_to_current_term(); - if applied_to_current_term { + let idx = match res { + Ok(i) => i, + Err(e) => { + ch.report_error(cmd_resp::err_resp(e, self.term())); + return; + } + }; + let mut proposal = Proposal::new(idx, self.term(), ch); + if call_proposed_on_success { proposal.cb.notify_proposed(); } - proposal.must_pass_epoch_check = applied_to_current_term; + proposal.must_pass_epoch_check = self.applied_to_current_term(); proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); self.proposals_mut().push(proposal); + self.set_has_ready(); } #[inline] @@ -292,6 +304,7 @@ impl Peer { } self.raft_group_mut() .advance_apply_to(apply_res.applied_index); + self.proposal_control_advance_apply(apply_res.applied_index); let is_leader = self.is_leader(); let progress_to_be_updated = self.entry_storage().applied_term() != apply_res.applied_term; let entry_storage = self.entry_storage_mut(); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a760a5acfb2..59c5679f95f 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -7,9 +7,10 @@ use raftstore::{ cmd_resp, fsm::{apply, Proposal, MAX_PROPOSAL_SIZE_RATIO}, msg::ErrorCallback, - util, WriteCallback, + util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, + WriteCallback, }, - Result, + Error, Result, }; use crate::{ @@ -53,10 +54,17 @@ impl Peer { return; } // To maintain propose order, we need to make pending proposal first. 
- self.propose_pending_command(ctx); + self.propose_pending_writes(ctx); + if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { + conflict.delay_channel(ch); + return; + } + // ProposalControl is reliable only when applied to current term. + let call_proposed_on_success = self.applied_to_current_term(); match SimpleWriteEncoder::new( req, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, + call_proposed_on_success, ) { Ok(mut encoder) => { encoder.add_response_channel(ch); @@ -65,35 +73,38 @@ impl Peer { } Err(req) => { let res = self.propose_command(ctx, req); - self.post_propose_write(ctx, res, vec![ch]); + self.post_propose_command(ctx, res, vec![ch], call_proposed_on_success); } } } - #[inline] - pub fn post_propose_write( - &mut self, - ctx: &mut StoreContext, - res: Result, - ch: Vec, - ) { - let idx = match res { - Ok(i) => i, - Err(e) => { - ch.report_error(cmd_resp::err_resp(e, self.term())); - return; - } - }; - let p = Proposal::new(idx, self.term(), ch); - self.enqueue_pending_proposal(ctx, p); - self.set_has_ready(); - } - - pub fn propose_pending_command(&mut self, ctx: &mut StoreContext) { + pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { + let call_proposed_on_success = if encoder.notify_proposed() { + // The request has pass conflict check and called all proposed callbacks. + false + } else { + // Epoch may have changed since last check. + let from_epoch = encoder.header().get_region_epoch(); + let res = util::compare_region_epoch( + from_epoch, + self.region(), + NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + true, + ); + if let Err(mut e) = res { + // TODO: query sibling regions. + ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); + encoder.encode().1.report_error(cmd_resp::new_error(e)); + return; + } + // Only when it applies to current term, the epoch check can be reliable. 
+ self.applied_to_current_term() + }; let (data, chs) = encoder.encode(); let res = self.propose(ctx, data); - self.post_propose_write(ctx, res, chs); + self.post_propose_command(ctx, res, chs, call_proposed_on_success); } } } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 364e2741868..ca9e7d39366 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -3,6 +3,7 @@ use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; use protobuf::{CodedInputStream, Message, SingularPtrField}; +use raftstore::store::WriteCallback; use slog::Logger; use crate::{operation::command::parse_at, router::CmdResChannel}; @@ -21,12 +22,18 @@ pub struct SimpleWriteEncoder { buf: Vec, channels: Vec, size_limit: usize, + notify_proposed: bool, } impl SimpleWriteEncoder { + /// Create an encoder. + /// + /// If `notify_proposed` is true, channels will be called `notify_proposed` + /// when it's appended. 
pub fn new( mut req: RaftCmdRequest, size_limit: usize, + notify_proposed: bool, ) -> Result { if !Self::allow_request(&req) { return Err(req); @@ -46,6 +53,7 @@ impl SimpleWriteEncoder { buf, channels: vec![], size_limit, + notify_proposed, }) } @@ -96,9 +104,24 @@ impl SimpleWriteEncoder { } #[inline] - pub fn add_response_channel(&mut self, ch: CmdResChannel) { + pub fn add_response_channel(&mut self, mut ch: CmdResChannel) { + if self.notify_proposed { + ch.notify_proposed(); + } self.channels.push(ch); } + + #[inline] + pub fn notify_proposed(&self) -> bool { + self.notify_proposed + } + + #[inline] + pub fn header(&self) -> &RaftRequestHeader { + self.header + .as_ref() + .unwrap_or_else(|| RaftRequestHeader::default_instance()) + } } #[derive(Debug)] @@ -382,7 +405,7 @@ mod tests { delete_req.set_key(delete_key.clone()); cmd.mut_requests().push(req); - let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX).unwrap(); + let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX, false).unwrap(); cmd.clear_requests(); req = Request::default(); @@ -471,7 +494,7 @@ mod tests { let mut req = Request::default(); req.set_cmd_type(CmdType::Invalid); invalid_cmd.mut_requests().push(req); - let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX).unwrap_err(); + let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX, false).unwrap_err(); let bytes = fallback.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); let decoded = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); @@ -486,7 +509,7 @@ mod tests { put_req.set_key(b"key".to_vec()); put_req.set_value(b"".to_vec()); valid_cmd.mut_requests().push(req); - let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX).unwrap(); + let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX, false).unwrap(); // Only simple write command can be batched. 
encoder.amend(invalid_cmd.clone()).unwrap_err(); let mut valid_cmd2 = valid_cmd.clone(); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 5b19db91b71..a110f4bf330 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,7 +5,9 @@ mod life; mod query; mod ready; -pub use command::{AdminCmdResult, CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; +pub use command::{ + AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteDecoder, SimpleWriteEncoder, +}; pub use life::DestroyProgress; pub use ready::{AsyncWriter, GenSnapTask, SnapState}; diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 1ae4aecd1cc..114080bcdbb 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -151,14 +151,14 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &mut Arc>>, + store_meta: &Mutex>, progress: Option, ) { // A nonleader peer should never has leader lease. let read_progress = if !should_renew_lease( self.is_leader(), - self.is_splitting(), - self.is_merging(), + self.proposal_control().is_splitting(), + self.proposal_control().is_merging(), self.has_force_leader(), ) { None @@ -186,7 +186,7 @@ impl Peer { // TODO: remove this block of code when snapshot is done; add the logic into // on_persist_snapshot. - pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &mut Arc>>) { + pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &Mutex>) { let mut meta = store_meta.lock().unwrap(); // TODO: remove this block of code when snapshot is done; add the logic into // on_persist_snapshot. 
diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 8b84b0788ce..77ca7b90074 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -216,7 +216,7 @@ impl Peer { self.pending_reads_mut().advance_leader_reads(states); if let Some(propose_time) = self.pending_reads().last_ready().map(|r| r.propose_time) { if !self.leader_lease_mut().is_suspect() { - self.maybe_renew_leader_lease(propose_time, &mut ctx.store_meta, None); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); } } @@ -288,6 +288,24 @@ impl Peer { && !self.has_pending_merge_state() } + #[inline] + pub fn ready_to_handle_read(&self) -> bool { + // TODO: It may cause read index to wait a long time. + + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. + self.applied_to_current_term() + // There may be stale read if the old leader splits really slow, + // the new region may already elected a new leader while + // the old leader still think it owns the split range. + && !self.proposal_control().is_splitting() + // There may be stale read if a target leader is in another store and + // applied commit merge, written new values, but the sibling peer in + // this store does not apply commit merge, so the leader is not ready + // to read, until the merge is rollbacked. 
+ && !self.proposal_control().is_merging() + } + fn send_read_command( &self, ctx: &mut StoreContext, @@ -409,7 +427,7 @@ impl Peer { // TODO: add coprocessor_host hook let progress = ReadProgress::applied_term(applied_term); // TODO: remove it - self.add_reader_if_necessary(&mut ctx.store_meta); + self.add_reader_if_necessary(&ctx.store_meta); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 62cb42ef253..6f6866b9671 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -24,10 +24,10 @@ use std::cmp; use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; -use kvproto::raft_serverpb::RaftMessage; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; -use raft::{eraftpb, Ready}; -use raftstore::store::{util, ExtraStates, FetchedLogs, Transport, WriteTask}; +use raft::{eraftpb, Ready, StateRole}; +use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; @@ -247,7 +247,7 @@ impl Peer { ctx.raft_metrics.commit_log.observe(duration_to_sec( (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), )); - self.maybe_renew_leader_lease(propose_time, &mut ctx.store_meta, None); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); break; } } @@ -288,6 +288,22 @@ impl Peer { |entry| entry.index == self.raft_group().raft.raft_log.last_index() )); + self.on_role_changed(ctx, &ready); + + if let Some(hs) = ready.hs() { + let prev_commit_index = self.entry_storage().commit_index(); + assert!( + hs.get_commit() >= prev_commit_index, + "{:?} {:?} {}", + self.logger.list(), 
+ hs, + prev_commit_index + ); + if self.is_leader() && hs.get_commit() > prev_commit_index { + self.on_leader_commit_index_changed(hs.get_commit()); + } + } + if !ready.messages().is_empty() { debug_assert!(self.is_leader()); for msg in ready.take_messages() { @@ -395,6 +411,89 @@ impl Peer { pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { self.async_writer.subscirbe_flush(ch); } + + pub fn on_role_changed(&mut self, ctx: &mut StoreContext, ready: &Ready) { + // Update leader lease when the Raft state changes. + if let Some(ss) = ready.ss() { + let term = self.term(); + match ss.raft_state { + StateRole::Leader => { + // The local read can only be performed after a new leader has applied + // the first empty entry on its term. After that the lease expiring time + // should be updated to + // send_to_quorum_ts + max_lease + // as the comments in `Lease` explain. + // It is recommended to update the lease expiring time right after + // this peer becomes leader because it's more convenient to do it here and + // it has no impact on the correctness. + let progress_term = ReadProgress::term(term); + self.maybe_renew_leader_lease( + monotonic_raw_now(), + &ctx.store_meta, + Some(progress_term), + ); + debug!( + self.logger, + "becomes leader with lease"; + "lease" => ?self.leader_lease(), + ); + // If the predecessor reads index during transferring leader and receives + // quorum's heartbeat response after that, it may wait for applying to + // current term to apply the read. So broadcast eagerly to avoid unexpected + // latency. + self.raft_group_mut().skip_bcast_commit(false); + + // Exit entry cache warmup state when the peer becomes leader. 
+ self.entry_storage_mut().clear_entry_cache_warmup_state(); + } + StateRole::Follower => { + self.leader_lease_mut().expire(); + self.storage_mut().cancel_generating_snap(None); + } + _ => {} + } + self.proposal_control_mut().maybe_update_term(term); + } + } + + /// If leader commits new admin commands, it may break lease assumption. So + /// we need to cancel lease whenever necessary. + /// + /// Note this method should be called before sending out any messages. + fn on_leader_commit_index_changed(&mut self, commit_index: u64) { + let mut committed_prepare_merge = false; + self.proposal_control_mut().commit_to(commit_index, |cmd| { + committed_prepare_merge |= cmd.cmd_type() == AdminCmdType::PrepareMerge + }); + // There are two types of operations that will change the ownership of a range: + // split and merge. + // + // - For split, after the split command is committed, it's + // possible that the same range is govened by different region on different + // nodes due to different apply progress. But because only the peers on the + // same node as old leader will campaign despite election timeout, so there + // will be no modification to the overlapped range until either the original + // leader apply the split command or an election timeout is passed since split + // is committed. We already forbid renewing lease after committing split, and + // original leader will update the reader delegate with latest epoch after + // applying split before the split peer starts campaign, so here the only thing + // we need to do is marking split is committed (which is done by `commit_to` + // above). It's correct to allow local read during split. + // + // - For merge, after the prepare merge command is committed, the target peers + // may apply commit merge at any time, so we need to forbid any type of read + // to avoid missing the modifications from target peers. 
+ if committed_prepare_merge { + // After prepare_merge is committed and the leader broadcasts commit + // index to followers, the leader can not know when the target region + // merges majority of this region, also it can not know when the target + // region writes new values. + // To prevent unsafe local read, we suspect its leader lease. + self.leader_lease_mut().suspect(monotonic_raw_now()); + // Stop updating `safe_ts` + self.read_progress_mut().discard(); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index eb98851b3bb..dd53f47e152 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -18,7 +18,7 @@ use time::Timespec; use super::{storage::Storage, Apply}; use crate::{ fsm::{ApplyFsm, ApplyScheduler}, - operation::{AsyncWriter, DestroyProgress, SimpleWriteEncoder}, + operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, tablet::CachedTablet, Result, @@ -58,6 +58,9 @@ pub struct Peer { /// Transaction extensions related to this peer. txn_ext: Arc, txn_extra_op: Arc>, + + /// Check whether this proposal can be proposed based on its epoch. + proposal_control: ProposalControl, } impl Peer { @@ -142,6 +145,7 @@ impl Peer { region_buckets: None, txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), + proposal_control: ProposalControl::new(0), }; // If this region has only one peer and I am the one, campaign directly. 
@@ -153,6 +157,8 @@ impl Peer { peer.raft_group.campaign()?; peer.set_has_ready(); } + let term = peer.term(); + peer.proposal_control.maybe_update_term(term); Ok(peer) } @@ -323,18 +329,6 @@ impl Peer { self.raft_group.raft.term } - #[inline] - // TODO - pub fn is_splitting(&self) -> bool { - false - } - - #[inline] - // TODO - pub fn is_merging(&self) -> bool { - false - } - #[inline] // TODO pub fn has_force_leader(&self) -> bool { @@ -391,24 +385,6 @@ impl Peer { &self.proposals } - #[inline] - pub fn ready_to_handle_read(&self) -> bool { - // TODO: It may cause read index to wait a long time. - - // There may be some values that are not applied by this leader yet but the old - // leader, if applied_term isn't equal to current term. - self.applied_to_current_term() - // There may be stale read if the old leader splits really slow, - // the new region may already elected a new leader while - // the old leader still think it owns the split range. - && !self.is_splitting() - // There may be stale read if a target leader is in another store and - // applied commit merge, written new values, but the sibling peer in - // this store does not apply commit merge, so the leader is not ready - // to read, until the merge is rollbacked. 
- && !self.is_merging() - } - pub fn apply_scheduler(&self) -> &ApplyScheduler { self.apply_scheduler.as_ref().unwrap() } @@ -432,4 +408,22 @@ impl Peer { self.region_buckets.as_ref().map(|b| b.meta.clone()), ) } + + #[inline] + pub fn proposal_control_mut(&mut self) -> &mut ProposalControl { + &mut self.proposal_control + } + + #[inline] + pub fn proposal_control(&self) -> &ProposalControl { + &self.proposal_control + } + + #[inline] + pub fn proposal_control_advance_apply(&mut self, apply_index: u64) { + let region = self.raft_group.store().region(); + let term = self.term(); + self.proposal_control + .advance_apply(apply_index, term, region); + } } diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index d68c414ca5f..b6da3c804f0 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -221,6 +221,13 @@ impl BaseSubscriber { pub async fn result(self) -> Option { WaitResult { core: &self.core }.await } + + /// Test if the result is ready without any polling. + #[inline] + pub fn has_result(&self) -> bool { + let e = self.core.event.load(Ordering::Relaxed); + check_bit(e, fired_bit_of(PAYLOAD_EVENT)).is_some() + } } unsafe impl Send for BaseSubscriber {} diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 50fb5c4e16a..740e64f7e29 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -5,6 +5,8 @@ #![feature(custom_test_frameworks)] #![test_runner(test_util::run_tests)] +// TODO: test conflict control in integration tests after split is supported. 
+ mod cluster; mod test_basic_write; mod test_conf_change; diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 4f49757085f..2155a4775c6 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -67,14 +67,8 @@ fn test_snap_without_read_index() { req.mut_requests().push(request_inner); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); - // single node commited index should be 6. - assert_eq!(resp.read_index, 6); - - // run again, this time we expect the lease is not expired and the read index - // should be 0. - let res = router.query(region_id, req.clone()).unwrap(); - let resp = res.read().unwrap(); - // the request can be processed locally, read index should be 0. + // When it becomes leader, it will get a lease automatically because of empty + // entry. assert_eq!(resp.read_index, 0); // run with header read_quorum From 9c4f0d08fc47ab9e733b4464ffc75f44cff9ce50 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Fri, 11 Nov 2022 18:53:53 +0800 Subject: [PATCH 328/676] raftstore-v2: make generate snapshot works with checkpoint (#13736) ref tikv/tikv#12842 raftstore-v2: make generate snapshot works with checkpoint Signed-off-by: nolouch Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 14 ++- .../src/operation/ready/snapshot.rs | 41 ++++++- components/raftstore-v2/src/raft/storage.rs | 50 ++++++-- components/raftstore-v2/src/router/imp.rs | 4 +- components/raftstore-v2/src/router/message.rs | 4 +- .../tests/integrations/cluster.rs | 7 +- .../raftstore/src/store/async_io/read.rs | 114 ++++++++++++++++-- components/raftstore/src/store/mod.rs | 16 +-- .../raftstore/src/store/peer_storage.rs | 4 +- components/raftstore/src/store/snap.rs | 75 ++++++++++++ components/raftstore/src/store/transport.rs | 5 +- components/raftstore/src/store/worker/mod.rs | 3 +- 
12 files changed, 279 insertions(+), 58 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 76d4fd16bea..0dbd0ed4b64 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -2,6 +2,7 @@ use std::{ ops::{Deref, DerefMut}, + path::Path, sync::{Arc, Mutex}, time::Duration, }; @@ -21,7 +22,7 @@ use kvproto::{ use raft::INVALID_ID; use raftstore::store::{ fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, - StoreWriters, Transport, WriteSenders, + StoreWriters, TabletSnapManager, Transport, WriteSenders, }; use slog::Logger; use tikv_util::{ @@ -365,6 +366,7 @@ impl StoreSystem { trans: T, router: &StoreRouter, store_meta: Arc>>, + snap_mgr: TabletSnapManager, ) -> Result<()> where T: Transport + 'static, @@ -373,10 +375,12 @@ impl StoreSystem { workers .store_writers .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; - let read_scheduler = workers.async_read_worker.start( - "async-read-worker", - ReadRunner::new(router.clone(), raft_engine.clone()), - ); + + let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); + read_runner.set_snap_mgr(snap_mgr); + let read_scheduler = workers + .async_read_worker + .start("async-read-worker", read_runner); let mut builder = StorePollerBuilder::new( cfg.clone(), diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index e0f4e5653de..ad836ed2455 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -32,8 +32,8 @@ use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_serverpb::{RaftSnapshotData, RegionLocalState}; use protobuf::Message; use raft::eraftpb::Snapshot; -use raftstore::store::{metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, ReadTask}; -use slog::{error, info}; +use 
raftstore::store::{metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask}; +use slog::{error, info, warn}; use tikv_util::{box_try, worker::Scheduler}; use crate::{ @@ -68,6 +68,8 @@ impl PartialEq for SnapState { pub struct GenSnapTask { region_id: u64, + // The snapshot will be sent to the peer. + to_peer: u64, // Fill it when you are going to generate the snapshot. // index used to check if the gen task should be canceled. index: Arc, @@ -78,9 +80,15 @@ pub struct GenSnapTask { } impl GenSnapTask { - pub fn new(region_id: u64, index: Arc, canceled: Arc) -> GenSnapTask { + pub fn new( + region_id: u64, + to_peer: u64, + index: Arc, + canceled: Arc, + ) -> GenSnapTask { GenSnapTask { region_id, + to_peer, index, canceled, for_balance: false, @@ -101,7 +109,7 @@ impl Debug for GenSnapTask { } impl Peer { - pub fn on_snapshot_generated(&mut self, snapshot: Box) { + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { self.raft_group_mut().ping(); self.set_has_ready(); @@ -115,6 +123,15 @@ impl Apply { /// Will schedule a task to read worker and then generate a snapshot /// asynchronously. pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + // Do not generate, the peer is removed. + if self.tombstone() { + snap_task.canceled.store(true, Ordering::SeqCst); + error!( + self.logger, + "cancel generating snapshot because it's already destroyed"; + ); + return; + } // Flush before do snapshot. 
if snap_task.canceled.load(Ordering::SeqCst) { return; @@ -126,6 +143,7 @@ impl Apply { snap_task.index.store(last_applied_index, Ordering::SeqCst); let gen_tablet_sanp_task = ReadTask::GenTabletSnapshot { region_id: snap_task.region_id, + to_peer: snap_task.to_peer, tablet: self.tablet().clone(), region_state: self.region_state().clone(), last_applied_term, @@ -189,7 +207,7 @@ impl Storage { index: index.clone(), }; - let task = GenSnapTask::new(self.region().get_id(), index, canceled); + let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); let mut gen_snap_task = self.gen_snap_task_mut(); assert!(gen_snap_task.is_none()); *gen_snap_task = Box::new(Some(task)); @@ -268,7 +286,12 @@ impl Storage { /// Try to switch snap state to generated. only `Generating` can switch to /// `Generated`. /// TODO: make the snap state more clearer, the snapshot must be consumed. - pub fn on_snapshot_generated(&self, snap: Box) -> bool { + pub fn on_snapshot_generated(&self, res: GenSnapRes) -> bool { + if res.is_none() { + self.cancel_generating_snap(None); + return false; + } + let snap = res.unwrap(); let mut snap_state = self.snap_state_mut(); let SnapState::Generating { ref canceled, @@ -276,6 +299,12 @@ impl Storage { } = *snap_state else { return false }; if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { + warn!( + self.logger(), + "snapshot is staled, skip"; + "snap index" => snap.get_metadata().get_index(), + "required index" => index.load(Ordering::SeqCst), + ); return false; } // Should changed `SnapState::Generated` to `SnapState::Relax` when the diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index aa642f5967f..8abeeeef73d 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -370,8 +370,8 @@ mod tests { }; use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; use raftstore::store::{ - 
AsyncReadNotifier, FetchedLogs, ReadRunner, ReadTask, RAFT_INIT_LOG_INDEX, - RAFT_INIT_LOG_TERM, + AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, TabletSnapKey, + TabletSnapManager, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::o; use tempfile::TempDir; @@ -382,11 +382,11 @@ mod tests { #[derive(Clone)] pub struct TestRouter { - ch: SyncSender>, + ch: SyncSender, } impl TestRouter { - pub fn new() -> (Self, Receiver>) { + pub fn new() -> (Self, Receiver) { let (tx, rx) = sync_channel(1); (Self { ch: tx }, rx) } @@ -397,8 +397,8 @@ mod tests { unreachable!(); } - fn notify_snapshot_generated(&self, _region_id: u64, snapshot: Box) { - self.ch.send(snapshot).unwrap(); + fn notify_snapshot_generated(&self, _region_id: u64, res: GenSnapRes) { + self.ch.send(res).unwrap(); } } @@ -458,6 +458,8 @@ mod tests { write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); + mgr.init().unwrap(); // building a tablet factory let ops = DbOptions::default(); let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); @@ -478,7 +480,9 @@ mod tests { .unwrap() .unwrap(); let (router, rx) = TestRouter::new(); - worker.start(ReadRunner::new(router.clone(), raft_engine)); + let mut read_runner = ReadRunner::new(router.clone(), raft_engine); + read_runner.set_snap_mgr(mgr.clone()); + worker.start(read_runner); // setup peer applyer let mut apply = Apply::new( region.get_peers()[0].clone(), @@ -490,8 +494,8 @@ mod tests { logger, ); - // test get snapshot - let snap = s.snapshot(0, 0); + // Test get snapshot + let snap = s.snapshot(0, 7); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); @@ -504,11 +508,13 @@ mod tests { }; 
assert_eq!(snap.get_metadata().get_index(), 0); assert_eq!(snap.get_metadata().get_term(), 0); - assert!(snap.get_data().is_empty()); + assert_eq!(snap.get_data().is_empty(), false); + let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); + let checkpointer_path = mgr.get_tablet_checkpointer_path(&snap_key); + assert!(checkpointer_path.exists()); - // test cancel snapshot + // Test cancel snapshot let snap = s.snapshot(0, 0); - let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); @@ -516,6 +522,24 @@ mod tests { s.cancel_generating_snap(None); assert_eq!(*s.snap_state.borrow(), SnapState::Relax); - // TODO: add test get twice snapshot and cancel once + // Test get twice snapshot and cancel once. + // get snapshot a + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_a = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(1, 5); + apply.schedule_gen_snapshot(gen_task_a); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + // cancel get snapshot a, try get snaphsot b + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_b = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(10, 5); + apply.schedule_gen_snapshot(gen_task_b); + // on snapshot a and b + assert_eq!(s.on_snapshot_generated(res), false); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(s.on_snapshot_generated(res), true); } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 7c02ee10243..8cb65e40a3c 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -9,7 +9,7 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use 
raft::eraftpb::Snapshot as RaftSnapshot; -use raftstore::store::{AsyncReadNotifier, FetchedLogs, RegionSnapshot}; +use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; use slog::Logger; use super::PeerMsg; @@ -20,7 +20,7 @@ impl AsyncReadNotifier for StoreRouter { let _ = self.force_send(region_id, PeerMsg::LogsFetched(fetched_logs)); } - fn notify_snapshot_generated(&self, region_id: u64, snapshot: Box) { + fn notify_snapshot_generated(&self, region_id: u64, snapshot: GenSnapRes) { let _ = self.force_send(region_id, PeerMsg::SnapshotGenerated(snapshot)); } } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 64af4d41d71..cda9e971c66 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -6,7 +6,7 @@ use std::fmt; use engine_traits::Snapshot; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::eraftpb::Snapshot as RaftSnapshot; -use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs}; +use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; use tikv_util::time::Instant; use super::{ @@ -126,7 +126,7 @@ pub enum PeerMsg { /// Result of applying committed entries. The message can't be lost. ApplyRes(ApplyRes), LogsFetched(FetchedLogs), - SnapshotGenerated(Box), + SnapshotGenerated(GenSnapRes), /// Start the FSM. Start, /// A message only used to notify a peer. 
diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 1d458d7a73e..6ac567907af 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -26,7 +26,9 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use pd_client::RpcClient; -use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; +use raftstore::store::{ + region_meta::RegionMeta, Config, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, +}; use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, @@ -206,7 +208,7 @@ impl RunningState { let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); - + let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); system .start( store_id, @@ -216,6 +218,7 @@ impl RunningState { transport.clone(), router.store_router(), store_meta.clone(), + snap_mgr, ) .unwrap(); diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 30ce2102040..2da4869d24b 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -3,16 +3,25 @@ use std::{ fmt, marker::PhantomData, - sync::{atomic::AtomicBool, Arc}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, }; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{Checkpointer, KvEngine, RaftEngine}; use fail::fail_point; -use kvproto::raft_serverpb::RegionLocalState; -use raft::{eraftpb::Snapshot as RaftSnapshot, GetEntriesContext}; -use tikv_util::worker::Runnable; +use file_system::{IoType, WithIoType}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use protobuf::Message; +use raft::{eraftpb::Snapshot, GetEntriesContext}; +use tikv_util::{error, info, time::Instant, 
worker::Runnable}; -use crate::store::{RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}; +use crate::store::{ + util, + worker::metrics::{SNAP_COUNTER, SNAP_HISTOGRAM}, + RaftlogFetchResult, TabletSnapKey, TabletSnapManager, MAX_INIT_ENTRY_COUNT, +}; pub enum ReadTask { FetchLogs { @@ -28,6 +37,7 @@ pub enum ReadTask { // GenTabletSnapshot is used to generate tablet snapshot. GenTabletSnapshot { region_id: u64, + to_peer: u64, tablet: EK, region_state: RegionLocalState, last_applied_term: u64, @@ -53,8 +63,10 @@ impl fmt::Display for ReadTask { "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", region_id, low, high, max_size, context, tried_cnt, term, ), - ReadTask::GenTabletSnapshot { region_id, .. } => { - write!(f, "Snapshot gen for {}", region_id) + ReadTask::GenTabletSnapshot { + region_id, to_peer, .. + } => { + write!(f, "Snapshot gen for {}, to peer {}", region_id, to_peer) } } } @@ -66,10 +78,12 @@ pub struct FetchedLogs { pub logs: Box, } +pub type GenSnapRes = Option>; + /// A router for receiving fetched result. 
pub trait AsyncReadNotifier: Send { fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); - fn notify_snapshot_generated(&self, region_id: u64, snapshot: Box); + fn notify_snapshot_generated(&self, region_id: u64, res: Option>); } pub struct ReadRunner @@ -80,6 +94,7 @@ where { notifier: N, raft_engine: ER, + sanp_mgr: Option, _phantom: PhantomData, } @@ -88,9 +103,34 @@ impl ReadRunner { ReadRunner { notifier, raft_engine, + sanp_mgr: None, _phantom: PhantomData, } } + + #[inline] + pub fn set_snap_mgr(&mut self, mgr: TabletSnapManager) { + self.sanp_mgr = Some(mgr); + } + + #[inline] + fn snap_mgr(&self) -> &TabletSnapManager { + self.sanp_mgr.as_ref().unwrap() + } + + fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { + let checkpointer_path = self.snap_mgr().get_tablet_checkpointer_path(snap_key); + if checkpointer_path.as_path().exists() { + // Remove the old checkpoint directly. + std::fs::remove_dir_all(checkpointer_path.as_path())?; + } + // Here not checkpoint to a temporary directory first, the temporary directory + // logic already implemented in rocksdb. + let mut checkpointer = tablet.new_checkpointer()?; + + checkpointer.create_at(checkpointer_path.as_path(), None, 0)?; + Ok(()) + } } impl Runnable for ReadRunner @@ -141,10 +181,58 @@ where }, ); } - ReadTask::GenTabletSnapshot { region_id, .. 
} => { - // TODO: implement generate tablet snapshot for raftstore v2 - self.notifier - .notify_snapshot_generated(region_id, Box::new(RaftSnapshot::default())); + + ReadTask::GenTabletSnapshot { + region_id, + to_peer, + tablet, + region_state, + last_applied_term, + last_applied_index, + canceled, + for_balance, + } => { + SNAP_COUNTER.generate.start.inc(); + if canceled.load(Ordering::Relaxed) { + info!("generate snap is canceled"; "region_id" => region_id); + SNAP_COUNTER.generate.abort.inc(); + return; + } + let start = Instant::now(); + let _io_type_guard = WithIoType::new(if for_balance { + IoType::LoadBalance + } else { + IoType::Replication + }); + // the state should already checked in apply workers. + assert_ne!(region_state.get_state(), PeerState::Tombstone); + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. + snapshot.mut_metadata().set_term(last_applied_term); + snapshot.mut_metadata().set_index(last_applied_index); + let conf_state = util::conf_state_from_region(region_state.get_region()); + snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region_state.get_region().clone()); + snap_data.mut_meta().set_for_balance(for_balance); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + + // create checkpointer. 
+ let snap_key = TabletSnapKey::from_region_snap(region_id, to_peer, &snapshot); + let mut res = None; + if let Err(e) = self.generate_snap(&snap_key, tablet) { + error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); + SNAP_COUNTER.generate.fail.inc(); + } else { + SNAP_COUNTER.generate.success.inc(); + SNAP_HISTOGRAM + .generate + .observe(start.saturating_elapsed_secs()); + res = Some(Box::new(snapshot)) + } + + self.notifier.notify_snapshot_generated(region_id, res); } } } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 2078ccabafc..5d7455b2d1c 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -30,7 +30,7 @@ mod worker; pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ - read::{AsyncReadNotifier, FetchedLogs, ReadRunner, ReadTask}, + read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, write::{ ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask, @@ -70,17 +70,17 @@ pub use self::{ check_abort, copy_snapshot, snap_io::{apply_sst_cf_file, build_sst_cf_file_list}, ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, - SnapManagerBuilder, Snapshot, SnapshotStatistics, + SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter, Transport}, txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, - CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, LocalReadContext, - LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, - ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, - SplitConfig, 
SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, - TLS_LOCAL_READ_METRICS, + metrics::TLS_LOCAL_READ_METRICS, AutoSplitController, Bucket, BucketRange, + CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, + KeyEntry, LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, + TrackVer, WriteStats, }, }; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 56b80c94dcc..0d10b1f36cf 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1190,7 +1190,7 @@ pub mod tests { fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, worker::{make_region_worker_raftstore_cfg, RegionRunner, RegionTask}, - AsyncReadNotifier, FetchedLogs, + AsyncReadNotifier, FetchedLogs, GenSnapRes, }, }; @@ -1429,7 +1429,7 @@ pub mod tests { self.ch.send(fetched_logs).unwrap(); } - fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: Box) { + fn notify_snapshot_generated(&self, _region_id: u64, _res: GenSnapRes) { unreachable!(); } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index eda0ffaa9cb..9995582f13c 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1886,6 +1886,81 @@ impl SnapManagerBuilder { } } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct TabletSnapKey { + pub region_id: u64, + pub to_peer: u64, + pub term: u64, + pub idx: u64, +} + +impl TabletSnapKey { + #[inline] + pub fn new(region_id: u64, to_peer: u64, term: u64, idx: u64) -> TabletSnapKey { + TabletSnapKey { + region_id, + to_peer, + term, + idx, + } + } + + pub fn from_region_snap(region_id: u64, to_peer: 
u64, snap: &RaftSnapshot) -> TabletSnapKey { + let index = snap.get_metadata().get_index(); + let term = snap.get_metadata().get_term(); + TabletSnapKey::new(region_id, to_peer, term, index) + } +} + +impl Display for TabletSnapKey { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}_{}", + self.region_id, self.to_peer, self.term, self.idx + ) + } +} + +/// `TabletSnapManager` manager tablet snapshot and shared between raftstore v2. +/// It's similar `SnapManager`, but simpler in tablet version. +/// +/// TODO: +/// - add Limiter to control send/recv speed +/// - clean up expired tablet checkpointer +#[derive(Clone)] +pub struct TabletSnapManager { + // directory to store snapfile. + base: String, +} + +impl TabletSnapManager { + pub fn new>(path: T) -> Self { + Self { base: path.into() } + } + + pub fn init(&self) -> io::Result<()> { + // Initialize the directory if it doesn't exist. + let path = Path::new(&self.base); + if !path.exists() { + file_system::create_dir_all(path)?; + return Ok(()); + } + if !path.is_dir() { + return Err(io::Error::new( + ErrorKind::Other, + format!("{} should be a directory", path.display()), + )); + } + Ok(()) + } + + pub fn get_tablet_checkpointer_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); + PathBuf::from(&self.base).join(&prefix) + } +} + #[cfg(test)] pub mod tests { use std::{ diff --git a/components/raftstore/src/store/transport.rs b/components/raftstore/src/store/transport.rs index d2bbe921eea..7f10e7cd249 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -6,10 +6,9 @@ use std::sync::mpsc; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; -use raft::eraftpb::Snapshot as RaftSnapshot; use tikv_util::{error, warn}; -use super::{AsyncReadNotifier, FetchedLogs}; +use 
super::{AsyncReadNotifier, FetchedLogs, GenSnapRes}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, DiscardReason, Error, Result, @@ -182,7 +181,7 @@ impl AsyncReadNotifier for RaftRouter { } #[inline] - fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: Box) { + fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: GenSnapRes) { unreachable!() } } diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index cd7680ebc4a..e021651ba3d 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -6,7 +6,7 @@ mod cleanup_snapshot; mod cleanup_sst; mod compact; mod consistency_check; -mod metrics; +pub mod metrics; mod pd; mod raftlog_gc; mod read; @@ -25,7 +25,6 @@ pub use self::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, - metrics::TLS_LOCAL_READ_METRICS, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, From 6b61f4a1e17e236db53d8ef3efc6338fa5ef159e Mon Sep 17 00:00:00 2001 From: Mike <842725815@qq.com> Date: Mon, 14 Nov 2022 13:19:54 +0800 Subject: [PATCH 329/676] Add curl command into docker image (#13765) ref pingcap/tidb-operator#4764, ref tikv/tikv#13781 Add curl command into docker image. 
Signed-off-by: mikechengwei <842725815@qq.com> --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index eca69ce3b8d..c4ad36dc6e7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -105,6 +105,9 @@ FROM pingcap/alpine-glibc COPY --from=builder /tikv/target/release/tikv-server /tikv-server COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl +RUN apk add --no-cache \ + curl + EXPOSE 20160 20180 ENTRYPOINT ["/tikv-server"] From 23dba4fe66bf8abed8cb5c61a33655fd5edad902 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 14 Nov 2022 14:41:54 +0800 Subject: [PATCH 330/676] raftstore-v2: support tablet split (#13709) ref tikv/tikv#12842, ref tikv/tikv#13689 Signed-off-by: SpadeA-Tang --- components/batch-system/src/router.rs | 8 +- components/raftstore-v2/src/batch/store.rs | 8 +- components/raftstore-v2/src/fsm/peer.rs | 1 + components/raftstore-v2/src/fsm/store.rs | 11 +- .../src/operation/command/admin/mod.rs | 7 +- .../src/operation/command/admin/split.rs | 240 ++++++++++++++++-- .../raftstore-v2/src/operation/command/mod.rs | 12 +- components/raftstore-v2/src/operation/life.rs | 51 +++- components/raftstore-v2/src/operation/mod.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 140 ++++++++-- components/raftstore-v2/src/raft/storage.rs | 43 ++++ components/raftstore-v2/src/router/message.rs | 8 + .../tests/integrations/cluster.rs | 31 ++- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../tests/integrations/test_split.rs | 183 +++++++++++++ components/raftstore/src/store/config.rs | 18 ++ 16 files changed, 704 insertions(+), 60 deletions(-) create mode 100644 components/raftstore-v2/tests/integrations/test_split.rs diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index d96e65e1e99..b863f1535f0 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -294,7 +294,7 @@ where } } - /// Force sending 
message to control FSM. + /// Sending message to control FSM. #[inline] pub fn send_control(&self, msg: C::Message) -> Result<(), TrySendError> { match self.control_box.try_send(msg, &self.control_scheduler) { @@ -309,6 +309,12 @@ where } } + /// Force sending message to control FSM. + #[inline] + pub fn force_send_control(&self, msg: C::Message) -> Result<(), SendError> { + self.control_box.force_send(msg, &self.control_scheduler) + } + /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { let timer = Instant::now_coarse(); diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 0dbd0ed4b64..605bbb95131 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -265,6 +265,7 @@ impl StorePollerBuilder { fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); + let mut meta = self.store_meta.lock().unwrap(); self.engine .for_each_raft_group::(&mut |region_id| { assert_ne!(region_id, INVALID_ID); @@ -278,8 +279,11 @@ impl StorePollerBuilder { Some(p) => p, None => return Ok(()), }; - let pair = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; - let prev = regions.insert(region_id, pair); + let (sender, peer_fsm) = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; + meta.region_read_progress + .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); + + let prev = regions.insert(region_id, (sender, peer_fsm)); if let Some((_, p)) = prev { return Err(box_err!( "duplicate region {:?} vs {:?}", diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index b74f8b46b1c..5abdcf31f0f 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -220,6 +220,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } PeerMsg::Tick(tick) => 
self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), + PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 3be571bdfbc..0d390d5b51d 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -5,12 +5,17 @@ use std::time::SystemTime; use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; -use raftstore::store::{Config, ReadDelegate}; +use kvproto::{metapb::Region, raft_serverpb::RaftMessage}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{Config, ReadDelegate, RegionReadProgressRegistry}, +}; use slog::{o, Logger}; use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; use crate::{ batch::StoreContext, + raft::Peer, router::{StoreMsg, StoreTick}, tablet::CachedTablet, }; @@ -24,6 +29,8 @@ where pub readers: HashMap, /// region_id -> tablet cache pub tablet_caches: HashMap>, + /// region_id -> `RegionReadProgress` + pub region_read_progress: RegionReadProgressRegistry, } impl StoreMeta @@ -35,6 +42,7 @@ where store_id: None, readers: HashMap::default(), tablet_caches: HashMap::default(), + region_read_progress: RegionReadProgressRegistry::new(), } } } @@ -149,6 +157,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { StoreMsg::Start => self.on_start(), StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), + StoreMsg::SplitInit(msg) => self.fsm.store.on_split_init(self.store_ctx, msg), } } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index c3fe2cceded..eb6560d239e 100644 --- 
a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -4,10 +4,7 @@ mod conf_change; mod split; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{ - raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}, - raft_serverpb::PeerState, -}; +use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}; use protobuf::Message; use raft::prelude::ConfChangeV2; use raftstore::{ @@ -20,7 +17,7 @@ use raftstore::{ Result, }; use slog::info; -pub use split::SplitResult; +pub use split::{SplitInit, SplitResult}; use tikv_util::box_err; use self::conf_change::ConfChangeResult; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 198819cfd7b..2e43e69b44c 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -8,52 +8,58 @@ //! - Nothing special except for validating batch split requests (ex: split keys //! are in ascending order). //! -//! Execution: -//! - exec_batch_split: Create and initialize metapb::region for split regions +//! Apply: +//! - apply_batch_split: Create and initialize metapb::region for split regions //! and derived regions. Then, create checkpoints of the current talbet for //! split regions and derived region to make tablet physical isolated. Update //! the parent region's region state without persistency. Send the new regions //! (including derived region) back to raftstore. //! -//! Result apply: -//! - todo +//! On Apply Result: +//! - on_ready_split_region: Update the relevant in memory meta info of the +//! parent peer, then send to the store the relevant info needed to create and +//! initialize the split regions. //! //! Split peer creation and initlization: -//! - todo -//! -//! Split finish: -//! - todo +//! 
- on_split_init: In normal cases, the uninitialized split region will be +//! created by the store, and here init it using the data sent from the parent +//! peer. use std::collections::VecDeque; +use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{ - Checkpointer, KvEngine, OpenOptions, RaftEngine, TabletFactory, CF_DEFAULT, SPLIT_PREFIX, + Checkpointer, DeleteStrategy, KvEngine, OpenOptions, RaftEngine, RaftLogBatch, Range, + CF_DEFAULT, SPLIT_PREFIX, }; +use fail::fail_point; +use keys::enc_end_key; use kvproto::{ - metapb::Region, + metapb::{self, Region, RegionEpoch}, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, raft_serverpb::RegionLocalState, }; use protobuf::Message; +use raft::RawNode; use raftstore::{ - coprocessor::split_observer::{is_valid_split_key, strip_timestamp_if_exists}, + coprocessor::RegionChangeReason, store::{ fsm::apply::validate_batch_split, metrics::PEER_ADMIN_CMD_COUNTER, util::{self, KeysInfoFormatter}, - PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, + PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, }, Result, }; -use slog::{info, warn, Logger}; +use slog::{error, info, warn, Logger}; use tikv_util::box_err; use crate::{ batch::StoreContext, - fsm::ApplyResReporter, + fsm::{ApplyResReporter, PeerFsmDelegate}, operation::AdminCmdResult, - raft::{Apply, Peer}, - router::ApplyRes, + raft::{write_initial_states, Apply, Peer, Storage}, + router::{ApplyRes, PeerMsg, StoreMsg}, }; #[derive(Debug)] @@ -63,14 +69,23 @@ pub struct SplitResult { pub derived_index: usize, pub tablet_index: u64, } +pub struct SplitInit { + /// Split region + pub region: metapb::Region, + pub check_split: bool, + pub parent_is_leader: bool, + + /// In-memory pessimistic locks that should be inherited from parent region + pub locks: PeerPessimisticLocks, +} impl Peer { pub fn propose_split( &mut self, store_ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ) -> Result { - 
validate_batch_split(req.mut_admin_request(), self.region())?; + validate_batch_split(req.get_admin_request(), self.region())?; // We rely on ConflictChecker to detect conflicts, so no need to set proposal // context. let data = req.write_to_bytes().unwrap(); @@ -247,6 +262,187 @@ impl Apply { } } +impl Peer { + pub fn on_ready_split_region( + &mut self, + store_ctx: &mut StoreContext, + derived_index: usize, + tablet_index: u64, + regions: Vec, + ) { + fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); + + let derived = ®ions[derived_index]; + let derived_epoch = derived.get_region_epoch().clone(); + let region_id = derived.get_id(); + + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let region_locks = { + let mut pessimistic_locks = self.txn_ext().pessimistic_locks.write(); + info!(self.logger, "moving {} locks to new regions", pessimistic_locks.len();); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. + pessimistic_locks.version = derived_epoch.get_version(); + pessimistic_locks.group_by_regions(®ions, derived) + }; + fail_point!("on_split_invalidate_locks"); + + // Roughly estimate the size and keys for new regions. 
+ let new_region_count = regions.len() as u64; + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); + self.set_region( + reader, + derived.clone(), + RegionChangeReason::Split, + tablet_index, + ); + } + + self.post_split(); + + let last_region_id = regions.last().unwrap().get_id(); + for (new_region, locks) in regions.into_iter().zip(region_locks) { + let new_region_id = new_region.get_id(); + if new_region_id == region_id { + continue; + } + + let split_init = PeerMsg::SplitInit(Box::new(SplitInit { + region: new_region, + parent_is_leader: self.is_leader(), + check_split: last_region_id == new_region_id, + locks, + })); + + // First, send init msg to peer directly. Returning error means the peer is not + // existed in which case we should redirect it to the store. + match store_ctx.router.force_send(new_region_id, split_init) { + Ok(_) => {} + Err(SendError(PeerMsg::SplitInit(msg))) => { + store_ctx + .router + .force_send_control(StoreMsg::SplitInit(msg)) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to send split peer intialization msg to store : {:?}", + self.logger.list(), + e + ) + }); + } + _ => unreachable!(), + } + } + } + + pub fn on_split_init( + &mut self, + store_ctx: &mut StoreContext, + split_init: Box, + ) { + let region_id = split_init.region.id; + let replace = split_init.region.get_region_epoch().get_version() + > self + .storage() + .region_state() + .get_region() + .get_region_epoch() + .get_version(); + + if !self.storage().is_initialized() || replace { + let split_temp_path = store_ctx.tablet_factory.tablet_path_with_prefix( + SPLIT_PREFIX, + region_id, + RAFT_INIT_LOG_INDEX, + ); + + let tablet = store_ctx + .tablet_factory + .load_tablet(&split_temp_path, region_id, RAFT_INIT_LOG_INDEX) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to load tablet {:?} :{:?}", + self.logger.list(), + split_temp_path, + e + ) + }); + + self.tablet_mut().set(tablet); + + let storage = 
Storage::with_split( + self.peer().get_store_id(), + &split_init.region, + store_ctx.engine.clone(), + store_ctx.read_scheduler.clone(), + &store_ctx.logger, + ) + .unwrap_or_else(|e| panic!("fail to create storage: {:?}", e)) + .unwrap(); + + let applied_index = storage.apply_state().get_applied_index(); + let peer_id = storage.peer().get_id(); + let raft_cfg = store_ctx.cfg.new_raft_config(peer_id, applied_index); + + let mut raft_group = RawNode::new(&raft_cfg, storage, &self.logger).unwrap(); + // If this region has only one peer and I am the one, campaign directly. + if split_init.region.get_peers().len() == 1 { + raft_group.campaign().unwrap(); + self.set_has_ready(); + } + self.set_raft_group(raft_group); + } else { + // todo: when reaching here (peer is initalized before and cannot be replaced), + // it is much complexer. + return; + } + + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + + info!( + self.logger, + "init split region"; + "region" => ?split_init.region, + ); + + // todo: GlobalReplicationState + + for p in split_init.region.get_peers() { + self.insert_peer_cache(p.clone()); + } + + if split_init.parent_is_leader { + if self.maybe_campaign() { + self.set_has_ready(); + } + + *self.txn_ext().pessimistic_locks.write() = split_init.locks; + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. 
+ self.heartbeat_pd(store_ctx); + } + + meta.tablet_caches.insert(region_id, self.tablet().clone()); + meta.readers + .insert(region_id, self.generate_read_delegate()); + meta.region_read_progress + .insert(region_id, self.read_progress().clone()); + } + + if split_init.check_split { + // todo: check if the last region needs to split again + } + + self.schedule_apply_fsm(store_ctx); + } +} + #[cfg(test)] mod test { use std::sync::{ @@ -260,7 +456,7 @@ mod test { kv::TestTabletFactoryV2, raft, }; - use engine_traits::{CfOptionsExt, Peekable, WriteBatch, ALL_CFS}; + use engine_traits::{CfOptionsExt, Peekable, TabletFactory, WriteBatch, ALL_CFS}; use futures::channel::mpsc::unbounded; use kvproto::{ metapb::RegionEpoch, @@ -419,7 +615,7 @@ mod test { region_state.set_region(region.clone()); region_state.set_tablet_index(5); - let (read_scheduler, rx) = dummy_scheduler(); + let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let mut apply = Apply::new( region @@ -608,7 +804,9 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. 
apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); - assert!(!apply.write_batch_mut().as_ref().unwrap().is_empty()); + assert!(!WriteBatch::is_empty( + apply.write_batch_mut().as_ref().unwrap() + )); splits.mut_requests().clear(); splits .mut_requests() diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 75575e9a19f..81365a162ec 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -39,7 +39,8 @@ use raftstore::{ local_metrics::RaftMetrics, metrics::*, msg::ErrorCallback, - util, WriteCallback, + util::{self, admin_cmd_epoch_lookup}, + WriteCallback, }, Error, Result, }; @@ -58,7 +59,7 @@ mod admin; mod control; mod write; -pub use admin::AdminCmdResult; +pub use admin::{AdminCmdResult, SplitInit, SplitResult}; pub use control::ProposalControl; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; @@ -294,14 +295,21 @@ impl Peer { // region. 
return; } + for admin_res in apply_res.admin_result { match admin_res { AdminCmdResult::ConfChange(conf_change) => { self.on_apply_res_conf_change(conf_change) } + AdminCmdResult::SplitRegion(SplitResult { + regions, + derived_index, + tablet_index, + }) => self.on_ready_split_region(ctx, derived_index, tablet_index, regions), AdminCmdResult::SplitRegion(_) => unimplemented!(), } } + self.raft_group_mut() .advance_apply_to(apply_res.applied_index); self.proposal_control_advance_apply(apply_res.applied_index); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 7be70a9afe7..12c7d4ec544 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -13,16 +13,17 @@ use std::cmp; use batch_system::BasicMailbox; -use crossbeam::channel::TrySendError; +use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; use raftstore::store::{util, ExtraStates, WriteTask}; -use slog::{debug, error, info}; +use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; +use super::command::SplitInit; use crate::{ batch::StoreContext, fsm::{PeerFsm, Store}, @@ -89,6 +90,44 @@ impl DestroyProgress { } impl Store { + /// The method is called during split. + /// The creation process is: + /// 1. create an uninitialized peer if not existed before + /// 2. 
initialize the peer by the information sent from parent peer + #[inline] + pub fn on_split_init( + &mut self, + ctx: &mut StoreContext, + msg: Box, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let region_id = msg.region.id; + let mut raft_msg = Box::new(RaftMessage::default()); + raft_msg.set_region_id(region_id); + raft_msg.set_region_epoch(msg.region.get_region_epoch().clone()); + raft_msg.set_to_peer( + msg.region + .get_peers() + .iter() + .find(|p| p.get_store_id() == self.store_id()) + .unwrap() + .clone(), + ); + + // It will create the peer if it does not exist + self.on_raft_message(ctx, raft_msg); + + if let Err(SendError(m)) = ctx.router.force_send(region_id, PeerMsg::SplitInit(msg)) { + warn!( + self.logger(), + "Split peer is destroyed before sending the intialization msg"; + "split init msg" => ?m, + ) + } + } + /// When a message's recipient doesn't exist, it will be redirected to /// store. Store is responsible for checking if it's neccessary to create /// a peer to handle the message. @@ -174,8 +213,14 @@ impl Store { let mut region = Region::default(); region.set_id(region_id); region.set_region_epoch(from_epoch.clone()); + // Peer list doesn't have to be complete, as it's uninitialized. - region.mut_peers().push(from_peer.clone()); + // + // If the id of the from_peer is INVALID_ID, this msg must be sent from parent + // peer in the split execution in which case we do not add it into the region. + if from_peer.id != raft::INVALID_ID { + region.mut_peers().push(from_peer.clone()); + } region.mut_peers().push(to_peer.clone()); // We don't set the region range here as we allow range conflict. 
let (tx, fsm) = match Storage::uninit( diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index a110f4bf330..84835231398 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -11,4 +11,4 @@ pub use command::{ pub use life::DestroyProgress; pub use ready::{AsyncWriter, GenSnapTask, SnapState}; -pub(crate) use self::query::LocalReader; +pub(crate) use self::{command::SplitInit, query::LocalReader}; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index dd53f47e152..6ebb3ed2056 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -2,21 +2,35 @@ use std::{mem, sync::Arc}; +use collections::HashMap; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb}; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; -use raftstore::store::{ - util::{Lease, RegionReadProgress}, - Config, EntryStorage, ProposalQueue, ReadDelegate, ReadIndexQueue, TrackVer, TxnExt, +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, + store::{ + fsm::Proposal, + util::{Lease, RegionReadProgress}, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TxnExt, + }, + Error, +}; +use slog::{debug, error, info, o, warn, Logger}; +use tikv_util::{ + box_err, + config::ReadableSize, + time::{monotonic_raw_now, Instant as TiInstant}, + worker::Scheduler, + Either, }; -use slog::Logger; -use tikv_util::{box_err, config::ReadableSize}; use time::Timespec; use super::{storage::Storage, Apply}; use crate::{ + batch::StoreContext, fsm::{ApplyFsm, ApplyScheduler}, operation::{AsyncWriter, DestroyProgress, ProposalControl, 
SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, @@ -55,6 +69,8 @@ pub struct Peer { /// region buckets. region_buckets: Option, + last_region_buckets: Option, + /// Transaction extensions related to this peer. txn_ext: Arc, txn_extra_op: Arc>, @@ -76,22 +92,7 @@ impl Peer { let applied_index = storage.apply_state().get_applied_index(); let peer_id = storage.peer().get_id(); - - let raft_cfg = raft::Config { - id: peer_id, - election_tick: cfg.raft_election_timeout_ticks, - heartbeat_tick: cfg.raft_heartbeat_ticks, - min_election_tick: cfg.raft_min_election_timeout_ticks, - max_election_tick: cfg.raft_max_election_timeout_ticks, - max_size_per_msg: cfg.raft_max_size_per_msg.0, - max_inflight_msgs: cfg.raft_max_inflight_msgs, - applied: applied_index, - check_quorum: true, - skip_bcast_commit: true, - pre_vote: cfg.prevote, - max_committed_size_per_ready: ReadableSize::mb(16).0, - ..Default::default() - }; + let raft_cfg = cfg.new_raft_config(peer_id, applied_index); let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); @@ -143,6 +144,7 @@ impl Peer { cfg.renew_leader_lease_advance_duration(), ), region_buckets: None, + last_region_buckets: None, txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), @@ -173,6 +175,63 @@ impl Peer { self.region().get_id() } + /// Set the region of a peer. + /// + /// This will update the region of the peer, caller must ensure the region + /// has been preserved in a durable device. + pub fn set_region( + &mut self, + // host: &CoprocessorHost, + reader: &mut ReadDelegate, + region: metapb::Region, + reason: RegionChangeReason, + tablet_index: u64, + ) { + if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() + { + // Epoch version changed, disable read on the local reader for this region. 
+ self.leader_lease.expire_remote_lease(); + } + + let mut region_state = RegionLocalState::default(); + region_state.set_region(region.clone()); + region_state.set_tablet_index(tablet_index); + region_state.set_state(self.storage().region_state().get_state()); + self.storage_mut().set_region_state(region_state); + + let progress = ReadProgress::region(region); + // Always update read delegate's region to avoid stale region info after a + // follower becoming a leader. + self.maybe_update_read_progress(reader, progress); + + if self.is_leader() { + // Unlike v1, we should renew remote lease if it's leader. This is because v2 + // only provides read in local reader which requires passing the lease check. If + // lease check fails, it sends query to raftstore to make it renew the remote + // lease. However, raftstore will answer immediately if the `bound` in + // `leader_lease` is valid, so the remote lease will not be updated. + if let Some(progress) = self + .leader_lease + .maybe_new_remote_lease(self.term()) + .map(ReadProgress::leader_lease) + { + self.maybe_update_read_progress(reader, progress); + } + } + + // Update leader info + self.read_progress + .update_leader_info(self.leader_id(), self.term(), self.region()); + + { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + + // todo: CoprocessorHost + } + #[inline] pub fn peer(&self) -> &metapb::Peer { self.raft_group.store().peer() @@ -253,6 +312,11 @@ impl Peer { &mut self.raft_group } + #[inline] + pub fn set_raft_group(&mut self, raft_group: RawNode>) { + self.raft_group = raft_group; + } + /// Mark the peer has a ready so it will be checked at the end of every /// processing round. 
#[inline] @@ -394,6 +458,38 @@ impl Peer { self.apply_scheduler = Some(apply_scheduler); } + #[inline] + pub fn post_split(&mut self) { + self.reset_region_buckets(); + } + + pub fn reset_region_buckets(&mut self) { + if self.region_buckets.is_some() { + self.last_region_buckets = self.region_buckets.take(); + } + } + + pub fn maybe_campaign(&mut self) -> bool { + if self.region().get_peers().len() <= 1 { + // The peer campaigned when it was created, no need to do it again. + return false; + } + + // If last peer is the leader of the region before split, it's intuitional for + // it to become the leader of new split region. + let _ = self.raft_group.campaign(); + true + } + + #[inline] + pub fn txn_ext(&self) -> &Arc { + &self.txn_ext + } + + pub fn heartbeat_pd(&self, store_ctx: &StoreContext) { + // todo + } + pub fn generate_read_delegate(&self) -> ReadDelegate { let peer_id = self.peer().get_id(); diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 8abeeeef73d..01285cc5a46 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -200,6 +200,49 @@ impl Storage { .map(Some) } + /// Creates a new storage for split peer. + /// + /// Except for region local state which uses the `region` provided with the + /// inital tablet index, all uses the inital states. 
+ pub fn with_split( + store_id: u64, + region: &metapb::Region, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result>> { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region.clone()); + region_state.set_state(PeerState::Normal); + region_state.set_tablet_index(RAFT_INIT_LOG_INDEX); + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + read_scheduler, + true, + logger, + ) + .map(Some) + } + fn create( store_id: u64, region_state: RegionLocalState, diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index cda9e971c66..a4681d8a873 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -15,6 +15,7 @@ use super::{ }, ApplyRes, }; +use crate::operation::SplitInit; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -129,6 +130,8 @@ pub enum PeerMsg { SnapshotGenerated(GenSnapRes), /// Start the FSM. Start, + /// Messages from peer to peer in the same store + SplitInit(Box), /// A message only used to notify a peer. Noop, /// A message that indicates an asynchronous write has finished. 
@@ -167,6 +170,9 @@ impl fmt::Debug for PeerMsg { }, PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), + PeerMsg::SplitInit(_) => { + write!(fmt, "Split initialization") + } PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { peer_id, @@ -187,6 +193,7 @@ impl fmt::Debug for PeerMsg { pub enum StoreMsg { RaftMessage(Box), + SplitInit(Box), Tick(StoreTick), Start, } @@ -195,6 +202,7 @@ impl fmt::Debug for StoreMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + StoreMsg::SplitInit(_) => write!(fmt, "Split initialization"), StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), StoreMsg::Start => write!(fmt, "Start store"), } diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 6ac567907af..d99c982fc97 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -21,13 +21,14 @@ use engine_test::{ use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; use futures::executor::block_on; use kvproto::{ - metapb::Store, + metapb::{self, RegionEpoch, Store}, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; use pd_client::RpcClient; use raftstore::store::{ - region_meta::RegionMeta, Config, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + region_meta::{RegionLocalState, RegionMeta}, + Config, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, }; use raftstore_v2::{ create_store_batch_system, @@ -147,6 +148,32 @@ impl TestRouter { req.mut_header().set_term(meta.raft_status.hard_state.term); req } + + pub fn region_detail(&self, region_id: u64) -> metapb::Region { + let RegionLocalState { + id, + start_key, + end_key, + epoch, + peers, + .. 
+ } = self + .must_query_debug_info(region_id, Duration::from_secs(1)) + .unwrap() + .region_state; + let mut region = metapb::Region::default(); + region.set_id(id); + region.set_start_key(start_key); + region.set_end_key(end_key); + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(epoch.conf_ver); + region_epoch.set_version(epoch.version); + region.set_region_epoch(region_epoch); + for peer in peers { + region.mut_peers().push(new_peer(peer.store_id, peer.id)); + } + region + } } pub struct RunningState { diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 740e64f7e29..4fb9ebcc323 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -12,4 +12,5 @@ mod test_basic_write; mod test_conf_change; mod test_life; mod test_read; +mod test_split; mod test_status; diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs new file mode 100644 index 00000000000..97487a5d0c2 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -0,0 +1,183 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{thread, time::Duration}; + +use futures::executor::block_on; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, + }, +}; +use raftstore_v2::router::PeerMsg; +use tikv_util::store::new_peer; + +use crate::cluster::{Cluster, TestRouter}; + +fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, +) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req +} + +fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { + let (msg, sub) = PeerMsg::raft_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + // todo: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. 
+ thread::sleep(Duration::from_secs(1)); +} + +fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { + let mut req = router.new_request_for(region_id); + + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(key.to_vec()); + put_req.mut_put().set_value(b"v1".to_vec()); + req.mut_requests().push(put_req); + + let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + router.send(region_id, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + block_on(sub.result()).unwrap() +} + +// Split the region according to the parameters +// return the updated original region +fn split_region( + router: &mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + left_key: &[u8], + right_key: &[u8], + split_key: &[u8], + right_derive: bool, +) -> (metapb::Region, metapb::Region) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = + new_batch_split_region_request(vec![split_key.to_vec()], vec![split_id], right_derive); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + must_split(region_id, req, router); + + let (left, right) = if !right_derive { + ( + router.region_detail(region_id), + router.region_detail(split_region_id), + ) + } else { + ( + router.region_detail(split_region_id), + router.region_detail(region_id), + ) + }; + + // The end key of left region is `split_key` + // So writing `right_key` will fail + let resp = put(router, left.id, right_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + // But `left_key` should succeed + let 
resp = put(router, left.id, left_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Mirror of above case + let resp = put(router, right.id, left_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, right_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + assert_eq!(left.get_end_key(), split_key); + assert_eq!(right.get_start_key(), split_key); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + (left, right) +} + +#[test] +fn test_split() { + let cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let mut router = cluster.router(0); + // let factory = cluster.node(0).tablet_factory(); + + let region_id = 2; + let peer = new_peer(store_id, 3); + let region = router.region_detail(region_id); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Region 2 ["", ""] peer(1, 3) + // -> Region 2 ["", "k22"] peer(1, 3) + // Region 1000 ["k22", ""] peer(1, 10) + let (left, right) = split_region( + &mut router, + region, + peer.clone(), + 1000, + new_peer(store_id, 10), + b"k11", + b"k33", + b"k22", + false, + ); + + // Region 2 ["", "k22"] peer(1, 3) + // -> Region 2 ["", "k11"] peer(1, 3) + // Region 1001 ["k11", "k22"] peer(1, 11) + let _ = split_region( + &mut router, + left, + peer, + 1001, + new_peer(store_id, 11), + b"k00", + b"k11", + b"k11", + false, + ); + + // Region 1000 ["k22", ""] peer(1, 10) + // -> Region 1000 ["k22", "k33"] peer(1, 10) + // Region 1002 ["k33", ""] peer(1, 12) + let _ = split_region( + &mut router, + right, + new_peer(store_id, 10), + 1002, + new_peer(store_id, 12), + b"k22", + b"k33", + b"k33", + false, + ); +} diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index cbd83d0b85d..70cf6b67d1f 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -438,6 +438,24 @@ 
impl Config { Config::default() } + pub fn new_raft_config(&self, peer_id: u64, applied_index: u64) -> raft::Config { + raft::Config { + id: peer_id, + election_tick: self.raft_election_timeout_ticks, + heartbeat_tick: self.raft_heartbeat_ticks, + min_election_tick: self.raft_min_election_timeout_ticks, + max_election_tick: self.raft_max_election_timeout_ticks, + max_size_per_msg: self.raft_max_size_per_msg.0, + max_inflight_msgs: self.raft_max_inflight_msgs, + applied: applied_index, + check_quorum: true, + skip_bcast_commit: true, + pre_vote: self.prevote, + max_committed_size_per_ready: ReadableSize::mb(16).0, + ..Default::default() + } + } + pub fn raft_store_max_leader_lease(&self) -> TimeDuration { TimeDuration::from_std(self.raft_store_max_leader_lease.0).unwrap() } From a80ab9880d6c25fe9b962e582846360dd229783b Mon Sep 17 00:00:00 2001 From: YangKeao Date: Mon, 14 Nov 2022 21:57:55 -0500 Subject: [PATCH 331/676] copr: fix _ pattern in like behavior for old collation (#13785) close tikv/tikv#13769 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- .../src/codec/collation/charset.rs | 8 + .../src/codec/collation/mod.rs | 28 +++ components/tidb_query_expr/src/impl_like.rs | 185 +++++++++++++++--- components/tidb_query_expr/src/lib.rs | 47 ++++- 4 files changed, 233 insertions(+), 35 deletions(-) diff --git a/components/tidb_query_datatype/src/codec/collation/charset.rs b/components/tidb_query_datatype/src/codec/collation/charset.rs index 482e19cb999..9ea76f16b92 100644 --- a/components/tidb_query_datatype/src/codec/collation/charset.rs +++ b/components/tidb_query_datatype/src/codec/collation/charset.rs @@ -22,6 +22,10 @@ impl Charset for CharsetBinary { Some((data[0], 1)) } } + + fn charset() -> crate::Charset { + crate::Charset::Binary + } } pub struct CharsetUtf8mb4; @@ -48,6 +52,10 @@ impl Charset for CharsetUtf8mb4 { }) } } + + fn charset() -> crate::Charset { + crate::Charset::Utf8Mb4 + } } // gbk character data actually stored with utf8mb4 character 
encoding. diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index cdc21cbe35a..9fbef4f1ee2 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -41,6 +41,32 @@ macro_rules! match_template_collator { }} } +#[macro_export] +macro_rules! match_template_multiple_collators { + ((), (), $($tail:tt)*) => { + $($tail)* + }; + (($first:tt), ($match_exprs:tt), $($tail:tt)*) => { + match_template_multiple_collators! { + ($first,), ($match_exprs,), $($tail)* + } + }; + (($first:tt, $($t:tt)*), ($first_match_expr:tt, $($match_exprs:tt)*), $($tail:tt)*) => {{ + #[allow(unused_imports)] + use $crate::codec::collation::collator::*; + + match_template_collator! { + $first, match $first_match_expr { + Collation::$first => { + match_template_multiple_collators! { + ($($t)*), ($($match_exprs)*), $($tail)* + } + } + } + } + }}; +} + #[macro_export] macro_rules! match_template_charset { ($t:tt, $($tail:tt)*) => {{ @@ -67,6 +93,8 @@ pub trait Charset { fn validate(bstr: &[u8]) -> Result<()>; fn decode_one(data: &[u8]) -> Option<(Self::Char, usize)>; + + fn charset() -> crate::Charset; } pub trait Collator: 'static + std::marker::Send + std::marker::Sync + std::fmt::Debug { diff --git a/components/tidb_query_expr/src/impl_like.rs b/components/tidb_query_expr/src/impl_like.rs index 39dce827650..2fe99017fe0 100644 --- a/components/tidb_query_expr/src/impl_like.rs +++ b/components/tidb_query_expr/src/impl_like.rs @@ -6,17 +6,21 @@ use tidb_query_datatype::codec::{collation::*, data_type::*}; #[rpn_fn] #[inline] -pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> Result> { +pub fn like( + target: BytesRef, + pattern: BytesRef, + escape: &i64, +) -> Result> { let escape = *escape as u32; // current search positions in pattern and target. let (mut px, mut tx) = (0, 0); // positions for backtrace. 
let (mut next_px, mut next_tx) = (0, 0); while px < pattern.len() || tx < target.len() { - if let Some((c, mut poff)) = C::Charset::decode_one(&pattern[px..]) { + if let Some((c, mut poff)) = CS::decode_one(&pattern[px..]) { let code: u32 = c.into(); if code == '_' as u32 { - if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + if let Some((_, toff)) = CS::decode_one(&target[tx..]) { px += poff; tx += toff; continue; @@ -26,7 +30,7 @@ pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> R next_px = px; px += poff; next_tx = tx; - next_tx += if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + next_tx += if let Some((_, toff)) = CS::decode_one(&target[tx..]) { toff } else { 1 @@ -35,13 +39,13 @@ pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> R } else { if code == escape && px + poff < pattern.len() { px += poff; - poff = if let Some((_, off)) = C::Charset::decode_one(&pattern[px..]) { + poff = if let Some((_, off)) = CS::decode_one(&pattern[px..]) { off } else { break; } } - if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + if let Some((_, toff)) = CS::decode_one(&target[tx..]) { if let Ok(std::cmp::Ordering::Equal) = C::sort_compare(&target[tx..tx + toff], &pattern[px..px + poff]) { @@ -154,20 +158,6 @@ mod tests { Collation::Binary, Some(0), ), - ( - r#"夏威夷吉他"#, - r#"_____"#, - '\\', - Collation::Binary, - Some(0), - ), - ( - r#"🐶🍐🍳➕🥜🎗🐜"#, - r#"_______"#, - '\\', - Collation::Utf8Mb4Bin, - Some(1), - ), ( r#"IpHONE"#, r#"iPhone"#, @@ -182,14 +172,6 @@ mod tests { Collation::Utf8Mb4GeneralCi, Some(1), ), - (r#"🕺_"#, r#"🕺🕺🕺_"#, '🕺', Collation::Binary, Some(0)), - ( - r#"🕺_"#, - r#"🕺🕺🕺_"#, - '🕺', - Collation::Utf8Mb4GeneralCi, - Some(1), - ), (r#"baab"#, r#"b_%b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), (r#"baab"#, r#"b%_b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), (r#"bab"#, r#"b_%b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), @@ -238,4 +220,151 @@ mod tests { ); } } + + #[test] + fn 
test_like_wide_character() { + let cases = vec![ + ( + r#"夏威夷吉他"#, + r#"_____"#, + '\\', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🐶🍐🍳➕🥜🎗🐜"#, + r#"_______"#, + '\\', + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Utf8Mb4GeneralCi, + Collation::Utf8Mb4GeneralCi, + Collation::Utf8Mb4GeneralCi, + Some(1), + ), + // When the new collation framework is not enabled, the collation + // will always be binary Some related tests are added here + ( + r#"夏威夷吉他"#, + r#"_____"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🐶🍐🍳➕🥜🎗🐜"#, + r#"_______"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + // Will not match, because '_' matches only one byte. + ( + r#"测试"#, + r#"测_"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Binary, + Some(0), + ), + // Both of them should be decoded with binary charset, so that we'll + // compare byte with byte, but not comparing a long character with a + // byte. + ( + r#"测试"#, + r#"测%"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Binary, + Some(1), + ), + // This can happen when the new collation is not enabled, and TiDB + // doesn't push down the collation information. Using binary + // comparing order is fine, but we'll need to decode strings with + // their own charset (so '_' could match single character, rather + // than single byte). 
+ ( + r#"测试"#, + r#"测_"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ]; + for (target, pattern, escape, collation, target_collation, pattern_collation, expected) in + cases + { + let output = RpnFnScalarEvaluator::new() + .return_field_type( + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .collation(collation) + .build(), + ) + .push_param_with_field_type( + target.to_owned().into_bytes(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .collation(target_collation), + ) + .push_param_with_field_type( + pattern.to_owned().into_bytes(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .collation(pattern_collation), + ) + .push_param(escape as i64) + .evaluate(ScalarFuncSig::LikeSig) + .unwrap(); + assert_eq!( + output, expected, + "target={}, pattern={}, escape={}", + target, pattern, escape + ); + } + } } diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 8bb1cc05480..5a25fe343d1 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -44,8 +44,12 @@ pub mod impl_time; use tidb_query_common::Result; use tidb_query_datatype::{ - codec::data_type::*, match_template_charset, match_template_collator, Charset, Collation, - FieldTypeAccessor, FieldTypeFlag, + codec::{ + collation::{Charset as _, Collator}, + data_type::*, + }, + match_template_charset, match_template_collator, match_template_multiple_collators, Charset, + Collation, FieldTypeAccessor, FieldTypeFlag, }; use tipb::{Expr, FieldType, ScalarFuncSig}; @@ -91,10 +95,39 @@ fn map_compare_in_string_sig(ret_field_type: &FieldType) -> Result { }) } -fn map_like_sig(ret_field_type: &FieldType) -> Result { - Ok(match_template_collator! { - TT, match ret_field_type.as_accessor().collation().map_err(tidb_query_datatype::codec::Error::from)? 
{ - Collation::TT => like_fn_meta::() +fn map_like_sig(ret_field_type: &FieldType, children: &[Expr]) -> Result { + let ret_collation = ret_field_type + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + let target_collation = children[0] + .get_field_type() + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + let pattern_collation = children[1] + .get_field_type() + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + + // If the target charset is the same with pattern charset, and is Utf8mb4, + // use their charset to decode bytes. If not, use the charset pushed down in + // the ret_field type to decode the bytes. + // + // This behavior is for the compatibility and correctness: The TiDB doesn't + // push down the collation information when the new collation framework is + // not enabled, and always use the binary collation. However, the `_` + // pattern considers not only the order of strings, but also the number of + // characters. Some characters more than 1 bytes cannot be matched by `_` if + // the new collation framework is not enabled. + Ok(match_template_multiple_collators! 
{ + (TT, TC, PC), (ret_collation, target_collation, pattern_collation), { + if ::Charset::charset() == ::Charset::charset() { + like_fn_meta::::Charset>() + } else { + like_fn_meta::::Charset>() + } } }) } @@ -596,7 +629,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::JsonKeys2ArgsSig => json_keys_fn_meta(), ScalarFuncSig::JsonQuoteSig => json_quote_fn_meta(), // impl_like - ScalarFuncSig::LikeSig => map_like_sig(ft)?, + ScalarFuncSig::LikeSig => map_like_sig(ft, children)?, // impl_regexp ScalarFuncSig::RegexpSig => map_regexp_like_sig(ft)?, ScalarFuncSig::RegexpUtf8Sig => map_regexp_like_sig(ft)?, From 68da60482057b2ee85c81afd731a73fa47f152ac Mon Sep 17 00:00:00 2001 From: lizhenhuan <1916038084@qq.com> Date: Tue, 15 Nov 2022 11:57:54 +0800 Subject: [PATCH 332/676] Push Json_valid to tikv (#13572) close tikv/tikv#13571 Signed-off-by: lizhenhuan <1916038084@qq.com> Co-authored-by: Ti Chi Robot --- components/tidb_query_expr/src/impl_json.rs | 57 +++++++++++++++++++++ components/tidb_query_expr/src/lib.rs | 3 ++ 2 files changed, 60 insertions(+) diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 1926cc648e0..0c905b7458c 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -204,6 +204,31 @@ fn quote(bytes: BytesRef) -> Result> { Ok(Some(result)) } +#[rpn_fn(nullable, raw_varg, min_args = 1, max_args = 1)] +#[inline] +fn json_valid(args: &[ScalarValueRef]) -> Result> { + assert_eq!(args.len(), 1); + let received_et = args[0].eval_type(); + let r = match args[0].to_owned().is_none() { + true => None, + _ => match received_et { + EvalType::Json => args[0].as_json().and(Some(1)), + EvalType::Bytes => match args[0].as_bytes() { + Some(p) => { + let tmp_str = + std::str::from_utf8(p).map_err(tidb_query_datatype::codec::Error::from)?; + let json: serde_json::error::Result = serde_json::from_str(tmp_str); + Some(json.is_ok() as Int) + 
} + _ => Some(0), + }, + _ => Some(0), + }, + }; + + Ok(r) +} + #[rpn_fn] #[inline] fn json_unquote(arg: BytesRef) -> Result> { @@ -826,6 +851,38 @@ mod tests { } } + #[test] + fn test_json_valid() { + let cases: Vec<(Vec, Option)> = vec![ + ( + vec![Some(Json::from_str(r#"{"a":1}"#).unwrap()).into()], + Some(1), + ), + (vec![Some(b"hello".to_vec()).into()], Some(0)), + (vec![Some(b"\"hello\"".to_vec()).into()], Some(1)), + (vec![Some(b"null".to_vec()).into()], Some(1)), + (vec![Some(Json::from_str(r#"{}"#).unwrap()).into()], Some(1)), + (vec![Some(Json::from_str(r#"[]"#).unwrap()).into()], Some(1)), + (vec![Some(b"2".to_vec()).into()], Some(1)), + (vec![Some(b"2.5".to_vec()).into()], Some(1)), + (vec![Some(b"2019-8-19".to_vec()).into()], Some(0)), + (vec![Some(b"\"2019-8-19\"".to_vec()).into()], Some(1)), + (vec![Some(2).into()], Some(0)), + (vec![Some(2.5).into()], Some(0)), + (vec![None::.into()], None), + (vec![None::.into()], None), + (vec![None::.into()], None), + ]; + + for (vargs, expected) in cases { + let output = RpnFnScalarEvaluator::new() + .push_params(vargs.clone()) + .evaluate(ScalarFuncSig::JsonValidJsonSig) + .unwrap(); + assert_eq!(output, expected, "{:?}", vargs); + } + } + #[test] fn test_json_contains() { let cases: Vec<(Vec, Option)> = vec![ diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 5a25fe343d1..43b0602ebbb 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -628,6 +628,9 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::JsonKeysSig => json_keys_fn_meta(), ScalarFuncSig::JsonKeys2ArgsSig => json_keys_fn_meta(), ScalarFuncSig::JsonQuoteSig => json_quote_fn_meta(), + ScalarFuncSig::JsonValidJsonSig => json_valid_fn_meta(), + ScalarFuncSig::JsonValidStringSig => json_valid_fn_meta(), + ScalarFuncSig::JsonValidOthersSig => json_valid_fn_meta(), // impl_like ScalarFuncSig::LikeSig => map_like_sig(ft, children)?, // impl_regexp 
From 4407cb1b02474e7aebea0c5feb45aee01ecf42d5 Mon Sep 17 00:00:00 2001 From: xiongjiwei Date: Tue, 15 Nov 2022 13:35:55 +0800 Subject: [PATCH 333/676] txn: add a new field `txn_source` in write and lock (#13777) ref tikv/tikv#13779 This PR is the starting preparation of BDR, and only adds the ability to parse and serialize `txn_source` Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- .../raftstore-v2/src/operation/command/mod.rs | 1 + components/raftstore/src/store/fsm/apply.rs | 1 + components/raftstore/src/store/peer.rs | 1 + components/raftstore/src/store/util.rs | 1 + components/resolved_ts/src/cmd.rs | 1 + components/txn_types/src/lock.rs | 52 ++++++++++++++++--- components/txn_types/src/write.rs | 33 ++++++++++++ src/storage/mvcc/mod.rs | 10 ++++ src/storage/mvcc/reader/reader.rs | 1 + src/storage/mvcc/txn.rs | 1 + src/storage/txn/actions/commit.rs | 20 ++++++- src/storage/txn/actions/prewrite.rs | 15 +++++- src/storage/txn/actions/tests.rs | 37 +++++++++++++ src/storage/txn/commands/prewrite.rs | 40 +++++++++++++- src/storage/txn/store.rs | 1 + tests/benches/hierarchy/mvcc/mod.rs | 2 + tests/benches/hierarchy/txn/mod.rs | 2 + 18 files changed, 210 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9aa43209906..2622ed983f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2694,7 +2694,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#65d0ae8fa853c1e41b43f329afbf60616bdd4d18" +source = "git+https://github.com/pingcap/kvproto.git#29a30c4ef9c52aafb1b1da73dd9df60857068114" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 81365a162ec..5d308986229 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -461,6 +461,7 @@ impl Apply { AdminCmdType::VerifyHash 
=> unimplemented!(), AdminCmdType::PrepareFlashback => unimplemented!(), AdminCmdType::FinishFlashback => unimplemented!(), + AdminCmdType::BatchSwitchWitness => unimplemented!(), AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 8cb7f58baca..45eadb0b89f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1629,6 +1629,7 @@ where AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } + AdminCmdType::BatchSwitchWitness => Err(box_err!("unsupported admin command type")), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index ff55597b30e..9614161739a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -5726,6 +5726,7 @@ mod tests { AdminCmdType::TransferLeader, AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, + AdminCmdType::BatchSwitchWitness, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index b2180a8420d..5f78065d32b 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -226,6 +226,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { AdminCmdEpochState::new(true, true, false, false) } + AdminCmdType::BatchSwitchWitness => unimplemented!(), } } diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index a1468e15bab..d3bda563a4f 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs 
@@ -419,6 +419,7 @@ mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_put(k1.clone(), b"v4".to_vec()), &None, diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 3e666c29e40..28df70677a5 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -34,6 +34,7 @@ const MIN_COMMIT_TS_PREFIX: u8 = b'c'; const ASYNC_COMMIT_PREFIX: u8 = b'a'; const ROLLBACK_TS_PREFIX: u8 = b'r'; const LAST_CHANGE_PREFIX: u8 = b'l'; +const TXN_SOURCE_PREFIX: u8 = b's'; impl LockType { pub fn from_mutation(mutation: &Mutation) -> Option { @@ -92,6 +93,10 @@ pub struct Lock { /// The number of versions that need skipping from the latest version to /// find the latest PUT/DELETE record pub versions_to_last_change: u64, + /// The source of this txn. It is used by ticdc, if the value is 0 ticdc + /// will sync the kv change event to downstream, if it is not 0, ticdc + /// may ignore this change event. 
+ pub txn_source: u8, } impl std::fmt::Debug for Lock { @@ -117,6 +122,7 @@ impl std::fmt::Debug for Lock { .field("rollback_ts", &self.rollback_ts) .field("last_change_ts", &self.last_change_ts) .field("versions_to_last_change", &self.versions_to_last_change) + .field("txn_source", &self.txn_source) .finish() } } @@ -146,6 +152,7 @@ impl Lock { rollback_ts: Vec::default(), last_change_ts: TimeStamp::zero(), versions_to_last_change: 0, + txn_source: 0, } } @@ -173,6 +180,13 @@ impl Lock { self } + #[inline] + #[must_use] + pub fn set_txn_source(mut self, source: u8) -> Self { + self.txn_source = source; + self + } + pub fn to_bytes(&self) -> Vec { let mut b = Vec::with_capacity(self.pre_allocate_size()); b.push(self.lock_type.to_u8()); @@ -215,6 +229,10 @@ impl Lock { b.encode_u64(self.last_change_ts.into_inner()).unwrap(); b.encode_var_u64(self.versions_to_last_change).unwrap(); } + if self.txn_source != 0 { + b.push(TXN_SOURCE_PREFIX); + b.push(self.txn_source); + } b } @@ -247,6 +265,9 @@ impl Lock { if !self.last_change_ts.is_zero() { size += 1 + size_of::() + MAX_VAR_U64_LEN; } + if self.txn_source != 0 { + size += 2; + } size } @@ -285,6 +306,7 @@ impl Lock { let mut rollback_ts = Vec::new(); let mut last_change_ts = TimeStamp::zero(); let mut versions_to_last_change = 0; + let mut txn_source = 0; while !b.is_empty() { match b.read_u8()? { SHORT_VALUE_PREFIX => { @@ -322,6 +344,9 @@ impl Lock { last_change_ts = number::decode_u64(&mut b)?.into(); versions_to_last_change = number::decode_var_u64(&mut b)?; } + TXN_SOURCE_PREFIX => { + txn_source = b.read_u8()?; + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. 
@@ -339,7 +364,8 @@ impl Lock { txn_size, min_commit_ts, ) - .set_last_change(last_change_ts, versions_to_last_change); + .set_last_change(last_change_ts, versions_to_last_change) + .set_txn_source(txn_source); if use_async_commit { lock = lock.use_async_commit(secondaries); } @@ -365,7 +391,8 @@ impl Lock { info.set_use_async_commit(self.use_async_commit); info.set_min_commit_ts(self.min_commit_ts.into_inner()); info.set_secondaries(self.secondaries.into()); - // The client does not care about last_change_ts and versions_to_last_version. + // The client does not care about last_change_ts, versions_to_last_version and + // txn_source. info } @@ -743,6 +770,18 @@ mod tests { 8.into(), ) .set_last_change(4.into(), 2), + Lock::new( + LockType::Lock, + b"pk".to_vec(), + 1.into(), + 10, + None, + 6.into(), + 16, + 8.into(), + ) + .set_last_change(4.into(), 2) + .set_txn_source(1), ]; for (i, lock) in locks.drain(..).enumerate() { let v = lock.to_bytes(); @@ -997,7 +1036,7 @@ mod tests { min_commit_ts: TimeStamp(127), use_async_commit: true, \ secondaries: [7365636F6E646172795F6B31, 7365636F6E646172795F6B6B6B6B6B32, \ 7365636F6E646172795F6B336B336B336B336B336B33, 7365636F6E646172795F6B34], rollback_ts: [], \ - last_change_ts: TimeStamp(80), versions_to_last_change: 4 }" + last_change_ts: TimeStamp(80), versions_to_last_change: 4, txn_source: 0 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -1007,7 +1046,7 @@ mod tests { "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, \ short_value: ?, for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ use_async_commit: true, secondaries: [?, ?, ?, ?], rollback_ts: [], \ - last_change_ts: TimeStamp(80), versions_to_last_change: 4 }" + last_change_ts: TimeStamp(80), versions_to_last_change: 4, txn_source: 0 }" ); lock.short_value = None; @@ -1017,7 +1056,7 @@ mod tests { "Lock { lock_type: Put, primary_key: 706B, start_ts: TimeStamp(100), 
ttl: 3, short_value: , \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ - versions_to_last_change: 4 }" + versions_to_last_change: 4, txn_source: 0 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -1027,7 +1066,7 @@ mod tests { "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, short_value: ?, \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ - versions_to_last_change: 4 }" + versions_to_last_change: 4, txn_source: 0 }" ); } @@ -1056,6 +1095,7 @@ mod tests { rollback_ts: vec![], last_change_ts: 8.into(), versions_to_last_change: 2, + txn_source: 0, }; assert_eq!(pessimistic_lock.to_lock(), expected_lock); assert_eq!(pessimistic_lock.into_lock(), expected_lock); diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 0c0994640d2..6c46688defa 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -30,6 +30,8 @@ const FLAG_OVERLAPPED_ROLLBACK: u8 = b'R'; const GC_FENCE_PREFIX: u8 = b'F'; const LAST_CHANGE_PREFIX: u8 = b'l'; +const TXN_SOURCE_PREFIX: u8 = b'S'; + /// The short value for rollback records which are protected from being /// collapsed. const PROTECTED_ROLLBACK_SHORT_VALUE: &[u8] = b"p"; @@ -157,6 +159,8 @@ pub struct Write { /// The number of versions that need skipping from this record /// to find the latest PUT/DELETE record pub versions_to_last_change: u64, + /// The source of this txn. 
+ pub txn_source: u8, } impl std::fmt::Debug for Write { @@ -178,6 +182,7 @@ impl std::fmt::Debug for Write { .field("gc_fence", &self.gc_fence) .field("last_change_ts", &self.last_change_ts) .field("versions_to_last_change", &self.versions_to_last_change) + .field("txn_source", &self.txn_source) .finish() } } @@ -194,6 +199,7 @@ impl Write { gc_fence: None, last_change_ts: TimeStamp::zero(), versions_to_last_change: 0, + txn_source: 0, } } @@ -213,6 +219,7 @@ impl Write { gc_fence: None, last_change_ts: TimeStamp::zero(), versions_to_last_change: 0, + txn_source: 0, } } @@ -239,6 +246,13 @@ impl Write { self } + #[inline] + #[must_use] + pub fn set_txn_source(mut self, source: u8) -> Self { + self.txn_source = source; + self + } + #[inline] pub fn parse_type(mut b: &[u8]) -> Result { let write_type_bytes = b @@ -257,6 +271,7 @@ impl Write { gc_fence: self.gc_fence, last_change_ts: self.last_change_ts, versions_to_last_change: self.versions_to_last_change, + txn_source: self.txn_source, } } @@ -307,6 +322,8 @@ pub struct WriteRef<'a> { /// The number of versions that need skipping from this record /// to find the latest PUT/DELETE record pub versions_to_last_change: u64, + /// The source of this txn. + pub txn_source: u8, } impl WriteRef<'_> { @@ -326,6 +343,7 @@ impl WriteRef<'_> { let mut gc_fence = None; let mut last_change_ts = TimeStamp::zero(); let mut versions_to_last_change = 0; + let mut txn_source = 0; while !b.is_empty() { match b @@ -354,6 +372,11 @@ impl WriteRef<'_> { last_change_ts = number::decode_u64(&mut b)?.into(); versions_to_last_change = number::decode_var_u64(&mut b)?; } + TXN_SOURCE_PREFIX => { + txn_source = b + .read_u8() + .map_err(|_| Error::from(ErrorInner::BadFormatWrite))? + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. 
@@ -370,6 +393,7 @@ impl WriteRef<'_> { gc_fence, last_change_ts, versions_to_last_change, + txn_source, }) } @@ -394,6 +418,10 @@ impl WriteRef<'_> { b.encode_u64(self.last_change_ts.into_inner()).unwrap(); b.encode_var_u64(self.versions_to_last_change).unwrap(); } + if self.txn_source != 0 { + b.push(TXN_SOURCE_PREFIX); + b.push(self.txn_source); + } b } @@ -409,6 +437,9 @@ impl WriteRef<'_> { if !self.last_change_ts.is_zero() { size += 1 + size_of::() + MAX_VAR_U64_LEN; } + if self.txn_source != 0 { + size += 2; + } size } @@ -458,6 +489,7 @@ impl WriteRef<'_> { ) .set_overlapped_rollback(self.has_overlapped_rollback, self.gc_fence) .set_last_change(self.last_change_ts, self.versions_to_last_change) + .set_txn_source(self.txn_source) } } @@ -517,6 +549,7 @@ mod tests { Write::new(WriteType::Put, 456.into(), Some(b"short_value".to_vec())) .set_overlapped_rollback(true, Some(421397468076048385.into())), Write::new(WriteType::Lock, 456.into(), None).set_last_change(345.into(), 11), + Write::new(WriteType::Lock, 456.into(), None).set_txn_source(1), ]; for (i, write) in writes.drain(..).enumerate() { let v = write.as_ref().to_bytes(); diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 6191c2ad46d..997cde71020 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -706,6 +706,16 @@ pub mod tests { assert_eq!(ts, commit_ts.into()); } + pub fn must_get_txn_source(engine: &mut E, key: &[u8], ts: u64, txn_source: u8) { + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = SnapshotReader::new(TimeStamp::from(ts), snapshot, true); + let write = reader + .get_write(&Key::from_raw(key), TimeStamp::from(ts)) + .unwrap() + .unwrap(); + assert_eq!(write.txn_source, txn_source); + } + pub fn must_get_commit_ts_none( engine: &mut E, key: &[u8], diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 8e35e00936e..2fe95c2c1dd 100644 --- a/src/storage/mvcc/reader/reader.rs +++ 
b/src/storage/mvcc/reader/reader.rs @@ -852,6 +852,7 @@ pub mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a73f8b99027..66aa769d462 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -767,6 +767,7 @@ pub(crate) mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index 2ba4f527d0e..1b8018e2aad 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -92,7 +92,8 @@ pub fn commit( reader.start_ts, lock.short_value.take(), ) - .set_last_change(lock.last_change_ts, lock.versions_to_last_change); + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source); for ts in &lock.rollback_ts { if *ts == commit_ts { @@ -117,7 +118,8 @@ pub mod tests { #[cfg(test)] use crate::storage::txn::tests::{ must_acquire_pessimistic_lock_for_large_txn, must_prewrite_delete, must_prewrite_lock, - must_prewrite_put, must_prewrite_put_for_large_txn, must_prewrite_put_impl, must_rollback, + must_prewrite_put, must_prewrite_put_for_large_txn, must_prewrite_put_impl, + must_prewrite_put_with_txn_soucre, must_rollback, }; #[cfg(test)] use crate::storage::{ @@ -350,4 +352,18 @@ pub mod tests { assert!(write.last_change_ts.is_zero()); assert_eq!(write.versions_to_last_change, 0); } + + #[test] + fn test_2pc_with_txn_source() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k = b"k"; + // WriteType is Put + must_prewrite_put_with_txn_soucre(&mut engine, k, b"v2", k, 25, 1); + let lock = must_locked(&mut engine, k, 25); + assert_eq!(lock.txn_source, 1); + must_succeed(&mut engine, k, 25, 30); + let write = must_written(&mut engine, k, 25, 30, WriteType::Put); + assert_eq!(write.txn_source, 1); + } 
} diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 4c13a9d244b..48caa3795af 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -172,6 +172,7 @@ pub struct TransactionProperties<'a> { pub need_old_value: bool, pub is_retry_request: bool, pub assertion_level: AssertionLevel, + pub txn_source: u8, } impl<'a> TransactionProperties<'a> { @@ -453,7 +454,8 @@ impl<'a> PrewriteMutation<'a> { self.txn_props.for_update_ts(), self.txn_props.txn_size, self.min_commit_ts, - ); + ) + .set_txn_source(self.txn_props.txn_source); // Only Lock needs to record `last_change_ts` in its write record, Put or Delete // records themselves are effective changes. if tls_can_enable(LAST_CHANGE_TS) && self.lock_type == Some(LockType::Lock) { @@ -795,6 +797,7 @@ pub mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } @@ -821,6 +824,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } @@ -1133,6 +1137,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_check_not_exists(Key::from_raw(key)), &None, @@ -1165,6 +1170,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; // calculated commit_ts = 43 ≤ 50, ok let (_, old_value) = prewrite( @@ -1215,6 +1221,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; // calculated commit_ts = 43 ≤ 50, ok let (_, old_value) = prewrite( @@ -1324,6 +1331,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let cases = vec![ @@ -1384,6 +1392,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: 
AssertionLevel::Off, + txn_source: 0, }; let cases: Vec<_> = vec![ @@ -1655,6 +1664,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let snapshot = engine.snapshot(Default::default()).unwrap(); let cm = ConcurrencyManager::new(start_ts); @@ -1709,6 +1719,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let snapshot = engine.snapshot(Default::default()).unwrap(); let cm = ConcurrencyManager::new(start_ts); @@ -1850,6 +1861,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let (_, old_value) = prewrite( &mut txn, @@ -1886,6 +1898,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let (_, old_value) = prewrite( &mut txn, diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index fdf060d950d..79d31a08c9c 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -52,6 +52,7 @@ pub fn must_prewrite_put_impl( assertion_level, false, None, + 0, ); } @@ -90,6 +91,7 @@ pub fn must_prewrite_insert_impl( assertion_level, true, None, + 0, ); } @@ -111,8 +113,10 @@ pub fn must_prewrite_put_impl_with_should_not_exist( assertion_level: AssertionLevel, should_not_exist: bool, region_id: Option, + txn_source: u32, ) { let mut ctx = Context::default(); + ctx.set_txn_source(txn_source); if let Some(region_id) = region_id { ctx.region_id = region_id; } @@ -154,6 +158,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( need_old_value: false, is_retry_request, assertion_level, + txn_source: txn_source as u8, }, mutation, secondary_keys, @@ -215,6 +220,37 @@ pub fn must_prewrite_put_on_region( AssertionLevel::Off, false, Some(region_id), + 0, + ); +} + +pub fn must_prewrite_put_with_txn_soucre( + engine: &mut E, + key: &[u8], 
+ value: &[u8], + pk: &[u8], + ts: impl Into, + txn_source: u32, +) { + must_prewrite_put_impl_with_should_not_exist( + engine, + key, + value, + pk, + &None, + ts.into(), + SkipPessimisticCheck, + 0, + TimeStamp::default(), + 0, + TimeStamp::default(), + TimeStamp::default(), + false, + Assertion::None, + AssertionLevel::Off, + false, + None, + txn_source, ); } @@ -422,6 +458,7 @@ fn default_txn_props( need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 6b54a1f88db..542c60819b5 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -508,6 +508,7 @@ impl Prewriter { need_old_value: extra_op == ExtraOp::ReadOldValue, is_retry_request: self.ctx.is_retry_request, assertion_level: self.assertion_level, + txn_source: self.ctx.get_txn_source() as u8, }; let async_commit_pk = self @@ -849,7 +850,8 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { txn.start_ts, lock.short_value, ) - .set_last_change(lock.last_change_ts, lock.versions_to_last_change); + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source); // Transactions committed with 1PC should be impossible to overwrite rollback // records. 
txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); @@ -1073,6 +1075,42 @@ mod tests { assert_eq!(d.internal_delete_skipped_count, 0); } + #[test] + fn test_prewrite_1pc_with_txn_source() { + use crate::storage::mvcc::tests::{must_get, must_get_commit_ts, must_unlocked}; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let key = b"k"; + let value = b"v"; + let mutations = vec![Mutation::make_put(Key::from_raw(key), value.to_vec())]; + + let mut statistics = Statistics::default(); + let mut ctx = Context::default(); + ctx.set_txn_source(1); + let cmd = Prewrite::new( + mutations, + key.to_vec(), + TimeStamp::from(10), + 0, + false, + 0, + TimeStamp::default(), + TimeStamp::from(15), + None, + true, + AssertionLevel::Off, + ctx, + ); + prewrite_command(&mut engine, cm, &mut statistics, cmd).unwrap(); + + must_unlocked(&mut engine, key); + must_get(&mut engine, key, 12, value); + must_get_commit_ts(&mut engine, key, 10, 11); + must_get_txn_source(&mut engine, key, 11, 1); + } + #[test] fn test_prewrite_1pc() { use crate::storage::mvcc::tests::{must_get, must_get_commit_ts, must_unlocked}; diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 9a38979c71b..46879d38e9f 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -726,6 +726,7 @@ mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_put(Key::from_raw(key), key.to_vec()), &None, diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index f57946a11cf..20740b4cb16 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -47,6 +47,7 @@ where need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, @@ -97,6 +98,7 @@ fn mvcc_prewrite>(b: &mut Bencher<'_>, config: &B 
need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, diff --git a/tests/benches/hierarchy/txn/mod.rs b/tests/benches/hierarchy/txn/mod.rs index 0bdb7ae8870..404266e2c6f 100644 --- a/tests/benches/hierarchy/txn/mod.rs +++ b/tests/benches/hierarchy/txn/mod.rs @@ -43,6 +43,7 @@ where need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, @@ -90,6 +91,7 @@ fn txn_prewrite>(b: &mut Bencher<'_>, config: &Be need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, From 6b240c9e6ebbb4a35bc0309ffbeb9a9b293a6aa8 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 15 Nov 2022 17:19:56 +0800 Subject: [PATCH 334/676] *: update toolchain (#13797) ref tikv/tikv#12842 Latest Rust stabilizes GAT, which is required for async func in trait without allocation. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- cmd/tikv-ctl/src/executor.rs | 2 +- cmd/tikv-ctl/src/main.rs | 6 +- .../backup-stream/src/metadata/client.rs | 2 +- components/backup/src/endpoint.rs | 2 +- components/backup/src/writer.rs | 4 +- components/batch-system/src/router.rs | 2 +- components/causal_ts/benches/tso.rs | 6 +- components/causal_ts/src/lib.rs | 1 - components/causal_ts/src/tso.rs | 2 +- components/cdc/src/endpoint.rs | 6 +- components/cdc/src/old_value.rs | 6 +- components/codec/src/byte.rs | 2 +- .../concurrency_manager/benches/lock_table.rs | 1 - .../concurrency_manager/src/lock_table.rs | 13 +-- components/encryption/src/crypter.rs | 2 +- .../encryption/src/encrypted_file/mod.rs | 4 +- components/encryption/src/manager/mod.rs | 10 +-- components/engine_panic/src/lib.rs | 1 - components/engine_rocks/src/lib.rs | 1 - components/engine_rocks/src/util.rs | 2 +- components/engine_test/src/lib.rs | 6 +- components/engine_traits/src/lib.rs | 1 - components/error_code/bin.rs | 2 +- 
.../external_storage/export/src/export.rs | 6 +- components/external_storage/src/hdfs.rs | 2 +- components/file_system/src/io_stats/proc.rs | 2 +- components/file_system/src/lib.rs | 8 +- components/raft_log_engine/src/engine.rs | 2 +- components/raft_log_engine/src/lib.rs | 1 - components/raftstore-v2/src/fsm/peer.rs | 2 +- components/raftstore-v2/src/lib.rs | 2 +- components/raftstore-v2/src/operation/life.rs | 2 +- .../tests/integrations/test_basic_write.rs | 2 +- .../tests/integrations/test_life.rs | 4 +- components/raftstore/src/lib.rs | 2 +- components/raftstore/src/store/config.rs | 10 +-- components/raftstore/src/store/fsm/apply.rs | 6 +- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 4 +- components/raftstore/src/store/msg.rs | 1 + components/raftstore/src/store/snap.rs | 24 +++--- components/raftstore/src/store/snap/io.rs | 2 +- .../src/store/worker/split_controller.rs | 2 +- .../resolved_ts/tests/integrations/mod.rs | 2 +- .../resource_metering/src/recorder/mod.rs | 4 +- components/security/src/lib.rs | 2 +- components/server/src/server.rs | 6 +- components/server/src/signal_handler.rs | 2 +- components/sst_importer/src/import_file.rs | 10 +-- components/sst_importer/src/sst_importer.rs | 2 +- components/test_backup/src/lib.rs | 2 +- .../test_raftstore/src/transport_simulate.rs | 2 +- components/test_util/src/runner.rs | 4 +- .../tidb_query_aggr/src/impl_max_min.rs | 6 +- .../tidb_query_codegen/src/rpn_function.rs | 2 +- .../tidb_query_datatype/src/codec/convert.rs | 14 +-- .../src/codec/mysql/decimal.rs | 16 ++-- .../src/codec/mysql/duration.rs | 4 +- .../src/codec/mysql/json/binary.rs | 2 +- .../src/codec/mysql/time/extension.rs | 2 +- .../tidb_query_datatype/src/codec/overflow.rs | 6 +- .../tidb_query_datatype/src/codec/table.rs | 2 +- .../tidb_query_datatype/src/expr/ctx.rs | 4 +- .../src/simple_aggr_executor.rs | 2 +- .../src/top_n_executor.rs | 4 +- .../tidb_query_expr/src/impl_arithmetic.rs | 34 
++++---- components/tidb_query_expr/src/impl_cast.rs | 16 ++-- .../tidb_query_expr/src/impl_compare.rs | 8 +- components/tidb_query_expr/src/impl_math.rs | 4 +- components/tidb_query_expr/src/impl_op.rs | 4 +- components/tidb_query_expr/src/impl_time.rs | 8 +- components/tikv_kv/src/lib.rs | 1 - components/tikv_util/src/buffer_vec.rs | 86 ++++++++++--------- components/tikv_util/src/codec/bytes.rs | 2 +- components/tikv_util/src/config.rs | 28 +++--- components/tikv_util/src/lib.rs | 2 +- components/tikv_util/src/logger/file_log.rs | 2 +- components/tikv_util/src/sys/cgroup.rs | 30 +++---- components/tikv_util/src/sys/inspector.rs | 2 +- components/tikv_util/src/sys/thread.rs | 4 +- components/txn_types/src/types.rs | 2 +- fuzz/cli.rs | 18 ++-- rust-toolchain | 2 +- scripts/clippy | 6 +- src/config.rs | 34 +++----- src/coprocessor/endpoint.rs | 43 ++++------ src/coprocessor/statistics/analyze.rs | 4 +- src/coprocessor/statistics/histogram.rs | 2 +- src/coprocessor_v2/plugin_registry.rs | 10 +-- src/lib.rs | 3 +- src/read_pool.rs | 5 +- src/server/debug.rs | 2 +- src/server/engine_factory_v2.rs | 4 +- src/server/gc_worker/gc_manager.rs | 2 +- src/server/gc_worker/gc_worker.rs | 2 +- src/server/lock_manager/waiter_manager.rs | 4 +- src/server/node.rs | 2 +- src/server/service/diagnostics/log.rs | 8 +- src/server/service/diagnostics/sys.rs | 2 +- src/server/status_server/profile.rs | 4 +- src/storage/config.rs | 2 +- src/storage/txn/scheduler.rs | 2 +- .../misc/coprocessor/codec/mysql/json/mod.rs | 2 +- tests/failpoints/cases/test_snap.rs | 10 +-- tests/integrations/backup/mod.rs | 2 +- .../integrations/config/dynamic/raftstore.rs | 2 +- .../integrations/config/test_config_client.rs | 2 +- tests/integrations/coprocessor/test_select.rs | 2 +- 108 files changed, 330 insertions(+), 354 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index aa2f604b547..80915dbc564 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ 
b/cmd/tikv-ctl/src/executor.rs @@ -866,7 +866,7 @@ impl DebugExecutor for Debugger { self.region_size(region, cfs) .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) .into_iter() - .map(|(cf, size)| (cf.to_owned(), size as usize)) + .map(|(cf, size)| (cf.to_owned(), size)) .collect() } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index ce39c121300..be5069397e4 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -68,7 +68,7 @@ fn main() { cfg }, |path| { - let s = fs::read_to_string(&path).unwrap(); + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }, ); @@ -169,7 +169,7 @@ fn main() { .unwrap(); let iv = Iv::from_slice(&file_info.iv).unwrap(); - let f = File::open(&infile).unwrap(); + let f = File::open(infile).unwrap(); let mut reader = DecrypterReader::new(f, mthd, &file_info.key, iv).unwrap(); io::copy(&mut reader, &mut outf).unwrap(); @@ -333,7 +333,7 @@ fn main() { let to_data_dir = to_data_dir.as_deref(); let to_host = to_host.as_deref(); let to_config = to_config.map_or_else(TikvConfig::default, |path| { - let s = fs::read_to_string(&path).unwrap(); + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }); debug_executor.diff_region(region, to_host, to_data_dir, &to_config, mgr); diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 2ebf553e1cb..b7f1fcb2025 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -546,7 +546,7 @@ impl MetadataClient { )) .await?; - let mut result = Vec::with_capacity(all.len() as usize + 1); + let mut result = Vec::with_capacity(all.len() + 1); if !prev.kvs.is_empty() { let kv = &mut prev.kvs[0]; if kv.value() > start_key.as_slice() { diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 92131381017..db6ff331d7f 100644 --- a/components/backup/src/endpoint.rs +++ 
b/components/backup/src/endpoint.rs @@ -1279,7 +1279,7 @@ pub mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 4e0750bd7d8..7a853fe485f 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -443,7 +443,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) + .cfs([engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) .build() .unwrap(); let db = rocks.get_rocksdb(); @@ -480,7 +480,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index b863f1535f0..bfcb93c9d6b 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -322,7 +322,7 @@ where for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } - BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed()) as f64); + BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed())); } /// Try to notify all FSMs that the cluster is being shutdown. 
diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs index 72d381a4be7..f7e1980d15f 100644 --- a/components/causal_ts/benches/tso.rs +++ b/components/causal_ts/benches/tso.rs @@ -19,11 +19,7 @@ fn bench_batch_tso_list_pop(c: &mut Criterion) { batch_list.flush(); for i in 0..CAPACITY { batch_list - .push( - batch_size as u32, - TimeStamp::compose(i as u64, batch_size), - false, - ) + .push(batch_size as u32, TimeStamp::compose(i, batch_size), false) .unwrap(); } }, diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 3eb59f35c36..ab57fbf734f 100644 --- a/components/causal_ts/src/lib.rs +++ b/components/causal_ts/src/lib.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -#![feature(map_first_last)] // For `BTreeMap::pop_first`. #![feature(div_duration)] #[macro_use] diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 5056cfe2ebd..e63c3c2c3ba 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -712,7 +712,7 @@ pub mod tests { for (i, (remain, usage, need_flush, expected)) in cases.into_iter().enumerate() { let batch_list = Arc::new(TsoBatchList { inner: Default::default(), - tso_remain: AtomicI32::new(remain as i32), + tso_remain: AtomicI32::new(remain), tso_usage: AtomicU32::new(usage), capacity: cache_multiplier, }); diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 4086c8623b5..4b6bbad6d35 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1012,11 +1012,7 @@ impl, E: KvEngine> Endpoint { let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let raft_router = self.raft_router.clone(); - let regions: Vec = self - .capture_regions - .iter() - .map(|(region_id, _)| *region_id) - .collect(); + let regions: Vec = self.capture_regions.keys().copied().collect(); let cm: ConcurrencyManager = 
self.concurrency_manager.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; let causal_ts_provider = self.causal_ts_provider.clone(); diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 1149d8ce3e0..37e2781b766 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -341,8 +341,8 @@ mod tests { old_value_cache.cache.insert(key, value.clone()); } - assert_eq!(old_value_cache.cache.size(), size * cases as usize); - assert_eq!(old_value_cache.cache.len(), cases as usize); + assert_eq!(old_value_cache.cache.size(), size * cases); + assert_eq!(old_value_cache.cache.len(), cases); assert_eq!(old_value_cache.capacity(), capacity as usize); // Reduces capacity. @@ -360,7 +360,7 @@ mod tests { assert_eq!(old_value_cache.cache.size(), size * remaining_count); assert_eq!(old_value_cache.cache.len(), remaining_count); - assert_eq!(old_value_cache.capacity(), new_capacity as usize); + assert_eq!(old_value_cache.capacity(), new_capacity); for i in dropped_count..cases { let key = Key::from_raw(&i.to_be_bytes()); assert_eq!(old_value_cache.cache.get(&key).is_some(), true); diff --git a/components/codec/src/byte.rs b/components/codec/src/byte.rs index aa7baba9e75..8b5fd928edf 100644 --- a/components/codec/src/byte.rs +++ b/components/codec/src/byte.rs @@ -759,7 +759,7 @@ mod tests { for (exp, encoded) in cases { let mut path = env::temp_dir(); path.push("read-compact-codec-file"); - fs::write(&path, &encoded).unwrap(); + fs::write(&path, encoded).unwrap(); let f = File::open(&path).unwrap(); let mut rdr = BufReader::new(f); let decoded = rdr.read_compact_bytes().unwrap(); diff --git a/components/concurrency_manager/benches/lock_table.rs b/components/concurrency_manager/benches/lock_table.rs index f2d4a9b92c9..52c9bea960a 100644 --- a/components/concurrency_manager/benches/lock_table.rs +++ b/components/concurrency_manager/benches/lock_table.rs @@ -1,7 +1,6 @@ // Copyright 2021 TiKV 
Project Authors. Licensed under Apache-2.0. #![feature(test)] -#![feature(bench_black_box)] use std::{borrow::Cow, hint::black_box, mem::forget}; diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index bf7a224aa28..ad013a863a1 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -158,9 +158,9 @@ mod test { assert_eq!(counter.load(Ordering::SeqCst), 100); } - fn ts_check(lock: &Lock, ts: u64) -> Result<(), Lock> { + fn ts_check(lock: &Lock, ts: u64) -> Result<(), Box> { if lock.ts.into_inner() < ts { - Err(lock.clone()) + Err(Box::new(lock.clone())) } else { Ok(()) } @@ -193,7 +193,10 @@ mod test { lock_table.check_key(&key_k, |l| ts_check(l, 5)).unwrap(); // lock does not pass check_fn - assert_eq!(lock_table.check_key(&key_k, |l| ts_check(l, 20)), Err(lock)); + assert_eq!( + lock_table.check_key(&key_k, |l| ts_check(l, 20)), + Err(Box::new(lock)) + ); } #[tokio::test] @@ -247,13 +250,13 @@ mod test { // first lock does not pass check_fn assert_eq!( lock_table.check_range(Some(&Key::from_raw(b"a")), None, |_, l| ts_check(l, 25)), - Err(lock_k) + Err(Box::new(lock_k)) ); // first lock passes check_fn but the second does not assert_eq!( lock_table.check_range(None, None, |_, l| ts_check(l, 15)), - Err(lock_l) + Err(Box::new(lock_l)) ); } diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index 13286e416c9..7379b8a32a3 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -275,7 +275,7 @@ mod tests { let crypter = AesGcmCrypter::new(&key, iv); let (ciphertext, gcm_tag) = crypter.encrypt(&pt).unwrap(); assert_eq!(ciphertext, ct, "{}", hex::encode(&ciphertext)); - assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(&gcm_tag.0)); + assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(gcm_tag.0)); let plaintext = crypter.decrypt(&ct, gcm_tag).unwrap(); 
assert_eq!(plaintext, pt, "{}", hex::encode(&plaintext)); diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index 57b5527b7bf..9c76b857c70 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -64,7 +64,7 @@ impl<'a> EncryptedFile<'a> { let start = Instant::now(); // Write to a tmp file. // TODO what if a tmp file already exists? - let origin_path = self.base.join(&self.name); + let origin_path = self.base.join(self.name); let mut tmp_path = origin_path.clone(); tmp_path.set_extension(format!("{}.{}", thread_rng().next_u64(), TMP_FILE_SUFFIX)); let mut tmp_file = OpenOptions::new() @@ -92,7 +92,7 @@ impl<'a> EncryptedFile<'a> { // Replace old file with the tmp file aomticlly. rename(tmp_path, origin_path)?; - let base_dir = File::open(&self.base)?; + let base_dir = File::open(self.base)?; base_dir.sync_all()?; ENCRYPT_DECRPTION_FILE_HISTOGRAM diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0f78e794629..0f3233d7819 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -815,7 +815,7 @@ mod tests { } fn new_mock_backend() -> Box { - Box::new(MockBackend::default()) + Box::::default() } fn new_key_manager_def( @@ -829,7 +829,7 @@ mod tests { } match DataKeyManager::new_previous_loaded( master_backend, - Box::new(MockBackend::default()), + Box::::default(), args, ) { Ok(None) => panic!("expected encryption"), @@ -932,7 +932,7 @@ mod tests { let manager = new_key_manager( &tmp_dir, Some(EncryptionMethod::Aes256Ctr), - Box::new(PlaintextBackend::default()), + Box::::default(), new_mock_backend() as Box, ); manager.err().unwrap(); @@ -1301,13 +1301,13 @@ mod tests { encrypt_fail: false, ..MockBackend::default() }); - let previous = Box::new(PlaintextBackend::default()) as Box; + let previous = Box::::default() as Box; let result = 
new_key_manager(&tmp_dir, None, wrong_key, previous); // When the master key is invalid, the key manager left a empty file dict and // return errors. assert!(result.is_err()); - let previous = Box::new(PlaintextBackend::default()) as Box; + let previous = Box::::default() as Box; new_key_manager(&tmp_dir, None, right_key, previous).unwrap(); } diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 70c7f00ece8..93555f5ba5f 100644 --- a/components/engine_panic/src/lib.rs +++ b/components/engine_panic/src/lib.rs @@ -9,7 +9,6 @@ //! with your engine's own name; then fill in the implementations; remove //! the allow(unused) attribute; -#![feature(generic_associated_types)] #![allow(unused)] mod cf_names; diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index c1e23dac4a6..b6f3e36146c 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -16,7 +16,6 @@ //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #[allow(unused_extern_crates)] extern crate tikv_alloc; diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index f749f78851c..778e16c1a67 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -150,7 +150,7 @@ pub fn db_exist(path: &str) -> bool { // If path is not an empty directory but db has not been created, // `DB::list_column_families` fails and we can clean up the directory by // this indication. - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } /// Returns a Vec of cf which is in `a' but not in `b'. diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index b460e97d4ce..ae834457757 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -55,6 +55,8 @@ //! 
storage engines, and that it be extracted into its own crate for use in //! TiKV, once the full requirements are better understood. +#![feature(let_chains)] + /// Types and constructors for the "raft" engine pub mod raft { #[cfg(feature = "test-engine-raft-panic")] @@ -345,7 +347,7 @@ pub mod kv { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); // When the full directory path does not exsit, create will return error and in // this case, we just ignore it. - let _ = std::fs::File::create(&path); + let _ = std::fs::File::create(path); { let mut reg = self.registry.lock().unwrap(); if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { @@ -384,7 +386,7 @@ pub mod kv { } let db_path = self.tablet_path(region_id, suffix); - std::fs::rename(path, &db_path)?; + std::fs::rename(path, db_path)?; self.open_tablet( region_id, Some(suffix), diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 29351636694..b9cf8847751 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -251,7 +251,6 @@ #![cfg_attr(test, feature(test))] #![feature(min_specialization)] #![feature(assert_matches)] -#![feature(generic_associated_types)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/error_code/bin.rs b/components/error_code/bin.rs index ba6a21ac6fa..8f1ad087355 100644 --- a/components/error_code/bin.rs +++ b/components/error_code/bin.rs @@ -18,7 +18,7 @@ fn main() { storage::ALL_ERROR_CODES.iter(), ]; let path = Path::new("./etc/error_code.toml"); - let mut f = fs::File::create(&path).unwrap(); + let mut f = fs::File::create(path).unwrap(); err_codes .into_iter() .flatten() diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index 3cba0eaad8b..a36f3eba11e 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ 
-186,7 +186,9 @@ fn create_backend_inner( Backend::Hdfs(hdfs) => { Box::new(HdfsStorage::new(&hdfs.remote, backend_config.hdfs_config)?) } - Backend::Noop(_) => Box::new(NoopStorage::default()) as Box, + Backend::Noop(_) => { + Box::::default() as Box + } #[cfg(feature = "cloud-aws")] Backend::S3(config) => { let mut s = S3Storage::from_input(config.clone())?; @@ -355,7 +357,7 @@ impl ExternalStorage for EncryptedExternalStorage { compression_reader_dispatcher(compression_type, inner)? }; let file_writer: &mut dyn Write = - &mut self.key_manager.create_file_for_write(&restore_name)?; + &mut self.key_manager.create_file_for_write(restore_name)?; let min_read_speed: usize = 8192; let mut input = encrypt_wrap_reader(file_crypter, reader)?; diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index 53574633c73..a9fa65dcdcf 100644 --- a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -101,7 +101,7 @@ impl ExternalStorage for HdfsStorage { } cmd_with_args.extend([&cmd_path, "dfs", "-put", "-", path]); info!("calling hdfs"; "cmd" => ?cmd_with_args); - let mut hdfs_cmd = Command::new(&cmd_with_args[0]) + let mut hdfs_cmd = Command::new(cmd_with_args[0]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 60c8cac9c36..51c74ae56a8 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -225,7 +225,7 @@ mod tests { .write(true) .create(true) .custom_flags(O_DIRECT) - .open(&file_path) + .open(file_path) .unwrap(); let w = vec![A512::default(); 8]; let base_local_bytes = id.fetch_io_bytes().unwrap(); diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 36acbc65a91..058b2a3a5f9 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -426,7 +426,7 @@ 
pub fn reserve_space_for_recover>(data_dir: P, file_size: u64) -> delete_file_if_exist(&path)?; } fn do_reserve(dir: &Path, path: &Path, file_size: u64) -> io::Result<()> { - let f = File::create(&path)?; + let f = File::create(path)?; f.allocate(file_size)?; f.sync_all()?; sync_dir(dir) @@ -483,7 +483,7 @@ mod tests { // Ensure it works for non-existent file. let non_existent_file = dir_path.join("non_existent_file"); - get_file_size(&non_existent_file).unwrap_err(); + get_file_size(non_existent_file).unwrap_err(); } #[test] @@ -504,7 +504,7 @@ mod tests { assert_eq!(file_exists(&existent_file), true); let non_existent_file = dir_path.join("non_existent_file"); - assert_eq!(file_exists(&non_existent_file), false); + assert_eq!(file_exists(non_existent_file), false); } #[test] @@ -525,7 +525,7 @@ mod tests { assert_eq!(file_exists(&existent_file), false); let non_existent_file = dir_path.join("non_existent_file"); - delete_file_if_exist(&non_existent_file).unwrap(); + delete_file_if_exist(non_existent_file).unwrap(); } fn gen_rand_file>(path: P, size: usize) -> u32 { diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 1da553cb22e..35cacf620fc 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -311,7 +311,7 @@ impl RaftLogEngine { if !path.exists() || !path.is_dir() { return false; } - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } pub fn raft_groups(&self) -> Vec { diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 6156771afa8..8eda4e5ae24 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -16,7 +16,6 @@ //! Please read the engine_trait crate docs before hacking. 
#![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #[macro_use] extern crate tikv_util; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 5abdcf31f0f..6fac2d88db0 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -187,7 +187,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.store_ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(send_time.saturating_elapsed()) as f64); + .observe(duration_to_sec(send_time.saturating_elapsed())); } fn on_tick(&mut self, tick: PeerTick) { diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 2f30ee9873d..15dd6b4afc1 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -22,7 +22,7 @@ // using a standalone modules. #![allow(unused)] -#![feature(let_else)] +#![feature(let_chains)] #![feature(array_windows)] mod batch; diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 12c7d4ec544..58628637159 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -104,7 +104,7 @@ impl Store { ER: RaftEngine, { let region_id = msg.region.id; - let mut raft_msg = Box::new(RaftMessage::default()); + let mut raft_msg = Box::::default(); raft_msg.set_region_id(region_id); raft_msg.set_region_epoch(msg.region.get_region_epoch().clone()); raft_msg.set_to_peer( diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index 7c8bdb369a1..fc23e46e12f 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -96,7 +96,7 @@ fn test_basic_write() { ); // Make it step down and follower should reject write. 
- let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(2); msg.set_to_peer(new_peer(1, 3)); msg.mut_region_epoch().set_conf_ver(INIT_EPOCH_CONF_VER); diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index e905e7e4ac2..ed0ebcc9b8a 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -71,7 +71,7 @@ fn test_life_by_message() { assert_peer_not_exist(test_region_id, test_peer_id, &router); // Build a correct message. - let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(test_region_id); msg.set_to_peer(new_peer(1, test_peer_id)); msg.mut_region_epoch().set_conf_ver(1); @@ -147,7 +147,7 @@ fn test_destroy_by_larger_id() { let test_region_id = 4; let test_peer_id = 6; let init_term = 5; - let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(test_region_id); msg.set_to_peer(new_peer(1, test_peer_id)); msg.mut_region_epoch().set_conf_ver(1); diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 7b968af3c6a..e56678edec2 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,7 +6,7 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] -#![feature(let_else)] +#![feature(let_chains)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 70cf6b67d1f..454cf61a4c8 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -575,7 +575,7 @@ impl Config { let election_timeout = self.raft_base_tick_interval.as_millis() * self.raft_election_timeout_ticks as u64; - let lease = self.raft_store_max_leader_lease.as_millis() as u64; + let lease = 
self.raft_store_max_leader_lease.as_millis(); if election_timeout < lease { return Err(box_err!( "election timeout {} ms is less than lease {} ms", @@ -584,7 +584,7 @@ impl Config { )); } - let tick = self.raft_base_tick_interval.as_millis() as u64; + let tick = self.raft_base_tick_interval.as_millis(); if lease > election_timeout - tick { return Err(box_err!( "lease {} ms should not be greater than election timeout {} ms - 1 tick({} ms)", @@ -598,7 +598,7 @@ impl Config { return Err(box_err!("raftstore.merge-check-tick-interval can't be 0.")); } - let stale_state_check = self.peer_stale_state_check_interval.as_millis() as u64; + let stale_state_check = self.peer_stale_state_check_interval.as_millis(); if stale_state_check < election_timeout * 2 { return Err(box_err!( "peer stale state check interval {} ms is less than election timeout x 2 {} ms", @@ -613,7 +613,7 @@ impl Config { )); } - let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis() as u64; + let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis(); if abnormal_leader_missing < stale_state_check { return Err(box_err!( "abnormal leader missing {} ms is less than peer stale state check interval {} ms", @@ -622,7 +622,7 @@ impl Config { )); } - let max_leader_missing = self.max_leader_missing_duration.as_millis() as u64; + let max_leader_missing = self.max_leader_missing_duration.as_millis(); if max_leader_missing < abnormal_leader_missing { return Err(box_err!( "max leader missing {} ms is less than abnormal leader missing {} ms", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 45eadb0b89f..bd582d1c24a 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -691,7 +691,7 @@ where } let elapsed = t.saturating_elapsed(); - STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed) as f64); + 
STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed)); for mut inspector in std::mem::take(&mut self.pending_latency_inspect) { inspector.record_apply_process(elapsed); inspector.finish(); @@ -5861,7 +5861,7 @@ mod tests { } } let sst_path = import_dir.path().join("test.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); @@ -5892,7 +5892,7 @@ mod tests { } } let sst_path = import_dir.path().join("test2.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 8c7ef17cfa6..63bb878838c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -694,7 +694,7 @@ where .raft_metrics .event_time .peer_msg - .observe(duration_to_sec(timer.saturating_elapsed()) as f64); + .observe(duration_to_sec(timer.saturating_elapsed())); } #[inline] diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 0f172b6c70f..28c0db02eee 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -699,7 +699,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .raft_metrics .event_time .get(tick.tag()) - .observe(duration_to_sec(elapsed) as f64); + .observe(duration_to_sec(elapsed)); slow_log!( elapsed, "[store {}] handle timeout {:?}", @@ -767,7 +767,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .raft_metrics .event_time .store_msg - .observe(duration_to_sec(timer.saturating_elapsed()) as f64); + 
.observe(duration_to_sec(timer.saturating_elapsed())); } fn start(&mut self, store: metapb::Store) { diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 262f9fd64c5..a4c6c435741 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -719,6 +719,7 @@ pub struct InspectedRaftMessage { } /// Message that can be sent to a peer. +#[allow(clippy::large_enum_variant)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 9995582f13c..8ca5b26d02b 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -557,7 +557,7 @@ impl Snapshot { for (i, file_path) in file_paths.iter().enumerate() { if cf_file.size[i] > 0 { let path = Path::new(file_path); - let file = File::open(&path)?; + let file = File::open(path)?; cf_file .file_for_sending .push(Box::new(file) as Box); @@ -600,7 +600,7 @@ impl Snapshot { let f = OpenOptions::new() .write(true) .create_new(true) - .open(&file_path)?; + .open(file_path)?; cf_file.file_for_recving.push(CfFileForRecving { file: f, encrypter: None, @@ -788,7 +788,7 @@ impl Snapshot { if !for_send && !plain_file_used(cf_file.cf) { sst_importer::prepare_sst_for_ingestion( file_path, - &Path::new(&clone_file_paths[i]), + Path::new(&clone_file_paths[i]), self.mgr.encryption_key_manager.as_deref(), )?; } @@ -972,7 +972,7 @@ impl Snapshot { } else { // delete snapshot files according to meta file for clone_file_path in clone_file_paths { - delete_file_if_exist(&clone_file_path).unwrap(); + delete_file_if_exist(clone_file_path).unwrap(); } } @@ -983,7 +983,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file, gen_tmp_file_name); } else { for tmp_file_path in tmp_file_paths { - delete_file_if_exist(&tmp_file_path).unwrap(); + 
delete_file_if_exist(tmp_file_path).unwrap(); } } } @@ -994,7 +994,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file); } else { for file_path in &file_paths { - delete_file_if_exist(&file_path).unwrap(); + delete_file_if_exist(file_path).unwrap(); } if let Some(ref mgr) = self.mgr.encryption_key_manager { for file_path in &file_paths { @@ -1047,7 +1047,7 @@ impl Snapshot { snap_data.set_version(SNAPSHOT_VERSION); snap_data.set_meta(self.meta_file.meta.as_ref().unwrap().clone()); - SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed()) as f64); + SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed())); SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_count as f64); SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); info!( @@ -1115,7 +1115,7 @@ impl Snapshot { || (cf_file .file_paths() .iter() - .all(|file_path| file_exists(&Path::new(file_path)))) + .all(|file_path| file_exists(Path::new(file_path)))) }) && file_exists(&self.meta_file.path) } @@ -1184,7 +1184,7 @@ impl Snapshot { let tmp_paths = cf_file.tmp_file_paths(); let paths = cf_file.file_paths(); for (i, tmp_path) in tmp_paths.iter().enumerate() { - file_system::rename(&tmp_path, &paths[i])?; + file_system::rename(tmp_path, &paths[i])?; } } sync_dir(&self.dir_path)?; @@ -1488,7 +1488,7 @@ impl SnapManager { "{}_{}{}{}", DEL_RANGE_PREFIX, sst_id, SST_FILE_SUFFIX, TMP_FILE_SUFFIX ); - let path = PathBuf::from(&self.core.base).join(&filename); + let path = PathBuf::from(&self.core.base).join(filename); path.to_str().unwrap().to_string() } @@ -1802,7 +1802,7 @@ impl SnapManagerCore { } r?; } else { - file_system::rename(&tmp_file_path, &file_paths[i])?; + file_system::rename(tmp_file_path, &file_paths[i])?; } let file = Path::new(&file_paths[i]); let (checksum, size) = calc_checksum_and_size(file, mgr)?; @@ -1957,7 +1957,7 @@ impl TabletSnapManager { pub fn get_tablet_checkpointer_path(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}", 
SNAP_GEN_PREFIX, key); - PathBuf::from(&self.base).join(&prefix) + PathBuf::from(&self.base).join(prefix) } } diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 31bf3156c58..3cdee1e40f1 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -152,7 +152,7 @@ where Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); box_try!(old_writer.finish()); - box_try!(File::open(&prev_path).and_then(|f| f.sync_all())); + box_try!(File::open(prev_path).and_then(|f| f.sync_all())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index a211a8f0a60..7e00daa2764 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -361,7 +361,7 @@ impl RegionInfo { if n == 0 || self.key_ranges.len() < self.sample_num { self.key_ranges.push(key_range); } else { - let j = rand::thread_rng().gen_range(0..n) as usize; + let j = rand::thread_rng().gen_range(0..n); if j < self.sample_num { self.key_ranges[j] = key_range; } diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index da28758a5d2..7802108b92b 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -63,7 +63,7 @@ fn test_resolved_ts_basic() { sst_epoch.set_conf_ver(1); sst_epoch.set_version(4); - let (mut meta, data) = gen_sst_file(&sst_path, sst_range); + let (mut meta, data) = gen_sst_file(sst_path, sst_range); meta.set_region_id(r1.id); meta.set_region_epoch(sst_epoch); diff --git a/components/resource_metering/src/recorder/mod.rs b/components/resource_metering/src/recorder/mod.rs index 9ed6acfb74f..f0b2e88ee4e 100644 --- 
a/components/resource_metering/src/recorder/mod.rs +++ b/components/resource_metering/src/recorder/mod.rs @@ -303,8 +303,8 @@ pub fn init_recorder( ) { let recorder = RecorderBuilder::default() .precision_ms(precision_ms) - .add_sub_recorder(Box::new(CpuRecorder::default())) - .add_sub_recorder(Box::new(SummaryRecorder::default())) + .add_sub_recorder(Box::::default()) + .add_sub_recorder(Box::::default()) .build(); let mut recorder_worker = WorkerBuilder::new("resource-metering-recorder") .pending_capacity(256) diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index c0be3ba276b..cc87469426c 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -317,7 +317,7 @@ mod tests { .iter() .enumerate() { - fs::write(f, &[id as u8]).unwrap(); + fs::write(f, [id as u8]).unwrap(); } let mut c = cfg.clone(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 82973946d96..e4f4dc83049 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -473,7 +473,7 @@ where let cur_port = cur_addr.port(); let lock_dir = get_lock_dir(); - let search_base = env::temp_dir().join(&lock_dir); + let search_base = env::temp_dir().join(lock_dir); file_system::create_dir_all(&search_base) .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); @@ -542,7 +542,7 @@ where disk::set_disk_reserved_space(reserve_space); let path = Path::new(&self.config.storage.data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(&path) { + if let Err(e) = file_system::remove_file(path) { warn!("failed to remove space holder on starting: {}", e); } @@ -1480,7 +1480,7 @@ where .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); let placeholder_size: u64 = - file_system::get_file_size(&placeholer_file_path).unwrap_or(0); + file_system::get_file_size(placeholer_file_path).unwrap_or(0); let used_size = snap_size + kv_size + raft_size + 
placeholder_size; let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index 88c2ddac9f4..a92845b843d 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -13,7 +13,7 @@ mod imp { #[allow(dead_code)] pub fn wait_for_signal(engines: Option>) { - let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); + let mut signals = Signals::new([SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); for signal in &mut signals { match signal { SIGTERM | SIGINT | SIGHUP => { diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index c4a0498a9a6..f766729a066 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -247,9 +247,9 @@ impl ImportDir { /// Make an import path base on the basic path and the file name. 
pub fn get_import_path(&self, file_name: &str) -> Result { - let save_path = self.root_dir.join(&file_name); - let temp_path = self.temp_dir.join(&file_name); - let clone_path = self.clone_dir.join(&file_name); + let save_path = self.root_dir.join(file_name); + let temp_path = self.temp_dir.join(file_name); + let clone_path = self.clone_dir.join(file_name); Ok(ImportPath { save: save_path, temp: temp_path, @@ -276,7 +276,7 @@ impl ImportDir { pub fn delete_file(&self, path: &Path, key_manager: Option<&DataKeyManager>) -> Result<()> { if path.exists() { - file_system::remove_file(&path)?; + file_system::remove_file(path)?; if let Some(manager) = key_manager { manager.delete_file(path.to_str().unwrap())?; } @@ -515,7 +515,7 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = path_to_sst_meta(&path).unwrap(); + let new_meta = path_to_sst_meta(path).unwrap(); assert_eq!(meta, new_meta); } } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 947d7e98e0c..abd616c5bc9 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -877,7 +877,7 @@ mod tests { for (i, &range) in cases.iter().enumerate() { let path = temp_dir.path().join(format!("{}.sst", i)); - let (meta, data) = gen_sst_file(&path, range); + let (meta, data) = gen_sst_file(path, range); let mut f = dir.create(&meta, key_manager.clone()).unwrap(); f.append(&data).unwrap(); diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index a45a3f52462..e990924c638 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -256,7 +256,7 @@ impl TestSuite { let mut batch = Vec::with_capacity(1024); let mut keys = Vec::with_capacity(1024); // Write 50 times to include more different ts. 
- let batch_size = cmp::min(cmp::max(key_count / 50, 1), 1024); + let batch_size = (key_count / 50).clamp(1, 1024); for _ in 0..versions { let mut j = 0; while j < key_count { diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 00c12073511..06ff550aa64 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -273,7 +273,7 @@ pub struct DefaultFilterFactory(PhantomData); impl FilterFactory for DefaultFilterFactory { fn generate(&self, _: u64) -> Vec> { - vec![Box::new(F::default())] + vec![Box::::default()] } } diff --git a/components/test_util/src/runner.rs b/components/test_util/src/runner.rs index d05f7e98879..ee2b6548c23 100644 --- a/components/test_util/src/runner.rs +++ b/components/test_util/src/runner.rs @@ -61,11 +61,11 @@ pub fn run_test_with_hook(cases: &[&TestDescAndFn], hook: impl TestHook + Send + let f = match case.testfn { TestFn::StaticTestFn(f) => TestFn::DynTestFn(Box::new(move || { let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); - f(); + f() })), TestFn::StaticBenchFn(f) => TestFn::DynBenchFn(Box::new(move |b| { let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); - f(b); + f(b) })), ref f => panic!("unexpected testfn {:?}", f), }; diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index f4046c35440..c18710b3645 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -514,10 +514,10 @@ where self.extremum = value.copied() } } else { - let v1 = self.extremum.map(|x| x as i64); - let v2 = value.map(|x| *x as i64); + let v1: Option = self.extremum; + let v2: Option = value.copied(); if v1.cmp(&v2) == E::ORD { - self.extremum = value.copied() + self.extremum = v2; } } } diff --git a/components/tidb_query_codegen/src/rpn_function.rs 
b/components/tidb_query_codegen/src/rpn_function.rs index 864fce9afd8..dfdede3a3b3 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -385,7 +385,7 @@ impl parse::Parse for RpnFnAttr { )); } - if !is_varg && !is_raw_varg && (min_args != None || max_args != None) { + if !is_varg && !is_raw_varg && (min_args.is_some() || max_args.is_some()) { return Err(Error::new_spanned( config_items, "`min_args` or `max_args` is only available when `varg` or `raw_varg` presents", diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 26ae799c4ff..418841547ca 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -186,7 +186,7 @@ pub fn integer_signed_lower_bound(tp: FieldTypeTp) -> i64 { /// `truncate_binary` truncates a buffer to the specified length. #[inline] pub fn truncate_binary(s: &mut Vec, flen: isize) { - if flen != crate::UNSPECIFIED_LENGTH as isize && s.len() > flen as usize { + if flen != crate::UNSPECIFIED_LENGTH && s.len() > flen as usize { s.truncate(flen as usize); } } @@ -431,7 +431,7 @@ impl ToInt for Decimal { fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_i64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_int(ctx, tp) } @@ -440,7 +440,7 @@ impl ToInt for Decimal { fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_u64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_uint(ctx, tp) } @@ -639,7 +639,7 @@ pub fn produce_dec_with_specified_tp( 
// select (cast 111 as decimal(1)) causes a warning in MySQL. ctx.handle_overflow_err(Error::overflow( "Decimal", - &format!("({}, {})", flen, decimal), + format!("({}, {})", flen, decimal), ))?; dec = max_or_min_dec(dec.is_negative(), flen as u8, decimal as u8) } else if frac != decimal { @@ -648,7 +648,7 @@ pub fn produce_dec_with_specified_tp( .round(decimal as i8, RoundMode::HalfEven) .into_result_with_overflow_err( ctx, - Error::overflow("Decimal", &format!("({}, {})", flen, decimal)), + Error::overflow("Decimal", format!("({}, {})", flen, decimal)), )?; if !rounded.is_zero() && frac > decimal && rounded != old { if ctx.cfg.flag.contains(Flag::IN_INSERT_STMT) @@ -811,7 +811,7 @@ impl ConvertTo for &[u8] { .map_err(|err| -> Error { box_err!("Parse '{}' to float err: {:?}", vs, err) })?; // The `parse` will return Ok(inf) if the float string literal out of range if val.is_infinite() { - ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", &vs))?; + ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", vs))?; if val.is_sign_negative() { return Ok(f64::MIN); } else { @@ -1036,7 +1036,7 @@ fn exp_float_str_to_int_str<'a>( // And the intCnt may contain the len of `+/-`, // so here we use 21 here as the early detection. 
ctx.warnings - .append_warning(Error::overflow("BIGINT", &valid_float)); + .append_warning(Error::overflow("BIGINT", valid_float)); return Cow::Borrowed(valid_float); } if int_cnt <= 0 { diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 135a3cd2ce7..143ec6c7760 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -373,11 +373,11 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { } let mut carry = 0; let mut res = res.map(|_| Decimal::new(int_cnt, frac_cnt, negative)); - let mut l_idx = l_start + l_int_word_cnt as usize + l_frac_word_cnt as usize; - let mut r_idx = r_start + r_int_word_cnt as usize + r_frac_word_cnt as usize; + let mut l_idx = l_start + l_int_word_cnt + l_frac_word_cnt as usize; + let mut r_idx = r_start + r_int_word_cnt + r_frac_word_cnt as usize; // adjust `l_idx` and `r_idx` to the same position of digits after the point. if l_frac_word_cnt > r_frac_word_cnt { - let l_stop = l_start + l_int_word_cnt as usize + r_frac_word_cnt as usize; + let l_stop = l_start + l_int_word_cnt + r_frac_word_cnt as usize; if l_frac_word_cnt < frac_word_to { // It happens only when suffix 0 exist(3.10000000000-2.00). idx_to -= (frac_word_to - l_frac_word_cnt) as usize; @@ -388,7 +388,7 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { res.word_buf[idx_to] = lhs.word_buf[l_idx]; } } else { - let r_stop = r_start + r_int_word_cnt as usize + l_frac_word_cnt as usize; + let r_stop = r_start + r_int_word_cnt + l_frac_word_cnt as usize; if frac_word_to > r_frac_word_cnt { // It happens only when suffix 0 exist(3.00-2.00000000000). 
idx_to -= (frac_word_to - r_frac_word_cnt) as usize; @@ -802,7 +802,7 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { word_cnt!(lhs.int_cnt + rhs.int_cnt) as usize, l_frac_word_cnt + r_frac_word_cnt, ); - let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to as i32); + let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to); let res = fix_word_cnt_err(int_word_to as u8, frac_word_to as u8, WORD_BUF_LEN); let (int_word_to, frac_word_to) = (res.0 as usize, res.1 as usize); let negative = lhs.negative != rhs.negative; @@ -1623,7 +1623,7 @@ impl Decimal { let mut inner_idx = 0; let mut word_idx = int_word_cnt as usize; let mut word = 0; - for c in bs[int_idx - int_cnt as usize..int_idx].iter().rev() { + for c in bs[int_idx - int_cnt..int_idx].iter().rev() { word += u32::from(c - b'0') * TEN_POW[inner_idx]; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { @@ -1642,7 +1642,7 @@ impl Decimal { word_idx = int_word_cnt as usize; word = 0; inner_idx = 0; - for &c in bs.iter().skip(int_idx + 1).take(frac_cnt as usize) { + for &c in bs.iter().skip(int_idx + 1).take(frac_cnt) { word = u32::from(c - b'0') + word * 10; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { @@ -2389,7 +2389,7 @@ impl Hash for Decimal { while idx < stop && self.word_buf[idx] == 0 { idx += 1; } - let start = idx as usize; + let start = idx; let int_word_cnt = stop - idx; int_word_cnt.hash(state); diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 520c985f4b5..7279f788146 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -1070,7 +1070,7 @@ mod tests { #[test] fn test_checked_add_and_sub_duration() { /// `MAX_TIME_IN_SECS` is the maximum for mysql time type. 
- const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR as i64 + const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR + MAX_MINUTE_PART as i64 * SECS_PER_MINUTE + MAX_SECOND_PART as i64; @@ -1110,7 +1110,7 @@ mod tests { // UNSPECIFIED_FSP ( 8385959, - UNSPECIFIED_FSP as i8, + UNSPECIFIED_FSP, Ok(Duration::parse(&mut EvalContext::default(), "838:59:59", 0).unwrap()), false, ), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index 734ec1d4115..c965247b8da 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -82,7 +82,7 @@ impl<'a> JsonRef<'a> { pub fn val_entry_get(&self, val_entry_off: usize) -> Result> { let val_type: JsonType = self.value()[val_entry_off].try_into()?; let val_offset = - NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN as usize..]) as usize; + NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN..]) as usize; Ok(match val_type { JsonType::Literal => { let offset = val_entry_off + TYPE_LEN; diff --git a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs index 7cc233e92d1..9289625ad84 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs @@ -95,7 +95,7 @@ impl DateTimeExtension for Time { } if week_year && days >= 52 * 7 { - weekday = (weekday + calc_days_in_year(year as i32)) % 7; + weekday = (weekday + calc_days_in_year(year)) % 7; if (!first_weekday && weekday < 4) || (first_weekday && weekday == 0) { year += 1; return (year, 1); diff --git a/components/tidb_query_datatype/src/codec/overflow.rs b/components/tidb_query_datatype/src/codec/overflow.rs index b1329e989c7..4a81b23a995 100644 --- 
a/components/tidb_query_datatype/src/codec/overflow.rs +++ b/components/tidb_query_datatype/src/codec/overflow.rs @@ -13,7 +13,7 @@ pub fn div_i64(a: i64, b: i64) -> Result { match a.overflowing_div(b) { (_res, true) => Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )), (res, false) => Ok(res), } @@ -31,7 +31,7 @@ pub fn div_u64_with_i64(a: u64, b: i64) -> Result { if a != 0 && (b.overflowing_neg().0 as u64) <= a { Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )) } else { Ok(0) @@ -53,7 +53,7 @@ pub fn div_i64_with_u64(a: i64, b: u64) -> Result { if a.overflowing_neg().0 as u64 >= b { Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )) } else { Ok(0) diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 0c995487b3d..00f6c22347b 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -647,7 +647,7 @@ mod tests { let mut ctx = EvalContext::default(); let col_ids: Vec<_> = row.iter().map(|(&id, _)| id).collect(); - let col_values: Vec<_> = row.iter().map(|(_, v)| v.clone()).collect(); + let col_values: Vec<_> = row.values().cloned().collect(); let mut col_encoded: HashMap<_, _> = row .iter() .map(|(k, v)| { diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index 758f7b13736..c17cb7af922 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -143,7 +143,7 @@ impl EvalConfig { self.tz = tz; Ok(self) } - None => Err(Error::invalid_timezone(&format!("offset {}s", offset_sec))), + None => Err(Error::invalid_timezone(format!("offset {}s", offset_sec))), } } @@ -300,7 +300,7 @@ impl EvalContext { } let orig_str = String::from_utf8_lossy(bytes); self.warnings - 
.append_warning(Error::truncated_wrong_val("INTEGER", &orig_str)); + .append_warning(Error::truncated_wrong_val("INTEGER", orig_str)); if negative { Ok(i64::MIN) } else { diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index 75790428187..b6717a40fb5 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -207,7 +207,7 @@ impl AggregationExecutorImpl for SimpleAggregationImpl #[inline] fn groups_len(&self) -> usize { - if self.has_input_rows { 1 } else { 0 } + self.has_input_rows as usize } #[inline] diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 06dc1ce956b..6ef8c6b2224 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -98,7 +98,7 @@ impl BatchTopNExecutor { Self { heap: BinaryHeap::new(), - eval_columns_buffer_unsafe: Box::new(Vec::new()), + eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), order_is_desc: order_is_desc.into_boxed_slice(), @@ -127,7 +127,7 @@ impl BatchTopNExecutor { Self { heap: BinaryHeap::new(), - eval_columns_buffer_unsafe: Box::new(Vec::new()), + eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), order_is_desc: order_is_desc.into_boxed_slice(), diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 01776c1ad7a..2f48fec4693 100644 --- a/components/tidb_query_expr/src/impl_arithmetic.rs +++ b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -44,7 +44,7 @@ impl ArithmeticOp for IntIntPlus { fn calc(lhs: &Int, rhs: &Int) -> Result> { 
lhs.checked_add(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} + {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} + {})", lhs, rhs)).into()) .map(Some) } } @@ -61,10 +61,8 @@ impl ArithmeticOp for IntUintPlus { } else { (*rhs as u64).checked_sub(lhs.overflowing_neg().0 as u64) }; - res.ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} + {})", lhs, rhs)).into() - }) - .map(|v| Some(v as i64)) + res.ok_or_else(|| Error::overflow("BIGINT UNSIGNED", format!("({} + {})", lhs, rhs)).into()) + .map(|v| Some(v as i64)) } } @@ -89,7 +87,7 @@ impl ArithmeticOp for UintUintPlus { (*lhs as u64) .checked_add(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} + {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} + {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -104,7 +102,7 @@ impl ArithmeticOp for RealPlus { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs + *rhs; if !res.is_finite() { - return Err(Error::overflow("DOUBLE", &format!("({} + {})", lhs, rhs)).into()); + return Err(Error::overflow("DOUBLE", format!("({} + {})", lhs, rhs)).into()); } Ok(Some(res)) } @@ -130,7 +128,7 @@ impl ArithmeticOp for IntIntMinus { fn calc(lhs: &Int, rhs: &Int) -> Result> { lhs.checked_sub(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(Some) } } @@ -145,10 +143,10 @@ impl ArithmeticOp for IntUintMinus { if *lhs >= 0 { (*lhs as u64) .checked_sub(*rhs as u64) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(|v| Some(v as i64)) } else { - Err(Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + Err(Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) } } } @@ -165,7 +163,7 @@ impl 
ArithmeticOp for UintIntMinus { } else { (*lhs as u64).checked_add(rhs.overflowing_neg().0 as u64) }; - res.ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + res.ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(|v| Some(v as i64)) } } @@ -180,7 +178,7 @@ impl ArithmeticOp for UintUintMinus { (*lhs as u64) .checked_sub(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} - {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} - {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -195,7 +193,7 @@ impl ArithmeticOp for RealMinus { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs - *rhs; if !res.is_finite() { - return Err(Error::overflow("DOUBLE", &format!("({} - {})", lhs, rhs)).into()); + return Err(Error::overflow("DOUBLE", format!("({} - {})", lhs, rhs)).into()); } Ok(Some(res)) } @@ -332,7 +330,7 @@ impl ArithmeticOp for RealMultiply { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs * *rhs; if res.is_infinite() { - Err(Error::overflow("REAL", &format!("({} * {})", lhs, rhs)).into()) + Err(Error::overflow("REAL", format!("({} * {})", lhs, rhs)).into()) } else { Ok(Some(res)) } @@ -346,7 +344,7 @@ impl ArithmeticOp for IntIntMultiply { type T = Int; fn calc(lhs: &Int, rhs: &Int) -> Result> { lhs.checked_mul(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} * {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} * {})", lhs, rhs)).into()) .map(Some) } } @@ -362,7 +360,7 @@ impl ArithmeticOp for IntUintMultiply { } else { None } - .ok_or_else(|| Error::overflow("BIGINT UNSIGNED", &format!("({} * {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT UNSIGNED", format!("({} * {})", lhs, rhs)).into()) .map(Some) } } @@ -386,7 +384,7 @@ impl ArithmeticOp for UintUintMultiply { (*lhs as u64) .checked_mul(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", 
&format!("({} * {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} * {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -500,7 +498,7 @@ impl ArithmeticOpWithCtx for RealDivide { } else { let result = *lhs / *rhs; if result.is_infinite() { - ctx.handle_overflow_err(Error::overflow("DOUBLE", &format!("{} / {}", lhs, rhs))) + ctx.handle_overflow_err(Error::overflow("DOUBLE", format!("{} / {}", lhs, rhs))) .map(|_| None)? } else { Some(result) diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 81a08b95e94..76e90f79c5b 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -242,7 +242,7 @@ pub fn get_cast_fn_rpn_node( func_meta, args_len: 1, field_type: to_field_type, - metadata: Box::new(tipb::InUnionMetadata::default()), + metadata: Box::::default(), }) } @@ -373,7 +373,7 @@ fn cast_string_as_int( ctx.warnings .append_warning(Error::cast_neg_int_as_unsigned()); } - Ok(Some(x as i64)) + Ok(Some(x)) } Err(err) => match *err.kind() { IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => { @@ -3118,7 +3118,7 @@ mod tests { (Json::from_bool(false).unwrap(), 0, false, false), (Json::none().unwrap(), 0, false, false), ( - Json::from_f64(((1u64 << 63) + (1u64 << 62)) as u64 as f64).unwrap(), + Json::from_f64(((1u64 << 63) + (1u64 << 62)) as f64).unwrap(), i64::MAX, true, false, @@ -4341,7 +4341,7 @@ mod tests { test_as_string_helper( ref_cs, |ctx, extra, val| { - let val = val.map(|x| *x as i64); + let val = val.copied(); cast_year_as_string(ctx, extra, &val.unwrap()) }, "cast_year_as_string", @@ -5026,10 +5026,8 @@ mod tests { let expect = match res_type { ResType::Zero => Decimal::zero(), ResType::Same => base_res, - ResType::TruncateToMax => max_decimal(res_flen as u8, res_decimal as u8), - ResType::TruncateToMin => { - max_or_min_dec(true, res_flen as u8, res_decimal as u8) - } + ResType::TruncateToMax => max_decimal(res_flen, 
res_decimal), + ResType::TruncateToMin => max_or_min_dec(true, res_flen, res_decimal), ResType::Round => { let r = base_res .round(res_decimal as i8, RoundMode::HalfEven) @@ -6697,7 +6695,7 @@ mod tests { Json::from_f64(i64::MAX as u64 as f64).unwrap(), Json::from_f64(i64::MIN as u64 as f64).unwrap(), Json::from_f64(i64::MIN as f64).unwrap(), - Json::from_f64(((1u64 << 63) + (1u64 << 62)) as u64 as f64).unwrap(), + Json::from_f64(((1u64 << 63) + (1u64 << 62)) as f64).unwrap(), Json::from_f64(-((1u64 << 63) as f64 + (1u64 << 62) as f64)).unwrap(), Json::from_f64(f64::from(f32::MIN)).unwrap(), Json::from_f64(f64::from(f32::MAX)).unwrap(), diff --git a/components/tidb_query_expr/src/impl_compare.rs b/components/tidb_query_expr/src/impl_compare.rs index a8dbf96d1cb..3eae996f249 100644 --- a/components/tidb_query_expr/src/impl_compare.rs +++ b/components/tidb_query_expr/src/impl_compare.rs @@ -361,7 +361,7 @@ pub fn greatest_cmp_string_as_time( Ok(t) => greatest = max(greatest, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -398,7 +398,7 @@ pub fn least_cmp_string_as_time( Ok(t) => least = min(least, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -434,7 +434,7 @@ pub fn greatest_cmp_string_as_date( Ok(t) => greatest = max(greatest, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -471,7 +471,7 @@ pub fn least_cmp_string_as_date( Ok(t) => least = min(least, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } diff --git 
a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index abd190d077a..beeeef288b4 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -226,7 +226,7 @@ impl Floor for FloorIntToInt { #[inline] fn abs_int(arg: &Int) -> Result> { match arg.checked_abs() { - None => Err(Error::overflow("BIGINT", &format!("abs({})", *arg)).into()), + None => Err(Error::overflow("BIGINT", format!("abs({})", *arg)).into()), Some(arg_abs) => Ok(Some(arg_abs)), } } @@ -288,7 +288,7 @@ fn radians(arg: &Real) -> Result> { pub fn exp(arg: &Real) -> Result> { let ret = arg.exp(); if ret.is_infinite() { - Err(Error::overflow("DOUBLE", &format!("exp({})", arg)).into()) + Err(Error::overflow("DOUBLE", format!("exp({})", arg)).into()) } else { Ok(Real::new(ret).ok()) } diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 9081f623b8e..5289f427e93 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -64,7 +64,7 @@ pub fn unary_minus_uint(arg: Option<&Int>) -> Result> { Some(val) => { let uval = *val as u64; match uval.cmp(&(i64::MAX as u64 + 1)) { - Greater => Err(Error::overflow("BIGINT", &format!("-{}", uval)).into()), + Greater => Err(Error::overflow("BIGINT", format!("-{}", uval)).into()), Equal => Ok(Some(i64::MIN)), Less => Ok(Some(-*val)), } @@ -79,7 +79,7 @@ pub fn unary_minus_int(arg: Option<&Int>) -> Result> { match arg { Some(val) => { if *val == i64::MIN { - Err(Error::overflow("BIGINT", &format!("-{}", *val)).into()) + Err(Error::overflow("BIGINT", format!("-{}", *val)).into()) } else { Ok(Some(-*val)) } diff --git a/components/tidb_query_expr/src/impl_time.rs b/components/tidb_query_expr/src/impl_time.rs index 0f55e21bab5..aca40b658d6 100644 --- a/components/tidb_query_expr/src/impl_time.rs +++ b/components/tidb_query_expr/src/impl_time.rs @@ -256,7 +256,7 @@ pub fn 
add_string_and_duration( return match arg0.checked_add(*arg1) { Some(result) => Ok(writer.write(Some(duration_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DURATION", &format!("{} + {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DURATION", format!("{} + {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -264,7 +264,7 @@ pub fn add_string_and_duration( return match arg0.checked_add(ctx, *arg1) { Some(result) => Ok(writer.write(Some(datetime_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DATETIME", &format!("{} + {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DATETIME", format!("{} + {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -286,7 +286,7 @@ pub fn sub_string_and_duration( return match arg0.checked_sub(*arg1) { Some(result) => Ok(writer.write(Some(duration_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DURATION", &format!("{} - {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DURATION", format!("{} - {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -294,7 +294,7 @@ pub fn sub_string_and_duration( return match arg0.checked_sub(ctx, *arg1) { Some(result) => Ok(writer.write(Some(datetime_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DATETIME", &format!("{} - {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DATETIME", format!("{} - {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 9d4eb4a8370..32f15786f79 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -6,7 +6,6 @@ //! [`RocksEngine`](RocksEngine) are used for testing only. 
#![feature(min_specialization)] -#![feature(generic_associated_types)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/tikv_util/src/buffer_vec.rs b/components/tikv_util/src/buffer_vec.rs index d2247c011ec..78196577366 100644 --- a/components/tikv_util/src/buffer_vec.rs +++ b/components/tikv_util/src/buffer_vec.rs @@ -429,7 +429,7 @@ mod tests { assert_eq!(format!("{:?}", v), "[]"); assert!(v.is_empty()); - v.push(&[0xAA, 0x0, 0xB]); + v.push([0xAA, 0x0, 0xB]); assert_eq!(v.len(), 1); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -475,7 +475,7 @@ mod tests { assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xCA, 0xB]); + v.push([0xCA, 0xB]); assert_eq!(v.len(), 1); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -488,8 +488,8 @@ mod tests { assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xCA, 0xB]); - v.push(&[]); + v.push([0xCA, 0xB]); + v.push([]); assert_eq!(v.len(), 2); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -503,8 +503,8 @@ mod tests { assert_eq!(v[0], [0xCA, 0xB]); assert_eq!(format!("{:?}", v), "[CA0B]"); - v.push(&[]); - v.push(&[]); + v.push([]); + v.push([]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -513,7 +513,7 @@ mod tests { assert!(v[2].is_empty()); assert_eq!(format!("{:?}", v), "[CA0B, null, null]"); - v.push(&[0xC]); + v.push([0xC]); assert_eq!(v.len(), 4); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -540,7 +540,7 @@ mod tests { assert!(v[1].is_empty()); assert_eq!(format!("{:?}", v), "[null, null]"); - v.push(&[0xAC, 0xBB, 0x00]); + v.push([0xAC, 0xBB, 0x00]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -561,7 +561,7 @@ mod tests { assert_eq!(v[1], [0xAC, 0xBB, 0x00]); assert_eq!(format!("{:?}", v), "[null, ACBB00]"); - v.push(&[]); + v.push([]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -590,12 +590,12 @@ mod tests 
{ assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xA]); - v.push(&[0xB]); - v.push(&[0xC]); - v.push(&[0xD, 0xE]); - v.push(&[]); - v.push(&[]); + v.push([0xA]); + v.push([0xB]); + v.push([0xC]); + v.push([0xD, 0xE]); + v.push([]); + v.push([]); assert_eq!(v.len(), 6); assert_eq!(v.total_len(), 5); assert!(!v.is_empty()); @@ -630,14 +630,14 @@ mod tests { #[test] fn test_copy_from() { let mut v1 = BufferVec::new(); - v1.push(&[]); - v1.push(&[0xAA, 0xBB, 0x0C]); - v1.push(&[]); - v1.push(&[0x00]); + v1.push([]); + v1.push([0xAA, 0xBB, 0x0C]); + v1.push([]); + v1.push([0x00]); let mut v2 = BufferVec::new(); - v2.push(&[]); - v2.push(&[]); + v2.push([]); + v2.push([]); let mut v3 = v1.clone(); v3.copy_from(&v2); @@ -650,8 +650,8 @@ mod tests { assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[null, AABB0C, null]"); - v3.push(&[]); - v3.push(&[0x00]); + v3.push([]); + v3.push([0x00]); assert_eq!(v3.len(), 5); assert_eq!(v3.total_len(), 4); assert_eq!(format!("{:?}", v3), "[null, AABB0C, null, null, 00]"); @@ -681,12 +681,12 @@ mod tests { assert_eq!(format!("{:?}", v3), "[]"); let mut v1 = BufferVec::new(); - v1.push(&[]); - v1.push(&[0xAA, 0xBB, 0x0C]); + v1.push([]); + v1.push([0xAA, 0xBB, 0x0C]); let mut v2 = BufferVec::new(); - v2.push(&[0x0C, 0x00]); - v2.push(&[]); + v2.push([0x0C, 0x00]); + v2.push([]); let mut v3 = v2.clone(); v3.copy_n_from(&v1, 0); @@ -694,7 +694,7 @@ mod tests { assert_eq!(v3.total_len(), 2); assert_eq!(format!("{:?}", v3), "[0C00, null]"); - v3.push(&[0xAA]); + v3.push([0xAA]); assert_eq!(v3.len(), 3); assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[0C00, null, AA]"); @@ -705,16 +705,18 @@ mod tests { assert_eq!(v3.total_len(), 2); assert_eq!(format!("{:?}", v3), "[0C00, null, null]"); - v3.push(&[0xAA]); + v3.push([0xAA]); assert_eq!(v3.len(), 4); assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[0C00, null, null, AA]"); - v3.extend(&[0xAA, 0xAB, 0xCC]); + 
v3.extend([0xAA, 0xAB, 0xCC]); assert_eq!(v3.len(), 5); assert_eq!(v3.total_len(), 6); assert_eq!(format!("{:?}", v3), "[0C00, null, null, AA, AAABCC]"); + // False positive: https://github.com/rust-lang/rust-clippy/issues/9111 + #[allow(clippy::needless_borrow)] v3.extend(&[]); assert_eq!(v3.len(), 6); assert_eq!(v3.total_len(), 6); @@ -761,7 +763,7 @@ mod tests { v.retain_by_array(&[]); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[]); + v.push([]); assert_eq!(format!("{:?}", v), "[null]"); v.retain_by_array(&[true]); @@ -770,8 +772,8 @@ mod tests { v.retain_by_array(&[false]); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xAA, 0x00]); - v.push(&[]); + v.push([0xAA, 0x00]); + v.push([]); assert_eq!(format!("{:?}", v), "[AA00, null]"); let mut v2 = v.clone(); @@ -790,8 +792,8 @@ mod tests { v2.retain_by_array(&[false, false]); assert_eq!(format!("{:?}", v2), "[]"); - v.push(&[]); - v.push(&[0xBB, 0x00, 0xA0]); + v.push([]); + v.push([0xBB, 0x00, 0xA0]); assert_eq!(format!("{:?}", v), "[AA00, null, null, BB00A0]"); let mut v2 = v.clone(); @@ -812,7 +814,7 @@ mod tests { v2.retain_by_array(&[false, false, true, true]); assert_eq!(format!("{:?}", v2), "[null, BB00A0]"); - v2.push(&[]); + v2.push([]); assert_eq!(format!("{:?}", v2), "[null, BB00A0, null]"); let mut v2 = v.clone(); @@ -841,12 +843,12 @@ mod tests { #[test] fn test_iter() { let mut v = BufferVec::new(); - v.push(&[]); - v.push(&[0xAA, 0xBB, 0x0C]); - v.push(&[]); - v.push(&[]); - v.push(&[0x00]); - v.push(&[]); + v.push([]); + v.push([0xAA, 0xBB, 0x0C]); + v.push([]); + v.push([]); + v.push([0x00]); + v.push([]); let mut it = v.iter(); assert_eq!(it.count(), 6); diff --git a/components/tikv_util/src/codec/bytes.rs b/components/tikv_util/src/codec/bytes.rs index df23090c9c7..b382f64739c 100644 --- a/components/tikv_util/src/codec/bytes.rs +++ b/components/tikv_util/src/codec/bytes.rs @@ -513,7 +513,7 @@ mod tests { desc ); let mut longer_encoded = encoded.clone(); - longer_encoded.extend(&[0, 0, 
0, 0, 0, 0, 0, 0, 0xFF]); + longer_encoded.extend([0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); assert!( !is_encoded_from(&longer_encoded, &raw, desc), "Encoded: {:?}, Raw: {:?}, desc: {}", diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index e11a4799bc0..c55cebea0ff 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -384,8 +384,8 @@ impl FromStr for ReadableDuration { if dur.is_sign_negative() { return Err("duration should be positive.".to_owned()); } - let secs = dur as u64 / SECOND as u64; - let micros = (dur as u64 % SECOND as u64) as u32 * 1_000; + let secs = dur as u64 / SECOND; + let micros = (dur as u64 % SECOND) as u32 * 1_000; Ok(ReadableDuration(Duration::new(secs, micros))) } } @@ -814,7 +814,7 @@ mod check_data_dir { } let ent = &*ent; let cur_dir = CStr::from_ptr(ent.mnt_dir).to_str().unwrap(); - if path.starts_with(&cur_dir) && cur_dir.len() >= fs.mnt_dir.len() { + if path.starts_with(cur_dir) && cur_dir.len() >= fs.mnt_dir.len() { fs.tp = CStr::from_ptr(ent.mnt_type).to_str().unwrap().to_owned(); fs.opts = CStr::from_ptr(ent.mnt_opts).to_str().unwrap().to_owned(); fs.fsname = CStr::from_ptr(ent.mnt_fsname).to_str().unwrap().to_owned(); @@ -844,7 +844,7 @@ mod check_data_dir { let block_dir = "/sys/block"; let mut device_dir = format!("{}/{}", block_dir, dev); if !Path::new(&device_dir).exists() { - let dir = fs::read_dir(&block_dir).map_err(|e| { + let dir = fs::read_dir(block_dir).map_err(|e| { ConfigError::FileSystem(format!( "{}: read block dir {:?} failed: {:?}", op, block_dir, e @@ -1554,7 +1554,7 @@ impl RaftDataStateMachine { fs::remove_dir_all(&trash).unwrap(); } else { info!("Removing file"; "path" => %path.display()); - fs::remove_file(&path).unwrap(); + fs::remove_file(path).unwrap(); Self::sync_dir(path.parent().unwrap()); } } @@ -1571,11 +1571,11 @@ impl RaftDataStateMachine { if !path.exists() || !path.is_dir() { return false; } - 
fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } fn sync_dir(dir: &Path) { - fs::File::open(&dir).and_then(|d| d.sync_all()).unwrap(); + fs::File::open(dir).and_then(|d| d.sync_all()).unwrap(); } } @@ -1789,8 +1789,8 @@ mod tests { ensure_dir_exist(&format!("{}", tmp_dir.to_path_buf().join("dir").display())).unwrap(); let nodes: &[&str] = if cfg!(target_os = "linux") { std::os::unix::fs::symlink( - &tmp_dir.to_path_buf().join("dir"), - &tmp_dir.to_path_buf().join("symlink"), + tmp_dir.to_path_buf().join("dir"), + tmp_dir.to_path_buf().join("symlink"), ) .unwrap(); &["non_existing", "dir", "symlink"] @@ -2116,10 +2116,10 @@ yyy = 100 let source_file = source.join("file"); let target_file = target.join("file"); if !target.exists() { - fs::create_dir_all(&target).unwrap(); + fs::create_dir_all(target).unwrap(); check(); } - fs::copy(&source_file, &target_file).unwrap(); + fs::copy(source_file, target_file).unwrap(); check(); state.after_dump_data_with_check(&check); } @@ -2130,14 +2130,14 @@ yyy = 100 if dst.exists() { fs::remove_dir_all(dst)?; } - fs::create_dir_all(&dst)?; + fs::create_dir_all(dst)?; for entry in fs::read_dir(src)? { let entry = entry?; let ty = entry.file_type()?; if ty.is_dir() { copy_dir(&entry.path(), &dst.join(entry.file_name()))?; } else { - fs::copy(entry.path(), &dst.join(entry.file_name()))?; + fs::copy(entry.path(), dst.join(entry.file_name()))?; } } Ok(()) @@ -2151,7 +2151,7 @@ yyy = 100 fs::create_dir_all(&target).unwrap(); // Write some data into source. 
let source_file = source.join("file"); - File::create(&source_file).unwrap(); + File::create(source_file).unwrap(); let backup = dir.path().join("backup"); diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 98c73e80c6a..9421c0e174b 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -91,7 +91,7 @@ pub fn panic_mark_file_path>(data_dir: P) -> PathBuf { pub fn create_panic_mark_file>(data_dir: P) { let file = panic_mark_file_path(data_dir); - File::create(&file).unwrap(); + File::create(file).unwrap(); } // Copied from file_system to avoid cyclic dependency diff --git a/components/tikv_util/src/logger/file_log.rs b/components/tikv_util/src/logger/file_log.rs index 5b575638c19..fa7b7c67fca 100644 --- a/components/tikv_util/src/logger/file_log.rs +++ b/components/tikv_util/src/logger/file_log.rs @@ -134,7 +134,7 @@ impl Write for RotatingFileLogger { self.file.flush()?; let new_path = (self.rename)(&self.path)?; - fs::rename(&self.path, &new_path)?; + fs::rename(&self.path, new_path)?; self.file = open_log_file(&self.path)?; // Updates all rotators' states. 
diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index df15a2dac76..2cd420e5d51 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -94,7 +94,7 @@ impl CGroupSys { } else { format!("{}/memory.limit_in_bytes", path.to_str().unwrap()) }; - return read_to_string(&path) + return read_to_string(path) .map(|x| parse_memory_max(x.trim())) .ok() .flatten(); @@ -112,7 +112,7 @@ impl CGroupSys { if let Some((root, mount_point)) = self.mount_points.get(component) { if let Some(path) = build_path(group, root, mount_point) { let path = format!("{}/cpuset.cpus", path.to_str().unwrap()); - if let Ok(s) = read_to_string(&path) { + if let Ok(s) = read_to_string(path) { return parse_cpu_cores(s.trim()); } } @@ -131,14 +131,14 @@ impl CGroupSys { if let Some(path) = build_path(group, root, mount_point) { if self.is_v2 { let path = format!("{}/cpu.max", path.to_str().unwrap()); - if let Ok(buffer) = read_to_string(&path) { + if let Ok(buffer) = read_to_string(path) { return parse_cpu_quota_v2(buffer.trim()); } } else { let path1 = format!("{}/cpu.cfs_quota_us", path.to_str().unwrap()); let path2 = format!("{}/cpu.cfs_period_us", path.to_str().unwrap()); if let (Ok(buffer1), Ok(buffer2)) = - (read_to_string(&path1), read_to_string(&path2)) + (read_to_string(path1), read_to_string(path2)) { return parse_cpu_quota_v1(buffer1.trim(), buffer2.trim()); } @@ -356,7 +356,7 @@ fn parse_cpu_quota_v1(line1: &str, line2: &str) -> Option { if max > 0.0 { if let Ok(period) = line2.parse::() { if period > 0.0 { - return Some(max as f64 / period as f64); + return Some(max / period); } } } else { @@ -385,11 +385,11 @@ mod tests { fn test_parse_mountinfos_without_cgroup() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = 
OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"").unwrap(); @@ -402,12 +402,12 @@ mod tests { fn test_cpuset_cpu_cpuacct() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"30 26 0:27 / /sys/fs/cgroup/cpuset,cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,cpu,cpuacct\n").unwrap(); @@ -432,12 +432,12 @@ mod tests { fn test_mountinfo_with_relative_path() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw\n").unwrap(); @@ -461,12 +461,12 @@ mod tests { fn test_conflicting_mountinfo() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw 1663 1661 0:27 /../../../../../.. 
/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw").unwrap(); @@ -491,12 +491,12 @@ mod tests { fn test_cgroup_without_mountinfo() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw\n").unwrap(); diff --git a/components/tikv_util/src/sys/inspector.rs b/components/tikv_util/src/sys/inspector.rs index 7b49b647706..d2ff80c6416 100644 --- a/components/tikv_util/src/sys/inspector.rs +++ b/components/tikv_util/src/sys/inspector.rs @@ -90,7 +90,7 @@ mod linux { fn disk_stat(dev: &Self::DiskID) -> Result, String> { let path = "/proc/diskstats"; - let lines = read_to_string(&path).map_err(|e| format!("open({}): {}", path, e))?; + let lines = read_to_string(path).map_err(|e| format!("open({}): {}", path, e))?; for line in lines.split('\n').map(|x| x.trim()) { let stat = procfs::DiskStat::from_line(line) .map_err(|e| format!("parse disk stat: {}", e))?; diff --git a/components/tikv_util/src/sys/thread.rs b/components/tikv_util/src/sys/thread.rs index 00a6e47b409..60c420661d0 100644 --- a/components/tikv_util/src/sys/thread.rs +++ b/components/tikv_util/src/sys/thread.rs @@ -121,7 +121,7 @@ mod imp { // Unsafe due to FFI. 
unsafe { let tid = libc::syscall(libc::SYS_gettid); - if libc::setpriority(libc::PRIO_PROCESS as u32, tid as u32, pri) != 0 { + if libc::setpriority(libc::PRIO_PROCESS, tid as u32, pri) != 0 { let e = Error::last_os_error(); return Err(e); } @@ -134,7 +134,7 @@ mod imp { unsafe { let tid = libc::syscall(libc::SYS_gettid); clear_errno(); - let ret = libc::getpriority(libc::PRIO_PROCESS as u32, tid as u32); + let ret = libc::getpriority(libc::PRIO_PROCESS, tid as u32); if ret == -1 { let e = Error::last_os_error(); if let Some(errno) = e.raw_os_error() { diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 5c9abf0d305..01133a71924 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -681,7 +681,7 @@ mod tests { let shorter_encoded = Key::from_encoded_slice(&encoded.0[..encoded_len - 9]); assert!(!shorter_encoded.is_encoded_from(&raw)); let mut longer_encoded = encoded.as_encoded().clone(); - longer_encoded.extend(&[0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); + longer_encoded.extend([0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); let longer_encoded = Key::from_encoded(longer_encoded); assert!(!longer_encoded.is_encoded_from(&raw)); diff --git a/fuzz/cli.rs b/fuzz/cli.rs index 96972d94565..201e659d8ba 100644 --- a/fuzz/cli.rs +++ b/fuzz/cli.rs @@ -31,7 +31,7 @@ lazy_static! 
{ static ref FUZZ_ROOT: PathBuf = WORKSPACE_ROOT.join("fuzz"); static ref FUZZ_TARGETS: Vec = { let source = FUZZ_ROOT.join("targets/mod.rs"); - let targets_rs = fs::read_to_string(&source).unwrap(); + let targets_rs = fs::read_to_string(source).unwrap(); let match_fuzz_fs = regex::Regex::new(r"pub fn fuzz_(\w+)\(").unwrap(); let target_names = match_fuzz_fs .captures_iter(&targets_rs) @@ -110,7 +110,7 @@ fn write_fuzz_target_source_file(fuzzer: Fuzzer, target: &str) -> Result<()> { template_file_path.display() ))?; - let target_file_path = fuzzer.directory().join(&format!("src/bin/{}.rs", target)); + let target_file_path = fuzzer.directory().join(format!("src/bin/{}.rs", target)); let mut file = fs::OpenOptions::new() .write(true) .create(true) @@ -159,7 +159,7 @@ fn get_seed_dir(target: &str) -> PathBuf { /// Create corpus dir for fuzz target fn create_corpus_dir(base: impl AsRef, target: &str) -> Result { let base = base.as_ref(); - let corpus_dir = base.join(&format!("corpus-{}", target)); + let corpus_dir = base.join(format!("corpus-{}", target)); fs::create_dir_all(&corpus_dir).context(format!( "unable to create corpus dir for {}{}", base.display(), @@ -192,13 +192,13 @@ fn run_afl(target: &str) -> Result<()> { let corpus_dir = create_corpus_dir(fuzzer.directory(), target)?; pre_check( - Command::new("cargo").args(&["afl", "--version"]), + Command::new("cargo").args(["afl", "--version"]), "cargo install afl", )?; // 1. cargo afl build (in fuzzer-afl directory) let fuzzer_build = Command::new("cargo") - .args(&["afl", "build", "--bin", target]) + .args(["afl", "build", "--bin", target]) .current_dir(fuzzer.directory()) .spawn() .context(format!("Failed to build {}", fuzzer))? 
@@ -218,7 +218,7 @@ fn run_afl(target: &str) -> Result<()> { // ``` let instrumented_bin = WORKSPACE_ROOT.join("target/debug").join(target); let fuzzer_bin = Command::new("cargo") - .args(&["afl", "fuzz"]) + .args(["afl", "fuzz"]) .arg("-i") .arg(&seed_dir) .arg("-o") @@ -244,7 +244,7 @@ fn run_afl(target: &str) -> Result<()> { /// Run one target fuzz test using Honggfuzz fn run_honggfuzz(target: &str) -> Result<()> { pre_check( - Command::new("cargo").args(&["hfuzz", "version"]), + Command::new("cargo").args(["hfuzz", "version"]), "cargo install honggfuzz --version 0.5.45", )?; @@ -262,7 +262,7 @@ fn run_honggfuzz(target: &str) -> Result<()> { ); let fuzzer_bin = Command::new("cargo") - .args(&["hfuzz", "run", target]) + .args(["hfuzz", "run", target]) .env("RUSTFLAGS", &rust_flags) .env("HFUZZ_RUN_ARGS", &hfuzz_args) .current_dir(fuzzer.directory()) @@ -321,7 +321,7 @@ fn run_libfuzzer(target: &str) -> Result<()> { asan_options.push_str(" detect_odr_violation=0"); let fuzzer_bin = Command::new("cargo") - .args(&["run", "--target", target_platform, "--bin", target, "--"]) + .args(["run", "--target", target_platform, "--bin", target, "--"]) .arg(&corpus_dir) .arg(&seed_dir) .env("RUSTFLAGS", &rust_flags) diff --git a/rust-toolchain b/rust-toolchain index 2181086f8d2..4e5f9a4d82b 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-07-31 +nightly-2022-11-15 diff --git a/scripts/clippy b/scripts/clippy index c5999ad670c..7685cddfeeb 100755 --- a/scripts/clippy +++ b/scripts/clippy @@ -19,11 +19,15 @@ fi # - Enables `significant_drop_in_scrutinee` after # https://github.com/rust-lang/rust-clippy/issues/8963 is fixed. # - `derive_partial_eq_without_eq` has compilation overhead. +# - Blocking issue for enabling `result_large_err` is the protobuf messages. +# - Blocking issue for clippy::large_enum_variant is the raftstore peer message. 
CLIPPY_LINTS=( -A clippy::module_inception \ + -A clippy::result_large_err \ + -A clippy::large_enum_variant \ -A clippy::should_implement_trait \ -A clippy::too_many_arguments \ - -A clippy::blacklisted_name \ + -A clippy::disallowed_names \ -A clippy::redundant_closure \ -A clippy::field_reassign_with_default \ -A clippy::wrong_self_convention \ diff --git a/src/config.rs b/src/config.rs index c33c8e8b63c..e9eca154d6e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -113,12 +113,7 @@ fn memory_limit_for_cf(is_raft_db: bool, cf: &str, total_mem: u64) -> ReadableSi (false, CF_WRITE) => (0.15, 0, usize::MAX), _ => unreachable!(), }; - let mut size = (total_mem as f64 * ratio) as usize; - if size < min { - size = min; - } else if size > max { - size = max; - } + let size = ((total_mem as f64 * ratio) as usize).clamp(min, max); ReadableSize::mb(size as u64 / MIB) } @@ -182,13 +177,13 @@ impl Default for TitanCfConfig { impl TitanCfConfig { fn build_opts(&self) -> RocksTitanDbOptions { let mut opts = RocksTitanDbOptions::new(); - opts.set_min_blob_size(self.min_blob_size.0 as u64); + opts.set_min_blob_size(self.min_blob_size.0); opts.set_blob_file_compression(self.blob_file_compression.into()); opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); - opts.set_min_gc_batch_size(self.min_gc_batch_size.0 as u64); - opts.set_max_gc_batch_size(self.max_gc_batch_size.0 as u64); + opts.set_min_gc_batch_size(self.min_gc_batch_size.0); + opts.set_max_gc_batch_size(self.max_gc_batch_size.0); opts.set_discardable_ratio(self.discardable_ratio); - opts.set_merge_small_file_threshold(self.merge_small_file_threshold.0 as u64); + opts.set_merge_small_file_threshold(self.merge_small_file_threshold.0); opts.set_blob_run_mode(self.blob_run_mode.into()); opts.set_level_merge(self.level_merge); opts.set_range_merge(self.range_merge); @@ -254,10 +249,7 @@ fn get_background_job_limits_impl( ); // Cap max_sub_compactions to allow at least two compactions. 
let max_compactions = max_background_jobs - max_background_flushes; - let max_sub_compactions: u32 = cmp::max( - 1, - cmp::min(defaults.max_sub_compactions, (max_compactions - 1) as u32), - ); + let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); // Maximum background GC threads for Titan let max_titan_background_gc = cmp::min(defaults.max_titan_background_gc, cpu_num); @@ -1123,7 +1115,7 @@ impl Default for DbConfig { rate_limiter_auto_tuned: true, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(512), - max_sub_compactions: bg_job_limits.max_sub_compactions as u32, + max_sub_compactions: bg_job_limits.max_sub_compactions, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, @@ -1179,8 +1171,8 @@ impl DbConfig { } } - opts.set_bytes_per_sync(self.bytes_per_sync.0 as u64); - opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0 as u64); + opts.set_bytes_per_sync(self.bytes_per_sync.0); + opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); opts.set_max_subcompactions(self.max_sub_compactions); opts.set_writable_file_max_buffer_size(self.writable_file_max_buffer_size.0 as i32); opts.set_use_direct_io_for_flush_and_compaction( @@ -1434,7 +1426,7 @@ impl Default for RaftDbConfig { info_log_keep_log_file_num: 10, info_log_dir: "".to_owned(), info_log_level: RocksLogLevel::Info, - max_sub_compactions: bg_job_limits.max_sub_compactions as u32, + max_sub_compactions: bg_job_limits.max_sub_compactions, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: true, @@ -1481,8 +1473,8 @@ impl RaftDbConfig { opts.enable_unordered_write(self.enable_unordered_write); opts.allow_concurrent_memtable_write(self.allow_concurrent_memtable_write); opts.add_event_listener(RocksEventListener::new("raft", None)); - opts.set_bytes_per_sync(self.bytes_per_sync.0 as u64); - 
opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0 as u64); + opts.set_bytes_per_sync(self.bytes_per_sync.0); + opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); // TODO maybe create a new env for raft engine if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); @@ -3633,7 +3625,7 @@ pub fn persist_config(config: &TikvConfig) -> Result<(), String> { } // Create parent directory if missing. - if let Err(e) = fs::create_dir_all(&store_path) { + if let Err(e) = fs::create_dir_all(store_path) { return Err(format!( "create parent directory '{}' failed: {}", store_path.to_str().unwrap(), diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 1b7d42a8575..5123534db88 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1332,7 +1332,7 @@ mod tests { let config = Config { end_point_request_max_handle_duration: ReadableDuration::millis( - (PAYLOAD_SMALL + PAYLOAD_LARGE) as u64 * 2, + (PAYLOAD_SMALL + PAYLOAD_LARGE) * 2, ), ..Default::default() }; @@ -1357,23 +1357,22 @@ mod tests { // Request 1: Unary, success response. let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration( - Ok(coppb::Response::default()), - PAYLOAD_SMALL as u64, + Ok( + UnaryFixture::new_with_duration(Ok(coppb::Response::default()), PAYLOAD_SMALL) + .into_boxed(), ) - .into_boxed()) }); let resp_future_1 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Unary, error response. 
let handler_builder = Box::new(|_, _: &_| { Ok( - UnaryFixture::new_with_duration(Err(box_err!("foo")), PAYLOAD_LARGE as u64) + UnaryFixture::new_with_duration(Err(box_err!("foo")), PAYLOAD_LARGE) .into_boxed(), ) }); @@ -1381,7 +1380,7 @@ mod tests { copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Response 1 let resp = &rx.recv().unwrap()[0]; @@ -1447,7 +1446,7 @@ mod tests { let handler_builder = Box::new(|_, _: &_| { Ok(UnaryFixture::new_with_duration_yieldable( Ok(coppb::Response::default()), - PAYLOAD_SMALL as u64, + PAYLOAD_SMALL, ) .into_boxed()) }); @@ -1456,21 +1455,20 @@ mod tests { let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Unary, error response. let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration_yieldable( - Err(box_err!("foo")), - PAYLOAD_LARGE as u64, + Ok( + UnaryFixture::new_with_duration_yieldable(Err(box_err!("foo")), PAYLOAD_LARGE) + .into_boxed(), ) - .into_boxed()) }); let resp_future_2 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Response 1 // @@ -1524,18 +1522,17 @@ mod tests { // Request 1: Unary, success response. 
let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration( - Ok(coppb::Response::default()), - PAYLOAD_LARGE as u64, + Ok( + UnaryFixture::new_with_duration(Ok(coppb::Response::default()), PAYLOAD_LARGE) + .into_boxed(), ) - .into_boxed()) }); let resp_future_1 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Stream. let handler_builder = Box::new(|_, _: &_| { @@ -1545,11 +1542,7 @@ mod tests { Err(box_err!("foo")), Ok(coppb::Response::default()), ], - vec![ - PAYLOAD_SMALL as u64, - PAYLOAD_LARGE as u64, - PAYLOAD_SMALL as u64, - ], + vec![PAYLOAD_SMALL, PAYLOAD_LARGE, PAYLOAD_SMALL], ) .into_boxed()) }); diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index ade8a007383..383f6161a1b 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -843,7 +843,7 @@ impl SampleBuilder { .map_or_else(|| 0_usize, |req| req.get_top_n_size() as usize), common_handle_col_ids: common_handle_ids, columns_info, - analyze_common_handle: common_handle_req != None, + analyze_common_handle: common_handle_req.is_some(), }) } @@ -1116,7 +1116,7 @@ impl AnalyzeSamplingResult { impl Default for AnalyzeSamplingResult { fn default() -> Self { - AnalyzeSamplingResult::new(Box::new(ReservoirRowSampleCollector::default())) + AnalyzeSamplingResult::new(Box::::default()) } } diff --git a/src/coprocessor/statistics/histogram.rs b/src/coprocessor/statistics/histogram.rs index 8797c38a721..b7a70600e39 100644 --- a/src/coprocessor/statistics/histogram.rs +++ b/src/coprocessor/statistics/histogram.rs @@ -29,7 +29,7 @@ impl Bucket { upper_bound, 
lower_bound, repeats, - ndv: if with_ndv { 1 } else { 0 }, + ndv: with_ndv as u64, } } diff --git a/src/coprocessor_v2/plugin_registry.rs b/src/coprocessor_v2/plugin_registry.rs index cbcba39995d..6262fe6bae9 100644 --- a/src/coprocessor_v2/plugin_registry.rs +++ b/src/coprocessor_v2/plugin_registry.rs @@ -130,7 +130,7 @@ impl PluginRegistry { // Simple helper functions for loading/unloading plugins. let maybe_load = |file: &PathBuf| { let mut hot_reload_registry = hot_reload_registry.write().unwrap(); - if is_library_file(&file) { + if is_library_file(file) { // Ignore errors. hot_reload_registry.load_plugin(file).ok(); } @@ -243,7 +243,7 @@ impl PluginRegistry { let dir_name = dir_name.into(); let mut loaded_plugins = Vec::new(); - for entry in std::fs::read_dir(&dir_name)? { + for entry in std::fs::read_dir(dir_name)? { if let Ok(file) = entry.map(|f| f.path()) { if is_library_file(&file) { // Ignore errors. @@ -489,7 +489,7 @@ mod tests { fn load_plugin() { let library_path = initialize_library(); - let loaded_plugin = unsafe { LoadedPlugin::new(&library_path).unwrap() }; + let loaded_plugin = unsafe { LoadedPlugin::new(library_path).unwrap() }; assert_eq!(loaded_plugin.name(), "example_coprocessor_plugin"); assert_eq!(loaded_plugin.version(), &Version::parse("0.1.0").unwrap()); @@ -548,7 +548,7 @@ mod tests { let registry = PluginRegistry::new(); - let plugin_name = registry.load_plugin(&library_path).unwrap(); + let plugin_name = registry.load_plugin(library_path).unwrap(); assert!(registry.get_plugin(&plugin_name).is_some()); @@ -576,7 +576,7 @@ mod tests { registry.start_hot_reloading(&coprocessor_dir).unwrap(); // trigger loading - std::fs::copy(&original_library_path, &library_path).unwrap(); + std::fs::copy(original_library_path, &library_path).unwrap(); // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so // as to make sure the watcher is triggered. 
std::thread::sleep(Duration::from_secs(4)); diff --git a/src/lib.rs b/src/lib.rs index a961abc7d38..f4fcd1cd97c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,8 +25,7 @@ #![feature(box_patterns)] #![feature(drain_filter)] #![feature(deadline_api)] -#![feature(generic_associated_types)] -#![feature(let_else)] +#![feature(let_chains)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/read_pool.rs b/src/read_pool.rs index 4d9f7fd9264..5212c4ae594 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -192,10 +192,7 @@ impl ReadPoolHandle { match self { ReadPoolHandle::FuturePools { read_pool_normal, .. - } => { - read_pool_normal.get_running_task_count() as usize - / read_pool_normal.get_pool_size() - } + } => read_pool_normal.get_running_task_count() / read_pool_normal.get_pool_size(), ReadPoolHandle::Yatp { running_tasks, pool_size, diff --git a/src/server/debug.rs b/src/server/debug.rs index 6ee676ad1c4..48435f72163 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -884,7 +884,7 @@ impl Debugger { res.push(("region.end_key".to_owned(), hex::encode(®ion.end_key))); res.push(( "region.middle_key_by_approximate_size".to_owned(), - hex::encode(&middle_key), + hex::encode(middle_key), )); Ok(res) diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index 323f414c05c..f370a08e280 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -156,7 +156,7 @@ impl TabletFactory for KvEngineFactoryV2 { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); // When the full directory path does not exsit, create will return error and in // this case, we just ignore it. 
- let _ = std::fs::File::create(&path); + let _ = std::fs::File::create(path); debug!("tombstone tablet"; "region_id" => region_id, "suffix" => suffix); { let mut reg = self.registry.lock().unwrap(); @@ -201,7 +201,7 @@ impl TabletFactory for KvEngineFactoryV2 { } let db_path = self.tablet_path(region_id, suffix); - std::fs::rename(path, &db_path)?; + std::fs::rename(path, db_path)?; self.open_tablet( region_id, Some(suffix), diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index b80c17e5ff9..01e37727f11 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -197,7 +197,7 @@ fn set_status_metrics(state: GcManagerState) { ] { AUTO_GC_STATUS_GAUGE_VEC .with_label_values(&[s.tag()]) - .set(if state == *s { 1 } else { 0 }); + .set((state == *s) as i64); } } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 8e345f0909b..81de11cbae9 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -492,7 +492,7 @@ where "versions" => gc_info.found_versions, ); } - if gc_info.deleted_versions as usize >= GC_LOG_DELETED_VERSION_THRESHOLD { + if gc_info.deleted_versions >= GC_LOG_DELETED_VERSION_THRESHOLD { debug!( "GC deleted plenty versions for a key"; "key" => %key, diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 33164833fba..8cce7bc5da6 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -384,8 +384,8 @@ impl WaitTable { fn to_wait_for_entries(&self) -> Vec { self.waiter_pool - .iter() - .map(|(_, waiter)| { + .values() + .map(|waiter| { let mut wait_for_entry = WaitForEntry::default(); wait_for_entry.set_txn(waiter.start_ts.into_inner()); wait_for_entry.set_wait_for_txn(waiter.wait_info.lock_digest.ts.into_inner()); diff --git a/src/server/node.rs b/src/server/node.rs index 65dd592b490..0b654921f59 100644 --- 
a/src/server/node.rs +++ b/src/server/node.rs @@ -357,7 +357,7 @@ where because found data key that is not written by TiDB: {:?}", ident.api_version, self.api_version, - log_wrappers::hex_encode_upper(&unexpected_data_key) + log_wrappers::hex_encode_upper(unexpected_data_key) )); } } diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 6f06bf17b30..8e77d65233e 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -559,7 +559,7 @@ Some invalid logs 2: Welcome to TiKV .unwrap(); let log_file2 = dir.path().join("tikv.2019-08-23T18-10-00.387.log"); - let mut file = File::create(&log_file2).unwrap(); + let mut file = File::create(log_file2).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -736,7 +736,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# // this file is ignored because its filename is not expected let log_file2 = dir.path().join("tikv.log.2"); - let mut file = File::create(&log_file2).unwrap(); + let mut file = File::create(log_file2).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -749,7 +749,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# .unwrap(); let log_file3 = dir.path().join("tikv.2019-08-23T18-11-02.123.log"); - let mut file = File::create(&log_file3).unwrap(); + let mut file = File::create(log_file3).unwrap(); write!( file, r#"[2019/08/23 18:11:53.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -766,7 +766,7 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# // this file is ignored because its filename is not expected let log_file4 = dir.path().join("tikv.T.log"); - let mut file = File::create(&log_file4).unwrap(); + let mut file = File::create(log_file4).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] diff --git a/src/server/service/diagnostics/sys.rs 
b/src/server/service/diagnostics/sys.rs index e62028e66e6..17ed9a78b3f 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -201,7 +201,7 @@ fn nic_load_info(prev_nic: HashMap, collector: &mut Vec, collector: &mut Vec) { let current = ioload::IoLoad::snapshot(); - let rate = |cur, prev| (cur - prev) as f64; + let rate = |cur, prev| (cur - prev); for (name, cur) in current.into_iter() { let prev = match prev_io.get(&name) { Some(p) => p, diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index 3419c7df0c8..b3d91d3bea6 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -234,7 +234,7 @@ pub fn read_file(path: &str) -> Result, String> { pub fn jeprof_heap_profile(path: &str) -> Result, String> { info!("using jeprof to process {}", path); let output = Command::new("./jeprof") - .args(&["--show_bytes", "./bin/tikv-server", path, "--svg"]) + .args(["--show_bytes", "./bin/tikv-server", path, "--svg"]) .output() .map_err(|e| format!("jeprof: {}", e))?; if !output.status.success() { @@ -250,7 +250,7 @@ pub fn list_heap_profiles() -> Result, String> { None => return Ok(vec![]), }; - let dir = std::fs::read_dir(&path).map_err(|e| format!("read dir fail: {}", e))?; + let dir = std::fs::read_dir(path).map_err(|e| format!("read dir fail: {}", e))?; let mut profiles = Vec::new(); for item in dir { let item = match item { diff --git a/src/storage/config.rs b/src/storage/config.rs index 7f2e6820201..685272dbeee 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -78,7 +78,7 @@ impl Default for Config { scheduler_worker_pool_size: if cpu_num >= 16.0 { 8 } else { - std::cmp::max(1, std::cmp::min(4, cpu_num as usize)) + cpu_num.clamp(1., 4.) 
as usize }, scheduler_pending_write_threshold: ReadableSize::mb(DEFAULT_SCHED_PENDING_WRITE_MB), reserve_space: ReadableSize::gb(DEFAULT_RESERVED_SPACE_GB), diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 9966e14812e..3f5e48e8017 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1658,7 +1658,7 @@ mod tests { assert!(latches.acquire(&mut lock, id)); } let unlocked = latches.release(&lock, id); - if id as u64 == max_id { + if id == max_id { assert!(unlocked.is_empty()); } else { assert_eq!(unlocked, vec![id + 1]); diff --git a/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs b/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs index 2fcc3915125..7796be6c53b 100644 --- a/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs +++ b/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs @@ -18,7 +18,7 @@ fn download_and_extract_file(url: &str) -> io::Result { .stderr(Stdio::null()) .spawn()?; let mut tar_child = Command::new("tar") - .args(&["xzf", "-", "--to-stdout"]) + .args(["xzf", "-", "--to-stdout"]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()) diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 93acfffc258..dde25bff636 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -163,7 +163,7 @@ fn assert_snapshot(snap_dir: &str, region_id: u64, exist: bool) { let region_id = format!("{}", region_id); let timer = Instant::now(); loop { - for p in fs::read_dir(&snap_dir).unwrap() { + for p in fs::read_dir(snap_dir).unwrap() { let name = p.unwrap().file_name().into_string().unwrap(); let mut parts = name.split('_'); parts.next(); @@ -354,12 +354,12 @@ fn test_shutdown_when_snap_gc() { pd_client.must_add_peer(r1, new_learner_peer(2, 2)); // Snapshot directory on store 2 shouldn't be empty. 
- let snap_dir = cluster.get_snap_dir(2); + let snap_dir = &cluster.get_snap_dir(2); for i in 0..=100 { if i == 100 { panic!("store 2 snap dir must not be empty"); } - let dir = fs::read_dir(&snap_dir).unwrap(); + let dir = fs::read_dir(snap_dir).unwrap(); if dir.count() > 0 { break; } @@ -377,7 +377,7 @@ fn test_shutdown_when_snap_gc() { cluster.stop_node(2); let snap_dir = cluster.get_snap_dir(2); - let dir = fs::read_dir(&snap_dir).unwrap(); + let dir = fs::read_dir(snap_dir).unwrap(); if dir.count() == 0 { panic!("store 2 snap dir must not be empty"); } @@ -591,7 +591,7 @@ fn test_snapshot_gc_after_failed() { let mut snap_file_path = PathBuf::from(&snap_dir); snap_file_path.push(&f); let snap_file_path = snap_file_path.as_path(); - let mut file = match File::create(&snap_file_path) { + let mut file = match File::create(snap_file_path) { Err(why) => panic!("couldn't create {:?}: {}", snap_file_path, why), Ok(file) => file, }; diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index ff07d8a712a..f432fd72246 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -499,7 +499,7 @@ fn test_invalid_external_storage() { // Set backup directory read-only. TiKV fails to backup. 
let tmp = Builder::new().tempdir().unwrap(); - let f = File::open(&tmp.path()).unwrap(); + let f = File::open(tmp.path()).unwrap(); let mut perms = f.metadata().unwrap().permissions(); perms.set_readonly(true); f.set_permissions(perms.clone()).unwrap(); diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 03bc7ba46c1..70e70b3cbe6 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -76,7 +76,7 @@ fn start_raftstore( .as_path() .display() .to_string(); - Arc::new(SstImporter::new(&cfg.import, &p, None, cfg.storage.api_version()).unwrap()) + Arc::new(SstImporter::new(&cfg.import, p, None, cfg.storage.api_version()).unwrap()) }; let snap_mgr = { let p = dir diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index 6faa68f3932..b56987fa1dc 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -149,7 +149,7 @@ blob-run-mode = "normal" cfg_controller.update(change).unwrap(); let res = { let mut buf = Vec::new(); - let mut f = File::open(&cfg_controller.get_current().cfg_path).unwrap(); + let mut f = File::open(cfg_controller.get_current().cfg_path).unwrap(); f.read_to_end(&mut buf).unwrap(); buf }; diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 952516daf35..96ceb1c5c8c 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -764,7 +764,7 @@ fn test_order_by_pk_with_select_from_index() { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( &mut EvalContext::default(), - &[name_datum, (cnt as i64).into(), (id as i64).into()], + &[name_datum, cnt.into(), id.into()], ) .unwrap(); let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); 
From d58343d03890c3970178bffc0e9fafdd3a0d7df0 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 15 Nov 2022 17:35:55 +0800 Subject: [PATCH 335/676] cdc: run CheckLeader in a dedicate thread (#13799) close tikv/tikv#13774 cdc: run CheckLeader in a dedicate thread Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- .../cdc/tests/failpoints/test_endpoint.rs | 8 ++++- components/cdc/tests/mod.rs | 36 +++++++++++++++++-- components/server/src/server.rs | 8 ++++- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 6e208ccac90..3fdd6048971 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -8,7 +8,7 @@ use std::{ use api_version::{test_kv_format_impl, KvFormat}; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, OldValueCache, Task, Validate}; +use cdc::{recv_timeout, Delegate, OldValueCache, Task, Validate}; use futures::{executor::block_on, sink::SinkExt}; use grpcio::{ChannelBuilder, Environment, WriteFlags}; use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient}; @@ -58,6 +58,12 @@ fn test_cdc_double_scan_deregister_impl() { new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx_1.send((req, WriteFlags::default()))).unwrap(); + // wait for the second connection register to the delegate. + suite.must_wait_delegate_condition( + 1, + Arc::new(|d: Option<&Delegate>| d.unwrap().downstreams().len() == 2), + ); + // close connection block_on(req_tx.close()).unwrap(); event_feed_wrap.replace(None); diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index c14a91de99a..9e6621ffbdf 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -1,9 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::*, time::Duration}; +use std::{ + sync::*, + time::{Duration, Instant}, +}; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, CdcObserver, FeatureGate, MemoryQuota, Task}; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; @@ -523,4 +526,33 @@ impl TestSuite { ) .unwrap(); } + + pub fn must_wait_delegate_condition( + &self, + region_id: u64, + cond: Arc) -> bool + Sync + Send>, + ) { + let scheduler = self.endpoints[®ion_id].scheduler(); + let start = Instant::now(); + loop { + sleep_ms(100); + let (tx, rx) = mpsc::sync_channel(1); + let c = cond.clone(); + let checker = move |d: Option<&Delegate>| { + tx.send(c(d)).unwrap(); + }; + scheduler + .schedule(Task::Validate(Validate::Region( + region_id, + Box::new(checker), + ))) + .unwrap(); + if rx.recv().unwrap() { + return; + } + if start.elapsed() > Duration::from_secs(5) { + panic!("wait delegate timeout"); + } + } + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e4f4dc83049..aa3a67591e2 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -232,6 +232,7 @@ struct TikvServer { concurrency_manager: ConcurrencyManager, env: Arc, background_worker: Worker, + check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -360,6 +361,10 @@ where info!("Causal timestamp provider startup."); } + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. 
+ let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + TikvServer { config, cfg_controller: Some(cfg_controller), @@ -381,6 +386,7 @@ where concurrency_manager, env, background_worker, + check_leader_worker, flow_info_sender: None, flow_info_receiver: None, sst_worker: None, @@ -870,7 +876,7 @@ where self.coprocessor_host.clone().unwrap(), ); let check_leader_scheduler = self - .background_worker + .check_leader_worker .start("check-leader", check_leader_runner); let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); From fca5a9ef2d80a88282de57f75e5243bd2cd14486 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 16 Nov 2022 09:45:55 +0800 Subject: [PATCH 336/676] storage: avoid repeating unnecessary checks on flashback keys (#13801) ref tikv/tikv#13800 - A tiny refactor to `flashback_to_version_read_write` to reduce the unnecessary checks on flashback keys. - Check the flashback state while validating the local read request to not let a flashback read phase request bypass. Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- .../raftstore/src/store/worker/metrics.rs | 2 + components/raftstore/src/store/worker/read.rs | 17 ++++ src/storage/mvcc/reader/reader.rs | 90 +++++++------------ .../txn/actions/flashback_to_version.rs | 71 ++++----------- .../flashback_to_version_read_phase.rs | 1 - .../integrations/raftstore/test_flashback.rs | 16 +++- 6 files changed, 86 insertions(+), 111 deletions(-) diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 0d396eae575..5861e27a508 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -57,6 +57,8 @@ make_static_metric! 
{ cache_miss, safe_ts, witness, + flashback_not_prepared, + flashback_in_progress, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 2c92923fc4e..0766a52a387 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -813,6 +813,23 @@ where return Ok(None); } + // Check whether the region is in the flashback state and the local read could + // be performed. + let is_in_flashback = delegate.region.is_in_flashback; + if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id) { + TLS_LOCAL_READ_METRICS.with(|m| match e { + Error::FlashbackNotPrepared(_) => { + m.borrow_mut().reject_reason.flashback_not_prepared.inc() + } + Error::FlashbackInProgress(_) => { + m.borrow_mut().reject_reason.flashback_in_progress.inc() + } + _ => unreachable!(), + }); + debug!("rejected by flashback state"; "is_in_flashback" => is_in_flashback, "tag" => &delegate.tag); + return Ok(None); + } + Ok(Some(delegate)) } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 2fe95c2c1dd..61975aa666c 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -536,9 +536,8 @@ impl MvccReader { /// specified, it will scan the latest version for each key, if the key /// does not exist or is not visible at that point, an `Option::None` will /// be placed. The return type is: - /// * `(Vec<(key, commit_ts, Option)>, has_remain)`. + /// * `(Vec<(key, Option)>, has_remain)`. /// - `key` is the encoded key without commit ts. - /// - `commit_ts` is the latest commit ts of the key. /// - `write` is the PUT/DELETE write record at the given version. /// - `has_remain` indicates whether there MAY be remaining writes that /// can be scanned. 
@@ -554,9 +553,9 @@ impl MvccReader { version: Option, filter: F, limit: usize, - ) -> Result<(Vec<(Key, TimeStamp, Option)>, bool)> + ) -> Result<(Vec<(Key, Option)>, bool)> where - F: Fn(&Key) -> bool, + F: Fn(&Key /* user key */, TimeStamp /* latest `commit_ts` */) -> bool, { self.create_write_cursor()?; let cursor = self.write_cursor.as_mut().unwrap(); @@ -581,14 +580,17 @@ impl MvccReader { } } let commit_ts = key.decode_ts()?; - let user_key = key.clone().truncate_ts()?; - // To make sure we only check each unique key once and `filter(&key)` returns + let user_key = key.truncate_ts()?; + // To make sure we only check each unique user key once and the filter returns // true. - if (cur_key.is_some() && cur_key.clone().unwrap() == user_key) || !filter(&key) { + let is_same_user_key = cur_key.as_ref() == Some(&user_key); + if !is_same_user_key { + cur_key = Some(user_key.clone()); + } + if is_same_user_key || !filter(&user_key, commit_ts) { cursor.next(&mut self.statistics.write); continue; } - cur_key = Some(user_key.clone()); let mut write = None; let version_key = user_key.clone().append_ts(version); @@ -623,7 +625,7 @@ impl MvccReader { } } } - key_writes.push((user_key, commit_ts, write)); + key_writes.push((user_key, write)); if limit > 0 && key_writes.len() == limit { has_remain = true; break; @@ -1824,7 +1826,7 @@ pub mod tests { end_key: Option, version: Option, limit: usize, - expect_res: Vec<(Key, TimeStamp, Option)>, + expect_res: Vec<(Key, Option)>, expect_is_remain: bool, } @@ -1838,7 +1840,6 @@ pub mod tests { expect_res: vec![ ( Key::from_raw(b"k0"), - 1000.into(), Some(Write::new( WriteType::Put, 999.into(), @@ -1847,17 +1848,14 @@ pub mod tests { ), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), ( Key::from_raw(b"k3"), - 9.into(), Some(Write::new(WriteType::Put, 8.into(), 
Some(b"v3@8".to_vec()))), ), ], @@ -1870,20 +1868,17 @@ pub mod tests { version: Some(9), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), ( Key::from_raw(b"k3"), - 9.into(), Some(Write::new(WriteType::Put, 8.into(), Some(b"v3@8".to_vec()))), ), ], @@ -1896,20 +1891,17 @@ pub mod tests { version: Some(8), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), ( Key::from_raw(b"k3"), - 9.into(), Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), ), ], @@ -1921,20 +1913,17 @@ pub mod tests { version: Some(7), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), ( Key::from_raw(b"k3"), - 9.into(), Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), ), ], @@ -1946,20 +1935,17 @@ pub mod tests { version: Some(6), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), ( Key::from_raw(b"k3"), - 9.into(), Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), ), ], @@ -1972,18 
+1958,16 @@ pub mod tests { version: Some(5), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: true, }, @@ -1993,18 +1977,16 @@ pub mod tests { version: Some(4), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), ), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: true, }, @@ -2015,18 +1997,16 @@ pub mod tests { version: Some(3), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), ), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: true, }, @@ -2036,18 +2016,16 @@ pub mod tests { version: Some(2), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), + (Key::from_raw(b"k0"), None), ( Key::from_raw(b"k1"), - 4.into(), Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), ), ( Key::from_raw(b"k2"), - 4.into(), Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), ), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: true, }, @@ -2058,10 +2036,10 @@ pub mod tests { version: Some(1), limit: 4, expect_res: vec![ - 
(Key::from_raw(b"k0"), 1000.into(), None), - (Key::from_raw(b"k1"), 4.into(), None), - (Key::from_raw(b"k2"), 4.into(), None), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k0"), None), + (Key::from_raw(b"k1"), None), + (Key::from_raw(b"k2"), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: true, }, @@ -2071,7 +2049,7 @@ pub mod tests { end_key: None, version: Some(0), limit: 1, - expect_res: vec![(Key::from_raw(b"k0"), 1000.into(), None)], + expect_res: vec![(Key::from_raw(b"k0"), None)], expect_is_remain: true, }, Case { @@ -2080,10 +2058,10 @@ pub mod tests { version: Some(0), limit: 5, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - (Key::from_raw(b"k1"), 4.into(), None), - (Key::from_raw(b"k2"), 4.into(), None), - (Key::from_raw(b"k3"), 9.into(), None), + (Key::from_raw(b"k0"), None), + (Key::from_raw(b"k1"), None), + (Key::from_raw(b"k2"), None), + (Key::from_raw(b"k3"), None), ], expect_is_remain: false, }, @@ -2097,7 +2075,7 @@ pub mod tests { case.start_key.as_ref(), case.end_key.as_ref(), case.version.map(Into::into), - |_| true, + |_, _| true, case.limit, ) .unwrap(); diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 02095d4b46d..98e2e433632 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -4,7 +4,7 @@ use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; use crate::storage::{ mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, - txn::{actions::check_txn_status::rollback_lock, Error, ErrorInner, Result as TxnResult}, + txn::{actions::check_txn_status::rollback_lock, Result as TxnResult}, Snapshot, Statistics, }; @@ -32,63 +32,29 @@ pub fn flashback_to_version_read_write( next_write_key: Key, end_key: &Key, flashback_version: TimeStamp, - flashback_start_ts: TimeStamp, flashback_commit_ts: TimeStamp, statistics: &mut Statistics, ) -> TxnResult)>> { // To flashback 
the data, we need to get all the latest keys first by scanning // every unique key in `CF_WRITE` and to get its corresponding old MVCC write // record if exists. - let mut key_old_writes = Vec::with_capacity(FLASHBACK_BATCH_SIZE); - let mut has_remain_writes = true; - let mut next_write_key = next_write_key; - // Try to read as many writes as possible in one batch. - while key_old_writes.len() < FLASHBACK_BATCH_SIZE && has_remain_writes { - let key_ts_old_writes; - (key_ts_old_writes, has_remain_writes) = reader.scan_writes( - Some(&next_write_key), - Some(end_key), - Some(flashback_version), - // No need to find an old version for the key if its latest `commit_ts` is smaller - // than or equal to the version. - |key| key.decode_ts().unwrap_or(TimeStamp::zero()) > flashback_version, - FLASHBACK_BATCH_SIZE - key_old_writes.len(), - )?; - statistics.add(&reader.statistics); - // If `has_remain_writes` is true, it means that the batch is full and we may - // need to read another round, so we have to update the `next_write_key` here. - if has_remain_writes { - next_write_key = key_ts_old_writes - .last() - .map(|(key, ..)| key.clone()) - .unwrap(); - } - // Check the latest commit ts to make sure there is no commit change during the - // flashback, otherwise, we need to abort the flashback. - for (key, commit_ts, old_write) in key_ts_old_writes.into_iter() { - if commit_ts > flashback_commit_ts { - return Err(Error::from(ErrorInner::InvalidTxnTso { - start_ts: flashback_start_ts, - commit_ts: flashback_commit_ts, - })); - } - // Although the first flashback preparation phase makes sure there will be no - // writes other than flashback after it, we CAN NOT return directly here. - // Suppose the second phase procedure contains two batches to flashback. 
After - // the first batch is committed, if the region is down, the client will retry - // the flashback from the very first beginning, because the data in the - // first batch has been written the flashbacked data with the same - // `commit_ts`, So we need to skip it to ensure the following data will - // be flashbacked continuously. - // And some large key modifications will exceed the max txn size limit - // through the execution, the write will forcibly finish the batch of data. - // So it may happen that part of the keys in a batch may be flashbacked. - if commit_ts == flashback_commit_ts { - continue; - } - key_old_writes.push((key, old_write)); - } - } + let result = reader.scan_writes( + Some(&next_write_key), + Some(end_key), + Some(flashback_version), + |_, latest_commit_ts| { + // There is no any other write could happen after the flashback begins. + assert!(latest_commit_ts <= flashback_commit_ts); + // - No need to find an old version for the key if its latest `commit_ts` is + // smaller than or equal to the flashback version. + // - No need to flashback a key twice if its latest `commit_ts` is equal to the + // flashback `commit_ts`. 
+ latest_commit_ts > flashback_version && latest_commit_ts < flashback_commit_ts + }, + FLASHBACK_BATCH_SIZE, + ); + statistics.add(&reader.statistics); + let (key_old_writes, _) = result?; Ok(key_old_writes) } @@ -226,7 +192,6 @@ pub mod tests { key, &next_key, version, - start_ts, commit_ts, &mut statistics, ) diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index cfc6856da9c..b1a83a49ff8 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -133,7 +133,6 @@ impl ReadCommand for FlashbackToVersionReadPhase { next_write_key, &self.end_key, self.version, - self.start_ts, self.commit_ts, statistics, )?; diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 5227e7ea6bc..7fff4dad606 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -258,7 +258,7 @@ fn test_flashback_for_local_read() { cluster.run(); cluster.must_put(b"k1", b"v1"); - let region = cluster.get_region(b"k1"); + let mut region = cluster.get_region(b"k1"); cluster.must_transfer_leader(region.get_id(), peer.clone()); // Check local read before prepare flashback @@ -318,6 +318,20 @@ fn test_flashback_for_local_read() { // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); + + // A local read with flashback flag will also be blocked. 
+ let mut req = new_request( + region.get_id(), + region.take_region_epoch(), + vec![new_get_cmd(b"k1")], + false, + ); + let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); + req.mut_header().set_peer(new_leader.unwrap()); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_flashback_not_prepared()); } #[test] From 2704588c6aaa1a269bb91499229e358d23cc636b Mon Sep 17 00:00:00 2001 From: Jarvis Date: Wed, 16 Nov 2022 14:51:55 +0800 Subject: [PATCH 337/676] Protect Raft Engine Disk Usage (#13633) close tikv/tikv#13642 Signed-off-by: Jarvis Zheng Signed-off-by: Jarvis Co-authored-by: Xinye Tao --- Cargo.lock | 14 +- components/engine_panic/src/raft_engine.rs | 4 + components/engine_rocks/src/raft_engine.rs | 4 + components/engine_traits/src/raft_engine.rs | 4 + components/raft_log_engine/src/engine.rs | 8 ++ components/server/src/server.rs | 139 ++++++++++++++++---- components/tikv_util/Cargo.toml | 1 + components/tikv_util/src/sys/disk.rs | 9 ++ components/tikv_util/src/sys/mod.rs | 62 +++++++++ etc/config-template.toml | 6 + src/storage/config.rs | 4 + tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 13 files changed, 227 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2622ed983f5..1722d0385e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3120,6 +3120,15 @@ dependencies = [ "tempdir", ] +[[package]] +name = "mnt" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" +dependencies = [ + "libc 0.2.132", +] + [[package]] name = "more-asserts" version = "0.2.1" @@ -4124,7 +4133,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = 
"git+https://github.com/tikv/raft-engine.git#a0d29980f1448565a6d03f911ebb103c4266f1f4" +source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" dependencies = [ "byteorder", "crc32fast", @@ -4158,7 +4167,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#a0d29980f1448565a6d03f911ebb103c4266f1f4" +source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" dependencies = [ "clap 3.1.6", "env_logger", @@ -6475,6 +6484,7 @@ dependencies = [ "libc 0.2.132", "log", "log_wrappers", + "mnt", "nix 0.24.1", "num-traits", "num_cpus", diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 75e0e68269d..ad05e66c6fa 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -144,6 +144,10 @@ impl RaftEngine for PanicEngine { panic!() } + fn get_engine_path(&self) -> &str { + panic!() + } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index b66a56caadf..da15b1708b8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -339,6 +339,10 @@ impl RaftEngine for RocksEngine { Ok(used_size) } + fn get_engine_path(&self) -> &str { + self.as_inner().path() + } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { self.put_msg(keys::STORE_IDENT_KEY, ident) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index b7a3f50699c..7df681c96d5 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -68,6 +68,7 @@ pub struct RaftLogGcTask { pub to: u64, } +// TODO: Refactor common methods between Kv and Raft engine into a shared trait. 
pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; @@ -140,6 +141,9 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn get_engine_size(&self) -> Result; + /// The path to the directory on the filesystem where the raft log is stored + fn get_engine_path(&self) -> &str; + /// Visit all available raft groups. /// /// If any error is returned, the iteration will stop. diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 35cacf620fc..a376adc25b7 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -305,6 +305,10 @@ impl RaftLogEngine { ))) } + pub fn path(&self) -> &str { + self.0.path() + } + /// If path is not an empty directory, we say db exists. pub fn exists(path: &str) -> bool { let path = Path::new(path); @@ -615,6 +619,10 @@ impl RaftEngine for RaftLogEngine { Ok(self.0.get_used_size() as u64) } + fn get_engine_path(&self) -> &str { + self.path() + } + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, diff --git a/components/server/src/server.rs b/components/server/src/server.rs index aa3a67591e2..a5fb3fefaf9 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -116,7 +116,10 @@ use tikv_util::{ math::MovingAvgU32, metrics::INSTANCE_BACKEND_CPU_QUOTA, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{cpu_time::ProcessStat, disk, register_memory_usage_high_water, SysQuota}, + sys::{ + cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, + SysQuota, + }, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, @@ -533,36 +536,66 @@ where // enough space to do compaction and region migration when TiKV recover. 
// This file is created in data_dir rather than db_path, because we must not // increase store size of db_path. + fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. + if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); let mut capacity = disk_stats.total_space(); if self.config.raft_store.capacity.0 > 0 { capacity = cmp::min(capacity, self.config.raft_store.capacity.0); } - let mut reserve_space = self.config.storage.reserve_space.0; - if self.config.storage.reserve_space.0 != 0 { - reserve_space = cmp::max( - (capacity as f64 * 0.05) as u64, - self.config.storage.reserve_space.0, - ); - } - disk::set_disk_reserved_space(reserve_space); - let path = - Path::new(&self.config.storage.data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(path) { - warn!("failed to remove space holder on starting: {}", e); - } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + 
disk_stats.available_space(), + kv_reserved_size, + ); - let available = disk_stats.available_space(); - // place holder file size is 20% of total reserved space. - if available > reserve_space { - file_system::reserve_space_for_recover( - &self.config.storage.data_dir, - reserve_space / 5, - ) - .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) - .unwrap(); + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir } else { - warn!("no enough disk space left to create the place holder file"); + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); } } @@ -1448,13 +1481,28 @@ where let store_path = self.store_path.clone(); let snap_mgr = self.snap_mgr.clone().unwrap(); let reserve_space = disk::get_disk_reserved_space(); - if reserve_space == 0 { + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { info!("disk space checker not enabled"); return; } + let raft_path = engines.raft.get_engine_path().to_string(); + let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; let almost_full_threshold = reserve_space; let already_full_threshold = reserve_space / 2; + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> 
disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } self.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { let disk_stats = match fs2::statvfs(&store_path) { @@ -1481,6 +1529,33 @@ where .get_engine_size() .expect("get raft engine size"); + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); @@ -1488,7 +1563,11 @@ where let placeholder_size: u64 = file_system::get_file_size(placeholer_file_path).unwrap_or(0); - let used_size = snap_size + kv_size + raft_size + placeholder_size; + let used_size = if !separated_raft_mount_path { + snap_size + kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { 
disk_cap } else { @@ -1499,18 +1578,22 @@ where available = cmp::min(available, disk_stats.available_space()); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. - let cur_disk_status = if available <= already_full_threshold { + let cur_kv_disk_status = if available <= already_full_threshold { disk::DiskUsage::AlreadyFull } else if available <= almost_full_threshold { disk::DiskUsage::AlmostFull } else { disk::DiskUsage::Normal }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( - "disk usage {:?}->{:?}, available={},snap={},kv={},raft={},capacity={}", + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", prev_disk_status, cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, available, snap_size, kv_size, diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 5ff65b33df3..36faa552804 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -31,6 +31,7 @@ lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } +mnt = "0.3.1" nix = "0.24" num-traits = "0.2" num_cpus = "1" diff --git a/components/tikv_util/src/sys/disk.rs b/components/tikv_util/src/sys/disk.rs index 3f2a60855ff..c8fe87a56b0 100644 --- a/components/tikv_util/src/sys/disk.rs +++ b/components/tikv_util/src/sys/disk.rs @@ -10,6 +10,7 @@ pub use kvproto::disk_usage::DiskUsage; // Percent is not configurable, But if you want to change, please make sure // the percent in both the init fs and store monitor are keep the same. 
static DISK_RESERVED_SPACE: AtomicU64 = AtomicU64::new(0); +static RAFT_DISK_RESERVED_SPACE: AtomicU64 = AtomicU64::new(0); static DISK_STATUS: AtomicI32 = AtomicI32::new(0); pub fn set_disk_reserved_space(v: u64) { @@ -20,6 +21,14 @@ pub fn get_disk_reserved_space() -> u64 { DISK_RESERVED_SPACE.load(Ordering::Acquire) } +pub fn set_raft_disk_reserved_space(v: u64) { + RAFT_DISK_RESERVED_SPACE.store(v, Ordering::Release) +} + +pub fn get_raft_disk_reserved_space() -> u64 { + RAFT_DISK_RESERVED_SPACE.load(Ordering::Acquire) +} + pub fn set_disk_status(status: DiskUsage) { let v = match status { DiskUsage::Normal => 0, diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index d17c821e995..dcc137f095c 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -9,11 +9,15 @@ pub mod ioload; pub mod thread; // re-export some traits for ease of use +#[cfg(target_os = "linux")] +use std::path::PathBuf; use std::sync::atomic::{AtomicU64, Ordering}; use fail::fail_point; #[cfg(target_os = "linux")] use lazy_static::lazy_static; +#[cfg(target_os = "linux")] +use mnt::get_mount; use sysinfo::RefreshKind; pub use sysinfo::{DiskExt, NetworkExt, ProcessExt, ProcessorExt, SystemExt}; @@ -156,3 +160,61 @@ pub fn cache_size(level: usize) -> Option { pub fn cache_line_size(level: usize) -> Option { read_size_in_cache(level, "coherency_line_size") } + +#[cfg(target_os = "linux")] +pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { + if path1.is_empty() || path2.is_empty() { + return false; + } + let path1 = PathBuf::from(path1); + let path2 = PathBuf::from(path2); + match (get_mount(&path1), get_mount(&path2)) { + (Err(e1), _) => { + warn!("Get mount point error for path {}, {}", path1.display(), e1); + false + } + (_, Err(e2)) => { + warn!("Get mount point error for path {}, {}", path2.display(), e2); + false + } + (Ok(None), _) => { + warn!("No mount point for {}", path1.display()); + false + } + 
(_, Ok(None)) => { + warn!("No mount point for {}", path2.display()); + false + } + (Ok(Some(mount1)), Ok(Some(mount2))) => mount1 != mount2, + } +} + +#[cfg(not(target_os = "linux"))] +pub fn path_in_diff_mount_point(_path1: &str, _path2: &str) -> bool { + return false; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[cfg(target_os = "linux")] + #[test] + fn test_path_in_diff_mount_point() { + let (empty_path1, path2) = ("", "/"); + let result = path_in_diff_mount_point(empty_path1, path2); + assert_eq!(result, false); + + let (no_mount_point_path, path2) = ("no_mount_point_path_w943nn", "/"); + let result = path_in_diff_mount_point(no_mount_point_path, path2); + assert_eq!(result, false); + + let (not_existed_path, path2) = ("/non_existed_path_eu2yndh", "/"); + let result = path_in_diff_mount_point(not_existed_path, path2); + assert_eq!(result, false); + + let (normal_path1, normal_path2) = ("/", "/"); + let result = path_in_diff_mount_point(normal_path1, normal_path2); + assert_eq!(result, false); + } +} diff --git a/etc/config-template.toml b/etc/config-template.toml index 92b6454ba29..a2b3ab13b00 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -253,6 +253,12 @@ ## Set it to 0 will cause no space is reserved at all. It's generally used for tests. # reserve-space = "5GB" +## Reserve some space for raft disk if raft disk is separated deployed with kv disk. +## `max(reserve-raft-space, raft disk capacity * 5%)` will be reserved exactly. +## +## Set it to 0 will cause no space is reserved at all. It's generally used for tests. +# reserve-raft-space = "1GB" + ## The maximum recovery time after rocksdb detects restorable background errors. When the data belonging ## to the data range is damaged, it will be reported to PD through heartbeat, and PD will add `remove-peer` ## operator to remove this damaged peer. 
When the damaged peer still exists in the current store, the diff --git a/src/storage/config.rs b/src/storage/config.rs index 685272dbeee..313f86ba048 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -29,6 +29,7 @@ const MAX_SCHED_CONCURRENCY: usize = 2 * 1024 * 1024; const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; +const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] @@ -50,6 +51,8 @@ pub struct Config { // Reserve disk space to make tikv would have enough space to compact when disk is full. pub reserve_space: ReadableSize, #[online_config(skip)] + pub reserve_raft_space: ReadableSize, + #[online_config(skip)] pub enable_async_apply_prewrite: bool, #[online_config(skip)] pub api_version: u8, @@ -82,6 +85,7 @@ impl Default for Config { }, scheduler_pending_write_threshold: ReadableSize::mb(DEFAULT_SCHED_PENDING_WRITE_MB), reserve_space: ReadableSize::gb(DEFAULT_RESERVED_SPACE_GB), + reserve_raft_space: ReadableSize::gb(DEFAULT_RESERVED_RAFT_SPACE_GB), enable_async_apply_prewrite: false, api_version: 1, enable_ttl: false, diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 5cb8c837fb1..93c07f2f411 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -671,6 +671,7 @@ fn test_serde_custom_tikv_config() { scheduler_worker_pool_size: 1, scheduler_pending_write_threshold: ReadableSize::kb(123), reserve_space: ReadableSize::gb(10), + reserve_raft_space: ReadableSize::gb(2), enable_async_apply_prewrite: true, api_version: 1, enable_ttl: true, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index a041b696158..e5c896238bc 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -97,6 +97,7 @@ scheduler-worker-pool-size = 1 scheduler-pending-write-threshold = 
"123KB" enable-async-apply-prewrite = true reserve-space = "10GB" +reserve-raft-space = "2GB" enable-ttl = true ttl-check-poll-interval = "0s" From 616b4402192b4d092f8d6727f5fe95f133e85bca Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Wed, 16 Nov 2022 17:13:56 +0800 Subject: [PATCH 338/676] txn: Add batch-resumed mode for acquire_pessimistic_lock storage command (#13687) ref tikv/tikv#13298 Add batch-resumed mode for acquire_pessimistic_lock storage command. Now the storage command `AcquirePessimisticLock` contains an enum to determine whether it's executing a normal request or it's a batch of requests resumed after waiting for another lock. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- components/txn_types/src/lib.rs | 4 +- components/txn_types/src/types.rs | 13 + src/storage/lock_manager/lock_wait_context.rs | 1 + .../lock_manager/lock_waiting_queue.rs | 5 + src/storage/metrics.rs | 1 + src/storage/mod.rs | 30 +- src/storage/mvcc/txn.rs | 4 + .../txn/commands/acquire_pessimistic_lock.rs | 178 ++++---- .../acquire_pessimistic_lock_resumed.rs | 414 ++++++++++++++++++ src/storage/txn/commands/atomic_store.rs | 2 +- .../txn/commands/check_secondary_locks.rs | 2 +- src/storage/txn/commands/check_txn_status.rs | 2 +- src/storage/txn/commands/cleanup.rs | 2 +- src/storage/txn/commands/commit.rs | 2 +- src/storage/txn/commands/compare_and_swap.rs | 2 +- .../txn/commands/flashback_to_version.rs | 2 +- src/storage/txn/commands/mod.rs | 34 +- src/storage/txn/commands/pause.rs | 2 +- .../txn/commands/pessimistic_rollback.rs | 2 +- src/storage/txn/commands/prewrite.rs | 21 +- src/storage/txn/commands/resolve_lock.rs | 2 +- src/storage/txn/commands/resolve_lock_lite.rs | 2 +- src/storage/txn/commands/rollback.rs | 2 +- src/storage/txn/commands/txn_heart_beat.rs | 2 +- src/storage/txn/latch.rs | 15 +- src/storage/txn/mod.rs | 1 + src/storage/txn/scheduler.rs | 11 +- src/storage/types.rs | 2 + 
tests/failpoints/cases/test_storage.rs | 6 +- 29 files changed, 621 insertions(+), 145 deletions(-) create mode 100644 src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs diff --git a/components/txn_types/src/lib.rs b/components/txn_types/src/lib.rs index edd89256d2b..a1a759b21b9 100644 --- a/components/txn_types/src/lib.rs +++ b/components/txn_types/src/lib.rs @@ -19,8 +19,8 @@ pub use lock::{Lock, LockType, PessimisticLock}; use thiserror::Error; pub use timestamp::{TimeStamp, TsSet, TSO_PHYSICAL_SHIFT_BITS}; pub use types::{ - is_short_value, Key, KvPair, Mutation, MutationType, OldValue, OldValues, TxnExtra, - TxnExtraScheduler, Value, WriteBatchFlags, SHORT_VALUE_MAX_LEN, + insert_old_value_if_resolved, is_short_value, Key, KvPair, Mutation, MutationType, OldValue, + OldValues, TxnExtra, TxnExtraScheduler, Value, WriteBatchFlags, SHORT_VALUE_MAX_LEN, }; pub use write::{Write, WriteRef, WriteType}; diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 01133a71924..6a2c953afc1 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -512,6 +512,19 @@ impl OldValue { // MutationType is the type of mutation of the current write. pub type OldValues = HashMap)>; +pub fn insert_old_value_if_resolved( + old_values: &mut OldValues, + key: Key, + start_ts: TimeStamp, + old_value: OldValue, + mutation_type: Option, +) { + if old_value.resolved() { + let key = key.append_ts(start_ts); + old_values.insert(key, (old_value, mutation_type)); + } +} + // Extra data fields filled by kvrpcpb::ExtraOp. 
#[derive(Default, Debug, Clone)] pub struct TxnExtra { diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 7749ee983cb..1d53bdc38ea 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -258,6 +258,7 @@ mod tests { for_update_ts: 1.into(), ..Default::default() }, + should_not_exist: false, lock_wait_token: token, legacy_wake_up_index: None, key_cb: None, diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index d3fb58b2a94..4069bab5643 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -93,6 +93,9 @@ pub struct LockWaitEntry { pub key: Key, pub lock_hash: u64, pub parameters: PessimisticLockParameters, + // `parameters` provides parameter for a request, but `should_not_exist` is specified key-wise. + // Put it in a separated field. + pub should_not_exist: bool, pub lock_wait_token: LockWaitToken, pub legacy_wake_up_index: Option, pub key_cb: Option>, @@ -687,6 +690,7 @@ mod tests { min_commit_ts: 0.into(), check_existence: false, is_first_lock: false, + lock_only_if_exists: false, allow_lock_with_conflict: false, }; @@ -697,6 +701,7 @@ mod tests { key, lock_hash, parameters, + should_not_exist: false, lock_wait_token: token, legacy_wake_up_index: None, key_cb: Some(SyncWrapper::new(Box::new(move |res| tx.send(res).unwrap()))), diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 2bbe4b7b762..e84a7dfb4e9 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -126,6 +126,7 @@ make_auto_flush_static_metric! 
{ batch_get_command, prewrite, acquire_pessimistic_lock, + acquire_pessimistic_lock_resumed, commit, cleanup, rollback, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 8b835bcfafd..2032ffd86ae 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -96,7 +96,7 @@ use tikv_util::{ use tracker::{ clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, }; -use txn_types::{Key, KvPair, Lock, LockType, OldValues, TimeStamp, TsSet, Value}; +use txn_types::{Key, KvPair, Lock, LockType, TimeStamp, TsSet, Value}; pub use self::{ errors::{get_error_kind_from_header, get_tag_from_header, Error, ErrorHeaderKind, ErrorInner}, @@ -1416,7 +1416,7 @@ impl Storage { callback: Callback, ) -> Result<()> { use crate::storage::txn::commands::{ - AcquirePessimisticLock, Prewrite, PrewritePessimistic, + AcquirePessimisticLock, AcquirePessimisticLockResumed, Prewrite, PrewritePessimistic, }; let cmd: Command = cmd.into(); @@ -1452,6 +1452,18 @@ impl Storage { )?; check_key_size!(keys, self.max_key_size, callback); } + Command::AcquirePessimisticLockResumed(AcquirePessimisticLockResumed { + items, .. 
+ }) => { + let keys = items.iter().map(|item| item.key.as_encoded()); + Self::check_api_version( + self.api_version, + cmd.ctx().api_version, + CommandKind::acquire_pessimistic_lock_resumed, + keys.clone(), + )?; + check_key_size!(keys, self.max_key_size, callback); + } _ => {} } with_tls_tracker(|tracker| { @@ -3341,9 +3353,9 @@ pub mod test_util { Some(WaitTimeout::Default), return_values, for_update_ts.next(), - OldValues::default(), check_existence, false, + false, Context::default(), ) } @@ -8193,7 +8205,7 @@ mod tests { Some(WaitTimeout::Millis(100)), false, 21.into(), - OldValues::default(), + false, false, false, Context::default(), @@ -8285,7 +8297,7 @@ mod tests { Some(WaitTimeout::Millis(5000)), false, (lock_ts + 1).into(), - OldValues::default(), + false, false, false, Context::default(), @@ -8870,7 +8882,7 @@ mod tests { None, false, 0.into(), - OldValues::default(), + false, false, false, Default::default(), @@ -8893,7 +8905,7 @@ mod tests { None, false, 0.into(), - OldValues::default(), + false, false, false, Default::default(), @@ -9123,7 +9135,7 @@ mod tests { None, false, TimeStamp::new(12), - OldValues::default(), + false, false, false, Context::default(), @@ -9149,7 +9161,7 @@ mod tests { None, false, TimeStamp::new(12), - OldValues::default(), + false, false, false, Context::default(), diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 66aa769d462..4cc0ab57ffb 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -103,6 +103,10 @@ impl MvccTxn { self.write_size } + pub fn is_empty(&self) -> bool { + self.modifies.len() == 0 && self.locks_for_1pc.len() == 0 + } + pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock) { let write = Modify::Put(CF_LOCK, key, lock.to_bytes()); self.write_size += write.size(); diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 69a5179ab84..6bd147cf02e 100644 --- 
a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -1,8 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use kvproto::kvrpcpb::{ExtraOp, LockInfo}; -use txn_types::{Key, OldValues, TimeStamp, TxnExtra}; +use kvproto::kvrpcpb::ExtraOp; +use tikv_kv::Modify; +use txn_types::{insert_old_value_if_resolved, Key, OldValues, TimeStamp, TxnExtra}; use crate::storage::{ kv::WriteData, @@ -17,7 +18,7 @@ use crate::storage::{ Error, ErrorInner, Result, }, types::{PessimisticLockParameters, PessimisticLockResults}, - Error as StorageError, ErrorInner as StorageErrorInner, ProcessResult, Result as StorageResult, + Error as StorageError, PessimisticLockKeyResult, ProcessResult, Result as StorageResult, Snapshot, }; @@ -46,9 +47,9 @@ command! { /// later read in the same transaction. return_values: bool, min_commit_ts: TimeStamp, - old_values: OldValues, check_existence: bool, lock_only_if_exists: bool, + allow_lock_with_conflict: bool, } } @@ -69,17 +70,15 @@ impl CommandExt for AcquirePessimisticLock { gen_lock!(keys: multiple(|x| &x.0)); } -fn extract_lock_info_from_result(res: &StorageResult) -> &LockInfo { - match res { - Err(StorageError(box StorageErrorInner::Txn(Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::KeyIsLocked(info), - )))))) => info, - _ => panic!("unexpected mvcc error"), - } -} - impl WriteCommand for AcquirePessimisticLock { - fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { + fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { + if self.allow_lock_with_conflict && self.keys.len() > 1 { + // Currently multiple keys with `allow_lock_with_conflict` set is not supported. 
+ return Err(Error::from(ErrorInner::Other(box_err!( + "multiple keys in a single request with allowed_lock_with_conflict set is not allowed" + )))); + } + let (start_ts, ctx, keys) = (self.start_ts, self.ctx, self.keys); let mut txn = MvccTxn::new(start_ts, context.concurrency_manager); let mut reader = ReaderWithStats::new( @@ -87,9 +86,11 @@ impl WriteCommand for AcquirePessimisticLock context.statistics, ); - let rows = keys.len(); - let mut res = Ok(PessimisticLockResults::with_capacity(rows)); + let total_keys = keys.len(); + let mut res = PessimisticLockResults::with_capacity(total_keys); + let mut encountered_locks = vec![]; let need_old_value = context.extra_op == ExtraOp::ReadOldValue; + let mut old_values = OldValues::default(); for (k, should_not_exist) in keys { match acquire_pessimistic_lock( &mut txn, @@ -104,62 +105,79 @@ impl WriteCommand for AcquirePessimisticLock self.min_commit_ts, need_old_value, self.lock_only_if_exists, - false, + self.allow_lock_with_conflict, ) { Ok((key_res, old_value)) => { - res.as_mut().unwrap().push(key_res); - if old_value.resolved() { - let key = k.append_ts(txn.start_ts); - // MutationType is unknown in AcquirePessimisticLock stage. - let mutation_type = None; - self.old_values.insert(key, (old_value, mutation_type)); - } + res.push(key_res); + // MutationType is unknown in AcquirePessimisticLock stage. + insert_old_value_if_resolved(&mut old_values, k, txn.start_ts, old_value, None); } - Err(e @ MvccError(box MvccErrorInner::KeyIsLocked { .. 
})) => { - res = Err(e).map_err(Error::from).map_err(StorageError::from); + Err(MvccError(box MvccErrorInner::KeyIsLocked(lock_info))) => { + let request_parameters = PessimisticLockParameters { + pb_ctx: ctx.clone(), + primary: self.primary.clone(), + start_ts, + lock_ttl: self.lock_ttl, + for_update_ts: self.for_update_ts, + wait_timeout: self.wait_timeout, + return_values: self.return_values, + min_commit_ts: self.min_commit_ts, + check_existence: self.check_existence, + is_first_lock: self.is_first_lock, + lock_only_if_exists: self.lock_only_if_exists, + allow_lock_with_conflict: self.allow_lock_with_conflict, + }; + let lock_info = WriteResultLockInfo::new( + lock_info, + request_parameters, + k, + should_not_exist, + ); + encountered_locks.push(lock_info); + // Do not lock previously succeeded keys. + txn.clear(); + res.0.clear(); + res.push(PessimisticLockKeyResult::Waiting); break; } Err(e) => return Err(Error::from(e)), } } - // no conflict - let (pr, to_be_write, rows, ctx, lock_info) = if res.is_ok() { - let pr = ProcessResult::PessimisticLockRes { res }; - let extra = TxnExtra { - old_values: self.old_values, - // One pc status is unkown AcquirePessimisticLock stage. 
- one_pc: false, - for_flashback: false, - }; - let write_data = WriteData::new(txn.into_modifies(), extra); - (pr, write_data, rows, ctx, None) - } else { - let request_parameters = PessimisticLockParameters { - pb_ctx: ctx.clone(), - primary: self.primary.clone(), - start_ts: self.start_ts, - lock_ttl: self.lock_ttl, - for_update_ts: self.for_update_ts, - wait_timeout: self.wait_timeout, - return_values: self.return_values, - min_commit_ts: self.min_commit_ts, - check_existence: self.check_existence, - is_first_lock: self.is_first_lock, - allow_lock_with_conflict: false, - }; - let lock_info_pb = extract_lock_info_from_result(&res); - let lock_info = WriteResultLockInfo::new(lock_info_pb.clone(), request_parameters); - let pr = ProcessResult::PessimisticLockRes { res }; - // Wait for lock released - (pr, WriteData::default(), 0, ctx, Some(lock_info)) - }; + let modifies = txn.into_modifies(); + + let mut res = Ok(res); + + // If encountered lock and `wait_timeout` is `None` (which means no wait), + // return error directly here. + if !encountered_locks.is_empty() && self.wait_timeout.is_none() { + // Mind the difference of the protocols of legacy requests and resumable + // requests. For resumable requests (allow_lock_with_conflict == + // true), key errors are considered key by key instead of for the + // whole request. 
+ let lock_info = encountered_locks.drain(..).next().unwrap().lock_info_pb; + let err = StorageError::from(Error::from(MvccError::from( + MvccErrorInner::KeyIsLocked(lock_info), + ))); + if self.allow_lock_with_conflict { + res.as_mut().unwrap().0[0] = PessimisticLockKeyResult::Failed(err.into()) + } else { + res = Err(err) + } + } + + let rows = if res.is_ok() { total_keys } else { 0 }; + + let pr = ProcessResult::PessimisticLockRes { res }; + + let to_be_write = make_write_data(modifies, old_values); + Ok(WriteResult { ctx, to_be_write, rows, pr, - lock_info, + lock_info: encountered_locks, released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, @@ -167,38 +185,16 @@ impl WriteCommand for AcquirePessimisticLock } } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_gen_lock_info_from_result() { - let raw_key = b"key".to_vec(); - let key = Key::from_raw(&raw_key); - let ts = 100; - let is_first_lock = true; - let wait_timeout = WaitTimeout::from_encoded(200); - - let mut info = LockInfo::default(); - info.set_key(raw_key.clone()); - info.set_lock_version(ts); - info.set_lock_ttl(100); - let case = StorageError::from(StorageErrorInner::Txn(Error::from(ErrorInner::Mvcc( - MvccError::from(MvccErrorInner::KeyIsLocked(info.clone())), - )))); - let lock_info = WriteResultLockInfo::new( - extract_lock_info_from_result::<()>(&Err(case)).clone(), - PessimisticLockParameters { - is_first_lock, - wait_timeout, - ..Default::default() - }, - ); - assert_eq!(lock_info.lock_digest.ts, ts.into()); - assert_eq!(lock_info.lock_digest.hash, key.gen_hash()); - assert_eq!(lock_info.key.into_raw().unwrap(), raw_key); - assert_eq!(lock_info.parameters.is_first_lock, is_first_lock); - assert_eq!(lock_info.parameters.wait_timeout, wait_timeout); - assert_eq!(lock_info.lock_info_pb, info); +pub(super) fn make_write_data(modifies: Vec, old_values: OldValues) -> WriteData { + if !modifies.is_empty() { + let extra = TxnExtra { + 
old_values, + // One pc status is unknown in AcquirePessimisticLock stage. + one_pc: false, + for_flashback: false, + }; + WriteData::new(modifies, extra) + } else { + WriteData::default() } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs new file mode 100644 index 00000000000..3a35fe6d1a7 --- /dev/null +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -0,0 +1,414 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +// #[PerformanceCriticalPath] +use kvproto::kvrpcpb::ExtraOp; +use txn_types::{insert_old_value_if_resolved, Key, OldValues}; + +use crate::storage::{ + lock_manager::{lock_waiting_queue::LockWaitEntry, LockManager, LockWaitToken}, + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, SnapshotReader}, + txn::{ + acquire_pessimistic_lock, + commands::{ + acquire_pessimistic_lock::make_write_data, Command, CommandExt, ReleasedLocks, + ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, + WriteResultLockInfo, + }, + Error, Result, + }, + types::{PessimisticLockParameters, PessimisticLockResults}, + Error as StorageError, PessimisticLockKeyResult, ProcessResult, Result as StorageResult, + Snapshot, +}; + +#[derive(Debug)] +pub struct ResumedPessimisticLockItem { + pub key: Key, + pub should_not_exist: bool, + pub params: PessimisticLockParameters, + pub lock_wait_token: LockWaitToken, +} + +command! { + /// Acquire a Pessimistic lock on the keys. + /// + /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. 
+ AcquirePessimisticLockResumed: + cmd_ty => StorageResult, + display => "kv::command::acquirepessimisticlockresumed {:?}", + (items), + content => { + items: Vec, + } +} + +impl CommandExt for AcquirePessimisticLockResumed { + ctx!(); + tag!(acquire_pessimistic_lock_resumed); + request_type!(KvPessimisticLock); + + property!(can_be_pipelined); + + fn write_bytes(&self) -> usize { + self.items + .iter() + .map(|item| item.key.as_encoded().len()) + .sum() + } + + gen_lock!(items: multiple(|x| &x.key)); +} + +impl WriteCommand for AcquirePessimisticLockResumed { + fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { + let mut modifies = vec![]; + let mut txn = None; + let mut reader: Option> = None; + + let total_keys = self.items.len(); + let mut res = PessimisticLockResults::with_capacity(total_keys); + let mut encountered_locks = vec![]; + let need_old_value = context.extra_op == ExtraOp::ReadOldValue; + let mut old_values = OldValues::default(); + + let mut new_locked_keys = Vec::with_capacity(total_keys); + + for item in self.items.into_iter() { + let ResumedPessimisticLockItem { + key, + should_not_exist, + params, + lock_wait_token, + } = item; + + // TODO: Refine the code for rebuilding txn state. + if txn + .as_ref() + .map_or(true, |t: &MvccTxn| t.start_ts != params.start_ts) + { + if let Some(prev_txn) = txn.replace(MvccTxn::new( + params.start_ts, + context.concurrency_manager.clone(), + )) { + modifies.extend(prev_txn.into_modifies()); + } + // TODO: Is it possible to reuse the same reader but change the start_ts stored + // in it? 
+ if let Some(mut prev_reader) = reader.replace(SnapshotReader::new_with_ctx( + params.start_ts, + snapshot.clone(), + &self.ctx, + )) { + context.statistics.add(&prev_reader.take_statistics()); + } + } + let txn = txn.as_mut().unwrap(); + let reader = reader.as_mut().unwrap(); + + match acquire_pessimistic_lock( + txn, + reader, + key.clone(), + ¶ms.primary, + should_not_exist, + params.lock_ttl, + params.for_update_ts, + params.return_values, + params.check_existence, + params.min_commit_ts, + need_old_value, + params.lock_only_if_exists, + true, + ) { + Ok((key_res, old_value)) => { + res.push(key_res); + new_locked_keys.push((params.start_ts, key.clone())); + + insert_old_value_if_resolved( + &mut old_values, + key, + params.start_ts, + old_value, + None, + ); + } + Err(MvccError(box MvccErrorInner::KeyIsLocked(lock_info))) => { + let mut lock_info = + WriteResultLockInfo::new(lock_info, params, key, should_not_exist); + lock_info.lock_wait_token = lock_wait_token; + res.push(PessimisticLockKeyResult::Waiting); + encountered_locks.push(lock_info); + } + Err(e) => { + res.push(PessimisticLockKeyResult::Failed( + StorageError::from(Error::from(e)).into(), + )); + } + }; + } + + if let Some(txn) = txn { + if !txn.is_empty() { + modifies.extend(txn.into_modifies()); + } + } + if let Some(mut reader) = reader { + context.statistics.add(&reader.take_statistics()); + } + + let pr = ProcessResult::PessimisticLockRes { res: Ok(res) }; + let to_be_write = make_write_data(modifies, old_values); + + Ok(WriteResult { + ctx: self.ctx, + to_be_write, + rows: total_keys, + pr, + lock_info: encountered_locks, + released_locks: ReleasedLocks::new(), + lock_guards: vec![], + response_policy: ResponsePolicy::OnProposed, + }) + } +} + +impl AcquirePessimisticLockResumed { + pub fn from_lock_wait_entries( + lock_wait_entries: impl IntoIterator>, + ) -> TypedCommand> { + let items: Vec<_> = lock_wait_entries + .into_iter() + .map(|item| { + assert!(item.key_cb.is_none()); + 
ResumedPessimisticLockItem { + key: item.key, + should_not_exist: item.should_not_exist, + params: item.parameters, + lock_wait_token: item.lock_wait_token, + } + }) + .collect(); + + assert!(!items.is_empty()); + let ctx = items[0].params.pb_ctx.clone(); + // TODO: May it cause problem by using the first one as the pb_ctx of the + // Command? + Self::new(items, ctx) + } +} + +#[cfg(test)] +mod tests { + use concurrency_manager::ConcurrencyManager; + use kvproto::kvrpcpb::Context; + use rand::random; + use tikv_kv::Engine; + use txn_types::TimeStamp; + + use super::*; + use crate::storage::{ + lock_manager::{MockLockManager, WaitTimeout}, + mvcc::tests::{must_locked, write}, + txn::{ + commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, + tests::{must_commit, must_pessimistic_locked, must_prewrite_put, must_rollback}, + }, + TestEngineBuilder, + }; + + #[allow(clippy::vec_box)] + fn must_success( + engine: &mut E, + lock_wait_entries: Vec>, + ) -> PessimisticLockResults { + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + + let items_info: Vec<_> = lock_wait_entries + .iter() + .map(|item| { + ( + item.lock_wait_token, + item.key.clone(), + item.parameters.clone(), + item.should_not_exist, + ) + }) + .collect(); + + let command = AcquirePessimisticLockResumed::from_lock_wait_entries(lock_wait_entries).cmd; + let result = command + .process_write( + snapshot, + WriteContext { + lock_mgr: &MockLockManager::new(), + concurrency_manager: cm, + extra_op: Default::default(), + statistics: &mut Default::default(), + async_apply_prewrite: false, + raw_ext: None, + }, + ) + .unwrap(); + let res = if let ProcessResult::PessimisticLockRes { res } = result.pr { + res.unwrap() + } else { + panic!("unexpected process result: {:?}", result.pr); + }; + + // Check correctness of returned lock info. 
+ let mut lock_info_index = 0; + for (i, res) in res.0.iter().enumerate() { + if let PessimisticLockKeyResult::Waiting = res { + let (token, key, params, should_not_exist) = &items_info[i]; + let lock_info: &WriteResultLockInfo = &result.lock_info[lock_info_index]; + lock_info_index += 1; + + assert_eq!(lock_info.lock_wait_token, *token); + assert_eq!(&lock_info.key, key); + assert_eq!(&lock_info.parameters, params); + assert_eq!(lock_info.should_not_exist, *should_not_exist); + } + } + assert_eq!(lock_info_index, result.lock_info.len()); + + write(engine, &ctx, result.to_be_write.modifies); + res + } + + fn make_lock_waiting( + key: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + return_values: bool, + check_existence: bool, + ) -> Box { + let start_ts = start_ts.into(); + let for_update_ts = for_update_ts.into(); + assert!(for_update_ts >= start_ts); + let parameters = PessimisticLockParameters { + pb_ctx: Context::default(), + primary: key.to_vec(), + start_ts, + lock_ttl: 1000, + for_update_ts, + wait_timeout: Some(WaitTimeout::Millis(1000)), + return_values, + min_commit_ts: for_update_ts.next(), + check_existence, + is_first_lock: false, + lock_only_if_exists: false, + allow_lock_with_conflict: true, + }; + + let key = Key::from_raw(key); + let lock_hash = key.gen_hash(); + let entry = LockWaitEntry { + key, + lock_hash, + parameters, + should_not_exist: false, + lock_wait_token: LockWaitToken(Some(random())), + legacy_wake_up_index: Some(0), + key_cb: None, + }; + + Box::new(entry) + } + + #[test] + fn test_acquire_pessimistic_lock_resumed() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let res = must_success( + &mut engine, + vec![make_lock_waiting(b"k1", 10, 15, false, false)], + ); + assert_eq!(res.0.len(), 1); + res.0[0].assert_empty(); + must_pessimistic_locked(&mut engine, b"k1", 10, 15); + must_pessimistic_rollback(&mut engine, b"k1", 10, 15); + + let res = must_success( + &mut engine, + vec![ + 
make_lock_waiting(b"k1", 20, 25, false, false), + make_lock_waiting(b"k2", 20, 25, false, false), + make_lock_waiting(b"k3", 21, 26, false, false), + ], + ); + assert_eq!(res.0.len(), 3); + res.0.iter().for_each(|x| x.assert_empty()); + must_pessimistic_locked(&mut engine, b"k1", 20, 25); + must_pessimistic_locked(&mut engine, b"k2", 20, 25); + must_pessimistic_locked(&mut engine, b"k3", 21, 26); + + must_pessimistic_rollback(&mut engine, b"k1", 20, 25); + must_pessimistic_rollback(&mut engine, b"k2", 20, 25); + must_pessimistic_rollback(&mut engine, b"k3", 21, 26); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 30); + must_commit(&mut engine, b"k1", 30, 35); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k1", 30); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 28); + must_commit(&mut engine, b"k3", 28, 29); + let res = must_success( + &mut engine, + vec![ + make_lock_waiting(b"k1", 31, 31, false, false), + make_lock_waiting(b"k2", 32, 32, false, false), + make_lock_waiting(b"k3", 33, 33, true, false), + make_lock_waiting(b"k4", 34, 34, false, true), + make_lock_waiting(b"k5", 35, 35, false, false), + ], + ); + assert_eq!(res.0.len(), 5); + res.0[0].assert_locked_with_conflict(Some(b"v1"), 35); + res.0[1].assert_waiting(); + res.0[2].assert_value(Some(b"v3")); + res.0[3].assert_existence(false); + res.0[4].assert_empty(); + must_pessimistic_locked(&mut engine, b"k1", 31, 35); + must_locked(&mut engine, b"k2", 30); + must_pessimistic_locked(&mut engine, b"k3", 33, 33); + must_pessimistic_locked(&mut engine, b"k4", 34, 34); + must_pessimistic_locked(&mut engine, b"k5", 35, 35); + + must_pessimistic_rollback(&mut engine, b"k1", 31, 35); + must_pessimistic_rollback(&mut engine, b"k3", 33, 33); + must_pessimistic_rollback(&mut engine, b"k4", 34, 34); + must_pessimistic_rollback(&mut engine, b"k5", 35, 35); + + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 40); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k4", 40); + let res = must_success( + 
&mut engine, + vec![ + make_lock_waiting(b"k1", 41, 41, false, false), + make_lock_waiting(b"k2", 41, 41, false, false), + make_lock_waiting(b"k3", 42, 42, false, false), + make_lock_waiting(b"k4", 42, 42, false, false), + make_lock_waiting(b"k5", 43, 43, false, false), + make_lock_waiting(b"k6", 43, 43, false, false), + ], + ); + assert_eq!(res.0.len(), 6); + for &i in &[0, 2, 4] { + res.0[i].assert_empty(); + } + for &i in &[1, 3, 5] { + res.0[i].assert_waiting(); + } + must_pessimistic_locked(&mut engine, b"k1", 41, 41); + must_pessimistic_locked(&mut engine, b"k3", 42, 42); + must_pessimistic_locked(&mut engine, b"k5", 43, 43); + + must_pessimistic_rollback(&mut engine, b"k1", 41, 41); + must_rollback(&mut engine, b"k2", 30, false); + must_pessimistic_rollback(&mut engine, b"k3", 43, 43); + must_rollback(&mut engine, b"k2", 40, false); + must_pessimistic_rollback(&mut engine, b"k5", 45, 45); + must_rollback(&mut engine, b"k2", 40, false); + } +} diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index b935d991eea..1df5c5b2cf8 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -58,7 +58,7 @@ impl WriteCommand for RawAtomicStore { to_be_write, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index bd494e91edc..71adda7a274 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -158,7 +158,7 @@ impl WriteCommand for CheckSecondaryLocks { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git 
a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index b74e7d5cb7c..a118769a5db 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -132,7 +132,7 @@ impl WriteCommand for CheckTxnStatus { to_be_write: write_data, rows: 1, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 0b82432e3cd..a6c529420d3 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -74,7 +74,7 @@ impl WriteCommand for Cleanup { to_be_write: write_data, rows: 1, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 86e1f541306..910b7832ed1 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -74,7 +74,7 @@ impl WriteCommand for Commit { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 2fff0620b27..943fc6f69d1 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -112,7 +112,7 @@ impl WriteCommand for RawCompareAndSwap { to_be_write, rows, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards, response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index f20fd957ed7..dabb6acfcc5 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ 
b/src/storage/txn/commands/flashback_to_version.rs @@ -124,7 +124,7 @@ impl WriteCommand for FlashbackToVersion { cmd: Command::FlashbackToVersionReadPhase(next_cmd), } })(), - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index fc044a9fa78..2de3687d18d 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -5,6 +5,7 @@ #[macro_use] mod macros; pub(crate) mod acquire_pessimistic_lock; +pub(crate) mod acquire_pessimistic_lock_resumed; pub(crate) mod atomic_store; pub(crate) mod check_secondary_locks; pub(crate) mod check_txn_status; @@ -32,6 +33,7 @@ use std::{ }; pub use acquire_pessimistic_lock::AcquirePessimisticLock; +pub use acquire_pessimistic_lock_resumed::AcquirePessimisticLockResumed; pub use atomic_store::RawAtomicStore; pub use check_secondary_locks::CheckSecondaryLocks; pub use check_txn_status::CheckTxnStatus; @@ -56,11 +58,11 @@ pub use rollback::Rollback; use tikv_util::deadline::Deadline; use tracker::RequestType; pub use txn_heart_beat::TxnHeartBeat; -use txn_types::{Key, OldValues, TimeStamp, Value, Write}; +use txn_types::{Key, TimeStamp, Value, Write}; use crate::storage::{ kv::WriteData, - lock_manager::{self, LockManager, WaitTimeout}, + lock_manager::{self, LockManager, LockWaitToken, WaitTimeout}, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, txn::{latch, ProcessResult, Result}, @@ -83,6 +85,7 @@ pub enum Command { Prewrite(Prewrite), PrewritePessimistic(PrewritePessimistic), AcquirePessimisticLock(AcquirePessimisticLock), + AcquirePessimisticLockResumed(AcquirePessimisticLockResumed), Commit(Commit), Cleanup(Cleanup), Rollback(Rollback), @@ -219,9 +222,9 @@ impl From for TypedCommand, + pub lock_info: Vec, pub released_locks: ReleasedLocks, pub lock_guards: Vec, pub response_policy: ResponsePolicy, @@ -399,22 
+402,36 @@ pub struct WriteResult { pub struct WriteResultLockInfo { pub lock_digest: lock_manager::LockDigest, pub key: Key, + pub should_not_exist: bool, pub lock_info_pb: LockInfo, pub parameters: PessimisticLockParameters, + pub hash_for_latch: u64, + /// If a request is woken up after waiting for some lock, and it encounters + /// another lock again after resuming, this field will carry the token + /// that was already allocated before. + pub lock_wait_token: LockWaitToken, } impl WriteResultLockInfo { - pub fn new(lock_info_pb: LockInfo, parameters: PessimisticLockParameters) -> Self { + pub fn new( + lock_info_pb: LockInfo, + parameters: PessimisticLockParameters, + key: Key, + should_not_exist: bool, + ) -> Self { let lock = lock_manager::LockDigest { ts: lock_info_pb.get_lock_version().into(), - hash: Key::from_raw(lock_info_pb.get_key()).gen_hash(), + hash: key.gen_hash(), }; - let key = Key::from_raw(lock_info_pb.get_key()); + let hash_for_latch = latch::Lock::hash(&key); Self { lock_digest: lock, key, + should_not_exist, lock_info_pb, parameters, + hash_for_latch, + lock_wait_token: LockWaitToken(None), } } } @@ -568,6 +585,7 @@ impl Command { Command::Prewrite(t) => t, Command::PrewritePessimistic(t) => t, Command::AcquirePessimisticLock(t) => t, + Command::AcquirePessimisticLockResumed(t) => t, Command::Commit(t) => t, Command::Cleanup(t) => t, Command::Rollback(t) => t, @@ -593,6 +611,7 @@ impl Command { Command::Prewrite(t) => t, Command::PrewritePessimistic(t) => t, Command::AcquirePessimisticLock(t) => t, + Command::AcquirePessimisticLockResumed(t) => t, Command::Commit(t) => t, Command::Cleanup(t) => t, Command::Rollback(t) => t, @@ -636,6 +655,7 @@ impl Command { Command::Prewrite(t) => t.process_write(snapshot, context), Command::PrewritePessimistic(t) => t.process_write(snapshot, context), Command::AcquirePessimisticLock(t) => t.process_write(snapshot, context), + Command::AcquirePessimisticLockResumed(t) => t.process_write(snapshot, context), 
Command::Commit(t) => t.process_write(snapshot, context), Command::Cleanup(t) => t.process_write(snapshot, context), Command::Rollback(t) => t.process_write(snapshot, context), diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 05bbb508bdc..3dc7d06d5ef 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -48,7 +48,7 @@ impl WriteCommand for Pause { to_be_write: WriteData::default(), rows: 0, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index b575787208a..c35c362f19e 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -90,7 +90,7 @@ impl WriteCommand for PessimisticRollback { to_be_write: write_data, rows, pr: ProcessResult::MultiRes { results: vec![] }, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 542c60819b5..2cd908412c3 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -14,7 +14,10 @@ use kvproto::kvrpcpb::{ PrewriteRequestPessimisticAction::{self, *}, }; use tikv_kv::SnapshotExt; -use txn_types::{Key, Mutation, OldValue, OldValues, TimeStamp, TxnExtra, Write, WriteType}; +use txn_types::{ + insert_old_value_if_resolved, Key, Mutation, OldValue, OldValues, TimeStamp, TxnExtra, Write, + WriteType, +}; use super::ReaderWithStats; use crate::storage::{ @@ -569,11 +572,13 @@ impl Prewriter { if need_min_commit_ts && final_min_commit_ts < ts { final_min_commit_ts = ts; } - if old_value.resolved() { - let key = key.append_ts(txn.start_ts); - self.old_values - .insert(key, (old_value, 
Some(mutation_type))); - } + insert_old_value_if_resolved( + &mut self.old_values, + key, + txn.start_ts, + old_value, + Some(mutation_type), + ); } Ok((..)) => { // If it needs min_commit_ts but min_commit_ts is zero, the lock @@ -681,7 +686,7 @@ impl Prewriter { to_be_write, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, @@ -700,7 +705,7 @@ impl Prewriter { to_be_write: WriteData::default(), rows, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index b89e91593f9..463275b2e1f 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -145,7 +145,7 @@ impl WriteCommand for ResolveLock { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index a31211c564e..d336d88a9ca 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -70,7 +70,7 @@ impl WriteCommand for ResolveLockLite { to_be_write: write_data, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 479f29cb276..52c05ae34c7 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -65,7 +65,7 @@ impl WriteCommand for Rollback { to_be_write: write_data, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff 
--git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 9bfbda5c748..f965b863494 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -90,7 +90,7 @@ impl WriteCommand for TxnHeartBeat { to_be_write: write_data, rows: 1, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index 86d16858bd3..12cc51207bb 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -116,14 +116,7 @@ impl Lock { I: IntoIterator, { // prevent from deadlock, so we sort and deduplicate the index - let mut required_hashes: Vec = keys - .into_iter() - .map(|key| { - let mut s = DefaultHasher::new(); - key.hash(&mut s); - s.finish() - }) - .collect(); + let mut required_hashes: Vec = keys.into_iter().map(|key| Self::hash(key)).collect(); required_hashes.sort_unstable(); required_hashes.dedup(); Lock { @@ -132,6 +125,12 @@ impl Lock { } } + pub fn hash(key: &K) -> u64 { + let mut s = DefaultHasher::new(); + key.hash(&mut s); + s.finish() + } + /// Returns true if all the required latches have be acquired, false /// otherwise. pub fn acquired(&self) -> bool { diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 615ab98cb8c..86ceda2bdf1 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -45,6 +45,7 @@ use crate::storage::{ }; /// Process result of a command. +#[allow(clippy::large_enum_variant)] #[derive(Debug)] pub enum ProcessResult { Res, diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 3f5e48e8017..24ef7466e63 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1075,8 +1075,10 @@ impl Scheduler { let mut pr = Some(pr); - // TODO: Lock wait handling here. 
- if let Some(lock_info) = lock_info { + if !lock_info.is_empty() { + assert_eq!(lock_info.len(), 1); + let lock_info = lock_info.into_iter().next().unwrap(); + // Only handle lock waiting if `wait_timeout` is set. Otherwise it indicates // that it's a lock-no-wait request and we need to report error // immediately. @@ -1427,6 +1429,7 @@ impl Scheduler { key: lock_info.key, lock_hash: lock_info.lock_digest.hash, parameters: lock_info.parameters, + should_not_exist: lock_info.should_not_exist, lock_wait_token, legacy_wake_up_index: None, key_cb: Some(ctx.get_callback_for_blocked_key().into()), @@ -1492,7 +1495,7 @@ mod tests { use kvproto::kvrpcpb::{BatchRollbackRequest, CheckTxnStatusRequest, Context}; use raftstore::store::{ReadStats, WriteStats}; use tikv_util::{config::ReadableSize, future::paired_future_callback}; - use txn_types::{Key, OldValues, TimeStamp}; + use txn_types::{Key, TimeStamp}; use super::*; use crate::storage::{ @@ -1575,7 +1578,7 @@ mod tests { Some(WaitTimeout::Default), false, TimeStamp::default(), - OldValues::default(), + false, false, false, Context::default(), diff --git a/src/storage/types.rs b/src/storage/types.rs index 6ad4c8e26ef..63bab09eb5c 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -131,6 +131,7 @@ pub struct PrewriteResult { pub one_pc_commit_ts: TimeStamp, } +#[derive(Clone, Debug, PartialEq)] #[cfg_attr(test, derive(Default))] pub struct PessimisticLockParameters { pub pb_ctx: kvrpcpb::Context, @@ -143,6 +144,7 @@ pub struct PessimisticLockParameters { pub min_commit_ts: TimeStamp, pub check_existence: bool, pub is_first_lock: bool, + pub lock_only_if_exists: bool, /// Whether it's allowed for an pessimistic lock request to acquire the lock /// even there is write conflict (i.e. 
the latest version's `commit_ts` is diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 43f1b504f25..dd8f49bbde3 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -42,7 +42,7 @@ use tikv::{ }, }; use tikv_util::{future::paired_future_callback, worker::dummy_scheduler, HandyRwLock}; -use txn_types::{Key, Mutation, OldValues, TimeStamp}; +use txn_types::{Key, Mutation, TimeStamp}; #[test] fn test_scheduler_leader_change_twice() { @@ -679,7 +679,7 @@ fn test_async_apply_prewrite_impl( None, false, 0.into(), - OldValues::default(), + false, false, false, ctx.clone(), @@ -1018,7 +1018,7 @@ fn test_async_apply_prewrite_1pc_impl( None, false, 0.into(), - OldValues::default(), + false, false, false, ctx.clone(), From 65ad2a52e6d64c5aef324877668cb554629b25b4 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 17 Nov 2022 12:07:55 +0800 Subject: [PATCH 339/676] read: fix panic on witness check (#13767) close tikv/tikv#13764 After the remove peer conf-change is applied and before the peer is destroyed. There is a chance that local reader may not find the peer from region info. So this PR considers this case and fixes panic on witness check. 
Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/peer.rs | 5 ++ components/raftstore/src/store/worker/read.rs | 15 ++-- tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_witness.rs | 71 +++++++++++++++++++ tests/integrations/raftstore/test_witness.rs | 2 +- 5 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 tests/failpoints/cases/test_witness.rs diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 63bb878838c..b4c7d1fb097 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -3703,6 +3703,11 @@ where self.update_region(cp.region); fail_point!("change_peer_after_update_region"); + fail_point!( + "change_peer_after_update_region_store_3", + self.store_id() == 3, + |_| panic!("should not use return") + ); let now = Instant::now(); let (mut remove_self, mut need_ping) = (false, false); diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 0766a52a387..08e56aa7481 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -785,15 +785,6 @@ where return Err(e); } - // Check witness - if find_peer_by_id(&delegate.region, delegate.peer_id) - .unwrap() - .is_witness - { - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::RecoveryInProgress(region_id)); - } - // Check term. if let Err(e) = util::check_term(req, delegate.term) { debug!( @@ -813,6 +804,12 @@ where return Ok(None); } + // Check witness + if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::RecoveryInProgress(region_id)); + } + // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index b291e86b88c..24a05f2ab9f 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -38,3 +38,4 @@ mod test_transaction; mod test_transfer_leader; mod test_ttl; mod test_unsafe_recovery; +mod test_witness; diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs new file mode 100644 index 00000000000..cee75ff44b9 --- /dev/null +++ b/tests/failpoints/cases/test_witness.rs @@ -0,0 +1,71 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{iter::FromIterator, sync::Arc, time::Duration}; + +use futures::executor::block_on; +use kvproto::metapb; +use pd_client::PdClient; +use test_raftstore::*; +use tikv_util::store::find_peer; + +fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(true); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +// Test the case local reader works well with witness peer. 
+#[test] +fn test_witness_update_region_in_local_reader() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + cluster.must_put(b"k0", b"v0"); + + // update region but the peer is not destroyed yet + fail::cfg("change_peer_after_update_region_store_3", "pause").unwrap(); + + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request.clone(), Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); + + fail::remove("change_peer_after_update_region_store_3"); +} diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index 8e36510753e..a2518cc64ae 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -448,7 +448,7 @@ fn test_witness_replica_read() { request.mut_header().set_replica_read(true); let resp = cluster - .call_command_on_node(nodes[2], request, Duration::from_millis(100)) + 
.read(None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( resp.get_header().get_error().get_recovery_in_progress(), From 7dfb42ec36b8b7022125c6ded68183ae3dc64063 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Thu, 17 Nov 2022 16:15:56 +0800 Subject: [PATCH 340/676] *: update crossbeam-channel to avoid spin at sending side (#13807) close tikv/tikv#13815 According to https://github.com/crossbeam-rs/crossbeam/pull/835, spinning at the sending side is probably a bad idea because of large critical section and it's fixed in the recent version. This commit updates crossbeam-channel. It will reduce CPU usage a bit and improve performance. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1722d0385e8..abb420d2264 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1139,9 +1139,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils 0.8.8", From cc7345a3b82a42c9fee1f917afd8a8b729032717 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Thu, 17 Nov 2022 16:59:56 +0800 Subject: [PATCH 341/676] tests: refine the raftstore flashback tests (#13808) ref tikv/tikv#13303 Refine the raftstore flashback tests. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- .../integrations/raftstore/test_flashback.rs | 258 ++++++++---------- 1 file changed, 107 insertions(+), 151 deletions(-) diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 7fff4dad606..89a61223fa2 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -9,17 +9,19 @@ use futures::{channel::oneshot, executor::block_on}; use kvproto::{ errorpb::FlashbackInProgress, metapb, - raft_cmdpb::{AdminCmdType, CmdType, Request}, + raft_cmdpb::{AdminCmdType, RaftCmdResponse, Request}, }; use raftstore::store::Callback; use test_raftstore::*; use txn_types::WriteBatchFlags; +const TEST_KEY: &[u8] = b"k1"; +const TEST_VALUE: &[u8] = b"v1"; + #[test] fn test_prepare_flashback_after_split() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - cluster.must_transfer_leader(1, new_peer(1, 1)); let old_region = cluster.get_region(b"a"); @@ -126,56 +128,42 @@ fn test_prepare_flashback_after_conf_change() { fn test_flashback_unprepared() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - - cluster.must_transfer_leader(1, new_peer(2, 2)); cluster.must_transfer_leader(1, new_peer(1, 1)); - let mut region = cluster.get_region(b"k1"); - let mut cmd = Request::default(); - cmd.set_cmd_type(CmdType::Put); - let mut req = new_request( - region.get_id(), - region.take_region_epoch(), - vec![cmd], - false, + let mut region = cluster.get_region(TEST_KEY); + must_get_flashback_not_prepared_error( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), ); - let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); - req.mut_header().set_peer(new_leader.unwrap()); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); - 
assert!(resp.get_header().get_error().has_flashback_not_prepared()); } #[test] fn test_flashback_for_schedule() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - cluster.must_transfer_leader(1, new_peer(2, 2)); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Prepare flashback. + let region = cluster.get_region(TEST_KEY); cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - - // Verify the schedule is disabled. - let mut region = cluster.get_region(b"k3"); + // Make sure the schedule is disabled. + let mut region = cluster.get_region(TEST_KEY); let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); let transfer_leader = new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); let resp = cluster .call_command_on_leader(transfer_leader, Duration::from_secs(3)) .unwrap(); - let e = resp.get_header().get_error(); assert_eq!( - e.get_flashback_in_progress(), + resp.get_header().get_error().get_flashback_in_progress(), &FlashbackInProgress { region_id: region.get_id(), ..Default::default() } ); - + // Finish flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); // Transfer leader to (2, 2) should succeed. cluster.must_transfer_leader(1, new_peer(2, 2)); @@ -187,27 +175,33 @@ fn test_flashback_for_write() { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Write for cluster - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Write without flashback flag. + let mut region = cluster.get_region(TEST_KEY); + must_request_without_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cmd(TEST_KEY, TEST_VALUE), + ); + // Prepare flashback. 
cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - // Write will be blocked - let value = vec![1_u8; 8096]; - must_get_error_flashback_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); - // Write with flashback flag will succeed - must_do_cmd_with_flashback_flag( + must_get_flashback_in_progress_error( &mut cluster, &mut region.clone(), - new_put_cmd(b"k1", &value), + new_put_cmd(TEST_KEY, TEST_VALUE), + ); + // Write with flashback flag will succeed. + must_request_with_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cmd(TEST_KEY, TEST_VALUE), ); - cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + must_request_without_flashback_flag( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), + ); } #[test] @@ -216,30 +210,18 @@ fn test_flashback_for_read() { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Write for cluster - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - // read for cluster - multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); - - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Read without flashback flag. + let mut region = cluster.get_region(TEST_KEY); + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Prepare flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - - // read will be blocked - must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); - - // Verify the read can be executed if add flashback flag in request's - // header. - must_do_cmd_with_flashback_flag( - &mut cluster, - &mut region.clone(), - new_get_cf_cmd("write", b"k1"), - ); - + // Read will be blocked. 
+ must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Read with flashback flag will succeed. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Finish flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - - multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); + must_request_without_flashback_flag(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } // LocalReader will attempt to renew the lease. @@ -249,62 +231,44 @@ fn test_flashback_for_read() { fn test_flashback_for_local_read() { let mut cluster = new_node_cluster(0, 3); let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); - // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; - + cluster.run(); + cluster.must_put(TEST_KEY, TEST_VALUE); + let mut region = cluster.get_region(TEST_KEY); let store_id = 3; let peer = new_peer(store_id, 3); - cluster.run(); - - cluster.must_put(b"k1", b"v1"); - let mut region = cluster.get_region(b"k1"); - cluster.must_transfer_leader(region.get_id(), peer.clone()); + cluster.must_transfer_leader(region.get_id(), peer); // Check local read before prepare flashback let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. sleep(election_timeout * 2); - must_read_on_peer(&mut cluster, peer.clone(), region.clone(), b"k1", b"v1"); + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // Prepare for flashback + // Prepare flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - // Check the leader does a local read. 
let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); // Wait for apply_res to set leader lease. sleep_ms(500); - - must_error_read_on_peer( - &mut cluster, - peer.clone(), - region.clone(), - b"k1", - Duration::from_secs(1), - ); - + // Read should fail. + must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Wait for the leader's lease to expire to ensure that a renew lease interval // has elapsed. sleep(election_timeout * 2); - must_error_read_on_peer( - &mut cluster, - peer.clone(), - region.clone(), - b"k1", - Duration::from_secs(1), - ); - + // Read should fail. + must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Also check read by propose was blocked let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - + // Finish flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 2); @@ -313,25 +277,12 @@ fn test_flashback_for_local_read() { let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. sleep(election_timeout * 2); - must_read_on_peer(&mut cluster, peer, region.clone(), b"k1", b"v1"); - + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // A local read with flashback flag will also be blocked. 
- let mut req = new_request( - region.get_id(), - region.take_region_epoch(), - vec![new_get_cmd(b"k1")], - false, - ); - let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); - req.mut_header().set_peer(new_leader.unwrap()); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); - assert!(resp.get_header().get_error().has_flashback_not_prepared()); + must_get_flashback_not_prepared_error(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } #[test] @@ -340,7 +291,7 @@ fn test_flashback_for_status_cmd_as_region_detail() { cluster.run(); let leader = cluster.leader_of_region(1).unwrap(); - let region = cluster.get_region(b"k1"); + let region = cluster.get_region(TEST_KEY); cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); @@ -434,58 +385,63 @@ fn must_check_flashback_state( ); } -fn multi_do_cmd(cluster: &mut Cluster, cmd: Request) { - for _ in 0..100 { - let mut reqs = vec![]; - for _ in 0..100 { - reqs.push(cmd.clone()); - } - cluster.batch_put(b"k1", reqs).unwrap(); - } -} - -fn must_do_cmd_with_flashback_flag( +fn request( cluster: &mut Cluster, region: &mut metapb::Region, - cmd: Request, -) { - // Verify the read can be executed if add flashback flag in request's - // header. 
- let mut req = new_request( + req: Request, + with_flashback_flag: bool, +) -> RaftCmdResponse { + let mut cmd_req = new_request( region.get_id(), region.take_region_epoch(), - vec![cmd], + vec![req], false, ); let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); - req.mut_header().set_peer(new_leader.unwrap()); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); + let header = cmd_req.mut_header(); + header.set_peer(new_leader.unwrap()); + if with_flashback_flag { + header.set_flags(WriteBatchFlags::FLASHBACK.bits()); + } + cluster + .call_command(cmd_req, Duration::from_secs(3)) + .unwrap() +} + +// Make sure the request could be executed with flashback flag. +fn must_request_with_flashback_flag( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, true); assert!(!resp.get_header().has_error()); } -fn must_get_error_flashback_in_progress( +fn must_get_flashback_not_prepared_error( cluster: &mut Cluster, - region: &metapb::Region, - cmd: Request, + region: &mut metapb::Region, + req: Request, ) { - for _ in 0..100 { - let mut reqs = vec![]; - for _ in 0..100 { - reqs.push(cmd.clone()); - } - match cluster.batch_put(b"k1", reqs) { - Ok(_) => {} - Err(e) => { - assert_eq!( - e.get_flashback_in_progress(), - &FlashbackInProgress { - region_id: region.get_id(), - ..Default::default() - } - ); - } - } - } + let resp = request(cluster, region, req, true); + assert!(resp.get_header().get_error().has_flashback_not_prepared()); +} + +// Make sure the request could be executed without flashback flag. 
+fn must_request_without_flashback_flag( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, false); + assert!(!resp.get_header().has_error()); +} + +fn must_get_flashback_in_progress_error( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, false); + assert!(resp.get_header().get_error().has_flashback_in_progress()); } From ec42962f9f26261580e5b7bd8ed62347921dba73 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 17 Nov 2022 17:57:56 +0800 Subject: [PATCH 342/676] cmd: support tikv-ctl to get the regions info within a given key range (#13768) ref tikv/tikv#13760 Provide a way to get the regions info within a given key range just like which in [pd-ctl](https://docs.pingcap.com/zh/tidb/dev/pd-control#region-key---formatrawencodehex-key) Also support limit for tikv-ctl raft region Signed-off-by: husharp Co-authored-by: Xinye Tao --- cmd/tikv-ctl/src/cmd.rs | 15 ++++++++-- cmd/tikv-ctl/src/executor.rs | 27 +++++++++++++++-- cmd/tikv-ctl/src/main.rs | 13 +++++++- cmd/tikv-ctl/src/util.rs | 58 ++++++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 6 deletions(-) diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index eed2d7e8283..657d296109c 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -592,7 +592,6 @@ pub enum RaftCmd { #[structopt( short = "r", aliases = &["region"], - required_unless = "all-regions", conflicts_with = "all-regions", use_delimiter = true, require_delimiter = true, @@ -604,10 +603,22 @@ pub enum RaftCmd { // `regions` must be None when `all_regions` is present, // so we left `all_regions` unused. 
#[allow(dead_code)] - #[structopt(long, required_unless = "regions", conflicts_with = "regions")] + #[structopt(long, conflicts_with = "regions")] /// Print info for all regions all_regions: bool, + #[structopt(long, default_value = "")] + /// hex start key + start: String, + + #[structopt(long, default_value = "")] + /// hex end key + end: String, + + #[structopt(long, default_value = "16")] + /// Limit the number of keys to scan + limit: usize, + #[structopt(long)] /// Skip tombstone regions skip_tombstone: bool, diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 80915dbc564..b2d25a32d5b 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -23,7 +23,7 @@ use pd_client::{Config as PdConfig, PdClient, RpcClient}; use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; use raft_log_engine::RaftLogEngine; -use raftstore::store::INIT_EPOCH_CONF_VER; +use raftstore::store::{util::build_key_range, INIT_EPOCH_CONF_VER}; use security::SecurityManager; use serde_json::json; use tikv::{ @@ -151,17 +151,38 @@ pub trait DebugExecutor { println!("total region size: {}", convert_gbmb(total_size as u64)); } - fn dump_region_info(&self, region_ids: Option>, skip_tombstone: bool) { + fn dump_region_info( + &self, + region_ids: Option>, + start_key: &[u8], + end_key: &[u8], + limit: usize, + skip_tombstone: bool, + ) { let region_ids = region_ids.unwrap_or_else(|| self.get_all_regions_in_store()); let mut region_objects = serde_json::map::Map::new(); for region_id in region_ids { + if limit > 0 && region_objects.len() >= limit { + break; + } let r = self.get_region_info(region_id); if skip_tombstone { let region_state = r.region_local_state.as_ref(); if region_state.map_or(false, |s| s.get_state() == PeerState::Tombstone) { - return; + continue; } } + let region = r + .region_local_state + .as_ref() + .map(|s| s.get_region().clone()) + .unwrap(); + if !check_intersect_of_range( + 
&build_key_range(region.get_start_key(), region.get_end_key(), false), + &build_key_range(start_key, end_key, false), + ) { + continue; + } let region_object = json!({ "region_id": region_id, "region_local_state": r.region_local_state.map(|s| { diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index be5069397e4..72078d07f62 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -272,9 +272,20 @@ fn main() { RaftCmd::Region { regions, skip_tombstone, + start, + end, + limit, .. } => { - debug_executor.dump_region_info(regions, skip_tombstone); + let start_key = from_hex(&start).unwrap(); + let end_key = from_hex(&end).unwrap(); + debug_executor.dump_region_info( + regions, + &start_key, + &end_key, + limit, + skip_tombstone, + ); } }, Cmd::Size { region, cf } => { diff --git a/cmd/tikv-ctl/src/util.rs b/cmd/tikv-ctl/src/util.rs index d7e83511d3e..0e67c905e8d 100644 --- a/cmd/tikv-ctl/src/util.rs +++ b/cmd/tikv-ctl/src/util.rs @@ -2,6 +2,7 @@ use std::{borrow::ToOwned, error::Error, str, str::FromStr, u64}; +use kvproto::kvrpcpb::KeyRange; use server::setup::initial_logger; use tikv::config::TikvConfig; @@ -62,8 +63,27 @@ pub fn perror_and_exit(prefix: &str, e: E) -> ! { tikv_util::logger::exit_process_gracefully(-1); } +// Check if region's `key_range` intersects with `key_range_limit`. 
+pub fn check_intersect_of_range(key_range: &KeyRange, key_range_limit: &KeyRange) -> bool { + if !key_range.get_end_key().is_empty() + && !key_range_limit.get_start_key().is_empty() + && key_range.get_end_key() <= key_range_limit.get_start_key() + { + return false; + } + if !key_range_limit.get_end_key().is_empty() + && !key_range.get_start_key().is_empty() + && key_range_limit.get_end_key() < key_range.get_start_key() + { + return false; + } + true +} + #[cfg(test)] mod tests { + use raftstore::store::util::build_key_range; + use super::*; #[test] @@ -73,4 +93,42 @@ mod tests { assert_eq!(from_hex("0x74").unwrap(), result); assert_eq!(from_hex("0X74").unwrap(), result); } + + #[test] + fn test_included_region_in_range() { + // To avoid unfolding the code when `make format` is called + fn range(start: &[u8], end: &[u8]) -> KeyRange { + build_key_range(start, end, false) + } + let mut region = range(&[0x02], &[0x05]); + // region absolutely in range + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[0x06]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x05]))); + // region intersects with range + assert!(check_intersect_of_range(®ion, &range(&[0x04], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[0x04], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[0x03]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x03]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x02]))); // region is left-closed and right-open interval + // range absolutely in region also need to return true + assert!(check_intersect_of_range(®ion, &range(&[0x03], &[0x04]))); + // region not intersects 
 with range + assert!(!check_intersect_of_range(&region, &range(&[0x05], &[]))); // region is left-closed and right-open interval + assert!(!check_intersect_of_range(&region, &range(&[0x06], &[]))); + assert!(!check_intersect_of_range(&region, &range(&[], &[0x01]))); + // check last region + region = range(&[0x02], &[]); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(&region, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x05]))); + assert!(check_intersect_of_range(&region, &range(&[], &[0x02]))); + assert!(check_intersect_of_range(&region, &range(&[], &[]))); + assert!(!check_intersect_of_range(&region, &range(&[], &[0x01]))); + } } From 3179b12df572e17f87e0d3c8689ddbfffe468018 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 18 Nov 2022 14:01:56 +0800 Subject: [PATCH 343/676] mvcc: skip accumulated locks by a second get (#13784) ref tikv/tikv#13694 Prewrite and acquire_pessimistic_lock use get_write to get the latest PUT or DELETE record. Point get is implemented by PointGetter. In these cases, with last_change_ts and versions_to_last_change, now we use an additional get operation to find the record directly if versions_to_last_change reaches SEEK_BOUND. I think no additional metrics are needed because the count of get operation of the write CF represents it. 
Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/mvcc/reader/point_getter.rs | 35 ++++++++++-- src/storage/mvcc/reader/reader.rs | 74 ++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 8 deletions(-) diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 012189201c5..651762aa88e 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::kvrpcpb::{IsolationLevel, WriteConflictReason}; +use tikv_kv::SEEK_BOUND; use txn_types::{Key, Lock, LockType, TimeStamp, TsSet, Value, WriteRef, WriteType}; use crate::storage::{ @@ -281,10 +282,9 @@ impl PointGetter { return Ok(None); } + let mut write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; + let mut owned_value: Vec; // To work around lifetime problem loop { - // No need to compare user key because it uses prefix seek. - let write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; - if !write.check_gc_fence_as_latest_version(self.ts) { return Ok(None); } @@ -315,13 +315,35 @@ impl PointGetter { return Ok(None); } WriteType::Lock | WriteType::Rollback => { - // Continue iterate next `write`. + if write.versions_to_last_change < SEEK_BOUND || write.last_change_ts.is_zero() + { + // Continue iterate next `write`. + } else { + let commit_ts = write.last_change_ts; + let key_with_ts = user_key.clone().append_ts(commit_ts); + match self.snapshot.get_cf(CF_WRITE, &key_with_ts)? 
{ + Some(v) => owned_value = v, + None => return Ok(None), + } + self.statistics.write.get += 1; + write = WriteRef::parse(&owned_value)?; + assert!( + write.write_type == WriteType::Put + || write.write_type == WriteType::Delete, + "Write record pointed by last_change_ts {} should be Put or Delete, but got {:?}", + commit_ts, + write.write_type, + ); + continue; + } } } if !self.write_cursor.next(&mut self.statistics.write) { return Ok(None); } + // No need to compare user key because it uses prefix seek. + write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; } } @@ -611,7 +633,7 @@ mod tests { must_get_value(&mut getter, b"foo2", b"foo2v"); let s = getter.take_statistics(); // We have to check every version - assert_seek_next_prev(&s.write, 1, 40, 0); + assert_seek_next_prev(&s.write, 1, 0, 0); assert_eq!( s.processed_size, Key::from_raw(b"foo2").len() @@ -621,7 +643,8 @@ mod tests { // Get again must_get_value(&mut getter, b"foo2", b"foo2v"); let s = getter.take_statistics(); - assert_seek_next_prev(&s.write, 1, 40, 0); + assert_seek_next_prev(&s.write, 1, 0, 0); + assert_eq!(s.write.get, 1); assert_eq!( s.processed_size, Key::from_raw(b"foo2").len() diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 61975aa666c..c8ca1a5f671 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -6,7 +6,7 @@ use kvproto::{ errorpb::{self, EpochNotMatch, StaleCommand}, kvrpcpb::Context, }; -use tikv_kv::SnapshotExt; +use tikv_kv::{SnapshotExt, SEEK_BOUND}; use txn_types::{Key, Lock, OldValue, TimeStamp, Value, Write, WriteRef, WriteType}; use crate::storage::{ @@ -382,7 +382,31 @@ impl MvccReader { WriteType::Delete => { return Ok(None); } - WriteType::Lock | WriteType::Rollback => ts = commit_ts.prev(), + WriteType::Lock | WriteType::Rollback => { + if write.versions_to_last_change < SEEK_BOUND + || write.last_change_ts.is_zero() + { + ts = commit_ts.prev(); + } else { + let 
commit_ts = write.last_change_ts; + let key_with_ts = key.clone().append_ts(commit_ts); + let Some(value) = self + .snapshot + .get_cf(CF_WRITE, &key_with_ts)? else { + return Ok(None); + }; + self.statistics.write.get += 1; + let write = WriteRef::parse(&value)?.to_owned(); + assert!( + write.write_type == WriteType::Put + || write.write_type == WriteType::Delete, + "Write record pointed by last_change_ts {} should be Put or Delete, but got {:?}", + commit_ts, + write.write_type, + ); + return Ok(Some((write, commit_ts))); + } + } } } None => return Ok(None), @@ -2499,4 +2523,50 @@ pub mod tests { assert_eq!(reader.statistics.write.seek_tombstone, *tombstones); } } + + #[test] + fn test_get_write_second_get() { + let path = tempfile::Builder::new() + .prefix("_test_storage_mvcc_reader_get_write_second_get") + .tempdir() + .unwrap(); + let path = path.path().to_str().unwrap(); + let region = make_region(1, vec![], vec![]); + let db = open_db(path, true); + let mut engine = RegionEngine::new(&db, ®ion); + + let (k, v) = (b"k", b"v"); + let m = Mutation::make_put(Key::from_raw(k), v.to_vec()); + engine.prewrite(m, k, 1); + engine.commit(k, 1, 2); + + // Write enough ROLLBACK/LOCK recrods + engine.rollback(k, 5); + for start_ts in (6..30).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + + let snap = RegionSnapshot::::from_raw(db, region); + let mut reader = MvccReader::new(snap, None, false); + + let key = Key::from_raw(k); + // Get write record whose commit_ts = 2 + let w2 = reader + .get_write(&key, TimeStamp::new(2), None) + .unwrap() + .unwrap(); + + // Clear statistics first + reader.statistics = Statistics::default(); + let (write, commit_ts) = reader + .get_write_with_commit_ts(&key, 40.into(), None) + .unwrap() + .unwrap(); + assert_eq!(commit_ts, 2.into()); + assert_eq!(write, w2); + // versions_to_last_change should be large enough to trigger a second get + // instead of calling a series of next, so the count of next should be 0 
instead + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 1); + } } From be2aec1ee49b6b68a074a036db059eb0084080c8 Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 18 Nov 2022 14:41:55 +0800 Subject: [PATCH 344/676] test: fix flaky witness test (#13822) close tikv/tikv#13816 fix flaky witness test Signed-off-by: Connor1996 Co-authored-by: Xinye Tao --- components/test_raftstore/src/cluster.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index c4ac98180a6..f9088ff4e3b 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1202,7 +1202,7 @@ impl Cluster { self.get_engine(store_id) .get_msg_cf::(engine_traits::CF_RAFT, &key) .unwrap() - .unwrap() + .unwrap_or_default() } pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { From 5a1f11048714054232fd12bde01f11a1729e498d Mon Sep 17 00:00:00 2001 From: xiongjiwei Date: Fri, 18 Nov 2022 17:39:56 +0800 Subject: [PATCH 345/676] txn: make `txn_source` be u64 type (#13817) ref tikv/tikv#13779 Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/txn_types/src/lock.rs | 14 +++++++++----- components/txn_types/src/write.rs | 14 ++++++-------- src/storage/mvcc/mod.rs | 2 +- src/storage/txn/actions/commit.rs | 22 ++++++++++++---------- src/storage/txn/actions/prewrite.rs | 2 +- src/storage/txn/actions/tests.rs | 6 +++--- src/storage/txn/commands/prewrite.rs | 2 +- 8 files changed, 34 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abb420d2264..7425528342d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2694,7 +2694,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#29a30c4ef9c52aafb1b1da73dd9df60857068114" +source = 
"git+https://github.com/pingcap/kvproto.git#51120697d051df163ec8aa313ee1916a68b07984" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 28df70677a5..040487388f9 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -96,7 +96,11 @@ pub struct Lock { /// The source of this txn. It is used by ticdc, if the value is 0 ticdc /// will sync the kv change event to downstream, if it is not 0, ticdc /// may ignore this change event. - pub txn_source: u8, + /// + /// We use `u64` to reserve more space for future use. For now, the upper + /// application is limited to setting this value under `0x80`, + /// so there will no more cost to change it to `u64`. + pub txn_source: u64, } impl std::fmt::Debug for Lock { @@ -182,7 +186,7 @@ impl Lock { #[inline] #[must_use] - pub fn set_txn_source(mut self, source: u8) -> Self { + pub fn set_txn_source(mut self, source: u64) -> Self { self.txn_source = source; self } @@ -231,7 +235,7 @@ impl Lock { } if self.txn_source != 0 { b.push(TXN_SOURCE_PREFIX); - b.push(self.txn_source); + b.encode_var_u64(self.txn_source).unwrap(); } b } @@ -266,7 +270,7 @@ impl Lock { size += 1 + size_of::() + MAX_VAR_U64_LEN; } if self.txn_source != 0 { - size += 2; + size += 1 + MAX_VAR_U64_LEN; } size } @@ -345,7 +349,7 @@ impl Lock { versions_to_last_change = number::decode_var_u64(&mut b)?; } TXN_SOURCE_PREFIX => { - txn_source = b.read_u8()?; + txn_source = number::decode_var_u64(&mut b)?; } _ => { // To support forward compatibility, all fields should be serialized in order diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 6c46688defa..52777e5e4b2 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -160,7 +160,7 @@ pub struct Write { /// to find the latest PUT/DELETE record pub versions_to_last_change: u64, /// The source of this txn. 
- pub txn_source: u8, + pub txn_source: u64, } impl std::fmt::Debug for Write { @@ -248,7 +248,7 @@ impl Write { #[inline] #[must_use] - pub fn set_txn_source(mut self, source: u8) -> Self { + pub fn set_txn_source(mut self, source: u64) -> Self { self.txn_source = source; self } @@ -323,7 +323,7 @@ pub struct WriteRef<'a> { /// to find the latest PUT/DELETE record pub versions_to_last_change: u64, /// The source of this txn. - pub txn_source: u8, + pub txn_source: u64, } impl WriteRef<'_> { @@ -373,9 +373,7 @@ impl WriteRef<'_> { versions_to_last_change = number::decode_var_u64(&mut b)?; } TXN_SOURCE_PREFIX => { - txn_source = b - .read_u8() - .map_err(|_| Error::from(ErrorInner::BadFormatWrite))? + txn_source = number::decode_var_u64(&mut b)?; } _ => { // To support forward compatibility, all fields should be serialized in order @@ -420,7 +418,7 @@ impl WriteRef<'_> { } if self.txn_source != 0 { b.push(TXN_SOURCE_PREFIX); - b.push(self.txn_source); + b.encode_var_u64(self.txn_source).unwrap(); } b } @@ -438,7 +436,7 @@ impl WriteRef<'_> { size += 1 + size_of::() + MAX_VAR_U64_LEN; } if self.txn_source != 0 { - size += 2; + size += 1 + MAX_VAR_U64_LEN; } size } diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 997cde71020..3dca7a219f9 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -706,7 +706,7 @@ pub mod tests { assert_eq!(ts, commit_ts.into()); } - pub fn must_get_txn_source(engine: &mut E, key: &[u8], ts: u64, txn_source: u8) { + pub fn must_get_txn_source(engine: &mut E, key: &[u8], ts: u64, txn_source: u64) { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = SnapshotReader::new(TimeStamp::from(ts), snapshot, true); let write = reader diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index 1b8018e2aad..bfb1d39f768 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -355,15 +355,17 @@ pub mod tests { #[test] fn 
test_2pc_with_txn_source() { - let mut engine = TestEngineBuilder::new().build().unwrap(); - - let k = b"k"; - // WriteType is Put - must_prewrite_put_with_txn_soucre(&mut engine, k, b"v2", k, 25, 1); - let lock = must_locked(&mut engine, k, 25); - assert_eq!(lock.txn_source, 1); - must_succeed(&mut engine, k, 25, 30); - let write = must_written(&mut engine, k, 25, 30, WriteType::Put); - assert_eq!(write.txn_source, 1); + for source in [0x1, 0x85] { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k = b"k"; + // WriteType is Put + must_prewrite_put_with_txn_soucre(&mut engine, k, b"v2", k, 25, source); + let lock = must_locked(&mut engine, k, 25); + assert_eq!(lock.txn_source, source); + must_succeed(&mut engine, k, 25, 30); + let write = must_written(&mut engine, k, 25, 30, WriteType::Put); + assert_eq!(write.txn_source, source); + } } } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 48caa3795af..46c9774dd52 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -172,7 +172,7 @@ pub struct TransactionProperties<'a> { pub need_old_value: bool, pub is_retry_request: bool, pub assertion_level: AssertionLevel, - pub txn_source: u8, + pub txn_source: u64, } impl<'a> TransactionProperties<'a> { diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index 79d31a08c9c..e6872ef493f 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -113,7 +113,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( assertion_level: AssertionLevel, should_not_exist: bool, region_id: Option, - txn_source: u32, + txn_source: u64, ) { let mut ctx = Context::default(); ctx.set_txn_source(txn_source); @@ -158,7 +158,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( need_old_value: false, is_retry_request, assertion_level, - txn_source: txn_source as u8, + txn_source, }, mutation, secondary_keys, @@ -230,7 +230,7 @@ pub 
fn must_prewrite_put_with_txn_soucre( value: &[u8], pk: &[u8], ts: impl Into, - txn_source: u32, + txn_source: u64, ) { must_prewrite_put_impl_with_should_not_exist( engine, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 2cd908412c3..cd24f54d13b 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -511,7 +511,7 @@ impl Prewriter { need_old_value: extra_op == ExtraOp::ReadOldValue, is_retry_request: self.ctx.is_retry_request, assertion_level: self.assertion_level, - txn_source: self.ctx.get_txn_source() as u8, + txn_source: self.ctx.get_txn_source(), }; let async_commit_pk = self From 07266ff2f99f47ca8be0fab0c6c6e84a29f270ea Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 21 Nov 2022 12:19:57 +0800 Subject: [PATCH 346/676] resolved_ts: remove the unused CDC sinker (#13795) close tikv/tikv#13794 Since CDC works independently of the `resolved_ts` component and we don't have any plan to integrate them, so the sinker code in the `resolved_ts` component could be removed to simplify the code. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/resolved_ts/src/advance.rs | 10 +-- components/resolved_ts/src/endpoint.rs | 112 +++++++++--------------- components/resolved_ts/src/lib.rs | 2 - components/resolved_ts/src/observer.rs | 24 +++-- components/resolved_ts/src/sinker.rs | 45 ---------- components/resolved_ts/tests/mod.rs | 6 +- components/server/src/server.rs | 2 - components/test_raftstore/src/server.rs | 3 +- 8 files changed, 58 insertions(+), 146 deletions(-) delete mode 100644 components/resolved_ts/src/sinker.rs diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 496c5c8fab8..a78e903bc72 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -47,20 +47,20 @@ use crate::{endpoint::Task, metrics::*}; const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s -pub struct AdvanceTsWorker { +pub struct AdvanceTsWorker { pd_client: Arc, timer: SteadyTimer, worker: Runtime, - scheduler: Scheduler>, + scheduler: Scheduler, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, } -impl AdvanceTsWorker { +impl AdvanceTsWorker { pub fn new( pd_client: Arc, - scheduler: Scheduler>, + scheduler: Scheduler, concurrency_manager: ConcurrencyManager, ) -> Self { let worker = Builder::new_multi_thread() @@ -81,7 +81,7 @@ impl AdvanceTsWorker { } } -impl AdvanceTsWorker { +impl AdvanceTsWorker { // Advance ts asynchronously and register RegisterAdvanceEvent when its done. 
pub fn advance_ts_for_regions( &self, diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 4f957e8266d..def3d512d3a 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -12,7 +12,7 @@ use std::{ }; use concurrency_manager::ConcurrencyManager; -use engine_traits::{KvEngine, Snapshot}; +use engine_traits::KvEngine; use grpcio::Environment; use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; @@ -23,7 +23,6 @@ use raftstore::{ store::{ fsm::StoreMeta, util::{self, RegionReadProgress, RegionReadProgressRegistry}, - RegionSnapshot, }, }; use security::SecurityManager; @@ -41,7 +40,6 @@ use crate::{ metrics::*, resolver::Resolver, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, - sinker::{CmdSinker, SinkCmd}, }; enum ResolverStatus { @@ -264,7 +262,7 @@ impl ObserveRegion { } } -pub struct Endpoint { +pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, cfg_update_notify: Arc, @@ -272,28 +270,25 @@ pub struct Endpoint { region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, - scheduler: Scheduler>, - sinker: C, - advance_worker: AdvanceTsWorker, + scheduler: Scheduler, + advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, } -impl Endpoint +impl Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { pub fn new( cfg: &ResolvedTsConfig, - scheduler: Scheduler>, + scheduler: Scheduler, raft_router: T, store_meta: Arc>, pd_client: Arc, concurrency_manager: ConcurrencyManager, env: Arc, security_mgr: Arc, - sinker: C, ) -> Self { let (region_read_progress, store_id) = { let meta = store_meta.lock().unwrap(); @@ -320,7 +315,6 @@ where region_read_progress, advance_worker, scanner_pool, - sinker, regions: HashMap::default(), _phantom: PhantomData::default(), }; @@ -502,64 +496,42 @@ where if regions.is_empty() { return; } - - 
let mut min_ts = TimeStamp::max(); for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let resolved_ts = observe_region.resolver.resolve(ts); - if resolved_ts < min_ts { - min_ts = resolved_ts; - } + let _ = observe_region.resolver.resolve(ts); } } } - self.sinker.sink_resolved_ts(regions, ts); } // Tracking or untracking locks with incoming commands that corresponding // observe id is valid. #[allow(clippy::drop_ref)] - fn handle_change_log( - &mut self, - cmd_batch: Vec, - snapshot: Option>, - ) { + fn handle_change_log(&mut self, cmd_batch: Vec) { let size = cmd_batch.iter().map(|b| b.size()).sum::(); RTS_CHANNEL_PENDING_CMD_BYTES.sub(size as i64); - let logs = cmd_batch - .into_iter() - .filter_map(|batch| { - if !batch.is_empty() { - if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { - let observe_id = batch.rts_id; - let region_id = observe_region.meta.id; - if observe_region.handle.id == observe_id { - let logs = ChangeLog::encode_change_log(region_id, batch); - if let Err(e) = observe_region.track_change_log(&logs) { - drop(observe_region); - self.re_register_region(region_id, observe_id, e) - } - return Some(SinkCmd { - region_id, - observe_id, - logs, - }); - } else { - debug!("resolved ts CmdBatch discarded"; - "region_id" => batch.region_id, - "observe_id" => ?batch.rts_id, - "current" => ?observe_region.handle.id, - ); - } + for batch in cmd_batch { + if batch.is_empty() { + continue; + } + if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { + let observe_id = batch.rts_id; + let region_id = observe_region.meta.id; + if observe_region.handle.id == observe_id { + let logs = ChangeLog::encode_change_log(region_id, batch); + if let Err(e) = observe_region.track_change_log(&logs) { + drop(observe_region); + self.re_register_region(region_id, observe_id, e); } + } else { + debug!("resolved ts CmdBatch 
discarded"; + "region_id" => batch.region_id, + "observe_id" => ?batch.rts_id, + "current" => ?observe_region.handle.id, + ); } - None - }) - .collect(); - match snapshot { - Some(snap) => self.sinker.sink_cmd_with_old_value(logs, snap), - None => self.sinker.sink_cmd(logs), + } } } @@ -615,7 +587,7 @@ where } } -pub enum Task { +pub enum Task { RegionUpdated(Region), RegionDestroyed(Region), RegisterRegion { @@ -638,7 +610,6 @@ pub enum Task { }, ChangeLog { cmd_batch: Vec, - snapshot: Option>, }, ScanLocks { region_id: u64, @@ -651,7 +622,7 @@ pub enum Task { }, } -impl fmt::Debug for Task { +impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut de = f.debug_struct("ResolvedTsTask"); match self { @@ -710,21 +681,20 @@ impl fmt::Debug for Task { } } -impl fmt::Display for Task { +impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self) } } -impl Runnable for Endpoint +impl Runnable for Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { - type Task = Task; + type Task = Task; - fn run(&mut self, task: Task) { + fn run(&mut self, task: Task) { debug!("run resolved-ts task"; "task" => ?task); match task { Task::RegionDestroyed(region) => self.region_destroyed(region), @@ -742,10 +712,7 @@ where Task::ResolvedTsAdvanced { regions, ts } => { self.handle_resolved_ts_advanced(regions, ts) } - Task::ChangeLog { - cmd_batch, - snapshot, - } => self.handle_change_log(cmd_batch, snapshot), + Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, observe_id, @@ -757,15 +724,15 @@ where } } -pub struct ResolvedTsConfigManager(Scheduler>); +pub struct ResolvedTsConfigManager(Scheduler); -impl ResolvedTsConfigManager { - pub fn new(scheduler: Scheduler>) -> ResolvedTsConfigManager { +impl ResolvedTsConfigManager { + pub fn new(scheduler: Scheduler) -> ResolvedTsConfigManager { ResolvedTsConfigManager(scheduler) } } 
-impl ConfigManager for ResolvedTsConfigManager { +impl ConfigManager for ResolvedTsConfigManager { fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { if let Err(e) = self.0.schedule(Task::ChangeConfig { change }) { error!("failed to schedule ChangeConfig task"; "err" => ?e); @@ -776,11 +743,10 @@ impl ConfigManager for ResolvedTsConfigManager { const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -impl RunnableWithTimer for Endpoint +impl RunnableWithTimer for Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { fn on_timeout(&mut self) { let store_id = self.get_or_init_store_id(); diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index 5d4e233808d..eef1211a580 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -27,8 +27,6 @@ mod observer; pub use observer::*; mod advance; pub use advance::*; -mod sinker; -pub use sinker::*; mod endpoint; pub use endpoint::*; mod errors; diff --git a/components/resolved_ts/src/observer.rs b/components/resolved_ts/src/observer.rs index 9ff7b976ad4..7421beaad85 100644 --- a/components/resolved_ts/src/observer.rs +++ b/components/resolved_ts/src/observer.rs @@ -8,16 +8,16 @@ use tikv_util::worker::Scheduler; use crate::{cmd::lock_only_filter, endpoint::Task, metrics::RTS_CHANNEL_PENDING_CMD_BYTES}; -pub struct Observer { - scheduler: Scheduler>, +pub struct Observer { + scheduler: Scheduler, } -impl Observer { - pub fn new(scheduler: Scheduler>) -> Self { +impl Observer { + pub fn new(scheduler: Scheduler) -> Self { Observer { scheduler } } - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { + pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a // low priority to let it be the last observer and avoid affecting other // observers @@ -33,7 +33,7 @@ impl Observer { } } -impl Clone for Observer { +impl Clone for Observer 
{ fn clone(&self) -> Self { Self { scheduler: self.scheduler.clone(), @@ -41,9 +41,9 @@ impl Clone for Observer { } } -impl Coprocessor for Observer {} +impl Coprocessor for Observer {} -impl CmdObserver for Observer { +impl CmdObserver for Observer { fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -64,7 +64,6 @@ impl CmdObserver for Observer { RTS_CHANNEL_PENDING_CMD_BYTES.add(size as i64); if let Err(e) = self.scheduler.schedule(Task::ChangeLog { cmd_batch: cmd_batches, - snapshot: None, }) { info!("failed to schedule change log event"; "err" => ?e); } @@ -82,7 +81,7 @@ impl CmdObserver for Observer { } } -impl RoleObserver for Observer { +impl RoleObserver for Observer { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { // Stop to advance resolved ts after peer steps down to follower or candidate. // Do not need to check observe id because we expect all role change events are @@ -97,7 +96,7 @@ impl RoleObserver for Observer { } } -impl RegionChangeObserver for Observer { +impl RegionChangeObserver for Observer { fn on_region_changed( &self, ctx: &mut ObserverContext<'_>, @@ -139,7 +138,6 @@ impl RegionChangeObserver for Observer { mod test { use std::time::Duration; - use engine_rocks::RocksSnapshot; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::*; use tikv::storage::kv::TestEngineBuilder; @@ -156,7 +154,7 @@ mod test { cmd } - fn expect_recv(rx: &mut ReceiverWrapper>, data: Vec) { + fn expect_recv(rx: &mut ReceiverWrapper, data: Vec) { if data.is_empty() { match rx.recv_timeout(Duration::from_millis(10)) { Err(std::sync::mpsc::RecvTimeoutError::Timeout) => return, diff --git a/components/resolved_ts/src/sinker.rs b/components/resolved_ts/src/sinker.rs deleted file mode 100644 index 383e5f7acc7..00000000000 --- a/components/resolved_ts/src/sinker.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::marker::PhantomData; - -use engine_traits::Snapshot; -use raftstore::{coprocessor::ObserveId, store::RegionSnapshot}; -use txn_types::TimeStamp; - -use crate::cmd::ChangeLog; - -pub struct SinkCmd { - pub region_id: u64, - pub observe_id: ObserveId, - pub logs: Vec, -} - -pub trait CmdSinker: Send { - fn sink_cmd(&mut self, sink_cmd: Vec); - - fn sink_cmd_with_old_value(&mut self, sink_cmd: Vec, snapshot: RegionSnapshot); - - fn sink_resolved_ts(&mut self, regions: Vec, ts: TimeStamp); -} - -pub struct DummySinker(PhantomData); - -impl DummySinker { - pub fn new() -> Self { - Self(PhantomData::default()) - } -} - -impl Default for DummySinker { - fn default() -> Self { - Self::new() - } -} - -impl CmdSinker for DummySinker { - fn sink_cmd(&mut self, _sink_cmd: Vec) {} - - fn sink_cmd_with_old_value(&mut self, _sink_cmd: Vec, _snapshot: RegionSnapshot) {} - - fn sink_resolved_ts(&mut self, _regions: Vec, _ts: TimeStamp) {} -} diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 376aa216224..e8d2a6429ba 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -4,7 +4,6 @@ use std::{sync::*, time::Duration}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; use futures::{executor::block_on, stream, SinkExt}; use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment, Result, WriteFlags}; use kvproto::{ @@ -28,8 +27,8 @@ pub fn init() { pub struct TestSuite { pub cluster: Cluster, - pub endpoints: HashMap>>, - pub obs: HashMap>, + pub endpoints: HashMap>, + pub obs: HashMap, tikv_cli: HashMap, import_cli: HashMap, concurrency_managers: HashMap, @@ -88,7 +87,6 @@ impl TestSuite { cm.clone(), env, sim.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); concurrency_managers.insert(*id, cm); worker.start(rts_endpoint); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 
a5fb3fefaf9..80d44b114b9 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1158,8 +1158,6 @@ where self.concurrency_manager.clone(), server.env(), self.security_mgr.clone(), - // TODO: replace to the cdc sinker - resolved_ts::DummySinker::new(), ); rts_worker.start_with_timer(rts_endpoint); self.to_stop.push(rts_worker); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 5ae1b1a13a6..42cefe60496 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -131,7 +131,7 @@ struct ServerMeta { raw_router: RaftRouter, raw_apply_router: ApplyRouter, gc_worker: GcWorker, SimulateStoreTransport>, - rts_worker: Option>>, + rts_worker: Option>, rsmeter_cleanup: Box, } @@ -362,7 +362,6 @@ impl ServerCluster { concurrency_manager.clone(), self.env.clone(), self.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); // Start the worker rts_worker.start(rts_endpoint); From b0075db291323727a0c643ac75d1a91c1d2a61fe Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Mon, 21 Nov 2022 22:35:58 +0800 Subject: [PATCH 347/676] raftstore-v2: support send/recv tablet snapshot (#13776) ref tikv/tikv#12842 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Xinye Tao --- components/raftstore-v2/src/raft/storage.rs | 2 +- .../raftstore/src/store/async_io/read.rs | 4 +- components/raftstore/src/store/snap.rs | 28 +- src/server/errors.rs | 5 +- src/server/mod.rs | 1 + src/server/snap.rs | 4 +- src/server/tablet_snap.rs | 557 ++++++++++++++++++ 7 files changed, 594 insertions(+), 7 deletions(-) create mode 100644 src/server/tablet_snap.rs diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 01285cc5a46..d2abb6818d8 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -553,7 +553,7 @@ mod tests { 
assert_eq!(snap.get_metadata().get_term(), 0); assert_eq!(snap.get_data().is_empty(), false); let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); - let checkpointer_path = mgr.get_tablet_checkpointer_path(&snap_key); + let checkpointer_path = mgr.get_final_path_for_gen(&snap_key); assert!(checkpointer_path.exists()); // Test cancel snapshot diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 2da4869d24b..9e0215ca9c1 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -18,6 +18,7 @@ use raft::{eraftpb::Snapshot, GetEntriesContext}; use tikv_util::{error, info, time::Instant, worker::Runnable}; use crate::store::{ + snap::TABLET_SNAPSHOT_VERSION, util, worker::metrics::{SNAP_COUNTER, SNAP_HISTOGRAM}, RaftlogFetchResult, TabletSnapKey, TabletSnapManager, MAX_INIT_ENTRY_COUNT, @@ -119,7 +120,7 @@ impl ReadRunner { } fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { - let checkpointer_path = self.snap_mgr().get_tablet_checkpointer_path(snap_key); + let checkpointer_path = self.snap_mgr().get_final_path_for_gen(snap_key); if checkpointer_path.as_path().exists() { // Remove the old checkpoint directly. std::fs::remove_dir_all(checkpointer_path.as_path())?; @@ -215,6 +216,7 @@ where // Set snapshot data. 
let mut snap_data = RaftSnapshotData::default(); snap_data.set_region(region_state.get_region().clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); snap_data.mut_meta().set_for_balance(for_balance); snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 8ca5b26d02b..a9f50d61edb 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -4,6 +4,7 @@ use std::{ cmp::{self, Ordering as CmpOrdering, Reverse}, error::Error as StdError, fmt::{self, Display, Formatter}, + fs, io::{self, ErrorKind, Read, Write}, path::{Path, PathBuf}, result, str, @@ -56,6 +57,7 @@ pub const SNAPSHOT_CFS_ENUM_PAIR: &[(CfNames, CfName)] = &[ (CfNames::write, CF_WRITE), ]; pub const SNAPSHOT_VERSION: u64 = 2; +pub const TABLET_SNAPSHOT_VERSION: u64 = 3; pub const IO_LIMITER_CHUNK_SIZE: usize = 4 * 1024; /// Name prefix for the self-generated snapshot file. @@ -1926,7 +1928,6 @@ impl Display for TabletSnapKey { /// It's similar `SnapManager`, but simpler in tablet version. 
/// /// TODO: -/// - add Limiter to control send/recv speed /// - clean up expired tablet checkpointer #[derive(Clone)] pub struct TabletSnapManager { @@ -1955,10 +1956,33 @@ impl TabletSnapManager { Ok(()) } - pub fn get_tablet_checkpointer_path(&self, key: &TabletSnapKey) -> PathBuf { + pub fn get_final_path_for_gen(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); PathBuf::from(&self.base).join(prefix) } + + pub fn get_final_path_for_recv(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_REV_PREFIX, key); + PathBuf::from(&self.base).join(prefix) + } + pub fn get_tmp_path_for_recv(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}{}", SNAP_REV_PREFIX, key, TMP_FILE_SUFFIX); + PathBuf::from(&self.base).join(prefix) + } + + pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { + let path = self.get_final_path_for_gen(key); + if path.exists() && let Err(e) = fs::remove_dir_all(path.as_path()) { + error!( + "delete snapshot failed"; + "path" => %path.display(), + "err" => ?e, + ); + false + } else { + true + } + } } #[cfg(test)] diff --git a/src/server/errors.rs b/src/server/errors.rs index c7a41947f79..5936f365120 100644 --- a/src/server/errors.rs +++ b/src/server/errors.rs @@ -3,7 +3,7 @@ use std::{error::Error as StdError, io::Error as IoError, net::AddrParseError, result}; use engine_traits::Error as EngineTraitError; -use futures::channel::oneshot::Canceled; +use futures::channel::{mpsc::SendError, oneshot::Canceled}; use grpcio::Error as GrpcError; use hyper::Error as HttpError; use openssl::error::ErrorStack as OpenSslError; @@ -66,6 +66,9 @@ pub enum Error { #[error("{0:?}")] OpenSsl(#[from] OpenSslError), + + #[error("{0:?}")] + StreamDisconnect(#[from] SendError), } pub type Result = result::Result; diff --git a/src/server/mod.rs b/src/server/mod.rs index af1aa289de7..d926ca40b2a 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -20,6 +20,7 @@ pub mod 
server; pub mod service; pub mod snap; pub mod status_server; +pub mod tablet_snap; pub mod transport; pub mod ttl; diff --git a/src/server/snap.rs b/src/server/snap.rs index 49c38cb645b..0200c779383 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -47,7 +47,7 @@ use crate::tikv_util::sys::thread::ThreadBuildWrapper; pub type Callback = Box) + Send>; -const DEFAULT_POOL_SIZE: usize = 4; +pub const DEFAULT_POOL_SIZE: usize = 4; /// A task for either receiving Snapshot or sending Snapshot pub enum Task { @@ -83,7 +83,7 @@ struct SnapChunk { remain_bytes: usize, } -const SNAP_CHUNK_LEN: usize = 1024 * 1024; +pub const SNAP_CHUNK_LEN: usize = 1024 * 1024; impl Stream for SnapChunk { type Item = Result<(SnapshotChunk, WriteFlags)>; diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs new file mode 100644 index 00000000000..cbda159a83e --- /dev/null +++ b/src/server/tablet_snap.rs @@ -0,0 +1,557 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + convert::{TryFrom, TryInto}, + fs::{self, File}, + io::{Read, Write}, + marker::PhantomData, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use engine_traits::KvEngine; +use file_system::{IoType, WithIoType}; +use futures::{ + future::{Future, TryFutureExt}, + sink::{Sink, SinkExt}, + stream::{Stream, StreamExt, TryStreamExt}, +}; +use grpcio::{ + self, ChannelBuilder, ClientStreamingSink, Environment, RequestStream, RpcStatus, + RpcStatusCode, WriteFlags, +}; +use kvproto::{ + raft_serverpb::{Done, RaftMessage, RaftSnapshotData, SnapshotChunk}, + tikvpb::TikvClient, +}; +use protobuf::Message; +use raftstore::{ + router::RaftStoreRouter, + store::snap::{TabletSnapKey, TabletSnapManager}, +}; +use security::SecurityManager; +use tikv_util::{ + config::{Tracker, VersionTrack}, + time::Instant, + worker::Runnable, +}; +use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; + +use super::{ + metrics::*, + snap::{Task, 
DEFAULT_POOL_SIZE, SNAP_CHUNK_LEN}, + Config, Error, Result, +}; +use crate::tikv_util::{sys::thread::ThreadBuildWrapper, time::Limiter}; + +struct RecvTabletSnapContext { + key: TabletSnapKey, + raft_msg: RaftMessage, + io_type: IoType, + start: Instant, + chunk_size: usize, +} + +impl RecvTabletSnapContext { + fn new(mut head: SnapshotChunk) -> Result { + if !head.has_message() { + return Err(box_err!("no raft message in the first chunk")); + } + + let chunk_size = match head.take_data().try_into() { + Ok(buff) => usize::from_ne_bytes(buff), + Err(_) => return Err(box_err!("failed to get chunk size")), + }; + let meta = head.take_message(); + let key = TabletSnapKey::from_region_snap( + meta.get_region_id(), + meta.get_to_peer().get_id(), + meta.get_message().get_snapshot(), + ); + let io_type = io_type_from_raft_message(&meta)?; + + Ok(RecvTabletSnapContext { + key, + raft_msg: meta, + io_type, + start: Instant::now(), + chunk_size, + }) + } + + fn finish>(self, raft_router: R) -> Result<()> { + let key = self.key; + if let Err(e) = raft_router.send_raft_msg(self.raft_msg) { + return Err(box_err!("{} failed to send snapshot to raft: {}", key, e)); + } + info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); + Ok(()) + } +} + +fn io_type_from_raft_message(msg: &RaftMessage) -> Result { + let snapshot = msg.get_message().get_snapshot(); + let data = snapshot.get_data(); + let mut snapshot_data = RaftSnapshotData::default(); + snapshot_data.merge_from_bytes(data)?; + let snapshot_meta = snapshot_data.get_meta(); + if snapshot_meta.get_for_balance() { + Ok(IoType::LoadBalance) + } else { + Ok(IoType::Replication) + } +} + +async fn send_snap_files( + mgr: &TabletSnapManager, + mut sender: impl Sink<(SnapshotChunk, WriteFlags), Error = Error> + Unpin, + msg: RaftMessage, + key: TabletSnapKey, + limiter: Limiter, +) -> Result { + let path = mgr.get_final_path_for_gen(&key); + info!("begin to send snapshot file";"snap_key" 
=> %key); + let files = fs::read_dir(&path)? + .map(|f| Ok(f?.path())) + .filter(|f| f.is_ok() && f.as_ref().unwrap().is_file()) + .collect::>>()?; + let io_type = io_type_from_raft_message(&msg)?; + let _with_io_type = WithIoType::new(io_type); + let mut total_sent = msg.compute_size() as u64; + let mut chunk = SnapshotChunk::default(); + chunk.set_message(msg); + chunk.set_data(usize::to_ne_bytes(SNAP_CHUNK_LEN).to_vec()); + sender + .feed((chunk, WriteFlags::default().buffer_hint(true))) + .await?; + for path in files { + let name = path.file_name().unwrap().to_str().unwrap(); + let mut buffer = Vec::with_capacity(SNAP_CHUNK_LEN); + buffer.push(name.len() as u8); + buffer.extend_from_slice(name.as_bytes()); + let mut f = File::open(&path)?; + let mut off = buffer.len(); + loop { + unsafe { + buffer.set_len(SNAP_CHUNK_LEN); + } + // it should break if readed len is zero or the buffer is full. + while off < SNAP_CHUNK_LEN { + let readed = f.read(&mut buffer[off..])?; + if readed == 0 { + unsafe { + buffer.set_len(off); + } + break; + } + off += readed; + } + limiter.consume(off); + total_sent += off as u64; + let mut chunk = SnapshotChunk::default(); + chunk.set_data(buffer); + sender + .feed((chunk, WriteFlags::default().buffer_hint(true))) + .await?; + // It should switch the next file if the read buffer len is less than the + // SNAP_CHUNK_LEN. + if off < SNAP_CHUNK_LEN { + break; + } + buffer = Vec::with_capacity(SNAP_CHUNK_LEN); + off = 0 + } + } + info!("sent all snap file finish"; "snap_key" => %key); + sender.close().await?; + Ok(total_sent) +} + +/// Send the snapshot to specified address. +/// +/// It will first send the normal raft snapshot message and then send the +/// snapshot file. 
+pub fn send_snap( + env: Arc, + mgr: TabletSnapManager, + security_mgr: Arc, + cfg: &Config, + addr: &str, + msg: RaftMessage, + limiter: Limiter, +) -> Result>> { + assert!(msg.get_message().has_snapshot()); + let timer = Instant::now(); + let send_timer = SEND_SNAP_HISTOGRAM.start_coarse_timer(); + let key = TabletSnapKey::from_region_snap( + msg.get_region_id(), + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot(), + ); + + let cb = ChannelBuilder::new(env) + .stream_initial_window_size(cfg.grpc_stream_initial_window_size.0 as i32) + .keepalive_time(cfg.grpc_keepalive_time.0) + .keepalive_timeout(cfg.grpc_keepalive_timeout.0) + .default_compression_algorithm(cfg.grpc_compression_algorithm()) + .default_gzip_compression_level(cfg.grpc_gzip_compression_level) + .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress); + + let channel = security_mgr.connect(cb, addr); + let client = TikvClient::new(channel); + let (sink, receiver) = client.snapshot()?; + let send_task = async move { + let sink = sink.sink_map_err(Error::from); + let total_size = send_snap_files(&mgr, sink, msg, key.clone(), limiter).await?; + let recv_result = receiver.map_err(Error::from).await; + send_timer.observe_duration(); + drop(client); + match recv_result { + Ok(_) => { + mgr.delete_snapshot(&key); + Ok(SendStat { + key, + total_size, + elapsed: timer.saturating_elapsed(), + }) + } + Err(e) => Err(e), + } + }; + Ok(send_task) +} + +async fn recv_snap_files( + snap_mgr: TabletSnapManager, + mut stream: impl Stream> + Unpin, + limit: Limiter, +) -> Result { + let head = stream + .next() + .await + .transpose()? 
+ .ok_or_else(|| Error::Other("empty gRPC stream".into()))?; + let context = RecvTabletSnapContext::new(head)?; + let chunk_size = context.chunk_size; + let path = snap_mgr.get_tmp_path_for_recv(&context.key); + info!("begin to receive tablet snapshot files"; "file" => %path.display()); + fs::create_dir_all(&path)?; + let _with_io_type = WithIoType::new(context.io_type); + loop { + let mut chunk = match stream.next().await { + Some(Ok(mut c)) if !c.has_message() => c.take_data(), + Some(_) => { + return Err(box_err!("duplicated metadata")); + } + None => break, + }; + // the format of chunk: + // |--name_len--|--name--|--content--| + let len = chunk[0] as usize; + let file_name = box_try!(std::str::from_utf8(&chunk[1..len + 1])); + let p = path.join(file_name); + let mut f = File::create(&p)?; + let mut size = chunk.len() - len - 1; + f.write_all(&chunk[len + 1..])?; + // It should switch next file if the chunk size is less than the SNAP_CHUNK_LEN. + while chunk.len() >= chunk_size { + chunk = match stream.next().await { + Some(Ok(mut c)) if !c.has_message() => c.take_data(), + Some(_) => return Err(box_err!("duplicated metadata")), + None => return Err(box_err!("missing chunk")), + }; + f.write_all(&chunk[..])?; + limit.consume(chunk.len()); + size += chunk.len(); + } + debug!("received snap file"; "file" => %p.display(), "size" => size); + SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC + .recv + .inc_by(size as u64); + f.sync_data()?; + } + info!("received all tablet snapshot file"; "snap_key" => %context.key); + let final_path = snap_mgr.get_final_path_for_recv(&context.key); + fs::rename(&path, final_path)?; + Ok(context) +} + +fn recv_snap + 'static>( + stream: RequestStream, + sink: ClientStreamingSink, + snap_mgr: TabletSnapManager, + raft_router: R, + limit: Limiter, +) -> impl Future> { + let recv_task = async move { + let stream = stream.map_err(Error::from); + let context = recv_snap_files(snap_mgr, stream, limit).await?; + context.finish(raft_router) + }; + 
async move { + match recv_task.await { + Ok(()) => sink.success(Done::default()).await.map_err(Error::from), + Err(e) => { + let status = RpcStatus::with_message(RpcStatusCode::UNKNOWN, format!("{:?}", e)); + sink.fail(status).await.map_err(Error::from) + } + } + } +} + +pub struct TabletRunner +where + E: KvEngine, + R: RaftStoreRouter + 'static, +{ + env: Arc, + snap_mgr: TabletSnapManager, + security_mgr: Arc, + pool: Runtime, + raft_router: R, + cfg_tracker: Tracker, + cfg: Config, + sending_count: Arc, + recving_count: Arc, + engine: PhantomData, + limiter: Limiter, +} + +impl TabletRunner +where + E: KvEngine, + R: RaftStoreRouter + 'static, +{ + pub fn new( + env: Arc, + snap_mgr: TabletSnapManager, + r: R, + security_mgr: Arc, + cfg: Arc>, + ) -> TabletRunner { + let config = cfg.value().clone(); + let cfg_tracker = cfg.tracker("tablet-sender".to_owned()); + let limit = i64::try_from(config.snap_max_write_bytes_per_sec.0) + .unwrap_or_else(|_| panic!("snap_max_write_bytes_per_sec > i64::max_value")); + let limiter = Limiter::new(if limit > 0 { + limit as f64 + } else { + f64::INFINITY + }); + + let snap_worker = TabletRunner { + env, + snap_mgr, + pool: RuntimeBuilder::new_multi_thread() + .thread_name(thd_name!("tablet-snap-sender")) + .worker_threads(DEFAULT_POOL_SIZE) + .after_start_wrapper(tikv_alloc::add_thread_memory_accessor) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) + .build() + .unwrap(), + raft_router: r, + security_mgr, + cfg_tracker, + cfg: config, + sending_count: Arc::new(AtomicUsize::new(0)), + recving_count: Arc::new(AtomicUsize::new(0)), + engine: PhantomData, + limiter, + }; + snap_worker + } + + fn refresh_cfg(&mut self) { + if let Some(incoming) = self.cfg_tracker.any_new() { + let limit = if incoming.snap_max_write_bytes_per_sec.0 > 0 { + incoming.snap_max_write_bytes_per_sec.0 as f64 + } else { + f64::INFINITY + }; + self.limiter.set_speed_limit(limit); + info!("refresh snapshot manager config"; + 
"speed_limit"=> limit); + self.cfg = incoming.clone(); + } + } +} + +pub struct SendStat { + key: TabletSnapKey, + total_size: u64, + elapsed: Duration, +} + +impl Runnable for TabletRunner +where + E: KvEngine, + R: RaftStoreRouter + 'static, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + match task { + Task::Recv { stream, sink } => { + let task_num = self.recving_count.load(Ordering::SeqCst); + if task_num >= self.cfg.concurrent_recv_snap_limit { + warn!("too many recving snapshot tasks, ignore"); + let status = RpcStatus::with_message( + RpcStatusCode::RESOURCE_EXHAUSTED, + format!( + "the number of received snapshot tasks {} exceeded the limitation {}", + task_num, self.cfg.concurrent_recv_snap_limit + ), + ); + self.pool.spawn(sink.fail(status)); + return; + } + SNAP_TASK_COUNTER_STATIC.recv.inc(); + + let snap_mgr = self.snap_mgr.clone(); + let raft_router = self.raft_router.clone(); + let recving_count = self.recving_count.clone(); + recving_count.fetch_add(1, Ordering::SeqCst); + let limit = self.limiter.clone(); + let task = async move { + let result = recv_snap(stream, sink, snap_mgr, raft_router, limit).await; + recving_count.fetch_sub(1, Ordering::SeqCst); + if let Err(e) = result { + error!("failed to recv snapshot"; "err" => %e); + } + }; + self.pool.spawn(task); + } + Task::Send { addr, msg, cb } => { + let region_id = msg.get_region_id(); + if self.sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit + { + warn!( + "too many sending snapshot tasks, drop Send Snap[to: {}, snap: {:?}]", + addr, msg + ); + cb(Err(Error::Other("Too many sending snapshot tasks".into()))); + return; + } + SNAP_TASK_COUNTER_STATIC.send.inc(); + + let env = Arc::clone(&self.env); + let mgr = self.snap_mgr.clone(); + let security_mgr = Arc::clone(&self.security_mgr); + let sending_count = Arc::clone(&self.sending_count); + sending_count.fetch_add(1, Ordering::SeqCst); + let limit = self.limiter.clone(); + let send_task = + send_snap(env, 
mgr, security_mgr, &self.cfg.clone(), &addr, msg, limit); + let task = async move { + let res = match send_task { + Err(e) => Err(e), + Ok(f) => f.await, + }; + match res { + Ok(stat) => { + info!( + "sent snapshot"; + "region_id" => region_id, + "snap_key" => %stat.key, + "size" => stat.total_size, + "duration" => ?stat.elapsed + ); + cb(Ok(())); + } + Err(e) => { + error!("failed to send snap"; "to_addr" => addr, "region_id" => region_id, "err" => ?e); + cb(Err(e)); + } + }; + sending_count.fetch_sub(1, Ordering::SeqCst); + }; + + self.pool.spawn(task); + } + Task::RefreshConfigEvent => { + self.refresh_cfg(); + } + Task::Validate(f) => { + f(&self.cfg); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + fs::{create_dir_all, File}, + io::Write, + }; + + use futures::{ + channel::mpsc::{self}, + executor::block_on, + sink::SinkExt, + }; + use futures_util::StreamExt; + use grpcio::WriteFlags; + use kvproto::raft_serverpb::{RaftMessage, SnapshotChunk}; + use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; + use tempfile::TempDir; + use tikv_util::{store::new_peer, time::Limiter}; + + use super::{super::Error, recv_snap_files, send_snap_files, SNAP_CHUNK_LEN}; + + #[test] + fn test_send_tablet() { + let limiter = Limiter::new(f64::INFINITY); + let snap_key = TabletSnapKey::new(1, 1, 1, 1); + let mut msg = RaftMessage::default(); + msg.set_region_id(1); + msg.set_to_peer(new_peer(1, 1)); + msg.mut_message().mut_snapshot().mut_metadata().set_index(1); + msg.mut_message().mut_snapshot().mut_metadata().set_term(1); + let send_path = TempDir::new().unwrap(); + let send_snap_mgr = + TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()); + let snap_path = send_snap_mgr.get_final_path_for_gen(&snap_key); + create_dir_all(snap_path.as_path()).unwrap(); + // send file should skip directory + create_dir_all(snap_path.join("dir")).unwrap(); + for i in 0..2 { + let mut f = File::create(snap_path.join(i.to_string())).unwrap(); + let count 
= SNAP_CHUNK_LEN - 2; + let mut data = std::iter::repeat("a".as_bytes()) + .take(count) + .collect::>(); + for buffer in data.iter_mut() { + f.write_all(buffer).unwrap(); + } + f.sync_data().unwrap(); + } + + let recv_path = TempDir::new().unwrap(); + let recv_snap_manager = + TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()); + let (tx, rx) = mpsc::unbounded(); + let sink = tx.sink_map_err(Error::from); + block_on(send_snap_files( + &send_snap_mgr, + sink, + msg, + snap_key.clone(), + limiter.clone(), + )) + .unwrap(); + + let stream = rx.map(|x: (SnapshotChunk, WriteFlags)| Ok(x.0)); + let final_path = recv_snap_manager.get_final_path_for_recv(&snap_key); + let r = block_on(recv_snap_files(recv_snap_manager, stream, limiter)).unwrap(); + assert_eq!(r.key, snap_key); + std::thread::sleep(std::time::Duration::from_secs(1)); + let dir = std::fs::read_dir(final_path).unwrap(); + assert_eq!(2, dir.count()); + send_snap_mgr.delete_snapshot(&snap_key); + assert!(!snap_path.exists()); + } +} From e41dabb7064a3f9ec2b99a59af7a6fa792b325d9 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 22 Nov 2022 11:31:58 +0800 Subject: [PATCH 348/676] *: make flashback not aware of raftstore router (#13828) ref tikv/tikv#13827 flashback is a transaction concept, better make it only interact with the storage layer instead of raftstore directly. This PR also converts raftstore errors to region errors for flashback, so it will can make client retry more reliable. 
Signed-off-by: Jay Lee --- components/tikv_kv/src/lib.rs | 14 ++- components/tikv_util/src/sys/mod.rs | 5 +- src/server/raftkv.rs | 86 ++++++++++++--- src/server/service/kv.rs | 162 ++++++---------------------- 4 files changed, 118 insertions(+), 149 deletions(-) diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 32f15786f79..3e15b399796 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -35,7 +35,7 @@ use engine_traits::{ CF_DEFAULT, CF_LOCK, }; use error_code::{self, ErrorCode, ErrorCodeExt}; -use futures::prelude::*; +use futures::{future::BoxFuture, prelude::*}; use into_other::IntoOther; use kvproto::{ errorpb::Error as ErrorHeader, @@ -347,6 +347,18 @@ pub trait Engine: Send + Clone + 'static { // Some engines have a `TxnExtraScheduler`. This method is to send the extra // to the scheduler. fn schedule_txn_extra(&self, _txn_extra: TxnExtra) {} + + /// Mark the start of flashback. + // It's an infrequent API, use trait object for simplicity. + fn start_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + Box::pin(futures::future::ready(Ok(()))) + } + + /// Mark the end of flashback. + // It's an infrequent API, use trait object for simplicity. 
+ fn end_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + Box::pin(futures::future::ready(Ok(()))) + } } /// A Snapshot is a consistent view of the underlying engine at a given point in diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index dcc137f095c..8b5e846592f 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -191,14 +191,13 @@ pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { #[cfg(not(target_os = "linux"))] pub fn path_in_diff_mount_point(_path1: &str, _path2: &str) -> bool { - return false; + false } -#[cfg(test)] +#[cfg(all(test, target_os = "linux"))] mod tests { use super::*; - #[cfg(target_os = "linux")] #[test] fn test_path_in_diff_mount_point() { let (empty_path1, path2) = ("", "/"); diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index eaa13995650..8bef31eaebd 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -15,10 +15,14 @@ use std::{ use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; +use futures::future::BoxFuture; use kvproto::{ errorpb, kvrpcpb::{Context, IsolationLevel}, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request, Response}, + raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request, + Response, + }, }; use raft::{ eraftpb::{self, MessageType}, @@ -37,7 +41,7 @@ use raftstore::{ }; use thiserror::Error; use tikv_kv::write_modifies; -use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use tikv_util::{codec::number::NumberEncoder, future::paired_future_callback, time::Instant}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; @@ -149,6 +153,49 @@ where } } +#[inline] +pub fn new_request_header(ctx: &Context) -> RaftRequestHeader { + let mut header = 
RaftRequestHeader::default(); + header.set_region_id(ctx.get_region_id()); + header.set_peer(ctx.get_peer().clone()); + header.set_region_epoch(ctx.get_region_epoch().clone()); + if ctx.get_term() != 0 { + header.set_term(ctx.get_term()); + } + header.set_sync_log(ctx.get_sync_log()); + header.set_replica_read(ctx.get_replica_read()); + header +} + +#[inline] +pub fn new_flashback_req(ctx: &Context, ty: AdminCmdType) -> RaftCmdRequest { + let header = new_request_header(ctx); + let mut req = RaftCmdRequest::default(); + req.set_header(header); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + req.mut_admin_request().set_cmd_type(ty); + req +} + +fn exec_admin>( + router: &S, + req: RaftCmdRequest, +) -> BoxFuture<'static, kv::Result<()>> { + let (cb, f) = paired_future_callback(); + let res = router.send_command( + req, + raftstore::store::Callback::write(cb), + RaftCmdExtraOpts::default(), + ); + Box::pin(async move { + res?; + let mut resp = box_try!(f.await); + check_raft_cmd_response(&mut resp.response)?; + Ok(()) + }) +} + /// `RaftKv` is a storage engine base on `RaftStore`. 
#[derive(Clone)] pub struct RaftKv @@ -181,26 +228,13 @@ where self.txn_extra_scheduler = Some(txn_extra_scheduler); } - fn new_request_header(&self, ctx: &Context) -> RaftRequestHeader { - let mut header = RaftRequestHeader::default(); - header.set_region_id(ctx.get_region_id()); - header.set_peer(ctx.get_peer().clone()); - header.set_region_epoch(ctx.get_region_epoch().clone()); - if ctx.get_term() != 0 { - header.set_term(ctx.get_term()); - } - header.set_sync_log(ctx.get_sync_log()); - header.set_replica_read(ctx.get_replica_read()); - header - } - fn exec_snapshot( &mut self, ctx: SnapContext<'_>, req: Request, cb: Callback>, ) -> Result<()> { - let mut header = self.new_request_header(ctx.pb_ctx); + let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { let mut data = [0u8; 8]; @@ -257,7 +291,7 @@ where let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); let txn_extra = batch.extra; - let mut header = self.new_request_header(ctx); + let mut header = new_request_header(ctx); let mut flags = 0; if txn_extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); @@ -509,6 +543,24 @@ where } } } + + fn start_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the + // later flashback. Once invoked, we will update the persistent region meta and + // the memory state of the flashback in Peer FSM to reject all read, write + // and scheduling operations for this region when propose/apply before we + // start the actual data flashback transaction command in the next phase. 
+ let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + exec_admin(&self.router, req) + } + + fn end_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + // Send an `AdminCmdType::FinishFlashback` to unset the persistence state + // in `RegionLocalState` and region's meta, and when that admin cmd is applied, + // will update the memory state of the flashback + let req = new_flashback_req(ctx, AdminCmdType::FinishFlashback); + exec_admin(&self.router, req) + } } #[derive(Clone)] diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 7fc5bb77f31..54b0dc6782b 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -6,7 +6,6 @@ use std::{mem, sync::Arc}; use api_version::KvFormat; use fail::fail_point; use futures::{ - channel::oneshot, compat::Future01CompatExt, future::{self, Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -21,10 +20,7 @@ use kvproto::{ errorpb::{Error as RegionError, *}, kvrpcpb::*, mpp::*, - raft_cmdpb::{ - AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftRequestHeader, - Request as RaftRequest, - }, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request as RaftRequest}, raft_serverpb::*, tikvpb::*, }; @@ -47,9 +43,8 @@ use tikv_util::{ time::{duration_to_ms, duration_to_sec, Instant}, worker::Scheduler, }; -use tokio::sync::Mutex; use tracker::{set_tls_tracker_token, RequestInfo, RequestType, Tracker, GLOBAL_TRACKERS}; -use txn_types::{self, Key, WriteBatchFlags}; +use txn_types::{self, Key}; use super::batch::{BatcherBuilder, ReqBatcher}; use crate::{ @@ -60,6 +55,7 @@ use crate::{ Error, Proxy, Result as ServerResult, }, storage::{ + self, errors::{ extract_committed, extract_key_error, extract_key_errors, extract_kv_pairs, extract_region_error, map_kv_pairs, @@ -414,7 +410,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); - let resp = 
future_prepare_flashback_to_version(&self.storage, &self.ch, req); + let resp = future_prepare_flashback_to_version(&self.storage, req); let task = async move { let resp = resp.await?; let elapsed = begin_instant.saturating_elapsed(); @@ -445,7 +441,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); - let resp = future_flashback_to_version(&self.storage, &self.ch, req); + let resp = future_flashback_to_version(&self.storage, req); let task = async move { let resp = resp.await?; let elapsed = begin_instant.saturating_elapsed(); @@ -1093,7 +1089,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let copr_v2 = self.copr_v2.clone(); let pool_size = storage.get_normal_pool_size(); let batch_builder = BatcherBuilder::new(self.enable_req_batch, pool_size); - let ch = self.ch.clone(); let request_handler = stream.try_for_each(move |mut req| { let request_ids = req.take_request_ids(); let requests: Vec<_> = req.take_requests().into(); @@ -1110,7 +1105,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor id, req, &tx, - &ch, ); if let Some(batch) = batcher.as_mut() { batch.maybe_commit(&storage, &tx); @@ -1311,12 +1305,7 @@ fn response_batch_commands_request( poll_future_notify(task); } -fn handle_batch_commands_request< - T: RaftStoreRouter + 'static, - E: Engine, - L: LockManager, - F: KvFormat, ->( +fn handle_batch_commands_request( batcher: &mut Option, storage: &Storage, copr: &Endpoint, @@ -1325,7 +1314,6 @@ fn handle_batch_commands_request< id: u64, req: batch_commands_request::Request, tx: &Sender, - ch: &T, ) { // To simplify code and make the logic more clear. macro_rules! 
oneof { @@ -1428,8 +1416,8 @@ fn handle_batch_commands_request< ResolveLock, future_resolve_lock(storage), kv_resolve_lock; Gc, future_gc(), kv_gc; DeleteRange, future_delete_range(storage), kv_delete_range; - PrepareFlashbackToVersion, future_prepare_flashback_to_version(storage, ch), kv_prepare_flashback_to_version; - FlashbackToVersion, future_flashback_to_version(storage, ch), kv_flashback_to_version; + PrepareFlashbackToVersion, future_prepare_flashback_to_version(storage), kv_prepare_flashback_to_version; + FlashbackToVersion, future_flashback_to_version(storage), kv_flashback_to_version; RawBatchGet, future_raw_batch_get(storage), raw_batch_get; RawPut, future_raw_put(storage), raw_put; RawBatchPut, future_raw_batch_put(storage), raw_batch_put; @@ -1725,80 +1713,53 @@ fn future_delete_range( // Preparing the flashback for a region will "lock" the region so that // there is no any read, write or scheduling operation could be proposed before // the actual flashback operation. -fn future_prepare_flashback_to_version< - E: Engine, - L: LockManager, - F: KvFormat, - T: RaftStoreRouter + 'static, ->( +fn future_prepare_flashback_to_version( // Keep this param to hint the type of E for the compiler. - _storage: &Storage, - raft_router: &T, + storage: &Storage, req: PrepareFlashbackToVersionRequest, ) -> impl Future> { - let raft_router = Mutex::new(raft_router.clone()); + let f = storage.get_engine().start_flashback(req.get_context()); async move { - // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the - // later flashback. Once invoked, we will update the persistent region meta and - // the memory state of the flashback in Peer FSM to reject all read, write - // and scheduling operations for this region when propose/apply before we - // start the actual data flashback transaction command in the next phase. 
- send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::PrepareFlashback, - ) - .await?; - Ok(PrepareFlashbackToVersionResponse::default()) + let res = f.await.map_err(storage::Error::from); + let mut resp = PrepareFlashbackToVersionResponse::default(); + if let Some(e) = extract_region_error(&res) { + resp.set_region_error(e); + } else if let Err(e) = res { + resp.set_error(format!("{}", e)); + } + Ok(resp) } } // Flashback the region to a specific point with the given `version`, please // make sure the region is "locked" by `PrepareFlashbackToVersion` first, // otherwise this request will fail. -fn future_flashback_to_version< - T: RaftStoreRouter + 'static, - E: Engine, - L: LockManager, - F: KvFormat, ->( +fn future_flashback_to_version( storage: &Storage, - raft_router: &T, req: FlashbackToVersionRequest, ) -> impl Future> { - let storage_clone = storage.clone(); - let raft_router = Mutex::new(raft_router.clone()); + let storage = storage.clone(); async move { // Perform the data flashback transaction command. We will check if the region // is in the flashback state when proposing the flashback modification. let (cb, f) = paired_future_callback(); - let res = storage_clone.sched_txn_command(req.clone().into(), cb); - // Avoid crossing `.await` to bypass the `Send` constraint. - drop(storage_clone); - let v = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = FlashbackToVersionResponse::default(); - if let Some(err) = extract_region_error(&v) { - resp.set_region_error(err); - } else if let Err(e) = v { - resp.set_error(format!("{}", e)); - } else { + let mut res = storage.sched_txn_command(req.clone().into(), cb); + if matches!(res, Ok(())) { + res = f.await.unwrap_or_else(|e| Err(box_err!(e))); + } + if matches!(res, Ok(())) { // Only finish flashback when Flashback executed successfully. 
fail_point!("skip_finish_flashback_to_version", |_| { Ok(FlashbackToVersionResponse::default()) }); - // Send an `AdminCmdType::FinishFlashback` to unset the persistence state - // in `RegionLocalState` and region's meta, and when that - // admin cmd is applied, will update the memory - // state of the flashback - send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::FinishFlashback, - ) - .await?; + let f = storage.get_engine().end_flashback(req.get_context()); + res = f.await.map_err(storage::Error::from); + } + let mut resp = FlashbackToVersionResponse::default(); + if let Some(err) = extract_region_error(&res) { + resp.set_region_error(err); + } else if let Err(e) = res { + resp.set_error(format!("{}", e)); } Ok(resp) } @@ -2467,61 +2428,6 @@ fn needs_reject_raft_append(reject_messages_on_memory_ratio: f64) -> bool { false } -async fn send_flashback_msg + 'static, E: Engine>( - raft_router: &Mutex, - ctx: &Context, - cmd_type: AdminCmdType, -) -> ServerResult<()> { - let region_id = ctx.get_region_id(); - let (result_tx, result_rx) = oneshot::channel(); - let cb = Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx.send(false).unwrap(); - error!("exec flashback msg failed"; - "region_id" => region_id, - "type" => ?cmd_type, - "error" => ?resp.response.get_header().get_error()); - return; - } - result_tx.send(true).unwrap(); - })); - let mut admin = AdminRequest::default(); - admin.set_cmd_type(cmd_type); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(ctx.get_region_epoch().clone()); - req.mut_header().set_peer(ctx.get_peer().clone()); - req.set_admin_request(admin); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - // call admin request directly - let raft_router = raft_router.lock().await; - if let Err(e) = raft_router.send_command( - req, - cb, - RaftCmdExtraOpts { - deadline: None, - 
disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, - }, - ) { - return Err(Error::Other(box_err!( - "send flashback msg {:?} failed for region {}, error {:?}", - cmd_type, - region_id, - e - ))); - } - if !result_rx.await? { - return Err(Error::Other(box_err!( - "wait flashback msg {:?} result failed for region {} failed", - cmd_type, - region_id - ))); - } - Ok(()) -} - #[cfg(test)] mod tests { use std::thread; From 8465f44a55b457edb583e1141e0a84b91d1dea17 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Tue, 22 Nov 2022 17:01:57 +0800 Subject: [PATCH 349/676] mvcc: skip accumulated locks using seek in forward scanner (#13819) ref tikv/tikv#13694 This commit adds support of skipping versions with a seek to the LatestKvPolicy of the forward scanner. Now we don't add this to other policies because they are usually used for large range of scanning. So, the influence of accumulated locks of certain keys should be amortized. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- src/storage/mvcc/reader/scanner/forward.rs | 86 +++++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 32898f1bfe7..12300187739 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -472,12 +472,19 @@ impl ScanPolicy for LatestKvPolicy { } WriteType::Delete => break None, WriteType::Lock | WriteType::Rollback => { - // Continue iterate next `write`. + if write.versions_to_last_change < SEEK_BOUND || write.last_change_ts.is_zero() + { + // Continue iterate next `write`. + cursors.write.next(&mut statistics.write); + } else { + // Seek to the expected version directly. + let commit_ts = write.last_change_ts; + let key_with_ts = current_user_key.clone().append_ts(commit_ts); + cursors.write.seek(&key_with_ts, &mut statistics.write)?; + } } } - cursors.write.next(&mut statistics.write); - if !cursors.write.valid()? 
{ // Key space ended. Needn't move write cursor to next key. return Ok(HandleRes::Skip(current_user_key)); @@ -1601,6 +1608,79 @@ mod latest_kv_tests { ); scanner.next().unwrap_err(); } + + #[test] + fn test_skip_versions_by_seek() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v11", b"k1", 1); + must_commit(&mut engine, b"k1", 1, 5); + must_prewrite_put(&mut engine, b"k1", b"v12", b"k1", 6); + must_commit(&mut engine, b"k1", 6, 8); + must_prewrite_put(&mut engine, b"k2", b"v21", b"k2", 2); + must_commit(&mut engine, b"k2", 2, 6); + must_prewrite_put(&mut engine, b"k3", b"v31", b"k3", 3); + must_commit(&mut engine, b"k3", 3, 7); + + for start_ts in (10..30).into_iter().step_by(2) { + must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); + must_commit(&mut engine, b"k1", start_ts, start_ts + 1); + must_rollback(&mut engine, b"k3", start_ts + 1, true); + } + + must_prewrite_put(&mut engine, b"k1", b"v13", b"k1", 40); + must_commit(&mut engine, b"k1", 40, 45); + must_prewrite_put(&mut engine, b"k2", b"v22", b"k2", 41); + must_commit(&mut engine, b"k2", 41, 46); + must_prewrite_put(&mut engine, b"k3", b"v32", b"k3", 42); + must_commit(&mut engine, b"k3", 42, 47); + + // KEY | COMMIT_TS | TYPE | VALUE + // ----|-----------|----------|------- + // k1 | 45 | PUT | v13 + // k1 | 29 | LOCK | + // k1 | 27 | LOCK | + // k1 | ... | LOCK | + // k1 | 11 | LOCK | + // k1 | 8 | PUT | v12 + // k1 | 5 | PUT | v1 + // k2 | 46 | PUT | v22 + // k2 | 6 | PUT | v21 + // k3 | 47 | PUT | v32 + // k3 | 29 | ROLLBACK | + // k3 | 27 | ROLLBACK | + // k3 | ... 
| ROLLBACK | + // k3 | 11 | ROLLBACK | + // k3 | 7 | PUT | v31 + + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut scanner = ScannerBuilder::new(snapshot, 35.into()) + .range(None, None) + .build() + .unwrap(); + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k1"), b"v12".to_vec())) + ); + let stats = scanner.take_statistics(); + assert_eq!(stats.write.next, 3); // skip k1@45, k1@8, k1@5 + assert_eq!(stats.write.seek, 2); // seek beginning and k1@8 + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k2"), b"v21".to_vec())) + ); + scanner.take_statistics(); + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k3"), b"v31".to_vec())) + ); + let stats = scanner.take_statistics(); + assert_le!(stats.write.next, 2); // skip k2@6, k3@47 + assert_eq!(stats.write.seek, 1); // seek k3@7 + } } #[cfg(test)] From d9ce7d7e3cb03f0e0645bbda5291873ce632030f Mon Sep 17 00:00:00 2001 From: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Date: Tue, 22 Nov 2022 18:19:58 +0800 Subject: [PATCH 350/676] backup: backup sub-ranges in one request (#13702) close tikv/tikv#13701 Signed-off-by: Leavrth Signed-off-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Co-authored-by: Neil Shen --- components/backup/src/endpoint.rs | 291 +++++++++++++++++++++++++++--- 1 file changed, 262 insertions(+), 29 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index db6ff331d7f..b880da7a3dc 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -19,7 +19,7 @@ use futures::{channel::mpsc::*, executor::block_on}; use kvproto::{ brpb::*, encryptionpb::EncryptionMethod, - kvrpcpb::{ApiVersion, Context, IsolationLevel}, + kvrpcpb::{ApiVersion, Context, IsolationLevel, KeyRange}, metapb::*, }; use online_config::OnlineConfig; @@ -59,6 +59,7 @@ const BACKUP_BATCH_LIMIT: usize = 1024; struct Request { start_key: Vec, end_key: Vec, + sub_ranges: 
Vec, start_ts: TimeStamp, end_ts: TimeStamp, limiter: Limiter, @@ -119,6 +120,7 @@ impl Task { request: Request { start_key: req.get_start_key().to_owned(), end_key: req.get_end_key().to_owned(), + sub_ranges: req.get_sub_ranges().to_owned(), start_ts: req.get_start_version().into(), end_ts: req.get_end_version().into(), backend: req.get_storage_backend().clone(), @@ -676,6 +678,8 @@ pub struct Endpoint { /// The progress of a backup task pub struct Progress { store_id: u64, + ranges: Vec<(Option, Option)>, + next_index: usize, next_start: Option, end_key: Option, region_info: R, @@ -685,7 +689,7 @@ pub struct Progress { } impl Progress { - fn new( + fn new_with_range( store_id: u64, next_start: Option, end_key: Option, @@ -693,14 +697,41 @@ impl Progress { codec: KeyValueCodec, cf: CfName, ) -> Self { - Progress { + let ranges = vec![(next_start, end_key)]; + Self::new_with_ranges(store_id, ranges, region_info, codec, cf) + } + + fn new_with_ranges( + store_id: u64, + ranges: Vec<(Option, Option)>, + region_info: R, + codec: KeyValueCodec, + cf: CfName, + ) -> Self { + let mut prs = Progress { store_id, - next_start, - end_key, + ranges, + next_index: 0, + next_start: None, + end_key: None, region_info, finished: false, codec, cf, + }; + prs.try_next(); + prs + } + + /// try the next range. If all the ranges are consumed, + /// set self.finish true. 
+ fn try_next(&mut self) { + if self.ranges.len() > self.next_index { + (self.next_start, self.end_key) = self.ranges[self.next_index].clone(); + + self.next_index += 1; + } else { + self.finished = true; } } @@ -770,11 +801,12 @@ impl Progress { // region, we need to set the `finished` flag here in case // we run with `next_start` set to None if b.region.get_end_key().is_empty() || b.end_key == self.end_key { - self.finished = true; + self.try_next(); + } else { + self.next_start = b.end_key.clone(); } - self.next_start = b.end_key.clone(); } else { - self.finished = true; + self.try_next(); } branges } @@ -958,6 +990,39 @@ impl Endpoint { }); } + fn get_progress_by_req( + &self, + request: &Request, + codec: KeyValueCodec, + ) -> Arc>> { + if request.sub_ranges.is_empty() { + let start_key = codec.encode_backup_key(request.start_key.clone()); + let end_key = codec.encode_backup_key(request.end_key.clone()); + Arc::new(Mutex::new(Progress::new_with_range( + self.store_id, + start_key, + end_key, + self.region_info.clone(), + codec, + request.cf, + ))) + } else { + let mut ranges = Vec::with_capacity(request.sub_ranges.len()); + for k in &request.sub_ranges { + let start_key = codec.encode_backup_key(k.start_key.clone()); + let end_key = codec.encode_backup_key(k.end_key.clone()); + ranges.push((start_key, end_key)); + } + Arc::new(Mutex::new(Progress::new_with_ranges( + self.store_id, + ranges, + self.region_info.clone(), + codec, + request.cf, + ))) + } + } + pub fn handle_backup_task(&self, task: Task) { let Task { request, resp } = task; let codec = KeyValueCodec::new(request.is_raw_kv, self.api_version, request.dst_api_ver); @@ -996,17 +1061,9 @@ impl Endpoint { return; } } - let start_key = codec.encode_backup_key(request.start_key.clone()); - let end_key = codec.encode_backup_key(request.end_key.clone()); - let prs = Arc::new(Mutex::new(Progress::new( - self.store_id, - start_key, - end_key, - self.region_info.clone(), - codec, - request.cf, - ))); + let prs 
= self.get_progress_by_req(&request, codec); + let backend = match create_storage(&request.backend, self.get_config()) { Ok(backend) => backend, Err(err) => { @@ -1384,17 +1441,9 @@ pub mod tests { // Test seek backup range. let test_seek_backup_range = |start_key: &[u8], end_key: &[u8], expect: Vec<(&[u8], &[u8])>| { - let start_key = if start_key.is_empty() { - None - } else { - Some(Key::from_raw(start_key)) - }; - let end_key = if end_key.is_empty() { - None - } else { - Some(Key::from_raw(end_key)) - }; - let mut prs = Progress::new( + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + let mut prs = Progress::new_with_range( endpoint.store_id, start_key, end_key, @@ -1446,6 +1495,7 @@ pub mod tests { request: Request { start_key: start_key.to_vec(), end_key: end_key.to_vec(), + sub_ranges: Vec::new(), start_ts: 1.into(), end_ts: 1.into(), backend, @@ -1512,6 +1562,189 @@ pub mod tests { } } + #[test] + fn test_seek_ranges() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.set_regions(vec![ + (b"".to_vec(), b"1".to_vec(), 1), + (b"1".to_vec(), b"2".to_vec(), 2), + (b"3".to_vec(), b"4".to_vec(), 3), + (b"7".to_vec(), b"9".to_vec(), 4), + (b"9".to_vec(), b"".to_vec(), 5), + ]); + // Test seek backup range. 
+ let test_seek_backup_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + ranges.push((start_key, end_key)); + } + let mut prs = Progress::new_with_ranges( + endpoint.store_id, + ranges, + endpoint.region_info.clone(), + KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + engine_traits::CF_DEFAULT, + ); + + let mut ranges = Vec::with_capacity(expect.len()); + while ranges.len() != expect.len() { + let n = (rand::random::() % 3) + 1; + let mut r = prs.forward(n); + // The returned backup ranges should <= n + assert!(r.len() <= n); + + if r.is_empty() { + // if return a empty vec then the progress is finished + assert_eq!( + ranges.len(), + expect.len(), + "got {:?}, expect {:?}", + ranges, + expect + ); + } + ranges.append(&mut r); + } + + for (a, b) in ranges.into_iter().zip(expect) { + assert_eq!( + a.start_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.0 + ); + assert_eq!( + a.end_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.1 + ); + } + }; + + // Test whether responses contain correct range. 
+ #[allow(clippy::blocks_in_if_conditions)] + let test_handle_backup_task_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let tmp = TempDir::new().unwrap(); + let backend = make_local_backend(tmp.path()); + let (tx, rx) = unbounded(); + + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let key_range = KeyRange { + start_key: start_key.to_vec(), + end_key: end_key.to_vec(), + ..Default::default() + }; + ranges.push(key_range); + } + let task = Task { + request: Request { + start_key: b"1".to_vec(), + end_key: b"2".to_vec(), + sub_ranges: ranges, + start_ts: 1.into(), + end_ts: 1.into(), + backend, + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + }, + resp: tx, + }; + endpoint.handle_backup_task(task); + let resps: Vec<_> = block_on(rx.collect()); + for a in &resps { + assert!( + expect + .iter() + .any(|b| { a.get_start_key() == b.0 && a.get_end_key() == b.1 }), + "{:?} {:?}", + resps, + expect + ); + } + assert_eq!(resps.len(), expect.len()); + }; + + // Backup range from case.0 to case.1, + // the case.2 is the expected results. 
+ type Case<'a> = (Vec<(&'a [u8], &'a [u8])>, Vec<(&'a [u8], &'a [u8])>); + + let case: Vec> = vec![ + ( + vec![(b"", b"1"), (b"1", b"2")], + vec![(b"", b"1"), (b"1", b"2")], + ), + ( + vec![(b"", b"2"), (b"3", b"4")], + vec![(b"", b"1"), (b"1", b"2"), (b"3", b"4")], + ), + ( + vec![(b"7", b"8"), (b"8", b"9")], + vec![(b"7", b"8"), (b"8", b"9")], + ), + ( + vec![(b"8", b"9"), (b"6", b"8")], + vec![(b"8", b"9"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"", b"35"), (b"88", b"89"), (b"7", b"8")], + vec![ + (b"8", b"85"), + (b"", b"1"), + (b"1", b"2"), + (b"3", b"35"), + (b"88", b"89"), + (b"7", b"8"), + ], + ), + (vec![(b"", b"1")], vec![(b"", b"1")]), + (vec![(b"", b"2")], vec![(b"", b"1"), (b"1", b"2")]), + (vec![(b"1", b"2")], vec![(b"1", b"2")]), + (vec![(b"1", b"3")], vec![(b"1", b"2")]), + (vec![(b"1", b"4")], vec![(b"1", b"2"), (b"3", b"4")]), + (vec![(b"4", b"5")], vec![]), + (vec![(b"4", b"6")], vec![]), + (vec![(b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"3"), (b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"7")], vec![(b"3", b"4")]), + (vec![(b"7", b"8")], vec![(b"7", b"8")]), + ( + vec![(b"3", b"")], + vec![(b"3", b"4"), (b"7", b"9"), (b"9", b"")], + ), + (vec![(b"5", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"7", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"8", b"91")], vec![(b"8", b"9"), (b"9", b"91")]), + (vec![(b"8", b"")], vec![(b"8", b"9"), (b"9", b"")]), + ( + vec![(b"", b"")], + vec![ + (b"", b"1"), + (b"1", b"2"), + (b"3", b"4"), + (b"7", b"9"), + (b"9", b""), + ], + ), + ]; + for (ranges, expect_ranges) in case { + test_seek_backup_ranges(ranges.clone(), expect_ranges.clone()); + test_handle_backup_task_ranges(ranges, expect_ranges); + } + } + #[test] fn test_handle_backup_task() { let limiter = Arc::new(IoRateLimiter::new_for_test()); From 1c915f34a3387d91625fb92902df87d705a32afe Mon Sep 17 
00:00:00 2001 From: Jay Date: Tue, 22 Nov 2022 22:15:58 +0800 Subject: [PATCH 351/676] *: remove legacy code (#13832) ref tikv/tikv#13827 green gc is developed for gc without waking hibernated peers. But since dynamic regions, it's not compatible with physical isolation. And it's never used in production due to correctness concern. Read index RPC is used by tiflash in the past. But now tiflash is using internal message forward instead of explicit RPC. And I'm not aware of any other project is utilizing the API. Signed-off-by: Jay Lee --- components/server/src/server.rs | 6 - components/test_raftstore/src/server.rs | 3 - components/test_raftstore/src/util.rs | 51 - .../gc_worker/applied_lock_collector.rs | 894 ------------------ src/server/gc_worker/gc_manager.rs | 1 - src/server/gc_worker/gc_worker.rs | 262 +---- src/server/gc_worker/mod.rs | 1 - src/server/metrics.rs | 2 - src/server/service/kv.rs | 263 +----- tests/failpoints/cases/test_gc_worker.rs | 284 ------ tests/failpoints/cases/test_kv_service.rs | 89 -- tests/integrations/server/gc_worker.rs | 258 +---- tests/integrations/server/kv_service.rs | 113 +-- 13 files changed, 9 insertions(+), 2218 deletions(-) delete mode 100644 src/server/gc_worker/applied_lock_collector.rs diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 80d44b114b9..625db3e951f 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1106,12 +1106,6 @@ where gc_worker .start(node.id()) .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); - gc_worker - .start_observe_lock_apply( - self.coprocessor_host.as_mut().unwrap(), - self.concurrency_manager.clone(), - ) - .unwrap_or_else(|e| fatal!("gc worker failed to observe lock apply: {}", e)); if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); } diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs 
index 42cefe60496..1b532932b30 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -341,9 +341,6 @@ impl ServerCluster { Arc::new(region_info_accessor.clone()), ); gc_worker.start(node_id).unwrap(); - gc_worker - .start_observe_lock_apply(&mut coprocessor_host, concurrency_manager.clone()) - .unwrap(); let rts_worker = if cfg.resolved_ts.enable { // Resolved ts worker diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index eb8ab3fe885..06c2da432c0 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1083,57 +1083,6 @@ pub fn must_check_txn_status( resp } -pub fn must_physical_scan_lock( - client: &TikvClient, - ctx: Context, - max_ts: u64, - start_key: &[u8], - limit: usize, -) -> Vec { - let mut req = PhysicalScanLockRequest::default(); - req.set_context(ctx); - req.set_max_ts(max_ts); - req.set_start_key(start_key.to_owned()); - req.set_limit(limit as _); - let mut resp = client.physical_scan_lock(&req).unwrap(); - resp.take_locks().into() -} - -pub fn register_lock_observer(client: &TikvClient, max_ts: u64) -> RegisterLockObserverResponse { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.register_lock_observer(&req).unwrap() -} - -pub fn must_register_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = register_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - -pub fn check_lock_observer(client: &TikvClient, max_ts: u64) -> CheckLockObserverResponse { - let mut req = CheckLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.check_lock_observer(&req).unwrap() -} - -pub fn must_check_lock_observer(client: &TikvClient, max_ts: u64, clean: bool) -> Vec { - let mut resp = check_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert_eq!(resp.get_is_clean(), clean); 
- resp.take_locks().into() -} - -pub fn remove_lock_observer(client: &TikvClient, max_ts: u64) -> RemoveLockObserverResponse { - let mut req = RemoveLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.remove_lock_observer(&req).unwrap() -} - -pub fn must_remove_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = remove_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - pub fn get_tso(pd_client: &TestPdClient) -> u64 { block_on(pd_client.get_tso()).unwrap().into_inner() } diff --git a/src/server/gc_worker/applied_lock_collector.rs b/src/server/gc_worker/applied_lock_collector.rs deleted file mode 100644 index 9d0e16f4286..00000000000 --- a/src/server/gc_worker/applied_lock_collector.rs +++ /dev/null @@ -1,894 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - cmp::Ordering::*, - fmt::{self, Debug, Display}, - sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex, - }, -}; - -use concurrency_manager::ConcurrencyManager; -use engine_traits::{CfName, KvEngine, CF_LOCK}; -use keys::origin_key; -use kvproto::{kvrpcpb::LockInfo, raft_cmdpb::CmdType}; -use raftstore::coprocessor::{ - ApplySnapshotObserver, BoxApplySnapshotObserver, BoxQueryObserver, Cmd, Coprocessor, - CoprocessorHost, ObserverContext, QueryObserver, -}; -use tikv_util::worker::{Builder as WorkerBuilder, Runnable, ScheduleError, Scheduler, Worker}; -use txn_types::Key; - -// TODO: Use new error type for GcWorker instead of storage::Error. -use super::{Error, ErrorInner, Result}; -use crate::storage::{ - mvcc::{ErrorInner as MvccErrorInner, Lock, TimeStamp}, - txn::Error as TxnError, -}; - -const MAX_COLLECT_SIZE: usize = 1024; - -/// The state of the observer. Shared between all clones. 
-#[derive(Default)] -struct LockObserverState { - max_ts: AtomicU64, - - /// `is_clean` is true, only it's sure that all applying of stale locks - /// (locks with start_ts <= specified max_ts) are monitored and collected. - /// If there are too many stale locks or any error happens, `is_clean` - /// must be set to `false`. - is_clean: AtomicBool, -} - -impl LockObserverState { - fn load_max_ts(&self) -> TimeStamp { - self.max_ts.load(Ordering::Acquire).into() - } - - fn store_max_ts(&self, max_ts: TimeStamp) { - self.max_ts.store(max_ts.into_inner(), Ordering::Release) - } - - fn is_clean(&self) -> bool { - self.is_clean.load(Ordering::Acquire) - } - - fn mark_clean(&self) { - self.is_clean.store(true, Ordering::Release); - } - - fn mark_dirty(&self) { - self.is_clean.store(false, Ordering::Release); - } -} - -pub type Callback = Box) + Send>; - -enum LockCollectorTask { - // Messages from observer - ObservedLocks(Vec<(Key, Lock)>), - - // Messages from client - StartCollecting { - max_ts: TimeStamp, - callback: Callback<()>, - }, - GetCollectedLocks { - max_ts: TimeStamp, - callback: Callback<(Vec, bool)>, - }, - StopCollecting { - max_ts: TimeStamp, - callback: Callback<()>, - }, -} - -impl Debug for LockCollectorTask { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - LockCollectorTask::ObservedLocks(locks) => f - .debug_struct("ObservedLocks") - .field("locks", locks) - .finish(), - LockCollectorTask::StartCollecting { max_ts, .. } => f - .debug_struct("StartCollecting") - .field("max_ts", max_ts) - .finish(), - LockCollectorTask::GetCollectedLocks { max_ts, .. } => f - .debug_struct("GetCollectedLocks") - .field("max_ts", max_ts) - .finish(), - LockCollectorTask::StopCollecting { max_ts, .. 
} => f - .debug_struct("StopCollecting") - .field("max_ts", max_ts) - .finish(), - } - } -} - -impl Display for LockCollectorTask { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Debug::fmt(&self, f) - } -} - -/// `LockObserver` observes apply events and apply snapshot events. If it -/// happens in CF_LOCK, it checks the `start_ts`s of the locks being written. If -/// a lock's `start_ts` <= specified `max_ts` in the `state`, it will send the -/// lock to through the `sender`, so the receiver can collect it. -#[derive(Clone)] -struct LockObserver { - state: Arc, - sender: Scheduler, -} - -impl LockObserver { - pub fn new(state: Arc, sender: Scheduler) -> Self { - Self { state, sender } - } - - pub fn register(self, coprocessor_host: &mut CoprocessorHost) { - coprocessor_host - .registry - .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(self.clone())); - coprocessor_host - .registry - .register_query_observer(1, BoxQueryObserver::new(self)); - } - - fn send(&self, locks: Vec<(Key, Lock)>) { - #[cfg(feature = "failpoints")] - let injected_full = (|| { - fail_point!("lock_observer_send_full", |_| { - info!("[failpoint] injected lock observer channel full"; "locks" => ?locks); - true - }); - false - })(); - #[cfg(not(feature = "failpoints"))] - let injected_full = false; - - let res = if injected_full { - Err(ScheduleError::Full(LockCollectorTask::ObservedLocks(locks))) - } else { - self.sender - .schedule(LockCollectorTask::ObservedLocks(locks)) - }; - - match res { - Ok(()) => (), - Err(ScheduleError::Stopped(_)) => { - error!("lock observer failed to send locks because collector is stopped"); - } - Err(ScheduleError::Full(_)) => { - fail_point!("lock_observer_before_mark_dirty_on_full"); - self.state.mark_dirty(); - warn!("cannot collect all applied lock because channel is full"); - } - } - } -} - -impl Coprocessor for LockObserver {} - -impl QueryObserver for LockObserver { - fn post_apply_query(&self, _: &mut ObserverContext<'_>, 
cmd: &Cmd) { - fail_point!("notify_lock_observer_query"); - let max_ts = self.state.load_max_ts(); - if max_ts.is_zero() { - return; - } - - if !self.state.is_clean() { - return; - } - - let mut locks = vec![]; - // For each put in CF_LOCK, collect it if its ts <= max_ts. - for req in cmd.request.get_requests() { - if req.get_cmd_type() != CmdType::Put { - continue; - } - let put_request = req.get_put(); - if put_request.get_cf() != CF_LOCK { - continue; - } - - let lock = match Lock::parse(put_request.get_value()) { - Ok(l) => l, - Err(e) => { - error!(?e; - "cannot parse lock"; - "value" => log_wrappers::Value::value(put_request.get_value()), - ); - self.state.mark_dirty(); - return; - } - }; - - if lock.ts <= max_ts { - let key = Key::from_encoded_slice(put_request.get_key()); - locks.push((key, lock)); - } - } - if !locks.is_empty() { - self.send(locks); - } - } -} - -impl ApplySnapshotObserver for LockObserver { - fn apply_plain_kvs( - &self, - _: &mut ObserverContext<'_>, - cf: CfName, - kv_pairs: &[(Vec, Vec)], - ) { - fail_point!("notify_lock_observer_snapshot"); - if cf != CF_LOCK { - return; - } - - let max_ts = self.state.load_max_ts(); - if max_ts.is_zero() { - return; - } - - if !self.state.is_clean() { - return; - } - - let locks: Result> = kv_pairs - .iter() - .map(|(key, value)| { - Lock::parse(value) - .map(|lock| (key, lock)) - .map_err(|e| ErrorInner::Txn(TxnError::from_mvcc(e)).into()) - }) - .filter(|result| result.is_err() || result.as_ref().unwrap().1.ts <= max_ts) - .map(|result| { - // `apply_plain_keys` will be invoked with the data_key in RocksDB layer. So we - // need to remove the `z` prefix. 
- result.map(|(key, lock)| (Key::from_encoded_slice(origin_key(key)), lock)) - }) - .collect(); - - match locks { - Err(e) => { - error!(?e; "cannot parse lock"); - self.state.mark_dirty() - } - Ok(l) => self.send(l), - } - } - - fn apply_sst(&self, _: &mut ObserverContext<'_>, cf: CfName, _path: &str) { - if cf == CF_LOCK { - error!("cannot collect all applied lock: snapshot of lock cf applied from sst file"); - self.state.mark_dirty(); - } - } -} - -struct LockCollectorRunner { - observer_state: Arc, - - collected_locks: Vec<(Key, Lock)>, -} - -impl LockCollectorRunner { - pub fn new(observer_state: Arc) -> Self { - Self { - observer_state, - collected_locks: vec![], - } - } - - fn handle_observed_locks(&mut self, mut locks: Vec<(Key, Lock)>) { - if self.collected_locks.len() >= MAX_COLLECT_SIZE { - return; - } - - if locks.len() + self.collected_locks.len() >= MAX_COLLECT_SIZE { - self.observer_state.mark_dirty(); - info!("lock collector marked dirty because received too many locks"); - locks.truncate(MAX_COLLECT_SIZE - self.collected_locks.len()); - } - self.collected_locks.extend(locks); - } - - fn start_collecting(&mut self, max_ts: TimeStamp) -> Result<()> { - let curr_max_ts = self.observer_state.load_max_ts(); - match max_ts.cmp(&curr_max_ts) { - Less => Err(box_err!( - "collecting locks with a greater max_ts: {}", - curr_max_ts - )), - Equal => { - // Stale request. Ignore it. - Ok(()) - } - Greater => { - info!("start collecting locks"; "max_ts" => max_ts); - self.collected_locks.clear(); - // TODO: `is_clean` may be unexpectedly set to false here, if any error happens - // on a previous observing. It need to be solved, although it's very unlikely to - // happen and doesn't affect correctness of data. 
- self.observer_state.mark_clean(); - self.observer_state.store_max_ts(max_ts); - Ok(()) - } - } - } - - fn get_collected_locks(&mut self, max_ts: TimeStamp) -> Result<(Vec, bool)> { - let curr_max_ts = self.observer_state.load_max_ts(); - if curr_max_ts != max_ts { - warn!( - "trying to fetch collected locks but now collecting with another max_ts"; - "req_max_ts" => max_ts, - "current_max_ts" => curr_max_ts, - ); - return Err(box_err!( - "trying to fetch collected locks but now collecting with another max_ts" - )); - } - - let locks: Result<_> = self - .collected_locks - .iter() - .map(|(k, l)| { - k.to_raw() - .map(|raw_key| l.clone().into_lock_info(raw_key)) - .map_err(|e| Error::from(TxnError::from_mvcc(e))) - }) - .collect(); - - Ok((locks?, self.observer_state.is_clean())) - } - - fn stop_collecting(&mut self, max_ts: TimeStamp) -> Result<()> { - let res = self.observer_state.max_ts.compare_exchange( - max_ts.into_inner(), - 0, - Ordering::SeqCst, - Ordering::SeqCst, - ); - if res.is_ok() { - self.collected_locks.clear(); - info!("stop collecting locks"; "max_ts" => max_ts); - Ok(()) - } else { - warn!( - "trying to stop collecting locks, but now collecting with a different max_ts"; - "stopping_max_ts" => max_ts, - "current_max_ts" => TimeStamp::new(res.unwrap_err()), - ); - Err(box_err!("collecting locks with another max_ts")) - } - } -} - -impl Runnable for LockCollectorRunner { - type Task = LockCollectorTask; - - fn run(&mut self, task: LockCollectorTask) { - match task { - LockCollectorTask::ObservedLocks(locks) => self.handle_observed_locks(locks), - LockCollectorTask::StartCollecting { max_ts, callback } => { - callback(self.start_collecting(max_ts)) - } - LockCollectorTask::GetCollectedLocks { max_ts, callback } => { - callback(self.get_collected_locks(max_ts)) - } - LockCollectorTask::StopCollecting { max_ts, callback } => { - callback(self.stop_collecting(max_ts)) - } - } - } -} - -pub struct AppliedLockCollector { - worker: Mutex, - scheduler: 
Scheduler, - concurrency_manager: ConcurrencyManager, -} - -impl AppliedLockCollector { - pub fn new( - coprocessor_host: &mut CoprocessorHost, - concurrency_manager: ConcurrencyManager, - ) -> Result { - let worker = Mutex::new(WorkerBuilder::new("lock-collector").create()); - - let state = Arc::new(LockObserverState::default()); - let runner = LockCollectorRunner::new(Arc::clone(&state)); - let scheduler = worker.lock().unwrap().start("lock-collector", runner); - let observer = LockObserver::new(state, scheduler.clone()); - - observer.register(coprocessor_host); - - // Start the worker - - Ok(Self { - worker, - scheduler, - concurrency_manager, - }) - } - - pub fn stop(&self) { - self.worker.lock().unwrap().stop(); - } - - /// Starts collecting applied locks whose `start_ts` <= `max_ts`. Only one - /// `max_ts` is valid at one time. - pub fn start_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { - // Before starting collecting, check the concurrency manager to avoid later - // prewrite requests uses a min_commit_ts less than the safepoint. - // `max_ts` here is the safepoint of the current round of GC. - // Ths is similar to that we update max_ts and check memory lock when handling - // other transactional read requests. However this is done at start_collecting - // instead of physical_scan_locks. The reason is that, to fully scan a TiKV - // store, it might needs more than one physical_scan_lock requests. However - // memory lock needs to be checked before scanning the locks, and we can't know - // the `end_key` of the scan range at that time. As a result, each - // physical_scan_lock request will cause scanning memory lock from the start_key - // to the very-end of the TiKV node, which is a waste. But since we always start - // collecting applied locks before physical scan lock, so a better idea is to - // check the memory lock before physical_scan_lock. 
- self.concurrency_manager.update_max_ts(max_ts); - self.concurrency_manager - .read_range_check(None, None, |key, lock| { - // `Lock::check_ts_conflict` can't be used here, because LockType::Lock - // can't be ignored in this case. - if lock.ts <= max_ts { - Err(TxnError::from_mvcc(MvccErrorInner::KeyIsLocked( - lock.clone().into_lock_info(key.to_raw()?), - ))) - } else { - Ok(()) - } - })?; - self.scheduler - .schedule(LockCollectorTask::StartCollecting { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } - - /// Get the collected locks after `start_collecting`. Only valid when - /// `max_ts` matches the `max_ts` provided to `start_collecting`. - /// Collects at most `MAX_COLLECT_SIZE` locks. If there are (even - /// potentially) more locks than `MAX_COLLECT_SIZE` or any error happens, - /// the flag `is_clean` will be unset, which represents - /// `AppliedLockCollector` cannot collect all locks. - pub fn get_collected_locks( - &self, - max_ts: TimeStamp, - callback: Callback<(Vec, bool)>, - ) -> Result<()> { - self.scheduler - .schedule(LockCollectorTask::GetCollectedLocks { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } - - /// Stop collecting locks. Only valid when `max_ts` matches the `max_ts` - /// provided to `start_collecting`. 
- pub fn stop_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { - self.scheduler - .schedule(LockCollectorTask::StopCollecting { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } -} - -impl Drop for AppliedLockCollector { - fn drop(&mut self) { - self.stop(); - } -} - -#[cfg(test)] -mod tests { - use std::sync::mpsc::channel; - - use engine_test::kv::KvTestEngine; - use engine_traits::CF_DEFAULT; - use futures::executor::block_on; - use kvproto::{ - kvrpcpb::Op, - metapb::Region, - raft_cmdpb::{PutRequest, RaftCmdRequest, RaftCmdResponse, Request as RaftRequest}, - }; - use txn_types::LockType; - - use super::*; - - fn lock_info_to_kv(mut lock_info: LockInfo) -> (Vec, Vec) { - let key = Key::from_raw(lock_info.get_key()).into_encoded(); - let lock = Lock::new( - match lock_info.get_lock_type() { - Op::Put => LockType::Put, - Op::Del => LockType::Delete, - Op::Lock => LockType::Lock, - Op::PessimisticLock => LockType::Pessimistic, - _ => unreachable!(), - }, - lock_info.take_primary_lock(), - lock_info.get_lock_version().into(), - lock_info.get_lock_ttl(), - None, - 0.into(), - lock_info.get_txn_size(), - 0.into(), - ); - let value = lock.to_bytes(); - (key, value) - } - - fn make_apply_request( - key: Vec, - value: Vec, - cf: &str, - cmd_type: CmdType, - ) -> RaftRequest { - let mut put_req = PutRequest::default(); - put_req.set_cf(cf.to_owned()); - put_req.set_key(key); - put_req.set_value(value); - - let mut req = RaftRequest::default(); - req.set_cmd_type(cmd_type); - req.set_put(put_req); - req - } - - fn make_raft_cmd(requests: Vec) -> Cmd { - let mut req = RaftCmdRequest::default(); - req.set_requests(requests.into()); - Cmd::new(0, 0, req, RaftCmdResponse::default()) - } - - fn new_test_collector() -> (AppliedLockCollector, CoprocessorHost) { - let mut coprocessor_host = CoprocessorHost::default(); - let collector = - AppliedLockCollector::new(&mut coprocessor_host, 
ConcurrencyManager::new(1.into())) - .unwrap(); - (collector, coprocessor_host) - } - - fn start_collecting(c: &AppliedLockCollector, max_ts: u64) -> Result<()> { - let (tx, rx) = channel(); - c.start_collecting(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .and_then(move |()| rx.recv().unwrap()) - } - - fn get_collected_locks(c: &AppliedLockCollector, max_ts: u64) -> Result<(Vec, bool)> { - let (tx, rx) = channel(); - c.get_collected_locks(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .unwrap(); - rx.recv().unwrap() - } - - fn stop_collecting(c: &AppliedLockCollector, max_ts: u64) -> Result<()> { - let (tx, rx) = channel(); - c.stop_collecting(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .unwrap(); - rx.recv().unwrap() - } - - #[test] - fn test_start_stop() { - let (c, _) = new_test_collector(); - // Not started. - get_collected_locks(&c, 1).unwrap_err(); - stop_collecting(&c, 1).unwrap_err(); - - // Started. - start_collecting(&c, 2).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 2.into()); - get_collected_locks(&c, 2).unwrap(); - stop_collecting(&c, 2).unwrap(); - // Stopped. - get_collected_locks(&c, 2).unwrap_err(); - stop_collecting(&c, 2).unwrap_err(); - - // When start_collecting is invoked with a larger ts, the later one will - // ovewrite the previous one. - start_collecting(&c, 3).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 3.into()); - get_collected_locks(&c, 3).unwrap(); - get_collected_locks(&c, 4).unwrap_err(); - start_collecting(&c, 4).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 4.into()); - get_collected_locks(&c, 3).unwrap_err(); - get_collected_locks(&c, 4).unwrap(); - // Do not allow aborting previous observing with a smaller max_ts. - start_collecting(&c, 3).unwrap_err(); - get_collected_locks(&c, 3).unwrap_err(); - get_collected_locks(&c, 4).unwrap(); - // Do not allow stoping observing with a different max_ts. 
- stop_collecting(&c, 3).unwrap_err(); - stop_collecting(&c, 5).unwrap_err(); - stop_collecting(&c, 4).unwrap(); - } - - #[test] - fn test_check_memlock_on_start() { - let (c, _) = new_test_collector(); - let cm = c.concurrency_manager.clone(); - - let mem_lock = |k: &[u8], ts: u64, lock_type| { - let key = Key::from_raw(k); - let guard = block_on(cm.lock_key(&key)); - guard.with_lock(|lock| { - *lock = Some(txn_types::Lock::new( - lock_type, - k.to_vec(), - ts.into(), - 100, - None, - 0.into(), - 1, - 20.into(), - )); - }); - guard - }; - - let guard = mem_lock(b"a", 100, LockType::Put); - start_collecting(&c, 90).unwrap(); - stop_collecting(&c, 90).unwrap(); - start_collecting(&c, 100).unwrap_err(); - // Use get_collected_locks to check it's not collecting. - get_collected_locks(&c, 100).unwrap_err(); - start_collecting(&c, 110).unwrap_err(); - get_collected_locks(&c, 110).unwrap_err(); - drop(guard); - - let guard = mem_lock(b"b", 100, LockType::Lock); - start_collecting(&c, 90).unwrap(); - stop_collecting(&c, 90).unwrap(); - start_collecting(&c, 100).unwrap_err(); - get_collected_locks(&c, 100).unwrap_err(); - start_collecting(&c, 110).unwrap_err(); - get_collected_locks(&c, 110).unwrap_err(); - drop(guard); - - start_collecting(&c, 200).unwrap(); - stop_collecting(&c, 200).unwrap(); - } - - #[test] - fn test_apply() { - let locks: Vec<_> = vec![ - (b"k0", 10), - (b"k1", 110), - (b"k5", 100), - (b"k2", 101), - (b"k3", 90), - (b"k2", 99), - ] - .into_iter() - .map(|(k, ts)| { - let mut lock_info = LockInfo::default(); - lock_info.set_key(k.to_vec()); - lock_info.set_primary_lock(k.to_vec()); - lock_info.set_lock_type(Op::Put); - lock_info.set_lock_version(ts); - lock_info - }) - .collect(); - let lock_kvs: Vec<_> = locks - .iter() - .map(|lock| lock_info_to_kv(lock.clone())) - .collect(); - - let (c, coprocessor_host) = new_test_collector(); - let mut expected_result = vec![]; - - start_collecting(&c, 100).unwrap(); - assert_eq!(get_collected_locks(&c, 
100).unwrap(), (vec![], true)); - - // Only puts in lock cf will be monitered. - let req = vec![ - make_apply_request( - lock_kvs[0].0.clone(), - lock_kvs[0].1.clone(), - CF_LOCK, - CmdType::Put, - ), - make_apply_request(b"1".to_vec(), b"1".to_vec(), CF_DEFAULT, CmdType::Put), - make_apply_request(b"2".to_vec(), b"2".to_vec(), CF_LOCK, CmdType::Delete), - ]; - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req)); - expected_result.push(locks[0].clone()); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result.clone(), true) - ); - - // When start collecting with the same max_ts again, shouldn't clean up the - // observer state. - start_collecting(&c, 100).unwrap(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result.clone(), true) - ); - - // Only locks with ts <= 100 will be collected. - let req: Vec<_> = lock_kvs - .iter() - .map(|(k, v)| make_apply_request(k.clone(), v.clone(), CF_LOCK, CmdType::Put)) - .collect(); - expected_result.extend( - locks - .iter() - .filter(|l| l.get_lock_version() <= 100) - .cloned(), - ); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req.clone())); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result, true) - ); - - // When start_collecting is double-invoked again with larger ts, the previous - // results are dropped. 
- start_collecting(&c, 110).unwrap(); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req)); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks, true)); - } - - #[test] - fn test_apply_snapshot() { - let locks: Vec<_> = vec![ - (b"k0", 10), - (b"k1", 110), - (b"k5", 100), - (b"k2", 101), - (b"k3", 90), - (b"k2", 99), - ] - .into_iter() - .map(|(k, ts)| { - let mut lock_info = LockInfo::default(); - lock_info.set_key(k.to_vec()); - lock_info.set_primary_lock(k.to_vec()); - lock_info.set_lock_type(Op::Put); - lock_info.set_lock_version(ts); - lock_info - }) - .collect(); - let lock_kvs: Vec<_> = locks - .iter() - .map(|lock| lock_info_to_kv(lock.clone())) - .map(|(k, v)| (keys::data_key(&k), v)) - .collect(); - - let (c, coprocessor_host) = new_test_collector(); - start_collecting(&c, 100).unwrap(); - - // Apply plain file to other CFs. Nothing happens. - coprocessor_host.post_apply_plain_kvs_from_snapshot( - &Region::default(), - CF_DEFAULT, - &lock_kvs, - ); - assert_eq!(get_collected_locks(&c, 100).unwrap(), (vec![], true)); - - // Apply plain file to lock cf. Locks with ts before 100 will be collected. - let expected_locks: Vec<_> = locks - .iter() - .filter(|l| l.get_lock_version() <= 100) - .cloned() - .collect(); - coprocessor_host.post_apply_plain_kvs_from_snapshot(&Region::default(), CF_LOCK, &lock_kvs); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - // Fetch result twice gets the same result. - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - - // When stale start_collecting request arrives, the previous collected results - // shouldn't be dropped. 
- start_collecting(&c, 100).unwrap(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - start_collecting(&c, 90).unwrap_err(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks, true) - ); - - // When start_collecting is double-invoked again with larger ts, the previous - // results are dropped. - start_collecting(&c, 110).unwrap(); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); - coprocessor_host.post_apply_plain_kvs_from_snapshot(&Region::default(), CF_LOCK, &lock_kvs); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks.clone(), true)); - - // Apply SST file to other cfs. Nothing happens. - coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_DEFAULT, ""); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks.clone(), true)); - - // Apply SST file to lock cf is not supported. This will cause error and - // therefore `is_clean` will be set to false. - coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_LOCK, ""); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks, false)); - } - - #[test] - fn test_not_clean() { - let (c, coprocessor_host) = new_test_collector(); - start_collecting(&c, 1).unwrap(); - // When error happens, `is_clean` should be set to false. - // The value is not a valid lock. - let (k, v) = (Key::from_raw(b"k1").into_encoded(), b"v1".to_vec()); - let req = make_apply_request(k.clone(), v.clone(), CF_LOCK, CmdType::Put); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(vec![req])); - assert_eq!(get_collected_locks(&c, 1).unwrap(), (vec![], false)); - - // `is_clean` should be reset after invoking `start_collecting`. 
- start_collecting(&c, 2).unwrap(); - assert_eq!(get_collected_locks(&c, 2).unwrap(), (vec![], true)); - coprocessor_host.post_apply_plain_kvs_from_snapshot( - &Region::default(), - CF_LOCK, - &[(keys::data_key(&k), v)], - ); - assert_eq!(get_collected_locks(&c, 2).unwrap(), (vec![], false)); - - start_collecting(&c, 3).unwrap(); - assert_eq!(get_collected_locks(&c, 3).unwrap(), (vec![], true)); - - // If there are too many locks, `is_clean` should be set to false. - let mut lock = LockInfo::default(); - lock.set_key(b"k2".to_vec()); - lock.set_primary_lock(b"k2".to_vec()); - lock.set_lock_type(Op::Put); - lock.set_lock_version(1); - - let batch_generate_locks = |count| { - let (k, v) = lock_info_to_kv(lock.clone()); - let req = make_apply_request(k, v, CF_LOCK, CmdType::Put); - let raft_cmd = make_raft_cmd(vec![req; count]); - coprocessor_host.post_apply(&Region::default(), &raft_cmd); - }; - - batch_generate_locks(MAX_COLLECT_SIZE - 1); - let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE - 1); - assert!(is_clean); - - batch_generate_locks(1); - let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - - batch_generate_locks(1); - // If there are more locks, they will be dropped. 
- let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - - start_collecting(&c, 4).unwrap(); - assert_eq!(get_collected_locks(&c, 4).unwrap(), (vec![], true)); - - batch_generate_locks(MAX_COLLECT_SIZE - 5); - let (locks, is_clean) = get_collected_locks(&c, 4).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE - 5); - assert!(is_clean); - - batch_generate_locks(10); - let (locks, is_clean) = get_collected_locks(&c, 4).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - } -} diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index 01e37727f11..4f528d8c356 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -653,7 +653,6 @@ mod tests { } => callback, GcTask::GcKeys { .. } => unreachable!(), GcTask::RawGcKeys { .. } => unreachable!(), - GcTask::PhysicalScanLock { .. } => unreachable!(), GcTask::OrphanVersions { .. } => unreachable!(), GcTask::Validate(_) => unreachable!(), }; diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 81de11cbae9..9e3f79654bc 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -22,16 +22,9 @@ use engine_traits::{ }; use file_system::{IoType, WithIoType}; use futures::executor::block_on; -use kvproto::{ - kvrpcpb::{Context, LockInfo}, - metapb::Region, -}; +use kvproto::{kvrpcpb::Context, metapb::Region}; use pd_client::{FeatureGate, PdClient}; -use raftstore::{ - coprocessor::{CoprocessorHost, RegionInfoProvider}, - router::RaftStoreRouter, - store::msg::StoreMsg, -}; +use raftstore::{coprocessor::RegionInfoProvider, router::RaftStoreRouter, store::msg::StoreMsg}; use tikv_kv::{CfStatistics, CursorBuilder, Modify, SnapContext}; use tikv_util::{ config::{Tracker, VersionTrack}, @@ -43,7 +36,6 @@ use tikv_util::{ use txn_types::{Key, TimeStamp}; use super::{ - 
applied_lock_collector::{AppliedLockCollector, Callback as LockCollectorCallback}, check_need_gc, compaction_filter::{ CompactionFilterInitializer, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, @@ -115,14 +107,6 @@ where callback: Callback<()>, region_info_provider: Arc, }, - PhysicalScanLock { - ctx: Context, - max_ts: TimeStamp, - start_key: Key, - limit: usize, - callback: Callback>, - region_info_provider: Arc, - }, /// If GC in compaction filter is enabled, versions on default CF will be /// handled with `DB::delete` in write CF's compaction filter. However if /// the compaction filter finds the DB is stalled, it will send the task @@ -149,7 +133,6 @@ where GcTask::GcKeys { .. } => GcCommandKind::gc_keys, GcTask::RawGcKeys { .. } => GcCommandKind::raw_gc_keys, GcTask::UnsafeDestroyRange { .. } => GcCommandKind::unsafe_destroy_range, - GcTask::PhysicalScanLock { .. } => GcCommandKind::physical_scan_lock, GcTask::OrphanVersions { .. } => GcCommandKind::orphan_versions, #[cfg(any(test, feature = "testexport"))] GcTask::Validate(_) => GcCommandKind::validate_config, @@ -179,10 +162,6 @@ where .field("start_key", &format!("{}", start_key)) .field("end_key", &format!("{}", end_key)) .finish(), - GcTask::PhysicalScanLock { max_ts, .. 
} => f - .debug_struct("PhysicalScanLock") - .field("max_ts", max_ts) - .finish(), GcTask::OrphanVersions { id, wb } => f .debug_struct("OrphanVersions") .field("id", id) @@ -870,47 +849,6 @@ where Ok(()) } - fn handle_physical_scan_lock( - &mut self, - _: &Context, - max_ts: TimeStamp, - start_key: &Key, - limit: usize, - regions_provider: Arc, - ) -> Result> { - let store_id = self.store_id; - let regions = box_try!(regions_provider.get_regions_in_range(start_key.as_encoded(), &[])) - .into_iter() - .filter(move |r| find_peer(r, store_id).is_some()); - - let mut first_round = true; - let mut locks = Vec::new(); - for region in regions { - let start_key = { - if first_round { - first_round = false; - start_key.clone() - } else { - Key::from_raw(region.get_start_key()) - } - }; - let snap = self.get_snapshot(store_id, ®ion)?; - let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); - let (locks_this_region, _) = reader - .scan_locks(Some(&start_key), None, |l| l.ts <= max_ts, limit) - .map_err(TxnError::from_mvcc)?; - - locks.extend(locks_this_region); - } - - let mut lock_infos = Vec::with_capacity(locks.len()); - for (key, lock) in locks { - let raw_key = key.into_raw().map_err(TxnError::from_mvcc)?; - lock_infos.push(lock.into_lock_info(raw_key)); - } - Ok(lock_infos) - } - fn update_statistics_metrics(&mut self, key_mode: GcKeyMode) { if let Some(mut_stats) = self.stats_map.get_mut(&key_mode) { let stats = mem::take(mut_stats); @@ -1064,31 +1002,6 @@ where end_key ); } - GcTask::PhysicalScanLock { - ctx, - max_ts, - start_key, - limit, - callback, - region_info_provider, - } => { - let res = self.handle_physical_scan_lock( - &ctx, - max_ts, - &start_key, - limit, - region_info_provider, - ); - update_metrics(res.is_err()); - callback(res); - slow_log!( - T timer, - "PhysicalScanLock start_key {:?}, max_ts {}, limit {}", - start_key, - max_ts, - limit, - ); - } GcTask::OrphanVersions { mut wb, id } => { info!("handling GcTask::OrphanVersions"; 
"id" => id); let mut wopts = WriteOptions::default(); @@ -1121,9 +1034,6 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res GcTask::Gc { callback, .. } | GcTask::UnsafeDestroyRange { callback, .. } => { callback(Err(Error::from(ErrorInner::GcWorkerTooBusy))) } - GcTask::PhysicalScanLock { callback, .. } => { - callback(Err(Error::from(ErrorInner::GcWorkerTooBusy))) - } // Attention: If you are adding a new GcTask, do not forget to call the callback if it has a // callback. GcTask::GcKeys { .. } | GcTask::RawGcKeys { .. } | GcTask::OrphanVersions { .. } => {} @@ -1184,8 +1094,6 @@ where worker: Arc>>>, worker_scheduler: Scheduler>, - applied_lock_collector: Option>, - gc_manager_handle: Arc>>, feature_gate: FeatureGate, } @@ -1207,7 +1115,6 @@ where refs: self.refs.clone(), worker: self.worker.clone(), worker_scheduler: self.worker_scheduler.clone(), - applied_lock_collector: self.applied_lock_collector.clone(), gc_manager_handle: self.gc_manager_handle.clone(), feature_gate: self.feature_gate.clone(), region_info_provider: self.region_info_provider.clone(), @@ -1259,7 +1166,6 @@ where refs: Arc::new(AtomicUsize::new(1)), worker: Arc::new(Mutex::new(worker)), worker_scheduler, - applied_lock_collector: None, gc_manager_handle: Arc::new(Mutex::new(None)), feature_gate, region_info_provider, @@ -1314,20 +1220,6 @@ where Ok(()) } - pub fn start_observe_lock_apply( - &mut self, - coprocessor_host: &mut CoprocessorHost, - concurrency_manager: ConcurrencyManager, - ) -> Result<()> { - assert!(self.applied_lock_collector.is_none()); - let collector = Arc::new(AppliedLockCollector::new( - coprocessor_host, - concurrency_manager, - )?); - self.applied_lock_collector = Some(collector); - Ok(()) - } - pub fn stop(&self) -> Result<()> { // Stop GcManager. 
if let Some(h) = self.gc_manager_handle.lock().unwrap().take() { @@ -1387,61 +1279,6 @@ where pub fn get_config_manager(&self) -> GcWorkerConfigManager { self.config_manager.clone() } - - pub fn physical_scan_lock( - &self, - ctx: Context, - max_ts: TimeStamp, - start_key: Key, - limit: usize, - callback: Callback>, - ) -> Result<()> { - GC_COMMAND_COUNTER_VEC_STATIC.physical_scan_lock.inc(); - - self.worker_scheduler - .schedule(GcTask::PhysicalScanLock { - ctx, - max_ts, - start_key, - limit, - callback, - region_info_provider: self.region_info_provider.clone(), - }) - .or_else(handle_gc_task_schedule_error) - } - - pub fn start_collecting( - &self, - max_ts: TimeStamp, - callback: LockCollectorCallback<()>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.start_collecting(max_ts, callback)) - } - - pub fn get_collected_locks( - &self, - max_ts: TimeStamp, - callback: LockCollectorCallback<(Vec, bool)>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.get_collected_locks(max_ts, callback)) - } - - pub fn stop_collecting( - &self, - max_ts: TimeStamp, - callback: LockCollectorCallback<()>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.stop_collecting(max_ts, callback)) - } } #[cfg(any(test, feature = "testexport"))] @@ -1626,7 +1463,7 @@ mod tests { use std::{ collections::{BTreeMap, BTreeSet}, path::Path, - sync::mpsc::{self, channel}, + sync::mpsc, thread, time::Duration, }; @@ -1635,23 +1472,18 @@ mod tests { use engine_rocks::{util::get_cf_handle, RocksEngine}; use engine_traits::Peekable as _; use futures::executor::block_on; - use kvproto::{ - kvrpcpb::{ApiVersion, Op}, - metapb::Peer, - }; + use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; use 
raft::StateRole; use raftstore::{ coprocessor::{ region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, - RegionChangeEvent, + CoprocessorHost, RegionChangeEvent, }, router::RaftStoreBlackHole, }; use tempfile::Builder; use tikv_kv::Snapshot; - use tikv_util::{ - codec::number::NumberEncoder, future::paired_future_callback, store::new_peer, - }; + use tikv_util::store::new_peer; use txn_types::Mutation; use super::{test_gc_worker::MultiRocksEngine, *}; @@ -1955,88 +1787,6 @@ mod tests { .unwrap(); } - #[test] - fn test_physical_scan_lock() { - let store_id = 1; - let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( - prefixed_engine.clone(), - MockLockManager::new(), - ) - .build() - .unwrap(); - let (tx, _rx) = mpsc::channel(); - let mut region = Region::default(); - region.mut_peers().push(new_peer(store_id, 0)); - let mut gc_worker = GcWorker::new( - prefixed_engine, - RaftStoreBlackHole, - tx, - GcConfig::default(), - FeatureGate::default(), - Arc::new(MockRegionInfoProvider::new(vec![region])), - ); - gc_worker.start(store_id).unwrap(); - - let physical_scan_lock = |max_ts: u64, start_key, limit| { - let (cb, f) = paired_future_callback(); - gc_worker - .physical_scan_lock(Context::default(), max_ts.into(), start_key, limit, cb) - .unwrap(); - block_on(f).unwrap() - }; - - let mut expected_lock_info = Vec::new(); - - // Put locks into the storage. - for i in 0..50 { - let mut k = vec![]; - k.encode_u64(i).unwrap(); - let v = k.clone(); - - let mutation = Mutation::make_put(Key::from_raw(&k), v); - - let lock_ts = 10 + i % 3; - - // Collect all locks with ts <= 11 to check the result of physical_scan_lock. 
- if lock_ts <= 11 { - let mut info = LockInfo::default(); - info.set_primary_lock(k.clone()); - info.set_lock_version(lock_ts); - info.set_key(k.clone()); - info.set_lock_type(Op::Put); - expected_lock_info.push(info) - } - - let (tx, rx) = channel(); - storage - .sched_txn_command( - commands::Prewrite::with_defaults(vec![mutation], k, lock_ts.into()), - Box::new(move |res| tx.send(res).unwrap()), - ) - .unwrap(); - rx.recv() - .unwrap() - .unwrap() - .locks - .into_iter() - .for_each(|r| r.unwrap()); - } - - let res = physical_scan_lock(11, Key::from_raw(b""), 50).unwrap(); - assert_eq!(res, expected_lock_info); - - let res = physical_scan_lock(11, Key::from_raw(b""), 5).unwrap(); - assert_eq!(res[..], expected_lock_info[..5]); - - let mut start_key = vec![]; - start_key.encode_u64(4).unwrap(); - let res = physical_scan_lock(11, Key::from_raw(&start_key), 6).unwrap(); - // expected_locks[3] is the key 4. - assert_eq!(res[..], expected_lock_info[3..9]); - } - #[test] fn test_gc_keys_with_region_info_provider() { let store_id = 1; diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index 5b43b9b4be3..a5b8837cd2e 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -1,6 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -mod applied_lock_collector; pub mod compaction_filter; mod config; mod gc_manager; diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 3e07a75899f..23f8256835b 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -51,7 +51,6 @@ make_auto_flush_static_metric! { raw_compare_and_swap, raw_checksum, unsafe_destroy_range, - physical_scan_lock, register_lock_observer, check_lock_observer, remove_lock_observer, @@ -71,7 +70,6 @@ make_auto_flush_static_metric! 
{ gc_keys, raw_gc_keys, unsafe_destroy_range, - physical_scan_lock, validate_config, orphan_versions, } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 54b0dc6782b..fa2235b51e7 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -20,7 +20,6 @@ use kvproto::{ errorpb::{Error as RegionError, *}, kvrpcpb::*, mpp::*, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request as RaftRequest}, raft_serverpb::*, tikvpb::*, }; @@ -31,7 +30,7 @@ use raftstore::{ store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, metrics::RAFT_ENTRIES_CACHES_GAUGE, - Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, + Callback, CasualMessage, CheckLeaderTask, }, DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, }; @@ -519,169 +518,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ctx.spawn(task); } - fn register_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: RegisterLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.start_collecting(req.get_max_ts().into(), cb); - - let task = async move { - // Here except for the receiving error of `futures::channel::oneshot`, - // other errors will be returned as the successful response of rpc. 
- let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = RegisterLockObserverResponse::default(); - if let Err(e) = res { - resp.set_error(format!("{}", e)); - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .register_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "register_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.register_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn check_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: CheckLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self - .gc_worker - .get_collected_locks(req.get_max_ts().into(), cb); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = CheckLockObserverResponse::default(); - match res { - Ok((locks, is_clean)) => { - resp.set_is_clean(is_clean); - resp.set_locks(locks.into()); - } - Err(e) => resp.set_error(format!("{}", e)), - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .check_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "check_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.check_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn remove_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: RemoveLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.stop_collecting(req.get_max_ts().into(), cb); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = RemoveLockObserverResponse::default(); - if let Err(e) = res { - 
resp.set_error(format!("{}", e)); - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .remove_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "remove_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.remove_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn physical_scan_lock( - &mut self, - ctx: RpcContext<'_>, - mut req: PhysicalScanLockRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.physical_scan_lock( - req.take_context(), - req.get_max_ts().into(), - Key::from_raw(req.get_start_key()), - req.get_limit() as _, - cb, - ); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = PhysicalScanLockResponse::default(); - match res { - Ok(locks) => resp.set_locks(locks.into()), - Err(e) => resp.set_error(format!("{}", e)), - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .physical_scan_lock - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "physical_scan_lock" - ); - GRPC_MSG_FAIL_COUNTER.physical_scan_lock.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - fn unsafe_destroy_range( &mut self, ctx: RpcContext<'_>, @@ -976,103 +812,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ctx.spawn(task); } - fn read_index( - &mut self, - ctx: RpcContext<'_>, - req: ReadIndexRequest, - sink: UnarySink, - ) { - forward_unary!(self.proxy, read_index, ctx, req, sink); - let begin_instant = Instant::now(); - - let region_id = req.get_context().get_region_id(); - let mut cmd = RaftCmdRequest::default(); - let mut header = RaftRequestHeader::default(); - let mut inner_req = RaftRequest::default(); - inner_req.set_cmd_type(CmdType::ReadIndex); - 
inner_req.mut_read_index().set_start_ts(req.get_start_ts()); - for r in req.get_ranges() { - let mut range = kvproto::kvrpcpb::KeyRange::default(); - range.set_start_key(Key::from_raw(r.get_start_key()).into_encoded()); - range.set_end_key(Key::from_raw(r.get_end_key()).into_encoded()); - inner_req.mut_read_index().mut_key_ranges().push(range); - } - header.set_region_id(req.get_context().get_region_id()); - header.set_peer(req.get_context().get_peer().clone()); - header.set_region_epoch(req.get_context().get_region_epoch().clone()); - if req.get_context().get_term() != 0 { - header.set_term(req.get_context().get_term()); - } - header.set_sync_log(req.get_context().get_sync_log()); - header.set_read_quorum(true); - cmd.set_header(header); - cmd.set_requests(vec![inner_req].into()); - - let (cb, f) = paired_future_callback(); - - // We must deal with all requests which acquire read-quorum in raftstore-thread, - // so just send it as an command. - if let Err(e) = self - .ch - .send_command(cmd, Callback::read(cb), RaftCmdExtraOpts::default()) - { - // Retrun region error instead a gRPC error. 
- let mut resp = ReadIndexResponse::default(); - resp.set_region_error(raftstore_error_to_region_error(e, region_id)); - ctx.spawn( - async move { - sink.success(resp).await?; - ServerResult::Ok(()) - } - .map_err(|_| ()) - .map(|_| ()), - ); - return; - } - - let task = async move { - let mut res = f.await?; - let mut resp = ReadIndexResponse::default(); - if res.response.get_header().has_error() { - resp.set_region_error(res.response.mut_header().take_error()); - } else { - let mut raft_resps = res.response.take_responses(); - if raft_resps.len() != 1 { - error!( - "invalid read index response"; - "region_id" => region_id, - "response" => ?raft_resps - ); - resp.mut_region_error().set_message(format!( - "Internal Error: invalid response: {:?}", - raft_resps - )); - } else { - let mut read_index_resp = raft_resps[0].take_read_index(); - if read_index_resp.has_locked() { - resp.set_locked(read_index_resp.take_locked()); - } else { - resp.set_read_index(read_index_resp.get_read_index()); - } - } - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .read_index - .observe(begin_instant.saturating_elapsed_secs()); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "read_index" - ); - GRPC_MSG_FAIL_COUNTER.read_index.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - fn batch_commands( &mut self, ctx: RpcContext<'_>, diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 5845d4d4eb7..3dbb7ffc7b0 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -6,7 +6,6 @@ use std::{ time::Duration, }; -use collections::HashMap; use engine_traits::{Peekable, WriteBatch}; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; @@ -28,289 +27,6 @@ use tikv::{ use tikv_util::HandyRwLock; use txn_types::{Key, TimeStamp}; -// In theory, raft can propose conf change as long as there is no pending one. 
-// Replicas don't apply logs synchronously, so it's possible the old leader is -// removed before the new leader applies all logs. -// In the current implementation, the new leader rejects conf change until it -// applies all logs. It guarantees the correctness of green GC. This test is to -// prevent breaking it in the future. -#[test] -fn test_collect_lock_from_stale_leader() { - let mut cluster = new_server_cluster(0, 2); - cluster.pd_client.disable_default_operator(); - let region_id = cluster.run_conf_change(); - let leader = cluster.leader_of_region(region_id).unwrap(); - - // Create clients. - let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Start transferring the region to store 2. - let new_peer = new_peer(2, 1003); - cluster.pd_client.must_add_peer(region_id, new_peer.clone()); - - // Create the ctx of the first region. - let leader_client = clients.get(&leader.get_store_id()).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader.clone()); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // Pause the new peer applying so that when it becomes the leader, it doesn't - // apply all logs. - let new_leader_apply_fp = "on_handle_apply_1003"; - fail::cfg(new_leader_apply_fp, "pause").unwrap(); - must_kv_prewrite( - leader_client, - ctx, - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - - // Leader election only considers the progress of appending logs, so it can - // succeed. - cluster.must_transfer_leader(region_id, new_peer.clone()); - // It shouldn't succeed in the current implementation. 
- cluster.pd_client.remove_peer(region_id, leader.clone()); - std::thread::sleep(Duration::from_secs(1)); - cluster.pd_client.must_have_peer(region_id, leader); - - // Must scan the lock from the old leader. - let locks = must_physical_scan_lock(leader_client, Context::default(), 100, b"", 10); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), b"k1"); - - // Can't scan the lock from the new leader. - let leader_client = clients.get(&new_peer.get_store_id()).unwrap(); - must_register_lock_observer(leader_client, 100); - let locks = must_check_lock_observer(leader_client, 100, true); - assert!(locks.is_empty()); - let locks = must_physical_scan_lock(leader_client, Context::default(), 100, b"", 10); - assert!(locks.is_empty()); - - fail::remove(new_leader_apply_fp); -} - -#[test] -fn test_observer_send_error() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - let max_ts = 100; - must_register_lock_observer(&client, max_ts); - must_kv_prewrite( - &client, - ctx.clone(), - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - assert_eq!(must_check_lock_observer(&client, max_ts, true).len(), 1); - - let observer_send_fp = "lock_observer_send_full"; - fail::cfg(observer_send_fp, "return").unwrap(); - must_kv_prewrite( - &client, - ctx, - vec![new_mutation(Op::Put, b"k2", b"v")], - b"k1".to_vec(), - 10, - ); - let resp = check_lock_observer(&client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - // Should mark dirty if fails to send locks. 
- assert!(!resp.get_is_clean()); -} - -#[test] -fn test_notify_observer_after_apply() { - fn retry_until(mut f: impl FnMut() -> bool) { - for _ in 0..100 { - sleep_ms(10); - if f() { - break; - } - } - } - - let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); - cluster.pd_client.disable_default_operator(); - let post_apply_query_fp = "notify_lock_observer_query"; - let apply_plain_kvs_fp = "notify_lock_observer_snapshot"; - - // Write a lock and pause before notifying the lock observer. - let max_ts = 100; - must_register_lock_observer(&client, max_ts); - fail::cfg(post_apply_query_fp, "pause").unwrap(); - let key = b"k"; - let (client_clone, ctx_clone) = (client.clone(), ctx.clone()); - let handle = std::thread::spawn(move || { - must_kv_prewrite( - &client_clone, - ctx_clone, - vec![new_mutation(Op::Put, key, b"v")], - key.to_vec(), - 10, - ); - }); - // We can use physical_scan_lock to get the lock because we notify the lock - // observer after writing data to the rocskdb. - let mut locks = vec![]; - retry_until(|| { - assert!(must_check_lock_observer(&client, max_ts, true).is_empty()); - locks.extend(must_physical_scan_lock( - &client, - ctx.clone(), - max_ts, - b"", - 100, - )); - !locks.is_empty() - }); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), key); - assert!(must_check_lock_observer(&client, max_ts, true).is_empty()); - fail::remove(post_apply_query_fp); - handle.join().unwrap(); - assert_eq!(must_check_lock_observer(&client, max_ts, true).len(), 1); - - // Add a new store. - let store_id = cluster.add_new_engine(); - let channel = ChannelBuilder::new(Arc::new(Environment::new(1))) - .connect(&cluster.sim.rl().get_addr(store_id)); - let replica_client = TikvClient::new(channel); - - // Add a new peer and pause before notifying the lock observer. 
- must_register_lock_observer(&replica_client, max_ts); - fail::cfg(apply_plain_kvs_fp, "pause").unwrap(); - cluster - .pd_client - .must_add_peer(ctx.get_region_id(), new_peer(store_id, store_id)); - // We can use physical_scan_lock to get the lock because we notify the lock - // observer after writing data to the rocksdb. - let mut locks = vec![]; - retry_until(|| { - assert!(must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - locks.extend(must_physical_scan_lock( - &replica_client, - ctx.clone(), - max_ts, - b"", - 100, - )); - !locks.is_empty() - }); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), key); - assert!(must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - fail::remove(apply_plain_kvs_fp); - retry_until(|| !must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - assert_eq!( - must_check_lock_observer(&replica_client, max_ts, true).len(), - 1 - ); -} - -// It may cause locks missing during green GC if the raftstore notifies the lock -// observer before writing data to the rocksdb: -// - Store-1 transfers a region to store-2 and store-2 is applying logs. -// - GC worker registers lock observer on store-2 after calling lock observer's -// callback and before finishing applying which means the lock won't be -// observed. -// - GC worker scans locks on each store independently. It's possible GC worker -// has scanned all locks on store-2 and hasn't scanned locks on store-1. -// - Store-2 applies all logs and removes the peer on store-1. -// - GC worker can't scan the lock on store-1 because the peer has been -// destroyed. -// - GC worker can't get the lock from store-2 because it can't observe the lock -// and has scanned it. -#[test] -fn test_collect_applying_locks() { - let mut cluster = new_server_cluster(0, 2); - cluster.pd_client.disable_default_operator(); - let region_id = cluster.run_conf_change(); - let leader = cluster.leader_of_region(region_id).unwrap(); - - // Create clients. 
- let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Start transferring the region to store 2. - let new_peer = new_peer(2, 1003); - cluster.pd_client.must_add_peer(region_id, new_peer.clone()); - - // Create the ctx of the first region. - let store_1_client = clients.get(&leader.get_store_id()).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader.clone()); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // Pause store-2 after calling observer callbacks and before writing to the - // rocksdb. - let new_leader_apply_fp = "post_handle_apply_1003"; - fail::cfg(new_leader_apply_fp, "pause").unwrap(); - - // Write 1 lock. - must_kv_prewrite( - store_1_client, - ctx, - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - // Wait for store-2 applying. - std::thread::sleep(Duration::from_secs(3)); - - // Starting the process of green GC at safe point 20: - // 1. Register lock observers on all stores. - // 2. Scan locks physically on each store independently. - // 3. Get locks from all observers. - let safe_point = 20; - - // Register lock observers. - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - }); - - // Finish scanning locks on store-2 and find nothing. - let store_2_client = clients.get(&new_peer.get_store_id()).unwrap(); - let locks = must_physical_scan_lock(store_2_client, Context::default(), safe_point, b"", 1); - assert!(locks.is_empty(), "{:?}", locks); - - // Transfer the region from store-1 to store-2. 
- fail::remove(new_leader_apply_fp); - cluster.must_transfer_leader(region_id, new_peer); - cluster.pd_client.must_remove_peer(region_id, leader); - // Wait for store-1 desroying the region. - std::thread::sleep(Duration::from_secs(3)); - - // Scan locks on store-1 after the region has been destroyed. - let locks = must_physical_scan_lock(store_1_client, Context::default(), safe_point, b"", 1); - assert!(locks.is_empty(), "{:?}", locks); - - // Check lock observers. - let mut locks = vec![]; - clients.iter().for_each(|(_, c)| { - locks.extend(must_check_lock_observer(c, safe_point, true)); - }); - // Must observe the applying lock even through we can't use scan to get it. - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), b"k1"); -} - // Test write CF's compaction filter can call `orphan_versions_handler` // correctly. #[test] diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index 1f7e35b5691..b81673af0e2 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -42,95 +42,6 @@ fn test_kv_scan_memory_lock() { fail::remove("raftkv_async_snapshot_err"); } -#[test] -fn test_scan_lock_push_async_commit() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - for (use_green_gc, ts) in &[(false, 100), (true, 200)] { - // We will perform a async commit transaction with start_ts == `ts`. - // First, try pushing max_ts to `ts + 10`. 
- if *use_green_gc { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(ts + 10); - let resp = client.register_lock_observer(&req).unwrap(); - assert_eq!(resp.error.len(), 0); - } else { - let mut req = ScanLockRequest::default(); - req.set_context(ctx.clone()); - req.set_max_version(ts + 10); - let resp = client.kv_scan_lock(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(!resp.has_error()); - } - - let k1 = b"k1"; - let v1 = b"v1"; - - // The following code simulates another case: prewrite is locking the memlock, - // and then another scan lock operation request meets the memlock. - - fail::cfg("before-set-lock-in-memory", "pause").unwrap(); - let client1 = client.clone(); - let ctx1 = ctx.clone(); - let handle1 = std::thread::spawn(move || { - let mut prewrite = PrewriteRequest::default(); - prewrite.set_context(ctx1); - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k1.to_vec()); - mutation.set_value(v1.to_vec()); - prewrite.set_mutations(vec![mutation].into()); - prewrite.set_primary_lock(k1.to_vec()); - prewrite.set_start_version(*ts); - prewrite.set_lock_ttl(1000); - prewrite.set_use_async_commit(true); - - let resp = client1.kv_prewrite(&prewrite).unwrap(); - assert!(!resp.has_region_error()); - assert_eq!(resp.get_errors(), &[]); - // min_commit_ts should be the last scan_lock ts + 1. 
- assert_eq!(resp.min_commit_ts, ts + 11); - }); - - // Wait for the prewrite acquires the memlock - std::thread::sleep(Duration::from_millis(200)); - - let client1 = client.clone(); - let ctx1 = ctx.clone(); - let handle2 = std::thread::spawn(move || { - if *use_green_gc { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(ts + 20); - let resp = client1.register_lock_observer(&req).unwrap(); - assert!(!resp.error.is_empty()); - } else { - let mut req = ScanLockRequest::default(); - req.set_context(ctx1); - req.set_max_version(ts + 20); - let resp = client1.kv_scan_lock(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(resp.has_error()); - } - }); - - fail::remove("before-set-lock-in-memory"); - - handle1.join().unwrap(); - handle2.join().unwrap(); - - // Commit the key so that next turn of test will work. - let mut req = CommitRequest::default(); - req.set_context(ctx.clone()); - req.set_start_version(*ts); - req.set_commit_version(ts + 11); - req.set_keys(vec![k1.to_vec()].into()); - let resp = client.kv_commit(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(!resp.has_error()); - assert_eq!(resp.commit_version, ts + 11); - } -} - #[test] fn test_snapshot_not_block_grpc() { let (cluster, leader, ctx) = must_new_cluster_mul(1); diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index 36f9eed9ca8..cfadde84405 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -2,271 +2,15 @@ use std::sync::Arc; -use collections::HashMap; use engine_traits::{Peekable, CF_WRITE}; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; -use kvproto::{kvrpcpb::*, metapb, tikvpb::TikvClient}; +use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::*; use tikv::server::gc_worker::sync_gc; use tikv_util::HandyRwLock; use txn_types::Key; -#[test] -fn test_physical_scan_lock() { - let (_cluster, client, ctx) = 
must_new_cluster_and_kv_client(); - - // Generate kvs like k10, v10, ts=10; k11, v11, ts=11; ... - let kv: Vec<_> = (10..20) - .map(|i| (i, vec![b'k', i as u8], vec![b'v', i as u8])) - .collect(); - - for (ts, k, v) in &kv { - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), *ts); - } - - let all_locks: Vec<_> = kv - .into_iter() - .map(|(ts, k, _)| { - // Create a LockInfo that matches the prewrite request in `must_kv_prewrite`. - let mut lock_info = LockInfo::default(); - lock_info.set_primary_lock(k.clone()); - lock_info.set_lock_version(ts); - lock_info.set_key(k); - lock_info.set_lock_ttl(3000); - lock_info.set_lock_type(Op::Put); - lock_info.set_min_commit_ts(ts + 1); - lock_info - }) - .collect(); - - let check_result = |got_locks: &[_], expected_locks: &[_]| { - for i in 0..std::cmp::max(got_locks.len(), expected_locks.len()) { - assert_eq!(got_locks[i], expected_locks[i], "lock {} mismatch", i); - } - }; - - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 30, b"", 100), - &all_locks, - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 15, b"", 100), - &all_locks[0..=5], - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 10, b"", 100), - &all_locks[0..1], - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 9, b"", 100), - &[], - ); - check_result( - &must_physical_scan_lock(&client, ctx, 30, &[b'k', 13], 5), - &all_locks[3..8], - ); -} - -#[test] -fn test_applied_lock_collector() { - let mut cluster = new_server_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // Create all stores' clients. 
- let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Create the ctx of the first region. - let region = cluster.get_region(b""); - let region_id = region.get_id(); - let leader_peer = cluster.leader_of_region(region_id).unwrap(); - let leader_store_id = leader_peer.get_store_id(); - let leader_client = clients.get(&leader_store_id).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader_peer); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // It's used to make sure all stores applies all logs. - let wait_for_apply = |cluster: &mut Cluster<_>, region: &metapb::Region| { - let cluster = &mut *cluster; - region.get_peers().iter().for_each(|p| { - let mut retry_times = 1; - loop { - let resp = - async_read_on_peer(cluster, p.clone(), region.clone(), b"key", true, true) - .recv() - .unwrap(); - if !resp.get_header().has_error() { - return; - } - if retry_times >= 50 { - panic!("failed to read on {:?}: {:?}", p, resp); - } - retry_times += 1; - sleep_ms(20); - } - }); - }; - - let check_lock = |lock: &LockInfo, k: &[u8], pk: &[u8], ts| { - assert_eq!(lock.get_key(), k); - assert_eq!(lock.get_primary_lock(), pk); - assert_eq!(lock.get_lock_version(), ts); - }; - - // Register lock observer at safe point 10000. - let mut safe_point = 10000; - clients.iter().for_each(|(_, c)| { - // Should report error when checking non-existent observer. - assert!(!check_lock_observer(c, safe_point).get_error().is_empty()); - must_register_lock_observer(c, safe_point); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - }); - - // Lock observer should only collect values in lock CF. 
- let key = b"key0"; - must_kv_prewrite( - leader_client, - ctx.clone(), - vec![new_mutation(Op::Put, key, &b"v".repeat(1024))], - key.to_vec(), - 1, - ); - must_kv_commit(leader_client, ctx.clone(), vec![key.to_vec()], 1, 2, 2); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1); - check_lock(&locks[0], key, key, 1); - }); - - // Lock observer shouldn't collect locks after the safe point. - must_kv_prewrite( - leader_client, - ctx.clone(), - vec![new_mutation(Op::Put, key, b"v")], - key.to_vec(), - safe_point + 1, - ); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1); - check_lock(&locks[0], key, key, 1); - }); - - // Write 999 locks whose timestamp is less than the safe point. - let mutations = (1..1000) - .map(|i| new_mutation(Op::Put, format!("key{}", i).as_bytes(), b"v")) - .collect(); - must_kv_prewrite(leader_client, ctx.clone(), mutations, b"key1".to_vec(), 10); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - // Plus the first lock. - assert_eq!(locks.len(), 1000); - }); - - // Add a new store and register lock observer. - let store_id = cluster.add_new_engine(); - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(store_id)); - let client = TikvClient::new(channel); - must_register_lock_observer(&client, safe_point); - - // Add a new peer. Lock observer should collect all locks from snapshot. 
- let peer = new_peer(store_id, store_id); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_none_pending_peer(peer); - wait_for_apply(&mut cluster, ®ion); - let locks = must_check_lock_observer(&client, safe_point, true); - assert_eq!(locks.len(), 999); - - // Should be dirty when collects too many locks. - let mutations = (1000..1100) - .map(|i| new_mutation(Op::Put, format!("key{}", i).as_bytes(), b"v")) - .collect(); - must_kv_prewrite( - leader_client, - ctx.clone(), - mutations, - b"key1000".to_vec(), - 100, - ); - wait_for_apply(&mut cluster, ®ion); - clients.insert(store_id, client); - clients.iter().for_each(|(_, c)| { - let resp = check_lock_observer(c, safe_point); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert!(!resp.get_is_clean()); - // MAX_COLLECT_SIZE is 1024. - assert_eq!(resp.get_locks().len(), 1024); - }); - - // Reregister and check. It shouldn't clean up state. - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - let resp = check_lock_observer(c, safe_point); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert!(!resp.get_is_clean()); - // MAX_COLLECT_SIZE is 1024. - assert_eq!(resp.get_locks().len(), 1024); - }); - - // Register lock observer at a later safe point. Lock observer should reset its - // state. - safe_point += 1; - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - // Can't register observer with smaller max_ts. 
- assert!( - !register_lock_observer(c, safe_point - 1) - .get_error() - .is_empty() - ); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - }); - let leader_client = clients.get(&leader_store_id).unwrap(); - must_kv_prewrite( - leader_client, - ctx, - vec![new_mutation(Op::Put, b"key1100", b"v")], - b"key1100".to_vec(), - safe_point, - ); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - // Should collect locks according to the new max ts. - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1, "{:?}", locks); - // Shouldn't remove it with a wrong max ts. - assert!( - !remove_lock_observer(c, safe_point - 1) - .get_error() - .is_empty() - ); - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1, "{:?}", locks); - // Remove lock observers. - must_remove_lock_observer(c, safe_point); - assert!(!check_lock_observer(c, safe_point).get_error().is_empty()); - }); -} - // Since v5.0 GC bypasses Raft, which means GC scans/deletes records with // `keys::DATA_PREFIX`. This case ensures it's performed correctly. 
#[test] diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index cfbe6ff504e..7e3f718dac9 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -20,7 +20,7 @@ use grpcio_health::{proto::HealthCheckRequest, *}; use kvproto::{ coprocessor::*, debugpb, - kvrpcpb::{self, PrewriteRequestPessimisticAction::*, *}, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, metapb, raft_serverpb, raft_serverpb::*, tikvpb::*, @@ -936,32 +936,6 @@ fn test_split_region_impl(is_raw_kv: bool) { ); } -#[test] -fn test_read_index() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - // Read index - let mut req = ReadIndexRequest::default(); - req.set_context(ctx.clone()); - let mut resp = client.read_index(&req).unwrap(); - let last_index = resp.get_read_index(); - assert_eq!(last_index > 0, true); - - // Raw put - let (k, v) = (b"key".to_vec(), b"value".to_vec()); - let mut put_req = RawPutRequest::default(); - put_req.set_context(ctx); - put_req.key = k; - put_req.value = v; - let put_resp = client.raw_put(&put_req).unwrap(); - assert!(!put_resp.has_region_error()); - assert!(put_resp.error.is_empty()); - - // Read index again - resp = client.read_index(&req).unwrap(); - assert_eq!(last_index + 1, resp.get_read_index()); -} - #[test] fn test_debug_get() { let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); @@ -1457,90 +1431,6 @@ fn test_async_commit_check_txn_status() { assert_ne!(resp.get_action(), Action::MinCommitTsPushed); } -#[test] -fn test_read_index_check_memory_locks() { - let mut cluster = new_server_cluster(0, 3); - cluster.cfg.raft_store.hibernate_regions = false; - cluster.run(); - - // make sure leader has been elected. 
- assert_eq!(cluster.must_get(b"k"), None); - - let region = cluster.get_region(b""); - let leader = cluster.leader_of_region(region.get_id()).unwrap(); - let leader_cm = cluster.sim.rl().get_concurrency_manager(leader.get_id()); - - let keys: Vec<_> = vec![b"k", b"l"] - .into_iter() - .map(|k| Key::from_raw(k)) - .collect(); - let guards = block_on(leader_cm.lock_keys(keys.iter())); - let lock = Lock::new( - LockType::Put, - b"k".to_vec(), - 1.into(), - 20000, - None, - 1.into(), - 1, - 2.into(), - ); - guards[0].with_lock(|l| *l = Some(lock.clone())); - - // read on follower - let mut follower_peer = None; - let peers = region.get_peers(); - for p in peers { - if p.get_id() != leader.get_id() { - follower_peer = Some(p.clone()); - break; - } - } - let follower_peer = follower_peer.unwrap(); - let addr = cluster.sim.rl().get_addr(follower_peer.get_store_id()); - - let env = Arc::new(Environment::new(1)); - let channel = ChannelBuilder::new(env).connect(&addr); - let client = TikvClient::new(channel); - - let mut ctx = Context::default(); - ctx.set_region_id(region.get_id()); - ctx.set_region_epoch(region.get_region_epoch().clone()); - ctx.set_peer(follower_peer); - - let read_index = |ranges: &[(&[u8], &[u8])]| { - let mut req = ReadIndexRequest::default(); - let start_ts = block_on(cluster.pd_client.get_tso()).unwrap(); - req.set_context(ctx.clone()); - req.set_start_ts(start_ts.into_inner()); - for &(start_key, end_key) in ranges { - let mut range = kvrpcpb::KeyRange::default(); - range.set_start_key(start_key.to_vec()); - range.set_end_key(end_key.to_vec()); - req.mut_ranges().push(range); - } - let resp = client.read_index(&req).unwrap(); - (resp, start_ts) - }; - - // wait a while until the node updates its own max ts - thread::sleep(Duration::from_millis(300)); - - let (resp, start_ts) = read_index(&[(b"l", b"yz")]); - assert!(!resp.has_locked()); - assert_eq!(leader_cm.max_ts(), start_ts); - - let (resp, start_ts) = read_index(&[(b"a", b"b"), (b"j", 
b"k0")]); - assert_eq!(resp.get_locked(), &lock.into_lock_info(b"k".to_vec())); - assert_eq!(leader_cm.max_ts(), start_ts); - - drop(guards); - - let (resp, start_ts) = read_index(&[(b"a", b"z")]); - assert!(!resp.has_locked()); - assert_eq!(leader_cm.max_ts(), start_ts); -} - #[test] fn test_prewrite_check_max_commit_ts() { let mut cluster = new_server_cluster(0, 1); @@ -1882,7 +1772,6 @@ fn test_tikv_forwarding() { req.set_split_key(b"k1".to_vec()); req }); - test_func_init!(client, ctx, call_opt, read_index, ReadIndexRequest); // Test if duplex can be redirect correctly. let cases = vec![ From 71980d382426b77d135c5d1c9576d363c298dc2d Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 23 Nov 2022 08:37:57 +0800 Subject: [PATCH 352/676] *: remove redundant code for prewrite and commit. (#13747) ref tikv/tikv#13303 remove redundant code for prewrite and commit. Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- components/test_raftstore/src/util.rs | 16 ++- .../resource_metering/test_read_keys.rs | 26 +---- tests/integrations/server/kv_service.rs | 102 ++---------------- 3 files changed, 21 insertions(+), 123 deletions(-) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 06c2da432c0..14661344316 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -817,10 +817,15 @@ pub fn must_kv_read_equal(client: &TikvClient, ctx: Context, key: Vec, val: assert_eq!(get_resp.take_value(), val); } -// TODO: replace the redundant code -pub fn complete_data_commit(client: &TikvClient, ctx: &Context, ts: u64, k: Vec, v: Vec) { +pub fn write_and_read_key( + client: &TikvClient, + ctx: &Context, + ts: &mut u64, + k: Vec, + v: Vec, +) { // Prewrite - let prewrite_start_version = ts + 1; + let prewrite_start_version = *ts + 1; let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(k.clone()); @@ -833,7 +838,7 @@ pub fn complete_data_commit(client: &TikvClient, ctx: 
&Context, ts: u64, k: Vec< prewrite_start_version, ); // Commit - let commit_version = ts + 2; + let commit_version = *ts + 2; must_kv_commit( client, ctx.clone(), @@ -843,7 +848,8 @@ pub fn complete_data_commit(client: &TikvClient, ctx: &Context, ts: u64, k: Vec< commit_version, ); // Get - must_kv_read_equal(client, ctx.clone(), k, v, ts + 3); + *ts += 3; + must_kv_read_equal(client, ctx.clone(), k, v, *ts); } pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetResponse { diff --git a/tests/integrations/resource_metering/test_read_keys.rs b/tests/integrations/resource_metering/test_read_keys.rs index 87ad50024ad..35ef0e2ba88 100644 --- a/tests/integrations/resource_metering/test_read_keys.rs +++ b/tests/integrations/resource_metering/test_read_keys.rs @@ -50,31 +50,7 @@ pub fn test_read_keys() { let (k, v) = (n.clone(), n); // Prewrite. - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit. 
- ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); } // PointGet diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 7e3f718dac9..f6db3386007 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -276,33 +276,7 @@ fn test_mvcc_basic() { let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; - - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); // Get ts += 1; @@ -365,33 +339,7 @@ fn test_mvcc_rollback_and_cleanup() { let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; - - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v); // Prewrite puts some locks. 
ts += 1; @@ -607,13 +555,15 @@ fn test_mvcc_flashback_failed_after_first_batch() { for i in 0..FLASHBACK_BATCH_SIZE * 2 { // Meet the constraints of the alphabetical order for test let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); - complete_data_commit(&client, &ctx, ts, k.clone(), b"value@0".to_vec()); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), b"value@0".to_vec()); + ts -= 3; } ts += 3; let check_ts = ts; for i in 0..FLASHBACK_BATCH_SIZE * 2 { let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); - complete_data_commit(&client, &ctx, ts, k.clone(), b"value@1".to_vec()); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), b"value@1".to_vec()); + ts -= 3; } ts += 3; // Flashback @@ -716,34 +666,7 @@ fn test_mvcc_flashback() { for i in 0..2000 { let v = format!("value@{}", i).into_bytes(); let k = format!("key@{}", i % 1000).into_bytes(); - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); - // Get - ts += 1; - must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), ts) + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); } // Prewrite to leave a lock. 
let k = b"key@1".to_vec(); @@ -837,15 +760,8 @@ fn test_mvcc_flashback_block_scheduling() { fn test_mvcc_flashback_unprepared() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); - // Prewrite - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 1); - // Commit - must_kv_commit(&client, ctx.clone(), vec![k.clone()], 1, 2, 2); - must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), 3); + let mut ts = 0; + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); // Try to flashback without preparing first. let mut req = FlashbackToVersionRequest::default(); req.set_context(ctx.clone()); From fd197f08314ce31860f439d2f84f7d77058745c7 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 23 Nov 2022 10:27:58 +0800 Subject: [PATCH 353/676] server: wait for reset-to-version task completion (#13830) close tikv/tikv#13829 Fix a segfault when executing reset-to-version with tikv-ctl Signed-off-by: tabokie --- src/server/reset_to_version.rs | 82 ++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index e1faccd9b3f..1ea98acc1c8 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -85,10 +85,7 @@ impl ResetToVersionWorker { fn next_write(&mut self) -> Result, Write)>> { if self.write_iter.valid().unwrap() { - let mut state = self - .state - .lock() - .expect("failed to lock ResetToVersionWorker::state"); + let mut state = self.state.lock().unwrap(); debug_assert!(matches!( *state, ResetToVersionState::RemovingWrite { scanned: _ } @@ -149,16 +146,14 @@ impl ResetToVersionWorker { let mut has_more = true; for _ in 0..batch_size { if self.lock_iter.valid().unwrap() { - let mut state = self - .state - .lock() - 
.expect("failed to lock ResetToVersionWorker::state"); - debug_assert!(matches!( - *state, - ResetToVersionState::RemovingLock { scanned: _ } - )); - *state.scanned() += 1; - drop(state); + { + let mut state = self.state.lock().unwrap(); + debug_assert!(matches!( + *state, + ResetToVersionState::RemovingLock { scanned: _ } + )); + *state.scanned() += 1; + } box_try!(wb.delete_cf(CF_LOCK, self.lock_iter.key())); self.lock_iter.next().unwrap(); @@ -197,6 +192,12 @@ impl Clone for ResetToVersionManager { } } +impl Drop for ResetToVersionManager { + fn drop(&mut self) { + self.wait(); + } +} + #[allow(dead_code)] impl ResetToVersionManager { pub fn new(engine: RocksEngine) -> Self { @@ -221,30 +222,32 @@ impl ResetToVersionManager { let mut worker = ResetToVersionWorker::new(write_iter, lock_iter, ts, self.state.clone()); let mut wb = self.engine.write_batch(); let props = tikv_util::thread_group::current_properties(); - if self.worker_handle.borrow().is_some() { - warn!("A reset-to-version process is already in progress! 
Wait until it finish first."); - self.wait(); - } - *self.worker_handle.borrow_mut() = Some(std::thread::Builder::new() - .name("reset_to_version".to_string()) - .spawn_wrapper(move || { - tikv_util::thread_group::set_properties(props); - tikv_alloc::add_thread_memory_accessor(); + self.wait(); - while worker.process_next_batch(BATCH_SIZE, &mut wb).expect("reset_to_version failed when removing invalid writes") { - } - *worker.state.lock() - .expect("failed to lock `ResetToVersionWorker::state` in `ResetToVersionWorker::process_next_batch`") - = ResetToVersionState::RemovingLock { scanned: 0 }; - while worker.process_next_batch_lock(BATCH_SIZE, &mut wb).expect("reset_to_version failed when removing invalid locks") { - } - *worker.state.lock() - .expect("failed to lock `ResetToVersionWorker::state` in `ResetToVersionWorker::process_next_batch_lock`") - = ResetToVersionState::Done; - info!("Reset to version done!"); - tikv_alloc::remove_thread_memory_accessor(); - }) - .expect("failed to spawn reset_to_version thread")); + *self.worker_handle.borrow_mut() = Some( + std::thread::Builder::new() + .name("reset_to_version".to_string()) + .spawn_wrapper(move || { + tikv_util::thread_group::set_properties(props); + tikv_alloc::add_thread_memory_accessor(); + + while worker + .process_next_batch(BATCH_SIZE, &mut wb) + .expect("process_next_batch") + {} + *worker.state.lock().unwrap() = + ResetToVersionState::RemovingLock { scanned: 0 }; + while worker + .process_next_batch_lock(BATCH_SIZE, &mut wb) + .expect("process_next_batch_lock") + {} + *worker.state.lock().unwrap() = ResetToVersionState::Done; + info!("Reset to version done!"); + + tikv_alloc::remove_thread_memory_accessor(); + }) + .expect("failed to spawn reset_to_version thread"), + ); } /// Current process state. @@ -257,7 +260,10 @@ impl ResetToVersionManager { /// Wait until the process finished. 
pub fn wait(&self) { - self.worker_handle.take().unwrap().join().unwrap(); + if let Some(handle) = self.worker_handle.take() { + info!("Wait for the reset-to-version task to complete."); + handle.join().unwrap(); + } } } From 101032b97ea1ceaeef7f01459e1523fcd3c56509 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 23 Nov 2022 10:53:58 +0800 Subject: [PATCH 354/676] support restoring a batch of KV files (#13786) ref tikv/tikv#13788 Signed-off-by: joccau Signed-off-by: Zak Zhao <57036248+joccau@users.noreply.github.com> Co-authored-by: Ti Chi Robot --- .../external_storage/export/src/export.rs | 9 +- components/sst_importer/src/metrics.rs | 5 + src/import/sst_service.rs | 313 +++++++++++++----- 3 files changed, 239 insertions(+), 88 deletions(-) diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index a36f3eba11e..ea02ebe2c6f 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -22,13 +22,10 @@ use encryption::DataKeyManager; use external_storage::dylib_client; #[cfg(feature = "cloud-storage-grpc")] use external_storage::grpc_client; -use external_storage::{ - compression_reader_dispatcher, encrypt_wrap_reader, record_storage_create, BackendConfig, - HdfsStorage, -}; pub use external_storage::{ - read_external_storage_into_file, ExternalStorage, LocalStorage, NoopStorage, RestoreConfig, - UnpinReader, + compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_into_file, + record_storage_create, BackendConfig, ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, + RestoreConfig, UnpinReader, }; use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index 08f095078d5..cd14f6feb56 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -56,6 
+56,11 @@ lazy_static! { "tikv_import_download_bytes", "Bucketed histogram of importer download bytes", exponential_buckets(1024.0, 2.0, 20).unwrap() + ).unwrap(); + pub static ref IMPORTER_APPLY_BYTES: Histogram = register_histogram!( + "tikv_import_apply_bytes", + "Bucketed histogram of importer apply bytes", + exponential_buckets(1024.0, 2.0, 20).unwrap() ) .unwrap(); pub static ref IMPORTER_INGEST_DURATION: HistogramVec = register_histogram_vec!( diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index fff9c79cec2..61d181b5c2f 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -443,38 +443,82 @@ where sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); - + let mut start_apply = Instant::now(); let mut futs = vec![]; let mut apply_resp = ApplyResponse::default(); let context = req.take_context(); - let meta = req.get_meta(); + let mut rules = req.take_rewrite_rules(); + let mut metas = req.take_metas(); + // For compatibility with old requests. 
+ if req.has_meta() { + metas.push(req.take_meta()); + rules.push(req.take_rewrite_rule()); + } let result = (|| -> Result<()> { - let temp_file = - importer.do_download_kv_file(meta, req.get_storage_backend(), &limiter)?; - let mut reqs = RequestCollector::from_cf(meta.get_cf()); let mut cmd_reqs = vec![]; - let mut build_req_fn = build_apply_request( - raft_size.0, - &mut reqs, - cmd_reqs.as_mut(), - meta.get_is_delete(), - meta.get_cf(), - context.clone(), - ); - let range = importer.do_apply_kv_file( - meta.get_start_key(), - meta.get_end_key(), - meta.get_restore_ts(), - temp_file, - req.get_rewrite_rule(), - &mut build_req_fn, - )?; - drop(build_req_fn); - if !reqs.is_empty() { - let cmd = make_request(&mut reqs, context); + let mut reqs_default = RequestCollector::from_cf(CF_DEFAULT); + let mut reqs_write = RequestCollector::from_cf(CF_WRITE); + let mut req_default_size = 0_u64; + let mut req_write_size = 0_u64; + let mut range: Option = None; + + for (i, meta) in metas.iter().enumerate() { + let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { + (&mut reqs_default, &mut req_default_size) + } else { + (&mut reqs_write, &mut req_write_size) + }; + + let mut build_req_fn = build_apply_request( + req_size, + raft_size.0, + reqs, + cmd_reqs.as_mut(), + meta.get_is_delete(), + meta.get_cf(), + context.clone(), + ); + + let temp_file = + importer.do_download_kv_file(meta, req.get_storage_backend(), &limiter)?; + let r: Option = importer.do_apply_kv_file( + meta.get_start_key(), + meta.get_end_key(), + meta.get_restore_ts(), + temp_file, + &rules[i], + &mut build_req_fn, + )?; + + if let Some(mut r) = r { + range = match range { + Some(mut v) => { + let s = v.take_start().min(r.take_start()); + let e = v.take_end().max(r.take_end()); + Some(Range { + start: s, + end: e, + ..Default::default() + }) + } + None => Some(r), + }; + } + } + + if !reqs_default.is_empty() { + let cmd = make_request(&mut reqs_default, context.clone()); cmd_reqs.push(cmd); + 
IMPORTER_APPLY_BYTES.observe(req_default_size as _); } + if !reqs_write.is_empty() { + let cmd = make_request(&mut reqs_write, context); + cmd_reqs.push(cmd); + IMPORTER_APPLY_BYTES.observe(req_write_size as _); + } + + start_apply = Instant::now(); for cmd in cmd_reqs { let (cb, future) = paired_future_callback(); match router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) @@ -507,19 +551,21 @@ where if r.response.get_header().has_error() { let mut import_err = kvproto::import_sstpb::Error::default(); let err = r.response.get_header().get_error(); - import_err - .set_message("failed to complete raft command".to_string()); + import_err.set_message("failed to complete raft command".to_string()); // FIXME: if there are many errors, we may lose some of them here. - import_err - .set_store_error(err.clone()); - warn!("failed to apply the file to the store"; "error" => ?err, "file" => %meta.get_name()); + import_err.set_store_error(err.clone()); + warn!("failed to apply the file to the store"; "error" => ?err); resp.set_error(import_err); } } } resp })); + // Records how long the apply task waits to be scheduled. + sst_importer::metrics::IMPORTER_APPLY_DURATION + .with_label_values(&["apply"]) + .observe(start_apply.saturating_elapsed().as_secs_f64()); sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["finish"]) .observe(start.saturating_elapsed().as_secs_f64()); @@ -861,9 +907,9 @@ enum RequestCollector { /// This is used for write CF because resolved ts observer hates duplicated /// key in the same request. RetainLastTs(HashMap, (Request, u64)>), - /// Collector favor that simple collect all items. - /// This is used for default CF. - KeepAll(Vec), + /// Collector favor that simple collect all items, and it do not contains + /// duplicated key-value. This is used for default CF. 
+ KeepAll(HashMap, Request>), } impl RequestCollector { @@ -879,9 +925,9 @@ impl RequestCollector { } fn accept(&mut self, req: Request) { + let k = key_from_request(&req); match self { RequestCollector::RetainLastTs(ref mut reqs) => { - let k = key_from_request(&req); let (encoded_key, ts) = match Key::split_on_ts_for(k) { Ok(k) => k, Err(err) => { @@ -897,7 +943,9 @@ impl RequestCollector { reqs.insert(encoded_key.to_owned(), (req, ts.into_inner())); } } - RequestCollector::KeepAll(ref mut a) => a.push(req), + RequestCollector::KeepAll(ref mut reqs) => { + reqs.insert(k.to_owned(), req); + } } } @@ -906,7 +954,7 @@ impl RequestCollector { RequestCollector::RetainLastTs(ref mut reqs) => { reqs.drain().map(|(_, (req, _))| req).collect() } - RequestCollector::KeepAll(ref mut reqs) => std::mem::take(reqs), + RequestCollector::KeepAll(ref mut reqs) => reqs.drain().map(|(_, req)| req).collect(), } } @@ -956,6 +1004,7 @@ fn make_request(reqs: &mut RequestCollector, context: Context) -> RaftCmdRequest // in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 // will panic if found duplicated entry during Vec. fn build_apply_request<'a, 'b>( + req_size: &'a mut u64, raft_size: u64, reqs: &'a mut RequestCollector, cmd_reqs: &'a mut Vec, @@ -966,51 +1015,41 @@ fn build_apply_request<'a, 'b>( where 'a: 'b, { - let mut req_size = 0_u64; - // use callback to collect kv data. - if is_delete { - Box::new(move |k: Vec, _v: Vec| { - let mut req = Request::default(); - let mut del = DeleteRequest::default(); + Box::new(move |k: Vec, v: Vec| { + let mut req = Request::default(); + if is_delete { + let mut del = DeleteRequest::default(); del.set_key(k); del.set_cf(cf.to_string()); req.set_cmd_type(CmdType::Delete); req.set_delete(del); - req_size += req.compute_size() as u64; - reqs.accept(req); - // When the request size get grow to half of the max request size, - // build the request and add it to a batch. 
- if req_size > raft_size / 2 { - req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - }) - } else { - Box::new(move |k: Vec, v: Vec| { + } else { if cf == CF_WRITE && !write_needs_restore(&v) { return; } - let mut req = Request::default(); let mut put = PutRequest::default(); - put.set_key(k); put.set_value(v); put.set_cf(cf.to_string()); req.set_cmd_type(CmdType::Put); req.set_put(put); - req_size += req.compute_size() as u64; - reqs.accept(req); - if req_size > raft_size / 2 { - req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - }) - } + } + + // When the request size get grow to max request size, + // build the request and add it to a batch. + if *req_size + req.compute_size() as u64 > raft_size * 7 / 8 { + IMPORTER_APPLY_BYTES.observe(*req_size as _); + *req_size = 0; + let cmd = make_request(reqs, context.clone()); + cmd_reqs.push(cmd); + } + + *req_size += req.compute_size() as u64; + reqs.accept(req); + }) } fn write_needs_restore(write: &[u8]) -> bool { @@ -1063,23 +1102,42 @@ mod test { fn default_req(key: &[u8], val: &[u8], start_ts: u64) -> Request { let (k, v) = default(key, val, start_ts); - req(k, v, CF_DEFAULT) + req(k, v, CF_DEFAULT, CmdType::Put) } fn write_req(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> Request { let (k, v) = write(key, ty, commit_ts, start_ts); - req(k, v, CF_WRITE) + let cmd_type = if ty == WriteType::Delete { + CmdType::Delete + } else { + CmdType::Put + }; + + req(k, v, CF_WRITE, cmd_type) } - fn req(k: Vec, v: Vec, cf: &str) -> Request { + fn req(k: Vec, v: Vec, cf: &str, cmd_type: CmdType) -> Request { let mut req = Request::default(); - let mut put = PutRequest::default(); + req.set_cmd_type(cmd_type); + + match cmd_type { + CmdType::Put => { + let mut put = PutRequest::default(); + put.set_key(k); + put.set_value(v); + put.set_cf(cf.to_string()); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - 
req.set_cmd_type(CmdType::Put); - req.set_put(put); + req.set_put(put) + } + CmdType::Delete => { + let mut del = DeleteRequest::default(); + del.set_cf(cf.to_string()); + del.set_key(k); + + req.set_delete(del); + } + _ => panic!("invalid input cmd_type"), + } req } @@ -1088,26 +1146,36 @@ mod test { #[derive(Debug)] struct Case { cf: &'static str, + is_delete: bool, mutations: Vec<(Vec, Vec)>, expected_reqs: Vec, } fn run_case(c: &Case) { - let mut v = vec![]; - let mut coll = RequestCollector::from_cf(c.cf); - let mut builder = - build_apply_request(1024, &mut coll, &mut v, false, c.cf, Context::new()); + let mut cmds = vec![]; + let mut reqs = RequestCollector::from_cf(c.cf); + let mut req_size = 0_u64; + + let mut builder = build_apply_request( + &mut req_size, + 1024, + &mut reqs, + &mut cmds, + c.is_delete, + c.cf, + Context::new(), + ); for (k, v) in c.mutations.clone() { builder(k, v); } drop(builder); - if !coll.is_empty() { - let cmd = make_request(&mut coll, Context::new()); - v.push(cmd); + if !reqs.is_empty() { + let cmd = make_request(&mut reqs, Context::new()); + cmds.push(cmd); } - let mut req1: HashMap<_, _> = v + let mut req1: HashMap<_, _> = cmds .into_iter() .flat_map(|mut x| x.take_requests().into_iter()) .map(|req| { @@ -1126,12 +1194,13 @@ mod test { let cases = vec![ Case { cf: CF_WRITE, + is_delete: false, mutations: vec![ write(b"foo", Lock, 42, 41), write(b"foo", Put, 40, 39), write(b"bar", Put, 38, 37), write(b"baz", Put, 34, 31), - write(b"bar", Delete, 28, 17), + write(b"bar", Put, 28, 17), ], expected_reqs: vec![ write_req(b"foo", Put, 40, 39), @@ -1139,8 +1208,24 @@ mod test { write_req(b"baz", Put, 34, 31), ], }, + Case { + cf: CF_WRITE, + is_delete: true, + mutations: vec![ + write(b"foo", Delete, 40, 39), + write(b"bar", Delete, 38, 37), + write(b"baz", Delete, 34, 31), + write(b"bar", Delete, 28, 17), + ], + expected_reqs: vec![ + write_req(b"foo", Delete, 40, 39), + write_req(b"bar", Delete, 38, 37), + write_req(b"baz", Delete, 
34, 31), + ], + }, Case { cf: CF_DEFAULT, + is_delete: false, mutations: vec![ default(b"aria", b"The planet where flowers bloom.", 123), default( @@ -1149,6 +1234,7 @@ mod test { 178, ), default(b"beyond", b"Calling your name.", 278), + default(b"beyond", b"Calling your name.", 278), ], expected_reqs: vec![ default_req(b"aria", b"The planet where flowers bloom.", 123), @@ -1166,4 +1252,67 @@ mod test { run_case(&case); } } + + #[test] + fn test_request_collector_with_write_cf() { + let mut request_collector = RequestCollector::from_cf(CF_WRITE); + assert_eq!(request_collector.is_empty(), true); + let reqs = vec![ + write_req(b"foo", WriteType::Put, 40, 39), + write_req(b"aar", WriteType::Put, 38, 37), + write_req(b"foo", WriteType::Put, 34, 31), + write_req(b"zzz", WriteType::Put, 41, 40), + ]; + let reqs_result = vec![ + write_req(b"aar", WriteType::Put, 38, 37), + write_req(b"foo", WriteType::Put, 40, 39), + write_req(b"zzz", WriteType::Put, 41, 40), + ]; + + for req in reqs { + request_collector.accept(req); + } + assert_eq!(request_collector.is_empty(), false); + let mut reqs = request_collector.drain(); + reqs.sort_by(|r1, r2| { + let k1 = key_from_request(r1); + let k2 = key_from_request(r2); + k1.cmp(k2) + }); + assert_eq!(reqs, reqs_result); + assert_eq!(request_collector.is_empty(), true); + } + + #[test] + fn test_request_collector_with_default_cf() { + let mut request_collector = RequestCollector::from_cf(CF_DEFAULT); + assert_eq!(request_collector.is_empty(), true); + let reqs = vec![ + default_req(b"foo", b"", 39), + default_req(b"zzz", b"", 40), + default_req(b"foo", b"", 37), + default_req(b"foo", b"", 39), + ]; + let reqs_result = vec![ + default_req(b"foo", b"", 37), + default_req(b"foo", b"", 39), + default_req(b"zzz", b"", 40), + ]; + + for req in reqs { + request_collector.accept(req); + } + assert_eq!(request_collector.is_empty(), false); + let mut reqs = request_collector.drain(); + reqs.sort_by(|r1, r2| { + let k1 = key_from_request(r1); + 
let (k1, ts1) = Key::split_on_ts_for(k1).unwrap();
+            let k2 = key_from_request(r2);
+            let (k2, ts2) = Key::split_on_ts_for(k2).unwrap();
+
+            k1.cmp(k2).then(ts1.cmp(&ts2))
+        });
+        assert_eq!(reqs, reqs_result);
+        assert_eq!(request_collector.is_empty(), true);
+    }
 }

From 8ce818be228796492eb843048d2996b75a4d714d Mon Sep 17 00:00:00 2001
From: Yilin Chen
Date: Wed, 23 Nov 2022 13:41:57 +0800
Subject: [PATCH 355/676] txn: revert "calculate last_change_ts in rollback"
 (#13834)

ref tikv/tikv#13694, ref tikv/tikv#13749

This commit reverts #13749 (calculate last_change_ts in rollback).

If we calculate last_change_ts, consider the following case:

Key k has a write record with commit_ts = 5.

1. Prewrite k, start_ts = 10
2. Rollback k, start_ts = 30, last_change_ts = 5.
3. Commit k, start_ts = 10, commit_ts = 20

Then, read with ts = 40, it will get an incorrect last_change_ts from the
rollback record.

There is no easy way to handle the rollback case. I choose to give up
calculating it.

Signed-off-by: Yilin Chen

Co-authored-by: Ti Chi Robot
---
 src/storage/mvcc/reader/reader.rs | 1 -
 src/storage/mvcc/reader/scanner/forward.rs | 11 +-
 src/storage/txn/actions/check_txn_status.rs | 41 +-----
 .../txn/commands/check_secondary_locks.rs | 129 +-----------------
 src/storage/txn/commands/check_txn_status.rs | 108 ++------------
 5 files changed, 23 insertions(+), 267 deletions(-)

diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs
index c8ca1a5f671..a6aae85761f 100644
--- a/src/storage/mvcc/reader/reader.rs
+++ b/src/storage/mvcc/reader/reader.rs
@@ -2541,7 +2541,6 @@ pub mod tests {
         engine.commit(k, 1, 2);

         // Write enough ROLLBACK/LOCK recrods
-        engine.rollback(k, 5);
         for start_ts in (6..30).into_iter().step_by(2) {
             engine.lock(k, start_ts, start_ts + 1);
         }
diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs
index 12300187739..6672842fab9 100644
--- a/src/storage/mvcc/reader/scanner/forward.rs
+++ 
b/src/storage/mvcc/reader/scanner/forward.rs @@ -1625,7 +1625,8 @@ mod latest_kv_tests { for start_ts in (10..30).into_iter().step_by(2) { must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); must_commit(&mut engine, b"k1", start_ts, start_ts + 1); - must_rollback(&mut engine, b"k3", start_ts + 1, true); + must_prewrite_lock(&mut engine, b"k3", b"k1", start_ts); + must_commit(&mut engine, b"k3", start_ts, start_ts + 1); } must_prewrite_put(&mut engine, b"k1", b"v13", b"k1", 40); @@ -1647,10 +1648,10 @@ mod latest_kv_tests { // k2 | 46 | PUT | v22 // k2 | 6 | PUT | v21 // k3 | 47 | PUT | v32 - // k3 | 29 | ROLLBACK | - // k3 | 27 | ROLLBACK | - // k3 | ... | ROLLBACK | - // k3 | 11 | ROLLBACK | + // k3 | 29 | LOCK | + // k3 | 27 | LOCK | + // k3 | ... | LOCK | + // k3 | 11 | LOCK | // k3 | 7 | PUT | v31 let snapshot = engine.snapshot(Default::default()).unwrap(); diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 126c34ade92..4c900e5a438 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -8,7 +8,6 @@ use crate::storage::{ metrics::MVCC_CHECK_TXN_STATUS_COUNTER_VEC, reader::OverlappedWrite, ErrorInner, LockType, MvccTxn, ReleasedLock, Result, SnapshotReader, TxnCommitRecord, }, - txn::{sched_pool::tls_can_enable, scheduler::LAST_CHANGE_TS}, Snapshot, TxnStatus, }; @@ -135,8 +134,7 @@ pub fn check_txn_status_missing_lock( // Insert a Rollback to Write CF in case that a stale prewrite // command is received after a cleanup command. 
- if let Some(mut write) = action.construct_write(ts, overlapped_write) { - update_last_change_for_rollback(reader, &mut write, &primary_key, ts)?; + if let Some(write) = action.construct_write(ts, overlapped_write) { txn.put_write(primary_key, ts, write.as_ref().to_bytes()); } MVCC_CHECK_TXN_STATUS_COUNTER_VEC.rollback.inc(); @@ -170,8 +168,7 @@ pub fn rollback_lock( // Only the primary key of a pessimistic transaction needs to be protected. let protected: bool = is_pessimistic_txn && key.is_encoded_from(&lock.primary); - if let Some(mut write) = make_rollback(reader.start_ts, protected, overlapped_write) { - update_last_change_for_rollback(reader, &mut write, &key, lock.ts)?; + if let Some(write) = make_rollback(reader.start_ts, protected, overlapped_write) { txn.put_write(key.clone(), reader.start_ts, write.as_ref().to_bytes()); } @@ -195,40 +192,6 @@ pub fn collapse_prev_rollback( Ok(()) } -/// Updates the last_change_ts of a new Rollback record. -/// -/// When writing a new Rollback record, we don't always know about the -/// information about the last change. So, we will call `seek_write` again to -/// calculate the last_change_ts. -/// -/// The `seek_write` here is usually cheap because this functions is typically -/// called after `get_txn_commit_record` and `get_txn_commit_record` should have -/// moved the cursor around the record we want. -pub fn update_last_change_for_rollback( - reader: &mut SnapshotReader, - write: &mut Write, - key: &Key, - ts: TimeStamp, -) -> Result<()> { - // Also update the last_change_ts if we are writing an overlapped rollback to a - // LOCK record. Actually, because overlapped rollbacks are rare, it does not - // solve the inaccuracy caused by inserted rollback (and we don't intend it - // because it's uncommon). Just do it when it happens. - if tls_can_enable(LAST_CHANGE_TS) - && (write.write_type == WriteType::Lock || write.write_type == WriteType::Rollback) - { - if let Some((commit_ts, w)) = reader.seek_write(key, ts)? 
{ - // Even with collapsed rollback, the deleted rollbacks will become tombstones - // which we probably need to skip them one by one. That's why we always use - // `next_last_change_info` here to calculate and count them in - // `versions_to_last_change`. - (write.last_change_ts, write.versions_to_last_change) = - w.next_last_change_info(commit_ts); - } - } - Ok(()) -} - /// Generate the Write record that should be written that means to perform a /// specified rollback operation. pub fn make_rollback( diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 71adda7a274..4802535c054 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -8,9 +8,7 @@ use crate::storage::{ lock_manager::LockManager, mvcc::{LockType, MvccTxn, SnapshotReader, TimeStamp, TxnCommitRecord}, txn::{ - actions::check_txn_status::{ - collapse_prev_rollback, make_rollback, update_last_change_for_rollback, - }, + actions::check_txn_status::{collapse_prev_rollback, make_rollback}, commands::{ Command, CommandExt, ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, @@ -121,10 +119,7 @@ impl WriteCommand for CheckSecondaryLocks { } // We must protect this rollback in case this rollback is collapsed and a stale // acquire_pessimistic_lock and prewrite succeed again. 
- if let Some(mut write) = - make_rollback(self.start_ts, true, rollback_overlapped_write) - { - update_last_change_for_rollback(&mut reader, &mut write, &key, self.start_ts)?; + if let Some(write) = make_rollback(self.start_ts, true, rollback_overlapped_write) { txn.put_write(key.clone(), self.start_ts, write.as_ref().to_bytes()); collapse_prev_rollback(&mut txn, &mut reader, &key)?; } @@ -170,20 +165,14 @@ impl WriteCommand for CheckSecondaryLocks { pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; - use tikv_kv::Statistics; use tikv_util::deadline::Deadline; - use txn_types::Mutation; use super::*; use crate::storage::{ kv::TestEngineBuilder, lock_manager::MockLockManager, mvcc::tests::*, - txn::{ - commands::{test_util::prewrite_with_cm, WriteCommand}, - scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, - tests::*, - }, + txn::{commands::WriteCommand, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*}, Engine, }; @@ -354,116 +343,4 @@ pub mod tests { } must_get_overlapped_rollback(&mut engine, b"k1", 15, 13, WriteType::Lock, Some(0)); } - - // The main logic is almost identical to - // test_rollback_calculate_last_change_info of check_txn_status. But the small - // differences about handling lock CF make it difficult to reuse code. - #[test] - fn test_rollback_calculate_last_change_info() { - use pd_client::FeatureGate; - - use crate::storage::txn::sched_pool::set_tls_feature_gate; - - let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - let cm = ConcurrencyManager::new(1.into()); - let k = b"k"; - let mut statistics = Statistics::default(); - - must_prewrite_put(&mut engine, k, b"v1", k, 5); - must_commit(&mut engine, k, 5, 6); - must_prewrite_put(&mut engine, k, b"v2", k, 7); - must_commit(&mut engine, k, 7, 8); - must_prewrite_put(&mut engine, k, b"v3", k, 30); - must_commit(&mut engine, k, 30, 35); - - // TiKV 6.4 should not write last_change_ts. 
- let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.4.0").unwrap(); - set_tls_feature_gate(feature_gate); - must_success(&mut engine, k, 40, SecondaryLocksStatus::RolledBack); - let rollback = must_written(&mut engine, k, 40, 40, WriteType::Rollback); - assert!(rollback.last_change_ts.is_zero()); - assert_eq!(rollback.versions_to_last_change, 0); - - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); - - must_prewrite_put(&mut engine, k, b"v4", k, 45); - must_commit(&mut engine, k, 45, 50); - - // Rollback when there is no lock; prev writes: - // - 50: PUT - must_success(&mut engine, k, 55, SecondaryLocksStatus::RolledBack); - let rollback = must_written(&mut engine, k, 55, 55, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 1); - - // Write a LOCK; prev writes: - // - 55: ROLLBACK - // - 50: PUT - let res = prewrite_with_cm( - &mut engine, - cm, - &mut statistics, - vec![Mutation::make_lock(Key::from_raw(k))], - k.to_vec(), - 60, - Some(70), - ) - .unwrap(); - assert!(!res.one_pc_commit_ts.is_zero()); - let lock_commit_ts = res.one_pc_commit_ts; - let lock = must_written(&mut engine, k, 60, res.one_pc_commit_ts, WriteType::Lock); - assert_eq!(lock.last_change_ts, 50.into()); - assert_eq!(lock.versions_to_last_change, 2); - - // Write another ROLLBACK by rolling back a pessimistic lock; prev writes: - // - 61: LOCK - // - 55: ROLLBACK - // - 50: PUT - must_acquire_pessimistic_lock(&mut engine, k, b"pk", 70, 75); - must_success(&mut engine, k, 70, SecondaryLocksStatus::RolledBack); - let rollback = must_written(&mut engine, k, 70, 70, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 3); - - // last_change_ts should point to the latest record before start_ts; prev - // writes: - // - 8: PUT - must_acquire_pessimistic_lock(&mut 
engine, k, k, 10, 75); - must_success(&mut engine, k, 10, SecondaryLocksStatus::RolledBack); - must_unlocked(&mut engine, k); - let rollback = must_written(&mut engine, k, 10, 10, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 8.into()); - assert_eq!(rollback.versions_to_last_change, 1); - - // Overlapped rollback should not update the last_change_ts of PUT; prev writes: - // - 8: PUT <- rollback overlaps - // - 6: PUT - must_success(&mut engine, k, 8, SecondaryLocksStatus::RolledBack); - let put = must_written(&mut engine, k, 7, 8, WriteType::Put); - assert!(put.last_change_ts.is_zero()); - assert_eq!(put.versions_to_last_change, 0); - assert!(put.has_overlapped_rollback); - - // Overlapped rollback can update the last_change_ts of LOCK; writes: - // - 61: PUT <- rollback overlaps - // - 57: ROLLBACK (inserted later) - // - 55: ROLLBACK - // - 50: PUT - must_rollback(&mut engine, k, 57, true); - let rollback = must_written(&mut engine, k, 57, 57, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 2); - must_success( - &mut engine, - k, - lock_commit_ts, - SecondaryLocksStatus::RolledBack, - ); - let lock = must_written(&mut engine, k, 60, lock_commit_ts, WriteType::Lock); - assert_eq!(lock.last_change_ts, 50.into()); - assert_eq!(lock.versions_to_last_change, 3); - } } diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index a118769a5db..34948109f4b 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -144,9 +144,8 @@ impl WriteCommand for CheckTxnStatus { pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; - use tikv_kv::Statistics; use tikv_util::deadline::Deadline; - use txn_types::{Key, Mutation, WriteType}; + use txn_types::{Key, WriteType}; use super::{TxnStatus::*, *}; use 
crate::storage::{ @@ -154,9 +153,7 @@ pub mod tests { lock_manager::MockLockManager, mvcc::tests::*, txn::{ - commands::{ - pessimistic_rollback, test_util::prewrite_with_cm, WriteCommand, WriteContext, - }, + commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, }, @@ -1169,105 +1166,24 @@ pub mod tests { #[test] fn test_rollback_calculate_last_change_info() { - use pd_client::FeatureGate; - - use crate::storage::txn::sched_pool::set_tls_feature_gate; - let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); - let cm = ConcurrencyManager::new(1.into()); let k = b"k"; - let mut statistics = Statistics::default(); + + // Below is a case explaining why we don't calculate last_change_ts for + // rollback. must_prewrite_put(&mut engine, k, b"v1", k, 5); must_commit(&mut engine, k, 5, 6); + must_prewrite_put(&mut engine, k, b"v2", k, 7); + // When we calculate last_change_ts here, we will get 6. + must_rollback(&mut engine, k, 10, true); + // But we can still commit with ts 8, then the last_change_ts of the rollback + // will be incorrect. must_commit(&mut engine, k, 7, 8); - must_prewrite_put(&mut engine, k, b"v3", k, 30); - must_commit(&mut engine, k, 30, 35); - - // TiKV 6.4 should not write last_change_ts. 
- let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.4.0").unwrap(); - set_tls_feature_gate(feature_gate); - must_rollback(&mut engine, k, 40, true); - let rollback = must_written(&mut engine, k, 40, 40, WriteType::Rollback); - assert!(rollback.last_change_ts.is_zero()); - assert_eq!(rollback.versions_to_last_change, 0); - - let feature_gate = FeatureGate::default(); - feature_gate.set_version("6.5.0").unwrap(); - set_tls_feature_gate(feature_gate); - must_prewrite_put(&mut engine, k, b"v4", k, 45); - must_commit(&mut engine, k, 45, 50); - - // Rollback when there is no lock; prev writes: - // - 50: PUT - must_rollback(&mut engine, k, 55, true); - let rollback = must_written(&mut engine, k, 55, 55, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 1); - - // Write a LOCK; prev writes: - // - 55: ROLLBACK - // - 50: PUT - let res = prewrite_with_cm( - &mut engine, - cm, - &mut statistics, - vec![Mutation::make_lock(Key::from_raw(k))], - k.to_vec(), - 60, - Some(70), - ) - .unwrap(); - assert!(!res.one_pc_commit_ts.is_zero()); - let lock_commit_ts = res.one_pc_commit_ts; - let lock = must_written(&mut engine, k, 60, res.one_pc_commit_ts, WriteType::Lock); - assert_eq!(lock.last_change_ts, 50.into()); - assert_eq!(lock.versions_to_last_change, 2); - - // Write another ROLLBACK; prev writes: - // - 61: LOCK - // - 55: ROLLBACK - // - 50: PUT - must_rollback(&mut engine, k, 70, true); - let rollback = must_written(&mut engine, k, 70, 70, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 3); - - // last_change_ts should point to the latest record before start_ts; prev - // writes: - // - 8: PUT - must_acquire_pessimistic_lock(&mut engine, k, k, 10, 75); - must_pessimistic_prewrite_put(&mut engine, k, b"v5", k, 10, 75, DoPessimisticCheck); - must_rollback(&mut engine, k, 10, true); - must_unlocked(&mut engine, k); 
let rollback = must_written(&mut engine, k, 10, 10, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 8.into()); - assert_eq!(rollback.versions_to_last_change, 1); - - // Overlapped rollback should not update the last_change_ts of PUT; prev writes: - // - 8: PUT <- rollback overlaps - // - 6: PUT - must_rollback(&mut engine, k, 8, true); - let put = must_written(&mut engine, k, 7, 8, WriteType::Put); - assert!(put.last_change_ts.is_zero()); - assert_eq!(put.versions_to_last_change, 0); - assert!(put.has_overlapped_rollback); - - // Overlapped rollback can update the last_change_ts of LOCK; writes: - // - 61: PUT <- rollback overlaps - // - 57: ROLLBACK (inserted later) - // - 55: ROLLBACK - // - 50: PUT - must_rollback(&mut engine, k, 57, true); - let rollback = must_written(&mut engine, k, 57, 57, WriteType::Rollback); - assert_eq!(rollback.last_change_ts, 50.into()); - assert_eq!(rollback.versions_to_last_change, 2); - must_rollback(&mut engine, k, lock_commit_ts, true); - let lock = must_written(&mut engine, k, 60, lock_commit_ts, WriteType::Lock); - assert_eq!(lock.last_change_ts, 50.into()); - assert_eq!(lock.versions_to_last_change, 3); + assert!(rollback.last_change_ts.is_zero()); + assert_eq!(rollback.versions_to_last_change, 0); } } From 3ab299cff9273ac53e9f57b751c3f45e116c5958 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 23 Nov 2022 14:07:58 +0800 Subject: [PATCH 356/676] tikv_kv: make async_snapshot return future (#13836) ref tikv/tikv#13827 so we can reduce allocation and keep compatible with both v1 and v2. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/tikv_kv/src/btree_engine.rs | 11 +- components/tikv_kv/src/lib.rs | 43 +++---- components/tikv_kv/src/mock_engine.rs | 5 +- components/tikv_kv/src/rocksdb_engine.rs | 36 +++--- src/lib.rs | 1 + src/server/gc_worker/gc_worker.rs | 36 +++--- src/server/raftkv.rs | 126 +++++++++++---------- src/storage/mod.rs | 18 ++- tests/benches/misc/raftkv/mod.rs | 10 +- tests/failpoints/cases/test_coprocessor.rs | 4 +- 10 files changed, 144 insertions(+), 146 deletions(-) diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 473b993bf39..45ce6a6ffe8 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -14,6 +14,7 @@ use std::{ use collections::HashMap; use engine_panic::PanicEngine; use engine_traits::{CfName, IterOptions, ReadOptions, CF_DEFAULT, CF_LOCK, CF_WRITE}; +use futures::Future; use kvproto::kvrpcpb::Context; use txn_types::{Key, Value}; @@ -100,15 +101,11 @@ impl Engine for BTreeEngine { Ok(()) } + type SnapshotRes = impl Future> + Send; /// warning: It returns a fake snapshot whose content will be affected by /// the later modifies! - fn async_snapshot( - &mut self, - _ctx: SnapContext<'_>, - cb: EngineCallback, - ) -> EngineResult<()> { - cb(Ok(BTreeEngineSnapshot::new(self))); - Ok(()) + fn async_snapshot(&mut self, _ctx: SnapContext<'_>) -> Self::SnapshotRes { + futures::future::ready(Ok(BTreeEngineSnapshot::new(self))) } } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 3e15b399796..b5f19832419 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -6,6 +6,7 @@ //! [`RocksEngine`](RocksEngine) are used for testing only. 
#![feature(min_specialization)] +#![feature(type_alias_impl_trait)] #[macro_use(fail_point)] extern crate fail; @@ -35,7 +36,7 @@ use engine_traits::{ CF_DEFAULT, CF_LOCK, }; use error_code::{self, ErrorCode, ErrorCodeExt}; -use futures::{future::BoxFuture, prelude::*}; +use futures::{compat::Future01CompatExt, future::BoxFuture, prelude::*}; use into_other::IntoOther; use kvproto::{ errorpb::Error as ErrorHeader, @@ -45,7 +46,7 @@ use kvproto::{ use pd_client::BucketMeta; use raftstore::store::{PessimisticLockPair, TxnExt}; use thiserror::Error; -use tikv_util::{deadline::Deadline, escape, time::ThreadReadId}; +use tikv_util::{deadline::Deadline, escape, time::ThreadReadId, timer::GLOBAL_TIMER_HANDLE}; use tracker::with_tls_tracker; use txn_types::{Key, PessimisticLock, TimeStamp, TxnExtra, Value}; @@ -61,7 +62,7 @@ pub use self::{ }; pub const SEEK_BOUND: u64 = 8; -const DEFAULT_TIMEOUT_SECS: u64 = 5; +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); pub type Callback = Box) + Send>; pub type ExtCallback = Box; @@ -277,7 +278,8 @@ pub trait Engine: Send + Clone + 'static { /// region_modifies records each region's modifications. fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()>; - fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()>; + type SnapshotRes: Future> + Send + 'static; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes; /// Precheck request which has write with it's context. 
fn precheck_write_with_ctx(&self, _ctx: &Context) -> Result<()> { @@ -302,17 +304,21 @@ pub trait Engine: Send + Clone + 'static { } fn write(&self, ctx: &Context, batch: WriteData) -> Result<()> { - let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS); - wait_op!(|cb| self.async_write(ctx, batch, cb), timeout) - .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) + wait_op!(|cb| self.async_write(ctx, batch, cb), DEFAULT_TIMEOUT) + .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT)))) } fn release_snapshot(&mut self) {} fn snapshot(&mut self, ctx: SnapContext<'_>) -> Result { - let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS); - wait_op!(|cb| self.async_snapshot(ctx, cb), timeout) - .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) + let deadline = Instant::now() + DEFAULT_TIMEOUT; + let timeout = GLOBAL_TIMER_HANDLE.delay(deadline).compat(); + futures::executor::block_on(async move { + futures::select! { + res = self.async_snapshot(ctx).fuse() => res, + _ = timeout.fuse() => Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT))), + } + }) } fn put(&self, ctx: &Context, key: Key, value: Value) -> Result<()> { @@ -598,15 +604,10 @@ pub fn snapshot( ctx: SnapContext<'_>, ) -> impl std::future::Future> { let begin = Instant::now(); - let (callback, future) = - tikv_util::future::paired_must_called_future_callback(drop_snapshot_callback::); - let val = engine.async_snapshot(ctx, callback); + let val = engine.async_snapshot(ctx); // make engine not cross yield point async move { - val?; // propagate error - let result = future - .map_err(|cancel| Error::from(ErrorInner::Other(box_err!(cancel)))) - .await?; + let result = val.await; with_tls_tracker(|tracker| { tracker.metrics.get_snapshot_nanos += begin.elapsed().as_nanos() as u64; }); @@ -615,14 +616,6 @@ pub fn snapshot( } } -pub fn drop_snapshot_callback() -> Result { - let bt = backtrace::Backtrace::new(); - warn!("async snapshot callback is dropped"; 
"backtrace" => ?bt); - let mut err = ErrorHeader::default(); - err.set_message("async snapshot callback is dropped".to_string()); - Err(Error::from(ErrorInner::Request(err))) -} - /// Write modifications into a `BaseRocksEngine` instance. pub fn write_modifies(kv_engine: &impl LocalEngine, modifies: Vec) -> Result<()> { fail_point!("rockskv_write_modifies", |_| Err(box_err!("write failed"))); diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index 84605a04084..376c2d1fb1f 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -157,8 +157,9 @@ impl Engine for MockEngine { self.base.modify_on_kv_engine(region_modifies) } - fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()> { - self.base.async_snapshot(ctx, cb) + type SnapshotRes = ::SnapshotRes; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + self.base.async_snapshot(ctx) } fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()> { diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 0ef9b5b274c..8b0dd28646a 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -18,6 +18,7 @@ use engine_traits::{ CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, }; use file_system::IoRateLimiter; +use futures::{channel::oneshot, Future}; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb}; use raftstore::coprocessor::CoprocessorHost; use tempfile::{Builder, TempDir}; @@ -34,7 +35,7 @@ const TEMP_DIR: &str = ""; enum Task { Write(Vec, Callback<()>), - Snapshot(Callback>), + Snapshot(oneshot::Sender>), Pause(Duration), } @@ -56,7 +57,9 @@ impl Runnable for Runner { fn run(&mut self, t: Task) { match t { Task::Write(modifies, cb) => cb(write_modifies(&self.0.kv, modifies)), - Task::Snapshot(cb) => 
cb(Ok(Arc::new(self.0.kv.snapshot()))), + Task::Snapshot(sender) => { + let _ = sender.send(Arc::new(self.0.kv.snapshot())); + } Task::Pause(dur) => std::thread::sleep(dur), } } @@ -253,18 +256,23 @@ impl Engine for RocksEngine { Ok(()) } - fn async_snapshot(&mut self, _: SnapContext<'_>, cb: Callback) -> Result<()> { - fail_point!("rockskv_async_snapshot", |_| Err(box_err!( - "snapshot failed" - ))); - fail_point!("rockskv_async_snapshot_not_leader", |_| { - Err(self.not_leader_error()) - }); - if self.not_leader.load(Ordering::SeqCst) { - return Err(self.not_leader_error()); - } - box_try!(self.sched.schedule(Task::Snapshot(cb))); - Ok(()) + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, _: SnapContext<'_>) -> Self::SnapshotRes { + let res = (|| { + fail_point!("rockskv_async_snapshot", |_| Err(box_err!( + "snapshot failed" + ))); + if self.not_leader.load(Ordering::SeqCst) { + return Err(self.not_leader_error()); + } + let (tx, rx) = oneshot::channel(); + if self.sched.schedule(Task::Snapshot(tx)).is_err() { + return Err(box_err!("failed to schedule snapshot")); + } + Ok(rx) + })(); + + async move { Ok(res?.await.unwrap()) } } } diff --git a/src/lib.rs b/src/lib.rs index f4fcd1cd97c..43d5db81458 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(drain_filter)] #![feature(deadline_api)] #![feature(let_chains)] +#![feature(type_alias_impl_trait)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 9e3f79654bc..9c3c289ecf7 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -1287,6 +1287,7 @@ pub mod test_gc_worker { use collections::HashMap; use engine_rocks::{RocksEngine, RocksSnapshot}; + use futures::Future; use kvproto::{ kvrpcpb::Context, metapb::{Peer, Region}, @@ -1378,22 +1379,16 @@ pub mod test_gc_worker { self.0.async_write(ctx, batch, callback) } - fn async_snapshot( - &mut self, - ctx: 
SnapContext<'_>, - callback: EngineCallback, - ) -> EngineResult<()> { - self.0.async_snapshot( - ctx, - Box::new(move |r| { - callback(r.map(|snap| { - let mut region = Region::default(); - // Add a peer to pass initialized check. - region.mut_peers().push(Peer::default()); - RegionSnapshot::from_snapshot(snap, Arc::new(region)) - })) - }), - ) + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + let f = self.0.async_snapshot(ctx); + async move { + let snap = f.await?; + let mut region = Region::default(); + // Add a peer to pass initialized check. + region.mut_peers().push(Peer::default()); + Ok(RegionSnapshot::from_snapshot(snap, Arc::new(region))) + } } } @@ -1441,18 +1436,15 @@ pub mod test_gc_worker { self.engines.lock().unwrap()[&ctx.region_id].async_write(ctx, batch, callback) } - fn async_snapshot( - &mut self, - ctx: SnapContext<'_>, - callback: EngineCallback, - ) -> EngineResult<()> { + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { let region_id = ctx.pb_ctx.region_id; self.engines .lock() .unwrap() .get_mut(®ion_id) .unwrap() - .async_snapshot(ctx, callback) + .async_snapshot(ctx) } } } diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index 8bef31eaebd..6dc84f951ee 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -15,7 +15,7 @@ use std::{ use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; -use futures::future::BoxFuture; +use futures::{future::BoxFuture, Future}; use kvproto::{ errorpb, kvrpcpb::{Context, IsolationLevel}, @@ -41,7 +41,11 @@ use raftstore::{ }; use thiserror::Error; use tikv_kv::write_modifies; -use tikv_util::{codec::number::NumberEncoder, future::paired_future_callback, time::Instant}; +use tikv_util::{ + codec::number::NumberEncoder, + future::{paired_future_callback, 
paired_must_called_future_callback}, + time::Instant, +}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; @@ -196,6 +200,14 @@ fn exec_admin>( }) } +pub fn drop_snapshot_callback() -> kv::Result { + let bt = backtrace::Backtrace::new(); + warn!("async snapshot callback is dropped"; "backtrace" => ?bt); + let mut err = errorpb::Error::default(); + err.set_message("async snapshot callback is dropped".to_string()); + Err(kv::Error::from(kv::ErrorInner::Request(err))) +} + /// `RaftKv` is a storage engine base on `RaftStore`. #[derive(Clone)] pub struct RaftKv @@ -228,41 +240,6 @@ where self.txn_extra_scheduler = Some(txn_extra_scheduler); } - fn exec_snapshot( - &mut self, - ctx: SnapContext<'_>, - req: Request, - cb: Callback>, - ) -> Result<()> { - let mut header = new_request_header(ctx.pb_ctx); - let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); - flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); - } - if ctx.for_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } - header.set_flags(flags); - - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.set_requests(vec![req].into()); - self.router - .read( - ctx.read_id, - cmd, - StoreCallback::read(Box::new(move |resp| { - cb(on_read_result(resp).map_err(Error::into)); - })), - ) - .map_err(From::from) - } - fn exec_write_requests( &self, ctx: &Context, @@ -462,14 +439,14 @@ where }) } - fn async_snapshot( - &mut self, - mut ctx: SnapContext<'_>, - cb: Callback, - ) -> kv::Result<()> { - fail_point!("raftkv_async_snapshot_err", |_| Err(box_err!( - "injected error for async_snapshot" - ))); + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, mut ctx: SnapContext<'_>) -> Self::SnapshotRes { + let mut res: 
kv::Result<()> = (|| { + fail_point!("raftkv_async_snapshot_err", |_| { + Err(box_err!("injected error for async_snapshot")) + }); + Ok(()) + })(); let mut req = Request::default(); req.set_cmd_type(CmdType::Snap); @@ -481,10 +458,46 @@ where } ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); let begin_instant = Instant::now_coarse(); - self.exec_snapshot( - ctx, - req, - Box::new(move |res| match res { + let (cb, f) = paired_must_called_future_callback(drop_snapshot_callback); + + let mut header = new_request_header(ctx.pb_ctx); + let mut flags = 0; + if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { + let mut data = [0u8; 8]; + (&mut data[..]) + .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) + .unwrap(); + flags |= WriteBatchFlags::STALE_READ.bits(); + header.set_flag_data(data.into()); + } + if ctx.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(vec![req].into()); + if res.is_ok() { + res = self + .router + .read( + ctx.read_id, + cmd, + StoreCallback::read(Box::new(move |resp| { + cb(on_read_result(resp).map_err(Error::into)); + })), + ) + .map_err(kv::Error::from); + } + async move { + // It's impossible to return cancel because the callback will be invoked if it's + // destroyed. 
+ let res = match res { + Ok(()) => f.await.unwrap(), + Err(e) => Err(e), + }; + match res { Ok(CmdRes::Resp(mut r)) => { let e = if r .get(0) @@ -496,27 +509,22 @@ where } else { invalid_resp_type(CmdType::Snap, r[0].get_cmd_type()).into() }; - cb(Err(e)) + Err(e) } Ok(CmdRes::Snap(s)) => { ASYNC_REQUESTS_DURATIONS_VEC .snapshot .observe(begin_instant.saturating_elapsed_secs()); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); - cb(Ok(s)) + Ok(s) } Err(e) => { let status_kind = get_status_kind_from_engine_error(&e); ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); - cb(Err(e)) + Err(e) } - }), - ) - .map_err(|e| { - let status_kind = get_status_kind_from_error(&e); - ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); - e.into() - }) + } + } } fn release_snapshot(&mut self) { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 2032ffd86ae..55d8575101c 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -2983,18 +2983,14 @@ impl Engine for TxnTestEngine { self.engine.modify_on_kv_engine(region_modifies) } - fn async_snapshot( - &mut self, - ctx: SnapContext<'_>, - cb: tikv_kv::Callback, - ) -> tikv_kv::Result<()> { + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { let txn_ext = self.txn_ext.clone(); - self.engine.async_snapshot( - ctx, - Box::new(move |snapshot| { - cb(snapshot.map(|snapshot| TxnTestSnapshot { snapshot, txn_ext })) - }), - ) + let f = self.engine.async_snapshot(ctx); + async move { + let snapshot = f.await?; + Ok(TxnTestSnapshot { snapshot, txn_ext }) + } } fn async_write( diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index a949570ebe1..bc4786ae73e 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -6,6 +6,7 @@ use collections::HashSet; use crossbeam::channel::TrySendError; use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_traits::{KvEngine, ALL_CFS, CF_DEFAULT}; 
+use futures::future::FutureExt; use kvproto::{ kvrpcpb::{Context, ExtraOp as TxnExtraOp}, metapb::Region, @@ -191,14 +192,15 @@ fn bench_async_snapshot(b: &mut test::Bencher) { ctx.set_region_epoch(region.get_region_epoch().clone()); ctx.set_peer(leader); b.iter(|| { - let on_finished: EngineCallback> = Box::new(move |results| { - let _ = test::black_box(results); - }); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() }; - kv.async_snapshot(snap_ctx, on_finished).unwrap(); + let f = kv.async_snapshot(snap_ctx); + let res = f.map(|res| { + let _ = test::black_box(res); + }); + let _ = test::black_box(res); }); } diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 481e533a879..c515b8d66cb 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -138,10 +138,10 @@ fn test_snapshot_failed() { #[test] fn test_snapshot_failed_2() { let product = ProductTable::new(); - let (_, endpoint) = init_with_data(&product, &[]); + let (store, endpoint) = init_with_data(&product, &[]); let req = DagSelect::from(&product).build(); - fail::cfg("rockskv_async_snapshot_not_leader", "return()").unwrap(); + store.get_engine().trigger_not_leader(); let resp = handle_request(&endpoint, req); assert!(resp.get_region_error().has_not_leader()); From 970f5623672ae13c411092f9f208a0674dfd9ce8 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Wed, 23 Nov 2022 20:23:58 +0800 Subject: [PATCH 357/676] mvcc: fix get_write may return Delete when skipping versions (#13840) close tikv/tikv#13839 The mvcc get_write methods should return None if the found record is a Delete. This constraint is broken after the introduction of skipping locks, causing panic when reading values. Now, it's changed to use the same logic in the loop (just like the PointGetter and the ForwardScanner) to avoid the problem. 
Signed-off-by: Yilin Chen --- src/storage/mvcc/reader/reader.rs | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index a6aae85761f..dd6bff6a157 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -367,8 +367,9 @@ impl MvccReader { mut ts: TimeStamp, gc_fence_limit: Option, ) -> Result> { + let mut seek_res = self.seek_write(key, ts)?; loop { - match self.seek_write(key, ts)? { + match seek_res { Some((commit_ts, write)) => { if let Some(limit) = gc_fence_limit { if !write.as_ref().check_gc_fence_as_latest_version(limit) { @@ -404,13 +405,15 @@ impl MvccReader { commit_ts, write.write_type, ); - return Ok(Some((write, commit_ts))); + seek_res = Some((commit_ts, write)); + continue; } } } } None => return Ok(None), } + seek_res = self.seek_write(key, ts)?; } } @@ -2540,11 +2543,20 @@ pub mod tests { engine.prewrite(m, k, 1); engine.commit(k, 1, 2); - // Write enough ROLLBACK/LOCK recrods + // Write enough LOCK recrods for start_ts in (6..30).into_iter().step_by(2) { engine.lock(k, start_ts, start_ts + 1); } + let m = Mutation::make_delete(Key::from_raw(k)); + engine.prewrite(m, k, 45); + engine.commit(k, 45, 46); + + // Write enough LOCK recrods + for start_ts in (50..80).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + let snap = RegionSnapshot::::from_raw(db, region); let mut reader = MvccReader::new(snap, None, false); @@ -2567,5 +2579,17 @@ pub mod tests { // instead of calling a series of next, so the count of next should be 0 instead assert_eq!(reader.statistics.write.next, 0); assert_eq!(reader.statistics.write.get, 1); + + // Clear statistics first + reader.statistics = Statistics::default(); + let res = reader + .get_write_with_commit_ts(&key, 80.into(), None) + .unwrap(); + // If the type is Delete, get_write_with_commit_ts should return None. 
+ assert!(res.is_none()); + // versions_to_last_change should be large enough to trigger a second get + // instead of calling a series of next, so the count of next should be 0 instead + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 1); } } From 213e5020c96404f0a5d56fc45f37c22359c7a1f9 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 24 Nov 2022 11:27:59 +0800 Subject: [PATCH 358/676] raftstore-v2: pd worker (#13773) ref tikv/tikv#12842 Signed-off-by: tabokie Co-authored-by: SpadeA-Tang --- Cargo.lock | 6 + components/raftstore-v2/Cargo.toml | 6 + components/raftstore-v2/src/batch/store.rs | 52 ++- components/raftstore-v2/src/fsm/peer.rs | 2 +- components/raftstore-v2/src/fsm/store.rs | 42 ++- components/raftstore-v2/src/lib.rs | 2 + .../operation/command/admin/conf_change.rs | 40 ++- .../src/operation/command/admin/split.rs | 23 +- .../raftstore-v2/src/operation/command/mod.rs | 3 +- components/raftstore-v2/src/operation/life.rs | 4 +- components/raftstore-v2/src/operation/mod.rs | 1 + components/raftstore-v2/src/operation/pd.rs | 230 ++++++++++++ .../raftstore-v2/src/operation/ready/mod.rs | 18 +- components/raftstore-v2/src/raft/peer.rs | 92 ++++- components/raftstore-v2/src/worker/mod.rs | 5 + components/raftstore-v2/src/worker/pd/mod.rs | 327 ++++++++++++++++++ .../src/worker/pd/region_heartbeat.rs | 256 ++++++++++++++ .../raftstore-v2/src/worker/pd/split.rs | 99 ++++++ .../src/worker/pd/store_heartbeat.rs | 293 ++++++++++++++++ .../src/worker/pd/update_max_timestamp.rs | 114 ++++++ .../tests/integrations/cluster.rs | 30 +- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../tests/integrations/test_pd_heartbeat.rs | 60 ++++ .../tests/integrations/test_split.rs | 2 +- components/test_pd/src/mocker/service.rs | 27 +- 25 files changed, 1682 insertions(+), 53 deletions(-) create mode 100644 components/raftstore-v2/src/operation/pd.rs create mode 100644 components/raftstore-v2/src/worker/mod.rs create mode 100644 
components/raftstore-v2/src/worker/pd/mod.rs create mode 100644 components/raftstore-v2/src/worker/pd/region_heartbeat.rs create mode 100644 components/raftstore-v2/src/worker/pd/split.rs create mode 100644 components/raftstore-v2/src/worker/pd/store_heartbeat.rs create mode 100644 components/raftstore-v2/src/worker/pd/update_max_timestamp.rs create mode 100644 components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs diff --git a/Cargo.lock b/Cargo.lock index 7425528342d..487d2712249 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4282,22 +4282,27 @@ name = "raftstore-v2" version = "0.1.0" dependencies = [ "batch-system", + "causal_ts", "collections", + "concurrency_manager", "crossbeam", "engine_test", "engine_traits", "error_code", "fail", "file_system", + "fs2", "futures 0.3.15", "keys", "kvproto", "log_wrappers", "pd_client", + "prometheus", "protobuf", "raft", "raft-proto", "raftstore", + "resource_metering", "slog", "slog-global", "smallvec", @@ -4308,6 +4313,7 @@ dependencies = [ "time", "tracker", "txn_types", + "yatp", ] [[package]] diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 8bb91b40bb9..1679732ccda 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -30,27 +30,33 @@ cloud-azure = ["raftstore/cloud-azure"] [dependencies] batch-system = { workspace = true } +causal_ts = { workspace = true } collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } +fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } keys = { workspace = true } kvproto = { git = "https://github.com/pingcap/kvproto.git" } log_wrappers = { workspace = true } pd_client = { workspace = true } +prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", 
default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { workspace = true } +resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } txn_types = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] engine_test = { workspace = true } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 605bbb95131..1eea2017571 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -3,14 +3,19 @@ use std::{ ops::{Deref, DerefMut}, path::Path, - sync::{Arc, Mutex}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, + }, time::Duration, }; use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, }; +use causal_ts::CausalTsProviderImpl; use collections::HashMap; +use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{Sender, TrySendError}; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; use file_system::{set_io_type, IoType}; @@ -19,6 +24,7 @@ use kvproto::{ metapb::Store, raft_serverpb::{PeerState, RaftMessage}, }; +use pd_client::PdClient; use raft::INVALID_ID; use raftstore::store::{ fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, @@ -43,6 +49,7 @@ use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, + worker::{PdRunner, PdTask}, Error, Result, }; @@ -70,6 +77,7 @@ pub struct StoreContext { pub tablet_factory: Arc>, pub apply_pool: FuturePool, pub read_scheduler: Scheduler>, + pub pd_scheduler: Scheduler, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. 
@@ -217,6 +225,7 @@ struct StorePollerBuilder { trans: T, router: StoreRouter, read_scheduler: Scheduler>, + pd_scheduler: Scheduler, write_senders: WriteSenders, apply_pool: FuturePool, logger: Logger, @@ -232,6 +241,7 @@ impl StorePollerBuilder { trans: T, router: StoreRouter, read_scheduler: Scheduler>, + pd_scheduler: Scheduler, store_writers: &mut StoreWriters, logger: Logger, store_meta: Arc>>, @@ -254,6 +264,7 @@ impl StorePollerBuilder { trans, router, read_scheduler, + pd_scheduler, apply_pool, logger, write_senders: store_writers.senders(), @@ -330,6 +341,7 @@ where tablet_factory: self.tablet_factory.clone(), apply_pool: self.apply_pool.clone(), read_scheduler: self.read_scheduler.clone(), + pd_scheduler: self.pd_scheduler.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) @@ -341,6 +353,7 @@ where struct Workers { /// Worker for fetching raft logs asynchronously async_read_worker: Worker, + pd_worker: Worker, store_writers: StoreWriters, } @@ -348,6 +361,7 @@ impl Default for Workers { fn default() -> Self { Self { async_read_worker: Worker::new("async-read-worker"), + pd_worker: Worker::new("pd-worker"), store_writers: StoreWriters::default(), } } @@ -358,23 +372,36 @@ pub struct StoreSystem { system: BatchSystem, StoreFsm>, workers: Option>, logger: Logger, + shutdown: Arc, } impl StoreSystem { - pub fn start( + pub fn start( &mut self, store_id: u64, cfg: Arc>, raft_engine: ER, tablet_factory: Arc>, trans: T, + pd_client: Arc, router: &StoreRouter, store_meta: Arc>>, snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> where T: Transport + 'static, + C: PdClient + 'static, { + let sync_router = Mutex::new(router.clone()); + pd_client.handle_reconnect(move || { + sync_router + .lock() + .unwrap() + .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); + }); + let mut workers = 
Workers::default(); workers .store_writers @@ -386,6 +413,22 @@ impl StoreSystem { .async_read_worker .start("async-read-worker", read_runner); + let pd_scheduler = workers.pd_worker.start( + "pd-worker", + PdRunner::new( + store_id, + pd_client, + raft_engine.clone(), + tablet_factory.clone(), + router.clone(), + workers.pd_worker.remote(), + concurrency_manager, + causal_ts_provider, + self.logger.clone(), + self.shutdown.clone(), + ), + ); + let mut builder = StorePollerBuilder::new( cfg.clone(), store_id, @@ -394,6 +437,7 @@ impl StoreSystem { trans, router.clone(), read_scheduler, + pd_scheduler, &mut workers.store_writers, self.logger.clone(), store_meta.clone(), @@ -433,6 +477,8 @@ impl StoreSystem { } pub fn shutdown(&mut self) { + self.shutdown.store(true, Ordering::Relaxed); + if self.workers.is_none() { return; } @@ -444,6 +490,7 @@ impl StoreSystem { workers.store_writers.shutdown(); workers.async_read_worker.stop(); + workers.pd_worker.stop(); } } @@ -520,6 +567,7 @@ where system, workers: None, logger: logger.clone(), + shutdown: Arc::new(AtomicBool::new(false)), }; (StoreRouter { router, logger }, system) } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 6fac2d88db0..cd93463a524 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -193,9 +193,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn on_tick(&mut self, tick: PeerTick) { match tick { PeerTick::Raft => self.on_raft_tick(), + PeerTick::PdHeartbeat => self.on_pd_heartbeat(), PeerTick::RaftLogGc => unimplemented!(), PeerTick::SplitRegionCheck => unimplemented!(), - PeerTick::PdHeartbeat => unimplemented!(), PeerTick::CheckMerge => unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => unimplemented!(), diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 
0d390d5b51d..546ec95a604 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,17 +1,22 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; +use futures::{compat::Future01CompatExt, FutureExt}; use kvproto::{metapb::Region, raft_serverpb::RaftMessage}; use raftstore::{ coprocessor::RegionChangeReason, store::{Config, ReadDelegate, RegionReadProgressRegistry}, }; -use slog::{o, Logger}; -use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; +use slog::{info, o, Logger}; +use tikv_util::{ + future::poll_future_notify, + is_zero_duration, + mpsc::{self, LooseBoundedSender, Receiver}, +}; use crate::{ batch::StoreContext, @@ -82,7 +87,7 @@ impl Store { } pub struct StoreFsm { - store: Store, + pub store: Store, receiver: Receiver, } @@ -126,8 +131,8 @@ impl Fsm for StoreFsm { } pub struct StoreFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { - fsm: &'a mut StoreFsm, - store_ctx: &'a mut StoreContext, + pub fsm: &'a mut StoreFsm, + pub store_ctx: &'a mut StoreContext, } impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { @@ -145,10 +150,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .duration_since(SystemTime::UNIX_EPOCH) .map_or(0, |d| d.as_secs()), ); + + self.on_pd_store_heartbeat(); + } + + pub fn schedule_tick(&mut self, tick: StoreTick, timeout: Duration) { + if !is_zero_duration(&timeout) { + let mb = self.store_ctx.router.control_mailbox(); + let logger = self.fsm.store.logger().clone(); + let delay = self.store_ctx.timer.delay(timeout).compat().map(move |_| { + if let Err(e) = mb.force_send(StoreMsg::Tick(tick)) { + info!( + logger, + "failed to schedule store tick, are we shutting down?"; + "tick" => ?tick, + "err" => ?e + ); + } + }); + poll_future_notify(delay); + } 
} fn on_tick(&mut self, tick: StoreTick) { - unimplemented!() + match tick { + StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat(), + _ => unimplemented!(), + } } pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) { diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 15dd6b4afc1..7dea9d55901 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -24,6 +24,7 @@ #![allow(unused)] #![feature(let_chains)] #![feature(array_windows)] +#![feature(div_duration)] mod batch; mod bootstrap; @@ -32,6 +33,7 @@ mod operation; mod raft; pub mod router; mod tablet; +mod worker; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 8b4b7fe293f..69e318c3a2e 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -7,6 +7,8 @@ //! - Apply after conf change is committed //! - Update raft state using the result of conf change +use std::time::Instant; + use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ @@ -39,12 +41,12 @@ use crate::{ #[derive(Default, Debug)] pub struct ConfChangeResult { pub index: u64, - // The proposed ConfChangeV2 or (legacy) ConfChange - // ConfChange (if it is) will convert to ConfChangeV2 + // The proposed ConfChangeV2 or (legacy) ConfChange. + // ConfChange (if it is) will be converted to ConfChangeV2. pub conf_change: ConfChangeV2, // The change peer requests come along with ConfChangeV2 - // or (legacy) ConfChange, for ConfChange, it only contains - // one element + // or (legacy) ConfChange. For ConfChange, it only contains + // one element. 
pub changes: Vec, pub region_state: RegionLocalState, } @@ -127,7 +129,11 @@ impl Peer { Ok(proposal_index) } - pub fn on_apply_res_conf_change(&mut self, conf_change: ConfChangeResult) { + pub fn on_apply_res_conf_change( + &mut self, + ctx: &mut StoreContext, + conf_change: ConfChangeResult, + ) { // TODO: cancel generating snapshot. // Snapshot is applied in memory without waiting for all entries being @@ -150,6 +156,7 @@ impl Peer { "notify pd with change peer region"; "region" => ?self.region(), ); + self.region_heartbeat_pd(ctx); let demote_self = tikv_util::store::is_learner(self.peer()); if remove_self || demote_self { warn!(self.logger, "removing or demoting leader"; "remove" => remove_self, "demote" => demote_self); @@ -157,12 +164,23 @@ impl Peer { self.raft_group_mut() .raft .become_follower(term, raft::INVALID_ID); - } else if conf_change.changes.iter().any(|c| { - matches!( - c.get_change_type(), - ConfChangeType::AddNode | ConfChangeType::AddLearnerNode - ) - }) { + } + let mut has_new_peer = None; + for c in conf_change.changes { + let peer_id = c.get_peer().get_id(); + match c.get_change_type() { + ConfChangeType::AddNode | ConfChangeType::AddLearnerNode => { + if has_new_peer.is_none() { + has_new_peer = Some(Instant::now()); + } + self.add_peer_heartbeat(peer_id, has_new_peer.unwrap()); + } + ConfChangeType::RemoveNode => { + self.remove_peer_heartbeat(peer_id); + } + } + } + if has_new_peer.is_some() { // Speed up snapshot instead of waiting another heartbeat. 
self.raft_group_mut().ping(); self.set_has_ready(); diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 2e43e69b44c..2782b436439 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -305,6 +305,21 @@ impl Peer { self.post_split(); + if self.is_leader() { + self.region_heartbeat_pd(store_ctx); + // Notify pd immediately to let it update the region meta. + info!( + self.logger, + "notify pd with split"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "split_count" => regions.len(), + ); + // Now pd only uses ReportBatchSplit for history operation show, + // so we send it independently here. + self.report_batch_split_pd(store_ctx, regions.to_vec()); + } + let last_region_id = regions.last().unwrap().get_id(); for (new_region, locks) in regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); @@ -397,7 +412,7 @@ impl Peer { } self.set_raft_group(raft_group); } else { - // todo: when reaching here (peer is initalized before and cannot be replaced), + // TODO: when reaching here (peer is initalized before and cannot be replaced), // it is much complexer. return; } @@ -411,7 +426,7 @@ impl Peer { "region" => ?split_init.region, ); - // todo: GlobalReplicationState + // TODO: GlobalReplicationState for p in split_init.region.get_peers() { self.insert_peer_cache(p.clone()); @@ -425,7 +440,7 @@ impl Peer { *self.txn_ext().pessimistic_locks.write() = split_init.locks; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
- self.heartbeat_pd(store_ctx); + self.region_heartbeat_pd(store_ctx); } meta.tablet_caches.insert(region_id, self.tablet().clone()); @@ -436,7 +451,7 @@ impl Peer { } if split_init.check_split { - // todo: check if the last region needs to split again + // TODO: check if the last region needs to split again } self.schedule_apply_fsm(store_ctx); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 5d308986229..7e69a3f1c7c 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -299,14 +299,13 @@ impl Peer { for admin_res in apply_res.admin_result { match admin_res { AdminCmdResult::ConfChange(conf_change) => { - self.on_apply_res_conf_change(conf_change) + self.on_apply_res_conf_change(ctx, conf_change) } AdminCmdResult::SplitRegion(SplitResult { regions, derived_index, tablet_index, }) => self.on_ready_split_region(ctx, derived_index, tablet_index, regions), - AdminCmdResult::SplitRegion(_) => unimplemented!(), } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 58628637159..60884f63b03 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -280,7 +280,7 @@ impl Peer { /// are split. It's a waste to use snapshot to restore newly split /// tablet. #[inline] - pub fn postpond_destroy(&self) -> bool { + pub fn postponed_destroy(&self) -> bool { let entry_storage = self.storage().entry_storage(); // TODO: check actual split index instead of commit index. entry_storage.applied_index() != entry_storage.commit_index() @@ -293,7 +293,7 @@ impl Peer { /// memory states. 
pub fn start_destroy(&mut self, write_task: &mut WriteTask) { let entry_storage = self.storage().entry_storage(); - if self.postpond_destroy() { + if self.postponed_destroy() { return; } let first_index = entry_storage.first_index(); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 84835231398..7df897f2b26 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -2,6 +2,7 @@ mod command; mod life; +mod pd; mod query; mod ready; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs new file mode 100644 index 00000000000..659fab00754 --- /dev/null +++ b/components/raftstore-v2/src/operation/pd.rs @@ -0,0 +1,230 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the interactions with pd. + +use std::cmp; + +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::{metapb, pdpb}; +use raftstore::store::Transport; +use slog::error; +use tikv_util::time::InstantExt; + +use crate::{ + batch::StoreContext, + fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, + raft::Peer, + router::{PeerTick, StoreTick}, + worker::{PdRegionHeartbeatTask, PdTask}, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_store_heartbeat(&mut self) { + self.fsm.store.store_heartbeat_pd(self.store_ctx); + self.schedule_tick( + StoreTick::PdStoreHeartbeat, + self.store_ctx.cfg.pd_store_heartbeat_tick_interval.0, + ); + } +} + +impl Store { + pub fn store_heartbeat_pd(&self, ctx: &StoreContext) + where + EK: KvEngine, + ER: RaftEngine, + { + let mut stats = pdpb::StoreStats::default(); + + stats.set_store_id(self.store_id()); + { + let meta = ctx.store_meta.lock().unwrap(); + stats.set_region_count(meta.tablet_caches.len() as u32); + } + + stats.set_sending_snap_count(0); + stats.set_receiving_snap_count(0); + 
+ stats.set_start_time(self.start_time().unwrap() as u32); + + stats.set_bytes_written(0); + stats.set_keys_written(0); + stats.set_is_busy(false); + + // stats.set_query_stats(query_stats); + + let task = PdTask::StoreHeartbeat { stats }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!(self.logger(), "notify pd failed"; + "store_id" => self.store_id(), + "err" => ?e + ); + } + } +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_heartbeat(&mut self) { + self.fsm.peer_mut().update_peer_statistics(); + if self.fsm.peer().is_leader() { + self.fsm.peer_mut().region_heartbeat_pd(self.store_ctx); + } + // TODO: hibernate region + self.schedule_tick(PeerTick::PdHeartbeat); + } +} + +impl Peer { + #[inline] + pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { + let task = PdTask::RegionHeartbeat(PdRegionHeartbeatTask { + term: self.term(), + region: self.region().clone(), + down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + peer: self.peer().clone(), + pending_peers: self.collect_pending_peers(ctx), + written_bytes: self.self_stat().written_bytes, + written_keys: self.self_stat().written_keys, + approximate_size: None, + approximate_keys: None, + wait_data_peers: Vec::new(), + }); + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => ?e, + ); + return; + } + fail_point!("schedule_check_split"); + } + + /// Collects all pending peers and update `peers_start_pending_time`. 
+ fn collect_pending_peers(&self, ctx: &StoreContext) -> Vec { + let mut pending_peers = Vec::with_capacity(self.region().get_peers().len()); + let status = self.raft_group().status(); + let truncated_idx = self + .storage() + .apply_state() + .get_truncated_state() + .get_index(); + + if status.progress.is_none() { + return pending_peers; + } + + // TODO: update `peers_start_pending_time`. + + let progresses = status.progress.unwrap().iter(); + for (&id, progress) in progresses { + if id == self.peer_id() { + continue; + } + // The `matched` is 0 only in these two cases: + // 1. Current leader hasn't communicated with this peer. + // 2. This peer does not exist yet(maybe it is created but not initialized) + // + // The correctness of region merge depends on the fact that all target peers + // must exist during merging. (PD rely on `pending_peers` to check whether all + // target peers exist) + // + // So if the `matched` is 0, it must be a pending peer. + // It can be ensured because `truncated_index` must be greater than + // `RAFT_INIT_LOG_INDEX`(5). 
+ if progress.matched < truncated_idx { + if let Some(p) = self.peer_from_cache(id) { + pending_peers.push(p); + } else { + if ctx.cfg.dev_assert { + panic!( + "{:?} failed to get peer {} from cache", + self.logger.list(), + id + ); + } + error!( + self.logger, + "failed to get peer from cache"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "get_peer_id" => id, + ); + } + } + } + pending_peers + } + + #[inline] + pub fn destroy_peer_pd(&self, ctx: &StoreContext) { + let task = PdTask::DestroyPeer { + region_id: self.region_id(), + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with DestroyPeer"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => %e, + ); + } + } + + #[inline] + pub fn ask_batch_split_pd(&self, ctx: &StoreContext, split_keys: Vec>) { + let task = PdTask::AskBatchSplit { + region: self.region().clone(), + split_keys, + peer: self.peer().clone(), + right_derive: ctx.cfg.right_derive_when_split, + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with AskBatchSplit"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => %e, + ); + } + } + + #[inline] + pub fn report_batch_split_pd( + &self, + ctx: &StoreContext, + regions: Vec, + ) { + let task = PdTask::ReportBatchSplit { regions }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with ReportBatchSplit"; + "err" => %e, + ); + } + } + + #[inline] + pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { + let task = PdTask::UpdateMaxTimestamp { + region_id: self.region_id(), + initial_status, + txn_ext: self.txn_ext().clone(), + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with UpdateMaxTimestamp"; + "err" => %e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/mod.rs 
b/components/raftstore-v2/src/operation/ready/mod.rs index 6f6866b9671..9e639f233cc 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -20,13 +20,13 @@ mod async_writer; mod snapshot; -use std::cmp; +use std::{cmp, time::Instant}; use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; -use raft::{eraftpb, Ready, StateRole}; +use raft::{eraftpb, Ready, StateRole, INVALID_ID}; use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; @@ -41,6 +41,7 @@ use crate::{ raft::{Peer, Storage}, router::{ApplyTask, PeerTick}, }; + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { /// Raft relies on periodic ticks to keep the state machine sync with other /// peers. @@ -111,7 +112,11 @@ impl Peer { } // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. - self.insert_peer_cache(msg.take_from_peer()); + let from_peer = msg.take_from_peer(); + if self.is_leader() && from_peer.get_id() != INVALID_ID { + self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); + } + self.insert_peer_cache(from_peer); if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); } @@ -271,7 +276,7 @@ impl Peer { } ctx.has_ready = true; - if !self.raft_group().has_ready() && (self.serving() || self.postpond_destroy()) { + if !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -443,8 +448,13 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); + // A more recent read may happen on the old leader. 
So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx); // Exit entry cache warmup state when the peer becomes leader. self.entry_storage_mut().clear_entry_cache_warmup_state(); + + self.region_heartbeat_pd(ctx); } StateRole::Follower => { self.leader_lease_mut().expire(); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 6ebb3ed2056..a9730a036e7 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,11 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{mem, sync::Arc}; +use std::{ + mem, + sync::{atomic::Ordering, Arc}, + time::{Duration, Instant}, +}; use collections::HashMap; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_serverpb::RegionLocalState}; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ @@ -35,6 +39,7 @@ use crate::{ operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, tablet::CachedTablet, + worker::PdTask, Result, }; @@ -44,10 +49,16 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, + + /// Statistics for self. + self_stat: PeerStat, + /// We use a cache for looking up peers. Not all peers exist in region's /// peer list, for example, an isolated peer may need to send/receive /// messages with unknown peers after recovery. peer_cache: Vec, + /// Statistics for other peers, only maintained when self is the leader. + peer_heartbeats: HashMap, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
@@ -123,7 +134,9 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet, + self_stat: PeerStat::default(), peer_cache: vec![], + peer_heartbeats: HashMap::default(), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -229,7 +242,7 @@ impl Peer { pessimistic_locks.version = self.region().get_region_epoch().get_version(); } - // todo: CoprocessorHost + // TODO: CoprocessorHost } #[inline] @@ -317,6 +330,11 @@ impl Peer { self.raft_group = raft_group; } + #[inline] + pub fn self_stat(&self) -> &PeerStat { + &self.self_stat + } + /// Mark the peer has a ready so it will be checked at the end of every /// processing round. #[inline] @@ -364,6 +382,57 @@ impl Peer { .cloned() } + #[inline] + pub fn update_peer_statistics(&mut self) { + if !self.is_leader() { + self.peer_heartbeats.clear(); + return; + } + + if self.peer_heartbeats.len() == self.region().get_peers().len() { + return; + } + + // Insert heartbeats in case that some peers never response heartbeats. 
+ let region = self.raft_group.store().region(); + for peer in region.get_peers() { + self.peer_heartbeats + .entry(peer.get_id()) + .or_insert_with(Instant::now); + } + } + + #[inline] + pub fn add_peer_heartbeat(&mut self, peer_id: u64, now: Instant) { + self.peer_heartbeats.insert(peer_id, now); + } + + #[inline] + pub fn remove_peer_heartbeat(&mut self, peer_id: u64) { + self.peer_heartbeats.remove(&peer_id); + } + + pub fn collect_down_peers(&self, max_duration: Duration) -> Vec { + let mut down_peers = Vec::new(); + let now = Instant::now(); + for p in self.region().get_peers() { + if p.get_id() == self.peer_id() { + continue; + } + if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { + let elapsed = instant.saturating_duration_since(now); + if elapsed >= max_duration { + let mut stats = pdpb::PeerStats::default(); + stats.set_peer(p.clone()); + stats.set_down_seconds(elapsed.as_secs()); + down_peers.push(stats); + } + } + } + // TODO: `refill_disk_full_peers` + down_peers + } + #[inline] pub fn is_leader(&self) -> bool { self.raft_group.raft.state == StateRole::Leader @@ -486,10 +555,6 @@ impl Peer { &self.txn_ext } - pub fn heartbeat_pd(&self, store_ctx: &StoreContext) { - // todo - } - pub fn generate_read_delegate(&self) -> ReadDelegate { let peer_id = self.peer().get_id(); @@ -522,4 +587,17 @@ impl Peer { self.proposal_control .advance_apply(apply_index, term, region); } + + // TODO: find a better place to put all txn related stuff. 
+ pub fn require_updating_max_ts(&self, ctx: &StoreContext) { + let epoch = self.region().get_region_epoch(); + let term_low_bits = self.term() & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.txn_ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + + self.update_max_timestamp_pd(ctx, initial_status); + } } diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs new file mode 100644 index 00000000000..ad8249d22a4 --- /dev/null +++ b/components/raftstore-v2/src/worker/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod pd; + +pub use pd::{RegionHeartbeatTask as PdRegionHeartbeatTask, Runner as PdRunner, Task as PdTask}; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs new file mode 100644 index 00000000000..132678e21f2 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -0,0 +1,327 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt::{self, Display, Formatter}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use causal_ts::CausalTsProviderImpl; +use collections::HashMap; +use concurrency_manager::ConcurrencyManager; +use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use kvproto::{metapb, pdpb}; +use pd_client::PdClient; +use raftstore::store::{util::KeysInfoFormatter, TxnExt}; +use slog::{error, info, Logger}; +use tikv_util::{time::UnixSecs, worker::Runnable}; +use yatp::{task::future::TaskCell, Remote}; + +use crate::{batch::StoreRouter, router::PeerMsg}; + +mod region_heartbeat; +mod split; +mod store_heartbeat; +mod update_max_timestamp; + +pub use region_heartbeat::RegionHeartbeatTask; + +pub enum Task { + RegionHeartbeat(RegionHeartbeatTask), + StoreHeartbeat { + stats: pdpb::StoreStats, + // TODO: StoreReport, StoreDrAutoSyncStatus + }, + DestroyPeer { + region_id: u64, + }, + AskBatchSplit { + region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + }, + ReportBatchSplit { + regions: Vec, + }, + UpdateMaxTimestamp { + region_id: u64, + initial_status: u64, + txn_ext: Arc, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::RegionHeartbeat(ref hb_task) => write!( + f, + "region heartbeat for region {:?}, leader {}", + hb_task.region, + hb_task.peer.get_id(), + ), + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {:?}", stats) + } + Task::DestroyPeer { ref region_id } => { + write!(f, "destroy peer of region {}", region_id) + } + Task::AskBatchSplit { + ref region, + ref split_keys, + .. + } => write!( + f, + "ask split region {} with {}", + region.get_id(), + KeysInfoFormatter(split_keys.iter()) + ), + Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::UpdateMaxTimestamp { region_id, .. 
} => write!( + f, + "update the max timestamp for region {} in the concurrency manager", + region_id + ), + } + } +} + +pub struct Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_factory: Arc>, + router: StoreRouter, + + remote: Remote, + + region_peers: HashMap, + + // For store_heartbeat. + start_ts: UnixSecs, + store_stat: store_heartbeat::StoreStat, + + // For region_heartbeat. + region_cpu_records: HashMap, + is_hb_receiver_scheduled: bool, + + // For update_max_timestamp. + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, + + logger: Logger, + shutdown: Arc, +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn new( + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_factory: Arc>, + router: StoreRouter, + remote: Remote, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + logger: Logger, + shutdown: Arc, + ) -> Self { + Self { + store_id, + pd_client, + raft_engine, + tablet_factory, + router, + remote, + region_peers: HashMap::default(), + start_ts: UnixSecs::zero(), + store_stat: store_heartbeat::StoreStat::default(), + region_cpu_records: HashMap::default(), + is_hb_receiver_scheduled: false, + concurrency_manager, + causal_ts_provider, + logger, + shutdown, + } + } +} + +impl Runnable for Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + self.maybe_schedule_heartbeat_receiver(); + match task { + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), + Task::AskBatchSplit { + region, + split_keys, + peer, + right_derive, + } => self.handle_ask_batch_split(region, split_keys, peer, right_derive), + 
Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::UpdateMaxTimestamp { + region_id, + initial_status, + txn_ext, + } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + } + } +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } +} + +pub mod requests { + use kvproto::raft_cmdpb::{ + AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, + SplitRequest, + }; + use raft::eraftpb::ConfChangeType; + + use super::*; + + pub fn send_admin_request( + logger: &Logger, + router: &StoreRouter, + region_id: u64, + epoch: metapb::RegionEpoch, + peer: metapb::Peer, + request: AdminRequest, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let cmd_type = request.get_cmd_type(); + + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(epoch); + req.mut_header().set_peer(peer); + req.set_admin_request(request); + + let (msg, _) = PeerMsg::raft_command(req); + if let Err(e) = router.send(region_id, msg) { + error!( + logger, + "send request failed"; + "region_id" => region_id, "cmd_type" => ?cmd_type, "err" => ?e, + ); + } + } + + pub fn new_change_peer_request( + change_type: ConfChangeType, + peer: metapb::Peer, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeer); + req.mut_change_peer().set_change_type(change_type); + req.mut_change_peer().set_peer(peer); + req + } + + pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + let change_peer_reqs = changes + .into_iter() + .map(|mut c| { + let mut cp = 
ChangePeerRequest::default(); + cp.set_change_type(c.get_change_type()); + cp.set_peer(c.take_peer()); + cp + }) + .collect(); + let mut cp = ChangePeerV2Request::default(); + cp.set_changes(change_peer_reqs); + req.set_change_peer_v2(cp); + req + } + + pub fn new_split_region_request( + split_key: Vec, + new_region_id: u64, + peer_ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::Split); + req.mut_split().set_split_key(split_key); + req.mut_split().set_new_region_id(new_region_id); + req.mut_split().set_new_peer_ids(peer_ids); + req.mut_split().set_right_derive(right_derive); + req + } + + pub fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req + } + + pub fn new_transfer_leader_request( + peer: metapb::Peer, + peers: Vec, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::TransferLeader); + req.mut_transfer_leader().set_peer(peer); + req.mut_transfer_leader().set_peers(peers.into()); + req + } + + pub fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::PrepareMerge); + req.mut_prepare_merge() + .set_target(merge.get_target().to_owned()); + req + } +} diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs new file mode 100644 index 
00000000000..ad0293d0b6d --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs @@ -0,0 +1,256 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, + SplitRequest, + }, + raft_serverpb::RaftMessage, + replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, +}; +use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; +use raft::eraftpb::ConfChangeType; +use slog::{debug, error, info}; +use tikv_util::{store::QueryStats, time::UnixSecs}; + +use super::{requests::*, Runner}; + +pub struct RegionHeartbeatTask { + pub term: u64, + pub region: metapb::Region, + pub peer: metapb::Peer, + pub down_peers: Vec, + pub pending_peers: Vec, + pub written_bytes: u64, + pub written_keys: u64, + pub approximate_size: Option, + pub approximate_keys: Option, + pub wait_data_peers: Vec, + // TODO: RegionReplicationStatus +} + +#[derive(Default)] +pub struct PeerStat { + pub read_bytes: u64, + pub read_keys: u64, + pub query_stats: QueryStats, + // last_region_report_attributes records the state of the last region heartbeat + pub last_region_report_read_bytes: u64, + pub last_region_report_read_keys: u64, + pub last_region_report_query_stats: QueryStats, + pub last_region_report_written_bytes: u64, + pub last_region_report_written_keys: u64, + pub last_region_report_ts: UnixSecs, + // last_store_report_attributes records the state of the last store heartbeat + pub last_store_report_read_bytes: u64, + pub last_store_report_read_keys: u64, + pub last_store_report_query_stats: QueryStats, + pub approximate_keys: u64, + pub approximate_size: u64, +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_region_heartbeat(&mut self, task: RegionHeartbeatTask) { + // HACK! 
In order to keep the compatible of protos, we use 0 to identify + // the size uninitialized regions, and use 1 to identify the empty regions. + // + // See tikv/tikv#11114 for details. + let approximate_size = match task.approximate_size { + Some(0) => 1, + Some(v) => v, + None => 0, // size uninitialized + }; + let approximate_keys = task.approximate_keys.unwrap_or_default(); + let region_id = task.region.get_id(); + + let peer_stat = self + .region_peers + .entry(region_id) + .or_insert_with(PeerStat::default); + peer_stat.approximate_size = approximate_size; + peer_stat.approximate_keys = approximate_keys; + + let read_bytes_delta = peer_stat.read_bytes - peer_stat.last_region_report_read_bytes; + let read_keys_delta = peer_stat.read_keys - peer_stat.last_region_report_read_keys; + let written_bytes_delta = task.written_bytes - peer_stat.last_region_report_written_bytes; + let written_keys_delta = task.written_keys - peer_stat.last_region_report_written_keys; + let query_stats = peer_stat + .query_stats + .sub_query_stats(&peer_stat.last_region_report_query_stats); + let mut last_report_ts = peer_stat.last_region_report_ts; + if last_report_ts.is_zero() { + last_report_ts = self.start_ts; + } + peer_stat.last_region_report_written_bytes = task.written_bytes; + peer_stat.last_region_report_written_keys = task.written_keys; + peer_stat.last_region_report_read_bytes = peer_stat.read_bytes; + peer_stat.last_region_report_read_keys = peer_stat.read_keys; + peer_stat.last_region_report_query_stats = peer_stat.query_stats.clone(); + let unix_secs_now = UnixSecs::now(); + peer_stat.last_region_report_ts = unix_secs_now; + + // Calculate the CPU usage since the last region heartbeat. + let cpu_usage = { + // Take out the region CPU record. 
+ let cpu_time_duration = Duration::from_millis( + self.region_cpu_records.remove(®ion_id).unwrap_or(0) as u64, + ); + let interval_second = unix_secs_now.into_inner() - last_report_ts.into_inner(); + // Keep consistent with the calculation of cpu_usages in a store heartbeat. + // See components/tikv_util/src/metrics/threads_linux.rs for more details. + if interval_second > 0 { + ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) as u64 + } else { + 0 + } + }; + + let region_stat = RegionStat { + down_peers: task.down_peers, + pending_peers: task.pending_peers, + written_bytes: written_bytes_delta, + written_keys: written_keys_delta, + read_bytes: read_bytes_delta, + read_keys: read_keys_delta, + query_stats: query_stats.0, + approximate_size, + approximate_keys, + last_report_ts, + cpu_usage, + }; + self.store_stat + .region_bytes_written + .observe(region_stat.written_bytes as f64); + self.store_stat + .region_keys_written + .observe(region_stat.written_keys as f64); + self.store_stat + .region_bytes_read + .observe(region_stat.read_bytes as f64); + self.store_stat + .region_keys_read + .observe(region_stat.read_keys as f64); + + let resp = self.pd_client.region_heartbeat( + task.term, + task.region.clone(), + task.peer, + region_stat, + None, + ); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send heartbeat"; + "region_id" => task.region.get_id(), + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn maybe_schedule_heartbeat_receiver(&mut self) { + if self.is_hb_receiver_scheduled { + return; + } + let router = self.router.clone(); + let store_id = self.store_id; + let logger = self.logger.clone(); + + let fut = + self.pd_client + .handle_region_heartbeat_response(self.store_id, move |mut resp| { + let region_id = resp.get_region_id(); + let epoch = resp.take_region_epoch(); + let peer = resp.take_target_peer(); + + if resp.has_change_peer() { + 
PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer = resp.take_change_peer(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "change_type" => ?change_peer.get_change_type(), + "peer" => ?change_peer.get_peer() + ); + let req = new_change_peer_request( + change_peer.get_change_type(), + change_peer.take_peer(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_change_peer_v2() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer_v2 = resp.take_change_peer_v2(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "changes" => ?change_peer_v2.get_changes(), + ); + let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_transfer_leader() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["transfer leader"]) + .inc(); + + let mut transfer_leader = resp.take_transfer_leader(); + info!( + logger, + "try to transfer leader"; + "region_id" => region_id, + "from_peer" => ?peer, + "to_peer" => ?transfer_leader.get_peer(), + "to_peers" => ?transfer_leader.get_peers(), + ); + let req = new_transfer_leader_request( + transfer_leader.take_peer(), + transfer_leader.take_peers().into(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_split_region() { + // TODO + info!(logger, "pd asks for split but ignored"); + } else if resp.has_merge() { + // TODO + info!(logger, "pd asks for merge but ignored"); + } else { + PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); + } + }); + let logger = self.logger.clone(); + let f = async move { + match fut.await { + Ok(_) => { + info!( + logger, + "region heartbeat response handler exit"; + "store_id" => store_id, + ); + } + Err(e) => panic!("unexpected error: {:?}", e), + } + }; + 
self.remote.spawn(f); + self.is_hb_receiver_scheduled = true; + } +} diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs new file mode 100644 index 00000000000..3cb85f6698c --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -0,0 +1,99 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, +}; +use pd_client::PdClient; +use slog::{info, warn}; + +use super::{requests::*, Runner}; + +fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, +) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_ask_batch_split( + &mut self, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ) { + if split_keys.is_empty() { + info!(self.logger, "empty split key, skip ask batch split"; + "region_id" => region.get_id()); + return; + } + let resp = self + .pd_client + .ask_batch_split(region.clone(), split_keys.len()); + let router = self.router.clone(); + let logger = self.logger.clone(); + let f = async move { + match resp.await { + Ok(mut resp) => { + info!( + logger, + "try to batch split region"; + "region_id" => region.get_id(), + "new_region_ids" => ?resp.get_ids(), + "region" => ?region, + ); + + let req = 
new_batch_split_region_request( + split_keys, + resp.take_ids().into(), + right_derive, + ); + let region_id = region.get_id(); + let epoch = region.take_region_epoch(); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } + Err(e) => { + warn!( + logger, + "ask batch split failed"; + "region_id" => region.get_id(), + "err" => ?e, + ); + } + } + }; + self.remote.spawn(f); + } + + pub fn handle_report_batch_split(&mut self, regions: Vec) { + let resp = self.pd_client.report_batch_split(regions); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report split failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } +} diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs new file mode 100644 index 00000000000..1caa96a5225 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -0,0 +1,293 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::pdpb; +use pd_client::{ + metrics::{ + REGION_READ_BYTES_HISTOGRAM, REGION_READ_KEYS_HISTOGRAM, REGION_WRITTEN_BYTES_HISTOGRAM, + REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_GAUGE_VEC, + }, + PdClient, +}; +use prometheus::local::LocalHistogram; +use slog::{error, warn}; +use tikv_util::{metrics::RecordPairVec, store::QueryStats, time::UnixSecs, topn::TopN}; + +use super::Runner; + +const HOTSPOT_REPORT_CAPACITY: usize = 1000; + +fn hotspot_key_report_threshold() -> u64 { + const HOTSPOT_KEY_RATE_THRESHOLD: u64 = 128; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_KEY_RATE_THRESHOLD * 10 +} + +fn hotspot_byte_report_threshold() -> u64 { + const HOTSPOT_BYTE_RATE_THRESHOLD: u64 = 8 * 1024; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_BYTE_RATE_THRESHOLD * 10 +} + +fn hotspot_query_num_report_threshold() -> u64 { + const HOTSPOT_QUERY_RATE_THRESHOLD: u64 = 128; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_QUERY_RATE_THRESHOLD * 10 +} + +pub struct StoreStat { + pub engine_total_bytes_read: u64, + pub engine_total_keys_read: u64, + pub engine_total_query_num: QueryStats, + pub engine_last_total_bytes_read: u64, + pub engine_last_total_keys_read: u64, + pub engine_last_query_num: QueryStats, + pub last_report_ts: UnixSecs, + + pub region_bytes_read: LocalHistogram, + pub region_keys_read: LocalHistogram, + pub region_bytes_written: LocalHistogram, + pub region_keys_written: LocalHistogram, + + pub store_cpu_usages: RecordPairVec, + pub store_read_io_rates: RecordPairVec, + pub store_write_io_rates: RecordPairVec, +} + +impl Default for StoreStat { + fn default() -> StoreStat { + StoreStat { + region_bytes_read: REGION_READ_BYTES_HISTOGRAM.local(), + region_keys_read: REGION_READ_KEYS_HISTOGRAM.local(), + region_bytes_written: REGION_WRITTEN_BYTES_HISTOGRAM.local(), + region_keys_written: 
REGION_WRITTEN_KEYS_HISTOGRAM.local(), + + last_report_ts: UnixSecs::zero(), + engine_total_bytes_read: 0, + engine_total_keys_read: 0, + engine_last_total_bytes_read: 0, + engine_last_total_keys_read: 0, + engine_total_query_num: QueryStats::default(), + engine_last_query_num: QueryStats::default(), + + store_cpu_usages: RecordPairVec::default(), + store_read_io_rates: RecordPairVec::default(), + store_write_io_rates: RecordPairVec::default(), + } + } +} + +#[derive(Default, Clone)] +struct PeerCmpReadStat { + pub region_id: u64, + pub report_stat: u64, +} + +impl Ord for PeerCmpReadStat { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.report_stat.cmp(&other.report_stat) + } +} + +impl Eq for PeerCmpReadStat {} + +impl PartialEq for PeerCmpReadStat { + fn eq(&self, other: &Self) -> bool { + self.report_stat == other.report_stat + } +} + +impl PartialOrd for PeerCmpReadStat { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.report_stat.cmp(&other.report_stat)) + } +} + +fn collect_report_read_peer_stats( + capacity: usize, + mut report_read_stats: HashMap, + mut stats: pdpb::StoreStats, +) -> pdpb::StoreStats { + if report_read_stats.len() < capacity * 3 { + for (_, read_stat) in report_read_stats { + stats.peer_stats.push(read_stat); + } + return stats; + } + let mut keys_topn_report = TopN::new(capacity); + let mut bytes_topn_report = TopN::new(capacity); + let mut stats_topn_report = TopN::new(capacity); + for read_stat in report_read_stats.values() { + let mut cmp_stat = PeerCmpReadStat::default(); + cmp_stat.region_id = read_stat.region_id; + let mut key_cmp_stat = cmp_stat.clone(); + key_cmp_stat.report_stat = read_stat.read_keys; + keys_topn_report.push(key_cmp_stat); + let mut byte_cmp_stat = cmp_stat.clone(); + byte_cmp_stat.report_stat = read_stat.read_bytes; + bytes_topn_report.push(byte_cmp_stat); + let mut query_cmp_stat = cmp_stat.clone(); + query_cmp_stat.report_stat = get_read_query_num(read_stat.get_query_stats()); + 
stats_topn_report.push(query_cmp_stat); + } + + for x in keys_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in bytes_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in stats_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + stats +} + +fn get_read_query_num(stat: &pdpb::QueryStats) -> u64 { + stat.get_get() + stat.get_coprocessor() + stat.get_scan() +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_store_heartbeat(&mut self, mut stats: pdpb::StoreStats) { + let mut report_peers = HashMap::default(); + for (region_id, region_peer) in &mut self.region_peers { + let read_bytes = region_peer.read_bytes - region_peer.last_store_report_read_bytes; + let read_keys = region_peer.read_keys - region_peer.last_store_report_read_keys; + let query_stats = region_peer + .query_stats + .sub_query_stats(®ion_peer.last_store_report_query_stats); + region_peer.last_store_report_read_bytes = region_peer.read_bytes; + region_peer.last_store_report_read_keys = region_peer.read_keys; + region_peer + .last_store_report_query_stats + .fill_query_stats(®ion_peer.query_stats); + if read_bytes < hotspot_byte_report_threshold() + && read_keys < hotspot_key_report_threshold() + && query_stats.get_read_query_num() < hotspot_query_num_report_threshold() + { + continue; + } + let mut read_stat = pdpb::PeerStat::default(); + read_stat.set_region_id(*region_id); + read_stat.set_read_keys(read_keys); + read_stat.set_read_bytes(read_bytes); + read_stat.set_query_stats(query_stats.0); + report_peers.insert(*region_id, read_stat); + } + + stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); + let (capacity, used_size, available) = 
self.collect_engine_size().unwrap_or_default(); + if available == 0 { + warn!(self.logger, "no available space"); + } + + stats.set_capacity(capacity); + stats.set_used_size(used_size); + stats.set_available(available); + stats.set_bytes_read( + self.store_stat.engine_total_bytes_read - self.store_stat.engine_last_total_bytes_read, + ); + stats.set_keys_read( + self.store_stat.engine_total_keys_read - self.store_stat.engine_last_total_keys_read, + ); + + self.store_stat + .engine_total_query_num + .add_query_stats(stats.get_query_stats()); // add write query stat + let res = self + .store_stat + .engine_total_query_num + .sub_query_stats(&self.store_stat.engine_last_query_num); + stats.set_query_stats(res.0); + + stats.set_cpu_usages(self.store_stat.store_cpu_usages.clone().into()); + stats.set_read_io_rates(self.store_stat.store_read_io_rates.clone().into()); + stats.set_write_io_rates(self.store_stat.store_write_io_rates.clone().into()); + + let mut interval = pdpb::TimeInterval::default(); + interval.set_start_timestamp(self.store_stat.last_report_ts.into_inner()); + stats.set_interval(interval); + self.store_stat.engine_last_total_bytes_read = self.store_stat.engine_total_bytes_read; + self.store_stat.engine_last_total_keys_read = self.store_stat.engine_total_keys_read; + self.store_stat + .engine_last_query_num + .fill_query_stats(&self.store_stat.engine_total_query_num); + self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.region_bytes_written.flush(); + self.store_stat.region_keys_written.flush(); + self.store_stat.region_bytes_read.flush(); + self.store_stat.region_keys_read.flush(); + + STORE_SIZE_GAUGE_VEC + .with_label_values(&["capacity"]) + .set(capacity as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["available"]) + .set(available as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["used"]) + .set(used_size as i64); + + // TODO: slow score + + let router = self.router.clone(); + let resp = 
self.pd_client.store_heartbeat(stats, None, None); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + error!(logger, "store heartbeat failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } + + /// Returns (capacity, used, available). + fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { + let disk_stats = match fs2::statvfs(self.tablet_factory.tablets_path()) { + Err(e) => { + error!( + self.logger, + "get disk stat for rocksdb failed"; + "engine_path" => self.tablet_factory.tablets_path().display(), + "err" => ?e + ); + return None; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + // TODO: custom capacity. + let capacity = disk_cap; + // TODO: accurate snapshot size and kv engines size. + let snap_size = 0; + let kv_size = 0; + let used_size = snap_size + + kv_size + + self + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + // We only care about rocksdb SST file size, so we should check disk available + // here. + available = cmp::min(available, disk_stats.available_space()); + Some((capacity, used_size, available)) + } +} diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs new file mode 100644 index 00000000000..cbfecb8171d --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -0,0 +1,114 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{atomic::Ordering, Arc}, + time::{Duration, Instant}, +}; + +use causal_ts::CausalTsProvider; +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use futures::{compat::Future01CompatExt, FutureExt}; +use pd_client::PdClient; +use raftstore::{store::TxnExt, Result}; +use slog::{info, warn}; +use tikv_util::{box_err, timer::GLOBAL_TIMER_HANDLE}; +use txn_types::TimeStamp; + +use super::Runner; + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_update_max_timestamp( + &mut self, + region_id: u64, + initial_status: u64, + txn_ext: Arc, + ) { + let pd_client = self.pd_client.clone(); + let concurrency_manager = self.concurrency_manager.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + let logger = self.logger.clone(); + let shutdown = self.shutdown.clone(); + + let f = async move { + let mut success = false; + while txn_ext.max_ts_sync_status.load(Ordering::SeqCst) == initial_status + && !shutdown.load(Ordering::Relaxed) + { + // On leader transfer / region merge, RawKV API v2 need to + // invoke causal_ts_provider.flush() to renew + // cached TSO, to ensure that the next TSO + // returned by causal_ts_provider.get_ts() on current + // store must be larger than the store where the leader is on + // before. + // + // And it won't break correctness of transaction commands, as + // causal_ts_provider.flush() is implemented as + // pd_client.get_tso() + renew TSO cached. 
+ let res: Result = if let Some(causal_ts_provider) = &causal_ts_provider { + causal_ts_provider + .async_flush() + .await + .map_err(|e| box_err!(e)) + } else { + pd_client.get_tso().await.map_err(Into::into) + }; + + match res { + Ok(ts) => { + concurrency_manager.update_max_ts(ts); + success = txn_ext + .max_ts_sync_status + .compare_exchange( + initial_status, + initial_status | 1, + Ordering::SeqCst, + Ordering::SeqCst, + ) + .is_ok(); + break; + } + Err(e) => { + warn!( + logger, + "failed to update max timestamp for region {}: {:?}", region_id, e + ); + } + } + } + + if success { + info!(logger, "succeed to update max timestamp"; "region_id" => region_id); + } else { + info!( + logger, + "updating max timestamp is stale"; + "region_id" => region_id, + "initial_status" => initial_status, + ); + } + }; + + #[cfg(feature = "failpoints")] + let delay = (|| { + fail_point!("delay_update_max_ts", |_| true); + false + })(); + #[cfg(not(feature = "failpoints"))] + let delay = false; + + if delay { + info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); + let deadline = Instant::now() + Duration::from_secs(1); + self.remote + .spawn(GLOBAL_TIMER_HANDLE.delay(deadline).compat().then(|_| f)); + } else { + self.remote.spawn(f); + } + } +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index d99c982fc97..ef1f7411ac9 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -11,7 +11,9 @@ use std::{ time::{Duration, Instant}, }; +use causal_ts::CausalTsProviderImpl; use collections::HashSet; +use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, @@ -189,10 +191,12 @@ pub struct RunningState { impl RunningState { fn new( - pd_client: &RpcClient, + pd_client: &Arc, path: &Path, cfg: Arc>, 
transport: TestTransport, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, logger: &Logger, ) -> (TestRouter, Self) { let cf_opts = ALL_CFS @@ -208,7 +212,7 @@ impl RunningState { let raft_engine = engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); - let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client, logger.clone()); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); let store_id = bootstrap.bootstrap_store().unwrap(); let mut store = Store::default(); store.set_id(store_id); @@ -243,9 +247,12 @@ impl RunningState { raft_engine.clone(), factory.clone(), transport.clone(), + pd_client.clone(), router.store_router(), store_meta.clone(), snap_mgr, + concurrency_manager, + causal_ts_provider, ) .unwrap(); @@ -269,7 +276,7 @@ impl Drop for RunningState { } pub struct TestNode { - pd_client: RpcClient, + pd_client: Arc, path: TempDir, running_state: Option, logger: Logger, @@ -277,7 +284,7 @@ pub struct TestNode { impl TestNode { fn with_pd(pd_server: &test_pd::Server, logger: Logger) -> TestNode { - let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); + let pd_client = Arc::new(test_pd::util::new_client(pd_server.bind_addrs(), None)); let path = TempDir::new().unwrap(); TestNode { @@ -289,8 +296,15 @@ impl TestNode { } fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, state) = - RunningState::new(&self.pd_client, self.path.path(), cfg, trans, &self.logger); + let (router, state) = RunningState::new( + &self.pd_client, + self.path.path(), + cfg, + trans, + ConcurrencyManager::new(1.into()), + None, + &self.logger, + ); self.running_state = Some(state); router } @@ -299,6 +313,10 @@ impl TestNode { &self.running_state().unwrap().factory } + pub fn pd_client(&self) -> &Arc { + &self.pd_client + } + fn stop(&mut self) { if let Some(state) = std::mem::take(&mut self.running_state) { let mut meta = 
state.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 4fb9ebcc323..52c8ba5e1f8 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -11,6 +11,7 @@ mod cluster; mod test_basic_write; mod test_conf_change; mod test_life; +mod test_pd_heartbeat; mod test_read; mod test_split; mod test_status; diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs new file mode 100644 index 00000000000..c22ef4908bf --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -0,0 +1,60 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use futures::executor::block_on; +use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; +use pd_client::PdClient; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +#[test] +fn test_region_heartbeat() { + let region_id = 2; + let cluster = Cluster::with_node_count(1, None); + let router = cluster.router(0); + + // When there is only one peer, it should campaign immediately. 
+ let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(1, 3) + ); + + for _ in 0..5 { + let resp = block_on( + cluster + .node(0) + .pd_client() + .get_region_leader_by_id(region_id), + ) + .unwrap(); + if let Some((region, peer)) = resp { + assert_eq!(region.get_id(), region_id); + assert_eq!(peer.get_id(), 3); + assert_eq!(peer.get_store_id(), 1); + return; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + panic!("failed to get region leader"); +} + +#[test] +fn test_store_heartbeat() { + let cluster = Cluster::with_node_count(1, None); + let store_id = cluster.node(0).id(); + for _ in 0..5 { + let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); + if stats.get_start_time() > 0 { + assert_ne!(stats.get_capacity(), 0); + return; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + panic!("failed to get store stats"); +} diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 97487a5d0c2..336a9c9d038 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -39,7 +39,7 @@ fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { router.send(region_id, msg).unwrap(); block_on(sub.result()).unwrap(); - // todo: when persistent implementation is ready, we can use tablet index of + // TODO: when persistent implementation is ready, we can use tablet index of // the parent to check whether the split is done. Now, just sleep a second. 
thread::sleep(Duration::from_secs(1)); } diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 2ff5c178c67..45dd6e5661d 100644 --- a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -19,7 +19,7 @@ pub struct Service { id_allocator: AtomicUsize, members_resp: Mutex>, is_bootstrapped: AtomicBool, - stores: Mutex>, + stores: Mutex>, regions: Mutex>, leaders: Mutex>, feature_gate: Mutex, @@ -47,7 +47,10 @@ impl Service { /// Add an arbitrary store. pub fn add_store(&self, store: Store) { let store_id = store.get_id(); - self.stores.lock().unwrap().insert(store_id, store); + self.stores + .lock() + .unwrap() + .insert(store_id, (store, StoreStats::new())); } pub fn set_cluster_version(&self, version: String) { @@ -107,7 +110,7 @@ impl PdMocker for Service { self.stores .lock() .unwrap() - .insert(store.get_id(), store.clone()); + .insert(store.get_id(), (store.clone(), StoreStats::new())); self.regions .lock() .unwrap() @@ -138,9 +141,10 @@ impl PdMocker for Service { let mut resp = GetStoreResponse::default(); let stores = self.stores.lock().unwrap(); match stores.get(&req.get_store_id()) { - Some(store) => { + Some((store, stats)) => { resp.set_header(Service::header()); resp.set_store(store.clone()); + resp.set_stats(stats.clone()); Some(Ok(resp)) } None => { @@ -160,7 +164,7 @@ impl PdMocker for Service { resp.set_header(Service::header()); let exclude_tombstone = req.get_exclude_tombstone_stores(); let stores = self.stores.lock().unwrap(); - for store in stores.values() { + for (store, _) in stores.values() { if exclude_tombstone && store.get_state() == StoreState::Tombstone { continue; } @@ -244,11 +248,22 @@ impl PdMocker for Service { Some(Ok(resp)) } - fn store_heartbeat(&self, _: &StoreHeartbeatRequest) -> Option> { + fn store_heartbeat( + &self, + req: &StoreHeartbeatRequest, + ) -> Option> { let mut resp = StoreHeartbeatResponse::default(); let header = 
Service::header(); resp.set_header(header); resp.set_cluster_version(self.feature_gate.lock().unwrap().to_owned()); + if let Some((_, stats)) = self + .stores + .lock() + .unwrap() + .get_mut(&req.get_stats().get_store_id()) + { + *stats = req.get_stats().clone(); + } Some(Ok(resp)) } From dc347f98ae70e8f829a55d4df1f002b4ad17e72d Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Thu, 24 Nov 2022 12:43:58 +0800 Subject: [PATCH 359/676] scheduler: Support keeping some latches for another command when releasing latches. (#13833) ref tikv/tikv#13298, ref tikv/tikv#13826 Makes latches support keeping some of the latch slots when releasing, and these kept latch slots can be derived by another scheduler command. This is necessary for supporting resuming pessimistic lock requests when releasing pessimistic locks. It provides a mechanism to avoid the latch from being acquired by other later-arriving requests at the time between the lock-releasing's end and the resumed pessimistic lock command's beginning. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- src/storage/txn/latch.rs | 230 +++++++++++++++++++++++++++++++++-- src/storage/txn/scheduler.rs | 4 +- 2 files changed, 224 insertions(+), 10 deletions(-) diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index 12cc51207bb..a662d9bab79 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -79,6 +79,11 @@ impl Latch { self.waiting.push_back(Some((key_hash, cid))); } + /// Pushes the cid to the front of the queue. Be careful when using it. + fn push_preemptive(&mut self, key_hash: u64, cid: u64) { + self.waiting.push_front(Some((key_hash, cid))); + } + /// For some hot keys, the waiting list maybe very long, so we should shrink /// the waiting VecDeque after pop. fn maybe_shrink(&mut self) { @@ -137,6 +142,12 @@ impl Lock { self.required_hashes.len() == self.owned_count } + /// Force set the state of the `Lock` to be already-acquired. 
Be careful + /// when using it. + pub fn force_assume_acquired(&mut self) { + self.owned_count = self.required_hashes.len(); + } + pub fn is_write_lock(&self) -> bool { !self.required_hashes.is_empty() } @@ -196,19 +207,62 @@ impl Latches { /// Releases all latches owned by the `lock` of command with ID `who`, /// returns the wakeup list. /// + /// Optionally, this function can release partial of the given `Lock` while + /// leaving the renaming unlocked, so that some of the latches can be + /// used in another command. This can be done by passing the cid of the + /// command who will use the kept latch slots later, and the `Lock` that + /// need to be kept via the parameter `keep_latches_for_next_cmd`. Note + /// that the lock in it is assumed to be a subset of the parameter + /// `lock` which is going to be released. + /// /// Preconditions: the caller must ensure the command is at the front of the /// latches. - pub fn release(&self, lock: &Lock, who: u64) -> Vec { + pub fn release( + &self, + lock: &Lock, + who: u64, + keep_latches_for_next_cmd: Option<(u64, &Lock)>, + ) -> Vec { + // Used to + let dummy_vec = vec![]; + let (keep_latches_for_cid, mut keep_latches_it) = match keep_latches_for_next_cmd { + Some((cid, lock)) => (Some(cid), lock.required_hashes.iter().peekable()), + None => (None, dummy_vec.iter().peekable()), + }; + + // `keep_latches_it` must be sorted and deduped since it's retrieved from a + // `Lock` object. 
+ let mut wakeup_list: Vec = vec![]; for &key_hash in &lock.required_hashes[..lock.owned_count] { let mut latch = self.lock_latch(key_hash); let (v, front) = latch.pop_front(key_hash).unwrap(); assert_eq!(front, who); assert_eq!(v, key_hash); - if let Some(wakeup) = latch.get_first_req_by_hash(key_hash) { - wakeup_list.push(wakeup); + + let keep_for_next_cmd = if let Some(&&next_keep_hash) = keep_latches_it.peek() { + assert!(next_keep_hash >= key_hash); + if next_keep_hash == key_hash { + keep_latches_it.next(); + true + } else { + false + } + } else { + false + }; + + if !keep_for_next_cmd { + if let Some(wakeup) = latch.get_first_req_by_hash(key_hash) { + wakeup_list.push(wakeup); + } + } else { + latch.push_preemptive(key_hash, keep_latches_for_cid.unwrap()); } } + + assert!(keep_latches_it.next().is_none()); + wakeup_list } @@ -220,6 +274,8 @@ impl Latches { #[cfg(test)] mod tests { + use std::iter::once; + use super::*; #[test] @@ -242,7 +298,7 @@ mod tests { assert_eq!(acquired_b, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_b); // b acquire lock success @@ -277,7 +333,7 @@ mod tests { assert_eq!(acquired_c, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_c); // c acquire lock failed again, cause b occupied slot 4 @@ -285,7 +341,7 @@ mod tests { assert_eq!(acquired_c, false); // b release lock, and get wakeup list - let wakeup = latches.release(&lock_b, cid_b); + let wakeup = latches.release(&lock_b, cid_b, None); assert_eq!(wakeup[0], cid_c); // finally c acquire lock success @@ -326,7 +382,7 @@ mod tests { assert_eq!(acquired_d, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_c); 
// c acquire lock success @@ -334,11 +390,169 @@ mod tests { assert_eq!(acquired_c, true); // b release lock, and get wakeup list - let wakeup = latches.release(&lock_b, cid_b); + let wakeup = latches.release(&lock_b, cid_b, None); assert_eq!(wakeup[0], cid_d); // finally d acquire lock success acquired_d = latches.acquire(&mut lock_d, cid_d); assert_eq!(acquired_d, true); } + + fn check_latch_holder(latches: &Latches, key: &[u8], expected_holder_cid: Option) { + let hash = Lock::hash(&key); + let actual_holder = latches.lock_latch(hash).get_first_req_by_hash(hash); + assert_eq!(actual_holder, expected_holder_cid); + } + + fn is_latches_empty(latches: &Latches) -> bool { + for i in 0..(latches.size as u64) { + if !latches.lock_latch(i).waiting.iter().all(|x| x.is_none()) { + return false; + } + } + true + } + + fn test_partially_releasing_impl(size: usize) { + let latches = Latches::new(size); + + // Single key. + let key = b"k1"; + let mut lock = Lock::new(once(key)); + assert!(latches.acquire(&mut lock, 1)); + assert!(!is_latches_empty(&latches)); + let mut lock2 = Lock::new(once(key)); + let wakeup = latches.release(&lock, 1, Some((2, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, key, Some(2)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 2, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Single key with queueing commands. 
+ let mut lock = Lock::new(once(key)); + let mut queueing_lock = Lock::new(once(key)); + assert!(latches.acquire(&mut lock, 3)); + assert!(!latches.acquire(&mut queueing_lock, 4)); + let mut lock2 = Lock::new(once(key)); + let wakeup = latches.release(&lock, 3, Some((5, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, key, Some(5)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 5, None); + assert_eq!(wakeup, vec![4u64]); + assert!(latches.acquire(&mut queueing_lock, 4)); + let wakeup = latches.release(&queueing_lock, 4, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Multi keys, keep all. + let keys = vec![b"k1", b"k2", b"k3", b"k4"]; + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 11)); + let mut lock2 = Lock::new(keys.iter()); + let wakeup = latches.release(&lock, 11, Some((12, &lock2))); + assert!(wakeup.is_empty()); + for &key in &keys { + check_latch_holder(&latches, key, Some(12)); + } + assert!(!is_latches_empty(&latches)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 12, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Multi keys, keep all, with queueing command. + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 11)); + let mut queueing_locks: Vec<_> = keys.iter().map(|k| Lock::new(once(k))).collect(); + for (cid, lock) in (12..16).zip(queueing_locks.iter_mut()) { + assert!(!latches.acquire(lock, cid)); + } + let mut lock2 = Lock::new(keys.iter()); + let wakeup = latches.release(&lock, 11, Some((17, &lock2))); + assert!(wakeup.is_empty()); + for &key in &keys { + check_latch_holder(&latches, key, Some(17)); + } + assert!(!is_latches_empty(&latches)); + lock2.force_assume_acquired(); + let mut wakeup = latches.release(&lock2, 17, None); + wakeup.sort_unstable(); + // Wake up queueing commands. 
+ assert_eq!(wakeup, vec![12u64, 13, 14, 15]); + for (cid, mut lock) in (12..16).zip(queueing_locks) { + assert!(latches.acquire(&mut lock, cid)); + let wakeup = latches.release(&lock, cid, None); + assert!(wakeup.is_empty()); + } + assert!(is_latches_empty(&latches)); + + // 4 keys, keep 2 of them. + for (i1, &k1) in keys[0..3].iter().enumerate() { + for &k2 in keys[i1 + 1..4].iter() { + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 21)); + let mut lock2 = Lock::new(vec![k1, k2]); + let wakeup = latches.release(&lock, 21, Some((22, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, k1, Some(22)); + check_latch_holder(&latches, k2, Some(22)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 22, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + } + } + + // 4 keys keep 2 of them, with queueing commands. + for (i1, &k1) in keys[0..3].iter().enumerate() { + for (i2, &k2) in keys[i1 + 1..4].iter().enumerate() { + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 21)); + + let mut queueing_locks: Vec<_> = keys.iter().map(|k| Lock::new(once(k))).collect(); + for (cid, lock) in (22..26).zip(queueing_locks.iter_mut()) { + assert!(!latches.acquire(lock, cid)); + } + + let mut lock2 = Lock::new(vec![k1, k2]); + let mut wakeup = latches.release(&lock, 21, Some((27, &lock2))); + assert_eq!(wakeup.len(), 2); + + // The latch of k1 and k2 is preempted, and queueing locks on the other two keys + // will be woken up. 
+ let preempted_cids = vec![(i1 + 22) as u64, (i1 + 1 + i2 + 22) as u64]; + let expected_wakeup_cids: Vec<_> = (22..26u64) + .filter(|x| !preempted_cids.contains(x)) + .collect(); + wakeup.sort_unstable(); + assert_eq!(wakeup, expected_wakeup_cids); + + check_latch_holder(&latches, k1, Some(27)); + check_latch_holder(&latches, k2, Some(27)); + + lock2.force_assume_acquired(); + let mut wakeup = latches.release(&lock2, 27, None); + wakeup.sort_unstable(); + assert_eq!(wakeup, preempted_cids); + + for (cid, mut lock) in (22..26).zip(queueing_locks) { + assert!(latches.acquire(&mut lock, cid)); + let wakeup = latches.release(&lock, cid, None); + assert!(wakeup.is_empty()); + } + + assert!(is_latches_empty(&latches)); + } + } + } + + #[test] + fn test_partially_releasing() { + test_partially_releasing_impl(256); + test_partially_releasing_impl(4); + test_partially_releasing_impl(2); + } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 24ef7466e63..bc1598d65fa 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -449,7 +449,7 @@ impl Scheduler { /// Releases all the latches held by a command. 
fn release_lock(&self, lock: &Lock, cid: u64) { - let wakeup_list = self.inner.latches.release(lock, cid); + let wakeup_list = self.inner.latches.release(lock, cid, None); for wcid in wakeup_list { self.try_to_wake_up(wcid); } @@ -1660,7 +1660,7 @@ mod tests { if id != 0 { assert!(latches.acquire(&mut lock, id)); } - let unlocked = latches.release(&lock, id); + let unlocked = latches.release(&lock, id, None); if id == max_id { assert!(unlocked.is_empty()); } else { From f7ba20232fa232329e55eac00898042bf0560721 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 25 Nov 2022 10:21:59 +0800 Subject: [PATCH 360/676] reader: fix the invisible write record flashback bug (#13845) close tikv/tikv#13844 Fix the bug that when a key's last write record is `WriteType::Lock` or `WriteType::Rollback`, the `scan_writes` will return the invisible write record as the result, which will affect the flashback correctness. Signed-off-by: JmPotato --- src/storage/mvcc/reader/reader.rs | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index dd6bff6a157..c7cb9194068 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -638,13 +638,18 @@ impl MvccReader { break; } WriteType::Lock | WriteType::Rollback => { - // We should find the latest visible version after it. + // Only return the PUT/DELETE write record. + write = None; + // Reach the end. + if !cursor.valid()? { + break; + } + // Try to find the latest visible version before it. let key = Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); // Could not find the visible version, current cursor is on the next - // key, so we set both `write` and `cur_key` to `None`. + // key, so we set `cur_key` to `None`. if key.truncate_ts()? 
!= user_key { - write = None; cur_key = None; break; } @@ -1836,6 +1841,13 @@ pub mod tests { 8, ); engine.commit(b"k3", 8, 9); + // Prewrite and rollback k4. + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k4"), b"v4@1".to_vec()), + b"k4", + 10, + ); + engine.rollback(b"k4", 10); // Current MVCC keys in `CF_WRITE` should be: // PUT k0 -> v0@999 @@ -1847,6 +1859,7 @@ pub mod tests { // PUT k3 -> v3@8 // ROLLBACK k3 -> v3@7 // PUT k3 -> v3@5 + // ROLLBACK k4 -> v4@1 struct Case { start_key: Option, @@ -2083,15 +2096,25 @@ pub mod tests { start_key: None, end_key: None, version: Some(0), - limit: 5, + limit: 6, expect_res: vec![ (Key::from_raw(b"k0"), None), (Key::from_raw(b"k1"), None), (Key::from_raw(b"k2"), None), (Key::from_raw(b"k3"), None), + (Key::from_raw(b"k4"), None), ], expect_is_remain: false, }, + // Test the invisible record. + Case { + start_key: Some(Key::from_raw(b"k4")), + end_key: None, + version: Some(10), + limit: 1, + expect_res: vec![(Key::from_raw(b"k4"), None)], + expect_is_remain: true, + }, ]; for (idx, case) in cases.iter().enumerate() { From 998cb30d4d6087a40454a5787b56f0a151ae24e0 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 25 Nov 2022 14:47:59 +0800 Subject: [PATCH 361/676] raftstore-v2: support apply snapshot (#13734) ref tikv/tikv#12842 aftstore-v2: support apply snapshot This PR implementation apply snapshot relative changes. 
- add snap worker for multi-rocksdb Signed-off-by: bufferflies <1045931706@qq.com> --- components/engine_test/src/lib.rs | 7 +- components/raftstore-v2/src/batch/store.rs | 12 ++- .../src/operation/ready/async_writer.rs | 11 ++- .../raftstore-v2/src/operation/ready/mod.rs | 44 +++++++-- .../src/operation/ready/snapshot.rs | 97 ++++++++++++++++++- components/raftstore-v2/src/raft/storage.rs | 66 ++++++++++++- .../tests/integrations/cluster.rs | 47 +++++++-- .../tests/integrations/test_conf_change.rs | 40 +++++++- .../raftstore/src/store/async_io/read.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 21 +++- .../raftstore/src/store/entry_storage.rs | 10 ++ components/raftstore/src/store/snap.rs | 9 +- src/server/tablet_snap.rs | 10 +- 13 files changed, 331 insertions(+), 45 deletions(-) diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index ae834457757..77bd2d3be7c 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -128,12 +128,17 @@ pub mod kv { db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, ) -> Self { - Self { + let factory = Self { root_path: root_path.to_path_buf(), db_opt, cf_opts, root_db: Arc::new(Mutex::default()), + }; + let tablet_path = factory.tablets_path(); + if !tablet_path.exists() { + std::fs::create_dir_all(tablet_path).unwrap(); } + factory } fn create_tablet(&self, tablet_path: &Path) -> Result { diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1eea2017571..199e8cafbd8 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -77,6 +77,7 @@ pub struct StoreContext { pub tablet_factory: Arc>, pub apply_pool: FuturePool, pub read_scheduler: Scheduler>, + pub snap_mgr: TabletSnapManager, pub pd_scheduler: Scheduler, } @@ -230,6 +231,7 @@ struct StorePollerBuilder { apply_pool: FuturePool, logger: Logger, store_meta: Arc>>, + snap_mgr: 
TabletSnapManager, } impl StorePollerBuilder { @@ -245,6 +247,7 @@ impl StorePollerBuilder { store_writers: &mut StoreWriters, logger: Logger, store_meta: Arc>>, + snap_mgr: TabletSnapManager, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -269,6 +272,7 @@ impl StorePollerBuilder { logger, write_senders: store_writers.senders(), store_meta, + snap_mgr, } } @@ -341,6 +345,7 @@ where tablet_factory: self.tablet_factory.clone(), apply_pool: self.apply_pool.clone(), read_scheduler: self.read_scheduler.clone(), + snap_mgr: self.snap_mgr.clone(), pd_scheduler: self.pd_scheduler.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); @@ -408,7 +413,7 @@ impl StoreSystem { .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); - read_runner.set_snap_mgr(snap_mgr); + read_runner.set_snap_mgr(snap_mgr.clone()); let read_scheduler = workers .async_read_worker .start("async-read-worker", read_runner); @@ -441,6 +446,7 @@ impl StoreSystem { &mut workers.store_writers, self.logger.clone(), store_meta.clone(), + snap_mgr, ); self.workers = Some(workers); let peers = builder.init()?; @@ -512,7 +518,7 @@ impl StoreRouter { ) -> std::result::Result<(), TrySendError>> { let id = msg.get_region_id(); let peer_msg = PeerMsg::RaftMessage(msg); - let store_msg = match self.try_send(id, peer_msg) { + let store_msg = match self.router.try_send(id, peer_msg) { Either::Left(Ok(())) => return Ok(()), Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { return Err(TrySendError::Full(m)); @@ -523,7 +529,7 @@ impl StoreRouter { Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), _ => unreachable!(), }; - match self.send_control(store_msg) { + match self.router.send_control(store_msg) { Ok(()) => Ok(()), Err(TrySendError::Full(StoreMsg::RaftMessage(m))) => Err(TrySendError::Full(m)), 
Err(TrySendError::Disconnected(StoreMsg::RaftMessage(m))) => { diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index d5673d76a40..a7bce44fe05 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -22,6 +22,7 @@ struct UnpersistedReady { /// Max number of following ready whose data to be persisted is empty. max_empty_number: u64, raft_msgs: Vec>, + has_snapshot: bool, } /// A writer that handles asynchronous writes. @@ -70,6 +71,7 @@ impl AsyncWriter { fn send(&mut self, ctx: &mut impl WriteRouterContext, task: WriteTask) { let ready_number = task.ready_number(); + let has_snapshot = task.has_snapshot; self.write_router.send_write_msg( ctx, self.unpersisted_readies.back().map(|r| r.number), @@ -79,6 +81,7 @@ impl AsyncWriter { number: ready_number, max_empty_number: ready_number, raft_msgs: vec![], + has_snapshot, }); } @@ -108,9 +111,9 @@ impl AsyncWriter { ctx: &mut impl WriteRouterContext, ready_number: u64, logger: &Logger, - ) -> Vec> { + ) -> (Vec>, bool) { if self.persisted_number >= ready_number { - return vec![]; + return (vec![], false); } let last_unpersisted = self.unpersisted_readies.back(); @@ -124,11 +127,13 @@ impl AsyncWriter { } let mut raft_messages = vec![]; + let mut has_snapshot = false; // There must be a match in `self.unpersisted_readies`. 
loop { let Some(v) = self.unpersisted_readies.pop_front() else { panic!("{:?} ready number not found {}", logger.list(), ready_number); }; + has_snapshot |= v.has_snapshot; if v.number > ready_number { panic!( "{:?} ready number not matched {:?} vs {}", @@ -151,7 +156,7 @@ impl AsyncWriter { self.write_router .check_new_persisted(ctx, self.persisted_number); - raft_messages + (raft_messages, has_snapshot) } pub fn persisted_number(&self) -> u64 { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 9e639f233cc..1c8c9d80338 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -22,9 +22,12 @@ mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, MiscExt, OpenOptions, RaftEngine, TabletFactory}; use error_code::ErrorCodeExt; -use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; +use kvproto::{ + raft_cmdpb::AdminCmdType, + raft_serverpb::{PeerState, RaftMessage, RaftSnapshotData}, +}; use protobuf::Message as _; use raft::{eraftpb, Ready, StateRole, INVALID_ID}; use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; @@ -40,6 +43,7 @@ use crate::{ fsm::PeerFsmDelegate, raft::{Peer, Storage}, router::{ApplyTask, PeerTick}, + Result, }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -334,7 +338,7 @@ impl Peer { let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); self.storage_mut() - .handle_raft_ready(&mut ready, &mut write_task); + .handle_raft_ready(ctx, &mut ready, &mut write_task); if !ready.persisted_messages().is_empty() { write_task.messages = ready .take_persisted_messages() @@ -388,17 +392,27 @@ impl Peer { error!(self.logger, "peer id not matched"; "persisted_peer_id" => peer_id, "persisted_number" 
=> ready_number); return; } - let persisted_message = self - .async_writer - .on_persisted(ctx, ready_number, &self.logger); + let (persisted_message, has_snapshot) = + self.async_writer + .on_persisted(ctx, ready_number, &self.logger); for msgs in persisted_message { for msg in msgs { self.send_raft_message(ctx, msg); } } + let persisted_number = self.async_writer.persisted_number(); self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.raft_group().raft.raft_log.persisted; + /// The apply snapshot process order would be: + /// - Get the snapshot from the ready + /// - Wait for async writer to load this tablet + /// In this step, the snapshot has loaded finish, but some apply state + /// need to update. + if has_snapshot { + self.on_applied_snapshot(ctx); + } + self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); @@ -509,11 +523,25 @@ impl Peer { impl Storage { /// Apply the ready to the storage. If there is any states need to be /// persisted, it will be written to `write_task`. 
- fn handle_raft_ready(&mut self, ready: &mut Ready, write_task: &mut WriteTask) { + fn handle_raft_ready( + &mut self, + ctx: &mut StoreContext, + ready: &mut Ready, + write_task: &mut WriteTask, + ) { let prev_raft_state = self.entry_storage().raft_state().clone(); let ever_persisted = self.ever_persisted(); - // TODO: handle snapshot + if !ready.snapshot().is_empty() { + if let Err(e) = self.apply_snapshot( + ready.snapshot(), + write_task, + ctx.snap_mgr.clone(), + ctx.tablet_factory.clone(), + ) { + error!(self.logger(),"failed to apply snapshot";"error" => ?e) + } + } let entry_storage = self.entry_storage_mut(); if !ready.entries().is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index ad836ed2455..32e8a3f8ff8 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -28,19 +28,22 @@ use std::{ }, }; -use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_serverpb::{RaftSnapshotData, RegionLocalState}; +use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; use protobuf::Message; use raft::eraftpb::Snapshot; -use raftstore::store::{metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask}; +use raftstore::store::{ + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, + TabletSnapManager, Transport, WriteTask, +}; use slog::{error, info, warn}; -use tikv_util::{box_try, worker::Scheduler}; +use tikv_util::{box_err, box_try, worker::Scheduler}; use crate::{ fsm::ApplyResReporter, raft::{Apply, Peer, Storage}, router::{ApplyTask, PeerTick}, - Result, + Result, StoreContext, }; #[derive(Debug)] @@ -115,6 +118,25 @@ impl Peer { self.set_has_ready(); } } + + pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { + let persisted_index = 
self.raft_group().raft.raft_log.persisted; + let first_index = self.storage().entry_storage().first_index(); + if first_index == persisted_index + 1 { + let region_id = self.region_id(); + let tablet = ctx + .tablet_factory + .open_tablet(region_id, Some(persisted_index), OpenOptions::default()) + .unwrap(); + self.tablet_mut().set(tablet); + self.schedule_apply_fsm(ctx); + self.storage_mut().on_applied_snapshot(); + self.raft_group_mut().advance_apply_to(persisted_index); + self.read_progress_mut() + .update_applied_core(persisted_index); + info!(self.logger, "apply tablet snapshot completely"); + } + } } impl Apply { @@ -313,4 +335,69 @@ impl Storage { *snap_state = SnapState::Generated(snap); true } + + pub fn on_applied_snapshot(&mut self) { + let mut entry = self.entry_storage_mut(); + let term = entry.truncated_term(); + let index = entry.truncated_index(); + entry.set_applied_term(term); + entry.apply_state_mut().set_applied_index(index); + self.region_state_mut().set_tablet_index(index); + } + + pub fn apply_snapshot( + &mut self, + snap: &Snapshot, + task: &mut WriteTask, + snap_mgr: TabletSnapManager, + tablet_factory: Arc>, + ) -> Result<()> { + let region_id = self.region().get_id(); + let peer_id = self.peer().get_id(); + info!( + self.logger(), + "begin to apply snapshot"; + ); + + let mut snap_data = RaftSnapshotData::default(); + snap_data.merge_from_bytes(snap.get_data())?; + let region = snap_data.take_region(); + if region.get_id() != region_id { + return Err(box_err!( + "mismatch region id {}!={}", + region_id, + region.get_id() + )); + } + + let last_index = snap.get_metadata().get_index(); + let last_term = snap.get_metadata().get_term(); + self.region_state_mut().set_state(PeerState::Normal); + self.region_state_mut().set_region(region); + self.entry_storage_mut() + .raft_state_mut() + .set_last_index(last_index); + self.entry_storage_mut().set_truncated_index(last_index); + self.entry_storage_mut().set_truncated_term(last_term); + 
self.entry_storage_mut().set_last_term(last_term); + + let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); + let mut path = snap_mgr.final_recv_path(&key); + let logger = self.logger().clone(); + // The snapshot require no additional processing such as ingest them to DB, but + // it should load it into the factory after it persisted. + let hook = move || { + if let Err(e) = tablet_factory.load_tablet(path.as_path(), region_id, last_index) { + panic!( + "{:?} failed to load tablet, path: {}, {:?}", + logger.list(), + path.display(), + e + ); + } + }; + task.persisted_cb = (Some(Box::new(hook))); + task.has_snapshot = true; + Ok(()) + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index d2abb6818d8..b3ad56af4fd 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -15,7 +15,9 @@ use raft::{ eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{util, EntryStorage, ReadTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use raftstore::store::{ + util, EntryStorage, ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, +}; use slog::{info, o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; @@ -282,6 +284,11 @@ impl Storage { }) } + #[inline] + pub fn region_state_mut(&mut self) -> &mut RegionLocalState { + &mut self.region_state + } + #[inline] pub fn raft_state(&self) -> &RaftLocalState { self.entry_storage.raft_state() @@ -413,8 +420,8 @@ mod tests { }; use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; use raftstore::store::{ - AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, TabletSnapKey, - TabletSnapManager, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, + TabletSnapKey, TabletSnapManager, RAFT_INIT_LOG_INDEX, 
RAFT_INIT_LOG_TERM, }; use slog::o; use tempfile::TempDir; @@ -490,6 +497,57 @@ mod tests { assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); } + #[test] + fn test_apply_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); + mgr.init().unwrap(); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path().join("tablet").as_path(), + ops, + cf_opts, + )); + let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let mut s = Storage::new(4, 6, raft_engine.clone(), sched, &logger.clone()) + .unwrap() + .unwrap(); + + let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); + let mut task = WriteTask::new(region.get_id(), 5, 0); + s.apply_snapshot(&snapshot, &mut task, mgr, factory) + .unwrap(); + + // It can be set before load tablet. + assert_eq!(PeerState::Normal, s.region_state().get_state()); + assert_eq!(10, s.entry_storage().truncated_index()); + assert_eq!(1, s.entry_storage().truncated_term()); + assert_eq!(1, s.entry_storage().last_term()); + assert_eq!(10, s.entry_storage().raft_state().last_index); + // This index can't be set before load tablet. 
+ assert_ne!(10, s.entry_storage().applied_index()); + assert_ne!(1, s.entry_storage().applied_term()); + assert_ne!(10, s.region_state().get_tablet_index()); + assert!(task.persisted_cb.is_some()); + + s.on_applied_snapshot(); + assert_eq!(10, s.entry_storage().applied_index()); + assert_eq!(1, s.entry_storage().applied_term()); + assert_eq!(10, s.region_state().get_tablet_index()); + } + #[test] fn test_storage_create_snapshot() { let region = new_region(); @@ -553,7 +611,7 @@ mod tests { assert_eq!(snap.get_metadata().get_term(), 0); assert_eq!(snap.get_data().is_empty(), false); let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); - let checkpointer_path = mgr.get_final_path_for_gen(&snap_key); + let checkpointer_path = mgr.tablet_gen_path(&snap_key); assert!(checkpointer_path.exists()); // Test cancel snapshot diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index ef1f7411ac9..24184233117 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -28,9 +28,10 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use pd_client::RpcClient; +use raft::eraftpb::MessageType; use raftstore::store::{ region_meta::{RegionLocalState, RegionMeta}, - Config, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + Config, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, }; use raftstore_v2::{ create_store_batch_system, @@ -198,7 +199,7 @@ impl RunningState { concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, logger: &Logger, - ) -> (TestRouter, Self) { + ) -> (TestRouter, TabletSnapManager, Self) { let cf_opts = ALL_CFS .iter() .copied() @@ -240,6 +241,7 @@ impl RunningState { let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); + snap_mgr.init().unwrap(); system .start( 
store_id, @@ -250,7 +252,7 @@ impl RunningState { pd_client.clone(), router.store_router(), store_meta.clone(), - snap_mgr, + snap_mgr.clone(), concurrency_manager, causal_ts_provider, ) @@ -265,7 +267,7 @@ impl RunningState { transport, store_meta, }; - (TestRouter(router), state) + (TestRouter(router), snap_mgr, state) } } @@ -280,23 +282,24 @@ pub struct TestNode { path: TempDir, running_state: Option, logger: Logger, + snap_mgr: Option, } impl TestNode { fn with_pd(pd_server: &test_pd::Server, logger: Logger) -> TestNode { let pd_client = Arc::new(test_pd::util::new_client(pd_server.bind_addrs(), None)); let path = TempDir::new().unwrap(); - TestNode { pd_client, path, running_state: None, logger, + snap_mgr: None, } } fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, state) = RunningState::new( + let (router, snap_mgr, state) = RunningState::new( &self.pd_client, self.path.path(), cfg, @@ -306,6 +309,7 @@ impl TestNode { &self.logger, ); self.running_state = Some(state); + self.snap_mgr = Some(snap_mgr); router } @@ -336,6 +340,10 @@ impl TestNode { self.running_state.as_ref() } + pub fn snap_mgr(&self) -> Option<&TabletSnapManager> { + self.snap_mgr.as_ref() + } + pub fn id(&self) -> u64 { self.running_state().unwrap().store_id } @@ -485,6 +493,33 @@ impl Cluster { continue; } }; + // Simulate already received the snapshot. 
+ if msg.get_message().get_msg_type() == MessageType::MsgSnapshot { + let from_offset = match self + .nodes + .iter() + .position(|n| n.id() == msg.get_from_peer().get_store_id()) + { + Some(offset) => offset, + None => { + debug!(self.logger, "failed to find snapshot source node"; "message" => ?msg); + continue; + } + }; + let key = TabletSnapKey::new( + region_id, + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot().get_metadata().get_term(), + msg.get_message().get_snapshot().get_metadata().get_index(), + ); + let from_snap_mgr = self.node(from_offset).snap_mgr().unwrap(); + let to_snap_mgr = self.node(offset).snap_mgr().unwrap(); + let gen_path = from_snap_mgr.tablet_gen_path(&key); + let recv_path = to_snap_mgr.final_recv_path(&key); + assert!(gen_path.exists()); + std::fs::rename(gen_path, recv_path.clone()).unwrap(); + assert!(recv_path.exists()); + } regions.insert(msg.get_region_id()); if let Err(e) = self.routers[offset].send_raft_message(msg) { debug!(self.logger, "failed to send raft message"; "err" => ?e); diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index f9479786a7b..558962f8ef6 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -1,9 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{self, time::Duration}; -use kvproto::raft_cmdpb::AdminCmdType; +use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use kvproto::raft_cmdpb::{AdminCmdType, CmdType, Request}; use raft::prelude::ConfChangeType; +use raftstore_v2::router::{PeerMsg, PeerTick}; use tikv_util::store::new_learner_peer; use crate::cluster::Cluster; @@ -11,6 +13,7 @@ use crate::cluster::Cluster; #[test] fn test_simple_change() { let cluster = Cluster::with_node_count(2, None); + let region_id = 2; let router0 = cluster.router(0); let mut req = router0.new_request_for(2); let admin_req = req.mut_admin_request(); @@ -29,6 +32,7 @@ fn test_simple_change() { let meta = router0 .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); + let match_index = meta.raft_apply.applied_index; assert_eq!(meta.region_state.epoch.version, epoch.get_version()); assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); @@ -46,6 +50,38 @@ fn test_simple_change() { meta.raft_status.soft_state.leader_id, req.get_header().get_peer().get_id() ); + // Trigger the raft tick to replica the log to the learner and execute the + // snapshot task. 
+ router0 + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + + // write one kv after snapshot + let (key, val) = (b"key", b"value"); + let mut write_req = router0.new_request_for(region_id); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(key.to_vec()); + put_req.mut_put().set_value(val.to_vec()); + write_req.mut_requests().push(put_req); + let (msg, _) = PeerMsg::raft_command(write_req.clone()); + router0.send(region_id, msg).unwrap(); + std::thread::sleep(Duration::from_millis(1000)); + cluster.dispatch(region_id, vec![]); + + let meta = router1 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + // the learner truncated index muse be equal the leader applied index and can + // read the new written kv. + assert_eq!(match_index, meta.raft_apply.truncated_state.index); + assert!(meta.raft_apply.applied_index >= match_index); + let tablet_factory = cluster.node(1).tablet_factory(); + let tablet = tablet_factory + .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); + assert_eq!(tablet.get_value(key).unwrap().unwrap(), val); req.mut_header() .mut_region_epoch() diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 9e0215ca9c1..5dc01b40ef3 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -120,7 +120,7 @@ impl ReadRunner { } fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { - let checkpointer_path = self.snap_mgr().get_final_path_for_gen(snap_key); + let checkpointer_path = self.snap_mgr().tablet_gen_path(snap_key); if checkpointer_path.as_path().exists() { // Remove the old checkpoint directly. 
std::fs::remove_dir_all(checkpointer_path.as_path())?; diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index e534a17fad1..354a796c99c 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -169,12 +169,15 @@ where ready_number: u64, pub send_time: Instant, pub raft_wb: Option, + // called after writing to kvdb and raftdb. + pub persisted_cb: Option>, pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, pub extra_write: ExtraWrite, pub messages: Vec, pub trackers: Vec, + pub has_snapshot: bool, } impl WriteTask @@ -195,6 +198,8 @@ where extra_write: ExtraWrite::None, messages: vec![], trackers: vec![], + persisted_cb: None, + has_snapshot: false, } } @@ -361,6 +366,7 @@ where pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, + pub persisted_cbs: Vec>, // region_id -> (peer_id, ready_number) pub readies: HashMap, } @@ -377,6 +383,7 @@ where extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], + persisted_cbs: vec![], readies: HashMap::default(), } } @@ -430,7 +437,9 @@ where ); } } - + if let Some(v) = task.persisted_cb.take() { + self.persisted_cbs.push(v); + }; self.tasks.push(task); } @@ -511,6 +520,12 @@ where } } + fn after_write_all(&mut self) { + for hook in mem::take(&mut self.persisted_cbs) { + hook(); + } + } + fn after_write_to_raft_db(&mut self, metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { let now = std::time::Instant::now(); @@ -706,10 +721,8 @@ where write_kv_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); } - self.batch.after_write_to_kv_db(&self.metrics); } - fail_point!("raft_between_save"); let mut write_raft_time = 0f64; @@ -746,6 +759,8 @@ where self.batch.after_write_to_raft_db(&self.metrics); + self.batch.after_write_all(); + fail_point!("raft_before_follower_send"); 
let mut now = Instant::now(); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 705e2a776fa..c6278c890f7 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -958,6 +958,16 @@ impl EntryStorage { } } + #[inline] + pub fn set_truncated_index(&mut self, index: u64) { + self.apply_state.mut_truncated_state().set_index(index) + } + + #[inline] + pub fn set_truncated_term(&mut self, term: u64) { + self.apply_state.mut_truncated_state().set_term(term) + } + #[inline] pub fn first_index(&self) -> u64 { first_index(&self.apply_state) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a9f50d61edb..19b9622657d 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1956,22 +1956,23 @@ impl TabletSnapManager { Ok(()) } - pub fn get_final_path_for_gen(&self, key: &TabletSnapKey) -> PathBuf { + pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); PathBuf::from(&self.base).join(prefix) } - pub fn get_final_path_for_recv(&self, key: &TabletSnapKey) -> PathBuf { + pub fn final_recv_path(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}", SNAP_REV_PREFIX, key); PathBuf::from(&self.base).join(prefix) } - pub fn get_tmp_path_for_recv(&self, key: &TabletSnapKey) -> PathBuf { + + pub fn tmp_recv_path(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}{}", SNAP_REV_PREFIX, key, TMP_FILE_SUFFIX); PathBuf::from(&self.base).join(prefix) } pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { - let path = self.get_final_path_for_gen(key); + let path = self.tablet_gen_path(key); if path.exists() && let Err(e) = fs::remove_dir_all(path.as_path()) { error!( "delete snapshot failed"; diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 
cbda159a83e..837ec294fce 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -112,7 +112,7 @@ async fn send_snap_files( key: TabletSnapKey, limiter: Limiter, ) -> Result { - let path = mgr.get_final_path_for_gen(&key); + let path = mgr.tablet_gen_path(&key); info!("begin to send snapshot file";"snap_key" => %key); let files = fs::read_dir(&path)? .map(|f| Ok(f?.path())) @@ -236,7 +236,7 @@ async fn recv_snap_files( .ok_or_else(|| Error::Other("empty gRPC stream".into()))?; let context = RecvTabletSnapContext::new(head)?; let chunk_size = context.chunk_size; - let path = snap_mgr.get_tmp_path_for_recv(&context.key); + let path = snap_mgr.tmp_recv_path(&context.key); info!("begin to receive tablet snapshot files"; "file" => %path.display()); fs::create_dir_all(&path)?; let _with_io_type = WithIoType::new(context.io_type); @@ -274,7 +274,7 @@ async fn recv_snap_files( f.sync_data()?; } info!("received all tablet snapshot file"; "snap_key" => %context.key); - let final_path = snap_mgr.get_final_path_for_recv(&context.key); + let final_path = snap_mgr.final_recv_path(&context.key); fs::rename(&path, final_path)?; Ok(context) } @@ -514,7 +514,7 @@ mod tests { let send_path = TempDir::new().unwrap(); let send_snap_mgr = TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()); - let snap_path = send_snap_mgr.get_final_path_for_gen(&snap_key); + let snap_path = send_snap_mgr.tablet_gen_path(&snap_key); create_dir_all(snap_path.as_path()).unwrap(); // send file should skip directory create_dir_all(snap_path.join("dir")).unwrap(); @@ -545,7 +545,7 @@ mod tests { .unwrap(); let stream = rx.map(|x: (SnapshotChunk, WriteFlags)| Ok(x.0)); - let final_path = recv_snap_manager.get_final_path_for_recv(&snap_key); + let final_path = recv_snap_manager.final_recv_path(&snap_key); let r = block_on(recv_snap_files(recv_snap_manager, stream, limiter)).unwrap(); assert_eq!(r.key, snap_key); std::thread::sleep(std::time::Duration::from_secs(1)); 
From 531f7a00f75793ab246f89fb7ee597e097d58494 Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Fri, 25 Nov 2022 17:25:59 +0800 Subject: [PATCH 362/676] storage: skip accumulated locks on non-existing keys (#13820) ref tikv/tikv#13694 For non-existing keys, we will record last_change_ts as 0 and set a non-zero versions_to_last_change. So, if we encounter such a write record when reading, we can quickly know this key does not exist. Signed-off-by: Yilin Chen Co-authored-by: Ti Chi Robot --- components/txn_types/src/lock.rs | 10 +++-- components/txn_types/src/write.rs | 15 ++++--- src/storage/mod.rs | 2 +- src/storage/mvcc/reader/point_getter.rs | 27 +++++++++++- src/storage/mvcc/reader/reader.rs | 42 +++++++++++++++++-- src/storage/mvcc/reader/scanner/forward.rs | 24 +++++++---- .../txn/actions/acquire_pessimistic_lock.rs | 20 ++++++--- src/storage/txn/actions/prewrite.rs | 17 ++++++++ 8 files changed, 128 insertions(+), 29 deletions(-) diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 040487388f9..c8e37823bc4 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -91,7 +91,9 @@ pub struct Lock { /// The commit TS of the latest PUT/DELETE record pub last_change_ts: TimeStamp, /// The number of versions that need skipping from the latest version to - /// find the latest PUT/DELETE record + /// find the latest PUT/DELETE record. + /// If versions_to_last_change > 0 but last_change_ts == 0, the key does not + /// have a PUT/DELETE record. pub versions_to_last_change: u64, /// The source of this txn. 
It is used by ticdc, if the value is 0 ticdc /// will sync the kv change event to downstream, if it is not 0, ticdc @@ -228,7 +230,7 @@ impl Lock { b.encode_u64(ts.into_inner()).unwrap(); } } - if !self.last_change_ts.is_zero() { + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { b.push(LAST_CHANGE_PREFIX); b.encode_u64(self.last_change_ts.into_inner()).unwrap(); b.encode_var_u64(self.versions_to_last_change).unwrap(); @@ -266,7 +268,7 @@ impl Lock { if !self.rollback_ts.is_empty() { size += 1 + MAX_VAR_U64_LEN + size_of::() * self.rollback_ts.len(); } - if !self.last_change_ts.is_zero() { + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { size += 1 + size_of::() + MAX_VAR_U64_LEN; } if self.txn_source != 0 { @@ -773,7 +775,7 @@ mod tests { 16, 8.into(), ) - .set_last_change(4.into(), 2), + .set_last_change(0.into(), 2), Lock::new( LockType::Lock, b"pk".to_vec(), diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 52777e5e4b2..1a20518e423 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -281,11 +281,11 @@ impl Write { match self.write_type { WriteType::Put | WriteType::Delete => (commit_ts, 1), WriteType::Lock | WriteType::Rollback => { - // If `last_change_ts` is zero, do not set `last_change_ts` to indicate we don't - // know where is the last change. + // If neither `last_change_ts` nor `versions_to_last_change` exists, do not + // set `last_change_ts` to indicate we don't know where is the last change. // This should not happen if data is written in new version TiKV. If we hope to // support data from old TiKV, consider iterating to the last change to find it. 
- if !self.last_change_ts.is_zero() { + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { (self.last_change_ts, self.versions_to_last_change + 1) } else { (TimeStamp::zero(), 0) @@ -320,7 +320,9 @@ pub struct WriteRef<'a> { /// It only exists if this is a LOCK/ROLLBACK record. pub last_change_ts: TimeStamp, /// The number of versions that need skipping from this record - /// to find the latest PUT/DELETE record + /// to find the latest PUT/DELETE record. + /// If versions_to_last_change > 0 but last_change_ts == 0, the key does not + /// have a PUT/DELETE record before this write record. pub versions_to_last_change: u64, /// The source of this txn. pub txn_source: u64, @@ -411,7 +413,7 @@ impl WriteRef<'_> { b.push(GC_FENCE_PREFIX); b.encode_u64(ts.into_inner()).unwrap(); } - if !self.last_change_ts.is_zero() { + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { b.push(LAST_CHANGE_PREFIX); b.encode_u64(self.last_change_ts.into_inner()).unwrap(); b.encode_var_u64(self.versions_to_last_change).unwrap(); @@ -432,7 +434,7 @@ impl WriteRef<'_> { if self.gc_fence.is_some() { size += 1 + size_of::(); } - if !self.last_change_ts.is_zero() { + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { size += 1 + size_of::() + MAX_VAR_U64_LEN; } if self.txn_source != 0 { @@ -547,6 +549,7 @@ mod tests { Write::new(WriteType::Put, 456.into(), Some(b"short_value".to_vec())) .set_overlapped_rollback(true, Some(421397468076048385.into())), Write::new(WriteType::Lock, 456.into(), None).set_last_change(345.into(), 11), + Write::new(WriteType::Lock, 456.into(), None).set_last_change(0.into(), 11), Write::new(WriteType::Lock, 456.into(), None).set_txn_source(1), ]; for (i, write) in writes.drain(..).enumerate() { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 55d8575101c..3e55d81f5d2 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -9767,7 +9767,7 @@ mod tests { for_update_ts: 10.into(), 
min_commit_ts: 11.into(), last_change_ts: TimeStamp::zero(), - versions_to_last_change: 0, + versions_to_last_change: 1, }, false ) diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 651762aa88e..2f215986ca9 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -315,8 +315,10 @@ impl PointGetter { return Ok(None); } WriteType::Lock | WriteType::Rollback => { - if write.versions_to_last_change < SEEK_BOUND || write.last_change_ts.is_zero() - { + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + return Ok(None); + } + if write.versions_to_last_change < SEEK_BOUND { // Continue iterate next `write`. } else { let commit_ts = write.last_change_ts; @@ -1266,4 +1268,25 @@ mod tests { must_get_value(&mut batch_getter_ok, key4, val4); must_get_value(&mut batch_getter_ok, key5, val5); } + + #[test] + fn test_point_get_non_exist_skip_lock() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k"; + + // Write enough LOCK recrods + for start_ts in (1..30).into_iter().step_by(2) { + must_prewrite_lock(&mut engine, k, k, start_ts); + must_commit(&mut engine, k, start_ts, start_ts + 1); + } + + let mut getter = new_point_getter(&mut engine, 40.into()); + must_get_none(&mut getter, k); + let s = getter.take_statistics(); + // We can know the key doesn't exist without skipping all these locks according + // to last_change_ts and versions_to_last_change. 
+ assert_eq!(s.write.seek, 1); + assert_eq!(s.write.next, 0); + assert_eq!(s.write.get, 0); + } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index c7cb9194068..752a8f0d00a 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -384,9 +384,10 @@ impl MvccReader { return Ok(None); } WriteType::Lock | WriteType::Rollback => { - if write.versions_to_last_change < SEEK_BOUND - || write.last_change_ts.is_zero() - { + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + return Ok(None); + } + if write.versions_to_last_change < SEEK_BOUND { ts = commit_ts.prev(); } else { let commit_ts = write.last_change_ts; @@ -1679,6 +1680,10 @@ pub mod tests { for_update_ts, 0, TimeStamp::zero(), + ) + .set_last_change( + TimeStamp::zero(), + (lock_type == LockType::Lock || lock_type == LockType::Pessimistic) as u64, ), ) }) @@ -2615,4 +2620,35 @@ pub mod tests { assert_eq!(reader.statistics.write.next, 0); assert_eq!(reader.statistics.write.get, 1); } + + #[test] + fn test_get_write_not_exist_skip_lock() { + let path = tempfile::Builder::new() + .prefix("_test_storage_mvcc_reader_get_write_not_exist_skip_lock") + .tempdir() + .unwrap(); + let path = path.path().to_str().unwrap(); + let region = make_region(1, vec![], vec![]); + let db = open_db(path, true); + let mut engine = RegionEngine::new(&db, ®ion); + let k = b"k"; + + // Write enough LOCK recrods + for start_ts in (6..30).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + + let snap = RegionSnapshot::::from_raw(db, region); + let mut reader = MvccReader::new(snap, None, false); + + let res = reader + .get_write_with_commit_ts(&Key::from_raw(k), 40.into(), None) + .unwrap(); + // We can know the key doesn't exist without skipping all these locks according + // to last_change_ts and versions_to_last_change. 
+ assert!(res.is_none()); + assert_eq!(reader.statistics.write.seek, 1); + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 0); + } } diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index 6672842fab9..8828033c8a1 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -472,8 +472,10 @@ impl ScanPolicy for LatestKvPolicy { } WriteType::Delete => break None, WriteType::Lock | WriteType::Rollback => { - if write.versions_to_last_change < SEEK_BOUND || write.last_change_ts.is_zero() - { + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + break None; + } + if write.versions_to_last_change < SEEK_BOUND { // Continue iterate next `write`. cursors.write.next(&mut statistics.write); } else { @@ -1619,14 +1621,16 @@ mod latest_kv_tests { must_commit(&mut engine, b"k1", 6, 8); must_prewrite_put(&mut engine, b"k2", b"v21", b"k2", 2); must_commit(&mut engine, b"k2", 2, 6); - must_prewrite_put(&mut engine, b"k3", b"v31", b"k3", 3); - must_commit(&mut engine, b"k3", 3, 7); + must_prewrite_put(&mut engine, b"k4", b"v41", b"k4", 3); + must_commit(&mut engine, b"k4", 3, 7); for start_ts in (10..30).into_iter().step_by(2) { must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); must_commit(&mut engine, b"k1", start_ts, start_ts + 1); must_prewrite_lock(&mut engine, b"k3", b"k1", start_ts); must_commit(&mut engine, b"k3", start_ts, start_ts + 1); + must_prewrite_lock(&mut engine, b"k4", b"k1", start_ts); + must_commit(&mut engine, b"k4", start_ts, start_ts + 1); } must_prewrite_put(&mut engine, b"k1", b"v13", b"k1", 40); @@ -1652,7 +1656,11 @@ mod latest_kv_tests { // k3 | 27 | LOCK | // k3 | ... | LOCK | // k3 | 11 | LOCK | - // k3 | 7 | PUT | v31 + // k4 | 29 | LOCK | + // k4 | 27 | LOCK | + // k4 | ... 
| LOCK | + // k4 | 11 | LOCK | + // k4 | 7 | PUT | v41 let snapshot = engine.snapshot(Default::default()).unwrap(); let mut scanner = ScannerBuilder::new(snapshot, 35.into()) @@ -1676,11 +1684,11 @@ mod latest_kv_tests { assert_eq!( scanner.next().unwrap(), - Some((Key::from_raw(b"k3"), b"v31".to_vec())) + Some((Key::from_raw(b"k4"), b"v41".to_vec())) ); let stats = scanner.take_statistics(); - assert_le!(stats.write.next, 2); // skip k2@6, k3@47 - assert_eq!(stats.write.seek, 1); // seek k3@7 + assert_le!(stats.write.next, 1 + SEEK_BOUND as usize); // skip k2@6, near_seek to k4 (8 times next) + assert_eq!(stats.write.seek, 2); // seek k4, k4@7 } } diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index db4c2485d09..8e7c4d95118 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -192,8 +192,7 @@ pub fn acquire_pessimistic_lock( // Following seek_write read the previous write. let (prev_write_loaded, mut prev_write) = (true, None); - let mut last_change_ts = TimeStamp::zero(); - let mut versions_to_last_change = 0; + let (mut last_change_ts, mut versions_to_last_change); if let Some((commit_ts, write)) = reader.seek_write(&key, TimeStamp::max())? { // Find a previous write. 
if need_old_value { @@ -265,9 +264,7 @@ pub fn acquire_pessimistic_lock( check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; } - if tls_can_enable(LAST_CHANGE_TS) { - (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); - } + (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); // Load value if locked_with_conflict, so that when the client (TiDB) need to // read the value during statement retry, it will be possible to read the value @@ -296,6 +293,13 @@ pub fn acquire_pessimistic_lock( } }; } + } else { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. + (last_change_ts, versions_to_last_change) = (TimeStamp::zero(), 1); + } + if !tls_can_enable(LAST_CHANGE_TS) { + (last_change_ts, versions_to_last_change) = (TimeStamp::zero(), 0); } let old_value = load_old_value( @@ -1731,6 +1735,12 @@ pub mod tests { assert_eq!(lock.last_change_ts, 40.into()); assert_eq!(lock.versions_to_last_change, 6); pessimistic_rollback::tests::must_success(&mut engine, key, 140, 140); + + // Lock on a key with no write record + must_succeed(&mut engine, b"k2", b"k2", 150, 150); + let lock = must_pessimistic_locked(&mut engine, b"k2", 150, 150); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 1); } #[test] diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 46c9774dd52..f2de9df0004 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -439,6 +439,12 @@ impl<'a> PrewriteMutation<'a> { return Ok(Some((write, commit_ts))); } + // If seek_ts is max and it goes here, there is no write record for this key. + if seek_ts == TimeStamp::max() { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. 
+ (self.last_change_ts, self.versions_to_last_change) = (TimeStamp::zero(), 1); + } Ok(None) } @@ -750,6 +756,10 @@ fn amend_pessimistic_lock( } (mutation.last_change_ts, mutation.versions_to_last_change) = write.next_last_change_info(*commit_ts); + } else { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. + (mutation.last_change_ts, mutation.versions_to_last_change) = (TimeStamp::zero(), 1); } // Used pipelined pessimistic lock acquiring in this txn but failed // Luckily no other txn modified this lock, amend it by treat it as optimistic @@ -2241,6 +2251,13 @@ pub mod tests { let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); let key = b"k"; + // Latest version does not exist + prewrite_func(&mut engine, LockType::Lock, 2); + let lock = must_locked(&mut engine, key, 2); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 2, false); + // Latest change ts should not be enabled on TiKV 6.4 let feature_gate = FeatureGate::default(); feature_gate.set_version("6.4.0").unwrap(); From c26a7cd6e1be2ff6fc25f138a788db0b6baf0b54 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 28 Nov 2022 13:46:00 +0800 Subject: [PATCH 363/676] storage: introduce hint_min_ts to speed up the flashback progress (#13842) ref tikv/tikv#13800 Introduce `hint_min_ts` during the flashback progress to only flashback those keys that have version changed as much as possible. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/cdc/src/old_value.rs | 4 +- components/tikv_kv/src/cursor.rs | 24 +- components/tikv_kv/src/lib.rs | 1 + src/storage/mvcc/reader/reader.rs | 344 +++--------------- src/storage/mvcc/reader/scanner/mod.rs | 6 +- .../txn/actions/flashback_to_version.rs | 59 +-- .../txn/commands/flashback_to_version.rs | 16 +- .../flashback_to_version_read_phase.rs | 20 +- 8 files changed, 128 insertions(+), 346 deletions(-) diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 37e2781b766..d91266c92c2 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::Deref; +use std::ops::{Bound, Deref}; use engine_traits::{ReadOptions, CF_DEFAULT, CF_WRITE}; use getset::CopyGetters; @@ -261,7 +261,7 @@ fn new_write_cursor_on_key(snapshot: &S, key: &Key) -> Cursor .range(Some(key.clone()), upper) // Use bloom filter to speed up seeking on a given prefix. .prefix_seek(true) - .hint_max_ts(Some(ts)) + .hint_max_ts(Some(Bound::Included(ts))) .build() .unwrap() } diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 2c9a071fbbb..576aa5cfa76 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -432,10 +432,10 @@ pub struct CursorBuilder<'a, S: Snapshot> { prefix_seek: bool, upper_bound: Option, lower_bound: Option, - // hint for we will only scan data with commit ts >= hint_min_ts - hint_min_ts: Option, - // hint for we will only scan data with commit ts <= hint_max_ts - hint_max_ts: Option, + // hint for we will only scan data with commit_ts >/>= hint_min_ts + hint_min_ts: Option>, + // hint for we will only scan data with commit_ts >, key_only: bool, max_skippable_internal_keys: u64, } @@ -506,8 +506,8 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { /// Default is empty. 
#[inline] #[must_use] - pub fn hint_min_ts(mut self, min_ts: Option) -> Self { - self.hint_min_ts = min_ts; + pub fn hint_min_ts(mut self, ts_bound: Option>) -> Self { + self.hint_min_ts = ts_bound; self } @@ -516,8 +516,8 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { /// Default is empty. #[inline] #[must_use] - pub fn hint_max_ts(mut self, max_ts: Option) -> Self { - self.hint_max_ts = max_ts; + pub fn hint_max_ts(mut self, ts_bound: Option>) -> Self { + self.hint_max_ts = ts_bound; self } @@ -550,11 +550,11 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { None }; let mut iter_opt = IterOptions::new(l_bound, u_bound, self.fill_cache); - if let Some(ts) = self.hint_min_ts { - iter_opt.set_hint_min_ts(Bound::Included(ts.into_inner())); + if let Some(ts_bound) = self.hint_min_ts { + iter_opt.set_hint_min_ts(ts_bound.map(TimeStamp::into_inner)); } - if let Some(ts) = self.hint_max_ts { - iter_opt.set_hint_max_ts(Bound::Included(ts.into_inner())); + if let Some(ts_bound) = self.hint_max_ts { + iter_opt.set_hint_max_ts(ts_bound.map(TimeStamp::into_inner)); } iter_opt.set_key_only(self.key_only); iter_opt.set_max_skippable_internal_keys(self.max_skippable_internal_keys); diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index b5f19832419..ac452fead37 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -5,6 +5,7 @@ //! [`Server`](crate::server::Server). The [`BTreeEngine`](kv::BTreeEngine) and //! [`RocksEngine`](RocksEngine) are used for testing only. +#![feature(bound_map)] #![feature(min_specialization)] #![feature(type_alias_impl_trait)] diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 752a8f0d00a..8e92ffd6be2 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1,6 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] +use std::ops::Bound; + use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::{ errorpb::{self, EpochNotMatch, StaleCommand}, @@ -127,6 +129,8 @@ pub struct MvccReader { lower_bound: Option, upper_bound: Option, + hint_min_ts: Option>, + /// None means following operations are performed on a single user key, /// i.e., different versions of the same key. It can use prefix seek to /// speed up reads from the write-cf. @@ -154,6 +158,7 @@ impl MvccReader { write_cursor: None, lower_bound: None, upper_bound: None, + hint_min_ts: None, scan_mode, current_key: None, fill_cache, @@ -171,6 +176,7 @@ impl MvccReader { write_cursor: None, lower_bound: None, upper_bound: None, + hint_min_ts: None, scan_mode, current_key: None, fill_cache: !ctx.get_not_fill_cache(), @@ -471,6 +477,8 @@ impl MvccReader { .prefix_seek(self.scan_mode.is_none()) .scan_mode(self.get_scan_mode(true)) .range(self.lower_bound.clone(), self.upper_bound.clone()) + // `hint_min_ts` filters data by the `commit_ts`. + .hint_min_ts(self.hint_min_ts) .build()?; self.write_cursor = Some(cursor); } @@ -559,29 +567,23 @@ impl MvccReader { Ok((locks, has_remain)) } - /// Scan the writes to get all the latest keys with their corresponding - /// PUT/DELETE write records at the given version, if the version is not - /// specified, it will scan the latest version for each key, if the key - /// does not exist or is not visible at that point, an `Option::None` will - /// be placed. The return type is: - /// * `(Vec<(key, Option)>, has_remain)`. - /// - `key` is the encoded key without commit ts. - /// - `write` is the PUT/DELETE write record at the given version. - /// - `has_remain` indicates whether there MAY be remaining writes that + /// Scan the writes to get all the latest user keys. The return type is: + /// * `(Vec, has_remain)`. + /// - `key` is the encoded user key without `commit_ts`. 
+ /// - `has_remain` indicates whether there MAY be remaining user keys that /// can be scanned. /// /// This function is mainly used by /// `txn::commands::FlashbackToVersionReadPhase` /// and `txn::commands::FlashbackToVersion` to achieve the MVCC /// overwriting. - pub fn scan_writes( + pub fn scan_latest_user_keys( &mut self, start: Option<&Key>, end: Option<&Key>, - version: Option, filter: F, limit: usize, - ) -> Result<(Vec<(Key, Option)>, bool)> + ) -> Result<(Vec, bool)> where F: Fn(&Key /* user key */, TimeStamp /* latest `commit_ts` */) -> bool, { @@ -594,10 +596,8 @@ impl MvccReader { if !ok { return Ok((vec![], false)); } - // Use the latest version as the default value if the version is not given. - let version = version.unwrap_or_else(TimeStamp::max); - let mut cur_key = None; - let mut key_writes = Vec::with_capacity(limit); + let mut cur_user_key = None; + let mut keys = Vec::with_capacity(limit); let mut has_remain = false; while cursor.valid()? { let key = Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); @@ -611,62 +611,28 @@ impl MvccReader { let user_key = key.truncate_ts()?; // To make sure we only check each unique user key once and the filter returns // true. - let is_same_user_key = cur_key.as_ref() == Some(&user_key); + let is_same_user_key = cur_user_key.as_ref() == Some(&user_key); if !is_same_user_key { - cur_key = Some(user_key.clone()); + cur_user_key = Some(user_key.clone()); } if is_same_user_key || !filter(&user_key, commit_ts) { cursor.next(&mut self.statistics.write); continue; } - - let mut write = None; - let version_key = user_key.clone().append_ts(version); - // Try to seek to the key with the specified version. - if cursor.near_seek(&version_key, &mut self.statistics.write)? - && Key::is_user_key_eq( - cursor.key(&mut self.statistics.write), - user_key.as_encoded(), - ) - { - while cursor.valid()? 
{ - write = - Some(WriteRef::parse(cursor.value(&mut self.statistics.write))?.to_owned()); - // Move to the next key. - cursor.next(&mut self.statistics.write); - match write.as_ref().unwrap().write_type { - WriteType::Put | WriteType::Delete => { - break; - } - WriteType::Lock | WriteType::Rollback => { - // Only return the PUT/DELETE write record. - write = None; - // Reach the end. - if !cursor.valid()? { - break; - } - // Try to find the latest visible version before it. - let key = - Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); - // Could not find the visible version, current cursor is on the next - // key, so we set `cur_key` to `None`. - if key.truncate_ts()? != user_key { - cur_key = None; - break; - } - } - } - } - } - key_writes.push((user_key, write)); - if limit > 0 && key_writes.len() == limit { + keys.push(user_key.clone()); + if limit > 0 && keys.len() == limit { has_remain = true; break; } + // Seek once to skip all the writes of the same user key. + cursor.near_seek( + &user_key.append_ts(TimeStamp::zero()), + &mut self.statistics.write, + )?; } - self.statistics.write.processed_keys += key_writes.len(); - resource_metering::record_read_keys(key_writes.len() as u32); - Ok((key_writes, has_remain)) + self.statistics.write.processed_keys += keys.len(); + resource_metering::record_read_keys(keys.len() as u32); + Ok((keys, has_remain)) } pub fn scan_keys( @@ -778,6 +744,10 @@ impl MvccReader { self.lower_bound = lower; self.upper_bound = upper; } + + pub fn set_hint_min_ts(&mut self, ts_bound: Option>) { + self.hint_min_ts = ts_bound; + } } #[cfg(test)] @@ -1767,9 +1737,9 @@ pub mod tests { } #[test] - fn test_scan_writes() { + fn test_scan_latest_user_keys() { let path = tempfile::Builder::new() - .prefix("_test_storage_mvcc_reader_scan_writes") + .prefix("_test_storage_mvcc_reader_scan_latest_user_keys") .tempdir() .unwrap(); let path = path.path().to_str().unwrap(); @@ -1869,267 +1839,65 @@ pub mod tests { struct Case { 
start_key: Option, end_key: Option, - version: Option, limit: usize, - expect_res: Vec<(Key, Option)>, + expect_res: Vec, expect_is_remain: bool, } let cases = vec![ - // Get all latest writes with the unspecified version. - Case { - start_key: None, - end_key: None, - version: None, - limit: 4, - expect_res: vec![ - ( - Key::from_raw(b"k0"), - Some(Write::new( - WriteType::Put, - 999.into(), - Some(b"v0@999".to_vec()), - )), - ), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - Some(Write::new(WriteType::Put, 8.into(), Some(b"v3@8".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k0 is invisible at version 9. - Case { - start_key: None, - end_key: None, - version: Some(9), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - Some(Write::new(WriteType::Put, 8.into(), Some(b"v3@8".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k3 has an old version write at version 8. 
- Case { - start_key: None, - end_key: None, - version: Some(8), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(7), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(6), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k3 doesn't exist at version 5. - Case { - start_key: None, - end_key: None, - version: Some(5), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - (Key::from_raw(b"k3"), None), - ], - expect_is_remain: true, - }, + // Test the limit. 
Case { start_key: None, end_key: None, - version: Some(4), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - (Key::from_raw(b"k3"), None), - ], + limit: 1, + expect_res: vec![Key::from_raw(b"k0")], expect_is_remain: true, }, - // k1 and k2 have old version writes at version 3. Case { start_key: None, end_key: None, - version: Some(3), - limit: 4, + limit: 6, expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), - ), - (Key::from_raw(b"k3"), None), + Key::from_raw(b"k0"), + Key::from_raw(b"k1"), + Key::from_raw(b"k2"), + Key::from_raw(b"k3"), + Key::from_raw(b"k4"), ], - expect_is_remain: true, + expect_is_remain: false, }, + // Test the start/end key. Case { - start_key: None, + start_key: Some(Key::from_raw(b"k2")), end_key: None, - version: Some(2), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), None), - ( - Key::from_raw(b"k1"), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), - ), - (Key::from_raw(b"k3"), None), + Key::from_raw(b"k2"), + Key::from_raw(b"k3"), + Key::from_raw(b"k4"), ], - expect_is_remain: true, + expect_is_remain: false, }, - // All keys don't exist at version 1. Case { start_key: None, - end_key: None, - version: Some(1), + end_key: Some(Key::from_raw(b"k3")), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), None), - (Key::from_raw(b"k1"), None), - (Key::from_raw(b"k2"), None), - (Key::from_raw(b"k3"), None), - ], - expect_is_remain: true, - }, - // Test the limit. 
- Case { - start_key: None, - end_key: None, - version: Some(0), - limit: 1, - expect_res: vec![(Key::from_raw(b"k0"), None)], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(0), - limit: 6, - expect_res: vec![ - (Key::from_raw(b"k0"), None), - (Key::from_raw(b"k1"), None), - (Key::from_raw(b"k2"), None), - (Key::from_raw(b"k3"), None), - (Key::from_raw(b"k4"), None), + Key::from_raw(b"k0"), + Key::from_raw(b"k1"), + Key::from_raw(b"k2"), ], expect_is_remain: false, }, - // Test the invisible record. - Case { - start_key: Some(Key::from_raw(b"k4")), - end_key: None, - version: Some(10), - limit: 1, - expect_res: vec![(Key::from_raw(b"k4"), None)], - expect_is_remain: true, - }, ]; for (idx, case) in cases.iter().enumerate() { let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); let res = reader - .scan_writes( + .scan_latest_user_keys( case.start_key.as_ref(), case.end_key.as_ref(), - case.version.map(Into::into), |_, _| true, case.limit, ) diff --git a/src/storage/mvcc/reader/scanner/mod.rs b/src/storage/mvcc/reader/scanner/mod.rs index 664a4fed99e..5b87cca7f7a 100644 --- a/src/storage/mvcc/reader/scanner/mod.rs +++ b/src/storage/mvcc/reader/scanner/mod.rs @@ -4,6 +4,8 @@ mod backward; mod forward; +use std::ops::Bound; + use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::kvrpcpb::{ExtraOp, IsolationLevel}; use txn_types::{ @@ -330,8 +332,8 @@ impl ScannerConfig { .range(lower, upper) .fill_cache(self.fill_cache) .scan_mode(scan_mode) - .hint_min_ts(hint_min_ts) - .hint_max_ts(hint_max_ts) + .hint_min_ts(hint_min_ts.map(|ts| Bound::Included(ts))) + .hint_max_ts(hint_max_ts.map(|ts| Bound::Included(ts))) .build()?; Ok(cursor) } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 98e2e433632..71f50715a20 100644 --- 
a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -1,5 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::ops::Bound; + use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; use crate::storage::{ @@ -34,14 +36,17 @@ pub fn flashback_to_version_read_write( flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, statistics: &mut Statistics, -) -> TxnResult)>> { - // To flashback the data, we need to get all the latest keys first by scanning - // every unique key in `CF_WRITE` and to get its corresponding old MVCC write - // record if exists. - let result = reader.scan_writes( +) -> TxnResult> { + // Filter out the SST that does not have a newer version than + // `flashback_version` in `CF_WRITE`, i.e, whose latest `commit_ts` <= + // `flashback_version`. By doing this, we can only flashback those keys that + // have version changed since `flashback_version` as much as possible. + reader.set_hint_min_ts(Some(Bound::Excluded(flashback_version))); + // To flashback the data, we need to get all the latest visible keys first by + // scanning every unique key in `CF_WRITE`. + let keys_result = reader.scan_latest_user_keys( Some(&next_write_key), Some(end_key), - Some(flashback_version), |_, latest_commit_ts| { // There is no any other write could happen after the flashback begins. 
assert!(latest_commit_ts <= flashback_commit_ts); @@ -54,8 +59,8 @@ pub fn flashback_to_version_read_write( FLASHBACK_BATCH_SIZE, ); statistics.add(&reader.statistics); - let (key_old_writes, _) = result?; - Ok(key_old_writes) + let (keys, _) = keys_result?; + Ok(keys) } // To flashback the `CF_LOCK`, we need to delete all locks records whose @@ -87,20 +92,22 @@ pub fn flashback_to_version_lock( } // To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC -// record for each key in `self.keys` with its old value at `self.version`, +// record for each key in keys with its old value at `flashback_version`, // specifically, the flashback will have the following behavior: -// - If a key doesn't exist at `self.version`, it will be put a -// `WriteType::Delete`. -// - If a key exists at `self.version`, it will be put the exact same record -// in `CF_WRITE` and `CF_DEFAULT` with `self.commit_ts` and `self.start_ts`. +// - If a key doesn't exist or isn't invisible at `flashback_version`, it will +// be put a `WriteType::Delete`. +// - If a key exists and is visible at `flashback_version`, it will be put the +// exact same record in `CF_WRITE` and `CF_DEFAULT` with `self.commit_ts` +// and `self.start_ts`. 
pub fn flashback_to_version_write( txn: &mut MvccTxn, reader: &mut SnapshotReader, - key_old_writes: Vec<(Key, Option)>, - start_ts: TimeStamp, - commit_ts: TimeStamp, + keys: Vec, + flashback_version: TimeStamp, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, ) -> TxnResult> { - for (key, old_write) in key_old_writes { + for key in keys { #[cfg(feature = "failpoints")] { let should_skip = || { @@ -114,27 +121,32 @@ pub fn flashback_to_version_write( if txn.write_size() >= MAX_TXN_WRITE_SIZE { return Ok(Some(key.clone())); } + let old_write = reader.get_write(&key, flashback_version)?; let new_write = if let Some(old_write) = old_write { // If it's not a short value and it's a `WriteType::Put`, we should put the old // value in `CF_DEFAULT` with `self.start_ts` as well. if old_write.short_value.is_none() && old_write.write_type == WriteType::Put { txn.put_value( key.clone(), - start_ts, + flashback_start_ts, reader.load_data(&key, old_write.clone())?, ); } Write::new( old_write.write_type, - start_ts, + flashback_start_ts, old_write.short_value.clone(), ) } else { // If the old write doesn't exist, we should put a `WriteType::Delete` record to // delete the current key when needed. - Write::new(WriteType::Delete, start_ts, None) + Write::new(WriteType::Delete, flashback_start_ts, None) }; - txn.put_write(key.clone(), commit_ts, new_write.as_ref().to_bytes()); + txn.put_write( + key.clone(), + flashback_commit_ts, + new_write.as_ref().to_bytes(), + ); } Ok(None) } @@ -187,7 +199,7 @@ pub mod tests { let mut rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); // Flashback the writes. 
- let key_old_writes = flashback_to_version_read_write( + let keys = flashback_to_version_read_write( &mut reader, key, &next_key, @@ -202,7 +214,8 @@ pub mod tests { flashback_to_version_write( &mut txn, &mut snap_reader, - key_old_writes, + keys, + version, start_ts, commit_ts, ) diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index dabb6acfcc5..d53a3a5c3be 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -45,9 +45,7 @@ impl CommandExt for FlashbackToVersion { FlashbackToVersionState::ScanLock { key_locks, .. } => { latch::Lock::new(key_locks.iter().map(|(key, _)| key)) } - FlashbackToVersionState::ScanWrite { key_old_writes, .. } => { - latch::Lock::new(key_old_writes.iter().map(|(key, _)| key)) - } + FlashbackToVersionState::ScanWrite { keys, .. } => latch::Lock::new(keys.iter()), } } @@ -57,10 +55,9 @@ impl CommandExt for FlashbackToVersion { .iter() .map(|(key, _)| key.as_encoded().len()) .sum(), - FlashbackToVersionState::ScanWrite { key_old_writes, .. } => key_old_writes - .iter() - .map(|(key, _)| key.as_encoded().len()) - .sum(), + FlashbackToVersionState::ScanWrite { keys, .. } => { + keys.iter().map(|key| key.as_encoded().len()).sum() + } } } } @@ -86,12 +83,13 @@ impl WriteCommand for FlashbackToVersion { } FlashbackToVersionState::ScanWrite { ref mut next_write_key, - ref mut key_old_writes, + ref mut keys, } => { if let Some(new_next_write_key) = flashback_to_version_write( &mut txn, &mut reader, - mem::take(key_old_writes), + mem::take(keys), + self.version, self.start_ts, self.commit_ts, )? 
{ diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index b1a83a49ff8..b41506c320b 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use txn_types::{Key, Lock, TimeStamp, Write}; +use txn_types::{Key, Lock, TimeStamp}; use crate::storage::{ mvcc::MvccReader, @@ -24,7 +24,7 @@ pub enum FlashbackToVersionState { }, ScanWrite { next_write_key: Key, - key_old_writes: Vec<(Key, Option)>, + keys: Vec, }, } @@ -110,7 +110,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { read_again = true; FlashbackToVersionState::ScanWrite { next_write_key: self.start_key.clone(), - key_old_writes: Vec::new(), + keys: Vec::new(), } } else { assert!(!key_locks.is_empty()); @@ -128,7 +128,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { } } FlashbackToVersionState::ScanWrite { next_write_key, .. } => { - let mut key_old_writes = flashback_to_version_read_write( + let mut keys = flashback_to_version_read_write( &mut reader, next_write_key, &self.end_key, @@ -136,18 +136,18 @@ impl ReadCommand for FlashbackToVersionReadPhase { self.commit_ts, statistics, )?; - if key_old_writes.is_empty() { + if keys.is_empty() { // No more writes to flashback, just return. 
return Ok(ProcessResult::Res); } - tls_collect_keyread_histogram_vec(tag, key_old_writes.len() as f64); + tls_collect_keyread_histogram_vec(tag, keys.len() as f64); FlashbackToVersionState::ScanWrite { - next_write_key: if key_old_writes.len() > 1 { - key_old_writes.pop().map(|(key, _)| key).unwrap() + next_write_key: if keys.len() > 1 { + keys.pop().unwrap() } else { - key_old_writes.last().map(|(key, _)| key.clone()).unwrap() + keys.last().unwrap().clone() }, - key_old_writes, + keys, } } }; From bfe29655a2a1634c268a7fb3986892be7dc1f305 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Mon, 28 Nov 2022 17:54:00 +0800 Subject: [PATCH 364/676] scheduler: Support resuming after meeting lock for AcquirePessimisticLock requests (#13826) ref tikv/tikv#13298 Update scheduler's logic to support resuming a woken-up AcquirePessimisticLock request. It also includes changes to Latches to support deriving latches after a command released some locks (instead of releasing the latches and acquire them again). 
Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/test_raftstore/src/util.rs | 45 +- src/server/service/kv.rs | 58 +- src/storage/errors.rs | 47 +- src/storage/lock_manager/lock_wait_context.rs | 20 +- .../lock_manager/lock_waiting_queue.rs | 4 +- src/storage/mod.rs | 628 +++++++++++++++++- src/storage/txn/commands/mod.rs | 7 +- src/storage/txn/scheduler.rs | 381 +++++++++-- src/storage/types.rs | 42 ++ tests/integrations/server/kv_service.rs | 226 ++++++- tests/integrations/server/lock_manager.rs | 5 +- 12 files changed, 1358 insertions(+), 107 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 487d2712249..14c12716ee2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2694,7 +2694,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#51120697d051df163ec8aa313ee1916a68b07984" +source = "git+https://github.com/pingcap/kvproto.git#fdbd9fa2b8f402420c9f7bc8fe47b0e41412ad55" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 14661344316..e4b185b9509 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1022,6 +1022,39 @@ pub fn kv_pessimistic_lock( kv_pessimistic_lock_with_ttl(client, ctx, keys, ts, for_update_ts, return_values, 20) } +pub fn kv_pessimistic_lock_resumable( + client: &TikvClient, + ctx: Context, + keys: Vec>, + ts: u64, + for_update_ts: u64, + wait_timeout: Option, + return_values: bool, + check_existence: bool, +) -> PessimisticLockResponse { + let mut req = PessimisticLockRequest::default(); + req.set_context(ctx); + let primary = keys[0].clone(); + let mut mutations = vec![]; + for key in keys { + let mut mutation = Mutation::default(); + mutation.set_op(Op::PessimisticLock); + mutation.set_key(key); + mutations.push(mutation); + } + req.set_mutations(mutations.into()); + req.primary_lock = primary; + 
req.start_version = ts; + req.for_update_ts = for_update_ts; + req.lock_ttl = 20; + req.is_first_lock = false; + req.wait_timeout = wait_timeout.unwrap_or(-1); + req.set_wake_up_mode(PessimisticLockWakeUpMode::WakeUpModeForceLock); + req.return_values = return_values; + req.check_existence = check_existence; + client.kv_pessimistic_lock(&req).unwrap() +} + pub fn kv_pessimistic_lock_with_ttl( client: &TikvClient, ctx: Context, @@ -1057,12 +1090,18 @@ pub fn must_kv_pessimistic_lock(client: &TikvClient, ctx: Context, key: Vec, assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); } -pub fn must_kv_pessimistic_rollback(client: &TikvClient, ctx: Context, key: Vec, ts: u64) { +pub fn must_kv_pessimistic_rollback( + client: &TikvClient, + ctx: Context, + key: Vec, + ts: u64, + for_update_ts: u64, +) { let mut req = PessimisticRollbackRequest::default(); req.set_context(ctx); req.set_keys(vec![key].into_iter().collect()); req.start_version = ts; - req.for_update_ts = ts; + req.for_update_ts = for_update_ts; let resp = client.kv_pessimistic_rollback(&req).unwrap(); assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); @@ -1306,7 +1345,7 @@ impl PeerClient { } pub fn must_kv_pessimistic_rollback(&self, key: Vec, ts: u64) { - must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts) + must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts) } } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index fa2235b51e7..7a61a313eca 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -57,7 +57,7 @@ use crate::{ self, errors::{ extract_committed, extract_key_error, extract_key_errors, extract_kv_pairs, - extract_region_error, map_kv_pairs, + extract_region_error, extract_region_error_from_error, map_kv_pairs, }, kv::Engine, lock_manager::LockManager, @@ -1887,12 +1887,12 @@ fn future_raw_coprocessor( } macro_rules! 
txn_command_future { - ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) $prelude: stmt; ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { + ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) {$($prelude: stmt)*}; ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { fn $fn_name( storage: &Storage, $req: $req_ty, ) -> impl Future> { - $prelude + $($prelude)* let $tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( $req.get_context(), RequestType::Unknown, @@ -1939,22 +1939,42 @@ txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp } resp.set_errors(extract_key_errors(v.map(|v| v.locks)).into()); }}); -txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) {{ - match v { - Ok(Ok(res)) => { - let (values, not_founds) = res.into_legacy_values_and_not_founds(); - resp.set_values(values.into()); - resp.set_not_founds(not_founds); - }, - Err(e) | Ok(Err(e)) => { - resp.set_errors(vec![extract_key_error(&e)].into()) - }, - } - GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); - tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); - }); -}}); +txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, + (req) { + let mode = req.get_wake_up_mode() + }; + (v, resp, tracker) {{ + match v { + Ok(Ok(res)) => { + match mode { + PessimisticLockWakeUpMode::WakeUpModeForceLock => { + let (res, error) = res.into_pb(); + resp.set_results(res.into()); + if let Some(e) = error { + if let Some(region_error) = extract_region_error_from_error(&e.0) { + resp.set_region_error(region_error); + } else { + resp.set_errors(vec![extract_key_error(&e.0)].into()); + } + } + } + PessimisticLockWakeUpMode::WakeUpModeNormal => { + let (values, not_founds) = 
res.into_legacy_values_and_not_founds(); + resp.set_values(values.into()); + resp.set_not_founds(not_founds); + } + } + }, + Err(e) | Ok(Err(e)) => { + resp.set_errors(vec![extract_key_error(&e)].into()) + }, + } + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); + }} +); txn_command_future!(future_pessimistic_rollback, PessimisticRollbackRequest, PessimisticRollbackResponse, (v, resp) { resp.set_errors(extract_key_errors(v).into()) }); diff --git a/src/storage/errors.rs b/src/storage/errors.rs index b5498e807f0..2b41cf23ea2 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -238,45 +238,45 @@ pub fn get_tag_from_header(header: &errorpb::Error) -> &'static str { get_error_kind_from_header(header).get_str() } -pub fn extract_region_error(res: &Result) -> Option { - match *res { +pub fn extract_region_error_from_error(e: &Error) -> Option { + match e { // TODO: use `Error::cause` instead. - Err(Error(box ErrorInner::Kv(KvError(box KvErrorInner::Request(ref e))))) - | Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Engine(KvError( + Error(box ErrorInner::Kv(KvError(box KvErrorInner::Request(ref e)))) + | Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Engine(KvError( box KvErrorInner::Request(ref e), - )))))) - | Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + ))))) + | Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( box MvccErrorInner::Kv(KvError(box KvErrorInner::Request(ref e))), - )))))) => Some(e.to_owned()), - Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::MaxTimestampNotSynced { + ))))) => Some(e.to_owned()), + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::MaxTimestampNotSynced { .. 
- })))) => { + }))) => { let mut err = errorpb::Error::default(); err.set_max_timestamp_not_synced(Default::default()); Some(err) } - Err(Error(box ErrorInner::SchedTooBusy)) => { + Error(box ErrorInner::SchedTooBusy) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(SCHEDULER_IS_BUSY.to_owned()); err.set_server_is_busy(server_is_busy_err); Some(err) } - Err(Error(box ErrorInner::GcWorkerTooBusy)) => { + Error(box ErrorInner::GcWorkerTooBusy) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(GC_WORKER_IS_BUSY.to_owned()); err.set_server_is_busy(server_is_busy_err); Some(err) } - Err(Error(box ErrorInner::Closed)) => { + Error(box ErrorInner::Closed) => { // TiKV is closing, return an RegionError to tell the client that this region is // unavailable temporarily, the client should retry the request in other TiKVs. 
let mut err = errorpb::Error::default(); err.set_message("TiKV is Closing".to_string()); Some(err) } - Err(Error(box ErrorInner::DeadlineExceeded)) => { + Error(box ErrorInner::DeadlineExceeded) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); @@ -287,6 +287,13 @@ pub fn extract_region_error(res: &Result) -> Option { } } +pub fn extract_region_error(res: &Result) -> Option { + match res { + Ok(_) => None, + Err(e) => extract_region_error_from_error(e), + } +} + pub fn extract_committed(err: &Error) -> Option { match *err { Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( @@ -463,17 +470,23 @@ pub fn extract_key_errors(res: Result>>) -> Vec); +pub struct SharedError(pub Arc); + +impl SharedError { + pub fn inner(&self) -> &ErrorInner { + &self.0.0 + } +} impl From for SharedError { fn from(e: ErrorInner) -> Self { - Self(Arc::new(e)) + Self(Arc::new(Error::from(e))) } } impl From for SharedError { fn from(e: Error) -> Self { - Self(Arc::from(e.0)) + Self(Arc::new(e)) } } @@ -483,7 +496,7 @@ impl TryFrom for Error { type Error = (); fn try_from(e: SharedError) -> std::result::Result { - Arc::try_unwrap(e.0).map(Into::into).map_err(|_| ()) + Arc::try_unwrap(e.0).map_err(|_| ()) } } diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 1d53bdc38ea..b8aaa7f1927 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -23,7 +23,7 @@ use crate::storage::{ LockManager, LockWaitToken, }, types::PessimisticLockKeyResult, - Error as StorageError, ProcessResult, StorageCallback, + Error as StorageError, PessimisticLockResults, ProcessResult, StorageCallback, }; pub struct LockWaitContextInner { @@ -157,9 +157,19 @@ impl LockWaitContext { return; } - // The following code is only valid after implementing the new lock-waiting 
- // model. - unreachable!(); + let key_res = match result { + Ok(key_res) => { + assert!(!matches!(key_res, PessimisticLockKeyResult::Waiting)); + key_res + } + Err(e) => PessimisticLockKeyResult::Failed(e), + }; + + let mut res = PessimisticLockResults::with_capacity(1); + res.push(key_res); + let pr = ProcessResult::PessimisticLockRes { res: Ok(res) }; + + ctx_inner.cb.execute(pr); } } @@ -176,7 +186,7 @@ mod tests { lock_manager::{lock_waiting_queue::LockWaitEntry, MockLockManager}, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, txn::{Error as TxnError, ErrorInner as TxnErrorInner}, - types::{PessimisticLockParameters, PessimisticLockResults}, + types::PessimisticLockParameters, ErrorInner as StorageErrorInner, Result as StorageResult, }; diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 4069bab5643..90a2c369cca 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -841,11 +841,11 @@ mod tests { } fn expect_write_conflict( - err: &StorageErrorInner, + err: &StorageError, expect_conflict_start_ts: impl Into, expect_conflict_commit_ts: impl Into, ) { - match err { + match &*err.0 { StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( box MvccErrorInner::WriteConflict { conflict_start_ts, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 3e55d81f5d2..32d033e7497 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3266,6 +3266,17 @@ pub mod test_util { }) } + pub fn expect_value_with_checker_callback( + done: Sender, + id: i32, + check: impl FnOnce(T) + Send + 'static, + ) -> Callback { + Box::new(move |x: Result| { + check(x.unwrap()); + done.send(id).unwrap(); + }) + } + pub fn expect_pessimistic_lock_res_callback( done: Sender, pessimistic_lock_res: PessimisticLockResults, @@ -3330,6 +3341,40 @@ pub mod test_util { type PessimisticLockCommand = TypedCommand>; + impl PessimisticLockCommand { + 
pub fn allow_lock_with_conflict(mut self, v: bool) -> Self { + if let Command::AcquirePessimisticLock(commands::AcquirePessimisticLock { + allow_lock_with_conflict, + .. + }) = &mut self.cmd + { + *allow_lock_with_conflict = v; + } else { + panic!( + "expects AcquirePessimisticLock command, got: {:?}", + self.cmd + ); + } + self + } + + pub fn lock_wait_timeout(mut self, timeout: Option) -> Self { + if let Command::AcquirePessimisticLock(commands::AcquirePessimisticLock { + wait_timeout, + .. + }) = &mut self.cmd + { + *wait_timeout = timeout; + } else { + panic!( + "expects AcquirePessimisticLock command, got: {:?}", + self.cmd + ); + } + self + } + } + pub fn new_acquire_pessimistic_lock_command( keys: Vec<(Key, bool)>, start_ts: impl Into, @@ -3337,7 +3382,27 @@ pub mod test_util { return_values: bool, check_existence: bool, ) -> PessimisticLockCommand { - let primary = keys[0].0.clone().to_raw().unwrap(); + new_acquire_pessimistic_lock_command_with_pk( + keys, + None, + start_ts, + for_update_ts, + return_values, + check_existence, + ) + } + + pub fn new_acquire_pessimistic_lock_command_with_pk( + keys: Vec<(Key, bool)>, + pk: Option<&[u8]>, + start_ts: impl Into, + for_update_ts: impl Into, + return_values: bool, + check_existence: bool, + ) -> PessimisticLockCommand { + let primary = pk + .map(|k| k.to_vec()) + .unwrap_or_else(|| keys[0].0.clone().to_raw().unwrap()); let for_update_ts: TimeStamp = for_update_ts.into(); commands::AcquirePessimisticLock::new( keys, @@ -8074,6 +8139,567 @@ mod tests { test_pessimistic_lock_impl(true); } + fn must_have_locks( + storage: &Storage, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], + ) { + let locks = block_on(storage.scan_lock( + Context::default(), + ts.into(), + Some(Key::from_raw(start_key)), + Some(Key::from_raw(end_key)), + 100, + )) + .unwrap(); + assert_eq!( + locks.len(), + expected_locks.len(), + "lock count 
not match, expected: {:?}; got: {:?}", + expected_locks, + locks + ); + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } + } + + fn test_pessimistic_lock_resumable_impl( + pipelined_pessimistic_lock: bool, + in_memory_lock: bool, + ) { + type Res = PessimisticLockKeyResult; + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .pipelined_pessimistic_lock(pipelined_pessimistic_lock) + .in_memory_pessimistic_lock(in_memory_lock) + .build() + .unwrap(); + let (tx, rx) = channel(); + + let results_empty = + |len| PessimisticLockResults(vec![PessimisticLockKeyResult::Empty; len]); + + for case_num in 0..4 { + let key = |i| vec![b'k', case_num, i]; + // Put key "k1". + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(1)), b"v1".to_vec())], + key(1), + 10.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(1))], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Put key "k2". 
+ storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(2)), b"v2".to_vec())], + key(2), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(2))], + 30.into(), + 40.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock "k3", and we will pessimistic-rollback it. + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 20, + 20, + false, + false, + ), + expect_pessimistic_lock_res_callback(tx.clone(), results_empty(1)), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k4", and we will commit it + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(4)), b"v4".to_vec())], + key(4), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k5", and we will roll it back + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(5)), b"v5".to_vec())], + key(5), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k6", and it won't cause conflict after committing. 
+ storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(6)), b"v6".to_vec())], + key(6), + 10.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + } + + for &(case_num, return_values, check_existence) in &[ + (0, false, false), + (1, false, true), + (2, true, false), + (3, true, true), + ] { + let key = |i| vec![b'k', case_num, i]; + let expected_results = if return_values { + vec![ + Res::Value(Some(b"v1".to_vec())), + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Value(None), + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Value(Some(b"v6".to_vec())), + ] + } else if check_existence { + vec![ + Res::Existence(true), + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Existence(false), + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Existence(true), + ] + } else { + vec![ + Res::Empty, + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Empty, + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Empty, + ] + }; + + // k1 & k2 + for (i, k) in &[(0, key(1)), (1, key(2))] { + let i = *i; + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(k), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true), + expect_pessimistic_lock_res_callback( + tx.clone(), + 
PessimisticLockResults(vec![expected_results[i].clone()]), + ), + ) + .unwrap(); + rx.recv().unwrap(); + } + + // k3 + // Report KeyIsLocked if no wait + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(None), + expect_value_with_checker_callback( + tx.clone(), + 0, + |res: Result| { + let e = res.unwrap().0[0].unwrap_err(); + match e.inner() { + ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + mvcc::Error(box mvcc::ErrorInner::KeyIsLocked(..)), + ))) => (), + e => panic!("unexpected error chain: {:?}", e), + } + }, + ), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock wait + let (tx1, rx1) = channel(); + // k3 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[2].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(&key(3)), 20, 20); + rx1.recv().unwrap(); + + // k4 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(4)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[3].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(4))], + 30.into(), + 40.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + 
rx.recv().unwrap(); + rx1.recv().unwrap(); + + // k5 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(5)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[4].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Rollback::new( + vec![Key::from_raw(&key(5))], + 30.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + rx1.recv().unwrap(); + + // k6 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(6)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[5].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(6))], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + rx1.recv().unwrap(); + + must_have_locks( + &storage, + 50, + &key(0), + &key(10), + &[ + (&key(1), Op::PessimisticLock, 25, 25), + (&key(2), Op::PessimisticLock, 25, 40), + (&key(3), Op::PessimisticLock, 25, 25), + (&key(4), Op::PessimisticLock, 25, 40), + (&key(5), Op::PessimisticLock, 25, 30), + (&key(6), Op::PessimisticLock, 25, 25), + ], + ); + + // Test idempotency + for i in 0..6usize { + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(i as u8 + 1)), false)], + 25, + 25, + return_values, + check_existence, + ) + 
.allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[i].clone()]), + ), + ) + .unwrap(); + rx1.recv().unwrap(); + } + } + + // Check the channel is clear to avoid misusing in the above test code. + tx.send(100).unwrap(); + assert_eq!(rx.recv().unwrap(), 100); + + // Test request queueing. + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(b"k21"), false)], + 10, + 10, + false, + false, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback(tx, results_empty(1)), + ) + .unwrap(); + rx.recv().unwrap(); + + let channels: Vec<_> = (0..4).map(|_| channel()).collect(); + let start_ts = &[20, 50, 30, 40]; + for i in 0..4 { + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(b"k21"), false)], + start_ts[i], + start_ts[i], + false, + false, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback(channels[i].0.clone(), results_empty(1)), + ) + .unwrap(); + channels[i] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + } + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 10, 10); + channels[0].1.recv().unwrap(); + channels[2] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 20, 20); + channels[2].1.recv().unwrap(); + channels[3] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 30, 30); + channels[3].1.recv().unwrap(); + channels[1] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 40, 40); + channels[1].1.recv().unwrap(); + } + + #[test] + fn 
test_pessimistic_lock_resumable() { + for &pipelined_pessimistic_lock in &[false, true] { + for &in_memory_lock in &[false, true] { + test_pessimistic_lock_resumable_impl(pipelined_pessimistic_lock, in_memory_lock); + } + } + } + #[allow(clippy::large_enum_variant)] pub enum Msg { WaitFor { diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 2de3687d18d..c09ca934fa0 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -212,6 +212,11 @@ impl From for TypedCommand false, + PessimisticLockWakeUpMode::WakeUpModeForceLock => true, + }; + AcquirePessimisticLock::new( keys, req.take_primary_lock(), @@ -224,7 +229,7 @@ impl From for TypedCommand = SmallVec<[T; 4]>; + /// Task is a running command. pub(super) struct Task { pub(super) cid: u64, @@ -135,8 +143,9 @@ struct TaskContext { task: Option, lock: Lock, - cb: Option, + cb: Option, pr: Option, + woken_up_resumable_lock_requests: SVec>, // The one who sets `owned` from false to true is allowed to take // `cb` and `pr` safely. owned: AtomicBool, @@ -150,9 +159,11 @@ struct TaskContext { } impl TaskContext { - fn new(task: Task, cb: StorageCallback) -> TaskContext { + fn new(task: Task, cb: SchedulerTaskCallback, prepared_latches: Option) -> TaskContext { let tag = task.cmd.tag(); - let lock = task.cmd.gen_lock(); + let lock = prepared_latches.unwrap_or_else(|| task.cmd.gen_lock()); + // The initial locks should be either all acquired or all not acquired. + assert!(lock.owned_count == 0 || lock.owned_count == lock.required_hashes.len()); // Write command should acquire write lock. 
if !task.cmd.readonly() && !lock.is_write_lock() { panic!("write lock is expected for command {}", task.cmd); @@ -168,6 +179,7 @@ impl TaskContext { lock, cb: Some(cb), pr: None, + woken_up_resumable_lock_requests: smallvec![], owned: AtomicBool::new(false), write_bytes, tag, @@ -194,6 +206,42 @@ impl TaskContext { } } +pub enum SchedulerTaskCallback { + NormalRequestCallback(StorageCallback), + LockKeyCallbacks(Vec>), +} + +impl SchedulerTaskCallback { + fn execute(self, pr: ProcessResult) { + match self { + Self::NormalRequestCallback(cb) => cb.execute(pr), + Self::LockKeyCallbacks(cbs) => match pr { + ProcessResult::Failed { err } + | ProcessResult::PessimisticLockRes { res: Err(err) } => { + let err = SharedError::from(err); + for cb in cbs { + cb(Err(err.clone())); + } + } + ProcessResult::PessimisticLockRes { res: Ok(v) } => { + assert_eq!(v.0.len(), cbs.len()); + for (res, cb) in v.0.into_iter().zip(cbs) { + cb(Ok(res)) + } + } + _ => unreachable!(), + }, + } + } + + fn unwrap_normal_request_callback(self) -> StorageCallback { + match self { + Self::NormalRequestCallback(cb) => cb, + _ => panic!(""), + } + } +} + struct SchedulerInner { // slot_id -> { cid -> `TaskContext` } in the slot. task_slots: Vec>>>, @@ -260,8 +308,13 @@ impl SchedulerInner { self.task_slots[id_index(cid)].lock() } - fn new_task_context(&self, task: Task, callback: StorageCallback) -> TaskContext { - let tctx = TaskContext::new(task, callback); + fn new_task_context( + &self, + task: Task, + callback: SchedulerTaskCallback, + prepared_latches: Option, + ) -> TaskContext { + let tctx = TaskContext::new(task, callback, prepared_latches); let running_write_bytes = self .running_write_bytes .fetch_add(tctx.write_bytes, Ordering::AcqRel) as i64; @@ -287,13 +340,16 @@ impl SchedulerInner { /// If the task is been processing, it should be owned. /// If it has been finished, then it is not in the slot. /// In both cases, cb should be None. Otherwise, cb should be some. 
- fn try_own_and_take_cb(&self, cid: u64) -> Option { + fn try_own_and_take_cb(&self, cid: u64) -> Option { self.get_task_slot(cid) .get_mut(&cid) .and_then(|tctx| if tctx.try_own() { tctx.cb.take() } else { None }) } - fn take_task_cb_and_pr(&self, cid: u64) -> (Option, Option) { + fn take_task_cb_and_pr( + &self, + cid: u64, + ) -> (Option, Option) { self.get_task_slot(cid) .get_mut(&cid) .map(|tctx| (tctx.cb.take(), tctx.pr.take())) @@ -304,6 +360,20 @@ impl SchedulerInner { self.get_task_slot(cid).get_mut(&cid).unwrap().pr = Some(pr); } + fn store_lock_changes( + &self, + cid: u64, + woken_up_resumable_lock_requests: SVec>, + ) { + self.get_task_slot(cid) + .get_mut(&cid) + .map(move |tctx| { + assert!(tctx.woken_up_resumable_lock_requests.is_empty()); + tctx.woken_up_resumable_lock_requests = woken_up_resumable_lock_requests; + }) + .unwrap(); + } + fn too_busy(&self, region_id: u64) -> bool { fail_point!("txn_scheduler_busy", |_| true); self.running_write_bytes.load(Ordering::Acquire) >= self.sched_pending_write_threshold @@ -444,21 +514,41 @@ impl Scheduler { }); return; } - self.schedule_command(cmd, callback); + self.schedule_command( + None, + cmd, + SchedulerTaskCallback::NormalRequestCallback(callback), + None, + ); } /// Releases all the latches held by a command. 
- fn release_lock(&self, lock: &Lock, cid: u64) { - let wakeup_list = self.inner.latches.release(lock, cid, None); + fn release_latches( + &self, + lock: Lock, + cid: u64, + keep_latches_for_next_cmd: Option<(u64, &Lock)>, + ) { + let wakeup_list = self + .inner + .latches + .release(&lock, cid, keep_latches_for_next_cmd); for wcid in wakeup_list { self.try_to_wake_up(wcid); } } - fn schedule_command(&self, cmd: Command, callback: StorageCallback) { - let cid = self.inner.gen_id(); + fn schedule_command( + &self, + specified_cid: Option, + cmd: Command, + callback: SchedulerTaskCallback, + prepared_latches: Option, + ) { + let cid = specified_cid.unwrap_or_else(|| self.inner.gen_id()); let tracker = get_tls_tracker_token(); debug!("received new command"; "cid" => cid, "cmd" => ?cmd, "tracker" => ?tracker); + let tag = cmd.tag(); let priority_tag = get_priority_tag(cmd.priority()); SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); @@ -469,7 +559,7 @@ impl Scheduler { let mut task_slot = self.inner.get_task_slot(cid); let tctx = task_slot.entry(cid).or_insert_with(|| { self.inner - .new_task_context(Task::new(cid, tracker, cmd), callback) + .new_task_context(Task::new(cid, tracker, cmd), callback, prepared_latches) }); if self.inner.latches.acquire(&mut tctx.lock, cid) { @@ -567,6 +657,28 @@ impl Scheduler { } } + fn schedule_awakened_pessimistic_locks( + &self, + cid: u64, + mut awakened_entries: SVec>, + latches: Lock, + ) { + let key_callbacks: Vec<_> = awakened_entries + .iter_mut() + .map(|i| i.key_cb.take().unwrap().into_inner()) + .collect(); + + let cmd = commands::AcquirePessimisticLockResumed::from_lock_wait_entries(awakened_entries); + + // TODO: Make flow control take effect on this thing. 
+ self.schedule_command( + Some(cid), + cmd.into(), + SchedulerTaskCallback::LockKeyCallbacks(key_callbacks), + Some(latches), + ); + } + // pub for test pub fn get_sched_pool(&self, priority: CommandPri) -> &SchedPool { if priority == CommandPri::High { @@ -662,7 +774,10 @@ impl Scheduler { cb.execute(pr); } - self.release_lock(&tctx.lock, cid); + if !tctx.woken_up_resumable_lock_requests.is_empty() { + self.put_back_lock_wait_entries(tctx.woken_up_resumable_lock_requests); + } + self.release_latches(tctx.lock, cid, None); } /// Event handler for the success of read. @@ -676,12 +791,12 @@ impl Scheduler { let tctx = self.inner.dequeue_task_context(cid); if let ProcessResult::NextCommand { cmd } = pr { SCHED_STAGE_COUNTER_VEC.get(tag).next_cmd.inc(); - self.schedule_command(cmd, tctx.cb.unwrap()); + self.schedule_command(None, cmd, tctx.cb.unwrap(), None); } else { tctx.cb.unwrap().execute(pr); } - self.release_lock(&tctx.lock, cid); + self.release_latches(tctx.lock, cid, None); } /// Event handler for the success of write. @@ -715,19 +830,25 @@ impl Scheduler { drop(lock_guards); let tctx = self.inner.dequeue_task_context(cid); + let mut do_wake_up = !tctx.woken_up_resumable_lock_requests.is_empty(); // If pipelined pessimistic lock or async apply prewrite takes effect, it's not // guaranteed that the proposed or committed callback is surely invoked, which // takes and invokes `tctx.cb(tctx.pr)`. 
if let Some(cb) = tctx.cb { let pr = match result { Ok(()) => pr.or(tctx.pr).unwrap(), - Err(e) => ProcessResult::Failed { - err: StorageError::from(e), - }, + Err(e) => { + if !Self::is_undetermined_error(&e) { + do_wake_up = false; + } + ProcessResult::Failed { + err: StorageError::from(e), + } + } }; if let ProcessResult::NextCommand { cmd } = pr { SCHED_STAGE_COUNTER_VEC.get(tag).next_cmd.inc(); - self.schedule_command(cmd, cb); + self.schedule_command(None, cmd, cb, None); } else { cb.execute(pr); } @@ -735,7 +856,34 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - self.release_lock(&tctx.lock, cid); + // TODO: Update lock wait relationships after acquiring some locks. + + if do_wake_up { + let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; + let next_cid = self.inner.gen_id(); + let mut next_latches = + Self::gen_latches_for_lock_wait_entries(woken_up_resumable_lock_requests.iter()); + + self.release_latches(tctx.lock, cid, Some((next_cid, &next_latches))); + + next_latches.force_assume_acquired(); + self.schedule_awakened_pessimistic_locks( + next_cid, + woken_up_resumable_lock_requests, + next_latches, + ); + } else { + if !tctx.woken_up_resumable_lock_requests.is_empty() { + self.put_back_lock_wait_entries(tctx.woken_up_resumable_lock_requests); + } + self.release_latches(tctx.lock, cid, None); + } + } + + fn gen_latches_for_lock_wait_entries<'a>( + entries: impl IntoIterator>, + ) -> Lock { + Lock::new(entries.into_iter().map(|entry| &entry.key)) } /// Event handler for the request of waiting for lock @@ -793,18 +941,19 @@ impl Scheduler { ); } - fn on_release_locks(&self, released_locks: ReleasedLocks) { + fn on_release_locks(&self, released_locks: ReleasedLocks) -> SVec> { // This function is always called when holding the latch of the involved keys. // So if we found the lock waiting queues are empty, there's no chance // that other threads/commands adds new lock-wait entries to the keys // concurrently. 
Therefore it's safe to skip waking up when we found the // lock waiting queues are empty. if self.inner.lock_wait_queues.is_empty() { - return; + return smallvec![]; } - let mut legacy_wake_up_list = SmallVec::<[_; 4]>::new(); - let mut delay_wake_up_futures = SmallVec::<[_; 4]>::new(); + let mut legacy_wake_up_list = SVec::new(); + let mut delay_wake_up_futures = SVec::new(); + let mut resumable_wake_up_list = SVec::new(); let wake_up_delay_duration_ms = self .inner .pessimistic_lock_wake_up_delay_duration_ms @@ -822,19 +971,21 @@ impl Scheduler { None => return, }; - // TODO: Currently there are only legacy requests. When resumable requests are - // supported, do not put them to the `legacy_wake_up_list`. - legacy_wake_up_list.push((lock_wait_entry, released_lock)); + if lock_wait_entry.parameters.allow_lock_with_conflict { + resumable_wake_up_list.push(lock_wait_entry); + } else { + legacy_wake_up_list.push((lock_wait_entry, released_lock)); + } if let Some(f) = delay_wake_up_future { delay_wake_up_futures.push(f); } }); - if legacy_wake_up_list.is_empty() && delay_wake_up_futures.is_empty() { - return; + if !legacy_wake_up_list.is_empty() || !delay_wake_up_futures.is_empty() { + self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); } - self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + resumable_wake_up_list } fn wake_up_legacy_pessimistic_locks( @@ -880,9 +1031,15 @@ impl Scheduler { .unwrap(); } + fn is_undetermined_error(_e: &tikv_kv::Error) -> bool { + // TODO: If there's some cases that `engine.async_write` returns error but it's + // still possible that the data is successfully written, return true. 
+ false + } + fn early_response( cid: u64, - cb: StorageCallback, + cb: SchedulerTaskCallback, pr: ProcessResult, tag: CommandKind, stage: CommandStageKind, @@ -1076,24 +1233,50 @@ impl Scheduler { let mut pr = Some(pr); if !lock_info.is_empty() { - assert_eq!(lock_info.len(), 1); - let lock_info = lock_info.into_iter().next().unwrap(); - - // Only handle lock waiting if `wait_timeout` is set. Otherwise it indicates - // that it's a lock-no-wait request and we need to report error - // immediately. - if lock_info.parameters.wait_timeout.is_some() { - assert_eq!(to_be_write.size(), 0); - pr = Some(ProcessResult::Res); - // allow_lock_with_conflict is not supported yet in this version. - assert!(!lock_info.parameters.allow_lock_with_conflict); - - scheduler.on_wait_for_lock(&ctx, cid, lock_info, tracker); + if tag == CommandKind::acquire_pessimistic_lock { + assert_eq!(lock_info.len(), 1); + let lock_info = lock_info.into_iter().next().unwrap(); + + // Only handle lock waiting if `wait_timeout` is set. Otherwise it indicates + // that it's a lock-no-wait request and we need to report error + // immediately. + if lock_info.parameters.wait_timeout.is_some() { + assert_eq!(to_be_write.size(), 0); + pr = Some(ProcessResult::Res); + + scheduler.on_wait_for_lock(&ctx, cid, lock_info, tracker); + } else { + // For requests with `allow_lock_with_conflict`, key errors are set key-wise. + // TODO: It's better to return this error from + // `commands::AcquirePessimisticLocks::process_write`. + if lock_info.parameters.allow_lock_with_conflict { + pr = Some(ProcessResult::PessimisticLockRes { + res: Err(StorageError::from(Error::from(MvccError::from( + MvccErrorInner::KeyIsLocked(lock_info.lock_info_pb), + )))), + }); + } + } + } else if tag == CommandKind::acquire_pessimistic_lock_resumed { + // Some requests meets lock again after waiting and resuming. 
+ scheduler.on_wait_for_lock_after_resuming(cid, pr.as_mut().unwrap(), lock_info); + } else { + // WriteResult returning lock info is only expected to exist for pessimistic + // lock requests. + unreachable!(); } } - if !released_locks.is_empty() { - scheduler.on_release_locks(released_locks); + let woken_up_resumable_entries = if !released_locks.is_empty() { + scheduler.on_release_locks(released_locks) + } else { + smallvec![] + }; + + if !woken_up_resumable_entries.is_empty() { + scheduler + .inner + .store_lock_changes(cid, woken_up_resumable_entries); } if to_be_write.modifies.is_empty() { @@ -1101,7 +1284,8 @@ impl Scheduler { return; } - if tag == CommandKind::acquire_pessimistic_lock + if (tag == CommandKind::acquire_pessimistic_lock + || tag == CommandKind::acquire_pessimistic_lock_resumed) && pessimistic_lock_mode == PessimisticLockMode::InMemory && self.try_write_in_memory_pessimistic_locks( txn_ext.as_deref(), @@ -1418,11 +1602,11 @@ impl Scheduler { lock_info.key.clone(), self.inner.lock_wait_queues.clone(), lock_wait_token, - cb, + cb.unwrap_normal_request_callback(), lock_info.parameters.allow_lock_with_conflict, ); let first_batch_cb = ctx.get_callback_for_first_write_batch(); - task_ctx.cb = Some(first_batch_cb); + task_ctx.cb = Some(SchedulerTaskCallback::NormalRequestCallback(first_batch_cb)); drop(slot); let lock_wait_entry = Box::new(LockWaitEntry { @@ -1437,6 +1621,95 @@ impl Scheduler { (ctx, lock_wait_entry, lock_info.lock_info_pb) } + + fn make_lock_waiting_after_resuming( + &self, + lock_info: WriteResultLockInfo, + cb: PessimisticLockKeyCallback, + ) -> Box { + Box::new(LockWaitEntry { + key: lock_info.key, + lock_hash: lock_info.lock_digest.hash, + parameters: lock_info.parameters, + should_not_exist: lock_info.should_not_exist, + lock_wait_token: lock_info.lock_wait_token, + legacy_wake_up_index: None, + key_cb: Some(cb.into()), + }) + } + + fn on_wait_for_lock_after_resuming( + &self, + cid: u64, + pr: &mut ProcessResult, + lock_info: 
Vec, + ) { + if lock_info.is_empty() { + return; + } + + // TODO: Update lock wait relationship. + + let results = match pr { + ProcessResult::PessimisticLockRes { + res: Ok(PessimisticLockResults(res)), + } => res, + _ => unreachable!(), + }; + + let mut slot = self.inner.get_task_slot(cid); + let task_ctx = slot.get_mut(&cid).unwrap(); + let cbs = match task_ctx.cb { + Some(SchedulerTaskCallback::LockKeyCallbacks(ref mut v)) => v, + _ => unreachable!(), + }; + assert_eq!(results.len(), cbs.len()); + + let finished_len = results.len() - lock_info.len(); + + let original_results = std::mem::replace(results, Vec::with_capacity(finished_len)); + let original_cbs = std::mem::replace(cbs, Vec::with_capacity(finished_len)); + let mut lock_wait_entries = SmallVec::<[_; 10]>::with_capacity(lock_info.len()); + let mut lock_info_it = lock_info.into_iter(); + + for (result, cb) in original_results.into_iter().zip(original_cbs) { + if let PessimisticLockKeyResult::Waiting = &result { + let lock_info = lock_info_it.next().unwrap(); + let lock_info_pb = lock_info.lock_info_pb.clone(); + let entry = self.make_lock_waiting_after_resuming(lock_info, cb); + lock_wait_entries.push((entry, lock_info_pb)); + } else { + results.push(result); + cbs.push(cb); + } + } + + assert!(lock_info_it.next().is_none()); + assert_eq!(results.len(), cbs.len()); + + // Release the mutex in the latch slot. + drop(slot); + + // Add to the lock waiting queue. + // TODO: the request may be canceled from lock manager at this time. If so, it + // should not be added to the queue. + for (entry, lock_info_pb) in lock_wait_entries { + self.inner + .lock_wait_queues + .push_lock_wait(entry, lock_info_pb); + } + } + + fn put_back_lock_wait_entries(&self, entries: impl IntoIterator>) { + for entry in entries.into_iter() { + // TODO: Do not pass `default` as the lock info. 
Here we need another method + // `put_back_lock_wait`, which doesn't require updating lock info and + // additionally checks if the lock wait entry is already canceled. + self.inner + .lock_wait_queues + .push_lock_wait(entry, Default::default()); + } + } } pub async fn get_raw_ext( @@ -1717,7 +1990,7 @@ mod tests { block_on(f).unwrap(), Err(StorageError(box StorageErrorInner::DeadlineExceeded)) )); - scheduler.release_lock(&lock, cid); + scheduler.release_latches(lock, cid, None); // A new request should not be blocked. let mut req = BatchRollbackRequest::default(); @@ -1952,7 +2225,7 @@ mod tests { // When releasing the lock, the queuing tasks should be all waken up without // stack overflow. - scheduler.release_lock(&lock, cid); + scheduler.release_latches(lock, cid, None); // A new request should not be blocked. let mut req = BatchRollbackRequest::default(); diff --git a/src/storage/types.rs b/src/storage/types.rs index 63bab09eb5c..c7da00c9ace 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -299,6 +299,48 @@ impl PessimisticLockResults { self.0.push(key_res); } + pub fn into_pb(self) -> (Vec, Option) { + let mut error = None; + let res = self + .0 + .into_iter() + .map(|res| { + let mut res_pb = kvrpcpb::PessimisticLockKeyResult::default(); + match res { + PessimisticLockKeyResult::Empty => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal) + } + PessimisticLockKeyResult::Value(v) => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal); + res_pb.set_existence(v.is_some()); + res_pb.set_value(v.unwrap_or_default()); + } + PessimisticLockKeyResult::Existence(e) => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal); + res_pb.set_existence(e); + } + PessimisticLockKeyResult::LockedWithConflict { value, conflict_ts } => { + res_pb.set_type( + kvrpcpb::PessimisticLockKeyResultType::LockResultLockedWithConflict, + ); + res_pb.set_existence(value.is_some()); + 
res_pb.set_value(value.unwrap_or_default()); + res_pb.set_locked_with_conflict_ts(conflict_ts.into_inner()); + } + PessimisticLockKeyResult::Waiting => unreachable!(), + PessimisticLockKeyResult::Failed(e) => { + if error.is_none() { + error = Some(e) + } + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultFailed); + } + } + res_pb + }) + .collect(); + (res, error) + } + pub fn into_legacy_values_and_not_founds(self) -> (Vec, Vec) { if self.0.is_empty() { return (vec![], vec![]); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index f6db3386007..effe9698f30 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1198,7 +1198,229 @@ fn test_pessimistic_lock() { assert_eq!(resp.get_values().to_vec(), vec![v.clone(), vec![]]); assert_eq!(resp.get_not_founds().to_vec(), vec![false, true]); } - must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), 40); + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), 40, 40); + } +} + +#[test] +fn test_pessimistic_lock_resumable() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + + // Resumable pessimistic lock request with multi-key is not supported yet. 
+ let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![b"k1".to_vec(), b"k2".to_vec()], + 1, + 1, + None, + false, + false, + ); + assert_eq!(resp.get_results(), &[]); + assert_ne!(resp.get_errors().len(), 0); + + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + + // Prewrite + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite(&client, ctx.clone(), vec![mutation.clone()], k.clone(), 5); + + // No wait + let start_time = Instant::now(); + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![k.clone()], + 8, + 8, + None, + false, + false, + ); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(start_time.elapsed() < Duration::from_millis(200)); + assert_eq!(resp.errors.len(), 1); + assert!(resp.errors[0].has_locked()); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultFailed + ); + + // Wait Timeout + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![k.clone()], + 8, + 8, + Some(1), + false, + false, + ); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 1); + assert!(resp.errors[0].has_locked()); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultFailed + ); + + must_kv_commit(&client, ctx.clone(), vec![k.clone()], 5, 9, 9); + + let mut curr_ts = 10; + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let prewrite_start_ts = curr_ts; + let commit_ts = curr_ts + 5; + let test_lock_ts = curr_ts + 10; + curr_ts += 20; + + // Prewrite + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation.clone()], + k.clone(), + prewrite_start_ts, + ); + + let (tx, rx) = std::sync::mpsc::channel(); + 
let handle = { + let client = client.clone(); + let k = k.clone(); + let ctx = ctx.clone(); + thread::spawn(move || { + let res = kv_pessimistic_lock_resumable( + &client, + ctx, + vec![k], + test_lock_ts, + test_lock_ts, + Some(1000), + return_values, + check_existence, + ); + tx.send(()).unwrap(); + res + }) + }; + // Blocked for lock waiting. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + must_kv_commit( + &client, + ctx.clone(), + vec![k.clone()], + prewrite_start_ts, + commit_ts, + commit_ts, + ); + rx.recv_timeout(Duration::from_millis(1000)).unwrap(); + let resp = handle.join().unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 0); + assert_eq!(resp.get_results().len(), 1); + let res = &resp.get_results()[0]; + if return_values { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b"value"); + assert_eq!(res.get_existence(), true); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } else if check_existence { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b""); + assert_eq!(res.get_existence(), true); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } else { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b""); + assert_eq!(res.get_existence(), false); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } + + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), test_lock_ts, test_lock_ts); + } + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let test_lock_ts = curr_ts; + let prewrite_start_ts = curr_ts + 10; + let commit_ts = curr_ts + 11; + curr_ts += 20; + // Prewrite + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation.clone()], + k.clone(), + prewrite_start_ts, + ); + + let (tx, rx) = 
std::sync::mpsc::channel(); + let handle = { + let client = client.clone(); + let k = k.clone(); + let ctx = ctx.clone(); + thread::spawn(move || { + let res = kv_pessimistic_lock_resumable( + &client, + ctx, + vec![k], + test_lock_ts, + test_lock_ts, + Some(1000), + return_values, + check_existence, + ); + tx.send(()).unwrap(); + res + }) + }; + // Blocked for lock waiting. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + must_kv_commit( + &client, + ctx.clone(), + vec![k.clone()], + prewrite_start_ts, + commit_ts, + commit_ts, + ); + rx.recv_timeout(Duration::from_millis(1000)).unwrap(); + let resp = handle.join().unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 0); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultLockedWithConflict + ); + assert_eq!(resp.get_results()[0].get_value(), v); + assert_eq!(resp.get_results()[0].get_existence(), true); + assert_eq!( + resp.get_results()[0].get_locked_with_conflict_ts(), + commit_ts + ); + + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), test_lock_ts, commit_ts); } } @@ -1816,7 +2038,7 @@ fn test_get_lock_wait_info_api() { entries[0].resource_group_tag, b"resource_group_tag2".to_vec() ); - must_kv_pessimistic_rollback(&client, ctx, b"a".to_vec(), 20); + must_kv_pessimistic_rollback(&client, ctx, b"a".to_vec(), 20, 20); handle.join().unwrap(); } diff --git a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index d796d9c1f66..43032dd8cc3 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -42,8 +42,9 @@ fn deadlock(client: &TikvClient, ctx: Context, key1: &[u8], ts: u64) -> bool { handle.join().unwrap(); // Clean up - must_kv_pessimistic_rollback(client, ctx.clone(), key1.clone(), ts); - must_kv_pessimistic_rollback(client, ctx, key2.clone(), ts + 1); + + 
must_kv_pessimistic_rollback(client, ctx.clone(), key1.clone(), ts, ts); + must_kv_pessimistic_rollback(client, ctx, key2.clone(), ts + 1, ts + 1); assert_eq!(resp.errors.len(), 1); if resp.errors[0].has_deadlock() { From 31ca8b9bc127b820dfabc6ee259514be819a3eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 29 Nov 2022 13:09:59 +0800 Subject: [PATCH 365/676] tikv_utils: make retry returns a `Send` future even the result type is `!Sync` (#13753) close tikv/tikv#13811 This PR have modified the `retry_ext` function in the `tikv_utils::stream` package, making the `final_result` not live across await points any more. So even the result type is `!Sync`, `retry` returns a `Send` future now. Signed-off-by: hillium --- components/tikv_util/src/stream.rs | 86 ++++++++++++++++++++++++------ 1 file changed, 71 insertions(+), 15 deletions(-) diff --git a/components/tikv_util/src/stream.rs b/components/tikv_util/src/stream.rs index 8f892659f68..fb29d1c91f0 100644 --- a/components/tikv_util/src/stream.rs +++ b/components/tikv_util/src/stream.rs @@ -152,24 +152,28 @@ where })(); let mut retry_wait_dur = Duration::from_secs(1); - - let mut final_result = action().await; - for _ in 1..max_retry_times { - if let Err(e) = &final_result { - if let Some(ref mut f) = ext.on_failure { - f(e); - } - if e.is_retryable() { - let backoff = thread_rng().gen_range(0..1000); - sleep(retry_wait_dur + Duration::from_millis(backoff)).await; - retry_wait_dur = MAX_RETRY_DELAY.min(retry_wait_dur * 2); - final_result = action().await; - continue; + let mut retry_time = 0; + loop { + match action().await { + Ok(r) => return Ok(r), + Err(e) => { + if let Some(ref mut f) = ext.on_failure { + f(&e); + } + if !e.is_retryable() { + return Err(e); + } + retry_time += 1; + if retry_time > max_retry_times { + return Err(e); + } } } - break; + + let backoff = thread_rng().gen_range(0..1000); + sleep(retry_wait_dur + 
Duration::from_millis(backoff)).await; + retry_wait_dur = MAX_RETRY_DELAY.min(retry_wait_dur * 2); } - final_result } // Return an error if the future does not finish by the timeout @@ -206,3 +210,55 @@ impl RetryError for HttpDispatchError { true } } + +#[cfg(test)] +mod tests { + use std::{cell::RefCell, pin::Pin}; + + use futures::{Future, FutureExt}; + use rusoto_core::HttpDispatchError; + + use super::RetryError; + use crate::stream::retry; + + #[derive(Debug)] + struct TriviallyRetry; + + impl RetryError for TriviallyRetry { + fn is_retryable(&self) -> bool { + true + } + } + + fn assert_send(_t: T) {} + + #[test] + fn test_retry_is_send_even_return_type_not_sync() { + struct BangSync(Option>); + let fut = retry(|| futures::future::ok::<_, HttpDispatchError>(BangSync(None))); + assert_send(fut) + } + + fn gen_action_fail_for( + n_times: usize, + ) -> impl FnMut() -> Pin>>> { + let mut n = 0; + move || { + if n < n_times { + n += 1; + futures::future::err(TriviallyRetry).boxed() + } else { + futures::future::ok(()).boxed() + } + } + } + + #[tokio::test] + async fn test_failure() { + fail::cfg("retry_count", "return(2)").unwrap(); + let r = retry(gen_action_fail_for(3)).await; + assert!(r.is_err(), "{:?}", r); + let r = retry(gen_action_fail_for(1)).await; + assert!(r.is_ok(), "{:?}", r); + } +} From 2f4374ee4e15b2c8054c06e5a50bdaeeea475472 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Tue, 29 Nov 2022 13:51:59 +0800 Subject: [PATCH 366/676] filter out invalid k-v events when applying kv-file on PiTR (#13852) close tikv/tikv#13853, ref pingcap/tidb#39398 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- src/import/sst_service.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 61d181b5c2f..a0d2ab5f4ee 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1017,8 +1017,13 @@ where { // use callback to collect kv 
data. Box::new(move |k: Vec, v: Vec| { - let mut req = Request::default(); + // Need to skip the empty key/value that could break the transaction or cause + // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. + if k.is_empty() || v.is_empty() { + return; + } + let mut req = Request::default(); if is_delete { let mut del = DeleteRequest::default(); del.set_key(k); @@ -1201,6 +1206,7 @@ mod test { write(b"bar", Put, 38, 37), write(b"baz", Put, 34, 31), write(b"bar", Put, 28, 17), + (Vec::default(), Vec::default()), ], expected_reqs: vec![ write_req(b"foo", Put, 40, 39), @@ -1235,6 +1241,7 @@ mod test { ), default(b"beyond", b"Calling your name.", 278), default(b"beyond", b"Calling your name.", 278), + default(b"PingCap", b"", 300), ], expected_reqs: vec![ default_req(b"aria", b"The planet where flowers bloom.", 123), From f8a397657072539501ca636e2f2dd4f3a85693f4 Mon Sep 17 00:00:00 2001 From: xiongjiwei Date: Tue, 29 Nov 2022 18:32:00 +0800 Subject: [PATCH 367/676] cdc: filter out the event cause by cdc write (#13796) ref tikv/tikv#13779 filter out the event cause by cdc write Signed-off-by: xiongjiwei Co-authored-by: Ti Chi Robot --- components/cdc/src/delegate.rs | 80 ++++++++++- components/cdc/src/endpoint.rs | 20 +++ components/cdc/src/initializer.rs | 78 ++++++++++- components/cdc/src/service.rs | 10 +- components/cdc/tests/integrations/test_cdc.rs | 126 ++++++++++++++++++ components/cdc/tests/mod.rs | 30 ++++- 6 files changed, 332 insertions(+), 12 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index de38a7b1fc8..120806588dc 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -129,6 +129,7 @@ pub struct Downstream { sink: Option, state: Arc>, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, } impl Downstream { @@ -142,6 +143,7 @@ impl Downstream { req_id: u64, conn_id: ConnId, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, ) -> Downstream { 
Downstream { id: DownstreamId::new(), @@ -152,6 +154,7 @@ impl Downstream { sink: None, state: Arc::new(AtomicCell::new(DownstreamState::default())), kv_api, + filter_loop, } } @@ -203,6 +206,10 @@ impl Downstream { self.id } + pub fn get_filter_loop(&self) -> bool { + self.filter_loop + } + pub fn get_state(&self) -> Arc> { self.state.clone() } @@ -471,6 +478,7 @@ impl Delegate { region_id: u64, request_id: u64, entries: Vec>, + filter_loop: bool, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; @@ -527,6 +535,10 @@ impl Delegate { row_size = 0; } } + // if the `txn_source` is not 0 and we should filter it out, skip this event. + if row.txn_source != 0 && filter_loop { + continue; + } if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { rows.push(Vec::with_capacity(entries_len)); current_rows_size = 0; @@ -620,6 +632,48 @@ impl Delegate { if entries.is_empty() { return Ok(()); } + + let downstreams = self.downstreams(); + assert!( + !downstreams.is_empty(), + "region {} miss downstream", + self.region_id + ); + + let mut need_filter = false; + for ds in downstreams { + if ds.filter_loop { + need_filter = true; + break; + } + } + + // collect the change event cause by user write, which is `txn_source` = 0. + // for changefeed which only need the user write, send the `filtered`, or else, + // send them all. 
+ let filtered = if need_filter { + let filtered = entries + .iter() + .filter(|x| x.txn_source == 0) + .cloned() + .collect::>(); + if filtered.is_empty() { + None + } else { + Some(Event { + region_id: self.region_id, + index, + event: Some(Event_oneof_event::Entries(EventEntries { + entries: filtered.into(), + ..Default::default() + })), + ..Default::default() + }) + } + } else { + None + }; + let event_entries = EventEntries { entries: entries.into(), ..Default::default() @@ -630,6 +684,7 @@ impl Delegate { event: Some(Event_oneof_event::Entries(event_entries)), ..Default::default() }; + let send = move |downstream: &Downstream| { // No ready downstream or a downstream that does not match the kv_api type, will // be ignored. There will be one region that contains both Txn & Raw entries. @@ -637,7 +692,15 @@ impl Delegate { if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { return Ok(()); } - let event = change_data_event.clone(); + if downstream.filter_loop && filtered.is_none() { + return Ok(()); + } + + let event = if downstream.filter_loop { + filtered.clone().unwrap() + } else { + change_data_event.clone() + }; // Do not force send for real time change data events. let force_send = false; downstream.sink_event(event, force_send) @@ -918,6 +981,7 @@ fn decode_write( } }; let commit_ts = if write.write_type == WriteType::Rollback { + assert_eq!(write.txn_source, 0); 0 } else { key.decode_ts().unwrap().into_inner() @@ -926,6 +990,8 @@ fn decode_write( row.commit_ts = commit_ts; row.key = key.truncate_ts().unwrap().into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. 
+ row.txn_source = write.txn_source; set_event_row_type(row, r_type); if let Some(value) = write.short_value { row.value = value; @@ -952,6 +1018,8 @@ fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow, has_value: &mut boo row.start_ts = lock.ts.into_inner(); row.key = key.into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. + row.txn_source = lock.txn_source; set_event_row_type(row, EventLogType::Prewrite); if let Some(value) = lock.short_value { row.value = value; @@ -1021,6 +1089,7 @@ mod tests { request_id, ConnId::new(), ChangeDataRequestKvApi::TiDb, + false, ); downstream.set_sink(sink); let mut delegate = Delegate::new(region_id, Default::default()); @@ -1138,7 +1207,14 @@ mod tests { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(region_version); epoch.set_version(region_version); - Downstream::new(peer, epoch, id, ConnId::new(), ChangeDataRequestKvApi::TiDb) + Downstream::new( + peer, + epoch, + id, + ConnId::new(), + ChangeDataRequestKvApi::TiDb, + false, + ) }; // Create a new delegate. diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 4b6bbad6d35..6d64754d042 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -620,6 +620,7 @@ impl, E: KvEngine> Endpoint { let api_version = self.api_version; let downstream_id = downstream.get_id(); let downstream_state = downstream.get_state(); + let filter_loop = downstream.get_filter_loop(); // Register must follow OpenConn, so the connection must be available. 
let conn = self.connections.get_mut(&conn_id).unwrap(); @@ -746,6 +747,7 @@ impl, E: KvEngine> Endpoint { build_resolver: is_new_delegate, ts_filter_ratio: self.config.incremental_scan_ts_filter_ratio, kv_api, + filter_loop, }; let raft_router = self.raft_router.clone(); @@ -1423,6 +1425,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::RawKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::RawKv); suite.run(Task::Register { @@ -1458,6 +1461,7 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TxnKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1494,6 +1498,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TxnKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1672,6 +1677,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1718,6 +1724,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); // Enable batch resolved ts in the test. let version = FeatureGate::batch_resolved_ts(); @@ -1740,6 +1747,7 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), @@ -1776,6 +1784,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1820,6 +1829,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.add_local_reader(100); suite.run(Task::Register { @@ -1851,6 +1861,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1926,6 +1937,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); // Enable batch resolved ts in the test. 
@@ -1962,6 +1974,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(2, 100); @@ -2007,6 +2020,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(3, 100); @@ -2077,6 +2091,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); let downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2119,6 +2134,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); let new_downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2170,6 +2186,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -2224,6 +2241,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.run(Task::Register { @@ -2341,6 +2359,7 @@ mod tests { 0, conn_id_a, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), @@ -2364,6 +2383,7 @@ mod tests { 0, conn_id_b, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 36c1636a7e8..38c8603900e 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -96,6 +96,8 @@ pub(crate) struct Initializer { pub(crate) ts_filter_ratio: f64, pub(crate) kv_api: ChangeDataRequestKvApi, + + pub(crate) filter_loop: bool, } impl Initializer { @@ -425,8 +427,12 @@ impl Initializer { async fn sink_scan_events(&mut self, entries: Vec>, done: bool) -> Result<()> { let mut barrier = None; - let mut events = - Delegate::convert_to_grpc_events(self.region_id, self.request_id, entries)?; + let mut events = Delegate::convert_to_grpc_events( + self.region_id, + self.request_id, + entries, + self.filter_loop, + )?; if done { let 
(cb, fut) = tikv_util::future::paired_future_callback(); events.push(CdcEvent::Barrier(Some(cb))); @@ -558,13 +564,17 @@ mod tests { use engine_rocks::RocksEngine; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; - use kvproto::{cdcpb::Event_oneof_event, errorpb::Error as ErrorHeader}; + use kvproto::{ + cdcpb::{EventLogType, Event_oneof_event}, + errorpb::Error as ErrorHeader, + }; use raftstore::{coprocessor::ObserveHandle, store::RegionSnapshot}; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, txn::tests::{ must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, must_prewrite_put, + must_prewrite_put_with_txn_soucre, }, TestEngineBuilder, }; @@ -601,6 +611,7 @@ mod tests { buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, ) -> ( LazyWorker, Runtime, @@ -645,6 +656,7 @@ mod tests { build_resolver: true, ts_filter_ratio: 1.0, // always enable it. kv_api, + filter_loop, }; (receiver_worker, pool, initializer, rx, drain) @@ -686,6 +698,7 @@ mod tests { buffer, engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); let check_result = || loop { let task = rx.recv().unwrap(); @@ -754,6 +767,53 @@ mod tests { worker.stop(); } + #[test] + fn test_initializer_filter_loop() { + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); + + let mut total_bytes = 0; + + for i in 10..100 { + let (k, v) = (&[b'k', i], &[b'v', i]); + total_bytes += k.len(); + total_bytes += v.len(); + let ts = TimeStamp::new(i as _); + must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, 1); + } + + let snap = engine.snapshot(Default::default()).unwrap(); + // Buffer must be large enough to unblock async incremental scan. 
+ let buffer = 1000; + let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, + buffer, + engine.kv_engine(), + ChangeDataRequestKvApi::TiDb, + true, + ); + let th = pool.spawn(async move { + initializer + .async_incremental_scan(snap, Region::default()) + .await + .unwrap(); + }); + let mut drain = drain.drain(); + while let Some((event, _)) = block_on(drain.next()) { + let event = match event { + CdcEvent::Event(x) if x.event.is_some() => x.event.unwrap(), + _ => continue, + }; + let entries = match event { + Event_oneof_event::Entries(mut x) => x.take_entries().into_vec(), + _ => continue, + }; + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].get_type(), EventLogType::Initialized); + } + block_on(th).unwrap(); + worker.stop(); + } + // Test `hint_min_ts` works fine with `ExtraOp::ReadOldValue`. // Whether `DeltaScanner` emits correct old values or not is already tested by // another case `test_old_value_with_hint_min_ts`, so here we only care about @@ -782,6 +842,7 @@ mod tests { 1000, engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); initializer.checkpoint_ts = checkpoint_ts.into(); let mut drain = drain.drain(); @@ -840,8 +901,13 @@ mod tests { fn test_initializer_deregister_downstream() { let total_bytes = 1; let buffer = 1; - let (mut worker, _pool, mut initializer, rx, _drain) = - mock_initializer(total_bytes, buffer, None, ChangeDataRequestKvApi::TiDb); + let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + total_bytes, + buffer, + None, + ChangeDataRequestKvApi::TiDb, + false, + ); // Errors reported by region should deregister region. 
initializer.build_resolver = false; @@ -891,7 +957,7 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, pool, mut initializer, _rx, _drain) = - mock_initializer(total_bytes, buffer, None, kv_api); + mock_initializer(total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = MockRaftStoreRouter::new(); diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index e7bec568f67..f9665283c45 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -240,8 +240,14 @@ impl ChangeData for Service { semver::Version::new(0, 0, 0) } }; - let downstream = - Downstream::new(peer.clone(), region_epoch, req_id, conn_id, req_kvapi); + let downstream = Downstream::new( + peer.clone(), + region_epoch, + req_id, + conn_id, + req_kvapi, + request.filter_loop, + ); let ret = scheduler .schedule(Task::Register { request, diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 3be68c5905c..f2f09622a52 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -2359,3 +2359,129 @@ fn test_prewrite_without_value() { let event = receive_event(false); assert_eq!(event.get_events()[0].get_entries().entries[0].commit_ts, 14); } + +#[test] +fn test_filter_loop() { + test_kv_format_impl!(test_filter_loop_impl); +} + +fn test_filter_loop_impl() { + let mut suite = TestSuite::new(1, F::TAG); + let mut req = suite.new_changedata_request(1); + req.set_extra_op(ExtraOp::ReadOldValue); + req.set_filter_loop(true); + let (mut req_tx, event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let row = &es.take_entries().to_vec()[0]; + 
assert_eq!(row.get_type(), EventLogType::Initialized); + } + other => panic!("unknown event {:?}", other), + } + + // Insert value, simulate INSERT INTO. + let mut m1 = Mutation::default(); + let k1 = b"xk1".to_vec(); + m1.set_op(Op::Insert); + m1.key = k1.clone(); + m1.value = b"v1".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m1], k1.clone(), 10.into(), 1); + let mut m2 = Mutation::default(); + let k2 = b"xk2".to_vec(); + m2.set_op(Op::Insert); + m2.key = k2.clone(); + m2.value = b"v2".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m2], k2.clone(), 12.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_value(), b"v2"); + assert_eq!(row.get_old_value(), b""); + assert_eq!(row.get_type(), EventLogType::Prewrite); + assert_eq!(row.get_start_ts(), 12); + } + other => panic!("unknown event {:?}", other), + } + suite.must_kv_commit_with_source(1, vec![k1], 10.into(), 15.into(), 1); + suite.must_kv_commit_with_source(1, vec![k2], 12.into(), 17.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), EventLogType::Commit); + assert_eq!(row.get_commit_ts(), 17); + } + other => panic!("unknown event {:?}", other), + } + + // Rollback + let mut m3 = Mutation::default(); + let k3 = b"xk3".to_vec(); + m3.set_op(Op::Put); + m3.key = k3.clone(); + m3.value = b"v3".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m3], k3.clone(), 30.into(), 1); + suite.must_kv_rollback(1, vec![k3], 30.into()); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + 
Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), EventLogType::Rollback); + assert_eq!(row.get_commit_ts(), 0); + } + other => panic!("unknown event {:?}", other), + } + + // Update value + let k1 = b"xk1".to_vec(); + let mut m4 = Mutation::default(); + m4.set_op(Op::Put); + m4.key = k1.clone(); + m4.value = vec![b'3'; 5120]; + suite.must_kv_prewrite_with_source(1, vec![m4], k1.clone(), 40.into(), 1); + suite.must_kv_commit_with_source(1, vec![k1], 40.into(), 42.into(), 1); + let k2 = b"xk2".to_vec(); + let mut m5 = Mutation::default(); + m5.set_op(Op::Put); + m5.key = k2.clone(); + m5.value = vec![b'4'; 5121]; + suite.must_kv_prewrite(1, vec![m5], k2.clone(), 44.into()); + suite.must_kv_commit(1, vec![k2.clone()], 44.into(), 46.into()); + let mut events = receive_event(false).events.to_vec(); + if events.len() == 1 { + events.extend(receive_event(false).events.into_iter()); + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Prewrite); + assert_eq!(events[0].get_start_ts(), 44); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Commit); + assert_eq!(events[0].get_commit_ts(), 46); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + + event_feed_wrap.replace(None); + suite.stop(); +} diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 9e6621ffbdf..feb994f8bb1 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -269,9 +269,22 @@ 
impl TestSuite { muts: Vec, pk: Vec, ts: TimeStamp, + ) { + self.must_kv_prewrite_with_source(region_id, muts, pk, ts, 0); + } + + pub fn must_kv_prewrite_with_source( + &mut self, + region_id: u64, + muts: Vec, + pk: Vec, + ts: TimeStamp, + txn_source: u64, ) { let mut prewrite_req = PrewriteRequest::default(); - prewrite_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + prewrite_req.set_context(context); prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; prewrite_req.start_version = ts.into_inner(); @@ -314,9 +327,22 @@ impl TestSuite { keys: Vec>, start_ts: TimeStamp, commit_ts: TimeStamp, + ) { + self.must_kv_commit_with_source(region_id, keys, start_ts, commit_ts, 0); + } + + pub fn must_kv_commit_with_source( + &mut self, + region_id: u64, + keys: Vec>, + start_ts: TimeStamp, + commit_ts: TimeStamp, + txn_source: u64, ) { let mut commit_req = CommitRequest::default(); - commit_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + commit_req.set_context(context); commit_req.start_version = start_ts.into_inner(); commit_req.set_keys(keys.into_iter().collect()); commit_req.commit_version = commit_ts.into_inner(); From d23618e25c9ad435de81a4d98c657fdea59a49b6 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Tue, 29 Nov 2022 21:00:00 +0800 Subject: [PATCH 368/676] cop: support batch coprocessor processing in tikv (#13850) close tikv/tikv#13849 Support batch coprocessor task processing in tikv. All the tasks would be passed to tikv in a single RPC request, they would be executed concurrently and all the results would be sent back in one response. More tests and investigations are needed together with the coprocessor client changes. 
Signed-off-by: cfzjywxk --- Cargo.lock | 6 +- src/coprocessor/endpoint.rs | 131 ++++++++- tests/integrations/coprocessor/test_select.rs | 252 +++++++++++++++++- 3 files changed, 372 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 14c12716ee2..7a036117bfb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2251,9 +2251,9 @@ dependencies = [ [[package]] name = "grpcio" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9bcdd3694fa08158334501af37bdf5b4f00b1865b602d917e3cd74ecf80cd0a" +checksum = "1f2506de56197d01821c2d1d21082d2dcfd6c82d7a1d6e04d33f37aab6130632" dependencies = [ "futures-executor", "futures-util", @@ -2694,7 +2694,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#fdbd9fa2b8f402420c9f7bc8fe47b0e41412ad55" +source = "git+https://github.com/pingcap/kvproto.git#e53d558bc6d7d8b7bb2d283cdf6dda52a2615632" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 5123534db88..51927cd6b56 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1,6 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{borrow::Cow, future::Future, marker::PhantomData, sync::Arc, time::Duration}; +use std::{ + borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, sync::Arc, time::Duration, +}; use ::tracker::{ set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, @@ -485,7 +487,7 @@ impl Endpoint { #[inline] pub fn parse_and_handle_unary_request( &self, - req: coppb::Request, + mut req: coppb::Request, peer: Option, ) -> impl Future> { let tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(RequestInfo::new( @@ -493,23 +495,27 @@ impl Endpoint { RequestType::Unknown, req.start_ts, ))); + let result_of_batch = self.process_batch_tasks(&mut req, &peer); set_tls_tracker_token(tracker); let result_of_future = self .parse_request_and_check_memory_locks(req, peer, false) .map(|(handler_builder, req_ctx)| self.handle_unary_request(req_ctx, handler_builder)); - async move { let res = match result_of_future { - Err(e) => make_error_response(e).into(), + Err(e) => { + let mut res = make_error_response(e); + let batch_res = result_of_batch.await; + res.set_batch_responses(batch_res.into()); + res.into() + } Ok(handle_fut) => { - let mut response = handle_fut - .await - .unwrap_or_else(|e| make_error_response(e).into()); - let scan_detail_v2 = response.mut_exec_details_v2().mut_scan_detail_v2(); + let (handle_res, batch_res) = futures::join!(handle_fut, result_of_batch); + let mut res = handle_res.unwrap_or_else(|e| make_error_response(e).into()); + res.set_batch_responses(batch_res.into()); GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - tracker.write_scan_detail(scan_detail_v2); + tracker.write_scan_detail(res.mut_exec_details_v2().mut_scan_detail_v2()); }); - response + res } }; GLOBAL_TRACKERS.remove(tracker); @@ -517,6 +523,75 @@ impl Endpoint { } } + // process_batch_tasks process the input batched coprocessor tasks if any, + // prepare all the requests and schedule them into the read pool, then + // collect all the 
responses and convert them into the `StoreBatchResponse` + // type. + pub fn process_batch_tasks( + &self, + req: &mut coppb::Request, + peer: &Option, + ) -> impl Future> { + let mut batch_futs = Vec::with_capacity(req.tasks.len()); + let batch_reqs: Vec<(coppb::Request, u64)> = req + .take_tasks() + .iter_mut() + .map(|task| { + let mut new_req = req.clone(); + new_req.ranges = task.take_ranges(); + let new_context = new_req.mut_context(); + new_context.set_region_id(task.get_region_id()); + new_context.set_region_epoch(task.take_region_epoch()); + new_context.set_peer(task.take_peer()); + (new_req, task.get_task_id()) + }) + .collect(); + for (cur_req, task_id) in batch_reqs.into_iter() { + let request_info = RequestInfo::new( + cur_req.get_context(), + RequestType::Unknown, + cur_req.start_ts, + ); + let mut response = coppb::StoreBatchTaskResponse::new(); + response.set_task_id(task_id); + match self.parse_request_and_check_memory_locks(cur_req, peer.clone(), false) { + Ok((handler_builder, req_ctx)) => { + let cur_tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(request_info)); + set_tls_tracker_token(cur_tracker); + let fut = self.handle_unary_request(req_ctx, handler_builder); + let fut = async move { + let res = fut.await; + match res { + Ok(mut resp) => { + response.set_data(resp.take_data()); + response.set_region_error(resp.take_region_error()); + response.set_locked(resp.take_locked()); + response.set_other_error(resp.take_other_error()); + GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { + tracker.write_scan_detail( + response.mut_exec_details_v2().mut_scan_detail_v2(), + ); + }); + } + Err(e) => { + make_error_batch_response(&mut response, e); + } + } + GLOBAL_TRACKERS.remove(cur_tracker); + response + }; + + batch_futs.push(future::Either::Left(fut)); + } + Err(e) => batch_futs.push(future::Either::Right(async move { + make_error_batch_response(&mut response, e); + response + })), + } + } + 
stream::FuturesOrdered::from_iter(batch_futs).collect() + } + /// The real implementation of handling a stream request. /// /// It first retrieves a snapshot, then builds the `RequestHandler` over the @@ -654,6 +729,42 @@ impl Endpoint { } } +fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: Error) { + warn!( + "batch cop task error-response"; + "err" => %e + ); + let tag; + match e { + Error::Region(e) => { + tag = storage::get_tag_from_header(&e); + batch_resp.set_region_error(e); + } + Error::Locked(info) => { + tag = "meet_lock"; + batch_resp.set_locked(info); + } + Error::DeadlineExceeded => { + tag = "deadline_exceeded"; + batch_resp.set_other_error(e.to_string()); + } + Error::MaxPendingTasksExceeded => { + tag = "max_pending_tasks_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason(e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message(e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + batch_resp.set_region_error(errorpb); + } + Error::Other(_) => { + tag = "other"; + batch_resp.set_other_error(e.to_string()); + } + }; + COPR_REQ_ERROR.with_label_values(&[tag]).inc(); +} + fn make_error_response(e: Error) -> coppb::Response { warn!( "error-response"; diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 96ceb1c5c8c..c802b697872 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2,13 +2,16 @@ use std::{cmp, thread, time::Duration}; +use engine_traits::CF_LOCK; use kvproto::{ - coprocessor::{Request, Response}, - kvrpcpb::{Context, IsolationLevel}, + coprocessor::{Request, Response, StoreBatchTask}, + errorpb, + kvrpcpb::{Context, IsolationLevel, LockInfo}, }; -use protobuf::Message; +use protobuf::{Message, SingularPtrField}; use raftstore::store::Bucket; use test_coprocessor::*; +use 
test_raftstore::{Cluster, ServerCluster}; use test_storage::*; use tidb_query_datatype::{ codec::{datum, Datum}, @@ -24,7 +27,7 @@ use tipb::{ AnalyzeColumnsReq, AnalyzeReq, AnalyzeType, ChecksumRequest, Chunk, Expr, ExprType, ScalarFuncSig, SelectResponse, }; -use txn_types::TimeStamp; +use txn_types::{Key, Lock, LockType, TimeStamp}; const FLAG_IGNORE_TRUNCATE: u64 = 1; const FLAG_TRUNCATE_AS_WARNING: u64 = 1 << 1; @@ -2006,3 +2009,244 @@ fn test_buckets() { wait_refresh_buckets(0); } + +#[test] +fn test_batch_request() { + let data = vec![ + (1, Some("name:0"), 2), + (2, Some("name:4"), 3), + (4, Some("name:3"), 1), + (5, Some("name:1"), 4), + (9, Some("name:8"), 7), + (10, Some("name:6"), 8), + ]; + + let product = ProductTable::new(); + let (mut cluster, raft_engine, ctx) = new_raft_engine(1, ""); + let (_, endpoint, _) = + init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); + + // Split the region into [1, 2], [4, 5], [9, 10]. + let region = + cluster.get_region(Key::from_raw(&product.get_record_range(1, 1).start).as_encoded()); + let split_key = Key::from_raw(&product.get_record_range(3, 3).start); + cluster.must_split(®ion, split_key.as_encoded()); + let second_region = + cluster.get_region(Key::from_raw(&product.get_record_range(4, 4).start).as_encoded()); + let second_split_key = Key::from_raw(&product.get_record_range(8, 8).start); + cluster.must_split(&second_region, second_split_key.as_encoded()); + + struct HandleRange { + start: i64, + end: i64, + } + + enum QueryResult { + Valid(Vec<(i64, Option<&'static str>, i64)>), + ErrRegion, + ErrLocked, + ErrOther, + } + + // Each case has four fields: + // 1. The input scan handle range. + // 2. The expected output results. + // 3. Should the coprocessor request contain invalid region epoch. + // 4. Should the scanned key be locked. + let cases = vec![ + // Basic valid case. 
+ ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 3, end: 5 }, + ], + vec![ + QueryResult::Valid(vec![(1_i64, Some("name:0"), 2_i64), (2, Some("name:4"), 3)]), + QueryResult::Valid(vec![(4, Some("name:3"), 1), (5, Some("name:1"), 4)]), + ], + false, + false, + ), + // Original task is valid, batch tasks are not all valid. + ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 4, end: 6 }, + HandleRange { start: 9, end: 11 }, + HandleRange { start: 1, end: 3 }, // Input range [1, 4) crosses two region ranges. + HandleRange { start: 4, end: 8 }, // Input range [4, 9] crosses two region ranges. + ], + vec![ + QueryResult::Valid(vec![(1, Some("name:0"), 2), (2, Some("name:4"), 3)]), + QueryResult::Valid(vec![(4, Some("name:3"), 1), (5, Some("name:1"), 4)]), + QueryResult::Valid(vec![(9, Some("name:8"), 7), (10, Some("name:6"), 8)]), + QueryResult::ErrOther, + QueryResult::ErrOther, + ], + false, + false, + ), + // Original task is invalid, batch tasks are not all valid. + ( + vec![HandleRange { start: 1, end: 3 }], + vec![QueryResult::ErrOther], + false, + false, + ), + // Invalid epoch case. + ( + vec![ + HandleRange { start: 1, end: 3 }, + HandleRange { start: 4, end: 6 }, + ], + vec![QueryResult::ErrRegion, QueryResult::ErrRegion], + true, + false, + ), + // Locked error case. 
+ ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 4, end: 6 }, + ], + vec![QueryResult::ErrLocked, QueryResult::ErrLocked], + false, + true, + ), + ]; + let prepare_req = + |cluster: &mut Cluster, ranges: &Vec| -> Request { + let original_range = ranges.get(0).unwrap(); + let key_range = product.get_record_range(original_range.start, original_range.end); + let region_key = Key::from_raw(&key_range.start); + let mut req = DagSelect::from(&product) + .key_ranges(vec![key_range]) + .build_with(ctx.clone(), &[0]); + let mut new_ctx = Context::default(); + let new_region = cluster.get_region(region_key.as_encoded()); + let leader = cluster.leader_of_region(new_region.get_id()).unwrap(); + new_ctx.set_region_id(new_region.get_id()); + new_ctx.set_region_epoch(new_region.get_region_epoch().clone()); + new_ctx.set_peer(leader); + req.set_context(new_ctx); + req.set_start_ts(100); + + let batch_handle_ranges = &ranges.as_slice()[1..]; + for handle_range in batch_handle_ranges.iter() { + let range_start_key = Key::from_raw( + &product + .get_record_range(handle_range.start, handle_range.end) + .start, + ); + let batch_region = cluster.get_region(range_start_key.as_encoded()); + let batch_leader = cluster.leader_of_region(batch_region.get_id()).unwrap(); + let batch_key_ranges = + vec![product.get_record_range(handle_range.start, handle_range.end)]; + let mut store_batch_task = StoreBatchTask::new(); + store_batch_task.set_region_id(batch_region.get_id()); + store_batch_task.set_region_epoch(batch_region.get_region_epoch().clone()); + store_batch_task.set_peer(batch_leader); + store_batch_task.set_ranges(batch_key_ranges.into()); + req.tasks.push(store_batch_task); + } + req + }; + let verify_response = |result: &QueryResult, + data: &[u8], + region_err: &SingularPtrField, + locked: &SingularPtrField, + other_err: &String| { + match result { + QueryResult::Valid(res) => { + let expected_len = res.len(); + let mut sel_resp = SelectResponse::default(); + 
sel_resp.merge_from_bytes(data).unwrap(); + let mut row_count = 0; + let spliter = DagChunkSpliter::new(sel_resp.take_chunks().into(), 3); + for (row, (id, name, cnt)) in spliter.zip(res) { + let name_datum = name.map(|s| s.as_bytes()).into(); + let expected_encoded = datum::encode_value( + &mut EvalContext::default(), + &[Datum::I64(*id), name_datum, Datum::I64(*cnt)], + ) + .unwrap(); + let result_encoded = + datum::encode_value(&mut EvalContext::default(), &row).unwrap(); + assert_eq!(result_encoded, &*expected_encoded); + row_count += 1; + } + assert_eq!(row_count, expected_len); + } + QueryResult::ErrRegion => { + assert!(region_err.is_some()); + } + QueryResult::ErrLocked => { + assert!(locked.is_some()); + } + QueryResult::ErrOther => { + assert!(!other_err.is_empty()) + } + } + }; + + for (ranges, results, invalid_epoch, key_is_locked) in cases.iter() { + let mut req = prepare_req(&mut cluster, ranges); + if *invalid_epoch { + req.context + .as_mut() + .unwrap() + .region_epoch + .as_mut() + .unwrap() + .version -= 1; + for batch_task in req.tasks.iter_mut() { + batch_task.region_epoch.as_mut().unwrap().version -= 1; + } + } else if *key_is_locked { + for range in ranges.iter() { + let lock_key = + Key::from_raw(&product.get_record_range(range.start, range.start).start); + let lock = Lock::new( + LockType::Put, + lock_key.as_encoded().clone(), + 10.into(), + 10, + None, + TimeStamp::zero(), + 1, + TimeStamp::zero(), + ); + cluster.must_put_cf(CF_LOCK, lock_key.as_encoded(), lock.to_bytes().as_slice()); + } + } + let mut resp = handle_request(&endpoint, req); + let batch_results = resp.take_batch_responses().to_vec(); + for (i, result) in results.iter().enumerate() { + if i == 0 { + verify_response( + result, + resp.get_data(), + &resp.region_error, + &resp.locked, + &resp.other_error, + ); + } else { + let batch_resp = batch_results.get(i - 1).unwrap(); + verify_response( + result, + batch_resp.get_data(), + &batch_resp.region_error, + &batch_resp.locked, + 
&batch_resp.other_error, + ); + }; + } + if *key_is_locked { + for range in ranges.iter() { + let lock_key = + Key::from_raw(&product.get_record_range(range.start, range.start).start); + cluster.must_delete_cf(CF_LOCK, lock_key.as_encoded()); + } + } + } +} From c1aceb003b9da06b75b70a1e545d52b994ab67dc Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 29 Nov 2022 21:58:00 +0800 Subject: [PATCH 369/676] server, storage: make flashback compatible with resolved_ts (#13823) ref tikv/tikv#13787 - Prewrite and commit `self.start_key` independently to prevent `resolved_ts` from advancing during the flashback process. - Roll back all keys before prewriting `self.start_key` during the preparing flashback. - Add a test case for CDC compatibility. Signed-off-by: JmPotato --- components/cdc/tests/integrations/test_cdc.rs | 115 ++++++++- components/cdc/tests/mod.rs | 48 ++++ components/raftstore/src/store/fsm/peer.rs | 9 +- components/test_raftstore/src/util.rs | 10 +- src/server/service/kv.rs | 16 +- src/storage/mod.rs | 157 ++++++------ src/storage/mvcc/reader/reader.rs | 46 ++-- .../txn/actions/flashback_to_version.rs | 233 +++++++++++++++--- .../txn/commands/flashback_to_version.rs | 83 +++++-- .../flashback_to_version_read_phase.rs | 165 +++++++------ src/storage/txn/commands/mod.rs | 17 +- src/storage/txn/mod.rs | 4 +- 12 files changed, 672 insertions(+), 231 deletions(-) diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index f2f09622a52..b9c285406d4 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -12,7 +12,7 @@ use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::server::DEFAULT_CLUSTER_ID; -use tikv_util::HandyRwLock; +use tikv_util::{config::ReadableDuration, HandyRwLock}; use txn_types::{Key, Lock, LockType}; use crate::{new_event_feed, TestSuite, TestSuiteBuilder}; @@ -2485,3 +2485,116 @@ fn 
test_filter_loop_impl() { event_feed_wrap.replace(None); suite.stop(); } + +#[test] +fn test_flashback() { + let mut cluster = new_server_cluster(0, 1); + cluster.cfg.resolved_ts.advance_ts_interval = ReadableDuration::millis(50); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + + let key = Key::from_raw(b"a"); + let region = suite.cluster.get_region(key.as_encoded()); + let region_id = region.get_id(); + let req = suite.new_changedata_request(region_id); + let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(region_id)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + // Sleep a while to make sure the stream is registered. + sleep_ms(1000); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for i in 0..2 { + let (k, v) = ( + format!("key{}", i).as_bytes().to_vec(), + format!("value{}", i).as_bytes().to_vec(), + ); + // Prewrite + let start_ts1 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k.clone(), start_ts1); + // Commit + let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(1, vec![k.clone()], start_ts1, commit_ts); + } + let (start_key, end_key) = (b"key0".to_vec(), b"key2".to_vec()); + // Prepare flashback. 
+ let flashback_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_prepare_flashback(region_id, &start_key, flashback_start_ts); + // resolved ts should not be advanced anymore. + let mut counter = 0; + let mut last_resolved_ts = 0; + loop { + let event = receive_event(true); + if let Some(resolved_ts) = event.resolved_ts.as_ref() { + if resolved_ts.ts == last_resolved_ts { + counter += 1; + } + last_resolved_ts = resolved_ts.ts; + } + if counter > 20 { + break; + } + sleep_ms(50); + } + // Flashback. + let flashback_commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_flashback( + region_id, + &start_key, + &end_key, + flashback_start_ts, + flashback_commit_ts, + start_ts, + ); + // Check the flashback event. + let mut resolved_ts = 0; + let mut event_counter = 0; + loop { + let mut cde = receive_event(true); + if cde.get_resolved_ts().get_ts() > resolved_ts { + resolved_ts = cde.get_resolved_ts().get_ts(); + } + let events = cde.mut_events(); + if !events.is_empty() { + assert_eq!(events.len(), 1); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + event_counter += 1; + let e = &entries.entries[0]; + assert!(e.commit_ts > resolved_ts); + assert_eq!(e.get_op_type(), EventRowOpType::Delete); + match e.get_type() { + EventLogType::Committed => { + // First entry should be a 1PC flashback. + assert_eq!(e.get_key(), b"key1"); + assert_eq!(event_counter, 1); + } + EventLogType::Commit => { + // Second entry should be a 2PC commit. 
+ assert_eq!(e.get_key(), b"key0"); + assert_eq!(event_counter, 2); + break; + } + _ => panic!("unknown event type {:?}", e.get_type()), + } + } + other => panic!("unknown event {:?}", other), + } + } + } +} diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index feb994f8bb1..87619deb92b 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -581,4 +581,52 @@ impl TestSuite { } } } + + pub fn must_kv_prepare_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + start_ts: TimeStamp, + ) { + let mut prepare_flashback_req = PrepareFlashbackToVersionRequest::default(); + prepare_flashback_req.set_context(self.get_context(region_id)); + prepare_flashback_req.set_start_key(start_key.to_vec()); + prepare_flashback_req.set_start_ts(start_ts.into_inner()); + let prepare_flashback_resp = self + .get_tikv_client(region_id) + .kv_prepare_flashback_to_version(&prepare_flashback_req) + .unwrap(); + assert!( + !prepare_flashback_resp.has_region_error(), + "{:?}", + prepare_flashback_resp.get_region_error() + ); + } + + pub fn must_kv_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + end_key: &[u8], + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + ) { + let mut flashback_req = FlashbackToVersionRequest::default(); + flashback_req.set_context(self.get_context(region_id)); + flashback_req.set_start_key(start_key.to_vec()); + flashback_req.set_end_key(end_key.to_vec()); + flashback_req.set_start_ts(start_ts.into_inner()); + flashback_req.set_commit_ts(commit_ts.into_inner()); + flashback_req.set_version(version.into_inner()); + let flashback_resp = self + .get_tikv_client(region_id) + .kv_flashback_to_version(&flashback_req) + .unwrap(); + assert!( + !flashback_resp.has_region_error(), + "{:?}", + flashback_resp.get_region_error() + ); + } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index b4c7d1fb097..f6498222d27 100644 --- 
a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1330,8 +1330,15 @@ where ) { fail_point!("raft_on_capture_change"); let region_id = self.region_id(); - let msg = + let mut msg = new_read_index_request(region_id, region_epoch.clone(), self.fsm.peer.peer.clone()); + // Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. + if self.fsm.peer.is_in_flashback { + let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); + flags.insert(WriteBatchFlags::FLASHBACK); + msg.mut_header().set_flags(flags.bits()); + } let apply_router = self.ctx.apply_router.clone(); self.propose_raft_command_internal( msg, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index e4b185b9509..64bdca19025 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1249,6 +1249,10 @@ pub fn must_flashback_to_version( ) { let mut prepare_req = PrepareFlashbackToVersionRequest::default(); prepare_req.set_context(ctx.clone()); + prepare_req.set_start_ts(start_ts); + prepare_req.set_version(version); + prepare_req.set_start_key(b"a".to_vec()); + prepare_req.set_end_key(b"z".to_vec()); client .kv_prepare_flashback_to_version(&prepare_req) .unwrap(); @@ -1256,9 +1260,9 @@ pub fn must_flashback_to_version( req.set_context(ctx); req.set_start_ts(start_ts); req.set_commit_ts(commit_ts); - req.version = version; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); + req.set_version(version); + req.set_start_key(b"a".to_vec()); + req.set_end_key(b"z".to_vec()); let resp = client.kv_flashback_to_version(&req).unwrap(); assert!(!resp.has_region_error()); assert!(resp.get_error().is_empty()); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 7a61a313eca..7c40ab659eb 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1452,14 +1452,26 @@ fn 
future_delete_range( // Preparing the flashback for a region will "lock" the region so that // there is no any read, write or scheduling operation could be proposed before // the actual flashback operation. +// NOTICE: the caller needs to make sure the version we want to flashback won't +// be between any transactions that have not been fully committed. fn future_prepare_flashback_to_version( // Keep this param to hint the type of E for the compiler. storage: &Storage, req: PrepareFlashbackToVersionRequest, ) -> impl Future> { - let f = storage.get_engine().start_flashback(req.get_context()); + let storage = storage.clone(); async move { - let res = f.await.map_err(storage::Error::from); + let f = storage.get_engine().start_flashback(req.get_context()); + let mut res = f.await.map_err(storage::Error::from); + if matches!(res, Ok(())) { + // After the region is put into the flashback state, we need to do a special + // prewrite to prevent `resolved_ts` from advancing. + let (cb, f) = paired_future_callback(); + res = storage.sched_txn_command(req.clone().into(), cb); + if matches!(res, Ok(())) { + res = f.await.unwrap_or_else(|e| Err(box_err!(e))); + } + } let mut resp = PrepareFlashbackToVersionResponse::default(); if let Some(e) = extract_region_error(&res) { resp.set_region_error(e); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 32d033e7497..b87ab8c4a6d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3545,7 +3545,10 @@ mod tests { use super::{ mvcc::tests::{must_unlocked, must_written}, test_util::*, - txn::{commands::new_flashback_to_version_read_phase_cmd, FLASHBACK_BATCH_SIZE}, + txn::{ + commands::{new_flashback_rollback_lock_cmd, new_flashback_write_cmd}, + FLASHBACK_BATCH_SIZE, + }, *, }; use crate::{ @@ -4816,20 +4819,14 @@ mod tests { let (key, value) = write.0.clone().into_key_value(); // The version we want to flashback to. 
let version = write.2; - storage - .sched_txn_command( - new_flashback_to_version_read_phase_cmd( - start_ts, - commit_ts, - version, - key.clone(), - Key::from_raw(b"z"), - Context::default(), - ), - expect_ok_callback(tx.clone(), 2), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + start_ts, + commit_ts, + version, + key.clone(), + Key::from_raw(b"z"), + ); if let Mutation::Put(..) = write.0 { expect_value( value.unwrap(), @@ -4847,6 +4844,44 @@ mod tests { } } + fn run_flashback_to_version( + storage: &Storage, + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ) { + let (tx, rx) = channel(); + storage + .sched_txn_command( + new_flashback_rollback_lock_cmd( + start_ts, + version, + start_key.clone(), + end_key.clone(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + new_flashback_write_cmd( + start_ts, + commit_ts, + version, + start_key, + end_key, + Context::default(), + ), + expect_ok_callback(tx, 1), + ) + .unwrap(); + rx.recv().unwrap(); + } + #[test] fn test_flashback_to_version_lock() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) @@ -4890,7 +4925,7 @@ mod tests { b"k".to_vec(), *ts.incr(), ), - expect_ok_callback(tx.clone(), 2), + expect_ok_callback(tx, 2), ) .unwrap(); rx.recv().unwrap(); @@ -4906,20 +4941,14 @@ mod tests { let start_ts = *ts.incr(); let commit_ts = *ts.incr(); - storage - .sched_txn_command( - new_flashback_to_version_read_phase_cmd( - start_ts, - commit_ts, - 2.into(), - Key::from_raw(b"k"), - Key::from_raw(b"z"), - Context::default(), - ), - expect_ok_callback(tx.clone(), 3), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + start_ts, + commit_ts, + 2.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); expect_value( b"v@1".to_vec(), block_on(storage.get(Context::default(), Key::from_raw(b"k"), 
commit_ts)) @@ -4928,20 +4957,14 @@ mod tests { ); let start_ts = *ts.incr(); let commit_ts = *ts.incr(); - storage - .sched_txn_command( - new_flashback_to_version_read_phase_cmd( - start_ts, - commit_ts, - 1.into(), - Key::from_raw(b"k"), - Key::from_raw(b"z"), - Context::default(), - ), - expect_ok_callback(tx, 4), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + start_ts, + commit_ts, + 1.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); expect_none( block_on(storage.get(Context::default(), Key::from_raw(b"k"), commit_ts)) .unwrap() @@ -5025,20 +5048,14 @@ mod tests { let flashback_start_ts = *ts.incr(); let flashback_commit_ts = *ts.incr(); for _ in 0..10 { - storage - .sched_txn_command( - new_flashback_to_version_read_phase_cmd( - flashback_start_ts, - flashback_commit_ts, - TimeStamp::zero(), - Key::from_raw(b"k"), - Key::from_raw(b"z"), - Context::default(), - ), - expect_ok_callback(tx.clone(), 2), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); for i in 1..=FLASHBACK_BATCH_SIZE * 4 { let key = Key::from_raw(format!("k{}", i).as_bytes()); expect_none( @@ -5098,7 +5115,7 @@ mod tests { storage .sched_txn_command( commands::Commit::new(vec![k.clone()], ts, *ts.incr(), Context::default()), - expect_value_callback(tx.clone(), 3, TxnStatus::committed(ts)), + expect_value_callback(tx, 3, TxnStatus::committed(ts)), ) .unwrap(); rx.recv().unwrap(); @@ -5110,20 +5127,14 @@ mod tests { // Flashback the key. 
let flashback_start_ts = *ts.incr(); let flashback_commit_ts = *ts.incr(); - storage - .sched_txn_command( - new_flashback_to_version_read_phase_cmd( - flashback_start_ts, - flashback_commit_ts, - 1.into(), - Key::from_raw(b"k"), - Key::from_raw(b"z"), - Context::default(), - ), - expect_ok_callback(tx, 4), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + 1.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); expect_none( block_on(storage.get(Context::default(), k, flashback_commit_ts)) .unwrap() diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 8e92ffd6be2..0ada3a12d5d 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -185,17 +185,12 @@ impl MvccReader { } } - /// load the value associated with `key` and pointed by `write` - fn load_data(&mut self, key: &Key, write: Write) -> Result { - assert_eq!(write.write_type, WriteType::Put); - if let Some(val) = write.short_value { - return Ok(val); - } + /// get the value of a user key with the given `start_ts`. + pub fn get_value(&mut self, key: &Key, start_ts: TimeStamp) -> Result> { if self.scan_mode.is_some() { self.create_data_cursor()?; } - - let k = key.clone().append_ts(write.start_ts); + let k = key.clone().append_ts(start_ts); let val = if let Some(ref mut cursor) = self.data_cursor { cursor .get(&k, &mut self.statistics.data)? @@ -204,13 +199,25 @@ impl MvccReader { self.statistics.data.get += 1; self.snapshot.get(&k)? 
}; + if val.is_some() { + self.statistics.data.processed_keys += 1; + } + Ok(val) + } - match val { - Some(val) => { - self.statistics.data.processed_keys += 1; - Ok(val) - } - None => Err(default_not_found_error(k.into_encoded(), "get")), + /// load the value associated with `key` and pointed by `write` + fn load_data(&mut self, key: &Key, write: Write) -> Result { + assert_eq!(write.write_type, WriteType::Put); + if let Some(val) = write.short_value { + return Ok(val); + } + let start_ts = write.start_ts; + match self.get_value(key, start_ts)? { + Some(val) => Ok(val), + None => Err(default_not_found_error( + key.clone().append_ts(start_ts).into_encoded(), + "get", + )), } } @@ -2011,8 +2018,17 @@ pub mod tests { engine.write(case.modifies); let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, case.scan_mode, false); - let result = reader.load_data(&case.key, case.write); + let result = reader.load_data(&case.key, case.write.clone()); assert_eq!(format!("{:?}", result), format!("{:?}", case.expected)); + if let Ok(expected) = case.expected { + if expected == long_value.to_vec() { + let result = reader + .get_value(&case.key, case.write.start_ts) + .unwrap() + .unwrap(); + assert_eq!(format!("{:?}", result), format!("{:?}", expected)); + } + } } } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 71f50715a20..e719ca24a26 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -2,7 +2,7 @@ use std::ops::Bound; -use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; +use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, @@ -12,26 +12,27 @@ use crate::storage::{ pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for multiple batches */; -pub fn 
flashback_to_version_read_lock( - reader: &mut MvccReader, +pub fn flashback_to_version_read_lock( + reader: &mut MvccReader, next_lock_key: Key, end_key: &Key, statistics: &mut Statistics, -) -> TxnResult<(Vec<(Key, Lock)>, bool)> { - let key_locks_result = reader.scan_locks( +) -> TxnResult> { + let result = reader.scan_locks( Some(&next_lock_key), Some(end_key), - // To flashback `CF_LOCK`, we need to delete all locks. |_| true, FLASHBACK_BATCH_SIZE, ); statistics.add(&reader.statistics); - Ok(key_locks_result?) + let (key_locks, _) = result?; + Ok(key_locks) } -pub fn flashback_to_version_read_write( - reader: &mut MvccReader, +pub fn flashback_to_version_read_write( + reader: &mut MvccReader, next_write_key: Key, + start_key: &Key, end_key: &Key, flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, @@ -47,14 +48,17 @@ pub fn flashback_to_version_read_write( let keys_result = reader.scan_latest_user_keys( Some(&next_write_key), Some(end_key), - |_, latest_commit_ts| { + |key, latest_commit_ts| { // There is no any other write could happen after the flashback begins. assert!(latest_commit_ts <= flashback_commit_ts); + // - Skip the `start_key`. // - No need to find an old version for the key if its latest `commit_ts` is // smaller than or equal to the flashback version. // - No need to flashback a key twice if its latest `commit_ts` is equal to the // flashback `commit_ts`. - latest_commit_ts > flashback_version && latest_commit_ts < flashback_commit_ts + key != start_key + && latest_commit_ts > flashback_version + && latest_commit_ts < flashback_commit_ts }, FLASHBACK_BATCH_SIZE, ); @@ -63,12 +67,9 @@ pub fn flashback_to_version_read_write( Ok(keys) } -// To flashback the `CF_LOCK`, we need to delete all locks records whose -// `start_ts` is greater than the specified version, and if it's not a -// short-value `LockType::Put`, we need to delete the actual data from -// `CF_DEFAULT` as well. -// TODO: `resolved_ts` should be taken into account. 
-pub fn flashback_to_version_lock( +// At the very first beginning of flashback, we need to rollback all locks in +// `CF_LOCK`. +pub fn rollback_locks( txn: &mut MvccTxn, reader: &mut SnapshotReader, key_locks: Vec<(Key, Lock)>, @@ -123,9 +124,9 @@ pub fn flashback_to_version_write( } let old_write = reader.get_write(&key, flashback_version)?; let new_write = if let Some(old_write) = old_write { - // If it's not a short value and it's a `WriteType::Put`, we should put the old + // If it's a `WriteType::Put` without the short value, we should put the old // value in `CF_DEFAULT` with `self.start_ts` as well. - if old_write.short_value.is_none() && old_write.write_type == WriteType::Put { + if old_write.write_type == WriteType::Put && old_write.short_value.is_none() { txn.put_value( key.clone(), flashback_start_ts, @@ -142,21 +143,94 @@ pub fn flashback_to_version_write( // delete the current key when needed. Write::new(WriteType::Delete, flashback_start_ts, None) }; + txn.put_write(key, flashback_commit_ts, new_write.as_ref().to_bytes()); + } + Ok(None) +} + +// Prewrite the `key_to_lock`, namely the `self.start_key`, to do a special 2PC +// transaction. +pub fn prewrite_flashback_key( + txn: &mut MvccTxn, + reader: &mut SnapshotReader, + key_to_lock: &Key, + flashback_version: TimeStamp, + flashback_start_ts: TimeStamp, +) -> TxnResult<()> { + let old_write = reader.get_write(key_to_lock, flashback_version)?; + // Flashback the value in `CF_DEFAULT` as well if the old write is a + // `WriteType::Put` without the short value. + if let Some(old_write) = old_write.as_ref() { + if old_write.write_type == WriteType::Put + && old_write.short_value.is_none() + // If the value with `flashback_start_ts` already exists, we don't need to write again. 
+ && reader.reader.get_value(key_to_lock, flashback_start_ts)?.is_none() + { + txn.put_value( + key_to_lock.clone(), + flashback_start_ts, + reader.load_data(key_to_lock, old_write.clone())?, + ); + } + } + txn.put_lock( + key_to_lock.clone(), + &Lock::new( + old_write.as_ref().map_or(LockType::Delete, |write| { + if write.write_type == WriteType::Delete { + LockType::Delete + } else { + LockType::Put + } + }), + key_to_lock.as_encoded().to_vec(), + flashback_start_ts, + 0, + old_write.and_then(|write| write.short_value), + TimeStamp::zero(), + 1, + TimeStamp::zero(), + ), + ); + Ok(()) +} + +pub fn commit_flashback_key( + txn: &mut MvccTxn, + reader: &mut SnapshotReader, + key_to_commit: &Key, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, +) -> TxnResult<()> { + if let Some(mut lock) = reader.load_lock(key_to_commit)? { txn.put_write( - key.clone(), + key_to_commit.clone(), + flashback_commit_ts, + Write::new( + WriteType::from_lock_type(lock.lock_type).unwrap(), + flashback_start_ts, + lock.short_value.take(), + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source) + .as_ref() + .to_bytes(), + ); + txn.unlock_key( + key_to_commit.clone(), + lock.is_pessimistic_txn(), flashback_commit_ts, - new_write.as_ref().to_bytes(), ); } - Ok(None) + Ok(()) } #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::Context; + use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::DoPessimisticCheck}; use tikv_kv::ScanMode; - use txn_types::TimeStamp; + use txn_types::{TimeStamp, SHORT_VALUE_MAX_LEN}; use super::*; use crate::storage::{ @@ -172,42 +246,82 @@ pub mod tests { Engine, TestEngineBuilder, }; - fn must_flashback_to_version( + fn must_rollback_lock( engine: &mut E, key: &[u8], version: impl Into, start_ts: impl Into, - commit_ts: impl Into, ) -> usize { let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); 
- let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); + let (version, start_ts) = (version.into(), start_ts.into()); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); let mut statistics = Statistics::default(); - // Flashback the locks. - let (key_locks, has_remain_locks) = - flashback_to_version_read_lock(&mut reader, key.clone(), &next_key, &mut statistics) - .unwrap(); - assert!(!has_remain_locks); + let key_locks = + flashback_to_version_read_lock(&mut reader, key, &next_key, &mut statistics).unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); - let mut txn = MvccTxn::new(start_ts, cm.clone()); + let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - flashback_to_version_lock(&mut txn, &mut snap_reader, key_locks).unwrap(); - let mut rows = txn.modifies.len(); + rollback_locks(&mut txn, &mut snap_reader, key_locks).unwrap(); + let rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_prewrite_flashback_key( + engine: &mut E, + key: &[u8], + version: impl Into, + start_ts: impl Into, + ) -> usize { + let (version, start_ts) = (version.into(), start_ts.into()); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts, cm); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let ctx = Context::default(); + let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); + prewrite_flashback_key( + &mut txn, + &mut snap_reader, + &Key::from_raw(key), + version, + start_ts, + ) + .unwrap(); + let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_flashback_to_version( + engine: &mut E, + key: &[u8], + version: impl Into, + 
start_ts: impl Into, + commit_ts: impl Into, + ) -> usize { + let next_key = Key::from_raw(keys::next_key(key).as_slice()); + let key = Key::from_raw(key); + let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let mut statistics = Statistics::default(); // Flashback the writes. let keys = flashback_to_version_read_write( &mut reader, key, + &Key::from_raw(b""), &next_key, version, commit_ts, &mut statistics, ) .unwrap(); + let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); @@ -220,7 +334,7 @@ pub mod tests { commit_ts, ) .unwrap(); - rows += txn.modifies.len(); + let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows } @@ -317,8 +431,6 @@ pub mod tests { #[test] fn test_flashback_to_version_pessimistic() { - use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; - let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; let (v1, v2, v3) = (b"v1", b"v2", b"v3"); @@ -335,7 +447,8 @@ pub mod tests { // Flashback to version 17 with start_ts = 35, commit_ts = 40. // Distinguish from pessimistic start_ts 30 to make sure rollback ts is by lock // ts. - assert_eq!(must_flashback_to_version(&mut engine, k, 17, 35, 40), 3); + assert_eq!(must_rollback_lock(&mut engine, k, 17, 35), 2); + assert_eq!(must_flashback_to_version(&mut engine, k, 17, 35, 40), 1); // Pessimistic Prewrite Put(k -> v3) with stat_ts = 30 will be error with // Rollback. 
@@ -365,4 +478,46 @@ pub mod tests { 0 ); } + + #[test] + fn test_duplicated_prewrite_flashback_key() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut ts = TimeStamp::zero(); + let (k, v) = (b"k", [u8::MAX; SHORT_VALUE_MAX_LEN + 1]); + must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, &v); + + let flashback_start_ts = *ts.incr(); + // Rollback nothing. + assert_eq!( + must_rollback_lock(&mut engine, k, ts, flashback_start_ts), + 0 + ); + // Lock and write the value of `k`. + assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 2 + ); + // Unlock `k`, put rollback record and delete the value of `k`. + assert_eq!( + must_rollback_lock(&mut engine, k, ts, flashback_start_ts), + 3 + ); + // Lock and write the value of `k`. + assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 2 + ); + // Only unlock `k` since there is an overlapped rollback record. + assert_eq!( + must_rollback_lock(&mut engine, k, ts, flashback_start_ts), + 1 + ); + // Only lock `k` since the value of `k` has already existed. 
+ assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 1 + ); + } } diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index d53a3a5c3be..a1936cee647 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -10,7 +10,10 @@ use crate::storage::{ lock_manager::LockManager, mvcc::{MvccTxn, SnapshotReader}, txn::{ - actions::flashback_to_version::{flashback_to_version_lock, flashback_to_version_write}, + actions::flashback_to_version::{ + commit_flashback_key, flashback_to_version_write, prewrite_flashback_key, + rollback_locks, + }, commands::{ Command, CommandExt, FlashbackToVersionReadPhase, FlashbackToVersionState, ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, @@ -42,22 +45,26 @@ impl CommandExt for FlashbackToVersion { fn gen_lock(&self) -> latch::Lock { match &self.state { - FlashbackToVersionState::ScanLock { key_locks, .. } => { + FlashbackToVersionState::RollbackLock { key_locks, .. } => { latch::Lock::new(key_locks.iter().map(|(key, _)| key)) } - FlashbackToVersionState::ScanWrite { keys, .. } => latch::Lock::new(keys.iter()), + FlashbackToVersionState::Prewrite { key_to_lock } => latch::Lock::new([key_to_lock]), + FlashbackToVersionState::FlashbackWrite { keys, .. } => latch::Lock::new(keys.iter()), + FlashbackToVersionState::Commit { key_to_commit } => latch::Lock::new([key_to_commit]), } } fn write_bytes(&self) -> usize { match &self.state { - FlashbackToVersionState::ScanLock { key_locks, .. } => key_locks + FlashbackToVersionState::RollbackLock { key_locks, .. } => key_locks .iter() .map(|(key, _)| key.as_encoded().len()) .sum(), - FlashbackToVersionState::ScanWrite { keys, .. } => { + FlashbackToVersionState::Prewrite { key_to_lock } => key_to_lock.as_encoded().len(), + FlashbackToVersionState::FlashbackWrite { keys, .. 
} => { keys.iter().map(|key| key.as_encoded().len()).sum() } + FlashbackToVersionState::Commit { key_to_commit } => key_to_commit.as_encoded().len(), } } } @@ -69,19 +76,26 @@ impl WriteCommand for FlashbackToVersion { context.statistics, ); let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); - // The state must be `ScanLock` or `ScanWrite` here. match self.state { - FlashbackToVersionState::ScanLock { + FlashbackToVersionState::RollbackLock { ref mut next_lock_key, ref mut key_locks, } => { if let Some(new_next_lock_key) = - flashback_to_version_lock(&mut txn, &mut reader, mem::take(key_locks))? + rollback_locks(&mut txn, &mut reader, mem::take(key_locks))? { *next_lock_key = new_next_lock_key; } } - FlashbackToVersionState::ScanWrite { + // TODO: add some test cases for the special prewrite key. + FlashbackToVersionState::Prewrite { ref key_to_lock } => prewrite_flashback_key( + &mut txn, + &mut reader, + key_to_lock, + self.version, + self.start_ts, + )?, + FlashbackToVersionState::FlashbackWrite { ref mut next_write_key, ref mut keys, } => { @@ -96,30 +110,53 @@ impl WriteCommand for FlashbackToVersion { *next_write_key = new_next_write_key; } } + FlashbackToVersionState::Commit { ref key_to_commit } => commit_flashback_key( + &mut txn, + &mut reader, + key_to_commit, + self.start_ts, + self.commit_ts, + )?, } let rows = txn.modifies.len(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); + // To let the flashback modification could be proposed and applied successfully. write_data.extra.for_flashback = true; + // To let the CDC treat the flashback modification as an 1PC transaction. + if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. 
}) { + write_data.extra.one_pc = true; + } Ok(WriteResult { ctx: self.ctx.clone(), to_be_write: write_data, rows, pr: (move || { - fail_point!("flashback_failed_after_first_batch", |_| { - ProcessResult::Res - }); - let next_cmd = FlashbackToVersionReadPhase { - ctx: self.ctx, - deadline: self.deadline, - start_ts: self.start_ts, - commit_ts: self.commit_ts, - version: self.version, - start_key: self.start_key, - end_key: self.end_key, - state: self.state, - }; + if matches!( + self.state, + FlashbackToVersionState::Prewrite { .. } + | FlashbackToVersionState::Commit { .. } + ) { + return ProcessResult::Res; + } + + #[cfg(feature = "failpoints")] + if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. }) { + fail_point!("flashback_failed_after_first_batch", |_| { + ProcessResult::Res + }); + } + ProcessResult::NextCommand { - cmd: Command::FlashbackToVersionReadPhase(next_cmd), + cmd: Command::FlashbackToVersionReadPhase(FlashbackToVersionReadPhase { + ctx: self.ctx, + deadline: self.deadline, + start_ts: self.start_ts, + commit_ts: self.commit_ts, + version: self.version, + start_key: self.start_key, + end_key: self.end_key, + state: self.state, + }), } })(), lock_info: vec![], diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index b41506c320b..d27225a9bf7 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -18,19 +18,24 @@ use crate::storage::{ #[derive(Debug)] pub enum FlashbackToVersionState { - ScanLock { + RollbackLock { next_lock_key: Key, key_locks: Vec<(Key, Lock)>, }, - ScanWrite { + Prewrite { + key_to_lock: Key, + }, + FlashbackWrite { next_write_key: Key, keys: Vec, }, + Commit { + key_to_commit: Key, + }, } -pub fn new_flashback_to_version_read_phase_cmd( +pub fn new_flashback_rollback_lock_cmd( start_ts: TimeStamp, - commit_ts: TimeStamp, version: TimeStamp, 
start_key: Key, end_key: Key, @@ -38,11 +43,11 @@ pub fn new_flashback_to_version_read_phase_cmd( ) -> TypedCommand<()> { FlashbackToVersionReadPhase::new( start_ts, - commit_ts, + TimeStamp::zero(), version, start_key.clone(), end_key, - FlashbackToVersionState::ScanLock { + FlashbackToVersionState::RollbackLock { next_lock_key: start_key, key_locks: Vec::new(), }, @@ -50,6 +55,28 @@ pub fn new_flashback_to_version_read_phase_cmd( ) } +pub fn new_flashback_write_cmd( + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ctx: Context, +) -> TypedCommand<()> { + FlashbackToVersionReadPhase::new( + start_ts, + commit_ts, + version, + start_key.clone(), + end_key, + FlashbackToVersionState::FlashbackWrite { + next_write_key: start_key, + keys: Vec::new(), + }, + ctx, + ) +} + command! { FlashbackToVersionReadPhase: cmd_ty => (), @@ -76,48 +103,41 @@ impl CommandExt for FlashbackToVersionReadPhase { } } -/// FlashbackToVersion contains two phases: -/// 1. Read phase: -/// - Scan all locks to delete them all later. -/// - Scan all the latest writes to flashback them all later. -/// 2. Write phase: -/// - Delete all locks we scanned at the read phase. -/// - Write the old MVCC version writes for the keys we scanned at the read -/// phase. +/// The whole flashback progress contains four phases: +/// 1. [PrepareFlashback] RollbackLock phase: +/// - Scan all locks. +/// - Rollback all these locks. +/// 2. [PrepareFlashback] Prewrite phase: +/// - Prewrite the `self.start_key` specifically to prevent the +/// `resolved_ts` from advancing. +/// 3. [FinishFlashback] FlashbackWrite phase: +/// - Scan all the latest writes and their corresponding values at +/// `self.version`. +/// - Write the old MVCC version writes again for all these keys with +/// `self.commit_ts` excluding the `self.start_key`. +/// 4. 
[FinishFlashback] Commit phase: +/// - Commit the `self.start_key` we write at the second phase to finish the +/// flashback. impl ReadCommand for FlashbackToVersionReadPhase { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { - if self.commit_ts <= self.start_ts { - return Err(Error::from(ErrorInner::InvalidTxnTso { - start_ts: self.start_ts, - commit_ts: self.commit_ts, - })); - } let tag = self.tag().get_str(); - let mut read_again = false; let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); - // Separate the lock and write flashback to prevent from putting two writes for - // the same key in a single batch to make the TiCDC panic. let next_state = match self.state { - FlashbackToVersionState::ScanLock { next_lock_key, .. } => { - let (mut key_locks, has_remain_locks) = flashback_to_version_read_lock( + FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { + let mut key_locks = flashback_to_version_read_lock( &mut reader, next_lock_key, &self.end_key, statistics, )?; - if key_locks.is_empty() && !has_remain_locks { - // No more locks to flashback, continue to scan the writes. - read_again = true; - FlashbackToVersionState::ScanWrite { - next_write_key: self.start_key.clone(), - keys: Vec::new(), + if key_locks.is_empty() { + // No more locks to rollback, continue to the prewrite phase. + FlashbackToVersionState::Prewrite { + key_to_lock: self.start_key.clone(), } } else { - assert!(!key_locks.is_empty()); tls_collect_keyread_histogram_vec(tag, key_locks.len() as f64); - FlashbackToVersionState::ScanLock { - // DO NOT pop the last key as the next key when it's the only key to prevent - // from making flashback fall into a dead loop. 
+ FlashbackToVersionState::RollbackLock { next_lock_key: if key_locks.len() > 1 { key_locks.pop().map(|(key, _)| key).unwrap() } else { @@ -127,54 +147,59 @@ impl ReadCommand for FlashbackToVersionReadPhase { } } } - FlashbackToVersionState::ScanWrite { next_write_key, .. } => { + FlashbackToVersionState::FlashbackWrite { next_write_key, .. } => { + if self.commit_ts <= self.start_ts { + return Err(Error::from(ErrorInner::InvalidTxnTso { + start_ts: self.start_ts, + commit_ts: self.commit_ts, + })); + } + // If the key is not locked, it means that the key has been committed before and + // we are in a retry. + if next_write_key == self.start_key && reader.load_lock(&next_write_key)?.is_none() + { + return Ok(ProcessResult::Res); + } let mut keys = flashback_to_version_read_write( &mut reader, next_write_key, + &self.start_key, &self.end_key, self.version, self.commit_ts, statistics, )?; if keys.is_empty() { - // No more writes to flashback, just return. - return Ok(ProcessResult::Res); - } - tls_collect_keyread_histogram_vec(tag, keys.len() as f64); - FlashbackToVersionState::ScanWrite { - next_write_key: if keys.len() > 1 { - keys.pop().unwrap() - } else { - keys.last().unwrap().clone() - }, - keys, + FlashbackToVersionState::Commit { + key_to_commit: self.start_key.clone(), + } + } else { + tls_collect_keyread_histogram_vec(tag, keys.len() as f64); + FlashbackToVersionState::FlashbackWrite { + // DO NOT pop the last key as the next key when it's the only key to prevent + // from making flashback fall into a dead loop. 
+ next_write_key: if keys.len() > 1 { + keys.pop().unwrap() + } else { + keys.last().unwrap().clone() + }, + keys, + } } } + _ => unreachable!(), }; Ok(ProcessResult::NextCommand { - cmd: if read_again { - Command::FlashbackToVersionReadPhase(FlashbackToVersionReadPhase { - ctx: self.ctx, - deadline: self.deadline, - start_ts: self.start_ts, - commit_ts: self.commit_ts, - version: self.version, - start_key: self.start_key, - end_key: self.end_key, - state: next_state, - }) - } else { - Command::FlashbackToVersion(FlashbackToVersion { - ctx: self.ctx, - deadline: self.deadline, - start_ts: self.start_ts, - commit_ts: self.commit_ts, - version: self.version, - start_key: self.start_key, - end_key: self.end_key, - state: next_state, - }) - }, + cmd: Command::FlashbackToVersion(FlashbackToVersion { + ctx: self.ctx, + deadline: self.deadline, + start_ts: self.start_ts, + commit_ts: self.commit_ts, + version: self.version, + start_key: self.start_key, + end_key: self.end_key, + state: next_state, + }), }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index c09ca934fa0..4d3f32fa9cd 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -43,7 +43,8 @@ pub use compare_and_swap::RawCompareAndSwap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; pub use flashback_to_version::FlashbackToVersion; pub use flashback_to_version_read_phase::{ - new_flashback_to_version_read_phase_cmd, FlashbackToVersionReadPhase, FlashbackToVersionState, + new_flashback_rollback_lock_cmd, new_flashback_write_cmd, FlashbackToVersionReadPhase, + FlashbackToVersionState, }; use kvproto::kvrpcpb::*; pub use mvcc_by_key::MvccByKey; @@ -361,9 +362,21 @@ impl From for TypedCommand> { } } +impl From for TypedCommand<()> { + fn from(mut req: PrepareFlashbackToVersionRequest) -> Self { + new_flashback_rollback_lock_cmd( + req.get_start_ts().into(), + req.get_version().into(), + Key::from_raw(req.get_start_key()), + 
Key::from_raw(req.get_end_key()), + req.take_context(), + ) + } +} + impl From for TypedCommand<()> { fn from(mut req: FlashbackToVersionRequest) -> Self { - new_flashback_to_version_read_phase_cmd( + new_flashback_write_cmd( req.get_start_ts().into(), req.get_commit_ts().into(), req.get_version().into(), diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 86ceda2bdf1..f6884b0efb8 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -24,8 +24,8 @@ pub use self::{ cleanup::cleanup, commit::commit, flashback_to_version::{ - flashback_to_version_lock, flashback_to_version_read_lock, - flashback_to_version_read_write, flashback_to_version_write, FLASHBACK_BATCH_SIZE, + flashback_to_version_read_lock, flashback_to_version_read_write, + flashback_to_version_write, rollback_locks, FLASHBACK_BATCH_SIZE, }, gc::gc, prewrite::{prewrite, CommitKind, TransactionKind, TransactionProperties}, From 916d5137b475da88452fde3e20b08126d5cdf3cf Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Tue, 29 Nov 2022 22:50:00 +0800 Subject: [PATCH 370/676] cop: set the error field properly for the batch cop task response (#13857) close tikv/tikv#13856 Fill the batch task response error field properly, the error field should be None if no error happens. 
Signed-off-by: cfzjywxk --- src/coprocessor/endpoint.rs | 8 ++++++-- tests/integrations/coprocessor/test_select.rs | 9 +++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 51927cd6b56..1fefb2a55ae 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -564,8 +564,12 @@ impl Endpoint { match res { Ok(mut resp) => { response.set_data(resp.take_data()); - response.set_region_error(resp.take_region_error()); - response.set_locked(resp.take_locked()); + if let Some(err) = resp.region_error.take() { + response.set_region_error(err); + } + if let Some(lock_info) = resp.locked.take() { + response.set_locked(lock_info); + } response.set_other_error(resp.take_other_error()); GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { tracker.write_scan_detail( diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index c802b697872..ad195f62774 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2176,14 +2176,23 @@ fn test_batch_request() { row_count += 1; } assert_eq!(row_count, expected_len); + assert!(region_err.is_none()); + assert!(locked.is_none()); + assert!(other_err.is_empty()); } QueryResult::ErrRegion => { assert!(region_err.is_some()); + assert!(locked.is_none()); + assert!(other_err.is_empty()); } QueryResult::ErrLocked => { + assert!(region_err.is_none()); assert!(locked.is_some()); + assert!(other_err.is_empty()); } QueryResult::ErrOther => { + assert!(region_err.is_none()); + assert!(locked.is_none()); assert!(!other_err.is_empty()) } } From 05aed39fb8d693bde91cff2ef94c7251ed513f56 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Wed, 30 Nov 2022 11:20:00 +0800 Subject: [PATCH 371/676] cop: disable the coprocessor cache path for batched task processing (#13859) ref tikv/tikv#13858 Disable the coprocessor cache path for the batched task processing, 
the derived fields from the original task could not be used by the batched tasks. Signed-off-by: cfzjywxk --- src/coprocessor/endpoint.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 1fefb2a55ae..3274700d812 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -538,6 +538,9 @@ impl Endpoint { .iter_mut() .map(|task| { let mut new_req = req.clone(); + // Disable the coprocessor cache path for the batched tasks, the + // coprocessor cache related fields are not passed in the "task" by now. + new_req.is_cache_enabled = false; new_req.ranges = task.take_ranges(); let new_context = new_req.mut_context(); new_context.set_region_id(task.get_region_id()); From 2e18d0da5f19d1231ad0b48a38d6c85cf4ac32db Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 30 Nov 2022 14:50:00 +0800 Subject: [PATCH 372/676] tikv_kv: make async_write return stream (#13854) ref tikv/tikv#13827 This PR abstracts write interface with Stream trait so that we can keep compatible with both v1 and v2. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/tikv_kv/src/btree_engine.rs | 23 +- components/tikv_kv/src/lib.rs | 100 ++++++-- components/tikv_kv/src/mock_engine.rs | 23 +- components/tikv_kv/src/rocksdb_engine.rs | 72 ++++-- components/tikv_util/src/future.rs | 20 ++ components/tikv_util/src/mpsc/future.rs | 74 +++++- src/server/gc_worker/gc_worker.rs | 24 +- src/server/raftkv.rs | 310 ++++++++++++++--------- src/storage/mod.rs | 95 +++---- src/storage/raw/raw_mvcc.rs | 22 +- src/storage/txn/scheduler.rs | 182 ++++++------- tests/Cargo.toml | 1 + tests/benches/hierarchy/mvcc/mod.rs | 2 +- tests/benches/misc/raftkv/mod.rs | 15 +- 15 files changed, 591 insertions(+), 373 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a036117bfb..f1d02f06af9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5983,6 +5983,7 @@ dependencies = [ "tidb_query_executors", "tidb_query_expr", "tikv", + "tikv_kv", "tikv_util", "time", "tipb", diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 45ce6a6ffe8..35f666896f3 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -14,14 +14,14 @@ use std::{ use collections::HashMap; use engine_panic::PanicEngine; use engine_traits::{CfName, IterOptions, ReadOptions, CF_DEFAULT, CF_LOCK, CF_WRITE}; -use futures::Future; +use futures::{future, stream, Future, Stream}; use kvproto::kvrpcpb::Context; use txn_types::{Key, Value}; use super::SnapContext; use crate::{ - Callback as EngineCallback, DummySnapshotExt, Engine, Error as EngineError, - ErrorInner as EngineErrorInner, Iterator, Modify, Result as EngineResult, Snapshot, WriteData, + DummySnapshotExt, Engine, Error as EngineError, ErrorInner as EngineErrorInner, Iterator, + Modify, OnAppliedCb, Result as EngineResult, Snapshot, WriteData, WriteEvent, }; type RwLockTree = RwLock>; @@ -87,18 +87,21 @@ impl Engine for BTreeEngine { unimplemented!(); } + type WriteRes = 
impl Stream + Send; fn async_write( &self, _ctx: &Context, batch: WriteData, - cb: EngineCallback<()>, - ) -> EngineResult<()> { - if batch.modifies.is_empty() { - return Err(EngineError::from(EngineErrorInner::EmptyRequest)); - } - cb(write_modifies(self, batch.modifies)); + _subscribed: u8, + _on_applied: Option, + ) -> Self::WriteRes { + let res = if batch.modifies.is_empty() { + Err(EngineError::from(EngineErrorInner::EmptyRequest)) + } else { + write_modifies(self, batch.modifies) + }; - Ok(()) + stream::once(future::ready(WriteEvent::Finished(res))) } type SnapshotRes = impl Future> + Send; diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index ac452fead37..07cae3ace65 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -66,6 +66,7 @@ pub const SEEK_BOUND: u64 = 8; const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); pub type Callback = Box) + Send>; +pub type OnAppliedCb = Box) + Send>; pub type ExtCallback = Box; pub type Result = result::Result; @@ -154,7 +155,7 @@ impl From for raft_cmdpb::Request { // For test purpose only. // It's used to simulate observer actions in `rocksdb_engine`. See -// `RocksEngine::async_write_ext()`. +// `RocksEngine::async_write()`. impl From for Modify { fn from(mut req: raft_cmdpb::Request) -> Modify { let name_to_cf = |name: &str| -> Option { @@ -249,6 +250,37 @@ impl WriteData { } } +/// Events that can subscribed from the `WriteSubscriber`. +pub enum WriteEvent { + Proposed, + Committed, + /// The write is either aborted or applied. 
+ Finished(Result<()>), +} + +impl WriteEvent { + pub const EVENT_PROPOSED: u8 = 1; + pub const EVENT_COMMITTED: u8 = 1 << 1; + pub const ALL_EVENTS: u8 = Self::EVENT_PROPOSED | Self::EVENT_COMMITTED; + pub const BASIC_EVENT: u8 = 0; + + #[inline] + pub fn event_capacity(subscribed: u8) -> usize { + 1 + Self::subscribed_proposed(subscribed) as usize + + Self::subscribed_committed(subscribed) as usize + } + + #[inline] + pub fn subscribed_proposed(ev: u8) -> bool { + ev & Self::EVENT_PROPOSED != 0 + } + + #[inline] + pub fn subscribed_committed(ev: u8) -> bool { + ev & Self::EVENT_COMMITTED != 0 + } +} + #[derive(Debug, Clone, Default)] pub struct SnapContext<'a> { pub pb_ctx: &'a Context, @@ -280,6 +312,10 @@ pub trait Engine: Send + Clone + 'static { fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()>; type SnapshotRes: Future> + Send + 'static; + /// Get a snapshot asynchronously. + /// + /// Note the snapshot is queried immediately no matter whether the returned + /// future is polled or not. fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes; /// Precheck request which has write with it's context. @@ -287,26 +323,42 @@ pub trait Engine: Send + Clone + 'static { Ok(()) } - fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()>; - - /// Writes data to the engine asynchronously with some extensions. + type WriteRes: Stream + Unpin + Send + 'static; + /// Writes data to the engine asynchronously. + /// + /// You can subscribe special events like `EVENT_PROPOSED` and + /// `EVENT_COMMITTED`. /// - /// When the write request is proposed successfully, the `proposed_cb` is - /// invoked. When the write request is finished, the `write_cb` is invoked. - fn async_write_ext( + /// `on_applied` is called right in the processing thread before being + /// fed to the stream. + /// + /// Note the write is started no matter whether the returned stream is + /// polled or not. 
+ fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - _proposed_cb: Option, - _committed_cb: Option, - ) -> Result<()> { - self.async_write(ctx, batch, write_cb) - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes; fn write(&self, ctx: &Context, batch: WriteData) -> Result<()> { - wait_op!(|cb| self.async_write(ctx, batch, cb), DEFAULT_TIMEOUT) - .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT)))) + let f = write(self, ctx, batch, None); + let timeout = GLOBAL_TIMER_HANDLE + .delay(Instant::now() + DEFAULT_TIMEOUT) + .compat(); + + futures::executor::block_on(async move { + futures::select! { + res = f.fuse() => { + if let Some(res) = res { + return res; + } + }, + _ = timeout.fuse() => (), + }; + Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT))) + }) } fn release_snapshot(&mut self) {} @@ -617,6 +669,24 @@ pub fn snapshot( } } +pub fn write( + engine: &E, + ctx: &Context, + batch: WriteData, + on_applied: Option, +) -> impl std::future::Future>> { + let mut res = engine.async_write(ctx, batch, WriteEvent::BASIC_EVENT, on_applied); + async move { + loop { + match res.next().await { + Some(WriteEvent::Finished(res)) => return Some(res), + Some(_) => (), + None => return None, + } + } + } +} + /// Write modifications into a `BaseRocksEngine` instance. 
pub fn write_modifies(kv_engine: &impl LocalEngine, modifies: Vec) -> Result<()> { fail_point!("rockskv_write_modifies", |_| Err(box_err!("write failed"))); diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index 376c2d1fb1f..f3d89940f4e 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -9,7 +9,7 @@ use collections::HashMap; use kvproto::kvrpcpb::Context; use super::Result; -use crate::{Callback, Engine, ExtCallback, Modify, RocksEngine, SnapContext, WriteData}; +use crate::{Engine, Modify, OnAppliedCb, RocksEngine, SnapContext, WriteData, WriteEvent}; /// A mock engine is a simple wrapper around RocksEngine /// but with the ability to assert the modifies, @@ -162,31 +162,26 @@ impl Engine for MockEngine { self.base.async_snapshot(ctx) } - fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()> { - self.async_write_ext(ctx, batch, write_cb, None, None) - } - - fn async_write_ext( + type WriteRes = ::WriteRes; + fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - proposed_cb: Option, - committed_cb: Option, - ) -> Result<()> { + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { if let Some(expected_modifies) = self.expected_modifies.as_ref() { let mut expected_writes = expected_modifies.0.lock().unwrap(); check_expected_write( &mut expected_writes, &batch.modifies, - proposed_cb.is_some(), - committed_cb.is_some(), + WriteEvent::subscribed_proposed(subscribed), + WriteEvent::subscribed_committed(subscribed), ); } let mut last_modifies = self.last_modifies.lock().unwrap(); last_modifies.push(batch.modifies.clone()); - self.base - .async_write_ext(ctx, batch, write_cb, proposed_cb, committed_cb) + self.base.async_write(ctx, batch, subscribed, on_applied) } } diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 8b0dd28646a..565ea0accaa 100644 --- 
a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -2,10 +2,12 @@ use std::{ fmt::{self, Debug, Display, Formatter}, + pin::Pin, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, }, + task::Poll, time::Duration, }; @@ -18,7 +20,10 @@ use engine_traits::{ CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, }; use file_system::IoRateLimiter; -use futures::{channel::oneshot, Future}; +use futures::{ + channel::{mpsc, oneshot}, + stream, Future, Stream, +}; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb}; use raftstore::coprocessor::CoprocessorHost; use tempfile::{Builder, TempDir}; @@ -26,9 +31,10 @@ use tikv_util::worker::{Runnable, Scheduler, Worker}; use txn_types::{Key, Value}; use super::{ - write_modifies, Callback, DummySnapshotExt, Engine, Error, ErrorInner, ExtCallback, + write_modifies, Callback, DummySnapshotExt, Engine, Error, ErrorInner, Iterator as EngineIterator, Modify, Result, SnapContext, Snapshot, WriteData, }; +use crate::{OnAppliedCb, WriteEvent}; // Duplicated in test_engine_builder const TEMP_DIR: &str = ""; @@ -226,34 +232,48 @@ impl Engine for RocksEngine { Ok(()) } - fn async_write(&self, ctx: &Context, batch: WriteData, cb: Callback<()>) -> Result<()> { - self.async_write_ext(ctx, batch, cb, None, None) - } - - fn async_write_ext( + type WriteRes = impl Stream + Send + 'static; + fn async_write( &self, - _: &Context, + _ctx: &Context, batch: WriteData, - cb: Callback<()>, - proposed_cb: Option, - committed_cb: Option, - ) -> Result<()> { - fail_point!("rockskv_async_write", |_| Err(box_err!("write failed"))); - - if batch.modifies.is_empty() { - return Err(Error::from(ErrorInner::EmptyRequest)); - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let (mut tx, mut rx) = mpsc::channel::(WriteEvent::event_capacity(subscribed)); + let res = (move || { + fail_point!("rockskv_async_write", |_| Err(box_err!("write failed"))); + + if 
batch.modifies.is_empty() { + return Err(Error::from(ErrorInner::EmptyRequest)); + } - let batch = self.pre_propose(batch)?; + let batch = self.pre_propose(batch)?; - if let Some(cb) = proposed_cb { - cb(); - } - if let Some(cb) = committed_cb { - cb(); - } - box_try!(self.sched.schedule(Task::Write(batch.modifies, cb))); - Ok(()) + if WriteEvent::subscribed_proposed(subscribed) { + let _ = tx.try_send(WriteEvent::Proposed); + } + if WriteEvent::subscribed_committed(subscribed) { + let _ = tx.try_send(WriteEvent::Committed); + } + let cb = Box::new(move |mut res| { + if let Some(cb) = on_applied { + cb(&mut res); + } + let _ = tx.try_send(WriteEvent::Finished(res)); + }); + box_try!(self.sched.schedule(Task::Write(batch.modifies, cb))); + Ok(()) + })(); + let mut res = Some(res); + stream::poll_fn(move |cx| { + if res.as_ref().map_or(false, |r| r.is_err()) { + return Poll::Ready(res.take().map(WriteEvent::Finished)); + } + // If it's none, it means an error is returned, it should not be polled again. + assert!(res.is_some()); + Pin::new(&mut rx).poll_next(cx) + }) } type SnapshotRes = impl Future> + Send; diff --git a/components/tikv_util/src/future.rs b/components/tikv_util/src/future.rs index 5f4c5b43817..7b22bebb482 100644 --- a/components/tikv_util/src/future.rs +++ b/components/tikv_util/src/future.rs @@ -197,6 +197,18 @@ impl ArcWake for PollAtWake { } } +/// Poll the future immediately. If the future is ready, returns the result. +/// Otherwise just ignore the future. +#[inline] +pub fn try_poll(f: impl Future) -> Option { + futures::executor::block_on(async move { + futures::select_biased! { + res = f.fuse() => Some(res), + _ = futures::future::ready(()).fuse() => None, + } + }) +} + #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; @@ -232,4 +244,12 @@ mod tests { // 3. 
future gets ready, ignore NOTIFIED assert_eq!(poll_times.load(Ordering::SeqCst), 2); } + + #[test] + fn test_try_poll() { + let f = futures::future::ready(1); + assert_eq!(try_poll(f), Some(1)); + let f = futures::future::pending::<()>(); + assert_eq!(try_poll(f), None); + } } diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index 1e9f94c2f2d..00598f5295d 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -10,10 +10,49 @@ use std::{ use crossbeam::{ channel::{SendError, TryRecvError}, - queue::SegQueue, + queue::{ArrayQueue, SegQueue}, }; use futures::{task::AtomicWaker, Stream, StreamExt}; +enum QueueType { + Unbounded(SegQueue), + Bounded(ArrayQueue), +} + +impl QueueType { + fn len(&self) -> usize { + match self { + QueueType::Unbounded(q) => q.len(), + QueueType::Bounded(q) => q.len(), + } + } + + fn bounded(cap: usize) -> QueueType { + QueueType::Bounded(ArrayQueue::new(cap)) + } + + fn unbounded() -> QueueType { + QueueType::Unbounded(SegQueue::new()) + } + + fn push_back(&self, t: T) -> Result<(), SendError> { + match self { + QueueType::Unbounded(q) => { + q.push(t); + Ok(()) + } + QueueType::Bounded(q) => q.push(t).map_err(SendError), + } + } + + fn pop_front(&self) -> Option { + match self { + QueueType::Unbounded(q) => q.pop(), + QueueType::Bounded(q) => q.pop(), + } + } +} + #[derive(Clone, Copy)] pub enum WakePolicy { Immediately, @@ -21,7 +60,7 @@ pub enum WakePolicy { } struct Queue { - queue: SegQueue, + queue: QueueType, waker: AtomicWaker, liveness: AtomicUsize, policy: WakePolicy, @@ -62,9 +101,9 @@ impl Sender { pub fn send_with(&self, t: T, policy: WakePolicy) -> Result<(), SendError> { let queue = unsafe { &*self.queue }; if queue.liveness.load(Ordering::Acquire) & RECEIVER_COUNT_BASE != 0 { - queue.queue.push(t); + let res = queue.queue.push_back(t); queue.wake(policy); - return Ok(()); + return res; } Err(SendError(t)) } @@ -110,12 +149,12 @@ 
impl Stream for Receiver { #[inline] fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let queue = unsafe { &*self.queue }; - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Poll::Ready(Some(t)); } queue.waker.register(cx.waker()); // In case the message is pushed right before registering waker. - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Poll::Ready(Some(t)); } if queue.liveness.load(Ordering::Acquire) & !RECEIVER_COUNT_BASE != 0 { @@ -129,7 +168,7 @@ impl Receiver { #[inline] pub fn try_recv(&mut self) -> Result { let queue = unsafe { &*self.queue }; - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Ok(t); } if queue.liveness.load(Ordering::Acquire) & !RECEIVER_COUNT_BASE != 0 { @@ -156,9 +195,19 @@ impl Drop for Receiver { unsafe impl Send for Receiver {} +#[inline] pub fn unbounded(policy: WakePolicy) -> (Sender, Receiver) { + with_queue(QueueType::unbounded(), policy) +} + +#[inline] +pub fn bounded(cap: usize, policy: WakePolicy) -> (Sender, Receiver) { + with_queue(QueueType::bounded(cap), policy) +} + +fn with_queue(queue: QueueType, policy: WakePolicy) -> (Sender, Receiver) { let queue = Box::into_raw(Box::new(Queue { - queue: SegQueue::new(), + queue, waker: AtomicWaker::new(), liveness: AtomicUsize::new(SENDER_COUNT_BASE | RECEIVER_COUNT_BASE), policy, @@ -430,4 +479,13 @@ mod tests { drop(tx1); assert!(dropped.load(Ordering::SeqCst)); } + + #[test] + fn test_bounded() { + let (tx, mut rx) = super::bounded(1, WakePolicy::Immediately); + tx.send(1).unwrap(); + tx.send(2).unwrap_err(); + assert_eq!(rx.try_recv().unwrap(), 1); + rx.try_recv().unwrap_err(); + } } diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 9c3c289ecf7..0a162a58230 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -1293,16 +1293,13 @@ pub mod test_gc_worker { 
metapb::{Peer, Region}, }; use raftstore::store::RegionSnapshot; - use tikv_kv::write_modifies; + use tikv_kv::{write_modifies, OnAppliedCb}; use txn_types::{Key, TimeStamp}; use crate::{ server::gc_worker::{GcSafePointProvider, Result as GcWorkerResult}, storage::{ - kv::{ - self, Callback as EngineCallback, Modify, Result as EngineResult, SnapContext, - WriteData, - }, + kv::{self, Modify, Result as EngineResult, SnapContext, WriteData}, Engine, }, }; @@ -1355,12 +1352,14 @@ pub mod test_gc_worker { write_modifies(&self.kv_engine().unwrap(), modifies) } + type WriteRes = ::WriteRes; fn async_write( &self, ctx: &Context, mut batch: WriteData, - callback: EngineCallback<()>, - ) -> EngineResult<()> { + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { batch.modifies.iter_mut().for_each(|modify| match modify { Modify::Delete(_, ref mut key) => { *key = Key::from_encoded(keys::data_key(key.as_encoded())); @@ -1376,7 +1375,7 @@ pub mod test_gc_worker { *end_key = Key::from_encoded(keys::data_end_key(end_key.as_encoded())); } }); - self.0.async_write(ctx, batch, callback) + self.0.async_write(ctx, batch, subscribed, on_applied) } type SnapshotRes = impl Future> + Send; @@ -1427,13 +1426,16 @@ pub mod test_gc_worker { Ok(()) } + type WriteRes = ::WriteRes; fn async_write( &self, ctx: &Context, batch: WriteData, - callback: EngineCallback<()>, - ) -> EngineResult<()> { - self.engines.lock().unwrap()[&ctx.region_id].async_write(ctx, batch, callback) + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.engines.lock().unwrap()[&ctx.region_id] + .async_write(ctx, batch, subscribed, on_applied) } type SnapshotRes = impl Future> + Send; diff --git a/src/server/raftkv.rs b/src/server/raftkv.rs index 6dc84f951ee..b6890262007 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv.rs @@ -3,19 +3,25 @@ // #[PerformanceCriticalPath] use std::{ borrow::Cow, + cell::UnsafeCell, fmt::{self, Debug, Display, Formatter}, io::Error as IoError, mem, 
num::NonZeroU64, + pin::Pin, result, - sync::{Arc, RwLock}, + sync::{ + atomic::{AtomicU8, Ordering}, + Arc, RwLock, + }, + task::Poll, time::Duration, }; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; -use futures::{future::BoxFuture, Future}; +use futures::{future::BoxFuture, task::AtomicWaker, Future, Stream, StreamExt}; use kvproto::{ errorpb, kvrpcpb::{Context, IsolationLevel}, @@ -35,12 +41,12 @@ use raftstore::{ errors::Error as RaftServerError, router::{LocalReadRouter, RaftStoreRouter}, store::{ - Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, + self, Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, RegionSnapshot, WriteResponse, }, }; use thiserror::Error; -use tikv_kv::write_modifies; +use tikv_kv::{write_modifies, OnAppliedCb, WriteEvent}; use tikv_util::{ codec::number::NumberEncoder, future::{paired_future_callback, paired_must_called_future_callback}, @@ -51,10 +57,7 @@ use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; use crate::storage::{ self, kv, - kv::{ - Callback, Engine, Error as KvError, ErrorInner as KvErrorInner, ExtCallback, Modify, - SnapContext, WriteData, - }, + kv::{Engine, Error as KvError, ErrorInner as KvErrorInner, Modify, SnapContext, WriteData}, }; #[derive(Debug, Error)] @@ -78,19 +81,6 @@ pub enum Error { Timeout(Duration), } -fn get_status_kind_from_error(e: &Error) -> RequestStatusKind { - match *e { - Error::RequestFailed(ref header) => { - RequestStatusKind::from(storage::get_error_kind_from_header(header)) - } - Error::Io(_) => RequestStatusKind::err_io, - Error::Server(_) => RequestStatusKind::err_server, - Error::InvalidResponse(_) => RequestStatusKind::err_invalid_resp, - Error::InvalidRequest(_) => RequestStatusKind::err_invalid_req, - Error::Timeout(_) => RequestStatusKind::err_timeout, - } -} - fn 
get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { match *e { KvError(box KvErrorInner::Request(ref header)) => { @@ -208,6 +198,95 @@ pub fn drop_snapshot_callback() -> kv::Result { Err(kv::Error::from(kv::ErrorInner::Request(err))) } +struct WriteResCore { + ev: AtomicU8, + result: UnsafeCell>>, + wake: AtomicWaker, +} + +struct WriteResSub { + notified_ev: u8, + core: Arc, +} + +unsafe impl Send for WriteResSub {} + +impl Stream for WriteResSub { + type Item = WriteEvent; + + #[inline] + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let mut s = self.as_mut(); + let mut cur_ev = s.core.ev.load(Ordering::Acquire); + if cur_ev == s.notified_ev { + s.core.wake.register(cx.waker()); + cur_ev = s.core.ev.load(Ordering::Acquire); + if cur_ev == s.notified_ev { + return Poll::Pending; + } + } + s.notified_ev = cur_ev; + match cur_ev { + WriteEvent::EVENT_PROPOSED => Poll::Ready(Some(WriteEvent::Proposed)), + WriteEvent::EVENT_COMMITTED => Poll::Ready(Some(WriteEvent::Committed)), + u8::MAX => { + let result = unsafe { (*s.core.result.get()).take().unwrap() }; + Poll::Ready(Some(WriteEvent::Finished(result))) + } + e => panic!("unexpected event {}", e), + } + } +} + +#[derive(Clone)] +struct WriteResFeed { + core: Arc, +} + +unsafe impl Send for WriteResFeed {} + +impl WriteResFeed { + fn pair() -> (Self, WriteResSub) { + let core = Arc::new(WriteResCore { + ev: AtomicU8::new(0), + result: UnsafeCell::new(None), + wake: AtomicWaker::new(), + }); + ( + Self { core: core.clone() }, + WriteResSub { + notified_ev: 0, + core, + }, + ) + } + + fn notify_proposed(&self) { + self.core + .ev + .store(WriteEvent::EVENT_PROPOSED, Ordering::Release); + self.core.wake.wake(); + } + + fn notify_committed(&self) { + self.core + .ev + .store(WriteEvent::EVENT_COMMITTED, Ordering::Release); + self.core.wake.wake(); + } + + fn notify(&self, result: kv::Result<()>) { + unsafe { + (*self.core.result.get()) = Some(result); 
+ } + self.core.ev.store(u8::MAX, Ordering::Release); + self.core.wake.wake(); + } +} + /// `RaftKv` is a storage engine base on `RaftStore`. #[derive(Clone)] pub struct RaftKv @@ -239,66 +318,6 @@ where pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { self.txn_extra_scheduler = Some(txn_extra_scheduler); } - - fn exec_write_requests( - &self, - ctx: &Context, - batch: WriteData, - write_cb: Callback>, - proposed_cb: Option, - committed_cb: Option, - ) -> Result<()> { - #[cfg(feature = "failpoints")] - { - // If rid is some, only the specified region reports error. - // If rid is None, all regions report error. - let raftkv_early_error_report_fp = || -> Result<()> { - fail_point!("raftkv_early_error_report", |rid| { - let region_id = ctx.get_region_id(); - rid.and_then(|rid| { - let rid: u64 = rid.parse().unwrap(); - if rid == region_id { None } else { Some(()) } - }) - .ok_or_else(|| RaftServerError::RegionNotFound(region_id).into()) - }); - Ok(()) - }; - raftkv_early_error_report_fp()?; - } - - let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); - let txn_extra = batch.extra; - let mut header = new_request_header(ctx); - let mut flags = 0; - if txn_extra.one_pc { - flags |= WriteBatchFlags::ONE_PC.bits(); - } - if txn_extra.for_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } - header.set_flags(flags); - - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.set_requests(reqs.into()); - - self.schedule_txn_extra(txn_extra); - - let cb = StoreCallback::write_ext( - Box::new(move |resp| { - write_cb(on_write_result(resp).map_err(Error::into)); - }), - proposed_cb, - committed_cb, - ); - let extra_opts = RaftCmdExtraOpts { - deadline: batch.deadline, - disk_full_opt: batch.disk_full_opt, - }; - self.router.send_command(cmd, cb, extra_opts)?; - - Ok(()) - } } fn invalid_resp_type(exp: CmdType, act: CmdType) -> Error { @@ -383,59 +402,116 @@ where } } + type WriteRes = impl Stream + Send + Unpin; 
fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - ) -> kv::Result<()> { - self.async_write_ext(ctx, batch, write_cb, None, None) - } - - fn async_write_ext( - &self, - ctx: &Context, - batch: WriteData, - write_cb: Callback<()>, - proposed_cb: Option, - committed_cb: Option, - ) -> kv::Result<()> { - fail_point!("raftkv_async_write"); - if batch.modifies.is_empty() { - return Err(KvError::from(KvErrorInner::EmptyRequest)); - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let mut res = (|| { + fail_point!("raftkv_async_write"); + if batch.modifies.is_empty() { + return Err(KvError::from(KvErrorInner::EmptyRequest)); + } + Ok(()) + })(); ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); let begin_instant = Instant::now_coarse(); - self.exec_write_requests( - ctx, - batch, - Box::new(move |res| match res { + if res.is_ok() { + // If rid is some, only the specified region reports error. + // If rid is None, all regions report error. + res = (|| { + fail_point!("raftkv_early_error_report", |rid| { + let region_id = ctx.get_region_id(); + rid.and_then(|rid| { + let rid: u64 = rid.parse().unwrap(); + if rid == region_id { None } else { Some(()) } + }) + .ok_or_else(|| RaftServerError::RegionNotFound(region_id).into()) + }); + Ok(()) + })(); + } + + let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); + let txn_extra = batch.extra; + let mut header = new_request_header(ctx); + let mut flags = 0; + if txn_extra.one_pc { + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if txn_extra.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(reqs.into()); + + self.schedule_txn_extra(txn_extra); + + let (tx, rx) = WriteResFeed::pair(); + let proposed_cb = if !WriteEvent::subscribed_proposed(subscribed) { + None + } else { + let tx = tx.clone(); + Some(Box::new(move || tx.notify_proposed()) 
as store::ExtCallback) + }; + let committed_cb = if !WriteEvent::subscribed_committed(subscribed) { + None + } else { + let tx = tx.clone(); + Some(Box::new(move || tx.notify_committed()) as store::ExtCallback) + }; + let applied_tx = tx.clone(); + let applied_cb = Box::new(move |resp: WriteResponse| { + let mut res = match on_write_result::(resp) { Ok(CmdRes::Resp(_)) => { + fail_point!("raftkv_async_write_finish"); + Ok(()) + } + Ok(CmdRes::Snap(_)) => Err(box_err!("unexpect snapshot, should mutate instead.")), + Err(e) => Err(kv::Error::from(e)), + }; + if let Some(cb) = on_applied { + cb(&mut res); + } + applied_tx.notify(res); + }); + + let cb = StoreCallback::write_ext(applied_cb, proposed_cb, committed_cb); + let extra_opts = RaftCmdExtraOpts { + deadline: batch.deadline, + disk_full_opt: batch.disk_full_opt, + }; + if res.is_ok() { + res = self + .router + .send_command(cmd, cb, extra_opts) + .map_err(kv::Error::from); + } + if res.is_err() { + tx.notify(res); + } + rx.inspect(move |ev| { + let WriteEvent::Finished(res) = ev else { return }; + match res { + Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); ASYNC_REQUESTS_DURATIONS_VEC .write .observe(begin_instant.saturating_elapsed_secs()); - fail_point!("raftkv_async_write_finish"); - write_cb(Ok(())) - } - Ok(CmdRes::Snap(_)) => { - write_cb(Err(box_err!("unexpect snapshot, should mutate instead."))) } Err(e) => { - let status_kind = get_status_kind_from_engine_error(&e); + let status_kind = get_status_kind_from_engine_error(e); ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - write_cb(Err(e)) } - }), - proposed_cb, - committed_cb, - ) - .map_err(|e| { - let status_kind = get_status_kind_from_error(&e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - e.into() + } }) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index b87ab8c4a6d..32cd7c11000 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -64,6 +64,7 @@ use std::{ borrow::Cow, iter, 
marker::PhantomData, + mem, sync::{ atomic::{self, AtomicBool, AtomicU64}, Arc, @@ -87,9 +88,10 @@ use pd_client::FeatureGate; use raftstore::store::{util::build_key_range, ReadStats, TxnExt, WriteStats}; use rand::prelude::*; use resource_metering::{FutureExt, ResourceTagFactory}; -use tikv_kv::SnapshotExt; +use tikv_kv::{OnAppliedCb, SnapshotExt}; use tikv_util::{ deadline::Deadline, + future::try_poll, quota_limiter::QuotaLimiter, time::{duration_to_ms, Instant, ThreadReadId}, }; @@ -1548,11 +1550,18 @@ impl Storage { let mut batch = WriteData::from_modifies(modifies); batch.set_allowed_on_disk_almost_full(); - self.engine.async_write( + let res = kv::write( + &self.engine, &ctx, batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; + Some(Box::new(|res| { + callback(mem::replace(res, Ok(())).map_err(Error::from)) + })), + ); + // TODO: perhaps change delete_range API to return future. + if let Some(Some(Err(e))) = try_poll(res) { + return Err(Error::from(e)); + } KV_COMMAND_COUNTER_VEC_STATIC.delete_range.inc(); Ok(()) } @@ -1951,14 +1960,12 @@ impl Storage { let mut batch = WriteData::from_modifies(vec![m]); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2054,14 +2061,12 @@ impl Storage { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, ts.unwrap()); let mut batch = WriteData::from_modifies(modifies); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = 
tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2118,14 +2123,12 @@ impl Storage { let m = Self::raw_delete_request_to_modify(cf, key, ts.unwrap()); let mut batch = WriteData::from_modifies(vec![m]); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2171,14 +2174,12 @@ impl Storage { batch.set_allowed_on_disk_almost_full(); // TODO: special notification channel for API V2. 
- let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2231,14 +2232,12 @@ impl Storage { .collect(); let mut batch = WriteData::from_modifies(modifies); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2993,13 +2992,15 @@ impl Engine for TxnTestEngine { } } + type WriteRes = E::WriteRes; fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: tikv_kv::Callback<()>, - ) -> tikv_kv::Result<()> { - self.engine.async_write(ctx, batch, write_cb) + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.engine.async_write(ctx, batch, subscribed, on_applied) } } diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 6d86203e8f2..8c4ad5da08b 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -232,11 +232,7 @@ impl Iterator for RawMvccIterator { #[cfg(test)] mod tests { - use std::{ - fmt::Debug, - iter::Iterator as StdIterator, - 
sync::mpsc::{channel, Sender}, - }; + use std::iter::Iterator as StdIterator; use api_version::{ApiV2, KvFormat, RawValue}; use engine_traits::{raw_ttl::ttl_to_expire_ts, CF_DEFAULT}; @@ -244,21 +240,13 @@ mod tests { use tikv_kv::{Engine, Iterator as EngineIterator, Modify, WriteData}; use super::*; - use crate::storage::{raw::encoded::RawEncodeSnapshot, TestEngineBuilder}; - - fn expect_ok_callback(done: Sender, id: i32) -> tikv_kv::Callback { - Box::new(move |x: tikv_kv::Result| { - x.unwrap(); - done.send(id).unwrap(); - }) - } + use crate::storage::{kv, raw::encoded::RawEncodeSnapshot, TestEngineBuilder}; #[test] fn test_raw_mvcc_snapshot() { // Use `Engine` to be independent to `Storage`. // Do not set "api version" to use `Engine` as a raw RocksDB. let mut engine = TestEngineBuilder::new().build().unwrap(); - let (tx, rx) = channel(); let ctx = Context::default(); // TODO: Consider another way other than hard coding, to generate keys' prefix @@ -291,10 +279,8 @@ mod tests { ApiV2::encode_raw_value_owned(raw_value), ); let batch = WriteData::from_modifies(vec![m]); - engine - .async_write(&ctx, batch, expect_ok_callback(tx.clone(), 0)) - .unwrap(); - rx.recv().unwrap(); + let res = futures::executor::block_on(kv::write(&engine, &ctx, batch, None)).unwrap(); + res.unwrap(); } // snapshot diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 6fd9d150478..4657decf66f 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -39,7 +39,7 @@ use collections::HashMap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use crossbeam::utils::CachePadded; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use futures::compat::Future01CompatExt; +use futures::{compat::Future01CompatExt, StreamExt}; use kvproto::{ kvrpcpb::{self, CommandPri, Context, DiskFullOpt, ExtraOp}, pdpb::QueryKind, @@ -49,7 +49,7 @@ use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; use resource_metering::{FutureExt, 
ResourceTagFactory}; use smallvec::{smallvec, SmallVec}; -use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData}; +use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; use tikv_util::{ deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, }; @@ -63,8 +63,8 @@ use crate::{ errors::SharedError, get_causal_ts, get_priority_tag, get_raw_key_guard, kv::{ - self, with_tls_engine, Engine, ExtCallback, FlowStatsReporter, Result as EngineResult, - SnapContext, Statistics, + self, with_tls_engine, Engine, FlowStatsReporter, Result as EngineResult, SnapContext, + Statistics, }, lock_manager::{ self, @@ -346,18 +346,11 @@ impl SchedulerInner { .and_then(|tctx| if tctx.try_own() { tctx.cb.take() } else { None }) } - fn take_task_cb_and_pr( - &self, - cid: u64, - ) -> (Option, Option) { + fn take_task_cb(&self, cid: u64) -> Option { self.get_task_slot(cid) .get_mut(&cid) - .map(|tctx| (tctx.cb.take(), tctx.pr.take())) - .unwrap_or((None, None)) - } - - fn store_pr(&self, cid: u64, pr: ProcessResult) { - self.get_task_slot(cid).get_mut(&cid).unwrap().pr = Some(pr); + .map(|tctx| tctx.cb.take()) + .unwrap_or(None) } fn store_lock_changes( @@ -1133,7 +1126,6 @@ impl Scheduler { let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); let cid = task.cid; - let priority = task.cmd.priority(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -1313,65 +1305,16 @@ impl Scheduler { to_be_write.deadline = Some(deadline); let sched = scheduler.clone(); - let sched_pool = scheduler.get_sched_pool(priority).pool.clone(); - - let (proposed_cb, committed_cb): (Option, Option) = - match response_policy { - ResponsePolicy::OnApplied => (None, None), - ResponsePolicy::OnCommitted => { - self.inner.store_pr(cid, pr.take().unwrap()); - let sched = scheduler.clone(); - // Currently, the only case that response is returned after finishing - // commit is async applying 
prewrites for async commit transactions. - // The committed callback is not guaranteed to be invoked. So store - // the `pr` to the tctx instead of capturing it to the closure. - let committed_cb = Box::new(move || { - fail_point!("before_async_apply_prewrite_finish", |_| {}); - let (cb, pr) = sched.inner.take_task_cb_and_pr(cid); - Self::early_response( - cid, - cb.unwrap(), - pr.unwrap(), - tag, - CommandStageKind::async_apply_prewrite, - ); - }); - is_async_apply_prewrite = true; - (None, Some(committed_cb)) - } - ResponsePolicy::OnProposed => { - if pipelined { - // The normal write process is respond to clients and release - // latches after async write finished. If pipelined pessimistic - // locking is enabled, the process becomes parallel and there are - // two msgs for one command: - // 1. Msg::PipelinedWrite: respond to clients - // 2. Msg::WriteFinished: deque context and release latches - // The proposed callback is not guaranteed to be invoked. So store - // the `pr` to the tctx instead of capturing it to the closure. - self.inner.store_pr(cid, pr.take().unwrap()); - let sched = scheduler.clone(); - // Currently, the only case that response is returned after finishing - // proposed phase is pipelined pessimistic lock. - // TODO: Unify the code structure of pipelined pessimistic lock and - // async apply prewrite. 
- let proposed_cb = Box::new(move || { - fail_point!("before_pipelined_write_finish", |_| {}); - let (cb, pr) = sched.inner.take_task_cb_and_pr(cid); - Self::early_response( - cid, - cb.unwrap(), - pr.unwrap(), - tag, - CommandStageKind::pipelined_write, - ); - }); - (Some(proposed_cb), None) - } else { - (None, None) - } - } - }; + + let mut subscribed = WriteEvent::BASIC_EVENT; + match response_policy { + ResponsePolicy::OnCommitted => { + subscribed |= WriteEvent::EVENT_COMMITTED; + is_async_apply_prewrite = true; + } + ResponsePolicy::OnProposed if pipelined => subscribed |= WriteEvent::EVENT_PROPOSED, + _ => (), + } if self.inner.flow_controller.enabled() { if self.inner.flow_controller.is_unlimited(region_id) { @@ -1447,15 +1390,11 @@ impl Scheduler { // transfer leader command must be later than this write command because this // write command has been sent to the raftstore. Then, we don't need to worry // this request will fail due to the voluntary leader transfer. - let _downgraded_guard = pessimistic_locks_guard.and_then(|guard| { + let downgraded_guard = pessimistic_locks_guard.and_then(|guard| { (!removed_pessimistic_locks.is_empty()).then(|| RwLockWriteGuard::downgrade(guard)) }); - - // The callback to receive async results of write prepare from the storage - // engine. - let engine_cb = Box::new(move |result: EngineResult<()>| { - let ok = result.is_ok(); - if ok && !removed_pessimistic_locks.is_empty() { + let on_applied = Box::new(move |res: &mut kv::Result<()>| { + if res.is_ok() && !removed_pessimistic_locks.is_empty() { // Removing pessimistic locks when it succeeds to apply. This should be done in // the apply thread, to make sure it happens before other admin commands are // executed. 
@@ -1472,15 +1411,69 @@ impl Scheduler { } } } + }); - sched_pool - .spawn(async move { + let mut res = unsafe { + with_tls_engine(|e: &mut E| { + e.async_write(&ctx, to_be_write, subscribed, Some(on_applied)) + }) + }; + drop(downgraded_guard); + + while let Some(ev) = res.next().await { + match ev { + WriteEvent::Committed => { + let early_return = (|| { + fail_point!("before_async_apply_prewrite_finish", |_| false); + true + })(); + if WriteEvent::subscribed_committed(subscribed) && early_return { + // Currently, the only case that response is returned after finishing + // commit is async applying prewrites for async commit transactions. + let cb = scheduler.inner.take_task_cb(cid); + Self::early_response( + cid, + cb.unwrap(), + pr.take().unwrap(), + tag, + CommandStageKind::async_apply_prewrite, + ); + } + } + WriteEvent::Proposed => { + let early_return = (|| { + fail_point!("before_pipelined_write_finish", |_| false); + true + })(); + if WriteEvent::subscribed_proposed(subscribed) && early_return { + // The normal write process is respond to clients and release + // latches after async write finished. If pipelined pessimistic + // locking is enabled, the process becomes parallel and there are + // two msgs for one command: + // 1. Msg::PipelinedWrite: respond to clients + // 2. Msg::WriteFinished: deque context and release latches + // Currently, the only case that response is returned after finishing + // proposed phase is pipelined pessimistic lock. + // TODO: Unify the code structure of pipelined pessimistic lock and + // async apply prewrite. 
+ let cb = scheduler.inner.take_task_cb(cid); + Self::early_response( + cid, + cb.unwrap(), + pr.take().unwrap(), + tag, + CommandStageKind::pipelined_write, + ); + } + } + WriteEvent::Finished(res) => { fail_point!("scheduler_async_write_finish"); + let ok = res.is_ok(); sched.on_write_finished( cid, pr, - result, + res, lock_guards, pipelined, is_async_apply_prewrite, @@ -1498,23 +1491,14 @@ impl Scheduler { sched.inner.flow_controller.unconsume(region_id, write_size); } } - }) - .unwrap() - }); - - // Safety: `self.sched_pool` ensures a TLS engine exists. - unsafe { - with_tls_engine(|engine: &mut E| { - if let Err(e) = - engine.async_write_ext(&ctx, to_be_write, engine_cb, proposed_cb, committed_cb) - { - SCHED_STAGE_COUNTER_VEC.get(tag).async_write_err.inc(); - - info!("engine async_write failed"; "cid" => cid, "err" => ?e); - scheduler.finish_with_err(cid, e); + return; } - }) + } } + // If it's not finished while the channel is closed, it means the write + // is undeterministic. in this case, we don't know whether the + // request is finished or not, so we should not release latch as + // it may break correctness. 
} /// Returns whether it succeeds to write pessimistic locks to the in-memory diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 043e3ad2d23..5f29d44a53d 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -146,6 +146,7 @@ test_sst_importer = { workspace = true } test_storage = { workspace = true } test_util = { workspace = true } tidb_query_datatype = { workspace = true } +tikv_kv = { workspace = true } tipb_helper = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index 20740b4cb16..7a79b984aaf 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -60,7 +60,7 @@ where .unwrap(); } let write_data = WriteData::from_modifies(txn.into_modifies()); - let _ = engine.async_write(&ctx, write_data, Box::new(move |_| {})); + let _ = tikv_kv::write(engine, &ctx, write_data, None); let keys: Vec = kvs.iter().map(|(k, _)| Key::from_raw(k)).collect(); let snapshot = engine.snapshot(Default::default()).unwrap(); (snapshot, keys) diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index bc4786ae73e..d567edd5add 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -226,17 +226,18 @@ fn bench_async_write(b: &mut test::Bencher) { ctx.set_region_epoch(region.get_region_epoch().clone()); ctx.set_peer(leader); b.iter(|| { - let on_finished: EngineCallback<()> = Box::new(|_| { - test::black_box(()); - }); - kv.async_write( + let f = tikv_kv::write( + &kv, &ctx, WriteData::from_modifies(vec![Modify::Delete( CF_DEFAULT, Key::from_encoded(b"fooo".to_vec()), )]), - on_finished, - ) - .unwrap(); + None, + ); + let res = f.map(|res| { + let _ = test::black_box(res); + }); + let _ = test::black_box(res); }); } From fbaaab32100292a54909b69649d15ee0e75fe58e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: 
Wed, 30 Nov 2022 17:50:00 +0800 Subject: [PATCH 373/676] log-backup: implement subscribe flushing (#13810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#13824 Implements the new gRPC interface "SubscribeFlush". Signed-off-by: hillium Signed-off-by: Yu Juncen Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 164 +++++++++++++++++- components/backup-stream/src/endpoint.rs | 22 ++- components/backup-stream/src/errors.rs | 20 +++ components/backup-stream/src/service.rs | 12 ++ components/backup-stream/tests/mod.rs | 118 ++++++++++++- components/error_code/src/backup_stream.rs | 7 +- 6 files changed, 328 insertions(+), 15 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 3a13acd2f4c..e9f930e8563 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -2,16 +2,25 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use futures::{ + channel::mpsc::{self as async_mpsc, Receiver, Sender}, + SinkExt, StreamExt, +}; +use grpcio::{RpcStatus, RpcStatusCode, ServerStreamingSink, WriteFlags}; use kvproto::{ errorpb::{Error as PbError, *}, + logbackuppb::{FlushEvent, SubscribeFlushEventResponse}, metapb::Region, }; use pd_client::PdClient; -use tikv_util::{info, worker::Scheduler}; +use tikv_util::{box_err, defer, info, warn, worker::Scheduler}; use txn_types::TimeStamp; +use uuid::Uuid; use crate::{ - errors::{Error, Result}, + annotate, + errors::{Error, ReportableResult, Result}, + future, metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, metrics, try_send, RegionCheckpointOperation, Task, }; @@ -20,11 +29,85 @@ use crate::{ /// This information is provided for the `advancer` in checkpoint V3, /// which involved a 
central node (typically TiDB) for collecting all regions' /// checkpoint then advancing the global checkpoint. -#[derive(Debug, Default)] +#[derive(Default)] pub struct CheckpointManager { items: HashMap, + manager_handle: Option>, +} + +impl std::fmt::Debug for CheckpointManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CheckpointManager") + .field("items", &self.items) + .finish() + } +} + +enum SubscriptionOp { + Add(Subscription), + Emit(Box<[FlushEvent]>), +} + +struct SubscriptionManager { + subscribers: HashMap, + input: Receiver, } +impl SubscriptionManager { + pub async fn main_loop(mut self) { + info!("subscription manager started!"); + defer! { info!("subscription manager exit.") } + while let Some(msg) = self.input.next().await { + match msg { + SubscriptionOp::Add(sub) => { + self.subscribers.insert(Uuid::new_v4(), sub); + } + SubscriptionOp::Emit(events) => { + let mut canceled = vec![]; + for (id, sub) in &mut self.subscribers { + let send_all = async { + for es in events.chunks(1024) { + let mut resp = SubscribeFlushEventResponse::new(); + resp.set_events(es.to_vec().into()); + sub.feed((resp, WriteFlags::default())).await?; + } + sub.flush().await + }; + + match send_all.await { + Err(grpcio::Error::RemoteStopped) => { + canceled.push(*id); + } + Err(err) => { + Error::from(err).report("sending subscription"); + } + _ => {} + } + } + + for c in canceled { + match self.subscribers.remove(&c) { + Some(mut sub) => { + info!("client is gone, removing subscription"; "id" => %c); + sub.close().await.report_if_err(format_args!( + "during removing subscription {}", + c + )) + } + None => { + warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %c); + } + } + } + } + } + } + } +} + +// Note: can we make it more generic...? +pub type Subscription = ServerStreamingSink; + /// The result of getting a checkpoint. 
/// The possibility of failed to getting checkpoint is pretty high: /// because there is a gap between region leader change and flushing. @@ -76,8 +159,81 @@ impl CheckpointManager { self.items.clear(); } + pub fn spawn_subscription_mgr(&mut self) -> future![()] { + let (tx, rx) = async_mpsc::channel(1024); + let sub = SubscriptionManager { + subscribers: Default::default(), + input: rx, + }; + self.manager_handle = Some(tx); + sub.main_loop() + } + + pub fn update_region_checkpoints(&mut self, region_and_checkpoint: Vec<(Region, TimeStamp)>) { + for (region, checkpoint) in ®ion_and_checkpoint { + self.do_update(region, *checkpoint); + } + + self.notify(region_and_checkpoint.into_iter()); + } + /// update a region checkpoint in need. + #[cfg(test)] pub fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { + self.do_update(region, checkpoint); + self.notify(std::iter::once((region.clone(), checkpoint))); + } + + pub fn add_subscriber(&mut self, sub: Subscription) -> future![Result<()>] { + let mgr = self.manager_handle.as_ref().cloned(); + + // NOTE: we cannot send the real error into the client directly because once + // we send the subscription into the sink, we cannot fetch it again :( + async move { + let mgr = mgr.ok_or(Error::Other(box_err!("subscription manager not get ready"))); + let mut mgr = match mgr { + Ok(mgr) => mgr, + Err(err) => { + sub.fail(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + "subscription manager not get ready.".to_owned(), + )) + .await + .map_err(|err| { + annotate!(err, "failed to send request to subscriber manager") + })?; + return Err(err); + } + }; + mgr.send(SubscriptionOp::Add(sub)) + .await + .map_err(|err| annotate!(err, "failed to send request to subscriber manager"))?; + Ok(()) + } + } + + fn notify(&mut self, items: impl Iterator) { + if let Some(mgr) = self.manager_handle.as_mut() { + let r = items + .map(|(r, ts)| { + let mut f = FlushEvent::new(); + f.set_checkpoint(ts.into_inner()); 
+ f.set_start_key(r.start_key); + f.set_end_key(r.end_key); + f + }) + .collect::>(); + let event_size = r.len(); + let res = mgr.try_send(SubscriptionOp::Emit(r)); + // Note: perhaps don't batch in the channel but batch in the receiver side? + // If so, we can control the memory usage better. + if let Err(err) = res { + warn!("the channel is full, dropping some events."; "length" => %event_size, "err" => %err); + } + } + } + + fn do_update(&mut self, region: &Region, checkpoint: TimeStamp) { let e = self.items.entry(region.get_id()); e.and_modify(|old_cp| { if old_cp.checkpoint < checkpoint @@ -199,7 +355,7 @@ impl FlushObserver for BasicFlushObserver { .pd_cli .update_service_safe_point( format!("backup-stream-{}-{}", task, self.store_id), - TimeStamp::new(rts - 1), + TimeStamp::new(rts.saturating_sub(1)), // Add a service safe point for 30 mins (6x the default flush interval). // It would probably be safe. Duration::from_secs(1800), diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 22a415ca6bb..2ebeee2ea66 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -42,7 +42,7 @@ use crate::{ annotate, checkpoint_manager::{ BasicFlushObserver, CheckpointManager, CheckpointV3FlushObserver, FlushObserver, - GetCheckpointResult, RegionIdWithVersion, + GetCheckpointResult, RegionIdWithVersion, Subscription, }, errors::{Error, Result}, event_loader::{InitialDataLoader, PendingMemoryQuota}, @@ -165,6 +165,8 @@ where ((config.num_threads + 1) / 2).max(1), ); pool.spawn(op_loop); + let mut checkpoint_mgr = CheckpointManager::default(); + pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); Endpoint { meta_client, range_router, @@ -183,7 +185,7 @@ where region_operator, failover_time: None, config, - checkpoint_mgr: Default::default(), + checkpoint_mgr, } } } @@ -887,11 +889,7 @@ where // Let's clear all stale checkpoints first. // Or they may slow down the global checkpoint. 
self.checkpoint_mgr.clear(); - for (region, checkpoint) in u { - debug!("setting region checkpoint"; "region" => %region.get_id(), "ts" => %checkpoint); - self.checkpoint_mgr - .update_region_checkpoint(®ion, checkpoint) - } + self.checkpoint_mgr.update_region_checkpoints(u); } RegionCheckpointOperation::Get(g, cb) => { let _guard = self.pool.handle().enter(); @@ -911,6 +909,14 @@ where .collect()), } } + RegionCheckpointOperation::Subscribe(sub) => { + let fut = self.checkpoint_mgr.add_subscriber(sub); + self.pool.spawn(async move { + if let Err(err) = fut.await { + err.report("adding subscription"); + } + }); + } } } @@ -957,6 +963,7 @@ pub enum RegionSet { pub enum RegionCheckpointOperation { Update(Vec<(Region, TimeStamp)>), Get(RegionSet, Box) + Send>), + Subscribe(Subscription), } impl fmt::Debug for RegionCheckpointOperation { @@ -964,6 +971,7 @@ impl fmt::Debug for RegionCheckpointOperation { match self { Self::Update(arg0) => f.debug_tuple("Update").field(arg0).finish(), Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), + Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), } } } diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index b34e7126360..a3f76e0255f 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -6,6 +6,7 @@ use std::{ use error_code::ErrorCodeExt; use etcd_client::Error as EtcdError; +use grpcio::Error as GrpcError; use kvproto::{errorpb::Error as StoreError, metapb::*}; use pd_client::Error as PdError; use protobuf::ProtobufError; @@ -18,6 +19,8 @@ use crate::{endpoint::Task, metrics}; #[derive(ThisError, Debug)] pub enum Error { + #[error("gRPC meet error {0}")] + Grpc(#[from] GrpcError), #[error("Etcd meet error {0}")] Etcd(#[from] EtcdError), #[error("Protobuf meet error {0}")] @@ -66,6 +69,7 @@ impl ErrorCodeExt for Error { Error::Other(_) => OTHER, Error::RaftStore(_) => RAFTSTORE, Error::ObserveCanceled(..) 
=> OBSERVE_CANCELED, + Error::Grpc(_) => GRPC, } } } @@ -115,6 +119,22 @@ where } } +pub trait ReportableResult { + fn report_if_err(self, context: impl ToString); +} + +impl ReportableResult for StdResult<(), E> +where + Error: From, +{ + #[inline(always)] + fn report_if_err(self, context: impl ToString) { + if let Err(err) = self { + Error::from(err).report(context.to_string()) + } + } +} + /// Like `errors.Annotate` in Go. /// Wrap an unknown error with [`Error::Other`]. #[macro_export(crate)] diff --git a/components/backup-stream/src/service.rs b/components/backup-stream/src/service.rs index 47a149973b2..9d312a984d1 100644 --- a/components/backup-stream/src/service.rs +++ b/components/backup-stream/src/service.rs @@ -89,4 +89,16 @@ impl LogBackup for Service { )); try_send!(self.endpoint, t); } + + fn subscribe_flush_event( + &mut self, + _ctx: grpcio::RpcContext<'_>, + _req: kvproto::logbackuppb::SubscribeFlushEventRequest, + sink: grpcio::ServerStreamingSink, + ) { + try_send!( + self.endpoint, + Task::RegionCheckpointsOp(RegionCheckpointOperation::Subscribe(sink)) + ); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 2cc6016aeb1..57932acae0d 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -19,13 +19,15 @@ use backup_stream::{ }, observer::BackupStreamObserver, router::Router, - Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, + Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, }; -use futures::{executor::block_on, AsyncWriteExt, Future}; -use grpcio::ChannelBuilder; +use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt, TryStreamExt}; +use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ brpb::{CompressionType, Local, Metadata, StorageBackend}, kvrpcpb::*, + logbackuppb::{SubscribeFlushEventRequest, SubscribeFlushEventResponse}, + 
logbackuppb_grpc::{create_log_backup, LogBackupClient}, tikvpb::*, }; use pd_client::PdClient; @@ -156,6 +158,8 @@ impl SuiteBuilder { }, obs: Default::default(), tikv_cli: Default::default(), + log_backup_cli: Default::default(), + servers: Default::default(), env: Arc::new(grpcio::Environment::new(1)), cluster, @@ -172,6 +176,8 @@ impl SuiteBuilder { cfg_f(&mut cfg); for id in 1..=(n as u64) { suite.start_endpoint(id, cfg.clone()); + let cli = suite.start_log_backup_client_on(id); + suite.log_backup_cli.insert(id, cli); } // We must wait until the endpoints get ready to watching the metastore, or some // modifies may be lost. Either make Endpoint::with_client wait until watch did @@ -222,8 +228,11 @@ pub struct Suite { meta_store: ErrorStore, cluster: Cluster, tikv_cli: HashMap, + log_backup_cli: HashMap, obs: HashMap, env: Arc, + // The place to make services live as long as suite. + servers: Vec, temp_files: TempDir, flushed_files: TempDir, @@ -263,6 +272,51 @@ impl Suite { worker } + /// create a subscription stream. this has simply asserted no error, because + /// in theory observing flushing should not emit error. change that if + /// needed. 
+ fn flush_stream(&self) -> impl Stream { + let streams = self + .log_backup_cli + .iter() + .map(|(id, cli)| { + let stream = cli + .subscribe_flush_event(&{ + let mut r = SubscribeFlushEventRequest::default(); + r.set_client_id(format!("test-{}", id)); + r + }) + .unwrap_or_else(|err| panic!("failed to subscribe on {} because {}", id, err)); + let id = *id; + stream.map_ok(move |x| (id, x)).map(move |x| { + x.unwrap_or_else(move |err| panic!("failed to rec from {} because {}", id, err)) + }) + }) + .collect::>(); + + futures::stream::select_all(streams) + } + + fn start_log_backup_client_on(&mut self, id: u64) -> LogBackupClient { + let endpoint = self + .endpoints + .get(&id) + .expect("must register endpoint first"); + + let serv = Service::new(endpoint.scheduler()); + let builder = + ServerBuilder::new(self.env.clone()).register_service(create_log_backup(serv)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(self.env.clone()).connect(&addr); + println!("connecting channel to {} for store {}", addr, id); + let client = LogBackupClient::new(channel); + self.servers.push(server); + client + } + fn start_endpoint(&mut self, id: u64, mut cfg: BackupStreamConfig) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); @@ -747,8 +801,10 @@ mod test { errors::Error, router::TaskSelector, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; + use futures::{Stream, StreamExt}; use pd_client::PdClient; use tikv_util::{box_err, defer, info, HandyRwLock}; + use tokio::time::timeout; use txn_types::{Key, TimeStamp}; use crate::{ @@ -1174,4 +1230,60 @@ mod test { checkpoint ); } + + async fn collect_current(mut s: impl Stream + Unpin, goal: usize) -> Vec { + let mut r = vec![]; + while let Ok(Some(x)) = timeout(Duration::from_secs(10), s.next()).await { + 
r.push(x); + if r.len() >= goal { + return r; + } + } + r + } + + #[test] + fn subscribe_flushing() { + let mut suite = super::SuiteBuilder::new_named("sub_flush").build(); + let stream = suite.flush_stream(); + for i in 1..10 { + let split_key = make_split_key_at_record(1, i * 20); + suite.must_split(&split_key); + suite.must_shuffle_leader(suite.cluster.get_region_id(&split_key)); + } + + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "sub_flush"); + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite.sync(); + suite.force_flush_files("sub_flush"); + + let mut items = run_async_test(async { + collect_current( + stream.flat_map(|(_, r)| futures::stream::iter(r.events.into_iter())), + 10, + ) + .await + }); + + items.sort_by(|x, y| x.start_key.cmp(&y.start_key)); + + println!("{:?}", items); + assert_eq!(items.len(), 10); + + assert_eq!(items.first().unwrap().start_key, Vec::::default()); + for w in items.windows(2) { + let a = &w[0]; + let b = &w[1]; + assert!(a.checkpoint > 512); + assert!(b.checkpoint > 512); + assert_eq!(a.end_key, b.start_key); + } + assert_eq!(items.last().unwrap().end_key, Vec::::default()); + + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(|x| x.as_slice()), + )); + } } diff --git a/components/error_code/src/backup_stream.rs b/components/error_code/src/backup_stream.rs index 9448169cc05..a4b28b0e9ee 100644 --- a/components/error_code/src/backup_stream.rs +++ b/components/error_code/src/backup_stream.rs @@ -41,12 +41,17 @@ define_error_codes! { ), RAFTREQ => ("RaftReq", "Error happened when sending raft command.", - "This is an internal error, please ask the community for help." + "This is an internal error, most of them are happen while initial scanning and can be simply retried." ), RAFTSTORE => ("RaftStore", "Error happened reported from raft store.", "This is an internal error, please ask the community for help." 
), + GRPC => ("gRPC", + "Error happened during executing gRPC", + "This error is often relative to the network, please check the network connection and network config, say, TLS config." + ), + OTHER => ("Unknown", "Some random error happens.", "This is an generic error, please check the error message for further information." From 372ea1af320e8ba4e63dce989ddf6331e37142ac Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 30 Nov 2022 21:48:02 +0800 Subject: [PATCH 374/676] pitr: support skipping download kv files when pitr (#13802) close tikv/tikv#13788, close pingcap/tidb#39102 Signed-off-by: joccau Signed-off-by: Zak Zhao <57036248+joccau@users.noreply.github.com> Co-authored-by: Ti Chi Robot Co-authored-by: 3pointer --- components/backup-stream/tests/mod.rs | 2 +- components/error_code/src/sst_importer.rs | 3 +- .../external_storage/export/src/export.rs | 6 +- components/external_storage/src/lib.rs | 85 ++ components/sst_importer/src/config.rs | 10 + components/sst_importer/src/errors.rs | 20 +- components/sst_importer/src/sst_importer.rs | 782 ++++++++++++++++-- .../tikv_util/src/codec/stream_event.rs | 12 +- src/import/sst_service.rs | 31 +- tests/integrations/config/mod.rs | 1 + 10 files changed, 848 insertions(+), 104 deletions(-) diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 57932acae0d..7256cd62c03 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -530,7 +530,7 @@ impl Suite { decoder.close().await.unwrap(); let content = decoder.into_inner(); - let mut iter = EventIterator::new(content); + let mut iter = EventIterator::new(&content); loop { if !iter.valid() { break; diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 2eb6177458b..001f4f146f6 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -21,5 +21,6 @@ 
define_error_codes!( TTL_NOT_ENABLED => ("TtlNotEnabled", "", ""), TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), - INVALID_KEY_MODE => ("InvalidKeyMode", "", "") + INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") ); diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index ea02ebe2c6f..10363bf92b2 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -23,9 +23,9 @@ use external_storage::dylib_client; #[cfg(feature = "cloud-storage-grpc")] use external_storage::grpc_client; pub use external_storage::{ - compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_into_file, - record_storage_create, BackendConfig, ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, - RestoreConfig, UnpinReader, + compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_info_buff, + read_external_storage_into_file, record_storage_create, BackendConfig, ExternalStorage, + HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, MIN_READ_SPEED, }; use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 6bcbcfc839f..e1c57608197 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -296,3 +296,88 @@ pub async fn read_external_storage_into_file( Ok(()) } + +pub const MIN_READ_SPEED: usize = 8192; + +pub async fn read_external_storage_info_buff( + reader: &mut (dyn AsyncRead + Unpin), + speed_limiter: &Limiter, + expected_length: u64, + expected_sha256: Option>, + min_read_speed: usize, +) -> io::Result> { + // the minimum speed of reading data, in bytes/second. 
+ // if reading speed is slower than this rate, we will stop with + // a "TimedOut" error. + // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) + let read_speed = if min_read_speed > 0 { + min_read_speed + } else { + MIN_READ_SPEED + }; + let dur = Duration::from_secs((READ_BUF_SIZE / read_speed) as u64); + let mut output = Vec::new(); + let mut buffer = vec![0u8; READ_BUF_SIZE]; + + loop { + // separate the speed limiting from actual reading so it won't + // affect the timeout calculation. + let bytes_read = timeout(dur, reader.read(&mut buffer)) + .await + .map_err(|_| io::ErrorKind::TimedOut)??; + if bytes_read == 0 { + break; + } + + speed_limiter.consume(bytes_read).await; + output.append(&mut buffer[..bytes_read].to_vec()); + } + + // check length of file + if expected_length > 0 && output.len() != expected_length as usize { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "length not match, downloaded size {}, expected {}", + output.len(), + expected_length + ), + )); + } + // check sha256 of file + if let Some(sha256) = expected_sha256 { + let mut hasher = Hasher::new(MessageDigest::sha256()).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher failed to init: {}", err), + ) + })?; + hasher.update(&output).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher udpate failed: {}", err), + ) + })?; + + let cal_sha256 = hasher.finish().map_or_else( + |err| { + Err(io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher finish failed: {}", err), + )) + }, + |bytes| Ok(bytes.to_vec()), + )?; + if !sha256.eq(&cal_sha256) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "sha256 not match, expect: {:?}, calculate: {:?}", + sha256, cal_sha256, + ), + )); + } + } + + Ok(output) +} diff --git a/components/sst_importer/src/config.rs b/components/sst_importer/src/config.rs index ef74a40fd01..ac789e2f4ae 100644 --- 
a/components/sst_importer/src/config.rs +++ b/components/sst_importer/src/config.rs @@ -14,6 +14,8 @@ pub struct Config { /// /// Default is 10m. pub import_mode_timeout: ReadableDuration, + /// the ratio of system memory used for import. + pub memory_use_ratio: f64, } impl Default for Config { @@ -22,6 +24,7 @@ impl Default for Config { num_threads: 8, stream_channel_window: 128, import_mode_timeout: ReadableDuration::minutes(10), + memory_use_ratio: 0.3, } } } @@ -43,6 +46,13 @@ impl Config { ); self.stream_channel_window = default_cfg.stream_channel_window; } + if self.memory_use_ratio > 0.5 || self.memory_use_ratio < 0.0 { + warn!( + "import.mem_ratio should belong to [0.0, 0.5], change it to {}", + default_cfg.memory_use_ratio, + ); + self.memory_use_ratio = default_cfg.memory_use_ratio; + } Ok(()) } } diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 51aabcbec01..7ff940fff12 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -8,7 +8,7 @@ use encryption::Error as EncryptionError; use error_code::{self, ErrorCode, ErrorCodeExt}; use futures::channel::oneshot::Canceled; use grpcio::Error as GrpcError; -use kvproto::{import_sstpb, kvrpcpb::ApiVersion}; +use kvproto::{errorpb, import_sstpb, kvrpcpb::ApiVersion}; use tikv_util::codec::Error as CodecError; use uuid::Error as UuidError; @@ -122,6 +122,9 @@ pub enum Error { storage_api_version: ApiVersion, key: String, }, + + #[error("resource is not enough {0}")] + ResourceNotEnough(String), } impl Error { @@ -149,7 +152,19 @@ pub type Result = result::Result; impl From for import_sstpb::Error { fn from(e: Error) -> import_sstpb::Error { let mut err = import_sstpb::Error::default(); - err.set_message(format!("{}", e)); + match e { + Error::ResourceNotEnough(ref msg) => { + let mut import_err = errorpb::Error::default(); + import_err.set_message(msg.clone()); + import_err.set_server_is_busy(errorpb::ServerIsBusy::default()); 
+ err.set_store_error(import_err); + err.set_message(format!("{}", e)); + } + _ => { + err.set_message(format!("{}", e)); + } + } + err } } @@ -181,6 +196,7 @@ impl ErrorCodeExt for Error { Error::TtlLenNotEqualsToPairs => error_code::sst_importer::TTL_LEN_NOT_EQUALS_TO_PAIRS, Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, + Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, } } } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index abd616c5bc9..c024bca8e6d 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,10 +4,14 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{self, prelude::*, BufReader}, + io::{self, BufReader, Read}, ops::Bound, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, }; use dashmap::DashMap; @@ -18,6 +22,7 @@ use engine_traits::{ IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; +use external_storage_export::{compression_reader_dispatcher, encrypt_wrap_reader, RestoreConfig}; use file_system::{get_io_rate_limiter, OpenOptions}; use futures::executor::ThreadPool; use kvproto::{ @@ -26,7 +31,10 @@ use kvproto::{ kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventIterator, Iterator as EIterator}, + codec::stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + config::ReadableSize, + stream::block_on_external_io, + sys::SysQuota, time::{Instant, Limiter}, }; use txn_types::{Key, TimeStamp, WriteRef}; @@ -39,6 +47,32 @@ use crate::{ Config, Error, Result, }; +#[derive(Clone, PartialEq, Debug)] +pub enum CacheKvFile { + Mem(Arc>), + Fs(Arc), +} + +impl CacheKvFile { + // get the ref count of 
item. + pub fn ref_count(&self) -> usize { + match self { + CacheKvFile::Mem(buff) => Arc::strong_count(buff), + CacheKvFile::Fs(path) => Arc::strong_count(path), + } + } + + // check the item is expired. + pub fn is_expired(&self, start: &Instant) -> bool { + match self { + // The expired duration for memeory is 60s. + CacheKvFile::Mem(_) => start.saturating_elapsed() >= Duration::from_secs(60), + // The expired duration for local file is 10min. + CacheKvFile::Fs(_) => start.saturating_elapsed() >= Duration::from_secs(600), + } + } +} + /// SstImporter manages SST files that are waiting for ingesting. pub struct SstImporter { dir: ImportDir, @@ -47,7 +81,9 @@ pub struct SstImporter { // TODO: lift api_version as a type parameter. api_version: ApiVersion, compression_types: HashMap, - file_locks: Arc>, + file_locks: Arc>, + mem_use: AtomicU64, + mem_limit: ReadableSize, } impl SstImporter { @@ -58,6 +94,10 @@ impl SstImporter { api_version: ApiVersion, ) -> Result { let switcher = ImportModeSwitcher::new(cfg); + + let memory_limit = (SysQuota::memory_limit_in_bytes() as f64) * cfg.memory_use_ratio; + info!("sst importer memory limit when apply"; "size" => ?memory_limit); + Ok(SstImporter { dir: ImportDir::new(root)?, key_manager, @@ -65,6 +105,8 @@ impl SstImporter { api_version, compression_types: HashMap::with_capacity(2), file_locks: Arc::new(DashMap::default()), + mem_use: AtomicU64::new(0), + mem_limit: ReadableSize(memory_limit as u64), }) } @@ -292,12 +334,249 @@ impl SstImporter { Ok(()) } + pub fn shrink_by_tick(&self) -> usize { + let mut shrink_buff_size: usize = 0; + let mut retain_buff_size: usize = 0; + let mut shrink_files: Vec = Vec::default(); + let mut retain_file_count = 0_usize; + + self.file_locks.retain(|_, (c, start)| { + let mut need_retain = true; + match c { + CacheKvFile::Mem(buff) => { + let buflen = buff.len(); + // The term of recycle memeory is 60s. 
+ if c.ref_count() == 1 && c.is_expired(start) { + need_retain = false; + shrink_buff_size += buflen; + } else { + retain_buff_size += buflen; + } + } + CacheKvFile::Fs(path) => { + let p = path.to_path_buf(); + // The term of recycle file is 10min. + if c.ref_count() == 1 && c.is_expired(start) { + need_retain = false; + shrink_files.push(p); + } else { + retain_file_count += 1; + } + } + } + + need_retain + }); + + if self.import_support_download() { + let shrink_file_count = shrink_files.len(); + info!("shrink space by tick"; "shrink files count" => shrink_file_count, "retain files count" => retain_file_count); + + for f in shrink_files { + if let Err(e) = file_system::remove_file(&f) { + info!("failed to remove file"; "filename" => ?f, "error" => ?e); + } + } + shrink_file_count + } else { + info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => retain_buff_size); + self.dec_mem(shrink_buff_size as _); + shrink_buff_size + } + } + + // If mem_limit is 0, which represent download kv-file when import. + // Or read kv-file into buffer directly. + pub fn import_support_download(&self) -> bool { + self.mem_limit == ReadableSize(0) + } + + fn inc_mem_and_check(&self, meta: &KvMeta) -> bool { + let size = meta.get_length(); + let old = self.mem_use.fetch_add(size, Ordering::SeqCst); + + // If the memory is limited, roll backup the mem_use and return false. 
+ if old + size > self.mem_limit.0 { + self.mem_use.fetch_sub(size, Ordering::SeqCst); + false + } else { + true + } + } + + fn dec_mem(&self, size: u64) { + self.mem_use.fetch_sub(size, Ordering::SeqCst); + } + + pub fn do_read_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + speed_limiter: &Limiter, + ) -> Result { + let start = Instant::now(); + let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); + + let mut lock = self + .file_locks + .entry(dst_name) + .or_insert((CacheKvFile::Mem(Arc::default()), Instant::now())); + + if let CacheKvFile::Mem(buff) = &lock.0 { + if !buff.is_empty() { + lock.1 = Instant::now(); + return Ok(lock.0.clone()); + } + } + + if !self.inc_mem_and_check(meta) { + return Err(Error::ResourceNotEnough(String::from("memory is limited"))); + } + + let expected_sha256 = { + let sha256 = meta.get_sha256().to_vec(); + if !sha256.is_empty() { + Some(sha256) + } else { + None + } + }; + let file_length = meta.get_length(); + let range = { + let range_length = meta.get_range_length(); + if range_length == 0 { + None + } else { + Some((meta.get_range_offset(), range_length)) + } + }; + let restore_config = external_storage_export::RestoreConfig { + range, + compression_type: Some(meta.get_compression_type()), + expected_sha256, + file_crypter: None, + }; + + let buff = self.read_kv_files_from_external_storage( + file_length, + meta.get_name(), + ext_storage, + speed_limiter, + restore_config, + )?; + + IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); + IMPORTER_APPLY_DURATION + .with_label_values(&["download"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; + *lock = (CacheKvFile::Mem(Arc::new(rewrite_buff)), Instant::now()); + Ok(lock.0.clone()) + } + + pub fn create_external_storage( + &self, + backend: &StorageBackend, + support_kms: bool, + ) -> Result> { + let ext_storage = 
external_storage_export::create_storage(backend, Default::default())?; + // kv-files needn't are decrypted with KMS when download currently because these + // files are not encrypted when log-backup. It is different from + // sst-files because sst-files is encrypted when saved with rocksdb env + // with KMS. to do: support KMS when log-backup and restore point. + let ext_storage = match (support_kms, self.key_manager.clone()) { + (true, Some(key_manager)) => { + Box::new(external_storage_export::EncryptedExternalStorage { + key_manager, + storage: ext_storage, + }) + } + _ => ext_storage, + }; + Ok(ext_storage) + } + + fn read_kv_files_from_external_storage( + &self, + file_length: u64, + file_name: &str, + ext_storage: Arc, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> Result> { + let RestoreConfig { + range, + compression_type, + expected_sha256, + file_crypter, + } = restore_config; + + let mut reader = { + let inner = if let Some((off, len)) = range { + ext_storage.read_part(file_name, off, len) + } else { + ext_storage.read(file_name) + }; + + let inner = compression_reader_dispatcher(compression_type, inner)?; + encrypt_wrap_reader(file_crypter, inner)? + }; + + let r = block_on_external_io(external_storage_export::read_external_storage_info_buff( + &mut reader, + speed_limiter, + file_length, + expected_sha256, + external_storage_export::MIN_READ_SPEED, + )); + let url = ext_storage.url()?.to_string(); + let buff = r.map_err(|e| Error::CannotReadExternalStorage { + url: url.to_string(), + name: file_name.to_string(), + err: e, + local_path: PathBuf::default(), + })?; + + Ok(buff) + } + + pub fn read_from_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + backend: &StorageBackend, + speed_limiter: &Limiter, + ) -> Result>> { + let c = if self.import_support_download() { + self.do_download_kv_file(meta, backend, speed_limiter)? 
+ } else { + self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter)? + }; + match c { + // If cache memroy, it has been rewrite, return buffer directly. + CacheKvFile::Mem(buff) => Ok(buff), + // If cache file name, it need to read and rewrite. + CacheKvFile::Fs(path) => { + let file = File::open(path.as_ref())?; + let mut reader = BufReader::new(file); + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer)?; + + let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; + Ok(Arc::new(rewrite_buff)) + } + } + } + pub fn do_download_kv_file( &self, meta: &KvMeta, backend: &StorageBackend, speed_limiter: &Limiter, - ) -> Result { + ) -> Result { let offset = meta.get_range_offset(); let src_name = meta.get_name(); let dst_name = format!("{}_{}", src_name, offset); @@ -309,14 +588,15 @@ impl SstImporter { } else { None }; - if path.save.exists() { - return Ok(path.save); - } - let lock = self.file_locks.entry(dst_name.to_string()).or_default(); + let mut lock = self + .file_locks + .entry(dst_name) + .or_insert((CacheKvFile::Fs(Arc::new(path.save.clone())), Instant::now())); if path.save.exists() { - return Ok(path.save); + lock.1 = Instant::now(); + return Ok(lock.0.clone()); } let range_length = meta.get_range_length(); @@ -336,16 +616,17 @@ impl SstImporter { src_name, path.temp.clone(), backend, - // kv-files needn't are decrypted with KMS when download currently because these files - // are not encrypted when log-backup. It is different from sst-files - // because sst-files is encrypted when saved with rocksdb env with KMS. - // to do: support KMS when log-backup and restore point. false, // don't support encrypt for now. speed_limiter, restore_config, )?; - info!("download file finished {}, offset {}", src_name, offset); + info!( + "download file finished {}, offset {}, length {}", + src_name, + offset, + meta.get_length() + ); if let Some(p) = path.save.parent() { // we have v1 prefix in file name. 
@@ -358,89 +639,100 @@ impl SstImporter { })?; } - file_system::rename(path.temp, path.save.clone())?; - - drop(lock); - self.file_locks.remove(&dst_name); - + file_system::rename(path.temp, path.save)?; IMPORTER_APPLY_DURATION .with_label_values(&["download"]) .observe(start.saturating_elapsed().as_secs_f64()); - Ok(path.save) + lock.1 = Instant::now(); + Ok(lock.0.clone()) } - pub fn do_apply_kv_file>( + pub fn rewrite_kv_file( &self, - start_key: &[u8], - end_key: &[u8], - restore_ts: u64, - file_path: P, + file_buff: Vec, rewrite_rule: &RewriteRule, - build_fn: &mut dyn FnMut(Vec, Vec), - ) -> Result> { - // iterator file and performs rewrites and apply. - let file = File::open(&file_path)?; - let mut reader = BufReader::new(file); - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - - let mut event_iter = EventIterator::new(buffer); - + ) -> Result> { let old_prefix = rewrite_rule.get_old_key_prefix(); let new_prefix = rewrite_rule.get_new_key_prefix(); - - let perform_rewrite = old_prefix != new_prefix; + // if old_prefix equals new_prefix, do not need rewrite. + if old_prefix == new_prefix { + return Ok(file_buff); + } // perform iteration and key rewrite. 
+ let mut new_buff = Vec::with_capacity(file_buff.len()); + let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); + + let start = Instant::now(); + loop { + if !event_iter.valid() { + break; + } + event_iter.next()?; + + // perform rewrite + let old_key = event_iter.key(); + if !old_key.starts_with(old_prefix) { + return Err(Error::WrongKeyPrefix { + what: "Key in file", + key: old_key.to_vec(), + prefix: old_prefix.to_vec(), + }); + } + key.truncate(new_prefix_data_key_len); + key.extend_from_slice(&old_key[old_prefix.len()..]); + let value = event_iter.value(); + + let encoded = EventEncoder::encode_event(&key, value); + for slice in encoded { + new_buff.append(&mut slice.as_ref().to_owned()); + } + } + + IMPORTER_APPLY_DURATION + .with_label_values(&["rewrite"]) + .observe(start.saturating_elapsed().as_secs_f64()); + Ok(new_buff) + } + + pub fn do_apply_kv_file( + &self, + start_key: &[u8], + end_key: &[u8], + start_ts: u64, + restore_ts: u64, + file_buff: Arc>, + build_fn: &mut dyn FnMut(Vec, Vec), + ) -> Result> { + let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut smallest_key = None; let mut largest_key = None; - let mut total_key = 0; let mut ts_not_expected = 0; let mut not_in_range = 0; - let start = Instant::now(); + loop { if !event_iter.valid() { break; } total_key += 1; event_iter.next()?; - INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); - let ts = Key::decode_ts_from(event_iter.key())?; - if ts > TimeStamp::new(restore_ts) { + + let key = event_iter.key().to_vec(); + let value = event_iter.value().to_vec(); + let ts = Key::decode_ts_from(&key)?; + if ts < TimeStamp::new(start_ts) || ts > TimeStamp::new(restore_ts) { // we assume the keys in file are sorted by ts. // so if we met the key not satisfy the ts. // we can easily filter the remain keys. 
ts_not_expected += 1; continue; } - if perform_rewrite { - let old_key = event_iter.key(); - - if !old_key.starts_with(old_prefix) { - return Err(Error::WrongKeyPrefix { - what: "Key in file", - key: old_key.to_vec(), - prefix: old_prefix.to_vec(), - }); - } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); - - debug!( - "perform rewrite new key: {:?}, new key prefix: {:?}, old key prefix: {:?}", - log_wrappers::Value::key(&key), - log_wrappers::Value::key(new_prefix), - log_wrappers::Value::key(old_prefix), - ); - } else { - key = event_iter.key().to_vec(); - } if check_key_in_range(&key, 0, start_key, end_key).is_err() { // key not in range, we can simply skip this key here. // the client make sure the correct region will download and apply the same @@ -451,28 +743,21 @@ impl SstImporter { not_in_range += 1; continue; } - let value = event_iter.value().to_vec(); - build_fn(key.clone(), value); - let iter_key = key.clone(); - smallest_key = smallest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.min(iter_key.clone())), - ); - - largest_key = largest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.max(iter_key.clone())), - ); + build_fn(key.clone(), value); + smallest_key = smallest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.min(key.clone()))); + largest_key = largest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - info!("build download request file done"; "total keys" => %total_key, + if total_key != not_in_range { + info!("build download request file done"; "total keys" => %total_key, "ts filtered keys" => %ts_not_expected, - "range filtered keys" => %not_in_range, - "file" => %file_path.as_ref().display()); + "range filtered keys" => %not_in_range); + } - let label = if perform_rewrite { "rewrite" } else { "normal" }; IMPORTER_APPLY_DURATION - .with_label_values(&[label]) + .with_label_values(&["normal"]) 
.observe(start.saturating_elapsed().as_secs_f64()); match (smallest_key, largest_key) { @@ -809,12 +1094,17 @@ fn is_after_end_bound>(value: &[u8], bound: &Bound) -> bool { #[cfg(test)] mod tests { - use std::io::{self, BufWriter}; + use std::{ + io::{self, BufWriter, Write}, + ops::Sub, + usize, + }; use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, RefIterable, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; + use external_storage_export::read_external_storage_info_buff; use file_system::File; use openssl::hash::{Hasher, MessageDigest}; use tempfile::Builder; @@ -1035,7 +1325,8 @@ mod tests { }) } - fn create_sample_external_kv_file() -> Result<(tempfile::TempDir, StorageBackend, KvMeta)> { + fn create_sample_external_kv_file() + -> Result<(tempfile::TempDir, StorageBackend, KvMeta, Vec)> { let ext_dir = tempfile::tempdir()?; let file_name = "v1/t000001/abc.log"; let file_path = ext_dir.path().join(file_name); @@ -1047,6 +1338,7 @@ mod tests { (b"t1_r01".to_vec(), b"tidb".to_vec()), (b"t1_r02".to_vec(), b"tikv".to_vec()), (b"t1_r03".to_vec(), b"pingcap".to_vec()), + (b"t1_r04".to_vec(), b"test for PITR".to_vec()), ]; let mut sha256 = Hasher::new(MessageDigest::sha256()).unwrap(); @@ -1067,7 +1359,7 @@ mod tests { kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); let backend = external_storage_export::make_local_backend(ext_dir.path()); - Ok((ext_dir, backend, kv_meta)) + Ok((ext_dir, backend, kv_meta, buff.buffer().to_vec())) } fn create_sample_external_rawkv_sst_file( @@ -1245,6 +1537,249 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::TimedOut); } + #[test] + fn test_read_external_storage_info_buff() { + let data = &b"input some data, used to test read buff"[..]; + let mut reader = data; + let len = reader.len() as _; + let sha_256 = { + let mut hasher = Hasher::new(MessageDigest::sha256()).unwrap(); + hasher.update(data).unwrap(); + hasher.finish().unwrap().to_vec() + }; + + // test 
successfully. + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256.clone()), + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test without expected_sha245. + reader = data; + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + None, + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test with wrong expectd_len. + reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len + 1, + Some(sha_256.clone()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("length not match")); + + // test with wrong expected_sha256. + reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256[..sha_256.len() - 1].to_vec()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("sha256 not match")); + } + + #[test] + fn test_read_external_storage_info_buff_timed_out() { + use futures_util::stream::{pending, TryStreamExt}; + + let mut input = pending::>().into_async_read(); + let err = block_on_external_io(read_external_storage_info_buff( + &mut input, + &Limiter::new(f64::INFINITY), + 0, + None, + usize::MAX, + )) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::TimedOut); + } + + #[test] + fn test_do_read_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. 
+ let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.create_external_storage(&backend, false).unwrap(); + Arc::new(inner) + }; + + // test do_read_kv_file() + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let output = importer + .do_read_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + + assert_eq!(CacheKvFile::Mem(Arc::new(buff.clone())), output); + + // Do not shrint nothing. + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // drop the refcnt + drop(output); + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // set expired instance in Dashmap + for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(61)); + } + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, buff.len()); + assert!(importer.file_locks.is_empty()); + } + + #[test] + fn test_read_kv_files_from_external_storage() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.create_external_storage(&backend, false).unwrap(); + Arc::new(inner) + }; + + // test read all of the file. 
+ let restore_config = external_storage_export::RestoreConfig { + expected_sha256: Some(kv_meta.get_sha256().to_vec()), + ..Default::default() + }; + + let output = importer + .read_kv_files_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + ext_storage.clone(), + &Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + assert_eq!( + buff, + output, + "we are testing addition with {} and {}", + buff.len(), + output.len() + ); + + // test read range of the file. + let (offset, len) = (5, 16); + let restore_config = external_storage_export::RestoreConfig { + range: Some((offset, len)), + ..Default::default() + }; + + let output = importer + .read_kv_files_from_external_storage( + len, + kv_meta.get_name(), + ext_storage, + &Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + assert_eq!(&buff[offset as _..(offset + len) as _], &output[..]); + } + + #[test] + fn test_do_download_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let cfg = Config { + memory_use_ratio: 0.0, + ..Default::default() + }; + let importer = + SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1).unwrap(); + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let ext_storage = { + let inner = importer.create_external_storage(&backend, false).unwrap(); + Arc::new(inner) + }; + let path = importer + .dir + .get_import_path( + format!("{}_{}", kv_meta.get_name(), kv_meta.get_range_offset()).as_str(), + ) + .unwrap(); + + // test do_download_kv_file(). 
+ assert!(importer.import_support_download()); + let output = importer + .read_from_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &backend, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + assert_eq!(*output, buff); + check_file_exists(&path.save, None); + + // test shrink nothing. + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 0); + + // set expired instance in Dashmap. + for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(601)); + } + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 1); + check_file_not_exists(&path.save, None); + } + #[test] fn test_download_file_from_external_storage_for_sst() { // creates a sample SST file. @@ -1285,7 +1820,7 @@ mod tests { #[test] fn test_download_file_from_external_storage_for_kv() { - let (_temp_dir, backend, kv_meta) = create_sample_external_kv_file().unwrap(); + let (_temp_dir, backend, kv_meta, _) = create_sample_external_kv_file().unwrap(); let (_, key_manager) = new_key_manager_for_test(); let import_dir = tempfile::tempdir().unwrap(); @@ -2081,4 +2616,79 @@ mod tests { assert_eq!(sst_reader.compression_name(), expected_compression_name); } } + + #[test] + fn test_import_support_download() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.import_support_download(), false); + + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new( + &Config { + memory_use_ratio: 0.0, + ..Default::default() + }, + import_dir, + None, + ApiVersion::V1, + ) + .unwrap(); + assert_eq!(importer.import_support_download(), true); + } + + #[test] + fn test_inc_mem_and_check() { + // create importer object. 
+ let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() and dec_mem() successfully. + let meta = KvMeta { + length: 100, + ..Default::default() + }; + let check = importer.inc_mem_and_check(&meta); + assert!(check); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), meta.get_length()); + + importer.dec_mem(meta.get_length()); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() failed. + let meta = KvMeta { + length: u64::MAX, + ..Default::default() + }; + let check = importer.inc_mem_and_check(&meta); + assert!(!check); + } + + #[test] + fn test_dashmap_lock() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + + let key = "file1"; + let value = (CacheKvFile::Mem(Arc::default()), Instant::now()); + let lock = importer.file_locks.entry(key.to_string()).or_insert(value); + + // test locked by try_entry() + let lock2 = importer.file_locks.try_entry(key.to_string()); + assert!(lock2.is_none()); + let lock2 = importer.file_locks.try_get(key); + assert!(lock2.is_locked()); + + // test unlocked by entry() + drop(lock); + let v = importer.file_locks.get(key).unwrap(); + assert_eq!(v.0.ref_count(), 1); + + let _buff = v.0.clone(); + assert_eq!(v.0.ref_count(), 2); + } } diff --git a/components/tikv_util/src/codec/stream_event.rs b/components/tikv_util/src/codec/stream_event.rs index b44d239197b..5b00cad6372 100644 --- a/components/tikv_util/src/codec/stream_event.rs +++ b/components/tikv_util/src/codec/stream_event.rs @@ -16,8 +16,8 @@ pub trait Iterator { fn value(&self) -> &[u8]; } -pub struct EventIterator { - buf: Vec, +pub struct EventIterator<'a> { + buf: &'a [u8], offset: usize, key_offset: usize, value_offset: usize, @@ -25,8 +25,8 @@ pub struct 
EventIterator { value_len: usize, } -impl EventIterator { - pub fn new(buf: Vec) -> EventIterator { +impl EventIterator<'_> { + pub fn new(buf: &[u8]) -> EventIterator<'_> { EventIterator { buf, offset: 0, @@ -44,7 +44,7 @@ impl EventIterator { } } -impl Iterator for EventIterator { +impl Iterator for EventIterator<'_> { fn next(&mut self) -> Result<()> { if self.valid() { self.key_len = self.get_size() as usize; @@ -141,7 +141,7 @@ mod tests { vals.push(val); } - let mut iter = EventIterator::new(event); + let mut iter = EventIterator::new(&event); let mut index = 0_usize; loop { diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index a0d2ab5f4ee..b28f745267e 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,6 +5,7 @@ use std::{ future::Future, path::PathBuf, sync::{Arc, Mutex}, + time::Duration, }; use collections::HashSet; @@ -39,6 +40,7 @@ use tikv_util::{ sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; +use tokio::time::sleep; use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; @@ -82,7 +84,7 @@ where ) -> ImportSstService { let props = tikv_util::thread_group::current_properties(); let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) + .pool_size(cfg.num_threads + 1) .name_prefix("sst-importer") .after_start_wrapper(move || { tikv_util::thread_group::set_properties(props.clone()); @@ -93,6 +95,8 @@ where .create() .unwrap(); importer.start_switch_mode_check(&threads, engine.clone()); + threads.spawn_ok(Self::tick(importer.clone())); + ImportSstService { cfg, engine, @@ -105,6 +109,13 @@ where } } + async fn tick(importer: Arc) { + loop { + sleep(Duration::from_secs(10)).await; + importer.shrink_by_tick(); + } + } + fn acquire_lock(task_slots: &Arc>>, meta: &SstMeta) -> Result { let mut slots = task_slots.lock().unwrap(); let p = sst_meta_to_path(meta)?; @@ -462,6 +473,11 @@ where let mut req_default_size = 0_u64; let mut req_write_size = 0_u64; let mut range: Option = 
None; + let ext_storage = { + let inner = + importer.create_external_storage(req.get_storage_backend(), false)?; + Arc::from(inner) + }; for (i, meta) in metas.iter().enumerate() { let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { @@ -480,14 +496,19 @@ where context.clone(), ); - let temp_file = - importer.do_download_kv_file(meta, req.get_storage_backend(), &limiter)?; + let buff = importer.read_from_kv_file( + meta, + &rules[i], + Arc::clone(&ext_storage), + req.get_storage_backend(), + &limiter, + )?; let r: Option = importer.do_apply_kv_file( meta.get_start_key(), meta.get_end_key(), + meta.get_start_ts(), meta.get_restore_ts(), - temp_file, - &rules[i], + buff, &mut build_req_fn, )?; diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 93c07f2f411..ff01788c370 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -765,6 +765,7 @@ fn test_serde_custom_tikv_config() { num_threads: 123, stream_channel_window: 123, import_mode_timeout: ReadableDuration::secs(1453), + memory_use_ratio: 0.3, }; value.panic_when_unexpected_key_or_data = true; value.gc = GcConfig { From e52eb4d59c796a1f585c9d26660463d0e95a9d61 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Thu, 1 Dec 2022 11:06:01 +0800 Subject: [PATCH 375/676] fix the issue that TiKV starts failed. 
(#13863) close tikv/tikv#13862 Signed-off-by: joccau --- src/import/sst_service.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index b28f745267e..2bf0226136f 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,6 +5,7 @@ use std::{ future::Future, path::PathBuf, sync::{Arc, Mutex}, + thread::sleep, time::Duration, }; @@ -40,7 +41,6 @@ use tikv_util::{ sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; -use tokio::time::sleep; use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; @@ -95,7 +95,8 @@ where .create() .unwrap(); importer.start_switch_mode_check(&threads, engine.clone()); - threads.spawn_ok(Self::tick(importer.clone())); + let importer_clone = importer.clone(); + threads.spawn_ok(async { Self::tick(importer_clone) }); ImportSstService { cfg, @@ -109,9 +110,9 @@ where } } - async fn tick(importer: Arc) { + fn tick(importer: Arc) { loop { - sleep(Duration::from_secs(10)).await; + sleep(Duration::from_secs(10)); importer.shrink_by_tick(); } } From 0f1d45a8e6ff420ee76b08b43226b801c7d033a8 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 1 Dec 2022 13:06:01 +0800 Subject: [PATCH 376/676] tikv_kv: introduce raft extension (#13864) ref tikv/tikv#13827 So anything related to raft will call raft extension instead of router. This makes it easier to introduce new raftstore implementations. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/raftstore/src/store/region_meta.rs | 20 +- components/server/src/server.rs | 37 ++-- components/test_raftstore/src/server.rs | 28 +-- components/test_storage/src/sync_storage.rs | 8 +- components/tikv_kv/Cargo.toml | 1 + components/tikv_kv/src/lib.rs | 14 ++ components/tikv_kv/src/mock_engine.rs | 5 + components/tikv_kv/src/raft_extension.rs | 69 +++++++ components/tikv_kv/src/rocksdb_engine.rs | 32 +++- src/server/gc_worker/gc_worker.rs | 85 ++------- src/server/raft_client.rs | 109 ++++------- src/server/{raftkv.rs => raftkv/mod.rs} | 28 ++- src/server/raftkv/raft_extension.rs | 177 ++++++++++++++++++ src/server/resolve.rs | 44 ++--- src/server/server.rs | 53 +++--- src/server/service/debug.rs | 117 ++---------- src/server/service/kv.rs | 146 +++++---------- src/server/snap.rs | 38 +--- src/server/tablet_snap.rs | 38 +--- src/server/transport.rs | 37 ++-- tests/failpoints/cases/test_gc_metrics.rs | 3 - .../integrations/config/dynamic/gc_worker.rs | 10 +- tests/integrations/config/dynamic/snap.rs | 3 +- tests/integrations/server/raft_client.rs | 38 ++-- 25 files changed, 578 insertions(+), 563 deletions(-) create mode 100644 components/tikv_kv/src/raft_extension.rs rename src/server/{raftkv.rs => raftkv/mod.rs} (96%) create mode 100644 src/server/raftkv/raft_extension.rs diff --git a/Cargo.lock b/Cargo.lock index f1d02f06af9..1ccf961796e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6453,6 +6453,7 @@ dependencies = [ "pd_client", "prometheus", "prometheus-static-metric", + "raft", "raftstore", "slog", "slog-global", diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 0370c7604ec..7de687e9dbb 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -60,7 +60,7 @@ pub struct RaftHardState { pub commit: u64, } -#[derive(Debug, Copy, Clone, Serialize, Deserialize)] 
+#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] pub enum RaftStateRole { Follower, Candidate, @@ -178,12 +178,27 @@ pub struct RegionPeer { pub id: u64, pub store_id: u64, pub role: RaftPeerRole, + pub is_witness: bool, } impl PartialEq for RegionPeer { #[inline] fn eq(&self, other: &metapb::Peer) -> bool { - self.id == other.id && self.store_id == other.store_id && self.role == other.role + // May not be sufficent, but always correct. + let s: metapb::Peer = (*self).into(); + s == *other + } +} + +impl From for metapb::Peer { + fn from(p: RegionPeer) -> Self { + metapb::Peer { + id: p.id, + store_id: p.store_id, + role: p.role.into(), + is_witness: p.is_witness, + ..Default::default() + } } } @@ -247,6 +262,7 @@ impl RegionMeta { id: peer.get_id(), store_id: peer.get_store_id(), role: peer.get_role().into(), + is_witness: peer.is_witness, }); } let merge_state = if local_state.has_merge_state() { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 625db3e951f..3ce38d0c79e 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -76,8 +76,8 @@ use raftstore::{ RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, - AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, + SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, RaftRouterCompactedEventSender, }; @@ -221,8 +221,7 @@ struct TikvServer { flow_info_sender: Option>, flow_info_receiver: Option>, system: Option>, - resolver: resolve::PdStoreAddrResolver, - state: Arc>, + resolver: Option, store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. 
encryption_key_manager: Option>, @@ -260,8 +259,7 @@ struct Servers { backup_stream_scheduler: Option>, } -type LocalServer = - Server, resolve::PdStoreAddrResolver, LocalRaftKv>; +type LocalServer = Server>; type LocalRaftKv = RaftKv>; impl TikvServer @@ -323,8 +321,6 @@ where let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - let (resolver, state) = - resolve::new_resolver(Arc::clone(&pd_client), &background_worker, router.clone()); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -375,8 +371,7 @@ where pd_client, router, system: Some(system), - resolver, - state, + resolver: None, store_path, snap_mgr: None, encryption_key_manager: None, @@ -655,14 +650,10 @@ where fn init_gc_worker( &mut self, - ) -> GcWorker< - RaftKv>, - RaftRouter, - > { + ) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), - self.router.clone(), self.flow_info_sender.take().unwrap(), self.config.gc.clone(), self.pd_client.feature_gate().clone(), @@ -823,6 +814,13 @@ where )), ); + let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.background_worker, + storage.get_engine().raft_extension().clone(), + ); + self.resolver = Some(resolver); + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) .register(self.coprocessor_host.as_mut().unwrap()); @@ -930,7 +928,7 @@ where raft_store.clone(), self.config.storage.api_version(), self.pd_client.clone(), - self.state.clone(), + state, self.background_worker.clone(), Some(health_service.clone()), None, @@ -953,8 +951,7 @@ where Arc::clone(&self.quota_limiter), ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), - self.router.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), snap_mgr.clone(), gc_worker.clone(), check_leader_scheduler, @@ -1203,7 +1200,7 @@ where let debug_service = DebugService::new( engines.engines.clone(), 
servers.server.get_debug_thread_pool().clone(), - self.router.clone(), + engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers @@ -1242,7 +1239,7 @@ where .start( servers.node.id(), self.pd_client.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), self.security_mgr.clone(), &self.config.pessimistic_txn, ) diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 1b532932b30..ea9868afdbd 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -33,7 +33,7 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CoprocessorHost, RegionInfoAccessor}, errors::Error as RaftError, - router::{LocalReadRouter, RaftStoreBlackHole, RaftStoreRouter, ServerRaftStoreRouter}, + router::{LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, @@ -64,7 +64,7 @@ use tikv::{ }, storage::{ self, - kv::SnapContext, + kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, }, @@ -84,10 +84,11 @@ use super::*; use crate::Config; type SimulateStoreTransport = SimulateTransport>; -type SimulateServerTransport = - SimulateTransport>; pub type SimulateEngine = RaftKv; +type SimulateRaftExtension = ::RaftExtension; +type SimulateServerTransport = + SimulateTransport>; #[derive(Default, Clone)] pub struct AddressMap { @@ -125,12 +126,12 @@ impl StoreAddrResolver for AddressMap { struct ServerMeta { node: Node, - server: Server, + server: Server, sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: RaftRouter, raw_apply_router: ApplyRouter, - gc_worker: GcWorker, SimulateStoreTransport>, + gc_worker: GcWorker>, rts_worker: Option>, rsmeter_cleanup: Box, } @@ -152,7 +153,7 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - 
raft_client: RaftClient, + raft_client: RaftClient, concurrency_managers: HashMap, env: Arc, pub causal_ts_providers: HashMap>, @@ -176,7 +177,7 @@ impl ServerCluster { Arc::default(), security_mgr.clone(), map.clone(), - RaftStoreBlackHole, + FakeExtension, worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); @@ -218,7 +219,7 @@ impl ServerCluster { pub fn get_gc_worker( &self, node_id: u64, - ) -> &GcWorker, SimulateStoreTransport> { + ) -> &GcWorker> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -334,7 +335,6 @@ impl ServerCluster { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine.clone(), - sim_router.clone(), tx, cfg.gc.clone(), Default::default(), @@ -353,7 +353,7 @@ impl ServerCluster { let rts_endpoint = resolved_ts::Endpoint::new( &cfg.resolved_ts, rts_worker.scheduler(), - raft_router.clone(), + raft_router, store_meta.clone(), self.pd_client.clone(), concurrency_manager.clone(), @@ -401,6 +401,7 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); + let extension = engine.raft_extension().clone(); let store = create_raft_storage::<_, _, _, F, _>( engine, &cfg.storage, @@ -445,7 +446,7 @@ impl ServerCluster { // Create pd client, snapshot manager, server. 
let (resolver, state) = - resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, router.clone()); + resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, extension.clone()); let snap_mgr = SnapManagerBuilder::default() .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) @@ -483,7 +484,7 @@ impl ServerCluster { let debug_service = DebugService::new( engines.clone(), debug_thread_handle, - raft_router, + extension, ConfigController::default(), ); @@ -520,7 +521,6 @@ impl ServerCluster { store.clone(), copr.clone(), copr_v2.clone(), - sim_router.clone(), resolver.clone(), snap_mgr.clone(), gc_worker.clone(), diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index fa53688ea75..3d6e1e139e5 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -12,10 +12,7 @@ use kvproto::{ kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}, metapb, }; -use raftstore::{ - coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}, - router::RaftStoreBlackHole, -}; +use raftstore::coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}; use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ @@ -106,7 +103,7 @@ impl SyncTestStorageBuilder { /// Only used for test purpose. 
#[derive(Clone)] pub struct SyncTestStorage { - gc_worker: GcWorker, + gc_worker: GcWorker, store: Storage, } @@ -123,7 +120,6 @@ impl SyncTestStorage { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( storage.get_engine(), - RaftStoreBlackHole, tx, config, Default::default(), diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 6ee74371674..8aa64d0def6 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -41,6 +41,7 @@ log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 07cae3ace65..f78b2243331 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -8,6 +8,7 @@ #![feature(bound_map)] #![feature(min_specialization)] #![feature(type_alias_impl_trait)] +#![feature(associated_type_defaults)] #[macro_use(fail_point)] extern crate fail; @@ -18,6 +19,7 @@ mod btree_engine; mod cursor; pub mod metrics; mod mock_engine; +mod raft_extension; mod raftstore_impls; mod rocksdb_engine; mod stats; @@ -55,6 +57,7 @@ pub use self::{ btree_engine::{BTreeEngine, BTreeEngineIterator, BTreeEngineSnapshot}, cursor::{Cursor, CursorBuilder}, mock_engine::{ExpectedWrite, MockEngineBuilder}, + raft_extension::{FakeExtension, RaftExtension}, rocksdb_engine::{RocksEngine, RocksSnapshot}, stats::{ CfStatistics, FlowStatistics, FlowStatsReporter, StageLatencyStats, Statistics, @@ -306,6 +309,12 @@ pub trait Engine: Send + Clone + 'static { /// Currently, only 
multi-rocksdb version will return `None`. fn kv_engine(&self) -> Option; + type RaftExtension: raft_extension::RaftExtension = FakeExtension; + /// Get the underlying raft extension. + fn raft_extension(&self) -> &Self::RaftExtension { + unimplemented!() + } + /// Write modifications into internal local engine directly. /// /// region_modifies records each region's modifications. @@ -418,6 +427,11 @@ pub trait Engine: Send + Clone + 'static { fn end_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { Box::pin(futures::future::ready(Ok(()))) } + + /// Application may operate on local engine directly, the method is to hint + /// the engine there is probably a notable difference in range, so + /// engine may update its statistics. + fn hint_change_in_range(&self, _start_key: Vec, _end_key: Vec) {} } /// A Snapshot is a consistent view of the underlying engine at a given point in diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index f3d89940f4e..dc812e84d93 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -153,6 +153,11 @@ impl Engine for MockEngine { self.base.kv_engine() } + type RaftExtension = ::RaftExtension; + fn raft_extension(&self) -> &Self::RaftExtension { + self.base.raft_extension() + } + fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { self.base.modify_on_kv_engine(region_modifies) } diff --git a/components/tikv_kv/src/raft_extension.rs b/components/tikv_kv/src/raft_extension.rs new file mode 100644 index 00000000000..26c9e687ef6 --- /dev/null +++ b/components/tikv_kv/src/raft_extension.rs @@ -0,0 +1,69 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! TiKV uses raft under the hook to provide consistency between replicas. +//! Though technically, `Engine` trait should hide the details of raft, but in +//! some cases it's unavoidable to access raft interface somehow. This module +//! 
supports the access pattern via extension. + +use futures::future::BoxFuture; +use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_serverpb::RaftMessage, +}; +use raft::SnapshotStatus; +use raftstore::store::region_meta::RegionMeta; + +use crate::Result; + +/// An interface to provide direct access to raftstore layer. +pub trait RaftExtension: Clone + Send { + /// Feed the message to the raft group. + /// + /// If it's a `key_message` is true, it will log a warning if the message + /// failed to send. + fn feed(&self, _msg: RaftMessage, _key_message: bool) {} + + /// Retport the message is rejected by the remote peer. + fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) {} + + /// Report the target peer is unreachable. + fn report_peer_unreachable(&self, _region_id: u64, _to_peer_id: u64) {} + + /// Report the target store is unreachable. + fn report_store_unreachable(&self, _store_id: u64) {} + + /// Report the status of snapshot. + fn report_snapshot_status(&self, _region_id: u64, _to_peer_id: u64, _status: SnapshotStatus) {} + + /// Report the address of a store is resolved. + fn report_resolved(&self, _store_id: u64, _group_id: u64) {} + + /// Split the region with the given keys. + /// + /// Use `BoxFuture` for simplicity as it's not performance critical path. + fn split( + &self, + _region_id: u64, + _region_epoch: RegionEpoch, + _split_keys: Vec>, + _source: String, + ) -> BoxFuture<'static, Result>> { + Box::pin(async move { Err(box_err!("raft split is not supported")) }) + } + + /// Get the region meta of the given region. + fn query_region(&self, _region_id: u64) -> BoxFuture<'static, Result> { + Box::pin(async move { Err(box_err!("query region is not supported")) }) + } + + /// Ask the raft group to do a consistency check. 
+ fn check_consistency(&self, _region_id: u64) -> BoxFuture<'static, Result<()>> { + Box::pin(async move { Err(box_err!("consistency check is not supported")) }) + } +} + +/// An extension that does nothing or panic on all operations. +#[derive(Clone)] +pub struct FakeExtension; + +impl RaftExtension for FakeExtension {} diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 565ea0accaa..26e2c735254 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -34,7 +34,7 @@ use super::{ write_modifies, Callback, DummySnapshotExt, Engine, Error, ErrorInner, Iterator as EngineIterator, Modify, Result, SnapContext, Snapshot, WriteData, }; -use crate::{OnAppliedCb, WriteEvent}; +use crate::{FakeExtension, OnAppliedCb, RaftExtension, WriteEvent}; // Duplicated in test_engine_builder const TEMP_DIR: &str = ""; @@ -87,12 +87,26 @@ impl Drop for RocksEngineCore { /// /// This is intended for **testing use only**. 
#[derive(Clone)] -pub struct RocksEngine { +pub struct RocksEngine { core: Arc>, sched: Scheduler, engines: Engines, not_leader: Arc, coprocessor: CoprocessorHost, + ext: RE, +} + +impl RocksEngine { + pub fn with_raft_extension(self, ext: NRE) -> RocksEngine { + RocksEngine { + core: self.core, + sched: self.sched, + engines: self.engines, + not_leader: self.not_leader, + coprocessor: self.coprocessor, + ext, + } + } } impl RocksEngine { @@ -132,9 +146,12 @@ impl RocksEngine { not_leader: Arc::new(AtomicBool::new(false)), engines, coprocessor: CoprocessorHost::default(), + ext: FakeExtension, }) } +} +impl RocksEngine { pub fn trigger_not_leader(&self) { self.not_leader.store(true, Ordering::SeqCst); } @@ -196,13 +213,13 @@ impl RocksEngine { } } -impl Display for RocksEngine { +impl Display for RocksEngine { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "RocksDB") } } -impl Debug for RocksEngine { +impl Debug for RocksEngine { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -212,7 +229,7 @@ impl Debug for RocksEngine { } } -impl Engine for RocksEngine { +impl Engine for RocksEngine { type Snap = Arc; type Local = BaseRocksEngine; @@ -220,6 +237,11 @@ impl Engine for RocksEngine { Some(self.engines.kv.clone()) } + type RaftExtension = RE; + fn raft_extension(&self) -> &Self::RaftExtension { + &self.ext + } + fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { let modifies = region_modifies.into_values().flatten().collect(); write_modifies(&self.engines.kv, modifies) diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 0a162a58230..1ccac8860c6 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -24,7 +24,7 @@ use file_system::{IoType, WithIoType}; use futures::executor::block_on; use kvproto::{kvrpcpb::Context, metapb::Region}; use pd_client::{FeatureGate, PdClient}; -use raftstore::{coprocessor::RegionInfoProvider, 
router::RaftStoreRouter, store::msg::StoreMsg}; +use raftstore::coprocessor::RegionInfoProvider; use tikv_kv::{CfStatistics, CursorBuilder, Modify, SnapContext}; use tikv_util::{ config::{Tracker, VersionTrack}, @@ -174,15 +174,10 @@ where } /// Used to perform GC operations on the engine. -pub struct GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +pub struct GcRunner { store_id: u64, engine: E, - raft_store_router: RR, flow_info_sender: Sender, /// Used to limit the write flow of GC. @@ -283,15 +278,10 @@ fn init_snap_ctx(store_id: u64, region: &Region) -> Context { ctx } -impl GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +impl GcRunner { pub fn new( store_id: u64, engine: E, - raft_store_router: RR, flow_info_sender: Sender, cfg_tracker: Tracker, cfg: GcConfig, @@ -304,7 +294,6 @@ where Self { store_id, engine, - raft_store_router, flow_info_sender, limiter, cfg, @@ -797,15 +786,10 @@ where .send(FlowInfo::AfterUnsafeDestroyRange(ctx.region_id)) .unwrap(); - self.raft_store_router - .send_store_msg(StoreMsg::ClearRegionSizeInRange { - start_key: start_key.as_encoded().to_vec(), - end_key: end_key.as_encoded().to_vec(), - }) - .unwrap_or_else(|e| { - // Warn and ignore it. - warn!("unsafe destroy range: failed sending ClearRegionSizeInRange"; "err" => ?e); - }); + self.engine.hint_change_in_range( + start_key.as_encoded().to_vec(), + end_key.as_encoded().to_vec(), + ); } else { let cfs = &[CF_LOCK, CF_DEFAULT, CF_WRITE]; let keys = vec![start_key.clone(), end_key.clone()]; @@ -889,11 +873,7 @@ where } } -impl Runnable for GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +impl Runnable for GcRunner { type Task = GcTask; #[inline] @@ -1072,16 +1052,12 @@ pub fn sync_gc( } /// Used to schedule GC operations. -pub struct GcWorker +pub struct GcWorker where E: Engine, - RR: RaftStoreRouter + 'static, { engine: E, - /// `raft_store_router` is useful to signal raftstore clean region size - /// informations. 
- raft_store_router: RR, /// Used to signal unsafe destroy range is executed. flow_info_sender: Option>, region_info_provider: Arc, @@ -1098,18 +1074,13 @@ where feature_gate: FeatureGate, } -impl Clone for GcWorker -where - E: Engine, - RR: RaftStoreRouter, -{ +impl Clone for GcWorker { #[inline] fn clone(&self) -> Self { self.refs.fetch_add(1, Ordering::SeqCst); Self { engine: self.engine.clone(), - raft_store_router: self.raft_store_router.clone(), flow_info_sender: self.flow_info_sender.clone(), config_manager: self.config_manager.clone(), refs: self.refs.clone(), @@ -1122,11 +1093,7 @@ where } } -impl Drop for GcWorker -where - E: Engine, - RR: RaftStoreRouter + 'static, -{ +impl Drop for GcWorker { #[inline] fn drop(&mut self) { let refs = self.refs.fetch_sub(1, Ordering::SeqCst); @@ -1142,25 +1109,19 @@ where } } -impl GcWorker -where - E: Engine, - RR: RaftStoreRouter, -{ +impl GcWorker { pub fn new( engine: E, - raft_store_router: RR, flow_info_sender: Sender, cfg: GcConfig, feature_gate: FeatureGate, region_info_provider: Arc, - ) -> GcWorker { + ) -> Self { let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); let worker = worker_builder.create().lazy_build("gc-worker"); let worker_scheduler = worker.scheduler(); GcWorker { engine, - raft_store_router, flow_info_sender: Some(flow_info_sender), config_manager: GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))), refs: Arc::new(AtomicUsize::new(1)), @@ -1211,7 +1172,6 @@ where let runner = GcRunner::new( store_id, self.engine.clone(), - self.raft_store_router.clone(), self.flow_info_sender.take().unwrap(), self.config_manager.0.clone().tracker("gc-woker".to_owned()), self.config_manager.value().clone(), @@ -1468,12 +1428,9 @@ mod tests { use futures::executor::block_on; use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; use raft::StateRole; - use raftstore::{ - coprocessor::{ - region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, - CoprocessorHost, 
RegionChangeEvent, - }, - router::RaftStoreBlackHole, + use raftstore::coprocessor::{ + region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, + CoprocessorHost, RegionChangeEvent, }; use tempfile::Builder; use tikv_kv::Snapshot; @@ -1620,7 +1577,6 @@ mod tests { let mut gc_worker = GcWorker::new( engine, - RaftStoreBlackHole, tx, GcConfig::default(), gate, @@ -1797,7 +1753,6 @@ mod tests { let mut gc_worker = GcWorker::new( prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, @@ -1889,7 +1844,6 @@ mod tests { let mut runner = GcRunner::new( store_id, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -1952,7 +1906,6 @@ mod tests { let mut runner = GcRunner::new( store_id, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2054,7 +2007,6 @@ mod tests { let mut runner = GcRunner::new( 1, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2183,7 +2135,6 @@ mod tests { let mut gc_worker = GcWorker::new( engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), gate, @@ -2313,7 +2264,7 @@ mod tests { ) -> ( MultiRocksEngine, Arc, - GcRunner, + GcRunner, Vec, mpsc::Receiver, ) { @@ -2369,7 +2320,6 @@ mod tests { let gc_runner = GcRunner::new( store_id, engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2548,7 +2498,6 @@ mod tests { let mut gc_runner = GcRunner::new( store_id, engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 0230174fb42..fa12600bb98 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -3,7 +3,7 @@ use std::{ collections::VecDeque, ffi::CString, - marker::{PhantomData, Unpin}, + 
marker::Unpin, mem, pin::Pin, result, @@ -16,7 +16,6 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::queue::ArrayQueue; -use engine_traits::KvEngine; use futures::{ channel::oneshot, compat::Future01CompatExt, @@ -35,8 +34,9 @@ use kvproto::{ }; use protobuf::Message; use raft::SnapshotStatus; -use raftstore::{errors::DiscardReason, router::RaftStoreRouter}; +use raftstore::errors::DiscardReason; use security::SecurityManager; +use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, lru::LruCache, @@ -346,18 +346,16 @@ impl Buffer for MessageBuffer { } /// Reporter reports whether a snapshot is sent successfully. -struct SnapshotReporter { - raft_router: T, - engine: PhantomData, +struct SnapshotReporter { + raft_router: R, region_id: u64, to_peer_id: u64, to_store_id: u64, } -impl SnapshotReporter +impl SnapshotReporter where - T: RaftStoreRouter + 'static, - E: KvEngine, + R: RaftExtension + 'static, { pub fn report(&self, status: SnapshotStatus) { debug!( @@ -374,43 +372,21 @@ where .inc(); } - if let Err(e) = - self.raft_router - .report_snapshot_status(self.region_id, self.to_peer_id, status) - { - error!(?e; - "report snapshot to peer failes"; - "to_peer_id" => self.to_peer_id, - "to_store_id" => self.to_store_id, - "region_id" => self.region_id, - ); - } + self.raft_router + .report_snapshot_status(self.region_id, self.to_peer_id, status); } } -fn report_unreachable(router: &R, msg: &RaftMessage) -where - R: RaftStoreRouter, - E: KvEngine, -{ +fn report_unreachable(router: &impl RaftExtension, msg: &RaftMessage) { let to_peer = msg.get_to_peer(); if msg.get_message().has_snapshot() { let store = to_peer.store_id.to_string(); REPORT_FAILURE_MSG_COUNTER .with_label_values(&["snapshot", &*store]) .inc(); - let res = router.report_snapshot_status(msg.region_id, to_peer.id, SnapshotStatus::Failure); - if let Err(e) = res { - error!( - ?e; - "reporting snapshot to peer fails"; - "to_peer_id" => to_peer.id, - "to_store_id" => 
to_peer.store_id, - "region_id" => msg.region_id, - ); - } + router.report_snapshot_status(msg.region_id, to_peer.id, SnapshotStatus::Failure); } - let _ = router.report_unreachable(msg.region_id, to_peer.id); + router.report_peer_unreachable(msg.region_id, to_peer.id); } fn grpc_error_is_unimplemented(e: &grpcio::Error) -> bool { @@ -422,7 +398,7 @@ fn grpc_error_is_unimplemented(e: &grpcio::Error) -> bool { } /// Struct tracks the lifetime of a `raft` or `batch_raft` RPC. -struct AsyncRaftSender { +struct AsyncRaftSender { sender: ClientCStreamSender, queue: Arc, buffer: B, @@ -430,23 +406,20 @@ struct AsyncRaftSender { snap_scheduler: Scheduler, addr: String, flush_timeout: Option, - _engine: PhantomData, } -impl AsyncRaftSender +impl AsyncRaftSender where - R: RaftStoreRouter + 'static, + R: RaftExtension + 'static, B: Buffer, - E: KvEngine, { - fn new_snapshot_reporter(&self, msg: &RaftMessage) -> SnapshotReporter { + fn new_snapshot_reporter(&self, msg: &RaftMessage) -> SnapshotReporter { let region_id = msg.get_region_id(); let to_peer_id = msg.get_to_peer().get_id(); let to_store_id = msg.get_to_peer().get_store_id(); SnapshotReporter { raft_router: self.router.clone(), - engine: PhantomData, region_id, to_peer_id, to_store_id, @@ -499,11 +472,10 @@ where } } -impl Future for AsyncRaftSender +impl Future for AsyncRaftSender where - R: RaftStoreRouter + Unpin + 'static, + R: RaftExtension + Unpin + 'static, B: Buffer + Unpin, - E: KvEngine, { type Output = grpcio::Result<()>; @@ -564,18 +536,17 @@ enum RaftCallRes { Disconnected, } -struct RaftCall { - sender: AsyncRaftSender, +struct RaftCall { + sender: AsyncRaftSender, receiver: ClientCStreamReceiver, lifetime: Option>, store_id: u64, } -impl RaftCall +impl RaftCall where - R: RaftStoreRouter + Unpin + 'static, + R: RaftExtension + Unpin + 'static, B: Buffer + Unpin, - E: KvEngine, { async fn poll(&mut self) { let res = futures::join!(&mut self.sender, &mut self.receiver); @@ -640,18 +611,16 @@ impl 
ConnectionBuilder { /// StreamBackEnd watches lifetime of a connection and handles reconnecting, /// spawn new RPC. -struct StreamBackEnd { +struct StreamBackEnd { store_id: u64, queue: Arc, builder: ConnectionBuilder, - engine: PhantomData, } -impl StreamBackEnd +impl StreamBackEnd where S: StoreAddrResolver, - R: RaftStoreRouter + Unpin + 'static, - E: KvEngine, + R: RaftExtension + Unpin + 'static, { fn resolve(&self) -> impl Future> { let (tx, rx) = oneshot::channel(); @@ -735,7 +704,6 @@ where snap_scheduler: self.builder.snap_scheduler.clone(), addr, flush_timeout: None, - _engine: PhantomData::, }, receiver: batch_stream, lifetime: Some(tx), @@ -760,7 +728,6 @@ where snap_scheduler: self.builder.snap_scheduler.clone(), addr, flush_timeout: None, - _engine: PhantomData::, }, receiver: stream, lifetime: Some(tx), @@ -802,14 +769,13 @@ async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Option) /// 4. fallback to legacy API if incompatible /// /// Every failure during the process should trigger retry automatically. 
-async fn start( - back_end: StreamBackEnd, +async fn start( + back_end: StreamBackEnd, conn_id: usize, pool: Arc>, ) where S: StoreAddrResolver + Send, - R: RaftStoreRouter + Unpin + Send + 'static, - E: KvEngine, + R: RaftExtension + Unpin + Send + 'static, { let mut last_wake_time = None; let mut first_time = true; @@ -865,7 +831,7 @@ async fn start( back_end .builder .router - .broadcast_unreachable(back_end.store_id); + .report_store_unreachable(back_end.store_id); } continue; } else { @@ -896,7 +862,7 @@ async fn start( back_end .builder .router - .broadcast_unreachable(back_end.store_id); + .report_store_unreachable(back_end.store_id); addr_channel = None; first_time = false; } @@ -955,24 +921,22 @@ struct CachedQueue { /// } /// raft_client.flush(); /// ``` -pub struct RaftClient { +pub struct RaftClient { pool: Arc>, cache: LruCache<(u64, usize), CachedQueue>, need_flush: Vec<(u64, usize)>, full_stores: Vec<(u64, usize)>, future_pool: Arc>, builder: ConnectionBuilder, - engine: PhantomData, last_hash: (u64, u64), } -impl RaftClient +impl RaftClient where S: StoreAddrResolver + Send + 'static, - R: RaftStoreRouter + Unpin + Send + 'static, - E: KvEngine, + R: RaftExtension + Unpin + Send + 'static, { - pub fn new(builder: ConnectionBuilder) -> RaftClient { + pub fn new(builder: ConnectionBuilder) -> Self { let future_pool = Arc::new( yatp::Builder::new(thd_name!("raft-stream")) .max_thread_count(1) @@ -985,7 +949,6 @@ where full_stores: vec![], future_pool, builder, - engine: PhantomData::, last_hash: (0, 0), } } @@ -1018,7 +981,6 @@ where store_id, queue: queue.clone(), builder: self.builder.clone(), - engine: PhantomData::, }; self.future_pool .spawn(start(back_end, conn_id, self.pool.clone())); @@ -1170,7 +1132,7 @@ where } } -impl Clone for RaftClient +impl Clone for RaftClient where S: Clone, R: Clone, @@ -1183,7 +1145,6 @@ where full_stores: vec![], future_pool: self.future_pool.clone(), builder: self.builder.clone(), - engine: PhantomData::, 
last_hash: (0, 0), } } diff --git a/src/server/raftkv.rs b/src/server/raftkv/mod.rs similarity index 96% rename from src/server/raftkv.rs rename to src/server/raftkv/mod.rs index b6890262007..6c7169d043c 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv/mod.rs @@ -1,5 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. +mod raft_extension; + // #[PerformanceCriticalPath] use std::{ borrow::Cow, @@ -34,6 +36,7 @@ use raft::{ eraftpb::{self, MessageType}, StateRole, }; +pub use raft_extension::RaftRouterWrap; use raftstore::{ coprocessor::{ dispatcher::BoxReadIndexObserver, Coprocessor, CoprocessorHost, ReadIndexObserver, @@ -42,7 +45,7 @@ use raftstore::{ router::{LocalReadRouter, RaftStoreRouter}, store::{ self, Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, - RegionSnapshot, WriteResponse, + RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; @@ -294,7 +297,7 @@ where E: KvEngine, S: RaftStoreRouter + LocalReadRouter + 'static, { - router: S, + router: RaftRouterWrap, engine: E, txn_extra_scheduler: Option>, region_leaders: Arc>>, @@ -308,7 +311,7 @@ where /// Create a RaftKv using specified configuration. pub fn new(router: S, engine: E, region_leaders: Arc>>) -> RaftKv { RaftKv { - router, + router: RaftRouterWrap::new(router), engine, txn_extra_scheduler: None, region_leaders, @@ -359,6 +362,12 @@ where Some(self.engine.clone()) } + type RaftExtension = RaftRouterWrap; + #[inline] + fn raft_extension(&self) -> &Self::RaftExtension { + &self.router + } + fn modify_on_kv_engine( &self, mut region_modifies: HashMap>, @@ -635,7 +644,7 @@ where // and scheduling operations for this region when propose/apply before we // start the actual data flashback transaction command in the next phase. 
let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); - exec_admin(&self.router, req) + exec_admin(&*self.router, req) } fn end_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { @@ -643,7 +652,16 @@ where // in `RegionLocalState` and region's meta, and when that admin cmd is applied, // will update the memory state of the flashback let req = new_flashback_req(ctx, AdminCmdType::FinishFlashback); - exec_admin(&self.router, req) + exec_admin(&*self.router, req) + } + + fn hint_change_in_range(&self, start_key: Vec, end_key: Vec) { + self.router + .send_store_msg(StoreMsg::ClearRegionSizeInRange { start_key, end_key }) + .unwrap_or_else(|e| { + // Warn and ignore it. + warn!("unsafe destroy range: failed sending ClearRegionSizeInRange"; "err" => ?e); + }); } } diff --git a/src/server/raftkv/raft_extension.rs b/src/server/raftkv/raft_extension.rs new file mode 100644 index 00000000000..d3178842489 --- /dev/null +++ b/src/server/raftkv/raft_extension.rs @@ -0,0 +1,177 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +use futures::future::BoxFuture; +use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::RaftMessage, +}; +use raft::SnapshotStatus; +use raftstore::{ + router::RaftStoreRouter, + store::{ + region_meta::{RaftStateRole, RegionMeta}, + CasualMessage, + }, +}; +use tikv_util::future::paired_future_callback; + +use crate::storage::kv; + +#[derive(Clone)] +pub struct RaftRouterWrap { + router: S, + _phantom: PhantomData, +} + +impl RaftRouterWrap { + pub fn new(router: S) -> Self { + Self { + router, + _phantom: PhantomData, + } + } +} + +impl Deref for RaftRouterWrap { + type Target = S; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.router + } +} + +impl DerefMut for RaftRouterWrap { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.router + } +} + +impl tikv_kv::RaftExtension for RaftRouterWrap +where + S: RaftStoreRouter + 'static, + E: engine_traits::KvEngine, +{ + #[inline] + fn feed(&self, msg: RaftMessage, key_message: bool) { + let region_id = msg.get_region_id(); + let msg_ty = msg.get_message().get_msg_type(); + // Channel full and region not found are ignored unless it's a key message. 
+ if let Err(e) = self.router.send_raft_msg(msg) && key_message { + error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); + } + } + + #[inline] + fn report_reject_message(&self, region_id: u64, from_peer_id: u64) { + let m = CasualMessage::RejectRaftAppend { + peer_id: from_peer_id, + }; + let _ = self.router.send_casual_msg(region_id, m); + } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + let _ = self.router.report_unreachable(region_id, to_peer_id); + } + + #[inline] + fn report_store_unreachable(&self, store_id: u64) { + self.router.broadcast_unreachable(store_id); + } + + #[inline] + fn report_snapshot_status(&self, region_id: u64, to_peer_id: u64, status: SnapshotStatus) { + if let Err(e) = self + .router + .report_snapshot_status(region_id, to_peer_id, status) + { + error!(?e; + "report snapshot to peer failes"; + "to_peer_id" => to_peer_id, + "status" => ?status, + "region_id" => region_id, + ); + } + } + + #[inline] + fn report_resolved(&self, store_id: u64, group_id: u64) { + self.router.report_resolved(store_id, group_id); + } + + #[inline] + fn split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: String, + ) -> BoxFuture<'static, kv::Result>> { + let (cb, rx) = paired_future_callback(); + let req = CasualMessage::SplitRegion { + region_epoch, + split_keys, + callback: raftstore::store::Callback::write(cb), + source: source.into(), + }; + let res = self.router.send_casual_msg(region_id, req); + Box::pin(async move { + res?; + let mut admin_resp = box_try!(rx.await); + super::check_raft_cmd_response(&mut admin_resp.response)?; + let regions = admin_resp + .response + .mut_admin_response() + .mut_splits() + .take_regions(); + Ok(regions.into()) + }) + } + + /// Get the region meta of the given region. 
+ #[inline] + fn query_region(&self, region_id: u64) -> BoxFuture<'static, kv::Result> { + let (cb, rx) = paired_future_callback(); + let res = self + .router + .send_casual_msg(region_id, CasualMessage::AccessPeer(cb)); + Box::pin(async move { + res?; + Ok(box_try!(rx.await)) + }) + } + + /// Ask the raft group to do a consistency check. + fn check_consistency(&self, region_id: u64) -> BoxFuture<'static, kv::Result<()>> { + let region = self.query_region(region_id); + let router = self.router.clone(); + Box::pin(async move { + let meta: RegionMeta = region.await?; + let leader_id = meta.raft_status.soft_state.leader_id; + let mut leader = None; + for peer in meta.region_state.peers { + if peer.id == leader_id { + leader = Some(peer.into()); + } + } + if meta.raft_status.soft_state.raft_state != RaftStateRole::Leader { + return Err(raftstore::Error::NotLeader(region_id, leader).into()); + } + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_peer(leader.unwrap()); + req.mut_admin_request() + .set_cmd_type(AdminCmdType::ComputeHash); + let f = super::exec_admin(&router, req); + f.await + }) + } +} diff --git a/src/server/resolve.rs b/src/server/resolve.rs index acf60ae783f..c831ff28d17 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -2,15 +2,14 @@ use std::{ fmt::{self, Display, Formatter}, - marker::PhantomData, sync::{Arc, Mutex}, }; use collections::HashMap; -use engine_traits::KvEngine; use kvproto::replication_modepb::ReplicationMode; use pd_client::{take_peer_address, PdClient}; -use raftstore::{router::RaftStoreRouter, store::GlobalReplicationState}; +use raftstore::store::GlobalReplicationState; +use tikv_kv::RaftExtension; use tikv_util::{ time::Instant, worker::{Runnable, Scheduler, Worker}, @@ -52,24 +51,21 @@ struct StoreAddr { } /// A runner for resolving store addresses. 
-struct Runner +struct Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { pd_client: Arc, store_addrs: HashMap, state: Arc>, - router: RR, - engine: PhantomData, + router: R, } -impl Runner +impl Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { fn resolve(&mut self, store_id: u64) -> Result { if let Some(s) = self.store_addrs.get(&store_id) { @@ -128,11 +124,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { type Task = Task; fn run(&mut self, task: Task) { @@ -157,15 +152,14 @@ impl PdStoreAddrResolver { } /// Creates a new `PdStoreAddrResolver`. -pub fn new_resolver( +pub fn new_resolver( pd_client: Arc, worker: &Worker, - router: RR, + router: R, ) -> (PdStoreAddrResolver, Arc>) where T: PdClient + 'static, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension + 'static, { let state = Arc::new(Mutex::new(GlobalReplicationState::default())); let runner = Runner { @@ -173,7 +167,6 @@ where store_addrs: HashMap::default(), state: state.clone(), router, - engine: PhantomData, }; let scheduler = worker.start("addr-resolver", runner); let resolver = PdStoreAddrResolver::new(scheduler); @@ -190,16 +183,12 @@ impl StoreAddrResolver for PdStoreAddrResolver { #[cfg(test)] mod tests { - use std::{ - marker::PhantomData, net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, - time::Duration, - }; + use std::{net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, time::Duration}; use collections::HashMap; - use engine_test::kv::KvTestEngine; use kvproto::metapb; use pd_client::{PdClient, Result}; - use raftstore::router::RaftStoreBlackHole; + use tikv_kv::FakeExtension; use super::*; @@ -236,7 +225,7 @@ mod tests { store } - fn new_runner(store: metapb::Store) -> Runner { + fn new_runner(store: metapb::Store) -> Runner { let client = MockPdClient { start: Instant::now(), store, @@ -245,8 +234,7 
@@ mod tests { pd_client: Arc::new(client), store_addrs: HashMap::default(), state: Default::default(), - router: RaftStoreBlackHole, - engine: PhantomData, + router: FakeExtension, } } diff --git a/src/server/server.rs b/src/server/server.rs index a4d82f1e347..1921483e37b 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -13,10 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::{ - router::RaftStoreRouter, - store::{CheckLeaderTask, SnapManager}, -}; +use raftstore::store::{CheckLeaderTask, SnapManager}; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -58,8 +55,7 @@ pub const STATS_THREAD_PREFIX: &str = "transport-stats"; /// /// It hosts various internal components, including gRPC, the raftstore router /// and a snapshot worker. -pub struct Server + 'static, S: StoreAddrResolver + 'static, E: Engine> -{ +pub struct Server { env: Arc, /// A GrpcServer builder or a GrpcServer. /// @@ -68,8 +64,8 @@ pub struct Server + 'static, S: StoreAddrResolver + grpc_mem_quota: ResourceQuota, local_addr: SocketAddr, // Transport. - trans: ServerTransport, - raft_router: T, + trans: ServerTransport, + raft_router: E::RaftExtension, // For sending/receiving snapshots. 
snap_mgr: SnapManager, snap_worker: LazyWorker, @@ -83,8 +79,11 @@ pub struct Server + 'static, S: StoreAddrResolver + timer: Handle, } -impl + Unpin, S: StoreAddrResolver + 'static, E: Engine> - Server +impl Server +where + S: StoreAddrResolver + 'static, + E: Engine, + E::RaftExtension: Unpin, { #[allow(clippy::too_many_arguments)] pub fn new( @@ -94,10 +93,9 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En storage: Storage, copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, - raft_router: T, resolver: S, snap_mgr: SnapManager, - gc_worker: GcWorker, + gc_worker: GcWorker, check_leader_scheduler: Scheduler, env: Arc, yatp_read_pool: Option, @@ -124,6 +122,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En let snap_worker = Worker::new("snap-handler"); let lazy_worker = snap_worker.lazy_build("snap-handler"); + let raft_ext = storage.get_engine().raft_extension().clone(); let proxy = Proxy::new(security_mgr.clone(), &env, Arc::new(cfg.value().clone())); let kv_service = KvService::new( @@ -132,7 +131,6 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En gc_worker, copr, copr_v2, - raft_router.clone(), lazy_worker.scheduler(), check_leader_scheduler, Arc::clone(&grpc_thread_load), @@ -170,7 +168,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En Arc::clone(cfg), security_mgr.clone(), resolver, - raft_router.clone(), + raft_ext.clone(), lazy_worker.scheduler(), grpc_thread_load.clone(), ); @@ -185,7 +183,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En grpc_mem_quota: mem_quota, local_addr: addr, trans, - raft_router, + raft_router: raft_ext, snap_mgr, snap_worker: lazy_worker, stats_pool, @@ -207,7 +205,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En self.snap_worker.scheduler() } - pub fn transport(&self) -> ServerTransport { + pub fn transport(&self) -> ServerTransport { self.trans.clone() } @@ -341,7 +339,7 @@ pub mod test_router { use engine_rocks::{RocksEngine, RocksSnapshot}; use kvproto::raft_serverpb::RaftMessage; - use 
raftstore::{store::*, Result as RaftStoreResult}; + use raftstore::{router::RaftStoreRouter, store::*, Result as RaftStoreResult}; use super::*; @@ -428,6 +426,7 @@ mod tests { use kvproto::raft_serverpb::RaftMessage; use raftstore::{ coprocessor::region_info_accessor::MockRegionInfoProvider, + router::RaftStoreRouter, store::{transport::Transport, *}, }; use resource_metering::ResourceTagFactory; @@ -445,8 +444,8 @@ mod tests { use crate::{ config::CoprReadPoolConfig, coprocessor::{self, readpool_impl}, - server::TestRaftStoreRouter, - storage::{lock_manager::MockLockManager, TestStorageBuilderApiV1}, + server::{raftkv::RaftRouterWrap, TestRaftStoreRouter}, + storage::{lock_manager::MockLockManager, TestEngineBuilder, TestStorageBuilderApiV1}, }; #[derive(Clone)] @@ -497,13 +496,19 @@ mod tests { ..Default::default() }; - let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) - .build() - .unwrap(); - let (tx, rx) = mpsc::channel(); let (significant_msg_sender, significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); + let engine = TestEngineBuilder::new() + .build() + .unwrap() + .with_raft_extension(RaftRouterWrap::new(router.clone())); + + let storage = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .build() + .unwrap(); + let env = Arc::new( EnvBuilder::new() .cq_count(1) @@ -514,7 +519,6 @@ mod tests { let (tx, _rx) = mpsc::channel(); let mut gc_worker = GcWorker::new( storage.get_engine(), - router.clone(), tx, Default::default(), Default::default(), @@ -556,7 +560,6 @@ mod tests { storage, copr, copr_v2, - router.clone(), MockResolver { quick_fail: Arc::clone(&quick_fail), addr: Arc::clone(&addr), diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index 30cc8342959..ae0d53bacda 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,9 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_rocks::RocksEngine; -use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; +use engine_traits::{Engines, MiscExt, RaftEngine}; use futures::{ - channel::oneshot, future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, stream::{self, TryStreamExt}, @@ -12,17 +11,8 @@ use grpcio::{ Error as GrpcError, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, UnarySink, WriteFlags, }; -use kvproto::{ - debugpb::{self, *}, - raft_cmdpb::{ - AdminCmdType, AdminRequest, RaftCmdRequest, RaftRequestHeader, RegionDetailResponse, - StatusCmdType, StatusRequest, - }, -}; -use raftstore::{ - router::RaftStoreRouter, - store::msg::{Callback, RaftCmdExtraOpts}, -}; +use kvproto::debugpb::{self, *}; +use tikv_kv::RaftExtension; use tikv_util::metrics; use tokio::runtime::Handle; @@ -53,28 +43,26 @@ fn error_to_grpc_error(tag: &'static str, e: Error) -> GrpcError { /// Service handles the RPC messages for the `Debug` service. #[derive(Clone)] -pub struct Service> { +pub struct Service { pool: Handle, debugger: Debugger, raft_router: T, - _phantom: std::marker::PhantomData, } -impl> Service { - /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a +impl Service { + /// Constructs a new `Service` with `Engines`, a `RaftExtension` and a /// `GcWorker`. 
pub fn new( engines: Engines, pool: Handle, raft_router: T, cfg_controller: ConfigController, - ) -> Service { + ) -> Self { let debugger = Debugger::new(engines, cfg_controller); Service { pool, debugger, raft_router, - _phantom: Default::default(), } } @@ -99,9 +87,7 @@ impl> Service { } } -impl + 'static> debugpb::Debug - for Service -{ +impl debugpb::Debug for Service { fn get(&mut self, ctx: RpcContext<'_>, mut req: GetRequest, sink: UnarySink) { const TAG: &str = "debug_get"; @@ -386,18 +372,14 @@ impl + 'static> debugpb::De sink: UnarySink, ) { let region_id = req.get_region_id(); - let debugger = self.debugger.clone(); - let router1 = self.raft_router.clone(); - let router2 = self.raft_router.clone(); - - let consistency_check_task = async move { - let store_id = debugger.get_store_ident()?.store_id; - let detail = region_detail(router2, region_id, store_id).await?; - consistency_check(router1, detail).await + let f = self.raft_router.check_consistency(region_id); + let task = async move { + box_try!(f.await); + Ok(()) }; let f = self .pool - .spawn(consistency_check_task) + .spawn(task) .map(|res| res.unwrap()) .map_ok(|_| RegionConsistencyCheckResponse::default()); self.handle_response(ctx, sink, f, "check_region_consistency"); @@ -537,79 +519,6 @@ impl + 'static> debugpb::De } } -fn region_detail>( - raft_router: T, - region_id: u64, - store_id: u64, -) -> impl Future> { - let mut header = RaftRequestHeader::default(); - header.set_region_id(region_id); - header.mut_peer().set_store_id(store_id); - let mut status_request = StatusRequest::default(); - status_request.set_cmd_type(StatusCmdType::RegionDetail); - let mut raft_cmd = RaftCmdRequest::default(); - raft_cmd.set_header(header); - raft_cmd.set_status_request(status_request); - - let (tx, rx) = oneshot::channel(); - let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); - - async move { - raft_router - .send_command(raft_cmd, cb, RaftCmdExtraOpts::default()) - .map_err(|e| 
Error::Other(Box::new(e)))?; - - let mut r = rx.map_err(|e| Error::Other(Box::new(e))).await?; - - if r.response.get_header().has_error() { - let e = r.response.get_header().get_error(); - warn!("region_detail got error"; "err" => ?e); - return Err(Error::Other(e.message.clone().into())); - } - - let detail = r.response.take_status_response().take_region_detail(); - debug!("region_detail got region detail"; "detail" => ?detail); - let leader_store_id = detail.get_leader().get_store_id(); - if leader_store_id != store_id { - let msg = format!("Leader is on store {}", leader_store_id); - return Err(Error::Other(msg.into())); - } - Ok(detail) - } -} - -fn consistency_check>( - raft_router: T, - mut detail: RegionDetailResponse, -) -> impl Future> { - let mut header = RaftRequestHeader::default(); - header.set_region_id(detail.get_region().get_id()); - header.set_peer(detail.take_leader()); - let mut admin_request = AdminRequest::default(); - admin_request.set_cmd_type(AdminCmdType::ComputeHash); - let mut raft_cmd = RaftCmdRequest::default(); - raft_cmd.set_header(header); - raft_cmd.set_admin_request(admin_request); - - let (tx, rx) = oneshot::channel(); - let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); - - async move { - raft_router - .send_command(raft_cmd, cb, RaftCmdExtraOpts::default()) - .map_err(|e| Error::Other(Box::new(e)))?; - - let r = rx.map_err(|e| Error::Other(Box::new(e))).await?; - - if r.response.get_header().has_error() { - let e = r.response.get_header().get_error(); - warn!("consistency-check got error"; "err" => ?e); - return Err(Error::Other(e.message.clone().into())); - } - Ok(()) - } -} - mod region_size_response { pub type Entry = kvproto::debugpb::RegionSizeResponseEntry; } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 7c40ab659eb..db50dfe459e 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -15,26 +15,19 @@ use grpcio::{ ClientStreamingSink, DuplexSink, Error as GrpcError, 
RequestStream, Result as GrpcResult, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, UnarySink, WriteFlags, }; -use kvproto::{ - coprocessor::*, - errorpb::{Error as RegionError, *}, - kvrpcpb::*, - mpp::*, - raft_serverpb::*, - tikvpb::*, -}; +use kvproto::{coprocessor::*, kvrpcpb::*, mpp::*, raft_serverpb::*, tikvpb::*}; use protobuf::RepeatedField; use raft::eraftpb::MessageType; use raftstore::{ - router::RaftStoreRouter, store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, metrics::RAFT_ENTRIES_CACHES_GAUGE, - Callback, CasualMessage, CheckLeaderTask, + CheckLeaderTask, }, - DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, + Error as RaftStoreError, Result as RaftStoreResult, }; use tikv_alloc::trace::MemoryTraceGuard; +use tikv_kv::RaftExtension; use tikv_util::{ future::{paired_future_callback, poll_future_notify}, mpsc::future::{unbounded, BatchReceiver, Sender, WakePolicy}, @@ -69,18 +62,16 @@ const GRPC_MSG_MAX_BATCH_SIZE: usize = 128; const GRPC_MSG_NOTIFY_SIZE: usize = 8; /// Service handles the RPC messages for the `Tikv` service. -pub struct Service + 'static, E: Engine, L: LockManager, F: KvFormat> { +pub struct Service { store_id: u64, /// Used to handle requests related to GC. - gc_worker: GcWorker, + gc_worker: GcWorker, // For handling KV requests. storage: Storage, // For handling coprocessor requests. copr: Endpoint, // For handling corprocessor v2 requests. copr_v2: coprocessor_v2::Endpoint, - // For handling raft messages. - ch: T, // For handling snapshot. snap_scheduler: Scheduler, // For handling `CheckLeader` request. 
@@ -96,13 +87,7 @@ pub struct Service + 'static, E: Engine, L: LockMan reject_messages_on_memory_ratio: f64, } -impl< - T: RaftStoreRouter + Clone + 'static, - E: Engine + Clone, - L: LockManager + Clone, - F: KvFormat, -> Clone for Service -{ +impl Clone for Service { fn clone(&self) -> Self { Service { store_id: self.store_id, @@ -110,7 +95,6 @@ impl< storage: self.storage.clone(), copr: self.copr.clone(), copr_v2: self.copr_v2.clone(), - ch: self.ch.clone(), snap_scheduler: self.snap_scheduler.clone(), check_leader_scheduler: self.check_leader_scheduler.clone(), enable_req_batch: self.enable_req_batch, @@ -121,17 +105,14 @@ impl< } } -impl + 'static, E: Engine, L: LockManager, F: KvFormat> - Service -{ +impl Service { /// Constructs a new `Service` which provides the `Tikv` service. pub fn new( store_id: u64, storage: Storage, - gc_worker: GcWorker, + gc_worker: GcWorker, copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, - ch: T, snap_scheduler: Scheduler, check_leader_scheduler: Scheduler, grpc_thread_load: Arc, @@ -145,7 +126,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor storage, copr, copr_v2, - ch, snap_scheduler, check_leader_scheduler, enable_req_batch, @@ -157,7 +137,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor fn handle_raft_message( store_id: u64, - ch: &T, + ch: &E::RaftExtension, msg: RaftMessage, reject: bool, ) -> RaftStoreResult<()> { @@ -172,13 +152,11 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor RAFT_APPEND_REJECTS.inc(); let id = msg.get_region_id(); let peer_id = msg.get_message().get_from(); - let m = CasualMessage::RejectRaftAppend { peer_id }; - let _ = ch.send_casual_msg(id, m); + ch.report_reject_message(id, peer_id); return Ok(()); } - // `send_raft_msg` may return `RaftStoreError::RegionNotFound` or - // `RaftStoreError::Transport(DiscardReason::Full)` - ch.send_raft_msg(msg) + ch.feed(msg, false); + Ok(()) } } @@ -228,9 +206,7 @@ macro_rules! 
set_total_time { }; } -impl + 'static, E: Engine, L: LockManager, F: KvFormat> Tikv - for Service -{ +impl Tikv for Service { handle_request!(kv_get, future_get, GetRequest, GetResponse, has_time_detail); handle_request!(kv_scan, future_scan, ScanRequest, ScanResponse); handle_request!( @@ -614,7 +590,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor sink: ClientStreamingSink, ) { let store_id = self.store_id; - let ch = self.ch.clone(); + let ch = self.storage.get_engine().raft_extension().clone(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -657,7 +633,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ) { info!("batch_raft RPC is called, new gRPC stream established"); let store_id = self.store_id; - let ch = self.ch.clone(); + let ch = self.storage.get_engine().raft_extension().clone(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -726,7 +702,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let region_id = req.get_context().get_region_id(); - let (cb, f) = paired_future_callback(); let mut split_keys = if req.is_raw_kv { if !req.get_split_key().is_empty() { vec![F::encode_raw_key_owned(req.take_split_key(), None).into_encoded()] @@ -747,52 +722,45 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor } }; split_keys.sort(); - let req = CasualMessage::SplitRegion { - region_epoch: req.take_context().take_region_epoch(), + let engine = self.storage.get_engine(); + let f = engine.raft_extension().split( + region_id, + req.take_context().take_region_epoch(), split_keys, - callback: Callback::write(cb), - source: ctx.peer().into(), - }; - - if let Err(e) = self.ch.send_casual_msg(region_id, req) { - // Retrun region error instead a gRPC error. 
- let mut resp = SplitRegionResponse::default(); - resp.set_region_error(raftstore_error_to_region_error(e, region_id)); - ctx.spawn( - async move { - sink.success(resp).await?; - ServerResult::Ok(()) - } - .map_err(|_| ()) - .map(|_| ()), - ); - return; - } + ctx.peer(), + ); let task = async move { - let mut res = f.await?; + let res = f.await; let mut resp = SplitRegionResponse::default(); - if res.response.get_header().has_error() { - resp.set_region_error(res.response.mut_header().take_error()); - } else { - let admin_resp = res.response.mut_admin_response(); - let regions: Vec<_> = admin_resp.mut_splits().take_regions().into(); - if regions.len() < 2 { - error!( - "invalid split response"; - "region_id" => region_id, - "resp" => ?admin_resp - ); - resp.mut_region_error().set_message(format!( - "Internal Error: invalid response: {:?}", - admin_resp - )); - } else { - if regions.len() == 2 { - resp.set_left(regions[0].clone()); - resp.set_right(regions[1].clone()); + match res { + Ok(regions) => { + if regions.len() < 2 { + error!( + "invalid split response"; + "region_id" => region_id, + "resp" => ?regions + ); + resp.mut_region_error().set_message(format!( + "Internal Error: invalid response: {:?}", + regions + )); + } else { + if regions.len() == 2 { + resp.set_left(regions[0].clone()); + resp.set_right(regions[1].clone()); + } + resp.set_regions(regions.into()); + } + } + Err(e) => { + let err: crate::storage::Result<()> = Err(e.into()); + if let Some(err) = extract_region_error(&err) { + resp.set_region_error(err) + } else { + resp.mut_region_error() + .set_message(format!("failed to split: {:?}", err)); } - resp.set_regions(regions.into()); } } sink.success(resp).await?; @@ -2159,20 +2127,6 @@ fn collect_batch_resp(v: &mut MeasuredBatchResponse, mut e: MeasuredSingleRespon v.measures.push(e.measure); } -fn raftstore_error_to_region_error(e: RaftStoreError, region_id: u64) -> RegionError { - if let RaftStoreError::Transport(DiscardReason::Disconnected) = e 
{ - // `From::from(RaftStoreError) -> RegionError` treats `Disconnected` as `Other`. - let mut region_error = RegionError::default(); - let region_not_found = RegionNotFound { - region_id, - ..Default::default() - }; - region_error.set_region_not_found(region_not_found); - return region_error; - } - e.into() -} - fn needs_reject_raft_append(reject_messages_on_memory_ratio: f64) -> bool { fail_point!("needs_reject_raft_append", |_| true); if reject_messages_on_memory_ratio < f64::EPSILON { diff --git a/src/server/snap.rs b/src/server/snap.rs index 0200c779383..8fe737c2e60 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -3,7 +3,6 @@ use std::{ fmt::{self, Display, Formatter}, io::{Read, Write}, - marker::PhantomData, pin::Pin, sync::{ atomic::{AtomicUsize, Ordering}, @@ -12,7 +11,6 @@ use std::{ time::Duration, }; -use engine_traits::KvEngine; use file_system::{IoType, WithIoType}; use futures::{ future::{Future, TryFutureExt}, @@ -29,11 +27,9 @@ use kvproto::{ tikvpb::TikvClient, }; use protobuf::Message; -use raftstore::{ - router::RaftStoreRouter, - store::{SnapEntry, SnapKey, SnapManager, Snapshot}, -}; +use raftstore::store::{SnapEntry, SnapKey, SnapManager, Snapshot}; use security::SecurityManager; +use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, time::Instant, @@ -260,7 +256,7 @@ impl RecvSnapContext { }) } - fn finish>(self, raft_router: R) -> Result<()> { + fn finish(self, raft_router: R) -> Result<()> { let _with_io_type = WithIoType::new(self.io_type); let key = self.key; if let Some(mut file) = self.file { @@ -271,15 +267,13 @@ impl RecvSnapContext { return Err(e); } } - if let Err(e) = raft_router.send_raft_msg(self.raft_msg) { - return Err(box_err!("{} failed to send snapshot to raft: {}", key, e)); - } + raft_router.feed(self.raft_msg, true); info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); Ok(()) } } -fn recv_snap + 'static>( +fn recv_snap( stream: 
RequestStream, sink: ClientStreamingSink, snap_mgr: SnapManager, @@ -331,11 +325,7 @@ fn recv_snap + 'static>( } } -pub struct Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +pub struct Runner { env: Arc, snap_mgr: SnapManager, pool: Runtime, @@ -345,21 +335,16 @@ where cfg: Config, sending_count: Arc, recving_count: Arc, - engine: PhantomData, } -impl Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl Runner { pub fn new( env: Arc, snap_mgr: SnapManager, r: R, security_mgr: Arc, cfg: Arc>, - ) -> Runner { + ) -> Self { let cfg_tracker = cfg.clone().tracker("snap-sender".to_owned()); let snap_worker = Runner { env, @@ -377,7 +362,6 @@ where cfg: cfg.value().clone(), sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), - engine: PhantomData, }; snap_worker } @@ -404,11 +388,7 @@ where } } -impl Runnable for Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 837ec294fce..5dd83deb092 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -4,7 +4,6 @@ use std::{ convert::{TryFrom, TryInto}, fs::{self, File}, io::{Read, Write}, - marker::PhantomData, sync::{ atomic::{AtomicUsize, Ordering}, Arc, @@ -12,7 +11,6 @@ use std::{ time::Duration, }; -use engine_traits::KvEngine; use file_system::{IoType, WithIoType}; use futures::{ future::{Future, TryFutureExt}, @@ -28,11 +26,9 @@ use kvproto::{ tikvpb::TikvClient, }; use protobuf::Message; -use raftstore::{ - router::RaftStoreRouter, - store::snap::{TabletSnapKey, TabletSnapManager}, -}; +use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; use security::SecurityManager; +use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, time::Instant, @@ -82,11 +78,9 @@ impl RecvTabletSnapContext { }) } - fn finish>(self, raft_router: R) -> 
Result<()> { + fn finish(self, raft_router: R) -> Result<()> { let key = self.key; - if let Err(e) = raft_router.send_raft_msg(self.raft_msg) { - return Err(box_err!("{} failed to send snapshot to raft: {}", key, e)); - } + raft_router.feed(self.raft_msg, true); info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); Ok(()) } @@ -279,7 +273,7 @@ async fn recv_snap_files( Ok(context) } -fn recv_snap + 'static>( +fn recv_snap( stream: RequestStream, sink: ClientStreamingSink, snap_mgr: TabletSnapManager, @@ -302,11 +296,7 @@ fn recv_snap + 'static>( } } -pub struct TabletRunner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +pub struct TabletRunner { env: Arc, snap_mgr: TabletSnapManager, security_mgr: Arc, @@ -316,22 +306,17 @@ where cfg: Config, sending_count: Arc, recving_count: Arc, - engine: PhantomData, limiter: Limiter, } -impl TabletRunner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl TabletRunner { pub fn new( env: Arc, snap_mgr: TabletSnapManager, r: R, security_mgr: Arc, cfg: Arc>, - ) -> TabletRunner { + ) -> Self { let config = cfg.value().clone(); let cfg_tracker = cfg.tracker("tablet-sender".to_owned()); let limit = i64::try_from(config.snap_max_write_bytes_per_sec.0) @@ -358,7 +343,6 @@ where cfg: config, sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), - engine: PhantomData, limiter, }; snap_worker @@ -385,11 +369,7 @@ pub struct SendStat { elapsed: Duration, } -impl Runnable for TabletRunner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl Runnable for TabletRunner { type Task = Task; fn run(&mut self, task: Task) { diff --git a/src/server/transport.rs b/src/server/transport.rs index e52bead3934..1303eff81f5 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,56 +1,45 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::marker::PhantomData; - -use engine_traits::KvEngine; use kvproto::raft_serverpb::RaftMessage; -use raftstore::{router::RaftStoreRouter, store::Transport, Result as RaftStoreResult}; +use raftstore::{store::Transport, Result as RaftStoreResult}; +use tikv_kv::RaftExtension; use crate::server::{raft_client::RaftClient, resolve::StoreAddrResolver}; -pub struct ServerTransport +pub struct ServerTransport where - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, - E: KvEngine, { - raft_client: RaftClient, - engine: PhantomData, + raft_client: RaftClient, } -impl Clone for ServerTransport +impl Clone for ServerTransport where - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, - E: KvEngine, { fn clone(&self) -> Self { ServerTransport { raft_client: self.raft_client.clone(), - engine: PhantomData, } } } -impl ServerTransport +impl ServerTransport where - E: KvEngine, - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, { - pub fn new(raft_client: RaftClient) -> ServerTransport { - ServerTransport { - raft_client, - engine: PhantomData, - } + pub fn new(raft_client: RaftClient) -> Self { + ServerTransport { raft_client } } } -impl Transport for ServerTransport +impl Transport for ServerTransport where - T: RaftStoreRouter + Unpin + 'static, + T: RaftExtension + Unpin + 'static, S: StoreAddrResolver + Unpin + 'static, - E: KvEngine, { fn send(&mut self, msg: RaftMessage) -> RaftStoreResult<()> { match self.raft_client.send(msg) { diff --git a/tests/failpoints/cases/test_gc_metrics.rs b/tests/failpoints/cases/test_gc_metrics.rs index e698031f0bc..348b81aaea7 100644 --- a/tests/failpoints/cases/test_gc_metrics.rs +++ b/tests/failpoints/cases/test_gc_metrics.rs @@ -19,7 +19,6 @@ use raftstore::{ coprocessor::{ region_info_accessor::MockRegionInfoProvider, CoprocessorHost, RegionChangeEvent, }, - router::RaftStoreBlackHole, RegionInfoAccessor, 
}; use tikv::{ @@ -142,7 +141,6 @@ fn test_txn_gc_keys_handled() { feature_gate.set_version("5.0.0").unwrap(); let mut gc_worker = GcWorker::new( prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, @@ -286,7 +284,6 @@ fn test_raw_gc_keys_handled() { let feature_gate = FeatureGate::default(); let mut gc_worker = GcWorker::new( prefixed_engine, - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index e8b437f941a..623833c3b27 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -5,9 +5,7 @@ use std::{ time::Duration, }; -use raftstore::{ - coprocessor::region_info_accessor::MockRegionInfoProvider, router::RaftStoreBlackHole, -}; +use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; use tikv::{ config::{ConfigController, Module, TikvConfig}, server::gc_worker::{GcConfig, GcTask, GcWorker}, @@ -27,15 +25,11 @@ fn test_gc_config_validate() { fn setup_cfg_controller( cfg: TikvConfig, -) -> ( - GcWorker, - ConfigController, -) { +) -> (GcWorker, ConfigController) { let engine = TestEngineBuilder::new().build().unwrap(); let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine, - RaftStoreBlackHole, tx, cfg.gc.clone(), Default::default(), diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 5b9ef72b4c3..1a82ec8005e 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -15,6 +15,7 @@ use tikv::{ config::{ConfigController, TikvConfig}, server::{ config::{Config as ServerConfig, ServerConfigManager}, + raftkv::RaftRouterWrap, snap::{Runner as SnapHandler, Task as SnapTask}, }, }; @@ -60,7 +61,7 @@ fn start_server( let snap_runner = SnapHandler::new( Arc::clone(&env), snap_mgr.clone(), - raft_router, + 
RaftRouterWrap::new(raft_router), security_mgr, Arc::clone(&server_config), ); diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index edf4d0f1c65..fa7a86f12c4 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -9,7 +9,6 @@ use std::{ time::Duration, }; -use engine_rocks::RocksEngine; use futures::{FutureExt, StreamExt, TryStreamExt}; use grpcio::{ ClientStreamingSink, Environment, RequestStream, RpcContext, RpcStatus, RpcStatusCode, Server, @@ -20,15 +19,12 @@ use kvproto::{ tikvpb::BatchRaftMessage, }; use raft::eraftpb::Entry; -use raftstore::{ - errors::DiscardReason, - router::{RaftStoreBlackHole, RaftStoreRouter}, - store::StoreMsg, -}; +use raftstore::{errors::DiscardReason, store::StoreMsg}; use tikv::server::{ - self, load_statistics::ThreadLoadPool, resolve, resolve::Callback, Config, ConnectionBuilder, - RaftClient, StoreAddrResolver, TestRaftStoreRouter, + self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, + Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; +use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ config::{ReadableDuration, VersionTrack}, worker::{Builder as WorkerBuilder, LazyWorker}, @@ -55,9 +51,9 @@ impl StoreAddrResolver for StaticResolver { } } -fn get_raft_client(router: R, resolver: T) -> RaftClient +fn get_raft_client(router: R, resolver: T) -> RaftClient where - R: RaftStoreRouter + Unpin + 'static, + R: RaftExtension + Unpin + 'static, T: StoreAddrResolver + 'static, { let env = Arc::new(Environment::new(2)); @@ -80,10 +76,8 @@ where RaftClient::new(builder) } -fn get_raft_client_by_port( - port: u16, -) -> RaftClient { - get_raft_client(RaftStoreBlackHole, StaticResolver::new(port)) +fn get_raft_client_by_port(port: u16) -> RaftClient { + get_raft_client(FakeExtension, StaticResolver::new(port)) } #[derive(Clone)] @@ -183,7 +177,8 @@ fn 
test_raft_client_reconnect() { let (tx, rx) = mpsc::channel(); let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); - let mut raft_client = get_raft_client(router, StaticResolver::new(port)); + let wrap = RaftRouterWrap::new(router); + let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); raft_client.flush(); @@ -223,7 +218,8 @@ fn test_raft_client_report_unreachable() { let (tx, rx) = mpsc::channel(); let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); - let mut raft_client = get_raft_client(router, StaticResolver::new(port)); + let wrap = RaftRouterWrap::new(router); + let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); // server is disconnected mock_server.shutdown(); @@ -386,15 +382,14 @@ fn test_tombstone_block_list() { let bg_worker = WorkerBuilder::new(thd_name!("background")) .thread_count(2) .create(); - let resolver = - resolve::new_resolver::<_, _, RocksEngine>(pd_client, &bg_worker, RaftStoreBlackHole).0; + let resolver = resolve::new_resolver(pd_client, &bg_worker, FakeExtension).0; let msg_count = Arc::new(AtomicUsize::new(0)); let batch_msg_count = Arc::new(AtomicUsize::new(0)); let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); let (_mock_server, port) = create_mock_server(service, 60200, 60300).unwrap(); - let mut raft_client = get_raft_client(RaftStoreBlackHole, resolver); + let mut raft_client = get_raft_client(FakeExtension, resolver); let mut store1 = metapb::Store::default(); store1.set_id(1); @@ -443,9 +438,8 @@ fn test_store_allowlist() { let bg_worker = WorkerBuilder::new(thd_name!("background")) .thread_count(2) .create(); - let resolver = - resolve::new_resolver::<_, _, RocksEngine>(pd_client, &bg_worker, 
RaftStoreBlackHole).0; - let mut raft_client = get_raft_client(RaftStoreBlackHole, resolver); + let resolver = resolve::new_resolver(pd_client, &bg_worker, FakeExtension).0; + let mut raft_client = get_raft_client(FakeExtension, resolver); let msg_count1 = Arc::new(AtomicUsize::new(0)); let batch_msg_count1 = Arc::new(AtomicUsize::new(0)); From 4df0ad4856b4acd65c3a0c5e1dd8f9a71443dfab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 1 Dec 2022 15:18:01 +0800 Subject: [PATCH 377/676] log-backup: use `openssl` to replace `rustls` implement in backup-stream (#13851) ref tikv/tikv#13867 Signed-off-by: hillium Signed-off-by: Yu Juncen Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- Cargo.lock | 466 ++++++++++-------- components/backup-stream/Cargo.toml | 8 +- .../backup-stream/src/checkpoint_manager.rs | 2 +- .../src/metadata/store/lazy_etcd.rs | 28 +- components/cloud/aws/Cargo.toml | 2 +- components/security/Cargo.toml | 4 - components/security/src/lib.rs | 41 +- components/server/Cargo.toml | 2 +- components/server/src/server.rs | 8 +- 9 files changed, 334 insertions(+), 227 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1ccf961796e..a553d16f822 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,7 +130,7 @@ dependencies = [ "lexical-core", "multiversion", "num 0.4.0", - "rand 0.8.3", + "rand 0.8.5", "regex", "serde", "serde_derive", @@ -231,9 +231,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.22" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8df72488e87761e772f14ae0c2480396810e51b2c2ade912f97f0f7e5b95e3c" +checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" dependencies = [ "proc-macro2", "quote", @@ -299,6 +299,51 @@ dependencies = [ "uuid 0.8.2", ] +[[package]] +name = "axum" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"acee9fd5073ab6b045a275b3e709c163dd36c90685219cb21804a147b58dba43" +dependencies = [ + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa 1.0.1", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e5939e02c56fecd5c017c37df4238c0a839fa76b7f97acdd7efb804fd181cc" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "tower-layer", + "tower-service", +] + [[package]] name = "azure" version = "0.0.1" @@ -336,7 +381,7 @@ dependencies = [ "http", "log", "oauth2", - "rand 0.8.3", + "rand 0.8.5", "reqwest", "rustc_version 0.4.0", "serde", @@ -438,7 +483,7 @@ dependencies = [ "prometheus", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "security", "serde", "serde_derive", @@ -481,6 +526,7 @@ dependencies = [ "futures-io", "grpcio", "hex 0.4.2", + "indexmap", "kvproto", "lazy_static", "log_wrappers", @@ -491,9 +537,10 @@ dependencies = [ "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "resolved_ts", + "security", "slog", "slog-global", "tempdir", @@ -508,7 +555,7 @@ dependencies = [ "tikv_util", "tokio", "tokio-stream", - "tokio-util 0.7.2", + "tokio-util", "tonic", "txn_types", "url", @@ -971,7 +1018,7 @@ dependencies = [ "libc 0.2.132", "panic_hook", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "static_assertions", "thiserror", "tikv_alloc", @@ -995,7 +1042,7 @@ dependencies = [ "futures 0.3.15", "kvproto", "parking_lot 0.12.0", - "rand 0.8.3", + "rand 0.8.5", "tikv_alloc", "tikv_util", "tokio", @@ -1455,7 +1502,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -1525,7 +1572,7 @@ dependencies 
= [ "prometheus-static-metric", "protobuf", "raft", - "rand 0.8.3", + "rand 0.8.5", "regex", "rocksdb", "serde", @@ -1674,15 +1721,19 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.7.2" -source = "git+https://github.com/pingcap/etcd-client?rev=e0321a1990ee561cf042973666c0db61c8d82364#e0321a1990ee561cf042973666c0db61c8d82364" +version = "0.10.2" +source = "git+https://github.com/pingcap/etcd-client?rev=14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e#14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e" dependencies = [ "http", + "hyper", + "hyper-openssl", + "openssl", "prost", "tokio", "tokio-stream", "tonic", "tonic-build", + "tower", "tower-service", "visible", ] @@ -1724,7 +1775,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "rusoto_core", "rust-ini", "slog", @@ -1734,7 +1785,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] @@ -1775,7 +1826,7 @@ dependencies = [ "tempfile", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] @@ -1787,7 +1838,7 @@ checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" dependencies = [ "lazy_static", "log", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -1823,7 +1874,7 @@ dependencies = [ "parking_lot 0.12.0", "prometheus", "prometheus-static-metric", - "rand 0.8.3", + "rand 0.8.5", "serde", "slog", "slog-global", @@ -1872,9 +1923,9 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.2.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" @@ -2304,9 +2355,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.3" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -2317,7 +2368,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util 0.6.6", + "tokio-util", "tracing", ] @@ -2401,31 +2452,37 @@ dependencies = [ [[package]] name = "http" -version = "0.2.4" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 0.4.4", + "itoa 1.0.1", ] [[package]] name = "http-body" -version = "0.4.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + [[package]] name = "httparse" -version = "1.4.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" @@ -2441,9 +2498,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.11" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b61cf2d1aebcf6e6352c97b81dc2244ca29194be1b276f5d8ad5c6330fffb11" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", 
"futures-channel", @@ -2454,7 +2511,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 0.4.4", + "itoa 1.0.1", "pin-project-lite", "socket2", "tokio", @@ -2943,6 +3000,12 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" +[[package]] +name = "matchit" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" + [[package]] name = "md-5" version = "0.9.1" @@ -3016,9 +3079,9 @@ dependencies = [ [[package]] name = "mime" -version = "0.3.14" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd1d63acd1b78403cc0c325605908475dd9b9a3acbf65ed8bcab97e27014afcf" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minimal-lexical" @@ -3058,7 +3121,7 @@ dependencies = [ "kernel32-sys", "libc 0.2.132", "log", - "miow 0.2.2", + "miow", "net2", "slab", "winapi 0.2.8", @@ -3066,15 +3129,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.0" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc 0.2.132", "log", - "miow 0.3.7", - "ntapi", - "winapi 0.3.9", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.42.0", ] [[package]] @@ -3101,15 +3163,6 @@ dependencies = [ "ws2_32-sys", ] -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi 0.3.9", -] - [[package]] name = "mmap" version = "0.1.1" @@ -3445,7 +3498,7 @@ dependencies = [ "chrono", "getrandom 0.2.3", "http", - "rand 
0.8.3", + "rand 0.8.5", "reqwest", "serde", "serde_json", @@ -3466,9 +3519,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.10.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "online_config" @@ -3633,7 +3686,7 @@ dependencies = [ "libc 0.2.132", "redox_syscall 0.2.11", "smallvec", - "windows-sys", + "windows-sys 0.32.0", ] [[package]] @@ -3724,9 +3777,9 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", "indexmap", @@ -3758,7 +3811,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d43f3220d96e0080cc9ea234978ccd80d904eafb17be31bb0f76daaea6493082" dependencies = [ "phf_shared", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -3792,9 +3845,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.6" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -3895,6 +3948,16 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" +[[package]] +name = "prettyplease" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" +dependencies = [ + "proc-macro2", + "syn", +] + 
[[package]] name = "proc-macro-error" version = "1.0.4" @@ -3933,11 +3996,11 @@ checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] @@ -4006,9 +4069,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" dependencies = [ "bytes", "prost-derive", @@ -4016,27 +4079,31 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" dependencies = [ "bytes", - "heck 0.3.1", + "heck 0.4.0", "itertools", + "lazy_static", "log", "multimap", "petgraph", + "prettyplease", "prost", "prost-types", + "regex", + "syn", "tempfile", "which", ] [[package]] name = "prost-derive" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" dependencies = [ "anyhow", "itertools", @@ -4047,9 +4114,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", "prost", @@ -4125,7 +4192,7 @@ dependencies = [ "getset", "protobuf", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", ] @@ -4254,7 +4321,7 @@ dependencies = [ "protobuf", "raft", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", "resource_metering", "serde", "serde_derive", @@ -4339,19 +4406,18 @@ dependencies = [ "libc 0.2.132", "rand_chacha 0.2.1", "rand_core 0.5.1", - "rand_hc 0.2.0", + "rand_hc", ] [[package]] name = "rand" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc 0.2.132", "rand_chacha 0.3.0", "rand_core 0.6.2", - "rand_hc 0.3.0", ] [[package]] @@ -4416,15 +4482,6 @@ dependencies = [ "rand_core 0.5.1", ] -[[package]] -name = "rand_hc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" -dependencies = [ - "rand_core 0.6.2", -] - [[package]] name = "rand_isaac" version = "0.3.0" @@ -4636,7 +4693,7 @@ dependencies = [ "pin-project", "procinfo", "prometheus", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -4881,19 +4938,6 @@ dependencies = [ "semver 1.0.4", ] -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64", - "log", - "ring", - "sct", - "webpki", -] - [[package]] name = "rustversion" version = "1.0.4" @@ -4937,16 +4981,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sct" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "seahash" version = "4.1.0" @@ -4965,7 +4999,6 @@ dependencies = [ "serde_json", "tempfile", "tikv_util", - "tonic", ] [[package]] @@ -5193,7 +5226,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", "resource_metering", "security", @@ -5410,9 +5443,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.4.4" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc 0.2.132", "winapi 0.3.9", @@ -5600,13 +5633,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.86" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] @@ -5708,7 +5741,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", "libc 0.2.132", - "rand 0.8.3", + "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", "winapi 0.3.9", @@ -5752,7 +5785,7 @@ dependencies = [ "grpcio", "kvproto", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "tempfile", "test_raftstore", "tidb_query_common", @@ -5847,7 +5880,7 @@ dependencies = [ "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", "resource_metering", "security", @@ -5903,7 +5936,7 @@ dependencies = [ 
"fail", "grpcio", "kvproto", - "rand 0.8.3", + "rand 0.8.5", "rand_isaac", "security", "slog", @@ -5960,7 +5993,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "rand_xorshift", "resource_metering", "security", @@ -6174,7 +6207,7 @@ dependencies = [ "panic_hook", "profiler", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "safemem", "serde", @@ -6351,7 +6384,7 @@ dependencies = [ "raft-engine-ctl", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "security", "serde_json", @@ -6506,7 +6539,7 @@ dependencies = [ "prometheus", "prometheus-static-metric", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "rusoto_core", "serde", @@ -6574,16 +6607,16 @@ dependencies = [ [[package]] name = "tokio" -version = "1.17.0" +version = "1.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ + "autocfg", "bytes", "libc 0.2.132", "memchr", - "mio 0.8.0", + "mio 0.8.5", "num_cpus", - "once_cell", "parking_lot 0.12.0", "pin-project-lite", "signal-hook-registry", @@ -6645,22 +6678,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls", - "tokio", - "webpki", -] - [[package]] name = "tokio-stream" -version = "0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -6678,20 +6700,6 @@ dependencies = [ "tokio-executor", ] -[[package]] -name = "tokio-util" -version = 
"0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.2" @@ -6704,6 +6712,7 @@ dependencies = [ "futures-sink", "pin-project-lite", "tokio", + "tracing", ] [[package]] @@ -6717,12 +6726,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "796c5e1cd49905e65dd8e700d4cb1dffcbfdb4fc9d017de08c1a537afd83627c" +checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" dependencies = [ "async-stream 0.3.3", "async-trait", + "axum", "base64", "bytes", "futures-core", @@ -6737,9 +6747,8 @@ dependencies = [ "prost", "prost-derive", "tokio", - "tokio-rustls", "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -6749,10 +6758,11 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b52d07035516c2b74337d2ac7746075e7dcae7643816c1b12c5ff8a7484c08" +checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" dependencies = [ + "prettyplease", "proc-macro2", "prost-build", "quote", @@ -6761,24 +6771,43 @@ dependencies = [ [[package]] name = "tower" -version = "0.4.8" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60422bc7fefa2f3ec70359b8ff1caff59d785877eb70595904605bcc412470f" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", "indexmap", "pin-project", - "rand 0.8.3", + "pin-project-lite", + "rand 0.8.5", "slab", "tokio", - "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower-layer", 
"tower-service", "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.1" @@ -6787,9 +6816,9 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" [[package]] name = "tower-service" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" @@ -6883,7 +6912,7 @@ dependencies = [ "kvproto", "log_wrappers", "panic_hook", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", "tikv_alloc", @@ -6917,6 +6946,12 @@ dependencies = [ "matches", ] +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + [[package]] name = "unicode-normalization" version = "0.1.12" @@ -6938,12 +6973,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" -[[package]] -name = "unicode-xid" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" - [[package]] name = "untrusted" version = "0.7.1" @@ -7063,6 +7092,12 @@ version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.79" @@ -7141,16 +7176,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab146130f5f790d45f82aeeb09e55a256573373ec64409fc19a6fb82fb1032ae" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "which" version = "4.2.4" @@ -7211,43 +7236,100 @@ version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3df6e476185f92a12c072be4a189a0210dcdcf512a1891d6dff9edb874deadc6" dependencies = [ - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_msvc", + "windows_aarch64_msvc 0.32.0", + "windows_i686_gnu 0.32.0", + "windows_i686_msvc 0.32.0", + "windows_x86_64_gnu 0.32.0", + "windows_x86_64_msvc 0.32.0", ] +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d8e92753b1c443191654ec532f14c199742964a061be25d77d7a96f09db20bf5" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + [[package]] name = "windows_i686_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a711c68811799e017b6038e0922cb27a5e2f43a2ddb609fe0b6f3eeda9de615" +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + [[package]] name = "windows_i686_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "146c11bb1a02615db74680b32a68e2d61f553cc24c4eb5b4ca10311740e44172" +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c912b12f7454c6620635bbff3450962753834be2a594819bd5e945af18ec64bc" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" + [[package]] name = "windows_x86_64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "504a2476202769977a040c6364301a3f65d0cc9e3fb08600b2bda150a0488316" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" + [[package]] name = "winreg" version = "0.7.0" @@ -7306,7 +7388,7 @@ dependencies = [ "num_cpus", "parking_lot_core 0.9.1", "prometheus", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 0f3b97461bb..e5863f44c4d 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -32,7 +32,7 @@ engine_traits = { workspace = true } error_code = { workspace = true } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. -etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e", features = ["pub-response-field", "tls-openssl-vendored"] } external_storage = { workspace = true } external_storage_export = { workspace = true } fail = "0.5" @@ -42,6 +42,9 @@ futures-io = "0.3" grpcio = { workspace = true } hex = "0.4" + +# Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 +indexmap = "=1.6.2" kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.4" log_wrappers = { workspace = true } @@ -54,6 +57,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true } regex = "1" resolved_ts = { workspace = true } +security = { path = "../security" } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1" @@ -65,7 +69,7 @@ 
tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["compat"] } -tonic = "0.5" +tonic = "0.8" txn_types = { workspace = true } uuid = "0.8" yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index e9f930e8563..f34211ef7a5 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -329,7 +329,7 @@ pub trait FlushObserver: Send + 'static { /// Note the new resolved ts cannot be greater than the old resolved ts. async fn rewrite_resolved_ts( &mut self, - #[allow(unused_variables)] task: &str, + #[allow(unused_variables)] _task: &str, ) -> Option { None } diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 8cd6b87ec71..6fc3a5332ea 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -2,9 +2,9 @@ use std::{sync::Arc, time::Duration}; -use etcd_client::{ConnectOptions, Error as EtcdError, TlsOptions}; +use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use tikv_util::stream::RetryError; +use tikv_util::stream::{RetryError, RetryExt}; use tokio::sync::OnceCell; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; @@ -15,8 +15,9 @@ const RPC_TIMEOUT: Duration = Duration::from_secs(30); #[derive(Clone)] pub struct LazyEtcdClient(Arc); +#[derive(Debug)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Option, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } @@ -26,12 +27,16 @@ impl ConnectionConfig { fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); if let 
Some(tls) = &self.tls { - opts = opts.with_tls(tls.clone()) + opts = opts.with_openssl_tls( + OpenSslClientConfig::default() + .ca_cert_pem(&tls.ca) + .client_cert_pem_and_key(&tls.client_cert, &tls.client_key.0), + ) } opts = opts .with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout) - .with_timeout(RPC_TIMEOUT) - .keep_alive_while_idle(false); + .with_keep_alive_while_idle(false) + .with_timeout(RPC_TIMEOUT); opts } @@ -68,7 +73,9 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { EtcdError::InvalidArgs(_) | EtcdError::InvalidUri(_) | EtcdError::Utf8Error(_) - | EtcdError::InvalidHeaderValue(_) => false, + | EtcdError::InvalidHeaderValue(_) + | EtcdError::EndpointError(_) + | EtcdError::OpenSsl(_) => false, EtcdError::TransportError(_) | EtcdError::IoError(_) | EtcdError::WatchError(_) @@ -84,6 +91,7 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { } } +#[derive(Debug)] struct RetryableEtcdError(EtcdError); impl RetryError for RetryableEtcdError { @@ -103,7 +111,11 @@ where F: Future>, { use futures::TryFutureExt; - let r = tikv_util::stream::retry(move || action().err_into::()).await; + let r = tikv_util::stream::retry_ext( + move || action().err_into::(), + RetryExt::default().with_fail_hook(|err| println!("meet error {:?}", err)), + ) + .await; r.map_err(|err| err.0.into()) } diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 964048121d6..e539c67f571 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -38,7 +38,7 @@ tikv_util = { workspace = true } # better to not use slog-global, but pass in the logger tokio = { version = "1.5", features = ["time"] } url = "2.0" -uuid = "0.8" +uuid = { version = "0.8", features = ["v4"] } [dev-dependencies] futures = "0.3" diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 4599b1df43e..a9cdd620d12 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -4,9 +4,6 
@@ version = "0.0.1" edition = "2018" publish = false -[features] -tonic = ["dep:tonic"] - [dependencies] collections = { workspace = true } encryption = { workspace = true } @@ -15,7 +12,6 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tikv_util = { workspace = true } -tonic = { version = "0.5", features = ["tls"], optional = true } [dev-dependencies] tempfile = "3.0" diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index cc87469426c..52f438236fd 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -18,8 +18,6 @@ use grpcio::{ RpcContext, RpcStatus, RpcStatusCode, ServerBuilder, ServerChecker, ServerCredentialsBuilder, ServerCredentialsFetcher, }; -#[cfg(feature = "tonic")] -use tonic::transport::{channel::ClientTlsConfig, Certificate, Identity}; #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Default)] #[serde(default)] @@ -70,6 +68,23 @@ fn load_key(tag: &str, path: &str) -> Result, Box> { type CertResult = Result<(Vec, Vec, Vec), Box>; +type Pem = Box<[u8]>; + +pub struct Secret(pub Pem); + +impl std::fmt::Debug for Secret { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Secret").finish() + } +} + +#[derive(Debug)] +pub struct ClientSuite { + pub ca: Pem, + pub client_cert: Pem, + pub client_key: Secret, +} + impl SecurityConfig { /// Validates ca, cert and private key. pub fn validate(&self) -> Result<(), Box> { @@ -124,21 +139,13 @@ impl SecurityManager { }) } - #[cfg(feature = "tonic")] - /// Make a tonic tls config via the config. 
- pub fn tonic_tls_config(&self) -> Option { - let (ca, cert, key) = self.cfg.load_certs().unwrap_or_default(); - if ca.is_empty() && cert.is_empty() && key.is_empty() { - return None; - } - let mut cfg = ClientTlsConfig::new(); - if !ca.is_empty() { - cfg = cfg.ca_certificate(Certificate::from_pem(ca)); - } - if !cert.is_empty() && !key.is_empty() { - cfg = cfg.identity(Identity::from_pem(cert, key)); - } - Some(cfg) + pub fn client_suite(&self) -> Result> { + let (ca, cert, key) = self.cfg.load_certs()?; + Ok(ClientSuite { + ca: ca.into_boxed_slice(), + client_cert: cert.into_boxed_slice(), + client_key: Secret(key.into_boxed_slice()), + }) } pub fn connect(&self, mut cb: ChannelBuilder, addr: &str) -> Channel { diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 1f4d98b2847..7a40340b64e 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -69,7 +69,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } rand = "0.8" resolved_ts = { workspace = true } resource_metering = { workspace = true } -security = { workspace = true, features = ["tonic"] } +security = { workspace = true } serde_json = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3ce38d0c79e..b52abc960d8 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -990,7 +990,13 @@ where ConnectionConfig { keep_alive_interval: self.config.server.grpc_keepalive_time.0, keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self.security_mgr.tonic_tls_config(), + tls: self + .security_mgr + .client_suite() + .map_err(|err| { + warn!("Failed to load client TLS suite, ignoring TLS config."; "err" => %err); + }) + .ok(), }, ); let 
backup_stream_endpoint = backup_stream::Endpoint::new( From 6bccbf89dd579ddd7df79f10b77441efb4e39bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 1 Dec 2022 16:50:01 +0800 Subject: [PATCH 378/676] import: cache storage when possible (#13783) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#13798 - Make the import process asynchronous. - Added caching if client requires. Signed-off-by: hillium Signed-off-by: Yu Juncen Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Signed-off-by: hillium --- Cargo.lock | 9 +- components/backup-stream/src/router.rs | 18 +- components/cloud/aws/src/s3.rs | 6 +- components/cloud/azure/src/azblob.rs | 6 +- components/cloud/gcp/src/gcs.rs | 8 +- components/cloud/src/blob.rs | 10 +- .../external_storage/export/src/dylib.rs | 2 +- .../external_storage/export/src/export.rs | 41 ++-- .../external_storage/src/dylib_client.rs | 2 +- .../external_storage/src/grpc_client.rs | 2 +- components/external_storage/src/hdfs.rs | 11 +- components/external_storage/src/lib.rs | 91 ++++++-- components/external_storage/src/local.rs | 6 +- components/external_storage/src/noop.rs | 9 +- components/sst_importer/Cargo.toml | 1 + .../sst_importer/src/caching/cache_map.rs | 211 ++++++++++++++++++ components/sst_importer/src/caching/mod.rs | 4 + .../sst_importer/src/caching/storage_cache.rs | 58 +++++ components/sst_importer/src/import_mode.rs | 27 ++- components/sst_importer/src/lib.rs | 1 + components/sst_importer/src/metrics.rs | 5 + components/sst_importer/src/sst_importer.rs | 189 ++++++++++++---- components/sst_importer/src/util.rs | 8 + src/import/sst_service.rs | 75 ++++--- 24 files changed, 604 insertions(+), 196 deletions(-) create mode 100644 components/sst_importer/src/caching/cache_map.rs create mode 100644 components/sst_importer/src/caching/mod.rs create mode 100644 
components/sst_importer/src/caching/storage_cache.rs diff --git a/Cargo.lock b/Cargo.lock index a553d16f822..2b237c8c25f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1373,9 +1373,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.2.0" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" +checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", @@ -5368,9 +5368,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "smartstring" @@ -5479,6 +5479,7 @@ dependencies = [ "log_wrappers", "openssl", "prometheus", + "rand 0.8.3", "serde", "serde_derive", "slog", diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 56bd00bba87..ead124c103a 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -1506,11 +1506,10 @@ struct TaskRange { #[cfg(test)] mod tests { - use std::{ffi::OsStr, marker::Unpin, time::Duration}; + use std::{ffi::OsStr, time::Duration}; - use external_storage::NoopStorage; + use external_storage::{ExternalData, NoopStorage}; use futures::AsyncReadExt; - use futures_io::AsyncRead; use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; use tikv_util::{ codec::number::NumberEncoder, @@ -1929,16 +1928,11 @@ mod tests { self.inner.write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.inner.read(name) } - fn read_part( - &self, - name: &str, - off: u64, - len: u64, - ) -> Box { + fn read_part(&self, name: &str, off: 
u64, len: u64) -> ExternalData<'_> { self.inner.read_part(name, off, len) } } @@ -2277,11 +2271,11 @@ mod tests { } } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> external_storage::ExternalData<'_> { self.s.read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> external_storage::ExternalData<'_> { self.s.read_part(name, off, len) } } diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 469cac97d6c..a7ea47ec9d2 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -222,7 +222,7 @@ impl S3Storage { key.to_owned() } - fn get_range(&self, name: &str, range: Option) -> Box { + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { let key = self.maybe_prefix_key(name); let bucket = self.config.bucket.bucket.clone(); debug!("read file from s3 storage"; "key" => %key); @@ -595,11 +595,11 @@ impl BlobStorage for S3Storage { }) } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { // inclusive, bytes=0-499 -> [0, 499] self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 5bf02696de7..12b6149fad5 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -558,7 +558,7 @@ impl AzureStorage { &self, name: &str, range: Option>, - ) -> Box { + ) -> cloud::blob::BlobStream<'_> { let name = self.maybe_prefix_key(name); debug!("read file from Azure storage"; "key" => %name); let t = async move { @@ -602,11 +602,11 @@ impl BlobStorage for AzureStorage { uploader.run(&mut reader, content_length).await } - fn get(&self, name: &str) 
-> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { self.get_range(name, Some(off..off + len)) } } diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index 01f69a6d245..61e432c9431 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -347,14 +347,14 @@ impl GcsStorage { Ok(res) } - fn error_to_async_read(kind: io::ErrorKind, e: E) -> Box + fn error_to_async_read(kind: io::ErrorKind, e: E) -> cloud::blob::BlobStream<'static> where E: Into>, { Box::new(error_stream(io::Error::new(kind, e)).into_async_read()) } - fn get_range(&self, name: &str, range: Option) -> Box { + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { let bucket = self.config.bucket.bucket.to_string(); let name = self.maybe_prefix_key(name); debug!("read file from GCS storage"; "key" => %name); @@ -513,11 +513,11 @@ impl BlobStorage for GcsStorage { Ok::<_, io::Error>(()) } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { // inclusive, bytes=0-499 -> [0, 499] self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index d80d3a47a28..84ca77042d7 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -19,6 +19,8 @@ pub trait BlobConfig: 'static + Send + Sync { /// wrappers exists. 
pub struct PutResource(pub Box); +pub type BlobStream<'a> = Box; + impl AsyncRead for PutResource { fn poll_read( self: Pin<&mut Self>, @@ -45,10 +47,10 @@ pub trait BlobStorage: 'static + Send + Sync { async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn get(&self, name: &str) -> Box; + fn get(&self, name: &str) -> BlobStream<'_>; /// Read part of contents of the given path. - fn get_part(&self, name: &str, off: u64, len: u64) -> Box; + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_>; } impl BlobConfig for dyn BlobStorage { @@ -72,11 +74,11 @@ impl BlobStorage for Box { fut.await } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> BlobStream<'_> { (**self).get(name) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_> { (**self).get_part(name, off, len) } } diff --git a/components/external_storage/export/src/dylib.rs b/components/external_storage/export/src/dylib.rs index a02f5f2fade..308973de95e 100644 --- a/components/external_storage/export/src/dylib.rs +++ b/components/external_storage/export/src/dylib.rs @@ -188,7 +188,7 @@ pub mod staticlib { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index 10363bf92b2..ad31dc363ae 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -3,11 +3,7 @@ //! To use External storage with protobufs as an application, import this //! module. external_storage contains the actual library code //! 
Cloud provider backends are under components/cloud -use std::{ - io::{self, Write}, - path::Path, - sync::Arc, -}; +use std::{io, path::Path, sync::Arc}; use async_trait::async_trait; #[cfg(feature = "cloud-aws")] @@ -24,22 +20,19 @@ use external_storage::dylib_client; use external_storage::grpc_client; pub use external_storage::{ compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_info_buff, - read_external_storage_into_file, record_storage_create, BackendConfig, ExternalStorage, - HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, MIN_READ_SPEED, + read_external_storage_into_file, record_storage_create, BackendConfig, ExternalData, + ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, + MIN_READ_SPEED, }; -use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] pub use gcp::{Config as GcsConfig, GcsStorage}; pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; #[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature = "cloud-azure"))] use kvproto::brpb::{AzureBlobStorage, Gcs, S3}; use kvproto::brpb::{CloudDynamic, Noop, StorageBackend}; +use tikv_util::time::{Instant, Limiter}; #[cfg(feature = "cloud-storage-dylib")] use tikv_util::warn; -use tikv_util::{ - stream::block_on_external_io, - time::{Instant, Limiter}, -}; #[cfg(feature = "cloud-storage-dylib")] use crate::dylib; @@ -307,13 +300,13 @@ impl std::ops::Deref for BlobStore { } } -pub struct EncryptedExternalStorage { +pub struct EncryptedExternalStorage { pub key_manager: Arc, - pub storage: Box, + pub storage: S, } #[async_trait] -impl ExternalStorage for EncryptedExternalStorage { +impl ExternalStorage for EncryptedExternalStorage { fn name(&self) -> &'static str { self.storage.name() } @@ -323,13 +316,13 @@ impl ExternalStorage for EncryptedExternalStorage { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> { self.storage.write(name, reader, content_length).await } - 
fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.storage.read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.storage.read_part(name, off, len) } - fn restore( + async fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, @@ -353,19 +346,19 @@ impl ExternalStorage for EncryptedExternalStorage { compression_reader_dispatcher(compression_type, inner)? }; - let file_writer: &mut dyn Write = - &mut self.key_manager.create_file_for_write(restore_name)?; + let file_writer = self.key_manager.create_file_for_write(&restore_name)?; let min_read_speed: usize = 8192; let mut input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( + read_external_storage_into_file( &mut input, file_writer, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -383,11 +376,11 @@ impl ExternalStorage for BlobStore { .await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).get(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { (**self).get_part(name, off, len) } } diff --git a/components/external_storage/src/dylib_client.rs b/components/external_storage/src/dylib_client.rs index 6d6dc35cf8a..9e2748c2011 100644 --- a/components/external_storage/src/dylib_client.rs +++ b/components/external_storage/src/dylib_client.rs @@ -92,7 +92,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/grpc_client.rs b/components/external_storage/src/grpc_client.rs index 3d715dfcd47..e836d8fb58a 100644 
--- a/components/external_storage/src/grpc_client.rs +++ b/components/external_storage/src/grpc_client.rs @@ -95,7 +95,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index a9fa65dcdcf..17556490320 100644 --- a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -7,7 +7,7 @@ use tokio::{io as async_io, process::Command}; use tokio_util::compat::FuturesAsyncReadCompatExt; use url::Url; -use crate::{ExternalStorage, UnpinReader}; +use crate::{ExternalData, ExternalStorage, UnpinReader}; /// Convert `hdfs:///path` to `/path` fn try_convert_to_path(url: &Url) -> &str { @@ -131,16 +131,11 @@ impl ExternalStorage for HdfsStorage { } } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } - fn read_part( - &self, - _name: &str, - _off: u64, - _len: u64, - ) -> Box { + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } } diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index e1c57608197..c344f09968b 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -26,7 +26,7 @@ use futures_util::AsyncReadExt; use kvproto::brpb::CompressionType; use openssl::hash::{Hasher, MessageDigest}; use tikv_util::{ - stream::{block_on_external_io, READ_BUF_SIZE}, + stream::READ_BUF_SIZE, time::{Instant, Limiter}, }; use tokio::time::timeout; @@ -58,6 +58,8 @@ pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) { /// signature of write.) 
see https://github.com/rust-lang/rust/issues/63033 pub struct UnpinReader(pub Box); +pub type ExternalData<'a> = Box; + #[derive(Debug, Default)] pub struct BackendConfig { pub s3_multi_part_size: usize, @@ -73,10 +75,10 @@ pub struct RestoreConfig { } /// a reader dispatcher for different compression type. -pub fn compression_reader_dispatcher<'a>( +pub fn compression_reader_dispatcher( compression_type: Option, - inner: Box, -) -> io::Result> { + inner: ExternalData<'_>, +) -> io::Result> { match compression_type { Some(c) => match c { // The log files generated from TiKV v6.2.0 use the default value (0). @@ -107,13 +109,13 @@ pub trait ExternalStorage: 'static + Send + Sync { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn read(&self, name: &str) -> Box; + fn read(&self, name: &str) -> ExternalData<'_>; /// Read part of contents of the given path. - fn read_part(&self, name: &str, off: u64, len: u64) -> Box; + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_>; /// Read from external storage and restore to the given path - fn restore( + async fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, @@ -137,22 +139,23 @@ pub trait ExternalStorage: 'static + Send + Sync { compression_reader_dispatcher(compression_type, inner)? }; - let output: &mut dyn Write = &mut File::create(restore_name)?; + let output = File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. // if reading speed is slower than this rate, we will stop with // a "TimedOut" error. // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) 
let min_read_speed: usize = 8192; - let mut input = encrypt_wrap_reader(file_crypter, reader)?; + let input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( - &mut input, + read_external_storage_into_file( + input, output, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -170,13 +173,32 @@ impl ExternalStorage for Arc { (**self).write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { (**self).read_part(name, off, len) } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } #[async_trait] @@ -193,21 +215,40 @@ impl ExternalStorage for Box { self.as_ref().write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.as_ref().read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.as_ref().read_part(name, off, len) } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } /// Wrap the reader with file_crypter. /// Return the reader directly if file_crypter is None. 
-pub fn encrypt_wrap_reader<'a>( +pub fn encrypt_wrap_reader( file_crypter: Option, - reader: Box, -) -> io::Result> { + reader: ExternalData<'_>, +) -> io::Result> { let input = match file_crypter { Some(x) => Box::new(DecrypterReader::new( reader, @@ -221,14 +262,18 @@ pub fn encrypt_wrap_reader<'a>( Ok(input) } -pub async fn read_external_storage_into_file( - input: &mut (dyn AsyncRead + Unpin), - output: &mut dyn Write, +pub async fn read_external_storage_into_file( + mut input: In, + mut output: Out, speed_limiter: &Limiter, expected_length: u64, expected_sha256: Option>, min_read_speed: usize, -) -> io::Result<()> { +) -> io::Result<()> +where + In: AsyncRead + Unpin, + Out: Write, +{ let dur = Duration::from_secs((READ_BUF_SIZE / min_read_speed) as u64); // do the I/O copy from external_storage to the local file. diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index 4b22de96a6a..0bf6be65107 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -3,14 +3,12 @@ use std::{ fs::File as StdFile, io::{self, BufReader, Read, Seek}, - marker::Unpin, path::{Path, PathBuf}, sync::Arc, }; use async_trait::async_trait; use futures::io::AllowStdIo; -use futures_io::AsyncRead; use futures_util::stream::TryStreamExt; use rand::Rng; use tikv_util::stream::error_stream; @@ -119,7 +117,7 @@ impl ExternalStorage for LocalStorage { self.base_dir.sync_all().await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> crate::ExternalData<'_> { debug!("read file from local storage"; "name" => %name, "base" => %self.base.display()); // We used std i/o here for removing the requirement of tokio reactor when @@ -131,7 +129,7 @@ impl ExternalStorage for LocalStorage { } } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> crate::ExternalData<'_> { debug!("read part of file from local storage"; "name" => 
%name, "off" => %off, "len" => %len, "base" => %self.base.display()); diff --git a/components/external_storage/src/noop.rs b/components/external_storage/src/noop.rs index 42746742624..50e9c43c7bc 100644 --- a/components/external_storage/src/noop.rs +++ b/components/external_storage/src/noop.rs @@ -1,14 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::marker::Unpin; - use async_trait::async_trait; -use futures_io::AsyncRead; use tokio::io; use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use super::ExternalStorage; -use crate::UnpinReader; +use crate::{ExternalData, UnpinReader}; /// A storage saves files into void. /// It is mainly for test use. @@ -44,11 +41,11 @@ impl ExternalStorage for NoopStorage { Ok(()) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { Box::new(io::empty().compat()) } - fn read_part(&self, _name: &str, _off: u64, _len: u64) -> Box { + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { Box::new(io::empty().compat()) } } diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 6b5fbd9127f..0bba773418b 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -31,6 +31,7 @@ lazy_static = "1.3" log_wrappers = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } +rand = "0.8" serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/sst_importer/src/caching/cache_map.rs b/components/sst_importer/src/caching/cache_map.rs new file mode 100644 index 00000000000..e88e5c3545d --- /dev/null +++ b/components/sst_importer/src/caching/cache_map.rs @@ -0,0 +1,211 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::{mapref::entry::Entry, DashMap}; +use futures::Future; + +use crate::metrics::EXT_STORAGE_CACHE_COUNT; + +#[derive(Clone, Default)] +pub struct CacheMap(Arc>); + +impl CacheMap { + #[cfg(test)] + pub fn with_inner(inner: CacheMapInner) -> Self { + Self(Arc::new(inner)) + } +} + +pub trait ShareOwned { + type Shared: 'static; + + fn share_owned(&self) -> Self::Shared; +} + +impl ShareOwned for T { + type Shared = T; + + fn share_owned(&self) -> Self::Shared { + *self + } +} + +pub trait MakeCache: 'static { + type Cached: std::fmt::Debug + ShareOwned + Send + Sync + 'static; + type Error; + + fn make_cache(&self) -> std::result::Result; +} + +#[derive(Debug)] +pub struct CacheMapInner { + cached: DashMap>, + now: AtomicUsize, + + gc_threshold: usize, +} + +impl Default for CacheMapInner { + fn default() -> Self { + Self { + cached: DashMap::default(), + now: Default::default(), + gc_threshold: 20, + } + } +} + +impl CacheMapInner { + #[cfg(test)] + pub fn with_gc_threshold(n: usize) -> Self { + Self { + gc_threshold: n, + ..Self::default() + } + } +} + +#[derive(Debug)] +struct Cached { + resource: R, + last_used: usize, +} + +impl Cached { + fn new(resource: R) -> Self { + Self { + resource, + last_used: 0, + } + } + + fn resource_owned(&mut self, now: usize) -> ::Shared { + self.last_used = now; + self.resource.share_owned() + } +} + +impl CacheMapInner { + fn now(&self) -> usize { + self.now.load(Ordering::SeqCst) + } + + fn tick(&self) { + let now = self.now.fetch_add(1usize, Ordering::SeqCst); + self.cached.retain(|name, cache| { + let need_hold = now.saturating_sub(cache.last_used) < self.gc_threshold; + if !need_hold { + info!("Removing cache due to expired."; "name" => %name, "entry" => ?cache); + } + need_hold + }); + } +} + +impl CacheMap { + pub fn gc_loop(&self) -> impl Future + Send + 'static { + let this = Arc::downgrade(&self.0); + async move { 
+ loop { + tokio::time::sleep(Duration::from_secs(30)).await; + match this.upgrade() { + Some(inner) => inner.tick(), + None => return, + } + } + } + } + + pub fn cached_or_create( + &self, + cache_key: &str, + backend: &M, + ) -> std::result::Result<::Shared, M::Error> { + let s = self.0.cached.get_mut(cache_key); + match s { + Some(mut s) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(s.value_mut().resource_owned(self.0.now())) + } + None => { + drop(s); + let e = self.0.cached.entry(cache_key.to_owned()); + match e { + Entry::Occupied(mut v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(v.get_mut().resource_owned(self.0.now())) + } + Entry::Vacant(v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["miss"]).inc(); + let pool = backend.make_cache()?; + info!("Insert storage cache."; "name" => %cache_key, "cached" => ?pool); + let shared = pool.share_owned(); + v.insert(Cached::new(pool)); + Ok(shared) + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + convert::Infallible, + sync::atomic::{AtomicBool, Ordering}, + }; + + use super::{CacheMap, CacheMapInner, MakeCache}; + + #[derive(Default)] + struct CacheChecker(AtomicBool); + + impl MakeCache for CacheChecker { + type Cached = (); + type Error = Infallible; + + fn make_cache(&self) -> std::result::Result { + self.0.store(true, Ordering::SeqCst); + Ok(()) + } + } + + impl CacheChecker { + fn made_cache(&self) -> bool { + self.0.load(Ordering::SeqCst) + } + } + + #[test] + fn test_basic() { + let cached = CacheMapInner::with_gc_threshold(1); + let cached = CacheMap::with_inner(cached); + + let check_cache = |key, should_make_cache: bool| { + let c = CacheChecker::default(); + cached.cached_or_create(key, &c).unwrap(); + assert_eq!(c.made_cache(), should_make_cache); + }; + + check_cache("hello", true); + check_cache("hello", false); + check_cache("world", true); + + cached.0.tick(); + check_cache("hello", false); + + cached.0.tick(); + 
check_cache("world", true); + + cached.0.tick(); + check_cache("hello", true); + } +} diff --git a/components/sst_importer/src/caching/mod.rs b/components/sst_importer/src/caching/mod.rs new file mode 100644 index 00000000000..9e55717c601 --- /dev/null +++ b/components/sst_importer/src/caching/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub mod cache_map; +pub mod storage_cache; diff --git a/components/sst_importer/src/caching/storage_cache.rs b/components/sst_importer/src/caching/storage_cache.rs new file mode 100644 index 00000000000..23732545b92 --- /dev/null +++ b/components/sst_importer/src/caching/storage_cache.rs @@ -0,0 +1,58 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use external_storage_export::ExternalStorage; +use kvproto::brpb::StorageBackend; + +use super::cache_map::{MakeCache, ShareOwned}; +use crate::{Error, Result}; + +impl ShareOwned for StoragePool { + type Shared = Arc; + + fn share_owned(&self) -> Self::Shared { + self.get() + } +} + +impl MakeCache for StorageBackend { + type Cached = StoragePool; + type Error = Error; + + fn make_cache(&self) -> Result { + StoragePool::create(self, 16) + } +} + +pub struct StoragePool(Box<[Arc]>); + +impl StoragePool { + fn create(backend: &StorageBackend, size: usize) -> Result { + let mut r = Vec::with_capacity(size); + for _ in 0..size { + let s = external_storage_export::create_storage(backend, Default::default())?; + r.push(Arc::from(s)); + } + Ok(Self(r.into_boxed_slice())) + } + + fn get(&self) -> Arc { + use rand::Rng; + let idx = rand::thread_rng().gen_range(0..self.0.len()); + Arc::clone(&self.0[idx]) + } +} + +impl std::fmt::Debug for StoragePool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let url = self + .get() + .url() + .map(|u| u.to_string()) + .unwrap_or_else(|_| "".to_owned()); + f.debug_tuple("StoragePool") + .field(&format_args!("{}", url)) + .finish() + } 
+} diff --git a/components/sst_importer/src/import_mode.rs b/components/sst_importer/src/import_mode.rs index 0e793e2bc2b..5f5b5d1060e 100644 --- a/components/sst_importer/src/import_mode.rs +++ b/components/sst_importer/src/import_mode.rs @@ -9,10 +9,10 @@ use std::{ }; use engine_traits::{CfOptions, DbOptions, KvEngine}; -use futures::executor::ThreadPool; use futures_util::compat::Future01CompatExt; use kvproto::import_sstpb::*; use tikv_util::timer::GLOBAL_TIMER_HANDLE; +use tokio::runtime::Handle; use super::{Config, Result}; @@ -88,7 +88,7 @@ impl ImportModeSwitcher { ImportModeSwitcher { inner, is_import } } - pub fn start(&self, executor: &ThreadPool, db: E) { + pub fn start(&self, executor: &Handle, db: E) { // spawn a background future to put TiKV back into normal mode after timeout let inner = self.inner.clone(); let switcher = Arc::downgrade(&inner); @@ -117,7 +117,7 @@ impl ImportModeSwitcher { } } }; - executor.spawn_ok(timer_loop); + executor.spawn(timer_loop); } pub fn enter_normal_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { @@ -243,7 +243,6 @@ mod tests { use std::thread; use engine_traits::{KvEngine, CF_DEFAULT}; - use futures::executor::ThreadPoolBuilder; use tempfile::Builder; use test_sst_importer::{new_test_engine, new_test_engine_with_options}; use tikv_util::config::ReadableDuration; @@ -306,14 +305,13 @@ mod tests { fn mf(_cf: &str, _name: &str, _v: f64) {} let cfg = Config::default(); - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); assert!(switcher.enter_import_mode(&db, mf).unwrap()); check_import_options(&db, &import_db_options, &import_cf_options); @@ -344,19 +342,20 
@@ mod tests { import_mode_timeout: ReadableDuration::millis(300), ..Config::default() }; - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); switcher.enter_import_mode(&db, mf).unwrap(); check_import_options(&db, &import_db_options, &import_cf_options); thread::sleep(Duration::from_secs(1)); + threads.block_on(tokio::task::yield_now()); check_import_options(&db, &normal_db_options, &normal_cf_options); } diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index ec0222d416a..4d25201253a 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -19,6 +19,7 @@ mod sst_writer; mod util; #[macro_use] pub mod import_mode; +mod caching; pub mod metrics; pub mod sst_importer; diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index cd14f6feb56..e7eeefd3e82 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -101,4 +101,9 @@ lazy_static! 
{ "Bucketed histogram of importer apply count", &["type"] ).unwrap(); + pub static ref EXT_STORAGE_CACHE_COUNT: IntCounterVec = register_int_counter_vec!( + "tikv_import_storage_cache", + "The operations over storage cache", + &["operation"] + ).unwrap(); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index c024bca8e6d..3e06eb76899 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -22,9 +22,10 @@ use engine_traits::{ IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; -use external_storage_export::{compression_reader_dispatcher, encrypt_wrap_reader, RestoreConfig}; +use external_storage_export::{ + compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, +}; use file_system::{get_io_rate_limiter, OpenOptions}; -use futures::executor::ThreadPool; use kvproto::{ brpb::{CipherInfo, StorageBackend}, import_sstpb::*, @@ -37,16 +38,31 @@ use tikv_util::{ sys::SysQuota, time::{Instant, Limiter}, }; +use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ + caching::cache_map::CacheMap, import_file::{ImportDir, ImportFile}, import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, sst_writer::{RawSstWriter, TxnSstWriter}, - Config, Error, Result, + util, Config, Error, Result, }; +#[derive(Default, Debug, Clone)] +pub struct DownloadExt<'a> { + cache_key: Option<&'a str>, +} + +impl<'a> DownloadExt<'a> { + pub fn cache_key(self, key: &'a str) -> Self { + Self { + cache_key: Some(key), + } + } +} + #[derive(Clone, PartialEq, Debug)] pub enum CacheKvFile { Mem(Arc>), @@ -81,6 +97,9 @@ pub struct SstImporter { // TODO: lift api_version as a type parameter. 
api_version: ApiVersion, compression_types: HashMap, + + cached_storage: CacheMap, + download_rt: Runtime, file_locks: Arc>, mem_use: AtomicU64, mem_limit: ReadableSize, @@ -94,6 +113,11 @@ impl SstImporter { api_version: ApiVersion, ) -> Result { let switcher = ImportModeSwitcher::new(cfg); + let cached_storage = CacheMap::default(); + let download_rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + download_rt.spawn(cached_storage.gc_loop()); let memory_limit = (SysQuota::memory_limit_in_bytes() as f64) * cfg.memory_use_ratio; info!("sst importer memory limit when apply"; "size" => ?memory_limit); @@ -105,6 +129,8 @@ impl SstImporter { api_version, compression_types: HashMap::with_capacity(2), file_locks: Arc::new(DashMap::default()), + cached_storage, + download_rt, mem_use: AtomicU64::new(0), mem_limit: ReadableSize(memory_limit as u64), }) @@ -122,7 +148,7 @@ impl SstImporter { } } - pub fn start_switch_mode_check(&self, executor: &ThreadPool, db: E) { + pub fn start_switch_mode_check(&self, executor: &Handle, db: E) { self.switcher.start(executor, db); } @@ -216,7 +242,7 @@ impl SstImporter { // // This method returns the *inclusive* key range (`[start, end]`) of SST // file created, or returns None if the SST is empty. 
- pub fn download( + pub async fn download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -225,6 +251,7 @@ impl SstImporter { crypter: Option, speed_limiter: Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { debug!("download start"; "meta" => ?meta, @@ -233,7 +260,7 @@ impl SstImporter { "rewrite_rule" => ?rewrite_rule, "speed_limit" => speed_limiter.speed_limit(), ); - match self.do_download::( + let r = self.do_download_ext::( meta, backend, name, @@ -241,7 +268,9 @@ impl SstImporter { crypter, &speed_limiter, engine, - ) { + ext, + ); + match r.await { Ok(r) => { info!("download"; "meta" => ?meta, "name" => name, "range" => ?r); Ok(r) @@ -274,6 +303,49 @@ impl SstImporter { support_kms: bool, speed_limiter: &Limiter, restore_config: external_storage_export::RestoreConfig, + ) -> Result<()> { + self.download_rt + .block_on(self.async_download_file_from_external_storage( + file_length, + src_file_name, + dst_file, + backend, + support_kms, + speed_limiter, + "", + restore_config, + )) + } + + /// Create an external storage by the backend, and cache it with the key. + /// If the cache exists, return it directly. + pub fn external_storage_or_cache( + &self, + backend: &StorageBackend, + cache_id: &str, + ) -> Result> { + // prepare to download the file from the external_storage + // TODO: pass a config to support hdfs + let ext_storage = if cache_id.is_empty() { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["skip"]).inc(); + let s = external_storage_export::create_storage(backend, Default::default())?; + Arc::from(s) + } else { + self.cached_storage.cached_or_create(cache_id, backend)? 
+ }; + Ok(ext_storage) + } + + async fn async_download_file_from_external_storage( + &self, + file_length: u64, + src_file_name: &str, + dst_file: std::path::PathBuf, + backend: &StorageBackend, + support_kms: bool, + speed_limiter: &Limiter, + cache_key: &str, + restore_config: external_storage_export::RestoreConfig, ) -> Result<()> { let start_read = Instant::now(); if let Some(p) = dst_file.parent() { @@ -285,34 +357,22 @@ impl SstImporter { } })?; } - // prepare to download the file from the external_storage - // TODO: pass a config to support hdfs - let ext_storage = external_storage_export::create_storage(backend, Default::default())?; - let url = ext_storage.url()?.to_string(); - let ext_storage: Box = if support_kms { - if let Some(key_manager) = &self.key_manager { - Box::new(external_storage_export::EncryptedExternalStorage { - key_manager: (*key_manager).clone(), - storage: ext_storage, - }) as _ - } else { - ext_storage as _ - } - } else { - ext_storage as _ - }; + let ext_storage = self.external_storage_or_cache(backend, cache_key)?; + let ext_storage = self.wrap_kms(ext_storage, support_kms); - let result = ext_storage.restore( - src_file_name, - dst_file.clone(), - file_length, - speed_limiter, - restore_config, - ); + let result = ext_storage + .restore( + src_file_name, + dst_file.clone(), + file_length, + speed_limiter, + restore_config, + ) + .await; IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); result.map_err(|e| Error::CannotReadExternalStorage { - url: url.to_string(), + url: util::url_for(&ext_storage), name: src_file_name.to_owned(), local_path: dst_file.clone(), err: e, @@ -329,7 +389,7 @@ impl SstImporter { debug!("downloaded file succeed"; "name" => src_file_name, - "url" => %url, + "url" => %util::url_for(&ext_storage), ); Ok(()) } @@ -476,26 +536,24 @@ impl SstImporter { Ok(lock.0.clone()) } - pub fn create_external_storage( + pub fn wrap_kms( &self, - backend: &StorageBackend, + ext_storage: Arc, support_kms: bool, - ) -> Result> 
{ - let ext_storage = external_storage_export::create_storage(backend, Default::default())?; + ) -> Arc { // kv-files needn't are decrypted with KMS when download currently because these // files are not encrypted when log-backup. It is different from // sst-files because sst-files is encrypted when saved with rocksdb env // with KMS. to do: support KMS when log-backup and restore point. - let ext_storage = match (support_kms, self.key_manager.clone()) { + match (support_kms, self.key_manager.clone()) { (true, Some(key_manager)) => { - Box::new(external_storage_export::EncryptedExternalStorage { + Arc::new(external_storage_export::EncryptedExternalStorage { key_manager, storage: ext_storage, }) } _ => ext_storage, - }; - Ok(ext_storage) + } } fn read_kv_files_from_external_storage( @@ -771,7 +829,31 @@ impl SstImporter { } } - fn do_download( + // raw download, without ext, compatibility to old tests. + #[cfg(test)] + fn download( + &self, + meta: &SstMeta, + backend: &StorageBackend, + name: &str, + rewrite_rule: &RewriteRule, + crypter: Option, + speed_limiter: Limiter, + engine: E, + ) -> Result> { + self.download_rt.block_on(self.download_ext( + meta, + backend, + name, + rewrite_rule, + crypter, + speed_limiter, + engine, + DownloadExt::default(), + )) + } + + async fn do_download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -780,6 +862,7 @@ impl SstImporter { crypter: Option, speed_limiter: &Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { let path = self.dir.join(meta)?; @@ -794,15 +877,17 @@ impl SstImporter { ..Default::default() }; - self.download_file_from_external_storage( + self.async_download_file_from_external_storage( meta.length, name, path.temp.clone(), backend, true, speed_limiter, + ext.cache_key.unwrap_or(""), restore_config, - )?; + ) + .await?; // now validate the SST file. 
let env = get_env(self.key_manager.clone(), get_io_rate_limiter())?; @@ -1628,8 +1713,11 @@ mod tests { ) .unwrap(); let ext_storage = { - let inner = importer.create_external_storage(&backend, false).unwrap(); - Arc::new(inner) + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + inner }; // test do_read_kv_file() @@ -1681,7 +1769,10 @@ mod tests { ) .unwrap(); let ext_storage = { - let inner = importer.create_external_storage(&backend, false).unwrap(); + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); Arc::new(inner) }; @@ -1743,8 +1834,10 @@ mod tests { SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1).unwrap(); let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); let ext_storage = { - let inner = importer.create_external_storage(&backend, false).unwrap(); - Arc::new(inner) + importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ) }; let path = importer .dir diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index dce63314073..501061e92c0 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -4,6 +4,7 @@ use std::path::Path; use encryption::DataKeyManager; use engine_traits::EncryptionKeyManager; +use external_storage_export::ExternalStorage; use file_system::File; use super::Result; @@ -64,6 +65,13 @@ pub fn prepare_sst_for_ingestion, Q: AsRef>( Ok(()) } +pub fn url_for(storage: &E) -> String { + storage + .url() + .map(|url| url.to_string()) + .unwrap_or_else(|err| format!("ErrUrl({})", err)) +} + #[cfg(test)] mod tests { use std::{path::Path, sync::Arc}; diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 2bf0226136f..9d45052fea9 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,20 +5,13 @@ use std::{ future::Future, path::PathBuf, sync::{Arc, Mutex}, - 
thread::sleep, time::Duration, }; use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{ - executor::{ThreadPool, ThreadPoolBuilder}, - future::join_all, - sink::SinkExt, - stream::TryStreamExt, - TryFutureExt, -}; +use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -34,13 +27,17 @@ use raftstore::{ router::RaftStoreRouter, store::{Callback, RaftCmdExtraOpts, RegionSnapshot}, }; -use sst_importer::{error_inc, metrics::*, sst_meta_to_path, Config, Error, Result, SstImporter}; +use sst_importer::{ + error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, Error, Result, + SstImporter, +}; use tikv_util::{ config::ReadableSize, future::{create_stream_with_buffer, paired_future_callback}, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; +use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; @@ -58,7 +55,7 @@ where cfg: Config, engine: E, router: Router, - threads: ThreadPool, + threads: Arc, importer: Arc, limiter: Limiter, task_slots: Arc>>, @@ -83,25 +80,25 @@ where importer: Arc, ) -> ImportSstService { let props = tikv_util::thread_group::current_properties(); - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads + 1) - .name_prefix("sst-importer") + let threads = tokio::runtime::Builder::new_multi_thread() + .worker_threads(cfg.num_threads) + .enable_all() + .thread_name("sst-importer") .after_start_wrapper(move || { tikv_util::thread_group::set_properties(props.clone()); tikv_alloc::add_thread_memory_accessor(); set_io_type(IoType::Import); }) .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) - .create() + .build() .unwrap(); - importer.start_switch_mode_check(&threads, engine.clone()); - let importer_clone = 
importer.clone(); - threads.spawn_ok(async { Self::tick(importer_clone) }); + importer.start_switch_mode_check(threads.handle(), engine.clone()); + threads.spawn(Self::tick(importer.clone())); ImportSstService { cfg, engine, - threads, + threads: Arc::new(threads), router, importer, limiter: Limiter::new(f64::INFINITY), @@ -110,9 +107,9 @@ where } } - fn tick(importer: Arc) { + async fn tick(importer: Arc) { loop { - sleep(Duration::from_secs(10)); + sleep(Duration::from_secs(10)).await; importer.shrink_by_tick(); } } @@ -311,8 +308,8 @@ macro_rules! impl_write { $crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(buf_driver); - self.threads.spawn_ok(handle_task); + self.threads.spawn(buf_driver); + self.threads.spawn(handle_task); } }; } @@ -395,8 +392,8 @@ where crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(buf_driver); - self.threads.spawn_ok(handle_task); + self.threads.spawn(buf_driver); + self.threads.spawn(handle_task); } // clear_files the KV files after apply finished. 
@@ -431,7 +428,7 @@ where let resp = Ok(resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } // Downloads KV file and performs key-rewrite then apply kv into this tikv @@ -475,9 +472,14 @@ where let mut req_write_size = 0_u64; let mut range: Option = None; let ext_storage = { - let inner = - importer.create_external_storage(req.get_storage_backend(), false)?; - Arc::from(inner) + let inner = importer.wrap_kms( + importer.external_storage_or_cache( + req.get_storage_backend(), + req.get_storage_cache_id(), + )?, + false, + ); + inner }; for (i, meta) in metas.iter().enumerate() { @@ -594,7 +596,7 @@ where debug!("finished apply kv file with {:?}", resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Downloads the file and performs key-rewrite for later ingesting. @@ -627,7 +629,7 @@ where .into_option() .filter(|c| c.cipher_type != EncryptionMethod::Plaintext); - let res = importer.download::( + let res = importer.download_ext::( req.get_sst(), req.get_storage_backend(), req.get_name(), @@ -635,9 +637,10 @@ where cipher, limiter, engine, + DownloadExt::default().cache_key(req.get_storage_cache_id()), ); let mut resp = DownloadResponse::default(); - match res { + match res.await { Ok(range) => match range { Some(r) => resp.set_range(r), None => resp.set_is_empty(true), @@ -648,7 +651,7 @@ where crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Ingest the file by sending a raft command to raftstore. @@ -694,7 +697,7 @@ where Self::release_lock(&task_slots, &meta).unwrap(); crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Ingest multiple files by sending a raft command to raftstore. 
@@ -745,7 +748,7 @@ where } crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } fn compact( @@ -794,7 +797,7 @@ where crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } fn set_download_speed_limit( @@ -885,7 +888,7 @@ where } let _ = sink.close().await; }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } impl_write!(write, WriteRequest, WriteResponse, Chunk, new_txn_writer); From 8d9698f82bbd9a0a1c4ace0dffe1fadcd37df07e Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 1 Dec 2022 19:38:02 +0800 Subject: [PATCH 379/676] storage: Unify the flashback reader and fix the start_key bug (#13860) close pingcap/tiflash#6379, ref tikv/tikv#13800, close tikv/tikv#13861 SnapshotReader typically uses its own start_ts for something. Since it doesn't need the start_ts of SnapshotReader itself, we can unify the reader into MvccReader. And the start key from the client is actually a range, which is used to limit the upper bound of this flashback when scanning data, so it may not be a real key. 
Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- components/cdc/tests/integrations/test_cdc.rs | 2 +- components/cdc/tests/mod.rs | 2 + src/storage/mvcc/reader/reader.rs | 2 +- .../txn/actions/flashback_to_version.rs | 234 ++++++++++++------ .../txn/commands/flashback_to_version.rs | 16 +- .../flashback_to_version_read_phase.rs | 76 ++++-- tests/integrations/server/kv_service.rs | 32 ++- 7 files changed, 249 insertions(+), 115 deletions(-) diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index b9c285406d4..73f46fe6427 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -2533,7 +2533,7 @@ fn test_flashback() { let (start_key, end_key) = (b"key0".to_vec(), b"key2".to_vec()); // Prepare flashback. let flashback_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); - suite.must_kv_prepare_flashback(region_id, &start_key, flashback_start_ts); + suite.must_kv_prepare_flashback(region_id, &start_key, &end_key, flashback_start_ts); // resolved ts should not be advanced anymore. 
let mut counter = 0; let mut last_resolved_ts = 0; diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 87619deb92b..77e50bb10b2 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -586,11 +586,13 @@ impl TestSuite { &mut self, region_id: u64, start_key: &[u8], + end_key: &[u8], start_ts: TimeStamp, ) { let mut prepare_flashback_req = PrepareFlashbackToVersionRequest::default(); prepare_flashback_req.set_context(self.get_context(region_id)); prepare_flashback_req.set_start_key(start_key.to_vec()); + prepare_flashback_req.set_end_key(end_key.to_vec()); prepare_flashback_req.set_start_ts(start_ts.into_inner()); let prepare_flashback_resp = self .get_tikv_client(region_id) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 0ada3a12d5d..4847dbb8428 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -206,7 +206,7 @@ impl MvccReader { } /// load the value associated with `key` and pointed by `write` - fn load_data(&mut self, key: &Key, write: Write) -> Result { + pub fn load_data(&mut self, key: &Key, write: Write) -> Result { assert_eq!(write.write_type, WriteType::Put); if let Some(val) = write.short_value { return Ok(val); diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index e719ca24a26..4b05c8eef8f 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -7,7 +7,7 @@ use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, txn::{actions::check_txn_status::rollback_lock, Result as TxnResult}, - Snapshot, Statistics, + Snapshot, }; pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for multiple batches */; @@ -16,7 +16,6 @@ pub fn flashback_to_version_read_lock( reader: &mut MvccReader, next_lock_key: 
Key, end_key: &Key, - statistics: &mut Statistics, ) -> TxnResult> { let result = reader.scan_locks( Some(&next_lock_key), @@ -24,7 +23,6 @@ pub fn flashback_to_version_read_lock( |_| true, FLASHBACK_BATCH_SIZE, ); - statistics.add(&reader.statistics); let (key_locks, _) = result?; Ok(key_locks) } @@ -36,7 +34,6 @@ pub fn flashback_to_version_read_write( end_key: &Key, flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, - statistics: &mut Statistics, ) -> TxnResult> { // Filter out the SST that does not have a newer version than // `flashback_version` in `CF_WRITE`, i.e, whose latest `commit_ts` <= @@ -51,7 +48,7 @@ pub fn flashback_to_version_read_write( |key, latest_commit_ts| { // There is no any other write could happen after the flashback begins. assert!(latest_commit_ts <= flashback_commit_ts); - // - Skip the `start_key`. + // - Skip the `start_key` which as prewrite key. // - No need to find an old version for the key if its latest `commit_ts` is // smaller than or equal to the flashback version. // - No need to flashback a key twice if its latest `commit_ts` is equal to the @@ -62,7 +59,6 @@ pub fn flashback_to_version_read_write( }, FLASHBACK_BATCH_SIZE, ); - statistics.add(&reader.statistics); let (keys, _) = keys_result?; Ok(keys) } @@ -71,9 +67,10 @@ pub fn flashback_to_version_read_write( // `CF_LOCK`. pub fn rollback_locks( txn: &mut MvccTxn, - reader: &mut SnapshotReader, + snapshot: impl Snapshot, key_locks: Vec<(Key, Lock)>, ) -> TxnResult> { + let mut reader = SnapshotReader::new(txn.start_ts, snapshot, false); for (key, lock) in key_locks { if txn.write_size() >= MAX_TXN_WRITE_SIZE { return Ok(Some(key)); @@ -82,7 +79,7 @@ pub fn rollback_locks( reader.start_ts = lock.ts; rollback_lock( txn, - reader, + &mut reader, key.clone(), &lock, lock.is_pessimistic_txn(), @@ -102,7 +99,7 @@ pub fn rollback_locks( // and `self.start_ts`. 
pub fn flashback_to_version_write( txn: &mut MvccTxn, - reader: &mut SnapshotReader, + reader: &mut MvccReader, keys: Vec, flashback_version: TimeStamp, flashback_start_ts: TimeStamp, @@ -122,7 +119,7 @@ pub fn flashback_to_version_write( if txn.write_size() >= MAX_TXN_WRITE_SIZE { return Ok(Some(key.clone())); } - let old_write = reader.get_write(&key, flashback_version)?; + let old_write = reader.get_write(&key, flashback_version, None)?; let new_write = if let Some(old_write) = old_write { // If it's a `WriteType::Put` without the short value, we should put the old // value in `CF_DEFAULT` with `self.start_ts` as well. @@ -152,19 +149,19 @@ pub fn flashback_to_version_write( // transaction. pub fn prewrite_flashback_key( txn: &mut MvccTxn, - reader: &mut SnapshotReader, + reader: &mut MvccReader, key_to_lock: &Key, flashback_version: TimeStamp, flashback_start_ts: TimeStamp, ) -> TxnResult<()> { - let old_write = reader.get_write(key_to_lock, flashback_version)?; + let old_write = reader.get_write(key_to_lock, flashback_version, None)?; // Flashback the value in `CF_DEFAULT` as well if the old write is a // `WriteType::Put` without the short value. if let Some(old_write) = old_write.as_ref() { if old_write.write_type == WriteType::Put && old_write.short_value.is_none() // If the value with `flashback_start_ts` already exists, we don't need to write again. 
- && reader.reader.get_value(key_to_lock, flashback_start_ts)?.is_none() + && reader.get_value(key_to_lock, flashback_start_ts)?.is_none() { txn.put_value( key_to_lock.clone(), @@ -197,7 +194,7 @@ pub fn prewrite_flashback_key( pub fn commit_flashback_key( txn: &mut MvccTxn, - reader: &mut SnapshotReader, + reader: &mut MvccReader, key_to_commit: &Key, flashback_start_ts: TimeStamp, flashback_commit_ts: TimeStamp, @@ -225,6 +222,16 @@ pub fn commit_flashback_key( Ok(()) } +pub fn get_first_user_key( + reader: &mut MvccReader, + start_key: &Key, + end_key: &Key, +) -> TxnResult> { + let (mut keys_result, _) = + reader.scan_latest_user_keys(Some(start_key), Some(end_key), |_, _| true, 1)?; + Ok(keys_result.pop()) +} + #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; @@ -249,23 +256,17 @@ pub mod tests { fn must_rollback_lock( engine: &mut E, key: &[u8], - version: impl Into, start_ts: impl Into, ) -> usize { let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); - let (version, start_ts) = (version.into(), start_ts.into()); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); - let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let mut statistics = Statistics::default(); - let key_locks = - flashback_to_version_read_lock(&mut reader, key, &next_key, &mut statistics).unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &ctx); + let key_locks = flashback_to_version_read_lock(&mut reader, key, &next_key).unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); - let mut txn = MvccTxn::new(start_ts, cm); - let snapshot = engine.snapshot(Default::default()).unwrap(); - let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - rollback_locks(&mut txn, &mut snap_reader, key_locks).unwrap(); + let mut txn = MvccTxn::new(start_ts.into(), cm); + rollback_locks(&mut txn, 
snapshot, key_locks).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows @@ -282,21 +283,22 @@ pub mod tests { let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); let ctx = Context::default(); - let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - prewrite_flashback_key( - &mut txn, - &mut snap_reader, - &Key::from_raw(key), - version, - start_ts, - ) - .unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let prewrite_key = if let Some(first_key) = + get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")).unwrap() + { + first_key + } else { + // If the key is None return directly + return 0; + }; + prewrite_flashback_key(&mut txn, &mut reader, &prewrite_key, version, start_ts).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows } - fn must_flashback_to_version( + fn must_flashback_write_to_version( engine: &mut E, key: &[u8], version: impl Into, @@ -309,7 +311,6 @@ pub mod tests { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let mut statistics = Statistics::default(); // Flashback the writes. 
let keys = flashback_to_version_read_write( &mut reader, @@ -318,29 +319,41 @@ pub mod tests { &next_key, version, commit_ts, - &mut statistics, ) .unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); + flashback_to_version_write(&mut txn, &mut reader, keys, version, start_ts, commit_ts) + .unwrap(); + let rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_commit_flashback_key( + engine: &mut E, + key: &[u8], + start_ts: impl Into, + commit_ts: impl Into, + ) -> usize { + let (start_ts, commit_ts) = (start_ts.into(), commit_ts.into()); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); - let mut snap_reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - flashback_to_version_write( - &mut txn, - &mut snap_reader, - keys, - version, - start_ts, - commit_ts, - ) - .unwrap(); + let ctx = Context::default(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let key_to_lock = + get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")) + .unwrap() + .unwrap(); + commit_flashback_key(&mut txn, &mut reader, &key_to_lock, start_ts, commit_ts).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows } #[test] - fn test_flashback_to_version() { + fn test_flashback_write_to_version() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let k = b"k"; @@ -368,50 +381,50 @@ pub mod tests { must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 1 with start_ts = 14, commit_ts = 15. 
assert_eq!( - must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 2 with start_ts = 17, commit_ts = 18. assert_eq!( - must_flashback_to_version(&mut engine, k, 2, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 2, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 5 with start_ts = 20, commit_ts = 21. assert_eq!( - must_flashback_to_version(&mut engine, k, 5, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 5, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 7 with start_ts = 23, commit_ts = 24. assert_eq!( - must_flashback_to_version(&mut engine, k, 7, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 7, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 10 with start_ts = 26, commit_ts = 27. assert_eq!( - must_flashback_to_version(&mut engine, k, 10, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 10, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 13 with start_ts = 29, commit_ts = 30. assert_eq!( - must_flashback_to_version(&mut engine, k, 13, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 13, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 27 with start_ts = 32, commit_ts = 33. 
assert_eq!( - must_flashback_to_version(&mut engine, k, 27, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 27, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); } #[test] - fn test_flashback_to_version_deleted() { + fn test_flashback_write_to_version_deleted() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", b"v"); @@ -423,14 +436,14 @@ pub mod tests { // Though the key has been deleted, flashback to version 1 still needs to write // a new `WriteType::Delete` with the flashback `commit_ts`. assert_eq!( - must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, ts); } #[test] - fn test_flashback_to_version_pessimistic() { + fn test_flashback_write_to_version_pessimistic() { let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; let (v1, v2, v3) = (b"v1", b"v2", b"v3"); @@ -447,8 +460,11 @@ pub mod tests { // Flashback to version 17 with start_ts = 35, commit_ts = 40. // Distinguish from pessimistic start_ts 30 to make sure rollback ts is by lock // ts. - assert_eq!(must_rollback_lock(&mut engine, k, 17, 35), 2); - assert_eq!(must_flashback_to_version(&mut engine, k, 17, 35, 40), 1); + assert_eq!(must_rollback_lock(&mut engine, k, 35), 2); + assert_eq!( + must_flashback_write_to_version(&mut engine, k, 17, 35, 40), + 1 + ); // Pessimistic Prewrite Put(k -> v3) with stat_ts = 30 will be error with // Rollback. 
@@ -457,7 +473,7 @@ pub mod tests { } #[test] - fn test_duplicated_flashback_to_version() { + fn test_duplicated_flashback_write_to_version() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", b"v"); @@ -467,14 +483,14 @@ pub mod tests { let start_ts = *ts.incr(); let commit_ts = *ts.incr(); assert_eq!( - must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + must_flashback_write_to_version(&mut engine, k, 1, start_ts, commit_ts), 1 ); must_get_none(&mut engine, k, ts); // Flashback again with the same `start_ts` and `commit_ts` should not do // anything. assert_eq!( - must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + must_flashback_write_to_version(&mut engine, k, 1, start_ts, commit_ts), 0 ); } @@ -490,34 +506,106 @@ pub mod tests { let flashback_start_ts = *ts.incr(); // Rollback nothing. - assert_eq!( - must_rollback_lock(&mut engine, k, ts, flashback_start_ts), - 0 - ); + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); // Lock and write the value of `k`. assert_eq!( must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), 2 ); + // Retry Prepare // Unlock `k`, put rollback record and delete the value of `k`. - assert_eq!( - must_rollback_lock(&mut engine, k, ts, flashback_start_ts), - 3 - ); + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 3); // Lock and write the value of `k`. assert_eq!( must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), 2 ); + // Retry Prepare // Only unlock `k` since there is an overlapped rollback record. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 1); + // Only lock `k` since the value of `k` has already existed. assert_eq!( - must_rollback_lock(&mut engine, k, ts, flashback_start_ts), + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), 1 ); - // Only lock `k` since the value of `k` has already existed. 
+ } + + #[test] + fn test_prewrite_with_special_key() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut ts = TimeStamp::zero(); + let (prewrite_key, prewrite_val) = (b"b", b"val"); + must_prewrite_put( + &mut engine, + prewrite_key, + prewrite_val, + prewrite_key, + *ts.incr(), + ); + must_commit(&mut engine, prewrite_key, ts, *ts.incr()); + must_get(&mut engine, prewrite_key, ts, prewrite_val); + let (k, v1, v2) = (b"c", b"v1", b"v2"); + must_prewrite_put(&mut engine, k, v1, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, v2); + // Check for prewrite key b"b". + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let first_key = get_first_user_key(&mut reader, &Key::from_raw(b""), &Key::from_raw(b"z")) + .unwrap_or_else(|_| Some(Key::from_raw(b""))) + .unwrap(); + assert_eq!(first_key, Key::from_raw(prewrite_key)); + + // case 1: start key is before all keys, flashback b"c". + let start_key = b"a"; + let (flashback_start_ts, flashback_commit_ts) = (*ts.incr(), *ts.incr()); + // Rollback nothing. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); + // Prewrite "prewrite_key" not "start_key". assert_eq!( - must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + 1 + ); + // Flashback (b"c", v2) to (b"c", v1). + assert_eq!( + must_flashback_write_to_version( + &mut engine, + k, + 4, + flashback_start_ts, + flashback_commit_ts + ), 1 ); + // Put prewrite record and Unlock, will commit "prewrite_key" not "start_key". 
+ assert_eq!( + must_commit_flashback_key( + &mut engine, + start_key, + flashback_start_ts, + flashback_commit_ts + ), + 2 + ); + must_get(&mut engine, k, ts, v1); + must_get(&mut engine, prewrite_key, ts, prewrite_val); + + // case 2: start key is after all keys, prewrite will return None. + let start_key = b"d"; + let flashback_start_ts = *ts.incr(); + // Rollback nothing. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); + // Prewrite null. + assert_eq!( + must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + 0 + ); + // case 3: start key is valid, end_key is invalid, prewrite key will be None. + let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), &Key::from_raw(b"")) + .unwrap_or_else(|_| Some(Key::from_raw(b""))); + assert_eq!(first_key, None); } } diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index a1936cee647..13de0c9b183 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -3,12 +3,13 @@ // #[PerformanceCriticalPath] use std::mem; +use tikv_kv::ScanMode; use txn_types::{Key, TimeStamp}; use crate::storage::{ kv::WriteData, lock_manager::LockManager, - mvcc::{MvccTxn, SnapshotReader}, + mvcc::{MvccReader, MvccTxn}, txn::{ actions::flashback_to_version::{ commit_flashback_key, flashback_to_version_write, prewrite_flashback_key, @@ -16,8 +17,7 @@ use crate::storage::{ }, commands::{ Command, CommandExt, FlashbackToVersionReadPhase, FlashbackToVersionState, - ReaderWithStats, ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, - WriteContext, WriteResult, + ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, }, latch, Result, }, @@ -71,10 +71,8 @@ impl CommandExt for FlashbackToVersion { impl WriteCommand for FlashbackToVersion { fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { - let mut 
reader = ReaderWithStats::new( - SnapshotReader::new_with_ctx(self.version, snapshot, &self.ctx), - context.statistics, - ); + let mut reader = + MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &self.ctx); let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); match self.state { FlashbackToVersionState::RollbackLock { @@ -82,12 +80,11 @@ impl WriteCommand for FlashbackToVersion { ref mut key_locks, } => { if let Some(new_next_lock_key) = - rollback_locks(&mut txn, &mut reader, mem::take(key_locks))? + rollback_locks(&mut txn, snapshot, mem::take(key_locks))? { *next_lock_key = new_next_lock_key; } } - // TODO: add some test cases for the special prewrite key. FlashbackToVersionState::Prewrite { ref key_to_lock } => prewrite_flashback_key( &mut txn, &mut reader, @@ -126,6 +123,7 @@ impl WriteCommand for FlashbackToVersion { if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. }) { write_data.extra.one_pc = true; } + context.statistics.add(&reader.statistics); Ok(WriteResult { ctx: self.ctx.clone(), to_be_write: write_data, diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index d27225a9bf7..9ac5014b7f3 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -6,6 +6,7 @@ use txn_types::{Key, Lock, TimeStamp}; use crate::storage::{ mvcc::MvccReader, txn::{ + actions::flashback_to_version::get_first_user_key, commands::{ Command, CommandExt, FlashbackToVersion, ProcessResult, ReadCommand, TypedCommand, }, @@ -122,19 +123,31 @@ impl ReadCommand for FlashbackToVersionReadPhase { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { let tag = self.tag().get_str(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); + let mut start_key = self.start_key.clone(); let next_state = match 
self.state { FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { - let mut key_locks = flashback_to_version_read_lock( - &mut reader, - next_lock_key, - &self.end_key, - statistics, - )?; + let mut key_locks = + flashback_to_version_read_lock(&mut reader, next_lock_key, &self.end_key)?; if key_locks.is_empty() { - // No more locks to rollback, continue to the prewrite phase. - FlashbackToVersionState::Prewrite { - key_to_lock: self.start_key.clone(), - } + // - No more locks to rollback, continue to the Prewrite Phase. + // - The start key from the client is actually a range which is used to limit + // the upper bound of this flashback when scanning data, so it may not be a + // real key. In the Prewrite Phase, we make sure that the start key is a real + // key and take this key as a lock for the 2pc. So When overwriting the write, + // we skip the immediate write of this key and instead put it after the + // completion of the 2pc. + // - To make sure the key locked in the latch is the same as the actual key + // written, we pass it to the key in `process_write' after getting it. + let key_to_lock = if let Some(first_key) = + get_first_user_key(&mut reader, &self.start_key, &self.end_key)? + { + first_key + } else { + // If the key is None return directly + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + }; + FlashbackToVersionState::Prewrite { key_to_lock } } else { tls_collect_keyread_histogram_vec(tag, key_locks.len() as f64); FlashbackToVersionState::RollbackLock { @@ -147,31 +160,53 @@ impl ReadCommand for FlashbackToVersionReadPhase { } } } - FlashbackToVersionState::FlashbackWrite { next_write_key, .. } => { + FlashbackToVersionState::FlashbackWrite { + mut next_write_key, .. 
+ } => { if self.commit_ts <= self.start_ts { return Err(Error::from(ErrorInner::InvalidTxnTso { start_ts: self.start_ts, commit_ts: self.commit_ts, })); } - // If the key is not locked, it means that the key has been committed before and - // we are in a retry. - if next_write_key == self.start_key && reader.load_lock(&next_write_key)?.is_none() - { - return Ok(ProcessResult::Res); + if next_write_key == self.start_key { + // The start key from the client is actually a range which is used to limit the + // upper bound of this flashback when scanning data, so it may not be a real + // key. In the Prewrite Phase, we make sure that the start + // key is a real key and take this key as a lock for the + // 2pc. So When overwriting the write, we skip the immediate + // write of this key and instead put it after the completion + // of the 2pc. + next_write_key = if let Some(first_key) = + get_first_user_key(&mut reader, &self.start_key, &self.end_key)? + { + first_key + } else { + // If the key is None return directly + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + }; + // Commit key needs to match the Prewrite key, which is set as the first user + // key. + start_key = next_write_key.clone(); + // If the key is not locked, it means that the key has been committed before and + // we are in a retry. 
+ if reader.load_lock(&next_write_key)?.is_none() { + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + } } let mut keys = flashback_to_version_read_write( &mut reader, next_write_key, - &self.start_key, + &start_key, &self.end_key, self.version, self.commit_ts, - statistics, )?; if keys.is_empty() { FlashbackToVersionState::Commit { - key_to_commit: self.start_key.clone(), + key_to_commit: start_key.clone(), } } else { tls_collect_keyread_histogram_vec(tag, keys.len() as f64); @@ -189,6 +224,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { } _ => unreachable!(), }; + statistics.add(&reader.statistics); Ok(ProcessResult::NextCommand { cmd: Command::FlashbackToVersion(FlashbackToVersion { ctx: self.ctx, @@ -196,7 +232,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { start_ts: self.start_ts, commit_ts: self.commit_ts, version: self.version, - start_key: self.start_key, + start_key, end_key: self.end_key, state: next_state, }), diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index effe9698f30..12cff74861d 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -572,11 +572,11 @@ fn test_mvcc_flashback_failed_after_first_batch() { must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); fail::remove("flashback_skip_1_key_in_write"); fail::remove("flashback_failed_after_first_batch"); - // skip for key@0 + // skip for key@1 must_kv_read_equal( &client, ctx.clone(), - format!("key@{}", from_u32(0_u32).unwrap()) + format!("key@{}", from_u32(1_u32).unwrap()) .as_bytes() .to_vec(), b"value@1".to_vec(), @@ -586,7 +586,7 @@ fn test_mvcc_flashback_failed_after_first_batch() { must_kv_read_equal( &client, ctx.clone(), - format!("key@{}", from_u32(1_u32).unwrap()) + format!("key@{}", from_u32(2_u32).unwrap()) .as_bytes() .to_vec(), b"value@0".to_vec(), @@ -596,7 +596,7 @@ fn test_mvcc_flashback_failed_after_first_batch() { 
must_kv_read_equal( &client, ctx.clone(), - format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32 - 1).unwrap()) + format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32).unwrap()) .as_bytes() .to_vec(), b"value@1".to_vec(), @@ -606,11 +606,11 @@ fn test_mvcc_flashback_failed_after_first_batch() { fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); fail::remove("flashback_failed_after_first_batch"); - // key@0 must be flahsbacked in the second batch firstly. + // key@1 must be flahsbacked in the second batch firstly. must_kv_read_equal( &client, ctx.clone(), - format!("key@{}", from_u32(0_u32).unwrap()) + format!("key@{}", from_u32(1_u32).unwrap()) .as_bytes() .to_vec(), b"value@0".to_vec(), @@ -619,19 +619,19 @@ fn test_mvcc_flashback_failed_after_first_batch() { must_kv_read_equal( &client, ctx.clone(), - format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32 - 1).unwrap()) + format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32).unwrap()) .as_bytes() .to_vec(), b"value@0".to_vec(), ts + 2, ); - // 2 * (FLASHBACK_BATCH_SIZE - 1) - 1 keys are flashbacked. + // 2 * (FLASHBACK_BATCH_SIZE - 1) keys are flashbacked. must_kv_read_equal( &client, ctx.clone(), format!( "key@{}", - from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 3).unwrap() + from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 2).unwrap() ) .as_bytes() .to_vec(), @@ -646,16 +646,26 @@ fn test_mvcc_flashback_failed_after_first_batch() { // Subsequent batches of writes are flashbacked. must_kv_read_equal( &client, - ctx, + ctx.clone(), format!( "key@{}", - from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 3).unwrap() + from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 2).unwrap() ) .as_bytes() .to_vec(), b"value@0".to_vec(), ts, ); + // key@0 which used as prewrite lock also need to be flahsbacked. 
+ must_kv_read_equal( + &client, + ctx, + format!("key@{}", from_u32(0_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts, + ); } #[test] From a4a4a43692b09b157b4d5cae1f2471ff57573ed4 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Thu, 1 Dec 2022 22:28:02 +0800 Subject: [PATCH 380/676] scheduler/lock_manager: Handle the corner case that resumable pessimstic lock request is pushed to queue after cancelling (#13865) close tikv/tikv#13298 scheduler/lock_manager: Handle the corner case that resumable pessimstic lock request is pushed to queue after cancelling. When a lock-waiting request is woken up and continues its execution, in some cases it's possible that it encounters other transaction's lock agian. In this case, the entry will be put to the lock waiting queue again. However, there might be problem when LockManager tries to cancel the request (due to timeout or other possible errors. This PR handles this case. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- src/server/lock_manager/mod.rs | 8 +- src/server/lock_manager/waiter_manager.rs | 43 +--- src/storage/lock_manager/lock_wait_context.rs | 203 +++++++++++++++--- .../lock_manager/lock_waiting_queue.rs | 86 ++++++-- src/storage/lock_manager/mod.rs | 25 ++- src/storage/mod.rs | 99 +++++---- .../acquire_pessimistic_lock_resumed.rs | 35 ++- src/storage/txn/commands/mod.rs | 10 +- src/storage/txn/scheduler.rs | 47 ++-- src/storage/types.rs | 1 - tests/failpoints/cases/test_storage.rs | 166 +++++++++++++- 11 files changed, 553 insertions(+), 170 deletions(-) diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index ae60467124b..243d533a0e5 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -37,10 +37,10 @@ use crate::{ }, storage::{ lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockManager as LockManagerTrait, LockWaitToken, - UpdateWaitForEvent, WaitTimeout, + 
CancellationCallback, DiagnosticContext, KeyLockWaitInfo, + LockManager as LockManagerTrait, LockWaitToken, UpdateWaitForEvent, WaitTimeout, }, - DynamicConfigs as StorageDynamicConfigs, Error as StorageError, + DynamicConfigs as StorageDynamicConfigs, }, }; @@ -248,7 +248,7 @@ impl LockManagerTrait for LockManager { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { let timeout = match timeout { diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 8cce7bc5da6..467580645d3 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -27,13 +27,12 @@ use tikv_util::{ }; use tokio::task::spawn_local; use tracker::GLOBAL_TRACKERS; -use txn_types::Key; use super::{config::Config, deadlock::Scheduler as DetectorScheduler, metrics::*}; use crate::storage::{ lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, - WaitTimeout, + CancellationCallback, DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, + UpdateWaitForEvent, WaitTimeout, }, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, TimeStamp}, txn::Error as TxnError, @@ -107,9 +106,6 @@ pub type Callback = Box) + Send>; #[allow(clippy::large_enum_variant)] pub enum Task { - SetKeyWakeUpDelayCallback { - cb: Box, - }, WaitFor { token: LockWaitToken, region_id: u64, @@ -119,7 +115,7 @@ pub enum Task { start_ts: TimeStamp, wait_info: KeyLockWaitInfo, timeout: WaitTimeout, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, start_waiting_time: Instant, }, @@ -158,9 +154,6 @@ impl Debug for Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Task::SetKeyWakeUpDelayCallback { .. 
} => { - write!(f, "setting key wake up delay callback") - } Task::WaitFor { token, start_ts, @@ -206,7 +199,7 @@ pub(crate) struct Waiter { // term: u64, pub(crate) start_ts: TimeStamp, pub(crate) wait_info: KeyLockWaitInfo, - pub(crate) cancel_callback: Box, + pub(crate) cancel_callback: CancellationCallback, pub diag_ctx: DiagnosticContext, delay: Delay, start_waiting_time: Instant, @@ -219,7 +212,7 @@ impl Waiter { _term: u64, start_ts: TimeStamp, wait_info: KeyLockWaitInfo, - cancel_callback: Box, + cancel_callback: CancellationCallback, deadline: Instant, diag_ctx: DiagnosticContext, start_waiting_time: Instant, @@ -280,7 +273,7 @@ impl Waiter { pub(super) fn cancel_no_timeout( wait_info: KeyLockWaitInfo, - cancel_callback: Box, + cancel_callback: CancellationCallback, ) { let lock_info = wait_info.lock_info; let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); @@ -311,8 +304,6 @@ struct WaitTable { wait_table: HashMap<(u64, TimeStamp), LockWaitToken>, waiter_pool: HashMap, waiter_count: Arc, - - wake_up_key_delay_callback: Option>, } impl WaitTable { @@ -321,17 +312,9 @@ impl WaitTable { wait_table: HashMap::default(), waiter_pool: HashMap::default(), waiter_count, - wake_up_key_delay_callback: None, } } - fn set_wake_up_key_delay_callback( - &mut self, - cb: Option>, - ) { - self.wake_up_key_delay_callback = cb; - } - #[cfg(test)] fn count(&self) -> usize { self.waiter_pool.len() @@ -430,7 +413,7 @@ impl Scheduler { start_ts: TimeStamp, wait_info: KeyLockWaitInfo, timeout: WaitTimeout, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { self.notify_scheduler(Task::WaitFor { @@ -447,13 +430,6 @@ impl Scheduler { }); } - pub fn set_key_wake_up_delay_callback( - &self, - cb: Box, - ) { - self.notify_scheduler(Task::SetKeyWakeUpDelayCallback { cb }); - } - pub fn remove_lock_wait(&self, token: LockWaitToken) { self.notify_scheduler(Task::RemoveLockWait { token }); } @@ -610,11 +586,6 @@ impl 
WaiterManager { impl FutureRunnable for WaiterManager { fn run(&mut self, task: Task) { match task { - Task::SetKeyWakeUpDelayCallback { cb } => { - self.wait_table - .borrow_mut() - .set_wake_up_key_delay_callback(Some(cb)); - } Task::WaitFor { token, region_id, diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index b8aaa7f1927..32c99867a3f 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -11,21 +11,30 @@ //! of a single `AcquirePessimisticLock` request, and ensuring the internal //! callback for returning response through RPC is called at most only once. -use std::{convert::TryInto, result::Result, sync::Arc}; +use std::{ + convert::TryInto, + result::Result, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc, Arc, + }, +}; use parking_lot::Mutex; use txn_types::Key; use crate::storage::{ errors::SharedError, - lock_manager::{ - lock_waiting_queue::{LockWaitQueues, PessimisticLockKeyCallback}, - LockManager, LockWaitToken, - }, + lock_manager::{lock_waiting_queue::LockWaitQueues, LockManager, LockWaitToken}, types::PessimisticLockKeyResult, Error as StorageError, PessimisticLockResults, ProcessResult, StorageCallback, }; +// The arguments are: (result, is_canceled_before_enqueueing). +pub type PessimisticLockKeyCallback = + Box, bool) + Send + 'static>; +pub type CancellationCallback = Box; + pub struct LockWaitContextInner { /// The callback for finishing the current AcquirePessimisticLock request. /// Usually, requests are accepted from RPC, and in this case calling @@ -53,6 +62,123 @@ pub struct LockWaitContextSharedState { /// The key on which lock waiting occurs. key: Key, + + /// When a lock-waiting request (allow_lock_with_conflict == true) is + /// resumed, it's theoretically possible that the request meets lock + /// again, therefore it may need to be pushed to the lock waiting queue + /// again. 
Since the request is popped out from the queue when resuming + /// (which means the lock wait entry doesn't exist in the lock waiting + /// queue during the resumed execution), it's possible that timeout or + /// deadlock happens from `WaiterManager` during that time, which will + /// try to cancel the request. Therefore it leads to such a corner case: + /// + /// 1. (scheduler) A request enters lock waiting state, so an entry is + /// pushed to the `LockWaitQueues`, and a message is sent to + /// `LockManager`. + /// 2. (scheduler) After a while the entry is popped out and resumed + /// from the `LockWaitQueues`. + /// 3. (scheduler) The request resumes execution but still finds lock + /// on the key. + /// * This is possible to be caused by delayed-waking up or encountering + /// error when writing a lock-releasing command to the engine. + /// 4. (lock_manager) At the same time, `LockManager` tries to cancel + /// the request due to timeout. But when calling `finish_request`, + /// the entry cannot be found from the `LockWaitQueues`. So it + /// believes that the entry is already popped out and resumed and does + /// nothing. + /// 5. (scheduler) An entry is pushed to the `LockWaitQueues` due to + /// encountering lock at step 3. 6. Then the request becomes unable to + /// be canceled by timeout or other possible errors. In worst cases, + /// the request may stuck in TiKV forever. + /// + /// To solve this problem, a `is_canceled` flag should be set when + /// `LockManager` tries to cancel it, before accessing the + /// `LockWaitQueues`; when an entry is pushed to the `LockWaitQueues`, + /// check if `is_canceled` is set after locking its inner map (ensures + /// exclusive access with `LockManager`), and if it's set, cancel the + /// request like how `LockManager` should have done. + /// + /// The request should be canceled with the error that occurs in + /// `LockManager`. 
`external_error_tx` and `external_error_rx` are used + /// to pass this error in this case. + /// + /// `is_canceled` marks if the request is canceled from outside. Usually + /// this is caused by timeout or deadlock detected. When this flag is + /// marked true, the request must not be put into the lock waiting queue + /// since nobody will wake it up for timeout and it may stuck forever. + is_canceled: AtomicBool, + + /// The sender for passing errors in some cancellation cases. See comments + /// in [`is_canceled`](LockWaitContextSharedState::is_canceled) for details. + /// It's only possible to be used in `LockManager`, so there's no contention + /// on the mutex. + external_error_tx: Mutex>>, + + /// The sender for passing errors in some cancellation cases. See comments + /// in [`is_canceled`](LockWaitContextSharedState::is_canceled) for details. + /// It's only possible to be used when scheduler tries to push to + /// `LockWaitQueues`, so there's no contention on the mutex. + external_error_rx: Mutex>>, +} + +impl LockWaitContextSharedState { + fn new(lock_wait_token: LockWaitToken, key: Key, cb: StorageCallback) -> Self { + let inner = LockWaitContextInner { cb }; + let (tx, rx) = mpsc::channel(); + Self { + ctx_inner: Mutex::new(Some(inner)), + key, + lock_wait_token, + is_canceled: AtomicBool::new(false), + external_error_tx: Mutex::new(Some(tx)), + external_error_rx: Mutex::new(Some(rx)), + } + } + + #[cfg(test)] + pub fn new_dummy(lock_wait_token: LockWaitToken, key: Key) -> Self { + let (tx, rx) = mpsc::channel(); + Self { + ctx_inner: Mutex::new(None), + key, + lock_wait_token, + is_canceled: AtomicBool::new(false), + external_error_tx: Mutex::new(Some(tx)), + external_error_rx: Mutex::new(Some(rx)), + } + } + + pub fn is_canceled(&self) -> bool { + self.is_canceled.load(Ordering::Acquire) + } + + /// Gets the external error. It's assumed that the external error must have + /// been set and consumes it. 
This function is expected to be called at + /// most only once. Only used to handle the case that cancelling and + /// resuming happens concurrently. + pub(in crate::storage) fn get_external_error(&self) -> StorageError { + self.external_error_rx + .lock() + .take() + .unwrap() + .recv() + .unwrap() + } + + /// Stores the external error. This function is expected to be called at + /// most only once. Only used to handle the case that cancelling and + /// resuming happens concurrently. + fn put_external_error(&self, error: StorageError) { + if let Err(e) = self.external_error_tx.lock().take().unwrap().send(error) { + debug!("failed to set external error"; "err" => ?e); + } + } +} + +enum FinishRequestKind { + Executed, + Canceled, + CanceledBeforeEnqueueing, } #[derive(Clone)] @@ -70,13 +196,8 @@ impl LockWaitContext { cb: StorageCallback, allow_lock_with_conflict: bool, ) -> Self { - let inner = LockWaitContextInner { cb }; Self { - shared_states: Arc::new(LockWaitContextSharedState { - ctx_inner: Mutex::new(Some(inner)), - key, - lock_wait_token, - }), + shared_states: Arc::new(LockWaitContextSharedState::new(lock_wait_token, key, cb)), lock_wait_queues, allow_lock_with_conflict, } @@ -105,8 +226,13 @@ impl LockWaitContext { /// key. pub fn get_callback_for_blocked_key(&self) -> PessimisticLockKeyCallback { let ctx = self.clone(); - Box::new(move |res| { - ctx.finish_request(res, false); + Box::new(move |res, is_canceled_before_enqueueing| { + let kind = if is_canceled_before_enqueueing { + FinishRequestKind::CanceledBeforeEnqueueing + } else { + FinishRequestKind::Executed + }; + ctx.finish_request(res, kind); }) } @@ -118,31 +244,45 @@ impl LockWaitContext { /// This function is assumed to be called when the lock-waiting request is /// queueing but canceled outside, so it includes an operation to actively /// remove the entry from the lock waiting queue. 
- pub fn get_callback_for_cancellation(&self) -> impl FnOnce(StorageError) { + pub fn get_callback_for_cancellation(&self) -> CancellationCallback { let ctx = self.clone(); - move |e| { - ctx.finish_request(Err(e.into()), true); - } + Box::new(move |e| { + ctx.finish_request(Err(e.into()), FinishRequestKind::Canceled); + }) } fn finish_request( &self, result: Result, - is_canceling: bool, + finish_kind: FinishRequestKind, ) { - if is_canceling { - let entry = self - .lock_wait_queues - .remove_by_token(&self.shared_states.key, self.shared_states.lock_wait_token); - if entry.is_none() { - // Already popped out from the queue so that it will be woken up normally. Do - // nothing. - return; + match finish_kind { + FinishRequestKind::Executed => { + self.lock_wait_queues + .get_lock_mgr() + .remove_lock_wait(self.shared_states.lock_wait_token); + } + FinishRequestKind::Canceled => { + self.shared_states + .is_canceled + .store(true, Ordering::Release); + + let entry = self + .lock_wait_queues + .remove_by_token(&self.shared_states.key, self.shared_states.lock_wait_token); + if entry.is_none() { + // It's absent in the queue infers that it's already popped out from the queue + // so that it will be woken up normally. However + // it may still meet lock and tries to enter waiting state again. In such case, + // the request should be canceled. Store the error here so + // that it can be used for cancellation in that case, where + // there will be a `finish_request(None, false)` invocation). + self.shared_states + .put_external_error(result.unwrap_err().try_into().unwrap()); + return; + } } - } else { - self.lock_wait_queues - .get_lock_mgr() - .remove_lock_wait(self.shared_states.lock_wait_token); + FinishRequestKind::CanceledBeforeEnqueueing => {} } // When this is executed, the waiter is either woken up from the queue or @@ -243,7 +383,7 @@ mod tests { // Nothing happens currently. 
(ctx.get_callback_for_first_write_batch()).execute(ProcessResult::Res); rx.recv_timeout(Duration::from_millis(20)).unwrap_err(); - (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict()))); + (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict())), false); let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, @@ -270,6 +410,7 @@ mod tests { }, should_not_exist: false, lock_wait_token: token, + req_states: ctx.get_shared_states().clone(), legacy_wake_up_index: None, key_cb: None, }), diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 90a2c369cca..663c6729962 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -57,7 +57,6 @@ use std::{ future::Future, pin::Pin, - result::Result, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, @@ -65,7 +64,7 @@ use std::{ time::{Duration, Instant}, }; -use dashmap; +use dashmap::{self, mapref::entry::Entry as DashMapEntry}; use futures_util::compat::Future01CompatExt; use keyed_priority_queue::KeyedPriorityQueue; use kvproto::kvrpcpb; @@ -75,18 +74,17 @@ use tikv_util::{time::InstantExt, timer::GLOBAL_TIMER_HANDLE}; use txn_types::{Key, TimeStamp}; use crate::storage::{ - errors::SharedError, - lock_manager::{LockManager, LockWaitToken}, + lock_manager::{ + lock_wait_context::{LockWaitContextSharedState, PessimisticLockKeyCallback}, + LockManager, LockWaitToken, + }, metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, - txn::Error as TxnError, - types::{PessimisticLockKeyResult, PessimisticLockParameters}, - Error as StorageError, + txn::{Error as TxnError, ErrorInner as TxnErrorInner}, + types::PessimisticLockParameters, + Error as StorageError, ErrorInner as StorageErrorInner, }; -pub type CallbackWithSharedError = Box) + Send + 'static>; -pub type PessimisticLockKeyCallback = CallbackWithSharedError; - /// Represents an 
`AcquirePessimisticLock` request that's waiting for a lock, /// and contains the request's parameters. pub struct LockWaitEntry { @@ -97,6 +95,7 @@ pub struct LockWaitEntry { // Put it in a separated field. pub should_not_exist: bool, pub lock_wait_token: LockWaitToken, + pub req_states: Arc, pub legacy_wake_up_index: Option, pub key_cb: Option>, } @@ -248,15 +247,26 @@ impl LockWaitQueues { current_lock: kvrpcpb::LockInfo, ) { let mut new_key = false; - let mut key_state = self - .inner - .queue_map - .entry(lock_wait_entry.key.clone()) - .or_insert_with(|| { - new_key = true; - KeyLockWaitState::new() - }); - key_state.current_lock = current_lock; + + let map_entry = self.inner.queue_map.entry(lock_wait_entry.key.clone()); + + // If it's not the first time the request is put into the queue, the request + // might be canceled from outside when the entry is temporarily absent + // in the queue. In this case, the cancellation operation is not done. + // Do it here. For details about this corner case, see document of + // `LockWaitContext::is_canceled` field. 
+ if lock_wait_entry.req_states.is_canceled() { + self.on_push_canceled_entry(lock_wait_entry, map_entry); + return; + } + + let mut key_state = map_entry.or_insert_with(|| { + new_key = true; + KeyLockWaitState::new() + }); + if !current_lock.key.is_empty() { + key_state.current_lock = current_lock; + } if lock_wait_entry.legacy_wake_up_index.is_none() { lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); @@ -277,6 +287,32 @@ impl LockWaitQueues { } } + fn on_push_canceled_entry( + &self, + lock_wait_entry: Box, + key_state: DashMapEntry<'_, Key, KeyLockWaitState, impl std::hash::BuildHasher>, + ) { + let mut err = lock_wait_entry.req_states.get_external_error(); + + if let DashMapEntry::Occupied(key_state_entry) = key_state { + if let StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + MvccError(box MvccErrorInner::KeyIsLocked(lock_info)), + )))) = &mut err + { + // Update the lock info in the error to the latest if possible. + let latest_lock_info = &key_state_entry.get().current_lock; + if !latest_lock_info.key.is_empty() { + *lock_info = latest_lock_info.clone(); + } + } + } + + // `key_state` is dropped here, so the mutex in the queue map is released. + + let cb = lock_wait_entry.key_cb.unwrap().into_inner(); + cb(Err(err.into()), true); + } + /// Dequeues the head of the lock waiting queue of the specified key, /// assuming the popped entry will be woken up. /// @@ -437,6 +473,8 @@ impl LockWaitQueues { prev_delay_ms = current_delay_ms; } + fail_point!("lock_waiting_queue_before_delayed_notify_all"); + self.delayed_notify_all(&key, notify_id) } @@ -523,7 +561,7 @@ impl LockWaitQueues { reason: kvrpcpb::WriteConflictReason::PessimisticRetry, }, ))); - cb(Err(e.into())); + cb(Err(e.into()), false); } // Return the item to be woken up in resumable way. 
@@ -612,9 +650,10 @@ mod tests { use super::*; use crate::storage::{ + errors::SharedError, lock_manager::{lock_wait_context::LockWaitContext, MockLockManager, WaitTimeout}, txn::ErrorInner as TxnErrorInner, - ErrorInner as StorageErrorInner, StorageCallback, + ErrorInner as StorageErrorInner, PessimisticLockKeyResult, StorageCallback, }; struct TestLockWaitEntryHandle { @@ -703,8 +742,11 @@ mod tests { parameters, should_not_exist: false, lock_wait_token: token, + req_states: dummy_ctx.get_shared_states().clone(), legacy_wake_up_index: None, - key_cb: Some(SyncWrapper::new(Box::new(move |res| tx.send(res).unwrap()))), + key_cb: Some(SyncWrapper::new(Box::new(move |res, _| { + tx.send(res).unwrap() + }))), }); let cancel_callback = dummy_ctx.get_callback_for_cancellation(); diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 3ba9c7f7905..75b133a808f 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -9,12 +9,13 @@ use std::{ time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use kvproto::{kvrpcpb::LockInfo, metapb::RegionEpoch}; use parking_lot::Mutex; use tracker::TrackerToken; use txn_types::{Key, TimeStamp}; +pub use crate::storage::lock_manager::lock_wait_context::CancellationCallback; use crate::{ server::lock_manager::{waiter_manager, waiter_manager::Callback}, storage::{ @@ -147,7 +148,7 @@ pub trait LockManager: Clone + Send + Sync + 'static { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ); @@ -170,8 +171,7 @@ pub trait LockManager: Clone + Send + Sync + 'static { #[derive(Clone)] pub struct MockLockManager { allocated_token: Arc, - waiters: - Arc)>>>, + waiters: Arc>>, } impl MockLockManager { @@ -205,7 +205,7 @@ impl LockManager for MockLockManager { wait_info: KeyLockWaitInfo, _is_first_lock: bool, _timeout: Option, - cancel_callback: Box, + 
cancel_callback: CancellationCallback, _diag_ctx: DiagnosticContext, ) { self.waiters @@ -230,4 +230,19 @@ impl MockLockManager { cancel_callback(StorageError::from(TxnError::from(error))); } } + + pub fn simulate_timeout(&self, token: LockWaitToken) { + if let Some((wait_info, cancel_callback)) = self.waiters.lock().remove(&token) { + let error = MvccError::from(MvccErrorInner::KeyIsLocked(wait_info.lock_info)); + cancel_callback(StorageError::from(TxnError::from(error))); + } + } + + pub fn get_all_tokens(&self) -> HashSet { + self.waiters + .lock() + .iter() + .map(|(&token, _)| token) + .collect() + } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 32cd7c11000..caed0f57c91 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -66,7 +66,7 @@ use std::{ marker::PhantomData, mem, sync::{ - atomic::{self, AtomicBool, AtomicU64}, + atomic::{self, AtomicBool, AtomicU64, Ordering}, Arc, }, }; @@ -3102,6 +3102,12 @@ impl TestStorageBuilder { self } + pub fn wake_up_delay_duration(self, duration_ms: u64) -> Self { + self.wake_up_delay_duration_ms + .store(duration_ms, Ordering::Relaxed); + self + } + pub fn set_api_version(mut self, api_version: ApiVersion) -> Self { self.config.set_api_version(api_version); self @@ -3196,6 +3202,9 @@ pub mod test_util { }, }; + use futures_executor::block_on; + use kvproto::kvrpcpb::Op; + use super::*; use crate::storage::{ lock_manager::WaitTimeout, @@ -3506,6 +3515,46 @@ pub mod test_util { feature_gate.set_version(env!("CARGO_PKG_VERSION")).unwrap(); feature_gate } + + pub fn must_have_locks( + storage: &Storage, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], + ) { + let locks = block_on(storage.scan_lock( + Context::default(), + ts.into(), + Some(Key::from_raw(start_key)), + Some(Key::from_raw(end_key)), + 100, + )) + .unwrap(); + assert_eq!( + locks.len(), + expected_locks.len(), + "lock count not match, 
expected: {:?}; got: {:?}", + expected_locks, + locks + ); + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } + } } /// All statistics related to KvGet/KvBatchGet. @@ -3561,8 +3610,8 @@ mod tests { Error as KvError, ErrorInner as EngineErrorInner, ExpectedWrite, MockEngineBuilder, }, lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, - WaitTimeout, + CancellationCallback, DiagnosticContext, KeyLockWaitInfo, LockDigest, + LockWaitToken, UpdateWaitForEvent, WaitTimeout, }, mvcc::LockType, txn::{ @@ -8151,46 +8200,6 @@ mod tests { test_pessimistic_lock_impl(true); } - fn must_have_locks( - storage: &Storage, - ts: u64, - start_key: &[u8], - end_key: &[u8], - expected_locks: &[( - // key - &[u8], - Op, - // start_ts - u64, - // for_update_ts - u64, - )], - ) { - let locks = block_on(storage.scan_lock( - Context::default(), - ts.into(), - Some(Key::from_raw(start_key)), - Some(Key::from_raw(end_key)), - 100, - )) - .unwrap(); - assert_eq!( - locks.len(), - expected_locks.len(), - "lock count not match, expected: {:?}; got: {:?}", - expected_locks, - locks - ); - for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in - locks.into_iter().zip(expected_locks.iter()) - { - assert_eq!(lock_info.get_key(), *expected_key); - assert_eq!(lock_info.get_lock_type(), *expected_op); - assert_eq!(lock_info.get_lock_version(), *expected_start_ts); - assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); - } - } - fn test_pessimistic_lock_resumable_impl( pipelined_pessimistic_lock: bool, in_memory_lock: bool, @@ -8723,7 +8732,7 @@ mod tests { 
wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, }, RemoveLockWait { @@ -8763,7 +8772,7 @@ mod tests { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { self.tx diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index 3a35fe6d1a7..a66f8228755 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -1,11 +1,19 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + // #[PerformanceCriticalPath] use kvproto::kvrpcpb::ExtraOp; use txn_types::{insert_old_value_if_resolved, Key, OldValues}; use crate::storage::{ - lock_manager::{lock_waiting_queue::LockWaitEntry, LockManager, LockWaitToken}, + lock_manager::{ + lock_wait_context::LockWaitContextSharedState, lock_waiting_queue::LockWaitEntry, + LockManager, LockWaitToken, + }, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, SnapshotReader}, txn::{ acquire_pessimistic_lock, @@ -21,12 +29,23 @@ use crate::storage::{ Snapshot, }; -#[derive(Debug)] pub struct ResumedPessimisticLockItem { pub key: Key, pub should_not_exist: bool, pub params: PessimisticLockParameters, pub lock_wait_token: LockWaitToken, + pub req_states: Arc, +} + +impl Debug for ResumedPessimisticLockItem { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResumedPessimisticLockItem") + .field("key", &self.key) + .field("should_not_exist", &self.should_not_exist) + .field("params", &self.params) + .field("lock_wait_token", &self.lock_wait_token) + .finish() + } } command! 
{ @@ -61,6 +80,7 @@ impl CommandExt for AcquirePessimisticLockResumed { impl WriteCommand for AcquirePessimisticLockResumed { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { + fail_point!("acquire_pessimistic_lock_resumed_before_process_write"); let mut modifies = vec![]; let mut txn = None; let mut reader: Option> = None; @@ -79,6 +99,7 @@ impl WriteCommand for AcquirePessimisticLockR should_not_exist, params, lock_wait_token, + req_states, } = item; // TODO: Refine the code for rebuilding txn state. @@ -136,6 +157,7 @@ impl WriteCommand for AcquirePessimisticLockR let mut lock_info = WriteResultLockInfo::new(lock_info, params, key, should_not_exist); lock_info.lock_wait_token = lock_wait_token; + lock_info.req_states = Some(req_states); res.push(PessimisticLockKeyResult::Waiting); encountered_locks.push(lock_info); } @@ -185,6 +207,7 @@ impl AcquirePessimisticLockResumed { should_not_exist: item.should_not_exist, params: item.parameters, lock_wait_token: item.lock_wait_token, + req_states: item.req_states, } }) .collect(); @@ -304,16 +327,20 @@ mod tests { let key = Key::from_raw(key); let lock_hash = key.gen_hash(); + let token = LockWaitToken(Some(random())); + // The tests in this file doesn't need a valid req_state. Set a dummy value + // here. 
+ let req_states = Arc::new(LockWaitContextSharedState::new_dummy(token, key.clone())); let entry = LockWaitEntry { key, lock_hash, parameters, should_not_exist: false, - lock_wait_token: LockWaitToken(Some(random())), + lock_wait_token: token, legacy_wake_up_index: Some(0), + req_states, key_cb: None, }; - Box::new(entry) } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 4d3f32fa9cd..7d835462acf 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -30,6 +30,7 @@ use std::{ iter, marker::PhantomData, ops::{Deref, DerefMut}, + sync::Arc, }; pub use acquire_pessimistic_lock::AcquirePessimisticLock; @@ -63,7 +64,10 @@ use txn_types::{Key, TimeStamp, Value, Write}; use crate::storage::{ kv::WriteData, - lock_manager::{self, LockManager, LockWaitToken, WaitTimeout}, + lock_manager::{ + self, lock_wait_context::LockWaitContextSharedState, LockManager, LockWaitToken, + WaitTimeout, + }, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, txn::{latch, ProcessResult, Result}, @@ -428,6 +432,9 @@ pub struct WriteResultLockInfo { /// another lock again after resuming, this field will carry the token /// that was already allocated before. pub lock_wait_token: LockWaitToken, + /// For resumed pessimistic lock requests, this is needed to check if it's + /// canceled outside. 
+ pub req_states: Option>, } impl WriteResultLockInfo { @@ -450,6 +457,7 @@ impl WriteResultLockInfo { parameters, hash_for_latch, lock_wait_token: LockWaitToken(None), + req_states: None, } } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 4657decf66f..bfbb860e545 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -68,11 +68,8 @@ use crate::{ }, lock_manager::{ self, - lock_wait_context::LockWaitContext, - lock_waiting_queue::{ - CallbackWithSharedError, DelayedNotifyAllFuture, LockWaitEntry, LockWaitQueues, - PessimisticLockKeyCallback, - }, + lock_wait_context::{LockWaitContext, PessimisticLockKeyCallback}, + lock_waiting_queue::{DelayedNotifyAllFuture, LockWaitEntry, LockWaitQueues}, DiagnosticContext, LockManager, LockWaitToken, }, metrics::*, @@ -208,7 +205,7 @@ impl TaskContext { pub enum SchedulerTaskCallback { NormalRequestCallback(StorageCallback), - LockKeyCallbacks(Vec>), + LockKeyCallbacks(Vec), } impl SchedulerTaskCallback { @@ -220,13 +217,13 @@ impl SchedulerTaskCallback { | ProcessResult::PessimisticLockRes { res: Err(err) } => { let err = SharedError::from(err); for cb in cbs { - cb(Err(err.clone())); + cb(Err(err.clone()), false); } } ProcessResult::PessimisticLockRes { res: Ok(v) } => { assert_eq!(v.0.len(), cbs.len()); for (res, cb) in v.0.into_iter().zip(cbs) { - cb(Ok(res)) + cb(Ok(res), false) } } _ => unreachable!(), @@ -652,9 +649,9 @@ impl Scheduler { fn schedule_awakened_pessimistic_locks( &self, - cid: u64, + specified_cid: Option, + prepared_latches: Option, mut awakened_entries: SVec>, - latches: Lock, ) { let key_callbacks: Vec<_> = awakened_entries .iter_mut() @@ -665,10 +662,10 @@ impl Scheduler { // TODO: Make flow control take effect on this thing. 
self.schedule_command( - Some(cid), + specified_cid, cmd.into(), SchedulerTaskCallback::LockKeyCallbacks(key_callbacks), - Some(latches), + prepared_latches, ); } @@ -861,9 +858,9 @@ impl Scheduler { next_latches.force_assume_acquired(); self.schedule_awakened_pessimistic_locks( - next_cid, + Some(next_cid), + Some(next_latches), woken_up_resumable_lock_requests, - next_latches, ); } else { if !tctx.woken_up_resumable_lock_requests.is_empty() { @@ -929,7 +926,7 @@ impl Scheduler { wait_info, is_first_lock, wait_timeout, - Box::new(lock_req_ctx.get_callback_for_cancellation()), + lock_req_ctx.get_callback_for_cancellation(), diag_ctx, ); } @@ -1004,19 +1001,23 @@ impl Scheduler { reason: kvrpcpb::WriteConflictReason::PessimisticRetry, }, ))); - cb(Err(e.into())); + cb(Err(e.into()), false); } for f in delayed_wake_up_futures { + let self2 = self1.clone(); self1 .get_sched_pool(CommandPri::High) .pool .spawn(async move { let res = f.await; - // It returns only None currently. - // TODO: Handle not-none case when supporting resumable pessimistic lock - // requests. 
- assert!(res.is_none()); + if let Some(resumable_lock_wait_entry) = res { + self2.schedule_awakened_pessimistic_locks( + None, + None, + smallvec![resumable_lock_wait_entry], + ); + } }) .unwrap(); } @@ -1593,12 +1594,15 @@ impl Scheduler { task_ctx.cb = Some(SchedulerTaskCallback::NormalRequestCallback(first_batch_cb)); drop(slot); + assert!(lock_info.req_states.is_none()); + let lock_wait_entry = Box::new(LockWaitEntry { key: lock_info.key, lock_hash: lock_info.lock_digest.hash, parameters: lock_info.parameters, should_not_exist: lock_info.should_not_exist, lock_wait_token, + req_states: ctx.get_shared_states().clone(), legacy_wake_up_index: None, key_cb: Some(ctx.get_callback_for_blocked_key().into()), }); @@ -1617,6 +1621,9 @@ impl Scheduler { parameters: lock_info.parameters, should_not_exist: lock_info.should_not_exist, lock_wait_token: lock_info.lock_wait_token, + // This must be called after an execution fo AcquirePessimisticLockResumed, in which + // case there must be a valid req_state. 
+ req_states: lock_info.req_states.unwrap(), legacy_wake_up_index: None, key_cb: Some(cb.into()), }) diff --git a/src/storage/types.rs b/src/storage/types.rs index c7da00c9ace..b4e91811843 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -271,7 +271,6 @@ impl PessimisticLockKeyResult { assert!(matches!(self, Self::Waiting)); } - #[cfg(test)] pub fn unwrap_err(&self) -> SharedError { match self { Self::Failed(e) => e.clone(), diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index dd8f49bbde3..40c78dfabde 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -4,7 +4,7 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, mpsc::{channel, RecvTimeoutError}, - Arc, + Arc, Mutex, }, thread, time::Duration, @@ -512,6 +512,170 @@ fn test_pipelined_pessimistic_lock() { delete_pessimistic_lock(&storage, key, 60, 60); } +fn test_pessimistic_lock_resumable_blocked_twice_impl(canceled_when_resumed: bool) { + let lock_mgr = MockLockManager::new(); + let storage = TestStorageBuilderApiV1::new(lock_mgr.clone()) + .wake_up_delay_duration(100) + .build() + .unwrap(); + let (tx, rx) = channel(); + + let empty = PessimisticLockResults(vec![PessimisticLockKeyResult::Empty]); + + fail::cfg("lock_waiting_queue_before_delayed_notify_all", "pause").unwrap(); + let (first_resume_tx, first_resume_rx) = channel(); + let (first_resume_continue_tx, first_resume_continue_rx) = channel(); + let first_resume_tx = Mutex::new(first_resume_tx); + let first_resume_continue_rx = Mutex::new(first_resume_continue_rx); + fail::cfg_callback( + "acquire_pessimistic_lock_resumed_before_process_write", + move || { + // Notify that the failpoint is reached, and block until it receives a continue + // signal. + first_resume_tx.lock().unwrap().send(()).unwrap(); + first_resume_continue_rx.lock().unwrap().recv().unwrap(); + }, + ) + .unwrap(); + + let key = Key::from_raw(b"key"); + + // Lock the key. 
+ storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 10, 10, false, false), + expect_pessimistic_lock_res_callback(tx, empty.clone()), + ) + .unwrap(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + + // Another non-resumable request blocked. + let (tx_blocked_1, rx_blocked_1) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 11, 11, false, false), + expect_fail_callback(tx_blocked_1, 0, |e| match e { + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::WriteConflict { .. }, + ))))) => (), + e => panic!("unexpected error chain: {:?}", e), + }), + ) + .unwrap(); + rx_blocked_1 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(); + + let tokens_before = lock_mgr.get_all_tokens(); + // Another resumable request blocked, and is queued behind the above one. + let (tx_blocked_2, rx_blocked_2) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 12, 12, false, false) + .allow_lock_with_conflict(true), + if !canceled_when_resumed { + expect_pessimistic_lock_res_callback(tx_blocked_2, empty.clone()) + } else { + expect_value_with_checker_callback( + tx_blocked_2, + 0, + |res: storage::Result| { + let res = res.unwrap().0; + assert_eq!(res.len(), 1); + let e = res[0].unwrap_err(); + match e.inner() { + ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::KeyIsLocked(_), + )))) => (), + e => panic!("unexpected error chain: {:?}", e), + } + }, + ) + }, + ) + .unwrap(); + rx_blocked_2 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(); + // Find the lock wait token of the above request. 
+ let tokens_after = lock_mgr.get_all_tokens(); + let token_of_12 = { + use std::ops::Sub; + let diff = tokens_after.sub(&tokens_before); + assert_eq!(diff.len(), 1); + diff.into_iter().next().unwrap() + }; + + // Release the lock, so that the former (non-resumable) request will be woken + // up, and the other one (resumable) will be woken up after delaying for + // `wake_up_delay_duration`. + delete_pessimistic_lock(&storage, key.clone(), 10, 10); + rx_blocked_1.recv_timeout(Duration::from_secs(1)).unwrap(); + + // The key should be unlocked at this time. + must_have_locks(&storage, 100, b"", b"\xff\xff\xff", &[]); + + // Simulate the transaction at ts=11 retries the pessimistic lock request, and + // succeeds. + let (tx, rx) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 11, 11, false, false), + expect_pessimistic_lock_res_callback(tx, empty), + ) + .unwrap(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + + // Remove `pause` in delayed wake up, so that the request of txn 12 can be woken + // up. 
+ fail::remove("lock_waiting_queue_before_delayed_notify_all"); + first_resume_rx.recv().unwrap(); + + if canceled_when_resumed { + lock_mgr.simulate_timeout(token_of_12); + } + + fail::remove("acquire_pessimistic_lock_resumed_before_process_write"); + first_resume_continue_tx.send(()).unwrap(); + + if canceled_when_resumed { + rx_blocked_2.recv_timeout(Duration::from_secs(1)).unwrap(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 11, 11)], + ); + } else { + rx_blocked_2 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 11, 11)], + ); + delete_pessimistic_lock(&storage, key.clone(), 11, 11); + rx_blocked_2.recv_timeout(Duration::from_secs(1)).unwrap(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 12, 12)], + ); + } +} + +#[test] +fn test_pessimistic_lock_resumable_blocked_twice() { + test_pessimistic_lock_resumable_blocked_twice_impl(false); + test_pessimistic_lock_resumable_blocked_twice_impl(true); +} + #[test] fn test_async_commit_prewrite_with_stale_max_ts() { test_async_commit_prewrite_with_stale_max_ts_impl::(); From 7a3764fcb79c33f65303efc463cf4d0188fc7d0c Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 2 Dec 2022 10:50:01 +0800 Subject: [PATCH 381/676] pd-client: pd client version 2 (#13696) close tikv/tikv#13673 None Signed-off-by: tabokie Signed-off-by: Xinye Tao --- Cargo.lock | 8 +- Cargo.toml | 21 +- cmd/tikv-ctl/Cargo.toml | 2 +- components/api_version/Cargo.toml | 2 +- components/backup-stream/Cargo.toml | 6 +- components/backup/Cargo.toml | 4 +- components/causal_ts/Cargo.toml | 2 +- components/cdc/Cargo.toml | 2 +- components/cloud/Cargo.toml | 2 +- components/cloud/aws/Cargo.toml | 2 +- components/cloud/azure/Cargo.toml | 2 +- components/cloud/gcp/Cargo.toml | 2 +- 
components/concurrency_manager/Cargo.toml | 2 +- components/encryption/Cargo.toml | 2 +- components/encryption/export/Cargo.toml | 2 +- components/engine_panic/Cargo.toml | 2 +- components/engine_rocks/Cargo.toml | 2 +- components/engine_rocks_helper/Cargo.toml | 2 +- components/engine_tirocks/Cargo.toml | 2 +- components/engine_traits/Cargo.toml | 2 +- components/error_code/Cargo.toml | 2 +- components/external_storage/Cargo.toml | 2 +- components/external_storage/export/Cargo.toml | 2 +- components/into_other/Cargo.toml | 2 +- components/keys/Cargo.toml | 2 +- components/pd_client/Cargo.toml | 7 +- components/pd_client/src/client.rs | 39 +- components/pd_client/src/client_v2.rs | 1408 +++++++++++++++++ components/pd_client/src/lib.rs | 7 +- components/pd_client/src/tso.rs | 59 +- components/pd_client/src/util.rs | 64 +- components/raft_log_engine/Cargo.toml | 2 +- components/raftstore-v2/Cargo.toml | 2 +- components/raftstore/Cargo.toml | 4 +- components/resolved_ts/Cargo.toml | 2 +- components/resource_metering/Cargo.toml | 2 +- components/server/Cargo.toml | 4 +- components/sst_importer/Cargo.toml | 2 +- components/test_backup/Cargo.toml | 2 +- components/test_coprocessor/Cargo.toml | 4 +- components/test_pd/Cargo.toml | 2 +- components/test_pd/src/server.rs | 25 +- components/test_pd/src/util.rs | 21 +- components/test_pd_client/Cargo.toml | 4 +- components/test_raftstore/Cargo.toml | 4 +- components/test_sst_importer/Cargo.toml | 2 +- components/test_storage/Cargo.toml | 2 +- components/test_util/Cargo.toml | 2 +- components/tidb_query_aggr/Cargo.toml | 2 +- components/tidb_query_common/Cargo.toml | 4 +- components/tidb_query_datatype/Cargo.toml | 4 +- components/tidb_query_executors/Cargo.toml | 6 +- components/tidb_query_expr/Cargo.toml | 2 +- components/tikv_kv/Cargo.toml | 2 +- components/tikv_util/Cargo.toml | 6 +- components/tipb_helper/Cargo.toml | 2 +- components/tracker/Cargo.toml | 2 +- components/txn_types/Cargo.toml | 2 +- tests/Cargo.toml | 8 +- 
tests/failpoints/cases/mod.rs | 1 + tests/failpoints/cases/test_pd_client.rs | 141 +- .../failpoints/cases/test_pd_client_legacy.rs | 230 +++ tests/integrations/pd/mod.rs | 1 + tests/integrations/pd/test_rpc_client.rs | 305 ++-- .../integrations/pd/test_rpc_client_legacy.rs | 691 ++++++++ 65 files changed, 2756 insertions(+), 404 deletions(-) create mode 100644 components/pd_client/src/client_v2.rs create mode 100644 tests/failpoints/cases/test_pd_client_legacy.rs create mode 100644 tests/integrations/pd/test_rpc_client_legacy.rs diff --git a/Cargo.lock b/Cargo.lock index 2b237c8c25f..8b178015fa1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2317,18 +2317,18 @@ dependencies = [ [[package]] name = "grpcio-compiler" -version = "0.9.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caa0700833147dcfbe4f0758bd92545cc0f4506ee7fa154e499745a8b24e86c" +checksum = "ed97a17310fd00ff4109357584a00244e2a785d05b7ee0ef4d1e8fb1d84266df" dependencies = [ "protobuf", ] [[package]] name = "grpcio-health" -version = "0.10.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "641a95bace445aed36b31ae8731513c4c4d1d3dcdbc05aaeeefefe4fd673ada1" +checksum = "a37eae605cd21f144b7c7fd0e64e57af9f73d132756fef5b706db110c3ec7ea0" dependencies = [ "futures-executor", "futures-util", diff --git a/Cargo.toml b/Cargo.toml index f75a4a6511f..104157fdf24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,8 +97,8 @@ futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } fxhash = "0.2.1" getset = "0.1" -grpcio = { version = "0.10.3", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } -grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +grpcio = { workspace = true } +grpcio-health = { workspace = true } hex = "0.4" http = "0" hyper = { version = "0.14", features = 
["full"] } @@ -107,7 +107,7 @@ into_other = { workspace = true } itertools = "0.10" keyed_priority_queue = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" libloading = "0.7" @@ -165,17 +165,17 @@ tikv_alloc = { workspace = true } tikv_kv = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } tokio = { version = "1.17", features = ["full"] } tokio-openssl = "0.6" -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } toml = "0.5" tracker = { workspace = true } txn_types = { workspace = true } url = "2" uuid = { version = "0.8.1", features = ["serde", "v4"] } walkdir = "2" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] api_version = { workspace = true, features = ["testexport"] } @@ -221,7 +221,7 @@ procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229 # kvproto at the same time. # After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. 
# [patch.'https://github.com/pingcap/kvproto'] -# kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch = "your_branch" } [workspace] # See https://github.com/rust-lang/rfcs/blob/master/text/2957-cargo-features2.md @@ -368,7 +368,12 @@ tipb_helper = { path = "components/tipb_helper" } tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types" } # External libs -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } +grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } +tipb = { git = "https://github.com/pingcap/tipb.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } [profile.dev.package.grpcio-sys] debug = false diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 3b2d1dd2f75..1e0699f64cf 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -62,7 +62,7 @@ gag = "1.0" grpcio = { workspace = true } hex = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index 421c01a1514..7362ca25ccc 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -11,7 +11,7 @@ testexport = [] bitflags = "1.0.1" codec = { workspace = true } engine_traits = { workspace = true } -kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index e5863f44c4d..b1a61580cb6 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -39,13 +39,11 @@ fail = "0.5" file_system = { workspace = true } futures = "0.3" futures-io = "0.3" - grpcio = { workspace = true } hex = "0.4" - # Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 indexmap = "=1.6.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.4" log_wrappers = { workspace = true } online_config = { workspace = true } @@ -72,7 +70,7 @@ tokio-util = { version = "0.7", features = ["compat"] } tonic = "0.8" txn_types = { workspace = true } uuid = "0.8" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] async-trait = "0.1" diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index 17439a0f615..27f7d68e8e3 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -52,7 +52,7 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } hex = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } @@ -74,7 +74,7 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-stream = "0.1" txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] rand = "0.8" diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index 
d05e9b66ddd..a5dd62cd5d2 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -16,7 +16,7 @@ enum_dispatch = "0.3.8" error_code = { workspace = true } fail = "0.5" futures = { version = "0.3" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } parking_lot = "0.12" diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 27ce81c57b4..94d80bf1d9f 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -42,7 +42,7 @@ futures-timer = "3.0" getset = "0.1" grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml index 45ae2b40b23..10f8b113b2b 100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -9,7 +9,7 @@ async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } futures-io = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" openssl = "0.10" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index e539c67f571..5d28e09e8f4 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -22,7 +22,7 @@ grpcio = { workspace = true } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" md5 = "0.7.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 3d8b01e893b..c08dc76fdff 100644 --- 
a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -14,7 +14,7 @@ chrono = "0.4" cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } oauth2 = { version = "4.0.0", default-features = false } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index f0446fa096d..5074a3c9da4 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -11,7 +11,7 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index 2d008cf49f1..e225cbe0519 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -6,7 +6,7 @@ version = "0.0.1" [dependencies] fail = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } parking_lot = "0.12" tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index b66ef2aa147..18b6cb7305c 100644 --- a/components/encryption/Cargo.toml +++ 
b/components/encryption/Cargo.toml @@ -21,7 +21,7 @@ file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["std", "io"] } hex = "0.4.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" online_config = { workspace = true } openssl = "0.10" diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index f76c2b8f03c..fc4fe59d3fb 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -18,7 +18,7 @@ derive_more = "0.99.3" encryption = { workspace = true } error_code = { workspace = true } file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } openssl = "0.10" protobuf = { version = "2.8", features = ["bytes"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index c5703994c73..55e42f2595f 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -7,7 +7,7 @@ publish = false [dependencies] engine_traits = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 44dd708271d..a0e3e878c54 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -32,7 +32,7 @@ engine_traits = { workspace = true } fail = "0.5" file_system = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = 
"1.4.0" log_wrappers = { workspace = true } num_cpus = "1" diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 16e79a3b007..ec66aa474a9 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -24,5 +24,5 @@ tikv_util = { workspace = true } [dev-dependencies] engine_test = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } tempfile = "3.0" diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml index 8ecce112579..07c2a7ec42c 100644 --- a/components/engine_tirocks/Cargo.toml +++ b/components/engine_tirocks/Cargo.toml @@ -24,6 +24,6 @@ tracker = { workspace = true } txn_types = { workspace = true } [dev-dependencies] -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } rand = "0.8" tempfile = "3.0" diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index c2e9d729868..d38962e71c9 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -12,7 +12,7 @@ case_macros = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/error_code/Cargo.toml b/components/error_code/Cargo.toml index 484f8d24ad3..b98fc8dfcb5 100644 --- a/components/error_code/Cargo.toml +++ b/components/error_code/Cargo.toml @@ -14,7 +14,7 @@ path = "bin.rs" [dependencies] grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } 
serde = { version = "1.0", features = ["derive"] } diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 8c92b79583e..839e34e3f22 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -29,7 +29,7 @@ futures-executor = "0.3" futures-io = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true, optional = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libloading = { optional = true, version = "0.7.0" } openssl = "0.10" diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index 076bdd9d0dd..61e9bfa58df 100644 --- a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -65,7 +65,7 @@ futures-io = { version = "0.3" } futures-util = { version = "0.3", default-features = false, features = ["io"] } gcp = { optional = true, workspace = true } grpcio = { workspace = true, optional = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = { optional = true, version = "1.3" } libloading = { optional = true, version = "0.7.0" } once_cell = { optional = true, version = "1.3.1" } diff --git a/components/into_other/Cargo.toml b/components/into_other/Cargo.toml index 39989a4bf75..d31f04f4e12 100644 --- a/components/into_other/Cargo.toml +++ b/components/into_other/Cargo.toml @@ -6,5 +6,5 @@ publish = false [dependencies] engine_traits = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index f8318237b20..5f2bf5935ee 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ 
-6,7 +6,7 @@ publish = false [dependencies] byteorder = "1.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index c2ee9982bcd..c25e37f23b5 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -6,6 +6,7 @@ publish = false [features] failpoints = ["fail/failpoints"] +testexport = [] [dependencies] collections = { workspace = true } @@ -13,7 +14,7 @@ error_code = { workspace = true } fail = "0.5" futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -28,6 +29,6 @@ thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index e25e4a595bb..9f466a6a351 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -42,8 +42,8 @@ use super::{ UnixSecs, REQUEST_TIMEOUT, }; -const CQ_COUNT: usize = 1; -const CLIENT_PREFIX: &str = "pd"; +pub const CQ_COUNT: usize = 1; +pub const CLIENT_PREFIX: &str = "pd"; pub struct RpcClient { cluster_id: u64, @@ -86,7 +86,7 @@ impl RpcClient { ); let pd_connector = PdConnector::new(env.clone(), security_mgr.clone()); for i in 0..retries { - match pd_connector.validate_endpoints(cfg).await { + match 
pd_connector.validate_endpoints(cfg, true).await { Ok((client, target, members, tso)) => { let cluster_id = members.get_header().get_cluster_id(); let rpc_client = RpcClient { @@ -97,7 +97,7 @@ impl RpcClient { client, members, target, - tso, + tso.unwrap(), cfg.enable_forwarding, )), monitor: monitor.clone(), @@ -554,13 +554,16 @@ impl PdClient for RpcClient { .client_stub .get_region_by_id_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) + panic!( + "fail to request PD {} err {:?}", + "get_region_leader_by_id", e + ) }) }; Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) + .with_label_values(&["get_region_leader_by_id"]) .observe(duration_to_sec(timer.saturating_elapsed())); check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { @@ -1088,27 +1091,3 @@ impl PdClient for RpcClient { .execute() } } - -pub struct DummyPdClient { - pub next_ts: TimeStamp, -} - -impl DummyPdClient { - pub fn new() -> DummyPdClient { - DummyPdClient { - next_ts: TimeStamp::zero(), - } - } -} - -impl Default for DummyPdClient { - fn default() -> Self { - Self::new() - } -} - -impl PdClient for DummyPdClient { - fn batch_get_tso(&self, _count: u32) -> PdFuture { - Box::pin(future::ok(self.next_ts)) - } -} diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs new file mode 100644 index 00000000000..55f0c31b3c5 --- /dev/null +++ b/components/pd_client/src/client_v2.rs @@ -0,0 +1,1408 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! PD Client V2 +//! +//! In V1, the connection to PD and related states are all shared under a +//! `RwLock`. The maintenance of these states are implemented in a +//! decentralized way: each request will try to rebuild the connection on its +//! own if it encounters a network error. +//! +//! 
In V2, the responsibility to maintain the connection is moved into one +//! single long-running coroutine, namely [`reconnect_loop`]. Users of the +//! connection subscribe changes instead of altering it themselves. + +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant as StdInstant}, + u64, +}; + +use fail::fail_point; +use futures::{ + compat::{Compat, Future01CompatExt}, + executor::block_on, + future::FutureExt, + select, + sink::SinkExt, + stream::{Stream, StreamExt}, + task::{Context, Poll}, +}; +use grpcio::{ + CallOption, Channel, ClientDuplexReceiver, ConnectivityState, EnvBuilder, Environment, + Error as GrpcError, Result as GrpcResult, WriteFlags, +}; +use kvproto::{ + metapb, + pdpb::{ + self, GetMembersResponse, PdClient as PdClientStub, RegionHeartbeatRequest, + RegionHeartbeatResponse, ReportBucketsRequest, TsoRequest, TsoResponse, + }, + replication_modepb::{ReplicationStatus, StoreDrAutoSyncStatus}, +}; +use security::SecurityManager; +use tikv_util::{ + box_err, + config::ReadableDuration, + error, info, + mpsc::future as mpsc, + slow_log, thd_name, + time::{duration_to_sec, Instant}, + timer::GLOBAL_TIMER_HANDLE, + warn, +}; +use tokio::sync::{broadcast, mpsc as tokio_mpsc}; +use txn_types::TimeStamp; + +use super::{ + client::{CLIENT_PREFIX, CQ_COUNT}, + metrics::*, + util::{check_resp_header, PdConnector, TargetInfo}, + Config, Error, FeatureGate, RegionInfo, Result, UnixSecs, + REQUEST_TIMEOUT as REQUEST_TIMEOUT_SEC, +}; +use crate::PdFuture; + +fn request_timeout() -> Duration { + fail_point!("pd_client_v2_request_timeout", |s| { + use std::str::FromStr; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + Duration::from_secs(REQUEST_TIMEOUT_SEC) +} + +/// Immutable context for making new connections. 
+struct ConnectContext { + cfg: Config, + connector: PdConnector, +} + +#[derive(Clone)] +struct RawClient { + stub: PdClientStub, + target_info: TargetInfo, + members: GetMembersResponse, +} + +impl RawClient { + async fn connect(ctx: &ConnectContext) -> Result { + // -1 means the max. + let retries = match ctx.cfg.retry_max_count { + -1 => std::isize::MAX, + v => v.saturating_add(1), + }; + for i in 0..retries { + match ctx.connector.validate_endpoints(&ctx.cfg, false).await { + Ok((stub, target_info, members, _)) => { + return Ok(RawClient { + stub, + target_info, + members, + }); + } + Err(e) => { + if i as usize % ctx.cfg.retry_log_every == 0 { + warn!("validate PD endpoints failed"; "err" => ?e); + } + let _ = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + ctx.cfg.retry_interval.0) + .compat() + .await; + } + } + } + Err(box_err!("PD endpoints are invalid")) + } + + /// Returns Ok(true) when a new connection is established. + async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result { + PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + let start = Instant::now(); + + let members = self.members.clone(); + let direct_connected = self.target_info.direct_connected(); + slow_log!(start.saturating_elapsed(), "try reconnect pd"); + let (stub, target_info, members, _) = match ctx + .connector + .reconnect_pd( + members, + direct_connected, + force, + ctx.cfg.enable_forwarding, + false, + ) + .await + { + Err(e) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["failure"]) + .inc(); + return Err(e); + } + Ok(None) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["no-need"]) + .inc(); + return Ok(false); + } + Ok(Some(tuple)) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["success"]) + .inc(); + tuple + } + }; + + fail_point!("pd_client_v2_reconnect", |_| Ok(true)); + + self.stub = stub; + self.target_info = target_info; + self.members = members; + + info!("trying to update PD client done"; "spend" => 
?start.saturating_elapsed()); + Ok(true) + } +} + +struct CachedRawClientCore { + context: ConnectContext, + + latest: Mutex, + version: AtomicU64, + on_reconnect_tx: broadcast::Sender<()>, +} + +/// A shared [`RawClient`] with a local copy of cache. +pub struct CachedRawClient { + core: Arc, + should_reconnect_tx: broadcast::Sender, + on_reconnect_rx: broadcast::Receiver<()>, + + cache: RawClient, + cache_version: u64, +} + +impl Clone for CachedRawClient { + fn clone(&self) -> Self { + Self { + core: self.core.clone(), + should_reconnect_tx: self.should_reconnect_tx.clone(), + on_reconnect_rx: self.core.on_reconnect_tx.subscribe(), + cache: self.cache.clone(), + cache_version: self.cache_version, + } + } +} + +impl CachedRawClient { + fn new( + cfg: Config, + env: Arc, + security_mgr: Arc, + should_reconnect_tx: broadcast::Sender, + ) -> Self { + let lame_stub = PdClientStub::new(Channel::lame(env.clone(), "0.0.0.0:0")); + let client = RawClient { + stub: lame_stub, + target_info: TargetInfo::new("0.0.0.0:0".to_string(), ""), + members: GetMembersResponse::new(), + }; + let context = ConnectContext { + cfg, + connector: PdConnector::new(env, security_mgr), + }; + let (tx, rx) = broadcast::channel(1); + let core = CachedRawClientCore { + context, + latest: Mutex::new(client.clone()), + version: AtomicU64::new(0), + on_reconnect_tx: tx, + }; + Self { + core: Arc::new(core), + should_reconnect_tx, + on_reconnect_rx: rx, + cache: client, + cache_version: 0, + } + } + + #[inline] + fn refresh_cache(&mut self) -> bool { + if self.cache_version < self.core.version.load(Ordering::Acquire) { + let latest = self.core.latest.lock().unwrap(); + self.cache = (*latest).clone(); + self.cache_version = self.core.version.load(Ordering::Relaxed); + true + } else { + false + } + } + + #[inline] + fn publish_cache(&mut self) { + let latest_version = { + let mut latest = self.core.latest.lock().unwrap(); + *latest = self.cache.clone(); + let _ = self.core.on_reconnect_tx.send(()); + 
self.core.version.fetch_add(1, Ordering::Relaxed) + 1 + }; + debug_assert!(self.cache_version < latest_version); + self.cache_version = latest_version; + } + + #[inline] + async fn wait_for_a_new_client( + rx: &mut broadcast::Receiver<()>, + current_version: u64, + latest_version: &AtomicU64, + ) -> bool { + let deadline = StdInstant::now() + request_timeout(); + loop { + if GLOBAL_TIMER_HANDLE + .timeout(Compat::new(Box::pin(rx.recv())), deadline) + .compat() + .await + .is_ok() + { + if current_version < latest_version.load(Ordering::Acquire) { + return true; + } + } else { + return false; + } + } + } + + /// Refreshes the local cache with latest client, then waits for the + /// connection to be ready. + /// The connection must be available if this function returns `Ok(())`. + async fn wait_for_ready(&mut self) -> Result<()> { + self.refresh_cache(); + if self.channel().check_connectivity_state(false) == ConnectivityState::GRPC_CHANNEL_READY { + return Ok(()); + } + select! { + r = self + .cache + .stub + .client + .channel() + .wait_for_connected(request_timeout()) + .fuse() => + { + if r { + return Ok(()); + } + } + r = Self::wait_for_a_new_client( + &mut self.on_reconnect_rx, + self.cache_version, + &self.core.version, + ).fuse() => { + if r { + assert!(self.refresh_cache()); + return Ok(()); + } + } + } + let _ = self.should_reconnect_tx.send(self.cache_version); + Err(box_err!( + "Connection unavailable {:?}", + self.channel().check_connectivity_state(false) + )) + } + + /// Makes the first connection. + async fn connect(&mut self) -> Result<()> { + self.cache = RawClient::connect(&self.core.context).await?; + self.publish_cache(); + Ok(()) + } + + /// Increases global version only when a new connection is established. + /// Might panic if `wait_for_ready` isn't called up-front. 
+ async fn reconnect(&mut self) -> Result { + let force = (|| { + fail_point!("pd_client_force_reconnect", |_| true); + self.channel().check_connectivity_state(true) + == ConnectivityState::GRPC_CHANNEL_SHUTDOWN + })(); + if self + .cache + .maybe_reconnect(&self.core.context, force) + .await? + { + self.publish_cache(); + return Ok(true); + } + Ok(false) + } + + #[inline] + fn check_resp(&mut self, resp: GrpcResult) -> GrpcResult { + if matches!( + resp, + Err(GrpcError::RpcFailure(_) | GrpcError::RemoteStopped | GrpcError::RpcFinished(_)) + ) { + let _ = self.should_reconnect_tx.send(self.cache_version); + } + resp + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn stub(&self) -> &PdClientStub { + &self.cache.stub + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn channel(&self) -> &Channel { + self.cache.stub.client.channel() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn call_option(&self) -> CallOption { + self.cache.target_info.call_option() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn cluster_id(&self) -> u64 { + self.cache.members.get_header().get_cluster_id() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn header(&self) -> pdpb::RequestHeader { + let mut header = pdpb::RequestHeader::default(); + header.set_cluster_id(self.cluster_id()); + header + } + + /// Might panic if `wait_for_ready` isn't called up-front. 
+ #[cfg(feature = "testexport")] + #[inline] + fn leader(&self) -> pdpb::Member { + self.cache.members.get_leader().clone() + } + + #[inline] + fn initialized(&self) -> bool { + self.cache_version != 0 + } +} + +async fn reconnect_loop( + mut client: CachedRawClient, + cfg: Config, + mut should_reconnect: broadcast::Receiver, +) { + if let Err(e) = client.connect().await { + error!("failed to connect pd"; "err" => ?e); + return; + } + let backoff = (|| { + fail_point!("pd_client_v2_backoff", |s| { + use std::str::FromStr; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + request_timeout() + })(); + let mut last_connect = StdInstant::now(); + loop { + if client.channel().wait_for_connected(request_timeout()).await { + let state = ConnectivityState::GRPC_CHANNEL_READY; + select! { + // Checks for leader change periodically. + _ = client + .channel() + .wait_for_state_change(state, cfg.update_interval.0) + .fuse() => {} + v = should_reconnect.recv().fuse() => { + match v { + Ok(v) if v < client.cache_version => continue, + Ok(_) => {} + Err(broadcast::error::RecvError::Lagged(_)) => continue, + Err(broadcast::error::RecvError::Closed) => break, + } + } + } + } + let target = last_connect + backoff; + if target > StdInstant::now() { + let _ = GLOBAL_TIMER_HANDLE.delay(target).compat().await; + } + last_connect = StdInstant::now(); + if let Err(e) = client.reconnect().await { + warn!("failed to reconnect pd"; "err" => ?e); + } + } +} + +#[derive(Clone)] +pub struct RpcClient { + pub raw_client: CachedRawClient, + feature_gate: FeatureGate, +} + +impl RpcClient { + pub fn new( + cfg: &Config, + shared_env: Option>, + security_mgr: Arc, + ) -> Result { + let env = shared_env.unwrap_or_else(|| { + Arc::new( + EnvBuilder::new() + .cq_count(CQ_COUNT) + .name_prefix(thd_name!(CLIENT_PREFIX)) + .build(), + ) + }); + + // Use broadcast channel for the lagging feature. 
+ let (tx, rx) = broadcast::channel(1); + let raw_client = CachedRawClient::new(cfg.clone(), env, security_mgr, tx); + raw_client + .stub() + .spawn(reconnect_loop(raw_client.clone(), cfg.clone(), rx)); + + Ok(Self { + raw_client, + feature_gate: Default::default(), + }) + } + + #[inline] + pub fn subscribe_reconnect(&self) -> broadcast::Receiver<()> { + self.raw_client.clone().on_reconnect_rx + } + + #[cfg(feature = "testexport")] + pub fn feature_gate(&self) -> &FeatureGate { + &self.feature_gate + } + + #[cfg(feature = "testexport")] + pub fn get_leader(&mut self) -> pdpb::Member { + block_on(self.raw_client.wait_for_ready()).unwrap(); + self.raw_client.leader() + } + + #[cfg(feature = "testexport")] + pub fn reconnect(&mut self) -> Result { + block_on(self.raw_client.wait_for_ready())?; + block_on(self.raw_client.reconnect()) + } + + #[cfg(feature = "testexport")] + pub fn reset_to_lame_client(&mut self) { + let env = self.raw_client.core.context.connector.env.clone(); + let lame = PdClientStub::new(Channel::lame(env, "0.0.0.0:0")); + self.raw_client.core.latest.lock().unwrap().stub = lame.clone(); + self.raw_client.cache.stub = lame; + } + + #[cfg(feature = "testexport")] + pub fn initialized(&self) -> bool { + self.raw_client.initialized() + } +} + +pub trait PdClient { + type ResponseChannel: Stream>; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )>; + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result>; + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)>; + + fn fetch_cluster_id(&mut self) -> Result; + + fn load_global_config(&mut self, list: Vec) -> PdFuture>; + + fn watch_global_config( + &mut self, + ) -> Result>; + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result>; + + fn 
is_cluster_bootstrapped(&mut self) -> Result; + + fn alloc_id(&mut self) -> Result; + + fn is_recovering_marked(&mut self) -> Result; + + fn put_store(&mut self, store: metapb::Store) -> Result>; + + fn get_store_and_stats(&mut self, store_id: u64) + -> PdFuture<(metapb::Store, pdpb::StoreStats)>; + + fn get_store(&mut self, store_id: u64) -> Result { + block_on(self.get_store_and_stats(store_id)).map(|r| r.0) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result>; + + fn get_cluster_config(&mut self) -> Result; + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)>; + + fn get_region(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| r.0) + } + + fn get_region_info(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| RegionInfo::new(r.0, r.1)) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture>; + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture>; + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture; + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture; + + fn store_heartbeat( + &mut self, + stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture; + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()>; + + fn scatter_region(&mut self, region: RegionInfo) -> Result<()>; + + fn get_gc_safe_point(&mut self) -> PdFuture; + + fn get_operator(&mut self, region_id: u64) -> Result; + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()>; + + fn report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()>; +} + +pub struct CachedDuplexResponse { + latest: tokio_mpsc::Receiver>, + cache: Option>, +} + +impl CachedDuplexResponse { + fn new() -> (tokio_mpsc::Sender>, Self) { + let (tx, rx) = 
tokio_mpsc::channel(1); + ( + tx, + Self { + latest: rx, + cache: None, + }, + ) + } +} + +impl Stream for CachedDuplexResponse { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + if let Some(ref mut receiver) = self.cache { + match Pin::new(receiver).poll_next(cx) { + Poll::Ready(Some(Ok(item))) => return Poll::Ready(Some(Ok(item))), + Poll::Pending => return Poll::Pending, + // If it's None or there's error, we need to update receiver. + _ => {} + } + } + + match Pin::new(&mut self.latest).poll_recv(cx) { + Poll::Ready(Some(receiver)) => self.cache = Some(receiver), + Poll::Ready(None) => return Poll::Ready(None), + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl PdClient for RpcClient { + type ResponseChannel = CachedDuplexResponse; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )> { + // TODO: use bounded channel. + let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| { + fail::fail_point!("region_heartbeat_send_failed", |_| { + Err(grpcio::Error::RemoteStopped) + }); + Ok((r, WriteFlags::default())) + }); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for RegionHeartbeat stream"; "err" => ?e); + continue; + } + let (mut hb_tx, hb_rx) = raw_client + .stub() + .region_heartbeat_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "region_heartbeat", e) + }); + if resp_tx.send(hb_rx).await.is_err() { + break; + } + let res = hb_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. 
+ break; + } else { + let res = raw_client.check_resp(res); + warn!("region heartbeat stream exited"; "res" => ?res); + } + let _ = hb_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for ReportRegionBuckets stream"; "err" => ?e); + continue; + } + let (mut bk_tx, bk_rx) = raw_client + .stub() + .report_buckets_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_region_buckets", e) + }); + select! { + send_res = bk_tx.send_all(&mut requests).fuse() => { + if send_res.is_ok() { + // requests are drained. + break; + } else { + let res = raw_client.check_resp(send_res); + warn!("region buckets stream exited: {:?}", res); + } + } + recv_res = bk_rx.fuse() => { + let res = raw_client.check_resp(recv_res); + warn!("region buckets stream exited: {:?}", res); + } + } + let _ = bk_tx.close().await; + } + }); + Ok(tx) + } + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for Tso stream"; "err" => ?e); + continue; + } + let (mut tso_tx, tso_rx) = raw_client + .stub() + .tso_opt(raw_client.call_option()) + .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "tso", 
e)); + if resp_tx.send(tso_rx).await.is_err() { + break; + } + let res = tso_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. + break; + } else { + let res = raw_client.check_resp(res); + warn!("tso exited"; "res" => ?res); + } + let _ = tso_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn load_global_config(&mut self, list: Vec) -> PdFuture> { + use kvproto::pdpb::LoadGlobalConfigRequest; + let mut req = LoadGlobalConfigRequest::new(); + req.set_names(list.into()); + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + let fut = raw_client.stub().load_global_config_async(&req)?; + match fut.await { + Ok(grpc_response) => { + let mut res = HashMap::with_capacity(grpc_response.get_items().len()); + for c in grpc_response.get_items() { + if c.has_error() { + error!("failed to load global config with key {:?}", c.get_error()); + } else { + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); + } + } + Ok(res) + } + Err(err) => Err(box_err!("{:?}", err)), + } + }) + } + + fn watch_global_config( + &mut self, + ) -> Result> { + let req = pdpb::WatchGlobalConfigRequest::default(); + block_on(self.raw_client.wait_for_ready())?; + Ok(self.raw_client.stub().watch_global_config(&req)?) 
+ } + + fn fetch_cluster_id(&mut self) -> Result { + if !self.raw_client.initialized() { + block_on(self.raw_client.wait_for_ready())?; + } + let id = self.raw_client.cluster_id(); + assert!(id > 0); + Ok(id) + } + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["bootstrap_cluster"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::BootstrapRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(stores); + req.set_region(region); + + let resp = self.raw_client.stub().bootstrap_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.replication_status.take()) + } + + fn is_cluster_bootstrapped(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["is_cluster_bootstrapped"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsBootstrappedRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_bootstrapped_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_bootstrapped()) + } + + fn alloc_id(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["alloc_id"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::AllocIdRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().alloc_id_opt( + &req, + self.raw_client + .call_option() + .timeout(Duration::from_secs(10)), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + let id = 
resp.get_id(); + if id == 0 { + return Err(box_err!("pd alloc weird id 0")); + } + Ok(id) + } + + fn is_recovering_marked(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["is_recovering_marked"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsSnapshotRecoveringRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_snapshot_recovering_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_marked()) + } + + fn put_store(&mut self, store: metapb::Store) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["put_store"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::PutStoreRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(store); + + let resp = self.raw_client.stub().put_store_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.replication_status.take()) + } + + fn get_store_and_stats( + &mut self, + store_id: u64, + ) -> PdFuture<(metapb::Store, pdpb::StoreStats)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetStoreRequest::default(); + req.set_store_id(store_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_store_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_store_and_stats", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_store_and_stats"]) + 
.observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let store = resp.take_store(); + if store.get_state() != metapb::StoreState::Tombstone { + Ok((store, resp.take_stats())) + } else { + Err(Error::StoreTombstone(format!("{:?}", store))) + } + }) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_all_stores"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetAllStoresRequest::default(); + req.set_header(self.raw_client.header()); + req.set_exclude_tombstone_stores(exclude_tombstone); + + let resp = self.raw_client.stub().get_all_stores_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_stores().into()) + } + + fn get_cluster_config(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_cluster_config"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetClusterConfigRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().get_cluster_config_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_cluster()) + } + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionRequest::default(); + req.set_region_key(key.to_vec()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.get_region_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let region = if resp.has_region() { + resp.take_region() + } else { + return Err(Error::RegionNotFound(req.region_key)); + }; + let leader = if resp.has_leader() { + Some(resp.take_leader()) + } else { + None + }; + Ok((region, leader)) + }) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region_by_id"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + if resp.has_region() { + Ok(Some(resp.take_region())) + } else { + Ok(None) + } + }) + } + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt( + &req, + 
raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "get_region_leader_by_id", e + ); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region_leader_by_id"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + if resp.has_region() && resp.has_leader() { + Ok(Some((resp.take_region(), resp.take_leader()))) + } else { + Ok(None) + } + }) + } + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskSplitRequest::default(); + req.set_region(region); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_split_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["ask_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + }) + } + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskBatchSplitRequest::default(); + req.set_region(region); + req.set_split_count(count as u32); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + 
.with_label_values(&["ask_batch_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + }) + } + + fn store_heartbeat( + &mut self, + mut stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::StoreHeartbeatRequest::default(); + stats + .mut_interval() + .set_end_timestamp(UnixSecs::now().into_inner()); + req.set_stats(stats); + if let Some(report) = store_report { + req.set_store_report(report); + } + if let Some(status) = dr_autosync_status { + req.set_dr_autosync_status(status); + } + + let mut raw_client = self.raw_client.clone(); + let feature_gate = self.feature_gate.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .store_heartbeat_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "store_heartbeat", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["store_heartbeat"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + match feature_gate.set_version(resp.get_cluster_version()) { + Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), + Ok(true) => info!("set cluster version to {}", resp.get_cluster_version()), + _ => {} + }; + Ok(resp) + }) + } + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportBatchSplitRequest::default(); + req.set_regions(regions.into()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.report_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["report_batch_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["scatter_region"]) + .start_coarse_timer(); + + let mut req = pdpb::ScatterRegionRequest::default(); + req.set_region_id(region.get_id()); + if let Some(leader) = region.leader.take() { + req.set_leader(leader); + } + req.set_region(region.region); + + block_on(self.raw_client.wait_for_ready())?; + req.set_header(self.raw_client.header()); + let resp = self.raw_client.stub().scatter_region_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header()) + } + + fn get_gc_safe_point(&mut self) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetGcSafePointRequest::default(); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_gc_saft_point"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.get_safe_point()) + }) + } + + fn get_operator(&mut self, region_id: u64) -> Result { + let _timer = 
PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_operator"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetOperatorRequest::default(); + req.set_header(self.raw_client.header()); + req.set_region_id(region_id); + + let resp = self.raw_client.stub().get_operator_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp) + } + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()> { + let timer = Instant::now_coarse(); + let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); + req.set_service_id(name.into()); + req.set_ttl(ttl.as_secs() as _); + req.set_safe_point(safe_point.into_inner()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .update_service_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "update_service_safe_point", e + ); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["update_service_safe_point"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportMinResolvedTsRequest::default(); + req.set_store_id(store_id); + req.set_min_resolved_ts(min_resolved_ts); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.report_min_resolved_ts_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "min_resolved_ts", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["min_resolved_ts"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } +} diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index e4350e3d396..8674130c799 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -1,8 +1,12 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(let_chains)] + #[allow(unused_extern_crates)] extern crate tikv_alloc; mod client; +mod client_v2; mod feature_gate; pub mod metrics; mod tso; @@ -23,7 +27,8 @@ use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; pub use self::{ - client::{DummyPdClient, RpcClient}, + client::RpcClient, + client_v2::{PdClient as PdClientV2, RpcClient as RpcClientV2}, config::Config, errors::{Error, Result}, feature_gate::{Feature, FeatureGate}, diff --git a/components/pd_client/src/tso.rs b/components/pd_client/src/tso.rs index a19d7af8f06..feec5061a8c 100644 --- a/components/pd_client/src/tso.rs +++ b/components/pd_client/src/tso.rs @@ -180,40 +180,41 @@ impl<'a> Stream for TsoRequestStream<'a> { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let pending_requests = self.pending_requests.clone(); let mut pending_requests = pending_requests.borrow_mut(); - let mut requests = Vec::new(); - while requests.len() < MAX_BATCH_SIZE && pending_requests.len() < MAX_PENDING_COUNT { - match self.request_rx.poll_recv(cx) { - Poll::Ready(Some(sender)) => { - requests.push(sender); + if pending_requests.len() < MAX_PENDING_COUNT { + let mut requests = Vec::new(); + while requests.len() < MAX_BATCH_SIZE { + match 
self.request_rx.poll_recv(cx) { + Poll::Ready(Some(sender)) => { + requests.push(sender); + } + Poll::Ready(None) if requests.is_empty() => { + return Poll::Ready(None); + } + _ => break, } - Poll::Ready(None) if requests.is_empty() => { - return Poll::Ready(None); - } - _ => break, + } + if !requests.is_empty() { + let mut req = TsoRequest::default(); + req.mut_header().cluster_id = self.cluster_id; + req.count = requests.iter().map(|r| r.count).sum(); + + let request_group = RequestGroup { + tso_request: req.clone(), + requests, + }; + pending_requests.push_back(request_group); + PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); + + let write_flags = WriteFlags::default().buffer_hint(false); + return Poll::Ready(Some((req, write_flags))); } } - if !requests.is_empty() { - let mut req = TsoRequest::default(); - req.mut_header().cluster_id = self.cluster_id; - req.count = requests.iter().map(|r| r.count).sum(); - - let request_group = RequestGroup { - tso_request: req.clone(), - requests, - }; - pending_requests.push_back(request_group); - PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); - - let write_flags = WriteFlags::default().buffer_hint(false); - Poll::Ready(Some((req, write_flags))) - } else { - // Set the waker to the context, then the stream can be waked up after the - // pending queue is no longer full. - self.self_waker.register(cx.waker()); - Poll::Pending - } + // Set the waker to the context, then the stream can be waked up after the + // pending queue is no longer full. 
+ self.self_waker.register(cx.waker()); + Poll::Pending } } diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index da77783c167..72c8cc16b04 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -51,13 +51,14 @@ const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); const GLOBAL_RECONNECT_INTERVAL: Duration = Duration::from_millis(100); // 0.1s pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s +#[derive(Clone)] pub struct TargetInfo { target_url: String, via: String, } impl TargetInfo { - fn new(target_url: String, via: &str) -> TargetInfo { + pub(crate) fn new(target_url: String, via: &str) -> TargetInfo { TargetInfo { target_url, via: trim_http_prefix(via).to_string(), @@ -340,7 +341,13 @@ impl Client { async move { let direct_connected = self.inner.rl().target_info().direct_connected(); connector - .reconnect_pd(members, direct_connected, force, self.enable_forwarding) + .reconnect_pd( + members, + direct_connected, + force, + self.enable_forwarding, + true, + ) .await } }; @@ -383,7 +390,7 @@ impl Client { fail_point!("pd_client_reconnect", |_| Ok(())); - self.update_client(client, target_info, members, tso); + self.update_client(client, target_info, members, tso.unwrap()); info!("trying to update PD client done"; "spend" => ?start.saturating_elapsed()); Ok(()) } @@ -521,11 +528,13 @@ pub type StubTuple = ( PdClientStub, TargetInfo, GetMembersResponse, - TimestampOracle, + // Only used by RpcClient, not by RpcClientV2. 
+ Option, ); +#[derive(Clone)] pub struct PdConnector { - env: Arc, + pub(crate) env: Arc, security_mgr: Arc, } @@ -534,7 +543,7 @@ impl PdConnector { PdConnector { env, security_mgr } } - pub async fn validate_endpoints(&self, cfg: &Config) -> Result { + pub async fn validate_endpoints(&self, cfg: &Config, build_tso: bool) -> Result { let len = cfg.endpoints.len(); let mut endpoints_set = HashSet::with_capacity_and_hasher(len, Default::default()); let mut members = None; @@ -575,7 +584,7 @@ impl PdConnector { match members { Some(members) => { let res = self - .reconnect_pd(members, true, true, cfg.enable_forwarding) + .reconnect_pd(members, true, true, cfg.enable_forwarding, build_tso) .await? .unwrap(); info!("all PD endpoints are consistent"; "endpoints" => ?cfg.endpoints); @@ -593,7 +602,9 @@ impl PdConnector { .max_send_message_len(-1) .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) - .keepalive_timeout(Duration::from_secs(3)); + .keepalive_timeout(Duration::from_secs(3)) + .max_reconnect_backoff(Duration::from_secs(5)) + .initial_reconnect_backoff(Duration::from_secs(1)); self.security_mgr.connect(cb, addr_trim) }; fail_point!("cluster_id_is_not_ready", |_| { @@ -602,7 +613,7 @@ impl PdConnector { GetMembersResponse::default(), )) }); - let client = PdClientStub::new(channel); + let client = PdClientStub::new(channel.clone()); let option = CallOption::default().timeout(Duration::from_secs(REQUEST_TIMEOUT)); let response = client .get_members_async_opt(&GetMembersRequest::default(), option) @@ -680,12 +691,13 @@ impl PdConnector { // not empty and it can connect the leader now which represents the network // partition problem to leader may be recovered 3. 
the member information of // PD has been changed - async fn reconnect_pd( + pub async fn reconnect_pd( &self, members_resp: GetMembersResponse, direct_connected: bool, force: bool, enable_forwarding: bool, + build_tso: bool, ) -> Result> { let resp = self.load_members(&members_resp).await?; let leader = resp.get_leader(); @@ -699,11 +711,15 @@ impl PdConnector { match res { Some((client, target_url)) => { let info = TargetInfo::new(target_url, ""); - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) + } else { + None + }; return Ok(Some((client, info, resp, tso))); } None => { @@ -714,11 +730,15 @@ impl PdConnector { } if enable_forwarding && has_network_error { if let Ok(Some((client, info))) = self.try_forward(members, leader).await { - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) 
+ } else { + None + }; return Ok(Some((client, info, resp, tso))); } } @@ -774,7 +794,9 @@ impl PdConnector { loop { let (res, has_network_err) = self.connect_member(leader).await?; match res { - Some((client, ep, _)) => return Ok((Some((client, ep)), has_network_err)), + Some((client, ep, _)) => { + return Ok((Some((client, ep)), has_network_err)); + } None => { if has_network_err && retry_times > 0 diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 2b9d2de73ff..0ee185fd365 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" encryption = { workspace = true } engine_traits = { workspace = true } file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.4.0" num_cpus = "1" online_config = { workspace = true } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 1679732ccda..46ed20f8d10 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -41,7 +41,7 @@ file_system = { workspace = true } fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 54eb07e8161..548693b71ac 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -56,7 +56,7 @@ grpcio-health = { version = "0.10", default-features = false, features = ["proto into_other = { workspace = true } itertools = "0.10" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log = { 
version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -90,7 +90,7 @@ tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } tracker = { workspace = true } txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] encryption_export = { workspace = true } diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index d4a7e3d1ca2..10a555678c3 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -31,7 +31,7 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index acb2dff89d3..20ed4ea2eda 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -8,7 +8,7 @@ collections = { workspace = true } crossbeam = "0.8" futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 7a40340b64e..b27846ad5a3 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -56,7 +56,7 @@ grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log = { version = 
"0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -81,7 +81,7 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } toml = "0.5" txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [target.'cfg(unix)'.dependencies] signal-hook = "0.3" diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 0bba773418b..d0e2ff7eca8 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -26,7 +26,7 @@ futures = { version = "0.3", features = ["thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } openssl = "0.10" diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index 902e57d5eed..1798b50c82b 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -23,7 +23,7 @@ futures = "0.3" futures-executor = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } protobuf = "2" rand = "0.8" tempfile = "3.0" diff --git a/components/test_coprocessor/Cargo.toml b/components/test_coprocessor/Cargo.toml index a3bb3f8e476..03047d75e87 100644 --- a/components/test_coprocessor/Cargo.toml +++ b/components/test_coprocessor/Cargo.toml @@ -25,7 +25,7 @@ collections = { workspace = true } concurrency_manager = { workspace = true } engine_rocks = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } protobuf = "2" 
resource_metering = { workspace = true } test_storage = { workspace = true } @@ -33,5 +33,5 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index d9163706895..a478e6ee325 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -9,7 +9,7 @@ collections = { workspace = true } fail = "0.5" futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } pd_client = { workspace = true } security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 79b095ef0d9..9e1a2b3bb0f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -242,18 +242,19 @@ impl Pd for PdMock { let header = Service::header(); let tso_logical = self.tso_logical.clone(); let fut = async move { - resp.send_all(&mut req.map_ok(move |r| { - let logical = - tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; - let mut res = TsoResponse::default(); - res.set_header(header.clone()); - res.mut_timestamp().physical = 42; - res.mut_timestamp().logical = logical; - res.count = r.count; - (res, WriteFlags::default()) - })) - .await - .unwrap(); + // Tolerate errors like RpcFinished(None). 
+ let _ = resp + .send_all(&mut req.map_ok(move |r| { + let logical = + tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; + let mut res = TsoResponse::default(); + res.set_header(header.clone()); + res.mut_timestamp().physical = 42; + res.mut_timestamp().logical = logical; + res.count = r.count; + (res, WriteFlags::default()) + })) + .await; let _ = resp.close().await; }; ctx.spawn(fut); diff --git a/components/test_pd/src/util.rs b/components/test_pd/src/util.rs index 1b05196c346..b1a22b93c47 100644 --- a/components/test_pd/src/util.rs +++ b/components/test_pd/src/util.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use pd_client::{Config, RpcClient}; +use pd_client::{Config, RpcClient, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use tikv_util::config::ReadableDuration; @@ -23,6 +23,13 @@ pub fn new_client(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClient::new(&cfg, None, mgr).unwrap() } +pub fn new_client_v2(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClientV2 { + let cfg = new_config(eps); + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} + pub fn new_client_with_update_interval( eps: Vec<(String, u16)>, mgr: Option>, @@ -34,3 +41,15 @@ pub fn new_client_with_update_interval( mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); RpcClient::new(&cfg, None, mgr).unwrap() } + +pub fn new_client_v2_with_update_interval( + eps: Vec<(String, u16)>, + mgr: Option>, + interval: ReadableDuration, +) -> RpcClientV2 { + let mut cfg = new_config(eps); + cfg.update_interval = interval; + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml index ad2b20de5a0..9f67752b4c5 100644 --- 
a/components/test_pd_client/Cargo.toml +++ b/components/test_pd_client/Cargo.toml @@ -10,7 +10,7 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } @@ -18,5 +18,5 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index fb627dccb11..71c214ae21d 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -40,7 +40,7 @@ futures = "0.3" grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } pd_client = { workspace = true } @@ -61,5 +61,5 @@ test_util = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_sst_importer/Cargo.toml b/components/test_sst_importer/Cargo.toml index b0c3e96ef5a..f951a6755e6 100644 --- 
a/components/test_sst_importer/Cargo.toml +++ b/components/test_sst_importer/Cargo.toml @@ -13,5 +13,5 @@ crc32fast = "1.2" engine_rocks = { workspace = true } engine_traits = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 04adc4e6de4..b1172b5d559 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -24,7 +24,7 @@ test-engines-panic = [ api_version = { workspace = true } collections = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } pd_client = { workspace = true } raftstore = { workspace = true } test_raftstore = { workspace = true } diff --git a/components/test_util/Cargo.toml b/components/test_util/Cargo.toml index 8aca28b092b..740132353f3 100644 --- a/components/test_util/Cargo.toml +++ b/components/test_util/Cargo.toml @@ -16,7 +16,7 @@ collections = { workspace = true } encryption_export = { workspace = true } fail = "0.5" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } rand = "0.8" rand_isaac = "0.3" security = { workspace = true } diff --git a/components/tidb_query_aggr/Cargo.toml b/components/tidb_query_aggr/Cargo.toml index db8d9d64faf..facc9d32f36 100644 --- a/components/tidb_query_aggr/Cargo.toml +++ b/components/tidb_query_aggr/Cargo.toml @@ -12,7 +12,7 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tidb_query_expr = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } [dev-dependencies] panic_hook = { workspace = true } diff --git a/components/tidb_query_common/Cargo.toml 
b/components/tidb_query_common/Cargo.toml index 05133b130e7..3dd1693ba0d 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -11,7 +11,7 @@ async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } @@ -20,7 +20,7 @@ serde_json = "1.0" thiserror = "1.0" tikv_util = { workspace = true } time = "0.1" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] byteorder = "1.2" diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index af7e7e08b9d..e9d96e16284 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -18,7 +18,7 @@ collections = { workspace = true } encoding_rs = { git = "https://github.com/xiongjiwei/encoding_rs.git", rev = "68e0bc5a72a37a78228d80cd98047326559cf43c" } error_code = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } match-template = "0.0.1" @@ -38,4 +38,4 @@ thiserror = "1.0" tidb_query_common = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index e448340eddf..123c306c125 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -12,7 +12,7 @@ collections = { workspace = true } fail = "0.5" futures = { version = "0.3", features = ["compat"] } itertools = "0.10" -kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } match-template = "0.0.1" protobuf = { version = "2.8", features = ["bytes"] } @@ -24,8 +24,8 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tidb_query_expr = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tipb = { workspace = true } +yatp = { workspace = true } [dev-dependencies] anyhow = "1.0" diff --git a/components/tidb_query_expr/Cargo.toml b/components/tidb_query_expr/Cargo.toml index 1ca4a46b6dd..95f37308e59 100644 --- a/components/tidb_query_expr/Cargo.toml +++ b/components/tidb_query_expr/Cargo.toml @@ -30,7 +30,7 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } twoway = "0.2.0" uuid = { version = "0.8.1", features = ["v4"] } diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 8aa64d0def6..2911c7738c6 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -36,7 +36,7 @@ fail = "0.5" file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool", "compat"] } into_other = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 36faa552804..12c3983ef2d 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -26,7 +26,7 @@ futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", 
default-features = false, features = ["io"] } grpcio = { workspace = true } http = "0.2.0" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } @@ -56,10 +56,10 @@ tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-executor = "0.1" -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } tracker = { workspace = true } url = "2" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } diff --git a/components/tipb_helper/Cargo.toml b/components/tipb_helper/Cargo.toml index 31d2c290fdc..bfbadabaea3 100644 --- a/components/tipb_helper/Cargo.toml +++ b/components/tipb_helper/Cargo.toml @@ -7,4 +7,4 @@ publish = false [dependencies] codec = { workspace = true } tidb_query_datatype = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } diff --git a/components/tracker/Cargo.toml b/components/tracker/Cargo.toml index b369fab9628..84a3f5da0ab 100644 --- a/components/tracker/Cargo.toml +++ b/components/tracker/Cargo.toml @@ -7,7 +7,7 @@ publish = false [dependencies] collections = { workspace = true } crossbeam-utils = "0.8" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1" parking_lot = "0.12" pin-project = "1" diff --git a/components/txn_types/Cargo.toml b/components/txn_types/Cargo.toml index 9ccfe0bb323..0c357ef1dd6 100644 --- a/components/txn_types/Cargo.toml +++ b/components/txn_types/Cargo.toml @@ -11,7 +11,7 @@ codec = { workspace = true } collections = { workspace = true } 
error_code = { workspace = true } farmhash = "1.1.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } slog = "2.3" thiserror = "1.0" diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 5f29d44a53d..ae6c6984487 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -41,11 +41,11 @@ path = "benches/deadlock_detector/mod.rs" [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] -failpoints = ["fail/failpoints", "tikv/failpoints"] +failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] -testexport = ["raftstore/testexport", "tikv/testexport"] +testexport = ["raftstore/testexport", "tikv/testexport", "pd_client/testexport"] profiling = ["profiler/profiling"] test-engine-kv-rocksdb = [ @@ -83,7 +83,7 @@ file_system = { workspace = true } futures = "0.3" grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log_wrappers = { workspace = true } more-asserts = "0.2" @@ -106,7 +106,7 @@ tidb_query_expr = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } toml = "0.5" txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index 24a05f2ab9f..9c90211c073 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -20,6 +20,7 @@ mod test_memory_usage_limit; mod test_merge; mod test_metrics_overflow; mod test_pd_client; +mod 
test_pd_client_legacy; mod test_pending_peers; mod test_rawkv; mod test_read_execution_tracker; diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index eb22ac29e45..635b199291b 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -1,4 +1,4 @@ -// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ sync::{mpsc, Arc}, @@ -6,34 +6,35 @@ use std::{ time::Duration, }; +use futures::executor::block_on; use grpcio::EnvBuilder; use kvproto::metapb::*; -use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; +use pd_client::{PdClientV2, RegionInfo, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; use tikv_util::config::ReadableDuration; fn new_test_server_and_client( update_interval: ReadableDuration, -) -> (MockServer, RpcClient) { +) -> (MockServer, RpcClientV2) { let server = MockServer::new(1); let eps = server.bind_addrs(); - let client = new_client_with_update_interval(eps, None, update_interval); + let client = new_client_v2_with_update_interval(eps, None, update_interval); (server, client) } macro_rules! request { ($client: ident => block_on($func: tt($($arg: expr),*))) => { (stringify!($func), { - let client = $client.clone(); + let mut client = $client.clone(); Box::new(move || { - let _ = futures::executor::block_on(client.$func($($arg),*)); + let _ = block_on(client.$func($($arg),*)); }) }) }; ($client: ident => $func: tt($($arg: expr),*)) => { (stringify!($func), { - let client = $client.clone(); + let mut client = $client.clone(); Box::new(move || { let _ = client.$func($($arg),*); }) @@ -44,13 +45,12 @@ macro_rules! 
request { #[test] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let client = Arc::new(client); let pd_client_reconnect_fp = "pd_client_reconnect"; // It contains all interfaces of PdClient. let test_funcs: Vec<(_, Box)> = vec![ request!(client => reconnect()), - request!(client => get_cluster_id()), + request!(client => fetch_cluster_id()), request!(client => bootstrap_cluster(Store::default(), Region::default())), request!(client => is_cluster_bootstrapped()), request!(client => alloc_id()), @@ -60,19 +60,15 @@ fn test_pd_client_deadlock() { request!(client => get_cluster_config()), request!(client => get_region(b"")), request!(client => get_region_info(b"")), - request!(client => block_on(get_region_async(b""))), - request!(client => block_on(get_region_info_async(b""))), request!(client => block_on(get_region_by_id(0))), - request!(client => block_on(region_heartbeat(0, Region::default(), Peer::default(), RegionStat::default(), None))), request!(client => block_on(ask_split(Region::default()))), request!(client => block_on(ask_batch_split(Region::default(), 1))), request!(client => block_on(store_heartbeat(Default::default(), None, None))), request!(client => block_on(report_batch_split(vec![]))), request!(client => scatter_region(RegionInfo::new(Region::default(), None))), request!(client => block_on(get_gc_safe_point())), - request!(client => block_on(get_store_stats_async(0))), + request!(client => block_on(get_store_and_stats(0))), request!(client => get_operator(0)), - request!(client => block_on(get_tso())), request!(client => load_global_config(vec![])), ]; @@ -87,10 +83,6 @@ fn test_pd_client_deadlock() { func(); tx.send(()).unwrap(); }); - // Only allow to reconnect once for a func. - client.handle_reconnect(move || { - fail::cfg(pd_client_reconnect_fp, "return").unwrap(); - }); // Remove the fail point to let the PD client thread go on. 
fail::remove(pd_client_reconnect_fp); @@ -107,7 +99,7 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { - let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); let res = futures::executor::block_on(async move { client .load_global_config( @@ -125,12 +117,11 @@ fn test_load_global_config() { #[test] fn test_watch_global_config_on_closed_server() { - let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let client = Arc::new(client); + let (mut server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); use futures::StreamExt; let j = std::thread::spawn(move || { - futures::executor::block_on(async move { - let mut r = client.watch_global_config().unwrap(); + let mut r = client.watch_global_config().unwrap(); + block_on(async move { let mut i: usize = 0; while let Some(r) = r.next().await { match r { @@ -181,11 +172,11 @@ fn test_slow_periodical_update() { // client1 updates leader frequently (100ms). cfg.update_interval = ReadableDuration(Duration::from_millis(100)); - let _client1 = RpcClient::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); + let _client1 = RpcClientV2::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); // client2 never updates leader in the test. cfg.update_interval = ReadableDuration(Duration::from_secs(100)); - let client2 = RpcClient::new(&cfg, Some(env), mgr).unwrap(); + let mut client2 = RpcClientV2::new(&cfg, Some(env), mgr).unwrap(); fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); // Wait for the PD client thread blocking on the fail point. @@ -208,23 +199,95 @@ fn test_slow_periodical_update() { handle.join().unwrap(); } -// Reconnection will be speed limited. 
+fn run_on_bad_connection(client: &mut RpcClientV2, mut f: F) +where + F: FnMut(&mut RpcClientV2), +{ + let pd_client_force_reconnect_fp = "pd_client_force_reconnect"; + if !client.initialized() { + client.is_cluster_bootstrapped().unwrap(); + } + client.reset_to_lame_client(); + fail::cfg(pd_client_force_reconnect_fp, "return").unwrap(); + f(client); + fail::remove(pd_client_force_reconnect_fp); +} + #[test] -fn test_reconnect_limit() { - let pd_client_reconnect_fp = "pd_client_reconnect"; - let (_server, client) = new_test_server_and_client(ReadableDuration::secs(100)); +fn test_backoff() { + let pd_client_v2_timeout_fp = "pd_client_v2_request_timeout"; + fail::cfg(pd_client_v2_timeout_fp, "return(5ms)").unwrap(); + // Backoff larger than timeout, so that the second request following the failed + // one can hit backoff. + let pd_client_v2_backoff_fp = "pd_client_v2_backoff"; + fail::cfg(pd_client_v2_backoff_fp, "return(100ms)").unwrap(); + let (_server, mut client) = new_test_server_and_client(ReadableDuration::secs(100)); - // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. - thread::sleep(Duration::from_millis(200)); + run_on_bad_connection(&mut client, |c| { + c.is_cluster_bootstrapped().unwrap_err(); + if c.is_cluster_bootstrapped().is_ok() { + // try again in case the first connect is too early. + run_on_bad_connection(c, |c2| { + c2.is_cluster_bootstrapped().unwrap_err(); + c2.is_cluster_bootstrapped().unwrap_err(); + std::thread::sleep(Duration::from_millis(100)); + c2.is_cluster_bootstrapped().unwrap(); + }); + return; + } + std::thread::sleep(Duration::from_millis(100)); + c.is_cluster_bootstrapped().unwrap(); + }); + + fail::remove(pd_client_v2_timeout_fp); + fail::remove(pd_client_v2_backoff_fp); +} + +#[test] +fn test_retry() { + let pd_client_v2_timeout_fp = "pd_client_v2_request_timeout"; + fail::cfg(pd_client_v2_timeout_fp, "return(10ms)").unwrap(); + // Disable backoff. 
+ let pd_client_v2_backoff_fp = "pd_client_v2_backoff"; + fail::cfg(pd_client_v2_backoff_fp, "return(0s)").unwrap(); + let (_server, mut client) = new_test_server_and_client(ReadableDuration::secs(100)); - // The first reconnection will succeed, and the last_update will not be updated. - fail::cfg(pd_client_reconnect_fp, "return").unwrap(); - client.reconnect().unwrap(); - // The subsequent reconnection will be cancelled. - for _ in 0..10 { - let ret = client.reconnect(); - assert!(format!("{:?}", ret.unwrap_err()).contains("cancel reconnection")); + fn test_retry_success(client: &mut RpcClientV2, mut f: F) + where + F: FnMut(&mut RpcClientV2) -> pd_client::Result, + R: std::fmt::Debug, + { + run_on_bad_connection(client, |c| { + f(c).unwrap_err(); + f(c).unwrap(); + }); } - fail::remove(pd_client_reconnect_fp); + test_retry_success(&mut client, |c| { + c.bootstrap_cluster(Store::default(), Region::default()) + }); + test_retry_success(&mut client, |c| c.is_cluster_bootstrapped()); + test_retry_success(&mut client, |c| c.alloc_id()); + test_retry_success(&mut client, |c| c.put_store(Store::default())); + test_retry_success(&mut client, |c| c.get_store(0)); + test_retry_success(&mut client, |c| c.get_all_stores(false)); + test_retry_success(&mut client, |c| c.get_cluster_config()); + test_retry_success(&mut client, |c| c.get_region_info(b"")); + test_retry_success(&mut client, |c| block_on(c.get_region_by_id(0))); + test_retry_success(&mut client, |c| { + block_on(c.ask_batch_split(Region::default(), 1)) + }); + test_retry_success(&mut client, |c| { + block_on(c.store_heartbeat(Default::default(), None, None)) + }); + test_retry_success(&mut client, |c| block_on(c.report_batch_split(vec![]))); + test_retry_success(&mut client, |c| { + c.scatter_region(RegionInfo::new(Region::default(), None)) + }); + test_retry_success(&mut client, |c| block_on(c.get_gc_safe_point())); + test_retry_success(&mut client, |c| c.get_operator(0)); + test_retry_success(&mut client, |c| 
block_on(c.load_global_config(vec![]))); + + fail::remove(pd_client_v2_timeout_fp); + fail::remove(pd_client_v2_backoff_fp); } diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs new file mode 100644 index 00000000000..eb22ac29e45 --- /dev/null +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -0,0 +1,230 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{mpsc, Arc}, + thread, + time::Duration, +}; + +use grpcio::EnvBuilder; +use kvproto::metapb::*; +use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; +use security::{SecurityConfig, SecurityManager}; +use test_pd::{mocker::*, util::*, Server as MockServer}; +use tikv_util::config::ReadableDuration; + +fn new_test_server_and_client( + update_interval: ReadableDuration, +) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) +} + +macro_rules! request { + ($client: ident => block_on($func: tt($($arg: expr),*))) => { + (stringify!($func), { + let client = $client.clone(); + Box::new(move || { + let _ = futures::executor::block_on(client.$func($($arg),*)); + }) + }) + }; + ($client: ident => $func: tt($($arg: expr),*)) => { + (stringify!($func), { + let client = $client.clone(); + Box::new(move || { + let _ = client.$func($($arg),*); + }) + }) + }; +} + +#[test] +fn test_pd_client_deadlock() { + let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let client = Arc::new(client); + let pd_client_reconnect_fp = "pd_client_reconnect"; + + // It contains all interfaces of PdClient. 
+ let test_funcs: Vec<(_, Box)> = vec![ + request!(client => reconnect()), + request!(client => get_cluster_id()), + request!(client => bootstrap_cluster(Store::default(), Region::default())), + request!(client => is_cluster_bootstrapped()), + request!(client => alloc_id()), + request!(client => put_store(Store::default())), + request!(client => get_store(0)), + request!(client => get_all_stores(false)), + request!(client => get_cluster_config()), + request!(client => get_region(b"")), + request!(client => get_region_info(b"")), + request!(client => block_on(get_region_async(b""))), + request!(client => block_on(get_region_info_async(b""))), + request!(client => block_on(get_region_by_id(0))), + request!(client => block_on(region_heartbeat(0, Region::default(), Peer::default(), RegionStat::default(), None))), + request!(client => block_on(ask_split(Region::default()))), + request!(client => block_on(ask_batch_split(Region::default(), 1))), + request!(client => block_on(store_heartbeat(Default::default(), None, None))), + request!(client => block_on(report_batch_split(vec![]))), + request!(client => scatter_region(RegionInfo::new(Region::default(), None))), + request!(client => block_on(get_gc_safe_point())), + request!(client => block_on(get_store_stats_async(0))), + request!(client => get_operator(0)), + request!(client => block_on(get_tso())), + request!(client => load_global_config(vec![])), + ]; + + for (name, func) in test_funcs { + fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); + // Wait for the PD client thread blocking on the fail point. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let (tx, rx) = mpsc::channel(); + let handle = thread::spawn(move || { + func(); + tx.send(()).unwrap(); + }); + // Only allow to reconnect once for a func. 
+ client.handle_reconnect(move || { + fail::cfg(pd_client_reconnect_fp, "return").unwrap(); + }); + // Remove the fail point to let the PD client thread go on. + fail::remove(pd_client_reconnect_fp); + + let timeout = Duration::from_millis(500); + if rx.recv_timeout(timeout).is_err() { + panic!("PdClient::{}() hangs", name); + } + handle.join().unwrap(); + } + + drop(client); + fail::remove(pd_client_reconnect_fp); +} + +#[test] +fn test_load_global_config() { + let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let res = futures::executor::block_on(async move { + client + .load_global_config( + ["abc", "123", "xyz"] + .iter() + .map(|x| x.to_string()) + .collect::>(), + ) + .await + }); + for (k, v) in res.unwrap() { + assert_eq!(k, format!("/global/config/{}", v)) + } +} + +#[test] +fn test_watch_global_config_on_closed_server() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let client = Arc::new(client); + use futures::StreamExt; + let j = std::thread::spawn(move || { + futures::executor::block_on(async move { + let mut r = client.watch_global_config().unwrap(); + let mut i: usize = 0; + while let Some(r) = r.next().await { + match r { + Ok(res) => { + let change = &res.get_changes()[0]; + assert_eq!( + change + .get_name() + .split('/') + .collect::>() + .last() + .unwrap() + .to_owned(), + format!("{:?}", i) + ); + assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); + i += 1; + } + Err(e) => { + if let grpcio::Error::RpcFailure(e) = e { + // 14-UNAVAILABLE + assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); + break; + } else { + panic!("other error occur {:?}", e) + } + } + } + } + }); + }); + thread::sleep(Duration::from_millis(200)); + server.stop(); + j.join().unwrap(); +} + +// Updating pd leader may be slow, we need to make sure it does not block other +// RPC in the same gRPC Environment. 
+#[test] +fn test_slow_periodical_update() { + let pd_client_reconnect_fp = "pd_client_reconnect"; + let server = MockServer::new(1); + let eps = server.bind_addrs(); + + let mut cfg = new_config(eps); + let env = Arc::new(EnvBuilder::new().cq_count(1).build()); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + + // client1 updates leader frequently (100ms). + cfg.update_interval = ReadableDuration(Duration::from_millis(100)); + let _client1 = RpcClient::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); + + // client2 never updates leader in the test. + cfg.update_interval = ReadableDuration(Duration::from_secs(100)); + let client2 = RpcClient::new(&cfg, Some(env), mgr).unwrap(); + + fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); + // Wait for the PD client thread blocking on the fail point. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let (tx, rx) = mpsc::channel(); + let handle = thread::spawn(move || { + client2.alloc_id().unwrap(); + tx.send(()).unwrap(); + }); + + let timeout = Duration::from_millis(500); + if rx.recv_timeout(timeout).is_err() { + panic!("pd client2 is blocked"); + } + + // Clean up the fail point. + fail::remove(pd_client_reconnect_fp); + handle.join().unwrap(); +} + +// Reconnection will be speed limited. +#[test] +fn test_reconnect_limit() { + let pd_client_reconnect_fp = "pd_client_reconnect"; + let (_server, client) = new_test_server_and_client(ReadableDuration::secs(100)); + + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + // The first reconnection will succeed, and the last_update will not be updated. + fail::cfg(pd_client_reconnect_fp, "return").unwrap(); + client.reconnect().unwrap(); + // The subsequent reconnection will be cancelled. 
+ for _ in 0..10 { + let ret = client.reconnect(); + assert!(format!("{:?}", ret.unwrap_err()).contains("cancel reconnection")); + } + + fail::remove(pd_client_reconnect_fp); +} diff --git a/tests/integrations/pd/mod.rs b/tests/integrations/pd/mod.rs index 2cadf7db2b5..eb9b6cc092a 100644 --- a/tests/integrations/pd/mod.rs +++ b/tests/integrations/pd/mod.rs @@ -1,3 +1,4 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. mod test_rpc_client; +mod test_rpc_client_legacy; diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 5f44cc0137b..23841ba5dfd 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -1,26 +1,40 @@ -// Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{ - atomic::{AtomicUsize, Ordering}, - mpsc, Arc, - }, - thread, - time::Duration, -}; +use std::{sync::Arc, thread, time::Duration}; use error_code::ErrorCodeExt; -use futures::executor::block_on; +use futures::{executor::block_on, StreamExt}; use grpcio::{EnvBuilder, Error as GrpcError, RpcStatus, RpcStatusCode}; use kvproto::{metapb, pdpb}; -use pd_client::{Error as PdError, Feature, PdClient, PdConnector, RegionStat, RpcClient}; -use raftstore::store; +use pd_client::{Error as PdError, Feature, PdClientV2, PdConnector, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; -use tikv_util::config::ReadableDuration; -use tokio::runtime::Builder; +use tikv_util::{config::ReadableDuration, mpsc::future::WakePolicy, thd_name}; +use tokio::runtime::{Builder, Runtime}; use txn_types::TimeStamp; +fn setup_runtime() -> Runtime { + Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .enable_all() + .build() + .unwrap() +} + +fn must_get_tso(client: &mut RpcClientV2, count: u32) -> TimeStamp { + 
let (tx, mut responses) = client.create_tso_stream(WakePolicy::Immediately).unwrap(); + let mut req = pdpb::TsoRequest::default(); + req.mut_header().cluster_id = client.fetch_cluster_id().unwrap(); + req.count = count; + tx.send(req).unwrap(); + let resp = block_on(responses.next()).unwrap().unwrap(); + let ts = resp.timestamp.unwrap(); + let physical = ts.physical as u64; + let logical = ts.logical as u64; + TimeStamp::compose(physical, logical) +} + #[test] fn test_retry_rpc_client() { let eps_count = 1; @@ -32,7 +46,7 @@ fn test_retry_rpc_client() { server.stop(); let child = thread::spawn(move || { let cfg = new_config(m_eps); - RpcClient::new(&cfg, None, m_mgr).unwrap(); + RpcClientV2::new(&cfg, None, m_mgr).unwrap(); }); thread::sleep(Duration::from_millis(500)); server.start(&mgr, eps); @@ -41,12 +55,14 @@ fn test_retry_rpc_client() { #[test] fn test_rpc_client() { + let rt = setup_runtime(); + let _g = rt.enter(); let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps.clone(), None); - assert_ne!(client.get_cluster_id().unwrap(), 0); + let mut client = new_client_v2(eps.clone(), None); + assert_ne!(client.fetch_cluster_id().unwrap(), 0); let store_id = client.alloc_id().unwrap(); let mut store = metapb::Store::default(); @@ -89,38 +105,32 @@ fn test_rpc_client() { .unwrap(); assert_eq!(tmp_region.get_id(), region.get_id()); - let ts = block_on(client.get_tso()).unwrap(); + let ts = must_get_tso(&mut client, 1); assert_ne!(ts, TimeStamp::zero()); - let ts100 = block_on(client.batch_get_tso(100)).unwrap(); + let ts100 = must_get_tso(&mut client, 100); assert_eq!(ts.logical() + 100, ts100.logical()); let mut prev_id = 0; for _ in 0..100 { - let client = new_client(eps.clone(), None); + let mut client = new_client_v2(eps.clone(), None); let alloc_id = client.alloc_id().unwrap(); assert!(alloc_id > prev_id); prev_id = alloc_id; } - let poller = Builder::new_multi_thread() - 
.thread_name(thd_name!("poller")) - .worker_threads(1) - .build() + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = client.handle_region_heartbeat_response(1, move |resp| { - let _ = tx.send(resp); - }); - poller.spawn(f); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - RegionStat::default(), - None, - )); - rx.recv_timeout(Duration::from_secs(3)).unwrap(); + let mut req = pdpb::RegionHeartbeatRequest::default(); + req.set_region(region.clone()); + req.set_leader(peer.clone()); + tx.send(req).unwrap(); + block_on(tokio::time::timeout( + Duration::from_secs(3), + responses.next(), + )) + .unwrap(); let region_info = client.get_region_info(region_key).unwrap(); assert_eq!(region_info.region, region); @@ -150,26 +160,14 @@ fn test_connect_follower() { // test switch cfg.enable_forwarding = false; let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); - let client1 = RpcClient::new(&cfg, None, mgr).unwrap(); + let mut client1 = RpcClientV2::new(&cfg, None, mgr).unwrap(); fail::cfg(connect_leader_fp, "return").unwrap(); - // RECONNECT_INTERVAL_SEC is 1s. - thread::sleep(Duration::from_secs(1)); - let res = format!("{}", client1.alloc_id().unwrap_err()); - let err = format!( - "{}", - PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( - RpcStatusCode::UNAVAILABLE, - "".to_string(), - ))) - ); - assert_eq!(res, err); + client1.alloc_id().unwrap_err(); cfg.enable_forwarding = true; let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); - let client = RpcClient::new(&cfg, None, mgr).unwrap(); - // RECONNECT_INTERVAL_SEC is 1s. 
- thread::sleep(Duration::from_secs(1)); - let leader_addr = client1.get_leader().get_client_urls()[0].clone(); + let mut client = RpcClientV2::new(&cfg, None, mgr).unwrap(); + let leader_addr = client.get_leader().get_client_urls()[0].clone(); let res = format!("{}", client.alloc_id().unwrap_err()); let err = format!( "{}", @@ -188,7 +186,7 @@ fn test_get_tombstone_stores() { let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let mut all_stores = vec![]; let store_id = client.alloc_id().unwrap(); @@ -242,7 +240,7 @@ fn test_get_tombstone_store() { let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let mut all_stores = vec![]; let store_id = client.alloc_id().unwrap(); @@ -264,7 +262,7 @@ fn test_get_tombstone_store() { store99.set_state(metapb::StoreState::Tombstone); server.default_handler().add_store(store99.clone()); - let r = block_on(client.get_store_async(99)); + let r = client.get_store(99); assert_eq!(r.unwrap_err().error_code(), error_code::pd::STORE_TOMBSTONE); } @@ -273,7 +271,7 @@ fn test_reboot() { let eps_count = 1; let server = MockServer::with_case(eps_count, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); assert!(!client.is_cluster_bootstrapped().unwrap()); @@ -299,7 +297,7 @@ fn test_validate_endpoints() { let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let connector = PdConnector::new(env, mgr); - assert!(block_on(connector.validate_endpoints(&new_config(eps))).is_err()); + assert!(block_on(connector.validate_endpoints(&new_config(eps), true)).is_err()); } #[test] @@ -318,66 +316,7 @@ fn test_validate_endpoints_retry() { eps.pop(); let mgr = 
Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let connector = PdConnector::new(env, mgr); - assert!(block_on(connector.validate_endpoints(&new_config(eps))).is_err()); -} - -fn test_retry(func: F) { - let eps_count = 1; - // Retry mocker returns `Err(_)` for most request, here two thirds are `Err(_)`. - let retry = Arc::new(Retry::new(3)); - let server = MockServer::with_case(eps_count, retry); - let eps = server.bind_addrs(); - - let client = new_client(eps, None); - - for _ in 0..3 { - func(&client); - } -} - -#[test] -fn test_retry_async() { - let r#async = |client: &RpcClient| { - block_on(client.get_region_by_id(1)).unwrap(); - }; - test_retry(r#async); -} - -#[test] -fn test_retry_sync() { - let sync = |client: &RpcClient| { - client.get_store(1).unwrap(); - }; - test_retry(sync) -} - -fn test_not_retry(func: F) { - let eps_count = 1; - // NotRetry mocker returns Ok() with error header first, and next returns Ok() - // without any error header. - let not_retry = Arc::new(NotRetry::new()); - let server = MockServer::with_case(eps_count, not_retry); - let eps = server.bind_addrs(); - - let client = new_client(eps, None); - - func(&client); -} - -#[test] -fn test_not_retry_async() { - let r#async = |client: &RpcClient| { - block_on(client.get_region_by_id(1)).unwrap_err(); - }; - test_not_retry(r#async); -} - -#[test] -fn test_not_retry_sync() { - let sync = |client: &RpcClient| { - client.get_store(1).unwrap_err(); - }; - test_not_retry(sync); + assert!(block_on(connector.validate_endpoints(&new_config(eps), true)).is_err()); } #[test] @@ -386,7 +325,7 @@ fn test_incompatible_version() { let server = MockServer::with_case(1, incompatible); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let resp = block_on(client.ask_batch_split(metapb::Region::default(), 2)); assert_eq!( @@ -402,7 +341,7 @@ fn restart_leader(mgr: SecurityManager) { MockServer::::with_configuration(&mgr, 
vec![("127.0.0.1".to_owned(), 0); 3], None); let eps = server.bind_addrs(); - let client = new_client(eps.clone(), Some(Arc::clone(&mgr))); + let mut client = new_client_v2(eps.clone(), Some(Arc::clone(&mgr))); // Put a region. let store_id = client.alloc_id().unwrap(); let mut store = metapb::Store::default(); @@ -453,12 +392,8 @@ fn test_change_leader_async() { let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let counter = Arc::new(AtomicUsize::new(0)); - let client = new_client(eps, None); - let counter1 = Arc::clone(&counter); - client.handle_reconnect(move || { - counter1.fetch_add(1, Ordering::SeqCst); - }); + let mut client = new_client_v2(eps, None); + let mut reconnect_recv = client.subscribe_reconnect(); let leader = client.get_leader(); for _ in 0..5 { @@ -467,7 +402,10 @@ fn test_change_leader_async() { let new = client.get_leader(); if new != leader { - assert!(counter.load(Ordering::SeqCst) >= 1); + assert!(matches!( + reconnect_recv.try_recv(), + Ok(_) | Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) + )); return; } thread::sleep(LeaderChange::get_leader_interval()); @@ -482,7 +420,7 @@ fn test_pd_client_ok_when_cluster_not_ready() { let server = MockServer::with_case(3, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); fail::cfg(pd_client_cluster_id_zero, "return()").unwrap(); // wait 100ms to let client load member. 
thread::sleep(Duration::from_millis(101)); @@ -492,36 +430,33 @@ fn test_pd_client_ok_when_cluster_not_ready() { #[test] fn test_pd_client_heartbeat_send_failed() { + let rt = setup_runtime(); + let _g = rt.enter(); let pd_client_send_fail_fp = "region_heartbeat_send_failed"; fail::cfg(pd_client_send_fail_fp, "return()").unwrap(); let server = MockServer::with_case(1, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); - let poller = Builder::new_multi_thread() - .thread_name(thd_name!("poller")) - .worker_threads(1) - .build() + let mut client = new_client_v2(eps, None); + + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = - client.handle_region_heartbeat_response(1, move |resp| tx.send(resp).unwrap_or_default()); - poller.spawn(f); - let heartbeat_send_fail = |ok| { + let mut heartbeat_send_fail = |ok| { let mut region = metapb::Region::default(); region.set_id(1); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region, - metapb::Peer::default(), - RegionStat::default(), - None, + let mut req = pdpb::RegionHeartbeatRequest::default(); + req.set_region(region); + tx.send(req).unwrap(); + + let rsp = block_on(tokio::time::timeout( + Duration::from_millis(100), + responses.next(), )); - let rsp = rx.recv_timeout(Duration::from_millis(100)); if ok { assert!(rsp.is_ok()); - assert_eq!(rsp.unwrap().get_region_id(), 1); + assert_eq!(rsp.unwrap().unwrap().unwrap().get_region_id(), 1); } else { rsp.unwrap_err(); } @@ -545,35 +480,28 @@ fn test_pd_client_heartbeat_send_failed() { #[test] fn test_region_heartbeat_on_leader_change() { + let rt = setup_runtime(); + let _g = rt.enter(); let eps_count = 3; let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let client = new_client(eps, None); - let poller = Builder::new_multi_thread() - 
.thread_name(thd_name!("poller")) - .worker_threads(1) - .build() - .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = client.handle_region_heartbeat_response(1, move |resp| { - tx.send(resp).unwrap(); - }); - poller.spawn(f); - let region = metapb::Region::default(); - let peer = metapb::Peer::default(); - let stat = RegionStat::default(); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - stat.clone(), - None, - )); - rx.recv_timeout(LeaderChange::get_leader_interval()) + let mut client = new_client_v2(eps, None); + + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let heartbeat_on_leader_change = |count| { + tx.send(pdpb::RegionHeartbeatRequest::default()).unwrap(); + block_on(tokio::time::timeout( + LeaderChange::get_leader_interval(), + responses.next(), + )) + .unwrap() + .unwrap() + .unwrap(); + + let mut heartbeat_on_leader_change = |count| { let mut leader = client.get_leader(); for _ in 0..count { loop { @@ -587,15 +515,14 @@ fn test_region_heartbeat_on_leader_change() { thread::sleep(LeaderChange::get_leader_interval()); } } - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - stat.clone(), - None, - )); - rx.recv_timeout(LeaderChange::get_leader_interval()) - .unwrap(); + tx.send(pdpb::RegionHeartbeatRequest::default()).unwrap(); + block_on(tokio::time::timeout( + LeaderChange::get_leader_interval(), + responses.next(), + )) + .unwrap() + .unwrap() + .unwrap(); }; // Change PD leader once then heartbeat PD. 
@@ -612,18 +539,17 @@ fn test_periodical_update() { let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let counter = Arc::new(AtomicUsize::new(0)); - let client = new_client_with_update_interval(eps, None, ReadableDuration::secs(3)); - let counter1 = Arc::clone(&counter); - client.handle_reconnect(move || { - counter1.fetch_add(1, Ordering::SeqCst); - }); + let mut client = new_client_v2_with_update_interval(eps, None, ReadableDuration::secs(3)); + let mut reconnect_recv = client.subscribe_reconnect(); let leader = client.get_leader(); for _ in 0..5 { let new = client.get_leader(); if new != leader { - assert!(counter.load(Ordering::SeqCst) >= 1); + assert!(matches!( + reconnect_recv.try_recv(), + Ok(_) | Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) + )); return; } thread::sleep(LeaderChange::get_leader_interval()); @@ -641,13 +567,14 @@ fn test_cluster_version() { let feature_b = Feature::require(5, 0, 0); let feature_c = Feature::require(5, 0, 1); - let client = new_client(eps, None); - let feature_gate = client.feature_gate(); + let mut client = new_client_v2(eps, None); + let feature_gate = client.feature_gate().clone(); assert!(!feature_gate.can_enable(feature_a)); - let emit_heartbeat = || { + let mut client_clone = client.clone(); + let mut emit_heartbeat = || { let req = pdpb::StoreStats::default(); - block_on(client.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); + block_on(client_clone.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); }; let set_cluster_version = |version: &str| { diff --git a/tests/integrations/pd/test_rpc_client_legacy.rs b/tests/integrations/pd/test_rpc_client_legacy.rs new file mode 100644 index 00000000000..d2ff6d6ac11 --- /dev/null +++ b/tests/integrations/pd/test_rpc_client_legacy.rs @@ -0,0 +1,691 @@ +// Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + mpsc, Arc, + }, + thread, + time::Duration, +}; + +use error_code::ErrorCodeExt; +use futures::executor::block_on; +use grpcio::{EnvBuilder, Error as GrpcError, RpcStatus, RpcStatusCode}; +use kvproto::{metapb, pdpb}; +use pd_client::{Error as PdError, Feature, PdClient, PdConnector, RegionStat, RpcClient}; +use raftstore::store; +use security::{SecurityConfig, SecurityManager}; +use test_pd::{mocker::*, util::*, Server as MockServer}; +use tikv_util::config::ReadableDuration; +use tokio::runtime::Builder; +use txn_types::TimeStamp; + +#[test] +fn test_retry_rpc_client() { + let eps_count = 1; + let mut server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let m_eps = eps.clone(); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let m_mgr = mgr.clone(); + server.stop(); + let child = thread::spawn(move || { + let cfg = new_config(m_eps); + RpcClient::new(&cfg, None, m_mgr).unwrap(); + }); + thread::sleep(Duration::from_millis(500)); + server.start(&mgr, eps); + child.join().unwrap(); +} + +#[test] +fn test_rpc_client() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + + let client = new_client(eps.clone(), None); + assert_ne!(client.get_cluster_id().unwrap(), 0); + + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + debug!("bootstrap store {:?}", store); + + let peer_id = client.alloc_id().unwrap(); + let mut peer = metapb::Peer::default(); + peer.set_id(peer_id); + peer.set_store_id(store_id); + + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.mut_peers().push(peer.clone()); + debug!("bootstrap region {:?}", region); + + client + .bootstrap_cluster(store.clone(), region.clone()) + .unwrap(); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + + let 
tmp_stores = client.get_all_stores(false).unwrap(); + assert_eq!(tmp_stores.len(), 1); + assert_eq!(tmp_stores[0], store); + + let tmp_store = client.get_store(store_id).unwrap(); + assert_eq!(tmp_store.get_id(), store.get_id()); + + let region_key = region.get_start_key(); + let tmp_region = client.get_region(region_key).unwrap(); + assert_eq!(tmp_region.get_id(), region.get_id()); + + let region_info = client.get_region_info(region_key).unwrap(); + assert_eq!(region_info.region, region); + assert_eq!(region_info.leader, None); + + let tmp_region = block_on(client.get_region_by_id(region_id)) + .unwrap() + .unwrap(); + assert_eq!(tmp_region.get_id(), region.get_id()); + + let ts = block_on(client.get_tso()).unwrap(); + assert_ne!(ts, TimeStamp::zero()); + + let ts100 = block_on(client.batch_get_tso(100)).unwrap(); + assert_eq!(ts.logical() + 100, ts100.logical()); + + let mut prev_id = 0; + for _ in 0..100 { + let client = new_client(eps.clone(), None); + let alloc_id = client.alloc_id().unwrap(); + assert!(alloc_id > prev_id); + prev_id = alloc_id; + } + + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = client.handle_region_heartbeat_response(1, move |resp| { + let _ = tx.send(resp); + }); + poller.spawn(f); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + RegionStat::default(), + None, + )); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + + let region_info = client.get_region_info(region_key).unwrap(); + assert_eq!(region_info.region, region); + assert_eq!(region_info.leader.unwrap(), peer); + + block_on(client.store_heartbeat( + pdpb::StoreStats::default(), + None, // store_report + None, + )) + .unwrap(); + block_on(client.ask_batch_split(metapb::Region::default(), 1)).unwrap(); + block_on(client.report_batch_split(vec![metapb::Region::default(), metapb::Region::default()])) + 
.unwrap(); + + let region_info = client.get_region_info(region_key).unwrap(); + client.scatter_region(region_info).unwrap(); +} + +#[test] +fn test_connect_follower() { + let connect_leader_fp = "connect_leader"; + let server = MockServer::new(2); + let eps = server.bind_addrs(); + let mut cfg = new_config(eps); + + // test switch + cfg.enable_forwarding = false; + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let client1 = RpcClient::new(&cfg, None, mgr).unwrap(); + fail::cfg(connect_leader_fp, "return").unwrap(); + // RECONNECT_INTERVAL_SEC is 1s. + thread::sleep(Duration::from_secs(1)); + let res = format!("{}", client1.alloc_id().unwrap_err()); + let err = format!( + "{}", + PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + "".to_string(), + ))) + ); + assert_eq!(res, err); + + cfg.enable_forwarding = true; + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let client = RpcClient::new(&cfg, None, mgr).unwrap(); + // RECONNECT_INTERVAL_SEC is 1s. 
+ thread::sleep(Duration::from_secs(1)); + let leader_addr = client1.get_leader().get_client_urls()[0].clone(); + let res = format!("{}", client.alloc_id().unwrap_err()); + let err = format!( + "{}", + PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + leader_addr, + ))) + ); + assert_eq!(res, err); + + fail::remove(connect_leader_fp); +} + +#[test] +fn test_get_tombstone_stores() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + let mut all_stores = vec![]; + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + client.bootstrap_cluster(store.clone(), region).unwrap(); + + all_stores.push(store); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + let s = client.get_all_stores(false).unwrap(); + assert_eq!(s, all_stores); + + // Add tombstone store. + let mut store99 = metapb::Store::default(); + store99.set_id(99); + store99.set_state(metapb::StoreState::Tombstone); + server.default_handler().add_store(store99.clone()); + + // do not include tombstone. + let s = client.get_all_stores(true).unwrap(); + assert_eq!(s, all_stores); + + all_stores.push(store99.clone()); + all_stores.sort_by_key(|a| a.get_id()); + // include tombstone, there should be 2 stores. + let mut s = client.get_all_stores(false).unwrap(); + s.sort_by_key(|a| a.get_id()); + assert_eq!(s, all_stores); + + // Add another tombstone store. 
+ let mut store199 = store99; + store199.set_id(199); + server.default_handler().add_store(store199.clone()); + + all_stores.push(store199); + all_stores.sort_by_key(|a| a.get_id()); + let mut s = client.get_all_stores(false).unwrap(); + s.sort_by_key(|a| a.get_id()); + assert_eq!(s, all_stores); + + client.get_store(store_id).unwrap(); + client.get_store(99).unwrap_err(); + client.get_store(199).unwrap_err(); +} + +#[test] +fn test_get_tombstone_store() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + let mut all_stores = vec![]; + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + client.bootstrap_cluster(store.clone(), region).unwrap(); + + all_stores.push(store); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + let s = client.get_all_stores(false).unwrap(); + assert_eq!(s, all_stores); + + // Add tombstone store. 
+ let mut store99 = metapb::Store::default(); + store99.set_id(99); + store99.set_state(metapb::StoreState::Tombstone); + server.default_handler().add_store(store99.clone()); + + let r = block_on(client.get_store_async(99)); + assert_eq!(r.unwrap_err().error_code(), error_code::pd::STORE_TOMBSTONE); +} + +#[test] +fn test_reboot() { + let eps_count = 1; + let server = MockServer::with_case(eps_count, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + assert!(!client.is_cluster_bootstrapped().unwrap()); + + match client.bootstrap_cluster(metapb::Store::default(), metapb::Region::default()) { + Err(PdError::ClusterBootstrapped(_)) => (), + _ => { + panic!("failed, should return ClusterBootstrapped"); + } + } +} + +#[test] +fn test_validate_endpoints() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(Split::new())); + let env = Arc::new( + EnvBuilder::new() + .cq_count(1) + .name_prefix(thd_name!("test-pd")) + .build(), + ); + let eps = server.bind_addrs(); + + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let connector = PdConnector::new(env, mgr); + assert!(block_on(connector.validate_endpoints(&new_config(eps), false)).is_err()); +} + +#[test] +fn test_validate_endpoints_retry() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(Split::new())); + let env = Arc::new( + EnvBuilder::new() + .cq_count(1) + .name_prefix(thd_name!("test-pd")) + .build(), + ); + let mut eps = server.bind_addrs(); + let mock_port = 65535; + eps.insert(0, ("127.0.0.1".to_string(), mock_port)); + eps.pop(); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let connector = PdConnector::new(env, mgr); + assert!(block_on(connector.validate_endpoints(&new_config(eps), false)).is_err()); +} + +fn test_retry(func: F) { + let eps_count = 1; + // Retry mocker returns `Err(_)` for most request, here two thirds are 
`Err(_)`. + let retry = Arc::new(Retry::new(3)); + let server = MockServer::with_case(eps_count, retry); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + for _ in 0..3 { + func(&client); + } +} + +#[test] +fn test_retry_async() { + let r#async = |client: &RpcClient| { + block_on(client.get_region_by_id(1)).unwrap(); + }; + test_retry(r#async); +} + +#[test] +fn test_retry_sync() { + let sync = |client: &RpcClient| { + client.get_store(1).unwrap(); + }; + test_retry(sync) +} + +fn test_not_retry(func: F) { + let eps_count = 1; + // NotRetry mocker returns Ok() with error header first, and next returns Ok() + // without any error header. + let not_retry = Arc::new(NotRetry::new()); + let server = MockServer::with_case(eps_count, not_retry); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + func(&client); +} + +#[test] +fn test_not_retry_async() { + let r#async = |client: &RpcClient| { + block_on(client.get_region_by_id(1)).unwrap_err(); + }; + test_not_retry(r#async); +} + +#[test] +fn test_not_retry_sync() { + let sync = |client: &RpcClient| { + client.get_store(1).unwrap_err(); + }; + test_not_retry(sync); +} + +#[test] +fn test_incompatible_version() { + let incompatible = Arc::new(Incompatible); + let server = MockServer::with_case(1, incompatible); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + let resp = block_on(client.ask_batch_split(metapb::Region::default(), 2)); + assert_eq!( + resp.unwrap_err().to_string(), + PdError::Incompatible.to_string() + ); +} + +fn restart_leader(mgr: SecurityManager) { + let mgr = Arc::new(mgr); + // Service has only one GetMembersResponse, so the leader never changes. + let mut server = + MockServer::::with_configuration(&mgr, vec![("127.0.0.1".to_owned(), 0); 3], None); + let eps = server.bind_addrs(); + + let client = new_client(eps.clone(), Some(Arc::clone(&mgr))); + // Put a region. 
+ let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + + let peer_id = client.alloc_id().unwrap(); + let mut peer = metapb::Peer::default(); + peer.set_id(peer_id); + peer.set_store_id(store_id); + + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.mut_peers().push(peer); + client.bootstrap_cluster(store, region.clone()).unwrap(); + + let region = block_on(client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + + // Stop servers and restart them again. + server.stop(); + server.start(&mgr, eps); + + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let region = block_on(client.get_region_by_id(region.get_id())).unwrap(); + assert_eq!(region.unwrap().get_id(), region_id); +} + +#[test] +fn test_restart_leader_insecure() { + let mgr = SecurityManager::new(&SecurityConfig::default()).unwrap(); + restart_leader(mgr) +} + +#[test] +fn test_restart_leader_secure() { + let security_cfg = test_util::new_security_cfg(None); + let mgr = SecurityManager::new(&security_cfg).unwrap(); + restart_leader(mgr) +} + +#[test] +fn test_change_leader_async() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let counter = Arc::new(AtomicUsize::new(0)); + let client = new_client(eps, None); + let counter1 = Arc::clone(&counter); + client.handle_reconnect(move || { + counter1.fetch_add(1, Ordering::SeqCst); + }); + let leader = client.get_leader(); + + for _ in 0..5 { + let region = block_on(client.get_region_by_id(1)); + region.ok(); + + let new = client.get_leader(); + if new != leader { + assert!(counter.load(Ordering::SeqCst) >= 1); + return; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + + panic!("failed, leader should changed"); +} + +#[test] +fn 
test_pd_client_ok_when_cluster_not_ready() { + let pd_client_cluster_id_zero = "cluster_id_is_not_ready"; + let server = MockServer::with_case(3, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + fail::cfg(pd_client_cluster_id_zero, "return()").unwrap(); + // wait 100ms to let client load member. + thread::sleep(Duration::from_millis(101)); + assert_eq!(client.reconnect().is_err(), true); + fail::remove(pd_client_cluster_id_zero); +} + +#[test] +fn test_pd_client_heartbeat_send_failed() { + let pd_client_send_fail_fp = "region_heartbeat_send_failed"; + fail::cfg(pd_client_send_fail_fp, "return()").unwrap(); + let server = MockServer::with_case(1, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = + client.handle_region_heartbeat_response(1, move |resp| tx.send(resp).unwrap_or_default()); + poller.spawn(f); + + let heartbeat_send_fail = |ok| { + let mut region = metapb::Region::default(); + region.set_id(1); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region, + metapb::Peer::default(), + RegionStat::default(), + None, + )); + let rsp = rx.recv_timeout(Duration::from_millis(100)); + if ok { + assert!(rsp.is_ok()); + assert_eq!(rsp.unwrap().get_region_id(), 1); + } else { + rsp.unwrap_err(); + } + + let region = block_on(client.get_region_by_id(1)); + if ok { + assert!(region.is_ok()); + let r = region.unwrap(); + assert!(r.is_some()); + assert_eq!(1, r.unwrap().get_id()); + } else { + region.unwrap_err(); + } + }; + // send fail if network is block. + heartbeat_send_fail(false); + fail::remove(pd_client_send_fail_fp); + // send success after network recovered. 
+ heartbeat_send_fail(true); +} + +#[test] +fn test_region_heartbeat_on_leader_change() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = client.handle_region_heartbeat_response(1, move |resp| { + tx.send(resp).unwrap(); + }); + poller.spawn(f); + let region = metapb::Region::default(); + let peer = metapb::Peer::default(); + let stat = RegionStat::default(); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + stat.clone(), + None, + )); + rx.recv_timeout(LeaderChange::get_leader_interval()) + .unwrap(); + + let heartbeat_on_leader_change = |count| { + let mut leader = client.get_leader(); + for _ in 0..count { + loop { + let _ = block_on(client.get_region_by_id(1)); + let new = client.get_leader(); + if leader != new { + leader = new; + info!("leader changed!"); + break; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + } + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + stat.clone(), + None, + )); + rx.recv_timeout(LeaderChange::get_leader_interval()) + .unwrap(); + }; + + // Change PD leader once then heartbeat PD. + heartbeat_on_leader_change(1); + + // Change PD leader twice without update the heartbeat sender, then heartbeat + // PD. 
+ heartbeat_on_leader_change(2); +} + +#[test] +fn test_periodical_update() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let counter = Arc::new(AtomicUsize::new(0)); + let client = new_client_with_update_interval(eps, None, ReadableDuration::secs(3)); + let counter1 = Arc::clone(&counter); + client.handle_reconnect(move || { + counter1.fetch_add(1, Ordering::SeqCst); + }); + let leader = client.get_leader(); + + for _ in 0..5 { + let new = client.get_leader(); + if new != leader { + assert!(counter.load(Ordering::SeqCst) >= 1); + return; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + + panic!("failed, leader should changed"); +} + +#[test] +fn test_cluster_version() { + let server = MockServer::::new(3); + let eps = server.bind_addrs(); + + let feature_a = Feature::require(0, 0, 1); + let feature_b = Feature::require(5, 0, 0); + let feature_c = Feature::require(5, 0, 1); + + let client = new_client(eps, None); + let feature_gate = client.feature_gate(); + assert!(!feature_gate.can_enable(feature_a)); + + let emit_heartbeat = || { + let req = pdpb::StoreStats::default(); + block_on(client.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); + }; + + let set_cluster_version = |version: &str| { + let h = server.default_handler(); + h.set_cluster_version(version.to_owned()); + }; + + // Empty version string will be treated as invalid. + emit_heartbeat(); + assert!(!feature_gate.can_enable(feature_a)); + + // Explicitly invalid version string. + set_cluster_version("invalid-version"); + emit_heartbeat(); + assert!(!feature_gate.can_enable(feature_a)); + + // Correct version string. + set_cluster_version("5.0.0"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_a)); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // Version can't go backwards. 
+ set_cluster_version("4.99"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // After reconnect the version should be still accessable. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + client.reconnect().unwrap(); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // Version can go forwards. + set_cluster_version("5.0.1"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_c)); +} From 4b4fc4390d90e72eb87e49a223e756e6ba1e2688 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Fri, 2 Dec 2022 18:50:01 +0800 Subject: [PATCH 382/676] raftstore: allow a read-only flashback request to be propsed (#13871) close tikv/tikv#13870, fix tikv/tikv#13870 Because the flashback read request must be proposed after the `PrepareFlashback` and it won't have any side effects, it's safe to allow a read-only flashback request to be proposed. In this way, we can also fix #13870. 
Signed-off-by: JmPotato --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 7 +++- components/raftstore/src/store/fsm/peer.rs | 14 +++++-- components/raftstore/src/store/util.rs | 3 +- components/raftstore/src/store/worker/read.rs | 2 +- .../txn/actions/flashback_to_version.rs | 41 ++++++++++++++++++- .../flashback_to_version_read_phase.rs | 13 ++++-- .../integrations/raftstore/test_flashback.rs | 27 +++++++++++- tests/integrations/server/kv_service.rs | 2 +- 9 files changed, 95 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8b178015fa1..063657d29bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5479,7 +5479,7 @@ dependencies = [ "log_wrappers", "openssl", "prometheus", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index bd582d1c24a..a3d0bdb2712 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1588,7 +1588,12 @@ where let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_region_epoch(req, &self.region, include_region)?; - check_flashback_state(self.region.get_is_in_flashback(), req, self.region_id())?; + check_flashback_state( + self.region.get_is_in_flashback(), + req, + self.region_id(), + false, + )?; if req.has_admin_request() { self.exec_admin_cmd(ctx, req) } else { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f6498222d27..e3f268bf02c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5107,8 +5107,13 @@ where _ => {} }; // Check whether the region is in the flashback state and the request could be - // proposed. - if let Err(e) = util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id) { + // proposed. 
Skip the not prepared error because the + // `self.fsm.peer.is_in_flashback` may not be the latest right after applying + // the `PrepareFlashback` admin command, we will let it pass here and check in + // the apply phase. + if let Err(e) = + util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id, true) + { match e { Error::FlashbackInProgress(_) => self .ctx @@ -6278,7 +6283,10 @@ where fn on_set_flashback_state(&mut self, is_in_flashback: bool) { // Set flashback memory - self.fsm.peer.is_in_flashback = is_in_flashback; + self.fsm.peer.is_in_flashback = (|| { + fail_point!("keep_peer_fsm_flashback_state_false", |_| false); + is_in_flashback + })(); // Let the leader lease to None to ensure that local reads are not executed. self.fsm.peer.leader_lease_mut().expire_remote_lease(); } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 5f78065d32b..df5f4543f76 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -318,6 +318,7 @@ pub fn check_flashback_state( is_in_flashback: bool, req: &RaftCmdRequest, region_id: u64, + skip_not_prepared: bool, ) -> Result<()> { // The admin flashback cmd could be proposed/applied under any state. if req.has_admin_request() @@ -335,7 +336,7 @@ pub fn check_flashback_state( } // If the region is not in the flashback state, the flashback request itself // should be rejected. - if !is_in_flashback && is_flashback_request { + if !is_in_flashback && is_flashback_request && !skip_not_prepared { return Err(Error::FlashbackNotPrepared(region_id)); } Ok(()) diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 08e56aa7481..c78a51866ae 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -813,7 +813,7 @@ where // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; - if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id) { + if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id, false) { TLS_LOCAL_READ_METRICS.with(|m| match e { Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 4b05c8eef8f..8a65debcdbf 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -5,8 +5,8 @@ use std::ops::Bound; use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ - mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, - txn::{actions::check_txn_status::rollback_lock, Result as TxnResult}, + mvcc::{self, MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, + txn::{self, actions::check_txn_status::rollback_lock, Result as TxnResult}, Snapshot, }; @@ -218,10 +218,47 @@ pub fn commit_flashback_key( lock.is_pessimistic_txn(), flashback_commit_ts, ); + } else { + return Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { + start_ts: flashback_start_ts, + commit_ts: flashback_commit_ts, + key: key_to_commit.to_raw()?, + })); } Ok(()) } +// Check if the flashback has been finished before. +pub fn check_flashback_commit( + reader: &mut MvccReader, + key_to_commit: &Key, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, +) -> TxnResult { + match reader.load_lock(key_to_commit)? { + // If the lock exists, it means the flashback hasn't been finished. + Some(lock) => { + if lock.ts == flashback_start_ts { + return Ok(false); + } + } + // If the lock doesn't exist and the flashback commit record exists, it means the flashback + // has been finished. + None => { + if let Some(write) = reader.get_write(key_to_commit, flashback_commit_ts, None)? 
{ + if write.start_ts == flashback_start_ts { + return Ok(true); + } + } + } + } + Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { + start_ts: flashback_start_ts, + commit_ts: flashback_commit_ts, + key: key_to_commit.to_raw()?, + })) +} + pub fn get_first_user_key( reader: &mut MvccReader, start_key: &Key, diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 9ac5014b7f3..672a504a1f1 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -6,7 +6,7 @@ use txn_types::{Key, Lock, TimeStamp}; use crate::storage::{ mvcc::MvccReader, txn::{ - actions::flashback_to_version::get_first_user_key, + actions::flashback_to_version::{check_flashback_commit, get_first_user_key}, commands::{ Command, CommandExt, FlashbackToVersion, ProcessResult, ReadCommand, TypedCommand, }, @@ -189,9 +189,14 @@ impl ReadCommand for FlashbackToVersionReadPhase { // Commit key needs to match the Prewrite key, which is set as the first user // key. start_key = next_write_key.clone(); - // If the key is not locked, it means that the key has been committed before and - // we are in a retry. - if reader.load_lock(&next_write_key)?.is_none() { + // If the key has already been committed by the flashback, it means that we are + // in a retry. It's safe to just return directly. + if check_flashback_commit( + &mut reader, + &start_key, + self.start_ts, + self.commit_ts, + )? 
{ statistics.add(&reader.statistics); return Ok(ProcessResult::Res); } diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 89a61223fa2..afc2a658081 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -18,6 +18,28 @@ use txn_types::WriteBatchFlags; const TEST_KEY: &[u8] = b"k1"; const TEST_VALUE: &[u8] = b"v1"; +#[test] +#[cfg(feature = "failpoints")] +fn test_read_after_prepare_flashback() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let region = cluster.get_region(TEST_KEY); + fail::cfg("keep_peer_fsm_flashback_state_false", "return").unwrap(); + // Prepare flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + // Read with flashback flag will succeed even the peer fsm does not updated its + // `is_in_flashback` flag. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Writing with flashback flag will succeed since the ApplyFSM owns the + // latest `is_in_flashback` flag. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + fail::remove("keep_peer_fsm_flashback_state_false"); + // Finish flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); +} + #[test] fn test_prepare_flashback_after_split() { let mut cluster = new_node_cluster(0, 3); @@ -281,8 +303,9 @@ fn test_flashback_for_local_read() { // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // A local read with flashback flag will also be blocked. - must_get_flashback_not_prepared_error(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); + // A local read with flashback flag will not be blocked since it won't have any + // side effects. 
+ must_request_with_flashback_flag(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } #[test] diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 12cff74861d..f4200ab20da 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -781,7 +781,7 @@ fn test_mvcc_flashback_unprepared() { req.start_key = b"a".to_vec(); req.end_key = b"z".to_vec(); let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(resp.get_region_error().has_flashback_not_prepared()); + assert!(resp.get_error().contains("txn lock not found")); must_kv_read_equal(&client, ctx, k, v, 6); } From 76844d2e2c9a32d2ddedf204501506a2c764548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 5 Dec 2022 11:36:03 +0800 Subject: [PATCH 383/676] log_backup: fix pitr panic (#13875) close tikv/tikv#13874 Signed-off-by: hillium --- src/import/sst_service.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 9d45052fea9..283f8f802e3 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -12,6 +12,7 @@ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -56,6 +57,12 @@ where engine: E, router: Router, threads: Arc, + // For now, PiTR cannot be executed in the tokio runtime because it is synchronous and may + // blocks. (tokio is so strict... it panics if we do insane things like blocking in an async + // context.) + // We need to execute these code in a context which allows blocking. 
+ // FIXME: Make PiTR restore asynchronous. Get rid of this pool. + block_threads: Arc, importer: Arc, limiter: Limiter, task_slots: Arc>>, @@ -92,6 +99,18 @@ where .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .build() .unwrap(); + let props = tikv_util::thread_group::current_properties(); + let block_threads = ThreadPoolBuilder::new() + .pool_size(cfg.num_threads) + .name_prefix("sst-importer") + .after_start_wrapper(move || { + tikv_util::thread_group::set_properties(props.clone()); + tikv_alloc::add_thread_memory_accessor(); + set_io_type(IoType::Import); + }) + .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) + .create() + .unwrap(); importer.start_switch_mode_check(threads.handle(), engine.clone()); threads.spawn(Self::tick(importer.clone())); @@ -99,6 +118,7 @@ where cfg, engine, threads: Arc::new(threads), + block_threads: Arc::new(block_threads), router, importer, limiter: Limiter::new(f64::INFINITY), @@ -596,7 +616,7 @@ where debug!("finished apply kv file with {:?}", resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn(handle_task); + self.block_threads.spawn_ok(handle_task); } /// Downloads the file and performs key-rewrite for later ingesting. From e0c9c1aa470d530f5868269e6e02cf70517344ef Mon Sep 17 00:00:00 2001 From: Hu# Date: Mon, 5 Dec 2022 13:50:02 +0800 Subject: [PATCH 384/676] storage: Resolve EBS flashback unlimit range (#13881) close tikv/tikv#13879 Resolve EBS flashback unlimit range. 
Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- src/storage/mod.rs | 12 ++--- .../txn/actions/flashback_to_version.rs | 54 +++++++++++-------- .../txn/commands/flashback_to_version.rs | 2 +- .../flashback_to_version_read_phase.rs | 19 ++++--- src/storage/txn/commands/mod.rs | 4 +- tests/integrations/server/kv_service.rs | 46 ++++++++++++++-- 6 files changed, 95 insertions(+), 42 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index caed0f57c91..6f06e55937f 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4875,7 +4875,7 @@ mod tests { commit_ts, version, key.clone(), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); if let Mutation::Put(..) = write.0 { expect_value( @@ -4900,7 +4900,7 @@ mod tests { commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ) { let (tx, rx) = channel(); storage @@ -4997,7 +4997,7 @@ mod tests { commit_ts, 2.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_value( b"v@1".to_vec(), @@ -5013,7 +5013,7 @@ mod tests { commit_ts, 1.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_none( block_on(storage.get(Context::default(), Key::from_raw(b"k"), commit_ts)) @@ -5104,7 +5104,7 @@ mod tests { flashback_commit_ts, TimeStamp::zero(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); for i in 1..=FLASHBACK_BATCH_SIZE * 4 { let key = Key::from_raw(format!("k{}", i).as_bytes()); @@ -5183,7 +5183,7 @@ mod tests { flashback_commit_ts, 1.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_none( block_on(storage.get(Context::default(), k, flashback_commit_ts)) diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 8a65debcdbf..c1127142f14 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -15,11 
+15,11 @@ pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for mul pub fn flashback_to_version_read_lock( reader: &mut MvccReader, next_lock_key: Key, - end_key: &Key, + end_key: Option<&Key>, ) -> TxnResult> { let result = reader.scan_locks( Some(&next_lock_key), - Some(end_key), + end_key, |_| true, FLASHBACK_BATCH_SIZE, ); @@ -31,7 +31,7 @@ pub fn flashback_to_version_read_write( reader: &mut MvccReader, next_write_key: Key, start_key: &Key, - end_key: &Key, + end_key: Option<&Key>, flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, ) -> TxnResult> { @@ -44,7 +44,7 @@ pub fn flashback_to_version_read_write( // scanning every unique key in `CF_WRITE`. let keys_result = reader.scan_latest_user_keys( Some(&next_write_key), - Some(end_key), + end_key, |key, latest_commit_ts| { // There is no any other write could happen after the flashback begins. assert!(latest_commit_ts <= flashback_commit_ts); @@ -262,10 +262,10 @@ pub fn check_flashback_commit( pub fn get_first_user_key( reader: &mut MvccReader, start_key: &Key, - end_key: &Key, + end_key: Option<&Key>, ) -> TxnResult> { let (mut keys_result, _) = - reader.scan_latest_user_keys(Some(start_key), Some(end_key), |_, _| true, 1)?; + reader.scan_latest_user_keys(Some(start_key), end_key, |_, _| true, 1)?; Ok(keys_result.pop()) } @@ -300,7 +300,8 @@ pub mod tests { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &ctx); - let key_locks = flashback_to_version_read_lock(&mut reader, key, &next_key).unwrap(); + let key_locks = + flashback_to_version_read_lock(&mut reader, key, Some(next_key).as_ref()).unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts.into(), cm); rollback_locks(&mut txn, snapshot, key_locks).unwrap(); @@ -321,8 +322,12 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); 
let ctx = Context::default(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let prewrite_key = if let Some(first_key) = - get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")).unwrap() + let prewrite_key = if let Some(first_key) = get_first_user_key( + &mut reader, + &Key::from_raw(key), + Some(Key::from_raw(b"z")).as_ref(), + ) + .unwrap() { first_key } else { @@ -342,7 +347,7 @@ pub mod tests { start_ts: impl Into, commit_ts: impl Into, ) -> usize { - let next_key = Key::from_raw(keys::next_key(key).as_slice()); + let next_key = Key::from_raw_maybe_unbounded(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let ctx = Context::default(); @@ -353,7 +358,7 @@ pub mod tests { &mut reader, key, &Key::from_raw(b""), - &next_key, + next_key.as_ref(), version, commit_ts, ) @@ -379,10 +384,13 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let ctx = Context::default(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let key_to_lock = - get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")) - .unwrap() - .unwrap(); + let key_to_lock = get_first_user_key( + &mut reader, + &Key::from_raw(key), + Some(Key::from_raw(b"z")).as_ref(), + ) + .unwrap() + .unwrap(); commit_flashback_key(&mut txn, &mut reader, &key_to_lock, start_ts, commit_ts).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); @@ -591,9 +599,13 @@ pub mod tests { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let first_key = get_first_user_key(&mut reader, &Key::from_raw(b""), &Key::from_raw(b"z")) - .unwrap_or_else(|_| Some(Key::from_raw(b""))) - .unwrap(); + let first_key = get_first_user_key( + &mut 
reader, + &Key::from_raw(b""), + Some(Key::from_raw(b"z")).as_ref(), + ) + .unwrap_or_else(|_| Some(Key::from_raw(b""))) + .unwrap(); assert_eq!(first_key, Key::from_raw(prewrite_key)); // case 1: start key is before all keys, flashback b"c". @@ -640,9 +652,9 @@ pub mod tests { must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), 0 ); - // case 3: start key is valid, end_key is invalid, prewrite key will be None. - let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), &Key::from_raw(b"")) + // case 3: for last region, end_key will be None, prewrite key will valid. + let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), None) .unwrap_or_else(|_| Some(Key::from_raw(b""))); - assert_eq!(first_key, None); + assert_eq!(first_key, Some(Key::from_raw(prewrite_key))); } } diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 13de0c9b183..3999042fe27 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -33,7 +33,7 @@ command! 
{ commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, state: FlashbackToVersionState, } } diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 672a504a1f1..d885c974db4 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -39,7 +39,7 @@ pub fn new_flashback_rollback_lock_cmd( start_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ctx: Context, ) -> TypedCommand<()> { FlashbackToVersionReadPhase::new( @@ -61,7 +61,7 @@ pub fn new_flashback_write_cmd( commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ctx: Context, ) -> TypedCommand<()> { FlashbackToVersionReadPhase::new( @@ -87,7 +87,7 @@ command! { commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, state: FlashbackToVersionState, } } @@ -126,8 +126,11 @@ impl ReadCommand for FlashbackToVersionReadPhase { let mut start_key = self.start_key.clone(); let next_state = match self.state { FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { - let mut key_locks = - flashback_to_version_read_lock(&mut reader, next_lock_key, &self.end_key)?; + let mut key_locks = flashback_to_version_read_lock( + &mut reader, + next_lock_key, + self.end_key.as_ref(), + )?; if key_locks.is_empty() { // - No more locks to rollback, continue to the Prewrite Phase. // - The start key from the client is actually a range which is used to limit @@ -139,7 +142,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { // - To make sure the key locked in the latch is the same as the actual key // written, we pass it to the key in `process_write' after getting it. let key_to_lock = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, &self.end_key)? 
+ get_first_user_key(&mut reader, &self.start_key, self.end_key.as_ref())? { first_key } else { @@ -178,7 +181,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { // write of this key and instead put it after the completion // of the 2pc. next_write_key = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, &self.end_key)? + get_first_user_key(&mut reader, &self.start_key, self.end_key.as_ref())? { first_key } else { @@ -205,7 +208,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { &mut reader, next_write_key, &start_key, - &self.end_key, + self.end_key.as_ref(), self.version, self.commit_ts, )?; diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 7d835462acf..7eee81ae23e 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -372,7 +372,7 @@ impl From for TypedCommand<()> { req.get_start_ts().into(), req.get_version().into(), Key::from_raw(req.get_start_key()), - Key::from_raw(req.get_end_key()), + Key::from_raw_maybe_unbounded(req.get_end_key()), req.take_context(), ) } @@ -385,7 +385,7 @@ impl From for TypedCommand<()> { req.get_commit_ts().into(), req.get_version().into(), Key::from_raw(req.get_start_key()), - Key::from_raw(req.get_end_key()), + Key::from_raw_maybe_unbounded(req.get_end_key()), req.take_context(), ) } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index f4200ab20da..3dec0b57798 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -606,7 +606,7 @@ fn test_mvcc_flashback_failed_after_first_batch() { fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); fail::remove("flashback_failed_after_first_batch"); - // key@1 must be flahsbacked in the second batch firstly. + // key@1 must be flashbacked in the second batch firstly. 
must_kv_read_equal( &client, ctx.clone(), @@ -777,14 +777,52 @@ fn test_mvcc_flashback_unprepared() { req.set_context(ctx.clone()); req.set_start_ts(4); req.set_commit_ts(5); - req.version = 0; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); + req.set_version(0); + req.set_start_key(b"a".to_vec()); + req.set_end_key(b"z".to_vec()); let resp = client.kv_flashback_to_version(&req).unwrap(); assert!(resp.get_error().contains("txn lock not found")); must_kv_read_equal(&client, ctx, k, v, 6); } +#[test] +fn test_mvcc_flashback_with_unlimit_range() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + let mut ts = 0; + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); + must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); + + let mut prepare_req = PrepareFlashbackToVersionRequest::default(); + prepare_req.set_context(ctx.clone()); + prepare_req.set_start_ts(6); + prepare_req.set_version(0); + prepare_req.set_start_key(b"".to_vec()); + prepare_req.set_end_key(b"".to_vec()); + client + .kv_prepare_flashback_to_version(&prepare_req) + .unwrap(); + let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx.clone()); + req.set_start_ts(6); + req.set_commit_ts(7); + req.set_version(0); + req.set_start_key(b"".to_vec()); + req.set_end_key(b"".to_vec()); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); + + let mut get_req = GetRequest::default(); + get_req.set_context(ctx); + get_req.key = k; + get_req.version = 7; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); +} + // raft related RPC is tested as parts of test_snapshot.rs, so skip here. 
#[test] From c8250e58e7316911617fde5d2d43c578bbd23100 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Mon, 5 Dec 2022 15:04:03 +0800 Subject: [PATCH 385/676] raftstore: split raft write batch on 1GiB limit (#13872) close tikv/tikv#13848 Fix panic when the size of one single write exceeds 2GiB. Signed-off-by: tabokie Co-authored-by: Ti Chi Robot --- components/raft_log_engine/src/engine.rs | 23 +-- .../raftstore/src/store/async_io/write.rs | 163 ++++++++++-------- .../src/store/async_io/write_tests.rs | 124 ++++++++++++- tests/integrations/pd/test_rpc_client.rs | 2 +- 4 files changed, 220 insertions(+), 92 deletions(-) diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index a376adc25b7..c952f18dbc4 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -540,30 +540,21 @@ impl RaftEngine for RaftLogEngine { fn append(&self, raft_group_id: u64, entries: Vec) -> Result { let mut batch = Self::LogBatch::default(); - batch - .0 - .add_entries::(raft_group_id, &entries) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error) + batch.append(raft_group_id, entries)?; + self.consume(&mut batch, false) } fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(STORE_STATE_ID, STORE_IDENT_KEY.to_vec(), ident) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, true).map_err(transfer_error)?; + batch.put_store_ident(ident)?; + self.consume(&mut batch, true)?; Ok(()) } fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error)?; + batch.put_raft_state(raft_group_id, state)?; + self.consume(&mut batch, false)?; Ok(()) } @@ -585,7 +576,7 @@ 
impl RaftEngine for RaftLogEngine { old_first_index.push(self.0.first_index(task.raft_group_id)); } - self.0.write(&mut batch.0, false).map_err(transfer_error)?; + self.consume(&mut batch, false)?; let mut total = 0; for (old_first_index, task) in old_first_index.iter().zip(tasks) { diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 354a796c99c..d17223e5acf 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -27,7 +27,7 @@ use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{ box_err, - config::{Tracker, VersionTrack}, + config::{ReadableSize, Tracker, VersionTrack}, debug, info, slow_log, sys::thread::StdThreadBuildWrapper, thd_name, @@ -54,6 +54,7 @@ const KV_WB_SHRINK_SIZE: usize = 1024 * 1024; const KV_WB_DEFAULT_SIZE: usize = 16 * 1024; const RAFT_WB_SHRINK_SIZE: usize = 10 * 1024 * 1024; const RAFT_WB_DEFAULT_SIZE: usize = 256 * 1024; +const RAFT_WB_SPLIT_SIZE: usize = ReadableSize::gb(1).0 as usize; /// Notify the event to the specified region. pub trait PersistedNotifier: Clone + Send + 'static { @@ -360,8 +361,12 @@ where EK: KvEngine, ER: RaftEngine, { - pub raft_wb: ER::LogBatch, - // Write raft state once for a region everytime writing to disk + // When a single batch becomes too large, we uses multiple batches each containing atomic + // writes. + pub raft_wbs: Vec, + // Write states once for a region everytime writing to disk. + // These states only corresponds to entries inside `raft_wbs.last()`. States for other write + // batches must be inlined early. 
pub raft_states: HashMap, pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, @@ -369,6 +374,7 @@ where pub persisted_cbs: Vec>, // region_id -> (peer_id, ready_number) pub readies: HashMap, + pub(crate) raft_wb_split_size: usize, } impl WriteTaskBatch @@ -378,41 +384,77 @@ where { fn new(raft_wb: ER::LogBatch) -> Self { Self { - raft_wb, + raft_wbs: vec![raft_wb], raft_states: HashMap::default(), extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], persisted_cbs: vec![], readies: HashMap::default(), + raft_wb_split_size: RAFT_WB_SPLIT_SIZE, } } + #[inline] + fn flush_states_to_raft_wb(&mut self, raft_engine: &ER) { + let wb = self.raft_wbs.last_mut().unwrap(); + for (region_id, state) in self.raft_states.drain() { + wb.put_raft_state(region_id, &state).unwrap(); + } + if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { + for (region_id, state) in extra_states_map.drain() { + let mut tombstone = false; + if let Some(region_state) = state.region_state { + if region_state.get_state() == PeerState::Tombstone { + tombstone = true; + raft_engine + .clean( + region_id, + first_index(&state.apply_state), + state.raft_state.as_ref().unwrap(), + wb, + ) + .unwrap(); + } + wb.put_region_state(region_id, ®ion_state).unwrap(); + } + if !tombstone { + wb.put_apply_state(region_id, &state.apply_state).unwrap(); + } + } + } + self.state_size = 0; + } + /// Add write task to this batch - fn add_write_task(&mut self, mut task: WriteTask) { + fn add_write_task(&mut self, raft_engine: &ER, mut task: WriteTask) { if let Err(e) = task.valid() { panic!("task is not valid: {:?}", e); } - if let Some(raft_wb) = task.raft_wb.take() { - self.raft_wb.merge(raft_wb).unwrap(); + + if self.raft_wb_split_size > 0 + && self.raft_wbs.last().unwrap().persist_size() >= self.raft_wb_split_size + { + self.flush_states_to_raft_wb(raft_engine); + self.raft_wbs + .push(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); } - let entries = 
std::mem::take(&mut task.entries); - self.raft_wb.append(task.region_id, entries).unwrap(); + let raft_wb = self.raft_wbs.last_mut().unwrap(); + if let Some(wb) = task.raft_wb.take() { + raft_wb.merge(wb).unwrap(); + } + raft_wb + .append(task.region_id, std::mem::take(&mut task.entries)) + .unwrap(); if let Some((from, to)) = task.cut_logs { - self.raft_wb.cut_logs(task.region_id, from, to); + raft_wb.cut_logs(task.region_id, from, to); } - if let Some(raft_state) = task.raft_state.take() { - if self - .raft_states - .insert(task.region_id, raft_state) - .is_none() - { - self.state_size += std::mem::size_of::(); - } + if let Some(raft_state) = task.raft_state.take() + && self.raft_states.insert(task.region_id, raft_state).is_none() { + self.state_size += std::mem::size_of::(); } - self.state_size += self .extra_batch_write .merge(task.region_id, &mut task.extra_write); @@ -460,41 +502,16 @@ where #[inline] fn get_raft_size(&self) -> usize { - self.state_size + self.raft_wb.persist_size() + self.state_size + + self + .raft_wbs + .iter() + .map(|wb| wb.persist_size()) + .sum::() } fn before_write_to_db(&mut self, engine: &ER, metrics: &StoreWriteMetrics) { - // Put raft state to raft writebatch - for (region_id, state) in self.raft_states.drain() { - self.raft_wb.put_raft_state(region_id, &state).unwrap(); - } - if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { - for (region_id, state) in extra_states_map.drain() { - let mut tombstone = false; - if let Some(region_state) = state.region_state { - if region_state.get_state() == PeerState::Tombstone { - tombstone = true; - engine - .clean( - region_id, - first_index(&state.apply_state), - state.raft_state.as_ref().unwrap(), - &mut self.raft_wb, - ) - .unwrap(); - } - self.raft_wb - .put_region_state(region_id, ®ion_state) - .unwrap(); - } - if !tombstone { - self.raft_wb - .put_apply_state(region_id, &state.apply_state) - .unwrap(); - } - } - } - self.state_size = 0; + 
self.flush_states_to_raft_wb(engine); if metrics.waterfall_metrics { let now = std::time::Instant::now(); for task in &self.tasks { @@ -677,7 +694,7 @@ where } pub fn handle_write_task(&mut self, task: WriteTask) { - self.batch.add_write_task(task); + self.batch.add_write_task(&self.raft_engine, task); } pub fn write_to_db(&mut self, notify: bool) { @@ -726,24 +743,27 @@ where fail_point!("raft_between_save"); let mut write_raft_time = 0f64; - if !self.batch.raft_wb.is_empty() { + if !self.batch.raft_wbs[0].is_empty() { fail_point!("raft_before_save_on_store_1", self.store_id == 1, |_| {}); let now = Instant::now(); self.perf_context.start_observe(); - self.raft_engine - .consume_and_shrink( - &mut self.batch.raft_wb, - true, - RAFT_WB_SHRINK_SIZE, - RAFT_WB_DEFAULT_SIZE, - ) - .unwrap_or_else(|e| { - panic!( - "store {}: {} failed to write to raft engine: {:?}", - self.store_id, self.tag, e - ); - }); + for i in 0..self.batch.raft_wbs.len() { + self.raft_engine + .consume_and_shrink( + &mut self.batch.raft_wbs[i], + true, + RAFT_WB_SHRINK_SIZE, + RAFT_WB_DEFAULT_SIZE, + ) + .unwrap_or_else(|e| { + panic!( + "store {}: {} failed to write to raft engine: {:?}", + self.store_id, self.tag, e + ); + }); + } + self.batch.raft_wbs.truncate(1); let trackers: Vec<_> = self .batch .tasks @@ -946,7 +966,7 @@ pub fn write_to_db_for_test( ER: RaftEngine, { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); - batch.add_write_task(task); + batch.add_write_task(&engines.raft, task); batch.before_write_to_db(&engines.raft, &StoreWriteMetrics::new(false)); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { @@ -957,13 +977,12 @@ pub fn write_to_db_for_test( }); } } - if !batch.raft_wb.is_empty() { - engines - .raft - .consume(&mut batch.raft_wb, true) - .unwrap_or_else(|e| { + if !batch.raft_wbs[0].is_empty() { + for wb in &mut batch.raft_wbs { + engines.raft.consume(wb, true).unwrap_or_else(|e| { panic!("test 
failed to write to raft engine: {:?}", e); }); + } } } diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 1642c90d075..727502b6ca4 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -273,7 +273,7 @@ fn test_worker() { task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); task_1.messages.append(&mut vec![RaftMessage::default()]); - t.worker.batch.add_write_task(task_1); + t.worker.batch.add_write_task(&engines.raft, task_1); let mut task_2 = WriteTask::::new(region_2, 2, 15); init_write_batch(&engines, &mut task_2); @@ -287,7 +287,7 @@ fn test_worker() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_2); + t.worker.batch.add_write_task(&engines.raft, task_2); let mut task_3 = WriteTask::::new(region_1, 1, 11); init_write_batch(&engines, &mut task_3); @@ -303,7 +303,7 @@ fn test_worker() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_3); + t.worker.batch.add_write_task(&engines.raft, task_3); t.worker.write_to_db(true); @@ -337,6 +337,124 @@ fn test_worker() { must_have_same_count_msg(5, &t.msg_rx); } +#[test] +fn test_worker_split_raft_wb() { + let path = Builder::new().prefix("async-io-worker").tempdir().unwrap(); + let engines = new_temp_engine(&path); + let mut t = TestWorker::new(&Config::default(), &engines); + + let mut run_test = |region_1: u64, region_2: u64, split: (bool, bool)| { + let raft_key_1 = 17 + region_1; + let raft_key_2 = 27 + region_1; + let raft_key_3 = 37 + region_1; + let mut expected_wbs = 1; + + let mut task_1 = WriteTask::::new(region_1, 1, 10); + init_write_batch(&engines, &mut task_1); + task_1.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { + applied_index: 10, + ..Default::default() + })); + 
put_raft_kv(task_1.raft_wb.as_mut(), raft_key_1); + task_1.entries.append(&mut vec![ + new_entry(5, 5), + new_entry(6, 5), + new_entry(7, 5), + new_entry(8, 5), + ]); + task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); + t.worker.batch.add_write_task(&engines.raft, task_1); + + let mut task_2 = WriteTask::::new(region_2, 2, 15); + init_write_batch(&engines, &mut task_2); + task_2.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { + applied_index: 16, + ..Default::default() + })); + put_raft_kv(task_2.raft_wb.as_mut(), raft_key_2); + task_2 + .entries + .append(&mut vec![new_entry(20, 15), new_entry(21, 15)]); + task_2.raft_state = Some(new_raft_state(15, 234, 20, 21)); + if split.0 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + t.worker.batch.add_write_task(&engines.raft, task_2); + + let mut task_3 = WriteTask::::new(region_1, 1, 11); + init_write_batch(&engines, &mut task_3); + task_3.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { + applied_index: 25, + ..Default::default() + })); + put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); + task_3 + .entries + .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); + task_3.cut_logs = Some((8, 9)); + task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); + if split.1 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + t.worker.batch.add_write_task(&engines.raft, task_3); + + assert_eq!(t.worker.batch.raft_wbs.len(), expected_wbs); + t.worker.write_to_db(true); + assert_eq!(t.worker.batch.raft_wbs.len(), 1); + + must_have_same_notifies(vec![(region_1, (1, 11)), (region_2, (2, 15))], &t.notify_rx); + + assert_eq!(test_raft_kv(&engines.raft, raft_key_1), false); + assert_eq!(test_raft_kv(&engines.raft, raft_key_2), true); + assert_eq!(test_raft_kv(&engines.raft, raft_key_3), 
true); + + must_have_entries_and_state( + &engines.raft, + vec![ + ( + region_1, + vec![new_entry(5, 5), new_entry(6, 6), new_entry(7, 7)], + new_raft_state(7, 124, 6, 7), + ), + ( + region_2, + vec![new_entry(20, 15), new_entry(21, 15)], + new_raft_state(15, 234, 20, 21), + ), + ], + ); + assert_eq!( + engines.raft.get_apply_state(region_1).unwrap(), + Some(RaftApplyState { + applied_index: 25, + ..Default::default() + }) + ); + assert_eq!( + engines.raft.get_apply_state(region_2).unwrap(), + Some(RaftApplyState { + applied_index: 16, + ..Default::default() + }) + ); + }; + + let mut first_region = 1; + for a in [true, false] { + for b in [true, false] { + run_test(first_region, first_region + 1, (a, b)); + first_region += 10; + } + } +} + #[test] fn test_basic_flow() { let region_1 = 1; diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 23841ba5dfd..ca37318aa8b 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -112,7 +112,7 @@ fn test_rpc_client() { assert_eq!(ts.logical() + 100, ts100.logical()); let mut prev_id = 0; - for _ in 0..100 { + for _ in 0..10 { let mut client = new_client_v2(eps.clone(), None); let alloc_id = client.alloc_id().unwrap(); assert!(alloc_id > prev_id); From b79b86e965e18e4196d0a8db5a5d13c9a868c77d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 5 Dec 2022 16:44:03 +0800 Subject: [PATCH 386/676] log-backup: make the safepoint lifetime 24hours (#13885) close tikv/tikv#13889, close pingcap/tidb#39603 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/checkpoint_manager.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index f34211ef7a5..e316b6e05c3 100644 --- 
a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -356,9 +356,13 @@ impl FlushObserver for BasicFlushObserver { .update_service_safe_point( format!("backup-stream-{}-{}", task, self.store_id), TimeStamp::new(rts.saturating_sub(1)), - // Add a service safe point for 30 mins (6x the default flush interval). - // It would probably be safe. - Duration::from_secs(1800), + // Add a service safe point for 24 hours. (the same as fatal error.) + // We make it the same duration as we meet fatal errors because TiKV may be + // SIGKILL'ed after it meets fatal error and before it successfully updated the + // fatal error safepoint. + // TODO: We'd better make the coordinator, who really + // calculates the checkpoint to register service safepoint. + Duration::from_secs(60 * 60 * 24), ) .await { From 7d4b6c6d65537d4ce6cd2ff98573b539c38a35b8 Mon Sep 17 00:00:00 2001 From: Hu# Date: Mon, 5 Dec 2022 17:26:03 +0800 Subject: [PATCH 387/676] pd_client: move ReadableDuration to failpoint (#13878) ref tikv/tikv#13673 remove `unused_imports` when make release. 
Signed-off-by: husharp Signed-off-by: Xinye Tao Co-authored-by: Xinye Tao --- components/pd_client/src/client_v2.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 55f0c31b3c5..3d17a94a494 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -47,9 +47,7 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, - config::ReadableDuration, - error, info, + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::{duration_to_sec, Instant}, @@ -71,6 +69,8 @@ use crate::PdFuture; fn request_timeout() -> Duration { fail_point!("pd_client_v2_request_timeout", |s| { use std::str::FromStr; + + use tikv_util::config::ReadableDuration; ReadableDuration::from_str(&s.unwrap()).unwrap().0 }); Duration::from_secs(REQUEST_TIMEOUT_SEC) @@ -412,6 +412,8 @@ async fn reconnect_loop( let backoff = (|| { fail_point!("pd_client_v2_backoff", |s| { use std::str::FromStr; + + use tikv_util::config::ReadableDuration; ReadableDuration::from_str(&s.unwrap()).unwrap().0 }); request_timeout() From 280d53b75c1d3e821435c440eb6493af536edabb Mon Sep 17 00:00:00 2001 From: JmPotato Date: Tue, 6 Dec 2022 02:50:03 +0800 Subject: [PATCH 388/676] raftstore: remove is_in_flashback field in peer fsm (#13877) close tikv/tikv#13868 - Remove `is_in_flashback` field and use the region meta as the only source of truth in `PeerFSM`. - Add a corresponding test case. - Some minor refinement to the code and tests. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/peer.rs | 27 +++-- components/raftstore/src/store/peer.rs | 5 +- .../txn/actions/flashback_to_version.rs | 109 +++++++++++------- .../flashback_to_version_read_phase.rs | 35 ++++-- .../integrations/raftstore/test_flashback.rs | 52 +++++++-- tests/integrations/server/kv_service.rs | 27 ++++- 6 files changed, 176 insertions(+), 79 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e3f268bf02c..62eadb97076 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1334,7 +1334,7 @@ where new_read_index_request(region_id, region_epoch.clone(), self.fsm.peer.peer.clone()); // Allow to capture change even is in flashback state. // TODO: add a test case for this kind of situation. - if self.fsm.peer.is_in_flashback { + if self.region().is_in_flashback { let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); flags.insert(WriteBatchFlags::FLASHBACK); msg.mut_header().set_flags(flags.bits()); @@ -4894,9 +4894,7 @@ where } ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), - ExecResult::SetFlashbackState { region } => { - self.on_set_flashback_state(region.get_is_in_flashback()) - } + ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), } } @@ -5108,11 +5106,11 @@ where }; // Check whether the region is in the flashback state and the request could be // proposed. Skip the not prepared error because the - // `self.fsm.peer.is_in_flashback` may not be the latest right after applying + // `self.region().is_in_flashback` may not be the latest right after applying // the `PrepareFlashback` admin command, we will let it pass here and check in // the apply phase. 
if let Err(e) = - util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id, true) + util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) { match e { Error::FlashbackInProgress(_) => self @@ -6281,12 +6279,17 @@ where self.fsm.has_ready = true; } - fn on_set_flashback_state(&mut self, is_in_flashback: bool) { - // Set flashback memory - self.fsm.peer.is_in_flashback = (|| { - fail_point!("keep_peer_fsm_flashback_state_false", |_| false); - is_in_flashback - })(); + fn on_set_flashback_state(&mut self, region: metapb::Region) { + // Update the region meta. + self.update_region((|| { + #[cfg(feature = "failpoints")] + fail_point!("keep_peer_fsm_flashback_state_false", |_| { + let mut region = region.clone(); + region.is_in_flashback = false; + region + }); + region + })()); // Let the leader lease to None to ensure that local reads are not executed. self.fsm.peer.leader_lease_mut().expire_remote_lease(); } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9614161739a..100544bd0f4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1030,8 +1030,6 @@ where /// lead_transferee if this peer(leader) is in a leadership transferring. pub lead_transferee: u64, pub unsafe_recovery_state: Option, - // Used as the memory state for Flashback to reject RW/Schedule before proposing. 
- pub is_in_flashback: bool, pub snapshot_recovery_state: Option, } @@ -1167,7 +1165,6 @@ where last_region_buckets: None, lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, - is_in_flashback: region.get_is_in_flashback(), snapshot_recovery_state: None, }; @@ -3531,7 +3528,7 @@ where self.force_leader.is_some(), ) { None - } else if self.is_in_flashback { + } else if self.region().is_in_flashback { debug!( "prevents renew lease while in flashback state"; "region_id" => self.region_id, diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index c1127142f14..819cfd0631c 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::Bound; - use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ @@ -35,11 +33,6 @@ pub fn flashback_to_version_read_write( flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, ) -> TxnResult> { - // Filter out the SST that does not have a newer version than - // `flashback_version` in `CF_WRITE`, i.e, whose latest `commit_ts` <= - // `flashback_version`. By doing this, we can only flashback those keys that - // have version changed since `flashback_version` as much as possible. - reader.set_hint_min_ts(Some(Bound::Excluded(flashback_version))); // To flashback the data, we need to get all the latest visible keys first by // scanning every unique key in `CF_WRITE`. 
let keys_result = reader.scan_latest_user_keys( @@ -241,15 +234,30 @@ pub fn check_flashback_commit( if lock.ts == flashback_start_ts { return Ok(false); } + error!( + "check flashback commit exception: lock not found"; + "key_to_commit" => log_wrappers::Value::key(key_to_commit.as_encoded()), + "flashback_start_ts" => flashback_start_ts, + "flashback_commit_ts" => flashback_commit_ts, + "lock" => ?lock, + ); } // If the lock doesn't exist and the flashback commit record exists, it means the flashback // has been finished. None => { - if let Some(write) = reader.get_write(key_to_commit, flashback_commit_ts, None)? { - if write.start_ts == flashback_start_ts { + let write_res = reader.seek_write(key_to_commit, flashback_commit_ts)?; + if let Some((commit_ts, ref write)) = write_res { + if commit_ts == flashback_commit_ts && write.start_ts == flashback_start_ts { return Ok(true); } } + error!( + "check flashback commit exception: write record mismatched"; + "key_to_commit" => log_wrappers::Value::key(key_to_commit.as_encoded()), + "flashback_start_ts" => flashback_start_ts, + "flashback_commit_ts" => flashback_commit_ts, + "write" => ?write_res, + ); } } Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { @@ -263,9 +271,15 @@ pub fn get_first_user_key( reader: &mut MvccReader, start_key: &Key, end_key: Option<&Key>, + flashback_version: TimeStamp, ) -> TxnResult> { - let (mut keys_result, _) = - reader.scan_latest_user_keys(Some(start_key), end_key, |_, _| true, 1)?; + let (mut keys_result, _) = reader.scan_latest_user_keys( + Some(start_key), + end_key, + // Make sure we will get the same first user key each time. 
+ |_, latest_commit_ts| latest_commit_ts > flashback_version, + 1, + )?; Ok(keys_result.pop()) } @@ -326,6 +340,7 @@ pub mod tests { &mut reader, &Key::from_raw(key), Some(Key::from_raw(b"z")).as_ref(), + version, ) .unwrap() { @@ -375,10 +390,11 @@ pub mod tests { fn must_commit_flashback_key( engine: &mut E, key: &[u8], + version: impl Into, start_ts: impl Into, commit_ts: impl Into, ) -> usize { - let (start_ts, commit_ts) = (start_ts.into(), commit_ts.into()); + let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -388,6 +404,7 @@ pub mod tests { &mut reader, &Key::from_raw(key), Some(Key::from_raw(b"z")).as_ref(), + version, ) .unwrap() .unwrap(); @@ -545,9 +562,11 @@ pub mod tests { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", [u8::MAX; SHORT_VALUE_MAX_LEN + 1]); - must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_get(&mut engine, k, ts, &v); + for _ in 0..2 { + must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, &v); + } let flashback_start_ts = *ts.incr(); // Rollback nothing. 
@@ -579,30 +598,23 @@ pub mod tests { fn test_prewrite_with_special_key() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); - let (prewrite_key, prewrite_val) = (b"b", b"val"); - must_prewrite_put( - &mut engine, - prewrite_key, - prewrite_val, - prewrite_key, - *ts.incr(), - ); - must_commit(&mut engine, prewrite_key, ts, *ts.incr()); - must_get(&mut engine, prewrite_key, ts, prewrite_val); - let (k, v1, v2) = (b"c", b"v1", b"v2"); - must_prewrite_put(&mut engine, k, v1, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_get(&mut engine, k, ts, v2); + let (prewrite_key, k, v) = (b"b", b"c", b"val"); + for k in [prewrite_key, k] { + let (start_ts, commit_ts) = (*ts.incr(), *ts.incr()); + must_prewrite_put(&mut engine, k, v, k, start_ts); + must_commit(&mut engine, k, start_ts, commit_ts); + must_get(&mut engine, k, commit_ts, v); + } // Check for prewrite key b"b". let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let flashback_version = TimeStamp::zero(); let first_key = get_first_user_key( &mut reader, &Key::from_raw(b""), Some(Key::from_raw(b"z")).as_ref(), + flashback_version, ) .unwrap_or_else(|_| Some(Key::from_raw(b""))) .unwrap(); @@ -615,7 +627,12 @@ pub mod tests { assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); // Prewrite "prewrite_key" not "start_key". assert_eq!( - must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + must_prewrite_flashback_key( + &mut engine, + start_key, + flashback_version, + flashback_start_ts + ), 1 ); // Flashback (b"c", v2) to (b"c", v1). 
@@ -623,7 +640,7 @@ pub mod tests { must_flashback_write_to_version( &mut engine, k, - 4, + flashback_version, flashback_start_ts, flashback_commit_ts ), @@ -634,14 +651,14 @@ pub mod tests { must_commit_flashback_key( &mut engine, start_key, + flashback_version, flashback_start_ts, flashback_commit_ts ), 2 ); - must_get(&mut engine, k, ts, v1); - must_get(&mut engine, prewrite_key, ts, prewrite_val); - + must_get_none(&mut engine, prewrite_key, ts); + must_get_none(&mut engine, k, ts); // case 2: start key is after all keys, prewrite will return None. let start_key = b"d"; let flashback_start_ts = *ts.incr(); @@ -649,12 +666,22 @@ pub mod tests { assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); // Prewrite null. assert_eq!( - must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + must_prewrite_flashback_key( + &mut engine, + start_key, + flashback_version, + flashback_start_ts + ), 0 ); - // case 3: for last region, end_key will be None, prewrite key will valid. - let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), None) - .unwrap_or_else(|_| Some(Key::from_raw(b""))); - assert_eq!(first_key, Some(Key::from_raw(prewrite_key))); + must_get_none(&mut engine, prewrite_key, ts); + must_get_none(&mut engine, k, ts); + // case 3: for last region, end_key will be None, prewrite key will be valid. + assert_eq!( + get_first_user_key(&mut reader, &Key::from_raw(b"a"), None, flashback_version) + .unwrap() + .unwrap(), + Key::from_raw(prewrite_key) + ); } } diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index d885c974db4..769171d46e0 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -1,5 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::ops::Bound; + // #[PerformanceCriticalPath] use txn_types::{Key, Lock, TimeStamp}; @@ -109,20 +111,25 @@ impl CommandExt for FlashbackToVersionReadPhase { /// - Scan all locks. /// - Rollback all these locks. /// 2. [PrepareFlashback] Prewrite phase: -/// - Prewrite the `self.start_key` specifically to prevent the -/// `resolved_ts` from advancing. +/// - Prewrite the first user key after `self.start_key` specifically to +/// prevent the `resolved_ts` from advancing. /// 3. [FinishFlashback] FlashbackWrite phase: /// - Scan all the latest writes and their corresponding values at /// `self.version`. /// - Write the old MVCC version writes again for all these keys with -/// `self.commit_ts` excluding the `self.start_key`. +/// `self.commit_ts` excluding the first user key after `self.start_key`. /// 4. [FinishFlashback] Commit phase: -/// - Commit the `self.start_key` we write at the second phase to finish the -/// flashback. +/// - Commit the first user key after `self.start_key` we write at the +/// second phase to finish the flashback. impl ReadCommand for FlashbackToVersionReadPhase { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { let tag = self.tag().get_str(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); + // Filter out the SST that does not have a newer version than `self.version` in + // `CF_WRITE`, i.e, whose latest `commit_ts` <= `self.version` in the later + // scan. By doing this, we can only flashback those keys that have version + // changed since `self.version` as much as possible. + reader.set_hint_min_ts(Some(Bound::Excluded(self.version))); let mut start_key = self.start_key.clone(); let next_state = match self.state { FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { @@ -141,9 +148,12 @@ impl ReadCommand for FlashbackToVersionReadPhase { // completion of the 2pc. 
// - To make sure the key locked in the latch is the same as the actual key // written, we pass it to the key in `process_write' after getting it. - let key_to_lock = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, self.end_key.as_ref())? - { + let key_to_lock = if let Some(first_key) = get_first_user_key( + &mut reader, + &self.start_key, + self.end_key.as_ref(), + self.version, + )? { first_key } else { // If the key is None return directly @@ -180,9 +190,12 @@ impl ReadCommand for FlashbackToVersionReadPhase { // 2pc. So When overwriting the write, we skip the immediate // write of this key and instead put it after the completion // of the 2pc. - next_write_key = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, self.end_key.as_ref())? - { + next_write_key = if let Some(first_key) = get_first_user_key( + &mut reader, + &self.start_key, + self.end_key.as_ref(), + self.version, + )? { first_key } else { // If the key is None return directly diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index afc2a658081..7d0ec219534 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -359,30 +359,62 @@ fn test_flashback_for_apply_snapshot() { must_check_flashback_state(&mut cluster, 1, 1, false); must_check_flashback_state(&mut cluster, 1, 3, false); - // Make store 3 isolated. cluster.add_send_filter(IsolationFilterFactory::new(3)); - // Write some data to trigger snapshot. 
- for i in 100..110 { - let key = format!("k{}", i); - let value = format!("v{}", i); - cluster.must_put_cf("write", key.as_bytes(), value.as_bytes()); + let mut region = cluster.get_region(TEST_KEY); + for _ in 0..10 { + must_request_without_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cf_cmd("write", TEST_KEY, TEST_VALUE), + ) } - // Prepare for flashback cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); must_check_flashback_state(&mut cluster, 1, 1, true); must_check_flashback_state(&mut cluster, 1, 3, false); - // Add store 3 back. cluster.clear_send_filters(); must_check_flashback_state(&mut cluster, 1, 1, true); must_check_flashback_state(&mut cluster, 1, 3, true); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); + must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, false); + // Prepare for flashback + cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); + must_check_flashback_state(&mut cluster, 1, 1, true); + must_check_flashback_state(&mut cluster, 1, 3, true); + // Make store 3 isolated. + cluster.add_send_filter(IsolationFilterFactory::new(3)); + // Write some flashback data to trigger snapshot. + for _ in 0..10 { + must_request_with_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cf_cmd("write", TEST_KEY, TEST_VALUE), + ) + } + // Finish flashback. cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, true); + // Wait for a while before adding store 3 back to make sure only it does not + // receive the `FinishFlashback` message. + sleep(Duration::from_secs(1)); + // Add store 3 back. + cluster.clear_send_filters(); + must_check_flashback_state(&mut cluster, 1, 1, false); must_check_flashback_state(&mut cluster, 1, 3, false); + // Make store 3 become leader. 
+ cluster.must_transfer_leader(region.get_id(), new_peer(3, 3)); + // Region should not in the flashback state. + must_request_without_flashback_flag( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), + ); } fn must_check_flashback_state( @@ -438,7 +470,7 @@ fn must_request_with_flashback_flag( req: Request, ) { let resp = request(cluster, region, req, true); - assert!(!resp.get_header().has_error()); + assert!(!resp.get_header().has_error(), "{:?}", resp); } fn must_get_flashback_not_prepared_error( @@ -457,7 +489,7 @@ fn must_request_without_flashback_flag( req: Request, ) { let resp = request(cluster, region, req, false); - assert!(!resp.get_header().has_error()); + assert!(!resp.get_header().has_error(), "{:?}", resp); } fn must_get_flashback_in_progress_error( diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 3dec0b57798..5c536fce124 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -782,7 +782,32 @@ fn test_mvcc_flashback_unprepared() { req.set_end_key(b"z".to_vec()); let resp = client.kv_flashback_to_version(&req).unwrap(); assert!(resp.get_error().contains("txn lock not found")); - must_kv_read_equal(&client, ctx, k, v, 6); + must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); + // Flashback with preparing. + must_flashback_to_version(&client, ctx.clone(), 0, 6, 7); + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k; + get_req.version = 7; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); + // Mock the flashback retry. 
+ let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx); + req.set_start_ts(6); + req.set_commit_ts(7); + req.version = 0; + req.start_key = b"a".to_vec(); + req.end_key = b"z".to_vec(); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); } #[test] From 909787e828098d68ae86a70df3175cbbbacd2796 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 7 Dec 2022 13:18:04 +0800 Subject: [PATCH 389/676] raft_engine: remove redundant methods (#13900) ref tikv/tikv#12842 These methods are defined in log batch already, redefine them again introduce unnecessary maintenance. And these methods are also confusing as they don't sync by default, which is very easy to make mistake. Signed-off-by: Jay Lee --- components/engine_panic/src/raft_engine.rs | 20 ++------- components/engine_rocks/src/raft_engine.rs | 27 +++--------- components/engine_traits/src/raft_engine.rs | 21 +++------- components/raft_log_engine/src/engine.rs | 36 +++------------- components/raftstore-v2/src/bootstrap.rs | 5 ++- .../raftstore/src/store/peer_storage.rs | 41 +++++++++++-------- components/raftstore/src/store/snap.rs | 9 ++-- components/server/src/raft_engine_switch.rs | 7 +++- src/server/debug.rs | 9 ++-- tests/integrations/server/kv_service.rs | 12 ++++-- 10 files changed, 74 insertions(+), 113 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index ad05e66c6fa..603eb118c5c 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -108,14 +108,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - panic!() - } - - fn put_raft_state(&self, raft_group_id: u64, state: 
&RaftLocalState) -> Result<()> { - panic!() - } - fn gc(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { panic!() } @@ -148,10 +140,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - panic!() - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, @@ -159,10 +147,6 @@ impl RaftEngine for PanicEngine { { panic!() } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - panic!() - } } impl RaftLogBatch for PanicWriteBatch { @@ -209,4 +193,8 @@ impl RaftLogBatch for PanicWriteBatch { fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { panic!() } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + panic!() + } } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index da15b1708b8..79cd8350519 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -3,8 +3,8 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, SyncMutable, WriteBatch, - WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, + RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, WriteBatch, WriteBatchExt, + WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, }; use kvproto::{ metapb::Region, @@ -286,17 +286,6 @@ impl RaftEngine for RocksEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut wb = self.write_batch(); - let buf = Vec::with_capacity(1024); - wb.append_impl(raft_group_id, &entries, buf)?; - self.consume(&mut wb, false) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - self.put_msg(&keys::raft_state_key(raft_group_id), 
state) - } - fn batch_gc(&self, groups: Vec) -> Result { let mut total = 0; let mut raft_wb = self.write_batch_with_cap(4 * 1024); @@ -343,10 +332,6 @@ impl RaftEngine for RocksEngine { self.as_inner().path() } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - self.put_msg(keys::STORE_IDENT_KEY, ident) - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, @@ -374,10 +359,6 @@ impl RaftEngine for RocksEngine { Some(e) => Err(e), } } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - self.put_msg(keys::RECOVER_STATE_KEY, state) - } } impl RaftLogBatch for RocksWriteBatchVec { @@ -431,6 +412,10 @@ impl RaftLogBatch for RocksWriteBatchVec { fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { self.put_msg(&keys::apply_state_key(raft_group_id), state) } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) + } } impl RocksWriteBatchVec { diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 7df681c96d5..0c5e0f49854 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -98,15 +98,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send batch: &mut Self::LogBatch, ) -> Result<()>; - /// Append some log entries and return written bytes. - /// - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&self, raft_group_id: u64, entries: Vec) -> Result; - - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()>; - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; - /// Like `cut_logs` but the range could be very large. Return the deleted /// count. Generally, `from` can be passed in `0`. 
fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; @@ -151,12 +142,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send where F: FnMut(u64) -> std::result::Result<(), E>, E: From; - - /// Indicate whether region states should be recovered from raftdb and - /// replay raft logs. - /// When kvdb's write-ahead-log is disabled, the sequence number of the last - /// boot time is saved. - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()>; } pub trait RaftLogBatch: Send { @@ -175,6 +160,12 @@ pub trait RaftLogBatch: Send { fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()>; fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()>; + /// Indicate whether region states should be recovered from raftdb and + /// replay raft logs. + /// When kvdb's write-ahead-log is disabled, the sequence number of the last + /// boot time is saved. + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()>; + /// The data size of this RaftLogBatch. 
fn persist_size(&self) -> usize; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index c952f18dbc4..587f31bae93 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -412,6 +412,12 @@ impl RaftLogBatchTrait for RaftLogBatch { .put_message(raft_group_id, APPLY_STATE_KEY.to_vec(), state) .map_err(transfer_error) } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.0 + .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) + .map_err(transfer_error) + } } impl RaftEngineReadOnly for RaftLogEngine { @@ -538,26 +544,6 @@ impl RaftEngine for RaftLogEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut batch = Self::LogBatch::default(); - batch.append(raft_group_id, entries)?; - self.consume(&mut batch, false) - } - - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch.put_store_ident(ident)?; - self.consume(&mut batch, true)?; - Ok(()) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch.put_raft_state(raft_group_id, state)?; - self.consume(&mut batch, false)?; - Ok(()) - } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { self.batch_gc(vec![RaftLogGcTask { raft_group_id, @@ -626,16 +612,6 @@ impl RaftEngine for RaftLogEngine { } Ok(()) } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, true).map_err(transfer_error)?; - Ok(()) - } } fn transfer_error(e: RaftEngineError) -> engine_traits::Error { diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs index 
6700db4d45f..b505b37a75b 100644 --- a/components/raftstore-v2/src/bootstrap.rs +++ b/components/raftstore-v2/src/bootstrap.rs @@ -97,8 +97,9 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { let mut ident = StoreIdent::default(); ident.set_cluster_id(self.cluster_id); ident.set_store_id(id); - self.engine.put_store_ident(&ident)?; - self.engine.sync()?; + let mut lb = self.engine.log_batch(1); + lb.put_store_ident(&ident)?; + self.engine.consume(&mut lb, true)?; fail_point!("node_after_bootstrap_store", |_| Err(box_err!( "injected error: node_after_bootstrap_store" ))); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 0d10b1f36cf..ce25544bcd8 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -183,7 +183,9 @@ fn init_raft_state( raft_state.last_index = RAFT_INIT_LOG_INDEX; raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - engines.raft.put_raft_state(region.get_id(), &raft_state)?; + let mut lb = engines.raft.log_batch(0); + lb.put_raft_state(region.get_id(), &raft_state)?; + engines.raft.consume(&mut lb, true)?; } Ok(raft_state) } @@ -2077,32 +2079,35 @@ pub mod tests { let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); + let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. 
raft_state.set_last_index(11); - engines - .raft - .append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.append(1, entries).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); // Missing last log is invalid. raft_state.set_last_index(21); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); // applied_index > commit_index is invalid. 
let mut apply_state = RaftApplyState::default(); @@ -2132,7 +2137,8 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); engines.raft.gc(1, 0, 21).unwrap(); - engines.raft.append(1, entries).unwrap(); + lb.append(1, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); @@ -2143,27 +2149,28 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.append(1, entries).unwrap(); + lb.append(1, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // hard state term miss match is invalid. let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); + lb.append(1, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // last index < recorded_commit_index is invalid. 
engines.raft.gc(1, 0, 21).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - engines - .raft - .append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 19b9622657d..8cb44e3718c 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2006,8 +2006,9 @@ pub mod tests { raft::RaftTestEngine, }; use engine_traits::{ - Engines, ExternalSstFileInfo, KvEngine, RaftEngine, Snapshot as EngineSnapshot, SstExt, - SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + Engines, ExternalSstFileInfo, KvEngine, RaftEngine, RaftLogBatch, + Snapshot as EngineSnapshot, SstExt, SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ encryptionpb::EncryptionMethod, @@ -2114,6 +2115,7 @@ pub mod tests { let kv: KvTestEngine = open_test_db(p.join("kv").as_path(), kv_db_opt, kv_cf_opts)?; let raft: RaftTestEngine = engine_test::raft::new_engine(p.join("raft").to_str().unwrap(), raft_db_opt)?; + let mut lb = raft.log_batch(regions.len() * 128); for ®ion_id in regions { // Put apply state into kv engine. let mut apply_state = RaftApplyState::default(); @@ -2123,7 +2125,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - raft.append(region_id, vec![apply_entry])?; + lb.append(region_id, vec![apply_entry])?; // Put region info into kv engine. 
let region = gen_test_region(region_id, 1, 1); @@ -2131,6 +2133,7 @@ pub mod tests { region_state.set_region(region); kv.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)?; } + raft.consume(&mut lb, false).unwrap(); Ok(Engines::new(kv, raft)) } diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 29144c8ca18..637088efa88 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -193,11 +193,11 @@ fn run_dump_raft_engine_worker( new_engine: &RocksEngine, count_size: &Arc, ) { + let mut batch = new_engine.log_batch(0); while let Ok(id) = rx.recv() { let state = old_engine.get_raft_state(id).unwrap().unwrap(); - new_engine.put_raft_state(id, &state).unwrap(); + batch.put_raft_state(id, &state).unwrap(); if let Some(last_index) = old_engine.last_index(id) { - let mut batch = new_engine.log_batch(0); let mut begin = old_engine.first_index(id).unwrap(); while begin <= last_index { let end = std::cmp::min(begin + 1024, last_index + 1); @@ -210,6 +210,9 @@ fn run_dump_raft_engine_worker( count_size.fetch_add(size, Ordering::Relaxed); } } + if !batch.is_empty() { + new_engine.consume(&mut batch, false).unwrap(); + } } } diff --git a/src/server/debug.rs b/src/server/debug.rs index 48435f72163..666e2ca33e7 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -16,8 +16,8 @@ use engine_rocks::{ }; use engine_traits::{ Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, - RaftEngine, Range, RangePropertiesExt, SyncMutable, WriteBatch, WriteBatchExt, WriteOptions, - CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + RaftEngine, RaftLogBatch, Range, RangePropertiesExt, SyncMutable, WriteBatch, WriteBatchExt, + WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ debugpb::{self, Db as DbType}, @@ -735,7 +735,10 @@ impl Debugger { &keys::apply_state_key(region_id), &new_raft_apply_state )); - 
box_try!(raft.put_raft_state(region_id, &new_raft_local_state)); + let mut lb = raft.log_batch(0); + box_try!(lb.put_raft_state(region_id, &new_raft_local_state)); + // Will sync later. + box_try!(raft.consume(&mut lb, false)); let deleted_logs = box_try!(raft.gc(region_id, applied_index + 1, last_index + 1)); raft.sync().unwrap(); kv.sync().unwrap(); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 5c536fce124..496c587a7b9 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -11,8 +11,8 @@ use std::{ use api_version::{ApiV1, ApiV1Ttl, ApiV2, KvFormat}; use concurrency_manager::ConcurrencyManager; use engine_traits::{ - MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, SyncMutable, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, SyncMutable, CF_DEFAULT, + CF_LOCK, CF_RAFT, CF_WRITE, }; use futures::{executor::block_on, future, SinkExt, StreamExt, TryStreamExt}; use grpcio::*; @@ -965,7 +965,9 @@ fn test_debug_raft_log() { entry.set_index(log_index); entry.set_entry_type(eraftpb::EntryType::EntryNormal); entry.set_data(vec![42].into()); - engine.append(region_id, vec![entry.clone()]).unwrap(); + let mut lb = engine.log_batch(0); + lb.append(region_id, vec![entry.clone()]).unwrap(); + engine.consume(&mut lb, false).unwrap(); assert_eq!( engine.get_entry(region_id, log_index).unwrap().unwrap(), entry @@ -999,7 +1001,9 @@ fn test_debug_region_info() { let region_id = 100; let mut raft_state = raft_serverpb::RaftLocalState::default(); raft_state.set_last_index(42); - raft_engine.put_raft_state(region_id, &raft_state).unwrap(); + let mut lb = raft_engine.log_batch(0); + lb.put_raft_state(region_id, &raft_state).unwrap(); + raft_engine.consume(&mut lb, false).unwrap(); assert_eq!( raft_engine.get_raft_state(region_id).unwrap().unwrap(), raft_state From 044aa15c6bf1474d5c38d2c45311acc8cffe5b0a Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Eeden?= Date: Wed, 7 Dec 2022 06:30:04 +0100 Subject: [PATCH 390/676] *: Update sysinfo dependency (#13385) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ref tikv/tikv#11276 Signed-off-by: Daniël van Eeden Co-authored-by: Ti Chi Robot --- Cargo.lock | 24 ++++-------- Cargo.toml | 2 +- components/tikv_util/Cargo.toml | 2 +- components/tikv_util/src/sys/mod.rs | 4 +- src/server/service/diagnostics/mod.rs | 2 +- src/server/service/diagnostics/sys.rs | 55 +++++++++++++-------------- 6 files changed, 39 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 063657d29bb..eb5145959af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1443,12 +1443,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "doc-comment" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "923dea538cea0aa3025e8685b20d6ee21ef99c4f77e954a30febbaac5ec73a97" - [[package]] name = "dyn-clone" version = "1.0.4" @@ -3341,9 +3335,9 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26e041cd983acbc087e30fcba770380cfa352d0e392e175b2344ebaf7ea0602" +checksum = "bc51db7b362b205941f71232e56c625156eb9a929f8cf74a428fd5bc094a4afc" dependencies = [ "winapi 0.3.9", ] @@ -4511,9 +4505,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.0" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -4523,14 +4517,13 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.0" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils 0.8.8", - "lazy_static", "num_cpus", ] @@ -5651,13 +5644,12 @@ checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" [[package]] name = "sysinfo" -version = "0.16.4" +version = "0.26.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c280c91abd1aed2e36be1bc8f56fbc7a2acbb2b58fbcac9641510179cc72dd9" +checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "doc-comment", "libc 0.2.132", "ntapi", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index 104157fdf24..61d6da6946d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -153,7 +153,7 @@ smallvec = "1.4" sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } sync_wrapper = "0.1.1" -sysinfo = "0.16" +sysinfo = "0.26" tempfile = "3.0" thiserror = "1.0" tidb_query_aggr = { workspace = true } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 12c3983ef2d..663eb2b681f 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -50,7 +50,7 @@ slog-async = "2.3" slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } slog-json = "2.3" slog-term = "2.4" -sysinfo = "0.16" +sysinfo = "0.26" thiserror = "1.0" tikv_alloc = { workspace = true } time = "0.1" diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 8b5e846592f..35d417db650 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -19,7 +19,7 @@ use lazy_static::lazy_static; #[cfg(target_os = "linux")] use mnt::get_mount; use sysinfo::RefreshKind; -pub use sysinfo::{DiskExt, 
NetworkExt, ProcessExt, ProcessorExt, SystemExt}; +pub use sysinfo::{CpuExt, DiskExt, NetworkExt, ProcessExt, SystemExt}; use crate::config::{ReadableSize, KIB}; @@ -92,7 +92,7 @@ impl SysQuota { fn sysinfo_memory_limit_in_bytes() -> u64 { let system = sysinfo::System::new_with_specifics(RefreshKind::new().with_memory()); - system.get_total_memory() * KIB + system.total_memory() * KIB } } diff --git a/src/server/service/diagnostics/mod.rs b/src/server/service/diagnostics/mod.rs index abede000858..354108e6ab9 100644 --- a/src/server/service/diagnostics/mod.rs +++ b/src/server/service/diagnostics/mod.rs @@ -119,7 +119,7 @@ impl Diagnostics for Service { let load = ( sys::cpu_time_snapshot(), system - .get_networks() + .networks() .into_iter() .map(|(n, d)| (n.to_owned(), sys::NicSnapshot::from_network_data(d))) .collect(), diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 17ed9a78b3f..6e9585ab2c9 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -26,12 +26,12 @@ pub struct NicSnapshot { impl NicSnapshot { pub fn from_network_data(data: &impl NetworkExt) -> NicSnapshot { NicSnapshot { - rx_bytes: data.get_total_received(), - tx_bytes: data.get_total_transmitted(), - rx_packets: data.get_total_packets_received(), - tx_packets: data.get_total_packets_transmitted(), - rx_errors: data.get_total_errors_on_received(), - tx_errors: data.get_total_errors_on_transmitted(), + rx_bytes: data.total_received(), + tx_bytes: data.total_transmitted(), + rx_packets: data.total_packets_received(), + tx_packets: data.total_packets_transmitted(), + rx_errors: data.total_errors_on_received(), + tx_errors: data.total_errors_on_transmitted(), } } @@ -62,7 +62,7 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) let infos = { let mut system = SYS_INFO.lock().unwrap(); system.refresh_system(); - let load = system.get_load_average(); + let load = system.load_average(); vec![ 
("load1", load.one), ("load5", load.five), @@ -129,12 +129,12 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) fn mem_load_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_memory(); - let total_memory = system.get_total_memory() * KIB; - let used_memory = system.get_used_memory() * KIB; - let free_memory = system.get_free_memory() * KIB; - let total_swap = system.get_total_swap() * KIB; - let used_swap = system.get_used_swap() * KIB; - let free_swap = system.get_free_swap() * KIB; + let total_memory = system.total_memory() * KIB; + let used_memory = system.used_memory() * KIB; + let free_memory = system.free_memory() * KIB; + let total_swap = system.total_swap() * KIB; + let used_swap = system.used_swap() * KIB; + let free_swap = system.free_swap() * KIB; drop(system); let used_memory_pct = (used_memory as f64) / (total_memory as f64); let free_memory_pct = (free_memory as f64) / (total_memory as f64); @@ -182,7 +182,7 @@ fn nic_load_info(prev_nic: HashMap, collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_cpu(); - let processor = match system.get_processors().iter().next() { + let processor = match system.cpus().iter().next() { Some(p) => p, None => return, }; let mut infos = vec![ ("cpu-logical-cores", SysQuota::cpu_cores_quota().to_string()), ("cpu-physical-cores", num_cpus::get_physical().to_string()), - ("cpu-frequency", format!("{}MHz", processor.get_frequency())), - ("cpu-vendor-id", processor.get_vendor_id().to_string()), + ("cpu-frequency", format!("{}MHz", processor.frequency())), + ("cpu-vendor-id", processor.vendor_id().to_string()), ]; // Depend on Rust lib return CPU arch not matching // Golang lib so need this match loop to conversion @@ -362,26 +362,23 @@ fn disk_hardware_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_disks_list(); system.refresh_disks(); - let disks = system.get_disks(); + let disks = system.disks(); for 
disk in disks { - let file_sys = std::str::from_utf8(disk.get_file_system()).unwrap_or("unknown"); + let file_sys = std::str::from_utf8(disk.file_system()).unwrap_or("unknown"); if file_sys == "rootfs" { continue; } - let total = disk.get_total_space(); - let free = disk.get_available_space(); + let total = disk.total_space(); + let free = disk.available_space(); let used = total - free; let free_pct = (free as f64) / (total as f64); let used_pct = (used as f64) / (total as f64); let infos = vec![ - ("type", format!("{:?}", disk.get_type())), + ("type", format!("{:?}", disk.type_())), ("fstype", file_sys.to_string()), ( "path", - disk.get_mount_point() - .to_str() - .unwrap_or("unknown") - .to_string(), + disk.mount_point().to_str().unwrap_or("unknown").to_string(), ), ("total", total.to_string()), ("free", free.to_string()), @@ -398,7 +395,7 @@ fn disk_hardware_info(collector: &mut Vec) { } let mut item = ServerInfoItem::default(); item.set_tp("disk".to_string()); - item.set_name(disk.get_name().to_str().unwrap_or("disk").to_string()); + item.set_name(disk.name().to_str().unwrap_or("disk").to_string()); item.set_pairs(pairs.into()); collector.push(item); } @@ -515,7 +512,7 @@ fn get_transparent_hugepage() -> Option { pub fn process_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_processes(); - let processes = system.get_processes(); + let processes = system.processes(); for (pid, p) in processes.iter() { if p.cmd().is_empty() { continue; @@ -555,7 +552,7 @@ mod tests { system.refresh_networks_list(); system.refresh_all(); system - .get_networks() + .networks() .into_iter() .map(|(n, d)| (n.to_owned(), NicSnapshot::from_network_data(d))) .collect() From e9eb8c95f5660dbf979decc4739c27b5c9c55080 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Wed, 7 Dec 2022 17:28:04 +0800 Subject: [PATCH 391/676] backup: allow to backup during the flashback (#13895) ref tikv/tikv#13787, close pingcap/tidb#39639 - Allow to backup during the 
flashback by passing the flashback flag. - Allow the checksum request to get the snapshot during the flashback progress. Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- components/backup/src/endpoint.rs | 1 + components/raftstore/src/store/fsm/peer.rs | 4 ++- components/tikv_kv/src/lib.rs | 4 +-- components/txn_types/src/types.rs | 4 +-- src/coprocessor/endpoint.rs | 6 +++- src/coprocessor/mod.rs | 4 +++ src/server/raftkv/mod.rs | 4 +-- .../txn/commands/acquire_pessimistic_lock.rs | 2 +- .../txn/commands/flashback_to_version.rs | 2 +- src/storage/txn/commands/prewrite.rs | 2 +- src/storage/txn/scheduler.rs | 2 +- tests/integrations/backup/mod.rs | 30 ++++++++++++++++ .../integrations/raftstore/test_flashback.rs | 35 ++++++++++++++++++- 13 files changed, 87 insertions(+), 13 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index b880da7a3dc..0469ffa30a7 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -330,6 +330,7 @@ impl BackupRange { assert!(!ctx.get_replica_read()); let snap_ctx = SnapContext { pb_ctx: &ctx, + allowed_in_flashback: self.region.is_in_flashback, ..Default::default() }; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 62eadb97076..9460daf812d 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5108,7 +5108,9 @@ where // proposed. Skip the not prepared error because the // `self.region().is_in_flashback` may not be the latest right after applying // the `PrepareFlashback` admin command, we will let it pass here and check in - // the apply phase. + // the apply phase and because a read-only request doesn't need to be applied, + // so it will be allowed during the flashback progress, for example, a snapshot + // request. 
if let Err(e) = util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) { diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index f78b2243331..bf277282bd8 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -294,8 +294,8 @@ pub struct SnapContext<'a> { // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. pub key_ranges: Vec, - // Marks that this read is a FlashbackToVersionReadPhase. - pub for_flashback: bool, + // Marks that this snapshot request is allowed in the flashback state. + pub allowed_in_flashback: bool, } /// Engine defines the common behaviour for a storage engine type. diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 6a2c953afc1..60e64bf444a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -532,8 +532,8 @@ pub struct TxnExtra { // Marks that this transaction is a 1PC transaction. RaftKv should set this flag // in the raft command request. pub one_pc: bool, - // Marks that this transaction is a flashback transaction. - pub for_flashback: bool, + // Marks that this transaction is allowed in the flashback state. + pub allowed_in_flashback: bool, } impl TxnExtra { diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 3274700d812..54fcaeb0489 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -171,7 +171,7 @@ impl Endpoint { let mut input = CodedInputStream::from_bytes(&data); input.set_recursion_limit(self.recursion_limit); - let req_ctx: ReqContext; + let mut req_ctx: ReqContext; let builder: RequestHandlerBuilder; match req.get_tp() { @@ -316,6 +316,9 @@ impl Endpoint { cache_match_version, self.perf_level, ); + // Checksum is allowed during the flashback period to make sure the tool such + // like BR can work. 
+ req_ctx.allowed_in_flashback = true; with_tls_tracker(|tracker| { tracker.req_info.request_type = RequestType::CoprocessorChecksum; tracker.req_info.start_ts = start_ts; @@ -358,6 +361,7 @@ impl Endpoint { let mut snap_ctx = SnapContext { pb_ctx: &ctx.context, start_ts: Some(ctx.txn_start_ts), + allowed_in_flashback: ctx.allowed_in_flashback, ..Default::default() }; // need to pass start_ts and ranges to check memory locks for replica read diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 8acd5325a1e..140d3c0476e 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -142,6 +142,9 @@ pub struct ReqContext { /// Perf level pub perf_level: PerfLevel, + + /// Whether the request is allowed in the flashback state. + pub allowed_in_flashback: bool, } impl ReqContext { @@ -181,6 +184,7 @@ impl ReqContext { lower_bound, upper_bound, perf_level, + allowed_in_flashback: false, } } diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 6c7169d043c..b12e56ee7a0 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -453,7 +453,7 @@ where if txn_extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); } - if txn_extra.for_flashback { + if txn_extra.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); @@ -555,7 +555,7 @@ where flags |= WriteBatchFlags::STALE_READ.bits(); header.set_flag_data(data.into()); } - if ctx.for_flashback { + if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 6bd147cf02e..2afdadaad80 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -191,7 +191,7 @@ pub(super) fn make_write_data(modifies: Vec, old_values: OldValues) -> W old_values, // One pc status is unknown in AcquirePessimisticLock stage. 
one_pc: false, - for_flashback: false, + allowed_in_flashback: false, }; WriteData::new(modifies, extra) } else { diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 3999042fe27..72b100f567b 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -118,7 +118,7 @@ impl WriteCommand for FlashbackToVersion { let rows = txn.modifies.len(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); // To let the flashback modification could be proposed and applied successfully. - write_data.extra.for_flashback = true; + write_data.extra.allowed_in_flashback = true; // To let the CDC treat the flashback modification as an 1PC transaction. if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. }) { write_data.extra.one_pc = true; diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index cd24f54d13b..b34c4eb752b 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -672,7 +672,7 @@ impl Prewriter { old_values: self.old_values, // Set one_pc flag in TxnExtra to let CDC skip handling the resolver. one_pc: self.try_one_pc, - for_flashback: false, + allowed_in_flashback: false, }; // Here the lock guards are taken and will be released after the write finishes. // If an error (KeyIsLocked or WriteConflict) occurs before, these lock guards diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index bfbb860e545..13a74895803 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -702,7 +702,7 @@ impl Scheduler { Command::FlashbackToVersionReadPhase { .. } | Command::FlashbackToVersion { .. } ) { - snap_ctx.for_flashback = true; + snap_ctx.allowed_in_flashback = true; } // The program is currently in scheduler worker threads. // Safety: `self.inner.worker_pool` should ensure that a TLS engine exists. 
diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index f432fd72246..4cfd4be07be 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -598,3 +598,33 @@ fn calculated_commit_ts_after_commit() { commit_ts }); } + +#[test] +fn test_backup_in_flashback() { + let mut suite = TestSuite::new(3, 144 * 1024 * 1024, ApiVersion::V1); + suite.must_kv_put(3, 1); + // Prepare the flashback. + let region = suite.cluster.get_region(b"key_0"); + suite.cluster.must_send_wait_flashback_msg( + region.get_id(), + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + ); + // Start the backup. + let tmp = Builder::new().tempdir().unwrap(); + let backup_ts = suite.alloc_ts(); + let storage_path = make_unique_dir(tmp.path()); + let rx = suite.backup( + vec![], // start + vec![], // end + 0.into(), // begin_ts + backup_ts, + &storage_path, + ); + let resp = block_on(rx.collect::>()); + assert!(!resp[0].has_error()); + // Finish the flashback. + suite.cluster.must_send_wait_flashback_msg( + region.get_id(), + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + ); +} diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 7d0ec219534..e50ca59fdff 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -9,7 +9,7 @@ use futures::{channel::oneshot, executor::block_on}; use kvproto::{ errorpb::FlashbackInProgress, metapb, - raft_cmdpb::{AdminCmdType, RaftCmdResponse, Request}, + raft_cmdpb::{AdminCmdType, CmdType, RaftCmdResponse, Request}, }; use raftstore::store::Callback; use test_raftstore::*; @@ -18,6 +18,39 @@ use txn_types::WriteBatchFlags; const TEST_KEY: &[u8] = b"k1"; const TEST_VALUE: &[u8] = b"v1"; +#[test] +fn test_allow_read_only_request() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let mut region = 
cluster.get_region(TEST_KEY); + let mut snap_req = Request::default(); + snap_req.set_cmd_type(CmdType::Snap); + // Get snapshot normally. + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), false); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot with flashback flag without in the flashback state. + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot with flashback flag with in the flashback state. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot without flashback flag with in the flashback state. + let snap_resp = request(&mut cluster, &mut region, snap_req, false); + assert!( + snap_resp + .get_header() + .get_error() + .has_flashback_in_progress(), + "{:?}", + snap_resp + ); + // Finish flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); +} + #[test] #[cfg(feature = "failpoints")] fn test_read_after_prepare_flashback() { From d7096ebae6bbedeb3993cb276cf6420daf7f769a Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 7 Dec 2022 18:06:05 +0800 Subject: [PATCH 392/676] raftstore-v2: use snapshot to initialize split (#13886) ref tikv/tikv#12842 Create a new storage introduces unnecessary complexity and corner cases. As split is an initialization just like snapshot, this PR reuses snapshot to make the process a lot simpler and more robust. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/apply.rs | 2 +- components/raftstore-v2/src/fsm/peer.rs | 4 + .../src/operation/command/admin/split.rs | 202 ++++++++++-------- .../raftstore-v2/src/operation/command/mod.rs | 2 +- .../raftstore-v2/src/operation/query/lease.rs | 33 --- .../raftstore-v2/src/operation/query/mod.rs | 4 +- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../src/operation/ready/snapshot.rs | 49 ++++- components/raftstore-v2/src/raft/peer.rs | 16 +- components/raftstore-v2/src/raft/storage.rs | 52 +---- components/raftstore-v2/src/router/message.rs | 8 + 11 files changed, 194 insertions(+), 180 deletions(-) diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index b8faf589760..2aa42da2e42 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -37,7 +37,7 @@ pub trait ApplyResReporter { impl, S: FsmScheduler> ApplyResReporter for Mailbox { fn report(&self, apply_res: ApplyRes) { // TODO: check shutdown. 
- self.force_send(PeerMsg::ApplyRes(apply_res)).unwrap(); + let _ = self.force_send(PeerMsg::ApplyRes(apply_res)); } } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index cd93463a524..cf85522df90 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -221,6 +221,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), + PeerMsg::SplitInitFinish(region_id) => self + .fsm + .peer + .on_split_init_finish(self.store_ctx, region_id), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 2782b436439..0b97d726a2e 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,8 +25,9 @@ //! created by the store, and here init it using the data sent from the parent //! peer. 
-use std::collections::VecDeque; +use std::{cmp, collections::VecDeque}; +use collections::HashSet; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{ Checkpointer, DeleteStrategy, KvEngine, OpenOptions, RaftEngine, RaftLogBatch, Range, @@ -37,17 +38,18 @@ use keys::enc_end_key; use kvproto::{ metapb::{self, Region, RegionEpoch}, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, - raft_serverpb::RegionLocalState, + raft_serverpb::{RaftMessage, RaftSnapshotData, RegionLocalState}, }; use protobuf::Message; -use raft::RawNode; +use raft::{prelude::Snapshot, RawNode, INVALID_ID}; use raftstore::{ coprocessor::RegionChangeReason, store::{ fsm::apply::validate_batch_split, metrics::PEER_ADMIN_CMD_COUNTER, + snap::TABLET_SNAPSHOT_VERSION, util::{self, KeysInfoFormatter}, - PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, + PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, Result, }; @@ -69,16 +71,38 @@ pub struct SplitResult { pub derived_index: usize, pub tablet_index: u64, } + +#[derive(Debug)] pub struct SplitInit { /// Split region pub region: metapb::Region, pub check_split: bool, - pub parent_is_leader: bool, + pub scheduled: bool, + pub source_leader: bool, + pub source_id: u64, /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, } +impl SplitInit { + fn to_snapshot(&self) -> Snapshot { + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. + snapshot.mut_metadata().set_term(RAFT_INIT_LOG_TERM); + snapshot.mut_metadata().set_index(RAFT_INIT_LOG_INDEX); + let conf_state = util::conf_state_from_region(&self.region); + snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. 
+ let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(self.region.clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_balance(false); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot + } +} + impl Peer { pub fn propose_split( &mut self, @@ -263,7 +287,7 @@ impl Apply { } impl Peer { - pub fn on_ready_split_region( + pub fn on_apply_res_split( &mut self, store_ctx: &mut StoreContext, derived_index: usize, @@ -321,16 +345,20 @@ impl Peer { } let last_region_id = regions.last().unwrap().get_id(); + let mut new_ids = HashSet::default(); for (new_region, locks) in regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); if new_region_id == region_id { continue; } + new_ids.insert(new_region_id); let split_init = PeerMsg::SplitInit(Box::new(SplitInit { region: new_region, - parent_is_leader: self.is_leader(), + source_leader: self.is_leader(), + source_id: region_id, check_split: last_region_id == new_region_id, + scheduled: false, locks, })); @@ -353,108 +381,104 @@ impl Peer { _ => unreachable!(), } } + self.split_trace_mut().push((tablet_index, new_ids)); } pub fn on_split_init( &mut self, store_ctx: &mut StoreContext, - split_init: Box, + mut split_init: Box, ) { let region_id = split_init.region.id; - let replace = split_init.region.get_region_epoch().get_version() - > self - .storage() - .region_state() - .get_region() - .get_region_epoch() - .get_version(); - - if !self.storage().is_initialized() || replace { - let split_temp_path = store_ctx.tablet_factory.tablet_path_with_prefix( - SPLIT_PREFIX, - region_id, - RAFT_INIT_LOG_INDEX, - ); - - let tablet = store_ctx - .tablet_factory - .load_tablet(&split_temp_path, region_id, RAFT_INIT_LOG_INDEX) - .unwrap_or_else(|e| { - panic!( - "{:?} fails to load tablet {:?} :{:?}", - self.logger.list(), - split_temp_path, - e - ) - }); - - self.tablet_mut().set(tablet); - - let storage = Storage::with_split( - 
self.peer().get_store_id(), - &split_init.region, - store_ctx.engine.clone(), - store_ctx.read_scheduler.clone(), - &store_ctx.logger, - ) - .unwrap_or_else(|e| panic!("fail to create storage: {:?}", e)) - .unwrap(); - - let applied_index = storage.apply_state().get_applied_index(); - let peer_id = storage.peer().get_id(); - let raft_cfg = store_ctx.cfg.new_raft_config(peer_id, applied_index); - - let mut raft_group = RawNode::new(&raft_cfg, storage, &self.logger).unwrap(); - // If this region has only one peer and I am the one, campaign directly. - if split_init.region.get_peers().len() == 1 { - raft_group.campaign().unwrap(); - self.set_has_ready(); - } - self.set_raft_group(raft_group); - } else { - // TODO: when reaching here (peer is initalized before and cannot be replaced), - // it is much complexer. + if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX { + let _ = store_ctx + .router + .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); return; } - { - let mut meta = store_ctx.store_meta.lock().unwrap(); + if self.storage().is_initialized() || self.raft_group().snap().is_some() { + // It accepts a snapshot already but not finish applied yet. 
+ let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + return; + } - info!( - self.logger, - "init split region"; - "region" => ?split_init.region, + split_init.scheduled = true; + let snap = split_init.to_snapshot(); + let mut msg = raft::eraftpb::Message::default(); + msg.set_to(self.peer_id()); + msg.set_from(self.leader_id()); + msg.set_msg_type(raft::eraftpb::MessageType::MsgSnapshot); + msg.set_snapshot(snap); + msg.set_term(cmp::max(self.term(), RAFT_INIT_LOG_TERM)); + let res = self.raft_group_mut().step(msg); + let accept_snap = self.raft_group().snap().is_some(); + if res.is_err() || !accept_snap { + panic!( + "{:?} failed to accept snapshot {:?} with error {}", + self.logger.list(), + res, + accept_snap ); + } + let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + self.set_has_ready(); + } - // TODO: GlobalReplicationState - - for p in split_init.region.get_peers() { - self.insert_peer_cache(p.clone()); - } - - if split_init.parent_is_leader { - if self.maybe_campaign() { - self.set_has_ready(); - } - - *self.txn_ext().pessimistic_locks.write() = split_init.locks; - // The new peer is likely to become leader, send a heartbeat immediately to - // reduce client query miss. 
- self.region_heartbeat_pd(store_ctx); - } + pub fn post_split_init( + &mut self, + store_ctx: &mut StoreContext, + split_init: Box, + ) { + if split_init.source_leader + && self.leader_id() == INVALID_ID + && self.term() == RAFT_INIT_LOG_TERM + { + let _ = self.raft_group_mut().campaign(); + self.set_has_ready(); - meta.tablet_caches.insert(region_id, self.tablet().clone()); - meta.readers - .insert(region_id, self.generate_read_delegate()); - meta.region_read_progress - .insert(region_id, self.read_progress().clone()); + *self.txn_ext().pessimistic_locks.write() = split_init.locks; + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. + self.region_heartbeat_pd(store_ctx); } + let region_id = self.region_id(); if split_init.check_split { // TODO: check if the last region needs to split again } + let _ = store_ctx + .router + .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); + } - self.schedule_apply_fsm(store_ctx); + pub fn on_split_init_finish(&mut self, ctx: &mut StoreContext, region_id: u64) { + let mut found = false; + for (tablet_index, ids) in self.split_trace_mut() { + if ids.remove(®ion_id) { + found = true; + break; + } + } + assert!(found, "{:?} {}", self.logger.list(), region_id); + let split_trace = self.split_trace_mut(); + let mut off = 0; + let mut admin_flushed = 0; + for (tablet_index, ids) in split_trace.iter() { + if !ids.is_empty() { + break; + } + admin_flushed = *tablet_index; + off += 1; + } + if off > 0 { + // There should be very few elements in the vector. + split_trace.drain(..off); + // Persist admin flushed. 
+ self.set_has_ready(); + } } } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 7e69a3f1c7c..bd175ef7a4d 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -305,7 +305,7 @@ impl Peer { regions, derived_index, tablet_index, - }) => self.on_ready_split_region(ctx, derived_index, tablet_index, regions), + }) => self.on_apply_res_split(ctx, derived_index, tablet_index, regions), } } diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 114080bcdbb..bbff28b272f 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -175,45 +175,12 @@ impl Peer { self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { - // TODO: remove it - self.add_reader_if_necessary(store_meta); - let mut meta = store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); } } - // TODO: remove this block of code when snapshot is done; add the logic into - // on_persist_snapshot. - pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &Mutex>) { - let mut meta = store_meta.lock().unwrap(); - // TODO: remove this block of code when snapshot is done; add the logic into - // on_persist_snapshot. 
- let reader = meta.readers.get_mut(&self.region_id()); - if reader.is_none() { - let region = self.region().clone(); - let region_id = region.get_id(); - let peer_id = self.peer_id(); - let delegate = ReadDelegate { - region: Arc::new(region), - peer_id, - term: self.term(), - applied_term: self.entry_storage().applied_term(), - leader_lease: None, - last_valid_ts: Timespec::new(0, 0), - tag: format!("[region {}] {}", region_id, peer_id), - read_progress: self.read_progress().clone(), - pending_remove: false, - bucket_meta: None, - txn_extra_op: Default::default(), - txn_ext: Default::default(), - track_ver: TrackVer::new(), - }; - meta.readers.insert(self.region_id(), delegate); - } - } - pub(crate) fn maybe_update_read_progress( &self, reader: &mut ReadDelegate, diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 77ca7b90074..3a3052ab902 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -388,7 +388,7 @@ impl Peer { // V2 doesn't persist commit index and term, fill them with in-memory values. 
meta.raft_apply.commit_index = cmp::min( self.raft_group().raft.raft_log.committed, - self.raft_group().raft.raft_log.persisted, + self.persisted_index(), ); meta.raft_apply.commit_term = self .raft_group() @@ -426,8 +426,6 @@ impl Peer { if progress_to_be_updated && self.is_leader() { // TODO: add coprocessor_host hook let progress = ReadProgress::applied_term(applied_term); - // TODO: remove it - self.add_reader_if_necessary(&ctx.store_meta); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 1c8c9d80338..6a91c25f1f6 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -403,7 +403,7 @@ impl Peer { let persisted_number = self.async_writer.persisted_number(); self.raft_group_mut().on_persist_ready(persisted_number); - let persisted_index = self.raft_group().raft.raft_log.persisted; + let persisted_index = self.persisted_index(); /// The apply snapshot process order would be: /// - Get the snapshot from the ready /// - Wait for async writer to load this tablet diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 32e8a3f8ff8..4cd4b5265d8 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -21,20 +21,20 @@ use std::{ borrow::BorrowMut, fmt::{self, Debug}, - mem, + fs, mem, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, mpsc, Arc, }, }; -use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory, SPLIT_PREFIX}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; use protobuf::Message; use 
raft::eraftpb::Snapshot; use raftstore::store::{ metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, + TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, }; use slog::{error, info, warn}; use tikv_util::{box_err, box_try, worker::Scheduler}; @@ -120,7 +120,7 @@ impl Peer { } pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { - let persisted_index = self.raft_group().raft.raft_log.persisted; + let persisted_index = self.persisted_index(); let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { let region_id = self.region_id(); @@ -132,9 +132,26 @@ impl Peer { self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); + { + let mut meta = ctx.store_meta.lock().unwrap(); + meta.tablet_caches.insert(region_id, self.tablet().clone()); + meta.readers + .insert(region_id, self.generate_read_delegate()); + meta.region_read_progress + .insert(region_id, self.read_progress().clone()); + } self.read_progress_mut() .update_applied_core(persisted_index); - info!(self.logger, "apply tablet snapshot completely"); + let split = self.storage_mut().split_init_mut().take(); + if split.as_ref().map_or(true, |s| { + !s.scheduled || persisted_index != RAFT_INIT_LOG_INDEX + }) { + info!(self.logger, "apply tablet snapshot completely"); + } + if let Some(init) = split { + info!(self.logger, "init with snapshot finished"); + self.post_split_init(ctx, init); + } } } } @@ -381,8 +398,18 @@ impl Storage { self.entry_storage_mut().set_truncated_term(last_term); self.entry_storage_mut().set_last_term(last_term); - let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); - let mut path = snap_mgr.final_recv_path(&key); + let (path, clean_split) = match self.split_init_mut() { + // If index not match, the peer may accept a newer snapshot after split. 
+ Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => ( + tablet_factory.tablet_path_with_prefix(SPLIT_PREFIX, region_id, last_index), + false, + ), + si => { + let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); + (snap_mgr.final_recv_path(&key), si.is_some()) + } + }; + let logger = self.logger().clone(); // The snapshot require no additional processing such as ingest them to DB, but // it should load it into the factory after it persisted. @@ -395,6 +422,14 @@ impl Storage { e ); } + if clean_split { + let path = tablet_factory.tablet_path_with_prefix( + SPLIT_PREFIX, + region_id, + RAFT_INIT_LOG_INDEX, + ); + let _ = fs::remove_dir_all(path); + } }; task.persisted_cb = (Some(Box::new(hook))); task.has_snapshot = true; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index a9730a036e7..16e3e54d5f2 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -6,7 +6,7 @@ use std::{ time::{Duration, Instant}, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; @@ -88,6 +88,9 @@ pub struct Peer { /// Check whether this proposal can be proposed based on its epoch. proposal_control: ProposalControl, + + // Trace which peers have not finished split. + split_trace: Vec<(u64, HashSet)>, } impl Peer { @@ -161,6 +164,7 @@ impl Peer { txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), + split_trace: vec![], }; // If this region has only one peer and I am the one, campaign directly. 
@@ -330,6 +334,11 @@ impl Peer { self.raft_group = raft_group; } + #[inline] + pub fn persisted_index(&self) -> u64 { + self.raft_group.raft.raft_log.persisted + } + #[inline] pub fn self_stat(&self) -> &PeerStat { &self.self_stat @@ -600,4 +609,9 @@ impl Peer { self.update_max_timestamp_pd(ctx, initial_status); } + + #[inline] + pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { + &mut self.split_trace + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b3ad56af4fd..369a25984bf 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -22,7 +22,7 @@ use slog::{info, o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; use crate::{ - operation::{GenSnapTask, SnapState}, + operation::{GenSnapTask, SnapState, SplitInit}, Result, }; @@ -69,6 +69,7 @@ pub struct Storage { /// Snapshot part. snap_state: RefCell, gen_snap_task: RefCell>>, + split_init: Option>, } impl Debug for Storage { @@ -202,49 +203,6 @@ impl Storage { .map(Some) } - /// Creates a new storage for split peer. - /// - /// Except for region local state which uses the `region` provided with the - /// inital tablet index, all uses the inital states. 
- pub fn with_split( - store_id: u64, - region: &metapb::Region, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result>> { - let mut region_state = RegionLocalState::default(); - region_state.set_region(region.clone()); - region_state.set_state(PeerState::Normal); - region_state.set_tablet_index(RAFT_INIT_LOG_INDEX); - - let mut apply_state = RaftApplyState::default(); - apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_term(RAFT_INIT_LOG_TERM); - - let mut raft_state = RaftLocalState::default(); - raft_state.set_last_index(RAFT_INIT_LOG_INDEX); - raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); - raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - - Self::create( - store_id, - region_state, - raft_state, - apply_state, - engine, - read_scheduler, - true, - logger, - ) - .map(Some) - } - fn create( store_id: u64, region_state: RegionLocalState, @@ -281,6 +239,7 @@ impl Storage { logger, snap_state: RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(Box::new(None)), + split_init: None, }) } @@ -289,6 +248,11 @@ impl Storage { &mut self.region_state } + #[inline] + pub fn split_init_mut(&mut self) -> &mut Option> { + &mut self.split_init + } + #[inline] pub fn raft_state(&self) -> &RaftLocalState { self.entry_storage.raft_state() diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a4681d8a873..13037bd1a26 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -132,6 +132,7 @@ pub enum PeerMsg { Start, /// Messages from peer to peer in the same store SplitInit(Box), + SplitInitFinish(u64), /// A message only used to notify a peer. Noop, /// A message that indicates an asynchronous write has finished. 
@@ -173,6 +174,13 @@ impl fmt::Debug for PeerMsg { PeerMsg::SplitInit(_) => { write!(fmt, "Split initialization") } + PeerMsg::SplitInitFinish(region_id) => { + write!( + fmt, + "Split initialization finished from region {}", + region_id + ) + } PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { peer_id, From 0d5a292a6d62bb29cd686e7b91538f6c44040866 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 7 Dec 2022 19:44:04 +0800 Subject: [PATCH 393/676] *: always share block cache (#13903) close tikv/tikv#12936 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- cmd/tikv-ctl/src/executor.rs | 7 +- components/engine_rocks/src/engine.rs | 12 +- components/engine_rocks/src/rocks_metrics.rs | 28 +-- components/server/src/raft_engine_switch.rs | 3 +- components/server/src/server.rs | 50 ++--- components/snap_recovery/src/init_cluster.rs | 9 +- .../test_raftstore/src/common-test.toml | 1 - components/test_raftstore/src/util.rs | 7 +- components/tikv_kv/src/rocksdb_engine.rs | 7 +- etc/config-template.toml | 8 +- src/config.rs | 188 ++++++------------ src/server/engine_factory.rs | 30 ++- src/server/engine_factory_v2.rs | 59 ++---- src/storage/config.rs | 12 +- src/storage/config_manager.rs | 6 - src/storage/kv/test_engine_builder.rs | 3 +- src/storage/mod.rs | 6 +- tests/failpoints/cases/test_storage.rs | 3 +- tests/integrations/config/mod.rs | 3 +- tests/integrations/config/test-custom.toml | 1 - 20 files changed, 133 insertions(+), 310 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index b2d25a32d5b..1c42d728ca9 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -62,7 +62,6 @@ pub fn new_debug_executor( .map(Arc::new); let cache = cfg.storage.block_cache.build_shared_cache(); - let shared_block_cache = cache.is_some(); let env = cfg .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); @@ -75,11 +74,10 @@ pub fn new_debug_executor( .build_cf_opts(&cache, None, 
cfg.storage.api_version()); let kv_path = PathBuf::from(kv_path).canonicalize().unwrap(); let kv_path = kv_path.to_str().unwrap(); - let mut kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { + let kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - kv_db.set_shared_block_cache(shared_block_cache); let cfg_controller = ConfigController::default(); if !cfg.raft_engine.enable { @@ -91,11 +89,10 @@ pub fn new_debug_executor( error!("raft db not exists: {}", raft_path); tikv_util::logger::exit_process_gracefully(-1); } - let mut raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - raft_db.set_shared_block_cache(shared_block_cache); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } else { diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 41066c85756..720a92a8bdd 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -25,7 +25,6 @@ use crate::{ #[derive(Clone, Debug)] pub struct RocksEngine { db: Arc, - shared_block_cache: bool, support_multi_batch_write: bool, } @@ -37,7 +36,6 @@ impl RocksEngine { pub fn from_db(db: Arc) -> Self { RocksEngine { db: db.clone(), - shared_block_cache: false, support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), } } @@ -50,14 +48,6 @@ impl RocksEngine { self.db.clone() } - pub fn set_shared_block_cache(&mut self, enable: bool) { - self.shared_block_cache = enable; - } - - pub fn shared_block_cache(&self) -> bool { - self.shared_block_cache - } - pub fn support_multi_batch_write(&self) -> bool { self.support_multi_batch_write } @@ -95,7 +85,7 @@ impl KvEngine for RocksEngine { } } } - flush_engine_properties(&self.db, instance, 
self.shared_block_cache); + flush_engine_properties(&self.db, instance); flush_engine_iostall_properties(&self.db, instance); } diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 4a88c6675ed..026ef36cce7 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -931,7 +931,7 @@ pub fn flush_engine_iostall_properties(engine: &DB, name: &str) { } } -pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool) { +pub fn flush_engine_properties(engine: &DB, name: &str) { for cf in engine.cf_names() { let handle = crate::util::get_cf_handle(engine, cf).unwrap(); // It is important to monitor each cf's size, especially the "raft" and "lock" @@ -941,13 +941,6 @@ pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool .with_label_values(&[name, cf]) .set(cf_used_size as i64); - if !shared_block_cache { - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(block_cache_usage as i64); - } - let blob_cache_usage = engine.get_blob_cache_usage_cf(handle); STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC .with_label_values(&[name, cf]) @@ -1110,15 +1103,13 @@ pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool .set(d as i64); } - if shared_block_cache { - // Since block cache is shared, getting cache size from any CF is fine. Here we - // get from default CF. - let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, "all"]) - .set(block_cache_usage as i64); - } + // Since block cache is shared, getting cache size from any CF is fine. Here we + // get from default CF. 
+ let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); + let block_cache_usage = engine.get_block_cache_usage_cf(handle); + STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[name, "all"]) + .set(block_cache_usage as i64); } // For property metrics @@ -1627,8 +1618,7 @@ mod tests { flush_engine_histogram_metrics(*tp, HistogramData::default(), "kv"); } - let shared_block_cache = false; - flush_engine_properties(engine.as_inner(), "kv", shared_block_cache); + flush_engine_properties(engine.as_inner(), "kv"); let handle = engine.as_inner().cf_handle("default").unwrap(); let info = engine .as_inner() diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 637088efa88..ba489f1be0f 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -237,6 +237,7 @@ mod tests { cfg.raft_store.raftdb_path = raftdb_path.to_str().unwrap().to_owned(); cfg.raftdb.wal_dir = raftdb_wal_path.to_str().unwrap().to_owned(); cfg.raft_engine.mut_config().dir = raft_engine_path.to_str().unwrap().to_owned(); + let cache = cfg.storage.block_cache.build_shared_cache(); // Dump logs from RocksEngine to RaftLogEngine. 
let raft_engine = RaftLogEngine::new( @@ -251,7 +252,7 @@ mod tests { let raftdb = engine_rocks::util::new_engine_opt( &cfg.raft_store.raftdb_path, cfg.raftdb.build_opt(), - cfg.raftdb.build_cf_opts(&None), + cfg.raftdb.build_cf_opts(&cache), ) .unwrap(); let mut batch = raftdb.log_batch(0); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index b52abc960d8..e93b18fed96 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -807,7 +807,6 @@ where tikv::config::Module::Storage, Box::new(StorageConfigManger::new( self.tablet_factory.as_ref().unwrap().clone(), - self.config.storage.block_cache.shared, ttl_scheduler, flow_controller, storage.get_scheduler(), @@ -1676,14 +1675,9 @@ where } pub trait ConfiguredRaftEngine: RaftEngine { - fn build( - _: &TikvConfig, - _: &Arc, - _: &Option>, - _: &Option, - ) -> Self; + fn build(_: &TikvConfig, _: &Arc, _: &Option>, _: &Cache) -> Self; fn as_rocks_engine(&self) -> Option<&RocksEngine>; - fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool); + fn register_config(&self, _cfg_controller: &mut ConfigController); } impl ConfiguredRaftEngine for T { @@ -1691,14 +1685,14 @@ impl ConfiguredRaftEngine for T { _: &TikvConfig, _: &Arc, _: &Option>, - _: &Option, + _: &Cache, ) -> Self { unimplemented!() } default fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } - default fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} } impl ConfiguredRaftEngine for RocksEngine { @@ -1706,7 +1700,7 @@ impl ConfiguredRaftEngine for RocksEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, + block_cache: &Cache, ) -> Self { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, @@ -1720,10 +1714,8 @@ impl ConfiguredRaftEngine for RocksEngine { let mut raft_db_opts = 
config_raftdb.build_opt(); raft_db_opts.set_env(env.clone()); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let mut raftdb = - engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - raftdb.set_shared_block_cache(block_cache.is_some()); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); if should_dump { let raft_engine = @@ -1741,14 +1733,10 @@ impl ConfiguredRaftEngine for RocksEngine { Some(self) } - fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { + fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new( - Arc::new(self.clone()), - DbType::Raft, - share_cache, - )), + Box::new(DbConfigManger::new(Arc::new(self.clone()), DbType::Raft)), ); } } @@ -1758,7 +1746,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, + block_cache: &Cache, ) -> Self { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, @@ -1812,16 +1800,13 @@ impl TikvServer { ); // Create kv engine. 
- let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) + let builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path, block_cache) .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(self.router.clone()), })) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - if let Some(cache) = block_cache { - builder = builder.block_cache(cache); - } let factory = Arc::new(builder.build()); let kv_engine = factory .create_shared_db() @@ -1831,16 +1816,10 @@ impl TikvServer { let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new( - factory.clone(), - DbType::Kv, - self.config.storage.block_cache.shared, - )), + Box::new(DbConfigManger::new(factory.clone(), DbType::Kv)), ); self.tablet_factory = Some(factory.clone()); - engines - .raft - .register_config(cfg_controller, self.config.storage.block_cache.shared); + engines.raft.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( factory, @@ -2129,8 +2108,9 @@ mod test { config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); let env = Arc::new(Env::default()); let path = Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config.storage.block_cache.build_shared_cache(); - let builder = KvEngineFactoryBuilder::new(env, &config, path.path()); + let builder = KvEngineFactoryBuilder::new(env, &config, path.path(), cache); let factory = builder.build_v2(); for i in 1..6 { diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index fe6c559da27..08a45073309 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -316,14 +316,11 @@ pub fn create_local_engine_service( let db_path = config 
.infer_kv_engine_path(None) .map_err(|e| format!("infer kvdb path: {}", e))?; - let mut kv_db = match new_engine_opt(&db_path, db_opts, cf_opts) { + let kv_db = match new_engine_opt(&db_path, db_opts, cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - let shared_block_cache = block_cache.is_some(); - kv_db.set_shared_block_cache(shared_block_cache); - // init raft engine, either is rocksdb or raft engine if !config.raft_engine.enable { // rocksdb @@ -333,12 +330,10 @@ pub fn create_local_engine_service( let raft_path = config .infer_raft_db_path(None) .map_err(|e| format!("infer raftdb path: {}", e))?; - let mut raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - // let mut raft_db = RocksEngine::from_db(Arc::new(raft_db)); - raft_db.set_shared_block_cache(shared_block_cache); let local_engines = LocalEngines::new(Engines::new(kv_db, raft_db)); Ok(Box::new(local_engines) as Box) diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index a121a6c1e0e..334291f7213 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -34,7 +34,6 @@ scheduler-concurrency = 10 scheduler-worker-pool-size = 1 [storage.block-cache] -shared = true capacity = "64MB" [pd] diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 64bdca19025..c6b70fa24f0 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -596,11 +596,8 @@ pub fn create_test_engine( let raft_engine = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); - let mut builder = - KvEngineFactoryBuilder::new(env, &cfg, dir.path()).sst_recovery_sender(Some(scheduler)); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + let mut builder = 
KvEngineFactoryBuilder::new(env, &cfg, dir.path(), cache) + .sst_recovery_sender(Some(scheduler)); if let Some(router) = router { builder = builder.compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(router), diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 26e2c735254..065766ae254 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -114,7 +114,6 @@ impl RocksEngine { path: &str, db_opts: Option, cfs_opts: Vec<(CfName, RocksCfOptions)>, - shared_block_cache: bool, io_rate_limiter: Option>, ) -> Result { info!("RocksEngine: creating for path"; "path" => path); @@ -134,11 +133,7 @@ impl RocksEngine { let db = engine_rocks::util::new_engine_opt(&path, db_opts, cfs_opts)?; // It does not use the raft_engine, so it is ok to fill with the same // rocksdb. - let mut kv_engine = db.clone(); - let mut raft_engine = db; - kv_engine.set_shared_block_cache(shared_block_cache); - raft_engine.set_shared_block_cache(shared_block_cache); - let engines = Engines::new(kv_engine, raft_engine); + let engines = Engines::new(db.clone(), db); let sched = worker.start("engine-rocksdb", Runner(engines.clone())); Ok(RocksEngine { sched, diff --git a/etc/config-template.toml b/etc/config-template.toml index a2b3ab13b00..3ddbb6fc879 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -271,17 +271,11 @@ ## Set to 0 to disable this feature if you want to panic immediately when encountering such an error. # background-error-recovery-window = "1h" -[storage.block-cache] -## Whether to create a shared block cache for all RocksDB column families. -## ## Block cache is used by RocksDB to cache uncompressed blocks. Big block cache can speed up read. ## It is recommended to turn on shared block cache. Since only the total cache size need to be ## set, it is easier to config. 
In most cases it should be able to auto-balance cache usage ## between column families with standard LRU algorithm. -## -## The rest of config in the storage.block-cache session is effective only when shared block cache -## is on. -# shared = true +[storage.block-cache] ## Size of the shared block cache. Normally it should be tuned to 30%-50% of system's total memory. ## When the config is not set, it is decided by the sum of the following fields or their default diff --git a/src/config.rs b/src/config.rs index e9eca154d6e..97bab103af2 100644 --- a/src/config.rs +++ b/src/config.rs @@ -27,8 +27,7 @@ use engine_rocks::{ properties::MvccPropertiesCollectorFactory, raw::{ BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, - DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, LRUCacheOptions, - PrepopulateBlockCache, + DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, PrepopulateBlockCache, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, @@ -503,17 +502,11 @@ macro_rules! write_into_metrics { } macro_rules! 
build_cf_opt { - ($opt:ident, $cf_name:ident, $cache:ident, $region_info_provider:ident) => {{ + ($opt:ident, $cf_name:ident, $cache:expr, $region_info_provider:ident) => {{ let mut block_base_opts = BlockBasedOptions::new(); block_base_opts.set_block_size($opt.block_size.0 as usize); block_base_opts.set_no_block_cache($opt.disable_block_cache); - if let Some(cache) = $cache { - block_base_opts.set_block_cache(cache); - } else { - let mut cache_opts = LRUCacheOptions::new(); - cache_opts.set_capacity($opt.block_cache_size.0 as usize); - block_base_opts.set_block_cache(&Cache::new_lru_cache(cache_opts)); - } + block_base_opts.set_block_cache($cache); block_base_opts.set_cache_index_and_filter_blocks($opt.cache_index_and_filter_blocks); block_base_opts .set_pin_l0_filter_and_index_blocks_in_cache($opt.pin_l0_filter_and_index_blocks); @@ -664,7 +657,7 @@ impl Default for DefaultCfConfig { impl DefaultCfConfig { pub fn build_opt( &self, - cache: &Option, + cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, ) -> RocksCfOptions { @@ -780,7 +773,7 @@ impl Default for WriteCfConfig { impl WriteCfConfig { pub fn build_opt( &self, - cache: &Option, + cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!(self, CF_WRITE, cache, region_info_accessor); @@ -876,7 +869,7 @@ impl Default for LockCfConfig { } impl LockCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_LOCK, cache, no_region_info_accessor); cf_opts @@ -952,7 +945,7 @@ impl Default for RaftCfConfig { } impl RaftCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = 
build_cf_opt!(self, CF_RAFT, cache, no_region_info_accessor); cf_opts @@ -1192,7 +1185,7 @@ impl DbConfig { pub fn build_cf_opts( &self, - cache: &Option, + cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, ) -> Vec<(&'static str, RocksCfOptions)> { @@ -1328,7 +1321,7 @@ impl Default for RaftDefaultCfConfig { } impl RaftDefaultCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, no_region_info_accessor); let f = FixedPrefixSliceTransform::new(region_raft_prefix_len()); @@ -1483,7 +1476,7 @@ impl RaftDbConfig { opts } - pub fn build_cf_opts(&self, cache: &Option) -> Vec<(&'static str, RocksCfOptions)> { + pub fn build_cf_opts(&self, cache: &Cache) -> Vec<(&'static str, RocksCfOptions)> { vec![(CF_DEFAULT, self.defaultcf.build_opt(cache))] } @@ -1549,15 +1542,13 @@ pub enum DbType { pub struct DbConfigManger> { tablet_accessor: Arc, db_type: DbType, - shared_block_cache: bool, } impl> DbConfigManger { - pub fn new(tablet_accessor: Arc, db_type: DbType, shared_block_cache: bool) -> Self { + pub fn new(tablet_accessor: Arc, db_type: DbType) -> Self { DbConfigManger { tablet_accessor, db_type, - shared_block_cache, } } @@ -1595,33 +1586,6 @@ impl> DbConfigManger { Ok(()) } - fn set_block_cache_size(&self, cf: &str, size: ReadableSize) -> Result<(), Box> { - self.validate_cf(cf)?; - if self.shared_block_cache { - return Err("shared block cache is enabled, change cache size through \ - block-cache.capacity in storage module instead" - .into()); - } - // for multi-rocks, shared block cache has to be enabled and thus should - // shortcut in the above if statement. 
- assert!(self.tablet_accessor.is_single_engine()); - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let r = db - .get_options_cf(cf) - .and_then(|opt| opt.set_block_cache_capacity(size.0)); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } - }); - // Write config to metric - CONFIG_ROCKSDB_GAUGE - .with_label_values(&[cf, "block_cache_size"]) - .set(size.0 as f64); - error_collector.take_result() - } - fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> Result<(), Box> { let mut error_collector = TabletErrorCollector::new(); self.tablet_accessor @@ -1710,9 +1674,11 @@ impl + Send + Sync> ConfigManager for DbConfigMan if let ConfigValue::Module(mut cf_change) = cf_change { // defaultcf -> default let cf_name = &cf_name[..(cf_name.len() - 2)]; - if let Some(v) = cf_change.remove("block_cache_size") { + if cf_change.remove("block_cache_size").is_some() { // currently we can't modify block_cache_size via set_options_cf - self.set_block_cache_size(cf_name, v.into())?; + return Err("shared block cache is enabled, change cache size through \ + block-cache.capacity in storage module instead" + .into()); } if let Some(ConfigValue::Module(titan_change)) = cf_change.remove("titan") { for (name, value) in titan_change { @@ -3228,20 +3194,11 @@ impl TikvConfig { } } else { // Adjust `memory_usage_limit` if necessary. 
- if self.storage.block_cache.shared { - if let Some(cap) = self.storage.block_cache.capacity { - let limit = (cap.0 as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; - self.memory_usage_limit = Some(ReadableSize(limit)); - } else { - self.memory_usage_limit = Some(Self::suggested_memory_usage_limit()); - } - } else { - let cap = self.rocksdb.defaultcf.block_cache_size.0 - + self.rocksdb.writecf.block_cache_size.0 - + self.rocksdb.lockcf.block_cache_size.0 - + self.raftdb.defaultcf.block_cache_size.0; - let limit = (cap as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; + if let Some(cap) = self.storage.block_cache.capacity { + let limit = (cap.0 as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; self.memory_usage_limit = Some(ReadableSize(limit)); + } else { + self.memory_usage_limit = Some(Self::suggested_memory_usage_limit()); } } @@ -3407,7 +3364,7 @@ impl TikvConfig { // individual block cache sizes. Otherwise use the sum of block cache // size of all column families as the shared cache size. 
let cache_cfg = &mut self.storage.block_cache; - if cache_cfg.shared && cache_cfg.capacity.is_none() { + if cache_cfg.capacity.is_none() { cache_cfg.capacity = Some(ReadableSize( self.rocksdb.defaultcf.block_cache_size.0 + self.rocksdb.writecf.block_cache_size.0 @@ -4061,6 +4018,7 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; + use engine_rocks::raw::LRUCacheOptions; use engine_traits::{CfOptions as _, DbOptions as _, DummyFactory}; use futures::executor::block_on; use grpcio::ResourceQuota; @@ -4487,7 +4445,6 @@ mod tests { None, cfg.storage.api_version(), ), - true, None, ) .unwrap(); @@ -4504,21 +4461,16 @@ mod tests { rx, ))); - let (shared, cfg_controller) = (cfg.storage.block_cache.shared, ConfigController::new(cfg)); + let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new( - Arc::new(engine.clone()), - DbType::Kv, - shared, - )), + Box::new(DbConfigManger::new(Arc::new(engine.clone()), DbType::Kv)), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( Arc::new(DummyFactory::new(Some(engine), "".to_string())), - shared, scheduler, flow_controller.clone(), storage.get_scheduler(), @@ -4651,7 +4603,6 @@ mod tests { cfg.rocksdb.defaultcf.block_cache_size = ReadableSize::mb(8); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; - cfg.storage.block_cache.shared = false; cfg.validate().unwrap(); let (storage, cfg_controller, ..) 
= new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -4690,7 +4641,6 @@ mod tests { let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); assert_eq!(cf_opts.get_target_file_size_base(), ReadableSize::mb(64).0); - assert_eq!(cf_opts.get_block_cache_capacity(), ReadableSize::mb(8).0); let mut change = HashMap::new(); change.insert( @@ -4701,22 +4651,11 @@ mod tests { "rocksdb.defaultcf.target-file-size-base".to_owned(), "32MB".to_owned(), ); - change.insert( - "rocksdb.defaultcf.block-cache-size".to_owned(), - "256MB".to_owned(), - ); cfg_controller.update(change).unwrap(); let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), true); assert_eq!(cf_opts.get_target_file_size_base(), ReadableSize::mb(32).0); - assert_eq!(cf_opts.get_block_cache_capacity(), ReadableSize::mb(256).0); - - // Can not update block cache through storage module - // when shared block cache is disabled - cfg_controller - .update_config("storage.block-cache.capacity", "512MB") - .unwrap_err(); } #[test] @@ -4746,7 +4685,6 @@ mod tests { #[test] fn test_change_shared_block_cache() { let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); - cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (storage, cfg_controller, ..) = new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -4813,7 +4751,6 @@ mod tests { #[test] fn test_change_ttl_check_poll_interval() { let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); - cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (_, cfg_controller, mut rx, _) = new_engines::(cfg); @@ -5118,50 +5055,47 @@ mod tests { #[test] fn test_compaction_guard() { + let cache = Cache::new_lru_cache(LRUCacheOptions::new()); // Test comopaction guard disabled. 
- { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: false, - ..Default::default() - }; - let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.target_file_size_base.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: false, + ..Default::default() + }; + let provider = Some(MockRegionInfoProvider::new(vec![])); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + assert_eq!( + config.target_file_size_base.0, + cf_opts.get_target_file_size_base() + ); + // Test compaction guard enabled but region info provider is missing. - { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, - ..Default::default() - }; - let provider: Option = None; - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.target_file_size_base.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: true, + ..Default::default() + }; + let provider: Option = None; + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + assert_eq!( + config.target_file_size_base.0, + cf_opts.get_target_file_size_base() + ); + // Test compaction guard enabled. 
- { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, - compaction_guard_min_output_file_size: ReadableSize::mb(4), - compaction_guard_max_output_file_size: ReadableSize::mb(64), - ..Default::default() - }; - let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.compaction_guard_max_output_file_size.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: true, + compaction_guard_min_output_file_size: ReadableSize::mb(4), + compaction_guard_max_output_file_size: ReadableSize::mb(64), + ..Default::default() + }; + let provider = Some(MockRegionInfoProvider::new(vec![])); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + assert_eq!( + config.compaction_guard_max_output_file_size.0, + cf_opts.get_target_file_size_base() + ); } #[test] diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 7e8a1457500..7e36efcb98f 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -24,7 +24,7 @@ use crate::config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}; struct FactoryInner { env: Arc, region_info_accessor: Option, - block_cache: Option, + block_cache: Cache, rocksdb_config: Arc, store_path: PathBuf, api_version: ApiVersion, @@ -39,12 +39,17 @@ pub struct KvEngineFactoryBuilder { } impl KvEngineFactoryBuilder { - pub fn new(env: Arc, config: &TikvConfig, store_path: impl Into) -> Self { + pub fn new( + env: Arc, + config: &TikvConfig, + store_path: impl Into, + cache: Cache, + ) -> Self { Self { inner: FactoryInner { env, region_info_accessor: None, - block_cache: None, + block_cache: cache, rocksdb_config: Arc::new(config.rocksdb.clone()), store_path: store_path.into(), api_version: config.storage.api_version(), @@ -61,11 +66,6 @@ impl 
KvEngineFactoryBuilder { self } - pub fn block_cache(mut self, cache: Cache) -> Self { - self.inner.block_cache = Some(cache); - self - } - pub fn flow_listener(mut self, listener: FlowListener) -> Self { self.inner.flow_listener = Some(listener); self @@ -158,16 +158,10 @@ impl KvEngineFactory { kv_db_opts, kv_cfs_opts, ); - let mut kv_engine = match kv_engine { - Ok(e) => e, - Err(e) => { - error!("failed to create kv engine"; "path" => %tablet_path.display(), "err" => ?e); - return Err(e); - } - }; - let shared_block_cache = self.inner.block_cache.is_some(); - kv_engine.set_shared_block_cache(shared_block_cache); - Ok(kv_engine) + if let Err(e) = &kv_engine { + error!("failed to create kv engine"; "path" => %tablet_path.display(), "err" => ?e); + } + kv_engine } pub fn on_tablet_created(&self, region_id: u64, suffix: u64) { diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index f370a08e280..a55ebca6555 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -237,7 +237,8 @@ impl TabletAccessor for KvEngineFactoryV2 { #[cfg(test)] mod tests { - use engine_traits::{OpenOptions, TabletFactory, CF_WRITE, SPLIT_PREFIX}; + use engine_traits::{OpenOptions, CF_WRITE, SPLIT_PREFIX}; + use tempfile::TempDir; use super::*; use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; @@ -257,18 +258,19 @@ mod tests { }; } - #[test] - fn test_kvengine_factory() { + fn create_test_tablet_factory(name: &'static str) -> (TempDir, KvEngineFactoryBuilder) { let cfg = TEST_CONFIG.clone(); - assert!(cfg.storage.block_cache.shared); let cache = cfg.storage.block_cache.build_shared_cache(); - let dir = test_util::temp_dir("test_kvengine_factory", false); + let dir = test_util::temp_dir(name, false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); - let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + let builder = 
KvEngineFactoryBuilder::new(env, &cfg, dir.path(), cache); + (dir, builder) + } + + #[test] + fn test_kvengine_factory() { + let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory"); let factory = builder.build(); let shared_db = factory.create_shared_db().unwrap(); @@ -307,16 +309,7 @@ mod tests { #[test] fn test_kvengine_factory_root_db_implicit_creation() { - let cfg = TEST_CONFIG.clone(); - assert!(cfg.storage.block_cache.shared); - let cache = cfg.storage.block_cache.build_shared_cache(); - let dir = test_util::temp_dir("test_kvengine_factory", false); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory"); let factory = builder.build(); // root_db should be created implicitly here @@ -346,16 +339,7 @@ mod tests { #[test] fn test_kvengine_factory_v2() { - let cfg = TEST_CONFIG.clone(); - assert!(cfg.storage.block_cache.shared); - let cache = cfg.storage.block_cache.build_shared_cache(); - let dir = test_util::temp_dir("test_kvengine_factory_v2", false); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory_v2"); let factory = builder.build_v2(); let tablet = factory @@ -443,16 +427,7 @@ mod tests { #[test] fn test_existed_db_not_in_registry() { - let cfg = TEST_CONFIG.clone(); - assert!(cfg.storage.block_cache.shared); - let cache = cfg.storage.block_cache.build_shared_cache(); - let dir = test_util::temp_dir("test_kvengine_factory_v2", false); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); - if let 
Some(cache) = cache { - builder = builder.block_cache(cache); - } + let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory_v2"); let factory = builder.build_v2(); let tablet = factory @@ -493,11 +468,7 @@ mod tests { #[test] fn test_get_live_tablets() { - let cfg = TEST_CONFIG.clone(); - let dir = test_util::temp_dir("test_get_live_tablets", false); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path()); + let (_dir, builder) = create_test_tablet_factory("test_get_live_tablets"); let factory = builder.build_v2(); factory .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) diff --git a/src/storage/config.rs b/src/storage/config.rs index 313f86ba048..3501cefa252 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -194,7 +194,7 @@ impl Default for FlowControlConfig { #[serde(rename_all = "kebab-case")] pub struct BlockCacheConfig { #[online_config(skip)] - pub shared: bool, + pub shared: Option, pub capacity: Option, #[online_config(skip)] pub num_shard_bits: i32, @@ -209,7 +209,7 @@ pub struct BlockCacheConfig { impl Default for BlockCacheConfig { fn default() -> BlockCacheConfig { BlockCacheConfig { - shared: true, + shared: None, capacity: None, num_shard_bits: 6, strict_capacity_limit: false, @@ -229,9 +229,9 @@ impl BlockCacheConfig { } } - pub fn build_shared_cache(&self) -> Option { - if !self.shared { - return None; + pub fn build_shared_cache(&self) -> Cache { + if self.shared == Some(false) { + warn!("storage.block-cache.shared is deprecated, cache is always shared."); } let capacity = match self.capacity { None => { @@ -248,7 +248,7 @@ impl BlockCacheConfig { if let Some(allocator) = self.new_memory_allocator() { cache_opts.set_memory_allocator(allocator); } - Some(Cache::new_lru_cache(cache_opts)) + Cache::new_lru_cache(cache_opts) } fn new_memory_allocator(&self) -> Option { diff --git a/src/storage/config_manager.rs 
b/src/storage/config_manager.rs index de3b13408f0..3cda77ab5a2 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -21,7 +21,6 @@ use crate::{ pub struct StorageConfigManger { tablet_factory: Arc + Send + Sync>, - shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, @@ -33,14 +32,12 @@ unsafe impl Sync for StorageConfigManger impl StorageConfigManger { pub fn new( tablet_factory: Arc + Send + Sync>, - shared_block_cache: bool, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, ) -> Self { StorageConfigManger { tablet_factory, - shared_block_cache, ttl_checker_scheduler, flow_controller, scheduler, @@ -51,9 +48,6 @@ impl StorageConfigManger { impl ConfigManager for StorageConfigManger { fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { if let Some(ConfigValue::Module(mut block_cache)) = change.remove("block_cache") { - if !self.shared_block_cache { - return Err("shared block cache is disabled".into()); - } if let Some(size) = block_cache.remove("capacity") { if size != ConfigValue::None { let s: ReadableSize = size.into(); diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index f0192372e4b..f02ee31c5f2 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -110,8 +110,7 @@ impl TestEngineBuilder { _ => (*cf, RocksCfOptions::default()), }) .collect(); - let engine = - RocksEngine::new(&path, None, cfs_opts, cache.is_some(), self.io_rate_limiter)?; + let engine = RocksEngine::new(&path, None, cfs_opts, self.io_rate_limiter)?; Ok(engine) } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 6f06e55937f..79f48c68a88 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4146,11 +4146,7 @@ mod tests { (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), ]; RocksEngine::new( - &path, - None, - cfs_opts, - cache.is_some(), - None, // io_rate_limiter 
+ &path, None, cfs_opts, None, // io_rate_limiter ) } .unwrap(); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 40c78dfabde..2ea66ef1222 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -262,13 +262,12 @@ fn test_scale_scheduler_pool() { rx, ))); - let cfg_controller = ConfigController::new(cfg.clone()); + let cfg_controller = ConfigController::new(cfg); let (scheduler, _receiver) = dummy_scheduler(); cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( Arc::new(DummyFactory::new(Some(kv_engine), "".to_string())), - cfg.storage.block_cache.shared, scheduler, flow_controller, storage.get_scheduler(), diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index ff01788c370..6341f3a9e27 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -684,7 +684,7 @@ fn test_serde_custom_tikv_config() { hard_pending_compaction_bytes_limit: ReadableSize(1), }, block_cache: BlockCacheConfig { - shared: true, + shared: None, capacity: Some(ReadableSize::gb(40)), num_shard_bits: 10, strict_capacity_limit: true, @@ -886,7 +886,6 @@ fn test_do_not_use_unified_readpool_with_legacy_config() { fn test_block_cache_backward_compatible() { let content = read_file_in_project_dir("integrations/config/test-cache-compatible.toml"); let mut cfg: TikvConfig = toml::from_str(&content).unwrap(); - assert!(cfg.storage.block_cache.shared); assert!(cfg.storage.block_cache.capacity.is_none()); cfg.compatible_adjust(); assert!(cfg.storage.block_cache.capacity.is_some()); diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index e5c896238bc..f22538a6f78 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -102,7 +102,6 @@ enable-ttl = true ttl-check-poll-interval = "0s" [storage.block-cache] -shared = true capacity = 
"40GB" num-shard-bits = 10 strict-capacity-limit = true From 3122786dddb7f85732a5515e1a367adc8865c33c Mon Sep 17 00:00:00 2001 From: hehechen Date: Thu, 8 Dec 2022 11:34:04 +0800 Subject: [PATCH 394/676] resolved_ts: pass full safe_ts timestamp to observer (#13897) close tikv/tikv#13896 Signed-off-by: hehechen Co-authored-by: Xinye Tao --- components/raftstore/src/store/util.rs | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index df5f4543f76..41409a49448 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -35,7 +35,7 @@ use tikv_util::{ Either, }; use time::{Duration, Timespec}; -use txn_types::{TimeStamp, WriteBatchFlags}; +use txn_types::WriteBatchFlags; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; @@ -1186,11 +1186,7 @@ impl RegionReadProgress { if !core.pause { self.safe_ts.store(ts, AtomicOrdering::Release); // No need to update leader safe ts here. - coprocessor.on_update_safe_ts( - core.region_id, - TimeStamp::new(ts).physical(), - INVALID_TIMESTAMP, - ) + coprocessor.on_update_safe_ts(core.region_id, ts, INVALID_TIMESTAMP) } } } @@ -1232,11 +1228,7 @@ impl RegionReadProgress { self.safe_ts.store(ts, AtomicOrdering::Release); // After region merge, self safe ts may decrease, so leader safe ts should be // reset. 
- coprocessor.on_update_safe_ts( - core.region_id, - TimeStamp::new(ts).physical(), - TimeStamp::new(ts).physical(), - ) + coprocessor.on_update_safe_ts(core.region_id, ts, ts) } } } @@ -1261,9 +1253,7 @@ impl RegionReadProgress { } } } - let self_phy_ts = TimeStamp::new(self.safe_ts()).physical(); - let leader_phy_ts = TimeStamp::new(rs.get_safe_ts()).physical(); - coprocessor.on_update_safe_ts(leader_info.region_id, self_phy_ts, leader_phy_ts) + coprocessor.on_update_safe_ts(leader_info.region_id, self.safe_ts(), rs.get_safe_ts()) } // whether the provided `LeaderInfo` is same as ours core.leader_info.leader_term == leader_info.term From 3e0b8ddc6ab5fdf80afcce5884021c2015204256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 8 Dec 2022 14:30:04 +0800 Subject: [PATCH 395/676] gc_worker: disable gc if have negative ratio (#13904) close tikv/tikv#13909, ref pingcap/tidb#39602 GC would be skipped once the `ratio_threshold` is negative or infinity. Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- src/server/gc_worker/compaction_filter.rs | 16 ++++++++++++++++ src/server/gc_worker/mod.rs | 16 ++++++++++++++++ tests/failpoints/cases/test_table_properties.rs | 2 +- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index bd5896296bb..4c494d6f01f 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -685,6 +685,15 @@ pub fn check_need_gc( context: &CompactionFilterContext, ) -> bool { let check_props = |props: &MvccProperties| -> (bool, bool /* skip_more_checks */) { + // Disable GC directly once the config is negative or +inf. + // Disabling GC is useful in some abnormal scenarios where the transaction model + // would be break (e.g. 
writes with higher commit TS would be written BEFORE + // writes with lower commit TS, or write data with TS lower than current GC safe + // point). Use this at your own risk. + if ratio_threshold.is_sign_negative() || ratio_threshold.is_infinite() { + return (false, false); + } + if props.min_ts > safe_point { return (false, false); } @@ -970,6 +979,13 @@ pub mod tests { let default_key = Key::from_encoded_slice(b"zkey").append_ts(100.into()); let default_key = default_key.into_encoded(); assert!(raw_engine.get_value(&default_key).unwrap().is_none()); + + // If the ratio threshold is less than 0, GC would be skipped. + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 210); + must_commit(&mut engine, b"zkey", 210, 220); + gc_runner.ratio_threshold = Some(-1.0); + gc_runner.safe_point(256).gc(&raw_engine); + must_get(&mut engine, b"zkey", 210, &value); } // Test dirty versions before a deletion mark can be handled correctly. diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index a5b8837cd2e..75b7441fbcb 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -26,6 +26,14 @@ pub use crate::storage::{Callback, Error, ErrorInner, Result}; // Returns true if it needs gc. // This is for optimization purpose, does not mean to be accurate. fn check_need_gc(safe_point: TimeStamp, ratio_threshold: f64, props: &MvccProperties) -> bool { + // Disable GC directly once the config is negative or +inf. + // Disabling GC is useful in some abnormal scenarios where the transaction model + // would be break (e.g. writes with higher commit TS would be written BEFORE + // writes with lower commit TS, or write data with TS lower than current GC safe + // point). Use this at your own risk. + if ratio_threshold.is_sign_negative() || ratio_threshold.is_infinite() { + return false; + } // Always GC. 
if ratio_threshold < 1.0 { return true; @@ -77,6 +85,14 @@ mod tests { props } + #[test] + fn test_check_need_gc() { + let props = MvccProperties::default(); + assert!(!check_need_gc(TimeStamp::max(), -1.0, &props)); + assert!(!check_need_gc(TimeStamp::max(), f64::INFINITY, &props)); + assert!(check_need_gc(TimeStamp::max(), 0.9, &props)); + } + #[test] fn test_need_gc() { let path = tempfile::Builder::new() diff --git a/tests/failpoints/cases/test_table_properties.rs b/tests/failpoints/cases/test_table_properties.rs index 905bcfbd690..536149d48b5 100644 --- a/tests/failpoints/cases/test_table_properties.rs +++ b/tests/failpoints/cases/test_table_properties.rs @@ -91,7 +91,7 @@ fn test_check_need_gc() { // Set ratio_threshold, let (props.num_versions as f64 > props.num_rows as // f64 * ratio_threshold) return true - gc_runner.ratio_threshold = Option::Some(f64::MIN); + gc_runner.ratio_threshold = Option::Some(0.0f64); // is_bottommost_level = false do_gc(&raw_engine, 1, &mut gc_runner, &dir); From daa0c8f47ec11e1e4fc5acd543a113b95a6c3551 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Thu, 8 Dec 2022 22:26:04 +0800 Subject: [PATCH 396/676] =?UTF-8?q?log-backup=EF=BC=9ARetry=20to=20get=20t?= =?UTF-8?q?asks=20with=20etcd-cli=20from=20etcd=20when=20TiKV=20starts.=20?= =?UTF-8?q?(#13907)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#13898 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 4 +- components/backup-stream/src/endpoint.rs | 55 ++++++++++++++++++- .../backup-stream/src/metadata/client.rs | 5 ++ components/backup-stream/src/metadata/mod.rs | 2 +- .../src/metadata/store/lazy_etcd.rs | 7 ++- components/backup-stream/src/metadata/test.rs | 4 +- src/import/sst_service.rs | 2 +- 7 files changed, 70 insertions(+), 9 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs 
b/components/backup-stream/src/checkpoint_manager.rs index e316b6e05c3..8c3de3d34ce 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -458,7 +458,7 @@ where } #[cfg(test)] -mod tests { +pub mod tests { use std::{ assert_matches, collections::HashMap, @@ -510,7 +510,7 @@ mod tests { assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } - struct MockPdClient { + pub struct MockPdClient { safepoint: RwLock>, } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 2ebeee2ea66..ec6b0dd41fb 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -273,7 +273,22 @@ where meta_client: MetadataClient, scheduler: Scheduler, ) -> Result<()> { - let tasks = meta_client.get_tasks().await?; + let tasks; + loop { + let r = meta_client.get_tasks().await; + match r { + Ok(t) => { + tasks = t; + break; + } + Err(e) => { + e.report("failed to get backup stream task"); + tokio::time::sleep(Duration::from_secs(5)).await; + continue; + } + } + } + for task in tasks.inner { info!("backup stream watch task"; "task" => ?task); if task.is_paused { @@ -1165,3 +1180,41 @@ where self.run_task(task) } } + +#[cfg(test)] +mod test { + use engine_rocks::RocksEngine; + use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; + use test_raftstore::MockRaftStoreRouter; + use tikv_util::worker::dummy_scheduler; + + use crate::{ + checkpoint_manager::tests::MockPdClient, endpoint, endpoint::Endpoint, metadata::test, Task, + }; + + #[tokio::test] + async fn test_start() { + let cli = test::test_meta_cli(); + let (sched, mut rx) = dummy_scheduler(); + let task = test::simple_task("simple_3"); + cli.insert_task_with_range(&task, &[]).await.unwrap(); + + fail::cfg("failed_to_get_tasks", "1*return").unwrap(); + Endpoint::<_, MockRegionInfoProvider, RocksEngine, 
MockRaftStoreRouter, MockPdClient>::start_and_watch_tasks(cli, sched).await.unwrap(); + fail::remove("failed_to_get_tasks"); + + let _t1 = rx.recv().unwrap(); + let t2 = rx.recv().unwrap(); + + match t2 { + Task::WatchTask(t) => match t { + endpoint::TaskOp::AddTask(t) => { + assert_eq!(t.info, task.info); + assert!(!t.is_paused); + } + _ => panic!("not match TaskOp type"), + }, + _ => panic!("not match Task type {:?}", t2), + } + } +} diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index b7f1fcb2025..2c0fd2577fc 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -353,6 +353,11 @@ impl MetadataClient { defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_fetch"]).observe(now.saturating_elapsed().as_secs_f64()) } + fail::fail_point!("failed_to_get_tasks", |_| { + Err(Error::MalformedMetadata( + "faild to connect etcd client".to_string(), + )) + }); let snap = self.meta_store.snapshot().await?; let kvs = snap.get(Keys::Prefix(MetaKey::tasks())).await?; diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index 4c387533e49..a616ace2dc6 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -4,7 +4,7 @@ mod client; pub mod keys; mod metrics; pub mod store; -mod test; +pub mod test; pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 6fc3a5332ea..88d44b09252 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -4,7 +4,10 @@ use std::{sync::Arc, time::Duration}; use etcd_client::{ConnectOptions, 
Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use tikv_util::stream::{RetryError, RetryExt}; +use tikv_util::{ + info, + stream::{RetryError, RetryExt}, +}; use tokio::sync::OnceCell; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; @@ -113,7 +116,7 @@ where use futures::TryFutureExt; let r = tikv_util::stream::retry_ext( move || action().err_into::(), - RetryExt::default().with_fail_hook(|err| println!("meet error {:?}", err)), + RetryExt::default().with_fail_hook(|err| info!("retry it"; "err" => ?err)), ) .await; r.map_err(|err| err.0.into()) diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index ec2a30efbf3..a57722089bf 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -16,11 +16,11 @@ use crate::{ metadata::{store::SlashEtcStore, MetadataEvent}, }; -fn test_meta_cli() -> MetadataClient { +pub fn test_meta_cli() -> MetadataClient { MetadataClient::new(SlashEtcStore::default(), 42) } -fn simple_task(name: &str) -> StreamTask { +pub fn simple_task(name: &str) -> StreamTask { let mut task = StreamTask::default(); task.info.set_name(name.to_owned()); task.info.set_start_ts(1); diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 283f8f802e3..bdb552e8923 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1064,7 +1064,7 @@ where Box::new(move |k: Vec, v: Vec| { // Need to skip the empty key/value that could break the transaction or cause // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. 
- if k.is_empty() || v.is_empty() { + if k.is_empty() || (!is_delete && v.is_empty()) { return; } From 56ed2f62dcf668f6b2ac720002d9d05dc0671fcf Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 9 Dec 2022 12:48:05 +0800 Subject: [PATCH 397/676] metrics: fix coprocessor cpu alert (#10878) close tikv/tikv#13918 Signed-off-by: tabokie Signed-off-by: Xinye Tao --- metrics/alertmanager/tikv.rules.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index 9b25637d14f..19f8085866e 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -253,17 +253,17 @@ groups: value: '{{ $value }}' summary: TiKV pending {{ $labels.type }} request is high - - alert: TiKV_batch_request_snapshot_nums - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 + - alert: TiKV_coprocessor_cpu_util + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1 annotations: description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' - summary: TiKV batch request snapshot nums is high + summary: TiKV coprocessor CPU utilization exceeds 90% - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) 
BY (instance,name) > 1000 From 97585fb86f1437438ff7e5514965d48a427bb94c Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 8 Dec 2022 21:52:04 -0800 Subject: [PATCH 398/676] Improve readability through syntax sugar (#13899) ref tikv/tikv#13908 Nit: use lib API to improve readability Signed-off-by: Yang Zhang Co-authored-by: Xinye Tao --- components/raftstore/src/store/fsm/peer.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 9460daf812d..47c9357e1c4 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5028,13 +5028,12 @@ where // ReadIndex can be processed on the replicas. let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; - let mut read_only = true; - for r in msg.get_requests() { - match r.get_cmd_type() { - CmdType::Get | CmdType::Snap | CmdType::ReadIndex => (), - _ => read_only = false, - } - } + let read_only = msg.get_requests().iter().all(|r| { + matches!( + r.get_cmd_type(), + CmdType::Get | CmdType::Snap | CmdType::ReadIndex, + ) + }); let region_id = self.region_id(); let allow_replica_read = read_only && msg.get_header().get_replica_read(); let flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); From 0921ad0d3b6be791f067e9a45b74cffccf9d5810 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 9 Dec 2022 15:22:04 +0800 Subject: [PATCH 399/676] *: clean up tablet factory (#13912) ref tikv/tikv#12842 In the past, there are 5 types of tablet factory: - Dummy factory - Test Factory for v1 and v2 - Production factory for v1 and v2. Tablet factory also requires TabletAccessor trait, so there is 10 implementations. If there are bugs, we need to change at least 5 places, and 10 places in worst cases. 
This PR simplifies the code by limiting the scope of engine factory, so it only needs to provide create, destroy and exist check three functions. All other functions are moved to type `TabletRegistry`. Also `TabletFactory` is only for v2 usage, so there will be only 3 different tablet factories and no accessor trait. It should be a lot easier to adjust the behavior of tablet factory. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/engine_panic/src/engine.rs | 12 +- components/engine_rocks/src/engine.rs | 14 +- components/engine_test/src/lib.rs | 315 +---------- components/engine_traits/Cargo.toml | 1 + components/engine_traits/src/engine.rs | 343 +----------- components/engine_traits/src/lib.rs | 2 + components/engine_traits/src/tablet.rs | 398 ++++++++++++++ components/raftstore-v2/src/batch/store.rs | 30 +- components/raftstore-v2/src/fsm/apply.rs | 9 +- components/raftstore-v2/src/fsm/peer.rs | 6 +- components/raftstore-v2/src/fsm/store.rs | 28 +- components/raftstore-v2/src/lib.rs | 1 - .../src/operation/command/admin/mod.rs | 2 +- .../src/operation/command/admin/split.rs | 59 +-- .../raftstore-v2/src/operation/command/mod.rs | 5 +- components/raftstore-v2/src/operation/life.rs | 2 +- components/raftstore-v2/src/operation/pd.rs | 2 +- .../raftstore-v2/src/operation/query/lease.rs | 2 +- .../raftstore-v2/src/operation/query/local.rs | 65 ++- .../raftstore-v2/src/operation/ready/mod.rs | 4 +- .../src/operation/ready/snapshot.rs | 34 +- components/raftstore-v2/src/raft/apply.rs | 17 +- components/raftstore-v2/src/raft/peer.rs | 29 +- components/raftstore-v2/src/raft/storage.rs | 36 +- components/raftstore-v2/src/router/imp.rs | 10 +- components/raftstore-v2/src/tablet.rs | 102 ---- components/raftstore-v2/src/worker/pd/mod.rs | 8 +- .../src/worker/pd/store_heartbeat.rs | 4 +- .../tests/failpoints/test_basic_write.rs | 10 +- .../tests/integrations/cluster.rs | 61 +-- .../tests/integrations/test_basic_write.rs | 12 +- 
.../tests/integrations/test_conf_change.rs | 12 +- .../tests/integrations/test_life.rs | 20 +- .../tests/integrations/test_pd_heartbeat.rs | 2 +- .../tests/integrations/test_read.rs | 12 +- .../tests/integrations/test_split.rs | 10 +- .../tests/integrations/test_status.rs | 2 +- components/server/src/server.rs | 89 ++-- components/test_raftstore/src/util.rs | 9 +- src/config/configurable.rs | 141 +++++ src/{config.rs => config/mod.rs} | 146 ++---- src/server/engine_factory.rs | 261 ++++------ src/server/engine_factory_v2.rs | 487 ------------------ src/server/mod.rs | 1 - src/storage/config_manager.rs | 38 +- .../flow_controller/tablet_flow_controller.rs | 121 +++-- tests/failpoints/cases/test_storage.rs | 3 +- 48 files changed, 1027 insertions(+), 1951 deletions(-) create mode 100644 components/engine_traits/src/tablet.rs delete mode 100644 components/raftstore-v2/src/tablet.rs create mode 100644 src/config/configurable.rs rename src/{config.rs => config/mod.rs} (97%) delete mode 100644 src/server/engine_factory_v2.rs diff --git a/Cargo.lock b/Cargo.lock index eb5145959af..a7d72121032 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1626,6 +1626,7 @@ name = "engine_traits" version = "0.0.1" dependencies = [ "case_macros", + "collections", "error_code", "fail", "file_system", diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index a296c3df9d8..6bca7d46485 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -2,7 +2,7 @@ use engine_traits::{ IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SyncMutable, - TabletAccessor, WriteOptions, + WriteOptions, }; use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; @@ -24,16 +24,6 @@ impl KvEngine for PanicEngine { } } -impl TabletAccessor for PanicEngine { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &PanicEngine)) { - panic!() - } - - fn is_single_engine(&self) 
-> bool { - panic!() - } -} - impl Peekable for PanicEngine { type DbVector = PanicDbVector; diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 720a92a8bdd..0c37120e7fc 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,9 +2,7 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{ - IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, TabletAccessor, -}; +use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ @@ -99,16 +97,6 @@ impl KvEngine for RocksEngine { } } -impl TabletAccessor for RocksEngine { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - f(0, 0, self); - } - - fn is_single_engine(&self) -> bool { - true - } -} - impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 77bd2d3be7c..605feedc7bd 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -76,12 +76,8 @@ pub mod raft { /// Types and constructors for the "kv" engine pub mod kv { - use std::{ - path::{Path, PathBuf}, - sync::{Arc, Mutex}, - }; + use std::path::Path; - use collections::HashMap; #[cfg(feature = "test-engine-kv-panic")] pub use engine_panic::{ PanicEngine as KvTestEngine, PanicEngineIterator as KvTestEngineIterator, @@ -92,11 +88,7 @@ pub mod kv { RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; - use engine_traits::{ - CfOptions, CfOptionsExt, MiscExt, OpenOptions, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, - }; - use tikv_util::box_err; + use engine_traits::{MiscExt, Result, TabletFactory}; use crate::ctor::{CfOptions as KvTestCfOptions, DbOptions, KvEngineConstructorExt}; @@ -112,317 +104,40 
@@ pub mod kv { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } - const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; + const TOMBSTONE_SUFFIX: &str = ".tombstone"; #[derive(Clone)] pub struct TestTabletFactory { - root_path: PathBuf, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, - root_db: Arc>>, } impl TestTabletFactory { - pub fn new( - root_path: &Path, - db_opt: DbOptions, - cf_opts: Vec<(&'static str, KvTestCfOptions)>, - ) -> Self { - let factory = Self { - root_path: root_path.to_path_buf(), - db_opt, - cf_opts, - root_db: Arc::new(Mutex::default()), - }; - let tablet_path = factory.tablets_path(); - if !tablet_path.exists() { - std::fs::create_dir_all(tablet_path).unwrap(); - } - factory + pub fn new(db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>) -> Self { + Self { db_opt, cf_opts } } + } - fn create_tablet(&self, tablet_path: &Path) -> Result { + impl TabletFactory for TestTabletFactory { + fn open_tablet(&self, _id: u64, _suffix: Option, path: &Path) -> Result { KvTestEngine::new_kv_engine_opt( - tablet_path.to_str().unwrap(), + path.to_str().unwrap(), self.db_opt.clone(), self.cf_opts.clone(), ) } - } - - impl TabletFactory for TestTabletFactory { - fn create_shared_db(&self) -> Result { - let tablet_path = self.tablet_path(0, 0); - let tablet = self.create_tablet(&tablet_path)?; - let mut root_db = self.root_db.lock().unwrap(); - root_db.replace(tablet.clone()); - Ok(tablet) - } - - /// See the comment above the same name method in KvEngineFactory - fn open_tablet( - &self, - _id: u64, - _suffix: Option, - options: OpenOptions, - ) -> Result { - if let Some(db) = self.root_db.lock().unwrap().as_ref() { - if options.create_new() { - return Err(box_err!("root tablet {} already exists", db.path())); - } - return Ok(db.clone()); - } - // No need for mutex protection here since root_db creation only occurs at - // tikv bootstrap time when there is no racing issue. 
- if options.create_new() || options.create() { - return self.create_shared_db(); - } - - Err(box_err!("root tablet has not been initialized")) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - self.create_shared_db() - } - - fn exists_raw(&self, _path: &Path) -> bool { - false - } - - #[inline] - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - self.root_path.join("db") - } - #[inline] - fn tablets_path(&self) -> PathBuf { - Path::new(&self.root_path).join("tablets") - } - - #[inline] - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { - Ok(()) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let db = self.root_db.lock().unwrap(); - let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; + fn destroy_tablet(&self, _id: u64, _suffix: Option, path: &Path) -> Result<()> { + let tombstone_path = path.join(TOMBSTONE_SUFFIX); + std::fs::remove_dir_all(&tombstone_path)?; + std::fs::rename(path, &tombstone_path)?; + std::fs::remove_dir_all(tombstone_path)?; Ok(()) } - } - - impl TabletAccessor for TestTabletFactory { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { - let db = self.root_db.lock().unwrap(); - let db = db.as_ref().unwrap(); - f(0, 0, db); - } - - fn is_single_engine(&self) -> bool { - true - } - } - - #[derive(Clone)] - pub struct TestTabletFactoryV2 { - inner: TestTabletFactory, - // region_id -> (tablet, tablet_suffix) - registry: Arc>>, - } - - impl TestTabletFactoryV2 { - pub fn new( - root_path: &Path, - db_opt: DbOptions, - cf_opts: Vec<(&'static str, KvTestCfOptions)>, - ) -> Self { - Self { - inner: TestTabletFactory::new(root_path, db_opt, cf_opts), - registry: Arc::default(), - } - } - } - - impl TabletFactory for TestTabletFactoryV2 { - /// See the comment above the same 
name method in KvEngineFactoryV2 - fn open_tablet( - &self, - id: u64, - suffix: Option, - mut options: OpenOptions, - ) -> Result { - if options.create_new() && suffix.is_none() { - return Err(box_err!( - "suffix should be provided when creating new tablet" - )); - } - - if options.create_new() || options.create() { - options = options.set_cache_only(false); - } - - let mut reg = self.registry.lock().unwrap(); - if let Some(suffix) = suffix { - if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { - // Target tablet exist in the cache - if options.create_new() { - return Err(box_err!("region {} {} already exists", id, cached_tablet.path())); - } - return Ok(cached_tablet.clone()); - } else if !options.cache_only() { - let tablet_path = self.tablet_path(id, suffix); - let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; - if !options.skip_cache() { - reg.insert(id, (tablet.clone(), suffix)); - } - return Ok(tablet); - } - } else if let Some((tablet, _)) = reg.get(&id) { - return Ok(tablet.clone()); - } - - Err(box_err!( - "tablet with region id {} suffix {:?} does not exist", - id, - suffix - )) - } - - fn open_tablet_raw( - &self, - path: &Path, - id: u64, - _suffix: u64, - options: OpenOptions, - ) -> Result { - let engine_exist = KvTestEngine::exists(path.to_str().unwrap_or_default()); - // Even though neither options.create nor options.create_new are true, if the - // tablet files already exists, we will open it by calling - // inner.create_tablet. In this case, the tablet exists but not in the cache - // (registry). 
- if !options.create() && !options.create_new() && !engine_exist { - return Err(box_err!( - "path {} does not have db", - path.to_str().unwrap_or_default() - )); - }; - - if options.create_new() && engine_exist { - return Err(box_err!( - "region {} {} already exists", - id, - path.to_str().unwrap() - )); - } - - self.inner.create_tablet(path) - } - #[inline] - fn create_shared_db(&self) -> Result { - self.open_tablet(0, Some(0), OpenOptions::default().set_create_new(true)) - } - - #[inline] - fn exists_raw(&self, path: &Path) -> bool { + fn exists(&self, path: &Path) -> bool { KvTestEngine::exists(path.to_str().unwrap_or_default()) } - - #[inline] - fn tablets_path(&self) -> PathBuf { - self.inner.root_path.join("tablets") - } - - #[inline] - fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { - self.inner - .root_path - .join(format!("tablets/{}{}_{}", prefix, id, suffix)) - } - - #[inline] - fn mark_tombstone(&self, region_id: u64, suffix: u64) { - let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - // When the full directory path does not exsit, create will return error and in - // this case, we just ignore it. 
- let _ = std::fs::File::create(path); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - } - - #[inline] - fn is_tombstoned(&self, region_id: u64, suffix: u64) -> bool { - self.tablet_path(region_id, suffix) - .join(TOMBSTONE_MARK) - .exists() - } - - #[inline] - fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(region_id, suffix); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - let _ = std::fs::remove_dir_all(path); - Ok(()) - } - - #[inline] - fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { - { - let reg = self.registry.lock().unwrap(); - if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { - return Err(box_err!("region {} {} already exists", region_id, db.path())); - } - } - - let db_path = self.tablet_path(region_id, suffix); - std::fs::rename(path, db_path)?; - self.open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_create(true), - ) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let reg = self.registry.lock().unwrap(); - // pick up any tablet and set the shared block cache capacity - if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { - let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - } - Ok(()) - } - } - - impl TabletAccessor for TestTabletFactoryV2 { - #[inline] - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { - let reg = self.registry.lock().unwrap(); - for (id, (tablet, suffix)) in &*reg { - f(*id, *suffix, tablet) - } - } - - // it have multi tablets. 
- fn is_single_engine(&self) -> bool { - false - } } } diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index d38962e71c9..fcfcbdb2799 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -9,6 +9,7 @@ failpoints = ["fail/failpoints"] [dependencies] case_macros = { workspace = true } +collections = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 55ab5d63caa..e12ea074015 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -1,14 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - fmt::Debug, - io::Write, - path::{Path, PathBuf}, - str, - vec::Vec, -}; - -use tikv_util::error; +use std::{fmt::Debug, str}; use crate::*; @@ -74,336 +66,3 @@ pub trait KvEngine: true } } - -/// TabletAccessor is the trait to access all the tablets with provided accessor -/// -/// For single rocksdb instance, it essentially accesses the global kvdb with -/// the accessor For multi rocksdb instances, it accesses all the tablets with -/// the accessor -pub trait TabletAccessor { - /// Loop visit all opened tablets by the specified function. - fn for_each_opened_tablet(&self, _f: &mut (dyn FnMut(u64, u64, &EK))); - - /// return true if it's single engine; - /// return false if it's a multi-tablet factory; - fn is_single_engine(&self) -> bool; -} - -/// max error count to log -const MAX_ERROR_COUNT: u32 = 5; - -/// TabletErrorCollector is the facility struct to handle errors when using -/// TabletAccessor::for_each_opened_tablet -/// -/// It will choose the last failed result as the final result, meanwhile logging -/// errors up to MAX_ERROR_COUNT. 
-pub struct TabletErrorCollector { - errors: Vec, - max_error_count: u32, - error_count: u32, - result: std::result::Result<(), Box>, -} - -impl TabletErrorCollector { - pub fn new() -> Self { - Self { - errors: vec![], - max_error_count: MAX_ERROR_COUNT, - error_count: 0, - result: Ok(()), - } - } - - pub fn add_result(&mut self, region_id: u64, suffix: u64, result: Result<()>) { - if result.is_ok() { - return; - } - self.result = Err(Box::from(result.err().unwrap())); - self.error_count += 1; - if self.error_count > self.max_error_count { - return; - } - writeln!( - &mut self.errors, - "Tablet {}_{} encountered error: {:?}.", - region_id, suffix, self.result - ) - .unwrap(); - } - - fn flush_error(&self) { - if self.error_count > 0 { - error!( - "Total count {}. Sample errors: {}", - self.error_count, - str::from_utf8(&self.errors).unwrap() - ); - } - } - - pub fn take_result(&mut self) -> std::result::Result<(), Box> { - std::mem::replace(&mut self.result, Ok(())) - } - - pub fn get_error_count(&self) -> u32 { - self.error_count - } -} - -impl Default for TabletErrorCollector { - fn default() -> Self { - Self::new() - } -} - -impl Drop for TabletErrorCollector { - fn drop(&mut self) { - self.flush_error() - } -} - -/// OpenOptionsn is used for specifiying the way of opening a tablet. -#[derive(Default, Clone)] -pub struct OpenOptions { - // create tablet if non-exist - create: bool, - create_new: bool, - read_only: bool, - cache_only: bool, - skip_cache: bool, -} - -impl OpenOptions { - /// Sets the option to create a tablet, or open it if it already exists. - pub fn set_create(mut self, create: bool) -> Self { - self.create = create; - self - } - - /// Sets the option to create a new tablet, failing if it already exists. 
- pub fn set_create_new(mut self, create_new: bool) -> Self { - self.create_new = create_new; - self - } - - /// Sets the option for read only - pub fn set_read_only(mut self, read_only: bool) -> Self { - self.read_only = read_only; - self - } - - /// Sets the option for only reading from cache. - pub fn set_cache_only(mut self, cache_only: bool) -> Self { - self.cache_only = cache_only; - self - } - - /// Sets the option to open a tablet without updating the cache. - pub fn set_skip_cache(mut self, skip_cache: bool) -> Self { - self.skip_cache = skip_cache; - self - } - - pub fn create(&self) -> bool { - self.create - } - - pub fn create_new(&self) -> bool { - self.create_new - } - - pub fn read_only(&self) -> bool { - self.read_only - } - - pub fn cache_only(&self) -> bool { - self.cache_only - } - - pub fn skip_cache(&self) -> bool { - self.skip_cache - } -} - -pub const SPLIT_PREFIX: &str = "split_"; -pub const MERGE_PREFIX: &str = "merge_"; - -/// A factory trait to create new engine. -// It should be named as `EngineFactory` for consistency, but we are about to -// rename engine to tablet, so always use tablet for new traits/types. -pub trait TabletFactory: TabletAccessor + Send + Sync { - /// Open the tablet with id and suffix according to the OpenOptions. - /// - /// The id is likely the region Id, the suffix could be the current raft log - /// index. They together could specify a unique path for a region's - /// tablet. The reason to have suffix is that we can keep more than one - /// tablet for a region. - fn open_tablet(&self, id: u64, suffix: Option, options: OpenOptions) -> Result; - - /// Open tablet by raw path without updating cache. 
- fn open_tablet_raw( - &self, - path: &Path, - id: u64, - suffix: u64, - options: OpenOptions, - ) -> Result; - - /// Create the shared db for v1 - fn create_shared_db(&self) -> Result; - - /// Destroy the tablet and its data - fn destroy_tablet(&self, id: u64, suffix: u64) -> Result<()>; - - /// Check if the tablet with specified id/suffix exists - #[inline] - fn exists(&self, id: u64, suffix: u64) -> bool { - self.exists_raw(&self.tablet_path(id, suffix)) - } - - /// Check if the tablet with specified path exists - fn exists_raw(&self, path: &Path) -> bool; - - /// Get the tablet path by id and suffix - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { - self.tablet_path_with_prefix("", id, suffix) - } - - /// Get the tablet path by id and suffix - /// - /// Used in special situations - /// Ex: split/merge. - fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf; - - /// Tablets root path - fn tablets_path(&self) -> PathBuf; - - /// Load the tablet from path for id and suffix--for scenarios such as - /// applying snapshot - fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { - unimplemented!(); - } - - /// Mark the tablet with specified id and suffix tombostone - fn mark_tombstone(&self, _id: u64, _suffix: u64) { - unimplemented!(); - } - - /// Check if the tablet with specified id and suffix tombostone - fn is_tombstoned(&self, _region_id: u64, _suffix: u64) -> bool { - unimplemented!(); - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()>; -} - -pub struct DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - pub engine: Option, - pub root_path: String, -} - -impl TabletFactory for DummyFactory -where - EK: CfOptionsExt + Clone + Send + Sync + 'static, -{ - fn create_shared_db(&self) -> Result { - Ok(self.engine.as_ref().unwrap().clone()) - } - - fn open_tablet(&self, _id: u64, _suffix: Option, _options: OpenOptions) -> Result { - 
Ok(self.engine.as_ref().unwrap().clone()) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - Ok(self.engine.as_ref().unwrap().clone()) - } - - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> Result<()> { - Ok(()) - } - - fn exists_raw(&self, _path: &Path) -> bool { - true - } - - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - PathBuf::from(&self.root_path) - } - - fn tablets_path(&self) -> PathBuf { - PathBuf::from(&self.root_path) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let opt = self - .engine - .as_ref() - .unwrap() - .get_options_cf(CF_DEFAULT) - .unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity) - } -} - -impl TabletAccessor for DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &EK)) { - if let Some(engine) = &self.engine { - f(0, 0, engine); - } - } - - fn is_single_engine(&self) -> bool { - true - } -} - -impl DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - pub fn new(engine: Option, root_path: String) -> DummyFactory { - DummyFactory { engine, root_path } - } -} - -impl Default for DummyFactory { - fn default() -> Self { - Self::new(None, "/tmp".to_string()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_tablet_error_collector_ok() { - let mut err = TabletErrorCollector::new(); - err.add_result(1, 1, Ok(())); - err.take_result().unwrap(); - assert_eq!(err.get_error_count(), 0); - } - - #[test] - fn test_tablet_error_collector_err() { - let mut err = TabletErrorCollector::new(); - err.add_result(1, 1, Ok(())); - err.add_result(1, 1, Err(Status::with_code(Code::Aborted).into())); - err.add_result(1, 1, Err(Status::with_code(Code::NotFound).into())); - err.add_result(1, 1, Ok(())); - err.take_result().unwrap_err(); - assert_eq!(err.get_error_count(), 2); - } 
-} diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index b9cf8847751..6a140230fd5 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -294,6 +294,8 @@ mod sst_partitioner; pub use crate::sst_partitioner::*; mod range_properties; pub use crate::{mvcc_properties::*, range_properties::*}; +mod tablet; +pub use tablet::*; mod ttl_properties; pub use crate::ttl_properties::*; mod perf_context; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs new file mode 100644 index 00000000000..988cd343fe3 --- /dev/null +++ b/components/engine_traits/src/tablet.rs @@ -0,0 +1,398 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use collections::HashMap; +use tikv_util::box_err; + +use crate::{Error, Result}; + +#[derive(Debug)] +struct LatestTablet { + data: Mutex>, + version: AtomicU64, +} + +/// Tablet may change during split, merge and applying snapshot. So we need a +/// shared value to reflect the latest tablet. `CachedTablet` provide cache that +/// can speed up common access. +#[derive(Clone, Debug)] +pub struct CachedTablet { + latest: Arc>, + cache: Option, + version: u64, +} + +impl CachedTablet { + #[inline] + fn new(data: Option) -> Self { + CachedTablet { + latest: Arc::new(LatestTablet { + data: Mutex::new(data.clone()), + version: AtomicU64::new(0), + }), + cache: data, + version: 0, + } + } + + pub fn set(&mut self, data: EK) { + self.version = { + let mut latest_data = self.latest.data.lock().unwrap(); + *latest_data = Some(data.clone()); + self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 + }; + self.cache = Some(data); + } + + /// Get the tablet from cache without checking if it's up to date. + #[inline] + pub fn cache(&self) -> Option<&EK> { + self.cache.as_ref() + } + + /// Get the latest tablet. 
+ #[inline] + pub fn latest(&mut self) -> Option<&EK> { + if self.latest.version.load(Ordering::Relaxed) > self.version { + let latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = latest_data.clone(); + } + self.cache() + } +} + +/// A factory trait to create new tablet for multi-rocksdb architecture. +// It should be named as `EngineFactory` for consistency, but we are about to +// rename engine to tablet, so always use tablet for new traits/types. +pub trait TabletFactory: Send + Sync { + /// Open the tablet in `path`. + /// + /// `id` and `suffix` is used to mark the identity of tablet. The id is + /// likely the region Id, the suffix could be the current raft log + /// index. The reason to have suffix is that we can keep more than one + /// tablet for a region. + fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result; + + /// Destroy the tablet and its data + fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()>; + + /// Check if the tablet with specified path exists + fn exists(&self, path: &Path) -> bool; +} + +pub struct SingletonFactory { + tablet: EK, +} + +impl SingletonFactory { + pub fn new(tablet: EK) -> Self { + SingletonFactory { tablet } + } +} + +impl TabletFactory for SingletonFactory { + /// Open the tablet in `path`. + /// + /// `id` and `suffix` is used to mark the identity of tablet. The id is + /// likely the region Id, the suffix could be the current raft log + /// index. The reason to have suffix is that we can keep more than one + /// tablet for a region. 
+ fn open_tablet(&self, _id: u64, _suffix: Option, _path: &Path) -> Result { + Ok(self.tablet.clone()) + } + + /// Destroy the tablet and its data + fn destroy_tablet(&self, _id: u64, _suffix: Option, _path: &Path) -> Result<()> { + Ok(()) + } + + /// Check if the tablet with specified path exists + fn exists(&self, _path: &Path) -> bool { + true + } +} + +/// A global registry for all tablets. +struct TabletRegistryInner { + // region_id, suffix -> tablet + tablets: Mutex>>, + tombstone: Mutex>, + factory: Box>, + root: PathBuf, +} + +pub struct TabletRegistry { + // One may consider to add cache to speed up access. But it also makes it more + // difficult to gc stale cache. + tablets: Arc>, +} + +impl Clone for TabletRegistry { + fn clone(&self) -> Self { + Self { + tablets: self.tablets.clone(), + } + } +} + +unsafe impl Send for TabletRegistry {} +unsafe impl Sync for TabletRegistry {} + +impl TabletRegistry { + pub fn new(factory: Box>, path: impl Into) -> Result { + let root = path.into(); + std::fs::create_dir_all(&root)?; + Ok(TabletRegistry { + tablets: Arc::new(TabletRegistryInner { + tablets: Mutex::new(HashMap::default()), + factory, + root, + tombstone: Mutex::default(), + }), + }) + } + + pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { + format!("{}{}_{}", prefix, id, suffix) + } + + pub fn tablet_root(&self) -> &Path { + &self.tablets.root + } + + pub fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + let name = self.tablet_name("", id, suffix); + self.tablets.root.join(name) + } + + /// Gets a tablet. + pub fn get(&self, id: u64) -> Option> + where + EK: Clone, + { + let tablets = self.tablets.tablets.lock().unwrap(); + tablets.get(&id).cloned() + } + + /// Gets a tablet, create a default one if it doesn't exist. 
+ pub fn get_or_default(&self, id: u64) -> CachedTablet + where + EK: Clone, + { + let mut tablets = self.tablets.tablets.lock().unwrap(); + tablets + .entry(id) + .or_insert_with(|| CachedTablet::new(None)) + .clone() + } + + pub fn tablet_factory(&self) -> &dyn TabletFactory { + self.tablets.factory.as_ref() + } + + pub fn remove(&self, id: u64) { + self.tablets.tablets.lock().unwrap().remove(&id); + } + + /// Load the tablet and set it as the latest. + /// + /// If the tablet doesn't exist, it will create an empty one. + pub fn load(&self, id: u64, suffix: u64, create: bool) -> Result> + where + EK: Clone, + { + let path = self.tablet_path(id, suffix); + if !create && !self.tablets.factory.exists(&path) { + return Err(Error::Other(box_err!( + "tablet ({}, {:?}) doesn't exist", + id, + suffix + ))); + } + let tablet = self.tablets.factory.open_tablet(id, Some(suffix), &path)?; + let mut cached = self.get_or_default(id); + cached.set(tablet); + Ok(cached) + } + + /// Destroy the tablet and its data + pub fn mark_tombstone(&self, id: u64, suffix: u64) { + self.tablets.tombstone.lock().unwrap().push((id, suffix)); + } + + /// Loop over all opened tablets. Note, it's possible that the visited + /// tablet is not the latest one. If latest one is required, you may + /// either: + /// - loop several times to make it likely to visit all tablets. + /// - send commands to fsms instead, which can guarantee latest tablet is + /// visisted. 
+ pub fn for_each_opened_tablet(&self, mut f: impl FnMut(u64, &mut CachedTablet) -> bool) { + let mut tablets = self.tablets.tablets.lock().unwrap(); + for (id, tablet) in tablets.iter_mut() { + if !f(*id, tablet) { + return; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cached_tablet() { + let mut cached_tablet = CachedTablet::new(None); + assert_eq!(cached_tablet.cache(), None); + assert_eq!(cached_tablet.latest(), None); + + cached_tablet = CachedTablet::new(Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + + // Setting tablet will refresh cache immediately. + cached_tablet.set(2); + assert_eq!(cached_tablet.cache().cloned(), Some(2)); + + // Test `latest()` will use cache. + // Unsafe modify the data. + let old_data = *cached_tablet.latest.data.lock().unwrap(); + *cached_tablet.latest.data.lock().unwrap() = Some(0); + assert_eq!(cached_tablet.latest().cloned(), old_data); + // Restore the data. + *cached_tablet.latest.data.lock().unwrap() = old_data; + + let mut cloned = cached_tablet.clone(); + // Clone should reuse cache. + assert_eq!(cloned.cache().cloned(), Some(2)); + cloned.set(1); + assert_eq!(cloned.cache().cloned(), Some(1)); + assert_eq!(cloned.latest().cloned(), Some(1)); + + // Local cache won't be refreshed until querying latest. 
+ assert_eq!(cached_tablet.cache().cloned(), Some(2)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + } + + #[test] + fn test_singleton_factory() { + let tablet = Arc::new(1); + let singleton = SingletonFactory::new(tablet.clone()); + let registry = TabletRegistry::new(Box::new(singleton), "").unwrap(); + registry.load(1, 1, true).unwrap(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + + registry.load(2, 1, true).unwrap(); + let mut count = 0; + registry.for_each_opened_tablet(|id, cached| { + assert!(&[1, 2].contains(&id), "{}", id); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + count += 1; + true + }); + assert_eq!(count, 2); + + // Destroy should be ignored. + registry + .tablet_factory() + .destroy_tablet(2, Some(1), ®istry.tablet_path(2, 1)) + .unwrap(); + + // Exist check should always succeed. + registry.load(3, 1, false).unwrap(); + let mut cached = registry.get(3).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet)); + } + + type Record = Arc<(u64, u64)>; + + struct MemoryTablet { + tablet: Mutex>, + } + + impl TabletFactory for MemoryTablet { + fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result { + let mut tablet = self.tablet.lock().unwrap(); + if tablet.contains_key(path) { + return Err(Error::Other(box_err!("tablet is opened"))); + } + tablet.insert(path.to_owned(), Arc::new((id, suffix.unwrap_or(0)))); + Ok(tablet[path].clone()) + } + + fn exists(&self, path: &Path) -> bool { + let tablet = self.tablet.lock().unwrap(); + tablet.contains_key(path) + } + + fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()> { + let prev = self.tablet.lock().unwrap().remove(path).unwrap(); + assert_eq!((id, suffix.unwrap_or(0)), *prev); + Ok(()) + } + } + + #[test] + fn test_tablet_registry() { + let factory = MemoryTablet { + tablet: Mutex::new(HashMap::default()), + }; + 
let registry = TabletRegistry::new(Box::new(factory), "").unwrap(); + + let mut tablet_1_10 = registry.load(1, 10, true).unwrap(); + // It's open already, load it twice should report lock error. + registry.load(1, 10, true).unwrap_err(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest(), tablet_1_10.latest()); + + let tablet_path = registry.tablet_path(1, 10); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let tablet_path = registry.tablet_path(1, 11); + assert!(!registry.tablet_factory().exists(&tablet_path)); + // Not exist tablet should report error. + registry.load(1, 11, false).unwrap_err(); + assert!(registry.get(2).is_none()); + // Though path not exist, but we should be able to create an empty one. + assert_eq!(registry.get_or_default(2).latest(), None); + assert!(!registry.tablet_factory().exists(&tablet_path)); + + // Load new suffix should update cache. + registry.load(1, 11, true).unwrap(); + assert_ne!(cached.latest(), tablet_1_10.cache()); + let tablet_path = registry.tablet_path(1, 11); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let mut count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 2); + + registry.remove(2); + assert!(registry.get(2).is_none()); + count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 1); + + let name = registry.tablet_name("prefix_", 12, 30); + assert_eq!(name, "prefix_12_30"); + let normal_name = registry.tablet_name("", 20, 15); + let normal_tablet_path = registry.tablet_path(20, 15); + assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + } +} diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 199e8cafbd8..96cbee19e4e 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -17,7 +17,7 @@ use causal_ts::CausalTsProviderImpl; use 
collections::HashMap; use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{Sender, TrySendError}; -use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; +use engine_traits::{Engines, KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType}; use futures::{compat::Future01CompatExt, FutureExt}; use kvproto::{ @@ -72,9 +72,9 @@ pub struct StoreContext { pub timer: SteadyTimer, pub write_senders: WriteSenders, /// store meta - pub store_meta: Arc>>, + pub store_meta: Arc>, pub engine: ER, - pub tablet_factory: Arc>, + pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, pub read_scheduler: Scheduler>, pub snap_mgr: TabletSnapManager, @@ -222,7 +222,7 @@ struct StorePollerBuilder { cfg: Arc>, store_id: u64, engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, router: StoreRouter, read_scheduler: Scheduler>, @@ -230,7 +230,7 @@ struct StorePollerBuilder { write_senders: WriteSenders, apply_pool: FuturePool, logger: Logger, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, } @@ -239,14 +239,14 @@ impl StorePollerBuilder { cfg: Arc>, store_id: u64, engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, router: StoreRouter, read_scheduler: Scheduler>, pd_scheduler: Scheduler, store_writers: &mut StoreWriters, logger: Logger, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; @@ -263,7 +263,7 @@ impl StorePollerBuilder { cfg, store_id, engine, - tablet_factory, + tablet_registry, trans, router, read_scheduler, @@ -294,7 +294,7 @@ impl StorePollerBuilder { Some(p) => p, None => return Ok(()), }; - let (sender, peer_fsm) = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; + let (sender, peer_fsm) = PeerFsm::new(&cfg, &self.tablet_registry, storage)?; meta.region_read_progress .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); @@ 
-342,7 +342,7 @@ where write_senders: self.write_senders.clone(), store_meta: self.store_meta.clone(), engine: self.engine.clone(), - tablet_factory: self.tablet_factory.clone(), + tablet_registry: self.tablet_registry.clone(), apply_pool: self.apply_pool.clone(), read_scheduler: self.read_scheduler.clone(), snap_mgr: self.snap_mgr.clone(), @@ -386,11 +386,11 @@ impl StoreSystem { store_id: u64, cfg: Arc>, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, pd_client: Arc, router: &StoreRouter, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -424,7 +424,7 @@ impl StoreSystem { store_id, pd_client, raft_engine.clone(), - tablet_factory.clone(), + tablet_registry.clone(), router.clone(), workers.pd_worker.remote(), concurrency_manager, @@ -438,7 +438,7 @@ impl StoreSystem { cfg.clone(), store_id, raft_engine, - tablet_factory, + tablet_registry, trans, router.clone(), read_scheduler, @@ -462,8 +462,6 @@ impl StoreSystem { for (region_id, (tx, fsm)) in peers { meta.readers .insert(region_id, fsm.peer().generate_read_delegate()); - meta.tablet_caches - .insert(region_id, fsm.peer().tablet().clone()); address.push(region_id); mailboxes.push(( diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 2aa42da2e42..6e2921a0c0d 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -11,7 +11,7 @@ use std::{ use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, TabletFactory}; +use engine_traits::{KvEngine, TabletRegistry}; use futures::{Future, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; @@ -24,7 +24,6 @@ use tikv_util::{ use crate::{ raft::Apply, router::{ApplyRes, ApplyTask, PeerMsg}, - tablet::CachedTablet, }; 
/// A trait for reporting apply result. @@ -64,8 +63,7 @@ impl ApplyFsm { peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, - remote_tablet: CachedTablet, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, read_scheduler: Scheduler>, logger: Logger, ) -> (ApplyScheduler, Self) { @@ -74,8 +72,7 @@ impl ApplyFsm { peer, region_state, res_reporter, - remote_tablet, - tablet_factory, + tablet_registry, read_scheduler, logger, ); diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index cf85522df90..6254e1975fd 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -6,7 +6,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use raftstore::store::{Config, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ @@ -39,10 +39,10 @@ pub struct PeerFsm { impl PeerFsm { pub fn new( cfg: &Config, - tablet_factory: &dyn TabletFactory, + tablet_registry: &TabletRegistry, storage: Storage, ) -> Result> { - let peer = Peer::new(cfg, tablet_factory, storage)?; + let peer = Peer::new(cfg, tablet_registry, storage)?; info!(peer.logger, "create peer"); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 546ec95a604..73702500e19 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -22,41 +22,17 @@ use crate::{ batch::StoreContext, raft::Peer, router::{StoreMsg, StoreTick}, - tablet::CachedTablet, }; -pub struct StoreMeta -where - E: KvEngine, -{ +#[derive(Default)] +pub struct StoreMeta { pub store_id: Option, /// region_id -> reader pub readers: HashMap, - /// region_id -> tablet cache - pub 
tablet_caches: HashMap>, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, } -impl StoreMeta -where - E: KvEngine, -{ - pub fn new() -> StoreMeta { - StoreMeta { - store_id: None, - readers: HashMap::default(), - tablet_caches: HashMap::default(), - region_read_progress: RegionReadProgressRegistry::new(), - } - } -} - -impl Default for StoreMeta { - fn default() -> Self { - Self::new() - } -} pub struct Store { id: u64, // Unix time when it's started. diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 7dea9d55901..2a9d5faabd5 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -32,7 +32,6 @@ mod fsm; mod operation; mod raft; pub mod router; -mod tablet; mod worker; pub(crate) use batch::StoreContext; diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index eb6560d239e..c1e25474701 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -17,7 +17,7 @@ use raftstore::{ Result, }; use slog::info; -pub use split::{SplitInit, SplitResult}; +pub use split::{SplitInit, SplitResult, SPLIT_PREFIX}; use tikv_util::box_err; use self::conf_change::ConfChangeResult; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 0b97d726a2e..13a5d168915 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -30,8 +30,7 @@ use std::{cmp, collections::VecDeque}; use collections::HashSet; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{ - Checkpointer, DeleteStrategy, KvEngine, OpenOptions, RaftEngine, RaftLogBatch, Range, - CF_DEFAULT, SPLIT_PREFIX, + Checkpointer, DeleteStrategy, KvEngine, RaftEngine, 
RaftLogBatch, Range, CF_DEFAULT, }; use fail::fail_point; use keys::enc_end_key; @@ -64,6 +63,8 @@ use crate::{ router::{ApplyRes, PeerMsg, StoreMsg}, }; +pub const SPLIT_PREFIX: &str = "split_"; + #[derive(Debug)] pub struct SplitResult { pub regions: Vec, @@ -225,17 +226,15 @@ impl Apply { ) }); + let reg = self.tablet_registry(); for new_region in ®ions { let new_region_id = new_region.id; if new_region_id == region_id { continue; } - let split_temp_path = self.tablet_factory().tablet_path_with_prefix( - SPLIT_PREFIX, - new_region_id, - RAFT_INIT_LOG_INDEX, - ); + let name = reg.tablet_name(SPLIT_PREFIX, new_region_id, RAFT_INIT_LOG_INDEX); + let split_temp_path = reg.tablet_root().join(name); checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { @@ -248,7 +247,7 @@ impl Apply { }); } - let derived_path = self.tablet_factory().tablet_path(region_id, log_index); + let derived_path = self.tablet_registry().tablet_path(region_id, log_index); checkpointer .create_at(&derived_path, None, 0) .unwrap_or_else(|e| { @@ -259,9 +258,11 @@ impl Apply { e ) }); - let tablet = self + let reg = self.tablet_registry(); + let path = reg.tablet_path(region_id, log_index); + let tablet = reg .tablet_factory() - .open_tablet(region_id, Some(log_index), OpenOptions::default()) + .open_tablet(region_id, Some(log_index), &path) .unwrap(); // Remove the old write batch. 
self.write_batch_mut().take(); @@ -492,10 +493,10 @@ mod test { use collections::HashMap; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactoryV2, + kv::TestTabletFactory, raft, }; - use engine_traits::{CfOptionsExt, Peekable, TabletFactory, WriteBatch, ALL_CFS}; + use engine_traits::{CfOptionsExt, Peekable, TabletRegistry, WriteBatch, ALL_CFS}; use futures::channel::mpsc::unbounded; use kvproto::{ metapb::RegionEpoch, @@ -516,7 +517,6 @@ mod test { use crate::{ fsm::{ApplyFsm, ApplyResReporter}, raft::Apply, - tablet::CachedTablet, }; struct MockReporter { @@ -546,7 +546,6 @@ mod test { fn assert_split( apply: &mut Apply, - factory: &Arc, parent_id: u64, right_derived: bool, new_region_ids: Vec, @@ -589,8 +588,9 @@ mod test { let state = apply.region_state(); assert_eq!(state.tablet_index, log_index); assert_eq!(state.get_region(), region); - let tablet_path = factory.tablet_path(region.id, log_index); - assert!(factory.exists_raw(&tablet_path)); + let reg = apply.tablet_registry(); + let tablet_path = reg.tablet_path(region.id, log_index); + assert!(reg.tablet_factory().exists(&tablet_path)); match apply_res { AdminCmdResult::SplitRegion(SplitResult { @@ -610,9 +610,10 @@ mod test { } child_idx += 1; - let tablet_path = - factory.tablet_path_with_prefix(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); - assert!(factory.exists_raw(&tablet_path)); + let reg = apply.tablet_registry(); + let tablet_name = reg.tablet_name(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); + let path = reg.tablet_root().join(tablet_name); + assert!(reg.tablet_factory().exists(&path)); } } } @@ -635,19 +636,9 @@ mod test { .copied() .map(|cf| (cf, CfOptions::default())) .collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path(), - DbOptions::default(), - cf_opts, - )); - - let tablet = factory - .open_tablet( - region.id, - Some(5), - OpenOptions::default().set_create_new(true), - ) - .unwrap(); + let factory = 
Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + reg.load(region.id, 5, true).unwrap(); let mut region_state = RegionLocalState::default(); region_state.set_state(PeerState::Normal); @@ -665,8 +656,7 @@ mod test { .clone(), region_state, reporter, - CachedTablet::new(Some(tablet)), - factory.clone(), + reg, read_scheduler, logger.clone(), ); @@ -827,7 +817,6 @@ mod test { assert_split( &mut apply, - &factory, parent_id, right_derive, new_region_ids, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index bd175ef7a4d..3bb6b7b3852 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -59,7 +59,7 @@ mod admin; mod control; mod write; -pub use admin::{AdminCmdResult, SplitInit, SplitResult}; +pub use admin::{AdminCmdResult, SplitInit, SplitResult, SPLIT_PREFIX}; pub use control::ProposalControl; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; @@ -129,8 +129,7 @@ impl Peer { self.peer().clone(), region_state, mailbox, - tablet, - store_ctx.tablet_factory.clone(), + store_ctx.tablet_registry.clone(), read_scheduler, logger, ); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 60884f63b03..ca610de1bfc 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -230,7 +230,7 @@ impl Store { ctx.read_scheduler.clone(), &ctx.logger, ) - .and_then(|s| PeerFsm::new(&ctx.cfg, &*ctx.tablet_factory, s)) + .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, s)) { Ok(p) => p, res => { diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 659fab00754..7df27670a35 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ 
-41,7 +41,7 @@ impl Store { stats.set_store_id(self.store_id()); { let meta = ctx.store_meta.lock().unwrap(); - stats.set_region_count(meta.tablet_caches.len() as u32); + stats.set_region_count(meta.readers.len() as u32); } stats.set_sending_snap_count(0); diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index bbff28b272f..4455ea099f4 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -151,7 +151,7 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &Mutex>, + store_meta: &Mutex, progress: Option, ) { // A nonleader peer should never has leader lease. diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 0736bc13fd8..120e64cb872 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -8,7 +8,7 @@ use std::{ use batch_system::Router; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, @@ -34,7 +34,6 @@ use txn_types::WriteBatchFlags; use crate::{ fsm::StoreMeta, router::{PeerMsg, QueryResult}, - tablet::CachedTablet, StoreRouter, }; @@ -69,15 +68,20 @@ where E: KvEngine, C: MsgRouter, { - pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { + pub fn new( + store_meta: Arc>, + reg: TabletRegistry, + router: C, + logger: Logger, + ) -> Self { Self { - local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), + local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta, reg)), router, logger, } } - pub fn store_meta(&self) -> &Arc>> { + pub fn store_meta(&self) -> &Arc> { self.local_reader.store_meta() } @@ -300,15 
+304,16 @@ struct StoreMetaDelegate where E: KvEngine, { - store_meta: Arc>>, + store_meta: Arc>, + reg: TabletRegistry, } impl StoreMetaDelegate where E: KvEngine, { - pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { - StoreMetaDelegate { store_meta } + pub fn new(store_meta: Arc>, reg: TabletRegistry) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta, reg } } } @@ -317,7 +322,7 @@ where E: KvEngine, { type Executor = CachedReadDelegate; - type StoreMeta = Arc>>; + type StoreMeta = Arc>; fn store_id(&self) -> Option { self.store_meta.as_ref().lock().unwrap().store_id @@ -330,7 +335,7 @@ where let reader = meta.readers.get(®ion_id).cloned(); if let Some(reader) = reader { // If reader is not None, cache must not be None. - let cached_tablet = meta.tablet_caches.get(®ion_id).cloned().unwrap(); + let cached_tablet = self.reg.get(region_id).unwrap(); return ( meta.readers.len(), Some(CachedReadDelegate { @@ -431,9 +436,9 @@ mod tests { use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, + kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, OpenOptions, Peekable, SyncMutable, TabletFactory, ALL_CFS}; + use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use raftstore::store::{ @@ -470,7 +475,8 @@ mod tests { #[allow(clippy::type_complexity)] fn new_reader( store_id: u64, - store_meta: Arc>>, + store_meta: Arc>, + reg: TabletRegistry, ) -> ( LocalReader, Receiver<(u64, PeerMsg)>, @@ -478,6 +484,7 @@ mod tests { let (ch, rx) = MockRouter::new(); let mut reader = LocalReader::new( store_meta, + reg, ch, Logger::root(slog::Discard, o!("key1" => "value1")), ); @@ -544,10 +551,11 @@ mod tests { .prefix("test-local-reader") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let factory 
= Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = Arc::new(Mutex::new(StoreMeta::new())); - let (mut reader, mut rx) = new_reader(store_id, store_meta.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::default())); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -623,11 +631,7 @@ mod tests { }; meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - let tablet1 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let cache = CachedTablet::new(Some(tablet1)); - meta.tablet_caches.insert(1, cache); + reg.load(1, 10, true).unwrap(); } let (ch_tx, ch_rx) = sync_channel(1); @@ -738,10 +742,11 @@ mod tests { .prefix("test-local-reader") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::::new()))); + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::default())), reg.clone()); let tablet1; let tablet2; @@ -753,24 +758,18 @@ mod tests { meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - tablet1 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + reg.load(1, 10, true).unwrap(); + tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); - let cache = CachedTablet::new(Some(tablet1.clone())); - meta.tablet_caches.insert(1, cache); // Create read_delegate with region id 2 let read_delegate = ReadDelegate::mock(2); meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data - 
tablet2 = factory - .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + reg.load(2, 10, true).unwrap(); + tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); - let cache = CachedTablet::new(Some(tablet2.clone())); - meta.tablet_caches.insert(2, cache); } let (_, delegate) = store_meta.get_executor_and_len(1); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 6a91c25f1f6..c252ad7d231 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -22,7 +22,7 @@ mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, MiscExt, OpenOptions, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, MiscExt, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::{ raft_cmdpb::AdminCmdType, @@ -537,7 +537,7 @@ impl Storage { ready.snapshot(), write_task, ctx.snap_mgr.clone(), - ctx.tablet_factory.clone(), + ctx.tablet_registry.clone(), ) { error!(self.logger(),"failed to apply snapshot";"error" => ?e) } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 4cd4b5265d8..8ac27ba2466 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -28,7 +28,7 @@ use std::{ }, }; -use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory, SPLIT_PREFIX}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; use protobuf::Message; use raft::eraftpb::Snapshot; @@ -41,6 +41,7 @@ use tikv_util::{box_err, box_try, worker::Scheduler}; use crate::{ fsm::ApplyResReporter, + operation::command::SPLIT_PREFIX, raft::{Apply, Peer, Storage}, router::{ApplyTask, PeerTick}, Result, StoreContext, @@ -124,17 +125,14 @@ impl Peer 
{ let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { let region_id = self.region_id(); - let tablet = ctx - .tablet_factory - .open_tablet(region_id, Some(persisted_index), OpenOptions::default()) + ctx.tablet_registry + .load(region_id, persisted_index, false) .unwrap(); - self.tablet_mut().set(tablet); self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); { let mut meta = ctx.store_meta.lock().unwrap(); - meta.tablet_caches.insert(region_id, self.tablet().clone()); meta.readers .insert(region_id, self.generate_read_delegate()); meta.region_read_progress @@ -217,6 +215,7 @@ impl Storage { } } SnapState::Generated(ref s) => { + // TODO: `to` may not be equal to the generated snapshot. let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; if self.validate_snap(&snap, request_index) { return Ok(*snap); @@ -367,7 +366,7 @@ impl Storage { snap: &Snapshot, task: &mut WriteTask, snap_mgr: TabletSnapManager, - tablet_factory: Arc>, + reg: TabletRegistry, ) -> Result<()> { let region_id = self.region().get_id(); let peer_id = self.peer().get_id(); @@ -400,10 +399,10 @@ impl Storage { let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. 
- Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => ( - tablet_factory.tablet_path_with_prefix(SPLIT_PREFIX, region_id, last_index), - false, - ), + Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + let name = reg.tablet_name(SPLIT_PREFIX, region_id, last_index); + (reg.tablet_root().join(name), false) + } si => { let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); (snap_mgr.final_recv_path(&key), si.is_some()) @@ -414,20 +413,19 @@ impl Storage { // The snapshot require no additional processing such as ingest them to DB, but // it should load it into the factory after it persisted. let hook = move || { - if let Err(e) = tablet_factory.load_tablet(path.as_path(), region_id, last_index) { + let target_path = reg.tablet_path(region_id, last_index); + if let Err(e) = std::fs::rename(&path, &target_path) { panic!( - "{:?} failed to load tablet, path: {}, {:?}", + "{:?} failed to load tablet, path: {} -> {}, {:?}", logger.list(), path.display(), + target_path.display(), e ); } if clean_split { - let path = tablet_factory.tablet_path_with_prefix( - SPLIT_PREFIX, - region_id, - RAFT_INIT_LOG_INDEX, - ); + let name = reg.tablet_name(SPLIT_PREFIX, region_id, last_index); + let path = reg.tablet_root().join(name); let _ = fs::remove_dir_all(path); } }; diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 06101da8d83..421c2c476f7 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,7 @@ use std::{mem, sync::Arc}; -use engine_traits::{KvEngine, TabletFactory}; +use engine_traits::{CachedTablet, KvEngine, TabletRegistry}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; @@ -13,7 +13,6 @@ use crate::{ fsm::ApplyResReporter, operation::AdminCmdResult, router::{ApplyRes, CmdResChannel}, 
- tablet::CachedTablet, }; /// Apply applies all the committed commands to kv db. @@ -24,7 +23,7 @@ pub struct Apply { tablet: EK, write_batch: Option, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, callbacks: Vec<(Vec, RaftCmdResponse)>, @@ -48,11 +47,13 @@ impl Apply { peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, - mut remote_tablet: CachedTablet, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, read_scheduler: Scheduler>, logger: Logger, ) -> Self { + let mut remote_tablet = tablet_registry + .get(region_state.get_region().get_id()) + .unwrap(); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), @@ -64,7 +65,7 @@ impl Apply { applied_term: 0, admin_cmd_result: vec![], region_state, - tablet_factory, + tablet_registry, read_scheduler, res_reporter, logger, @@ -72,8 +73,8 @@ impl Apply { } #[inline] - pub fn tablet_factory(&self) -> &Arc> { - &self.tablet_factory + pub fn tablet_registry(&self) -> &TabletRegistry { + &self.tablet_registry } #[inline] diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 16e3e54d5f2..9101a9328f3 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -8,7 +8,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::atomic::AtomicCell; -use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; +use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; @@ -38,7 +38,6 @@ use crate::{ fsm::{ApplyFsm, ApplyScheduler}, operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, - tablet::CachedTablet, worker::PdTask, Result, }; @@ -99,7 +98,7 @@ impl Peer { /// If peer is destroyed, `None` is returned. 
pub fn new( cfg: &Config, - tablet_factory: &dyn TabletFactory, + tablet_registry: &TabletRegistry, storage: Storage, ) -> Result { let logger = storage.logger().clone(); @@ -110,33 +109,19 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); + let cached_tablet = tablet_registry.get_or_default(region_id); // Another option is always create tablet even if tablet index is 0. But this // can introduce race when gc old tablet and create new peer. - let tablet = if tablet_index != 0 { - if !tablet_factory.exists(region_id, tablet_index) { - return Err(box_err!( - "missing tablet {} for region {}", - tablet_index, - region_id - )); - } + if tablet_index != 0 { // TODO: Perhaps we should stop create the tablet automatically. - Some(tablet_factory.open_tablet( - region_id, - Some(tablet_index), - OpenOptions::default().set_create(true), - )?) - } else { - None - }; - - let tablet = CachedTablet::new(tablet); + tablet_registry.load(region_id, tablet_index, false)?; + } let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { - tablet, + tablet: cached_tablet, self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 369a25984bf..5211d293e0f 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -372,11 +372,11 @@ mod tests { use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, + kv::{KvTestEngine, TestTabletFactory}, raft::RaftTestEngine, }; use engine_traits::{ - KvEngine, OpenOptions, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletFactory, ALL_CFS, + KvEngine, RaftEngine, RaftEngineReadOnly, RaftLogBatch, 
TabletRegistry, ALL_CFS, }; use kvproto::{ metapb::{Peer, Region}, @@ -392,7 +392,7 @@ mod tests { use tikv_util::worker::{Runnable, Worker}; use super::*; - use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes, tablet::CachedTablet}; + use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; #[derive(Clone)] pub struct TestRouter { @@ -477,11 +477,8 @@ mod tests { // building a tablet factory let ops = DbOptions::default(); let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path().join("tablet").as_path(), - ops, - cf_opts, - )); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); @@ -491,8 +488,7 @@ mod tests { let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); let mut task = WriteTask::new(region.get_id(), 5, 0); - s.apply_snapshot(&snapshot, &mut task, mgr, factory) - .unwrap(); + s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); // It can be set before load tablet. 
assert_eq!(PeerState::Normal, s.region_state().get_state()); @@ -528,15 +524,9 @@ mod tests { // building a tablet factory let ops = DbOptions::default(); let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path().join("tablet").as_path(), - ops, - cf_opts, - )); - // create tablet with region_id 1 - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); + reg.load(region.get_id(), 10, true).unwrap(); // setup read runner worker and peer storage let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); @@ -548,13 +538,14 @@ mod tests { let mut read_runner = ReadRunner::new(router.clone(), raft_engine); read_runner.set_snap_mgr(mgr.clone()); worker.start(read_runner); + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); // setup peer applyer let mut apply = Apply::new( region.get_peers()[0].clone(), - RegionLocalState::default(), + state, router, - CachedTablet::new(Some(tablet)), - factory, + reg, sched, logger, ); @@ -577,6 +568,7 @@ mod tests { let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); assert!(checkpointer_path.exists()); + s.snapshot(0, 7).unwrap(); // Test cancel snapshot let snap = s.snapshot(0, 0); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 8cb65e40a3c..3dda00eb270 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -3,7 +3,7 @@ use std::sync::{Arc, Mutex}; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{ 
raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, @@ -49,15 +49,15 @@ where } impl RaftRouter { - pub fn new(store_id: u64, router: StoreRouter) -> Self { - let mut store_meta = StoreMeta::new(); + pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { + let mut store_meta = StoreMeta::default(); store_meta.store_id = Some(store_id); let store_meta = Arc::new(Mutex::new(store_meta)); let logger = router.logger().clone(); RaftRouter { router: router.clone(), - local_reader: LocalReader::new(store_meta, router, logger), + local_reader: LocalReader::new(store_meta, reg, router, logger), } } @@ -69,7 +69,7 @@ impl RaftRouter { self.router.send(addr, msg) } - pub fn store_meta(&self) -> &Arc>> { + pub fn store_meta(&self) -> &Arc> { self.local_reader.store_meta() } diff --git a/components/raftstore-v2/src/tablet.rs b/components/raftstore-v2/src/tablet.rs deleted file mode 100644 index 7765f5c07b6..00000000000 --- a/components/raftstore-v2/src/tablet.rs +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - -use std::sync::{ - atomic::{AtomicU64, Ordering}, - Arc, Mutex, -}; - -#[derive(Debug)] -struct LatestTablet { - data: Mutex>, - version: AtomicU64, -} - -/// Tablet may change during split, merge and applying snapshot. So we need a -/// shared value to reflect the latest tablet. `CachedTablet` provide cache that -/// can speed up common access. 
-#[derive(Clone, Debug)] -pub struct CachedTablet { - latest: Arc>, - cache: Option, - version: u64, -} - -impl CachedTablet { - #[inline] - pub fn new(data: Option) -> Self { - CachedTablet { - latest: Arc::new(LatestTablet { - data: Mutex::new(data.clone()), - version: AtomicU64::new(0), - }), - cache: data, - version: 0, - } - } - - pub fn set(&mut self, data: EK) { - self.version = { - let mut latest_data = self.latest.data.lock().unwrap(); - *latest_data = Some(data.clone()); - self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 - }; - self.cache = Some(data); - } - - /// Get the tablet from cache without checking if it's up to date. - #[inline] - pub fn cache(&self) -> Option<&EK> { - self.cache.as_ref() - } - - /// Get the latest tablet. - #[inline] - pub fn latest(&mut self) -> Option<&EK> { - if self.latest.version.load(Ordering::Relaxed) > self.version { - let latest_data = self.latest.data.lock().unwrap(); - self.version = self.latest.version.load(Ordering::Relaxed); - self.cache = latest_data.clone(); - } - self.cache() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_cached_tablet() { - let mut cached_tablet = CachedTablet::new(None); - assert_eq!(cached_tablet.cache(), None); - assert_eq!(cached_tablet.latest(), None); - - cached_tablet = CachedTablet::new(Some(1)); - assert_eq!(cached_tablet.cache().cloned(), Some(1)); - assert_eq!(cached_tablet.latest().cloned(), Some(1)); - - // Setting tablet will refresh cache immediately. - cached_tablet.set(2); - assert_eq!(cached_tablet.cache().cloned(), Some(2)); - - // Test `latest()` will use cache. - // Unsafe modify the data. - let old_data = *cached_tablet.latest.data.lock().unwrap(); - *cached_tablet.latest.data.lock().unwrap() = Some(0); - assert_eq!(cached_tablet.latest().cloned(), old_data); - // Restore the data. - *cached_tablet.latest.data.lock().unwrap() = old_data; - - let mut cloned = cached_tablet.clone(); - // Clone should reuse cache. 
- assert_eq!(cloned.cache().cloned(), Some(2)); - cloned.set(1); - assert_eq!(cloned.cache().cloned(), Some(1)); - assert_eq!(cloned.latest().cloned(), Some(1)); - - // Local cache won't be refreshed until querying latest. - assert_eq!(cached_tablet.cache().cloned(), Some(2)); - assert_eq!(cached_tablet.latest().cloned(), Some(1)); - assert_eq!(cached_tablet.cache().cloned(), Some(1)); - } -} diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 132678e21f2..9803039e392 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -11,7 +11,7 @@ use std::{ use causal_ts::CausalTsProviderImpl; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::{util::KeysInfoFormatter, TxnExt}; @@ -97,7 +97,7 @@ where store_id: u64, pd_client: Arc, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, router: StoreRouter, remote: Remote, @@ -130,7 +130,7 @@ where store_id: u64, pd_client: Arc, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, router: StoreRouter, remote: Remote, concurrency_manager: ConcurrencyManager, @@ -142,7 +142,7 @@ where store_id, pd_client, raft_engine, - tablet_factory, + tablet_registry, router, remote, region_peers: HashMap::default(), diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 1caa96a5225..8f49e7f025f 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -260,12 +260,12 @@ where /// Returns (capacity, used, available). 
fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { - let disk_stats = match fs2::statvfs(self.tablet_factory.tablets_path()) { + let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { Err(e) => { error!( self.logger, "get disk stat for rocksdb failed"; - "engine_path" => self.tablet_factory.tablets_path().display(), + "engine_path" => self.tablet_registry.tablet_root().display(), "err" => ?e ); return None; diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs index 4bf4201f67c..be5ccf8316c 100644 --- a/components/raftstore-v2/tests/failpoints/test_basic_write.rs +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -2,7 +2,7 @@ use std::{assert_matches::assert_matches, time::Duration}; -use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use engine_traits::Peekable; use futures::executor::block_on; use kvproto::raft_cmdpb::{CmdType, Request}; use raftstore_v2::router::PeerMsg; @@ -13,7 +13,7 @@ use crate::cluster::Cluster; #[test] fn test_write_batch_rollback() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); @@ -25,10 +25,8 @@ fn test_write_batch_rollback() { // Make several entries to batch in apply thread. fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); - let tablet_factory = cluster.node(0).tablet_factory(); - let tablet = tablet_factory - .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); + let tablet_registry = cluster.node(0).tablet_registry(); + let tablet = tablet_registry.get(2).unwrap().latest().unwrap().clone(); // Good proposal should be committed. 
let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 24184233117..3e2ced3df3c 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -5,7 +5,7 @@ use std::{ path::Path, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, thread, time::{Duration, Instant}, @@ -17,10 +17,10 @@ use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, + kv::{KvTestEngine, TestTabletFactory}, raft::RaftTestEngine, }; -use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; +use engine_traits::{TabletRegistry, ALL_CFS}; use futures::executor::block_on; use kvproto::{ metapb::{self, RegionEpoch, Store}, @@ -36,7 +36,7 @@ use raftstore::store::{ use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, StoreMeta, StoreSystem, + Bootstrap, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -46,7 +46,6 @@ use tikv_util::{ store::new_peer, }; -#[derive(Clone)] pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -182,12 +181,10 @@ impl TestRouter { pub struct RunningState { store_id: u64, pub raft_engine: RaftTestEngine, - pub factory: Arc, + pub registry: TabletRegistry, pub system: StoreSystem, pub cfg: Arc>, pub transport: TestTransport, - // We need this to clear the ref counts of CachedTablet when shutdown - store_meta: Arc>>, } impl RunningState { @@ -205,11 +202,8 @@ impl RunningState { .copied() .map(|cf| (cf, CfOptions::default())) .collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path, - DbOptions::default(), - cf_opts, - )); + let factory = Box::new(TestTabletFactory::new(DbOptions::default(), 
cf_opts)); + let registry = TabletRegistry::new(factory, path).unwrap(); let raft_engine = engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); @@ -218,17 +212,17 @@ impl RunningState { let mut store = Store::default(); store.set_id(store_id); if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { - if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { + let factory = registry.tablet_factory(); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + if factory.exists(&path) { + registry.remove(region.get_id()); factory - .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) + .destroy_tablet(region.get_id(), Some(RAFT_INIT_LOG_INDEX), &path) .unwrap(); } + // Create the tablet without loading it in cache. factory - .open_tablet( - region.get_id(), - Some(RAFT_INIT_LOG_INDEX), - OpenOptions::default().set_create_new(true), - ) + .open_tablet(region.get_id(), Some(RAFT_INIT_LOG_INDEX), &path) .unwrap(); } @@ -238,7 +232,7 @@ impl RunningState { logger.clone(), ); - let router = RaftRouter::new(store_id, router); + let router = RaftRouter::new(store_id, registry.clone(), router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); snap_mgr.init().unwrap(); @@ -247,11 +241,11 @@ impl RunningState { store_id, cfg.clone(), raft_engine.clone(), - factory.clone(), + registry.clone(), transport.clone(), pd_client.clone(), router.store_router(), - store_meta.clone(), + store_meta, snap_mgr.clone(), concurrency_manager, causal_ts_provider, @@ -261,11 +255,10 @@ impl RunningState { let state = Self { store_id, raft_engine, - factory, + registry, system, cfg, transport, - store_meta, }; (TestRouter(router), snap_mgr, state) } @@ -313,8 +306,8 @@ impl TestNode { router } - pub fn tablet_factory(&self) -> &Arc { - &self.running_state().unwrap().factory + pub fn tablet_registry(&self) -> &TabletRegistry { + 
&self.running_state().unwrap().registry } pub fn pd_client(&self) -> &Arc { @@ -322,10 +315,7 @@ impl TestNode { } fn stop(&mut self) { - if let Some(state) = std::mem::take(&mut self.running_state) { - let mut meta = state.store_meta.lock().unwrap(); - meta.tablet_caches.clear(); - } + self.running_state.take(); } fn restart(&mut self) -> TestRouter { @@ -420,7 +410,7 @@ pub struct Cluster { pd_server: test_pd::Server, nodes: Vec, receivers: Vec>, - routers: Vec, + pub routers: Vec, logger: Logger, } @@ -463,18 +453,15 @@ impl Cluster { } pub fn restart(&mut self, offset: usize) { + self.routers.remove(offset); let router = self.nodes[offset].restart(); - self.routers[offset] = router; + self.routers.insert(offset, router); } pub fn node(&self, offset: usize) -> &TestNode { &self.nodes[offset] } - pub fn router(&self, offset: usize) -> TestRouter { - self.routers[offset].clone() - } - /// Send messages and wait for side effects are all handled. #[allow(clippy::vec_box)] pub fn dispatch(&self, region_id: u64, mut msgs: Vec>) { diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index fc23e46e12f..a0d3d1ac34a 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -2,7 +2,7 @@ use std::{assert_matches::assert_matches, time::Duration}; -use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use engine_traits::Peekable; use futures::executor::block_on; use kvproto::{ raft_cmdpb::{CmdType, Request}, @@ -18,7 +18,7 @@ use crate::cluster::Cluster; #[test] fn test_basic_write() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); @@ -113,7 +113,7 @@ fn test_basic_write() { #[test] fn test_put_delete() { let cluster = 
Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); @@ -123,10 +123,8 @@ fn test_put_delete() { router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let tablet_factory = cluster.node(0).tablet_factory(); - let tablet = tablet_factory - .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); + let registry = cluster.node(0).tablet_registry(); + let tablet = registry.get(2).unwrap().latest().unwrap().clone(); assert!(tablet.get_value(b"key").unwrap().is_none()); let (msg, mut sub) = PeerMsg::raft_command(req.clone()); router.send(2, msg).unwrap(); diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 558962f8ef6..4f3ffbbf24c 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,7 +2,7 @@ use std::{self, time::Duration}; -use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use engine_traits::Peekable; use kvproto::raft_cmdpb::{AdminCmdType, CmdType, Request}; use raft::prelude::ConfChangeType; use raftstore_v2::router::{PeerMsg, PeerTick}; @@ -14,7 +14,7 @@ use crate::cluster::Cluster; fn test_simple_change() { let cluster = Cluster::with_node_count(2, None); let region_id = 2; - let router0 = cluster.router(0); + let router0 = &cluster.routers[0]; let mut req = router0.new_request_for(2); let admin_req = req.mut_admin_request(); admin_req.set_cmd_type(AdminCmdType::ChangePeer); @@ -39,7 +39,7 @@ fn test_simple_change() { // So heartbeat will create a learner. 
cluster.dispatch(2, vec![]); - let router1 = cluster.router(1); + let router1 = &cluster.routers[1]; let meta = router1 .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); @@ -77,10 +77,8 @@ fn test_simple_change() { // read the new written kv. assert_eq!(match_index, meta.raft_apply.truncated_state.index); assert!(meta.raft_apply.applied_index >= match_index); - let tablet_factory = cluster.node(1).tablet_factory(); - let tablet = tablet_factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); + let registry = cluster.node(1).tablet_registry(); + let tablet = registry.get(region_id).unwrap().latest().unwrap().clone(); assert_eq!(tablet.get_value(key).unwrap().unwrap(), val); req.mut_header() diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index ed0ebcc9b8a..805cda15471 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -64,11 +64,11 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb #[test] fn test_life_by_message() { let mut cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let test_region_id = 4; let test_peer_id = 5; let test_leader_id = 6; - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); // Build a correct message. let mut msg = Box::::default(); @@ -85,7 +85,7 @@ fn test_life_by_message() { let mut wrong_msg = msg.clone(); f(&mut wrong_msg); router.send_raft_message(wrong_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); }; // Check mismatch store id. @@ -113,7 +113,7 @@ fn test_life_by_message() { // The peer should survive restart. 
cluster.restart(0); - let router = cluster.router(0); + let router = &cluster.routers[0]; let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); @@ -129,13 +129,13 @@ fn test_life_by_message() { let mut tombstone_msg = msg.clone(); tombstone_msg.set_is_tombstone(true); router.send_raft_message(tombstone_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); // Restart should not recreate tombstoned peer. cluster.restart(0); - let router = cluster.router(0); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + let router = &cluster.routers[0]; + assert_peer_not_exist(test_region_id, test_peer_id, router); let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); } @@ -143,7 +143,7 @@ fn test_life_by_message() { #[test] fn test_destroy_by_larger_id() { let mut cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let test_region_id = 4; let test_peer_id = 6; let init_term = 5; @@ -180,7 +180,7 @@ fn test_destroy_by_larger_id() { let mut larger_id_msg = smaller_id_msg; larger_id_msg.set_to_peer(new_peer(1, test_peer_id + 1)); router.send_raft_message(larger_id_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); @@ -189,7 +189,7 @@ fn test_destroy_by_larger_id() { // New peer should survive restart. 
cluster.restart(0); - let router = cluster.router(0); + let router = &cluster.routers[0]; let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index c22ef4908bf..96bcbbccf7a 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -11,7 +11,7 @@ use crate::cluster::Cluster; fn test_region_heartbeat() { let region_id = 2; let cluster = Cluster::with_node_count(1, None); - let router = cluster.router(0); + let router = &cluster.routers[0]; // When there is only one peer, it should campaign immediately. let mut req = RaftCmdRequest::default(); diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 2155a4775c6..07ae8b44bf3 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -13,7 +13,7 @@ fn test_read_index() { let mut config = v2_default_config(); config.raft_store_max_leader_lease = ReadableDuration::millis(150); let cluster = Cluster::with_config(config); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -58,7 +58,7 @@ fn test_read_index() { #[test] fn test_snap_without_read_index() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -84,7 +84,7 @@ fn test_snap_without_read_index() { #[test] fn test_query_with_write_cmd() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; 
std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(2); @@ -111,7 +111,7 @@ fn test_query_with_write_cmd() { #[test] fn test_snap_with_invalid_parameter() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -163,8 +163,8 @@ fn test_snap_with_invalid_parameter() { #[test] fn test_local_read() { - let cluster = Cluster::default(); - let mut router = cluster.router(0); + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 336a9c9d038..60495b151e8 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -126,9 +126,9 @@ fn split_region( #[test] fn test_split() { - let cluster = Cluster::default(); + let mut cluster = Cluster::default(); let store_id = cluster.node(0).id(); - let mut router = cluster.router(0); + let router = &mut cluster.routers[0]; // let factory = cluster.node(0).tablet_factory(); let region_id = 2; @@ -140,7 +140,7 @@ fn test_split() { // -> Region 2 ["", "k22"] peer(1, 3) // Region 1000 ["k22", ""] peer(1, 10) let (left, right) = split_region( - &mut router, + router, region, peer.clone(), 1000, @@ -155,7 +155,7 @@ fn test_split() { // -> Region 2 ["", "k11"] peer(1, 3) // Region 1001 ["k11", "k22"] peer(1, 11) let _ = split_region( - &mut router, + router, left, peer, 1001, @@ -170,7 +170,7 @@ fn test_split() { // -> Region 1000 ["k22", "k33"] peer(1, 10) // Region 1002 ["k33", ""] peer(1, 12) let _ = split_region( - &mut router, + router, 
right, new_peer(store_id, 10), 1002, diff --git a/components/raftstore-v2/tests/integrations/test_status.rs b/components/raftstore-v2/tests/integrations/test_status.rs index 1f7415d9da3..59c23c4180f 100644 --- a/components/raftstore-v2/tests/integrations/test_status.rs +++ b/components/raftstore-v2/tests/integrations/test_status.rs @@ -8,7 +8,7 @@ use crate::cluster::Cluster; #[test] fn test_status() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; // When there is only one peer, it should campaign immediately. let mut req = RaftCmdRequest::default(); req.mut_header().set_peer(new_peer(1, 3)); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e93b18fed96..a50e3a39667 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -44,8 +44,8 @@ use engine_rocks::{ }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, - TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, + CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, + RaftEngine, SingletonFactory, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -238,7 +238,7 @@ struct TikvServer { sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 - tablet_factory: Option + Send + Sync>>, + tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery } @@ -390,7 +390,7 @@ where sst_worker: None, quota_limiter, causal_ts_provider, - tablet_factory: None, + tablet_registry: None, br_snap_recovery_mode: is_recovering_marked, } } @@ -806,7 +806,7 @@ where cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( - self.tablet_factory.as_ref().unwrap().clone(), + 
self.tablet_registry.as_ref().unwrap().clone(), ttl_scheduler, flow_controller, storage.get_scheduler(), @@ -1366,7 +1366,7 @@ where // for recording the latest tablet for each region. // `cached_latest_tablets` is passed to `update` to avoid memory // allocation each time when calling `update`. - let mut cached_latest_tablets: HashMap = HashMap::new(); + let mut cached_latest_tablets = HashMap::default(); self.background_worker .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { let now = Instant::now(); @@ -1736,7 +1736,7 @@ impl ConfiguredRaftEngine for RocksEngine { fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(Arc::new(self.clone()), DbType::Raft)), + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), ); } } @@ -1800,29 +1800,33 @@ impl TikvServer { ); // Create kv engine. - let builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path, block_cache) + let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(self.router.clone()), })) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - let factory = Arc::new(builder.build()); + let factory = Box::new(builder.build()); let kv_engine = factory - .create_shared_db() + .create_shared_db(&self.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); - let engines = Engines::new(kv_engine, raft_engine); + let engines = Engines::new(kv_engine.clone(), raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new(factory.clone(), DbType::Kv)), + Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), ); - self.tablet_factory = Some(factory.clone()); + let reg = 
TabletRegistry::new(Box::new(SingletonFactory::new(kv_engine)), &self.store_path) + .unwrap(); + // It always use the singleton kv_engine, use arbitrary id and suffix. + reg.load(0, 0, false).unwrap(); + self.tablet_registry = Some(reg.clone()); engines.raft.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - factory, + reg, engines.raft.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); @@ -1974,7 +1978,7 @@ impl EngineMetricsManager { } pub struct EnginesResourceInfo { - tablet_factory: Arc + Sync + Send>, + tablet_registry: TabletRegistry, raft_engine: Option, latest_normalized_pending_bytes: AtomicU32, normalized_pending_bytes_collector: MovingAvgU32, @@ -1984,12 +1988,12 @@ impl EnginesResourceInfo { const SCALE_FACTOR: u64 = 100; fn new( - tablet_factory: Arc + Sync + Send>, + tablet_registry: TabletRegistry, raft_engine: Option, max_samples_to_preserve: usize, ) -> Self { EnginesResourceInfo { - tablet_factory, + tablet_registry, raft_engine, latest_normalized_pending_bytes: AtomicU32::new(0), normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), @@ -1999,7 +2003,7 @@ impl EnginesResourceInfo { pub fn update( &self, _now: Instant, - cached_latest_tablets: &mut HashMap, + cached_latest_tablets: &mut HashMap>, ) { let mut normalized_pending_bytes = 0; @@ -2022,19 +2026,11 @@ impl EnginesResourceInfo { fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); } - self.tablet_factory - .for_each_opened_tablet( - &mut |id, suffix, db: &RocksEngine| match cached_latest_tablets.entry(id) { - collections::HashMapEntry::Occupied(mut slot) => { - if slot.get().0 < suffix { - slot.insert((suffix, db.clone())); - } - } - collections::HashMapEntry::Vacant(slot) => { - slot.insert((suffix, db.clone())); - } - }, - ); + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); // todo(SpadeA): Now, 
there's a potential race condition problem where the // tablet could be destroyed after the clone and before the fetching @@ -2045,7 +2041,8 @@ impl EnginesResourceInfo { // propose another PR to tackle it such as destory tablet lazily in a GC // thread. - for (_, (_, tablet)) in cached_latest_tablets.iter() { + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); } @@ -2089,10 +2086,8 @@ mod test { sync::{atomic::Ordering, Arc}, }; - use engine_rocks::{raw::Env, RocksEngine}; - use engine_traits::{ - FlowControlFactorsExt, MiscExt, OpenOptions, SyncMutable, TabletFactory, CF_DEFAULT, - }; + use engine_rocks::raw::Env; + use engine_traits::{FlowControlFactorsExt, MiscExt, SyncMutable, TabletRegistry, CF_DEFAULT}; use tempfile::Builder; use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; use tikv_util::{config::ReadableSize, time::Instant}; @@ -2110,18 +2105,15 @@ mod test { let path = Builder::new().prefix("test-update").tempdir().unwrap(); let cache = config.storage.block_cache.build_shared_cache(); - let builder = KvEngineFactoryBuilder::new(env, &config, path.path(), cache); - let factory = builder.build_v2(); + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path()).unwrap(); for i in 1..6 { - let _ = factory - .open_tablet(i, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + reg.load(i, 10, true).unwrap(); } - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap(); + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); // Prepare some data for two tablets of the same region. So we can test whether // we fetch the bytes from the latest one. 
for i in 1..21 { @@ -2135,9 +2127,8 @@ mod test { .unwrap() .unwrap(); - let tablet = factory - .open_tablet(1, Some(20), OpenOptions::default().set_create_new(true)) - .unwrap(); + reg.load(1, 20, true).unwrap(); + tablet = cached.latest().unwrap(); for i in 1..11 { tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); @@ -2152,9 +2143,9 @@ mod test { assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); - let engines_info = Arc::new(EnginesResourceInfo::new(Arc::new(factory), None, 10)); + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); - let mut cached_latest_tablets: HashMap = HashMap::new(); + let mut cached_latest_tablets = HashMap::default(); engines_info.update(Instant::now(), &mut cached_latest_tablets); // The memory allocation should be reserved diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index c6b70fa24f0..3a4ed373e8c 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,8 +16,7 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, TabletFactory, ALL_CFS, - CF_DEFAULT, CF_RAFT, + Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -596,15 +595,15 @@ pub fn create_test_engine( let raft_engine = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); - let mut builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path(), cache) - .sst_recovery_sender(Some(scheduler)); + let mut builder = + KvEngineFactoryBuilder::new(env, &cfg, cache).sst_recovery_sender(Some(scheduler)); if let Some(router) = router { builder = builder.compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(router), })); } let factory = 
builder.build(); - let engine = factory.create_shared_db().unwrap(); + let engine = factory.create_shared_db(dir.path()).unwrap(); let engines = Engines::new(engine, raft_engine); (engines, key_manager, dir, sst_worker) } diff --git a/src/config/configurable.rs b/src/config/configurable.rs new file mode 100644 index 00000000000..7cbcc731eb6 --- /dev/null +++ b/src/config/configurable.rs @@ -0,0 +1,141 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{error::Error, io::Write}; + +use engine_rocks::RocksEngine; +use engine_traits::{ + CachedTablet, CfOptionsExt, DbOptions, DbOptionsExt, TabletRegistry, CF_DEFAULT, +}; + +pub type ConfigRes = Result<(), Box>; + +pub trait ConfigurableDb { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes; + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes; + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; +} + +impl ConfigurableDb for RocksEngine { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes { + self.set_db_options(opts).map_err(Box::from) + } + + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes { + self.set_options_cf(cf, opts).map_err(Box::from) + } + + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_rate_bytes_per_sec(rate_bytes_per_sec) + .map_err(Box::from) + } + + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_rate_limiter_auto_tuned(auto_tuned) + .map_err(Box::new)?; + // double check the new state + let new_auto_tuned = opt.get_rate_limiter_auto_tuned(); + if new_auto_tuned == Some(auto_tuned) { + Ok(()) + } else { + Err(engine_traits::Status::with_error( + engine_traits::Code::IoError, + "fail to set 
rate_limiter_auto_tuned", + ) + .into()) + } + } + + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { + let opt = self.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity as u64) + .map_err(Box::from) + } +} + +fn loop_registry( + registry: &TabletRegistry, + mut f: impl FnMut(&mut CachedTablet) -> std::result::Result>, +) -> ConfigRes { + let mut error_count = 0; + let mut res = Ok(()); + let mut error_samples: Vec = vec![]; + registry.for_each_opened_tablet(|id, cache| match f(cache) { + Ok(b) => b, + Err(e) => { + error_count += 1; + res = Err(e); + if error_count <= 3 { + writeln!( + error_samples, + "Tablet {} {:?} encountered error: {:?}.", + id, + cache.cache().map(|c| c.as_inner().path()), + res + ) + .unwrap(); + } + true + } + }); + if error_count > 0 { + error!( + "Total count {}. Sample errors: {}", + error_count, + std::str::from_utf8(&error_samples).unwrap() + ); + } + res +} + +impl ConfigurableDb for TabletRegistry { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_db_config(opts)?; + } + Ok(true) + }) + } + + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_cf_config(cf, opts)?; + } + Ok(true) + }) + } + + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_rate_bytes_per_sec(rate_bytes_per_sec)? + } + Ok(true) + }) + } + + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_rate_limiter_auto_tuned(auto_tuned)? 
+ } + Ok(true) + }) + } + + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_shared_block_cache_capacity(capacity)?; + Ok(false) + } else { + Ok(true) + } + }) + } +} diff --git a/src/config.rs b/src/config/mod.rs similarity index 97% rename from src/config.rs rename to src/config/mod.rs index 97bab103af2..2b0818e93d3 100644 --- a/src/config.rs +++ b/src/config/mod.rs @@ -5,6 +5,8 @@ //! TiKV is configured through the `TikvConfig` type, which is in turn //! made up of many other configuration types. +mod configurable; + use std::{ cmp, collections::{HashMap, HashSet}, @@ -20,6 +22,7 @@ use std::{ use api_version::ApiV1Ttl; use causal_ts::Config as CausalTsConfig; +pub use configurable::{ConfigRes, ConfigurableDb}; use encryption_export::DataKeyManager; use engine_rocks::{ config::{self as rocks_config, BlobRunMode, CompressionType, LogLevel as RocksLogLevel}, @@ -36,8 +39,8 @@ use engine_rocks::{ DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt, MiscExt, TabletAccessor, - TabletErrorCollector, TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + CfOptions as _, DbOptions as _, MiscExt, TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, + CF_WRITE, }; use file_system::IoRateLimiter; use keys::region_raft_prefix_len; @@ -1539,36 +1542,21 @@ pub enum DbType { Raft, } -pub struct DbConfigManger> { - tablet_accessor: Arc, +pub struct DbConfigManger { + db: D, db_type: DbType, } -impl> DbConfigManger { - pub fn new(tablet_accessor: Arc, db_type: DbType) -> Self { - DbConfigManger { - tablet_accessor, - db_type, - } - } - - fn set_db_config(&self, opts: &[(&str, &str)]) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - error_collector.add_result(region_id, 
suffix, db.set_db_options(opts)); - }); - error_collector.take_result() +impl DbConfigManger { + pub fn new(db: D, db_type: DbType) -> Self { + DbConfigManger { db, db_type } } +} +impl DbConfigManger { fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); self.validate_cf(cf)?; - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - error_collector.add_result(region_id, suffix, db.set_options_cf(cf, opts)); - }); - error_collector.take_result()?; + self.db.set_cf_config(cf, opts)?; // Write config to metric for (cfg_name, cfg_value) in opts { @@ -1586,73 +1574,6 @@ impl> DbConfigManger { Ok(()) } - fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.get_db_options(); - let r = opt.set_rate_bytes_per_sec(rate_bytes_per_sec); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } - }); - error_collector.take_result() - } - - fn set_rate_limiter_auto_tuned( - &self, - rate_limiter_auto_tuned: bool, - ) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.get_db_options(); - let r = opt.set_rate_limiter_auto_tuned(rate_limiter_auto_tuned); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } else { - // double check the new state - let new_auto_tuned = opt.get_rate_limiter_auto_tuned(); - if new_auto_tuned.is_none() - || new_auto_tuned.unwrap() != rate_limiter_auto_tuned - { - error_collector.add_result( - region_id, - suffix, - Err(engine_traits::Status::with_error( - engine_traits::Code::IoError, - "fail to set rate_limiter_auto_tuned", - ) - .into()), - ); - } - } - }); - - 
error_collector.take_result() - } - - fn set_max_background_jobs(&self, max_background_jobs: i32) -> Result<(), Box> { - self.set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; - Ok(()) - } - - fn set_max_background_flushes( - &self, - max_background_flushes: i32, - ) -> Result<(), Box> { - self.set_db_config(&[( - "max_background_flushes", - &max_background_flushes.to_string(), - )])?; - Ok(()) - } - - fn set_max_subcompactions(&self, max_subcompactions: u32) -> Result<(), Box> { - self.set_db_config(&[("max_subcompactions", &max_subcompactions.to_string())])?; - Ok(()) - } - fn validate_cf(&self, cf: &str) -> Result<(), Box> { match (self.db_type, cf) { (DbType::Kv, CF_DEFAULT) @@ -1665,7 +1586,7 @@ impl> DbConfigManger { } } -impl + Send + Sync> ConfigManager for DbConfigManger { +impl ConfigManager for DbConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); @@ -1698,7 +1619,8 @@ impl + Send + Sync> ConfigManager for DbConfigMan .next() { let rate_bytes_per_sec: ReadableSize = rate_bytes_config.1.into(); - self.set_rate_bytes_per_sec(rate_bytes_per_sec.0 as i64)?; + self.db + .set_rate_bytes_per_sec(rate_bytes_per_sec.0 as i64)?; } if let Some(rate_bytes_config) = change @@ -1706,37 +1628,43 @@ impl + Send + Sync> ConfigManager for DbConfigMan .next() { let rate_limiter_auto_tuned: bool = rate_bytes_config.1.into(); - self.set_rate_limiter_auto_tuned(rate_limiter_auto_tuned)?; + self.db + .set_rate_limiter_auto_tuned(rate_limiter_auto_tuned)?; } if let Some(background_jobs_config) = change .drain_filter(|(name, _)| name == "max_background_jobs") .next() { - let max_background_jobs = background_jobs_config.1.into(); - self.set_max_background_jobs(max_background_jobs)?; + let max_background_jobs: i32 = background_jobs_config.1.into(); + self.db + .set_db_config(&[("max_background_jobs", 
&max_background_jobs.to_string())])?; } if let Some(background_subcompactions_config) = change .drain_filter(|(name, _)| name == "max_sub_compactions") .next() { - let max_subcompactions = background_subcompactions_config.1.into(); - self.set_max_subcompactions(max_subcompactions)?; + let max_subcompactions: u32 = background_subcompactions_config.1.into(); + self.db + .set_db_config(&[("max_subcompactions", &max_subcompactions.to_string())])?; } if let Some(background_flushes_config) = change .drain_filter(|(name, _)| name == "max_background_flushes") .next() { - let max_background_flushes = background_flushes_config.1.into(); - self.set_max_background_flushes(max_background_flushes)?; + let max_background_flushes: i32 = background_flushes_config.1.into(); + self.db.set_db_config(&[( + "max_background_flushes", + &max_background_flushes.to_string(), + )])?; } if !change.is_empty() { let change = config_value_to_string(change); let change_slice = config_to_slice(&change); - self.set_db_config(&change_slice)?; + self.db.set_db_config(&change_slice)?; } info!( "rocksdb config changed"; @@ -4019,7 +3947,7 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; use engine_rocks::raw::LRUCacheOptions; - use engine_traits::{CfOptions as _, DbOptions as _, DummyFactory}; + use engine_traits::{CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt}; use futures::executor::block_on; use grpcio::ResourceQuota; use itertools::Itertools; @@ -4464,13 +4392,13 @@ mod tests { let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new(Arc::new(engine.clone()), DbType::Kv)), + Box::new(DbConfigManger::new(engine.clone(), DbType::Kv)), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( - Arc::new(DummyFactory::new(Some(engine), "".to_string())), + engine, scheduler, flow_controller.clone(), storage.get_scheduler(), @@ -5305,9 
+5233,11 @@ mod tests { ); } + static CONFIG_TEMPLATE: &str = include_str!("../../etc/config-template.toml"); + #[test] fn test_config_template_is_valid() { - let template_config = std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); @@ -5318,7 +5248,7 @@ mod tests { #[test] fn test_config_template_no_superfluous_keys() { - let template_config = std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); @@ -5336,7 +5266,7 @@ mod tests { #[test] fn test_config_template_matches_default() { - let template_config = std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 7e36efcb98f..01dc1e4a786 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -1,24 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - path::{Path, PathBuf}, - sync::{Arc, Mutex}, -}; +use std::{path::Path, sync::Arc}; use engine_rocks::{ raw::{Cache, Env}, - CompactedEventSender, CompactionListener, FlowListener, RocksCompactionJobInfo, RocksEngine, - RocksEventListener, -}; -use engine_traits::{ - CfOptions, CfOptionsExt, CompactionJobInfo, OpenOptions, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, CF_WRITE, + CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, + RocksDbOptions, RocksEngine, RocksEventListener, }; +use engine_traits::{CompactionJobInfo, MiscExt, Result, TabletFactory, CF_DEFAULT, CF_WRITE}; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; -use super::engine_factory_v2::KvEngineFactoryV2; use crate::config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}; struct FactoryInner { @@ -26,11 +19,9 @@ struct FactoryInner { region_info_accessor: Option, block_cache: Cache, rocksdb_config: Arc, - store_path: PathBuf, api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, - root_db: Mutex>, } pub struct KvEngineFactoryBuilder { @@ -39,23 +30,16 @@ pub struct KvEngineFactoryBuilder { } impl KvEngineFactoryBuilder { - pub fn new( - env: Arc, - config: &TikvConfig, - store_path: impl Into, - cache: Cache, - ) -> Self { + pub fn new(env: Arc, config: &TikvConfig, cache: Cache) -> Self { Self { inner: FactoryInner { env, region_info_accessor: None, block_cache: cache, rocksdb_config: Arc::new(config.rocksdb.clone()), - store_path: store_path.into(), api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, - root_db: Mutex::default(), }, compact_event_sender: None, } @@ -90,14 +74,6 @@ impl KvEngineFactoryBuilder { compact_event_sender: self.compact_event_sender.clone(), } } - - pub fn build_v2(self) -> KvEngineFactoryV2 { - let factory = KvEngineFactory { - inner: Arc::new(self.inner), - compact_event_sender: 
self.compact_event_sender.clone(), - }; - KvEngineFactoryV2::new(factory) - } } #[derive(Clone)] @@ -129,172 +105,125 @@ impl KvEngineFactory { )) } - pub fn create_tablet( - &self, - tablet_path: &Path, - region_id: u64, - suffix: u64, - ) -> Result { + fn db_opts(&self) -> RocksDbOptions { // Create kv engine. - let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); - kv_db_opts.set_env(self.inner.env.clone()); - kv_db_opts.add_event_listener(RocksEventListener::new( + let mut db_opts = self.inner.rocksdb_config.build_opt(); + db_opts.set_env(self.inner.env.clone()); + db_opts.add_event_listener(RocksEventListener::new( "kv", self.inner.sst_recovery_sender.clone(), )); if let Some(filter) = self.create_raftstore_compaction_listener() { - kv_db_opts.add_event_listener(filter); + db_opts.add_event_listener(filter); } - if let Some(listener) = &self.inner.flow_listener { - kv_db_opts.add_event_listener(listener.clone_with(region_id, suffix)); - } - let kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( + db_opts + } + + fn cf_opts(&self) -> Vec<(&str, RocksCfOptions)> { + self.inner.rocksdb_config.build_cf_opts( &self.inner.block_cache, self.inner.region_info_accessor.as_ref(), self.inner.api_version, - ); - let kv_engine = engine_rocks::util::new_engine_opt( - tablet_path.to_str().unwrap(), - kv_db_opts, - kv_cfs_opts, - ); + ) + } + + /// Create a shared db. + /// + /// It will always create in path/DEFAULT_DB_SUB_DIR. 
+ pub fn create_shared_db(&self, path: &Path) -> Result { + let mut db_opts = self.db_opts(); + let cf_opts = self.cf_opts(); + if let Some(listener) = &self.inner.flow_listener { + db_opts.add_event_listener(listener.clone()); + } + let target_path = path.join(DEFAULT_ROCKSDB_SUB_DIR); + let kv_engine = + engine_rocks::util::new_engine_opt(target_path.to_str().unwrap(), db_opts, cf_opts); if let Err(e) = &kv_engine { - error!("failed to create kv engine"; "path" => %tablet_path.display(), "err" => ?e); + error!("failed to create kv engine"; "path" => %path.display(), "err" => ?e); } kv_engine } +} - pub fn on_tablet_created(&self, region_id: u64, suffix: u64) { - if let Some(listener) = &self.inner.flow_listener { - let listener = listener.clone_with(region_id, suffix); - listener.on_created(); +impl TabletFactory for KvEngineFactory { + fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result { + let mut db_opts = self.db_opts(); + let cf_opts = self.cf_opts(); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { + db_opts.add_event_listener(listener.clone_with(id, suffix)); } + let kv_engine = + engine_rocks::util::new_engine_opt(path.to_str().unwrap(), db_opts, cf_opts); + if let Err(e) = &kv_engine { + error!("failed to create tablet"; "id" => id, "suffix" => ?suffix, "path" => %path.display(), "err" => ?e); + } else if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { + listener.clone_with(id, suffix).on_created(); + } + kv_engine } - pub fn destroy_tablet(&self, tablet_path: &Path) -> engine_traits::Result<()> { - info!("destroy tablet"; "path" => %tablet_path.display()); + fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()> { + info!("destroy tablet"; "path" => %path.display(), "id" => id, "suffix" => ?suffix); // Create kv engine. 
- let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); - kv_db_opts.set_env(self.inner.env.clone()); - if let Some(filter) = self.create_raftstore_compaction_listener() { - kv_db_opts.add_event_listener(filter); - } - let _kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( - &self.inner.block_cache, - self.inner.region_info_accessor.as_ref(), - self.inner.api_version, - ); + let _db_opts = self.db_opts(); + let _cf_opts = self.cf_opts(); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( - // tablet_path.to_str().unwrap(), + // path.to_str().unwrap(), // kv_db_opts, // kv_cfs_opts, // )?; - let _ = std::fs::remove_dir_all(tablet_path); - Ok(()) - } - - pub fn on_tablet_destroy(&self, region_id: u64, suffix: u64) { - if let Some(listener) = &self.inner.flow_listener { - let listener = listener.clone_with(region_id, suffix); - listener.on_destroyed(); - } - } - - pub fn store_path(&self) -> PathBuf { - self.inner.store_path.clone() - } - - #[inline] - fn kv_engine_path(&self) -> PathBuf { - self.inner.store_path.join(DEFAULT_ROCKSDB_SUB_DIR) - } -} - -impl TabletFactory for KvEngineFactory { - #[inline] - fn create_shared_db(&self) -> Result { - let root_path = self.kv_engine_path(); - let tablet = self.create_tablet(&root_path, 0, 0)?; - let mut root_db = self.inner.root_db.lock().unwrap(); - root_db.replace(tablet.clone()); - Ok(tablet) - } - - /// Open the root tablet according to the OpenOptions. - /// - /// If options.create_new is true, create the root tablet. If the tablet - /// exists, it will fail. - /// - /// If options.create is true, open the the root tablet if it exists or - /// create it otherwise. 
- fn open_tablet( - &self, - _id: u64, - _suffix: Option, - options: OpenOptions, - ) -> Result { - if let Some(db) = self.inner.root_db.lock().unwrap().as_ref() { - if options.create_new() { - return Err(box_err!( - "root tablet {} already exists", - db.as_inner().path() - )); - } - return Ok(db.clone()); + let _ = std::fs::remove_dir_all(path); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { + listener.clone_with(id, suffix).on_destroyed(); } - // No need for mutex protection here since root_db creation only occurs at - // tikv bootstrap time when there is no racing issue. - if options.create_new() || options.create() { - return self.create_shared_db(); - } - - Err(box_err!("root tablet has not been initialized")) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - self.create_shared_db() - } - - fn exists_raw(&self, _path: &Path) -> bool { - false - } - - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - self.kv_engine_path() - } - - fn tablets_path(&self) -> PathBuf { - self.kv_engine_path() - } - - #[inline] - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { Ok(()) } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let db = self.inner.root_db.lock().unwrap(); - let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - Ok(()) + fn exists(&self, path: &Path) -> bool { + RocksEngine::exists(path.to_str().unwrap()) } } -impl TabletAccessor for KvEngineFactory { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - let db = self.inner.root_db.lock().unwrap(); - let db = db.as_ref().unwrap(); - f(0, 0, db); - } - - fn is_single_engine(&self) -> bool { - true +#[cfg(test)] +mod tests { + use std::path::Path; + + use engine_traits::TabletRegistry; + + use super::*; + use 
crate::config::TikvConfig; + + #[test] + fn test_engine_factory() { + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let common_test_cfg = manifest_dir.join("components/test_raftstore/src/common-test.toml"); + let cfg = TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + panic!( + "invalid auto generated configuration file {}, err {}", + manifest_dir.display(), + e + ); + }); + let cache = cfg.storage.block_cache.build_shared_cache(); + let dir = test_util::temp_dir("test-engine-factory", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), dir.path()).unwrap(); + let path = reg.tablet_path(1, 3); + assert!(!reg.tablet_factory().exists(&path)); + let engine = reg.tablet_factory().open_tablet(1, Some(3), &path).unwrap(); + assert!(reg.tablet_factory().exists(&path)); + // Second attempt should fail with lock. + reg.tablet_factory() + .open_tablet(1, Some(3), &path) + .unwrap_err(); + drop(engine); + reg.tablet_factory() + .destroy_tablet(1, Some(3), &path) + .unwrap(); + assert!(!reg.tablet_factory().exists(&path)); } } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs deleted file mode 100644 index a55ebca6555..00000000000 --- a/src/server/engine_factory_v2.rs +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::{ - path::{Path, PathBuf}, - sync::{Arc, Mutex}, -}; - -use collections::HashMap; -use engine_rocks::RocksEngine; -use engine_traits::{ - CfOptions, CfOptionsExt, MiscExt, OpenOptions, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, -}; - -use crate::server::engine_factory::KvEngineFactory; - -const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; - -#[derive(Clone)] -pub struct KvEngineFactoryV2 { - inner: KvEngineFactory, - // region_id -> (tablet, tablet_suffix) - registry: Arc>>, -} - -impl KvEngineFactoryV2 { - pub fn new(inner: KvEngineFactory) -> Self { - KvEngineFactoryV2 { - inner, - registry: Arc::new(Mutex::new(HashMap::default())), - } - } -} - -impl TabletFactory for KvEngineFactoryV2 { - /// open a tablet according to the OpenOptions. - /// - /// If options.cache_only is true, only open the relevant tablet from - /// `registry`. - /// - /// If options.create_new is true, create a tablet by id and suffix. If the - /// tablet exists, it will fail. - /// - /// If options.create is true, open the tablet with id and suffix if it - /// exists or create it otherwise. - /// - /// If options.skip_cache is true, cache will not be updated. 
- /// - /// Note: options.cache_only and options.create and/or options.create_new - /// cannot be true simultaneously - fn open_tablet( - &self, - id: u64, - suffix: Option, - mut options: OpenOptions, - ) -> Result { - if options.create_new() && suffix.is_none() { - return Err(box_err!( - "suffix should be provided when creating new tablet" - )); - } - - if options.create() || options.create_new() { - options = options.set_cache_only(false); - } - - let mut reg = self.registry.lock().unwrap(); - if let Some(suffix) = suffix { - if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { - // Target tablet exist in the cache - if options.create_new() { - return Err(box_err!( - "region {} {} already exists", - id, - cached_tablet.as_inner().path() - )); - } - return Ok(cached_tablet.clone()); - } else if !options.cache_only() { - let tablet_path = self.tablet_path(id, suffix); - let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; - if !options.skip_cache() { - debug!("Insert a tablet"; "key" => ?(id, suffix)); - reg.insert(id, (tablet.clone(), suffix)); - } - return Ok(tablet); - } - } else if let Some((tablet, _)) = reg.get(&id) { - return Ok(tablet.clone()); - } - - Err(box_err!( - "tablet with region id {} suffix {:?} does not exist", - id, - suffix - )) - } - - fn open_tablet_raw( - &self, - path: &Path, - id: u64, - suffix: u64, - options: OpenOptions, - ) -> Result { - let engine_exist = RocksEngine::exists(path.to_str().unwrap_or_default()); - // Even though neither options.create nor options.create_new are true, if the - // tablet files already exists, we will open it by calling - // inner.create_tablet. In this case, the tablet exists but not in the cache - // (registry). 
- if !options.create() && !options.create_new() && !engine_exist { - return Err(box_err!( - "path {} does not have db", - path.to_str().unwrap_or_default() - )); - }; - - if options.create_new() && engine_exist { - return Err(box_err!( - "region {} {} already exists", - id, - path.to_str().unwrap() - )); - } - - let tablet = self.inner.create_tablet(path, id, suffix)?; - debug!("open tablet"; "key" => ?(id, suffix)); - self.inner.on_tablet_created(id, suffix); - Ok(tablet) - } - - #[inline] - fn create_shared_db(&self) -> Result { - self.open_tablet(0, Some(0), OpenOptions::default().set_create_new(true)) - } - - #[inline] - fn exists_raw(&self, path: &Path) -> bool { - RocksEngine::exists(path.to_str().unwrap_or_default()) - } - - #[inline] - fn tablets_path(&self) -> PathBuf { - self.inner.store_path().join("tablets") - } - - #[inline] - fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { - self.inner - .store_path() - .join(format!("tablets/{}{}_{}", prefix, id, suffix)) - } - - #[inline] - fn mark_tombstone(&self, region_id: u64, suffix: u64) { - let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - // When the full directory path does not exsit, create will return error and in - // this case, we just ignore it. 
- let _ = std::fs::File::create(path); - debug!("tombstone tablet"; "region_id" => region_id, "suffix" => suffix); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - } - - #[inline] - fn is_tombstoned(&self, region_id: u64, suffix: u64) -> bool { - self.tablet_path(region_id, suffix) - .join(TOMBSTONE_MARK) - .exists() - } - - #[inline] - fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(region_id, suffix); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - self.inner.destroy_tablet(&path)?; - self.inner.on_tablet_destroy(region_id, suffix); - Ok(()) - } - - #[inline] - fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { - { - let reg = self.registry.lock().unwrap(); - if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { - return Err(box_err!( - "region {} {} already exists", - region_id, - db.as_inner().path() - )); - } - } - - let db_path = self.tablet_path(region_id, suffix); - std::fs::rename(path, db_path)?; - self.open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_create(true), - ) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let reg = self.registry.lock().unwrap(); - // pick up any tablet and set the shared block cache capacity - if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { - let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - } - Ok(()) - } -} - -impl TabletAccessor for KvEngineFactoryV2 { - #[inline] - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - let reg = 
self.registry.lock().unwrap(); - for (id, (tablet, suffix)) in &*reg { - f(*id, *suffix, tablet) - } - } - - // it have multi tablets. - fn is_single_engine(&self) -> bool { - false - } -} - -#[cfg(test)] -mod tests { - use engine_traits::{OpenOptions, CF_WRITE, SPLIT_PREFIX}; - use tempfile::TempDir; - - use super::*; - use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; - - lazy_static! { - static ref TEST_CONFIG: TikvConfig = { - let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); - let common_test_cfg = - manifest_dir.join("components/test_raftstore/src/common-test.toml"); - TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { - panic!( - "invalid auto generated configuration file {}, err {}", - manifest_dir.display(), - e - ); - }) - }; - } - - fn create_test_tablet_factory(name: &'static str) -> (TempDir, KvEngineFactoryBuilder) { - let cfg = TEST_CONFIG.clone(); - let cache = cfg.storage.block_cache.build_shared_cache(); - let dir = test_util::temp_dir(name, false); - let env = cfg.build_shared_rocks_env(None, None).unwrap(); - - let builder = KvEngineFactoryBuilder::new(env, &cfg, dir.path(), cache); - (dir, builder) - } - - #[test] - fn test_kvengine_factory() { - let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory"); - let factory = builder.build(); - let shared_db = factory.create_shared_db().unwrap(); - - // V1 can only create tablet once - factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap_err(); - - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) - .unwrap(); - assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap(); - assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); - let tablet = factory - .open_tablet(1, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - 
assert_eq!(tablet.as_inner().path(), shared_db.as_inner().path()); - let mut count = 0; - factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { - assert!(id == 0); - assert!(suffix == 0); - count += 1; - }); - assert_eq!(count, 1); - assert!(factory.is_single_engine()); - assert!(shared_db.is_single_engine()); - factory - .set_shared_block_cache_capacity(1024 * 1024) - .unwrap(); - let opt = shared_db.get_options_cf(CF_DEFAULT).unwrap(); - assert_eq!(opt.get_block_cache_capacity(), 1024 * 1024); - } - - #[test] - fn test_kvengine_factory_root_db_implicit_creation() { - let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory"); - let factory = builder.build(); - - // root_db should be created implicitly here - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) - .unwrap(); - - // error is expected since root_db is created already - factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap_err(); - - let mut count = 0; - factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { - assert!(id == 0); - assert!(suffix == 0); - count += 1; - }); - assert_eq!(count, 1); - assert!(factory.is_single_engine()); - factory - .set_shared_block_cache_capacity(1024 * 1024) - .unwrap(); - let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); - assert_eq!(opt.get_block_cache_capacity(), 1024 * 1024); - } - - #[test] - fn test_kvengine_factory_v2() { - let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory_v2"); - - let factory = builder.build_v2(); - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let tablet2 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) - .unwrap(); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - let tablet2 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap(); - assert_eq!(tablet.as_inner().path(), 
tablet2.as_inner().path()); - let tablet2 = factory - .open_tablet(1, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); - - // Only both region id and suffix match can get the tablet from the cache. - factory - .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) - .unwrap_err(); - - let tablet_path = factory.tablet_path(1, 10); - let result = factory.open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)); - result.unwrap_err(); - - factory - .set_shared_block_cache_capacity(1024 * 1024) - .unwrap(); - let opt = tablet.get_options_cf(CF_WRITE).unwrap(); - assert_eq!(opt.get_block_cache_capacity(), 1024 * 1024); - - assert!(factory.exists(1, 10)); - assert!(!factory.exists(1, 11)); - assert!(!factory.exists(2, 10)); - assert!(!factory.exists(2, 11)); - assert!(factory.exists_raw(&tablet_path)); - assert!(!factory.is_tombstoned(1, 10)); - factory.load_tablet(&tablet_path, 1, 10).unwrap_err(); - factory.load_tablet(&tablet_path, 1, 20).unwrap(); - // After we load it as with the new id or suffix, we should be unable to get it - // with the old id and suffix in the cache. - factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap_err(); - factory - .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) - .unwrap(); - - factory - .open_tablet(1, Some(30), OpenOptions::default().set_create_new(true)) - .unwrap(); - // After open a tablet with the same id but higher suffix, we cannot get the old - // one from cache. 
- factory - .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) - .unwrap_err(); - // Destroy/mark tombstone the old tablet will not unregister the new tablet in - // the cache - factory.mark_tombstone(1, 20); - factory - .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) - .unwrap(); - factory.destroy_tablet(1, 20).unwrap(); - factory - .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) - .unwrap(); - - factory.mark_tombstone(1, 30); - assert!(factory.is_tombstoned(1, 30)); - factory.destroy_tablet(1, 30).unwrap(); - - let result = factory.open_tablet(1, Some(30), OpenOptions::default()); - result.unwrap_err(); - - assert!(!factory.is_single_engine()); - - assert!( - factory - .tablet_path_with_prefix(SPLIT_PREFIX, 1, 10) - .ends_with("split_1_10") - ); - } - - #[test] - fn test_existed_db_not_in_registry() { - let (_dir, builder) = create_test_tablet_factory("test_kvengine_factory_v2"); - - let factory = builder.build_v2(); - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - drop(tablet); - let (tablet, _) = factory.registry.lock().unwrap().remove(&1).unwrap(); - drop(tablet); - factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap_err(); - - let tablet_path = factory.tablet_path(1, 10); - let tablet = factory - .open_tablet_raw(&tablet_path, 1, 10, OpenOptions::default()) - .unwrap(); - // the tablet will not inserted in the cache - factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap_err(); - drop(tablet); - - let tablet_path = factory.tablet_path(1, 20); - // No such tablet, so error will be returned. - factory - .open_tablet_raw(&tablet_path, 1, 10, OpenOptions::default()) - .unwrap_err(); - - let _ = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create(true)) - .unwrap(); - - // Now, it should be in the cache. 
- factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap(); - } - - #[test] - fn test_get_live_tablets() { - let (_dir, builder) = create_test_tablet_factory("test_get_live_tablets"); - let factory = builder.build_v2(); - factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - factory - .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let mut count = 0; - factory.for_each_opened_tablet(&mut |id, suffix, _tablet| { - assert!(id == 1 || id == 2); - assert!(suffix == 10); - count += 1; - }); - assert_eq!(count, 2); - } -} diff --git a/src/server/mod.rs b/src/server/mod.rs index d926ca40b2a..1b41dfc4e56 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -6,7 +6,6 @@ mod raft_client; pub mod config; pub mod debug; mod engine_factory; -mod engine_factory_v2; pub mod errors; pub mod gc_worker; pub mod load_statistics; diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index 3cda77ab5a2..b6a5f9d58ab 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ -4,7 +4,7 @@ use std::{convert::TryInto, sync::Arc}; -use engine_traits::{KvEngine, TabletFactory, CF_DEFAULT}; +use engine_traits::{ALL_CFS, CF_DEFAULT}; use file_system::{get_io_rate_limiter, IoPriority, IoType}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use strum::IntoEnumIterator; @@ -15,29 +15,30 @@ use tikv_util::{ }; use crate::{ + config::ConfigurableDb, server::{ttl::TtlCheckerTask, CONFIG_ROCKSDB_GAUGE}, storage::{lock_manager::LockManager, txn::flow_controller::FlowController, TxnScheduler}, }; -pub struct StorageConfigManger { - tablet_factory: Arc + Send + Sync>, +pub struct StorageConfigManger { + configurable_db: K, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, } -unsafe impl Send for StorageConfigManger {} -unsafe impl Sync for StorageConfigManger {} +unsafe impl 
Send for StorageConfigManger {} +unsafe impl Sync for StorageConfigManger {} -impl StorageConfigManger { +impl StorageConfigManger { pub fn new( - tablet_factory: Arc + Send + Sync>, + configurable_db: K, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, ) -> Self { StorageConfigManger { - tablet_factory, + configurable_db, ttl_checker_scheduler, flow_controller, scheduler, @@ -45,13 +46,16 @@ impl StorageConfigManger { } } -impl ConfigManager for StorageConfigManger { +impl ConfigManager + for StorageConfigManger +{ fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { if let Some(ConfigValue::Module(mut block_cache)) = change.remove("block_cache") { if let Some(size) = block_cache.remove("capacity") { if size != ConfigValue::None { let s: ReadableSize = size.into(); - self.tablet_factory.set_shared_block_cache_capacity(s.0)?; + self.configurable_db + .set_shared_block_cache_capacity(s.0 as usize)?; // Write config to metric CONFIG_ROCKSDB_GAUGE .with_label_values(&[CF_DEFAULT, "block_cache_size"]) @@ -67,15 +71,11 @@ impl ConfigManager for StorageConfigMan if let Some(v) = flow_control.remove("enable") { let enable: bool = v.into(); let enable_str = if enable { "true" } else { "false" }; - self.tablet_factory.for_each_opened_tablet( - &mut |_region_id, _suffix, tablet: &K| { - for cf in tablet.cf_names() { - tablet - .set_options_cf(cf, &[("disable_write_stall", enable_str)]) - .unwrap(); - } - }, - ); + for cf in ALL_CFS { + self.configurable_db + .set_cf_config(cf, &[("disable_write_stall", enable_str)]) + .unwrap(); + } self.flow_controller.enable(enable); } } else if let Some(v) = change.get("scheduler_worker_pool_size") { diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 17a5900bea7..973ed245ac8 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs 
@@ -11,9 +11,9 @@ use std::{ time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashMapEntry}; use engine_rocks::FlowInfo; -use engine_traits::{CfNamesExt, FlowControlFactorsExt, OpenOptions, TabletFactory}; +use engine_traits::{CfNamesExt, FlowControlFactorsExt, TabletRegistry}; use rand::Rng; use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; @@ -47,9 +47,9 @@ impl Drop for TabletFlowController { } impl TabletFlowController { - pub fn new( + pub fn new( config: &FlowControlConfig, - tablet_factory: Arc + Send + Sync>, + registry: TabletRegistry, flow_info_receiver: Receiver, ) -> Self { let (tx, rx) = mpsc::sync_channel(5); @@ -69,7 +69,7 @@ impl TabletFlowController { handle: Some(FlowInfoDispatcher::start( rx, flow_info_receiver, - tablet_factory, + registry, flow_checkers, limiters, config.clone(), @@ -86,10 +86,10 @@ impl TabletFlowController { struct FlowInfoDispatcher; impl FlowInfoDispatcher { - fn start( + fn start( rx: Receiver, flow_info_receiver: Receiver, - tablet_factory: Arc + Send + Sync>, + registry: TabletRegistry, flow_checkers: Arc>>>, limiters: Limiters, config: FlowControlConfig, @@ -116,32 +116,6 @@ impl FlowInfoDispatcher { Err(_) => {} } - let insert_limiter_and_checker = |region_id, suffix| -> FlowChecker { - let engine = tablet_factory - .open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_cache_only(true), - ) - .unwrap(); - let mut v = limiters.as_ref().write().unwrap(); - let discard_ratio = Arc::new(AtomicU32::new(0)); - let limiter = v.entry(region_id).or_insert(( - Arc::new( - ::builder(f64::INFINITY) - .refill(Duration::from_millis(1)) - .build(), - ), - discard_ratio, - )); - FlowChecker::new_with_tablet_suffix( - &config, - engine, - limiter.1.clone(), - limiter.0.clone(), - suffix, - ) - }; let msg = flow_info_receiver.recv_deadline(deadline); match msg.clone() { Ok(FlowInfo::L0(_cf, _, region_id, suffix)) @@ -165,22 +139,43 @@ impl FlowInfoDispatcher { } 
Ok(FlowInfo::Created(region_id, suffix)) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); - let checker = checkers - .entry(region_id) - .or_insert_with(|| insert_limiter_and_checker(region_id, suffix)); + let checker = match checkers.entry(region_id) { + HashMapEntry::Occupied(e) => e.into_mut(), + HashMapEntry::Vacant(e) => { + let engine = if let Some(mut c) = registry.get(region_id) && let Some(t) = c.latest() { + t.clone() + } else { + continue; + }; + let mut v = limiters.as_ref().write().unwrap(); + let discard_ratio = Arc::new(AtomicU32::new(0)); + let limiter = v.entry(region_id).or_insert(( + Arc::new( + ::builder(f64::INFINITY) + .refill(Duration::from_millis(1)) + .build(), + ), + discard_ratio, + )); + e.insert(FlowChecker::new_with_tablet_suffix( + &config, + engine, + limiter.1.clone(), + limiter.0.clone(), + suffix, + )) + }, + }; // check if the checker's engine is exactly (region_id, suffix) // if checker.suffix < suffix, it means its tablet is old and needs the // refresh if checker.tablet_suffix() < suffix { - let engine = tablet_factory - .open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_cache_only(true), - ) - .unwrap(); - checker.set_engine(engine); - checker.set_tablet_suffix(suffix); + let cached = registry.get(region_id); + // None means the region is destroyed. 
+ if let Some(mut c) = cached && let Some(engine) = c.latest() { + checker.set_engine(engine.clone()); + checker.set_tablet_suffix(suffix); + } } } Ok(FlowInfo::Destroyed(region_id, suffix)) => { @@ -296,35 +291,43 @@ impl TabletFlowController { #[cfg(test)] mod tests { use engine_rocks::FlowInfo; - use engine_traits::DummyFactory; + use engine_traits::SingletonFactory; + use tempfile::TempDir; use super::{ super::{singleton_flow_controller::tests::*, FlowController}, *, }; - fn create_tablet_flow_controller() -> (FlowController, mpsc::SyncSender, EngineStub) { + fn create_tablet_flow_controller() -> ( + TempDir, + FlowController, + mpsc::SyncSender, + TabletRegistry, + ) { let (tx, rx) = mpsc::sync_channel(0); - let root_path = "/tmp"; + let temp_dir = tempfile::tempdir().unwrap(); let stub = EngineStub::new(); - let factory = DummyFactory::::new(Some(stub.clone()), root_path.to_string()); - let tablet_factory = Arc::new(factory); + let factory = Box::new(SingletonFactory::new(stub)); + let registry = TabletRegistry::new(factory, temp_dir.path()).unwrap(); ( + temp_dir, FlowController::Tablet(TabletFlowController::new( &FlowControlConfig::default(), - tablet_factory, + registry.clone(), rx, )), tx, - stub, + registry, ) } #[test] fn test_tablet_flow_controller_basic() { - let (flow_controller, tx, _) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + reg.load(region_id, tablet_suffix, false).unwrap(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -348,9 +351,11 @@ mod tests { #[test] fn test_tablet_flow_controller_memtable() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let stub = 
cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -365,9 +370,11 @@ mod tests { #[test] fn test_tablet_flow_controller_l0() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -382,9 +389,11 @@ mod tests { #[test] fn test_tablet_flow_controller_pending_compaction_bytes() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 2ea66ef1222..2508b544285 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -12,7 +12,6 @@ use std::{ use api_version::{ApiV1, ApiV2, KvFormat}; use collections::HashMap; -use engine_traits::DummyFactory; use errors::{extract_key_error, extract_region_error}; use futures::executor::block_on; use grpcio::*; @@ -267,7 +266,7 @@ fn test_scale_scheduler_pool() { cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( - Arc::new(DummyFactory::new(Some(kv_engine), "".to_string())), + kv_engine, scheduler, flow_controller, storage.get_scheduler(), From d86a449d7f5b656cef28576f166e73291f501d77 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 9 Dec 2022 15:54:07 +0800 Subject: [PATCH 400/676] raftstore-v2: add DATA_PREFIX 
(#13917) ref tikv/tikv#12842 Because v2 doesn't share rocksdb, so it's possible to not write a prefix when writing keys. However, because rocksdb doesn't support specifying infinite upper bound in various APIs like properties, so we should solve those issues before landing prefix-less write. Signed-off-by: Jay Lee --- components/raftstore-v2/src/fsm/apply.rs | 23 ++++++++++++--- .../src/operation/command/admin/split.rs | 17 +++++++---- .../raftstore-v2/src/operation/command/mod.rs | 8 ++--- .../src/operation/command/write/mod.rs | 22 +++++++++++--- components/raftstore-v2/src/raft/apply.rs | 28 +++++++++++------- .../tests/failpoints/test_basic_write.rs | 18 ++++++------ .../tests/integrations/cluster.rs | 19 ++++++++++-- .../tests/integrations/test_basic_write.rs | 15 +++++----- .../tests/integrations/test_conf_change.rs | 29 +++++++++---------- 9 files changed, 115 insertions(+), 64 deletions(-) diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 6e2921a0c0d..7e9a135b498 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -7,17 +7,19 @@ use std::{ Arc, }, task::{Context, Poll}, + time::{Duration, Instant}, }; use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, TabletRegistry}; -use futures::{Future, StreamExt}; +use futures::{compat::Future01CompatExt, Future, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; use slog::Logger; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, + timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, }; @@ -89,9 +91,22 @@ impl ApplyFsm { impl ApplyFsm { pub async fn handle_all_tasks(&mut self) { loop { - let mut task = match self.receiver.next().await { - Some(t) => t, - None => return, + let timeout = GLOBAL_TIMER_HANDLE + .delay(Instant::now() + Duration::from_secs(10)) + .compat(); + let res = 
futures::select! { + res = self.receiver.next().fuse() => res, + _ = timeout.fuse() => None, + }; + let mut task = match res { + Some(r) => r, + None => { + self.apply.release_memory(); + match self.receiver.next().await { + Some(t) => t, + None => return, + } + } }; loop { match task { diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 13a5d168915..7de49a716c3 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -265,7 +265,7 @@ impl Apply { .open_tablet(region_id, Some(log_index), &path) .unwrap(); // Remove the old write batch. - self.write_batch_mut().take(); + self.write_batch.take(); self.publish_tablet(tablet); self.region_state_mut() @@ -832,16 +832,21 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); - assert!(!WriteBatch::is_empty( - apply.write_batch_mut().as_ref().unwrap() - )); + assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"k05", 70, vec![71, 72, 73])); req.set_splits(splits); apply.apply_batch_split(&req, 50).unwrap(); - assert!(apply.write_batch_mut().is_none()); - assert_eq!(apply.tablet().get_value(b"k04").unwrap().unwrap(), b"v4"); + assert!(apply.write_batch.is_none()); + assert_eq!( + apply + .tablet() + .get_value(&keys::data_key(b"k04")) + .unwrap() + .unwrap(), + b"v4" + ); } } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 3bb6b7b3852..3ee3430a140 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -341,14 +341,14 @@ impl Apply { } if !e.get_data().is_empty() { let mut 
set_save_point = false; - if let Some(wb) = self.write_batch_mut() { + if let Some(wb) = &mut self.write_batch { wb.set_save_point(); set_save_point = true; } let resp = match self.apply_entry(&e).await { Ok(resp) => resp, Err(e) => { - if let Some(wb) = self.write_batch_mut() { + if let Some(wb) = &mut self.write_batch { if set_save_point { wb.rollback_to_save_point().unwrap(); } else { @@ -500,7 +500,7 @@ impl Apply { #[inline] pub fn flush(&mut self) { - if let Some(wb) = self.write_batch_mut() && !wb.is_empty() { + if let Some(wb) = &mut self.write_batch && !wb.is_empty() { let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); if let Err(e) = wb.write_opt(&write_opt) { @@ -509,7 +509,7 @@ impl Apply { if wb.data_size() <= APPLY_WB_SHRINK_SIZE { wb.clear(); } else { - self.write_batch_mut().take(); + self.write_batch.take(); } } let callbacks = self.callbacks_mut(); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 59c5679f95f..f9cac15d899 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -113,11 +113,21 @@ impl Apply { #[inline] pub fn apply_put(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { util::check_key_in_region(key, self.region_state().get_region())?; + // Technically it's OK to remove prefix for raftstore v2. But rocksdb doesn't + // support specifying infinite upper bound in various APIs. 
+ keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector - self.write_batch_or_default().put(key, value) + self.write_batch + .as_mut() + .unwrap() + .put(&self.key_buffer, value) } else { - self.write_batch_or_default().put_cf(cf, key, value) + self.write_batch + .as_mut() + .unwrap() + .put_cf(cf, &self.key_buffer, value) }; res.unwrap_or_else(|e| { panic!( @@ -138,11 +148,15 @@ impl Apply { #[inline] pub fn apply_delete(&mut self, cf: &str, key: &[u8]) -> Result<()> { util::check_key_in_region(key, self.region_state().get_region())?; + keys::data_key_with_buffer(key, &mut self.key_buffer); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector - self.write_batch_or_default().delete(key) + self.write_batch.as_mut().unwrap().delete(&self.key_buffer) } else { - self.write_batch_or_default().delete_cf(cf, key) + self.write_batch + .as_mut() + .unwrap() + .delete_cf(cf, &self.key_buffer) }; res.unwrap_or_else(|e| { panic!( diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 421c2c476f7..d4a4cf61602 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,7 @@ use std::{mem, sync::Arc}; -use engine_traits::{CachedTablet, KvEngine, TabletRegistry}; +use engine_traits::{CachedTablet, KvEngine, TabletRegistry, WriteBatch}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; @@ -21,7 +21,9 @@ pub struct Apply { /// publish the update of the tablet remote_tablet: CachedTablet, tablet: EK, - write_batch: Option, + pub write_batch: Option, + /// A buffer for encoding key. 
+ pub key_buffer: Vec, tablet_registry: TabletRegistry, @@ -67,6 +69,7 @@ impl Apply { region_state, tablet_registry, read_scheduler, + key_buffer: vec![], res_reporter, logger, } @@ -88,16 +91,11 @@ impl Apply { } #[inline] - pub fn write_batch_mut(&mut self) -> &mut Option { - &mut self.write_batch - } - - #[inline] - pub fn write_batch_or_default(&mut self) -> &mut EK::WriteBatch { - if self.write_batch.is_none() { - self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); + pub fn ensure_write_buffer(&mut self) { + if self.write_batch.is_some() { + return; } - self.write_batch.as_mut().unwrap() + self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); } #[inline] @@ -170,4 +168,12 @@ impl Apply { pub fn take_admin_result(&mut self) -> Vec { mem::take(&mut self.admin_cmd_result) } + + #[inline] + pub fn release_memory(&mut self) { + mem::take(&mut self.key_buffer); + if self.write_batch.as_ref().map_or(false, |wb| wb.is_empty()) { + self.write_batch = None; + } + } } diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs index be5ccf8316c..b20984a9837 100644 --- a/components/raftstore-v2/tests/failpoints/test_basic_write.rs +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -12,8 +12,8 @@ use crate::cluster::Cluster; /// Check if write batch is correctly maintained during apply. #[test] fn test_write_batch_rollback() { - let cluster = Cluster::default(); - let router = &cluster.routers[0]; + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); @@ -25,9 +25,6 @@ fn test_write_batch_rollback() { // Make several entries to batch in apply thread. 
fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); - let tablet_registry = cluster.node(0).tablet_registry(); - let tablet = tablet_registry.get(2).unwrap().latest().unwrap().clone(); - // Good proposal should be committed. let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); router.send(2, msg).unwrap(); @@ -58,8 +55,10 @@ fn test_write_batch_rollback() { ); let resp = block_on(sub1.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key"), Ok(None)); - assert_eq!(tablet.get_value(b"key1").unwrap().unwrap(), b"value"); + + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); + assert_eq!(snap.get_value(b"key1").unwrap().unwrap(), b"value"); fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); @@ -91,6 +90,7 @@ fn test_write_batch_rollback() { ); let resp = block_on(sub1.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key2"), Ok(None)); - assert_eq!(tablet.get_value(b"key3").unwrap().unwrap(), b"value"); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key2"), Ok(None)); + assert_eq!(snap.get_value(b"key3").unwrap().unwrap(), b"value"); } diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 3e2ced3df3c..11f8094612b 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -17,21 +17,21 @@ use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactory}, + kv::{KvTestEngine, KvTestSnapshot, TestTabletFactory}, raft::RaftTestEngine, }; use engine_traits::{TabletRegistry, ALL_CFS}; use futures::executor::block_on; use kvproto::{ metapb::{self, RegionEpoch, Store}, - raft_cmdpb::{RaftCmdRequest, 
RaftCmdResponse}, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, Request}, raft_serverpb::RaftMessage, }; use pd_client::RpcClient; use raft::eraftpb::MessageType; use raftstore::store::{ region_meta::{RegionLocalState, RegionMeta}, - Config, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, }; use raftstore_v2::{ create_store_batch_system, @@ -45,6 +45,7 @@ use tikv_util::{ config::{ReadableDuration, VersionTrack}, store::new_peer, }; +use txn_types::WriteBatchFlags; pub struct TestRouter(RaftRouter); @@ -151,6 +152,17 @@ impl TestRouter { req } + pub fn stale_snapshot(&mut self, region_id: u64) -> RegionSnapshot { + let mut req = self.new_request_for(region_id); + let header = req.mut_header(); + header.set_flags(WriteBatchFlags::STALE_READ.bits()); + header.set_flag_data(vec![0; 8]); + let mut snap_req = Request::default(); + snap_req.set_cmd_type(CmdType::Snap); + req.mut_requests().push(snap_req); + block_on(self.get_snapshot(req)).unwrap() + } + pub fn region_detail(&self, region_id: u64) -> metapb::Region { let RegionLocalState { id, @@ -306,6 +318,7 @@ impl TestNode { router } + #[allow(dead_code)] pub fn tablet_registry(&self) -> &TabletRegistry { &self.running_state().unwrap().registry } diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index a0d3d1ac34a..807d64de756 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -112,8 +112,8 @@ fn test_basic_write() { #[test] fn test_put_delete() { - let cluster = Cluster::default(); - let router = &cluster.routers[0]; + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; let mut req = router.new_request_for(2); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); @@ 
-123,16 +123,16 @@ fn test_put_delete() { router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let registry = cluster.node(0).tablet_registry(); - let tablet = registry.get(2).unwrap().latest().unwrap().clone(); - assert!(tablet.get_value(b"key").unwrap().is_none()); + let snap = router.stale_snapshot(2); + assert!(snap.get_value(b"key").unwrap().is_none()); let (msg, mut sub) = PeerMsg::raft_command(req.clone()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_eq!(tablet.get_value(b"key").unwrap().unwrap(), b"value"); + let snap = router.stale_snapshot(2); + assert_eq!(snap.get_value(b"key").unwrap().unwrap(), b"value"); let mut delete_req = Request::default(); delete_req.set_cmd_type(CmdType::Delete); @@ -145,5 +145,6 @@ fn test_put_delete() { assert!(block_on(sub.wait_committed())); let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key"), Ok(None)); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 4f3ffbbf24c..1b9ca50daf7 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -12,10 +12,9 @@ use crate::cluster::Cluster; #[test] fn test_simple_change() { - let cluster = Cluster::with_node_count(2, None); + let mut cluster = Cluster::with_node_count(2, None); let region_id = 2; - let router0 = &cluster.routers[0]; - let mut req = router0.new_request_for(2); + let mut req = cluster.routers[0].new_request_for(2); let admin_req = req.mut_admin_request(); admin_req.set_cmd_type(AdminCmdType::ChangePeer); admin_req @@ -24,12 
+23,12 @@ fn test_simple_change() { let store_id = cluster.node(1).id(); let new_peer = new_learner_peer(store_id, 10); admin_req.mut_change_peer().set_peer(new_peer.clone()); - let resp = router0.command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; let leader_peer = req.get_header().get_peer().clone(); - let meta = router0 + let meta = cluster.routers[0] .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); let match_index = meta.raft_apply.applied_index; @@ -39,8 +38,7 @@ fn test_simple_change() { // So heartbeat will create a learner. cluster.dispatch(2, vec![]); - let router1 = &cluster.routers[1]; - let meta = router1 + let meta = cluster.routers[1] .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); assert_eq!(meta.raft_status.id, 10, "{:?}", meta); @@ -52,34 +50,33 @@ fn test_simple_change() { ); // Trigger the raft tick to replica the log to the learner and execute the // snapshot task. 
- router0 + cluster.routers[0] .send(region_id, PeerMsg::Tick(PeerTick::Raft)) .unwrap(); cluster.dispatch(region_id, vec![]); // write one kv after snapshot let (key, val) = (b"key", b"value"); - let mut write_req = router0.new_request_for(region_id); + let mut write_req = cluster.routers[0].new_request_for(region_id); let mut put_req = Request::default(); put_req.set_cmd_type(CmdType::Put); put_req.mut_put().set_key(key.to_vec()); put_req.mut_put().set_value(val.to_vec()); write_req.mut_requests().push(put_req); let (msg, _) = PeerMsg::raft_command(write_req.clone()); - router0.send(region_id, msg).unwrap(); + cluster.routers[0].send(region_id, msg).unwrap(); std::thread::sleep(Duration::from_millis(1000)); cluster.dispatch(region_id, vec![]); - let meta = router1 + let meta = cluster.routers[1] .must_query_debug_info(region_id, Duration::from_secs(3)) .unwrap(); // the learner truncated index muse be equal the leader applied index and can // read the new written kv. assert_eq!(match_index, meta.raft_apply.truncated_state.index); assert!(meta.raft_apply.applied_index >= match_index); - let registry = cluster.node(1).tablet_registry(); - let tablet = registry.get(region_id).unwrap().latest().unwrap().clone(); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), val); + let snap = cluster.routers[1].stale_snapshot(2); + assert_eq!(snap.get_value(key).unwrap().unwrap(), val); req.mut_header() .mut_region_epoch() @@ -87,12 +84,12 @@ fn test_simple_change() { req.mut_admin_request() .mut_change_peer() .set_change_type(ConfChangeType::RemoveNode); - let resp = router0.command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; let leader_peer = req.get_header().get_peer().clone(); - let meta = router0 + let meta = cluster.routers[0] .must_query_debug_info(2, Duration::from_secs(3)) 
.unwrap(); assert_eq!(meta.region_state.epoch.version, epoch.get_version()); From d20569b0922533b5b01a2bd34f5f778045148e15 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:02:51 +0800 Subject: [PATCH 401/676] Raftstore-v2: transfer leader (#13793) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang --- Cargo.lock | 3 + components/raftstore-v2/Cargo.toml | 3 + components/raftstore-v2/src/batch/store.rs | 6 + components/raftstore-v2/src/fsm/peer.rs | 52 ++- .../src/operation/command/admin/mod.rs | 20 + .../command/admin/transfer_leader.rs | 421 ++++++++++++++++++ .../raftstore-v2/src/operation/command/mod.rs | 11 +- .../raftstore-v2/src/operation/ready/mod.rs | 19 +- components/raftstore-v2/src/raft/peer.rs | 49 +- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../integrations/test_transfer_leader.rs | 154 +++++++ components/raftstore/src/store/fsm/mod.rs | 2 +- components/raftstore/src/store/mod.rs | 7 +- components/raftstore/src/store/peer.rs | 4 +- 14 files changed, 732 insertions(+), 20 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/admin/transfer_leader.rs create mode 100644 components/raftstore-v2/tests/integrations/test_transfer_leader.rs diff --git a/Cargo.lock b/Cargo.lock index a7d72121032..9b3eccfda51 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4344,6 +4344,7 @@ name = "raftstore-v2" version = "0.1.0" dependencies = [ "batch-system", + "bytes", "causal_ts", "collections", "concurrency_manager", @@ -4358,12 +4359,14 @@ dependencies = [ "keys", "kvproto", "log_wrappers", + "parking_lot 0.12.0", "pd_client", "prometheus", "protobuf", "raft", "raft-proto", "raftstore", + "rand 0.8.5", "resource_metering", "slog", "slog-global", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 46ed20f8d10..1d6b67ad129 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -30,6 +30,7 @@ cloud-azure = 
["raftstore/cloud-azure"] [dependencies] batch-system = { workspace = true } +bytes = "1.0" causal_ts = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } @@ -43,12 +44,14 @@ futures = { version = "0.3", features = ["compat"] } keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } +parking_lot = "0.12" pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { workspace = true } +rand = "0.8.3" resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 96cbee19e4e..997f8da7a9c 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -21,6 +21,7 @@ use engine_traits::{Engines, KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType}; use futures::{compat::Future01CompatExt, FutureExt}; use kvproto::{ + disk_usage::DiskUsage, metapb::Store, raft_serverpb::{PeerState, RaftMessage}, }; @@ -77,6 +78,10 @@ pub struct StoreContext { pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, pub read_scheduler: Scheduler>, + + /// Disk usage for the store itself. 
+ pub self_disk_usage: DiskUsage, + pub snap_mgr: TabletSnapManager, pub pd_scheduler: Scheduler, } @@ -345,6 +350,7 @@ where tablet_registry: self.tablet_registry.clone(), apply_pool: self.apply_pool.clone(), read_scheduler: self.read_scheduler.clone(), + self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), pd_scheduler: self.pd_scheduler.clone(), }; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 6254e1975fd..c4dded64e62 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -6,8 +6,8 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use raftstore::store::{Config, Transport}; +use engine_traits::{KvEngine, RaftEngine, TabletFactory, TabletRegistry}; +use raftstore::store::{Config, LocksStatus, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -34,6 +34,7 @@ pub struct PeerFsm { /// twice accidentally. 
tick_registry: u16, is_stopped: bool, + reactivate_memory_lock_ticks: usize, } impl PeerFsm { @@ -51,6 +52,7 @@ impl PeerFsm { receiver: rx, tick_registry: 0, is_stopped: false, + reactivate_memory_lock_ticks: 0, }); Ok((tx, fsm)) } @@ -127,6 +129,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, Self { fsm, store_ctx } } + #[inline] + fn schedule_pending_ticks(&mut self) { + let pending_ticks = self.fsm.peer.take_pending_ticks(); + for tick in pending_ticks { + if tick == PeerTick::ReactivateMemoryLock { + self.fsm.reactivate_memory_lock_ticks = 0; + } + self.schedule_tick(tick); + } + } + pub fn schedule_tick(&mut self, tick: PeerTick) { assert!(PeerTick::VARIANT_COUNT <= u16::BITS as usize); let idx = tick as usize; @@ -200,7 +213,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => unimplemented!(), PeerTick::CheckLeaderLease => unimplemented!(), - PeerTick::ReactivateMemoryLock => unimplemented!(), + PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => unimplemented!(), PeerTick::CheckLongUncommitted => unimplemented!(), } @@ -209,7 +222,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(msg) => self.fsm.peer.on_raft_message(self.store_ctx, msg), + PeerMsg::RaftMessage(msg) => { + self.fsm.peer.on_raft_message(self.store_ctx, msg); + self.schedule_pending_ticks(); + } PeerMsg::RaftQuery(cmd) => { self.on_receive_command(cmd.send_time); self.on_query(cmd.request, cmd.ch) @@ -248,4 +264,32 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, // TODO: instead of propose pending commands immediately, we should use timeout. 
self.fsm.peer.propose_pending_writes(self.store_ctx); } + + pub fn on_reactivate_memory_lock_tick(&mut self) { + let mut pessimistic_locks = self.fsm.peer.txn_ext().pessimistic_locks.write(); + + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. + // And this tick is currently only used for the leader transfer failure case. + if !self.fsm.peer().is_leader() + || pessimistic_locks.status != LocksStatus::TransferringLeader + { + return; + } + + self.fsm.reactivate_memory_lock_ticks += 1; + let transferring_leader = self.fsm.peer.raft_group().raft.lead_transferee.is_some(); + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && self.fsm.reactivate_memory_lock_ticks + >= self.store_ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + self.fsm.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.schedule_tick(PeerTick::ReactivateMemoryLock); + } + } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index c1e25474701..388bf72e01e 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -2,6 +2,7 @@ mod conf_change; mod split; +mod transfer_leader; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}; @@ -19,6 +20,7 @@ use raftstore::{ use slog::info; pub use split::{SplitInit, SplitResult, SPLIT_PREFIX}; use tikv_util::box_err; +use txn_types::WriteBatchFlags; use self::conf_change::ConfChangeResult; use crate::{ @@ -29,8 +31,11 @@ use crate::{ #[derive(Debug)] pub enum AdminCmdResult { + // No side effect produced by the command + None, 
SplitRegion(SplitResult), ConfChange(ConfChangeResult), + TransferLeader(u64), } impl Peer { @@ -81,6 +86,21 @@ impl Peer { "Split is deprecated. Please use BatchSplit instead." )), AdminCmdType::BatchSplit => self.propose_split(ctx, req), + AdminCmdType::TransferLeader => { + // Containing TRANSFER_LEADER_PROPOSAL flag means the this transfer leader + // request should be proposed to the raft group + if WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL) + { + let data = req.write_to_bytes().unwrap(); + self.propose_with_ctx(ctx, data, vec![]) + } else { + if self.propose_transfer_leader(ctx, req, ch) { + self.set_has_ready(); + } + return; + } + } _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs new file mode 100644 index 00000000000..71853d0007b --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -0,0 +1,421 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp::Ordering; + +use bytes::Bytes; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use fail::fail_point; +use kvproto::{ + disk_usage::DiskUsage, + metapb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, AdminResponse, CmdType, PutRequest, RaftCmdRequest, Request, + TransferLeaderRequest, + }, +}; +use parking_lot::RwLockWriteGuard; +use raft::{eraftpb, ProgressState, Storage}; +use raftstore::{ + store::{ + fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, + LocksStatus, Transport, TRANSFER_LEADER_COMMAND_REPLY_CTX, + }, + Result, +}; +use rand::prelude::SliceRandom; +use slog::info; +use txn_types::WriteBatchFlags; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick}, +}; + +fn get_transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { + if !msg.has_admin_request() { + return None; + } + let req = msg.get_admin_request(); + if !req.has_transfer_leader() { + return None; + } + + Some(req.get_transfer_leader()) +} + +impl Peer { + /// Return true if the transfer leader request is accepted. + /// + /// When transferring leadership begins, leader sends a pre-transfer + /// to target follower first to ensures it's ready to become leader. + /// After that the real transfer leader process begin. + /// + /// 1. pre_transfer_leader on leader: + /// Leader will send a MsgTransferLeader to follower. + /// 2. execute_transfer_leader on follower + /// If follower passes all necessary checks, it will reply an + /// ACK with type MsgTransferLeader and its promised applied index. + /// 3. ready_to_transfer_leader on leader: + /// Leader checks if it's appropriate to transfer leadership. If it + /// does, it calls raft transfer_leader API to do the remaining work. 
+ /// + /// Additional steps when there are remaining pessimistic + /// locks to propose (detected in function on_transfer_leader_msg). + /// 1. Leader firstly proposes pessimistic locks and then proposes a + /// TransferLeader command. + /// 2. The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// + /// See also: tikv/rfcs#37. + pub fn propose_transfer_leader( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ch: CmdResChannel, + ) -> bool { + ctx.raft_metrics.propose.transfer_leader.inc(); + + let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); + let prs = self.raft_group().raft.prs(); + + // Find the target with the largest matched index among the candidate + // transferee peers + let (_, peers) = transfer_leader + .get_peers() + .iter() + .filter(|peer| peer.id != self.peer().id) + .fold((0, vec![]), |(max_matched, mut chosen), p| { + if let Some(pr) = prs.get(p.id) { + match pr.matched.cmp(&max_matched) { + Ordering::Greater => (pr.matched, vec![p]), + Ordering::Equal => { + chosen.push(p); + (max_matched, chosen) + } + Ordering::Less => (max_matched, chosen), + } + } else { + (max_matched, chosen) + } + }); + let peer = match peers.len() { + 0 => transfer_leader.get_peer(), + 1 => peers.get(0).unwrap(), + _ => peers.choose(&mut rand::thread_rng()).unwrap(), + }; + + let transferee = if peer.id == self.peer().id { + false + } else { + self.pre_transfer_leader(peer) + }; + + // transfer leader command doesn't need to replicate log and apply, so we + // return immediately. 
Note that this command may fail, we can view it just as + // an advice + ch.set_result(make_transfer_leader_response()); + + transferee + } + + fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + if self.raft_group().raft.has_pending_conf() { + info!( + self.logger, + "reject transfer leader due to pending conf change"; + "peer" => ?peer, + ); + return false; + } + + // Broadcast heartbeat to make sure followers commit the entries immediately. + // It's only necessary to ping the target peer, but ping all for simplicity. + self.raft_group_mut().ping(); + + // todo: entry cache warmup + + let mut msg = eraftpb::Message::new(); + msg.set_to(peer.get_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_from(self.peer_id()); + // log term here represents the term of last log. For leader, the term of last + // log is always its current term. Not just set term because raft library + // forbids setting it for MsgTransferLeader messages. + msg.set_log_term(self.term()); + self.raft_group_mut().raft.msgs.push(msg); + true + } + + pub fn on_transfer_leader_msg( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + peer_disk_usage: DiskUsage, + ) { + // log_term is set by original leader, represents the term last log is written + // in, which should be equal to the original leader's term. 
+ if msg.get_log_term() != self.term() { + return; + } + + if !self.is_leader() { + self.execute_transfer_leader(ctx, msg.get_from(), peer_disk_usage, false); + } else { + let from = match self.peer_from_cache(msg.get_from()) { + Some(p) => p, + None => return, + }; + match self.ready_to_transfer_leader(ctx, msg.get_index(), &from) { + Some(reason) => { + info!( + self.logger, + "reject to transfer leader"; + "to" => ?from, + "reason" => reason, + "index" => msg.get_index(), + "last_index" => self.storage().last_index().unwrap_or_default(), + ); + } + None => { + self.propose_pending_writes(ctx); + if self.propose_locks_before_transfer_leader(ctx, msg) { + // If some pessimistic locks are just proposed, we propose another + // TransferLeader command instead of transferring leader immediately. + info!( + self.logger, + "propose transfer leader command"; + "to" => ?from, + ); + let mut cmd = + new_admin_request(self.region().get_id(), self.peer().clone()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + // Set this flag to propose this command like a normal proposal. 
+ cmd.mut_header() + .set_flags(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL.bits()); + cmd.mut_admin_request() + .set_cmd_type(AdminCmdType::TransferLeader); + cmd.mut_admin_request().mut_transfer_leader().set_peer(from); + if let (PeerMsg::RaftCommand(req), sub) = PeerMsg::raft_command(cmd) { + self.on_admin_command(ctx, req.request, req.ch); + } else { + unreachable!(); + } + } else { + info!( + self.logger, + "transfer leader"; + "peer" => ?from, + ); + self.raft_group_mut().transfer_leader(from.get_id()); + } + } + } + } + } + + pub fn execute_transfer_leader( + &mut self, + ctx: &mut StoreContext, + from: u64, + peer_disk_usage: DiskUsage, + reply_cmd: bool, // whether it is a reply to a TransferLeader command + ) { + let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); + if pending_snapshot + || from != self.leader_id() + // Transfer leader to node with disk full will lead to write availablity downback. + // But if the current leader is disk full, and send such request, we should allow it, + // because it may be a read leader balance request. 
+ || (!matches!(ctx.self_disk_usage, DiskUsage::Normal) && + matches!(peer_disk_usage,DiskUsage::Normal)) + { + info!( + self.logger, + "reject transferring leader"; + "from" => from, + "pending_snapshot" => pending_snapshot, + "disk_usage" => ?ctx.self_disk_usage, + ); + return; + } + + let mut msg = eraftpb::Message::new(); + msg.set_from(self.peer_id()); + msg.set_to(self.leader_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_index(self.storage().apply_state().applied_index); + msg.set_log_term(self.term()); + if reply_cmd { + msg.set_context(Bytes::from_static(TRANSFER_LEADER_COMMAND_REPLY_CTX)); + } + self.raft_group_mut().raft.msgs.push(msg); + } + + fn ready_to_transfer_leader( + &self, + ctx: &mut StoreContext, + mut index: u64, + peer: &metapb::Peer, + ) -> Option<&'static str> { + let status = self.raft_group().status(); + let progress = status.progress.unwrap(); + + if !progress.conf().voters().contains(peer.id) { + return Some("non voter"); + } + + for (id, pr) in progress.iter() { + if pr.state == ProgressState::Snapshot { + return Some("pending snapshot"); + } + if *id == peer.id && index == 0 { + // index will be zero if it's sent from an instance without + // pre-transfer-leader feature. Set it to matched to make it + // possible to transfer leader to an older version. It may be + // useful during rolling restart. + index = pr.matched; + } + } + + if self.raft_group().raft.has_pending_conf() + || self.raft_group().raft.pending_conf_index > index + { + return Some("pending conf change"); + } + + if self.storage().last_index().unwrap_or_default() + >= index + ctx.cfg.leader_transfer_max_log_lag + { + return Some("log gap"); + } + None + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. 
+ // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. + fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. + let txn_ext = self.txn_ext().clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. + // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. 
Either limit the total size of + // pessimistic locks in a region, or split commands here. + let mut cmd = RaftCmdRequest::default(); + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. + let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + let mut put = PutRequest::default(); + put.set_cf(CF_LOCK.to_string()); + put.set_key(key.as_encoded().to_owned()); + put.set_value(lock.to_lock().to_bytes()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + cmd.mut_requests().push(req); + } + } + if cmd.get_requests().is_empty() { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. + return true; + } + cmd.mut_header().set_region_id(self.region_id()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + cmd.mut_header().set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", cmd.get_requests().len(); + ); + let (PeerMsg::RaftCommand(req), sub) = PeerMsg::raft_command(cmd) else {unreachable!()}; + self.on_write_command(ctx, req.request, req.ch); + true + } +} + +impl Apply { + pub fn apply_transfer_leader( + &mut self, + req: &AdminRequest, + term: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.transfer_leader.all.inc(); + let resp = AdminResponse::default(); + + let peer = req.get_transfer_leader().get_peer(); + // Only execute TransferLeader if the expected new leader is self. 
+ if peer.get_id() == self.peer().get_id() { + Ok((resp, AdminCmdResult::TransferLeader(term))) + } else { + Ok((resp, AdminCmdResult::None)) + } + } +} + +impl Peer { + pub fn on_transfer_leader(&mut self, ctx: &mut StoreContext, term: u64) { + // If the term has changed between proposing and executing the TransferLeader + // request, ignore it because this request may be stale. + if term != self.term() { + return; + } + + // Reply to leader that it is ready to transfer leader now. + self.execute_transfer_leader(ctx, self.leader_id(), DiskUsage::Normal, true); + + self.set_has_ready(); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 3ee3430a140..3d0a17ece62 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -297,6 +297,7 @@ impl Peer { for admin_res in apply_res.admin_result { match admin_res { + AdminCmdResult::None => unreachable!(), AdminCmdResult::ConfChange(conf_change) => { self.on_apply_res_conf_change(ctx, conf_change) } @@ -305,6 +306,7 @@ impl Peer { derived_index, tablet_index, }) => self.on_apply_res_split(ctx, derived_index, tablet_index, regions), + AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), } } @@ -448,7 +450,9 @@ impl Apply { AdminCmdType::PrepareMerge => unimplemented!(), AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), - AdminCmdType::TransferLeader => unreachable!(), + AdminCmdType::TransferLeader => { + self.apply_transfer_leader(admin_req, entry.term)? + } AdminCmdType::ChangePeer => { self.apply_conf_change(entry.get_index(), admin_req, conf_change.unwrap())? 
} @@ -465,7 +469,10 @@ impl Apply { } }; - self.push_admin_result(admin_result); + match admin_result { + AdminCmdResult::None => (), + _ => self.push_admin_result(admin_result), + } let mut resp = new_response(req.get_header()); resp.set_admin_response(admin_resp); Ok(resp) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index c252ad7d231..baf66dfa6fc 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -29,7 +29,7 @@ use kvproto::{ raft_serverpb::{PeerState, RaftMessage, RaftSnapshotData}, }; use protobuf::Message as _; -use raft::{eraftpb, Ready, StateRole, INVALID_ID}; +use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; @@ -114,16 +114,20 @@ impl Peer { unimplemented!(); // return; } + // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. 
let from_peer = msg.take_from_peer(); if self.is_leader() && from_peer.get_id() != INVALID_ID { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } - self.insert_peer_cache(from_peer); - if let Err(e) = self.raft_group_mut().step(msg.take_message()) { + self.insert_peer_cache(msg.take_from_peer()); + if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) + } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); } + self.set_has_ready(); } @@ -407,8 +411,8 @@ impl Peer { /// The apply snapshot process order would be: /// - Get the snapshot from the ready /// - Wait for async writer to load this tablet - /// In this step, the snapshot has loaded finish, but some apply state - /// need to update. + /// In this step, the snapshot loading has been finished, but some apply + /// state need to update. if has_snapshot { self.on_applied_snapshot(ctx); } @@ -462,9 +466,13 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); + // Init the in-memory pessimistic lock table when the peer becomes leader. + self.activate_in_memory_pessimistic_locks(); + // A more recent read may happen on the old leader. So max ts should // be updated after a peer becomes leader. self.require_updating_max_ts(ctx); + // Exit entry cache warmup state when the peer becomes leader. 
self.entry_storage_mut().clear_entry_cache_warmup_state(); @@ -473,6 +481,7 @@ impl Peer { StateRole::Follower => { self.leader_lease_mut().expire(); self.storage_mut().cancel_generating_snap(None); + self.clear_in_memory_pessimistic_locks(); } _ => {} } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 9101a9328f3..02bbb03c35e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -17,8 +17,8 @@ use raftstore::{ store::{ fsm::Proposal, util::{Lease, RegionReadProgress}, - Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, - TxnExt, + Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, + ReadProgress, TrackVer, TxnExt, }, Error, }; @@ -37,7 +37,7 @@ use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyScheduler}, operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, - router::{CmdResChannel, QueryResChannel}, + router::{CmdResChannel, PeerTick, QueryResChannel}, worker::PdTask, Result, }; @@ -85,6 +85,8 @@ pub struct Peer { txn_ext: Arc, txn_extra_op: Arc>, + pending_ticks: Vec, + /// Check whether this proposal can be proposed based on its epoch. proposal_control: ProposalControl, @@ -149,6 +151,7 @@ impl Peer { txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), + pending_ticks: Vec::new(), split_trace: vec![], }; @@ -521,6 +524,46 @@ impl Peer { self.apply_scheduler = Some(apply_scheduler); } + /// Whether the snapshot is handling. + /// See the comments of `check_snap_status` for more details. + #[inline] + pub fn is_handling_snapshot(&self) -> bool { + // todo: This method may be unnecessary now? + false + } + + /// Returns `true` if the raft group has replicated a snapshot but not + /// committed it yet. 
+ #[inline] + pub fn has_pending_snapshot(&self) -> bool { + self.raft_group().snap().is_some() + } + + #[inline] + pub fn add_pending_tick(&mut self, tick: PeerTick) { + self.pending_ticks.push(tick); + } + + #[inline] + pub fn take_pending_ticks(&mut self) -> Vec { + mem::take(&mut self.pending_ticks) + } + + pub fn activate_in_memory_pessimistic_locks(&mut self) { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + + pub fn clear_in_memory_pessimistic_locks(&mut self) { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + #[inline] pub fn post_split(&mut self) { self.reset_region_buckets(); diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 52c8ba5e1f8..c3061be0d2b 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -15,3 +15,4 @@ mod test_pd_heartbeat; mod test_read; mod test_split; mod test_status; +mod test_transfer_leader; diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs new file mode 100644 index 00000000000..7096f06b1d2 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -0,0 +1,154 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::Peekable; +use futures::executor::block_on; +use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, CmdType, Request, TransferLeaderRequest}, +}; +use raft::prelude::ConfChangeType; +use raftstore_v2::router::PeerMsg; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +fn put_data( + region_id: u64, + cluster: &Cluster, + node_off: usize, + node_off_for_verify: usize, + key: &[u8], +) { + let router = &cluster.routers[node_off]; + let mut req = router.new_request_for(region_id); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(key[1..].to_vec()); + put_req.mut_put().set_value(b"value".to_vec()); + req.mut_requests().push(put_req); + + router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + + // router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let tablet_registry = cluster.node(node_off).tablet_registry(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + .latest() + .unwrap() + .clone(); + assert!(tablet.get_value(key).unwrap().is_none()); + + let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + router.send(region_id, msg).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_proposed())); + + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + // triage send snapshot + std::thread::sleep(std::time::Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_committed())); + + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); + + // Verify the data is ready in the other node + let tablet_registry = cluster.node(node_off_for_verify).tablet_registry(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + 
.latest() + .unwrap() + .clone(); + assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); +} + +pub fn must_transfer_leader( + cluster: &Cluster, + region_id: u64, + from_off: usize, + to_off: usize, + to_peer: metapb::Peer, +) { + let router = &cluster.routers[from_off]; + let router2 = &cluster.routers[to_off]; + let mut req = router.new_request_for(region_id); + let mut transfer_req = TransferLeaderRequest::default(); + transfer_req.set_peer(to_peer.clone()); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::TransferLeader); + admin_req.set_transfer_leader(transfer_req); + let resp = router.command(region_id, req).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.dispatch(region_id, vec![]); + + let meta = router + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); + let meta = router2 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); +} + +#[test] +fn test_transfer_leader() { + let cluster = Cluster::with_node_count(3, None); + let region_id = 2; + let router0 = &cluster.routers[0]; + + let mut req = router0.new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddNode); + let store_id = cluster.node(1).id(); + let peer1 = new_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(peer1.clone()); + let req_clone = req.clone(); + let resp = router0.command(region_id, req_clone).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = router0 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); 
+ assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, peer1.clone()]); + let peer0_id = meta.raft_status.id; + + // So heartbeat will create a learner. + cluster.dispatch(region_id, vec![]); + let router1 = &cluster.routers[1]; + let meta = router1 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(peer0_id, meta.raft_status.soft_state.leader_id); + assert_eq!(meta.raft_status.id, peer1.id, "{:?}", meta); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + cluster.dispatch(region_id, vec![]); + + // Ensure follower has latest entries before transfer leader. + put_data(region_id, &cluster, 0, 1, b"zkey1"); + + // Perform transfer leader + must_transfer_leader(&cluster, region_id, 0, 1, peer1); + + // Before transfer back to peer0, put some data again. 
+ put_data(region_id, &cluster, 1, 0, b"zkey2"); + + // Perform transfer leader + let store_id = cluster.node(0).id(); + must_transfer_leader(&cluster, region_id, 1, 0, new_peer(store_id, peer0_id)); +} diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index a9b954552d3..2f700eec9bf 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -16,7 +16,7 @@ pub use self::{ ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, Registration, TaskRes as ApplyTaskRes, }, - peer::{DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, + peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ create_raft_batch_system, RaftBatchSystem, RaftPollerBuilder, RaftRouter, StoreInfo, StoreMeta, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 5d7455b2d1c..65417732adf 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -54,9 +54,10 @@ pub use self::{ StoreMsg, StoreTick, WriteCallback, WriteResponse, }, peer::{ - can_amend_read, get_sync_log_from_request, propose_read_index, should_renew_lease, Peer, - PeerStat, ProposalContext, ProposalQueue, RequestInspector, RequestPolicy, - SnapshotRecoveryWaitApplySyncer, + can_amend_read, get_sync_log_from_request, make_transfer_leader_response, + propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, + RequestInspector, RequestPolicy, SnapshotRecoveryWaitApplySyncer, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 100544bd0f4..22b822c2115 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4513,7 +4513,7 @@ where 
self.raft_group.raft.msgs.push(msg); } - /// Return true to if the transfer leader request is accepted. + /// Return true if the transfer leader request is accepted. /// /// When transferring leadership begins, leader sends a pre-transfer /// to target follower first to ensures it's ready to become leader. @@ -5655,7 +5655,7 @@ fn is_request_urgent(req: &RaftCmdRequest) -> bool { ) } -fn make_transfer_leader_response() -> RaftCmdResponse { +pub fn make_transfer_leader_response() -> RaftCmdResponse { let mut response = AdminResponse::default(); response.set_cmd_type(AdminCmdType::TransferLeader); response.set_transfer_leader(TransferLeaderResponse::default()); From fa6122e43b68cd0787f43428f321814f5e25fad4 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 13 Dec 2022 15:24:51 +0800 Subject: [PATCH 402/676] *: add raftstore v2 only tablet optimization (#13924) ref tikv/tikv#12842 - No WAL should be written - raftcf is dropped - No concurrent write - No multi batch write - Use smaller bloomfilter ratio to reduce memory footprint Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 7 +- cmd/tikv-ctl/src/executor.rs | 27 ++--- cmd/tikv-ctl/src/main.rs | 15 ++- components/engine_rocks/src/options.rs | 1 + components/engine_rocks/src/raw.rs | 2 +- components/engine_rocks/src/util.rs | 4 +- components/engine_test/src/lib.rs | 6 +- components/engine_traits/Cargo.toml | 1 + components/engine_traits/src/tablet.rs | 106 +++++++++++++---- .../src/operation/command/admin/split.rs | 16 +-- .../raftstore-v2/src/operation/query/local.rs | 15 ++- .../src/operation/ready/snapshot.rs | 7 +- components/raftstore-v2/src/raft/peer.rs | 16 ++- components/raftstore-v2/src/raft/storage.rs | 10 +- .../tests/integrations/cluster.rs | 25 ++-- .../tests/integrations/test_basic_write.rs | 6 +- .../tests/integrations/test_conf_change.rs | 6 +- components/server/src/server.rs | 15 ++- components/snap_recovery/src/init_cluster.rs | 20 ++-- src/config/mod.rs | 110 +++++++++++++----- 
src/server/engine_factory.rs | 89 +++++++++----- src/storage/config.rs | 10 ++ src/storage/kv/test_engine_builder.rs | 18 ++- src/storage/mod.rs | 20 +++- .../flow_controller/tablet_flow_controller.rs | 14 ++- tests/integrations/config/mod.rs | 17 ++- tests/integrations/config/test-custom.toml | 12 ++ tests/integrations/storage/test_titan.rs | 8 +- 28 files changed, 413 insertions(+), 190 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9b3eccfda51..494846ccb0d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1630,6 +1630,7 @@ dependencies = [ "error_code", "fail", "file_system", + "keys", "kvproto", "log_wrappers", "protobuf", @@ -2875,7 +2876,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2894,7 +2895,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" dependencies = [ "bzip2-sys", "cc", @@ -4757,7 +4758,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" dependencies = [ "libc 0.2.132", "librocksdb_sys", diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 1c42d728ca9..42b08c629e7 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -1,8 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - borrow::ToOwned, cmp::Ordering, path::PathBuf, pin::Pin, str, string::ToString, sync::Arc, - time::Duration, u64, + borrow::ToOwned, cmp::Ordering, pin::Pin, str, string::ToString, sync::Arc, time::Duration, u64, }; use encryption_export::data_key_manager_from_config; @@ -28,7 +27,10 @@ use security::SecurityManager; use serde_json::json; use tikv::{ config::{ConfigController, TikvConfig}, - server::debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + server::{ + debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + KvEngineFactoryBuilder, + }, }; use tikv_util::escape; @@ -45,7 +47,6 @@ type MvccInfoStream = Pin, MvccInfo), Stri pub fn new_debug_executor( cfg: &TikvConfig, data_dir: Option<&str>, - skip_paranoid_checks: bool, host: Option<&str>, mgr: Arc, ) -> Box { @@ -55,7 +56,6 @@ pub fn new_debug_executor( // TODO: perhaps we should allow user skip specifying data path. let data_dir = data_dir.unwrap(); - let kv_path = cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() @@ -66,15 +66,10 @@ pub fn new_debug_executor( .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); - let mut kv_db_opts = cfg.rocksdb.build_opt(); - kv_db_opts.set_env(env.clone()); - kv_db_opts.set_paranoid_checks(!skip_paranoid_checks); - let kv_cfs_opts = cfg - .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); - let kv_path = PathBuf::from(kv_path).canonicalize().unwrap(); - let kv_path = kv_path.to_str().unwrap(); - let kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { + let factory = KvEngineFactoryBuilder::new(env.clone(), cfg, cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(data_dir) { Ok(db) => db, Err(e) => handle_engine_error(e), }; @@ -83,7 +78,7 @@ pub fn new_debug_executor( if !cfg.raft_engine.enable { let mut raft_db_opts = cfg.raftdb.build_opt(); 
raft_db_opts.set_env(env); - let raft_db_cf_opts = cfg.raftdb.build_cf_opts(&cache); + let raft_db_cf_opts = cfg.raftdb.build_cf_opts(factory.block_cache()); let raft_path = cfg.infer_raft_db_path(Some(data_dir)).unwrap(); if !db_exist(&raft_path) { error!("raft db not exists: {}", raft_path); @@ -380,7 +375,7 @@ pub trait DebugExecutor { to_config: &TikvConfig, mgr: Arc, ) { - let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, false, to_host, mgr); + let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, to_host, mgr); let r1 = self.get_region_info(region); let r2 = rhs_debug_executor.get_region_info(region); diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 72078d07f62..77888f36fa7 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -59,7 +59,7 @@ fn main() { // Initialize configuration and security manager. let cfg_path = opt.config.as_ref(); - let cfg = cfg_path.map_or_else( + let mut cfg = cfg_path.map_or_else( || { let mut cfg = TikvConfig::default(); cfg.log.level = tikv_util::logger::get_level_by_string("warn") @@ -249,9 +249,8 @@ fn main() { .exit(); } - let skip_paranoid_checks = opt.skip_paranoid_checks; - let debug_executor = - new_debug_executor(&cfg, data_dir, skip_paranoid_checks, host, Arc::clone(&mgr)); + cfg.rocksdb.paranoid_checks = Some(!opt.skip_paranoid_checks); + let debug_executor = new_debug_executor(&cfg, data_dir, host, Arc::clone(&mgr)); match cmd { Cmd::Print { cf, key } => { @@ -643,7 +642,7 @@ fn compact_whole_cluster( .name(format!("compact-{}", addr)) .spawn_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); - let debug_executor = new_debug_executor(&cfg, None, false, Some(&addr), mgr); + let debug_executor = new_debug_executor(&cfg, None, Some(&addr), mgr); for cf in cfs { debug_executor.compact( Some(&addr), @@ -687,14 +686,14 @@ fn run_ldb_command(args: Vec, cfg: &TikvConfig) { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* 
io_rate_limiter */).unwrap(); - let mut opts = cfg.rocksdb.build_opt(); + let mut opts = cfg.rocksdb.build_opt(None); opts.set_env(env); engine_rocks::raw::run_ldb_tool(&args, &opts); } fn run_sst_dump_command(args: Vec, cfg: &TikvConfig) { - let opts = cfg.rocksdb.build_opt(); + let opts = cfg.rocksdb.build_opt(None); engine_rocks::raw::run_sst_dump_tool(&args, &opts); } @@ -714,7 +713,7 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, let stderr = BufferRedirect::stderr().unwrap(); let stdout = BufferRedirect::stdout().unwrap(); - let opts = cfg.rocksdb.build_opt(); + let opts = cfg.rocksdb.build_opt(None); match run_and_wait_child_process(|| engine_rocks::raw::run_sst_dump_tool(&args, &opts)) { Ok(code) => { diff --git a/components/engine_rocks/src/options.rs b/components/engine_rocks/src/options.rs index c50c7734f79..7579c92ba79 100644 --- a/components/engine_rocks/src/options.rs +++ b/components/engine_rocks/src/options.rs @@ -40,6 +40,7 @@ impl From for RocksWriteOptions { let mut r = RawWriteOptions::default(); r.set_sync(opts.sync()); r.set_no_slowdown(opts.no_slowdown()); + r.disable_wal(opts.disable_wal()); // TODO: enable it. 
r.set_memtable_insert_hint_per_batch(false); RocksWriteOptions(r) diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index 1a8718588b2..4c2dd71b2a2 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -14,6 +14,6 @@ pub use rocksdb::{ DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, - PrepopulateBlockCache, Range, SliceTransform, TablePropertiesCollector, + PrepopulateBlockCache, Range, SliceTransform, Statistics, TablePropertiesCollector, TablePropertiesCollectorFactory, }; diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 778e16c1a67..786dfec04d1 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -11,7 +11,7 @@ use slog_global::warn; use crate::{ cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, - rocks_metrics_defs::*, + raw::Statistics, rocks_metrics_defs::*, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { @@ -28,7 +28,7 @@ pub fn new_default_engine(path: &str) -> Result { pub fn new_engine(path: &str, cfs: &[&str]) -> Result { let mut db_opts = RocksDbOptions::default(); - db_opts.enable_statistics(true); + db_opts.set_statistics(&Statistics::new_titan()); let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); new_engine_opt(path, db_opts, cf_opts) } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 605feedc7bd..16849acd5b8 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -88,7 +88,7 @@ pub mod kv { RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; - use 
engine_traits::{MiscExt, Result, TabletFactory}; + use engine_traits::{MiscExt, Result, TabletContext, TabletFactory}; use crate::ctor::{CfOptions as KvTestCfOptions, DbOptions, KvEngineConstructorExt}; @@ -119,7 +119,7 @@ pub mod kv { } impl TabletFactory for TestTabletFactory { - fn open_tablet(&self, _id: u64, _suffix: Option, path: &Path) -> Result { + fn open_tablet(&self, _ctx: TabletContext, path: &Path) -> Result { KvTestEngine::new_kv_engine_opt( path.to_str().unwrap(), self.db_opt.clone(), @@ -127,7 +127,7 @@ pub mod kv { ) } - fn destroy_tablet(&self, _id: u64, _suffix: Option, path: &Path) -> Result<()> { + fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { let tombstone_path = path.join(TOMBSTONE_SUFFIX); std::fs::remove_dir_all(&tombstone_path)?; std::fs::rename(path, &tombstone_path)?; diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index fcfcbdb2799..2370f1c9e7e 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -13,6 +13,7 @@ collections = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } +keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } protobuf = "2" diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 988cd343fe3..acecb976f58 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + fmt::{self, Debug, Formatter}, path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, @@ -9,6 +10,7 @@ use std::{ }; use collections::HashMap; +use kvproto::metapb::Region; use tikv_util::box_err; use crate::{Error, Result}; @@ -69,20 +71,67 @@ impl CachedTablet { } } +/// Context to be passed to `TabletFactory`. +#[derive(Clone)] +pub struct TabletContext { + /// ID of the tablet. 
It is usually the region ID. + pub id: u64, + /// Suffix the tablet. It is usually the index that the tablet starts accept + /// incremental modification. The reason to have suffix is that we can keep + /// more than one tablet for a region. + pub suffix: Option, + /// The expected start key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is smaller than this key can be considered obsolete. + pub start_key: Box<[u8]>, + /// The expected end key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is larger than or equal to this key can be considered + /// obsolete. + pub end_key: Box<[u8]>, +} + +impl Debug for TabletContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("TabletContext") + .field("id", &self.id) + .field("suffix", &self.suffix) + .field("start_key", &log_wrappers::Value::key(&self.start_key)) + .field("end_key", &log_wrappers::Value::key(&self.end_key)) + .finish() + } +} + +impl TabletContext { + pub fn new(region: &Region, suffix: Option) -> Self { + TabletContext { + id: region.get_id(), + suffix, + start_key: keys::data_key(region.get_start_key()).into_boxed_slice(), + end_key: keys::data_end_key(region.get_end_key()).into_boxed_slice(), + } + } + + /// Create a context that assumes there is only one region and it covers the + /// whole key space. Normally you should only use this in tests. + pub fn with_infinite_region(id: u64, suffix: Option) -> Self { + let mut region = Region::default(); + region.set_id(id); + Self::new(®ion, suffix) + } +} + /// A factory trait to create new tablet for multi-rocksdb architecture. // It should be named as `EngineFactory` for consistency, but we are about to // rename engine to tablet, so always use tablet for new traits/types. pub trait TabletFactory: Send + Sync { /// Open the tablet in `path`. 
- /// - /// `id` and `suffix` is used to mark the identity of tablet. The id is - /// likely the region Id, the suffix could be the current raft log - /// index. The reason to have suffix is that we can keep more than one - /// tablet for a region. - fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result; + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result; /// Destroy the tablet and its data - fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()>; + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()>; /// Check if the tablet with specified path exists fn exists(&self, path: &Path) -> bool; @@ -105,12 +154,12 @@ impl TabletFactory for SingletonFactory { /// likely the region Id, the suffix could be the current raft log /// index. The reason to have suffix is that we can keep more than one /// tablet for a region. - fn open_tablet(&self, _id: u64, _suffix: Option, _path: &Path) -> Result { + fn open_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result { Ok(self.tablet.clone()) } /// Destroy the tablet and its data - fn destroy_tablet(&self, _id: u64, _suffix: Option, _path: &Path) -> Result<()> { + fn destroy_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result<()> { Ok(()) } @@ -205,19 +254,21 @@ impl TabletRegistry { /// Load the tablet and set it as the latest. /// /// If the tablet doesn't exist, it will create an empty one. 
- pub fn load(&self, id: u64, suffix: u64, create: bool) -> Result> + pub fn load(&self, ctx: TabletContext, create: bool) -> Result> where EK: Clone, { - let path = self.tablet_path(id, suffix); + assert!(ctx.suffix.is_some()); + let id = ctx.id; + let path = self.tablet_path(id, ctx.suffix.unwrap()); if !create && !self.tablets.factory.exists(&path) { return Err(Error::Other(box_err!( "tablet ({}, {:?}) doesn't exist", id, - suffix + ctx.suffix ))); } - let tablet = self.tablets.factory.open_tablet(id, Some(suffix), &path)?; + let tablet = self.tablets.factory.open_tablet(ctx, &path)?; let mut cached = self.get_or_default(id); cached.set(tablet); Ok(cached) @@ -288,11 +339,13 @@ mod tests { let tablet = Arc::new(1); let singleton = SingletonFactory::new(tablet.clone()); let registry = TabletRegistry::new(Box::new(singleton), "").unwrap(); - registry.load(1, 1, true).unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(1)); + registry.load(ctx.clone(), true).unwrap(); let mut cached = registry.get(1).unwrap(); assert_eq!(cached.latest().cloned(), Some(tablet.clone())); - registry.load(2, 1, true).unwrap(); + ctx.id = 2; + registry.load(ctx.clone(), true).unwrap(); let mut count = 0; registry.for_each_opened_tablet(|id, cached| { assert!(&[1, 2].contains(&id), "{}", id); @@ -305,11 +358,12 @@ mod tests { // Destroy should be ignored. registry .tablet_factory() - .destroy_tablet(2, Some(1), ®istry.tablet_path(2, 1)) + .destroy_tablet(ctx.clone(), ®istry.tablet_path(2, 1)) .unwrap(); // Exist check should always succeed. 
- registry.load(3, 1, false).unwrap(); + ctx.id = 3; + registry.load(ctx, false).unwrap(); let mut cached = registry.get(3).unwrap(); assert_eq!(cached.latest().cloned(), Some(tablet)); } @@ -321,12 +375,12 @@ mod tests { } impl TabletFactory for MemoryTablet { - fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { let mut tablet = self.tablet.lock().unwrap(); if tablet.contains_key(path) { return Err(Error::Other(box_err!("tablet is opened"))); } - tablet.insert(path.to_owned(), Arc::new((id, suffix.unwrap_or(0)))); + tablet.insert(path.to_owned(), Arc::new((ctx.id, ctx.suffix.unwrap_or(0)))); Ok(tablet[path].clone()) } @@ -335,9 +389,9 @@ mod tests { tablet.contains_key(path) } - fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()> { + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { let prev = self.tablet.lock().unwrap().remove(path).unwrap(); - assert_eq!((id, suffix.unwrap_or(0)), *prev); + assert_eq!((ctx.id, ctx.suffix.unwrap_or(0)), *prev); Ok(()) } } @@ -349,9 +403,10 @@ mod tests { }; let registry = TabletRegistry::new(Box::new(factory), "").unwrap(); - let mut tablet_1_10 = registry.load(1, 10, true).unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + let mut tablet_1_10 = registry.load(ctx.clone(), true).unwrap(); // It's open already, load it twice should report lock error. - registry.load(1, 10, true).unwrap_err(); + registry.load(ctx.clone(), true).unwrap_err(); let mut cached = registry.get(1).unwrap(); assert_eq!(cached.latest(), tablet_1_10.latest()); @@ -361,14 +416,15 @@ mod tests { let tablet_path = registry.tablet_path(1, 11); assert!(!registry.tablet_factory().exists(&tablet_path)); // Not exist tablet should report error. 
- registry.load(1, 11, false).unwrap_err(); + ctx.suffix = Some(11); + registry.load(ctx.clone(), false).unwrap_err(); assert!(registry.get(2).is_none()); // Though path not exist, but we should be able to create an empty one. assert_eq!(registry.get_or_default(2).latest(), None); assert!(!registry.tablet_factory().exists(&tablet_path)); // Load new suffix should update cache. - registry.load(1, 11, true).unwrap(); + registry.load(ctx, true).unwrap(); assert_ne!(cached.latest(), tablet_1_10.cache()); let tablet_path = registry.tablet_path(1, 11); assert!(registry.tablet_factory().exists(&tablet_path)); diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 7de49a716c3..157150126b4 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -30,7 +30,8 @@ use std::{cmp, collections::VecDeque}; use collections::HashSet; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{ - Checkpointer, DeleteStrategy, KvEngine, RaftEngine, RaftLogBatch, Range, CF_DEFAULT, + Checkpointer, DeleteStrategy, KvEngine, RaftEngine, RaftLogBatch, Range, TabletContext, + CF_DEFAULT, }; use fail::fail_point; use keys::enc_end_key; @@ -260,10 +261,8 @@ impl Apply { }); let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); - let tablet = reg - .tablet_factory() - .open_tablet(region_id, Some(log_index), &path) - .unwrap(); + let ctx = TabletContext::new(®ions[derived_index], Some(log_index)); + let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); // Remove the old write batch. 
self.write_batch.take(); self.publish_tablet(tablet); @@ -496,7 +495,7 @@ mod test { kv::TestTabletFactory, raft, }; - use engine_traits::{CfOptionsExt, Peekable, TabletRegistry, WriteBatch, ALL_CFS}; + use engine_traits::{CfOptionsExt, Peekable, TabletRegistry, WriteBatch, DATA_CFS}; use futures::channel::mpsc::unbounded; use kvproto::{ metapb::RegionEpoch, @@ -631,14 +630,15 @@ mod test { let logger = slog_global::borrow_global().new(o!()); let path = TempDir::new().unwrap(); - let cf_opts = ALL_CFS + let cf_opts = DATA_CFS .iter() .copied() .map(|cf| (cf, CfOptions::default())) .collect(); let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); let reg = TabletRegistry::new(factory, path.path()).unwrap(); - reg.load(region.id, 5, true).unwrap(); + let ctx = TabletContext::new(®ion, Some(5)); + reg.load(ctx, true).unwrap(); let mut region_state = RegionLocalState::default(); region_state.set_state(PeerState::Normal); diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 120e64cb872..19f9a7e91b9 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -438,7 +438,7 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, Peekable, SyncMutable, ALL_CFS}; + use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use raftstore::store::{ @@ -546,7 +546,7 @@ mod tests { // Building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let path = Builder::new() .prefix("test-local-reader") .tempdir() @@ -631,7 +631,8 @@ mod tests { }; meta.readers.insert(1, read_delegate); // 
create tablet with region_id 1 and prepare some data - reg.load(1, 10, true).unwrap(); + let ctx = TabletContext::new(®ion1, Some(10)); + reg.load(ctx, true).unwrap(); } let (ch_tx, ch_rx) = sync_channel(1); @@ -737,7 +738,7 @@ mod tests { fn test_read_delegate() { // Building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let path = Builder::new() .prefix("test-local-reader") .tempdir() @@ -758,7 +759,8 @@ mod tests { meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - reg.load(1, 10, true).unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + reg.load(ctx, true).unwrap(); tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); @@ -767,7 +769,8 @@ mod tests { meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data - reg.load(2, 10, true).unwrap(); + ctx = TabletContext::with_infinite_region(2, Some(10)); + reg.load(ctx, true).unwrap(); tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 8ac27ba2466..5bf9fc27269 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -28,7 +28,7 @@ use std::{ }, }; -use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; use protobuf::Message; use raft::eraftpb::Snapshot; @@ -125,9 +125,8 @@ impl Peer { let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { let region_id = 
self.region_id(); - ctx.tablet_registry - .load(region_id, persisted_index, false) - .unwrap(); + let tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + ctx.tablet_registry.load(tablet_ctx, false).unwrap(); self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 02bbb03c35e..6111e75e691 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -8,7 +8,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::atomic::AtomicCell; -use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletContext, TabletRegistry}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; @@ -111,16 +111,20 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); + + let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; + let region = raft_group.store().region_state().get_region().clone(); + let cached_tablet = tablet_registry.get_or_default(region_id); - // Another option is always create tablet even if tablet index is 0. But this - // can introduce race when gc old tablet and create new peer. + // We can't create tablet if tablet index is 0. It can introduce race when gc + // old tablet and create new peer. We also can't get the correct range of the + // region, which is required for kv data gc. if tablet_index != 0 { + let ctx = TabletContext::new(®ion, Some(tablet_index)); // TODO: Perhaps we should stop create the tablet automatically. 
- tablet_registry.load(region_id, tablet_index, false)?; + tablet_registry.load(ctx, false)?; } - let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; - let region = raft_group.store().region_state().get_region().clone(); let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 5211d293e0f..a27e79549e1 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -376,7 +376,8 @@ mod tests { raft::RaftTestEngine, }; use engine_traits::{ - KvEngine, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletRegistry, ALL_CFS, + KvEngine, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletContext, TabletRegistry, + DATA_CFS, }; use kvproto::{ metapb::{Peer, Region}, @@ -476,7 +477,7 @@ mod tests { raft_engine.consume(&mut wb, true).unwrap(); // building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); @@ -523,10 +524,11 @@ mod tests { mgr.init().unwrap(); // building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); - reg.load(region.get_id(), 10, true).unwrap(); + let tablet_ctx = TabletContext::new(®ion, Some(10)); + reg.load(tablet_ctx, true).unwrap(); // setup read runner worker and peer storage let mut 
worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 11f8094612b..b09f351b066 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -20,7 +20,7 @@ use engine_test::{ kv::{KvTestEngine, KvTestSnapshot, TestTabletFactory}, raft::RaftTestEngine, }; -use engine_traits::{TabletRegistry, ALL_CFS}; +use engine_traits::{TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{ metapb::{self, RegionEpoch, Store}, @@ -47,6 +47,18 @@ use tikv_util::{ }; use txn_types::WriteBatchFlags; +pub fn check_skip_wal(path: &str) { + let mut found = false; + for f in std::fs::read_dir(path).unwrap() { + let e = f.unwrap(); + if e.path().extension().map_or(false, |ext| ext == "log") { + found = true; + assert_eq!(e.metadata().unwrap().len(), 0, "{}", e.path().display()); + } + } + assert!(found, "no WAL found in {}", path); +} + pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -209,7 +221,7 @@ impl RunningState { causal_ts_provider: Option>, logger: &Logger, ) -> (TestRouter, TabletSnapManager, Self) { - let cf_opts = ALL_CFS + let cf_opts = DATA_CFS .iter() .copied() .map(|cf| (cf, CfOptions::default())) @@ -226,16 +238,13 @@ impl RunningState { if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { let factory = registry.tablet_factory(); let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); if factory.exists(&path) { registry.remove(region.get_id()); - factory - .destroy_tablet(region.get_id(), Some(RAFT_INIT_LOG_INDEX), &path) - .unwrap(); + factory.destroy_tablet(ctx.clone(), &path).unwrap(); } // Create the tablet without loading it in cache. 
- factory - .open_tablet(region.get_id(), Some(RAFT_INIT_LOG_INDEX), &path) - .unwrap(); + factory.open_tablet(ctx, &path).unwrap(); } let (router, mut system) = create_store_batch_system::( diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index 807d64de756..29f665758d6 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -12,7 +12,7 @@ use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; use raftstore_v2::router::PeerMsg; use tikv_util::store::new_peer; -use crate::cluster::Cluster; +use crate::cluster::{check_skip_wal, Cluster}; /// Test basic write flow. #[test] @@ -147,4 +147,8 @@ fn test_put_delete() { assert!(!resp.get_header().has_error(), "{:?}", resp); let snap = router.stale_snapshot(2); assert_matches!(snap.get_value(b"key"), Ok(None)); + + // Check if WAL is skipped for basic writes. + let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 1b9ca50daf7..db62ae4a75a 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -8,7 +8,7 @@ use raft::prelude::ConfChangeType; use raftstore_v2::router::{PeerMsg, PeerTick}; use tikv_util::store::new_learner_peer; -use crate::cluster::Cluster; +use crate::cluster::{check_skip_wal, Cluster}; #[test] fn test_simple_change() { @@ -97,4 +97,8 @@ fn test_simple_change() { assert_eq!(meta.region_state.peers, vec![leader_peer]); // TODO: check if the peer is removed once life trace is implemented or // snapshot is implemented. + + // Check if WAL is skipped for admin command. 
+ let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index a50e3a39667..065afd8ec0c 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -45,7 +45,7 @@ use engine_rocks::{ use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - RaftEngine, SingletonFactory, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, + RaftEngine, SingletonFactory, TabletContext, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -1821,7 +1821,8 @@ impl TikvServer { let reg = TabletRegistry::new(Box::new(SingletonFactory::new(kv_engine)), &self.store_path) .unwrap(); // It always use the singleton kv_engine, use arbitrary id and suffix. - reg.load(0, 0, false).unwrap(); + let ctx = TabletContext::with_infinite_region(0, Some(0)); + reg.load(ctx, false).unwrap(); self.tablet_registry = Some(reg.clone()); engines.raft.register_config(cfg_controller); @@ -2087,7 +2088,9 @@ mod test { }; use engine_rocks::raw::Env; - use engine_traits::{FlowControlFactorsExt, MiscExt, SyncMutable, TabletRegistry, CF_DEFAULT}; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, + }; use tempfile::Builder; use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; use tikv_util::{config::ReadableSize, time::Instant}; @@ -2109,7 +2112,8 @@ mod test { let reg = TabletRegistry::new(Box::new(factory), path.path()).unwrap(); for i in 1..6 { - reg.load(i, 10, true).unwrap(); + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); } let mut cached = reg.get(1).unwrap(); @@ -2127,7 +2131,8 @@ mod test { .unwrap() .unwrap(); - reg.load(1, 20, 
true).unwrap(); + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); tablet = cached.latest().unwrap(); for i in 1..11 { diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index 08a45073309..9147810f03c 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -10,7 +10,10 @@ use pd_client::{Error as PdError, PdClient}; use raft_log_engine::RaftLogEngine; use raftstore::store::initial_region; use thiserror::Error; -use tikv::{config::TikvConfig, server::config::Config as ServerConfig}; +use tikv::{ + config::TikvConfig, + server::{config::Config as ServerConfig, KvEngineFactoryBuilder}, +}; use tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack}; const CLUSTER_BOOTSTRAPPED_MAX_RETRY: u64 = 60; @@ -308,15 +311,10 @@ pub fn create_local_engine_service( let block_cache = config.storage.block_cache.build_shared_cache(); // init rocksdb / kv db - let mut db_opts = config.rocksdb.build_opt(); - db_opts.set_env(env.clone()); - let cf_opts = config - .rocksdb - .build_cf_opts(&block_cache, None, config.storage.api_version()); - let db_path = config - .infer_kv_engine_path(None) - .map_err(|e| format!("infer kvdb path: {}", e))?; - let kv_db = match new_engine_opt(&db_path, db_opts, cf_opts) { + let factory = KvEngineFactoryBuilder::new(env.clone(), config, block_cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(&config.storage.data_dir) { Ok(db) => db, Err(e) => handle_engine_error(e), }; @@ -326,7 +324,7 @@ pub fn create_local_engine_service( // rocksdb let mut raft_db_opts = config.raftdb.build_opt(); raft_db_opts.set_env(env); - let raft_db_cf_opts = config.raftdb.build_cf_opts(&block_cache); + let raft_db_cf_opts = config.raftdb.build_cf_opts(factory.block_cache()); let raft_path = config .infer_raft_db_path(None) .map_err(|e| format!("infer raftdb path: {}", e))?; diff --git 
a/src/config/mod.rs b/src/config/mod.rs index 2b0818e93d3..0945eb7ca21 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -31,6 +31,7 @@ use engine_rocks::{ raw::{ BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, PrepopulateBlockCache, + Statistics, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, @@ -80,7 +81,7 @@ use crate::{ ttl::TtlCompactionFilterFactory, Config as ServerConfig, CONFIG_ROCKSDB_GAUGE, }, - storage::config::{Config as StorageConfig, DEFAULT_DATA_DIR}, + storage::config::{Config as StorageConfig, EngineType, DEFAULT_DATA_DIR}, }; pub const DEFAULT_ROCKSDB_SUB_DIR: &str = "db"; @@ -107,6 +108,15 @@ pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; const TMP_CONFIG_FILE: &str = "tmp_tikv.toml"; const MAX_BLOCK_SIZE: usize = 32 * MIB as usize; +fn bloom_filter_ratio(et: EngineType) -> f64 { + match et { + EngineType::RaftKv => 0.1, + // In v2, every peer has its own tablet. The data scale is about tens of + // GiBs. We only need a small portion for those key. 
+ EngineType::RaftKv2 => 0.005, + } +} + fn memory_limit_for_cf(is_raft_db: bool, cf: &str, total_mem: u64) -> ReadableSize { let (ratio, min, max) = match (is_raft_db, cf) { (true, CF_DEFAULT) => (0.02, RAFT_MIN_MEM, RAFT_MAX_MEM), @@ -663,8 +673,10 @@ impl DefaultCfConfig { cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + for_engine: EngineType, ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, region_info_accessor); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); let f = RangePropertiesCollectorFactory { prop_size_index_distance: self.prop_size_index_distance, prop_keys_index_distance: self.prop_keys_index_distance, @@ -778,6 +790,7 @@ impl WriteCfConfig { &self, cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, + for_engine: EngineType, ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!(self, CF_WRITE, cache, region_info_accessor); // Prefix extractor(trim the timestamp at tail) for write cf. @@ -788,7 +801,7 @@ impl WriteCfConfig { ) .unwrap(); // Create prefix bloom filter for memtable. - cf_opts.set_memtable_prefix_bloom_size_ratio(0.1); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); // Collects user defined properties. 
cf_opts.add_table_properties_collector_factory( "tikv.mvcc-properties-collector", @@ -872,7 +885,7 @@ impl Default for LockCfConfig { } impl LockCfConfig { - pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { + pub fn build_opt(&self, cache: &Cache, for_engine: EngineType) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!(self, CF_LOCK, cache, no_region_info_accessor); cf_opts @@ -883,7 +896,7 @@ impl LockCfConfig { prop_keys_index_distance: self.prop_keys_index_distance, }; cf_opts.add_table_properties_collector_factory("tikv.range-properties-collector", f); - cf_opts.set_memtable_prefix_bloom_size_ratio(0.1); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); cf_opts.set_titan_cf_options(&self.titan.build_opts()); cf_opts } @@ -1058,14 +1071,16 @@ pub struct DbConfig { pub use_direct_io_for_flush_and_compaction: bool, #[online_config(skip)] pub enable_pipelined_write: bool, - // deprecated. TiKV will use a new write mode when set `enable_pipelined_write` false and fall - // back to write mode in 3.0 when set `enable_pipelined_write` true. The code of - // multi-batch-write in RocksDB has been removed. #[online_config(skip)] - #[serde(skip_serializing)] - pub enable_multi_batch_write: bool, + pub enable_multi_batch_write: Option, #[online_config(skip)] pub enable_unordered_write: bool, + #[online_config(skip)] + pub allow_concurrent_memtable_write: Option, + // Dangerous option only for programming use. 
+ #[online_config(skip)] + #[serde(skip)] + pub paranoid_checks: Option, #[online_config(submodule)] pub defaultcf: DefaultCfConfig, #[online_config(submodule)] @@ -1115,8 +1130,10 @@ impl Default for DbConfig { writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, - enable_multi_batch_write: true, // deprecated + enable_multi_batch_write: None, // deprecated enable_unordered_write: false, + allow_concurrent_memtable_write: None, + paranoid_checks: None, defaultcf: DefaultCfConfig::default(), writecf: WriteCfConfig::default(), lockcf: LockCfConfig::default(), @@ -1127,7 +1144,19 @@ impl Default for DbConfig { } impl DbConfig { - pub fn build_opt(&self) -> RocksDbOptions { + pub fn optimize_for(&mut self, engine: EngineType) { + match engine { + EngineType::RaftKv => { + self.allow_concurrent_memtable_write.get_or_insert(true); + } + EngineType::RaftKv2 => { + self.enable_multi_batch_write.get_or_insert(false); + self.allow_concurrent_memtable_write.get_or_insert(false); + } + } + } + + pub fn build_opt(&self, stats: Option<&Statistics>) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1143,7 +1172,12 @@ impl DbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - opts.enable_statistics(self.enable_statistics); + if self.enable_statistics { + match stats { + Some(stats) => opts.set_statistics(stats), + None => opts.set_statistics(&Statistics::new_titan()), + } + } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); @@ -1175,9 +1209,19 @@ impl DbConfig { self.use_direct_io_for_flush_and_compaction, ); 
opts.enable_pipelined_write(self.enable_pipelined_write); - let enable_multi_batch_write = !self.enable_pipelined_write && !self.enable_unordered_write; + let mut enable_multi_batch_write = + !self.enable_pipelined_write && !self.enable_unordered_write; + if self.allow_concurrent_memtable_write == Some(false) + && self.enable_multi_batch_write == Some(false) + { + enable_multi_batch_write = false + } opts.enable_multi_batch_write(enable_multi_batch_write); opts.enable_unordered_write(self.enable_unordered_write); + opts.allow_concurrent_memtable_write(self.allow_concurrent_memtable_write.unwrap_or(true)); + if let Some(b) = self.paranoid_checks { + opts.set_paranoid_checks(b); + } opts.set_info_log(RocksdbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { @@ -1191,21 +1235,24 @@ impl DbConfig { cache: &Cache, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + for_engine: EngineType, ) -> Vec<(&'static str, RocksCfOptions)> { - vec![ - ( - CF_DEFAULT, - self.defaultcf - .build_opt(cache, region_info_accessor, api_version), - ), - (CF_LOCK, self.lockcf.build_opt(cache)), - ( - CF_WRITE, - self.writecf.build_opt(cache, region_info_accessor), - ), - // TODO: remove CF_RAFT. 
- (CF_RAFT, self.raftcf.build_opt(cache)), - ] + let mut cf_opts = Vec::with_capacity(4); + cf_opts.push(( + CF_DEFAULT, + self.defaultcf + .build_opt(cache, region_info_accessor, api_version, for_engine), + )); + cf_opts.push((CF_LOCK, self.lockcf.build_opt(cache, for_engine))); + cf_opts.push(( + CF_WRITE, + self.writecf + .build_opt(cache, region_info_accessor, for_engine), + )); + if for_engine == EngineType::RaftKv { + cf_opts.push((CF_RAFT, self.raftcf.build_opt(cache))); + } + cf_opts } fn validate(&mut self) -> Result<(), Box> { @@ -1452,7 +1499,9 @@ impl RaftDbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - opts.enable_statistics(self.enable_statistics); + if self.enable_statistics { + opts.set_statistics(&Statistics::new_titan()); + } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); @@ -3010,6 +3059,8 @@ impl TikvConfig { config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-temp")?; } + self.rocksdb.optimize_for(self.storage.engine); + self.rocksdb.validate()?; self.raftdb.validate()?; self.raft_engine.validate()?; @@ -4367,11 +4418,12 @@ mod tests { assert_eq!(F::TAG, cfg.storage.api_version()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some(cfg.rocksdb.build_opt()), + Some(cfg.rocksdb.build_opt(None)), cfg.rocksdb.build_cf_opts( &cfg.storage.block_cache.build_shared_cache(), None, cfg.storage.api_version(), + cfg.storage.engine, ), None, ) diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 01dc1e4a786..2680c778f02 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -3,16 +3,21 @@ use std::{path::Path, sync::Arc}; use engine_rocks::{ - raw::{Cache, Env}, + raw::{Cache, Env, Statistics}, 
CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, RocksDbOptions, RocksEngine, RocksEventListener, }; -use engine_traits::{CompactionJobInfo, MiscExt, Result, TabletFactory, CF_DEFAULT, CF_WRITE}; +use engine_traits::{ + CompactionJobInfo, MiscExt, Result, TabletContext, TabletFactory, CF_DEFAULT, CF_WRITE, +}; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; -use crate::config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}; +use crate::{ + config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}, + storage::config::EngineType, +}; struct FactoryInner { env: Arc, @@ -22,6 +27,8 @@ struct FactoryInner { api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, + statistics: Statistics, + lite: bool, } pub struct KvEngineFactoryBuilder { @@ -40,6 +47,8 @@ impl KvEngineFactoryBuilder { api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, + statistics: Statistics::new_titan(), + lite: false, }, compact_event_sender: None, } @@ -68,6 +77,14 @@ impl KvEngineFactoryBuilder { self } + /// Set whether enable lite mode. + /// + /// In lite mode, most listener/filters will not be installed. + pub fn lite(mut self, lite: bool) -> Self { + self.inner.lite = lite; + self + } + pub fn build(self) -> KvEngineFactory { KvEngineFactory { inner: Arc::new(self.inner), @@ -107,32 +124,43 @@ impl KvEngineFactory { fn db_opts(&self) -> RocksDbOptions { // Create kv engine. 
- let mut db_opts = self.inner.rocksdb_config.build_opt(); + let mut db_opts = self + .inner + .rocksdb_config + .build_opt(Some(&self.inner.statistics)); db_opts.set_env(self.inner.env.clone()); - db_opts.add_event_listener(RocksEventListener::new( - "kv", - self.inner.sst_recovery_sender.clone(), - )); - if let Some(filter) = self.create_raftstore_compaction_listener() { - db_opts.add_event_listener(filter); + if !self.inner.lite { + db_opts.add_event_listener(RocksEventListener::new( + "kv", + self.inner.sst_recovery_sender.clone(), + )); + if let Some(filter) = self.create_raftstore_compaction_listener() { + db_opts.add_event_listener(filter); + } } db_opts } - fn cf_opts(&self) -> Vec<(&str, RocksCfOptions)> { + fn cf_opts(&self, for_engine: EngineType) -> Vec<(&str, RocksCfOptions)> { self.inner.rocksdb_config.build_cf_opts( &self.inner.block_cache, self.inner.region_info_accessor.as_ref(), self.inner.api_version, + for_engine, ) } + pub fn block_cache(&self) -> &Cache { + &self.inner.block_cache + } + /// Create a shared db. /// /// It will always create in path/DEFAULT_DB_SUB_DIR. 
- pub fn create_shared_db(&self, path: &Path) -> Result { + pub fn create_shared_db(&self, path: impl AsRef) -> Result { + let path = path.as_ref(); let mut db_opts = self.db_opts(); - let cf_opts = self.cf_opts(); + let cf_opts = self.cf_opts(EngineType::RaftKv); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone()); } @@ -147,27 +175,27 @@ impl KvEngineFactory { } impl TabletFactory for KvEngineFactory { - fn open_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { let mut db_opts = self.db_opts(); - let cf_opts = self.cf_opts(); - if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { - db_opts.add_event_listener(listener.clone_with(id, suffix)); + let cf_opts = self.cf_opts(EngineType::RaftKv2); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); } let kv_engine = engine_rocks::util::new_engine_opt(path.to_str().unwrap(), db_opts, cf_opts); if let Err(e) = &kv_engine { - error!("failed to create tablet"; "id" => id, "suffix" => ?suffix, "path" => %path.display(), "err" => ?e); - } else if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { - listener.clone_with(id, suffix).on_created(); + error!("failed to create tablet"; "id" => ctx.id, "suffix" => ?ctx.suffix, "path" => %path.display(), "err" => ?e); + } else if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + listener.clone_with(ctx.id, suffix).on_created(); } kv_engine } - fn destroy_tablet(&self, id: u64, suffix: Option, path: &Path) -> Result<()> { - info!("destroy tablet"; "path" => %path.display(), "id" => id, "suffix" => ?suffix); + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { + info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => 
?ctx.suffix); // Create kv engine. let _db_opts = self.db_opts(); - let _cf_opts = self.cf_opts(); + let _cf_opts = self.cf_opts(EngineType::RaftKv2); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( // path.to_str().unwrap(), @@ -175,8 +203,8 @@ impl TabletFactory for KvEngineFactory { // kv_cfs_opts, // )?; let _ = std::fs::remove_dir_all(path); - if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = suffix { - listener.clone_with(id, suffix).on_destroyed(); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + listener.clone_with(ctx.id, suffix).on_destroyed(); } Ok(()) } @@ -214,15 +242,20 @@ mod tests { let reg = TabletRegistry::new(Box::new(factory), dir.path()).unwrap(); let path = reg.tablet_path(1, 3); assert!(!reg.tablet_factory().exists(&path)); - let engine = reg.tablet_factory().open_tablet(1, Some(3), &path).unwrap(); + let mut tablet_ctx = TabletContext::with_infinite_region(1, Some(3)); + let engine = reg + .tablet_factory() + .open_tablet(tablet_ctx.clone(), &path) + .unwrap(); assert!(reg.tablet_factory().exists(&path)); // Second attempt should fail with lock. 
reg.tablet_factory() - .open_tablet(1, Some(3), &path) + .open_tablet(tablet_ctx.clone(), &path) .unwrap_err(); drop(engine); + tablet_ctx.suffix = Some(3); reg.tablet_factory() - .destroy_tablet(1, Some(3), &path) + .destroy_tablet(tablet_ctx, &path) .unwrap(); assert!(!reg.tablet_factory().exists(&path)); } diff --git a/src/storage/config.rs b/src/storage/config.rs index 3501cefa252..68d739c1639 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -31,12 +31,21 @@ const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "kebab-case")] +pub enum EngineType { + RaftKv, + RaftKv2, +} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { #[online_config(skip)] pub data_dir: String, + #[online_config(skip)] + pub engine: EngineType, // Replaced by `GcConfig.ratio_threshold`. Keep it for backward compatibility. 
#[online_config(skip)] pub gc_ratio_threshold: f64, @@ -75,6 +84,7 @@ impl Default for Config { let cpu_num = SysQuota::cpu_cores_quota(); Config { data_dir: DEFAULT_DATA_DIR.to_owned(), + engine: EngineType::RaftKv, gc_ratio_threshold: DEFAULT_GC_RATIO_THRESHOLD, max_key_size: DEFAULT_MAX_KEY_SIZE, scheduler_concurrency: DEFAULT_SCHED_CONCURRENCY, diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index f02ee31c5f2..12a7776e434 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -12,7 +12,7 @@ use kvproto::kvrpcpb::ApiVersion; use tikv_util::config::ReadableSize; use crate::storage::{ - config::BlockCacheConfig, + config::{BlockCacheConfig, EngineType}, kv::{Result, RocksEngine}, }; @@ -102,10 +102,20 @@ impl TestEngineBuilder { .map(|cf| match *cf { CF_DEFAULT => ( CF_DEFAULT, - cfg_rocksdb.defaultcf.build_opt(&cache, None, api_version), + cfg_rocksdb + .defaultcf + .build_opt(&cache, None, api_version, EngineType::RaftKv), + ), + CF_LOCK => ( + CF_LOCK, + cfg_rocksdb.lockcf.build_opt(&cache, EngineType::RaftKv), + ), + CF_WRITE => ( + CF_WRITE, + cfg_rocksdb + .writecf + .build_opt(&cache, None, EngineType::RaftKv), ), - CF_LOCK => (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - CF_WRITE => (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), _ => (*cf, RocksCfOptions::default()), }) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 79f48c68a88..05d5c743d76 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3593,6 +3593,7 @@ mod tests { use txn_types::{Mutation, PessimisticLock, WriteType, SHORT_VALUE_MAX_LEN}; use super::{ + config::EngineType, mvcc::tests::{must_unlocked, must_written}, test_util::*, txn::{ @@ -4137,12 +4138,23 @@ mod tests { let cfs_opts = vec![ ( CF_DEFAULT, + cfg_rocksdb.defaultcf.build_opt( + &cache, + None, + ApiVersion::V1, + EngineType::RaftKv, + ), + ), + ( + 
CF_LOCK, + cfg_rocksdb.lockcf.build_opt(&cache, EngineType::RaftKv), + ), + ( + CF_WRITE, cfg_rocksdb - .defaultcf - .build_opt(&cache, None, ApiVersion::V1), + .writecf + .build_opt(&cache, None, EngineType::RaftKv), ), - (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), ]; RocksEngine::new( diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 973ed245ac8..922e986874a 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -291,7 +291,7 @@ impl TabletFlowController { #[cfg(test)] mod tests { use engine_rocks::FlowInfo; - use engine_traits::SingletonFactory; + use engine_traits::{SingletonFactory, TabletContext}; use tempfile::TempDir; use super::{ @@ -327,7 +327,8 @@ mod tests { let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; - reg.load(region_id, tablet_suffix, false).unwrap(); + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + reg.load(tablet_context, false).unwrap(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -354,7 +355,8 @@ mod tests { let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; - let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); @@ -373,7 +375,8 @@ mod tests { let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 
5_u64; - let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); @@ -392,7 +395,8 @@ mod tests { let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; - let mut cached = reg.load(region_id, tablet_suffix, false).unwrap(); + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 6341f3a9e27..73dfdbaa977 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -32,7 +32,7 @@ use tikv::{ lock_manager::Config as PessimisticTxnConfig, Config as ServerConfig, }, storage::config::{ - BlockCacheConfig, Config as StorageConfig, FlowControlConfig, IoRateLimitConfig, + BlockCacheConfig, Config as StorageConfig, EngineType, FlowControlConfig, IoRateLimitConfig, }, }; use tikv_util::config::{LogFormat, ReadableDuration, ReadableSize}; @@ -308,7 +308,9 @@ fn test_serde_custom_tikv_config() { writable_file_max_buffer_size: ReadableSize::mb(12), use_direct_io_for_flush_and_compaction: true, enable_pipelined_write: false, - enable_multi_batch_write: true, + enable_multi_batch_write: Some(true), + paranoid_checks: None, + allow_concurrent_memtable_write: Some(false), enable_unordered_write: true, defaultcf: DefaultCfConfig { block_size: ReadableSize::kb(12), @@ -665,6 +667,7 @@ fn test_serde_custom_tikv_config() { raft_engine_config.memory_limit = Some(RaftEngineReadableSize::gb(1)); value.storage = StorageConfig { data_dir: 
"/var".to_owned(), + engine: EngineType::RaftKv2, gc_ratio_threshold: 1.2, max_key_size: 4096, scheduler_concurrency: 123, @@ -758,8 +761,13 @@ fn test_serde_custom_tikv_config() { ..Default::default() }; value.backup_stream = BackupStreamConfig { - num_threads: 12, - ..Default::default() + max_flush_interval: ReadableDuration::secs(11), + num_threads: 7, + enable: true, + temp_path: "./stream".to_string(), + file_size_limit: ReadableSize::gb(5), + initial_scan_pending_memory_quota: ReadableSize::kb(2), + initial_scan_rate_limit: ReadableSize::mb(3), }; value.import = ImportConfig { num_threads: 123, @@ -817,6 +825,7 @@ fn test_serde_custom_tikv_config() { } } +#[track_caller] fn diff_config(lhs: &TikvConfig, rhs: &TikvConfig) { let lhs_str = format!("{:?}", lhs); let rhs_str = format!("{:?}", rhs); diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index f22538a6f78..961eb59a77b 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -90,6 +90,7 @@ a = "b" [storage] data-dir = "/var" +engine = "raft-kv2" gc-ratio-threshold = 1.2 max-key-size = 4096 scheduler-concurrency = 123 @@ -268,7 +269,9 @@ max-sub-compactions = 12 writable-file-max-buffer-size = "12MB" use-direct-io-for-flush-and-compaction = true enable-pipelined-write = false +enable-multi-batch-write = true enable-unordered-write = true +allow-concurrent-memtable-write = false [rocksdb.titan] enabled = true @@ -624,6 +627,15 @@ batch-size = 7 s3-multi-part-size = "15MB" sst-max-size = "789MB" +[log-backup] +max-flush-interval = "11s" +num-threads = 7 +enable = true +temp-path = "./stream" +file-size-limit = "5GiB" +initial-scan-pending-memory-quota = "2KiB" +initial-scan-rate-limit = "3MiB" + [backup.hadoop] home = "/root/hadoop" linux-user = "hadoop" diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index c0a9ee8b1ed..d1abbcb924c 100644 --- 
a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,10 +159,10 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg.rocksdb.build_opt(); - let kv_cfs_opts = cfg - .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); + let kv_db_opts = cfg.rocksdb.build_opt(None); + let kv_cfs_opts = + cfg.rocksdb + .build_cf_opts(&cache, None, cfg.storage.api_version(), cfg.storage.engine); let raft_path = path.path().join(Path::new("titan")); let engines = Engines::new( From 69cdc1e2e25a8dd623973295322e96138d77cf79 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 13 Dec 2022 17:08:52 +0800 Subject: [PATCH 403/676] raftstore-v2: deny unused (#13933) ref tikv/tikv#12842 Now most structs and functions are used, add the lint back to keep code clean. Signed-off-by: Jay Lee --- components/raftstore-v2/Cargo.toml | 4 +- components/raftstore-v2/src/batch/store.rs | 34 ++++----- components/raftstore-v2/src/fsm/apply.rs | 12 +--- components/raftstore-v2/src/fsm/peer.rs | 11 ++- components/raftstore-v2/src/fsm/store.rs | 7 +- components/raftstore-v2/src/lib.rs | 1 - .../operation/command/admin/conf_change.rs | 9 +-- .../src/operation/command/admin/mod.rs | 21 ++---- .../src/operation/command/admin/split.rs | 70 +++++++------------ .../command/admin/transfer_leader.rs | 8 +-- .../src/operation/command/control.rs | 13 +--- .../raftstore-v2/src/operation/command/mod.rs | 37 +++------- .../src/operation/command/write/mod.rs | 20 +++--- .../operation/command/write/simple_write.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 4 +- components/raftstore-v2/src/operation/pd.rs | 17 ++--- .../raftstore-v2/src/operation/query/lease.rs | 9 ++- .../raftstore-v2/src/operation/query/local.rs | 17 ++--- .../raftstore-v2/src/operation/query/mod.rs | 
26 +++---- .../src/operation/query/replica.rs | 5 +- .../src/operation/ready/async_writer.rs | 1 - .../raftstore-v2/src/operation/ready/mod.rs | 30 +++----- .../src/operation/ready/snapshot.rs | 32 ++++----- components/raftstore-v2/src/raft/apply.rs | 9 +-- components/raftstore-v2/src/raft/peer.rs | 34 ++------- components/raftstore-v2/src/raft/storage.rs | 29 ++++---- components/raftstore-v2/src/router/imp.rs | 2 - .../src/router/internal_message.rs | 2 - components/raftstore-v2/src/router/message.rs | 2 - .../src/router/response_channel.rs | 2 +- components/raftstore-v2/src/worker/mod.rs | 4 +- components/raftstore-v2/src/worker/pd/mod.rs | 51 +------------- .../src/worker/pd/region_heartbeat.rs | 13 +--- .../src/worker/pd/store_heartbeat.rs | 1 - 34 files changed, 165 insertions(+), 376 deletions(-) diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 1d6b67ad129..4d3d44ec6fd 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -71,9 +71,9 @@ test_util = { workspace = true } [[test]] name = "raftstore-v2-failpoints" path = "tests/failpoints/mod.rs" -required-features = ["failpoints", "testexport"] +required-features = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] [[test]] name = "raftstore-v2-integrations" path = "tests/integrations/mod.rs" -required-features = ["testexport"] +required-features = ["testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 997f8da7a9c..ac767bcd7ce 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -2,7 +2,6 @@ use std::{ ops::{Deref, DerefMut}, - path::Path, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, @@ -16,15 +15,10 @@ use batch_system::{ use causal_ts::CausalTsProviderImpl; use collections::HashMap; use 
concurrency_manager::ConcurrencyManager; -use crossbeam::channel::{Sender, TrySendError}; -use engine_traits::{Engines, KvEngine, RaftEngine, TabletRegistry}; +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType}; -use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::{ - disk_usage::DiskUsage, - metapb::Store, - raft_serverpb::{PeerState, RaftMessage}, -}; +use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; use pd_client::PdClient; use raft::INVALID_ID; use raftstore::store::{ @@ -35,8 +29,6 @@ use slog::Logger; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, - defer, - future::poll_future_notify, sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, @@ -50,7 +42,7 @@ use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::{PdRunner, PdTask}, + worker::pd, Error, Result, }; @@ -83,7 +75,7 @@ pub struct StoreContext { pub self_disk_usage: DiskUsage, pub snap_mgr: TabletSnapManager, - pub pd_scheduler: Scheduler, + pub pd_scheduler: Scheduler, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. 
@@ -208,7 +200,7 @@ impl PollHandler>>]) {} + fn end(&mut self, _batch: &mut [Option>>]) {} fn pause(&mut self) { if self.poll_ctx.trans.need_flush() { @@ -231,7 +223,7 @@ struct StorePollerBuilder { trans: T, router: StoreRouter, read_scheduler: Scheduler>, - pd_scheduler: Scheduler, + pd_scheduler: Scheduler, write_senders: WriteSenders, apply_pool: FuturePool, logger: Logger, @@ -248,7 +240,7 @@ impl StorePollerBuilder { trans: T, router: StoreRouter, read_scheduler: Scheduler>, - pd_scheduler: Scheduler, + pd_scheduler: Scheduler, store_writers: &mut StoreWriters, logger: Logger, store_meta: Arc>, @@ -285,7 +277,7 @@ impl StorePollerBuilder { fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); - let mut meta = self.store_meta.lock().unwrap(); + let meta = self.store_meta.lock().unwrap(); self.engine .for_each_raft_group::(&mut |region_id| { assert_ne!(region_id, INVALID_ID); @@ -317,7 +309,7 @@ impl StorePollerBuilder { Ok(regions) } - fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { + fn clean_up_tablets(&self, _peers: &HashMap>) -> Result<()> { // TODO: list all available tablets and destroy those which are not in the // peers. 
Ok(()) @@ -332,7 +324,7 @@ where { type Handler = StorePoller; - fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { + fn build(&mut self, _priority: batch_system::Priority) -> Self::Handler { let cfg = self.cfg.value().clone(); let poll_ctx = StoreContext { logger: self.logger.clone(), @@ -426,7 +418,7 @@ impl StoreSystem { let pd_scheduler = workers.pd_worker.start( "pd-worker", - PdRunner::new( + pd::Runner::new( store_id, pd_client, raft_engine.clone(), @@ -440,7 +432,7 @@ impl StoreSystem { ), ); - let mut builder = StorePollerBuilder::new( + let builder = StorePollerBuilder::new( cfg.clone(), store_id, raft_engine, diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 7e9a135b498..2065c5d7fd4 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -1,19 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - pin::Pin, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - task::{Context, Poll}, - time::{Duration, Instant}, -}; +use std::time::{Duration, Instant}; use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, TabletRegistry}; -use futures::{compat::Future01CompatExt, Future, FutureExt, StreamExt}; +use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; use slog::Logger; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index c4dded64e62..8d497a7e4e5 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -6,17 +6,15 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, RaftEngine, TabletFactory, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use 
raftstore::store::{Config, LocksStatus, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, time::{duration_to_sec, Instant}, - yatp_pool::FuturePool, }; -use super::ApplyFsm; use crate::{ batch::StoreContext, raft::{Peer, Storage}, @@ -237,10 +235,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), - PeerMsg::SplitInitFinish(region_id) => self - .fsm - .peer - .on_split_init_finish(self.store_ctx, region_id), + PeerMsg::SplitInitFinish(region_id) => { + self.fsm.peer.on_split_init_finish(region_id) + } PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 73702500e19..349d5ad3252 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -6,11 +6,7 @@ use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::{metapb::Region, raft_serverpb::RaftMessage}; -use raftstore::{ - coprocessor::RegionChangeReason, - store::{Config, ReadDelegate, RegionReadProgressRegistry}, -}; +use raftstore::store::{Config, ReadDelegate, RegionReadProgressRegistry}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -20,7 +16,6 @@ use tikv_util::{ use crate::{ batch::StoreContext, - raft::Peer, router::{StoreMsg, StoreTick}, }; diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 2a9d5faabd5..bac66b34acc 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -21,7 +21,6 @@ // Functionalities like 
read, write, etc should be implemented in [`operation`] // using a standalone modules. -#![allow(unused)] #![feature(let_chains)] #![feature(array_windows)] #![feature(div_duration)] diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 69e318c3a2e..4bda7eedf32 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -9,7 +9,6 @@ use std::time::Instant; -use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb::{self, PeerRole}, @@ -18,7 +17,6 @@ use kvproto::{ }; use protobuf::Message; use raft::prelude::*; -use raft_proto::ConfChangeI; use raftstore::{ store::{ metrics::{PEER_ADMIN_CMD_COUNTER_VEC, PEER_PROPOSE_LOG_SIZE_HISTOGRAM}, @@ -34,7 +32,6 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, raft::{Apply, Peer}, - router::ApplyRes, }; /// The apply result of conf change. 
@@ -56,7 +53,7 @@ impl Peer { pub fn propose_conf_change( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ) -> Result { if self.raft_group().raft.has_pending_conf() { info!( @@ -67,7 +64,6 @@ impl Peer { } let data = req.write_to_bytes()?; let admin = req.get_admin_request(); - let leader_role = self.peer().get_role(); if admin.has_change_peer() { self.propose_conf_change_imp(ctx, admin.get_change_peer(), data) } else if admin.has_change_peer_v2() { @@ -229,7 +225,6 @@ impl Apply { legacy: bool, ) -> Result<(AdminResponse, AdminCmdResult)> { let region = self.region_state().get_region(); - let peer_id = self.peer().get_id(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch()); let mut new_region = region.clone(); @@ -284,7 +279,7 @@ impl Apply { } let mut resp = AdminResponse::default(); resp.mut_change_peer().set_region(new_region); - let mut conf_change = ConfChangeResult { + let conf_change = ConfChangeResult { index, conf_change: cc, changes: changes.to_vec(), diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 388bf72e01e..d07c1b4a35c 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -5,29 +5,16 @@ mod split; mod transfer_leader; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}; +use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; -use raft::prelude::ConfChangeV2; -use raftstore::{ - store::{ - self, cmd_resp, - fsm::apply, - msg::ErrorCallback, - util::{ChangePeerI, ConfChangeKind}, - }, - Result, -}; +use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; pub use split::{SplitInit, SplitResult, 
SPLIT_PREFIX}; use tikv_util::box_err; use txn_types::WriteBatchFlags; use self::conf_change::ConfChangeResult; -use crate::{ - batch::StoreContext, - raft::{Apply, Peer}, - router::CmdResChannel, -}; +use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; #[derive(Debug)] pub enum AdminCmdResult { @@ -43,7 +30,7 @@ impl Peer { pub fn on_admin_command( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ch: CmdResChannel, ) { if !self.serving() { diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 157150126b4..8ca4c7a55f6 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,43 +25,37 @@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::{cmp, collections::VecDeque}; +use std::cmp; use collections::HashSet; -use crossbeam::channel::{SendError, TrySendError}; -use engine_traits::{ - Checkpointer, DeleteStrategy, KvEngine, RaftEngine, RaftLogBatch, Range, TabletContext, - CF_DEFAULT, -}; +use crossbeam::channel::SendError; +use engine_traits::{Checkpointer, KvEngine, RaftEngine, TabletContext}; use fail::fail_point; -use keys::enc_end_key; use kvproto::{ - metapb::{self, Region, RegionEpoch}, + metapb::{self, Region}, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, - raft_serverpb::{RaftMessage, RaftSnapshotData, RegionLocalState}, + raft_serverpb::RaftSnapshotData, }; use protobuf::Message; -use raft::{prelude::Snapshot, RawNode, INVALID_ID}; +use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ - coprocessor::RegionChangeReason, store::{ fsm::apply::validate_batch_split, metrics::PEER_ADMIN_CMD_COUNTER, snap::TABLET_SNAPSHOT_VERSION, util::{self, KeysInfoFormatter}, - PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + 
PeerPessimisticLocks, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, Result, }; -use slog::{error, info, warn, Logger}; -use tikv_util::box_err; +use slog::info; use crate::{ batch::StoreContext, - fsm::{ApplyResReporter, PeerFsmDelegate}, + fsm::ApplyResReporter, operation::AdminCmdResult, - raft::{write_initial_states, Apply, Peer, Storage}, - router::{ApplyRes, PeerMsg, StoreMsg}, + raft::{Apply, Peer}, + router::{PeerMsg, StoreMsg}, }; pub const SPLIT_PREFIX: &str = "split_"; @@ -314,17 +308,10 @@ impl Peer { }; fail_point!("on_split_invalidate_locks"); - // Roughly estimate the size and keys for new regions. - let new_region_count = regions.len() as u64; { let mut meta = store_ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); - self.set_region( - reader, - derived.clone(), - RegionChangeReason::Split, - tablet_index, - ); + self.set_region(reader, derived.clone(), tablet_index); } self.post_split(); @@ -454,9 +441,9 @@ impl Peer { .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); } - pub fn on_split_init_finish(&mut self, ctx: &mut StoreContext, region_id: u64) { + pub fn on_split_init_finish(&mut self, region_id: u64) { let mut found = false; - for (tablet_index, ids) in self.split_trace_mut() { + for (_, ids) in self.split_trace_mut() { if ids.remove(®ion_id) { found = true; break; @@ -476,6 +463,8 @@ impl Peer { if off > 0 { // There should be very few elements in the vector. split_trace.drain(..off); + // TODO: save admin_flushed. + assert_ne!(admin_flushed, 0); // Persist admin flushed. 
self.set_has_ready(); } @@ -484,39 +473,30 @@ impl Peer { #[cfg(test)] mod test { - use std::sync::{ - mpsc::{channel, Receiver, Sender}, - Arc, - }; + use std::sync::mpsc::{channel, Receiver, Sender}; - use collections::HashMap; use engine_test::{ ctor::{CfOptions, DbOptions}, kv::TestTabletFactory, - raft, }; - use engine_traits::{CfOptionsExt, Peekable, TabletRegistry, WriteBatch, DATA_CFS}; - use futures::channel::mpsc::unbounded; + use engine_traits::{ + Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + }; use kvproto::{ metapb::RegionEpoch, - raft_cmdpb::{AdminCmdType, BatchSplitRequest, PutRequest, RaftCmdResponse, SplitRequest}, - raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}, + raft_cmdpb::{BatchSplitRequest, SplitRequest}, + raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::{cmd_resp::new_error, Config, ReadRunner}; + use raftstore::store::cmd_resp::new_error; use slog::o; use tempfile::TempDir; use tikv_util::{ - codec::bytes::encode_bytes, - config::VersionTrack, store::{new_learner_peer, new_peer}, - worker::{dummy_future_scheduler, dummy_scheduler, FutureScheduler, Scheduler, Worker}, + worker::dummy_scheduler, }; use super::*; - use crate::{ - fsm::{ApplyFsm, ApplyResReporter}, - raft::Apply, - }; + use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; struct MockReporter { sender: Sender, diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 71853d0007b..e8105a66322 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -18,7 +18,7 @@ use raft::{eraftpb, ProgressState, Storage}; use raftstore::{ store::{ fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, - LocksStatus, Transport, TRANSFER_LEADER_COMMAND_REPLY_CTX, + 
LocksStatus, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, Result, }; @@ -29,7 +29,7 @@ use txn_types::WriteBatchFlags; use super::AdminCmdResult; use crate::{ batch::StoreContext, - fsm::{ApplyResReporter, PeerFsmDelegate}, + fsm::ApplyResReporter, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick}, }; @@ -199,7 +199,7 @@ impl Peer { cmd.mut_admin_request() .set_cmd_type(AdminCmdType::TransferLeader); cmd.mut_admin_request().mut_transfer_leader().set_peer(from); - if let (PeerMsg::RaftCommand(req), sub) = PeerMsg::raft_command(cmd) { + if let PeerMsg::RaftCommand(req) = PeerMsg::raft_command(cmd).0 { self.on_admin_command(ctx, req.request, req.ch); } else { unreachable!(); @@ -380,7 +380,7 @@ impl Peer { self.logger, "propose {} locks before transferring leader", cmd.get_requests().len(); ); - let (PeerMsg::RaftCommand(req), sub) = PeerMsg::raft_command(cmd) else {unreachable!()}; + let PeerMsg::RaftCommand(req) = PeerMsg::raft_command(cmd).0 else {unreachable!()}; self.on_write_command(ctx, req.request, req.ch); true } diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index 5fb25b4e20d..b330d0093fe 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -1,11 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{collections::LinkedList, mem, num::NonZeroU64}; +use std::{collections::LinkedList, mem}; -use kvproto::{ - metapb, - raft_cmdpb::{AdminCmdType, RaftCmdRequest}, -}; +use kvproto::{metapb, raft_cmdpb::AdminCmdType}; use raftstore::{ store::{ cmd_resp, @@ -263,12 +260,6 @@ impl Drop for ProposalControl { mod tests { use super::*; - fn new_admin_request(cmd_type: AdminCmdType) -> RaftCmdRequest { - let mut request = RaftCmdRequest::default(); - request.mut_admin_request().set_cmd_type(cmd_type); - request - } - #[test] fn test_proposal_control() { let region = metapb::Region::default(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 3d0a17ece62..6daa8f2770c 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,13 +16,9 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. -use std::cmp; - -use batch_system::{Fsm, FsmScheduler, Mailbox}; use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; -use kvproto::{ - raft_cmdpb::{AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader}, - raft_serverpb::RegionLocalState, +use kvproto::raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, }; use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; @@ -31,16 +27,12 @@ use raftstore::{ store::{ cmd_resp, fsm::{ - apply::{ - self, APPLY_WB_SHRINK_SIZE, DEFAULT_APPLY_WB_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP, - }, + apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, local_metrics::RaftMetrics, - metrics::*, msg::ErrorCallback, - util::{self, admin_cmd_epoch_lookup}, - WriteCallback, + util, WriteCallback, }, Error, Result, }; @@ -50,9 +42,8 @@ use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, fsm::{ApplyFsm, 
ApplyResReporter, PeerFsmDelegate}, - operation::GenSnapTask, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, + router::{ApplyRes, ApplyTask, CmdResChannel}, }; mod admin; @@ -122,7 +113,6 @@ impl Peer { pub fn schedule_apply_fsm(&mut self, store_ctx: &mut StoreContext) { let region_state = self.storage().region_state().clone(); let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); - let tablet = self.tablet().clone(); let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( @@ -165,7 +155,7 @@ impl Peer { return Err(e); } if let Err(mut e) = util::check_region_epoch(req, self.region(), true) { - if let Error::EpochNotMatch(_, new_regions) = &mut e { + if let Error::EpochNotMatch(_, _new_regions) = &mut e { // TODO: query sibling regions. metrics.invalid_proposal.epoch_not_match.inc(); } @@ -247,15 +237,10 @@ impl Peer { } #[inline] - pub fn schedule_apply_committed_entries( - &mut self, - ctx: &mut StoreContext, - committed_entries: Vec, - ) { - let last_entry = match committed_entries.last() { - Some(e) => e, - None => return, - }; + pub fn schedule_apply_committed_entries(&mut self, committed_entries: Vec) { + if committed_entries.is_empty() { + return; + } let current_term = self.term(); let mut entry_and_proposals = vec![]; let queue = self.proposals_mut(); @@ -511,7 +496,7 @@ impl Apply { let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); if let Err(e) = wb.write_opt(&write_opt) { - panic!("failed to write data: {:?}", self.logger.list()); + panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); } if wb.data_size() <= APPLY_WB_SHRINK_SIZE { wb.clear(); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index f9cac15d899..92f260bad26 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ 
b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,16 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{KvEngine, Mutable, RaftEngine, CF_DEFAULT}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +use kvproto::raft_cmdpb::RaftCmdRequest; use raftstore::{ store::{ cmd_resp, - fsm::{apply, Proposal, MAX_PROPOSAL_SIZE_RATIO}, + fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, msg::ErrorCallback, util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, - WriteCallback, }, - Error, Result, + Result, }; use crate::{ @@ -24,7 +23,6 @@ mod simple_write; pub use simple_write::{SimpleWriteDecoder, SimpleWriteEncoder}; pub use self::simple_write::SimpleWrite; -use super::CommittedEntries; impl Peer { #[inline] @@ -93,7 +91,7 @@ impl Peer { NORMAL_REQ_CHECK_VER, true, ); - if let Err(mut e) = res { + if let Err(e) = res { // TODO: query sibling regions. ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); encoder.encode().1.report_error(cmd_resp::new_error(e)); @@ -173,12 +171,12 @@ impl Apply { #[inline] pub fn apply_delete_range( &mut self, - cf: &str, - start_key: &[u8], - end_key: &[u8], - notify_only: bool, + _cf: &str, + _start_key: &[u8], + _end_key: &[u8], + _notify_only: bool, ) -> Result<()> { - /// TODO: reuse the same delete as split/merge. + // TODO: reuse the same delete as split/merge. Ok(()) } } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index ca9e7d39366..c4cb9d6bc89 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -31,7 +31,7 @@ impl SimpleWriteEncoder { /// If `notify_proposed` is true, channels will be called `notify_proposed` /// when it's appended. 
pub fn new( - mut req: RaftCmdRequest, + req: RaftCmdRequest, size_limit: usize, notify_proposed: bool, ) -> Result { @@ -346,7 +346,7 @@ fn encode(req: &Request, buf: &mut Vec) { #[inline] fn decode<'a>(buf: &mut &'a [u8]) -> Option> { - let (tag, mut left) = buf.split_first()?; + let (tag, left) = buf.split_first()?; match *tag { PUT_TAG => { let (cf, left) = decode_cf(left); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index ca610de1bfc..60889908aa0 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -149,7 +149,6 @@ impl Store { } else { return; }; - let msg_type = msg.get_message().get_msg_type(); let from_peer = msg.get_from_peer(); let to_peer = msg.get_to_peer(); // Now the peer should not exist. @@ -239,9 +238,10 @@ impl Store { } }; let mailbox = BasicMailbox::new(tx, fsm, ctx.router.state_cnt().clone()); - if let Err((p, _)) = ctx + if ctx .router .send_and_register(region_id, mailbox, PeerMsg::Start) + .is_err() { panic!( "[region {}] {} failed to register peer", diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 7df27670a35..d80cee3c7d1 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -2,21 +2,18 @@ //! This module implements the interactions with pd. 
-use std::cmp; - use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::Transport; use slog::error; -use tikv_util::time::InstantExt; use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, raft::Peer, router::{PeerTick, StoreTick}, - worker::{PdRegionHeartbeatTask, PdTask}, + worker::pd, }; impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { @@ -55,7 +52,7 @@ impl Store { // stats.set_query_stats(query_stats); - let task = PdTask::StoreHeartbeat { stats }; + let task = pd::Task::StoreHeartbeat { stats }; if let Err(e) = ctx.pd_scheduler.schedule(task) { error!(self.logger(), "notify pd failed"; "store_id" => self.store_id(), @@ -80,7 +77,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { #[inline] pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { - let task = PdTask::RegionHeartbeat(PdRegionHeartbeatTask { + let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), @@ -163,7 +160,7 @@ impl Peer { #[inline] pub fn destroy_peer_pd(&self, ctx: &StoreContext) { - let task = PdTask::DestroyPeer { + let task = pd::Task::DestroyPeer { region_id: self.region_id(), }; if let Err(e) = ctx.pd_scheduler.schedule(task) { @@ -179,7 +176,7 @@ impl Peer { #[inline] pub fn ask_batch_split_pd(&self, ctx: &StoreContext, split_keys: Vec>) { - let task = PdTask::AskBatchSplit { + let task = pd::Task::AskBatchSplit { region: self.region().clone(), split_keys, peer: self.peer().clone(), @@ -202,7 +199,7 @@ impl Peer { ctx: &StoreContext, regions: Vec, ) { - let task = PdTask::ReportBatchSplit { regions }; + let task = pd::Task::ReportBatchSplit { regions }; if let Err(e) = ctx.pd_scheduler.schedule(task) { error!( self.logger, @@ -214,7 +211,7 @@ impl Peer { #[inline] pub fn 
update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { - let task = PdTask::UpdateMaxTimestamp { + let task = pd::Task::UpdateMaxTimestamp { region_id: self.region_id(), initial_status, txn_ext: self.txn_ext().clone(), diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 4455ea099f4..ca92729ee6f 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -1,13 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{Arc, Mutex}; +use std::sync::Mutex; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::RaftCmdRequest; use raftstore::store::{ can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, - ReadIndexRequest, ReadProgress, TrackVer, Transport, + ReadIndexRequest, ReadProgress, Transport, }; use slog::debug; use tikv_util::time::monotonic_raw_now; @@ -99,10 +99,9 @@ impl Peer { /// /// awake the read tasks waiting in frontend (such as unified thread pool) /// In v1, it's named as response_read. - pub(crate) fn respond_read_index( + pub(crate) fn respond_read_index( &self, read_index_req: &mut ReadIndexRequest, - ctx: &mut StoreContext, ) { debug!( self.logger, @@ -111,7 +110,7 @@ impl Peer { ); RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); let time = monotonic_raw_now(); - for (req, ch, mut read_index) in read_index_req.take_cmds().drain(..) { + for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) 
{ ch.read_tracker().map(|tracker| { GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 19f9a7e91b9..d24a4b9d899 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -17,17 +17,12 @@ use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ cmd_resp, util::LeaseState, LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, - ReadExecutorProvider, RegionSnapshot, RequestInspector, RequestPolicy, - TLS_LOCAL_READ_METRICS, + ReadExecutorProvider, RegionSnapshot, RequestPolicy, TLS_LOCAL_READ_METRICS, }, Error, Result, }; use slog::{debug, Logger}; -use tikv_util::{ - box_err, - codec::number::decode_u64, - time::{monotonic_raw_now, ThreadReadId}, -}; +use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now}; use time::Timespec; use txn_types::WriteBatchFlags; @@ -202,13 +197,13 @@ where let mut err = errorpb::Error::default(); match MsgRouter::send(&self.router, region_id, msg) { Ok(()) => return Ok(sub.result().await), - Err(TrySendError::Full(c)) => { + Err(TrySendError::Full(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); err.set_message(RAFTSTORE_IS_BUSY.to_owned()); err.mut_server_is_busy() .set_reason(RAFTSTORE_IS_BUSY.to_owned()); } - Err(TrySendError::Disconnected(c)) => { + Err(TrySendError::Disconnected(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); err.set_message(format!("region {} is missing", region_id)); err.mut_region_not_found().set_region_id(region_id); @@ -235,7 +230,7 @@ where let region_id = req.header.get_ref().region_id; TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); // Send a read query which may renew the lease - let (msg, sub) = 
PeerMsg::raft_query(req.clone()); + let msg = PeerMsg::raft_query(req.clone()).0; if let Err(e) = MsgRouter::send(&self.router, region_id, msg) { debug!( self.logger, @@ -685,7 +680,7 @@ mod tests { ch_tx.clone(), )) .unwrap(); - let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + block_on(reader.snapshot(cmd.clone())).unwrap(); // Updating lease makes cache miss. assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 3a3052ab902..ea66719314c 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -11,23 +11,21 @@ //! Follower's read index and replica read is implemenented replica module. //! Leader's read index and lease renew is implemented in lease module. -use std::{cmp, sync::Arc}; +use std::cmp; use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, StatusCmdType}, - raft_serverpb::RaftApplyState, }; use raft::Ready; use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, fsm::ApplyMetrics, local_metrics::RaftMetrics, - metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ErrorCallback, region_meta::RegionMeta, util, - util::LeaseState, GroupState, ReadCallback, ReadIndexContext, ReadProgress, RequestPolicy, - Transport, + cmd_resp, local_metrics::RaftMetrics, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ErrorCallback, region_meta::RegionMeta, util, util::LeaseState, GroupState, + ReadIndexContext, ReadProgress, RequestPolicy, Transport, }, Error, Result, }; @@ -40,8 +38,7 @@ use crate::{ fsm::PeerFsmDelegate, raft::Peer, router::{ - message::RaftRequest, ApplyRes, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, - ReadResponse, + message::RaftRequest, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, 
ReadResponse, }, }; @@ -146,7 +143,6 @@ impl Peer { // TODO: add flashback_state check // Check whether the store has the right peer to handle the request. - let leader_id = self.leader_id(); let request = msg.get_requests(); // TODO: add force leader @@ -158,7 +154,7 @@ impl Peer { let allow_replica_read = msg.get_header().get_replica_read(); if !self.is_leader() && !is_read_index_request && !allow_replica_read { raft_metrics.invalid_proposal.not_leader.inc(); - return Err(Error::NotLeader(self.region_id(), None)); + return Err(Error::NotLeader(self.region_id(), self.leader())); } // peer_id must be the same as peer's. @@ -186,7 +182,7 @@ impl Peer { fn read_index( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ch: QueryResChannel, ) { // TODO: add pre_read_index to handle splitting or merging @@ -222,7 +218,7 @@ impl Peer { if self.ready_to_handle_read() { while let Some(mut read) = self.pending_reads_mut().pop_front() { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } } } @@ -264,9 +260,9 @@ impl Peer { && read.cmds()[0].0.get_requests()[0].get_cmd_type() == CmdType::ReadIndex; if is_read_index_request { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } else if self.ready_to_handle_unsafe_replica_read(read.read_index.unwrap()) { - self.respond_replica_read(&mut read, ctx); + self.respond_replica_read(&mut read); } else { // TODO: `ReadIndex` requests could be blocked. 
self.pending_reads_mut().push_front(read); @@ -416,7 +412,7 @@ impl Peer { self.post_pending_read_index_on_replica(ctx) } else if self.ready_to_handle_read() { while let Some(mut read) = self.pending_reads_mut().pop_front() { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } } self.pending_reads_mut().gc(); diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index 9433cd10c52..fb00adbbc5a 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -62,10 +62,9 @@ impl Peer { self.set_has_ready(); } - pub(crate) fn respond_replica_read( + pub(crate) fn respond_replica_read( &self, read_index_req: &mut ReadIndexRequest, - ctx: &mut StoreContext, ) { debug!( self.logger, @@ -74,7 +73,7 @@ impl Peer { ); RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); let time = monotonic_raw_now(); - for (req, ch, mut read_index) in read_index_req.take_cmds().drain(..) { + for (req, ch, _) in read_index_req.take_cmds().drain(..) 
{ ch.read_tracker().map(|tracker| { GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index a7bce44fe05..e89854f39f4 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -86,7 +86,6 @@ impl AsyncWriter { } fn merge(&mut self, task: WriteTask) -> Option> { - let ready_number = task.ready_number(); if self.unpersisted_readies.is_empty() { // If this ready don't need to be persisted and there is no previous unpersisted // ready, we can safely consider it is persisted so the persisted msgs can be diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index baf66dfa6fc..e9046af2831 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -22,12 +22,9 @@ mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, MiscExt, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; -use kvproto::{ - raft_cmdpb::AdminCmdType, - raft_serverpb::{PeerState, RaftMessage, RaftSnapshotData}, -}; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; @@ -43,7 +40,6 @@ use crate::{ fsm::PeerFsmDelegate, raft::{Peer, Storage}, router::{ApplyTask, PeerTick}, - Result, }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -169,11 +165,7 @@ impl Peer { /// /// If the recipient can't be found, `None` is returned. 
#[inline] - fn build_raft_message( - &mut self, - ctx: &mut StoreContext, - msg: eraftpb::Message, - ) -> Option { + fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { @@ -265,7 +257,7 @@ impl Peer { } } } - self.schedule_apply_committed_entries(ctx, committed_entries); + self.schedule_apply_committed_entries(committed_entries); } /// Processing the ready of raft. A detail description of how it's handled @@ -320,7 +312,7 @@ impl Peer { if !ready.messages().is_empty() { debug_assert!(self.is_leader()); for msg in ready.take_messages() { - if let Some(msg) = self.build_raft_message(ctx, msg) { + if let Some(msg) = self.build_raft_message(msg) { self.send_raft_message(ctx, msg); } } @@ -347,7 +339,7 @@ impl Peer { write_task.messages = ready .take_persisted_messages() .into_iter() - .flat_map(|m| self.build_raft_message(ctx, m)) + .flat_map(|m| self.build_raft_message(m)) .collect(); } if !self.serving() { @@ -408,11 +400,11 @@ impl Peer { let persisted_number = self.async_writer.persisted_number(); self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.persisted_index(); - /// The apply snapshot process order would be: - /// - Get the snapshot from the ready - /// - Wait for async writer to load this tablet - /// In this step, the snapshot loading has been finished, but some apply - /// state need to update. + // The apply snapshot process order would be: + // - Get the snapshot from the ready + // - Wait for async writer to load this tablet + // In this step, the snapshot loading has been finished, but some apply + // state need to update. 
if has_snapshot { self.on_applied_snapshot(ctx); } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 5bf9fc27269..86817ab17d3 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -19,17 +19,16 @@ //! peer fsm, then Raft will get the snapshot. use std::{ - borrow::BorrowMut, fmt::{self, Debug}, fs, mem, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, - mpsc, Arc, + Arc, }, }; use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; -use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; use raft::eraftpb::Snapshot; use raftstore::store::{ @@ -37,13 +36,12 @@ use raftstore::store::{ TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, }; use slog::{error, info, warn}; -use tikv_util::{box_err, box_try, worker::Scheduler}; +use tikv_util::box_err; use crate::{ fsm::ApplyResReporter, operation::command::SPLIT_PREFIX, raft::{Apply, Peer, Storage}, - router::{ApplyTask, PeerTick}, Result, StoreContext, }; @@ -60,11 +58,9 @@ pub enum SnapState { impl PartialEq for SnapState { fn eq(&self, other: &SnapState) -> bool { match (self, other) { - (&SnapState::Relax, &SnapState::Relax) - | (&SnapState::Generating { .. }, &SnapState::Generating { .. }) => true, - (&SnapState::Generated(ref snap1), &SnapState::Generated(ref snap2)) => { - *snap1 == *snap2 - } + (SnapState::Relax, SnapState::Relax) + | (SnapState::Generating { .. }, SnapState::Generating { .. }) => true, + (SnapState::Generated(snap1), SnapState::Generated(snap2)) => *snap1 == *snap2, _ => false, } } @@ -203,8 +199,8 @@ impl Storage { /// unavailable snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { let mut snap_state = self.snap_state_mut(); - match *snap_state { - SnapState::Generating { ref canceled, .. } => { + match &*snap_state { + SnapState::Generating { canceled, .. } => { if canceled.load(Ordering::SeqCst) { self.cancel_generating_snap(None); } else { @@ -213,7 +209,7 @@ impl Storage { )); } } - SnapState::Generated(ref s) => { + SnapState::Generated(_) => { // TODO: `to` may not be equal to the generated snapshot. let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; if self.validate_snap(&snap, request_index) { @@ -331,9 +327,9 @@ impl Storage { let snap = res.unwrap(); let mut snap_state = self.snap_state_mut(); let SnapState::Generating { - ref canceled, - ref index, - } = *snap_state else { return false }; + index, + .. + } = &*snap_state else { return false }; if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { warn!( @@ -352,7 +348,7 @@ impl Storage { } pub fn on_applied_snapshot(&mut self) { - let mut entry = self.entry_storage_mut(); + let entry = self.entry_storage_mut(); let term = entry.truncated_term(); let index = entry.truncated_index(); entry.set_applied_term(term); @@ -428,7 +424,7 @@ impl Storage { let _ = fs::remove_dir_all(path); } }; - task.persisted_cb = (Some(Box::new(hook))); + task.persisted_cb = Some(Box::new(hook)); task.has_snapshot = true; Ok(()) } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index d4a4cf61602..30ced7bdbd7 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{mem, sync::Arc}; +use std::mem; use engine_traits::{CachedTablet, KvEngine, TabletRegistry, WriteBatch}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; @@ -8,12 +8,7 @@ use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; use tikv_util::worker::Scheduler; -use super::Peer; -use crate::{ - fsm::ApplyResReporter, - operation::AdminCmdResult, - router::{ApplyRes, CmdResChannel}, -}; +use crate::{operation::AdminCmdResult, router::CmdResChannel}; /// Apply applies all the committed commands to kv db. pub struct Apply { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 6111e75e691..f211313e1b5 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -12,33 +12,19 @@ use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletContext, TabletReg use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; -use raftstore::{ - coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, - store::{ - fsm::Proposal, - util::{Lease, RegionReadProgress}, - Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, - ReadProgress, TrackVer, TxnExt, - }, - Error, +use raftstore::store::{ + util::{Lease, RegionReadProgress}, + Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, + ReadProgress, TxnExt, }; -use slog::{debug, error, info, o, warn, Logger}; -use tikv_util::{ - box_err, - config::ReadableSize, - time::{monotonic_raw_now, Instant as TiInstant}, - worker::Scheduler, - Either, -}; -use time::Timespec; +use slog::Logger; -use super::{storage::Storage, Apply}; +use super::storage::Storage; use crate::{ batch::StoreContext, - fsm::{ApplyFsm, ApplyScheduler}, + fsm::ApplyScheduler, operation::{AsyncWriter, DestroyProgress, 
ProposalControl, SimpleWriteEncoder}, router::{CmdResChannel, PeerTick, QueryResChannel}, - worker::PdTask, Result, }; @@ -193,7 +179,6 @@ impl Peer { // host: &CoprocessorHost, reader: &mut ReadDelegate, region: metapb::Region, - reason: RegionChangeReason, tablet_index: u64, ) { if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() @@ -489,11 +474,6 @@ impl Peer { &mut self.destroy_progress } - #[inline] - pub(crate) fn has_applied_to_current_term(&self) -> bool { - self.entry_storage().applied_term() == self.term() - } - #[inline] pub fn simple_write_encoder_mut(&mut self) -> &mut Option { &mut self.raw_write_encoder diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index a27e79549e1..49a0f547e1a 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -3,7 +3,6 @@ use std::{ cell::{RefCell, RefMut}, fmt::{self, Debug, Formatter}, - sync::{mpsc::Receiver, Arc}, }; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; @@ -15,10 +14,8 @@ use raft::{ eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{ - util, EntryStorage, ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, -}; -use slog::{info, o, Logger}; +use raftstore::store::{util, EntryStorage, ReadTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use slog::{o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; use crate::{ @@ -366,31 +363,29 @@ impl raft::Storage for Storage { #[cfg(test)] mod tests { use std::{ - sync::mpsc::{sync_channel, SyncSender}, + sync::mpsc::{sync_channel, Receiver, SyncSender}, time::Duration, }; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactory}, - raft::RaftTestEngine, + kv::TestTabletFactory, }; use engine_traits::{ - KvEngine, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletContext, TabletRegistry, - 
DATA_CFS, + RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, }; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, }; - use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; + use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, - TabletSnapKey, TabletSnapManager, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, + TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::o; use tempfile::TempDir; - use tikv_util::worker::{Runnable, Worker}; + use tikv_util::worker::Worker; use super::*; use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; @@ -480,7 +475,7 @@ mod tests { let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); - let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); let mut s = Storage::new(4, 6, raft_engine.clone(), sched, &logger.clone()) @@ -533,7 +528,7 @@ mod tests { let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); - let mut s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) + let s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) .unwrap() .unwrap(); let (router, rx) = TestRouter::new(); @@ -577,7 +572,7 @@ mod tests { assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); 
apply.schedule_gen_snapshot(gen_task); - let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); s.cancel_generating_snap(None); assert_eq!(*s.snap_state.borrow(), SnapState::Relax); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 3dda00eb270..7208a6b5bef 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -8,9 +8,7 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raft::eraftpb::Snapshot as RaftSnapshot; use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; -use slog::Logger; use super::PeerMsg; use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 1507d404297..224723bf4ad 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use raftstore::store::fsm::ChangePeer; - use crate::operation::{AdminCmdResult, CommittedEntries, GenSnapTask}; #[derive(Debug)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 13037bd1a26..447efe8ee1a 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,9 +3,7 @@ // #[PerformanceCriticalPath] use std::fmt; -use engine_traits::Snapshot; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; -use raft::eraftpb::Snapshot as RaftSnapshot; use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; use tikv_util::time::Instant; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index b6da3c804f0..423c9e8e326 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -404,7 +404,7 @@ impl ReadCallback for QueryResChannel { type Response = QueryResult; #[inline] - fn set_result(mut self, res: QueryResult) { + fn set_result(self, res: QueryResult) { self.set_result(res); } diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index ad8249d22a4..3d4e69fdcf6 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,5 +1,3 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-mod pd; - -pub use pd::{RegionHeartbeatTask as PdRegionHeartbeatTask, Runner as PdRunner, Task as PdTask}; +pub mod pd; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 9803039e392..80e12dc53c7 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -2,10 +2,7 @@ use std::{ fmt::{self, Display, Formatter}, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, + sync::{atomic::AtomicBool, Arc}, }; use causal_ts::CausalTsProviderImpl; @@ -204,10 +201,9 @@ where } } -pub mod requests { +mod requests { use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, }; use raft::eraftpb::ConfChangeType; @@ -271,41 +267,6 @@ pub mod requests { req } - pub fn new_split_region_request( - split_key: Vec, - new_region_id: u64, - peer_ids: Vec, - right_derive: bool, - ) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::Split); - req.mut_split().set_split_key(split_key); - req.mut_split().set_new_region_id(new_region_id); - req.mut_split().set_new_peer_ids(peer_ids); - req.mut_split().set_right_derive(right_derive); - req - } - - pub fn new_batch_split_region_request( - split_keys: Vec>, - ids: Vec, - right_derive: bool, - ) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::BatchSplit); - req.mut_splits().set_right_derive(right_derive); - let mut requests = Vec::with_capacity(ids.len()); - for (mut id, key) in ids.into_iter().zip(split_keys) { - let mut split = SplitRequest::default(); - split.set_split_key(key); - split.set_new_region_id(id.get_new_region_id()); - split.set_new_peer_ids(id.take_new_peer_ids()); - requests.push(split); - } - req.mut_splits().set_requests(requests.into()); - req - } - pub fn new_transfer_leader_request( peer: metapb::Peer, peers: Vec, @@ -316,12 +277,4 @@ pub mod requests { 
req.mut_transfer_leader().set_peers(peers.into()); req } - - pub fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::PrepareMerge); - req.mut_prepare_merge() - .set_target(merge.get_target().to_owned()); - req - } } diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs index ad0293d0b6d..4096467087a 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs @@ -3,18 +3,9 @@ use std::time::Duration; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{ - metapb, pdpb, - raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, - }, - raft_serverpb::RaftMessage, - replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, -}; +use kvproto::{metapb, pdpb}; use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; -use raft::eraftpb::ConfChangeType; -use slog::{debug, error, info}; +use slog::{debug, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; use super::{requests::*, Runner}; diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 8f49e7f025f..2fbe378cff8 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -247,7 +247,6 @@ where // TODO: slow score - let router = self.router.clone(); let resp = self.pd_client.store_heartbeat(stats, None, None); let logger = self.logger.clone(); let f = async move { From 931cf7fd75c12900332a3a458f54ef7ef496c68b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Eeden?= Date: Tue, 13 Dec 2022 15:04:52 +0100 Subject: [PATCH 404/676] *: Update security policy (#13929) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit close tikv/tikv#13928 Signed-off-by: Daniël van Eeden Co-authored-by: Xiaoguang Sun Co-authored-by: Ti Chi Robot --- README.md | 2 +- security/SECURITY.md => SECURITY.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) rename security/SECURITY.md => SECURITY.md (98%) diff --git a/README.md b/README.md index 65bad6835ee..4b3e7e6c397 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ A third-party security auditing was performed by Cure53. See the full report [he To report a security vulnerability, please send an email to [TiKV-security](mailto:tikv-security@lists.cncf.io) group. -See [Security](./security/SECURITY.md) for the process and policy followed by the TiKV project. +See [Security](SECURITY.md) for the process and policy followed by the TiKV project. ## Communication diff --git a/security/SECURITY.md b/SECURITY.md similarity index 98% rename from security/SECURITY.md rename to SECURITY.md index 353a70f039f..30be9e0daf0 100644 --- a/security/SECURITY.md +++ b/SECURITY.md @@ -18,6 +18,8 @@ The following are the versions that we support for security updates | Version | Supported | | ------- | ------------------ | +| 6.x | :white_check_mark: | +| 5.x | :white_check_mark: | | 4.x | :white_check_mark: | | 3.x | :white_check_mark: | | 2.x | :white_check_mark: | @@ -94,4 +96,4 @@ IvCICV7zG1cyuM/Z2Y7/TJ+upvahP46nM3s3G15b8FYuTSmRN1Kp9+mBt2BHqOy1 ulx+VF4Lf9n3ydf593Nha9bMJ/rnSp01 =XbYK -----END PGP PUBLIC KEY BLOCK----- -``` \ No newline at end of file +``` From 5f2282594d356705abd39c42741ba902c1db6ede Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 14 Dec 2022 12:00:53 +0800 Subject: [PATCH 405/676] engine_trait: introduce flush state (#13925) ref tikv/tikv#12842 Flush state is used to trace persisted apply index. This is the first PR to remove WAL for raftstore v2. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 + components/engine_panic/src/raft_engine.rs | 40 +++- components/engine_rocks/src/engine.rs | 14 +- components/engine_rocks/src/event_listener.rs | 25 ++- components/engine_rocks/src/raft_engine.rs | 54 ++++- components/engine_traits/src/flush.rs | 202 ++++++++++++++++++ components/engine_traits/src/lib.rs | 2 + components/engine_traits/src/raft_engine.rs | 48 ++++- components/raft_log_engine/Cargo.toml | 4 + components/raft_log_engine/src/engine.rs | 200 ++++++++++++++++- components/raft_log_engine/src/lib.rs | 1 + components/raftstore-v2/src/operation/life.rs | 2 +- components/raftstore-v2/src/raft/storage.rs | 12 +- .../tests/integrations/test_life.rs | 6 +- .../raftstore/src/store/async_io/write.rs | 5 +- .../src/store/async_io/write_tests.rs | 12 +- 16 files changed, 579 insertions(+), 50 deletions(-) create mode 100644 components/engine_traits/src/flush.rs diff --git a/Cargo.lock b/Cargo.lock index 494846ccb0d..432d2ce3c26 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4251,6 +4251,7 @@ dependencies = [ name = "raft_log_engine" version = "0.0.1" dependencies = [ + "codec", "encryption", "engine_traits", "file_system", @@ -4265,6 +4266,7 @@ dependencies = [ "serde_derive", "slog", "slog-global", + "tempfile", "tikv_util", "time", "tracker", diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 603eb118c5c..f5e0c424db0 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -47,11 +47,23 @@ impl RaftEngineReadOnly for PanicEngine { panic!() } - fn get_region_state(&self, raft_group_id: u64) -> Result> { + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { panic!() } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { + fn 
get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { panic!() } @@ -186,11 +198,31 @@ impl RaftLogBatch for PanicWriteBatch { panic!() } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { panic!() } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 0c37120e7fc..70f6562e94b 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,7 +2,9 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; +use engine_traits::{ + FlushState, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, +}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ @@ -24,6 +26,7 @@ use crate::{ pub struct RocksEngine { db: Arc, support_multi_batch_write: bool, + flush_state: Option>, } impl RocksEngine { @@ -35,6 +38,7 @@ impl RocksEngine { RocksEngine { db: db.clone(), support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), + flush_state: None, } } @@ -49,6 +53,14 @@ impl RocksEngine { pub fn support_multi_batch_write(&self) -> bool { self.support_multi_batch_write } + + pub fn set_flush_state(&mut self, flush_state: Arc) { + self.flush_state = Some(flush_state); + } + + pub fn flush_state(&self) -> Option> { + self.flush_state.clone() + } } impl KvEngine for RocksEngine { diff --git a/components/engine_rocks/src/event_listener.rs 
b/components/engine_rocks/src/event_listener.rs index ad7a9de455f..8bf3035bc55 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -1,10 +1,11 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +use engine_traits::{PersistenceListener, RaftEngine}; use file_system::{get_io_type, set_io_type, IoType}; use regex::Regex; use rocksdb::{ - CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MutableStatus, - SubcompactionJobInfo, WriteStallInfo, + CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MemTableInfo, + MutableStatus, SubcompactionJobInfo, WriteStallInfo, }; use tikv_util::{error, metrics::CRITICAL_ERROR, set_panic_mark, warn, worker::Scheduler}; @@ -178,6 +179,26 @@ fn resolve_sst_filename_from_err(err: &str) -> Option { Some(filename) } +pub struct RocksPersistenceListener(PersistenceListener); + +impl RocksPersistenceListener { + pub fn new(listener: PersistenceListener) -> RocksPersistenceListener { + RocksPersistenceListener(listener) + } +} + +impl rocksdb::EventListener for RocksPersistenceListener { + fn on_memtable_sealed(&self, info: &MemTableInfo) { + self.0 + .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + } + + fn on_flush_completed(&self, job: &FlushJobInfo) { + self.0 + .on_flush_completed(job.cf_name(), job.smallest_seqno()); + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 79cd8350519..9095ef27dfd 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -144,14 +144,26 @@ impl RaftEngineReadOnly for RocksEngine { self.get_msg_cf(CF_DEFAULT, keys::PREPARE_BOOTSTRAP_KEY) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { - let key = keys::region_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + // Following methods are 
used by raftstore v2 only, which always use raft log + // engine. + fn get_region_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { - let key = keys::apply_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + fn get_apply_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() } fn get_recover_state(&self) -> Result> { @@ -405,12 +417,34 @@ impl RaftLogBatch for RocksWriteBatchVec { self.delete(keys::PREPARE_BOOTSTRAP_KEY) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { - self.put_msg(&keys::region_state_key(raft_group_id), state) + // Following methods are used by raftstore v2 only, which always use raft log + // engine. + fn put_region_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RaftApplyState, + ) -> Result<()> { + panic!() } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { - self.put_msg(&keys::apply_state_key(raft_group_id), state) + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() } fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs new file mode 100644 index 00000000000..9de5369ab54 --- /dev/null +++ b/components/engine_traits/src/flush.rs @@ -0,0 +1,202 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! A helper class to detect flush event and trace apply index. +//! +//! 
The whole idea is when all CFs have flushed to disk, then the apply index +//! should be able to be advanced to the latest. The implementations depends on +//! the assumption that memtable/write buffer is frozen one by one and flushed +//! one by one. +//! +//! Because apply index can be arbitrary value after restart, so apply related +//! states like `RaftApplyState` and `RegionLocalState` are mapped to index. +//! Once apply index is confirmed, the latest states before apply index should +//! be used as the start state. + +use std::{ + mem, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use kvproto::raft_serverpb::{RaftApplyState, RegionLocalState}; +use tikv_util::Either; + +use crate::{RaftEngine, RaftLogBatch}; + +#[derive(Debug)] +enum StateChange { + ApplyState(RaftApplyState), + RegionState(RegionLocalState), +} + +/// States that is related to apply progress. +#[derive(Default, Debug)] +struct StateChanges { + /// apply index, state change + changes: Vec<(u64, StateChange)>, +} + +struct FlushProgress { + cf: String, + id: u64, + apply_index: u64, + state_changes: StateChanges, +} + +/// A share state between raftstore and underlying engine. +/// +/// raftstore will update state changes and corresponding apply index, when +/// flush, `PersistenceListener` will query states related to the memtable +/// and persist the relation to raft engine. +#[derive(Default, Debug)] +pub struct FlushState { + applied_index: AtomicU64, + changes: Mutex, +} + +impl FlushState { + /// Set the latest applied index. + #[inline] + pub fn set_applied_index(&self, index: u64) { + self.applied_index.store(index, Ordering::Release); + } + + /// Query the applied index. + #[inline] + pub fn applied_index(&self) -> u64 { + self.applied_index.load(Ordering::Acquire) + } + + /// Record an apply state change. + /// + /// This can be triggered by admin command like compact log. 
General log + /// apply will not trigger the change, instead they are recorded by + /// `set_applied_index`. + #[inline] + pub fn update_apply_state(&self, index: u64, state: RaftApplyState) { + self.changes + .lock() + .unwrap() + .changes + .push((index, StateChange::ApplyState(state))); + } + + /// Record a region state change. + /// + /// This can be triggered by admin command like split/merge. + #[inline] + pub fn update_region_state(&self, index: u64, state: RegionLocalState) { + self.changes + .lock() + .unwrap() + .changes + .push((index, StateChange::RegionState(state))); + } + + /// Check if there is any state change. + #[inline] + pub fn is_empty(&self) -> bool { + self.changes.lock().unwrap().changes.is_empty() + } + + /// Get the last changed state. + #[inline] + pub fn last_state(&self) -> Option<(u64, Either)> { + let changes = self.changes.lock().unwrap(); + let (index, state) = changes.changes.last()?; + let state = match state { + StateChange::ApplyState(state) => Either::Left(state.clone()), + StateChange::RegionState(state) => Either::Right(state.clone()), + }; + Some((*index, state)) + } +} + +/// A flush listener that maps memtable to apply index and persist the relation +/// to raft engine. +pub struct PersistenceListener { + region_id: u64, + tablet_index: u64, + state: Arc, + progress: Mutex>, + raft: ER, +} + +impl PersistenceListener { + pub fn new(region_id: u64, tablet_index: u64, state: Arc, raft: ER) -> Self { + Self { + region_id, + tablet_index, + state, + progress: Mutex::new(Vec::new()), + raft, + } + } +} + +impl PersistenceListener { + pub fn flush_state(&self) -> &Arc { + &self.state + } + + /// Called when memtable is frozen. + /// + /// `id` should be unique between memtables, which is used to identify + /// memtable in the flushed event. + pub fn on_memtable_sealed(&self, cf: String, id: u64) { + // The correctness relies on the assumption that there will be only one + // thread writting to the DB and increasing apply index. 
+ let mut state_changes = self.state.changes.lock().unwrap(); + // Query within lock so it's correct even in manually flush. + let apply_index = self.state.applied_index.load(Ordering::SeqCst); + let changes = mem::take(&mut *state_changes); + drop(state_changes); + self.progress.lock().unwrap().push(FlushProgress { + cf, + id, + apply_index, + state_changes: changes, + }); + } + + /// Called a memtable finished flushing. + pub fn on_flush_completed(&self, cf: &str, id: u64) { + // Maybe we should hook the compaction to avoid the file is compacted before + // being recorded. + let pr = { + let mut prs = self.progress.lock().unwrap(); + let pos = prs + .iter() + .position(|pr| pr.cf == cf && pr.id == id) + .unwrap(); + prs.swap_remove(pos) + }; + let mut batch = self.raft.log_batch(1); + // TODO: It's possible that flush succeeds but fails to call + // `on_flush_completed` before exit. In this case the flushed data will + // be replayed again after restarted. To solve the problem, we need to + // (1) persist flushed file numbers in `on_flush_begin` and (2) check + // the file number in `on_compaction_begin`. After restart, (3) check if the + // file exists. If (1) && ((2) || (3)), then we don't need to replay the data. 
+ for (index, change) in pr.state_changes.changes { + match &change { + StateChange::ApplyState(state) => { + batch.put_apply_state(self.region_id, index, state).unwrap(); + } + StateChange::RegionState(state) => { + batch + .put_region_state(self.region_id, index, state) + .unwrap(); + } + } + } + if pr.apply_index != 0 { + batch + .put_flushed_index(self.region_id, cf, self.tablet_index, pr.apply_index) + .unwrap(); + } + self.raft.consume(&mut batch, true).unwrap(); + } +} diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 6a140230fd5..db95f5621e0 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -277,6 +277,8 @@ mod engine; pub use crate::engine::*; mod file_system; pub use crate::file_system::*; +mod flush; +pub use flush::*; mod import; pub use import::*; mod misc; diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 0c5e0f49854..8b29e07707a 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -19,8 +19,20 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { fn get_prepare_bootstrap_region(&self) -> Result>; fn get_raft_state(&self, raft_group_id: u64) -> Result>; - fn get_region_state(&self, raft_group_id: u64) -> Result>; - fn get_apply_state(&self, raft_group_id: u64) -> Result>; + /// Get the latest region state not after the apply index. + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the latest apply state not after the apply index. + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the flushed index of the given CF. 
+ fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -157,8 +169,36 @@ pub trait RaftLogBatch: Send { fn remove_prepare_bootstrap_region(&mut self) -> Result<()>; fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()>; - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()>; + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()>; + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()>; + + /// Record the flushed apply index. + /// + /// There are two types of apply index: + /// 1. Normal apply index that only related to single tablet. These apply + /// indexes are recorded using its own CF. + /// 2. Apply index that can affect other tablets, like split, merge. These + /// apply indexes are recorded using special Raft CF. + /// + /// Because a peer may have multiple tablets (only one is latest), we use + /// `tablet_index` to avoid conflicts. + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()>; /// Indicate whether region states should be recovered from raftdb and /// replay raft logs. 
diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 0ee185fd365..8a336177706 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" [dependencies] encryption = { workspace = true } engine_traits = { workspace = true } +codec = { workspace = true } file_system = { workspace = true } kvproto = { workspace = true } lazy_static = "1.4.0" @@ -22,3 +23,6 @@ slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } + +[dev-dependencies] +tempfile = "3.0" diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 587f31bae93..7be02e8b6e2 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -7,11 +7,12 @@ use std::{ sync::Arc, }; +use codec::number::NumberCodec; use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, - RaftLogGcTask, Result, + RaftLogGcTask, Result, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::{IoOp, IoRateLimiter, IoType}; use kvproto::{ @@ -290,6 +291,36 @@ impl FileSystem for ManagedFileSystem { } } +/// Convert a cf to id for encoding. +fn cf_to_id(cf: &str) -> u8 { + match cf { + CF_DEFAULT => 0, + CF_LOCK => 1, + CF_WRITE => 2, + CF_RAFT => 3, + _ => panic!("unrecognized cf {}", cf), + } +} + +/// Encode a key in the format `{prefix}{num}`. 
+fn encode_key(prefix: &'static [u8], num: u64) -> [u8; 9] { + debug_assert_eq!(prefix.len(), 1); + let mut buf = [0; 9]; + buf[..prefix.len()].copy_from_slice(prefix); + NumberCodec::encode_u64(&mut buf[prefix.len()..], num); + buf +} + +/// Encode a flush key in the format `{flush key prefix}{cf_id}{tablet_index}`. +fn encode_flushed_key(cf: &str, tablet_index: u64) -> [u8; 10] { + debug_assert_eq!(FLUSH_STATE_KEY.len(), 1); + let mut buf = [0; 10]; + buf[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + buf[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + NumberCodec::encode_u64(&mut buf[FLUSH_STATE_KEY.len() + 1..], tablet_index); + buf +} + #[derive(Clone)] pub struct RaftLogEngine(Arc>); @@ -348,6 +379,7 @@ const PREPARE_BOOTSTRAP_REGION_KEY: &[u8] = &[0x02]; const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; +const FLUSH_STATE_KEY: &[u8] = &[0x06]; impl RaftLogBatchTrait for RaftLogBatch { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { @@ -401,18 +433,44 @@ impl RaftLogBatchTrait for RaftLogBatch { Ok(()) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + let key = encode_key(REGION_STATE_KEY, apply_index); self.0 - .put_message(raft_group_id, REGION_STATE_KEY.to_vec(), state) + .put_message(raft_group_id, key.to_vec(), state) .map_err(transfer_error) } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + let key = encode_key(APPLY_STATE_KEY, apply_index); self.0 - .put_message(raft_group_id, APPLY_STATE_KEY.to_vec(), state) + .put_message(raft_group_id, key.to_vec(), state) .map_err(transfer_error) } + fn put_flushed_index( + 
&mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { + let key = encode_flushed_key(cf, tablet_index); + let mut value = vec![0; 8]; + NumberCodec::encode_u64(&mut value, apply_index); + self.0.put(raft_group_id, key.to_vec(), value); + Ok(()) + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.0 .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) @@ -471,16 +529,72 @@ impl RaftEngineReadOnly for RaftLogEngine { .map_err(transfer_error) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; self.0 - .get_message(raft_group_id, REGION_STATE_KEY) - .map_err(transfer_error) + .scan_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(APPLY_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; self.0 - .get_message(raft_group_id, APPLY_STATE_KEY) - .map_err(transfer_error) + .scan_messages( + raft_group_id, + Some(APPLY_STATE_KEY), + Some(RECOVER_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) + } + + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { + let mut start = [0; 2]; + start[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + start[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + let mut end = start; + end[FLUSH_STATE_KEY.len()] += 1; + let mut index = None; + self.0 + 
.scan_raw_messages(raft_group_id, Some(&start), Some(&end), true, |_, v| { + index = Some(NumberCodec::decode_u64(v)); + false + }) + .map_err(transfer_error)?; + Ok(index) } fn get_recover_state(&self) -> Result> { @@ -624,3 +738,67 @@ fn transfer_error(e: RaftEngineError) -> engine_traits::Error { } } } + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use engine_traits::ALL_CFS; + + use super::*; + + #[test] + fn test_apply_related_states() { + let dir = tempfile::tempdir().unwrap(); + let cfg = RaftEngineConfig { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = RaftLogEngine::new(cfg, None, None).unwrap(); + assert_matches!(engine.get_region_state(2, u64::MAX), Ok(None)); + assert_matches!(engine.get_apply_state(2, u64::MAX), Ok(None)); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut wb = engine.log_batch(10); + let mut region_state = RegionLocalState::default(); + region_state.mut_region().set_id(3); + wb.put_region_state(2, 1, ®ion_state).unwrap(); + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(3); + wb.put_apply_state(2, 3, &apply_state).unwrap(); + for cf in ALL_CFS.iter().take(2) { + wb.put_flushed_index(2, cf, 5, 4).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + for cf in ALL_CFS.iter().take(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(4))); + } + for cf in ALL_CFS.iter().skip(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut region_state2 = region_state.clone(); + region_state2.mut_region().set_id(5); + wb.put_region_state(2, 4, ®ion_state2).unwrap(); + let mut apply_state2 = apply_state.clone(); + apply_state2.set_applied_index(5); + wb.put_apply_state(2, 5, &apply_state2).unwrap(); + for cf in ALL_CFS { + wb.put_flushed_index(2, cf, 6, 5).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + 
assert_matches!(engine.get_region_state(2, 0), Ok(None)); + assert_matches!(engine.get_region_state(2, 1), Ok(Some(s)) if s == region_state); + assert_matches!(engine.get_region_state(2, 4), Ok(Some(s)) if s == region_state2); + assert_matches!(engine.get_apply_state(2, 0), Ok(None)); + assert_matches!(engine.get_apply_state(2, 3), Ok(Some(s)) if s == apply_state); + assert_matches!(engine.get_apply_state(2, 5), Ok(Some(s)) if s == apply_state2); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(5))); + } + } +} diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 8eda4e5ae24..25899ddf2bb 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -16,6 +16,7 @@ //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] +#![feature(assert_matches)] #[macro_use] extern crate tikv_util; diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 60889908aa0..3e459340b0e 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -175,7 +175,7 @@ impl Store { return; } let from_epoch = msg.get_region_epoch(); - let local_state = match ctx.engine.get_region_state(region_id) { + let local_state = match ctx.engine.get_region_state(region_id, 0) { Ok(s) => s, Err(e) => { error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 49a0f547e1a..f3678767693 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -29,7 +29,7 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul let mut state = RegionLocalState::default(); state.set_region(region); state.set_tablet_index(RAFT_INIT_LOG_INDEX); - 
wb.put_region_state(region_id, &state)?; + wb.put_region_state(region_id, 0, &state)?; let mut apply_state = RaftApplyState::default(); apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); @@ -39,7 +39,7 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul apply_state .mut_truncated_state() .set_term(RAFT_INIT_LOG_TERM); - wb.put_apply_state(region_id, &apply_state)?; + wb.put_apply_state(region_id, 0, &apply_state)?; let mut raft_state = RaftLocalState::default(); raft_state.set_last_index(RAFT_INIT_LOG_INDEX); @@ -158,7 +158,7 @@ impl Storage { read_scheduler: Scheduler>, logger: &Logger, ) -> Result>> { - let region_state = match engine.get_region_state(region_id) { + let region_state = match engine.get_region_state(region_id, 0) { Ok(Some(s)) => s, res => { return Err(box_err!( @@ -180,7 +180,7 @@ impl Storage { } }; - let apply_state = match engine.get_apply_state(region_id) { + let apply_state = match engine.get_apply_state(region_id, 0) { Ok(Some(s)) => s, res => { return Err(box_err!("failed to get apply state: {:?}", res)); @@ -439,7 +439,7 @@ mod tests { assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); - let local_state = raft_engine.get_region_state(4).unwrap().unwrap(); + let local_state = raft_engine.get_region_state(4, 0).unwrap().unwrap(); assert_eq!(local_state.get_state(), PeerState::Normal); assert_eq!(*local_state.get_region(), region); assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); @@ -450,7 +450,7 @@ mod tests { assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); - let apply_state = raft_engine.get_apply_state(4).unwrap().unwrap(); + let apply_state = raft_engine.get_apply_state(4, 0).unwrap().unwrap(); assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); let ts = apply_state.get_truncated_state(); assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); diff --git a/components/raftstore-v2/tests/integrations/test_life.rs 
b/components/raftstore-v2/tests/integrations/test_life.rs index 805cda15471..5f44b2d5813 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -49,8 +49,8 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); assert!(buf.is_empty(), "{:?}", buf); assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); - assert_matches!(raft_engine.get_apply_state(region_id), Ok(None)); - let region_state = raft_engine.get_region_state(region_id).unwrap().unwrap(); + assert_matches!(raft_engine.get_apply_state(region_id, 0), Ok(None)); + let region_state = raft_engine.get_region_state(region_id, 0).unwrap().unwrap(); assert_matches!(region_state.get_state(), PeerState::Tombstone); assert!( region_state.get_region().get_peers().contains(peer), @@ -121,7 +121,7 @@ fn test_life_by_message() { let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; raft_engine.get_raft_state(test_region_id).unwrap().unwrap(); raft_engine - .get_apply_state(test_region_id) + .get_apply_state(test_region_id, 0) .unwrap() .unwrap(); diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index d17223e5acf..14fbd192d0d 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -416,10 +416,11 @@ where ) .unwrap(); } - wb.put_region_state(region_id, ®ion_state).unwrap(); + wb.put_region_state(region_id, 0, ®ion_state).unwrap(); } if !tombstone { - wb.put_apply_state(region_id, &state.apply_state).unwrap(); + wb.put_apply_state(region_id, 0, &state.apply_state) + .unwrap(); } } } diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 727502b6ca4..625e9f3c4a5 100644 --- 
a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -431,14 +431,14 @@ fn test_worker_split_raft_wb() { ], ); assert_eq!( - engines.raft.get_apply_state(region_1).unwrap(), + engines.raft.get_apply_state(region_1, 0).unwrap(), Some(RaftApplyState { applied_index: 25, ..Default::default() }) ); assert_eq!( - engines.raft.get_apply_state(region_2).unwrap(), + engines.raft.get_apply_state(region_2, 0).unwrap(), Some(RaftApplyState { applied_index: 16, ..Default::default() @@ -634,18 +634,18 @@ fn test_basic_flow_with_states() { ], ); assert_eq!( - engines.raft.get_apply_state(region_1).unwrap().unwrap(), + engines.raft.get_apply_state(region_1, 0).unwrap().unwrap(), apply_state_3 ); assert_eq!( - engines.raft.get_apply_state(region_2).unwrap().unwrap(), + engines.raft.get_apply_state(region_2, 0).unwrap().unwrap(), apply_state_2 ); assert_eq!( - engines.raft.get_region_state(region_1).unwrap().unwrap(), + engines.raft.get_region_state(region_1, 0).unwrap().unwrap(), region_state_1 ); - assert_eq!(engines.raft.get_region_state(region_2).unwrap(), None); + assert_eq!(engines.raft.get_region_state(region_2, 0).unwrap(), None); must_have_same_count_msg(6, &t.msg_rx); From cfdb31fe3679687a698490b1e783570672a0238d Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 14 Dec 2022 13:30:52 +0800 Subject: [PATCH 406/676] async-io: make v2 a log batch (#13935) ref tikv/tikv#12842 We now map states with apply index, so there is nothing to merge. Use a log batch for better flexibility to introduce more extra writes. 
Signed-off-by: Jay Lee --- components/raftstore-v2/src/operation/life.rs | 23 ++- .../raftstore-v2/src/operation/ready/mod.rs | 16 +- components/raftstore-v2/src/raft/peer.rs | 5 + components/raftstore-v2/src/raft/storage.rs | 6 +- .../tests/integrations/test_life.rs | 7 +- .../raftstore/src/store/async_io/write.rs | 157 +++++------------- .../src/store/async_io/write_tests.rs | 58 +++---- components/raftstore/src/store/mod.rs | 5 +- 8 files changed, 111 insertions(+), 166 deletions(-) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 3e459340b0e..d9f706c32a1 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -14,12 +14,12 @@ use std::cmp; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; -use raftstore::store::{util, ExtraStates, WriteTask}; +use raftstore::store::{util, WriteTask}; use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; @@ -175,7 +175,7 @@ impl Store { return; } let from_epoch = msg.get_region_epoch(); - let local_state = match ctx.engine.get_region_state(region_id, 0) { + let local_state = match ctx.engine.get_region_state(region_id, u64::MAX) { Ok(s) => s, Err(e) => { error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); @@ -304,13 +304,20 @@ impl Peer { Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), }; } - let mut extra_states = ExtraStates::new(entry_storage.apply_state().clone()); + let raft_engine = self.entry_storage().raft_engine(); let mut region_state = self.storage().region_state().clone(); + let region_id = region_state.get_region().get_id(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(2)); + // 
We only use raft-log-engine for v2, first index is not important. + let raft_state = self.entry_storage().raft_state(); + raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); // Write worker will do the clean up when meeting tombstone state. region_state.set_state(PeerState::Tombstone); - extra_states.set_region_state(region_state); - extra_states.set_raft_state(entry_storage.raft_state().clone()); - write_task.extra_write.set_v2(extra_states); + let applied_index = self.entry_storage().applied_index(); + lb.put_region_state(region_id, applied_index, ®ion_state) + .unwrap(); self.destroy_progress_mut().start(); } @@ -325,6 +332,6 @@ impl Peer { // new peer. Ignore error as it's just a best effort. let _ = ctx.router.send_raft_message(msg); } - // TODO: close apply mailbox. + self.clear_apply_scheduler(); } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index e9046af2831..47f6523cc82 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -22,12 +22,12 @@ mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use error_code::ErrorCodeExt; use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; -use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; +use raftstore::store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; @@ -555,9 +555,15 @@ impl Storage { write_task.raft_state = Some(entry_storage.raft_state().clone()); } if !ever_persisted { - let mut extra_states = ExtraStates::new(self.apply_state().clone()); - extra_states.set_region_state(self.region_state().clone()); - 
write_task.extra_write.set_v2(extra_states); + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(3)); + lb.put_apply_state(region_id, 0, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, 0, self.region_state()) + .unwrap(); self.set_ever_persisted(); } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index f211313e1b5..21795eb3293 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -508,6 +508,11 @@ impl Peer { self.apply_scheduler = Some(apply_scheduler); } + #[inline] + pub fn clear_apply_scheduler(&mut self) { + self.apply_scheduler.take(); + } + /// Whether the snapshot is handling. /// See the comments of `check_snap_status` for more details. #[inline] diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index f3678767693..889674c514c 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -158,7 +158,7 @@ impl Storage { read_scheduler: Scheduler>, logger: &Logger, ) -> Result>> { - let region_state = match engine.get_region_state(region_id, 0) { + let region_state = match engine.get_region_state(region_id, u64::MAX) { Ok(Some(s)) => s, res => { return Err(box_err!( @@ -180,7 +180,7 @@ impl Storage { } }; - let apply_state = match engine.get_apply_state(region_id, 0) { + let apply_state = match engine.get_apply_state(region_id, u64::MAX) { Ok(Some(s)) => s, res => { return Err(box_err!("failed to get apply state: {:?}", res)); @@ -450,7 +450,7 @@ mod tests { assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); - let apply_state = raft_engine.get_apply_state(4, 0).unwrap().unwrap(); + let apply_state = raft_engine.get_apply_state(4, u64::MAX).unwrap().unwrap(); 
assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); let ts = apply_state.get_truncated_state(); assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index 5f44b2d5813..a2ae0bbb9f8 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -49,8 +49,11 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); assert!(buf.is_empty(), "{:?}", buf); assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); - assert_matches!(raft_engine.get_apply_state(region_id, 0), Ok(None)); - let region_state = raft_engine.get_region_state(region_id, 0).unwrap().unwrap(); + assert_matches!(raft_engine.get_apply_state(region_id, u64::MAX), Ok(None)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); assert_matches!(region_state.get_state(), PeerState::Tombstone); assert!( region_state.get_region().get_peers().contains(peer), diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 14fbd192d0d..b69b3484e0c 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -20,9 +20,7 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use kvproto::raft_serverpb::{ - PeerState, RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState, -}; +use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{ @@ -39,7 +37,6 @@ use super::write_router::WriteSenders; use crate::{ store::{ config::Config, - entry_storage::first_index, fsm::RaftRouter, local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, @@ 
-89,22 +86,24 @@ where /// /// For now, applying snapshot needs to persist some extra states. For v1, /// these states are written to KvEngine. For v2, they are written to -/// RaftEngine. +/// RaftEngine. Although in v2 these states are also written to raft engine, +/// but we have to use `ExtraState` as they should be written as the last +/// updates. // TODO: perhaps we should always pass states instead of a write batch even // for v1. -pub enum ExtraWrite { +pub enum ExtraWrite { None, V1(W), - V2(ExtraStates), + V2(L), } -impl ExtraWrite { +impl ExtraWrite { #[inline] pub fn is_empty(&self) -> bool { match self { ExtraWrite::None => true, ExtraWrite::V1(w) => w.is_empty(), - _ => false, + ExtraWrite::V2(l) => l.is_empty(), } } @@ -113,7 +112,7 @@ impl ExtraWrite { match self { ExtraWrite::None => 0, ExtraWrite::V1(w) => w.data_size(), - ExtraWrite::V2(m) => mem::size_of_val(m), + ExtraWrite::V2(l) => l.persist_size(), } } @@ -140,18 +139,22 @@ impl ExtraWrite { } #[inline] - pub fn set_v2(&mut self, extra_states: ExtraStates) { - if let ExtraWrite::V1(_) = self { + pub fn ensure_v2(&mut self, log_batch: impl FnOnce() -> L) -> &mut L { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch()); + } else if let ExtraWrite::V1(_) = self { unreachable!("v1 and v2 are mixed used"); - } else { - *self = ExtraWrite::V2(extra_states); + } + match self { + ExtraWrite::V2(l) => l, + _ => unreachable!(), } } #[inline] - pub fn v2_mut(&mut self) -> Option<&mut ExtraStates> { - if let ExtraWrite::V2(m) = self { - Some(m) + pub fn v2_mut(&mut self) -> Option<&mut L> { + if let ExtraWrite::V2(l) = self { + Some(l) } else { None } @@ -175,7 +178,7 @@ where pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, - pub extra_write: ExtraWrite, + pub extra_write: ExtraWrite, pub messages: Vec, pub trackers: Vec, pub has_snapshot: bool, @@ -264,57 +267,27 @@ where } } -/// These states are set only in raftstore V2. 
-#[derive(Default)] -pub struct ExtraStates { - apply_state: RaftApplyState, - region_state: Option, - // Set only want to destroy the raft group in write worker. - raft_state: Option, -} - -impl ExtraStates { - #[inline] - pub fn new(apply_state: RaftApplyState) -> Self { - Self { - apply_state, - region_state: None, - raft_state: None, - } - } - - #[inline] - pub fn set_region_state(&mut self, region_state: RegionLocalState) { - self.region_state = Some(region_state); - } - - #[inline] - pub fn set_raft_state(&mut self, raft_state: RaftLocalState) { - self.raft_state = Some(raft_state); - } -} - -pub enum ExtraBatchWrite { +pub enum ExtraBatchWrite { None, V1(W), - V2(HashMap), + V2(L), } -impl ExtraBatchWrite { +impl ExtraBatchWrite { #[inline] fn clear(&mut self) { match self { ExtraBatchWrite::None => {} ExtraBatchWrite::V1(w) => w.clear(), - ExtraBatchWrite::V2(m) => m.clear(), + // No clear in in `RaftLogBatch`. + ExtraBatchWrite::V2(_) => *self = ExtraBatchWrite::None, } } /// Merge the extra_write with this batch. /// /// If there is any new states inserted, return the size of the state. 
- fn merge(&mut self, region_id: u64, extra_write: &mut ExtraWrite) -> usize { - let mut inserted = false; + fn merge(&mut self, extra_write: &mut ExtraWrite) { match mem::replace(extra_write, ExtraWrite::None) { ExtraWrite::None => (), ExtraWrite::V1(wb) => match self { @@ -322,35 +295,11 @@ impl ExtraBatchWrite { ExtraBatchWrite::V1(kv_wb) => kv_wb.merge(wb).unwrap(), ExtraBatchWrite::V2(_) => unreachable!("v2 and v1 are mixed used"), }, - ExtraWrite::V2(extra_states) => match self { - ExtraBatchWrite::None => { - let mut map = HashMap::default(); - map.insert(region_id, extra_states); - *self = ExtraBatchWrite::V2(map); - inserted = true; - } + ExtraWrite::V2(lb) => match self { + ExtraBatchWrite::None => *self = ExtraBatchWrite::V2(lb), ExtraBatchWrite::V1(_) => unreachable!("v2 and v1 are mixed used"), - ExtraBatchWrite::V2(extra_states_map) => match extra_states_map.entry(region_id) { - collections::HashMapEntry::Occupied(mut slot) => { - slot.get_mut().apply_state = extra_states.apply_state; - if let Some(region_state) = extra_states.region_state { - slot.get_mut().region_state = Some(region_state); - } - if let Some(raft_state) = extra_states.raft_state { - slot.get_mut().raft_state = Some(raft_state); - } - } - collections::HashMapEntry::Vacant(slot) => { - slot.insert(extra_states); - inserted = true; - } - }, + ExtraBatchWrite::V2(raft_wb) => raft_wb.merge(lb).unwrap(), }, - }; - if inserted { - std::mem::size_of::() - } else { - 0 } } } @@ -368,7 +317,7 @@ where // These states only corresponds to entries inside `raft_wbs.last()`. States for other write // batches must be inlined early. 
pub raft_states: HashMap, - pub extra_batch_write: ExtraBatchWrite, + pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, pub persisted_cbs: Vec>, @@ -396,35 +345,16 @@ where } #[inline] - fn flush_states_to_raft_wb(&mut self, raft_engine: &ER) { + fn flush_states_to_raft_wb(&mut self) { let wb = self.raft_wbs.last_mut().unwrap(); for (region_id, state) in self.raft_states.drain() { wb.put_raft_state(region_id, &state).unwrap(); } - if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { - for (region_id, state) in extra_states_map.drain() { - let mut tombstone = false; - if let Some(region_state) = state.region_state { - if region_state.get_state() == PeerState::Tombstone { - tombstone = true; - raft_engine - .clean( - region_id, - first_index(&state.apply_state), - state.raft_state.as_ref().unwrap(), - wb, - ) - .unwrap(); - } - wb.put_region_state(region_id, 0, ®ion_state).unwrap(); - } - if !tombstone { - wb.put_apply_state(region_id, 0, &state.apply_state) - .unwrap(); - } - } - } self.state_size = 0; + if let ExtraBatchWrite::V2(_) = self.extra_batch_write { + let ExtraBatchWrite::V2(lb) = mem::replace(&mut self.extra_batch_write, ExtraBatchWrite::None) else { unreachable!() }; + wb.merge(lb).unwrap(); + } } /// Add write task to this batch @@ -436,7 +366,7 @@ where if self.raft_wb_split_size > 0 && self.raft_wbs.last().unwrap().persist_size() >= self.raft_wb_split_size { - self.flush_states_to_raft_wb(raft_engine); + self.flush_states_to_raft_wb(); self.raft_wbs .push(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); } @@ -456,9 +386,7 @@ where && self.raft_states.insert(task.region_id, raft_state).is_none() { self.state_size += std::mem::size_of::(); } - self.state_size += self - .extra_batch_write - .merge(task.region_id, &mut task.extra_write); + self.extra_batch_write.merge(&mut task.extra_write); if let Some(prev_readies) = self .readies @@ -511,8 +439,8 @@ where .sum::() } - fn before_write_to_db(&mut self, 
engine: &ER, metrics: &StoreWriteMetrics) { - self.flush_states_to_raft_wb(engine); + fn before_write_to_db(&mut self, metrics: &StoreWriteMetrics) { + self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { let now = std::time::Instant::now(); for task in &self.tasks { @@ -705,8 +633,7 @@ where let timer = Instant::now(); - self.batch - .before_write_to_db(&self.raft_engine, &self.metrics); + self.batch.before_write_to_db(&self.metrics); fail_point!("raft_before_save"); @@ -968,7 +895,7 @@ pub fn write_to_db_for_test( { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(&engines.raft, task); - batch.before_write_to_db(&engines.raft, &StoreWriteMetrics::new(false)); + batch.before_write_to_db(&StoreWriteMetrics::new(false)); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 625e9f3c4a5..6007b39489e 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -6,7 +6,7 @@ use collections::HashSet; use crossbeam::channel::unbounded; use engine_test::{kv::KvTestEngine, new_temp_engine, raft::RaftTestEngine}; use engine_traits::{Engines, Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; -use kvproto::raft_serverpb::RaftMessage; +use kvproto::raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}; use tempfile::Builder; use super::*; @@ -350,11 +350,11 @@ fn test_worker_split_raft_wb() { let mut expected_wbs = 1; let mut task_1 = WriteTask::::new(region_1, 1, 10); - init_write_batch(&engines, &mut task_1); - task_1.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { - applied_index: 10, - ..Default::default() - })); + task_1.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_1 = 
RaftApplyState::default(); + apply_state_1.set_applied_index(10); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 10, &apply_state_1).unwrap(); put_raft_kv(task_1.raft_wb.as_mut(), raft_key_1); task_1.entries.append(&mut vec![ new_entry(5, 5), @@ -366,11 +366,11 @@ fn test_worker_split_raft_wb() { t.worker.batch.add_write_task(&engines.raft, task_1); let mut task_2 = WriteTask::::new(region_2, 2, 15); - init_write_batch(&engines, &mut task_2); - task_2.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { - applied_index: 16, - ..Default::default() - })); + task_2.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_2 = RaftApplyState::default(); + apply_state_2.set_applied_index(16); + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_2, 16, &apply_state_2).unwrap(); put_raft_kv(task_2.raft_wb.as_mut(), raft_key_2); task_2 .entries @@ -385,11 +385,11 @@ fn test_worker_split_raft_wb() { t.worker.batch.add_write_task(&engines.raft, task_2); let mut task_3 = WriteTask::::new(region_1, 1, 11); - init_write_batch(&engines, &mut task_3); - task_3.extra_write = ExtraWrite::V2(ExtraStates::new(RaftApplyState { - applied_index: 25, - ..Default::default() - })); + task_3.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_3 = RaftApplyState::default(); + apply_state_3.set_applied_index(25); + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); task_3 @@ -431,14 +431,14 @@ fn test_worker_split_raft_wb() { ], ); assert_eq!( - engines.raft.get_apply_state(region_1, 0).unwrap(), + engines.raft.get_apply_state(region_1, 25).unwrap(), Some(RaftApplyState { applied_index: 25, ..Default::default() }) ); assert_eq!( - engines.raft.get_apply_state(region_2, 
0).unwrap(), + engines.raft.get_apply_state(region_2, 16).unwrap(), Some(RaftApplyState { applied_index: 16, ..Default::default() @@ -559,14 +559,14 @@ fn test_basic_flow_with_states() { task_1.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_1 = RaftApplyState::default(); apply_state_1.applied_index = 2; - let mut extra_state = ExtraStates::new(apply_state_1); let mut region_state_1 = RegionLocalState::default(); region_state_1 .mut_region() .mut_region_epoch() .set_version(3); - extra_state.region_state = Some(region_state_1.clone()); - task_1.extra_write.set_v2(extra_state); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 2, &apply_state_1).unwrap(); + lb.put_region_state(region_1, 2, ®ion_state_1).unwrap(); put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1 .entries @@ -582,8 +582,8 @@ fn test_basic_flow_with_states() { task_2.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_2 = RaftApplyState::default(); apply_state_2.applied_index = 30; - let extra_state = ExtraStates::new(apply_state_2.clone()); - task_2.extra_write.set_v2(extra_state); + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(2, 30, &apply_state_2).unwrap(); put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries @@ -599,8 +599,8 @@ fn test_basic_flow_with_states() { task_3.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_3 = RaftApplyState::default(); apply_state_3.applied_index = 5; - let extra_state = ExtraStates::new(apply_state_3.clone()); - task_3.extra_write.set_v2(extra_state); + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 5, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); task_3.entries.append(&mut vec![new_entry(6, 6)]); @@ -634,18 +634,18 @@ fn test_basic_flow_with_states() { ], ); assert_eq!( - 
engines.raft.get_apply_state(region_1, 0).unwrap().unwrap(), + engines.raft.get_apply_state(region_1, 5).unwrap().unwrap(), apply_state_3 ); assert_eq!( - engines.raft.get_apply_state(region_2, 0).unwrap().unwrap(), + engines.raft.get_apply_state(region_2, 30).unwrap().unwrap(), apply_state_2 ); assert_eq!( - engines.raft.get_region_state(region_1, 0).unwrap().unwrap(), + engines.raft.get_region_state(region_1, 2).unwrap().unwrap(), region_state_1 ); - assert_eq!(engines.raft.get_region_state(region_2, 0).unwrap(), None); + assert_eq!(engines.raft.get_region_state(region_2, 1).unwrap(), None); must_have_same_count_msg(6, &t.msg_rx); diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 65417732adf..0846e8362b3 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -31,10 +31,7 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, - write::{ - ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, - WriteTask, - }, + write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ From 98e8bfb35e6cf87e51916ba229fc269b5c984187 Mon Sep 17 00:00:00 2001 From: lijie Date: Wed, 14 Dec 2022 14:41:04 +0800 Subject: [PATCH 407/676] chore: bump version to v6.6.0-alpha (#13938) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 432d2ce3c26..244ca504858 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6226,7 +6226,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.5.0-alpha" +version = "6.6.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 61d6da6946d..66516206dd0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name 
= "tikv" -version = "6.5.0-alpha" +version = "6.6.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From e591a41bfe29899c42bbf3e8856ea6dd50deb548 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Fri, 16 Dec 2022 10:22:52 +0800 Subject: [PATCH 408/676] storage: Update lock wait info after acquiring locks (#13902) ref tikv/tikv#13298 Update lock wait info after acquiring locks, so that in case there is resumable pessimistic lock requests that's waiting in TiKV, the latest waiting relationship can be used to provide the diagnostic information and do deadlock detection. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- src/server/lock_manager/waiter_manager.rs | 12 +- .../lock_manager/lock_waiting_queue.rs | 72 ++++++++++- src/storage/lock_manager/mod.rs | 1 - src/storage/mvcc/txn.rs | 31 ++++- .../txn/actions/acquire_pessimistic_lock.rs | 4 +- src/storage/txn/actions/check_txn_status.rs | 2 +- .../txn/actions/flashback_to_version.rs | 1 + src/storage/txn/actions/prewrite.rs | 13 +- .../txn/commands/acquire_pessimistic_lock.rs | 2 + .../acquire_pessimistic_lock_resumed.rs | 8 +- src/storage/txn/commands/atomic_store.rs | 1 + .../txn/commands/check_secondary_locks.rs | 2 + src/storage/txn/commands/check_txn_status.rs | 2 + src/storage/txn/commands/cleanup.rs | 2 + src/storage/txn/commands/commit.rs | 2 + src/storage/txn/commands/compare_and_swap.rs | 1 + .../txn/commands/flashback_to_version.rs | 1 + src/storage/txn/commands/mod.rs | 1 + src/storage/txn/commands/pause.rs | 1 + .../txn/commands/pessimistic_rollback.rs | 2 + src/storage/txn/commands/prewrite.rs | 8 +- src/storage/txn/commands/resolve_lock.rs | 2 + src/storage/txn/commands/resolve_lock_lite.rs | 2 + src/storage/txn/commands/rollback.rs | 2 + src/storage/txn/commands/txn_heart_beat.rs | 4 +- src/storage/txn/scheduler.rs | 49 +++++++- 
tests/integrations/server/lock_manager.rs | 113 +++++++++++++++++- 27 files changed, 315 insertions(+), 26 deletions(-) diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 467580645d3..d8271998653 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -340,7 +340,10 @@ impl WaitTable { Some(waiter) } - fn update_waiter(&mut self, update_event: &UpdateWaitForEvent) -> Option { + fn update_waiter( + &mut self, + update_event: &UpdateWaitForEvent, + ) -> Option<(KeyLockWaitInfo, DiagnosticContext)> { let waiter = self.waiter_pool.get_mut(&update_event.token)?; assert_eq!(waiter.wait_info.key, update_event.wait_info.key); @@ -351,9 +354,8 @@ impl WaitTable { } let result = std::mem::replace(&mut waiter.wait_info, update_event.wait_info.clone()); - waiter.diag_ctx = update_event.diag_ctx.clone(); - Some(result) + Some((result, waiter.diag_ctx.clone())) } fn take_waiter_by_lock_digest( @@ -542,11 +544,11 @@ impl WaiterManager { continue; } - if let Some(previous_wait_info) = previous_wait_info { + if let Some((previous_wait_info, diag_ctx)) = previous_wait_info { self.detector_scheduler .clean_up_wait_for(event.start_ts, previous_wait_info); self.detector_scheduler - .detect(event.start_ts, event.wait_info, event.diag_ctx); + .detect(event.start_ts, event.wait_info, diag_ctx); } } } diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 663c6729962..a81248fe9e2 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -76,7 +76,7 @@ use txn_types::{Key, TimeStamp}; use crate::storage::{ lock_manager::{ lock_wait_context::{LockWaitContextSharedState, PessimisticLockKeyCallback}, - LockManager, LockWaitToken, + KeyLockWaitInfo, LockDigest, LockManager, LockWaitToken, UpdateWaitForEvent, }, metrics::*, mvcc::{Error as MvccError, ErrorInner as 
MvccErrorInner}, @@ -599,6 +599,36 @@ impl LockWaitQueues { result } + pub fn update_lock_wait(&self, lock_info: Vec) { + let mut update_wait_for_events = vec![]; + for lock_info in lock_info { + let key = Key::from_raw(lock_info.get_key()); + if let Some(mut key_state) = self.inner.queue_map.get_mut(&key) { + key_state.current_lock = lock_info; + update_wait_for_events.reserve(key_state.queue.len()); + for (&token, entry) in key_state.queue.iter() { + let event = UpdateWaitForEvent { + token, + start_ts: entry.parameters.start_ts, + is_first_lock: entry.parameters.is_first_lock, + wait_info: KeyLockWaitInfo { + key: key.clone(), + lock_digest: LockDigest { + ts: key_state.current_lock.lock_version.into(), + hash: entry.lock_hash, + }, + lock_info: key_state.current_lock.clone(), + }, + }; + update_wait_for_events.push(event); + } + } + } + if !update_wait_for_events.is_empty() { + self.inner.lock_mgr.update_wait_for(update_wait_for_events); + } + } + /// Gets the count of entries currently waiting in queues. /// /// Mind that the contents of the queues may be changed concurrently. 
@@ -1205,4 +1235,44 @@ mod tests { queues.must_not_contain_key(b"k1"); assert_eq!(queues.entry_count(), 0); } + + #[bench] + fn bench_update_lock_wait_empty(b: &mut test::Bencher) { + let queues = LockWaitQueues::new(MockLockManager::new()); + queues.mock_lock_wait(b"k1", 5, 6, false); + + let mut lock_info = kvrpcpb::LockInfo::default(); + let key = b"t\x00\x00\x00\x00\x00\x00\x00\x01_r\x00\x00\x00\x00\x00\x00\x00\x01"; + lock_info.set_key(key.to_vec()); + lock_info.set_primary_lock(key.to_vec()); + lock_info.set_lock_version(10); + lock_info.set_lock_for_update_ts(10); + let lock_info = vec![lock_info]; + + b.iter(|| { + queues.update_lock_wait(lock_info.clone()); + }); + } + + #[bench] + fn bench_update_lock_wait_queue_len_512(b: &mut test::Bencher) { + let queues = LockWaitQueues::new(MockLockManager::new()); + + let key = b"t\x00\x00\x00\x00\x00\x00\x00\x01_r\x00\x00\x00\x00\x00\x00\x00\x01"; + + for i in 0..512 { + queues.mock_lock_wait(key, 15 + i, 10, true); + } + + let mut lock_info = kvrpcpb::LockInfo::default(); + lock_info.set_key(key.to_vec()); + lock_info.set_primary_lock(key.to_vec()); + lock_info.set_lock_version(10); + lock_info.set_lock_for_update_ts(10); + let lock_info = vec![lock_info]; + + b.iter(|| { + queues.update_lock_wait(lock_info.clone()); + }); + } } diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 75b133a808f..5c103f40f82 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -115,7 +115,6 @@ pub struct UpdateWaitForEvent { pub start_ts: TimeStamp, pub is_first_lock: bool, pub wait_info: KeyLockWaitInfo, - pub diag_ctx: DiagnosticContext, } /// `LockManager` manages transactions waiting for locks held by other diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 4cc0ab57ffb..9e87bf748b7 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -5,6 +5,7 @@ use std::fmt; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use 
engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; +use kvproto::kvrpcpb::LockInfo; use txn_types::{Key, Lock, PessimisticLock, TimeStamp, Value}; use super::metrics::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}; @@ -64,6 +65,11 @@ pub struct MvccTxn { // `writes`, so it can be further processed. The elements are tuples representing // (key, lock, remove_pessimistic_lock) pub(crate) locks_for_1pc: Vec<(Key, Lock, bool)>, + // Collects the information of locks that are acquired in this MvccTxn. Locks that already + // exists but updated in this MvccTxn won't be collected. The collected information will be + // used to update the lock waiting information and redo deadlock detection, if there are some + // pessimistic lock requests waiting on the keys. + pub(crate) new_locks: Vec, // `concurrency_manager` is used to set memory locks for prewritten keys. // Prewritten locks of async commit transactions should be visible to // readers before they are written to the engine. @@ -84,7 +90,8 @@ impl MvccTxn { start_ts, write_size: 0, modifies: vec![], - locks_for_1pc: Vec::new(), + locks_for_1pc: vec![], + new_locks: vec![], concurrency_manager, guards: vec![], } @@ -99,6 +106,10 @@ impl MvccTxn { std::mem::take(&mut self.guards) } + pub fn take_new_locks(&mut self) -> Vec { + std::mem::take(&mut self.new_locks) + } + pub fn write_size(&self) -> usize { self.write_size } @@ -107,7 +118,12 @@ impl MvccTxn { self.modifies.len() == 0 && self.locks_for_1pc.len() == 0 } - pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock) { + // Write a lock. If the key doesn't have lock before, `is_new` should be set. 
+ pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock, is_new: bool) { + if is_new { + self.new_locks + .push(lock.clone().into_lock_info(key.to_raw().unwrap())); + } let write = Modify::Put(CF_LOCK, key, lock.to_bytes()); self.write_size += write.size(); self.modifies.push(write); @@ -117,7 +133,13 @@ impl MvccTxn { self.locks_for_1pc.push((key, lock, remove_pessimstic_lock)); } - pub(crate) fn put_pessimistic_lock(&mut self, key: Key, lock: PessimisticLock) { + // Write a pessimistic lock. If the key doesn't have lock before, `is_new` + // should be set. + pub(crate) fn put_pessimistic_lock(&mut self, key: Key, lock: PessimisticLock, is_new: bool) { + if is_new { + self.new_locks + .push(lock.to_lock().into_lock_info(key.to_raw().unwrap())); + } self.modifies.push(Modify::PessimisticLock(key, lock)) } @@ -198,12 +220,13 @@ impl MvccTxn { } lock.rollback_ts.push(self.start_ts); - self.put_lock(key.clone(), &lock); + self.put_lock(key.clone(), &lock, false); } pub(crate) fn clear(&mut self) { self.write_size = 0; self.modifies.clear(); + self.new_locks.clear(); self.locks_for_1pc.clear(); self.guards.clear(); } diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 8e7c4d95118..fcffd500c8e 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -171,7 +171,7 @@ pub fn acquire_pessimistic_lock( last_change_ts: lock.last_change_ts, versions_to_last_change: lock.versions_to_last_change, }; - txn.put_pessimistic_lock(key, lock); + txn.put_pessimistic_lock(key, lock, false); } else { MVCC_DUPLICATE_CMD_COUNTER_VEC .acquire_pessimistic_lock @@ -325,7 +325,7 @@ pub fn acquire_pessimistic_lock( // When lock_only_if_exists is false, always acquire pessimistic lock, otherwise // do it when val exists if !lock_only_if_exists || val.is_some() { - txn.put_pessimistic_lock(key, lock); + txn.put_pessimistic_lock(key, lock, true); } // 
TODO don't we need to commit the modifies in txn? diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 4c900e5a438..88982d6da72 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -70,7 +70,7 @@ pub fn check_txn_status_lock_exists( lock.min_commit_ts = current_ts; } - txn.put_lock(primary_key, &lock); + txn.put_lock(primary_key, &lock, false); MVCC_CHECK_TXN_STATUS_COUNTER_VEC.update_ts.inc(); } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 819cfd0631c..2710935efb1 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -181,6 +181,7 @@ pub fn prewrite_flashback_key( 1, TimeStamp::zero(), ), + false, // Assuming flashback transactions won't participate any lock conflicts. ); Ok(()) } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index f2de9df0004..1e655846d08 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -153,7 +153,9 @@ pub fn prewrite( OldValue::Unspecified }; - let final_min_commit_ts = mutation.write_lock(lock_status, txn)?; + let is_new_lock = !matches!(pessimistic_action, DoPessimisticCheck) || lock_amended; + + let final_min_commit_ts = mutation.write_lock(lock_status, txn, is_new_lock)?; fail_point!("after_prewrite_one_key"); @@ -448,7 +450,12 @@ impl<'a> PrewriteMutation<'a> { Ok(None) } - fn write_lock(self, lock_status: LockStatus, txn: &mut MvccTxn) -> Result { + fn write_lock( + self, + lock_status: LockStatus, + txn: &mut MvccTxn, + is_new_lock: bool, + ) -> Result { let mut try_one_pc = self.try_one_pc(); let mut lock = Lock::new( @@ -506,7 +513,7 @@ impl<'a> PrewriteMutation<'a> { if try_one_pc { txn.put_locks_for_1pc(self.key, lock, lock_status.has_pessimistic_lock()); } else { - txn.put_lock(self.key, &lock); + 
txn.put_lock(self.key, &lock, is_new_lock); } final_min_commit_ts diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 2afdadaad80..58c33706bbc 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -144,6 +144,7 @@ impl WriteCommand for AcquirePessimisticLock } } + let new_acquired_locks = txn.take_new_locks(); let modifies = txn.into_modifies(); let mut res = Ok(res); @@ -179,6 +180,7 @@ impl WriteCommand for AcquirePessimisticLock pr, lock_info: encountered_locks, released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, }) diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index a66f8228755..7640edd7c0c 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -82,6 +82,7 @@ impl WriteCommand for AcquirePessimisticLockR fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { fail_point!("acquire_pessimistic_lock_resumed_before_process_write"); let mut modifies = vec![]; + let mut new_acquired_locks = vec![]; let mut txn = None; let mut reader: Option> = None; @@ -107,10 +108,11 @@ impl WriteCommand for AcquirePessimisticLockR .as_ref() .map_or(true, |t: &MvccTxn| t.start_ts != params.start_ts) { - if let Some(prev_txn) = txn.replace(MvccTxn::new( + if let Some(mut prev_txn) = txn.replace(MvccTxn::new( params.start_ts, context.concurrency_manager.clone(), )) { + new_acquired_locks.extend(prev_txn.take_new_locks()); modifies.extend(prev_txn.into_modifies()); } // TODO: Is it possible to reuse the same reader but change the start_ts stored @@ -169,8 +171,9 @@ impl WriteCommand for AcquirePessimisticLockR }; } - if let Some(txn) = txn { + if let Some(mut txn) = 
txn { if !txn.is_empty() { + new_acquired_locks.extend(txn.take_new_locks()); modifies.extend(txn.into_modifies()); } } @@ -188,6 +191,7 @@ impl WriteCommand for AcquirePessimisticLockR pr, lock_info: encountered_locks, released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, }) diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 1df5c5b2cf8..9a54895e7e2 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -60,6 +60,7 @@ impl WriteCommand for RawAtomicStore { pr: ProcessResult::Res, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 4802535c054..a19a5d82bb6 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -146,6 +146,7 @@ impl WriteCommand for CheckSecondaryLocks { rows = 1; } let pr = ProcessResult::SecondaryLocksStatus { status: result }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -155,6 +156,7 @@ impl WriteCommand for CheckSecondaryLocks { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 34948109f4b..895c753b160 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -125,6 +125,7 @@ impl WriteCommand for CheckTxnStatus { released_locks.push(released); let pr = 
ProcessResult::TxnStatus { txn_status }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -134,6 +135,7 @@ impl WriteCommand for CheckTxnStatus { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index a6c529420d3..302c4fe1308 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -67,6 +67,7 @@ impl WriteCommand for Cleanup { true, )?); + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -76,6 +77,7 @@ impl WriteCommand for Cleanup { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 910b7832ed1..4f05df8fe83 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -67,6 +67,7 @@ impl WriteCommand for Commit { let pr = ProcessResult::TxnStatus { txn_status: TxnStatus::committed(self.commit_ts), }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -76,6 +77,7 @@ impl WriteCommand for Commit { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 943fc6f69d1..ca9213b57d3 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ 
b/src/storage/txn/commands/compare_and_swap.rs @@ -114,6 +114,7 @@ impl WriteCommand for RawCompareAndSwap { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards, response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 72b100f567b..da12bc8906c 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -159,6 +159,7 @@ impl WriteCommand for FlashbackToVersion { })(), lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 7eee81ae23e..2d79ebc97cc 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -417,6 +417,7 @@ pub struct WriteResult { pub pr: ProcessResult, pub lock_info: Vec, pub released_locks: ReleasedLocks, + pub new_acquired_locks: Vec, pub lock_guards: Vec, pub response_policy: ResponsePolicy, } diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 3dc7d06d5ef..5d3aa7f6d2f 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -50,6 +50,7 @@ impl WriteCommand for Pause { pr: ProcessResult::Res, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index c35c362f19e..b34399cb64a 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -83,6 +83,7 @@ impl WriteCommand for PessimisticRollback { released_locks.push(released_lock?); } + let new_acquired_locks = txn.take_new_locks(); let mut 
write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -92,6 +93,7 @@ impl WriteCommand for PessimisticRollback { pr: ProcessResult::MultiRes { results: vec![] }, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index b34c4eb752b..fbd4bf5984a 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -678,6 +678,7 @@ impl Prewriter { // If an error (KeyIsLocked or WriteConflict) occurs before, these lock guards // are dropped along with `txn` automatically. let lock_guards = txn.take_guards(); + let new_acquired_locks = txn.take_new_locks(); let mut to_be_write = WriteData::new(txn.into_modifies(), extra); to_be_write.set_disk_full_opt(self.ctx.get_disk_full_opt()); @@ -688,6 +689,7 @@ impl Prewriter { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, } @@ -707,6 +709,7 @@ impl Prewriter { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, } @@ -870,8 +873,9 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { /// Change all 1pc locks in txn to 2pc locks. 
pub(in crate::storage::txn) fn fallback_1pc_locks(txn: &mut MvccTxn) { - for (key, lock, _) in std::mem::take(&mut txn.locks_for_1pc) { - txn.put_lock(key, &lock); + for (key, lock, remove_pessimistic_lock) in std::mem::take(&mut txn.locks_for_1pc) { + let is_new_lock = !remove_pessimistic_lock; + txn.put_lock(key, &lock, is_new_lock); } } diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index 463275b2e1f..f3d141807e8 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -138,6 +138,7 @@ impl WriteCommand for ResolveLock { cmd: Command::ResolveLockReadPhase(next_cmd), } }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -147,6 +148,7 @@ impl WriteCommand for ResolveLock { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index d336d88a9ca..63fe201596d 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -63,6 +63,7 @@ impl WriteCommand for ResolveLockLite { }); } + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -72,6 +73,7 @@ impl WriteCommand for ResolveLockLite { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 52c05ae34c7..f3b674f4916 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -58,6 +58,7 @@ impl WriteCommand for 
Rollback { released_locks.push(released_lock); } + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -67,6 +68,7 @@ impl WriteCommand for Rollback { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index f965b863494..448395fc436 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -67,7 +67,7 @@ impl WriteCommand for TxnHeartBeat { Some(mut lock) if lock.ts == self.start_ts => { if lock.ttl < self.advise_ttl { lock.ttl = self.advise_ttl; - txn.put_lock(self.primary_key.clone(), &lock); + txn.put_lock(self.primary_key.clone(), &lock, false); } lock } @@ -83,6 +83,7 @@ impl WriteCommand for TxnHeartBeat { let pr = ProcessResult::TxnStatus { txn_status: TxnStatus::uncommitted(lock, false), }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -92,6 +93,7 @@ impl WriteCommand for TxnHeartBeat { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 13a74895803..d96e3e7c97f 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -798,6 +798,7 @@ impl Scheduler { lock_guards: Vec, pipelined: bool, async_apply_prewrite: bool, + new_acquired_locks: Vec, tag: CommandKind, ) { // TODO: Does async apply prewrite worth a special metric here? 
@@ -846,7 +847,7 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - // TODO: Update lock wait relationships after acquiring some locks. + self.on_acquired_locks_finished(new_acquired_locks); if do_wake_up { let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; @@ -978,6 +979,28 @@ impl Scheduler { resumable_wake_up_list } + fn on_acquired_locks_finished(&self, new_acquired_locks: Vec) { + if new_acquired_locks.is_empty() || self.inner.lock_wait_queues.is_empty() { + return; + } + + // If there are not too many new locks, do not spawn the task to the high + // priority pool since it may consume more CPU. + if new_acquired_locks.len() < 30 { + self.inner + .lock_wait_queues + .update_lock_wait(new_acquired_locks); + } else { + let lock_wait_queues = self.inner.lock_wait_queues.clone(); + self.get_sched_pool(CommandPri::High) + .pool + .spawn(async move { + lock_wait_queues.update_lock_wait(new_acquired_locks); + }) + .unwrap(); + } + } + fn wake_up_legacy_pessimistic_locks( &self, legacy_wake_up_list: impl IntoIterator, ReleasedLock)> @@ -1201,6 +1224,7 @@ impl Scheduler { pr, lock_info, released_locks, + new_acquired_locks, lock_guards, response_policy, } = match deadline @@ -1273,7 +1297,16 @@ impl Scheduler { } if to_be_write.modifies.is_empty() { - scheduler.on_write_finished(cid, pr, Ok(()), lock_guards, false, false, tag); + scheduler.on_write_finished( + cid, + pr, + Ok(()), + lock_guards, + false, + false, + new_acquired_locks, + tag, + ); return; } @@ -1294,7 +1327,16 @@ impl Scheduler { engine.schedule_txn_extra(to_be_write.extra); }) } - scheduler.on_write_finished(cid, pr, Ok(()), lock_guards, false, false, tag); + scheduler.on_write_finished( + cid, + pr, + Ok(()), + lock_guards, + false, + false, + new_acquired_locks, + tag, + ); return; } @@ -1478,6 +1520,7 @@ impl Scheduler { lock_guards, pipelined, is_async_apply_prewrite, + new_acquired_locks, tag, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC diff --git 
a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index 43032dd8cc3..289b10303a8 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -1,6 +1,14 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, thread, time::Duration}; +use std::{ + sync::{ + mpsc, + mpsc::{RecvTimeoutError, TryRecvError}, + Arc, + }, + thread, + time::Duration, +}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -283,3 +291,106 @@ fn test_detect_deadlock_when_merge_region() { must_transfer_leader(&mut cluster, b"", 1); } } + +#[test] +fn test_detect_deadlock_when_updating_wait_info() { + use kvproto::kvrpcpb::PessimisticLockKeyResultType::*; + let mut cluster = new_cluster_for_deadlock_test(3); + + let key1 = b"key1"; + let key2 = b"key2"; + let (client, ctx) = build_leader_client(&mut cluster, key1); + let client = Arc::new(client); + + fn async_pessimistic_lock( + client: Arc, + ctx: Context, + key: &[u8], + ts: u64, + ) -> mpsc::Receiver { + let (tx, rx) = mpsc::channel(); + let key = vec![key.to_vec()]; + thread::spawn(move || { + let resp = + kv_pessimistic_lock_resumable(&client, ctx, key, ts, ts, Some(1000), false, false); + tx.send(resp).unwrap(); + }); + rx + } + + // key1: txn 11 and 12 waits for 10 + // key2: txn 11 waits for 12 + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![key1.to_vec()], + 10, + 10, + Some(1000), + false, + false, + ); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![key2.to_vec()], + 12, + 12, + Some(1000), + false, + false, + ); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + let rx_txn11_k1 = async_pessimistic_lock(client.clone(), ctx.clone(), 
key1, 11); + let rx_txn12_k1 = async_pessimistic_lock(client.clone(), ctx.clone(), key1, 12); + let rx_txn11_k2 = async_pessimistic_lock(client.clone(), ctx.clone(), key2, 11); + // All blocked. + assert_eq!( + rx_txn11_k1 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(), + RecvTimeoutError::Timeout + ); + assert_eq!(rx_txn12_k1.try_recv().unwrap_err(), TryRecvError::Empty); + assert_eq!(rx_txn11_k2.try_recv().unwrap_err(), TryRecvError::Empty); + + // Release lock at ts=10 on key1 so that txn 11 will be granted the lock. + must_kv_pessimistic_rollback(&client, ctx.clone(), key1.to_vec(), 10, 10); + let resp = rx_txn11_k1 + .recv_timeout(Duration::from_millis(200)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + // And then 12 waits for k1 on key1, which forms a deadlock. + let resp = rx_txn12_k1 + .recv_timeout(Duration::from_millis(1000)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors[0].has_deadlock()); + assert_eq!(resp.results[0].get_type(), LockResultFailed); + // Check correctness of the wait chain. + let wait_chain = resp.errors[0].get_deadlock().get_wait_chain(); + assert_eq!(wait_chain[0].get_txn(), 11); + assert_eq!(wait_chain[0].get_wait_for_txn(), 12); + assert_eq!(wait_chain[0].get_key(), key2); + assert_eq!(wait_chain[1].get_txn(), 12); + assert_eq!(wait_chain[1].get_wait_for_txn(), 11); + assert_eq!(wait_chain[1].get_key(), key1); + + // Clean up. 
+ must_kv_pessimistic_rollback(&client, ctx.clone(), key1.to_vec(), 11, 11); + must_kv_pessimistic_rollback(&client, ctx.clone(), key2.to_vec(), 12, 12); + let resp = rx_txn11_k2 + .recv_timeout(Duration::from_millis(500)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + must_kv_pessimistic_rollback(&client, ctx, key2.to_vec(), 11, 11); +} From 87b0eafff97155514fc9dbec82001dfb7efb2a1e Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 16 Dec 2022 15:28:52 +0800 Subject: [PATCH 409/676] ratstore-v2: strip ts from split keys before propose (#13948) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang --- Cargo.lock | 1 + components/raftstore-v2/Cargo.toml | 1 + .../src/operation/command/admin/split.rs | 100 +++++++++++++++++- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 244ca504858..11aa05f2140 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4359,6 +4359,7 @@ dependencies = [ "file_system", "fs2", "futures 0.3.15", + "itertools", "keys", "kvproto", "log_wrappers", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 4d3d44ec6fd..d9b1d65aebc 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -41,6 +41,7 @@ fail = "0.5" file_system = { workspace = true } fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } +itertools = "0.10" keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 8ca4c7a55f6..7ae0b68a327 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -31,6 +31,7 @@ use collections::HashSet; use 
crossbeam::channel::SendError; use engine_traits::{Checkpointer, KvEngine, RaftEngine, TabletContext}; use fail::fail_point; +use itertools::Itertools; use kvproto::{ metapb::{self, Region}, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, @@ -39,6 +40,7 @@ use kvproto::{ use protobuf::Message; use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ + coprocessor::split_observer::{is_valid_split_key, strip_timestamp_if_exists}, store::{ fsm::apply::validate_batch_split, metrics::PEER_ADMIN_CMD_COUNTER, @@ -48,7 +50,8 @@ use raftstore::{ }, Result, }; -use slog::info; +use slog::{error, info, warn, Logger}; +use tikv_util::box_err; use crate::{ batch::StoreContext, @@ -99,13 +102,66 @@ impl SplitInit { } } +// validate split request and strip ts from split keys if needed +fn pre_propose_split(logger: &Logger, req: &mut AdminRequest, region: &Region) -> Result<()> { + if !req.has_splits() { + return Err(box_err!( + "cmd_type is BatchSplit but it doesn't have splits request, message maybe \ + corrupted!" + .to_owned() + )); + } + + let mut requests: Vec = req.mut_splits().take_requests().into(); + let ajusted_splits = std::mem::take(&mut requests) + .into_iter() + .enumerate() + .filter_map(|(i, mut split)| { + let key = split.take_split_key(); + let key = strip_timestamp_if_exists(key); + if is_valid_split_key(&key, i, region) { + split.split_key = key; + Some(split) + } else { + None + } + }) + .coalesce(|prev, curr| { + // Make sure that the split keys are sorted and unique. 
+ if prev.split_key < curr.split_key { + Err((prev, curr)) + } else { + warn!( + logger, + "skip invalid split key: key should not be larger than the previous."; + "key" => log_wrappers::Value::key(&curr.split_key), + "previous" => log_wrappers::Value::key(&prev.split_key), + ); + Ok(prev) + } + }) + .collect::>(); + + if ajusted_splits.is_empty() { + error!( + logger, + "failed to handle split req, no valid key found for split"; + ); + Err(box_err!("no valid key found for split.".to_owned())) + } else { + // Rewrite the splits. + req.mut_splits().set_requests(ajusted_splits.into()); + Ok(()) + } +} + impl Peer { pub fn propose_split( &mut self, store_ctx: &mut StoreContext, - req: RaftCmdRequest, + mut req: RaftCmdRequest, ) -> Result { - validate_batch_split(req.get_admin_request(), self.region())?; + pre_propose_split(&self.logger, req.mut_admin_request(), self.region())?; // We rely on ConflictChecker to detect conflicts, so no need to set proposal // context. let data = req.write_to_bytes().unwrap(); @@ -494,6 +550,7 @@ mod test { store::{new_learner_peer, new_peer}, worker::dummy_scheduler, }; + use txn_types::Key; use super::*; use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; @@ -597,6 +654,43 @@ mod test { } } + #[test] + fn test_propose() { + let logger = slog_global::borrow_global().new(o!()); + + let mut region = Region::default(); + region.set_end_key(b"k10".to_vec()); + + let mut req = AdminRequest::default(); + let err = pre_propose_split(&logger, &mut req, ®ion).unwrap_err(); + assert!( + err.to_string() + .contains("cmd_type is BatchSplit but it doesn't have splits") + ); + + let mut splits = BatchSplitRequest::default(); + req.set_splits(splits.clone()); + let err = pre_propose_split(&logger, &mut req, ®ion).unwrap_err(); + assert!(err.to_string().contains("no valid key found")); + + splits.mut_requests().push(new_split_req(b"", 0, vec![])); + splits.mut_requests().push(new_split_req(b"k03", 0, vec![])); + 
splits.mut_requests().push(new_split_req(b"k02", 0, vec![])); + splits.mut_requests().push(new_split_req(b"k11", 0, vec![])); + let split_key = Key::from_raw(b"k06"); + let split_key_with_ts = split_key.clone().append_ts(10.into()); + splits + .mut_requests() + .push(new_split_req(split_key_with_ts.as_encoded(), 0, vec![])); + + req.set_splits(splits); + pre_propose_split(&logger, &mut req, ®ion).unwrap(); + let split_reqs = req.get_splits().get_requests(); + assert_eq!(split_reqs.len(), 2); + assert_eq!(split_reqs[0].get_split_key(), b"k03"); + assert_eq!(split_reqs[1].get_split_key(), split_key.as_encoded()); + } + #[test] fn test_split() { let store_id = 2; From 416f7b7504a2766edb2c7b7b4a5b8c6e24485440 Mon Sep 17 00:00:00 2001 From: Jay Date: Sat, 17 Dec 2022 13:08:53 +0800 Subject: [PATCH 410/676] raftstore-v2: introduce apply trace (#13939) ref tikv/tikv#12842 raftstore v2 disables WAL for all tablets and store all states to raft engine. To be able to recover from restart, we need to build some relations between raft engine and tablets flush. In the previous PR, flush indexes are stored in raft engine by `PersistenceListener`. In this PR, ApplyTrace is introduced to anaylze apply index after restart. And it will trigger persistence for more apply progress like split. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.toml | 2 +- components/engine_panic/src/misc.rs | 2 +- components/engine_rocks/src/engine.rs | 14 +- components/engine_rocks/src/event_listener.rs | 160 ++++- components/engine_rocks/src/file_system.rs | 4 +- components/engine_rocks/src/misc.rs | 9 +- components/engine_test/src/lib.rs | 65 +- components/engine_traits/src/cf_defs.rs | 1 + components/engine_traits/src/flush.rs | 187 +++--- components/engine_traits/src/lib.rs | 2 + components/engine_traits/src/misc.rs | 5 +- components/engine_traits/src/tablet.rs | 7 +- components/raftstore-v2/src/bootstrap.rs | 2 +- components/raftstore-v2/src/fsm/apply.rs | 12 +- components/raftstore-v2/src/fsm/peer.rs | 9 + components/raftstore-v2/src/lib.rs | 2 + .../src/operation/command/admin/split.rs | 33 +- .../raftstore-v2/src/operation/command/mod.rs | 96 +-- .../src/operation/command/write/mod.rs | 16 +- components/raftstore-v2/src/operation/mod.rs | 5 +- .../src/operation/ready/apply_trace.rs | 585 ++++++++++++++++++ .../raftstore-v2/src/operation/ready/mod.rs | 31 +- .../src/operation/ready/snapshot.rs | 39 +- components/raftstore-v2/src/raft/apply.rs | 51 +- components/raftstore-v2/src/raft/mod.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 63 +- components/raftstore-v2/src/raft/storage.rs | 179 +----- .../src/router/internal_message.rs | 5 +- components/raftstore-v2/src/router/message.rs | 14 + .../raftstore-v2/tests/failpoints/mod.rs | 1 + .../tests/failpoints/test_trace_apply.rs | 7 + .../tests/integrations/cluster.rs | 69 ++- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../tests/integrations/test_split.rs | 73 +++ .../tests/integrations/test_trace_apply.rs | 211 +++++++ .../raftstore/src/store/async_io/write.rs | 11 + .../raftstore/src/store/compaction_guard.rs | 4 +- .../raftstore/src/store/region_snapshot.rs | 2 +- components/server/src/server.rs | 2 +- src/server/engine_factory.rs | 24 +- src/storage/mod.rs | 6 +- 
.../failpoints/cases/test_table_properties.rs | 10 +- tests/integrations/raftstore/test_stats.rs | 6 +- .../raftstore/test_update_region_size.rs | 2 +- tests/integrations/storage/test_titan.rs | 6 +- 45 files changed, 1613 insertions(+), 424 deletions(-) create mode 100644 components/raftstore-v2/src/operation/ready/apply_trace.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_trace_apply.rs create mode 100644 components/raftstore-v2/tests/integrations/test_trace_apply.rs diff --git a/Cargo.toml b/Cargo.toml index 66516206dd0..e09b422299e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -338,7 +338,7 @@ pd_client = { path = "components/pd_client" } profiler = { path = "components/profiler" } raft_log_engine = { path = "components/raft_log_engine" } raftstore = { path = "components/raftstore", default-features = false } -raftstore_v2 = { path = "components/raftstore-v2", default-features = false } +raftstore-v2 = { path = "components/raftstore-v2", default-features = false } resolved_ts = { path = "components/resolved_ts" } resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 82012b84ed6..8c983051438 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -5,7 +5,7 @@ use engine_traits::{DeleteStrategy, MiscExt, Range, Result}; use crate::engine::PanicEngine; impl MiscExt for PanicEngine { - fn flush_cfs(&self, wait: bool) -> Result<()> { + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 70f6562e94b..0c37120e7fc 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,9 +2,7 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{ - FlushState, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, 
SyncMutable, -}; +use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ @@ -26,7 +24,6 @@ use crate::{ pub struct RocksEngine { db: Arc, support_multi_batch_write: bool, - flush_state: Option>, } impl RocksEngine { @@ -38,7 +35,6 @@ impl RocksEngine { RocksEngine { db: db.clone(), support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), - flush_state: None, } } @@ -53,14 +49,6 @@ impl RocksEngine { pub fn support_multi_batch_write(&self) -> bool { self.support_multi_batch_write } - - pub fn set_flush_state(&mut self, flush_state: Arc) { - self.flush_state = Some(flush_state); - } - - pub fn flush_state(&self) -> Option> { - self.flush_state.clone() - } } impl KvEngine for RocksEngine { diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 8bf3035bc55..b940fcb39f3 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{PersistenceListener, RaftEngine}; +use engine_traits::PersistenceListener; use file_system::{get_io_type, set_io_type, IoType}; use regex::Regex; use rocksdb::{ @@ -179,29 +179,40 @@ fn resolve_sst_filename_from_err(err: &str) -> Option { Some(filename) } -pub struct RocksPersistenceListener(PersistenceListener); +pub struct RocksPersistenceListener(PersistenceListener); -impl RocksPersistenceListener { - pub fn new(listener: PersistenceListener) -> RocksPersistenceListener { +impl RocksPersistenceListener { + pub fn new(listener: PersistenceListener) -> RocksPersistenceListener { RocksPersistenceListener(listener) } } -impl rocksdb::EventListener for RocksPersistenceListener { +impl rocksdb::EventListener for RocksPersistenceListener { fn on_memtable_sealed(&self, info: &MemTableInfo) { self.0 - .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); + .on_memtable_sealed(info.cf_name().to_string(), info.earliest_seqno()); } fn on_flush_completed(&self, job: &FlushJobInfo) { self.0 - .on_flush_completed(job.cf_name(), job.smallest_seqno()); + .on_flush_completed(job.cf_name(), job.largest_seqno()); } } #[cfg(test)] mod tests { + use std::sync::{ + mpsc::{self, Sender}, + Arc, Mutex, + }; + + use engine_traits::{ + FlushProgress, FlushState, MiscExt, StateStorage, SyncMutable, CF_DEFAULT, DATA_CFS, + }; + use tempfile::Builder; + use super::*; + use crate::{util, RocksCfOptions, RocksDbOptions}; #[test] fn test_resolve_sst_filename() { @@ -209,4 +220,139 @@ mod tests { let filename = resolve_sst_filename_from_err(err).unwrap(); assert_eq!(filename, "/000398.sst"); } + + type Record = (u64, u64, FlushProgress); + + #[derive(Default)] + struct MemStorage { + records: Mutex>, + } + + impl StateStorage for MemStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + self.records + .lock() + .unwrap() + .push((region_id, tablet_index, pr)); + } + } + + struct FlushTrack { + sealed: Mutex>, + 
block_flush: Arc>, + } + + impl rocksdb::EventListener for FlushTrack { + fn on_memtable_sealed(&self, _: &MemTableInfo) { + let _ = self.sealed.lock().unwrap().send(()); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + drop(self.block_flush.lock().unwrap()) + } + } + + #[test] + fn test_persistence_listener() { + let temp_dir = Builder::new() + .prefix("test_persistence_listener") + .tempdir() + .unwrap(); + let (region_id, tablet_index) = (2, 3); + + let storage = Arc::new(MemStorage::default()); + let state = Arc::new(FlushState::default()); + let listener = + PersistenceListener::new(region_id, tablet_index, state.clone(), storage.clone()); + let mut db_opt = RocksDbOptions::default(); + db_opt.add_event_listener(RocksPersistenceListener::new(listener)); + let (tx, rx) = mpsc::channel(); + let block_flush = Arc::new(Mutex::new(())); + db_opt.add_event_listener(FlushTrack { + sealed: Mutex::new(tx), + block_flush: block_flush.clone(), + }); + + let mut cf_opts: Vec<_> = DATA_CFS + .iter() + .map(|cf| (*cf, RocksCfOptions::default())) + .collect(); + cf_opts[0].1.set_max_write_buffer_number(4); + cf_opts[0].1.set_min_write_buffer_number_to_merge(2); + cf_opts[0].1.set_write_buffer_size(1024); + cf_opts[0].1.set_disable_auto_compactions(true); + let db = util::new_engine_opt(temp_dir.path().to_str().unwrap(), db_opt, cf_opts).unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + let sst_count = || { + std::fs::read_dir(temp_dir.path()) + .unwrap() + .filter(|p| { + let p = match p { + Ok(p) => p, + Err(_) => return false, + }; + p.path().extension().map_or(false, |ext| ext == "sst") + }) + .count() + }; + // Although flush is triggered, but there is nothing to flush. + assert_eq!(sst_count(), 0); + assert_eq!(storage.records.lock().unwrap().len(), 0); + + // Flush one key should work. 
+ state.set_applied_index(2); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 1); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 2); + + // When puts and deletes are mixed, the puts may be deleted during flush. + state.set_applied_index(3); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k1").unwrap(); + db.put_cf(CF_DEFAULT, b"k1", b"v1").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 2); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 3); + // Detail check of `FlushProgress` will be done in raftstore-v2 tests. + + // Drain all the events. + while rx.try_recv().is_ok() {} + state.set_applied_index(4); + let block = block_flush.lock(); + // Seal twice to trigger flush. Seal third to make a seqno conflict, in + // which case flush largest seqno will be equal to seal earliest seqno. + let mut key_count = 2; + for i in 0..3 { + while rx.try_recv().is_err() { + db.put(format!("k{key_count}").as_bytes(), &[0; 512]) + .unwrap(); + key_count += 1; + } + state.set_applied_index(5 + i); + } + drop(block); + // Memtable is seal before put, so there must be still one KV in memtable. + db.flush_cf(CF_DEFAULT, true).unwrap(); + rx.try_recv().unwrap(); + // There is 2 sst before this round, and then 4 are merged into 2, so there + // should be 4 ssts. + assert_eq!(sst_count(), 4); + let records = storage.records.lock().unwrap(); + // Although it seals 4 times, but only create 2 SSTs, so only 2 records. 
+ assert_eq!(records.len(), 2); + // The indexes of two merged flush state are 4 and 5, so merged value is 5. + assert_eq!(records[0].2.applied_index(), 5); + // The last two flush state is 6 and 7. + assert_eq!(records[1].2.applied_index(), 7); + } } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index f3211d52d68..5fc0ed7f6e2 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -82,13 +82,13 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); assert_eq!(stats.fetch(IoType::Flush, IoOp::Write), 0); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 482686ffd1a..9ef2ed079b2 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -126,11 +126,16 @@ impl RocksEngine { } impl MiscExt for RocksEngine { - fn flush_cfs(&self, wait: bool) -> Result<()> { + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { let mut handles = vec![]; - for cf in self.cf_names() { + for cf in cfs { handles.push(util::get_cf_handle(self.as_inner(), cf)?); } + if handles.is_empty() { + for cf in self.cf_names() { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + } self.as_inner().flush_cfs(&handles, wait).map_err(r2e) } diff --git 
a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 16849acd5b8..2d89929a4b2 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -119,9 +119,10 @@ pub mod kv { } impl TabletFactory for TestTabletFactory { - fn open_tablet(&self, _ctx: TabletContext, path: &Path) -> Result { - KvTestEngine::new_kv_engine_opt( + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + KvTestEngine::new_tablet( path.to_str().unwrap(), + ctx, self.db_opt.clone(), self.cf_opts.clone(), ) @@ -155,7 +156,7 @@ pub mod ctor { use std::sync::Arc; use encryption::DataKeyManager; - use engine_traits::Result; + use engine_traits::{Result, StateStorage, TabletContext}; use file_system::IoRateLimiter; /// Kv engine construction @@ -188,6 +189,14 @@ pub mod ctor { db_opt: DbOptions, cf_opts: Vec<(&str, CfOptions)>, ) -> Result; + + /// Create a new engine specific for multi rocks. + fn new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result; } /// Raft engine construction @@ -200,6 +209,7 @@ pub mod ctor { pub struct DbOptions { key_manager: Option>, rate_limiter: Option>, + state_storage: Option>, enable_multi_batch_write: bool, } @@ -212,6 +222,10 @@ pub mod ctor { self.rate_limiter = rate_limiter; } + pub fn set_state_storage(&mut self, state_storage: Arc) { + self.state_storage = Some(state_storage); + } + pub fn set_enable_multi_batch_write(&mut self, enable: bool) { self.enable_multi_batch_write = enable; } @@ -329,6 +343,15 @@ pub mod ctor { ) -> Result { Ok(PanicEngine) } + + fn new_tablet( + _path: &str, + _ctx: engine_traits::TabletContext, + _db_opt: DbOptions, + _cf_opts: Vec<(&str, CfOptions)>, + ) -> Result { + Ok(PanicEngine) + } } impl RaftEngineConstructorExt for engine_panic::PanicEngine { @@ -343,9 +366,11 @@ pub mod ctor { get_env, properties::{MvccPropertiesCollectorFactory, RangePropertiesCollectorFactory}, util::new_engine_opt as 
rocks_new_engine_opt, - RocksCfOptions, RocksDbOptions, + RocksCfOptions, RocksDbOptions, RocksPersistenceListener, + }; + use engine_traits::{ + CfOptions as _, PersistenceListener, Result, TabletContext, CF_DEFAULT, }; - use engine_traits::{CfOptions as _, Result, CF_DEFAULT}; use super::{ CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions, RaftEngineConstructorExt, @@ -376,6 +401,36 @@ pub mod ctor { .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) } + + fn new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result { + let mut rocks_db_opts = RocksDbOptions::default(); + let env = get_env(db_opt.key_manager.clone(), db_opt.rate_limiter)?; + rocks_db_opts.set_env(env); + rocks_db_opts.enable_unordered_write(false); + rocks_db_opts.enable_pipelined_write(false); + rocks_db_opts.enable_multi_batch_write(false); + rocks_db_opts.allow_concurrent_memtable_write(false); + if let Some(storage) = db_opt.state_storage + && let Some(flush_state) = ctx.flush_state { + let listener = PersistenceListener::new( + ctx.id, + ctx.suffix.unwrap(), + flush_state, + storage, + ); + rocks_db_opts.add_event_listener(RocksPersistenceListener::new(listener)); + } + let rocks_cfs_opts = cf_opts + .iter() + .map(|(name, opt)| (*name, get_rocks_cf_opts(opt))) + .collect(); + rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) + } } impl RaftEngineConstructorExt for engine_rocks::RocksEngine { diff --git a/components/engine_traits/src/cf_defs.rs b/components/engine_traits/src/cf_defs.rs index e3fe95ec3b6..1658f49053c 100644 --- a/components/engine_traits/src/cf_defs.rs +++ b/components/engine_traits/src/cf_defs.rs @@ -9,6 +9,7 @@ pub const CF_RAFT: CfName = "raft"; pub const LARGE_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; pub const ALL_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE, CF_RAFT]; pub const DATA_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; +pub const DATA_CFS_LEN: 
usize = DATA_CFS.len(); pub fn name_to_cf(name: &str) -> Option { if name.is_empty() { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 9de5369ab54..cfed95f0426 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -13,36 +13,36 @@ //! be used as the start state. use std::{ - mem, + collections::LinkedList, sync::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, }; -use kvproto::raft_serverpb::{RaftApplyState, RegionLocalState}; -use tikv_util::Either; - use crate::{RaftEngine, RaftLogBatch}; #[derive(Debug)] -enum StateChange { - ApplyState(RaftApplyState), - RegionState(RegionLocalState), +pub struct FlushProgress { + cf: String, + apply_index: u64, + earliest_seqno: u64, } -/// States that is related to apply progress. -#[derive(Default, Debug)] -struct StateChanges { - /// apply index, state change - changes: Vec<(u64, StateChange)>, -} +impl FlushProgress { + fn merge(&mut self, pr: FlushProgress) { + debug_assert_eq!(self.cf, pr.cf); + debug_assert!(self.apply_index <= pr.apply_index); + self.apply_index = pr.apply_index; + } -struct FlushProgress { - cf: String, - id: u64, - apply_index: u64, - state_changes: StateChanges, + pub fn applied_index(&self) -> u64 { + self.apply_index + } + + pub fn cf(&self) -> &str { + &self.cf + } } /// A share state between raftstore and underlying engine. @@ -53,7 +53,6 @@ struct FlushProgress { #[derive(Default, Debug)] pub struct FlushState { applied_index: AtomicU64, - changes: Mutex, } impl FlushState { @@ -68,135 +67,113 @@ impl FlushState { pub fn applied_index(&self) -> u64 { self.applied_index.load(Ordering::Acquire) } +} - /// Record an apply state change. - /// - /// This can be triggered by admin command like compact log. General log - /// apply will not trigger the change, instead they are recorded by - /// `set_applied_index`. 
- #[inline] - pub fn update_apply_state(&self, index: u64, state: RaftApplyState) { - self.changes - .lock() - .unwrap() - .changes - .push((index, StateChange::ApplyState(state))); - } - - /// Record a region state change. - /// - /// This can be triggered by admin command like split/merge. - #[inline] - pub fn update_region_state(&self, index: u64, state: RegionLocalState) { - self.changes - .lock() - .unwrap() - .changes - .push((index, StateChange::RegionState(state))); - } - - /// Check if there is any state change. - #[inline] - pub fn is_empty(&self) -> bool { - self.changes.lock().unwrap().changes.is_empty() - } - - /// Get the last changed state. - #[inline] - pub fn last_state(&self) -> Option<(u64, Either)> { - let changes = self.changes.lock().unwrap(); - let (index, state) = changes.changes.last()?; - let state = match state { - StateChange::ApplyState(state) => Either::Left(state.clone()), - StateChange::RegionState(state) => Either::Right(state.clone()), - }; - Some((*index, state)) - } +/// A helper trait to avoid exposing `RaftEngine` to `TabletFactory`. +pub trait StateStorage: Sync + Send { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress); } /// A flush listener that maps memtable to apply index and persist the relation /// to raft engine. 
-pub struct PersistenceListener { +pub struct PersistenceListener { region_id: u64, tablet_index: u64, state: Arc, - progress: Mutex>, - raft: ER, + progress: Mutex>, + storage: Arc, } -impl PersistenceListener { - pub fn new(region_id: u64, tablet_index: u64, state: Arc, raft: ER) -> Self { +impl PersistenceListener { + pub fn new( + region_id: u64, + tablet_index: u64, + state: Arc, + storage: Arc, + ) -> Self { Self { region_id, tablet_index, state, - progress: Mutex::new(Vec::new()), - raft, + progress: Mutex::new(LinkedList::new()), + storage, } } } -impl PersistenceListener { +impl PersistenceListener { pub fn flush_state(&self) -> &Arc { &self.state } /// Called when memtable is frozen. /// - /// `id` should be unique between memtables, which is used to identify - /// memtable in the flushed event. - pub fn on_memtable_sealed(&self, cf: String, id: u64) { + /// `earliest_seqno` should be the smallest seqno of the memtable. + pub fn on_memtable_sealed(&self, cf: String, earliest_seqno: u64) { // The correctness relies on the assumption that there will be only one // thread writting to the DB and increasing apply index. - let mut state_changes = self.state.changes.lock().unwrap(); - // Query within lock so it's correct even in manually flush. + // Apply index will be set within DB lock, so it's correct even with manual + // flush. let apply_index = self.state.applied_index.load(Ordering::SeqCst); - let changes = mem::take(&mut *state_changes); - drop(state_changes); - self.progress.lock().unwrap().push(FlushProgress { + self.progress.lock().unwrap().push_back(FlushProgress { cf, - id, apply_index, - state_changes: changes, + earliest_seqno, }); } /// Called a memtable finished flushing. - pub fn on_flush_completed(&self, cf: &str, id: u64) { + /// + /// `largest_seqno` should be the largest seqno of the generated file. 
+ pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64) { // Maybe we should hook the compaction to avoid the file is compacted before // being recorded. let pr = { let mut prs = self.progress.lock().unwrap(); - let pos = prs - .iter() - .position(|pr| pr.cf == cf && pr.id == id) - .unwrap(); - prs.swap_remove(pos) + let mut cursor = prs.cursor_front_mut(); + let mut flushed_pr = None; + while let Some(pr) = cursor.current() { + if pr.cf != cf { + cursor.move_next(); + continue; + } + // Note flushed largest_seqno equals to earliest_seqno of next memtable. + if pr.earliest_seqno < largest_seqno { + match &mut flushed_pr { + None => flushed_pr = cursor.remove_current(), + Some(flushed_pr) => { + flushed_pr.merge(cursor.remove_current().unwrap()); + } + } + continue; + } + break; + } + match flushed_pr { + Some(pr) => pr, + None => panic!("{} not found in {:?}", cf, prs), + } }; - let mut batch = self.raft.log_batch(1); + self.storage + .persist_progress(self.region_id, self.tablet_index, pr); + } +} + +impl StateStorage for R { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + if pr.apply_index == 0 { + return; + } + let mut batch = self.log_batch(1); // TODO: It's possible that flush succeeds but fails to call // `on_flush_completed` before exit. In this case the flushed data will // be replayed again after restarted. To solve the problem, we need to // (1) persist flushed file numbers in `on_flush_begin` and (2) check // the file number in `on_compaction_begin`. After restart, (3) check if the // file exists. If (1) && ((2) || (3)), then we don't need to replay the data. 
- for (index, change) in pr.state_changes.changes { - match &change { - StateChange::ApplyState(state) => { - batch.put_apply_state(self.region_id, index, state).unwrap(); - } - StateChange::RegionState(state) => { - batch - .put_region_state(self.region_id, index, state) - .unwrap(); - } - } - } - if pr.apply_index != 0 { - batch - .put_flushed_index(self.region_id, cf, self.tablet_index, pr.apply_index) - .unwrap(); - } - self.raft.consume(&mut batch, true).unwrap(); + batch + .put_flushed_index(region_id, &pr.cf, tablet_index, pr.apply_index) + .unwrap(); + self.consume(&mut batch, true).unwrap(); } } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index db95f5621e0..b75c3e7b7c0 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -251,6 +251,8 @@ #![cfg_attr(test, feature(test))] #![feature(min_specialization)] #![feature(assert_matches)] +#![feature(linked_list_cursors)] +#![feature(let_chains)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 18991038ee8..34502634280 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -38,7 +38,10 @@ pub enum DeleteStrategy { } pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { - fn flush_cfs(&self, wait: bool) -> Result<()>; + /// Flush all specified column families at once. + /// + /// If `cfs` is empty, it will try to flush all available column families. 
+ fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()>; fn flush_cf(&self, cf: &str, wait: bool) -> Result<()>; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index acecb976f58..46b020cf138 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -13,7 +13,7 @@ use collections::HashMap; use kvproto::metapb::Region; use tikv_util::box_err; -use crate::{Error, Result}; +use crate::{Error, FlushState, Result}; #[derive(Debug)] struct LatestTablet { @@ -91,6 +91,10 @@ pub struct TabletContext { /// Any key that is larger than or equal to this key can be considered /// obsolete. pub end_key: Box<[u8]>, + /// The states to be persisted when flush is triggered. + /// + /// If not set, apply may not be resumed correctly. + pub flush_state: Option>, } impl Debug for TabletContext { @@ -111,6 +115,7 @@ impl TabletContext { suffix, start_key: keys::data_key(region.get_start_key()).into_boxed_slice(), end_key: keys::data_end_key(region.get_end_key()).into_boxed_slice(), + flush_state: None, } } diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs index b505b37a75b..62bc9e4b8c5 100644 --- a/components/raftstore-v2/src/bootstrap.rs +++ b/components/raftstore-v2/src/bootstrap.rs @@ -15,7 +15,7 @@ use raftstore::store::initial_region; use slog::{debug, error, info, warn, Logger}; use tikv_util::{box_err, box_try}; -use crate::{raft::write_initial_states, Result}; +use crate::{operation::write_initial_states, Result}; const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 2065c5d7fd4..07a577e0c35 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -1,10 +1,13 @@ // Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. -use std::time::{Duration, Instant}; +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, TabletRegistry}; +use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; @@ -16,6 +19,7 @@ use tikv_util::{ }; use crate::{ + operation::DataTrace, raft::Apply, router::{ApplyRes, ApplyTask, PeerMsg}, }; @@ -59,6 +63,8 @@ impl ApplyFsm { res_reporter: R, tablet_registry: TabletRegistry, read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); @@ -68,6 +74,8 @@ impl ApplyFsm { res_reporter, tablet_registry, read_scheduler, + flush_state, + log_recovery, logger, ); ( diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 8d497a7e4e5..793e7a340f2 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -254,6 +254,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer_mut().on_snapshot_generated(snap_res) } PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + } => { + self.fsm + .peer_mut() + .on_data_flushed(cf, tablet_index, flushed_index); + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index bac66b34acc..9ddb577ab5c 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -24,6 +24,7 @@ #![feature(let_chains)] #![feature(array_windows)] #![feature(div_duration)] 
+#![feature(box_into_inner)] mod batch; mod bootstrap; @@ -37,4 +38,5 @@ pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; +pub use operation::StateStorage; pub use raftstore::{Error, Result}; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 7ae0b68a327..6255b3ba9b9 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -29,7 +29,7 @@ use std::cmp; use collections::HashSet; use crossbeam::channel::SendError; -use engine_traits::{Checkpointer, KvEngine, RaftEngine, TabletContext}; +use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext}; use fail::fail_point; use itertools::Itertools; use kvproto::{ @@ -309,12 +309,16 @@ impl Apply { e ) }); + // Remove the old write batch. + self.write_batch.take(); let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); - let ctx = TabletContext::new(®ions[derived_index], Some(log_index)); + let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); + // Now the tablet is flushed, so all previous states should be persisted. + // Reusing the tablet should not be a problem. + // TODO: Should we avoid flushing for the old tablet? + ctx.flush_state = Some(self.flush_state().clone()); let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); - // Remove the old write batch. 
- self.write_batch.take(); self.publish_tablet(tablet); self.region_state_mut() @@ -425,6 +429,11 @@ impl Peer { } } self.split_trace_mut().push((tablet_index, new_ids)); + let region_state = self.storage().region_state().clone(); + self.state_changes_mut() + .put_region_state(region_id, tablet_index, ®ion_state) + .unwrap(); + self.set_has_extra_write(); } pub fn on_split_init( @@ -521,15 +530,21 @@ impl Peer { split_trace.drain(..off); // TODO: save admin_flushed. assert_ne!(admin_flushed, 0); + self.storage_mut() + .apply_trace_mut() + .on_admin_flush(admin_flushed); // Persist admin flushed. - self.set_has_ready(); + self.set_has_extra_write(); } } } #[cfg(test)] mod test { - use std::sync::mpsc::{channel, Receiver, Sender}; + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }; use engine_test::{ ctor::{CfOptions, DbOptions}, @@ -732,6 +747,8 @@ mod test { reporter, reg, read_scheduler, + Arc::default(), + None, logger.clone(), ); @@ -905,14 +922,14 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. 
- apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); + apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"k05", 70, vec![71, 72, 73])); req.set_splits(splits); - apply.apply_batch_split(&req, 50).unwrap(); + apply.apply_batch_split(&req, 51).unwrap(); assert!(apply.write_batch.is_none()); assert_eq!( apply diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 6daa8f2770c..8d55beca636 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -121,6 +121,8 @@ impl Peer { mailbox, store_ctx.tablet_registry.clone(), read_scheduler, + self.flush_state().clone(), + self.storage().apply_trace().log_recovery(), logger, ); @@ -266,6 +268,7 @@ impl Peer { entry_and_proposals, }; self.apply_scheduler() + .unwrap() .send(ApplyTask::CommittedEntries(apply)); } @@ -280,7 +283,7 @@ impl Peer { return; } - for admin_res in apply_res.admin_result { + for admin_res in Vec::from(apply_res.admin_result) { match admin_res { AdminCmdResult::None => unreachable!(), AdminCmdResult::ConfChange(conf_change) => { @@ -290,7 +293,12 @@ impl Peer { regions, derived_index, tablet_index, - }) => self.on_apply_res_split(ctx, derived_index, tablet_index, regions), + }) => { + self.storage_mut() + .apply_trace_mut() + .on_admin_modify(tablet_index); + self.on_apply_res_split(ctx, derived_index, tablet_index, regions) + } AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), } } @@ -308,6 +316,7 @@ impl Peer { if !is_leader { entry_storage.compact_entry_cache(apply_res.applied_index + 1); } + self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, apply_res.applied_term, @@ -317,6 +326,17 @@ impl Peer { } } +impl Apply { + #[inline] + fn should_skip(&self, off: 
usize, index: u64) -> bool { + let log_recovery = self.log_recovery(); + if log_recovery.is_none() { + return false; + } + log_recovery.as_ref().unwrap()[off] >= index + } +} + impl Apply { #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { @@ -357,11 +377,12 @@ impl Apply { #[inline] async fn apply_entry(&mut self, entry: &Entry) -> Result { let mut conf_change = None; + let log_index = entry.get_index(); let req = match entry.get_entry_type() { EntryType::EntryNormal => match SimpleWriteDecoder::new( &self.logger, entry.get_data(), - entry.get_index(), + log_index, entry.get_term(), ) { Ok(decoder) => { @@ -375,16 +396,21 @@ impl Apply { let res = Ok(new_response(decoder.header())); for req in decoder { match req { - SimpleWrite::Put(put) => self.apply_put(put.cf, put.key, put.value)?, + SimpleWrite::Put(put) => { + self.apply_put(put.cf, log_index, put.key, put.value)?; + } SimpleWrite::Delete(delete) => { - self.apply_delete(delete.cf, delete.key)? + self.apply_delete(delete.cf, log_index, delete.key)?; + } + SimpleWrite::DeleteRange(dr) => { + self.apply_delete_range( + dr.cf, + log_index, + dr.start_key, + dr.end_key, + dr.notify_only, + )?; } - SimpleWrite::DeleteRange(dr) => self.apply_delete_range( - dr.cf, - dr.start_key, - dr.end_key, - dr.notify_only, - )?, } } return res; @@ -392,34 +418,18 @@ impl Apply { Err(req) => req, }, EntryType::EntryConfChange => { - let cc: ConfChange = parse_at( - &self.logger, - entry.get_data(), - entry.get_index(), - entry.get_term(), - ); - let req: RaftCmdRequest = parse_at( - &self.logger, - cc.get_context(), - entry.get_index(), - entry.get_term(), - ); + let cc: ConfChange = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); conf_change = Some(cc.into_v2()); req } EntryType::EntryConfChangeV2 => { - let cc: ConfChangeV2 = parse_at( - &self.logger, - 
entry.get_data(), - entry.get_index(), - entry.get_term(), - ); - let req: RaftCmdRequest = parse_at( - &self.logger, - cc.get_context(), - entry.get_index(), - entry.get_term(), - ); + let cc: ConfChangeV2 = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); conf_change = Some(cc); req } @@ -430,8 +440,8 @@ impl Apply { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { AdminCmdType::CompactLog => unimplemented!(), - AdminCmdType::Split => self.apply_split(admin_req, entry.index)?, - AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, entry.index)?, + AdminCmdType::Split => self.apply_split(admin_req, log_index)?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, AdminCmdType::PrepareMerge => unimplemented!(), AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), @@ -439,10 +449,10 @@ impl Apply { self.apply_transfer_leader(admin_req, entry.term)? } AdminCmdType::ChangePeer => { - self.apply_conf_change(entry.get_index(), admin_req, conf_change.unwrap())? + self.apply_conf_change(log_index, admin_req, conf_change.unwrap())? } AdminCmdType::ChangePeerV2 => { - self.apply_conf_change_v2(entry.get_index(), admin_req, conf_change.unwrap())? + self.apply_conf_change_v2(log_index, admin_req, conf_change.unwrap())? } AdminCmdType::ComputeHash => unimplemented!(), AdminCmdType::VerifyHash => unimplemented!(), @@ -468,16 +478,17 @@ impl Apply { // backward compatibility. 
CmdType::Put => { let put = r.get_put(); - self.apply_put(put.get_cf(), put.get_key(), put.get_value())?; + self.apply_put(put.get_cf(), log_index, put.get_key(), put.get_value())?; } CmdType::Delete => { let delete = r.get_delete(); - self.apply_delete(delete.get_cf(), delete.get_key())?; + self.apply_delete(delete.get_cf(), log_index, delete.get_key())?; } CmdType::DeleteRange => { let dr = r.get_delete_range(); self.apply_delete_range( dr.get_cf(), + log_index, dr.get_start_key(), dr.get_end_key(), dr.get_notify_only(), @@ -515,7 +526,8 @@ impl Apply { let (index, term) = self.apply_progress(); apply_res.applied_index = index; apply_res.applied_term = term; - apply_res.admin_result = self.take_admin_result(); + apply_res.admin_result = self.take_admin_result().into_boxed_slice(); + apply_res.modifications = *self.modifications_mut(); self.res_reporter().report(apply_res); } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 92f260bad26..c4cc1646963 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -14,6 +14,7 @@ use raftstore::{ use crate::{ batch::StoreContext, + operation::cf_offset, raft::{Apply, Peer}, router::CmdResChannel, }; @@ -109,7 +110,11 @@ impl Peer { impl Apply { #[inline] - pub fn apply_put(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + pub fn apply_put(&mut self, cf: &str, index: u64, key: &[u8], value: &[u8]) -> Result<()> { + let off = cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } util::check_key_in_region(key, self.region_state().get_region())?; // Technically it's OK to remove prefix for raftstore v2. But rocksdb doesn't // support specifying infinite upper bound in various APIs. 
@@ -140,11 +145,16 @@ impl Apply { fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( "aborted by failpoint".into() ))); + self.modifications_mut()[off] = index; Ok(()) } #[inline] - pub fn apply_delete(&mut self, cf: &str, key: &[u8]) -> Result<()> { + pub fn apply_delete(&mut self, cf: &str, index: u64, key: &[u8]) -> Result<()> { + let off = cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } util::check_key_in_region(key, self.region_state().get_region())?; keys::data_key_with_buffer(key, &mut self.key_buffer); let res = if cf.is_empty() || cf == CF_DEFAULT { @@ -165,6 +175,7 @@ impl Apply { e ); }); + self.modifications_mut()[off] = index; Ok(()) } @@ -172,6 +183,7 @@ impl Apply { pub fn apply_delete_range( &mut self, _cf: &str, + _index: u64, _start_key: &[u8], _end_key: &[u8], _notify_only: bool, diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 7df897f2b26..beb47f9a08f 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -10,6 +10,9 @@ pub use command::{ AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteDecoder, SimpleWriteEncoder, }; pub use life::DestroyProgress; -pub use ready::{AsyncWriter, GenSnapTask, SnapState}; +pub use ready::{ + cf_offset, write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, + StateStorage, +}; pub(crate) use self::{command::SplitInit, query::LocalReader}; diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs new file mode 100644 index 00000000000..0b7521f2634 --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -0,0 +1,585 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! In raftstore v2, WAL is always disabled for tablet. So we need a way to +//! 
trace what have been persisted what haven't, and recover those missing +//! data when restart. +//! +//! In summary, we trace the persist progress by recording flushed event. +//! Because memtable is flushed one by one, so a flushed memtable must contain +//! all the data within the CF before some certain apply index. So the minimun +//! flushed apply index + 1 of all data CFs is the recovery start point. In +//! some cases, a CF may not have any updates at all for a long time. In some +//! cases, we may still need to recover from smaller index even if flushed +//! index of all data CFs have advanced. So a special flushed index is +//! introduced and stored with raft CF (only using the name, raft CF is +//! dropped). It's the recommended recovery start point. How these two indexes +//! interact with each other can be found in the `ApplyTrace::recover` and +//! `ApplyTrace::maybe_advance_admin_flushed`. +//! +//! The correctness of raft cf index relies on the fact that: +//! - apply is sequential, so if any apply index is updated to apply trace, all +//! modification events before that must be processed. +//! - admin commands that marked by raft cf index must flush all data before +//! being executed. Note this contraint is not just for recovery, but also +//! necessary to guarantee safety of operations like split init or log gc. +//! So data of logs before raft cf index must be applied and flushed to disk. +//! +//! All apply related states are associated with an apply index. During +//! recovery states corresponding to the start index should be used. 
+ +use std::{cmp, sync::Mutex}; + +use engine_traits::{ + FlushProgress, KvEngine, RaftEngine, RaftLogBatch, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, + CF_WRITE, DATA_CFS, DATA_CFS_LEN, +}; +use kvproto::{ + metapb::Region, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, +}; +use raftstore::store::{ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use slog::Logger; +use tikv_util::{box_err, worker::Scheduler}; + +use crate::{ + raft::{Peer, Storage}, + router::PeerMsg, + Result, StoreRouter, +}; + +/// Write states for the given region. The region is supposed to have all its +/// data persisted and not governed by any raft group before. +pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { + let region_id = region.get_id(); + + let mut state = RegionLocalState::default(); + state.set_region(region); + state.set_tablet_index(RAFT_INIT_LOG_INDEX); + wb.put_region_state(region_id, RAFT_INIT_LOG_INDEX, &state)?; + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + wb.put_apply_state(region_id, RAFT_INIT_LOG_INDEX, &apply_state)?; + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + wb.put_raft_state(region_id, &raft_state)?; + + for cf in ALL_CFS { + wb.put_flushed_index(region_id, cf, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_INDEX)?; + } + + Ok(()) +} + +fn to_static_cf(cf: &str) -> &'static str { + match cf { + CF_DEFAULT => CF_DEFAULT, + CF_RAFT => CF_RAFT, + CF_WRITE => CF_WRITE, + CF_LOCK => CF_LOCK, + _ => unreachable!("unexpected cf: {cf}"), + } +} + +pub struct StateStorage { + raft_engine: ER, + router: Mutex>, +} + 
+impl StateStorage { + pub fn new(raft_engine: ER, router: StoreRouter) -> Self { + Self { + raft_engine, + router: Mutex::new(router), + } + } +} + +impl engine_traits::StateStorage for StateStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + let cf = to_static_cf(pr.cf()); + let flushed_index = pr.applied_index(); + self.raft_engine + .persist_progress(region_id, tablet_index, pr); + let _ = self.router.lock().unwrap().send( + region_id, + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + }, + ); + } +} + +/// An alias of frequent use type that each data cf has a u64. +pub type DataTrace = [u64; DATA_CFS_LEN]; + +#[derive(Clone, Copy, Default)] +struct Progress { + flushed: u64, + /// The index of last entry that has modification to the CF. + /// + /// If `flushed` == `last_modified`, then all data in the CF is persisted. + last_modified: u64, +} + +pub fn cf_offset(cf: &str) -> usize { + let cf = if cf.is_empty() { CF_DEFAULT } else { cf }; + DATA_CFS.iter().position(|c| *c == cf).expect(cf) +} + +/// `ApplyTrace` is used to track the indexes of modifications and flushes. +/// +/// It has 3 core functionalities: +/// - recover from stopped state and figure out the correct log replay start +/// point. +/// - trace the admin flushed index and issue persistence once admin operation +/// is considered finished. Note only those admin commands that needs to +/// interact with other peers will be traced. +/// - support query the flushed progress without actually scanning raft engine, +/// which is useful for cleaning up stale flush records. +#[derive(Default)] +pub struct ApplyTrace { + /// The modified indexes and flushed index of each data CF. + data_cfs: Box<[Progress; DATA_CFS_LEN]>, + /// The modified indexes and flushed index of raft CF. + /// + /// raft CF is a virtual CF that only used for recording apply index of + /// certain admin commands (like split/merge). So there is no flush at all. 
+ /// The `flushed` field is advanced when the admin command doesn't need to + /// be replayed after restart. A write should be triggered to persist the + /// record. + admin: Progress, + /// Index that is issued to be written. It may not be truely persisted. + persisted_applied: u64, + /// `true` means the raft cf record should be persisted in next ready. + try_persist: bool, +} + +impl ApplyTrace { + fn recover(region_id: u64, engine: &impl RaftEngine) -> Result<(Self, RegionLocalState)> { + let mut trace = ApplyTrace::default(); + // Get all the recorded apply index from data CFs. + for (off, cf) in DATA_CFS.iter().enumerate() { + // There should be at least one record. + let i = engine.get_flushed_index(region_id, cf)?.unwrap(); + trace.data_cfs[off].flushed = i; + trace.data_cfs[off].last_modified = i; + } + let i = engine.get_flushed_index(region_id, CF_RAFT)?.unwrap(); + // Index of raft CF means all data before that must be persisted. + trace.admin.flushed = i; + trace.admin.last_modified = i; + trace.persisted_applied = i; + let applied_region_state = engine + .get_region_state(region_id, trace.admin.flushed)? + .unwrap(); + Ok((trace, applied_region_state)) + } + + fn on_flush(&mut self, cf: &str, index: u64) { + let off = cf_offset(cf); + // Technically it should always be true. + if index > self.data_cfs[off].flushed { + self.data_cfs[off].flushed = index; + } + } + + fn on_modify(&mut self, cf: &str, index: u64) { + let off = cf_offset(cf); + self.data_cfs[off].last_modified = index; + } + + pub fn on_admin_flush(&mut self, index: u64) { + if index > self.admin.flushed { + self.admin.flushed = index; + self.try_persist = true; + } + } + + pub fn on_admin_modify(&mut self, index: u64) { + self.admin.last_modified = index; + } + + fn persisted_apply_index(&self) -> u64 { + self.admin.flushed + } + + // All events before `mem_index` must be consumed before calling this function. 
+ fn maybe_advance_admin_flushed(&mut self, mem_index: u64) { + if self.admin.flushed < self.admin.last_modified { + return; + } + let min_flushed = self + .data_cfs + .iter() + // Only unflushed CFs are considered. Flushed CF always have uptodate changes + // persisted. + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.flushed) + } else { + None + } + }) + .min(); + // At best effort, we can only advance the index to `mem_index`. + let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); + if candidate > self.admin.flushed { + self.admin.flushed = candidate; + if candidate > self.persisted_applied + 100 { + self.try_persist = true; + } + } + // TODO: persist admin.flushed every 10 minutes. + } + + /// Get the flushed indexes of all data CF that is needed when recoverying + /// logs. + /// + /// Logs may be replayed from the persisted apply index, but those data may + /// have been flushed in the past, so we need the flushed indexes to decide + /// what logs can be skipped for certain CFs. If all CFs are flushed before + /// the persisted apply index, then there is nothing to skipped, so + /// `None` is returned. + #[inline] + pub fn log_recovery(&self) -> Option> { + let mut flushed_indexes = [0; DATA_CFS_LEN]; + for (off, pr) in self.data_cfs.iter().enumerate() { + flushed_indexes[off] = pr.flushed; + } + for i in flushed_indexes { + if i > self.admin.flushed { + return Some(Box::new(flushed_indexes)); + } + } + None + } + + pub fn reset_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + pr.last_modified = index; + } + self.admin.flushed = index; + self.persisted_applied = index; + self.try_persist = false; + } + + #[inline] + pub fn reset_should_persist(&mut self) { + self.try_persist = false; + } + + #[inline] + pub fn should_persist(&self) -> bool { + self.try_persist + } +} + +impl Storage { + /// Creates a new storage with uninit states. 
+ /// + /// This should only be used for creating new peer from raft message. + pub fn uninit( + store_id: u64, + region: Region, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region); + Self::create( + store_id, + region_state, + RaftLocalState::default(), + RaftApplyState::default(), + engine, + read_scheduler, + false, + ApplyTrace::default(), + logger, + ) + } + + /// Creates a new storage. + /// + /// All metadata should be initialized before calling this method. If the + /// region is destroyed, `None` will be returned. + pub fn new( + region_id: u64, + store_id: u64, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result>> { + // Check latest region state to determine whether the peer is destroyed. + let region_state = match engine.get_region_state(region_id, u64::MAX) { + Ok(Some(s)) => s, + res => { + return Err(box_err!( + "failed to get region state for region {}: {:?}", + region_id, + res + )); + } + }; + + if region_state.get_state() == PeerState::Tombstone { + return Ok(None); + } + + let (trace, region_state) = ApplyTrace::recover(region_id, &engine)?; + + let raft_state = match engine.get_raft_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get raft state: {:?}", res)); + } + }; + + let applied_index = trace.persisted_apply_index(); + let mut apply_state = match engine.get_apply_state(region_id, applied_index) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get apply state: {:?}", res)); + } + }; + apply_state.set_applied_index(applied_index); + + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + read_scheduler, + true, + trace, + logger, + ) + .map(Some) + } + + /// Write initial persist trace for uninit peer. 
+ pub fn init_apply_trace(&self, write_task: &mut WriteTask) { + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(3)); + lb.put_apply_state(region_id, 0, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, 0, self.region_state()) + .unwrap(); + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, 0, 0).unwrap(); + } + } + + pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let tablet_index = self.tablet_index(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(1)); + let trace = self.apply_trace_mut(); + lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) + .unwrap(); + trace.try_persist = false; + trace.persisted_applied = trace.admin.flushed; + } +} + +impl Peer { + pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + if tablet_index < self.storage().tablet_index() { + // Stale tablet. 
+ return; + } + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + apply_trace.on_flush(cf, index); + apply_trace.maybe_advance_admin_flushed(apply_index); + } + + pub fn on_data_modified(&mut self, modification: DataTrace) { + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + for (cf, index) in DATA_CFS.iter().zip(modification) { + if index != 0 { + apply_trace.on_modify(cf, index); + } + } + apply_trace.maybe_advance_admin_flushed(apply_index); + } +} + +#[cfg(test)] +mod tests { + use engine_traits::RaftEngineReadOnly; + use kvproto::metapb::Peer; + use tempfile::TempDir; + + use super::*; + + fn new_region() -> Region { + let mut region = Region::default(); + region.set_id(4); + let mut p = Peer::default(); + p.set_id(5); + p.set_store_id(6); + region.mut_peers().push(p); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(4); + region + } + + #[test] + fn test_write_initial_states() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let engine = engine_test::new_temp_engine(&path); + let raft_engine = &engine.raft; + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + + let local_state = raft_engine.get_region_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(local_state.get_state(), PeerState::Normal); + assert_eq!(*local_state.get_region(), region); + assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + local_state, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + + let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); + assert_eq!(raft_state.get_last_index(), 
RAFT_INIT_LOG_INDEX); + let hs = raft_state.get_hard_state(); + assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); + + let apply_state = raft_engine.get_apply_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); + let ts = apply_state.get_truncated_state(); + assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); + assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!( + apply_state, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + } + + #[test] + fn test_apply_trace() { + let mut trace = ApplyTrace::default(); + assert_eq!(0, trace.persisted_apply_index()); + // If there is no modifications, index should be advanced anyway. + trace.maybe_advance_admin_flushed(2); + assert_eq!(2, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_modify(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // Modification is not flushed. + assert_eq!(2, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_flush(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // No admin is recorded, index should be advanced. + assert_eq!(3, trace.persisted_apply_index()); + trace.on_admin_modify(4); + for cf in DATA_CFS { + trace.on_flush(cf, 4); + } + for cf in DATA_CFS { + trace.on_modify(cf, 4); + } + trace.maybe_advance_admin_flushed(4); + // Unflushed admin modification should hold index. + assert_eq!(3, trace.persisted_apply_index()); + trace.on_admin_flush(4); + trace.maybe_advance_admin_flushed(4); + // Admin is flushed, index should be advanced. + assert_eq!(4, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_flush(cf, 5); + } + trace.maybe_advance_admin_flushed(4); + // Though all data CFs are flushed, but index should not be + // advanced as we don't know whether there is admin modification. 
+ assert_eq!(4, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_modify(cf, 5); + } + trace.maybe_advance_admin_flushed(5); + // Because modify is recorded, so we know there should be no admin + // modification and index can be advanced. + assert_eq!(5, trace.persisted_apply_index()); + } + + #[test] + fn test_advance_admin_flushed() { + let cases = &[ + // When all are flushed, admin index should be advanced to latest. + ([(2, 2), (3, 3), (5, 5)], (3, 3), 5, 5), + ([(2, 2), (3, 3), (5, 5)], (5, 3), 6, 6), + // Any unflushed result should block advancing. + ([(2, 3), (3, 3), (5, 5)], (2, 2), 5, 2), + ([(2, 4), (3, 4), (5, 6)], (2, 2), 6, 2), + // But it should not make index go back. + ([(2, 4), (3, 4), (5, 6)], (3, 3), 6, 3), + // Unflush admin should not be advanced. + ([(2, 2), (3, 3), (5, 5)], (2, 3), 5, 2), + // Flushed may race with modification. + ([(2, 2), (3, 3), (6, 5)], (2, 2), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), + ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + ]; + for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { + let mut trace = ApplyTrace::default(); + for (i, (flushed, modified)) in data_cfs.iter().enumerate() { + trace.data_cfs[i].flushed = *flushed; + trace.data_cfs[i].last_modified = *modified; + } + trace.admin.flushed = admin.0; + trace.admin.last_modified = admin.1; + trace.maybe_advance_admin_flushed(*mem_index); + assert_eq!(trace.admin.flushed, *exp, "{case}"); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 47f6523cc82..a6df9049285 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -17,12 +17,14 @@ //! //! There two steps can be processed concurrently. 
+mod apply_trace; mod async_writer; mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +pub use apply_trace::{cf_offset, write_initial_states, ApplyTrace, DataTrace, StateStorage}; +use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; @@ -269,6 +271,7 @@ impl Peer { #[inline] pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { let has_ready = self.reset_has_ready(); + let has_extra_write = self.reset_has_extra_write(); if !has_ready || self.destroy_progress().started() { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); @@ -276,7 +279,10 @@ impl Peer { } ctx.has_ready = true; - if !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { + if !has_extra_write + && !self.raft_group().has_ready() + && (self.serving() || self.postponed_destroy()) + { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -328,11 +334,14 @@ impl Peer { // Always sending snapshot task after apply task, so it gets latest // snapshot. 
if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { - self.apply_scheduler().send(ApplyTask::Snapshot(gen_task)); + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); } let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); if !ready.persisted_messages().is_empty() { @@ -554,17 +563,13 @@ impl Storage { if !ever_persisted || prev_raft_state != *entry_storage.raft_state() { write_task.raft_state = Some(entry_storage.raft_state().clone()); } - if !ever_persisted { - let region_id = self.region().get_id(); - let raft_engine = self.entry_storage().raft_engine(); - let lb = write_task - .extra_write - .ensure_v2(|| raft_engine.log_batch(3)); - lb.put_apply_state(region_id, 0, self.apply_state()) - .unwrap(); - lb.put_region_state(region_id, 0, self.region_state()) - .unwrap(); + // If snapshot initializes the peer, we don't need to write apply trace again. 
+ if !self.ever_persisted() { + self.init_apply_trace(write_task); self.set_ever_persisted(); } + if self.apply_trace().should_persist() { + self.record_apply_trace(write_task); + } } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 86817ab17d3..1f4a1fee268 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -27,7 +27,7 @@ use std::{ }, }; -use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; use raft::eraftpb::Snapshot; @@ -121,7 +121,11 @@ impl Peer { let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { let region_id = self.region_id(); - let tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + self.reset_flush_state(); + let flush_state = self.flush_state().clone(); + let mut tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + // Use a new FlushState to avoid conflicts with the old one. 
+ tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); @@ -353,7 +357,7 @@ impl Storage { let index = entry.truncated_index(); entry.set_applied_term(term); entry.apply_state_mut().set_applied_index(index); - self.region_state_mut().set_tablet_index(index); + self.apply_trace_mut().reset_snapshot(index); } pub fn apply_snapshot( @@ -383,14 +387,27 @@ impl Storage { let last_index = snap.get_metadata().get_index(); let last_term = snap.get_metadata().get_term(); - self.region_state_mut().set_state(PeerState::Normal); - self.region_state_mut().set_region(region); - self.entry_storage_mut() - .raft_state_mut() - .set_last_index(last_index); - self.entry_storage_mut().set_truncated_index(last_index); - self.entry_storage_mut().set_truncated_term(last_term); - self.entry_storage_mut().set_last_term(last_term); + let region_state = self.region_state_mut(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region); + region_state.set_tablet_index(last_index); + let entry_storage = self.entry_storage_mut(); + entry_storage.raft_state_mut().set_last_index(last_index); + entry_storage.set_truncated_index(last_index); + entry_storage.set_truncated_term(last_term); + entry_storage.set_last_term(last_term); + + self.apply_trace_mut().reset_should_persist(); + self.set_ever_persisted(); + let lb = task + .extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(3)); + lb.put_apply_state(region_id, last_index, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, last_index, self.region_state()) + .unwrap(); + lb.put_flushed_index(region_id, CF_RAFT, last_index, last_index) + .unwrap(); let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. 
diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 30ced7bdbd7..56379f2a15f 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,14 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::mem; +use std::{mem, sync::Arc}; -use engine_traits::{CachedTablet, KvEngine, TabletRegistry, WriteBatch}; +use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; use tikv_util::worker::Scheduler; -use crate::{operation::AdminCmdResult, router::CmdResChannel}; +use crate::{ + operation::{AdminCmdResult, DataTrace}, + router::CmdResChannel, +}; /// Apply applies all the committed commands to kv db. pub struct Apply { @@ -27,9 +30,17 @@ pub struct Apply { /// A flag indicates whether the peer is destroyed by applying admin /// command. tombstone: bool, - applied_index: u64, applied_term: u64, + /// The largest index that have modified each column family. + modifications: DataTrace, admin_cmd_result: Vec, + flush_state: Arc, + /// The flushed indexes of each column family before being restarted. + /// + /// If an apply index is less than the flushed index, the log can be + /// skipped. `None` means logs should apply to all required column + /// families. 
+ log_recovery: Option>, region_state: RegionLocalState, @@ -46,6 +57,8 @@ impl Apply { res_reporter: R, tablet_registry: TabletRegistry, read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry @@ -58,14 +71,16 @@ impl Apply { write_batch: None, callbacks: vec![], tombstone: false, - applied_index: 0, applied_term: 0, + modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], region_state, tablet_registry, read_scheduler, key_buffer: vec![], res_reporter, + flush_state, + log_recovery, logger, } } @@ -95,13 +110,20 @@ impl Apply { #[inline] pub fn set_apply_progress(&mut self, index: u64, term: u64) { - self.applied_index = index; + self.flush_state.set_applied_index(index); self.applied_term = term; + if self.log_recovery.is_none() { + return; + } + let log_recovery = self.log_recovery.as_ref().unwrap(); + if log_recovery.iter().all(|v| index >= *v) { + self.log_recovery.take(); + } } #[inline] pub fn apply_progress(&self) -> (u64, u64) { - (self.applied_index, self.applied_term) + (self.flush_state.applied_index(), self.applied_term) } #[inline] @@ -171,4 +193,19 @@ impl Apply { self.write_batch = None; } } + + #[inline] + pub fn modifications_mut(&mut self) -> &mut DataTrace { + &mut self.modifications + } + + #[inline] + pub fn flush_state(&self) -> &Arc { + &self.flush_state + } + + #[inline] + pub fn log_recovery(&self) -> &Option> { + &self.log_recovery + } } diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs index c1d6a522d79..495d7ad87ed 100644 --- a/components/raftstore-v2/src/raft/mod.rs +++ b/components/raftstore-v2/src/raft/mod.rs @@ -6,4 +6,4 @@ mod storage; pub use apply::Apply; pub use peer::Peer; -pub use storage::{write_initial_states, Storage}; +pub use storage::Storage; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 21795eb3293..82e9b6011ca 100644 --- 
a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -8,14 +8,16 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::atomic::AtomicCell; -use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletContext, TabletRegistry}; +use engine_traits::{ + CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, +}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::store::{ util::{Lease, RegionReadProgress}, Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, - ReadProgress, TxnExt, + ReadProgress, TxnExt, WriteTask, }; use slog::Logger; @@ -53,6 +55,8 @@ pub struct Peer { /// Set to true if any side effect needs to be handled. has_ready: bool, + /// Sometimes there is no ready at all, but we need to trigger async write. + has_extra_write: bool, /// Writer for persisting side effects asynchronously. pub(crate) async_writer: AsyncWriter, @@ -78,6 +82,13 @@ pub struct Peer { // Trace which peers have not finished split. split_trace: Vec<(u64, HashSet)>, + + /// Apply related State changes that needs to be persisted to raft engine. + /// + /// To make recovery correct, we need to persist all state changes before + /// advancing apply index. + state_changes: Option>, + flush_state: Arc, } impl Peer { @@ -102,11 +113,13 @@ impl Peer { let region = raft_group.store().region_state().get_region().clone(); let cached_tablet = tablet_registry.get_or_default(region_id); + let flush_state: Arc = Arc::default(); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. 
if tablet_index != 0 { - let ctx = TabletContext::new(®ion, Some(tablet_index)); + let mut ctx = TabletContext::new(®ion, Some(tablet_index)); + ctx.flush_state = Some(flush_state.clone()); // TODO: Perhaps we should stop create the tablet automatically. tablet_registry.load(ctx, false)?; } @@ -122,6 +135,7 @@ impl Peer { async_writer: AsyncWriter::new(region_id, peer_id), apply_scheduler: None, has_ready: false, + has_extra_write: false, destroy_progress: DestroyProgress::None, raft_group, logger, @@ -143,6 +157,8 @@ impl Peer { proposal_control: ProposalControl::new(0), pending_ticks: Vec::new(), split_trace: vec![], + state_changes: None, + flush_state, }; // If this region has only one peer and I am the one, campaign directly. @@ -334,6 +350,17 @@ impl Peer { mem::take(&mut self.has_ready) } + #[inline] + pub fn set_has_extra_write(&mut self) { + self.set_has_ready(); + self.has_extra_write = true; + } + + #[inline] + pub fn reset_has_extra_write(&mut self) -> bool { + mem::take(&mut self.has_extra_write) + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { @@ -499,8 +526,8 @@ impl Peer { &self.proposals } - pub fn apply_scheduler(&self) -> &ApplyScheduler { - self.apply_scheduler.as_ref().unwrap() + pub fn apply_scheduler(&self) -> Option<&ApplyScheduler> { + self.apply_scheduler.as_ref() } #[inline] @@ -631,4 +658,30 @@ impl Peer { pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { &mut self.split_trace } + + #[inline] + pub fn flush_state(&self) -> &Arc { + &self.flush_state + } + + pub fn reset_flush_state(&mut self) { + self.flush_state = Arc::default(); + } + + #[inline] + pub fn state_changes_mut(&mut self) -> &mut ER::LogBatch { + if self.state_changes.is_none() { + self.state_changes = Some(Box::new(self.entry_storage().raft_engine().log_batch(0))); + } + self.state_changes.as_mut().unwrap() + } + + #[inline] + pub fn merge_state_changes_to(&mut self, task: &mut 
WriteTask) { + if self.state_changes.is_none() { + return; + } + task.extra_write + .merge_v2(Box::into_inner(self.state_changes.take().unwrap())); + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 889674c514c..de58d39cce5 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -5,51 +5,24 @@ use std::{ fmt::{self, Debug, Formatter}, }; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb::{self, Region}, + metapb, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{util, EntryStorage, ReadTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use raftstore::store::{util, EntryStorage, ReadTask}; use slog::{o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; use crate::{ - operation::{GenSnapTask, SnapState, SplitInit}, + operation::{ApplyTrace, GenSnapTask, SnapState, SplitInit}, Result, }; -pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { - let region_id = region.get_id(); - - let mut state = RegionLocalState::default(); - state.set_region(region); - state.set_tablet_index(RAFT_INIT_LOG_INDEX); - wb.put_region_state(region_id, 0, &state)?; - - let mut apply_state = RaftApplyState::default(); - apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_term(RAFT_INIT_LOG_TERM); - wb.put_apply_state(region_id, 0, &apply_state)?; - - let mut raft_state = RaftLocalState::default(); - raft_state.set_last_index(RAFT_INIT_LOG_INDEX); - raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); - raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - 
wb.put_raft_state(region_id, &raft_state)?; - - Ok(()) -} - /// A storage for raft. /// /// It's similar to `PeerStorage` in v1. @@ -67,6 +40,8 @@ pub struct Storage { snap_state: RefCell, gen_snap_task: RefCell>>, split_init: Option>, + /// The flushed index of all CFs. + apply_trace: ApplyTrace, } impl Debug for Storage { @@ -120,87 +95,20 @@ impl Storage { pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { self.gen_snap_task.borrow_mut() } -} -impl Storage { - /// Creates a new storage with uninit states. - /// - /// This should only be used for creating new peer from raft message. - pub fn uninit( - store_id: u64, - region: Region, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result { - let mut region_state = RegionLocalState::default(); - region_state.set_region(region); - Self::create( - store_id, - region_state, - RaftLocalState::default(), - RaftApplyState::default(), - engine, - read_scheduler, - false, - logger, - ) + #[inline] + pub fn apply_trace_mut(&mut self) -> &mut ApplyTrace { + &mut self.apply_trace } - /// Creates a new storage. - /// - /// All metadata should be initialized before calling this method. If the - /// region is destroyed, `None` will be returned. 
- pub fn new( - region_id: u64, - store_id: u64, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result>> { - let region_state = match engine.get_region_state(region_id, u64::MAX) { - Ok(Some(s)) => s, - res => { - return Err(box_err!( - "failed to get region state for region {}: {:?}", - region_id, - res - )); - } - }; - - if region_state.get_state() == PeerState::Tombstone { - return Ok(None); - } - - let raft_state = match engine.get_raft_state(region_id) { - Ok(Some(s)) => s, - res => { - return Err(box_err!("failed to get raft state: {:?}", res)); - } - }; - - let apply_state = match engine.get_apply_state(region_id, u64::MAX) { - Ok(Some(s)) => s, - res => { - return Err(box_err!("failed to get apply state: {:?}", res)); - } - }; - - Self::create( - store_id, - region_state, - raft_state, - apply_state, - engine, - read_scheduler, - true, - logger, - ) - .map(Some) + #[inline] + pub fn apply_trace(&self) -> &ApplyTrace { + &self.apply_trace } +} - fn create( +impl Storage { + pub(crate) fn create( store_id: u64, region_state: RegionLocalState, raft_state: RaftLocalState, @@ -208,6 +116,7 @@ impl Storage { engine: ER, read_scheduler: Scheduler>, persisted: bool, + apply_trace: ApplyTrace, logger: &Logger, ) -> Result { let peer = find_peer(region_state.get_region(), store_id); @@ -237,6 +146,7 @@ impl Storage { snap_state: RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(Box::new(None)), split_init: None, + apply_trace, }) } @@ -265,6 +175,9 @@ impl Storage { self.entry_storage.apply_state() } + /// Check if the storage is initialized. + /// + /// The storage is considered initialized when data is applied in memory. 
#[inline] pub fn is_initialized(&self) -> bool { self.region_state.get_tablet_index() != 0 @@ -363,7 +276,10 @@ impl raft::Storage for Storage { #[cfg(test)] mod tests { use std::{ - sync::mpsc::{sync_channel, Receiver, SyncSender}, + sync::{ + mpsc::{sync_channel, Receiver, SyncSender}, + Arc, + }, time::Duration, }; @@ -371,9 +287,7 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::TestTabletFactory, }; - use engine_traits::{ - RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, - }; + use engine_traits::{RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS}; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, @@ -381,14 +295,16 @@ mod tests { use raft::{Error as RaftError, StorageError}; use raftstore::store::{ util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, - TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + TabletSnapKey, TabletSnapManager, WriteTask, }; use slog::o; use tempfile::TempDir; use tikv_util::worker::Worker; use super::*; - use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; + use crate::{ + fsm::ApplyResReporter, operation::write_initial_states, raft::Apply, router::ApplyRes, + }; #[derive(Clone)] pub struct TestRouter { @@ -428,35 +344,6 @@ mod tests { region } - #[test] - fn test_write_initial_states() { - let region = new_region(); - let path = TempDir::new().unwrap(); - let engine = engine_test::new_temp_engine(&path); - let raft_engine = &engine.raft; - let mut wb = raft_engine.log_batch(10); - write_initial_states(&mut wb, region.clone()).unwrap(); - assert!(!wb.is_empty()); - raft_engine.consume(&mut wb, true).unwrap(); - - let local_state = raft_engine.get_region_state(4, 0).unwrap().unwrap(); - assert_eq!(local_state.get_state(), PeerState::Normal); - assert_eq!(*local_state.get_region(), region); - assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); - - let raft_state = 
raft_engine.get_raft_state(4).unwrap().unwrap(); - assert_eq!(raft_state.get_last_index(), RAFT_INIT_LOG_INDEX); - let hs = raft_state.get_hard_state(); - assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); - assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); - - let apply_state = raft_engine.get_apply_state(4, u64::MAX).unwrap().unwrap(); - assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); - let ts = apply_state.get_truncated_state(); - assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); - assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); - } - #[test] fn test_apply_snapshot() { let region = new_region(); @@ -474,7 +361,7 @@ mod tests { let ops = DbOptions::default(); let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); - let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); + let reg = TabletRegistry::new(factory, path.path().join("tablets")).unwrap(); let worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); @@ -495,7 +382,7 @@ mod tests { // This index can't be set before load tablet. 
assert_ne!(10, s.entry_storage().applied_index()); assert_ne!(1, s.entry_storage().applied_term()); - assert_ne!(10, s.region_state().get_tablet_index()); + assert_eq!(10, s.region_state().get_tablet_index()); assert!(task.persisted_cb.is_some()); s.on_applied_snapshot(); @@ -521,7 +408,7 @@ mod tests { let ops = DbOptions::default(); let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); - let reg = TabletRegistry::new(factory, path.path().join("tablet")).unwrap(); + let reg = TabletRegistry::new(factory, path.path().join("tablets")).unwrap(); let tablet_ctx = TabletContext::new(®ion, Some(10)); reg.load(tablet_ctx, true).unwrap(); // setup read runner worker and peer storage @@ -544,6 +431,8 @@ mod tests { router, reg, sched, + Arc::default(), + None, logger, ); diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 224723bf4ad..67f2dec6160 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use crate::operation::{AdminCmdResult, CommittedEntries, GenSnapTask}; +use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; #[derive(Debug)] pub enum ApplyTask { @@ -12,5 +12,6 @@ pub enum ApplyTask { pub struct ApplyRes { pub applied_index: u64, pub applied_term: u64, - pub admin_result: Vec, + pub admin_result: Box<[AdminCmdResult]>, + pub modifications: DataTrace, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 447efe8ee1a..b387e729f8d 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -139,6 +139,11 @@ pub enum PeerMsg { ready_number: u64, }, QueryDebugInfo(DebugInfoChannel), + DataFlushed { + cf: &'static str, + tablet_index: u64, + flushed_index: u64, + }, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), @@ -191,6 +196,15 @@ impl fmt::Debug for PeerMsg { PeerMsg::LogsFetched(fetched) => write!(fmt, "LogsFetched {:?}", fetched), PeerMsg::SnapshotGenerated(_) => write!(fmt, "SnapshotGenerated"), PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + } => write!( + fmt, + "DataFlushed cf {}, tablet_index {}, flushed_index {}", + cf, tablet_index, flushed_index + ), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), } diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index 26403f2f0a3..84f1de2803d 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -10,3 +10,4 @@ mod cluster; mod test_basic_write; mod test_bootstrap; +mod test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_trace_apply.rs b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs new file mode 100644 index 
00000000000..15bf39d17ba --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs @@ -0,0 +1,7 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +// TODO: check if it can recover from: +// - split not start +// - split not finish +// - two pending split the second one finished before the first one +// - all split finish diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index b09f351b066..55ad823b99d 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -36,7 +36,7 @@ use raftstore::store::{ use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, StoreSystem, + Bootstrap, StateStorage, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -59,6 +59,13 @@ pub fn check_skip_wal(path: &str) { assert!(found, "no WAL found in {}", path); } +pub fn new_put_request(key: impl Into>, value: impl Into>) -> Request { + let mut req = Request::default(); + req.set_cmd_type(CmdType::Put); + req.mut_put().set_key(key.into()); + req.mut_put().set_value(value.into()); + req +} pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -209,6 +216,7 @@ pub struct RunningState { pub system: StoreSystem, pub cfg: Arc>, pub transport: TestTransport, + snap_mgr: TabletSnapManager, } impl RunningState { @@ -220,21 +228,33 @@ impl RunningState { concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, logger: &Logger, - ) -> (TestRouter, TabletSnapManager, Self) { - let cf_opts = DATA_CFS - .iter() - .copied() - .map(|cf| (cf, CfOptions::default())) - .collect(); - let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); - let registry = TabletRegistry::new(factory, path).unwrap(); + ) -> (TestRouter, Self) { let raft_engine = 
engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); let store_id = bootstrap.bootstrap_store().unwrap(); let mut store = Store::default(); store.set_id(store_id); + + let (router, mut system) = create_store_batch_system::( + &cfg.value(), + store_id, + logger.clone(), + ); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let mut db_opt = DbOptions::default(); + db_opt.set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); + let factory = Box::new(TestTabletFactory::new(db_opt, cf_opts)); + let registry = TabletRegistry::new(factory, path.join("tablets")).unwrap(); if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { let factory = registry.tablet_factory(); let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); @@ -247,12 +267,6 @@ impl RunningState { factory.open_tablet(ctx, &path).unwrap(); } - let (router, mut system) = create_store_batch_system::( - &cfg.value(), - store_id, - logger.clone(), - ); - let router = RaftRouter::new(store_id, registry.clone(), router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); @@ -280,8 +294,9 @@ impl RunningState { system, cfg, transport, + snap_mgr, }; - (TestRouter(router), snap_mgr, state) + (TestRouter(router), state) } } @@ -296,7 +311,6 @@ pub struct TestNode { path: TempDir, running_state: Option, logger: Logger, - snap_mgr: Option, } impl TestNode { @@ -308,12 +322,11 @@ impl TestNode { path, running_state: None, logger, - snap_mgr: None, } } fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, snap_mgr, state) = RunningState::new( + let (router, state) = RunningState::new( &self.pd_client, self.path.path(), cfg, @@ -323,7 +336,6 @@ impl 
TestNode { &self.logger, ); self.running_state = Some(state); - self.snap_mgr = Some(snap_mgr); router } @@ -352,10 +364,6 @@ impl TestNode { self.running_state.as_ref() } - pub fn snap_mgr(&self) -> Option<&TabletSnapManager> { - self.snap_mgr.as_ref() - } - pub fn id(&self) -> u64 { self.running_state().unwrap().store_id } @@ -521,8 +529,8 @@ impl Cluster { msg.get_message().get_snapshot().get_metadata().get_term(), msg.get_message().get_snapshot().get_metadata().get_index(), ); - let from_snap_mgr = self.node(from_offset).snap_mgr().unwrap(); - let to_snap_mgr = self.node(offset).snap_mgr().unwrap(); + let from_snap_mgr = &self.node(from_offset).running_state().unwrap().snap_mgr; + let to_snap_mgr = &self.node(offset).running_state().unwrap().snap_mgr; let gen_path = from_snap_mgr.tablet_gen_path(&key); let recv_path = to_snap_mgr.final_recv_path(&key); assert!(gen_path.exists()); @@ -549,3 +557,12 @@ impl Cluster { } } } + +impl Drop for Cluster { + fn drop(&mut self) { + self.routers.clear(); + for node in &mut self.nodes { + node.stop(); + } + } +} diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index c3061be0d2b..fbf54eaa243 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -15,4 +15,5 @@ mod test_pd_heartbeat; mod test_read; mod test_split; mod test_status; +mod test_trace_apply; mod test_transfer_leader; diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 60495b151e8..df806063249 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -2,6 +2,7 @@ use std::{thread, time::Duration}; +use engine_traits::{RaftEngineReadOnly, CF_RAFT}; use futures::executor::block_on; use kvproto::{ metapb, pdpb, @@ -9,6 +10,7 @@ use kvproto::{ AdminCmdType, AdminRequest, CmdType, 
RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, }, }; +use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; use raftstore_v2::router::PeerMsg; use tikv_util::store::new_peer; @@ -128,6 +130,7 @@ fn split_region( fn test_split() { let mut cluster = Cluster::default(); let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); let router = &mut cluster.routers[0]; // let factory = cluster.node(0).tablet_factory(); @@ -139,6 +142,8 @@ fn test_split() { // Region 2 ["", ""] peer(1, 3) // -> Region 2 ["", "k22"] peer(1, 3) // Region 1000 ["k22", ""] peer(1, 10) + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); let (left, right) = split_region( router, region, @@ -150,6 +155,23 @@ fn test_split() { b"k22", false, ); + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 1 + ); + let region_state0 = raft_engine + .get_region_state(2, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state0); + let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); // Region 2 ["", "k22"] peer(1, 3) // -> Region 2 ["", "k11"] peer(1, 3) @@ -165,10 +187,35 @@ fn test_split() { b"k11", false, ); + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_ne!( + region_state.get_tablet_index(), + region_state0.get_tablet_index() + ); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 + ); + let region_state1 = raft_engine + .get_region_state(2, 
region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state1); + let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); // Region 1000 ["k22", ""] peer(1, 10) // -> Region 1000 ["k22", "k33"] peer(1, 10) // Region 1002 ["k33", ""] peer(1, 12) + let region_state = raft_engine + .get_region_state(1000, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); let _ = split_region( router, right, @@ -180,4 +227,30 @@ fn test_split() { b"k33", false, ); + let region_state = raft_engine + .get_region_state(1000, u64::MAX) + .unwrap() + .unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 + ); + let region_state2 = raft_engine + .get_region_state(1000, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state2); + let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); } + +// TODO: test split race with +// - created peer +// - created peer with pending snapshot +// - created peer with persisting snapshot +// - created peer with persisted snapshot diff --git a/components/raftstore-v2/tests/integrations/test_trace_apply.rs b/components/raftstore-v2/tests/integrations/test_trace_apply.rs new file mode 100644 index 00000000000..def064e6d29 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_trace_apply.rs @@ -0,0 +1,211 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{path::Path, time::Duration}; + +use engine_traits::{DbOptionsExt, MiscExt, Peekable, CF_LOCK, CF_WRITE, DATA_CFS}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::router::PeerMsg; + +use crate::cluster::{new_put_request, Cluster}; + +fn count_file(path: &Path, pat: impl Fn(&Path) -> bool) -> usize { + let mut count = 0; + for path in std::fs::read_dir(path).unwrap() { + if pat(&path.unwrap().path()) { + count += 1; + } + } + count +} + +fn count_sst(path: &Path) -> usize { + count_file(path, |path| { + path.extension().map_or(false, |ext| ext == "sst") + }) +} + +fn count_info_log(path: &Path) -> usize { + count_file(path, |path| { + path.file_name() + .unwrap() + .to_string_lossy() + .starts_with("LOG") + }) +} + +/// Test if data will be recovered correctly after being restarted. +#[test] +fn test_data_recovery() { + let mut cluster = Cluster::default(); + let registry = cluster.node(0).tablet_registry(); + let tablet_2_path = registry.tablet_path(2, RAFT_INIT_LOG_INDEX); + // The rocksdb is a bootstrapped tablet, so it will be opened and closed in + // bootstrap, and then open again in fsm initialization. + assert_eq!(count_info_log(&tablet_2_path), 2); + let router = &mut cluster.routers[0]; + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Write 100 keys to default CF and not flush. + let mut req = router.new_request_for(2); + for i in 0..100 { + let put_req = new_put_request(format!("key{}", i), format!("value{}", i)); + req.clear_requests(); + req.mut_requests().push(put_req); + router + .send(2, PeerMsg::raft_command(req.clone()).0) + .unwrap(); + } + + // Write 100 keys to write CF and flush half. 
+ let mut sub = None; + for i in 0..50 { + let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); + put_req.mut_put().set_cf(CF_WRITE.to_owned()); + req.clear_requests(); + req.mut_requests().push(put_req); + let (ch, s) = PeerMsg::raft_command(req.clone()); + router.send(2, ch).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_WRITE, true).unwrap(); + let router = &mut cluster.routers[0]; + for i in 50..100 { + let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); + put_req.mut_put().set_cf(CF_WRITE.to_owned()); + req.clear_requests(); + req.mut_requests().push(put_req); + router + .send(2, PeerMsg::raft_command(req.clone()).0) + .unwrap(); + } + + // Write 100 keys to lock CF and flush all. + for i in 0..100 { + let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); + put_req.mut_put().set_cf(CF_LOCK.to_owned()); + req.clear_requests(); + req.mut_requests().push(put_req); + let (ch, s) = PeerMsg::raft_command(req.clone()); + router.send(2, ch).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_LOCK, true).unwrap(); + + // Make sure all keys must be written. 
+ let router = &mut cluster.routers[0]; + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + drop((snap, cached)); + + cluster.restart(0); + + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + let router = &mut cluster.routers[0]; + + // Write another key to ensure all data are recovered. + let put_req = new_put_request("key101", "value101"); + req.clear_requests(); + req.mut_requests().push(put_req); + let (msg, sub) = PeerMsg::raft_command(req.clone()); + router.send(2, msg).unwrap(); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // After being restarted, all unflushed logs should be applied again. So there + // should be no missing data. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + + // There is a restart, so LOG file should be rotate. + assert_eq!(count_info_log(&tablet_2_path), 3); + // We only trigger Flush twice, so there should be only 2 files. And because WAL + // is disabled, so when rocksdb is restarted, there should be no WAL to recover, + // so no additional flush will be triggered. 
+ assert_eq!(count_sst(&tablet_2_path), 2); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // Although all CFs are triggered again, but recovery should only write: + // 1. [0, 101) to CF_DEFAULT + // 2. [50, 100) to CF_WRITE + // + // So there will be only 2 memtables to be flushed. + assert_eq!(count_sst(&tablet_2_path), 4); + + drop((snap, cached)); + + cluster.restart(0); + + let router = &mut cluster.routers[0]; + + assert_eq!(count_info_log(&tablet_2_path), 4); + // Because data is flushed before restarted, so all data can be read + // immediately. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + // Trigger flush again. + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // There is no recovery, so there should be nothing to flush. 
+ assert_eq!(count_sst(&tablet_2_path), 4); +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b69b3484e0c..9b13ce6af9b 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -151,6 +151,17 @@ impl ExtraWrite { } } + #[inline] + pub fn merge_v2(&mut self, log_batch: L) { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch); + } else if let ExtraWrite::V1(_) = self { + unreachable!("v1 and v2 are mixed used"); + } else if let ExtraWrite::V2(l) = self { + l.merge(log_batch).unwrap(); + } + } + #[inline] pub fn v2_mut(&mut self) -> Option<&mut L> { if let ExtraWrite::V2(l) = self { diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index 78dbccbf585..d43e33a4e08 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -447,14 +447,14 @@ mod tests { db.put(b"za1", b"").unwrap(); db.put(b"zb1", &value).unwrap(); db.put(b"zc1", &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); db.put(b"zb2", &value).unwrap(); db.put(b"zc2", &value).unwrap(); db.put(b"zc3", &value).unwrap(); db.put(b"zc4", &value).unwrap(); db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); db.compact_range( CF_DEFAULT, None, // start_key None, // end_key diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index fe58a2587a7..ccf5f94e39e 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -460,7 +460,7 @@ mod tests { let db = &engines.kv; for &(ref k, level) in &levels { db.put(&data_key(k), k).unwrap(); - db.flush_cfs(true).unwrap(); + 
db.flush_cfs(&[], true).unwrap(); data.push((k.to_vec(), k.to_vec())); db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) .unwrap(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 065afd8ec0c..8d64ff74c8b 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -2109,7 +2109,7 @@ mod test { let cache = config.storage.block_cache.build_shared_cache(); let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); - let reg = TabletRegistry::new(Box::new(factory), path.path()).unwrap(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); for i in 1..6 { let ctx = TabletContext::with_infinite_region(i, Some(10)); diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 2680c778f02..c21599f47a6 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -5,10 +5,11 @@ use std::{path::Path, sync::Arc}; use engine_rocks::{ raw::{Cache, Env, Statistics}, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, - RocksDbOptions, RocksEngine, RocksEventListener, + RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, }; use engine_traits::{ - CompactionJobInfo, MiscExt, Result, TabletContext, TabletFactory, CF_DEFAULT, CF_WRITE, + CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, + TabletFactory, CF_DEFAULT, CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; @@ -28,6 +29,7 @@ struct FactoryInner { flow_listener: Option, sst_recovery_sender: Option>, statistics: Statistics, + state_storage: Option>, lite: bool, } @@ -48,6 +50,7 @@ impl KvEngineFactoryBuilder { flow_listener: None, sst_recovery_sender: None, statistics: Statistics::new_titan(), + state_storage: None, lite: false, }, compact_event_sender: None, @@ -85,6 +88,13 @@ impl KvEngineFactoryBuilder { self } + /// 
A storage for persisting flush states, which is used for recovering when + /// disable WAL. Only work for v2. + pub fn state_storage(mut self, storage: Arc) -> Self { + self.inner.state_storage = Some(storage); + self + } + pub fn build(self) -> KvEngineFactory { KvEngineFactory { inner: Arc::new(self.inner), @@ -181,6 +191,16 @@ impl TabletFactory for KvEngineFactory { if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); } + if let Some(storage) = &self.inner.state_storage + && let Some(flush_state) = ctx.flush_state { + let listener = PersistenceListener::new( + ctx.id, + ctx.suffix.unwrap(), + flush_state, + storage.clone(), + ); + db_opts.add_event_listener(RocksPersistenceListener::new(listener)); + } let kv_engine = engine_rocks::util::new_engine_opt(path.to_str().unwrap(), db_opts, cf_opts); if let Err(e) = &kv_engine { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 05d5c743d76..9a383d71338 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -75,7 +75,9 @@ use api_version::{ApiV1, ApiV2, KeyMode, KvFormat, RawValue}; use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use collections::HashMap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; -use engine_traits::{raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; +use engine_traits::{ + raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, DATA_CFS_LEN, +}; use futures::prelude::*; use kvproto::{ kvrpcpb::{ @@ -1538,7 +1540,7 @@ impl Storage { [(Some(start_key.as_encoded()), Some(end_key.as_encoded()))], )?; - let mut modifies = Vec::with_capacity(DATA_CFS.len()); + let mut modifies = Vec::with_capacity(DATA_CFS_LEN); for cf in DATA_CFS { modifies.push(Modify::DeleteRange( cf, diff --git a/tests/failpoints/cases/test_table_properties.rs b/tests/failpoints/cases/test_table_properties.rs index 536149d48b5..559ad5b0746 100644 --- 
a/tests/failpoints/cases/test_table_properties.rs +++ b/tests/failpoints/cases/test_table_properties.rs @@ -82,12 +82,12 @@ fn test_check_need_gc() { // TEST 2: props.num_versions as f64 > props.num_rows as f64 * ratio_threshold // return true. do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); do_gc(&raw_engine, 2, &mut gc_runner, &dir); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // Set ratio_threshold, let (props.num_versions as f64 > props.num_rows as // f64 * ratio_threshold) return true @@ -185,7 +185,7 @@ fn test_skip_gc_by_check() { let mut gc_runner = TestGcRunner::new(0); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // The min_mvcc_ts ts > gc safepoint, check_need_gc return false, don't call // dofilter @@ -208,12 +208,12 @@ fn test_skip_gc_by_check() { // TEST 2:When is_bottommost_level = false, // write data to level2 do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); do_gc(&raw_engine, 2, &mut gc_runner, &dir); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // Set ratio_threshold, let (props.num_versions as f64 > props.num_rows as // f64 * ratio_threshold) return false diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 22d23f7adba..67e5e261dab 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -27,7 +27,7 @@ fn check_available(cluster: &mut Cluster) { for i in 0..1000 { let last_available = stats.get_available(); cluster.must_put(format!("k{}", i).as_bytes(), &value); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], 
true).unwrap(); sleep_ms(20); let stats = pd_client.get_store_stats(1).unwrap(); @@ -58,7 +58,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { } let engine = cluster.get_engine(1); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); let last_stats = pd_client.get_store_stats(1).unwrap(); assert_eq!(last_stats.get_region_count(), 1); @@ -67,7 +67,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { let region = pd_client.get_region(b"").unwrap(); cluster.must_split(®ion, b"k2"); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); // wait report region count after split for _ in 0..100 { diff --git a/tests/integrations/raftstore/test_update_region_size.rs b/tests/integrations/raftstore/test_update_region_size.rs index ee4fb79ac62..f2ff0d4f217 100644 --- a/tests/integrations/raftstore/test_update_region_size.rs +++ b/tests/integrations/raftstore/test_update_region_size.rs @@ -9,7 +9,7 @@ use tikv_util::config::*; fn flush(cluster: &mut Cluster) { for engines in cluster.engines.values() { - engines.kv.flush_cfs(true).unwrap(); + engines.kv.flush_cfs(&[], true).unwrap(); } } diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index d1abbcb924c..412f9f5a777 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -211,7 +211,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); // Flush and compact the kvs into L6. 
- engines.kv.flush_cfs(true).unwrap(); + engines.kv.flush_cfs(&[], true).unwrap(); engines.kv.compact_files_in_range(None, None, None).unwrap(); let db = engines.kv.as_inner(); let value = db.get_property_int("rocksdb.num-files-at-level0").unwrap(); @@ -254,9 +254,9 @@ fn test_delete_files_in_range_for_titan() { // Used to trigger titan gc let engine = &engines.kv; engine.put(b"1", b"1").unwrap(); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); engine.put(b"2", b"2").unwrap(); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); engine .compact_files_in_range(Some(b"0"), Some(b"3"), Some(1)) .unwrap(); From 77e4896507501059131e8951523d1050fb464b0f Mon Sep 17 00:00:00 2001 From: fengou1 <85682690+fengou1@users.noreply.github.com> Date: Mon, 19 Dec 2022 11:46:54 +0800 Subject: [PATCH 411/676] br: enlarge the raft client backoff in recovery mode since ebs restore volume very poor during restore (#13954) close tikv/tikv#13955 fix: enlarge the raft client backoff in recovery mode since ebs restore volume very poor during restor Signed-off-by: fengou1 --- components/snap_recovery/src/init_cluster.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index 9147810f03c..42c1d0b1882 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -92,6 +92,12 @@ pub fn enter_snap_recovery_mode(config: &mut TikvConfig) { // disable resolve ts during the recovery config.resolved_ts.enable = false; + // ebs volume has very poor performance during restore, it easy to cause the + // raft client timeout, at the same time clean up all message included + // significant message. 
restore is not memory sensetive, we may keep + // messages as much as possible during the network disturbing in recovery mode + config.server.raft_client_max_backoff = ReadableDuration::secs(20); + // Disable region split during recovering. config.coprocessor.region_max_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); config.coprocessor.region_split_size = ReadableSize::gb(MAX_REGION_SIZE); From a069c1b139622e274710a5ebbdfec539edb37e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 19 Dec 2022 14:24:53 +0800 Subject: [PATCH 412/676] log-backup: make PITR available when using partial cert chain (#13961) close tikv/tikv#13959 This PR sets X509_V_FLAG_PARTIAL_CHAIN, so we would trust the CA even there isn't a root CA provided. Signed-off-by: hillium Co-authored-by: qupeng --- components/backup-stream/src/metadata/store/lazy_etcd.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 88d44b09252..b712a23973d 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -4,6 +4,7 @@ use std::{sync::Arc, time::Duration}; use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; +use openssl::x509::verify::X509VerifyFlags; use tikv_util::{ info, stream::{RetryError, RetryExt}, @@ -33,6 +34,12 @@ impl ConnectionConfig { opts = opts.with_openssl_tls( OpenSslClientConfig::default() .ca_cert_pem(&tls.ca) + // Some of users may prefer using multi-level self-signed certs. + // In this scenario, we must set this flag or openssl would probably complain it cannot found the root CA. + // (Because the flags we provide allows users providing exactly one CA cert.) + // We haven't make it configurable because it is enabled in gRPC by default too. 
+ // TODO: Perhaps implement grpc-io based etcd client, fully remove the difference between gRPC TLS and our custom TLS? + .manually(|c| c.cert_store_mut().set_flags(X509VerifyFlags::PARTIAL_CHAIN)) .client_cert_pem_and_key(&tls.client_cert, &tls.client_key.0), ) } From a0cff586f524df996438287c9c4d7cfc6a762b6c Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 19 Dec 2022 17:02:54 +0800 Subject: [PATCH 413/676] Makefile: check before sort deps (#13951) close tikv/tikv#13950, ref tikv/tikv#13950 cargo sort will somehow modify Cargo.toml even there is nothing to change. This PR avoid cache being invalidated by check first. Signed-off-by: Jay Lee --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f60fb16bcb0..6e8cada8b6f 100644 --- a/Makefile +++ b/Makefile @@ -334,7 +334,7 @@ pre-format: unset-override format: pre-format @cargo fmt - @cargo sort -w >/dev/null + @cargo sort -w -c &>/dev/null || cargo sort -w >/dev/null doc: @cargo doc --workspace --document-private-items \ From 27fa5bb0876a07bc9850343f036189feb7b9d978 Mon Sep 17 00:00:00 2001 From: Hu# Date: Mon, 19 Dec 2022 17:20:54 +0800 Subject: [PATCH 414/676] Skip write `prewrite_lock` in flashback locks (#13960) close tikv/tikv#13958, ref tikv/tikv#13958 Since the rollback ts for flashback are derived from the ts of the lock, we wrote prewrite lock which start_ts is flashback in Prewrite Phase. So the Prewrite lock we wrote in the flashback was rollbacked when we retry prepare. This introduces the case mentioned in https://github.com/tikv/tikv/issues/13958 The solution is: if such a lock exists, skip it and go to the Commit Phase. 
Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- src/storage/mod.rs | 68 +++++++++++++++++++ .../txn/actions/flashback_to_version.rs | 28 ++++---- .../flashback_to_version_read_phase.rs | 1 + 3 files changed, 82 insertions(+), 15 deletions(-) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 9a383d71338..60e9b965c5d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -5202,6 +5202,74 @@ mod tests { ); } + #[test] + fn test_mvcc_flashback_retry_prepare() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let (tx, rx) = channel(); + let mut ts = TimeStamp::zero(); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(Key::from_raw(b"k"), b"v@1".to_vec())], + b"k".to_vec(), + *ts.incr(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k")], + ts, + *ts.incr(), + Context::default(), + ), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + b"v@1".to_vec(), + block_on(storage.get(Context::default(), Key::from_raw(b"k"), ts)) + .unwrap() + .0, + ); + // Try to prepare flashback first. + let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + storage + .sched_txn_command( + new_flashback_rollback_lock_cmd( + flashback_start_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Some(Key::from_raw(b"z")), + Context::default(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + // Mock the prepare flashback retry. 
+ run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Some(Key::from_raw(b"z")), + ); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k"), flashback_commit_ts)) + .unwrap() + .0, + ); + } + #[test] fn test_high_priority_get_put() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 2710935efb1..f44854159c0 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -14,11 +14,13 @@ pub fn flashback_to_version_read_lock( reader: &mut MvccReader, next_lock_key: Key, end_key: Option<&Key>, + flashback_start_ts: TimeStamp, ) -> TxnResult> { let result = reader.scan_locks( Some(&next_lock_key), end_key, - |_| true, + // Skip the `prewrite_lock`. This lock will appear when retrying prepare + |lock| lock.ts != flashback_start_ts, FLASHBACK_BATCH_SIZE, ); let (key_locks, _) = result?; @@ -147,6 +149,9 @@ pub fn prewrite_flashback_key( flashback_version: TimeStamp, flashback_start_ts: TimeStamp, ) -> TxnResult<()> { + if reader.load_lock(key_to_lock)?.is_some() { + return Ok(()); + } let old_write = reader.get_write(key_to_lock, flashback_version, None)?; // Flashback the value in `CF_DEFAULT` as well if the old write is a // `WriteType::Put` without the short value. 
@@ -310,15 +315,17 @@ pub mod tests { key: &[u8], start_ts: impl Into, ) -> usize { + let start_ts = start_ts.into(); let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &ctx); let key_locks = - flashback_to_version_read_lock(&mut reader, key, Some(next_key).as_ref()).unwrap(); + flashback_to_version_read_lock(&mut reader, key, Some(next_key).as_ref(), start_ts) + .unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); - let mut txn = MvccTxn::new(start_ts.into(), cm); + let mut txn = MvccTxn::new(start_ts, cm); rollback_locks(&mut txn, snapshot, key_locks).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); @@ -578,20 +585,11 @@ pub mod tests { 2 ); // Retry Prepare - // Unlock `k`, put rollback record and delete the value of `k`. - assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 3); - // Lock and write the value of `k`. - assert_eq!( - must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), - 2 - ); - // Retry Prepare - // Only unlock `k` since there is an overlapped rollback record. - assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 1); - // Only lock `k` since the value of `k` has already existed. + // Skip `k` no need to write again. 
+ assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); assert_eq!( must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), - 1 + 0 ); } diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 769171d46e0..7fdc86288c2 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -137,6 +137,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { &mut reader, next_lock_key, self.end_key.as_ref(), + self.start_ts, )?; if key_locks.is_empty() { // - No more locks to rollback, continue to the Prewrite Phase. From 545ffb82db0475bd77d288207793720a2e71fd93 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 19 Dec 2022 18:32:54 +0800 Subject: [PATCH 415/676] *: impl raftkv2 snapshot and write (#13957) ref tikv/tikv#12842 This PR implements snapshot and write for raftkv2. Write are refactored to only use simple codec to reduce allocation and parse overhead, it also makes code simpler. Snapshot are refactored to return future instead of async function. Otherwise it can't `Engine` requirement. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + Cargo.toml | 1 + components/raftstore-v2/src/fsm/peer.rs | 21 +- components/raftstore-v2/src/fsm/store.rs | 4 + components/raftstore-v2/src/lib.rs | 3 +- .../src/operation/command/admin/mod.rs | 12 +- .../command/admin/transfer_leader.rs | 34 +- .../raftstore-v2/src/operation/command/mod.rs | 58 +--- .../src/operation/command/write/mod.rs | 44 ++- .../operation/command/write/simple_write.rs | 294 ++++++++--------- components/raftstore-v2/src/operation/mod.rs | 3 +- .../raftstore-v2/src/operation/query/local.rs | 98 +++--- .../raftstore-v2/src/operation/query/mod.rs | 10 +- .../raftstore-v2/src/operation/ready/mod.rs | 37 ++- components/raftstore-v2/src/raft/peer.rs | 8 +- components/raftstore-v2/src/router/imp.rs | 8 +- components/raftstore-v2/src/router/message.rs | 107 +++--- components/raftstore-v2/src/router/mod.rs | 4 +- .../src/router/response_channel.rs | 270 +++++++++++++-- components/raftstore-v2/src/worker/pd/mod.rs | 2 +- .../tests/failpoints/test_basic_write.rs | 31 +- .../tests/integrations/cluster.rs | 28 +- .../tests/integrations/test_basic_write.rs | 83 ++--- .../tests/integrations/test_conf_change.rs | 24 +- .../tests/integrations/test_read.rs | 16 +- .../tests/integrations/test_split.rs | 27 +- .../tests/integrations/test_trace_apply.rs | 68 ++-- .../integrations/test_transfer_leader.rs | 21 +- components/raftstore/src/store/fsm/apply.rs | 4 +- components/raftstore/src/store/fsm/peer.rs | 8 +- components/raftstore/src/store/peer.rs | 6 +- components/raftstore/src/store/util.rs | 101 +++--- components/raftstore/src/store/worker/read.rs | 8 +- src/server/mod.rs | 1 + src/server/raftkv/mod.rs | 2 +- src/server/raftkv2/mod.rs | 307 ++++++++++++++++++ 36 files changed, 1145 insertions(+), 609 deletions(-) create mode 100644 src/server/raftkv2/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 11aa05f2140..97f540aa100 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6306,6 +6306,7 @@ 
dependencies = [ "raft", "raft_log_engine", "raftstore", + "raftstore-v2", "rand 0.7.3", "regex", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index e09b422299e..4c8af61e554 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -136,6 +136,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" resource_metering = { workspace = true } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 793e7a340f2..20e7a8f3c2b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -228,9 +228,20 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.on_receive_command(cmd.send_time); self.on_query(cmd.request, cmd.ch) } - PeerMsg::RaftCommand(cmd) => { + PeerMsg::AdminCommand(cmd) => { self.on_receive_command(cmd.send_time); - self.on_command(cmd.request, cmd.ch) + self.fsm + .peer_mut() + .on_admin_command(self.store_ctx, cmd.request, cmd.ch) + } + PeerMsg::SimpleWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm.peer_mut().on_simple_write( + self.store_ctx, + write.header, + write.data, + write.ch, + ); } PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), @@ -263,6 +274,12 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_data_flushed(cf, tablet_index, flushed_index); } + PeerMsg::PeerUnreachable { to_peer_id } => { + self.fsm.peer_mut().on_peer_unreachable(to_peer_id) + } + PeerMsg::StoreUnreachable { to_store_id } => { + self.fsm.peer_mut().on_store_unreachable(to_store_id) + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => 
self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 349d5ad3252..bd31de69496 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -157,6 +157,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), StoreMsg::SplitInit(msg) => self.fsm.store.on_split_init(self.store_ctx, msg), + StoreMsg::StoreUnreachable { to_store_id } => self + .fsm + .store + .on_store_unreachable(self.store_ctx, to_store_id), } } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 9ddb577ab5c..848e5fda8b2 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -25,6 +25,7 @@ #![feature(array_windows)] #![feature(div_duration)] #![feature(box_into_inner)] +#![feature(assert_matches)] mod batch; mod bootstrap; @@ -38,5 +39,5 @@ pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; -pub use operation::StateStorage; +pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{Error, Result}; diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index d07c1b4a35c..fcb968a2195 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -37,7 +37,16 @@ impl Peer { apply::notify_req_region_removed(self.region_id(), ch); return; } - if let Err(e) = self.validate_command(&req, &mut ctx.raft_metrics) { + if !req.has_admin_request() { + let e = box_err!("{:?} expect only execute admin command", self.logger.list()); + let resp = 
cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + let cmd_type = req.get_admin_request().get_cmd_type(); + if let Err(e) = + self.validate_command(req.get_header(), Some(cmd_type), &mut ctx.raft_metrics) + { let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -57,7 +66,6 @@ impl Peer { ch.report_error(resp); return; } - let cmd_type = req.get_admin_request().get_cmd_type(); if let Some(conflict) = self.proposal_control_mut().check_conflict(Some(cmd_type)) { conflict.delay_channel(ch); return; diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index e8105a66322..1c25b363d59 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -9,7 +9,7 @@ use kvproto::{ disk_usage::DiskUsage, metapb, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, CmdType, PutRequest, RaftCmdRequest, Request, + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, RaftRequestHeader, TransferLeaderRequest, }, }; @@ -30,6 +30,7 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, fsm::ApplyResReporter, + operation::command::write::SimpleWriteEncoder, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick}, }; @@ -199,7 +200,7 @@ impl Peer { cmd.mut_admin_request() .set_cmd_type(AdminCmdType::TransferLeader); cmd.mut_admin_request().mut_transfer_leader().set_peer(from); - if let PeerMsg::RaftCommand(req) = PeerMsg::raft_command(cmd).0 { + if let PeerMsg::AdminCommand(req) = PeerMsg::admin_command(cmd).0 { self.on_admin_command(ctx, req.request, req.ch); } else { unreachable!(); @@ -345,7 +346,8 @@ impl Peer { } // FIXME: Raft command has size limit. Either limit the total size of // pessimistic locks in a region, or split commands here. 
- let mut cmd = RaftCmdRequest::default(); + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; { // Downgrade to a read guard, do not block readers in the scheduler as far as // possible. @@ -355,33 +357,27 @@ impl Peer { if *deleted { continue; } - let mut put = PutRequest::default(); - put.set_cf(CF_LOCK.to_string()); - put.set_key(key.as_encoded().to_owned()); - put.set_value(lock.to_lock().to_bytes()); - let mut req = Request::default(); - req.set_cmd_type(CmdType::Put); - req.set_put(put); - cmd.mut_requests().push(req); + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); } } - if cmd.get_requests().is_empty() { + if lock_count == 0 { // If the map is not empty but all locks are deleted, it is possible that a // write command has just marked locks deleted but not proposed yet. // It might cause that command to fail if we skip proposing the // extra TransferLeader command here. return true; } - cmd.mut_header().set_region_id(self.region_id()); - cmd.mut_header() - .set_region_epoch(self.region().get_region_epoch().clone()); - cmd.mut_header().set_peer(self.peer().clone()); + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); info!( self.logger, - "propose {} locks before transferring leader", cmd.get_requests().len(); + "propose {} locks before transferring leader", lock_count; ); - let PeerMsg::RaftCommand(req) = PeerMsg::raft_command(cmd).0 else {unreachable!()}; - self.on_write_command(ctx, req.request, req.ch); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); true } } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 8d55beca636..f6ac6ac7077 100644 --- 
a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -36,12 +36,11 @@ use raftstore::{ }, Error, Result, }; -use slog::error; use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, - fsm::{ApplyFsm, ApplyResReporter, PeerFsmDelegate}, + fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, router::{ApplyRes, ApplyTask, CmdResChannel}, }; @@ -52,7 +51,9 @@ mod write; pub use admin::{AdminCmdResult, SplitInit, SplitResult, SPLIT_PREFIX}; pub use control::ProposalControl; -pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use write::{ + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, +}; use self::write::SimpleWrite; @@ -86,23 +87,6 @@ fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { resp } -impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { - #[inline] - pub fn on_command(&mut self, req: RaftCmdRequest, ch: CmdResChannel) { - if !req.get_requests().is_empty() { - self.fsm - .peer_mut() - .on_write_command(self.store_ctx, req, ch) - } else if req.has_admin_request() { - self.fsm - .peer_mut() - .on_admin_command(self.store_ctx, req, ch) - } else if req.has_status_request() { - error!(self.fsm.logger(), "status command should be sent by Query"); - } - } -} - impl Peer { /// Schedule an apply fsm to apply logs in the background. 
/// @@ -134,17 +118,17 @@ impl Peer { } #[inline] - fn validate_command(&self, req: &RaftCmdRequest, metrics: &mut RaftMetrics) -> Result<()> { - if let Err(e) = util::check_store_id(req, self.peer().get_store_id()) { + fn validate_command( + &self, + header: &RaftRequestHeader, + admin_type: Option, + metrics: &mut RaftMetrics, + ) -> Result<()> { + if let Err(e) = util::check_store_id(header, self.peer().get_store_id()) { metrics.invalid_proposal.mismatch_store_id.inc(); return Err(e); } - for r in req.get_requests() { - if let CmdType::Get | CmdType::Snap | CmdType::ReadIndex = r.get_cmd_type() { - return Err(box_err!("internal error: query can't be sent as command")); - } - } - if let Err(e) = util::check_peer_id(req, self.peer().get_id()) { + if let Err(e) = util::check_peer_id(header, self.peer().get_id()) { metrics.invalid_proposal.mismatch_peer_id.inc(); return Err(e); } @@ -152,11 +136,11 @@ impl Peer { metrics.invalid_proposal.not_leader.inc(); return Err(Error::NotLeader(self.region_id(), self.leader())); } - if let Err(e) = util::check_term(req, self.term()) { + if let Err(e) = util::check_term(header, self.term()) { metrics.invalid_proposal.stale_command.inc(); return Err(e); } - if let Err(mut e) = util::check_region_epoch(req, self.region(), true) { + if let Err(mut e) = util::check_region_epoch(header, admin_type, self.region(), true) { if let Error::EpochNotMatch(_, _new_regions) = &mut e { // TODO: query sibling regions. 
metrics.invalid_proposal.epoch_not_match.inc(); @@ -166,16 +150,6 @@ impl Peer { Ok(()) } - #[inline] - fn propose_command( - &mut self, - ctx: &mut StoreContext, - req: RaftCmdRequest, - ) -> Result { - let data = req.write_to_bytes().unwrap(); - self.propose(ctx, data) - } - #[inline] fn propose( &mut self, @@ -379,7 +353,7 @@ impl Apply { let mut conf_change = None; let log_index = entry.get_index(); let req = match entry.get_entry_type() { - EntryType::EntryNormal => match SimpleWriteDecoder::new( + EntryType::EntryNormal => match SimpleWriteReqDecoder::new( &self.logger, entry.get_data(), log_index, @@ -435,7 +409,7 @@ impl Apply { } }; - util::check_region_epoch(&req, self.region_state().get_region(), true)?; + util::check_req_region_epoch(&req, self.region_state().get_region(), true)?; if req.has_admin_request() { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index c4cc1646963..6ea6064a002 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{KvEngine, Mutable, RaftEngine, CF_DEFAULT}; -use kvproto::raft_cmdpb::RaftCmdRequest; +use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ cmd_resp, @@ -21,16 +21,19 @@ use crate::{ mod simple_write; -pub use simple_write::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use simple_write::{ + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, +}; pub use self::simple_write::SimpleWrite; impl Peer { #[inline] - pub fn on_write_command( + pub fn on_simple_write( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + header: Box, + data: SimpleWriteBinary, ch: CmdResChannel, ) { if !self.serving() { @@ -38,16 +41,13 @@ impl Peer { return; } if let Some(encoder) = self.simple_write_encoder_mut() { - match encoder.amend(req) { - Ok(()) => { - encoder.add_response_channel(ch); - self.set_has_ready(); - return; - } - Err(r) => req = r, + if encoder.amend(&header, &data) { + encoder.add_response_channel(ch); + self.set_has_ready(); + return; } } - if let Err(e) = self.validate_command(&req, &mut ctx.raft_metrics) { + if let Err(e) = self.validate_command(&header, None, &mut ctx.raft_metrics) { let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -60,21 +60,15 @@ impl Peer { } // ProposalControl is reliable only when applied to current term. 
let call_proposed_on_success = self.applied_to_current_term(); - match SimpleWriteEncoder::new( - req, + let mut encoder = SimpleWriteReqEncoder::new( + header, + data, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, call_proposed_on_success, - ) { - Ok(mut encoder) => { - encoder.add_response_channel(ch); - self.set_has_ready(); - self.simple_write_encoder_mut().replace(encoder); - } - Err(req) => { - let res = self.propose_command(ctx, req); - self.post_propose_command(ctx, res, vec![ch], call_proposed_on_success); - } - } + ); + encoder.add_response_channel(ch); + self.set_has_ready(); + self.simple_write_encoder_mut().replace(encoder); } pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index c4cb9d6bc89..57c01fca9d8 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; -use protobuf::{CodedInputStream, Message, SingularPtrField}; +use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; +use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; @@ -13,83 +13,62 @@ use crate::{operation::command::parse_at, router::CmdResChannel}; // TODO: use protobuf blob request seems better. const MAGIC_PREFIX: u8 = 0x00; +#[derive(Clone, Debug)] +#[repr(transparent)] +pub struct SimpleWriteBinary { + buf: Box<[u8]>, +} + /// We usually use `RaftCmdRequest` for read write request. But the codec is /// not efficient enough for simple request. `SimpleWrite` is introduce to make /// codec alloc less and fast. 
#[derive(Debug)] -pub struct SimpleWriteEncoder { - header: SingularPtrField, +pub struct SimpleWriteReqEncoder { + header: Box, buf: Vec, channels: Vec, size_limit: usize, notify_proposed: bool, } -impl SimpleWriteEncoder { - /// Create an encoder. +impl SimpleWriteReqEncoder { + /// Create a request encoder. /// /// If `notify_proposed` is true, channels will be called `notify_proposed` /// when it's appended. pub fn new( - req: RaftCmdRequest, + header: Box, + bin: SimpleWriteBinary, size_limit: usize, notify_proposed: bool, - ) -> Result { - if !Self::allow_request(&req) { - return Err(req); - } - + ) -> SimpleWriteReqEncoder { let mut buf = Vec::with_capacity(256); buf.push(MAGIC_PREFIX); - req.get_header() - .write_length_delimited_to_vec(&mut buf) - .unwrap(); + header.write_length_delimited_to_vec(&mut buf).unwrap(); + buf.extend_from_slice(&bin.buf); - for r in req.get_requests() { - encode(r, &mut buf); - } - Ok(SimpleWriteEncoder { - header: req.header, + SimpleWriteReqEncoder { + header, buf, channels: vec![], size_limit, notify_proposed, - }) - } - - fn allow_request(req: &RaftCmdRequest) -> bool { - if !req.has_status_request() && !req.has_admin_request() { - // TODO: skip the check and make caller use `SimpleWrite` directly. - for r in req.get_requests() { - if r.get_cmd_type() != CmdType::Put - && r.get_cmd_type() != CmdType::Delete - && r.get_cmd_type() != CmdType::DeleteRange - { - return false; - } - } - } else { - return false; - }; - true + } } + /// Encode the simple write into the buffer dispite header check. + /// + /// Return false if the buffer limit is reached or the write can be amended. #[inline] - pub fn amend(&mut self, req: RaftCmdRequest) -> Result<(), RaftCmdRequest> { - if Self::allow_request(&req) && req.header == self.header { - let last_length = self.buf.len(); - for r in req.get_requests() { - encode(r, &mut self.buf); - } - // The default size limit is 8 * 0.4 = 3.2MiB. 
- if self.buf.len() < self.size_limit { - Ok(()) - } else { - self.buf.truncate(last_length); - Err(req) - } + pub fn amend(&mut self, header: &RaftRequestHeader, bin: &SimpleWriteBinary) -> bool { + if *self.header != *header { + return false; + } + if self.buf.len() + bin.buf.len() < self.size_limit { + self.buf.extend_from_slice(&bin.buf); + true } else { - Err(req) + false } } @@ -118,9 +97,7 @@ impl SimpleWriteEncoder { #[inline] pub fn header(&self) -> &RaftRequestHeader { - self.header - .as_ref() - .unwrap_or_else(|| RaftRequestHeader::default_instance()) + &self.header } } @@ -152,19 +129,63 @@ pub enum SimpleWrite<'a> { DeleteRange(DeleteRange<'a>), } +#[derive(Clone)] +pub struct SimpleWriteEncoder { + buf: Vec, +} + +impl SimpleWriteEncoder { + #[inline] + pub fn with_capacity(cap: usize) -> SimpleWriteEncoder { + SimpleWriteEncoder { + buf: Vec::with_capacity(cap), + } + } + + #[inline] + pub fn put(&mut self, cf: &str, key: &[u8], value: &[u8]) { + encode(SimpleWrite::Put(Put { cf, key, value }), &mut self.buf); + } + + #[inline] + pub fn delete(&mut self, cf: &str, key: &[u8]) { + encode(SimpleWrite::Delete(Delete { cf, key }), &mut self.buf); + } + + #[inline] + pub fn delete_range(&mut self, cf: &str, start_key: &[u8], end_key: &[u8], notify_only: bool) { + encode( + SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only, + }), + &mut self.buf, + ); + } + + #[inline] + pub fn encode(self) -> SimpleWriteBinary { + SimpleWriteBinary { + buf: self.buf.into_boxed_slice(), + } + } +} + #[derive(Debug)] -pub struct SimpleWriteDecoder<'a> { +pub struct SimpleWriteReqDecoder<'a> { header: RaftRequestHeader, buf: &'a [u8], } -impl<'a> SimpleWriteDecoder<'a> { +impl<'a> SimpleWriteReqDecoder<'a> { pub fn new( logger: &Logger, buf: &'a [u8], index: u64, term: u64, - ) -> Result, RaftCmdRequest> { + ) -> Result, RaftCmdRequest> { match buf.first().cloned() { Some(MAGIC_PREFIX) => { let mut is = 
CodedInputStream::from_bytes(&buf[1..]); @@ -179,7 +200,7 @@ impl<'a> SimpleWriteDecoder<'a> { ), }; let read = is.pos(); - Ok(SimpleWriteDecoder { + Ok(SimpleWriteReqDecoder { header, buf: &buf[1 + read as usize..], }) @@ -194,7 +215,7 @@ impl<'a> SimpleWriteDecoder<'a> { } } -impl<'a> Iterator for SimpleWriteDecoder<'a> { +impl<'a> Iterator for SimpleWriteReqDecoder<'a> { type Item = SimpleWrite<'a>; #[inline] @@ -310,37 +331,27 @@ fn decode_cf(buf: &[u8]) -> (&str, &[u8]) { } } -// TODO: we need a way to verify every field is encoded. -#[inline] -fn encode(req: &Request, buf: &mut Vec) { - match req.get_cmd_type() { - CmdType::Put => { +#[inline(always)] +fn encode(simple_write: SimpleWrite<'_>, buf: &mut Vec) { + match simple_write { + SimpleWrite::Put(put) => { buf.push(PUT_TAG); - let put_req = req.get_put(); - encode_cf(put_req.get_cf(), buf); - encode_bytes(put_req.get_key(), buf); - encode_bytes(put_req.get_value(), buf); + encode_cf(put.cf, buf); + encode_bytes(put.key, buf); + encode_bytes(put.value, buf); } - CmdType::Delete => { + SimpleWrite::Delete(delete) => { buf.push(DELETE_TAG); - let delete_req = req.get_delete(); - encode_cf(delete_req.get_cf(), buf); - encode_bytes(delete_req.get_key(), buf); + encode_cf(delete.cf, buf); + encode_bytes(delete.key, buf); } - CmdType::DeleteRange => { + SimpleWrite::DeleteRange(dr) => { buf.push(DELETE_RANGE_TAG); - let delete_range_req = req.get_delete_range(); - encode_cf(delete_range_req.get_cf(), buf); - encode_bytes(delete_range_req.get_start_key(), buf); - encode_bytes(delete_range_req.get_end_key(), buf); - buf.push(delete_range_req.get_notify_only() as u8); + encode_cf(dr.cf, buf); + encode_bytes(dr.start_key, buf); + encode_bytes(dr.end_key, buf); + buf.push(dr.notify_only as u8); } - CmdType::Invalid - | CmdType::Get - | CmdType::Snap - | CmdType::Prewrite - | CmdType::IngestSst - | CmdType::ReadIndex => unreachable!("not supported type should be filtered already"), } } @@ -380,57 +391,32 @@ fn 
decode<'a>(buf: &mut &'a [u8]) -> Option> { #[cfg(test)] mod tests { + use kvproto::raft_cmdpb::{CmdType, Request}; use slog::o; use super::*; #[test] fn test_codec() { - let mut cmd = RaftCmdRequest::default(); - cmd.mut_header().set_term(2); - - let mut req = Request::default(); - req.set_cmd_type(CmdType::Put); - let put_req = req.mut_put(); - put_req.set_cf(CF_DEFAULT.to_string()); - put_req.set_key(b"key".to_vec()); - put_req.set_value(b"".to_vec()); - cmd.mut_requests().push(req); - - req = Request::default(); - req.set_cmd_type(CmdType::Delete); - let delete_req = req.mut_delete(); + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", b""); let delete_key = vec![0; 1024]; - delete_req.set_cf(CF_WRITE.to_string()); - delete_req.set_key(delete_key.clone()); - cmd.mut_requests().push(req); - - let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX, false).unwrap(); - cmd.clear_requests(); - - req = Request::default(); - req.set_cmd_type(CmdType::DeleteRange); - let delete_range_req = req.mut_delete_range(); - delete_range_req.set_cf(CF_LOCK.to_string()); - delete_range_req.set_start_key(b"key".to_vec()); - delete_range_req.set_end_key(b"key".to_vec()); - delete_range_req.set_notify_only(true); - cmd.mut_requests().push(req); - - req = Request::default(); - req.set_cmd_type(CmdType::DeleteRange); - let delete_range_req = req.mut_delete_range(); - delete_range_req.set_cf("cf".to_string()); - delete_range_req.set_start_key(b"key".to_vec()); - delete_range_req.set_end_key(b"key".to_vec()); - delete_range_req.set_notify_only(false); - cmd.mut_requests().push(req); - - encoder.amend(cmd.clone()).unwrap(); - let (bytes, _) = encoder.encode(); + encoder.delete(CF_WRITE, &delete_key); + let bin = encoder.encode(); + + let mut header = Box::::default(); + header.set_term(2); + let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin, usize::MAX, false); + + let mut encoder = 
SimpleWriteEncoder::with_capacity(512); + encoder.delete_range(CF_LOCK, b"key", b"key", true); + encoder.delete_range("cf", b"key", b"key", false); + req_encoder.amend(&header, &encoder.encode()); + + let (bytes, _) = req_encoder.encode(); let logger = slog_global::borrow_global().new(o!()); - let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); - assert_eq!(decoder.header(), cmd.get_header()); + let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); let SimpleWrite::Put(put) = write else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); @@ -488,38 +474,40 @@ mod tests { #[test] fn test_invalid() { - let mut invalid_cmd = RaftCmdRequest::default(); - invalid_cmd.mut_header().set_term(2); + let mut raft_cmd = RaftCmdRequest::default(); + raft_cmd.mut_header().set_term(2); let mut req = Request::default(); req.set_cmd_type(CmdType::Invalid); - invalid_cmd.mut_requests().push(req); - let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX, false).unwrap_err(); - let bytes = fallback.write_to_bytes().unwrap(); + raft_cmd.mut_requests().push(req); + let bytes = raft_cmd.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); - let decoded = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); - assert_eq!(decoded, invalid_cmd); + let decoded = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); + // SimpleWriteReqDecoder should be able to decode naive RaftCmdRequest. 
+ assert_eq!(decoded, raft_cmd); - let mut valid_cmd = RaftCmdRequest::default(); - valid_cmd.mut_header().set_term(3); - let mut req = Request::default(); - req.set_cmd_type(CmdType::Put); - let put_req = req.mut_put(); - put_req.set_cf(CF_DEFAULT.to_string()); - put_req.set_key(b"key".to_vec()); - put_req.set_value(b"".to_vec()); - valid_cmd.mut_requests().push(req); - let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX, false).unwrap(); - // Only simple write command can be batched. - encoder.amend(invalid_cmd.clone()).unwrap_err(); - let mut valid_cmd2 = valid_cmd.clone(); - valid_cmd2.mut_header().set_term(4); + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", b""); + let bin = encoder.encode(); + + let mut header = Box::::default(); + header.set_term(2); + let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin.clone(), 512, false); + + let mut header2 = Box::::default(); + header2.set_term(4); // Only simple write command with same header can be batched. - encoder.amend(valid_cmd2).unwrap_err(); + assert!(!req_encoder.amend(&header2, &bin)); + + // Batch should not excceed max size limit. 
+ let large_value = vec![0; 512]; + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", &large_value); + assert!(!req_encoder.amend(&header, &encoder.encode())); - let (bytes, _) = encoder.encode(); - let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); - assert_eq!(decoder.header(), valid_cmd.get_header()); + let (bytes, _) = req_encoder.encode(); + let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); let SimpleWrite::Put(put) = req else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index beb47f9a08f..5e6971b3346 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,7 +7,8 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteDecoder, SimpleWriteEncoder, + AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteBinary, SimpleWriteEncoder, + SimpleWriteReqDecoder, SimpleWriteReqEncoder, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index d24a4b9d899..1878ead40c2 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -9,6 +9,7 @@ use std::{ use batch_system::Router; use crossbeam::channel::TrySendError; use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use futures::Future; use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, @@ -22,7 +23,7 @@ use raftstore::{ Error, Result, }; use slog::{debug, Logger}; -use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now}; +use tikv_util::{box_err, 
codec::number::decode_u64, time::monotonic_raw_now, Either}; use time::Timespec; use txn_types::WriteBatchFlags; @@ -32,7 +33,7 @@ use crate::{ StoreRouter, }; -pub trait MsgRouter: Send { +pub trait MsgRouter: Clone + Send { fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError>; } @@ -103,9 +104,9 @@ where fn try_get_snapshot( &mut self, - req: RaftCmdRequest, + req: &RaftCmdRequest, ) -> std::result::Result>, RaftCmdResponse> { - match self.pre_propose_raft_command(&req) { + match self.pre_propose_raft_command(req) { Ok(Some((mut delegate, policy))) => match policy { RequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); @@ -121,7 +122,7 @@ where TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); // Try renew lease in advance - self.maybe_renew_lease_in_advance(&delegate, &req, snapshot_ts); + self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); Ok(Some(snap)) } RequestPolicy::StaleRead => { @@ -156,63 +157,84 @@ where } } - pub async fn snapshot( + pub fn snapshot( &mut self, mut req: RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { + ) -> impl Future, RaftCmdResponse>> + Send + { let region_id = req.header.get_ref().region_id; - if let Some(snap) = self.try_get_snapshot(req.clone())? { - return Ok(snap); - } + let res = match self.try_get_snapshot(&req) { + res @ (Ok(Some(_)) | Err(_)) => Either::Left(res), + Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), + }; - if let Some(query_res) = self.try_to_renew_lease(region_id, &req).await? { - // If query successful, try again. - if query_res.read().is_some() { - req.mut_header().set_read_quorum(false); - if let Some(snap) = self.try_get_snapshot(req)? { - return Ok(snap); + async move { + match res { + Either::Left(Ok(Some(snap))) => return Ok(snap), + Either::Left(Err(e)) => return Err(e), + Either::Right((fut, mut reader)) => { + if let Some(query_res) = fut.await? 
+ && query_res.read().is_some() + { + // If query successful, try again. + req.mut_header().set_read_quorum(false); + if let Some(snap) = reader.try_get_snapshot(&req)? { + return Ok(snap); + } + } } + Either::Left(Ok(None)) => unreachable!(), } - } - let mut err = errorpb::Error::default(); - err.set_message(format!( - "Fail to get snapshot from LocalReader for region {}. \ - Maybe due to `not leader`, `region not found` or `not applied to the current term`", - region_id - )); - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + let mut err = errorpb::Error::default(); + err.set_message(format!( + "Fail to get snapshot from LocalReader for region {}. \ + Maybe due to `not leader`, `region not found` or `not applied to the current term`", + region_id + )); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + Err(resp) + } } // try to renew the lease by sending read query where the reading process may // renew the lease - async fn try_to_renew_lease( + fn try_to_renew_lease( &self, region_id: u64, req: &RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { + ) -> impl Future, RaftCmdResponse>> { let (msg, sub) = PeerMsg::raft_query(req.clone()); - let mut err = errorpb::Error::default(); - match MsgRouter::send(&self.router, region_id, msg) { - Ok(()) => return Ok(sub.result().await), + let res = match MsgRouter::send(&self.router, region_id, msg) { + Ok(()) => Ok(sub), Err(TrySendError::Full(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + let mut err = errorpb::Error::default(); err.set_message(RAFTSTORE_IS_BUSY.to_owned()); err.mut_server_is_busy() .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + Err(err) } Err(TrySendError::Disconnected(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + let mut err = errorpb::Error::default(); err.set_message(format!("region {} is missing", region_id)); 
err.mut_region_not_found().set_region_id(region_id); + Err(err) } - } + }; - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + async move { + match res { + Ok(sub) => Ok(sub.result().await), + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e); + Err(resp) + } + } + } } // If the remote lease will be expired in near future send message @@ -449,6 +471,7 @@ mod tests { use super::*; use crate::router::{QueryResult, ReadResponse}; + #[derive(Clone)] struct MockRouter { p_router: SyncSender<(u64, PeerMsg)>, } @@ -681,10 +704,11 @@ mod tests { )) .unwrap(); block_on(reader.snapshot(cmd.clone())).unwrap(); - // Updating lease makes cache miss. + // Updating lease makes cache miss. And because the cache is updated on cloned + // copy, so the old cache will still need to be updated again. assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), - 4 + 5 ); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index ea66719314c..eb58dcbbc23 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -128,7 +128,7 @@ impl Peer { } // Check store_id, make sure that the msg is dispatched to the right place. - if let Err(e) = util::check_store_id(msg, self.peer().get_store_id()) { + if let Err(e) = util::check_store_id(msg.get_header(), self.peer().get_store_id()) { raft_metrics.invalid_proposal.mismatch_store_id.inc(); return Err(e); } @@ -158,7 +158,7 @@ impl Peer { } // peer_id must be the same as peer's. 
- if let Err(e) = util::check_peer_id(msg, self.peer_id()) { + if let Err(e) = util::check_peer_id(msg.get_header(), self.peer_id()) { raft_metrics.invalid_proposal.mismatch_peer_id.inc(); return Err(e); } @@ -166,13 +166,13 @@ impl Peer { // TODO: check applying snapshot // Check whether the term is stale. - if let Err(e) = util::check_term(msg, self.term()) { + if let Err(e) = util::check_term(msg.get_header(), self.term()) { raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } // TODO: add check of sibling region for split - util::check_region_epoch(msg, self.region(), true) + util::check_req_region_epoch(msg, self.region(), true) } // For these cases it won't be proposed: @@ -340,7 +340,7 @@ impl Peer { } fn query_status(&mut self, req: &RaftCmdRequest, resp: &mut RaftCmdResponse) -> Result<()> { - util::check_store_id(req, self.peer().get_store_id())?; + util::check_store_id(req.get_header(), self.peer().get_store_id())?; let cmd_type = req.get_status_request().get_cmd_type(); let status_resp = resp.mut_status_response(); status_resp.set_cmd_type(cmd_type); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index a6df9049285..f9a6c3a34d4 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,7 +31,10 @@ use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; -use tikv_util::time::{duration_to_sec, monotonic_raw_now}; +use tikv_util::{ + store::find_peer, + time::{duration_to_sec, monotonic_raw_now}, +}; pub use self::{ async_writer::AsyncWriter, @@ -39,11 +42,25 @@ pub use self::{ }; use crate::{ batch::StoreContext, - fsm::PeerFsmDelegate, + fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, - router::{ApplyTask, PeerTick}, + router::{ApplyTask, 
PeerMsg, PeerTick}, }; +impl Store { + pub fn on_store_unreachable( + &mut self, + ctx: &mut StoreContext, + to_store_id: u64, + ) where + EK: KvEngine, + ER: RaftEngine, + { + ctx.router + .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); + } +} + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { /// Raft relies on periodic ticks to keep the state machine sync with other /// peers. @@ -61,6 +78,20 @@ impl Peer { self.raft_group_mut().tick() } + pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { + if self.is_leader() { + self.raft_group_mut().report_unreachable(to_peer_id); + } + } + + pub fn on_store_unreachable(&mut self, to_store_id: u64) { + if self.is_leader() { + if let Some(peer_id) = find_peer(self.region(), to_store_id).map(|p| p.get_id()) { + self.raft_group_mut().report_unreachable(peer_id); + } + } + } + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 82e9b6011ca..500b166065f 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -25,7 +25,7 @@ use super::storage::Storage; use crate::{ batch::StoreContext, fsm::ApplyScheduler, - operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, + operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder}, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, }; @@ -49,7 +49,7 @@ pub struct Peer { /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
- raw_write_encoder: Option, + raw_write_encoder: Option, proposals: ProposalQueue>, apply_scheduler: Option, @@ -502,12 +502,12 @@ impl Peer { } #[inline] - pub fn simple_write_encoder_mut(&mut self) -> &mut Option { + pub fn simple_write_encoder_mut(&mut self) -> &mut Option { &mut self.raw_write_encoder } #[inline] - pub fn simple_write_encoder(&self) -> &Option { + pub fn simple_write_encoder(&self) -> &Option { &self.raw_write_encoder } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 7208a6b5bef..e838cefb743 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Mutex}; use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, @@ -78,10 +79,11 @@ impl RaftRouter { self.router.send_raft_message(msg) } - pub async fn get_snapshot( + pub fn snapshot( &mut self, req: RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { - self.local_reader.snapshot(req).await + ) -> impl Future, RaftCmdResponse>> + Send + { + self.local_reader.snapshot(req) } } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index b387e729f8d..d5635574978 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -1,9 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use std::fmt; -use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_serverpb::RaftMessage, +}; use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; use tikv_util::time::Instant; @@ -13,7 +15,7 @@ use super::{ }, ApplyRes, }; -use crate::operation::SplitInit; +use crate::operation::{SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -91,6 +93,7 @@ impl StoreTick { } /// Command that can be handled by raftstore. +#[derive(Debug)] pub struct RaftRequest { pub send_time: Instant, pub request: RaftCmdRequest, @@ -107,7 +110,16 @@ impl RaftRequest { } } +#[derive(Debug)] +pub struct SimpleWrite { + pub send_time: Instant, + pub header: Box, + pub data: SimpleWriteBinary, + pub ch: CmdResChannel, +} + /// Message that can be sent to a peer. +#[derive(Debug)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target @@ -118,7 +130,9 @@ pub enum PeerMsg { RaftQuery(RaftRequest), /// Command changes the inernal states. It will be transformed into logs and /// applied on all replicas. - RaftCommand(RaftRequest), + SimpleWrite(SimpleWrite), + /// Command that contains admin requests. + AdminCommand(RaftRequest), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), @@ -144,6 +158,12 @@ pub enum PeerMsg { tablet_index: u64, flushed_index: u64, }, + PeerUnreachable { + to_peer_id: u64, + }, + StoreUnreachable { + to_store_id: u64, + }, /// A message that used to check if a flush is happened. 
#[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), @@ -155,76 +175,33 @@ impl PeerMsg { (PeerMsg::RaftQuery(RaftRequest::new(req, ch)), sub) } - pub fn raft_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { + pub fn admin_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); - (PeerMsg::RaftCommand(RaftRequest::new(req, ch)), sub) + (PeerMsg::AdminCommand(RaftRequest::new(req, ch)), sub) } -} -impl fmt::Debug for PeerMsg { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), - PeerMsg::RaftQuery(_) => write!(fmt, "Raft Query"), - PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), - PeerMsg::Tick(tick) => write! { - fmt, - "{:?}", - tick - }, - PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), - PeerMsg::Start => write!(fmt, "Startup"), - PeerMsg::SplitInit(_) => { - write!(fmt, "Split initialization") - } - PeerMsg::SplitInitFinish(region_id) => { - write!( - fmt, - "Split initialization finished from region {}", - region_id - ) - } - PeerMsg::Noop => write!(fmt, "Noop"), - PeerMsg::Persisted { - peer_id, - ready_number, - } => write!( - fmt, - "Persisted peer_id {}, ready_number {}", - peer_id, ready_number - ), - PeerMsg::LogsFetched(fetched) => write!(fmt, "LogsFetched {:?}", fetched), - PeerMsg::SnapshotGenerated(_) => write!(fmt, "SnapshotGenerated"), - PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), - PeerMsg::DataFlushed { - cf, - tablet_index, - flushed_index, - } => write!( - fmt, - "DataFlushed cf {}, tablet_index {}, flushed_index {}", - cf, tablet_index, flushed_index - ), - #[cfg(feature = "testexport")] - PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), - } + pub fn simple_write( + header: Box, + data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::SimpleWrite(SimpleWrite { + send_time: Instant::now(), + 
header, + data, + ch, + }), + sub, + ) } } +#[derive(Debug)] pub enum StoreMsg { RaftMessage(Box), SplitInit(Box), Tick(StoreTick), Start, -} - -impl fmt::Debug for StoreMsg { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), - StoreMsg::SplitInit(_) => write!(fmt, "Split initialization"), - StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), - StoreMsg::Start => write!(fmt, "Start store"), - } - } + StoreUnreachable { to_store_id: u64 }, } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index a09b0593b80..d6846f61e4b 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -15,7 +15,7 @@ pub use self::{ internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ - CmdResChannel, DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, - ReadResponse, + CmdResChannel, CmdResChannelBuilder, CmdResEvent, CmdResStream, CmdResSubscriber, + DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, }, }; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 423c9e8e326..01c1565ec62 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -24,7 +24,7 @@ use std::{ task::{Context, Poll}, }; -use futures::task::AtomicWaker; +use futures::{task::AtomicWaker, FutureExt, Stream}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, @@ -47,7 +47,11 @@ struct EventCore { /// Event 0 and Event 31 is reserved as payload and cancel respectively. /// Other events should be defined within [1, 30]. 
event: AtomicU64, + /// Even a channel supports multiple events, it's not necessary to trigger + /// all of them. `event_mask` is used to filter unnecessary events. + event_mask: u32, res: UnsafeCell>, + before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. waker: AtomicWaker, } @@ -57,6 +61,10 @@ unsafe impl Send for EventCore {} const PAYLOAD_EVENT: u64 = 0; const CANCEL_EVENT: u64 = 31; +const fn event_mask_bit_of(event: u64) -> u32 { + 1 << event +} + #[inline] const fn subscribed_bit_of(event: u64) -> u64 { 1 << (event * 2) @@ -67,23 +75,14 @@ const fn fired_bit_of(event: u64) -> u64 { 1 << (event * 2 + 1) } -impl Default for EventCore { - #[inline] - fn default() -> Self { - Self { - event: AtomicU64::new(0), - res: UnsafeCell::new(None), - waker: AtomicWaker::new(), - } - } -} - impl EventCore { #[inline] fn notify_event(&self, event: u64) { - let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); - if previous & subscribed_bit_of(event) != 0 { - self.waker.wake() + if self.event_mask & event_mask_bit_of(event) != 0 { + let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); + if previous & subscribed_bit_of(event) != 0 { + self.waker.wake() + } } } @@ -91,8 +90,11 @@ impl EventCore { /// /// After this call, no events should be notified. 
#[inline] - fn set_result(&self, result: Res) { + fn set_result(&self, mut result: Res) { unsafe { + if let Some(cb) = (*self.before_set.get()).take() { + cb(&mut result); + } *self.res.get() = Some(result); } let previous = self.event.fetch_or( @@ -173,7 +175,7 @@ impl<'a, Res> Future for WaitEvent<'a, Res> { } struct WaitResult<'a, Res> { - core: &'a EventCore, + sub: &'a BaseSubscriber, } impl<'a, Res> Future for WaitResult<'a, Res> { @@ -181,16 +183,16 @@ impl<'a, Res> Future for WaitResult<'a, Res> { #[inline] fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let event = &self.core.event; + let event = &self.sub.core.event; let fired_bit = fired_bit_of(PAYLOAD_EVENT); let mut e = event.load(Ordering::Relaxed); if check_bit(e, fired_bit).is_some() { unsafe { - return Poll::Ready((*self.core.res.get()).take()); + return Poll::Ready((*self.sub.core.res.get()).take()); } } let subscribed_bit = subscribed_bit_of(PAYLOAD_EVENT); - self.core.waker.register(cx.waker()); + self.sub.core.waker.register(cx.waker()); loop { match event.compare_exchange_weak( e, @@ -203,7 +205,7 @@ impl<'a, Res> Future for WaitResult<'a, Res> { }; if check_bit(e, fired_bit).is_some() { unsafe { - return Poll::Ready((*self.core.res.get()).take()); + return Poll::Ready((*self.sub.core.res.get()).take()); } } } @@ -219,7 +221,7 @@ impl BaseSubscriber { /// Wait for the result. #[inline] pub async fn result(self) -> Option { - WaitResult { core: &self.core }.await + WaitResult { sub: &self }.await } /// Test if the result is ready without any polling. @@ -242,7 +244,17 @@ impl BaseChannel { /// Creates a pair of channel and subscriber. 
#[inline] pub fn pair() -> (Self, BaseSubscriber) { - let core: Arc> = Arc::default(); + Self::with_mask(u32::MAX) + } + + fn with_mask(mask: u32) -> (Self, BaseSubscriber) { + let core: Arc> = Arc::new(EventCore { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + event_mask: mask, + before_set: UnsafeCell::new(None), + waker: AtomicWaker::new(), + }); (Self { core: core.clone() }, BaseSubscriber { core }) } @@ -283,6 +295,122 @@ impl CmdResSubscriber { } } +#[derive(Clone, Copy, Debug)] +enum CmdResPollStage { + ExpectProposed, + ExpectCommitted, + ExpectResult, + Drained, +} + +impl CmdResPollStage { + #[inline] + fn init(event_mask: u32) -> CmdResPollStage { + if event_mask & event_mask_bit_of(CmdResChannel::PROPOSED_EVENT) != 0 { + CmdResPollStage::ExpectProposed + } else if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) != 0 { + CmdResPollStage::ExpectCommitted + } else { + CmdResPollStage::ExpectResult + } + } + + #[inline] + fn next(&mut self, event_mask: u32) { + *self = match self { + CmdResPollStage::ExpectProposed => { + if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) == 0 { + CmdResPollStage::ExpectResult + } else { + CmdResPollStage::ExpectCommitted + } + } + CmdResPollStage::ExpectCommitted => CmdResPollStage::ExpectResult, + CmdResPollStage::ExpectResult => CmdResPollStage::Drained, + CmdResPollStage::Drained => CmdResPollStage::Drained, + } + } +} + +#[derive(Debug)] +pub enum CmdResEvent { + Proposed, + Committed, + Finished(RaftCmdResponse), +} + +pub struct CmdResStream { + sub: CmdResSubscriber, + stage: CmdResPollStage, +} + +impl CmdResStream { + #[inline] + pub fn new(sub: CmdResSubscriber) -> Self { + Self { + stage: CmdResPollStage::init(sub.core.event_mask), + sub, + } + } +} + +impl Stream for CmdResStream { + type Item = CmdResEvent; + + #[inline] + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.get_mut(); + loop { + match stream.stage { + 
CmdResPollStage::ExpectProposed => { + match (WaitEvent { + event: CmdResChannel::PROPOSED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Proposed)); + } + } + } + } + CmdResPollStage::ExpectCommitted => { + match (WaitEvent { + event: CmdResChannel::COMMITTED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Committed)); + } + } + } + } + CmdResPollStage::ExpectResult => { + match (WaitResult { sub: &stream.sub }).poll_unpin(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(res) => { + stream.stage.next(stream.sub.core.event_mask); + if let Some(res) = res { + return Poll::Ready(Some(CmdResEvent::Finished(res))); + } + } + } + } + CmdResPollStage::Drained => return Poll::Ready(None), + } + } + } +} + pub type CmdResChannel = BaseChannel; impl Debug for CmdResChannel { @@ -291,6 +419,46 @@ impl Debug for CmdResChannel { } } +#[derive(Default)] +pub struct CmdResChannelBuilder { + event_mask: u32, + before_set: Option>, +} + +impl CmdResChannelBuilder { + #[inline] + pub fn subscribe_proposed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::PROPOSED_EVENT); + self + } + + #[inline] + pub fn subscribe_committed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::COMMITTED_EVENT); + self + } + + #[inline] + pub fn before_set( + &mut self, + f: impl FnOnce(&mut RaftCmdResponse) + Send + 'static, + ) -> &mut Self { + self.before_set = Some(Box::new(f)); + self + } + + #[inline] + pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { + let (c, s) = CmdResChannel::with_mask(self.event_mask); + if let Some(f) = self.before_set { + unsafe { + 
*c.core.before_set.get() = Some(f); + } + } + (c, s) + } +} + impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; @@ -424,14 +592,28 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +impl Debug for DebugInfoChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "DebugInfoChannel") + } +} + #[cfg(feature = "testexport")] pub type FlushChannel = BaseChannel<()>; #[cfg(feature = "testexport")] pub type FlushSubscriber = BaseSubscriber<()>; +impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } +} + #[cfg(test)] mod tests { - use futures::executor::block_on; + use std::assert_matches::assert_matches; + + use futures::{executor::block_on, StreamExt}; use super::*; @@ -482,4 +664,44 @@ mod tests { chan.set_result(read.clone()); assert_eq!(block_on(sub.result()).unwrap(), read); } + + #[test] + fn test_cmd_res_stream() { + let mut builder = CmdResChannelBuilder::default(); + builder.before_set(|res| { + res.mut_header().set_current_term(6); + }); + let (chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.set_result(RaftCmdResponse::default()); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Finished(res)) if res.get_header().get_current_term() == 6); + + // When using builder, no event is subscribed by default. 
+ let (mut chan, sub) = CmdResChannelBuilder::default().build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_proposed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Proposed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_committed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Committed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + } } diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 80e12dc53c7..15bb2e73ff8 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -228,7 +228,7 @@ mod requests { req.mut_header().set_peer(peer); req.set_admin_request(request); - let (msg, _) = PeerMsg::raft_command(req); + let (msg, _) = PeerMsg::admin_command(req); if let Err(e) = router.send(region_id, msg) { error!( logger, diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs index b20984a9837..55d85b90fa4 100644 --- a/components/raftstore-v2/tests/failpoints/test_basic_write.rs +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -2,10 +2,9 @@ use std::{assert_matches::assert_matches, time::Duration}; -use engine_traits::Peekable; +use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; -use kvproto::raft_cmdpb::{CmdType, Request}; -use 
raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use crate::cluster::Cluster; @@ -14,27 +13,25 @@ use crate::cluster::Cluster; fn test_write_batch_rollback() { let mut cluster = Cluster::default(); let router = &mut cluster.routers[0]; - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req.clone()); + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); // Make several entries to batch in apply thread. fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); // Good proposal should be committed. - let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub0.wait_proposed())); assert!(block_on(sub0.wait_committed())); // If the write batch is correctly initialized, next write should not contain // last result. - req.mut_requests()[0].mut_put().set_key(b"key1".to_vec()); - let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key1", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub1.wait_proposed())); assert!(block_on(sub1.wait_committed())); @@ -63,16 +60,18 @@ fn test_write_batch_rollback() { fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); // Trigger error again, so an initialized write batch should be rolled back. 
- req.mut_requests()[0].mut_put().set_key(b"key2".to_vec()); - let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key2", b"value"); + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub0.wait_proposed())); assert!(block_on(sub0.wait_committed())); // If the write batch is correctly rollbacked, next write should not contain // last result. - req.mut_requests()[0].mut_put().set_key(b"key3".to_vec()); - let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key3", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header, put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub1.wait_proposed())); assert!(block_on(sub1.wait_committed())); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 55ad823b99d..732afb38f98 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -24,7 +24,7 @@ use engine_traits::{TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{ metapb::{self, RegionEpoch, Store}, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, Request}, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request}, raft_serverpb::RaftMessage, }; use pd_client::RpcClient; @@ -36,7 +36,7 @@ use raftstore::store::{ use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, StateStorage, StoreSystem, + Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -59,13 +59,6 @@ pub fn check_skip_wal(path: &str) { assert!(found, "no WAL found in {}", path); } -pub fn new_put_request(key: impl 
Into>, value: impl Into>) -> Request { - let mut req = Request::default(); - req.set_cmd_type(CmdType::Put); - req.mut_put().set_key(key.into()); - req.mut_put().set_value(value.into()); - req -} pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -104,8 +97,19 @@ impl TestRouter { None } - pub fn command(&self, region_id: u64, req: RaftCmdRequest) -> Option { - let (msg, sub) = PeerMsg::raft_command(req); + pub fn simple_write( + &self, + region_id: u64, + header: Box, + write: SimpleWriteEncoder, + ) -> Option { + let (msg, sub) = PeerMsg::simple_write(header, write.encode()); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn admin_command(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::admin_command(req); self.send(region_id, msg).unwrap(); block_on(sub.result()) } @@ -179,7 +183,7 @@ impl TestRouter { let mut snap_req = Request::default(); snap_req.set_cmd_type(CmdType::Snap); req.mut_requests().push(snap_req); - block_on(self.get_snapshot(req)).unwrap() + block_on(self.snapshot(req)).unwrap() } pub fn region_detail(&self, region_id: u64) -> metapb::Region { diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index 29f665758d6..cb8d71840cf 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -2,14 +2,11 @@ use std::{assert_matches::assert_matches, time::Duration}; -use engine_traits::Peekable; +use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; -use kvproto::{ - raft_cmdpb::{CmdType, Request}, - raft_serverpb::RaftMessage, -}; +use kvproto::raft_serverpb::RaftMessage; use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; use 
crate::cluster::{check_skip_wal, Cluster}; @@ -19,17 +16,14 @@ use crate::cluster::{check_skip_wal, Cluster}; fn test_basic_write() { let cluster = Cluster::default(); let router = &cluster.routers[0]; - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); // Good proposal should be committed. - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); @@ -37,9 +31,9 @@ fn test_basic_write() { assert!(!resp.get_header().has_error(), "{:?}", resp); // Store id should be checked. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_peer(new_peer(3, 3)); - let resp = router.command(2, invalid_req.clone()).unwrap(); + let mut invalid_header = header.clone(); + invalid_header.set_peer(new_peer(3, 3)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_store_not_match(), "{:?}", @@ -47,36 +41,27 @@ fn test_basic_write() { ); // Peer id should be checked. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_peer(new_peer(1, 1)); - let resp = router.command(2, invalid_req.clone()).unwrap(); + invalid_header = header.clone(); + invalid_header.set_peer(new_peer(1, 1)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!(resp.get_header().has_error(), "{:?}", resp); // Epoch should be checked. 
- let mut invalid_req = req.clone(); - invalid_req - .mut_header() + invalid_header = header.clone(); + invalid_header .mut_region_epoch() .set_version(INIT_EPOCH_VER - 1); - let resp = router.command(2, invalid_req.clone()).unwrap(); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_epoch_not_match(), "{:?}", resp ); - // It's wrong to send query to write command. - let mut invalid_req = req.clone(); - let mut snap_req = Request::default(); - snap_req.set_cmd_type(CmdType::Snap); - invalid_req.mut_requests().push(snap_req); - let resp = router.command(2, invalid_req.clone()).unwrap(); - assert!(resp.get_header().has_error(), "{:?}", resp); - // Term should be checked if set. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_term(1); - let resp = router.command(2, invalid_req).unwrap(); + invalid_header = header.clone(); + invalid_header.set_term(1); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_stale_command(), "{:?}", @@ -84,11 +69,9 @@ fn test_basic_write() { ); // Too large message can cause regression and should be rejected. 
- let mut invalid_req = req.clone(); - invalid_req.mut_requests()[0] - .mut_put() - .set_value(vec![0; 8 * 1024 * 1024]); - let resp = router.command(2, invalid_req).unwrap(); + let mut invalid_put = SimpleWriteEncoder::with_capacity(9 * 1024 * 1024); + invalid_put.put(CF_DEFAULT, b"key", &vec![0; 8 * 1024 * 1024]); + let resp = router.simple_write(2, header.clone(), invalid_put).unwrap(); assert!( resp.get_header().get_error().has_raft_entry_too_large(), "{:?}", @@ -106,7 +89,7 @@ fn test_basic_write() { raft_message.set_from(4); raft_message.set_term(8); router.send_raft_message(msg).unwrap(); - let resp = router.command(2, req).unwrap(); + let resp = router.simple_write(2, header, put).unwrap(); assert!(resp.get_header().get_error().has_not_leader(), "{:?}", resp); } @@ -114,18 +97,15 @@ fn test_basic_write() { fn test_put_delete() { let mut cluster = Cluster::default(); let router = &mut cluster.routers[0]; - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); let snap = router.stale_snapshot(2); assert!(snap.get_value(b"key").unwrap().is_none()); - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); @@ -134,12 +114,9 @@ fn test_put_delete() { let snap = router.stale_snapshot(2); assert_eq!(snap.get_value(b"key").unwrap().unwrap(), b"value"); - let mut delete_req = Request::default(); - delete_req.set_cmd_type(CmdType::Delete); - 
delete_req.mut_delete().set_key(b"key".to_vec()); - req.clear_requests(); - req.mut_requests().push(delete_req); - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let mut delete = SimpleWriteEncoder::with_capacity(64); + delete.delete(CF_DEFAULT, b"key"); + let (msg, mut sub) = PeerMsg::simple_write(header, delete.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index db62ae4a75a..8a075bb9a35 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,10 +2,13 @@ use std::{self, time::Duration}; -use engine_traits::Peekable; -use kvproto::raft_cmdpb::{AdminCmdType, CmdType, Request}; +use engine_traits::{Peekable, CF_DEFAULT}; +use kvproto::raft_cmdpb::AdminCmdType; use raft::prelude::ConfChangeType; -use raftstore_v2::router::{PeerMsg, PeerTick}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; use tikv_util::store::new_learner_peer; use crate::cluster::{check_skip_wal, Cluster}; @@ -23,7 +26,7 @@ fn test_simple_change() { let store_id = cluster.node(1).id(); let new_peer = new_learner_peer(store_id, 10); admin_req.mut_change_peer().set_peer(new_peer.clone()); - let resp = cluster.routers[0].command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; @@ -57,13 +60,10 @@ fn test_simple_change() { // write one kv after snapshot let (key, val) = (b"key", b"value"); - let mut write_req = cluster.routers[0].new_request_for(region_id); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - 
put_req.mut_put().set_key(key.to_vec()); - put_req.mut_put().set_value(val.to_vec()); - write_req.mut_requests().push(put_req); - let (msg, _) = PeerMsg::raft_command(write_req.clone()); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); cluster.routers[0].send(region_id, msg).unwrap(); std::thread::sleep(Duration::from_millis(1000)); cluster.dispatch(region_id, vec![]); @@ -84,7 +84,7 @@ fn test_simple_change() { req.mut_admin_request() .mut_change_peer() .set_change_type(ConfChangeType::RemoveNode); - let resp = cluster.routers[0].command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 07ae8b44bf3..f9575ff8da1 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -1,8 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_traits::CF_DEFAULT; use futures::executor::block_on; use kvproto::raft_cmdpb::{CmdType, Request}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::{config::ReadableDuration, store::new_peer}; use txn_types::WriteBatchFlags; @@ -39,14 +40,11 @@ fn test_read_index() { std::thread::sleep(std::time::Duration::from_millis(200)); let read_req = req.clone(); // the read lease should be expired and renewed by write - let mut req = router.new_request_for(region_id); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); - let (msg, sub) = PeerMsg::raft_command(req.clone()); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); block_on(sub.result()).unwrap(); @@ -172,7 +170,7 @@ fn test_local_read() { request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); - block_on(async { router.get_snapshot(req.clone()).await.unwrap() }); + block_on(async { router.snapshot(req.clone()).await.unwrap() }); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); // The read index will be 0 as the retry process in the `get_snapshot` will diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index df806063249..d5bc784857e 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -2,16 +2,14 @@ use std::{thread, time::Duration}; -use engine_traits::{RaftEngineReadOnly, CF_RAFT}; +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT, CF_RAFT}; use 
futures::executor::block_on; use kvproto::{ metapb, pdpb, - raft_cmdpb::{ - AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, - }, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, }; use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; use crate::cluster::{Cluster, TestRouter}; @@ -37,7 +35,7 @@ fn new_batch_split_region_request( } fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { - let (msg, sub) = PeerMsg::raft_command(req); + let (msg, sub) = PeerMsg::admin_command(req); router.send(region_id, msg).unwrap(); block_on(sub.result()).unwrap(); @@ -47,19 +45,10 @@ fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { } fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { - let mut req = router.new_request_for(region_id); - - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(key.to_vec()); - put_req.mut_put().set_value(b"v1".to_vec()); - req.mut_requests().push(put_req); - - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); - router.send(region_id, msg).unwrap(); - assert!(block_on(sub.wait_proposed())); - assert!(block_on(sub.wait_committed())); - block_on(sub.result()).unwrap() + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, b"v1"); + router.simple_write(region_id, header, put).unwrap() } // Split the region according to the parameters diff --git a/components/raftstore-v2/tests/integrations/test_trace_apply.rs b/components/raftstore-v2/tests/integrations/test_trace_apply.rs index def064e6d29..71682ff52a4 100644 --- a/components/raftstore-v2/tests/integrations/test_trace_apply.rs +++ 
b/components/raftstore-v2/tests/integrations/test_trace_apply.rs @@ -2,12 +2,12 @@ use std::{path::Path, time::Duration}; -use engine_traits::{DbOptionsExt, MiscExt, Peekable, CF_LOCK, CF_WRITE, DATA_CFS}; +use engine_traits::{DbOptionsExt, MiscExt, Peekable, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; use futures::executor::block_on; use raftstore::store::RAFT_INIT_LOG_INDEX; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; -use crate::cluster::{new_put_request, Cluster}; +use crate::cluster::Cluster; fn count_file(path: &Path, pat: impl Fn(&Path) -> bool) -> usize { let mut count = 0; @@ -47,25 +47,30 @@ fn test_data_recovery() { router.wait_applied_to_current_term(2, Duration::from_secs(3)); // Write 100 keys to default CF and not flush. - let mut req = router.new_request_for(2); + let header = Box::new(router.new_request_for(2).take_header()); for i in 0..100 { - let put_req = new_put_request(format!("key{}", i), format!("value{}", i)); - req.clear_requests(); - req.mut_requests().push(put_req); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_DEFAULT, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); router - .send(2, PeerMsg::raft_command(req.clone()).0) + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) .unwrap(); } // Write 100 keys to write CF and flush half. 
let mut sub = None; for i in 0..50 { - let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); - put_req.mut_put().set_cf(CF_WRITE.to_owned()); - req.clear_requests(); - req.mut_requests().push(put_req); - let (ch, s) = PeerMsg::raft_command(req.clone()); - router.send(2, ch).unwrap(); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); sub = Some(s); } let resp = block_on(sub.take().unwrap().result()).unwrap(); @@ -75,23 +80,27 @@ fn test_data_recovery() { cached.latest().unwrap().flush_cf(CF_WRITE, true).unwrap(); let router = &mut cluster.routers[0]; for i in 50..100 { - let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); - put_req.mut_put().set_cf(CF_WRITE.to_owned()); - req.clear_requests(); - req.mut_requests().push(put_req); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); router - .send(2, PeerMsg::raft_command(req.clone()).0) + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) .unwrap(); } // Write 100 keys to lock CF and flush all. 
for i in 0..100 { - let mut put_req = new_put_request(format!("key{}", i), format!("value{}", i)); - put_req.mut_put().set_cf(CF_LOCK.to_owned()); - req.clear_requests(); - req.mut_requests().push(put_req); - let (ch, s) = PeerMsg::raft_command(req.clone()); - router.send(2, ch).unwrap(); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_LOCK, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); sub = Some(s); } let resp = block_on(sub.take().unwrap().result()).unwrap(); @@ -137,12 +146,9 @@ fn test_data_recovery() { let router = &mut cluster.routers[0]; // Write another key to ensure all data are recovered. - let put_req = new_put_request("key101", "value101"); - req.clear_requests(); - req.mut_requests().push(put_req); - let (msg, sub) = PeerMsg::raft_command(req.clone()); - router.send(2, msg).unwrap(); - let resp = block_on(sub.result()).unwrap(); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key101", b"value101"); + let resp = router.simple_write(2, header, put).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); // After being restarted, all unflushed logs should be applied again. 
So there diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs index 7096f06b1d2..d031d6b1eba 100644 --- a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -2,14 +2,14 @@ use std::time::Duration; -use engine_traits::Peekable; +use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; use kvproto::{ metapb, - raft_cmdpb::{AdminCmdType, CmdType, Request, TransferLeaderRequest}, + raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, }; use raft::prelude::ConfChangeType; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; use crate::cluster::Cluster; @@ -22,12 +22,6 @@ fn put_data( key: &[u8], ) { let router = &cluster.routers[node_off]; - let mut req = router.new_request_for(region_id); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(key[1..].to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); @@ -41,7 +35,10 @@ fn put_data( .clone(); assert!(tablet.get_value(key).unwrap().is_none()); - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, &key[1..], b"value"); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); std::thread::sleep(std::time::Duration::from_millis(10)); cluster.dispatch(region_id, vec![]); @@ -84,7 +81,7 @@ pub fn must_transfer_leader( let admin_req = req.mut_admin_request(); admin_req.set_cmd_type(AdminCmdType::TransferLeader); admin_req.set_transfer_leader(transfer_req); - let resp = 
router.command(region_id, req).unwrap(); + let resp = router.admin_command(region_id, req).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); cluster.dispatch(region_id, vec![]); @@ -114,7 +111,7 @@ fn test_transfer_leader() { let peer1 = new_peer(store_id, 10); admin_req.mut_change_peer().set_peer(peer1.clone()); let req_clone = req.clone(); - let resp = router0.command(region_id, req_clone).unwrap(); + let resp = router0.admin_command(region_id, req_clone).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index a3d0bdb2712..affa0205e8f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -90,7 +90,7 @@ use crate::{ peer::Peer, peer_storage::{write_initial_apply_state, write_peer_state}, util::{ - self, admin_cmd_epoch_lookup, check_flashback_state, check_region_epoch, + self, admin_cmd_epoch_lookup, check_flashback_state, check_req_region_epoch, compare_region_epoch, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, }, Config, RegionSnapshot, RegionTask, WriteCallback, @@ -1587,7 +1587,7 @@ where // Include region for epoch not match after merge may cause key not in range. 
let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; - check_region_epoch(req, &self.region, include_region)?; + check_req_region_epoch(req, &self.region, include_region)?; check_flashback_state( self.region.get_is_in_flashback(), req, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 47c9357e1c4..311e7e58a12 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -4985,7 +4985,7 @@ where msg: &RaftCmdRequest, ) -> Result> { // Check store_id, make sure that the msg is dispatched to the right place. - if let Err(e) = util::check_store_id(msg, self.store_id()) { + if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { self.ctx .raft_metrics .invalid_proposal @@ -5004,7 +5004,7 @@ where let request = msg.get_requests(); // peer_id must be the same as peer's. - if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { + if let Err(e) = util::check_peer_id(msg.get_header(), self.fsm.peer.peer_id()) { self.ctx .raft_metrics .invalid_proposal @@ -5084,12 +5084,12 @@ where ))); } // Check whether the term is stale. - if let Err(e) = util::check_term(msg, self.fsm.peer.term()) { + if let Err(e) = util::check_term(msg.get_header(), self.fsm.peer.term()) { self.ctx.raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } - match util::check_region_epoch(msg, self.fsm.peer.region(), true) { + match util::check_req_region_epoch(msg, self.fsm.peer.region(), true) { Err(Error::EpochNotMatch(m, mut new_regions)) => { // Attach the region which might be split from the current region. But it // doesn't matter if the region is not split from the current region. 
If the diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 22b822c2115..86d16b07506 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -76,8 +76,8 @@ use super::{ read_queue::{ReadIndexQueue, ReadIndexRequest}, transport::Transport, util::{ - self, check_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, - Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, + self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, + ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, DestroyPeerJob, LocalReadContext, }; @@ -4708,7 +4708,7 @@ where ) -> ReadResponse { let region = self.region().clone(); if check_epoch { - if let Err(e) = check_region_epoch(&req, ®ion, true) { + if let Err(e) = check_req_region_epoch(&req, ®ion, true) { debug!("epoch not match"; "region_id" => region.get_id(), "err" => ?e); let mut response = cmd_resp::new_error(e); cmd_resp::bind_term(&mut response, self.term()); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 41409a49448..78f024997cf 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -19,7 +19,9 @@ use engine_traits::KvEngine; use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, - raft_cmdpb::{AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest}, + raft_cmdpb::{ + AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, RaftRequestHeader, + }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; use protobuf::{self, Message}; @@ -235,28 +237,45 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat pub static NORMAL_REQ_CHECK_VER: bool = true; pub static NORMAL_REQ_CHECK_CONF_VER: bool = false; -pub fn check_region_epoch( +pub fn 
check_req_region_epoch( req: &RaftCmdRequest, region: &metapb::Region, include_region: bool, ) -> Result<()> { - let (check_ver, check_conf_ver) = if !req.has_admin_request() { - // for get/set/delete, we don't care conf_version. - (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + let admin_ty = if !req.has_admin_request() { + None } else { - let epoch_state = admin_cmd_epoch_lookup(req.get_admin_request().get_cmd_type()); - (epoch_state.check_ver, epoch_state.check_conf_ver) + Some(req.get_admin_request().get_cmd_type()) + }; + check_region_epoch(req.get_header(), admin_ty, region, include_region) +} + +pub fn check_region_epoch( + header: &RaftRequestHeader, + admin_ty: Option, + region: &metapb::Region, + include_region: bool, +) -> Result<()> { + let (check_ver, check_conf_ver) = match admin_ty { + None => { + // for get/set/delete, we don't care conf_version. + (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + } + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } }; if !check_ver && !check_conf_ver { return Ok(()); } - if !req.get_header().has_region_epoch() { + if !header.has_region_epoch() { return Err(box_err!("missing epoch!")); } - let from_epoch = req.get_header().get_region_epoch(); + let from_epoch = header.get_region_epoch(); compare_region_epoch( from_epoch, region, @@ -351,8 +370,8 @@ pub fn is_region_epoch_equal( } #[inline] -pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { - let peer = req.get_header().get_peer(); +pub fn check_store_id(header: &RaftRequestHeader, store_id: u64) -> Result<()> { + let peer = header.get_peer(); if peer.get_store_id() == store_id { Ok(()) } else { @@ -364,8 +383,7 @@ pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { } #[inline] -pub fn check_term(req: &RaftCmdRequest, term: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_term(header: &RaftRequestHeader, term: u64) -> 
Result<()> { if header.get_term() == 0 || term <= header.get_term() + 1 { Ok(()) } else { @@ -376,8 +394,7 @@ pub fn check_term(req: &RaftCmdRequest, term: u64) -> Result<()> { } #[inline] -pub fn check_peer_id(req: &RaftCmdRequest, peer_id: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_peer_id(header: &RaftRequestHeader, peer_id: u64) -> Result<()> { if header.get_peer().get_id() == peer_id { Ok(()) } else { @@ -2001,34 +2018,34 @@ mod tests { #[test] fn test_check_store_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_store_id(1); - check_store_id(&req, 1).unwrap(); - check_store_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_store_id(1); + check_store_id(&header, 1).unwrap(); + check_store_id(&header, 2).unwrap_err(); } #[test] fn test_check_peer_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_id(1); - check_peer_id(&req, 1).unwrap(); - check_peer_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_id(1); + check_peer_id(&header, 1).unwrap(); + check_peer_id(&header, 2).unwrap_err(); } #[test] fn test_check_term() { - let mut req = RaftCmdRequest::default(); - req.mut_header().set_term(7); - check_term(&req, 7).unwrap(); - check_term(&req, 8).unwrap(); + let mut header = RaftRequestHeader::default(); + header.set_term(7); + check_term(&header, 7).unwrap(); + check_term(&header, 8).unwrap(); // If header's term is 2 verions behind current term, // leadership may have been changed away. 
- check_term(&req, 9).unwrap_err(); - check_term(&req, 10).unwrap_err(); + check_term(&header, 9).unwrap_err(); + check_term(&header, 10).unwrap_err(); } #[test] - fn test_check_region_epoch() { + fn test_check_req_region_epoch() { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(2); epoch.set_version(2); @@ -2036,7 +2053,7 @@ mod tests { region.set_region_epoch(epoch.clone()); // Epoch is required for most requests even if it's empty. - check_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); + check_req_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); // These admin commands do not require epoch. for ty in &[ @@ -2051,11 +2068,11 @@ mod tests { req.set_admin_request(admin); // It is Okay if req does not have region epoch. - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, true).unwrap(); - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, true).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); } // These admin commands requires epoch.version. @@ -2073,7 +2090,7 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. 
- check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_version_epoch = epoch.clone(); stale_version_epoch.set_version(1); @@ -2081,14 +2098,14 @@ mod tests { stale_region.set_region_epoch(stale_version_epoch.clone()); req.mut_header() .set_region_epoch(stale_version_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_version_epoch = epoch.clone(); latest_version_epoch.set_version(3); for epoch in &[stale_version_epoch, latest_version_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } @@ -2109,21 +2126,21 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. 
- check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_conf_epoch = epoch.clone(); stale_conf_epoch.set_conf_ver(1); let mut stale_region = metapb::Region::default(); stale_region.set_region_epoch(stale_conf_epoch.clone()); req.mut_header().set_region_epoch(stale_conf_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_conf_epoch = epoch.clone(); latest_conf_epoch.set_conf_ver(3); for epoch in &[stale_conf_epoch, latest_conf_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index c78a51866ae..a20fcefdbdb 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -760,7 +760,7 @@ where } let store_id = self.store_id.get().unwrap(); - if let Err(e) = util::check_store_id(req, store_id) { + if let Err(e) = util::check_store_id(req.get_header(), store_id) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.store_id_mismatch.inc()); debug!("rejected by store id not match"; "err" => %e); return Err(e); @@ -780,13 +780,13 @@ where fail_point!("localreader_on_find_delegate"); // Check peer id. - if let Err(e) = util::check_peer_id(req, delegate.peer_id) { + if let Err(e) = util::check_peer_id(req.get_header(), delegate.peer_id) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.peer_id_mismatch.inc()); return Err(e); } // Check term. 
- if let Err(e) = util::check_term(req, delegate.term) { + if let Err(e) = util::check_term(req.get_header(), delegate.term) { debug!( "check term"; "delegate_term" => delegate.term, @@ -797,7 +797,7 @@ where } // Check region epoch. - if util::check_region_epoch(req, &delegate.region, false).is_err() { + if util::check_req_region_epoch(req, &delegate.region, false).is_err() { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.epoch.inc()); // Stale epoch, redirect it to raftstore to get the latest region. debug!("rejected by epoch not match"; "tag" => &delegate.tag); diff --git a/src/server/mod.rs b/src/server/mod.rs index 1b41dfc4e56..0e4a3616a6c 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -13,6 +13,7 @@ pub mod lock_manager; pub mod node; mod proxy; pub mod raftkv; +mod raftkv2; mod reset_to_version; pub mod resolve; pub mod server; diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index b12e56ee7a0..607d5af71f3 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -84,7 +84,7 @@ pub enum Error { Timeout(Duration), } -fn get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { +pub fn get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { match *e { KvError(box KvErrorInner::Request(ref header)) => { RequestStatusKind::from(storage::get_error_kind_from_header(header)) diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs new file mode 100644 index 00000000000..5bcdd131d72 --- /dev/null +++ b/src/server/raftkv2/mod.rs @@ -0,0 +1,307 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{mem, pin::Pin, task::Poll}; + +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use futures::{Future, Stream, StreamExt}; +use kvproto::{ + raft_cmdpb::{CmdType, RaftCmdRequest, Request}, + raft_serverpb::RaftMessage, +}; +use raftstore::store::RegionSnapshot; +use raftstore_v2::{ + router::{ + message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, + }, + SimpleWriteEncoder, StoreRouter, +}; +use tikv_kv::{Modify, RaftExtension, WriteEvent}; +use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use txn_types::WriteBatchFlags; + +use super::{ + metrics::{ASYNC_REQUESTS_COUNTER_VEC, ASYNC_REQUESTS_DURATIONS_VEC}, + raftkv::{get_status_kind_from_engine_error, new_request_header}, +}; + +#[derive(Clone)] +pub struct RaftExtensionImpl { + router: StoreRouter, +} + +impl RaftExtension for RaftExtensionImpl { + #[inline] + fn feed(&self, msg: RaftMessage, key_message: bool) { + let region_id = msg.get_region_id(); + let msg_ty = msg.get_message().get_msg_type(); + // Channel full and region not found are ignored unless it's a key message. + if let Err(e) = self.router.send_raft_message(Box::new(msg)) && key_message { + error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); + } + } + + fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) { + // TODO:reject the message on connection side instead of go through + // raft layer. 
+ } + + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + let _ = self + .router + .send(region_id, PeerMsg::PeerUnreachable { to_peer_id }); + } + + fn report_store_unreachable(&self, _store_id: u64) {} + + fn report_snapshot_status( + &self, + _region_id: u64, + _to_peer_id: u64, + _status: raft::SnapshotStatus, + ) { + } + + fn report_resolved(&self, _store_id: u64, _group_id: u64) {} + + fn split( + &self, + _region_id: u64, + _region_epoch: kvproto::metapb::RegionEpoch, + _split_keys: Vec>, + _source: String, + ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { + Box::pin(async move { Err(box_err!("raft split is not supported")) }) + } + + fn query_region( + &self, + _region_id: u64, + ) -> futures::future::BoxFuture< + 'static, + tikv_kv::Result, + > { + Box::pin(async move { Err(box_err!("query region is not supported")) }) + } +} + +struct Transform { + resp: CmdResStream, + early_err: Option, +} + +impl Stream for Transform { + type Item = WriteEvent; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let stream = self.get_mut(); + if stream.early_err.is_some() { + return Poll::Ready(Some(WriteEvent::Finished(Err(stream + .early_err + .take() + .unwrap())))); + } + match stream.resp.poll_next_unpin(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(Some(CmdResEvent::Proposed)) => Poll::Ready(Some(WriteEvent::Proposed)), + Poll::Ready(Some(CmdResEvent::Committed)) => Poll::Ready(Some(WriteEvent::Committed)), + Poll::Ready(Some(CmdResEvent::Finished(mut resp))) => { + let res = if !resp.get_header().has_error() { + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.take_header().take_error())) + }; + Poll::Ready(Some(WriteEvent::Finished(res))) + } + Poll::Ready(None) => Poll::Ready(None), + } + } +} + +#[derive(Clone)] +pub struct RaftKv2 { + router: RaftRouter, +} + +impl RaftKv2 { + #[allow(unused)] + pub fn new(router: RaftRouter) -> RaftKv2 { + RaftKv2 { router } + } +} + +impl 
tikv_kv::Engine for RaftKv2 { + type Snap = RegionSnapshot; + type Local = EK; + + #[inline] + fn kv_engine(&self) -> Option { + None + } + + type RaftExtension = RaftExtensionImpl; + + fn modify_on_kv_engine( + &self, + _region_modifies: collections::HashMap>, + ) -> tikv_kv::Result<()> { + // TODO + Ok(()) + } + + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, mut ctx: tikv_kv::SnapContext<'_>) -> Self::SnapshotRes { + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + if !ctx.key_ranges.is_empty() && ctx.start_ts.map_or(false, |ts| !ts.is_zero()) { + req.mut_read_index() + .set_start_ts(ctx.start_ts.as_ref().unwrap().into_inner()); + req.mut_read_index() + .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); + } + ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); + let begin_instant = Instant::now_coarse(); + + let mut header = new_request_header(ctx.pb_ctx); + let mut flags = 0; + if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { + let mut data = [0u8; 8]; + (&mut data[..]) + .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) + .unwrap(); + flags |= WriteBatchFlags::STALE_READ.bits(); + header.set_flag_data(data.into()); + } + if ctx.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(vec![req].into()); + let f = self.router.snapshot(cmd); + async move { + let res = f.await; + match res { + Ok(snap) => { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot + .observe(begin_instant.saturating_elapsed_secs()); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); + Ok(snap) + } + Err(mut resp) => { + if resp + .get_responses() + .get(0) + .map_or(false, |r| r.get_read_index().has_locked()) + { + let locked = resp.mut_responses()[0].mut_read_index().take_locked(); + Err(tikv_kv::Error::from(tikv_kv::ErrorInner::KeyIsLocked( + locked, + ))) + } else if 
resp.get_header().has_error() { + let err = tikv_kv::Error::from(resp.take_header().take_error()); + let status_kind = get_status_kind_from_engine_error(&err); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); + Err(err) + } else { + Err(box_err!("unexpected response: {:?}", resp)) + } + } + } + } + } + + type WriteRes = impl Stream + Send + Unpin; + fn async_write( + &self, + ctx: &kvproto::kvrpcpb::Context, + batch: tikv_kv::WriteData, + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let region_id = ctx.region_id; + ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); + let begin_instant = Instant::now_coarse(); + let mut header = Box::new(new_request_header(ctx)); + let mut flags = 0; + if batch.extra.one_pc { + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if batch.extra.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + self.schedule_txn_extra(batch.extra); + let mut encoder = SimpleWriteEncoder::with_capacity(128); + for m in batch.modifies { + match m { + Modify::Put(cf, k, v) => encoder.put(cf, k.as_encoded(), &v), + Modify::Delete(cf, k) => encoder.delete(cf, k.as_encoded()), + Modify::PessimisticLock(k, lock) => { + encoder.put(CF_LOCK, k.as_encoded(), &lock.into_lock().to_bytes()) + } + Modify::DeleteRange(cf, start_key, end_key, notify_only) => encoder.delete_range( + cf, + start_key.as_encoded(), + end_key.as_encoded(), + notify_only, + ), + } + } + let data = encoder.encode(); + let mut builder = CmdResChannelBuilder::default(); + if WriteEvent::subscribed_proposed(subscribed) { + builder.subscribe_proposed(); + } + if WriteEvent::subscribed_committed(subscribed) { + builder.subscribe_committed(); + } + if let Some(cb) = on_applied { + builder.before_set(move |resp| { + let mut res = if !resp.get_header().has_error() { + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) + }; + cb(&mut res); + }); + } + let (ch, sub) = builder.build(); + let msg 
= PeerMsg::SimpleWrite(SimpleWrite { + header, + data, + ch, + send_time: Instant::now_coarse(), + }); + let res = self + .router + .store_router() + .send(region_id, msg) + .map_err(|e| tikv_kv::Error::from(raftstore_v2::Error::from(e))); + (Transform { + resp: CmdResStream::new(sub), + early_err: res.err(), + }) + .inspect(move |ev| { + let WriteEvent::Finished(res) = ev else { return }; + match res { + Ok(()) => { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); + } + Err(e) => { + let status_kind = get_status_kind_from_engine_error(e); + ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); + } + } + }) + } +} From 70e15257e92524fa57c100619dd555bf58bf7853 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 20 Dec 2022 10:56:54 +0800 Subject: [PATCH 416/676] engine: adapt engine metrics for multi-rocks (#13942) ref tikv/tikv#12842 None Signed-off-by: tabokie --- Cargo.lock | 6 +- components/engine_panic/src/misc.rs | 20 +- components/engine_panic/src/raft_engine.rs | 4 - components/engine_rocks/src/engine.rs | 41 +- components/engine_rocks/src/lib.rs | 7 +- components/engine_rocks/src/misc.rs | 11 +- components/engine_rocks/src/raft_engine.rs | 4 - components/engine_rocks/src/rocks_metrics.rs | 499 ++++++++++++------ components/engine_rocks/src/util.rs | 4 +- components/engine_traits/src/engine.rs | 9 +- components/engine_traits/src/misc.rs | 19 + components/engine_traits/src/raft_engine.rs | 1 - components/server/src/lib.rs | 1 + components/server/src/server.rs | 99 +++- components/server/src/signal_handler.rs | 30 +- components/test_raftstore/src/cluster.rs | 10 +- components/test_raftstore/src/server.rs | 2 + components/test_raftstore/src/util.rs | 15 +- etc/config-template.toml | 6 - metrics/grafana/tikv_details.json | 4 +- src/config/mod.rs | 31 +- src/server/debug.rs | 37 +- src/server/engine_factory.rs | 15 +- src/server/service/debug.rs | 17 +- 
tests/failpoints/cases/test_pd_client.rs | 15 +- tests/integrations/config/mod.rs | 4 +- tests/integrations/config/test-custom.toml | 2 - .../raftstore/test_compact_lock_cf.rs | 8 +- 28 files changed, 599 insertions(+), 322 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 97f540aa100..cb371b739af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2876,7 +2876,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" +source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2895,7 +2895,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" +source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" dependencies = [ "bzip2-sys", "cc", @@ -4761,7 +4761,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#f94fdd30dd94f6fd22c8052edfd2c4039d9f2fbd" +source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" dependencies = [ "libc 0.2.132", "librocksdb_sys", diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 8c983051438..730f44a7e2f 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -1,10 +1,28 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{DeleteStrategy, MiscExt, Range, Result}; +use engine_traits::{DeleteStrategy, MiscExt, Range, Result, StatisticsReporter}; use crate::engine::PanicEngine; +pub struct PanicReporter; + +impl StatisticsReporter for PanicReporter { + fn new(name: &str) -> Self { + panic!() + } + + fn collect(&mut self, engine: &PanicEngine) { + panic!() + } + + fn flush(&mut self) { + panic!() + } +} + impl MiscExt for PanicEngine { + type StatisticsReporter = PanicReporter; + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { panic!() } diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index f5e0c424db0..59c0422902c 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -136,10 +136,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn reset_statistics(&self) { - panic!() - } - fn dump_stats(&self) -> Result { panic!() } diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 0c37120e7fc..0e73de357e5 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -6,17 +6,7 @@ use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Resu use rocksdb::{DBIterator, Writable, DB}; use crate::{ - db_vector::RocksDbVector, - options::RocksReadOptions, - r2e, - rocks_metrics::{ - flush_engine_histogram_metrics, flush_engine_iostall_properties, flush_engine_properties, - flush_engine_ticker_metrics, - }, - rocks_metrics_defs::{ - ENGINE_HIST_TYPES, ENGINE_TICKER_TYPES, TITAN_ENGINE_HIST_TYPES, TITAN_ENGINE_TICKER_TYPES, - }, - util::get_cf_handle, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, util::get_cf_handle, RocksEngineIterator, RocksSnapshot, }; @@ -62,35 +52,6 @@ impl KvEngine for RocksEngine { self.db.sync_wal().map_err(r2e) } - fn flush_metrics(&self, instance: &str) { - for t in ENGINE_TICKER_TYPES { - let v = 
self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - if self.db.is_titan() { - for t in TITAN_ENGINE_TICKER_TYPES { - let v = self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in TITAN_ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - } - flush_engine_properties(&self.db, instance); - flush_engine_iostall_properties(&self.db, instance); - } - - fn reset_statistics(&self) { - self.db.reset_statistics(); - } - fn bad_downcast(&self) -> &T { let e: &dyn Any = &self.db; e.downcast_ref().expect("bad engine downcast") diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index b6f3e36146c..94a4c23a3c4 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -16,6 +16,8 @@ //! Please read the engine_trait crate docs before hacking. 
#![cfg_attr(test, feature(test))] +#![feature(let_chains)] +#![feature(option_get_or_insert_default)] #[allow(unused_extern_crates)] extern crate tikv_alloc; @@ -104,7 +106,10 @@ pub mod file_system; mod raft_engine; -pub use rocksdb::{set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel}; +pub use rocksdb::{ + set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel, + Statistics as RocksStatistics, +}; pub mod flow_control_factors; pub use flow_control_factors::*; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 9ef2ed079b2..4761183546e 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -8,8 +8,8 @@ use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, r2e, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, - RocksSstWriter, + engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -126,6 +126,8 @@ impl RocksEngine { } impl MiscExt for RocksEngine { + type StatisticsReporter = RocksStatisticsReporter; + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { let mut handles = vec![]; for cf in cfs { @@ -277,11 +279,6 @@ impl MiscExt for RocksEngine { s.extend_from_slice(v.as_bytes()); } - // more stats if enable_statistics is true. 
- if let Some(v) = self.as_inner().get_statistics() { - s.extend_from_slice(v.as_bytes()); - } - Ok(box_try!(String::from_utf8(s))) } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index 9095ef27dfd..cb4c5682252 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -325,10 +325,6 @@ impl RaftEngine for RocksEngine { KvEngine::flush_metrics(self, instance) } - fn reset_statistics(&self) { - KvEngine::reset_statistics(self) - } - fn dump_stats(&self) -> Result { MiscExt::dump_stats(self) } diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 026ef36cce7..d77f5f2dc99 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -1,14 +1,15 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::CF_DEFAULT; +use collections::HashMap; +use engine_traits::{StatisticsReporter, CF_DEFAULT}; use lazy_static::lazy_static; use prometheus::*; use prometheus_static_metric::*; use rocksdb::{ - DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, DB, + DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, }; -use crate::rocks_metrics_defs::*; +use crate::{engine::RocksEngine, rocks_metrics_defs::*, RocksStatistics}; make_auto_flush_static_metric! 
{ pub label_enum TickerName { @@ -910,206 +911,355 @@ pub fn flush_engine_histogram_metrics(t: HistType, value: HistogramData, name: & } } -pub fn flush_engine_iostall_properties(engine: &DB, name: &str) { - let stall_num = ROCKSDB_IOSTALL_KEY.len(); - let mut counter = vec![0; stall_num]; - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - if let Some(info) = engine.get_map_property_cf(handle, ROCKSDB_CFSTATS) { - for i in 0..stall_num { - let value = info.get_property_int_value(ROCKSDB_IOSTALL_KEY[i]); - counter[i] += value as i64; - } - } else { - return; - } - } - for i in 0..stall_num { - STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC - .with_label_values(&[name, ROCKSDB_IOSTALL_TYPE[i]]) - .set(counter[i]); - } +#[derive(Default, Clone)] +struct CfLevelStats { + num_files: Option, + // sum(compression_ratio_i * num_files_i) + weighted_compression_ratio: Option, + num_blob_files: Option, } -pub fn flush_engine_properties(engine: &DB, name: &str) { - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - // It is important to monitor each cf's size, especially the "raft" and "lock" - // column families. - let cf_used_size = crate::util::get_engine_cf_used_size(engine, handle); - STORE_ENGINE_SIZE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(cf_used_size as i64); - - let blob_cache_usage = engine.get_blob_cache_usage_cf(handle); - STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(blob_cache_usage as i64); - - // TODO: find a better place to record these metrics. 
- // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB - // For index and filter blocks memory - if let Some(readers_mem) = engine.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "readers-mem"]) - .set(readers_mem as i64); - } - - // For memtable - if let Some(mem_table) = engine.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) - { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "mem-tables"]) - .set(mem_table as i64); - } +#[derive(Default)] +struct CfStats { + used_size: Option, + blob_cache_size: Option, + readers_mem: Option, + mem_tables: Option, + num_keys: Option, + pending_compaction_bytes: Option, + num_immutable_mem_table: Option, + live_blob_size: Option, + num_live_blob_file: Option, + num_obsolete_blob_file: Option, + live_blob_file_size: Option, + obsolete_blob_file_size: Option, + blob_file_discardable_ratio_le0: Option, + blob_file_discardable_ratio_le20: Option, + blob_file_discardable_ratio_le50: Option, + blob_file_discardable_ratio_le80: Option, + blob_file_discardable_ratio_le100: Option, + levels: Vec, +} - // TODO: add cache usage and pinned usage. 
+#[derive(Default)] +struct DbStats { + num_snapshots: Option, + oldest_snapshot_time: Option, + block_cache_size: Option, + stall_num: Vec>, +} - if let Some(num_keys) = engine.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { - STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC - .with_label_values(&[name, cf]) - .set(num_keys as i64); - } +pub struct RocksStatisticsReporter { + name: String, + db_stats: DbStats, + cf_stats: HashMap, +} - // Pending compaction bytes - if let Some(pending_compaction_bytes) = - crate::util::get_cf_pending_compaction_bytes(engine, handle) - { - STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC - .with_label_values(&[name, cf]) - .set(pending_compaction_bytes as i64); +impl StatisticsReporter for RocksStatisticsReporter { + fn new(name: &str) -> Self { + Self { + name: name.to_owned(), + db_stats: DbStats::default(), + cf_stats: HashMap::default(), } + } - let opts = engine.get_options_cf(handle); - for level in 0..opts.get_num_levels() { - // Compression ratio at levels + fn collect(&mut self, engine: &RocksEngine) { + let db = engine.as_inner(); + let stall_num = ROCKSDB_IOSTALL_KEY.len(); + self.db_stats.stall_num.resize(stall_num, None); + for cf in db.cf_names() { + let cf_stats = self.cf_stats.entry(cf.to_owned()).or_default(); + let handle = crate::util::get_cf_handle(db, cf).unwrap(); + // It is important to monitor each cf's size, especially the "raft" and "lock" + // column families. + *cf_stats.used_size.get_or_insert_default() += + crate::util::get_engine_cf_used_size(db, handle); + *cf_stats.blob_cache_size.get_or_insert_default() += db.get_blob_cache_usage_cf(handle); + // TODO: find a better place to record these metrics. 
+ // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB + // For index and filter blocks memory + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { + *cf_stats.readers_mem.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables.get_or_insert_default() += v; + } + // TODO: add cache usage and pinned usage. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { + *cf_stats.num_keys.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_pending_compaction_bytes(db, handle) { + *cf_stats.pending_compaction_bytes.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_num_immutable_mem_table(db, handle) { + *cf_stats.num_immutable_mem_table.get_or_insert_default() += v; + } + // Titan. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { + *cf_stats.live_blob_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { + *cf_stats.num_live_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) + { + *cf_stats.num_obsolete_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { + *cf_stats.live_blob_file_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) + { + *cf_stats.obsolete_blob_file_size.get_or_insert_default() += v; + } if let Some(v) = - crate::util::get_engine_compression_ratio_at_level(engine, handle, level) + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) { - STORE_ENGINE_COMPRESSION_RATIO_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v); + *cf_stats + .blob_file_discardable_ratio_le0 + 
.get_or_insert_default() += v; } - - // Num files at levels - if let Some(v) = crate::util::get_cf_num_files_at_level(engine, handle, level) { - STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le20 + .get_or_insert_default() += v; } - - // Titan Num blob files at levels - if let Some(v) = crate::util::get_cf_num_blob_files_at_level(engine, handle, level) { - STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le50 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le80 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le100 + .get_or_insert_default() += v; + } + // Level stats. 
+ let opts = db.get_options_cf(handle); + if cf_stats.levels.len() < opts.get_num_levels() { + cf_stats + .levels + .resize(opts.get_num_levels(), CfLevelStats::default()); + } + for level in 0..opts.get_num_levels() { + if let Some(num_files) = crate::util::get_cf_num_files_at_level(db, handle, level) { + *cf_stats.levels[level].num_files.get_or_insert_default() += num_files; + if let Some(ratio) = + crate::util::get_engine_compression_ratio_at_level(db, handle, level) + { + *cf_stats.levels[level] + .weighted_compression_ratio + .get_or_insert_default() += num_files as f64 * ratio; + } + } + if let Some(v) = crate::util::get_cf_num_blob_files_at_level(db, handle, level) { + *cf_stats.levels[level] + .num_blob_files + .get_or_insert_default() += v; + } } - } - - // Num immutable mem-table - if let Some(v) = crate::util::get_cf_num_immutable_mem_table(engine, handle) { - STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } - // Titan live blob size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(info) = db.get_map_property_cf(handle, ROCKSDB_CFSTATS) { + for i in 0..stall_num { + *self.db_stats.stall_num[i].get_or_insert_default() += + info.get_property_int_value(ROCKSDB_IOSTALL_KEY[i]); + } + } } - // Titan num live blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { - STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // For snapshot + *self.db_stats.num_snapshots.get_or_insert_default() += + db.get_property_int(ROCKSDB_NUM_SNAPSHOTS).unwrap_or(0); + let oldest_snapshot_time = + db.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) + .map_or(0, |t| { + let now = time::get_time().sec as u64; + // RocksDB returns 0 if no snapshots. 
+ if t > 0 && now > t { now - t } else { 0 } + }); + if oldest_snapshot_time > self.db_stats.oldest_snapshot_time.unwrap_or(0) { + *self.db_stats.oldest_snapshot_time.get_or_insert_default() = oldest_snapshot_time; } - // Titan num obsolete blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) - { - STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // Since block cache is shared, getting cache size from any CF/DB is fine. Here + // we get from default CF. + if self.db_stats.block_cache_size.is_none() { + let handle = crate::util::get_cf_handle(db, CF_DEFAULT).unwrap(); + *self.db_stats.block_cache_size.get_or_insert_default() = + db.get_block_cache_usage_cf(handle); } + } - // Titan live blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } + fn flush(&mut self) { + for (cf, cf_stats) in &self.cf_stats { + if let Some(v) = cf_stats.used_size { + STORE_ENGINE_SIZE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_cache_size { + STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.readers_mem { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "readers-mem"]) + .set(v as i64); + } + if let Some(v) = cf_stats.mem_tables { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables"]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_keys { + STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.pending_compaction_bytes { + STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + for (level, level_stats) in 
cf_stats.levels.iter().enumerate() { + if let Some(num_files) = level_stats.num_files { + STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(num_files as i64); + if num_files > 0 && let Some(ratio) = level_stats.weighted_compression_ratio { + let normalized_compression_ratio = + ratio / num_files as f64; + STORE_ENGINE_COMPRESSION_RATIO_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(normalized_compression_ratio); + } + } + if let Some(v) = level_stats.num_blob_files { + STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(v as i64); + } + } - // Titan obsolete blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) - { - STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(v) = cf_stats.num_immutable_mem_table { + STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_live_blob_file { + STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_obsolete_blob_file { + STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_file_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.obsolete_blob_file_size { + STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le0 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + 
.with_label_values(&[&self.name, cf, "le0"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le20 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le20"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le50 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le50"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le80 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le80"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le100 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le100"]) + .set(v as i64); + } } - // Titan blob file discardable ratio - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le0"]) - .set(v as i64); - } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le20"]) + if let Some(v) = self.db_stats.num_snapshots { + STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le50"]) + if let Some(v) = self.db_stats.oldest_snapshot_time { + STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le80"]) + if 
let Some(v) = self.db_stats.block_cache_size { + STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, "all"]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le100"]) - .set(v as i64); + let stall_num = ROCKSDB_IOSTALL_KEY.len(); + for i in 0..stall_num { + if let Some(v) = self.db_stats.stall_num[i] { + STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC + .with_label_values(&[&self.name, ROCKSDB_IOSTALL_TYPE[i]]) + .set(v as i64); + } } } +} - // For snapshot - if let Some(n) = engine.get_property_int(ROCKSDB_NUM_SNAPSHOTS) { - STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC - .with_label_values(&[name]) - .set(n as i64); +pub fn flush_engine_statistics(statistics: &RocksStatistics, name: &str, is_titan: bool) { + for t in ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); } - if let Some(t) = engine.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) { - // RocksDB returns 0 if no snapshots. - let now = time::get_time().sec as u64; - let d = if t > 0 && now > t { now - t } else { 0 }; - STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC - .with_label_values(&[name]) - .set(d as i64); + for t in ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } + } + if is_titan { + for t in TITAN_ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); + } + for t in TITAN_ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } + } } - - // Since block cache is shared, getting cache size from any CF is fine. Here we - // get from default CF. 
- let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, "all"]) - .set(block_cache_usage as i64); } // For property metrics @@ -1618,11 +1768,8 @@ mod tests { flush_engine_histogram_metrics(*tp, HistogramData::default(), "kv"); } - flush_engine_properties(engine.as_inner(), "kv"); - let handle = engine.as_inner().cf_handle("default").unwrap(); - let info = engine - .as_inner() - .get_map_property_cf(handle, ROCKSDB_CFSTATS); - assert!(info.is_some()); + let mut reporter = RocksStatisticsReporter::new("kv"); + reporter.collect(&engine); + reporter.flush(); } } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 786dfec04d1..407cf8ee611 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -11,7 +11,7 @@ use slog_global::warn; use crate::{ cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, - raw::Statistics, rocks_metrics_defs::*, + rocks_metrics_defs::*, RocksStatistics, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { @@ -28,7 +28,7 @@ pub fn new_default_engine(path: &str) -> Result { pub fn new_engine(path: &str, cfs: &[&str]) -> Result { let mut db_opts = RocksDbOptions::default(); - db_opts.set_statistics(&Statistics::new_titan()); + db_opts.set_statistics(&RocksStatistics::new_titan()); let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); new_engine_opt(path, db_opts, cf_opts) } diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index e12ea074015..e76765e2ed6 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -47,10 +47,11 @@ pub trait KvEngine: /// Flush metrics to prometheus /// /// `instance` is the label of the metric to flush. 
- fn flush_metrics(&self, _instance: &str) {} - - /// Reset internal statistics - fn reset_statistics(&self) {} + fn flush_metrics(&self, instance: &str) { + let mut reporter = Self::StatisticsReporter::new(instance); + reporter.collect(self); + reporter.flush(); + } /// Cast to a concrete engine type /// diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 34502634280..edfea511d35 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -37,7 +37,26 @@ pub enum DeleteStrategy { DeleteByWriter { sst_path: String }, } +/// `StatisticsReporter` can be used to report engine's private statistics to +/// prometheus metrics. For one single engine, using it is equivalent to calling +/// `KvEngine::flush_metrics("name")`. For multiple engines, it can aggregate +/// statistics accordingly. +/// Note that it is not responsible for managing the statistics from +/// user-provided collectors that are potentially shared between engines. +pub trait StatisticsReporter { + fn new(name: &str) -> Self; + + /// Collect statistics from one single engine. + fn collect(&mut self, engine: &T); + + /// Aggregate and report statistics to prometheus metrics counters. The + /// statistics are not cleared afterwards. + fn flush(&mut self); +} + pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { + type StatisticsReporter: StatisticsReporter; + /// Flush all specified column families at once. /// /// If `cfs` is empty, it will try to flush all available column families. 
diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 8b29e07707a..7b0e04d0ab5 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -136,7 +136,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn flush_stats(&self) -> Option { None } - fn reset_statistics(&self) {} fn stop(&self) {} diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 57793792289..5107a20eeab 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -2,6 +2,7 @@ #![allow(incomplete_features)] #![feature(specialization)] +#![feature(let_chains)] #[macro_use] extern crate tikv_util; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 8d64ff74c8b..470e3a41861 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -38,14 +38,15 @@ use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{ - from_rocks_compression_type, + flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, - FlowInfo, RocksEngine, + FlowInfo, RocksEngine, RocksStatistics, }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - RaftEngine, SingletonFactory, TabletContext, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, + RaftEngine, SingletonFactory, StatisticsReporter, TabletContext, TabletRegistry, CF_DEFAULT, + CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -169,7 +170,11 @@ fn run_impl(config: TikvConfig) { tikv.run_status_server(); tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); - signal_handler::wait_for_signal(Some(tikv.engines.take().unwrap().engines)); + 
signal_handler::wait_for_signal( + Some(tikv.engines.take().unwrap().engines), + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); tikv.stop(); } @@ -226,6 +231,8 @@ struct TikvServer { snap_mgr: Option, // Will be filled in `init_servers`. encryption_key_manager: Option>, engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, @@ -376,6 +383,8 @@ where snap_mgr: None, encryption_key_manager: None, engines: None, + kv_statistics: None, + raft_statistics: None, servers: None, region_info_accessor, coprocessor_host, @@ -1204,6 +1213,8 @@ where // Debug service. let debug_service = DebugService::new( engines.engines.clone(), + self.kv_statistics.clone(), + self.raft_statistics.clone(), servers.server.get_debug_thread_pool().clone(), engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), @@ -1356,7 +1367,11 @@ where engines_info: Arc, ) { let mut engine_metrics = EngineMetricsManager::::new( - self.engines.as_ref().unwrap().engines.clone(), + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().engines.raft.clone(), + self.raft_statistics.clone(), ); let mut io_metrics = IoMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); @@ -1675,7 +1690,12 @@ where } pub trait ConfiguredRaftEngine: RaftEngine { - fn build(_: &TikvConfig, _: &Arc, _: &Option>, _: &Cache) -> Self; + fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>); fn as_rocks_engine(&self) -> Option<&RocksEngine>; fn register_config(&self, _cfg_controller: &mut ConfigController); } @@ -1686,7 +1706,7 @@ impl ConfiguredRaftEngine for T { _: &Arc, _: &Option>, _: &Cache, - ) -> Self { + ) -> (Self, Option>) { unimplemented!() } default fn as_rocks_engine(&self) -> Option<&RocksEngine> { @@ -1701,7 +1721,7 @@ impl 
ConfiguredRaftEngine for RocksEngine { env: &Arc, key_manager: &Option>, block_cache: &Cache, - ) -> Self { + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_engine.config().dir, @@ -1713,6 +1733,8 @@ impl ConfiguredRaftEngine for RocksEngine { let config_raftdb = &config.raftdb; let mut raft_db_opts = config_raftdb.build_opt(); raft_db_opts.set_env(env.clone()); + let statistics = Arc::new(RocksStatistics::new_titan()); + raft_db_opts.set_statistics(statistics.as_ref()); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) .expect("failed to open raftdb"); @@ -1726,7 +1748,7 @@ impl ConfiguredRaftEngine for RocksEngine { drop(raft_engine); raft_data_state_machine.after_dump_data(); } - raftdb + (raftdb, Some(statistics)) } fn as_rocks_engine(&self) -> Option<&RocksEngine> { @@ -1747,7 +1769,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { env: &Arc, key_manager: &Option>, block_cache: &Cache, - ) -> Self { + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_store.raftdb_path, @@ -1776,7 +1798,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { drop(raftdb); raft_data_state_machine.after_dump_data(); } - raft_engine + (raft_engine, None) } } @@ -1792,12 +1814,13 @@ impl TikvServer { .unwrap(); // Create raft engine - let raft_engine = CER::build( + let (raft_engine, raft_statistics) = CER::build( &self.config, &env, &self.encryption_key_manager, &block_cache, ); + self.raft_statistics = raft_statistics; // Create kv engine. 
let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) @@ -1811,6 +1834,7 @@ impl TikvServer { let kv_engine = factory .create_shared_db(&self.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); + self.kv_statistics = Some(factory.rocks_statistics()); let engines = Engines::new(kv_engine.clone(), raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); @@ -1954,25 +1978,58 @@ impl Stop for LazyWorker { } } -pub struct EngineMetricsManager { - engines: Engines, +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, last_reset: Instant, } -impl EngineMetricsManager { - pub fn new(engines: Engines) -> Self { +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { EngineMetricsManager { - engines, + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, last_reset: Instant::now(), } } pub fn flush(&mut self, now: Instant) { - KvEngine::flush_metrics(&self.engines.kv, "kv"); - self.engines.raft.flush_metrics("raft"); + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - KvEngine::reset_statistics(&self.engines.kv); - self.engines.raft.reset_statistics(); + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) 
= self.raft_statistics.as_ref() { + s.reset(); + } self.last_reset = now; } } diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index a92845b843d..0977a1ed814 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -1,18 +1,29 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +use std::sync::Arc; + +use engine_rocks::RocksStatistics; +use engine_traits::{Engines, KvEngine, RaftEngine}; + pub use self::imp::wait_for_signal; #[cfg(unix)] mod imp { - use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; + use engine_traits::MiscExt; use signal_hook::{ consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, iterator::Signals, }; use tikv_util::metrics; + use super::*; + #[allow(dead_code)] - pub fn wait_for_signal(engines: Option>) { + pub fn wait_for_signal( + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + ) { let mut signals = Signals::new([SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); for signal in &mut signals { match signal { @@ -25,7 +36,13 @@ mod imp { info!("{}", metrics::dump(false)); if let Some(ref engines) = engines { info!("{:?}", MiscExt::dump_stats(&engines.kv)); + if let Some(s) = kv_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } info!("{:?}", RaftEngine::dump_stats(&engines.raft)); + if let Some(s) = raft_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } } } // TODO: handle more signal @@ -37,7 +54,12 @@ mod imp { #[cfg(not(unix))] mod imp { - use engine_traits::{Engines, KvEngine, RaftEngine}; + use super::*; - pub fn wait_for_signal(_: Option>) {} + pub fn wait_for_signal( + _: Option>, + _: Option>, + _: Option>, + ) { + } } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index f9088ff4e3b..833e8131746 100644 --- a/components/test_raftstore/src/cluster.rs +++ 
b/components/test_raftstore/src/cluster.rs @@ -12,7 +12,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, @@ -170,6 +170,8 @@ pub struct Cluster { group_props: HashMap, pub sst_workers: Vec>, pub sst_workers_map: HashMap, + pub kv_statistics: Vec>, + pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, } @@ -205,6 +207,8 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + kv_statistics: vec![], + raft_statistics: vec![], } } @@ -240,12 +244,14 @@ impl Cluster { } fn create_engine(&mut self, router: Option>) { - let (engines, key_manager, dir, sst_worker) = + let (engines, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = create_test_engine(router, self.io_rate_limiter.clone(), &self.cfg); self.dbs.push(engines); self.key_managers.push(key_manager); self.paths.push(dir); self.sst_workers.push(sst_worker); + self.kv_statistics.push(kv_statistics); + self.raft_statistics.push(raft_statistics); } pub fn create_engines(&mut self) { diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index ea9868afdbd..e3cfb298c59 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -483,6 +483,8 @@ impl ServerCluster { let debug_thread_handle = debug_thread_pool.handle().clone(); let debug_service = DebugService::new( engines.clone(), + None, + None, debug_thread_handle, extension, ConfigController::default(), diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 3a4ed373e8c..d5c2eefa6d6 100644 --- a/components/test_raftstore/src/util.rs +++ 
b/components/test_raftstore/src/util.rs @@ -13,7 +13,7 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot}; +use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, @@ -575,6 +575,8 @@ pub fn create_test_engine( Option>, TempDir, LazyWorker, + Arc, + Option>, ) { let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); let mut cfg = cfg.clone(); @@ -593,7 +595,7 @@ pub fn create_test_engine( let sst_worker = LazyWorker::new("sst-recovery"); let scheduler = sst_worker.scheduler(); - let raft_engine = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); + let (raft_engine, raft_statistics) = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); let mut builder = KvEngineFactoryBuilder::new(env, &cfg, cache).sst_recovery_sender(Some(scheduler)); @@ -605,7 +607,14 @@ pub fn create_test_engine( let factory = builder.build(); let engine = factory.create_shared_db(dir.path()).unwrap(); let engines = Engines::new(engine, raft_engine); - (engines, key_manager, dir, sst_worker) + ( + engines, + key_manager, + dir, + sst_worker, + factory.rocks_statistics(), + raft_statistics, + ) } pub fn configure_for_request_snapshot(cluster: &mut Cluster) { diff --git a/etc/config-template.toml b/etc/config-template.toml index 3ddbb6fc879..8820d2e0675 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -559,11 +559,6 @@ ## Max RocksDB WAL size in total # max-total-wal-size = "4GB" -## RocksDB Statistics provides cumulative stats over time. -## Turning statistics on will introduce about 5%-10% overhead for RocksDB, but it can help you to -## know the internal status of RocksDB. 
-# enable-statistics = true - ## Dump statistics periodically in information logs. ## Same as RocksDB's default value (10 min). # stats-dump-period = "10m" @@ -972,7 +967,6 @@ # max-manifest-file-size = "20MB" # create-if-missing = true -# enable-statistics = true # stats-dump-period = "10m" ## Raft RocksDB WAL directory. diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index ccac776b508..0c2116818dc 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -30428,11 +30428,11 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_compression_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (level)", + "expr": "avg(tikv_engine_compression_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf, level)", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "level - {{level}}", + "legendFormat": "{{cf}} - level - {{level}}", "metric": "", "refId": "A", "step": 10 diff --git a/src/config/mod.rs b/src/config/mod.rs index 0945eb7ca21..a9cfdb93505 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -31,13 +31,12 @@ use engine_rocks::{ raw::{ BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, PrepopulateBlockCache, - Statistics, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, - RocksCfOptions, RocksDbOptions, RocksEngine, RocksEventListener, RocksTitanDbOptions, - RocksdbLogger, TtlPropertiesCollectorFactory, DEFAULT_PROP_KEYS_INDEX_DISTANCE, - DEFAULT_PROP_SIZE_INDEX_DISTANCE, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksEventListener, RocksStatistics, + RocksTitanDbOptions, RocksdbLogger, TtlPropertiesCollectorFactory, + 
DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ CfOptions as _, DbOptions as _, MiscExt, TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, @@ -1038,6 +1037,8 @@ pub struct DbConfig { pub create_if_missing: bool, pub max_open_files: i32, #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] pub enable_statistics: bool, #[online_config(skip)] pub stats_dump_period: ReadableDuration, @@ -1156,7 +1157,7 @@ impl DbConfig { } } - pub fn build_opt(&self, stats: Option<&Statistics>) -> RocksDbOptions { + pub fn build_opt(&self, stats: Option<&RocksStatistics>) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1172,11 +1173,9 @@ impl DbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - if self.enable_statistics { - match stats { - Some(stats) => opts.set_statistics(stats), - None => opts.set_statistics(&Statistics::new_titan()), - } + match stats { + Some(stats) => opts.set_statistics(stats), + None => opts.set_statistics(&RocksStatistics::new_titan()), } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); @@ -1296,6 +1295,9 @@ impl DbConfig { ) .into()); } + if !self.enable_statistics { + warn!("kvdb: ignoring `enable_statistics`, statistics is always on.") + } Ok(()) } @@ -1411,6 +1413,8 @@ pub struct RaftDbConfig { pub create_if_missing: bool, pub max_open_files: i32, #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] pub enable_statistics: bool, #[online_config(skip)] pub stats_dump_period: ReadableDuration, @@ -1499,9 +1503,7 @@ impl RaftDbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); 
opts.set_max_open_files(self.max_open_files); - if self.enable_statistics { - opts.set_statistics(&Statistics::new_titan()); - } + opts.set_statistics(&RocksStatistics::new_titan()); opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); @@ -1544,6 +1546,9 @@ impl RaftDbConfig { ); } } + if !self.enable_statistics { + warn!("raftdb: ignoring `enable_statistics`, statistics is always on.") + } Ok(()) } } diff --git a/src/server/debug.rs b/src/server/debug.rs index 666e2ca33e7..9445133239f 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -5,6 +5,7 @@ use std::{ iter::FromIterator, path::Path, result, + sync::Arc, thread::{Builder as ThreadBuilder, JoinHandle}, }; @@ -12,12 +13,12 @@ use collections::HashSet; use engine_rocks::{ raw::{CompactOptions, DBBottommostLevelCompaction}, util::get_cf_handle, - RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatchVec, + RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksStatistics, RocksWriteBatchVec, }; use engine_traits::{ - Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, - RaftEngine, RaftLogBatch, Range, RangePropertiesExt, SyncMutable, WriteBatch, WriteBatchExt, - WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + Engines, IterOptions, Iterable, Iterator as EngineIterator, MiscExt, Mutable, MvccProperties, + Peekable, RaftEngine, RaftLogBatch, Range, RangePropertiesExt, SyncMutable, WriteBatch, + WriteBatchExt, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ debugpb::{self, Db as DbType}, @@ -127,6 +128,8 @@ trait InnerRocksEngineExtractor { #[derive(Clone)] pub struct Debugger { engines: Engines, + kv_statistics: Option>, + raft_statistics: Option>, reset_to_version_manager: ResetToVersionManager, cfg_controller: ConfigController, } @@ -159,15 +162,41 @@ impl 
Debugger { let reset_to_version_manager = ResetToVersionManager::new(engines.kv.clone()); Debugger { engines, + kv_statistics: None, + raft_statistics: None, reset_to_version_manager, cfg_controller, } } + pub fn set_kv_statistics(&mut self, s: Option>) { + self.kv_statistics = s; + } + + pub fn set_raft_statistics(&mut self, s: Option>) { + self.raft_statistics = s; + } + pub fn get_engine(&self) -> &Engines { &self.engines } + pub fn dump_kv_stats(&self) -> Result { + let mut kv_str = box_try!(MiscExt::dump_stats(&self.engines.kv)); + if let Some(s) = self.kv_statistics.as_ref() && let Some(s) = s.to_string() { + kv_str.push_str(&s); + } + Ok(kv_str) + } + + pub fn dump_raft_stats(&self) -> Result { + let mut raft_str = box_try!(RaftEngine::dump_stats(&self.engines.raft)); + if let Some(s) = self.raft_statistics.as_ref() && let Some(s) = s.to_string() { + raft_str.push_str(&s); + } + Ok(raft_str) + } + /// Get all regions holding region meta data from raft CF in KV storage. pub fn get_all_regions_in_store(&self) -> Result> { let db = &self.engines.kv; diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index c21599f47a6..2c31c9522b1 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -3,9 +3,9 @@ use std::{path::Path, sync::Arc}; use engine_rocks::{ - raw::{Cache, Env, Statistics}, + raw::{Cache, Env}, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, - RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, + RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, }; use engine_traits::{ CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, @@ -28,7 +28,7 @@ struct FactoryInner { api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, - statistics: Statistics, + statistics: Arc, state_storage: Option>, lite: bool, } @@ -40,6 +40,7 @@ pub struct 
KvEngineFactoryBuilder { impl KvEngineFactoryBuilder { pub fn new(env: Arc, config: &TikvConfig, cache: Cache) -> Self { + let statistics = Arc::new(RocksStatistics::new_titan()); Self { inner: FactoryInner { env, @@ -49,7 +50,7 @@ impl KvEngineFactoryBuilder { api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, - statistics: Statistics::new_titan(), + statistics, state_storage: None, lite: false, }, @@ -132,12 +133,16 @@ impl KvEngineFactory { )) } + pub fn rocks_statistics(&self) -> Arc { + self.inner.statistics.clone() + } + fn db_opts(&self) -> RocksDbOptions { // Create kv engine. let mut db_opts = self .inner .rocksdb_config - .build_opt(Some(&self.inner.statistics)); + .build_opt(Some(self.inner.statistics.as_ref())); db_opts.set_env(self.inner.env.clone()); if !self.inner.lite { db_opts.add_event_listener(RocksEventListener::new( diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index ae0d53bacda..e0ec9173ad5 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,7 +1,9 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::RocksEngine; -use engine_traits::{Engines, MiscExt, RaftEngine}; +use std::sync::Arc; + +use engine_rocks::{RocksEngine, RocksStatistics}; +use engine_traits::{Engines, RaftEngine}; use futures::{ future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -54,11 +56,15 @@ impl Service { /// `GcWorker`. 
pub fn new( engines: Engines, + kv_statistics: Option>, + raft_statistics: Option>, pool: Handle, raft_router: T, cfg_controller: ConfigController, ) -> Self { - let debugger = Debugger::new(engines, cfg_controller); + let mut debugger = Debugger::new(engines, cfg_controller); + debugger.set_kv_statistics(kv_statistics); + debugger.set_raft_statistics(raft_statistics); Service { pool, debugger, @@ -353,9 +359,8 @@ impl debugpb::Debug for Service pd_client::Result, R: std::fmt::Debug, { - run_on_bad_connection(client, |c| { - f(c).unwrap_err(); - f(c).unwrap(); - }); + let mut success = false; + for _ in 0..3 { + run_on_bad_connection(client, |c| { + f(c).unwrap_err(); + success = f(c).is_ok(); + }); + if success { + return; + } + } + panic!("failed to retry after three attempts"); } test_retry_success(&mut client, |c| { diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 73dfdbaa977..4e22463503a 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -289,7 +289,7 @@ fn test_serde_custom_tikv_config() { max_manifest_file_size: ReadableSize::mb(12), create_if_missing: false, max_open_files: 12_345, - enable_statistics: false, + enable_statistics: true, stats_dump_period: ReadableDuration::minutes(12), compaction_readahead_size: ReadableSize::kb(1), info_log_max_size: ReadableSize::kb(1), @@ -584,7 +584,7 @@ fn test_serde_custom_tikv_config() { max_manifest_file_size: ReadableSize::mb(12), create_if_missing: false, max_open_files: 12_345, - enable_statistics: false, + enable_statistics: true, stats_dump_period: ReadableDuration::minutes(12), compaction_readahead_size: ReadableSize::kb(1), info_log_max_size: ReadableSize::kb(1), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 961eb59a77b..900e1c36aa6 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -252,7 +252,6 @@ 
max-background-flushes = 4 max-manifest-file-size = "12MB" create-if-missing = false max-open-files = 12345 -enable-statistics = false stats-dump-period = "12m" compaction-readahead-size = "1KB" info-log-max-size = "1KB" @@ -504,7 +503,6 @@ max-background-flushes = 4 max-manifest-file-size = "12MB" create-if-missing = false max-open-files = 12345 -enable-statistics = false stats-dump-period = "12m" compaction-readahead-size = "1KB" info-log-max-size = "1KB" diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index c8ee96c7c67..fbc7629c73f 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -15,11 +15,9 @@ fn flush_then_check(cluster: &mut Cluster, interval: u64, writt flush(cluster); // Wait for compaction. sleep_ms(interval * 2); - for engines in cluster.engines.values() { - let compact_write_bytes = engines - .kv - .as_inner() - .get_statistics_ticker_count(DBStatisticsTickerType::CompactWriteBytes); + for statistics in &cluster.kv_statistics { + let compact_write_bytes = + statistics.get_ticker_count(DBStatisticsTickerType::CompactWriteBytes); if written { assert!(compact_write_bytes > 0); } else { From 679c773040d78000e90989a08a9461c1a816963f Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 20 Dec 2022 19:58:55 +0800 Subject: [PATCH 417/676] raftstore-v2: fix compilation errors. (#13969) close tikv/tikv#13970 Fix compilation errors. 
Signed-off-by: Lucasliang --- components/raftstore-v2/src/router/response_channel.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 01c1565ec62..2cb75acccfc 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -603,6 +603,7 @@ pub type FlushChannel = BaseChannel<()>; #[cfg(feature = "testexport")] pub type FlushSubscriber = BaseSubscriber<()>; +#[cfg(feature = "testexport")] impl Debug for FlushChannel { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "FlushChannel") From 99c70a3859489152b88e3aa064746221c3240877 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 21 Dec 2022 13:26:54 +0800 Subject: [PATCH 418/676] *: implement other essential functions for raftkv2 (#13967) ref tikv/tikv#12842 Functions like flashback will be supported in next milestone. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 8 + components/raftstore-v2/src/fsm/peer.rs | 8 + components/raftstore-v2/src/lib.rs | 2 +- .../src/operation/command/admin/mod.rs | 2 +- .../src/operation/command/admin/split.rs | 48 +++- .../raftstore-v2/src/operation/command/mod.rs | 2 +- components/raftstore-v2/src/operation/mod.rs | 4 +- components/raftstore-v2/src/operation/pd.rs | 18 +- .../src/operation/ready/snapshot.rs | 23 ++ components/raftstore-v2/src/router/imp.rs | 5 + components/raftstore-v2/src/router/message.rs | 31 ++- components/raftstore-v2/src/worker/pd/mod.rs | 16 +- .../src/worker/pd/region_heartbeat.rs | 6 +- .../raftstore-v2/src/worker/pd/split.rs | 4 +- components/raftstore/src/store/fsm/peer.rs | 93 +++----- components/raftstore/src/store/util.rs | 41 ++++ components/server/src/server.rs | 9 +- components/test_raftstore/src/server.rs | 7 +- components/tikv_kv/src/lib.rs | 2 +- components/tikv_kv/src/mock_engine.rs | 2 +- 
components/tikv_kv/src/rocksdb_engine.rs | 4 +- src/server/mod.rs | 3 +- src/server/node.rs | 165 +++++--------- src/server/raftkv/mod.rs | 4 +- src/server/raftkv2/mod.rs | 128 +++++------ src/server/raftkv2/node.rs | 210 ++++++++++++++++++ src/server/raftkv2/raft_extension.rs | 109 +++++++++ src/server/server.rs | 2 +- src/server/service/kv.rs | 4 +- 29 files changed, 670 insertions(+), 290 deletions(-) create mode 100644 src/server/raftkv2/node.rs create mode 100644 src/server/raftkv2/raft_extension.rs diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index ac767bcd7ce..98075969c66 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -508,6 +508,14 @@ impl StoreRouter { &self.logger } + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + match self.router.send(addr, msg) { + Ok(()) => Ok(()), + Err(e) => Err(raftstore::router::handle_send_error(addr, e)), + } + } + pub fn send_raft_message( &self, msg: Box, diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 20e7a8f3c2b..f5425295347 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -280,6 +280,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::StoreUnreachable { to_store_id } => { self.fsm.peer_mut().on_store_unreachable(to_store_id) } + PeerMsg::SnapshotSent { to_peer_id, status } => { + self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) + } + PeerMsg::RequestSplit { request, ch } => { + self.fsm + .peer_mut() + .on_request_split(self.store_ctx, request, ch) + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 848e5fda8b2..d8327549da6 100644 --- a/components/raftstore-v2/src/lib.rs +++ 
b/components/raftstore-v2/src/lib.rs @@ -40,4 +40,4 @@ pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; -pub use raftstore::{Error, Result}; +pub use raftstore::{store::Config, Error, Result}; diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index fcb968a2195..9b7dce8570f 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -9,7 +9,7 @@ use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; -pub use split::{SplitInit, SplitResult, SPLIT_PREFIX}; +pub use split::{RequestSplit, SplitInit, SplitResult, SPLIT_PREFIX}; use tikv_util::box_err; use txn_types::WriteBatchFlags; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 6255b3ba9b9..8bf23da0fd6 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -33,7 +33,7 @@ use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletCont use fail::fail_point; use itertools::Itertools; use kvproto::{ - metapb::{self, Region}, + metapb::{self, Region, RegionEpoch}, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, raft_serverpb::RaftSnapshotData, }; @@ -42,6 +42,7 @@ use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ coprocessor::split_observer::{is_valid_split_key, strip_timestamp_if_exists}, store::{ + cmd_resp, fsm::apply::validate_batch_split, metrics::PEER_ADMIN_CMD_COUNTER, snap::TABLET_SNAPSHOT_VERSION, @@ -58,7 +59,8 @@ use crate::{ fsm::ApplyResReporter, 
operation::AdminCmdResult, raft::{Apply, Peer}, - router::{PeerMsg, StoreMsg}, + router::{CmdResChannel, PeerMsg, StoreMsg}, + Error, }; pub const SPLIT_PREFIX: &str = "split_"; @@ -155,7 +157,49 @@ fn pre_propose_split(logger: &Logger, req: &mut AdminRequest, region: &Region) - } } +#[derive(Debug)] +pub struct RequestSplit { + pub epoch: RegionEpoch, + pub split_keys: Vec>, + pub source: Box, +} + impl Peer { + pub fn on_request_split( + &mut self, + ctx: &mut StoreContext, + rs: RequestSplit, + ch: CmdResChannel, + ) { + info!( + self.logger, + "on split"; + "split_keys" => %KeysInfoFormatter(rs.split_keys.iter()), + "source" => &rs.source, + ); + if !self.is_leader() { + // region on this store is no longer leader, skipped. + info!(self.logger, "not leader, skip."); + ch.set_result(cmd_resp::new_error(Error::NotLeader( + self.region_id(), + self.leader(), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.region_id(), + self.peer_id(), + self.region(), + &rs.epoch, + &rs.split_keys, + ) { + info!(self.logger, "invalid split request"; "err" => ?e, "source" => &rs.source); + ch.set_result(cmd_resp::new_error(e)); + return; + } + self.ask_batch_split_pd(ctx, rs.split_keys, ch); + } + pub fn propose_split( &mut self, store_ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index f6ac6ac7077..116edec91c3 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -49,7 +49,7 @@ mod admin; mod control; mod write; -pub use admin::{AdminCmdResult, SplitInit, SplitResult, SPLIT_PREFIX}; +pub use admin::{AdminCmdResult, RequestSplit, SplitInit, SplitResult, SPLIT_PREFIX}; pub use control::ProposalControl; pub use write::{ SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, diff --git a/components/raftstore-v2/src/operation/mod.rs 
b/components/raftstore-v2/src/operation/mod.rs index 5e6971b3346..80443f0ef60 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,8 +7,8 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteBinary, SimpleWriteEncoder, - SimpleWriteReqDecoder, SimpleWriteReqEncoder, + AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, + SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index d80cee3c7d1..1c62c092878 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -12,7 +12,7 @@ use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, raft::Peer, - router::{PeerTick, StoreTick}, + router::{CmdResChannel, PeerTick, StoreTick}, worker::pd, }; @@ -93,8 +93,6 @@ impl Peer { error!( self.logger, "failed to notify pd"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => ?e, ); return; @@ -148,8 +146,6 @@ impl Peer { error!( self.logger, "failed to get peer from cache"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "get_peer_id" => id, ); } @@ -167,27 +163,29 @@ impl Peer { error!( self.logger, "failed to notify pd with DestroyPeer"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => %e, ); } } #[inline] - pub fn ask_batch_split_pd(&self, ctx: &StoreContext, split_keys: Vec>) { + pub fn ask_batch_split_pd( + &self, + ctx: &StoreContext, + split_keys: Vec>, + ch: CmdResChannel, + ) { let task = pd::Task::AskBatchSplit { region: self.region().clone(), split_keys, peer: self.peer().clone(), right_derive: ctx.cfg.right_derive_when_split, + ch, }; if let Err(e) = ctx.pd_scheduler.schedule(task) { error!( self.logger, "failed to 
notify pd with AskBatchSplit"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => %e, ); } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 1f4a1fee268..7339df22fa9 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -116,6 +116,29 @@ impl Peer { } } + pub fn on_snapshot_sent(&mut self, to_peer_id: u64, status: raft::SnapshotStatus) { + let to_peer = match self.peer_from_cache(to_peer_id) { + Some(peer) => peer, + None => { + // If to_peer is gone, ignore this snapshot status + warn!( + self.logger, + "peer not found, ignore snapshot status"; + "to_peer_id" => to_peer_id, + "status" => ?status, + ); + return; + } + }; + info!( + self.logger, + "report snapshot status"; + "to" => ?to_peer, + "status" => ?status, + ); + self.raft_group_mut().report_snapshot(to_peer_id, status); + } + pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { let persisted_index = self.persisted_index(); let first_index = self.storage().entry_storage().first_index(); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index e838cefb743..3f10e08dee2 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -68,6 +68,11 @@ impl RaftRouter { self.router.send(addr, msg) } + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + self.router.check_send(addr, msg) + } + pub fn store_meta(&self) -> &Arc> { self.local_reader.store_meta() } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index d5635574978..4c36f474ea9 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,6 +3,7 @@ // #[PerformanceCriticalPath] use kvproto::{ + metapb, 
raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, raft_serverpb::RaftMessage, }; @@ -15,7 +16,7 @@ use super::{ }, ApplyRes, }; -use crate::operation::{SimpleWriteBinary, SplitInit}; +use crate::operation::{RequestSplit, SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -164,6 +165,15 @@ pub enum PeerMsg { StoreUnreachable { to_store_id: u64, }, + /// Reports whether the snapshot sending is successful or not. + SnapshotSent { + to_peer_id: u64, + status: raft::SnapshotStatus, + }, + RequestSplit { + request: RequestSplit, + ch: CmdResChannel, + }, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), @@ -195,6 +205,25 @@ impl PeerMsg { sub, ) } + + pub fn request_split( + epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::RequestSplit { + request: RequestSplit { + epoch, + split_keys, + source: source.into_boxed_str(), + }, + ch, + }, + sub, + ) + } } #[derive(Debug)] diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 15bb2e73ff8..18b01a8026a 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -16,7 +16,10 @@ use slog::{error, info, Logger}; use tikv_util::{time::UnixSecs, worker::Runnable}; use yatp::{task::future::TaskCell, Remote}; -use crate::{batch::StoreRouter, router::PeerMsg}; +use crate::{ + batch::StoreRouter, + router::{CmdResChannel, PeerMsg}, +}; mod region_heartbeat; mod split; @@ -39,6 +42,7 @@ pub enum Task { split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + ch: CmdResChannel, }, ReportBatchSplit { regions: Vec, @@ -174,7 +178,8 @@ where split_keys, peer, right_derive, - } => self.handle_ask_batch_split(region, split_keys, peer, right_derive), + ch, + } => self.handle_ask_batch_split(region, split_keys, peer, 
right_derive, ch), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::UpdateMaxTimestamp { region_id, @@ -208,6 +213,7 @@ mod requests { use raft::eraftpb::ConfChangeType; use super::*; + use crate::router::RaftRequest; pub fn send_admin_request( logger: &Logger, @@ -216,6 +222,7 @@ mod requests { epoch: metapb::RegionEpoch, peer: metapb::Peer, request: AdminRequest, + ch: Option, ) where EK: KvEngine, ER: RaftEngine, @@ -228,7 +235,10 @@ mod requests { req.mut_header().set_peer(peer); req.set_admin_request(request); - let (msg, _) = PeerMsg::admin_command(req); + let msg = match ch { + Some(ch) => PeerMsg::AdminCommand(RaftRequest::new(req, ch)), + None => PeerMsg::admin_command(req).0, + }; if let Err(e) = router.send(region_id, msg) { error!( logger, diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs index 4096467087a..31f84801ed2 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs @@ -184,7 +184,7 @@ where change_peer.get_change_type(), change_peer.take_peer(), ); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_change_peer_v2() { PD_HEARTBEAT_COUNTER_VEC .with_label_values(&["change peer"]) @@ -198,7 +198,7 @@ where "changes" => ?change_peer_v2.get_changes(), ); let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_transfer_leader() { PD_HEARTBEAT_COUNTER_VEC .with_label_values(&["transfer leader"]) @@ -217,7 +217,7 @@ where transfer_leader.take_peer(), transfer_leader.take_peers().into(), ); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + 
send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_split_region() { // TODO info!(logger, "pd asks for split but ignored"); diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index 3cb85f6698c..cb7c3ad9308 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -9,6 +9,7 @@ use pd_client::PdClient; use slog::{info, warn}; use super::{requests::*, Runner}; +use crate::router::CmdResChannel; fn new_batch_split_region_request( split_keys: Vec>, @@ -42,6 +43,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + ch: CmdResChannel, ) { if split_keys.is_empty() { info!(self.logger, "empty split key, skip ask batch split"; @@ -71,7 +73,7 @@ where ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, Some(ch)); } Err(e) => { warn!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 311e7e58a12..bad3ac2077d 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5553,7 +5553,34 @@ where "split_keys" => %KeysInfoFormatter(split_keys.iter()), "source" => source, ); - if let Err(e) = self.validate_split_region(®ion_epoch, &split_keys) { + + if !self.fsm.peer.is_leader() { + // region on this store is no longer leader, skipped. 
+ info!( + "not leader, skip proposing split"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + cb.invoke_with_response(new_error(Error::NotLeader( + self.region_id(), + self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.fsm.region_id(), + self.fsm.peer_id(), + self.region(), + ®ion_epoch, + &split_keys, + ) { + info!( + "invalid split request"; + "err" => ?e, + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "source" => %source + ); cb.invoke_with_response(new_error(e)); return; } @@ -5583,70 +5610,6 @@ where } } - fn validate_split_region( - &mut self, - epoch: &metapb::RegionEpoch, - split_keys: &[Vec], - ) -> Result<()> { - if split_keys.is_empty() { - error!( - "no split key is specified."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!("{} no split key is specified.", self.fsm.peer.tag)); - } - for key in split_keys { - if key.is_empty() { - error!( - "split key should not be empty!!!"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!( - "{} split key should not be empty", - self.fsm.peer.tag - )); - } - } - if !self.fsm.peer.is_leader() { - // region on this store is no longer leader, skipped. - info!( - "not leader, skip."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(Error::NotLeader( - self.region_id(), - self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), - )); - } - - let region = self.fsm.peer.region(); - let latest_epoch = region.get_region_epoch(); - - // This is a little difference for `check_region_epoch` in region split case. - // Here we just need to check `version` because `conf_ver` will be update - // to the latest value of the peer, and then send to PD. 
- if latest_epoch.get_version() != epoch.get_version() { - info!( - "epoch changed, retry later"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "prev_epoch" => ?region.get_region_epoch(), - "epoch" => ?epoch, - ); - return Err(Error::EpochNotMatch( - format!( - "{} epoch changed {:?} != {:?}, retry later", - self.fsm.peer.tag, latest_epoch, epoch - ), - vec![region.to_owned()], - )); - } - Ok(()) - } - fn on_approximate_region_size(&mut self, size: u64) { self.fsm.peer.approximate_size = Some(size); self.register_split_region_check_tick(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 78f024997cf..2d27b56fda5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1621,6 +1621,47 @@ impl LatencyInspector { } } +pub fn validate_split_region( + region_id: u64, + peer_id: u64, + region: &Region, + epoch: &RegionEpoch, + split_keys: &[Vec], +) -> Result<()> { + if split_keys.is_empty() { + return Err(box_err!( + "[region {}] {} no split key is specified.", + region_id, + peer_id + )); + } + + let latest_epoch = region.get_region_epoch(); + // This is a little difference for `check_region_epoch` in region split case. + // Here we just need to check `version` because `conf_ver` will be update + // to the latest value of the peer, and then send to PD. 
+ if latest_epoch.get_version() != epoch.get_version() { + return Err(Error::EpochNotMatch( + format!( + "[region {}] {} epoch changed {:?} != {:?}, retry later", + region_id, peer_id, latest_epoch, epoch + ), + vec![region.to_owned()], + )); + } + for key in split_keys { + if key.is_empty() { + return Err(box_err!( + "[region {}] {} split key should not be empty", + region_id, + peer_id + )); + } + check_key_in_region(key, region)?; + } + Ok(()) +} + #[cfg(test)] mod tests { use std::thread; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 470e3a41861..ffc5272c673 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -92,7 +92,6 @@ use tikv::{ read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, - create_raft_storage, gc_worker::{AutoGcConfig, GcWorker}, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, @@ -108,7 +107,7 @@ use tikv::{ config_manager::StorageConfigManger, mvcc::MvccConsistencyCheckObserver, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -797,7 +796,7 @@ where storage_read_pools.handle() }; - let storage = create_raft_storage::<_, _, _, F, _>( + let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), &self.config.storage, storage_read_pool_handle, @@ -825,7 +824,7 @@ where let (resolver, state) = resolve::new_resolver( self.pd_client.clone(), &self.background_worker, - storage.get_engine().raft_extension().clone(), + storage.get_engine().raft_extension(), ); self.resolver = Some(resolver); @@ -1216,7 +1215,7 @@ where self.kv_statistics.clone(), self.raft_statistics.clone(), servers.server.get_debug_thread_pool().clone(), - engines.engine.raft_extension().clone(), + engines.engine.raft_extension(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers diff --git a/components/test_raftstore/src/server.rs 
b/components/test_raftstore/src/server.rs index e3cfb298c59..12d9982fea6 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -52,7 +52,6 @@ use tikv::{ import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ - create_raft_storage, gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -66,7 +65,7 @@ use tikv::{ self, kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -401,8 +400,8 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); - let extension = engine.raft_extension().clone(); - let store = create_raft_storage::<_, _, _, F, _>( + let extension = engine.raft_extension(); + let store = Storage::<_, _, F>::from_engine( engine, &cfg.storage, storage_read_pool.handle(), diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index bf277282bd8..5af54ee61b6 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -311,7 +311,7 @@ pub trait Engine: Send + Clone + 'static { type RaftExtension: raft_extension::RaftExtension = FakeExtension; /// Get the underlying raft extension. 
- fn raft_extension(&self) -> &Self::RaftExtension { + fn raft_extension(&self) -> Self::RaftExtension { unimplemented!() } diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index dc812e84d93..69a61d58963 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -154,7 +154,7 @@ impl Engine for MockEngine { } type RaftExtension = ::RaftExtension; - fn raft_extension(&self) -> &Self::RaftExtension { + fn raft_extension(&self) -> Self::RaftExtension { self.base.raft_extension() } diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 065766ae254..21099974d2d 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -233,8 +233,8 @@ impl Engine for RocksEngine { } type RaftExtension = RE; - fn raft_extension(&self) -> &Self::RaftExtension { - &self.ext + fn raft_extension(&self) -> Self::RaftExtension { + self.ext.clone() } fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { diff --git a/src/server/mod.rs b/src/server/mod.rs index 0e4a3616a6c..0bb6da62ac7 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -32,10 +32,11 @@ pub use self::{ config::{Config, ServerConfigManager, DEFAULT_CLUSTER_ID, DEFAULT_LISTENING_ADDR}, errors::{Error, Result}, metrics::{CONFIG_ROCKSDB_GAUGE, CPU_CORES_QUOTA_GAUGE, MEM_TRACE_SUM_GAUGE}, - node::{create_raft_storage, Node}, + node::Node, proxy::{build_forward_option, get_target_address, Proxy}, raft_client::{ConnectionBuilder, RaftClient}, raftkv::RaftKv, + raftkv2::{NodeV2, RaftKv2}, resolve::{PdStoreAddrResolver, StoreAddrResolver}, server::{Server, GRPC_THREAD_PREFIX}, transport::ServerTransport, diff --git a/src/server/node.rs b/src/server/node.rs index 0b654921f59..e36e980e1d3 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -6,7 +6,7 @@ use std::{ time::Duration, }; -use api_version::{api_v2::TIDB_RANGES_COMPLEMENT, 
KvFormat}; +use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; use engine_traits::{Engines, Iterable, KvEngine, RaftEngine, DATA_CFS, DATA_KEY_PREFIX_LEN}; @@ -14,10 +14,9 @@ use grpcio_health::HealthService; use kvproto::{ kvrpcpb::ApiVersion, metapb, raft_serverpb::StoreIdent, replication_modepb::ReplicationStatus, }; -use pd_client::{Error as PdError, FeatureGate, PdClient, INVALID_ID}; +use pd_client::{Error as PdError, PdClient, INVALID_ID}; use raftstore::{ coprocessor::dispatcher::CoprocessorHost, - router::{LocalReadRouter, RaftStoreRouter}, store::{ self, fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, @@ -25,68 +24,69 @@ use raftstore::{ RefreshConfigTask, SnapManager, SplitCheckTask, Transport, }, }; -use resource_metering::{CollectorRegHandle, ResourceTagFactory}; +use resource_metering::CollectorRegHandle; use tikv_util::{ config::VersionTrack, - quota_limiter::QuotaLimiter, worker::{LazyWorker, Scheduler, Worker}, }; -use super::{RaftKv, Result}; -use crate::{ - import::SstImporter, - read_pool::ReadPoolHandle, - server::Config as ServerConfig, - storage::{ - config::Config as StorageConfig, kv::FlowStatsReporter, lock_manager, - txn::flow_controller::FlowController, DynamicConfigs as StorageDynamicConfigs, Storage, - }, -}; +use super::Result; +use crate::{import::SstImporter, server::Config as ServerConfig}; const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); -/// Creates a new storage engine which is backed by the Raft consensus -/// protocol. 
-pub fn create_raft_storage< - S, - EK, - R: FlowStatsReporter, - F: KvFormat, - LM: lock_manager::LockManager, ->( - engine: RaftKv, - cfg: &StorageConfig, - read_pool: ReadPoolHandle, - lock_mgr: LM, - concurrency_manager: ConcurrencyManager, - dynamic_configs: StorageDynamicConfigs, - flow_controller: Arc, - reporter: R, - resource_tag_factory: ResourceTagFactory, - quota_limiter: Arc, - feature_gate: FeatureGate, - causal_ts_provider: Option>, -) -> Result, LM, F>> -where - S: RaftStoreRouter + LocalReadRouter + 'static, - EK: KvEngine, -{ - let store = Storage::from_engine( - engine, - cfg, - read_pool, - lock_mgr, - concurrency_manager, - dynamic_configs, - flow_controller, - reporter, - resource_tag_factory, - quota_limiter, - feature_gate, - causal_ts_provider, - )?; - Ok(store) +pub(crate) fn init_store(store: Option, cfg: &ServerConfig) -> metapb::Store { + let mut store = store.unwrap_or_default(); + store.set_id(INVALID_ID); + if store.get_address().is_empty() { + if cfg.advertise_addr.is_empty() { + store.set_address(cfg.addr.clone()); + if store.get_peer_address().is_empty() { + store.set_peer_address(cfg.addr.clone()); + } + } else { + store.set_address(cfg.advertise_addr.clone()); + if store.get_peer_address().is_empty() { + store.set_peer_address(cfg.advertise_addr.clone()); + } + } + } + if store.get_status_address().is_empty() { + if cfg.advertise_status_addr.is_empty() { + store.set_status_address(cfg.status_addr.clone()); + } else { + store.set_status_address(cfg.advertise_status_addr.clone()) + } + } + if store.get_version().is_empty() { + store.set_version(env!("CARGO_PKG_VERSION").to_string()); + } + + if let Ok(path) = std::env::current_exe() { + if let Some(path) = path.parent() { + store.set_deploy_path(path.to_string_lossy().to_string()); + } + }; + + store.set_start_timestamp(chrono::Local::now().timestamp()); + if store.get_git_hash().is_empty() { + store.set_git_hash( + option_env!("TIKV_BUILD_GIT_HASH") + .unwrap_or("Unknown git 
hash") + .to_string(), + ); + } + + let mut labels = Vec::new(); + for (k, v) in &cfg.labels { + let mut label = metapb::StoreLabel::default(); + label.set_key(k.to_owned()); + label.set_value(v.to_owned()); + labels.push(label); + } + store.set_labels(labels.into()); + store } /// A wrapper for the raftstore which runs Multi-Raft. @@ -123,58 +123,7 @@ where health_service: Option, default_store: Option, ) -> Node { - let mut store = match default_store { - None => metapb::Store::default(), - Some(s) => s, - }; - store.set_id(INVALID_ID); - if store.get_address().is_empty() { - if cfg.advertise_addr.is_empty() { - store.set_address(cfg.addr.clone()); - if store.get_peer_address().is_empty() { - store.set_peer_address(cfg.addr.clone()); - } - } else { - store.set_address(cfg.advertise_addr.clone()); - if store.get_peer_address().is_empty() { - store.set_peer_address(cfg.advertise_addr.clone()); - } - } - } - if store.get_status_address().is_empty() { - if cfg.advertise_status_addr.is_empty() { - store.set_status_address(cfg.status_addr.clone()); - } else { - store.set_status_address(cfg.advertise_status_addr.clone()) - } - } - if store.get_version().is_empty() { - store.set_version(env!("CARGO_PKG_VERSION").to_string()); - } - - if let Ok(path) = std::env::current_exe() { - if let Some(path) = path.parent() { - store.set_deploy_path(path.to_string_lossy().to_string()); - } - }; - - store.set_start_timestamp(chrono::Local::now().timestamp()); - if store.get_git_hash().is_empty() { - store.set_git_hash( - option_env!("TIKV_BUILD_GIT_HASH") - .unwrap_or("Unknown git hash") - .to_string(), - ); - } - - let mut labels = Vec::new(); - for (k, v) in &cfg.labels { - let mut label = metapb::StoreLabel::default(); - label.set_key(k.to_owned()); - label.set_value(v.to_owned()); - labels.push(label); - } - store.set_labels(labels.into()); + let store = init_store(default_store, cfg); Node { cluster_id: cfg.cluster_id, diff --git a/src/server/raftkv/mod.rs 
b/src/server/raftkv/mod.rs index 607d5af71f3..c50c42c9fc6 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -364,8 +364,8 @@ where type RaftExtension = RaftRouterWrap; #[inline] - fn raft_extension(&self) -> &Self::RaftExtension { - &self.router + fn raft_extension(&self) -> Self::RaftExtension { + self.router.clone() } fn modify_on_kv_engine( diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 5bcdd131d72..f850cc74d19 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -1,89 +1,36 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{mem, pin::Pin, task::Poll}; +mod node; +mod raft_extension; +use std::{ + mem, + pin::Pin, + sync::{Arc, RwLock}, + task::Poll, +}; + +use collections::HashSet; use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; use futures::{Future, Stream, StreamExt}; -use kvproto::{ - raft_cmdpb::{CmdType, RaftCmdRequest, Request}, - raft_serverpb::RaftMessage, -}; +use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +pub use node::NodeV2; use raftstore::store::RegionSnapshot; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, }, - SimpleWriteEncoder, StoreRouter, + SimpleWriteEncoder, }; -use tikv_kv::{Modify, RaftExtension, WriteEvent}; +use tikv_kv::{Modify, WriteEvent}; use tikv_util::{codec::number::NumberEncoder, time::Instant}; -use txn_types::WriteBatchFlags; +use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ metrics::{ASYNC_REQUESTS_COUNTER_VEC, ASYNC_REQUESTS_DURATIONS_VEC}, raftkv::{get_status_kind_from_engine_error, new_request_header}, }; -#[derive(Clone)] -pub struct RaftExtensionImpl { - router: StoreRouter, -} - -impl RaftExtension for RaftExtensionImpl { - #[inline] - fn feed(&self, msg: RaftMessage, key_message: bool) { - let region_id = msg.get_region_id(); - let msg_ty = msg.get_message().get_msg_type(); - // Channel full and region 
not found are ignored unless it's a key message. - if let Err(e) = self.router.send_raft_message(Box::new(msg)) && key_message { - error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); - } - } - - fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) { - // TODO:reject the message on connection side instead of go through - // raft layer. - } - - fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { - let _ = self - .router - .send(region_id, PeerMsg::PeerUnreachable { to_peer_id }); - } - - fn report_store_unreachable(&self, _store_id: u64) {} - - fn report_snapshot_status( - &self, - _region_id: u64, - _to_peer_id: u64, - _status: raft::SnapshotStatus, - ) { - } - - fn report_resolved(&self, _store_id: u64, _group_id: u64) {} - - fn split( - &self, - _region_id: u64, - _region_epoch: kvproto::metapb::RegionEpoch, - _split_keys: Vec>, - _source: String, - ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { - Box::pin(async move { Err(box_err!("raft split is not supported")) }) - } - - fn query_region( - &self, - _region_id: u64, - ) -> futures::future::BoxFuture< - 'static, - tikv_kv::Result, - > { - Box::pin(async move { Err(box_err!("query region is not supported")) }) - } -} - struct Transform { resp: CmdResStream, early_err: Option, @@ -123,12 +70,25 @@ impl Stream for Transform { #[derive(Clone)] pub struct RaftKv2 { router: RaftRouter, + txn_extra_scheduler: Option>, + region_leaders: Arc>>, } impl RaftKv2 { #[allow(unused)] - pub fn new(router: RaftRouter) -> RaftKv2 { - RaftKv2 { router } + pub fn new( + router: RaftRouter, + region_leaders: Arc>>, + ) -> RaftKv2 { + RaftKv2 { + router, + region_leaders, + txn_extra_scheduler: None, + } + } + + pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { + self.txn_extra_scheduler = Some(txn_extra_scheduler); } } @@ -141,7 +101,11 @@ impl tikv_kv::Engine for RaftKv2 { None } - type RaftExtension = RaftExtensionImpl; 
+ type RaftExtension = raft_extension::Extension; + #[inline] + fn raft_extension(&self) -> Self::RaftExtension { + raft_extension::Extension::new(self.router.store_router().clone()) + } fn modify_on_kv_engine( &self, @@ -282,8 +246,8 @@ impl tikv_kv::Engine for RaftKv2 { let res = self .router .store_router() - .send(region_id, msg) - .map_err(|e| tikv_kv::Error::from(raftstore_v2::Error::from(e))); + .check_send(region_id, msg) + .map_err(tikv_kv::Error::from); (Transform { resp: CmdResStream::new(sub), early_err: res.err(), @@ -304,4 +268,22 @@ impl tikv_kv::Engine for RaftKv2 { } }) } + + #[inline] + fn precheck_write_with_ctx(&self, ctx: &kvproto::kvrpcpb::Context) -> tikv_kv::Result<()> { + let region_id = ctx.get_region_id(); + match self.region_leaders.read().unwrap().get(®ion_id) { + Some(_) => Ok(()), + None => Err(raftstore_v2::Error::NotLeader(region_id, None).into()), + } + } + + #[inline] + fn schedule_txn_extra(&self, txn_extra: TxnExtra) { + if let Some(tx) = self.txn_extra_scheduler.as_ref() { + if !txn_extra.is_empty() { + tx.schedule(txn_extra); + } + } + } } diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs new file mode 100644 index 00000000000..59daa053aa3 --- /dev/null +++ b/src/server/raftkv2/node.rs @@ -0,0 +1,210 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::{Arc, Mutex}; + +use causal_ts::CausalTsProviderImpl; +use concurrency_manager::ConcurrencyManager; +use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; +use kvproto::{metapb, replication_modepb::ReplicationStatus}; +use pd_client::PdClient; +use raftstore::store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}; +use raftstore_v2::{router::RaftRouter, Bootstrap, StoreSystem}; +use slog::{info, o, Logger}; +use tikv_util::{config::VersionTrack, worker::Worker}; + +use crate::server::{node::init_store, Result}; + +// TODO: we will rename another better name like RaftStore later. +pub struct NodeV2 { + cluster_id: u64, + store: metapb::Store, + store_cfg: Arc>, + system: StoreSystem, + has_started: bool, + + pd_client: Arc, + state: Arc>, + bg_worker: Worker, + registry: TabletRegistry, + logger: Logger, +} + +impl NodeV2 +where + C: PdClient, + EK: KvEngine, + ER: RaftEngine, +{ + /// Creates a new Node. + pub fn new( + system: StoreSystem, + cfg: &crate::server::Config, + store_cfg: Arc>, + pd_client: Arc, + state: Arc>, + bg_worker: Worker, + store: Option, + registry: TabletRegistry, + ) -> NodeV2 { + let store = init_store(store, cfg); + + NodeV2 { + cluster_id: cfg.cluster_id, + store, + store_cfg, + pd_client, + system, + has_started: false, + state, + bg_worker, + registry, + logger: slog_global::borrow_global().new(o!()), + } + } + + pub fn try_bootstrap_store(&mut self, raft_engine: &ER) -> Result<()> { + let store_id = Bootstrap::new( + raft_engine, + self.cluster_id, + &*self.pd_client, + self.logger.clone(), + ) + .bootstrap_store()?; + self.store.set_id(store_id); + Ok(()) + } + + /// Starts the Node. It tries to bootstrap cluster if the cluster is not + /// bootstrapped yet. Then it spawns a thread to run the raftstore in + /// background. 
+ pub fn start( + &mut self, + raft_engine: ER, + trans: T, + router: &RaftRouter, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + ) -> Result<()> + where + T: Transport + 'static, + { + let store_id = self.id(); + { + let mut meta = router.store_meta().lock().unwrap(); + meta.store_id = Some(store_id); + } + if let Some(region) = Bootstrap::new( + &raft_engine, + self.cluster_id, + &*self.pd_client, + self.logger.clone(), + ) + .bootstrap_first_region(&self.store, store_id)? + { + let path = self + .registry + .tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); + // TODO: make follow line can recover from abort. + self.registry + .tablet_factory() + .open_tablet(ctx, &path) + .unwrap(); + } + + // Put store only if the cluster is bootstrapped. + info!(self.logger, "put store to PD"; "store" => ?&self.store); + let status = self.pd_client.put_store(self.store.clone())?; + self.load_all_stores(status); + + self.start_store( + raft_engine, + trans, + router, + snap_mgr, + concurrency_manager, + causal_ts_provider, + )?; + + Ok(()) + } + + /// Gets the store id. + pub fn id(&self) -> u64 { + self.store.get_id() + } + + /// Gets a copy of Store which is registered to Pd. + pub fn store(&self) -> metapb::Store { + self.store.clone() + } + + // TODO: support updating dynamic configuration. + + // TODO: check api version. + // Do we really need to do the check giving we don't consider support upgrade + // ATM? 
+ + fn load_all_stores(&mut self, status: Option) { + info!(self.logger, "initializing replication mode"; "status" => ?status, "store_id" => self.store.id); + let stores = match self.pd_client.get_all_stores(false) { + Ok(stores) => stores, + Err(e) => panic!("failed to load all stores: {:?}", e), + }; + let mut state = self.state.lock().unwrap(); + if let Some(s) = status { + state.set_status(s); + } + for mut store in stores { + state + .group + .register_store(store.id, store.take_labels().into()); + } + } + + fn start_store( + &mut self, + raft_engine: ER, + trans: T, + router: &RaftRouter, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + ) -> Result<()> + where + T: Transport + 'static, + { + let store_id = self.store.get_id(); + info!(self.logger, "start raft store thread"; "store_id" => store_id); + + if self.has_started { + return Err(box_err!("{} is already started", store_id)); + } + self.has_started = true; + let cfg = self.store_cfg.clone(); + + self.system.start( + store_id, + cfg, + raft_engine, + self.registry.clone(), + trans, + self.pd_client.clone(), + router.store_router(), + router.store_meta().clone(), + snap_mgr, + concurrency_manager, + causal_ts_provider, + )?; + Ok(()) + } + + /// Stops the Node. + pub fn stop(&mut self) { + let store_id = self.store.get_id(); + info!(self.logger, "stop raft store thread"; "store_id" => store_id); + self.system.shutdown(); + self.bg_worker.stop(); + } +} diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs new file mode 100644 index 00000000000..f2f433999b9 --- /dev/null +++ b/src/server/raftkv2/raft_extension.rs @@ -0,0 +1,109 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_serverpb::RaftMessage; +use raftstore_v2::{ + router::{DebugInfoChannel, PeerMsg, StoreMsg}, + StoreRouter, +}; + +#[derive(Clone)] +pub struct Extension { + router: StoreRouter, +} + +impl Extension { + pub fn new(router: StoreRouter) -> Self { + Extension { router } + } +} + +impl tikv_kv::RaftExtension for Extension { + #[inline] + fn feed(&self, msg: RaftMessage, key_message: bool) { + let region_id = msg.get_region_id(); + let msg_ty = msg.get_message().get_msg_type(); + // Channel full and region not found are ignored unless it's a key message. + if let Err(e) = self.router.send_raft_message(Box::new(msg)) && key_message { + error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); + } + } + + #[inline] + fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) { + // TODO:reject the message on connection side instead of go through + // raft layer. + } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + let _ = self + .router + .send(region_id, PeerMsg::PeerUnreachable { to_peer_id }); + } + + #[inline] + fn report_store_unreachable(&self, to_store_id: u64) { + let _ = self + .router + .send_control(StoreMsg::StoreUnreachable { to_store_id }); + } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: raft::SnapshotStatus, + ) { + let _ = self + .router + .force_send(region_id, PeerMsg::SnapshotSent { to_peer_id, status }); + } + + fn report_resolved(&self, _store_id: u64, _group_id: u64) { + // TODO: support commit group + } + + fn split( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { + let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source); + let res = self.router.check_send(region_id, msg); + Box::pin(async move { + res?; + let mut 
resp = match sub.result().await { + Some(r) => r, + None => return Err(box_err!("split is aborted")), + }; + if !resp.get_header().has_error() { + let regions = resp.mut_admin_response().mut_splits().take_regions(); + Ok(regions.into()) + } else { + Err(tikv_kv::Error::from(resp.mut_header().take_error())) + } + }) + } + + fn query_region( + &self, + region_id: u64, + ) -> futures::future::BoxFuture< + 'static, + tikv_kv::Result, + > { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + let res = self.router.check_send(region_id, msg); + Box::pin(async move { + res?; + match sub.result().await { + Some(res) => Ok(res), + None => Err(box_err!("query region is aborted")), + } + }) + } +} diff --git a/src/server/server.rs b/src/server/server.rs index 1921483e37b..428aee31090 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -122,7 +122,7 @@ where let snap_worker = Worker::new("snap-handler"); let lazy_worker = snap_worker.lazy_build("snap-handler"); - let raft_ext = storage.get_engine().raft_extension().clone(); + let raft_ext = storage.get_engine().raft_extension(); let proxy = Proxy::new(security_mgr.clone(), &env, Arc::new(cfg.value().clone())); let kv_service = KvService::new( diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index db50dfe459e..88ed0c99443 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -590,7 +590,7 @@ impl Tikv for Service { sink: ClientStreamingSink, ) { let store_id = self.store_id; - let ch = self.storage.get_engine().raft_extension().clone(); + let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -633,7 +633,7 @@ impl Tikv for Service { ) { info!("batch_raft RPC is called, new gRPC stream established"); let store_id = self.store_id; - let ch = self.storage.get_engine().raft_extension().clone(); + let ch = self.storage.get_engine().raft_extension(); let 
reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { From 186e242b5169b9cbc932ce5f6fb657108650a470 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 21 Dec 2022 20:28:55 +0800 Subject: [PATCH 419/676] raftstore-v2: implement a simplified version of CoprocessorHost (#13901) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang Signed-off-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Co-authored-by: Xinye Tao --- components/raftstore-v2/src/batch/store.rs | 30 ++++++++++--- components/raftstore-v2/src/fsm/mod.rs | 2 +- components/raftstore-v2/src/fsm/store.rs | 16 ++++++- components/raftstore-v2/src/lib.rs | 2 +- .../operation/command/admin/conf_change.rs | 8 ++++ .../src/operation/command/admin/split.rs | 13 +++++- components/raftstore-v2/src/operation/life.rs | 17 +++++++- .../raftstore-v2/src/operation/ready/mod.rs | 19 +++++++- .../src/operation/ready/snapshot.rs | 16 +++++-- components/raftstore-v2/src/raft/peer.rs | 43 ++++++++++++++++--- .../tests/integrations/cluster.rs | 22 +++++++--- src/server/lock_manager/deadlock.rs | 15 ++++++- src/server/lock_manager/mod.rs | 5 +++ src/server/raftkv2/node.rs | 6 ++- 14 files changed, 183 insertions(+), 31 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 98075969c66..8a1f60f3717 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -20,10 +20,13 @@ use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType}; use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; use pd_client::PdClient; -use raft::INVALID_ID; -use raftstore::store::{ - fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, - StoreWriters, TabletSnapManager, Transport, WriteSenders, +use raft::{StateRole, INVALID_ID}; +use raftstore::{ + 
coprocessor::RegionChangeEvent, + store::{ + fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, + }, }; use slog::Logger; use tikv_util::{ @@ -39,7 +42,10 @@ use tikv_util::{ use time::Timespec; use crate::{ - fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, + fsm::{ + LockManagerNotifier, PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, + StoreMeta, + }, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::pd, @@ -76,6 +82,8 @@ pub struct StoreContext { pub snap_mgr: TabletSnapManager, pub pd_scheduler: Scheduler, + + pub lock_manager_notifier: Arc, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. @@ -229,6 +237,7 @@ struct StorePollerBuilder { logger: Logger, store_meta: Arc>, snap_mgr: TabletSnapManager, + lock_manager_notifier: Arc, } impl StorePollerBuilder { @@ -245,6 +254,7 @@ impl StorePollerBuilder { logger: Logger, store_meta: Arc>, snap_mgr: TabletSnapManager, + lock_manager_notifier: Arc, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -270,6 +280,7 @@ impl StorePollerBuilder { write_senders: store_writers.senders(), store_meta, snap_mgr, + lock_manager_notifier, } } @@ -291,6 +302,12 @@ impl StorePollerBuilder { Some(p) => p, None => return Ok(()), }; + self.lock_manager_notifier.on_region_changed( + storage.region_state().get_region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + let (sender, peer_fsm) = PeerFsm::new(&cfg, &self.tablet_registry, storage)?; meta.region_read_progress .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); @@ -345,6 +362,7 @@ where self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), pd_scheduler: self.pd_scheduler.clone(), + lock_manager_notifier: self.lock_manager_notifier.clone(), }; let cfg_tracker = 
self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) @@ -392,6 +410,7 @@ impl StoreSystem { snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + lock_manager_notifier: Arc, ) -> Result<()> where T: Transport + 'static, @@ -445,6 +464,7 @@ impl StoreSystem { self.logger.clone(), store_meta.clone(), snap_mgr, + lock_manager_notifier, ); self.workers = Some(workers); let peers = builder.init()?; diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index b3d0e0483ba..442c6b050ce 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -11,4 +11,4 @@ mod store; pub use apply::{ApplyFsm, ApplyResReporter, ApplyScheduler}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; -pub use store::{Store, StoreFsm, StoreFsmDelegate, StoreMeta}; +pub use store::{LockManagerNotifier, Store, StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index bd31de69496..6e2dfe4a75f 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -6,7 +6,12 @@ use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; -use raftstore::store::{Config, ReadDelegate, RegionReadProgressRegistry}; +use kvproto::metapb::Region; +use raft::StateRole; +use raftstore::{ + coprocessor::{RegionChangeEvent, RoleChange}, + store::{Config, ReadDelegate, RegionReadProgressRegistry}, +}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -165,3 +170,12 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { } } } + +// A simplified version of CoprocessorHost used to convey information to +// LockManager only. +// It is replaced by CoprocessorHost in the future. 
+pub trait LockManagerNotifier: Send + Sync { + fn on_role_change(&self, region: &Region, role_change: RoleChange); + + fn on_region_changed(&self, region: &Region, event: RegionChangeEvent, role: StateRole); +} diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index d8327549da6..cb769b6594a 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -38,6 +38,6 @@ mod worker; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; -pub use fsm::StoreMeta; +pub use fsm::{LockManagerNotifier, StoreMeta}; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 4bda7eedf32..ec0b78e717a 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -18,6 +18,7 @@ use kvproto::{ use protobuf::Message; use raft::prelude::*; use raftstore::{ + coprocessor::{RegionChangeEvent, RegionChangeReason}, store::{ metrics::{PEER_ADMIN_CMD_COUNTER_VEC, PEER_PROPOSE_LOG_SIZE_HISTOGRAM}, util::{self, ChangePeerI, ConfChangeKind}, @@ -146,6 +147,13 @@ impl Peer { let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; self.storage_mut() .set_region_state(conf_change.region_state); + + ctx.lock_manager_notifier.on_region_changed( + self.region(), + RegionChangeEvent::Update(RegionChangeReason::ChangePeer), + self.get_role(), + ); + if self.is_leader() { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 8bf23da0fd6..870c203f07d 100644 --- 
a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -40,7 +40,10 @@ use kvproto::{ use protobuf::Message; use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ - coprocessor::split_observer::{is_valid_split_key, strip_timestamp_if_exists}, + coprocessor::{ + split_observer::{is_valid_split_key, strip_timestamp_if_exists}, + RegionChangeReason, + }, store::{ cmd_resp, fsm::apply::validate_batch_split, @@ -415,7 +418,13 @@ impl Peer { { let mut meta = store_ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); - self.set_region(reader, derived.clone(), tablet_index); + self.set_region( + &store_ctx.lock_manager_notifier, + reader, + derived.clone(), + RegionChangeReason::Split, + tablet_index, + ); } self.post_split(); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index d9f706c32a1..73db4e760d1 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -19,7 +19,10 @@ use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; -use raftstore::store::{util, WriteTask}; +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{util, WriteTask}, +}; use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; @@ -291,7 +294,11 @@ impl Peer { /// /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. 
- pub fn start_destroy(&mut self, write_task: &mut WriteTask) { + pub fn start_destroy( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + ) { let entry_storage = self.storage().entry_storage(); if self.postponed_destroy() { return; @@ -319,6 +326,12 @@ impl Peer { lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); self.destroy_progress_mut().start(); + + ctx.lock_manager_notifier.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.get_role(), + ); } /// Do clean up for destroy. The peer is permanently destroyed when diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index f9a6c3a34d4..9463aae3d73 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -29,7 +29,10 @@ use error_code::ErrorCodeExt; use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; -use raftstore::store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}; +use raftstore::{ + coprocessor::RoleChange, + store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}, +}; use slog::{debug, error, trace, warn}; use tikv_util::{ store::find_peer, @@ -68,6 +71,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, if self.fsm.peer_mut().tick() { self.fsm.peer_mut().set_has_ready(); } + self.fsm.peer_mut().refresh_lead_transferee(); + self.schedule_tick(PeerTick::Raft); } } @@ -383,7 +388,7 @@ impl Peer { .collect(); } if !self.serving() { - self.start_destroy(&mut write_task); + self.start_destroy(ctx, &mut write_task); } // Ready number should increase monotonically. 
assert!(self.async_writer.known_largest_number() < ready.number()); @@ -517,8 +522,18 @@ impl Peer { } _ => {} } + ctx.lock_manager_notifier.on_role_change( + self.region(), + RoleChange { + state: ss.raft_state, + leader_id: ss.leader_id, + prev_lead_transferee: self.lead_transferee(), + vote: self.raft_group().raft.vote, + }, + ); self.proposal_control_mut().maybe_update_term(term); } + self.refresh_lead_transferee(); } /// If leader commits new admin commands, it may break lease assumption. So diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 7339df22fa9..6c027517454 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -31,9 +31,12 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletReg use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; use raft::eraftpb::Snapshot; -use raftstore::store::{ - metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{ + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, + TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + }, }; use slog::{error, info, warn}; use tikv_util::box_err; @@ -150,7 +153,14 @@ impl Peer { // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); + self.schedule_apply_fsm(ctx); + ctx.lock_manager_notifier.on_region_changed( + self.region(), + RegionChangeEvent::Create, + self.get_role(), + ); + self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 500b166065f..0e38f0dd5a1 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -14,17 +14,20 @@ use engine_traits::{ use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; -use raftstore::store::{ - util::{Lease, RegionReadProgress}, - Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, - ReadProgress, TxnExt, WriteTask, +use raftstore::{ + coprocessor::{RegionChangeEvent, RegionChangeReason}, + store::{ + util::{Lease, RegionReadProgress}, + Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, + ReadProgress, TxnExt, WriteTask, + }, }; use slog::Logger; use super::storage::Storage; use crate::{ batch::StoreContext, - fsm::ApplyScheduler, + fsm::{ApplyScheduler, LockManagerNotifier}, operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder}, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -67,6 +70,9 @@ pub struct Peer { read_progress: Arc, leader_lease: Lease, + /// lead_transferee if this peer(leader) is in a leadership transferring. + lead_transferee: u64, + /// region buckets. 
region_buckets: Option, last_region_buckets: Option, @@ -155,6 +161,7 @@ impl Peer { txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), + lead_transferee: raft::INVALID_ID, pending_ticks: Vec::new(), split_trace: vec![], state_changes: None, @@ -192,9 +199,10 @@ impl Peer { /// has been preserved in a durable device. pub fn set_region( &mut self, - // host: &CoprocessorHost, + lock_manager_observer: &Arc, reader: &mut ReadDelegate, region: metapb::Region, + reason: RegionChangeReason, tablet_index: u64, ) { if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() @@ -239,7 +247,13 @@ impl Peer { pessimistic_locks.version = self.region().get_region_epoch().get_version(); } - // TODO: CoprocessorHost + if self.serving() { + lock_manager_observer.on_region_changed( + self.region(), + RegionChangeEvent::Update(reason), + self.get_role(), + ); + } } #[inline] @@ -395,6 +409,11 @@ impl Peer { .cloned() } + #[inline] + pub fn get_role(&self) -> StateRole { + self.raft_group.raft.state + } + #[inline] pub fn update_peer_statistics(&mut self) { if !self.is_leader() { @@ -641,6 +660,16 @@ impl Peer { .advance_apply(apply_index, term, region); } + #[inline] + pub fn lead_transferee(&self) -> u64 { + self.lead_transferee + } + + #[inline] + pub fn refresh_lead_transferee(&mut self) { + self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); + } + // TODO: find a better place to put all txn related stuff. 
pub fn require_updating_max_ts(&self, ctx: &StoreContext) { let epoch = self.region().get_region_epoch(); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 732afb38f98..a454b0aa842 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -28,15 +28,18 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use pd_client::RpcClient; -use raft::eraftpb::MessageType; -use raftstore::store::{ - region_meta::{RegionLocalState, RegionMeta}, - Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, +use raft::{eraftpb::MessageType, StateRole}; +use raftstore::{ + coprocessor::{RegionChangeEvent, RoleChange}, + store::{ + region_meta::{RegionLocalState, RegionMeta}, + Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + }, }; use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, + Bootstrap, LockManagerNotifier, SimpleWriteEncoder, StateStorage, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -288,6 +291,7 @@ impl RunningState { snap_mgr.clone(), concurrency_manager, causal_ts_provider, + Arc::new(DummyLockManagerObserver {}), ) .unwrap(); @@ -570,3 +574,11 @@ impl Drop for Cluster { } } } + +struct DummyLockManagerObserver {} + +impl LockManagerNotifier for DummyLockManagerObserver { + fn on_region_changed(&self, _: &metapb::Region, _: RegionChangeEvent, _: StateRole) {} + + fn on_role_change(&self, _: &metapb::Region, _: RoleChange) {} +} diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index 9583df80dd6..a9a31c68b8f 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -28,6 +28,7 @@ use raftstore::{ }, 
store::util::is_region_initialized, }; +use raftstore_v2::LockManagerNotifier; use security::SecurityManager; use tikv_util::{ future::paired_future_callback, @@ -524,7 +525,7 @@ const LEADER_KEY: &[u8] = b""; /// way to change the node from the leader of deadlock detector to follower, and /// vice versa. #[derive(Clone)] -pub(crate) struct RoleChangeNotifier { +pub struct RoleChangeNotifier { /// The id of the valid leader region. // raftstore.coprocessor needs it to be Sync + Send. leader_region_id: Arc>, @@ -606,6 +607,18 @@ impl RegionChangeObserver for RoleChangeNotifier { } } +impl LockManagerNotifier for RoleChangeNotifier { + fn on_role_change(&self, region: &Region, role_change: RoleChange) { + let mut ctx = ObserverContext::new(region); + RoleObserver::on_role_change(self, &mut ctx, &role_change); + } + + fn on_region_changed(&self, region: &Region, event: RegionChangeEvent, role: StateRole) { + let mut ctx = ObserverContext::new(region); + RegionChangeObserver::on_region_changed(self, &mut ctx, event, role); + } +} + struct Inner { /// The role of the deadlock detector. Default is `Role::Follower`. role: Role, diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 243d533a0e5..44c31fcab1e 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -205,6 +205,11 @@ impl LockManager { role_change_notifier.register(host); } + /// Creates a `RoleChangeNotifier` of the deadlock detector worker + pub fn new_notifier(&self) -> RoleChangeNotifier { + RoleChangeNotifier::new(self.detector_scheduler.clone()) + } + /// Creates a `DeadlockService` to handle deadlock detect requests from /// other nodes. 
pub fn deadlock_service(&self) -> DeadlockService { diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index 59daa053aa3..57bc575ff05 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -8,7 +8,7 @@ use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; use raftstore::store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::{router::RaftRouter, Bootstrap, StoreSystem}; +use raftstore_v2::{router::RaftRouter, Bootstrap, LockManagerNotifier, StoreSystem}; use slog::{info, o, Logger}; use tikv_util::{config::VersionTrack, worker::Worker}; @@ -85,6 +85,7 @@ where snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + lock_manager_observer: Arc, ) -> Result<()> where T: Transport + 'static, @@ -125,6 +126,7 @@ where snap_mgr, concurrency_manager, causal_ts_provider, + lock_manager_observer, )?; Ok(()) @@ -171,6 +173,7 @@ where snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + lock_manager_observer: Arc, ) -> Result<()> where T: Transport + 'static, @@ -196,6 +199,7 @@ where snap_mgr, concurrency_manager, causal_ts_provider, + lock_manager_observer, )?; Ok(()) } From fbff71d0026c365dc69bcafb9c6574872c5342be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 22 Dec 2022 11:40:55 +0800 Subject: [PATCH 420/676] log-backup: enhance logs (#13913) close tikv/tikv#13914 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 19 ++- .../backup-stream/src/subscription_manager.rs | 17 ++- .../backup-stream/src/subscription_track.rs | 92 +++++++++++-- components/backup-stream/src/utils.rs | 130 ++++++++++++++++++ 4 files changed, 237 
insertions(+), 21 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index ec6b0dd41fb..c50c70a2eec 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1070,12 +1070,21 @@ pub enum ObserveOp { impl std::fmt::Debug for ObserveOp { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Start { region } => f.debug_struct("Start").field("region", region).finish(), - Self::Stop { region } => f.debug_struct("Stop").field("region", region).finish(), - Self::Destroy { region } => f.debug_struct("Destroy").field("region", region).finish(), + Self::Start { region } => f + .debug_struct("Start") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Stop { region } => f + .debug_struct("Stop") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Destroy { region } => f + .debug_struct("Destroy") + .field("region", &utils::debug_region(region)) + .finish(), Self::RefreshResolver { region } => f .debug_struct("RefreshResolver") - .field("region", region) + .field("region", &utils::debug_region(region)) .finish(), Self::NotifyFailToStartObserve { region, @@ -1083,7 +1092,7 @@ impl std::fmt::Debug for ObserveOp { err, } => f .debug_struct("NotifyFailToStartObserve") - .field("region", region) + .field("region", &utils::debug_region(region)) .field("handle", handle) .field("err", err) .finish(), diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 83181829b43..624392f3df8 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -165,7 +165,7 @@ impl ScanCmd { } = self; let begin = Instant::now_coarse(); let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; - info!("initial scanning of leader transforming finished!"; "takes" => 
?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); + info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); utils::record_cf_stat("default", &stat.data); @@ -414,7 +414,7 @@ where true, false, ) - .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) + .map_err(|err| warn!("check epoch and stop failed."; utils::slog_region(region), "err" => %err)) .is_ok() }); } @@ -455,13 +455,16 @@ where "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } let cps = self.subs.resolve_with(min_ts); - let min_region = cps.iter().min_by_key(|(_, rts)| rts); + let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts // safely. - let rts = min_region.map(|(_, rts)| *rts).unwrap_or(min_ts); - info!("getting checkpoint"; "defined_by_region" => ?min_region.map(|r| r.0.get_id()), "checkpoint" => %rts); + let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); + info!("getting checkpoint"; "defined_by_region" => ?min_region); self.subs.warn_if_gap_too_huge(rts); - callback(ResolvedRegions::new(rts, cps)); + callback(ResolvedRegions::new( + rts, + cps.into_iter().map(|r| (r.region, r.checkpoint)).collect(), + )); } } } @@ -583,7 +586,7 @@ where exists = true; let should_remove = old.handle().id == handle.id; if !should_remove { - warn!("stale retry command"; "region" => ?region, "handle" => ?handle, "old_handle" => ?old.handle()); + warn!("stale retry command"; utils::slog_region(®ion), "handle" => ?handle, "old_handle" => ?old.handle()); } should_remove }); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 50c3c6c1143..6b51f983a3b 100644 --- a/components/backup-stream/src/subscription_track.rs +++ 
b/components/backup-stream/src/subscription_track.rs @@ -57,6 +57,63 @@ impl RegionSubscription { } } +#[derive(PartialEq, Eq)] +pub enum CheckpointType { + MinTs, + StartTsOfInitialScan, + StartTsOfTxn(Option>), +} + +impl std::fmt::Debug for CheckpointType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MinTs => write!(f, "MinTs"), + Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), + Self::StartTsOfTxn(arg0) => f + .debug_tuple("StartTsOfTxn") + .field(&format_args!( + "{}", + utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) + )) + .finish(), + } + } +} + +pub struct ResolveResult { + pub region: Region, + pub checkpoint: TimeStamp, + pub checkpoint_type: CheckpointType, +} + +impl std::fmt::Debug for ResolveResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResolveResult") + .field("region", &self.region.get_id()) + .field("checkpoint", &self.checkpoint) + .field("checkpoint_type", &self.checkpoint_type) + .finish() + } +} + +impl ResolveResult { + fn resolve(sub: &mut RegionSubscription, min_ts: TimeStamp) -> Self { + let ts = sub.resolver.resolve(min_ts); + let ty = if ts == min_ts { + CheckpointType::MinTs + } else if sub.resolver.in_phase_one() { + CheckpointType::StartTsOfInitialScan + } else { + CheckpointType::StartTsOfTxn(sub.resolver.sample_far_lock()) + }; + Self { + region: sub.meta.clone(), + checkpoint: ts, + checkpoint_type: ty, + } + } +} + impl SubscriptionTracer { /// clear the current `SubscriptionTracer`. pub fn clear(&self) { @@ -91,11 +148,11 @@ impl SubscriptionTracer { /// try advance the resolved ts with the min ts of in-memory locks. /// returns the regions and theirs resolved ts. - pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec<(Region, TimeStamp)> { + pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec { self.0 .iter_mut() // Don't advance the checkpoint ts of removed region. 
- .map(|mut s| (s.meta.clone(), s.resolver.resolve(min_ts))) + .map(|mut s| ResolveResult::resolve(s.value_mut(), min_ts)) .collect() } @@ -140,7 +197,7 @@ impl SubscriptionTracer { false } None => { - warn!("trying to deregister region not registered"; "region_id" => %region_id); + debug!("trying to deregister region not registered"; "region_id" => %region_id); false } } @@ -156,7 +213,7 @@ impl SubscriptionTracer { let mut sub = match self.get_subscription_of(new_region.get_id()) { Some(sub) => sub, None => { - warn!("backup stream observer refreshing void subscription."; "new_region" => ?new_region); + warn!("backup stream observer refreshing void subscription."; utils::slog_region(new_region)); return true; } }; @@ -258,6 +315,12 @@ impl std::fmt::Debug for FutureLock { } impl TwoPhaseResolver { + /// try to get one of the key of the oldest lock in the resolver. + pub fn sample_far_lock(&self) -> Option> { + let (_, keys) = self.resolver.locks().first_key_value()?; + keys.iter().next().cloned() + } + pub fn in_phase_one(&self) -> bool { self.stable_ts.is_some() } @@ -348,6 +411,8 @@ impl std::fmt::Debug for TwoPhaseResolver { #[cfg(test)] mod test { + use std::sync::Arc; + use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; use txn_types::TimeStamp; @@ -433,15 +498,24 @@ mod test { subs.deregister_region_if(®ion(5, 8, 1), |_, _| true); drop(region4_sub); - let mut rs = subs.resolve_with(TimeStamp::new(1000)); + let mut rs = subs + .resolve_with(TimeStamp::new(1000)) + .into_iter() + .map(|r| (r.region, r.checkpoint, r.checkpoint_type)) + .collect::>(); rs.sort_by_key(|k| k.0.get_id()); + use crate::subscription_track::CheckpointType::*; assert_eq!( rs, vec![ - (region(1, 1, 1), TimeStamp::new(42)), - (region(2, 2, 1), TimeStamp::new(1000)), - (region(3, 4, 1), TimeStamp::new(1000)), - (region(4, 8, 1), TimeStamp::new(128)), + (region(1, 1, 1), 42.into(), StartTsOfInitialScan), + (region(2, 2, 1), 1000.into(), MinTs), + 
(region(3, 4, 1), 1000.into(), MinTs), + ( + region(4, 8, 1), + 128.into(), + StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + ), ] ); } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 6ecea21f2f5..1746882690f 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -3,6 +3,7 @@ use core::pin::Pin; use std::{ borrow::Borrow, + cell::RefCell, collections::{hash_map::RandomState, BTreeMap, HashMap}, ops::{Bound, RangeBounds}, path::Path, @@ -20,6 +21,7 @@ use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; use kvproto::{ brpb::CompressionType, + metapb::Region, raft_cmdpb::{CmdType, Request}, }; use raft::StateRole; @@ -743,6 +745,109 @@ impl CompressionWriter for ZstdCompressionWriter { } } +/// make a pair of key range to impl Debug which prints [start_key,$end_key). +pub fn debug_key_range<'ret, 'a: 'ret, 'b: 'ret>( + start: &'a [u8], + end: &'b [u8], +) -> impl std::fmt::Debug + 'ret { + DebugKeyRange::<'a, 'b>(start, end) +} + +struct DebugKeyRange<'start, 'end>(&'start [u8], &'end [u8]); + +impl<'start, 'end> std::fmt::Debug for DebugKeyRange<'start, 'end> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let end_key = if self.1.is_empty() { + Either::Left("inf") + } else { + Either::Right(redact(&self.1)) + }; + let end_key: &dyn std::fmt::Display = match &end_key { + Either::Left(x) => x, + Either::Right(y) => y, + }; + write!(f, "[{},{})", redact(&self.0), end_key) + } +} + +/// make a [`Region`](kvproto::metapb::Region) implements [`slog::KV`], which +/// prints its fields like `[r.id=xxx] [r.ver=xxx] ...` +pub fn slog_region(r: &Region) -> impl slog::KV + '_ { + SlogRegion(r) +} + +/// make a [`Region`](kvproto::metapb::Region) implements +/// [`Debug`](std::fmt::Debug), which prints its essential fields. 
+pub fn debug_region(r: &Region) -> impl std::fmt::Debug + '_ { + DebugRegion(r) +} + +struct DebugRegion<'a>(&'a Region); + +impl<'a> std::fmt::Debug for DebugRegion<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let r = self.0; + f.debug_struct("Region") + .field("id", &r.get_id()) + .field("ver", &r.get_region_epoch().get_version()) + .field("conf_ver", &r.get_region_epoch().get_conf_ver()) + .field( + "range", + &debug_key_range(r.get_start_key(), r.get_end_key()), + ) + .field( + "peers", + &debug_iter(r.get_peers().iter().map(|p| p.store_id)), + ) + .finish() + } +} + +struct SlogRegion<'a>(&'a Region); + +impl<'a> slog::KV for SlogRegion<'a> { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let r = self.0; + serializer.emit_u64("r.id", r.get_id())?; + serializer.emit_u64("r.ver", r.get_region_epoch().get_version())?; + serializer.emit_u64("r.conf_ver", r.get_region_epoch().get_conf_ver())?; + serializer.emit_arguments( + "r.range", + &format_args!("{:?}", debug_key_range(r.get_start_key(), r.get_end_key())), + )?; + serializer.emit_arguments( + "r.peers", + &format_args!("{:?}", debug_iter(r.get_peers().iter().map(|p| p.store_id))), + )?; + Ok(()) + } +} + +pub fn debug_iter(t: impl Iterator) -> impl std::fmt::Debug { + DebugIter(RefCell::new(t)) +} + +struct DebugIter>(RefCell); + +impl> std::fmt::Debug for DebugIter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut is_first = true; + while let Some(x) = self.0.borrow_mut().next() { + if !is_first { + write!(f, ",{:?}", x)?; + } else { + write!(f, "{:?}", x)?; + is_first = false; + } + } + Ok(()) + } +} + #[cfg(test)] mod test { use std::{ @@ -755,10 +860,35 @@ mod test { use engine_traits::WriteOptions; use futures::executor::block_on; + use kvproto::metapb::{Region, RegionEpoch}; use tokio::io::{AsyncWriteExt, BufReader}; use crate::utils::{is_in_range, 
CallbackWaitGroup, SegmentMap}; + #[test] + fn test_redact() { + log_wrappers::set_redact_info_log(true); + let mut region = Region::default(); + region.set_id(42); + region.set_start_key(b"TiDB".to_vec()); + region.set_end_key(b"TiDC".to_vec()); + region.set_region_epoch({ + let mut r = RegionEpoch::default(); + r.set_version(108); + r.set_conf_ver(352); + r + }); + + // Can we make a better way to test this? + assert_eq!( + "Region { id: 42, ver: 108, conf_ver: 352, range: [?,?), peers: }", + format!("{:?}", super::debug_region(®ion)) + ); + + let range = super::debug_key_range(b"alpha", b"omega"); + assert_eq!("[?,?)", format!("{:?}", range)); + } + #[test] fn test_range_functions() { #[derive(Debug)] From 0cc15e4e7211e406f183917e155d5fcf43e44c6a Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 22 Dec 2022 12:30:55 +0800 Subject: [PATCH 421/676] Raftstore-v2: txn_ext and bucket_meta of RegionSnapshot should be inited (#13911) ref tikv/tikv#12842 Signed-off-by: SpadeA-Tang --- .../raftstore-v2/src/operation/query/local.rs | 79 ++++++++++++------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 1878ead40c2..812cf2354fa 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -107,41 +107,52 @@ where req: &RaftCmdRequest, ) -> std::result::Result>, RaftCmdResponse> { match self.pre_propose_raft_command(req) { - Ok(Some((mut delegate, policy))) => match policy { - RequestPolicy::ReadLocal => { - let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); - // Ensures the snapshot is acquired before getting the time - atomic::fence(atomic::Ordering::Release); - let snapshot_ts = monotonic_raw_now(); - - if !delegate.is_in_leader_lease(snapshot_ts) { - return 
Ok(None); + Ok(Some((mut delegate, policy))) => { + let mut snap = match policy { + RequestPolicy::ReadLocal => { + let region = Arc::clone(&delegate.region); + let snap = + RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + let snapshot_ts = monotonic_raw_now(); + + if !delegate.is_in_leader_lease(snapshot_ts) { + return Ok(None); + } + + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); + + // Try renew lease in advance + self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); + snap } + RequestPolicy::StaleRead => { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + let region = Arc::clone(&delegate.region); + let snap = + RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); - // Try renew lease in advance - self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); - Ok(Some(snap)) - } - RequestPolicy::StaleRead => { - let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - delegate.check_stale_read_safe(read_ts)?; + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); - let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + delegate.check_stale_read_safe(read_ts)?; - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + snap + } + _ => unreachable!(), + }; - delegate.check_stale_read_safe(read_ts)?; + snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.bucket_meta = delegate.bucket_meta.clone(); - TLS_LOCAL_READ_METRICS - .with(|m| 
m.borrow_mut().local_executed_stale_read_requests.inc()); - Ok(Some(snap)) - } - _ => unreachable!(), - }, + Ok(Some(snap)) + } Ok(None) => Ok(None), Err(e) => { let mut response = cmd_resp::new_error(e); @@ -458,6 +469,7 @@ mod tests { use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; + use pd_client::BucketMeta; use raftstore::store::{ util::Lease, ReadCallback, ReadProgress, RegionReadProgress, TrackVer, TxnExt, TLS_LOCAL_READ_METRICS, @@ -628,6 +640,8 @@ mod tests { // Register region 1 lease.renew(monotonic_raw_now()); let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let txn_ext = Arc::new(TxnExt::default()); + let bucket_meta = Arc::new(BucketMeta::default()); { let mut meta = store_meta.as_ref().lock().unwrap(); @@ -641,11 +655,11 @@ mod tests { leader_lease: Some(remote), last_valid_ts: Timespec::new(0, 0), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), - txn_ext: Arc::new(TxnExt::default()), + txn_ext: txn_ext.clone(), read_progress: read_progress.clone(), pending_remove: false, track_ver: TrackVer::new(), - bucket_meta: None, + bucket_meta: Some(bucket_meta.clone()), }; meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data @@ -675,6 +689,11 @@ mod tests { // the applied term by the above thread, the snapshot will be acquired by // retrying. 
let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); + assert!(Arc::ptr_eq( + snap.bucket_meta.as_ref().unwrap(), + &bucket_meta + )); assert_eq!(*snap.get_region(), region1); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), From 36570b8e2fea93e46b1b4028b5ac7cbda72fbe67 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 22 Dec 2022 16:26:56 +0800 Subject: [PATCH 422/676] raftstore-v2: make coprocessor work (#13978) ref tikv/tikv#12842 Coprocessor is necessary for a working daemon. This PR adjusts coprocessor to make it work with raftstore v2. And split check is also added for auto splitting. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 - components/raftstore-v2/Cargo.toml | 1 - components/raftstore-v2/src/batch/store.rs | 128 +++++++------ components/raftstore-v2/src/fsm/mod.rs | 2 +- components/raftstore-v2/src/fsm/peer.rs | 2 +- components/raftstore-v2/src/fsm/store.rs | 16 +- components/raftstore-v2/src/lib.rs | 2 +- .../operation/command/admin/conf_change.rs | 12 +- .../src/operation/command/admin/mod.rs | 9 +- .../src/operation/command/admin/split.rs | 181 +++++++----------- .../command/admin/transfer_leader.rs | 7 +- .../raftstore-v2/src/operation/command/mod.rs | 12 +- .../src/operation/command/write/mod.rs | 2 + components/raftstore-v2/src/operation/life.rs | 19 +- components/raftstore-v2/src/operation/mod.rs | 2 +- components/raftstore-v2/src/operation/pd.rs | 12 +- .../src/operation/ready/async_writer.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 17 +- .../src/operation/ready/snapshot.rs | 19 +- components/raftstore-v2/src/raft/apply.rs | 7 +- components/raftstore-v2/src/raft/peer.rs | 67 ++++--- components/raftstore-v2/src/router/imp.rs | 73 ++++++- .../src/router/internal_message.rs | 3 + components/raftstore-v2/src/router/message.rs | 2 +- .../tests/integrations/cluster.rs | 27 +-- 
.../tests/integrations/test_split.rs | 29 ++- .../raftstore/src/coprocessor/dispatcher.rs | 127 +++++++++++- components/raftstore/src/coprocessor/mod.rs | 2 +- .../src/coprocessor/split_check/half.rs | 18 +- .../src/coprocessor/split_check/keys.rs | 57 ++---- .../src/coprocessor/split_check/size.rs | 127 ++++-------- .../src/coprocessor/split_check/table.rs | 14 +- components/raftstore/src/lib.rs | 1 + components/raftstore/src/router.rs | 110 ++++++++++- components/raftstore/src/store/fsm/store.rs | 13 +- .../src/store/worker/consistency_check.rs | 48 ++--- .../raftstore/src/store/worker/split_check.rs | 164 ++++++++-------- src/server/lock_manager/deadlock.rs | 15 +- src/server/lock_manager/mod.rs | 5 - src/server/raftkv2/node.rs | 19 +- 40 files changed, 783 insertions(+), 591 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb371b739af..67ca50ba1ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4359,7 +4359,6 @@ dependencies = [ "file_system", "fs2", "futures 0.3.15", - "itertools", "keys", "kvproto", "log_wrappers", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index d9b1d65aebc..4d3d44ec6fd 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -41,7 +41,6 @@ fail = "0.5" file_system = { workspace = true } fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } -itertools = "0.10" keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 8a1f60f3717..642f6e745f0 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -22,10 +22,10 @@ use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; use pd_client::PdClient; use raft::{StateRole, INVALID_ID}; use raftstore::{ - coprocessor::RegionChangeEvent, + coprocessor::{CoprocessorHost, RegionChangeEvent}, store::{ 
fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, - StoreWriters, TabletSnapManager, Transport, WriteSenders, + SplitCheckRunner, SplitCheckTask, StoreWriters, TabletSnapManager, Transport, WriteSenders, }, }; use slog::Logger; @@ -42,10 +42,7 @@ use tikv_util::{ use time::Timespec; use crate::{ - fsm::{ - LockManagerNotifier, PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, - StoreMeta, - }, + fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::pd, @@ -56,6 +53,7 @@ use crate::{ pub struct StoreContext { /// A logger without any KV. It's clean for creating new PeerFSM. pub logger: Logger, + pub coprocessor_host: CoprocessorHost, /// The transport for sending messages to peers on other stores. pub trans: T, pub current_time: Option, @@ -69,21 +67,17 @@ pub struct StoreContext { pub tick_batch: Vec, /// The precise timer for scheduling tick. pub timer: SteadyTimer, - pub write_senders: WriteSenders, + pub schedulers: Schedulers, /// store meta pub store_meta: Arc>, pub engine: ER, pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, - pub read_scheduler: Scheduler>, /// Disk usage for the store itself. pub self_disk_usage: DiskUsage, pub snap_mgr: TabletSnapManager, - pub pd_scheduler: Scheduler, - - pub lock_manager_notifier: Arc, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. 
@@ -225,19 +219,17 @@ impl PollHandler { cfg: Arc>, + coprocessor_host: CoprocessorHost, store_id: u64, engine: ER, tablet_registry: TabletRegistry, trans: T, router: StoreRouter, - read_scheduler: Scheduler>, - pd_scheduler: Scheduler, - write_senders: WriteSenders, + schedulers: Schedulers, apply_pool: FuturePool, logger: Logger, store_meta: Arc>, snap_mgr: TabletSnapManager, - lock_manager_notifier: Arc, } impl StorePollerBuilder { @@ -248,13 +240,11 @@ impl StorePollerBuilder { tablet_registry: TabletRegistry, trans: T, router: StoreRouter, - read_scheduler: Scheduler>, - pd_scheduler: Scheduler, - store_writers: &mut StoreWriters, + schedulers: Schedulers, logger: Logger, store_meta: Arc>, snap_mgr: TabletSnapManager, - lock_manager_notifier: Arc, + coprocessor_host: CoprocessorHost, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -273,14 +263,12 @@ impl StorePollerBuilder { tablet_registry, trans, router, - read_scheduler, - pd_scheduler, apply_pool, logger, - write_senders: store_writers.senders(), + schedulers, store_meta, snap_mgr, - lock_manager_notifier, + coprocessor_host, } } @@ -296,17 +284,20 @@ impl StorePollerBuilder { region_id, self.store_id, self.engine.clone(), - self.read_scheduler.clone(), + self.schedulers.read.clone(), &self.logger, )? 
{ Some(p) => p, None => return Ok(()), }; - self.lock_manager_notifier.on_region_changed( - storage.region_state().get_region(), - RegionChangeEvent::Create, - StateRole::Follower, - ); + + if storage.is_initialized() { + self.coprocessor_host.on_region_changed( + storage.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + } let (sender, peer_fsm) = PeerFsm::new(&cfg, &self.tablet_registry, storage)?; meta.region_read_progress @@ -353,37 +344,49 @@ where router: self.router.clone(), tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], timer: SteadyTimer::default(), - write_senders: self.write_senders.clone(), + schedulers: self.schedulers.clone(), store_meta: self.store_meta.clone(), engine: self.engine.clone(), tablet_registry: self.tablet_registry.clone(), apply_pool: self.apply_pool.clone(), - read_scheduler: self.read_scheduler.clone(), self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), - pd_scheduler: self.pd_scheduler.clone(), - lock_manager_notifier: self.lock_manager_notifier.clone(), + coprocessor_host: self.coprocessor_host.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) } } +#[derive(Clone)] +pub struct Schedulers { + pub read: Scheduler>, + pub pd: Scheduler, + pub write: WriteSenders, + + // Following is not maintained by raftstore itself. + pub split_check: Scheduler, +} + /// A set of background threads that will processing offloaded work from /// raftstore. struct Workers { /// Worker for fetching raft logs asynchronously - async_read_worker: Worker, - pd_worker: Worker, - store_writers: StoreWriters, + async_read: Worker, + pd: Worker, + async_write: StoreWriters, + + // Following is not maintained by raftstore itself. 
+ background: Worker, } -impl Default for Workers { - fn default() -> Self { +impl Workers { + fn new(background: Worker) -> Self { Self { - async_read_worker: Worker::new("async-read-worker"), - pd_worker: Worker::new("pd-worker"), - store_writers: StoreWriters::default(), + async_read: Worker::new("async-read-worker"), + pd: Worker::new("pd-worker"), + async_write: StoreWriters::default(), + background, } } } @@ -410,7 +413,8 @@ impl StoreSystem { snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 - lock_manager_notifier: Arc, + coprocessor_host: CoprocessorHost, + background: Worker, ) -> Result<()> where T: Transport + 'static, @@ -424,18 +428,16 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let mut workers = Workers::default(); + let mut workers = Workers::new(background); workers - .store_writers + .async_write .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); read_runner.set_snap_mgr(snap_mgr.clone()); - let read_scheduler = workers - .async_read_worker - .start("async-read-worker", read_runner); + let read_scheduler = workers.async_read.start("async-read-worker", read_runner); - let pd_scheduler = workers.pd_worker.start( + let pd_scheduler = workers.pd.start( "pd-worker", pd::Runner::new( store_id, @@ -443,7 +445,7 @@ impl StoreSystem { raft_engine.clone(), tablet_registry.clone(), router.clone(), - workers.pd_worker.remote(), + workers.pd.remote(), concurrency_manager, causal_ts_provider, self.logger.clone(), @@ -451,6 +453,22 @@ impl StoreSystem { ), ); + let split_check_scheduler = workers.background.start( + "split-check", + SplitCheckRunner::with_registry( + tablet_registry.clone(), + router.clone(), + coprocessor_host.clone(), + ), + ); + + let schedulers = Schedulers { + read: read_scheduler, + pd: pd_scheduler, + write: 
workers.async_write.senders(), + split_check: split_check_scheduler, + }; + let builder = StorePollerBuilder::new( cfg.clone(), store_id, @@ -458,13 +476,11 @@ impl StoreSystem { tablet_registry, trans, router.clone(), - read_scheduler, - pd_scheduler, - &mut workers.store_writers, + schedulers, self.logger.clone(), store_meta.clone(), snap_mgr, - lock_manager_notifier, + coprocessor_host, ); self.workers = Some(workers); let peers = builder.init()?; @@ -510,9 +526,9 @@ impl StoreSystem { self.system.shutdown(); - workers.store_writers.shutdown(); - workers.async_read_worker.stop(); - workers.pd_worker.stop(); + workers.async_write.shutdown(); + workers.async_read.stop(); + workers.pd.stop(); } } diff --git a/components/raftstore-v2/src/fsm/mod.rs b/components/raftstore-v2/src/fsm/mod.rs index 442c6b050ce..b3d0e0483ba 100644 --- a/components/raftstore-v2/src/fsm/mod.rs +++ b/components/raftstore-v2/src/fsm/mod.rs @@ -11,4 +11,4 @@ mod store; pub use apply::{ApplyFsm, ApplyResReporter, ApplyScheduler}; pub use peer::{PeerFsm, PeerFsmDelegate, SenderFsmPair}; -pub use store::{LockManagerNotifier, Store, StoreFsm, StoreFsmDelegate, StoreMeta}; +pub use store::{Store, StoreFsm, StoreFsmDelegate, StoreMeta}; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index f5425295347..1ef9e198130 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -206,7 +206,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::Raft => self.on_raft_tick(), PeerTick::PdHeartbeat => self.on_pd_heartbeat(), PeerTick::RaftLogGc => unimplemented!(), - PeerTick::SplitRegionCheck => unimplemented!(), + PeerTick::SplitRegionCheck => self.on_split_region_check(), PeerTick::CheckMerge => unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => unimplemented!(), diff --git a/components/raftstore-v2/src/fsm/store.rs 
b/components/raftstore-v2/src/fsm/store.rs index 6e2dfe4a75f..bd31de69496 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -6,12 +6,7 @@ use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::metapb::Region; -use raft::StateRole; -use raftstore::{ - coprocessor::{RegionChangeEvent, RoleChange}, - store::{Config, ReadDelegate, RegionReadProgressRegistry}, -}; +use raftstore::store::{Config, ReadDelegate, RegionReadProgressRegistry}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -170,12 +165,3 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { } } } - -// A simplified version of CoprocessorHost used to convey information to -// LockManager only. -// It is replaced by CoprocessorHost in the future. -pub trait LockManagerNotifier: Send + Sync { - fn on_role_change(&self, region: &Region, role_change: RoleChange); - - fn on_region_changed(&self, region: &Region, event: RegionChangeEvent, role: StateRole); -} diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index cb769b6594a..d8327549da6 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -38,6 +38,6 @@ mod worker; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; -pub use fsm::{LockManagerNotifier, StoreMeta}; +pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index ec0b78e717a..5a6c91d3567 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -147,13 +147,6 @@ impl Peer { let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; self.storage_mut() .set_region_state(conf_change.region_state); - - ctx.lock_manager_notifier.on_region_changed( - self.region(), - RegionChangeEvent::Update(RegionChangeReason::ChangePeer), - self.get_role(), - ); - if self.is_leader() { info!( self.logger, @@ -190,6 +183,11 @@ impl Peer { self.set_has_ready(); } } + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Update(RegionChangeReason::ChangePeer), + self.raft_group().raft.state, + ); if remove_self { self.mark_for_destroy(None); } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 9b7dce8570f..0b3d588abf7 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -9,7 +9,7 @@ use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; -pub use split::{RequestSplit, SplitInit, SplitResult, SPLIT_PREFIX}; +pub use split::{RequestSplit, SplitFlowControl, SplitInit, SplitResult, SPLIT_PREFIX}; use tikv_util::box_err; use txn_types::WriteBatchFlags; @@ -30,7 +30,7 @@ impl Peer { pub fn on_admin_command( &mut self, ctx: &mut StoreContext, - req: RaftCmdRequest, + mut req: RaftCmdRequest, ch: CmdResChannel, ) { if !self.serving() { @@ -43,6 +43,11 @@ impl Peer { ch.report_error(resp); return; } + if let Err(e) = ctx.coprocessor_host.pre_propose(self.region(), &mut req) { + let resp = cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } let cmd_type = req.get_admin_request().get_cmd_type(); if let Err(e) = self.validate_command(req.get_header(), Some(cmd_type), &mut ctx.raft_metrics) diff --git 
a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 870c203f07d..64388333fee 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,44 +25,40 @@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::cmp; +use std::{borrow::Cow, cmp}; use collections::HashSet; use crossbeam::channel::SendError; use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext}; use fail::fail_point; -use itertools::Itertools; use kvproto::{ metapb::{self, Region, RegionEpoch}, + pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, raft_serverpb::RaftSnapshotData, }; use protobuf::Message; use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ - coprocessor::{ - split_observer::{is_valid_split_key, strip_timestamp_if_exists}, - RegionChangeReason, - }, + coprocessor::RegionChangeReason, store::{ cmd_resp, - fsm::apply::validate_batch_split, + fsm::{apply::validate_batch_split, ApplyMetrics}, metrics::PEER_ADMIN_CMD_COUNTER, snap::TABLET_SNAPSHOT_VERSION, util::{self, KeysInfoFormatter}, - PeerPessimisticLocks, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + PeerPessimisticLocks, SplitCheckTask, Transport, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, Result, }; -use slog::{error, info, warn, Logger}; -use tikv_util::box_err; +use slog::info; use crate::{ batch::StoreContext, - fsm::ApplyResReporter, + fsm::{ApplyResReporter, PeerFsmDelegate}, operation::AdminCmdResult, raft::{Apply, Peer}, - router::{CmdResChannel, PeerMsg, StoreMsg}, + router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, Error, }; @@ -107,67 +103,68 @@ impl SplitInit { } } -// validate split request and strip ts from split keys if needed -fn pre_propose_split(logger: &Logger, req: &mut AdminRequest, region: &Region) -> Result<()> { - if 
!req.has_splits() { - return Err(box_err!( - "cmd_type is BatchSplit but it doesn't have splits request, message maybe \ - corrupted!" - .to_owned() - )); - } - - let mut requests: Vec = req.mut_splits().take_requests().into(); - let ajusted_splits = std::mem::take(&mut requests) - .into_iter() - .enumerate() - .filter_map(|(i, mut split)| { - let key = split.take_split_key(); - let key = strip_timestamp_if_exists(key); - if is_valid_split_key(&key, i, region) { - split.split_key = key; - Some(split) - } else { - None - } - }) - .coalesce(|prev, curr| { - // Make sure that the split keys are sorted and unique. - if prev.split_key < curr.split_key { - Err((prev, curr)) - } else { - warn!( - logger, - "skip invalid split key: key should not be larger than the previous."; - "key" => log_wrappers::Value::key(&curr.split_key), - "previous" => log_wrappers::Value::key(&prev.split_key), - ); - Ok(prev) - } - }) - .collect::>(); - - if ajusted_splits.is_empty() { - error!( - logger, - "failed to handle split req, no valid key found for split"; - ); - Err(box_err!("no valid key found for split.".to_owned())) - } else { - // Rewrite the splits. - req.mut_splits().set_requests(ajusted_splits.into()); - Ok(()) - } -} - #[derive(Debug)] pub struct RequestSplit { pub epoch: RegionEpoch, pub split_keys: Vec>, - pub source: Box, + pub source: Cow<'static, str>, +} + +#[derive(Default, Debug)] +pub struct SplitFlowControl { + size_diff_hint: i64, + skip_split_count: u64, + may_skip_split_check: bool, +} + +impl PeerFsmDelegate<'_, EK, ER, T> { + pub fn on_split_region_check(&mut self) { + if !self.fsm.peer_mut().on_split_region_check(self.store_ctx) { + self.schedule_tick(PeerTick::SplitRegionCheck) + } + } } impl Peer { + /// Handle split check. + /// + /// Returns true means the check tick is consumed, no need to schedule + /// another tick. 
+ pub fn on_split_region_check(&mut self, ctx: &mut StoreContext) -> bool { + if !self.is_leader() { + return true; + } + let is_generating_snapshot = self.storage().is_generating_snapshot(); + let control = self.split_flow_control_mut(); + if control.may_skip_split_check + && control.size_diff_hint < ctx.cfg.region_split_check_diff().0 as i64 + { + return true; + } + if ctx.schedulers.split_check.is_busy() { + return false; + } + if is_generating_snapshot && control.skip_split_count < 3 { + control.skip_split_count += 1; + return false; + } + let task = + SplitCheckTask::split_check(self.region().clone(), true, CheckPolicy::Scan, None); + if let Err(e) = ctx.schedulers.split_check.schedule(task) { + info!(self.logger, "failed to schedule split check"; "err" => ?e); + } + let control = self.split_flow_control_mut(); + control.may_skip_split_check = true; + control.size_diff_hint = 0; + control.skip_split_count = 0; + false + } + + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { + let control = self.split_flow_control_mut(); + control.size_diff_hint += metrics.size_diff_hint; + } + pub fn on_request_split( &mut self, ctx: &mut StoreContext, @@ -178,7 +175,7 @@ impl Peer { self.logger, "on split"; "split_keys" => %KeysInfoFormatter(rs.split_keys.iter()), - "source" => &rs.source, + "source" => %&rs.source, ); if !self.is_leader() { // region on this store is no longer leader, skipped. 
@@ -196,7 +193,7 @@ impl Peer { &rs.epoch, &rs.split_keys, ) { - info!(self.logger, "invalid split request"; "err" => ?e, "source" => &rs.source); + info!(self.logger, "invalid split request"; "err" => ?e, "source" => %&rs.source); ch.set_result(cmd_resp::new_error(e)); return; } @@ -206,9 +203,9 @@ impl Peer { pub fn propose_split( &mut self, store_ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ) -> Result { - pre_propose_split(&self.logger, req.mut_admin_request(), self.region())?; + validate_batch_split(req.get_admin_request(), self.region())?; // We rely on ConflictChecker to detect conflicts, so no need to set proposal // context. let data = req.write_to_bytes().unwrap(); @@ -419,7 +416,7 @@ impl Peer { let mut meta = store_ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( - &store_ctx.lock_manager_notifier, + &store_ctx.coprocessor_host, reader, derived.clone(), RegionChangeReason::Split, @@ -442,6 +439,7 @@ impl Peer { // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. self.report_batch_split_pd(store_ctx, regions.to_vec()); + self.add_pending_tick(PeerTick::SplitRegionCheck); } let last_region_id = regions.last().unwrap().get_id(); @@ -552,7 +550,7 @@ impl Peer { let region_id = self.region_id(); if split_init.check_split { - // TODO: check if the last region needs to split again + self.add_pending_tick(PeerTick::SplitRegionCheck); } let _ = store_ctx .router @@ -581,7 +579,6 @@ impl Peer { if off > 0 { // There should be very few elements in the vector. split_trace.drain(..off); - // TODO: save admin_flushed. 
assert_ne!(admin_flushed, 0); self.storage_mut() .apply_trace_mut() @@ -618,7 +615,6 @@ mod test { store::{new_learner_peer, new_peer}, worker::dummy_scheduler, }; - use txn_types::Key; use super::*; use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; @@ -722,43 +718,6 @@ mod test { } } - #[test] - fn test_propose() { - let logger = slog_global::borrow_global().new(o!()); - - let mut region = Region::default(); - region.set_end_key(b"k10".to_vec()); - - let mut req = AdminRequest::default(); - let err = pre_propose_split(&logger, &mut req, ®ion).unwrap_err(); - assert!( - err.to_string() - .contains("cmd_type is BatchSplit but it doesn't have splits") - ); - - let mut splits = BatchSplitRequest::default(); - req.set_splits(splits.clone()); - let err = pre_propose_split(&logger, &mut req, ®ion).unwrap_err(); - assert!(err.to_string().contains("no valid key found")); - - splits.mut_requests().push(new_split_req(b"", 0, vec![])); - splits.mut_requests().push(new_split_req(b"k03", 0, vec![])); - splits.mut_requests().push(new_split_req(b"k02", 0, vec![])); - splits.mut_requests().push(new_split_req(b"k11", 0, vec![])); - let split_key = Key::from_raw(b"k06"); - let split_key_with_ts = split_key.clone().append_ts(10.into()); - splits - .mut_requests() - .push(new_split_req(split_key_with_ts.as_encoded(), 0, vec![])); - - req.set_splits(splits); - pre_propose_split(&logger, &mut req, ®ion).unwrap(); - let split_reqs = req.get_splits().get_requests(); - assert_eq!(split_reqs.len(), 2); - assert_eq!(split_reqs[0].get_split_key(), b"k03"); - assert_eq!(split_reqs[1].get_split_key(), split_key.as_encoded()); - } - #[test] fn test_split() { let store_id = 2; diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 1c25b363d59..12bd7bbf491 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -35,7 +35,7 @@ use crate::{ router::{CmdResChannel, PeerMsg, PeerTick}, }; -fn get_transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { +fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { if !msg.has_admin_request() { return None; } @@ -79,7 +79,7 @@ impl Peer { ) -> bool { ctx.raft_metrics.propose.transfer_leader.inc(); - let transfer_leader = get_transfer_leader_cmd(&req).unwrap(); + let transfer_leader = transfer_leader_cmd(&req).unwrap(); let prs = self.raft_group().raft.prs(); // Find the target with the largest matched index among the candidate @@ -108,7 +108,7 @@ impl Peer { _ => peers.choose(&mut rand::thread_rng()).unwrap(), }; - let transferee = if peer.id == self.peer().id { + let transferee = if peer.id == self.peer_id() { false } else { self.pre_transfer_leader(peer) @@ -212,6 +212,7 @@ impl Peer { "peer" => ?from, ); self.raft_group_mut().transfer_leader(from.get_id()); + self.refresh_leader_transferee(); } } } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 116edec91c3..fce01f19277 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,6 +16,8 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. 
+use std::mem; + use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, @@ -49,7 +51,9 @@ mod admin; mod control; mod write; -pub use admin::{AdminCmdResult, RequestSplit, SplitInit, SplitResult, SPLIT_PREFIX}; +pub use admin::{ + AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SplitResult, SPLIT_PREFIX, +}; pub use control::ProposalControl; pub use write::{ SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, @@ -277,6 +281,9 @@ impl Peer { } } + self.update_split_flow_control(&apply_res.metrics); + self.update_stat(&apply_res.metrics); + self.raft_group_mut() .advance_apply_to(apply_res.applied_index); self.proposal_control_advance_apply(apply_res.applied_index); @@ -483,6 +490,8 @@ impl Apply { if let Err(e) = wb.write_opt(&write_opt) { panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); } + self.metrics.written_bytes += wb.data_size() as u64; + self.metrics.written_keys += wb.count() as u64; if wb.data_size() <= APPLY_WB_SHRINK_SIZE { wb.clear(); } else { @@ -502,6 +511,7 @@ impl Apply { apply_res.applied_term = term; apply_res.admin_result = self.take_admin_result().into_boxed_slice(); apply_res.modifications = *self.modifications_mut(); + apply_res.metrics = mem::take(&mut self.metrics); self.res_reporter().report(apply_res); } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 6ea6064a002..51beeee7dea 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -139,6 +139,7 @@ impl Apply { fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( "aborted by failpoint".into() ))); + self.metrics.size_diff_hint += (self.key_buffer.len() + value.len()) as i64; self.modifications_mut()[off] = index; Ok(()) } @@ -169,6 
+170,7 @@ impl Apply { e ); }); + self.metrics.size_diff_hint -= self.key_buffer.len() as i64; self.modifications_mut()[off] = index; Ok(()) } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 73db4e760d1..59e0e532faa 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -19,10 +19,7 @@ use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; -use raftstore::{ - coprocessor::RegionChangeEvent, - store::{util, WriteTask}, -}; +use raftstore::store::{util, WriteTask}; use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; @@ -229,7 +226,7 @@ impl Store { self.store_id(), region, ctx.engine.clone(), - ctx.read_scheduler.clone(), + ctx.schedulers.read.clone(), &ctx.logger, ) .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, s)) @@ -294,11 +291,7 @@ impl Peer { /// /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. - pub fn start_destroy( - &mut self, - ctx: &mut StoreContext, - write_task: &mut WriteTask, - ) { + pub fn start_destroy(&mut self, write_task: &mut WriteTask) { let entry_storage = self.storage().entry_storage(); if self.postponed_destroy() { return; @@ -326,12 +319,6 @@ impl Peer { lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); self.destroy_progress_mut().start(); - - ctx.lock_manager_notifier.on_region_changed( - self.region(), - RegionChangeEvent::Destroy, - self.get_role(), - ); } /// Do clean up for destroy. 
The peer is permanently destroyed when diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 80443f0ef60..f0a2624203a 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -8,7 +8,7 @@ mod ready; pub use command::{ AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, - SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, + SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 1c62c092878..894f39f278b 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -53,7 +53,7 @@ impl Store { // stats.set_query_stats(query_stats); let task = pd::Task::StoreHeartbeat { stats }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!(self.logger(), "notify pd failed"; "store_id" => self.store_id(), "err" => ?e @@ -89,7 +89,7 @@ impl Peer { approximate_keys: None, wait_data_peers: Vec::new(), }); - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd"; @@ -159,7 +159,7 @@ impl Peer { let task = pd::Task::DestroyPeer { region_id: self.region_id(), }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with DestroyPeer"; @@ -182,7 +182,7 @@ impl Peer { right_derive: ctx.cfg.right_derive_when_split, ch, }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with AskBatchSplit"; @@ -198,7 +198,7 @@ impl Peer { regions: Vec, ) { let task = 
pd::Task::ReportBatchSplit { regions }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with ReportBatchSplit"; @@ -214,7 +214,7 @@ impl Peer { initial_status, txn_ext: self.txn_ext().clone(), }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with UpdateMaxTimestamp"; diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index e89854f39f4..a2707b6d411 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -201,7 +201,7 @@ where ER: RaftEngine, { fn write_senders(&self) -> &WriteSenders { - &self.write_senders + &self.schedulers.write } fn config(&self) -> &Config { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 9463aae3d73..fcab8728916 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -30,7 +30,7 @@ use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ - coprocessor::RoleChange, + coprocessor::{RegionChangeEvent, RoleChange}, store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}, }; use slog::{debug, error, trace, warn}; @@ -71,8 +71,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, if self.fsm.peer_mut().tick() { self.fsm.peer_mut().set_has_ready(); } - self.fsm.peer_mut().refresh_lead_transferee(); - self.schedule_tick(PeerTick::Raft); } } @@ -388,7 +386,12 @@ impl Peer { .collect(); } if !self.serving() { - self.start_destroy(ctx, &mut write_task); + self.start_destroy(&mut 
write_task); + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); } // Ready number should increase monotonically. assert!(self.async_writer.known_largest_number() < ready.number()); @@ -522,18 +525,18 @@ impl Peer { } _ => {} } - ctx.lock_manager_notifier.on_role_change( + let target = self.refresh_leader_transferee(); + ctx.coprocessor_host.on_role_change( self.region(), RoleChange { state: ss.raft_state, leader_id: ss.leader_id, - prev_lead_transferee: self.lead_transferee(), + prev_lead_transferee: target, vote: self.raft_group().raft.vote, }, ); self.proposal_control_mut().maybe_update_term(term); } - self.refresh_lead_transferee(); } /// If leader commits new admin commands, it may break lease assumption. So diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 6c027517454..8598d1cc41d 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -30,7 +30,7 @@ use std::{ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, CF_RAFT}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; -use raft::eraftpb::Snapshot; +use raft::{eraftpb::Snapshot, StateRole}; use raftstore::{ coprocessor::RegionChangeEvent, store::{ @@ -143,6 +143,11 @@ impl Peer { } pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); let persisted_index = self.persisted_index(); let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { @@ -153,14 +158,7 @@ impl Peer { // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.schedule_apply_fsm(ctx); - ctx.lock_manager_notifier.on_region_changed( - self.region(), - RegionChangeEvent::Create, - self.get_role(), - ); - self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); { @@ -232,6 +230,11 @@ impl Apply { } impl Storage { + pub fn is_generating_snapshot(&self) -> bool { + let snap_state = self.snap_state_mut(); + matches!(*snap_state, SnapState::Generating { .. }) + } + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// unavailable snapshot. pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 56379f2a15f..5539de3d617 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -4,7 +4,10 @@ use std::{mem, sync::Arc}; use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; -use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; +use raftstore::store::{ + fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, + ReadTask, +}; use slog::Logger; use tikv_util::worker::Scheduler; @@ -46,6 +49,7 @@ pub struct Apply { res_reporter: R, read_scheduler: Scheduler>, + pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, } @@ -81,6 +85,7 @@ impl Apply { res_reporter, flush_state, log_recovery, + metrics: ApplyMetrics::default(), logger, } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 0e38f0dd5a1..25285f289a7 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -15,8 +15,9 @@ use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::Regio 
use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ - coprocessor::{RegionChangeEvent, RegionChangeReason}, + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ + fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, TxnExt, WriteTask, @@ -27,8 +28,10 @@ use slog::Logger; use super::storage::Storage; use crate::{ batch::StoreContext, - fsm::{ApplyScheduler, LockManagerNotifier}, - operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder}, + fsm::ApplyScheduler, + operation::{ + AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, }; @@ -70,9 +73,6 @@ pub struct Peer { read_progress: Arc, leader_lease: Lease, - /// lead_transferee if this peer(leader) is in a leadership transferring. - lead_transferee: u64, - /// region buckets. region_buckets: Option, last_region_buckets: Option, @@ -88,6 +88,7 @@ pub struct Peer { // Trace which peers have not finished split. split_trace: Vec<(u64, HashSet)>, + split_flow_control: SplitFlowControl, /// Apply related State changes that needs to be persisted to raft engine. /// @@ -95,6 +96,9 @@ pub struct Peer { /// advancing apply index. state_changes: Option>, flush_state: Arc, + + /// lead_transferee if this peer(leader) is in a leadership transferring. + leader_transferee: u64, } impl Peer { @@ -161,11 +165,12 @@ impl Peer { txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), - lead_transferee: raft::INVALID_ID, pending_ticks: Vec::new(), split_trace: vec![], state_changes: None, flush_state, + split_flow_control: SplitFlowControl::default(), + leader_transferee: raft::INVALID_ID, }; // If this region has only one peer and I am the one, campaign directly. 
@@ -199,7 +204,7 @@ impl Peer { /// has been preserved in a durable device. pub fn set_region( &mut self, - lock_manager_observer: &Arc, + host: &CoprocessorHost, reader: &mut ReadDelegate, region: metapb::Region, reason: RegionChangeReason, @@ -248,10 +253,10 @@ impl Peer { } if self.serving() { - lock_manager_observer.on_region_changed( + host.on_region_changed( self.region(), RegionChangeEvent::Update(reason), - self.get_role(), + self.state_role(), ); } } @@ -351,6 +356,12 @@ impl Peer { &self.self_stat } + #[inline] + pub fn update_stat(&mut self, metrics: &ApplyMetrics) { + self.self_stat.written_bytes += metrics.written_bytes; + self.self_stat.written_keys += metrics.written_keys; + } + /// Mark the peer has a ready so it will be checked at the end of every /// processing round. #[inline] @@ -409,11 +420,6 @@ impl Peer { .cloned() } - #[inline] - pub fn get_role(&self) -> StateRole { - self.raft_group.raft.state - } - #[inline] pub fn update_peer_statistics(&mut self) { if !self.is_leader() { @@ -465,6 +471,11 @@ impl Peer { down_peers } + #[inline] + pub fn state_role(&self) -> StateRole { + self.raft_group.raft.state + } + #[inline] pub fn is_leader(&self) -> bool { self.raft_group.raft.state == StateRole::Leader @@ -660,16 +671,6 @@ impl Peer { .advance_apply(apply_index, term, region); } - #[inline] - pub fn lead_transferee(&self) -> u64 { - self.lead_transferee - } - - #[inline] - pub fn refresh_lead_transferee(&mut self) { - self.lead_transferee = self.raft_group.raft.lead_transferee.unwrap_or_default(); - } - // TODO: find a better place to put all txn related stuff. 
pub fn require_updating_max_ts(&self, ctx: &StoreContext) { let epoch = self.region().get_region_epoch(); @@ -713,4 +714,20 @@ impl Peer { task.extra_write .merge_v2(Box::into_inner(self.state_changes.take().unwrap())); } + + #[inline] + pub fn split_flow_control_mut(&mut self) -> &mut SplitFlowControl { + &mut self.split_flow_control + } + + #[inline] + pub fn refresh_leader_transferee(&mut self) -> u64 { + mem::replace( + &mut self.leader_transferee, + self.raft_group + .raft + .lead_transferee + .unwrap_or(raft::INVALID_ID), + ) + } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 3f10e08dee2..a03459c96d2 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -1,6 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{Arc, Mutex}; +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; @@ -10,9 +13,14 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; +use slog::warn; -use super::PeerMsg; -use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; +use super::{CmdResChannel, PeerMsg}; +use crate::{ + batch::StoreRouter, + operation::{LocalReader, RequestSplit}, + StoreMeta, +}; impl AsyncReadNotifier for StoreRouter { fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { @@ -24,6 +32,65 @@ impl AsyncReadNotifier for StoreRouter { } } +impl raftstore::coprocessor::StoreHandle for StoreRouter { + fn update_approximate_size(&self, _region_id: u64, _size: u64) { + // TODO + } + + fn update_approximate_keys(&self, _region_id: u64, _keys: u64) { + // TODO + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let (ch, _) = 
CmdResChannel::pair(); + let res = self.send( + region_id, + PeerMsg::RequestSplit { + request: RequestSplit { + epoch: region_epoch, + split_keys, + source, + }, + ch, + }, + ); + if let Err(e) = res { + warn!( + self.logger(), + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + _region_id: u64, + _region_epoch: kvproto::metapb::RegionEpoch, + _buckets: Vec, + _bucket_ranges: Option>, + ) { + // TODO + } + + fn update_compute_hash_result( + &self, + _region_id: u64, + _index: u64, + _context: Vec, + _hash: Vec, + ) { + // TODO + } +} + /// A router that routes messages to the raftstore pub struct RaftRouter where diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 67f2dec6160..4c317a22abd 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,5 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use raftstore::store::fsm::ApplyMetrics; + use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; #[derive(Debug)] @@ -14,4 +16,5 @@ pub struct ApplyRes { pub applied_term: u64, pub admin_result: Box<[AdminCmdResult]>, pub modifications: DataTrace, + pub metrics: ApplyMetrics, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 4c36f474ea9..faed3c0751d 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -217,7 +217,7 @@ impl PeerMsg { request: RequestSplit { epoch, split_keys, - source: source.into_boxed_str(), + source: source.into(), }, ch, }, diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index a454b0aa842..891a97b5d86 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -28,9 +28,9 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use pd_client::RpcClient; -use raft::{eraftpb::MessageType, StateRole}; +use raft::eraftpb::MessageType; use raftstore::{ - coprocessor::{RegionChangeEvent, RoleChange}, + coprocessor::CoprocessorHost, store::{ region_meta::{RegionLocalState, RegionMeta}, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, @@ -39,7 +39,7 @@ use raftstore::{ use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, LockManagerNotifier, SimpleWriteEncoder, StateStorage, StoreSystem, + Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -47,6 +47,7 @@ use test_pd::mocker::Service; use tikv_util::{ config::{ReadableDuration, VersionTrack}, store::new_peer, + worker::Worker, }; use txn_types::WriteBatchFlags; @@ -224,6 +225,7 @@ pub struct RunningState { pub cfg: Arc>, pub 
transport: TestTransport, snap_mgr: TabletSnapManager, + background: Worker, } impl RunningState { @@ -278,6 +280,12 @@ impl RunningState { let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); snap_mgr.init().unwrap(); + + let coprocessor_host = CoprocessorHost::new( + router.store_router().clone(), + raftstore::coprocessor::Config::default(), + ); + let background = Worker::new("background"); system .start( store_id, @@ -291,7 +299,8 @@ impl RunningState { snap_mgr.clone(), concurrency_manager, causal_ts_provider, - Arc::new(DummyLockManagerObserver {}), + coprocessor_host, + background.clone(), ) .unwrap(); @@ -303,6 +312,7 @@ impl RunningState { cfg, transport, snap_mgr, + background, }; (TestRouter(router), state) } @@ -311,6 +321,7 @@ impl RunningState { impl Drop for RunningState { fn drop(&mut self) { self.system.shutdown(); + self.background.stop(); } } @@ -574,11 +585,3 @@ impl Drop for Cluster { } } } - -struct DummyLockManagerObserver {} - -impl LockManagerNotifier for DummyLockManagerObserver { - fn on_region_changed(&self, _: &metapb::Region, _: RegionChangeEvent, _: StateRole) {} - - fn on_role_change(&self, _: &metapb::Region, _: RoleChange) {} -} diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index d5bc784857e..3b315a2d943 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -11,6 +11,7 @@ use kvproto::{ use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; +use txn_types::{Key, TimeStamp}; use crate::cluster::{Cluster, TestRouter}; @@ -61,6 +62,7 @@ fn split_region( split_peer: metapb::Peer, left_key: &[u8], right_key: &[u8], + propose_key: &[u8], split_key: &[u8], right_derive: bool, ) -> (metapb::Region, 
metapb::Region) { @@ -75,7 +77,7 @@ fn split_region( split_id.new_region_id = split_region_id; split_id.new_peer_ids = vec![split_peer.id]; let admin_req = - new_batch_split_region_request(vec![split_key.to_vec()], vec![split_id], right_derive); + new_batch_split_region_request(vec![propose_key.to_vec()], vec![split_id], right_derive); req.mut_requests().clear(); req.set_admin_request(admin_req); @@ -133,7 +135,7 @@ fn test_split() { // Region 1000 ["k22", ""] peer(1, 10) let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); - let (left, right) = split_region( + let (left, mut right) = split_region( router, region, peer.clone(), @@ -142,6 +144,7 @@ fn test_split() { b"k11", b"k33", b"k22", + b"k22", false, ); let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); @@ -174,6 +177,7 @@ fn test_split() { b"k00", b"k11", b"k11", + b"k11", false, ); let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); @@ -205,7 +209,7 @@ fn test_split() { .unwrap() .unwrap(); assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); - let _ = split_region( + right = split_region( router, right, new_peer(store_id, 10), @@ -214,8 +218,10 @@ fn test_split() { b"k22", b"k33", b"k33", + b"k33", false, - ); + ) + .1; let region_state = raft_engine .get_region_state(1000, u64::MAX) .unwrap() @@ -236,6 +242,21 @@ fn test_split() { "{flushed_index} >= {}", region_state.get_tablet_index() ); + + let split_key = Key::from_raw(b"k44").append_ts(TimeStamp::zero()); + let actual_split_key = split_key.clone().truncate_ts().unwrap(); + split_region( + router, + right, + new_peer(store_id, 12), + 1003, + new_peer(store_id, 13), + b"k33", + b"k55", + split_key.as_encoded(), + actual_split_key.as_encoded(), + false, + ); } // TODO: test split race with diff --git a/components/raftstore/src/coprocessor/dispatcher.rs 
b/components/raftstore/src/coprocessor/dispatcher.rs index 99228aef44c..794a46b8e3a 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -1,11 +1,11 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] called by Fsm on_ready_compute_hash -use std::{marker::PhantomData, mem, ops::Deref}; +use std::{borrow::Cow, marker::PhantomData, mem, ops::Deref}; use engine_traits::{CfName, KvEngine}; use kvproto::{ - metapb::Region, + metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, }; @@ -13,8 +13,120 @@ use protobuf::Message; use raft::eraftpb; use tikv_util::box_try; -use super::*; -use crate::store::CasualRouter; +use super::{split_observer::SplitObserver, *}; +use crate::store::BucketRange; + +/// A handle for coprocessor to schedule some command back to raftstore. +pub trait StoreHandle: Clone + Send { + fn update_approximate_size(&self, region_id: u64, size: u64); + fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ); + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ); + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ); +} + +#[derive(Clone, Debug, PartialEq)] +pub enum SchedTask { + UpdateApproximateSize { + region_id: u64, + size: u64, + }, + UpdateApproximateKeys { + region_id: u64, + keys: u64, + }, + AskSplit { + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + }, + RefreshRegionBuckets { + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + }, + UpdateComputeHashResult { + region_id: u64, + index: u64, + hash: Vec, + context: Vec, + }, +} + +impl 
StoreHandle for std::sync::mpsc::SyncSender { + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let _ = self.try_send(SchedTask::AskSplit { + region_id, + region_epoch, + split_keys, + source, + }); + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = self.try_send(SchedTask::RefreshRegionBuckets { + region_id, + region_epoch, + buckets, + bucket_ranges, + }); + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ) { + let _ = self.try_send(SchedTask::UpdateComputeHashResult { + region_id, + index, + context, + hash, + }); + } +} struct Entry { priority: u32, @@ -339,10 +451,8 @@ where } impl CoprocessorHost { - pub fn new + Clone + Send + 'static>( - ch: C, - cfg: Config, - ) -> CoprocessorHost { + pub fn new(ch: C, cfg: Config) -> CoprocessorHost { + // TODO load coprocessors from configuration let mut registry = Registry::default(); registry.register_split_check_observer( 200, @@ -357,6 +467,7 @@ impl CoprocessorHost { 400, BoxSplitCheckObserver::new(TableCheckObserver::default()), ); + registry.register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); CoprocessorHost { registry, cfg } } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 7ac783c0d6d..022a44de463 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -33,7 +33,7 @@ pub use self::{ dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, 
BoxConsistencyCheckObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, }, error::{Error, Result}, region_info_accessor::{ diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 8f572eb1f9f..fafa41e44b5 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -140,8 +140,8 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{BucketRange, CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{BucketRange, SplitCheckRunner, SplitCheckTask}, }; #[test] @@ -451,15 +451,11 @@ mod tests { )); loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - buckets, - bucket_ranges, - .. - }, - )) = rx.try_recv() + if let Ok(SchedTask::RefreshRegionBuckets { + buckets, + bucket_ranges, + .. + }) = rx.try_recv() { assert_eq!(buckets.len(), bucket_ranges.unwrap().len()); assert_eq!(buckets.len(), 5); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index e2e58933e57..92e159d233f 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -1,10 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -19,7 +14,7 @@ use super::{ size::get_approximate_split_keys, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_keys_count: u64, @@ -116,29 +111,19 @@ where } #[derive(Clone)] -pub struct KeysCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct KeysCheckObserver { + router: C, } -impl, E> KeysCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> KeysCheckObserver { - KeysCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl KeysCheckObserver { + pub fn new(router: C) -> KeysCheckObserver { + KeysCheckObserver { router } } } -impl Coprocessor for KeysCheckObserver {} +impl Coprocessor for KeysCheckObserver {} -impl + Send, E> SplitCheckObserver for KeysCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for KeysCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -172,15 +157,7 @@ where } }; - let res = CasualMessage::RegionApproximateKeys { keys: region_keys }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region keys"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + self.router.update_approximate_keys(region_id, region_keys); REGION_KEYS_HISTOGRAM.observe(region_keys as f64); // if bucket checker using scan is added, to utilize the scan, @@ -253,8 +230,8 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; fn put_data(engine: &impl KvEngine, mut start_idx: u64, end_idx: u64, fill_short_value: bool) { @@ -323,8 
+300,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -427,8 +404,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -599,8 +576,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index bdcf817365c..1f4a33d7af7 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -1,10 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -17,7 +12,7 @@ use super::{ }, calc_split_keys_count, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_size: u64, @@ -116,29 +111,19 @@ where } #[derive(Clone)] -pub struct SizeCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct SizeCheckObserver { + router: C, } -impl, E> SizeCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> SizeCheckObserver { - SizeCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl SizeCheckObserver { + pub fn new(router: C) -> SizeCheckObserver { + SizeCheckObserver { router } } } -impl Coprocessor for SizeCheckObserver {} +impl Coprocessor for SizeCheckObserver {} -impl + Send, E> SplitCheckObserver for SizeCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for SizeCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -173,15 +158,7 @@ where }; // send it to raftstore to update region approximate size - let res = CasualMessage::RegionApproximateSize { size: region_size }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region size"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + self.router.update_approximate_size(region_id, region_size); let need_bucket_checker = host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0; @@ -256,7 +233,7 @@ pub fn get_approximate_split_keys( #[cfg(test)] pub mod tests { - use std::{iter, sync::mpsc, u64}; + use std::{assert_matches::assert_matches, iter, sync::mpsc, u64}; use collections::HashSet; use engine_test::{ @@ -276,30 +253,31 @@ pub mod tests { use super::{Checker, *}; use crate::{ - coprocessor::{Config, 
CoprocessorHost, ObserverContext, SplitChecker}, - store::{BucketRange, CasualMessage, KeyEntry, SplitCheckRunner, SplitCheckTask}, + coprocessor::{ + dispatcher::SchedTask, Config, CoprocessorHost, ObserverContext, SplitChecker, + }, + store::{BucketRange, KeyEntry, SplitCheckRunner, SplitCheckTask}, }; fn must_split_at_impl( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ignore_split_keys: bool, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); if !ignore_split_keys { @@ -307,14 +285,13 @@ pub mod tests { } break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. })) => {} others => panic!("expect split check result, but got {:?}", others), } } } pub fn must_split_at( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ) { @@ -322,50 +299,36 @@ pub mod tests { } pub fn must_split_with( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys_count: usize, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. 
}) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. })) => {} others => panic!("expect split check result, but got {:?}", others), } } } - pub fn must_generate_buckets( - rx: &mpsc::Receiver<(u64, CasualMessage)>, - exp_buckets_keys: &[Vec], - ) { + pub fn must_generate_buckets(rx: &mpsc::Receiver, exp_buckets_keys: &[Vec]) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. }) = rx.try_recv() { let mut i = 0; if !exp_buckets_keys.is_empty() { let bucket = buckets.pop().unwrap(); @@ -383,23 +346,14 @@ pub mod tests { } pub fn must_generate_buckets_approximate( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, bucket_range: Option, min_leap: i32, max_leap: i32, mvcc: bool, ) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. }) = rx.try_recv() { let bucket_keys = buckets.pop().unwrap().keys; if let Some(bucket_range) = bucket_range { assert!(!bucket_keys.is_empty()); @@ -489,12 +443,7 @@ pub mod tests { None, )); // size has not reached the max_size 100 yet. - match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. 
})) => { - assert_eq!(region_id, region.get_id()); - } - others => panic!("expect recv empty, but got {:?}", others), - } + assert_matches!(rx.try_recv(), Ok(SchedTask::UpdateApproximateSize { region_id, .. }) if region_id == region.get_id()); for i in 7..11 { let s = keys::data_key(format!("{:04}", i).as_bytes()); diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index 9b5220938fd..684e87e1693 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -238,8 +238,8 @@ mod tests { use super::*; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; /// Composes table record and index prefix: `t[table_id]`. @@ -353,9 +353,9 @@ mod tests { let key = Key::from_raw(&gen_table_prefix(id)); loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. })) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), - Ok((_, CasualMessage::SplitRegion { split_keys, .. })) => { + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. }) => (), + Ok(SchedTask::AskSplit { split_keys, .. }) => { assert_eq!(split_keys, vec![key.into_encoded()]); break; } @@ -365,8 +365,8 @@ mod tests { } else { loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. })) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. 
}) => (), Err(mpsc::TryRecvError::Empty) => { break; } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index e56678edec2..6104ae7b7cf 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -7,6 +7,7 @@ #![feature(box_patterns)] #![feature(hash_drain_filter)] #![feature(let_chains)] +#![feature(assert_matches)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 1ded8be3886..0f22eb483a0 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,10 +1,14 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::borrow::Cow; + // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, Snapshot}; -use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use error_code::ErrorCodeExt; +use kvproto::{metapb, raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::SnapshotStatus; +use slog_global::warn; use tikv_util::time::ThreadReadId; use crate::{ @@ -276,3 +280,107 @@ impl RaftStoreRouter for RaftRouter { batch_system::Router::broadcast_normal(self, msg_gen) } } + +// Because `CasualRouter` needs an generic while `RaftRotuer` doesn't. We have +// to bridge two by manually implementations. Using functions to reduce +// duplicated codes. 
+ +impl crate::coprocessor::StoreHandle for RaftRouter { + fn update_approximate_size(&self, region_id: u64, size: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateSize { size }, + ) { + warn!( + "failed to send approximate region size"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateKeys { keys }, + ) { + warn!( + "failed to send approximate region keys"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::SplitRegion { + region_epoch, + split_keys, + callback: Callback::None, + source, + }, + ) { + warn!( + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::ComputeHashResult { + index, + context, + hash, + }, + ) { + warn!( + "failed to send hash compute result"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = CasualRouter::send( + self, + region_id, + CasualMessage::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + cb: Callback::None, + }, + ); + } +} diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 28c0db02eee..3cadcce5a82 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ 
b/components/raftstore/src/store/fsm/store.rs @@ -66,10 +66,7 @@ use time::{self, Timespec}; use crate::{ bytes_capacity, - coprocessor::{ - split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, RegionChangeEvent, - RegionChangeReason, - }, + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ async_io::{ read::{ReadRunner, ReadTask}, @@ -1467,7 +1464,7 @@ impl RaftBatchSystem { mgr: SnapManager, pd_worker: LazyWorker>, store_meta: Arc>, - mut coprocessor_host: CoprocessorHost, + coprocessor_host: CoprocessorHost, importer: Arc, split_check_scheduler: Scheduler, background_worker: Worker, @@ -1480,12 +1477,6 @@ impl RaftBatchSystem { ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. - - // TODO load coprocessors from configuration - coprocessor_host - .registry - .register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); - let purge_worker = if engines.raft.need_manual_purge() { let worker = Worker::new("purge-worker"); let raft_clone = engines.raft.clone(); diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index b3bd7ef32d0..fef2bae332c 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -9,8 +9,8 @@ use tikv_util::{error, info, warn, worker::Runnable}; use super::metrics::*; use crate::{ - coprocessor::CoprocessorHost, - store::{metrics::*, CasualMessage, CasualRouter}, + coprocessor::{dispatcher::StoreHandle, CoprocessorHost}, + store::metrics::*, }; /// Consistency checking task. 
@@ -44,12 +44,12 @@ impl Display for Task { } } -pub struct Runner> { +pub struct Runner { router: C, coprocessor_host: CoprocessorHost, } -impl> Runner { +impl Runner { pub fn new(router: C, cop_host: CoprocessorHost) -> Runner { Runner { router, @@ -85,18 +85,8 @@ impl> Runner { for (ctx, sum) in hashes { let mut checksum = Vec::with_capacity(4); checksum.write_u32::(sum).unwrap(); - let msg = CasualMessage::ComputeHashResult { - index, - context: ctx, - hash: checksum, - }; - if let Err(e) = self.router.send(region.get_id(), msg) { - warn!( - "failed to send hash compute result"; - "region_id" => region.get_id(), - "err" => %e, - ); - } + self.router + .update_compute_hash_result(region.get_id(), index, ctx, checksum); } timer.observe_duration(); @@ -106,7 +96,7 @@ impl> Runner { impl Runnable for Runner where EK: KvEngine, - C: CasualRouter, + C: StoreHandle, { type Task = Task; @@ -124,7 +114,7 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::{assert_matches::assert_matches, sync::mpsc, time::Duration}; use byteorder::{BigEndian, WriteBytesExt}; use engine_test::kv::{new_engine, KvTestEngine}; @@ -135,7 +125,8 @@ mod tests { use super::*; use crate::coprocessor::{ - BoxConsistencyCheckObserver, ConsistencyCheckMethod, RawConsistencyCheckObserver, + dispatcher::SchedTask, BoxConsistencyCheckObserver, ConsistencyCheckMethod, + RawConsistencyCheckObserver, }; #[test] @@ -177,21 +168,8 @@ mod tests { checksum_bytes.write_u32::(sum).unwrap(); let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - match res { - ( - region_id, - CasualMessage::ComputeHashResult { - index, - hash, - context, - }, - ) => { - assert_eq!(region_id, region.get_id()); - assert_eq!(index, 10); - assert_eq!(context, vec![0]); - assert_eq!(hash, checksum_bytes); - } - e => panic!("unexpected {:?}", e), - } + assert_matches!(res, SchedTask::UpdateComputeHashResult { region_id, index, hash, context} if + region_id == region.get_id() && index == 10 && 
context == vec![0] && hash == checksum_bytes + ); } } diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index d1c531070ac..b6bc5fca65f 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -7,15 +7,16 @@ use std::{ mem, }; -use engine_traits::{CfName, IterOptions, Iterable, Iterator, KvEngine, CF_WRITE, LARGE_CFS}; +use engine_traits::{ + CfName, IterOptions, Iterable, Iterator, KvEngine, TabletRegistry, CF_WRITE, LARGE_CFS, +}; use file_system::{IoType, WithIoType}; use itertools::Itertools; -use kvproto::{ - metapb::{Region, RegionEpoch}, - pdpb::CheckPolicy, -}; +use kvproto::{metapb::Region, pdpb::CheckPolicy}; use online_config::{ConfigChange, OnlineConfig}; -use tikv_util::{box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable}; +use tikv_util::{ + box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, +}; use txn_types::Key; use super::metrics::*; @@ -23,10 +24,10 @@ use super::metrics::*; use crate::coprocessor::Config; use crate::{ coprocessor::{ + dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, CoprocessorHost, SplitCheckerHost, }, - store::{Callback, CasualMessage, CasualRouter}, Result, }; @@ -131,10 +132,10 @@ where } } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct BucketRange(pub Vec, pub Vec); -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct Bucket { // new proposed split keys under the bucket for split // if it does not need split, it's empty @@ -219,23 +220,30 @@ impl Display for Task { } } -pub struct Runner -where - E: KvEngine, -{ - engine: E, +pub struct Runner { + // We can't just use `TabletRegistry` here, otherwise v1 may create many + // invalid records and cause other problems. 
+ engine: Either>, router: S, - coprocessor: CoprocessorHost, + coprocessor: CoprocessorHost, } -impl Runner -where - E: KvEngine, - S: CasualRouter, -{ - pub fn new(engine: E, router: S, coprocessor: CoprocessorHost) -> Runner { +impl Runner { + pub fn new(engine: EK, router: S, coprocessor: CoprocessorHost) -> Runner { Runner { - engine, + engine: Either::Left(engine), + router, + coprocessor, + } + } + + pub fn with_registry( + registry: TabletRegistry, + router: S, + coprocessor: CoprocessorHost, + ) -> Runner { + Runner { + engine: Either::Right(registry), router, coprocessor, } @@ -243,8 +251,9 @@ where fn approximate_check_bucket( &self, + tablet: &EK, region: &Region, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, bucket_ranges: Option>, ) -> Result<()> { let ranges = bucket_ranges.clone().unwrap_or_else(|| { @@ -258,7 +267,7 @@ where let mut bucket = region.clone(); bucket.set_start_key(range.0.clone()); bucket.set_end_key(range.1.clone()); - let bucket_entry = host.approximate_bucket_keys(&bucket, &self.engine)?; + let bucket_entry = host.approximate_bucket_keys(&bucket, tablet)?; debug!( "bucket_entry size {} keys count {}", bucket_entry.size, @@ -328,14 +337,11 @@ where region: &Region, bucket_ranges: Option>, ) { - let _ = self.router.send( + self.router.refresh_region_buckets( region.get_id(), - CasualMessage::RefreshRegionBuckets { - region_epoch: region.get_region_epoch().clone(), - buckets, - bucket_ranges, - cb: Callback::None, - }, + region.get_region_epoch().clone(), + buckets, + bucket_ranges, ); } @@ -350,6 +356,20 @@ where policy: CheckPolicy, bucket_ranges: Option>, ) { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let region_id = region.get_id(); let is_key_range = 
start_key.is_some() && end_key.is_some(); let start_key = if is_key_range { @@ -372,9 +392,9 @@ where "policy" => ?policy, ); CHECK_SPILT_COUNTER.all.inc(); - let mut host = - self.coprocessor - .new_split_checker_host(region, &self.engine, auto_split, policy); + let mut host = self + .coprocessor + .new_split_checker_host(region, tablet, auto_split, policy); if host.skip() { debug!("skip split check"; @@ -390,6 +410,7 @@ where CheckPolicy::Scan => { match self.scan_split_keys( &mut host, + tablet, region, is_key_range, &start_key, @@ -408,11 +429,11 @@ where } } } - CheckPolicy::Approximate => match host.approximate_split_keys(region, &self.engine) { + CheckPolicy::Approximate => match host.approximate_split_keys(region, tablet) { Ok(keys) => { if host.enable_region_bucket() { if let Err(e) = - self.approximate_check_bucket(region, &mut host, bucket_ranges) + self.approximate_check_bucket(tablet, region, &mut host, bucket_ranges) { error!(%e; "approximate_check_bucket failed"; @@ -437,6 +458,7 @@ where ); match self.scan_split_keys( &mut host, + tablet, region, is_key_range, &start_key, @@ -461,12 +483,8 @@ where if !split_keys.is_empty() { let region_epoch = region.get_region_epoch().clone(); - let msg = new_split_region(region_epoch, split_keys, "split checker"); - let res = self.router.send(region_id, msg); - if let Err(e) = res { - warn!("failed to send check result"; "region_id" => region_id, "err" => %e); - } - + self.router + .ask_split(region_id, region_epoch, split_keys, "split checker".into()); CHECK_SPILT_COUNTER.success.inc(); } else { debug!( @@ -484,7 +502,8 @@ where /// If it's Some(vec![]), skip generating buckets. 
fn scan_split_keys( &self, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, + tablet: &EK, region: &Region, is_key_range: bool, start_key: &[u8], @@ -505,12 +524,8 @@ where (!host.enable_region_bucket(), &empty_bucket) }; - MergedIterator::<::Iterator>::new( - &self.engine, - LARGE_CFS, - start_key, - end_key, - false, + MergedIterator::<::Iterator>::new( + tablet, LARGE_CFS, start_key, end_key, false, ) .map(|mut iter| { let mut size = 0; @@ -595,14 +610,8 @@ where "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateSize { size }, - ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateKeys { keys }, - ); + self.router.update_approximate_size(region.get_id(), size); + self.router.update_approximate_keys(region.get_id(), keys); })?; if host.enable_region_bucket() { @@ -632,10 +641,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where - E: KvEngine, - S: CasualRouter, + EK: KvEngine, + S: StoreHandle, { type Task = Task; fn run(&mut self, task: Task) { @@ -659,13 +668,28 @@ where Task::ChangeConfig(c) => self.change_cfg(c), Task::ApproximateBuckets(region) => { if self.coprocessor.cfg.enable_region_bucket { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let mut host = self.coprocessor.new_split_checker_host( ®ion, - &self.engine, + tablet, false, CheckPolicy::Approximate, ); - if let Err(e) = self.approximate_check_bucket(®ion, &mut host, None) { + if let Err(e) = self.approximate_check_bucket(tablet, ®ion, &mut host, None) + { error!(%e; "approximate_check_bucket failed"; "region_id" => region.get_id(), @@ -678,19 +702,3 @@ where } } } - -fn new_split_region( - 
region_epoch: RegionEpoch, - split_keys: Vec>, - source: &'static str, -) -> CasualMessage -where - E: KvEngine, -{ - CasualMessage::SplitRegion { - region_epoch, - split_keys, - callback: Callback::None, - source: source.into(), - } -} diff --git a/src/server/lock_manager/deadlock.rs b/src/server/lock_manager/deadlock.rs index a9a31c68b8f..9583df80dd6 100644 --- a/src/server/lock_manager/deadlock.rs +++ b/src/server/lock_manager/deadlock.rs @@ -28,7 +28,6 @@ use raftstore::{ }, store::util::is_region_initialized, }; -use raftstore_v2::LockManagerNotifier; use security::SecurityManager; use tikv_util::{ future::paired_future_callback, @@ -525,7 +524,7 @@ const LEADER_KEY: &[u8] = b""; /// way to change the node from the leader of deadlock detector to follower, and /// vice versa. #[derive(Clone)] -pub struct RoleChangeNotifier { +pub(crate) struct RoleChangeNotifier { /// The id of the valid leader region. // raftstore.coprocessor needs it to be Sync + Send. leader_region_id: Arc>, @@ -607,18 +606,6 @@ impl RegionChangeObserver for RoleChangeNotifier { } } -impl LockManagerNotifier for RoleChangeNotifier { - fn on_role_change(&self, region: &Region, role_change: RoleChange) { - let mut ctx = ObserverContext::new(region); - RoleObserver::on_role_change(self, &mut ctx, &role_change); - } - - fn on_region_changed(&self, region: &Region, event: RegionChangeEvent, role: StateRole) { - let mut ctx = ObserverContext::new(region); - RegionChangeObserver::on_region_changed(self, &mut ctx, event, role); - } -} - struct Inner { /// The role of the deadlock detector. Default is `Role::Follower`. 
role: Role, diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index 44c31fcab1e..243d533a0e5 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -205,11 +205,6 @@ impl LockManager { role_change_notifier.register(host); } - /// Creates a `RoleChangeNotifier` of the deadlock detector worker - pub fn new_notifier(&self) -> RoleChangeNotifier { - RoleChangeNotifier::new(self.detector_scheduler.clone()) - } - /// Creates a `DeadlockService` to handle deadlock detect requests from /// other nodes. pub fn deadlock_service(&self) -> DeadlockService { diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index 57bc575ff05..b3a445a1f7e 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -7,8 +7,11 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; -use raftstore::store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::{router::RaftRouter, Bootstrap, LockManagerNotifier, StoreSystem}; +use raftstore::{ + coprocessor::CoprocessorHost, + store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, +}; +use raftstore_v2::{router::RaftRouter, Bootstrap, StoreSystem}; use slog::{info, o, Logger}; use tikv_util::{config::VersionTrack, worker::Worker}; @@ -85,7 +88,8 @@ where snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 - lock_manager_observer: Arc, + coprocessor_host: CoprocessorHost, + background: Worker, ) -> Result<()> where T: Transport + 'static, @@ -126,7 +130,8 @@ where snap_mgr, concurrency_manager, causal_ts_provider, - lock_manager_observer, + coprocessor_host, + background, )?; Ok(()) @@ -173,7 +178,8 @@ where snap_mgr: TabletSnapManager, concurrency_manager: 
ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 - lock_manager_observer: Arc, + coprocessor_host: CoprocessorHost, + background: Worker, ) -> Result<()> where T: Transport + 'static, @@ -199,7 +205,8 @@ where snap_mgr, concurrency_manager, causal_ts_provider, - lock_manager_observer, + coprocessor_host, + background, )?; Ok(()) } From a499caf0d809e530f46ac8125ae07817328145d0 Mon Sep 17 00:00:00 2001 From: lance6716 Date: Thu, 22 Dec 2022 19:34:55 +0800 Subject: [PATCH 423/676] import: log L0 SST ranges when too many files in ingest (#13979) close tikv/tikv#13980 Signed-off-by: lance6716 Co-authored-by: Ti Chi Robot --- components/engine_panic/src/misc.rs | 4 ++ components/engine_rocks/src/misc.rs | 87 +++++++++++++++++++++++++++- components/engine_traits/src/misc.rs | 2 + src/import/sst_service.rs | 11 ++++ 4 files changed, 103 insertions(+), 1 deletion(-) diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 730f44a7e2f..561d2892ca9 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -48,6 +48,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + panic!() + } + fn get_engine_used_size(&self) -> Result { panic!() } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 4761183546e..75b193bdcf9 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -238,6 +238,24 @@ impl MiscExt for RocksEngine { Ok(false) } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + let handle = util::get_cf_handle(self.as_inner(), cf)?; + let ret = self + .as_inner() + .get_column_family_meta_data(handle) + .get_level(level) + .get_files() + .iter() + .map(|sst_meta| { + ( + sst_meta.get_smallestkey().to_vec(), + sst_meta.get_largestkey().to_vec(), + ) + }) + .collect(); + Ok(ret) + } + fn 
get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; for cf in ALL_CFS { @@ -333,7 +351,8 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, + CompactExt, DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, + ALL_CFS, }; use tempfile::Builder; @@ -581,4 +600,70 @@ mod tests { .unwrap(); check_data(&db, &[cf], kvs_left.as_slice()); } + + #[test] + fn test_get_sst_key_ranges() { + let path = Builder::new() + .prefix("test_get_sst_key_ranges") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + opts.enable_multi_batch_write(true); + + let mut cf_opts = RocksCfOptions::default(); + // Prefix extractor(trim the timestamp at tail) for write cf. + cf_opts + .set_prefix_extractor( + "FixedSuffixSliceTransform", + crate::util::FixedSuffixSliceTransform::new(8), + ) + .unwrap_or_else(|err| panic!("{:?}", err)); + // Create prefix bloom filter for memtable. 
+ cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + let cf = "default"; + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![ + (b"k1", b"v1"), + (b"k2", b"v2"), + (b"k6", b"v3"), + (b"k7", b"v4"), + ]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k7".to_vec())]; + assert_eq!(sst_range, expected); + + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"v1"), (b"k4", b"v2"), (b"k8", b"v3")]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![ + (b"k3".to_vec(), b"k8".to_vec()), + (b"k1".to_vec(), b"k7".to_vec()), + ]; + assert_eq!(sst_range, expected); + + db.compact_range(cf, None, None, false, 1).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + assert_eq!(sst_range.len(), 0); + let sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k8".to_vec())]; + assert_eq!(sst_range, expected); + } } diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index edfea511d35..a7679256f21 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -84,6 +84,8 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result; + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>>; + /// Gets total used size of rocksdb engine, including: /// * total size (bytes) of all SST files. /// * total size (bytes) of active and unflushed immutable memtables. 
diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index bdb552e8923..8ce6f9961fb 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -185,6 +185,17 @@ where .ingest_maybe_slowdown_writes(CF_WRITE) .expect("cf") { + match self.engine.get_sst_key_ranges(CF_WRITE, 0) { + Ok(l0_sst_ranges) => { + warn!( + "sst ingest is too slow"; + "sst_ranges" => ?l0_sst_ranges, + ); + } + Err(e) => { + error!("get sst key ranges failed"; "err" => ?e); + } + } let mut errorpb = errorpb::Error::default(); let err = "too many sst files are ingesting"; let mut server_is_busy_err = errorpb::ServerIsBusy::default(); From 90505f52857faf7ac993c8ae493bce5b2fdc270d Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 23 Dec 2022 13:34:14 +0800 Subject: [PATCH 424/676] *: support start with raftkv2 (#13981) ref tikv/tikv#12842 Not all functionality are supported, this is just a naive pure KV system with transaction support. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + cmd/tikv-server/src/main.rs | 10 +- components/raftstore-v2/src/batch/store.rs | 40 +- components/raftstore-v2/src/lib.rs | 1 + components/raftstore-v2/src/worker/pd/mod.rs | 30 +- .../src/worker/pd/update_max_timestamp.rs | 3 +- .../tests/integrations/cluster.rs | 4 +- components/raftstore/src/store/snap.rs | 29 + components/server/Cargo.toml | 1 + components/server/src/lib.rs | 1 + components/server/src/server.rs | 12 +- components/server/src/server2.rs | 1759 +++++++++++++++++ components/test_raftstore/src/server.rs | 8 +- components/tikv_util/src/sys/mod.rs | 19 +- src/server/raftkv2/node.rs | 75 +- src/server/server.rs | 44 +- src/server/service/kv.rs | 7 +- src/server/status_server/mod.rs | 61 +- tests/integrations/server/status_server.rs | 8 +- 19 files changed, 1980 insertions(+), 133 deletions(-) create mode 100644 components/server/src/server2.rs diff --git a/Cargo.lock b/Cargo.lock index 67ca50ba1ec..cf53d09da09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-5226,6 +5226,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", + "raftstore-v2", "rand 0.8.5", "resolved_ts", "resource_metering", diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index b366cd7849f..1d846d72bdb 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -7,7 +7,10 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; use serde_json::{Map, Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; -use tikv::config::{to_flatten_config_info, TikvConfig}; +use tikv::{ + config::{to_flatten_config_info, TikvConfig}, + storage::config::EngineType, +}; fn main() { let build_timestamp = option_env!("TIKV_BUILD_TIME"); @@ -207,5 +210,8 @@ fn main() { process::exit(0); } - server::server::run_tikv(config); + match config.storage.engine { + EngineType::RaftKv => server::server::run_tikv(config), + EngineType::RaftKv2 => server::server2::run_tikv(config), + } } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 642f6e745f0..0d5f984107c 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -35,7 +35,7 @@ use tikv_util::{ sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, - worker::{Scheduler, Worker}, + worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, }; @@ -373,7 +373,7 @@ pub struct Schedulers { struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, - pd: Worker, + pd: LazyWorker, async_write: StoreWriters, // Following is not maintained by raftstore itself. 
@@ -381,10 +381,10 @@ struct Workers { } impl Workers { - fn new(background: Worker) -> Self { + fn new(background: Worker, pd: LazyWorker) -> Self { Self { async_read: Worker::new("async-read-worker"), - pd: Worker::new("pd-worker"), + pd, async_write: StoreWriters::default(), background, } @@ -415,6 +415,7 @@ impl StoreSystem { causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, background: Worker, + pd_worker: LazyWorker, ) -> Result<()> where T: Transport + 'static, @@ -428,7 +429,7 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let mut workers = Workers::new(background); + let mut workers = Workers::new(background, pd_worker); workers .async_write .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; @@ -437,21 +438,18 @@ impl StoreSystem { read_runner.set_snap_mgr(snap_mgr.clone()); let read_scheduler = workers.async_read.start("async-read-worker", read_runner); - let pd_scheduler = workers.pd.start( - "pd-worker", - pd::Runner::new( - store_id, - pd_client, - raft_engine.clone(), - tablet_registry.clone(), - router.clone(), - workers.pd.remote(), - concurrency_manager, - causal_ts_provider, - self.logger.clone(), - self.shutdown.clone(), - ), - ); + workers.pd.start(pd::Runner::new( + store_id, + pd_client, + raft_engine.clone(), + tablet_registry.clone(), + router.clone(), + workers.pd.remote(), + concurrency_manager, + causal_ts_provider, + self.logger.clone(), + self.shutdown.clone(), + )); let split_check_scheduler = workers.background.start( "split-check", @@ -464,7 +462,7 @@ impl StoreSystem { let schedulers = Schedulers { read: read_scheduler, - pd: pd_scheduler, + pd: workers.pd.scheduler(), write: workers.async_write.senders(), split_check: split_check_scheduler, }; diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index d8327549da6..7ddb1687d91 100644 --- a/components/raftstore-v2/src/lib.rs +++ 
b/components/raftstore-v2/src/lib.rs @@ -41,3 +41,4 @@ pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; +pub use worker::pd::{FlowReporter, Task as PdTask}; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 18b01a8026a..cc977e68236 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -11,9 +11,12 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; -use raftstore::store::{util::KeysInfoFormatter, TxnExt}; +use raftstore::store::{util::KeysInfoFormatter, FlowStatsReporter, ReadStats, TxnExt, WriteStats}; use slog::{error, info, Logger}; -use tikv_util::{time::UnixSecs, worker::Runnable}; +use tikv_util::{ + time::UnixSecs, + worker::{Runnable, Scheduler}, +}; use yatp::{task::future::TaskCell, Remote}; use crate::{ @@ -206,6 +209,29 @@ where } } +#[derive(Clone)] +pub struct FlowReporter { + _scheduler: Scheduler, +} + +impl FlowReporter { + pub fn new(scheduler: Scheduler) -> Self { + FlowReporter { + _scheduler: scheduler, + } + } +} + +impl FlowStatsReporter for FlowReporter { + fn report_read_stats(&self, _read_stats: ReadStats) { + // TODO + } + + fn report_write_stats(&self, _write_stats: WriteStats) { + // TODO + } +} + mod requests { use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs index cbfecb8171d..0de3fb9a87c 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -7,7 +7,6 @@ use std::{ use 
causal_ts::CausalTsProvider; use engine_traits::{KvEngine, RaftEngine}; -use fail::fail_point; use futures::{compat::Future01CompatExt, FutureExt}; use pd_client::PdClient; use raftstore::{store::TxnExt, Result}; @@ -96,7 +95,7 @@ where #[cfg(feature = "failpoints")] let delay = (|| { - fail_point!("delay_update_max_ts", |_| true); + fail::fail_point!("delay_update_max_ts", |_| true); false })(); #[cfg(not(feature = "failpoints"))] diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 891a97b5d86..ca166eab950 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -47,7 +47,7 @@ use test_pd::mocker::Service; use tikv_util::{ config::{ReadableDuration, VersionTrack}, store::new_peer, - worker::Worker, + worker::{LazyWorker, Worker}, }; use txn_types::WriteBatchFlags; @@ -286,6 +286,7 @@ impl RunningState { raftstore::coprocessor::Config::default(), ); let background = Worker::new("background"); + let pd_worker = LazyWorker::new("pd-worker"); system .start( store_id, @@ -301,6 +302,7 @@ impl RunningState { causal_ts_provider, coprocessor_host, background.clone(), + pd_worker, ) .unwrap(); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 8cb44e3718c..04aef985e3b 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1984,6 +1984,35 @@ impl TabletSnapManager { true } } + + pub fn total_snap_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(&self.base)? { + let entry = match entry { + Ok(e) => e, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + + let path = entry.path(); + // Generated snapshots are just checkpoints, only counts received snapshots. 
+ if !path + .file_name() + .and_then(|n| n.to_str()) + .map_or(true, |n| n.starts_with(SNAP_REV_PREFIX)) + { + continue; + } + for e in file_system::read_dir(path)? { + match e.and_then(|e| e.metadata()) { + Ok(m) => total_size += m.len(), + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + } + } + } + Ok(total_size) + } } #[cfg(test)] diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index b27846ad5a3..acdca09b29c 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -66,6 +66,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.8" resolved_ts = { workspace = true } resource_metering = { workspace = true } diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 5107a20eeab..d5c8e352a88 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -12,4 +12,5 @@ pub mod setup; pub mod memory; pub mod raft_engine_switch; pub mod server; +pub mod server2; pub mod signal_handler; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ffc5272c673..d7a05fff115 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -123,6 +123,7 @@ use tikv_util::{ thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + Either, }; use tokio::runtime::Builder; @@ -959,9 +960,9 @@ where ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), self.resolver.clone().unwrap(), - snap_mgr.clone(), + Either::Left(snap_mgr.clone()), gc_worker.clone(), - check_leader_scheduler, + Some(check_leader_scheduler), self.env.clone(), unified_read_pool, debug_thread_pool, @@ -1649,7 +1650,7 @@ where 
self.config.server.status_thread_pool_size, self.cfg_controller.take().unwrap(), Arc::new(self.config.security.clone()), - self.router.clone(), + self.engines.as_ref().unwrap().engine.raft_extension(), self.store_path.clone(), ) { Ok(status_server) => Box::new(status_server), @@ -1951,13 +1952,12 @@ fn get_lock_dir() -> String { /// A small trait for components which can be trivially stopped. Lets us keep /// a list of these in `TiKV`, rather than storing each component individually. -trait Stop { +pub(crate) trait Stop { fn stop(self: Box); } -impl Stop for StatusServer +impl Stop for StatusServer where - E: 'static, R: 'static + Send, { fn stop(self: Box) { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs new file mode 100644 index 00000000000..cfda8feb233 --- /dev/null +++ b/components/server/src/server2.rs @@ -0,0 +1,1759 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module startups all the components of a TiKV server. +//! +//! It is responsible for reading from configs, starting up the various server +//! components, and handling errors (mostly by aborting and reporting to the +//! user). +//! +//! The entry point is `run_tikv`. +//! +//! Components are often used to initialize other components, and/or must be +//! explicitly stopped. We keep these components in the `TikvServer` struct. 
+ +use std::{ + cmp, + collections::HashMap, + env, + net::SocketAddr, + path::{Path, PathBuf}, + str::FromStr, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + mpsc, Arc, + }, + time::Duration, + u64, +}; + +use api_version::{dispatch_api_version, KvFormat}; +use causal_ts::CausalTsProviderImpl; +use concurrency_manager::ConcurrencyManager; +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::{ + flush_engine_statistics, + raw::{Cache, Env}, + FlowInfo, RocksEngine, RocksStatistics, +}; +use engine_traits::{ + CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, + RaftEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, +}; +use error_code::ErrorCodeExt; +use file_system::{ + get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor, + MetricsManager as IoMetricsManager, +}; +use futures::executor::block_on; +use grpcio::{EnvBuilder, Environment}; +use grpcio_health::HealthService; +use kvproto::{deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion}; +use pd_client::{PdClient, RpcClient}; +use raft_log_engine::RaftLogEngine; +use raftstore::{ + coprocessor::{ + BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, + RawConsistencyCheckObserver, + }, + store::{memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, SplitConfigManager, TabletSnapManager}, + RegionInfoAccessor, +}; +use security::SecurityManager; +use tikv::{ + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, + coprocessor_v2, + read_pool::{build_yatp_read_pool, ReadPool}, + server::{ + config::{Config as ServerConfig, ServerConfigManager}, + gc_worker::{AutoGcConfig, GcWorker}, + lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, + resolve, + service::DiagnosticsService, + status_server::StatusServer, + KvEngineFactoryBuilder, NodeV2, 
RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, + GRPC_THREAD_PREFIX, + }, + storage::{ + self, + config_manager::StorageConfigManger, + mvcc::MvccConsistencyCheckObserver, + txn::flow_controller::{FlowController, TabletFlowController}, + Engine, Storage, + }, +}; +use tikv_util::{ + check_environment_variables, + config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, + math::MovingAvgU32, + metrics::INSTANCE_BACKEND_CPU_QUOTA, + quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, + sys::{ + cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, + SysQuota, + }, + thread_group::GroupProperties, + time::{Instant, Monitor}, + worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + Either, +}; +use tokio::runtime::Builder; + +use crate::{ + memory::*, raft_engine_switch::*, server::Stop, setup::*, signal_handler, + tikv_util::sys::thread::ThreadBuildWrapper, +}; + +// minimum number of core kept for background requests +const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; +// max ratio of core quota for background requests +const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; +// default ratio of core quota for background requests = core_number * 0.5 +const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; +// indication of TiKV instance is short of cpu +const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; +// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) +const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; +// pace of cpu quota adjustment +const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu + +#[inline] +fn run_impl(config: TikvConfig) { + let mut tikv = TikvServer::::init::(config); + + // Must be called after `TikvServer::init`. 
+ let memory_limit = tikv.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; + register_memory_usage_high_water(high_water); + + tikv.check_conflict_addr(); + tikv.init_fs(); + tikv.init_yatp(); + tikv.init_encryption(); + let fetcher = tikv.init_io_utility(); + let listener = tikv.init_flow_receiver(); + let (raft_engine, engines_info) = tikv.init_raw_engines(listener); + tikv.init_engines(raft_engine); + let server_config = tikv.init_servers::(); + tikv.register_services(); + tikv.init_metrics_flusher(fetcher, engines_info); + tikv.init_storage_stats_task(); + tikv.run_server(server_config); + tikv.run_status_server(); + tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); + + // TODO: support signal dump stats + signal_handler::wait_for_signal( + None as Option>, + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); + tikv.stop(); +} + +/// Run a TiKV server. Returns when the server is shutdown by the user, in which +/// case the server will be properly stopped. +pub fn run_tikv(config: TikvConfig) { + // Sets the global logger ASAP. + // It is okay to use the config w/o `validate()`, + // because `initial_logger()` handles various conditions. + initial_logger(&config); + + // Print version information. + let build_timestamp = option_env!("TIKV_BUILD_TIME"); + tikv::log_tikv_info(build_timestamp); + + // Print resource quota. + SysQuota::log_quota(); + CPU_CORES_QUOTA_GAUGE.set(SysQuota::cpu_cores_quota()); + + // Do some prepare works before start. 
+ pre_start(); + + let _m = Monitor::default(); + + dispatch_api_version!(config.storage.api_version(), { + if !config.raft_engine.enable { + run_impl::(config) + } else { + run_impl::(config) + } + }) +} + +const RESERVED_OPEN_FDS: u64 = 1000; + +const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); +const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); +const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); +const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); +const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); + +/// A complete TiKV server. +struct TikvServer { + config: TikvConfig, + cfg_controller: Option, + security_mgr: Arc, + pd_client: Arc, + flow_info_sender: Option>, + flow_info_receiver: Option>, + node: Option>, + resolver: Option, + store_path: PathBuf, + snap_mgr: Option, // Will be filled in `init_servers`. + encryption_key_manager: Option>, + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + servers: Option>, + region_info_accessor: Option, + coprocessor_host: Option>, + to_stop: Vec>, + lock_files: Vec, + concurrency_manager: ConcurrencyManager, + env: Arc, + background_worker: Worker, + sst_worker: Option>>, + quota_limiter: Arc, + causal_ts_provider: Option>, // used for rawkv apiv2 + tablet_registry: Option>, +} + +struct TikvEngines { + raft_engine: ER, + engine: RaftKv2, +} + +struct Servers { + lock_mgr: LockManager, + server: LocalServer, +} + +type LocalServer = Server>; + +impl TikvServer +where + ER: RaftEngine, +{ + fn init(mut config: TikvConfig) -> TikvServer { + tikv_util::thread_group::set_properties(Some(GroupProperties::default())); + // It is okay use pd config and security config before `init_config`, + // because these configs must be provided by command line, and only + // used during startup process. 
+ let security_mgr = Arc::new( + SecurityManager::new(&config.security) + .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), + ); + let env = Arc::new( + EnvBuilder::new() + .cq_count(config.server.grpc_concurrency) + .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .build(), + ); + let pd_client = + Self::connect_to_pd_cluster(&mut config, env.clone(), Arc::clone(&security_mgr)); + + // Initialize and check config + let cfg_controller = Self::init_config(config); + let config = cfg_controller.get_current(); + + let store_path = Path::new(&config.storage.data_dir).to_owned(); + + let thread_count = config.server.background_thread_count; + let background_worker = WorkerBuilder::new("background") + .thread_count(thread_count) + .create(); + + // Initialize concurrency manager + let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); + let concurrency_manager = ConcurrencyManager::new(latest_ts); + + // use different quota for front-end and back-end requests + let quota_limiter = Arc::new(QuotaLimiter::new( + config.quota.foreground_cpu_time, + config.quota.foreground_write_bandwidth, + config.quota.foreground_read_bandwidth, + config.quota.background_cpu_time, + config.quota.background_write_bandwidth, + config.quota.background_read_bandwidth, + config.quota.max_delay_duration, + config.quota.enable_auto_tune, + )); + + let mut causal_ts_provider = None; + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + pd_client.clone(), + config.causal_ts.renew_interval.0, + config.causal_ts.alloc_ahead_buffer.0, + config.causal_ts.renew_batch_min_size, + config.causal_ts.renew_batch_max_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + causal_ts_provider = Some(Arc::new(tso.unwrap().into())); + info!("Causal timestamp provider startup."); + } + + TikvServer { + config, + cfg_controller: Some(cfg_controller), + security_mgr, + 
pd_client, + node: None, + resolver: None, + store_path, + snap_mgr: None, + encryption_key_manager: None, + engines: None, + kv_statistics: None, + raft_statistics: None, + servers: None, + region_info_accessor: None, + coprocessor_host: None, + to_stop: vec![], + lock_files: vec![], + concurrency_manager, + env, + background_worker, + flow_info_sender: None, + flow_info_receiver: None, + sst_worker: None, + quota_limiter, + causal_ts_provider, + tablet_registry: None, + } + } + + /// Initialize and check the config + /// + /// Warnings are logged and fatal errors exist. + /// + /// # Fatal errors + /// + /// - If `dynamic config` feature is enabled and failed to register config + /// to PD + /// - If some critical configs (like data dir) are differrent from last run + /// - If the config can't pass `validate()` + /// - If the max open file descriptor limit is not high enough to support + /// the main database and the raft database. + fn init_config(mut config: TikvConfig) -> ConfigController { + validate_and_persist_config(&mut config, true); + + ensure_dir_exist(&config.storage.data_dir).unwrap(); + if !config.rocksdb.wal_dir.is_empty() { + ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); + } + if config.raft_engine.enable { + ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); + } else { + ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); + if !config.raftdb.wal_dir.is_empty() { + ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); + } + } + + check_system_config(&config); + + tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); + + info!( + "using config"; + "config" => serde_json::to_string(&config).unwrap(), + ); + if config.panic_when_unexpected_key_or_data { + info!("panic-when-unexpected-key-or-data is on"); + tikv_util::set_panic_when_unexpected_key_or_data(true); + } + + config.write_into_metrics(); + + ConfigController::new(config) + } + + fn connect_to_pd_cluster( + config: &mut TikvConfig, + env: Arc, + 
security_mgr: Arc, + ) -> Arc { + let pd_client = Arc::new( + RpcClient::new(&config.pd, Some(env), security_mgr) + .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), + ); + + let cluster_id = pd_client + .get_cluster_id() + .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); + if cluster_id == DEFAULT_CLUSTER_ID { + fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); + } + config.server.cluster_id = cluster_id; + info!( + "connect to PD cluster"; + "cluster_id" => cluster_id + ); + + pd_client + } + + fn check_conflict_addr(&mut self) { + let cur_addr: SocketAddr = self + .config + .server + .addr + .parse() + .expect("failed to parse into a socket address"); + let cur_ip = cur_addr.ip(); + let cur_port = cur_addr.port(); + let lock_dir = get_lock_dir(); + + let search_base = env::temp_dir().join(lock_dir); + file_system::create_dir_all(&search_base) + .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); + + for entry in file_system::read_dir(&search_base).unwrap().flatten() { + if !entry.file_type().unwrap().is_file() { + continue; + } + let file_path = entry.path(); + let file_name = file_path.file_name().unwrap().to_str().unwrap(); + if let Ok(addr) = file_name.replace('_', ":").parse::() { + let ip = addr.ip(); + let port = addr.port(); + if cur_port == port + && (cur_ip == ip || cur_ip.is_unspecified() || ip.is_unspecified()) + { + let _ = try_lock_conflict_addr(file_path); + } + } + } + + let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); + let cur_file = try_lock_conflict_addr(cur_path); + self.lock_files.push(cur_file); + } + + fn init_fs(&mut self) { + let lock_path = self.store_path.join(Path::new("LOCK")); + + let f = File::create(lock_path.as_path()) + .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); + if f.try_lock_exclusive().is_err() { + fatal!( + "lock {} failed, maybe another instance is using this directory.", + self.store_path.display() 
+ ); + } + self.lock_files.push(f); + + if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { + fatal!( + "panic_mark_file {} exists, there must be something wrong with the db. \ + Do not remove the panic_mark_file and force the TiKV node to restart. \ + Please contact TiKV maintainers to investigate the issue. \ + If needed, use scale in and scale out to replace the TiKV node. \ + https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", + tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() + ); + } + + // We truncate a big file to make sure that both raftdb and kvdb of TiKV have + // enough space to do compaction and region migration when TiKV recover. + // This file is created in data_dir rather than db_path, because we must not + // increase store size of db_path. + fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. 
+ if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_stats.total_space(); + if self.config.raft_store.capacity.0 > 0 { + capacity = cmp::min(capacity, self.config.raft_store.capacity.0); + } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + disk_stats.available_space(), + kv_reserved_size, + ); + + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir + } else { + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); + } + } + + fn init_yatp(&self) { + yatp::metrics::set_namespace(Some("tikv")); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + 
prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); + } + + fn init_encryption(&mut self) { + self.encryption_key_manager = data_key_manager_from_config( + &self.config.security.encryption, + &self.config.storage.data_dir, + ) + .map_err(|e| { + panic!( + "Encryption failed to initialize: {}. code: {}", + e, + e.error_code() + ) + }) + .unwrap() + .map(Arc::new); + } + + fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { + let (tx, rx) = mpsc::channel(); + self.flow_info_sender = Some(tx.clone()); + self.flow_info_receiver = Some(rx); + engine_rocks::FlowListener::new(tx) + } + + fn init_engines(&mut self, raft_engine: ER) { + let tablet_registry = self.tablet_registry.clone().unwrap(); + let mut node = NodeV2::new( + &self.config.server, + self.pd_client.clone(), + None, + tablet_registry, + ); + node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router(); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + } + + fn init_gc_worker(&mut self) -> GcWorker> { + let engines = self.engines.as_ref().unwrap(); + let gc_worker = GcWorker::new( + engines.engine.clone(), + self.flow_info_sender.take().unwrap(), + self.config.gc.clone(), + self.pd_client.feature_gate().clone(), + Arc::new(self.region_info_accessor.clone().unwrap()), + ); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Gc, 
+ Box::new(gc_worker.get_config_manager()), + ); + + gc_worker + } + + fn init_servers(&mut self) -> Arc> { + let flow_controller = Arc::new(FlowController::Tablet(TabletFlowController::new( + &self.config.storage.flow_control, + self.tablet_registry.clone().unwrap(), + self.flow_info_receiver.take().unwrap(), + ))); + let mut gc_worker = self.init_gc_worker(); + let ttl_checker = Box::new(LazyWorker::new("ttl-checker")); + let ttl_scheduler = ttl_checker.scheduler(); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + + cfg_controller.register( + tikv::config::Module::Quota, + Box::new(QuotaLimitConfigManager::new(Arc::clone( + &self.quota_limiter, + ))), + ); + + cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + + let lock_mgr = LockManager::new(&self.config.pessimistic_txn); + cfg_controller.register( + tikv::config::Module::PessimisticTxn, + Box::new(lock_mgr.config_manager()), + ); + lock_mgr.register_detector_role_change_observer(self.coprocessor_host.as_mut().unwrap()); + + let engines = self.engines.as_ref().unwrap(); + + let pd_worker = LazyWorker::new("pd-worker"); + let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); + + let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + Some(build_yatp_read_pool( + &self.config.readpool.unified, + pd_sender.clone(), + engines.engine.clone(), + )) + } else { + None + }; + + // The `DebugService` and `DiagnosticsService` will share the same thread pool + let props = tikv_util::thread_group::current_properties(); + let debug_thread_pool = Arc::new( + Builder::new_multi_thread() + .thread_name(thd_name!("debugger")) + .worker_threads(1) + .after_start_wrapper(move || { + tikv_alloc::add_thread_memory_accessor(); + tikv_util::thread_group::set_properties(props.clone()); + }) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) + .build() + .unwrap(), + ); + + // Start resource metering. 
+ let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = + resource_metering::init_recorder(self.config.resource_metering.precision.as_millis()); + self.to_stop.push(recorder_worker); + let (reporter_notifier, data_sink_reg_handle, reporter_worker) = + resource_metering::init_reporter( + self.config.resource_metering.clone(), + collector_reg_handle, + ); + self.to_stop.push(reporter_worker); + let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( + self.config.resource_metering.receiver_address.clone(), + self.env.clone(), + data_sink_reg_handle, + ); + self.to_stop.push(single_target_worker); + + let cfg_manager = resource_metering::ConfigManager::new( + self.config.resource_metering.clone(), + recorder_notifier, + reporter_notifier, + address_change_notifier, + ); + cfg_controller.register( + tikv::config::Module::ResourceMetering, + Box::new(cfg_manager), + ); + + let storage_read_pool_handle = if self.config.readpool.storage.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let storage_read_pools = ReadPool::from(storage::build_read_pool( + &self.config.readpool.storage, + pd_sender.clone(), + engines.engine.clone(), + )); + storage_read_pools.handle() + }; + + let storage = Storage::<_, _, F>::from_engine( + engines.engine.clone(), + &self.config.storage, + storage_read_pool_handle, + lock_mgr.clone(), + self.concurrency_manager.clone(), + lock_mgr.get_storage_dynamic_configs(), + flow_controller.clone(), + pd_sender.clone(), + resource_tag_factory.clone(), + Arc::clone(&self.quota_limiter), + self.pd_client.feature_gate().clone(), + self.causal_ts_provider.clone(), + ) + .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); + cfg_controller.register( + tikv::config::Module::Storage, + Box::new(StorageConfigManger::new( + self.tablet_registry.as_ref().unwrap().clone(), + ttl_scheduler, + flow_controller, + storage.get_scheduler(), + )), + ); + + 
let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.background_worker, + storage.get_engine().raft_extension(), + ); + self.resolver = Some(resolver); + + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) + .register(self.coprocessor_host.as_mut().unwrap()); + + // Create snapshot manager, server. + let snap_path = self + .store_path + .join(Path::new("tablet_snap")) + .to_str() + .unwrap() + .to_owned(); + + let snap_mgr = TabletSnapManager::new(snap_path); + + // Create coprocessor endpoint. + let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( + &self.config.readpool.coprocessor, + pd_sender, + engines.engine.clone(), + )); + cop_read_pools.handle() + }; + + let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); + + self.config + .raft_store + .validate( + self.config.coprocessor.region_split_size, + self.config.coprocessor.enable_region_bucket, + self.config.coprocessor.region_bucket_size, + ) + .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); + let raft_store = Arc::new(VersionTrack::new(self.config.raft_store.clone())); + let health_service = HealthService::default(); + + let node = self.node.as_ref().unwrap(); + + self.snap_mgr = Some(snap_mgr.clone()); + // Create server + let server = Server::new( + node.id(), + &server_config, + &self.security_mgr, + storage, + coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + Arc::clone(&self.quota_limiter), + ), + coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), + self.resolver.clone().unwrap(), + Either::Right(snap_mgr.clone()), + gc_worker.clone(), + None, + self.env.clone(), + unified_read_pool, + debug_thread_pool, + health_service, + ) + .unwrap_or_else(|e| 
fatal!("failed to create server: {}", e)); + cfg_controller.register( + tikv::config::Module::Server, + Box::new(ServerConfigManager::new( + server.get_snap_worker_scheduler(), + server_config.clone(), + server.get_grpc_mem_quota().clone(), + )), + ); + + let split_config_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); + cfg_controller.register(tikv::config::Module::Split, Box::new(split_config_manager)); + + // `ConsistencyCheckObserver` must be registered before `Node::start`. + let safe_point = Arc::new(AtomicU64::new(0)); + let observer = match self.config.coprocessor.consistency_check_method { + ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( + MvccConsistencyCheckObserver::new(safe_point.clone()), + ), + ConsistencyCheckMethod::Raw => { + BoxConsistencyCheckObserver::new(RawConsistencyCheckObserver::default()) + } + }; + self.coprocessor_host + .as_mut() + .unwrap() + .registry + .register_consistency_check_observer(100, observer); + + self.node + .as_mut() + .unwrap() + .start( + engines.raft_engine.clone(), + server.transport(), + snap_mgr, + self.concurrency_manager.clone(), + self.causal_ts_provider.clone(), + self.coprocessor_host.clone().unwrap(), + self.background_worker.clone(), + pd_worker, + raft_store, + &state, + ) + .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); + + // Start auto gc. Must after `Node::start` because `node_id` is initialized + // there. 
+ let store_id = self.node.as_ref().unwrap().id(); + let auto_gc_config = AutoGcConfig::new( + self.pd_client.clone(), + self.region_info_accessor.clone().unwrap(), + store_id, + ); + gc_worker + .start(store_id) + .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); + if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { + fatal!("failed to start auto_gc on storage, error: {}", e); + } + + initial_metric(&self.config.metric); + + self.servers = Some(Servers { lock_mgr, server }); + + server_config + } + + fn register_services(&mut self) { + let servers = self.servers.as_mut().unwrap(); + + // Create Diagnostics service + let diag_service = DiagnosticsService::new( + servers.server.get_debug_thread_pool().clone(), + self.config.log.file.filename.clone(), + self.config.slow_log_file.clone(), + ); + if servers + .server + .register_service(create_diagnostics(diag_service)) + .is_some() + { + fatal!("failed to register diagnostics service"); + } + + // Lock manager. 
+ if servers + .server + .register_service(create_deadlock(servers.lock_mgr.deadlock_service())) + .is_some() + { + fatal!("failed to register deadlock service"); + } + + servers + .lock_mgr + .start( + self.node.as_ref().unwrap().id(), + self.pd_client.clone(), + self.resolver.clone().unwrap(), + self.security_mgr.clone(), + &self.config.pessimistic_txn, + ) + .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + } + + fn init_io_utility(&mut self) -> BytesFetcher { + let stats_collector_enabled = file_system::init_io_stats_collector() + .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) + .is_ok(); + + let limiter = Arc::new( + self.config + .storage + .io_rate_limit + .build(!stats_collector_enabled /* enable_statistics */), + ); + let fetcher = if stats_collector_enabled { + BytesFetcher::FromIoStatsCollector() + } else { + BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) + }; + // Set up IO limiter even when rate limit is disabled, so that rate limits can + // be dynamically applied later on. + set_io_rate_limiter(Some(limiter)); + fetcher + } + + fn init_metrics_flusher( + &mut self, + fetcher: BytesFetcher, + engines_info: Arc, + ) { + let mut engine_metrics = EngineMetricsManager::::new( + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().raft_engine.clone(), + self.raft_statistics.clone(), + ); + let mut io_metrics = IoMetricsManager::new(fetcher); + let engines_info_clone = engines_info.clone(); + + // region_id -> (suffix, tablet) + // `update` of EnginesResourceInfo is called perodically which needs this map + // for recording the latest tablet for each region. + // `cached_latest_tablets` is passed to `update` to avoid memory + // allocation each time when calling `update`. 
+ let mut cached_latest_tablets = HashMap::default(); + self.background_worker + .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { + let now = Instant::now(); + engine_metrics.flush(now); + io_metrics.flush(now); + engines_info_clone.update(now, &mut cached_latest_tablets); + }); + if let Some(limiter) = get_io_rate_limiter() { + limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); + } + + let mut mem_trace_metrics = MemoryTraceManager::default(); + mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); + mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); + self.background_worker + .spawn_interval_task(DEFAULT_MEMTRACE_FLUSH_INTERVAL, move || { + let now = Instant::now(); + mem_trace_metrics.flush(now); + }); + } + + // Only background cpu quota tuning is implemented at present. iops and frontend + // quota tuning is on the way + fn init_quota_tuning_task(&self, quota_limiter: Arc) { + // No need to do auto tune when capacity is really low + if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO + < BACKGROUND_REQUEST_CORE_LOWER_BOUND + { + return; + }; + + // Determine the base cpu quota + let base_cpu_quota = + // if cpu quota is not specified, start from optimistic case + if quota_limiter.cputime_limiter(false).is_infinite() { + 1000_f64 + * f64::max( + BACKGROUND_REQUEST_CORE_LOWER_BOUND, + SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, + ) + } else { + quota_limiter.cputime_limiter(false) / 1000_f64 + }; + + // Calculate the celling and floor quota + let celling_quota = f64::min( + base_cpu_quota * 2.0, + 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, + ); + let floor_quota = f64::max( + base_cpu_quota * 0.5, + 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, + ); + + let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); + self.background_worker.spawn_interval_task( + DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, + move || { + if 
quota_limiter.auto_tune_enabled() { + let cputime_limit = quota_limiter.cputime_limiter(false); + let old_quota = if cputime_limit.is_infinite() { + base_cpu_quota + } else { + cputime_limit / 1000_f64 + }; + let cpu_usage = match proc_stats.cpu_usage() { + Ok(r) => r, + Err(_e) => 0.0, + }; + // Try tuning quota when cpu_usage is correctly collected. + // rule based tuning: + // - if instance is busy, shrink cpu quota for analyze by one quota pace until + // lower bound is hit; + // - if instance cpu usage is healthy, no op; + // - if instance is idle, increase cpu quota by one quota pace until upper + // bound is hit. + if cpu_usage > 0.0f64 { + let mut target_quota = old_quota; + + let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); + if cpu_util >= SYSTEM_BUSY_THRESHOLD { + target_quota = + f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); + } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { + target_quota = + f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); + } + + if old_quota != target_quota { + quota_limiter.set_cpu_time_limit(target_quota as usize, false); + debug!( + "cpu_time_limiter tuned for backend request"; + "cpu_util" => ?cpu_util, + "new quota" => ?target_quota); + INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); + } + } + } + }, + ); + } + + fn init_storage_stats_task(&self) { + let config_disk_capacity: u64 = self.config.raft_store.capacity.0; + let data_dir = self.config.storage.data_dir.clone(); + let store_path = self.store_path.clone(); + let snap_mgr = self.snap_mgr.clone().unwrap(); + let reserve_space = disk::get_disk_reserved_space(); + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { + info!("disk space checker not enabled"); + return; + } + let raft_engine = self.engines.as_ref().unwrap().raft_engine.clone(); + let tablet_registry = self.tablet_registry.clone().unwrap(); + let raft_path = raft_engine.get_engine_path().to_string(); + 
let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; + + let almost_full_threshold = reserve_space; + let already_full_threshold = reserve_space / 2; + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } + self.background_worker + .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { + let disk_stats = match fs2::statvfs(&store_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv path" => store_path.to_str(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let snap_size = snap_mgr.total_snap_size().unwrap(); + + let mut kv_size = 0; + tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + + let raft_size = raft_engine + .get_engine_size() + .expect("get raft engine size"); + + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = 
cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } + let placeholer_file_path = PathBuf::from_str(&data_dir) + .unwrap() + .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); + + let placeholder_size: u64 = + file_system::get_file_size(placeholer_file_path).unwrap_or(0); + + let used_size = if !separated_raft_mount_path { + snap_size + kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; + let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { + disk_cap + } else { + config_disk_capacity + }; + + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + available = cmp::min(available, disk_stats.available_space()); + + let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. 
+ let cur_kv_disk_status = if available <= already_full_threshold { + disk::DiskUsage::AlreadyFull + } else if available <= almost_full_threshold { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + if prev_disk_status != cur_disk_status { + warn!( + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", + prev_disk_status, + cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, + available, + snap_size, + kv_size, + raft_size, + capacity + ); + } + disk::set_disk_status(cur_disk_status); + }) + } + + fn init_sst_recovery_sender(&mut self) -> Option> { + if !self + .config + .storage + .background_error_recovery_window + .is_zero() + { + let sst_worker = Box::new(LazyWorker::new("sst-recovery")); + let scheduler = sst_worker.scheduler(); + self.sst_worker = Some(sst_worker); + Some(scheduler) + } else { + None + } + } + + fn run_server(&mut self, server_config: Arc>) { + let server = self.servers.as_mut().unwrap(); + server + .server + .build_and_bind() + .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); + server + .server + .start(server_config, self.security_mgr.clone()) + .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); + } + + fn run_status_server(&mut self) { + // Create a status server. 
+ let status_enabled = !self.config.server.status_addr.is_empty(); + if status_enabled { + let mut status_server = match StatusServer::new( + self.config.server.status_thread_pool_size, + self.cfg_controller.take().unwrap(), + Arc::new(self.config.security.clone()), + self.engines.as_ref().unwrap().engine.raft_extension(), + self.store_path.clone(), + ) { + Ok(status_server) => Box::new(status_server), + Err(e) => { + error_unknown!(%e; "failed to start runtime for status service"); + return; + } + }; + // Start the status server. + if let Err(e) = status_server.start(self.config.server.status_addr.clone()) { + error_unknown!(%e; "failed to bind addr for status service"); + } else { + self.to_stop.push(status_server); + } + } + } + + fn stop(mut self) { + tikv_util::thread_group::mark_shutdown(); + let mut servers = self.servers.unwrap(); + servers + .server + .stop() + .unwrap_or_else(|e| fatal!("failed to stop server: {}", e)); + + self.node.as_mut().unwrap().stop(); + self.region_info_accessor.as_mut().unwrap().stop(); + + servers.lock_mgr.stop(); + + if let Some(sst_worker) = self.sst_worker { + sst_worker.stop_worker(); + } + + self.to_stop.into_iter().for_each(|s| s.stop()); + } +} + +pub trait ConfiguredRaftEngine: RaftEngine { + fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>); + fn as_rocks_engine(&self) -> Option<&RocksEngine>; + fn register_config(&self, _cfg_controller: &mut ConfigController); +} + +impl ConfiguredRaftEngine for T { + default fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>) { + unimplemented!() + } + default fn as_rocks_engine(&self) -> Option<&RocksEngine> { + None + } + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} +} + +impl ConfiguredRaftEngine for RocksEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = 
RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_engine.config().dir, + &config.raft_store.raftdb_path, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_db_path = &config.raft_store.raftdb_path; + let config_raftdb = &config.raftdb; + let mut raft_db_opts = config_raftdb.build_opt(); + raft_db_opts.set_env(env.clone()); + let statistics = Arc::new(RocksStatistics::new_titan()); + raft_db_opts.set_statistics(statistics.as_ref()); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); + + if should_dump { + let raft_engine = + RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) + .expect("failed to open raft engine for migration"); + dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); + raft_engine.stop(); + drop(raft_engine); + raft_data_state_machine.after_dump_data(); + } + (raftdb, Some(statistics)) + } + + fn as_rocks_engine(&self) -> Option<&RocksEngine> { + Some(self) + } + + fn register_config(&self, cfg_controller: &mut ConfigController) { + cfg_controller.register( + tikv::config::Module::Raftdb, + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + ); + } +} + +impl ConfiguredRaftEngine for RaftLogEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_store.raftdb_path, + &config.raft_engine.config().dir, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_config = config.raft_engine.config(); + let raft_engine = + RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) + .expect("failed to open raft engine"); + + if should_dump { + let config_raftdb = &config.raftdb; + let mut raft_db_opts = 
config_raftdb.build_opt(); + raft_db_opts.set_env(env.clone()); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt( + &config.raft_store.raftdb_path, + raft_db_opts, + raft_cf_opts, + ) + .expect("failed to open raftdb for migration"); + dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); + raftdb.stop(); + drop(raftdb); + raft_data_state_machine.after_dump_data(); + } + (raft_engine, None) + } +} + +impl TikvServer { + fn init_raw_engines( + &mut self, + flow_listener: engine_rocks::FlowListener, + ) -> (CER, Arc) { + let block_cache = self.config.storage.block_cache.build_shared_cache(); + let env = self + .config + .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) + .unwrap(); + + // Create raft engine + let (raft_engine, raft_statistics) = CER::build( + &self.config, + &env, + &self.encryption_key_manager, + &block_cache, + ); + self.raft_statistics = raft_statistics; + + // Create kv engine. + let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) + .sst_recovery_sender(self.init_sst_recovery_sender()) + .flow_listener(flow_listener); + let factory = Box::new(builder.build()); + self.kv_statistics = Some(factory.rocks_statistics()); + let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) + .unwrap_or_else(|e| fatal!("failed to create tablet registry {:?}", e)); + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Rocksdb, + Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + ); + self.tablet_registry = Some(registry.clone()); + raft_engine.register_config(cfg_controller); + + let engines_info = Arc::new(EnginesResourceInfo::new( + registry, + raft_engine.as_rocks_engine().cloned(), + 180, // max_samples_to_preserve + )); + + (raft_engine, engines_info) + } +} + +/// Various sanity-checks and logging before running a server. 
+/// +/// Warnings are logged. +/// +/// # Logs +/// +/// The presence of these environment variables that affect the database +/// behavior is logged. +/// +/// - `GRPC_POLL_STRATEGY` +/// - `http_proxy` and `https_proxy` +/// +/// # Warnings +/// +/// - if `net.core.somaxconn` < 32768 +/// - if `net.ipv4.tcp_syncookies` is not 0 +/// - if `vm.swappiness` is not 0 +/// - if data directories are not on SSDs +/// - if the "TZ" environment variable is not set on unix +fn pre_start() { + check_environment_variables(); + for e in tikv_util::config::check_kernel() { + warn!( + "check: kernel"; + "err" => %e + ); + } +} + +fn check_system_config(config: &TikvConfig) { + info!("beginning system configuration check"); + let mut rocksdb_max_open_files = config.rocksdb.max_open_files; + if config.rocksdb.titan.enabled { + // Titan engine maintains yet another pool of blob files and uses the same max + // number of open files setup as rocksdb does. So we double the max required + // open files here + rocksdb_max_open_files *= 2; + } + if let Err(e) = tikv_util::config::check_max_open_fds( + RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, + ) { + fatal!("{}", e); + } + + // Check RocksDB data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { + warn!( + "check: rocksdb-data-dir"; + "path" => &config.storage.data_dir, + "err" => %e + ); + } + // Check raft data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { + warn!( + "check: raftdb-path"; + "path" => &config.raft_store.raftdb_path, + "err" => %e + ); + } +} + +fn try_lock_conflict_addr>(path: P) -> File { + let f = File::create(path.as_ref()).unwrap_or_else(|e| { + fatal!( + "failed to create lock at {}: {}", + path.as_ref().display(), + e + ) + }); + + if f.try_lock_exclusive().is_err() { + fatal!( + "{} already in use, maybe another instance is binding with this address.", + 
path.as_ref().file_name().unwrap().to_str().unwrap() + ); + } + f +} + +#[cfg(unix)] +fn get_lock_dir() -> String { + format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) +} + +#[cfg(not(unix))] +fn get_lock_dir() -> String { + "TIKV_LOCK_FILES".to_owned() +} + +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + last_reset: Instant, +} + +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { + EngineMetricsManager { + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, + last_reset: Instant::now(), + } + } + + pub fn flush(&mut self, now: Instant) { + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } + if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } + self.last_reset = now; + } + } +} + +pub struct EnginesResourceInfo { + tablet_registry: TabletRegistry, + raft_engine: Option, + latest_normalized_pending_bytes: AtomicU32, + normalized_pending_bytes_collector: MovingAvgU32, +} + +impl EnginesResourceInfo { + const SCALE_FACTOR: u64 = 100; + + fn new( + tablet_registry: TabletRegistry, + raft_engine: Option, + max_samples_to_preserve: usize, + ) -> Self { + EnginesResourceInfo { + tablet_registry, 
+ raft_engine, + latest_normalized_pending_bytes: AtomicU32::new(0), + normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), + } + } + + pub fn update( + &self, + _now: Instant, + cached_latest_tablets: &mut HashMap>, + ) { + let mut normalized_pending_bytes = 0; + + fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { + if let Ok(cf_opts) = engine.get_options_cf(cf) { + if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { + if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { + *normalized_pending_bytes = std::cmp::max( + *normalized_pending_bytes, + (b * EnginesResourceInfo::SCALE_FACTOR + / cf_opts.get_soft_pending_compaction_bytes_limit()) + as u32, + ); + } + } + } + } + + if let Some(raft_engine) = &self.raft_engine { + fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); + } + + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); + + // todo(SpadeA): Now, there's a potential race condition problem where the + // tablet could be destroyed after the clone and before the fetching + // which could result in programme panic. It's okay now as the single global + // kv_engine will not be destroyed in normal operation and v2 is not + // ready for operation. Furthermore, this race condition is general to v2 as + // tablet clone is not a case exclusively happened here. We should + // propose another PR to tackle it such as destory tablet lazily in a GC + // thread. + + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; + for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { + fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); + } + } + + // Clear ensures that these tablets are not hold forever. 
+ cached_latest_tablets.clear(); + + let (_, avg) = self + .normalized_pending_bytes_collector + .add(normalized_pending_bytes); + self.latest_normalized_pending_bytes.store( + std::cmp::max(normalized_pending_bytes, avg), + Ordering::Relaxed, + ); + } +} + +impl IoBudgetAdjustor for EnginesResourceInfo { + fn adjust(&self, total_budgets: usize) -> usize { + let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 + / Self::SCALE_FACTOR as f32; + // Two reasons for adding `sqrt` on top: + // 1) In theory the convergence point is independent of the value of pending + // bytes (as long as backlog generating rate equals consuming rate, which is + // determined by compaction budgets), a convex helps reach that point while + // maintaining low level of pending bytes. + // 2) Variance of compaction pending bytes grows with its magnitude, a filter + // with decreasing derivative can help balance such trend. + let score = score.sqrt(); + // The target global write flow slides between Bandwidth / 2 and Bandwidth. 
+ let score = 0.5 + score / 2.0; + (total_budgets as f32 * score) as usize + } +} + +#[cfg(test)] +mod test { + use std::{ + collections::HashMap, + sync::{atomic::Ordering, Arc}, + }; + + use engine_rocks::raw::Env; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, + }; + use tempfile::Builder; + use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; + use tikv_util::{config::ReadableSize, time::Instant}; + + use super::EnginesResourceInfo; + + #[test] + fn test_engines_resource_info_update() { + let mut config = TikvConfig::default(); + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.writecf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + let env = Arc::new(Env::default()); + let path = Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config.storage.block_cache.build_shared_cache(); + + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); + + for i in 1..6 { + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); + } + + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); + // Prepare some data for two tablets of the same region. So we can test whether + // we fetch the bytes from the latest one. 
+ for i in 1..21 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let old_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); + tablet = cached.latest().unwrap(); + + for i in 1..11 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let new_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); + + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); + + let mut cached_latest_tablets = HashMap::default(); + engines_info.update(Instant::now(), &mut cached_latest_tablets); + + // The memory allocation should be reserved + assert!(cached_latest_tablets.capacity() >= 5); + // The tablet cache should be cleared + assert!(cached_latest_tablets.is_empty()); + + // The latest_normalized_pending_bytes should be equal to the pending compaction + // bytes of tablet_1_20 + assert_eq!( + (new_pending_compaction_bytes * 100) as u32, + engines_info + .latest_normalized_pending_bytes + .load(Ordering::Relaxed) + ); + } +} diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 12d9982fea6..2521347ec18 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -523,9 +523,9 @@ impl ServerCluster { copr.clone(), copr_v2.clone(), resolver.clone(), - snap_mgr.clone(), + tikv_util::Either::Left(snap_mgr.clone()), gc_worker.clone(), - check_leader_scheduler.clone(), + Some(check_leader_scheduler.clone()), self.env.clone(), None, debug_thread_pool.clone(), @@ -795,6 +795,10 @@ impl Cluster { } panic!("failed to get snapshot of region {}", region_id); } + + 
pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { + self.sim.rl().storages[&node_id].raft_extension() + } } pub fn new_server_cluster(id: u64, count: usize) -> Cluster { diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 35d417db650..49e6812b81f 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -9,9 +9,10 @@ pub mod ioload; pub mod thread; // re-export some traits for ease of use -#[cfg(target_os = "linux")] -use std::path::PathBuf; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::{ + path::Path, + sync::atomic::{AtomicU64, Ordering}, +}; use fail::fail_point; #[cfg(target_os = "linux")] @@ -162,13 +163,13 @@ pub fn cache_line_size(level: usize) -> Option { } #[cfg(target_os = "linux")] -pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { - if path1.is_empty() || path2.is_empty() { +pub fn path_in_diff_mount_point(path1: impl AsRef, path2: impl AsRef) -> bool { + let (path1, path2) = (path1.as_ref(), path2.as_ref()); + let empty_path = |p: &Path| p.to_str().map_or(false, |s| s.is_empty()); + if empty_path(path1) || empty_path(path2) { return false; } - let path1 = PathBuf::from(path1); - let path2 = PathBuf::from(path2); - match (get_mount(&path1), get_mount(&path2)) { + match (get_mount(path1), get_mount(path2)) { (Err(e1), _) => { warn!("Get mount point error for path {}, {}", path1.display(), e1); false @@ -190,7 +191,7 @@ pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { } #[cfg(not(target_os = "linux"))] -pub fn path_in_diff_mount_point(_path1: &str, _path2: &str) -> bool { +pub fn path_in_diff_mount_point(_path1: impl AsRef, _path2: impl AsRef) -> bool { false } diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index b3a445a1f7e..bcfd542035b 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -11,9 +11,12 @@ use raftstore::{ coprocessor::CoprocessorHost, 
store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, }; -use raftstore_v2::{router::RaftRouter, Bootstrap, StoreSystem}; +use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreSystem}; use slog::{info, o, Logger}; -use tikv_util::{config::VersionTrack, worker::Worker}; +use tikv_util::{ + config::VersionTrack, + worker::{LazyWorker, Worker}, +}; use crate::server::{node::init_store, Result}; @@ -21,13 +24,10 @@ use crate::server::{node::init_store, Result}; pub struct NodeV2 { cluster_id: u64, store: metapb::Store, - store_cfg: Arc>, - system: StoreSystem, + system: Option<(RaftRouter, StoreSystem)>, has_started: bool, pd_client: Arc, - state: Arc>, - bg_worker: Worker, registry: TabletRegistry, logger: Logger, } @@ -40,12 +40,8 @@ where { /// Creates a new Node. pub fn new( - system: StoreSystem, cfg: &crate::server::Config, - store_cfg: Arc>, pd_client: Arc, - state: Arc>, - bg_worker: Worker, store: Option, registry: TabletRegistry, ) -> NodeV2 { @@ -54,18 +50,19 @@ where NodeV2 { cluster_id: cfg.cluster_id, store, - store_cfg, pd_client, - system, + system: None, has_started: false, - state, - bg_worker, registry, logger: slog_global::borrow_global().new(o!()), } } - pub fn try_bootstrap_store(&mut self, raft_engine: &ER) -> Result<()> { + pub fn try_bootstrap_store( + &mut self, + cfg: &raftstore_v2::Config, + raft_engine: &ER, + ) -> Result<()> { let store_id = Bootstrap::new( raft_engine, self.cluster_id, @@ -74,9 +71,19 @@ where ) .bootstrap_store()?; self.store.set_id(store_id); + let (router, system) = + raftstore_v2::create_store_batch_system(cfg, store_id, self.logger.clone()); + self.system = Some(( + RaftRouter::new(store_id, self.registry.clone(), router), + system, + )); Ok(()) } + pub fn router(&self) -> &RaftRouter { + &self.system.as_ref().unwrap().0 + } + /// Starts the Node. It tries to bootstrap cluster if the cluster is not /// bootstrapped yet. 
Then it spawns a thread to run the raftstore in /// background. @@ -84,19 +91,21 @@ where &mut self, raft_engine: ER, trans: T, - router: &RaftRouter, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, background: Worker, + pd_worker: LazyWorker, + store_cfg: Arc>, + state: &Mutex, ) -> Result<()> where T: Transport + 'static, { let store_id = self.id(); { - let mut meta = router.store_meta().lock().unwrap(); + let mut meta = self.router().store_meta().lock().unwrap(); meta.store_id = Some(store_id); } if let Some(region) = Bootstrap::new( @@ -121,17 +130,18 @@ where // Put store only if the cluster is bootstrapped. info!(self.logger, "put store to PD"; "store" => ?&self.store); let status = self.pd_client.put_store(self.store.clone())?; - self.load_all_stores(status); + self.load_all_stores(state, status); self.start_store( raft_engine, trans, - router, snap_mgr, concurrency_manager, causal_ts_provider, coprocessor_host, background, + pd_worker, + store_cfg, )?; Ok(()) @@ -142,6 +152,10 @@ where self.store.get_id() } + pub fn logger(&self) -> Logger { + self.logger.clone() + } + /// Gets a copy of Store which is registered to Pd. pub fn store(&self) -> metapb::Store { self.store.clone() @@ -153,13 +167,17 @@ where // Do we really need to do the check giving we don't consider support upgrade // ATM? 
- fn load_all_stores(&mut self, status: Option) { + fn load_all_stores( + &mut self, + state: &Mutex, + status: Option, + ) { info!(self.logger, "initializing replication mode"; "status" => ?status, "store_id" => self.store.id); let stores = match self.pd_client.get_all_stores(false) { Ok(stores) => stores, Err(e) => panic!("failed to load all stores: {:?}", e), }; - let mut state = self.state.lock().unwrap(); + let mut state = state.lock().unwrap(); if let Some(s) = status { state.set_status(s); } @@ -174,12 +192,13 @@ where &mut self, raft_engine: ER, trans: T, - router: &RaftRouter, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, background: Worker, + pd_worker: LazyWorker, + store_cfg: Arc>, ) -> Result<()> where T: Transport + 'static, @@ -191,11 +210,12 @@ where return Err(box_err!("{} is already started", store_id)); } self.has_started = true; - let cfg = self.store_cfg.clone(); - self.system.start( + let (router, system) = self.system.as_mut().unwrap(); + + system.start( store_id, - cfg, + store_cfg, raft_engine, self.registry.clone(), trans, @@ -207,6 +227,7 @@ where causal_ts_provider, coprocessor_host, background, + pd_worker, )?; Ok(()) } @@ -214,8 +235,8 @@ where /// Stops the Node. 
pub fn stop(&mut self) { let store_id = self.store.get_id(); + let Some((_, mut system)) = self.system.take() else { return }; info!(self.logger, "stop raft store thread"; "store_id" => store_id); - self.system.shutdown(); - self.bg_worker.stop(); + system.shutdown(); } } diff --git a/src/server/server.rs b/src/server/server.rs index 428aee31090..22ab1682309 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -13,7 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::store::{CheckLeaderTask, SnapManager}; +use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager}; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -39,7 +39,7 @@ use crate::{ coprocessor::Endpoint, coprocessor_v2, read_pool::ReadPool, - server::{gc_worker::GcWorker, Proxy}, + server::{gc_worker::GcWorker, tablet_snap::TabletRunner, Proxy}, storage::{lock_manager::LockManager, Engine, Storage}, tikv_util::sys::thread::ThreadBuildWrapper, }; @@ -67,7 +67,7 @@ pub struct Server { trans: ServerTransport, raft_router: E::RaftExtension, // For sending/receiving snapshots. - snap_mgr: SnapManager, + snap_mgr: Either, snap_worker: LazyWorker, // Currently load statistics is done in the thread. 
@@ -94,9 +94,9 @@ where copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, resolver: S, - snap_mgr: SnapManager, + snap_mgr: Either, gc_worker: GcWorker, - check_leader_scheduler: Scheduler, + check_leader_scheduler: Option>, env: Arc, yatp_read_pool: Option, debug_thread_pool: Arc, @@ -252,14 +252,28 @@ where cfg: Arc>, security_mgr: Arc, ) -> Result<()> { - let snap_runner = SnapHandler::new( - Arc::clone(&self.env), - self.snap_mgr.clone(), - self.raft_router.clone(), - security_mgr, - Arc::clone(&cfg), - ); - self.snap_worker.start(snap_runner); + match self.snap_mgr.clone() { + Either::Left(mgr) => { + let snap_runner = SnapHandler::new( + self.env.clone(), + mgr, + self.raft_router.clone(), + security_mgr, + cfg, + ); + self.snap_worker.start(snap_runner); + } + Either::Right(mgr) => { + let snap_runner = TabletRunner::new( + self.env.clone(), + mgr, + self.raft_router.clone(), + security_mgr, + cfg, + ); + self.snap_worker.start(snap_runner); + } + } let mut grpc_server = self.builder_or_server.take().unwrap().right().unwrap(); info!("listening on addr"; "addr" => &self.local_addr); @@ -564,9 +578,9 @@ mod tests { quick_fail: Arc::clone(&quick_fail), addr: Arc::clone(&addr), }, - SnapManager::new(""), + Either::Left(SnapManager::new("")), gc_worker, - check_leader_scheduler, + Some(check_leader_scheduler), env, None, debug_thread_pool, diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 88ed0c99443..68a200b045e 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -65,6 +65,7 @@ const GRPC_MSG_NOTIFY_SIZE: usize = 8; pub struct Service { store_id: u64, /// Used to handle requests related to GC. + // TODO: make it Some after GC is supported for v2. gc_worker: GcWorker, // For handling KV requests. storage: Storage, @@ -75,7 +76,7 @@ pub struct Service { // For handling snapshot. snap_scheduler: Scheduler, // For handling `CheckLeader` request. 
- check_leader_scheduler: Scheduler, + check_leader_scheduler: Option>, enable_req_batch: bool, @@ -114,7 +115,7 @@ impl Service { copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, snap_scheduler: Scheduler, - check_leader_scheduler: Scheduler, + check_leader_scheduler: Option>, grpc_thread_load: Arc, enable_req_batch: bool, proxy: Proxy, @@ -908,6 +909,7 @@ impl Tikv for Service { let (cb, resp) = paired_future_callback(); let check_leader_scheduler = self.check_leader_scheduler.clone(); let task = async move { + let Some(check_leader_scheduler) = check_leader_scheduler else { return Err(box_err!("check leader is not supported")) }; check_leader_scheduler .schedule(CheckLeaderTask::CheckLeader { leaders, cb }) .map_err(|e| Error::Other(format!("{}", e).into()))?; @@ -945,6 +947,7 @@ impl Tikv for Service { let (cb, resp) = paired_future_callback(); let check_leader_scheduler = self.check_leader_scheduler.clone(); let task = async move { + let Some(check_leader_scheduler) = check_leader_scheduler else { return Err(box_err!("check leader is not supported")) }; check_leader_scheduler .schedule(CheckLeaderTask::GetStoreTs { key_range, cb }) .map_err(|e| Error::Other(format!("{}", e).into()))?; diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 78302550fd5..2f87c5d0264 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -4,7 +4,6 @@ mod profile; use std::{ error::Error as StdError, - marker::PhantomData, net::SocketAddr, path::PathBuf, pin::Pin, @@ -16,7 +15,6 @@ use std::{ use async_stream::stream; use collections::HashMap; -use engine_traits::KvEngine; use flate2::{write::GzEncoder, Compression}; use futures::{ compat::{Compat01As03, Stream01CompatExt}, @@ -45,10 +43,10 @@ pub use profile::{ read_file, start_one_cpu_profile, start_one_heap_profile, }; use prometheus::TEXT_FORMAT; -use raftstore::store::{transport::CasualRouter, CasualMessage}; use regex::Regex; use security::{self, SecurityConfig}; 
use serde_json::Value; +use tikv_kv::RaftExtension; use tikv_util::{ logger::set_log_level, metrics::{dump, dump_to}, @@ -82,7 +80,7 @@ struct LogLevelRequest { pub log_level: LogLevel, } -pub struct StatusServer { +pub struct StatusServer { thread_pool: Runtime, tx: Sender<()>, rx: Option>, @@ -91,12 +89,10 @@ pub struct StatusServer { router: R, security_config: Arc, store_path: PathBuf, - _snap: PhantomData, } -impl StatusServer +impl StatusServer where - E: 'static, R: 'static + Send, { pub fn new( @@ -124,7 +120,6 @@ where router, security_config, store_path, - _snap: PhantomData, }) } @@ -423,10 +418,9 @@ where } } -impl StatusServer +impl StatusServer where - E: KvEngine, - R: 'static + Send + CasualRouter + Clone, + R: 'static + Send + RaftExtension + Clone, { pub async fn dump_region_meta(req: Request, router: R) -> hyper::Result> { lazy_static! { @@ -451,33 +445,18 @@ where )); } }; - let (tx, rx) = oneshot::channel(); - match router.send( - id, - CasualMessage::AccessPeer(Box::new(move |meta| { - if let Err(meta) = tx.send(meta) { - error!("receiver dropped, region meta: {:?}", meta) - } - })), - ) { - Ok(_) => (), - Err(raftstore::Error::RegionNotFound(_)) => { + let f = router.query_region(id); + let meta = match f.await { + Ok(meta) => meta, + Err(tikv_kv::Error(box tikv_kv::ErrorInner::Request(header))) + if header.has_region_not_found() => + { return not_found(format!("region({}) not found", id)); } Err(err) => { return Ok(make_response( StatusCode::INTERNAL_SERVER_ERROR, - format!("channel pending or disconnect: {}", err), - )); - } - } - - let meta = match rx.await { - Ok(meta) => meta, - Err(_) => { - return Ok(make_response( - StatusCode::INTERNAL_SERVER_ERROR, - "query cancelled", + format!("query failed: {}", err), )); } }; @@ -938,17 +917,21 @@ mod tests { use std::{env, io::Read, path::PathBuf, sync::Arc}; use collections::HashSet; - use engine_test::kv::KvTestEngine; use flate2::read::GzDecoder; - use futures::{executor::block_on, 
future::ok, prelude::*}; + use futures::{ + executor::block_on, + future::{ok, BoxFuture}, + prelude::*, + }; use http::header::{HeaderValue, ACCEPT_ENCODING}; use hyper::{body::Buf, client::HttpConnector, Body, Client, Method, Request, StatusCode, Uri}; use hyper_openssl::HttpsConnector; use online_config::OnlineConfig; use openssl::ssl::{SslConnector, SslFiletype, SslMethod}; - use raftstore::store::{transport::CasualRouter, CasualMessage}; + use raftstore::store::region_meta::RegionMeta; use security::SecurityConfig; use test_util::new_security_cfg; + use tikv_kv::RaftExtension; use tikv_util::logger::get_log_level; use crate::{ @@ -959,9 +942,9 @@ mod tests { #[derive(Clone)] struct MockRouter; - impl CasualRouter for MockRouter { - fn send(&self, region_id: u64, _: CasualMessage) -> raftstore::Result<()> { - Err(raftstore::Error::RegionNotFound(region_id)) + impl RaftExtension for MockRouter { + fn query_region(&self, region_id: u64) -> BoxFuture<'static, tikv_kv::Result> { + Box::pin(async move { Err(raftstore::Error::RegionNotFound(region_id).into()) }) } } diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index 455465d87cb..929a7c286ae 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -5,9 +5,8 @@ use std::{error::Error, net::SocketAddr, sync::Arc}; use hyper::{body, Client, StatusCode, Uri}; use raftstore::store::region_meta::RegionMeta; use security::SecurityConfig; -use test_raftstore::{new_server_cluster, Simulator}; +use test_raftstore::new_server_cluster; use tikv::{config::ConfigController, server::status_server::StatusServer}; -use tikv_util::HandyRwLock; async fn check(authority: SocketAddr, region_id: u64) -> Result<(), Box> { let client = Client::new(); @@ -39,13 +38,12 @@ fn test_region_meta_endpoint() { let peer = region.get_peers().get(0); assert!(peer.is_some()); let store_id = peer.unwrap().get_store_id(); - let router = 
cluster.sim.rl().get_router(store_id); - assert!(router.is_some()); + let router = cluster.raft_extension(store_id); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), - router.unwrap(), + router, std::env::temp_dir(), ) .unwrap(); From a422de9d27d96d2bfc627f9e53f655bd9fd00b9f Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 23 Dec 2022 13:50:14 +0800 Subject: [PATCH 425/676] *: make gc not write db directly (#13982) ref tikv/tikv#12842 We rely on non-concurrent memtable write for dynamic regions to achieve best performance. This PR makes sure writes of compaction filter be redirected to apply thread when dynamic regions is enabled. The solution may miss data if TiKV crashes before writes are flushed to disk. Note even for v1, it's also possible to leave garbage if writes to rocksdb fail. We need to scan default CFs and check for orphan versions. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/apply.rs | 1 + components/raftstore-v2/src/fsm/peer.rs | 6 + .../raftstore-v2/src/operation/command/mod.rs | 26 ++++ .../src/operation/command/write/mod.rs | 33 ++++- .../src/router/internal_message.rs | 2 + components/raftstore-v2/src/router/message.rs | 14 ++ src/server/gc_worker/compaction_filter.rs | 128 +++++++++++++----- src/server/gc_worker/gc_worker.rs | 82 +++++++++-- .../gc_worker/rawkv_compaction_filter.rs | 13 +- src/server/raftkv2/mod.rs | 47 ++++--- tests/failpoints/cases/test_gc_worker.rs | 2 +- 11 files changed, 277 insertions(+), 77 deletions(-) diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 07a577e0c35..c0eabd2120e 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -113,6 +113,7 @@ impl ApplyFsm { // TODO: flush by buffer size. 
ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), + ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), } // TODO: yield after some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 1ef9e198130..4b22554e694 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -243,6 +243,12 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, write.ch, ); } + PeerMsg::UnsafeWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm + .peer_mut() + .on_unsafe_write(self.store_ctx, write.data); + } PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index fce01f19277..49040a20278 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -319,6 +319,32 @@ impl Apply { } impl Apply { + pub fn apply_unsafe_write(&mut self, data: Box<[u8]>) { + let decoder = match SimpleWriteReqDecoder::new(&self.logger, &data, u64::MAX, u64::MAX) { + Ok(decoder) => decoder, + Err(req) => unreachable!("unexpected request: {:?}", req), + }; + for req in decoder { + match req { + SimpleWrite::Put(put) => { + let _ = self.apply_put(put.cf, u64::MAX, put.key, put.value); + } + SimpleWrite::Delete(delete) => { + let _ = self.apply_delete(delete.cf, u64::MAX, delete.key); + } + SimpleWrite::DeleteRange(dr) => { + let _ = self.apply_delete_range( + dr.cf, + u64::MAX, + dr.start_key, + dr.end_key, + dr.notify_only, + ); + } + } + } + } + #[inline] pub async fn apply_committed_entries(&mut self, ce: 
CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 51beeee7dea..ad6e537b956 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -16,7 +16,7 @@ use crate::{ batch::StoreContext, operation::cf_offset, raft::{Apply, Peer}, - router::CmdResChannel, + router::{ApplyTask, CmdResChannel}, }; mod simple_write; @@ -71,6 +71,29 @@ impl Peer { self.simple_write_encoder_mut().replace(encoder); } + #[inline] + pub fn on_unsafe_write( + &mut self, + ctx: &mut StoreContext, + data: SimpleWriteBinary, + ) { + if !self.serving() { + return; + } + let bin = SimpleWriteReqEncoder::new( + Box::::default(), + data, + ctx.cfg.raft_entry_max_size.0 as usize, + false, + ) + .encode() + .0 + .into_boxed_slice(); + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::UnsafeWrite(bin)); + } + } + pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { let call_proposed_on_success = if encoder.notify_proposed() { @@ -140,7 +163,9 @@ impl Apply { "aborted by failpoint".into() ))); self.metrics.size_diff_hint += (self.key_buffer.len() + value.len()) as i64; - self.modifications_mut()[off] = index; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } Ok(()) } @@ -171,7 +196,9 @@ impl Apply { ); }); self.metrics.size_diff_hint -= self.key_buffer.len() as i64; - self.modifications_mut()[off] = index; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } Ok(()) } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 4c317a22abd..05e1baea1cf 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ 
b/components/raftstore-v2/src/router/internal_message.rs @@ -8,6 +8,8 @@ use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask} pub enum ApplyTask { CommittedEntries(CommittedEntries), Snapshot(GenSnapTask), + /// Writes that doesn't care consistency. + UnsafeWrite(Box<[u8]>), } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index faed3c0751d..a69f6b5ead6 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -119,6 +119,12 @@ pub struct SimpleWrite { pub ch: CmdResChannel, } +#[derive(Debug)] +pub struct UnsafeWrite { + pub send_time: Instant, + pub data: SimpleWriteBinary, +} + /// Message that can be sent to a peer. #[derive(Debug)] pub enum PeerMsg { @@ -132,6 +138,7 @@ pub enum PeerMsg { /// Command changes the inernal states. It will be transformed into logs and /// applied on all replicas. SimpleWrite(SimpleWrite), + UnsafeWrite(UnsafeWrite), /// Command that contains admin requests. AdminCommand(RaftRequest), /// Tick is periodical task. 
If target peer doesn't exist there is a @@ -206,6 +213,13 @@ impl PeerMsg { ) } + pub fn unsafe_write(data: SimpleWriteBinary) -> Self { + PeerMsg::UnsafeWrite(UnsafeWrite { + send_time: Instant::now(), + data, + }) + } + pub fn request_split( epoch: metapb::RegionEpoch, split_keys: Vec>, diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 4c494d6f01f..5d33346a844 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -20,9 +20,7 @@ use engine_rocks::{ }, RocksEngine, RocksMvccProperties, RocksWriteBatchVec, }; -use engine_traits::{ - KvEngine, MiscExt, Mutable, MvccProperties, WriteBatch, WriteBatchExt, WriteOptions, -}; +use engine_traits::{KvEngine, MiscExt, MvccProperties, WriteBatch, WriteOptions}; use file_system::{IoType, WithIoType}; use pd_client::{Feature, FeatureGate}; use prometheus::{local::*, *}; @@ -30,6 +28,7 @@ use raftstore::coprocessor::RegionInfoProvider; use tikv_util::{ time::Instant, worker::{ScheduleError, Scheduler}, + Either, }; use txn_types::{Key, TimeStamp, WriteRef, WriteType}; @@ -51,7 +50,7 @@ const COMPACTION_FILTER_GC_FEATURE: Feature = Feature::require(5, 0, 0); // these fields are not available when constructing // `WriteCompactionFilterFactory`. 
pub struct GcContext { - pub(crate) db: RocksEngine, + pub(crate) db: Option, pub(crate) store_id: u64, pub(crate) safe_point: Arc, pub(crate) cfg_tracker: GcWorkerConfigManager, @@ -154,7 +153,7 @@ where ); } -impl CompactionFilterInitializer for EK +impl CompactionFilterInitializer for Option where EK: KvEngine, { @@ -171,7 +170,7 @@ where } } -impl CompactionFilterInitializer for RocksEngine { +impl CompactionFilterInitializer for Option { fn init_compaction_filter( &self, store_id: u64, @@ -237,7 +236,10 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { "ratio_threshold" => ratio_threshold, ); - if db.is_stalled_or_stopped() { + if db + .as_ref() + .map_or(false, RocksEngine::is_stalled_or_stopped) + { debug!("skip gc in compaction filter because the DB is stalled"); return std::ptr::null_mut(); } @@ -277,13 +279,60 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { } } +pub struct DeleteBatch { + pub batch: Either>, +} + +impl DeleteBatch { + fn new(db: &Option) -> Self + where + EK: KvEngine, + { + Self { + batch: match db { + Some(db) => Either::Left(db.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE)), + None => Either::Right(Vec::with_capacity(64)), + }, + } + } + + // `key` has prefix `DATA_KEY`. 
+ fn delete(&mut self, key: &[u8], ts: TimeStamp) -> Result<(), String> { + match &mut self.batch { + Either::Left(batch) => { + let key = Key::from_encoded_slice(key).append_ts(ts); + batch.delete(key.as_encoded())?; + } + Either::Right(keys) => { + let key = Key::from_encoded_slice(keys::origin_key(key)).append_ts(ts); + keys.push(key); + } + } + Ok(()) + } + + fn is_empty(&self) -> bool { + match &self.batch { + Either::Left(batch) => batch.is_empty(), + Either::Right(keys) => keys.is_empty(), + } + } + + pub fn count(&self) -> usize { + match &self.batch { + Either::Left(batch) => batch.count(), + Either::Right(keys) => keys.len(), + } + } +} + struct WriteCompactionFilter { safe_point: u64, - engine: RocksEngine, + engine: Option, is_bottommost_level: bool, encountered_errors: bool, - write_batch: RocksWriteBatchVec, + write_batch: DeleteBatch, gc_scheduler: Scheduler>, // A key batch which is going to be sent to the GC worker. mvcc_deletions: Vec, @@ -312,7 +361,7 @@ struct WriteCompactionFilter { impl WriteCompactionFilter { fn new( - engine: RocksEngine, + engine: Option, safe_point: u64, context: &CompactionFilterContext, gc_scheduler: Scheduler>, @@ -322,7 +371,7 @@ impl WriteCompactionFilter { assert!(safe_point > 0); debug!("gc in compaction filter"; "safe_point" => safe_point); - let write_batch = engine.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE); + let write_batch = DeleteBatch::new(&engine); WriteCompactionFilter { safe_point, engine, @@ -469,9 +518,8 @@ impl WriteCompactionFilter { fn handle_filtered_write(&mut self, write: WriteRef<'_>) -> Result<(), String> { if write.short_value.is_none() && write.write_type == WriteType::Put { - let prefix = Key::from_encoded_slice(&self.mvcc_key_prefix); - let def_key = prefix.append_ts(write.start_ts).into_encoded(); - self.write_batch.delete(&def_key)?; + self.write_batch + .delete(&self.mvcc_key_prefix, write.start_ts)?; } Ok(()) } @@ -499,24 +547,40 @@ impl WriteCompactionFilter { } if 
self.write_batch.count() > DEFAULT_DELETE_BATCH_COUNT || force { - let mut wopts = WriteOptions::default(); - wopts.set_no_slowdown(true); - if let Err(e) = do_flush(&mut self.write_batch, &wopts) { - let wb = mem::replace( - &mut self.write_batch, - self.engine.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE), - ); - self.orphan_versions += wb.count(); - let id = ORPHAN_VERSIONS_ID.fetch_add(1, Ordering::Relaxed); - let task = GcTask::OrphanVersions { wb, id }; + let err = match &mut self.write_batch.batch { + Either::Left(wb) => { + let mut wopts = WriteOptions::default(); + wopts.set_no_slowdown(true); + match do_flush(wb, &wopts) { + Ok(()) => { + wb.clear(); + return Ok(()); + } + Err(e) => Some(e), + } + } + Either::Right(_) => None, + }; + + let wb = mem::replace(&mut self.write_batch, DeleteBatch::new(&self.engine)); + self.orphan_versions += wb.count(); + let id = ORPHAN_VERSIONS_ID.fetch_add(1, Ordering::Relaxed); + let region_info_provider = self.regions_provider.1.clone(); + let task = GcTask::OrphanVersions { + wb, + id, + region_info_provider, + }; + if let Some(e) = &err { warn!( - "compaction filter flush fail, dispatch to gc worker"; - "task" => %task, "err" => ?e, + "compaction filter flush fail, dispatch to gc worker"; + "task" => %task, "err" => ?e, ); - self.schedule_gc_task(task, true); - return Err(e); } - self.write_batch.clear(); + self.schedule_gc_task(task, true); + if let Some(err) = err { + return Err(err); + } } Ok(()) } @@ -607,7 +671,9 @@ impl Drop for WriteCompactionFilter { if let Err(e) = self.flush_pending_writes_if_need(true) { error!("compaction filter flush writes fail"; "err" => ?e); } - self.engine.sync_wal().unwrap(); + if let Some(engine) = &self.engine { + engine.sync_wal().unwrap(); + } self.switch_key_metrics(); self.flush_metrics(); @@ -831,7 +897,7 @@ pub mod test_utils { let mut gc_context_opt = GC_CONTEXT.lock().unwrap(); *gc_context_opt = Some(GcContext { - db: engine.clone(), + db: Some(engine.clone()), store_id: 
1, safe_point, cfg_tracker, diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 1ccac8860c6..106b36f61ad 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -38,7 +38,7 @@ use txn_types::{Key, TimeStamp}; use super::{ check_need_gc, compaction_filter::{ - CompactionFilterInitializer, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, + CompactionFilterInitializer, DeleteBatch, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, GC_COMPACTION_FILTER_MVCC_DELETION_WASTED, GC_COMPACTION_FILTER_ORPHAN_VERSIONS, }, config::{GcConfig, GcWorkerConfigManager}, @@ -118,7 +118,11 @@ where /// until `DefaultCompactionFilter` is introduced. /// /// The tracking issue: . - OrphanVersions { wb: E::WriteBatch, id: usize }, + OrphanVersions { + wb: DeleteBatch, + id: usize, + region_info_provider: Arc, + }, #[cfg(any(test, feature = "testexport"))] Validate(Box), } @@ -162,7 +166,7 @@ where .field("start_key", &format!("{}", start_key)) .field("end_key", &format!("{}", end_key)) .finish(), - GcTask::OrphanVersions { id, wb } => f + GcTask::OrphanVersions { id, wb, .. } => f .debug_struct("OrphanVersions") .field("id", id) .field("count", &wb.count()) @@ -871,6 +875,46 @@ impl GcRunner { tikv_kv::snapshot(&mut self.engine, snap_ctx).await })?) } + + fn flush_deletes(&mut self, deletes: Vec, provider: Arc) { + let mut region_modifies = HashMap::default(); + // Should not panic. 
+ let regions = match get_regions_for_range_of_keys(self.store_id, &deletes, provider) { + Ok(r) => r, + Err(e) => { + error!("failed to flush deletes, will leave garbage"; "err" => ?e); + return; + } + }; + if regions.is_empty() { + error!("no region is found, will leave garbage"); + return; + } + let mut keys = deletes.into_iter().peekable(); + let mut modifies = vec![]; + for region in ®ions { + let start_key = region.get_start_key(); + let end_key = region.get_end_key(); + while let Some(key) = keys.peek() { + if key.as_encoded().as_slice() < start_key { + error!("key is not in any region, will leave garbage"; "key" => %key); + keys.next(); + continue; + } + if !end_key.is_empty() && key.as_encoded().as_slice() >= end_key { + break; + } + modifies.push(Modify::Delete(CF_DEFAULT, keys.next().unwrap())); + } + if !modifies.is_empty() { + region_modifies.insert(region.id, modifies); + modifies = vec![]; + } + } + if let Err(e) = self.engine.modify_on_kv_engine(region_modifies) { + error!("failed to flush deletes, will leave garbage"; "err" => ?e); + } + } } impl Runnable for GcRunner { @@ -982,19 +1026,29 @@ impl Runnable for GcRunner { end_key ); } - GcTask::OrphanVersions { mut wb, id } => { - info!("handling GcTask::OrphanVersions"; "id" => id); - let mut wopts = WriteOptions::default(); - wopts.set_sync(true); - if let Err(e) = wb.write_opt(&wopts) { - error!("write GcTask::OrphanVersions fail"; "id" => id, "err" => ?e); - update_metrics(true); - return; + GcTask::OrphanVersions { + wb, + id, + region_info_provider, + } => { + let count = wb.count(); + match wb.batch { + Either::Left(mut wb) => { + info!("handling GcTask::OrphanVersions"; "id" => id); + let mut wopts = WriteOptions::default(); + wopts.set_sync(true); + if let Err(e) = wb.write_opt(&wopts) { + error!("write GcTask::OrphanVersions fail"; "id" => id, "err" => ?e); + update_metrics(true); + return; + } + info!("write GcTask::OrphanVersions success"; "id" => id); + } + Either::Right(deletes) => 
self.flush_deletes(deletes, region_info_provider), } - info!("write GcTask::OrphanVersions success"; "id" => id); GC_COMPACTION_FILTER_ORPHAN_VERSIONS .with_label_values(&[STAT_TXN_KEYMODE, "cleaned"]) - .inc_by(wb.count() as u64); + .inc_by(count as u64); update_metrics(false); } #[cfg(any(test, feature = "testexport"))] @@ -1144,7 +1198,7 @@ impl GcWorker { ); info!("initialize compaction filter to perform GC when necessary"); - self.engine.kv_engine().unwrap().init_compaction_filter( + self.engine.kv_engine().init_compaction_filter( cfg.self_store_id, safe_point.clone(), self.config_manager.clone(), diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index b1174d7d4f3..5e3913f4d40 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -48,7 +48,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { }; //---------------- GC context END -------------- - let db = gc_context.db.clone(); let gc_scheduler = gc_context.gc_scheduler.clone(); let store_id = gc_context.store_id; let region_info_provider = gc_context.region_info_provider.clone(); @@ -71,7 +70,11 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { "ratio_threshold" => ratio_threshold, ); - if db.is_stalled_or_stopped() { + if gc_context + .db + .as_ref() + .map_or(false, RocksEngine::is_stalled_or_stopped) + { debug!("skip gc in compaction filter because the DB is stalled"); return std::ptr::null_mut(); } @@ -91,7 +94,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { } let filter = RawCompactionFilter::new( - db, safe_point, gc_scheduler, current, @@ -105,7 +107,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { struct RawCompactionFilter { safe_point: u64, - engine: RocksEngine, is_bottommost_level: bool, gc_scheduler: Scheduler>, current_ts: u64, @@ -135,8 +136,6 @@ impl Drop for RawCompactionFilter { fn drop(&mut self) { 
self.raw_gc_mvcc_deletions(); - self.engine.sync_wal().unwrap(); - self.switch_key_metrics(); self.flush_metrics(); } @@ -172,7 +171,6 @@ impl CompactionFilter for RawCompactionFilter { impl RawCompactionFilter { fn new( - engine: RocksEngine, safe_point: u64, gc_scheduler: Scheduler>, ts: u64, @@ -184,7 +182,6 @@ impl RawCompactionFilter { debug!("gc in compaction filter"; "safe_point" => safe_point); RawCompactionFilter { safe_point, - engine, is_bottommost_level: context.is_bottommost_level(), gc_scheduler, current_ts: ts, diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index f850cc74d19..526a1fab3ca 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -20,7 +20,7 @@ use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, }, - SimpleWriteEncoder, + SimpleWriteBinary, SimpleWriteEncoder, }; use tikv_kv::{Modify, WriteEvent}; use tikv_util::{codec::number::NumberEncoder, time::Instant}; @@ -67,6 +67,26 @@ impl Stream for Transform { } } +fn modifies_to_simple_write(modifies: Vec) -> SimpleWriteBinary { + let mut encoder = SimpleWriteEncoder::with_capacity(128); + for m in modifies { + match m { + Modify::Put(cf, k, v) => encoder.put(cf, k.as_encoded(), &v), + Modify::Delete(cf, k) => encoder.delete(cf, k.as_encoded()), + Modify::PessimisticLock(k, lock) => { + encoder.put(CF_LOCK, k.as_encoded(), &lock.into_lock().to_bytes()) + } + Modify::DeleteRange(cf, start_key, end_key, notify_only) => encoder.delete_range( + cf, + start_key.as_encoded(), + end_key.as_encoded(), + notify_only, + ), + } + } + encoder.encode() +} + #[derive(Clone)] pub struct RaftKv2 { router: RaftRouter, @@ -109,9 +129,12 @@ impl tikv_kv::Engine for RaftKv2 { fn modify_on_kv_engine( &self, - _region_modifies: collections::HashMap>, + region_modifies: collections::HashMap>, ) -> tikv_kv::Result<()> { - // TODO + for (region_id, batch) in region_modifies { + let bin = 
modifies_to_simple_write(batch); + let _ = self.router.send(region_id, PeerMsg::unsafe_write(bin)); + } Ok(()) } @@ -202,23 +225,7 @@ impl tikv_kv::Engine for RaftKv2 { header.set_flags(flags); self.schedule_txn_extra(batch.extra); - let mut encoder = SimpleWriteEncoder::with_capacity(128); - for m in batch.modifies { - match m { - Modify::Put(cf, k, v) => encoder.put(cf, k.as_encoded(), &v), - Modify::Delete(cf, k) => encoder.delete(cf, k.as_encoded()), - Modify::PessimisticLock(k, lock) => { - encoder.put(CF_LOCK, k.as_encoded(), &lock.into_lock().to_bytes()) - } - Modify::DeleteRange(cf, start_key, end_key, notify_only) => encoder.delete_range( - cf, - start_key.as_encoded(), - end_key.as_encoded(), - notify_only, - ), - } - } - let data = encoder.encode(); + let data = modifies_to_simple_write(batch.modifies); let mut builder = CmdResChannelBuilder::default(); if WriteEvent::subscribed_proposed(subscribed) { builder.subscribe_proposed(); diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 3dbb7ffc7b0..d24ec85f040 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -6,7 +6,7 @@ use std::{ time::Duration, }; -use engine_traits::{Peekable, WriteBatch}; +use engine_traits::Peekable; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, metapb::Region, tikvpb::TikvClient}; From 8864e9bac814554bbe4acaddd01282f5fdae3dae Mon Sep 17 00:00:00 2001 From: Jay Date: Sat, 24 Dec 2022 13:02:14 +0800 Subject: [PATCH 426/676] v2: fix several panics (#13986) ref tikv/tikv#12842 Perf context is disabled for now as we don't have shared kv engine. And fix region info access panic by filter out uninitialized role change. There are also several other fixes. 
Signed-off-by: Jay Lee --- components/cdc/src/observer.rs | 2 + components/engine_rocks/src/rocks_metrics.rs | 19 ++--- components/raftstore-v2/src/fsm/peer.rs | 1 + .../raftstore-v2/src/operation/ready/mod.rs | 1 + components/raftstore-v2/src/raft/storage.rs | 6 +- .../tests/integrations/cluster.rs | 3 +- components/raftstore/src/coprocessor/mod.rs | 3 + .../src/coprocessor/region_info_accessor.rs | 78 ++++++++++++++++--- components/raftstore/src/store/peer.rs | 1 + components/raftstore/src/store/snap.rs | 15 ++-- components/server/src/server2.rs | 5 +- src/config/mod.rs | 4 + src/coprocessor/tracker.rs | 34 +++++--- src/server/service/kv.rs | 7 +- src/server/tablet_snap.rs | 4 +- src/storage/metrics.rs | 26 ++++--- 16 files changed, 143 insertions(+), 66 deletions(-) diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 7c33d21aadd..696bc6341ee 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -272,6 +272,7 @@ mod tests { leader_id: 2, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -299,6 +300,7 @@ mod tests { leader_id: raft::INVALID_ID, prev_lead_transferee: 3, vote: 3, + initialized: true, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index d77f5f2dc99..24ac9eee0b4 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -946,7 +946,7 @@ struct DbStats { num_snapshots: Option, oldest_snapshot_time: Option, block_cache_size: Option, - stall_num: Vec>, + stall_num: Option<[u64; ROCKSDB_IOSTALL_KEY.len()]>, } pub struct RocksStatisticsReporter { @@ -966,8 +966,6 @@ impl StatisticsReporter for RocksStatisticsReporter { fn collect(&mut self, engine: &RocksEngine) { let db = engine.as_inner(); - let 
stall_num = ROCKSDB_IOSTALL_KEY.len(); - self.db_stats.stall_num.resize(stall_num, None); for cf in db.cf_names() { let cf_stats = self.cf_stats.entry(cf.to_owned()).or_default(); let handle = crate::util::get_cf_handle(db, cf).unwrap(); @@ -1074,9 +1072,9 @@ impl StatisticsReporter for RocksStatisticsReporter { } if let Some(info) = db.get_map_property_cf(handle, ROCKSDB_CFSTATS) { - for i in 0..stall_num { - *self.db_stats.stall_num[i].get_or_insert_default() += - info.get_property_int_value(ROCKSDB_IOSTALL_KEY[i]); + let stall_num = self.db_stats.stall_num.get_or_insert_default(); + for (key, val) in ROCKSDB_IOSTALL_KEY.iter().zip(stall_num) { + *val += info.get_property_int_value(key); } } } @@ -1228,12 +1226,11 @@ impl StatisticsReporter for RocksStatisticsReporter { .with_label_values(&[&self.name, "all"]) .set(v as i64); } - let stall_num = ROCKSDB_IOSTALL_KEY.len(); - for i in 0..stall_num { - if let Some(v) = self.db_stats.stall_num[i] { + if let Some(stall_num) = &self.db_stats.stall_num { + for (ty, val) in ROCKSDB_IOSTALL_TYPE.iter().zip(stall_num) { STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC - .with_label_values(&[&self.name, ROCKSDB_IOSTALL_TYPE[i]]) - .set(v as i64); + .with_label_values(&[&self.name, ty]) + .set(*val as i64); } } } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 4b22554e694..734c2bf93d4 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -188,6 +188,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn on_start(&mut self) { self.schedule_tick(PeerTick::Raft); + self.schedule_tick(PeerTick::SplitRegionCheck); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index fcab8728916..854fd965d9e 100644 --- 
a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -533,6 +533,7 @@ impl Peer { leader_id: ss.leader_id, prev_lead_transferee: target, vote: self.raft_group().raft.vote, + initialized: self.storage().is_initialized(), }, ); self.proposal_control_mut().maybe_update_term(term); diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index de58d39cce5..bce313eab83 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -348,8 +348,7 @@ mod tests { fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); - mgr.init().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); let raft_engine = engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) .unwrap(); @@ -402,8 +401,7 @@ mod tests { write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); - mgr.init().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); // building a tablet factory let ops = DbOptions::default(); let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index ca166eab950..064fd9d1cad 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -278,8 +278,7 @@ impl RunningState { let router = RaftRouter::new(store_id, registry.clone(), router); let store_meta = router.store_meta().clone(); - let snap_mgr = 
TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); - snap_mgr.init().unwrap(); + let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); let coprocessor_host = CoprocessorHost::new( router.store_router().clone(), diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 022a44de463..5100e9d4632 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -268,15 +268,18 @@ pub struct RoleChange { pub prev_lead_transferee: u64, /// Which peer is voted by itself. pub vote: u64, + pub initialized: bool, } impl RoleChange { + #[cfg(feature = "testexport")] pub fn new(state: StateRole) -> Self { RoleChange { state, leader_id: raft::INVALID_ID, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, } } } diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index 338cf3962c4..37403310baf 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -46,11 +46,26 @@ use super::{ /// `RaftStoreEvent` Represents events dispatched from raftstore coprocessor. 
#[derive(Debug)] pub enum RaftStoreEvent { - CreateRegion { region: Region, role: StateRole }, - UpdateRegion { region: Region, role: StateRole }, - DestroyRegion { region: Region }, - RoleChange { region: Region, role: StateRole }, - UpdateRegionBuckets { region: Region, buckets: usize }, + CreateRegion { + region: Region, + role: StateRole, + }, + UpdateRegion { + region: Region, + role: StateRole, + }, + DestroyRegion { + region: Region, + }, + RoleChange { + region: Region, + role: StateRole, + initialized: bool, + }, + UpdateRegionBuckets { + region: Region, + buckets: usize, + }, } impl RaftStoreEvent { @@ -191,7 +206,11 @@ impl RoleObserver for RegionEventListener { fn on_role_change(&self, context: &mut ObserverContext<'_>, role_change: &RoleChange) { let region = context.region().clone(); let role = role_change.state; - let event = RaftStoreEvent::RoleChange { region, role }; + let event = RaftStoreEvent::RoleChange { + region, + role, + initialized: role_change.initialized, + }; self.scheduler .schedule(RegionInfoQuery::RaftStoreEvent(event)) .unwrap(); @@ -426,7 +445,10 @@ impl RegionCollector { // They are impossible to equal, or they cannot overlap. assert_ne!( region.get_region_epoch().get_version(), - current_region.get_region_epoch().get_version() + current_region.get_region_epoch().get_version(), + "{:?} vs {:?}", + region, + current_region, ); // Remove it since it's a out-of-date region info. if clear_regions_in_range { @@ -492,6 +514,10 @@ impl RegionCollector { // epoch is properly set and an Update message was sent. return; } + if let RaftStoreEvent::RoleChange { initialized, .. } = &event && !initialized { + // Ignore uninitialized peers. 
+ return; + } if !self.check_region_range(region, true) { debug!( "Received stale event"; @@ -511,7 +537,7 @@ impl RegionCollector { RaftStoreEvent::DestroyRegion { region } => { self.handle_destroy_region(region); } - RaftStoreEvent::RoleChange { region, role } => { + RaftStoreEvent::RoleChange { region, role, .. } => { self.handle_role_change(region, role); } RaftStoreEvent::UpdateRegionBuckets { region, buckets } => { @@ -988,10 +1014,16 @@ mod tests { } } - fn must_change_role(c: &mut RegionCollector, region: &Region, role: StateRole) { + fn must_change_role( + c: &mut RegionCollector, + region: &Region, + role: StateRole, + initialized: bool, + ) { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: region.clone(), role, + initialized, }); if let Some(r) = c.regions.get(®ion.get_id()) { @@ -1037,6 +1069,12 @@ mod tests { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: new_region(1, b"k1", b"k2", 0), role: StateRole::Leader, + initialized: true, + }); + c.handle_raftstore_event(RaftStoreEvent::RoleChange { + region: new_region(1, b"", b"", 3), + role: StateRole::Leader, + initialized: false, }); check_collection(&c, &[]); @@ -1198,9 +1236,15 @@ mod tests { &mut c, &new_region(1, b"k0", b"k1", 2), StateRole::Candidate, + true, ); must_create_region(&mut c, &new_region(5, b"k99", b"", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k2", b"k8", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k2", b"k8", 2), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k3", b"k7", 3), StateRole::Leader); // test region buckets update must_update_region_buckets(&mut c, &new_region(2, b"k3", b"k7", 3), 4); @@ -1343,7 +1387,12 @@ mod tests { // which haven't been handled. 
must_create_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); must_update_region(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k1", b"k9", 1), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k1", b"k5", 2), StateRole::Leader); // TODO: In fact, region 2's role should be follower. However because it's // previous state was removed while creating updating region 4, it can't be @@ -1364,7 +1413,12 @@ mod tests { // handled. must_update_region(&mut c, &new_region(2, b"k1", b"k9", 3), StateRole::Leader); must_update_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(4, b"k5", b"k9", 2), + StateRole::Leader, + true, + ); must_destroy_region(&mut c, new_region(4, b"k5", b"k9", 2)); check_collection( &c, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 86d16b07506..a72bb59d8bf 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2282,6 +2282,7 @@ where leader_id: ss.leader_id, prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, + initialized: self.is_initialized(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 04aef985e3b..05decd62815 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1932,20 +1932,15 @@ impl Display for TabletSnapKey { #[derive(Clone)] pub struct TabletSnapManager { // directory to store snapfile. 
- base: String, + base: PathBuf, } impl TabletSnapManager { - pub fn new>(path: T) -> Self { - Self { base: path.into() } - } - - pub fn init(&self) -> io::Result<()> { + pub fn new>(path: T) -> io::Result { // Initialize the directory if it doesn't exist. - let path = Path::new(&self.base); + let path = path.into(); if !path.exists() { - file_system::create_dir_all(path)?; - return Ok(()); + file_system::create_dir_all(&path)?; } if !path.is_dir() { return Err(io::Error::new( @@ -1953,7 +1948,7 @@ impl TabletSnapManager { format!("{} should be a directory", path.display()), )); } - Ok(()) + Ok(Self { base: path }) } pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index cfda8feb233..620a6b20b74 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -747,7 +747,10 @@ where .unwrap() .to_owned(); - let snap_mgr = TabletSnapManager::new(snap_path); + let snap_mgr = match TabletSnapManager::new(&snap_path) { + Ok(mgr) => mgr, + Err(e) => fatal!("failed to create snapshot manager at {}: {}", snap_path, e), + }; // Create coprocessor endpoint. 
let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { diff --git a/src/config/mod.rs b/src/config/mod.rs index a9cfdb93505..808dd22299c 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2988,6 +2988,10 @@ impl TikvConfig { .to_owned(); } + if self.storage.engine == EngineType::RaftKv2 { + self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); + } + self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; self.raft_engine.config.dir = self.infer_raft_engine_path(None)?; diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 24290701457..d6e146adf11 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -147,7 +147,11 @@ impl Tracker { _ => unreachable!(), } - self.with_perf_context(|perf_context| perf_context.start_observe()); + self.with_perf_context(|perf_context| { + if let Some(c) = perf_context { + c.start_observe(); + } + }); self.current_stage = TrackerState::ItemBegan(now); } @@ -160,7 +164,9 @@ impl Tracker { self.total_storage_stats.add(&storage_stats); } self.with_perf_context(|perf_context| { - perf_context.report_metrics(&[get_tls_tracker_token()]) + if let Some(c) = perf_context { + c.report_metrics(&[get_tls_tracker_token()]); + } }); self.current_stage = TrackerState::ItemFinished(now); } else { @@ -355,7 +361,7 @@ impl Tracker { fn with_perf_context(&self, f: F) -> T where - F: FnOnce(&mut Box) -> T, + F: FnOnce(&mut Option>) -> T, { thread_local! 
{ static SELECT: RefCell>> = RefCell::new(None); @@ -379,15 +385,19 @@ impl Tracker { }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - let perf_context = c.get_or_insert_with(|| unsafe { - with_tls_engine::(|engine| { - Box::new(engine.kv_engine().unwrap().get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), - )) - }) - }); - f(perf_context) + if c.is_none() { + *c = unsafe { + with_tls_engine::(|engine| { + engine.kv_engine().map(|engine| { + Box::new(engine.get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) as Box + }) + }) + }; + } + f(&mut c) }) } } diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 68a200b045e..66fc5060e68 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -945,9 +945,12 @@ impl Tikv for Service { ) { let key_range = request.take_key_range(); let (cb, resp) = paired_future_callback(); - let check_leader_scheduler = self.check_leader_scheduler.clone(); + let check_leader_scheduler = match self.check_leader_scheduler.clone() { + Some(s) => s, + // Avoid print errors if it's not supported. 
+ None => return, + }; let task = async move { - let Some(check_leader_scheduler) = check_leader_scheduler else { return Err(box_err!("check leader is not supported")) }; check_leader_scheduler .schedule(CheckLeaderTask::GetStoreTs { key_range, cb }) .map_err(|e| Error::Other(format!("{}", e).into()))?; diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 5dd83deb092..b5d989d5370 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -493,7 +493,7 @@ mod tests { msg.mut_message().mut_snapshot().mut_metadata().set_term(1); let send_path = TempDir::new().unwrap(); let send_snap_mgr = - TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()); + TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()).unwrap(); let snap_path = send_snap_mgr.tablet_gen_path(&snap_key); create_dir_all(snap_path.as_path()).unwrap(); // send file should skip directory @@ -512,7 +512,7 @@ mod tests { let recv_path = TempDir::new().unwrap(); let recv_snap_manager = - TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()); + TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()).unwrap(); let (tx, rx) = mpsc::unbounded(); let sink = tx.sink_map_err(Error::from); block_on(send_snap_files( diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e84a7dfb4e9..080ff2c5951 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -347,17 +347,23 @@ where }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - let perf_context = c.get_or_insert_with(|| { - with_tls_engine(|engine: &mut E| { - Box::new(engine.kv_engine().unwrap().get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) - }) - }); - perf_context.start_observe(); + if c.is_none() { + *c = with_tls_engine(|engine: &mut E| { + engine.kv_engine().map(|c| { + Box::new(c.get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) as 
Box + }) + }); + }; + if let Some(c) = &mut *c { + c.start_observe(); + } let res = f(); - perf_context.report_metrics(&[get_tls_tracker_token()]); + if let Some(c) = &mut *c { + c.report_metrics(&[get_tls_tracker_token()]); + } res }) } From 8ec3cea85259f4eb91db5a49ca2ab6631ffbd6fe Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 27 Dec 2022 10:00:16 +0800 Subject: [PATCH 427/676] raftstore-v2: fix several errors (#13988) ref tikv/tikv#12842 This PR fixes several issues: - raftstore CPU usage missing in grafana - restart fails because incorrect path asserts - restart fails because of missing flush records - get snapshots fails occasionally because of lease not renew Signed-off-by: Jay Lee --- .../raftstore-v2/src/operation/query/local.rs | 90 ++++++++++++------- .../raftstore-v2/src/operation/query/mod.rs | 2 + .../src/operation/ready/snapshot.rs | 10 ++- .../tests/integrations/test_split.rs | 22 ++++- components/raftstore/src/store/fsm/peer.rs | 2 + components/raftstore/src/store/mod.rs | 8 +- components/raftstore/src/store/region_meta.rs | 11 ++- metrics/grafana/performance_write.json | 2 +- metrics/grafana/tikv_details.json | 2 +- metrics/grafana/tikv_summary.json | 2 +- metrics/grafana/tikv_trouble_shooting.json | 2 +- src/config/mod.rs | 8 +- 12 files changed, 115 insertions(+), 46 deletions(-) diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 812cf2354fa..482de719308 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -17,8 +17,11 @@ use kvproto::{ use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, util::LeaseState, LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, - ReadExecutorProvider, RegionSnapshot, RequestPolicy, TLS_LOCAL_READ_METRICS, + cmd_resp, + util::LeaseState, + worker_metrics::{self, TLS_LOCAL_READ_METRICS}, + LocalReadContext, LocalReaderCore, ReadDelegate, 
ReadExecutor, ReadExecutorProvider, + RegionSnapshot, RequestPolicy, }, Error, Result, }; @@ -94,6 +97,8 @@ where Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. Ok(_) => Ok(None), Err(e) => Err(e), } @@ -179,33 +184,46 @@ where Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), }; + worker_metrics::maybe_tls_local_read_metrics_flush(); + async move { match res { - Either::Left(Ok(Some(snap))) => return Ok(snap), - Either::Left(Err(e)) => return Err(e), + Either::Left(Ok(Some(snap))) => Ok(snap), + Either::Left(Err(e)) => Err(e), Either::Right((fut, mut reader)) => { - if let Some(query_res) = fut.await? - && query_res.read().is_some() - { - // If query successful, try again. - req.mut_header().set_read_quorum(false); - if let Some(snap) = reader.try_get_snapshot(&req)? { - return Ok(snap); + let err = match fut.await? { + Some(query_res) => { + if query_res.read().is_some() { + // If query successful, try again. + req.mut_header().set_read_quorum(false); + if let Some(snap) = reader.try_get_snapshot(&req)? 
{ + return Ok(snap); + } else { + let mut err = errorpb::Error::default(); + err.set_message(format!("no delegate found for {}", region_id)); + err + } + } else { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); } - } + } + None => { + let mut err = errorpb::Error::default(); + err.set_message(format!( + "failed to extend lease: canceled: {}", + region_id + )); + err + } + }; + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + Err(resp) } Either::Left(Ok(None)) => unreachable!(), } - - let mut err = errorpb::Error::default(); - err.set_message(format!( - "Fail to get snapshot from LocalReader for region {}. \ - Maybe due to `not leader`, `region not found` or `not applied to the current term`", - region_id - )); - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) } } @@ -216,7 +234,12 @@ where region_id: u64, req: &RaftCmdRequest, ) -> impl Future, RaftCmdResponse>> { - let (msg, sub) = PeerMsg::raft_query(req.clone()); + let mut req = req.clone(); + // Remote lease is updated step by step. It's possible local reader expires + // while the raftstore doesn't. So we need to trigger an update + // explicitly. TODO: find a way to reduce the triggered heartbeats. 
+ req.mut_header().set_read_quorum(true); + let (msg, sub) = PeerMsg::raft_query(req); let res = match MsgRouter::send(&self.router, region_id, msg) { Ok(()) => Ok(sub), Err(TrySendError::Full(_)) => { @@ -471,8 +494,8 @@ mod tests { use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use pd_client::BucketMeta; use raftstore::store::{ - util::Lease, ReadCallback, ReadProgress, RegionReadProgress, TrackVer, TxnExt, - TLS_LOCAL_READ_METRICS, + util::Lease, worker_metrics::TLS_LOCAL_READ_METRICS, ReadCallback, ReadProgress, + RegionReadProgress, TrackVer, TxnExt, }; use slog::o; use tempfile::Builder; @@ -556,13 +579,16 @@ mod tests { match msg { // send the result back to local reader - PeerMsg::RaftQuery(query) => ReadCallback::set_result( - query.ch, - QueryResult::Read(ReadResponse { - read_index: 0, - txn_extra_op: Default::default(), - }), - ), + PeerMsg::RaftQuery(query) => { + assert!(query.request.get_header().get_read_quorum()); + ReadCallback::set_result( + query.ch, + QueryResult::Read(ReadResponse { + read_index: 0, + txn_extra_op: Default::default(), + }), + ) + } _ => unreachable!(), } ch_tx.send(rx).unwrap(); diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index eb58dcbbc23..4ffb4bcdcec 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -380,6 +380,8 @@ impl Peer { entry_storage.apply_state(), GroupState::Ordered, self.raft_group().status(), + self.raft_group().raft.raft_log.last_index(), + self.raft_group().raft.raft_log.persisted, ); // V2 doesn't persist commit index and term, fill them with in-memory values. 
meta.raft_apply.commit_index = cmp::min( diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 8598d1cc41d..3208ecb25ae 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -27,7 +27,7 @@ use std::{ }, }; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, CF_RAFT}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, ALL_CFS}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; use raft::{eraftpb::Snapshot, StateRole}; @@ -442,8 +442,12 @@ impl Storage { .unwrap(); lb.put_region_state(region_id, last_index, self.region_state()) .unwrap(); - lb.put_flushed_index(region_id, CF_RAFT, last_index, last_index) - .unwrap(); + // We assume there should be flush records in all CFs. Skip any CF here may + // break the constraint. + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, last_index, last_index) + .unwrap(); + } let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 3b315a2d943..1174a428011 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -2,7 +2,7 @@ use std::{thread, time::Duration}; -use engine_traits::{RaftEngineReadOnly, CF_DEFAULT, CF_RAFT}; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT, CF_RAFT}; use futures::executor::block_on; use kvproto::{ metapb, pdpb, @@ -257,6 +257,26 @@ fn test_split() { actual_split_key.as_encoded(), false, ); + + // Split should survive restart. 
+ drop(raft_engine); + cluster.restart(0); + let region_and_key = vec![ + (2, b"k00"), + (1000, b"k22"), + (1001, b"k11"), + (1002, b"k33"), + (1003, b"k55"), + ]; + for (region_id, key) in region_and_key { + let snapshot = cluster.routers[0].stale_snapshot(region_id); + assert!( + snapshot.get_value(key).unwrap().is_some(), + "{} {:?}", + region_id, + key + ); + } } // TODO: test split race with diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index bad3ac2077d..225126f0edb 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -1122,6 +1122,8 @@ where store.apply_state(), self.fsm.hibernate_state.group_state(), peer.raft_group.status(), + peer.raft_group.raft.raft_log.last_index(), + peer.raft_group.raft.raft_log.persisted, )) } CasualMessage::QueryRegionLeaderResp { region, leader } => { diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 0846e8362b3..62561c63cbc 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -74,10 +74,10 @@ pub use self::{ txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - metrics::TLS_LOCAL_READ_METRICS, AutoSplitController, Bucket, BucketRange, - CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, - KeyEntry, LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, - ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, + CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, + ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, 
SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, }, diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 7de687e9dbb..4d44673e057 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -93,6 +93,8 @@ pub struct RaftStatus { pub applied: u64, pub voters: HashMap, pub learners: HashMap, + pub last_index: u64, + pub persisted_index: u64, } impl<'a> From> for RaftStatus { @@ -126,6 +128,8 @@ impl<'a> From> for RaftStatus { applied, voters, learners, + last_index: 0, + persisted_index: 0, } } } @@ -250,6 +254,8 @@ impl RegionMeta { apply_state: &raft_serverpb::RaftApplyState, group_state: GroupState, raft_status: Status<'_>, + last_index: u64, + persisted_index: u64, ) -> Self { let region = local_state.get_region(); let epoch = region.get_region_epoch(); @@ -270,10 +276,13 @@ impl RegionMeta { } else { None }; + let mut raft_status: RaftStatus = raft_status.into(); + raft_status.last_index = last_index; + raft_status.persisted_index = persisted_index; Self { group_state, - raft_status: raft_status.into(), + raft_status, raft_apply: RaftApplyState { applied_index: apply_state.get_applied_index(), commit_index: apply_state.get_commit_index(), diff --git a/metrics/grafana/performance_write.json b/metrics/grafana/performance_write.json index c289d979dc8..ddb9621b97a 100644 --- a/metrics/grafana/performance_write.json +++ b/metrics/grafana/performance_write.json @@ -3029,7 +3029,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", 
"intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 0c2116818dc..cff4b5f7742 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -5816,7 +5816,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", diff --git a/metrics/grafana/tikv_summary.json b/metrics/grafana/tikv_summary.json index b19478464a2..847ac5ef289 100644 --- a/metrics/grafana/tikv_summary.json +++ b/metrics/grafana/tikv_summary.json @@ -3109,7 +3109,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", diff --git a/metrics/grafana/tikv_trouble_shooting.json b/metrics/grafana/tikv_trouble_shooting.json index 735c1f305f7..bf1fd5baacf 100644 --- a/metrics/grafana/tikv_trouble_shooting.json +++ b/metrics/grafana/tikv_trouble_shooting.json @@ -1326,7 +1326,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", 
name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", diff --git a/src/config/mod.rs b/src/config/mod.rs index 808dd22299c..2074c992519 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3018,12 +3018,18 @@ impl TikvConfig { return Err("raftdb.wal_dir can't be same as rocksdb.wal_dir".into()); } + let kv_data_exists = if self.storage.engine == EngineType::RaftKv { + RocksEngine::exists(&kv_db_path) + } else { + Path::new(&self.storage.data_dir).join("tablets").exists() + }; + RaftDataStateMachine::new( &self.storage.data_dir, &self.raft_store.raftdb_path, &self.raft_engine.config.dir, ) - .validate(RocksEngine::exists(&kv_db_path))?; + .validate(kv_data_exists)?; // Check blob file dir is empty when titan is disabled if !self.rocksdb.titan.enabled { From 5806cd134335f0f29ce7a0acfe21bb06d7b6bbc1 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 27 Dec 2022 10:46:19 +0800 Subject: [PATCH 428/676] support check leader (#13987) ref tikv/tikv#12842 This PR makes check leader works by introducing a trait `ExternRegionInfo`. 
Signed-off-by: Jay Lee --- components/raftstore-v2/src/batch/store.rs | 3 +- components/raftstore-v2/src/fsm/store.rs | 98 ++++++++++++++++++- .../src/operation/command/admin/split.rs | 1 + components/raftstore-v2/src/operation/life.rs | 4 + .../raftstore-v2/src/operation/query/local.rs | 12 +-- .../src/operation/ready/snapshot.rs | 1 + components/raftstore-v2/src/router/imp.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 36 +++++++ .../src/store/worker/check_leader.rs | 52 ++++------ components/raftstore/src/store/worker/read.rs | 10 +- components/server/src/server.rs | 2 +- components/server/src/server2.rs | 21 +++- components/test_raftstore/src/server.rs | 2 +- src/server/raftkv2/node.rs | 4 - src/server/server.rs | 4 +- src/server/service/kv.rs | 11 +-- 16 files changed, 192 insertions(+), 73 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 0d5f984107c..bcfa6ca0771 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -276,7 +276,7 @@ impl StorePollerBuilder { fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); - let meta = self.store_meta.lock().unwrap(); + let mut meta = self.store_meta.lock().unwrap(); self.engine .for_each_raft_group::(&mut |region_id| { assert_ne!(region_id, INVALID_ID); @@ -298,6 +298,7 @@ impl StorePollerBuilder { StateRole::Follower, ); } + meta.set_region(storage.region(), storage.is_initialized(), &self.logger); let (sender, peer_fsm) = PeerFsm::new(&cfg, &self.tablet_registry, storage)?; meta.region_read_progress diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index bd31de69496..cb7aa99b179 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,12 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::{Duration, SystemTime}; +use std::{ + collections::BTreeMap, + ops::Bound::{Excluded, Unbounded}, + time::{Duration, SystemTime}, +}; use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; -use raftstore::store::{Config, ReadDelegate, RegionReadProgressRegistry}; +use keys::{data_end_key, data_key}; +use kvproto::metapb::Region; +use raftstore::store::{ + fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, +}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -19,13 +27,95 @@ use crate::{ router::{StoreMsg, StoreTick}, }; -#[derive(Default)] pub struct StoreMeta { - pub store_id: Option, + pub store_id: u64, /// region_id -> reader pub readers: HashMap, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, + /// (region_end_key, epoch.version) -> region_id + /// + /// Unlinke v1, ranges in v2 may be overlapped. So we use version + /// to avoid end key conflict. + pub(crate) region_ranges: BTreeMap<(Vec, u64), u64>, + /// region_id -> (region, initialized) + pub(crate) regions: HashMap, +} + +impl StoreMeta { + pub fn new(store_id: u64) -> StoreMeta { + StoreMeta { + store_id, + readers: HashMap::default(), + region_read_progress: RegionReadProgressRegistry::default(), + region_ranges: BTreeMap::default(), + regions: HashMap::default(), + } + } + + pub fn set_region(&mut self, region: &Region, initialized: bool, logger: &Logger) { + let region_id = region.get_id(); + let version = region.get_region_epoch().get_version(); + let prev = self + .regions + .insert(region_id, (region.clone(), initialized)); + // `prev` only makes sense when it's initialized. 
+ if let Some((prev, prev_init)) = prev && prev_init { + assert!(initialized, "{:?} region corrupted", logger.list()); + if prev.get_region_epoch().get_version() != version { + let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); + assert_eq!(prev_id, Some(region_id), "{:?} region corrupted", logger.list()); + } else { + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{:?} region corrupted", logger.list()); + return; + } + } + if initialized { + assert!( + self.region_ranges + .insert((data_end_key(region.get_end_key()), version), region_id) + .is_none(), + "{:?} region corrupted", + logger.list() + ); + } + } +} + +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } + + #[inline] + fn search_region( + &self, + start_key: &[u8], + end_key: &[u8], + mut visitor: impl FnMut(&kvproto::metapb::Region), + ) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded((start_key, 0)), Unbounded::<(Vec, u64)>)) + { + let (region, initialized) = &self.regions[id]; + if !initialized { + continue; + } + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } + } } pub struct Store { diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 64388333fee..391f0253439 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -414,6 +414,7 @@ impl Peer { { let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(derived, true, &self.logger); let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, 
diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 59e0e532faa..d61f11e7ada 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -237,6 +237,10 @@ impl Store { return; } }; + ctx.store_meta + .lock() + .unwrap() + .set_region(fsm.peer().region(), false, fsm.logger()); let mailbox = BasicMailbox::new(tx, fsm, ctx.router.state_cnt().clone()); if ctx .router diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 482de719308..2cb5497d789 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -81,7 +81,7 @@ where } pub fn store_meta(&self) -> &Arc> { - self.local_reader.store_meta() + &self.local_reader.store_meta().store_meta } pub fn pre_propose_raft_command( @@ -376,7 +376,7 @@ where type StoreMeta = Arc>; fn store_id(&self) -> Option { - self.store_meta.as_ref().lock().unwrap().store_id + Some(self.store_meta.as_ref().lock().unwrap().store_id) } /// get the ReadDelegate with region_id and the number of delegates in the @@ -397,10 +397,6 @@ where } (meta.readers.len(), None) } - - fn store_meta(&self) -> &Self::StoreMeta { - &self.store_meta - } } struct SnapRequestInspector<'r> { @@ -610,7 +606,7 @@ mod tests { let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = Arc::new(Mutex::new(StoreMeta::default())); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -811,7 +807,7 @@ mod tests { let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::default())), 
reg.clone()); + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1))), reg.clone()); let tablet1; let tablet2; diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 3208ecb25ae..ce8327c2012 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -163,6 +163,7 @@ impl Peer { self.raft_group_mut().advance_apply_to(persisted_index); { let mut meta = ctx.store_meta.lock().unwrap(); + meta.set_region(self.region(), true, &self.logger); meta.readers .insert(region_id, self.generate_read_delegate()); meta.region_read_progress diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index a03459c96d2..668d7591a40 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -116,9 +116,7 @@ where impl RaftRouter { pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { - let mut store_meta = StoreMeta::default(); - store_meta.store_id = Some(store_id); - let store_meta = Arc::new(Mutex::new(store_meta)); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let logger = router.logger().clone(); RaftRouter { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3cadcce5a82..310c33b95b2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -115,6 +115,14 @@ pub struct StoreInfo { pub capacity: u64, } +/// A trait that provide the meta information that can be accessed outside +/// of raftstore. 
+pub trait StoreRegionMeta: Send { + fn store_id(&self) -> u64; + fn region_read_progress(&self) -> &RegionReadProgressRegistry; + fn search_region(&self, start_key: &[u8], end_key: &[u8], visitor: impl FnMut(&Region)); +} + pub struct StoreMeta { pub store_id: Option, /// region_end_key -> region_id @@ -154,6 +162,34 @@ pub struct StoreMeta { pub damaged_ranges: HashMap, Vec)>, } +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id.unwrap() + } + + #[inline] + fn search_region(&self, start_key: &[u8], end_key: &[u8], mut visitor: impl FnMut(&Region)) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded(start_key), Unbounded::>)) + { + let region = &self.regions[id]; + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } +} + impl StoreMeta { pub fn new(vote_capacity: usize) -> StoreMeta { StoreMeta { diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index ab83752d8c3..c4646de35a4 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -1,27 +1,25 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::Bound::{Excluded, Unbounded}, fmt, sync::{Arc, Mutex}, }; use engine_traits::KvEngine; use fail::fail_point; -use keys::{data_end_key, data_key, enc_start_key}; use kvproto::kvrpcpb::{KeyRange, LeaderInfo}; use tikv_util::worker::Runnable; use crate::{ coprocessor::CoprocessorHost, - store::{fsm::store::StoreMeta, util::RegionReadProgressRegistry}, + store::{fsm::store::StoreRegionMeta, util::RegionReadProgressRegistry}, }; -pub struct Runner +pub struct Runner where E: KvEngine, { - store_meta: Arc>, + store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, coprocessor: CoprocessorHost, } @@ -55,12 +53,13 @@ impl fmt::Display for Task { } } -impl Runner +impl Runner where + S: StoreRegionMeta, E: KvEngine, { - pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Runner { - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); + pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Self { + let region_read_progress = store_meta.lock().unwrap().region_read_progress().clone(); Runner { region_read_progress, store_meta, @@ -82,48 +81,39 @@ where .unwrap_or(0) }) } else { - let (start_key, end_key) = ( - data_key(key_range.get_start_key()), - data_end_key(key_range.get_end_key()), - ); // `store_safe_ts` won't be accessed frequently (like per-request or // per-transaction), also this branch won't entry because the request key range // is empty currently (in v5.1) keep this branch for robustness and future use, // so it is okay getting `store_safe_ts` from `store_meta` (behide a mutex) let meta = self.store_meta.lock().unwrap(); - meta.region_read_progress.with(|registry| { - meta.region_ranges - // get overlapped regions - .range((Excluded(start_key), Unbounded)) - .take_while(|(_, id)| end_key > enc_start_key(&meta.regions[*id])) - // get the min `safe_ts` - .map(|(_, id)| { - registry.get(id).unwrap().safe_ts() - }) - .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized 
- .min() - .unwrap_or(0) + meta.region_read_progress().with(|registry| { + let mut min_ts = u64::MAX; + meta.search_region(key_range.get_start_key(), key_range.get_end_key(), |r| { + let ts = registry.get(&r.get_id()).unwrap().safe_ts(); + // ts == 0 means the peer is uninitialized + if ts != 0 && ts < min_ts { + min_ts = ts; + } + }); + if min_ts == u64::MAX { 0 } else { min_ts } }) } } } -impl Runnable for Runner -where - E: KvEngine, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { match task { Task::CheckLeader { leaders, cb } => { fail_point!( "before_check_leader_store_2", - self.store_meta.lock().unwrap().store_id == Some(2), + self.store_meta.lock().unwrap().store_id() == 2, |_| {} ); fail_point!( "before_check_leader_store_3", - self.store_meta.lock().unwrap().store_id == Some(3), + self.store_meta.lock().unwrap().store_id() == 3, |_| {} ); let regions = self @@ -146,7 +136,7 @@ mod tests { use kvproto::metapb::Region; use super::*; - use crate::store::util::RegionReadProgress; + use crate::store::{fsm::StoreMeta, util::RegionReadProgress}; #[test] fn test_get_range_min_safe_ts() { diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a20fcefdbdb..a8fc2e6e3df 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -294,8 +294,6 @@ pub trait ReadExecutorProvider: Send + Clone + 'static { /// get the ReadDelegate with region_id and the number of delegates in the /// StoreMeta fn get_executor_and_len(&self, region_id: u64) -> (usize, Option); - - fn store_meta(&self) -> &Self::StoreMeta; } #[derive(Clone)] @@ -346,10 +344,6 @@ where } (meta.readers.len(), None) } - - fn store_meta(&self) -> &Self::StoreMeta { - &self.store_meta - } } /// #[RaftstoreCommon] @@ -716,8 +710,8 @@ where } } - pub fn store_meta(&self) -> &S::StoreMeta { - self.store_meta.store_meta() + pub fn store_meta(&self) -> &S { + 
&self.store_meta } // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the diff --git a/components/server/src/server.rs b/components/server/src/server.rs index d7a05fff115..73b42d96d22 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -962,7 +962,7 @@ where self.resolver.clone().unwrap(), Either::Left(snap_mgr.clone()), gc_worker.clone(), - Some(check_leader_scheduler), + check_leader_scheduler, self.env.clone(), unified_read_pool, debug_thread_pool, diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 620a6b20b74..7f81d931181 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -55,7 +55,10 @@ use raftstore::{ BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, RawConsistencyCheckObserver, }, - store::{memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, SplitConfigManager, TabletSnapManager}, + store::{ + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, CheckLeaderRunner, SplitConfigManager, + TabletSnapManager, + }, RegionInfoAccessor, }; use security::SecurityManager; @@ -214,6 +217,7 @@ struct TikvServer { concurrency_manager: ConcurrencyManager, env: Arc, background_worker: Worker, + check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -297,6 +301,10 @@ where info!("Causal timestamp provider startup."); } + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. 
+ let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + TikvServer { config, cfg_controller: Some(cfg_controller), @@ -318,6 +326,7 @@ where concurrency_manager, env, background_worker, + check_leader_worker, flow_info_sender: None, flow_info_receiver: None, sst_worker: None, @@ -764,6 +773,14 @@ where cop_read_pools.handle() }; + let check_leader_runner = CheckLeaderRunner::new( + self.node.as_ref().unwrap().router().store_meta().clone(), + self.coprocessor_host.clone().unwrap(), + ); + let check_leader_scheduler = self + .check_leader_worker + .start("check-leader", check_leader_runner); + let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); self.config @@ -797,7 +814,7 @@ where self.resolver.clone().unwrap(), Either::Right(snap_mgr.clone()), gc_worker.clone(), - None, + check_leader_scheduler, self.env.clone(), unified_read_pool, debug_thread_pool, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 2521347ec18..0ec60e468ee 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -525,7 +525,7 @@ impl ServerCluster { resolver.clone(), tikv_util::Either::Left(snap_mgr.clone()), gc_worker.clone(), - Some(check_leader_scheduler.clone()), + check_leader_scheduler.clone(), self.env.clone(), None, debug_thread_pool.clone(), diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index bcfd542035b..ed6f16e8bec 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -104,10 +104,6 @@ where T: Transport + 'static, { let store_id = self.id(); - { - let mut meta = self.router().store_meta().lock().unwrap(); - meta.store_id = Some(store_id); - } if let Some(region) = Bootstrap::new( &raft_engine, self.cluster_id, diff --git a/src/server/server.rs b/src/server/server.rs index 22ab1682309..4c1f5e7ef69 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -96,7 +96,7 @@ where 
resolver: S, snap_mgr: Either, gc_worker: GcWorker, - check_leader_scheduler: Option>, + check_leader_scheduler: Scheduler, env: Arc, yatp_read_pool: Option, debug_thread_pool: Arc, @@ -580,7 +580,7 @@ mod tests { }, Either::Left(SnapManager::new("")), gc_worker, - Some(check_leader_scheduler), + check_leader_scheduler, env, None, debug_thread_pool, diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 66fc5060e68..6c85741f64a 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -76,7 +76,7 @@ pub struct Service { // For handling snapshot. snap_scheduler: Scheduler, // For handling `CheckLeader` request. - check_leader_scheduler: Option>, + check_leader_scheduler: Scheduler, enable_req_batch: bool, @@ -115,7 +115,7 @@ impl Service { copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, snap_scheduler: Scheduler, - check_leader_scheduler: Option>, + check_leader_scheduler: Scheduler, grpc_thread_load: Arc, enable_req_batch: bool, proxy: Proxy, @@ -909,7 +909,6 @@ impl Tikv for Service { let (cb, resp) = paired_future_callback(); let check_leader_scheduler = self.check_leader_scheduler.clone(); let task = async move { - let Some(check_leader_scheduler) = check_leader_scheduler else { return Err(box_err!("check leader is not supported")) }; check_leader_scheduler .schedule(CheckLeaderTask::CheckLeader { leaders, cb }) .map_err(|e| Error::Other(format!("{}", e).into()))?; @@ -945,11 +944,7 @@ impl Tikv for Service { ) { let key_range = request.take_key_range(); let (cb, resp) = paired_future_callback(); - let check_leader_scheduler = match self.check_leader_scheduler.clone() { - Some(s) => s, - // Avoid print errors if it's not supported. 
- None => return, - }; + let check_leader_scheduler = self.check_leader_scheduler.clone(); let task = async move { check_leader_scheduler .schedule(CheckLeaderTask::GetStoreTs { key_range, cb }) From 929b329af491d40e11648e2606124c8d877ba37a Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 27 Dec 2022 11:00:17 +0800 Subject: [PATCH 429/676] fix the bug that send mistake peer snapshot (#13915) ref tikv/tikv#12842 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../src/operation/ready/snapshot.rs | 159 +++++++++--------- components/raftstore-v2/src/raft/storage.rs | 44 +++-- .../raftstore/src/store/async_io/read.rs | 6 +- 4 files changed, 116 insertions(+), 95 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 854fd965d9e..0e911e48255 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -520,7 +520,7 @@ impl Peer { } StateRole::Follower => { self.leader_lease_mut().expire(); - self.storage_mut().cancel_generating_snap(None); + self.storage_mut().cancel_generating_snap(None, None); self.clear_in_memory_pessimistic_locks(); } _ => {} diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index ce8327c2012..149505f0af4 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -20,7 +20,7 @@ use std::{ fmt::{self, Debug}, - fs, mem, + fs, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, @@ -101,6 +101,10 @@ impl GenSnapTask { pub fn set_for_balance(&mut self) { self.for_balance = true; } + + pub fn to_peer(&self) -> u64 { + self.to_peer + } } impl Debug for GenSnapTask { @@ -232,40 +236,38 @@ impl Apply { impl Storage { pub fn is_generating_snapshot(&self) -> bool { 
- let snap_state = self.snap_state_mut(); - matches!(*snap_state, SnapState::Generating { .. }) + let snap_states = self.snap_states.borrow_mut(); + for (_, state) in snap_states.iter() { + if matches!(*state, SnapState::Generating { .. }) { + return true; + } + } + false } /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// unavailable snapshot. pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { - let mut snap_state = self.snap_state_mut(); - match &*snap_state { - SnapState::Generating { canceled, .. } => { - if canceled.load(Ordering::SeqCst) { - self.cancel_generating_snap(None); - } else { - return Err(raft::Error::Store( - raft::StorageError::SnapshotTemporarilyUnavailable, - )); + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to) { + match state { + SnapState::Generating { ref canceled, .. } => { + if canceled.load(Ordering::SeqCst) { + self.cancel_generating_snap(Some(to), None); + } else { + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } } - } - SnapState::Generated(_) => { - // TODO: `to` may not be equal to the generated snapshot. 
- let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; - if self.validate_snap(&snap, request_index) { - return Ok(*snap); + SnapState::Generated(ref s) => { + let snap = *s.clone(); + *state = SnapState::Relax; + if self.validate_snap(&snap, request_index) { + return Ok(snap); + } } - } - _ => {} - } - - if SnapState::Relax != *snap_state { - panic!( - "{:?} unexpected state: {:?}", - self.logger().list(), - *snap_state - ); + _ => {} + }; } info!( @@ -276,15 +278,18 @@ impl Storage { ); let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); - *snap_state = SnapState::Generating { - canceled: canceled.clone(), - index: index.clone(), - }; - - let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); let mut gen_snap_task = self.gen_snap_task_mut(); - assert!(gen_snap_task.is_none()); - *gen_snap_task = Box::new(Some(task)); + if gen_snap_task.is_none() { + self.snap_states.borrow_mut().insert( + to, + SnapState::Generating { + canceled: canceled.clone(), + index: index.clone(), + }, + ); + let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); + *gen_snap_task = Box::new(Some(task)); + } Err(raft::Error::Store( raft::StorageError::SnapshotTemporarilyUnavailable, )) @@ -332,28 +337,32 @@ impl Storage { true } - /// Cancel generating snapshot. - pub fn cancel_generating_snap(&self, compact_to: Option) { - let mut snap_state = self.snap_state_mut(); - let SnapState::Generating { - ref canceled, - ref index, - } = *snap_state else { return }; - - if let Some(idx) = compact_to { - let snap_index = index.load(Ordering::SeqCst); - if snap_index == 0 || idx <= snap_index + 1 { - return; + pub fn cancel_generating_snap(&self, to: Option, compact_to: Option) { + if let Some(id) = to { + let mut states = self.snap_states.borrow_mut(); + if let Some(state) = states.get(&id) { + let SnapState::Generating { + ref index, + .. 
+ } = *state else { return }; + if let Some(idx) = compact_to { + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || idx <= snap_index + 1 { + return; + } + } + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + ); + self.cancel_snap_task(to); + states.remove(&id); } + } else { + self.cancel_snap_task(to); + self.snap_states.borrow_mut().clear(); } - canceled.store(true, Ordering::SeqCst); - *snap_state = SnapState::Relax; - self.gen_snap_task_mut().take(); - info!( - self.logger(), - "snapshot is canceled"; - "compact_to" => compact_to, - ); STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); } @@ -362,29 +371,27 @@ impl Storage { /// TODO: make the snap state more clearer, the snapshot must be consumed. pub fn on_snapshot_generated(&self, res: GenSnapRes) -> bool { if res.is_none() { - self.cancel_generating_snap(None); + self.cancel_generating_snap(None, None); return false; } - let snap = res.unwrap(); - let mut snap_state = self.snap_state_mut(); - let SnapState::Generating { - index, - .. - } = &*snap_state else { return false }; - - if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { - warn!( - self.logger(), - "snapshot is staled, skip"; - "snap index" => snap.get_metadata().get_index(), - "required index" => index.load(Ordering::SeqCst), - ); - return false; + let (snapshot, to_peer_id) = *res.unwrap(); + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to_peer_id) { + let SnapState::Generating { + ref index, + .. 
+ } = *state else { return false }; + if snapshot.get_metadata().get_index() < index.load(Ordering::SeqCst) { + warn!( + self.logger(), + "snapshot is staled, skip"; + "snap index" => snapshot.get_metadata().get_index(), + "required index" => index.load(Ordering::SeqCst), + "to_peer_id" => to_peer_id, + ); + return false; + } + *state = SnapState::Generated(Box::new(snapshot)); } - // Should changed `SnapState::Generated` to `SnapState::Relax` when the - // snap is consumed or canceled. Such as leader changed, the state of generated - // should be reset. - *snap_state = SnapState::Generated(snap); true } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index bce313eab83..1015b5aaac7 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -5,6 +5,7 @@ use std::{ fmt::{self, Debug, Formatter}, }; +use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb, @@ -37,8 +38,8 @@ pub struct Storage { logger: Logger, /// Snapshot part. - snap_state: RefCell, - gen_snap_task: RefCell>>, + pub snap_states: RefCell>, + pub gen_snap_task: RefCell>>, split_init: Option>, /// The flushed index of all CFs. 
apply_trace: ApplyTrace, @@ -87,13 +88,23 @@ impl Storage { } #[inline] - pub fn snap_state_mut(&self) -> RefMut<'_, SnapState> { - self.snap_state.borrow_mut() + pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { + self.gen_snap_task.borrow_mut() } #[inline] - pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { - self.gen_snap_task.borrow_mut() + pub fn cancel_snap_task(&self, to_peer_id: Option) { + if to_peer_id.is_none() { + self.gen_snap_task.borrow_mut().take(); + return; + } + let to = to_peer_id.unwrap(); + let mut task = self.gen_snap_task.borrow_mut(); + if let Some(t) = &**task { + if to == t.to_peer() { + *task = Box::new(None); + }; + } } #[inline] @@ -143,7 +154,7 @@ impl Storage { region_state, ever_persisted: persisted, logger, - snap_state: RefCell::new(SnapState::Relax), + snap_states: RefCell::new(HashMap::default()), gen_snap_task: RefCell::new(Box::new(None)), split_init: None, apply_trace, @@ -435,14 +446,17 @@ mod tests { ); // Test get snapshot - let snap = s.snapshot(0, 7); + let to_peer_id = 7; + let snap = s.snapshot(0, to_peer_id); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); s.on_snapshot_generated(res); - let snap = match *s.snap_state.borrow() { + assert_eq!(s.snapshot(0, 8).unwrap_err(), unavailable); + assert!(s.snap_states.borrow().get(&8).is_some()); + let snap = match *s.snap_states.borrow().get(&to_peer_id).unwrap() { SnapState::Generated(ref snap) => *snap.clone(), ref s => panic!("unexpected state: {:?}", s), }; @@ -452,16 +466,16 @@ mod tests { let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); assert!(checkpointer_path.exists()); - s.snapshot(0, 7).unwrap(); + s.snapshot(0, to_peer_id).unwrap(); // Test cancel snapshot 
- let snap = s.snapshot(0, 0); + let snap = s.snapshot(0, 7); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); - rx.recv_timeout(Duration::from_secs(1)).unwrap(); - s.cancel_generating_snap(None); - assert_eq!(*s.snap_state.borrow(), SnapState::Relax); + let _res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None, None); + assert!(s.snap_states.borrow().get(&to_peer_id).is_none()); // Test get twice snapshot and cancel once. // get snapshot a @@ -471,7 +485,7 @@ mod tests { apply.set_apply_progress(1, 5); apply.schedule_gen_snapshot(gen_task_a); let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); - s.cancel_generating_snap(None); + s.cancel_generating_snap(None, None); // cancel get snapshot a, try get snaphsot b let snap = s.snapshot(0, 0); assert_eq!(snap.unwrap_err(), unavailable); diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 5dc01b40ef3..b298ed3529e 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -79,12 +79,12 @@ pub struct FetchedLogs { pub logs: Box, } -pub type GenSnapRes = Option>; +pub type GenSnapRes = Option>; /// A router for receiving fetched result. 
pub trait AsyncReadNotifier: Send { fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); - fn notify_snapshot_generated(&self, region_id: u64, res: Option>); + fn notify_snapshot_generated(&self, region_id: u64, res: GenSnapRes); } pub struct ReadRunner @@ -231,7 +231,7 @@ where SNAP_HISTOGRAM .generate .observe(start.saturating_elapsed_secs()); - res = Some(Box::new(snapshot)) + res = Some(Box::new((snapshot, to_peer))) } self.notifier.notify_snapshot_generated(region_id, res); From f21361d9f8af7f61ed41e4f408bc9a3e6cc83b0e Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 27 Dec 2022 12:40:17 +0800 Subject: [PATCH 430/676] raftstore-v2: compact and gc raft logs (#13846) ref tikv/tikv#12842 Signed-off-by: tabokie --- Cargo.lock | 1 + components/engine_panic/src/raft_engine.rs | 11 +- components/engine_rocks/src/raft_engine.rs | 33 +- components/engine_traits/src/lib.rs | 2 +- components/engine_traits/src/raft_engine.rs | 26 +- components/raft_log_engine/src/engine.rs | 105 ++++-- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/fsm/peer.rs | 6 +- .../operation/command/admin/compact_log.rs | 304 ++++++++++++++++++ .../src/operation/command/admin/mod.rs | 11 +- .../src/operation/command/admin/split.rs | 24 +- .../raftstore-v2/src/operation/command/mod.rs | 17 +- .../src/operation/ready/apply_trace.rs | 8 +- .../raftstore-v2/src/operation/ready/mod.rs | 14 +- .../src/operation/ready/snapshot.rs | 51 +-- components/raftstore-v2/src/raft/peer.rs | 42 +++ components/raftstore-v2/src/raft/storage.rs | 4 +- components/raftstore-v2/src/router/message.rs | 6 +- components/raftstore/src/store/fsm/peer.rs | 11 +- .../raftstore/src/store/peer_storage.rs | 7 +- .../raftstore/src/store/worker/raftlog_gc.rs | 70 ++-- src/server/debug.rs | 7 +- tests/failpoints/cases/test_snap.rs | 4 +- tests/failpoints/cases/test_stale_peer.rs | 4 +- 24 files changed, 578 insertions(+), 191 deletions(-) create mode 100644 
components/raftstore-v2/src/operation/command/admin/compact_log.rs diff --git a/Cargo.lock b/Cargo.lock index cf53d09da09..5f7ca0b8c7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4377,6 +4377,7 @@ dependencies = [ "tempfile", "test_pd", "test_util", + "thiserror", "tikv_util", "time", "tracker", diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 59c0422902c..c3de53b4932 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -120,7 +120,16 @@ impl RaftEngine for PanicEngine { panic!() } - fn gc(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + panic!() + } + + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index cb4c5682252..d5331a2ce29 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -3,8 +3,8 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, WriteBatch, WriteBatchExt, - WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, + RaftEngineReadOnly, RaftLogBatch, Result, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, + RAFT_LOG_MULTI_GET_CNT, }; use kvproto::{ metapb::Region, @@ -298,27 +298,18 @@ impl RaftEngine for RocksEngine { Ok(()) } - fn batch_gc(&self, groups: Vec) -> Result { - let mut total = 0; - let mut raft_wb = self.write_batch_with_cap(4 * 1024); - for task in groups { - total += self.gc_impl(task.raft_group_id, task.from, task.to, &mut raft_wb)?; - } - // TODO: disable WAL here. 
- if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; + Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - let mut raft_wb = self.write_batch_with_cap(1024); - let total = self.gc_impl(raft_group_id, from, to, &mut raft_wb)?; - // TODO: disable WAL here. - if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() } fn flush_metrics(&self, instance: &str) { diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index b75c3e7b7c0..bc54a5e7627 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -339,7 +339,7 @@ pub use crate::range::*; mod raft_engine; pub use raft_engine::{ - CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, + CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RAFT_LOG_MULTI_GET_CNT, }; diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 7b0e04d0ab5..9e95ae95e14 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -74,12 +74,6 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { } } -pub struct RaftLogGcTask { - pub raft_group_id: u64, - pub from: u64, - pub to: u64, -} - // TODO: Refactor common methods between Kv and Raft engine into a shared trait. 
pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; @@ -110,17 +104,17 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send batch: &mut Self::LogBatch, ) -> Result<()>; - /// Like `cut_logs` but the range could be very large. Return the deleted - /// count. Generally, `from` can be passed in `0`. - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; + /// Like `cut_logs` but the range could be very large. + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()>; - fn batch_gc(&self, tasks: Vec) -> Result { - let mut total = 0; - for task in tasks { - total += self.gc(task.raft_group_id, task.from, task.to)?; - } - Ok(total) - } + /// Delete all but the latest one of states that are associated with smaller + /// apply_index. + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()>; fn need_manual_purge(&self) -> bool { false diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 7be02e8b6e2..7c98adf325f 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -11,8 +11,8 @@ use codec::number::NumberCodec; use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, - RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, - RaftLogGcTask, Result, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::{IoOp, IoRateLimiter, IoType}; use kvproto::{ @@ -301,6 +301,7 @@ fn cf_to_id(cf: &str) -> u8 { _ => panic!("unrecognized cf {}", cf), } } +const MAX_CF_ID: u8 
= 3; /// Encode a key in the format `{prefix}{num}`. fn encode_key(prefix: &'static [u8], num: u64) -> [u8; 9] { @@ -380,6 +381,8 @@ const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; const FLUSH_STATE_KEY: &[u8] = &[0x06]; +// All keys are of the same length. +const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { @@ -658,34 +661,80 @@ impl RaftEngine for RaftLogEngine { Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - self.batch_gc(vec![RaftLogGcTask { - raft_group_id, - from, - to, - }]) + fn gc( + &self, + raft_group_id: u64, + _from: u64, + to: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { + batch + .0 + .add_command(raft_group_id, Command::Compact { index: to }); + Ok(()) } - fn batch_gc(&self, tasks: Vec) -> Result { - let mut batch = self.log_batch(tasks.len()); - let mut old_first_index = Vec::with_capacity(tasks.len()); - for task in &tasks { - batch - .0 - .add_command(task.raft_group_id, Command::Compact { index: task.to }); - old_first_index.push(self.0.first_index(task.raft_group_id)); - } - - self.consume(&mut batch, false)?; - - let mut total = 0; - for (old_first_index, task) in old_first_index.iter().zip(tasks) { - let new_first_index = self.0.first_index(task.raft_group_id); - if let (Some(old), Some(new)) = (old_first_index, new_first_index) { - total += new.saturating_sub(*old); - } - } - Ok(total as usize) + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { + // Makes sure REGION_STATE_KEY is the smallest and FLUSH_STATE_KEY is the + // largest. 
+ debug_assert!(REGION_STATE_KEY < APPLY_STATE_KEY); + debug_assert!(APPLY_STATE_KEY < FLUSH_STATE_KEY); + + let mut end = [0; KEY_PREFIX_LEN + 1]; + end[..KEY_PREFIX_LEN].copy_from_slice(FLUSH_STATE_KEY); + end[KEY_PREFIX_LEN] = MAX_CF_ID + 1; + let mut found_region_state = false; + let mut found_apply_state = false; + let mut found_flush_state = [false; MAX_CF_ID as usize + 1]; + self.0 + .scan_raw_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(&end), + true, + |key, _| { + match &key[..KEY_PREFIX_LEN] { + REGION_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_region_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_region_state = true; + } + } + APPLY_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_apply_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_apply_state = true; + } + } + FLUSH_STATE_KEY => { + let cf_id = key[KEY_PREFIX_LEN]; + let tablet_index = NumberCodec::decode_u64(&key[KEY_PREFIX_LEN + 1..]); + if cf_id <= MAX_CF_ID && tablet_index <= apply_index { + if found_flush_state[cf_id as usize] { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_flush_state[cf_id as usize] = true; + } + } + } + _ => {} + } + true + }, + ) + .map_err(transfer_error)?; + Ok(()) } fn need_manual_purge(&self) -> bool { diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 4d3d44ec6fd..6726c5ed742 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -55,6 +55,7 @@ rand = "0.8.3" resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" +thiserror = "1.0" tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 734c2bf93d4..22145ecdcaa 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ 
b/components/raftstore-v2/src/fsm/peer.rs @@ -189,6 +189,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn on_start(&mut self) { self.schedule_tick(PeerTick::Raft); self.schedule_tick(PeerTick::SplitRegionCheck); + self.schedule_tick(PeerTick::PdHeartbeat); + self.schedule_tick(PeerTick::CompactLog); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } @@ -206,11 +208,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, match tick { PeerTick::Raft => self.on_raft_tick(), PeerTick::PdHeartbeat => self.on_pd_heartbeat(), - PeerTick::RaftLogGc => unimplemented!(), + PeerTick::CompactLog => self.on_compact_log_tick(), PeerTick::SplitRegionCheck => self.on_split_region_check(), PeerTick::CheckMerge => unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), - PeerTick::EntryCacheEvict => unimplemented!(), + PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => unimplemented!(), diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs new file mode 100644 index 00000000000..af8fb5acc47 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -0,0 +1,304 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains processing logic of the following: +//! +//! # `CompactLog` and `EntryCacheEvict` ticks +//! +//! On region leader, periodically compacts useless Raft logs from the +//! underlying log engine, and evicts logs from entry cache if it reaches memory +//! limit. +//! +//! # `CompactLog` command +//! +//! Updates truncated index, and compacts logs if the corresponding changes have +//! been persisted in kvdb. 
+ +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; +use protobuf::Message; +use raftstore::{ + store::{fsm::new_admin_request, needs_evict_entry_cache, Transport}, + Result, +}; +use slog::{debug, error, info}; +use tikv_util::{box_err, Either}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + operation::AdminCmdResult, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerTick}, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + pub fn on_compact_log_tick(&mut self) { + if !self.fsm.peer().is_leader() { + // `compact_cache_to` is called when apply, there is no need to call + // `compact_to` here, snapshot generating has already been cancelled + // when the role becomes follower. + return; + } + self.schedule_tick(PeerTick::CompactLog); + + self.fsm + .peer_mut() + .maybe_propose_compact_log(self.store_ctx); + + self.on_entry_cache_evict(); + } + + pub fn on_entry_cache_evict(&mut self) { + if needs_evict_entry_cache(self.store_ctx.cfg.evict_cache_on_memory_ratio) { + self.fsm + .peer_mut() + .entry_storage_mut() + .evict_entry_cache(true); + if !self.fsm.peer().entry_storage().is_entry_cache_empty() { + self.schedule_tick(PeerTick::EntryCacheEvict); + } + } + } +} + +impl Peer { + // Mirrors v1::on_raft_gc_log_tick. + fn maybe_propose_compact_log(&mut self, store_ctx: &mut StoreContext) { + // As leader, we would not keep caches for the peers that didn't response + // heartbeat in the last few seconds. That happens probably because + // another TiKV is down. In this case if we do not clean up the cache, + // it may keep growing. 
+ let drop_cache_duration = + store_ctx.cfg.raft_heartbeat_interval() + store_ctx.cfg.raft_entry_cache_life_time.0; + let cache_alive_limit = std::time::Instant::now() - drop_cache_duration; + + // Leader will replicate the compact log command to followers, + // If we use current replicated_index (like 10) as the compact index, + // when we replicate this log, the newest replicated_index will be 11, + // but we only compact the log to 10, not 11, at that time, + // the first index is 10, and replicated_index is 11, with an extra log, + // and we will do compact again with compact index 11, in cycles... + // So we introduce a threshold, if replicated index - first index > threshold, + // we will try to compact log. + // raft log entries[..............................................] + // ^ ^ + // |-----------------threshold------------ | + // first_index replicated_index + // `alive_cache_idx` is the smallest `replicated_index` of healthy up nodes. + // `alive_cache_idx` is only used to gc cache. + let applied_idx = self.entry_storage().applied_index(); + let truncated_idx = self.entry_storage().truncated_index(); + let first_idx = self.entry_storage().first_index(); + let last_idx = self.entry_storage().last_index(); + + let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); + for (peer_id, p) in self.raft_group().raft.prs().iter() { + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + if self.peer_heartbeat_is_fresh(*peer_id, &cache_alive_limit) { + if alive_cache_idx > p.matched && p.matched >= truncated_idx { + alive_cache_idx = p.matched; + } else if p.matched == 0 { + // the new peer is still applying snapshot, do not compact cache now + alive_cache_idx = 0; + } + } + } + + // When an election happened or a new peer is added, replicated_idx can be 0. 
+ if replicated_idx > 0 { + assert!( + last_idx >= replicated_idx, + "expect last index {} >= replicated index {}", + last_idx, + replicated_idx + ); + } + + // leader may call `get_term()` on the latest replicated index, so compact + // entries before `alive_cache_idx` instead of `alive_cache_idx + 1`. + self.entry_storage_mut() + .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); + + let mut compact_idx = if applied_idx > first_idx + && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() + || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 + { + std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) + } else if replicated_idx < first_idx + || last_idx - first_idx < 3 + || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold + && self.maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + { + return; + } else { + replicated_idx + }; + assert!(compact_idx >= first_idx); + // Have no idea why subtract 1 here, but original code did this by magic. + compact_idx -= 1; + if compact_idx < first_idx { + return; + } + + // Create a compact log request and notify directly. 
+ // TODO: move this into a function + let term = self.raft_group().raft.raft_log.term(compact_idx).unwrap(); + + let mut req = new_admin_request(self.region_id(), self.peer().clone()); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(AdminCmdType::CompactLog); + admin.mut_compact_log().set_compact_index(compact_idx); + admin.mut_compact_log().set_compact_term(term); + req.set_admin_request(admin); + + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + + self.reset_skip_compact_log_ticks(); + } +} + +#[derive(Debug)] +pub struct CompactLogResult { + index: u64, + compact_index: u64, + compact_term: u64, +} + +impl Peer { + pub fn propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + let compact_log = req.get_admin_request().get_compact_log(); + // TODO: add unit tests to cover all the message integrity checks. + if compact_log.get_compact_term() == 0 { + info!( + self.logger, + "compact term missing, skip"; + "command" => ?compact_log + ); + // old format compact log command, safe to ignore. 
+ return Err(box_err!( + "command format is outdated, please upgrade leader" + )); + } + + let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + pub fn apply_compact_log( + &mut self, + req: &AdminRequest, + index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + Ok(( + AdminResponse::default(), + AdminCmdResult::CompactLog(CompactLogResult { + index, + compact_index: req.get_compact_log().get_compact_index(), + compact_term: req.get_compact_log().get_compact_term(), + }), + )) + } +} + +impl Peer { + pub fn on_apply_res_compact_log( + &mut self, + store_ctx: &mut StoreContext, + res: CompactLogResult, + ) { + let first_index = self.entry_storage().first_index(); + if res.compact_index <= first_index { + debug!( + self.logger, + "compact index <= first index, no need to compact"; + "compact_index" => res.compact_index, + "first_index" => first_index, + ); + return; + } + // TODO: check is_merging + // TODO: check entry_cache_warmup_state + self.entry_storage_mut() + .compact_entry_cache(res.compact_index); + self.storage_mut() + .cancel_generating_snap_due_to_compacted(res.compact_index); + + let truncated_state = self + .entry_storage_mut() + .apply_state_mut() + .mut_truncated_state(); + let old_truncated = truncated_state.get_index(); + truncated_state.set_index(res.compact_index); + truncated_state.set_term(res.compact_term); + + let region_id = self.region_id(); + // TODO: get around this clone. 
+ let apply_state = self.entry_storage().apply_state().clone(); + self.state_changes_mut() + .put_apply_state(region_id, res.index, &apply_state) + .unwrap(); + self.set_has_extra_write(); + + self.maybe_compact_log_from_engine(store_ctx, Either::Right(old_truncated)); + } + + #[inline] + pub fn on_advance_persisted_apply_index( + &mut self, + store_ctx: &mut StoreContext, + old_persisted: u64, + ) { + let new_persisted = self.storage().apply_trace().persisted_apply_index(); + if old_persisted < new_persisted { + // TODO: batch it. + if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( + self.region_id(), + new_persisted, + self.state_changes_mut(), + ) { + error!(self.logger, "failed to delete raft states"; "err" => ?e); + } else { + self.set_has_extra_write(); + } + self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); + } + } + + pub fn maybe_compact_log_from_engine( + &mut self, + store_ctx: &mut StoreContext, + old_index: Either, + ) { + let truncated = self.entry_storage().truncated_index(); + let persisted = self.storage().apply_trace().persisted_apply_index(); + match old_index { + Either::Left(old_persisted) if old_persisted >= truncated => return, + Either::Right(old_truncated) if old_truncated >= persisted => return, + _ => {} + } + let compact_index = std::cmp::min(truncated, persisted); + // Raft Engine doesn't care about first index. 
+ if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, compact_index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } else { + self.set_has_extra_write(); + let applied = self.storage().apply_state().get_applied_index(); + let total_cnt = applied - self.storage().entry_storage().first_index() + 1; + let remain_cnt = applied - compact_index; + self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); + } + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 0b3d588abf7..9afd50a5305 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -1,19 +1,22 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +mod compact_log; mod conf_change; mod split; mod transfer_leader; +use compact_log::CompactLogResult; +use conf_change::ConfChangeResult; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; -pub use split::{RequestSplit, SplitFlowControl, SplitInit, SplitResult, SPLIT_PREFIX}; +use split::SplitResult; +pub use split::{RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; use tikv_util::box_err; use txn_types::WriteBatchFlags; -use self::conf_change::ConfChangeResult; use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; #[derive(Debug)] @@ -23,6 +26,7 @@ pub enum AdminCmdResult { SplitRegion(SplitResult), ConfChange(ConfChangeResult), TransferLeader(u64), + CompactLog(CompactLogResult), } impl Peer { @@ -93,7 +97,7 @@ impl Peer { .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL) { let data = req.write_to_bytes().unwrap(); - self.propose_with_ctx(ctx, data, vec![]) + self.propose(ctx, data) } else { if 
self.propose_transfer_leader(ctx, req, ch) { self.set_has_ready(); @@ -101,6 +105,7 @@ impl Peer { return; } } + AdminCmdType::CompactLog => self.propose_compact_log(ctx, req), _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 391f0253439..2154eb20e90 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -388,13 +388,11 @@ impl Peer { pub fn on_apply_res_split( &mut self, store_ctx: &mut StoreContext, - derived_index: usize, - tablet_index: u64, - regions: Vec, + res: SplitResult, ) { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); - let derived = ®ions[derived_index]; + let derived = &res.regions[res.derived_index]; let derived_epoch = derived.get_region_epoch().clone(); let region_id = derived.get_id(); @@ -408,7 +406,7 @@ impl Peer { // Update the version so the concurrent reader will fail due to EpochNotMatch // instead of PessimisticLockNotFound. pessimistic_locks.version = derived_epoch.get_version(); - pessimistic_locks.group_by_regions(®ions, derived) + pessimistic_locks.group_by_regions(&res.regions, derived) }; fail_point!("on_split_invalidate_locks"); @@ -421,7 +419,7 @@ impl Peer { reader, derived.clone(), RegionChangeReason::Split, - tablet_index, + res.tablet_index, ); } @@ -433,19 +431,17 @@ impl Peer { info!( self.logger, "notify pd with split"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), - "split_count" => regions.len(), + "split_count" => res.regions.len(), ); // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. 
- self.report_batch_split_pd(store_ctx, regions.to_vec()); + self.report_batch_split_pd(store_ctx, res.regions.to_vec()); self.add_pending_tick(PeerTick::SplitRegionCheck); } - let last_region_id = regions.last().unwrap().get_id(); + let last_region_id = res.regions.last().unwrap().get_id(); let mut new_ids = HashSet::default(); - for (new_region, locks) in regions.into_iter().zip(region_locks) { + for (new_region, locks) in res.regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); if new_region_id == region_id { continue; @@ -480,10 +476,10 @@ impl Peer { _ => unreachable!(), } } - self.split_trace_mut().push((tablet_index, new_ids)); + self.split_trace_mut().push((res.tablet_index, new_ids)); let region_state = self.storage().region_state().clone(); self.state_changes_mut() - .put_region_state(region_id, tablet_index, ®ion_state) + .put_region_state(region_id, res.tablet_index, ®ion_state) .unwrap(); self.set_has_extra_write(); } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 49040a20278..7fa2fa776c2 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -51,9 +51,7 @@ mod admin; mod control; mod write; -pub use admin::{ - AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SplitResult, SPLIT_PREFIX, -}; +pub use admin::{AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; pub use control::ProposalControl; pub use write::{ SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, @@ -267,17 +265,14 @@ impl Peer { AdminCmdResult::ConfChange(conf_change) => { self.on_apply_res_conf_change(ctx, conf_change) } - AdminCmdResult::SplitRegion(SplitResult { - regions, - derived_index, - tablet_index, - }) => { + AdminCmdResult::SplitRegion(res) => { self.storage_mut() .apply_trace_mut() - .on_admin_modify(tablet_index); - 
self.on_apply_res_split(ctx, derived_index, tablet_index, regions) + .on_admin_modify(res.tablet_index); + self.on_apply_res_split(ctx, res) } AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), + AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), } } @@ -446,7 +441,7 @@ impl Apply { if req.has_admin_request() { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { - AdminCmdType::CompactLog => unimplemented!(), + AdminCmdType::CompactLog => self.apply_compact_log(admin_req, entry.index)?, AdminCmdType::Split => self.apply_split(admin_req, log_index)?, AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, AdminCmdType::PrepareMerge => unimplemented!(), diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 0b7521f2634..d5aa93b587a 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -6,7 +6,7 @@ //! //! In summary, we trace the persist progress by recording flushed event. //! Because memtable is flushed one by one, so a flushed memtable must contain -//! all the data within the CF before some certain apply index. So the minimun +//! all the data within the CF before certain apply index. So the minimun //! flushed apply index + 1 of all data CFs is the recovery start point. In //! some cases, a CF may not have any updates at all for a long time. In some //! cases, we may still need to recover from smaller index even if flushed @@ -121,7 +121,7 @@ impl engine_traits::StateStorage for StateStorage< } } -/// An alias of frequent use type that each data cf has a u64. +/// Mapping from data cf to an u64 index. 
pub type DataTrace = [u64; DATA_CFS_LEN]; #[derive(Clone, Copy, Default)] @@ -211,7 +211,7 @@ impl ApplyTrace { self.admin.last_modified = index; } - fn persisted_apply_index(&self) -> u64 { + pub fn persisted_apply_index(&self) -> u64 { self.admin.flushed } @@ -237,7 +237,7 @@ impl ApplyTrace { let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); if candidate > self.admin.flushed { self.admin.flushed = candidate; - if candidate > self.persisted_applied + 100 { + if self.admin.flushed > self.persisted_applied + 100 { self.try_persist = true; } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 0e911e48255..14010fc9fe2 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,7 +31,7 @@ use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, - store::{util, FetchedLogs, ReadProgress, Transport, WriteTask}, + store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, }; use slog::{debug, error, trace, warn}; use tikv_util::{ @@ -275,7 +275,7 @@ impl Peer { // asynchronously. if self.is_leader() { for entry in committed_entries.iter().rev() { - // TODO: handle raft_log_size_hint + self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); let propose_time = self .proposals() .find_propose_time(entry.get_term(), entry.get_index()); @@ -293,6 +293,10 @@ impl Peer { } } } + if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { + // Compact all cached entries instead of half evict. 
+ self.entry_storage_mut().evict_entry_cache(false); + } self.schedule_apply_committed_entries(committed_entries); } @@ -375,9 +379,12 @@ impl Peer { let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + let prev_persisted = self.storage().apply_trace().persisted_apply_index(); self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); + self.on_advance_persisted_apply_index(ctx, prev_persisted); + if !ready.persisted_messages().is_empty() { write_task.messages = ready .take_persisted_messages() @@ -517,10 +524,11 @@ impl Peer { self.entry_storage_mut().clear_entry_cache_warmup_state(); self.region_heartbeat_pd(ctx); + self.add_pending_tick(PeerTick::CompactLog); } StateRole::Follower => { self.leader_lease_mut().expire(); - self.storage_mut().cancel_generating_snap(None, None); + self.storage_mut().cancel_generating_snap(None); self.clear_in_memory_pessimistic_locks(); } _ => {} diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 149505f0af4..e1a36ed8ec7 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -252,7 +252,7 @@ impl Storage { match state { SnapState::Generating { ref canceled, .. } => { if canceled.load(Ordering::SeqCst) { - self.cancel_generating_snap(Some(to), None); + self.cancel_generating_snap(Some(to)); } else { return Err(raft::Error::Store( raft::StorageError::SnapshotTemporarilyUnavailable, @@ -337,41 +337,56 @@ impl Storage { true } - pub fn cancel_generating_snap(&self, to: Option, compact_to: Option) { - if let Some(id) = to { + pub fn cancel_generating_snap(&self, to_peer: Option) { + if let Some(id) = to_peer { let mut states = self.snap_states.borrow_mut(); - if let Some(state) = states.get(&id) { - let SnapState::Generating { - ref index, - .. 
- } = *state else { return }; - if let Some(idx) = compact_to { - let snap_index = index.load(Ordering::SeqCst); - if snap_index == 0 || idx <= snap_index + 1 { - return; - } - } + if let Some(state) = states.get(&id) + && matches!(*state, SnapState::Generating { .. }) + { info!( self.logger(), "snapshot is canceled"; - "compact_to" => compact_to, + "to_peer" => to_peer, ); - self.cancel_snap_task(to); + self.cancel_snap_task(to_peer); states.remove(&id); } } else { - self.cancel_snap_task(to); + self.cancel_snap_task(to_peer); self.snap_states.borrow_mut().clear(); } STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); } + pub fn cancel_generating_snap_due_to_compacted(&self, compact_to: u64) { + let mut states = self.snap_states.borrow_mut(); + states.retain(|id, state| { + let SnapState::Generating { + ref index, + .. + } = *state else { return true; }; + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || compact_to <= snap_index + 1 { + return true; + } + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + "to_peer" => id, + ); + self.cancel_snap_task(Some(*id)); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + false + }); + } + /// Try to switch snap state to generated. only `Generating` can switch to /// `Generated`. /// TODO: make the snap state more clearer, the snapshot must be consumed. pub fn on_snapshot_generated(&self, res: GenSnapRes) -> bool { if res.is_none() { - self.cancel_generating_snap(None, None); + self.cancel_generating_snap(None); return false; } let (snapshot, to_peer_id) = *res.unwrap(); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 25285f289a7..ca5aafa3bfb 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -53,6 +53,10 @@ pub struct Peer { /// Statistics for other peers, only maintained when self is the leader. peer_heartbeats: HashMap, + /// For raft log compaction. 
+ skip_compact_log_ticks: usize, + approximate_raft_log_size: u64, + /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. raw_write_encoder: Option, @@ -140,6 +144,8 @@ impl Peer { self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), + skip_compact_log_ticks: 0, + approximate_raft_log_size: 0, raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -450,6 +456,16 @@ impl Peer { self.peer_heartbeats.remove(&peer_id); } + /// Returns whether or not the peer sent heartbeat after the provided + /// deadline time. + #[inline] + pub fn peer_heartbeat_is_fresh(&self, peer_id: u64, deadline: &Instant) -> bool { + matches!( + self.peer_heartbeats.get(&peer_id), + Some(last_heartbeat) if *last_heartbeat >= *deadline + ) + } + pub fn collect_down_peers(&self, max_duration: Duration) -> Vec { let mut down_peers = Vec::new(); let now = Instant::now(); @@ -471,6 +487,31 @@ impl Peer { down_peers } + #[inline] + pub fn reset_skip_compact_log_ticks(&mut self) { + self.skip_compact_log_ticks = 0; + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skip_compact_log_ticks < max_skip_ticks { + self.skip_compact_log_ticks += 1; + true + } else { + false + } + } + + #[inline] + pub fn approximate_raft_log_size(&self) -> u64 { + self.approximate_raft_log_size + } + + #[inline] + pub fn update_approximate_raft_log_size(&mut self, f: impl Fn(u64) -> u64) { + self.approximate_raft_log_size = f(self.approximate_raft_log_size); + } + #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state @@ -698,6 +739,7 @@ impl Peer { self.flush_state = Arc::default(); } + // Note: Call `set_has_extra_write` after adding new state changes. 
#[inline] pub fn state_changes_mut(&mut self) -> &mut ER::LogBatch { if self.state_changes.is_none() { diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 1015b5aaac7..959f817ebd7 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -474,7 +474,7 @@ mod tests { let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); let _res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); - s.cancel_generating_snap(None, None); + s.cancel_generating_snap(None); assert!(s.snap_states.borrow().get(&to_peer_id).is_none()); // Test get twice snapshot and cancel once. @@ -485,7 +485,7 @@ mod tests { apply.set_apply_progress(1, 5); apply.schedule_gen_snapshot(gen_task_a); let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); - s.cancel_generating_snap(None, None); + s.cancel_generating_snap(None); // cancel get snapshot a, try get snaphsot b let snap = s.snapshot(0, 0); assert_eq!(snap.unwrap_err(), unavailable); diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a69f6b5ead6..cd88a23c744 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -22,7 +22,7 @@ use crate::operation::{RequestSplit, SimpleWriteBinary, SplitInit}; #[repr(u8)] pub enum PeerTick { Raft = 0, - RaftLogGc = 1, + CompactLog = 1, SplitRegionCheck = 2, PdHeartbeat = 3, CheckMerge = 4, @@ -41,7 +41,7 @@ impl PeerTick { pub fn tag(self) -> &'static str { match self { PeerTick::Raft => "raft", - PeerTick::RaftLogGc => "raft_log_gc", + PeerTick::CompactLog => "compact_log", PeerTick::SplitRegionCheck => "split_region_check", PeerTick::PdHeartbeat => "pd_heartbeat", PeerTick::CheckMerge => "check_merge", @@ -57,7 +57,7 @@ impl PeerTick { pub const fn all_ticks() -> &'static [PeerTick] { const TICKS: &[PeerTick] = &[ PeerTick::Raft, - 
PeerTick::RaftLogGc, + PeerTick::CompactLog, PeerTick::SplitRegionCheck, PeerTick::PdHeartbeat, PeerTick::CheckMerge, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 225126f0edb..1b484df5316 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -54,7 +54,7 @@ use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, is_learner, region_on_same_stores}, - sys::{disk::DiskUsage, memory_usage_reaches_high_water}, + sys::disk::DiskUsage, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, @@ -5425,12 +5425,9 @@ where fail_point!("on_entry_cache_evict_tick", |_| {}); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { self.fsm.peer.mut_store().evict_entry_cache(true); - } - let mut _usage = 0; - if memory_usage_reaches_high_water(&mut _usage) - && !self.fsm.peer.get_store().is_entry_cache_empty() - { - self.register_entry_cache_evict_tick(); + if !self.fsm.peer.get_store().is_entry_cache_empty() { + self.register_entry_cache_evict_tick(); + } } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index ce25544bcd8..c9e460d1cbc 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2124,7 +2124,8 @@ pub mod tests { assert!(build_storage().is_err()); // It should not recover if corresponding log doesn't exist. 
- engines.raft.gc(1, 14, 15).unwrap(); + engines.raft.gc(1, 14, 15, &mut lb).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); apply_state.set_commit_index(14); apply_state.set_commit_term(RAFT_INIT_LOG_TERM); engines @@ -2136,7 +2137,7 @@ pub mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.gc(1, 0, 21).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); lb.append(1, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); @@ -2164,7 +2165,7 @@ pub mod tests { assert!(build_storage().is_err()); // last index < recorded_commit_index is invalid. - engines.raft.gc(1, 0, 21).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index f93213dfa0d..ce829ed61b2 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -3,10 +3,9 @@ use std::{ error::Error as StdError, fmt::{self, Display, Formatter}, - sync::mpsc::Sender, }; -use engine_traits::{Engines, KvEngine, RaftEngine, RaftLogGcTask}; +use engine_traits::{Engines, KvEngine, RaftEngine}; use file_system::{IoType, WithIoType}; use thiserror::Error; use tikv_util::{ @@ -73,7 +72,6 @@ enum Error { pub struct Runner { tasks: Vec, engines: Engines, - gc_entries: Option>, compact_sync_interval: Duration, } @@ -82,25 +80,15 @@ impl Runner { Runner { engines, tasks: vec![], - gc_entries: None, compact_sync_interval: compact_log_interval, } } - /// Does the GC job and returns the count of logs collected. 
- fn gc_raft_log(&mut self, regions: Vec) -> Result { - fail::fail_point!("worker_gc_raft_log", |s| { - Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) - }); - let deleted = box_try!(self.engines.raft.batch_gc(regions)); - fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); - Ok(deleted) - } - - fn report_collected(&self, collected: usize) { - if let Some(ref ch) = self.gc_entries { - ch.send(collected).unwrap(); - } + fn raft_log_gc(&mut self, mut batch: ER::LogBatch) -> Result<(), Error> { + fail::fail_point!("worker_gc_raft_log", |_| Ok(())); + box_try!(self.engines.raft.consume(&mut batch, false)); + fail::fail_point!("worker_gc_raft_log_finished"); + Ok(()) } fn flush(&mut self) { @@ -115,9 +103,11 @@ impl Runner { panic!("failed to sync kv_engine in raft_log_gc: {:?}", e); }); RAFT_LOG_GC_KV_SYNC_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); + let tasks = std::mem::take(&mut self.tasks); - let mut groups = Vec::with_capacity(tasks.len()); let mut cbs = Vec::new(); + let mut batch = self.engines.raft.log_batch(tasks.len()); + let start = Instant::now(); for t in tasks { debug!("gc raft log"; "region_id" => t.region_id, "start_index" => t.start_idx, "end_index" => t.end_idx); if let Some(cb) = t.cb { @@ -137,28 +127,22 @@ impl Runner { "end_index" => t.end_idx, ); } - groups.push(RaftLogGcTask { - raft_group_id: t.region_id, - from: t.start_idx, - to: t.end_idx, - }); - } - let start = Instant::now(); - match self.gc_raft_log(groups) { - Err(e) => { + if let Err(e) = self + .engines + .raft + .gc(t.region_id, t.start_idx, t.end_idx, &mut batch) + { error!("failed to gc"; "err" => %e); - self.report_collected(0); RAFT_LOG_GC_FAILED.inc(); } - Ok(n) => { - debug!("gc log entries"; "entry_count" => n); - self.report_collected(n); - RAFT_LOG_GC_DELETED_KEYS_HISTOGRAM.observe(n as f64); - } + } + if let Err(e) = self.raft_log_gc(batch) { + error!("failed to write gc task"; "err" => %e); + RAFT_LOG_GC_FAILED.inc(); } 
RAFT_LOG_GC_WRITE_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); for cb in cbs { - cb() + cb(); } } } @@ -201,7 +185,7 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::time::Duration; use engine_traits::{RaftEngine, RaftLogBatch, ALL_CFS}; use raft::eraftpb::Entry; @@ -218,9 +202,7 @@ mod tests { let kv_db = engine_test::kv::new_engine(path_raft.to_str().unwrap(), ALL_CFS).unwrap(); let engines = Engines::new(kv_db, raft_db.clone()); - let (tx, rx) = mpsc::channel(); let mut runner = Runner { - gc_entries: Some(tx), engines, tasks: vec![], compact_sync_interval: Duration::from_secs(5), @@ -237,17 +219,15 @@ mod tests { raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); let tbls = vec![ - (Task::gc(region_id, 0, 10), 10, (0, 10), (10, 100)), - (Task::gc(region_id, 0, 50), 40, (0, 50), (50, 100)), - (Task::gc(region_id, 50, 50), 0, (0, 50), (50, 100)), - (Task::gc(region_id, 50, 60), 10, (0, 60), (60, 100)), + (Task::gc(region_id, 0, 10), (0, 10), (10, 100)), + (Task::gc(region_id, 0, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 60), (0, 60), (60, 100)), ]; - for (task, expected_collectd, not_exist_range, exist_range) in tbls { + for (task, not_exist_range, exist_range) in tbls { runner.run(task); runner.flush(); - let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - assert_eq!(res, expected_collectd); raft_log_must_not_exist(&raft_db, 1, not_exist_range.0, not_exist_range.1); raft_log_must_exist(&raft_db, 1, exist_range.0, exist_range.1); } diff --git a/src/server/debug.rs b/src/server/debug.rs index 9445133239f..c16621f4d85 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -766,10 +766,8 @@ impl Debugger { )); let mut lb = raft.log_batch(0); box_try!(lb.put_raft_state(region_id, &new_raft_local_state)); - // Will sync later. 
- box_try!(raft.consume(&mut lb, false)); - let deleted_logs = box_try!(raft.gc(region_id, applied_index + 1, last_index + 1)); - raft.sync().unwrap(); + box_try!(raft.gc(region_id, applied_index + 1, last_index + 1, &mut lb)); + box_try!(raft.consume(&mut lb, true)); kv.sync().unwrap(); info!( @@ -779,7 +777,6 @@ impl Debugger { "new_raft_local_state" => ?new_raft_local_state, "old_raft_apply_state" => ?old_raft_apply_state, "new_raft_apply_state" => ?new_raft_apply_state, - "deleted logs" => deleted_logs, ); } diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index dde25bff636..a6a4a1824f3 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -698,9 +698,9 @@ fn test_snapshot_clean_up_logs_with_unfinished_log_gc() { // Disable default max peer number check. pd_client.disable_default_operator(); cluster.run(); - // Simulate raft log gc are pending in queue. + // Simulate raft log gc tasks are lost during shutdown. let fp = "worker_gc_raft_log"; - fail::cfg(fp, "return(0)").unwrap(); + fail::cfg(fp, "return").unwrap(); let state = cluster.truncated_state(1, 3); for i in 0..30 { diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 0321772661d..1a4ef0b0afc 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -301,9 +301,9 @@ fn test_destroy_clean_up_logs_with_unfinished_log_gc() { // Disable default max peer number check. pd_client.disable_default_operator(); cluster.run(); - // Simulate raft log gc are pending in queue. + // Simulate raft log gc tasks are lost during shutdown. 
let fp = "worker_gc_raft_log"; - fail::cfg(fp, "return(0)").unwrap(); + fail::cfg(fp, "return").unwrap(); let state = cluster.truncated_state(1, 3); for i in 0..30 { From 7be952a6c6655f1fd6e7860d051d061502334e75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 27 Dec 2022 14:46:16 +0800 Subject: [PATCH 431/676] log-backup: applied some change to make better RPO (#13940) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#13941, ref pingcap/tidb#39620 - If failed to get initial snapshot, remove the subscription as soon as possible. - Added a cache of getting checkpoint. This cache is lease-based -- the lease time is simply the tick interval of the coordinator. - Make the channel size huger for don't blocking the main loop when many regions migrating. Signed-off-by: hillium Signed-off-by: hillium Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 15 ++++ components/backup-stream/src/event_loader.rs | 3 +- .../src/metadata/checkpoint_cache.rs | 71 +++++++++++++++++++ .../backup-stream/src/metadata/client.rs | 34 +++++++-- components/backup-stream/src/metadata/mod.rs | 1 + .../backup-stream/src/subscription_manager.rs | 2 +- .../backup-stream/src/subscription_track.rs | 19 ++--- 7 files changed, 129 insertions(+), 16 deletions(-) create mode 100644 components/backup-stream/src/metadata/checkpoint_cache.rs diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 8c3de3d34ce..5cf4292faa3 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -186,6 +186,16 @@ impl CheckpointManager { pub fn add_subscriber(&mut self, sub: Subscription) -> future![Result<()>] { let mgr = self.manager_handle.as_ref().cloned(); + let initial_data = 
self
+            .items
+            .values()
+            .map(|v| FlushEvent {
+                start_key: v.region.start_key.clone(),
+                end_key: v.region.end_key.clone(),
+                checkpoint: v.checkpoint.into_inner(),
+                ..Default::default()
+            })
+            .collect::<Vec<_>>();
 
         // NOTE: we cannot send the real error into the client directly because once
         // we send the subscription into the sink, we cannot fetch it again :(
@@ -208,6 +218,11 @@ impl CheckpointManager {
             mgr.send(SubscriptionOp::Add(sub))
                 .await
                 .map_err(|err| annotate!(err, "failed to send request to subscriber manager"))?;
+            mgr.send(SubscriptionOp::Emit(initial_data))
+                .await
+                .map_err(|err| {
+                    annotate!(err, "failed to send initial data to subscriber manager")
+                })?;
             Ok(())
         }
     }
diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs
index 27c05b5b875..6222f058cd4 100644
--- a/components/backup-stream/src/event_loader.rs
+++ b/components/backup-stream/src/event_loader.rs
@@ -236,7 +236,8 @@ where
     ) -> Result {
         let mut last_err = None;
         for _ in 0..MAX_GET_SNAPSHOT_RETRY {
-            let r = self.observe_over(region, cmd());
+            let c = cmd();
+            let r = self.observe_over(region, c);
             match r {
                 Ok(s) => {
                     return Ok(s);
diff --git a/components/backup-stream/src/metadata/checkpoint_cache.rs b/components/backup-stream/src/metadata/checkpoint_cache.rs
new file mode 100644
index 00000000000..50573d003d8
--- /dev/null
+++ b/components/backup-stream/src/metadata/checkpoint_cache.rs
@@ -0,0 +1,71 @@
+// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
+
+use std::time::Duration;
+
+use tikv_util::time::Instant;
+use txn_types::TimeStamp;
+
+/// The lease time of a checkpoint.
+/// 12s is the default interval of the coordinator tick.
+const CACHE_LEASE_TIME: Duration = Duration::from_secs(12);
+
+pub struct CheckpointCache {
+    last_access: Instant,
+    checkpoint: TimeStamp,
+
+    cache_lease_time: Duration,
+}
+
+impl Default for CheckpointCache {
+    fn default() -> Self {
+        Self {
+            last_access: Instant::now_coarse(),
+            checkpoint: TimeStamp::zero(),
+
+            cache_lease_time: CACHE_LEASE_TIME,
+        }
+    }
+}
+
+impl CheckpointCache {
+    #[cfg(test)]
+    pub fn with_cache_lease(lease: Duration) -> Self {
+        Self {
+            cache_lease_time: lease,
+            ..Self::default()
+        }
+    }
+
+    pub fn update(&mut self, checkpoint: impl Into<TimeStamp>) {
+        self.last_access = Instant::now_coarse();
+        self.checkpoint = self.checkpoint.max(checkpoint.into())
+    }
+
+    pub fn get(&self) -> Option<TimeStamp> {
+        if self.checkpoint.is_zero()
+            || self.last_access.saturating_elapsed() > self.cache_lease_time
+        {
+            return None;
+        }
+        Some(self.checkpoint)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::time::Duration;
+
+    use super::CheckpointCache;
+
+    #[test]
+    fn test_basic() {
+        let mut c = CheckpointCache::with_cache_lease(Duration::from_millis(100));
+        assert_eq!(c.get(), None);
+        c.update(42);
+        assert_eq!(c.get(), Some(42.into()));
+        c.update(41);
+        assert_eq!(c.get(), Some(42.into()));
+        std::thread::sleep(Duration::from_millis(200));
+        assert_eq!(c.get(), None);
+    }
+}
diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs
index 2c0fd2577fc..97e8d2140b5 100644
--- a/components/backup-stream/src/metadata/client.rs
+++ b/components/backup-stream/src/metadata/client.rs
@@ -1,7 +1,8 @@
 // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
-use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path, sync::Arc}; +use dashmap::DashMap; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, @@ -11,6 +12,7 @@ use tokio_stream::StreamExt; use txn_types::TimeStamp; use super::{ + checkpoint_cache::CheckpointCache, keys::{self, KeyValue, MetaKey}, store::{ CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, @@ -26,6 +28,7 @@ use crate::{ #[derive(Clone)] pub struct MetadataClient { store_id: u64, + caches: Arc>, pub(crate) meta_store: Store, } @@ -239,6 +242,7 @@ impl MetadataClient { pub fn new(store: Store, store_id: u64) -> Self { Self { meta_store: store, + caches: Arc::default(), store_id, } } @@ -698,21 +702,41 @@ impl MetadataClient { Ok(min_checkpoint) } + fn cached_checkpoint(&self, task: &str) -> Option { + self.caches + .get(task) + .and_then(|x| x.value().get()) + .map(|x| Checkpoint { + provider: CheckpointProvider::Global, + ts: x, + }) + } + + fn update_cache(&self, task: &str, checkpoint: TimeStamp) { + let mut c = self.caches.entry(task.to_owned()).or_default(); + c.value_mut().update(checkpoint); + } + pub async fn get_region_checkpoint(&self, task: &str, region: &Region) -> Result { + if let Some(c) = self.cached_checkpoint(task) { + return Ok(c); + } let key = MetaKey::next_bakcup_ts_of_region(task, region); let s = self.meta_store.snapshot().await?; let r = s.get(Keys::Key(key.clone())).await?; - match r.len() { + let cp = match r.len() { 0 => { let global_cp = self.global_checkpoint_of(task).await?; let cp = match global_cp { None => self.get_task_start_ts_checkpoint(task).await?, Some(cp) => cp, }; - Ok(cp) + cp } - _ => Ok(Checkpoint::from_kv(&r[0])?), - } + _ => Checkpoint::from_kv(&r[0])?, + }; + self.update_cache(task, cp.ts); + Ok(cp) } } diff --git a/components/backup-stream/src/metadata/mod.rs 
b/components/backup-stream/src/metadata/mod.rs
index a616ace2dc6..20887a24b02 100644
--- a/components/backup-stream/src/metadata/mod.rs
+++ b/components/backup-stream/src/metadata/mod.rs
@@ -1,5 +1,6 @@
 // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0.
 
+mod checkpoint_cache;
 mod client;
 pub mod keys;
 mod metrics;
diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs
index 624392f3df8..91b4c096e7d 100644
--- a/components/backup-stream/src/subscription_manager.rs
+++ b/components/backup-stream/src/subscription_manager.rs
@@ -281,7 +281,7 @@ impl ScanPoolHandle {
 }
 
 /// The default channel size.
-const MESSAGE_BUFFER_SIZE: usize = 4096;
+const MESSAGE_BUFFER_SIZE: usize = 32768;
 
 /// The operator for region subscription.
 /// It makes a queue for operations over the `SubscriptionTracer`, generally,
diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs
index 6b51f983a3b..a24076661bb 100644
--- a/components/backup-stream/src/subscription_track.rs
+++ b/components/backup-stream/src/subscription_track.rs
@@ -2,7 +2,10 @@
 
 use std::{sync::Arc, time::Duration};
 
-use dashmap::{mapref::one::RefMut, DashMap};
+use dashmap::{
+    mapref::{entry::Entry, one::RefMut},
+    DashMap,
+};
 use kvproto::metapb::Region;
 use raftstore::coprocessor::*;
 use resolved_ts::Resolver;
@@ -185,21 +188,19 @@ impl SubscriptionTracer {
         if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool,
     ) -> bool {
         let region_id = region.get_id();
-        let remove_result = self.0.remove(&region_id);
+        let remove_result = self.0.entry(region_id);
         match remove_result {
-            Some((_, mut v)) => {
-                if if_cond(&v, region) {
+            Entry::Occupied(mut x) => {
+                if if_cond(x.get(), region) {
                     TRACK_REGION.dec();
-                    v.stop();
+                    x.get_mut().stop();
+                    let v = x.remove();
                     info!("stop listen stream from store"; "observer" => ?v, "region_id"=> %region_id);
                     return true;
                 }
                 false
             }
-            None => {
-
debug!("trying to deregister region not registered"; "region_id" => %region_id); - false - } + Entry::Vacant(_) => false, } } From c3903b81ff42102130f05dc2d8d6debd49cafc1a Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 27 Dec 2022 16:24:16 +0800 Subject: [PATCH 432/676] server: improve the ergonomics of sharing things between tablets (#13984) ref tikv/tikv#12842 Signed-off-by: tabokie --- Cargo.lock | 6 +- cmd/tikv-ctl/src/executor.rs | 3 +- cmd/tikv-ctl/src/main.rs | 14 +- components/engine_rocks/src/db_options.rs | 22 +- components/engine_rocks/src/raw.rs | 12 +- components/engine_rocks/src/rocks_metrics.rs | 7 +- .../engine_rocks/src/rocks_metrics_defs.rs | 5 +- components/engine_rocks/src/write_batch.rs | 11 +- components/server/src/raft_engine_switch.rs | 2 +- components/server/src/server.rs | 7 +- components/server/src/server2.rs | 7 +- components/snap_recovery/src/init_cluster.rs | 3 +- etc/config-template.toml | 11 + src/config/mod.rs | 237 ++++++++++++++---- src/server/engine_factory.rs | 22 +- src/storage/kv/test_engine_builder.rs | 10 +- src/storage/mod.rs | 11 +- tests/integrations/config/mod.rs | 8 + tests/integrations/config/test-custom.toml | 6 + tests/integrations/storage/test_titan.rs | 13 +- 20 files changed, 292 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f7ca0b8c7b..4c510da6d77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2876,7 +2876,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" +source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2895,7 +2895,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" +source = 
"git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ "bzip2-sys", "cc", @@ -4761,7 +4761,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#0ef7101a061c513c684ad68acd15f01c8548b43a" +source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ "libc 0.2.132", "librocksdb_sys", diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 42b08c629e7..94610face44 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -76,8 +76,7 @@ pub fn new_debug_executor( let cfg_controller = ConfigController::default(); if !cfg.raft_engine.enable { - let mut raft_db_opts = cfg.raftdb.build_opt(); - raft_db_opts.set_env(env); + let raft_db_opts = cfg.raftdb.build_opt(env, None); let raft_db_cf_opts = cfg.raftdb.build_cf_opts(factory.block_cache()); let raft_path = cfg.infer_raft_db_path(Some(data_dir)).unwrap(); if !db_exist(&raft_path) { diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 77888f36fa7..30cd7035bef 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -681,20 +681,20 @@ fn read_fail_file(path: &str) -> Vec<(String, String)> { list } -fn run_ldb_command(args: Vec, cfg: &TikvConfig) { +fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - let mut opts = cfg.rocksdb.build_opt(None); - opts.set_env(env); + cfg.rocksdb.build_opt(&cfg.rocksdb.build_resources(env)) +} - engine_rocks::raw::run_ldb_tool(&args, &opts); +fn run_ldb_command(args: Vec, cfg: &TikvConfig) { + engine_rocks::raw::run_ldb_tool(&args, &build_rocks_opts(cfg)); } fn run_sst_dump_command(args: Vec, cfg: &TikvConfig) { - let opts = 
cfg.rocksdb.build_opt(None); - engine_rocks::raw::run_sst_dump_tool(&args, &opts); + engine_rocks::raw::run_sst_dump_tool(&args, &build_rocks_opts(cfg)); } fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TikvConfig) { @@ -713,7 +713,7 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, let stderr = BufferRedirect::stderr().unwrap(); let stdout = BufferRedirect::stdout().unwrap(); - let opts = cfg.rocksdb.build_opt(None); + let opts = build_rocks_opts(cfg); match run_and_wait_child_process(|| engine_rocks::raw::run_sst_dump_tool(&args, &opts)) { Ok(code) => { diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index f4044c44449..f437cc7b433 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -66,23 +66,29 @@ impl DbOptions for RocksDbOptions { } fn get_rate_bytes_per_sec(&self) -> Option { - self.0.get_rate_bytes_per_sec() + self.0.get_rate_limiter().map(|r| r.get_bytes_per_second()) } fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()> { - self.0 - .set_rate_bytes_per_sec(rate_bytes_per_sec) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_bytes_per_second(rate_bytes_per_sec); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn get_rate_limiter_auto_tuned(&self) -> Option { - self.0.get_auto_tuned() + self.0.get_rate_limiter().map(|r| r.get_auto_tuned()) } fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { - self.0 - .set_auto_tuned(rate_limiter_auto_tuned) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_auto_tuned(rate_limiter_auto_tuned); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { diff --git a/components/engine_rocks/src/raw.rs 
b/components/engine_rocks/src/raw.rs index 4c2dd71b2a2..e940fdd2cd7 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -10,10 +10,10 @@ pub use rocksdb::{ new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, ChecksumType, CompactOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - CompactionJobInfo, CompactionOptions, CompactionPriority, DBBottommostLevelCompaction, - DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBRateLimiterMode, - DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, Env, EventListener, - IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, - PrepopulateBlockCache, Range, SliceTransform, Statistics, TablePropertiesCollector, - TablePropertiesCollectorFactory, + CompactionJobInfo, CompactionOptions, CompactionPriority, ConcurrentTaskLimiter, + DBBottommostLevelCompaction, DBCompactionFilter, DBCompactionStyle, DBCompressionType, + DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, + Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, + PrepopulateBlockCache, Range, RateLimiter, SliceTransform, Statistics, + TablePropertiesCollector, TablePropertiesCollectorFactory, WriteBufferManager, }; diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 24ac9eee0b4..522696cb150 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -582,12 +582,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .discardable .inc_by(value); } - TickerType::TitanGcSample => { - STORE_ENGINE_BLOB_GC_ACTION - .get(name_enum) - .sample - .inc_by(value); - } TickerType::TitanGcSmallFile => { STORE_ENGINE_BLOB_GC_ACTION .get(name_enum) @@ -612,6 +606,7 @@ 
pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .trigger_next .inc_by(value); } + // TODO: Some tickers are ignored. _ => {} } } diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index fc23871b90f..042949f1c09 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -138,8 +138,11 @@ pub const TITAN_ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::TitanGcNoNeed, TickerType::TitanGcRemain, TickerType::TitanGcDiscardable, - TickerType::TitanGcSample, TickerType::TitanGcSmallFile, + TickerType::TitanGcLevelMergeMark, + TickerType::TitanGcLevelMergeDelete, + TickerType::TitanGcNoNeed, + TickerType::TitanGcRemain, TickerType::TitanGcFailure, TickerType::TitanGcSuccess, TickerType::TitanGcTriggerNext, diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 6171ca7ee38..a46edfb0a4a 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -101,15 +101,18 @@ impl RocksWriteBatchVec { impl engine_traits::WriteBatch for RocksWriteBatchVec { fn write_opt(&mut self, opts: &WriteOptions) -> Result { let opt: RocksWriteOptions = opts.into(); + let mut seq = 0; if self.support_write_batch_vec { + // FIXME(tabokie): Callback for empty write batch won't be called. 
self.get_db() - .multi_batch_write(self.as_inner(), &opt.into_raw()) - .map_err(r2e) + .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| seq = s) + .map_err(r2e)?; } else { self.get_db() - .write_seq_opt(&self.wbs[0], &opt.into_raw()) - .map_err(r2e) + .write_callback(&self.wbs[0], &opt.into_raw(), |s| seq = s) + .map_err(r2e)?; } + Ok(seq) } fn data_size(&self) -> usize { diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index ba489f1be0f..d0637a04b0a 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -251,7 +251,7 @@ mod tests { // Prepare some data for the RocksEngine. let raftdb = engine_rocks::util::new_engine_opt( &cfg.raft_store.raftdb_path, - cfg.raftdb.build_opt(), + cfg.raftdb.build_opt(Default::default(), None), cfg.raftdb.build_cf_opts(&cache), ) .unwrap(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 73b42d96d22..3c926969ce2 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1731,10 +1731,8 @@ impl ConfiguredRaftEngine for RocksEngine { let raft_db_path = &config.raft_store.raftdb_path; let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); let statistics = Arc::new(RocksStatistics::new_titan()); - raft_db_opts.set_statistics(statistics.as_ref()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) .expect("failed to open raftdb"); @@ -1784,8 +1782,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { if should_dump { let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); let 
raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt( &config.raft_store.raftdb_path, diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 7f81d931181..5beddf60151 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1323,10 +1323,8 @@ impl ConfiguredRaftEngine for RocksEngine { let raft_db_path = &config.raft_store.raftdb_path; let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); let statistics = Arc::new(RocksStatistics::new_titan()); - raft_db_opts.set_statistics(statistics.as_ref()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) .expect("failed to open raftdb"); @@ -1376,8 +1374,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { if should_dump { let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt( &config.raft_store.raftdb_path, diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index 42c1d0b1882..e7818b3f888 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -328,8 +328,7 @@ pub fn create_local_engine_service( // init raft engine, either is rocksdb or raft engine if !config.raft_engine.enable { // rocksdb - let mut raft_db_opts = config.raftdb.build_opt(); - raft_db_opts.set_env(env); + let raft_db_opts = config.raftdb.build_opt(env, None); let raft_db_cf_opts = config.raftdb.build_cf_opts(factory.block_cache()); let raft_path = config 
.infer_raft_db_path(None) diff --git a/etc/config-template.toml b/etc/config-template.toml index 8820d2e0675..62623afed0e 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -626,6 +626,11 @@ ## RocksDB log levels # info-log-level = "info" +## Memory usage limit for Raft Engine. Undersized write buffers will be flushed to satisfy the +## requirement. +## No limit when not specified. +# write-buffer-limit = "1GB" + ## Options for `Titan`. [rocksdb.titan] ## Enables or disables `Titan`. Note that Titan is still an experimental feature. Once @@ -848,6 +853,9 @@ ## # checksum = "crc32c" +## The maximum number of concurrent compaction tasks. 0 stands for no limit. +# max-compactions = 0 + ## Options for "Default" Column Family for `Titan`. [rocksdb.defaultcf.titan] ## The smallest value to store in blob files. Value smaller than @@ -935,6 +943,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [rocksdb.lockcf] # compression-per-level = ["no", "no", "no", "no", "no", "no", "no"] @@ -959,6 +968,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [raftdb] # max-background-jobs = 4 @@ -1020,6 +1030,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [raft-engine] ## Determines whether to use Raft Engine to store raft logs. 
When it is diff --git a/src/config/mod.rs b/src/config/mod.rs index 2074c992519..8886711f948 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -29,8 +29,9 @@ use engine_rocks::{ get_env, properties::MvccPropertiesCollectorFactory, raw::{ - BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, - DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, PrepopulateBlockCache, + BlockBasedOptions, Cache, ChecksumType, CompactionPriority, ConcurrentTaskLimiter, + DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, + PrepopulateBlockCache, RateLimiter, WriteBufferManager, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, @@ -361,6 +362,8 @@ macro_rules! cf_config { #[serde(with = "rocks_config::checksum_serde")] #[online_config(skip)] pub checksum: ChecksumType, + #[online_config(skip)] + pub max_compactions: u32, #[online_config(submodule)] pub titan: TitanCfConfig, } @@ -514,7 +517,13 @@ macro_rules! write_into_metrics { } macro_rules! build_cf_opt { - ($opt:ident, $cf_name:ident, $cache:expr, $region_info_provider:ident) => {{ + ( + $opt:ident, + $cf_name:ident, + $cache:expr, + $compaction_limiter:expr, + $region_info_provider:ident + ) => {{ let mut block_base_opts = BlockBasedOptions::new(); block_base_opts.set_block_size($opt.block_size.0 as usize); block_base_opts.set_no_block_cache($opt.disable_block_cache); @@ -599,10 +608,18 @@ macro_rules! 
build_cf_opt { warn!("compaction guard is disabled due to region info provider not available") } } + if let Some(r) = $compaction_limiter { + cf_opts.set_compaction_thread_limiter(r); + } cf_opts }}; } +pub struct CfResources { + pub cache: Cache, + pub compaction_thread_limiters: HashMap<&'static str, ConcurrentTaskLimiter>, +} + cf_config!(DefaultCfConfig); impl Default for DefaultCfConfig { @@ -661,6 +678,7 @@ impl Default for DefaultCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan: TitanCfConfig::default(), } } @@ -669,12 +687,18 @@ impl Default for DefaultCfConfig { impl DefaultCfConfig { pub fn build_opt( &self, - cache: &Cache, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, for_engine: EngineType, ) -> RocksCfOptions { - let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_DEFAULT, + &shared.cache, + shared.compaction_thread_limiters.get(CF_DEFAULT), + region_info_accessor + ); cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); let f = RangePropertiesCollectorFactory { prop_size_index_distance: self.prop_size_index_distance, @@ -779,6 +803,7 @@ impl Default for WriteCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } @@ -787,11 +812,17 @@ impl Default for WriteCfConfig { impl WriteCfConfig { pub fn build_opt( &self, - cache: &Cache, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, for_engine: EngineType, ) -> RocksCfOptions { - let mut cf_opts = build_cf_opt!(self, CF_WRITE, cache, region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_WRITE, + &shared.cache, + shared.compaction_thread_limiters.get(CF_WRITE), + region_info_accessor + ); // Prefix extractor(trim the 
timestamp at tail) for write cf. cf_opts .set_prefix_extractor( @@ -878,15 +909,22 @@ impl Default for LockCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } } impl LockCfConfig { - pub fn build_opt(&self, cache: &Cache, for_engine: EngineType) -> RocksCfOptions { + pub fn build_opt(&self, shared: &CfResources, for_engine: EngineType) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_LOCK, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_LOCK, + &shared.cache, + shared.compaction_thread_limiters.get(CF_LOCK), + no_region_info_accessor + ); cf_opts .set_prefix_extractor("NoopSliceTransform", NoopSliceTransform) .unwrap(); @@ -954,15 +992,22 @@ impl Default for RaftCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } } impl RaftCfConfig { - pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { + pub fn build_opt(&self, shared: &CfResources) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_RAFT, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_RAFT, + &shared.cache, + shared.compaction_thread_limiters.get(CF_RAFT), + no_region_info_accessor + ); cf_opts .set_prefix_extractor("NoopSliceTransform", NoopSliceTransform) .unwrap(); @@ -1078,6 +1123,16 @@ pub struct DbConfig { pub enable_unordered_write: bool, #[online_config(skip)] pub allow_concurrent_memtable_write: Option, + #[online_config(skip)] + pub write_buffer_limit: Option, + #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] + pub write_buffer_stall_ratio: f32, + #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] + pub write_buffer_flush_oldest_first: bool, 
// Dangerous option only for programming use. #[online_config(skip)] #[serde(skip)] @@ -1094,6 +1149,15 @@ pub struct DbConfig { pub titan: TitanDbConfig, } +#[derive(Clone)] +pub struct DbResources { + // DB Options. + pub env: Arc, + pub statistics: Arc, + pub rate_limiter: Option>, + pub write_buffer_manager: Option>, +} + impl Default for DbConfig { fn default() -> DbConfig { let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); @@ -1134,6 +1198,9 @@ impl Default for DbConfig { enable_multi_batch_write: None, // deprecated enable_unordered_write: false, allow_concurrent_memtable_write: None, + write_buffer_limit: None, + write_buffer_stall_ratio: 0.0, + write_buffer_flush_oldest_first: false, paranoid_checks: None, defaultcf: DefaultCfConfig::default(), writecf: WriteCfConfig::default(), @@ -1157,7 +1224,33 @@ impl DbConfig { } } - pub fn build_opt(&self, stats: Option<&RocksStatistics>) -> RocksDbOptions { + pub fn build_resources(&self, env: Arc) -> DbResources { + let rate_limiter = if self.rate_bytes_per_sec.0 > 0 { + Some(Arc::new(RateLimiter::new_writeampbased_with_auto_tuned( + self.rate_bytes_per_sec.0 as i64, + (self.rate_limiter_refill_period.as_millis() * 1000) as i64, + 10, // fairness + self.rate_limiter_mode, + self.rate_limiter_auto_tuned, + ))) + } else { + None + }; + DbResources { + env, + statistics: Arc::new(RocksStatistics::new_titan()), + rate_limiter, + write_buffer_manager: self.write_buffer_limit.map(|limit| { + Arc::new(WriteBufferManager::new( + limit.0 as usize, + self.write_buffer_stall_ratio, + self.write_buffer_flush_oldest_first, + )) + }), + } + } + + pub fn build_opt(&self, shared: &DbResources) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1173,33 +1266,11 @@ impl DbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); 
opts.set_max_open_files(self.max_open_files); - match stats { - Some(stats) => opts.set_statistics(stats), - None => opts.set_statistics(&RocksStatistics::new_titan()), - } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); opts.set_log_file_time_to_roll(self.info_log_roll_time.as_secs()); opts.set_keep_log_file_num(self.info_log_keep_log_file_num); - if self.rate_bytes_per_sec.0 > 0 { - if self.rate_limiter_auto_tuned { - opts.set_writeampbasedratelimiter_with_auto_tuned( - self.rate_bytes_per_sec.0 as i64, - (self.rate_limiter_refill_period.as_millis() * 1000) as i64, - self.rate_limiter_mode, - self.rate_limiter_auto_tuned, - ); - } else { - opts.set_ratelimiter_with_auto_tuned( - self.rate_bytes_per_sec.0 as i64, - (self.rate_limiter_refill_period.as_millis() * 1000) as i64, - self.rate_limiter_mode, - self.rate_limiter_auto_tuned, - ); - } - } - opts.set_bytes_per_sync(self.bytes_per_sync.0); opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); opts.set_max_subcompactions(self.max_sub_compactions); @@ -1226,12 +1297,52 @@ impl DbConfig { if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); } + opts.set_env(shared.env.clone()); + opts.set_statistics(&shared.statistics); + if let Some(r) = &shared.rate_limiter { + opts.set_rate_limiter(r); + } + if let Some(r) = &shared.write_buffer_manager { + opts.set_write_buffer_manager(r); + } opts } + pub fn build_cf_resources(&self, cache: Cache) -> CfResources { + let mut compaction_thread_limiters = HashMap::new(); + if self.defaultcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_DEFAULT, + ConcurrentTaskLimiter::new(CF_DEFAULT, self.defaultcf.max_compactions), + ); + } + if self.writecf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_WRITE, + ConcurrentTaskLimiter::new(CF_WRITE, self.writecf.max_compactions), + ); 
+ } + if self.lockcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_LOCK, + ConcurrentTaskLimiter::new(CF_LOCK, self.lockcf.max_compactions), + ); + } + if self.raftcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_RAFT, + ConcurrentTaskLimiter::new(CF_RAFT, self.raftcf.max_compactions), + ); + } + CfResources { + cache, + compaction_thread_limiters, + } + } + pub fn build_cf_opts( &self, - cache: &Cache, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, for_engine: EngineType, @@ -1240,16 +1351,16 @@ impl DbConfig { cf_opts.push(( CF_DEFAULT, self.defaultcf - .build_opt(cache, region_info_accessor, api_version, for_engine), + .build_opt(shared, region_info_accessor, api_version, for_engine), )); - cf_opts.push((CF_LOCK, self.lockcf.build_opt(cache, for_engine))); + cf_opts.push((CF_LOCK, self.lockcf.build_opt(shared, for_engine))); cf_opts.push(( CF_WRITE, self.writecf - .build_opt(cache, region_info_accessor, for_engine), + .build_opt(shared, region_info_accessor, for_engine), )); if for_engine == EngineType::RaftKv { - cf_opts.push((CF_RAFT, self.raftcf.build_opt(cache))); + cf_opts.push((CF_RAFT, self.raftcf.build_opt(shared))); } cf_opts } @@ -1367,6 +1478,7 @@ impl Default for RaftDefaultCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan: TitanCfConfig::default(), } } @@ -1374,8 +1486,19 @@ impl Default for RaftDefaultCfConfig { impl RaftDefaultCfConfig { pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { + let limiter = if self.max_compactions > 0 { + Some(ConcurrentTaskLimiter::new(CF_DEFAULT, self.max_compactions)) + } else { + None + }; let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_DEFAULT, + cache, + limiter.as_ref(), + 
no_region_info_accessor + ); let f = FixedPrefixSliceTransform::new(region_raft_prefix_len()); cf_opts .set_memtable_insert_hint_prefix_extractor("RaftPrefixSliceTransform", f) @@ -1488,7 +1611,7 @@ impl Default for RaftDbConfig { } impl RaftDbConfig { - pub fn build_opt(&self) -> RocksDbOptions { + pub fn build_opt(&self, env: Arc, statistics: Option<&RocksStatistics>) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1503,7 +1626,10 @@ impl RaftDbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - opts.set_statistics(&RocksStatistics::new_titan()); + match statistics { + Some(s) => opts.set_statistics(s), + None => opts.set_statistics(&RocksStatistics::new_titan()), + } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); @@ -1526,7 +1652,7 @@ impl RaftDbConfig { if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); } - + opts.set_env(env); opts } @@ -4273,6 +4399,15 @@ mod tests { tikv_cfg.validate().unwrap(); } + #[test] + fn test_rocks_rate_limit_zero() { + let mut tikv_cfg = TikvConfig::default(); + tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); + tikv_cfg + .rocksdb + .build_opt(&tikv_cfg.rocksdb.build_resources(Arc::new(Env::default()))); + } + #[test] fn test_parse_log_level() { #[derive(Serialize, Deserialize, Debug)] @@ -4433,9 +4568,13 @@ mod tests { assert_eq!(F::TAG, cfg.storage.api_version()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some(cfg.rocksdb.build_opt(None)), + Some( + cfg.rocksdb + .build_opt(&cfg.rocksdb.build_resources(Arc::new(Env::default()))), + ), cfg.rocksdb.build_cf_opts( - &cfg.storage.block_cache.build_shared_cache(), + 
&cfg.rocksdb + .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), None, cfg.storage.api_version(), cfg.storage.engine, @@ -5051,6 +5190,7 @@ mod tests { #[test] fn test_compaction_guard() { let cache = Cache::new_lru_cache(LRUCacheOptions::new()); + let no_limiter: Option = None; // Test comopaction guard disabled. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), @@ -5058,7 +5198,7 @@ mod tests { ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); assert_eq!( config.target_file_size_base.0, cf_opts.get_target_file_size_base() @@ -5071,7 +5211,7 @@ mod tests { ..Default::default() }; let provider: Option = None; - let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); assert_eq!( config.target_file_size_base.0, cf_opts.get_target_file_size_base() @@ -5086,7 +5226,7 @@ mod tests { ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, provider); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); assert_eq!( config.compaction_guard_max_output_file_size.0, cf_opts.get_target_file_size_base() @@ -5391,6 +5531,7 @@ mod tests { cfg.memory_usage_limit = None; cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. 
+ cfg.rocksdb.write_buffer_limit = None; cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 2c31c9522b1..91b5178f8a0 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -16,19 +16,18 @@ use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; use crate::{ - config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}, + config::{CfResources, DbConfig, DbResources, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}, storage::config::EngineType, }; struct FactoryInner { - env: Arc, region_info_accessor: Option, - block_cache: Cache, rocksdb_config: Arc, api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, - statistics: Arc, + db_resources: DbResources, + cf_resources: CfResources, state_storage: Option>, lite: bool, } @@ -40,17 +39,15 @@ pub struct KvEngineFactoryBuilder { impl KvEngineFactoryBuilder { pub fn new(env: Arc, config: &TikvConfig, cache: Cache) -> Self { - let statistics = Arc::new(RocksStatistics::new_titan()); Self { inner: FactoryInner { - env, region_info_accessor: None, - block_cache: cache, rocksdb_config: Arc::new(config.rocksdb.clone()), api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, - statistics, + db_resources: config.rocksdb.build_resources(env), + cf_resources: config.rocksdb.build_cf_resources(cache), state_storage: None, lite: false, }, @@ -134,7 +131,7 @@ impl KvEngineFactory { } pub fn rocks_statistics(&self) -> Arc { - self.inner.statistics.clone() + self.inner.db_resources.statistics.clone() } fn db_opts(&self) -> RocksDbOptions { @@ -142,8 +139,7 @@ impl KvEngineFactory { let mut db_opts = self .inner .rocksdb_config - .build_opt(Some(self.inner.statistics.as_ref())); - db_opts.set_env(self.inner.env.clone()); + 
.build_opt(&self.inner.db_resources); if !self.inner.lite { db_opts.add_event_listener(RocksEventListener::new( "kv", @@ -158,7 +154,7 @@ impl KvEngineFactory { fn cf_opts(&self, for_engine: EngineType) -> Vec<(&str, RocksCfOptions)> { self.inner.rocksdb_config.build_cf_opts( - &self.inner.block_cache, + &self.inner.cf_resources, self.inner.region_info_accessor.as_ref(), self.inner.api_version, for_engine, @@ -166,7 +162,7 @@ impl KvEngineFactory { } pub fn block_cache(&self) -> &Cache { - &self.inner.block_cache + &self.inner.cf_resources.cache } /// Create a shared db. diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index 12a7776e434..d15a33742ba 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -96,7 +96,7 @@ impl TestEngineBuilder { if !enable_block_cache { cache_opt.capacity = Some(ReadableSize::kb(0)); } - let cache = cache_opt.build_shared_cache(); + let shared = cfg_rocksdb.build_cf_resources(cache_opt.build_shared_cache()); let cfs_opts = cfs .iter() .map(|cf| match *cf { @@ -104,19 +104,19 @@ impl TestEngineBuilder { CF_DEFAULT, cfg_rocksdb .defaultcf - .build_opt(&cache, None, api_version, EngineType::RaftKv), + .build_opt(&shared, None, api_version, EngineType::RaftKv), ), CF_LOCK => ( CF_LOCK, - cfg_rocksdb.lockcf.build_opt(&cache, EngineType::RaftKv), + cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), ), CF_WRITE => ( CF_WRITE, cfg_rocksdb .writecf - .build_opt(&cache, None, EngineType::RaftKv), + .build_opt(&shared, None, EngineType::RaftKv), ), - CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), _ => (*cf, RocksCfOptions::default()), }) .collect(); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 60e9b965c5d..802b0507849 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4136,12 +4136,13 @@ mod tests { let engine = { let path = "".to_owned(); let cfg_rocksdb 
= db_config; - let cache = BlockCacheConfig::default().build_shared_cache(); + let shared = + cfg_rocksdb.build_cf_resources(BlockCacheConfig::default().build_shared_cache()); let cfs_opts = vec![ ( CF_DEFAULT, cfg_rocksdb.defaultcf.build_opt( - &cache, + &shared, None, ApiVersion::V1, EngineType::RaftKv, @@ -4149,15 +4150,15 @@ mod tests { ), ( CF_LOCK, - cfg_rocksdb.lockcf.build_opt(&cache, EngineType::RaftKv), + cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), ), ( CF_WRITE, cfg_rocksdb .writecf - .build_opt(&cache, None, EngineType::RaftKv), + .build_opt(&shared, None, EngineType::RaftKv), ), - (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), ]; RocksEngine::new( &path, None, cfs_opts, None, // io_rate_limiter diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 4e22463503a..c6f8e565218 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -312,6 +312,9 @@ fn test_serde_custom_tikv_config() { paranoid_checks: None, allow_concurrent_memtable_write: Some(false), enable_unordered_write: true, + write_buffer_limit: Some(ReadableSize::gb(1)), + write_buffer_stall_ratio: 0.0, + write_buffer_flush_oldest_first: false, defaultcf: DefaultCfConfig { block_size: ReadableSize::kb(12), block_cache_size: ReadableSize::gb(12), @@ -365,6 +368,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, writecf: WriteCfConfig { block_size: ReadableSize::kb(12), @@ -433,6 +437,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, lockcf: LockCfConfig { block_size: ReadableSize::kb(12), @@ -501,6 +506,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, 
checksum: ChecksumType::XXH3, + max_compactions: 3, }, raftcf: RaftCfConfig { block_size: ReadableSize::kb(12), @@ -569,6 +575,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, titan: titan_db_config.clone(), }; @@ -652,6 +659,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, titan: titan_db_config, }; diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 900e1c36aa6..b096437e60c 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -271,6 +271,7 @@ enable-pipelined-write = false enable-multi-batch-write = true enable-unordered-write = true allow-concurrent-memtable-write = false +write-buffer-limit = "1GB" [rocksdb.titan] enabled = true @@ -331,6 +332,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.defaultcf.titan] min-blob-size = "2018B" @@ -393,6 +395,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.lockcf] block-size = "12KB" @@ -442,6 +445,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.raftcf] block-size = "12KB" @@ -491,6 +495,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [raftdb] wal-recovery-mode = "skip-any-corrupted-records" @@ -572,6 +577,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" 
+max-compactions = 3 [raftdb.defaultcf.titan] min-blob-size = "2018B" diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 412f9f5a777..452bcc89238 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,10 +159,15 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg.rocksdb.build_opt(None); - let kv_cfs_opts = - cfg.rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version(), cfg.storage.engine); + let kv_db_opts = cfg + .rocksdb + .build_opt(&cfg.rocksdb.build_resources(Default::default())); + let kv_cfs_opts = cfg.rocksdb.build_cf_opts( + &cfg.rocksdb.build_cf_resources(cache), + None, + cfg.storage.api_version(), + cfg.storage.engine, + ); let raft_path = path.path().join(Path::new("titan")); let engines = Engines::new( From 77c21995488a702a2276c1f1b472bb68c8b85bc4 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 27 Dec 2022 20:16:16 +0800 Subject: [PATCH 433/676] raftstore-v2: gc tablets (#13974) ref tikv/tikv#12842 Signed-off-by: tabokie Signed-off-by: Xinye Tao --- components/engine_panic/src/compact.rs | 11 +- components/engine_panic/src/misc.rs | 8 + components/engine_rocks/src/compact.rs | 14 +- components/engine_rocks/src/file_system.rs | 2 +- components/engine_rocks/src/misc.rs | 12 +- .../engine_rocks_helper/src/sst_recovery.rs | 3 +- components/engine_traits/src/compact.rs | 26 +- components/engine_traits/src/misc.rs | 4 + components/engine_traits/src/tablet.rs | 24 +- components/raftstore-v2/src/batch/store.rs | 11 +- .../operation/command/admin/compact_log.rs | 13 +- .../src/operation/command/admin/split.rs | 19 ++ .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../src/operation/ready/snapshot.rs | 3 +- components/raftstore-v2/src/raft/peer.rs 
| 47 +++- components/raftstore-v2/src/raft/storage.rs | 2 +- components/raftstore-v2/src/worker/mod.rs | 1 + .../raftstore-v2/src/worker/tablet_gc.rs | 227 ++++++++++++++++++ .../raftstore/src/store/async_io/write.rs | 8 +- .../raftstore/src/store/compaction_guard.rs | 2 +- .../raftstore/src/store/worker/compact.rs | 2 +- components/test_raftstore/src/cluster.rs | 3 +- src/config/mod.rs | 6 + src/storage/mvcc/reader/reader.rs | 2 +- 24 files changed, 393 insertions(+), 59 deletions(-) create mode 100644 components/raftstore-v2/src/worker/tablet_gc.rs diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index f1e78d57010..988bec790de 100644 --- a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -13,7 +13,7 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_range( + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -24,15 +24,6 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - panic!() - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 561d2892ca9..5e6fbe87267 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -64,10 +64,18 @@ impl MiscExt for PanicEngine { panic!() } + fn pause_background_work(&self) -> Result<()> { + panic!() + } + fn exists(path: &str) -> bool { panic!() } + fn locked(path: &str) -> Result { + panic!() + } + fn dump_stats(&self) -> Result { panic!() } diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index b9e3e5fe558..199b7d9f3be 100644 --- a/components/engine_rocks/src/compact.rs +++ b/components/engine_rocks/src/compact.rs @@ -24,7 +24,7 @@ impl CompactExt for RocksEngine { Ok(false) } - fn compact_range( + fn compact_range_cf( &self, cf: 
&str, start_key: Option<&[u8]>, @@ -43,18 +43,6 @@ impl CompactExt for RocksEngine { Ok(()) } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - for cf_name in self.cf_names() { - self.compact_files_in_range_cf(cf_name, start, end, output_level)?; - } - Ok(()) - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index 5fc0ed7f6e2..b470237f313 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -92,7 +92,7 @@ mod tests { assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); - db.compact_range( + db.compact_range_cf( CF_DEFAULT, None, // start_key None, // end_key false, // exclusive_manual diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 75b193bdcf9..55546869272 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -273,10 +273,20 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn pause_background_work(&self) -> Result<()> { + self.as_inner().pause_bg_work(); + Ok(()) + } + fn exists(path: &str) -> bool { crate::util::db_exist(path) } + fn locked(path: &str) -> Result { + let env = rocksdb::Env::default(); + env.is_db_locked(path).map_err(r2e) + } + fn dump_stats(&self) -> Result { const ROCKSDB_DB_STATS_KEY: &str = "rocksdb.dbstats"; const ROCKSDB_CF_STATS_KEY: &str = "rocksdb.cfstats"; @@ -659,7 +669,7 @@ mod tests { ]; assert_eq!(sst_range, expected); - db.compact_range(cf, None, None, false, 1).unwrap(); + db.compact_range_cf(cf, None, None, false, 1).unwrap(); let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); assert_eq!(sst_range.len(), 0); let sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); 
diff --git a/components/engine_rocks_helper/src/sst_recovery.rs b/components/engine_rocks_helper/src/sst_recovery.rs index 7a820e6a79b..85fb8d74bee 100644 --- a/components/engine_rocks_helper/src/sst_recovery.rs +++ b/components/engine_rocks_helper/src/sst_recovery.rs @@ -227,7 +227,8 @@ mod tests { db.put(b"z2", b"val").unwrap(); db.put(b"z7", b"val").unwrap(); // generate SST file. - db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); + db.compact_range_cf(CF_DEFAULT, None, None, false, 1) + .unwrap(); let files = db.as_inner().get_live_files(); assert_eq!(files.get_smallestkey(0), b"z2"); diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index 8dd1cc7d9b4..05590a1ff32 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -4,17 +4,30 @@ use std::collections::BTreeMap; -use crate::errors::Result; +use crate::{errors::Result, CfNamesExt}; -pub trait CompactExt { +pub trait CompactExt: CfNamesExt { type CompactedEvent: CompactedEvent; /// Checks whether any column family sets `disable_auto_compactions` to /// `True` or not. fn auto_compactions_is_disabled(&self) -> Result; - /// Compacts the column families in the specified range by manual or not. fn compact_range( + &self, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + exclusive_manual: bool, + max_subcompactions: u32, + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_range_cf(cf, start_key, end_key, exclusive_manual, max_subcompactions)?; + } + Ok(()) + } + + /// Compacts the column families in the specified range by manual or not. 
+ fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -32,7 +45,12 @@ pub trait CompactExt { start: Option<&[u8]>, end: Option<&[u8]>, output_level: Option, - ) -> Result<()>; + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_files_in_range_cf(cf, start, end, output_level)?; + } + Ok(()) + } /// Compacts files in the range and above the output level of the given /// column family. Compacts all files to the bottommost level if the diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index a7679256f21..d9a07a1a915 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -97,9 +97,13 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn sync_wal(&self) -> Result<()>; + fn pause_background_work(&self) -> Result<()>; + /// Check whether a database exists at a given path fn exists(path: &str) -> bool; + fn locked(path: &str) -> Result; + /// Dump stats about the database into a string. /// /// For debugging. The format and content is unspecified. diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 46b020cf138..64459bbc7ee 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -69,6 +69,19 @@ impl CachedTablet { } self.cache() } + + /// Returns how many versions has passed. + #[inline] + pub fn refresh(&mut self) -> u64 { + let old_version = self.version; + if self.latest.version.load(Ordering::Relaxed) > old_version { + let latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = latest_data.clone(); + return self.version - old_version; + } + 0 + } } /// Context to be passed to `TabletFactory`. 
@@ -178,7 +191,6 @@ impl TabletFactory for SingletonFactory { struct TabletRegistryInner { // region_id, suffix -> tablet tablets: Mutex>>, - tombstone: Mutex>, factory: Box>, root: PathBuf, } @@ -197,9 +209,6 @@ impl Clone for TabletRegistry { } } -unsafe impl Send for TabletRegistry {} -unsafe impl Sync for TabletRegistry {} - impl TabletRegistry { pub fn new(factory: Box>, path: impl Into) -> Result { let root = path.into(); @@ -209,7 +218,6 @@ impl TabletRegistry { tablets: Mutex::new(HashMap::default()), factory, root, - tombstone: Mutex::default(), }), }) } @@ -273,17 +281,13 @@ impl TabletRegistry { ctx.suffix ))); } + // TODO: use compaction filter to trim range. let tablet = self.tablets.factory.open_tablet(ctx, &path)?; let mut cached = self.get_or_default(id); cached.set(tablet); Ok(cached) } - /// Destroy the tablet and its data - pub fn mark_tombstone(&self, id: u64, suffix: u64) { - self.tablets.tombstone.lock().unwrap().push((id, suffix)); - } - /// Loop over all opened tablets. Note, it's possible that the visited /// tablet is not the latest one. If latest one is required, you may /// either: diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index bcfa6ca0771..38ce4296c03 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -45,7 +45,7 @@ use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::pd, + worker::{pd, tablet_gc}, Error, Result, }; @@ -363,6 +363,7 @@ where pub struct Schedulers { pub read: Scheduler>, pub pd: Scheduler, + pub tablet_gc: Scheduler>, pub write: WriteSenders, // Following is not maintained by raftstore itself. 
@@ -375,6 +376,7 @@ struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, pd: LazyWorker, + tablet_gc_worker: Worker, async_write: StoreWriters, // Following is not maintained by raftstore itself. @@ -386,6 +388,7 @@ impl Workers { Self { async_read: Worker::new("async-read-worker"), pd, + tablet_gc_worker: Worker::new("tablet-gc-worker"), async_write: StoreWriters::default(), background, } @@ -461,9 +464,15 @@ impl StoreSystem { ), ); + let tablet_gc_scheduler = workers.tablet_gc_worker.start( + "tablet-gc-worker", + tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), + ); + let schedulers = Schedulers { read: read_scheduler, pd: workers.pd.scheduler(), + tablet_gc: tablet_gc_scheduler, write: workers.async_write.senders(), split_check: split_check_scheduler, }; diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index af8fb5acc47..aaf067aa585 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -17,7 +17,7 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; use protobuf::Message; use raftstore::{ - store::{fsm::new_admin_request, needs_evict_entry_cache, Transport}, + store::{fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask}, Result, }; use slog::{debug, error, info}; @@ -29,6 +29,7 @@ use crate::{ operation::AdminCmdResult, raft::{Apply, Peer}, router::{CmdResChannel, PeerTick}, + worker::tablet_gc, }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -256,12 +257,14 @@ impl Peer { &mut self, store_ctx: &mut StoreContext, old_persisted: u64, + task: &mut WriteTask, ) { let new_persisted = self.storage().apply_trace().persisted_apply_index(); if old_persisted < 
new_persisted { + let region_id = self.region_id(); // TODO: batch it. if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( - self.region_id(), + region_id, new_persisted, self.state_changes_mut(), ) { @@ -270,6 +273,12 @@ impl Peer { self.set_has_extra_write(); } self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); + if self.remove_tombstone_tablets_before(new_persisted) { + let sched = store_ctx.schedulers.tablet_gc.clone(); + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + })) + } } } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 2154eb20e90..386528070e2 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -59,6 +59,7 @@ use crate::{ operation::AdminCmdResult, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, + worker::tablet_gc, Error, }; @@ -439,6 +440,15 @@ impl Peer { self.add_pending_tick(PeerTick::SplitRegionCheck); } + self.record_tablet_as_tombstone_and_refresh(res.tablet_index, store_ctx); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + derived, + )); + let last_region_id = res.regions.last().unwrap().get_id(); let mut new_ids = HashSet::default(); for (new_region, locks) in res.regions.into_iter().zip(region_locks) { @@ -491,6 +501,8 @@ impl Peer { ) { let region_id = split_init.region.id; if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX { + // Race with split operation. The tablet created by split will eventually be + // deleted (TODO). We don't trim it. 
let _ = store_ctx .router .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); @@ -532,6 +544,13 @@ impl Peer { store_ctx: &mut StoreContext, split_init: Box, ) { + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + )); if split_init.source_leader && self.leader_id() == INVALID_ID && self.term() == RAFT_INIT_LOG_TERM diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 14010fc9fe2..66d9755c1df 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -383,7 +383,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.on_advance_persisted_apply_index(ctx, prev_persisted); + self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { write_task.messages = ready diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index e1a36ed8ec7..7a6e00aec4f 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -162,6 +162,7 @@ impl Peer { // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); + self.record_tablet_as_tombstone_and_refresh(persisted_index, ctx); self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); @@ -504,7 +505,7 @@ impl Storage { let _ = fs::remove_dir_all(path); } }; - task.persisted_cb = Some(Box::new(hook)); + task.persisted_cbs.push(Box::new(hook)); task.has_snapshot = true; Ok(()) } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index ca5aafa3bfb..4fbc7e9874e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -33,6 +33,7 @@ use crate::{ AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, }, router::{CmdResChannel, PeerTick, QueryResChannel}, + worker::tablet_gc, Result, }; @@ -42,6 +43,11 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// worker to destroy them. + pending_tombstone_tablets: Vec, /// Statistics for self. self_stat: PeerStat, @@ -126,7 +132,6 @@ impl Peer { let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); - let cached_tablet = tablet_registry.get_or_default(region_id); let flush_state: Arc = Arc::default(); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. We also can't get the correct range of the @@ -137,10 +142,12 @@ impl Peer { // TODO: Perhaps we should stop create the tablet automatically. 
tablet_registry.load(ctx, false)?; } + let cached_tablet = tablet_registry.get_or_default(region_id); let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, + pending_tombstone_tablets: Vec::new(), self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), @@ -328,13 +335,43 @@ impl Peer { } #[inline] - pub fn tablet(&self) -> &CachedTablet { - &self.tablet + pub fn tablet(&mut self) -> Option<&EK> { + self.tablet.latest() + } + + #[inline] + pub fn record_tablet_as_tombstone_and_refresh( + &mut self, + new_tablet_index: u64, + ctx: &StoreContext, + ) { + if let Some(old_tablet) = self.tablet.cache() { + self.pending_tombstone_tablets.push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy( + old_tablet.clone(), + self.region_id(), + new_tablet_index, + )); + } + // TODO: Handle race between split and snapshot. So that we can assert + // `self.tablet.refresh() == 1` + assert!(self.tablet.refresh() > 0); } + /// Returns if there's any tombstone being removed. 
#[inline] - pub fn tablet_mut(&mut self) -> &mut CachedTablet { - &mut self.tablet + pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { + let mut removed = 0; + while let Some(i) = self.pending_tombstone_tablets.first() + && *i <= persisted + { + removed += 1; + } + self.pending_tombstone_tablets.drain(..removed); + removed > 0 } #[inline] diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 959f817ebd7..636970c0ad1 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -393,7 +393,7 @@ mod tests { assert_ne!(10, s.entry_storage().applied_index()); assert_ne!(1, s.entry_storage().applied_term()); assert_eq!(10, s.region_state().get_tablet_index()); - assert!(task.persisted_cb.is_some()); + assert!(!task.persisted_cbs.is_empty()); s.on_applied_snapshot(); assert_eq!(10, s.entry_storage().applied_index()); diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index 3d4e69fdcf6..6fafd01df85 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,3 +1,4 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. pub mod pd; +pub mod tablet_gc; diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs new file mode 100644 index 00000000000..cc1fcd971e9 --- /dev/null +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -0,0 +1,227 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt::{self, Display, Formatter}, + path::{Path, PathBuf}, + time::Duration, +}; + +use collections::HashMap; +use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; +use kvproto::metapb::Region; +use slog::{error, warn, Logger}; +use tikv_util::worker::{Runnable, RunnableWithTimer}; + +pub enum Task { + Trim { + tablet: EK, + start_key: Box<[u8]>, + end_key: Box<[u8]>, + }, + PrepareDestroy { + tablet: EK, + region_id: u64, + wait_for_persisted: u64, + }, + Destroy { + region_id: u64, + persisted_index: u64, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::Trim { + ref start_key, + ref end_key, + .. + } => write!( + f, + "trim tablet for start_key {}, end_key {}", + log_wrappers::Value::key(start_key), + log_wrappers::Value::key(end_key), + ), + Task::PrepareDestroy { + region_id, + wait_for_persisted, + .. + } => write!( + f, + "prepare destroy tablet for region_id {}, wait_for_persisted {}", + region_id, wait_for_persisted, + ), + Task::Destroy { + region_id, + persisted_index, + } => write!( + f, + "destroy tablet for region_id {} persisted_index {}", + region_id, persisted_index, + ), + } + } +} + +impl Task { + #[inline] + pub fn trim(tablet: EK, region: &Region) -> Self { + Task::Trim { + tablet, + start_key: region.get_start_key().into(), + end_key: region.get_end_key().into(), + } + } + + #[inline] + pub fn prepare_destroy(tablet: EK, region_id: u64, wait_for_persisted: u64) -> Self { + Task::PrepareDestroy { + tablet, + region_id, + wait_for_persisted, + } + } + + #[inline] + pub fn destroy(region_id: u64, persisted_index: u64) -> Self { + Task::Destroy { + region_id, + persisted_index, + } + } +} + +pub struct Runner { + tablet_registry: TabletRegistry, + logger: Logger, + + // region_id -> [(tablet_path, wait_for_persisted)]. 
+ waiting_destroy_tasks: HashMap>, + pending_destroy_tasks: Vec, +} + +impl Runner { + pub fn new(tablet_registry: TabletRegistry, logger: Logger) -> Self { + Self { + tablet_registry, + logger, + waiting_destroy_tasks: HashMap::default(), + pending_destroy_tasks: Vec::new(), + } + } + + fn trim(tablet: &EK, start_key: &[u8], end_key: &[u8]) -> engine_traits::Result<()> { + let start_key = keys::data_key(start_key); + let end_key = keys::data_end_key(end_key); + let range1 = Range::new(&[], &start_key); + let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + tablet.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2])?; + // TODO: Avoid this after compaction filter is ready. + tablet.delete_ranges_cfs(DeleteStrategy::DeleteByRange, &[range1, range2])?; + for r in [range1, range2] { + tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; + } + Ok(()) + } + + fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { + let _ = tablet.pause_background_work(); + self.waiting_destroy_tasks + .entry(region_id) + .or_default() + .push((PathBuf::from(tablet.path()), wait_for_persisted)); + } + + fn destroy(&mut self, region_id: u64, persisted: u64) { + if let Some(v) = self.waiting_destroy_tasks.get_mut(®ion_id) { + v.retain(|(path, wait)| { + if *wait <= persisted { + if !Self::process_destroy_task(&self.logger, &self.tablet_registry, path) { + self.pending_destroy_tasks.push(path.clone()); + } + return false; + } + true + }); + } + } + + /// Returns true if task is consumed. Failure is considered consumed. + fn process_destroy_task(logger: &Logger, registry: &TabletRegistry, path: &Path) -> bool { + match EK::locked(path.to_str().unwrap()) { + Err(e) => warn!( + logger, + "failed to check whether the tablet path is locked"; + "err" => ?e, + "path" => path.display(), + ), + Ok(false) => { + // TODO: use a meaningful table context. 
+ let _ = registry + .tablet_factory() + .destroy_tablet(TabletContext::with_infinite_region(0, None), path) + .map_err(|e| { + warn!( + logger, + "failed to destroy tablet"; + "err" => ?e, + "path" => path.display(), + ) + }); + return true; + } + _ => {} + } + false + } +} + +impl Runnable for Runner +where + EK: KvEngine, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + match task { + Task::Trim { + tablet, + start_key, + end_key, + } => { + if let Err(e) = Self::trim(&tablet, &start_key, &end_key) { + error!( + self.logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } + } + Task::PrepareDestroy { + region_id, + tablet, + wait_for_persisted, + } => self.prepare_destroy(region_id, tablet, wait_for_persisted), + Task::Destroy { + region_id, + persisted_index, + } => self.destroy(region_id, persisted_index), + } + } +} + +impl RunnableWithTimer for Runner +where + EK: KvEngine, +{ + fn on_timeout(&mut self) { + self.pending_destroy_tasks + .retain(|task| !Self::process_destroy_task(&self.logger, &self.tablet_registry, task)); + } + + fn get_interval(&self) -> Duration { + Duration::from_secs(2) + } +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 9b13ce6af9b..b8cf6006dee 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -185,7 +185,7 @@ where pub send_time: Instant, pub raft_wb: Option, // called after writing to kvdb and raftdb. 
- pub persisted_cb: Option>, + pub persisted_cbs: Vec>, pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, @@ -213,7 +213,7 @@ where extra_write: ExtraWrite::None, messages: vec![], trackers: vec![], - persisted_cb: None, + persisted_cbs: Vec::new(), has_snapshot: false, } } @@ -419,9 +419,9 @@ where ); } } - if let Some(v) = task.persisted_cb.take() { + for v in task.persisted_cbs.drain(..) { self.persisted_cbs.push(v); - }; + } self.tasks.push(task); } diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index d43e33a4e08..efee09be906 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -455,7 +455,7 @@ mod tests { db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); db.flush_cfs(&[], true /* wait */).unwrap(); - db.compact_range( + db.compact_range_cf( CF_DEFAULT, None, // start_key None, // end_key false, // exclusive_manual diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 958da2adaa6..7bc7052b277 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -108,7 +108,7 @@ where .start_coarse_timer(); box_try!( self.engine - .compact_range(cf_name, start_key, end_key, false, 1 /* threads */,) + .compact_range_cf(cf_name, start_key, end_key, false, 1 /* threads */,) ); compact_range_timer.observe_duration(); info!( diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 833e8131746..b2330e26f93 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -308,7 +308,8 @@ impl Cluster { pub fn compact_data(&self) { for engine in self.engines.values() { let db = &engine.kv; - db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); + db.compact_range_cf(CF_DEFAULT, None, 
None, false, 1) + .unwrap(); } } diff --git a/src/config/mod.rs b/src/config/mod.rs index 8886711f948..6ed8da3f111 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -98,6 +98,8 @@ pub const MIN_BLOCK_CACHE_SHARD_SIZE: usize = 128 * MIB as usize; /// Maximum of 15% of system memory can be used by Raft Engine. Normally its /// memory usage is much smaller than that. const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; +/// Tentative value. +const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.25; const LOCKCF_MIN_MEM: usize = 256 * MIB as usize; const LOCKCF_MAX_MEM: usize = GIB as usize; @@ -1220,6 +1222,10 @@ impl DbConfig { EngineType::RaftKv2 => { self.enable_multi_batch_write.get_or_insert(false); self.allow_concurrent_memtable_write.get_or_insert(false); + let total_mem = SysQuota::memory_limit_in_bytes() as f64; + self.write_buffer_limit.get_or_insert(ReadableSize( + (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, + )); } } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 4847dbb8428..e530cc56577 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1028,7 +1028,7 @@ pub mod tests { pub fn compact(&mut self) { for cf in ALL_CFS { - self.db.compact_range(cf, None, None, false, 1).unwrap(); + self.db.compact_range_cf(cf, None, None, false, 1).unwrap(); } } } From 0404a7c180dd052da1caccc35939a230f5106a37 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 27 Dec 2022 20:38:16 +0800 Subject: [PATCH 434/676] raftstore-v2: purge raft engine (#13993) ref tikv/tikv#12842 Implement periodical purge in v2. 
Signed-off-by: tabokie Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 33 +++++++++++++++++-- components/raftstore-v2/src/fsm/peer.rs | 3 +- .../operation/command/admin/compact_log.rs | 14 +++++--- components/raftstore-v2/src/router/message.rs | 1 + components/tikv_util/src/lib.rs | 1 + components/tikv_util/src/worker/pool.rs | 10 ++++-- 6 files changed, 52 insertions(+), 10 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 38ce4296c03..bd37a6d4e07 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -28,7 +28,7 @@ use raftstore::{ SplitCheckRunner, SplitCheckTask, StoreWriters, TabletSnapManager, Transport, WriteSenders, }, }; -use slog::Logger; +use slog::{warn, Logger}; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, @@ -378,18 +378,20 @@ struct Workers { pd: LazyWorker, tablet_gc_worker: Worker, async_write: StoreWriters, + purge: Option, // Following is not maintained by raftstore itself. 
background: Worker, } impl Workers { - fn new(background: Worker, pd: LazyWorker) -> Self { + fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { Self { async_read: Worker::new("async-read-worker"), pd, tablet_gc_worker: Worker::new("tablet-gc-worker"), async_write: StoreWriters::default(), + purge, background, } } @@ -433,7 +435,29 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let mut workers = Workers::new(background, pd_worker); + let purge_worker = if raft_engine.need_manual_purge() { + let worker = Worker::new("purge-worker"); + let raft_clone = raft_engine.clone(); + let logger = self.logger.clone(); + let router = router.clone(); + worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + match raft_clone.manual_purge() { + Ok(regions) => { + for r in regions { + let _ = router.send(r, PeerMsg::ForceCompactLog); + } + } + Err(e) => { + warn!(logger, "purge expired files"; "err" => %e); + } + }; + }); + Some(worker) + } else { + None + }; + + let mut workers = Workers::new(background, pd_worker, purge_worker); workers .async_write .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; @@ -537,6 +561,9 @@ impl StoreSystem { workers.async_write.shutdown(); workers.async_read.stop(); workers.pd.stop(); + if let Some(w) = workers.purge { + w.stop(); + } } } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 22145ecdcaa..9b3586c6012 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -208,7 +208,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, match tick { PeerTick::Raft => self.on_raft_tick(), PeerTick::PdHeartbeat => self.on_pd_heartbeat(), - PeerTick::CompactLog => self.on_compact_log_tick(), + PeerTick::CompactLog => self.on_compact_log_tick(false), PeerTick::SplitRegionCheck => self.on_split_region_check(), PeerTick::CheckMerge => 
unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), @@ -297,6 +297,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index aaf067aa585..d1d10d366bf 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -33,7 +33,7 @@ use crate::{ }; impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { - pub fn on_compact_log_tick(&mut self) { + pub fn on_compact_log_tick(&mut self, force: bool) { if !self.fsm.peer().is_leader() { // `compact_cache_to` is called when apply, there is no need to call // `compact_to` here, snapshot generating has already been cancelled @@ -44,7 +44,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm .peer_mut() - .maybe_propose_compact_log(self.store_ctx); + .maybe_propose_compact_log(self.store_ctx, force); self.on_entry_cache_evict(); } @@ -64,7 +64,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { // Mirrors v1::on_raft_gc_log_tick. - fn maybe_propose_compact_log(&mut self, store_ctx: &mut StoreContext) { + fn maybe_propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + force: bool, + ) { // As leader, we would not keep caches for the peers that didn't response // heartbeat in the last few seconds. That happens probably because // another TiKV is down. 
In this case if we do not clean up the cache, @@ -122,7 +126,9 @@ impl Peer { self.entry_storage_mut() .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); - let mut compact_idx = if applied_idx > first_idx + let mut compact_idx = if force && replicated_idx > first_idx { + replicated_idx + } else if applied_idx > first_idx && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index cd88a23c744..930de5ff036 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -181,6 +181,7 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + ForceCompactLog, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 9421c0e174b..9b13250fe1e 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -4,6 +4,7 @@ #![feature(thread_id_value)] #![feature(box_patterns)] #![feature(vec_into_raw_parts)] +#![feature(let_chains)] #[cfg(test)] extern crate test; diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index ba4b1e27f41..e761fac8bb5 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -376,8 +376,11 @@ impl Worker { let mut interval = GLOBAL_TIMER_HANDLE .interval(std::time::Instant::now(), interval) .compat(); + let stop = self.stop.clone(); self.remote.spawn(async move { - while let Some(Ok(_)) = interval.next().await { + while !stop.load(Ordering::Relaxed) + && let Some(Ok(_)) = interval.next().await + { func(); } }); @@ -391,8 +394,11 @@ impl Worker { let mut interval = GLOBAL_TIMER_HANDLE 
.interval(std::time::Instant::now(), interval) .compat(); + let stop = self.stop.clone(); self.remote.spawn(async move { - while let Some(Ok(_)) = interval.next().await { + while !stop.load(Ordering::Relaxed) + && let Some(Ok(_)) = interval.next().await + { let fut = func(); fut.await; } From 25261c8aa4f638b4d6f5d97e14fb7a786a3d7638 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 27 Dec 2022 23:30:17 +0800 Subject: [PATCH 435/676] raftstore-v2: cleanup stale tablet on restart (#13994) ref tikv/tikv#12842 If operations like snapshot, split, are aborted by restart, they needs to be either resumed or cleanup. This PR checks for garbage after restart and resume committed operations. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_rocks/src/write_batch.rs | 25 +++- components/engine_traits/src/lib.rs | 1 + components/engine_traits/src/tablet.rs | 9 ++ components/engine_traits/src/write_batch.rs | 7 + components/raftstore-v2/src/batch/store.rs | 68 ++++++++- components/raftstore-v2/src/fsm/peer.rs | 26 ++-- .../src/operation/command/admin/mod.rs | 2 +- .../src/operation/command/admin/split.rs | 40 ++++-- .../raftstore-v2/src/operation/command/mod.rs | 11 +- components/raftstore-v2/src/operation/life.rs | 2 +- components/raftstore-v2/src/operation/mod.rs | 1 + .../raftstore-v2/src/operation/query/mod.rs | 19 ++- .../src/operation/ready/apply_trace.rs | 57 +++++++- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../src/operation/ready/snapshot.rs | 69 ++++++++-- components/raftstore-v2/src/raft/apply.rs | 9 +- components/raftstore-v2/src/raft/peer.rs | 4 +- .../raftstore-v2/tests/failpoints/mod.rs | 1 + .../tests/failpoints/test_split.rs | 106 ++++++++++++++ .../tests/integrations/cluster.rs | 118 ++++++++++++++++ .../tests/integrations/test_split.rs | 130 ++---------------- .../raftstore/src/store/async_io/write.rs | 5 + components/raftstore/src/store/fsm/store.rs | 2 +- 23 files changed, 532 insertions(+), 182 deletions(-) create mode 100644 
components/raftstore-v2/tests/failpoints/test_split.rs diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index a46edfb0a4a..3659a7628d6 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -96,24 +96,39 @@ impl RocksWriteBatchVec { } } } -} -impl engine_traits::WriteBatch for RocksWriteBatchVec { - fn write_opt(&mut self, opts: &WriteOptions) -> Result { + #[inline] + fn write_impl(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { let opt: RocksWriteOptions = opts.into(); let mut seq = 0; if self.support_write_batch_vec { // FIXME(tabokie): Callback for empty write batch won't be called. self.get_db() - .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| seq = s) + .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| { + seq = s; + cb(); + }) .map_err(r2e)?; } else { self.get_db() - .write_callback(&self.wbs[0], &opt.into_raw(), |s| seq = s) + .write_callback(&self.wbs[0], &opt.into_raw(), |s| { + seq = s; + cb(); + }) .map_err(r2e)?; } Ok(seq) } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + self.write_impl(opts, || {}) + } + + fn write_callback_opt(&mut self, opts: &WriteOptions, cb: impl FnMut()) -> Result { + self.write_impl(opts, cb) + } fn data_size(&self) -> usize { let mut size: usize = 0; diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index bc54a5e7627..45a3d18fa7a 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -253,6 +253,7 @@ #![feature(assert_matches)] #![feature(linked_list_cursors)] #![feature(let_chains)] +#![feature(str_split_as_str)] #[macro_use(fail_point)] extern crate fail; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 64459bbc7ee..edc0bd99870 100644 --- 
a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -226,6 +226,15 @@ impl TabletRegistry { format!("{}{}_{}", prefix, id, suffix) } + pub fn parse_tablet_name<'a>(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { + let name = path.file_name().unwrap().to_str().unwrap(); + let mut parts = name.rsplit('_'); + let suffix = parts.next()?.parse().ok()?; + let id = parts.next()?.parse().ok()?; + let prefix = parts.as_str(); + Some((prefix, id, suffix)) + } + pub fn tablet_root(&self) -> &Path { &self.tablets.root } diff --git a/components/engine_traits/src/write_batch.rs b/components/engine_traits/src/write_batch.rs index d8ff8d07796..8a92ac7c382 100644 --- a/components/engine_traits/src/write_batch.rs +++ b/components/engine_traits/src/write_batch.rs @@ -73,6 +73,13 @@ pub trait WriteBatch: Mutable { /// Commit the WriteBatch to disk with the given options fn write_opt(&mut self, opts: &WriteOptions) -> Result; + // TODO: it should be `FnOnce`. + fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { + let seq = self.write_opt(opts)?; + cb(); + Ok(seq) + } + /// Commit the WriteBatch to disk atomically fn write(&mut self) -> Result { self.write_opt(&WriteOptions::default()) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index bd37a6d4e07..a3800085522 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -24,8 +24,10 @@ use raft::{StateRole, INVALID_ID}; use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent}, store::{ - fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, - SplitCheckRunner, SplitCheckTask, StoreWriters, TabletSnapManager, Transport, WriteSenders, + fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, + local_metrics::RaftMetrics, + Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, + 
TabletSnapManager, Transport, WriteSenders, }, }; use slog::{warn, Logger}; @@ -43,6 +45,7 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, + operation::SPLIT_PREFIX, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::{pd, tablet_gc}, @@ -80,6 +83,32 @@ pub struct StoreContext { pub snap_mgr: TabletSnapManager, } +impl StoreContext { + pub fn update_ticks_timeout(&mut self) { + self.tick_batch[PeerTick::Raft as usize].wait_duration = self.cfg.raft_base_tick_interval.0; + self.tick_batch[PeerTick::CompactLog as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0; + self.tick_batch[PeerTick::EntryCacheEvict as usize].wait_duration = + ENTRY_CACHE_EVICT_TICK_DURATION; + self.tick_batch[PeerTick::PdHeartbeat as usize].wait_duration = + self.cfg.pd_heartbeat_tick_interval.0; + self.tick_batch[PeerTick::SplitRegionCheck as usize].wait_duration = + self.cfg.split_region_check_tick_interval.0; + self.tick_batch[PeerTick::CheckPeerStaleState as usize].wait_duration = + self.cfg.peer_stale_state_check_interval.0; + self.tick_batch[PeerTick::CheckMerge as usize].wait_duration = + self.cfg.merge_check_tick_interval.0; + self.tick_batch[PeerTick::CheckLeaderLease as usize].wait_duration = + self.cfg.check_leader_lease_interval.0; + self.tick_batch[PeerTick::ReactivateMemoryLock as usize].wait_duration = + self.cfg.reactive_memory_lock_tick_interval.0; + self.tick_batch[PeerTick::ReportBuckets as usize].wait_duration = + self.cfg.report_region_buckets_tick_interval.0; + self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = + self.cfg.check_long_uncommitted_interval.0; + } +} + /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. 
/// /// It is responsible for: @@ -152,6 +181,7 @@ impl PollHandler StorePollerBuilder { } meta.set_region(storage.region(), storage.is_initialized(), &self.logger); - let (sender, peer_fsm) = PeerFsm::new(&cfg, &self.tablet_registry, storage)?; + let (sender, peer_fsm) = + PeerFsm::new(&cfg, &self.tablet_registry, &self.snap_mgr, storage)?; meta.region_read_progress .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); @@ -318,7 +349,33 @@ impl StorePollerBuilder { Ok(regions) } - fn clean_up_tablets(&self, _peers: &HashMap>) -> Result<()> { + fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { + for entry in file_system::read_dir(self.tablet_registry.tablet_root())? { + let entry = entry?; + let path = entry.path(); + let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; + let fsm = match peers.get(®ion_id) { + Some((_, fsm)) => fsm, + None => { + // The peer is either destroyed or not created yet. It will be + // recovered by leader heartbeats. + file_system::remove_dir_all(&path)?; + continue; + } + }; + // Valid split tablet should be installed during recovery. + if prefix == SPLIT_PREFIX { + file_system::remove_dir_all(&path)?; + continue; + } + if prefix.is_empty() { + // Stale split data can be deleted. + if fsm.peer().storage().tablet_index() > tablet_index { + file_system::remove_dir_all(&path)?; + } + } + // TODO: handle other prefix + } // TODO: list all available tablets and destroy those which are not in the // peers. 
Ok(()) @@ -335,7 +392,7 @@ where fn build(&mut self, _priority: batch_system::Priority) -> Self::Handler { let cfg = self.cfg.value().clone(); - let poll_ctx = StoreContext { + let mut poll_ctx = StoreContext { logger: self.logger.clone(), trans: self.trans.clone(), current_time: None, @@ -354,6 +411,7 @@ where snap_mgr: self.snap_mgr.clone(), coprocessor_host: self.coprocessor_host.clone(), }; + poll_ctx.update_ticks_timeout(); let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) } diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 9b3586c6012..49f1efcb760 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,7 +7,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use raftstore::store::{Config, LocksStatus, Transport}; +use raftstore::store::{Config, LocksStatus, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -30,7 +30,7 @@ pub struct PeerFsm { receiver: Receiver, /// A registry for all scheduled ticks. This can avoid scheduling ticks /// twice accidentally. 
- tick_registry: u16, + tick_registry: [bool; PeerTick::VARIANT_COUNT], is_stopped: bool, reactivate_memory_lock_ticks: usize, } @@ -39,16 +39,17 @@ impl PeerFsm { pub fn new( cfg: &Config, tablet_registry: &TabletRegistry, + snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result> { - let peer = Peer::new(cfg, tablet_registry, storage)?; + let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; info!(peer.logger, "create peer"); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { peer, mailbox: None, receiver: rx, - tick_registry: 0, + tick_registry: [false; PeerTick::VARIANT_COUNT], is_stopped: false, reactivate_memory_lock_ticks: 0, }); @@ -141,8 +142,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn schedule_tick(&mut self, tick: PeerTick) { assert!(PeerTick::VARIANT_COUNT <= u16::BITS as usize); let idx = tick as usize; - let key = 1u16 << (idx as u16); - if self.fsm.tick_registry & key != 0 { + if self.fsm.tick_registry[idx] { return; } if is_zero_duration(&self.store_ctx.tick_batch[idx].wait_duration) { @@ -167,7 +167,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, return; } }; - self.fsm.tick_registry |= key; + self.fsm.tick_registry[idx] = true; let logger = self.fsm.logger().clone(); // TODO: perhaps following allocation can be removed. let cb = Box::new(move || { @@ -194,6 +194,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } + // Unlike v1, it's a must to set ready when there are pending entries. Otherwise + // it may block for ever when there is unapplied conf change. + let entry_storage = self.fsm.peer.storage().entry_storage(); + if entry_storage.commit_index() > entry_storage.applied_index() + // Speed up setup if there is only one peer. 
+ || self.fsm.peer.is_leader() + { + self.fsm.peer.set_has_ready(); + } } #[inline] @@ -205,6 +214,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_tick(&mut self, tick: PeerTick) { + self.fsm.tick_registry[tick as usize] = false; match tick { PeerTick::Raft => self.on_raft_tick(), PeerTick::PdHeartbeat => self.on_pd_heartbeat(), @@ -225,7 +235,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, match msg { PeerMsg::RaftMessage(msg) => { self.fsm.peer.on_raft_message(self.store_ctx, msg); - self.schedule_pending_ticks(); } PeerMsg::RaftQuery(cmd) => { self.on_receive_command(cmd.send_time); @@ -304,6 +313,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } // TODO: instead of propose pending commands immediately, we should use timeout. self.fsm.peer.propose_pending_writes(self.store_ctx); + self.schedule_pending_ticks(); } pub fn on_reactivate_memory_lock_tick(&mut self) { diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 9afd50a5305..977e26e0675 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -13,7 +13,7 @@ use protobuf::Message; use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; -pub use split::{RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; +pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; use tikv_util::box_err; use txn_types::WriteBatchFlags; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 386528070e2..e1f4ae552f6 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,11 +25,13 
@@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::{borrow::Cow, cmp}; +use std::{borrow::Cow, cmp, path::PathBuf}; use collections::HashSet; use crossbeam::channel::SendError; -use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext}; +use engine_traits::{ + Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, +}; use fail::fail_point; use kvproto::{ metapb::{self, Region, RegionEpoch}, @@ -118,6 +120,11 @@ pub struct SplitFlowControl { may_skip_split_check: bool, } +pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { + let tablet_name = registry.tablet_name(SPLIT_PREFIX, region_id, RAFT_INIT_LOG_INDEX); + registry.tablet_root().join(tablet_name) +} + impl PeerFsmDelegate<'_, EK, ER, T> { pub fn on_split_region_check(&mut self) { if !self.fsm.peer_mut().on_split_region_check(self.store_ctx) { @@ -329,8 +336,7 @@ impl Apply { continue; } - let name = reg.tablet_name(SPLIT_PREFIX, new_region_id, RAFT_INIT_LOG_INDEX); - let split_temp_path = reg.tablet_root().join(name); + let split_temp_path = temp_split_path(reg, new_region_id); checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { @@ -344,16 +350,22 @@ impl Apply { } let derived_path = self.tablet_registry().tablet_path(region_id, log_index); - checkpointer - .create_at(&derived_path, None, 0) - .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - derived_path, - e - ) - }); + // If it's recovered from restart, it's possible the target path exists already. + // And because checkpoint is atomic, so we don't need to worry about corruption. + // And it's also wrong to delete it and remake as it may has applied and flushed + // some data to the new checkpoint before being restarted. 
+ if !derived_path.exists() { + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + derived_path, + e + ) + }); + } // Remove the old write batch. self.write_batch.take(); let reg = self.tablet_registry(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 7fa2fa776c2..35b4ec1918e 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -51,7 +51,9 @@ mod admin; mod control; mod write; -pub use admin::{AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; +pub use admin::{ + temp_split_path, AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, +}; pub use control::ProposalControl; pub use write::{ SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, @@ -505,10 +507,14 @@ impl Apply { #[inline] pub fn flush(&mut self) { + let (index, term) = self.apply_progress(); + let flush_state = self.flush_state().clone(); if let Some(wb) = &mut self.write_batch && !wb.is_empty() { let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); - if let Err(e) = wb.write_opt(&write_opt) { + if let Err(e) = wb.write_callback_opt(&write_opt, || { + flush_state.set_applied_index(index); + }) { panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); } self.metrics.written_bytes += wb.data_size() as u64; @@ -527,7 +533,6 @@ impl Apply { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } let mut apply_res = ApplyRes::default(); - let (index, term) = self.apply_progress(); apply_res.applied_index = index; apply_res.applied_term = term; apply_res.admin_result = self.take_admin_result().into_boxed_slice(); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 
d61f11e7ada..ea42832eaea 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -229,7 +229,7 @@ impl Store { ctx.schedulers.read.clone(), &ctx.logger, ) - .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, s)) + .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, &ctx.snap_mgr, s)) { Ok(p) => p, res => { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index f0a2624203a..c49a14142ce 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -9,6 +9,7 @@ mod ready; pub use command::{ AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, + SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 4ffb4bcdcec..59c6f2d0f7c 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -19,7 +19,7 @@ use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, StatusCmdType}, }; -use raft::Ready; +use raft::{Ready, StateRole}; use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ @@ -29,7 +29,7 @@ use raftstore::{ }, Error, Result, }; -use slog::info; +use slog::{debug, info}; use tikv_util::box_err; use txn_types::WriteBatchFlags; @@ -375,11 +375,15 @@ impl Peer { /// Query internal states for debugging purpose. 
pub fn on_query_debug_info(&self, ch: DebugInfoChannel) { let entry_storage = self.storage().entry_storage(); + let mut status = self.raft_group().status(); + status + .progress + .get_or_insert_with(|| self.raft_group().raft.prs()); let mut meta = RegionMeta::new( self.storage().region_state(), entry_storage.apply_state(), GroupState::Ordered, - self.raft_group().status(), + status, self.raft_group().raft.raft_log.last_index(), self.raft_group().raft.raft_log.persisted, ); @@ -394,6 +398,10 @@ impl Peer { .raft_log .term(meta.raft_apply.commit_index) .unwrap(); + debug!(self.logger, "on query debug info"; + "tick" => self.raft_group().raft.election_elapsed, + "election_timeout" => self.raft_group().raft.randomized_election_timeout(), + ); ch.set_result(meta); } @@ -422,7 +430,10 @@ impl Peer { // Only leaders need to update applied_term. if progress_to_be_updated && self.is_leader() { - // TODO: add coprocessor_host hook + if applied_term == self.term() { + ctx.coprocessor_host + .on_applied_current_term(StateRole::Leader, self.region()); + } let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d5aa93b587a..d6a83b7933b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -30,18 +30,24 @@ use std::{cmp, sync::Mutex}; use engine_traits::{ - FlushProgress, KvEngine, RaftEngine, RaftLogBatch, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, DATA_CFS, DATA_CFS_LEN, + FlushProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS, CF_DEFAULT, + CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, }; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; -use 
raftstore::store::{ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; +use raftstore::store::{ + ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, +}; use slog::Logger; use tikv_util::{box_err, worker::Scheduler}; use crate::{ + operation::{ + command::temp_split_path, + ready::snapshot::{install_tablet, recv_snap_path}, + }, raft::{Peer, Storage}, router::PeerMsg, Result, StoreRouter, @@ -372,6 +378,51 @@ impl Storage { .map(Some) } + /// Region state is written before actually moving data. It's possible that + /// the tablet is missing after restart. We need to move the data again + /// after being restarted. + pub fn recover_tablet(&self, registry: &TabletRegistry, snap_mgr: &TabletSnapManager) { + let tablet_index = self.region_state().get_tablet_index(); + if tablet_index == 0 { + // It's an uninitialized peer, nothing to recover. + return; + } + let region_id = self.region().get_id(); + let target_path = registry.tablet_path(region_id, tablet_index); + if target_path.exists() { + // Move data succeeded before restart, nothing to recover. + return; + } + if tablet_index == RAFT_INIT_LOG_INDEX { + // Its data may come from split or snapshot. Try split first. + let split_path = temp_split_path(registry, region_id); + if install_tablet(registry, &split_path, region_id, tablet_index) { + return; + } + } + let truncated_index = self.entry_storage().truncated_index(); + if truncated_index == tablet_index { + // Try snapshot. + let peer_id = self.peer().get_id(); + let snap_path = recv_snap_path( + snap_mgr, + region_id, + peer_id, + self.entry_storage().truncated_term(), + tablet_index, + ); + if install_tablet(registry, &snap_path, region_id, tablet_index) { + return; + } + } + panic!( + "{:?} data loss detected: {}_{} not found", + self.logger().list(), + region_id, + tablet_index + ); + } + /// Write initial persist trace for uninit peer. 
pub fn init_apply_trace(&self, write_task: &mut WriteTask) { let region_id = self.region().get_id(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 66d9755c1df..8b125844d0e 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -23,7 +23,6 @@ mod snapshot; use std::{cmp, time::Instant}; -pub use apply_trace::{cf_offset, write_initial_states, ApplyTrace, DataTrace, StateStorage}; use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; @@ -40,6 +39,7 @@ use tikv_util::{ }; pub use self::{ + apply_trace::{cf_offset, write_initial_states, ApplyTrace, DataTrace, StateStorage}, async_writer::AsyncWriter, snapshot::{GenSnapTask, SnapState}, }; diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 7a6e00aec4f..41dc0d39429 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -19,8 +19,10 @@ //! peer fsm, then Raft will get the snapshot. use std::{ + assert_matches::assert_matches, fmt::{self, Debug}, fs, + path::{Path, PathBuf}, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, Arc, @@ -43,7 +45,7 @@ use tikv_util::box_err; use crate::{ fsm::ApplyResReporter, - operation::command::SPLIT_PREFIX, + operation::command::temp_split_path, raft::{Apply, Peer, Storage}, Result, StoreContext, }; @@ -115,6 +117,48 @@ impl Debug for GenSnapTask { } } +pub fn recv_snap_path( + snap_mgr: &TabletSnapManager, + region_id: u64, + peer_id: u64, + term: u64, + index: u64, +) -> PathBuf { + let key = TabletSnapKey::new(region_id, peer_id, term, index); + snap_mgr.final_recv_path(&key) +} + +/// Move the tablet from `source` to managed path. +/// +/// Returns false if `source` doesn't exist. 
+pub fn install_tablet( + registry: &TabletRegistry, + source: &Path, + region_id: u64, + tablet_index: u64, +) -> bool { + if !source.exists() { + return false; + } + let target_path = registry.tablet_path(region_id, tablet_index); + assert_matches!( + EK::locked(source.to_str().unwrap()), + Ok(false), + "source is locked: {} => {}", + source.display(), + target_path.display() + ); + if let Err(e) = fs::rename(source, &target_path) { + panic!( + "failed to rename tablet {} => {}: {:?}", + source.display(), + target_path.display(), + e + ); + } + true +} + impl Peer { pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { @@ -476,32 +520,29 @@ impl Storage { let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { - let name = reg.tablet_name(SPLIT_PREFIX, region_id, last_index); - (reg.tablet_root().join(name), false) - } - si => { - let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); - (snap_mgr.final_recv_path(&key), si.is_some()) + (temp_split_path(®, region_id), false) } + si => ( + recv_snap_path(&snap_mgr, region_id, peer_id, last_term, last_index), + si.is_some(), + ), }; let logger = self.logger().clone(); // The snapshot require no additional processing such as ingest them to DB, but // it should load it into the factory after it persisted. 
let hook = move || { - let target_path = reg.tablet_path(region_id, last_index); - if let Err(e) = std::fs::rename(&path, &target_path) { + if !install_tablet(®, &path, region_id, last_index) { panic!( - "{:?} failed to load tablet, path: {} -> {}, {:?}", + "{:?} failed to install tablet, path: {}, region_id: {}, tablet_index: {}", logger.list(), path.display(), - target_path.display(), - e + region_id, + last_index ); } if clean_split { - let name = reg.tablet_name(SPLIT_PREFIX, region_id, last_index); - let path = reg.tablet_root().join(name); + let path = temp_split_path(®, region_id); let _ = fs::remove_dir_all(path); } }; diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 5539de3d617..666f3adb699 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -34,6 +34,7 @@ pub struct Apply { /// command. tombstone: bool, applied_term: u64, + applied_index: u64, /// The largest index that have modified each column family. modifications: DataTrace, admin_cmd_result: Vec, @@ -76,6 +77,7 @@ impl Apply { callbacks: vec![], tombstone: false, applied_term: 0, + applied_index: flush_state.applied_index(), modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], region_state, @@ -115,7 +117,7 @@ impl Apply { #[inline] pub fn set_apply_progress(&mut self, index: u64, term: u64) { - self.flush_state.set_applied_index(index); + self.applied_index = index; self.applied_term = term; if self.log_recovery.is_none() { return; @@ -123,12 +125,15 @@ impl Apply { let log_recovery = self.log_recovery.as_ref().unwrap(); if log_recovery.iter().all(|v| index >= *v) { self.log_recovery.take(); + // Now all logs are recovered, flush them to avoid recover again + // and again. 
+ let _ = self.tablet.flush_cfs(&[], false); } } #[inline] pub fn apply_progress(&self) -> (u64, u64) { - (self.flush_state.applied_index(), self.applied_term) + (self.applied_index, self.applied_term) } #[inline] diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 4fbc7e9874e..668b0ebf41d 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -20,7 +20,7 @@ use raftstore::{ fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, - ReadProgress, TxnExt, WriteTask, + ReadProgress, TabletSnapManager, TxnExt, WriteTask, }, }; use slog::Logger; @@ -118,6 +118,7 @@ impl Peer { pub fn new( cfg: &Config, tablet_registry: &TabletRegistry, + snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result { let logger = storage.logger().clone(); @@ -137,6 +138,7 @@ impl Peer { // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. if tablet_index != 0 { + raft_group.store().recover_tablet(tablet_registry, snap_mgr); let mut ctx = TabletContext::new(®ion, Some(tablet_index)); ctx.flush_state = Some(flush_state.clone()); // TODO: Perhaps we should stop create the tablet automatically. 
diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index 84f1de2803d..d04ad2cafc2 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -10,4 +10,5 @@ mod cluster; mod test_basic_write; mod test_bootstrap; +mod test_split; mod test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs new file mode 100644 index 00000000000..79356ae5805 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -0,0 +1,106 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + thread, + time::{Duration, Instant}, +}; + +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::{split_helper::split_region, Cluster}; + +/// If a node is restarted after metadata is persisted before tablet is not +/// installed, it should resume install the tablet. 
+#[test] +fn test_restart_resume() { + let mut cluster = Cluster::default(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_id = 2; + let region = router.region_detail(region_id); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + let fp = "async_write_before_cb"; + fail::cfg(fp, "return").unwrap(); + + let split_region_id = 1000; + let mut new_peer = peer.clone(); + new_peer.set_id(1001); + split_region( + router, + region, + peer, + split_region_id, + new_peer, + None, + None, + b"k11", + b"k11", + true, + ); + + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"k22", b"value"); + let header = Box::new(router.new_request_for(region_id).take_header()); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + block_on(sub.wait_proposed()); + + let region_state = raft_engine + .get_region_state(split_region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + let path = cluster + .node(0) + .tablet_registry() + .tablet_path(split_region_id, RAFT_INIT_LOG_INDEX); + assert!(!path.exists(), "{} should not exist", path.display()); + drop(raft_engine); + + cluster.restart(0); + // If split is resumed, the tablet should be installed. + assert!( + path.exists(), + "{} should exist after restart", + path.display() + ); + + // Both region should be recovered correctly. 
+ let cases = vec![ + (split_region_id, b"k01", b"v01"), + (region_id, b"k21", b"v21"), + ]; + let router = &mut cluster.routers[0]; + let new_epoch = router + .new_request_for(split_region_id) + .take_header() + .take_region_epoch(); + let timer = Instant::now(); + for (region_id, key, val) in cases { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let mut header = Box::new(router.new_request_for(region_id).take_header()); + while timer.elapsed() < Duration::from_secs(3) { + // We need to wait till source peer replay split. + if *header.get_region_epoch() != new_epoch { + thread::sleep(Duration::from_millis(100)); + header = Box::new(router.new_request_for(region_id).take_header()); + continue; + } + break; + } + assert_eq!(*header.get_region_epoch(), new_epoch, "{:?}", header); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + } +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 064fd9d1cad..4c025a0fc85 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -586,3 +586,121 @@ impl Drop for Cluster { } } } + +pub mod split_helper { + use std::{thread, time::Duration}; + + use engine_traits::CF_DEFAULT; + use futures::executor::block_on; + use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, + }; + use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + + use super::TestRouter; + + pub fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + 
req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req + } + + pub fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + // TODO: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. + thread::sleep(Duration::from_secs(1)); + } + + pub fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, b"v1"); + router.simple_write(region_id, header, put).unwrap() + } + + // Split the region according to the parameters + // return the updated original region + pub fn split_region<'a>( + router: &'a mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + left_key: Option<&'a [u8]>, + right_key: Option<&'a [u8]>, + propose_key: &[u8], + split_key: &[u8], + right_derive: bool, + ) -> (metapb::Region, metapb::Region) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = new_batch_split_region_request( + 
vec![propose_key.to_vec()], + vec![split_id], + right_derive, + ); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + must_split(region_id, req, router); + + let (left, right) = if !right_derive { + ( + router.region_detail(region_id), + router.region_detail(split_region_id), + ) + } else { + ( + router.region_detail(split_region_id), + router.region_detail(region_id), + ) + }; + + if let Some(right_key) = right_key { + let resp = put(router, left.id, right_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, right_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + } + if let Some(left_key) = left_key { + let resp = put(router, left.id, left_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, left_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + } + + assert_eq!(left.get_end_key(), split_key); + assert_eq!(right.get_start_key(), split_key); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + (left, right) + } +} diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 1174a428011..7cea980beac 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -1,121 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{thread, time::Duration}; +use std::time::Duration; -use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT, CF_RAFT}; -use futures::executor::block_on; -use kvproto::{ - metapb, pdpb, - raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, -}; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_RAFT}; use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; -use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; use txn_types::{Key, TimeStamp}; -use crate::cluster::{Cluster, TestRouter}; - -fn new_batch_split_region_request( - split_keys: Vec>, - ids: Vec, - right_derive: bool, -) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::BatchSplit); - req.mut_splits().set_right_derive(right_derive); - let mut requests = Vec::with_capacity(ids.len()); - for (mut id, key) in ids.into_iter().zip(split_keys) { - let mut split = SplitRequest::default(); - split.set_split_key(key); - split.set_new_region_id(id.get_new_region_id()); - split.set_new_peer_ids(id.take_new_peer_ids()); - requests.push(split); - } - req.mut_splits().set_requests(requests.into()); - req -} - -fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { - let (msg, sub) = PeerMsg::admin_command(req); - router.send(region_id, msg).unwrap(); - block_on(sub.result()).unwrap(); - - // TODO: when persistent implementation is ready, we can use tablet index of - // the parent to check whether the split is done. Now, just sleep a second. 
- thread::sleep(Duration::from_secs(1)); -} - -fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { - let header = Box::new(router.new_request_for(region_id).take_header()); - let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, key, b"v1"); - router.simple_write(region_id, header, put).unwrap() -} - -// Split the region according to the parameters -// return the updated original region -fn split_region( - router: &mut TestRouter, - region: metapb::Region, - peer: metapb::Peer, - split_region_id: u64, - split_peer: metapb::Peer, - left_key: &[u8], - right_key: &[u8], - propose_key: &[u8], - split_key: &[u8], - right_derive: bool, -) -> (metapb::Region, metapb::Region) { - let region_id = region.id; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.get_region_epoch().clone()); - req.mut_header().set_peer(peer); - - let mut split_id = pdpb::SplitId::new(); - split_id.new_region_id = split_region_id; - split_id.new_peer_ids = vec![split_peer.id]; - let admin_req = - new_batch_split_region_request(vec![propose_key.to_vec()], vec![split_id], right_derive); - req.mut_requests().clear(); - req.set_admin_request(admin_req); - - must_split(region_id, req, router); - - let (left, right) = if !right_derive { - ( - router.region_detail(region_id), - router.region_detail(split_region_id), - ) - } else { - ( - router.region_detail(split_region_id), - router.region_detail(region_id), - ) - }; - - // The end key of left region is `split_key` - // So writing `right_key` will fail - let resp = put(router, left.id, right_key); - assert!(resp.get_header().has_error(), "{:?}", resp); - // But `left_key` should succeed - let resp = put(router, left.id, left_key); - assert!(!resp.get_header().has_error(), "{:?}", resp); - - // Mirror of above case - let resp = put(router, right.id, left_key); - assert!(resp.get_header().has_error(), "{:?}", resp); - let 
resp = put(router, right.id, right_key); - assert!(!resp.get_header().has_error(), "{:?}", resp); - - assert_eq!(left.get_end_key(), split_key); - assert_eq!(right.get_start_key(), split_key); - assert_eq!(region.get_start_key(), left.get_start_key()); - assert_eq!(region.get_end_key(), right.get_end_key()); - - (left, right) -} +use crate::cluster::{split_helper::split_region, Cluster}; #[test] fn test_split() { @@ -141,8 +33,8 @@ fn test_split() { peer.clone(), 1000, new_peer(store_id, 10), - b"k11", - b"k33", + Some(b"k11"), + Some(b"k33"), b"k22", b"k22", false, @@ -174,8 +66,8 @@ fn test_split() { peer, 1001, new_peer(store_id, 11), - b"k00", - b"k11", + Some(b"k00"), + Some(b"k11"), b"k11", b"k11", false, @@ -215,8 +107,8 @@ fn test_split() { new_peer(store_id, 10), 1002, new_peer(store_id, 12), - b"k22", - b"k33", + Some(b"k22"), + Some(b"k33"), b"k33", b"k33", false, @@ -251,8 +143,8 @@ fn test_split() { new_peer(store_id, 12), 1003, new_peer(store_id, 13), - b"k33", - b"k55", + Some(b"k33"), + Some(b"k55"), split_key.as_encoded(), actual_split_key.as_encoded(), false, diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b8cf6006dee..b4cceb96a82 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -718,6 +718,11 @@ where self.batch.after_write_to_raft_db(&self.metrics); + fail_point!( + "async_write_before_cb", + !self.batch.persisted_cbs.is_empty(), + |_| () + ); self.batch.after_write_all(); fail_point!("raft_before_follower_send"); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 310c33b95b2..54bb7d0cc0b 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -106,7 +106,7 @@ use crate::{ type Key = Vec; pub const PENDING_MSG_CAP: usize = 100; -const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = 
Duration::from_secs(1); +pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region pub struct StoreInfo { From 09f9aac35ecdc6dc8aee2eb11d6aaccaeddd3e23 Mon Sep 17 00:00:00 2001 From: Zwb Date: Wed, 28 Dec 2022 12:00:16 +0800 Subject: [PATCH 436/676] modify raft gc log impl for witness (#13869) ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 210 ++++++++++++++++- components/raftstore/src/store/fsm/peer.rs | 118 +++++++++- components/raftstore/src/store/fsm/store.rs | 3 + components/raftstore/src/store/msg.rs | 3 + components/raftstore/src/store/peer.rs | 15 ++ tests/failpoints/cases/test_witness.rs | 167 +++++++++++++- tests/integrations/raftstore/test_witness.rs | 224 ++++++++++--------- 8 files changed, 628 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c510da6d77..8433f54c512 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2747,7 +2747,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e53d558bc6d7d8b7bb2d283cdf6dda52a2615632" +source = "git+https://github.com/pingcap/kvproto.git#ae3b086b09afbb26cebcd4c1fe14b82bbe1f0796" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index affa0205e8f..ec2d7bf72a8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -151,6 +151,7 @@ impl HeapSize for PendingCmd {} pub struct PendingCmdQueue { normals: VecDeque>, conf_change: Option>, + compacts: VecDeque>, } impl PendingCmdQueue { @@ -158,6 +159,7 @@ impl PendingCmdQueue { PendingCmdQueue { normals: VecDeque::new(), conf_change: None, + compacts: VecDeque::new(), } } @@ -190,6 
+192,23 @@ impl PendingCmdQueue { fn set_conf_change(&mut self, cmd: PendingCmd) { self.conf_change = Some(cmd); } + + fn push_compact(&mut self, cmd: PendingCmd) { + self.compacts.push_back(cmd); + } + + fn pop_compact(&mut self, index: u64) -> Option> { + let mut front = None; + while self.compacts.front().map_or(false, |c| c.index < index) { + front = self.compacts.pop_front(); + front.as_mut().unwrap().cb.take().unwrap(); + } + front + } + + fn has_compact(&mut self) -> bool { + !self.compacts.is_empty() + } } #[derive(Default, Debug)] @@ -281,6 +300,7 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, + PendingCompactCmd, } /// The possible returned value when applying logs. @@ -1488,7 +1508,8 @@ where | ExecResult::CompactLog { .. } | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } - | ExecResult::TransferLeader { .. } => {} + | ExecResult::TransferLeader { .. } + | ExecResult::PendingCompactCmd => {} ExecResult::SplitRegion { ref derived, .. } => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -1545,6 +1566,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_region_removed(self.region.get_id(), id, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), id, cmd); + } self.yield_state = None; let mut event = TraceEvent::default(); @@ -1562,6 +1586,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_stale_command(region_id, peer_id, self.term, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), peer_id, cmd); + } } fn clear_all_commands_silently(&mut self) { @@ -1571,6 +1598,9 @@ where if let Some(mut cmd) = self.pending_cmds.conf_change.take() { cmd.cb.take(); } + for mut cmd in self.pending_cmds.compacts.drain(..) 
{ + cmd.cb.take(); + } } } @@ -2937,13 +2967,86 @@ where )) } + fn try_compact_log( + &mut self, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) -> Result>> { + PEER_ADMIN_CMD_COUNTER.compact.all.inc(); + let first_index = entry_storage::first_index(&self.apply_state); + + if self.is_merging { + info!( + "in merging mode, skip compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + return Ok(None); + } + + // When the witness restarted, the pending compact cmd has been lost, so use + // `voter_replicated_index` for gc to avoid log accumulation. + if !self.pending_cmds.has_compact() { + if voter_replicated_index <= first_index { + debug!( + "voter_replicated_index <= first index, no need to compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "compact_index" => voter_replicated_index, + "first_index" => first_index, + ); + return Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index: 0, + has_pending: false, + })); + } + // compact failure is safe to be omitted, no need to assert. + compact_raft_log( + &self.tag, + &mut self.apply_state, + voter_replicated_index, + voter_replicated_term, + )?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + return Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: false, + })); + } + + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + // compact failure is safe to be omitted, no need to assert. 
+ compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + })) + } + None => { + info!( + "latest voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + Ok(None) + } + } + } + fn exec_compact_log( &mut self, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); - let compact_index = req.get_compact_log().get_compact_index(); + let mut compact_index = req.get_compact_log().get_compact_index(); let resp = AdminResponse::default(); let first_index = entry_storage::first_index(&self.apply_state); if compact_index <= first_index { @@ -2966,7 +3069,7 @@ where return Ok((resp, ApplyResult::None)); } - let compact_term = req.get_compact_log().get_compact_term(); + let mut compact_term = req.get_compact_log().get_compact_term(); // TODO: add unit tests to cover all the message integrity checks. if compact_term == 0 { info!( @@ -2981,6 +3084,41 @@ where )); } + let voter_replicated_index = req.get_compact_log().get_voter_replicated_index(); + // If there is any voter lagging behind, the log truncation of the witness + // shouldn't be triggered even if it's force mode(raft log size/count exceeds + // the threshold or raft engine purge), otherwise the witness can't help the + // lagging voter catch up logs when leader is down. In this situation Compact + // index should be queued. If witness receives a voter_replicated_index + // that is larger than the pending compact index, logs can be deleted. 
+ if self.peer.is_witness { + if voter_replicated_index < compact_index { + self.pending_cmds.push_compact(PendingCmd::new( + compact_index, + compact_term, + Callback::None, + )); + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + compact_index = cmd.index; + compact_term = cmd.term; + } + None => { + info!( + "voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "command" => ?req.get_compact_log() + ); + return Ok((resp, ApplyResult::Res(ExecResult::PendingCompactCmd))); + } + } + } else { + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take().unwrap(); + } + } + } // compact failure is safe to be omitted, no need to assert. compact_raft_log( &self.tag, @@ -3451,6 +3589,11 @@ where #[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + CheckCompact { + region_id: u64, + voter_replicated_index: u64, + voter_replicated_term: u64, + }, } impl Msg @@ -3498,6 +3641,17 @@ where } => write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::CheckCompact { + region_id, + voter_replicated_index, + voter_replicated_term, + } => { + write!( + f, + "[region {}] check compact, voter_replicated_index: {}, voter_replicated_term: {}", + region_id, voter_replicated_index, voter_replicated_term + ) + } } } } @@ -3542,6 +3696,11 @@ where // Whether destroy request is from its target region's snapshot merge_from_snapshot: bool, }, + Compact { + state: RaftTruncatedState, + first_index: u64, + has_pending: bool, + }, } pub struct ApplyFsm @@ -3947,6 +4106,34 @@ where cb.invoke_read(resp); } + fn check_pending_compact_log( + &mut self, + ctx: &mut ApplyContext, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) { + let res = self + .delegate + .try_compact_log(voter_replicated_index, voter_replicated_term); + 
match res { + Ok(res) => { + if let Some(res) = res { + ctx.prepare_for(&mut self.delegate); + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + ctx.finish_for(&mut self.delegate, VecDeque::new()); + ctx.notifier + .notify_one(self.delegate.region_id(), PeerMsg::ApplyRes { res }); + } + } + Err(e) => error!(?e; + "failed to compact log"; + "region_id" => self.delegate.region.get_id(), + "peer_id" => self.delegate.id(), + ), + } + } + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; @@ -4019,6 +4206,17 @@ where let delegate = &self.delegate as *const ApplyDelegate as *const u8; f(delegate) } + Msg::CheckCompact { + voter_replicated_index, + voter_replicated_term, + .. + } => { + self.check_pending_compact_log( + apply_ctx, + voter_replicated_index, + voter_replicated_term, + ); + } } } } @@ -4429,6 +4627,11 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => return, + Msg::CheckCompact { region_id, .. } => { + info!("target region is not found"; + "region_id" => region_id); + return; + } }, Either::Left(Err(TrySendError::Full(_))) => unreachable!(), }; @@ -4561,6 +4764,7 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::CheckCompact { .. 
} => 0, } } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1b484df5316..abd8fd84771 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -53,7 +53,7 @@ use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, - store::{find_peer, is_learner, region_on_same_stores}, + store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, @@ -1195,6 +1195,7 @@ where PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestVoterReplicatedIndex => self.on_request_voter_replicated_index(), } } @@ -1217,6 +1218,9 @@ where self.fsm.has_ready = true; } self.fsm.peer.maybe_gen_approximate_buckets(self.ctx); + if self.fsm.peer.is_witness() { + self.register_pull_voter_replicated_index_tick(); + } } fn on_gc_snap(&mut self, snaps: Vec<(SnapKey, bool)>) { @@ -2310,6 +2314,21 @@ where *is_ready = true; } } + ApplyTaskRes::Compact { + state, + first_index, + has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + // When the witness restarts, the pending compact cmds will be lost. We will try + // to use `voter_replicated_index` as the `compact index` to avoid log + // accumulation, but if `voter_replicated_index` is less than `first_index`, + // then gc is not needed. In this case, the `first_index` we pass back will be + // 0, and `has_pending` set to false. 
+ if first_index != 0 { + self.on_ready_compact_log(first_index, state); + } + } } if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); @@ -2667,6 +2686,53 @@ where ); } + fn on_voter_replicated_index_request(&mut self, from: &metapb::Peer) { + if !self.fsm.peer.is_leader() { + return; + } + let mut voter_replicated_idx = self.fsm.peer.get_store().last_index(); + for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if voter_replicated_idx > p.matched && !is_learner(peer) { + voter_replicated_idx = p.matched; + } + } + let first_index = self.fsm.peer.get_store().first_index(); + if voter_replicated_idx > first_index { + voter_replicated_idx = first_index; + } + let mut resp = ExtraMessage::default(); + resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); + resp.voter_replicated_index = voter_replicated_idx; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "leader responses voter_replicated_index to witness"; + "region_id" => self.region().get_id(), + "witness_id" => from.id, + "leader_id" => self.fsm.peer.peer.get_id(), + "voter_replicated_index" => voter_replicated_idx, + ); + } + + fn on_voter_replicated_index_response(&mut self, msg: &ExtraMessage) { + if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { + return; + } + let voter_replicated_index = msg.voter_replicated_index; + if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { + self.ctx.apply_router.schedule_task( + self.region_id(), + ApplyTask::CheckCompact { + region_id: self.region_id(), + voter_replicated_index, + voter_replicated_term, + }, + ) + } + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { @@ -2716,6 +2782,12 @@ where ExtraMessageType::MsgAvailabilityResponse => { 
self.on_availability_response(msg.get_from_peer(), msg.get_extra_msg()); } + ExtraMessageType::MsgVoterReplicatedIndexRequest => { + self.on_voter_replicated_index_request(msg.get_from_peer()); + } + ExtraMessageType::MsgVoterReplicatedIndexResponse => { + self.on_voter_replicated_index_response(msg.get_extra_msg()); + } } } @@ -3871,6 +3943,9 @@ where self.fsm.peer.schedule_raftlog_gc(self.ctx, compact_to); self.fsm.peer.last_compacted_idx = compact_to; self.fsm.peer.mut_store().on_compact_raftlog(compact_to); + if self.fsm.peer.is_witness() { + self.fsm.peer.last_compacted_time = Instant::now(); + } } fn on_ready_split_region( @@ -4897,6 +4972,10 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::PendingCompactCmd => { + self.fsm.peer.has_pending_compact_cmd = true; + self.register_pull_voter_replicated_index_tick(); + } } } @@ -5315,8 +5394,13 @@ where let first_idx = self.fsm.peer.get_store().first_index(); let last_idx = self.fsm.peer.get_store().last_index(); + let mut voter_replicated_idx = last_idx; let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if !is_learner(peer) && voter_replicated_idx > p.matched { + voter_replicated_idx = p.matched; + } if replicated_idx > p.matched { replicated_idx = p.matched; } @@ -5405,7 +5489,8 @@ where let region_id = self.fsm.peer.region().get_id(); let peer = self.fsm.peer.peer.clone(); let term = self.fsm.peer.get_index_term(compact_idx); - let request = new_compact_log_request(region_id, peer, compact_idx, term); + let request = + new_compact_log_request(region_id, peer, compact_idx, term, voter_replicated_idx); self.propose_raft_command_internal( request, Callback::None, @@ -5444,6 
+5529,27 @@ where self.register_check_long_uncommitted_tick(); } + fn on_request_voter_replicated_index(&mut self) { + if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { + return; + } + // TODO: make it configurable + if self.fsm.peer.last_compacted_time.elapsed() + > self.ctx.cfg.raft_log_gc_tick_interval.0 * 2 + { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); + let leader_id = self.fsm.peer.leader_id(); + let leader = self.fsm.peer.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &leader); + } + } + self.register_pull_voter_replicated_index_tick(); + } + fn register_check_leader_lease_tick(&mut self) { self.schedule_tick(PeerTick::CheckLeaderLease) } @@ -5979,6 +6085,10 @@ where } } + fn register_pull_voter_replicated_index_tick(&mut self) { + self.schedule_tick(PeerTick::RequestVoterReplicatedIndex); + } + fn on_check_peer_stale_state_tick(&mut self) { if self.fsm.peer.pending_remove { return; @@ -6421,6 +6531,7 @@ fn new_compact_log_request( peer: metapb::Peer, compact_index: u64, compact_term: u64, + voter_replicated_index: u64, ) -> RaftCmdRequest { let mut request = new_admin_request(region_id, peer); @@ -6428,6 +6539,9 @@ fn new_compact_log_request( admin.set_cmd_type(AdminCmdType::CompactLog); admin.mut_compact_log().set_compact_index(compact_index); admin.mut_compact_log().set_compact_term(compact_term); + admin + .mut_compact_log() + .set_voter_replicated_index(voter_replicated_index); request.set_admin_request(admin); request } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 54bb7d0cc0b..ceb8858046d 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -594,6 +594,9 @@ where self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::CheckPeersAvailability as 
usize].wait_duration = self.cfg.check_peers_availability_interval.0; + // TODO: make it reasonable + self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0 * 2; } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a4c6c435741..08b0e9367dc 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -375,6 +375,7 @@ pub enum PeerTick { ReportBuckets = 9, CheckLongUncommitted = 10, CheckPeersAvailability = 11, + RequestVoterReplicatedIndex = 12, } impl PeerTick { @@ -395,6 +396,7 @@ impl PeerTick { PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -412,6 +414,7 @@ impl PeerTick { PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, PeerTick::CheckPeersAvailability, + PeerTick::RequestVoterReplicatedIndex, ]; TICKS } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a72bb59d8bf..7752a0a1b0e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -939,6 +939,15 @@ where /// The index of last compacted raft log. It is used for the next compact /// log task. pub last_compacted_idx: u64, + /// Record the time of the last raft log compact, the witness should query + /// the leader periodically whether `voter_replicated_index` is updated + /// if CompactLog admin command isn't triggered for a while. + pub last_compacted_time: Instant, + /// When the peer is witness, and there is any voter lagging behind, the + /// log truncation of the witness shouldn't be triggered even if it's + /// force mode, and this item will be set to `true`, after all pending + /// compact cmds have been handled, it will be set to `false`. 
+ pub has_pending_compact_cmd: bool, /// The index of the latest urgent proposal index. last_urgent_proposal_idx: u64, /// The index of the latest committed split command. @@ -1083,6 +1092,10 @@ where let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + // In order to avoid excessive log accumulation due to the loss of pending + // compaction cmds after the witness is restarted, it will actively pull + // voter_request_index once at start. + let has_pending_compact_cmd = peer.is_witness; let mut peer = Peer { peer, @@ -1118,6 +1131,8 @@ where tag: tag.clone(), last_applying_idx: applied_index, last_compacted_idx: 0, + last_compacted_time: Instant::now(), + has_pending_compact_cmd, last_urgent_proposal_idx: u64::MAX, last_committed_split_idx: 0, last_sent_snapshot_idx: 0, diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index cee75ff44b9..98a845b7016 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -2,11 +2,12 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::metapb; +use kvproto::{metapb, raft_serverpb::RaftApplyState}; use pd_client::PdClient; use test_raftstore::*; -use tikv_util::store::find_peer; +use tikv_util::{config::ReadableDuration, store::find_peer}; fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { peer.set_role(metapb::PeerRole::Learner); @@ -69,3 +70,165 @@ fn test_witness_update_region_in_local_reader() { fail::remove("change_peer_after_update_region_store_3"); } + +// Test the case witness pull voter_replicated_index when has pending compact +// cmd. 
+#[test] +fn test_witness_raftlog_gc_pull_voter_replicated_index() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + 
cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} + +// Test the case witness gc raftlog after reboot. +#[test] +fn test_witness_raftlog_gc_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), 
format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + + // the witness is down + cluster.stop_node(nodes[2]); + std::thread::sleep(Duration::from_millis(100)); + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index a2518cc64ae..301a743588e 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -2,8 +2,13 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::{metapb, raft_cmdpb::ChangePeerRequest, raft_serverpb::PeerState}; +use kvproto::{ + metapb, + raft_cmdpb::ChangePeerRequest, + raft_serverpb::{PeerState, RaftApplyState}, +}; use pd_client::PdClient; use 
raft::eraftpb::ConfChangeType; use test_raftstore::*; @@ -296,127 +301,134 @@ fn test_witness_conf_change() { // } // } -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index won't advance when there is a witness -// even // if the gap gap exceeds the gc count limit -// #[test] -// fn test_witness_raftlog_gc_lagged_follower() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); +// Test the case that truncated index won't advance when there is a witness even +// if the gap gap exceeds the gc count limit +#[test] +fn test_witness_raftlog_gc_lagged_follower() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); -// cluster.must_put(b"k0", b"v0"); + cluster.must_put(b"k0", b"v0"); -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut 
peer_on_store3); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } -// // one follower is down -// cluster.stop_node(nodes[1]); + // one follower is down + cluster.stop_node(nodes[1]); -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } -// // the truncated index is not advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert!(state.get_truncated_state(). 
-// get_index() - before_states[&id].get_index() < 10); } - -// // the follower is back online -// cluster.run_node(nodes[1]).unwrap(); -// cluster.must_put(b"k00", b"v00"); -// must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(300)); - -// // the truncated index is advanced now, as all the peers has replicated -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index is advance when there is a lagged -// witness #[test] -// fn test_witness_raftlog_gc_lagged_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + 
assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); +// Test the case that truncated index is advance when there is a lagged witness +#[test] +fn test_witness_raftlog_gc_lagged_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + cluster.must_put(b"k0", b"v0"); -// // the witness is down -// cluster.stop_node(nodes[2]); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = 
HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } + // the witness is down + cluster.stop_node(nodes[2]); -// // the witness is back online -// cluster.run_node(nodes[2]).unwrap(); + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } -// cluster.must_put(b"k00", b"v00"); -// std::thread::sleep(Duration::from_millis(200)); + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); -// // the truncated index is advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); println!("{} {}", id, -// state.get_truncated_state().get_index()); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } + cluster.must_put(b"k00", b"v00"); + std::thread::sleep(Duration::from_millis(200)); + + // the truncated index is advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} // Test the case replica read can't be performed on witness peer. 
#[test] From 177efafee39a7f1cf7cbc6330d834cdbbe42a657 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 27 Dec 2022 23:52:16 -0800 Subject: [PATCH 437/676] raftstore-v2: a few panic fix (#13996) ref tikv/tikv#12842 a few panic fix 1) update_approximate_raft_log_size may run into divid by zero error 2) appy_delete may have None write_batch 3) StoreMeta::set_region may run into region corruption error if it's destroyed and re-created. 4) TabletSnapManager's snapshot size calculation may throw Other error. Signed-off-by: qi.xu Signed-off-by: Jay Lee Co-authored-by: qi.xu Co-authored-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/fsm/store.rs | 14 ++++++++++++++ .../raftstore-v2/src/operation/command/mod.rs | 5 +++++ .../src/operation/command/write/mod.rs | 1 + components/raftstore-v2/src/operation/life.rs | 6 ++++++ components/raftstore-v2/src/worker/pd/mod.rs | 8 +++++++- .../raftstore-v2/src/worker/pd/store_heartbeat.rs | 7 +++++-- 7 files changed, 39 insertions(+), 3 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index a3800085522..800dbc98f91 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -535,6 +535,7 @@ impl StoreSystem { causal_ts_provider, self.logger.clone(), self.shutdown.clone(), + cfg.clone(), )); let split_check_scheduler = workers.background.start( diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index cb7aa99b179..f107715a535 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -80,6 +80,20 @@ impl StoreMeta { ); } } + + pub fn remove_region(&mut self, region_id: u64) { + let prev = self.regions.remove(®ion_id); + if let Some((prev, initialized)) = prev { + if initialized { + let key = ( + data_end_key(prev.get_end_key()), + 
prev.get_region_epoch().get_version(), + ); + let prev_id = self.region_ranges.remove(&key); + assert_eq!(prev_id, Some(prev.get_id())); + } + } + } } impl StoreRegionMeta for StoreMeta { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 35b4ec1918e..0a58bb64016 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -245,6 +245,11 @@ impl Peer { let apply = CommittedEntries { entry_and_proposals, }; + assert!( + self.apply_scheduler().is_some(), + "apply_scheduler should be something. region_id {}", + self.region_id() + ); self.apply_scheduler() .unwrap() .send(ApplyTask::CommittedEntries(apply)); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index ad6e537b956..af806e3024e 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -177,6 +177,7 @@ impl Apply { } util::check_key_in_region(key, self.region_state().get_region())?; keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector self.write_batch.as_mut().unwrap().delete(&self.key_buffer) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index ea42832eaea..0f2e72061ef 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -331,6 +331,12 @@ impl Peer { pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); ctx.router.close(self.region_id()); + { + ctx.store_meta + .lock() + .unwrap() + .remove_region(self.region_id()); + } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create 
a // new peer. Ignore error as it's just a best effort. diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index cc977e68236..bfcf3389754 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -11,9 +11,12 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; -use raftstore::store::{util::KeysInfoFormatter, FlowStatsReporter, ReadStats, TxnExt, WriteStats}; +use raftstore::store::{ + util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TxnExt, WriteStats, +}; use slog::{error, info, Logger}; use tikv_util::{ + config::VersionTrack, time::UnixSecs, worker::{Runnable, Scheduler}, }; @@ -122,6 +125,7 @@ where logger: Logger, shutdown: Arc, + cfg: Arc>, } impl Runner @@ -141,6 +145,7 @@ where causal_ts_provider: Option>, // used for rawkv apiv2 logger: Logger, shutdown: Arc, + cfg: Arc>, ) -> Self { Self { store_id, @@ -158,6 +163,7 @@ where causal_ts_provider, logger, shutdown, + cfg, } } } diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 2fbe378cff8..22bee3cbf26 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -272,8 +272,11 @@ where Ok(stats) => stats, }; let disk_cap = disk_stats.total_space(); - // TODO: custom capacity. - let capacity = disk_cap; + let capacity = if self.cfg.value().capacity.0 == 0 { + disk_cap + } else { + std::cmp::min(disk_cap, self.cfg.value().capacity.0) + }; // TODO: accurate snapshot size and kv engines size. 
let snap_size = 0; let kv_size = 0; From 06bfaa42a120d1c2cefa5515810a699b3abd458b Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 30 Dec 2022 10:40:19 +0800 Subject: [PATCH 438/676] raftstore-v2: avoid ticking when there are many unapplied logs (#13995) ref tikv/tikv#12842 Whenever timeout, the peer will check for unapplied logs whether there are pending conf change and trigger heavy reads. So we wait till most logs are applied before ticking. It also fix following issues: - PersistenceListener is not installed - implementation of persisted_apply_index is wrong - parse tablet name is wrong Signed-off-by: Jay Lee --- components/engine_rocks/src/event_listener.rs | 2 +- components/engine_traits/src/flush.rs | 8 +- components/engine_traits/src/tablet.rs | 23 +++++- components/raftstore-v2/src/fsm/apply.rs | 3 + components/raftstore-v2/src/fsm/peer.rs | 19 ++--- .../operation/command/admin/compact_log.rs | 42 +++++----- .../operation/command/admin/conf_change.rs | 2 +- .../src/operation/command/admin/mod.rs | 10 ++- .../src/operation/command/admin/split.rs | 21 +++-- .../src/operation/command/control.rs | 5 ++ .../raftstore-v2/src/operation/command/mod.rs | 28 ++++++- .../src/operation/ready/apply_trace.rs | 55 ++++++++++--- .../raftstore-v2/src/operation/ready/mod.rs | 78 ++++++++++++++---- .../src/operation/ready/snapshot.rs | 51 +++++++++--- components/raftstore-v2/src/raft/apply.rs | 12 ++- components/raftstore-v2/src/raft/peer.rs | 44 ++++++++--- components/raftstore-v2/src/raft/storage.rs | 21 ++--- .../src/router/internal_message.rs | 1 + .../integrations/test_transfer_leader.rs | 63 ++++++++------- components/raftstore/src/store/metrics.rs | 1 + components/raftstore/src/store/snap.rs | 7 +- components/server/src/server2.rs | 79 ++++++++++--------- src/config/mod.rs | 3 + src/server/raftkv2/node.rs | 34 ++++---- 24 files changed, 417 insertions(+), 195 deletions(-) diff --git a/components/engine_rocks/src/event_listener.rs 
b/components/engine_rocks/src/event_listener.rs index b940fcb39f3..3bbf03cb77f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -261,7 +261,7 @@ mod tests { let (region_id, tablet_index) = (2, 3); let storage = Arc::new(MemStorage::default()); - let state = Arc::new(FlushState::default()); + let state = Arc::new(FlushState::new(0)); let listener = PersistenceListener::new(region_id, tablet_index, state.clone(), storage.clone()); let mut db_opt = RocksDbOptions::default(); diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index cfed95f0426..b3a827c234e 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -50,12 +50,18 @@ impl FlushProgress { /// raftstore will update state changes and corresponding apply index, when /// flush, `PersistenceListener` will query states related to the memtable /// and persist the relation to raft engine. -#[derive(Default, Debug)] +#[derive(Debug)] pub struct FlushState { applied_index: AtomicU64, } impl FlushState { + pub fn new(applied_index: u64) -> Self { + Self { + applied_index: AtomicU64::new(applied_index), + } + } + /// Set the latest applied index. #[inline] pub fn set_applied_index(&self, index: u64) { diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index edc0bd99870..f552fbc01aa 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -222,10 +222,20 @@ impl TabletRegistry { }) } + /// Format the name as {prefix}_{id}_{suffix}. If prefix is empty, it will + /// be format as {id}_{suffix}. 
pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { - format!("{}{}_{}", prefix, id, suffix) + format!( + "{}{:_(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { let name = path.file_name().unwrap().to_str().unwrap(); let mut parts = name.rsplit('_'); @@ -463,10 +473,19 @@ mod tests { }); assert_eq!(count, 1); - let name = registry.tablet_name("prefix_", 12, 30); + let name = registry.tablet_name("prefix", 12, 30); assert_eq!(name, "prefix_12_30"); let normal_name = registry.tablet_name("", 20, 15); let normal_tablet_path = registry.tablet_path(20, 15); assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + + let full_prefix_path = registry.tablet_root().join(name); + let res = registry.parse_tablet_name(&full_prefix_path); + assert_eq!(res, Some(("prefix", 12, 30))); + let res = registry.parse_tablet_name(&normal_tablet_path); + assert_eq!(res, Some(("", 20, 15))); + let invalid_path = registry.tablet_root().join("invalid_12"); + let res = registry.parse_tablet_name(&invalid_path); + assert_eq!(res, None); } } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index c0eabd2120e..b81d31329cb 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -65,6 +65,7 @@ impl ApplyFsm { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); @@ -76,6 +77,7 @@ impl ApplyFsm { read_scheduler, flush_state, log_recovery, + applied_term, logger, ); ( @@ -114,6 +116,7 @@ impl ApplyFsm { ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), + ApplyTask::ManualFlush => self.apply.on_manual_flush(), } // TODO: yield after 
some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 49f1efcb760..8b05435246b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -43,7 +43,11 @@ impl PeerFsm { storage: Storage, ) -> Result> { let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; - info!(peer.logger, "create peer"); + info!(peer.logger, "create peer"; + "raft_state" => ?peer.storage().raft_state(), + "apply_state" => ?peer.storage().apply_state(), + "region_state" => ?peer.storage().region_state() + ); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { peer, @@ -187,20 +191,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_start(&mut self) { - self.schedule_tick(PeerTick::Raft); + if !self.fsm.peer.maybe_pause_for_recovery() { + self.schedule_tick(PeerTick::Raft); + } self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } - // Unlike v1, it's a must to set ready when there are pending entries. Otherwise - // it may block for ever when there is unapplied conf change. - let entry_storage = self.fsm.peer.storage().entry_storage(); - if entry_storage.commit_index() > entry_storage.applied_index() - // Speed up setup if there is only one peer. - || self.fsm.peer.is_leader() - { + // Speed up setup if there is only one peer. 
+ if self.fsm.peer.is_leader() { self.fsm.peer.set_has_ready(); } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index d1d10d366bf..c36c7353871 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -21,7 +21,7 @@ use raftstore::{ Result, }; use slog::{debug, error, info}; -use tikv_util::{box_err, Either}; +use tikv_util::box_err; use crate::{ batch::StoreContext, @@ -255,7 +255,15 @@ impl Peer { .unwrap(); self.set_has_extra_write(); - self.maybe_compact_log_from_engine(store_ctx, Either::Right(old_truncated)); + // All logs < perssited_apply will be deleted, so should check with +1. + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() { + self.compact_log_from_engine(store_ctx); + } + + let applied = *self.last_applying_index_mut(); + let total_cnt = applied - old_truncated; + let remain_cnt = applied - res.compact_index; + self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); } #[inline] @@ -278,7 +286,9 @@ impl Peer { } else { self.set_has_extra_write(); } - self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); + if old_persisted < self.entry_storage().truncated_index() + 1 { + self.compact_log_from_engine(store_ctx); + } if self.remove_tombstone_tablets_before(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); task.persisted_cbs.push(Box::new(move || { @@ -288,19 +298,10 @@ impl Peer { } } - pub fn maybe_compact_log_from_engine( - &mut self, - store_ctx: &mut StoreContext, - old_index: Either, - ) { - let truncated = self.entry_storage().truncated_index(); - let persisted = self.storage().apply_trace().persisted_apply_index(); - match old_index { - Either::Left(old_persisted) if old_persisted >= truncated => return, - Either::Right(old_truncated) if old_truncated >= persisted 
=> return, - _ => {} - } - let compact_index = std::cmp::min(truncated, persisted); + fn compact_log_from_engine(&mut self, store_ctx: &mut StoreContext) { + let truncated = self.entry_storage().truncated_index() + 1; + let persisted_applied = self.storage().apply_trace().persisted_apply_index(); + let compact_index = std::cmp::min(truncated, persisted_applied); // Raft Engine doesn't care about first index. if let Err(e) = store_ctx @@ -309,11 +310,12 @@ impl Peer { { error!(self.logger, "failed to compact raft logs"; "err" => ?e); } else { + // TODO: make this debug when stable. + info!(self.logger, "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state()); self.set_has_extra_write(); - let applied = self.storage().apply_state().get_applied_index(); - let total_cnt = applied - self.storage().entry_storage().first_index() + 1; - let remain_cnt = applied - compact_index; - self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 5a6c91d3567..72b582d775d 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -232,7 +232,7 @@ impl Apply { ) -> Result<(AdminResponse, AdminCmdResult)> { let region = self.region_state().get_region(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); - info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch()); + info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch(), "index" => index); let mut new_region = region.clone(); match change_kind { ConfChangeKind::LeaveJoint => self.apply_leave_joint(&mut new_region), diff --git 
a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 977e26e0675..4f2abb9c65e 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -110,9 +110,13 @@ impl Peer { } }; match &res { - Ok(index) => self - .proposal_control_mut() - .record_proposed_admin(cmd_type, *index), + Ok(index) => { + self.proposal_control_mut() + .record_proposed_admin(cmd_type, *index); + if self.proposal_control_mut().has_uncommitted_admin() { + self.raft_group_mut().skip_bcast_commit(false); + } + } Err(e) => { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index e1f4ae552f6..faf059b3871 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -65,7 +65,7 @@ use crate::{ Error, }; -pub const SPLIT_PREFIX: &str = "split_"; +pub const SPLIT_PREFIX: &str = "split"; #[derive(Debug)] pub struct SplitResult { @@ -171,6 +171,9 @@ impl Peer { pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { let control = self.split_flow_control_mut(); control.size_diff_hint += metrics.size_diff_hint; + if self.is_leader() { + self.add_pending_tick(PeerTick::SplitRegionCheck); + } } pub fn on_request_split( @@ -265,6 +268,7 @@ impl Apply { self.logger, "split region"; "region" => ?region, + "index" => log_index, "boundaries" => %KeysInfoFormatter(boundaries.iter()), ); @@ -449,6 +453,8 @@ impl Peer { // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. self.report_batch_split_pd(store_ctx, res.regions.to_vec()); + // After split, the peer may need to update its metrics. 
+ self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } @@ -629,7 +635,7 @@ mod test { kv::TestTabletFactory, }; use engine_traits::{ - Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, }; use kvproto::{ metapb::RegionEpoch, @@ -787,8 +793,9 @@ mod test { reporter, reg, read_scheduler, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger.clone(), ); @@ -803,7 +810,7 @@ mod test { splits.mut_requests().clear(); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 6).unwrap_err(); // Empty requests should be rejected. assert!(err.to_string().contains("missing split requests")); @@ -824,7 +831,7 @@ mod test { .mut_requests() .push(new_split_req(b"", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 7).unwrap_err(); // Empty key will not in any region exclusively. assert!(err.to_string().contains("missing split key"), "{:?}", err); @@ -836,7 +843,7 @@ mod test { .mut_requests() .push(new_split_req(b"k1", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 8).unwrap_err(); // keys should be in ascend order. assert!( err.to_string().contains("invalid split request"), @@ -852,7 +859,7 @@ mod test { .mut_requests() .push(new_split_req(b"k2", 1, vec![11, 12])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 9).unwrap_err(); // All requests should be checked. 
assert!(err.to_string().contains("id count"), "{:?}", err); diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index b330d0093fe..fd53090fd65 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -181,6 +181,11 @@ impl ProposalControl { } } + #[inline] + pub fn has_uncommitted_admin(&self) -> bool { + !self.proposed_admin_cmd.is_empty() && !self.proposed_admin_cmd.back().unwrap().committed + } + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { while !self.proposed_admin_cmd.is_empty() { let cmd = self.proposed_admin_cmd.front_mut().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0a58bb64016..a533ae9af87 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -38,13 +38,14 @@ use raftstore::{ }, Error, Result, }; +use slog::{info, warn}; use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel}, + router::{ApplyRes, ApplyTask, CmdResChannel, PeerTick}, }; mod admin; @@ -111,6 +112,7 @@ impl Peer { read_scheduler, self.flush_state().clone(), self.storage().apply_trace().log_recovery(), + self.entry_storage().applied_term(), logger, ); @@ -306,6 +308,22 @@ impl Peer { apply_res.applied_index, progress_to_be_updated, ); + if self.pause_for_recovery() + && self.storage().entry_storage().commit_index() <= apply_res.applied_index + { + info!(self.logger, "recovery completed"; "apply_index" => apply_res.applied_index); + self.set_pause_for_recovery(false); + // Flush to avoid recover again and again. 
+ if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + self.add_pending_tick(PeerTick::Raft); + } + if !self.pause_for_recovery() && self.storage_mut().apply_trace_mut().should_flush() { + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + } } } @@ -347,6 +365,13 @@ impl Apply { } } + pub fn on_manual_flush(&mut self) { + self.flush(); + if let Err(e) = self.tablet().flush_cfs(&[], false) { + warn!(self.logger, "failed to flush: {:?}", e); + } + } + #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); @@ -512,6 +537,7 @@ impl Apply { #[inline] pub fn flush(&mut self) { + // TODO: maybe we should check whether there is anything to flush. let (index, term) = self.apply_progress(); let flush_state = self.flush_state().clone(); if let Some(wb) = &mut self.write_batch && !wb.is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d6a83b7933b..1e9d1ef4221 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -40,7 +40,7 @@ use kvproto::{ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::Logger; +use slog::{trace, Logger}; use tikv_util::{box_err, worker::Scheduler}; use crate::{ @@ -130,7 +130,7 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to an u64 index. pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default)] +#[derive(Clone, Copy, Default, Debug)] struct Progress { flushed: u64, /// The index of last entry that has modification to the CF. @@ -154,7 +154,7 @@ pub fn cf_offset(cf: &str) -> usize { /// interact with other peers will be traced. 
/// - support query the flushed progress without actually scanning raft engine, /// which is useful for cleaning up stale flush records. -#[derive(Default)] +#[derive(Default, Debug)] pub struct ApplyTrace { /// The modified indexes and flushed index of each data CF. data_cfs: Box<[Progress; DATA_CFS_LEN]>, @@ -168,6 +168,10 @@ pub struct ApplyTrace { admin: Progress, /// Index that is issued to be written. It may not be truely persisted. persisted_applied: u64, + /// Flush will be triggered explicitly when there are too many pending + /// writes. It marks the last index that is flushed to avoid too many + /// flushes. + last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, } @@ -187,6 +191,7 @@ impl ApplyTrace { trace.admin.flushed = i; trace.admin.last_modified = i; trace.persisted_applied = i; + trace.last_flush_trigger = i; let applied_region_state = engine .get_region_state(region_id, trace.admin.flushed)? .unwrap(); @@ -218,7 +223,31 @@ impl ApplyTrace { } pub fn persisted_apply_index(&self) -> u64 { - self.admin.flushed + self.persisted_applied + } + + pub fn should_flush(&mut self) -> bool { + if self.admin.flushed < self.admin.last_modified { + // It's waiting for other peers, flush will not help. + return false; + } + let last_modified = self + .data_cfs + .iter() + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.last_modified) + } else { + None + } + }) + .max(); + if let Some(m) = last_modified && m >= self.admin.flushed + 4096 && m >= self.last_flush_trigger + 4096 { + self.last_flush_trigger = m; + true + } else { + false + } } // All events before `mem_index` must be consumed before calling this function. 
@@ -456,6 +485,7 @@ impl Storage { impl Peer { pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + trace!(self.logger, "data flushed"; "cf" => cf, "tablet_index" => tablet_index, "index" => index, "trace" => ?self.storage().apply_trace()); if tablet_index < self.storage().tablet_index() { // Stale tablet. return; @@ -467,6 +497,7 @@ impl Peer { } pub fn on_data_modified(&mut self, modification: DataTrace) { + trace!(self.logger, "on data modified"; "modification" => ?modification, "trace" => ?self.storage().apply_trace()); let apply_index = self.storage().entry_storage().applied_index(); let apply_trace = self.storage_mut().apply_trace_mut(); for (cf, index) in DATA_CFS.iter().zip(modification) { @@ -556,22 +587,22 @@ mod tests { #[test] fn test_apply_trace() { let mut trace = ApplyTrace::default(); - assert_eq!(0, trace.persisted_apply_index()); + assert_eq!(0, trace.admin.flushed); // If there is no modifications, index should be advanced anyway. trace.maybe_advance_admin_flushed(2); - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 3); } trace.maybe_advance_admin_flushed(3); // Modification is not flushed. - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 3); } trace.maybe_advance_admin_flushed(3); // No admin is recorded, index should be advanced. - assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_modify(4); for cf in DATA_CFS { trace.on_flush(cf, 4); @@ -581,25 +612,25 @@ mod tests { } trace.maybe_advance_admin_flushed(4); // Unflushed admin modification should hold index. - assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_flush(4); trace.maybe_advance_admin_flushed(4); // Admin is flushed, index should be advanced. 
- assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 5); } trace.maybe_advance_admin_flushed(4); // Though all data CFs are flushed, but index should not be // advanced as we don't know whether there is admin modification. - assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 5); } trace.maybe_advance_admin_flushed(5); // Because modify is recorded, so we know there should be no admin // modification and index can be advanced. - assert_eq!(5, trace.persisted_apply_index()); + assert_eq!(5, trace.admin.flushed); } #[test] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 8b125844d0e..8a0e0770b1f 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -32,7 +32,7 @@ use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, }; -use slog::{debug, error, trace, warn}; +use slog::{debug, error, info, trace, warn}; use tikv_util::{ store::find_peer, time::{duration_to_sec, monotonic_raw_now}, @@ -50,6 +50,8 @@ use crate::{ router::{ApplyTask, PeerMsg, PeerTick}, }; +const PAUSE_FOR_RECOVERY_GAP: u64 = 128; + impl Store { pub fn on_store_unreachable( &mut self, @@ -76,9 +78,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } impl Peer { + pub fn maybe_pause_for_recovery(&mut self) -> bool { + let entry_storage = self.storage().entry_storage(); + let committed_index = entry_storage.commit_index(); + let applied_index = entry_storage.applied_index(); + if committed_index > applied_index { + // Unlike v1, it's a must to set ready when there are pending entries. Otherwise + // it may block for ever when there is unapplied conf change. 
+ self.set_has_ready(); + } + if committed_index > applied_index + PAUSE_FOR_RECOVERY_GAP { + // If there are too many the missing logs, we need to skip ticking otherwise + // it may block the raftstore thread for a long time in reading logs for + // election timeout. + info!(self.logger, "pause for recovery"; "applied" => applied_index, "committed" => committed_index); + self.set_pause_for_recovery(true); + true + } else { + false + } + } + #[inline] fn tick(&mut self) -> bool { - self.raft_group_mut().tick() + // When it's handling snapshot, it's pointless to tick as all the side + // affects have to wait till snapshot is applied. On the other hand, ticking + // will bring other corner cases like elections. + !self.is_handling_snapshot() && self.raft_group_mut().tick() } pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { @@ -107,6 +133,10 @@ impl Peer { "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), ); + if self.pause_for_recovery() && msg.get_message().get_msg_type() == MessageType::MsgAppend { + ctx.raft_metrics.message_dropped.recovery.inc(); + return; + } if !self.serving() { return; } @@ -273,31 +303,44 @@ impl Peer { ) { // TODO: skip handling committed entries if a snapshot is being applied // asynchronously. - if self.is_leader() { + let mut update_lease = self.is_leader(); + if update_lease { for entry in committed_entries.iter().rev() { self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); - let propose_time = self - .proposals() - .find_propose_time(entry.get_term(), entry.get_index()); - if let Some(propose_time) = propose_time { - // We must renew current_time because this value may be created a long time ago. - // If we do not renew it, this time may be smaller than propose_time of a - // command, which was proposed in another thread while this thread receives its - // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
- ctx.current_time.replace(monotonic_raw_now()); - ctx.raft_metrics.commit_log.observe(duration_to_sec( - (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), - )); - self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); - break; + if update_lease { + let propose_time = self + .proposals() + .find_propose_time(entry.get_term(), entry.get_index()); + if let Some(propose_time) = propose_time { + // We must renew current_time because this value may be created a long time + // ago. If we do not renew it, this time may be + // smaller than propose_time of a command, which was + // proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. + ctx.current_time.replace(monotonic_raw_now()); + ctx.raft_metrics.commit_log.observe(duration_to_sec( + (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), + )); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); + update_lease = false; + } } } } + let applying_index = committed_entries.last().unwrap().index; + let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); + *self.last_applying_index_mut() = applying_index; if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. self.entry_storage_mut().evict_entry_cache(false); } self.schedule_apply_committed_entries(committed_entries); + if self.is_leader() + && commit_to_current_term + && !self.proposal_control().has_uncommitted_admin() + { + self.raft_group_mut().skip_bcast_commit(true); + } } /// Processing the ready of raft. 
A detail description of how it's handled @@ -525,6 +568,7 @@ impl Peer { self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); + self.add_pending_tick(PeerTick::SplitRegionCheck); } StateRole::Follower => { self.leader_lease_mut().expire(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 41dc0d39429..c040bdcbb3b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -37,7 +37,7 @@ use raftstore::{ coprocessor::RegionChangeEvent, store::{ metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; use slog::{error, info, warn}; @@ -197,19 +197,24 @@ impl Peer { StateRole::Follower, ); let persisted_index = self.persisted_index(); - let first_index = self.storage().entry_storage().first_index(); - if first_index == persisted_index + 1 { + *self.last_applying_index_mut() = persisted_index; + let snapshot_index = self.entry_storage().truncated_index(); + assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); + // If leader sends a message append to the follower while it's applying + // snapshot (via split init for example), the persisted_index may be larger + // than the first index. But as long as first index is not larger, the + // latest snapshot should be applied. + if snapshot_index <= persisted_index { let region_id = self.region_id(); - self.reset_flush_state(); + self.reset_flush_state(snapshot_index); let flush_state = self.flush_state().clone(); - let mut tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.record_tablet_as_tombstone_and_refresh(persisted_index, ctx); - self.schedule_apply_fsm(ctx); + self.record_tablet_as_tombstone_and_refresh(snapshot_index, ctx); self.storage_mut().on_applied_snapshot(); - self.raft_group_mut().advance_apply_to(persisted_index); + self.raft_group_mut().advance_apply_to(snapshot_index); { let mut meta = ctx.store_meta.lock().unwrap(); meta.set_region(self.region(), true, &self.logger); @@ -218,18 +223,18 @@ impl Peer { meta.region_read_progress .insert(region_id, self.read_progress().clone()); } - self.read_progress_mut() - .update_applied_core(persisted_index); + self.read_progress_mut().update_applied_core(snapshot_index); let split = self.storage_mut().split_init_mut().take(); if split.as_ref().map_or(true, |s| { - !s.scheduled || persisted_index != RAFT_INIT_LOG_INDEX + !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX }) { info!(self.logger, "apply tablet snapshot completely"); } if let Some(init) = split { - info!(self.logger, "init with snapshot finished"); + info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } + self.schedule_apply_fsm(ctx); } } } @@ -343,6 +348,15 @@ impl Storage { /// Validate the snapshot. Returns true if it's valid. fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { let idx = snap.get_metadata().get_index(); + if idx < RAFT_INIT_LOG_INDEX || snap.get_metadata().get_term() < RAFT_INIT_LOG_TERM { + info!( + self.logger(), + "corrupted snapshot detected, generate again"; + "snap" => ?snap, + "request_index" => request_index, + ); + return false; + } // TODO(nolouch): check tuncated index if idx < request_index { // stale snapshot, should generate again. 
@@ -489,8 +503,21 @@ impl Storage { )); } + let old_last_index = self.entry_storage().last_index(); + if self.entry_storage().first_index() <= old_last_index { + // All states are rewritten in the following blocks. Stale states will be + // cleaned up by compact worker. + task.cut_logs = Some((0, old_last_index + 1)); + self.entry_storage_mut().clear(); + } + let last_index = snap.get_metadata().get_index(); let last_term = snap.get_metadata().get_term(); + assert!( + last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, + "{:?}", + self.logger().list() + ); let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); region_state.set_region(region); diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 666f3adb699..6818d7ae0d9 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -34,6 +34,9 @@ pub struct Apply { /// command. tombstone: bool, applied_term: u64, + // Apply progress is set after every command in case there is a flush. But it's + // wrong to update flush_state immediately as a manual flush from other thread + // can fetch the wrong apply index from flush_state. applied_index: u64, /// The largest index that have modified each column family. 
modifications: DataTrace, @@ -64,11 +67,15 @@ impl Apply { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) .unwrap(); + assert_ne!(applied_term, 0, "{:?}", logger.list()); + let applied_index = flush_state.applied_index(); + assert_ne!(applied_index, 0, "{:?}", logger.list()); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), @@ -76,7 +83,7 @@ impl Apply { write_batch: None, callbacks: vec![], tombstone: false, - applied_term: 0, + applied_term, applied_index: flush_state.applied_index(), modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], @@ -125,9 +132,6 @@ impl Apply { let log_recovery = self.log_recovery.as_ref().unwrap(); if log_recovery.iter().all(|v| index >= *v) { self.log_recovery.take(); - // Now all logs are recovered, flush them to avoid recover again - // and again. - let _ = self.tablet.flush_cfs(&[], false); } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 668b0ebf41d..f3734b6821d 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -62,6 +62,7 @@ pub struct Peer { /// For raft log compaction. skip_compact_log_ticks: usize, approximate_raft_log_size: u64, + last_applying_index: u64, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. @@ -73,6 +74,7 @@ pub struct Peer { has_ready: bool, /// Sometimes there is no ready at all, but we need to trigger async write. has_extra_write: bool, + pause_for_recovery: bool, /// Writer for persisting side effects asynchronously. 
pub(crate) async_writer: AsyncWriter, @@ -133,7 +135,7 @@ impl Peer { let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); - let flush_state: Arc = Arc::default(); + let flush_state: Arc = Arc::new(FlushState::new(applied_index)); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. @@ -155,12 +157,14 @@ impl Peer { peer_heartbeats: HashMap::default(), skip_compact_log_ticks: 0, approximate_raft_log_size: 0, + last_applying_index: raft_group.store().apply_state().get_applied_index(), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), apply_scheduler: None, has_ready: false, has_extra_write: false, + pause_for_recovery: false, destroy_progress: DestroyProgress::None, raft_group, logger, @@ -366,14 +370,17 @@ impl Peer { /// Returns if there's any tombstone being removed. 
#[inline] pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { - let mut removed = 0; - while let Some(i) = self.pending_tombstone_tablets.first() - && *i <= persisted - { - removed += 1; + let removed = self + .pending_tombstone_tablets + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + self.pending_tombstone_tablets.drain(..removed); + true + } else { + false } - self.pending_tombstone_tablets.drain(..removed); - removed > 0 } #[inline] @@ -431,6 +438,16 @@ impl Peer { mem::take(&mut self.has_extra_write) } + #[inline] + pub fn set_pause_for_recovery(&mut self, pause: bool) { + self.pause_for_recovery = pause; + } + + #[inline] + pub fn pause_for_recovery(&self) -> bool { + self.pause_for_recovery + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { @@ -551,6 +568,10 @@ impl Peer { self.approximate_raft_log_size = f(self.approximate_raft_log_size); } + pub fn last_applying_index_mut(&mut self) -> &mut u64 { + &mut self.last_applying_index + } + #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state @@ -654,8 +675,7 @@ impl Peer { /// See the comments of `check_snap_status` for more details. #[inline] pub fn is_handling_snapshot(&self) -> bool { - // todo: This method may be unnecessary now? - false + self.persisted_index() < self.entry_storage().truncated_index() } /// Returns `true` if the raft group has replicated a snapshot but not @@ -774,8 +794,8 @@ impl Peer { &self.flush_state } - pub fn reset_flush_state(&mut self) { - self.flush_state = Arc::default(); + pub fn reset_flush_state(&mut self, index: u64) { + self.flush_state = Arc::new(FlushState::new(index)); } // Note: Call `set_has_extra_write` after adding new state changes. 
diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 636970c0ad1..51bd41ba253 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -298,7 +298,9 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::TestTabletFactory, }; - use engine_traits::{RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS}; + use engine_traits::{ + FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, + }; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, @@ -379,25 +381,25 @@ mod tests { .unwrap() .unwrap(); - let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); + let snapshot = new_empty_snapshot(region.clone(), 10, 9, false); let mut task = WriteTask::new(region.get_id(), 5, 0); s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); assert_eq!(10, s.entry_storage().truncated_index()); - assert_eq!(1, s.entry_storage().truncated_term()); - assert_eq!(1, s.entry_storage().last_term()); + assert_eq!(9, s.entry_storage().truncated_term()); + assert_eq!(9, s.entry_storage().last_term()); assert_eq!(10, s.entry_storage().raft_state().last_index); // This index can't be set before load tablet. 
assert_ne!(10, s.entry_storage().applied_index()); - assert_ne!(1, s.entry_storage().applied_term()); + assert_ne!(9, s.entry_storage().applied_term()); assert_eq!(10, s.region_state().get_tablet_index()); assert!(!task.persisted_cbs.is_empty()); s.on_applied_snapshot(); assert_eq!(10, s.entry_storage().applied_index()); - assert_eq!(1, s.entry_storage().applied_term()); + assert_eq!(9, s.entry_storage().applied_term()); assert_eq!(10, s.region_state().get_tablet_index()); } @@ -440,8 +442,9 @@ mod tests { router, reg, sched, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger, ); @@ -460,8 +463,8 @@ mod tests { SnapState::Generated(ref snap) => *snap.clone(), ref s => panic!("unexpected state: {:?}", s), }; - assert_eq!(snap.get_metadata().get_index(), 0); - assert_eq!(snap.get_metadata().get_term(), 0); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); assert_eq!(snap.get_data().is_empty(), false); let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 05e1baea1cf..092e7e21b5f 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -10,6 +10,7 @@ pub enum ApplyTask { Snapshot(GenSnapTask), /// Writes that doesn't care consistency. UnsafeWrite(Box<[u8]>), + ManualFlush, } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs index d031d6b1eba..18d81ef16aa 100644 --- a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{assert_matches::assert_matches, time::Duration}; use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; @@ -9,35 +9,32 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, }; use raft::prelude::ConfChangeType; -use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; use tikv_util::store::new_peer; use crate::cluster::Cluster; fn put_data( region_id: u64, - cluster: &Cluster, + cluster: &mut Cluster, node_off: usize, node_off_for_verify: usize, key: &[u8], ) { - let router = &cluster.routers[node_off]; + let mut router = &mut cluster.routers[node_off]; router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); // router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let tablet_registry = cluster.node(node_off).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - assert!(tablet.get_value(key).unwrap().is_none()); + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); let header = Box::new(router.new_request_for(region_id).take_header()); let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, &key[1..], b"value"); + put.put(CF_DEFAULT, key, b"value"); let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); std::thread::sleep(std::time::Duration::from_millis(10)); @@ -53,17 +50,29 @@ fn put_data( let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); - - // Verify the data is ready in the other node - let tablet_registry = cluster.node(node_off_for_verify).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - 
assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); + router = &mut cluster.routers[node_off]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); + + // Because of skip bcast commit, the data should not be applied yet. + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); + // Trigger heartbeat explicitly to commit on follower. + router = &mut cluster.routers[node_off]; + for _ in 0..2 { + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + } + cluster.dispatch(region_id, vec![]); + std::thread::sleep(std::time::Duration::from_millis(100)); + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); } pub fn must_transfer_leader( @@ -97,7 +106,7 @@ pub fn must_transfer_leader( #[test] fn test_transfer_leader() { - let cluster = Cluster::with_node_count(3, None); + let mut cluster = Cluster::with_node_count(3, None); let region_id = 2; let router0 = &cluster.routers[0]; @@ -137,13 +146,13 @@ fn test_transfer_leader() { cluster.dispatch(region_id, vec![]); // Ensure follower has latest entries before transfer leader. - put_data(region_id, &cluster, 0, 1, b"zkey1"); + put_data(region_id, &mut cluster, 0, 1, b"key1"); // Perform transfer leader must_transfer_leader(&cluster, region_id, 0, 1, peer1); // Before transfer back to peer0, put some data again. 
- put_data(region_id, &cluster, 1, 0, b"zkey2"); + put_data(region_id, &mut cluster, 1, 0, b"key2"); // Perform transfer leader let store_id = cluster.node(0).id(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index b0f44c30c0f..ce4f099610e 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -177,6 +177,7 @@ make_static_metric! { region_nonexistent, applying_snap, disk_full, + recovery, } pub label_enum ProposalType { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 05decd62815..939bc2a1078 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1998,7 +1998,12 @@ impl TabletSnapManager { { continue; } - for e in file_system::read_dir(path)? { + let entries = match file_system::read_dir(path) { + Ok(entries) => entries, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + for e in entries { match e.and_then(|e| e.metadata()) { Ok(m) => total_size += m.len(), Err(e) if e.kind() == ErrorKind::NotFound => continue, diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 5beddf60151..4d4e283ea7e 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -61,6 +61,7 @@ use raftstore::{ }, RegionInfoAccessor, }; +use raftstore_v2::{router::RaftRouter, StateStorage}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -136,8 +137,7 @@ fn run_impl(config: TikvConfig) { tikv.init_encryption(); let fetcher = tikv.init_io_utility(); let listener = tikv.init_flow_receiver(); - let (raft_engine, engines_info) = tikv.init_raw_engines(listener); - tikv.init_engines(raft_engine); + let engines_info = tikv.init_engines(listener); let server_config = tikv.init_servers::(); tikv.register_services(); 
tikv.init_metrics_flusher(fetcher, engines_info); @@ -201,6 +201,7 @@ struct TikvServer { pd_client: Arc, flow_info_sender: Option>, flow_info_receiver: Option>, + router: Option>, node: Option>, resolver: Option, store_path: PathBuf, @@ -310,6 +311,7 @@ where cfg_controller: Some(cfg_controller), security_mgr, pd_client, + router: None, node: None, resolver: None, store_path, @@ -567,36 +569,6 @@ where engine_rocks::FlowListener::new(tx) } - fn init_engines(&mut self, raft_engine: ER) { - let tablet_registry = self.tablet_registry.clone().unwrap(); - let mut node = NodeV2::new( - &self.config.server, - self.pd_client.clone(), - None, - tablet_registry, - ); - node.try_bootstrap_store(&self.config.raft_store, &raft_engine) - .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); - assert_ne!(node.id(), 0); - - let router = node.router(); - let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( - router.store_router().clone(), - self.config.coprocessor.clone(), - ); - let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); - - let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); - - self.engines = Some(TikvEngines { - raft_engine, - engine, - }); - self.node = Some(node); - self.coprocessor_host = Some(coprocessor_host); - self.region_info_accessor = Some(region_info_accessor); - } - fn init_gc_worker(&mut self) -> GcWorker> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( @@ -774,7 +746,7 @@ where }; let check_leader_runner = CheckLeaderRunner::new( - self.node.as_ref().unwrap().router().store_meta().clone(), + self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), ); let check_leader_scheduler = self @@ -855,6 +827,8 @@ where .unwrap() .start( engines.raft_engine.clone(), + self.tablet_registry.clone().unwrap(), + self.router.as_ref().unwrap(), server.transport(), snap_mgr, self.concurrency_manager.clone(), @@ -1392,10 +1366,10 
@@ impl ConfiguredRaftEngine for RaftLogEngine { } impl TikvServer { - fn init_raw_engines( + fn init_engines( &mut self, flow_listener: engine_rocks::FlowListener, - ) -> (CER, Arc) { + ) -> Arc { let block_cache = self.config.storage.block_cache.build_shared_cache(); let env = self .config @@ -1415,6 +1389,19 @@ impl TikvServer { let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); + + let mut node = NodeV2::new(&self.config.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router().clone(); + + // Create kv engine. + let builder = builder.state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); let factory = Box::new(builder.build()); self.kv_statistics = Some(factory.rocks_statistics()); let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) @@ -1428,12 +1415,30 @@ impl TikvServer { raft_engine.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - registry, + registry.clone(), raft_engine.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); - (raft_engine, engines_info) + let router = RaftRouter::new(node.id(), registry, router); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.router = Some(router); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + + engines_info } } diff --git 
a/src/config/mod.rs b/src/config/mod.rs index 6ed8da3f111..c78ec02182f 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3122,6 +3122,9 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); + if !self.raft_engine.enable { + panic!("raft-kv2 only supports raft log engine."); + } } self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index ed6f16e8bec..b876951894c 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -11,7 +11,7 @@ use raftstore::{ coprocessor::CoprocessorHost, store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, }; -use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreSystem}; +use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; use slog::{info, o, Logger}; use tikv_util::{ config::VersionTrack, @@ -24,11 +24,10 @@ use crate::server::{node::init_store, Result}; pub struct NodeV2 { cluster_id: u64, store: metapb::Store, - system: Option<(RaftRouter, StoreSystem)>, + system: Option<(StoreRouter, StoreSystem)>, has_started: bool, pd_client: Arc, - registry: TabletRegistry, logger: Logger, } @@ -43,7 +42,6 @@ where cfg: &crate::server::Config, pd_client: Arc, store: Option, - registry: TabletRegistry, ) -> NodeV2 { let store = init_store(store, cfg); @@ -53,7 +51,6 @@ where pd_client, system: None, has_started: false, - registry, logger: slog_global::borrow_global().new(o!()), } } @@ -71,16 +68,14 @@ where ) .bootstrap_store()?; self.store.set_id(store_id); + let (router, system) = raftstore_v2::create_store_batch_system(cfg, store_id, self.logger.clone()); - self.system = Some(( - RaftRouter::new(store_id, self.registry.clone(), router), - system, - )); + self.system = Some((router, system)); Ok(()) } - pub fn router(&self) -> &RaftRouter { + pub fn router(&self) -> 
&StoreRouter { &self.system.as_ref().unwrap().0 } @@ -90,6 +85,8 @@ where pub fn start( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, @@ -112,15 +109,10 @@ where ) .bootstrap_first_region(&self.store, store_id)? { - let path = self - .registry - .tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); // TODO: make follow line can recover from abort. - self.registry - .tablet_factory() - .open_tablet(ctx, &path) - .unwrap(); + registry.tablet_factory().open_tablet(ctx, &path).unwrap(); } // Put store only if the cluster is bootstrapped. @@ -130,6 +122,8 @@ where self.start_store( raft_engine, + registry, + router, trans, snap_mgr, concurrency_manager, @@ -187,6 +181,8 @@ where fn start_store( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, @@ -207,13 +203,13 @@ where } self.has_started = true; - let (router, system) = self.system.as_mut().unwrap(); + let system = &mut self.system.as_mut().unwrap().1; system.start( store_id, store_cfg, raft_engine, - self.registry.clone(), + registry, trans, self.pd_client.clone(), router.store_router(), From 64293cb434c42c30fc37daeaaeae5c963aea26ea Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 30 Dec 2022 17:02:17 +0800 Subject: [PATCH 439/676] add commit/apply duration for raft store (#13946) ref tikv/tikv#12842 Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/operation/command/mod.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 800dbc98f91..72f05801a0e 
100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -153,6 +153,7 @@ impl StorePoller { fn flush_events(&mut self) { self.schedule_ticks(); + self.poll_ctx.raft_metrics.maybe_flush(); } fn schedule_ticks(&mut self) { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index a533ae9af87..4831c4abf9f 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -33,13 +33,17 @@ use raftstore::{ Proposal, }, local_metrics::RaftMetrics, + metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, msg::ErrorCallback, util, WriteCallback, }, Error, Result, }; use slog::{info, warn}; -use tikv_util::{box_err, time::monotonic_raw_now}; +use tikv_util::{ + box_err, + time::{duration_to_sec, monotonic_raw_now, Instant}, +}; use crate::{ batch::StoreContext, @@ -81,6 +85,7 @@ pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for /// flow control. entry_and_proposals: Vec<(Entry, Vec)>, + committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { @@ -246,6 +251,7 @@ impl Peer { // memtables in kv engine is flushed. 
let apply = CommittedEntries { entry_and_proposals, + committed_time: Instant::now(), }; assert!( self.apply_scheduler().is_some(), @@ -375,6 +381,8 @@ impl Apply { #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); + APPLY_TASK_WAIT_TIME_HISTOGRAM + .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { apply::notify_req_region_removed(self.region_state().get_region().get_id(), ch); From a6afe78c43e293addd18251dee209d630322dd9e Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Tue, 3 Jan 2023 11:02:19 +0800 Subject: [PATCH 440/676] extend evict_entry_cache for restart (#13998) close tikv/tikv#13997 Support to use evict_entry_cache when restart node. Signed-off-by: tabokie Signed-off-by: hongyunyan <649330952@qq.com> Signed-off-by: Xinye Tao Signed-off-by: Jay Lee Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Xinye Tao Co-authored-by: Jay Co-authored-by: Zwb Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/entry_storage.rs | 4 ++++ components/raftstore/src/store/peer.rs | 3 +++ 2 files changed, 7 insertions(+) diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index c6278c890f7..4d6372dd582 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1227,6 +1227,10 @@ impl EntryStorage { let idx = cache.cache[drain_to].index; let mem_size_change = cache.compact_to(idx + 1); RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } else if !half { + let cache = &mut self.cache; + let mem_size_change = cache.compact_to(u64::MAX); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 7752a0a1b0e..9384a4940c7 100644 --- a/components/raftstore/src/store/peer.rs 
+++ b/components/raftstore/src/store/peer.rs @@ -1188,6 +1188,9 @@ where peer.raft_group.campaign()?; } + let persisted_index = peer.raft_group.raft.raft_log.persisted; + peer.mut_store().update_cache_persisted(persisted_index); + Ok(peer) } From 5de5fd24da76d35060fab0ac6e85b903a7b32af2 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 15:36:19 +0800 Subject: [PATCH 441/676] raft-engine: remove confusing API cut logs (#14010) ref tikv/tikv#12842 The API is supposed to be used with `append` but nowhere can we find the clue. This PR merges `cut_logs` and `append` to reduce confusion and mistakes. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/raft_engine.rs | 11 ++++---- components/engine_rocks/src/raft_engine.rs | 21 +++++++++------ components/engine_traits/src/raft_engine.rs | 20 +++++++++----- components/raft_log_engine/src/engine.rs | 13 ++++----- components/raftstore-v2/src/operation/life.rs | 16 +++-------- .../src/operation/ready/snapshot.rs | 1 - .../raftstore/src/store/async_io/write.rs | 27 +++++++++++++------ .../src/store/async_io/write_tests.rs | 20 +++++--------- .../raftstore/src/store/entry_storage.rs | 3 +-- .../raftstore/src/store/peer_storage.rs | 12 ++++----- components/raftstore/src/store/snap.rs | 2 +- .../raftstore/src/store/worker/raftlog_gc.rs | 2 +- components/server/src/raft_engine_switch.rs | 8 +++--- tests/integrations/server/kv_service.rs | 2 +- 14 files changed, 83 insertions(+), 75 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index c3de53b4932..854b75fe30d 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -167,11 +167,12 @@ impl RaftEngine for PanicEngine { } impl RaftLogBatch for PanicWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { - panic!() - } - - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { + fn append( + 
&mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index d5331a2ce29..d566ac3821b 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -361,7 +361,19 @@ impl RaftEngine for RocksEngine { } impl RaftLogBatch for RocksWriteBatchVec { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -369,13 +381,6 @@ impl RaftLogBatch for RocksWriteBatchVec { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 9e95ae95e14..68036eae1eb 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -66,7 +66,7 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { Ok(true) }) .unwrap(); - batch.append(region_id, entries).unwrap(); + batch.append(region_id, None, entries).unwrap(); if let Some(state) = self.get_raft_state(region_id).unwrap() { 
batch.put_raft_state(region_id, &state).unwrap(); } @@ -150,11 +150,19 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send } pub trait RaftLogBatch: Send { - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()>; - - /// Remove Raft logs in [`from`, `to`) which will be overwritten later. - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64); + /// Append continuous entries to the batch. + /// + /// All existing entries with same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. + /// Nothing will be deleted if entries is empty. Note: `RaftLocalState` + /// won't be updated in this call. + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()>; fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()>; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 7c98adf325f..1ae148ba41c 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -385,17 +385,18 @@ const FLUSH_STATE_KEY: &[u8] = &[0x06]; const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + _overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + // overwrite is handled within raft log engine. self.0 .add_entries::(raft_group_id, &entries) .map_err(transfer_error) } - fn cut_logs(&mut self, _: u64, _: u64, _: u64) { - // It's unnecessary because overlapped entries can be handled in - // `append`. 
- } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.0 .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 0f2e72061ef..954c6992cf9 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -10,8 +10,6 @@ //! sending a message to store fsm first, and then using split to initialized //! the peer. -use std::cmp; - use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; @@ -296,32 +294,24 @@ impl Peer { /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. pub fn start_destroy(&mut self, write_task: &mut WriteTask) { - let entry_storage = self.storage().entry_storage(); if self.postponed_destroy() { return; } - let first_index = entry_storage.first_index(); - let last_index = entry_storage.last_index(); - if first_index <= last_index { - write_task.cut_logs = match write_task.cut_logs { - None => Some((first_index, last_index)), - Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), - }; - } let raft_engine = self.entry_storage().raft_engine(); let mut region_state = self.storage().region_state().clone(); let region_id = region_state.get_region().get_id(); + // Use extra write to ensure these writes are the last writes to raft engine. let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(2)); - // We only use raft-log-engine for v2, first index is not important. + // We only use raft-log-engine for v2, first index and state are not important. let raft_state = self.entry_storage().raft_state(); raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); - // Write worker will do the clean up when meeting tombstone state. 
region_state.set_state(PeerState::Tombstone); let applied_index = self.entry_storage().applied_index(); lb.put_region_state(region_id, applied_index, &region_state) .unwrap(); + self.set_has_extra_write(); self.destroy_progress_mut().start(); } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index c040bdcbb3b..76a5b4297b3 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -507,7 +507,6 @@ impl Storage { if self.entry_storage().first_index() <= old_last_index { // All states are rewritten in the following blocks. Stale states will be // cleaned up by compact worker. - task.cut_logs = Some((0, old_last_index + 1)); self.entry_storage_mut().clear(); } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b4cceb96a82..56d0f93a11d 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -186,8 +186,8 @@ where pub raft_wb: Option, // called after writing to kvdb and raftdb. pub persisted_cbs: Vec>, - pub entries: Vec, - pub cut_logs: Option<(u64, u64)>, + overwrite_to: Option, + entries: Vec, pub raft_state: Option, pub extra_write: ExtraWrite, pub messages: Vec, @@ -207,8 +207,8 @@ where ready_number, send_time: Instant::now(), raft_wb: None, + overwrite_to: None, entries: vec![], - cut_logs: None, raft_state: None, extra_write: ExtraWrite::None, messages: vec![], @@ -221,11 +221,21 @@ where pub fn has_data(&self) -> bool { !(self.raft_state.is_none() && self.entries.is_empty() - && self.cut_logs.is_none() && self.extra_write.is_empty() && self.raft_wb.as_ref().map_or(true, |wb| wb.is_empty())) } + /// Append continuous entries. + /// + /// All existing entries with same index will be overwritten. 
If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. If + /// entries is empty, nothing will be deleted. + pub fn set_append(&mut self, overwrite_to: Option, entries: Vec) { + self.entries = entries; + self.overwrite_to = overwrite_to; + } + #[inline] pub fn ready_number(&self) -> u64 { self.ready_number @@ -387,11 +397,12 @@ where raft_wb.merge(wb).unwrap(); } raft_wb - .append(task.region_id, std::mem::take(&mut task.entries)) + .append( + task.region_id, + task.overwrite_to, + std::mem::take(&mut task.entries), + ) .unwrap(); - if let Some((from, to)) = task.cut_logs { - raft_wb.cut_logs(task.region_id, from, to); - } if let Some(raft_state) = task.raft_state.take() && self.raft_states.insert(task.region_id, raft_state).is_none() { diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 6007b39489e..d1861a8903c 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -167,7 +167,9 @@ fn delete_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8]) { /// Simulate kv puts on raft engine. 
fn put_raft_kv(wb: Option<&mut TestRaftLogBatch>, key: u64) { - wb.unwrap().append(key, vec![new_entry(key, key)]).unwrap(); + wb.unwrap() + .append(key, None, vec![new_entry(key, key)]) + .unwrap(); } fn delete_raft_kv(engine: &RaftTestEngine, wb: Option<&mut TestRaftLogBatch>, key: u64) { @@ -294,10 +296,7 @@ fn test_worker() { put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); task_3 .messages @@ -392,10 +391,7 @@ fn test_worker_split_raft_wb() { lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); if split.1 { expected_wbs += 1; @@ -500,8 +496,7 @@ fn test_basic_flow() { delete_kv(task_3.extra_write.v1_mut(), b"kv_k1"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages @@ -603,8 +598,7 @@ fn test_basic_flow_with_states() { lb.put_apply_state(region_1, 5, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 
6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 4d6372dd582..bc85ecedc34 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1075,9 +1075,8 @@ impl EntryStorage { self.cache.append(self.region_id, self.peer_id, &entries); - task.entries = entries; // Delete any previously appended log entries which never committed. - task.cut_logs = Some((last_index + 1, prev_last_index + 1)); + task.set_append(Some(prev_last_index + 1), entries); self.raft_state.set_last_index(last_index); self.last_term = last_term; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index c9e460d1cbc..b060a866d71 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2082,7 +2082,7 @@ pub mod tests { let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. 
raft_state.set_last_index(11); - lb.append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); lb.put_raft_state(1, &raft_state).unwrap(); @@ -2093,7 +2093,7 @@ pub mod tests { let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); @@ -2138,7 +2138,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); engines.raft.gc(1, 0, 21, &mut lb).unwrap(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); @@ -2150,7 +2150,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); @@ -2158,7 +2158,7 @@ pub mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); @@ -2168,7 +2168,7 @@ pub mod tests { engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); diff --git 
a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 939bc2a1078..a9ef7df8c62 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2154,7 +2154,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - lb.append(region_id, vec![apply_entry])?; + lb.append(region_id, None, vec![apply_entry])?; // Put region info into kv engine. let region = gen_test_region(region_id, 1, 1); diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index ce829ed61b2..3edabae71a0 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -214,7 +214,7 @@ mod tests { for i in 0..100 { let mut e = Entry::new(); e.set_index(i); - raft_wb.append(region_id, vec![e]).unwrap(); + raft_wb.append(region_id, None, vec![e]).unwrap(); } raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index d0637a04b0a..bfaa2a6587e 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -161,7 +161,7 @@ fn run_dump_raftdb_worker( // Assume that we always scan entry first and raft state at the // end. 
batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); } _ => unreachable!("There is only 2 types of keys in raft"), @@ -170,7 +170,7 @@ fn run_dump_raftdb_worker( if local_size >= BATCH_THRESHOLD { local_size = 0; batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); @@ -205,7 +205,7 @@ fn run_dump_raft_engine_worker( begin += old_engine .fetch_entries_to(id, begin, end, Some(BATCH_THRESHOLD), &mut entries) .unwrap() as u64; - batch.append(id, entries).unwrap(); + batch.append(id, None, entries).unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); count_size.fetch_add(size, Ordering::Relaxed); } @@ -303,7 +303,7 @@ mod tests { e.set_index(i); entries.push(e); } - batch.append(num, entries).unwrap(); + batch.append(num, None, entries).unwrap(); } // Get data from raft engine and assert. diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 496c587a7b9..8709373b766 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -966,7 +966,7 @@ fn test_debug_raft_log() { entry.set_entry_type(eraftpb::EntryType::EntryNormal); entry.set_data(vec![42].into()); let mut lb = engine.log_batch(0); - lb.append(region_id, vec![entry.clone()]).unwrap(); + lb.append(region_id, None, vec![entry.clone()]).unwrap(); engine.consume(&mut lb, false).unwrap(); assert_eq!( engine.get_entry(region_id, log_index).unwrap().unwrap(), From bce01cfbc82b58a38b066892a3c679daf91dd33f Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 16:42:19 +0800 Subject: [PATCH 442/676] raftstore-v2: publish tablet in raftstore thread only (#14009) ref tikv/tikv#12842 Publish tablet in apply thread is unsafe. This PR moves the operation to raftstore. 
It also fixes the issues that applying two splits at a time can cause panic. It also makes sure cache will be cleared after tablet is published. Signed-off-by: Jay Lee --- components/engine_traits/src/tablet.rs | 34 +- components/raftstore-v2/src/batch/store.rs | 65 ++- components/raftstore-v2/src/fsm/store.rs | 17 +- .../operation/command/admin/compact_log.rs | 125 ++++- .../src/operation/command/admin/mod.rs | 1 + .../src/operation/command/admin/split.rs | 31 +- .../raftstore-v2/src/operation/command/mod.rs | 3 +- components/raftstore-v2/src/operation/life.rs | 10 +- components/raftstore-v2/src/operation/mod.rs | 11 +- .../raftstore-v2/src/operation/query/lease.rs | 6 +- .../raftstore-v2/src/operation/query/local.rs | 430 ++++++++++++------ .../raftstore-v2/src/operation/query/mod.rs | 4 +- .../raftstore-v2/src/operation/ready/mod.rs | 16 +- .../src/operation/ready/snapshot.rs | 52 ++- components/raftstore-v2/src/raft/apply.rs | 20 +- components/raftstore-v2/src/raft/peer.rs | 88 +--- components/raftstore-v2/src/raft/storage.rs | 76 +++- components/raftstore-v2/src/router/imp.rs | 8 +- .../raftstore-v2/src/worker/tablet_gc.rs | 15 +- .../tests/failpoints/test_split.rs | 3 + .../tests/integrations/cluster.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 6 +- components/raftstore/src/store/mod.rs | 5 +- components/raftstore/src/store/worker/read.rs | 13 +- components/server/src/server2.rs | 4 +- src/config/mod.rs | 27 +- tests/integrations/config/mod.rs | 10 +- 27 files changed, 707 insertions(+), 375 deletions(-) diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index f552fbc01aa..6bdfa97a6e6 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -31,6 +31,13 @@ pub struct CachedTablet { version: u64, } +impl CachedTablet { + fn release(&mut self) { + self.cache = None; + self.version = 0; + } +} + impl CachedTablet { #[inline] fn new(data: Option) -> Self { @@ -44,13 
+51,11 @@ impl CachedTablet { } } - pub fn set(&mut self, data: EK) { - self.version = { - let mut latest_data = self.latest.data.lock().unwrap(); - *latest_data = Some(data.clone()); - self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 - }; - self.cache = Some(data); + pub fn set(&mut self, data: EK) -> Option { + self.cache = Some(data.clone()); + let mut latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.fetch_add(1, Ordering::Relaxed) + 1; + latest_data.replace(data) } /// Get the tablet from cache without checking if it's up to date. @@ -69,19 +74,6 @@ impl CachedTablet { } self.cache() } - - /// Returns how many versions has passed. - #[inline] - pub fn refresh(&mut self) -> u64 { - let old_version = self.version; - if self.latest.version.load(Ordering::Relaxed) > old_version { - let latest_data = self.latest.data.lock().unwrap(); - self.version = self.latest.version.load(Ordering::Relaxed); - self.cache = latest_data.clone(); - return self.version - old_version; - } - 0 - } } /// Context to be passed to `TabletFactory`. 
@@ -317,8 +309,10 @@ impl TabletRegistry { let mut tablets = self.tablets.tablets.lock().unwrap(); for (id, tablet) in tablets.iter_mut() { if !f(*id, tablet) { + tablet.release(); return; } + tablet.release(); } } } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 72f05801a0e..9ba7a63139c 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -45,7 +45,7 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, - operation::SPLIT_PREFIX, + operation::{SharedReadTablet, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::{pd, tablet_gc}, @@ -72,7 +72,7 @@ pub struct StoreContext { pub timer: SteadyTimer, pub schedulers: Schedulers, /// store meta - pub store_meta: Arc>, + pub store_meta: Arc>>, pub engine: ER, pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, @@ -259,7 +259,7 @@ struct StorePollerBuilder { schedulers: Schedulers, apply_pool: FuturePool, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, } @@ -273,7 +273,7 @@ impl StorePollerBuilder { router: StoreRouter, schedulers: Schedulers, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, ) -> Self { @@ -429,13 +429,22 @@ pub struct Schedulers { pub split_check: Scheduler, } +impl Schedulers { + fn stop(&self) { + self.read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + self.split_check.stop(); + } +} + /// A set of background threads that will processing offloaded work from /// raftstore. 
struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, pd: LazyWorker, - tablet_gc_worker: Worker, + tablet_gc: Worker, async_write: StoreWriters, purge: Option, @@ -448,18 +457,29 @@ impl Workers { Self { async_read: Worker::new("async-read-worker"), pd, - tablet_gc_worker: Worker::new("tablet-gc-worker"), + tablet_gc: Worker::new("tablet-gc-worker"), async_write: StoreWriters::default(), purge, background, } } + + fn stop(mut self) { + self.async_write.shutdown(); + self.async_read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + if let Some(w) = self.purge { + w.stop(); + } + } } /// The system used for polling Raft activities. pub struct StoreSystem { system: BatchSystem, StoreFsm>, workers: Option>, + schedulers: Option>, logger: Logger, shutdown: Arc, } @@ -474,7 +494,7 @@ impl StoreSystem { trans: T, pd_client: Arc, router: &StoreRouter, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -548,7 +568,7 @@ impl StoreSystem { ), ); - let tablet_gc_scheduler = workers.tablet_gc_worker.start( + let tablet_gc_scheduler = workers.tablet_gc.start_with_timer( "tablet-gc-worker", tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), ); @@ -568,13 +588,14 @@ impl StoreSystem { tablet_registry, trans, router.clone(), - schedulers, + schedulers.clone(), self.logger.clone(), store_meta.clone(), snap_mgr, coprocessor_host, ); self.workers = Some(workers); + self.schedulers = Some(schedulers); let peers = builder.init()?; // Choose a different name so we know what version is actually used. rs stands // for raft store. 
@@ -585,9 +606,14 @@ impl StoreSystem { let mut address = Vec::with_capacity(peers.len()); { let mut meta = store_meta.as_ref().lock().unwrap(); - for (region_id, (tx, fsm)) in peers { - meta.readers - .insert(region_id, fsm.peer().generate_read_delegate()); + for (region_id, (tx, mut fsm)) in peers { + if let Some(tablet) = fsm.peer_mut().tablet() { + let read_tablet = SharedReadTablet::new(tablet.clone()); + meta.readers.insert( + region_id, + (fsm.peer().generate_read_delegate(), read_tablet), + ); + } address.push(region_id); mailboxes.push(( @@ -612,18 +638,16 @@ impl StoreSystem { if self.workers.is_none() { return; } - let mut workers = self.workers.take().unwrap(); + let workers = self.workers.take().unwrap(); - // TODO: gracefully shutdown future pool + // TODO: gracefully shutdown future apply pool + // Stop schedulers first, so all background future worker pool will be stopped + // gracefully. + self.schedulers.take().unwrap().stop(); self.system.shutdown(); - workers.async_write.shutdown(); - workers.async_read.stop(); - workers.pd.stop(); - if let Some(w) = workers.purge { - w.stop(); - } + workers.stop(); } } @@ -707,6 +731,7 @@ where let system = StoreSystem { system, workers: None, + schedulers: None, logger: logger.clone(), shutdown: Arc::new(AtomicBool::new(false)), }; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index f107715a535..a5f22d7e1a8 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -12,9 +12,7 @@ use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; use keys::{data_end_key, data_key}; use kvproto::metapb::Region; -use raftstore::store::{ - fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, -}; +use raftstore::store::{fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -24,13 
+22,14 @@ use tikv_util::{ use crate::{ batch::StoreContext, + operation::ReadDelegatePair, router::{StoreMsg, StoreTick}, }; -pub struct StoreMeta { +pub struct StoreMeta { pub store_id: u64, /// region_id -> reader - pub readers: HashMap, + pub readers: HashMap>, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, /// (region_end_key, epoch.version) -> region_id @@ -42,9 +41,9 @@ pub struct StoreMeta { pub(crate) regions: HashMap, } -impl StoreMeta { - pub fn new(store_id: u64) -> StoreMeta { - StoreMeta { +impl StoreMeta { + pub fn new(store_id: u64) -> Self { + Self { store_id, readers: HashMap::default(), region_read_progress: RegionReadProgressRegistry::default(), @@ -96,7 +95,7 @@ impl StoreMeta { } } -impl StoreRegionMeta for StoreMeta { +impl StoreRegionMeta for StoreMeta { #[inline] fn store_id(&self) -> u64 { self.store_id diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index c36c7353871..7127cd45306 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -17,7 +17,9 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; use protobuf::Message; use raftstore::{ - store::{fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask}, + store::{ + fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + }, Result, }; use slog::{debug, error, info}; @@ -32,6 +34,47 @@ use crate::{ worker::tablet_gc, }; +#[derive(Debug)] +pub struct CompactLogContext { + skipped_ticks: usize, + approximate_log_size: u64, + last_applying_index: u64, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. 
This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// worker to destroy them. + tombstone_tablets_wait_index: Vec, +} + +impl CompactLogContext { + pub fn new(last_applying_index: u64) -> CompactLogContext { + CompactLogContext { + skipped_ticks: 0, + approximate_log_size: 0, + last_applying_index, + tombstone_tablets_wait_index: vec![], + } + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skipped_ticks < max_skip_ticks { + self.skipped_ticks += 1; + true + } else { + false + } + } + + pub fn add_log_size(&mut self, size: u64) { + self.approximate_log_size += size; + } + + pub fn set_last_applying_index(&mut self, index: u64) { + self.last_applying_index = index; + } +} + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { pub fn on_compact_log_tick(&mut self, force: bool) { if !self.fsm.peer().is_leader() { @@ -130,13 +173,16 @@ impl Peer { replicated_idx } else if applied_idx > first_idx && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() - || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 + || self.compact_log_context().approximate_log_size + >= store_ctx.cfg.raft_log_gc_size_limit().0 { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) } else if replicated_idx < first_idx || last_idx - first_idx < 3 || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold - && self.maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + && self + .compact_log_context_mut() + .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) { return; } else { @@ -163,7 +209,7 @@ impl Peer { let (ch, _) = CmdResChannel::pair(); self.on_admin_command(store_ctx, req, ch); - self.reset_skip_compact_log_ticks(); + self.compact_log_context_mut().skipped_ticks = 0; } } @@ -217,6 +263,46 @@ impl Apply { } impl Peer { + #[inline] + 
pub fn record_tombstone_tablet( + &mut self, + ctx: &StoreContext, + old_tablet: EK, + new_tablet_index: u64, + ) { + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + + /// Returns if there's any tombstone being removed. + #[inline] + fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { + let compact_log_context = self.compact_log_context_mut(); + let removed = compact_log_context + .tombstone_tablets_wait_index + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + compact_log_context + .tombstone_tablets_wait_index + .drain(..removed); + true + } else { + false + } + } + pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, @@ -260,18 +346,25 @@ impl Peer { self.compact_log_from_engine(store_ctx); } - let applied = *self.last_applying_index_mut(); + let context = self.compact_log_context_mut(); + let applied = context.last_applying_index; let total_cnt = applied - old_truncated; let remain_cnt = applied - res.compact_index; - self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); + context.approximate_log_size = + (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; } + /// Called when apply index is persisted. There are two different situation: + /// + /// Generally, additional writes are triggered to persist apply index. In + /// this case task is `Some`. But after applying snapshot, the apply + /// index is persisted ahead of time. In this case task is `None`. 
#[inline] pub fn on_advance_persisted_apply_index( &mut self, store_ctx: &mut StoreContext, old_persisted: u64, - task: &mut WriteTask, + task: Option<&mut WriteTask>, ) { let new_persisted = self.storage().apply_trace().persisted_apply_index(); if old_persisted < new_persisted { @@ -286,14 +379,20 @@ impl Peer { } else { self.set_has_extra_write(); } - if old_persisted < self.entry_storage().truncated_index() + 1 { + // If it's snapshot, logs are gc already. + if task.is_some() && old_persisted < self.entry_storage().truncated_index() + 1 { self.compact_log_from_engine(store_ctx); } - if self.remove_tombstone_tablets_before(new_persisted) { + if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); - task.persisted_cbs.push(Box::new(move || { + if let Some(task) = task { + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + })); + } else { + // In snapshot, the index is persisted, tablet can be destroyed directly. let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); - })) + } } } } @@ -302,6 +401,10 @@ impl Peer { let truncated = self.entry_storage().truncated_index() + 1; let persisted_applied = self.storage().apply_trace().persisted_apply_index(); let compact_index = std::cmp::min(truncated, persisted_applied); + if compact_index == RAFT_INIT_LOG_INDEX + 1 { + // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. + return; + } // Raft Engine doesn't care about first index. 
if let Err(e) = store_ctx diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 4f2abb9c65e..9ceaa76c03b 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -5,6 +5,7 @@ mod conf_change; mod split; mod transfer_leader; +pub use compact_log::CompactLogContext; use compact_log::CompactLogResult; use conf_change::ConfChangeResult; use engine_traits::{KvEngine, RaftEngine}; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index faf059b3871..add5af1ce52 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,7 +25,7 @@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::{borrow::Cow, cmp, path::PathBuf}; +use std::{any::Any, borrow::Cow, cmp, path::PathBuf}; use collections::HashSet; use crossbeam::channel::SendError; @@ -58,7 +58,7 @@ use slog::info; use crate::{ batch::StoreContext, fsm::{ApplyResReporter, PeerFsmDelegate}, - operation::AdminCmdResult, + operation::{AdminCmdResult, SharedReadTablet}, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, worker::tablet_gc, @@ -73,6 +73,10 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // Hack: in common case we should use generic, but split is an unfrequent + // event that performance is not critical. And using `Any` can avoid polluting + // all existing code. + tablet: Box, } #[derive(Debug)] @@ -370,8 +374,6 @@ impl Apply { ) }); } - // Remove the old write batch. 
- self.write_batch.take(); let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); @@ -380,7 +382,7 @@ impl Apply { // TODO: Should we avoid flushing for the old tablet? ctx.flush_state = Some(self.flush_state().clone()); let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); - self.publish_tablet(tablet); + self.set_tablet(tablet.clone()); self.region_state_mut() .set_region(regions[derived_index].clone()); @@ -396,6 +398,7 @@ impl Apply { regions, derived_index, tablet_index: log_index, + tablet: Box::new(tablet), }), )) } @@ -427,10 +430,14 @@ impl Peer { }; fail_point!("on_split_invalidate_locks"); + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; { let mut meta = store_ctx.store_meta.lock().unwrap(); meta.set_region(derived, true, &self.logger); - let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); + let (reader, read_tablet) = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, @@ -438,6 +445,12 @@ impl Peer { RegionChangeReason::Split, res.tablet_index, ); + + // Tablet should be updated in lock to match the epoch. 
+ *read_tablet = SharedReadTablet::new(tablet.clone()); + } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); } self.post_split(); @@ -457,8 +470,6 @@ impl Peer { self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } - - self.record_tablet_as_tombstone_and_refresh(res.tablet_index, store_ctx); let _ = store_ctx .schedulers .tablet_gc @@ -632,7 +643,7 @@ mod test { use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactory, + kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, @@ -679,7 +690,7 @@ mod test { } fn assert_split( - apply: &mut Apply, + apply: &mut Apply, parent_id: u64, right_derived: bool, new_region_ids: Vec, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 4831c4abf9f..8b0d3d7d461 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -57,7 +57,8 @@ mod control; mod write; pub use admin::{ - temp_split_path, AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, + temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, SplitFlowControl, SplitInit, + SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 954c6992cf9..f312162d1e5 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -320,12 +320,12 @@ impl Peer { /// memory states. 
pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); - ctx.router.close(self.region_id()); + let region_id = self.region_id(); + ctx.router.close(region_id); { - ctx.store_meta - .lock() - .unwrap() - .remove_region(self.region_id()); + let mut meta = ctx.store_meta.lock().unwrap(); + meta.remove_region(region_id); + meta.readers.remove(®ion_id); } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create a diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index c49a14142ce..dc245c24384 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,9 +7,9 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, - SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, - SPLIT_PREFIX, + AdminCmdResult, CommittedEntries, CompactLogContext, ProposalControl, RequestSplit, + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, + SplitFlowControl, SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ @@ -17,4 +17,7 @@ pub use ready::{ StateStorage, }; -pub(crate) use self::{command::SplitInit, query::LocalReader}; +pub(crate) use self::{ + command::SplitInit, + query::{LocalReader, ReadDelegatePair, SharedReadTablet}, +}; diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index ca92729ee6f..0abd0cccd72 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -150,7 +150,7 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &Mutex, + store_meta: &Mutex>, progress: Option, ) { // A nonleader peer should never has leader lease. 
@@ -170,12 +170,12 @@ impl Peer { }; if let Some(progress) = progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 2cb5497d789..e4c0aa6d0b9 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -8,7 +8,7 @@ use std::{ use batch_system::Router; use crossbeam::channel::TrySendError; -use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ errorpb, @@ -20,10 +20,9 @@ use raftstore::{ cmd_resp, util::LeaseState, worker_metrics::{self, TLS_LOCAL_READ_METRICS}, - LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, ReadExecutorProvider, - RegionSnapshot, RequestPolicy, + LocalReaderCore, ReadDelegate, ReadExecutorProvider, RegionSnapshot, }, - Error, Result, + Result, }; use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; @@ -50,6 +49,87 @@ where } } +pub type ReadDelegatePair = (ReadDelegate, SharedReadTablet); + +/// A share struct for local reader. +/// +/// Though it looks like `CachedTablet`, but there are subtle differences. +/// 1. `CachedTablet` always hold the latest version of the tablet. But +/// `SharedReadTablet` should only hold the tablet that matches epoch. So it +/// will be updated only when the epoch is updated. +/// 2. 
`SharedReadTablet` should always hold a tablet and the same tablet. If +/// tablet is taken, then it should be considered as stale and should check +/// again epoch to load the new `SharedReadTablet`. +/// 3. `SharedReadTablet` may be cloned into thread local. So its cache should +/// be released as soon as possible, so there should be no strong reference +/// that prevents tablet from being dropped after it's marked as stale by other +/// threads. +pub struct SharedReadTablet { + tablet: Arc>>, + cache: Option, + source: bool, +} + +impl SharedReadTablet { + pub fn new(tablet: EK) -> Self { + Self { + tablet: Arc::new(Mutex::new(Some(tablet))), + cache: None, + source: true, + } + } + + /// Should call `fill_cache` first. + pub fn cache(&self) -> &EK { + self.cache.as_ref().unwrap() + } + + pub fn fill_cache(&mut self) -> bool + where + EK: Clone, + { + self.cache = self.tablet.lock().unwrap().clone(); + self.cache.is_some() + } + + pub fn release(&mut self) { + self.cache = None; + } +} + +impl Clone for SharedReadTablet { + fn clone(&self) -> Self { + Self { + tablet: Arc::clone(&self.tablet), + cache: None, + source: false, + } + } +} + +impl Drop for SharedReadTablet { + fn drop(&mut self) { + if self.source { + self.tablet.lock().unwrap().take(); + } + } +} + +enum ReadResult { + Ok(T), + Redirect, + RetryForStaleDelegate, + Err(E), +} + +fn fail_resp(msg: String) -> RaftCmdResponse { + let mut err = errorpb::Error::default(); + err.set_message(msg); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp +} + #[derive(Clone)] pub struct LocalReader where @@ -67,63 +147,69 @@ where E: KvEngine, C: MsgRouter, { - pub fn new( - store_meta: Arc>, - reg: TabletRegistry, - router: C, - logger: Logger, - ) -> Self { + pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { Self { - local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta, reg)), + local_reader: 
LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), router, logger, } } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { &self.local_reader.store_meta().store_meta } - pub fn pre_propose_raft_command( + fn pre_propose_raft_command( &mut self, req: &RaftCmdRequest, - ) -> Result, RequestPolicy)>> { - if let Some(delegate) = self.local_reader.validate_request(req)? { - let mut inspector = SnapRequestInspector { - delegate: &delegate, - logger: &self.logger, - }; - match inspector.inspect(req) { - Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), - Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), - // It can not handle other policies. - // TODO: we should only abort when lease expires. For other cases we should retry - // infinitely. - Ok(_) => Ok(None), - Err(e) => Err(e), + ) -> ReadResult<(CachedReadDelegate, ReadRequestPolicy)> { + let mut delegate = match self.local_reader.validate_request(req) { + Ok(Some(delegate)) => delegate, + Ok(None) => return ReadResult::Redirect, + Err(e) => return ReadResult::Err(e), + }; + + if !delegate.cached_tablet.fill_cache() { + return ReadResult::RetryForStaleDelegate; + } + let mut inspector = SnapRequestInspector { + delegate: &delegate, + logger: &self.logger, + }; + match inspector.inspect(req) { + Ok(ReadRequestPolicy::ReadLocal) => { + ReadResult::Ok((delegate, ReadRequestPolicy::ReadLocal)) } - } else { - Err(Error::RegionNotFound(req.get_header().get_region_id())) + Ok(ReadRequestPolicy::StaleRead) => { + ReadResult::Ok((delegate, ReadRequestPolicy::StaleRead)) + } + // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. 
+ Ok(ReadRequestPolicy::ReadIndex) => ReadResult::Redirect, + Err(e) => ReadResult::Err(e), } } fn try_get_snapshot( &mut self, req: &RaftCmdRequest, - ) -> std::result::Result>, RaftCmdResponse> { + ) -> ReadResult, RaftCmdResponse> { match self.pre_propose_raft_command(req) { - Ok(Some((mut delegate, policy))) => { + ReadResult::Ok((mut delegate, policy)) => { let mut snap = match policy { - RequestPolicy::ReadLocal => { + ReadRequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); let snapshot_ts = monotonic_raw_now(); if !delegate.is_in_leader_lease(snapshot_ts) { - return Ok(None); + return ReadResult::Redirect; } TLS_LOCAL_READ_METRICS @@ -133,18 +219,24 @@ where self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); snap } - RequestPolicy::StaleRead => { + ReadRequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_requests.inc()); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); @@ -156,10 +248,11 @@ where snap.txn_ext = Some(delegate.txn_ext.clone()); snap.bucket_meta = 
delegate.bucket_meta.clone(); - Ok(Some(snap)) + delegate.cached_tablet.release(); + + ReadResult::Ok(snap) } - Ok(None) => Ok(None), - Err(e) => { + ReadResult::Err(e) => { let mut response = cmd_resp::new_error(e); if let Some(delegate) = self .local_reader @@ -168,8 +261,10 @@ where { cmd_resp::bind_term(&mut response, delegate.term); } - Err(response) + ReadResult::Err(response) } + ReadResult::Redirect => ReadResult::Redirect, + ReadResult::RetryForStaleDelegate => ReadResult::RetryForStaleDelegate, } } @@ -179,50 +274,85 @@ where ) -> impl Future, RaftCmdResponse>> + Send { let region_id = req.header.get_ref().region_id; - let res = match self.try_get_snapshot(&req) { - res @ (Ok(Some(_)) | Err(_)) => Either::Left(res), - Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), + let mut tried_cnt = 0; + let res = loop { + let res = self.try_get_snapshot(&req); + match res { + ReadResult::Ok(snap) => break Either::Left(Ok(snap)), + ReadResult::Err(e) => break Either::Left(Err(e)), + ReadResult::Redirect => { + break Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())); + } + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + break Either::Left(Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", + region_id + )))); + } + } }; worker_metrics::maybe_tls_local_read_metrics_flush(); async move { - match res { - Either::Left(Ok(Some(snap))) => Ok(snap), - Either::Left(Err(e)) => Err(e), - Either::Right((fut, mut reader)) => { - let err = match fut.await? { - Some(query_res) => { - if query_res.read().is_some() { - // If query successful, try again. - req.mut_header().set_read_quorum(false); - if let Some(snap) = reader.try_get_snapshot(&req)? 
{ - return Ok(snap); - } else { - let mut err = errorpb::Error::default(); - err.set_message(format!("no delegate found for {}", region_id)); - err - } - } else { - let QueryResult::Response(res) = query_res else { unreachable!() }; - assert!(res.get_header().has_error(), "{:?}", res); - return Err(res); + let (mut fut, mut reader) = match res { + Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Err(e)) => return Err(e), + Either::Right((fut, reader)) => (fut, reader), + }; + + let mut tried_cnt = 0; + loop { + match fut.await? { + Some(query_res) => { + if query_res.read().is_none() { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); + } + } + None => { + return Err(fail_resp(format!( + "internal error: failed to extend lease: canceled: {}", + region_id + ))); + } + } + + // If query successful, try again. + req.mut_header().set_read_quorum(false); + loop { + let r = reader.try_get_snapshot(&req); + match r { + ReadResult::Ok(snap) => return Ok(snap), + ReadResult::Err(e) => return Err(e), + ReadResult::Redirect => { + tried_cnt += 1; + if tried_cnt < 10 { + fut = reader.try_to_renew_lease(region_id, &req); + break; } + return Err(fail_resp(format!( + "internal error: can't handle msg in local reader for {}", + region_id + ))); } - None => { - let mut err = errorpb::Error::default(); - err.set_message(format!( - "failed to extend lease: canceled: {}", + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + return Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", region_id - )); - err + ))); } - }; - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + } } - Either::Left(Ok(None)) => unreachable!(), } } } @@ -309,7 +439,7 @@ where // The reason for this to be Arc, see the comment on get_delegate in // raftstore/src/store/worker/read.rs delegate: Arc, - 
cached_tablet: CachedTablet, + cached_tablet: SharedReadTablet, } impl Deref for CachedReadDelegate @@ -335,36 +465,20 @@ where } } -impl ReadExecutor for CachedReadDelegate -where - E: KvEngine, -{ - type Tablet = E; - - fn get_tablet(&mut self) -> &E { - self.cached_tablet.latest().unwrap() - } - - fn get_snapshot(&mut self, _: &Option>) -> Arc { - Arc::new(self.cached_tablet.latest().unwrap().snapshot()) - } -} - #[derive(Clone)] struct StoreMetaDelegate where E: KvEngine, { - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, } impl StoreMetaDelegate where E: KvEngine, { - pub fn new(store_meta: Arc>, reg: TabletRegistry) -> StoreMetaDelegate { - StoreMetaDelegate { store_meta, reg } + pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta } } } @@ -373,7 +487,7 @@ where E: KvEngine, { type Executor = CachedReadDelegate; - type StoreMeta = Arc>; + type StoreMeta = Arc>>; fn store_id(&self) -> Option { Some(self.store_meta.as_ref().lock().unwrap().store_id) @@ -384,14 +498,13 @@ where fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { let meta = self.store_meta.as_ref().lock().unwrap(); let reader = meta.readers.get(®ion_id).cloned(); - if let Some(reader) = reader { + if let Some((reader, read_tablet)) = reader { // If reader is not None, cache must not be None. 
- let cached_tablet = self.reg.get(region_id).unwrap(); return ( meta.readers.len(), Some(CachedReadDelegate { delegate: Arc::new(reader), - cached_tablet, + cached_tablet: read_tablet, }), ); } @@ -399,13 +512,19 @@ where } } +enum ReadRequestPolicy { + StaleRead, + ReadLocal, + ReadIndex, +} + struct SnapRequestInspector<'r> { delegate: &'r ReadDelegate, logger: &'r Logger, } impl<'r> SnapRequestInspector<'r> { - fn inspect(&mut self, req: &RaftCmdRequest) -> Result { + fn inspect(&mut self, req: &RaftCmdRequest) -> Result { assert!(!req.has_admin_request()); if req.get_requests().len() != 1 || req.get_requests().first().unwrap().get_cmd_type() != CmdType::Snap @@ -417,26 +536,26 @@ impl<'r> SnapRequestInspector<'r> { let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { - return Ok(RequestPolicy::StaleRead); + return Ok(ReadRequestPolicy::StaleRead); } if req.get_header().get_read_quorum() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // If applied index's term differs from current raft's term, leader transfer // must happened, if read locally, we may read old value. if !self.has_applied_to_current_term() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // Local read should be performed, if and only if leader is in lease. // None for now. match self.inspect_lease() { - LeaseState::Valid => Ok(RequestPolicy::ReadLocal), + LeaseState::Valid => Ok(ReadRequestPolicy::ReadLocal), LeaseState::Expired | LeaseState::Suspect => { // Perform a consistent read to Raft quorum and try to renew the leader lease. 
- Ok(RequestPolicy::ReadIndex) + Ok(ReadRequestPolicy::ReadIndex) } } } @@ -480,12 +599,13 @@ mod tests { thread::{self, JoinHandle}, }; + use collections::HashSet; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; + use engine_traits::{MiscExt, SyncMutable, TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use pd_client::BucketMeta; @@ -505,17 +625,27 @@ mod tests { #[derive(Clone)] struct MockRouter { p_router: SyncSender<(u64, PeerMsg)>, + addresses: Arc>>, } impl MockRouter { - fn new() -> (MockRouter, Receiver<(u64, PeerMsg)>) { + fn new(addresses: Arc>>) -> (MockRouter, Receiver<(u64, PeerMsg)>) { let (p_ch, p_rx) = sync_channel(1); - (MockRouter { p_router: p_ch }, p_rx) + ( + MockRouter { + p_router: p_ch, + addresses, + }, + p_rx, + ) } } impl MsgRouter for MockRouter { fn send(&self, addr: u64, cmd: PeerMsg) -> std::result::Result<(), TrySendError> { + if !self.addresses.lock().unwrap().contains(&addr) { + return Err(TrySendError::Disconnected(cmd)); + } self.p_router.send((addr, cmd)).unwrap(); Ok(()) } @@ -524,16 +654,15 @@ mod tests { #[allow(clippy::type_complexity)] fn new_reader( store_id: u64, - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, + addresses: Arc>>, ) -> ( LocalReader, Receiver<(u64, PeerMsg)>, ) { - let (ch, rx) = MockRouter::new(); + let (ch, rx) = MockRouter::new(addresses); let mut reader = LocalReader::new( store_meta, - reg, ch, Logger::root(slog::Discard, o!("key1" => "value1")), ); @@ -607,7 +736,8 @@ mod tests { let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); - let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); + let addresses: Arc>> 
= Arc::default(); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), addresses.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -649,9 +779,11 @@ mod tests { ); // No msg will ben sent rx.try_recv().unwrap_err(); + // It will be rejected first when processing local, and then rejected when + // trying to forward to raftstore. assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), - 1 + 2 ); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), @@ -683,10 +815,11 @@ mod tests { track_ver: TrackVer::new(), bucket_meta: Some(bucket_meta.clone()), }; - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let ctx = TabletContext::new(®ion1, Some(10)); - reg.load(ctx, true).unwrap(); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + meta.readers.insert(1, (read_delegate, shared)); } let (ch_tx, ch_rx) = sync_channel(1); @@ -701,6 +834,7 @@ mod tests { meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::applied_term(term6)); }), rx, @@ -710,6 +844,7 @@ mod tests { // The first try will be rejected due to unmatched applied term but after update // the applied term by the above thread, the snapshot will be acquired by // retrying. + addresses.lock().unwrap().insert(1); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); assert!(Arc::ptr_eq( @@ -730,14 +865,16 @@ mod tests { // Case: Expire lease to make the local reader lease check fail. 
lease.expire_remote_lease(); let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let meta = store_meta.clone(); // Send what we want to do to mock raftstore mix_tx .send(( Box::new(move || { - let mut meta = store_meta.lock().unwrap(); + let mut meta = meta.lock().unwrap(); meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::leader_lease(remote)); }), rx, @@ -757,6 +894,25 @@ mod tests { ); rx = ch_rx.recv().unwrap(); + // Case: Tablet miss should triger retry. + { + let ctx = TabletContext::new(®ion1, Some(15)); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + let mut meta = store_meta.lock().unwrap(); + meta.readers.get_mut(&1).unwrap().1 = shared; + } + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Tablet miss should trigger reload tablet, so cache miss should increase. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 6 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + // Case: Read quorum. 
let mut cmd_read_quorum = cmd.clone(); cmd_read_quorum.mut_header().set_read_quorum(true); @@ -806,8 +962,7 @@ mod tests { let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1))), reg.clone()); + let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1)))); let tablet1; let tablet2; @@ -816,43 +971,46 @@ mod tests { // Create read_delegate with region id 1 let read_delegate = ReadDelegate::mock(1); - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let mut ctx = TabletContext::with_infinite_region(1, Some(10)); reg.load(ctx, true).unwrap(); tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); + let shared1 = SharedReadTablet::new(tablet1.clone()); + meta.readers.insert(1, (read_delegate, shared1)); // Create read_delegate with region id 2 let read_delegate = ReadDelegate::mock(2); - meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data ctx = TabletContext::with_infinite_region(2, Some(10)); reg.load(ctx, true).unwrap(); tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); + let shared2 = SharedReadTablet::new(tablet2.clone()); + meta.readers.insert(2, (read_delegate, shared2)); } let (_, delegate) = store_meta.get_executor_and_len(1); let mut delegate = delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet1.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val1".to_vec(), - *snapshot.get_value(b"a1").unwrap().unwrap() - ); + let path1 = tablet.path().to_owned(); + delegate.cached_tablet.release(); let (_, delegate) = store_meta.get_executor_and_len(2); let mut delegate = 
delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet2.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val2".to_vec(), - *snapshot.get_value(b"a2").unwrap().unwrap() - ); + + assert!(KvTestEngine::locked(&path1).unwrap()); + drop(tablet1); + drop(reg); + assert!(KvTestEngine::locked(&path1).unwrap()); + store_meta.store_meta.lock().unwrap().readers.remove(&1); + assert!(!KvTestEngine::locked(&path1).unwrap()); } } diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 59c6f2d0f7c..f26659c7b89 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -46,7 +46,7 @@ mod lease; mod local; mod replica; -pub(crate) use self::local::LocalReader; +pub(crate) use self::local::{LocalReader, ReadDelegatePair, SharedReadTablet}; impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> PeerFsmDelegate<'a, EK, ER, T> @@ -436,7 +436,7 @@ impl Peer { } let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 8a0e0770b1f..fe4208db549 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -306,7 +306,8 @@ impl Peer { let mut update_lease = self.is_leader(); if update_lease { for entry in committed_entries.iter().rev() { - self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); + self.compact_log_context_mut() + 
.add_log_size(entry.get_data().len() as u64); if update_lease { let propose_time = self .proposals() @@ -329,7 +330,8 @@ impl Peer { } let applying_index = committed_entries.last().unwrap().index; let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); - *self.last_applying_index_mut() = applying_index; + self.compact_log_context_mut() + .set_last_applying_index(applying_index); if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. self.entry_storage_mut().evict_entry_cache(false); @@ -426,7 +428,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); + self.on_advance_persisted_apply_index(ctx, prev_persisted, Some(&mut write_task)); if !ready.persisted_messages().is_empty() { write_task.messages = ready @@ -612,9 +614,11 @@ impl Peer { // leader apply the split command or an election timeout is passed since split // is committed. We already forbid renewing lease after committing split, and // original leader will update the reader delegate with latest epoch after - // applying split before the split peer starts campaign, so here the only thing - // we need to do is marking split is committed (which is done by `commit_to` - // above). It's correct to allow local read during split. + // applying split before the split peer starts campaign, so what needs to be + // done are 1. mark split is committed, which is done by `commit_to` above, + // 2. make sure split result is invisible until epoch is updated or reader may + // miss data from the new tablet. This is done by always publish tablet in + // `on_apply_res_split`. So it's correct to allow local read during split. 
// // - For merge, after the prepare merge command is committed, the target peers // may apply commit merge at any time, so we need to forbid any type of read diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 76a5b4297b3..2e1b9362a69 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -45,7 +45,7 @@ use tikv_util::box_err; use crate::{ fsm::ApplyResReporter, - operation::command::temp_split_path, + operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, Result, StoreContext, }; @@ -197,7 +197,8 @@ impl Peer { StateRole::Follower, ); let persisted_index = self.persisted_index(); - *self.last_applying_index_mut() = persisted_index; + self.compact_log_context_mut() + .set_last_applying_index(persisted_index); let snapshot_index = self.entry_storage().truncated_index(); assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); // If leader sends a message append to the follower while it's applying @@ -211,18 +212,41 @@ impl Peer { let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); - ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.record_tablet_as_tombstone_and_refresh(snapshot_index, ctx); + let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); + assert!( + path.exists(), + "{:?} {} not exists", + self.logger.list(), + path.display() + ); + let tablet = ctx + .tablet_registry + .tablet_factory() + .open_tablet(tablet_ctx, &path) + .unwrap_or_else(|e| { + panic!( + "{:?} failed to load tablet at {}: {:?}", + self.logger.list(), + path.display(), + e + ); + }); + + let prev_persisted_applied = self.storage().apply_trace().persisted_apply_index(); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(snapshot_index); + let read_tablet = SharedReadTablet::new(tablet.clone()); { let mut meta = ctx.store_meta.lock().unwrap(); meta.set_region(self.region(), true, &self.logger); meta.readers - .insert(region_id, self.generate_read_delegate()); + .insert(region_id, (self.generate_read_delegate(), read_tablet)); meta.region_read_progress .insert(region_id, self.read_progress().clone()); } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(ctx, tablet, snapshot_index); + } self.read_progress_mut().update_applied_core(snapshot_index); let split = self.storage_mut().split_init_mut().take(); if split.as_ref().map_or(true, |s| { @@ -234,6 +258,7 @@ impl Peer { info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } + self.on_advance_persisted_apply_index(ctx, prev_persisted_applied, None); self.schedule_apply_fsm(ctx); } } @@ -506,7 +531,22 @@ impl Storage { let old_last_index = self.entry_storage().last_index(); if self.entry_storage().first_index() <= old_last_index { // All states are rewritten in the following blocks. Stale states will be - // cleaned up by compact worker. + // cleaned up by compact worker. 
Have to use raft write batch here becaue + // raft log engine expects deletes before writes. + let raft_engine = self.entry_storage().raft_engine(); + if task.raft_wb.is_none() { + task.raft_wb = Some(raft_engine.log_batch(64)); + } + let wb = task.raft_wb.as_mut().unwrap(); + raft_engine + .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) + .unwrap_or_else(|e| { + panic!( + "{:?} failed to clean up region: {:?}", + self.logger().list(), + e + ) + }); self.entry_storage_mut().clear(); } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 6818d7ae0d9..8660e4795d0 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,7 @@ use std::{mem, sync::Arc}; -use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; +use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, @@ -19,8 +19,6 @@ use crate::{ /// Apply applies all the committed commands to kv db. pub struct Apply { peer: metapb::Peer, - /// publish the update of the tablet - remote_tablet: CachedTablet, tablet: EK, pub write_batch: Option, /// A buffer for encoding key. @@ -79,7 +77,6 @@ impl Apply { Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), - remote_tablet, write_batch: None, callbacks: vec![], tombstone: false, @@ -155,13 +152,16 @@ impl Apply { &mut self.region_state } - /// Publish the tablet so that it can be used by read worker. - /// - /// Note, during split/merge, lease is expired explicitly and read is - /// forbidden. So publishing it immediately is OK. + /// The tablet can't be public yet, otherwise content of latest tablet + /// doesn't matches its epoch in both readers and peer fsm. 
#[inline] - pub fn publish_tablet(&mut self, tablet: EK) { - self.remote_tablet.set(tablet.clone()); + pub fn set_tablet(&mut self, tablet: EK) { + assert!( + self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), + "{:?}", + self.logger.list() + ); + self.write_batch.take(); self.tablet = tablet; } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index f3734b6821d..bc3d8a5af8e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -30,10 +30,10 @@ use crate::{ batch::StoreContext, fsm::ApplyScheduler, operation::{ - AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, + SplitFlowControl, }, router::{CmdResChannel, PeerTick, QueryResChannel}, - worker::tablet_gc, Result, }; @@ -43,11 +43,6 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, - /// Tombstone tablets can only be destroyed when the tablet that replaces it - /// is persisted. This is a list of tablet index that awaits to be - /// persisted. When persisted_apply is advanced, we need to notify tablet_gc - /// worker to destroy them. - pending_tombstone_tablets: Vec, /// Statistics for self. self_stat: PeerStat, @@ -60,9 +55,7 @@ pub struct Peer { peer_heartbeats: HashMap, /// For raft log compaction. - skip_compact_log_ticks: usize, - approximate_raft_log_size: u64, - last_applying_index: u64, + compact_log_context: CompactLogContext, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
@@ -151,13 +144,10 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, - pending_tombstone_tablets: Vec::new(), self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - skip_compact_log_ticks: 0, - approximate_raft_log_size: 0, - last_applying_index: raft_group.store().apply_state().get_applied_index(), + compact_log_context: CompactLogContext::new(applied_index), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -346,41 +336,18 @@ impl Peer { } #[inline] - pub fn record_tablet_as_tombstone_and_refresh( - &mut self, - new_tablet_index: u64, - ctx: &StoreContext, - ) { - if let Some(old_tablet) = self.tablet.cache() { - self.pending_tombstone_tablets.push(new_tablet_index); - let _ = ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::prepare_destroy( - old_tablet.clone(), - self.region_id(), - new_tablet_index, - )); - } - // TODO: Handle race between split and snapshot. So that we can assert - // `self.tablet.refresh() == 1` - assert!(self.tablet.refresh() > 0); + pub fn set_tablet(&mut self, tablet: EK) -> Option { + self.tablet.set(tablet) } - /// Returns if there's any tombstone being removed. 
#[inline] - pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { - let removed = self - .pending_tombstone_tablets - .iter() - .take_while(|i| **i <= persisted) - .count(); - if removed > 0 { - self.pending_tombstone_tablets.drain(..removed); - true - } else { - false - } + pub fn compact_log_context_mut(&mut self) -> &mut CompactLogContext { + &mut self.compact_log_context + } + + #[inline] + pub fn compact_log_context(&self) -> &CompactLogContext { + &self.compact_log_context } #[inline] @@ -543,35 +510,6 @@ impl Peer { down_peers } - #[inline] - pub fn reset_skip_compact_log_ticks(&mut self) { - self.skip_compact_log_ticks = 0; - } - - #[inline] - pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { - if self.skip_compact_log_ticks < max_skip_ticks { - self.skip_compact_log_ticks += 1; - true - } else { - false - } - } - - #[inline] - pub fn approximate_raft_log_size(&self) -> u64 { - self.approximate_raft_log_size - } - - #[inline] - pub fn update_approximate_raft_log_size(&mut self, f: impl Fn(u64) -> u64) { - self.approximate_raft_log_size = f(self.approximate_raft_log_size); - } - - pub fn last_applying_index_mut(&mut self) -> &mut u64 { - &mut self.last_applying_index - } - #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 51bd41ba253..b0eec5a196c 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -307,8 +307,9 @@ mod tests { }; use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, - TabletSnapKey, TabletSnapManager, WriteTask, + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, FetchedLogs, GenSnapRes, + ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, }; 
use slog::o; use tempfile::TempDir; @@ -357,14 +358,20 @@ mod tests { region } + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.set_index(index); + e.set_term(term); + e + } + #[test] fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); - let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) - .unwrap(); + let engines = engine_test::new_temp_engine(&path); + let raft_engine = engines.raft.clone(); let mut wb = raft_engine.log_batch(10); write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); @@ -381,26 +388,57 @@ mod tests { .unwrap() .unwrap(); - let snapshot = new_empty_snapshot(region.clone(), 10, 9, false); - let mut task = WriteTask::new(region.get_id(), 5, 0); - s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); + let mut task = WriteTask::new(region.get_id(), 5, 1); + let entries = (RAFT_INIT_LOG_INDEX + 1..RAFT_INIT_LOG_INDEX + 10) + .map(|i| new_entry(i, RAFT_INIT_LOG_TERM)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + write_to_db_for_test(&engines, task); + + let snap_index = RAFT_INIT_LOG_INDEX + 20; + let snap_term = 9; + let path = mgr.final_recv_path(&TabletSnapKey::new( + region.get_id(), + 5, + snap_term, + snap_index, + )); + reg.tablet_factory() + .open_tablet(TabletContext::new(®ion, Some(snap_index)), &path) + .unwrap(); + let snapshot = new_empty_snapshot(region.clone(), snap_index, snap_term, false); + let mut task = WriteTask::new(region.get_id(), 5, 1); + s.apply_snapshot(&snapshot, &mut task, mgr, reg.clone()) + .unwrap(); + // Add more entries to check if old entries are cleared. If not, it should panic + // with memtable hole when using raft engine. 
+ let entries = (snap_index + 1..=snap_index + 10) + .map(|i| new_entry(i, snap_term)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + + assert!(!reg.tablet_path(region.get_id(), snap_index).exists()); + assert!(!task.persisted_cbs.is_empty()); + + write_to_db_for_test(&engines, task); + + assert!(reg.tablet_path(region.get_id(), snap_index).exists()); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); - assert_eq!(10, s.entry_storage().truncated_index()); - assert_eq!(9, s.entry_storage().truncated_term()); - assert_eq!(9, s.entry_storage().last_term()); - assert_eq!(10, s.entry_storage().raft_state().last_index); + assert_eq!(snap_index, s.entry_storage().truncated_index()); + assert_eq!(snap_term, s.entry_storage().truncated_term()); + assert_eq!(snap_term, s.entry_storage().last_term()); + assert_eq!(snap_index + 10, s.entry_storage().raft_state().last_index); // This index can't be set before load tablet. - assert_ne!(10, s.entry_storage().applied_index()); - assert_ne!(9, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); - assert!(!task.persisted_cbs.is_empty()); + assert_ne!(snap_index, s.entry_storage().applied_index()); + assert_ne!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); s.on_applied_snapshot(); - assert_eq!(10, s.entry_storage().applied_index()); - assert_eq!(9, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); + assert_eq!(snap_index, s.entry_storage().applied_index()); + assert_eq!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); } #[test] diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 668d7591a40..7a10c6c6b16 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -6,7 +6,7 
@@ use std::{ }; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, @@ -115,13 +115,13 @@ where } impl RaftRouter { - pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { + pub fn new(store_id: u64, router: StoreRouter) -> Self { let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let logger = router.logger().clone(); RaftRouter { router: router.clone(), - local_reader: LocalReader::new(store_meta, reg, router, logger), + local_reader: LocalReader::new(store_meta, router, logger), } } @@ -138,7 +138,7 @@ impl RaftRouter { self.router.check_send(addr, msg) } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { self.local_reader.store_meta() } diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index cc1fcd971e9..aba477f883f 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -9,7 +9,7 @@ use std::{ use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; use kvproto::metapb::Region; -use slog::{error, warn, Logger}; +use slog::{debug, error, warn, Logger}; use tikv_util::worker::{Runnable, RunnableWithTimer}; pub enum Task { @@ -156,10 +156,15 @@ impl Runner { "path" => path.display(), ), Ok(false) => { + let (_, region_id, tablet_index) = + registry.parse_tablet_name(path).unwrap_or(("", 0, 0)); // TODO: use a meaningful table context. 
let _ = registry .tablet_factory() - .destroy_tablet(TabletContext::with_infinite_region(0, None), path) + .destroy_tablet( + TabletContext::with_infinite_region(region_id, Some(tablet_index)), + path, + ) .map_err(|e| { warn!( logger, @@ -170,7 +175,9 @@ impl Runner { }); return true; } - _ => {} + Ok(true) => { + debug!(logger, "ignore locked tablet"; "path" => path.display()); + } } false } @@ -222,6 +229,6 @@ where } fn get_interval(&self) -> Duration { - Duration::from_secs(2) + Duration::from_secs(10) } } diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs index 79356ae5805..e67041ab181 100644 --- a/components/raftstore-v2/tests/failpoints/test_split.rs +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -82,6 +82,9 @@ fn test_restart_resume() { .new_request_for(split_region_id) .take_header() .take_region_epoch(); + // Split will be resumed for region 2, not removing the fp will make write block + // forever. 
+ fail::remove(fp); let timer = Instant::now(); for (region_id, key, val) in cases { let mut put = SimpleWriteEncoder::with_capacity(64); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 4c025a0fc85..ce0248130fb 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -276,7 +276,7 @@ impl RunningState { factory.open_tablet(ctx, &path).unwrap(); } - let router = RaftRouter::new(store_id, registry.clone(), router); + let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 56d0f93a11d..817ff576f67 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -912,7 +912,6 @@ where } /// Used for test to write task to kv db and raft db. 
-#[cfg(test)] pub fn write_to_db_for_test( engines: &engine_traits::Engines, task: WriteTask, @@ -922,7 +921,8 @@ pub fn write_to_db_for_test( { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(&engines.raft, task); - batch.before_write_to_db(&StoreWriteMetrics::new(false)); + let metrics = StoreWriteMetrics::new(false); + batch.before_write_to_db(&metrics); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); @@ -939,6 +939,8 @@ pub fn write_to_db_for_test( }); } } + batch.after_write_to_raft_db(&metrics); + batch.after_write_all(); } #[cfg(test)] diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 62561c63cbc..42fb320035b 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -31,7 +31,10 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, - write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, + write::{ + write_to_db_for_test, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, + WriteTask, + }, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a8fc2e6e3df..a7849f5e1dd 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -286,7 +286,7 @@ impl Drop for ReadDelegate { /// #[RaftstoreCommon] pub trait ReadExecutorProvider: Send + Clone + 'static { - type Executor: ReadExecutor; + type Executor; type StoreMeta; fn store_id(&self) -> Option; @@ -687,11 +687,7 @@ where /// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the /// relevant regions by LocalReader so that these 
requests can be handled by the /// relevant ReadDelegate respectively. -pub struct LocalReaderCore -where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, -{ +pub struct LocalReaderCore { pub store_id: Cell>, store_meta: S, pub delegates: LruCache, @@ -699,7 +695,7 @@ where impl LocalReaderCore where - D: ReadExecutor + Deref + Clone, + D: Deref + Clone, S: ReadExecutorProvider, { pub fn new(store_meta: S) -> Self { @@ -827,8 +823,7 @@ where impl Clone for LocalReaderCore where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, + S: Clone, { fn clone(&self) -> Self { LocalReaderCore { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 4d4e283ea7e..5d037fa3412 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1415,12 +1415,12 @@ impl TikvServer { raft_engine.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - registry.clone(), + registry, raft_engine.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); - let router = RaftRouter::new(node.id(), registry, router); + let router = RaftRouter::new(node.id(), router); let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( router.store_router().clone(), self.config.coprocessor.clone(), diff --git a/src/config/mod.rs b/src/config/mod.rs index c78ec02182f..d2c5941c5ec 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -344,7 +344,7 @@ macro_rules! cf_config { #[online_config(skip)] pub enable_doubly_skiplist: bool, #[online_config(skip)] - pub enable_compaction_guard: bool, + pub enable_compaction_guard: Option, #[online_config(skip)] pub compaction_guard_min_output_file_size: ReadableSize, #[online_config(skip)] @@ -596,7 +596,7 @@ macro_rules! 
build_cf_opt { if $opt.enable_doubly_skiplist { cf_opts.set_doubly_skiplist(); } - if $opt.enable_compaction_guard { + if $opt.enable_compaction_guard.unwrap_or(false) { if let Some(provider) = $region_info_provider { let factory = CompactionGuardGeneratorFactory::new( $cf_name, @@ -671,7 +671,7 @@ impl Default for DefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -796,7 +796,7 @@ impl Default for WriteCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -902,7 +902,7 @@ impl Default for LockCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -985,7 +985,7 @@ impl Default for RaftCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: 
ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -1218,6 +1218,8 @@ impl DbConfig { match engine { EngineType::RaftKv => { self.allow_concurrent_memtable_write.get_or_insert(true); + self.defaultcf.enable_compaction_guard.get_or_insert(true); + self.writecf.enable_compaction_guard.get_or_insert(true); } EngineType::RaftKv2 => { self.enable_multi_batch_write.get_or_insert(false); @@ -1475,7 +1477,7 @@ impl Default for RaftDefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -5203,7 +5205,7 @@ mod tests { // Test comopaction guard disabled. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: false, + enable_compaction_guard: Some(false), ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); @@ -5216,7 +5218,7 @@ mod tests { // Test compaction guard enabled but region info provider is missing. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), ..Default::default() }; let provider: Option = None; @@ -5229,7 +5231,7 @@ mod tests { // Test compaction guard enabled. 
let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(4), compaction_guard_max_output_file_size: ReadableSize::mb(64), ..Default::default() @@ -5541,22 +5543,27 @@ mod tests { cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. cfg.rocksdb.write_buffer_limit = None; + cfg.rocksdb.defaultcf.enable_compaction_guard = None; cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.enable_compaction_guard = None; cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; cfg.rocksdb.writecf.level0_stop_writes_trigger = None; cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.enable_compaction_guard = None; cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.enable_compaction_guard = None; cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; + cfg.raftdb.defaultcf.enable_compaction_guard = None; cfg.raftdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; diff --git 
a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index c6f8e565218..0c6cf7cdd9c 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -359,7 +359,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: false, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -428,7 +428,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Zstd, @@ -497,7 +497,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -566,7 +566,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -650,7 +650,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: 
true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, From 8aef20c019c969d5f7984d0ea953c0678f98cd95 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 17:36:19 +0800 Subject: [PATCH 443/676] *: introduce slog_panic and SlogFormat (#14014) ref tikv/tikv#12842 These two are helpers to utilize the static KV pairs in logger. In the past, we use `logger.list()` to try to format the configured KV pairs, but it will not work as values are omitted. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 7 +- components/raftstore-v2/src/fsm/store.rs | 14 +- .../operation/command/admin/conf_change.rs | 20 +-- .../src/operation/command/admin/mod.rs | 11 +- .../src/operation/command/admin/split.rs | 49 +++--- .../raftstore-v2/src/operation/command/mod.rs | 16 +- .../src/operation/command/write/mod.rs | 27 ++-- .../operation/command/write/simple_write.rs | 13 +- components/raftstore-v2/src/operation/pd.rs | 9 +- .../raftstore-v2/src/operation/query/mod.rs | 7 +- .../src/operation/ready/apply_trace.rs | 11 +- .../src/operation/ready/async_writer.rs | 23 +-- .../raftstore-v2/src/operation/ready/mod.rs | 16 +- .../src/operation/ready/snapshot.rs | 17 +-- components/raftstore-v2/src/raft/apply.rs | 6 +- components/tikv_util/src/log.rs | 142 ++++++++++++++++++ 16 files changed, 272 insertions(+), 116 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 9ba7a63139c..e25ad53df8b 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -34,6 +34,7 @@ use slog::{warn, Logger}; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, + log::SlogFormat, sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, @@ -339,9 +340,9 @@ impl 
StorePollerBuilder { let prev = regions.insert(region_id, (sender, peer_fsm)); if let Some((_, p)) = prev { return Err(box_err!( - "duplicate region {:?} vs {:?}", - p.logger().list(), - regions[®ion_id].1.logger().list() + "duplicate region {} vs {}", + SlogFormat(p.logger()), + SlogFormat(regions[®ion_id].1.logger()) )); } Ok(()) diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index a5f22d7e1a8..86e3540d23c 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -17,7 +17,9 @@ use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, is_zero_duration, + log::SlogFormat, mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, }; use crate::{ @@ -60,12 +62,12 @@ impl StoreMeta { .insert(region_id, (region.clone(), initialized)); // `prev` only makes sense when it's initialized. if let Some((prev, prev_init)) = prev && prev_init { - assert!(initialized, "{:?} region corrupted", logger.list()); + assert!(initialized, "{} region corrupted", SlogFormat(logger)); if prev.get_region_epoch().get_version() != version { let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); - assert_eq!(prev_id, Some(region_id), "{:?} region corrupted", logger.list()); + assert_eq!(prev_id, Some(region_id), "{} region corrupted", SlogFormat(logger)); } else { - assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{:?} region corrupted", logger.list()); + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{} region corrupted", SlogFormat(logger)); return; } } @@ -74,8 +76,8 @@ impl StoreMeta { self.region_ranges .insert((data_end_key(region.get_end_key()), version), region_id) .is_none(), - "{:?} region corrupted", - logger.list() + "{} region corrupted", + SlogFormat(logger) ); } } @@ -216,7 +218,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> 
StoreFsmDelegate<'a, EK, ER, T> { fn on_start(&mut self) { if self.fsm.store.start_time.is_some() { - panic!("{:?} unable to start again", self.fsm.store.logger.list(),); + slog_panic!(self.fsm.store.logger, "store is already started"); } self.fsm.store.start_time = Some( diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 72b582d775d..6c041a551fe 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -27,7 +27,7 @@ use raftstore::{ Error, Result, }; use slog::{error, info, warn}; -use tikv_util::box_err; +use tikv_util::{box_err, slog_panic}; use super::AdminCmdResult; use crate::{ @@ -312,10 +312,10 @@ impl Apply { change_num += 1; } if change_num == 0 { - panic!( - "{:?} can't leave a non-joint config, region: {:?}", - self.logger.list(), - self.region_state() + slog_panic!( + self.logger, + "can't leave a non-joint config"; + "region" => ?self.region_state() ); } let conf_ver = region.get_region_epoch().get_conf_ver() + change_num; @@ -433,11 +433,11 @@ impl Apply { if let Some(exist_peer) = tikv_util::store::find_peer(region, store_id) { let r = exist_peer.get_role(); if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { - panic!( - "{:?} can't apply confchange because configuration is still in joint state, confchange: {:?}, region: {:?}", - self.logger.list(), - cp, - self.region_state() + slog_panic!( + self.logger, + "can't apply confchange because configuration is still in joint state"; + "confchange" => ?cp, + "region_state" => ?self.region_state() ); } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 9ceaa76c03b..52bc5329dd4 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -15,7 +15,7 @@ use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; @@ -43,7 +43,10 @@ impl Peer { return; } if !req.has_admin_request() { - let e = box_err!("{:?} expect only execute admin command", self.logger.list()); + let e = box_err!( + "{} expect only execute admin command", + SlogFormat(&self.logger) + ); let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -67,8 +70,8 @@ impl Peer { // checker. if !self.applied_to_current_term() { let e = box_err!( - "{:?} peer has not applied to current term, applied_term {}, current_term {}", - self.logger.list(), + "{} peer has not applied to current term, applied_term {}, current_term {}", + SlogFormat(&self.logger), self.storage().entry_storage().applied_term(), self.term() ); diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index add5af1ce52..23fc6e3a8d9 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -54,6 +54,7 @@ use raftstore::{ Result, }; use slog::info; +use tikv_util::{log::SlogFormat, slog_panic}; use crate::{ batch::StoreContext, @@ -330,10 +331,10 @@ impl Apply { // We will freeze the memtable rather than flush it in the following PR. 
let tablet = self.tablet().clone(); let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint object: {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e ) }); @@ -348,11 +349,11 @@ impl Apply { checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - split_temp_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %split_temp_path.display(), + "error" => ?e ) }); } @@ -366,11 +367,11 @@ impl Apply { checkpointer .create_at(&derived_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - derived_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %derived_path.display(), + "error" => ?e ) }); } @@ -505,10 +506,10 @@ impl Peer { .router .force_send_control(StoreMsg::SplitInit(msg)) .unwrap_or_else(|e| { - panic!( - "{:?} fails to send split peer intialization msg to store : {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to send split peer intialization msg to store"; + "error" => ?e, ) }); } @@ -556,11 +557,11 @@ impl Peer { let res = self.raft_group_mut().step(msg); let accept_snap = self.raft_group().snap().is_some(); if res.is_err() || !accept_snap { - panic!( - "{:?} failed to accept snapshot {:?} with error {}", - self.logger.list(), - res, - accept_snap + slog_panic!( + self.logger, + "failed to accept snapshot"; + "accept_snapshot" => accept_snap, + "res" => ?res, ); } let prev = self.storage_mut().split_init_mut().replace(split_init); @@ -610,7 +611,7 @@ impl Peer { break; } } - assert!(found, "{:?} {}", self.logger.list(), region_id); + assert!(found, "{} {}", SlogFormat(&self.logger), region_id); let split_trace = self.split_trace_mut(); let mut off = 0; let mut 
admin_flushed = 0; diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 8b0d3d7d461..439d2136d76 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -41,7 +41,7 @@ use raftstore::{ }; use slog::{info, warn}; use tikv_util::{ - box_err, + box_err, slog_panic, time::{duration_to_sec, monotonic_raw_now, Instant}, }; @@ -71,12 +71,12 @@ fn parse_at(logger: &slog::Logger, buf: &[u8], index: u64, let mut m = M::default(); match m.merge_from_bytes(buf) { Ok(()) => m, - Err(e) => panic!( - "{:?} data is corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data is corrupted"; + "term" => term, + "index" => index, + "error" => ?e, ), } } @@ -555,7 +555,7 @@ impl Apply { if let Err(e) = wb.write_callback_opt(&write_opt, || { flush_state.set_applied_index(index); }) { - panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); + slog_panic!(self.logger, "failed to write data"; "error" => ?e); } self.metrics.written_bytes += wb.data_size() as u64; self.metrics.written_keys += wb.count() as u64; diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index af806e3024e..14011d6fc1b 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -11,6 +11,7 @@ use raftstore::{ }, Result, }; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -150,13 +151,13 @@ impl Apply { .put_cf(cf, &self.key_buffer, value) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to write ({}, {}) {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - log_wrappers::Value::value(value), - cf, - e + slog_panic!( + self.logger, + "failed to write"; + "key" => %log_wrappers::Value::key(key), + "value" => 
%log_wrappers::Value::value(value), + "cf" => cf, + "error" => ?e ); }); fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( @@ -188,12 +189,12 @@ impl Apply { .delete_cf(cf, &self.key_buffer) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to delete {} {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - cf, - e + slog_panic!( + self.logger, + "failed to delete"; + "key" => %log_wrappers::Value::key(key), + "cf" => cf, + "error" => ?e ); }); self.metrics.size_diff_hint -= self.key_buffer.len() as i64; diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 57c01fca9d8..e6f81b20af1 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -5,6 +5,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; +use tikv_util::slog_panic; use crate::{operation::command::parse_at, router::CmdResChannel}; @@ -191,12 +192,12 @@ impl<'a> SimpleWriteReqDecoder<'a> { let mut is = CodedInputStream::from_bytes(&buf[1..]); let header = match is.read_message() { Ok(h) => h, - Err(e) => panic!( - "{:?} data corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data corrupted"; + "term" => term, + "index" => index, + "error" => ?e ), }; let read = is.pos(); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 894f39f278b..50b612f207d 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -7,6 +7,7 @@ use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::Transport; use slog::error; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -137,10 +138,10 @@ impl Peer 
{ pending_peers.push(p); } else { if ctx.cfg.dev_assert { - panic!( - "{:?} failed to get peer {} from cache", - self.logger.list(), - id + slog_panic!( + self.logger, + "failed to get peer from cache"; + "get_peer_id" => id ); } error!( diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index f26659c7b89..305cdb666cc 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -30,7 +30,7 @@ use raftstore::{ Error, Result, }; use slog::{debug, info}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{ @@ -363,7 +363,10 @@ impl Peer { } } StatusCmdType::InvalidStatus => { - return Err(box_err!("{:?} invalid status command!", self.logger.list())); + return Err(box_err!( + "{} invalid status command!", + SlogFormat(&self.logger) + )); } } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 1e9d1ef4221..5ff9a27dee0 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -41,7 +41,7 @@ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::{trace, Logger}; -use tikv_util::{box_err, worker::Scheduler}; +use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ operation::{ @@ -444,11 +444,10 @@ impl Storage { return; } } - panic!( - "{:?} data loss detected: {}_{} not found", - self.logger().list(), - region_id, - tablet_index + slog_panic!( + self.logger(), + "tablet loss detected"; + "tablet_index" => tablet_index ); } diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index a2707b6d411..96f1611d9f1 100644 --- 
a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -9,6 +9,7 @@ use raftstore::store::{ WriteSenders, WriteTask, }; use slog::{warn, Logger}; +use tikv_util::slog_panic; use crate::{ batch::{StoreContext, StoreRouter}, @@ -117,11 +118,11 @@ impl AsyncWriter { let last_unpersisted = self.unpersisted_readies.back(); if last_unpersisted.map_or(true, |u| u.number < ready_number) { - panic!( - "{:?} ready number is too large {:?} vs {}", - logger.list(), - last_unpersisted, - ready_number + slog_panic!( + logger, + "ready number is too large"; + "last_unpersisted" => ?last_unpersisted, + "ready_number" => ready_number ); } @@ -130,15 +131,15 @@ impl AsyncWriter { // There must be a match in `self.unpersisted_readies`. loop { let Some(v) = self.unpersisted_readies.pop_front() else { - panic!("{:?} ready number not found {}", logger.list(), ready_number); + slog_panic!(logger, "ready number not found"; "ready_number" => ready_number); }; has_snapshot |= v.has_snapshot; if v.number > ready_number { - panic!( - "{:?} ready number not matched {:?} vs {}", - logger.list(), - v, - ready_number + slog_panic!( + logger, + "ready number not matched"; + "ready" => ?v, + "ready_number" => ready_number ); } if raft_messages.is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index fe4208db549..29452533632 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -34,6 +34,8 @@ use raftstore::{ }; use slog::{debug, error, info, trace, warn}; use tikv_util::{ + log::SlogFormat, + slog_panic, store::find_peer, time::{duration_to_sec, monotonic_raw_now}, }; @@ -388,8 +390,8 @@ impl Peer { let prev_commit_index = self.entry_storage().commit_index(); assert!( hs.get_commit() >= prev_commit_index, - "{:?} {:?} {}", - self.logger.list(), + "{} {:?} {}", + 
SlogFormat(&self.logger), hs, prev_commit_index ); @@ -456,11 +458,11 @@ impl Peer { } } if !light_rd.messages().is_empty() || light_rd.commit_index().is_some() { - panic!( - "{:?} unexpected messages [{}] commit index [{:?}]", - self.logger.list(), - light_rd.messages().len(), - light_rd.commit_index() + slog_panic!( + self.logger, + "unexpected messages"; + "messages_count" => ?light_rd.messages().len(), + "commit_index" => ?light_rd.commit_index() ); } if !light_rd.committed_entries().is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 2e1b9362a69..8716f0c75ea 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -41,7 +41,7 @@ use raftstore::{ }, }; use slog::{error, info, warn}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat, slog_panic}; use crate::{ fsm::ApplyResReporter, @@ -554,8 +554,8 @@ impl Storage { let last_term = snap.get_metadata().get_term(); assert!( last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, - "{:?}", - self.logger().list() + "{}", + SlogFormat(self.logger()) ); let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); @@ -599,12 +599,11 @@ impl Storage { // it should load it into the factory after it persisted. 
let hook = move || { if !install_tablet(®, &path, region_id, last_index) { - panic!( - "{:?} failed to install tablet, path: {}, region_id: {}, tablet_index: {}", - logger.list(), - path.display(), - region_id, - last_index + slog_panic!( + logger, + "failed to install tablet"; + "path" => %path.display(), + "tablet_index" => last_index ); } if clean_split { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 8660e4795d0..2407d1ab3fe 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -9,7 +9,7 @@ use raftstore::store::{ ReadTask, }; use slog::Logger; -use tikv_util::worker::Scheduler; +use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ operation::{AdminCmdResult, DataTrace}, @@ -71,9 +71,9 @@ impl Apply { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) .unwrap(); - assert_ne!(applied_term, 0, "{:?}", logger.list()); + assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); let applied_index = flush_state.applied_index(); - assert_ne!(applied_index, 0, "{:?}", logger.list()); + assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index 10facfa2287..fd351eecbd4 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -82,3 +82,145 @@ macro_rules! debug(($($args:tt)+) => { macro_rules! 
trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); + +use std::fmt::{self, Display, Write}; + +use slog::{BorrowedKV, OwnedKVList, Record, KV}; + +struct FormatKeyValueList<'a, W> { + buffer: &'a mut W, + written: bool, +} + +impl<'a, W: Write> slog::Serializer for FormatKeyValueList<'a, W> { + fn emit_arguments(&mut self, key: slog::Key, val: &fmt::Arguments<'_>) -> slog::Result { + if !self.written { + write!(&mut self.buffer, "[{}={}]", key, val).unwrap(); + self.written = true; + } else { + write!(&mut self.buffer, " [{}={}]", key, val).unwrap() + } + Ok(()) + } +} + +/// A helper struct to format the key-value list of a slog logger. It's not +/// exact the same format as `TiKVFormat` and etc. It's just a simple +/// implementation for panic, return errors that doesn't show in normal logs +/// processing. +pub struct SlogFormat<'a>(pub &'a slog::Logger); + +impl<'a> Display for SlogFormat<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut formatter = FormatKeyValueList { + buffer: f, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + self.0 + .list() + .serialize( + &Record::new(&record, &format_args!(""), slog::b!()), + &mut formatter, + ) + .unwrap(); + Ok(()) + } +} + +#[doc(hidden)] +pub fn format_kv_list(buffer: &mut String, kv_list: &OwnedKVList, borrow_list: BorrowedKV<'_>) { + let mut formatter = FormatKeyValueList { + buffer, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + let args = format_args!(""); + let record = Record::new(&record, &args, slog::b!()); + // Serialize borrow list first to make region_id, peer_id at the end. + borrow_list.serialize(&record, &mut formatter).unwrap(); + kv_list.serialize(&record, &mut formatter).unwrap(); +} + +/// A helper macro to panic with the key-value list of a slog logger. +/// +/// Similar to `SlogFormat`, but just panic. +#[macro_export] +macro_rules! 
slog_panic { + ($logger:expr, $msg:expr, $borrowed_kv:expr) => {{ + let owned_kv = ($logger).list(); + let mut s = String::new(); + $crate::log::format_kv_list(&mut s, &owned_kv, $borrowed_kv); + if s.is_empty() { + panic!("{}", $msg) + } else { + panic!("{} {}", $msg, s) + } + }}; + ($logger:expr, $msg:expr) => {{ + $crate::slog_panic!($logger, $msg, slog::b!()) + }}; + ($logger:expr, $msg:expr; $($arg:tt)+) => {{ + $crate::slog_panic!($logger, $msg, slog::b!($($arg)+)) + }}; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_format_kv() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, String::new()); + + let logger = logger.new(slog::o!("a" => 1)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[b=2] [a=1]"); + } + + #[test] + fn test_slog_panic() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test"); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [k=v]"); + + let logger = logger.new(slog::o!("a" => 1)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!( + err.downcast::().unwrap().as_str(), + "test [b=2] [a=1]" + ); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!( + 
err.downcast::().unwrap().as_str(), + "test [k=v] [b=2] [a=1]" + ); + } +} From 4619f32f07207343692dc641656822c65157c616 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 3 Jan 2023 18:08:19 -0800 Subject: [PATCH 444/676] Introduce priority queue for priority scheduling (#14002) ref tikv/tikv#13730 Introduce priority-based channel Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- Cargo.lock | 59 ++-- components/concurrency_manager/Cargo.toml | 7 +- components/tikv_util/Cargo.toml | 2 + components/tikv_util/src/mpsc/mod.rs | 2 + .../tikv_util/src/mpsc/priority_queue.rs | 289 ++++++++++++++++++ 5 files changed, 314 insertions(+), 45 deletions(-) create mode 100644 components/tikv_util/src/mpsc/priority_queue.rs diff --git a/Cargo.lock b/Cargo.lock index 8433f54c512..7a3c9ced013 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -800,7 +800,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -1041,7 +1041,7 @@ dependencies = [ "fail", "futures 0.3.15", "kvproto", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "rand 0.8.5", "tikv_alloc", "tikv_util", @@ -1204,18 +1204,6 @@ dependencies = [ "crossbeam-utils 0.8.11", ] -[[package]] -name = "crossbeam-epoch" -version = "0.9.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.3", - "lazy_static", - "memoffset", - "scopeguard", -] - [[package]] name = "crossbeam-epoch" version = "0.9.8" @@ -1255,12 +1243,13 @@ dependencies = [ [[package]] name = "crossbeam-skiplist" -version = "0.0.0" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", 
- "crossbeam-epoch 0.9.3", - "crossbeam-utils 0.8.3", + "crossbeam-epoch 0.9.8", + "crossbeam-utils 0.8.8", "scopeguard", ] @@ -1275,16 +1264,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1379,7 +1358,7 @@ checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", ] [[package]] @@ -1867,7 +1846,7 @@ dependencies = [ "maligned", "online_config", "openssl", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "rand 0.8.5", @@ -3650,9 +3629,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core 0.9.1", @@ -3929,7 +3908,7 @@ dependencies = [ "log", "nix 0.24.1", "once_cell", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "protobuf", "protobuf-codegen-pure", "smallvec", @@ -4214,7 +4193,7 @@ dependencies = [ "nix 0.25.0", "num-derive", "num-traits", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "protobuf", @@ -4312,7 +4291,7 @@ dependencies = [ "openssl", "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -4362,7 +4341,7 @@ dependencies = [ "keys", "kvproto", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "protobuf", @@ -6294,7 +6273,7 @@ dependencies = [ 
"online_config", "openssl", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "paste", "pd_client", "pin-project", @@ -6515,6 +6494,7 @@ dependencies = [ "cpu-time", "crc32fast", "crossbeam", + "crossbeam-skiplist", "derive_more", "error_code", "fail", @@ -6536,6 +6516,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", + "parking_lot 0.12.1", "pin-project", "procfs", "procinfo", @@ -6620,7 +6601,7 @@ dependencies = [ "memchr", "mio 0.8.5", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6874,7 +6855,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "kvproto", "lazy_static", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project", "prometheus", "slab", diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index e225cbe0519..b391c1d239a 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -5,6 +5,7 @@ publish = false version = "0.0.1" [dependencies] +crossbeam-skiplist = "0.1" fail = "0.5" kvproto = { workspace = true } parking_lot = "0.12" @@ -12,12 +13,6 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } txn_types = { workspace = true } -# FIXME: switch to the crates.io version after crossbeam-skiplist is released -[dependencies.crossbeam-skiplist] -git = "https://github.com/tikv/crossbeam.git" -branch = "tikv-5.0" -package = "crossbeam-skiplist" - [dev-dependencies] criterion = "0.3" futures = "0.3" diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 663eb2b681f..92f3bac3d5b 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -19,6 +19,7 @@ collections = { workspace = true } cpu-time = "1.0.0" crc32fast = "1.2" crossbeam = "0.8" +crossbeam-skiplist = "0.1" derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" @@ -37,6 +38,7 @@ num-traits = 
"0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" +parking_lot = "0.12.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 45249fed9bc..700691f1189 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -3,7 +3,9 @@ //! This module provides an implementation of mpsc channel based on //! crossbeam_channel. Comparing to the crossbeam_channel, this implementation //! supports closed detection and try operations. + pub mod future; +pub mod priority_queue; use std::{ cell::Cell, diff --git a/components/tikv_util/src/mpsc/priority_queue.rs b/components/tikv_util/src/mpsc/priority_queue.rs new file mode 100644 index 00000000000..3389d6154c3 --- /dev/null +++ b/components/tikv_util/src/mpsc/priority_queue.rs @@ -0,0 +1,289 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{ + atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{RecvError, SendError, TryRecvError, TrySendError}; +use crossbeam_skiplist::SkipMap; +use parking_lot::{Condvar, Mutex}; + +// Create a priority based channel. Sender can send message with priority of +// u64, and receiver will receive messages in ascending order of priority. For +// two messages of same priority, the receiving order follows FIFO. 
+pub fn unbounded() -> (Sender, Receiver) { + let queue = Arc::new(PriorityQueue::new()); + let sender = Sender { + inner: queue.clone(), + }; + let receiver = Receiver { inner: queue }; + (sender, receiver) +} + +struct Cell { + ptr: AtomicPtr, +} + +unsafe impl Send for Cell {} +unsafe impl Sync for Cell {} + +impl Cell { + fn new(value: T) -> Self { + Self { + ptr: AtomicPtr::new(Box::into_raw(Box::new(value))), + } + } + + fn take(&self) -> Option { + let p = self.ptr.swap(std::ptr::null_mut(), Ordering::SeqCst); + if !p.is_null() { + unsafe { Some(*Box::from_raw(p)) } + } else { + None + } + } +} + +impl Drop for Cell { + fn drop(&mut self) { + self.take(); + } +} + +#[derive(Default)] +struct PriorityQueue { + queue: SkipMap>, + disconnected: Mutex, + available: Condvar, + + sequencer: AtomicU64, + + senders: AtomicUsize, + receivers: AtomicUsize, +} + +impl PriorityQueue { + pub fn new() -> Self { + Self { + queue: SkipMap::new(), + disconnected: Mutex::new(false), + available: Condvar::new(), + sequencer: AtomicU64::new(0), + senders: AtomicUsize::new(1), + receivers: AtomicUsize::new(1), + } + } + + pub fn get_map_key(&self, pri: u64) -> MapKey { + MapKey { + priority: pri, + sequence: self.sequencer.fetch_add(1, Ordering::Relaxed), + } + } +} + +// When derived `PartialOrd` on structs, it will produce a lexicographic +// ordering based on the top-to-bottom declaration order of the struct’s +// members. 
+#[derive(Eq, PartialEq, Ord, PartialOrd)] +struct MapKey { + priority: u64, + sequence: u64, +} + +pub struct Sender { + inner: Arc>, +} + +impl Sender { + pub fn try_send(&self, msg: T, pri: u64) -> Result<(), TrySendError> { + self.send(msg, pri) + .map_err(|SendError(msg)| TrySendError::Disconnected(msg)) + } + + pub fn send(&self, msg: T, pri: u64) -> Result<(), SendError> { + if self.inner.receivers.load(Ordering::Acquire) == 0 { + return Err(SendError(msg)); + } + self.inner + .queue + .insert(self.inner.get_map_key(pri), Cell::new(msg)); + self.inner.available.notify_one(); + Ok(()) + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Sender { + fn clone(&self) -> Self { + self.inner.senders.fetch_add(1, Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + let old = self.inner.senders.fetch_sub(1, Ordering::AcqRel); + if old <= 1 { + *self.inner.disconnected.lock() = true; + self.inner.available.notify_all(); + } + } +} + +pub struct Receiver { + inner: Arc>, +} + +impl Receiver { + pub fn try_recv(&self) -> Result { + match self.inner.queue.pop_front() { + Some(entry) => Ok(entry.value().take().unwrap()), + None if self.inner.senders.load(Ordering::SeqCst) == 0 => { + Err(TryRecvError::Disconnected) + } + None => Err(TryRecvError::Empty), + } + } + + pub fn recv(&self) -> Result { + loop { + match self.try_recv() { + Ok(msg) => return Ok(msg), + Err(TryRecvError::Disconnected) => { + return Err(RecvError); + } + Err(TryRecvError::Empty) => { + let mut disconnected = self.inner.disconnected.lock(); + if *disconnected { + return Err(RecvError); + } + self.inner.available.wait(&mut disconnected); + } + } + } + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Receiver { + fn clone(&self) -> Self { + self.inner.receivers.fetch_add(1, Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + 
} +} + +impl Drop for Receiver { + fn drop(&mut self) { + self.inner.receivers.fetch_sub(1, Ordering::AcqRel); + } +} + +#[cfg(test)] +mod tests { + use std::{sync::atomic::AtomicU64, thread, time::Duration}; + + use crossbeam::channel::TrySendError; + use rand::Rng; + + use super::*; + + #[test] + fn test_priority() { + let (tx, rx) = super::unbounded::(); + tx.try_send(1, 2).unwrap(); + tx.send(2, 1).unwrap(); + tx.send(3, 3).unwrap(); + + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(1)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Empty)); + + drop(rx); + assert_eq!(tx.send(2, 1), Err(SendError(2))); + assert_eq!(tx.try_send(2, 1), Err(TrySendError::Disconnected(2))); + + let (tx, rx) = super::unbounded::(); + drop(tx); + assert_eq!(rx.recv(), Err(RecvError)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + + let (tx, rx) = super::unbounded::(); + thread::spawn(move || { + thread::sleep(Duration::from_millis(100)); + tx.send(10, 1).unwrap(); + }); + assert_eq!(rx.recv(), Ok(10)); + + let (tx, rx) = super::unbounded::(); + assert_eq!(tx.len(), 0); + assert_eq!(rx.len(), 0); + tx.send(2, 1).unwrap(); + tx.send(3, 2).unwrap(); + assert_eq!(tx.len(), 2); + assert_eq!(rx.len(), 2); + drop(tx); + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + assert_eq!(rx.recv(), Err(RecvError)); + } + + #[test] + fn test_priority_multi_thread() { + let (tx, rx) = super::unbounded::(); + + let mut handlers = Vec::with_capacity(10); + let expected_count = Arc::new(AtomicU64::new(0)); + let real_counter = Arc::new(AtomicU64::new(0)); + for _ in 0..10 { + let sender = tx.clone(); + let expected_count = expected_count.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + let pri = rng.gen_range(0..1000); + let mut cnt = 0; + for i in 0..1000 { + sender.send(i, pri).unwrap(); + cnt += i; + } + 
expected_count.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + for _i in 0..10 { + let recv = rx.clone(); + let real_counter = real_counter.clone(); + let handle = thread::spawn(move || { + let mut cnt = 0; + while let Ok(v) = recv.recv() { + cnt += v; + } + real_counter.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + drop(tx); + for h in handlers { + h.join().unwrap(); + } + assert_eq!( + expected_count.load(Ordering::Relaxed), + real_counter.load(Ordering::Relaxed) + ); + } +} From cdc2e486277d775b70f5db28a7b643ed2c3edbe1 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 5 Jan 2023 14:38:20 +0800 Subject: [PATCH 445/676] raftstore-v2: only send clean snapshot (#14015) ref tikv/tikv#12842 When the tablet contains dirty data right after split, generating snapshot may just be a waste. On the other hand, split usually happens on all peers, so delaying it a bit actually makes all peers more likely to be initialized by split. So this PR rejects generating snapshot when it detects it still has dirty data. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/raft_engine.rs | 8 ++++ components/engine_rocks/src/raft_engine.rs | 13 ++++++ components/engine_traits/src/raft_engine.rs | 4 ++ components/raft_log_engine/src/engine.rs | 16 +++++++ components/raftstore-v2/src/fsm/peer.rs | 5 ++- .../src/operation/command/admin/split.rs | 45 +++++++++++++++---- .../raftstore-v2/src/operation/ready/mod.rs | 21 ++++++++- .../src/operation/ready/snapshot.rs | 25 ++++++++--- components/raftstore-v2/src/raft/storage.rs | 25 +++++++++++ components/raftstore-v2/src/router/message.rs | 3 ++ .../raftstore-v2/src/worker/tablet_gc.rs | 21 ++++++--- 11 files changed, 163 insertions(+), 23 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 854b75fe30d..c0539c1edd5 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -67,6 +67,10 @@ impl RaftEngineReadOnly for PanicEngine { panic!() } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { panic!() } @@ -232,6 +236,10 @@ impl RaftLogBatch for PanicWriteBatch { panic!() } + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index d566ac3821b..a0a5acd5dd8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -166,6 +166,10 @@ impl RaftEngineReadOnly for RocksEngine { panic!() } + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) } @@ -439,6 +443,15 @@ impl RaftLogBatch for 
RocksWriteBatchVec { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 68036eae1eb..671fed8b3cf 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -33,6 +33,7 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { ) -> Result>; /// Get the flushed index of the given CF. fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result; fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -201,6 +202,9 @@ pub trait RaftLogBatch: Send { apply_index: u64, ) -> Result<()>; + /// Mark a tablet may contain data that is not supposed to be in its range. + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()>; + /// Indicate whether region states should be recovered from raftdb and /// replay raft logs. /// When kvdb's write-ahead-log is disabled, the sequence number of the last diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 1ae148ba41c..3db865ed8ad 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -381,6 +381,7 @@ const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; const FLUSH_STATE_KEY: &[u8] = &[0x06]; +const DIRTY_MARK_KEY: &[u8] = &[0x07]; // All keys are of the same length. 
const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); @@ -475,6 +476,16 @@ impl RaftLogBatchTrait for RaftLogBatch { Ok(()) } + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + if dirty { + self.0.put(raft_group_id, key.to_vec(), vec![]); + } else { + self.0.delete(raft_group_id, key.to_vec()); + } + Ok(()) + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.0 .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) @@ -601,6 +612,11 @@ impl RaftEngineReadOnly for RaftLogEngine { Ok(index) } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + Ok(self.0.get(raft_group_id, &key).is_some()) + } + fn get_recover_state(&self) -> Result> { self.0 .get_message(STORE_STATE_ID, RECOVER_STATE_KEY) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 8b05435246b..c05b58d0839 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -191,7 +191,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_start(&mut self) { - if !self.fsm.peer.maybe_pause_for_recovery() { + if !self.fsm.peer.maybe_pause_for_recovery(self.store_ctx) { self.schedule_tick(PeerTick::Raft); } self.schedule_tick(PeerTick::SplitRegionCheck); @@ -308,6 +308,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .on_request_split(self.store_ctx, request, ch) } PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), + PeerMsg::TabletTrimmed { tablet_index } => { + self.fsm.peer_mut().on_tablet_trimmed(tablet_index) + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs 
b/components/raftstore-v2/src/operation/command/admin/split.rs index 23fc6e3a8d9..71c1e095d8c 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -471,12 +471,18 @@ impl Peer { self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } + self.storage_mut().set_has_dirty_data(true); + let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + let tablet_index = res.tablet_index; let _ = store_ctx .schedulers .tablet_gc .schedule(tablet_gc::Task::trim( self.tablet().unwrap().clone(), derived, + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, )); let last_region_id = res.regions.last().unwrap().get_id(); @@ -521,6 +527,9 @@ impl Peer { self.state_changes_mut() .put_region_state(region_id, res.tablet_index, ®ion_state) .unwrap(); + self.state_changes_mut() + .put_dirty_mark(region_id, res.tablet_index, true) + .unwrap(); self.set_has_extra_write(); } @@ -574,13 +583,21 @@ impl Peer { store_ctx: &mut StoreContext, split_init: Box, ) { - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - )); + let region_id = self.region_id(); + if self.storage().has_dirty_data() { + let tablet_index = self.storage().tablet_index(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } if split_init.source_leader && self.leader_id() == INVALID_ID && self.term() == RAFT_INIT_LOG_TERM @@ -593,7 +610,6 @@ impl Peer { // reduce client query miss. 
self.region_heartbeat_pd(store_ctx); } - let region_id = self.region_id(); if split_init.check_split { self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -633,6 +649,19 @@ impl Peer { self.set_has_extra_write(); } } + + pub fn on_tablet_trimmed(&mut self, tablet_index: u64) { + info!(self.logger, "tablet is trimmed"; "tablet_index" => tablet_index); + let region_id = self.region_id(); + let changes = self.state_changes_mut(); + changes + .put_dirty_mark(region_id, tablet_index, false) + .unwrap(); + self.set_has_extra_write(); + if self.storage().tablet_index() == tablet_index { + self.storage_mut().set_has_dirty_data(false); + } + } } #[cfg(test)] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 29452533632..3ac500b7f49 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -50,6 +50,7 @@ use crate::{ fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, router::{ApplyTask, PeerMsg, PeerTick}, + worker::tablet_gc, }; const PAUSE_FOR_RECOVERY_GAP: u64 = 128; @@ -80,7 +81,25 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } impl Peer { - pub fn maybe_pause_for_recovery(&mut self) -> bool { + pub fn maybe_pause_for_recovery(&mut self, store_ctx: &mut StoreContext) -> bool { + // The task needs to be scheduled even if the tablet may be replaced during + // recovery. Otherwise if there are merges during recovery, the FSM may + // be paused forever. 
+ if self.storage().has_dirty_data() { + let region_id = self.region_id(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let tablet_index = self.storage().tablet_index(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } let entry_storage = self.storage().entry_storage(); let committed_index = entry_storage.commit_index(); let applied_index = entry_storage.applied_index(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 8716f0c75ea..1919ce269a6 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -345,12 +345,23 @@ impl Storage { }; } - info!( - self.logger(), - "requesting snapshot"; - "request_index" => request_index, - "request_peer" => to, - ); + if self.has_dirty_data() { + info!(self.logger(), "delay generating snapshot as there are still dirty data"; "request_index" => request_index, "request_peer" => to); + // It's OK to delay. If there are still dirty data, it means the tablet is just + // split. In normal cases, all peers will apply split, so rejecting to generate + // snapshot may actually be good for all peers as they are more likely + // to be initialized by split. + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } else { + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + } let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); let mut gen_snap_task = self.gen_snap_task_mut(); @@ -586,6 +597,8 @@ impl Storage { let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. 
Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + lb.put_dirty_mark(region_id, last_index, true).unwrap(); + self.set_has_dirty_data(true); (temp_split_path(®, region_id), false) } si => ( diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b0eec5a196c..aca8f0fafce 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -35,6 +35,9 @@ pub struct Storage { /// by messages, it has not persisted any states, we need to persist them /// at least once dispite whether the state changes since create. ever_persisted: bool, + /// It may have dirty data after split. Use a flag to indicate whether it + /// has finished clean up. + has_dirty_data: bool, logger: Logger, /// Snapshot part. @@ -116,6 +119,16 @@ impl Storage { pub fn apply_trace(&self) -> &ApplyTrace { &self.apply_trace } + + #[inline] + pub fn set_has_dirty_data(&mut self, has_dirty_data: bool) { + self.has_dirty_data = has_dirty_data; + } + + #[inline] + pub fn has_dirty_data(&self) -> bool { + self.has_dirty_data + } } impl Storage { @@ -139,6 +152,17 @@ impl Storage { }; let region = region_state.get_region(); let logger = logger.new(o!("region_id" => region.id, "peer_id" => peer.get_id())); + let has_dirty_data = + match engine.get_dirty_mark(region.get_id(), region_state.get_tablet_index()) { + Ok(b) => b, + Err(e) => { + return Err(box_err!( + "failed to get dirty mark for {}: {:?}", + region.get_id(), + e + )); + } + }; let entry_storage = EntryStorage::new( peer.get_id(), engine, @@ -153,6 +177,7 @@ impl Storage { peer: peer.clone(), region_state, ever_persisted: persisted, + has_dirty_data, logger, snap_states: RefCell::new(HashMap::default()), gen_snap_task: RefCell::new(Box::new(None)), diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 930de5ff036..353e17b0cb0 100644 --- 
a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -182,6 +182,9 @@ pub enum PeerMsg { ch: CmdResChannel, }, ForceCompactLog, + TabletTrimmed { + tablet_index: u64, + }, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index aba477f883f..d4593223db3 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -17,6 +17,7 @@ pub enum Task { tablet: EK, start_key: Box<[u8]>, end_key: Box<[u8]>, + cb: Box, }, PrepareDestroy { tablet: EK, @@ -31,11 +32,9 @@ pub enum Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match *self { + match self { Task::Trim { - ref start_key, - ref end_key, - .. + start_key, end_key, .. } => write!( f, "trim tablet for start_key {}, end_key {}", @@ -65,11 +64,12 @@ impl Display for Task { impl Task { #[inline] - pub fn trim(tablet: EK, region: &Region) -> Self { + pub fn trim(tablet: EK, region: &Region, cb: impl FnOnce() + Send + 'static) -> Self { Task::Trim { tablet, start_key: region.get_start_key().into(), end_key: region.get_end_key().into(), + cb: Box::new(cb), } } @@ -110,7 +110,12 @@ impl Runner { } } - fn trim(tablet: &EK, start_key: &[u8], end_key: &[u8]) -> engine_traits::Result<()> { + fn trim( + tablet: &EK, + start_key: &[u8], + end_key: &[u8], + cb: Box, + ) -> engine_traits::Result<()> { let start_key = keys::data_key(start_key); let end_key = keys::data_end_key(end_key); let range1 = Range::new(&[], &start_key); @@ -121,6 +126,7 @@ impl Runner { for r in [range1, range2] { tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; } + cb(); Ok(()) } @@ -195,8 +201,9 @@ where tablet, start_key, end_key, + cb, } => { - if let Err(e) = Self::trim(&tablet, &start_key, &end_key) { + if let 
Err(e) = Self::trim(&tablet, &start_key, &end_key, cb) { error!( self.logger, "failed to trim tablet"; From df3ee59d3d134e2ef5d8e5ec90d36d218b86e4a4 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 5 Jan 2023 17:24:20 +0800 Subject: [PATCH 446/676] raftstore-v2: update region size after split check (#14019) ref tikv/tikv#12842 Signed-off-by: Jay Lee --- components/raftstore-v2/src/fsm/peer.rs | 7 +++ .../src/operation/command/admin/split.rs | 52 ++++++++++++++++++- components/raftstore-v2/src/operation/pd.rs | 6 +-- components/raftstore-v2/src/router/imp.rs | 8 +-- components/raftstore-v2/src/router/message.rs | 7 +++ 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index c05b58d0839..fee1a00993b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -307,6 +307,13 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::UpdateRegionSize { size } => { + self.fsm.peer_mut().on_update_region_size(size) + } + PeerMsg::UpdateRegionKeys { keys } => { + self.fsm.peer_mut().on_update_region_keys(keys) + } + PeerMsg::ClearRegionSize => self.fsm.peer_mut().on_clear_region_size(), PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), PeerMsg::TabletTrimmed { tablet_index } => { self.fsm.peer_mut().on_tablet_trimmed(tablet_index) diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 71c1e095d8c..f63f1f2ae17 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -74,7 +74,7 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, - // Hack: in common case we should use generic, but split is an 
unfrequent + // Hack: in common case we should use generic, but split is an infrequent // event that performance is not critical. And using `Any` can avoid polluting // all existing code. tablet: Box, @@ -91,6 +91,8 @@ pub struct SplitInit { /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, + approximate_size: Option, + approximate_keys: Option, } impl SplitInit { @@ -123,6 +125,20 @@ pub struct SplitFlowControl { size_diff_hint: i64, skip_split_count: u64, may_skip_split_check: bool, + approximate_size: Option, + approximate_keys: Option, +} + +impl SplitFlowControl { + #[inline] + pub fn approximate_size(&self) -> Option { + self.approximate_size + } + + #[inline] + pub fn approximate_keys(&self) -> Option { + self.approximate_keys + } } pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { @@ -173,6 +189,25 @@ impl Peer { false } + pub fn on_update_region_size(&mut self, size: u64) { + self.split_flow_control_mut().approximate_size = Some(size); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_update_region_keys(&mut self, keys: u64) { + self.split_flow_control_mut().approximate_keys = Some(keys); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_clear_region_size(&mut self) { + let control = self.split_flow_control_mut(); + control.approximate_size.take(); + control.approximate_keys.take(); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { let control = self.split_flow_control_mut(); control.size_diff_hint += metrics.size_diff_hint; @@ -454,6 +489,11 @@ impl Peer { self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); } + let new_region_count = res.regions.len() as u64; + let control = self.split_flow_control_mut(); + let estimated_size = 
control.approximate_size.map(|v| v / new_region_count); + let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + self.post_split(); if self.is_leader() { @@ -468,7 +508,10 @@ impl Peer { // so we send it independently here. self.report_batch_split_pd(store_ctx, res.regions.to_vec()); // After split, the peer may need to update its metrics. - self.split_flow_control_mut().may_skip_split_check = false; + let control = self.split_flow_control_mut(); + control.may_skip_split_check = false; + control.approximate_size = estimated_size; + control.approximate_keys = estimated_keys; self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); @@ -500,6 +543,8 @@ impl Peer { source_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, + approximate_size: estimated_size, + approximate_keys: estimated_keys, locks, })); @@ -606,6 +651,9 @@ impl Peer { self.set_has_ready(); *self.txn_ext().pessimistic_locks.write() = split_init.locks; + let control = self.split_flow_control_mut(); + control.approximate_size = split_init.approximate_size; + control.approximate_keys = split_init.approximate_keys; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
self.region_heartbeat_pd(store_ctx); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 50b612f207d..d80258f14b1 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -77,7 +77,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { #[inline] - pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { + pub fn region_heartbeat_pd(&mut self, ctx: &StoreContext) { let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), @@ -86,8 +86,8 @@ impl Peer { pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, written_keys: self.self_stat().written_keys, - approximate_size: None, - approximate_keys: None, + approximate_size: self.split_flow_control_mut().approximate_size(), + approximate_keys: self.split_flow_control_mut().approximate_keys(), wait_data_peers: Vec::new(), }); if let Err(e) = ctx.schedulers.pd.schedule(task) { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 7a10c6c6b16..315f8a0d8eb 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -33,12 +33,12 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, _region_id: u64, _size: u64) { - // TODO + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); } - fn update_approximate_keys(&self, _region_id: u64, _keys: u64) { - // TODO + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); } fn ask_split( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 
353e17b0cb0..c1e5f0d37dc 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -181,6 +181,13 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + UpdateRegionSize { + size: u64, + }, + UpdateRegionKeys { + keys: u64, + }, + ClearRegionSize, ForceCompactLog, TabletTrimmed { tablet_index: u64, From cc9e69b925020e58b786bb811f1bcdba05a7c09f Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 6 Jan 2023 14:20:21 +0800 Subject: [PATCH 447/676] raftstore-v2: store heartbeat add kv size and snap size (#14016) ref tikv/tikv#12842 1. store heartbeat should add snapshot and kv engine used size Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Xinye Tao --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/operation/pd.rs | 4 +--- components/raftstore-v2/src/worker/pd/mod.rs | 6 +++++- .../raftstore-v2/src/worker/pd/store_heartbeat.rs | 11 ++++++++--- .../tests/integrations/test_pd_heartbeat.rs | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index e25ad53df8b..621f826619b 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -551,6 +551,7 @@ impl StoreSystem { pd_client, raft_engine.clone(), tablet_registry.clone(), + snap_mgr.clone(), router.clone(), workers.pd.remote(), concurrency_manager, diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index d80258f14b1..26945a3e176 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -50,9 +50,7 @@ impl Store { stats.set_bytes_written(0); stats.set_keys_written(0); stats.set_is_busy(false); - - // stats.set_query_stats(query_stats); - + // TODO: add query stats let task = pd::Task::StoreHeartbeat { stats }; if let Err(e) = 
ctx.schedulers.pd.schedule(task) { error!(self.logger(), "notify pd failed"; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index bfcf3389754..b54d088db66 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -12,7 +12,8 @@ use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::{ - util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TxnExt, WriteStats, + util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TabletSnapManager, TxnExt, + WriteStats, }; use slog::{error, info, Logger}; use tikv_util::{ @@ -105,6 +106,7 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, remote: Remote, @@ -139,6 +141,7 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, remote: Remote, concurrency_manager: ConcurrencyManager, @@ -152,6 +155,7 @@ where pd_client, raft_engine, tablet_registry, + snap_mgr, router, remote, region_peers: HashMap::default(), diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 22bee3cbf26..ba75354c753 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -277,9 +277,14 @@ where } else { std::cmp::min(disk_cap, self.cfg.value().capacity.0) }; - // TODO: accurate snapshot size and kv engines size. 
- let snap_size = 0; - let kv_size = 0; + let mut kv_size = 0; + self.tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + let snap_size = self.snap_mgr.total_snap_size().unwrap(); let used_size = snap_size + kv_size + self diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 96bcbbccf7a..09ead81c0c2 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -52,6 +52,7 @@ fn test_store_heartbeat() { let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); if stats.get_start_time() > 0 { assert_ne!(stats.get_capacity(), 0); + assert_ne!(stats.get_used_size(), 0); return; } std::thread::sleep(std::time::Duration::from_millis(50)); From c71fdfc49414005c4630e357e1ab6418ddf104f7 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 6 Jan 2023 17:52:22 +0800 Subject: [PATCH 448/676] log-backup: limit inflight raft msg from pitr (#13976) close tikv/tikv#13977 Signed-off-by: tabokie --- .../src/worker/pd/update_max_timestamp.rs | 3 - components/sst_importer/src/sst_importer.rs | 2 +- src/import/mod.rs | 2 +- src/import/sst_service.rs | 626 ++++++++---------- 4 files changed, 295 insertions(+), 338 deletions(-) diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs index 0de3fb9a87c..178d00ebd15 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -93,13 +93,10 @@ where } }; - #[cfg(feature = "failpoints")] let delay = (|| { fail::fail_point!("delay_update_max_ts", |_| true); false })(); - #[cfg(not(feature = "failpoints"))] - let delay = false; if delay { 
info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 3e06eb76899..8b6d64f483f 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -763,7 +763,7 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc>, - build_fn: &mut dyn FnMut(Vec, Vec), + mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut smallest_key = None; diff --git a/src/import/mod.rs b/src/import/mod.rs index d3a522ede5e..e2fa3729e52 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -29,7 +29,7 @@ pub fn make_rpc_error(err: E) -> RpcStatus { #[macro_export] macro_rules! send_rpc_response { - ($res:ident, $sink:ident, $label:ident, $timer:ident) => {{ + ($res:expr, $sink:ident, $label:ident, $timer:ident) => {{ let res = match $res { Ok(resp) => { IMPORT_RPC_DURATION diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 8ce6f9961fb..ea52cad0095 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1,7 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::HashMap, + collections::{HashMap, VecDeque}, future::Future, path::PathBuf, sync::{Arc, Mutex}, @@ -11,7 +11,7 @@ use std::{ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures::{sink::SinkExt, stream::TryStreamExt, TryFutureExt}; use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, @@ -19,9 +19,12 @@ use grpcio::{ use kvproto::{ encryptionpb::EncryptionMethod, errorpb, - import_sstpb::{RawWriteRequest_oneof_chunk as RawChunk, WriteRequest_oneof_chunk as Chunk, *}, + import_sstpb::{ + Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, + SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + }, kvrpcpb::Context, - raft_cmdpb::*, + raft_cmdpb::{CmdType, DeleteRequest, PutRequest, RaftCmdRequest, RaftRequestHeader, Request}, }; use protobuf::Message; use raftstore::{ @@ -44,6 +47,8 @@ use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; use crate::{import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE}; +const MAX_INFLIGHT_RAFT_MSGS: usize = 64; + /// ImportSstService provides tikv-server with the ability to ingest SST files. /// /// It saves the SST sent from client to a file and then sends a command to @@ -74,6 +79,161 @@ pub struct SnapshotResult { term: u64, } +struct RequestCollector { + context: Context, + max_raft_req_size: usize, + /// Retain the last ts of each key in each request. + /// This is used for write CF because resolved ts observer hates duplicated + /// key in the same request. + write_reqs: HashMap, (Request, u64)>, + /// Collector favor that simple collect all items, and it do not contains + /// duplicated key-value. This is used for default CF. 
+ default_reqs: HashMap, Request>, + /// Size of all `Request`s. + unpacked_size: usize, + + pending_raft_reqs: Vec, +} + +impl RequestCollector { + fn new(context: Context, max_raft_req_size: usize) -> Self { + Self { + context, + max_raft_req_size, + write_reqs: HashMap::default(), + default_reqs: HashMap::default(), + unpacked_size: 0, + pending_raft_reqs: Vec::new(), + } + } + + fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec, v: Vec) { + // Need to skip the empty key/value that could break the transaction or cause + // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. + if k.is_empty() || (!is_delete && v.is_empty()) { + return; + } + let mut req = Request::default(); + if is_delete { + let mut del = DeleteRequest::default(); + del.set_key(k); + del.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Delete); + req.set_delete(del); + } else { + if cf == CF_WRITE && !write_needs_restore(&v) { + return; + } + + let mut put = PutRequest::default(); + put.set_key(k); + put.set_value(v); + put.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + } + self.accept(cf, req); + } + + // we need to remove duplicate keys in here, since + // in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 + // will panic if found duplicated entry during Vec. 
+ fn accept(&mut self, cf: &str, req: Request) { + let k = key_from_request(&req); + match cf { + CF_WRITE => { + let (encoded_key, ts) = match Key::split_on_ts_for(k) { + Ok(k) => k, + Err(err) => { + warn!( + "key without ts, skipping"; + "key" => %log_wrappers::Value::key(k), + "err" => %err + ); + return; + } + }; + if self + .write_reqs + .get(encoded_key) + .map(|(_, old_ts)| *old_ts < ts.into_inner()) + .unwrap_or(true) + { + self.unpacked_size += req.compute_size() as usize; + if let Some((v, _)) = self + .write_reqs + .insert(encoded_key.to_owned(), (req, ts.into_inner())) + { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + } + CF_DEFAULT => { + self.unpacked_size += req.compute_size() as usize; + if let Some(v) = self.default_reqs.insert(k.to_owned(), req) { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + _ => unreachable!(), + } + + if self.unpacked_size >= self.max_raft_req_size { + self.pack_all(); + } + } + + #[cfg(test)] + fn drain_unpacked_reqs(&mut self, cf: &str) -> Vec { + let res: Vec = if cf == CF_DEFAULT { + self.default_reqs.drain().map(|(_, req)| req).collect() + } else { + self.write_reqs.drain().map(|(_, (req, _))| req).collect() + }; + for r in &res { + self.unpacked_size -= r.get_cached_size() as usize; + } + res + } + + #[inline] + fn drain_raft_reqs(&mut self, take_unpacked: bool) -> std::vec::Drain<'_, RaftCmdRequest> { + if take_unpacked { + self.pack_all(); + } + self.pending_raft_reqs.drain(..) + } + + fn pack_all(&mut self) { + if self.unpacked_size == 0 { + return; + } + let mut cmd = RaftCmdRequest::default(); + let mut header = make_request_header(self.context.clone()); + // Set the UUID of header to prevent raftstore batching our requests. + // The current `resolved_ts` observer assumes that each batch of request doesn't + // has two writes to the same key. (Even with 2 different TS). That was true + // for normal cases because the latches reject concurrency write to keys. 
+ // However we have bypassed the latch layer :( + header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); + cmd.set_header(header); + let mut reqs: Vec<_> = self.write_reqs.drain().map(|(_, (req, _))| req).collect(); + reqs.append(&mut self.default_reqs.drain().map(|(_, req)| req).collect()); + if reqs.is_empty() { + debug_assert!(false, "attempt to pack an empty request"); + return; + } + cmd.set_requests(reqs.into()); + + self.pending_raft_reqs.push(cmd); + self.unpacked_size = 0; + } + + #[inline] + fn is_empty(&self) -> bool { + self.pending_raft_reqs.is_empty() && self.unpacked_size == 0 + } +} + impl ImportSstService where E: KvEngine, @@ -281,6 +441,101 @@ where Ok(resp) } } + + async fn apply_imp( + mut req: ApplyRequest, + importer: Arc, + router: Router, + limiter: Limiter, + max_raft_size: usize, + ) -> std::result::Result, ImportPbError> { + type RaftWriteFuture = futures::channel::oneshot::Receiver; + async fn handle_raft_write(fut: RaftWriteFuture) -> std::result::Result<(), ImportPbError> { + match fut.await { + Err(e) => { + let msg = format!("failed to complete raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + Ok(mut r) if r.response.get_header().has_error() => { + let mut e = ImportPbError::default(); + e.set_message("failed to complete raft command".to_string()); + e.set_store_error(r.response.take_header().take_error()); + return Err(e); + } + _ => {} + } + Ok(()) + } + + let mut range: Option = None; + + let mut collector = RequestCollector::new(req.take_context(), max_raft_size * 7 / 8); + let mut metas = req.take_metas(); + let mut rules = req.take_rewrite_rules(); + // For compatibility with old requests. 
+ if req.has_meta() { + metas.push(req.take_meta()); + rules.push(req.take_rewrite_rule()); + } + let ext_storage = importer.wrap_kms( + importer + .external_storage_or_cache(req.get_storage_backend(), req.get_storage_cache_id())?, + false, + ); + + let mut inflight_futures: VecDeque = VecDeque::new(); + + let mut tasks = metas.iter().zip(rules.iter()).peekable(); + while let Some((meta, rule)) = tasks.next() { + let buff = importer.read_from_kv_file( + meta, + rule, + ext_storage.clone(), + req.get_storage_backend(), + &limiter, + )?; + if let Some(mut r) = importer.do_apply_kv_file( + meta.get_start_key(), + meta.get_end_key(), + meta.get_start_ts(), + meta.get_restore_ts(), + buff, + |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), + )? { + if let Some(range) = range.as_mut() { + range.start = range.take_start().min(r.take_start()); + range.end = range.take_end().max(r.take_end()); + } else { + range = Some(r); + } + } + + let is_last_task = tasks.peek().is_none(); + for req in collector.drain_raft_reqs(is_last_task) { + while inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { + handle_raft_write(inflight_futures.pop_front().unwrap()).await?; + } + let (cb, future) = paired_future_callback(); + match router.send_command(req, Callback::write(cb), RaftCmdExtraOpts::default()) { + Ok(_) => inflight_futures.push_back(future), + Err(e) => { + let msg = format!("failed to send raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + } + } + } + assert!(collector.is_empty()); + for fut in inflight_futures { + handle_raft_write(fut).await?; + } + + Ok(range) + } } #[macro_export] @@ -375,8 +630,7 @@ where } let task = async move { - let res = Ok(SwitchModeResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!(Ok(SwitchModeResponse::default()), sink, label, timer); }; ctx.spawn(task); } @@ -448,7 +702,7 @@ where 
.observe(start.saturating_elapsed().as_secs_f64()); if let Err(e) = importer.remove_dir(req.get_prefix()) { - let mut import_err = kvproto::import_sstpb::Error::default(); + let mut import_err = ImportPbError::default(); import_err.set_message(format!("failed to remove directory: {}", e)); resp.set_error(import_err); } @@ -456,176 +710,37 @@ where .with_label_values(&[label]) .observe(start.saturating_elapsed().as_secs_f64()); - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); } // Downloads KV file and performs key-rewrite then apply kv into this tikv // store. - fn apply( - &mut self, - _ctx: RpcContext<'_>, - mut req: ApplyRequest, - sink: UnarySink, - ) { + fn apply(&mut self, _ctx: RpcContext<'_>, req: ApplyRequest, sink: UnarySink) { let label = "apply"; - let timer = Instant::now_coarse(); - let importer = Arc::clone(&self.importer); + let start = Instant::now(); + let importer = self.importer.clone(); let router = self.router.clone(); let limiter = self.limiter.clone(); - let start = Instant::now(); - let raft_size = self.raft_entry_max_size; + let max_raft_size = self.raft_entry_max_size.0 as usize; let handle_task = async move { // Records how long the apply task waits to be scheduled. sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); - let mut start_apply = Instant::now(); - let mut futs = vec![]; - let mut apply_resp = ApplyResponse::default(); - let context = req.take_context(); - let mut rules = req.take_rewrite_rules(); - let mut metas = req.take_metas(); - // For compatibility with old requests. 
- if req.has_meta() { - metas.push(req.take_meta()); - rules.push(req.take_rewrite_rule()); - } - let result = (|| -> Result<()> { - let mut cmd_reqs = vec![]; - let mut reqs_default = RequestCollector::from_cf(CF_DEFAULT); - let mut reqs_write = RequestCollector::from_cf(CF_WRITE); - let mut req_default_size = 0_u64; - let mut req_write_size = 0_u64; - let mut range: Option = None; - let ext_storage = { - let inner = importer.wrap_kms( - importer.external_storage_or_cache( - req.get_storage_backend(), - req.get_storage_cache_id(), - )?, - false, - ); - inner - }; - - for (i, meta) in metas.iter().enumerate() { - let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { - (&mut reqs_default, &mut req_default_size) - } else { - (&mut reqs_write, &mut req_write_size) - }; - - let mut build_req_fn = build_apply_request( - req_size, - raft_size.0, - reqs, - cmd_reqs.as_mut(), - meta.get_is_delete(), - meta.get_cf(), - context.clone(), - ); - - let buff = importer.read_from_kv_file( - meta, - &rules[i], - Arc::clone(&ext_storage), - req.get_storage_backend(), - &limiter, - )?; - let r: Option = importer.do_apply_kv_file( - meta.get_start_key(), - meta.get_end_key(), - meta.get_start_ts(), - meta.get_restore_ts(), - buff, - &mut build_req_fn, - )?; - - if let Some(mut r) = r { - range = match range { - Some(mut v) => { - let s = v.take_start().min(r.take_start()); - let e = v.take_end().max(r.take_end()); - Some(Range { - start: s, - end: e, - ..Default::default() - }) - } - None => Some(r), - }; - } - } + let mut resp = ApplyResponse::default(); - if !reqs_default.is_empty() { - let cmd = make_request(&mut reqs_default, context.clone()); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_default_size as _); - } - if !reqs_write.is_empty() { - let cmd = make_request(&mut reqs_write, context); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_write_size as _); - } - - start_apply = Instant::now(); - for cmd in cmd_reqs { - let (cb, future) = 
paired_future_callback(); - match router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) - { - Ok(_) => futs.push(future), - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to send raft command: {}", e)); - apply_resp.set_error(import_err); - } - } - } - if let Some(r) = range { - apply_resp.set_range(r); - } - Ok(()) - })(); - if let Err(e) = result { - apply_resp.set_error(e.into()); + match Self::apply_imp(req, importer, router, limiter, max_raft_size).await { + Ok(Some(r)) => resp.set_range(r), + Err(e) => resp.set_error(e), + _ => {} } - let resp = Ok(join_all(futs).await.iter().fold(apply_resp, |mut resp, x| { - match x { - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to complete raft command: {}", e)); - resp.set_error(import_err); - } - Ok(r) => { - if r.response.get_header().has_error() { - let mut import_err = kvproto::import_sstpb::Error::default(); - let err = r.response.get_header().get_error(); - import_err.set_message("failed to complete raft command".to_string()); - // FIXME: if there are many errors, we may lose some of them here. - import_err.set_store_error(err.clone()); - warn!("failed to apply the file to the store"; "error" => ?err); - resp.set_error(import_err); - } - } - } - resp - })); - - // Records how long the apply task waits to be scheduled. 
- sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["apply"]) - .observe(start_apply.saturating_elapsed().as_secs_f64()); - sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["finish"]) - .observe(start.saturating_elapsed().as_secs_f64()); debug!("finished apply kv file with {:?}", resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, start); }; self.block_threads.spawn_ok(handle_task); } @@ -678,8 +793,7 @@ where }, Err(e) => resp.set_error(e.into()), } - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); @@ -848,8 +962,12 @@ where }); let ctx_task = async move { - let res = Ok(SetDownloadSpeedLimitResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!( + Ok(SetDownloadSpeedLimitResponse::default()), + sink, + label, + timer + ); }; ctx.spawn(ctx_task); @@ -958,70 +1076,6 @@ fn pb_error_inc(type_: &str, e: &errorpb::Error) { IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); } -enum RequestCollector { - /// Retain the last ts of each key in each request. - /// This is used for write CF because resolved ts observer hates duplicated - /// key in the same request. - RetainLastTs(HashMap, (Request, u64)>), - /// Collector favor that simple collect all items, and it do not contains - /// duplicated key-value. This is used for default CF. 
- KeepAll(HashMap, Request>), -} - -impl RequestCollector { - fn from_cf(cf: &str) -> Self { - match cf { - CF_DEFAULT | "" => Self::KeepAll(Default::default()), - CF_WRITE => Self::RetainLastTs(Default::default()), - _ => { - warn!("unknown cf name, using default request collector"; "cf" => %cf); - Self::RetainLastTs(Default::default()) - } - } - } - - fn accept(&mut self, req: Request) { - let k = key_from_request(&req); - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - let (encoded_key, ts) = match Key::split_on_ts_for(k) { - Ok(k) => k, - Err(err) => { - warn!("key without ts, skipping"; "key" => %log_wrappers::Value::key(k), "err" => %err); - return; - } - }; - if reqs - .get(encoded_key) - .map(|(_, old_ts)| *old_ts < ts.into_inner()) - .unwrap_or(true) - { - reqs.insert(encoded_key.to_owned(), (req, ts.into_inner())); - } - } - RequestCollector::KeepAll(ref mut reqs) => { - reqs.insert(k.to_owned(), req); - } - } - } - - fn drain(&mut self) -> Vec { - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - reqs.drain().map(|(_, (req, _))| req).collect() - } - RequestCollector::KeepAll(ref mut reqs) => reqs.drain().map(|(_, req)| req).collect(), - } - } - - fn is_empty(&self) -> bool { - match self { - RequestCollector::RetainLastTs(reqs) => reqs.is_empty(), - RequestCollector::KeepAll(reqs) => reqs.is_empty(), - } - } -} - fn key_from_request(req: &Request) -> &[u8] { if req.has_put() { return req.get_put().get_key(); @@ -1029,8 +1083,7 @@ fn key_from_request(req: &Request) -> &[u8] { if req.has_delete() { return req.get_delete().get_key(); } - warn!("trying to extract key from request is neither put nor delete."); - b"" + panic!("trying to extract key from request is neither put nor delete.") } fn make_request_header(mut context: Context) -> RaftRequestHeader { @@ -1042,77 +1095,6 @@ fn make_request_header(mut context: Context) -> RaftRequestHeader { header } -fn make_request(reqs: &mut RequestCollector, context: Context) -> 
RaftCmdRequest { - let mut cmd = RaftCmdRequest::default(); - let mut header = make_request_header(context); - // Set the UUID of header to prevent raftstore batching our requests. - // The current `resolved_ts` observer assumes that each batch of request doesn't - // has two writes to the same key. (Even with 2 different TS). That was true - // for normal cases because the latches reject concurrency write to keys. - // However we have bypassed the latch layer :( - header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); - cmd.set_header(header); - cmd.set_requests(reqs.drain().into()); - cmd -} - -// we need to remove duplicate keys in here, since -// in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 -// will panic if found duplicated entry during Vec. -fn build_apply_request<'a, 'b>( - req_size: &'a mut u64, - raft_size: u64, - reqs: &'a mut RequestCollector, - cmd_reqs: &'a mut Vec, - is_delete: bool, - cf: &'b str, - context: Context, -) -> Box, Vec) + 'b> -where - 'a: 'b, -{ - // use callback to collect kv data. - Box::new(move |k: Vec, v: Vec| { - // Need to skip the empty key/value that could break the transaction or cause - // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. - if k.is_empty() || (!is_delete && v.is_empty()) { - return; - } - - let mut req = Request::default(); - if is_delete { - let mut del = DeleteRequest::default(); - del.set_key(k); - del.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Delete); - req.set_delete(del); - } else { - if cf == CF_WRITE && !write_needs_restore(&v) { - return; - } - - let mut put = PutRequest::default(); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Put); - req.set_put(put); - } - - // When the request size get grow to max request size, - // build the request and add it to a batch. 
- if *req_size + req.compute_size() as u64 > raft_size * 7 / 8 { - IMPORTER_APPLY_BYTES.observe(*req_size as _); - *req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - - *req_size += req.compute_size() as u64; - reqs.accept(req); - }) -} - fn write_needs_restore(write: &[u8]) -> bool { let w = WriteRef::parse(write); match w { @@ -1146,9 +1128,7 @@ mod test { use kvproto::{kvrpcpb::Context, raft_cmdpb::*}; use txn_types::{Key, TimeStamp, Write, WriteType}; - use crate::import::sst_service::{ - build_apply_request, key_from_request, make_request, RequestCollector, - }; + use crate::import::sst_service::{key_from_request, RequestCollector}; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1213,30 +1193,14 @@ mod test { } fn run_case(c: &Case) { - let mut cmds = vec![]; - let mut reqs = RequestCollector::from_cf(c.cf); - let mut req_size = 0_u64; - - let mut builder = build_apply_request( - &mut req_size, - 1024, - &mut reqs, - &mut cmds, - c.is_delete, - c.cf, - Context::new(), - ); + let mut collector = RequestCollector::new(Context::new(), 1024); for (k, v) in c.mutations.clone() { - builder(k, v); - } - drop(builder); - if !reqs.is_empty() { - let cmd = make_request(&mut reqs, Context::new()); - cmds.push(cmd); + collector.accept_kv(c.cf, c.is_delete, k, v); } + let reqs = collector.drain_raft_reqs(true); - let mut req1: HashMap<_, _> = cmds + let mut req1: HashMap<_, _> = reqs .into_iter() .flat_map(|mut x| x.take_requests().into_iter()) .map(|req| { @@ -1318,8 +1282,7 @@ mod test { #[test] fn test_request_collector_with_write_cf() { - let mut request_collector = RequestCollector::from_cf(CF_WRITE); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ write_req(b"foo", WriteType::Put, 40, 39), write_req(b"aar", WriteType::Put, 38, 
37), @@ -1333,23 +1296,21 @@ mod test { ]; for req in reqs { - request_collector.accept(req); + request_collector.accept(CF_WRITE, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_WRITE); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let k2 = key_from_request(r2); k1.cmp(k2) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } #[test] fn test_request_collector_with_default_cf() { - let mut request_collector = RequestCollector::from_cf(CF_DEFAULT); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ default_req(b"foo", b"", 39), default_req(b"zzz", b"", 40), @@ -1363,10 +1324,9 @@ mod test { ]; for req in reqs { - request_collector.accept(req); + request_collector.accept(CF_DEFAULT, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_DEFAULT); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let (k1, ts1) = Key::split_on_ts_for(k1).unwrap(); @@ -1376,6 +1336,6 @@ mod test { k1.cmp(k2).then(ts1.cmp(&ts2)) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } } From 71efe9e6af802761bec9fcc0e468035cf3adb3b7 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 6 Jan 2023 18:16:21 +0800 Subject: [PATCH 449/676] raftstore-v2: adaptive apply (#14020) ref tikv/tikv#12842 Make apply adaptive to reduce high tail latency. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/apply.rs | 12 ++- .../operation/command/admin/compact_log.rs | 99 +++++++++++++------ .../operation/command/admin/conf_change.rs | 11 ++- .../src/operation/command/admin/split.rs | 4 +- .../raftstore-v2/src/operation/command/mod.rs | 88 ++++++++++++++++- components/raftstore-v2/src/operation/life.rs | 8 +- components/raftstore-v2/src/operation/mod.rs | 6 +- .../src/operation/ready/apply_trace.rs | 17 ++-- .../raftstore-v2/src/operation/ready/mod.rs | 4 +- .../src/operation/ready/snapshot.rs | 28 +++--- components/raftstore-v2/src/raft/apply.rs | 21 +++- components/raftstore-v2/src/raft/storage.rs | 5 +- 12 files changed, 225 insertions(+), 78 deletions(-) diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index b81d31329cb..1544a703c6d 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -10,7 +10,7 @@ use crossbeam::channel::TryRecvError; use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; -use raftstore::store::ReadTask; +use raftstore::store::{Config, ReadTask}; use slog::Logger; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, @@ -58,6 +58,7 @@ pub struct ApplyFsm { impl ApplyFsm { pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -70,6 +71,7 @@ impl ApplyFsm { ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); let apply = Apply::new( + cfg, peer, region_state, res_reporter, @@ -100,6 +102,7 @@ impl ApplyFsm { res = self.receiver.next().fuse() => res, _ = timeout.fuse() => None, }; + self.apply.on_start_apply(); let mut task = match res { Some(r) => r, None => { @@ -116,10 +119,10 @@ impl ApplyFsm { ApplyTask::CommittedEntries(ce) => 
self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), - ApplyTask::ManualFlush => self.apply.on_manual_flush(), + ApplyTask::ManualFlush => self.apply.on_manual_flush().await, } - // TODO: yield after some time. + self.apply.maybe_flush().await; // Perhaps spin sometime? match self.receiver.try_recv() { @@ -128,7 +131,8 @@ impl ApplyFsm { Err(TryRecvError::Disconnected) => return, } } - self.apply.flush(); + let written_bytes = self.apply.flush(); + self.apply.maybe_reschedule(written_bytes).await; } } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 7127cd45306..39cf02de775 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -23,7 +23,7 @@ use raftstore::{ Result, }; use slog::{debug, error, info}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use crate::{ batch::StoreContext, @@ -303,6 +303,35 @@ impl Peer { } } + #[inline] + pub fn record_tombstone_tablet_for_destroy( + &mut self, + ctx: &StoreContext, + task: &mut WriteTask, + ) { + let compact_log_context = self.compact_log_context_mut(); + assert!( + compact_log_context.tombstone_tablets_wait_index.is_empty(), + "{} all tombstone should be cleared before being destroyed.", + SlogFormat(&self.logger) + ); + let tablet = match self.tablet() { + Some(tablet) => tablet.clone(), + None => return, + }; + let region_id = self.region_id(); + let applied_index = self.entry_storage().applied_index(); + let sched = ctx.schedulers.tablet_gc.clone(); + let _ = sched.schedule(tablet_gc::Task::prepare_destroy( + tablet, + self.region_id(), + applied_index, + )); + task.persisted_cbs.push(Box::new(move || { + let _ = 
sched.schedule(tablet_gc::Task::destroy(region_id, applied_index)); + })); + } + pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, @@ -342,8 +371,17 @@ impl Peer { self.set_has_extra_write(); // All logs < perssited_apply will be deleted, so should check with +1. - if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() { - self.compact_log_from_engine(store_ctx); + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() + && let Some(index) = self.compact_log_index() { + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + // Extra write set right above. } let context = self.compact_log_context_mut(); @@ -354,38 +392,44 @@ impl Peer { (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; } - /// Called when apply index is persisted. There are two different situation: - /// - /// Generally, additional writes are triggered to persist apply index. In - /// this case task is `Some`. But after applying snapshot, the apply - /// index is persisted ahead of time. In this case task is `None`. + /// Called when apply index is persisted. #[inline] pub fn on_advance_persisted_apply_index( &mut self, store_ctx: &mut StoreContext, old_persisted: u64, - task: Option<&mut WriteTask>, + task: &mut WriteTask, ) { let new_persisted = self.storage().apply_trace().persisted_apply_index(); if old_persisted < new_persisted { let region_id = self.region_id(); // TODO: batch it. + // TODO: avoid allocation if there is nothing to delete. 
if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( region_id, new_persisted, - self.state_changes_mut(), + task.extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)), ) { error!(self.logger, "failed to delete raft states"; "err" => ?e); - } else { - self.set_has_extra_write(); } // If it's snapshot, logs are gc already. - if task.is_some() && old_persisted < self.entry_storage().truncated_index() + 1 { - self.compact_log_from_engine(store_ctx); + if !task.has_snapshot + && old_persisted < self.entry_storage().truncated_index() + 1 + && let Some(index) = self.compact_log_index() { + let batch = task.extra_write.ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)); + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, batch) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } } if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); - if let Some(task) = task { + if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); })); @@ -397,28 +441,19 @@ impl Peer { } } - fn compact_log_from_engine(&mut self, store_ctx: &mut StoreContext) { + fn compact_log_index(&mut self) -> Option { let truncated = self.entry_storage().truncated_index() + 1; let persisted_applied = self.storage().apply_trace().persisted_apply_index(); let compact_index = std::cmp::min(truncated, persisted_applied); if compact_index == RAFT_INIT_LOG_INDEX + 1 { // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. - return; - } - // Raft Engine doesn't care about first index. - if let Err(e) = - store_ctx - .engine - .gc(self.region_id(), 0, compact_index, self.state_changes_mut()) - { - error!(self.logger, "failed to compact raft logs"; "err" => ?e); - } else { - // TODO: make this debug when stable. 
- info!(self.logger, "compact log"; - "index" => compact_index, - "apply_trace" => ?self.storage().apply_trace(), - "truncated" => ?self.entry_storage().apply_state()); - self.set_has_extra_write(); + return None; } + // TODO: make this debug when stable. + info!(self.logger, "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state()); + Some(compact_index) } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 6c041a551fe..8c9771b0201 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -9,7 +9,7 @@ use std::time::Instant; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -146,7 +146,7 @@ impl Peer { let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; self.storage_mut() - .set_region_state(conf_change.region_state); + .set_region_state(conf_change.region_state.clone()); if self.is_leader() { info!( self.logger, @@ -189,7 +189,14 @@ impl Peer { self.raft_group().raft.state, ); if remove_self { + // When self is destroyed, all metas will be cleaned in `start_destroy`. 
self.mark_for_destroy(None); + } else { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, conf_change.index, &conf_change.region_state) + .unwrap(); + self.set_has_extra_write(); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index f63f1f2ae17..d01b1371338 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -731,7 +731,7 @@ mod test { raft_cmdpb::{BatchSplitRequest, SplitRequest}, raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::cmd_resp::new_error; + use raftstore::store::{cmd_resp::new_error, Config}; use slog::o; use tempfile::TempDir; use tikv_util::{ @@ -872,6 +872,7 @@ mod test { let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let mut apply = Apply::new( + &Config::default(), region .get_peers() .iter() @@ -1059,6 +1060,7 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); + apply.apply_flow_control_mut().set_need_flush(true); assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 439d2136d76..a6ab227d402 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,7 +16,7 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. 
-use std::mem; +use std::{mem, time::Duration}; use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ @@ -35,7 +35,7 @@ use raftstore::{ local_metrics::RaftMetrics, metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, msg::ErrorCallback, - util, WriteCallback, + util, Config, WriteCallback, }, Error, Result, }; @@ -111,6 +111,7 @@ impl Peer { let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + &store_ctx.cfg, self.peer().clone(), region_state, mailbox, @@ -268,6 +269,8 @@ impl Peer { if !self.serving() { return; } + // TODO: remove following log once stable. + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -334,7 +337,38 @@ impl Peer { } } +#[derive(Debug)] +pub struct ApplyFlowControl { + timer: Instant, + last_check_keys: u64, + need_flush: bool, + yield_time: Duration, + yield_written_bytes: u64, +} + +impl ApplyFlowControl { + pub fn new(cfg: &Config) -> Self { + ApplyFlowControl { + timer: Instant::now_coarse(), + last_check_keys: 0, + need_flush: false, + yield_time: cfg.apply_yield_duration.0, + yield_written_bytes: cfg.apply_yield_write_size.0, + } + } + + #[cfg(test)] + pub fn set_need_flush(&mut self, need_flush: bool) { + self.need_flush = need_flush; + } +} + impl Apply { + #[inline] + pub fn on_start_apply(&mut self) { + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + #[inline] fn should_skip(&self, off: usize, index: u64) -> bool { let log_recovery = self.log_recovery(); @@ -370,13 +404,15 @@ impl Apply { } } } + self.apply_flow_control_mut().need_flush = true; } - pub fn on_manual_flush(&mut self) { - self.flush(); + pub async fn on_manual_flush(&mut self) { + let written_bytes = self.flush(); if let Err(e) = 
self.tablet().flush_cfs(&[], false) { warn!(self.logger, "failed to flush: {:?}", e); } + self.maybe_reschedule(written_bytes).await } #[inline] @@ -414,6 +450,7 @@ impl Apply { } // Flush may be triggerred in the middle, so always update the index and term. self.set_apply_progress(e.index, e.term); + self.apply_flow_control_mut().need_flush = true; } } @@ -544,10 +581,49 @@ impl Apply { } } + fn should_reschedule(&self, written_bytes: u64) -> bool { + let control = self.apply_flow_control(); + written_bytes >= control.yield_written_bytes + || control.timer.saturating_elapsed() >= control.yield_time + } + + pub async fn maybe_reschedule(&mut self, written_bytes: u64) { + if self.should_reschedule(written_bytes) { + yatp::task::future::reschedule().await; + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + } + + /// Check whether it needs to flush. + /// + /// We always batch as much inputs as possible, flush will only be triggered + /// when it has been processing too long. + pub async fn maybe_flush(&mut self) { + let buffer_keys = self.metrics.written_keys; + let control = self.apply_flow_control_mut(); + if buffer_keys >= control.last_check_keys + 128 { + // Reschedule by write size was designed to avoid too many deletes impacts + // performance so it doesn't need pricise control. If checking bytes here may + // make the batch too small and hurt performance. + if self.should_reschedule(0) { + let written_bytes = self.flush(); + self.maybe_reschedule(written_bytes).await; + } else { + self.apply_flow_control_mut().last_check_keys = self.metrics.written_keys; + } + } + } + #[inline] - pub fn flush(&mut self) { + pub fn flush(&mut self) -> u64 { // TODO: maybe we should check whether there is anything to flush. 
let (index, term) = self.apply_progress(); + let control = self.apply_flow_control_mut(); + control.last_check_keys = 0; + if !control.need_flush { + return 0; + } + control.need_flush = false; let flush_state = self.flush_state().clone(); if let Some(wb) = &mut self.write_batch && !wb.is_empty() { let mut write_opt = WriteOptions::default(); @@ -578,6 +654,8 @@ impl Apply { apply_res.admin_result = self.take_admin_result().into_boxed_slice(); apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); + let written_bytes = apply_res.metrics.written_bytes; self.res_reporter().report(apply_res); + written_bytes } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index f312162d1e5..f82fb1e8386 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -293,7 +293,11 @@ impl Peer { /// /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. 
- pub fn start_destroy(&mut self, write_task: &mut WriteTask) { + pub fn start_destroy( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + ) { if self.postponed_destroy() { return; } @@ -311,7 +315,7 @@ impl Peer { let applied_index = self.entry_storage().applied_index(); lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); - self.set_has_extra_write(); + self.record_tombstone_tablet_for_destroy(ctx, write_task); self.destroy_progress_mut().start(); } diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index dc245c24384..807f425e998 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,9 +7,9 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, CompactLogContext, ProposalControl, RequestSplit, - SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, - SplitFlowControl, SPLIT_PREFIX, + AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, + RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, + SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 5ff9a27dee0..e5b1c169c5b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -301,19 +301,24 @@ impl ApplyTrace { None } - pub fn reset_snapshot(&mut self, index: u64) { + pub fn restore_snapshot(&mut self, index: u64) { for pr in self.data_cfs.iter_mut() { - pr.flushed = index; pr.last_modified = index; } - self.admin.flushed = index; + self.admin.last_modified = index; + // Snapshot is a special case that KVs are not flushed yet, so all flushed + // state should not be changed. 
But persisted_applied is updated whenever an + // asynchronous write is triggered. So it can lead to a special case that + // persisted_applied < admin.flushed. It seems no harm ATM though. self.persisted_applied = index; self.try_persist = false; } - #[inline] - pub fn reset_should_persist(&mut self) { - self.try_persist = false; + pub fn on_applied_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + } + self.admin.flushed = index; } #[inline] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3ac500b7f49..2fdc228ea2f 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -449,7 +449,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.on_advance_persisted_apply_index(ctx, prev_persisted, Some(&mut write_task)); + self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { write_task.messages = ready @@ -459,7 +459,7 @@ impl Peer { .collect(); } if !self.serving() { - self.start_destroy(&mut write_task); + self.start_destroy(ctx, &mut write_task); ctx.coprocessor_host.on_region_changed( self.region(), RegionChangeEvent::Destroy, diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 1919ce269a6..04b6ed7e12b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -215,8 +215,8 @@ impl Peer { let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); assert!( path.exists(), - "{:?} {} not exists", - self.logger.list(), + "{} {} not exists", + SlogFormat(&self.logger), path.display() ); let tablet = ctx @@ -224,15 +224,14 @@ impl Peer { .tablet_factory() 
.open_tablet(tablet_ctx, &path) .unwrap_or_else(|e| { - panic!( - "{:?} failed to load tablet at {}: {:?}", - self.logger.list(), - path.display(), - e + slog_panic!( + self.logger, + "failed to load tablet"; + "path" => path.display(), + "error" => ?e ); }); - let prev_persisted_applied = self.storage().apply_trace().persisted_apply_index(); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(snapshot_index); let read_tablet = SharedReadTablet::new(tablet.clone()); @@ -258,7 +257,6 @@ impl Peer { info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } - self.on_advance_persisted_apply_index(ctx, prev_persisted_applied, None); self.schedule_apply_fsm(ctx); } } @@ -511,7 +509,7 @@ impl Storage { let index = entry.truncated_index(); entry.set_applied_term(term); entry.apply_state_mut().set_applied_index(index); - self.apply_trace_mut().reset_snapshot(index); + self.apply_trace_mut().on_applied_snapshot(index); } pub fn apply_snapshot( @@ -552,10 +550,10 @@ impl Storage { raft_engine .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) .unwrap_or_else(|e| { - panic!( - "{:?} failed to clean up region: {:?}", - self.logger().list(), - e + slog_panic!( + self.logger(), + "failed to clean up region"; + "error" => ?e ) }); self.entry_storage_mut().clear(); @@ -578,7 +576,7 @@ impl Storage { entry_storage.set_truncated_term(last_term); entry_storage.set_last_term(last_term); - self.apply_trace_mut().reset_should_persist(); + self.apply_trace_mut().restore_snapshot(last_index); self.set_ever_persisted(); let lb = task .extra_write diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 2407d1ab3fe..7a5b03120b1 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -6,13 +6,13 @@ use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_L use kvproto::{metapb, 
raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, - ReadTask, + Config, ReadTask, }; use slog::Logger; use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ - operation::{AdminCmdResult, DataTrace}, + operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, router::CmdResChannel, }; @@ -28,6 +28,8 @@ pub struct Apply { callbacks: Vec<(Vec, RaftCmdResponse)>, + flow_control: ApplyFlowControl, + /// A flag indicates whether the peer is destroyed by applying admin /// command. tombstone: bool, @@ -58,6 +60,7 @@ pub struct Apply { impl Apply { #[inline] pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -79,6 +82,7 @@ impl Apply { tablet: remote_tablet.latest().unwrap().clone(), write_batch: None, callbacks: vec![], + flow_control: ApplyFlowControl::new(cfg), tombstone: false, applied_term, applied_index: flush_state.applied_index(), @@ -158,8 +162,8 @@ impl Apply { pub fn set_tablet(&mut self, tablet: EK) { assert!( self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), - "{:?}", - self.logger.list() + "{} setting tablet while still have dirty write batch", + SlogFormat(&self.logger) ); self.write_batch.take(); self.tablet = tablet; @@ -222,4 +226,13 @@ impl Apply { pub fn log_recovery(&self) -> &Option> { &self.log_recovery } + + #[inline] + pub fn apply_flow_control_mut(&mut self) -> &mut ApplyFlowControl { + &mut self.flow_control + } + + pub fn apply_flow_control(&self) -> &ApplyFlowControl { + &self.flow_control + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index aca8f0fafce..1d1f53f9c53 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -332,8 +332,8 @@ mod tests { }; use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, write_to_db_for_test, 
AsyncReadNotifier, FetchedLogs, GenSnapRes, - ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, + GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::o; @@ -500,6 +500,7 @@ mod tests { state.set_region(region.clone()); // setup peer applyer let mut apply = Apply::new( + &Config::default(), region.get_peers()[0].clone(), state, router, From 8e6e348505e7f1f7b5e023c00b30f90e8d1b4084 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 10 Jan 2023 14:28:23 +0800 Subject: [PATCH 450/676] raftstore-v2: add waterfall metrics (#14029) ref tikv/tikv#12842 - add water metrics - fix potential panic when destroying a peer - fix incorrect store size Signed-off-by: Jay Lee --- components/engine_rocks/src/misc.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 14 +- .../operation/command/admin/compact_log.rs | 10 +- .../raftstore-v2/src/operation/command/mod.rs | 71 ++++++++- components/raftstore-v2/src/operation/life.rs | 2 + .../src/operation/ready/apply_trace.rs | 5 + .../raftstore-v2/src/operation/ready/mod.rs | 144 +++++++++++++++++- components/raftstore-v2/src/raft/apply.rs | 15 +- .../src/router/response_channel.rs | 36 +++-- components/raftstore/src/lib.rs | 1 + .../raftstore/src/store/async_io/write.rs | 6 +- components/raftstore/src/store/fsm/apply.rs | 28 ++-- components/raftstore/src/store/fsm/peer.rs | 19 +-- .../raftstore/src/store/local_metrics.rs | 4 +- components/raftstore/src/store/msg.rs | 44 ++++-- components/raftstore/src/store/peer.rs | 24 +-- 16 files changed, 350 insertions(+), 77 deletions(-) diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 55546869272..e339facaac4 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -2,7 +2,7 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, 
IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, + Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, }; use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; @@ -258,7 +258,7 @@ impl MiscExt for RocksEngine { fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 621f826619b..6183778c369 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -36,7 +36,7 @@ use tikv_util::{ config::{Tracker, VersionTrack}, log::SlogFormat, sys::SysQuota, - time::Instant as TiInstant, + time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, @@ -122,6 +122,7 @@ struct StorePoller { /// Buffers to hold in-coming messages. store_msg_buf: Vec, peer_msg_buf: Vec, + timer: tikv_util::time::Instant, /// These fields controls the timing of flushing messages generated by /// FSMs. 
last_flush_time: TiInstant, @@ -135,6 +136,7 @@ impl StorePoller { cfg_tracker, store_msg_buf: Vec::new(), peer_msg_buf: Vec::new(), + timer: tikv_util::time::Instant::now(), last_flush_time: TiInstant::now(), need_flush_events: false, } @@ -185,6 +187,8 @@ impl PollHandler Option { @@ -234,7 +238,13 @@ impl PollHandler>>]) {} + fn end(&mut self, _batch: &mut [Option>>]) { + let dur = self.timer.saturating_elapsed(); + self.poll_ctx + .raft_metrics + .process_ready + .observe(duration_to_sec(dur)); + } fn pause(&mut self) { if self.poll_ctx.trans.need_flush() { diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 39cf02de775..a4983b28a47 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -303,15 +303,21 @@ impl Peer { } } + pub fn has_pending_tombstone_tablets(&self) -> bool { + !self + .compact_log_context() + .tombstone_tablets_wait_index + .is_empty() + } + #[inline] pub fn record_tombstone_tablet_for_destroy( &mut self, ctx: &StoreContext, task: &mut WriteTask, ) { - let compact_log_context = self.compact_log_context_mut(); assert!( - compact_log_context.tombstone_tablets_wait_index.is_empty(), + !self.has_pending_tombstone_tablets(), "{} all tombstone should be cleared before being destroyed.", SlogFormat(&self.logger) ); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index a6ab227d402..047fe026ffe 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -18,7 +18,7 @@ use std::{mem, time::Duration}; -use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; +use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ AdminCmdType, CmdType, RaftCmdRequest, 
RaftCmdResponse, RaftRequestHeader, }; @@ -32,8 +32,8 @@ use raftstore::{ apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, - local_metrics::RaftMetrics, - metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, + local_metrics::{RaftMetrics, TimeTracker}, + metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, util, Config, WriteCallback, }, @@ -221,12 +221,35 @@ impl Peer { } proposal.must_pass_epoch_check = self.applied_to_current_term(); proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); + self.report_batch_wait_duration(ctx, &proposal.cb); self.proposals_mut().push(proposal); self.set_has_ready(); } + fn report_batch_wait_duration( + &self, + ctx: &mut StoreContext, + ch: &Vec, + ) { + if !ctx.raft_metrics.waterfall_metrics || ch.is_empty() { + return; + } + let now = std::time::Instant::now(); + for c in ch { + for tracker in c.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); + } + } + } + #[inline] - pub fn schedule_apply_committed_entries(&mut self, committed_entries: Vec) { + pub fn schedule_apply_committed_entries( + &mut self, + ctx: &mut StoreContext, + committed_entries: Vec, + ) { if committed_entries.is_empty() { return; } @@ -246,6 +269,7 @@ impl Peer { } else { entry_and_proposals = committed_entries.into_iter().map(|e| (e, vec![])).collect(); } + self.report_store_time_duration(ctx, &mut entry_and_proposals); // Unlike v1, v2 doesn't need to persist commit index and commit term. 
The // point of persist commit index/term of raft apply state is to recover commit // index when the writes to raft engine is lost but writes to kv engine is @@ -265,6 +289,26 @@ impl Peer { .send(ApplyTask::CommittedEntries(apply)); } + #[inline] + fn report_store_time_duration( + &mut self, + ctx: &mut StoreContext, + entry_and_proposals: &mut [(Entry, Vec)], + ) { + let now = std::time::Instant::now(); + for (_, chs) in entry_and_proposals { + for tracker in chs.write_trackers_mut() { + tracker.observe(now, &ctx.raft_metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + if let TimeTracker::Instant(t) = tracker { + *t = now; + } + } + } + } + pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { if !self.serving() { return; @@ -625,9 +669,11 @@ impl Apply { } control.need_flush = false; let flush_state = self.flush_state().clone(); - if let Some(wb) = &mut self.write_batch && !wb.is_empty() { + if let Some(wb) = &self.write_batch && !wb.is_empty() { + self.perf_context().start_observe(); let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); + let wb = self.write_batch.as_mut().unwrap(); if let Err(e) = wb.write_callback_opt(&write_opt, || { flush_state.set_applied_index(index); }) { @@ -640,11 +686,26 @@ impl Apply { } else { self.write_batch.take(); } + let tokens: Vec<_> = self + .callbacks_mut() + .iter() + .flat_map(|(v, _)| { + v.write_trackers() + .flat_map(|t| t.as_tracker_token().cloned()) + }) + .collect(); + self.perf_context().report_metrics(&tokens); } let callbacks = self.callbacks_mut(); + let now = std::time::Instant::now(); + let apply_time = APPLY_TIME_HISTOGRAM.local(); for (ch, resp) in callbacks.drain(..) 
{ + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_time, |t| &mut t.metrics.apply_time_nanos); + } ch.set_result(resp); } + apply_time.flush(); if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index f82fb1e8386..88646f06b59 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -286,6 +286,8 @@ impl Peer { let entry_storage = self.storage().entry_storage(); // TODO: check actual split index instead of commit index. entry_storage.applied_index() != entry_storage.commit_index() + // Wait for critical commands like split. + || self.has_pending_tombstone_tablets() } /// Start the destroy progress. It will write `Tombstone` state diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index e5b1c169c5b..5b88a6ba94d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -473,6 +473,11 @@ impl Storage { } pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let trace = self.apply_trace(); + // Maybe tablet index can be different? 
+ if trace.persisted_applied > trace.admin.flushed { + return; + } let region_id = self.region().get_id(); let raft_engine = self.entry_storage().raft_engine(); let tablet_index = self.tablet_index(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 2fdc228ea2f..3f559feff8b 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -30,7 +30,10 @@ use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, - store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, + store::{ + needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteCallback, + WriteTask, + }, }; use slog::{debug, error, info, trace, warn}; use tikv_util::{ @@ -205,10 +208,14 @@ impl Peer { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } self.insert_peer_cache(msg.take_from_peer()); + let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); + } else { + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); } self.set_has_ready(); @@ -317,6 +324,56 @@ impl Peer { } } + /// Send a message. + /// + /// The message is pushed into the send buffer, it may not be sent out until + /// transport is flushed explicitly. 
+ fn send_raft_message_on_leader( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let message = msg.get_message(); + if message.get_msg_type() == MessageType::MsgAppend + && let Some(fe) = message.get_entries().first() + && let Some(le) = message.get_entries().last() + { + let last = (le.get_term(), le.get_index()); + let first = (fe.get_term(), fe.get_index()); + let now = Instant::now(); + let queue = self.proposals_mut().queue_mut(); + // Proposals are batched up, so it will liely hit after one or two steps. + for p in queue.iter_mut().rev() { + if p.sent { + break; + } + let cur = (p.term, p.index); + if cur > last { + continue; + } + if cur < first { + break; + } + for tracker in p.cb.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos + }); + } + p.sent = true; + } + } + if message.get_msg_type() == MessageType::MsgTimeoutNow { + // After a leader transfer procedure is triggered, the lease for + // the old leader may be expired earlier than usual, since a new leader + // may be elected and the old leader doesn't step down due to + // network partition from the new leader. + // For lease safety during leader transfer, transit `leader_lease` + // to suspect. + self.leader_lease_mut().suspect(monotonic_raw_now()); + } + self.send_raft_message(ctx, msg) + } + fn handle_raft_committed_entries( &mut self, ctx: &mut crate::batch::StoreContext, @@ -357,7 +414,7 @@ impl Peer { // Compact all cached entries instead of half evict. 
self.entry_storage_mut().evict_entry_cache(false); } - self.schedule_apply_committed_entries(committed_entries); + self.schedule_apply_committed_entries(ctx, committed_entries); if self.is_leader() && commit_to_current_term && !self.proposal_control().has_uncommitted_admin() @@ -423,7 +480,7 @@ impl Peer { debug_assert!(self.is_leader()); for msg in ready.take_messages() { if let Some(msg) = self.build_raft_message(msg) { - self.send_raft_message(ctx, msg); + self.send_raft_message_on_leader(ctx, msg); } } } @@ -445,6 +502,7 @@ impl Peer { let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.report_send_to_queue_duration(ctx, &mut write_task, ready.entries()); let prev_persisted = self.storage().apply_trace().persisted_apply_index(); self.merge_state_changes_to(&mut write_task); self.storage_mut() @@ -519,8 +577,13 @@ impl Peer { } let persisted_number = self.async_writer.persisted_number(); + let pre_persisted_index = self.persisted_index(); + let pre_committed_index = self.raft_group().raft.raft_log.committed; self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.persisted_index(); + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_persist_log_duration(ctx, pre_persisted_index, persisted_index); + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); // The apply snapshot process order would be: // - Get the snapshot from the ready // - Wait for async writer to load this tablet @@ -543,6 +606,81 @@ impl Peer { } } + #[inline] + fn report_persist_log_duration( + &self, + ctx: &mut StoreContext, + from: u64, + to: u64, + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + for 
tracker in trackers { + tracker.observe(now, &ctx.raft_metrics.wf_persist_log, |t| { + &mut t.metrics.wf_persist_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_commit_log_duration(&self, ctx: &mut StoreContext, from: u64, to: u64) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + let commit_persisted = i <= self.persisted_index(); + let hist = if commit_persisted { + &ctx.raft_metrics.wf_commit_log + } else { + &ctx.raft_metrics.wf_commit_not_persist_log + }; + for tracker in trackers { + tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_send_to_queue_duration( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + entries: &[raft::eraftpb::Entry], + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() { + return; + } + let now = Instant::now(); + for entry in entries { + if let Some((term, trackers)) = self.proposals().find_trackers(entry.index) { + if entry.term == term { + for tracker in trackers { + write_task.trackers.push(*tracker); + tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { + &mut t.metrics.wf_send_to_queue_nanos + }); + } + } + } + } + } + #[cfg(feature = "testexport")] pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { self.async_writer.subscirbe_flush(ch); diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 7a5b03120b1..6d1faa98cbf 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,9 @@ use std::{mem, sync::Arc}; -use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; +use 
engine_traits::{ + FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, +}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, @@ -20,6 +22,7 @@ use crate::{ pub struct Apply { peer: metapb::Peer, tablet: EK, + perf_context: EK::PerfContext, pub write_batch: Option, /// A buffer for encoding key. pub key_buffer: Vec, @@ -77,9 +80,12 @@ impl Apply { assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); let applied_index = flush_state.applied_index(); assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); + let tablet = remote_tablet.latest().unwrap().clone(); + let perf_context = tablet.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); Apply { peer, - tablet: remote_tablet.latest().unwrap().clone(), + tablet, + perf_context, write_batch: None, callbacks: vec![], flow_control: ApplyFlowControl::new(cfg), @@ -174,6 +180,11 @@ impl Apply { &self.tablet } + #[inline] + pub fn perf_context(&mut self) -> &mut EK::PerfContext { + &mut self.perf_context + } + #[inline] pub fn peer(&self) -> &metapb::Peer { &self.peer diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 2cb75acccfc..eeeb13f6555 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -30,8 +30,7 @@ use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, WriteCallback, }; -use smallvec::SmallVec; -use tracker::TrackerToken; +use tracker::{TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; /// A struct allows to watch and notify specific events. /// @@ -54,6 +53,7 @@ struct EventCore { before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. 
waker: AtomicWaker, + tracker: UnsafeCell, } unsafe impl Send for EventCore {} @@ -244,16 +244,19 @@ impl BaseChannel { /// Creates a pair of channel and subscriber. #[inline] pub fn pair() -> (Self, BaseSubscriber) { - Self::with_mask(u32::MAX) + let tracker_token = tracker::get_tls_tracker_token(); + Self::with_mask(u32::MAX, TimeTracker::Tracker(tracker_token)) } - fn with_mask(mask: u32) -> (Self, BaseSubscriber) { + #[inline] + fn with_mask(mask: u32, tracker: TimeTracker) -> (Self, BaseSubscriber) { let core: Arc> = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), event_mask: mask, before_set: UnsafeCell::new(None), waker: AtomicWaker::new(), + tracker: UnsafeCell::new(tracker), }); (Self { core: core.clone() }, BaseSubscriber { core }) } @@ -449,7 +452,17 @@ impl CmdResChannelBuilder { #[inline] pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { - let (c, s) = CmdResChannel::with_mask(self.event_mask); + let tracker_token = tracker::get_tls_tracker_token(); + let now = std::time::Instant::now(); + let tracker = if tracker_token == INVALID_TRACKER_TOKEN { + TimeTracker::Instant(now) + } else { + GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { + tracker.metrics.write_instant = Some(now); + }); + TimeTracker::Tracker(tracker_token) + }; + let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); if let Some(f) = self.before_set { unsafe { *c.core.before_set.get() = Some(f); @@ -493,12 +506,15 @@ impl WriteCallback for CmdResChannel { self.core.notify_event(Self::COMMITTED_EVENT); } - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListRef<'a> = &'a [TimeTracker]; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + std::slice::from_ref(unsafe { &*self.core.tracker.get() }) } - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; + fn write_trackers_mut(&mut self) 
-> Self::TimeTrackerListMut<'_> { + std::slice::from_mut(unsafe { &mut *self.core.tracker.get() }) } // TODO: support executing hooks inside setting result. @@ -577,7 +593,7 @@ impl ReadCallback for QueryResChannel { } fn read_tracker(&self) -> Option<&TrackerToken> { - None + unsafe { (*self.core.tracker.get()).as_tracker_token() } } } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 6104ae7b7cf..1db5f79d226 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -8,6 +8,7 @@ #![feature(hash_drain_filter)] #![feature(let_chains)] #![feature(assert_matches)] +#![feature(type_alias_impl_trait)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 817ff576f67..7016d0ab606 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -718,7 +718,11 @@ where .batch .tasks .iter() - .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) + .flat_map(|task| { + task.trackers + .iter() + .flat_map(|t| t.as_tracker_token().cloned()) + }) .collect(); self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index ec2d7bf72a8..cab6ae0ffe8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -582,8 +582,7 @@ where .cb_batch .iter() .flat_map(|(cb, _)| cb.write_trackers()) - .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) - .flatten() + .flat_map(|trackers| trackers.as_tracker_token().cloned()) .collect(); self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; @@ -620,7 +619,7 @@ where // Invoke callbacks let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) 
{ - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); @@ -3333,15 +3332,13 @@ impl Apply { pub fn on_schedule(&mut self, metrics: &RaftMetrics) { let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Some(trackers) = cb.cb.write_trackers_mut() { - for tracker in trackers { - tracker.observe(now, &metrics.store_time, |t| { - t.metrics.write_instant = Some(now); - &mut t.metrics.store_time_nanos - }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + for tracker in cb.cb.write_trackers_mut() { + tracker.observe(now, &metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + if let TimeTracker::Instant(t) = tracker { + *t = now; } } } @@ -3410,6 +3407,7 @@ pub struct Proposal { /// lease. pub propose_time: Option, pub must_pass_epoch_check: bool, + pub sent: bool, } impl Proposal { @@ -3421,6 +3419,7 @@ impl Proposal { propose_time: None, must_pass_epoch_check: false, is_conf_change: false, + sent: false, } } } @@ -4170,9 +4169,9 @@ where .cbs .iter() .flat_map(|p| p.cb.write_trackers()) - .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) + .flat_map(|ts| ts.as_tracker_token()) { - GLOBAL_TRACKERS.with_tracker(tracker, |t| { + GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; }); } @@ -5082,6 +5081,7 @@ mod tests { cb, propose_time: None, must_pass_epoch_check: false, + sent: true, } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index abd8fd84771..e302ea6588a 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -524,13 +524,14 @@ where })) }; - let tokens: SmallVec<[TimeTracker; 4]> = cbs + let trackers: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - 
.filter_map(|cb| cb.write_trackers().map(|t| t[0])) + .flat_map(|cb| cb.write_trackers()) + .cloned() .collect(); - let mut cb = Callback::write_ext( - Box::new(move |resp| { + let cb = Callback::Write { + cb: Box::new(move |resp| { for cb in cbs { let mut cmd_resp = RaftCmdResponse::default(); cmd_resp.set_header(resp.response.get_header().clone()); @@ -539,12 +540,8 @@ where }), proposed_cb, committed_cb, - ); - - if let Some(trackers) = cb.write_trackers_mut() { - *trackers = tokens; - } - + trackers, + }; return Some((req, cb)); } None @@ -5245,7 +5242,7 @@ where if self.ctx.raft_metrics.waterfall_metrics { let now = Instant::now(); - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { &mut t.metrics.wf_batch_wait_nanos }); diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5cfbb645612..c1db17f8cae 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -214,9 +214,9 @@ pub enum TimeTracker { } impl TimeTracker { - pub fn as_tracker_token(&self) -> Option { + pub fn as_tracker_token(&self) -> Option<&TrackerToken> { match self { - TimeTracker::Tracker(tt) => Some(*tt), + TimeTracker::Tracker(tt) => Some(tt), TimeTracker::Instant(_) => None, } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 08b0e9367dc..e3fc8530d76 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -225,8 +225,16 @@ pub trait WriteCallback: ErrorCallback { fn notify_proposed(&mut self); fn notify_committed(&mut self); - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>>; + + type TimeTrackerListRef<'a>: IntoIterator + where + Self: 'a; + fn write_trackers(&self) -> 
Self::TimeTrackerListRef<'_>; + + type TimeTrackerListMut<'a>: IntoIterator + where + Self: 'a; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_>; fn set_result(self, result: Self::Response); } @@ -276,16 +284,24 @@ impl WriteCallback for Callback { self.invoke_committed(); } + type TimeTrackerListRef<'a> = impl IntoIterator; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + let trackers = match self { + Callback::Write { trackers, .. } => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } + type TimeTrackerListMut<'a> = impl IntoIterator; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + let trackers = match self { + Callback::Write { trackers, .. 
} => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } #[inline] @@ -296,7 +312,7 @@ impl WriteCallback for Callback { impl WriteCallback for Vec where - C: WriteCallback, + C: WriteCallback + 'static, C::Response: Clone, { type Response = C::Response; @@ -315,14 +331,16 @@ where } } + type TimeTrackerListRef<'a> = impl Iterator + 'a; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + self.iter().flat_map(|c| c.write_trackers()) } + type TimeTrackerListMut<'a> = impl Iterator + 'a; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + self.iter_mut().flat_map(|c| c.write_trackers_mut()) } #[inline] diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9384a4940c7..347f62dd945 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -70,7 +70,7 @@ use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -141,16 +141,16 @@ impl ProposalQueue { /// Find the trackers of given index. /// Caller should check if term is matched before using trackers. 
- fn find_trackers(&self, index: u64) -> Option<(u64, &SmallVec<[TimeTracker; 4]>)> { + pub fn find_trackers(&self, index: u64) -> Option<(u64, C::TimeTrackerListRef<'_>)> { self.queue .binary_search_by_key(&index, |p: &Proposal<_>| p.index) .ok() - .and_then(|i| { - self.queue[i] - .cb - .write_trackers() - .map(|ts| (self.queue[i].term, ts)) - }) + .map(|i| (self.queue[i].term, self.queue[i].cb.write_trackers())) + } + + #[inline] + pub fn queue_mut(&mut self) -> &mut VecDeque> { + &mut self.queue } pub fn find_propose_time(&self, term: u64, index: u64) -> Option { @@ -1825,7 +1825,7 @@ where { let proposal = &self.proposals.queue[idx]; if term == proposal.term { - for tracker in proposal.cb.write_trackers().iter().flat_map(|v| v.iter()) { + for tracker in proposal.cb.write_trackers() { tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { &mut t.metrics.wf_send_proposal_nanos }); @@ -2767,8 +2767,8 @@ where for entry in ready.entries() { if let Some((term, times)) = self.proposals.find_trackers(entry.get_index()) { if entry.term == term { - trackers.extend_from_slice(times); for tracker in times { + trackers.push(*tracker); tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { &mut t.metrics.wf_send_to_queue_nanos }); @@ -3687,6 +3687,7 @@ where cb, propose_time: None, must_pass_epoch_check: has_applied_to_current_term, + sent: false, }; if let Some(cmd_type) = req_admin_cmd_type { self.cmd_epoch_checker @@ -4018,6 +4019,7 @@ where cb: Callback::None, propose_time: Some(now), must_pass_epoch_check: false, + sent: false, }; self.post_propose(poll_ctx, p); } @@ -5941,6 +5943,7 @@ mod tests { cb: Callback::write(Box::new(|_| {})), propose_time: Some(u64_to_timespec(index)), must_pass_epoch_check: false, + sent: false, }); }; for index in 1..=100 { @@ -6014,6 +6017,7 @@ mod tests { is_conf_change: false, propose_time: None, must_pass_epoch_check: false, + sent: false, }); } for (index, term) in entries { From 
528e06dcc4ffa5d099b60fbe93972732d141e014 Mon Sep 17 00:00:00 2001 From: Wenxuan Date: Thu, 12 Jan 2023 16:22:34 +0800 Subject: [PATCH 451/676] util: Fix incorrect memory capacity (#14034) * util: Fix incorrect memory capacity Signed-off-by: Wish * Fix lints Signed-off-by: Wish * Check capacity with /proc/meminfo Signed-off-by: Wish Signed-off-by: Wish --- components/tikv_util/src/sys/mod.rs | 4 +- src/server/service/diagnostics/sys.rs | 61 ++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 49e6812b81f..797da2aea54 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -22,7 +22,7 @@ use mnt::get_mount; use sysinfo::RefreshKind; pub use sysinfo::{CpuExt, DiskExt, NetworkExt, ProcessExt, SystemExt}; -use crate::config::{ReadableSize, KIB}; +use crate::config::ReadableSize; pub const HIGH_PRI: i32 = -1; const CPU_CORES_QUOTA_ENV_VAR_KEY: &str = "TIKV_CPU_CORES_QUOTA"; @@ -93,7 +93,7 @@ impl SysQuota { fn sysinfo_memory_limit_in_bytes() -> u64 { let system = sysinfo::System::new_with_specifics(RefreshKind::new().with_memory()); - system.total_memory() * KIB + system.total_memory() } } diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 6e9585ab2c9..8a84eaf6293 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -3,10 +3,7 @@ use std::{collections::HashMap, string::ToString}; use kvproto::diagnosticspb::{ServerInfoItem, ServerInfoPair}; -use tikv_util::{ - config::KIB, - sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}, -}; +use tikv_util::sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}; use walkdir::WalkDir; use crate::server::service::diagnostics::SYS_INFO; @@ -129,12 +126,12 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) fn mem_load_info(collector: &mut Vec) { let mut system = 
SYS_INFO.lock().unwrap(); system.refresh_memory(); - let total_memory = system.total_memory() * KIB; - let used_memory = system.used_memory() * KIB; - let free_memory = system.free_memory() * KIB; - let total_swap = system.total_swap() * KIB; - let used_swap = system.used_swap() * KIB; - let free_swap = system.free_swap() * KIB; + let total_memory = system.total_memory(); + let used_memory = system.used_memory(); + let free_memory = system.free_memory(); + let total_swap = system.total_swap(); + let used_swap = system.used_swap(); + let free_swap = system.free_swap(); drop(system); let used_memory_pct = (used_memory as f64) / (total_memory as f64); let free_memory_pct = (free_memory as f64) / (total_memory as f64); @@ -683,6 +680,50 @@ mod tests { assert_ne!(processes.get_pairs().len(), 0); } + #[test] + #[cfg(target_os = "linux")] + fn test_memory() { + let mut mem_total_kb: u64 = 0; + { + use std::io::BufRead; + + let f = std::fs::File::open("/proc/meminfo").unwrap(); + let reader = std::io::BufReader::new(f); + for line in reader.lines() { + let l = line.unwrap(); + let mut parts = l.split_whitespace(); + if parts.next().unwrap() != "MemTotal:" { + continue; + } + mem_total_kb = parts.next().unwrap().parse().unwrap(); + let unit = parts.next().unwrap(); + assert_eq!(unit, "kB"); + } + } + assert!(mem_total_kb > 0); + + let mut collector = vec![]; + hardware_info(&mut collector); + + let mut memory_checked = false; + + 'outer: for item in &collector { + if item.get_tp() != "memory" { + continue; + } + for pair in item.get_pairs() { + if pair.get_key() != "capacity" { + continue; + } + assert_eq!(pair.get_value(), (mem_total_kb * 1024).to_string()); + memory_checked = true; + break 'outer; + } + } + + assert!(memory_checked); + } + #[test] fn test_hardware_info() { let mut collector = vec![]; From e1467c56a445d36a8fd8642f9467a0b18fbb8203 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 12 Jan 2023 17:35:52 +0800 Subject: [PATCH 452/676] pd_client: fix the kvproto 
compatibility for global config (#14041) * hotfix kvproto for global config Signed-off-by: husharp * make format happy Signed-off-by: husharp Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/pd_client/src/client.rs | 4 ++-- components/pd_client/src/client_v2.rs | 6 +++--- components/pd_client/src/lib.rs | 2 +- components/test_pd/src/mocker/mod.rs | 8 ++++---- tests/failpoints/cases/test_pd_client.rs | 15 +++++---------- tests/failpoints/cases/test_pd_client_legacy.rs | 16 +++++----------- 7 files changed, 21 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a3c9ced013..c98cd025fad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2726,7 +2726,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#ae3b086b09afbb26cebcd4c1fe14b82bbe1f0796" +source = "git+https://github.com/pingcap/kvproto.git#a14c44ef44b378d15adb5baad8402b838f031b51" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 9f466a6a351..5bccdcfacea 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -286,10 +286,10 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, list: Vec) -> PdFuture> { + fn load_global_config(&self, config_path: String) -> PdFuture> { use kvproto::pdpb::LoadGlobalConfigRequest; let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner .rl() diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 3d17a94a494..b42d8fb3ddb 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -542,7 +542,7 @@ pub trait PdClient { fn fetch_cluster_id(&mut self) -> Result; - fn 
load_global_config(&mut self, list: Vec) -> PdFuture>; + fn load_global_config(&mut self, config_path: String) -> PdFuture>; fn watch_global_config( &mut self, @@ -791,10 +791,10 @@ impl PdClient for RpcClient { Ok((tx, resp_rx)) } - fn load_global_config(&mut self, list: Vec) -> PdFuture> { + fn load_global_config(&mut self, config_path: String) -> PdFuture> { use kvproto::pdpb::LoadGlobalConfigRequest; let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + req.set_config_path(config_path); let mut raw_client = self.raw_client.clone(); Box::pin(async move { raw_client.wait_for_ready().await?; diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 8674130c799..46a3e6924db 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -209,7 +209,7 @@ pub const INVALID_ID: u64 = 0; /// all the time. pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _list: Vec) -> PdFuture> { + fn load_global_config(&self, _config_path: String) -> PdFuture> { unimplemented!(); } diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index d904c95d4a8..84c2508d4ea 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -27,13 +27,13 @@ pub type Result = result::Result; pub trait PdMocker { fn load_global_config( &self, - req: &LoadGlobalConfigRequest, + _req: &LoadGlobalConfigRequest, ) -> Option> { let mut send = vec![]; - for r in req.get_names() { + for r in 0..10 { let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r.clone())); - i.set_value(r.clone()); + i.set_name(format!("/global/config/{}", r)); + i.set_value(r.to_string()); send.push(i); } let mut res = LoadGlobalConfigResponse::default(); diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index ca0a473a8b7..7dd767d19c9 100644 --- 
a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -69,7 +69,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_gc_safe_point())), request!(client => block_on(get_store_and_stats(0))), request!(client => get_operator(0)), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -101,14 +101,7 @@ fn test_pd_client_deadlock() { fn test_load_global_config() { let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await + client.load_global_config("global".to_string()).await }); for (k, v) in res.unwrap() { assert_eq!(k, format!("/global/config/{}", v)) @@ -293,7 +286,9 @@ fn test_retry() { }); test_retry_success(&mut client, |c| block_on(c.get_gc_safe_point())); test_retry_success(&mut client, |c| c.get_operator(0)); - test_retry_success(&mut client, |c| block_on(c.load_global_config(vec![]))); + test_retry_success(&mut client, |c| { + block_on(c.load_global_config(String::default())) + }); fail::remove(pd_client_v2_timeout_fp); fail::remove(pd_client_v2_backoff_fp); diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index eb22ac29e45..172db8ac09e 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -73,7 +73,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_store_stats_async(0))), request!(client => get_operator(0)), request!(client => block_on(get_tso())), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -108,16 +108,10 @@ fn test_pd_client_deadlock() { #[test] fn 
test_load_global_config() { let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await - }); + let res = + futures::executor::block_on( + async move { client.load_global_config("global".into()).await }, + ); for (k, v) in res.unwrap() { assert_eq!(k, format!("/global/config/{}", v)) } From 2daa168f13831ab9cfd653ad2971eccbb3f38a22 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 13 Jan 2023 09:03:46 +0800 Subject: [PATCH 453/676] *: add resource group for the read path (#14001) ref tikv/tikv#13730 Signed-off-by: glorv --- Cargo.lock | 25 +- Cargo.toml | 3 + components/resource_control/Cargo.toml | 20 + components/resource_control/src/future.rs | 46 ++ components/resource_control/src/lib.rs | 18 + .../resource_control/src/resource_group.rs | 482 ++++++++++++++++++ components/server/Cargo.toml | 1 + components/server/src/server.rs | 21 + components/server/src/server2.rs | 21 + .../tikv_util/src/yatp_pool/future_pool.rs | 23 +- components/tikv_util/src/yatp_pool/mod.rs | 16 +- src/config/mod.rs | 5 + src/coprocessor/endpoint.rs | 12 + src/read_pool.rs | 95 ++-- src/storage/mod.rs | 32 ++ 15 files changed, 783 insertions(+), 37 deletions(-) create mode 100644 components/resource_control/Cargo.toml create mode 100644 components/resource_control/src/future.rs create mode 100644 components/resource_control/src/lib.rs create mode 100644 components/resource_control/src/resource_group.rs diff --git a/Cargo.lock b/Cargo.lock index c98cd025fad..0b7ca52725c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4656,6 +4656,25 @@ dependencies = [ "txn_types", ] +[[package]] +name = "resource_control" +version = "0.0.1" +dependencies = [ + "byteorder", + "crossbeam-skiplist", + "dashmap", + "kvproto", + "lazy_static", + "online_config", + "pin-project", + "prometheus", + "serde", + "slog", + 
"slog-global", + "tikv_util", + "yatp", +] + [[package]] name = "resource_metering" version = "0.0.1" @@ -5209,6 +5228,7 @@ dependencies = [ "raftstore-v2", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "serde_json", @@ -6290,6 +6310,7 @@ dependencies = [ "rand 0.7.3", "regex", "reqwest", + "resource_control", "resource_metering", "rev_lines", "seahash", @@ -7363,9 +7384,11 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#39cb495953d40a7e846363c06090755c2eac65fa" +source = "git+https://github.com/tikv/yatp.git?branch=master#bcf431a2619c06ab7fa0c72073a0c775646c484f" dependencies = [ "crossbeam-deque", + "crossbeam-skiplist", + "crossbeam-utils 0.8.8", "dashmap", "fail", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index 4c8af61e554..d76dce26a18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -139,6 +139,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" +resource_control = { workspace = true } resource_metering = { workspace = true } rev_lines = "0.2.1" seahash = "4.1.0" @@ -267,6 +268,7 @@ members = [ "components/raftstore", "components/raftstore-v2", "components/resolved_ts", + "components/resource_control", "components/resource_metering", "components/security", "components/server", @@ -341,6 +343,7 @@ raft_log_engine = { path = "components/raft_log_engine" } raftstore = { path = "components/raftstore", default-features = false } raftstore-v2 = { path = "components/raftstore-v2", default-features = false } resolved_ts = { path = "components/resolved_ts" } +resource_control = { path = "components/resource_control" } resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } diff --git a/components/resource_control/Cargo.toml 
b/components/resource_control/Cargo.toml new file mode 100644 index 00000000000..822aed2cd2d --- /dev/null +++ b/components/resource_control/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "resource_control" +version = "0.0.1" +edition = "2021" +publish = false + +[dependencies] +byteorder = "1.2" +crossbeam-skiplist = "0.1" +dashmap = "5.1" +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1.0" +online_config = { workspace = true } +pin-project = "1.0" +prometheus = { version = "0.13", features = ["nightly"] } +serde = { version = "1.0", features = ["derive"] } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tikv_util = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs new file mode 100644 index 00000000000..8027a27b394 --- /dev/null +++ b/components/resource_control/src/future.rs @@ -0,0 +1,46 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project::pin_project; +use tikv_util::time::Instant; + +use crate::resource_group::{ResourceConsumeType, ResourceController}; + +#[pin_project] +pub struct ControlledFuture { + #[pin] + future: F, + controller: Arc, + group_name: Vec, +} + +impl ControlledFuture { + pub fn new(future: F, controller: Arc, group_name: Vec) -> Self { + Self { + future, + controller, + group_name, + } + } +} + +impl Future for ControlledFuture { + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let now = Instant::now(); + let res = this.future.poll(cx); + this.controller.consume( + this.group_name, + ResourceConsumeType::CpuTime(now.saturating_elapsed()), + ); + res + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs new file mode 100644 index 00000000000..516e5dd6c8d --- /dev/null +++ b/components/resource_control/src/lib.rs @@ -0,0 +1,18 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use online_config::OnlineConfig; +use serde::{Deserialize, Serialize}; + +mod resource_group; +pub use resource_group::{ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; + +mod future; +pub use future::ControlledFuture; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct Config { + #[online_config(skip)] + pub enabled: bool, +} diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs new file mode 100644 index 00000000000..d9fa3ccf14c --- /dev/null +++ b/components/resource_control/src/resource_group.rs @@ -0,0 +1,482 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use dashmap::{mapref::one::Ref, DashMap}; +use kvproto::resource_manager::{GroupMode, ResourceGroup}; +use yatp::queue::priority::TaskPriorityProvider; + +// a read task cost at least 50us. +const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50; +// extra task schedule factor +const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; +/// duration to update the minimal priority value of each resource group. +pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); +/// default resource group name +const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; +/// default value of max RU quota. +const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; + +pub enum ResourceConsumeType { + CpuTime(Duration), + IoBytes(u64), +} + +/// ResourceGroupManager manages the metadata of each resource group. +#[derive(Default)] +pub struct ResourceGroupManager { + resource_groups: DashMap, + registry: Mutex>>, +} + +impl ResourceGroupManager { + fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { + match (rg.get_mode(), is_read) { + (GroupMode::RuMode, true) => rg + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + (GroupMode::RuMode, false) => rg + .get_r_u_settings() + .get_w_r_u() + .get_settings() + .get_fill_rate(), + // TODO: currently we only consider the cpu usage in the read path, we may also take + // io read bytes into account later. + (GroupMode::RawMode, true) => rg + .get_resource_settings() + .get_cpu() + .get_settings() + .get_fill_rate(), + (GroupMode::RawMode, false) => rg + .get_resource_settings() + .get_io_write() + .get_settings() + .get_fill_rate(), + // return a default value for unsupported config. 
+ (GroupMode::Unknown, _) => 1, + } + } + + pub fn add_resource_group(&self, rg: ResourceGroup) { + let group_name = rg.get_name().to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + let ru_quota = Self::get_ru_setting(&rg, controller.is_read); + controller.add_resource_group(group_name.clone().into_bytes(), ru_quota); + }); + self.resource_groups.insert(group_name, rg); + } + + pub fn remove_resource_group(&self, name: &str) { + let group_name = name.to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + controller.remove_resource_group(group_name.as_bytes()); + }); + self.resource_groups.remove(&group_name); + } + + pub fn get_resource_group(&self, name: &str) -> Option> { + self.resource_groups.get(&name.to_ascii_lowercase()) + } + + pub fn get_all_resource_groups(&self) -> Vec { + self.resource_groups.iter().map(|g| g.clone()).collect() + } + + pub fn derive_controller(&self, name: String, is_read: bool) -> Arc { + let controller = Arc::new(ResourceController::new(name, is_read)); + self.registry.lock().unwrap().push(controller.clone()); + for g in &self.resource_groups { + let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); + controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); + } + + controller + } + + pub fn advance_min_virtual_time(&self) { + for controller in self.registry.lock().unwrap().iter() { + controller.update_min_virtual_time(); + } + } +} + +pub struct ResourceController { + // resource controller name is not used currently. + #[allow(dead_code)] + name: String, + // We handle the priority differently between read and write request: + // 1. the priority factor is calculate based on read/write RU settings. + // 2. 
for read request, we increase a constant virtual time delta at each `get_priority` call + // because the cost can't be calculated at start, so we only increase a constant delta and + // increase the real cost after task is executed; but don't increase it at write because + // the cost is known so we just pre-consume it. + is_read: bool, + // Track the maximum ru quota used to calculate the factor of each resource group. + // factor = max_ru_quota / group_ru_quota * 10.0 + // We use mutex here to ensure when we need to change this value and do adjust all resource + // groups' factors, it can't be changed concurrently. + max_ru_quota: Mutex, + // record consumption of each resource group, name --> resource_group + resource_consumptions: DashMap, GroupPriorityTracker>, + + last_min_vt: AtomicU64, +} + +impl ResourceController { + pub fn new(name: String, is_read: bool) -> Self { + let controller = Self { + name, + is_read, + max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), + resource_consumptions: DashMap::new(), + last_min_vt: AtomicU64::new(0), + }; + // add the "default" resource group + controller.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + controller + } + + fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + if quota > 0 { + // we use max_quota / quota as the resource group factor, but because we need to + // cast the value to integer, so we times it by 10 to ensure the accuracy is + // enough. + (max_quota as f64 / quota as f64 * 10.0).round() as u64 + } else { + 1 + } + } + + fn add_resource_group(&self, name: Vec, ru_quota: u64) { + let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); + if ru_quota > *max_ru_quota { + *max_ru_quota = ru_quota; + // adjust all group weight because the current value is too small. 
+ self.adjust_all_resource_group_factors(ru_quota); + } + let weight = Self::calculate_factor(*max_ru_quota, ru_quota); + + let vt_delta_for_get = if self.is_read { + DEFAULT_PRIORITY_PER_READ_TASK * weight + } else { + 0 + }; + let group = GroupPriorityTracker { + ru_quota, + weight, + virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), + vt_delta_for_get, + }; + // maybe update existed group + self.resource_consumptions.insert(name, group); + } + + // we calculate the weight of each resource group based on the currently maximum + // ru quota, if a incoming resource group has a bigger quota, we need to + // adjust all the existing groups. As we expect this won't happen very + // often, and iterate 10k entry cost less than 5ms, so the performance is + // acceptable. + fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { + self.resource_consumptions.iter_mut().for_each(|mut g| { + g.value_mut().weight = Self::calculate_factor(max_ru_quota, g.ru_quota); + }); + } + + fn remove_resource_group(&self, name: &[u8]) { + // do not remove the default resource group, reset to default setting instead. 
+ if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { + self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + } + self.resource_consumptions.remove(name); + } + + #[inline] + fn resource_group(&self, name: &[u8]) -> Ref<'_, Vec, GroupPriorityTracker> { + if let Some(g) = self.resource_consumptions.get(name) { + g + } else { + self.resource_consumptions + .get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()) + .unwrap() + } + } + + pub fn consume(&self, name: &[u8], delta: ResourceConsumeType) { + self.resource_group(name).consume(delta) + } + + pub fn update_min_virtual_time(&self) { + let mut min_vt = u64::MAX; + let mut max_vt = 0; + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if min_vt > vt { + min_vt = vt; + } + if max_vt < vt { + max_vt = vt; + } + }); + + // TODO: use different threshold for different resource type + // needn't do update if the virtual different is less than 100ms/100KB. + if min_vt + 100_000 >= max_vt { + return; + } + + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if vt < max_vt { + // TODO: is increase by half is a good choice. + g.increase_vt((max_vt - vt) / 2); + } + }); + // max_vt is actually a little bigger than the current min vt, but we don't + // need totally accurate here. + self.last_min_vt.store(max_vt, Ordering::Relaxed); + } +} + +impl TaskPriorityProvider for ResourceController { + fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { + self.resource_group(extras.metadata()) + .get_priority(extras.current_level() as usize) + } +} + +struct GroupPriorityTracker { + // the ru setting of this group. 
+ ru_quota: u64, + weight: u64, + virtual_time: AtomicU64, + // the constant delta value for each `get_priority` call, + vt_delta_for_get: u64, +} + +impl GroupPriorityTracker { + fn get_priority(&self, level: usize) -> u64 { + let task_extra_priority = TASK_EXTRA_FACTOR_BY_LEVEL[level] * 1000 * self.weight; + (if self.vt_delta_for_get > 0 { + self.virtual_time + .fetch_add(self.vt_delta_for_get, Ordering::Relaxed) + + self.vt_delta_for_get + } else { + self.virtual_time.load(Ordering::Relaxed) + }) + task_extra_priority + } + + #[inline] + fn current_vt(&self) -> u64 { + self.virtual_time.load(Ordering::Relaxed) + } + + #[inline] + fn increase_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_add(vt_delta, Ordering::Relaxed); + } + + // TODO: make it delta type as generic to avoid mixed consume different types. + #[inline] + fn consume(&self, delta: ResourceConsumeType) { + let vt_delta = match delta { + ResourceConsumeType::CpuTime(dur) => dur.as_micros() as u64, + ResourceConsumeType::IoBytes(bytes) => bytes, + } * self.weight; + self.increase_vt(vt_delta); + } +} + +#[cfg(test)] +mod tests { + use kvproto::resource_manager::*; + use yatp::queue::Extras; + + use super::*; + + fn new_resource_group( + name: String, + is_ru_mode: bool, + read_tokens: u64, + write_tokens: u64, + ) -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name); + let mode = if is_ru_mode { + GroupMode::RuMode + } else { + GroupMode::RawMode + }; + group.set_mode(mode); + if is_ru_mode { + let mut ru_setting = GroupRequestUnitSettings::new(); + ru_setting + .mut_r_r_u() + .mut_settings() + .set_fill_rate(read_tokens); + ru_setting + .mut_w_r_u() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_r_u_settings(ru_setting); + } else { + let mut resource_setting = GroupResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + 
.set_fill_rate(write_tokens); + group.set_resource_settings(resource_setting); + } + group + } + + #[test] + fn test_resource_group() { + let resource_manager = ResourceGroupManager::default(); + + let group1 = new_resource_group("TEST".into(), true, 100, 100); + resource_manager.add_resource_group(group1); + + assert!(resource_manager.get_resource_group("test1").is_none()); + + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 100 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group1 = new_resource_group("Test".into(), true, 200, 100); + resource_manager.add_resource_group(group1); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 200 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group2 = new_resource_group("test2".into(), true, 400, 200); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.resource_groups.len(), 2); + + let resouce_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resouce_ctl.resource_consumptions.len(), 3); + + let group1 = resouce_ctl.resource_group("test".as_bytes()); + assert_eq!(group1.weight, 500); + let group2 = resouce_ctl.resource_group("test2".as_bytes()); + assert_eq!(group2.weight, 250); + assert_eq!(group1.current_vt(), 0); + + let mut extras1 = Extras::single_level(); + extras1.set_metadata("test".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras1), 25_000); + assert_eq!(group1.current_vt(), 25_000); + + let mut extras2 = Extras::single_level(); + extras2.set_metadata("test2".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras2), 12_500); + assert_eq!(group2.current_vt(), 12_500); + + let mut extras3 = 
Extras::single_level(); + extras3.set_metadata("unknown_group".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras3), 50); + assert_eq!( + resouce_ctl + .resource_group("default".as_bytes()) + .current_vt(), + 50 + ); + + resouce_ctl.consume( + "test".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + resouce_ctl.consume( + "test2".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + + assert_eq!(group1.current_vt(), 5_025_000); + assert_eq!(group1.current_vt(), group2.current_vt() * 2); + + // test update all group vts + resource_manager.advance_min_virtual_time(); + let group1_vt = group1.current_vt(); + assert_eq!(group1_vt, 5_025_000); + assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); + assert!( + resouce_ctl + .resource_group("default".as_bytes()) + .current_vt() + >= group1.current_vt() / 2 + ); + + drop(group1); + drop(group2); + + // test add 1 new resource group + let new_group = new_resource_group("new_group".into(), true, 500, 500); + resource_manager.add_resource_group(new_group); + + assert_eq!(resouce_ctl.resource_consumptions.len(), 4); + let group3 = resouce_ctl.resource_group("new_group".as_bytes()); + assert_eq!(group3.weight, 200); + assert!(group3.current_vt() >= group1_vt / 2); + } + + #[test] + fn test_adjust_resource_group_weight() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + let group1 = new_resource_group("test1".into(), true, 5000, 1000); + resource_manager.add_resource_group(group1); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + + // add a resource group with big ru + let group1 = new_resource_group("test2".into(), true, 50000, 2000); + 
resource_manager.add_resource_group(group1); + assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); + assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); + // resource_ctl_write should be unchanged. + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 10000); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write.resource_group("test2".as_bytes()).weight, + 50 + ); + } +} diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index acdca09b29c..d5e2f177b5e 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -69,6 +69,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3c926969ce2..52b9fbf1d1a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -82,6 +82,7 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; +use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; use security::SecurityManager; use snap_recovery::RecoveryService; use tikv::{ @@ -244,6 +245,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery @@ -320,6 +322,7 @@ where let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); + let resource_manager = Arc::new(ResourceGroupManager::default()); // Initialize raftstore channels. 
let (router, system) = fsm::create_raft_batch_system(&config.raft_store); @@ -328,6 +331,14 @@ where let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. + if config.resource_control.enabled { + let resource_mgr1 = resource_manager.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr1.advance_min_virtual_time(); + }); + } let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -398,6 +409,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, br_snap_recovery_mode: is_recovering_marked, @@ -733,10 +745,19 @@ where } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let priority_mgr = if self.config.resource_control.enabled { + Some( + self.resource_manager + .derive_controller("unified-read-pool".into(), true), + ) + } else { + None + }; Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + priority_mgr, )) } else { None diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 5d037fa3412..12e6af61613 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -62,6 +62,7 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{router::RaftRouter, StateStorage}; +use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -221,6 +222,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, } @@ -285,6 +287,15 @@ where config.quota.max_delay_duration, 
config.quota.enable_auto_tune, )); + let resource_manager = Arc::new(ResourceGroupManager::default()); + // spawn a task to periodically update the minimal virtual time of all resource + // group. + if config.resource_control.enabled { + let resource_mgr1 = resource_manager.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr1.advance_min_virtual_time(); + }); + } let mut causal_ts_provider = None; if let ApiVersion::V2 = F::TAG { @@ -333,6 +344,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, } @@ -622,10 +634,19 @@ where let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let priority_mgr = if self.config.resource_control.enabled { + Some( + self.resource_manager + .derive_controller("unified-read-pool".into(), true), + ) + } else { + None + }; Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + priority_mgr, )) } else { None diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 9de2d49cb07..e74ced848c0 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -15,7 +15,7 @@ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; -use yatp::task::future; +use yatp::{queue::Extras, task::future}; pub type ThreadPool = yatp::ThreadPool; @@ -82,7 +82,14 @@ impl FuturePool { where F: Future + Send + 'static, { - self.inner.spawn(TrackedFuture::new(future)) + self.inner.spawn(TrackedFuture::new(future), None) + } + + pub fn spawn_with_extras(&self, future: F, extras: Extras) -> Result<(), Full> + where + F: Future + Send + 'static, + { + 
self.inner.spawn(TrackedFuture::new(future), Some(extras)) } /// Spawns a future in the pool and returns a handle to the result of the @@ -143,7 +150,7 @@ impl PoolInner { } } - fn spawn(&self, future: F) -> Result<(), Full> + fn spawn(&self, future: F, extras: Option) -> Result<(), Full> where F: Future + Send + 'static, { @@ -154,11 +161,17 @@ impl PoolInner { metrics_running_task_count.inc(); - self.pool.spawn(async move { + let f = async move { let _ = future.await; metrics_handled_task_count.inc(); metrics_running_task_count.dec(); - }); + }; + + if let Some(extras) = extras { + self.pool.spawn(future::TaskCell::new(f, extras)); + } else { + self.pool.spawn(f); + } Ok(()) } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 6e246d6cddf..29376b904a5 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -10,7 +10,7 @@ pub use future_pool::{Full, FuturePool}; use prometheus::{local::LocalHistogram, Histogram}; use yatp::{ pool::{CloneRunnerBuilder, Local, Runner}, - queue::{multilevel, QueueType, TaskCell as _}, + queue::{multilevel, priority, QueueType, TaskCell as _}, task::future::{Runner as FutureRunner, TaskCell}, ThreadPool, }; @@ -282,6 +282,20 @@ impl YatpPoolBuilder { .build_with_queue_and_runner(QueueType::Multilevel(multilevel_builder), runner_builder) } + pub fn build_priority_pool( + &mut self, + priority_provider: Arc, + ) -> ThreadPool { + let (builder, read_pool_runner) = self.create_builder(); + let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + let priority_builder = priority::Builder::new( + priority::Config::default().name(Some(name)), + priority_provider, + ); + let runner_builder = priority_builder.runner_builder(CloneRunnerBuilder(read_pool_runner)); + builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) + } + fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { let name = 
self.name_prefix.as_deref().unwrap_or("yatp_pool"); let mut builder = yatp::Builder::new(thd_name!(name)); diff --git a/src/config/mod.rs b/src/config/mod.rs index d2c5941c5ec..8d3e5477f26 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -55,6 +55,7 @@ use raftstore::{ coprocessor::{Config as CopConfig, RegionInfoAccessor}, store::{CompactionGuardGeneratorFactory, Config as RaftstoreConfig, SplitConfig}, }; +use resource_control::Config as ResourceControlConfig; use resource_metering::Config as ResourceMeteringConfig; use security::SecurityConfig; use serde::{ @@ -3039,6 +3040,9 @@ pub struct TikvConfig { #[online_config(skip)] pub causal_ts: CausalTsConfig, + + #[online_config(submodule)] + pub resource_control: ResourceControlConfig, } impl Default for TikvConfig { @@ -3081,6 +3085,7 @@ impl Default for TikvConfig { resource_metering: ResourceMeteringConfig::default(), backup_stream: BackupStreamConfig::default(), causal_ts: CausalTsConfig::default(), + resource_control: ResourceControlConfig::default(), } } } diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 54fcaeb0489..711cd83e607 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -470,6 +470,11 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); // box the tracker so that moving it is cheap. let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); @@ -480,6 +485,7 @@ impl Endpoint { .in_resource_metering_tag(resource_tag), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded); async move { res.await? 
} @@ -690,6 +696,11 @@ impl Endpoint { ) -> Result>> { let (tx, rx) = mpsc::channel::>(self.stream_channel_size); let priority = req_ctx.context.get_priority(); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); let key_ranges = req_ctx .ranges .iter() @@ -712,6 +723,7 @@ impl Endpoint { }), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded)?; Ok(rx) diff --git a/src/read_pool.rs b/src/read_pool.rs index 5212c4ae594..1a590679584 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -11,6 +11,7 @@ use futures::{channel::oneshot, future::TryFutureExt}; use kvproto::kvrpcpb::CommandPri; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{IntCounter, IntGauge}; +use resource_control::{ControlledFuture, ResourceController}; use thiserror::Error; use tikv_util::{ sys::{cpu_time::ProcessStat, SysQuota}, @@ -52,6 +53,7 @@ pub enum ReadPool { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } @@ -73,12 +75,14 @@ impl ReadPool { running_threads, max_tasks, pool_size, + resource_ctl, } => ReadPoolHandle::Yatp { remote: pool.remote().clone(), running_tasks: running_tasks.clone(), running_threads: running_threads.clone(), max_tasks: *max_tasks, pool_size: *pool_size, + resource_ctl: resource_ctl.clone(), }, } } @@ -97,11 +101,18 @@ pub enum ReadPoolHandle { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } impl ReadPoolHandle { - pub fn spawn(&self, f: F, priority: CommandPri, task_id: u64) -> Result<(), ReadPoolError> + pub fn spawn( + &self, + f: F, + priority: CommandPri, + task_id: u64, + group_meta: Vec, + ) -> Result<(), ReadPoolError> where F: Future + Send + 'static, { @@ -123,6 +134,7 @@ impl ReadPoolHandle { remote, running_tasks, max_tasks, + resource_ctl, .. 
} => { let running_tasks = running_tasks.clone(); @@ -140,14 +152,29 @@ impl ReadPoolHandle { CommandPri::Normal => None, CommandPri::Low => Some(2), }; - let extras = Extras::new_multilevel(task_id, fixed_level); - let task_cell = TaskCell::new( - TrackedFuture::new(async move { - f.await; - running_tasks.dec(); - }), - extras, - ); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_meta.clone()); + let task_cell = if let Some(resource_ctl) = resource_ctl { + TaskCell::new( + TrackedFuture::new(ControlledFuture::new( + async move { + f.await; + running_tasks.dec(); + }, + resource_ctl.clone(), + group_meta, + )), + extras, + ) + } else { + TaskCell::new( + TrackedFuture::new(async move { + f.await; + running_tasks.dec(); + }), + extras, + ) + }; remote.spawn(task_cell); } } @@ -159,6 +186,7 @@ impl ReadPoolHandle { f: F, priority: CommandPri, task_id: u64, + group_meta: Vec, ) -> impl Future> where F: Future + Send + 'static, @@ -172,6 +200,7 @@ impl ReadPoolHandle { }, priority, task_id, + group_meta, ); async move { res?; @@ -262,11 +291,12 @@ pub fn build_yatp_read_pool( config: &UnifiedReadPoolConfig, reporter: R, engine: E, + resource_ctl: Option>, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); - let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); let raftkv = Arc::new(Mutex::new(engine)); - let pool = builder + let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); + builder .name_prefix(&unified_read_pool_name) .stack_size(config.stack_size.0 as usize) .thread_count( @@ -284,8 +314,12 @@ pub fn build_yatp_read_pool( }) .before_stop(|| unsafe { destroy_tls_engine::(); - }) - .build_multi_level_pool(); + }); + let pool = if let Some(ref r) = resource_ctl { + builder.build_priority_pool(r.clone()) + } else { + builder.build_multi_level_pool() + }; ReadPool::Yatp { pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS @@ -296,6 +330,7 @@ pub fn 
build_yatp_read_pool( .max_tasks_per_worker .saturating_mul(config.max_thread_count), pool_size: config.max_thread_count, + resource_ctl, } } @@ -600,7 +635,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -616,18 +651,18 @@ mod tests { let (task3, _tx3) = gen_task(); let (task4, _tx4) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } tx1.send(()).unwrap(); thread::sleep(Duration::from_millis(300)); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); } #[test] @@ -641,7 +676,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -658,11 +693,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, 
CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -670,10 +705,10 @@ mod tests { handle.scale_pool_size(3); assert_eq!(handle.get_normal_pool_size(), 3); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -690,7 +725,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -707,11 +742,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -723,10 +758,10 @@ mod tests { handle.scale_pool_size(1); assert_eq!(handle.get_normal_pool_size(), 1); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => 
panic!("should return full error"), } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 802b0507849..0819c2599b9 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -594,6 +594,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -727,6 +728,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -750,6 +752,11 @@ impl Storage { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = requests[0].get_context().get_priority(); + let group_name = requests[0] + .get_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; @@ -910,6 +917,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -929,6 +937,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys .iter() @@ -1082,6 +1091,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1109,6 +1119,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::scan; let priority = ctx.get_priority(); + let group_name = 
ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1258,6 +1269,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1276,6 +1288,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::scan_lock; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1405,6 +1418,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1577,6 +1591,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -1639,6 +1654,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1657,6 +1673,11 @@ impl Storage { const CMD: CommandKind = CommandKind::raw_batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = gets[0].get_context().get_priority(); + let group_name = gets[0] + .get_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let api_version = self.api_version; @@ -1770,6 +1791,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1786,6 +1808,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = 
CommandKind::raw_batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys.iter().map(|k| (k.clone(), k.clone())).collect(); let resource_tag = self @@ -1866,6 +1889,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2272,6 +2296,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag(&ctx); let api_version = self.api_version; @@ -2380,6 +2405,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2400,6 +2426,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2536,6 +2563,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2553,6 +2581,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::raw_get_key_ttl; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -2615,6 +2644,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2719,6 +2749,7 @@ impl Storage { ) -> impl Future> { const CMD: CommandKind = CommandKind::raw_checksum; let priority = ctx.get_priority(); + let group_name = 
ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2793,6 +2824,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { From 321aa833ca5ec0fd5dcec7fa8c01f65116d72ba6 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Fri, 13 Jan 2023 11:49:46 +0800 Subject: [PATCH 454/676] txn: Do constraint check when handling repeated acqurie_pessimsitic_lock request (#14037) close tikv/tikv#14038, close pingcap/tidb#40114 Fixes the problem that when handling repeated acquire_pessimistic_lock requests is recevied, should_not_exist is ignored. TiKV provides idempotency for these RPC requests, but for acquire_pessimistic_lock, it ignored the possibility that the client may expect a pessimistic_rollback between two acquire_pessimistic_lock request on the same key. In this case the second request may come from another statement and carries `should_not_exist` that wasn't set in the previously finished pessimistic lock request. If the first request successfully acquired the lock and the pessimistic_rollback failed, TiKV may return a sucessful response, making the client believe that the key doesn't exist before. In some rare cases, this has risk to cause data inconsistency. 
Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- .../txn/actions/acquire_pessimistic_lock.rs | 150 +++++++++++++++++- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index fcffd500c8e..86b9ddeab41 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -142,10 +142,22 @@ pub fn acquire_pessimistic_lock( None }; - if need_load_value { - val = reader.get(&key, for_update_ts)?; - } else if need_check_existence { - val = reader.get_write(&key, for_update_ts)?.map(|_| vec![]); + if need_load_value || need_check_existence || should_not_exist { + let write = reader.get_write_with_commit_ts(&key, for_update_ts)?; + if let Some((write, commit_ts)) = write { + // Here `get_write_with_commit_ts` returns only the latest PUT if it exists and + // is not deleted. It's still ok to pass it into `check_data_constraint`. + // In case we are going to lock it with write conflict, we do not check it since + // the statement will then retry. + if locked_with_conflict_ts.is_none() { + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + } + if need_load_value { + val = Some(reader.load_data(&key, write)?); + } else if need_check_existence { + val = Some(vec![]); + } + } } // Pervious write is not loaded. let (prev_write_loaded, prev_write) = (false, None); @@ -1832,4 +1844,134 @@ pub mod tests { must_pessimistic_rollback(&mut engine, b"k1", 10, 50); must_unlocked(&mut engine, b"k1"); } + + #[test] + fn test_repeated_request_check_should_not_exist() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let key = &[b'k', (return_values as u8 * 2) + check_existence as u8] as &[u8]; + + // An empty key. 
+ must_succeed(&mut engine, key, key, 10, 10); + let res = must_succeed_impl( + &mut engine, + key, + key, + 10, + true, + 1000, + 10, + return_values, + check_existence, + 15, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 10, 10, DoPessimisticCheck); + must_commit(&mut engine, key, 10, 19); + + // The key has one record: Lock(10, 19) + must_succeed(&mut engine, key, key, 20, 20); + let res = must_succeed_impl( + &mut engine, + key, + key, + 20, + true, + 1000, + 20, + return_values, + check_existence, + 25, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_put(&mut engine, key, b"v1", key, 20, 20, DoPessimisticCheck); + must_commit(&mut engine, key, 20, 29); + + // The key has records: + // Lock(10, 19), Put(20, 29) + must_succeed(&mut engine, key, key, 30, 30); + let error = must_err_impl( + &mut engine, + key, + key, + 30, + true, + 30, + return_values, + check_existence, + 35, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. }) + )); + must_pessimistic_prewrite_lock(&mut engine, key, key, 30, 30, DoPessimisticCheck); + must_commit(&mut engine, key, 30, 39); + + // Lock(10, 19), Put(20, 29), Lock(30, 39) + must_succeed(&mut engine, key, key, 40, 40); + let error = must_err_impl( + &mut engine, + key, + key, + 40, + true, + 40, + return_values, + check_existence, + 45, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. 
}) + )); + must_pessimistic_prewrite_delete(&mut engine, key, key, 40, 40, DoPessimisticCheck); + must_commit(&mut engine, key, 40, 49); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49) + must_succeed(&mut engine, key, key, 50, 50); + let res = must_succeed_impl( + &mut engine, + key, + key, + 50, + true, + 1000, + 50, + return_values, + check_existence, + 55, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 50, 50, DoPessimisticCheck); + must_commit(&mut engine, key, 50, 59); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49), Lock(50, 59) + must_succeed(&mut engine, key, key, 60, 60); + let res = must_succeed_impl( + &mut engine, + key, + key, + 60, + true, + 1000, + 60, + return_values, + check_existence, + 65, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 60, 60, DoPessimisticCheck); + must_commit(&mut engine, key, 60, 69); + } + } } From 65a99a89b9f03de1ca24cee8c33584d13370becc Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 13 Jan 2023 14:27:46 +0800 Subject: [PATCH 455/676] raftstore-v2: fix metrics and perf context (#14035) ref tikv/tikv#12842 This PR fixes several bugs and metrics: - Now waterfall timer will be reset in before_write, the goal is to solve the confusion that stall writes can pollute the whole waterfall metrics. - Perf context is changed not to be associated with engine instance. Perf context is thread local and instance independent under the hook. - Fix flushed index advance failure due to suspicious flush. 
- Support print long uncommitted logs and fix incorrect commit time Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/perf_context.rs | 2 +- components/engine_rocks/src/perf_context.rs | 2 +- components/engine_tirocks/src/perf_context.rs | 1 - components/engine_traits/src/flush.rs | 5 +- components/engine_traits/src/perf_context.rs | 2 +- components/raft_log_engine/src/engine.rs | 2 +- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/fsm/peer.rs | 2 +- .../operation/command/admin/compact_log.rs | 5 + .../raftstore-v2/src/operation/command/mod.rs | 36 ++++--- .../raftstore-v2/src/operation/query/lease.rs | 2 +- .../src/operation/query/replica.rs | 2 +- .../src/operation/ready/apply_trace.rs | 33 +++++-- .../raftstore-v2/src/operation/ready/mod.rs | 66 ++++++++++++- components/raftstore-v2/src/raft/apply.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 18 +++- .../src/router/response_channel.rs | 94 +++++++++++++------ .../raftstore-v2/src/worker/tablet_gc.rs | 2 + .../raftstore/src/store/async_io/write.rs | 13 +-- components/raftstore/src/store/fsm/apply.rs | 14 ++- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 16 ++-- .../raftstore/src/store/local_metrics.rs | 73 ++++++++------ components/raftstore/src/store/msg.rs | 19 +--- components/raftstore/src/store/peer.rs | 4 +- src/coprocessor/tracker.rs | 32 +++---- src/storage/metrics.rs | 26 ++--- 27 files changed, 303 insertions(+), 173 deletions(-) diff --git a/components/engine_panic/src/perf_context.rs b/components/engine_panic/src/perf_context.rs index 46d18c00e77..27bdd1ac066 100644 --- a/components/engine_panic/src/perf_context.rs +++ b/components/engine_panic/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::engine::PanicEngine; impl PerfContextExt for PanicEngine { type PerfContext = PanicPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn 
get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { panic!() } } diff --git a/components/engine_rocks/src/perf_context.rs b/components/engine_rocks/src/perf_context.rs index a731a9461dc..f8cfdbcc667 100644 --- a/components/engine_rocks/src/perf_context.rs +++ b/components/engine_rocks/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } diff --git a/components/engine_tirocks/src/perf_context.rs b/components/engine_tirocks/src/perf_context.rs index d1d975c65c3..643967230df 100644 --- a/components/engine_tirocks/src/perf_context.rs +++ b/components/engine_tirocks/src/perf_context.rs @@ -136,7 +136,6 @@ impl engine_traits::PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; fn get_perf_context( - &self, level: engine_traits::PerfLevel, kind: engine_traits::PerfContextKind, ) -> Self::PerfContext { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index b3a827c234e..8300348da8c 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -157,7 +157,10 @@ impl PersistenceListener { } match flushed_pr { Some(pr) => pr, - None => panic!("{} not found in {:?}", cf, prs), + None => panic!( + "[region_id={}] [tablet_index={}] {} not found in {:?}", + self.region_id, self.tablet_index, cf, prs + ), } }; self.storage diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index ba48974a460..44462e3fe3c 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -37,7 +37,7 @@ numeric_enum_serializing_mod! 
{perf_level_serde PerfLevel { pub trait PerfContextExt { type PerfContext: PerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; } /// The subsystem the PerfContext is being created for. diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 3db865ed8ad..838fe461f4b 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -366,7 +366,7 @@ impl RaftLogEngine { impl PerfContextExt for RaftLogEngine { type PerfContext = RaftEnginePerfContext; - fn get_perf_context(&self, _level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(_level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { RaftEnginePerfContext } } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 6183778c369..ccf3f19f3ea 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -188,6 +188,7 @@ impl PollHandler PeerFsmDelegate<'a, EK, ER, PeerTick::CheckLeaderLease => unimplemented!(), PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => unimplemented!(), - PeerTick::CheckLongUncommitted => unimplemented!(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index a4983b28a47..0f5fd9b392f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -73,6 +73,11 @@ impl CompactLogContext { pub fn set_last_applying_index(&mut self, index: u64) { self.last_applying_index = index; } + + #[inline] + pub fn last_applying_index(&self) 
-> u64 { + self.last_applying_index + } } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 047fe026ffe..cf29d9ee25a 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -32,7 +32,7 @@ use raftstore::{ apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, util, Config, WriteCallback, @@ -302,9 +302,7 @@ impl Peer { t.metrics.write_instant = Some(now); &mut t.metrics.store_time_nanos }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + tracker.reset(now); } } } @@ -314,7 +312,7 @@ impl Peer { return; } // TODO: remove following log once stable. - info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res); + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -378,6 +376,12 @@ impl Peer { scheduler.send(ApplyTask::ManualFlush); } } + let last_applying_index = self.compact_log_context().last_applying_index(); + let committed_index = self.entry_storage().commit_index(); + if last_applying_index < committed_index { + // We need to continue to apply after previous page is finished. 
+ self.set_has_ready(); + } } } @@ -691,11 +695,23 @@ impl Apply { .iter() .flat_map(|(v, _)| { v.write_trackers() - .flat_map(|t| t.as_tracker_token().cloned()) + .flat_map(|t| t.as_tracker_token()) }) .collect(); self.perf_context().report_metrics(&tokens); } + let mut apply_res = ApplyRes::default(); + apply_res.applied_index = index; + apply_res.applied_term = term; + apply_res.admin_result = self.take_admin_result().into_boxed_slice(); + apply_res.modifications = *self.modifications_mut(); + apply_res.metrics = mem::take(&mut self.metrics); + let written_bytes = apply_res.metrics.written_bytes; + self.res_reporter().report(apply_res); + + // Report result first and then invoking callbacks. This may delays callback a + // little bit, but can make sure all following messages must see the side + // effect of admin commands. let callbacks = self.callbacks_mut(); let now = std::time::Instant::now(); let apply_time = APPLY_TIME_HISTOGRAM.local(); @@ -709,14 +725,6 @@ impl Apply { if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } - let mut apply_res = ApplyRes::default(); - apply_res.applied_index = index; - apply_res.applied_term = term; - apply_res.admin_result = self.take_admin_result().into_boxed_slice(); - apply_res.modifications = *self.modifications_mut(); - apply_res.metrics = mem::take(&mut self.metrics); - let written_bytes = apply_res.metrics.written_bytes; - self.res_reporter().report(apply_res); written_bytes } } diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 0abd0cccd72..3185f1bd24b 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -112,7 +112,7 @@ impl Peer { let time = monotonic_raw_now(); for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) 
{ ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index fb00adbbc5a..901fd9726f6 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -75,7 +75,7 @@ impl Peer { let time = monotonic_raw_now(); for (req, ch, _) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 5b88a6ba94d..67bbed5aa4b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -40,7 +40,7 @@ use kvproto::{ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{trace, Logger}; +use slog::{info, trace, Logger}; use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ @@ -133,7 +133,8 @@ pub type DataTrace = [u64; DATA_CFS_LEN]; #[derive(Clone, Copy, Default, Debug)] struct Progress { flushed: u64, - /// The index of last entry that has modification to the CF. + /// The index of last entry that has modification to the CF. The value + /// can be larger than the index that actually modifies the CF in apply. /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. 
last_modified: u64, @@ -192,9 +193,13 @@ impl ApplyTrace { trace.admin.last_modified = i; trace.persisted_applied = i; trace.last_flush_trigger = i; - let applied_region_state = engine - .get_region_state(region_id, trace.admin.flushed)? - .unwrap(); + let applied_region_state = match engine.get_region_state(region_id, trace.admin.flushed)? { + Some(s) => s, + None => panic!( + "failed to get region state [region_id={}] [apply_trace={:?}]", + region_id, trace + ), + }; Ok((trace, applied_region_state)) } @@ -242,7 +247,7 @@ impl ApplyTrace { } }) .max(); - if let Some(m) = last_modified && m >= self.admin.flushed + 4096 && m >= self.last_flush_trigger + 4096 { + if let Some(m) = last_modified && m >= self.admin.flushed + 4096000 && m >= self.last_flush_trigger + 4096000 { self.last_flush_trigger = m; true } else { @@ -257,10 +262,17 @@ impl ApplyTrace { } let min_flushed = self .data_cfs - .iter() + .iter_mut() // Only unflushed CFs are considered. Flushed CF always have uptodate changes // persisted. .filter_map(|pr| { + // All modifications before mem_index must be seen. If following condition is + // true, it means the modification comes beyond general apply process (like + // transaction GC unsafe write). Align `last_modified` to `flushed` to avoid + // blocking raft log GC. 
+ if mem_index >= pr.flushed && pr.flushed > pr.last_modified { + pr.last_modified = pr.flushed; + } if pr.last_modified != pr.flushed { Some(pr.flushed) } else { @@ -484,6 +496,7 @@ impl Storage { let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(1)); + info!(self.logger(), "persisting admin flushed"; "tablet_index" => tablet_index, "flushed" => trace.admin.flushed); let trace = self.apply_trace_mut(); lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) .unwrap(); @@ -660,6 +673,12 @@ mod tests { ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + // In special cae, some CF may be flushed without any modification recorded, + // we should still able to advance the apply index forward. + ([(5, 2), (9, 3), (7, 3)], (2, 2), 3, 3), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 6, 6), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 10, 10), + ([(5, 2), (9, 3), (7, 3)], (2, 3), 10, 2), ]; for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { let mut trace = ApplyTrace::default(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3f559feff8b..d1348cf014b 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -81,6 +81,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } self.schedule_tick(PeerTick::Raft); } + + pub fn on_check_long_uncommitted(&mut self) { + if !self.fsm.peer().is_leader() { + return; + } + self.fsm + .peer_mut() + .check_long_uncommitted_proposals(self.store_ctx); + self.schedule_tick(PeerTick::CheckLongUncommitted); + } } impl Peer { @@ -396,9 +406,10 @@ impl Peer { // smaller than propose_time of a command, which was // proposed in another thread while this thread receives its // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
- ctx.current_time.replace(monotonic_raw_now()); + let current_time = monotonic_raw_now(); + ctx.current_time.replace(current_time); ctx.raft_metrics.commit_log.observe(duration_to_sec( - (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), + (current_time - propose_time).to_std().unwrap(), )); self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); update_lease = false; @@ -730,6 +741,7 @@ impl Peer { self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::CheckLongUncommitted); } StateRole::Follower => { self.leader_lease_mut().expire(); @@ -793,6 +805,56 @@ impl Peer { self.read_progress_mut().discard(); } } + + /// Check if there is long uncommitted proposal. + /// + /// This will increase the threshold when a long uncommitted proposal is + /// detected, and reset the threshold when there is no long uncommitted + /// proposal. + fn has_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals().oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // some. + let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected. 
+ let threshold = self.long_uncommitted_threshold(); + if elapsed >= threshold { + has_long_uncommitted = true; + self.set_long_uncommitted_threshold(threshold + base_threshold); + } else if elapsed < base_threshold { + self.set_long_uncommitted_threshold(base_threshold); + } + } else { + self.set_long_uncommitted_threshold(base_threshold); + } + has_long_uncommitted + } + + fn check_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group().status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + self.logger, + "found long uncommitted proposals"; + "progress" => ?buffer, + "cache_first_index" => ?self.entry_storage().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold(), + ); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 6d1faa98cbf..7a1a22a5a95 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -81,7 +81,7 @@ impl Apply { let applied_index = flush_state.applied_index(); assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); let tablet = remote_tablet.latest().unwrap().clone(); - let perf_context = tablet.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); + let perf_context = EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); Apply { peer, tablet, diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index bc3d8a5af8e..8051066d4f9 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - mem, + cmp, mem, sync::{atomic::Ordering, Arc}, time::{Duration, Instant}, }; @@ -104,6 +104,8 @@ pub struct Peer { /// lead_transferee if this peer(leader) is in a leadership transferring. leader_transferee: u64, + + long_uncommitted_threshold: u64, } impl Peer { @@ -180,6 +182,10 @@ impl Peer { flush_state, split_flow_control: SplitFlowControl::default(), leader_transferee: raft::INVALID_ID, + long_uncommitted_threshold: cmp::max( + cfg.long_uncommitted_base_threshold.0.as_secs(), + 1, + ), }; // If this region has only one peer and I am the one, campaign directly. @@ -769,4 +775,14 @@ impl Peer { .unwrap_or(raft::INVALID_ID), ) } + + #[inline] + pub fn long_uncommitted_threshold(&self) -> Duration { + Duration::from_secs(self.long_uncommitted_threshold) + } + + #[inline] + pub fn set_long_uncommitted_threshold(&mut self, dur: Duration) { + self.long_uncommitted_threshold = cmp::max(dur.as_secs(), 1); + } } diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index eeeb13f6555..f70b6635982 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -30,7 +30,12 @@ use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, WriteCallback, }; -use tracker::{TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken}; + +union Tracker { + read: TrackerToken, + write: TimeTracker, +} /// A struct allows to watch and notify specific events. /// @@ -53,7 +58,7 @@ struct EventCore { before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. waker: AtomicWaker, - tracker: UnsafeCell, + tracker: UnsafeCell, } unsafe impl Send for EventCore {} @@ -240,16 +245,17 @@ pub struct BaseChannel { core: Arc>, } -impl BaseChannel { - /// Creates a pair of channel and subscriber. 
- #[inline] - pub fn pair() -> (Self, BaseSubscriber) { - let tracker_token = tracker::get_tls_tracker_token(); - Self::with_mask(u32::MAX, TimeTracker::Tracker(tracker_token)) - } +#[inline] +fn pair() -> (BaseChannel, BaseSubscriber) { + let tracker = Tracker { + read: get_tls_tracker_token(), + }; + BaseChannel::::with_mask(u32::MAX, tracker) +} +impl BaseChannel { #[inline] - fn with_mask(mask: u32, tracker: TimeTracker) -> (Self, BaseSubscriber) { + fn with_mask(mask: u32, tracker: Tracker) -> (Self, BaseSubscriber) { let core: Arc> = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), @@ -452,15 +458,8 @@ impl CmdResChannelBuilder { #[inline] pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { - let tracker_token = tracker::get_tls_tracker_token(); - let now = std::time::Instant::now(); - let tracker = if tracker_token == INVALID_TRACKER_TOKEN { - TimeTracker::Instant(now) - } else { - GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { - tracker.metrics.write_instant = Some(now); - }); - TimeTracker::Tracker(tracker_token) + let tracker = Tracker { + write: TimeTracker::default(), }; let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); if let Some(f) = self.before_set { @@ -476,6 +475,15 @@ impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; const COMMITTED_EVENT: u64 = 2; + + /// Creates a pair of channel and subscriber. 
+ #[inline] + pub fn pair() -> (Self, CmdResSubscriber) { + let tracker = Tracker { + write: TimeTracker::default(), + }; + Self::with_mask(u32::MAX, tracker) + } } impl ErrorCallback for CmdResChannel { @@ -509,12 +517,12 @@ impl WriteCallback for CmdResChannel { type TimeTrackerListRef<'a> = &'a [TimeTracker]; #[inline] fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { - std::slice::from_ref(unsafe { &*self.core.tracker.get() }) + std::slice::from_ref(unsafe { &(*self.core.tracker.get()).write }) } type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { - std::slice::from_mut(unsafe { &mut *self.core.tracker.get() }) + std::slice::from_mut(unsafe { &mut (*self.core.tracker.get()).write }) } // TODO: support executing hooks inside setting result. @@ -572,6 +580,13 @@ impl QueryResult { pub type QueryResChannel = BaseChannel; +impl QueryResChannel { + #[inline] + pub fn pair() -> (Self, QueryResSubscriber) { + pair() + } +} + impl ErrorCallback for QueryResChannel { #[inline] fn report_error(self, err: RaftCmdResponse) { @@ -592,8 +607,8 @@ impl ReadCallback for QueryResChannel { self.set_result(res); } - fn read_tracker(&self) -> Option<&TrackerToken> { - unsafe { (*self.core.tracker.get()).as_tracker_token() } + fn read_tracker(&self) -> Option { + Some(unsafe { (*self.core.tracker.get()).read }) } } @@ -608,6 +623,13 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +impl DebugInfoChannel { + #[inline] + pub fn pair() -> (Self, DebugInfoSubscriber) { + pair() + } +} + impl Debug for DebugInfoChannel { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "DebugInfoChannel") @@ -615,17 +637,29 @@ impl Debug for DebugInfoChannel { } #[cfg(feature = "testexport")] -pub type FlushChannel = BaseChannel<()>; -#[cfg(feature = "testexport")] -pub type FlushSubscriber = BaseSubscriber<()>; +mod flush_channel { + 
use super::*; -#[cfg(feature = "testexport")] -impl Debug for FlushChannel { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "FlushChannel") + pub type FlushChannel = BaseChannel<()>; + pub type FlushSubscriber = BaseSubscriber<()>; + + impl FlushChannel { + #[inline] + pub fn pair() -> (Self, FlushSubscriber) { + pair() + } + } + + impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } } } +#[cfg(feature = "testexport")] +pub use flush_channel::{FlushChannel, FlushSubscriber}; + #[cfg(test)] mod tests { use std::assert_matches::assert_matches; diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index d4593223db3..d6d19743b1e 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -131,6 +131,8 @@ impl Runner { } fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { + // The tablet is about to be deleted, flush is a waste and will block destroy. 
+ let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); let _ = tablet.pause_background_work(); self.waiting_destroy_tasks .entry(region_id) diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 7016d0ab606..98c76ddd6d1 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -465,11 +465,12 @@ where self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { let now = std::time::Instant::now(); - for task in &self.tasks { - for tracker in &task.trackers { + for task in &mut self.tasks { + for tracker in &mut task.trackers { tracker.observe(now, &metrics.wf_before_write, |t| { &mut t.metrics.wf_before_write_nanos }); + tracker.reset(now); } } } @@ -549,7 +550,7 @@ where ) -> Self { let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); let perf_context = - raft_engine.get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); + ER::get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { store_id, @@ -718,11 +719,7 @@ where .batch .tasks .iter() - .flat_map(|task| { - task.trackers - .iter() - .flat_map(|t| t.as_tracker_token().cloned()) - }) + .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) .collect(); self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index cab6ae0ffe8..58df32fd404 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -83,7 +83,7 @@ use crate::{ cmd_resp, entry_storage::{self, CachedEntries}, fsm::RaftPollerBuilder, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, memory::*, metrics::*, msg::{Callback, ErrorCallback, 
PeerMsg, ReadResponse, SignificantMsg}, @@ -475,7 +475,7 @@ where host, importer, region_scheduler, - engine: engine.clone(), + engine, router, notifier, kv_wb, @@ -488,7 +488,7 @@ where committed_count: 0, sync_log_hint: false, use_delete_range: cfg.use_delete_range, - perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), + perf_context: EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], @@ -582,7 +582,7 @@ where .cb_batch .iter() .flat_map(|(cb, _)| cb.write_trackers()) - .flat_map(|trackers| trackers.as_tracker_token().cloned()) + .flat_map(|trackers| trackers.as_tracker_token()) .collect(); self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; @@ -3337,9 +3337,7 @@ impl Apply { t.metrics.write_instant = Some(now); &mut t.metrics.store_time_nanos }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + tracker.reset(now); } } } @@ -4171,7 +4169,7 @@ where .flat_map(|p| p.cb.write_trackers()) .flat_map(|ts| ts.as_tracker_token()) { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; }); } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e302ea6588a..7e00798b6df 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -622,7 +622,7 @@ where .propose_wait_time .observe(propose_time.as_secs_f64()); cmd.callback.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_propose_wait_nanos = propose_time.as_nanos() as u64; }) diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index ceb8858046d..3724eba13e2 100644 --- 
a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1374,14 +1374,14 @@ where ready_count: 0, has_ready: false, current_time: None, - raft_perf_context: self - .engines - .raft - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), - kv_perf_context: self - .engines - .kv - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), + raft_perf_context: ER::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), + kv_perf_context: EK::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], node_start_time: Some(TiInstant::now_coarse()), feature_gate: self.feature_gate.clone(), diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index c1db17f8cae..0e6a09cbf0b 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -7,7 +7,7 @@ use collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; -use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; use super::metrics::*; @@ -208,47 +208,60 @@ impl StoreWriteMetrics { /// Tracker for the durations of a raftstore request. /// If a global tracker is not available, it will fallback to an Instant. 
#[derive(Debug, Clone, Copy)] -pub enum TimeTracker { - Tracker(TrackerToken), - Instant(std::time::Instant), +pub struct TimeTracker { + token: TrackerToken, + start: std::time::Instant, +} + +impl Default for TimeTracker { + #[inline] + fn default() -> Self { + let token = tracker::get_tls_tracker_token(); + let start = std::time::Instant::now(); + let tracker = TimeTracker { token, start }; + if token == INVALID_TRACKER_TOKEN { + return tracker; + } + + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.write_instant = Some(start); + }); + tracker + } } impl TimeTracker { - pub fn as_tracker_token(&self) -> Option<&TrackerToken> { - match self { - TimeTracker::Tracker(tt) => Some(tt), - TimeTracker::Instant(_) => None, + #[inline] + pub fn as_tracker_token(&self) -> Option { + if self.token == INVALID_TRACKER_TOKEN { + None + } else { + Some(self.token) } } + #[inline] pub fn observe( &self, now: std::time::Instant, local_metric: &LocalHistogram, tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, ) { - match self { - TimeTracker::Tracker(t) => { - if let Some(dur) = GLOBAL_TRACKERS - .with_tracker(*t, |tracker| { - tracker.metrics.write_instant.map(|write_instant| { - let dur = now.saturating_duration_since(write_instant); - let metric = tracker_metric(tracker); - if *metric == 0 { - *metric = dur.as_nanos() as u64; - } - dur - }) - }) - .flatten() - { - local_metric.observe(dur.as_secs_f64()); - } - } - TimeTracker::Instant(t) => { - let dur = now.saturating_duration_since(*t); - local_metric.observe(dur.as_secs_f64()); - } + let dur = now.saturating_duration_since(self.start); + local_metric.observe(dur.as_secs_f64()); + if self.token == INVALID_TRACKER_TOKEN { + return; } + GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { + let metric = tracker_metric(tracker); + if *metric == 0 { + *metric = dur.as_nanos() as u64; + } + }); + } + + #[inline] + pub fn reset(&mut self, start: std::time::Instant) { + self.start = start; } } diff --git 
a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index e3fc8530d76..b2a2a7aa1d1 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -24,7 +24,7 @@ use pd_client::BucketMeta; use raft::SnapshotStatus; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken}; use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ @@ -137,16 +137,7 @@ where proposed_cb: Option, committed_cb: Option, ) -> Self { - let tracker_token = get_tls_tracker_token(); - let now = std::time::Instant::now(); - let tracker = if tracker_token == INVALID_TRACKER_TOKEN { - TimeTracker::Instant(now) - } else { - GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { - tracker.metrics.write_instant = Some(now); - }); - TimeTracker::Tracker(tracker_token) - }; + let tracker = TimeTracker::default(); Callback::Write { cb, @@ -217,7 +208,7 @@ pub trait ReadCallback: ErrorCallback { type Response; fn set_result(self, result: Self::Response); - fn read_tracker(&self) -> Option<&TrackerToken>; + fn read_tracker(&self) -> Option; } pub trait WriteCallback: ErrorCallback { @@ -265,9 +256,9 @@ impl ReadCallback for Callback { self.invoke_read(result); } - fn read_tracker(&self) -> Option<&TrackerToken> { + fn read_tracker(&self) -> Option { let Callback::Read { tracker, .. 
} = self else { return None; }; - Some(tracker) + Some(*tracker) } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 347f62dd945..586ab7ba133 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -200,7 +200,7 @@ impl ProposalQueue { } #[inline] - fn oldest(&self) -> Option<&Proposal> { + pub fn oldest(&self) -> Option<&Proposal> { self.queue.front() } @@ -3292,7 +3292,7 @@ where let time = monotonic_raw_now(); for (req, cb, mut read_index) in read.take_cmds().drain(..) { cb.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read.propose_time).to_std().unwrap().as_nanos() as u64; }) diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index d6e146adf11..9c0b79ff8b8 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -6,7 +6,7 @@ use ::tracker::{get_tls_tracker_token, with_tls_tracker}; use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tikv_util::time::{self, Duration, Instant}; use txn_types::Key; @@ -148,9 +148,7 @@ impl Tracker { } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.start_observe(); - } + perf_context.start_observe(); }); self.current_stage = TrackerState::ItemBegan(now); } @@ -164,9 +162,7 @@ impl Tracker { self.total_storage_stats.add(&storage_stats); } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); }); self.current_stage = TrackerState::ItemFinished(now); } else { @@ -361,7 +357,7 @@ impl Tracker { fn with_perf_context(&self, f: F) -> T where - F: FnOnce(&mut Option>) -> 
T, + F: FnOnce(&mut Box) -> T, { thread_local! { static SELECT: RefCell>> = RefCell::new(None); @@ -385,19 +381,13 @@ impl Tracker { }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = unsafe { - with_tls_engine::(|engine| { - engine.kv_engine().map(|engine| { - Box::new(engine.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), - )) as Box - }) - }) - }; - } - f(&mut c) + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) as Box + }); + f(perf_context) }) } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 080ff2c5951..4837567ee43 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -11,7 +11,7 @@ use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tracker::get_tls_tracker_token; use crate::{ @@ -347,23 +347,15 @@ where }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = with_tls_engine(|engine: &mut E| { - engine.kv_engine().map(|c| { - Box::new(c.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) as Box - }) - }); - }; - if let Some(c) = &mut *c { - c.start_observe(); - } + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) as Box + }); + perf_context.start_observe(); let res = f(); - if let Some(c) = &mut *c { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); res }) } From a3c15ce27d582dc695848bffb363631f4cae2db5 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 16 Jan 2023 12:27:48 +0800 Subject: [PATCH 456/676] raftstore-v2: cleanup txn_ext 
(#14051) ref tikv/tikv#12842 Move transaction related code to txn_ext.rs. Fix the bug that snapshot doesn't set term and extra_op. Signed-off-by: Jay Lee --- components/raftstore-v2/src/fsm/peer.rs | 39 +-- .../operation/command/admin/conf_change.rs | 1 + .../src/operation/command/admin/split.rs | 16 +- .../command/admin/transfer_leader.rs | 97 +------ components/raftstore-v2/src/operation/mod.rs | 2 + components/raftstore-v2/src/operation/pd.rs | 16 -- .../raftstore-v2/src/operation/query/local.rs | 4 + .../raftstore-v2/src/operation/ready/mod.rs | 15 +- .../raftstore-v2/src/operation/txn_ext.rs | 260 ++++++++++++++++++ components/raftstore-v2/src/raft/peer.rs | 66 ++--- 10 files changed, 303 insertions(+), 213 deletions(-) create mode 100644 components/raftstore-v2/src/operation/txn_ext.rs diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 0a6a66e8df1..26d5c2a1458 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,7 +7,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use raftstore::store::{Config, LocksStatus, TabletSnapManager, Transport}; +use raftstore::store::{Config, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -32,7 +32,6 @@ pub struct PeerFsm { /// twice accidentally. 
tick_registry: [bool; PeerTick::VARIANT_COUNT], is_stopped: bool, - reactivate_memory_lock_ticks: usize, } impl PeerFsm { @@ -55,7 +54,6 @@ impl PeerFsm { receiver: rx, tick_registry: [false; PeerTick::VARIANT_COUNT], is_stopped: false, - reactivate_memory_lock_ticks: 0, }); Ok((tx, fsm)) } @@ -136,9 +134,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn schedule_pending_ticks(&mut self) { let pending_ticks = self.fsm.peer.take_pending_ticks(); for tick in pending_ticks { - if tick == PeerTick::ReactivateMemoryLock { - self.fsm.reactivate_memory_lock_ticks = 0; - } self.schedule_tick(tick); } } @@ -225,7 +220,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), - PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), + PeerTick::ReactivateMemoryLock => { + self.fsm.peer.on_reactivate_memory_lock_tick(self.store_ctx) + } PeerTick::ReportBuckets => unimplemented!(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), } @@ -326,32 +323,4 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer.propose_pending_writes(self.store_ctx); self.schedule_pending_ticks(); } - - pub fn on_reactivate_memory_lock_tick(&mut self) { - let mut pessimistic_locks = self.fsm.peer.txn_ext().pessimistic_locks.write(); - - // If it is not leader, we needn't reactivate by tick. In-memory pessimistic - // lock will be enabled when this region becomes leader again. - // And this tick is currently only used for the leader transfer failure case. 
- if !self.fsm.peer().is_leader() - || pessimistic_locks.status != LocksStatus::TransferringLeader - { - return; - } - - self.fsm.reactivate_memory_lock_ticks += 1; - let transferring_leader = self.fsm.peer.raft_group().raft.lead_transferee.is_some(); - // `lead_transferee` is not set immediately after the lock status changes. So, - // we need the tick count condition to avoid reactivating too early. - if !transferring_leader - && self.fsm.reactivate_memory_lock_ticks - >= self.store_ctx.cfg.reactive_memory_lock_timeout_tick - { - pessimistic_locks.status = LocksStatus::Normal; - self.fsm.reactivate_memory_lock_ticks = 0; - } else { - drop(pessimistic_locks); - self.schedule_tick(PeerTick::ReactivateMemoryLock); - } - } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 8c9771b0201..42c433584fe 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -261,6 +261,7 @@ impl Apply { "changes" => ?changes, "legacy" => legacy, "original region" => ?region, "err" => ?e); + return Err(e); } } let conf_ver = region.get_region_epoch().get_conf_ver() + changes.len() as u64; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index d01b1371338..f9e44286490 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -449,21 +449,9 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; - let derived_epoch = derived.get_region_epoch().clone(); let region_id = derived.get_id(); - // Group in-memory pessimistic locks in the original region into new regions. 
- // The locks of new regions will be put into the corresponding new regions - // later. And the locks belonging to the old region will stay in the original - // map. - let region_locks = { - let mut pessimistic_locks = self.txn_ext().pessimistic_locks.write(); - info!(self.logger, "moving {} locks to new regions", pessimistic_locks.len();); - // Update the version so the concurrent reader will fail due to EpochNotMatch - // instead of PessimisticLockNotFound. - pessimistic_locks.version = derived_epoch.get_version(); - pessimistic_locks.group_by_regions(&res.regions, derived) - }; + let region_locks = self.txn_context().split(&res.regions, derived); fail_point!("on_split_invalidate_locks"); let tablet: EK = match res.tablet.downcast() { @@ -650,7 +638,7 @@ impl Peer { let _ = self.raft_group_mut().campaign(); self.set_has_ready(); - *self.txn_ext().pessimistic_locks.write() = split_init.locks; + self.txn_context().init_with_lock(split_init.locks); let control = self.split_flow_control_mut(); control.approximate_size = split_init.approximate_size; control.approximate_keys = split_init.approximate_keys; diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 12bd7bbf491..54aa9845e17 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -3,22 +3,19 @@ use std::cmp::Ordering; use bytes::Bytes; -use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use fail::fail_point; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ disk_usage::DiskUsage, metapb, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, RaftRequestHeader, - TransferLeaderRequest, + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, TransferLeaderRequest, }, }; -use parking_lot::RwLockWriteGuard; use raft::{eraftpb, ProgressState, Storage}; use 
raftstore::{ store::{ fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, - LocksStatus, TRANSFER_LEADER_COMMAND_REPLY_CTX, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }, Result, }; @@ -30,9 +27,8 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, fsm::ApplyResReporter, - operation::command::write::SimpleWriteEncoder, raft::{Apply, Peer}, - router::{CmdResChannel, PeerMsg, PeerTick}, + router::{CmdResChannel, PeerMsg}, }; fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { @@ -296,91 +292,6 @@ impl Peer { } None } - - // Returns whether we should propose another TransferLeader command. This is - // for: - // - Considering the amount of pessimistic locks can be big, it can reduce - // unavailable time caused by waiting for the transferee catching up logs. - // - Make transferring leader strictly after write commands that executes before - // proposing the locks, preventing unexpected lock loss. - fn propose_locks_before_transfer_leader( - &mut self, - ctx: &mut StoreContext, - msg: &eraftpb::Message, - ) -> bool { - // 1. Disable in-memory pessimistic locks. - - // Clone to make borrow checker happy when registering ticks. - let txn_ext = self.txn_ext().clone(); - let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); - - // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message - // is a reply to a transfer leader command before. If the locks status remain - // in the TransferringLeader status, we can safely initiate transferring leader - // now. - // If it's not in TransferringLeader status now, it is probably because several - // ticks have passed after proposing the locks in the last time and we - // reactivate the memory locks. Then, we should propose the locks again. 
- if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX - && pessimistic_locks.status == LocksStatus::TransferringLeader - { - return false; - } - - // If it is not writable, it's probably because it's a retried TransferLeader - // and the locks have been proposed. But we still need to return true to - // propose another TransferLeader command. Otherwise, some write requests that - // have marked some locks as deleted will fail because raft rejects more - // proposals. - // It is OK to return true here if it's in other states like MergingRegion or - // NotLeader. In those cases, the locks will fail to propose and nothing will - // happen. - if !pessimistic_locks.is_writable() { - return true; - } - pessimistic_locks.status = LocksStatus::TransferringLeader; - self.add_pending_tick(PeerTick::ReactivateMemoryLock); - - // 2. Propose pessimistic locks - if pessimistic_locks.is_empty() { - return false; - } - // FIXME: Raft command has size limit. Either limit the total size of - // pessimistic locks in a region, or split commands here. - let mut encoder = SimpleWriteEncoder::with_capacity(512); - let mut lock_count = 0; - { - // Downgrade to a read guard, do not block readers in the scheduler as far as - // possible. - let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); - fail_point!("invalidate_locks_before_transfer_leader"); - for (key, (lock, deleted)) in &*pessimistic_locks { - if *deleted { - continue; - } - lock_count += 1; - encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); - } - } - if lock_count == 0 { - // If the map is not empty but all locks are deleted, it is possible that a - // write command has just marked locks deleted but not proposed yet. - // It might cause that command to fail if we skip proposing the - // extra TransferLeader command here. 
- return true; - } - let mut header = Box::::default(); - header.set_region_id(self.region_id()); - header.set_region_epoch(self.region().get_region_epoch().clone()); - header.set_peer(self.peer().clone()); - info!( - self.logger, - "propose {} locks before transferring leader", lock_count; - ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, write.header, write.data, write.ch); - true - } } impl Apply { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 807f425e998..76baf31f9c8 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,6 +5,7 @@ mod life; mod pd; mod query; mod ready; +mod txn_ext; pub use command::{ AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, @@ -20,4 +21,5 @@ pub use ready::{ pub(crate) use self::{ command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, + txn_ext::TxnContext, }; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 26945a3e176..17abdd85cf0 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -205,20 +205,4 @@ impl Peer { ); } } - - #[inline] - pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { - let task = pd::Task::UpdateMaxTimestamp { - region_id: self.region_id(), - initial_status, - txn_ext: self.txn_ext().clone(), - }; - if let Err(e) = ctx.schedulers.pd.schedule(task) { - error!( - self.logger, - "failed to notify pd with UpdateMaxTimestamp"; - "err" => %e, - ); - } - } } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index e4c0aa6d0b9..13b815d1ebc 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ 
b/components/raftstore-v2/src/operation/query/local.rs @@ -2,6 +2,7 @@ // #[PerformanceCriticalPath] use std::{ + num::NonZeroU64, ops::Deref, sync::{atomic, Arc, Mutex}, }; @@ -246,6 +247,8 @@ where }; snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.term = NonZeroU64::new(delegate.term); + snap.txn_extra_op = delegate.txn_extra_op.load(); snap.bucket_meta = delegate.bucket_meta.clone(); delegate.cached_tablet.release(); @@ -945,6 +948,7 @@ mod tests { assert_eq!(read_progress.safe_ts(), 2); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert_eq!(*snap.get_region(), region1); + assert_eq!(snap.term, NonZeroU64::new(term6)); drop(mix_tx); handler.join().unwrap(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index d1348cf014b..87e1c100a87 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -728,12 +728,12 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); - // Init the in-memory pessimistic lock table when the peer becomes leader. - self.activate_in_memory_pessimistic_locks(); - - // A more recent read may happen on the old leader. So max ts should - // be updated after a peer becomes leader. - self.require_updating_max_ts(ctx); + self.txn_context().on_became_leader( + ctx, + self.term(), + self.region(), + &self.logger, + ); // Exit entry cache warmup state when the peer becomes leader. 
self.entry_storage_mut().clear_entry_cache_warmup_state(); @@ -746,7 +746,8 @@ impl Peer { StateRole::Follower => { self.leader_lease_mut().expire(); self.storage_mut().cancel_generating_snap(None); - self.clear_in_memory_pessimistic_locks(); + self.txn_context() + .on_became_follower(self.term(), self.region()); } _ => {} } diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs new file mode 100644 index 00000000000..911c1eaab78 --- /dev/null +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -0,0 +1,260 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains everything related to transaction hook. +//! +//! This is the temporary (efficient) solution, it should be implemented as one +//! type of coprocessor. + +use std::sync::{atomic::Ordering, Arc}; + +use crossbeam::atomic::AtomicCell; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use parking_lot::RwLockWriteGuard; +use raft::eraftpb; +use raftstore::store::{ + LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, +}; +use slog::{error, info, Logger}; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{PeerMsg, PeerTick}, + worker::pd, + SimpleWriteEncoder, +}; + +pub struct TxnContext { + ext: Arc, + extra_op: Arc>, + reactivate_memory_lock_ticks: usize, +} + +impl Default for TxnContext { + #[inline] + fn default() -> Self { + Self { + ext: Arc::default(), + extra_op: Arc::new(AtomicCell::new(ExtraOp::Noop)), + reactivate_memory_lock_ticks: 0, + } + } +} + +impl TxnContext { + #[inline] + pub fn on_region_changed(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_leader( + &self, + ctx: &mut 
StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // A more recent read may happen on the old leader. So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx, term, region, logger); + + // Init the in-memory pessimistic lock table when the peer becomes leader. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_follower(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn ext(&self) -> &Arc { + &self.ext + } + + #[inline] + pub fn extra_op(&self) -> &Arc> { + &self.extra_op + } + + // TODO: find a better place to put all txn related stuff. 
+ fn require_updating_max_ts( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let epoch = region.get_region_epoch(); + let term_low_bits = term & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + info!( + logger, + "require updating max ts"; + "initial_status" => initial_status, + ); + let task = pd::Task::UpdateMaxTimestamp { + region_id: region.get_id(), + initial_status, + txn_ext: self.ext.clone(), + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!(logger, "failed to notify pd with UpdateMaxTimestamp"; "err" => ?e); + } + } + + pub fn split(&self, regions: &[Region], derived: &Region) -> Vec { + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. + pessimistic_locks.version = derived.get_region_epoch().get_version(); + pessimistic_locks.group_by_regions(regions, derived) + } + + pub fn init_with_lock(&self, locks: PeerPessimisticLocks) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + *pessimistic_locks = locks; + } +} + +impl Peer { + /// Returns True means the tick is consumed, otherwise the tick should be + /// rescheduled. + pub fn on_reactivate_memory_lock_tick(&mut self, ctx: &mut StoreContext) { + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. 
+ if !self.is_leader() { + return; + } + + let transferring_leader = self.raft_group().raft.lead_transferee.is_some(); + let txn_context = self.txn_context_mut(); + let mut pessimistic_locks = txn_context.ext.pessimistic_locks.write(); + + // And this tick is currently only used for the leader transfer failure case. + if pessimistic_locks.status != LocksStatus::TransferringLeader { + return; + } + + txn_context.reactivate_memory_lock_ticks += 1; + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && txn_context.reactivate_memory_lock_ticks >= ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + txn_context.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + } + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. + pub fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. + let txn_ext = self.txn_context().ext.clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. 
+ // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.txn_context_mut().reactivate_memory_lock_ticks = 0; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. 
+ let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail::fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); + } + } + if lock_count == 0 { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. + return true; + } + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", lock_count; + ); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); + true + } +} diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 8051066d4f9..6cfcda4da25 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -2,16 +2,15 @@ use std::{ cmp, mem, - sync::{atomic::Ordering, Arc}, + sync::Arc, time::{Duration, Instant}, }; use collections::{HashMap, HashSet}; -use crossbeam::atomic::AtomicCell; use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; +use kvproto::{metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ @@ -19,19 +18,18 @@ use raftstore::{ store::{ fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, - Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, 
ReadDelegate, ReadIndexQueue, - ReadProgress, TabletSnapManager, TxnExt, WriteTask, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TabletSnapManager, WriteTask, }, }; use slog::Logger; use super::storage::Storage; use crate::{ - batch::StoreContext, fsm::ApplyScheduler, operation::{ AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, - SplitFlowControl, + SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -83,8 +81,7 @@ pub struct Peer { last_region_buckets: Option, /// Transaction extensions related to this peer. - txn_ext: Arc, - txn_extra_op: Arc>, + txn_context: TxnContext, pending_ticks: Vec, @@ -173,8 +170,7 @@ impl Peer { ), region_buckets: None, last_region_buckets: None, - txn_ext: Arc::default(), - txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), + txn_context: TxnContext::default(), proposal_control: ProposalControl::new(0), pending_ticks: Vec::new(), split_trace: vec![], @@ -261,11 +257,8 @@ impl Peer { self.read_progress .update_leader_info(self.leader_id(), self.term(), self.region()); - { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } + self.txn_context + .on_region_changed(self.term(), self.region()); if self.serving() { host.on_region_changed( @@ -639,21 +632,6 @@ impl Peer { mem::take(&mut self.pending_ticks) } - pub fn activate_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = LocksStatus::Normal; - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - - pub fn clear_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = 
LocksStatus::NotLeader; - pessimistic_locks.clear(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - #[inline] pub fn post_split(&mut self) { self.reset_region_buckets(); @@ -678,8 +656,13 @@ impl Peer { } #[inline] - pub fn txn_ext(&self) -> &Arc { - &self.txn_ext + pub fn txn_context(&self) -> &TxnContext { + &self.txn_context + } + + #[inline] + pub fn txn_context_mut(&mut self) -> &mut TxnContext { + &mut self.txn_context } pub fn generate_read_delegate(&self) -> ReadDelegate { @@ -690,8 +673,8 @@ impl Peer { self.term(), self.region().clone(), self.storage().entry_storage().applied_term(), - self.txn_extra_op.clone(), - self.txn_ext.clone(), + self.txn_context.extra_op().clone(), + self.txn_context.ext().clone(), self.read_progress().clone(), self.region_buckets.as_ref().map(|b| b.meta.clone()), ) @@ -715,19 +698,6 @@ impl Peer { .advance_apply(apply_index, term, region); } - // TODO: find a better place to put all txn related stuff. - pub fn require_updating_max_ts(&self, ctx: &StoreContext) { - let epoch = self.region().get_region_epoch(); - let term_low_bits = self.term() & ((1 << 32) - 1); // 32 bits - let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits - let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); - self.txn_ext - .max_ts_sync_status - .store(initial_status, Ordering::SeqCst); - - self.update_max_timestamp_pd(ctx, initial_status); - } - #[inline] pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { &mut self.split_trace From 6d163b846327a0f61c1049b97cb4b315639ce9a6 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 16 Jan 2023 19:01:49 -0800 Subject: [PATCH 457/676] raftstore-v2: a few small fixes (#14039) ref tikv/tikv#12842 1) add snapshot apply metrics 2) disable bloomfilter for raftkv-v2 for now until a proper ratio is found 3) disable rocksdb write stall for raftkv-v2 until the tablet flow control is fully verified. 
Signed-off-by: Qi Xu Co-authored-by: Qi Xu --- components/raftstore-v2/src/operation/ready/mod.rs | 5 +++-- components/raftstore-v2/src/operation/ready/snapshot.rs | 6 ++++-- src/config/mod.rs | 7 ++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 87e1c100a87..38d126ac87a 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,8 +31,8 @@ use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ - needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteCallback, - WriteTask, + needs_evict_entry_cache, util, worker_metrics::SNAP_COUNTER, FetchedLogs, ReadProgress, + Transport, WriteCallback, WriteTask, }, }; use slog::{debug, error, info, trace, warn}; @@ -877,6 +877,7 @@ impl Storage { ctx.snap_mgr.clone(), ctx.tablet_registry.clone(), ) { + SNAP_COUNTER.apply.fail.inc(); error!(self.logger(),"failed to apply snapshot";"error" => ?e) } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 04b6ed7e12b..bcbe220252b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -36,8 +36,9 @@ use raft::{eraftpb::Snapshot, StateRole}; use raftstore::{ coprocessor::RegionChangeEvent, store::{ - metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, worker_metrics::SNAP_COUNTER, + GenSnapRes, ReadTask, TabletSnapKey, TabletSnapManager, Transport, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; use slog::{error, info, warn}; @@ -252,6 +253,7 
@@ impl Peer { !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX }) { info!(self.logger, "apply tablet snapshot completely"); + SNAP_COUNTER.apply.success.inc(); } if let Some(init) = split { info!(self.logger, "init split with snapshot finished"); diff --git a/src/config/mod.rs b/src/config/mod.rs index 8d3e5477f26..9caa68d8e6b 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -116,7 +116,8 @@ fn bloom_filter_ratio(et: EngineType) -> f64 { EngineType::RaftKv => 0.1, // In v2, every peer has its own tablet. The data scale is about tens of // GiBs. We only need a small portion for those key. - EngineType::RaftKv2 => 0.005, + // TODO: disable it for now until find out the proper ratio + EngineType::RaftKv2 => 0.0, } } @@ -1229,6 +1230,10 @@ impl DbConfig { self.write_buffer_limit.get_or_insert(ReadableSize( (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, )); + self.defaultcf.disable_write_stall = true; + self.writecf.disable_write_stall = true; + self.lockcf.disable_write_stall = true; + self.raftcf.disable_write_stall = true; } } } From a463db0911b4a2f2f47a29b567c54338a7ff3876 Mon Sep 17 00:00:00 2001 From: Zwb Date: Tue, 17 Jan 2023 14:51:48 +0800 Subject: [PATCH 458/676] apply: fix witness raft log gc panic and refactor (#14054) ref tikv/tikv#12876 fix witness raft log gc panic and refactor Signed-off-by: Wenbo Zhang Co-authored-by: Xinye Tao --- components/raftstore/src/store/config.rs | 7 ++ components/raftstore/src/store/fsm/apply.rs | 77 ++++++++++++--------- components/raftstore/src/store/fsm/peer.rs | 35 ++++------ tests/failpoints/cases/test_witness.rs | 8 +++ tests/integrations/config/mod.rs | 1 + 5 files changed, 74 insertions(+), 54 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 454cf61a4c8..4d3210318a6 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -68,6 +68,9 @@ pub struct Config { pub 
raft_log_compact_sync_interval: ReadableDuration, // Interval to gc unnecessary raft log. pub raft_log_gc_tick_interval: ReadableDuration, + // Interval to request voter_replicated_index for gc unnecessary raft log, + // if the leader has not initiated gc for a long time. + pub request_voter_replicated_index_interval: ReadableDuration, // A threshold to gc stale raft log, must >= 1. pub raft_log_gc_threshold: u64, // When entry count exceed this value, gc will be forced trigger. @@ -339,6 +342,7 @@ impl Default for Config { raft_entry_max_size: ReadableSize::mb(8), raft_log_compact_sync_interval: ReadableDuration::secs(2), raft_log_gc_tick_interval: ReadableDuration::secs(3), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 50, raft_log_gc_count_limit: None, raft_log_gc_size_limit: None, @@ -813,6 +817,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_tick_interval"]) .set(self.raft_log_gc_tick_interval.as_secs_f64()); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["request_voter_replicated_index_interval"]) + .set(self.request_voter_replicated_index_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_threshold"]) .set(self.raft_log_gc_threshold as f64); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 58df32fd404..60ed35e6892 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -258,6 +258,7 @@ pub enum ExecResult { CompactLog { state: RaftTruncatedState, first_index: u64, + has_pending: bool, }, SplitRegion { regions: Vec, @@ -300,7 +301,12 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, - PendingCompactCmd, + // The raftstore thread will use it to update the internal state of `PeerFsm`. 
If it is + // `true`, when the raftstore detects that the raft log has not been gc for a long time, + // the raftstore thread will actively pull the `voter_replicated_index` from the leader + // and try to compact pending gc. If false, raftstore does not do any additional + // processing. + HasPendingCompactCmd(bool), } /// The possible returned value when applying logs. @@ -1508,7 +1514,7 @@ where | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } | ExecResult::TransferLeader { .. } - | ExecResult::PendingCompactCmd => {} + | ExecResult::HasPendingCompactCmd(..) => {} ExecResult::SplitRegion { ref derived, .. } => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -2966,11 +2972,13 @@ where )) } + // When the first return value is true, it means that we have updated + // `RaftApplyState`, and the caller needs to do persistence. fn try_compact_log( &mut self, voter_replicated_index: u64, voter_replicated_term: u64, - ) -> Result>> { + ) -> Result<(bool, Option>)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); let first_index = entry_storage::first_index(&self.apply_state); @@ -2981,7 +2989,7 @@ where "peer_id" => self.id(), "voter_replicated_index" => voter_replicated_index, ); - return Ok(None); + return Ok((false, None)); } // When the witness restarted, the pending compact cmd has been lost, so use @@ -2995,11 +3003,7 @@ where "compact_index" => voter_replicated_index, "first_index" => first_index, ); - return Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index: 0, - has_pending: false, - })); + return Ok((false, Some(ExecResult::HasPendingCompactCmd(false)))); } // compact failure is safe to be omitted, no need to assert. 
compact_raft_log( @@ -3009,11 +3013,7 @@ where voter_replicated_term, )?; PEER_ADMIN_CMD_COUNTER.compact.success.inc(); - return Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index, - has_pending: false, - })); + return Ok((true, Some(ExecResult::HasPendingCompactCmd(false)))); } match self.pending_cmds.pop_compact(voter_replicated_index) { @@ -3021,11 +3021,14 @@ where // compact failure is safe to be omitted, no need to assert. compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; PEER_ADMIN_CMD_COUNTER.compact.success.inc(); - Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index, - has_pending: self.pending_cmds.has_compact(), - })) + Ok(( + true, + Some(ExecResult::CompactLog { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + }), + )) } None => { info!( @@ -3034,7 +3037,7 @@ where "peer_id" => self.id(), "voter_replicated_index" => voter_replicated_index, ); - Ok(None) + Ok((false, None)) } } } @@ -3109,7 +3112,10 @@ where "peer_id" => self.id(), "command" => ?req.get_compact_log() ); - return Ok((resp, ApplyResult::Res(ExecResult::PendingCompactCmd))); + return Ok(( + resp, + ApplyResult::Res(ExecResult::HasPendingCompactCmd(true)), + )); } } } else { @@ -3133,6 +3139,7 @@ where ApplyResult::Res(ExecResult::CompactLog { state: self.apply_state.get_truncated_state().clone(), first_index, + has_pending: self.pending_cmds.has_compact(), }), )) } @@ -3693,11 +3700,6 @@ where // Whether destroy request is from its target region's snapshot merge_from_snapshot: bool, }, - Compact { - state: RaftTruncatedState, - first_index: u64, - has_pending: bool, - }, } pub struct ApplyFsm @@ -4109,18 +4111,29 @@ where voter_replicated_index: u64, voter_replicated_term: u64, ) { + if self.delegate.pending_remove || self.delegate.stopped { + return; + } + let res = self .delegate 
.try_compact_log(voter_replicated_index, voter_replicated_term); match res { - Ok(res) => { + Ok((should_write, res)) => { if let Some(res) = res { + if ctx.timer.is_none() { + ctx.timer = Some(Instant::now_coarse()); + } ctx.prepare_for(&mut self.delegate); - self.delegate.write_apply_state(ctx.kv_wb_mut()); - ctx.commit_opt(&mut self.delegate, true); - ctx.finish_for(&mut self.delegate, VecDeque::new()); - ctx.notifier - .notify_one(self.delegate.region_id(), PeerMsg::ApplyRes { res }); + let mut result = VecDeque::new(); + // If modified `truncated_state` in `try_compact_log`, the apply state should be + // persisted. + if should_write { + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + } + result.push_back(res); + ctx.finish_for(&mut self.delegate, result); } } Err(e) => error!(?e; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 7e00798b6df..ccde4b031ef 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2311,21 +2311,6 @@ where *is_ready = true; } } - ApplyTaskRes::Compact { - state, - first_index, - has_pending, - } => { - self.fsm.peer.has_pending_compact_cmd = has_pending; - // When the witness restarts, the pending compact cmds will be lost. We will try - // to use `voter_replicated_index` as the `compact index` to avoid log - // accumulation, but if `voter_replicated_index` is less than `first_index`, - // then gc is not needed. In this case, the `first_index` we pass back will be - // 0, and `has_pending` set to false. 
- if first_index != 0 { - self.on_ready_compact_log(first_index, state); - } - } } if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); @@ -4933,8 +4918,13 @@ where while let Some(result) = exec_results.pop_front() { match result { ExecResult::ChangePeer(cp) => self.on_ready_change_peer(cp), - ExecResult::CompactLog { first_index, state } => { - self.on_ready_compact_log(first_index, state) + ExecResult::CompactLog { + state, + first_index, + has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + self.on_ready_compact_log(first_index, state); } ExecResult::SplitRegion { derived, @@ -4969,9 +4959,11 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), - ExecResult::PendingCompactCmd => { - self.fsm.peer.has_pending_compact_cmd = true; - self.register_pull_voter_replicated_index_tick(); + ExecResult::HasPendingCompactCmd(has_pending) => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + if has_pending { + self.register_pull_voter_replicated_index_tick(); + } } } } @@ -5530,9 +5522,8 @@ where if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { return; } - // TODO: make it configurable if self.fsm.peer.last_compacted_time.elapsed() - > self.ctx.cfg.raft_log_gc_tick_interval.0 * 2 + > self.ctx.cfg.request_voter_replicated_index_interval.0 { let mut msg = ExtraMessage::default(); msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 98a845b7016..552434d1fed 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -78,6 +78,10 @@ fn test_witness_raftlog_gc_pull_voter_replicated_index() { let mut cluster = new_server_cluster(0, 3); 
cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); @@ -155,6 +159,10 @@ fn test_witness_raftlog_gc_after_reboot() { let mut cluster = new_server_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 0c6cf7cdd9c..a4e15b8fa6e 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -180,6 +180,7 @@ fn test_serde_custom_tikv_config() { raft_entry_max_size: ReadableSize::mb(12), raft_log_compact_sync_interval: ReadableDuration::secs(12), raft_log_gc_tick_interval: ReadableDuration::secs(12), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 12, raft_log_gc_count_limit: Some(12), raft_log_gc_size_limit: Some(ReadableSize::kb(1)), From 5235542066f3cd41d02581c6ee064159938f545e Mon Sep 17 00:00:00 2001 From: iosmanthus Date: Tue, 17 Jan 2023 21:05:50 +0800 Subject: [PATCH 459/676] copr: support handling keyspace request (#14027) ref tikv/tikv#12999 copr: support handling keyspace request Signed-off-by: iosmanthus --- Cargo.lock | 4 + components/api_version/Cargo.toml | 1 + components/api_version/src/keyspace.rs | 163 +++++++++++++++ components/api_version/src/lib.rs | 6 +- components/test_backup/src/lib.rs | 9 +- components/tidb_query_common/Cargo.toml | 1 + .../tidb_query_common/src/storage/scanner.rs | 187 +++++++++--------- 
components/tidb_query_datatype/Cargo.toml | 1 + .../tidb_query_datatype/src/codec/table.rs | 17 +- components/tidb_query_executors/Cargo.toml | 1 + .../src/index_scan_executor.rs | 39 ++-- components/tidb_query_executors/src/runner.rs | 11 +- .../src/table_scan_executor.rs | 35 ++-- .../src/util/scan_executor.rs | 16 +- src/coprocessor/checksum.rs | 8 +- src/coprocessor/dag/mod.rs | 15 +- src/coprocessor/endpoint.rs | 20 +- src/coprocessor/statistics/analyze.rs | 46 +++-- .../coprocessor_executors/index_scan/util.rs | 3 +- .../coprocessor_executors/integrated/util.rs | 3 +- .../coprocessor_executors/table_scan/util.rs | 3 +- .../benches/coprocessor_executors/util/mod.rs | 3 +- .../integrations/coprocessor/test_checksum.rs | 10 +- 23 files changed, 414 insertions(+), 188 deletions(-) create mode 100644 components/api_version/src/keyspace.rs diff --git a/Cargo.lock b/Cargo.lock index 0b7ca52725c..069dbc4950e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,6 +84,7 @@ dependencies = [ "codec", "engine_traits", "kvproto", + "log_wrappers", "match-template", "panic_hook", "thiserror", @@ -6103,6 +6104,7 @@ name = "tidb_query_common" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "byteorder", "derive_more", @@ -6124,6 +6126,7 @@ dependencies = [ name = "tidb_query_datatype" version = "0.0.1" dependencies = [ + "api_version", "base64", "bitfield", "bitflags", @@ -6164,6 +6167,7 @@ name = "tidb_query_executors" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "codec", "collections", diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index 7362ca25ccc..c80607145bd 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -12,6 +12,7 @@ bitflags = "1.0.1" codec = { workspace = true } engine_traits = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { workspace = 
true } diff --git a/components/api_version/src/keyspace.rs b/components/api_version/src/keyspace.rs new file mode 100644 index 00000000000..4b263822a1b --- /dev/null +++ b/components/api_version/src/keyspace.rs @@ -0,0 +1,163 @@ +use std::fmt::Debug; + +use engine_traits::{Error, Result}; +use tikv_util::box_err; + +use super::*; + +const KEYSPACE_PREFIX_LEN: usize = 4; + +pub trait KvPair { + fn key(&self) -> &[u8]; + fn value(&self) -> &[u8]; + fn kv(&self) -> (&[u8], &[u8]) { + (self.key(), self.value()) + } +} + +impl KvPair for (Vec, Vec) { + fn key(&self) -> &[u8] { + &self.0 + } + fn value(&self) -> &[u8] { + &self.1 + } +} + +pub trait Keyspace { + type KvPair: KvPair = (Vec, Vec); + fn make_kv_pair(p: (Vec, Vec)) -> Result; + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + Ok((None, key)) + } +} + +#[derive(PartialEq, Clone, Copy, Debug)] +pub struct KeyspaceId(u32); + +impl From for KeyspaceId { + fn from(id: u32) -> Self { + Self(id) + } +} + +impl Keyspace for ApiV1 { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV1Ttl { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV2 { + type KvPair = KeyspaceKv; + + fn make_kv_pair(p: (Vec, Vec)) -> Result { + let (k, v) = p; + let (keyspace, _) = Self::parse_keyspace(&k)?; + Ok(KeyspaceKv { + k, + v, + keyspace: keyspace.unwrap(), + }) + } + + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + let mode = ApiV2::parse_key_mode(key); + if key.len() < KEYSPACE_PREFIX_LEN || (mode != KeyMode::Raw && mode != KeyMode::Txn) { + return Err(Error::Other(box_err!( + "invalid API V2 key: {}", + log_wrappers::Value(key) + ))); + } + let id = u32::from_be_bytes([0, key[1], key[2], key[3]]); + Ok((Some(KeyspaceId::from(id)), &key[KEYSPACE_PREFIX_LEN..])) + } +} + +pub struct KeyspaceKv { + k: Vec, + v: Vec, + keyspace: KeyspaceId, +} + +impl KvPair for KeyspaceKv { + fn key(&self) -> &[u8] { + &self.k[KEYSPACE_PREFIX_LEN..] 
+ } + + fn value(&self) -> &[u8] { + &self.v + } +} + +impl KeyspaceKv { + pub fn keyspace(&self) -> KeyspaceId { + self.keyspace + } +} + +impl PartialEq<(Vec, Vec)> for KeyspaceKv { + fn eq(&self, other: &(Vec, Vec)) -> bool { + self.kv() == (&other.0, &other.1) + } +} + +impl PartialEq for KeyspaceKv { + fn eq(&self, other: &Self) -> bool { + self.k == other.k && self.v == other.v + } +} + +impl Debug for KeyspaceKv { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceKv") + .field("key", &log_wrappers::Value(self.key())) + .field("value", &log_wrappers::Value(self.value())) + .field("keyspace", &self.keyspace()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_v1_parse_keyspace() { + let k = b"t123_111"; + let (keyspace, key) = ApiV1::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + + let (keyspace, key) = ApiV1Ttl::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + } + + #[test] + fn test_v2_parse_keyspace() { + let ok = vec![ + (b"x\x00\x00\x01t123_114", 1, b"t123_114"), + (b"r\x00\x00\x01t123_112", 1, b"t123_112"), + (b"x\x01\x00\x00t213_112", 0x010000, b"t213_112"), + (b"r\x01\x00\x00t123_113", 0x010000, b"t123_113"), + ]; + + for (key, id, user_key) in ok { + let (keyspace, key) = ApiV2::parse_keyspace(key).unwrap(); + assert_eq!(Some(KeyspaceId::from(id)), keyspace); + assert_eq!(user_key, key); + } + + let err: Vec<&[u8]> = vec![b"t123_111", b"s\x00\x00", b"r\x00\x00"]; + + for key in err { + ApiV2::parse_keyspace(key).unwrap_err(); + } + } +} diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index 0c9ae388917..879751e7b62 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -1,17 +1,21 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(min_specialization)] +#![feature(associated_type_defaults)] mod api_v1; mod api_v1ttl; pub mod api_v2; +pub mod keyspace; use engine_traits::Result; use kvproto::kvrpcpb::ApiVersion; pub use match_template::match_template; use txn_types::{Key, TimeStamp}; -pub trait KvFormat: Clone + Copy + 'static + Send + Sync { +use crate::keyspace::Keyspace; + +pub trait KvFormat: Keyspace + Clone + Copy + 'static + Send + Sync { const TAG: ApiVersion; /// Corresponding TAG of client requests. For test only. #[cfg(any(test, feature = "testexport"))] diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index e990924c638..3409a6ef366 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -8,7 +8,7 @@ use std::{ time::Duration, }; -use api_version::{dispatch_api_version, KvFormat, RawValue}; +use api_version::{dispatch_api_version, keyspace::KvPair, ApiV1, KvFormat, RawValue}; use backup::Task; use collections::HashMap; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; @@ -354,7 +354,7 @@ impl TestSuite { Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(snap_store, false), ranges: vec![Range::Interval(IntervalRange::from((start, end)))], scan_backward_in_range: false, @@ -362,8 +362,9 @@ impl TestSuite { is_scanned_range_aware: false, }); let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { - checksum = checksum_crc64_xor(checksum, digest.clone(), &k, &v); + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); + checksum = checksum_crc64_xor(checksum, digest.clone(), k, v); total_kvs += 1; total_bytes += (k.len() + v.len()) as u64; } diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 
3dd1693ba0d..f192b22a5f6 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -7,6 +7,7 @@ description = "Common utility of a query engine to run TiDB pushed down executor [dependencies] anyhow = "1.0" +api_version = { workspace = true } async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index e12659f329b..d0d2345a09e 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -1,7 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::{marker::PhantomData, time::Duration}; +use api_version::KvFormat; use tikv_util::time::Instant; use yatp::task::future::reschedule; @@ -17,7 +18,7 @@ const CHECK_KEYS: usize = 32; /// A scanner that scans over multiple ranges. Each range can be a point range /// containing only one row, or an interval range containing multiple rows. 
-pub struct RangesScanner { +pub struct RangesScanner { storage: T, ranges_iter: RangesIterator, @@ -34,6 +35,8 @@ pub struct RangesScanner { working_range_begin_key: Vec, working_range_end_key: Vec, rescheduler: RescheduleChecker, + + _phantom: PhantomData, } // TODO: maybe it's better to make it generic to avoid directly depending @@ -72,7 +75,7 @@ pub struct RangesScannerOptions { pub is_scanned_range_aware: bool, // TODO: This can be const generics } -impl RangesScanner { +impl RangesScanner { pub fn new( RangesScannerOptions { storage, @@ -81,7 +84,7 @@ impl RangesScanner { is_key_only, is_scanned_range_aware, }: RangesScannerOptions, - ) -> RangesScanner { + ) -> RangesScanner { let ranges_len = ranges.len(); let ranges_iter = RangesIterator::new(ranges); RangesScanner { @@ -98,13 +101,14 @@ impl RangesScanner { working_range_begin_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), working_range_end_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), rescheduler: RescheduleChecker::new(), + _phantom: PhantomData, } } /// Fetches next row. // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. - pub async fn next(&mut self) -> Result, StorageError> { + pub async fn next(&mut self) -> Result, StorageError> { self.next_opt(true).await } @@ -114,7 +118,7 @@ impl RangesScanner { pub async fn next_opt( &mut self, update_scanned_range: bool, - ) -> Result, StorageError> { + ) -> Result, StorageError> { loop { let mut force_check = true; let range = self.ranges_iter.next(); @@ -150,14 +154,14 @@ impl RangesScanner { if self.is_scanned_range_aware && update_scanned_range { self.update_scanned_range_from_scanned_row(&some_row); } - if some_row.is_some() { + if let Some(row) = some_row { // Retrieved one row from point range or interval range. 
if let Some(r) = self.scanned_rows_per_range.last_mut() { *r += 1; } self.rescheduler.check_reschedule(force_check).await; - - return Ok(some_row); + let kv = F::make_kv_pair(row).map_err(|e| StorageError(anyhow::Error::from(e)))?; + return Ok(Some(kv)); } else { // No more row in the range. self.ranges_iter.notify_drained(); @@ -288,6 +292,7 @@ impl RangesScanner { #[cfg(test)] mod tests { + use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use super::*; @@ -315,7 +320,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "c")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -323,24 +328,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), b"4".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), b"4".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -351,7 +356,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "bar_2")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = 
RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -359,20 +364,20 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -382,7 +387,7 @@ mod tests { PointRange::from("foo_3").into(), PointRange::from("bar_3").into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -390,24 +395,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), Vec::new()) ); assert_eq!( - 
block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), Vec::new()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); } @@ -422,7 +427,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "z")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -431,9 +436,9 @@ mod tests { }); let mut scanned_rows_per_range = Vec::new(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2, 0, 1]); @@ -443,21 +448,21 @@ mod tests { assert_eq!(scanned_rows_per_range, vec![0]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0, 2]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![1]); scanned_rows_per_range.clear(); - 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); @@ -477,7 +482,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -497,7 +502,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -513,7 +518,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -529,7 +534,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -537,14 +542,14 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = 
scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); @@ -567,7 +572,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -575,25 +580,25 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo\0"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"bar\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar\0"); @@ -612,7 +617,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, 
@@ -632,7 +637,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -648,7 +653,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -664,7 +669,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -672,14 +677,14 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2"); assert_eq!(&r.upper_exclusive, b"foo_8"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -700,7 +705,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -708,20 +713,20 @@ mod tests { 
is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2"); assert_eq!(&r.upper_exclusive, b"box"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar"); assert_eq!(&r.upper_exclusive, b"bar_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -739,7 +744,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -749,7 +754,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -757,7 +762,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -765,7 +770,7 @@ mod tests { // Upper_exclusive is not updated. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -791,7 +796,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -801,7 +806,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -809,7 +814,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -817,7 +822,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -825,7 +830,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -846,7 +851,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -856,7 +861,7 @@ mod tests { // Only lower_inclusive is updated. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -864,7 +869,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -872,7 +877,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -896,7 +901,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -906,7 +911,7 @@ mod tests { // Lower_inclusive is updated. Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -914,7 +919,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -922,7 +927,7 @@ mod tests { // Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -930,7 +935,7 @@ mod tests { // Upper_exclusive is not update. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"box"); diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index e9d96e16284..e670674cdc6 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "Data type of a query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } base64 = "0.13" bitfield = "0.13.2" bitflags = "1.0.1" diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 00f6c22347b..37becbfb801 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -2,6 +2,7 @@ use std::{cmp, convert::TryInto, io::Write, sync::Arc, u8}; +use api_version::KvFormat; use codec::prelude::*; use collections::{HashMap, HashSet}; use kvproto::coprocessor::KeyRange; @@ -75,10 +76,13 @@ pub fn extract_table_prefix(key: &[u8]) -> Result<&[u8]> { } /// Checks if the range is for table record or index. 
-pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { +pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { for range in ranges { - extract_table_prefix(range.get_start())?; - extract_table_prefix(range.get_end())?; + let (_, start) = + F::parse_keyspace(range.get_start()).map_err(|e| Error::Other(Box::new(e)))?; + let (_, end) = F::parse_keyspace(range.get_end()).map_err(|e| Error::Other(Box::new(e)))?; + extract_table_prefix(start)?; + extract_table_prefix(end)?; if range.get_start() >= range.get_end() { return Err(invalid_type!( "invalid range,range.start should be smaller than range.end, but got [{:?},{:?})", @@ -544,6 +548,7 @@ pub fn generate_index_data_for_test( mod tests { use std::{i64, iter::FromIterator}; + use api_version::ApiV1; use collections::{HashMap, HashSet}; use tipb::ColumnInfo; @@ -790,18 +795,18 @@ mod tests { let mut range = KeyRange::default(); range.set_start(small_key.clone()); range.set_end(large_key.clone()); - check_table_ranges(&[range]).unwrap(); + check_table_ranges::(&[range]).unwrap(); // test range.start > range.end let mut range = KeyRange::default(); range.set_end(small_key.clone()); range.set_start(large_key); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); // test invalid end let mut range = KeyRange::default(); range.set_start(small_key); range.set_end(b"xx".to_vec()); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); } #[test] diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index 123c306c125..331634dbd04 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "A vector query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } async-trait = "0.1" codec = { workspace = true } collections = { workspace = true } diff --git 
a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index ae04ffe03e6..9e415918541 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use codec::{number::NumberCodec, prelude::NumberDecoder}; use itertools::izip; @@ -30,11 +31,13 @@ use DecodeHandleStrategy::*; use super::util::scan_executor::*; use crate::interface::*; -pub struct BatchIndexScanExecutor(ScanExecutor); +pub struct BatchIndexScanExecutor( + ScanExecutor, +); // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchIndexScanExecutor>> { +impl BatchIndexScanExecutor>, ApiV1> { /// Checks whether this executor can be used. #[inline] pub fn check_supported(descriptor: &IndexScan) -> Result<()> { @@ -42,7 +45,7 @@ impl BatchIndexScanExecutor>> { } } -impl BatchIndexScanExecutor { +impl BatchIndexScanExecutor { pub fn new( storage: S, config: Arc, @@ -154,7 +157,7 @@ impl BatchIndexScanExecutor { } #[async_trait] -impl BatchExecutor for BatchIndexScanExecutor { +impl BatchExecutor for BatchIndexScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -975,7 +978,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![columns_info[0].clone(), columns_info[1].clone()], @@ -1028,7 +1031,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1092,7 +1095,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), 
Arc::new(EvalConfig::default()), vec![columns_info[1].clone(), columns_info[0].clone()], @@ -1133,7 +1136,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1185,7 +1188,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1262,7 +1265,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1319,7 +1322,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1433,7 +1436,7 @@ mod tests { let mut value = value_prefix.clone(); value.extend(restore_data); let store = FixtureStorage::from(vec![(key.clone(), value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1476,7 +1479,7 @@ mod tests { let value = value_prefix; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1572,7 +1575,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, vec![])]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1672,7 +1675,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, 
Arc::new(EvalConfig::default()), columns_info, @@ -1766,7 +1769,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1859,7 +1862,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1985,7 +1988,7 @@ mod tests { let mut value = value_prefix; value.extend(restore_data); let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 551c3da8a7e..d04be41507e 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -2,6 +2,7 @@ use std::{convert::TryFrom, sync::Arc}; +use api_version::KvFormat; use fail::fail_point; use kvproto::coprocessor::KeyRange; use protobuf::Message; @@ -164,7 +165,7 @@ fn is_arrow_encodable(schema: &[FieldType]) -> bool { } #[allow(clippy::explicit_counter_loop)] -pub fn build_executors( +pub fn build_executors( executor_descriptors: Vec, storage: S, ranges: Vec, @@ -192,7 +193,7 @@ pub fn build_executors( let primary_prefix_column_ids = descriptor.take_primary_prefix_column_ids(); Box::new( - BatchTableScanExecutor::new( + BatchTableScanExecutor::<_, F>::new( storage, config.clone(), columns_info, @@ -212,7 +213,7 @@ pub fn build_executors( let columns_info = descriptor.take_columns().into(); let primary_column_ids_len = descriptor.take_primary_column_ids().len(); Box::new( - BatchIndexScanExecutor::new( + BatchIndexScanExecutor::<_, F>::new( 
storage, config.clone(), columns_info, @@ -364,7 +365,7 @@ pub fn build_executors( } impl BatchExecutorsRunner { - pub fn from_request + 'static>( + pub fn from_request + 'static, F: KvFormat>( mut req: DagRequest, ranges: Vec, storage: S, @@ -380,7 +381,7 @@ impl BatchExecutorsRunner { config.paging_size = paging_size; let config = Arc::new(config); - let out_most_executor = build_executors( + let out_most_executor = build_executors::<_, F>( req.take_executors().into(), storage, ranges, diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 957a23ba8c0..4397869fcaa 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, sync::Arc}; +use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use collections::HashMap; use kvproto::coprocessor::KeyRange; @@ -23,13 +24,15 @@ use tipb::{ColumnInfo, FieldType, TableScan}; use super::util::scan_executor::*; use crate::interface::*; -pub struct BatchTableScanExecutor(ScanExecutor); +pub struct BatchTableScanExecutor( + ScanExecutor, +); type HandleIndicesVec = SmallVec<[usize; 2]>; // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchTableScanExecutor>> { +impl BatchTableScanExecutor>, ApiV1> { /// Checks whether this executor can be used. 
#[inline] pub fn check_supported(descriptor: &TableScan) -> Result<()> { @@ -37,7 +40,7 @@ impl BatchTableScanExecutor>> { } } -impl BatchTableScanExecutor { +impl BatchTableScanExecutor { #[allow(clippy::too_many_arguments)] pub fn new( storage: S, @@ -110,7 +113,7 @@ impl BatchTableScanExecutor { } #[async_trait] -impl BatchExecutor for BatchTableScanExecutor { +impl BatchExecutor for BatchTableScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -702,7 +705,7 @@ mod tests { batch_expect_rows: &[usize], ) { let columns_info = helper.columns_info_by_idx(col_idxs); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), columns_info, @@ -786,7 +789,7 @@ mod tests { fn test_execution_summary() { let helper = TableScanTestHelper::new(); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), helper.columns_info_by_idx(&[0]), @@ -925,7 +928,7 @@ mod tests { // For row 0 + row 1 + (row 2 ~ row 4), we should only get row 0, row 1 and an // error. for corrupted_row_index in 2..=4 { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1032,7 +1035,7 @@ mod tests { // We should get row 0 and error because no further rows should be scanned when // there is an error. 
{ - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1080,7 +1083,7 @@ mod tests { }); let mut schema = schema.clone(); schema.push(FieldTypeTp::LongLong.into()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info, @@ -1122,7 +1125,7 @@ mod tests { // Let's also repeat case 1 for smaller batch size { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1165,7 +1168,7 @@ mod tests { // Case 2: row 1 + row 2 // We should get error and no row, for the same reason as above. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1186,7 +1189,7 @@ mod tests { // Case 3: row 2 + row 0 // We should get row 2 and row 0. There is no error. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1220,7 +1223,7 @@ mod tests { // Case 4: row 1 // We should get error. 
{ - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1270,7 +1273,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1378,7 +1381,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1559,7 +1562,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 935db5dd392..75c7cdc9fe3 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::KeyRange; use tidb_query_common::{ @@ -40,12 +41,12 @@ pub trait ScanExecutorImpl: Send { /// A shared executor implementation for both table scan and index scan. /// Implementation differences between table scan and index scan are further /// given via `ScanExecutorImpl`. -pub struct ScanExecutor { +pub struct ScanExecutor { /// The internal scanning implementation. imp: I, /// The scanner that scans over ranges. 
- scanner: RangesScanner, + scanner: RangesScanner, /// A flag indicating whether this executor is ended. When table is drained /// or there was an error scanning the table, this flag will be set to @@ -63,7 +64,7 @@ pub struct ScanExecutorOptions { pub is_scanned_range_aware: bool, } -impl ScanExecutor { +impl ScanExecutor { pub fn new( ScanExecutorOptions { imp, @@ -75,7 +76,7 @@ impl ScanExecutor { is_scanned_range_aware, }: ScanExecutorOptions, ) -> Result { - tidb_query_datatype::codec::table::check_table_ranges(&key_ranges)?; + tidb_query_datatype::codec::table::check_table_ranges::(&key_ranges)?; if is_backward { key_ranges.reverse(); } @@ -108,10 +109,11 @@ impl ScanExecutor { for i in 0..scan_rows { let some_row = self.scanner.next_opt(i == scan_rows - 1).await?; - if let Some((key, value)) = some_row { + if let Some(row) = some_row { // Retrieved one row from point range or non-point range. - if let Err(e) = self.imp.process_kv_pair(&key, &value, columns) { + let (key, value) = row.kv(); + if let Err(e) = self.imp.process_kv_pair(key, value, columns) { // When there are errors in `process_kv_pair`, columns' length may not be // identical. For example, the filling process may be partially done so that // first several columns have N rows while the rest have N-1 rows. Since we do @@ -162,7 +164,7 @@ pub fn check_columns_info_supported(columns_info: &[ColumnInfo]) -> Result<()> { } #[async_trait] -impl BatchExecutor for ScanExecutor { +impl BatchExecutor for ScanExecutor { type StorageStats = S::Statistics; #[inline] diff --git a/src/coprocessor/checksum.rs b/src/coprocessor/checksum.rs index 52bd0a60184..3778f549427 100644 --- a/src/coprocessor/checksum.rs +++ b/src/coprocessor/checksum.rs @@ -1,5 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
+use api_version::{keyspace::KvPair, ApiV1}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ // `ChecksumContext` is used to handle `ChecksumRequest` pub struct ChecksumContext { req: ChecksumRequest, - scanner: RangesScanner>>, + scanner: RangesScanner>, ApiV1>, } impl ChecksumContext { @@ -73,12 +74,13 @@ impl RequestHandler for ChecksumContext { let mut prefix_digest = crc64fast::Digest::new(); prefix_digest.write(&old_prefix); - while let Some((k, v)) = self.scanner.next().await? { + while let Some(row) = self.scanner.next().await? { + let (k, v) = row.kv(); if !k.starts_with(&new_prefix) { return Err(box_err!("Wrong prefix expect: {:?}", new_prefix)); } checksum = - checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], &v); + checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], v); total_kvs += 1; total_bytes += k.len() + v.len() + old_prefix.len() - new_prefix.len(); } diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index ce575859e59..31a6df181d5 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -2,8 +2,9 @@ mod storage_impl; -use std::sync::Arc; +use std::{marker::PhantomData, sync::Arc}; +use api_version::KvFormat; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ tikv_util::quota_limiter::QuotaLimiter, }; -pub struct DagHandlerBuilder { +pub struct DagHandlerBuilder { req: DagRequest, ranges: Vec, store: S, @@ -29,9 +30,10 @@ pub struct DagHandlerBuilder { is_cache_enabled: bool, paging_size: Option, quota_limiter: Arc, + _phantom: PhantomData, } -impl DagHandlerBuilder { +impl DagHandlerBuilder { pub fn new( req: DagRequest, ranges: Vec, @@ -54,6 +56,7 @@ impl DagHandlerBuilder { is_cache_enabled, paging_size, quota_limiter, + _phantom: PhantomData, } } @@ -65,7 +68,7 @@ impl DagHandlerBuilder { pub fn 
build(self) -> Result> { COPR_DAG_REQ_COUNT.with_label_values(&["batch"]).inc(); - Ok(BatchDagHandler::new( + Ok(BatchDagHandler::new::<_, F>( self.req, self.ranges, self.store, @@ -87,7 +90,7 @@ pub struct BatchDagHandler { } impl BatchDagHandler { - pub fn new( + pub fn new( req: DagRequest, ranges: Vec, store: S, @@ -100,7 +103,7 @@ impl BatchDagHandler { quota_limiter: Arc, ) -> Result { Ok(Self { - runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request( + runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request::<_, F>( req, ranges, TikvStorage::new(store, is_cache_enabled), diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 711cd83e607..b9d01419a49 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -7,6 +7,7 @@ use std::{ use ::tracker::{ set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, }; +use api_version::{dispatch_api_version, KvFormat}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; @@ -147,6 +148,21 @@ impl Endpoint { /// /// It also checks if there are locks in memory blocking this read request. fn parse_request_and_check_memory_locks( + &self, + req: coppb::Request, + peer: Option, + is_streaming: bool, + ) -> Result<(RequestHandlerBuilder, ReqContext)> { + dispatch_api_version!(req.get_context().get_api_version(), { + self.parse_request_and_check_memory_locks_impl::(req, peer, is_streaming) + }) + } + + /// Parse the raw `Request` to create `RequestHandlerBuilder` and + /// `ReqContext`. Returns `Err` if fails. + /// + /// It also checks if there are locks in memory blocking this read request. 
+ fn parse_request_and_check_memory_locks_impl( &self, mut req: coppb::Request, peer: Option, @@ -232,7 +248,7 @@ impl Endpoint { 0 => None, i => Some(i), }; - dag::DagHandlerBuilder::new( + dag::DagHandlerBuilder::<_, F>::new( dag, req_ctx.ranges.clone(), store, @@ -281,7 +297,7 @@ impl Endpoint { let quota_limiter = self.quota_limiter.clone(); builder = Box::new(move |snap, req_ctx| { - statistics::analyze::AnalyzeContext::new( + statistics::analyze::AnalyzeContext::<_, F>::new( analyze, req_ctx.ranges.clone(), start_ts, diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 383f6161a1b..25ecf95653d 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -1,7 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Reverse, collections::BinaryHeap, mem, sync::Arc}; +use std::{cmp::Reverse, collections::BinaryHeap, marker::PhantomData, mem, sync::Arc}; +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -41,16 +42,17 @@ const ANALYZE_VERSION_V1: i32 = 1; const ANALYZE_VERSION_V2: i32 = 2; // `AnalyzeContext` is used to handle `AnalyzeReq` -pub struct AnalyzeContext { +pub struct AnalyzeContext { req: AnalyzeReq, storage: Option>>, ranges: Vec, storage_stats: Statistics, quota_limiter: Arc, is_auto_analyze: bool, + _phantom: PhantomData, } -impl AnalyzeContext { +impl AnalyzeContext { pub fn new( req: AnalyzeReq, ranges: Vec, @@ -77,13 +79,14 @@ impl AnalyzeContext { storage_stats: Statistics::default(), quota_limiter, is_auto_analyze, + _phantom: PhantomData, }) } // handle_column is used to process `AnalyzeColumnsReq` // it would build a histogram for the primary key(if needed) and // collectors for each column value. 
- async fn handle_column(builder: &mut SampleBuilder) -> Result> { + async fn handle_column(builder: &mut SampleBuilder) -> Result> { let (col_res, _) = builder.collect_columns_stats().await?; let res_data = { @@ -93,7 +96,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { + async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { let (col_res, idx_res) = builder.collect_columns_stats().await?; let res_data = { @@ -109,7 +112,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { + async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { let sample_res = builder.collect_column_stats().await?; let res_data = { let res = sample_res.into_proto(); @@ -122,7 +125,7 @@ impl AnalyzeContext { // it would build a histogram and count-min sketch of index values. async fn handle_index( req: AnalyzeIndexReq, - scanner: &mut RangesScanner>>, + scanner: &mut RangesScanner>, F>, is_common_handle: bool, ) -> Result> { let mut hist = Histogram::new(req.get_bucket_size() as usize); @@ -142,8 +145,8 @@ impl AnalyzeContext { } else { ANALYZE_VERSION_V1 }; - while let Some((key, _)) = scanner.next().await? { - let mut key = &key[..]; + while let Some(row) = scanner.next().await? 
{ + let mut key = row.key(); if is_common_handle { table::check_record_key(key)?; key = &key[table::PREFIX_LEN..]; @@ -209,14 +212,14 @@ impl AnalyzeContext { } #[async_trait] -impl RequestHandler for AnalyzeContext { +impl RequestHandler for AnalyzeContext { async fn handle_request(&mut self) -> Result> { let ret = match self.req.get_tp() { AnalyzeType::TypeIndex | AnalyzeType::TypeCommonHandle => { let req = self.req.take_idx_req(); let ranges = std::mem::take(&mut self.ranges); - table::check_table_ranges(&ranges)?; - let mut scanner = RangesScanner::new(RangesScannerOptions { + table::check_table_ranges::(&ranges)?; + let mut scanner = RangesScanner::<_, F>::new(RangesScannerOptions { storage: self.storage.take().unwrap(), ranges: ranges .into_iter() @@ -240,7 +243,7 @@ impl RequestHandler for AnalyzeContext { let col_req = self.req.take_col_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, None, storage, ranges)?; + let mut builder = SampleBuilder::<_, F>::new(col_req, None, storage, ranges)?; let res = AnalyzeContext::handle_column(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -252,7 +255,8 @@ impl RequestHandler for AnalyzeContext { let idx_req = self.req.take_idx_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, Some(idx_req), storage, ranges)?; + let mut builder = + SampleBuilder::<_, F>::new(col_req, Some(idx_req), storage, ranges)?; let res = AnalyzeContext::handle_mixed(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -263,7 +267,7 @@ impl RequestHandler for AnalyzeContext { let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = RowSampleBuilder::new( + let mut builder = RowSampleBuilder::<_, F>::new( col_req, storage, ranges, 
@@ -302,8 +306,8 @@ impl RequestHandler for AnalyzeContext { } } -struct RowSampleBuilder { - data: BatchTableScanExecutor>>, +struct RowSampleBuilder { + data: BatchTableScanExecutor>, F>, max_sample_size: usize, max_fm_sketch_size: usize, @@ -314,7 +318,7 @@ struct RowSampleBuilder { is_auto_analyze: bool, } -impl RowSampleBuilder { +impl RowSampleBuilder { fn new( mut req: AnalyzeColumnsReq, storage: TikvStorage>, @@ -784,8 +788,8 @@ impl Drop for BaseRowSampleCollector { } } -struct SampleBuilder { - data: BatchTableScanExecutor>>, +struct SampleBuilder { + data: BatchTableScanExecutor>, F>, max_bucket_size: usize, max_sample_size: usize, @@ -802,7 +806,7 @@ struct SampleBuilder { /// `SampleBuilder` is used to analyze columns. It collects sample from /// the result set using Reservoir Sampling algorithm, estimates NDVs /// using FM Sketch during the collecting process, and builds count-min sketch. -impl SampleBuilder { +impl SampleBuilder { fn new( mut req: AnalyzeColumnsReq, common_handle_req: Option, diff --git a/tests/benches/coprocessor_executors/index_scan/util.rs b/tests/benches/coprocessor_executors/index_scan/util.rs index 7531fb68944..8d579c98a4f 100644 --- a/tests/benches/coprocessor_executors/index_scan/util.rs +++ b/tests/benches/coprocessor_executors/index_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan store: &Store, unique: bool, ) -> Self::E { - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/integrated/util.rs b/tests/benches/coprocessor_executors/integrated/util.rs index d9cb5fd2138..4b747307049 100644 --- 
a/tests/benches/coprocessor_executors/integrated/util.rs +++ b/tests/benches/coprocessor_executors/integrated/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -71,7 +72,7 @@ where store: &Store, ) { crate::util::bencher::BatchNextAllBencher::new(|| { - tidb_query_executors::runner::build_executors( + tidb_query_executors::runner::build_executors::<_, ApiV1>( black_box(executors.to_vec()), black_box(TikvStorage::new(ToTxnStore::::to_store(store), false)), black_box(ranges.to_vec()), diff --git a/tests/benches/coprocessor_executors/table_scan/util.rs b/tests/benches/coprocessor_executors/table_scan/util.rs index 2fe7c4fc4c0..0b2185074c8 100644 --- a/tests/benches/coprocessor_executors/table_scan/util.rs +++ b/tests/benches/coprocessor_executors/table_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchTableScan store: &Store, _: (), ) -> Self::E { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index 5ef442a25cd..0a5708c74ce 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -8,6 +8,7 @@ pub mod store; use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -41,7 +42,7 @@ pub fn build_dag_handler( let mut dag = DagRequest::default(); 
dag.set_executors(executors.to_vec().into()); - tikv::coprocessor::dag::DagHandlerBuilder::new( + tikv::coprocessor::dag::DagHandlerBuilder::<_, ApiV1>::new( black_box(dag), black_box(ranges.to_vec()), black_box(ToTxnStore::::to_store(store)), diff --git a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index 66df6b2832c..405070842b4 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -2,6 +2,7 @@ use std::u64; +use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use kvproto::{ coprocessor::{KeyRange, Request}, @@ -79,7 +80,7 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(store, false), ranges: vec![Range::from_pb_range(range, false)], scan_backward_in_range: true, @@ -89,10 +90,11 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> let mut checksum = 0; let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); let mut digest = digest.clone(); - digest.write(&k); - digest.write(&v); + digest.write(k); + digest.write(v); checksum ^= digest.sum64(); } checksum From e2e9f9c2a62051dc21cdb28767e41e65fc79acee Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 17 Jan 2023 23:21:49 +0800 Subject: [PATCH 460/676] storage: add priority scheduling for scheduler worker (#14057) ref tikv/tikv#13730 Support priority-based scheduling for the scheduler worker pool. 
Signed-off-by: Connor1996 Co-authored-by: Xinye Tao --- Cargo.lock | 2 + components/resource_control/src/lib.rs | 4 +- .../resource_control/src/resource_group.rs | 26 +- components/server/src/server.rs | 43 +-- components/server/src/server2.rs | 37 +-- components/test_raftstore/Cargo.toml | 1 + components/test_raftstore/src/cluster.rs | 18 +- components/test_raftstore/src/node.rs | 2 + components/test_raftstore/src/server.rs | 5 + .../tikv_util/src/yatp_pool/future_pool.rs | 2 + components/tikv_util/src/yatp_pool/mod.rs | 65 ++-- src/config/mod.rs | 20 +- src/read_pool.rs | 3 +- src/server/metrics.rs | 6 + src/server/service/kv.rs | 20 ++ src/storage/mod.rs | 55 +++- src/storage/txn/commands/mod.rs | 7 + src/storage/txn/mod.rs | 2 +- src/storage/txn/sched_pool.rs | 165 ++++++++-- src/storage/txn/scheduler.rs | 289 ++++++------------ tests/Cargo.toml | 1 + tests/failpoints/cases/test_storage.rs | 5 +- 22 files changed, 441 insertions(+), 337 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 069dbc4950e..ab1d164a1e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5884,6 +5884,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "server", @@ -5997,6 +5998,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "rand_xorshift", + "resource_control", "resource_metering", "security", "serde_json", diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 516e5dd6c8d..eb6679f71e8 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -4,7 +4,9 @@ use online_config::OnlineConfig; use serde::{Deserialize, Serialize}; mod resource_group; -pub use resource_group::{ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +pub use resource_group::{ + ResourceConsumeType, ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL, +}; mod future; pub use future::ControlledFuture; diff --git 
a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index d9fa3ccf14c..70f89fd1a9d 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -382,40 +382,40 @@ mod tests { resource_manager.add_resource_group(group2); assert_eq!(resource_manager.resource_groups.len(), 2); - let resouce_ctl = resource_manager.derive_controller("test_read".into(), true); - assert_eq!(resouce_ctl.resource_consumptions.len(), 3); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resource_ctl.resource_consumptions.len(), 3); - let group1 = resouce_ctl.resource_group("test".as_bytes()); + let group1 = resource_ctl.resource_group("test".as_bytes()); assert_eq!(group1.weight, 500); - let group2 = resouce_ctl.resource_group("test2".as_bytes()); + let group2 = resource_ctl.resource_group("test2".as_bytes()); assert_eq!(group2.weight, 250); assert_eq!(group1.current_vt(), 0); let mut extras1 = Extras::single_level(); extras1.set_metadata("test".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras1), 25_000); + assert_eq!(resource_ctl.priority_of(&extras1), 25_000); assert_eq!(group1.current_vt(), 25_000); let mut extras2 = Extras::single_level(); extras2.set_metadata("test2".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras2), 12_500); + assert_eq!(resource_ctl.priority_of(&extras2), 12_500); assert_eq!(group2.current_vt(), 12_500); let mut extras3 = Extras::single_level(); extras3.set_metadata("unknown_group".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras3), 50); + assert_eq!(resource_ctl.priority_of(&extras3), 50); assert_eq!( - resouce_ctl + resource_ctl .resource_group("default".as_bytes()) .current_vt(), 50 ); - resouce_ctl.consume( + resource_ctl.consume( "test".as_bytes(), ResourceConsumeType::CpuTime(Duration::from_micros(10000)), ); - resouce_ctl.consume( + 
resource_ctl.consume( "test2".as_bytes(), ResourceConsumeType::CpuTime(Duration::from_micros(10000)), ); @@ -429,7 +429,7 @@ mod tests { assert_eq!(group1_vt, 5_025_000); assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); assert!( - resouce_ctl + resource_ctl .resource_group("default".as_bytes()) .current_vt() >= group1.current_vt() / 2 @@ -442,8 +442,8 @@ mod tests { let new_group = new_resource_group("new_group".into(), true, 500, 500); resource_manager.add_resource_group(new_group); - assert_eq!(resouce_ctl.resource_consumptions.len(), 4); - let group3 = resouce_ctl.resource_group("new_group".as_bytes()); + assert_eq!(resource_ctl.resource_consumptions.len(), 4); + let group3 = resource_ctl.resource_group("new_group".as_bytes()); assert_eq!(group3.weight, 200); assert!(group3.current_vt() >= group1_vt / 2); } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 52b9fbf1d1a..cfc7e59e243 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -245,7 +245,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, - resource_manager: Arc, + resource_manager: Option>, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery @@ -322,23 +322,27 @@ where let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); - let resource_manager = Arc::new(ResourceGroupManager::default()); - - // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - // spawn a task to periodically update the minimal virtual time of all resource - // group. 
- if config.resource_control.enabled { - let resource_mgr1 = resource_manager.clone(); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mgr1 = mgr.clone(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - resource_mgr1.advance_min_virtual_time(); + mgr1.advance_min_virtual_time(); }); - } + Some(mgr) + } else { + None + }; + + // Initialize raftstore channels. + let (router, system) = fsm::create_raft_batch_system(&config.raft_store); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -745,19 +749,15 @@ where } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { - let priority_mgr = if self.config.resource_control.enabled { - Some( - self.resource_manager - .derive_controller("unified-read-pool".into(), true), - ) - } else { - None - }; + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), - priority_mgr, + resource_ctl, )) } else { None @@ -831,6 +831,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 12e6af61613..03b02e5f81e 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -222,7 +222,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, - resource_manager: Arc, + resource_manager: Option>, causal_ts_provider: Option>, 
// used for rawkv apiv2 tablet_registry: Option>, } @@ -287,15 +287,19 @@ where config.quota.max_delay_duration, config.quota.enable_auto_tune, )); - let resource_manager = Arc::new(ResourceGroupManager::default()); - // spawn a task to periodically update the minimal virtual time of all resource - // group. - if config.resource_control.enabled { - let resource_mgr1 = resource_manager.clone(); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mgr1 = mgr.clone(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - resource_mgr1.advance_min_virtual_time(); + mgr1.advance_min_virtual_time(); }); - } + Some(mgr) + } else { + None + }; let mut causal_ts_provider = None; if let ApiVersion::V2 = F::TAG { @@ -634,19 +638,15 @@ where let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { - let priority_mgr = if self.config.resource_control.enabled { - Some( - self.resource_manager - .derive_controller("unified-read-pool".into(), true), - ) - } else { - None - }; + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), - priority_mgr, + resource_ctl, )) } else { None @@ -719,6 +719,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index 71c214ae21d..25a1224e261 100644 
--- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -49,6 +49,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true, features = ["testexport"] } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } server = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index b2330e26f93..2121b7e021f 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -46,6 +46,7 @@ use raftstore::{ }, Error, Result, }; +use resource_control::ResourceGroupManager; use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; @@ -80,6 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -174,6 +176,7 @@ pub struct Cluster { pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, + resource_manager: Arc, } impl Cluster { @@ -207,6 +210,7 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + resource_manager: Arc::new(ResourceGroupManager::default()), kv_statistics: vec![], raft_statistics: vec![], } @@ -294,6 +298,7 @@ impl Cluster { key_mgr.clone(), router, system, + &self.resource_manager, )?; self.group_props.insert(node_id, props); self.engines.insert(node_id, engines); @@ -365,9 +370,16 @@ impl Cluster { tikv_util::thread_group::set_properties(Some(props)); debug!("calling run node"; "node_id" => node_id); // FIXME: rocksdb event listeners may not work, because we change the router. 
- self.sim - .wl() - .run_node(node_id, cfg, engines, store_meta, key_mgr, router, system)?; + self.sim.wl().run_node( + node_id, + cfg, + engines, + store_meta, + key_mgr, + router, + system, + &self.resource_manager, + )?; debug!("node {} started", node_id); Ok(()) } diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 78d98e5a5d3..9ae76dba9f8 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -30,6 +30,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use tempfile::TempDir; use test_pd_client::TestPdClient; @@ -229,6 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + _resource_manager: &Arc, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0ec60e468ee..ccf4df43497 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -42,6 +42,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::{CollectorRegHandle, ResourceTagFactory}; use security::SecurityManager; use tempfile::TempDir; @@ -264,6 +265,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", cfg.prefer_mem); @@ -414,6 +416,7 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), self.get_causal_ts_provider(node_id), + Some(resource_manager.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -649,6 +652,7 @@ impl 
Simulator for ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), @@ -660,6 +664,7 @@ impl Simulator for ServerCluster { key_manager, router, system, + resource_manager, ) ) } diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index e74ced848c0..f010b508aaa 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -28,6 +28,8 @@ struct Env { } #[derive(Clone)] +// FuturePool wraps a yatp thread pool providing task count metrics and gate +// maximum running tasks. pub struct FuturePool { inner: Arc, } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 29376b904a5..305d2162482 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -198,42 +198,42 @@ impl YatpPoolBuilder { } } - pub fn config(&mut self, config: Config) -> &mut Self { + pub fn config(self, config: Config) -> Self { // TODO: maybe we should use (1, num_cpu) for min and max thread count. 
self.thread_count(config.workers, config.workers, config.workers) .stack_size(config.stack_size) .max_tasks(config.workers.saturating_mul(config.max_tasks_per_worker)) } - pub fn stack_size(&mut self, val: usize) -> &mut Self { + pub fn stack_size(mut self, val: usize) -> Self { self.stack_size = val; self } - pub fn name_prefix(&mut self, val: impl Into) -> &mut Self { + pub fn name_prefix(mut self, val: impl Into) -> Self { let name = val.into(); self.name_prefix = Some(name); self } pub fn thread_count( - &mut self, + mut self, min_thread_count: usize, core_thread_count: usize, max_thread_count: usize, - ) -> &mut Self { + ) -> Self { self.min_thread_count = min_thread_count; self.core_thread_count = core_thread_count; self.max_thread_count = max_thread_count; self } - pub fn max_tasks(&mut self, tasks: usize) -> &mut Self { + pub fn max_tasks(mut self, tasks: usize) -> Self { self.max_tasks = tasks; self } - pub fn before_stop(&mut self, f: F) -> &mut Self + pub fn before_stop(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -241,7 +241,7 @@ impl YatpPoolBuilder { self } - pub fn after_start(&mut self, f: F) -> &mut Self + pub fn after_start(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -249,7 +249,7 @@ impl YatpPoolBuilder { self } - pub fn before_pause(&mut self, f: F) -> &mut Self + pub fn before_pause(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -257,13 +257,32 @@ impl YatpPoolBuilder { self } - pub fn build_future_pool(&mut self) -> FuturePool { + pub fn build_future_pool(self) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; let pool = self.build_single_level_pool(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); - FuturePool::from_pool(pool, name, self.core_thread_count, self.max_tasks) + FuturePool::from_pool(pool, &name, size, task) + } + + pub fn 
build_priority_future_pool( + self, + priority_provider: Arc, + ) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; + let pool = self.build_priority_pool(priority_provider); + FuturePool::from_pool(pool, &name, size, task) } - pub fn build_single_level_pool(&mut self) -> ThreadPool { + pub fn build_single_level_pool(self) -> ThreadPool { let (builder, runner) = self.create_builder(); builder.build_with_queue_and_runner( yatp::queue::QueueType::SingleLevel, @@ -271,9 +290,12 @@ impl YatpPoolBuilder { ) } - pub fn build_multi_level_pool(&mut self) -> ThreadPool { + pub fn build_multi_level_pool(self) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); let (builder, read_pool_runner) = self.create_builder(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); let multilevel_builder = multilevel::Builder::new(multilevel::Config::default().name(Some(name))); let runner_builder = @@ -283,11 +305,14 @@ impl YatpPoolBuilder { } pub fn build_priority_pool( - &mut self, + self, priority_provider: Arc, ) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); let (builder, read_pool_runner) = self.create_builder(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); let priority_builder = priority::Builder::new( priority::Config::default().name(Some(name)), priority_provider, @@ -296,8 +321,8 @@ impl YatpPoolBuilder { builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) } - fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + fn create_builder(mut self) -> (yatp::Builder, YatpPoolRunner) { + let name = self.name_prefix.unwrap_or_else(|| "yatp_pool".to_string()); let mut builder = 
yatp::Builder::new(thd_name!(name)); builder .stack_size(self.stack_size) @@ -309,7 +334,7 @@ impl YatpPoolBuilder { let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), diff --git a/src/config/mod.rs b/src/config/mod.rs index 9caa68d8e6b..7878696faa5 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -4930,14 +4930,8 @@ mod tests { let max_pool_size = std::cmp::max(4, SysQuota::cpu_cores_quota() as usize); let check_scale_pool_size = |size: usize, ok: bool| { - let origin_pool_size = scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(); - let origin_pool_size_high = scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(); + let origin_pool_size = scheduler.get_sched_pool().get_pool_size(CommandPri::Normal); + let origin_pool_size_high = scheduler.get_sched_pool().get_pool_size(CommandPri::High); let res = cfg_controller .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)); let (expected_size, expected_size_high) = if ok { @@ -4948,17 +4942,11 @@ mod tests { (origin_pool_size, origin_pool_size_high) }; assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), expected_size ); assert_eq!( - scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::High), expected_size_high ); }; diff --git a/src/read_pool.rs b/src/read_pool.rs index 1a590679584..ea20b149a3d 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -295,8 +295,7 @@ pub fn build_yatp_read_pool( ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); let 
raftkv = Arc::new(Mutex::new(engine)); - let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); - builder + let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) .stack_size(config.stack_size.0 as usize) .thread_count( diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 23f8256835b..d35c58cbf34 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -207,6 +207,12 @@ lazy_static! { &["type"] ) .unwrap(); + pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_grpc_resource_group_total", + "Total number of handle grpc message for each resource group", + &["name"] + ) + .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_proxy_msg_total", "Total number of handle grpc proxy message", diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6c85741f64a..d42eb510891 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -171,6 +171,10 @@ macro_rules! 
handle_request { let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let resp = $future_name(&self.storage, req); let task = async move { let resp = resp.await?; @@ -1043,6 +1047,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, Some(batch_commands_request::request::Cmd::Get(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) }) { @@ -1057,6 +1065,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1071,6 +1083,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = future_copr(copr, Some(peer.to_string()), req) @@ -1098,6 +1114,10 @@ fn handle_batch_commands_request( ); } $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = 
req.mut_context().take_request_source(); let resp = $future_fn($($arg,)* req) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0819c2599b9..7429ed8900b 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -89,6 +89,7 @@ use kvproto::{ use pd_client::FeatureGate; use raftstore::store::{util::build_key_range, ReadStats, TxnExt, WriteStats}; use rand::prelude::*; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::{OnAppliedCb, SnapshotExt}; use tikv_util::{ @@ -129,7 +130,7 @@ use crate::{ txn::{ commands::{RawAtomicStore, RawCompareAndSwap, TypedCommand}, flow_controller::{EngineFlowController, FlowController}, - scheduler::Scheduler as TxnScheduler, + scheduler::TxnScheduler, Command, ErrorInner as TxnError, }, types::StorageCallbackType, @@ -270,6 +271,7 @@ impl Storage { quota_limiter: Arc, feature_gate: FeatureGate, causal_ts_provider: Option>, + resource_ctl: Option>, ) -> Result { assert_eq!(config.api_version(), F::TAG, "Api version not match"); @@ -285,6 +287,7 @@ impl Storage { resource_tag_factory.clone(), Arc::clone("a_limiter), feature_gate, + resource_ctl, ); info!("Storage started."); @@ -1509,15 +1512,20 @@ impl Storage { // Schedule raw modify commands, which reuse the scheduler worker pool. // TODO: separate the txn and raw commands if needed in the future. 
- fn sched_raw_command(&self, tag: CommandKind, future: T) -> Result<()> + fn sched_raw_command( + &self, + group_name: &str, + pri: CommandPri, + tag: CommandKind, + future: T, + ) -> Result<()> where - T: Future + Send + 'static, + T: Future + Send + 'static, { SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); self.sched - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(future) + .get_sched_pool() + .spawn(group_name, pri, future) .map_err(|_| Error::from(ErrorInner::SchedTooBusy)) } @@ -1955,7 +1963,10 @@ impl Storage { let provider = self.causal_ts_provider.clone(); let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); - self.sched_raw_command(CMD, async move { + + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2065,7 +2076,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2128,7 +2141,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2187,7 +2202,9 @@ impl Storage { let cf = Self::rawkv_cf(&cf, self.api_version)?; let engine = self.engine.clone(); let deadline = 
Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2233,7 +2250,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2672,7 +2691,9 @@ impl Storage { return Err(Error::from(ErrorInner::TtlNotEnabled)); } let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let key = F::encode_raw_key_owned(key, None); let cmd = RawCompareAndSwap::new(cf, key, previous_value, value, ttl, api_version, ctx); Self::sched_raw_atomic_command( @@ -2703,7 +2724,9 @@ impl Storage { Self::check_ttl_valid(pairs.len(), &ttls)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, None); let cmd = RawAtomicStore::new(cf, modifies, ctx); Self::sched_raw_atomic_command( @@ -2726,7 +2749,9 @@ impl Storage { Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; let cf = Self::rawkv_cf(&cf, self.api_version)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = 
ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { // Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = keys .into_iter() @@ -3183,6 +3208,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, + None, ) } @@ -3213,6 +3239,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, + Some(Arc::new(ResourceController::new("test".to_owned(), false))), ) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 2d79ebc97cc..5b94ea5bd85 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -715,6 +715,13 @@ impl Command { self.command_ext().get_ctx().get_priority() } + pub fn group_name(&self) -> String { + self.command_ext() + .get_ctx() + .get_resource_group_name() + .to_owned() + } + pub fn need_flow_control(&self) -> bool { !self.readonly() && self.priority() != CommandPri::High } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index f6884b0efb8..d3b199208cb 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -32,7 +32,7 @@ pub use self::{ }, commands::{Command, RESOLVE_LOCK_BATCH_SIZE}, latch::{Latches, Lock}, - scheduler::Scheduler, + scheduler::TxnScheduler, store::{ EntryBatch, FixtureStore, FixtureStoreScanner, Scanner, SnapshotStore, Store, TxnEntry, TxnEntryScanner, TxnEntryStore, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index c7c69b5bbf4..0cff9d51d41 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -8,14 +8,16 @@ use std::{ use collections::HashMap; use file_system::{set_io_type, IoType}; -use kvproto::pdpb::QueryKind; +use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; +use 
resource_control::{ControlledFuture, ResourceController}; use tikv_util::{ sys::SysQuota, - yatp_pool::{FuturePool, PoolTicker, YatpPoolBuilder}, + yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, }; +use yatp::queue::Extras; use crate::storage::{ kv::{destroy_tls_engine, set_tls_engine, Engine, FlowStatsReporter, Statistics}, @@ -41,11 +43,6 @@ thread_local! { static TLS_FEATURE_GATE: RefCell = RefCell::new(latest_feature_gate()); } -#[derive(Clone)] -pub struct SchedPool { - pub pool: FuturePool, -} - #[derive(Clone)] pub struct SchedTicker { reporter: R, @@ -57,38 +54,142 @@ impl PoolTicker for SchedTicker { } } +#[derive(Clone)] +pub enum SchedPool { + // separated thread pools for different priority commands + Vanilla { + high_worker_pool: FuturePool, + worker_pool: FuturePool, + }, + // one priority based thread pool to handle all commands + Priority { + worker_pool: FuturePool, + resource_ctl: Arc, + }, +} + impl SchedPool { pub fn new( engine: E, pool_size: usize, reporter: R, feature_gate: FeatureGate, - name_prefix: &str, + resource_ctl: Option>, ) -> Self { - let engine = Arc::new(Mutex::new(engine)); - // for low cpu quota env, set the max-thread-count as 4 to allow potential cases - // that we need more thread than cpu num. - let max_pool_size = std::cmp::max( - pool_size, - std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), - ); - let pool = YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) - .thread_count(1, pool_size, max_pool_size) - .name_prefix(name_prefix) - // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures - // the tls_engine invariants. - .after_start(move || { - set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IoType::ForegroundWrite); - TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); - }) - .before_stop(move || unsafe { - // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
- destroy_tls_engine::(); - tls_flush(&reporter); - }) - .build_future_pool(); - SchedPool { pool } + let builder = |pool_size: usize, name_prefix: &str| { + let engine = Arc::new(Mutex::new(engine.clone())); + let feature_gate = feature_gate.clone(); + let reporter = reporter.clone(); + // for low cpu quota env, set the max-thread-count as 4 to allow potential cases + // that we need more thread than cpu num. + let max_pool_size = std::cmp::max( + pool_size, + std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), + ); + YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) + .thread_count(1, pool_size, max_pool_size) + .name_prefix(name_prefix) + // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures + // the tls_engine invariants. + .after_start(move || { + set_tls_engine(engine.lock().unwrap().clone()); + set_io_type(IoType::ForegroundWrite); + TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); + }) + .before_stop(move || unsafe { + // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
+ destroy_tls_engine::(); + tls_flush(&reporter); + }) + }; + if let Some(ref r) = resource_ctl { + SchedPool::Priority { + worker_pool: builder(pool_size, "sched-worker-pool") + .build_priority_future_pool(r.clone()), + resource_ctl: r.clone(), + } + } else { + SchedPool::Vanilla { + worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), + high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") + .build_future_pool(), + } + } + } + + pub fn spawn( + &self, + group_name: &str, + priority: CommandPri, + f: impl futures::Future + Send + 'static, + ) -> Result<(), Full> { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.spawn(f) + } else { + worker_pool.spawn(f) + } + } + SchedPool::Priority { + worker_pool, + resource_ctl, + } => { + let fixed_level = match priority { + CommandPri::High => Some(0), + CommandPri::Normal => None, + CommandPri::Low => Some(2), + }; + // TODO: maybe use a better way to generate task_id + let task_id = rand::random::(); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_name.as_bytes().to_owned()); + worker_pool.spawn_with_extras( + ControlledFuture::new( + async move { + f.await; + }, + resource_ctl.clone(), + group_name.as_bytes().to_owned(), + ), + extras, + ) + } + } + } + + pub fn scale_pool_size(&self, pool_size: usize) { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + high_worker_pool.scale_pool_size(std::cmp::max(1, pool_size / 2)); + worker_pool.scale_pool_size(pool_size); + } + SchedPool::Priority { worker_pool, .. 
} => { + worker_pool.scale_pool_size(pool_size); + } + } + } + + pub fn get_pool_size(&self, priority: CommandPri) -> usize { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.get_pool_size() + } else { + worker_pool.get_pool_size() + } + } + SchedPool::Priority { worker_pool, .. } => worker_pool.get_pool_size(), + } } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index d96e3e7c97f..17110a07e7b 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath -//! Scheduler which schedules the execution of `storage::Command`s. +//! TxnScheduler which schedules the execution of `storage::Command`s. //! //! There is one scheduler for each store. It receives commands from clients, //! executes them against the MVCC layer storage engine. @@ -12,16 +12,16 @@ //! leader. When the client read or write a row, the command is sent to the //! scheduler which is on the region leader's store. //! -//! Scheduler runs in a single-thread event loop, but command executions are +//! TxnScheduler runs in a single-thread event loop, but command executions are //! delegated to a pool of worker thread. //! -//! Scheduler keeps track of all the running commands and uses latches to ensure -//! serialized access to the overlapping rows involved in concurrent commands. -//! But note that scheduler only ensures serialized access to the overlapping -//! rows at command level, but a transaction may consist of multiple commands, -//! therefore conflicts may happen at transaction level. Transaction semantics -//! is ensured by the transaction protocol implemented in the client library, -//! which is transparent to the scheduler. +//! TxnScheduler keeps track of all the running commands and uses latches to +//! 
ensure serialized access to the overlapping rows involved in concurrent +//! commands. But note that scheduler only ensures serialized access to the +//! overlapping rows at command level, but a transaction may consist of multiple +//! commands, therefore conflicts may happen at transaction level. Transaction +//! semantics is ensured by the transaction protocol implemented in the client +//! library, which is transparent to the scheduler. use std::{ marker::PhantomData, @@ -47,12 +47,11 @@ use kvproto::{ use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use smallvec::{smallvec, SmallVec}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; -use tikv_util::{ - deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, -}; +use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; use txn_types::TimeStamp; @@ -239,7 +238,7 @@ impl SchedulerTaskCallback { } } -struct SchedulerInner { +struct TxnSchedulerInner { // slot_id -> { cid -> `TaskContext` } in the slot. task_slots: Vec>>>, @@ -251,11 +250,8 @@ struct SchedulerInner { sched_pending_write_threshold: usize, - // worker pool - worker_pool: SchedPool, - - // high priority commands and system commands will be delivered to this pool - high_priority_pool: SchedPool, + // all tasks are executed in this pool + sched_worker_pool: SchedPool, // used to control write flow running_write_bytes: CachePadded, @@ -292,7 +288,7 @@ fn id_index(cid: u64) -> usize { cid as usize % TASKS_SLOTS_NUM } -impl SchedulerInner { +impl TxnSchedulerInner { /// Generates the next command ID. 
#[inline] fn gen_id(&self) -> u64 { @@ -375,19 +371,23 @@ impl SchedulerInner { /// /// Returns a deadline error if the deadline is exceeded. Returns the `Task` /// if all latches are acquired, returns `None` otherwise. - fn acquire_lock_on_wakeup(&self, cid: u64) -> Result, StorageError> { + fn acquire_lock_on_wakeup( + &self, + cid: u64, + ) -> Result, (String, CommandPri, StorageError)> { let mut task_slot = self.get_task_slot(cid); let tctx = task_slot.get_mut(&cid).unwrap(); // Check deadline early during acquiring latches to avoid expired requests // blocking other requests. - if let Err(e) = tctx.task.as_ref().unwrap().cmd.deadline().check() { + let cmd = &tctx.task.as_ref().unwrap().cmd; + if let Err(e) = cmd.deadline().check() { // `acquire_lock_on_wakeup` is called when another command releases its locks // and wakes up command `cid`. This command inserted its lock before // and now the lock is at the front of the queue. The actual // acquired count is one more than the `owned_count` recorded in the // lock, so we increase one to make `release` work. tctx.lock.owned_count += 1; - return Err(e.into()); + return Err((cmd.group_name(), cmd.priority(), e.into())); } if self.latches.acquire(&mut tctx.lock, cid) { tctx.on_schedule(); @@ -401,25 +401,22 @@ impl SchedulerInner { } fn scale_pool_size(&self, pool_size: usize) { - self.worker_pool.pool.scale_pool_size(pool_size); - self.high_priority_pool - .pool - .scale_pool_size(std::cmp::max(1, pool_size / 2)); + self.sched_worker_pool.scale_pool_size(pool_size); } } -/// Scheduler which schedules the execution of `storage::Command`s. +/// TxnScheduler which schedules the execution of `storage::Command`s. #[derive(Clone)] -pub struct Scheduler { - inner: Arc>, +pub struct TxnScheduler { + inner: Arc>, // The engine can be fetched from the thread local storage of scheduler threads. // So, we don't store the engine here. 
_engine: PhantomData, } -unsafe impl Send for Scheduler {} +unsafe impl Send for TxnScheduler {} -impl Scheduler { +impl TxnScheduler { /// Creates a scheduler. pub(in crate::storage) fn new( engine: E, @@ -433,6 +430,7 @@ impl Scheduler { resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, feature_gate: FeatureGate, + resource_ctl: Option>, ) -> Self { let t = Instant::now_coarse(); let mut task_slots = Vec::with_capacity(TASKS_SLOTS_NUM); @@ -442,25 +440,18 @@ impl Scheduler { let lock_wait_queues = LockWaitQueues::new(lock_mgr.clone()); - let inner = Arc::new(SchedulerInner { + let inner = Arc::new(TxnSchedulerInner { task_slots, id_alloc: AtomicU64::new(0).into(), latches: Latches::new(config.scheduler_concurrency), running_write_bytes: AtomicUsize::new(0).into(), sched_pending_write_threshold: config.scheduler_pending_write_threshold.0 as usize, - worker_pool: SchedPool::new( - engine.clone(), - config.scheduler_worker_pool_size, - reporter.clone(), - feature_gate.clone(), - "sched-worker-pool", - ), - high_priority_pool: SchedPool::new( + sched_worker_pool: SchedPool::new( engine, - std::cmp::max(1, config.scheduler_worker_pool_size / 2), + config.scheduler_worker_pool_size, reporter, feature_gate.clone(), - "sched-high-pri-pool", + resource_ctl, ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -481,7 +472,7 @@ impl Scheduler { t.saturating_elapsed(), "initialized the transaction scheduler" ); - Scheduler { + TxnScheduler { inner, _engine: PhantomData, } @@ -561,26 +552,19 @@ impl Scheduler { return; } let task = tctx.task.as_ref().unwrap(); - let deadline = task.cmd.deadline(); - let cmd_ctx = task.cmd.ctx().clone(); - self.fail_fast_or_check_deadline(cid, tag, cmd_ctx, deadline); + self.fail_fast_or_check_deadline(cid, &task.cmd); fail_point!("txn_scheduler_acquire_fail"); } - fn fail_fast_or_check_deadline( - &self, - cid: u64, - tag: CommandKind, - cmd_ctx: Context, - deadline: Deadline, - ) { + fn 
fail_fast_or_check_deadline(&self, cid: u64, cmd: &Command) { + let tag = cmd.tag(); + let ctx = cmd.ctx().clone(); + let deadline = cmd.deadline(); let sched = self.clone(); - self.inner - .high_priority_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&cmd.group_name(), cmd.priority(), async move { match unsafe { - with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&cmd_ctx)) + with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&ctx)) } { // Precheck failed, try to return err early. Err(e) => { @@ -632,14 +616,12 @@ impl Scheduler { self.execute(task); } Ok(None) => {} - Err(err) => { + Err((group_name, pri, err)) => { // Spawn the finish task to the pool to avoid stack overflow // when many queuing tasks fail successively. let this = self.clone(); - self.inner - .worker_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&group_name, pri, async move { this.finish_with_err(cid, err); }) .unwrap(); @@ -670,21 +652,17 @@ impl Scheduler { } // pub for test - pub fn get_sched_pool(&self, priority: CommandPri) -> &SchedPool { - if priority == CommandPri::High { - &self.inner.high_priority_pool - } else { - &self.inner.worker_pool - } + pub fn get_sched_pool(&self) -> &SchedPool { + &self.inner.sched_worker_pool } /// Executes the task in the sched pool. fn execute(&self, mut task: Task) { set_tls_tracker_token(task.tracker); let sched = self.clone(); - self.get_sched_pool(task.cmd.priority()) - .pool - .spawn(async move { + + self.get_sched_pool() + .spawn(&task.cmd.group_name(), task.cmd.priority(), async move { fail_point!("scheduler_start_execute"); if sched.check_task_deadline_exceeded(&task) { return; @@ -800,6 +778,7 @@ impl Scheduler { async_apply_prewrite: bool, new_acquired_locks: Vec, tag: CommandKind, + group_name: &str, ) { // TODO: Does async apply prewrite worth a special metric here? 
if pipelined { @@ -847,7 +826,7 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - self.on_acquired_locks_finished(new_acquired_locks); + self.on_acquired_locks_finished(group_name, new_acquired_locks); if do_wake_up { let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; @@ -932,7 +911,11 @@ impl Scheduler { ); } - fn on_release_locks(&self, released_locks: ReleasedLocks) -> SVec> { + fn on_release_locks( + &self, + group_name: &str, + released_locks: ReleasedLocks, + ) -> SVec> { // This function is always called when holding the latch of the involved keys. // So if we found the lock waiting queues are empty, there's no chance // that other threads/commands adds new lock-wait entries to the keys @@ -973,13 +956,21 @@ impl Scheduler { }); if !legacy_wake_up_list.is_empty() || !delay_wake_up_futures.is_empty() { - self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + self.wake_up_legacy_pessimistic_locks( + group_name, + legacy_wake_up_list, + delay_wake_up_futures, + ); } resumable_wake_up_list } - fn on_acquired_locks_finished(&self, new_acquired_locks: Vec) { + fn on_acquired_locks_finished( + &self, + group_name: &str, + new_acquired_locks: Vec, + ) { if new_acquired_locks.is_empty() || self.inner.lock_wait_queues.is_empty() { return; } @@ -992,9 +983,8 @@ impl Scheduler { .update_lock_wait(new_acquired_locks); } else { let lock_wait_queues = self.inner.lock_wait_queues.clone(); - self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { lock_wait_queues.update_lock_wait(new_acquired_locks); }) .unwrap(); @@ -1003,15 +993,16 @@ impl Scheduler { fn wake_up_legacy_pessimistic_locks( &self, + group_name: &str, legacy_wake_up_list: impl IntoIterator, ReleasedLock)> + Send + 'static, delayed_wake_up_futures: impl IntoIterator + Send + 'static, ) { let self1 = self.clone(); - 
self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + let group_name1 = group_name.to_owned(); + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { for (lock_info, released_lock) in legacy_wake_up_list { let cb = lock_info.key_cb.unwrap().into_inner(); let e = StorageError::from(Error::from(MvccError::from( @@ -1030,9 +1021,8 @@ impl Scheduler { for f in delayed_wake_up_futures { let self2 = self1.clone(); self1 - .get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + .get_sched_pool() + .spawn(&group_name1, CommandPri::High, async move { let res = f.await; if let Some(resumable_lock_wait_entry) = res { self2.schedule_awakened_pessimistic_locks( @@ -1121,7 +1111,7 @@ impl Scheduler { } /// Processes a read command within a worker thread, then posts - /// `ReadFinished` message back to the `Scheduler`. + /// `ReadFinished` message back to the `TxnScheduler`. fn process_read(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_read"); debug!("process read cmd in worker pool"; "cid" => task.cid); @@ -1144,12 +1134,13 @@ impl Scheduler { /// Processes a write command within a worker thread, then posts either a /// `WriteFinished` message if successful or a `FinishedWithErr` message - /// back to the `Scheduler`. + /// back to the `TxnScheduler`. 
async fn process_write(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_write"); let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); let cid = task.cid; + let group_name = task.cmd.group_name(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -1285,7 +1276,7 @@ impl Scheduler { } let woken_up_resumable_entries = if !released_locks.is_empty() { - scheduler.on_release_locks(released_locks) + scheduler.on_release_locks(&group_name, released_locks) } else { smallvec![] }; @@ -1306,6 +1297,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1336,6 +1328,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1522,6 +1515,7 @@ impl Scheduler { is_async_apply_prewrite, new_acquired_locks, tag, + &group_name, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC .get(tag) @@ -1828,7 +1822,7 @@ mod tests { } // TODO(cosven): use this in the following test cases to reduce duplicate code. 
- fn new_test_scheduler() -> (Scheduler, RocksEngine) { + fn new_test_scheduler() -> (TxnScheduler, RocksEngine) { let engine = TestEngineBuilder::new().build().unwrap(); let config = Config { scheduler_concurrency: 1024, @@ -1838,7 +1832,7 @@ mod tests { ..Default::default() }; ( - Scheduler::new( + TxnScheduler::new( engine.clone(), MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -1854,6 +1848,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ), engine, ) @@ -1978,31 +1973,7 @@ mod tests { #[test] fn test_acquire_latch_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2084,38 +2055,15 @@ mod tests { #[test] fn test_pool_available_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let 
scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); // Spawn a task that sleeps for 500ms to occupy the pool. The next request // cannot run within 500ms. scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(async { thread::sleep(Duration::from_millis(500)) }) + .get_sched_pool() + .spawn("", CommandPri::Normal, async { + thread::sleep(Duration::from_millis(500)) + }) .unwrap(); let mut req = BatchRollbackRequest::default(); @@ -2144,31 +2092,7 @@ mod tests { #[test] fn test_flow_control_trottle_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut req = CheckTxnStatusRequest::default(); req.mut_context().max_execution_duration_ms = 100; @@ 
-2212,31 +2136,7 @@ mod tests { #[test] fn test_accumulate_many_expired_commands() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2283,7 +2183,7 @@ mod tests { let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); - let scheduler = Scheduler::new( + let scheduler = TxnScheduler::new( engine, MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -2299,6 +2199,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ); // Use sync mode if pipelined_pessimistic_lock is false. 
assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/tests/Cargo.toml b/tests/Cargo.toml index ae6c6984487..1cc0e6bce87 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -95,6 +95,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 2508b544285..1a7d44db972 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -312,10 +312,7 @@ fn test_scale_scheduler_pool() { .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)) .unwrap(); assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), size ); }; From 7240e5778ef3c379b0f898c103dc675fad7af099 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 18 Jan 2023 11:47:50 +0800 Subject: [PATCH 461/676] fix docker build (#13937) ref tikv/tikv#11312 Fix `make docker`. 
Signed-off-by: tabokie --- Dockerfile | 11 ++++++++--- cmd/build.rs | 4 +++- components/profiler/Cargo.toml | 1 + scripts/check-docker-build | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index c4ad36dc6e7..aefa51b2222 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,11 @@ RUN ln -s /usr/bin/cmake3 /usr/bin/cmake ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH +# Install protoc +RUN curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip" +RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ +ENV PATH /usr/local/bin/:$PATH + # Install Rustup RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y ENV PATH /root/.cargo/bin/:$PATH @@ -72,8 +77,7 @@ RUN mkdir -p ./cmd/tikv-ctl/src ./cmd/tikv-server/src && \ echo 'fn main() {}' > ./cmd/tikv-ctl/src/main.rs && \ echo 'fn main() {}' > ./cmd/tikv-server/src/main.rs && \ for cargotoml in $(find . -type f -name "Cargo.toml"); do \ - sed -i '/fuzz/d' ${cargotoml} && \ - sed -i '/profiler/d' ${cargotoml} ; \ + sed -i '/fuzz/d' ${cargotoml} ; \ done COPY Makefile ./ @@ -105,8 +109,9 @@ FROM pingcap/alpine-glibc COPY --from=builder /tikv/target/release/tikv-server /tikv-server COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl +# FIXME: Figure out why libstdc++ is not staticly linked. RUN apk add --no-cache \ - curl + curl libstdc++ EXPOSE 20160 20180 diff --git a/cmd/build.rs b/cmd/build.rs index 6d11a38f705..c19797d9227 100644 --- a/cmd/build.rs +++ b/cmd/build.rs @@ -32,7 +32,9 @@ fn link_sys_lib(lib: &str, tool: &cc::Tool) { } // remove lib prefix and .a postfix. 
let libname = &lib[3..lib.len() - 2]; - println!("cargo:rustc-link-lib=static:+whole-archive={}", &libname); + // Get around the issue "the linking modifiers `+bundle` and `+whole-archive` + // are not compatible with each other when generating rlibs" + println!("cargo:rustc-link-lib=static:-bundle,+whole-archive={}", &libname); println!( "cargo:rustc-link-search=native={}", path.parent().unwrap().display() diff --git a/components/profiler/Cargo.toml b/components/profiler/Cargo.toml index b0c456b209f..e5583a631d5 100644 --- a/components/profiler/Cargo.toml +++ b/components/profiler/Cargo.toml @@ -18,4 +18,5 @@ valgrind_request = { version = "1.1.0", optional = true } [[example]] name = "prime" +path = "examples/prime.rs" required-features = ["profiling"] diff --git a/scripts/check-docker-build b/scripts/check-docker-build index 6a505f31a89..0eee0c5cf1f 100755 --- a/scripts/check-docker-build +++ b/scripts/check-docker-build @@ -2,7 +2,7 @@ # This script checks if all cargo targets have path specifications. 
set -euo pipefail -for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/\|./profiler/'); do +for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/'); do for target in "test" "bench" "bin" "example"; do # from "[[test]]" to the first trailing empty line matches=$(sed -n "/\[\[$target\]\]/,/^$/ p" $i) From b35d4fb33a18c5be9136c790e01ca449075e6acb Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 18 Jan 2023 14:57:51 +0800 Subject: [PATCH 462/676] pd_client: fix the kvproto compatibility (#14064) close tikv/tikv#14063 make sure kvproto compatibility Signed-off-by: husharp --- Cargo.lock | 2 +- components/error_code/src/pd.rs | 1 + components/pd_client/src/client.rs | 6 +----- components/pd_client/src/client_v2.rs | 6 +----- components/pd_client/src/errors.rs | 4 ++++ components/pd_client/src/util.rs | 1 + components/resource_control/src/resource_group.rs | 8 ++++---- etc/error_code.toml | 5 +++++ 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab1d164a1e0..a2924314f8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2727,7 +2727,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a14c44ef44b378d15adb5baad8402b838f031b51" +source = "git+https://github.com/pingcap/kvproto.git#adcf4c414bfd0ccf18436b377430aa2450fd4c81" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 3ca2ac0b29f..782c4f3923b 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -12,5 +12,6 @@ define_error_codes!( REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), GLOBAL_CONFIG_NOT_FOUND => ("GlobalConfigNotFound","",""), + DATA_COMPACTED => ("DataCompacted","",""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 5bccdcfacea..1e1e5980908 100644 --- 
a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -302,11 +302,7 @@ impl PdClient for RpcClient { Ok(grpc_response) => { let mut res = HashMap::with_capacity(grpc_response.get_items().len()); for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); } Ok(res) } diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index b42d8fb3ddb..35e5c3b4785 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -803,11 +803,7 @@ impl PdClient for RpcClient { Ok(grpc_response) => { let mut res = HashMap::with_capacity(grpc_response.get_items().len()); for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); } Ok(res) } diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index 61adceec391..689cb276064 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -26,6 +26,8 @@ pub enum Error { StoreTombstone(String), #[error("global config item {0} not found")] GlobalConfigNotFound(String), + #[error("required watch revision is smaller than current compact/min revision. 
{0:?}")] + DataCompacted(String), } pub type Result = result::Result; @@ -38,6 +40,7 @@ impl Error { | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) + | Error::DataCompacted(_) | Error::ClusterBootstrapped(_) | Error::Incompatible => false, } @@ -55,6 +58,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::pd::REGION_NOT_FOUND, Error::StoreTombstone(_) => error_code::pd::STORE_TOMBSTONE, Error::GlobalConfigNotFound(_) => error_code::pd::GLOBAL_CONFIG_NOT_FOUND, + Error::DataCompacted(_) => error_code::pd::DATA_COMPACTED, Error::Other(_) => error_code::pd::UNKNOWN, } } diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 72c8cc16b04..fd58cd921d8 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -873,6 +873,7 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::GlobalConfigNotFound => { Err(Error::GlobalConfigNotFound(err.get_message().to_owned())) } + ErrorType::DataCompacted => Err(Error::DataCompacted(err.get_message().to_owned())), ErrorType::Ok => Ok(()), ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), ErrorType::Unknown => Err(box_err!(err.get_message())), diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 70f89fd1a9d..bfe9d92d0f3 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -51,12 +51,12 @@ impl ResourceGroupManager { // TODO: currently we only consider the cpu usage in the read path, we may also take // io read bytes into account later. 
(GroupMode::RawMode, true) => rg - .get_resource_settings() + .get_raw_resource_settings() .get_cpu() .get_settings() .get_fill_rate(), (GroupMode::RawMode, false) => rg - .get_resource_settings() + .get_raw_resource_settings() .get_io_write() .get_settings() .get_fill_rate(), @@ -327,7 +327,7 @@ mod tests { .set_fill_rate(write_tokens); group.set_r_u_settings(ru_setting); } else { - let mut resource_setting = GroupResourceSettings::new(); + let mut resource_setting = GroupRawResourceSettings::new(); resource_setting .mut_cpu() .mut_settings() @@ -336,7 +336,7 @@ mod tests { .mut_io_write() .mut_settings() .set_fill_rate(write_tokens); - group.set_resource_settings(resource_setting); + group.set_raw_resource_settings(resource_setting); } group } diff --git a/etc/error_code.toml b/etc/error_code.toml index 5cdd770f8d2..6b361e29e37 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -263,6 +263,11 @@ error = ''' KV:Pd:GlobalConfigNotFound ''' +["KV:Pd:DataCompacted"] +error = ''' +KV:Pd:DataCompacted +''' + ["KV:Pd:Unknown"] error = ''' KV:Pd:Unknown From 15445fd8a9c6832afeaf335a84c334fa13f6ecfe Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 19 Jan 2023 11:23:49 +0800 Subject: [PATCH 463/676] raftstore-v2: add more features to pd worker v2 (#14003) ref tikv/tikv#12842 Signed-off-by: tabokie --- components/engine_panic/src/misc.rs | 4 + components/engine_panic/src/snapshot.rs | 10 +- components/engine_rocks/src/misc.rs | 12 + components/engine_rocks/src/snapshot.rs | 10 +- components/engine_traits/src/misc.rs | 2 + components/engine_traits/src/snapshot.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 17 +- components/raftstore-v2/src/lib.rs | 2 +- .../pd/{update_max_timestamp.rs => misc.rs} | 13 + components/raftstore-v2/src/worker/pd/mod.rs | 223 ++++++++-- .../pd/{region_heartbeat.rs => region.rs} | 180 +++++++- .../raftstore-v2/src/worker/pd/split.rs | 85 +++- .../pd/{store_heartbeat.rs => store.rs} | 11 + .../tests/integrations/cluster.rs | 6 
+- .../src/coprocessor/consistency_check.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 5 +- components/raftstore/src/store/mod.rs | 9 +- components/raftstore/src/store/worker/mod.rs | 5 +- components/raftstore/src/store/worker/pd.rs | 408 ++++++++---------- components/server/src/server2.rs | 68 ++- components/test_raftstore/src/util.rs | 5 +- src/server/raftkv2/node.rs | 14 +- 22 files changed, 781 insertions(+), 316 deletions(-) rename components/raftstore-v2/src/worker/pd/{update_max_timestamp.rs => misc.rs} (89%) rename components/raftstore-v2/src/worker/pd/{region_heartbeat.rs => region.rs} (58%) rename components/raftstore-v2/src/worker/pd/{store_heartbeat.rs => store.rs} (96%) diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 5e6fbe87267..93218767ec0 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -92,6 +92,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_num_keys(&self) -> Result { + panic!() + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index 296d7ce617a..f6cda5312cb 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -2,7 +2,9 @@ use std::ops::Deref; -use engine_traits::{IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + CfNamesExt, IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot, +}; use crate::{db_vector::PanicDbVector, engine::PanicEngine}; @@ -36,6 +38,12 @@ impl Iterable for PanicSnapshot { } } +impl CfNamesExt for PanicSnapshot { + fn cf_names(&self) -> Vec<&str> { + panic!() + } +} + pub struct PanicSnapshotIterator; impl Iterator for PanicSnapshotIterator { diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index e339facaac4..3477226ae76 100644 --- 
a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -332,6 +332,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index b19a32fd739..60a12c4ac6d 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -5,7 +5,9 @@ use std::{ sync::Arc, }; -use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ @@ -95,3 +97,9 @@ impl Peekable for RocksSnapshot { Ok(v.map(RocksDbVector::from_raw)) } } + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() + } +} diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index d9a07a1a915..5bbcbb2de79 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -115,6 +115,8 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn get_total_sst_files_size_cf(&self, cf: &str) -> Result>; + fn get_num_keys(&self) -> Result; + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 7907abd1445..a5829161e25 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -2,7 +2,7 @@ use std::fmt::Debug; -use 
crate::{iterable::Iterable, peekable::Peekable}; +use crate::{iterable::Iterable, peekable::Peekable, CfNamesExt}; /// A consistent read-only view of the database. /// @@ -10,6 +10,6 @@ use crate::{iterable::Iterable, peekable::Peekable}; /// clonable, call `into_sync` to create a `SyncSnapshot`. pub trait Snapshot where - Self: 'static + Peekable + Iterable + Send + Sync + Sized + Debug, + Self: 'static + Peekable + Iterable + CfNamesExt + Send + Sync + Sized + Debug, { } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index ccf3f19f3ea..280e8dcc396 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -26,10 +26,11 @@ use raftstore::{ store::{ fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, local_metrics::RaftMetrics, - Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, - TabletSnapManager, Transport, WriteSenders, + AutoSplitController, Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, }, }; +use resource_metering::CollectorRegHandle; use slog::{warn, Logger}; use tikv_util::{ box_err, @@ -511,6 +512,8 @@ impl StoreSystem { concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, ) -> Result<()> @@ -526,7 +529,9 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let purge_worker = if raft_engine.need_manual_purge() { + let purge_worker = if raft_engine.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = raft_engine.clone(); let logger = self.logger.clone(); @@ -567,10 +572,14 @@ impl StoreSystem { workers.pd.remote(), 
concurrency_manager, causal_ts_provider, + workers.pd.scheduler(), + auto_split_controller, + store_meta.lock().unwrap().region_read_progress.clone(), + collector_reg_handle, self.logger.clone(), self.shutdown.clone(), cfg.clone(), - )); + )?); let split_check_scheduler = workers.background.start( "split-check", diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 7ddb1687d91..b82b6de3931 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -41,4 +41,4 @@ pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; -pub use worker::pd::{FlowReporter, Task as PdTask}; +pub use worker::pd::{PdReporter, Task as PdTask}; diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/misc.rs similarity index 89% rename from components/raftstore-v2/src/worker/pd/update_max_timestamp.rs rename to components/raftstore-v2/src/worker/pd/misc.rs index 178d00ebd15..68c624b089a 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/misc.rs @@ -107,4 +107,17 @@ where self.remote.spawn(f); } } + + pub fn handle_report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) { + let resp = self + .pd_client + .report_min_resolved_ts(store_id, min_resolved_ts); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report min resolved_ts failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index b54d088db66..b23d1500914 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -10,12 +10,14 @@ use collections::HashMap; use 
concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; -use pd_client::PdClient; +use pd_client::{BucketStat, PdClient}; use raftstore::store::{ - util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TabletSnapManager, TxnExt, - WriteStats, + util::KeysInfoFormatter, AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, + ReadStats, RegionReadProgressRegistry, SplitInfo, StoreStatsReporter, TabletSnapManager, + TxnExt, WriteStats, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }; -use slog::{error, info, Logger}; +use resource_metering::{Collector, CollectorRegHandle, RawRecords}; +use slog::{error, Logger}; use tikv_util::{ config::VersionTrack, time::UnixSecs, @@ -28,22 +30,36 @@ use crate::{ router::{CmdResChannel, PeerMsg}, }; -mod region_heartbeat; +mod misc; +mod region; mod split; -mod store_heartbeat; -mod update_max_timestamp; +mod store; -pub use region_heartbeat::RegionHeartbeatTask; +pub use region::RegionHeartbeatTask; + +type RecordPairVec = Vec; pub enum Task { - RegionHeartbeat(RegionHeartbeatTask), + // In store.rs. StoreHeartbeat { stats: pdpb::StoreStats, // TODO: StoreReport, StoreDrAutoSyncStatus }, + UpdateStoreInfos { + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + }, + // In region.rs. + RegionHeartbeat(RegionHeartbeatTask), + ReportRegionBuckets(BucketStat), + UpdateReadStats(ReadStats), + UpdateWriteStats(WriteStats), + UpdateRegionCpuRecords(Arc), DestroyPeer { region_id: u64, }, + // In split.rs. AskBatchSplit { region: metapb::Region, split_keys: Vec>, @@ -54,24 +70,51 @@ pub enum Task { ReportBatchSplit { regions: Vec, }, + AutoSplit { + split_infos: Vec, + }, + // In misc.rs. 
UpdateMaxTimestamp { region_id: u64, initial_status: u64, txn_ext: Arc, }, + ReportMinResolvedTs { + store_id: u64, + min_resolved_ts: u64, + }, } impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {stats:?}") + } + Task::UpdateStoreInfos { + ref cpu_usages, + ref read_io_rates, + ref write_io_rates, + } => write!( + f, + "get store's information: cpu_usages {:?}, read_io_rates {:?}, write_io_rates {:?}", + cpu_usages, read_io_rates, write_io_rates, + ), Task::RegionHeartbeat(ref hb_task) => write!( f, "region heartbeat for region {:?}, leader {}", hb_task.region, hb_task.peer.get_id(), ), - Task::StoreHeartbeat { ref stats, .. } => { - write!(f, "store heartbeat stats: {:?}", stats) + Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), + Task::UpdateReadStats(ref stats) => { + write!(f, "update read stats: {stats:?}") + } + Task::UpdateWriteStats(ref stats) => { + write!(f, "update write stats: {stats:?}") + } + Task::UpdateRegionCpuRecords(ref cpu_records) => { + write!(f, "get region cpu records: {:?}", cpu_records) } Task::DestroyPeer { ref region_id } => { write!(f, "destroy peer of region {}", region_id) @@ -87,11 +130,22 @@ impl Display for Task { KeysInfoFormatter(split_keys.iter()) ), Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::AutoSplit { ref split_infos } => { + write!(f, "auto split split regions, num is {}", split_infos.len()) + } Task::UpdateMaxTimestamp { region_id, .. 
} => write!( f, "update the max timestamp for region {} in the concurrency manager", region_id ), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => write!( + f, + "report min resolved ts: store {}, resolved ts {}", + store_id, min_resolved_ts, + ), } } } @@ -108,16 +162,18 @@ where tablet_registry: TabletRegistry, snap_mgr: TabletSnapManager, router: StoreRouter, + stats_monitor: PdStatsMonitor, remote: Remote, - region_peers: HashMap, - - // For store_heartbeat. + // For store. start_ts: UnixSecs, - store_stat: store_heartbeat::StoreStat, + store_stat: store::StoreStat, - // For region_heartbeat. + // For region. + region_peers: HashMap, + region_buckets: HashMap, + // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, is_hb_receiver_scheduled: bool, @@ -146,21 +202,38 @@ where remote: Remote, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + pd_scheduler: Scheduler, + auto_split_controller: AutoSplitController, + region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, logger: Logger, shutdown: Arc, cfg: Arc>, - ) -> Self { - Self { + ) -> Result { + let mut stats_monitor = PdStatsMonitor::new( + cfg.value().pd_store_heartbeat_tick_interval.0 / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + cfg.value().report_min_resolved_ts_interval.0, + PdReporter::new(pd_scheduler, logger.clone()), + ); + stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + )?; + Ok(Self { store_id, pd_client, raft_engine, tablet_registry, snap_mgr, router, + stats_monitor, remote, - region_peers: HashMap::default(), start_ts: UnixSecs::zero(), - store_stat: store_heartbeat::StoreStat::default(), + store_stat: store::StoreStat::default(), + region_peers: HashMap::default(), + region_buckets: HashMap::default(), region_cpu_records: HashMap::default(), is_hb_receiver_scheduled: false, concurrency_manager, @@ 
-168,7 +241,7 @@ where logger, shutdown, cfg, - } + }) } } @@ -183,8 +256,17 @@ where fn run(&mut self, task: Task) { self.maybe_schedule_heartbeat_receiver(); match task { - Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), + Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), + Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), Task::AskBatchSplit { region, @@ -194,51 +276,98 @@ where ch, } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { region_id, initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => self.handle_report_min_resolved_ts(store_id, min_resolved_ts), } } } -impl Runner -where - EK: KvEngine, - ER: RaftEngine, - T: PdClient + 'static, -{ - fn handle_destroy_peer(&mut self, region_id: u64) { - match self.region_peers.remove(®ion_id) { - None => {} - Some(_) => { - info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) - } +#[derive(Clone)] +pub struct PdReporter { + scheduler: Scheduler, + logger: Logger, +} + +impl PdReporter { + pub fn new(scheduler: Scheduler, logger: Logger) -> Self { + PdReporter { scheduler, logger 
} + } +} + +impl FlowStatsReporter for PdReporter { + fn report_read_stats(&self, stats: ReadStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateReadStats(stats)) { + error!(self.logger, "Failed to send read flow statistics"; "err" => ?e); + } + } + + fn report_write_stats(&self, stats: WriteStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateWriteStats(stats)) { + error!(self.logger, "Failed to send write flow statistics"; "err" => ?e); } } } -#[derive(Clone)] -pub struct FlowReporter { - _scheduler: Scheduler, +impl Collector for PdReporter { + fn collect(&self, records: Arc) { + self.scheduler + .schedule(Task::UpdateRegionCpuRecords(records)) + .ok(); + } } -impl FlowReporter { - pub fn new(scheduler: Scheduler) -> Self { - FlowReporter { - _scheduler: scheduler, +impl StoreStatsReporter for PdReporter { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send store infos to pd worker"; + "err" => ?e, + ); } } -} -impl FlowStatsReporter for FlowReporter { - fn report_read_stats(&self, _read_stats: ReadStats) { - // TODO + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } } - fn report_write_stats(&self, _write_stats: WriteStats) { - // TODO + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } } } diff --git 
a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region.rs similarity index 58% rename from components/raftstore-v2/src/worker/pd/region_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/region.rs index 31f84801ed2..d282534329b 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -1,10 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, pdpb}; -use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; +use pd_client::{ + merge_bucket_stats, metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat, +}; +use raftstore::store::{ReadStats, WriteStats}; +use resource_metering::RawRecords; use slog::{debug, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; @@ -44,6 +49,58 @@ pub struct PeerStat { pub approximate_size: u64, } +#[derive(Default)] +pub struct ReportBucket { + current_stat: BucketStat, + last_report_stat: Option, + last_report_ts: UnixSecs, +} + +impl ReportBucket { + fn new(current_stat: BucketStat) -> Self { + Self { + current_stat, + ..Default::default() + } + } + + fn report(&mut self, report_ts: UnixSecs) -> BucketStat { + self.last_report_ts = report_ts; + match self.last_report_stat.replace(self.current_stat.clone()) { + Some(last) => { + let mut delta = BucketStat::new( + self.current_stat.meta.clone(), + pd_client::new_bucket_stats(&self.current_stat.meta), + ); + // Buckets may be changed, recalculate last stats according to current meta. 
+ merge_bucket_stats( + &delta.meta.keys, + &mut delta.stats, + &last.meta.keys, + &last.stats, + ); + for i in 0..delta.meta.keys.len() - 1 { + delta.stats.write_bytes[i] = + self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; + delta.stats.write_keys[i] = + self.current_stat.stats.write_keys[i] - delta.stats.write_keys[i]; + delta.stats.write_qps[i] = + self.current_stat.stats.write_qps[i] - delta.stats.write_qps[i]; + + delta.stats.read_bytes[i] = + self.current_stat.stats.read_bytes[i] - delta.stats.read_bytes[i]; + delta.stats.read_keys[i] = + self.current_stat.stats.read_keys[i] - delta.stats.read_keys[i]; + delta.stats.read_qps[i] = + self.current_stat.stats.read_qps[i] - delta.stats.read_qps[i]; + } + delta + } + None => self.current_stat.clone(), + } + } +} + impl Runner where EK: KvEngine, @@ -244,4 +301,123 @@ where self.remote.spawn(f); self.is_hb_receiver_scheduled = true; } + + pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { + let region_id = region_buckets.meta.region_id; + self.merge_buckets(region_buckets); + let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); + let last_report_ts = if report_buckets.last_report_ts.is_zero() { + self.start_ts + } else { + report_buckets.last_report_ts + }; + let now = UnixSecs::now(); + let interval_second = now.into_inner() - last_report_ts.into_inner(); + let delta = report_buckets.report(now); + let resp = self + .pd_client + .report_region_buckets(&delta, Duration::from_secs(interval_second)); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send buckets"; + "region_id" => region_id, + "version" => delta.meta.version, + "region_epoch" => ?delta.meta.region_epoch, + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + 
.region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.read_bytes += region_info.flow.read_bytes as u64; + peer_stat.read_keys += region_info.flow.read_keys as u64; + self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; + self.store_stat.engine_total_keys_read += region_info.flow.read_keys as u64; + peer_stat + .query_stats + .add_query_stats(®ion_info.query_stats.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.query_stats.0); + } + for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(region_buckets); + } + if !stats.region_infos.is_empty() { + self.stats_monitor.maybe_send_read_stats(stats); + } + } + + pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.query_stats.add_query_stats(®ion_info.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.0); + } + } + + pub fn handle_update_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. 
+ self.stats_monitor.maybe_send_cpu_stats(&records); + Self::calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); + } + + pub fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } + + fn merge_buckets(&mut self, mut buckets: BucketStat) { + let region_id = buckets.meta.region_id; + self.region_buckets + .entry(region_id) + .and_modify(|report_bucket| { + let current = &mut report_bucket.current_stat; + if current.meta < buckets.meta { + std::mem::swap(current, &mut buckets); + } + + merge_bucket_stats( + ¤t.meta.keys, + &mut current.stats, + &buckets.meta.keys, + &buckets.stats, + ); + }) + .or_insert_with(|| ReportBucket::new(buckets)); + } + + fn calculate_region_cpu_records( + store_id: u64, + records: Arc, + region_cpu_records: &mut HashMap, + ) { + for (tag, record) in &records.records { + let record_store_id = tag.store_id; + if record_store_id != store_id { + continue; + } + // Reporting a region heartbeat later will clear the corresponding record. 
+ *region_cpu_records.entry(tag.region_id).or_insert(0) += record.cpu_time; + } + } } diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index cb7c3ad9308..bf13e01120a 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -6,10 +6,12 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, }; use pd_client::PdClient; -use slog::{info, warn}; +use raftstore::store::SplitInfo; +use slog::{info, warn, Logger}; +use yatp::{task::future::TaskCell, Remote}; use super::{requests::*, Runner}; -use crate::router::CmdResChannel; +use crate::{batch::StoreRouter, router::CmdResChannel}; fn new_batch_split_region_request( split_keys: Vec>, @@ -37,24 +39,50 @@ where ER: RaftEngine, T: PdClient + 'static, { + #[inline] pub fn handle_ask_batch_split( &mut self, - mut region: metapb::Region, + region: metapb::Region, split_keys: Vec>, peer: metapb::Peer, right_derive: bool, ch: CmdResChannel, + ) { + Self::ask_batch_split_imp( + &self.pd_client, + &self.logger, + &self.router, + &self.remote, + region, + split_keys, + peer, + right_derive, + Some(ch), + ); + } + + fn ask_batch_split_imp( + pd_client: &T, + logger: &Logger, + router: &StoreRouter, + remote: &Remote, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: Option, ) { if split_keys.is_empty() { - info!(self.logger, "empty split key, skip ask batch split"; - "region_id" => region.get_id()); + info!( + logger, + "empty split key, skip ask batch split"; + "region_id" => region.get_id() + ); return; } - let resp = self - .pd_client - .ask_batch_split(region.clone(), split_keys.len()); - let router = self.router.clone(); - let logger = self.logger.clone(); + let resp = pd_client.ask_batch_split(region.clone(), split_keys.len()); + let router = router.clone(); + let logger = logger.clone(); let f = async move { match resp.await { Ok(mut resp) => 
{ @@ -73,7 +101,7 @@ where ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); - send_admin_request(&logger, &router, region_id, epoch, peer, req, Some(ch)); + send_admin_request(&logger, &router, region_id, epoch, peer, req, ch); } Err(e) => { warn!( @@ -85,7 +113,7 @@ where } } }; - self.remote.spawn(f); + remote.spawn(f); } pub fn handle_report_batch_split(&mut self, regions: Vec) { @@ -98,4 +126,37 @@ where }; self.remote.spawn(f); } + + pub fn handle_auto_split(&mut self, split_infos: Vec) { + let pd_client = self.pd_client.clone(); + let logger = self.logger.clone(); + let router = self.router.clone(); + let remote = self.remote.clone(); + + let f = async move { + for split_info in split_infos { + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::ask_batch_split_imp( + &pd_client, + &logger, + &router, + &remote, + region, + vec![split_key], + split_info.peer, + true, + None, + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + // TODO: implement half split + } + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store.rs similarity index 96% rename from components/raftstore-v2/src/worker/pd/store_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/store.rs index ba75354c753..8f30b85d6f3 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -257,6 +257,17 @@ where self.remote.spawn(f); } + pub fn handle_update_store_infos( + &mut self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + self.store_stat.store_cpu_usages = cpu_usages; + self.store_stat.store_read_io_rates = read_io_rates; + self.store_stat.store_write_io_rates = write_io_rates; + } + /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index ce0248130fb..90f7c500903 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -33,7 +33,8 @@ use raftstore::{ coprocessor::CoprocessorHost, store::{ region_meta::{RegionLocalState, RegionMeta}, - Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + AutoSplitController, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, }, }; use raftstore_v2::{ @@ -41,6 +42,7 @@ use raftstore_v2::{ router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; +use resource_metering::CollectorRegHandle; use slog::{debug, o, Logger}; use 
tempfile::TempDir; use test_pd::mocker::Service; @@ -300,6 +302,8 @@ impl RunningState { concurrency_manager, causal_ts_provider, coprocessor_host, + AutoSplitController::default(), + CollectorRegHandle::new_for_test(), background.clone(), pd_worker, ) diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 5ba97089f85..2ebf27c963f 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; -use engine_traits::{KvEngine, Snapshot, ALL_CFS, CF_RAFT}; +use engine_traits::{KvEngine, Snapshot, CF_RAFT}; use kvproto::metapb::Region; use crate::{ @@ -63,7 +63,7 @@ fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); - for cf in ALL_CFS { + for cf in snap.cf_names() { snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3724eba13e2..b75aee3b4bb 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1516,7 +1516,9 @@ impl RaftBatchSystem { ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. 
- let purge_worker = if engines.raft.need_manual_purge() { + let purge_worker = if engines.raft.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = engines.raft.clone(); let router_clone = self.router(); @@ -1735,7 +1737,6 @@ impl RaftBatchSystem { Arc::clone(&pd_client), self.router.clone(), workers.pd_worker.scheduler(), - cfg.pd_store_heartbeat_tick_interval.0, auto_split_controller, concurrency_manager, snap_mgr, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 42fb320035b..fe3c12427bd 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -79,9 +79,10 @@ pub use self::{ worker::{ metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, - ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, - SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, - TrackVer, WriteStats, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, }; diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e021651ba3d..ac23f4e58d5 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -27,7 +27,8 @@ pub use self::{ consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, pd::{ new_change_peer_v2_request, 
FlowStatistics, FlowStatsReporter, HeartbeatTask, - Runner as PdRunner, Task as PdTask, + Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ @@ -44,5 +45,5 @@ pub use self::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, split_config::{SplitConfig, SplitConfigManager}, - split_controller::{AutoSplitController, ReadStats, SplitConfigChange, WriteStats}, + split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index fdfa1b44c85..79b58d75c83 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -69,6 +69,8 @@ use crate::{ }, }; +pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; + type RecordPairVec = Vec; #[derive(Default, Debug, Clone)] @@ -189,7 +191,6 @@ where id: u64, duration: RaftstoreDuration, }, - UpdateRegionCpuCollector(bool), RegionCpuRecords(Arc), ReportMinResolvedTs { store_id: u64, @@ -267,7 +268,7 @@ pub struct PeerStat { } #[derive(Default)] -pub struct ReportBucket { +struct ReportBucket { current_stat: BucketStat, last_report_stat: Option, last_report_ts: UnixSecs, @@ -418,12 +419,6 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } - Task::UpdateRegionCpuCollector(is_register) => { - if is_register { - return write!(f, "register region cpu collector"); - } - write!(f, "deregister region cpu collector") - } Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } @@ -476,12 +471,83 @@ fn convert_record_pairs(m: HashMap) -> RecordPairVec { .collect() } -struct StatsMonitor +#[derive(Clone)] +pub struct WrappedScheduler(Scheduler>); + 
+impl Collector for WrappedScheduler where EK: KvEngine, ER: RaftEngine, { - scheduler: Scheduler>, + fn collect(&self, records: Arc) { + self.0.schedule(Task::RegionCpuRecords(records)).ok(); + } +} + +pub trait StoreStatsReporter: Send + Clone + Sync + 'static + Collector { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ); + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); + fn auto_split(&self, split_infos: Vec); +} + +impl StoreStatsReporter for WrappedScheduler +where + EK: KvEngine, + ER: RaftEngine, +{ + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::StoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send store infos to pd worker"; + "err" => ?e, + ); + } + } + + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } + } + + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } + } +} + +pub struct StatsMonitor +where + T: StoreStatsReporter, +{ + reporter: T, handle: Option>, timer: Option>, read_stats_sender: Option>, @@ -492,18 +558,13 @@ where report_min_resolved_ts_interval: Duration, } -impl StatsMonitor +impl StatsMonitor where - EK: KvEngine, - ER: RaftEngine, + T: StoreStatsReporter, { - pub fn new( - interval: Duration, - report_min_resolved_ts_interval: Duration, - scheduler: Scheduler>, - ) -> Self { + pub fn new(interval: Duration, report_min_resolved_ts_interval: Duration, reporter: T) -> Self 
{ StatsMonitor { - scheduler, + reporter, handle: None, timer: None, read_stats_sender: None, @@ -524,11 +585,10 @@ where &mut self, mut auto_split_controller: AutoSplitController, region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, store_id: u64, ) -> Result<(), io::Error> { - if self.collect_tick_interval < default_collect_tick_interval() - || self.collect_store_infos_interval < self.collect_tick_interval - { + if self.collect_tick_interval < default_collect_tick_interval() { info!( "interval is too small, skip stats monitoring. If we are running tests, it is normal, otherwise a check is needed." ); @@ -555,7 +615,7 @@ where let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); self.cpu_stats_sender = Some(cpu_stats_sender); - let scheduler = self.scheduler.clone(); + let reporter = self.reporter.clone(); let props = tikv_util::thread_group::current_properties(); fn is_enable_tick(timer_cnt: u64, interval: u64) -> bool { @@ -570,13 +630,23 @@ where // make sure the record won't be disturbed. let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); + let mut region_cpu_records_collector = None; + // Register the region CPU records collector. 
+ if auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio + > 0.0 + { + region_cpu_records_collector = + Some(collector_reg_handle.register(Box::new(reporter.clone()), false)); + } while let Err(mpsc::RecvTimeoutError::Timeout) = timer_rx.recv_timeout(tick_interval) { if is_enable_tick(timer_cnt, collect_store_infos_interval) { StatsMonitor::collect_store_infos( &mut collect_store_infos_thread_stats, - &scheduler, + &reporter, ); } if is_enable_tick(timer_cnt, load_base_split_check_interval) { @@ -585,14 +655,15 @@ where &read_stats_receiver, &cpu_stats_receiver, &mut load_base_split_thread_stats, - &scheduler, + &reporter, + &collector_reg_handle, + &mut region_cpu_records_collector, ); } if is_enable_tick(timer_cnt, report_min_resolved_ts_interval) { - StatsMonitor::report_min_resolved_ts( - ®ion_read_progress, + reporter.report_min_resolved_ts( store_id, - &scheduler, + region_read_progress.get_min_resolved_ts(), ); } timer_cnt += 1; @@ -604,26 +675,13 @@ where Ok(()) } - pub fn collect_store_infos( - thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, - ) { + pub fn collect_store_infos(thread_stats: &mut ThreadInfoStatistics, reporter: &T) { thread_stats.record(); let cpu_usages = convert_record_pairs(thread_stats.get_cpu_usages()); let read_io_rates = convert_record_pairs(thread_stats.get_read_io_rates()); let write_io_rates = convert_record_pairs(thread_stats.get_write_io_rates()); - let task = Task::StoreInfos { - cpu_usages, - read_io_rates, - write_io_rates, - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send store infos to pd worker"; - "err" => ?e, - ); - } + reporter.report_store_infos(cpu_usages, read_io_rates, write_io_rates); } pub fn load_base_split( @@ -631,16 +689,19 @@ where read_stats_receiver: &Receiver, cpu_stats_receiver: &Receiver>, thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, + reporter: &T, + collector_reg_handle: &CollectorRegHandle, + 
region_cpu_records_collector: &mut Option, ) { let start_time = TiInstant::now(); match auto_split_controller.refresh_and_check_cfg() { SplitConfigChange::UpdateRegionCpuCollector(is_register) => { - if let Err(e) = scheduler.schedule(Task::UpdateRegionCpuCollector(is_register)) { - error!( - "failed to register or deregister the region cpu collector"; - "is_register" => is_register, - "err" => ?e, + // If it's a deregister task, just take and drop the original collector. + if !is_register { + region_cpu_records_collector.take(); + } else { + region_cpu_records_collector.get_or_insert( + collector_reg_handle.register(Box::new(reporter.clone()), false), ); } } @@ -658,13 +719,7 @@ where let (top_qps, split_infos) = auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); auto_split_controller.clear(); - let task = Task::AutoSplit { split_infos }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send split infos to pd worker"; - "err" => ?e, - ); - } + reporter.auto_split(split_infos); for i in 0..TOP_N { if i < top_qps.len() { READ_QPS_TOPN @@ -677,23 +732,6 @@ where LOAD_BASE_SPLIT_DURATION_HISTOGRAM.observe(start_time.saturating_elapsed_secs()); } - pub fn report_min_resolved_ts( - region_read_progress: &RegionReadProgressRegistry, - store_id: u64, - scheduler: &Scheduler>, - ) { - let task = Task::ReportMinResolvedTs { - store_id, - min_resolved_ts: region_read_progress.get_min_resolved_ts(), - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send min resolved ts to pd worker"; - "err" => ?e, - ); - } - } - pub fn stop(&mut self) { if let Some(h) = self.handle.take() { drop(self.timer.take()); @@ -705,14 +743,22 @@ where } } - #[inline(always)] - fn get_read_stats_sender(&self) -> &Option> { - &self.read_stats_sender + #[inline] + pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { + if let Some(sender) = &self.read_stats_sender { + if sender.send(read_stats).is_err() { + warn!("send read_stats 
failed, are we shutting down?") + } + } } - #[inline(always)] - fn get_cpu_stats_sender(&self) -> &Option>> { - &self.cpu_stats_sender + #[inline] + pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { + if let Some(sender) = &self.cpu_stats_sender { + if sender.send(cpu_stats.clone()).is_err() { + warn!("send region cpu info failed, are we shutting down?") + } + } } } @@ -845,37 +891,6 @@ impl SlowScore { } } -// RegionCpuMeteringCollector is used to collect the region-related CPU info. -struct RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - scheduler: Scheduler>, -} - -impl RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn new(scheduler: Scheduler>) -> RegionCpuMeteringCollector { - RegionCpuMeteringCollector { scheduler } - } -} - -impl Collector for RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn collect(&self, records: Arc) { - self.scheduler - .schedule(Task::RegionCpuRecords(records)) - .ok(); - } -} - pub struct Runner where EK: KvEngine, @@ -896,11 +911,9 @@ where // actually it is the sender connected to Runner's Worker which // calls Runner's run() on Task received. 
scheduler: Scheduler>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, store_heartbeat_interval: Duration, - collector_reg_handle: CollectorRegHandle, - region_cpu_records_collector: Option, // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, @@ -922,15 +935,12 @@ where ER: RaftEngine, T: PdClient + 'static, { - const INTERVAL_DIVISOR: u32 = 2; - pub fn new( cfg: &Config, store_id: u64, pd_client: Arc, router: RaftRouter, scheduler: Scheduler>, - store_heartbeat_interval: Duration, auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, @@ -941,25 +951,19 @@ where coprocessor_host: CoprocessorHost, causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Runner { - // Register the region CPU records collector. - let mut region_cpu_records_collector = None; - if auto_split_controller - .cfg - .region_cpu_overload_threshold_ratio - > 0.0 - { - region_cpu_records_collector = Some(collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(scheduler.clone())), - false, - )); - } - let interval = store_heartbeat_interval / Self::INTERVAL_DIVISOR; + let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; + let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, cfg.report_min_resolved_ts_interval.0, - scheduler.clone(), + WrappedScheduler(scheduler.clone()), ); - if let Err(e) = stats_monitor.start(auto_split_controller, region_read_progress, store_id) { + if let Err(e) = stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + ) { error!("failed to start stats collector, error = {:?}", e); } @@ -975,8 +979,6 @@ where scheduler, store_heartbeat_interval, stats_monitor, - collector_reg_handle, - region_cpu_records_collector, region_cpu_records: HashMap::default(), concurrency_manager, snap_mgr, @@ -1041,21 
+1043,6 @@ where self.remote.spawn(f); } - fn handle_update_region_cpu_collector(&mut self, is_register: bool) { - // If it's a deregister task, just take and drop the original collector. - if !is_register { - self.region_cpu_records_collector.take(); - return; - } - if self.region_cpu_records_collector.is_some() { - return; - } - self.region_cpu_records_collector = Some(self.collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(self.scheduler.clone())), - false, - )); - } - // Note: The parameter doesn't contain `self` because this function may // be called in an asynchronous context. fn handle_ask_batch_split( @@ -1604,11 +1591,7 @@ where self.merge_buckets(region_buckets); } if !read_stats.region_infos.is_empty() { - if let Some(sender) = self.stats_monitor.get_read_stats_sender() { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_read_stats(read_stats); } } @@ -1756,11 +1739,7 @@ where // TODO: more accurate CPU consumption of a specified region. fn handle_region_cpu_records(&mut self, records: Arc) { // Send Region CPU info to AutoSplitController inside the stats_monitor. - if let Some(cpu_stats_sender) = self.stats_monitor.get_cpu_stats_sender() { - if cpu_stats_sender.send(records.clone()).is_err() { - warn!("send region cpu info failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_cpu_stats(&records); calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); } @@ -1856,22 +1835,10 @@ where stats.set_is_busy(true); // We do not need to report store_info, so we just set `None` here. 
- let task = Task::StoreHeartbeat { - stats, - store_info: None, - report: None, - dr_autosync_status: None, - }; - if let Err(e) = self.scheduler.schedule(task) { - error!("force report store heartbeat failed"; - "store_id" => self.store_id, - "err" => ?e - ); - } else { - warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; - "store_id" => self.store_id, - ); - } + self.handle_store_heartbeat(stats, None, None, None); + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); } fn is_store_heartbeat_delayed(&self) -> bool { @@ -1954,48 +1921,43 @@ where let f = async move { for split_info in split_infos { - if let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await - { - // Try to split the region with the given split key. - if let Some(split_key) = split_info.split_key { - Self::handle_ask_batch_split( - router.clone(), - scheduler.clone(), - pd_client.clone(), - region, - vec![split_key], - split_info.peer, - true, - Callback::None, - String::from("auto_split"), - remote.clone(), + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::handle_ask_batch_split( + router.clone(), + scheduler.clone(), + pd_client.clone(), + region, + vec![split_key], + split_info.peer, + true, + Callback::None, + String::from("auto_split"), + remote.clone(), + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + let start_key = split_info.start_key.unwrap(); + let end_key = split_info.end_key.unwrap(); + let region_id = region.get_id(); + let msg = CasualMessage::HalfSplitRegion { + region_epoch: region.get_region_epoch().clone(), + start_key: Some(start_key.clone()), + end_key: Some(end_key.clone()), + policy: pdpb::CheckPolicy::Scan, + source: "auto_split", + cb: Callback::None, + }; + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + error!("send auto half split request failed"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => ?e, ); - return; - } - // Try to split the region on half within the given key range - // if there is no `split_key` been given. - if split_info.start_key.is_some() && split_info.end_key.is_some() { - let start_key = split_info.start_key.unwrap(); - let end_key = split_info.end_key.unwrap(); - let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { - region_epoch: region.get_region_epoch().clone(), - start_key: Some(start_key.clone()), - end_key: Some(end_key.clone()), - policy: pdpb::CheckPolicy::Scan, - source: "auto_split", - cb: Callback::None, - }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) - { - error!("send auto half split request failed"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(&start_key), - "end_key" => log_wrappers::Value::key(&end_key), - "err" => ?e, - ); - } } } } @@ -2124,9 +2086,6 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), - Task::UpdateRegionCpuCollector(is_register) => { - self.handle_update_region_cpu_collector(is_register) - } 
Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { store_id, @@ -2469,7 +2428,7 @@ mod tests { struct RunnerTest { store_stat: Arc>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, } impl RunnerTest { @@ -2481,13 +2440,16 @@ mod tests { let mut stats_monitor = StatsMonitor::new( Duration::from_secs(interval), Duration::from_secs(0), - scheduler, + WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - if let Err(e) = - stats_monitor.start(AutoSplitController::default(), region_read_progress, 1) - { + if let Err(e) = stats_monitor.start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 1, + ) { error!("failed to start stats collector, error = {:?}", e); } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 03b02e5f81e..36a02130fdb 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -47,7 +47,10 @@ use file_system::{ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; -use kvproto::{deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion}; +use kvproto::{ + deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion, + resource_usage_agent::create_resource_metering_pub_sub, +}; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; use raftstore::{ @@ -56,8 +59,8 @@ use raftstore::{ RawConsistencyCheckObserver, }, store::{ - memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, CheckLeaderRunner, SplitConfigManager, - TabletSnapManager, + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, AutoSplitController, CheckLeaderRunner, + SplitConfigManager, TabletSnapManager, }, RegionInfoAccessor, }; @@ -68,7 +71,7 @@ use tikv::{ config::{ConfigController, 
DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - read_pool::{build_yatp_read_pool, ReadPool}, + read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, gc_worker::{AutoGcConfig, GcWorker}, @@ -235,6 +238,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, + rsmeter_pubsub_service: resource_metering::PubSubService, } type LocalServer = Server>; @@ -635,7 +639,10 @@ where let engines = self.engines.as_ref().unwrap(); let pd_worker = LazyWorker::new("pd-worker"); - let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { let resource_ctl = self @@ -674,15 +681,16 @@ where let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( self.config.resource_metering.clone(), - collector_reg_handle, + collector_reg_handle.clone(), ); self.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( self.config.resource_metering.receiver_address.clone(), self.env.clone(), - data_sink_reg_handle, + data_sink_reg_handle.clone(), ); self.to_stop.push(single_target_worker); + let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( self.config.resource_metering.clone(), @@ -769,6 +777,22 @@ where cop_read_pools.handle() }; + let mut unified_read_pool_scale_receiver = None; + if self.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); + cfg_controller.register( + tikv::config::Module::Readpool, + 
Box::new(ReadPoolConfigManager::new( + unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, + &self.background_worker, + self.config.readpool.unified.max_thread_count, + self.config.readpool.unified.auto_adjust_pool_size, + )), + ); + unified_read_pool_scale_receiver = Some(rx); + } + let check_leader_runner = CheckLeaderRunner::new( self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), @@ -828,7 +852,17 @@ where let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); - cfg_controller.register(tikv::config::Module::Split, Box::new(split_config_manager)); + cfg_controller.register( + tikv::config::Module::Split, + Box::new(split_config_manager.clone()), + ); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.config.server.grpc_concurrency, + self.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); // `ConsistencyCheckObserver` must be registered before `Node::start`. 
let safe_point = Arc::new(AtomicU64::new(0)); @@ -858,6 +892,8 @@ where self.concurrency_manager.clone(), self.causal_ts_provider.clone(), self.coprocessor_host.clone().unwrap(), + auto_split_controller, + collector_reg_handle, self.background_worker.clone(), pd_worker, raft_store, @@ -882,7 +918,11 @@ where initial_metric(&self.config.metric); - self.servers = Some(Servers { lock_mgr, server }); + self.servers = Some(Servers { + lock_mgr, + server, + rsmeter_pubsub_service, + }); server_config } @@ -923,6 +963,16 @@ where &self.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + + if servers + .server + .register_service(create_resource_metering_pub_sub( + servers.rsmeter_pubsub_service.clone(), + )) + .is_some() + { + warn!("failed to register resource metering pubsub service"); + } } fn init_io_utility(&mut self) -> BytesFetcher { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index d5c2eefa6d6..8b3745120d5 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,7 +16,8 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, + CfNamesExt, Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, + CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -101,7 +102,7 @@ pub fn must_region_cleared(engine: &Engines, region assert_eq!(state.get_state(), PeerState::Tombstone, "{:?}", state); let start_key = keys::data_key(region.get_start_key()); let end_key = keys::data_key(region.get_end_key()); - for cf in ALL_CFS { + for cf in engine.kv.cf_names() { engine .kv .scan(cf, &start_key, &end_key, false, |k, v| { diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs 
index b876951894c..588e8ae9e9b 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -9,9 +9,13 @@ use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; use raftstore::{ coprocessor::CoprocessorHost, - store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, + store::{ + AutoSplitController, GlobalReplicationState, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, + }, }; use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; +use resource_metering::CollectorRegHandle; use slog::{info, o, Logger}; use tikv_util::{ config::VersionTrack, @@ -92,6 +96,8 @@ where concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -129,6 +135,8 @@ where concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, store_cfg, @@ -188,6 +196,8 @@ where concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -218,6 +228,8 @@ where concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, )?; From 860fc839a988a6c975fbea18fc22f1d840bdfdc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:03:49 +0800 Subject: [PATCH 464/676] log-backup: an ad-hoc way for hot reloading TLS certs (#14072) close tikv/tikv#14071 Log backup would aware TLS certifications changing. 
Signed-off-by: hillium --- .../src/metadata/store/lazy_etcd.rs | 91 ++++++++++++------- components/security/src/lib.rs | 4 + components/server/src/server.rs | 8 +- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index b712a23973d..37ffbad37c4 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -1,15 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; use openssl::x509::verify::X509VerifyFlags; +use security::SecurityManager; use tikv_util::{ info, stream::{RetryError, RetryExt}, + warn, }; -use tokio::sync::OnceCell; +use tokio::sync::Mutex as AsyncMutex; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; use crate::errors::{ContextualResultExt, Result}; @@ -17,20 +22,34 @@ use crate::errors::{ContextualResultExt, Result}; const RPC_TIMEOUT: Duration = Duration::from_secs(30); #[derive(Clone)] -pub struct LazyEtcdClient(Arc); +pub struct LazyEtcdClient(Arc>); -#[derive(Debug)] +#[derive(Clone)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Arc, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } +impl std::fmt::Debug for ConnectionConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectionConfig") + .field("keep_alive_interval", &self.keep_alive_interval) + .field("keep_alive_timeout", &self.keep_alive_timeout) + .finish() + } +} + impl ConnectionConfig { /// Convert the config to the connection option. 
fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); - if let Some(tls) = &self.tls { + if let Some(tls) = &self + .tls + .client_suite() + .map_err(|err| warn!("failed to load client suite!"; "err" => %err)) + .ok() + { opts = opts.with_openssl_tls( OpenSslClientConfig::default() .ca_cert_pem(&tls.ca) @@ -54,28 +73,27 @@ impl ConnectionConfig { impl LazyEtcdClient { pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - Self(Arc::new(LazyEtcdClientInner { - opt: conf.to_connection_options(), + Self(Arc::new(AsyncMutex::new(LazyEtcdClientInner { + conf, endpoints: endpoints.iter().map(ToString::to_string).collect(), - cli: OnceCell::new(), - })) + last_modified: None, + cli: None, + }))) } -} - -impl std::ops::Deref for LazyEtcdClient { - type Target = LazyEtcdClientInner; - fn deref(&self) -> &Self::Target { - Arc::deref(&self.0) + async fn get_cli(&self) -> Result { + let mut l = self.0.lock().await; + l.get_cli().await.cloned() } } #[derive(Clone)] pub struct LazyEtcdClientInner { - opt: ConnectOptions, + conf: ConnectionConfig, endpoints: Vec, - cli: OnceCell, + last_modified: Option, + cli: Option, } fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { @@ -130,23 +148,34 @@ where } impl LazyEtcdClientInner { - async fn connect(&self) -> Result { + async fn connect(&mut self) -> Result<&EtcdStore> { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control // how to create channels when connecting, hence we cannot update the tls config - // at runtime. - // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt - // the `SecurityManager` API, instead of doing everything by own. - etcd_client::Client::connect(self.endpoints.clone(), Some(self.opt.clone())) + // at runtime, now what we did is manually check that each time we are getting + // the clients. 
+ etcd_client::Client::connect( + self.endpoints.clone(), + Some(self.conf.to_connection_options()), + ) }) .await .context("during connecting to the etcd")?; - Ok(EtcdStore::from(store)) + let store = EtcdStore::from(store); + self.cli = Some(store); + Ok(self.cli.as_ref().unwrap()) } - pub async fn get_cli(&self) -> Result<&EtcdStore> { - let store = self.cli.get_or_try_init(|| self.connect()).await?; - Ok(store) + pub async fn get_cli(&mut self) -> Result<&EtcdStore> { + let modified = self.conf.tls.get_config().is_modified(&mut self.last_modified) + // Don't reload once we cannot check whether it is modified. + // Because when TLS disabled, this would always fail. + .unwrap_or(false); + if !modified && self.cli.is_some() { + return Ok(self.cli.as_ref().unwrap()); + } + info!("log backup reconnecting to the etcd service."; "tls_modified" => %modified, "connected_before" => %self.cli.is_some()); + self.connect().await } } @@ -155,7 +184,7 @@ impl MetaStore for LazyEtcdClient { type Snap = EtcdSnapshot; async fn snapshot(&self) -> Result { - self.0.get_cli().await?.snapshot().await + self.get_cli().await?.snapshot().await } async fn watch( @@ -163,14 +192,14 @@ impl MetaStore for LazyEtcdClient { keys: super::Keys, start_rev: i64, ) -> Result { - self.0.get_cli().await?.watch(keys, start_rev).await + self.get_cli().await?.watch(keys, start_rev).await } async fn txn(&self, txn: super::Transaction) -> Result<()> { - self.0.get_cli().await?.txn(txn).await + self.get_cli().await?.txn(txn).await } async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { - self.0.get_cli().await?.txn_cond(txn).await + self.get_cli().await?.txn_cond(txn).await } } diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index 52f438236fd..68328c01ebe 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -190,6 +190,10 @@ impl SecurityManager { ) } } + + pub fn get_config(&self) -> &SecurityConfig { + &self.cfg + } } 
#[derive(Clone)] diff --git a/components/server/src/server.rs b/components/server/src/server.rs index cfc7e59e243..97fd1f77eef 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1022,13 +1022,7 @@ where ConnectionConfig { keep_alive_interval: self.config.server.grpc_keepalive_time.0, keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self - .security_mgr - .client_suite() - .map_err(|err| { - warn!("Failed to load client TLS suite, ignoring TLS config."; "err" => %err); - }) - .ok(), + tls: Arc::clone(&self.security_mgr), }, ); let backup_stream_endpoint = backup_stream::Endpoint::new( From 42c3814f2a11c50d6a496c8aaca8e314b26f7ead Mon Sep 17 00:00:00 2001 From: YangKeao Date: Thu, 19 Jan 2023 04:25:49 -0500 Subject: [PATCH 465/676] json, copr: implement unary not for json (#14070) close tikv/tikv#14069 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/tidb_query_executors/src/runner.rs | 9 ++++++ components/tidb_query_expr/src/impl_op.rs | 32 +++++++++++++++++++ components/tidb_query_expr/src/lib.rs | 1 + 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a2924314f8a..cc89037bffa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6599,7 +6599,7 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#f3286471a05a4454a1071dd5f66ac7dbf6c79ba3" +source = "git+https://github.com/pingcap/tipb.git#c6b7a5a1623bb2766a502301ecc3ac8f98cc7c79" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index d04be41507e..392b41ff165 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -150,6 +150,15 @@ impl BatchExecutorsRunner<()> { ExecType::TypePartitionTableScan => { other_err!("PartitionTableScan executor not implemented"); } + 
ExecType::TypeSort => { + other_err!("Sort executor not implemented"); + } + ExecType::TypeWindow => { + other_err!("Window executor not implemented"); + } + ExecType::TypeExpand => { + other_err!("Expand executor not implemented"); + } } } diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 5289f427e93..665448279fb 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -55,6 +55,18 @@ pub fn unary_not_decimal(arg: Option<&Decimal>) -> Result> { Ok(arg.as_ref().map(|v| v.is_zero() as i64)) } +#[rpn_fn(nullable)] +#[inline] +pub fn unary_not_json(arg: Option) -> Result> { + let json_zero = Json::from_i64(0).unwrap(); + Ok(arg.as_ref().map(|v| { + if v == &json_zero.as_ref() { + return 1; + } + 0 + })) +} + #[rpn_fn(nullable)] #[inline] pub fn unary_minus_uint(arg: Option<&Int>) -> Result> { @@ -383,6 +395,26 @@ mod tests { } } + #[test] + fn test_unary_not_json() { + let test_cases = vec![ + (None, None), + (Some(Json::from_i64(0).unwrap()), Some(1)), + (Some(Json::from_i64(1).unwrap()), Some(0)), + ( + Some(Json::from_array(vec![Json::from_i64(0).unwrap()]).unwrap()), + Some(0), + ), + ]; + for (arg, expect_output) in test_cases { + let output = RpnFnScalarEvaluator::new() + .push_param(arg.clone()) + .evaluate(ScalarFuncSig::UnaryNotJson) + .unwrap(); + assert_eq!(output, expect_output, "{:?}", arg.as_ref()); + } + } + #[test] fn test_unary_minus_int() { let unsigned_test_cases = vec![ diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 43b0602ebbb..649a7cfa1c8 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -732,6 +732,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::UnaryNotInt => unary_not_int_fn_meta(), ScalarFuncSig::UnaryNotReal => unary_not_real_fn_meta(), ScalarFuncSig::UnaryNotDecimal => unary_not_decimal_fn_meta(), + 
ScalarFuncSig::UnaryNotJson => unary_not_json_fn_meta(), ScalarFuncSig::UnaryMinusInt => map_unary_minus_int_func(value, children)?, ScalarFuncSig::UnaryMinusReal => unary_minus_real_fn_meta(), ScalarFuncSig::UnaryMinusDecimal => unary_minus_decimal_fn_meta(), From cf622538b2ab118f51bf64a23ba41507b7e67f3f Mon Sep 17 00:00:00 2001 From: Zwb Date: Thu, 19 Jan 2023 22:15:00 +0800 Subject: [PATCH 466/676] raftstore: support switch witness (#13491) * support switch witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add switch witness api for test_pd_client ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * pd heartbeat resp support switch witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * update region epoch ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix write apply state race ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * remove unnecessary code ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add back test_witness_conf_change ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add some tests ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * avoid test failures ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * a few refactor ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add witness election priority and address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * clean code ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix tests failed caused by cfg ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix test failed caused by mistake modify ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * adjust priority after snapshot persisted ref tikv/tikv#12876 
Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * notify pd after switch witness as region changed ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * define a new backoff error for witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix panic caused by applygap ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * forbid transfer leader to non-witness waiting data ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * update kvproto ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix two panics ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * retry request snapshot ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * retry to request snaphost after term change ref #12876 Signed-off-by: Wenbo Zhang * update kvproto comment ref #12876 Signed-off-by: Wenbo Zhang Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Xinye Tao --- Cargo.lock | 22 +- components/error_code/src/raftstore.rs | 3 + .../raftstore-v2/src/operation/query/local.rs | 1 + components/raftstore/src/coprocessor/mod.rs | 1 + components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/config.rs | 8 + components/raftstore/src/store/fsm/apply.rs | 136 +++++++- components/raftstore/src/store/fsm/mod.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 170 +++++++-- components/raftstore/src/store/fsm/store.rs | 5 + components/raftstore/src/store/metrics.rs | 7 +- components/raftstore/src/store/msg.rs | 5 +- components/raftstore/src/store/peer.rs | 108 ++++-- .../raftstore/src/store/peer_storage.rs | 18 + components/raftstore/src/store/util.rs | 2 +- .../raftstore/src/store/worker/metrics.rs | 1 + components/raftstore/src/store/worker/pd.rs | 34 +- components/raftstore/src/store/worker/read.rs | 24 +- components/test_pd_client/src/pd.rs | 119 ++++++- etc/error_code.toml | 5 + tests/failpoints/cases/test_witness.rs | 273 +++++++++++++-- tests/integrations/config/mod.rs | 1 + tests/integrations/raftstore/test_witness.rs | 328 +++++++++--------- 23 files 
changed, 1029 insertions(+), 253 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cc89037bffa..e9f55d1923d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2732,7 +2732,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", "raft-proto", ] @@ -4121,6 +4121,18 @@ dependencies = [ "regex", ] +[[package]] +name = "protobuf-build" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb3c02f54ecaf12572c1a60dbdb36b1f8f713a16105881143f2be84cca5bbe3" +dependencies = [ + "bitflags", + "protobuf", + "protobuf-codegen", + "regex", +] + [[package]] name = "protobuf-codegen" version = "2.8.0" @@ -4161,7 +4173,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "fxhash", @@ -4220,11 +4232,11 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "protobuf", - "protobuf-build", + "protobuf-build 0.14.0", ] [[package]] @@ -6604,7 +6616,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", ] [[package]] diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 1b6a85493cf..35dfe564ef0 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -32,6 +32,7 @@ define_error_codes!( RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), + IS_WITNESS 
=> ("IsWitness", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -70,6 +71,8 @@ impl ErrorCodeExt for errorpb::Error { FLASHBACK_IN_PROGRESS } else if self.has_flashback_not_prepared() { FLASHBACK_NOT_PREPARED + } else if self.has_is_witness() { + IS_WITNESS } else { UNKNOWN } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 13b815d1ebc..f574571f790 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -815,6 +815,7 @@ mod tests { txn_ext: txn_ext.clone(), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: Some(bucket_meta.clone()), }; diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 5100e9d4632..73110660856 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -300,6 +300,7 @@ pub enum RegionChangeReason { PrepareMerge, CommitMerge, RollbackMerge, + SwitchWitness, } #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 3c415c65af6..36fcec7f1f3 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -140,6 +140,9 @@ pub enum Error { region_id: u64, local_state: raft_serverpb::RegionLocalState, }, + + #[error("peer is a witness of region {0}")] + IsWitness(u64), } pub type Result = result::Result; @@ -263,6 +266,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_flashback_not_prepared(e); } + Error::IsWitness(region_id) => { + let mut e = errorpb::IsWitness::default(); + e.set_region_id(region_id); + errorpb.set_is_witness(e); + } _ => {} }; @@ -319,6 +327,7 @@ impl ErrorCodeExt for Error { Error::DataIsNotReady { .. 
} => error_code::raftstore::DATA_IS_NOT_READY, Error::DeadlineExceeded => error_code::raftstore::DEADLINE_EXCEEDED, Error::PendingPrepareMerge => error_code::raftstore::PENDING_PREPARE_MERGE, + Error::IsWitness(..) => error_code::raftstore::IS_WITNESS, Error::Other(_) | Error::RegionNotRegistered { .. } => error_code::raftstore::UNKNOWN, } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 4d3210318a6..34f4e159dee 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -324,6 +324,12 @@ pub struct Config { #[online_config(hidden)] // Interval to check peers availability info. pub check_peers_availability_interval: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check if need to request snapshot. + pub check_request_snapshot_interval: ReadableDuration, } impl Default for Config { @@ -433,6 +439,8 @@ impl Default for Config { unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable check_peers_availability_interval: ReadableDuration::secs(30), + // TODO: make its value reasonable + check_request_snapshot_interval: ReadableDuration::minutes(1), } } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 60ed35e6892..9f2d234010f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -40,7 +40,7 @@ use kvproto::{ metapb::{self, PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; @@ -252,6 +252,13 @@ impl Range { } } +#[derive(Default, Debug)] +pub struct 
SwitchWitness { + pub index: u64, + pub switches: Vec, + pub region: Region, +} + #[derive(Debug)] pub enum ExecResult { ChangePeer(ChangePeer), @@ -301,6 +308,7 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, + BatchSwitchWitness(SwitchWitness), // The raftstore thread will use it to update the internal state of `PeerFsm`. If it is // `true`, when the raftstore detects that the raft log has not been gc for a long time, // the raftstore thread will actively pull the `voter_replicated_index` from the leader @@ -979,6 +987,9 @@ where /// in same Ready should be applied failed. pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + wait_data: bool, + /// The commands waiting to be committed and applied pending_cmds: PendingCmdQueue>, /// The counter of pending request snapshots. See more in `Peer`. @@ -1041,6 +1052,7 @@ where peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, + wait_data: false, last_flush_applied_index: reg.apply_state.get_applied_index(), apply_state: reg.apply_state, applied_term: reg.applied_term, @@ -1119,7 +1131,13 @@ where match res { ApplyResult::None => {} - ApplyResult::Res(res) => results.push_back(res), + ApplyResult::Res(res) => { + results.push_back(res); + if self.wait_data { + apply_ctx.committed_count -= committed_entries_drainer.len(); + break; + } + } ApplyResult::Yield | ApplyResult::WaitMergeSource(_) => { // Both cancel and merge will yield current processing. 
apply_ctx.committed_count -= committed_entries_drainer.len() + 1; @@ -1535,6 +1553,12 @@ where ExecResult::SetFlashbackState { ref region } => { self.region = region.clone(); } + ExecResult::BatchSwitchWitness(ref switches) => { + self.region = switches.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } + } } } if let Some(epoch) = origin_epoch { @@ -1669,7 +1693,7 @@ where AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } - AdminCmdType::BatchSwitchWitness => Err(box_err!("unsupported admin command type")), + AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); @@ -3202,6 +3226,90 @@ where )) } + fn exec_batch_switch_witness( + &mut self, + ctx: &mut ApplyContext, + request: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + assert!(request.has_switch_witnesses()); + let switches = request + .get_switch_witnesses() + .get_switch_witnesses() + .to_vec(); + + info!( + "exec BatchSwitchWitness"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "epoch" => ?self.region.get_region_epoch(), + ); + + let mut region = self.region.clone(); + for s in switches.as_slice() { + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.all.inc(); + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + let mut peer_is_exist = false; + for p in region.mut_peers().iter_mut() { + if p.id == peer_id { + if p.is_witness == is_witness { + return Err(box_err!( + "switch peer {:?} on region {:?} is no-op", + p, + self.region + )); + } + p.is_witness = is_witness; + peer_is_exist = true; + break; + } + } + if !peer_is_exist { + return Err(box_err!( + "switch peer {} on region {:?} failed: peer does not exist", + peer_id, + self.region + )); + } + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.success.inc(); + if 
self.id() == peer_id && !is_witness { + self.wait_data = true; + self.peer.is_witness = false; + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + switches.len() as u64; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!( + "switch witness successfully"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "switches" => ?switches, + "original region" => ?&self.region, + "current region" => ?®ion, + ); + + let state = if self.pending_remove { + PeerState::Tombstone + } else if self.wait_data { + PeerState::Unavailable + } else { + PeerState::Normal + }; + + if let Err(e) = write_peer_state(ctx.kv_wb_mut(), ®ion, state, None) { + panic!("{} failed to update region state: {:?}", self.tag, e); + } + + let resp = AdminResponse::default(); + Ok(( + resp, + ApplyResult::Res(ExecResult::BatchSwitchWitness(SwitchWitness { + index: ctx.exec_log_index, + switches, + region, + })), + )) + } + fn update_memory_trace(&mut self, event: &mut TraceEvent) { let pending_cmds = self.pending_cmds.heap_size(); let merge_yield = if let Some(ref mut state) = self.yield_state { @@ -3593,6 +3701,7 @@ where #[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + Recover(u64), CheckCompact { region_id: u64, voter_replicated_index: u64, @@ -3645,6 +3754,7 @@ where } => write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::Recover(region_id) => write!(f, "recover [region {}] apply", region_id), Msg::CheckCompact { region_id, voter_replicated_index, @@ -3770,6 +3880,10 @@ where return; } + if self.delegate.wait_data { + return; + } + let mut entries = Vec::new(); let mut dangle_size = 0; @@ -3972,8 +4086,9 @@ where if self.delegate.pending_remove || self.delegate.stopped { return; } - if self.delegate.peer.is_witness { - // witness shouldn't generate snapshot. 
+ if self.delegate.peer.is_witness || self.delegate.wait_data { + // witness or non-witness hasn't finish applying snapshot shouldn't generate + // snapshot. return; } let applied_index = self.delegate.apply_state.get_applied_index(); @@ -4199,8 +4314,11 @@ where } } } - batch_apply = Some(apply); + if !self.delegate.wait_data { + batch_apply = Some(apply); + } } + Msg::Recover(..) => self.delegate.wait_data = false, Msg::Registration(reg) => self.handle_registration(reg), Msg::Destroy(d) => self.handle_destroy(apply_ctx, d), Msg::LogsUpToDate(cul) => self.logs_up_to_date_for_merge(apply_ctx, cul), @@ -4637,6 +4755,11 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => return, + Msg::Recover(region_id) => { + info!("recover apply"; + "region_id" => region_id); + return; + } Msg::CheckCompact { region_id, .. } => { info!("target region is not found"; "region_id" => region_id); @@ -4774,6 +4897,7 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::Recover(..) => 0, Msg::CheckCompact { .. 
} => 0, } } diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 2f700eec9bf..b481caf4f74 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -14,7 +14,7 @@ pub use self::{ check_sst_for_ingestion, create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, ApplyRes, ApplyRouter, Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, - Registration, TaskRes as ApplyTaskRes, + Registration, SwitchWitness, TaskRes as ApplyTaskRes, }, peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index ccde4b031ef..d405c3471af 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -75,7 +75,7 @@ use crate::{ apply, store::{PollContext, StoreMeta}, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, + ExecResult, SwitchWitness, }, hibernate_state::{GroupState, HibernateState}, local_metrics::{RaftMetrics, TimeTracker}, @@ -247,6 +247,7 @@ where raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, + wait_data: bool, ) -> Result> { let meta_peer = match find_peer(region, store_id) { None => { @@ -277,6 +278,7 @@ where engines, region, meta_peer, + wait_data, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -331,6 +333,7 @@ where engines, ®ion, peer, + false, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -1192,6 +1195,7 @@ where PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestSnapshot => 
self.on_request_snapshot_tick(), PeerTick::RequestVoterReplicatedIndex => self.on_request_voter_replicated_index(), } } @@ -1203,6 +1207,9 @@ where self.register_split_region_check_tick(); self.register_check_peer_stale_state_tick(); self.on_check_merge(); + if self.fsm.peer.wait_data { + self.on_request_snapshot_tick(); + } // Apply committed entries more quickly. // Or if it's a leader. This implicitly means it's a singleton // because it becomes leader in `Peer::new` when it's a @@ -1951,6 +1958,7 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); self.register_report_region_buckets_tick(); + self.register_check_peers_availability_tick(); } if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { @@ -2161,12 +2169,6 @@ where return; } - // Keep ticking if there are disk full peers for the Region. - if !self.fsm.peer.disk_full_peers.is_empty() { - self.register_raft_base_tick(); - return; - } - debug!("stop ticking"; "res" => ?res, "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -2258,6 +2260,9 @@ where "peer_id" => self.fsm.peer_id(), "res" => ?res, ); + if self.fsm.peer.wait_data { + return; + } self.on_ready_result(&mut res.exec_res, &res.metrics); if self.fsm.stopped { return; @@ -2467,6 +2472,17 @@ where return Ok(()); } + if MessageType::MsgAppend == msg_type + && self.fsm.peer.wait_data + && self.fsm.peer.should_reject_msgappend + { + debug!("skip {:?} because of non-witness waiting data", msg_type; + "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() + ); + self.ctx.raft_metrics.message_dropped.non_witness.inc(); + return Ok(()); + } + if !self.validate_raft_msg(&msg) { return Ok(()); } @@ -2603,6 +2619,7 @@ where fn on_hibernate_request(&mut self, from: &metapb::Peer) { if !self.ctx.cfg.hibernate_regions || self.fsm.peer.has_uncommitted_log() + || self.fsm.peer.wait_data || from.get_id() != self.fsm.peer.leader_id() { // Ignore the message means rejecting 
implicitly. @@ -3053,7 +3070,7 @@ where if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() { - info!( + error!( "mismatch witness snapshot"; "region_id" => region_id, "peer_id" => self.fsm.peer_id(), @@ -3355,7 +3372,6 @@ where ); } else { self.fsm.peer.transfer_leader(&from); - self.fsm.peer.wait_data_peers.clear(); } } } @@ -4069,6 +4085,7 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, + false, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { @@ -4959,6 +4976,9 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::BatchSwitchWitness(switches) => { + self.on_ready_batch_switch_witness(switches) + } ExecResult::HasPendingCompactCmd(has_pending) => { self.fsm.peer.has_pending_compact_cmd = has_pending; if has_pending { @@ -5126,8 +5146,29 @@ where && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) { self.ctx.raft_metrics.invalid_proposal.witness.inc(); - // TODO: use a dedicated error type - return Err(Error::RecoveryInProgress(self.region_id())); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests to switch it into a witness when it's a leader + if self.fsm.peer.is_leader() + && msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSwitchWitness + && msg + .get_admin_request() + .get_switch_witnesses() + .get_switch_witnesses() + .iter() + .any(|s| s.get_peer_id() == self.fsm.peer.peer.get_id() && s.get_is_witness()) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests when it becomes to non-witness but not finish applying + // snapshot. 
+ if self.fsm.peer.wait_data { + self.ctx.raft_metrics.invalid_proposal.non_witness.inc(); + return Err(Error::IsWitness(self.region_id())); } // check whether the peer is initialized. @@ -5518,6 +5559,36 @@ where self.register_check_long_uncommitted_tick(); } + fn on_request_snapshot_tick(&mut self) { + fail_point!("ignore request snapshot", |_| { + self.schedule_tick(PeerTick::RequestSnapshot); + }); + if !self.fsm.peer.wait_data || self.fsm.peer.is_leader() { + return; + } + self.fsm.peer.request_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let last_term = self.fsm.peer.get_index_term(self.fsm.peer.request_index); + if last_term == self.fsm.peer.term() { + self.fsm.peer.should_reject_msgappend = true; + if let Err(e) = self.fsm.peer.raft_group.request_snapshot() { + error!( + "failed to request snapshot"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + ); + } + } else { + // If a leader change occurs after switch to non-witness, it should be + // continue processing `MsgAppend` until `last_term == term`, then retry + // to request snapshot. + self.fsm.peer.should_reject_msgappend = false; + } + // Requesting a snapshot may fail, so register a periodic event as a defense + // until succeeded. 
+ self.schedule_tick(PeerTick::RequestSnapshot); + } + fn on_request_voter_replicated_index(&mut self) { if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { return; @@ -6059,18 +6130,31 @@ where } fn on_check_peers_availability(&mut self) { + let mut invalid_peers: Vec = Vec::new(); for peer_id in self.fsm.peer.wait_data_peers.iter() { - let peer = self.fsm.peer.get_peer_from_cache(*peer_id).unwrap(); - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityRequest); - self.fsm - .peer - .send_extra_message(msg, &mut self.ctx.trans, &peer); - debug!( - "check peer availability"; - "target peer id" => *peer_id, - ); + match self.fsm.peer.get_peer_from_cache(*peer_id) { + Some(peer) => { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityRequest); + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &peer); + debug!( + "check peer availability"; + "target peer id" => *peer_id, + ); + } + None => invalid_peers.push(*peer_id), + } } + // For some reasons, the peer corresponding to the previously saved peer_id + // no longer exists. In order to avoid passing invalid information to pd when + // reporting pending peers and affecting pd scheduling, remove it from the + // `wait_data_peers`. 
+ self.fsm + .peer + .wait_data_peers + .retain(|peer_id| !invalid_peers.contains(peer_id)); } fn register_pull_voter_replicated_index_tick(&mut self) { @@ -6355,6 +6439,50 @@ where self.fsm.peer.leader_lease_mut().expire_remote_lease(); } + fn on_ready_batch_switch_witness(&mut self, sw: SwitchWitness) { + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.set_region( + &self.ctx.coprocessor_host, + sw.region, + &mut self.fsm.peer, + RegionChangeReason::SwitchWitness, + ); + } + for s in sw.switches { + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + if self.fsm.peer_id() == peer_id { + if is_witness && !self.fsm.peer.is_leader() { + let _ = self.fsm.peer.get_store().clear_data(); + self.fsm.peer.raft_group.set_priority(-1); + } else { + self.fsm + .peer + .update_read_progress(self.ctx, ReadProgress::WaitData(true)); + self.fsm.peer.wait_data = true; + self.on_request_snapshot_tick(); + } + self.fsm.peer.peer.is_witness = is_witness; + continue; + } + if !is_witness && !self.fsm.peer.wait_data_peers.contains(&peer_id) { + self.fsm.peer.wait_data_peers.push(peer_id); + } + } + if self.fsm.peer.is_leader() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "region" => ?self.fsm.peer.region(), + ); + self.fsm.peer.heartbeat_pd(self.ctx); + if !self.fsm.peer.wait_data_peers.is_empty() { + self.register_check_peers_availability_tick(); + } + } + } + /// Verify and store the hash to state. return true means the hash has been /// stored successfully. // TODO: Consider context in the function. 
diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index b75aee3b4bb..2ca573824f9 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -594,6 +594,8 @@ where self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = self.cfg.check_peers_availability_interval.0; + self.tick_batch[PeerTick::RequestSnapshot as usize].wait_duration = + self.cfg.check_request_snapshot_interval.0; // TODO: make it reasonable self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = self.cfg.raft_log_gc_tick_interval.0 * 2; @@ -1206,6 +1208,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), region, + local_state.get_state() == PeerState::Unavailable, )); peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { @@ -1246,6 +1249,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, + false, )?; peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); @@ -2911,6 +2915,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), ®ion, + false, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index ce4f099610e..6c6357d286c 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -35,7 +35,8 @@ make_auto_flush_static_metric! { compact, transfer_leader, prepare_flashback, - finish_flashback + finish_flashback, + batch_switch_witness : "batch-switch-witness", } pub label_enum AdminCmdStatus { @@ -177,6 +178,7 @@ make_static_metric! 
{ region_nonexistent, applying_snap, disk_full, + non_witness, recovery, } @@ -205,7 +207,8 @@ make_static_metric! { force_leader, witness, flashback_in_progress, - flashback_not_prepared + flashback_not_prepared, + non_witness, } pub label_enum RaftEventDurationType { diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index b2a2a7aa1d1..3c555689cb9 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -384,7 +384,8 @@ pub enum PeerTick { ReportBuckets = 9, CheckLongUncommitted = 10, CheckPeersAvailability = 11, - RequestVoterReplicatedIndex = 12, + RequestSnapshot = 12, + RequestVoterReplicatedIndex = 13, } impl PeerTick { @@ -405,6 +406,7 @@ impl PeerTick { PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestSnapshot => "request_snapshot", PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -423,6 +425,7 @@ impl PeerTick { PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, PeerTick::CheckPeersAvailability, + PeerTick::RequestSnapshot, PeerTick::RequestVoterReplicatedIndex, ]; TICKS diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 586ab7ba133..e2a914fded6 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -894,6 +894,17 @@ where /// the same time period. pub wait_data: bool, + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, but the request may fail, so we need to save + /// the request index for retrying. 
+ pub request_index: u64, + + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, In order to avoid log lag, we need to reject + /// the leader's `MsgAppend` request unless the `term` of the `last index` + /// is less than the peer's current `term`. + pub should_reject_msgappend: bool, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader /// out of accordance with Raft election rule, and forbids any @@ -1055,6 +1066,7 @@ where engines: Engines, region: &metapb::Region, peer: metapb::Peer, + wait_data: bool, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -1086,12 +1098,13 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, - // TODO: if peer.is_witness { 0 } else { 1 }, + priority: if peer.is_witness { -1 } else { 0 }, ..Default::default() }; let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + let last_index = raft_group.store().last_index(); // In order to avoid excessive log accumulation due to the loss of pending // compaction cmds after the witness is restarted, it will actively pull // voter_request_index once at start. 
@@ -1118,7 +1131,9 @@ where compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, - wait_data: false, + wait_data, + request_index: last_index, + should_reject_msgappend: false, should_wake_up: false, force_leader: None, pending_merge_state: None, @@ -1592,6 +1607,14 @@ where res.reason = "replication mode"; return res; } + if !self.disk_full_peers.is_empty() { + res.reason = "has disk full peers"; + return res; + } + if !self.wait_data_peers.is_empty() { + res.reason = "has wait data peers"; + return res; + } res.up_to_date = true; res } @@ -1617,6 +1640,8 @@ where && !self.has_unresolved_reads() // If it becomes leader, the stats is not valid anymore. && !self.is_leader() + // Keep ticking if it's waiting for snapshot. + && !self.wait_data } } @@ -2061,6 +2086,12 @@ where let status = self.raft_group.status(); let truncated_idx = self.get_store().truncated_index(); + for peer_id in &self.wait_data_peers { + if let Some(p) = self.get_peer_from_cache(*peer_id) { + pending_peers.push(p); + } + } + if status.progress.is_none() { return pending_peers; } @@ -2137,6 +2168,9 @@ where if self.peers_start_pending_time[i].0 != peer_id { continue; } + if self.wait_data_peers.contains(&peer_id) { + continue; + } let truncated_idx = self.raft_group.store().truncated_index(); if let Some(progress) = self.raft_group.raft.prs().get(peer_id) { if progress.matched >= truncated_idx { @@ -2394,8 +2428,12 @@ where // a stale heartbeat can make the leader think follower has already applied // the snapshot, and send remaining log entries, which may increase // commit_index. + // + // If it's witness before, but a command changes it to non-witness, it will stop + // applying all following command, therefore, add the judgment of `wait_data` to + // avoid applying snapshot is also blocked. 
// TODO: add more test - self.last_applying_idx == self.get_store().applied_index() + (self.last_applying_idx == self.get_store().applied_index() || self.wait_data) // Requesting snapshots also triggers apply workers to write // apply states even if there is no pending committed entry. // TODO: Instead of sharing the counter, we should apply snapshots @@ -2565,11 +2603,18 @@ where // i.e. call `RawNode::advance_apply_to`. self.post_pending_read_index_on_replica(ctx); // Resume `read_progress` + self.update_read_progress(ctx, ReadProgress::WaitData(false)); self.read_progress.resume(); // Update apply index to `last_applying_idx` self.read_progress .update_applied(self.last_applying_idx, &ctx.coprocessor_host); - self.notify_leader_the_peer_is_available(ctx); + if self.wait_data { + self.notify_leader_the_peer_is_available(ctx); + ctx.apply_router + .schedule_task(self.region_id, ApplyTask::Recover(self.region_id)); + self.wait_data = false; + return false; + } } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. 
@@ -2590,22 +2635,19 @@ where &mut self, ctx: &mut PollContext, ) { - if self.wait_data { - self.wait_data = false; - fail_point!("ignore notify leader the peer is available", |_| {}); - let leader_id = self.leader_id(); - let leader = self.get_peer_from_cache(leader_id); - if let Some(leader) = leader { - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityResponse); - msg.wait_data = false; - self.send_extra_message(msg, &mut ctx.trans, &leader); - info!( - "notify leader the leader is available"; - "region id" => self.region().get_id(), - "peer id" => self.peer.id - ); - } + fail_point!("ignore notify leader the peer is available", |_| {}); + let leader_id = self.leader_id(); + let leader = self.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityResponse); + msg.wait_data = false; + self.send_extra_message(msg, &mut ctx.trans, &leader); + info!( + "notify leader the peer is available"; + "region id" => self.region().get_id(), + "peer id" => self.peer.id + ); } } @@ -3128,9 +3170,8 @@ where "after" => ?peer, ); self.peer = peer; - // TODO: set priority for witness - // self.raft_group - // .set_priority(if self.peer.is_witness { 0 } else { 1 }); + self.raft_group + .set_priority(if self.peer.is_witness { -1 } else { 0 }); }; self.activate(ctx); @@ -3586,6 +3627,16 @@ where reader.update(progress); } + pub fn update_read_progress( + &self, + ctx: &mut PollContext, + progress: ReadProgress, + ) { + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&self.region_id).unwrap(); + self.maybe_update_read_progress(reader, progress); + } + pub fn maybe_campaign(&mut self, parent_is_leader: bool) -> bool { if self.region().get_peers().len() <= 1 { // The peer campaigned when it was created, no need to do it again. 
@@ -4434,13 +4485,10 @@ where msg: &eraftpb::Message, peer_disk_usage: DiskUsage, ) -> bool { - if self.is_witness() { - // shouldn't transfer leader to witness peer - return true; - } - let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); - if pending_snapshot + // shouldn't transfer leader to witness peer or non-witness waiting data + if self.is_witness() || self.wait_data + || pending_snapshot || msg.get_from() != self.leader_id() // Transfer leader to node with disk full will lead to write availablity downback. // But if the current leader is disk full, and send such request, we should allow it, @@ -4455,6 +4503,8 @@ where "from" => msg.get_from(), "pending_snapshot" => pending_snapshot, "disk_usage" => ?ctx.self_disk_usage, + "is_witness" => self.is_witness(), + "wait_data" => self.wait_data, ); return true; } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index b060a866d71..8dc8a18906c 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -449,6 +449,11 @@ where /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// available snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + fail_point!("ignore generate snapshot", self.peer_id == 1, |_| { + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + }); if self.peer.as_ref().unwrap().is_witness { // witness could be the leader for a while, do not generate snapshot now return Err(raft::Error::Store( @@ -457,6 +462,18 @@ where } if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // Although we always sending snapshot task behind apply task to get latest + // snapshot, we can't use `last_applying_idx` here, as below the judgment + // condition will generate an witness snapshot directly, the new non-witness + // will ingore this mismatch snapshot and can't request snapshot successfully + // again. + if self.applied_index() < request_index { + // It may be a request from non-witness. In order to avoid generating mismatch + // snapshots, wait for apply non-witness to complete + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } // generate an empty snapshot for witness directly return Ok(util::new_empty_snapshot( self.region.clone(), @@ -666,6 +683,7 @@ where "peer_id" => self.peer_id, "region" => ?region, "state" => ?self.apply_state(), + "for_witness" => for_witness, ); Ok((region, for_witness)) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2d27b56fda5..2f61534d159 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -228,7 +228,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { AdminCmdEpochState::new(true, true, false, false) } - AdminCmdType::BatchSwitchWitness => unimplemented!(), + AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), } } diff --git 
a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 5861e27a508..e6c3c505cdf 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -59,6 +59,7 @@ make_static_metric! { witness, flashback_not_prepared, flashback_in_progress, + wait_data, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 79b58d75c83..18ecc77f599 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -25,8 +25,8 @@ use kvproto::{ kvrpcpb::DiskFullOpt, metapb, pdpb, raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, + AdminCmdType, AdminRequest, BatchSwitchWitnessRequest, ChangePeerRequest, + ChangePeerV2Request, RaftCmdRequest, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, @@ -1551,6 +1551,18 @@ where deadline:None, disk_full_opt:DiskFullOpt::AllowedOnAlmostFull, }); + } else if resp.has_switch_witnesses() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["switch witness"]) + .inc(); + + let mut switches = resp.take_switch_witnesses(); + info!("try to switch witness"; + "region_id" => region_id, + "switch witness" => ?switches + ); + let req = new_batch_switch_witness(switches.take_switch_witnesses().into()); + send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); } else { PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); } @@ -2257,6 +2269,24 @@ fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { req } +fn new_batch_switch_witness(switches: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSwitchWitness); + let switch_reqs = switches + .into_iter() + 
.map(|s| { + let mut sw = SwitchWitnessRequest::default(); + sw.set_peer_id(s.get_peer_id()); + sw.set_is_witness(s.get_is_witness()); + sw + }) + .collect(); + let mut sw = BatchSwitchWitnessRequest::default(); + sw.set_switch_witnesses(switch_reqs); + req.set_switch_witnesses(sw); + req +} + fn send_admin_request( router: &RaftRouter, region_id: u64, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a7849f5e1dd..6b20e375786 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -412,6 +412,8 @@ pub struct ReadDelegate { pub txn_ext: Arc, pub read_progress: Arc, pub pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + pub wait_data: bool, // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` // up-to-date with the global `ReadDelegate` stored at `StoreMeta` @@ -435,6 +437,7 @@ impl ReadDelegate { txn_ext: peer.txn_ext.clone(), read_progress: peer.read_progress.clone(), pending_remove: false, + wait_data: false, bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), track_ver: TrackVer::new(), } @@ -463,6 +466,7 @@ impl ReadDelegate { txn_ext, read_progress, pending_remove: false, + wait_data: false, bucket_meta, track_ver: TrackVer::new(), } @@ -496,6 +500,9 @@ impl ReadDelegate { Progress::RegionBuckets(bucket_meta) => { self.bucket_meta = Some(bucket_meta); } + Progress::WaitData(wait_data) => { + self.wait_data = wait_data; + } } } @@ -591,6 +598,7 @@ impl ReadDelegate { txn_ext: Default::default(), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, } @@ -620,6 +628,7 @@ pub enum Progress { AppliedTerm(u64), LeaderLease(RemoteLease), RegionBuckets(Arc), + WaitData(bool), } impl Progress { @@ -642,6 +651,10 @@ impl Progress { pub fn region_buckets(bucket_meta: Arc) -> Progress { Progress::RegionBuckets(bucket_meta) } + + 
pub fn wait_data(wait_data: bool) -> Progress { + Progress::WaitData(wait_data) + } } struct SnapCache @@ -797,7 +810,13 @@ where // Check witness if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::RecoveryInProgress(region_id)); + return Err(Error::IsWitness(region_id)); + } + + // Check non-witness hasn't finish applying snapshot yet. + if delegate.wait_data { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.wait_data.inc()); + return Err(Error::IsWitness(region_id)); } // Check whether the region is in the flashback state and the local read could @@ -1299,6 +1318,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; @@ -1590,6 +1610,7 @@ mod tests { track_ver: TrackVer::new(), read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, + wait_data: false, bucket_meta: None, }; meta.readers.insert(1, read_delegate); @@ -1715,6 +1736,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 513d08643a7..a76692c4a67 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -27,8 +27,8 @@ use keys::{self, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, PeerRole}, pdpb::{ - self, ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, - TransferLeader, + self, BatchSwitchWitness, ChangePeer, ChangePeerV2, CheckPolicy, Merge, + RegionHeartbeatResponse, SplitRegion, SwitchWitness, TransferLeader, }, replication_modepb::{ DrAutoSyncState, RegionReplicationStatus, ReplicationMode, 
ReplicationStatus, @@ -40,7 +40,7 @@ use pd_client::{ }; use raft::eraftpb::ConfChangeType; use tikv_util::{ - store::{check_key_in_region, find_peer, is_learner, new_peer, QueryStats}, + store::{check_key_in_region, find_peer, find_peer_by_id, is_learner, new_peer, QueryStats}, time::{Instant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, Either, HandyRwLock, @@ -135,6 +135,11 @@ enum Operator { remove_peers: Vec, policy: SchedulePolicy, }, + BatchSwitchWitness { + peer_ids: Vec, + is_witnesses: Vec, + policy: SchedulePolicy, + }, } pub fn sleep_ms(ms: u64) { @@ -201,6 +206,22 @@ pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResp resp } +fn switch_witness(peer_id: u64, is_witness: bool) -> SwitchWitness { + let mut sw = SwitchWitness::default(); + sw.set_peer_id(peer_id); + sw.set_is_witness(is_witness); + sw +} + +pub fn new_pd_batch_switch_witnesses(switches: Vec) -> RegionHeartbeatResponse { + let mut switch_witnesses = BatchSwitchWitness::default(); + switch_witnesses.set_switch_witnesses(switches.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_switch_witnesses(switch_witnesses); + resp +} + impl Operator { fn make_region_heartbeat_response( &self, @@ -276,6 +297,17 @@ impl Operator { } new_pd_change_peer_v2(cps) } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + .. 
+ } => { + let mut switches = Vec::with_capacity(peer_ids.len()); + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + switches.push(switch_witness(*peer_id, *is_witness)); + } + new_pd_batch_switch_witnesses(switches) + } } } @@ -360,6 +392,26 @@ impl Operator { add && remove || !policy.schedule() } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + ref mut policy, + } => { + if !policy.schedule() { + return true; + } + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + if region + .get_peers() + .iter() + .any(|p| (p.get_id() == *peer_id) && (p.get_is_witness() != *is_witness)) + || cluster.pending_peers.contains_key(peer_id) + { + return false; + } + } + true + } } } } @@ -1043,6 +1095,48 @@ impl TestPdClient { panic!("region {:?} failed to leave joint", region); } + pub fn must_finish_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + for _ in 1..500 { + sleep_ms(10); + let region = match block_on(self.get_region_by_id(region_id)).unwrap() { + Some(region) => region, + None => continue, + }; + + for p in region.get_peers().iter() { + error!("in must_finish_switch_witnesses, p: {:?}", p); + } + + let mut need_retry = false; + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + match find_peer_by_id(®ion, *peer_id) { + Some(p) => { + if p.get_is_witness() != *is_witness + || self.cluster.rl().pending_peers.contains_key(&p.get_id()) + { + need_retry = true; + break; + } + } + None => { + need_retry = true; + break; + } + } + } + if !need_retry { + return; + } + } + let region = block_on(self.get_region_by_id(region_id)).unwrap(); + panic!("region {:?} failed to finish switch witnesses", region); + } + pub fn add_region(&self, region: &metapb::Region) { self.cluster.wl().add_region(region) } @@ -1072,6 +1166,15 @@ impl TestPdClient { self.schedule_operator(region_id, op); } + pub fn switch_witnesses(&self, region_id: u64, peer_ids: Vec, 
is_witnesses: Vec) { + let op = Operator::BatchSwitchWitness { + peer_ids, + is_witnesses, + policy: SchedulePolicy::TillSuccess, + }; + self.schedule_operator(region_id, op); + } + pub fn joint_confchange( &self, region_id: u64, @@ -1189,6 +1292,16 @@ impl TestPdClient { self.must_none_peer(region_id, peer); } + pub fn must_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + self.switch_witnesses(region_id, peer_ids.clone(), is_witnesses.clone()); + self.must_finish_switch_witnesses(region_id, peer_ids, is_witnesses); + } + pub fn must_joint_confchange( &self, region_id: u64, diff --git a/etc/error_code.toml b/etc/error_code.toml index 6b361e29e37..bb23c9b5e26 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -448,6 +448,11 @@ error = ''' KV:Raftstore:FlashbackNotPrepared ''' +["KV:Raftstore:IsWitness"] +error = ''' +KV:Raftstore:IsWitness +''' + ["KV:Raftstore:SnapAbort"] error = ''' KV:Raftstore:SnapAbort diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 552434d1fed..ef178ee8aa0 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -4,22 +4,11 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; use collections::HashMap; use futures::executor::block_on; -use kvproto::{metapb, raft_serverpb::RaftApplyState}; +use kvproto::raft_serverpb::RaftApplyState; use pd_client::PdClient; use test_raftstore::*; use tikv_util::{config::ReadableDuration, store::find_peer}; -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, 
peer.clone()); -} - // Test the case local reader works well with witness peer. #[test] fn test_witness_update_region_in_local_reader() { @@ -35,8 +24,12 @@ fn test_witness_update_region_in_local_reader() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); cluster.must_put(b"k0", b"v0"); @@ -61,8 +54,8 @@ fn test_witness_update_region_in_local_reader() { .read(None, request.clone(), Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } @@ -95,8 +88,12 @@ fn test_witness_raftlog_gc_pull_voter_replicated_index() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -176,8 +173,12 @@ fn test_witness_raftlog_gc_after_reboot() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, 
nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -240,3 +241,235 @@ fn test_witness_raftlog_gc_after_reboot() { } fail::remove("on_raft_gc_log_tick"); } + +// Test the case request snapshot and apply successfully after non-witness +// restart. +#[test] +fn test_request_snapshot_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp = "ignore request snapshot"; + fail::cfg(fp, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore request snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 1); + 
must_get_none(&cluster.get_engine(3), b"k1"); + + cluster.stop_node(nodes[2]); + fail::remove(fp); + std::thread::sleep(Duration::from_millis(100)); + // the PeerState is Unavailable, so it will request snapshot immediately after + // start. + cluster.run_node(nodes[2]).unwrap(); + must_get_none(&cluster.get_engine(3), b"k1"); + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); +} + +// Test the case request snapshot and apply successfully after term change. +#[test] +fn test_request_snapshot_after_term_change() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp1 = "ignore generate snapshot"; + fail::cfg(fp1, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore generate snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 
1); + must_get_none(&cluster.get_engine(3), b"k1"); + + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // After leader changes, the `term` and `last term` no longer match, so + // continue to receive `MsgAppend` until the two get equal, then retry to + // request snapshot and complete the application. + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp1); +} + +fn test_non_witness_availability(fp: &str) { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.check_peers_availability_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // non-witness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + fail::cfg(fp, "return").unwrap(); + + // witness -> non-witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // snapshot applied + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp); +} + +// Test the 
case leader pulls non-witness availability when non-witness failed +// to push the info. +#[test] +fn test_pull_non_witness_availability() { + test_non_witness_availability("ignore notify leader the peer is available"); +} + +// Test the case non-witness pushes its availability without leader pulling. +#[test] +fn test_push_non_witness_availability() { + test_non_witness_availability("ignore schedule check non-witness availability tick"); +} + +// Test the case non-witness hasn't finish applying snapshot when receives read +// request. +#[test] +fn test_non_witness_replica_read() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // witness -> nonwitness + fail::cfg("ignore request snapshot", "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(100)); + // as we ignore request snapshot, so snapshot should still not applied yet + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3.clone()); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) 
+ .unwrap(); + assert_eq!( + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { + region_id: region.get_id(), + ..Default::default() + } + ); + + // start requesting snapshot and give enough time for applying snapshot to + // complete + fail::remove("ignore request snapshot"); + std::thread::sleep(Duration::from_millis(500)); + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) + .unwrap(); + assert_eq!(resp.get_header().has_error(), false); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a4e15b8fa6e..bb35b069a41 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -255,6 +255,7 @@ fn test_serde_custom_tikv_config() { max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), check_peers_availability_interval: ReadableDuration::secs(30), + check_request_snapshot_interval: ReadableDuration::minutes(1), }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index 301a743588e..f35b21b08a1 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -14,28 +14,6 @@ use raft::eraftpb::ConfChangeType; use test_raftstore::*; use tikv_util::store::find_peer; -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - 
cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - -fn become_non_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(false); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - // Test the case that region split or merge with witness peer #[test] fn test_witness_split_merge() { @@ -49,9 +27,12 @@ fn test_witness_split_merge() { let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); - + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let before = cluster .apply_state(region.get_id(), nodes[2]) .get_applied_index(); @@ -96,8 +77,12 @@ fn test_witness_split_merge() { assert!(find_peer(&right, nodes[2]).unwrap().is_witness); // can't merge with different witness location - let mut peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); - become_non_witness(&cluster, left.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + left.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); let left = cluster.get_region(b"k1"); let req = new_admin_request( left.get_id(), @@ -174,6 +159,8 @@ fn test_witness_conf_change() { .pd_client .must_remove_peer(region.get_id(), peer_on_store3); + 
std::thread::sleep(Duration::from_millis(10)); + assert_eq!( cluster .region_local_state(region.get_id(), nodes[2]) @@ -182,124 +169,127 @@ fn test_witness_conf_change() { ); } -// #[test] -// // Test flow of switch witness -// fn test_witness_switch_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); - -// std::thread::sleep(Duration::from_millis(100)); -// must_get_none(&cluster.get_engine(3), b"k1"); - -// // witness -> nonwitness -// peer_on_store3.set_role(metapb::PeerRole::Learner); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), peer_on_store3.clone()); -// peer_on_store3.set_is_witness(false); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// std::thread::sleep(Duration::from_millis(100)); -// must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); -// } - -// TODO: add back when switch witness is supported -// // Test the case that leader is forbidden to become witness -// #[test] -// fn test_witness_leader() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", 
b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let mut peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // can't make leader to witness -// peer_on_store1.set_is_witness(true); -// cluster -// .pd_client -// .add_peer(region.get_id(), peer_on_store1.clone()); - -// std::thread::sleep(Duration::from_millis(100)); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// 1 -// ); -// // leader changes to witness failed, so still can get the value -// must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); - -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// // can't transfer leader to witness -// cluster.transfer_leader(region.get_id(), &mut peer_on_store3); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[0], -// ); -// } - -// TODO: add back when election priority is supported -// // Test the case that witness can't be elected as leader based on election -// // priority when there is no log gap -// #[test] -// fn test_witness_election_priority() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); - -// // make sure logs are replicated to the witness -// std::thread::sleep(Duration::from_millis(100)); - -// for i in 1..10 { -// let node = -// cluster.leader_of_region(region.get_id()).unwrap().store_id; cluster. 
-// stop_node(node); let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// // the witness can't be elected as the leader when there is no log -// gap assert_ne!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[2], -// ); -// cluster.run_node(node).unwrap(); -// } -// } +// Test flow of switch witness +#[test] +fn test_witness_switch_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> non-witness + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +} + +// Test the case that leader is forbidden to become witness +#[test] +fn test_witness_leader() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = 
find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // can't make leader to witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store1.get_id()], vec![true]); + + std::thread::sleep(Duration::from_millis(100)); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + 1 + ); + // leader changes to witness failed, so still can get the value + must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + // can't transfer leader to witness + cluster.transfer_leader(region.get_id(), peer_on_store3); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[0], + ); +} + +// Test the case that witness can't be elected as leader based on election +// priority when there is no log gap +#[test] +fn test_witness_election_priority() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + cluster.must_put(b"k0", b"v0"); + + // make sure logs are replicated to the witness + std::thread::sleep(Duration::from_millis(100)); + + for i in 1..10 { + let node = cluster.leader_of_region(region.get_id()).unwrap().store_id; + cluster.stop_node(node); + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + // the witness can't be elected as the leader when there is no log gap + assert_ne!( + 
cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + cluster.run_node(node).unwrap(); + // make sure logs are replicated to the restarted node + std::thread::sleep(Duration::from_millis(100)); + } +} // Test the case that truncated index won't advance when there is a witness even // if the gap gap exceeds the gc count limit @@ -320,8 +310,12 @@ fn test_witness_raftlog_gc_lagged_follower() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -391,8 +385,12 @@ fn test_witness_raftlog_gc_lagged_witness() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); cluster.must_put(b"k0", b"v0"); // make sure raft log gc is triggered @@ -447,8 +445,12 @@ fn test_witness_replica_read() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + 
cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let mut request = new_request( region.get_id(), @@ -463,15 +465,15 @@ fn test_witness_replica_read() { .read(None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } ); } -fn must_get_error_recovery_in_progress( +fn must_get_error_is_witness( cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, @@ -486,8 +488,8 @@ fn must_get_error_recovery_in_progress( .call_command_on_leader(req, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() }, @@ -513,9 +515,13 @@ fn test_witness_leader_down() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); - let mut peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); // nonwitness -> witness - become_witness(&cluster, region.get_id(), &mut peer_on_store2); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store2.get_id()], + vec![true], + ); // the other follower is isolated cluster.add_send_filter(IsolationFilterFactory::new(3)); @@ -530,13 +536,13 @@ fn test_witness_leader_down() { // forbid writes let put = new_put_cmd(b"k3", b"v3"); - must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + must_get_error_is_witness(&mut cluster, ®ion, put); // forbid reads let get = new_get_cmd(b"k1"); - must_get_error_recovery_in_progress(&mut cluster, 
®ion, get); + must_get_error_is_witness(&mut cluster, ®ion, get); // forbid read index let read_index = new_read_index_cmd(); - must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + must_get_error_is_witness(&mut cluster, ®ion, read_index); let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store3); From f178f781048bef4930a8e82fd08c3e194e9f8ae4 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 20 Jan 2023 14:57:49 +0800 Subject: [PATCH 467/676] resource_manager: add watch for resource group (#14022) close tikv/tikv#13983 - add etcd mock for pd - add service for resource group Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 10 + components/pd_client/src/client.rs | 67 +++- components/pd_client/src/errors.rs | 6 +- components/pd_client/src/lib.rs | 24 +- components/resource_control/Cargo.toml | 10 + components/resource_control/src/lib.rs | 3 + .../resource_control/src/resource_group.rs | 7 +- components/resource_control/src/service.rs | 267 ++++++++++++++++ components/server/src/server.rs | 16 +- components/server/src/server2.rs | 16 +- components/test_pd/Cargo.toml | 3 + components/test_pd/src/lib.rs | 1 + components/test_pd/src/mocker/etcd.rs | 288 ++++++++++++++++++ components/test_pd/src/mocker/mod.rs | 61 +++- components/test_pd/src/server.rs | 67 +++- components/tikv_util/src/worker/pool.rs | 7 + tests/failpoints/cases/test_pd_client.rs | 54 ---- .../failpoints/cases/test_pd_client_legacy.rs | 111 ++++--- 18 files changed, 866 insertions(+), 152 deletions(-) create mode 100644 components/resource_control/src/service.rs create mode 100644 components/test_pd/src/mocker/etcd.rs diff --git a/Cargo.lock b/Cargo.lock index e9f55d1923d..ee047aaae6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4676,15 +4676,22 @@ dependencies = [ "byteorder", "crossbeam-skiplist", "dashmap", + "fail", + "futures 0.3.15", "kvproto", "lazy_static", "online_config", + "pd_client", 
"pin-project", "prometheus", + "protobuf", "serde", "slog", "slog-global", + "test_pd", + "test_pd_client", "tikv_util", + "tokio", "yatp", ] @@ -5838,11 +5845,14 @@ dependencies = [ "futures 0.3.15", "grpcio", "kvproto", + "log_wrappers", "pd_client", "security", "slog", "slog-global", "tikv_util", + "tokio", + "tokio-stream", ] [[package]] diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 1e1e5980908..6686c4e8a04 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - collections::HashMap, fmt, sync::{ atomic::{AtomicU64, Ordering}, @@ -286,9 +285,46 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, config_path: String) -> PdFuture> { - use kvproto::pdpb::LoadGlobalConfigRequest; - let mut req = LoadGlobalConfigRequest::new(); + fn store_global_config( + &self, + config_path: String, + items: Vec, + ) -> PdFuture<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["store_global_config"]) + .start_coarse_timer(); + + let mut req = pdpb::StoreGlobalConfigRequest::new(); + req.set_config_path(config_path); + req.set_changes(items.into()); + let executor = move |client: &Client, req| match client + .inner + .rl() + .client_stub + .store_global_config_async(&req) + { + Ok(grpc_response) => Box::pin(async move { + if let Err(err) = grpc_response.await { + return Err(box_err!("{:?}", err)); + } + Ok(()) + }) as PdFuture<_>, + Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + }; + self.pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn load_global_config( + &self, + config_path: String, + ) -> PdFuture<(Vec, i64)> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["load_global_config"]) + .start_coarse_timer(); + + let mut req = 
pdpb::LoadGlobalConfigRequest::new(); req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner @@ -299,13 +335,10 @@ impl PdClient for RpcClient { { Ok(grpc_response) => Box::pin(async move { match grpc_response.await { - Ok(grpc_response) => { - let mut res = HashMap::with_capacity(grpc_response.get_items().len()); - for c in grpc_response.get_items() { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } - Ok(res) - } + Ok(grpc_response) => Ok(( + Vec::from(grpc_response.get_items()), + grpc_response.get_revision(), + )), Err(err) => Err(box_err!("{:?}", err)), } }) as PdFuture<_>, @@ -318,9 +351,17 @@ impl PdClient for RpcClient { fn watch_global_config( &self, + config_path: String, + revision: i64, ) -> Result> { - use kvproto::pdpb::WatchGlobalConfigRequest; - let req = WatchGlobalConfigRequest::default(); + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["watch_global_config"]) + .start_coarse_timer(); + + let mut req = pdpb::WatchGlobalConfigRequest::default(); + info!("[global_config] start watch global config"; "path" => &config_path, "revision" => revision); + req.set_config_path(config_path); + req.set_revision(revision); sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, _| { client.watch_global_config(&req) }) diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index 689cb276064..5bacca03354 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -35,12 +35,14 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::ClusterNotBootstrapped(_) | Error::StreamDisconnect(_) => true, + Error::Grpc(_) + | Error::ClusterNotBootstrapped(_) + | Error::StreamDisconnect(_) + | Error::DataCompacted(_) => true, Error::Other(_) | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) - | Error::DataCompacted(_) | 
Error::ClusterBootstrapped(_) | Error::Incompatible => false, } diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 46a3e6924db..b877750770d 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -14,15 +14,14 @@ mod util; mod config; pub mod errors; -use std::{cmp::Ordering, collections::HashMap, ops::Deref, sync::Arc, time::Duration}; +use std::{cmp::Ordering, ops::Deref, sync::Arc, time::Duration}; use futures::future::BoxFuture; -use grpcio::ClientSStreamReceiver; use kvproto::{ metapb, pdpb, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, }; -use pdpb::{QueryStats, WatchGlobalConfigResponse}; +use pdpb::QueryStats; use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; @@ -201,6 +200,8 @@ impl BucketStat { } pub const INVALID_ID: u64 = 0; +// TODO: Implementation of config registration for each module +pub const RESOURCE_CONTROL_CONFIG_PATH: &str = "resource_group/settings"; /// PdClient communicates with Placement Driver (PD). /// Because now one PD only supports one cluster, so it is no need to pass @@ -209,17 +210,28 @@ pub const INVALID_ID: u64 = 0; /// all the time. 
pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _config_path: String) -> PdFuture> { + fn load_global_config( + &self, + _config_path: String, + ) -> PdFuture<(Vec, i64)> { unimplemented!(); } /// Store a list of GlobalConfig - fn store_global_config(&self, _list: HashMap) -> PdFuture<()> { + fn store_global_config( + &self, + _config_path: String, + _items: Vec, + ) -> PdFuture<()> { unimplemented!(); } /// Watching change of GlobalConfig - fn watch_global_config(&self) -> Result> { + fn watch_global_config( + &self, + _config_path: String, + _revision: i64, + ) -> Result> { unimplemented!(); } diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 822aed2cd2d..3f796627040 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -4,17 +4,27 @@ version = "0.0.1" edition = "2021" publish = false +[features] +failpoints = ["fail/failpoints"] + [dependencies] byteorder = "1.2" crossbeam-skiplist = "0.1" dashmap = "5.1" +fail = "0.5" +futures = { version = "0.3" } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.0" online_config = { workspace = true } +pd_client = { workspace = true } pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } +protobuf = { version = "2.8", features = ["bytes"] } serde = { version = "1.0", features = ["derive"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +test_pd = { workspace = true } +test_pd_client = { workspace = true } tikv_util = { workspace = true } +tokio = { version = "1.5", features = ["time"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 
eb6679f71e8..5534ed2153d 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -11,6 +11,9 @@ pub use resource_group::{ mod future; pub use future::ControlledFuture; +mod service; +pub use service::ResourceManagerService; + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] #[serde(default)] #[serde(rename_all = "kebab-case")] diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index bfe9d92d0f3..23a50b42560 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -295,18 +295,19 @@ impl GroupPriorityTracker { } #[cfg(test)] -mod tests { - use kvproto::resource_manager::*; +pub(crate) mod tests { use yatp::queue::Extras; use super::*; - fn new_resource_group( + pub fn new_resource_group( name: String, is_ru_mode: bool, read_tokens: u64, write_tokens: u64, ) -> ResourceGroup { + use kvproto::resource_manager::{GroupRawResourceSettings, GroupRequestUnitSettings}; + let mut group = ResourceGroup::new(); group.set_name(name); let mode = if is_ru_mode { diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs new file mode 100644 index 00000000000..ea9a9d724b9 --- /dev/null +++ b/components/resource_control/src/service.rs @@ -0,0 +1,267 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use futures::StreamExt; +use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; +use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; +use tikv_util::error; + +use crate::ResourceGroupManager; + +#[derive(Clone)] +pub struct ResourceManagerService { + manager: Arc, + pd_client: Arc, + // record watch revision + revision: i64, +} + +impl ResourceManagerService { + /// Constructs a new `Service` with `ResourceGroupManager` and a `RpcClient` + pub fn new( + manager: Arc, + pd_client: Arc, + ) -> ResourceManagerService { + ResourceManagerService { + pd_client, + manager, + revision: 0, + } + } +} + +impl ResourceManagerService { + pub async fn watch_resource_groups(&mut self) { + // Firstly, load all resource groups as of now. + let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + // Secondly, start watcher at loading revision. + loop { + match self + .pd_client + .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) + { + Ok(mut stream) => { + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + self.revision = r.get_revision(); + r.get_changes() + .iter() + .for_each(|item| match item.get_kind() { + EventType::Put => { + if let Ok(group) = + protobuf::parse_from_bytes::( + item.get_value().as_bytes(), + ) + { + self.manager.add_resource_group(group); + } + } + EventType::Delete => { + self.manager.remove_resource_group(item.get_name()); + } + }); + } + Err(err) => { + error!("failed to get stream"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + Err(PdError::DataCompacted(msg)) => { + error!("required revision has been compacted"; "err" => ?msg); + // If the etcd revision is compacted, we need to reload all resouce groups. 
+ let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + } + Err(err) => { + error!("failed to watch resource groups"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + + async fn list_resource_groups(&mut self) -> (Vec, i64) { + loop { + match self + .pd_client + .load_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string()) + .await + { + Ok((items, revision)) => { + let groups = items + .into_iter() + .filter_map(|g| protobuf::parse_from_bytes(g.get_value().as_bytes()).ok()) + .collect(); + return (groups, revision); + } + Err(err) => { + error!("failed to load global config"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } +} + +#[cfg(test)] +pub mod tests { + use std::time::Duration; + + use futures::executor::block_on; + use kvproto::pdpb::GlobalConfigItem; + use pd_client::RpcClient; + use protobuf::Message; + use test_pd::{mocker::Service, util::*, Server as MockServer}; + use tikv_util::{config::ReadableDuration, worker::Builder}; + + use crate::resource_group::tests::new_resource_group; + + fn new_test_server_and_client( + update_interval: ReadableDuration, + ) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) + } + + fn add_resource_group(pd_client: Arc, group: ResourceGroup) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Put); + item.set_name(group.get_name().to_string()); + let mut buf = Vec::new(); + group.write_to_vec(&mut buf).unwrap(); + item.set_value(String::from_utf8(buf).unwrap()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + fn delete_resource_group(pd_client: Arc, name: 
&str) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Delete); + item.set_name(name.to_string()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + use super::*; + #[test] + fn crud_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let group = new_resource_group("TEST".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 1); + + delete_resource_group(s.pd_client.clone(), "TEST"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 2); + + server.stop(); + } + + #[test] + fn watch_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 0); + + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group("TEST1".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + let group2 = new_resource_group("TEST2".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group2); + // Mock modify + let group2 = new_resource_group("TEST2".into(), true, 50, 50); + add_resource_group(s.pd_client.clone(), group2); + let (res, 
revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 2); + assert_eq!(revision, 3); + // Mock delete + delete_resource_group(s.pd_client.clone(), "TEST1"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 4); + // Wait for watcher + std::thread::sleep(Duration::from_millis(100)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 1); + assert!(s.manager.get_resource_group("TEST1").is_none()); + let group = s.manager.get_resource_group("TEST2").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 50 + ); + server.stop(); + } + + #[test] + fn reboot_watch_server_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group("TEST1".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + // Mock reboot watch server + let watch_global_config_fp = "watch_global_config_return"; + fail::cfg(watch_global_config_fp, "return").unwrap(); + std::thread::sleep(Duration::from_millis(100)); + fail::remove(watch_global_config_fp); + // Mock add after rebooting will success + let group1 = new_resource_group("TEST2".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + // Wait watcher update + std::thread::sleep(Duration::from_secs(1)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 2); + + server.stop(); + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 
97fd1f77eef..207373313a4 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -82,7 +82,9 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; -use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use snap_recovery::RecoveryService; use tikv::{ @@ -330,11 +332,17 @@ where let resource_manager = if config.resource_control.enabled { let mgr = Arc::new(ResourceGroupManager::default()); - let mgr1 = mgr.clone(); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); // spawn a task to periodically update the minimal virtual time of all resource - // group. + // groups. + let resource_mgr = mgr.clone(); background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - mgr1.advance_min_virtual_time(); + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. 
+ background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; }); Some(mgr) } else { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 36a02130fdb..f193e1c7445 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -65,7 +65,9 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{router::RaftRouter, StateStorage}; -use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -294,11 +296,17 @@ where let resource_manager = if config.resource_control.enabled { let mgr = Arc::new(ResourceGroupManager::default()); - let mgr1 = mgr.clone(); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); // spawn a task to periodically update the minimal virtual time of all resource - // group. + // groups. + let resource_mgr = mgr.clone(); background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - mgr1.advance_min_virtual_time(); + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. 
+ background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; }); Some(mgr) } else { diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index a478e6ee325..6277789b194 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -10,8 +10,11 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } pd_client = { workspace = true } security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } +tokio = { version = "1.0", features = ["full"] } +tokio-stream = "0.1" diff --git a/components/test_pd/src/lib.rs b/components/test_pd/src/lib.rs index 187a899d7fb..bd768e58318 100644 --- a/components/test_pd/src/lib.rs +++ b/components/test_pd/src/lib.rs @@ -1,4 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(slice_group_by)] #[macro_use] extern crate tikv_util; diff --git a/components/test_pd/src/mocker/etcd.rs b/components/test_pd/src/mocker/etcd.rs new file mode 100644 index 00000000000..3939dfc9a72 --- /dev/null +++ b/components/test_pd/src/mocker/etcd.rs @@ -0,0 +1,288 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + collections::{BTreeMap, HashMap}, + ops::Bound, + sync::Arc, +}; + +use futures::lock::Mutex; +use tokio::sync::mpsc::{self, Sender}; +use tokio_stream::wrappers::ReceiverStream; + +use super::Result; + +/// An in-memory, single versioned storage. +/// Emulating some interfaces of etcd for testing. 
+#[derive(Default, Debug)] +pub struct Etcd { + items: BTreeMap, + subs: HashMap, + revision: i64, + sub_id_alloc: Cell, +} + +pub type EtcdClient = Arc>; + +impl Etcd { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + pub fn get_revision(&self) -> i64 { + self.revision + } + + pub fn get_key(&self, keys: Keys) -> (Vec, i64) { + let (start_key, end_key) = keys.into_bound(); + let kvs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, self.revision)), + )) + .collect::>() + .as_slice() + .group_by(|item1, item2| item1.0.0 == item2.0.0) + .filter_map(|group| { + let (k, v) = group.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del => None, + } + }) + .fold(Vec::new(), |mut items, item| { + items.push(item); + items + }); + + (kvs, self.get_revision()) + } + + pub async fn set(&mut self, mut pair: KeyValue) -> Result<()> { + let rev = self.alloc_rev(); + for sub in self.subs.values() { + if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { + sub.tx + .send(KvEvent { + kind: KvEventType::Put, + pair: pair.clone(), + }) + .await + .unwrap(); + } + } + self.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); + Ok(()) + } + + pub async fn delete(&mut self, keys: Keys) -> Result<()> { + let (start_key, end_key) = keys.into_bound(); + let rev = self.alloc_rev(); + let mut v = self + .items + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, self.revision)), + )) + .map(|(k, _)| Key::clone(k)) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for mut victim in v { + let k = Key(victim.0.clone(), rev); + self.items.insert(k, Value::Del); + + for sub in self.subs.values() { + if victim.0.as_slice() < sub.end_key.as_slice() + && victim.0.as_slice() >= sub.start_key.as_slice() + { + sub.tx + .send(KvEvent { + kind: KvEventType::Delete, + pair: 
KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]), + }) + .await + .unwrap(); + } + } + } + Ok(()) + } + + pub async fn watch(&mut self, keys: Keys, start_rev: i64) -> Result> { + let id = self.sub_id_alloc.get(); + self.sub_id_alloc.set(id + 1); + let (tx, rx) = mpsc::channel(1024); + let (start_key, end_key) = keys.into_bound(); + + // Sending events from [start_rev, now) to the client. + let mut pending = self + .items + .range(( + Bound::Included(Key(start_key.clone(), 0)), + Bound::Excluded(Key(end_key.clone(), self.revision)), + )) + .filter(|(k, _)| k.1 >= start_rev) + .collect::>(); + pending.sort_by_key(|(k, _)| k.1); + for (k, v) in pending { + let event = match v { + Value::Val(val) => KvEvent { + kind: KvEventType::Put, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + Value::Del => KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(k.0.clone()), vec![]), + }, + }; + tx.send(event).await.expect("too many pending events"); + } + + self.subs.insert( + id, + Subscriber { + start_key, + end_key, + tx, + }, + ); + Ok(ReceiverStream::new(rx)) + } + + pub fn clear_subs(&mut self) { + self.subs.clear(); + self.sub_id_alloc.set(0); + } + + /// A tool for dumpling the whole storage when test failed. + /// Add this to test code temporarily for debugging. + #[allow(dead_code)] + pub fn dump(&self) { + println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision); + for (k, v) in self.items.iter() { + println!("{:?} => {:?}", k, v); + } + } +} + +#[derive(Clone, Debug)] +pub struct MetaKey(pub Vec); + +impl MetaKey { + /// return the key that keeps the range [self, self.next()) contains only + /// `self`. + pub fn next(&self) -> Self { + let mut next = self.clone(); + next.0.push(0); + next + } + + /// return the key that keeps the range [self, self.next_prefix()) contains + /// all keys with the prefix `self`. 
+ pub fn next_prefix(&self) -> Self { + let mut next_prefix = self.clone(); + for i in (0..next_prefix.0.len()).rev() { + if next_prefix.0[i] == u8::MAX { + next_prefix.0.pop(); + } else { + next_prefix.0[i] += 1; + break; + } + } + next_prefix + } +} + +/// A simple key value pair of metadata. +#[derive(Clone, Debug)] +pub struct KeyValue(pub MetaKey, pub Vec); + +impl KeyValue { + pub fn key(&self) -> &[u8] { + self.0.0.as_slice() + } + + pub fn value(&self) -> &[u8] { + self.1.as_slice() + } + + pub fn take_key(&mut self) -> Vec { + std::mem::take(&mut self.0.0) + } + + pub fn take_value(&mut self) -> Vec { + std::mem::take(&mut self.1) + } +} + +#[derive(Debug)] +pub enum KvEventType { + Put, + Delete, +} + +#[derive(Debug)] +pub struct KvEvent { + pub kind: KvEventType, + pub pair: KeyValue, +} + +#[derive(Debug)] +struct Subscriber { + start_key: Vec, + end_key: Vec, + tx: Sender, +} + +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!( + "{}@{}", + log_wrappers::Value::key(&self.0), + self.1 + )) + .finish() + } +} + +/// A value (maybe tombstone.) +#[derive(Debug, PartialEq, Clone)] +enum Value { + Val(Vec), + Del, +} + +/// The key set for getting. +#[derive(Debug)] +pub enum Keys { + Prefix(MetaKey), + Range(MetaKey, MetaKey), + Key(MetaKey), +} + +impl Keys { + /// convert the key set for corresponding key range. 
+ pub fn into_bound(self) -> (Vec, Vec) { + match self { + Keys::Prefix(x) => { + let next = x.next_prefix().0; + ((x.0), (next)) + } + Keys::Range(start, end) => ((start.0), (end.0)), + Keys::Key(k) => { + let next = k.next().0; + ((k.0), (next)) + } + } + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index 84c2508d4ea..b9ae839b06e 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -2,15 +2,18 @@ use std::result; +use futures::executor::block_on; use kvproto::pdpb::*; mod bootstrap; +pub mod etcd; mod incompatible; mod leader_change; mod retry; mod service; mod split; +use self::etcd::{EtcdClient, KeyValue, Keys, MetaKey}; pub use self::{ bootstrap::AlreadyBootstrapped, incompatible::Incompatible, @@ -28,28 +31,62 @@ pub trait PdMocker { fn load_global_config( &self, _req: &LoadGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - let mut send = vec![]; - for r in 0..10 { - let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r)); - i.set_value(r.to_string()); - send.push(i); - } let mut res = LoadGlobalConfigResponse::default(); - res.set_items(send.into()); + let mut items = Vec::new(); + let (resp, revision) = block_on(async move { + etcd_client.lock().await.get_key(Keys::Range( + MetaKey(b"".to_vec()), + MetaKey(b"\xff".to_vec()), + )) + }); + + let values: Vec = resp + .iter() + .map(|kv| { + let mut item = GlobalConfigItem::default(); + item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); + item.set_value(String::from_utf8(kv.value().to_vec()).unwrap()); + item + }) + .collect(); + + items.extend(values); + res.set_revision(revision); + res.set_items(items.into()); Some(Ok(res)) } fn store_global_config( &self, - _: &StoreGlobalConfigRequest, + req: &StoreGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - unimplemented!() + for item in req.get_changes() { + let cli = etcd_client.clone(); + block_on(async 
move { + match item.get_kind() { + EventType::Put => { + let kv = KeyValue(MetaKey(item.get_name().into()), item.get_value().into()); + cli.lock().await.set(kv).await + } + EventType::Delete => { + let key = Keys::Key(MetaKey(item.get_name().into())); + cli.lock().await.delete(key).await + } + } + }) + .unwrap(); + } + Some(Ok(StoreGlobalConfigResponse::default())) } - fn watch_global_config(&self) -> Option> { - panic!("could not mock this function due to it should return a stream") + fn watch_global_config( + &self, + _req: &WatchGlobalConfigRequest, + ) -> Option> { + unimplemented!() } fn get_members(&self, _: &GetMembersRequest) -> Option> { diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 9e1a2b3bb0f..cb495307a1f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + str::from_utf8, sync::{ atomic::{AtomicI64, Ordering}, Arc, @@ -20,6 +21,7 @@ use pd_client::Error as PdError; use security::*; use super::mocker::*; +use crate::mocker::etcd::{EtcdClient, Keys, KvEventType, MetaKey}; pub struct Server { server: Option, @@ -57,6 +59,7 @@ impl Server { default_handler, case, tso_logical: Arc::new(AtomicI64::default()), + etcd_client: EtcdClient::default(), }; let mut server = Server { server: None, @@ -170,6 +173,7 @@ struct PdMock { default_handler: Arc, case: Option>, tso_logical: Arc, + etcd_client: EtcdClient, } impl Clone for PdMock { @@ -178,6 +182,7 @@ impl Clone for PdMock { default_handler: Arc::clone(&self.default_handler), case: self.case.clone(), tso_logical: self.tso_logical.clone(), + etcd_client: self.etcd_client.clone(), } } } @@ -189,39 +194,71 @@ impl Pd for PdMock { req: LoadGlobalConfigRequest, sink: UnarySink, ) { - hijack_unary(self, ctx, sink, |c| c.load_global_config(&req)) + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| c.load_global_config(&req, 
cli.clone())) } fn store_global_config( &mut self, - _ctx: RpcContext<'_>, - _req: StoreGlobalConfigRequest, - _sink: UnarySink, + ctx: RpcContext<'_>, + req: StoreGlobalConfigRequest, + sink: UnarySink, ) { - unimplemented!() + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| { + c.store_global_config(&req, cli.clone()) + }) } fn watch_global_config( &mut self, ctx: RpcContext<'_>, - _req: WatchGlobalConfigRequest, + req: WatchGlobalConfigRequest, mut sink: ServerStreamingSink, ) { - ctx.spawn(async move { - let mut name: usize = 0; - loop { + let cli = self.etcd_client.clone(); + let future = async move { + let mut watcher = match cli + .lock() + .await + .watch( + Keys::Range(MetaKey(b"".to_vec()), MetaKey(b"\xff".to_vec())), + req.revision, + ) + .await + { + Ok(w) => w, + Err(err) => { + error!("failed to watch: {:?}", err); + return; + } + }; + + while let Some(event) = watcher.as_mut().recv().await { + info!("watch event from etcd"; "event" => ?event); let mut change = GlobalConfigItem::new(); - change.set_name(format!("/global/config/{:?}", name).to_owned()); - change.set_value(format!("{:?}", name)); + change.set_kind(match event.kind { + KvEventType::Put => EventType::Put, + KvEventType::Delete => EventType::Delete, + }); + change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); + change.set_value(from_utf8(event.pair.value()).unwrap().to_string()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); - // simulate network delay - std::thread::sleep(Duration::from_millis(10)); - name += 1; let _ = sink.send((wc, WriteFlags::default())).await; let _ = sink.flush().await; + #[cfg(feature = "failpoints")] + { + use futures::executor::block_on; + let cli_clone = cli.clone(); + fail_point!("watch_global_config_return", |_| { + block_on(async move { cli_clone.lock().await.clear_subs() }); + watcher.close(); + }); + } } - }) + }; + ctx.spawn(future); } fn get_members( diff --git 
a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index e761fac8bb5..26dbf495f54 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -405,6 +405,13 @@ impl Worker { }); } + pub fn spawn_async_task(&self, f: F) + where + F: Future + Send + 'static, + { + self.remote.spawn(f); + } + fn delay_notify(tx: UnboundedSender>, timeout: Duration) { let now = Instant::now(); let f = GLOBAL_TIMER_HANDLE diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 7dd767d19c9..92942fa90f9 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -97,60 +97,6 @@ fn test_pd_client_deadlock() { fail::remove(pd_client_reconnect_fp); } -#[test] -fn test_load_global_config() { - let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client.load_global_config("global".to_string()).await - }); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) - } -} - -#[test] -fn test_watch_global_config_on_closed_server() { - let (mut server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - use futures::StreamExt; - let j = std::thread::spawn(move || { - let mut r = client.watch_global_config().unwrap(); - block_on(async move { - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) - } - } - } - } - }); - }); - 
thread::sleep(Duration::from_millis(200)); - server.stop(); - j.join().unwrap(); -} - // Updating pd leader may be slow, we need to make sure it does not block other // RPC in the same gRPC Environment. #[test] diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index 172db8ac09e..3638e448bd9 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -7,11 +7,11 @@ use std::{ }; use grpcio::EnvBuilder; -use kvproto::metapb::*; +use kvproto::{metapb::*, pdpb::GlobalConfigItem}; use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; -use tikv_util::config::ReadableDuration; +use tikv_util::{config::ReadableDuration, worker::Builder}; fn new_test_server_and_client( update_interval: ReadableDuration, @@ -108,57 +108,90 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = - futures::executor::block_on( - async move { client.load_global_config("global".into()).await }, - ); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let check_items = global_items.clone(); + if let Err(err) = futures::executor::block_on( + client.store_global_config( + String::from("global"), + global_items + .iter() + .map(|(name, value)| { + let mut item = GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_value(value.to_string()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); } + + let (res, revision) = + futures::executor::block_on(client.load_global_config(String::from("global"))).unwrap(); + assert!( + res.iter() + .zip(check_items) + .all(|(item1, item2)| item1.name == item2.0 && 
item1.value == item2.1) + ); + assert_eq!(revision, 3); } #[test] fn test_watch_global_config_on_closed_server() { let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let items_clone = global_items.clone(); + let client = Arc::new(client); + let cli_clone = client.clone(); use futures::StreamExt; - let j = std::thread::spawn(move || { - futures::executor::block_on(async move { - let mut r = client.watch_global_config().unwrap(); - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) + let background_worker = Builder::new("background").thread_count(1).create(); + background_worker.spawn_async_task(async move { + match cli_clone.watch_global_config("global".into(), 0) { + Ok(mut stream) => { + let mut i: usize = 0; + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + for item in r.get_changes() { + assert_eq!(item.get_name(), items_clone[i].0); + assert_eq!(item.get_value(), items_clone[i].1); + i += 1; + } } + Err(err) => panic!("failed to get stream, err: {:?}", err), } } } - }); + Err(err) => { + if !err.to_string().contains("UNAVAILABLE") { + // Not 14-UNAVAILABLE + panic!("other error occur {:?}", err) + } + } + } }); - thread::sleep(Duration::from_millis(200)); + + if let Err(err) = futures::executor::block_on( + client.store_global_config( + "global".into(), + global_items + .iter() + .map(|(name, value)| { + let mut item = 
GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_value(value.to_string()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); + } + + thread::sleep(Duration::from_millis(100)); server.stop(); - j.join().unwrap(); } // Updating pd leader may be slow, we need to make sure it does not block other From 9726e56e5b667649504e3ec636f12843bc94ff8d Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 20 Jan 2023 22:43:49 +0800 Subject: [PATCH 468/676] batch-system: add priority scheduling for batch system (#14065) ref tikv/tikv#13730 Support priority-based scheduling for the apply batch system. Signed-off-by: Connor1996 --- Cargo.lock | 8 +- components/batch-system/Cargo.toml | 2 + .../batch-system/benches/batch-system.rs | 6 +- components/batch-system/benches/router.rs | 2 +- components/batch-system/src/batch.rs | 101 ++----- components/batch-system/src/channel.rs | 252 ++++++++++++++++++ components/batch-system/src/fsm.rs | 18 +- components/batch-system/src/lib.rs | 3 +- components/batch-system/src/mailbox.rs | 2 + components/batch-system/src/test_runner.rs | 19 +- components/batch-system/tests/cases/batch.rs | 105 +++++++- components/batch-system/tests/cases/router.rs | 4 +- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/batch/store.rs | 2 +- .../raftstore-v2/src/operation/command/mod.rs | 1 + components/raftstore-v2/src/router/message.rs | 5 + components/raftstore/Cargo.toml | 1 + .../raftstore/src/store/entry_storage.rs | 7 + components/raftstore/src/store/fsm/apply.rs | 55 +++- components/raftstore/src/store/fsm/peer.rs | 6 +- components/raftstore/src/store/fsm/store.rs | 17 +- components/raftstore/src/store/msg.rs | 5 + components/raftstore/src/store/peer.rs | 5 +- components/raftstore/src/store/util.rs | 38 ++- .../src/store/worker/refresh_config.rs | 2 +- .../resource_control/src/resource_group.rs | 15 +- components/server/src/server.rs | 2 +- components/test_raftstore/src/cluster.rs | 12 +- 
components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 8 +- components/tikv_util/Cargo.toml | 2 +- .../tikv_util/src/mpsc/priority_queue.rs | 46 ++-- .../integrations/config/dynamic/raftstore.rs | 2 +- tests/integrations/config/dynamic/snap.rs | 3 +- .../integrations/raftstore/test_bootstrap.rs | 2 +- 35 files changed, 614 insertions(+), 147 deletions(-) create mode 100644 components/batch-system/src/channel.rs diff --git a/Cargo.lock b/Cargo.lock index ee047aaae6d..d288af846a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -581,9 +581,11 @@ dependencies = [ "derive_more", "fail", "file_system", + "kvproto", "lazy_static", "online_config", "prometheus", + "resource_control", "serde", "serde_derive", "slog", @@ -2727,7 +2729,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#adcf4c414bfd0ccf18436b377430aa2450fd4c81" +source = "git+https://github.com/pingcap/kvproto.git#009f31598ac3200dc8b32e18f96fc4deb7b32e48" dependencies = [ "futures 0.3.15", "grpcio", @@ -4312,6 +4314,7 @@ dependencies = [ "raft", "raft-proto", "rand 0.8.5", + "resource_control", "resource_metering", "serde", "serde_derive", @@ -4362,6 +4365,7 @@ dependencies = [ "raft-proto", "raftstore", "rand 0.8.5", + "resource_control", "resource_metering", "slog", "slog-global", @@ -6565,7 +6569,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", - "parking_lot 0.12.1", + "parking_lot_core 0.9.1", "pin-project", "procfs", "procinfo", diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 7fe5798f833..75a0230c188 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -13,9 +13,11 @@ crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" online_config = { workspace = true } prometheus = { 
version = "0.13", default-features = false, features = ["nightly"] } +resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/batch-system/benches/batch-system.rs b/components/batch-system/benches/batch-system.rs index c248eabaf04..9edf72f0ff9 100644 --- a/components/batch-system/benches/batch-system.rs +++ b/components/batch-system/benches/batch-system.rs @@ -20,7 +20,7 @@ fn end_hook(tx: &std::sync::mpsc::Sender<()>) -> Message { fn bench_spawn_many(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 32; const MESSAGE_LIMIT: usize = 256; @@ -55,7 +55,7 @@ fn bench_spawn_many(c: &mut Criterion) { fn bench_imbalance(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 10; const MESSAGE_LIMIT: usize = 512; @@ -92,7 +92,7 @@ fn bench_imbalance(c: &mut Criterion) { fn bench_fairness(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let state_cnt = Arc::new(AtomicUsize::new(0)); for id in 0..10 { diff --git a/components/batch-system/benches/router.rs b/components/batch-system/benches/router.rs index 
3dd7e282e15..e25ee58b94d 100644 --- a/components/batch-system/benches/router.rs +++ b/components/batch-system/benches/router.rs @@ -8,7 +8,7 @@ use criterion::*; fn bench_send(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let (normal_tx, normal_fsm) = Runner::new(100000); let normal_box = BasicMailbox::new(normal_tx, normal_fsm, Arc::default()); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 4d935ad4819..48ef809d421 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -15,15 +15,16 @@ use std::{ time::Duration, }; -use crossbeam::channel::{self, SendError}; use fail::fail_point; use file_system::{set_io_type, IoType}; +use resource_control::ResourceController; use tikv_util::{ debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, - time::Instant, warn, + time::Instant, }; use crate::{ + channel::{fsm_channel, ControlScheduler, FsmReceiver, FsmSender, NormalScheduler}, config::Config, fsm::{Fsm, FsmScheduler, Priority}, mailbox::BasicMailbox, @@ -37,60 +38,6 @@ pub enum FsmTypes { // Used as a signal that scheduler should be shutdown. Empty, } - -// A macro to introduce common definition of scheduler. -macro_rules! 
impl_sched { - ($name:ident, $ty:path,Fsm = $fsm:tt) => { - pub struct $name { - sender: channel::Sender>, - low_sender: channel::Sender>, - } - - impl Clone for $name { - #[inline] - fn clone(&self) -> $name { - $name { - sender: self.sender.clone(), - low_sender: self.low_sender.clone(), - } - } - } - - impl FsmScheduler for $name - where - $fsm: Fsm, - { - type Fsm = $fsm; - - #[inline] - fn schedule(&self, fsm: Box) { - let sender = match fsm.get_priority() { - Priority::Normal => &self.sender, - Priority::Low => &self.low_sender, - }; - match sender.send($ty(fsm)) { - Ok(()) => {} - // TODO: use debug instead. - Err(SendError($ty(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. - for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - let _ = self.low_sender.send(FsmTypes::Empty); - } - } - } - }; -} - -impl_sched!(NormalScheduler, FsmTypes::Normal, Fsm = N); -impl_sched!(ControlScheduler, FsmTypes::Control, Fsm = C); - pub struct NormalFsm { fsm: Box, timer: Instant, @@ -168,7 +115,7 @@ impl Batch { /// /// When pending messages of the FSM is different than `expected_len`, /// attempts to schedule it in this poller again. Returns the `fsm` if the - /// re-scheduling suceeds. + /// re-scheduling succeeds. fn release(&mut self, mut fsm: NormalFsm, expected_len: usize) -> Option> { let mailbox = fsm.take_mailbox().unwrap(); mailbox.release(fsm.fsm); @@ -341,7 +288,7 @@ pub trait PollHandler: Send + 'static { /// Internal poller that fetches batch and call handler hooks for readiness. 
pub struct Poller { pub router: Router, ControlScheduler>, - pub fsm_receiver: channel::Receiver>, + pub fsm_receiver: FsmReceiver, pub handler: Handler, pub max_batch_size: usize, pub reschedule_duration: Duration, @@ -534,8 +481,8 @@ pub trait HandlerBuilder { pub struct BatchSystem { name_prefix: Option, router: BatchRouter, - receiver: channel::Receiver>, - low_receiver: channel::Receiver>, + receiver: FsmReceiver, + low_receiver: FsmReceiver, pool_size: usize, max_batch_size: usize, workers: Arc>>>, @@ -649,15 +596,15 @@ where } } -struct PoolStateBuilder { +struct PoolStateBuilder { max_batch_size: usize, reschedule_duration: Duration, - fsm_receiver: channel::Receiver>, - fsm_sender: channel::Sender>, + fsm_receiver: FsmReceiver, + fsm_sender: FsmSender, pool_size: usize, } -impl PoolStateBuilder { +impl PoolStateBuilder { fn build>( self, name_prefix: String, @@ -683,11 +630,11 @@ impl PoolStateBuilder { } } -pub struct PoolState> { +pub struct PoolState> { pub name_prefix: String, pub handler_builder: H, - pub fsm_receiver: channel::Receiver>, - pub fsm_sender: channel::Sender>, + pub fsm_receiver: FsmReceiver, + pub fsm_sender: FsmSender, pub low_priority_pool_size: usize, pub expected_pool_size: usize, pub workers: Arc>>>, @@ -707,32 +654,32 @@ pub fn create_system( cfg: &Config, sender: mpsc::LooseBoundedSender, controller: Box, + resource_ctl: Option>, ) -> (BatchRouter, BatchSystem) { let state_cnt = Arc::new(AtomicUsize::new(0)); let control_box = BasicMailbox::new(sender, controller, state_cnt.clone()); - let (tx, rx) = channel::unbounded(); - let (tx2, rx2) = channel::unbounded(); + let (sender, receiver) = fsm_channel(resource_ctl); + let (low_sender, low_receiver) = fsm_channel(None); // no resource control for low fsm let normal_scheduler = NormalScheduler { - sender: tx.clone(), - low_sender: tx2.clone(), + sender: sender.clone(), + low_sender, }; let control_scheduler = ControlScheduler { - sender: tx.clone(), - low_sender: tx2, + sender: 
sender.clone(), }; let pool_state_builder = PoolStateBuilder { max_batch_size: cfg.max_batch_size(), reschedule_duration: cfg.reschedule_duration.0, - fsm_receiver: rx.clone(), - fsm_sender: tx, + fsm_receiver: receiver.clone(), + fsm_sender: sender, pool_size: cfg.pool_size, }; let router = Router::new(control_box, normal_scheduler, control_scheduler, state_cnt); let system = BatchSystem { name_prefix: None, router: router.clone(), - receiver: rx, - low_receiver: rx2, + receiver, + low_receiver, pool_size: cfg.pool_size, max_batch_size: cfg.max_batch_size(), workers: Arc::new(Mutex::new(Vec::new())), diff --git a/components/batch-system/src/channel.rs b/components/batch-system/src/channel.rs new file mode 100644 index 00000000000..094b6a7a2ae --- /dev/null +++ b/components/batch-system/src/channel.rs @@ -0,0 +1,252 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cell::RefCell, sync::Arc}; + +use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; +use kvproto::kvrpcpb::CommandPri; +use resource_control::{ResourceConsumeType, ResourceController}; +use tikv_util::{mpsc::priority_queue, warn}; + +use crate::{ + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, + FsmTypes, +}; + +pub fn fsm_channel( + resource_ctl: Option>, +) -> (FsmSender, FsmReceiver) { + if let Some(ctl) = resource_ctl { + let (tx, rx) = priority_queue::unbounded(); + ( + FsmSender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + FsmReceiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::unbounded(); + (FsmSender::Vanilla(tx), FsmReceiver::Vanilla(rx)) + } +} + +pub struct NormalScheduler { + pub(crate) sender: FsmSender, + pub(crate) low_sender: FsmSender, +} + +impl Clone for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + NormalScheduler { + sender: self.sender.clone(), + low_sender: self.low_sender.clone(), + } + } +} + +impl FsmScheduler for 
NormalScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = N; + + fn consume_msg_resource(&self, msg: &::Message) { + self.sender.consume_msg_resource(msg); + } + + #[inline] + fn schedule(&self, fsm: Box) { + let sender = match fsm.get_priority() { + Priority::Normal => &self.sender, + Priority::Low => &self.low_sender, + }; + + match sender.send(FsmTypes::Normal(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + let _ = self.low_sender.send(FsmTypes::Empty); + } + } +} + +pub struct ControlScheduler { + pub(crate) sender: FsmSender, +} + +impl Clone for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + ControlScheduler { + sender: self.sender.clone(), + } + } +} + +impl FsmScheduler for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = C; + + fn consume_msg_resource(&self, _msg: &::Message) {} + + #[inline] + fn schedule(&self, fsm: Box) { + match self.sender.send(FsmTypes::Control(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + } + } +} + +pub enum FsmSender { + Vanilla(channel::Sender>), + Priority { + resource_ctl: Arc, + sender: priority_queue::Sender>, + last_msg_group: RefCell, + }, +} + +impl Clone for FsmSender +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmSender::Vanilla(sender) => FsmSender::Vanilla(sender.clone()), + FsmSender::Priority { + resource_ctl, + sender, + .. 
+ } => FsmSender::Priority { + resource_ctl: resource_ctl.clone(), + sender: sender.clone(), + last_msg_group: RefCell::new(String::new()), + }, + } + } +} + +impl FsmSender { + pub fn send(&self, fsm: FsmTypes) -> Result<(), SendError>> { + match self { + FsmSender::Vanilla(sender) => sender.send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + // TODO: pass different priority + let pri = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.send(fsm, pri) + } + } + } + + pub fn try_send(&self, fsm: FsmTypes) -> Result<(), TrySendError>> { + match self { + FsmSender::Vanilla(sender) => sender.try_send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let priority = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.try_send(fsm, priority) + } + } + } + + fn consume_msg_resource(&self, msg: &N::Message) { + match self { + FsmSender::Vanilla(_) => {} + FsmSender::Priority { + resource_ctl, + last_msg_group, + .. 
+ } => { + if let Some(mut groups) = msg.get_resource_consumptions() { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for (group_name, write_bytes) in groups.drain() { + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + } + *last_msg_group.borrow_mut() = dominant_group; + } + } + } + } +} + +pub enum FsmReceiver { + Vanilla(channel::Receiver>), + Priority(priority_queue::Receiver>), +} + +impl Clone for FsmReceiver +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmReceiver::Vanilla(receiver) => FsmReceiver::Vanilla(receiver.clone()), + FsmReceiver::Priority(receiver) => FsmReceiver::Priority(receiver.clone()), + } + } +} + +impl FsmReceiver { + pub fn recv(&self) -> Result, RecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.recv(), + FsmReceiver::Priority(receiver) => receiver.recv(), + } + } + + pub fn try_recv(&self) -> Result, TryRecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.try_recv(), + FsmReceiver::Priority(receiver) => receiver.try_recv(), + } + } +} diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 09e32333c96..5d9e009fa01 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -10,6 +10,8 @@ use std::{ usize, }; +use collections::HashMap; + use crate::mailbox::BasicMailbox; #[derive(Clone, Copy, Debug, PartialEq)] @@ -24,15 +26,26 @@ pub trait FsmScheduler { /// Schedule a Fsm for later handling. fn schedule(&self, fsm: Box); + /// Shutdown the scheduler, which indicates that resources like /// background thread pool should be released. fn shutdown(&self); + + /// Consume the resources of msg in resource controller if enabled, + /// otherwise do nothing. 
+ fn consume_msg_resource(&self, msg: &::Message); +} + +pub trait ResourceMetered { + fn get_resource_consumptions(&self) -> Option> { + None + } } /// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. -pub trait Fsm { - type Message: Send; +pub trait Fsm: Send + 'static { + type Message: Send + ResourceMetered; fn is_stopped(&self) -> bool; @@ -42,6 +55,7 @@ pub trait Fsm { Self: Sized, { } + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index 9a307a534ac..f4f799dcc9a 100644 --- a/components/batch-system/src/lib.rs +++ b/components/batch-system/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod batch; +mod channel; mod config; mod fsm; mod mailbox; @@ -16,7 +17,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, FsmScheduler, Priority}, + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/batch-system/src/mailbox.rs b/components/batch-system/src/mailbox.rs index 5afddf73c14..869031392af 100644 --- a/components/batch-system/src/mailbox.rs +++ b/components/batch-system/src/mailbox.rs @@ -75,6 +75,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), SendError> { + scheduler.consume_msg_resource(&msg); self.sender.force_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) @@ -89,6 +90,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), TrySendError> { + scheduler.consume_msg_resource(&msg); self.sender.try_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) diff --git a/components/batch-system/src/test_runner.rs 
b/components/batch-system/src/test_runner.rs index 6be64d5d695..a3ae80dc55a 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -11,10 +11,11 @@ use std::{ }, }; +use collections::HashMap; use derive_more::{Add, AddAssign}; use tikv_util::mpsc; -use crate::*; +use crate::{fsm::ResourceMetered, *}; /// Message `Runner` can accepts. pub enum Message { @@ -22,6 +23,21 @@ pub enum Message { Loop(usize), /// `Runner` will call the callback directly. Callback(Box), + /// group name, write bytes + Resource(String, u64), +} + +impl ResourceMetered for Message { + fn get_resource_consumptions(&self) -> Option> { + match self { + Message::Resource(group_name, bytes) => { + let mut map = HashMap::default(); + map.insert(group_name.to_owned(), *bytes); + Some(map) + } + _ => None, + } + } } /// A simple runner used for benchmarking only. @@ -102,6 +118,7 @@ impl Handler { } } Ok(Message::Callback(cb)) => cb(self, r), + Ok(Message::Resource(..)) => {} Err(_) => break, } } diff --git a/components/batch-system/tests/cases/batch.rs b/components/batch-system/tests/cases/batch.rs index f950df68b8d..dc13affc363 100644 --- a/components/batch-system/tests/cases/batch.rs +++ b/components/batch-system/tests/cases/batch.rs @@ -7,13 +7,15 @@ use std::{ }; use batch_system::{test_runner::*, *}; +use kvproto::resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}; +use resource_control::ResourceGroupManager; use tikv_util::mpsc; #[test] fn test_batch() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); let metrics = builder.metrics.clone(); system.spawn("test".to_owned(), builder); @@ -55,7 +57,7 @@ fn test_batch() { fn test_priority() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) 
= - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); let (tx, rx) = mpsc::unbounded(); @@ -101,3 +103,102 @@ fn test_priority() { .unwrap(); assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(3)); } + +#[test] +fn test_resource_group() { + let (control_tx, control_fsm) = Runner::new(10); + let resource_manager = ResourceGroupManager::default(); + + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + group + }; + + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let mut cfg = Config::default(); + cfg.pool_size = 1; + let (router, mut system) = batch_system::create_system( + &cfg, + control_tx, + control_fsm, + Some(resource_manager.derive_controller("test".to_string(), false)), + ); + let builder = Builder::new(); + system.spawn("test".to_owned(), builder); + let (tx, rx) = mpsc::unbounded(); + let tx_ = tx.clone(); + let r = router.clone(); + let state_cnt = Arc::new(AtomicUsize::new(0)); + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + let (tx, runner) = Runner::new(10); + r.register(1, BasicMailbox::new(tx, runner, state_cnt.clone())); + let (tx2, runner2) = Runner::new(10); + r.register(2, BasicMailbox::new(tx2, runner2, state_cnt)); + tx_.send(0).unwrap(); + }, + ))) + .unwrap(); + 
assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + let tx_ = tx.clone(); + let (tx1, rx1) = std::sync::mpsc::sync_channel(0); + // block the thread + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + tx_.send(0).unwrap(); + tx1.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + router + .send(1, Message::Resource("group1".to_string(), 1)) + .unwrap(); + let tx_ = tx.clone(); + router + .send( + 1, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx_.send(1).unwrap(); + })), + ) + .unwrap(); + + router + .send(2, Message::Resource("group2".to_string(), 1)) + .unwrap(); + router + .send( + 2, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx.send(2).unwrap(); + })), + ) + .unwrap(); + + // pause the blocking thread + assert_eq!(rx1.recv_timeout(Duration::from_secs(3)), Ok(0)); + + // should recv from group2 first, because group2 has more tokens and it would be + // handled with higher priority. 
+ assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(2)); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(1)); +} diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index 543937fa8ef..d746dfad5cb 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -30,7 +30,7 @@ fn test_basic() { let (control_drop_tx, control_drop_rx) = mpsc::unbounded(); control_fsm.sender = Some(control_drop_tx); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); @@ -130,7 +130,7 @@ fn test_basic() { fn test_router_trace() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 6726c5ed742..5b917b9ddf7 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -52,6 +52,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft-proto = { version = "0.7.0" } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 280e8dcc396..1c7360a86bc 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -749,7 +749,7 @@ where { let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); let (router, 
system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm, None); let system = StoreSystem { system, workers: None, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index cf29d9ee25a..edca9510c27 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -590,6 +590,7 @@ impl Apply { AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } + AdminCmdType::UpdateGcPeer => unimplemented!(), }; match admin_result { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index c1e5f0d37dc..a9353e171d9 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -2,6 +2,7 @@ // #[PerformanceCriticalPath] +use batch_system::ResourceMetered; use kvproto::{ metapb, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, @@ -197,6 +198,8 @@ pub enum PeerMsg { WaitFlush(super::FlushChannel), } +impl ResourceMetered for PeerMsg {} + impl PeerMsg { pub fn raft_query(req: RaftCmdRequest) -> (Self, QueryResSubscriber) { let (ch, sub) = QueryResChannel::pair(); @@ -259,3 +262,5 @@ pub enum StoreMsg { Start, StoreUnreachable { to_store_id: u64 }, } + +impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 548693b71ac..8df501f279d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -72,6 +72,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0", default-features = false } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } serde = "1.0" serde_derive = 
"1.0" diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index bc85ecedc34..afa13730ccf 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -69,6 +69,13 @@ impl CachedEntries { } } + pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { + let entries = self.entries.lock().unwrap(); + for entry in &entries.0 { + f(entry); + } + } + /// Take cached entries and dangle size for them. `dangle` means not in /// entry cache. pub fn take_entries(&self) -> (Vec, usize) { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 9f2d234010f..22a42393173 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -24,7 +24,7 @@ use std::{ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, - HandlerBuilder, PollHandler, Priority, + HandlerBuilder, PollHandler, Priority, ResourceMetered, }; use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; @@ -46,11 +46,12 @@ use kvproto::{ }; use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; -use protobuf::{wire_format::WireType, CodedInputStream}; +use protobuf::{wire_format::WireType, CodedInputStream, Message}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; use raft_proto::ConfChangeI; +use resource_control::ResourceController; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -1695,6 +1696,7 @@ where } AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), + AdminCmdType::UpdateGcPeer => unimplemented!(), }?; 
response.set_cmd_type(cmd_type); @@ -3709,6 +3711,26 @@ where }, } +impl ResourceMetered for Msg { + fn get_resource_consumptions(&self) -> Option> { + match self { + Msg::Apply { apply, .. } => { + let mut map = HashMap::default(); + for cached_entries in &apply.entries { + cached_entries.iter_entries(|entry| { + // TODO: maybe use a more efficient way to get the resource group name. + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + *map.entry(group_name).or_default() += entry.compute_size() as u64; + }); + } + Some(map) + } + _ => None, + } + } +} + impl Msg where EK: KvEngine, @@ -4406,6 +4428,7 @@ pub enum ControlMsg { }, } +impl ResourceMetered for ControlMsg {} pub struct ControlFsm { receiver: Receiver, stopped: bool, @@ -4834,10 +4857,15 @@ impl ApplyBatchSystem { pub fn create_apply_batch_system( cfg: &Config, + resource_ctl: Option>, ) -> (ApplyRouter, ApplyBatchSystem) { let (control_tx, control_fsm) = ControlFsm::new(); - let (router, system) = - batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); + let (router, system) = batch_system::create_system( + &cfg.apply_batch_system, + control_tx, + control_fsm, + resource_ctl, + ); (ApplyRouter { router }, ApplyBatchSystem { system }) } @@ -4984,6 +5012,7 @@ mod tests { cmd.mut_put().set_key(b"key".to_vec()); cmd.mut_put().set_value(b"value".to_vec()); let mut req = RaftCmdRequest::default(); + req.set_header(RaftRequestHeader::default()); req.mut_requests().push(cmd); e.set_data(req.write_to_bytes().unwrap().into()) } @@ -5251,7 +5280,7 @@ mod tests { let (_dir, importer) = create_tmp_importer("apply-basic"); let (region_scheduler, mut snapshot_rx) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = 
Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5715,7 +5744,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6054,7 +6083,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6145,7 +6174,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6325,7 +6354,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6418,7 +6447,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut 
system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-exec-observer".to_owned(), @@ -6642,7 +6671,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6922,7 +6951,7 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(obs)); let (region_scheduler, _) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -7148,7 +7177,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "flashback_need_to_be_applied".to_owned(), diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d405c3471af..4266e400cd3 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2702,7 +2702,7 @@ where } let mut resp = ExtraMessage::default(); resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); - 
resp.voter_replicated_index = voter_replicated_idx; + resp.index = voter_replicated_idx; self.fsm .peer .send_extra_message(resp, &mut self.ctx.trans, from); @@ -2719,7 +2719,7 @@ where if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { return; } - let voter_replicated_index = msg.voter_replicated_index; + let voter_replicated_index = msg.index; if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { self.ctx.apply_router.schedule_task( self.region_id(), @@ -2787,6 +2787,8 @@ where ExtraMessageType::MsgVoterReplicatedIndexResponse => { self.on_voter_replicated_index_response(msg.get_extra_msg()); } + ExtraMessageType::MsgGcPeerRequest => unimplemented!(), + ExtraMessageType::MsgGcPeerResponse => unimplemented!(), } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2ca573824f9..e68873cadf1 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -42,6 +42,7 @@ use kvproto::{ use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -1795,11 +1796,21 @@ impl RaftBatchSystem { pub fn create_raft_batch_system( cfg: &Config, + resource_manager: &Option>, ) -> (RaftRouter, RaftBatchSystem) { let (store_tx, store_fsm) = StoreFsm::new(cfg); - let (apply_router, apply_system) = create_apply_batch_system(cfg); - let (router, system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + let (apply_router, apply_system) = create_apply_batch_system( + cfg, + resource_manager + .as_ref() + .map(|m| m.derive_controller("apply".to_owned(), false)), + ); + let (router, system) = batch_system::create_system( + &cfg.store_batch_system, + store_tx, + store_fsm, + None, // Do not do priority scheduling for store 
batch system + ); let raft_router = RaftRouter { router }; let system = RaftBatchSystem { system, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 3c555689cb9..195a94478dc 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::{borrow::Cow, fmt}; +use batch_system::ResourceMetered; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; @@ -772,6 +773,8 @@ pub enum PeerMsg { Destroy(u64), } +impl ResourceMetered for PeerMsg {} + impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -867,6 +870,8 @@ where }, } +impl ResourceMetered for StoreMsg {} + impl fmt::Debug for StoreMsg where EK: KvEngine, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index e2a914fded6..44701fbf705 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4843,7 +4843,7 @@ where return; } if let Some(ref state) = self.pending_merge_state { - if state.get_commit() == extra_msg.get_premerge_commit() { + if state.get_commit() == extra_msg.get_index() { self.add_want_rollback_merge_peer(peer_id); } } @@ -5438,7 +5438,7 @@ where }; let mut extra_msg = ExtraMessage::default(); extra_msg.set_type(ExtraMessageType::MsgWantRollbackMerge); - extra_msg.set_premerge_commit(premerge_commit); + extra_msg.set_index(premerge_commit); self.send_extra_message(extra_msg, &mut ctx.trans, &to_peer); } @@ -5795,6 +5795,7 @@ mod tests { AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, AdminCmdType::BatchSwitchWitness, + AdminCmdType::UpdateGcPeer, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2f61534d159..4d8128822c7 100644 --- 
a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -24,9 +24,9 @@ use kvproto::{ }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, Message}; +use protobuf::{self, CodedInputStream, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType, Snapshot}, + eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; @@ -229,6 +229,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdEpochState::new(true, true, false, false) } AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), + AdminCmdType::UpdateGcPeer => AdminCmdEpochState::new(false, false, false, false), } } @@ -725,6 +726,24 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } +pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { + if entry.get_entry_type() != EntryType::EntryNormal { + return RaftRequestHeader::default(); + } + // request header is encoded into data + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return RaftRequestHeader::default(); + } + let (field_number, _) = is.read_tag_unpack().unwrap(); + let t = is.read_message().unwrap(); + // Header field is of number 1 + if field_number != 1 { + panic!("unexpected field number: {} {:?}", field_number, t); + } + t +} + /// Parse data of entry `index`. 
/// /// # Panics @@ -1671,6 +1690,7 @@ mod tests { metapb::{self, RegionEpoch}, raft_cmdpb::AdminRequest, }; + use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; use tikv_util::store::new_peer; use time::Duration as TimeDuration; @@ -1749,6 +1769,20 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } + #[test] + fn test_get_entry_header() { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("test".to_owned()); + req.set_header(header); + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_data(req.write_to_bytes().unwrap().into()); + let header = get_entry_header(&entry); + assert_eq!(header.get_resource_group_name(), "test"); + } + #[test] fn test_timespec_u64() { let cases = vec![ diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 6555e96f102..d09a6dd9f53 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -43,7 +43,7 @@ where for _ in 0..size { if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty) { error!( - "failed to decrese thread pool"; + "failed to decrease thread pool"; "decrease to" => size, "err" => %e, ); diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 23a50b42560..1524ebcba5d 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -9,7 +9,10 @@ use std::{ }; use dashmap::{mapref::one::Ref, DashMap}; -use kvproto::resource_manager::{GroupMode, ResourceGroup}; +use kvproto::{ + kvrpcpb::CommandPri, + resource_manager::{GroupMode, ResourceGroup}, +}; use yatp::queue::priority::TaskPriorityProvider; // a read task cost at least 50us. 
@@ -97,7 +100,6 @@ impl ResourceGroupManager { let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); } - controller } @@ -243,6 +245,15 @@ impl ResourceController { // need totally accurate here. self.last_min_vt.store(max_vt, Ordering::Relaxed); } + + pub fn get_priority(&self, name: &[u8], pri: CommandPri) -> u64 { + let level = match pri { + CommandPri::Low => 2, + CommandPri::Normal => 1, + CommandPri::High => 0, + }; + self.resource_group(name).get_priority(level) + } } impl TaskPriorityProvider for ResourceController { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 207373313a4..2a479964ced 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -350,7 +350,7 @@ where }; // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); + let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 2121b7e021f..81e7129407e 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -81,7 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -176,7 +176,7 @@ pub struct Cluster { pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, - resource_manager: Arc, + resource_manager: Option>, } impl Cluster { @@ -210,7 +210,7 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), - resource_manager: Arc::new(ResourceGroupManager::default()), + resource_manager: 
Some(Arc::new(ResourceGroupManager::default())), kv_statistics: vec![], raft_statistics: vec![], } @@ -279,7 +279,8 @@ impl Cluster { // Try start new nodes. for _ in 0..self.count - self.engines.len() { - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let engines = self.dbs.last().unwrap().clone(); @@ -350,7 +351,8 @@ impl Cluster { debug!("starting node {}", node_id); let engines = self.engines[&node_id].clone(); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { cfg.server.labels = labels.to_owned(); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 9ae76dba9f8..05ed8ece83d 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -230,7 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - _resource_manager: &Arc, + _resource_manager: &Option>, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index ccf4df43497..63a0b4e4804 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -265,7 +265,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", 
cfg.prefer_mem); @@ -416,7 +416,9 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), self.get_causal_ts_provider(node_id), - Some(resource_manager.derive_controller("scheduler-worker-pool".to_owned(), true)), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -652,7 +654,7 @@ impl Simulator for ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 92f3bac3d5b..1193751b228 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -38,7 +38,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" -parking_lot = "0.12.1" +parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/tikv_util/src/mpsc/priority_queue.rs b/components/tikv_util/src/mpsc/priority_queue.rs index 3389d6154c3..fac741361db 100644 --- a/components/tikv_util/src/mpsc/priority_queue.rs +++ b/components/tikv_util/src/mpsc/priority_queue.rs @@ -7,7 +7,9 @@ use std::sync::{ use crossbeam::channel::{RecvError, SendError, TryRecvError, TrySendError}; use crossbeam_skiplist::SkipMap; -use parking_lot::{Condvar, Mutex}; +use parking_lot_core::{ + park, unpark_all, unpark_one, SpinWait, DEFAULT_PARK_TOKEN, DEFAULT_UNPARK_TOKEN, +}; // Create a priority based channel. Sender can send message with priority of // u64, and receiver will receive messages in ascending order of priority. 
For @@ -54,8 +56,6 @@ impl Drop for Cell { #[derive(Default)] struct PriorityQueue { queue: SkipMap>, - disconnected: Mutex, - available: Condvar, sequencer: AtomicU64, @@ -67,8 +67,6 @@ impl PriorityQueue { pub fn new() -> Self { Self { queue: SkipMap::new(), - disconnected: Mutex::new(false), - available: Condvar::new(), sequencer: AtomicU64::new(0), senders: AtomicUsize::new(1), receivers: AtomicUsize::new(1), @@ -81,6 +79,10 @@ impl PriorityQueue { sequence: self.sequencer.fetch_add(1, Ordering::Relaxed), } } + + fn is_disconnected(&self) -> bool { + self.senders.load(Ordering::SeqCst) == 0 + } } // When derived `PartialOrd` on structs, it will produce a lexicographic @@ -109,7 +111,10 @@ impl Sender { self.inner .queue .insert(self.inner.get_map_key(pri), Cell::new(msg)); - self.inner.available.notify_one(); + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_one(addr, |_| DEFAULT_UNPARK_TOKEN); + } Ok(()) } @@ -132,8 +137,10 @@ impl Drop for Sender { fn drop(&mut self) { let old = self.inner.senders.fetch_sub(1, Ordering::AcqRel); if old <= 1 { - *self.inner.disconnected.lock() = true; - self.inner.available.notify_all(); + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_all(addr, DEFAULT_UNPARK_TOKEN); + } } } } @@ -146,14 +153,13 @@ impl Receiver { pub fn try_recv(&self) -> Result { match self.inner.queue.pop_front() { Some(entry) => Ok(entry.value().take().unwrap()), - None if self.inner.senders.load(Ordering::SeqCst) == 0 => { - Err(TryRecvError::Disconnected) - } + None if self.inner.is_disconnected() => Err(TryRecvError::Disconnected), None => Err(TryRecvError::Empty), } } pub fn recv(&self) -> Result { + let mut spin = SpinWait::new(); loop { match self.try_recv() { Ok(msg) => return Ok(msg), @@ -161,17 +167,25 @@ impl Receiver { return Err(RecvError); } Err(TryRecvError::Empty) => { - let mut disconnected = self.inner.disconnected.lock(); - if *disconnected { - return Err(RecvError); + if 
spin.spin() { + continue; + } + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + park( + addr, + || self.len() == 0 && !self.inner.is_disconnected(), + || {}, + |_, _| {}, + DEFAULT_PARK_TOKEN, + None, + ); } - self.inner.available.wait(&mut disconnected); } } } } - #[cfg(test)] fn len(&self) -> usize { self.inner.queue.len() } diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 70e70b3cbe6..ff1babb7e1f 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -66,7 +66,7 @@ fn start_raftstore( ApplyRouter, RaftBatchSystem, ) { - let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store); + let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store, &None); let engines = create_tmp_engine(dir); let host = CoprocessorHost::default(); let importer = { diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 1a82ec8005e..af03246acf4 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -45,7 +45,8 @@ fn start_server( .name_prefix(thd_name!("test-server")) .build(), ); - let (raft_router, _) = create_raft_batch_system::(&cfg.raft_store); + let (raft_router, _) = + create_raft_batch_system::(&cfg.raft_store, &None); let mut snap_worker = Worker::new("snap-handler").lazy_build("snap-handler"); let snap_worker_scheduler = snap_worker.scheduler(); let server_config = Arc::new(VersionTrack::new(cfg.server.clone())); diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 8ede13bd0f4..ee063e0f1e7 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -42,7 +42,7 @@ fn test_node_bootstrap_with_prepared_data() { let pd_client = Arc::new(TestPdClient::new(0, false)); let cfg = 
new_tikv_config(0); - let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store); + let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store, &None); let simulate_trans = SimulateTransport::new(ChannelTransport::new()); let tmp_path = Builder::new().prefix("test_cluster").tempdir().unwrap(); let engine = From c353910ef6a296b592db6b217ec888cee34eaffc Mon Sep 17 00:00:00 2001 From: andreid-db <103079610+andreid-db@users.noreply.github.com> Date: Sat, 28 Jan 2023 20:15:54 -0800 Subject: [PATCH 469/676] config: allow starting TiKV nodes with <1 CPU (#14084) close tikv/tikv#13586, close tikv/tikv#13752, ref tikv/tikv#14017 Signed-off-by: Andrei Dragus --- components/raftstore/src/store/config.rs | 2 +- src/config/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 34f4e159dee..d6994a16ed4 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -660,7 +660,7 @@ impl Config { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as usize * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as usize; if self.apply_batch_system.pool_size == 0 || self.apply_batch_system.pool_size > limit { return Err(box_err!( "apply-pool-size should be greater than 0 and less than or equal to: {}", diff --git a/src/config/mod.rs b/src/config/mod.rs index 7878696faa5..99b593e2443 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1396,7 +1396,7 @@ impl DbConfig { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. 
- let limit = SysQuota::cpu_cores_quota() as i32 * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as i32; if self.max_background_jobs <= 0 || self.max_background_jobs > limit { return Err(format!( "max_background_jobs should be greater than 0 and less than or equal to {:?}", From 68710b99ee8f64bb353e617745c8ddf727646913 Mon Sep 17 00:00:00 2001 From: Hu# Date: Sun, 29 Jan 2023 13:49:53 +0800 Subject: [PATCH 470/676] pd_client: replace PD_REQUEST_HISTOGRAM_VEC with static metrics (#14087) close tikv/tikv#14086 PD_REQUEST_HISTOGRAM_VEC can be changed to use static metrics to improve performance. Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/batch-system/src/router.rs | 9 +- components/pd_client/Cargo.toml | 1 + components/pd_client/src/client.rs | 96 +++++++++------------ components/pd_client/src/client_v2.rs | 80 +++++++---------- components/pd_client/src/metrics.rs | 50 +++++++++-- components/raftstore/src/store/fsm/peer.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 2 +- 8 files changed, 124 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d288af846a6..95587f98565 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3696,6 +3696,7 @@ dependencies = [ "log", "log_wrappers", "prometheus", + "prometheus-static-metric", "security", "semver 0.10.0", "serde", diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index bfcb93c9d6b..ef937209531 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,7 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{ - debug, info, - lru::LruCache, - time::{duration_to_sec, Instant}, - Either, -}; +use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, @@ -322,7 +317,7 @@ where for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), 
&self.normal_scheduler); } - BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed())); + BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } /// Try to notify all FSMs that the cluster is being shutdown. diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index c25e37f23b5..f46d6111c5d 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -19,6 +19,7 @@ lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" security = { workspace = true } semver = "0.10" serde = "1.0" diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 6686c4e8a04..b0c21797a91 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -26,10 +26,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, debug, error, info, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, Either, HandyRwLock, + box_err, debug, error, info, thd_name, time::Instant, timer::GLOBAL_TIMER_HANDLE, warn, Either, + HandyRwLock, }; use txn_types::TimeStamp; use yatp::{task::future::TaskCell, ThreadPool}; @@ -193,9 +191,7 @@ impl RpcClient { &self, key: &[u8], ) -> PdFuture<(metapb::Region, Option)> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_region.start_coarse_timer(); let mut req = pdpb::GetRegionRequest::default(); req.set_header(self.header()); @@ -255,8 +251,8 @@ impl RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_async"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_async + .observe(timer.saturating_elapsed_secs()); 
check_resp_header(resp.get_header())?; let store = resp.take_store(); if store.get_state() != metapb::StoreState::Tombstone { @@ -291,7 +287,7 @@ impl PdClient for RpcClient { items: Vec, ) -> PdFuture<()> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_global_config"]) + .store_global_config .start_coarse_timer(); let mut req = pdpb::StoreGlobalConfigRequest::new(); @@ -321,7 +317,7 @@ impl PdClient for RpcClient { config_path: String, ) -> PdFuture<(Vec, i64)> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["load_global_config"]) + .load_global_config .start_coarse_timer(); let mut req = pdpb::LoadGlobalConfigRequest::new(); @@ -355,7 +351,7 @@ impl PdClient for RpcClient { revision: i64, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["watch_global_config"]) + .watch_global_config .start_coarse_timer(); let mut req = pdpb::WatchGlobalConfigRequest::default(); @@ -377,7 +373,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); let mut req = pdpb::BootstrapRequest::default(); @@ -394,7 +390,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); let mut req = pdpb::IsBootstrappedRequest::default(); @@ -409,9 +405,7 @@ impl PdClient for RpcClient { } fn alloc_id(&self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); let mut req = pdpb::AllocIdRequest::default(); req.set_header(self.header()); @@ -430,7 +424,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - 
.with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); let mut req = pdpb::IsSnapshotRecoveringRequest::default(); @@ -445,9 +439,7 @@ impl PdClient for RpcClient { } fn put_store(&self, store: metapb::Store) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); let mut req = pdpb::PutStoreRequest::default(); req.set_header(self.header()); @@ -462,9 +454,7 @@ impl PdClient for RpcClient { } fn get_store(&self, store_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_store.start_coarse_timer(); let mut req = pdpb::GetStoreRequest::default(); req.set_header(self.header()); @@ -488,9 +478,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); let mut req = pdpb::GetAllStoresRequest::default(); req.set_header(self.header()); @@ -506,7 +494,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); let mut req = pdpb::GetClusterConfigRequest::default(); @@ -558,8 +546,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if resp.has_region() { Ok(Some(resp.take_region())) @@ -600,8 +588,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = 
handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { Ok(Some((resp.take_region(), resp.take_leader()))) @@ -737,8 +725,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -775,8 +763,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -821,8 +809,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), @@ -858,8 +846,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -871,9 +859,7 @@ impl PdClient for RpcClient { } fn scatter_region(&self, mut region: RegionInfo) -> Result<()> { - let 
_timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_header(self.header()); @@ -912,8 +898,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) }) as PdFuture<_> @@ -929,9 +915,7 @@ impl PdClient for RpcClient { } fn get_operator(&self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); let mut req = pdpb::GetOperatorRequest::default(); req.set_header(self.header()); @@ -946,7 +930,7 @@ impl PdClient for RpcClient { } fn batch_get_tso(&self, count: u32) -> PdFuture { - let begin = Instant::now(); + let timer = Instant::now(); let executor = move |client: &Client, _| { // Remove Box::pin and Compat when GLOBAL_TIMER_HANDLE supports futures 0.3 let ts_fut = Compat::new(Box::pin(client.inner.rl().tso.get_timestamp(count))); @@ -965,8 +949,8 @@ impl PdClient for RpcClient { } })?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["tso"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .tso + .observe(timer.saturating_elapsed_secs()); Ok(ts) }) as PdFuture<_> }; @@ -981,7 +965,7 @@ impl PdClient for RpcClient { safe_point: TimeStamp, ttl: Duration, ) -> PdFuture<()> { - let begin = Instant::now(); + let timer = Instant::now(); let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); req.set_header(self.header()); req.set_service_id(name.into()); @@ -1003,8 +987,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = 
handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -1039,8 +1023,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 35e5c3b4785..cfa0d46303c 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -47,12 +47,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, error, info, - mpsc::future as mpsc, - slow_log, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::Instant, + timer::GLOBAL_TIMER_HANDLE, warn, }; use tokio::sync::{broadcast, mpsc as tokio_mpsc}; use txn_types::TimeStamp; @@ -835,7 +831,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -856,7 +852,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -875,9 +871,7 @@ impl PdClient for RpcClient { } fn alloc_id(&mut self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - 
.start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -902,7 +896,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -921,9 +915,7 @@ impl PdClient for RpcClient { } fn put_store(&mut self, store: metapb::Store) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -962,8 +954,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_and_stats"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_and_stats + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let store = resp.take_store(); @@ -976,9 +968,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -998,7 +988,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1037,8 +1027,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region + .observe(timer.saturating_elapsed_secs()); let mut resp = 
raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let region = if resp.has_region() { @@ -1076,8 +1066,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() { @@ -1115,8 +1105,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { @@ -1145,8 +1135,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1179,8 +1169,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1223,8 +1213,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { @@ -1257,8 +1247,8 @@ impl PdClient for RpcClient { }) .await; 
PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) @@ -1266,9 +1256,7 @@ impl PdClient for RpcClient { } fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_region_id(region.get_id()); @@ -1307,8 +1295,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_saft_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) @@ -1316,9 +1304,7 @@ impl PdClient for RpcClient { } fn get_operator(&mut self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1366,8 +1352,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) @@ -1396,8 +1382,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); let resp = 
raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index 57879a57d0e..a4ef9c5ce4e 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,14 +2,52 @@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; + +make_static_metric! { + pub label_enum PDRequestEventType { + get_region, + get_region_by_id, + get_region_leader_by_id, + scatter_region, + get_store, + get_store_async, + put_store, + get_all_stores, + get_store_and_stats, + store_global_config, + load_global_config, + watch_global_config, + bootstrap_cluster, + is_cluster_bootstrapped, + get_cluster_config, + ask_split, + ask_batch_split, + report_batch_split, + get_gc_safe_point, + update_service_safe_point, + min_resolved_ts, + get_operator, + alloc_id, + is_recovering_marked, + store_heartbeat, + tso, + } + + pub struct PDRequestEventHistogramVec: Histogram { + "type" => PDRequestEventType, + } +} lazy_static! 
{ - pub static ref PD_REQUEST_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( - "tikv_pd_request_duration_seconds", - "Bucketed histogram of PD requests duration", - &["type"] - ) - .unwrap(); + pub static ref PD_REQUEST_HISTOGRAM_VEC: PDRequestEventHistogramVec = + register_static_histogram_vec!( + PDRequestEventHistogramVec, + "tikv_pd_request_duration_seconds", + "Bucketed histogram of PD requests duration", + &["type"] + ) + .unwrap(); pub static ref PD_HEARTBEAT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_pd_heartbeat_message_total", "Total number of PD heartbeat messages.", diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 4266e400cd3..1cc603f2490 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -55,7 +55,7 @@ use tikv_util::{ mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, + time::{monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -694,7 +694,7 @@ where .raft_metrics .event_time .peer_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } #[inline] diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index e68873cadf1..26f2983998d 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -806,7 +806,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .raft_metrics .event_time .store_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } fn start(&mut self, store: metapb::Store) { From b1936e6c2d73789b05545b96b32dc22fac880a79 Mon Sep 17 00:00:00 2001 From: Zhi Qi 
<30543181+LittleFall@users.noreply.github.com> Date: Mon, 30 Jan 2023 17:25:54 +0800 Subject: [PATCH 471/676] copr: (refactor) Lift heap struct out from top_n_executor (#14096) ref tikv/tikv#13936 Signed-off-by: Zhi Qi --- .../src/top_n_executor.rs | 210 ++--------------- .../tidb_query_executors/src/util/mod.rs | 1 + .../src/util/top_n_heap.rs | 211 ++++++++++++++++++ 3 files changed, 229 insertions(+), 193 deletions(-) create mode 100644 components/tidb_query_executors/src/util/top_n_heap.rs diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 6ef8c6b2224..5ebc65baa25 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -1,20 +1,23 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; +use std::sync::Arc; use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ - codec::{ - batch::{LazyBatchColumn, LazyBatchColumnVec}, - data_type::*, - }, + codec::{batch::LazyBatchColumnVec, data_type::*}, expr::{EvalConfig, EvalContext, EvalWarnings}, }; use tidb_query_expr::{RpnExpression, RpnExpressionBuilder, RpnStackNode}; use tipb::{Expr, FieldType, TopN}; -use crate::{interface::*, util::*}; +use crate::{ + interface::*, + util::{ + top_n_heap::{HeapItemSourceData, HeapItemUnsafe, TopNHeap}, + *, + }, +}; pub struct BatchTopNExecutor { /// The heap, which contains N rows at most. @@ -22,7 +25,7 @@ pub struct BatchTopNExecutor { /// This field is placed before `eval_columns_buffer_unsafe`, `order_exprs`, /// `order_is_desc` and `src` because it relies on data in those fields /// and we want this field to be dropped first. - heap: BinaryHeap, + heap: TopNHeap, /// A collection of all evaluated columns. This is to avoid repeated /// allocations in each `next_batch()`. 
@@ -97,7 +100,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -126,7 +129,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -140,7 +143,7 @@ impl BatchTopNExecutor { } pub fn new( - config: std::sync::Arc, + config: Arc, src: Src, order_exprs_def: Vec, order_is_desc: Vec, @@ -163,8 +166,7 @@ impl BatchTopNExecutor { .collect(); Ok(Self { - // Avoid large N causing OOM - heap: BinaryHeap::with_capacity(n.min(1024)), + heap: TopNHeap::new(n), // Simply large enough to avoid repeated allocations eval_columns_buffer_unsafe: Box::new(Vec::with_capacity(512)), order_exprs: order_exprs.into_boxed_slice(), @@ -182,7 +184,7 @@ impl BatchTopNExecutor { async fn handle_next_batch(&mut self) -> Result> { // Use max batch size from the beginning because top N // always needs to calculate over all data. - let src_result = self.src.next_batch(crate::runner::BATCH_MAX_SIZE).await; + let src_result = self.src.next_batch(BATCH_MAX_SIZE).await; self.context.warnings = src_result.warnings; @@ -193,7 +195,7 @@ impl BatchTopNExecutor { } if src_is_drained { - Ok(Some(self.heap_take_all())) + Ok(Some(self.heap.take_all())) } else { Ok(None) } @@ -240,84 +242,11 @@ impl BatchTopNExecutor { eval_columns_offset: eval_offset, logical_row_index, }; - self.heap_add_row(row)?; - } - - Ok(()) - } - - fn heap_add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { - if self.heap.len() < self.n { - // HeapItemUnsafe must be checked valid to compare in advance, or else it may - // panic inside BinaryHeap. - row.cmp_sort_key(&row)?; - - // Push into heap when heap is not full. 
- self.heap.push(row); - } else { - // Swap the greatest row in the heap if this row is smaller than that row. - let mut greatest_row = self.heap.peek_mut().unwrap(); - if row.cmp_sort_key(&greatest_row)? == Ordering::Less { - *greatest_row = row; - } + self.heap.add_row(row)?; } Ok(()) } - - #[allow(clippy::clone_on_copy)] - fn heap_take_all(&mut self) -> LazyBatchColumnVec { - let heap = std::mem::take(&mut self.heap); - let sorted_items = heap.into_sorted_vec(); - if sorted_items.is_empty() { - return LazyBatchColumnVec::empty(); - } - - let mut result = sorted_items[0] - .source_data - .physical_columns - .clone_empty(sorted_items.len()); - - for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { - match result_column { - LazyBatchColumn::Raw(dest_column) => { - for item in &sorted_items { - let src = item.source_data.physical_columns[column_index].raw(); - dest_column - .push(&src[item.source_data.logical_rows[item.logical_row_index]]); - } - } - LazyBatchColumn::Decoded(dest_vector_value) => { - match_template::match_template! { - TT = [ - Int, - Real, - Duration, - Decimal, - DateTime, - Bytes => BytesRef, - Json => JsonRef, - Enum => EnumRef, - Set => SetRef, - ], - match dest_vector_value { - VectorValue::TT(dest_column) => { - for item in &sorted_items { - let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); - let src_ref = TT::borrow_vector_value(src); - // TODO: This clone is not necessary. - dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); - } - }, - } - } - } - } - } - - result.assert_columns_equal_length(); - result - } } #[async_trait] @@ -402,111 +331,6 @@ impl BatchExecutor for BatchTopNExecutor { } } -struct HeapItemSourceData { - physical_columns: LazyBatchColumnVec, - logical_rows: Vec, -} - -/// The item in the heap of `BatchTopNExecutor`. 
-/// -/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is -/// valid (i.e. not dropped). Thus it is called unsafe. -struct HeapItemUnsafe { - /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. - order_is_desc_ptr: NonNull<[bool]>, - - /// A pointer to the `order_exprs_field_type` field in `order_exprs`. - order_exprs_field_type_ptr: NonNull<[FieldType]>, - - /// The source data that evaluated column in this structure is using. - source_data: Arc, - - /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. - eval_columns_buffer_ptr: NonNull>>, - - /// The begin offset of the evaluated columns stored in the buffer. - /// - /// The length of evaluated columns in the buffer is `order_is_desc.len()`. - eval_columns_offset: usize, - - /// Which logical row in the evaluated columns this heap item is - /// representing. - logical_row_index: usize, -} - -impl HeapItemUnsafe { - fn get_order_is_desc(&self) -> &[bool] { - unsafe { self.order_is_desc_ptr.as_ref() } - } - - fn get_order_exprs_field_type(&self) -> &[FieldType] { - unsafe { self.order_exprs_field_type_ptr.as_ref() } - } - - fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { - let offset_begin = self.eval_columns_offset; - let offset_end = offset_begin + len; - let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; - &vec_buf[offset_begin..offset_end] - } - - fn cmp_sort_key(&self, other: &Self) -> Result { - // Only debug assert because this function is called pretty frequently. 
- debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); - - let order_is_desc = self.get_order_is_desc(); - let order_exprs_field_type = self.get_order_exprs_field_type(); - let columns_len = order_is_desc.len(); - let eval_columns_lhs = self.get_eval_columns(columns_len); - let eval_columns_rhs = other.get_eval_columns(columns_len); - - for column_idx in 0..columns_len { - let lhs_node = &eval_columns_lhs[column_idx]; - let rhs_node = &eval_columns_rhs[column_idx]; - let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); - let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); - - // There is panic inside, but will never panic, since the data type of - // corresponding column should be consistent for each - // `HeapItemUnsafe`. - let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; - - if ord == Ordering::Equal { - continue; - } - if !order_is_desc[column_idx] { - return Ok(ord); - } else { - return Ok(ord.reverse()); - } - } - - Ok(Ordering::Equal) - } -} - -/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator -/// fails to parse. So make sure that it is valid before putting it into a heap. 
-impl Ord for HeapItemUnsafe { - fn cmp(&self, other: &Self) -> Ordering { - self.cmp_sort_key(other).unwrap() - } -} - -impl PartialOrd for HeapItemUnsafe { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for HeapItemUnsafe { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for HeapItemUnsafe {} - #[cfg(test)] mod tests { use futures::executor::block_on; diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index 6aa578459e2..ca05e49fcd3 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod hash_aggr_helper; #[cfg(test)] pub mod mock_executor; pub mod scan_executor; +pub mod top_n_heap; use tidb_query_common::Result; use tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalContext}; diff --git a/components/tidb_query_executors/src/util/top_n_heap.rs b/components/tidb_query_executors/src/util/top_n_heap.rs new file mode 100644 index 00000000000..0cbef103e4d --- /dev/null +++ b/components/tidb_query_executors/src/util/top_n_heap.rs @@ -0,0 +1,211 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; + +use tidb_query_common::Result; +use tidb_query_datatype::codec::{ + batch::{LazyBatchColumn, LazyBatchColumnVec}, + data_type::*, +}; +use tidb_query_expr::RpnStackNode; +use tipb::FieldType; + +/// TopNHeap is the common data structure used in TopN-like executors. +pub struct TopNHeap { + /// The maximum number of rows in the heap. + n: usize, + /// The heap. + heap: BinaryHeap, +} + +impl TopNHeap { + /// parameters: + /// - n: The maximum number of rows in the heaps + /// note: to avoid large N causing OOM, the initial capacity will be limited + /// up to 1024. 
+ pub fn new(n: usize) -> Self { + Self { + n, + // Avoid large N causing OOM + heap: BinaryHeap::with_capacity(n.min(1024)), + } + } + + pub fn add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { + if self.heap.len() < self.n { + // HeapItemUnsafe must be checked valid to compare in advance, or else it may + // panic inside BinaryHeap. + row.cmp_sort_key(&row)?; + + // Push into heap when heap is not full. + self.heap.push(row); + } else { + // Swap the greatest row in the heap if this row is smaller than that row. + let mut greatest_row = self.heap.peek_mut().unwrap(); + if row.cmp_sort_key(&greatest_row)? == Ordering::Less { + *greatest_row = row; + } + } + + Ok(()) + } + + #[allow(clippy::clone_on_copy)] + pub fn take_all(&mut self) -> LazyBatchColumnVec { + let heap = std::mem::take(&mut self.heap); + let sorted_items = heap.into_sorted_vec(); + if sorted_items.is_empty() { + return LazyBatchColumnVec::empty(); + } + + let mut result = sorted_items[0] + .source_data + .physical_columns + .clone_empty(sorted_items.len()); + + for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { + match result_column { + LazyBatchColumn::Raw(dest_column) => { + for item in &sorted_items { + let src = item.source_data.physical_columns[column_index].raw(); + dest_column + .push(&src[item.source_data.logical_rows[item.logical_row_index]]); + } + } + LazyBatchColumn::Decoded(dest_vector_value) => { + match_template::match_template! { + TT = [ + Int, + Real, + Duration, + Decimal, + DateTime, + Bytes => BytesRef, + Json => JsonRef, + Enum => EnumRef, + Set => SetRef, + ], + match dest_vector_value { + VectorValue::TT(dest_column) => { + for item in &sorted_items { + let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); + let src_ref = TT::borrow_vector_value(src); + // TODO: This clone is not necessary. 
+ dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); + } + }, + } + } + } + } + } + + result.assert_columns_equal_length(); + result + } +} + +pub struct HeapItemSourceData { + pub physical_columns: LazyBatchColumnVec, + pub logical_rows: Vec, +} + +/// The item in the heap of `BatchTopNExecutor`. +/// +/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is +/// valid (i.e. not dropped). Thus it is called unsafe. +pub struct HeapItemUnsafe { + /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. + pub order_is_desc_ptr: NonNull<[bool]>, + + /// A pointer to the `order_exprs_field_type` field in `order_exprs`. + pub order_exprs_field_type_ptr: NonNull<[FieldType]>, + + /// The source data that evaluated column in this structure is using. + pub source_data: Arc, + + /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. + pub eval_columns_buffer_ptr: NonNull>>, + + /// The begin offset of the evaluated columns stored in the buffer. + /// + /// The length of evaluated columns in the buffer is `order_is_desc.len()`. + pub eval_columns_offset: usize, + + /// Which logical row in the evaluated columns this heap item is + /// representing. + pub logical_row_index: usize, +} + +impl HeapItemUnsafe { + fn get_order_is_desc(&self) -> &[bool] { + unsafe { self.order_is_desc_ptr.as_ref() } + } + + fn get_order_exprs_field_type(&self) -> &[FieldType] { + unsafe { self.order_exprs_field_type_ptr.as_ref() } + } + + fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { + let offset_begin = self.eval_columns_offset; + let offset_end = offset_begin + len; + let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; + &vec_buf[offset_begin..offset_end] + } + + fn cmp_sort_key(&self, other: &Self) -> Result { + // Only debug assert because this function is called pretty frequently. 
+ debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); + + let order_is_desc = self.get_order_is_desc(); + let order_exprs_field_type = self.get_order_exprs_field_type(); + let columns_len = order_is_desc.len(); + let eval_columns_lhs = self.get_eval_columns(columns_len); + let eval_columns_rhs = other.get_eval_columns(columns_len); + + for column_idx in 0..columns_len { + let lhs_node = &eval_columns_lhs[column_idx]; + let rhs_node = &eval_columns_rhs[column_idx]; + let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); + let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); + + // There is panic inside, but will never panic, since the data type of + // corresponding column should be consistent for each + // `HeapItemUnsafe`. + let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; + + if ord == Ordering::Equal { + continue; + } + return if !order_is_desc[column_idx] { + Ok(ord) + } else { + Ok(ord.reverse()) + }; + } + + Ok(Ordering::Equal) + } +} + +/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator +/// fails to parse. So make sure that it is valid before putting it into a heap. +impl Ord for HeapItemUnsafe { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp_sort_key(other).unwrap() + } +} + +impl PartialOrd for HeapItemUnsafe { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for HeapItemUnsafe { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for HeapItemUnsafe {} From 0ce3485ca67eab8c9540a6931047478de67c48c0 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 30 Jan 2023 17:43:54 +0800 Subject: [PATCH 472/676] raftstore: allow the read request with a smaller ts during flashback (#14088) close tikv/tikv#14045 - Store the flashback `start_ts` in region meta. - Allow the read request with a smaller ts during flashback. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 4 +- components/raftstore/src/store/fsm/peer.rs | 10 ++- components/raftstore/src/store/util.rs | 24 ++++++- components/raftstore/src/store/worker/read.rs | 5 +- components/test_raftstore/src/util.rs | 30 ++++++--- components/tikv_kv/src/lib.rs | 7 ++- src/server/raftkv/mod.rs | 27 ++++---- src/server/raftkv2/mod.rs | 19 +++--- src/server/service/kv.rs | 9 ++- tests/integrations/server/kv_service.rs | 62 +++++++++++-------- 11 files changed, 129 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 95587f98565..46eac5930a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2729,7 +2729,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#009f31598ac3200dc8b32e18f96fc4deb7b32e48" +source = "git+https://github.com/pingcap/kvproto.git#1b2b4114103afb06796b7e44f45f7e55133673c0" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 22a42393173..bb262b9ffa8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1649,7 +1649,8 @@ where req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_req_region_epoch(req, &self.region, include_region)?; check_flashback_state( - self.region.get_is_in_flashback(), + self.region.is_in_flashback, + self.region.flashback_start_ts, req, self.region_id(), false, @@ -2975,6 +2976,7 @@ where // Modify the region meta in memory. let mut region = self.region.clone(); region.set_is_in_flashback(is_in_flashback); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); // Modify the `RegionLocalState` persisted in disk. 
write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { panic!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1cc603f2490..a8232fd8322 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5223,9 +5223,13 @@ where // the apply phase and because a read-only request doesn't need to be applied, // so it will be allowed during the flashback progress, for example, a snapshot // request. - if let Err(e) = - util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) - { + if let Err(e) = util::check_flashback_state( + self.region().is_in_flashback, + self.region().flashback_start_ts, + msg, + region_id, + true, + ) { match e { Error::FlashbackInProgress(_) => self .ctx diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 4d8128822c7..0344adb2b92 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -31,7 +31,9 @@ use raft::{ }; use raft_proto::ConfChangeI; use tikv_util::{ - box_err, debug, info, + box_err, + codec::number::{decode_u64, NumberEncoder}, + debug, info, store::{find_peer_by_id, region}, time::monotonic_raw_now, Either, @@ -336,6 +338,7 @@ pub fn compare_region_epoch( // flashback. pub fn check_flashback_state( is_in_flashback: bool, + flashback_start_ts: u64, req: &RaftCmdRequest, region_id: u64, skip_not_prepared: bool, @@ -347,11 +350,20 @@ pub fn check_flashback_state( { return Ok(()); } + // TODO: only use `flashback_start_ts` to check flashback state. + let is_in_flashback = is_in_flashback || flashback_start_ts > 0; let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::FLASHBACK); - // If the region is in the flashback state, the only allowed request is the - // flashback request itself. 
+ // If the region is in the flashback state: + // - A request with flashback flag will be allowed. + // - A read request whose `read_ts` is smaller than `flashback_start_ts` will + // be allowed. if is_in_flashback && !is_flashback_request { + if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + if read_ts != 0 && read_ts < flashback_start_ts { + return Ok(()); + } + } return Err(Error::FlashbackInProgress(region_id)); } // If the region is not in the flashback state, the flashback request itself @@ -362,6 +374,12 @@ pub fn check_flashback_state( Ok(()) } +pub fn encode_start_ts_into_flag_data(header: &mut RaftRequestHeader, start_ts: u64) { + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(start_ts).unwrap(); + header.set_flag_data(data.into()); +} + pub fn is_region_epoch_equal( from_epoch: &metapb::RegionEpoch, current_epoch: &metapb::RegionEpoch, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 6b20e375786..5d6835666b4 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -822,7 +822,10 @@ where // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; - if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id, false) { + let flashback_start_ts = delegate.region.flashback_start_ts; + if let Err(e) = + util::check_flashback_state(is_in_flashback, flashback_start_ts, req, region_id, false) + { TLS_LOCAL_READ_METRICS.with(|m| match e { Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 8b3745120d5..4bcb99adca3 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1246,15 +1246,9 @@ pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option { pub pb_ctx: &'a Context, pub read_id: Option, - // When start_ts is None and `stale_read` is true, it means acquire a snapshot without any - // consistency guarantee. + // When `start_ts` is None and `stale_read` is true, it means acquire a snapshot without any + // consistency guarantee. This filed is also used to check if a read is allowed in the + // flashback. pub start_ts: Option, // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. @@ -418,7 +419,7 @@ pub trait Engine: Send + Clone + 'static { /// Mark the start of flashback. // It's an infrequent API, use trait object for simplicity. 
- fn start_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + fn start_flashback(&self, _ctx: &Context, _start_ts: u64) -> BoxFuture<'static, Result<()>> { Box::pin(futures::future::ready(Ok(()))) } diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index c50c42c9fc6..751c07c6b65 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -44,14 +44,13 @@ use raftstore::{ errors::Error as RaftServerError, router::{LocalReadRouter, RaftStoreRouter}, store::{ - self, Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, - RegionSnapshot, StoreMsg, WriteResponse, + self, util::encode_start_ts_into_flag_data, Callback as StoreCallback, RaftCmdExtraOpts, + ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; use tikv_kv::{write_modifies, OnAppliedCb, WriteEvent}; use tikv_util::{ - codec::number::NumberEncoder, future::{paired_future_callback, paired_must_called_future_callback}, time::Instant, }; @@ -547,18 +546,21 @@ where let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. 
+ if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); @@ -637,13 +639,16 @@ where } } - fn start_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + fn start_flashback(&self, ctx: &Context, start_ts: u64) -> BoxFuture<'static, kv::Result<()>> { // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the // later flashback. Once invoked, we will update the persistent region meta and // the memory state of the flashback in Peer FSM to reject all read, write // and scheduling operations for this region when propose/apply before we // start the actual data flashback transaction command in the next phase. - let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + let mut req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + req.mut_admin_request() + .mut_prepare_flashback() + .set_start_ts(start_ts); exec_admin(&*self.router, req) } diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 526a1fab3ca..28f2a1d5d25 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -15,7 +15,7 @@ use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; use futures::{Future, Stream, StreamExt}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; pub use node::NodeV2; -use raftstore::store::RegionSnapshot; +use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -23,7 +23,7 @@ use raftstore_v2::{ SimpleWriteBinary, SimpleWriteEncoder, }; use tikv_kv::{Modify, WriteEvent}; -use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use tikv_util::time::Instant; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -153,18 +153,21 @@ impl tikv_kv::Engine for RaftKv2 { let 
mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. + if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index d42eb510891..da292eca17d 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1450,7 +1450,9 @@ fn future_prepare_flashback_to_version( ) -> impl Future> { let storage = storage.clone(); async move { - let f = storage.get_engine().start_flashback(req.get_context()); + let f = storage + .get_engine() + .start_flashback(req.get_context(), req.get_start_ts()); let mut res = f.await.map_err(storage::Error::from); if matches!(res, Ok(())) { // After the region is put into the flashback state, we need to do a special @@ -1488,10 +1490,7 @@ fn future_flashback_to_version( res = f.await.unwrap_or_else(|e| Err(box_err!(e))); } if matches!(res, Ok(())) { - // Only finish flashback when Flashback executed successfully. - fail_point!("skip_finish_flashback_to_version", |_| { - Ok(FlashbackToVersionResponse::default()) - }); + // Only finish when flashback executed successfully. 
let f = storage.get_engine().end_flashback(req.get_context()); res = f.await.map_err(storage::Error::from); } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 8709373b766..61a3fb39097 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -711,19 +711,17 @@ fn test_mvcc_flashback() { } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); - // Try to read. + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 1, 2); + // Try to read version 3 (after flashback, FORBIDDEN). let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get let mut get_req = GetRequest::default(); get_req.set_context(ctx.clone()); get_req.key = k.clone(); - get_req.version = 1; + get_req.version = 3; let get_resp = client.kv_get(&get_req).unwrap(); assert!(get_resp.get_region_error().has_flashback_in_progress()); assert!(!get_resp.has_error()); @@ -733,28 +731,48 @@ fn test_mvcc_flashback_block_rw() { scan_req.set_context(ctx.clone()); scan_req.start_key = k.clone(); scan_req.limit = 1; - scan_req.version = 1; + scan_req.version = 3; let scan_resp = client.kv_scan(&scan_req).unwrap(); assert!(scan_resp.get_region_error().has_flashback_in_progress()); + assert!(!scan_resp.has_error()); assert!(scan_resp.pairs.is_empty()); - // Try to write. + // Try to read version 1 (before flashback, ALLOWED). 
+ // Get + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k.clone(); + get_req.version = 1; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert!(get_resp.value.is_empty()); + // Scan + let mut scan_req = ScanRequest::default(); + scan_req.set_context(ctx.clone()); + scan_req.start_key = k.clone(); + scan_req.limit = 1; + scan_req.version = 1; + let scan_resp = client.kv_scan(&scan_req).unwrap(); + assert!(!scan_resp.has_region_error()); + assert!(!scan_resp.has_error()); + assert!(scan_resp.pairs.is_empty()); + // Try to write (FORBIDDEN). // Prewrite let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(k.clone()); mutation.set_value(v); - let prewrite_resp = try_kv_prewrite(&client, ctx, vec![mutation], k, 1); + let prewrite_resp = try_kv_prewrite(&client, ctx.clone(), vec![mutation], k, 1); assert!(prewrite_resp.get_region_error().has_flashback_in_progress()); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback. + must_finish_flashback(&client, ctx, 1, 2, 3); } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx, 0, 1, 2); + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 0, 1); // Try to transfer leader. let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); assert!( @@ -763,7 +781,8 @@ fn test_mvcc_flashback_block_scheduling() { .get_error() .has_flashback_in_progress() ); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback. 
+ must_finish_flashback(&client, ctx, 0, 1, 2); } #[test] @@ -794,16 +813,7 @@ fn test_mvcc_flashback_unprepared() { assert!(!get_resp.has_error()); assert_eq!(get_resp.value, b"".to_vec()); // Mock the flashback retry. - let mut req = FlashbackToVersionRequest::default(); - req.set_context(ctx); - req.set_start_ts(6); - req.set_commit_ts(7); - req.version = 0; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); - let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(resp.get_error().is_empty()); + must_finish_flashback(&client, ctx.clone(), 0, 6, 7); let get_resp = client.kv_get(&get_req).unwrap(); assert!(!get_resp.has_region_error()); assert!(!get_resp.has_error()); @@ -811,7 +821,7 @@ fn test_mvcc_flashback_unprepared() { } #[test] -fn test_mvcc_flashback_with_unlimit_range() { +fn test_mvcc_flashback_with_unlimited_range() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; From 7ec73fdd440a1d81e2a5a8c62aa9e31828959903 Mon Sep 17 00:00:00 2001 From: iosmanthus Date: Tue, 31 Jan 2023 10:41:55 +0800 Subject: [PATCH 473/676] import: sst_importer support download SST and rewrite into keyspace data. (#14046) ref tikv/tikv#12999 import: sst_importer support download SST and rewrite into keyspace data. 
Signed-off-by: iosmanthus --- Cargo.lock | 1 + components/keys/Cargo.toml | 1 + components/keys/src/rewrite.rs | 10 +++ components/sst_importer/src/import_file.rs | 3 +- components/sst_importer/src/sst_importer.rs | 90 ++++++++++++++++----- components/txn_types/src/types.rs | 10 +++ src/import/sst_service.rs | 4 +- 7 files changed, 95 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 46eac5930a1..f2ce2ba4ce1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2724,6 +2724,7 @@ dependencies = [ "panic_hook", "thiserror", "tikv_alloc", + "tikv_util", ] [[package]] diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index 5f2bf5935ee..b5a6412d00a 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -10,6 +10,7 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] panic_hook = { workspace = true } diff --git a/components/keys/src/rewrite.rs b/components/keys/src/rewrite.rs index 51f588e9732..68541bb50e0 100644 --- a/components/keys/src/rewrite.rs +++ b/components/keys/src/rewrite.rs @@ -6,11 +6,21 @@ use std::ops::Bound::{self, *}; +use tikv_util::codec::bytes::encode_bytes; + /// An error indicating the key cannot be rewritten because it does not start /// with the given prefix. #[derive(PartialEq, Debug, Clone)] pub struct WrongPrefix; +pub fn encode_bound(bound: Bound>) -> Bound> { + match bound { + Included(k) => Included(encode_bytes(&k)), + Excluded(k) => Excluded(encode_bytes(&k)), + Unbounded => Unbounded, + } +} + /// Rewrites the prefix of a byte array. 
pub fn rewrite_prefix( old_prefix: &[u8], diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index f766729a066..84d2f67bbab 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -15,6 +15,7 @@ use engine_traits::{ iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; +use keys::data_key; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; use uuid::{Builder as UuidBuilder, Uuid}; @@ -336,7 +337,7 @@ impl ImportDir { let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; for &(start, end) in TIDB_RANGES_COMPLEMENT { - let opt = iter_option(start, end, false); + let opt = iter_option(&data_key(start), &data_key(end), false); let mut iter = sst_reader.iter(opt)?; if iter.seek(start)? { error!( diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 8b6d64f483f..fabe9e2a13a 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -32,7 +32,10 @@ use kvproto::{ kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + codec::{ + bytes::{decode_bytes_in_place, encode_bytes}, + stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + }, config::ReadableSize, stream::block_on_external_io, sys::SysQuota, @@ -53,13 +56,18 @@ use crate::{ #[derive(Default, Debug, Clone)] pub struct DownloadExt<'a> { cache_key: Option<&'a str>, + req_type: DownloadRequestType, } impl<'a> DownloadExt<'a> { - pub fn cache_key(self, key: &'a str) -> Self { - Self { - cache_key: Some(key), - } + pub fn cache_key(mut self, key: &'a str) -> Self { + self.cache_key = Some(key); + self + } + + pub fn req_type(mut self, req_type: DownloadRequestType) -> Self { + 
self.req_type = req_type; + self } } @@ -896,16 +904,20 @@ impl SstImporter { let sst_reader = RocksSstReader::open_with_env(dst_file_name, Some(env))?; sst_reader.verify_checksum()?; + // undo key rewrite so we could compare with the keys inside SST + let old_prefix = rewrite_rule.get_old_key_prefix(); + let new_prefix = rewrite_rule.get_new_key_prefix(); + let req_type = ext.req_type; + debug!("downloaded file and verified"; "meta" => ?meta, "name" => name, "path" => dst_file_name, + "old_prefix" => log_wrappers::Value::key(old_prefix), + "new_prefix" => log_wrappers::Value::key(new_prefix), + "req_type" => ?req_type, ); - // undo key rewrite so we could compare with the keys inside SST - let old_prefix = rewrite_rule.get_old_key_prefix(); - let new_prefix = rewrite_rule.get_new_key_prefix(); - let range_start = meta.get_range().get_start(); let range_end = meta.get_range().get_end(); let range_start_bound = key_to_bound(range_start); @@ -915,14 +927,14 @@ impl SstImporter { key_to_bound(range_end) }; - let range_start = + let mut range_start = keys::rewrite::rewrite_prefix_of_start_bound(new_prefix, old_prefix, range_start_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST start range", key: range_start.to_vec(), prefix: new_prefix.to_vec(), })?; - let range_end = + let mut range_end = keys::rewrite::rewrite_prefix_of_end_bound(new_prefix, old_prefix, range_end_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST end range", @@ -930,6 +942,11 @@ impl SstImporter { prefix: new_prefix.to_vec(), })?; + if req_type == DownloadRequestType::Keyspace { + range_start = keys::rewrite::encode_bound(range_start); + range_end = keys::rewrite::encode_bound(range_end); + } + let start_rename_rewrite = Instant::now(); // read the first and last keys from the SST, determine if we could // simply move the entire SST instead of iterating and generate a new one. @@ -942,9 +959,15 @@ impl SstImporter { return Ok(None); } if !iter.seek_to_first()? 
{ + let mut range = meta.get_range().clone(); + if req_type == DownloadRequestType::Keyspace { + *range.mut_start() = encode_bytes(&range.take_start()); + *range.mut_end() = encode_bytes(&range.take_end()); + } // the SST is empty, so no need to iterate at all (should be impossible?) - return Ok(Some(meta.get_range().clone())); + return Ok(Some(range)); } + let start_key = keys::origin_key(iter.key()); if is_before_start_bound(start_key, &range_start) { // SST's start is before the range to consume, so needs to iterate to skip over @@ -995,8 +1018,10 @@ impl SstImporter { } // perform iteration and key rewrite. - let mut key = keys::data_key(new_prefix); - let new_prefix_data_key_len = key.len(); + let mut data_key = keys::DATA_PREFIX_KEY.to_vec(); + let data_key_prefix_len = keys::DATA_PREFIX_KEY.len(); + let mut user_key = new_prefix.to_vec(); + let user_key_prefix_len = new_prefix.len(); let mut first_key = None; match range_start { @@ -1016,10 +1041,22 @@ impl SstImporter { .unwrap(); while iter.valid()? 
{ - let old_key = keys::origin_key(iter.key()); - if is_after_end_bound(old_key, &range_end) { + let mut old_key = Cow::Borrowed(keys::origin_key(iter.key())); + let mut ts = None; + + if is_after_end_bound(old_key.as_ref(), &range_end) { break; } + + if req_type == DownloadRequestType::Keyspace { + ts = Some(Key::decode_ts_bytes_from(old_key.as_ref())?.to_owned()); + old_key = { + let mut key = old_key.to_vec(); + decode_bytes_in_place(&mut key, false)?; + Cow::Owned(key) + }; + } + if !old_key.starts_with(old_prefix) { return Err(Error::WrongKeyPrefix { what: "Key in SST", @@ -1027,12 +1064,21 @@ impl SstImporter { prefix: old_prefix.to_vec(), }); } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); + + data_key.truncate(data_key_prefix_len); + user_key.truncate(user_key_prefix_len); + user_key.extend_from_slice(&old_key[old_prefix.len()..]); + if req_type == DownloadRequestType::Keyspace { + data_key.extend(encode_bytes(&user_key)); + data_key.extend(ts.unwrap()); + } else { + data_key.extend_from_slice(&user_key); + } + let mut value = Cow::Borrowed(iter.value()); if rewrite_rule.new_timestamp != 0 { - key = Key::from_encoded(key) + data_key = Key::from_encoded(data_key) .truncate_ts() .map_err(|e| { Error::BadFormat(format!( @@ -1056,10 +1102,10 @@ impl SstImporter { } } - sst_writer.put(&key, &value)?; + sst_writer.put(&data_key, &value)?; iter.next()?; if first_key.is_none() { - first_key = Some(keys::origin_key(&key).to_vec()); + first_key = Some(keys::origin_key(&data_key).to_vec()); } } @@ -1078,7 +1124,7 @@ impl SstImporter { let mut final_range = Range::default(); final_range.set_start(start_key); - final_range.set_end(keys::origin_key(&key).to_vec()); + final_range.set_end(keys::origin_key(&data_key).to_vec()); Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. 
diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 60e64bf444a..15779df426a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -192,6 +192,16 @@ impl Key { Ok(number::decode_u64_desc(&mut ts)?.into()) } + /// Decode the timestamp from a ts encoded key and return in bytes. + #[inline] + pub fn decode_ts_bytes_from(key: &[u8]) -> Result<&[u8], codec::Error> { + let len = key.len(); + if len < number::U64_SIZE { + return Err(codec::Error::KeyLength); + } + Ok(&key[key.len() - number::U64_SIZE..]) + } + /// Whether the user key part of a ts encoded key `ts_encoded_key` equals to /// the encoded user key `user_key`. /// diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index ea52cad0095..08eabe32f0c 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -783,7 +783,9 @@ where cipher, limiter, engine, - DownloadExt::default().cache_key(req.get_storage_cache_id()), + DownloadExt::default() + .cache_key(req.get_storage_cache_id()) + .req_type(req.get_request_type()), ); let mut resp = DownloadResponse::default(); match res.await { From ec2f4dc5420dbdab05ea47ff1724a54e765cdca4 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 30 Jan 2023 21:47:54 -0800 Subject: [PATCH 474/676] rocksdb: reduce rocksdb block size to 16KB (#14053) close tikv/tikv#14052 The writecf and defaultcf's default block size is changed to 16KB to improve read performance (reduce read amplification) Signed-off-by: qi.xu Co-authored-by: qi.xu --- etc/config-template.toml | 4 ++-- src/config/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 62623afed0e..59152570da1 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -679,7 +679,7 @@ ## The data block size. RocksDB compresses data based on the unit of block. 
## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. -# block-size = "64KB" +# block-size = "16KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive @@ -915,7 +915,7 @@ [rocksdb.writecf] ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] -# block-size = "64KB" +# block-size = "16KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size = "128MB" diff --git a/src/config/mod.rs b/src/config/mod.rs index 99b593e2443..0a32c99f422 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -631,7 +631,7 @@ impl Default for DefaultCfConfig { let total_mem = SysQuota::memory_limit_in_bytes(); DefaultCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_DEFAULT, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, @@ -756,7 +756,7 @@ impl Default for WriteCfConfig { }; WriteCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_WRITE, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, From 15d6040c68eb0f2edf6b9304aebf69092657f8a4 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 30 Jan 2023 22:01:54 -0800 Subject: [PATCH 475/676] storage: add an alias partitioned-raft-kv for RaftKv2 (#14083) ref tikv/tikv#12842 add an alias partitioned-raft-kv for RaftKv2 Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: Ti Chi Robot --- src/config/mod.rs | 2 +- src/storage/config.rs | 1 + tests/integrations/config/test-custom.toml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) 
diff --git a/src/config/mod.rs b/src/config/mod.rs index 0a32c99f422..7e006ef2eed 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3135,7 +3135,7 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); if !self.raft_engine.enable { - panic!("raft-kv2 only supports raft log engine."); + panic!("partitioned-raft-kv only supports raft log engine."); } } diff --git a/src/storage/config.rs b/src/storage/config.rs index 68d739c1639..d74bd721104 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -35,6 +35,7 @@ const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; #[serde(rename_all = "kebab-case")] pub enum EngineType { RaftKv, + #[serde(alias = "partitioned-raft-kv")] RaftKv2, } diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index b096437e60c..d79ec7899e2 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -90,7 +90,7 @@ a = "b" [storage] data-dir = "/var" -engine = "raft-kv2" +engine = "partitioned-raft-kv" gc-ratio-threshold = 1.2 max-key-size = 4096 scheduler-concurrency = 123 From 23a228824cb0e82cc495edb28c3276774f97aead Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 31 Jan 2023 14:35:54 +0800 Subject: [PATCH 476/676] resolved_ts: reduce network traffic by filter regions (#14098) close tikv/tikv#14092 resolved_ts: reduce network traffic by filter regions Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 2 +- components/resolved_ts/src/advance.rs | 128 +++++++++++++++++++++++++- src/config/mod.rs | 2 +- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 6d64754d042..2b4eb9ff226 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1543,7 +1543,7 @@ mod tests { } let diff = 
cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.min_ts_interval, ReadableDuration::millis(200)); + assert_eq!(ep.config.min_ts_interval, ReadableDuration::secs(1)); assert_eq!(ep.config.hibernate_regions_compatible, true); { diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index a78e903bc72..fd58fac1601 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -149,6 +149,7 @@ pub struct LeadershipResolver { region_map: HashMap>, // region_id -> peers id, record the responses. resp_map: HashMap>, + checking_regions: HashSet, valid_regions: HashSet, gc_interval: Duration, @@ -176,6 +177,7 @@ impl LeadershipResolver { region_map: HashMap::default(), resp_map: HashMap::default(), valid_regions: HashSet::default(), + checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), gc_interval, } @@ -188,6 +190,7 @@ impl LeadershipResolver { self.region_map = HashMap::default(); self.resp_map = HashMap::default(); self.valid_regions = HashSet::default(); + self.checking_regions = HashSet::default(); self.last_gc_time = now; } } @@ -203,6 +206,7 @@ impl LeadershipResolver { for v in self.resp_map.values_mut() { v.clear(); } + self.checking_regions.clear(); self.valid_regions.clear(); } @@ -248,7 +252,11 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its // leadership. - pub async fn resolve(&mut self, _regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + if regions.is_empty() { + return regions; + } + // Clear previous result before resolving. self.clear(); // GC when necessary to prevent memory leak. 
@@ -256,15 +264,22 @@ impl LeadershipResolver { PENDING_RTS_COUNT.inc(); defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| _regions.clone()); + fail_point!("before_sync_replica_read_state", |_| regions.clone()); let store_id = self.store_id; let valid_regions = &mut self.valid_regions; let region_map = &mut self.region_map; let resp_map = &mut self.resp_map; let store_req_map = &mut self.store_req_map; + let checking_regions = &mut self.checking_regions; + for region_id in ®ions { + checking_regions.insert(*region_id); + } self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { + if !checking_regions.contains(region_id) { + continue; + } let core = read_progress.get_core(); let local_leader_info = core.get_local_leader_info(); let leader_id = local_leader_info.get_leader_id(); @@ -512,3 +527,112 @@ async fn get_tikv_client( RTS_TIKV_CLIENT_INIT_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); Ok(cli) } + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }, + time::Duration, + }; + + use grpcio::{self, ChannelBuilder, EnvBuilder, Server, ServerBuilder}; + use kvproto::{metapb::Region, tikvpb::Tikv, tikvpb_grpc::create_tikv}; + use pd_client::PdClient; + use raftstore::store::util::RegionReadProgress; + use tikv_util::store::new_peer; + + use super::*; + + #[derive(Clone)] + struct MockTikv { + req_tx: Sender, + } + + impl Tikv for MockTikv { + fn check_leader( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: CheckLeaderRequest, + sink: ::grpcio::UnarySink, + ) { + self.req_tx.send(req).unwrap(); + ctx.spawn(async { + sink.success(CheckLeaderResponse::default()).await.unwrap(); + }) + } + } + + struct MockPdClient {} + impl PdClient for MockPdClient {} + + fn new_rpc_suite(env: Arc) -> (Server, TikvClient, Receiver) { + let (tx, rx) = channel(); + let tikv_service = MockTikv { req_tx: tx }; + let builder = 
ServerBuilder::new(env.clone()).register_service(create_tikv(tikv_service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(env).connect(&addr); + let client = TikvClient::new(channel); + (server, client, rx) + } + + #[tokio::test] + async fn test_resolve_leader_request_size() { + let env = Arc::new(EnvBuilder::new().build()); + let (mut server, tikv_client, rx) = new_rpc_suite(env.clone()); + + let mut region1 = Region::default(); + region1.id = 1; + region1.peers.push(new_peer(1, 1)); + region1.peers.push(new_peer(2, 11)); + let progress1 = RegionReadProgress::new(®ion1, 1, 1, 1); + progress1.update_leader_info(1, 1, ®ion1); + + let mut region2 = Region::default(); + region2.id = 2; + region2.peers.push(new_peer(1, 2)); + region2.peers.push(new_peer(2, 22)); + let progress2 = RegionReadProgress::new(®ion2, 1, 1, 2); + progress2.update_leader_info(2, 2, ®ion2); + + let mut leader_resolver = LeadershipResolver::new( + 1, // store id + Arc::new(MockPdClient {}), + env.clone(), + Arc::new(SecurityManager::default()), + RegionReadProgressRegistry::new(), + Duration::from_secs(1), + ); + leader_resolver + .tikv_clients + .lock() + .await + .insert(2 /* store id */, tikv_client); + leader_resolver + .region_read_progress + .insert(1, Arc::new(progress1)); + leader_resolver + .region_read_progress + .insert(2, Arc::new(progress2)); + + leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 2); + + // Checking one region only send 1 region in request. + leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 1); + + // Checking zero region does not send request. 
+ leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + + let _ = server.shutdown().await; + } +} diff --git a/src/config/mod.rs b/src/config/mod.rs index 7e006ef2eed..3274d5442df 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2653,7 +2653,7 @@ pub struct CdcConfig { impl Default for CdcConfig { fn default() -> Self { Self { - min_ts_interval: ReadableDuration::millis(200), + min_ts_interval: ReadableDuration::secs(1), hibernate_regions_compatible: true, // 4 threads for incremental scan. incremental_scan_threads: 4, From a33eb2d08991f278785e8b3047c643bf07839bce Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 31 Jan 2023 14:59:54 +0800 Subject: [PATCH 477/676] raftstore-v2: fix peer not cleanup when it replicates more logs (#14101) ref tikv/tikv#12842 If it accepts more logs than conf remove itself, applied_index == commit_index will never be true. So we should check if it's a tombstone already first. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/store.rs | 4 + components/raftstore-v2/src/operation/life.rs | 7 +- .../raftstore-v2/src/operation/ready/mod.rs | 26 +++--- .../src/operation/ready/snapshot.rs | 17 +++- components/raftstore-v2/src/raft/storage.rs | 7 +- components/raftstore-v2/src/router/message.rs | 10 ++- .../tests/integrations/cluster.rs | 13 ++- .../tests/integrations/test_conf_change.rs | 80 ++++++++++++++++++- .../raftstore/src/store/async_io/read.rs | 6 +- 9 files changed, 144 insertions(+), 26 deletions(-) diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 86e3540d23c..17c0a9a50f9 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -266,6 +266,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .fsm .store .on_store_unreachable(self.store_ctx, to_store_id), + #[cfg(feature = "testexport")] + 
StoreMsg::WaitFlush { region_id, ch } => { + self.fsm.store.on_wait_flush(self.store_ctx, region_id, ch) + } } } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 88646f06b59..3a9f678bd8c 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -284,8 +284,11 @@ impl Peer { #[inline] pub fn postponed_destroy(&self) -> bool { let entry_storage = self.storage().entry_storage(); - // TODO: check actual split index instead of commit index. - entry_storage.applied_index() != entry_storage.commit_index() + // If it's marked as tombstone, then it must be changed by conf change. In + // this case, all following entries are skipped so applied_index never equals + // to commit_index. + (self.storage().region_state().get_state() != PeerState::Tombstone + && entry_storage.applied_index() != entry_storage.commit_index()) // Wait for critical commands like split. || self.has_pending_tombstone_tablets() } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 38d126ac87a..e7c32e742ec 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -52,7 +52,7 @@ use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, - router::{ApplyTask, PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick}, worker::tablet_gc, }; @@ -70,6 +70,19 @@ impl Store { ctx.router .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); } + + #[cfg(feature = "testexport")] + pub fn on_wait_flush( + &mut self, + ctx: &mut StoreContext, + region_id: u64, + ch: crate::router::FlushChannel, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let _ = ctx.router.send(region_id, PeerMsg::WaitFlush(ch)); + } } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -455,6 +468,7 @@ impl Peer { && 
!self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { + self.maybe_schedule_gen_snapshot(); #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -501,15 +515,7 @@ impl Peer { self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); } - // Check whether there is a pending generate snapshot task, the task - // needs to be sent to the apply system. - // Always sending snapshot task after apply task, so it gets latest - // snapshot. - if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { - self.apply_scheduler() - .unwrap() - .send(ApplyTask::Snapshot(gen_task)); - } + self.maybe_schedule_gen_snapshot(); let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index bcbe220252b..1fae813577c 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -41,13 +41,14 @@ use raftstore::{ RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; -use slog::{error, info, warn}; +use slog::{debug, error, info, warn}; use tikv_util::{box_err, log::SlogFormat, slog_panic}; use crate::{ fsm::ApplyResReporter, operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, + router::ApplyTask, Result, StoreContext, }; @@ -161,6 +162,19 @@ pub fn install_tablet( } impl Peer { + /// Check whether there is a pending generate snapshot task, the task + /// needs to be sent to the apply system. + /// Always sending snapshot task after apply task, so it gets latest + /// snapshot. 
+ #[inline] + pub fn maybe_schedule_gen_snapshot(&mut self) { + if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); + } + } + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { self.raft_group_mut().ping(); @@ -270,6 +284,7 @@ impl Apply { /// Will schedule a task to read worker and then generate a snapshot /// asynchronously. pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + debug!(self.logger, "scheduling snapshot"; "task" => ?snap_task); // Do not generate, the peer is removed. if self.tombstone() { snap_task.canceled.store(true, Ordering::SeqCst); diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 1d1f53f9c53..ce15ac20621 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -9,7 +9,7 @@ use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb, - raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{ConfState, Entry, Snapshot}, @@ -234,10 +234,7 @@ impl Storage { #[inline] pub fn tablet_index(&self) -> u64 { - match self.region_state.get_state() { - PeerState::Tombstone | PeerState::Applying => 0, - _ => self.region_state.get_tablet_index(), - } + self.region_state.get_tablet_index() } #[inline] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a9353e171d9..8814a97cc5f 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -260,7 +260,15 @@ pub enum StoreMsg { SplitInit(Box), Tick(StoreTick), Start, - StoreUnreachable { to_store_id: u64 }, + StoreUnreachable { + to_store_id: u64, + }, + /// A message that 
used to check if a flush is happened. + #[cfg(feature = "testexport")] + WaitFlush { + region_id: u64, + ch: super::FlushChannel, + }, } impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 90f7c500903..2076272b44b 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -39,7 +39,7 @@ use raftstore::{ }; use raftstore_v2::{ create_store_batch_system, - router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, + router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter, StoreMsg}, Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; use resource_metering::CollectorRegHandle; @@ -127,7 +127,16 @@ impl TestRouter { let res = self.send(region_id, PeerMsg::WaitFlush(ch)); match res { Ok(_) => return block_on(sub.result()).is_some(), - Err(TrySendError::Disconnected(_)) => return false, + Err(TrySendError::Disconnected(m)) => { + let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; + match self + .store_router() + .send_control(StoreMsg::WaitFlush { region_id, ch }) + { + Ok(_) => return block_on(sub.result()).is_some(), + Err(_) => return false, + } + } Err(TrySendError::Full(_)) => thread::sleep(Duration::from_millis(10)), } } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 8a075bb9a35..4b3445a00ad 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,8 +2,9 @@ use std::{self, time::Duration}; -use engine_traits::{Peekable, CF_DEFAULT}; -use kvproto::raft_cmdpb::AdminCmdType; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::PeerState}; use 
raft::prelude::ConfChangeType; use raftstore_v2::{ router::{PeerMsg, PeerTick}, @@ -102,3 +103,78 @@ fn test_simple_change() { let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); check_skip_wal(cached.latest().unwrap().as_inner().path()); } + +/// Test if a peer can be destroyed by conf change if logs after conf change are +/// also replicated. +#[test] +fn test_remove_by_conf_change() { + let cluster = Cluster::with_node_count(2, None); + let region_id = 2; + let mut req = cluster.routers[0].new_request_for(2); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + // So heartbeat will create a learner. + cluster.dispatch(2, vec![]); + // Trigger the raft tick to replica the log to the learner and execute the + // snapshot task. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + // Wait some time so snapshot can be generated. + std::thread::sleep(Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + + // write one kv to make flow control replicated. 
+ let (key, val) = (b"key", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + cluster.dispatch(region_id, vec![]); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let (admin_msg, admin_sub) = PeerMsg::admin_command(req.clone()); + // write one kv after removal + let (key, val) = (b"key1", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + // Send them at the same time so they will be all sent to learner. + cluster.routers[0].send(region_id, admin_msg).unwrap(); + cluster.routers[0].send(region_id, msg).unwrap(); + let resp = block_on(admin_sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Dispatch messages so the learner will receive conf remove and write at the + // same time. + cluster.dispatch(region_id, vec![]); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + // Wait for apply. 
+ std::thread::sleep(Duration::from_millis(100)); + let raft_engine = &cluster.node(1).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index b298ed3529e..45492feb294 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -227,10 +227,10 @@ where error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); SNAP_COUNTER.generate.fail.inc(); } else { + let elapsed = start.saturating_elapsed_secs(); SNAP_COUNTER.generate.success.inc(); - SNAP_HISTOGRAM - .generate - .observe(start.saturating_elapsed_secs()); + SNAP_HISTOGRAM.generate.observe(elapsed); + info!("snapshot generated"; "region_id" => region_id, "elapsed" => elapsed, "key" => ?snap_key, "for_balance" => for_balance); res = Some(Box::new((snapshot, to_peer))) } From 9c0df6d68c72d30021b36d24275fdceca9864235 Mon Sep 17 00:00:00 2001 From: you06 Date: Wed, 1 Feb 2023 15:43:55 +0800 Subject: [PATCH 478/676] cop: handle unset scan details in store batch (#14102) close tikv/tikv#14109 Signed-off-by: you06 --- src/coprocessor/endpoint.rs | 2 + tests/integrations/coprocessor/test_select.rs | 60 +++++++++++-------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index b9d01419a49..6ac1bebc541 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -600,6 +600,8 @@ impl Endpoint { response.set_locked(lock_info); } response.set_other_error(resp.take_other_error()); + // keep the exec details already generated. 
+ response.set_exec_details_v2(resp.take_exec_details_v2()); GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { tracker.write_scan_detail( response.mut_exec_details_v2().mut_scan_detail_v2(), diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index ad195f62774..056f24b5fee 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -4,11 +4,10 @@ use std::{cmp, thread, time::Duration}; use engine_traits::CF_LOCK; use kvproto::{ - coprocessor::{Request, Response, StoreBatchTask}, - errorpb, - kvrpcpb::{Context, IsolationLevel, LockInfo}, + coprocessor::{Request, Response, StoreBatchTask, StoreBatchTaskResponse}, + kvrpcpb::{Context, IsolationLevel}, }; -use protobuf::{Message, SingularPtrField}; +use protobuf::Message; use raftstore::store::Bucket; use test_coprocessor::*; use test_raftstore::{Cluster, ServerCluster}; @@ -2151,11 +2150,14 @@ fn test_batch_request() { } req }; - let verify_response = |result: &QueryResult, - data: &[u8], - region_err: &SingularPtrField, - locked: &SingularPtrField, - other_err: &String| { + let verify_response = |result: &QueryResult, resp: &Response| { + let (data, details, region_err, locked, other_err) = ( + resp.get_data(), + resp.get_exec_details_v2(), + &resp.region_error, + &resp.locked, + &resp.other_error, + ); match result { QueryResult::Valid(res) => { let expected_len = res.len(); @@ -2179,6 +2181,12 @@ fn test_batch_request() { assert!(region_err.is_none()); assert!(locked.is_none()); assert!(other_err.is_empty()); + let scan_details = details.get_scan_detail_v2(); + assert_eq!(scan_details.processed_versions, row_count as u64); + if row_count > 0 { + assert!(scan_details.processed_versions_size > 0); + assert!(scan_details.total_versions > 0); + } } QueryResult::ErrRegion => { assert!(region_err.is_some()); @@ -2198,6 +2206,20 @@ fn test_batch_request() { } }; + let batch_resp_2_resp = |batch_resp: &mut 
StoreBatchTaskResponse| -> Response { + let mut response = Response::default(); + response.set_data(batch_resp.take_data()); + if let Some(err) = batch_resp.region_error.take() { + response.set_region_error(err); + } + if let Some(lock_info) = batch_resp.locked.take() { + response.set_locked(lock_info); + } + response.set_other_error(batch_resp.take_other_error()); + response.set_exec_details_v2(batch_resp.take_exec_details_v2()); + response + }; + for (ranges, results, invalid_epoch, key_is_locked) in cases.iter() { let mut req = prepare_req(&mut cluster, ranges); if *invalid_epoch { @@ -2229,25 +2251,13 @@ fn test_batch_request() { } } let mut resp = handle_request(&endpoint, req); - let batch_results = resp.take_batch_responses().to_vec(); + let mut batch_results = resp.take_batch_responses().to_vec(); for (i, result) in results.iter().enumerate() { if i == 0 { - verify_response( - result, - resp.get_data(), - &resp.region_error, - &resp.locked, - &resp.other_error, - ); + verify_response(result, &resp); } else { - let batch_resp = batch_results.get(i - 1).unwrap(); - verify_response( - result, - batch_resp.get_data(), - &batch_resp.region_error, - &batch_resp.locked, - &batch_resp.other_error, - ); + let batch_resp = batch_results.get_mut(i - 1).unwrap(); + verify_response(result, &batch_resp_2_resp(batch_resp)); }; } if *key_is_locked { From db14c53267ebf815d6a8ae12036bd5e20326f7ee Mon Sep 17 00:00:00 2001 From: ShuNing Date: Thu, 2 Feb 2023 10:23:55 +0800 Subject: [PATCH 479/676] resource_control: unify wru/rru to ru (#14121) close tikv/tikv#14120 resource_control: unify wru/rru to ru Signed-off-by: nolouch --- Cargo.lock | 2 +- .../resource_control/src/resource_group.rs | 43 +++++++++---------- components/resource_control/src/service.rs | 14 +++--- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2ce2ba4ce1..1747e74fafa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2730,7 +2730,7 @@ dependencies = [ [[package]] 
name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#1b2b4114103afb06796b7e44f45f7e55133673c0" +source = "git+https://github.com/pingcap/kvproto.git#a7c51106dfe70ebf59221018b50d1ec6ad25da74" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 1524ebcba5d..a0abfb11464 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -41,14 +41,10 @@ pub struct ResourceGroupManager { impl ResourceGroupManager { fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { match (rg.get_mode(), is_read) { - (GroupMode::RuMode, true) => rg + // RU mode, read and write use the same setting. + (GroupMode::RuMode, _) => rg .get_r_u_settings() - .get_r_r_u() - .get_settings() - .get_fill_rate(), - (GroupMode::RuMode, false) => rg - .get_r_u_settings() - .get_w_r_u() + .get_r_u() .get_settings() .get_fill_rate(), // TODO: currently we only consider the cpu usage in the read path, we may also take @@ -311,6 +307,10 @@ pub(crate) mod tests { use super::*; + pub fn new_resource_group_ru(name: String, ru: u64) -> ResourceGroup { + new_resource_group(name, true, ru, ru) + } + pub fn new_resource_group( name: String, is_ru_mode: bool, @@ -328,15 +328,12 @@ pub(crate) mod tests { }; group.set_mode(mode); if is_ru_mode { + assert!(read_tokens == write_tokens); let mut ru_setting = GroupRequestUnitSettings::new(); ru_setting - .mut_r_r_u() + .mut_r_u() .mut_settings() .set_fill_rate(read_tokens); - ru_setting - .mut_w_r_u() - .mut_settings() - .set_fill_rate(write_tokens); group.set_r_u_settings(ru_setting); } else { let mut resource_setting = GroupRawResourceSettings::new(); @@ -357,7 +354,7 @@ pub(crate) mod tests { fn test_resource_group() { let resource_manager = ResourceGroupManager::default(); - let group1 = new_resource_group("TEST".into(), true, 100, 100); + let group1 = 
new_resource_group_ru("TEST".into(), 100); resource_manager.add_resource_group(group1); assert!(resource_manager.get_resource_group("test1").is_none()); @@ -367,7 +364,7 @@ pub(crate) mod tests { group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 100 @@ -375,14 +372,14 @@ pub(crate) mod tests { drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group1 = new_resource_group("Test".into(), true, 200, 100); + let group1 = new_resource_group_ru("Test".into(), 200); resource_manager.add_resource_group(group1); let group = resource_manager.get_resource_group("test").unwrap(); assert_eq!( group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 200 @@ -390,7 +387,7 @@ pub(crate) mod tests { drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group2 = new_resource_group("test2".into(), true, 400, 200); + let group2 = new_resource_group_ru("test2".into(), 400); resource_manager.add_resource_group(group2); assert_eq!(resource_manager.resource_groups.len(), 2); @@ -451,7 +448,7 @@ pub(crate) mod tests { drop(group2); // test add 1 new resource group - let new_group = new_resource_group("new_group".into(), true, 500, 500); + let new_group = new_resource_group_ru("new_group".into(), 500); resource_manager.add_resource_group(new_group); assert_eq!(resource_ctl.resource_consumptions.len(), 4); @@ -466,29 +463,29 @@ pub(crate) mod tests { let resource_ctl = resource_manager.derive_controller("test_read".into(), true); let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); - let group1 = new_resource_group("test1".into(), true, 5000, 1000); + let group1 = new_resource_group_ru("test1".into(), 5000); resource_manager.add_resource_group(group1); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); assert_eq!( resource_ctl_write.resource_group("test1".as_bytes()).weight, - 100 + 20 ); // add a resource 
group with big ru - let group1 = new_resource_group("test2".into(), true, 50000, 2000); + let group1 = new_resource_group_ru("test2".into(), 50000); resource_manager.add_resource_group(group1); assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); // resource_ctl_write should be unchanged. - assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 10000); + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); assert_eq!( resource_ctl_write.resource_group("test1".as_bytes()).weight, 100 ); assert_eq!( resource_ctl_write.resource_group("test2".as_bytes()).weight, - 50 + 10 ); } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index ea9a9d724b9..2381b168987 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -125,7 +125,7 @@ pub mod tests { use test_pd::{mocker::Service, util::*, Server as MockServer}; use tikv_util::{config::ReadableDuration, worker::Builder}; - use crate::resource_group::tests::new_resource_group; + use crate::resource_group::tests::{new_resource_group, new_resource_group_ru}; fn new_test_server_and_client( update_interval: ReadableDuration, @@ -202,12 +202,12 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group("TEST1".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST1".into(), 100); add_resource_group(s.pd_client.clone(), group1); - let group2 = new_resource_group("TEST2".into(), true, 100, 100); + let group2 = new_resource_group_ru("TEST2".into(), 100); add_resource_group(s.pd_client.clone(), group2); // Mock modify - let group2 = new_resource_group("TEST2".into(), true, 50, 50); + let group2 = new_resource_group_ru("TEST2".into(), 50); add_resource_group(s.pd_client.clone(), group2); let (res, 
revision) = block_on(s.list_resource_groups()); assert_eq!(res.len(), 2); @@ -227,7 +227,7 @@ pub mod tests { group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 50 @@ -247,7 +247,7 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group("TEST1".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST1".into(), 100); add_resource_group(s.pd_client.clone(), group1); // Mock reboot watch server let watch_global_config_fp = "watch_global_config_return"; @@ -255,7 +255,7 @@ pub mod tests { std::thread::sleep(Duration::from_millis(100)); fail::remove(watch_global_config_fp); // Mock add after rebooting will success - let group1 = new_resource_group("TEST2".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST2".into(), 100); add_resource_group(s.pd_client.clone(), group1); // Wait watcher update std::thread::sleep(Duration::from_secs(1)); From d1d29203e6a93b05dd435ea27a9b39fb30b23f41 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 2 Feb 2023 16:33:56 +0800 Subject: [PATCH 480/676] pd_client: fix item value type (#14106) close tikv/tikv#14104 We need to use the new field to support item value as bytes to avoid proto string check failures. 
Signed-off-by: husharp --- Cargo.lock | 3 +-- components/resource_control/Cargo.toml | 1 - .../resource_control/src/resource_group.rs | 1 - components/resource_control/src/service.rs | 27 +++++++++++++------ components/test_pd/src/mocker/mod.rs | 5 ++-- components/test_pd/src/server.rs | 2 +- .../failpoints/cases/test_pd_client_legacy.rs | 12 ++++++--- 7 files changed, 32 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1747e74fafa..78c9e88b538 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2730,7 +2730,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a7c51106dfe70ebf59221018b50d1ec6ad25da74" +source = "git+https://github.com/pingcap/kvproto.git#2b853bed812556901846f42820b63d8a0d9c8d24" dependencies = [ "futures 0.3.15", "grpcio", @@ -4697,7 +4697,6 @@ dependencies = [ "test_pd", "test_pd_client", "tikv_util", - "tokio", "yatp", ] diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 3f796627040..39d37ac0f6b 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -26,5 +26,4 @@ slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global test_pd = { workspace = true } test_pd_client = { workspace = true } tikv_util = { workspace = true } -tokio = { version = "1.5", features = ["time"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a0abfb11464..c5112c13516 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -358,7 +358,6 @@ pub(crate) mod tests { resource_manager.add_resource_group(group1); assert!(resource_manager.get_resource_group("test1").is_none()); - let group = resource_manager.get_resource_group("test").unwrap(); assert_eq!( group diff --git 
a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 2381b168987..fc24af4fdc4 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -2,10 +2,10 @@ use std::{sync::Arc, time::Duration}; -use futures::StreamExt; +use futures::{compat::Future01CompatExt, StreamExt}; use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; -use tikv_util::error; +use tikv_util::{error, timer::GLOBAL_TIMER_HANDLE}; use crate::ResourceGroupManager; @@ -31,6 +31,8 @@ impl ResourceManagerService { } } +const RETRY_INTERVAL: Duration = Duration::from_secs(1); // to consistent with pd_client + impl ResourceManagerService { pub async fn watch_resource_groups(&mut self) { // Firstly, load all resource groups as of now. @@ -56,7 +58,7 @@ impl ResourceManagerService { EventType::Put => { if let Ok(group) = protobuf::parse_from_bytes::( - item.get_value().as_bytes(), + item.get_payload(), ) { self.manager.add_resource_group(group); @@ -69,7 +71,10 @@ impl ResourceManagerService { } Err(err) => { error!("failed to get stream"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -85,7 +90,10 @@ impl ResourceManagerService { } Err(err) => { error!("failed to watch resource groups"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -101,13 +109,16 @@ impl ResourceManagerService { Ok((items, revision)) => { let groups = items .into_iter() - .filter_map(|g| protobuf::parse_from_bytes(g.get_value().as_bytes()).ok()) + .filter_map(|g| protobuf::parse_from_bytes(g.get_payload()).ok()) .collect(); return (groups, revision); } Err(err) => { error!("failed to load 
global config"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -142,7 +153,7 @@ pub mod tests { item.set_name(group.get_name().to_string()); let mut buf = Vec::new(); group.write_to_vec(&mut buf).unwrap(); - item.set_value(String::from_utf8(buf).unwrap()); + item.set_payload(buf); futures::executor::block_on(async move { pd_client diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index b9ae839b06e..fc257b12a9f 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -47,7 +47,7 @@ pub trait PdMocker { .map(|kv| { let mut item = GlobalConfigItem::default(); item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); - item.set_value(String::from_utf8(kv.value().to_vec()).unwrap()); + item.set_payload(kv.value().into()); item }) .collect(); @@ -68,7 +68,8 @@ pub trait PdMocker { block_on(async move { match item.get_kind() { EventType::Put => { - let kv = KeyValue(MetaKey(item.get_name().into()), item.get_value().into()); + let kv = + KeyValue(MetaKey(item.get_name().into()), item.get_payload().into()); cli.lock().await.set(kv).await } EventType::Delete => { diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index cb495307a1f..28d4077b674 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -242,7 +242,7 @@ impl Pd for PdMock { KvEventType::Delete => EventType::Delete, }); change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); - change.set_value(from_utf8(event.pair.value()).unwrap().to_string()); + change.set_payload(event.pair.value().into()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); let _ = sink.send((wc, WriteFlags::default())).await; diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs 
b/tests/failpoints/cases/test_pd_client_legacy.rs index 3638e448bd9..d6cf7f1817d 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + str::from_utf8, sync::{mpsc, Arc}, thread, time::Duration, @@ -118,7 +119,7 @@ fn test_load_global_config() { .map(|(name, value)| { let mut item = GlobalConfigItem::default(); item.set_name(name.to_string()); - item.set_value(value.to_string()); + item.set_payload(value.as_bytes().into()); item }) .collect::>(), @@ -132,7 +133,7 @@ fn test_load_global_config() { assert!( res.iter() .zip(check_items) - .all(|(item1, item2)| item1.name == item2.0 && item1.value == item2.1) + .all(|(item1, item2)| item1.name == item2.0 && item1.payload == item2.1.as_bytes()) ); assert_eq!(revision, 3); } @@ -156,7 +157,10 @@ fn test_watch_global_config_on_closed_server() { Ok(r) => { for item in r.get_changes() { assert_eq!(item.get_name(), items_clone[i].0); - assert_eq!(item.get_value(), items_clone[i].1); + assert_eq!( + from_utf8(item.get_payload()).unwrap(), + items_clone[i].1 + ); i += 1; } } @@ -181,7 +185,7 @@ fn test_watch_global_config_on_closed_server() { .map(|(name, value)| { let mut item = GlobalConfigItem::default(); item.set_name(name.to_string()); - item.set_value(value.to_string()); + item.set_payload(value.as_bytes().into()); item }) .collect::>(), From 37915609defa68c174e7659f99108a0982662989 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 3 Feb 2023 13:57:56 +0800 Subject: [PATCH 481/676] raftstore-v2: add tablet logger and update dep (#14129) ref tikv/tikv#12842 - Update raft-engine to fix data corruption during restart - Add tablet logger so we can know which tablet the logs belongs to Signed-off-by: Jay Lee --- Cargo.lock | 209 +++++++++--------- cmd/tikv-ctl/src/main.rs | 3 +- components/engine_rocks/src/logger.rs | 24 ++ components/raft_log_engine/src/engine.rs | 11 +- 
.../raftstore-v2/src/operation/ready/mod.rs | 12 +- src/config/mod.rs | 15 +- src/server/engine_factory.rs | 13 +- tests/integrations/storage/test_titan.rs | 5 +- 8 files changed, 167 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 78c9e88b538..633194d9323 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,7 +225,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" dependencies = [ "error-code", - "libc 0.2.132", + "libc 0.2.139", "wasm-bindgen", "winapi 0.3.9", ] @@ -256,7 +256,7 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -447,7 +447,7 @@ dependencies = [ "addr2line", "cc", "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "miniz_oxide 0.4.4", "object", "rustc-demangle", @@ -603,7 +603,7 @@ dependencies = [ "bcc-sys", "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "regex", "thiserror", ] @@ -735,7 +735,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -761,7 +761,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7f788eaf239475a3c1e1acf89951255a46c4b9b46cf3e866fc4d0707b4b9e36" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "valgrind_request", ] @@ -934,7 +934,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1" dependencies = [ "glob", - "libc 0.2.132", + "libc 0.2.139", "libloading", ] @@ -1018,7 +1018,7 @@ dependencies = [ "byteorder", "bytes", "error_code", - "libc 0.2.132", + "libc 0.2.139", "panic_hook", 
"protobuf", "rand 0.8.5", @@ -1077,7 +1077,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1092,7 +1092,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1150,7 +1150,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63aaaf47e457badbcb376c65a49d0f182c317ebd97dc6d1ced94c8e1d09c0f3a" dependencies = [ "criterion", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1217,7 +1217,7 @@ dependencies = [ "cfg-if 1.0.0", "crossbeam-utils 0.8.8", "lazy_static", - "memoffset", + "memoffset 0.6.4", "scopeguard", ] @@ -1229,7 +1229,7 @@ dependencies = [ "autocfg", "cfg-if 1.0.0", "crossbeam-utils 0.8.11", - "memoffset", + "memoffset 0.6.4", "once_cell", "scopeguard", ] @@ -1420,7 +1420,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_users", "winapi 0.3.9", ] @@ -1681,7 +1681,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "str-buf", ] @@ -1789,7 +1789,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "matches", "nix 0.24.1", @@ -1845,7 +1845,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "fs2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "maligned", "online_config", "openssl", @@ -1870,7 +1870,7 @@ version = "0.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "thiserror", "winapi 0.3.9", ] @@ -1882,7 +1882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "winapi 0.3.9", ] @@ -1895,7 +1895,7 @@ checksum = "d691fdb3f817632d259d09220d4cf0991dbb2c9e59e044a02a59194bf6e14484" dependencies = [ "cc", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1923,7 +1923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2adaffba6388640136149e18ed080b77a78611c1e1d6de75aedcdf78df5d4682" dependencies = [ "crc32fast", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "miniz_oxide 0.3.7", ] @@ -1964,7 +1964,7 @@ name = "fs2" version = "0.4.3" source = "git+https://github.com/tabokie/fs2-rs?branch=tikv#cd503764a19a99d74c1ab424dd13d6bcd093fcae" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1990,7 +1990,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2226,7 +2226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.7.0", ] @@ -2238,7 +2238,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2287,7 +2287,7 @@ dependencies = [ "futures-executor", "futures-util", 
"grpcio-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "parking_lot 0.11.1", "protobuf", @@ -2324,7 +2324,7 @@ dependencies = [ "bindgen 0.59.2", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "openssl-sys", "pkg-config", @@ -2392,7 +2392,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2600,7 +2600,7 @@ checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" dependencies = [ "bitflags", "inotify-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2609,7 +2609,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2636,7 +2636,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2682,7 +2682,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b1d42ef453b30b7387e113da1c83ab1605d90c5b4e0eb8e96d016ed3b8c160" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "log", ] @@ -2823,9 +2823,9 @@ checksum = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122" [[package]] name = "libc" -version = "0.2.132" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libfuzzer-sys" @@ -2865,7 +2865,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libtitan_sys", "libz-sys", "lz4-sys", 
@@ -2883,7 +2883,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "lz4-sys", "snappy-sys", @@ -2897,7 +2897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", "vcpkg", ] @@ -2953,7 +2953,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3008,7 +3008,7 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3017,7 +3017,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3027,7 +3027,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3039,6 +3039,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memory_trace_macros" version = "0.1.0" @@ -3098,7 +3107,7 @@ dependencies = [ "fuchsia-zircon-sys", "iovec", "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "miow", "net2", @@ -3112,7 +3121,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.42.0", @@ -3158,7 +3167,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3209,7 +3218,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "openssl", "openssl-probe", @@ -3227,7 +3236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3239,22 +3248,22 @@ checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" dependencies = [ "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.6.4", ] [[package]] name = "nix" -version = "0.25.0" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ - "autocfg", "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.7.1", "pin-utils", + "static_assertions", ] [[package]] @@ -3311,7 +3320,7 @@ dependencies = [ "fsevent", "fsevent-sys", "inotify", - "libc 0.2.132", + "libc 0.2.139", "mio 0.6.23", "mio-extras", "walkdir", @@ -3464,7 +3473,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", - 
"libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3542,7 +3551,7 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "openssl-macros", "openssl-sys", @@ -3582,7 +3591,7 @@ checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", - "libc 0.2.132", + "libc 0.2.139", "openssl-src", "pkg-config", "vcpkg", @@ -3612,7 +3621,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3649,7 +3658,7 @@ checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" dependencies = [ "cfg-if 1.0.0", "instant", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "winapi 0.3.9", @@ -3662,7 +3671,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "windows-sys 0.32.0", @@ -3739,7 +3748,7 @@ checksum = "b8f94885300e262ef461aa9fd1afbf7df3caf9e84e271a74925d1c6c8b24830f" dependencies = [ "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "mmap", "nom 4.2.3", "phf", @@ -3882,7 +3891,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27361d7578b410d0eb5fe815c2b2105b01ab770a7c738cb9a231457a809fcc7" dependencies = [ "ipnetwork", - "libc 0.2.132", + "libc 0.2.139", "pnet_base", "pnet_sys", "winapi 0.2.8", @@ -3894,7 +3903,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82f881a6d75ac98c5541db6144682d1773bb14c6fc50c6ebac7086c8f7f23c29" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", "ws2_32-sys", ] @@ -3909,7 +3918,7 @@ dependencies = [ "cfg-if 
1.0.0", "findshlibs", "inferno", - "libc 0.2.132", + "libc 0.2.139", "log", "nix 0.24.1", "once_cell", @@ -3993,7 +4002,7 @@ dependencies = [ "byteorder", "hex 0.4.2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -4002,7 +4011,7 @@ version = "0.4.2" source = "git+https://github.com/tikv/procinfo-rs?rev=6599eb9dca74229b2c1fcc44118bef7eff127128#6599eb9dca74229b2c1fcc44118bef7eff127128" dependencies = [ "byteorder", - "libc 0.2.132", + "libc 0.2.139", "nom 2.2.1", "rustc_version 0.2.3", ] @@ -4027,7 +4036,7 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "memchr", "parking_lot 0.11.1", "protobuf", @@ -4192,7 +4201,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" +source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" dependencies = [ "byteorder", "crc32fast", @@ -4203,11 +4212,11 @@ dependencies = [ "hex 0.4.2", "if_chain", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "lz4-sys", "memmap2", - "nix 0.25.0", + "nix 0.26.2", "num-derive", "num-traits", "parking_lot 0.12.1", @@ -4226,7 +4235,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" +source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" dependencies = [ "clap 3.1.6", "env_logger", @@ -4390,7 +4399,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", - "libc 0.2.132", + "libc 0.2.139", "rand_core 0.3.1", "rdrand", "winapi 0.3.9", @@ -4403,7 +4412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.2.1", "rand_core 0.5.1", "rand_hc", @@ -4415,7 +4424,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.3.0", "rand_core 0.6.2", ] @@ -4710,7 +4719,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "online_config", "pdqselect", @@ -4773,7 +4782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "spin", "untrusted", @@ -4786,7 +4795,7 @@ name = "rocksdb" version = "0.3.0" source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "librocksdb_sys", ] @@ -5034,7 +5043,7 @@ dependencies = [ "bitflags", "core-foundation", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "security-framework-sys", ] @@ -5045,7 +5054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5241,7 +5250,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "pd_client", @@ -5302,7 +5311,7 @@ version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "signal-hook-registry", ] @@ -5312,7 +5321,7 @@ version = "1.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5445,7 +5454,7 @@ version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -5473,7 +5482,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -5683,7 +5692,7 @@ checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "ntapi", "once_cell", "rayon", @@ -5766,7 +5775,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", @@ -6009,7 +6018,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log_wrappers", "more-asserts", "online_config", @@ -6310,7 +6319,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "log", "log_wrappers", @@ -6410,7 +6419,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "pd_client", @@ -6445,7 +6454,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "paste", "tikv-jemalloc-sys", ] @@ -6458,7 +6467,7 @@ checksum = 
"aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -6467,7 +6476,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "tikv-jemalloc-sys", ] @@ -6490,7 +6499,7 @@ version = "0.1.0" dependencies = [ "fxhash", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "mimalloc", "snmalloc-rs", "tcmalloc", @@ -6559,7 +6568,7 @@ dependencies = [ "http", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "mnt", @@ -6608,7 +6617,7 @@ version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.1.56", "winapi 0.3.9", ] @@ -6651,7 +6660,7 @@ checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ "autocfg", "bytes", - "libc 0.2.132", + "libc 0.2.139", "memchr", "mio 0.8.5", "num_cpus", @@ -7037,7 +7046,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" dependencies = [ "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", ] @@ -7222,7 +7231,7 @@ checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" dependencies = [ "either", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -7461,7 +7470,7 @@ version = "5.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "zstd-sys", ] @@ -7472,5 +7481,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 30cd7035bef..e4c7be98dba 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -686,7 +686,8 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - cfg.rocksdb.build_opt(&cfg.rocksdb.build_resources(env)) + let resource = cfg.rocksdb.build_resources(env); + cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } fn run_ldb_command(args: Vec, cfg: &TikvConfig) { diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index b7b196448c5..85f4de713ac 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -20,6 +20,30 @@ impl Logger for RocksdbLogger { } } +pub struct TabletLogger { + tablet_name: String, +} + +impl TabletLogger { + pub fn new(tablet_name: String) -> Self { + Self { tablet_name } + } +} + +impl Logger for TabletLogger { + fn logv(&self, log_level: InfoLogLevel, log: &str) { + match log_level { + InfoLogLevel::Header => info!(#"rocksdb_log_header", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Debug => debug!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Info => info!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Warn => warn!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Error => error!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Fatal => crit!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + _ => {} + } + } +} + #[derive(Default)] pub struct RaftDbLogger; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 838fe461f4b..92d7a4f7353 100644 --- a/components/raft_log_engine/src/engine.rs +++ 
b/components/raft_log_engine/src/engine.rs @@ -472,18 +472,21 @@ impl RaftLogBatchTrait for RaftLogBatch { let key = encode_flushed_key(cf, tablet_index); let mut value = vec![0; 8]; NumberCodec::encode_u64(&mut value, apply_index); - self.0.put(raft_group_id, key.to_vec(), value); - Ok(()) + self.0 + .put(raft_group_id, key.to_vec(), value) + .map_err(transfer_error) } fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { let key = encode_key(DIRTY_MARK_KEY, tablet_index); if dirty { - self.0.put(raft_group_id, key.to_vec(), vec![]); + self.0 + .put(raft_group_id, key.to_vec(), vec![]) + .map_err(transfer_error) } else { self.0.delete(raft_group_id, key.to_vec()); + Ok(()) } - Ok(()) } fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index e7c32e742ec..7f656e29210 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -535,11 +535,13 @@ impl Peer { } if !self.serving() { self.start_destroy(ctx, &mut write_task); - ctx.coprocessor_host.on_region_changed( - self.region(), - RegionChangeEvent::Destroy, - self.raft_group().raft.state, - ); + if self.persisted_index() != 0 { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); + } } // Ready number should increase monotonically. 
assert!(self.async_writer.known_largest_number() < ready.number()); diff --git a/src/config/mod.rs b/src/config/mod.rs index 3274d5442df..38d69f1ab29 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1264,7 +1264,7 @@ impl DbConfig { } } - pub fn build_opt(&self, shared: &DbResources) -> RocksDbOptions { + pub fn build_opt(&self, shared: &DbResources, for_engine: EngineType) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1306,7 +1306,9 @@ impl DbConfig { if let Some(b) = self.paranoid_checks { opts.set_paranoid_checks(b); } - opts.set_info_log(RocksdbLogger::default()); + if for_engine == EngineType::RaftKv { + opts.set_info_log(RocksdbLogger::default()); + } opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); @@ -4424,9 +4426,10 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); + let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); tikv_cfg .rocksdb - .build_opt(&tikv_cfg.rocksdb.build_resources(Arc::new(Env::default()))); + .build_opt(&resource, tikv_cfg.storage.engine); } #[test] @@ -4587,12 +4590,10 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); + let resource = cfg.rocksdb.build_resources(Arc::default()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some( - cfg.rocksdb - .build_opt(&cfg.rocksdb.build_resources(Arc::new(Env::default()))), - ), + Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), cfg.rocksdb.build_cf_opts( &cfg.rocksdb .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 91b5178f8a0..ff06e41cc57 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -6,6 +6,7 @@ use engine_rocks::{ 
raw::{Cache, Env}, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, + TabletLogger, }; use engine_traits::{ CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, @@ -134,12 +135,12 @@ impl KvEngineFactory { self.inner.db_resources.statistics.clone() } - fn db_opts(&self) -> RocksDbOptions { + fn db_opts(&self, for_engine: EngineType) -> RocksDbOptions { // Create kv engine. let mut db_opts = self .inner .rocksdb_config - .build_opt(&self.inner.db_resources); + .build_opt(&self.inner.db_resources, for_engine); if !self.inner.lite { db_opts.add_event_listener(RocksEventListener::new( "kv", @@ -170,7 +171,7 @@ impl KvEngineFactory { /// It will always create in path/DEFAULT_DB_SUB_DIR. pub fn create_shared_db(&self, path: impl AsRef) -> Result { let path = path.as_ref(); - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv); let cf_opts = self.cf_opts(EngineType::RaftKv); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone()); @@ -187,7 +188,9 @@ impl KvEngineFactory { impl TabletFactory for KvEngineFactory { fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv2); + let tablet_name = path.file_name().unwrap().to_str().unwrap().to_string(); + db_opts.set_info_log(TabletLogger::new(tablet_name)); let cf_opts = self.cf_opts(EngineType::RaftKv2); if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); @@ -215,7 +218,7 @@ impl TabletFactory for KvEngineFactory { fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => ?ctx.suffix); // Create 
kv engine. - let _db_opts = self.db_opts(); + let _db_opts = self.db_opts(EngineType::RaftKv2); let _cf_opts = self.cf_opts(EngineType::RaftKv2); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 452bcc89238..dc0a85bc9c2 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,9 +159,8 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg - .rocksdb - .build_opt(&cfg.rocksdb.build_resources(Default::default())); + let resource = cfg.rocksdb.build_resources(Default::default()); + let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), None, From c8c1ca8b8376d7f29c05cd1cf08b469ddbc4939c Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 3 Feb 2023 15:43:55 +0800 Subject: [PATCH 482/676] raftstore: Observe when receive raft message (#14043) ref tikv/tikv#13855 Introduce observers when receive raft message. 
Signed-off-by: CalvinNeo --- components/cdc/src/observer.rs | 2 ++ .../raftstore-v2/src/operation/ready/mod.rs | 1 + .../raftstore/src/coprocessor/dispatcher.rs | 35 +++++++++++++++++++ components/raftstore/src/coprocessor/mod.rs | 15 ++++++-- components/raftstore/src/store/fsm/peer.rs | 3 ++ components/raftstore/src/store/fsm/store.rs | 3 ++ components/raftstore/src/store/peer.rs | 1 + components/raftstore/src/store/snap.rs | 16 ++++++--- 8 files changed, 70 insertions(+), 6 deletions(-) diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 696bc6341ee..aac2842e404 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -273,6 +273,7 @@ mod tests { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -301,6 +302,7 @@ mod tests { prev_lead_transferee: 3, vote: 3, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 7f656e29210..03dce74d4e7 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -768,6 +768,7 @@ impl Peer { prev_lead_transferee: target, vote: self.raft_group().raft.vote, initialized: self.storage().is_initialized(), + peer_id: self.peer().get_id(), }, ); self.proposal_control_mut().maybe_update_term(term); diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 794a46b8e3a..0e45ef1d09d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -8,6 +8,7 @@ use kvproto::{ metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + 
raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; @@ -278,6 +279,7 @@ impl_box_observer_g!( ConsistencyCheckObserver, WrappedConsistencyCheckObserver ); +impl_box_observer!(BoxMessageObserver, MessageObserver, WrappedMessageObserver); /// Registry contains all registered coprocessors. #[derive(Clone)] @@ -296,6 +298,7 @@ where read_index_observers: Vec>, pd_task_observers: Vec>, update_safe_ts_observers: Vec>, + message_observers: Vec>, // TODO: add endpoint } @@ -313,6 +316,7 @@ impl Default for Registry { read_index_observers: Default::default(), pd_task_observers: Default::default(), update_safe_ts_observers: Default::default(), + message_observers: Default::default(), } } } @@ -381,6 +385,10 @@ impl Registry { pub fn register_update_safe_ts_observer(&mut self, priority: u32, qo: BoxUpdateSafeTsObserver) { push!(priority, qo, self.update_safe_ts_observers); } + + pub fn register_message_observer(&mut self, priority: u32, qo: BoxMessageObserver) { + push!(priority, qo, self.message_observers); + } } /// A macro that loops over all observers and returns early when error is found @@ -780,6 +788,17 @@ impl CoprocessorHost { true } + /// Returns false if the message should not be stepped later. + pub fn on_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.message_observers { + let observer = observer.observer.inner(); + if !observer.on_raft_message(msg) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -890,6 +909,7 @@ mod tests { OnUpdateSafeTs = 23, PrePersist = 24, PreWriteApplyState = 25, + OnRaftMessage = 26, } impl Coprocessor for TestCoprocessor {} @@ -1132,6 +1152,14 @@ mod tests { } } + impl MessageObserver for TestCoprocessor { + fn on_raft_message(&self, _: &RaftMessage) -> bool { + self.called + .fetch_add(ObserverIndex::OnRaftMessage as usize, Ordering::SeqCst); + true + } + } + macro_rules! 
assert_all { ($target:expr, $expect:expr) => {{ for (c, e) in ($target).iter().zip($expect) { @@ -1168,6 +1196,8 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); host.registry .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); + host.registry + .register_message_observer(1, BoxMessageObserver::new(ob.clone())); let mut index: usize = 0; let region = Region::default(); @@ -1282,6 +1312,11 @@ mod tests { host.pre_write_apply_state(®ion); index += ObserverIndex::PreWriteApplyState as usize; assert_all!([&ob.called], &[index]); + + let msg = RaftMessage::default(); + host.on_raft_message(&msg); + index += ObserverIndex::OnRaftMessage as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 73110660856..98b045dbed8 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -26,14 +26,16 @@ mod metrics; pub mod region_info_accessor; mod split_check; pub mod split_observer; +use kvproto::raft_serverpb::RaftMessage; pub use self::{ config::{Config, ConsistencyCheckMethod}, consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, + BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, + BoxRoleObserver, BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + StoreHandle, }, error::{Error, Result}, region_info_accessor::{ @@ -269,6 +271,7 @@ pub struct RoleChange { /// Which peer is voted by itself. 
pub vote: u64, pub initialized: bool, + pub peer_id: u64, } impl RoleChange { @@ -280,6 +283,7 @@ impl RoleChange { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, } } } @@ -334,6 +338,13 @@ pub trait RegionChangeObserver: Coprocessor { } } +pub trait MessageObserver: Coprocessor { + /// Returns false if the message should not be stepped later. + fn on_raft_message(&self, _: &RaftMessage) -> bool { + true + } +} + #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index a8232fd8322..75da7d497e4 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -610,6 +610,9 @@ where for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 26f2983998d..85631bebe09 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -753,6 +753,9 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { if matches!(&e, Error::RegionNotRegistered { .. 
}) { // This may happen in normal cases when add-peer runs slowly diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 44701fbf705..a6010a6761f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2335,6 +2335,7 @@ where prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, initialized: self.is_initialized(), + peer_id: self.peer.get_id(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a9ef7df8c62..358ec716195 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -207,7 +207,9 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { +// Create a SnapshotMeta that can be later put into RaftSnapshotData or written +// into file. +pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -663,7 +665,8 @@ impl Snapshot { Ok(snapshot_meta) } - fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { + // Validate and set SnapshotMeta of this Snapshot. + pub fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { let mut cf_file_count_from_meta: Vec = vec![]; let mut file_count = 0; let mut current_cf = ""; @@ -812,8 +815,9 @@ impl Snapshot { } } - // Only called in `do_build`. - fn save_meta_file(&mut self) -> RaftStoreResult<()> { + // Save `SnapshotMeta` to file. + // Used in `do_build` and by external crates. 
+ pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -1125,6 +1129,10 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + pub fn total_size(&self) -> u64 { self.cf_files .iter() From 656c9831d5f3be206b06745f6e0fd9b51ccfcfad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Fri, 3 Feb 2023 15:59:56 +0800 Subject: [PATCH 483/676] log-backup: added check leader call before flushing (#14108) close tikv/tikv#14099 Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 17 +++++ components/backup-stream/src/metrics.rs | 5 ++ .../backup-stream/src/subscription_manager.rs | 15 ++++- .../backup-stream/src/subscription_track.rs | 23 +++++-- components/backup-stream/tests/mod.rs | 64 +++++++++++++++++++ components/server/src/server.rs | 8 +++ 6 files changed, 125 insertions(+), 7 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index c50c70a2eec..ff380551b90 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -8,6 +8,7 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use error_code::ErrorCodeExt; use futures::FutureExt; +use grpcio::Environment; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, @@ -17,7 +18,10 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, router::RaftStoreRouter, + store::RegionReadProgressRegistry, }; +use resolved_ts::LeadershipResolver; +use security::SecurityManager; use tikv::config::BackupStreamConfig; use tikv_util::{ box_err, @@ -112,6 +116,10 @@ where 
router: RT, pd_client: Arc, concurrency_manager: ConcurrencyManager, + // Required by Leadership Resolver. + env: Arc, + region_read_progress: RegionReadProgressRegistry, + security_mgr: Arc, ) -> Self { crate::metrics::STREAM_ENABLED.inc(); let pool = create_tokio_runtime((config.num_threads / 2).max(1), "backup-stream") @@ -148,6 +156,14 @@ where let initial_scan_throughput_quota = Limiter::new(limit); info!("the endpoint of stream backup started"; "path" => %config.temp_path); let subs = SubscriptionTracer::default(); + let leadership_resolver = LeadershipResolver::new( + store_id, + Arc::clone(&pd_client) as _, + env, + security_mgr, + region_read_progress, + Duration::from_secs(60), + ); let (region_operator, op_loop) = RegionSubscriptionManager::start( InitialDataLoader::new( router.clone(), @@ -163,6 +179,7 @@ where meta_client.clone(), pd_client.clone(), ((config.num_threads + 1) / 2).max(1), + leadership_resolver, ); pool.spawn(op_loop); let mut checkpoint_mgr = CheckpointManager::default(); diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index c3f99b8617e..0805dae5f77 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -155,4 +155,9 @@ lazy_static! 
{ &["stage"] ) .unwrap(); + pub static ref LOST_LEADER_REGION: IntCounter = register_int_counter!( + "tikv_log_backup_lost_leader_region", + "The regions that lost leadership during resolving" + ) + .unwrap(); } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 91b4c096e7d..a31a43980b5 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -21,6 +21,7 @@ use raftstore::{ router::RaftStoreRouter, store::fsm::ChangeObserver, }; +use resolved_ts::LeadershipResolver; use tikv::storage::Statistics; use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; use tokio::sync::mpsc::{channel, Receiver, Sender}; @@ -351,6 +352,7 @@ where meta_cli: MetadataClient, pd_client: Arc, scan_pool_size: usize, + leader_checker: LeadershipResolver, ) -> (Self, future![()]) where E: KvEngine, @@ -370,7 +372,7 @@ where scan_pool_handle: Arc::new(scan_pool_handle), scans: CallbackWaitGroup::new(), }; - let fut = op.clone().region_operator_loop(rx); + let fut = op.clone().region_operator_loop(rx, leader_checker); (op, fut) } @@ -390,7 +392,11 @@ where } /// the handler loop. 
- async fn region_operator_loop(self, mut message_box: Receiver) { + async fn region_operator_loop( + self, + mut message_box: Receiver, + mut leader_checker: LeadershipResolver, + ) { while let Some(op) = message_box.recv().await { info!("backup stream: on_modify_observe"; "op" => ?op); match op { @@ -454,7 +460,10 @@ where warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } - let cps = self.subs.resolve_with(min_ts); + let regions = leader_checker + .resolve(self.subs.current_regions(), min_ts) + .await; + let cps = self.subs.resolve_with(min_ts, regions); let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts // safely. diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index a24076661bb..1f823130d3b 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, time::Duration}; +use std::{collections::HashSet, sync::Arc, time::Duration}; use dashmap::{ mapref::{entry::Entry, one::RefMut}, @@ -149,12 +149,27 @@ impl SubscriptionTracer { } } + pub fn current_regions(&self) -> Vec { + self.0.iter().map(|s| *s.key()).collect() + } + /// try advance the resolved ts with the min ts of in-memory locks. /// returns the regions and theirs resolved ts. - pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec { + pub fn resolve_with( + &self, + min_ts: TimeStamp, + regions: impl IntoIterator, + ) -> Vec { + let rs = regions.into_iter().collect::>(); self.0 .iter_mut() - // Don't advance the checkpoint ts of removed region. 
+ .filter(|s| { + let contains = rs.contains(s.key()); + if !contains { + crate::metrics::LOST_LEADER_REGION.inc(); + } + contains + }) .map(|mut s| ResolveResult::resolve(s.value_mut(), min_ts)) .collect() } @@ -500,7 +515,7 @@ mod test { drop(region4_sub); let mut rs = subs - .resolve_with(TimeStamp::new(1000)) + .resolve_with(TimeStamp::new(1000), vec![1, 2, 3, 4]) .into_iter() .map(|r| (r.region, r.checkpoint, r.checkpoint_type)) .collect::>(); diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 7256cd62c03..b7afcd1441f 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -337,6 +337,13 @@ impl Suite { raft_router, cluster.pd_client.clone(), cm, + Arc::clone(&self.env), + cluster.store_metas[&id] + .lock() + .unwrap() + .region_read_progress + .clone(), + Arc::clone(&sim.security_mgr), ); worker.start(endpoint); } @@ -803,6 +810,7 @@ mod test { }; use futures::{Stream, StreamExt}; use pd_client::PdClient; + use test_raftstore::IsolationFilterFactory; use tikv_util::{box_err, defer, info, HandyRwLock}; use tokio::time::timeout; use txn_types::{Key, TimeStamp}; @@ -1231,6 +1239,17 @@ mod test { ); } + async fn collect_all_current( + mut s: impl Stream + Unpin, + max_gap: Duration, + ) -> Vec { + let mut r = vec![]; + while let Ok(Some(x)) = timeout(max_gap, s.next()).await { + r.push(x); + } + r + } + async fn collect_current(mut s: impl Stream + Unpin, goal: usize) -> Vec { let mut r = vec![]; while let Ok(Some(x)) = timeout(Duration::from_secs(10), s.next()).await { @@ -1286,4 +1305,49 @@ mod test { round1.union(&round2).map(|x| x.as_slice()), )); } + + #[test] + fn network_partition() { + let mut suite = super::SuiteBuilder::new_named("network_partition") + .nodes(3) + .build(); + let stream = suite.flush_stream(); + suite.must_register_task(1, "network_partition"); + let leader = suite.cluster.leader_of_region(1).unwrap(); + let round1 = 
run_async_test(suite.write_records(0, 64, 1)); + + suite + .cluster + .add_send_filter(IsolationFilterFactory::new(leader.store_id)); + suite.cluster.reset_leader_of_region(1); + suite + .cluster + .must_wait_for_leader_expire(leader.store_id, 1); + let leader2 = suite.cluster.leader_of_region(1).unwrap(); + assert_ne!(leader.store_id, leader2.store_id, "leader not switched."); + let ts = suite.tso(); + suite.must_kv_prewrite( + 1, + vec![mutation(make_record_key(1, 778), b"generator".to_vec())], + make_record_key(1, 778), + ts, + ); + suite.sync(); + suite.force_flush_files("network_partition"); + suite.wait_for_flush(); + + let cps = run_async_test(collect_all_current(stream, Duration::from_secs(2))); + assert!( + cps.iter() + .flat_map(|(_s, cp)| cp.events.iter().map(|resp| resp.checkpoint)) + .all(|cp| cp <= ts.into_inner()), + "ts={} cps={:?}", + ts, + cps + ); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.iter().map(|k| k.as_slice()), + )) + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 2a479964ced..3da6b0c4950 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1043,6 +1043,14 @@ where self.router.clone(), self.pd_client.clone(), self.concurrency_manager.clone(), + Arc::clone(&self.env), + engines + .store_meta + .lock() + .unwrap() + .region_read_progress + .clone(), + Arc::clone(&self.security_mgr), ); backup_stream_worker.start(backup_stream_endpoint); self.to_stop.push(backup_stream_worker); From 6daed4f45208a2818f09038279aa5ba1e0f0412e Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 3 Feb 2023 18:35:56 +0800 Subject: [PATCH 484/676] raftstore-v2: support tracing peer lifetime (#14056) ref tikv/tikv#12842, ref tikv/tikv#13818 In V1, a peer is responsible to destroy itself. The design is to make leader do less work and reduce writes. 
But from the practice of the pass years, not making it a strong guarantee actually makes the implementation complicated and hard to be correct and difficult to understand. In V2, we changes to make leader the very role to make sure all removed peers or merged peers must be destroyed in the end. Push mode is way easier to understand and implement correctly. The downside is extra writes are introduced but it's worthy. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 7 + components/raftstore-v2/src/fsm/peer.rs | 1 + components/raftstore-v2/src/fsm/store.rs | 9 +- .../operation/command/admin/conf_change.rs | 76 ++++- .../src/operation/command/admin/mod.rs | 7 +- .../raftstore-v2/src/operation/command/mod.rs | 21 +- components/raftstore-v2/src/operation/life.rs | 294 +++++++++++++++++- components/raftstore-v2/src/operation/mod.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 50 ++- .../src/operation/ready/snapshot.rs | 4 + components/raftstore-v2/src/raft/peer.rs | 43 ++- components/raftstore-v2/src/router/message.rs | 3 + .../tests/integrations/cluster.rs | 4 + .../tests/integrations/test_conf_change.rs | 38 ++- .../tests/integrations/test_life.rs | 214 ++++++++++++- .../raftstore/src/store/async_io/read.rs | 2 + components/raftstore/src/store/fsm/apply.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 4 +- 18 files changed, 732 insertions(+), 49 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1c7360a86bc..2a3cc63f797 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -75,6 +75,7 @@ pub struct StoreContext { pub schedulers: Schedulers, /// store meta pub store_meta: Arc>>, + pub shutdown: Arc, pub engine: ER, pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, @@ -108,6 +109,7 @@ impl StoreContext { self.cfg.report_region_buckets_tick_interval.0; 
self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = self.cfg.check_long_uncommitted_interval.0; + self.tick_batch[PeerTick::GcPeer as usize].wait_duration = Duration::from_secs(60); } } @@ -273,6 +275,7 @@ struct StorePollerBuilder { apply_pool: FuturePool, logger: Logger, store_meta: Arc>>, + shutdown: Arc, snap_mgr: TabletSnapManager, } @@ -287,6 +290,7 @@ impl StorePollerBuilder { schedulers: Schedulers, logger: Logger, store_meta: Arc>>, + shutdown: Arc, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, ) -> Self { @@ -312,6 +316,7 @@ impl StorePollerBuilder { schedulers, store_meta, snap_mgr, + shutdown, coprocessor_host, } } @@ -418,6 +423,7 @@ where timer: SteadyTimer::default(), schedulers: self.schedulers.clone(), store_meta: self.store_meta.clone(), + shutdown: self.shutdown.clone(), engine: self.engine.clone(), tablet_registry: self.tablet_registry.clone(), apply_pool: self.apply_pool.clone(), @@ -613,6 +619,7 @@ impl StoreSystem { schedulers.clone(), self.logger.clone(), store_meta.clone(), + self.shutdown.clone(), snap_mgr, coprocessor_host, ); diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 26d5c2a1458..47d23a67d1d 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -225,6 +225,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } PeerTick::ReportBuckets => unimplemented!(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), + PeerTick::GcPeer => self.fsm.peer_mut().on_gc_peer_tick(self.store_ctx), } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 17c0a9a50f9..fef433f04f5 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -12,7 +12,9 @@ use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; use keys::{data_end_key, 
data_key}; use kvproto::metapb::Region; -use raftstore::store::{fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry}; +use raftstore::store::{ + fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry, Transport, +}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -255,7 +257,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { } } - pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) { + pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) + where + T: Transport, + { for msg in store_msg_buf.drain(..) { match msg { StoreMsg::Start => self.on_start(), diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 42c433584fe..1b8d29a7a54 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -49,6 +49,12 @@ pub struct ConfChangeResult { pub region_state: RegionLocalState, } +#[derive(Debug)] +pub struct UpdateGcPeersResult { + index: u64, + region_state: RegionLocalState, +} + impl Peer { #[inline] pub fn propose_conf_change( @@ -177,10 +183,13 @@ impl Peer { } } } - if has_new_peer.is_some() { - // Speed up snapshot instead of waiting another heartbeat. - self.raft_group_mut().ping(); - self.set_has_ready(); + if self.is_leader() { + if has_new_peer.is_some() { + // Speed up snapshot instead of waiting another heartbeat. 
+ self.raft_group_mut().ping(); + self.set_has_ready(); + } + self.maybe_schedule_gc_peer_tick(); } } ctx.coprocessor_host.on_region_changed( @@ -199,6 +208,15 @@ impl Peer { self.set_has_extra_write(); } } + + pub fn on_apply_res_update_gc_peers(&mut self, result: UpdateGcPeersResult) { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, result.index, &result.region_state) + .unwrap(); + self.set_has_extra_write(); + self.storage_mut().set_region_state(result.region_state); + } } impl Apply { @@ -279,7 +297,28 @@ impl Apply { ); let my_id = self.peer().get_id(); let state = self.region_state_mut(); + let mut removed_records: Vec<_> = state.take_removed_records().into(); + for p0 in state.get_region().get_peers() { + // No matching store ID means the peer must be removed. + if new_region + .get_peers() + .iter() + .all(|p1| p1.get_store_id() != p0.get_store_id()) + { + removed_records.push(p0.clone()); + } + } + // If a peer is replaced in the same store, the leader will keep polling the + // new peer on the same store, which implies that the old peer must be + // tombstone in the end. 
+ removed_records.retain(|p0| { + new_region + .get_peers() + .iter() + .all(|p1| p1.get_store_id() != p0.get_store_id()) + }); state.set_region(new_region.clone()); + state.set_removed_records(removed_records.into()); let new_peer = new_region .get_peers() .iter() @@ -534,4 +573,33 @@ impl Apply { .inc(); Ok(()) } + + pub fn apply_update_gc_peer( + &mut self, + log_index: u64, + admin_req: &AdminRequest, + ) -> (AdminResponse, AdminCmdResult) { + let mut removed_records: Vec<_> = self.region_state_mut().take_removed_records().into(); + let mut merged_records: Vec<_> = self.region_state_mut().take_merged_records().into(); + let updates = admin_req.get_update_gc_peers().get_peer_id(); + info!(self.logger, "update gc peer"; "index" => log_index, "updates" => ?updates, "gc_peers" => ?removed_records, "merged_peers" => ?merged_records); + removed_records.retain(|p| !updates.contains(&p.get_id())); + merged_records.retain_mut(|r| { + let mut sources: Vec<_> = r.take_source_peers().into(); + sources.retain(|p| !updates.contains(&p.get_id())); + r.set_source_peers(sources.into()); + !r.get_source_peers().is_empty() + }); + self.region_state_mut() + .set_removed_records(removed_records.into()); + self.region_state_mut() + .set_merged_records(merged_records.into()); + ( + AdminResponse::default(), + AdminCmdResult::UpdateGcPeers(UpdateGcPeersResult { + index: log_index, + region_state: self.region_state().clone(), + }), + ) + } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 52bc5329dd4..1546983645f 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -7,7 +7,7 @@ mod transfer_leader; pub use compact_log::CompactLogContext; use compact_log::CompactLogResult; -use conf_change::ConfChangeResult; +use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use 
kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; @@ -28,6 +28,7 @@ pub enum AdminCmdResult { ConfChange(ConfChangeResult), TransferLeader(u64), CompactLog(CompactLogResult), + UpdateGcPeers(UpdateGcPeersResult), } impl Peer { @@ -110,6 +111,10 @@ impl Peer { } } AdminCmdType::CompactLog => self.propose_compact_log(ctx, req), + AdminCmdType::UpdateGcPeer => { + let data = req.write_to_bytes().unwrap(); + self.propose(ctx, data) + } _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index edca9510c27..5434eca6b38 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,7 +16,7 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. -use std::{mem, time::Duration}; +use std::{mem, sync::atomic::Ordering, time::Duration}; use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ @@ -41,7 +41,9 @@ use raftstore::{ }; use slog::{info, warn}; use tikv_util::{ - box_err, slog_panic, + box_err, + log::SlogFormat, + slog_panic, time::{duration_to_sec, monotonic_raw_now, Instant}, }; @@ -107,7 +109,17 @@ impl Peer { #[inline] pub fn schedule_apply_fsm(&mut self, store_ctx: &mut StoreContext) { let region_state = self.storage().region_state().clone(); - let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + let mailbox = match store_ctx.router.mailbox(self.region_id()) { + Some(m) => m, + None => { + assert!( + store_ctx.shutdown.load(Ordering::Relaxed), + "failed to load mailbox: {}", + SlogFormat(&self.logger) + ); + return; + } + }; let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( @@ -334,6 +346,7 @@ impl Peer { } AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, 
term), AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), + AdminCmdResult::UpdateGcPeers(state) => self.on_apply_res_update_gc_peers(state), } } @@ -587,10 +600,10 @@ impl Apply { AdminCmdType::PrepareFlashback => unimplemented!(), AdminCmdType::FinishFlashback => unimplemented!(), AdminCmdType::BatchSwitchWitness => unimplemented!(), + AdminCmdType::UpdateGcPeer => self.apply_update_gc_peer(log_index, admin_req), AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } - AdminCmdType::UpdateGcPeer => unimplemented!(), }; match admin_result { diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 3a9f678bd8c..a407f6bc8ef 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -9,15 +9,34 @@ //! In v1, it can also be created by split. In v2, it's required to create by //! sending a message to store fsm first, and then using split to initialized //! the peer. +//! +//! A peer can only be removed in a raft group by conf change or merge. When +//! applying conf change, removed peer is added to `removed_records`; when +//! applying merge, source peer is added to merged_records. Quorum must agree +//! on the removal, but the removed peer may not necessary be in the quorum. So +//! the peer may not really destroy itself until either: +//! - applying conf change remove; +//! - receiving a RaftMessage with `is_tombstone` set; +//! - receiving a RaftMessage targeting larger ID. +//! +//! Leader is responsible to keep polling all removed peers and guarantee they +//! are really destroyed. A peer is considered destroyed only when a tombstone +//! record with the same ID or larger ID is persisted. For `removed_records`, +//! leader only needs to send a message with `is_tombstone` set. For +//! `merged_records`, to avoid race between destroy and merge, leader needs to +//! 
ask target peer to destroy source peer. + +use std::{cmp, mem}; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ - metapb::Region, - raft_serverpb::{PeerState, RaftMessage}, + metapb::{self, Region}, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{util, WriteTask}; +use raftstore::store::{util, Transport, WriteTask}; use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; @@ -26,7 +45,7 @@ use crate::{ batch::StoreContext, fsm::{PeerFsm, Store}, raft::{Peer, Storage}, - router::PeerMsg, + router::{CmdResChannel, PeerMsg, PeerTick}, }; /// When a peer is about to destroy, it becomes `WaitReady` first. If there is @@ -87,6 +106,11 @@ impl DestroyProgress { } } +#[derive(Default)] +pub struct GcPeerContext { + confirmed_ids: Vec, +} + impl Store { /// The method is called during split. /// The creation process is: @@ -100,6 +124,7 @@ impl Store { ) where EK: KvEngine, ER: RaftEngine, + T: Transport, { let region_id = msg.region.id; let mut raft_msg = Box::::default(); @@ -137,10 +162,11 @@ impl Store { ) where EK: KvEngine, ER: RaftEngine, + T: Transport, { let region_id = msg.get_region_id(); // The message can be sent when the peer is being created, so try send it first. - let msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = + let mut msg = if let Err(TrySendError::Disconnected(PeerMsg::RaftMessage(m))) = ctx.router.send(region_id, PeerMsg::RaftMessage(msg)) { m @@ -166,13 +192,12 @@ impl Store { ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); return; } - // TODO: maybe we need to ack the message to confirm the peer is destroyed. - if msg.get_is_tombstone() || msg.has_merge_target() { + if msg.has_merge_target() { // Target tombstone peer doesn't exist, so ignore it. 
ctx.raft_metrics.message_dropped.stale_msg.inc(); return; } - let from_epoch = msg.get_region_epoch(); + let mut destroyed = false; let local_state = match ctx.engine.get_region_state(region_id, u64::MAX) { Ok(s) => s, Err(e) => { @@ -192,30 +217,51 @@ impl Store { // skip handling gc for simplicity. let local_epoch = local_state.get_region().get_region_epoch(); // The region in this peer is already destroyed - if util::is_epoch_stale(from_epoch, local_epoch) { - ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + if util::is_epoch_stale(msg.get_region_epoch(), local_epoch) { + destroyed = true; + } + if !destroyed && let Some(local_peer) = find_peer(local_state.get_region(), self.store_id()) && to_peer.id <= local_peer.get_id() { + destroyed = true; + } + } + if destroyed { + if msg.get_is_tombstone() { + if let Some(msg) = build_peer_destroyed_report(&mut msg) { + let _ = ctx.trans.send(msg); + } return; } - if let Some(local_peer) = find_peer(local_state.get_region(), self.store_id()) { - if to_peer.id <= local_peer.get_id() { - ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + if msg.has_extra_msg() { + let extra_msg = msg.get_extra_msg(); + if extra_msg.get_type() == ExtraMessageType::MsgGcPeerRequest + && extra_msg.has_check_gc_peer() + { + forward_destroy_source_peer(ctx, &msg); return; } } + ctx.raft_metrics.message_dropped.region_tombstone_peer.inc(); + return; } + // If it's not destroyed, and the message is a tombstone message, create the + // peer and destroy immediately to leave a tombstone record. // So the peer must need to be created. We don't need to synchronous with split // as split won't create peer in v2. And we don't check for range // conflict as v2 depends on tablet, which allows conflict ranges. 
let mut region = Region::default(); region.set_id(region_id); - region.set_region_epoch(from_epoch.clone()); + region.set_region_epoch(msg.get_region_epoch().clone()); // Peer list doesn't have to be complete, as it's uninitialized. // // If the id of the from_peer is INVALID_ID, this msg must be sent from parent // peer in the split execution in which case we do not add it into the region. - if from_peer.id != raft::INVALID_ID { + if from_peer.id != raft::INVALID_ID + // Check merge may be sent from different region + && (msg.get_extra_msg().get_type() != ExtraMessageType::MsgGcPeerRequest + || msg.get_extra_msg().get_check_gc_peer().get_from_region_id() == region_id) + { region.mut_peers().push(from_peer.clone()); } region.mut_peers().push(to_peer.clone()); @@ -260,7 +306,225 @@ impl Store { } } +/// Tell leader that `to_peer` from `tombstone_msg` is destroyed. +fn build_peer_destroyed_report(tombstone_msg: &mut RaftMessage) -> Option { + let to_region_id = if tombstone_msg.has_extra_msg() { + assert_eq!( + tombstone_msg.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerRequest + ); + tombstone_msg + .get_extra_msg() + .get_check_gc_peer() + .get_from_region_id() + } else { + tombstone_msg.get_region_id() + }; + if to_region_id == 0 || tombstone_msg.get_from_peer().get_id() == 0 { + return None; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(to_region_id); + msg.set_from_peer(tombstone_msg.take_to_peer()); + msg.set_to_peer(tombstone_msg.take_from_peer()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerResponse); + Some(msg) +} + +/// Forward the destroy request from target peer to merged source peer. +fn forward_destroy_source_peer(ctx: &mut StoreContext, msg: &RaftMessage) +where + EK: KvEngine, + ER: RaftEngine, + T: Transport, +{ + let extra_msg = msg.get_extra_msg(); + // Instead of respond leader directly, send a message to target region to + // double check it's really destroyed. 
+ let check_gc_peer = extra_msg.get_check_gc_peer(); + let mut tombstone_msg = Box::::default(); + tombstone_msg.set_region_id(check_gc_peer.get_check_region_id()); + tombstone_msg.set_from_peer(msg.get_from_peer().clone()); + tombstone_msg.set_to_peer(check_gc_peer.get_check_peer().clone()); + tombstone_msg.set_region_epoch(check_gc_peer.get_check_region_epoch().clone()); + tombstone_msg.set_is_tombstone(true); + // No need to set epoch as we don't know what it is. + tombstone_msg + .mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerRequest); + tombstone_msg + .mut_extra_msg() + .mut_check_gc_peer() + .set_from_region_id(check_gc_peer.get_from_region_id()); + let _ = ctx.router.send_raft_message(tombstone_msg); +} + impl Peer { + pub fn maybe_schedule_gc_peer_tick(&mut self) { + let region_state = self.storage().region_state(); + if !region_state.get_removed_records().is_empty() + || !region_state.get_merged_records().is_empty() + { + self.add_pending_tick(PeerTick::GcPeer); + } + } + + /// Returns `true` means the sender will be gced. The message is stale. + pub fn maybe_gc_sender(&mut self, msg: &RaftMessage) -> bool { + let removed_peers = self.storage().region_state().get_removed_records(); + // Only removed_records can be determined directly. 
+ if let Some(peer) = removed_peers + .iter() + .find(|p| p.id == msg.get_from_peer().get_id()) + { + let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); + self.add_message(tombstone_msg); + self.set_has_ready(); + true + } else { + false + } + } + + fn tombstone_message_for_same_region(&self, peer: metapb::Peer) -> RaftMessage { + let region_id = self.region_id(); + let mut tombstone_message = RaftMessage::default(); + tombstone_message.set_region_id(region_id); + tombstone_message.set_from_peer(self.peer().clone()); + tombstone_message.set_to_peer(peer); + tombstone_message.set_region_epoch(self.region().get_region_epoch().clone()); + tombstone_message.set_is_tombstone(true); + tombstone_message + } + + pub fn on_tombstone_message(&mut self, msg: &mut RaftMessage) { + match msg.get_to_peer().get_id().cmp(&self.peer_id()) { + cmp::Ordering::Less => { + if let Some(msg) = build_peer_destroyed_report(msg) { + self.add_message(msg); + self.set_has_ready(); + } + } + // No matter it's greater or equal, the current peer must be destroyed. + _ => { + self.mark_for_destroy(None); + } + } + } + + /// When leader tries to gc merged source peer, it will send a gc request to + /// target peer. If target peer makes sure the merged is finished, it + /// forward the message to source peer and let source peer send back a + /// response. + pub fn on_gc_peer_request( + &mut self, + ctx: &mut StoreContext, + msg: &RaftMessage, + ) { + let extra_msg = msg.get_extra_msg(); + if !extra_msg.has_check_gc_peer() || extra_msg.get_index() == 0 { + // Corrupted message. + return; + } + if self.storage().tablet_index() < extra_msg.get_index() { + // Merge not finish. + return; + } + + forward_destroy_source_peer(ctx, msg); + } + + /// A peer confirms it's destroyed. 
+ pub fn on_gc_peer_response(&mut self, msg: &RaftMessage) { + let gc_peer_id = msg.get_from_peer().get_id(); + let state = self.storage().region_state(); + if state + .get_removed_records() + .iter() + .all(|p| p.get_id() != gc_peer_id) + && state.get_merged_records().iter().all(|p| { + p.get_source_peers() + .iter() + .all(|p| p.get_id() != gc_peer_id) + }) + { + return; + } + let ctx = self.gc_peer_context_mut(); + if ctx.confirmed_ids.contains(&gc_peer_id) { + return; + } + ctx.confirmed_ids.push(gc_peer_id); + } + + pub fn on_gc_peer_tick(&mut self, ctx: &mut StoreContext) { + if !self.is_leader() { + return; + } + let state = self.storage().region_state(); + if state.get_removed_records().is_empty() && state.get_merged_records().is_empty() { + return; + } + let mut need_gc_ids = Vec::with_capacity(5); + let gc_context = self.gc_peer_context(); + for peer in state.get_removed_records() { + need_gc_ids.push(peer.get_id()); + if gc_context.confirmed_ids.contains(&peer.get_id()) { + continue; + } + + let msg = self.tombstone_message_for_same_region(peer.clone()); + // For leader, it's OK to send gc message immediately. + let _ = ctx.trans.send(msg); + } + for record in state.get_merged_records() { + // For merge, we ask target to check whether source should be deleted. 
+ for (source, target) in record + .get_source_peers() + .iter() + .zip(record.get_target_peers()) + { + need_gc_ids.push(source.get_id()); + if gc_context.confirmed_ids.contains(&source.get_id()) { + continue; + } + + let mut msg = RaftMessage::default(); + msg.set_region_id(record.get_target_region_id()); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(target.clone()); + msg.set_region_epoch(record.get_target_epoch().clone()); + let extra_msg = msg.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgGcPeerRequest); + extra_msg.set_index(record.get_index()); + let check_peer = extra_msg.mut_check_gc_peer(); + check_peer.set_from_region_id(self.region_id()); + check_peer.set_check_region_id(record.get_source_region_id()); + check_peer.set_check_peer(source.clone()); + check_peer.set_check_region_epoch(record.get_source_epoch().clone()); + let _ = ctx.trans.send(msg); + } + } + let gc_ctx = self.gc_peer_context_mut(); + if !gc_ctx.confirmed_ids.is_empty() { + let mut confirmed_ids = mem::take(&mut gc_ctx.confirmed_ids); + confirmed_ids.retain(|id| need_gc_ids.contains(id)); + let mut req = RaftCmdRequest::default(); + let header = req.mut_header(); + header.set_region_id(self.region_id()); + header.set_peer(self.peer().clone()); + let admin = req.mut_admin_request(); + admin.set_cmd_type(AdminCmdType::UpdateGcPeer); + let gc_peer = admin.mut_update_gc_peers(); + gc_peer.set_peer_id(confirmed_ids); + let (ch, _) = CmdResChannel::pair(); + // It's OK to fail as we will retry by tick. + self.on_admin_command(ctx, req, ch); + } + self.maybe_schedule_gc_peer_tick(); + } + /// A peer can be destroyed in three cases: /// 1. Received a gc message; /// 2. 
Received a message whose target peer's ID is larger than this; diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 76baf31f9c8..492595851e2 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -12,7 +12,7 @@ pub use command::{ RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; -pub use life::DestroyProgress; +pub use life::{DestroyProgress, GcPeerContext}; pub use ready::{ cf_offset, write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, StateStorage, diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 03dce74d4e7..c77766f6ce5 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -25,7 +25,10 @@ use std::{cmp, time::Instant}; use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; -use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; +use kvproto::{ + raft_cmdpb::AdminCmdType, + raft_serverpb::{ExtraMessageType, RaftMessage}, +}; use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ @@ -168,7 +171,7 @@ impl Peer { } } - pub fn on_raft_message( + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, mut msg: Box, @@ -187,16 +190,34 @@ impl Peer { if !self.serving() { return; } + if util::is_vote_msg(msg.get_message()) && self.maybe_gc_sender(&msg) { + return; + } if msg.get_to_peer().get_store_id() != self.peer().get_store_id() { ctx.raft_metrics.message_dropped.mismatch_store_id.inc(); return; } - if !msg.has_region_epoch() { - ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); + if msg.get_is_tombstone() { + self.on_tombstone_message(&mut msg); return; } - if msg.get_is_tombstone() { - 
self.mark_for_destroy(None); + if msg.has_extra_msg() && msg.get_to_peer().get_id() == self.peer_id() { + // GcRequest/GcResponse may be sent from/to different regions, skip further + // checks. + match msg.get_extra_msg().get_type() { + ExtraMessageType::MsgGcPeerResponse => { + self.on_gc_peer_response(&msg); + return; + } + ExtraMessageType::MsgGcPeerRequest => { + self.on_gc_peer_request(ctx, &msg); + return; + } + _ => (), + } + } + if !msg.has_region_epoch() { + ctx.raft_metrics.message_dropped.mismatch_region_epoch.inc(); return; } if msg.has_merge_target() { @@ -221,7 +242,6 @@ impl Peer { } if msg.has_extra_msg() { unimplemented!(); - // return; } // TODO: drop all msg append when the peer is uninitialized and has conflict @@ -465,6 +485,7 @@ impl Peer { ctx.has_ready = true; if !has_extra_write + && !self.has_pending_messages() && !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { @@ -508,6 +529,11 @@ impl Peer { self.send_raft_message_on_leader(ctx, msg); } } + if self.has_pending_messages() { + for msg in self.take_pending_messages() { + self.send_raft_message_on_leader(ctx, msg); + } + } } self.apply_reads(ctx, &ready); @@ -533,6 +559,15 @@ impl Peer { .flat_map(|m| self.build_raft_message(m)) .collect(); } + if self.has_pending_messages() { + if write_task.messages.is_empty() { + write_task.messages = self.take_pending_messages(); + } else { + write_task + .messages + .append(&mut self.take_pending_messages()); + } + } if !self.serving() { self.start_destroy(ctx, &mut write_task); if self.persisted_index() != 0 { @@ -750,6 +785,7 @@ impl Peer { self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); self.add_pending_tick(PeerTick::CheckLongUncommitted); + self.maybe_schedule_gc_peer_tick(); } StateRole::Follower => { self.leader_lease_mut().expire(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 
1fae813577c..adf20bfce37 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -546,6 +546,8 @@ impl Storage { let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; let region = snap_data.take_region(); + let removed_records = snap_data.take_removed_records(); + let merged_records = snap_data.take_merged_records(); if region.get_id() != region_id { return Err(box_err!( "mismatch region id {}!={}", @@ -586,6 +588,8 @@ impl Storage { let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); region_state.set_region(region); + region_state.set_removed_records(removed_records); + region_state.set_merged_records(merged_records); region_state.set_tablet_index(last_index); let entry_storage = self.entry_storage_mut(); entry_storage.raft_state_mut().set_last_index(last_index); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 6cfcda4da25..814dc72e622 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -10,7 +10,10 @@ use collections::{HashMap, HashSet}; use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; -use kvproto::{metapb, pdpb, raft_serverpb::RegionLocalState}; +use kvproto::{ + metapb, pdpb, + raft_serverpb::{RaftMessage, RegionLocalState}, +}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ @@ -28,8 +31,8 @@ use super::storage::Storage; use crate::{ fsm::ApplyScheduler, operation::{ - AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, - SplitFlowControl, TxnContext, + AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, ProposalControl, + SimpleWriteReqEncoder, SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -103,6 +106,12 @@ pub struct Peer { 
leader_transferee: u64, long_uncommitted_threshold: u64, + + /// Pending messages to be sent on handle ready. We should avoid sending + /// messages immediately otherwise it may break the persistence assumption. + pending_messages: Vec, + + gc_peer_context: GcPeerContext, } impl Peer { @@ -182,6 +191,8 @@ impl Peer { cfg.long_uncommitted_base_threshold.0.as_secs(), 1, ), + pending_messages: vec![], + gc_peer_context: GcPeerContext::default(), }; // If this region has only one peer and I am the one, campaign directly. @@ -624,6 +635,7 @@ impl Peer { #[inline] pub fn add_pending_tick(&mut self, tick: PeerTick) { + // Msg per batch is 4096/256 by default, the buffer won't grow too large. self.pending_ticks.push(tick); } @@ -755,4 +767,29 @@ impl Peer { pub fn set_long_uncommitted_threshold(&mut self, dur: Duration) { self.long_uncommitted_threshold = cmp::max(dur.as_secs(), 1); } + + #[inline] + pub fn add_message(&mut self, msg: RaftMessage) { + self.pending_messages.push(msg); + } + + #[inline] + pub fn has_pending_messages(&mut self) -> bool { + !self.pending_messages.is_empty() + } + + #[inline] + pub fn take_pending_messages(&mut self) -> Vec { + mem::take(&mut self.pending_messages) + } + + #[inline] + pub fn gc_peer_context(&self) -> &GcPeerContext { + &self.gc_peer_context + } + + #[inline] + pub fn gc_peer_context_mut(&mut self) -> &mut GcPeerContext { + &mut self.gc_peer_context + } } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 8814a97cc5f..a14c9ba9866 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -33,6 +33,7 @@ pub enum PeerTick { ReactivateMemoryLock = 8, ReportBuckets = 9, CheckLongUncommitted = 10, + GcPeer = 11, } impl PeerTick { @@ -52,6 +53,7 @@ impl PeerTick { PeerTick::ReactivateMemoryLock => "reactivate_memory_lock", PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => 
"check_long_uncommitted", + PeerTick::GcPeer => "gc_peer", } } @@ -68,6 +70,7 @@ impl PeerTick { PeerTick::ReactivateMemoryLock, PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, + PeerTick::GcPeer, ]; TICKS } diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 2076272b44b..ac3f30c7107 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -525,6 +525,10 @@ impl Cluster { &self.nodes[offset] } + pub fn receiver(&self, offset: usize) -> &Receiver { + &self.receivers[offset] + } + /// Send messages and wait for side effects are all handled. #[allow(clippy::vec_box)] pub fn dispatch(&self, region_id: u64, mut msgs: Vec>) { diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 4b3445a00ad..7fa75a5a281 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -38,7 +38,7 @@ fn test_simple_change() { let match_index = meta.raft_apply.applied_index; assert_eq!(meta.region_state.epoch.version, epoch.get_version()); assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); - assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); + assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer.clone()]); // So heartbeat will create a learner. 
cluster.dispatch(2, vec![]); @@ -96,6 +96,42 @@ fn test_simple_change() { assert_eq!(meta.region_state.epoch.version, epoch.get_version()); assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); assert_eq!(meta.region_state.peers, vec![leader_peer]); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!( + region_state.get_removed_records().contains(&new_peer), + "{:?}", + region_state + ); + + // If adding a peer on the same store, removed_records should be cleaned. + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + req.mut_admin_request() + .mut_change_peer() + .mut_peer() + .set_id(11); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!( + region_state.get_removed_records().is_empty(), + "{:?}", + region_state + ); + // TODO: check if the peer is removed once life trace is implemented or // snapshot is implemented. 
diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index a2ae0bbb9f8..2a5dfafc509 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -7,14 +7,19 @@ use std::{ }; use crossbeam::channel::TrySendError; -use engine_traits::{RaftEngine, RaftEngineReadOnly}; +use engine_traits::{RaftEngine, RaftEngineReadOnly, CF_DEFAULT}; use futures::executor::block_on; use kvproto::{ metapb, - raft_serverpb::{PeerState, RaftMessage}, + raft_cmdpb::AdminCmdType, + raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore_v2::router::{DebugInfoChannel, PeerMsg}; -use tikv_util::store::new_peer; +use raft::prelude::{ConfChangeType, MessageType}; +use raftstore_v2::{ + router::{DebugInfoChannel, PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; +use tikv_util::store::{new_learner_peer, new_peer}; use crate::cluster::{Cluster, TestRouter}; @@ -62,6 +67,23 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb ); } +#[track_caller] +fn assert_valid_report(report: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!( + report.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerResponse + ); + assert_eq!(report.get_region_id(), region_id); + assert_eq!(report.get_from_peer().get_id(), peer_id); +} + +#[track_caller] +fn assert_tombstone_msg(msg: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!(msg.get_region_id(), region_id); + assert_eq!(msg.get_to_peer().get_id(), peer_id); + assert!(msg.get_is_tombstone()); +} + /// Test a peer can be created by general raft message and destroyed tombstone /// message. #[test] @@ -99,9 +121,6 @@ fn test_life_by_message() { msg.take_region_epoch(); }); - // Check tombstone. - assert_wrong(&|msg| msg.set_is_tombstone(true)); - // Correct message will create a peer, but the peer will not be initialized. 
router.send_raft_message(msg.clone()).unwrap(); let timeout = Duration::from_secs(3); @@ -156,11 +175,20 @@ fn test_destroy_by_larger_id() { msg.mut_region_epoch().set_conf_ver(1); msg.set_from_peer(new_peer(2, 8)); let raft_message = msg.mut_message(); - raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_msg_type(MessageType::MsgHeartbeat); raft_message.set_from(6); raft_message.set_term(init_term); // Create the peer. router.send_raft_message(msg.clone()).unwrap(); + // There must be heartbeat response. + let hb = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_eq!( + hb.get_message().get_msg_type(), + MessageType::MsgHeartbeatResponse + ); let timeout = Duration::from_secs(3); let meta = router @@ -178,6 +206,20 @@ fn test_destroy_by_larger_id() { .unwrap(); assert_eq!(meta.raft_status.id, test_peer_id); assert_eq!(meta.raft_status.hard_state.term, init_term); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + + // Smaller ID tombstone message should trigger report. + let mut smaller_id_tombstone_msg = smaller_id_msg.clone(); + smaller_id_tombstone_msg.set_is_tombstone(true); + router.send_raft_message(smaller_id_tombstone_msg).unwrap(); + let report = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, test_region_id, test_peer_id - 1); // Larger ID should trigger destroy. 
let mut larger_id_msg = smaller_id_msg; @@ -199,3 +241,159 @@ fn test_destroy_by_larger_id() { assert_eq!(meta.raft_status.id, test_peer_id + 1); assert_eq!(meta.raft_status.hard_state.term, init_term + 1); } + +#[test] +fn test_gc_peer_request() { + let cluster = Cluster::default(); + let router = &cluster.routers[0]; + let test_region_id = 4; + let test_peer_id = 5; + let test_leader_id = 6; + + let mut msg = Box::::default(); + msg.set_region_id(test_region_id); + msg.set_to_peer(new_peer(1, test_peer_id)); + msg.mut_region_epoch().set_conf_ver(1); + msg.set_from_peer(new_peer(2, test_leader_id)); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_from(6); + raft_message.set_term(5); + + // Tombstone message should create the peer and then destroy it. + let mut tombstone_msg = msg.clone(); + tombstone_msg.set_is_tombstone(true); + router.send_raft_message(tombstone_msg.clone()).unwrap(); + cluster.routers[0].wait_flush(test_region_id, Duration::from_millis(300)); + assert_peer_not_exist(test_region_id, test_peer_id, router); + // Resend a normal message will not create the peer. + router.send_raft_message(msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + // Resend tombstone message should trigger report. 
+ router.send_raft_message(tombstone_msg).unwrap(); + assert_peer_not_exist(test_region_id, test_peer_id, router); + let report = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, test_region_id, test_peer_id); +} + +#[test] +fn test_gc_peer_response() { + let cluster = Cluster::with_node_count(2, None); + let region_id = 2; + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer.clone()); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_removed_records().is_empty()); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let resp = cluster.routers[0] + .admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + // Drain all existing messages. 
+ while cluster.receiver(0).try_recv().is_ok() {} + + let mut msg = Box::::default(); + msg.set_region_id(region_id); + msg.set_to_peer(req.get_header().get_peer().clone()); + msg.set_from_peer(new_peer); + let receiver = &cluster.receiver(0); + for ty in &[MessageType::MsgRequestVote, MessageType::MsgRequestPreVote] { + msg.mut_message().set_msg_type(*ty); + cluster.routers[0].send_raft_message(msg.clone()).unwrap(); + let tombstone_msg = match receiver.recv_timeout(Duration::from_millis(300)) { + Ok(msg) => msg, + Err(e) => panic!("failed to receive tombstone message {:?}: {:?}", ty, e), + }; + assert_tombstone_msg(&tombstone_msg, region_id, 10); + } + // Non-vote message should not trigger tombstone. + msg.mut_message().set_msg_type(MessageType::MsgHeartbeat); + cluster.routers[0].send_raft_message(msg).unwrap(); + cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + + // GcTick should also trigger tombstone. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::GcPeer)) + .unwrap(); + let tombstone_msg = cluster + .receiver(0) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_tombstone_msg(&tombstone_msg, region_id, 10); + + // First message to create the peer and destroy. + cluster.routers[1] + .send_raft_message(Box::new(tombstone_msg.clone())) + .unwrap(); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + cluster + .receiver(1) + .recv_timeout(Duration::from_millis(300)) + .unwrap_err(); + // Send message should trigger tombstone report. 
+ cluster.routers[1] + .send_raft_message(Box::new(tombstone_msg)) + .unwrap(); + let report = cluster + .receiver(1) + .recv_timeout(Duration::from_millis(300)) + .unwrap(); + assert_valid_report(&report, region_id, 10); + cluster.routers[0] + .send_raft_message(Box::new(report)) + .unwrap(); + let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_removed_records().len(), 1); + // Tick should flush records gc. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::GcPeer)) + .unwrap(); + // Trigger a write to make sure records gc is finished. + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_removed_records().is_empty()); +} diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 45492feb294..5b53ad499b5 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -218,6 +218,8 @@ where snap_data.set_region(region_state.get_region().clone()); snap_data.set_version(TABLET_SNAPSHOT_VERSION); snap_data.mut_meta().set_for_balance(for_balance); + snap_data.set_removed_records(region_state.get_removed_records().into()); + snap_data.set_merged_records(region_state.get_merged_records().into()); snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); // create checkpointer. 
diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index bb262b9ffa8..1853d200140 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1695,9 +1695,9 @@ where AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } + AdminCmdType::UpdateGcPeer => Err(box_err!("v2 only command and it's safe to skip")), AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), - AdminCmdType::UpdateGcPeer => unimplemented!(), }?; response.set_cmd_type(cmd_type); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 75da7d497e4..d5b73e5f721 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2790,8 +2790,8 @@ where ExtraMessageType::MsgVoterReplicatedIndexResponse => { self.on_voter_replicated_index_response(msg.get_extra_msg()); } - ExtraMessageType::MsgGcPeerRequest => unimplemented!(), - ExtraMessageType::MsgGcPeerResponse => unimplemented!(), + // It's v2 only message and ignore does no harm. + ExtraMessageType::MsgGcPeerRequest | ExtraMessageType::MsgGcPeerResponse => (), } } From 44a586f9083a83fa7f083acddc1dfd336ba7d264 Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 3 Feb 2023 19:01:56 +0800 Subject: [PATCH 485/676] raftstore: support priority scheduling for async write (#14103) ref tikv/tikv#13730 Support priority-based scheduling for the async write. Each channel of async write worker is replaced with a priority-based channel when the config `resource-control.enabled` is true. 
Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 + components/batch-system/src/batch.rs | 25 +- components/batch-system/src/channel.rs | 252 ------------------ components/batch-system/src/fsm.rs | 8 +- components/batch-system/src/lib.rs | 4 +- components/batch-system/src/scheduler.rs | 105 ++++++++ components/batch-system/src/test_runner.rs | 3 +- components/raftstore-v2/src/batch/store.rs | 2 +- components/raftstore-v2/src/router/message.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 49 +++- .../src/store/async_io/write_router.rs | 106 +++++--- .../src/store/async_io/write_tests.rs | 160 +++++++++-- components/raftstore/src/store/fsm/apply.rs | 5 +- components/raftstore/src/store/fsm/store.rs | 21 +- components/raftstore/src/store/msg.rs | 2 +- .../src/store/worker/refresh_config.rs | 2 +- components/resource_control/Cargo.toml | 2 + components/resource_control/src/channel.rs | 183 +++++++++++++ components/resource_control/src/lib.rs | 3 + 19 files changed, 580 insertions(+), 356 deletions(-) delete mode 100644 components/batch-system/src/channel.rs create mode 100644 components/batch-system/src/scheduler.rs create mode 100644 components/resource_control/src/channel.rs diff --git a/Cargo.lock b/Cargo.lock index 633194d9323..21145778082 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4689,6 +4689,8 @@ name = "resource_control" version = "0.0.1" dependencies = [ "byteorder", + "collections", + "crossbeam", "crossbeam-skiplist", "dashmap", "fail", diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 48ef809d421..19005ef2c43 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -17,18 +17,21 @@ use std::{ use fail::fail_point; use file_system::{set_io_type, IoType}; -use resource_control::ResourceController; +use resource_control::{ + channel::{unbounded, Receiver, Sender}, + ResourceController, +}; use tikv_util::{ debug, error, info, mpsc, safe_panic, 
sys::thread::StdThreadBuildWrapper, thd_name, time::Instant, }; use crate::{ - channel::{fsm_channel, ControlScheduler, FsmReceiver, FsmSender, NormalScheduler}, config::Config, fsm::{Fsm, FsmScheduler, Priority}, mailbox::BasicMailbox, router::Router, + scheduler::{ControlScheduler, NormalScheduler}, }; /// A unify type for FSMs so that they can be sent to channel easily. @@ -288,7 +291,7 @@ pub trait PollHandler: Send + 'static { /// Internal poller that fetches batch and call handler hooks for readiness. pub struct Poller { pub router: Router, ControlScheduler>, - pub fsm_receiver: FsmReceiver, + pub fsm_receiver: Receiver>, pub handler: Handler, pub max_batch_size: usize, pub reschedule_duration: Duration, @@ -481,8 +484,8 @@ pub trait HandlerBuilder { pub struct BatchSystem { name_prefix: Option, router: BatchRouter, - receiver: FsmReceiver, - low_receiver: FsmReceiver, + receiver: Receiver>, + low_receiver: Receiver>, pool_size: usize, max_batch_size: usize, workers: Arc>>>, @@ -599,8 +602,8 @@ where struct PoolStateBuilder { max_batch_size: usize, reschedule_duration: Duration, - fsm_receiver: FsmReceiver, - fsm_sender: FsmSender, + fsm_receiver: Receiver>, + fsm_sender: Sender>, pool_size: usize, } @@ -633,8 +636,8 @@ impl PoolStateBuilder { pub struct PoolState> { pub name_prefix: String, pub handler_builder: H, - pub fsm_receiver: FsmReceiver, - pub fsm_sender: FsmSender, + pub fsm_receiver: Receiver>, + pub fsm_sender: Sender>, pub low_priority_pool_size: usize, pub expected_pool_size: usize, pub workers: Arc>>>, @@ -658,8 +661,8 @@ pub fn create_system( ) -> (BatchRouter, BatchSystem) { let state_cnt = Arc::new(AtomicUsize::new(0)); let control_box = BasicMailbox::new(sender, controller, state_cnt.clone()); - let (sender, receiver) = fsm_channel(resource_ctl); - let (low_sender, low_receiver) = fsm_channel(None); // no resource control for low fsm + let (sender, receiver) = unbounded(resource_ctl); + let (low_sender, low_receiver) = unbounded(None); // 
no resource control for low fsm let normal_scheduler = NormalScheduler { sender: sender.clone(), low_sender, diff --git a/components/batch-system/src/channel.rs b/components/batch-system/src/channel.rs deleted file mode 100644 index 094b6a7a2ae..00000000000 --- a/components/batch-system/src/channel.rs +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{cell::RefCell, sync::Arc}; - -use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; -use kvproto::kvrpcpb::CommandPri; -use resource_control::{ResourceConsumeType, ResourceController}; -use tikv_util::{mpsc::priority_queue, warn}; - -use crate::{ - fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, - FsmTypes, -}; - -pub fn fsm_channel( - resource_ctl: Option>, -) -> (FsmSender, FsmReceiver) { - if let Some(ctl) = resource_ctl { - let (tx, rx) = priority_queue::unbounded(); - ( - FsmSender::Priority { - resource_ctl: ctl, - sender: tx, - last_msg_group: RefCell::new(String::new()), - }, - FsmReceiver::Priority(rx), - ) - } else { - let (tx, rx) = channel::unbounded(); - (FsmSender::Vanilla(tx), FsmReceiver::Vanilla(rx)) - } -} - -pub struct NormalScheduler { - pub(crate) sender: FsmSender, - pub(crate) low_sender: FsmSender, -} - -impl Clone for NormalScheduler -where - N: Fsm, - C: Fsm, -{ - fn clone(&self) -> Self { - NormalScheduler { - sender: self.sender.clone(), - low_sender: self.low_sender.clone(), - } - } -} - -impl FsmScheduler for NormalScheduler -where - N: Fsm, - C: Fsm, -{ - type Fsm = N; - - fn consume_msg_resource(&self, msg: &::Message) { - self.sender.consume_msg_resource(msg); - } - - #[inline] - fn schedule(&self, fsm: Box) { - let sender = match fsm.get_priority() { - Priority::Normal => &self.sender, - Priority::Low => &self.low_sender, - }; - - match sender.send(FsmTypes::Normal(fsm)) { - Ok(()) => {} - Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => 
unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. - for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - let _ = self.low_sender.send(FsmTypes::Empty); - } - } -} - -pub struct ControlScheduler { - pub(crate) sender: FsmSender, -} - -impl Clone for ControlScheduler -where - N: Fsm, - C: Fsm, -{ - fn clone(&self) -> Self { - ControlScheduler { - sender: self.sender.clone(), - } - } -} - -impl FsmScheduler for ControlScheduler -where - N: Fsm, - C: Fsm, -{ - type Fsm = C; - - fn consume_msg_resource(&self, _msg: &::Message) {} - - #[inline] - fn schedule(&self, fsm: Box) { - match self.sender.send(FsmTypes::Control(fsm)) { - Ok(()) => {} - Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. - for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - } - } -} - -pub enum FsmSender { - Vanilla(channel::Sender>), - Priority { - resource_ctl: Arc, - sender: priority_queue::Sender>, - last_msg_group: RefCell, - }, -} - -impl Clone for FsmSender -where - N: Fsm, - C: Fsm, -{ - fn clone(&self) -> Self { - match self { - FsmSender::Vanilla(sender) => FsmSender::Vanilla(sender.clone()), - FsmSender::Priority { - resource_ctl, - sender, - .. 
- } => FsmSender::Priority { - resource_ctl: resource_ctl.clone(), - sender: sender.clone(), - last_msg_group: RefCell::new(String::new()), - }, - } - } -} - -impl FsmSender { - pub fn send(&self, fsm: FsmTypes) -> Result<(), SendError>> { - match self { - FsmSender::Vanilla(sender) => sender.send(fsm), - FsmSender::Priority { - resource_ctl, - sender, - last_msg_group, - } => { - // TODO: pass different priority - let pri = resource_ctl - .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); - sender.send(fsm, pri) - } - } - } - - pub fn try_send(&self, fsm: FsmTypes) -> Result<(), TrySendError>> { - match self { - FsmSender::Vanilla(sender) => sender.try_send(fsm), - FsmSender::Priority { - resource_ctl, - sender, - last_msg_group, - } => { - let priority = resource_ctl - .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); - sender.try_send(fsm, priority) - } - } - } - - fn consume_msg_resource(&self, msg: &N::Message) { - match self { - FsmSender::Vanilla(_) => {} - FsmSender::Priority { - resource_ctl, - last_msg_group, - .. 
- } => { - if let Some(mut groups) = msg.get_resource_consumptions() { - let mut dominant_group = "".to_owned(); - let mut max_write_bytes = 0; - for (group_name, write_bytes) in groups.drain() { - resource_ctl.consume( - group_name.as_bytes(), - ResourceConsumeType::IoBytes(write_bytes), - ); - if write_bytes > max_write_bytes { - dominant_group = group_name; - max_write_bytes = write_bytes; - } - } - *last_msg_group.borrow_mut() = dominant_group; - } - } - } - } -} - -pub enum FsmReceiver { - Vanilla(channel::Receiver>), - Priority(priority_queue::Receiver>), -} - -impl Clone for FsmReceiver -where - N: Fsm, - C: Fsm, -{ - fn clone(&self) -> Self { - match self { - FsmReceiver::Vanilla(receiver) => FsmReceiver::Vanilla(receiver.clone()), - FsmReceiver::Priority(receiver) => FsmReceiver::Priority(receiver.clone()), - } - } -} - -impl FsmReceiver { - pub fn recv(&self) -> Result, RecvError> { - match self { - FsmReceiver::Vanilla(receiver) => receiver.recv(), - FsmReceiver::Priority(receiver) => receiver.recv(), - } - } - - pub fn try_recv(&self) -> Result, TryRecvError> { - match self { - FsmReceiver::Vanilla(receiver) => receiver.try_recv(), - FsmReceiver::Priority(receiver) => receiver.try_recv(), - } - } -} diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 5d9e009fa01..3fa5ad15a64 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -10,7 +10,7 @@ use std::{ usize, }; -use collections::HashMap; +use resource_control::ResourceMetered; use crate::mailbox::BasicMailbox; @@ -36,12 +36,6 @@ pub trait FsmScheduler { fn consume_msg_resource(&self, msg: &::Message); } -pub trait ResourceMetered { - fn get_resource_consumptions(&self) -> Option> { - None - } -} - /// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. 
pub trait Fsm: Send + 'static { diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index f4f799dcc9a..2e59d42808c 100644 --- a/components/batch-system/src/lib.rs +++ b/components/batch-system/src/lib.rs @@ -1,12 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod batch; -mod channel; mod config; mod fsm; mod mailbox; mod metrics; mod router; +mod scheduler; #[cfg(feature = "test-runner")] pub mod test_runner; @@ -17,7 +17,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, + fsm::{Fsm, FsmScheduler, Priority}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/batch-system/src/scheduler.rs b/components/batch-system/src/scheduler.rs new file mode 100644 index 00000000000..9eadb125f78 --- /dev/null +++ b/components/batch-system/src/scheduler.rs @@ -0,0 +1,105 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use crossbeam::channel::SendError; +use resource_control::channel::Sender; +use tikv_util::warn; + +use crate::{ + fsm::{Fsm, FsmScheduler, Priority}, + FsmTypes, +}; +pub struct NormalScheduler { + pub(crate) sender: Sender>, + pub(crate) low_sender: Sender>, +} + +impl Clone for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + NormalScheduler { + sender: self.sender.clone(), + low_sender: self.low_sender.clone(), + } + } +} + +impl FsmScheduler for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = N; + + fn consume_msg_resource(&self, msg: &::Message) { + self.sender.consume_msg_resource(msg); + } + + #[inline] + fn schedule(&self, fsm: Box) { + let sender = match fsm.get_priority() { + Priority::Normal => &self.sender, + Priority::Low => &self.low_sender, + }; + + match sender.send(FsmTypes::Normal(fsm), 0) { + Ok(_) => {} + Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty, 0); + let _ = self.low_sender.send(FsmTypes::Empty, 0); + } + } +} + +pub struct ControlScheduler { + pub(crate) sender: Sender>, +} + +impl Clone for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + ControlScheduler { + sender: self.sender.clone(), + } + } +} + +impl FsmScheduler for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = C; + + fn consume_msg_resource(&self, _msg: &::Message) {} + + #[inline] + fn schedule(&self, fsm: Box) { + match self.sender.send(FsmTypes::Control(fsm), 0) { + Ok(_) => {} + Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. 
+ // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty, 0); + } + } +} diff --git a/components/batch-system/src/test_runner.rs b/components/batch-system/src/test_runner.rs index a3ae80dc55a..9a84a5fe545 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -13,9 +13,10 @@ use std::{ use collections::HashMap; use derive_more::{Add, AddAssign}; +use resource_control::ResourceMetered; use tikv_util::mpsc; -use crate::{fsm::ResourceMetered, *}; +use crate::*; /// Message `Runner` can accepts. pub enum Message { diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 2a3cc63f797..83d6b2e1f2a 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -477,7 +477,7 @@ impl Workers { async_read: Worker::new("async-read-worker"), pd, tablet_gc: Worker::new("tablet-gc-worker"), - async_write: StoreWriters::default(), + async_write: StoreWriters::new(None), purge, background, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a14c9ba9866..91efc54c867 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -2,13 +2,13 @@ // #[PerformanceCriticalPath] -use batch_system::ResourceMetered; use kvproto::{ metapb, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, raft_serverpb::RaftMessage, }; use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; +use resource_control::ResourceMetered; use tikv_util::time::Instant; use super::{ diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 98c76ddd6d1..4d8392edd55 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -14,7 +14,7 @@ use 
std::{ }; use collections::HashMap; -use crossbeam::channel::{bounded, Receiver, Sender, TryRecvError}; +use crossbeam::channel::TryRecvError; use engine_traits::{ KvEngine, PerfContext, PerfContextKind, RaftEngine, RaftLogBatch, WriteBatch, WriteOptions, }; @@ -23,6 +23,10 @@ use fail::fail_point; use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; use protobuf::Message; use raft::eraftpb::Entry; +use resource_control::{ + channel::{bounded, Receiver, Sender}, + ResourceController, ResourceMetered, +}; use tikv_util::{ box_err, config::{ReadableSize, Tracker, VersionTrack}, @@ -41,6 +45,7 @@ use crate::{ local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, transport::Transport, + util, util::LatencyInspector, PeerMsg, }, @@ -268,6 +273,29 @@ where inspector: Vec, }, Shutdown, + #[cfg(test)] + Pause(std::sync::mpsc::Receiver<()>), +} + +impl ResourceMetered for WriteMsg +where + EK: KvEngine, + ER: RaftEngine, +{ + fn get_resource_consumptions(&self) -> Option> { + match self { + WriteMsg::WriteTask(t) => { + let mut map = HashMap::default(); + for entry in &t.entries { + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + *map.entry(group_name).or_default() += entry.compute_size() as u64; + } + Some(map) + } + _ => None, + } + } } impl fmt::Debug for WriteMsg @@ -284,6 +312,8 @@ where ), WriteMsg::Shutdown => write!(fmt, "WriteMsg::Shutdown"), WriteMsg::LatencyInspect { .. 
} => write!(fmt, "WriteMsg::LatencyInspect"), + #[cfg(test)] + WriteMsg::Pause(_) => write!(fmt, "WriteMsg::Pause"), } } } @@ -641,6 +671,10 @@ where } => { self.pending_latency_inspect.push((send_time, inspector)); } + #[cfg(test)] + WriteMsg::Pause(rx) => { + let _ = rx.recv(); + } } false } @@ -845,13 +879,15 @@ where EK: KvEngine, ER: RaftEngine, { + resource_ctl: Option>, writers: Vec>>, handlers: Vec>, } -impl Default for StoreWriters { - fn default() -> Self { +impl StoreWriters { + pub fn new(resource_ctl: Option>) -> Self { Self { + resource_ctl, writers: vec![], handlers: vec![], } @@ -879,7 +915,10 @@ where let pool_size = cfg.value().store_io_pool_size; for i in 0..pool_size { let tag = format!("store-writer-{}", i); - let (tx, rx) = bounded(cfg.value().store_io_notify_capacity); + let (tx, rx) = bounded( + self.resource_ctl.clone(), + cfg.value().store_io_notify_capacity, + ); let mut worker = Worker::new( store_id, tag.clone(), @@ -906,7 +945,7 @@ where assert_eq!(self.writers.len(), self.handlers.len()); for (i, handler) in self.handlers.drain(..).enumerate() { info!("stopping store writer {}", i); - self.writers[i].send(WriteMsg::Shutdown).unwrap(); + self.writers[i].send(WriteMsg::Shutdown, 0).unwrap(); handler.join().unwrap(); } } diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index 6c1db6419cf..ead22f70b28 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -13,8 +13,9 @@ use std::{ time::Duration, }; -use crossbeam::channel::{Sender, TrySendError}; +use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; +use resource_control::channel::Sender; use tikv_util::{info, time::Instant}; use crate::store::{ @@ -22,7 +23,7 @@ use crate::store::{ metrics::*, }; -const RETRY_SCHEDULE_MILLISECONS: u64 = 10; +const RETRY_SCHEDULE_MILLISECONDS: u64 = 10; pub trait 
WriteRouterContext where @@ -68,6 +69,9 @@ where last_unpersisted: Option, /// Pending write msgs since rescheduling. pending_write_msgs: Vec>, + /// The scheduling priority of the last msg, only valid when priority + /// scheduling is enabled + last_msg_priority: u64, } impl WriteRouter @@ -83,6 +87,7 @@ where next_writer_id: None, last_unpersisted: None, pending_write_msgs: vec![], + last_msg_priority: 0, } } @@ -217,17 +222,21 @@ where } else { // Rescheduling fails at this time. Retry 10ms later. // The task should be sent to the original write worker. - self.next_retry_time = now + Duration::from_millis(RETRY_SCHEDULE_MILLISECONS); + self.next_retry_time = now + Duration::from_millis(RETRY_SCHEDULE_MILLISECONDS); true } } - fn send>(&self, ctx: &mut C, msg: WriteMsg) { - match ctx.write_senders()[self.writer_id].try_send(msg) { - Ok(()) => (), + fn send>(&mut self, ctx: &mut C, msg: WriteMsg) { + let sender = &ctx.write_senders()[self.writer_id]; + sender.consume_msg_resource(&msg); + // pass the priority of last msg as low bound to make sure all messages of one + // peer are handled sequentially. + match sender.try_send(msg, self.last_msg_priority) { + Ok(priority) => self.last_msg_priority = priority, Err(TrySendError::Full(msg)) => { let now = Instant::now(); - if ctx.write_senders()[self.writer_id].send(msg).is_err() { + if sender.send(msg, self.last_msg_priority).is_err() { // Write threads are destroyed after store threads during shutdown. 
panic!("{} failed to send write msg, err: disconnected", self.tag); } @@ -275,35 +284,55 @@ impl Index for WriteSenders { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::thread; - use crossbeam::channel::{bounded, Receiver}; - use engine_test::kv::KvTestEngine; + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; + use resource_control::channel::{bounded, Receiver}; use tikv_util::config::ReadableDuration; use super::*; + pub struct TestContext { + pub senders: WriteSenders, + pub config: Config, + pub raft_metrics: RaftMetrics, + } + + impl WriteRouterContext for TestContext { + fn write_senders(&self) -> &WriteSenders { + &self.senders + } + + fn config(&self) -> &Config { + &self.config + } + + fn raft_metrics(&self) -> &RaftMetrics { + &self.raft_metrics + } + } + struct TestWriteRouter { - receivers: Vec>>, - senders: WriteSenders, - config: Config, - raft_metrics: RaftMetrics, + receivers: Vec>>, + ctx: TestContext, } impl TestWriteRouter { fn new(config: Config) -> Self { let (mut receivers, mut senders) = (vec![], vec![]); for _ in 0..config.store_io_pool_size { - let (tx, rx) = bounded(config.store_io_notify_capacity); + let (tx, rx) = bounded(None, config.store_io_notify_capacity); receivers.push(rx); senders.push(tx); } Self { receivers, - senders: WriteSenders::new(senders), - config, - raft_metrics: RaftMetrics::new(true), + ctx: TestContext { + senders: WriteSenders::new(senders), + config, + raft_metrics: RaftMetrics::new(true), + }, } } @@ -321,6 +350,7 @@ mod tests { fn must_same_reschedule_count(&self, count: usize) { let cnt = self + .ctx .senders .io_reschedule_concurrent_count .load(Ordering::Relaxed); @@ -330,20 +360,6 @@ mod tests { } } - impl WriteRouterContext for TestWriteRouter { - fn write_senders(&self) -> &WriteSenders { - &self.senders - } - - fn config(&self) -> &Config { - &self.config - } - - fn raft_metrics(&self) -> &RaftMetrics { - &self.raft_metrics - } - } - #[test] fn test_write_router_no_schedule() { let 
mut config = Config::new(); @@ -352,10 +368,10 @@ mod tests { config.store_io_pool_size = 4; let mut t = TestWriteRouter::new(config); let mut r = WriteRouter::new("1".to_string()); - r.send_write_msg(&mut t, None, WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, None, WriteMsg::Shutdown); let writer_id = r.writer_id; for _ in 1..10 { - r.send_write_msg(&mut t, Some(10), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(10), WriteMsg::Shutdown); thread::sleep(Duration::from_millis(10)); } assert_eq!(writer_id, r.writer_id); @@ -375,7 +391,7 @@ mod tests { let last_time = r.next_retry_time; thread::sleep(Duration::from_millis(10)); // `writer_id` will be chosen randomly due to `last_unpersisted` is None - r.send_write_msg(&mut t, None, WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, None, WriteMsg::Shutdown); assert!(r.next_retry_time > last_time); assert_eq!(r.next_writer_id, None); assert_eq!(r.last_unpersisted, None); @@ -390,7 +406,7 @@ mod tests { let writer_id = r.writer_id; let timer = Instant::now(); loop { - r.send_write_msg(&mut t, Some(10), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(10), WriteMsg::Shutdown); if let Some(id) = r.next_writer_id { assert!(writer_id != id); assert_eq!(r.last_unpersisted, Some(10)); @@ -408,7 +424,7 @@ mod tests { thread::sleep(Duration::from_millis(10)); } - r.send_write_msg(&mut t, Some(20), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(20), WriteMsg::Shutdown); assert!(r.next_writer_id.is_some()); // `last_unpersisted` should not change assert_eq!(r.last_unpersisted, Some(10)); @@ -417,7 +433,7 @@ mod tests { t.must_same_reschedule_count(1); // No effect due to 9 < `last_unpersisted`(10) - r.check_new_persisted(&mut t, 9); + r.check_new_persisted(&mut t.ctx, 9); assert!(r.next_writer_id.is_some()); assert_eq!(r.last_unpersisted, Some(10)); assert_eq!(r.pending_write_msgs.len(), 2); @@ -425,7 +441,7 @@ mod tests { t.must_same_reschedule_count(1); // Should reschedule and send msg - 
r.check_new_persisted(&mut t, 10); + r.check_new_persisted(&mut t.ctx, 10); assert_eq!(r.next_writer_id, None); assert_eq!(r.last_unpersisted, None); assert!(r.pending_write_msgs.is_empty()); @@ -433,7 +449,8 @@ mod tests { t.must_same_reschedule_count(0); thread::sleep(Duration::from_millis(10)); - t.senders + t.ctx + .senders .io_reschedule_concurrent_count .store(4, Ordering::Relaxed); // Should retry reschedule next time because the limitation of concurrent count. @@ -441,7 +458,7 @@ mod tests { // so using loop here. let timer = Instant::now(); loop { - r.send_write_msg(&mut t, Some(30), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(30), WriteMsg::Shutdown); t.must_same_msg_count(r.writer_id, 1); if r.next_writer_id.is_some() { assert_eq!(r.last_unpersisted, None); @@ -456,12 +473,13 @@ mod tests { thread::sleep(Duration::from_millis(10)); } - t.senders + t.ctx + .senders .io_reschedule_concurrent_count .store(3, Ordering::Relaxed); - thread::sleep(Duration::from_millis(RETRY_SCHEDULE_MILLISECONS + 2)); + thread::sleep(Duration::from_millis(RETRY_SCHEDULE_MILLISECONDS + 2)); // Should reschedule now - r.send_write_msg(&mut t, Some(40), WriteMsg::Shutdown); + r.send_write_msg(&mut t.ctx, Some(40), WriteMsg::Shutdown); assert!(r.next_writer_id.is_some()); assert_eq!(r.last_unpersisted, Some(40)); t.must_same_msg_count(r.writer_id, 0); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index d1861a8903c..cae5842c8b8 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -1,20 +1,27 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{sync::mpsc, time::Duration}; use collections::HashSet; -use crossbeam::channel::unbounded; +use crossbeam::channel::{unbounded, Receiver, Sender}; use engine_test::{kv::KvTestEngine, new_temp_engine, raft::RaftTestEngine}; use engine_traits::{Engines, Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; -use kvproto::raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, + resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}, +}; +use resource_control::ResourceGroupManager; use tempfile::Builder; use super::*; use crate::{ - store::{peer_storage::tests::new_entry, Config, Transport}, + store::{ + async_io::write_router::tests::TestContext, local_metrics::RaftMetrics, + peer_storage::tests::new_entry, Config, Transport, WriteRouter, + }, Result, }; - type TestKvWriteBatch = ::WriteBatch; type TestRaftLogBatch = ::LogBatch; @@ -122,7 +129,7 @@ fn must_wait_same_notifies( } let timer = Instant::now(); loop { - match notify_rx.recv() { + match notify_rx.recv_timeout(Duration::from_secs(3)) { Ok((region_id, n)) => { if let Some(n2) = notify_map.get(®ion_id) { if n == *n2 { @@ -196,7 +203,7 @@ struct TestWorker { impl TestWorker { fn new(cfg: &Config, engines: &Engines) -> Self { - let (_, task_rx) = unbounded(); + let (_, task_rx) = resource_control::channel::unbounded(None); let (msg_tx, msg_rx) = unbounded(); let trans = TestTransport { tx: msg_tx }; let (notify_tx, notify_rx) = unbounded(); @@ -222,15 +229,24 @@ struct TestWriters { writers: StoreWriters, msg_rx: Receiver, notify_rx: Receiver<(u64, (u64, u64))>, + ctx: TestContext, } impl TestWriters { - fn new(cfg: &Config, engines: &Engines) -> Self { + fn new( + cfg: Config, + engines: &Engines, + resource_manager: Option>, + ) -> Self { let (msg_tx, msg_rx) = unbounded(); let trans = TestTransport { tx: msg_tx }; let 
(notify_tx, notify_rx) = unbounded(); let notifier = TestNotifier { tx: notify_tx }; - let mut writers = StoreWriters::default(); + let mut writers = StoreWriters::new( + resource_manager + .as_ref() + .map(|m| m.derive_controller("test".into(), false)), + ); writers .spawn( 1, @@ -242,13 +258,21 @@ impl TestWriters { ) .unwrap(); Self { - writers, msg_rx, notify_rx, + ctx: TestContext { + config: cfg, + raft_metrics: RaftMetrics::new(true), + senders: writers.senders(), + }, + writers, } } - fn write_sender(&self, id: usize) -> Sender> { + fn write_sender( + &self, + id: usize, + ) -> resource_control::channel::Sender> { self.writers.senders()[id].clone() } } @@ -460,7 +484,7 @@ fn test_basic_flow() { let engines = new_temp_engine(&path); let mut cfg = Config::default(); cfg.store_io_pool_size = 2; - let mut t = TestWriters::new(&cfg, &engines); + let mut t = TestWriters::new(cfg, &engines, None); let mut task_1 = WriteTask::::new(region_1, 1, 10); init_write_batch(&engines, &mut task_1); @@ -474,7 +498,9 @@ fn test_basic_flow() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_1)).unwrap(); + t.write_sender(0) + .send(WriteMsg::WriteTask(task_1), 0) + .unwrap(); let mut task_2 = WriteTask::::new(2, 2, 20); init_write_batch(&engines, &mut task_2); @@ -488,7 +514,9 @@ fn test_basic_flow() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(1).send(WriteMsg::WriteTask(task_2)).unwrap(); + t.write_sender(1) + .send(WriteMsg::WriteTask(task_2), 0) + .unwrap(); let mut task_3 = WriteTask::::new(region_1, 1, 15); init_write_batch(&engines, &mut task_3); @@ -502,7 +530,9 @@ fn test_basic_flow() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_3)).unwrap(); + t.write_sender(0) + .send(WriteMsg::WriteTask(task_3), 0) + .unwrap(); must_wait_same_notifies(vec![(region_1, 
(1, 15)), (region_2, (2, 20))], &t.notify_rx); @@ -532,7 +562,6 @@ fn test_basic_flow() { ); must_have_same_count_msg(6, &t.msg_rx); - t.writers.shutdown(); } @@ -548,7 +577,7 @@ fn test_basic_flow_with_states() { let engines = new_temp_engine(&path); let mut cfg = Config::default(); cfg.store_io_pool_size = 2; - let mut t = TestWriters::new(&cfg, &engines); + let mut t = TestWriters::new(cfg, &engines, None); let mut task_1 = WriteTask::::new(region_1, 1, 10); task_1.raft_wb = Some(engines.raft.log_batch(0)); @@ -571,7 +600,9 @@ fn test_basic_flow_with_states() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_1)).unwrap(); + t.write_sender(0) + .send(WriteMsg::WriteTask(task_1), 0) + .unwrap(); let mut task_2 = WriteTask::::new(2, 2, 20); task_2.raft_wb = Some(engines.raft.log_batch(0)); @@ -588,7 +619,9 @@ fn test_basic_flow_with_states() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(1).send(WriteMsg::WriteTask(task_2)).unwrap(); + t.write_sender(1) + .send(WriteMsg::WriteTask(task_2), 0) + .unwrap(); let mut task_3 = WriteTask::::new(region_1, 1, 15); task_3.raft_wb = Some(engines.raft.log_batch(0)); @@ -604,7 +637,9 @@ fn test_basic_flow_with_states() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.write_sender(0).send(WriteMsg::WriteTask(task_3)).unwrap(); + t.write_sender(0) + .send(WriteMsg::WriteTask(task_3), 0) + .unwrap(); must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); @@ -645,3 +680,88 @@ fn test_basic_flow_with_states() { t.writers.shutdown(); } + +#[test] +fn test_resource_group() { + let region_1 = 1; + let region_2 = 2; + + let resource_manager = Arc::new(ResourceGroupManager::default()); + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + 
group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + group + }; + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let path = Builder::new().prefix("async-io-basic").tempdir().unwrap(); + let engines = new_temp_engine(&path); + let mut cfg = Config::default(); + cfg.store_io_pool_size = 1; + + let mut t = TestWriters::new(cfg, &engines, Some(resource_manager)); + + let (tx, rx) = mpsc::sync_channel(0); + t.write_sender(0).send(WriteMsg::Pause(rx), 0).unwrap(); + + let mut r = WriteRouter::new("1".to_string()); + let mut task_1 = WriteTask::::new(region_1, 1, 10); + init_write_batch(&engines, &mut task_1); + put_raft_kv(task_1.raft_wb.as_mut(), 17); + let entries = vec![new_entry(5, 5), new_entry(6, 5), new_entry(7, 5)]; + let mut entries = entries + .into_iter() + .map(|mut e| { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("group1".to_owned()); + req.set_header(header); + e.set_data(req.write_to_bytes().unwrap().into()); + e + }) + .collect(); + task_1.entries.append(&mut entries); + task_1.raft_state = Some(new_raft_state(5, 234, 6, 7)); + task_1 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + r.send_write_msg(&mut t.ctx, None, WriteMsg::WriteTask(task_1)); + + let mut r = WriteRouter::new("2".to_string()); + let mut task_2 = WriteTask::::new(region_2, 2, 20); + init_write_batch(&engines, &mut task_2); + put_raft_kv(task_2.raft_wb.as_mut(), 27); + let entries = vec![new_entry(50, 12), new_entry(51, 13)]; + let mut entries = entries + .into_iter() + .map(|mut e| { 
+ let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("group2".to_owned()); + req.set_header(header); + e.set_data(req.write_to_bytes().unwrap().into()); + e + }) + .collect(); + task_2.entries.append(&mut entries); + task_2.raft_state = Some(new_raft_state(13, 567, 49, 51)); + task_2 + .messages + .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); + r.send_write_msg(&mut t.ctx, None, WriteMsg::WriteTask(task_2)); + + tx.send(()).unwrap(); + must_wait_same_notifies(vec![(region_1, (1, 10)), (region_2, (2, 20))], &t.notify_rx); +} diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 1853d200140..7f4e5497cb9 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -24,7 +24,7 @@ use std::{ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, - HandlerBuilder, PollHandler, Priority, ResourceMetered, + HandlerBuilder, PollHandler, Priority, }; use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; @@ -51,7 +51,7 @@ use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; use raft_proto::ConfChangeI; -use resource_control::ResourceController; +use resource_control::{ResourceController, ResourceMetered}; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -4431,6 +4431,7 @@ pub enum ControlMsg { } impl ResourceMetered for ControlMsg {} + pub struct ControlFsm { receiver: Receiver, stopped: bool, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 85631bebe09..66acd187215 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -22,7 +22,7 @@ use batch_system::{ 
use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashMapEntry, HashSet}; use concurrency_manager::ConcurrencyManager; -use crossbeam::channel::{unbounded, TryRecvError, TrySendError}; +use crossbeam::channel::{TryRecvError, TrySendError}; use engine_traits::{ CompactedEvent, DeleteStrategy, Engines, KvEngine, Mutable, PerfContextKind, RaftEngine, RaftLogBatch, Range, WriteBatch, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -42,7 +42,7 @@ use kvproto::{ use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; -use resource_control::ResourceGroupManager; +use resource_control::{channel::unbounded, ResourceGroupManager}; use resource_metering::CollectorRegHandle; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -1053,12 +1053,13 @@ impl PollHandler, St } } else { let writer_id = rand::random::() % self.poll_ctx.cfg.store_io_pool_size; - if let Err(err) = - self.poll_ctx.write_senders[writer_id].try_send(WriteMsg::LatencyInspect { + if let Err(err) = self.poll_ctx.write_senders[writer_id].try_send( + WriteMsg::LatencyInspect { send_time: write_begin, inspector: latency_inspect, - }) - { + }, + 0, + ) { warn!("send latency inspecting to write workers failed"; "err" => ?err); } } @@ -1340,7 +1341,7 @@ where fn build(&mut self, _: Priority) -> RaftPoller { let sync_write_worker = if self.write_senders.is_empty() { - let (_, rx) = unbounded(); + let (_, rx) = unbounded(None); Some(WriteWorker::new( self.store.get_id(), "sync-writer".to_string(), @@ -1821,7 +1822,11 @@ pub fn create_raft_batch_system( apply_router, apply_system, router: raft_router.clone(), - store_writers: StoreWriters::default(), + store_writers: StoreWriters::new( + resource_manager + .as_ref() + .map(|m| m.derive_controller("store-writer".to_owned(), false)), + ), }; (raft_router, system) } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 195a94478dc..935210951f0 100644 --- 
a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use std::{borrow::Cow, fmt}; -use batch_system::ResourceMetered; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; @@ -23,6 +22,7 @@ use kvproto::{ #[cfg(any(test, feature = "testexport"))] use pd_client::BucketMeta; use raft::SnapshotStatus; +use resource_control::ResourceMetered; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken}; diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index d09a6dd9f53..ff34b9abb4e 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -41,7 +41,7 @@ where { pub fn decrease_by(&mut self, size: usize) { for _ in 0..size { - if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty) { + if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty, 0) { error!( "failed to decrease thread pool"; "decrease to" => size, diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 39d37ac0f6b..2e1a0990d49 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -9,6 +9,8 @@ failpoints = ["fail/failpoints"] [dependencies] byteorder = "1.2" +collections = { workspace = true } +crossbeam = "0.8" crossbeam-skiplist = "0.1" dashmap = "5.1" fail = "0.5" diff --git a/components/resource_control/src/channel.rs b/components/resource_control/src/channel.rs new file mode 100644 index 00000000000..55bc2ed33b9 --- /dev/null +++ b/components/resource_control/src/channel.rs @@ -0,0 +1,183 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cell::RefCell, sync::Arc}; + +use collections::HashMap; +use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; +use kvproto::kvrpcpb::CommandPri; +use tikv_util::mpsc::priority_queue; + +use crate::{ResourceConsumeType, ResourceController}; + +pub trait ResourceMetered { + // returns the msg consumption of each hash map + fn get_resource_consumptions(&self) -> Option> { + None + } +} + +pub fn bounded( + resource_ctl: Option>, + cap: usize, +) -> (Sender, Receiver) { + if let Some(ctl) = resource_ctl { + // TODO: make it bounded + let (tx, rx) = priority_queue::unbounded(); + ( + Sender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + Receiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::bounded(cap); + (Sender::Vanilla(tx), Receiver::Vanilla(rx)) + } +} + +pub fn unbounded( + resource_ctl: Option>, +) -> (Sender, Receiver) { + if let Some(ctl) = resource_ctl { + let (tx, rx) = priority_queue::unbounded(); + ( + Sender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + Receiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::unbounded(); + (Sender::Vanilla(tx), Receiver::Vanilla(rx)) + } +} + +pub enum Sender { + Vanilla(channel::Sender), + Priority { + resource_ctl: Arc, + sender: priority_queue::Sender, + last_msg_group: RefCell, + }, +} + +impl Clone for Sender { + fn clone(&self) -> Self { + match self { + Sender::Vanilla(sender) => Sender::Vanilla(sender.clone()), + Sender::Priority { + resource_ctl, + sender, + .. + } => Sender::Priority { + resource_ctl: resource_ctl.clone(), + sender: sender.clone(), + last_msg_group: RefCell::new(String::new()), + }, + } + } +} + +impl Sender { + // `low_bound` represents the lowest priority that the message can be sent with. + // It's used to make sure messages from one peer are sent in order. + // The returned value is the priority that the message sent with. 
It is + // calculated by resource controller and compared with `low_bound`. + pub fn send(&self, m: T, low_bound: u64) -> Result> { + match self { + Sender::Vanilla(sender) => sender.send(m).map(|_| 0), + Sender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + // TODO: pass different command priority + let priority = std::cmp::max( + resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal), + low_bound, + ); + sender.send(m, priority).map(|_| priority) + } + } + } + + pub fn try_send(&self, m: T, low_bound: u64) -> Result> { + match self { + Sender::Vanilla(sender) => sender.try_send(m).map(|_| 0), + Sender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let priority = std::cmp::max( + resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal), + low_bound, + ); + sender.try_send(m, priority).map(|_| priority) + } + } + } + + pub fn consume_msg_resource(&self, msg: &impl ResourceMetered) { + match self { + Sender::Vanilla(_) => {} + Sender::Priority { + resource_ctl, + last_msg_group, + .. 
+ } => { + if let Some(mut groups) = msg.get_resource_consumptions() { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for (group_name, write_bytes) in groups.drain() { + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + } + *last_msg_group.borrow_mut() = dominant_group; + } + } + } + } +} + +pub enum Receiver { + Vanilla(channel::Receiver), + Priority(priority_queue::Receiver), +} + +impl Clone for Receiver { + fn clone(&self) -> Self { + match self { + Receiver::Vanilla(receiver) => Receiver::Vanilla(receiver.clone()), + Receiver::Priority(receiver) => Receiver::Priority(receiver.clone()), + } + } +} + +impl Receiver { + pub fn recv(&self) -> Result { + match self { + Receiver::Vanilla(receiver) => receiver.recv(), + Receiver::Priority(receiver) => receiver.recv(), + } + } + + pub fn try_recv(&self) -> Result { + match self { + Receiver::Vanilla(receiver) => receiver.try_recv(), + Receiver::Priority(receiver) => receiver.try_recv(), + } + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 5534ed2153d..1c4c93c82d2 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -14,6 +14,9 @@ pub use future::ControlledFuture; mod service; pub use service::ResourceManagerService; +pub mod channel; +pub use channel::ResourceMetered; + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] #[serde(default)] #[serde(rename_all = "kebab-case")] From 495abac06eb3319f0a75d0e5e63ea43086b06fe7 Mon Sep 17 00:00:00 2001 From: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> Date: Fri, 3 Feb 2023 19:17:56 +0800 Subject: [PATCH 486/676] cloud: azblob: add retry for http code 500 error (#14094) close tikv/tikv#14093 Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 ++ 
components/cloud/azure/Cargo.toml | 2 ++ components/cloud/azure/src/azblob.rs | 44 ++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 21145778082..dd2869a7b10 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -359,7 +359,9 @@ dependencies = [ "futures 0.3.15", "futures-util", "kvproto", + "lazy_static", "oauth2", + "regex", "slog", "slog-global", "tikv_util", diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index c08dc76fdff..57ea6c14aef 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -15,7 +15,9 @@ cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } kvproto = { workspace = true } +lazy_static = "1.4.0" oauth2 = { version = "4.0.0", default-features = false } +regex = "1" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 12b6149fad5..47d2d731da8 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -26,7 +26,9 @@ use futures_util::{ TryStreamExt, }; pub use kvproto::brpb::{AzureBlobStorage as InputConfig, Bucket as InputBucket, CloudDynamic}; +use lazy_static::lazy_static; use oauth2::{ClientId, ClientSecret}; +use regex::Regex; use tikv_util::{ debug, stream::{retry, RetryError}, @@ -224,6 +226,7 @@ impl BlobConfig for Config { enum RequestError { InvalidInput(Box, String), + InternalError(String), TimeOut(String), } @@ -233,6 +236,7 @@ impl From for io::Error { RequestError::InvalidInput(e, tag) => { Self::new(io::ErrorKind::InvalidInput, format!("{}: {}", tag, &e)) } + RequestError::InternalError(msg) => 
Self::new(io::ErrorKind::Other, msg), RequestError::TimeOut(msg) => Self::new(io::ErrorKind::TimedOut, msg), } } @@ -240,10 +244,21 @@ impl From for io::Error { impl RetryError for RequestError { fn is_retryable(&self) -> bool { - matches!(self, Self::TimeOut(_)) + matches!(self, Self::TimeOut(_) | Self::InternalError(_)) } } +fn err_is_retryable(err_info: &str) -> bool { + // HTTP Code 503: The server is busy + // HTTP Code 500: Operation could not be completed within the specified time. + // More details seen in https://learn.microsoft.com/en-us/rest/api/storageservices/blob-service-error-codes + lazy_static! { + static ref RE: Regex = Regex::new(r"status: 5[0-9][0-9],").unwrap(); + } + + RE.is_match(err_info) +} + const CONNECTION_TIMEOUT: Duration = Duration::from_secs(900); /// A helper for uploading a large file to Azure storage. @@ -308,10 +323,9 @@ impl AzureUploader { Ok(_) => Ok(()), Err(err) => { let err_info = ToString::to_string(&err); - if err_info.contains("busy") { - // server is busy, retry later - Err(RequestError::TimeOut(format!( - "the resource is busy: {}, retry later", + if err_is_retryable(&err_info) { + Err(RequestError::InternalError(format!( + "internal error: {}, retry later", err_info ))) } else { @@ -765,4 +779,24 @@ mod tests { cd.set_bucket(bucket); cd } + + #[tokio::test] + async fn test_error_retryable() { + let err_info = "HTTP error status (status: 503,... The server is busy."; + assert!(err_is_retryable(err_info)); + let err_info = "HTTP error status (status: 500,... Operation could not be completed within the specified time."; + assert!(err_is_retryable(err_info)); + let err_info = + "HTTP error status (status: 409,... The blob type is invalid for this operation."; + assert!(!err_is_retryable(err_info)); + let err_info = "HTTP error status (status: 50,... 
"; + assert!(!err_is_retryable(err_info)); + let err = "NaN".parse::().unwrap_err(); + let err1 = RequestError::InvalidInput(Box::new(err), "invalid-input".to_owned()); + let err2 = RequestError::InternalError("internal-error".to_owned()); + let err3 = RequestError::TimeOut("time-out".to_owned()); + assert!(!err1.is_retryable()); + assert!(err2.is_retryable()); + assert!(err3.is_retryable()); + } } From 856987fde93b68c6489635d16d9e27c102b7d47f Mon Sep 17 00:00:00 2001 From: Zwb Date: Fri, 3 Feb 2023 19:33:55 +0800 Subject: [PATCH 487/676] tests: fix test_witness_replica_read fail (#14110) ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- tests/integrations/raftstore/test_witness.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index f35b21b08a1..907c49c03af 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -452,6 +452,9 @@ fn test_witness_replica_read() { vec![true], ); + // make sure the peer_on_store3 has completed applied to witness + std::thread::sleep(Duration::from_millis(200)); + let mut request = new_request( region.get_id(), region.get_region_epoch().clone(), From 22202b26f9ced98a8d944eae1309e3610ca48566 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Fri, 3 Feb 2023 21:41:55 -0800 Subject: [PATCH 488/676] tikv: optimize default config for multi-rocksdb (#14107) ref tikv/tikv#12842 set block-cache.capacity to 0.3 * memory limit instead of 0.45 for partitioned-raft-kv. set region-split-size to 1GB by default when bucket is enabled set region-split-size to 10GB by default when partitioned-raft-kv is enabled. These numbers may be tuned futher if we have other better results. 
Signed-off-by: qi.xu Co-authored-by: qi.xu --- cmd/tikv-ctl/src/executor.rs | 5 +- .../raftstore/src/coprocessor/config.rs | 50 +++++++---- .../src/coprocessor/split_check/half.rs | 4 +- .../src/coprocessor/split_check/keys.rs | 2 +- .../src/coprocessor/split_check/size.rs | 12 +-- .../src/coprocessor/split_check/table.rs | 2 +- components/server/src/raft_engine_switch.rs | 5 +- components/server/src/server.rs | 13 ++- components/server/src/server2.rs | 13 ++- components/snap_recovery/src/init_cluster.rs | 7 +- components/test_raftstore/src/node.rs | 4 +- components/test_raftstore/src/server.rs | 2 +- components/test_raftstore/src/util.rs | 5 +- src/config/mod.rs | 83 +++++++++++++++++-- src/server/engine_factory.rs | 5 +- src/storage/config.rs | 10 ++- src/storage/kv/test_engine_builder.rs | 3 +- src/storage/mod.rs | 5 +- tests/failpoints/cases/test_split_region.rs | 2 +- tests/integrations/config/mod.rs | 2 +- .../raftstore/test_split_region.rs | 8 +- tests/integrations/storage/test_titan.rs | 5 +- 22 files changed, 185 insertions(+), 62 deletions(-) diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 94610face44..7dd00a1d29c 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -61,7 +61,10 @@ pub fn new_debug_executor( .unwrap() .map(Arc::new); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); let env = cfg .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index fb1fc35345f..3014c5c2358 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -25,7 +25,7 @@ pub struct Config { /// [b,c), [c,d) will be region_split_size (maybe a little larger). /// by default, region_max_size = region_split_size * 2 / 3. 
pub region_max_size: Option, - pub region_split_size: ReadableSize, + pub region_split_size: Option, /// When the number of keys in region [a,e) meets the region_max_keys, /// it will be split into two several regions [a,b), [b,c), [c,d), [d,e). @@ -71,6 +71,9 @@ pub enum ConsistencyCheckMethod { /// Default region split size. pub const SPLIT_SIZE_MB: u64 = 96; +pub const LARGE_REGION_SPLIT_SIZE_MB: u64 = 1024; +pub const RAFTSTORE_V2_SPLIT_SIZE_MB: u64 = 10240; + /// Default batch split limit. pub const BATCH_SPLIT_LIMIT: u64 = 10; @@ -80,11 +83,10 @@ pub const DEFAULT_REGION_BUCKET_MERGE_SIZE_RATIO: f64 = 0.33; impl Default for Config { fn default() -> Config { - let split_size = ReadableSize::mb(SPLIT_SIZE_MB); Config { split_region_on_table: false, batch_split_limit: BATCH_SPLIT_LIMIT, - region_split_size: split_size, + region_split_size: None, region_max_size: None, region_split_keys: None, region_max_keys: None, @@ -100,39 +102,55 @@ impl Default for Config { } impl Config { + pub fn region_split_size(&self) -> ReadableSize { + self.region_split_size + .unwrap_or(/* v1 only */ if self.enable_region_bucket { + ReadableSize::mb(LARGE_REGION_SPLIT_SIZE_MB) + } else { + ReadableSize::mb(SPLIT_SIZE_MB) + }) + } + pub fn region_max_keys(&self) -> u64 { - let default_split_keys = self.region_split_size.as_mb_f64() * 10000.0; + let default_split_keys = self.region_split_size().as_mb_f64() * 10000.0; self.region_max_keys .unwrap_or(default_split_keys as u64 / 2 * 3) } pub fn region_max_size(&self) -> ReadableSize { self.region_max_size - .unwrap_or(self.region_split_size / 2 * 3) + .unwrap_or(self.region_split_size() / 2 * 3) } pub fn region_split_keys(&self) -> u64 { // Assume the average size of KVs is 100B. 
self.region_split_keys - .unwrap_or((self.region_split_size.as_mb_f64() * 10000.0) as u64) + .unwrap_or((self.region_split_size().as_mb_f64() * 10000.0) as u64) + } + + pub fn optimize_for(&mut self, raftstore_v2: bool) { + // overwrite the default region_split_size when it's multi-rocksdb + if raftstore_v2 && self.region_split_size.is_none() { + self.region_split_size = Some(ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB)); + } } pub fn validate(&mut self) -> Result<()> { if self.region_split_keys.is_none() { - self.region_split_keys = Some((self.region_split_size.as_mb_f64() * 10000.0) as u64); + self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); } match self.region_max_size { Some(region_max_size) => { - if region_max_size.0 < self.region_split_size.0 { + if region_max_size.0 < self.region_split_size().0 { return Err(box_err!( "region max size {} must >= split size {}", region_max_size.0, - self.region_split_size.0 + self.region_split_size().0 )); } } - None => self.region_max_size = Some(self.region_split_size / 2 * 3), + None => self.region_max_size = Some(self.region_split_size() / 2 * 3), } match self.region_max_keys { @@ -148,10 +166,10 @@ impl Config { None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } if self.enable_region_bucket { - if self.region_split_size.0 < self.region_bucket_size.0 { + if self.region_split_size().0 < self.region_bucket_size.0 { return Err(box_err!( "region split size {} must >= region bucket size {}", - self.region_split_size.0, + self.region_split_size().0, self.region_bucket_size.0 )); } @@ -208,12 +226,12 @@ mod tests { cfg = Config::default(); cfg.region_max_size = Some(ReadableSize(10)); - cfg.region_split_size = ReadableSize(20); + cfg.region_split_size = Some(ReadableSize(20)); cfg.validate().unwrap_err(); cfg = Config::default(); cfg.region_max_size = None; - cfg.region_split_size = ReadableSize(20); + cfg.region_split_size = Some(ReadableSize(20)); 
cfg.validate().unwrap(); assert_eq!(cfg.region_max_size, Some(ReadableSize(30))); @@ -230,12 +248,12 @@ mod tests { cfg = Config::default(); cfg.enable_region_bucket = false; - cfg.region_split_size = ReadableSize(20); + cfg.region_split_size = Some(ReadableSize(20)); cfg.region_bucket_size = ReadableSize(30); cfg.validate().unwrap(); cfg = Config::default(); - cfg.region_split_size = ReadableSize::mb(20); + cfg.region_split_size = Some(ReadableSize::mb(20)); cfg.validate().unwrap(); assert_eq!(cfg.region_split_keys, Some(200000)); } diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index fafa41e44b5..259334d2f42 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -267,7 +267,7 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { - region_split_size: ReadableSize(130_u64), + region_split_size: Some(ReadableSize(130_u64)), enable_region_bucket: true, region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() @@ -391,7 +391,7 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { - region_split_size: ReadableSize(130_u64), + region_split_size: Some(ReadableSize(130_u64)), enable_region_bucket: true, region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 92e159d233f..58c42d55513 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -590,7 +590,7 @@ mod tests { // The split by keys should still work. But if the bug in on_kv() in size.rs // exists, it will result in split by keys failed. 
cfg.region_max_size = Some(ReadableSize(region_size * 6 / 5)); - cfg.region_split_size = ReadableSize(region_size * 4 / 5); + cfg.region_split_size = Some(ReadableSize(region_size * 4 / 5)); runnable = SplitCheckRunner::new(engine, tx.clone(), CoprocessorHost::new(tx, cfg)); runnable.run(SplitCheckTask::split_check( region.clone(), diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 1f4a33d7af7..8a1a5558c7d 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -149,7 +149,7 @@ impl SplitCheckObserver for SizeCheckObserver // Need to check size. host.add_checker(Box::new(Checker::new( host.cfg.region_max_size().0, - host.cfg.region_split_size.0, + host.cfg.region_split_size().0, host.cfg.batch_split_limit, policy, ))); @@ -186,7 +186,7 @@ impl SplitCheckObserver for SizeCheckObserver // Need to check size. host.add_checker(Box::new(Checker::new( host.cfg.region_max_size().0, - host.cfg.region_split_size.0, + host.cfg.region_split_size().0, host.cfg.batch_split_limit, policy, ))); @@ -420,7 +420,7 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(100)), - region_split_size: ReadableSize(60), + region_split_size: Some(ReadableSize(60)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, @@ -545,7 +545,7 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(50000)), - region_split_size: ReadableSize(50000), + region_split_size: Some(ReadableSize(50000)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, @@ -671,7 +671,7 @@ pub mod tests { let (tx, _rx) = mpsc::sync_channel(100); let mut cfg = Config { region_max_size: Some(ReadableSize(50000)), - region_split_size: ReadableSize(50000), + region_split_size: 
Some(ReadableSize(50000)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, @@ -736,7 +736,7 @@ pub mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_max_size: Some(ReadableSize(100)), - region_split_size: ReadableSize(60), + region_split_size: Some(ReadableSize(60)), region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index 684e87e1693..eec7b15b9b3 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -326,7 +326,7 @@ mod tests { split_region_on_table: true, // Try to "disable" size split. region_max_size: Some(ReadableSize::gb(2)), - region_split_size: ReadableSize::gb(1), + region_split_size: Some(ReadableSize::gb(1)), // Try to "disable" keys split region_max_keys: Some(2000000000), region_split_keys: Some(1000000000), diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index bfaa2a6587e..bf46f07eabd 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -237,7 +237,10 @@ mod tests { cfg.raft_store.raftdb_path = raftdb_path.to_str().unwrap().to_owned(); cfg.raftdb.wal_dir = raftdb_wal_path.to_str().unwrap().to_owned(); cfg.raft_engine.mut_config().dir = raft_engine_path.to_str().unwrap().to_owned(); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); // Dump logs from RocksEngine to RaftLogEngine. 
let raft_engine = RaftLogEngine::new( diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3da6b0c4950..4fe397e9eb5 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -955,7 +955,7 @@ where self.config .raft_store .validate( - self.config.coprocessor.region_split_size, + self.config.coprocessor.region_split_size(), self.config.coprocessor.enable_region_bucket, self.config.coprocessor.region_bucket_size, ) @@ -1838,7 +1838,11 @@ impl TikvServer { &mut self, flow_listener: engine_rocks::FlowListener, ) -> (Engines, Arc) { - let block_cache = self.config.storage.block_cache.build_shared_cache(); + let block_cache = self + .config + .storage + .block_cache + .build_shared_cache(self.config.storage.engine); let env = self .config .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) @@ -2193,7 +2197,10 @@ mod test { config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); let env = Arc::new(Env::default()); let path = Builder::new().prefix("test-update").tempdir().unwrap(); - let cache = config.storage.block_cache.build_shared_cache(); + let cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index f193e1c7445..20d79e7cce5 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -814,7 +814,7 @@ where self.config .raft_store .validate( - self.config.coprocessor.region_split_size, + self.config.coprocessor.region_split_size(), self.config.coprocessor.enable_region_bucket, self.config.coprocessor.region_bucket_size, ) @@ -1452,7 +1452,11 @@ impl TikvServer { &mut self, flow_listener: engine_rocks::FlowListener, ) -> Arc { - let block_cache = 
self.config.storage.block_cache.build_shared_cache(); + let block_cache = self + .config + .storage + .block_cache + .build_shared_cache(self.config.storage.engine); let env = self .config .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) @@ -1799,7 +1803,10 @@ mod test { config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); let env = Arc::new(Env::default()); let path = Builder::new().prefix("test-update").tempdir().unwrap(); - let cache = config.storage.block_cache.build_shared_cache(); + let cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index e7818b3f888..d3a2ebade73 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -100,7 +100,7 @@ pub fn enter_snap_recovery_mode(config: &mut TikvConfig) { // Disable region split during recovering. 
config.coprocessor.region_max_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); - config.coprocessor.region_split_size = ReadableSize::gb(MAX_REGION_SIZE); + config.coprocessor.region_split_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); config.coprocessor.region_max_keys = Some(MAX_SPLIT_KEY); config.coprocessor.region_split_keys = Some(MAX_SPLIT_KEY); } @@ -314,7 +314,10 @@ pub fn create_local_engine_service( let env = config .build_shared_rocks_env(key_manager.clone(), None) .map_err(|e| format!("build shared rocks env: {}", e))?; - let block_cache = config.storage.block_cache.build_shared_cache(); + let block_cache = config + .storage + .block_cache + .build_shared_cache(config.storage.engine); // init rocksdb / kv db let factory = KvEngineFactoryBuilder::new(env.clone(), config, block_cache) diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 05ed8ece83d..78e1dbb36c3 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -239,7 +239,7 @@ impl Simulator for NodeCluster { let mut raft_store = cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size, + cfg.coprocessor.region_split_size(), cfg.coprocessor.enable_region_bucket, cfg.coprocessor.region_bucket_size, ) @@ -347,7 +347,7 @@ impl Simulator for NodeCluster { .map(|p| p.path().to_str().unwrap().to_owned()) ); - let region_split_size = cfg.coprocessor.region_split_size; + let region_split_size = cfg.coprocessor.region_split_size(); let enable_region_bucket = cfg.coprocessor.enable_region_bucket; let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 63a0b4e4804..a17c65b8aec 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -499,7 +499,7 @@ impl ServerCluster { let mut raft_store = 
cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size, + cfg.coprocessor.region_split_size(), cfg.coprocessor.enable_region_bucket, cfg.coprocessor.region_bucket_size, ) diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 4bcb99adca3..e765cfb883f 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -588,7 +588,10 @@ pub fn create_test_engine( data_key_manager_from_config(&cfg.security.encryption, dir.path().to_str().unwrap()) .unwrap() .map(Arc::new); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); let env = cfg .build_shared_rocks_env(key_manager.clone(), limiter) .unwrap(); diff --git a/src/config/mod.rs b/src/config/mod.rs index 38d69f1ab29..7539fc13c63 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -89,6 +89,9 @@ pub const DEFAULT_ROCKSDB_SUB_DIR: &str = "db"; /// By default, block cache size will be set to 45% of system memory. pub const BLOCK_CACHE_RATE: f64 = 0.45; +/// Because multi-rocksdb has 25% memory table quota, we have to reduce block +/// cache a bit +pub const RAFTSTORE_V2_BLOCK_CACHE_RATE: f64 = 0.30; /// By default, TiKV will try to limit memory usage to 75% of system memory. 
pub const MEMORY_USAGE_LIMIT_RATE: f64 = 0.75; @@ -3230,9 +3233,14 @@ impl TikvConfig { self.raft_engine.validate()?; self.server.validate()?; self.pd.validate()?; + + // cannot pass EngineType directly as component raftstore cannot have dependency + // on tikv + self.coprocessor + .optimize_for(self.storage.engine == EngineType::RaftKv2); self.coprocessor.validate()?; self.raft_store.validate( - self.coprocessor.region_split_size, + self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket, self.coprocessor.region_bucket_size, )?; @@ -3447,7 +3455,7 @@ impl TikvConfig { "override coprocessor.region-split-size with raftstore.region-split-size, {:?}", self.raft_store.region_split_size ); - self.coprocessor.region_split_size = self.raft_store.region_split_size; + self.coprocessor.region_split_size = Some(self.raft_store.region_split_size); } self.raft_store.region_split_size = default_raft_store.region_split_size; } @@ -4167,7 +4175,10 @@ mod tests { use grpcio::ResourceQuota; use itertools::Itertools; use kvproto::kvrpcpb::CommandPri; - use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; + use raftstore::coprocessor::{ + config::{LARGE_REGION_SPLIT_SIZE_MB, RAFTSTORE_V2_SPLIT_SIZE_MB, SPLIT_SIZE_MB}, + region_info_accessor::MockRegionInfoProvider, + }; use slog::Level; use tempfile::Builder; use tikv_kv::RocksEngine as RocksDBEngine; @@ -4595,8 +4606,11 @@ mod tests { &cfg.storage.data_dir, Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), cfg.rocksdb.build_cf_opts( - &cfg.rocksdb - .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), + &cfg.rocksdb.build_cf_resources( + cfg.storage + .block_cache + .build_shared_cache(cfg.storage.engine), + ), None, cfg.storage.api_version(), cfg.storage.engine, @@ -5523,17 +5537,22 @@ mod tests { // on. 
default_cfg.readpool.storage.adjust_use_unified_pool(); default_cfg.readpool.coprocessor.adjust_use_unified_pool(); + default_cfg + .coprocessor + .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); default_cfg.security.redact_info_log = Some(false); default_cfg.coprocessor.region_max_size = Some(default_cfg.coprocessor.region_max_size()); default_cfg.coprocessor.region_max_keys = Some(default_cfg.coprocessor.region_max_keys()); + default_cfg.coprocessor.region_split_size = + Some(default_cfg.coprocessor.region_split_size()); default_cfg.coprocessor.region_split_keys = Some(default_cfg.coprocessor.region_split_keys()); default_cfg.raft_store.raft_log_gc_size_limit = - Some(default_cfg.coprocessor.region_split_size * 3 / 4); + Some(default_cfg.coprocessor.region_split_size() * 3 / 4); default_cfg.raft_store.raft_log_gc_count_limit = - Some(default_cfg.coprocessor.region_split_size * 3 / 4 / ReadableSize::kb(1)); + Some(default_cfg.coprocessor.region_split_size() * 3 / 4 / ReadableSize::kb(1)); default_cfg.raft_store.region_split_check_diff = - Some(default_cfg.coprocessor.region_split_size / 16); + Some(default_cfg.coprocessor.region_split_size() / 16); // Other special cases. cfg.pd.retry_max_count = default_cfg.pd.retry_max_count; // Both -1 and isize::MAX are the same. 
@@ -5567,10 +5586,58 @@ mod tests { cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; cfg.raftdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.coprocessor + .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); assert_eq!(cfg, default_cfg); } + #[test] + fn test_region_size_config() { + let mut default_cfg = TikvConfig::default(); + default_cfg.coprocessor.optimize_for(false); + default_cfg.coprocessor.validate().unwrap(); + assert_eq!( + default_cfg.coprocessor.region_split_size(), + ReadableSize::mb(SPLIT_SIZE_MB) + ); + + let mut default_cfg = TikvConfig::default(); + default_cfg.coprocessor.enable_region_bucket = true; + default_cfg.coprocessor.optimize_for(false); + default_cfg.coprocessor.validate().unwrap(); + assert_eq!( + default_cfg.coprocessor.region_split_size(), + ReadableSize::mb(LARGE_REGION_SPLIT_SIZE_MB) + ); + + let mut default_cfg = TikvConfig::default(); + default_cfg.coprocessor.optimize_for(true); + default_cfg.coprocessor.validate().unwrap(); + assert_eq!( + default_cfg.coprocessor.region_split_size(), + ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB) + ); + + let mut default_cfg = TikvConfig::default(); + default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); + default_cfg.coprocessor.optimize_for(false); + default_cfg.coprocessor.validate().unwrap(); + assert_eq!( + default_cfg.coprocessor.region_split_size(), + ReadableSize::mb(500) + ); + + let mut default_cfg = TikvConfig::default(); + default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); + default_cfg.coprocessor.optimize_for(true); + default_cfg.coprocessor.validate().unwrap(); + assert_eq!( + default_cfg.coprocessor.region_split_size(), + ReadableSize::mb(500) + ); + } + #[test] fn test_compatibility_with_old_config_template() { let mut buf = Vec::new(); diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index ff06e41cc57..413adf0d415 
100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -258,7 +258,10 @@ mod tests { e ); }); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); let dir = test_util::temp_dir("test-engine-factory", false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); diff --git a/src/storage/config.rs b/src/storage/config.rs index d74bd721104..f65ed15cece 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -14,7 +14,7 @@ use tikv_util::{ sys::SysQuota, }; -use crate::config::{BLOCK_CACHE_RATE, MIN_BLOCK_CACHE_SHARD_SIZE}; +use crate::config::{BLOCK_CACHE_RATE, MIN_BLOCK_CACHE_SHARD_SIZE, RAFTSTORE_V2_BLOCK_CACHE_RATE}; pub const DEFAULT_DATA_DIR: &str = "./"; const DEFAULT_GC_RATIO_THRESHOLD: f64 = 1.1; @@ -240,14 +240,18 @@ impl BlockCacheConfig { } } - pub fn build_shared_cache(&self) -> Cache { + pub fn build_shared_cache(&self, engine_type: EngineType) -> Cache { if self.shared == Some(false) { warn!("storage.block-cache.shared is deprecated, cache is always shared."); } let capacity = match self.capacity { None => { let total_mem = SysQuota::memory_limit_in_bytes(); - ((total_mem as f64) * BLOCK_CACHE_RATE) as usize + if engine_type == EngineType::RaftKv2 { + ((total_mem as f64) * RAFTSTORE_V2_BLOCK_CACHE_RATE) as usize + } else { + ((total_mem as f64) * BLOCK_CACHE_RATE) as usize + } } Some(c) => c.0 as usize, }; diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index d15a33742ba..aff54a41faa 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -96,7 +96,8 @@ impl TestEngineBuilder { if !enable_block_cache { cache_opt.capacity = Some(ReadableSize::kb(0)); } - let shared = cfg_rocksdb.build_cf_resources(cache_opt.build_shared_cache()); + let shared = + cfg_rocksdb.build_cf_resources(cache_opt.build_shared_cache(EngineType::RaftKv)); let cfs_opts 
= cfs .iter() .map(|cf| match *cf { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 7429ed8900b..6273bc3d54c 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4195,8 +4195,9 @@ mod tests { let engine = { let path = "".to_owned(); let cfg_rocksdb = db_config; - let shared = - cfg_rocksdb.build_cf_resources(BlockCacheConfig::default().build_shared_cache()); + let shared = cfg_rocksdb.build_cf_resources( + BlockCacheConfig::default().build_shared_cache(EngineType::RaftKv), + ); let cfs_opts = vec![ ( CF_DEFAULT, diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 416116c833b..09e87bb8d4d 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -175,7 +175,7 @@ fn gen_split_region() -> (Region, Region, Region) { let region_split_size = 30000; cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(20); cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); - cluster.cfg.coprocessor.region_split_size = ReadableSize(region_split_size); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); let mut range = 1..; cluster.run(); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index bb35b069a41..61ec0d1f3f4 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -726,7 +726,7 @@ fn test_serde_custom_tikv_config() { split_region_on_table: false, batch_split_limit: 1, region_max_size: Some(ReadableSize::mb(12)), - region_split_size: ReadableSize::mb(12), + region_split_size: Some(ReadableSize::mb(12)), region_max_keys: Some(100000), region_split_keys: Some(100000), consistency_check_method: ConsistencyCheckMethod::Raw, diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 10771c57863..23c3b0b41c2 100644 --- 
a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -152,7 +152,7 @@ fn test_server_split_region_twice() { fn test_auto_split_region(cluster: &mut Cluster) { cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(REGION_MAX_SIZE)); - cluster.cfg.coprocessor.region_split_size = ReadableSize(REGION_SPLIT_SIZE); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(REGION_SPLIT_SIZE)); let check_size_diff = cluster.cfg.raft_store.region_split_check_diff().0; let mut range = 1..; @@ -564,7 +564,7 @@ fn test_split_region_diff_check(cluster: &mut Cluster) { cluster.cfg.raft_store.region_split_check_diff = Some(ReadableSize(10)); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(20); cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); - cluster.cfg.coprocessor.region_split_size = ReadableSize(region_split_size); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); let mut range = 1..; @@ -630,7 +630,7 @@ fn test_node_split_region_after_reboot_with_config_change() { cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(20); cluster.cfg.coprocessor.enable_region_bucket = true; cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); - cluster.cfg.coprocessor.region_split_size = ReadableSize(region_split_size); + cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size); cluster.run(); @@ -646,7 +646,7 @@ fn test_node_split_region_after_reboot_with_config_change() { // change the config to make the region splittable cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size / 3)); - cluster.cfg.coprocessor.region_split_size = ReadableSize(region_split_size / 3); + 
cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size / 3)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size / 3); cluster.stop_node(1); cluster.run_node(1).unwrap(); diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index dc0a85bc9c2..921dcf3615f 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -149,7 +149,10 @@ fn test_delete_files_in_range_for_titan() { // Set configs and create engines let mut cfg = TikvConfig::default(); - let cache = cfg.storage.block_cache.build_shared_cache(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); cfg.rocksdb.titan.enabled = true; cfg.rocksdb.titan.disable_gc = true; cfg.rocksdb.titan.purge_obsolete_files_period = ReadableDuration::secs(1); From 8484ececb571a28094f3d316fcb6b71f7b2ff12e Mon Sep 17 00:00:00 2001 From: 3pointer Date: Sat, 4 Feb 2023 21:29:55 +0800 Subject: [PATCH 489/676] log-backup: support CA-bundle certifications (#14081) ref tikv/tikv#13867, ref pingcap/tidb#38775 --- .../src/metadata/store/lazy_etcd.rs | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 37ffbad37c4..7e8b7881070 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -7,7 +7,10 @@ use std::{ use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use openssl::x509::verify::X509VerifyFlags; +use openssl::{ + pkey::PKey, + x509::{verify::X509VerifyFlags, X509}, +}; use security::SecurityManager; use tikv_util::{ info, @@ -59,7 +62,20 @@ impl ConnectionConfig { // We haven't make it configurable because it is enabled in gRPC by default too. 
// TODO: Perhaps implement grpc-io based etcd client, fully remove the difference between gRPC TLS and our custom TLS? .manually(|c| c.cert_store_mut().set_flags(X509VerifyFlags::PARTIAL_CHAIN)) - .client_cert_pem_and_key(&tls.client_cert, &tls.client_key.0), + .manually(|c| { + let mut client_certs= X509::stack_from_pem(&tls.client_cert)?; + let client_key = PKey::private_key_from_pem(&tls.client_key.0)?; + if !client_certs.is_empty() { + c.set_certificate(&client_certs[0])?; + } + if client_certs.len() > 1 { + for i in client_certs.drain(1..) { + c.add_extra_chain_cert(i)?; + } + } + c.set_private_key(&client_key)?; + Ok(()) + }), ) } opts = opts From 14dd46d82e807933f7a2ee237632152aa5ea5e9f Mon Sep 17 00:00:00 2001 From: Ping Yu Date: Mon, 6 Feb 2023 13:07:57 +0800 Subject: [PATCH 490/676] rawkv: fix flaky integration test case `test_raw_put_key_guard` (#14140) close tikv/tikv#14141 rawkv: fix flaky integration test case `test_raw_put_key_guard`. Signed-off-by: Ping Yu --- tests/failpoints/cases/test_rawkv.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index 274a458958e..e228e82830c 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -276,7 +276,7 @@ fn test_region_merge() { suite.stop(); } -// Verify the raw key guard correctness in apiv2 +// Verify the raw key guard correctness in APIv2. #[test] fn test_raw_put_key_guard() { let mut suite = TestSuite::new(3, ApiVersion::V2); @@ -296,12 +296,19 @@ fn test_raw_put_key_guard() { let copy_test_key = test_key.clone(); let copy_test_value = test_value.clone(); - let apply_wait_timeout = 2000; // ms, assume send request and apply can be finished in 2s. 
fail::cfg(pause_write_fp, "pause").unwrap(); let handle = thread::spawn(move || { must_raw_put(&client, ctx, copy_test_key, copy_test_value); }); - thread::sleep(Duration::from_millis(apply_wait_timeout)); + + // Wait for global_min_lock_ts. + sleep_ms(500); + let start = Instant::now(); + while leader_cm.global_min_lock_ts().is_none() + && start.saturating_elapsed() < Duration::from_secs(5) + { + sleep_ms(200); + } // Before raw_put finish, min_ts should be the ts of "key guard" of the raw_put // request. From 41a89be5c36ece45084a56143ba387b1c8840055 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 6 Feb 2023 13:55:56 +0800 Subject: [PATCH 491/676] Integration test: use proc-macro to reuse test cases. (#14133) ref tikv/tikv#12842 Add proc-macro test_case to reuse test cases. Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- Cargo.lock | 10 ++ Cargo.toml | 2 + components/test_raftstore_macro/Cargo.toml | 13 ++ components/test_raftstore_macro/src/lib.rs | 151 +++++++++++++++++ scripts/check-bins.py | 2 +- tests/Cargo.toml | 1 + tests/integrations/raftstore/test_single.rs | 174 +++++++++----------- 7 files changed, 252 insertions(+), 101 deletions(-) create mode 100644 components/test_raftstore_macro/Cargo.toml create mode 100644 components/test_raftstore_macro/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index dd2869a7b10..0872b28c827 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5940,6 +5940,15 @@ dependencies = [ "txn_types", ] +[[package]] +name = "test_raftstore_macro" +version = "0.0.1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "test_sst_importer" version = "0.1.0" @@ -6051,6 +6060,7 @@ dependencies = [ "test_pd", "test_pd_client", "test_raftstore", + "test_raftstore_macro", "test_sst_importer", "test_storage", "test_util", diff --git a/Cargo.toml b/Cargo.toml index d76dce26a18..f7d44c94866 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -280,6 +280,7 @@ members = [ 
"components/test_pd", "components/test_pd_client", "components/test_raftstore", + "components/test_raftstore_macro", "components/test_sst_importer", "components/test_storage", "components/test_util", @@ -355,6 +356,7 @@ example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/exampl test_pd = { path = "components/test_pd" } test_pd_client = { path = "components/test_pd_client" } test_raftstore = { path = "components/test_raftstore", default-features = false } +test_raftstore_macro = { path = "components/test_raftstore_macro" } test_sst_importer = { path = "components/test_sst_importer" } test_storage = { path = "components/test_storage", default-features = false } test_util = { path = "components/test_util" } diff --git a/components/test_raftstore_macro/Cargo.toml b/components/test_raftstore_macro/Cargo.toml new file mode 100644 index 00000000000..7a05f56ed3d --- /dev/null +++ b/components/test_raftstore_macro/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "test_raftstore_macro" +version = "0.0.1" +edition = "2018" +publish = false + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = "1.0" +quote = "1" +syn = { version = "1", features = ["full", "extra-traits"] } diff --git a/components/test_raftstore_macro/src/lib.rs b/components/test_raftstore_macro/src/lib.rs new file mode 100644 index 00000000000..59a2c6f1273 --- /dev/null +++ b/components/test_raftstore_macro/src/lib.rs @@ -0,0 +1,151 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use proc_macro::TokenStream; +use proc_macro2::{TokenStream as TokenStream2, TokenTree}; +use quote::{quote, ToTokens}; +use syn::{parse_macro_input, parse_quote, Ident, ItemFn, Path}; + +/// test_case generate test cases using cluster creation method provided. 
+/// +/// ex: +/// #[test_case(test_raftstore::new_node_cluster)] +/// #[test_case(test_raftstore::new_server_cluster)] +/// #[test_case(test_raftstore_v2::new_node_cluster)] +/// fn test_something() { +/// let cluster = new_cluster(...) +/// } +/// +/// It generates three test cases as following: +/// +/// #[cfg(test)] +/// mod test_something { +/// #[test] +/// fn test_raftstore_new_node_cluster() { +/// use test_raftstore::new_node_cluster as new_cluster; +/// let mut cluster = new_cluster(0, 1); +/// } +/// +/// #[test] +/// fn test_raftstore_new_server_cluster() { +/// use test_raftstore::new_server_cluster as new_cluster; +/// let mut cluster = new_cluster(0, 1); +/// } +/// +/// #[test] +/// fn test_raftstore_v2_new_server_cluster() { +/// use test_raftstore::test_raftstore_v2 as new_cluster; +/// let mut cluster = new_cluster(0, 1); +/// } +/// } +#[proc_macro_attribute] +pub fn test_case(arg: TokenStream, input: TokenStream) -> TokenStream { + let mut fn_item = parse_macro_input!(input as ItemFn); + let mut test_cases = vec![TokenStream2::from(arg)]; + let mut attrs_to_remove = vec![]; + + let legal_test_case_name: Path = parse_quote!(test_case); + for (idx, attr) in fn_item.attrs.iter().enumerate() { + if legal_test_case_name == attr.path { + test_cases.push(attr.into_token_stream()); + attrs_to_remove.push(idx); + } + } + + for i in attrs_to_remove.into_iter().rev() { + fn_item.attrs.swap_remove(i); + } + + render_test_cases(test_cases, fn_item.clone()) +} + +fn render_test_cases(test_cases: Vec, fn_item: ItemFn) -> TokenStream { + let mut rendered_test_cases: Vec = vec![]; + for case in test_cases { + let mut item = fn_item.clone(); + + // Parse test case to get the package name and the method name + let (package, method) = parse_test_case(case); + let test_name = format!("{}_{}", package, method); + // Insert a use statment at the beginning of the test, + // ex: " use test_raftstore::new_node_cluster as new_cluster ", so we can use + // new_cluster in 
all situations. + item.block.stmts.insert( + 0, + syn::parse( + quote! { + use #package::#method as new_cluster; + } + .into(), + ) + .unwrap(), + ); + item.attrs.insert(0, parse_quote! { #[test] }); + let method_name = Ident::new(&test_name, item.sig.ident.span()); + item.sig.ident = method_name; + + rendered_test_cases.push(item.to_token_stream()); + } + + let mod_name = fn_item.sig.ident; + let output = quote! { + #[cfg(test)] + mod #mod_name { + #[allow(unused_imports)] + use super::*; + + #(#rendered_test_cases)* + } + }; + + output.into() +} + +// Parsing test case to get package name and method name. +// There are two cases that need to be considered +// 1. the first token is Ident type +// 2. the first token is Punct type +// +// use the following case as an example +// #[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore::new_server_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] +// fn test_something() {} +// +// The first case ( #[test_case(test_raftstore::new_node_cluster)] ) +// will be passed to the proc-macro "test_case" as the first argument and the +// #[test_case(...)] will be stripped off automatically. So the first token is +// the Ident type, namely "test_raftstore". +// +// The other two cases are in the `attr` fileds of ItemFn, and +// #[test_case(...)] are untouched. So the first token is Punct type. 
+fn parse_test_case(test_case: TokenStream2) -> (Ident, Ident) { + let mut iter = test_case.into_iter(); + let package = match iter.next().unwrap() { + // ex: test_raftstore::new_node_cluster + TokenTree::Ident(package) => package, + // ex: #[test_raftstore::new_node_cluster] + TokenTree::Punct(_) => match iter.next().unwrap() { + TokenTree::Group(group) => { + let mut iter = group.stream().into_iter(); + iter.next(); + match iter.next().unwrap() { + TokenTree::Group(group) => { + let stream = group.stream(); + return parse_test_case(stream); + } + _ => panic!("Invalid token stream"), + } + } + _ => panic!("Invalid token stream"), + }, + _ => panic!("Invalid token stream"), + }; + // Skip two ':' + iter.next(); + iter.next(); + let method = match iter.next().unwrap() { + TokenTree::Ident(method) => method, + _ => panic!("Invalid token stream"), + }; + (package, method) +} diff --git a/scripts/check-bins.py b/scripts/check-bins.py index aaa13e6b9de..1255472a76a 100644 --- a/scripts/check-bins.py +++ b/scripts/check-bins.py @@ -14,7 +14,7 @@ "online_config", "online_config_derive", "tidb_query_codegen", "panic_hook", "fuzz", "fuzzer_afl", "fuzzer_honggfuzz", "fuzzer_libfuzzer", "coprocessor_plugin_api", "example_coprocessor_plugin", "memory_trace_macros", "case_macros", - "tracker" + "tracker", "test_raftstore_macro" } JEMALLOC_SYMBOL = ["je_arena_boot", " malloc"] diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 1cc0e6bce87..96ee19e9bae 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -143,6 +143,7 @@ test_coprocessor = { workspace = true } test_pd = { workspace = true } test_pd_client = { workspace = true } test_raftstore = { workspace = true } +test_raftstore_macro = { workspace = true } test_sst_importer = { workspace = true } test_storage = { workspace = true } test_util = { workspace = true } diff --git a/tests/integrations/raftstore/test_single.rs b/tests/integrations/raftstore/test_single.rs index 73944428953..b7fcb6a7b34 100644 --- 
a/tests/integrations/raftstore/test_single.rs +++ b/tests/integrations/raftstore/test_single.rs @@ -6,11 +6,59 @@ use engine_traits::{CfName, CF_DEFAULT, CF_WRITE}; use raftstore::store::*; use rand::prelude::*; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::*, time::Instant}; // TODO add epoch not match test cases. -fn test_put(cluster: &mut Cluster) { +fn test_delete_range(cluster: &mut Cluster, cf: CfName) { + let data_set: Vec<_> = (1..500) + .map(|i| { + ( + format!("key{:08}", i).into_bytes(), + format!("value{}", i).into_bytes(), + ) + }) + .collect(); + for kvs in data_set.chunks(50) { + let requests = kvs.iter().map(|(k, v)| new_put_cf_cmd(cf, k, v)).collect(); + // key9 is always the last region. + cluster.batch_put(b"key9", requests).unwrap(); + } + + // delete_range request with notify_only set should not actually delete data. + cluster.must_notify_delete_range_cf(cf, b"", b""); + + let mut rng = rand::thread_rng(); + for _ in 0..50 { + let (k, v) = data_set.choose(&mut rng).unwrap(); + assert_eq!(cluster.get_cf(cf, k).unwrap(), *v); + } + + // Empty keys means the whole range. 
+ cluster.must_delete_range_cf(cf, b"", b""); + + for _ in 0..50 { + let k = &data_set.choose(&mut rng).unwrap().0; + assert!(cluster.get_cf(cf, k).is_none()); + } +} + +fn test_put_large_entry(cluster: &mut Cluster) { + let max_size: usize = 1024; + cluster.cfg.raft_store.raft_entry_max_size = ReadableSize(max_size as u64); + + cluster.run(); + + let large_value = vec![b'v'; max_size + 1]; + let res = cluster.put(b"key", large_value.as_slice()); + assert!(res.as_ref().err().unwrap().has_raft_entry_too_large()); +} + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +fn test_put() { + let mut cluster = new_cluster(0, 1); cluster.run(); let mut data_set: Vec<_> = (1..1000) @@ -53,7 +101,10 @@ fn test_put(cluster: &mut Cluster) { } } -fn test_delete(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +fn test_delete() { + let mut cluster = new_cluster(0, 1); cluster.run(); let data_set: Vec<_> = (1..1000) @@ -80,40 +131,30 @@ fn test_delete(cluster: &mut Cluster) { } } -fn test_delete_range(cluster: &mut Cluster, cf: CfName) { - let data_set: Vec<_> = (1..500) - .map(|i| { - ( - format!("key{:08}", i).into_bytes(), - format!("value{}", i).into_bytes(), - ) - }) - .collect(); - for kvs in data_set.chunks(50) { - let requests = kvs.iter().map(|(k, v)| new_put_cf_cmd(cf, k, v)).collect(); - // key9 is always the last region. - cluster.batch_put(b"key9", requests).unwrap(); - } - - // delete_range request with notify_only set should not actually delete data. - cluster.must_notify_delete_range_cf(cf, b"", b""); - - let mut rng = rand::thread_rng(); - for _ in 0..50 { - let (k, v) = data_set.choose(&mut rng).unwrap(); - assert_eq!(cluster.get_cf(cf, k).unwrap(), *v); - } - - // Empty keys means the whole range. 
- cluster.must_delete_range_cf(cf, b"", b""); +#[test] +fn test_node_use_delete_range() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.use_delete_range = true; + cluster.run(); + test_delete_range(&mut cluster, CF_DEFAULT); + // Prefix bloom filter is always enabled in the Write CF. + test_delete_range(&mut cluster, CF_WRITE); +} - for _ in 0..50 { - let k = &data_set.choose(&mut rng).unwrap().0; - assert!(cluster.get_cf(cf, k).is_none()); - } +#[test] +fn test_node_not_use_delete_range() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.use_delete_range = false; + cluster.run(); + test_delete_range(&mut cluster, CF_DEFAULT); + // Prefix bloom filter is always enabled in the Write CF. + test_delete_range(&mut cluster, CF_WRITE); } -fn test_wrong_store_id(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +fn test_wrong_store_id() { + let mut cluster = new_cluster(0, 1); cluster.run(); let (k, v) = (b"k", b"v"); @@ -137,73 +178,6 @@ fn test_wrong_store_id(cluster: &mut Cluster) { ); } -fn test_put_large_entry(cluster: &mut Cluster) { - let max_size: usize = 1024; - cluster.cfg.raft_store.raft_entry_max_size = ReadableSize(max_size as u64); - - cluster.run(); - - let large_value = vec![b'v'; max_size + 1]; - let res = cluster.put(b"key", large_value.as_slice()); - assert!(res.as_ref().err().unwrap().has_raft_entry_too_large()); -} - -#[test] -fn test_node_put() { - let mut cluster = new_node_cluster(0, 1); - test_put(&mut cluster); -} - -#[test] -fn test_node_delete() { - let mut cluster = new_node_cluster(0, 1); - test_delete(&mut cluster); -} - -#[test] -fn test_node_use_delete_range() { - let mut cluster = new_node_cluster(0, 1); - cluster.cfg.raft_store.use_delete_range = true; - cluster.run(); - test_delete_range(&mut cluster, CF_DEFAULT); - // Prefix bloom filter is always enabled in the Write CF. 
- test_delete_range(&mut cluster, CF_WRITE); -} - -#[test] -fn test_node_not_use_delete_range() { - let mut cluster = new_node_cluster(0, 1); - cluster.cfg.raft_store.use_delete_range = false; - cluster.run(); - test_delete_range(&mut cluster, CF_DEFAULT); - // Prefix bloom filter is always enabled in the Write CF. - test_delete_range(&mut cluster, CF_WRITE); -} - -#[test] -fn test_node_wrong_store_id() { - let mut cluster = new_node_cluster(0, 1); - test_wrong_store_id(&mut cluster); -} - -#[test] -fn test_server_put() { - let mut cluster = new_server_cluster(0, 1); - test_put(&mut cluster); -} - -#[test] -fn test_server_delete() { - let mut cluster = new_server_cluster(0, 1); - test_delete(&mut cluster); -} - -#[test] -fn test_server_wrong_store_id() { - let mut cluster = new_server_cluster(0, 1); - test_wrong_store_id(&mut cluster); -} - #[test] fn test_node_put_large_entry() { let mut cluster = new_node_cluster(0, 1); From 7c20add6cef90ac231db6d10374856f23d89e3f1 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 6 Feb 2023 16:31:57 +0800 Subject: [PATCH 492/676] raftstore, storage: return FlashbackNotPrepared error if the flashback commit check failed (#14145) close tikv/tikv#14143, ref tikv/tikv#14143 As https://github.com/tikv/tikv/issues/14143 mentioned, flashback should not return `TxnLockNotFound` error to the client if the flashback commit check failed, which will cause TiDB to retry the flashback forever. This PR changes this error to `FlashbackNotPrepared` to match the client handling logic. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/error_code/src/storage.rs | 1 + components/raftstore/src/errors.rs | 7 ++++--- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/util.rs | 2 +- components/raftstore/src/store/worker/read.rs | 2 +- etc/error_code.toml | 10 ++++++++++ src/storage/errors.rs | 9 +++++++++ src/storage/txn/actions/flashback_to_version.rs | 13 +++++++------ .../txn/commands/flashback_to_version_read_phase.rs | 1 + src/storage/txn/mod.rs | 7 +++++++ tests/integrations/server/kv_service.rs | 3 ++- 12 files changed, 45 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0872b28c827..485aeb43c52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2732,7 +2732,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#2b853bed812556901846f42820b63d8a0d9c8d24" +source = "git+https://github.com/pingcap/kvproto.git#eccad3776d7b076da68d6c51fb7506b8562b9802" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index ff994032dea..e2cf34094c3 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -21,6 +21,7 @@ define_error_codes!( BAD_FORMAT_WRITE => ("BadFormatWrite", "",""), KEY_IS_LOCKED => ("KeyIsLocked", "", ""), MAX_TIMESTAMP_NOT_SYNCED => ("MaxTimestampNotSynced", "", ""), + FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), DEADLINE_EXCEEDED => ("DeadlineExceeded", "", ""), API_VERSION_NOT_MATCHED => ("ApiVersionNotMatched", "", ""), INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 36fcec7f1f3..5deef832723 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -58,8 +58,8 @@ pub enum Error { #[error("region {0} is in the recovery progress")] 
RecoveryInProgress(u64), - #[error("region {0} is in the flashback progress")] - FlashbackInProgress(u64), + #[error("region {0} is in the flashback progress with start_ts {1}")] + FlashbackInProgress(u64, u64), #[error("region {0} not prepared the flashback")] FlashbackNotPrepared(u64), @@ -256,9 +256,10 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_recovery_in_progress(e); } - Error::FlashbackInProgress(region_id) => { + Error::FlashbackInProgress(region_id, flashback_start_ts) => { let mut e = errorpb::FlashbackInProgress::default(); e.set_region_id(region_id); + e.set_flashback_start_ts(flashback_start_ts); errorpb.set_flashback_in_progress(e); } Error::FlashbackNotPrepared(region_id) => { diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d5b73e5f721..05b443be4eb 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5234,7 +5234,7 @@ where true, ) { match e { - Error::FlashbackInProgress(_) => self + Error::FlashbackInProgress(..) => self .ctx .raft_metrics .invalid_proposal diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 0344adb2b92..0127cc5c7e6 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -364,7 +364,7 @@ pub fn check_flashback_state( return Ok(()); } } - return Err(Error::FlashbackInProgress(region_id)); + return Err(Error::FlashbackInProgress(region_id, flashback_start_ts)); } // If the region is not in the flashback state, the flashback request itself // should be rejected. 
diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 5d6835666b4..379af09eb2e 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -830,7 +830,7 @@ where Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() } - Error::FlashbackInProgress(_) => { + Error::FlashbackInProgress(..) => { m.borrow_mut().reject_reason.flashback_in_progress.inc() } _ => unreachable!(), diff --git a/etc/error_code.toml b/etc/error_code.toml index bb23c9b5e26..4fae4d9ea57 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -563,6 +563,11 @@ error = ''' KV:SstImporter:InvalidKeyMode ''' +["KV:SstImporter:ResourceNotEnough"] +error = ''' +KV:SstImporter:ResourceNotEnough +''' + ["KV:Storage:Timeout"] error = ''' KV:Storage:Timeout @@ -653,6 +658,11 @@ error = ''' KV:Storage:MaxTimestampNotSynced ''' +["KV:Storage:FlashbackNotPrepared"] +error = ''' +KV:Storage:FlashbackNotPrepared +''' + ["KV:Storage:DeadlineExceeded"] error = ''' KV:Storage:DeadlineExceeded diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 2b41cf23ea2..92568d22e45 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -255,6 +255,15 @@ pub fn extract_region_error_from_error(e: &Error) -> Option { err.set_max_timestamp_not_synced(Default::default()); Some(err) } + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::FlashbackNotPrepared( + region_id, + )))) => { + let mut err = errorpb::Error::default(); + let mut flashback_not_prepared_err = errorpb::FlashbackNotPrepared::default(); + flashback_not_prepared_err.set_region_id(*region_id); + err.set_flashback_not_prepared(flashback_not_prepared_err); + Some(err) + } Error(box ErrorInner::SchedTooBusy) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); diff --git a/src/storage/txn/actions/flashback_to_version.rs 
b/src/storage/txn/actions/flashback_to_version.rs index f44854159c0..bb0c95eb935 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -233,6 +233,7 @@ pub fn check_flashback_commit( key_to_commit: &Key, flashback_start_ts: TimeStamp, flashback_commit_ts: TimeStamp, + region_id: u64, ) -> TxnResult { match reader.load_lock(key_to_commit)? { // If the lock exists, it means the flashback hasn't been finished. @@ -241,7 +242,7 @@ pub fn check_flashback_commit( return Ok(false); } error!( - "check flashback commit exception: lock not found"; + "check flashback commit exception: lock record mismatched"; "key_to_commit" => log_wrappers::Value::key(key_to_commit.as_encoded()), "flashback_start_ts" => flashback_start_ts, "flashback_commit_ts" => flashback_commit_ts, @@ -266,11 +267,11 @@ pub fn check_flashback_commit( ); } } - Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { - start_ts: flashback_start_ts, - commit_ts: flashback_commit_ts, - key: key_to_commit.to_raw()?, - })) + // If both the flashback lock and commit records are mismatched, it means + // the current region is not in the flashback state. + Err(txn::Error::from(txn::ErrorInner::FlashbackNotPrepared( + region_id, + ))) } pub fn get_first_user_key( diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 7fdc86288c2..4be0239aad2 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -213,6 +213,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { &start_key, self.start_ts, self.commit_ts, + self.ctx.get_region_id(), )? 
{ statistics.add(&reader.statistics); return Ok(ProcessResult::Res); diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index d3b199208cb..f43e309f503 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -141,6 +141,9 @@ pub enum ErrorInner { start_ts: {start_ts}, region_id: {region_id}" )] MaxTimestampNotSynced { region_id: u64, start_ts: TimeStamp }, + + #[error("region {0} not prepared the flashback")] + FlashbackNotPrepared(u64), } impl ErrorInner { @@ -174,6 +177,9 @@ impl ErrorInner { region_id, start_ts, }), + ErrorInner::FlashbackNotPrepared(region_id) => { + Some(ErrorInner::FlashbackNotPrepared(region_id)) + } ErrorInner::Other(_) | ErrorInner::ProtoBuf(_) | ErrorInner::Io(_) => None, } } @@ -224,6 +230,7 @@ impl ErrorCodeExt for Error { ErrorInner::MaxTimestampNotSynced { .. } => { error_code::storage::MAX_TIMESTAMP_NOT_SYNCED } + ErrorInner::FlashbackNotPrepared(_) => error_code::storage::FLASHBACK_NOT_PREPARED, } } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 61a3fb39097..30dd3b120ca 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -800,7 +800,8 @@ fn test_mvcc_flashback_unprepared() { req.set_start_key(b"a".to_vec()); req.set_end_key(b"z".to_vec()); let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(resp.get_error().contains("txn lock not found")); + assert!(resp.get_region_error().has_flashback_not_prepared()); + assert!(resp.get_error().is_empty()); must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); // Flashback with preparing. must_flashback_to_version(&client, ctx.clone(), 0, 6, 7); From 2e7aede3d8d11b07b0c83d920f06238790861968 Mon Sep 17 00:00:00 2001 From: Lucas Date: Tue, 7 Feb 2023 11:33:57 +0800 Subject: [PATCH 493/676] raftstore: support dynamically resize the count of async ios. (#13965) close tikv/tikv#13964 Support dynamically modify the count of async-ios. 
Signed-off-by: Lucasliang --- .../raftstore/src/store/async_io/write.rs | 143 ++++++++++--- .../src/store/async_io/write_router.rs | 91 +++++++-- components/raftstore/src/store/config.rs | 28 ++- components/raftstore/src/store/fsm/store.rs | 21 +- .../src/store/worker/refresh_config.rs | 100 ++++++++- .../integrations/config/dynamic/raftstore.rs | 69 ++++++- .../integrations/raftstore/test_scale_pool.rs | 189 ++++++++++++++++++ 7 files changed, 572 insertions(+), 69 deletions(-) diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 4d8392edd55..9b25d7de806 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -21,10 +21,11 @@ use engine_traits::{ use error_code::ErrorCodeExt; use fail::fail_point; use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; +use parking_lot::Mutex; use protobuf::Message; use raft::eraftpb::Entry; use resource_control::{ - channel::{bounded, Receiver, Sender}, + channel::{bounded, Receiver}, ResourceController, ResourceMetered, }; use tikv_util::{ @@ -37,7 +38,7 @@ use tikv_util::{ warn, }; -use super::write_router::WriteSenders; +use super::write_router::{SharedSenders, WriteSenders}; use crate::{ store::{ config::Config, @@ -874,22 +875,41 @@ where } } +#[derive(Clone)] +pub struct StoreWritersContext +where + EK: KvEngine, + ER: RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, +{ + pub store_id: u64, + pub raft_engine: ER, + pub kv_engine: Option, + pub transfer: T, + pub notifier: N, + pub cfg: Arc>, +} + +#[derive(Clone)] pub struct StoreWriters where EK: KvEngine, ER: RaftEngine, { resource_ctl: Option>, - writers: Vec>>, - handlers: Vec>, + /// Mailboxes for sending raft messages to async ios. + writers: Arc>>, + /// Background threads for handling asynchronous messages. 
+ handlers: Arc>>>, } impl StoreWriters { pub fn new(resource_ctl: Option>) -> Self { Self { resource_ctl, - writers: vec![], - handlers: vec![], + writers: Arc::new(VersionTrack::default()), + handlers: Arc::new(Mutex::new(vec![])), } } } @@ -913,42 +933,99 @@ where cfg: &Arc>, ) -> Result<()> { let pool_size = cfg.value().store_io_pool_size; - for i in 0..pool_size { - let tag = format!("store-writer-{}", i); - let (tx, rx) = bounded( - self.resource_ctl.clone(), - cfg.value().store_io_notify_capacity, - ); - let mut worker = Worker::new( - store_id, - tag.clone(), - raft_engine.clone(), - kv_engine.clone(), - rx, - notifier.clone(), - trans.clone(), - cfg, - ); - info!("starting store writer {}", i); - let t = thread::Builder::new() - .name(thd_name!(tag)) - .spawn_wrapper(move || { - worker.run(); - })?; - self.writers.push(tx); - self.handlers.push(t); + if pool_size > 0 { + self.increase_to( + pool_size, + StoreWritersContext { + store_id, + notifier: notifier.clone(), + raft_engine, + kv_engine, + transfer: trans.clone(), + cfg: cfg.clone(), + }, + )?; } Ok(()) } pub fn shutdown(&mut self) { - assert_eq!(self.writers.len(), self.handlers.len()); - for (i, handler) in self.handlers.drain(..).enumerate() { + let mut handlers = self.handlers.lock(); + let writers = self.writers.value().get(); + assert_eq!(writers.len(), handlers.len()); + for (i, handler) in handlers.drain(..).enumerate() { info!("stopping store writer {}", i); - self.writers[i].send(WriteMsg::Shutdown, 0).unwrap(); + writers[i].send(WriteMsg::Shutdown, 0).unwrap(); handler.join().unwrap(); } } + + #[inline] + /// Returns the valid size of store writers. + pub fn size(&self) -> usize { + self.writers.value().get().len() + } + + pub fn decrease_to(&mut self, size: usize) -> Result<()> { + // Only update logical version of writers but not destroying the workers, so + // that peers that are still using the writer_id (because there're + // unpersisted tasks) can proceed to finish their tasks. 
After the peer + // gets rescheduled, it will use a new writer_id within the new + // capacity, specified by refreshed `store-io-pool-size`. + // + // TODO: find an elegant way to effectively free workers. + assert_eq!(self.writers.value().get().len(), self.handlers.lock().len()); + self.writers + .update(move |writers: &mut SharedSenders| -> Result<()> { + assert!(writers.get().len() > size); + Ok(()) + })?; + Ok(()) + } + + pub fn increase_to( + &mut self, + size: usize, + writer_meta: StoreWritersContext, + ) -> Result<()> { + let mut handlers = self.handlers.lock(); + let current_size = self.writers.value().get().len(); + assert_eq!(current_size, handlers.len()); + let resource_ctl = self.resource_ctl.clone(); + self.writers + .update(move |writers: &mut SharedSenders| -> Result<()> { + let mut cached_senders = writers.get(); + for i in current_size..size { + let tag = format!("store-writer-{}", i); + let (tx, rx) = bounded( + resource_ctl.clone(), + writer_meta.cfg.value().store_io_notify_capacity, + ); + let mut worker = Worker::new( + writer_meta.store_id, + tag.clone(), + writer_meta.raft_engine.clone(), + writer_meta.kv_engine.clone(), + rx, + writer_meta.notifier.clone(), + writer_meta.transfer.clone(), + &writer_meta.cfg, + ); + info!("starting store writer {}", i); + let t = + thread::Builder::new() + .name(thd_name!(tag)) + .spawn_wrapper(move || { + worker.run(); + })?; + cached_senders.push(tx); + handlers.push(t); + } + writers.set(cached_senders); + Ok(()) + })?; + Ok(()) + } } /// Used for test to write task to kv db and raft db. 
diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index ead22f70b28..d00007a9485 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -16,7 +16,11 @@ use std::{ use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; use resource_control::channel::Sender; -use tikv_util::{info, time::Instant}; +use tikv_util::{ + config::{Tracker, VersionTrack}, + error, info, safe_panic, + time::Instant, +}; use crate::store::{ async_io::write::WriteMsg, config::Config, fsm::store::PollContext, local_metrics::RaftMetrics, @@ -163,13 +167,14 @@ where if self.last_unpersisted.is_some() { return false; } - if ctx.config().store_io_pool_size <= 1 { - self.writer_id = 0; - return true; - } + // Local senders may not be updated when `store_io_pool_size()` has been + // increased by the `ctx.config().update()`, keep the real size until it's + // updated by `poller.begin()`. + let async_io_pool_size = + std::cmp::min(ctx.write_senders().size(), ctx.config().store_io_pool_size); if last_unpersisted.is_none() { // If no previous pending ready, we can randomly select a new writer worker. - self.writer_id = rand::random::() % ctx.config().store_io_pool_size; + self.writer_id = rand::random::() % async_io_pool_size; self.next_retry_time = Instant::now_coarse() + ctx.config().io_reschedule_hotpot_duration.0; self.next_writer_id = None; @@ -188,7 +193,7 @@ where // The hot write peers should not be rescheduled entirely. // So it will not be rescheduled if the random id is the same as the original // one. 
- let new_id = rand::random::() % ctx.config().store_io_pool_size; + let new_id = rand::random::() % async_io_pool_size; if new_id == self.writer_id { // Reset the time self.next_retry_time = now + ctx.config().io_reschedule_hotpot_duration.0; @@ -238,7 +243,7 @@ where let now = Instant::now(); if sender.send(msg, self.last_msg_priority).is_err() { // Write threads are destroyed after store threads during shutdown. - panic!("{} failed to send write msg, err: disconnected", self.tag); + safe_panic!("{} failed to send write msg, err: disconnected", self.tag); } ctx.raft_metrics() .write_block_wait @@ -246,31 +251,87 @@ where } Err(TrySendError::Disconnected(_)) => { // Write threads are destroyed after store threads during shutdown. - panic!("{} failed to send write msg, err: disconnected", self.tag); + safe_panic!("{} failed to send write msg, err: disconnected", self.tag); } } } } +/// Safefly shared senders among the controller and raftstore threads. +/// Senders in it can only be accessed by cloning method `senders()`. +/// +/// `Clone` is safe to race with concurrent `Sender.send()` because the +/// `RefCell` field `last_msg_group` in `Sender` is skipped. +#[derive(Clone)] +pub struct SharedSenders(Vec>>); + +impl Default for SharedSenders { + fn default() -> Self { + Self(vec![]) + } +} + +impl SharedSenders { + #[inline] + pub fn get(&self) -> Vec>> { + self.0.clone() + } + + #[inline] + pub fn set(&mut self, senders: Vec>>) { + self.0 = senders; + } +} + +/// All `Sender`s in `SharedSenders` are shared by the global controller +/// thread and raftstore threads. There won't exist concurrent `Sender.send()` +/// calling scenarios among threads on a same `Sender`. +/// On the one hand, th controller thread will not call `Sender.send()` to +/// consume resources to send messages, just updating the size of `Sender`s if +/// `store-io-pool-size` is resized. 
On the other hand, each raftstore thread +/// just use its local cloned `Sender`s for sending messages and update it at +/// `begin()`, the first stage for processing messages. +/// Therefore, it's safe to manually remain `Send` trait for +/// `SharedSenders`. +/// +/// TODO: use an elegant implementation, such as `Mutex`, to avoid this +/// hack for sharing `Sender`s among multi-threads. +unsafe impl Sync for SharedSenders {} + /// Senders for asynchronous writes. There can be multiple senders, generally /// you should use `WriteRouter` to decide which sender to be used. #[derive(Clone)] pub struct WriteSenders { - write_senders: Vec>>, + senders: Tracker>, + cached_senders: Vec>>, io_reschedule_concurrent_count: Arc, } impl WriteSenders { - pub fn new(write_senders: Vec>>) -> Self { + pub fn new(senders: Arc>>) -> Self { + let cached_senders = senders.value().get(); WriteSenders { - write_senders, + senders: senders.tracker("async writers' tracker".to_owned()), + cached_senders, io_reschedule_concurrent_count: Arc::default(), } } #[inline] pub fn is_empty(&self) -> bool { - self.write_senders.is_empty() + self.cached_senders.is_empty() + } + + #[inline] + pub fn size(&self) -> usize { + self.cached_senders.len() + } + + #[inline] + pub fn refresh(&mut self) { + if let Some(senders) = self.senders.any_new() { + self.cached_senders = senders.get(); + } } } @@ -279,7 +340,7 @@ impl Index for WriteSenders { #[inline] fn index(&self, index: usize) -> &Sender> { - &self.write_senders[index] + &self.cached_senders[index] } } @@ -329,7 +390,7 @@ pub(crate) mod tests { Self { receivers, ctx: TestContext { - senders: WriteSenders::new(senders), + senders: WriteSenders::new(Arc::new(VersionTrack::new(SharedSenders(senders)))), config, raft_metrics: RaftMetrics::new(true), }, diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index d6994a16ed4..6667a46c4e5 100644 --- a/components/raftstore/src/store/config.rs +++ 
b/components/raftstore/src/store/config.rs @@ -206,7 +206,6 @@ pub struct Config { pub store_batch_system: BatchSystemConfig, /// If it is 0, it means io tasks are handled in store threads. - #[online_config(skip)] pub store_io_pool_size: usize, #[online_config(skip)] @@ -1049,8 +1048,25 @@ impl ConfigManager for RaftstoreConfigManager { ) -> std::result::Result<(), Box> { { let change = change.clone(); - self.config - .update(move |cfg: &mut Config| cfg.update(change))?; + self.config.update(move |cfg: &mut Config| { + // Currently, it's forbidden to modify the write mode either from `async` to + // `sync` or from `sync` to `async`. + if let Some(ConfigValue::Usize(resized_io_size)) = change.get("store_io_pool_size") + { + if cfg.store_io_pool_size == 0 && *resized_io_size > 0 { + return Err( + "SYNC mode, not allowed to resize the size of store-io-pool-size" + .into(), + ); + } else if cfg.store_io_pool_size > 0 && *resized_io_size == 0 { + return Err( + "ASYNC mode, not allowed to be set to SYNC mode by resizing store-io-pool-size to 0" + .into(), + ); + } + } + cfg.update(change) + })?; } if let Some(ConfigValue::Module(raft_batch_system_change)) = change.get("store_batch_system") @@ -1062,6 +1078,12 @@ impl ConfigManager for RaftstoreConfigManager { { self.schedule_config_change(RaftStoreBatchComponent::Apply, apply_batch_system_change); } + if let Some(ConfigValue::Usize(resized_io_size)) = change.get("store_io_pool_size") { + let resize_io_task = RefreshConfigTask::ScaleWriters(*resized_io_size); + if let Err(e) = self.scheduler.schedule(resize_io_task) { + error!("raftstore configuration manager schedule to resize store-io-pool-size work task failed"; "err"=> ?e); + } + } info!( "raftstore config changed"; "change" => ?change, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 66acd187215..4b9e69f9763 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ 
b/components/raftstore/src/store/fsm/store.rs @@ -71,7 +71,7 @@ use crate::{ store::{ async_io::{ read::{ReadRunner, ReadTask}, - write::{StoreWriters, Worker as WriteWorker, WriteMsg}, + write::{StoreWriters, StoreWritersContext, Worker as WriteWorker, WriteMsg}, write_router::WriteSenders, }, config::Config, @@ -907,6 +907,8 @@ impl PollHandler, St self.poll_ctx.update_ticks_timeout(); update_cfg(&incoming.store_batch_system); } + // update store writers if necessary + self.poll_ctx.write_senders.refresh(); } fn handle_control(&mut self, store: &mut StoreFsm) -> Option { @@ -1052,7 +1054,13 @@ impl PollHandler, St } } } else { - let writer_id = rand::random::() % self.poll_ctx.cfg.store_io_pool_size; + // Use the valid size of async-ios for generating `writer_id` when the local + // senders haven't been updated by `poller.begin(). + let writer_id = rand::random::() + % std::cmp::min( + self.poll_ctx.cfg.store_io_pool_size, + self.poll_ctx.write_senders.size(), + ); if let Err(err) = self.poll_ctx.write_senders[writer_id].try_send( WriteMsg::LatencyInspect { send_time: write_begin, @@ -1733,6 +1741,15 @@ impl RaftBatchSystem { .spawn("apply".to_owned(), apply_poller_builder); let refresh_config_runner = RefreshConfigRunner::new( + StoreWritersContext { + store_id: store.get_id(), + notifier: self.router.clone(), + raft_engine: raft_builder.engines.raft.clone(), + kv_engine: Some(raft_builder.engines.kv.clone()), + transfer: raft_builder.trans.clone(), + cfg: raft_builder.cfg.clone(), + }, + self.store_writers.clone(), self.apply_router.router.clone(), self.router.router.clone(), self.apply_system.build_pool_state(apply_builder), diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index ff34b9abb4e..7ba0476d381 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -11,10 +11,15 @@ use tikv_util::{ debug, error, 
info, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, worker::Runnable, }; -use crate::store::fsm::{ - apply::{ApplyFsm, ControlFsm}, - store::StoreFsm, - PeerFsm, +use crate::store::{ + async_io::write::{StoreWriters, StoreWritersContext}, + fsm::{ + apply::{ApplyFsm, ControlFsm}, + store::{RaftRouter, StoreFsm}, + PeerFsm, + }, + transport::Transport, + PersistedNotifier, }; pub struct PoolController> { @@ -110,6 +115,38 @@ where } } +struct WriterContoller +where + EK: engine_traits::KvEngine, + ER: engine_traits::RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, +{ + writer_meta: StoreWritersContext, + store_writers: StoreWriters, + expected_writers_size: usize, +} + +impl WriterContoller +where + EK: engine_traits::KvEngine, + ER: engine_traits::RaftEngine, + T: Transport + 'static, + N: PersistedNotifier, +{ + pub fn new( + writer_meta: StoreWritersContext, + store_writers: StoreWriters, + ) -> Self { + let writers_size = store_writers.size(); + Self { + writer_meta, + store_writers, + expected_writers_size: writers_size, + } + } +} + #[derive(Debug, Clone, Copy)] pub enum BatchComponent { Store, @@ -133,6 +170,7 @@ impl Display for BatchComponent { pub enum Task { ScalePool(BatchComponent, usize), ScaleBatchSize(BatchComponent, usize), + ScaleWriters(usize), } impl Display for Task { @@ -144,38 +182,48 @@ impl Display for Task { Task::ScaleBatchSize(component, size) => { write!(f, "Scale max_batch_size adjusts {}: {} ", component, size) } + Task::ScaleWriters(size) => { + write!(f, "Scale store_io_pool_size adjusts {} ", size) + } } } } -pub struct Runner +pub struct Runner where EK: engine_traits::KvEngine, ER: engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm>, RH: HandlerBuilder, StoreFsm>, + T: Transport + 'static, { + writer_ctrl: WriterContoller>, apply_pool: PoolController, ControlFsm, AH>, raft_pool: PoolController, StoreFsm, RH>, } -impl Runner +impl Runner where EK: engine_traits::KvEngine, ER: 
engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm>, RH: HandlerBuilder, StoreFsm>, + T: Transport + 'static, { pub fn new( + writer_meta: StoreWritersContext>, + store_writers: StoreWriters, apply_router: BatchRouter, ControlFsm>, raft_router: BatchRouter, StoreFsm>, apply_pool_state: PoolState, ControlFsm, AH>, raft_pool_state: PoolState, StoreFsm, RH>, ) -> Self { + let writer_ctrl = WriterContoller::new(writer_meta, store_writers); let apply_pool = PoolController::new(apply_router, apply_pool_state); let raft_pool = PoolController::new(raft_router, raft_pool_state); Runner { + writer_ctrl, apply_pool, raft_pool, } @@ -187,7 +235,7 @@ where match current_pool_size.cmp(&size) { std::cmp::Ordering::Greater => self.raft_pool.decrease_by(current_pool_size - size), std::cmp::Ordering::Less => self.raft_pool.increase_by(size - current_pool_size), - std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Equal => return, } self.raft_pool.cleanup_poller_threads(); info!( @@ -203,7 +251,7 @@ where match current_pool_size.cmp(&size) { std::cmp::Ordering::Greater => self.apply_pool.decrease_by(current_pool_size - size), std::cmp::Ordering::Less => self.apply_pool.increase_by(size - current_pool_size), - std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Equal => return, } self.apply_pool.cleanup_poller_threads(); info!( @@ -212,14 +260,47 @@ where "to" => self.apply_pool.state.expected_pool_size ); } + + /// Resizes the count of background threads in store_writers. + fn resize_store_writers(&mut self, size: usize) { + // The resizing of store writers will not directly update the local cached + // store writers in each poller. Each poller will timely correct its local + // cached in its next `poller.begin()` after the resize operation completed. 
+ let current_size = self.writer_ctrl.expected_writers_size; + self.writer_ctrl.expected_writers_size = size; + match current_size.cmp(&size) { + std::cmp::Ordering::Greater => { + if let Err(e) = self.writer_ctrl.store_writers.decrease_to(size) { + error!("failed to decrease store writers size"; "err_msg" => ?e); + } + } + std::cmp::Ordering::Less => { + let writer_meta = self.writer_ctrl.writer_meta.clone(); + if let Err(e) = self + .writer_ctrl + .store_writers + .increase_to(size, writer_meta) + { + error!("failed to increase store writers size"; "err_msg" => ?e); + } + } + std::cmp::Ordering::Equal => return, + } + info!( + "resize store writers pool"; + "from" => current_size, + "to" => size + ); + } } -impl Runnable for Runner +impl Runnable for Runner where EK: engine_traits::KvEngine, ER: engine_traits::RaftEngine, AH: HandlerBuilder, ControlFsm> + std::marker::Send, RH: HandlerBuilder, StoreFsm> + std::marker::Send, + T: Transport + 'static, { type Task = Task; @@ -237,6 +318,7 @@ where self.apply_pool.state.max_batch_size = size; } }, + Task::ScaleWriters(size) => self.resize_store_writers(size), } } } diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index ff1babb7e1f..003d63d9a47 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -141,19 +141,19 @@ where rx.recv_timeout(Duration::from_secs(3)).unwrap(); } +fn new_changes(cfgs: Vec<(&str, &str)>) -> std::collections::HashMap { + std::collections::HashMap::from_iter( + cfgs.into_iter() + .map(|kv| (kv.0.to_owned(), kv.1.to_owned())), + ) +} + #[test] fn test_update_raftstore_config() { let (mut config, _dir) = TikvConfig::with_tmp().unwrap(); config.validate().unwrap(); let (cfg_controller, router, _, mut system) = start_raftstore(config.clone(), &_dir); - let new_changes = |cfgs: Vec<(&str, &str)>| { - std::collections::HashMap::from_iter( - cfgs.into_iter() - .map(|kv| 
(kv.0.to_owned(), kv.1.to_owned())), - ) - }; - // dispatch updated config let change = new_changes(vec![ ("raftstore.messages-per-tick", "12345"), @@ -224,3 +224,58 @@ fn test_update_raftstore_config() { system.shutdown(); } + +#[test] +fn test_update_raftstore_io_config() { + // Test update raftstore configurations on io settings. + // Start from SYNC mode. + { + let (mut resize_config, _dir) = TikvConfig::with_tmp().unwrap(); + resize_config.validate().unwrap(); + let (cfg_controller, _, _, mut system) = start_raftstore(resize_config, &_dir); + + // not allowed to resize from SYNC mode to ASYNC mode + let resize_store_writers_cfg = vec![("raftstore.store-io-pool-size", "2")]; + assert!( + cfg_controller + .update(new_changes(resize_store_writers_cfg)) + .is_err() + ); + system.shutdown(); + } + // Start from ASYNC mode. + { + let (mut resize_config, _dir) = TikvConfig::with_tmp().unwrap(); + resize_config.raft_store.store_io_pool_size = 2; + resize_config.validate().unwrap(); + let (cfg_controller, _, _, mut system) = start_raftstore(resize_config, &_dir); + + // not allowed to resize from ASYNC mode to SYNC mode + let resize_store_writers_cfg = vec![("raftstore.store-io-pool-size", "0")]; + assert!( + cfg_controller + .update(new_changes(resize_store_writers_cfg)) + .is_err() + ); + system.shutdown(); + } + // Modify the size of async-ios. + { + let (mut resize_config, _dir) = TikvConfig::with_tmp().unwrap(); + resize_config.raft_store.store_io_pool_size = 2; + resize_config.validate().unwrap(); + let (cfg_controller, _, _, mut system) = start_raftstore(resize_config, &_dir); + + // resize the count of ios to 1 by decreasing. + let resize_store_writers_cfg = vec![("raftstore.store-io-pool-size", "1")]; + cfg_controller + .update(new_changes(resize_store_writers_cfg)) + .unwrap(); + // resize the count of ios to 4 by increasing. 
+ let resize_store_writers_cfg = vec![("raftstore.store-io-pool-size", "4")]; + cfg_controller + .update(new_changes(resize_store_writers_cfg)) + .unwrap(); + system.shutdown(); + } +} diff --git a/tests/integrations/raftstore/test_scale_pool.rs b/tests/integrations/raftstore/test_scale_pool.rs index 1672e57ae02..794cf90f4cb 100644 --- a/tests/integrations/raftstore/test_scale_pool.rs +++ b/tests/integrations/raftstore/test_scale_pool.rs @@ -157,3 +157,192 @@ fn test_decrease_pool() { cluster.must_put(b"k2", b"v2"); must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } + +fn get_async_writers_tids() -> Vec { + let prefix = "store-writer-"; + let mut writers_tids = vec![]; + let pid = thread::process_id(); + let all_tids: Vec<_> = thread::thread_ids(pid).unwrap(); + for tid in all_tids { + if let Ok(stat) = thread::full_thread_stat(pid, tid) { + if stat.command.starts_with(prefix) { + writers_tids.push(tid); + } + } + } + writers_tids +} + +#[test] +fn test_increase_async_ios() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.store_io_pool_size = 1; + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // Save current async-io tids before shrinking + let org_writers_tids = get_async_writers_tids(); + assert_eq!(1, org_writers_tids.len()); + // Request can be handled as usual + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + + // Update config, expand from 1 to 2 + { + let sim = cluster.sim.rl(); + let cfg_controller = sim.get_cfg_controller().unwrap(); + + let change = { + let mut change = HashMap::new(); + change.insert("raftstore.store-io-pool-size".to_owned(), "2".to_owned()); + change + }; + + cfg_controller.update(change).unwrap(); + assert_eq!( + cfg_controller.get_current().raft_store.store_io_pool_size, + 2 + ); + // Wait for the completion of increasing async-ios + std::thread::sleep(std::time::Duration::from_secs(1)); + } + // Save current async-io tids after scaling up, and 
compared with the + // orginial one before scaling up, the thread num should be added up to TWO. + let cur_writers_tids = get_async_writers_tids(); + assert_eq!(cur_writers_tids.len() - 1, org_writers_tids.len()); + + // Request can be handled as usual + cluster.must_put(b"k2", b"v2"); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); +} + +#[test] +fn test_decrease_async_ios() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.store_io_pool_size = 4; + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // Save current async-io tids before shrinking + let org_writers_tids = get_async_writers_tids(); + assert_eq!(4, org_writers_tids.len()); + // Request can be handled as usual + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + + // Update config, shrink from 4 to 1 + { + let sim = cluster.sim.rl(); + let cfg_controller = sim.get_cfg_controller().unwrap(); + let change = { + let mut change = HashMap::new(); + change.insert("raftstore.store-io-pool-size".to_owned(), "1".to_owned()); + change + }; + + cfg_controller.update(change).unwrap(); + assert_eq!( + cfg_controller.get_current().raft_store.store_io_pool_size, + 1 + ); + // Wait for the completion of decreasing async-ios + std::thread::sleep(std::time::Duration::from_secs(1)); + } + + // Save current async-io tids after scaling down, and compared with the + // orginial one before shrinking. As the decreasing of async-ios won't + // release asynchronous writers, the thread num should not be updated. 
+ let cur_writers_tids = get_async_writers_tids(); + assert_eq!(cur_writers_tids.len(), org_writers_tids.len()); + // After shrinking, all the left tids must be there before + for tid in cur_writers_tids { + assert!(org_writers_tids.contains(&tid)); + } + // Request can be handled as usual + cluster.must_put(b"k2", b"v2"); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); +} + +#[test] +fn test_resize_async_ios_failed_1() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.store_io_pool_size = 2; + cluster.pd_client.disable_default_operator(); + cluster.run(); + + // Save current async-io tids before shrinking + let org_writers_tids = get_async_writers_tids(); + assert_eq!(2, org_writers_tids.len()); + // Request can be handled as usual + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + + // Update config, expand from async-mode(async-ios == 2) to + // sync-mode(async-ios == 0). + { + let sim = cluster.sim.rl(); + let cfg_controller = sim.get_cfg_controller().unwrap(); + + let change = { + let mut change = HashMap::new(); + change.insert("raftstore.store-io-pool-size".to_owned(), "0".to_owned()); + change + }; + + assert!(cfg_controller.update(change).is_err()); + assert_eq!( + cfg_controller.get_current().raft_store.store_io_pool_size, + 2 + ); + } + // Save current async-io tids after scaling up, and compared with the + // orginial one before scaling up, the thread num should be added up to TWO. 
+ let cur_writers_tids = get_async_writers_tids(); + assert_eq!(cur_writers_tids.len(), org_writers_tids.len()); + + // Request can be handled as usual + cluster.must_put(b"k2", b"v2"); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); +} + +#[test] +fn test_resize_async_ios_failed_2() { + let mut cluster = new_node_cluster(0, 1); + cluster.cfg.raft_store.store_io_pool_size = 0; + cluster.pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + // Save current async-io tids before shrinking + let org_writers_tids = get_async_writers_tids(); + assert_eq!(0, org_writers_tids.len()); + // Request can be handled as usual + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); + + // Update config, expand from sync-mode(async-ios == 0) to + // async-mode(async-ios == 2). + { + let sim = cluster.sim.rl(); + let cfg_controller = sim.get_cfg_controller().unwrap(); + + let change = { + let mut change = HashMap::new(); + change.insert("raftstore.store-io-pool-size".to_owned(), "2".to_owned()); + change + }; + + assert!(cfg_controller.update(change).is_err()); + assert_eq!( + cfg_controller.get_current().raft_store.store_io_pool_size, + 0 + ); + } + // Save current async-io tids after scaling up, and compared with the + // orginial one before scaling up, the thread num should be added up to TWO. 
+ let cur_writers_tids = get_async_writers_tids(); + assert_eq!(cur_writers_tids.len(), org_writers_tids.len()); + + // Request can be handled as usual + cluster.must_put(b"k2", b"v2"); + must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); +} From 26813ba8525f309160c6691893ef93e5ed1bf34e Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 7 Feb 2023 13:01:57 +0800 Subject: [PATCH 494/676] raftstore-v2: fix tablet gc issues (#14125) close tikv/tikv#14115 Signed-off-by: tabokie Co-authored-by: Ti Chi Robot --- Cargo.lock | 6 +- components/engine_panic/src/misc.rs | 4 + components/engine_rocks/src/misc.rs | 9 + components/engine_test/src/lib.rs | 4 +- components/engine_traits/src/misc.rs | 4 + .../raftstore-v2/src/worker/tablet_gc.rs | 164 +++++++++++++++--- 6 files changed, 158 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 485aeb43c52..473058ffd9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2861,7 +2861,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" +source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2880,7 +2880,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" +source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" dependencies = [ "bzip2-sys", "cc", @@ -4797,7 +4797,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" +source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/components/engine_panic/src/misc.rs 
b/components/engine_panic/src/misc.rs index 93218767ec0..5603bf43c77 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -68,6 +68,10 @@ impl MiscExt for PanicEngine { panic!() } + fn continue_background_work(&self) -> Result<()> { + panic!() + } + fn exists(path: &str) -> bool { panic!() } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 3477226ae76..8d5bb3d43ef 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -274,10 +274,19 @@ impl MiscExt for RocksEngine { } fn pause_background_work(&self) -> Result<()> { + // This will make manual compaction return error instead of waiting. In practice + // we might want to identify this case by parsing error message. + self.as_inner().disable_manual_compaction(); self.as_inner().pause_bg_work(); Ok(()) } + fn continue_background_work(&self) -> Result<()> { + self.as_inner().enable_manual_compaction(); + self.as_inner().continue_bg_work(); + Ok(()) + } + fn exists(path: &str) -> bool { crate::util::db_exist(path) } diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 2d89929a4b2..1b0dbfbddb6 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -129,8 +129,8 @@ pub mod kv { } fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { - let tombstone_path = path.join(TOMBSTONE_SUFFIX); - std::fs::remove_dir_all(&tombstone_path)?; + let tombstone_path = path.with_extension(TOMBSTONE_SUFFIX); + let _ = std::fs::remove_dir_all(&tombstone_path); std::fs::rename(path, &tombstone_path)?; std::fs::remove_dir_all(tombstone_path)?; Ok(()) diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 5bbcbb2de79..c2d317f529f 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -97,8 +97,12 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { 
fn sync_wal(&self) -> Result<()>; + /// Depending on the implementation, some on-going manual compactions may be + /// aborted. fn pause_background_work(&self) -> Result<()>; + fn continue_background_work(&self) -> Result<()>; + /// Check whether a database exists at a given path fn exists(path: &str) -> bool; diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index d6d19743b1e..dc5f3dad56d 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -9,8 +9,13 @@ use std::{ use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; use kvproto::metapb::Region; -use slog::{debug, error, warn, Logger}; -use tikv_util::worker::{Runnable, RunnableWithTimer}; +use slog::{debug, error, info, warn, Logger}; +use tikv_util::{ + worker::{Runnable, RunnableWithTimer}, + yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, +}; + +const DEFAULT_BACKGROUND_POOL_SIZE: usize = 6; pub enum Task { Trim { @@ -98,6 +103,10 @@ pub struct Runner { // region_id -> [(tablet_path, wait_for_persisted)]. waiting_destroy_tasks: HashMap>, pending_destroy_tasks: Vec, + + // An independent pool to run tasks that are time-consuming but doesn't take CPU resources, + // such as waiting for RocksDB compaction. 
+ background_pool: FuturePool, } impl Runner { @@ -107,27 +116,72 @@ impl Runner { logger, waiting_destroy_tasks: HashMap::default(), pending_destroy_tasks: Vec::new(), + background_pool: YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix("tablet-gc-bg") + .thread_count( + 0, + DEFAULT_BACKGROUND_POOL_SIZE, + DEFAULT_BACKGROUND_POOL_SIZE, + ) + .build_future_pool(), } } - fn trim( - tablet: &EK, - start_key: &[u8], - end_key: &[u8], - cb: Box, - ) -> engine_traits::Result<()> { - let start_key = keys::data_key(start_key); - let end_key = keys::data_end_key(end_key); + fn trim(&self, tablet: EK, start: Box<[u8]>, end: Box<[u8]>, cb: Box) { + let start_key = keys::data_key(&start); + let end_key = keys::data_end_key(&end); let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); - tablet.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2])?; - // TODO: Avoid this after compaction filter is ready. - tablet.delete_ranges_cfs(DeleteStrategy::DeleteByRange, &[range1, range2])?; - for r in [range1, range2] { - tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; + // TODO: Avoid `DeleteByRange` after compaction filter is ready. 
+ if let Err(e) = tablet + .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2]) + .and_then(|_| { + tablet.delete_ranges_cfs(DeleteStrategy::DeleteByRange, &[range1, range2]) + }) + { + error!( + self.logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + return; } - cb(); - Ok(()) + let logger = self.logger.clone(); + self.background_pool + .spawn(async move { + let range1 = Range::new(&[], &start_key); + let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + for r in [range1, range2] { + if let Err(e) = + tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1) + { + if e.to_string().contains("Manual compaction paused") { + info!( + logger, + "tablet manual compaction is paused, skip trim"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } else { + error!( + logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } + return; + } + } + // drop before callback. 
+ drop(tablet);
+ cb();
+ })
+ .unwrap();
}
fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) {
@@ -204,17 +258,7 @@ where
start_key,
end_key,
cb,
- } => {
- if let Err(e) = Self::trim(&tablet, &start_key, &end_key, cb) {
- error!(
- self.logger,
- "failed to trim tablet";
- "start_key" => log_wrappers::Value::key(&start_key),
- "end_key" => log_wrappers::Value::key(&end_key),
- "err" => %e,
- );
- }
- }
+ } => self.trim(tablet, start_key, end_key, cb),
Task::PrepareDestroy {
region_id,
tablet,
@@ -241,3 +285,67 @@ where
Duration::from_secs(10)
}
}
+
+#[cfg(test)]
+mod tests {
+ use engine_test::{
+ ctor::{CfOptions, DbOptions},
+ kv::TestTabletFactory,
+ };
+ use engine_traits::{MiscExt, TabletContext, TabletRegistry};
+ use tempfile::Builder;
+
+ use super::*;
+
+ #[test]
+ fn test_race_between_destroy_and_trim() {
+ let dir = Builder::new()
+ .prefix("test_race_between_destroy_and_trim")
+ .tempdir()
+ .unwrap();
+ let factory = Box::new(TestTabletFactory::new(
+ DbOptions::default(),
+ vec![("default", CfOptions::default())],
+ ));
+ let registry = TabletRegistry::new(factory, dir.path()).unwrap();
+ let logger = slog_global::borrow_global().new(slog::o!());
+ let mut runner = Runner::new(registry.clone(), logger);
+
+ let mut region = Region::default();
+ let rid = 1;
+ region.set_id(rid);
+ region.set_start_key(b"a".to_vec());
+ region.set_end_key(b"b".to_vec());
+ let tablet = registry
+ .load(TabletContext::new(&region, Some(1)), true)
+ .unwrap()
+ .latest()
+ .unwrap()
+ .clone();
+ runner.run(Task::prepare_destroy(tablet.clone(), rid, 10));
+ let (tx, rx) = std::sync::mpsc::channel();
+ runner.run(Task::trim(tablet, &region, move || tx.send(()).unwrap()));
+ rx.recv().unwrap();
+
+ let rid = 2;
+ region.set_id(rid);
+ region.set_start_key(b"c".to_vec());
+ region.set_end_key(b"d".to_vec());
+ let tablet = registry
+ .load(TabletContext::new(&region, Some(1)), true)
+ .unwrap()
+ .latest()
+ .unwrap()
+ .clone();
+
registry.remove(rid);
+ runner.run(Task::prepare_destroy(tablet.clone(), rid, 10));
+ runner.run(Task::destroy(rid, 100));
+ let path = PathBuf::from(tablet.path());
+ assert!(path.exists());
+ let (tx, rx) = std::sync::mpsc::channel();
+ runner.run(Task::trim(tablet, &region, move || tx.send(()).unwrap()));
+ rx.recv().unwrap();
+ runner.on_timeout();
+ assert!(!path.exists());
+ }
+}
From d083fc92d5228fb4bfc74a98e5ae7982d8fef22a Mon Sep 17 00:00:00 2001
From: Yilin Chen
Date: Tue, 7 Feb 2023 13:29:57 +0800
Subject: [PATCH 495/676] copr: reject request when estimated waiting duration exceeds threshold (#14077)
ref tikv/tikv#14151
Add a read pool time slice inspector to predict the waiting time for read requests.
Use the estimated duration to reject requests which have busy_threshold.
Signed-off-by: Yilin Chen
Co-authored-by: Ti Chi Robot
---
Cargo.lock | 2 +-
components/server/src/server.rs | 13 +-
components/server/src/server2.rs | 13 +-
src/coprocessor/endpoint.rs | 17 ++-
src/read_pool.rs | 175 ++++++++++++++++++++++++-
src/storage/mod.rs | 150 ++++++++++-----------
tests/failpoints/cases/test_storage.rs | 17 ++-
7 files changed, 295 insertions(+), 92 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 473058ffd9d..1b3c1452ebf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2732,7 +2732,7 @@ dependencies = [
[[package]]
name = "kvproto"
version = "0.0.2"
-source = "git+https://github.com/pingcap/kvproto.git#eccad3776d7b076da68d6c51fb7506b8562b9802"
+source = "git+https://github.com/pingcap/kvproto.git#0561adc3754362675cc08b5203d8b6444e645395"
dependencies = [
"futures 0.3.15",
"grpcio",
diff --git a/components/server/src/server.rs b/components/server/src/server.rs
index 4fe397e9eb5..be516a84ae0 100644
--- a/components/server/src/server.rs
+++ b/components/server/src/server.rs
@@ -92,7 +92,9 @@ use tikv::{
coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR},
coprocessor_v2,
import::{ImportSstService, SstImporter},
- read_pool::{build_yatp_read_pool,
ReadPool, ReadPoolConfigManager}, + read_pool::{ + build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, + }, server::{ config::{Config as ServerConfig, ServerConfigManager}, gc_worker::{AutoGcConfig, GcWorker}, @@ -770,6 +772,15 @@ where } else { None }; + if let Some(unified_read_pool) = &unified_read_pool { + let handle = unified_read_pool.handle(); + self.background_worker.spawn_interval_task( + UPDATE_EWMA_TIME_SLICE_INTERVAL, + move || { + handle.update_ewma_time_slice(); + }, + ); + } // The `DebugService` and `DiagnosticsService` will share the same thread pool let props = tikv_util::thread_group::current_properties(); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 20d79e7cce5..0797b391d87 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -73,7 +73,9 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, + read_pool::{ + build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, + }, server::{ config::{Config as ServerConfig, ServerConfigManager}, gc_worker::{AutoGcConfig, GcWorker}, @@ -666,6 +668,15 @@ where } else { None }; + if let Some(unified_read_pool) = &unified_read_pool { + let handle = unified_read_pool.handle(); + self.background_worker.spawn_interval_task( + UPDATE_EWMA_TIME_SLICE_INTERVAL, + move || { + handle.update_ewma_time_slice(); + }, + ); + } // The `DebugService` and `DiagnosticsService` will share the same thread pool let props = tikv_util::thread_group::current_properties(); diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 6ac1bebc541..3ba320149ac 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -11,7 +11,7 @@ use api_version::{dispatch_api_version, 
KvFormat}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; -use futures::{channel::mpsc, prelude::*}; +use futures::{channel::mpsc, future::Either, prelude::*}; use kvproto::{coprocessor as coppb, errorpb, kvrpcpb}; use protobuf::{CodedInputStream, Message}; use resource_metering::{FutureExt, ResourceTagFactory, StreamExt}; @@ -516,6 +516,16 @@ impl Endpoint { mut req: coppb::Request, peer: Option, ) -> impl Future> { + // Check the load of the read pool. If it's too busy, generate and return + // error in the gRPC thread to avoid waiting in the queue of the read pool. + if let Err(busy_err) = self.read_pool.check_busy_threshold(Duration::from_millis( + req.get_context().get_busy_threshold_ms() as u64, + )) { + let mut resp = coppb::Response::default(); + resp.mut_region_error().set_server_is_busy(busy_err); + return Either::Left(async move { resp.into() }); + } + let tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(RequestInfo::new( req.get_context(), RequestType::Unknown, @@ -526,7 +536,7 @@ impl Endpoint { let result_of_future = self .parse_request_and_check_memory_locks(req, peer, false) .map(|(handler_builder, req_ctx)| self.handle_unary_request(req_ctx, handler_builder)); - async move { + let fut = async move { let res = match result_of_future { Err(e) => { let mut res = make_error_response(e); @@ -546,7 +556,8 @@ impl Endpoint { }; GLOBAL_TRACKERS.remove(tracker); res - } + }; + Either::Right(fut) } // process_batch_tasks process the input batched coprocessor tasks if any, diff --git a/src/read_pool.rs b/src/read_pool.rs index ea20b149a3d..2c56e205ef7 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -1,16 +1,21 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + convert::TryFrom, future::Future, - sync::{mpsc::SyncSender, Arc, Mutex}, + sync::{ + atomic::{AtomicU64, Ordering}, + mpsc::SyncSender, + Arc, Mutex, + }, time::Duration, }; use file_system::{set_io_type, IoType}; use futures::{channel::oneshot, future::TryFutureExt}; -use kvproto::kvrpcpb::CommandPri; +use kvproto::{errorpb, kvrpcpb::CommandPri}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; -use prometheus::{IntCounter, IntGauge}; +use prometheus::{Histogram, IntCounter, IntGauge}; use resource_control::{ControlledFuture, ResourceController}; use thiserror::Error; use tikv_util::{ @@ -54,6 +59,7 @@ pub enum ReadPool { max_tasks: usize, pool_size: usize, resource_ctl: Option>, + time_slice_inspector: Arc, }, } @@ -76,6 +82,7 @@ impl ReadPool { max_tasks, pool_size, resource_ctl, + time_slice_inspector, } => ReadPoolHandle::Yatp { remote: pool.remote().clone(), running_tasks: running_tasks.clone(), @@ -83,6 +90,7 @@ impl ReadPool { max_tasks: *max_tasks, pool_size: *pool_size, resource_ctl: resource_ctl.clone(), + time_slice_inspector: time_slice_inspector.clone(), }, } } @@ -102,6 +110,7 @@ pub enum ReadPoolHandle { max_tasks: usize, pool_size: usize, resource_ctl: Option>, + time_slice_inspector: Arc, }, } @@ -251,6 +260,121 @@ impl ReadPoolHandle { } } } + + pub fn get_ewma_time_slice(&self) -> Option { + match self { + ReadPoolHandle::FuturePools { .. } => None, + ReadPoolHandle::Yatp { + time_slice_inspector, + .. + } => Some(time_slice_inspector.get_ewma_time_slice()), + } + } + + pub fn update_ewma_time_slice(&self) { + if let ReadPoolHandle::Yatp { + time_slice_inspector, + .. 
+ } = self
+ {
+ time_slice_inspector.update();
+ }
+ }
+
+ pub fn get_estimated_wait_duration(&self) -> Option<Duration> {
+ self.get_ewma_time_slice()
+ .map(|s| s * (self.get_queue_size_per_worker() as u32))
+ }
+
+ pub fn check_busy_threshold(
+ &self,
+ busy_threshold: Duration,
+ ) -> Result<(), errorpb::ServerIsBusy> {
+ if busy_threshold.is_zero() {
+ return Ok(());
+ }
+ let estimated_wait = match self.get_estimated_wait_duration() {
+ Some(estimated_wait) if estimated_wait > busy_threshold => estimated_wait,
+ _ => return Ok(()),
+ };
+ // TODO: Get applied_index from the raftstore and check memory locks. Then, we
+ // can skip read index in replica read. But now the difficulty is that we don't
+ // have access to the local reader in gRPC threads.
+ let mut busy_err = errorpb::ServerIsBusy::default();
+ busy_err.set_reason("estimated wait time exceeds threshold".to_owned());
+ busy_err.estimated_wait_ms = u32::try_from(estimated_wait.as_millis()).unwrap_or(u32::MAX);
+ Err(busy_err)
+ }
+}
+
+pub const UPDATE_EWMA_TIME_SLICE_INTERVAL: Duration = Duration::from_millis(200);
+
+pub struct TimeSliceInspector {
+ // `atomic_ewma_nanos` is a mirror of `inner.ewma` provided for fast access. It is updated in
+ // the `update` method.
+ atomic_ewma_nanos: AtomicU64, + inner: Mutex, +} + +struct TimeSliceInspectorInner { + time_slice_hist: [Histogram; 3], + ewma: Duration, + + last_sum: Duration, + last_count: u64, +} + +impl TimeSliceInspector { + pub fn new(name: &str) -> Self { + let time_slice_hist = [ + yatp::metrics::TASK_POLL_DURATION.with_label_values(&[name, "0"]), + yatp::metrics::TASK_POLL_DURATION.with_label_values(&[name, "1"]), + yatp::metrics::TASK_POLL_DURATION.with_label_values(&[name, "2"]), + ]; + let inner = TimeSliceInspectorInner { + time_slice_hist, + ewma: Duration::default(), + last_sum: Duration::default(), + last_count: 0, + }; + Self { + atomic_ewma_nanos: AtomicU64::default(), + inner: Mutex::new(inner), + } + } + + pub fn update(&self) { + // new_ewma = WEIGHT * new_val + (1 - WEIGHT) * old_ewma + const WEIGHT: f64 = 0.3; + // If the accumulated time slice is less than 100ms, the EWMA is not updated. + const MIN_TIME_DIFF: Duration = Duration::from_millis(100); + + let mut inner = self.inner.lock().unwrap(); + let mut new_sum = Duration::default(); + let mut new_count = 0; + // Now, we simplify the problem by merging samples from all levels. If we want + // more accurate answer in the future, calculate for each level separately. 
+ for hist in &inner.time_slice_hist { + new_sum += Duration::from_secs_f64(hist.get_sample_sum()); + new_count += hist.get_sample_count(); + } + let time_diff = new_sum - inner.last_sum; + if time_diff < MIN_TIME_DIFF { + return; + } + let new_val = time_diff / ((new_count - inner.last_count) as u32); + let new_ewma = new_val.mul_f64(WEIGHT) + inner.ewma.mul_f64(1.0 - WEIGHT); + inner.ewma = new_ewma; + inner.last_sum = new_sum; + inner.last_count = new_count; + + self.atomic_ewma_nanos + .store(new_ewma.as_nanos() as u64, Ordering::Release); + } + + pub fn get_ewma_time_slice(&self) -> Duration { + Duration::from_nanos(self.atomic_ewma_nanos.load(Ordering::Acquire)) + } } #[derive(Clone)] @@ -273,8 +397,6 @@ impl ReporterTicker { #[cfg(test)] fn get_unified_read_pool_name() -> String { - use std::sync::atomic::{AtomicU64, Ordering}; - static COUNTER: AtomicU64 = AtomicU64::new(0); format!( "unified-read-pool-test-{}", @@ -319,6 +441,7 @@ pub fn build_yatp_read_pool( } else { builder.build_multi_level_pool() }; + let time_slice_inspector = Arc::new(TimeSliceInspector::new(&unified_read_pool_name)); ReadPool::Yatp { pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS @@ -330,6 +453,7 @@ pub fn build_yatp_read_pool( .saturating_mul(config.max_thread_count), pool_size: config.max_thread_count, resource_ctl, + time_slice_inspector, } } @@ -765,4 +889,45 @@ mod tests { _ => panic!("should return full error"), } } + + #[test] + fn test_time_slice_inspector_ewma() { + const MARGIN: f64 = 1e-5; // 10us + + let name = "test_time_slice_inspector_ewma"; + let inspector = TimeSliceInspector::new(name); + let hist = yatp::metrics::TASK_POLL_DURATION.with_label_values(&[name, "0"]); + + // avg: 0.055, prev_ewma: 0 => new_ewma = 0.0165 + for i in 1..=10 { + hist.observe(i as f64 * 0.01); + } + inspector.update(); + let ewma = inspector.get_ewma_time_slice().as_secs_f64(); + assert!((ewma - 0.0165).abs() < MARGIN); + + // avg: 0.0125, prev_ewma: 0.0165 => new_ewma = 0.0153 + for 
i in 5..=20 { + hist.observe(i as f64 * 0.001); + } + inspector.update(); + let ewma = inspector.get_ewma_time_slice().as_secs_f64(); + assert!((ewma - 0.0153).abs() < MARGIN); + + // sum: 55ms, don't update ewma + for i in 1..=10 { + hist.observe(i as f64 * 0.001); + } + inspector.update(); + let ewma = inspector.get_ewma_time_slice().as_secs_f64(); + assert!((ewma - 0.0153).abs() < MARGIN); + + // avg: 0.00786, prev_ewma: 0.0153 => new_ewma = 0.01307 + for i in 5..=15 { + hist.observe(i as f64 * 0.001); + } + inspector.update(); + let ewma = inspector.get_ewma_time_slice().as_secs_f64(); + assert!((ewma - 0.01307).abs() < MARGIN); + } } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 6273bc3d54c..ca35018e01e 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -69,6 +69,7 @@ use std::{ atomic::{self, AtomicBool, AtomicU64, Ordering}, Arc, }, + time::Duration, }; use api_version::{ApiV1, ApiV2, KeyMode, KvFormat, RawValue}; @@ -78,7 +79,7 @@ use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use engine_traits::{ raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, DATA_CFS_LEN, }; -use futures::prelude::*; +use futures::{future::Either, prelude::*}; use kvproto::{ kvrpcpb::{ ApiVersion, ChecksumAlgorithm, CommandPri, Context, GetRequest, IsolationLevel, KeyRange, @@ -605,11 +606,13 @@ impl Storage { ); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); let quota_limiter = self.quota_limiter.clone(); let mut sample = quota_limiter.new_sample(true); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { let stage_scheduled_ts = Instant::now(); tls_collect_query( @@ -663,13 +666,13 @@ impl Storage { false, ); snap_store - .get(&key, &mut statistics) - // map storage::txn::Error -> storage::Error - .map_err(Error::from) - .map(|r| { - 
KV_COMMAND_KEYREAD_HISTOGRAM_STATIC.get(CMD).observe(1_f64); - r - }) + .get(&key, &mut statistics) + // map storage::txn::Error -> storage::Error + .map_err(Error::from) + .map(|r| { + KV_COMMAND_KEYREAD_HISTOGRAM_STATIC.get(CMD).observe(1_f64); + r + }) }); metrics::tls_collect_scan_details(CMD, &statistics); metrics::tls_collect_read_flow( @@ -732,11 +735,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Get values of a set of keys with separate context from a snapshot, @@ -762,6 +761,8 @@ impl Storage { .to_owned(); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; + let busy_threshold = + Duration::from_millis(requests[0].get_context().busy_threshold_ms as u64); // The resource tags of these batched requests are not the same, and it is quite // expensive to distinguish them, so we can find random one of them as a @@ -775,7 +776,8 @@ impl Storage { // Unset the TLS tracker because the future below does not belong to any // specific request clear_tls_tracker_token(); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); KV_COMMAND_KEYREAD_HISTOGRAM_STATIC @@ -921,11 +923,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Get values of a set of keys in a batch from the snapshot. 
@@ -951,9 +949,11 @@ impl Storage { .new_tag_with_key_ranges(&ctx, key_ranges); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); let quota_limiter = self.quota_limiter.clone(); let mut sample = quota_limiter.new_sample(true); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { let stage_scheduled_ts = Instant::now(); let mut key_ranges = vec![]; @@ -1095,12 +1095,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Scan keys in [`start_key`, `end_key`) up to `limit` keys from the @@ -1136,8 +1131,10 @@ impl Storage { ); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { { let end_key = match &end_key { @@ -1273,12 +1270,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } pub fn scan_lock( @@ -1605,8 +1597,10 @@ impl Storage { .resource_tag_factory .new_tag_with_key_ranges(&ctx, vec![(key.clone(), key.clone())]); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_COMMANDS_PRI_COUNTER_VEC_STATIC @@ -1663,12 +1657,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? 
- } + ) } /// Get the values of a set of raw keys, return a list of `Result`s. @@ -1688,6 +1677,7 @@ impl Storage { .to_owned(); let priority_tag = get_priority_tag(priority); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(gets[0].get_context().busy_threshold_ms as u64); // The resource tags of these batched requests are not the same, and it is quite // expensive to distinguish them, so we can find random one of them as a @@ -1699,7 +1689,8 @@ impl Storage { .resource_tag_factory .new_tag_with_key_ranges(rand_ctx, vec![(rand_key.clone(), rand_key)]); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_COMMANDS_PRI_COUNTER_VEC_STATIC @@ -1800,11 +1791,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Get the values of some raw keys in a batch. @@ -1823,8 +1810,10 @@ impl Storage { .resource_tag_factory .new_tag_with_key_ranges(&ctx, key_ranges); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { let mut key_ranges = vec![]; KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); @@ -1898,12 +1887,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? 
- } + ) } async fn check_causal_ts_flushed(ctx: &mut Context, tag: CommandKind) -> Result<()> { @@ -2319,8 +2303,10 @@ impl Storage { let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag(&ctx); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_COMMANDS_PRI_COUNTER_VEC_STATIC @@ -2425,12 +2411,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Scan raw keys in multiple ranges in a batch. @@ -2455,8 +2436,10 @@ impl Storage { .resource_tag_factory .new_tag_with_key_ranges(&ctx, key_ranges); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_COMMANDS_PRI_COUNTER_VEC_STATIC @@ -2583,12 +2566,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } /// Get the value of a raw key. 
@@ -2606,8 +2584,10 @@ impl Storage { .resource_tag_factory .new_tag_with_key_ranges(&ctx, vec![(key.clone(), key.clone())]); let api_version = self.api_version; + let busy_threshold = Duration::from_millis(ctx.busy_threshold_ms as u64); - let res = self.read_pool.spawn_handle( + self.read_pool_spawn_with_busy_check( + busy_threshold, async move { KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_COMMANDS_PRI_COUNTER_VEC_STATIC @@ -2664,12 +2644,7 @@ impl Storage { priority, thread_rng().next_u64(), group_name, - ); - - async move { - res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) - .await? - } + ) } pub fn raw_compare_and_swap_atomic( @@ -2857,6 +2832,31 @@ impl Storage { .await? } } + + fn read_pool_spawn_with_busy_check( + &self, + busy_threshold: Duration, + future: Fut, + priority: CommandPri, + task_id: u64, + group_meta: Vec, + ) -> impl Future> + where + Fut: Future> + Send + 'static, + T: Send + 'static, + { + if let Err(busy_err) = self.read_pool.check_busy_threshold(busy_threshold) { + let mut err = kvproto::errorpb::Error::default(); + err.set_server_is_busy(busy_err); + return Either::Left(future::err(Error::from(ErrorInner::Kv(err.into())))); + } + Either::Right( + self.read_pool + .spawn_handle(future, priority, task_id, group_meta) + .map_err(|_| Error::from(ErrorInner::SchedTooBusy)) + .and_then(|res| future::ready(res)), + ) + } } pub async fn get_raw_key_guard( diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 1a7d44db972..42cda54281e 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -1445,12 +1445,17 @@ fn test_before_propose_deadline() { }), ) .unwrap(); - assert!(matches!( - rx.recv().unwrap(), - Err(StorageError(box StorageErrorInner::Kv(KvError( - box KvErrorInner::Request(_), - )))) - )); + let res = rx.recv().unwrap(); + assert!( + matches!( + res, + Err(StorageError(box StorageErrorInner::Kv(KvError( + box KvErrorInner::Request(_), 
+ )))) + ), + "actual: {:?}", + res + ); } #[test] From 10e93a767162e74c15aa054f2d36939355e9052f Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 7 Feb 2023 13:47:57 +0800 Subject: [PATCH 496/676] raftstore-v2: fix peer destroy not clear in scale-in scene (#14112) close tikv/tikv#14128 1. release tablet in TableCache after peer destroyed 2. release tombstone tablet after applying snapshot Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- components/batch-system/src/router.rs | 2 +- .../operation/command/admin/compact_log.rs | 7 +- components/raftstore-v2/src/operation/life.rs | 1 + .../src/operation/ready/snapshot.rs | 7 + .../tests/integrations/test_conf_change.rs | 247 ++++++++---------- 5 files changed, 131 insertions(+), 133 deletions(-) diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index ef937209531..4238929d1d4 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -337,7 +337,7 @@ where /// Close the mailbox of address. 
pub fn close(&self, addr: u64) { - info!("[region {}] shutdown mailbox", addr); + info!("shutdown mailbox"; "region_id" => addr); unsafe { &mut *self.caches.as_ptr() }.remove(&addr); let mut mailboxes = self.normals.lock().unwrap(); if let Some(mb) = mailboxes.map.remove(&addr) { diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 0f5fd9b392f..8e83387012e 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -275,6 +275,11 @@ impl Peer { old_tablet: EK, new_tablet_index: u64, ) { + info!(self.logger, + "record tombstone tablet"; + "prev_tablet_path" => old_tablet.path(), + "new_tablet_index" => new_tablet_index + ); let compact_log_context = self.compact_log_context_mut(); compact_log_context .tombstone_tablets_wait_index @@ -291,7 +296,7 @@ impl Peer { /// Returns if there's any tombstone being removed. 
#[inline] - fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { + pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { let compact_log_context = self.compact_log_context_mut(); let removed = compact_log_context .tombstone_tablets_wait_index diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index a407f6bc8ef..11969701c74 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -599,6 +599,7 @@ impl Peer { let mut meta = ctx.store_meta.lock().unwrap(); meta.remove_region(region_id); meta.readers.remove(®ion_id); + ctx.tablet_registry.remove(region_id); } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create a diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index adf20bfce37..87a1496be15 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -49,6 +49,7 @@ use crate::{ operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, router::ApplyTask, + worker::tablet_gc, Result, StoreContext, }; @@ -274,6 +275,12 @@ impl Peer { self.post_split_init(ctx, init); } self.schedule_apply_fsm(ctx); + if self.remove_tombstone_tablets(snapshot_index) { + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::destroy(region_id, snapshot_index)); + } } } } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 7fa75a5a281..7ea49c02a6b 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -4,7 +4,10 @@ use std::{self, time::Duration}; use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; 
use futures::executor::block_on; -use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::PeerState}; +use kvproto::{ + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::PeerState, +}; use raft::prelude::ConfChangeType; use raftstore_v2::{ router::{PeerMsg, PeerTick}, @@ -17,58 +20,18 @@ use crate::cluster::{check_skip_wal, Cluster}; #[test] fn test_simple_change() { let mut cluster = Cluster::with_node_count(2, None); - let region_id = 2; - let mut req = cluster.routers[0].new_request_for(2); - let admin_req = req.mut_admin_request(); - admin_req.set_cmd_type(AdminCmdType::ChangePeer); - admin_req - .mut_change_peer() - .set_change_type(ConfChangeType::AddLearnerNode); - let store_id = cluster.node(1).id(); - let new_peer = new_learner_peer(store_id, 10); - admin_req.mut_change_peer().set_peer(new_peer.clone()); - let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); - assert!(!resp.get_header().has_error(), "{:?}", resp); - let epoch = req.get_header().get_region_epoch(); - let new_conf_ver = epoch.get_conf_ver() + 1; - let leader_peer = req.get_header().get_peer().clone(); + let (region_id, peer_id, offset_id) = (2, 10, 1); + + // 1. add learner on store-2 + add_learner(&cluster, offset_id, region_id, peer_id); let meta = cluster.routers[0] - .must_query_debug_info(2, Duration::from_secs(3)) + .must_query_debug_info(region_id, Duration::from_secs(3)) .unwrap(); let match_index = meta.raft_apply.applied_index; - assert_eq!(meta.region_state.epoch.version, epoch.get_version()); - assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); - assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer.clone()]); - // So heartbeat will create a learner. 
- cluster.dispatch(2, vec![]); - let meta = cluster.routers[1] - .must_query_debug_info(2, Duration::from_secs(3)) - .unwrap(); - assert_eq!(meta.raft_status.id, 10, "{:?}", meta); - assert_eq!(meta.region_state.epoch.version, epoch.get_version()); - assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); - assert_eq!( - meta.raft_status.soft_state.leader_id, - req.get_header().get_peer().get_id() - ); - // Trigger the raft tick to replica the log to the learner and execute the - // snapshot task. - cluster.routers[0] - .send(region_id, PeerMsg::Tick(PeerTick::Raft)) - .unwrap(); - cluster.dispatch(region_id, vec![]); - - // write one kv after snapshot + // 2. write one kv after snapshot let (key, val) = (b"key", b"value"); - let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); - let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, key, val); - let (msg, _) = PeerMsg::simple_write(header, put.encode()); - cluster.routers[0].send(region_id, msg).unwrap(); - std::thread::sleep(Duration::from_millis(1000)); - cluster.dispatch(region_id, vec![]); - + write_kv(&cluster, region_id, key, val); let meta = cluster.routers[1] .must_query_debug_info(region_id, Duration::from_secs(3)) .unwrap(); @@ -76,67 +39,29 @@ fn test_simple_change() { // read the new written kv. assert_eq!(match_index, meta.raft_apply.truncated_state.index); assert!(meta.raft_apply.applied_index >= match_index); - let snap = cluster.routers[1].stale_snapshot(2); + let snap = cluster.routers[offset_id].stale_snapshot(region_id); assert_eq!(snap.get_value(key).unwrap().unwrap(), val); + // 3. 
remove peer from store-2 + remove_peer(&cluster, offset_id, region_id, peer_id); - req.mut_header() - .mut_region_epoch() - .set_conf_ver(new_conf_ver); - req.mut_admin_request() - .mut_change_peer() - .set_change_type(ConfChangeType::RemoveNode); - let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); - assert!(!resp.get_header().has_error(), "{:?}", resp); - let epoch = req.get_header().get_region_epoch(); - let new_conf_ver = epoch.get_conf_ver() + 1; - let leader_peer = req.get_header().get_peer().clone(); - let meta = cluster.routers[0] - .must_query_debug_info(2, Duration::from_secs(3)) - .unwrap(); - assert_eq!(meta.region_state.epoch.version, epoch.get_version()); - assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); - assert_eq!(meta.region_state.peers, vec![leader_peer]); - cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); - let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; - let region_state = raft_engine - .get_region_state(region_id, u64::MAX) - .unwrap() - .unwrap(); - assert!( - region_state.get_removed_records().contains(&new_peer), - "{:?}", - region_state - ); + // To avaid that some status doesn't clear after destroying, it can support to + // create peer by many times. + let repeat = 3; + for i in 1..repeat { + add_learner(&cluster, offset_id, region_id, peer_id + i); + write_kv(&cluster, region_id, key, val); + remove_peer(&cluster, offset_id, region_id, peer_id + i); + } - // If adding a peer on the same store, removed_records should be cleaned. 
- req.mut_header() - .mut_region_epoch() - .set_conf_ver(new_conf_ver); - req.mut_admin_request() - .mut_change_peer() - .set_change_type(ConfChangeType::AddLearnerNode); - req.mut_admin_request() - .mut_change_peer() - .mut_peer() - .set_id(11); - let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); - assert!(!resp.get_header().has_error(), "{:?}", resp); - cluster.routers[0].wait_flush(region_id, Duration::from_millis(300)); - let region_state = raft_engine - .get_region_state(region_id, u64::MAX) - .unwrap() - .unwrap(); - assert!( - region_state.get_removed_records().is_empty(), - "{:?}", - region_state - ); + add_learner(&cluster, offset_id, region_id, peer_id + repeat); + write_kv(&cluster, region_id, key, val); + let snap = cluster.routers[offset_id].stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), val); // TODO: check if the peer is removed once life trace is implemented or // snapshot is implemented. - // Check if WAL is skipped for admin command. - let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + let mut cached = cluster.node(0).tablet_registry().get(region_id).unwrap(); check_skip_wal(cached.latest().unwrap().as_inner().path()); } @@ -145,38 +70,12 @@ fn test_simple_change() { #[test] fn test_remove_by_conf_change() { let cluster = Cluster::with_node_count(2, None); - let region_id = 2; - let mut req = cluster.routers[0].new_request_for(2); - let admin_req = req.mut_admin_request(); - admin_req.set_cmd_type(AdminCmdType::ChangePeer); - admin_req - .mut_change_peer() - .set_change_type(ConfChangeType::AddLearnerNode); - let store_id = cluster.node(1).id(); - let new_peer = new_learner_peer(store_id, 10); - admin_req.mut_change_peer().set_peer(new_peer); - let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); - assert!(!resp.get_header().has_error(), "{:?}", resp); - // So heartbeat will create a learner. 
- cluster.dispatch(2, vec![]); - // Trigger the raft tick to replica the log to the learner and execute the - // snapshot task. - cluster.routers[0] - .send(region_id, PeerMsg::Tick(PeerTick::Raft)) - .unwrap(); - cluster.dispatch(region_id, vec![]); - // Wait some time so snapshot can be generated. - std::thread::sleep(Duration::from_millis(100)); - cluster.dispatch(region_id, vec![]); + let (region_id, peer_id, offset_id) = (2, 10, 1); + let mut req = add_learner(&cluster, offset_id, region_id, peer_id); // write one kv to make flow control replicated. let (key, val) = (b"key", b"value"); - let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); - let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, key, val); - let (msg, _) = PeerMsg::simple_write(header, put.encode()); - cluster.routers[0].send(region_id, msg).unwrap(); - cluster.dispatch(region_id, vec![]); + write_kv(&cluster, region_id, key, val); let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; req.mut_header() @@ -214,3 +113,89 @@ fn test_remove_by_conf_change() { assert_eq!(region_state.get_state(), PeerState::Tombstone); assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); } + +fn add_learner( + cluster: &Cluster, + offset_id: usize, + region_id: u64, + peer_id: u64, +) -> RaftCmdRequest { + let store_id = cluster.node(offset_id).id(); + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let new_peer = new_learner_peer(store_id, peer_id); + admin_req.mut_change_peer().set_peer(new_peer.clone()); + let resp = cluster.routers[0] + .admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 
1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = cluster.routers[0] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); + + // heartbeat will create a learner. + cluster.dispatch(region_id, vec![]); + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + let meta = cluster.routers[offset_id] + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, peer_id, "{:?}", meta); + + // Wait some time so snapshot can be generated. + std::thread::sleep(Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + req +} + +fn write_kv(cluster: &Cluster, region_id: u64, key: &[u8], val: &[u8]) { + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + std::thread::sleep(Duration::from_millis(1000)); + cluster.dispatch(region_id, vec![]); +} + +fn remove_peer(cluster: &Cluster, offset_id: usize, region_id: u64, peer_id: u64) { + let store_id = cluster.node(offset_id).id(); + let mut req = cluster.routers[0].new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + admin_req + .mut_change_peer() + .set_peer(new_learner_peer(store_id, peer_id)); + let resp = cluster.routers[0] + .admin_command(region_id, req.clone()) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + cluster.routers[offset_id] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + 
cluster.dispatch(region_id, vec![]); + std::thread::sleep(Duration::from_millis(100)); + + let raft_engine = &cluster.node(offset_id).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} From a45c7de539223d5c7aaa00aabd6b5f98dd471cbe Mon Sep 17 00:00:00 2001 From: Zwb Date: Tue, 7 Feb 2023 14:31:57 +0800 Subject: [PATCH 497/676] apply: ignore compute and verify hash when it's a witness (#14150) close tikv/tikv#14142 apply: ignore compute and verify hash when it's a witness Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/apply.rs | 29 ++++++++----- tests/integrations/raftstore/test_witness.rs | 44 +++++++++++++++++++- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 7f4e5497cb9..fba17db7391 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3197,16 +3197,20 @@ where let resp = AdminResponse::default(); Ok(( resp, - ApplyResult::Res(ExecResult::ComputeHash { - region: self.region.clone(), - index: ctx.exec_log_index, - context: req.get_compute_hash().get_context().to_vec(), - // This snapshot may be held for a long time, which may cause too many - // open files in rocksdb. - // TODO: figure out another way to do consistency check without snapshot - // or short life snapshot. - snap: ctx.engine.snapshot(), - }), + if self.peer.is_witness { + ApplyResult::None + } else { + ApplyResult::Res(ExecResult::ComputeHash { + region: self.region.clone(), + index: ctx.exec_log_index, + context: req.get_compute_hash().get_context().to_vec(), + // This snapshot may be held for a long time, which may cause too many + // open files in rocksdb. 
+ // TODO: figure out another way to do consistency check without snapshot + // or short life snapshot. + snap: ctx.engine.snapshot(), + }) + }, )) } @@ -3215,11 +3219,14 @@ where _: &ApplyContext, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { + let resp = AdminResponse::default(); + if self.peer.is_witness { + return Ok((resp, ApplyResult::None)); + } let verify_req = req.get_verify_hash(); let index = verify_req.get_index(); let context = verify_req.get_context().to_vec(); let hash = verify_req.get_hash().to_vec(); - let resp = AdminResponse::default(); Ok(( resp, ApplyResult::Res(ExecResult::VerifyHash { diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index 907c49c03af..d5a9992bc3a 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -12,7 +12,7 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use test_raftstore::*; -use tikv_util::store::find_peer; +use tikv_util::{config::ReadableDuration, store::find_peer}; // Test the case that region split or merge with witness peer #[test] @@ -556,3 +556,45 @@ fn test_witness_leader_down() { ); assert_eq!(cluster.must_get(b"k9"), Some(b"v9".to_vec())); } + +// Test the case that witness ignore consistency check as it has no data +#[test] +fn test_witness_ignore_consistency_check() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_election_timeout_ticks = 50; + // disable compact log to make test more stable. 
+ cluster.cfg.raft_store.raft_log_gc_threshold = 1000; + cluster.cfg.raft_store.consistency_check_interval = ReadableDuration::secs(1); + cluster.run(); + + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // make sure the peer_on_store3 has completed applied to witness + std::thread::sleep(Duration::from_millis(200)); + + for i in 0..300 { + cluster.must_put( + format!("k{:06}", i).as_bytes(), + format!("k{:06}", i).as_bytes(), + ); + std::thread::sleep(Duration::from_millis(10)); + } +} From adff03cab87e78e2f7f542b1e2875254c7706f38 Mon Sep 17 00:00:00 2001 From: fengou1 <85682690+fengou1@users.noreply.github.com> Date: Tue, 7 Feb 2023 17:25:57 +0800 Subject: [PATCH 498/676] fix: ebs volume snapshot support tikv node equipped with 2 cpu or less configuration (#14153) close tikv/tikv#14017 Signed-off-by: fengou1 Co-authored-by: Ti Chi Robot --- components/snap_recovery/src/init_cluster.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index d3a2ebade73..4e72a19d6a6 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error as StdError, result, sync::Arc, thread, time::Duration}; +use std::{cmp, error::Error as StdError, i32, result, sync::Arc, thread, time::Duration}; use encryption_export::data_key_manager_from_config; use engine_rocks::{util::new_engine_opt, RocksEngine}; @@ -14,7 +14,10 @@ use tikv::{ config::TikvConfig, server::{config::Config as ServerConfig, KvEngineFactoryBuilder}, }; -use tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack}; +use tikv_util::{ + config::{ReadableDuration, ReadableSize, VersionTrack}, + sys::SysQuota, +}; const CLUSTER_BOOTSTRAPPED_MAX_RETRY: u64 = 60; const CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); @@ -88,7 +91,9 @@ pub fn enter_snap_recovery_mode(config: &mut TikvConfig) { config.rocksdb.lockcf.disable_auto_compactions = true; config.rocksdb.raftcf.disable_auto_compactions = true; - config.rocksdb.max_background_jobs = 32; + // for cpu = 1, take a reasonable value min[32, maxValue]. + let limit = (SysQuota::cpu_cores_quota() * 10.0) as i32; + config.rocksdb.max_background_jobs = cmp::min(32, limit); // disable resolve ts during the recovery config.resolved_ts.enable = false; From db50ce6ad2f07854ae323a11e49d3d15a4e43b6b Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Tue, 7 Feb 2023 02:39:57 -0800 Subject: [PATCH 499/676] Increase read pool limit (#13766) close tikv/tikv#13690 * Read pool size limit is removed. It was previously hard-coded to be the total number of cores. Limiting the number of threads to be less than the number of cores reduces the chance of context switch, but also makes TiKV prone to starvation problem (since read pool is currently FIFO), even if all the queries require no IO or IO is async (imagining there are number_of_cores clients sending big queries, and another numbers of clients sending small queries concurrently to a single TiKV server, and all of them require no IO). 
Thread starvation causes high tail latency which is even worse than context switching. According to the feature requester, increasing the number of threads significantly improved the tail latency in their environment. Thus, we should remove the limit, and leave it to the users. Signed-off-by: Yang Zhang --- src/config/mod.rs | 8 ++------ src/read_pool.rs | 7 +++++-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 7539fc13c63..4188d8409e3 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1960,7 +1960,7 @@ impl UnifiedReadPoolConfig { } let limit = cmp::max( UNIFIED_READPOOL_MIN_CONCURRENCY, - SysQuota::cpu_cores_quota() as usize, + SysQuota::cpu_cores_quota() as usize * 10, // at most 10 threads per core ); if self.max_thread_count > limit { return Err(format!( @@ -2050,11 +2050,7 @@ mod unified_read_pool_tests { }; invalid_cfg.validate().unwrap_err(); let invalid_cfg = UnifiedReadPoolConfig { - min_thread_count: 1, - max_thread_count: cmp::max( - UNIFIED_READPOOL_MIN_CONCURRENCY, - SysQuota::cpu_cores_quota() as usize, - ) + 1, + max_thread_count: SysQuota::cpu_cores_quota() as usize * 10 + 1, ..cfg }; invalid_cfg.validate().unwrap_err(); diff --git a/src/read_pool.rs b/src/read_pool.rs index 2c56e205ef7..8ef2c4a9b25 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -424,8 +424,11 @@ pub fn build_yatp_read_pool( config.min_thread_count, config.max_thread_count, std::cmp::max( - UNIFIED_READPOOL_MIN_CONCURRENCY, - SysQuota::cpu_cores_quota() as usize, + std::cmp::max( + UNIFIED_READPOOL_MIN_CONCURRENCY, + SysQuota::cpu_cores_quota() as usize, + ), + config.max_thread_count, ), ) .after_start(move || { From 1df793c27270157f1e479de67a1a3d4badba646f Mon Sep 17 00:00:00 2001 From: Shenghui Wu <793703860@qq.com> Date: Tue, 7 Feb 2023 19:37:57 +0800 Subject: [PATCH 500/676] copr: fix error when cast const Enum to any type (#14149) close tikv/tikv#14156, close pingcap/tidb#40341 copr: fix error when 
cast const Enum to any type Co-authored-by: Ti Chi Robot --- components/tidb_query_expr/src/types/expr_builder.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/tidb_query_expr/src/types/expr_builder.rs b/components/tidb_query_expr/src/types/expr_builder.rs index 5311a2c03d9..6ccfd35631e 100644 --- a/components/tidb_query_expr/src/types/expr_builder.rs +++ b/components/tidb_query_expr/src/types/expr_builder.rs @@ -68,7 +68,8 @@ impl RpnExpressionBuilder { | ExprType::MysqlTime | ExprType::MysqlDuration | ExprType::MysqlDecimal - | ExprType::MysqlJson => Ok(true), + | ExprType::MysqlJson + | ExprType::MysqlEnum => Ok(true), ExprType::ScalarFunc => Ok(false), ExprType::ColumnRef => Ok(false), _ => Err(other_err!("Unsupported expression type {:?}", c.get_tp())), From 98ac5d2ad17c76d13a0f7e34f0bd32507fc2b8dc Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Tue, 7 Feb 2023 19:23:59 -0800 Subject: [PATCH 501/676] Support backup replica read (#13975) close tikv/tikv#14060 Support replica read for backup Signed-off-by: Yang Zhang --- components/backup/src/endpoint.rs | 219 +++++++++++++++++++++++++----- 1 file changed, 185 insertions(+), 34 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 0469ffa30a7..896020cf51a 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -71,6 +71,7 @@ struct Request { compression_type: CompressionType, compression_level: i32, cipher: CipherInfo, + replica_read: bool, } /// Backup Task. 
@@ -131,6 +132,7 @@ impl Task { cf, compression_type: req.get_compression_type(), compression_level: req.get_compression_level(), + replica_read: req.get_replica_read(), cipher: req.cipher_info.unwrap_or_else(|| { let mut cipher = CipherInfo::default(); cipher.set_cipher_type(EncryptionMethod::Plaintext); @@ -153,9 +155,10 @@ pub struct BackupRange { start_key: Option, end_key: Option, region: Region, - leader: Peer, + peer: Peer, codec: KeyValueCodec, cf: CfName, + uses_replica_read: bool, } /// The generic saveable writer. for generic `InMemBackupFiles`. @@ -304,35 +307,45 @@ impl BackupRange { let mut ctx = Context::default(); ctx.set_region_id(self.region.get_id()); ctx.set_region_epoch(self.region.get_region_epoch().to_owned()); - ctx.set_peer(self.leader.clone()); - - // Update max_ts and check the in-memory lock table before getting the snapshot - concurrency_manager.update_max_ts(backup_ts); - concurrency_manager - .read_range_check( - self.start_key.as_ref(), - self.end_key.as_ref(), - |key, lock| { - Lock::check_ts_conflict( - Cow::Borrowed(lock), - key, - backup_ts, - &Default::default(), - IsolationLevel::Si, - ) - }, - ) - .map_err(MvccError::from) - .map_err(TxnError::from)?; + ctx.set_peer(self.peer.clone()); + ctx.set_replica_read(self.uses_replica_read); + ctx.set_isolation_level(IsolationLevel::Si); - // Currently backup always happens on the leader, so we don't need - // to set key ranges and start ts to check. 
- assert!(!ctx.get_replica_read()); - let snap_ctx = SnapContext { + let mut snap_ctx = SnapContext { pb_ctx: &ctx, allowed_in_flashback: self.region.is_in_flashback, ..Default::default() }; + if self.uses_replica_read { + snap_ctx.start_ts = Some(backup_ts); + let mut key_range = KeyRange::default(); + if let Some(start_key) = self.start_key.as_ref() { + key_range.set_start_key(start_key.clone().into_encoded()); + } + if let Some(end_key) = self.end_key.as_ref() { + key_range.set_end_key(end_key.clone().into_encoded()); + } + snap_ctx.key_ranges = vec![key_range]; + } else { + // Update max_ts and check the in-memory lock table before getting the snapshot + concurrency_manager.update_max_ts(backup_ts); + concurrency_manager + .read_range_check( + self.start_key.as_ref(), + self.end_key.as_ref(), + |key, lock| { + Lock::check_ts_conflict( + Cow::Borrowed(lock), + key, + backup_ts, + &Default::default(), + IsolationLevel::Si, + ) + }, + ) + .map_err(MvccError::from) + .map_err(TxnError::from)?; + } let start_snapshot = Instant::now(); let snapshot = match engine.snapshot(snap_ctx) { @@ -540,7 +553,8 @@ impl BackupRange { let mut ctx = Context::default(); ctx.set_region_id(self.region.get_id()); ctx.set_region_epoch(self.region.get_region_epoch().to_owned()); - ctx.set_peer(self.leader.clone()); + ctx.set_peer(self.peer.clone()); + let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() @@ -739,7 +753,7 @@ impl Progress { /// Forward the progress by `ranges` BackupRanges /// /// The size of the returned BackupRanges should <= `ranges` - fn forward(&mut self, limit: usize) -> Vec { + fn forward(&mut self, limit: usize, replica_read: bool) -> Vec { if self.finished { return Vec::new(); } @@ -769,18 +783,20 @@ impl Progress { break; } } - if info.role == StateRole::Leader { + let peer = find_peer(region, store_id).unwrap().to_owned(); + // Raft peer role has to match the replica read flag. 
+ if replica_read || info.role == StateRole::Leader { let ekey = get_min_end_key(end_key.as_ref(), region); let skey = get_max_start_key(start_key.as_ref(), region); assert!(!(skey == ekey && ekey.is_some()), "{:?} {:?}", skey, ekey); - let leader = find_peer(region, store_id).unwrap().to_owned(); let backup_range = BackupRange { start_key: skey, end_key: ekey, region: region.clone(), - leader, + peer, codec, cf: cf_name, + uses_replica_read: info.role != StateRole::Leader, }; tx.send(backup_range).unwrap(); count += 1; @@ -907,7 +923,7 @@ impl Endpoint { // (See https://tokio.rs/tokio/tutorial/shared-state) // Use &mut and mark the type for making rust-analyzer happy. let progress: &mut Progress<_> = &mut prs.lock().unwrap(); - let batch = progress.forward(batch_size); + let batch = progress.forward(batch_size, request.replica_read); if batch.is_empty() { return; } @@ -1080,7 +1096,6 @@ impl Endpoint { let backend = Arc::::from(backend); let concurrency = self.config_manager.0.read().unwrap().num_threads; self.pool.borrow_mut().adjust_with(concurrency); - // make the buffer small enough to implement back pressure. 
let (tx, rx) = async_channel::bounded(1); for _ in 0..concurrency { self.spawn_backup_worker( @@ -1307,6 +1322,38 @@ pub mod tests { map.create_region(r, StateRole::Leader); } } + pub fn add_region( + &self, + id: u64, + mut start_key: Vec, + mut end_key: Vec, + peer_role: metapb::PeerRole, + state_role: StateRole, + ) { + let mut region = metapb::Region::default(); + region.set_id(id); + if !start_key.is_empty() { + if self.need_encode_key { + start_key = Key::from_raw(&start_key).into_encoded(); + } else { + start_key = Key::from_encoded(start_key).into_encoded(); + } + } + if !end_key.is_empty() { + if self.need_encode_key { + end_key = Key::from_raw(&end_key).into_encoded(); + } else { + end_key = Key::from_encoded(end_key).into_encoded(); + } + } + region.set_start_key(start_key); + region.set_end_key(end_key); + let mut new_peer = new_peer(1, 1); + new_peer.set_role(peer_role); + region.mut_peers().push(new_peer); + let mut map = self.regions.lock().unwrap(); + map.create_region(region, state_role); + } fn canecl_on_seek(&mut self, cancel: Arc) { self.cancel = Some(cancel); } @@ -1456,7 +1503,7 @@ pub mod tests { let mut ranges = Vec::with_capacity(expect.len()); while ranges.len() != expect.len() { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n); + let mut r = prs.forward(n, false); // The returned backup ranges should <= n assert!(r.len() <= n); @@ -1508,6 +1555,7 @@ pub mod tests { compression_type: CompressionType::Unknown, compression_level: 0, cipher: CipherInfo::default(), + replica_read: false, }, resp: tx, }; @@ -1563,6 +1611,108 @@ pub mod tests { } } + #[test] + fn test_backup_replica_read() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.add_region( + 1, + b"".to_vec(), + b"1".to_vec(), + metapb::PeerRole::Voter, + StateRole::Leader, + ); + endpoint.region_info.add_region( + 2, + b"1".to_vec(), + b"2".to_vec(), + metapb::PeerRole::Voter, + StateRole::Follower, + ); + endpoint.region_info.add_region( + 3, + 
b"2".to_vec(), + b"3".to_vec(), + metapb::PeerRole::Learner, + StateRole::Follower, + ); + + let tmp = TempDir::new().unwrap(); + let backend = make_local_backend(tmp.path()); + + let (tx, rx) = unbounded(); + let mut ranges = vec![]; + let key_range = KeyRange { + start_key: b"".to_vec(), + end_key: b"3".to_vec(), + ..Default::default() + }; + ranges.push(key_range); + let read_leader_task = Task { + request: Request { + start_key: b"1".to_vec(), + end_key: b"2".to_vec(), + sub_ranges: ranges.clone(), + start_ts: 1.into(), + end_ts: 1.into(), + backend: backend.clone(), + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + replica_read: false, + }, + resp: tx, + }; + endpoint.handle_backup_task(read_leader_task); + let resps: Vec<_> = block_on(rx.collect()); + assert_eq!(resps.len(), 1); + for a in &resps { + assert_eq!(a.get_start_key(), b""); + assert_eq!(a.get_end_key(), b"1"); + } + + let (tx, rx) = unbounded(); + let replica_read_task = Task { + request: Request { + start_key: b"".to_vec(), + end_key: b"3".to_vec(), + sub_ranges: ranges.clone(), + start_ts: 1.into(), + end_ts: 1.into(), + backend, + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + replica_read: true, + }, + resp: tx, + }; + endpoint.handle_backup_task(replica_read_task); + let resps: Vec<_> = block_on(rx.collect()); + let expected: Vec<(&[u8], &[u8])> = vec![(b"", b"1"), (b"1", b"2"), (b"2", b"3")]; + assert_eq!(resps.len(), 3); + for a in &resps { + assert!( + expected + .iter() + .any(|b| { a.get_start_key() == b.0 && a.get_end_key() == b.1 }), + "{:?} {:?}", + resps, + expected + ); + } + } + 
#[test] fn test_seek_ranges() { let (_tmp, endpoint) = new_endpoint(); @@ -1594,7 +1744,7 @@ pub mod tests { let mut ranges = Vec::with_capacity(expect.len()); while ranges.len() != expect.len() { let n = (rand::random::() % 3) + 1; - let mut r = prs.forward(n); + let mut r = prs.forward(n, false); // The returned backup ranges should <= n assert!(r.len() <= n); @@ -1656,6 +1806,7 @@ pub mod tests { compression_type: CompressionType::Unknown, compression_level: 0, cipher: CipherInfo::default(), + replica_read: false, }, resp: tx, }; From 1d97f4d56b11195bc14ceb82878fd955adc0afb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 8 Feb 2023 11:37:59 +0800 Subject: [PATCH 502/676] log-backup: report when watch canceled (#14154) close tikv/tikv#14159 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 2 + components/backup-stream/src/errors.rs | 18 ++++++++- .../backup-stream/src/metadata/store/etcd.rs | 39 +++++++++++++------ 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index ff380551b90..dc053feff33 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -354,6 +354,7 @@ where continue; } }; + info!("start watching the task changes."; "from_rev" => %revision_new); loop { if let Some(event) = watcher.stream.next().await { @@ -403,6 +404,7 @@ where continue; } }; + info!("start watching the pausing events."; "from_rev" => %revision_new); loop { if let Some(event) = watcher.stream.next().await { diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index a3f76e0255f..2fecf0ac514 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -22,7 +22,7 @@ pub enum Error { #[error("gRPC meet error {0}")] Grpc(#[from] GrpcError), 
#[error("Etcd meet error {0}")] - Etcd(#[from] EtcdError), + Etcd(#[from] EtcdErrorExt), #[error("Protobuf meet error {0}")] Protobuf(#[from] ProtobufError), #[error("No such task {task_name:?}")] @@ -52,6 +52,22 @@ pub enum Error { Other(#[from] Box), } +impl From for Error { + fn from(value: EtcdError) -> Self { + Self::Etcd(value.into()) + } +} + +#[derive(ThisError, Debug)] +pub enum EtcdErrorExt { + #[error("{0}")] + Normal(#[from] EtcdError), + #[error("the watch canceled")] + WatchCanceled, + #[error("the required revision has been compacted, current is {current}")] + RevisionCompacted { current: i64 }, +} + impl ErrorCodeExt for Error { fn error_code(&self) -> error_code::ErrorCode { use error_code::backup_stream::*; diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs index 556661700f9..e52cc4f92d9 100644 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ b/components/backup-stream/src/metadata/store/etcd.rs @@ -23,7 +23,7 @@ use super::{ TransactionOp, }; use crate::{ - errors::Result, + errors::{Error, EtcdErrorExt, Result}, metadata::{ keys::{KeyValue, MetaKey}, metrics::METADATA_KEY_OPERATION, @@ -113,17 +113,32 @@ impl MetaStore for EtcdStore { |events| -> Pin> + Send>> { match events { Err(err) => Box::pin(tokio_stream::once(Err(err.into()))), - Ok(events) => Box::pin(tokio_stream::iter( - // TODO: remove the copy here via access the protobuf field directly. 
- #[allow(clippy::unnecessary_to_owned)] - events.events().to_owned().into_iter().filter_map(|event| { - let kv = event.kv()?; - Some(Ok(KvEvent { - kind: event.event_type().into(), - pair: kv.clone().into(), - })) - }), - )), + Ok(events) => { + if events.compact_revision() > 0 && events.canceled() { + return Box::pin(tokio_stream::once(Err(Error::Etcd( + EtcdErrorExt::RevisionCompacted { + current: events.compact_revision(), + }, + )))); + } + if events.canceled() { + return Box::pin(tokio_stream::once(Err(Error::Etcd( + EtcdErrorExt::WatchCanceled, + )))); + } + Box::pin(tokio_stream::iter( + // TODO: remove the copy here via access the protobuf field + // directly. + #[allow(clippy::unnecessary_to_owned)] + events.events().to_owned().into_iter().filter_map(|event| { + let kv = event.kv()?; + Some(Ok(KvEvent { + kind: event.event_type().into(), + pair: kv.clone().into(), + })) + }), + )) + } } }, )), From 5a8d477fdbd713faf1fc22dbb3eadb9167e388d9 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 8 Feb 2023 14:27:59 +0800 Subject: [PATCH 503/676] engine_traits: allow chaos flush notification (#14160) close tikv/tikv#14113 `OnFlushComplete` can be called out of order. What we can assume is when a seqno finishes flush, all SSTs have smaller seqno must also finish flush. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_rocks/src/event_listener.rs | 20 +++++-- components/engine_rocks/src/lib.rs | 1 + components/engine_traits/src/cf_defs.rs | 5 ++ components/engine_traits/src/flush.rs | 60 ++++++++++++++----- .../src/operation/command/write/mod.rs | 7 +-- components/raftstore-v2/src/operation/mod.rs | 3 +- .../src/operation/ready/apply_trace.rs | 15 ++--- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- 8 files changed, 77 insertions(+), 36 deletions(-) diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 3bbf03cb77f..23ff7cf5f50 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -194,8 +194,20 @@ impl rocksdb::EventListener for RocksPersistenceListener { } fn on_flush_completed(&self, job: &FlushJobInfo) { + let num = match job + .file_path() + .file_prefix() + .and_then(|n| n.to_str()) + .map(|n| n.parse()) + { + Some(Ok(n)) => n, + _ => { + slog_global::error!("failed to parse file number"; "path" => job.file_path().display()); + 0 + } + }; self.0 - .on_flush_completed(job.cf_name(), job.largest_seqno()); + .on_flush_completed(job.cf_name(), job.largest_seqno(), num); } } @@ -207,7 +219,7 @@ mod tests { }; use engine_traits::{ - FlushProgress, FlushState, MiscExt, StateStorage, SyncMutable, CF_DEFAULT, DATA_CFS, + ApplyProgress, FlushState, MiscExt, StateStorage, SyncMutable, CF_DEFAULT, DATA_CFS, }; use tempfile::Builder; @@ -221,7 +233,7 @@ mod tests { assert_eq!(filename, "/000398.sst"); } - type Record = (u64, u64, FlushProgress); + type Record = (u64, u64, ApplyProgress); #[derive(Default)] struct MemStorage { @@ -229,7 +241,7 @@ mod tests { } impl StateStorage for MemStorage { - fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { self.records .lock() .unwrap() 
diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 94a4c23a3c4..b5561b3de42 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -18,6 +18,7 @@ #![cfg_attr(test, feature(test))] #![feature(let_chains)] #![feature(option_get_or_insert_default)] +#![feature(path_file_prefix)] #[allow(unused_extern_crates)] extern crate tikv_alloc; diff --git a/components/engine_traits/src/cf_defs.rs b/components/engine_traits/src/cf_defs.rs index 1658f49053c..27546dfc1c1 100644 --- a/components/engine_traits/src/cf_defs.rs +++ b/components/engine_traits/src/cf_defs.rs @@ -11,6 +11,11 @@ pub const ALL_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE, CF_RAFT]; pub const DATA_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; pub const DATA_CFS_LEN: usize = DATA_CFS.len(); +pub fn data_cf_offset(cf: &str) -> usize { + let cf = if cf.is_empty() { CF_DEFAULT } else { cf }; + DATA_CFS.iter().position(|c| *c == cf).expect(cf) +} + pub fn name_to_cf(name: &str) -> Option { if name.is_empty() { return Some(CF_DEFAULT); diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 8300348da8c..d35233bc310 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -20,17 +20,20 @@ use std::{ }, }; -use crate::{RaftEngine, RaftLogBatch}; +use slog_global::info; +use tikv_util::set_panic_mark; + +use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; #[derive(Debug)] -pub struct FlushProgress { +pub struct ApplyProgress { cf: String, apply_index: u64, earliest_seqno: u64, } -impl FlushProgress { - fn merge(&mut self, pr: FlushProgress) { +impl ApplyProgress { + fn merge(&mut self, pr: ApplyProgress) { debug_assert_eq!(self.cf, pr.cf); debug_assert!(self.apply_index <= pr.apply_index); self.apply_index = pr.apply_index; @@ -45,6 +48,12 @@ impl FlushProgress { } } +#[derive(Default, Debug)] +struct FlushProgress { + prs: LinkedList, 
+ last_flushed: [u64; DATA_CFS_LEN], +} + /// A share state between raftstore and underlying engine. /// /// raftstore will update state changes and corresponding apply index, when @@ -77,7 +86,7 @@ impl FlushState { /// A helper trait to avoid exposing `RaftEngine` to `TabletFactory`. pub trait StateStorage: Sync + Send { - fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress); + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress); } /// A flush listener that maps memtable to apply index and persist the relation @@ -86,7 +95,7 @@ pub struct PersistenceListener { region_id: u64, tablet_index: u64, state: Arc, - progress: Mutex>, + progress: Mutex, storage: Arc, } @@ -101,7 +110,7 @@ impl PersistenceListener { region_id, tablet_index, state, - progress: Mutex::new(LinkedList::new()), + progress: Mutex::new(FlushProgress::default()), storage, } } @@ -120,8 +129,17 @@ impl PersistenceListener { // thread writting to the DB and increasing apply index. // Apply index will be set within DB lock, so it's correct even with manual // flush. + let offset = data_cf_offset(&cf); let apply_index = self.state.applied_index.load(Ordering::SeqCst); - self.progress.lock().unwrap().push_back(FlushProgress { + let mut prs = self.progress.lock().unwrap(); + let flushed = prs.last_flushed[offset]; + if flushed > earliest_seqno { + panic!( + "sealed seqno has been flushed {} {} {} <= {}", + cf, apply_index, earliest_seqno, flushed + ); + } + prs.prs.push_back(ApplyProgress { cf, apply_index, earliest_seqno, @@ -131,12 +149,21 @@ impl PersistenceListener { /// Called a memtable finished flushing. /// /// `largest_seqno` should be the largest seqno of the generated file. - pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64) { + pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64, file_no: u64) { // Maybe we should hook the compaction to avoid the file is compacted before // being recorded. 
+ let offset = data_cf_offset(cf); let pr = { let mut prs = self.progress.lock().unwrap(); - let mut cursor = prs.cursor_front_mut(); + let flushed = prs.last_flushed[offset]; + if flushed >= largest_seqno { + // According to facebook/rocksdb#11183, it's possible OnFlushCompleted can be + // called out of order. But it's guaranteed files are installed in order. + info!("flush complete reorder found"; "flushed" => flushed, "largest_seqno" => largest_seqno, "file_no" => file_no, "cf" => cf); + return; + } + prs.last_flushed[offset] = largest_seqno; + let mut cursor = prs.prs.cursor_front_mut(); let mut flushed_pr = None; while let Some(pr) = cursor.current() { if pr.cf != cf { @@ -157,10 +184,13 @@ impl PersistenceListener { } match flushed_pr { Some(pr) => pr, - None => panic!( - "[region_id={}] [tablet_index={}] {} not found in {:?}", - self.region_id, self.tablet_index, cf, prs - ), + None => { + set_panic_mark(); + panic!( + "[region_id={}] [tablet_index={}] {} {} {} not found in {:?}", + self.region_id, self.tablet_index, cf, largest_seqno, file_no, prs + ) + } } }; self.storage @@ -169,7 +199,7 @@ impl PersistenceListener { } impl StateStorage for R { - fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { if pr.apply_index == 0 { return; } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 14011d6fc1b..e958a3ec08f 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{KvEngine, Mutable, RaftEngine, CF_DEFAULT}; +use engine_traits::{data_cf_offset, KvEngine, Mutable, RaftEngine, CF_DEFAULT}; use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ @@ -15,7 +15,6 @@ use tikv_util::slog_panic; use crate::{ batch::StoreContext, - operation::cf_offset, raft::{Apply, Peer}, router::{ApplyTask, CmdResChannel}, }; @@ -129,7 +128,7 @@ impl Peer { impl Apply { #[inline] pub fn apply_put(&mut self, cf: &str, index: u64, key: &[u8], value: &[u8]) -> Result<()> { - let off = cf_offset(cf); + let off = data_cf_offset(cf); if self.should_skip(off, index) { return Ok(()); } @@ -172,7 +171,7 @@ impl Apply { #[inline] pub fn apply_delete(&mut self, cf: &str, index: u64, key: &[u8]) -> Result<()> { - let off = cf_offset(cf); + let off = data_cf_offset(cf); if self.should_skip(off, index) { return Ok(()); } diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 492595851e2..9cdd78dcb4c 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -14,8 +14,7 @@ pub use command::{ }; pub use life::{DestroyProgress, GcPeerContext}; pub use ready::{ - cf_offset, write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, - StateStorage, + write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, StateStorage, }; pub(crate) use self::{ diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 67bbed5aa4b..71e282728f7 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -30,8 +30,8 @@ use std::{cmp, sync::Mutex}; use engine_traits::{ - FlushProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS, CF_DEFAULT, - CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, + data_cf_offset, 
ApplyProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, }; use kvproto::{ metapb::Region, @@ -111,7 +111,7 @@ impl StateStorage { } impl engine_traits::StateStorage for StateStorage { - fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: ApplyProgress) { let cf = to_static_cf(pr.cf()); let flushed_index = pr.applied_index(); self.raft_engine @@ -140,11 +140,6 @@ struct Progress { last_modified: u64, } -pub fn cf_offset(cf: &str) -> usize { - let cf = if cf.is_empty() { CF_DEFAULT } else { cf }; - DATA_CFS.iter().position(|c| *c == cf).expect(cf) -} - /// `ApplyTrace` is used to track the indexes of modifications and flushes. /// /// It has 3 core functionalities: @@ -204,7 +199,7 @@ impl ApplyTrace { } fn on_flush(&mut self, cf: &str, index: u64) { - let off = cf_offset(cf); + let off = data_cf_offset(cf); // Technically it should always be true. 
if index > self.data_cfs[off].flushed { self.data_cfs[off].flushed = index; @@ -212,7 +207,7 @@ impl ApplyTrace { } fn on_modify(&mut self, cf: &str, index: u64) { - let off = cf_offset(cf); + let off = data_cf_offset(cf); self.data_cfs[off].last_modified = index; } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index c77766f6ce5..a88df2245cc 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -47,7 +47,7 @@ use tikv_util::{ }; pub use self::{ - apply_trace::{cf_offset, write_initial_states, ApplyTrace, DataTrace, StateStorage}, + apply_trace::{write_initial_states, ApplyTrace, DataTrace, StateStorage}, async_writer::AsyncWriter, snapshot::{GenSnapTask, SnapState}, }; From 8d63c2714f0f00fc0a3fe8bb468cbe2aeb851144 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 8 Feb 2023 14:53:59 +0800 Subject: [PATCH 504/676] raftstore-v2: gc all split tablets (#14169) close tikv/tikv#14162, close tikv/tikv#14163 Force gc all split tablets by checking finish event. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 5 ++ .../src/operation/command/admin/mod.rs | 5 +- .../src/operation/command/admin/split.rs | 51 ++++++++++---- .../raftstore-v2/src/operation/command/mod.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 69 ++++++++++++------- .../raftstore-v2/src/worker/tablet_gc.rs | 67 ++++++++++++++++-- 6 files changed, 155 insertions(+), 46 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 83d6b2e1f2a..1a507bb7f10 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -372,6 +372,11 @@ impl StorePollerBuilder { for entry in file_system::read_dir(self.tablet_registry.tablet_root())? 
{ let entry = entry?; let path = entry.path(); + if path.extension().map_or(false, |s| s == "tmp") { + // The directory may be generated by an aborted checkpoint. + file_system::remove_dir_all(&path)?; + continue; + } let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; let fsm = match peers.get(®ion_id) { Some((_, fsm)) => fsm, diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 1546983645f..a912cb7a3d5 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -14,7 +14,10 @@ use protobuf::Message; use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; -pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; +pub use split::{ + report_split_init_finish, temp_split_path, RequestSplit, SplitFlowControl, SplitInit, + SPLIT_PREFIX, +}; use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index f9e44286490..0fbe31277ed 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -53,7 +53,7 @@ use raftstore::{ }, Result, }; -use slog::info; +use slog::{error, info}; use tikv_util::{log::SlogFormat, slog_panic}; use crate::{ @@ -86,8 +86,8 @@ pub struct SplitInit { pub region: metapb::Region, pub check_split: bool, pub scheduled: bool, - pub source_leader: bool, - pub source_id: u64, + pub derived_leader: bool, + pub derived_region_id: u64, /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, @@ -113,6 +113,35 @@ impl SplitInit { } } +pub fn report_split_init_finish( + 
ctx: &mut StoreContext, + derived_region_id: u64, + finish_region_id: u64, + cleanup: bool, +) where + EK: KvEngine, + ER: RaftEngine, +{ + let _ = ctx.router.force_send( + derived_region_id, + PeerMsg::SplitInitFinish(finish_region_id), + ); + if !cleanup { + return; + } + + if let Err(e) = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::direct_destroy_path(temp_split_path( + &ctx.tablet_registry, + finish_region_id, + ))) + { + error!(ctx.logger, "failed to destroy split init temp"; "error" => ?e); + } +} + #[derive(Debug)] pub struct RequestSplit { pub epoch: RegionEpoch, @@ -527,8 +556,8 @@ impl Peer { new_ids.insert(new_region_id); let split_init = PeerMsg::SplitInit(Box::new(SplitInit { region: new_region, - source_leader: self.is_leader(), - source_id: region_id, + derived_leader: self.is_leader(), + derived_region_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, approximate_size: estimated_size, @@ -574,10 +603,8 @@ impl Peer { let region_id = split_init.region.id; if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX { // Race with split operation. The tablet created by split will eventually be - // deleted (TODO). We don't trim it. - let _ = store_ctx - .router - .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); + // deleted. We don't trim it. 
+ report_split_init_finish(store_ctx, split_init.derived_region_id, region_id, true); return; } @@ -631,7 +658,7 @@ impl Peer { }, )); } - if split_init.source_leader + if split_init.derived_leader && self.leader_id() == INVALID_ID && self.term() == RAFT_INIT_LOG_TERM { @@ -650,9 +677,7 @@ impl Peer { if split_init.check_split { self.add_pending_tick(PeerTick::SplitRegionCheck); } - let _ = store_ctx - .router - .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); + report_split_init_finish(store_ctx, split_init.derived_region_id, region_id, false); } pub fn on_split_init_finish(&mut self, region_id: u64) { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 5434eca6b38..d887af7d6d6 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -59,8 +59,8 @@ mod control; mod write; pub use admin::{ - temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, SplitFlowControl, SplitInit, - SPLIT_PREFIX, + report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, + SplitFlowControl, SplitInit, SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 11969701c74..8b63f9aae89 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -44,6 +44,7 @@ use super::command::SplitInit; use crate::{ batch::StoreContext, fsm::{PeerFsm, Store}, + operation::command::report_split_init_finish, raft::{Peer, Storage}, router::{CmdResChannel, PeerMsg, PeerTick}, }; @@ -111,6 +112,46 @@ pub struct GcPeerContext { confirmed_ids: Vec, } +fn check_if_to_peer_destroyed( + engine: &ER, + msg: &RaftMessage, + store_id: u64, +) -> engine_traits::Result { + let region_id = msg.get_region_id(); + let to_peer = 
msg.get_to_peer(); + let local_state = match engine.get_region_state(region_id, u64::MAX)? { + Some(s) => s, + None => return Ok(false), + }; + // Split will not create peer in v2, so the state must be Tombstone. + if local_state.get_state() != PeerState::Tombstone { + panic!( + "[region {}] {} peer doesn't exist but has valid local state {:?}", + region_id, to_peer.id, local_state + ); + } + // Compared to v1, we rely on leader to confirm destroy actively, so here + // skip handling gc for simplicity. + let local_epoch = local_state.get_region().get_region_epoch(); + // The region in this peer is already destroyed + if util::is_epoch_stale(msg.get_region_epoch(), local_epoch) { + return Ok(true); + } + if let Some(local_peer) = find_peer(local_state.get_region(), store_id) && to_peer.id <= local_peer.get_id() { + return Ok(true); + } + // If the peer is destroyed by conf change, all above checks will pass. + if local_state + .get_removed_records() + .iter() + .find(|p| p.get_store_id() == store_id) + .map_or(false, |p| to_peer.id <= p.get_id()) + { + return Ok(true); + } + Ok(false) +} + impl Store { /// The method is called during split. 
/// The creation process is: @@ -126,6 +167,7 @@ impl Store { ER: RaftEngine, T: Transport, { + let derived_region_id = msg.derived_region_id; let region_id = msg.region.id; let mut raft_msg = Box::::default(); raft_msg.set_region_id(region_id); @@ -147,7 +189,8 @@ impl Store { self.logger(), "Split peer is destroyed before sending the intialization msg"; "split init msg" => ?m, - ) + ); + report_split_init_finish(ctx, derived_region_id, region_id, true); } } @@ -197,33 +240,13 @@ impl Store { ctx.raft_metrics.message_dropped.stale_msg.inc(); return; } - let mut destroyed = false; - let local_state = match ctx.engine.get_region_state(region_id, u64::MAX) { - Ok(s) => s, + let destroyed = match check_if_to_peer_destroyed(&ctx.engine, &msg, self.store_id()) { + Ok(d) => d, Err(e) => { error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); return; } }; - if let Some(local_state) = local_state { - // Split will not create peer in v2, so the state must be Tombstone. - if local_state.get_state() != PeerState::Tombstone { - panic!( - "[region {}] {} peer doesn't exist but has valid local state {:?}", - region_id, to_peer.id, local_state - ); - } - // Compared to v1, we rely on leader to confirm destroy actively, so here - // skip handling gc for simplicity. 
- let local_epoch = local_state.get_region().get_region_epoch(); - // The region in this peer is already destroyed - if util::is_epoch_stale(msg.get_region_epoch(), local_epoch) { - destroyed = true; - } - if !destroyed && let Some(local_peer) = find_peer(local_state.get_region(), self.store_id()) && to_peer.id <= local_peer.get_id() { - destroyed = true; - } - } if destroyed { if msg.get_is_tombstone() { if let Some(msg) = build_peer_destroyed_report(&mut msg) { diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index dc5f3dad56d..d9bd03b326a 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -13,6 +13,7 @@ use slog::{debug, error, info, warn, Logger}; use tikv_util::{ worker::{Runnable, RunnableWithTimer}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, + Either, }; const DEFAULT_BACKGROUND_POOL_SIZE: usize = 6; @@ -25,7 +26,8 @@ pub enum Task { cb: Box, }, PrepareDestroy { - tablet: EK, + // A path is passed only when the db is never opened. + tablet: Either, region_id: u64, wait_for_persisted: u64, }, @@ -33,6 +35,8 @@ pub enum Task { region_id: u64, persisted_index: u64, }, + /// Sometimes we know for sure a tablet can be destroyed directly. + DirectDestroy { tablet: Either }, } impl Display for Task { @@ -63,6 +67,9 @@ impl Display for Task { "destroy tablet for region_id {} persisted_index {}", region_id, persisted_index, ), + Task::DirectDestroy { .. 
} => { + write!(f, "direct destroy tablet") + } } } } @@ -81,7 +88,16 @@ impl Task { #[inline] pub fn prepare_destroy(tablet: EK, region_id: u64, wait_for_persisted: u64) -> Self { Task::PrepareDestroy { - tablet, + tablet: Either::Left(tablet), + region_id, + wait_for_persisted, + } + } + + #[inline] + pub fn prepare_destroy_path(path: PathBuf, region_id: u64, wait_for_persisted: u64) -> Self { + Task::PrepareDestroy { + tablet: Either::Right(path), region_id, wait_for_persisted, } @@ -94,6 +110,20 @@ impl Task { persisted_index, } } + + #[inline] + pub fn direct_destroy(tablet: EK) -> Self { + Task::DirectDestroy { + tablet: Either::Left(tablet), + } + } + + #[inline] + pub fn direct_destroy_path(path: PathBuf) -> Self { + Task::DirectDestroy { + tablet: Either::Right(path), + } + } } pub struct Runner { @@ -184,14 +214,29 @@ impl Runner { .unwrap(); } - fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { - // The tablet is about to be deleted, flush is a waste and will block destroy. - let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); - let _ = tablet.pause_background_work(); + fn pause_background_work(&mut self, tablet: Either) -> PathBuf { + match tablet { + Either::Left(tablet) => { + // The tablet is about to be deleted, flush is a waste and will block destroy. 
+ let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); + let _ = tablet.pause_background_work(); + PathBuf::from(tablet.path()) + } + Either::Right(path) => path, + } + } + + fn prepare_destroy( + &mut self, + region_id: u64, + tablet: Either, + wait_for_persisted: u64, + ) { + let path = self.pause_background_work(tablet); self.waiting_destroy_tasks .entry(region_id) .or_default() - .push((PathBuf::from(tablet.path()), wait_for_persisted)); + .push((path, wait_for_persisted)); } fn destroy(&mut self, region_id: u64, persisted: u64) { @@ -208,6 +253,13 @@ impl Runner { } } + fn direct_destroy(&mut self, tablet: Either) { + let path = self.pause_background_work(tablet); + if !Self::process_destroy_task(&self.logger, &self.tablet_registry, &path) { + self.pending_destroy_tasks.push(path); + } + } + /// Returns true if task is consumed. Failure is considered consumed. fn process_destroy_task(logger: &Logger, registry: &TabletRegistry, path: &Path) -> bool { match EK::locked(path.to_str().unwrap()) { @@ -268,6 +320,7 @@ where region_id, persisted_index, } => self.destroy(region_id, persisted_index), + Task::DirectDestroy { tablet, .. 
} => self.direct_destroy(tablet), } } } From eb2e41645fa6a8db25211e8e3831dbea89bb7e18 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 8 Feb 2023 15:23:59 +0800 Subject: [PATCH 505/676] log-backup: retry for more time when failed to get snapshot(to adapt the feature witness) (#14155) ref tikv/tikv#14137 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 8 +++++--- components/backup-stream/src/event_loader.rs | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index dc053feff33..a13c52c9212 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -349,7 +349,7 @@ where let mut watcher = match watcher { Ok(w) => w, Err(e) => { - e.report("failed to start watch pause"); + e.report("failed to start watch task"); tokio::time::sleep(Duration::from_secs(5)).await; continue; } @@ -358,11 +358,12 @@ where loop { if let Some(event) = watcher.stream.next().await { - info!("backup stream watch event from etcd"; "event" => ?event); + info!("backup stream watch task from etcd"; "event" => ?event); let revision = meta_client.get_reversion().await; if let Ok(r) = revision { revision_new = r; + info!("update the revision"; "revision" => revision_new); } match event { @@ -408,10 +409,11 @@ where loop { if let Some(event) = watcher.stream.next().await { - info!("backup stream watch event from etcd"; "event" => ?event); + info!("backup stream watch pause from etcd"; "event" => ?event); let revision = meta_client.get_reversion().await; if let Ok(r) = revision { revision_new = r; + info!("update the revision"; "revision" => revision_new); } match event { diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 6222f058cd4..13c958a499a 100644 --- a/components/backup-stream/src/event_loader.rs +++ 
b/components/backup-stream/src/event_loader.rs @@ -43,7 +43,7 @@ use crate::{ Task, }; -const MAX_GET_SNAPSHOT_RETRY: usize = 3; +const MAX_GET_SNAPSHOT_RETRY: usize = 5; #[derive(Clone)] pub struct PendingMemoryQuota(Arc); @@ -269,7 +269,7 @@ where if !can_retry { break; } - std::thread::sleep(Duration::from_millis(500)); + std::thread::sleep(Duration::from_secs(1)); continue; } } From dda37a457c116586992b6a5b758b807f1ca2c1fc Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 8 Feb 2023 15:53:59 +0800 Subject: [PATCH 506/676] server: disable `PersistStats` RocksDB task for v2 (#14111) ref tikv/tikv#12842 Signed-off-by: tabokie --- src/config/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/config/mod.rs b/src/config/mod.rs index 4188d8409e3..7247d426b21 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1324,6 +1324,10 @@ impl DbConfig { if let Some(r) = &shared.write_buffer_manager { opts.set_write_buffer_manager(r); } + if for_engine == EngineType::RaftKv2 { + // Historical stats are not used. 
+ opts.set_stats_persist_period_sec(0); + } opts } From f46f3866410b1698d306242f4168bae7c3366c48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 8 Feb 2023 16:23:59 +0800 Subject: [PATCH 507/676] log-backup: update the endpoints when etcd cluster config changes (#14127) close tikv/tikv#14165 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/backup-stream/Cargo.toml | 2 +- .../backup-stream/src/metadata/store/etcd.rs | 317 +++++++++++++++++- .../src/metadata/store/lazy_etcd.rs | 109 +++++- 4 files changed, 417 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1b3c1452ebf..7750f729778 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1702,7 +1702,7 @@ dependencies = [ [[package]] name = "etcd-client" version = "0.10.2" -source = "git+https://github.com/pingcap/etcd-client?rev=14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e#14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e" +source = "git+https://github.com/pingcap/etcd-client?rev=41d393c32a7a7c728550cee1d9a138dafe6f3e27#41d393c32a7a7c728550cee1d9a138dafe6f3e27" dependencies = [ "http", "hyper", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index b1a61580cb6..43bda42a088 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -32,7 +32,7 @@ engine_traits = { workspace = true } error_code = { workspace = true } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. 
-etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e", features = ["pub-response-field", "tls-openssl-vendored"] } +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "41d393c32a7a7c728550cee1d9a138dafe6f3e27", features = ["pub-response-field", "tls-openssl-vendored"] } external_storage = { workspace = true } external_storage_export = { workspace = true } fail = "0.5" diff --git a/components/backup-stream/src/metadata/store/etcd.rs b/components/backup-stream/src/metadata/store/etcd.rs index e52cc4f92d9..62a246a08ef 100644 --- a/components/backup-stream/src/metadata/store/etcd.rs +++ b/components/backup-stream/src/metadata/store/etcd.rs @@ -4,17 +4,17 @@ use std::{ cmp::Ordering, collections::{HashMap, HashSet}, pin::Pin, - sync::Arc, + sync::{Arc, Weak}, time::Duration, }; use async_trait::async_trait; use etcd_client::{ - Client, Compare, CompareOp, DeleteOptions, EventType, GetOptions, PutOptions, SortOrder, - SortTarget, Txn, TxnOp, WatchOptions, + Client, Compare, CompareOp, DeleteOptions, EventType, GetOptions, Member, PutOptions, + SortOrder, SortTarget, Txn, TxnOp, WatchOptions, }; use futures::StreamExt; -use tikv_util::warn; +use tikv_util::{info, warn}; use tokio::sync::Mutex; use tokio_stream::Stream; @@ -23,6 +23,7 @@ use super::{ TransactionOp, }; use crate::{ + annotate, errors::{Error, EtcdErrorExt, Result}, metadata::{ keys::{KeyValue, MetaKey}, @@ -35,6 +36,187 @@ use crate::{ #[derive(Clone)] pub struct EtcdStore(Arc>); +#[derive(Default)] +pub(super) struct TopologyUpdater { + last_urls: HashSet, + client: Weak>, + + // back off configs + pub(super) loop_interval: Duration, + pub(super) loop_failure_back_off: Duration, +} + +impl std::fmt::Debug for TopologyUpdater { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TopologyUpdater") + .field("last_urls", &self.last_urls) + .finish() + } +} + +#[async_trait] +pub(super) trait 
ClusterInfoProvider { + async fn get_members(&mut self) -> Result>; + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()>; + async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()>; +} + +#[async_trait] +impl ClusterInfoProvider for Client { + async fn get_members(&mut self) -> Result> { + let result = self.member_list().await?; + Ok(result.members().to_vec()) + } + + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { + Client::add_endpoint(self, endpoint) + .await + .map_err(|err| annotate!(err, "during adding the endpoint {}", endpoint))?; + Ok(()) + } + + async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()> { + Client::remove_endpoint(self, endpoint) + .await + .map_err(|err| annotate!(err, "during removing the endpoint {}", endpoint))?; + Ok(()) + } +} + +#[derive(Debug, Clone, Copy)] +enum DiffType { + Add, + Remove, +} + +#[derive(Clone)] +struct Diff { + diff_type: DiffType, + url: String, +} + +impl std::fmt::Debug for Diff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let syn = match self.diff_type { + DiffType::Add => "+", + DiffType::Remove => "-", + }; + write!(f, "{}{}", syn, self.url) + } +} + +impl TopologyUpdater { + // Note: we may require the initial endpoints from the arguments directly. + // So the internal map won't get inconsistent when the cluster config changed + // during initializing. + // But that is impossible for now because we cannot query the node ID before + // connecting. 
+ pub fn new(cluster_ref: Weak>) -> Self { + Self { + last_urls: Default::default(), + client: cluster_ref, + + loop_interval: Duration::from_secs(60), + loop_failure_back_off: Duration::from_secs(10), + } + } + + pub fn init(&mut self, members: impl Iterator) { + for mem in members { + self.last_urls.insert(mem); + } + } + + fn diff(&self, incoming: &[Member]) -> Vec { + let newer = incoming + .iter() + .flat_map(|mem| mem.client_urls().iter()) + .collect::>(); + let mut result = vec![]; + for url in &newer { + if !self.last_urls.contains(*url) { + result.push(Diff { + diff_type: DiffType::Add, + url: String::clone(url), + }) + } + } + for url in &self.last_urls { + if !newer.contains(url) { + result.push(Diff { + diff_type: DiffType::Remove, + url: String::clone(url), + }) + } + } + result + } + + fn apply(&mut self, diff: &Diff) -> Option { + match diff.diff_type { + DiffType::Add => match self.last_urls.insert(diff.url.clone()) { + true => None, + false => Some(format!( + "the member to adding with url {} overrides existing urls.", + diff.url + )), + }, + DiffType::Remove => match self.last_urls.remove(&diff.url) { + true => None, + false => Some(format!( + "the member to remove with url {} hasn't been added.", + diff.url + )), + }, + } + } + + async fn update_topology_by(&mut self, cli: &mut C, diff: &Diff) -> Result<()> { + match diff.diff_type { + DiffType::Add => cli.add_endpoint(&diff.url).await?, + DiffType::Remove => cli.remove_endpoint(&diff.url).await?, + } + Ok(()) + } + + async fn do_update(&mut self, cli: &mut C) -> Result<()> { + let cluster = cli.get_members().await?; + let diffs = self.diff(cluster.as_slice()); + if !diffs.is_empty() { + info!("log backup updating store topology."; "diffs" => ?diffs, "current_state" => ?self); + } + for diff in diffs { + match self.apply(&diff) { + Some(warning) => { + warn!("log backup meet some wrong status when updating PD clients, skipping this update."; "warn" => %warning); + } + None => 
self.update_topology_by(cli, &diff).await?, + } + } + Result::Ok(()) + } + + pub(super) async fn update_topology_loop(&mut self) { + while let Some(cli) = self.client.upgrade() { + let mut lock = cli.lock().await; + let result = self.do_update(&mut lock).await; + drop(lock); + match result { + Ok(_) => tokio::time::sleep(self.loop_interval).await, + Err(err) => { + err.report("during updating etcd topology"); + tokio::time::sleep(self.loop_failure_back_off).await; + } + } + } + } + + pub async fn main_loop(mut self) { + info!("log backup topology updater finish initialization."; "current_state" => ?self); + self.update_topology_loop().await + } +} + impl EtcdStore { pub fn connect, S: AsRef<[E]>>(endpoints: S) -> Self { // TODO remove block_on @@ -42,6 +224,10 @@ impl EtcdStore { futures::executor::block_on(etcd_client::Client::connect(&endpoints, None)).unwrap(); Self(Arc::new(Mutex::new(cli))) } + + pub fn inner(&self) -> &Arc> { + &self.0 + } } impl From for EtcdStore { @@ -316,3 +502,126 @@ impl Snapshot for EtcdSnapshot { self.revision } } + +#[cfg(test)] +mod test { + use std::{ + collections::{HashMap, HashSet}, + fmt::Display, + sync::Arc, + time::Duration, + }; + + use async_trait::async_trait; + use etcd_client::{proto::PbMember, Member}; + use tokio::{sync::Mutex, time::timeout}; + + use super::{ClusterInfoProvider, TopologyUpdater}; + use crate::errors::Result; + + #[derive(Default, Debug)] + struct FakeCluster { + id_alloc: u64, + members: HashMap, + endpoints: HashSet, + } + + #[async_trait] + impl ClusterInfoProvider for FakeCluster { + async fn get_members(&mut self) -> Result> { + let members = self.members.values().cloned().collect(); + Ok(members) + } + + async fn add_endpoint(&mut self, endpoint: &str) -> Result<()> { + self.endpoints.insert(endpoint.to_owned()); + Ok(()) + } + + async fn remove_endpoint(&mut self, endpoint: &str) -> Result<()> { + self.endpoints.remove(endpoint); + Ok(()) + } + } + + impl FakeCluster { + fn new_id(&mut self) -> 
u64 { + let i = self.id_alloc; + self.id_alloc += 1; + i + } + + fn init_with_member(&mut self, n: usize) -> Vec { + let mut endpoints = Vec::with_capacity(n); + for _ in 0..n { + let mem = self.add_member(); + let url = format!("fakestore://{}", mem); + self.endpoints.insert(url.clone()); + endpoints.push(url); + } + endpoints + } + + fn add_member(&mut self) -> u64 { + let id = self.new_id(); + let mut mem = PbMember::default(); + mem.id = id; + mem.client_ur_ls = vec![format!("fakestore://{}", id)]; + // Safety: `Member` is #[repr(transparent)]. + self.members.insert(id, unsafe { std::mem::transmute(mem) }); + id + } + + fn remove_member(&mut self, id: u64) -> bool { + self.members.remove(&id).is_some() + } + + fn check_consistency(&self, message: impl Display) { + let urls = self + .members + .values() + .flat_map(|mem| mem.client_urls().iter().cloned()) + .collect::>(); + assert_eq!( + urls, self.endpoints, + "{}: consistency check not passed.", + message + ); + } + } + + #[test] + fn test_topology_updater() { + let mut c = FakeCluster::default(); + let eps = c.init_with_member(3); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + let sc = Arc::new(Mutex::new(c)); + let mut tu = TopologyUpdater::new(Arc::downgrade(&sc)); + tu.loop_failure_back_off = Duration::ZERO; + tu.loop_interval = Duration::from_millis(100); + tu.init(eps.into_iter()); + + { + let mut sc = sc.blocking_lock(); + sc.check_consistency("after init"); + sc.add_member(); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + sc.check_consistency("adding nodes"); + sc.add_member(); + sc.add_member(); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + sc.check_consistency("adding more nodes"); + assert!(sc.remove_member(0), "{:?}", sc); + rt.block_on(tu.do_update(&mut sc)).unwrap(); + sc.check_consistency("removing nodes"); + } + + drop(sc); + rt.block_on(async { timeout(Duration::from_secs(1), tu.update_topology_loop()).await }) + .unwrap() + } +} diff 
--git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 7e8b7881070..3b697dae9b9 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -19,7 +19,10 @@ use tikv_util::{ }; use tokio::sync::Mutex as AsyncMutex; -use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; +use super::{ + etcd::{EtcdSnapshot, TopologyUpdater}, + EtcdStore, MetaStore, +}; use crate::errors::{ContextualResultExt, Result}; const RPC_TIMEOUT: Duration = Duration::from_secs(30); @@ -34,6 +37,16 @@ pub struct ConnectionConfig { pub keep_alive_timeout: Duration, } +impl Default for ConnectionConfig { + fn default() -> Self { + Self { + tls: Default::default(), + keep_alive_interval: Duration::from_secs(10), + keep_alive_timeout: Duration::from_secs(3), + } + } +} + impl std::fmt::Debug for ConnectionConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ConnectionConfig") @@ -89,12 +102,15 @@ impl ConnectionConfig { impl LazyEtcdClient { pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - Self(Arc::new(AsyncMutex::new(LazyEtcdClientInner { - conf, - endpoints: endpoints.iter().map(ToString::to_string).collect(), - last_modified: None, - cli: None, - }))) + let mut inner = LazyEtcdClientInner::new(endpoints, conf); + inner.normalize_urls(); + Self(Arc::new(AsyncMutex::new(inner))) + } + + // For testing -- check whether the endpoints are properly normalized. 
+ #[cfg(test)] + pub(super) fn endpoints(&self) -> Vec { + self.0.blocking_lock().endpoints.clone() } async fn get_cli(&self) -> Result { @@ -112,6 +128,17 @@ pub struct LazyEtcdClientInner { cli: Option, } +impl LazyEtcdClientInner { + fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { + LazyEtcdClientInner { + conf, + endpoints: endpoints.iter().map(ToString::to_string).collect(), + last_modified: None, + cli: None, + } + } +} + fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { match etcd_err { EtcdError::InvalidArgs(_) @@ -164,6 +191,21 @@ where } impl LazyEtcdClientInner { + fn normalize_urls(&mut self) { + let enabled_tls = self.conf.tls.client_suite().is_ok(); + for endpoint in self.endpoints.iter_mut() { + // Don't touch them when the schemes already provided. + // Given etcd is based on gRPC (which relies on HTTP/2), + // there shouldn't be other schemes available (Hopefully...) + if endpoint.starts_with("http://") || endpoint.starts_with("https://") { + continue; + } + let expected_scheme = if enabled_tls { "https" } else { "http" }; + *endpoint = format!("{}://{}", expected_scheme, endpoint) + } + info!("log backup normalized etcd endpoints"; "endpoints" => ?self.endpoints); + } + async fn connect(&mut self) -> Result<&EtcdStore> { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control @@ -178,7 +220,10 @@ impl LazyEtcdClientInner { .await .context("during connecting to the etcd")?; let store = EtcdStore::from(store); + let mut updater = TopologyUpdater::new(Arc::downgrade(store.inner())); self.cli = Some(store); + updater.init(self.endpoints.iter().cloned()); + tokio::task::spawn(updater.main_loop()); Ok(self.cli.as_ref().unwrap()) } @@ -219,3 +264,53 @@ impl MetaStore for LazyEtcdClient { self.get_cli().await?.txn_cond(txn).await } } + +#[cfg(test)] +mod tests { + use std::{fs::File, io::Write, path::PathBuf, sync::Arc}; + + use security::{SecurityConfig, SecurityManager}; + use 
tempfile::TempDir; + + use super::LazyEtcdClient; + use crate::{errors::Result, metadata::ConnectionConfig}; + + #[test] + fn test_normalize_url() -> Result<()> { + let endpoints = ["http://pd-1".to_owned(), "pd-2".to_owned()]; + let le = LazyEtcdClient::new(&endpoints, Default::default()); + assert_eq!(le.endpoints(), &["http://pd-1", "http://pd-2"]); + + let tempdir = TempDir::new()?; + let write_all = |path: &PathBuf, content| { + let mut f = File::create(path)?; + f.write_all(content)?; + Result::Ok(()) + }; + let ca = tempdir.path().join("ca"); + let cert = tempdir.path().join("cert"); + let key = tempdir.path().join("key"); + write_all(&ca, b"CA :3")?; + write_all(&cert, b"Cert :D")?; + write_all(&key, b"Key X)")?; + + let cfg = SecurityConfig { + ca_path: ca.to_string_lossy().into_owned(), + cert_path: cert.to_string_lossy().into_owned(), + key_path: key.to_string_lossy().into_owned(), + + ..Default::default() + }; + let sm = SecurityManager::new(&cfg).unwrap(); + let endpoints = ["https://pd-1".to_owned(), "pd-2".to_owned()]; + let le = LazyEtcdClient::new( + &endpoints, + ConnectionConfig { + tls: Arc::new(sm), + ..Default::default() + }, + ); + assert_eq!(le.endpoints(), &["https://pd-1", "https://pd-2"]); + Result::Ok(()) + } +} From f6513edc265b6716f3b99ed9b10dd4ffacdf4de2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 8 Feb 2023 17:27:59 +0800 Subject: [PATCH 508/676] log-backup: edit checkpoint to 2 hours (#13894) ref tikv/tikv#13889 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/checkpoint_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 5cf4292faa3..47ec34d2113 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -371,13 +371,13 @@ 
impl FlushObserver for BasicFlushObserver { .update_service_safe_point( format!("backup-stream-{}-{}", task, self.store_id), TimeStamp::new(rts.saturating_sub(1)), - // Add a service safe point for 24 hours. (the same as fatal error.) + // Add a service safe point for 2 hours. // We make it the same duration as we meet fatal errors because TiKV may be // SIGKILL'ed after it meets fatal error and before it successfully updated the // fatal error safepoint. // TODO: We'd better make the coordinator, who really // calculates the checkpoint to register service safepoint. - Duration::from_secs(60 * 60 * 24), + Duration::from_secs(60 * 60 * 2), ) .await { From ee00d70008562b45bafb0dcd88d70520a18fa742 Mon Sep 17 00:00:00 2001 From: Connor Date: Wed, 8 Feb 2023 18:23:59 +0800 Subject: [PATCH 509/676] grafana: add grpc resource group QPS panel (#14171) ref tikv/tikv#13730 Add grpc resource group QPS panel and fix datasource Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 154 ++++++++++++++++++++++++------ 1 file changed, 126 insertions(+), 28 deletions(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index cff4b5f7742..357edac04a7 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -40,6 +40,12 @@ "name": "Singlestat", "version": "" }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, { "type": "panel", "id": "table", @@ -64,7 +70,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1651043540619, + "iteration": 1675760728538, "links": [], "panels": [ { @@ -4532,7 +4538,6 @@ }, "yaxes": [ { - "$$hashKey": "object:150", "format": "s", "label": null, "logBase": 2, @@ -4541,7 +4546,6 @@ "show": true }, { - "$$hashKey": "object:151", "format": "short", "label": null, "logBase": 2, @@ -4632,7 +4636,6 @@ }, "yaxes": [ { - "$$hashKey": "object:150", "format": "s", "label": null, "logBase": 2, @@ -4641,7 +4644,6 @@ 
"show": true }, { - "$$hashKey": "object:151", "format": "short", "label": null, "logBase": 2, @@ -4697,7 +4699,6 @@ "renderer": "flot", "seriesOverrides": [ { - "$$hashKey": "object:80", "alias": "/.*/", "stack": "A" } @@ -4746,7 +4747,6 @@ }, "yaxes": [ { - "$$hashKey": "object:264", "format": "ns", "label": null, "logBase": 1, @@ -4755,7 +4755,6 @@ "show": true }, { - "$$hashKey": "object:265", "format": "short", "label": null, "logBase": 1, @@ -4811,7 +4810,6 @@ "renderer": "flot", "seriesOverrides": [ { - "$$hashKey": "object:62", "alias": "/.*/", "stack": "A" } @@ -4858,7 +4856,6 @@ }, "yaxes": [ { - "$$hashKey": "object:264", "format": "binBps", "label": null, "logBase": 1, @@ -4867,7 +4864,6 @@ "show": true }, { - "$$hashKey": "object:265", "format": "short", "label": null, "logBase": 1, @@ -4915,7 +4911,7 @@ "h": 8, "w": 12, "x": 0, - "y": 4 + "y": 5 }, "hiddenSeries": false, "id": 95, @@ -4941,7 +4937,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5019,7 +5015,7 @@ "h": 8, "w": 12, "x": 12, - "y": 4 + "y": 5 }, "hiddenSeries": false, "id": 107, @@ -5045,7 +5041,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5123,7 +5119,7 @@ "h": 8, "w": 12, "x": 0, - "y": 12 + "y": 13 }, "hiddenSeries": false, "id": 98, @@ -5151,7 +5147,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5228,7 +5224,7 @@ "h": 8, "w": 12, "x": 12, - "y": 12 + "y": 13 }, "hiddenSeries": false, "id": 2532, @@ -5256,7 +5252,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5333,7 +5329,7 @@ "h": 8, "w": 12, "x": 0, - "y": 
20 + "y": 21 }, "hiddenSeries": false, "id": 2533, @@ -5361,7 +5357,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5473,7 +5469,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 21 }, "hiddenSeries": false, "id": 2534, @@ -5501,7 +5497,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -5653,7 +5649,6 @@ }, "yaxes": [ { - "$$hashKey": "object:69", "format": "ops", "label": null, "logBase": 1, @@ -5662,7 +5657,6 @@ "show": true }, { - "$$hashKey": "object:70", "format": "short", "label": null, "logBase": 1, @@ -5761,7 +5755,6 @@ }, "yaxes": [ { - "$$hashKey": "object:69", "format": "µs", "label": null, "logBase": 1, @@ -5770,7 +5763,112 @@ "show": true }, { - "$$hashKey": "object:70", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "The QPS of different resource groups of gRPC request", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 37 + }, + "hiddenSeries": false, + "id": 23763573091, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + 
"seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_grpc_resource_group_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (name)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gRPC resource group QPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { "format": "short", "label": null, "logBase": 1, @@ -15871,7 +15969,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "tidb-cluster", + "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fieldConfig": { From 2d7bf4c2089c0b9d8e2ef6d6417d2473f66f2892 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 8 Feb 2023 20:31:59 +0800 Subject: [PATCH 510/676] resource_control: support return resource groups config via http (#14170) ref tikv/tikv#13730 Signed-off-by: glorv --- components/pd_client/src/client.rs | 8 +- .../resource_control/src/resource_group.rs | 55 ++++++ components/resource_control/src/service.rs | 182 ++++++++++-------- components/server/src/server.rs | 1 + components/server/src/server2.rs | 1 + components/test_pd/src/mocker/etcd.rs | 27 ++- src/server/status_server/mod.rs | 79 ++++++++ tests/integrations/server/status_server.rs | 1 + 8 files changed, 268 insertions(+), 86 deletions(-) diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index b0c21797a91..402192596b5 100644 --- 
a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -338,7 +338,13 @@ impl PdClient for RpcClient { Err(err) => Err(box_err!("{:?}", err)), } }) as PdFuture<_>, - Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + Err(err) => Box::pin(async move { + Err(box_err!( + "load global config failed, path: '{}', err: {:?}", + req.get_config_path(), + err + )) + }) as PdFuture<_>, }; self.pd_client .request(req, executor, LEADER_CHANGE_RETRY) diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index c5112c13516..390214bc687 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -13,6 +13,7 @@ use kvproto::{ kvrpcpb::CommandPri, resource_manager::{GroupMode, ResourceGroup}, }; +use tikv_util::info; use yatp::queue::priority::TaskPriorityProvider; // a read task cost at least 50us. @@ -70,6 +71,7 @@ impl ResourceGroupManager { let ru_quota = Self::get_ru_setting(&rg, controller.is_read); controller.add_resource_group(group_name.clone().into_bytes(), ru_quota); }); + info!("add resource group"; "name"=> &rg.name, "ru" => rg.get_r_u_settings().get_r_u().get_settings().get_fill_rate()); self.resource_groups.insert(group_name, rg); } @@ -78,9 +80,28 @@ impl ResourceGroupManager { self.registry.lock().unwrap().iter().for_each(|controller| { controller.remove_resource_group(group_name.as_bytes()); }); + info!("remove resource group"; "name"=> name); self.resource_groups.remove(&group_name); } + pub fn retain(&self, mut f: impl FnMut(&String, &ResourceGroup) -> bool) { + let mut removed_names = vec![]; + self.resource_groups.retain(|k, v| { + let ret = f(k, v); + if !ret { + removed_names.push(k.clone()); + } + ret + }); + if !removed_names.is_empty() { + self.registry.lock().unwrap().iter().for_each(|controller| { + for name in &removed_names { + 
controller.remove_resource_group(name.as_bytes()); + } + }); + } + } + pub fn get_resource_group(&self, name: &str) -> Option> { self.resource_groups.get(&name.to_ascii_lowercase()) } @@ -173,6 +194,7 @@ impl ResourceController { virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), vt_delta_for_get, }; + // maybe update existed group self.resource_consumptions.insert(name, group); } @@ -192,6 +214,7 @@ impl ResourceController { // do not remove the default resource group, reset to default setting instead. if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + return; } self.resource_consumptions.remove(name); } @@ -487,4 +510,36 @@ pub(crate) mod tests { 10 ); } + + #[test] + fn test_retain_resource_groups() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + for i in 0..5 { + let group1 = new_resource_group_ru(format!("test{}", i), 100); + resource_manager.add_resource_group(group1); + // add a resource group with big ru + let group1 = new_resource_group_ru(format!("group{}", i), 100); + resource_manager.add_resource_group(group1); + } + assert_eq!(resource_manager.get_all_resource_groups().len(), 10); + assert_eq!(resource_ctl.resource_consumptions.len(), 11); // 10 + 1(default) + assert_eq!(resource_ctl_write.resource_consumptions.len(), 11); + + resource_manager.retain(|k, _v| k.starts_with("test")); + assert_eq!(resource_manager.get_all_resource_groups().len(), 5); + assert_eq!(resource_ctl.resource_consumptions.len(), 6); + assert_eq!(resource_ctl_write.resource_consumptions.len(), 6); + assert!(resource_manager.get_resource_group("group1").is_none()); + assert_eq!( + resource_ctl.resource_group("group2".as_bytes()).key(), + "default".as_bytes() + ); + assert_eq!( + 
resource_ctl_write.resource_group("group2".as_bytes()).key(), + "default".as_bytes() + ); + } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index fc24af4fdc4..a2d64f57c3b 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -1,6 +1,6 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. -use std::{sync::Arc, time::Duration}; +use std::{collections::HashSet, sync::Arc, time::Duration}; use futures::{compat::Future01CompatExt, StreamExt}; use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; @@ -35,71 +35,76 @@ const RETRY_INTERVAL: Duration = Duration::from_secs(1); // to consistent with p impl ResourceManagerService { pub async fn watch_resource_groups(&mut self) { - // Firstly, load all resource groups as of now. - let (groups, revision) = self.list_resource_groups().await; - self.revision = revision; - groups - .into_iter() - .for_each(|rg| self.manager.add_resource_group(rg)); - // Secondly, start watcher at loading revision. - loop { - match self - .pd_client - .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) - { - Ok(mut stream) => { - while let Some(grpc_response) = stream.next().await { - match grpc_response { - Ok(r) => { - self.revision = r.get_revision(); - r.get_changes() - .iter() - .for_each(|item| match item.get_kind() { - EventType::Put => { - if let Ok(group) = - protobuf::parse_from_bytes::( + 'outer: loop { + // Firstly, load all resource groups as of now. + self.reload_all_resource_groups().await; + // Secondly, start watcher at loading revision. 
+ loop { + match self + .pd_client + .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) + { + Ok(mut stream) => { + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + self.revision = r.get_revision(); + r.get_changes() + .iter() + .for_each(|item| match item.get_kind() { + EventType::Put => { + match protobuf::parse_from_bytes::( item.get_payload(), - ) - { - self.manager.add_resource_group(group); + ) { + Ok(group) => { + self.manager.add_resource_group(group); + } + Err(e) => { + error!("parse put resource group event failed"; "name" => item.get_name(), "err" => ?e); + } + } } - } - EventType::Delete => { - self.manager.remove_resource_group(item.get_name()); - } - }); - } - Err(err) => { - error!("failed to get stream"; "err" => ?err); - let _ = GLOBAL_TIMER_HANDLE - .delay(std::time::Instant::now() + RETRY_INTERVAL) - .compat() - .await; + EventType::Delete => { + match protobuf::parse_from_bytes::( + item.get_payload(), + ) { + Ok(group) => { + self.manager.remove_resource_group(group.get_name()); + } + Err(e) => { + error!("parse delete resource group event failed"; "name" => item.get_name(), "err" => ?e); + } + } + } + }); + } + Err(err) => { + error!("failed to get stream"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } } } } - } - Err(PdError::DataCompacted(msg)) => { - error!("required revision has been compacted"; "err" => ?msg); - // If the etcd revision is compacted, we need to reload all resouce groups. 
- let (groups, revision) = self.list_resource_groups().await; - self.revision = revision; - groups - .into_iter() - .for_each(|rg| self.manager.add_resource_group(rg)); - } - Err(err) => { - error!("failed to watch resource groups"; "err" => ?err); - let _ = GLOBAL_TIMER_HANDLE - .delay(std::time::Instant::now() + RETRY_INTERVAL) - .compat() - .await; + Err(PdError::DataCompacted(msg)) => { + error!("required revision has been compacted"; "err" => ?msg); + continue 'outer; + } + Err(err) => { + error!("failed to watch resource groups"; "err" => ?err); + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; + } } } } } - async fn list_resource_groups(&mut self) -> (Vec, i64) { + async fn reload_all_resource_groups(&mut self) { loop { match self .pd_client @@ -107,11 +112,22 @@ impl ResourceManagerService { .await { Ok((items, revision)) => { - let groups = items - .into_iter() - .filter_map(|g| protobuf::parse_from_bytes(g.get_payload()).ok()) - .collect(); - return (groups, revision); + let mut vaild_groups = HashSet::with_capacity(items.len()); + items.iter().for_each(|g| { + match protobuf::parse_from_bytes::(g.get_payload()) { + Ok(rg) => { + vaild_groups.insert(rg.get_name().to_ascii_lowercase()); + self.manager.add_resource_group(rg); + } + Err(e) => { + error!("parse resource group failed"; "name" => g.get_name(), "err" => ?e); + } + } + }); + + self.manager.retain(|name, _g| vaild_groups.contains(name)); + self.revision = revision; + return; } Err(err) => { error!("failed to load global config"; "err" => ?err); @@ -185,14 +201,14 @@ pub mod tests { let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); let group = new_resource_group("TEST".into(), true, 100, 100); add_resource_group(s.pd_client.clone(), group); - let (res, revision) = block_on(s.list_resource_groups()); - assert_eq!(res.len(), 1); - assert_eq!(revision, 1); + block_on(s.reload_all_resource_groups()); + 
assert_eq!(s.manager.get_all_resource_groups().len(), 1); + assert_eq!(s.revision, 1); delete_resource_group(s.pd_client.clone(), "TEST"); - let (res, revision) = block_on(s.list_resource_groups()); - assert_eq!(res.len(), 0); - assert_eq!(revision, 2); + block_on(s.reload_all_resource_groups()); + assert_eq!(s.manager.get_all_resource_groups().len(), 0); + assert_eq!(s.revision, 2); server.stop(); } @@ -203,9 +219,24 @@ pub mod tests { let resource_manager = ResourceGroupManager::default(); let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); - let (res, revision) = block_on(s.list_resource_groups()); - assert_eq!(res.len(), 0); - assert_eq!(revision, 0); + block_on(s.reload_all_resource_groups()); + assert_eq!(s.manager.get_all_resource_groups().len(), 0); + assert_eq!(s.revision, 0); + + // TODO: find a better way to observe the watch is ready. + let wait_watch_ready = |s: &ResourceManagerService, count: usize| { + for _i in 0..100 { + if s.manager.get_all_resource_groups().len() == count { + return; + } + std::thread::sleep(Duration::from_millis(1)); + } + panic!( + "wait time out, expectd: {}, got: {}", + count, + s.manager.get_all_resource_groups().len() + ); + }; let background_worker = Builder::new("background").thread_count(1).create(); let mut s_clone = s.clone(); @@ -220,16 +251,13 @@ pub mod tests { // Mock modify let group2 = new_resource_group_ru("TEST2".into(), 50); add_resource_group(s.pd_client.clone(), group2); - let (res, revision) = block_on(s.list_resource_groups()); - assert_eq!(res.len(), 2); - assert_eq!(revision, 3); + wait_watch_ready(&s, 2); + // Mock delete delete_resource_group(s.pd_client.clone(), "TEST1"); - let (res, revision) = block_on(s.list_resource_groups()); - assert_eq!(res.len(), 1); - assert_eq!(revision, 4); + // Wait for watcher - std::thread::sleep(Duration::from_millis(100)); + wait_watch_ready(&s, 1); let groups = s.manager.get_all_resource_groups(); assert_eq!(groups.len(), 1); 
assert!(s.manager.get_resource_group("TEST1").is_none()); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index be516a84ae0..99d56ac10cd 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1697,6 +1697,7 @@ where Arc::new(self.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), self.store_path.clone(), + self.resource_manager.clone(), ) { Ok(status_server) => Box::new(status_server), Err(e) => { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 0797b391d87..2a67318439b 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1311,6 +1311,7 @@ where Arc::new(self.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), self.store_path.clone(), + self.resource_manager.clone(), ) { Ok(status_server) => Box::new(status_server), Err(e) => { diff --git a/components/test_pd/src/mocker/etcd.rs b/components/test_pd/src/mocker/etcd.rs index 3939dfc9a72..d0fe3f43e68 100644 --- a/components/test_pd/src/mocker/etcd.rs +++ b/components/test_pd/src/mocker/etcd.rs @@ -50,7 +50,7 @@ impl Etcd { let (k, v) = group.last()?; match v { Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), - Value::Del => None, + Value::Del(_) => None, } }) .fold(Vec::new(), |mut items, item| { @@ -88,13 +88,14 @@ impl Etcd { Bound::Included(Key(start_key, 0)), Bound::Excluded(Key(end_key, self.revision)), )) - .map(|(k, _)| Key::clone(k)) + .map(|(k, v)| (Key::clone(k), v.clone())) .collect::>(); v.dedup_by(|k1, k2| k1.0 == k2.0); - for mut victim in v { + for (victim, data) in v { let k = Key(victim.0.clone(), rev); - self.items.insert(k, Value::Del); + let data = data.take_data(); + self.items.insert(k, Value::Del(data.clone())); for sub in self.subs.values() { if victim.0.as_slice() < sub.end_key.as_slice() @@ -103,7 +104,7 @@ impl Etcd { sub.tx .send(KvEvent { kind: KvEventType::Delete, - pair: 
KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]), + pair: KeyValue(MetaKey(victim.0.clone()), data.clone()), }) .await .unwrap(); @@ -135,9 +136,9 @@ impl Etcd { kind: KvEventType::Put, pair: KeyValue(MetaKey(k.0.clone()), val.clone()), }, - Value::Del => KvEvent { + Value::Del(val) => KvEvent { kind: KvEventType::Delete, - pair: KeyValue(MetaKey(k.0.clone()), vec![]), + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), }, }; tx.send(event).await.expect("too many pending events"); @@ -259,7 +260,17 @@ impl std::fmt::Debug for Key { #[derive(Debug, PartialEq, Clone)] enum Value { Val(Vec), - Del, + // the value is the last put val. This is used for watch changes. + Del(Vec), +} + +impl Value { + fn take_data(self) -> Vec { + match self { + Value::Val(d) => d, + Value::Del(d) => d, + } + } } /// The key set for getting. diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 2f87c5d0264..ad7779b121c 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -32,6 +32,7 @@ use hyper::{ service::{make_service_fn, service_fn}, Body, Method, Request, Response, Server, StatusCode, }; +use kvproto::resource_manager::ResourceGroup; use online_config::OnlineConfig; use openssl::{ ssl::{Ssl, SslAcceptor, SslContext, SslFiletype, SslMethod, SslVerifyMode}, @@ -44,7 +45,9 @@ pub use profile::{ }; use prometheus::TEXT_FORMAT; use regex::Regex; +use resource_control::ResourceGroupManager; use security::{self, SecurityConfig}; +use serde::Serialize; use serde_json::Value; use tikv_kv::RaftExtension; use tikv_util::{ @@ -89,6 +92,7 @@ pub struct StatusServer { router: R, security_config: Arc, store_path: PathBuf, + resource_manager: Option>, } impl StatusServer @@ -101,6 +105,7 @@ where security_config: Arc, router: R, store_path: PathBuf, + resource_manager: Option>, ) -> Result { let thread_pool = Builder::new_multi_thread() .enable_all() @@ -120,6 +125,7 @@ where router, security_config, store_path, + 
resource_manager, }) } @@ -518,6 +524,7 @@ where let cfg_controller = self.cfg_controller.clone(); let router = self.router.clone(); let store_path = self.store_path.clone(); + let resource_manager = self.resource_manager.clone(); // Start to serve. let server = builder.serve(make_service_fn(move |conn: &C| { let x509 = conn.get_x509(); @@ -525,6 +532,7 @@ where let cfg_controller = cfg_controller.clone(); let router = router.clone(); let store_path = store_path.clone(); + let resource_manager = resource_manager.clone(); async move { // Create a status service. Ok::<_, hyper::Error>(service_fn(move |req: Request| { @@ -533,6 +541,7 @@ where let cfg_controller = cfg_controller.clone(); let router = router.clone(); let store_path = store_path.clone(); + let resource_manager = resource_manager.clone(); async move { let path = req.uri().path().to_owned(); let method = req.method().to_owned(); @@ -607,6 +616,9 @@ where (Method::PUT, path) if path.starts_with("/log-level") => { Self::change_log_level(req).await } + (Method::GET, "/resource_groups") => { + Self::handle_get_all_resource_groups(resource_manager.as_ref()) + } _ => Ok(make_response(StatusCode::NOT_FOUND, "path not found")), } } @@ -644,6 +656,63 @@ where } Ok(()) } + + pub fn handle_get_all_resource_groups( + mgr: Option<&Arc>, + ) -> hyper::Result> { + let groups = if let Some(mgr) = mgr { + mgr.get_all_resource_groups() + .into_iter() + .map(into_debug_request_group) + .collect() + } else { + vec![] + }; + let body = match serde_json::to_vec(&groups) { + Ok(body) => body, + Err(err) => { + return Ok(make_response( + StatusCode::INTERNAL_SERVER_ERROR, + format!("fails to json: {}", err), + )); + } + }; + match Response::builder() + .header("content-type", "application/json") + .body(hyper::Body::from(body)) + { + Ok(resp) => Ok(resp), + Err(err) => Ok(make_response( + StatusCode::INTERNAL_SERVER_ERROR, + format!("fails to build response: {}", err), + )), + } + } +} + +#[derive(Serialize)] +struct 
ResouceGroupSetting { + name: String, + ru: u64, + burst_limit: i64, +} + +fn into_debug_request_group(rg: ResourceGroup) -> ResouceGroupSetting { + ResouceGroupSetting { + name: rg.name, + ru: rg + .r_u_settings + .get_ref() + .get_r_u() + .get_settings() + .get_fill_rate(), + burst_limit: rg + .r_u_settings + .get_ref() + .get_r_u() + .get_settings() + .get_burst_limit(), + } } // To unify TLS/Plain connection usage in start_serve function @@ -957,6 +1026,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1005,6 +1075,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1050,6 +1121,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1166,6 +1238,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1210,6 +1283,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1246,6 +1320,7 @@ mod tests { Arc::new(new_security_cfg(Some(allowed_cn))), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1319,6 +1394,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1349,6 +1425,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); @@ -1382,6 +1459,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = 
"127.0.0.1:0".to_owned(); @@ -1437,6 +1515,7 @@ mod tests { Arc::new(SecurityConfig::default()), MockRouter, temp_dir.path().to_path_buf(), + None, ) .unwrap(); let addr = "127.0.0.1:0".to_owned(); diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index 929a7c286ae..1e3963ffdb7 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -45,6 +45,7 @@ fn test_region_meta_endpoint() { Arc::new(SecurityConfig::default()), router, std::env::temp_dir(), + None, ) .unwrap(); let addr = format!("127.0.0.1:{}", test_util::alloc_port()); From 2301dac437347b9e81823894c3ea2bbc96f2b0a9 Mon Sep 17 00:00:00 2001 From: Liu Cong Date: Wed, 8 Feb 2023 22:09:59 +0800 Subject: [PATCH 511/676] raftstore: new slow store detecting (#14000) ref tikv/tikv#14131 PD schedulers: new scheduler `evict-slow-trend-scheduler`, for new slow store detecting and leader evicting Signed-off-by: Liu Cong Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/config.rs | 9 + components/raftstore/src/store/metrics.rs | 70 ++ components/raftstore/src/store/worker/pd.rs | 97 ++- components/tikv_util/src/lib.rs | 1 + components/tikv_util/src/store/query_stats.rs | 8 + components/tikv_util/src/trend.rs | 734 ++++++++++++++++++ tests/integrations/config/mod.rs | 2 + 7 files changed, 920 insertions(+), 1 deletion(-) create mode 100644 components/tikv_util/src/trend.rs diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 6667a46c4e5..342ace1139e 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -291,6 +291,11 @@ pub struct Config { // Interval to inspect the latency of raftstore for slow store detection. 
pub inspect_interval: ReadableDuration, + // The unsensitive(increase it to reduce sensitiveness) of the cause-trend detection + pub slow_trend_unsensitive_cause: f64, + // The unsensitive(increase it to reduce sensitiveness) of the result-trend detection + pub slow_trend_unsensitive_result: f64, + // Interval to report min resolved ts, if it is zero, it means disabled. pub report_min_resolved_ts_interval: ReadableDuration, @@ -430,6 +435,10 @@ impl Default for Config { region_split_size: ReadableSize(0), clean_stale_peer_delay: ReadableDuration::minutes(0), inspect_interval: ReadableDuration::millis(500), + // The param `slow_trend_unsensitive_cause == 2.0` can yield good results, + // make it `10.0` to reduce a bit sensitiveness because SpikeFilter is disabled + slow_trend_unsensitive_cause: 10.0, + slow_trend_unsensitive_result: 0.5, report_min_resolved_ts_interval: ReadableDuration::secs(1), check_leader_lease_interval: ReadableDuration::secs(0), renew_leader_lease_advance_duration: ReadableDuration::secs(0), diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 6c6357d286c..7df8819c998 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -780,6 +780,76 @@ lazy_static! 
{ pub static ref STORE_SLOW_SCORE_GAUGE: Gauge = register_gauge!("tikv_raftstore_slow_score", "Slow score of the store.").unwrap(); + pub static ref STORE_SLOW_TREND_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend", "Slow trend changing rate").unwrap(); + + pub static ref STORE_SLOW_TREND_L0_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l0", "Slow trend L0 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1", "Slow trend L1 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l2", "Slow trend L2 window avg value.").unwrap(); + + pub static ref STORE_SLOW_TREND_L0_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l0_l1", "Slow trend changing rate: L0/L1.").unwrap(); + pub static ref STORE_SLOW_TREND_L1_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1_l2", "Slow trend changing rate: L1/L2.").unwrap(); + + pub static ref STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l1_margin_error", "Slow trend: L1 margin error range").unwrap(); + pub static ref STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_l2_margin_error", "Slow trend: L2 margin error range").unwrap(); + + pub static ref STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_margin_error_gap", + "Slow trend: the gap between margin window time and current sampling time", + &["window"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_MISC_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_misc", + "Slow trend uncatelogued gauge(s)", + &["type"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_VALUE_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_value", "Store slow trend result meantime 
value").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result", "Store slow trend result changing rate").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L0_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l0", "Slow trend result L0 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1", "Slow trend result L1 window avg value.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l2", "Slow trend result L2 window avg value.").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L0_L1_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l0_l1", "Slow trend result changing rate: L0/L1.").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L1_L2_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1_l2", "Slow trend result changing rate: L1/L2.").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_L1_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l1_margin_error", "Slow trend result: L1 margin error range").unwrap(); + pub static ref STORE_SLOW_TREND_RESULT_L2_MARGIN_ERROR_GAUGE: Gauge = + register_gauge!("tikv_raftstore_slow_trend_result_l2_margin_error", "Slow trend result: L2 margin error range").unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_result_margin_error_gap", + "Slow trend result: the gap between margin window time and current sampling time", + &["window"] + ).unwrap(); + + pub static ref STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC: IntGaugeVec = + register_int_gauge_vec!( + "tikv_raftstore_slow_trend_result_misc", + "Slow trend result uncatelogued gauge(s)", + &["type"] + ).unwrap(); + pub static ref RAFT_LOG_GC_SKIPPED_VEC: IntCounterVec = 
register_int_counter_vec!( "tikv_raftstore_raft_log_gc_skipped", "Total number of skipped raft log gc.", diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 18ecc77f599..f43e1ec33d5 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -45,6 +45,7 @@ use tikv_util::{ time::{Instant as TiInstant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, topn::TopN, + trend::{RequestPerSecRecorder, Trend}, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, }; @@ -921,6 +922,9 @@ where snap_mgr: SnapManager, remote: Remote, slow_score: SlowScore, + slow_trend_cause: Trend, + slow_trend_result: Trend, + slow_trend_result_recorder: RequestPerSecRecorder, // The health status of the store is updated by the slow score mechanism. health_service: Option, @@ -984,6 +988,39 @@ where snap_mgr, remote, slow_score: SlowScore::new(cfg.inspect_interval.0), + slow_trend_cause: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(180), + Duration::from_secs(30), + Duration::from_secs(120), + Duration::from_secs(600), + 1, + tikv_util::time::duration_to_us(Duration::from_micros(500)), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L1"]), + STORE_SLOW_TREND_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC.with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_cause, + ), + slow_trend_result: Trend::new( + // Disable SpikeFilter for now + Duration::from_secs(0), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_value"]), + STORE_SLOW_TREND_RESULT_MISC_GAUGE_VEC.with_label_values(&["spike_filter_count"]), + Duration::from_secs(120), + Duration::from_secs(15), + Duration::from_secs(60), + Duration::from_secs(300), + 1, + 2000, + 
STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L1"]), + STORE_SLOW_TREND_RESULT_MARGIN_ERROR_WINDOW_GAP_GAUGE_VEC + .with_label_values(&["L2"]), + cfg.slow_trend_unsensitive_result, + ), + slow_trend_result_recorder: RequestPerSecRecorder::new(), health_service, curr_health_status: ServingStatus::Serving, coprocessor_host, @@ -1254,6 +1291,9 @@ where .store_stat .engine_total_query_num .sub_query_stats(&self.store_stat.engine_last_query_num); + let total_query_num = self + .slow_trend_result_recorder + .record_and_get_current_rps(res.get_all_query_num(), Instant::now()); stats.set_query_stats(res.0); stats.set_cpu_usages(self.store_stat.store_cpu_usages.clone().into()); @@ -1293,6 +1333,7 @@ where let slow_score = self.slow_score.get(); stats.set_slow_score(slow_score as u64); + self.set_slow_trend_to_store_stats(&mut stats, total_query_num); let router = self.router.clone(); let resp = self @@ -1379,6 +1420,51 @@ where self.remote.spawn(f); } + fn set_slow_trend_to_store_stats( + &mut self, + stats: &mut pdpb::StoreStats, + total_query_num: Option, + ) { + let slow_trend_cause_rate = self.slow_trend_cause.increasing_rate(); + STORE_SLOW_TREND_GAUGE.set(slow_trend_cause_rate); + let mut slow_trend = pdpb::SlowTrend::default(); + slow_trend.set_cause_rate(slow_trend_cause_rate); + slow_trend.set_cause_value(self.slow_trend_cause.l0_avg()); + if let Some(total_query_num) = total_query_num { + self.slow_trend_result + .record(total_query_num as u64, Instant::now()); + slow_trend.set_result_value(self.slow_trend_result.l0_avg()); + let slow_trend_result_rate = self.slow_trend_result.increasing_rate(); + slow_trend.set_result_rate(slow_trend_result_rate); + STORE_SLOW_TREND_RESULT_GAUGE.set(slow_trend_result_rate); + STORE_SLOW_TREND_RESULT_VALUE_GAUGE.set(total_query_num); + } else { + // Just to mark the invalid range on the graphic + STORE_SLOW_TREND_RESULT_VALUE_GAUGE.set(-100.0); + } + stats.set_slow_trend(slow_trend); + 
self.write_slow_trend_metrics(); + } + + fn write_slow_trend_metrics(&mut self) { + STORE_SLOW_TREND_L0_GAUGE.set(self.slow_trend_cause.l0_avg()); + STORE_SLOW_TREND_L1_GAUGE.set(self.slow_trend_cause.l1_avg()); + STORE_SLOW_TREND_L2_GAUGE.set(self.slow_trend_cause.l2_avg()); + STORE_SLOW_TREND_L0_L1_GAUGE.set(self.slow_trend_cause.l0_l1_rate()); + STORE_SLOW_TREND_L1_L2_GAUGE.set(self.slow_trend_cause.l1_l2_rate()); + STORE_SLOW_TREND_L1_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l1_margin_error_base()); + STORE_SLOW_TREND_L2_MARGIN_ERROR_GAUGE.set(self.slow_trend_cause.l2_margin_error_base()); + STORE_SLOW_TREND_RESULT_L0_GAUGE.set(self.slow_trend_result.l0_avg()); + STORE_SLOW_TREND_RESULT_L1_GAUGE.set(self.slow_trend_result.l1_avg()); + STORE_SLOW_TREND_RESULT_L2_GAUGE.set(self.slow_trend_result.l2_avg()); + STORE_SLOW_TREND_RESULT_L0_L1_GAUGE.set(self.slow_trend_result.l0_l1_rate()); + STORE_SLOW_TREND_RESULT_L1_L2_GAUGE.set(self.slow_trend_result.l1_l2_rate()); + STORE_SLOW_TREND_RESULT_L1_MARGIN_ERROR_GAUGE + .set(self.slow_trend_result.l1_margin_error_base()); + STORE_SLOW_TREND_RESULT_L2_MARGIN_ERROR_GAUGE + .set(self.slow_trend_result.l2_margin_error_base()); + } + fn handle_report_batch_split(&self, regions: Vec) { let resp = self.pd_client.report_batch_split(regions); let f = async move { @@ -2097,7 +2183,13 @@ where txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), - Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), + Task::UpdateSlowScore { id, duration } => { + self.slow_score.record(id, duration.sum()); + self.slow_trend_cause.record( + tikv_util::time::duration_to_us(duration.store_wait_duration.unwrap()), + Instant::now(), + ); + } Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { store_id, @@ -2121,6 +2213,9 @@ where T: PdClient + 'static, 
{ fn on_timeout(&mut self) { + // Record a fairly great value when timeout + self.slow_trend_cause.record(500_000, Instant::now()); + // The health status is recovered to serving as long as any tick // does not timeout. if self.curr_health_status == ServingStatus::ServiceUnknown diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 9b13250fe1e..fd294a08d34 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -61,6 +61,7 @@ pub mod thread_group; pub mod time; pub mod timer; pub mod topn; +pub mod trend; pub mod worker; pub mod yatp_pool; diff --git a/components/tikv_util/src/store/query_stats.rs b/components/tikv_util/src/store/query_stats.rs index 1c352cfc303..6cf461411aa 100644 --- a/components/tikv_util/src/store/query_stats.rs +++ b/components/tikv_util/src/store/query_stats.rs @@ -94,6 +94,14 @@ impl QueryStats { mem::swap(&mut self.0, &mut query_stats); query_stats } + + pub fn get_all_query_num(&self) -> u64 { + let mut sum: u64 = 0; + for kind in QUERY_KINDS { + sum += QueryStats::get_query_num(&self.0, *kind); + } + sum + } } pub fn is_read_query(kind: QueryKind) -> bool { diff --git a/components/tikv_util/src/trend.rs b/components/tikv_util/src/trend.rs new file mode 100644 index 00000000000..8ae3bb3d5aa --- /dev/null +++ b/components/tikv_util/src/trend.rs @@ -0,0 +1,734 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + collections::vec_deque::VecDeque, + time::{Duration, Instant}, +}; + +use prometheus::IntGauge; + +pub struct SampleValue { + value: u64, + time: Instant, +} + +pub struct SampleWindow { + sum: u64, + values: VecDeque, + duration: Duration, + overflow: bool, +} + +impl SampleWindow { + pub fn new(duration: Duration) -> Self { + Self { + sum: 0, + values: VecDeque::new(), + duration, + overflow: false, + } + } + + #[inline] + pub fn record(&mut self, value: u64, now: Instant) { + self.values.push_back(SampleValue { value, time: now }); + self.sum = self.sum.saturating_add(value); + while !self.values.is_empty() + && now.duration_since(self.values.front().unwrap().time) > self.duration + { + let front = self.values.pop_front().unwrap(); + self.sum = self.sum.saturating_sub(front.value); + self.overflow = true; + } + } + + #[inline] + pub fn is_overflow(&self) -> bool { + self.overflow + } + + #[inline] + pub fn drain(&mut self) -> (VecDeque, u64, bool) { + let result = ( + self.values.drain(..).collect::>(), + self.sum, + self.overflow, + ); + self.sum = 0; + self.overflow = false; + result + } + + #[inline] + // TODO: better memory operating? 
+ pub fn move_from(&mut self, source: &mut Self) { + (self.values, self.sum, self.overflow) = source.drain(); + } + + #[inline] + pub fn valid(&self) -> bool { + !self.values.is_empty() + } + + #[inline] + pub fn avg(&self) -> f64 { + if !self.values.is_empty() { + self.sum as f64 / self.values.len() as f64 + } else { + 0.0 + } + } + + #[inline] + pub fn std_ev(&self) -> f64 { + if self.values.len() <= 1 { + return 0.0; + } + let avg = self.avg(); + let mut delta_sq_sum = 0.0; + for v in self.values.iter() { + let delta = (v.value as f64) - avg; + delta_sq_sum += delta * delta; + } + // We use `self.values.len()` rather than `self.values.len() - 1` + f64::sqrt(delta_sq_sum / self.values.len() as f64) + } + + #[inline] + pub fn std_ev_ratio(&self) -> f64 { + if self.values.len() <= 1 { + 0.0 + } else { + self.std_ev() / self.avg() + } + } +} + +pub struct SampleWindows { + pub windows: Vec, +} + +impl SampleWindows { + pub fn new(windows_durations: Vec) -> Self { + let mut windows = vec![]; + for duration in windows_durations.iter() { + windows.push(SampleWindow::new(*duration)); + } + Self { windows } + } + + #[inline] + pub fn record(&mut self, value: u64, now: Instant) { + self.windows + .iter_mut() + .for_each(|window| window.record(value, now)) + } + + #[inline] + pub fn valid(&self) -> bool { + for window in self.windows.iter() { + if !window.valid() { + return false; + } + } + true + } +} + +// TODO: Generalize this module using SPOT(https://dl.acm.org/doi/10.1145/3097983.3098144) +// +// Without SPOT: +// - Margin errors calculating is based on sampling +// - `flip_margin_error_multiple` controls when to flip, hence control what to +// sample +// - `flip_margin_error_multiple` is a fixed value, can't fit all cases +// +// With SPOT: +// - `enter_threshold_multiple` will be insteaded of by `risk` +// - `risk` also a fixed value, but it's based on distribution, so it could +// fits all +struct HistoryWindow { + name: &'static str, + window_duration: Duration, 
+ sample_interval_duration: Duration, + current_window: SampleWindow, + previous_window: SampleWindow, + last_sampled_time: Instant, + last_flipped_time: Instant, + flipping_start_time: Option, + margin_error_base: f64, + flip_margin_error_multiple: f64, + gap_gauge: IntGauge, +} + +impl HistoryWindow { + pub fn new( + name: &'static str, + window_duration: Duration, + sample_interval_duration: Duration, + margin_error_base: f64, + gap_gauge: IntGauge, + flip_margin_error_multiple: f64, + ) -> Self { + let now = Instant::now(); + Self { + name, + window_duration, + sample_interval_duration, + current_window: SampleWindow::new(window_duration), + previous_window: SampleWindow::new(window_duration), + last_sampled_time: now, + last_flipped_time: now, + flipping_start_time: None, + margin_error_base, + gap_gauge, + flip_margin_error_multiple, + } + } + + #[inline] + pub fn record(&mut self, value: f64, now: Instant, increasing_rate: f64) { + let gap_secs = if self.current_window.is_overflow() { + now.saturating_duration_since(self.current_window.values.front().unwrap().time) + .as_secs() as i64 + } else if self.previous_window.is_overflow() { + now.saturating_duration_since(self.previous_window.values.front().unwrap().time) + .as_secs() as i64 + } else { + // Just to mark the invalid range on the graphic + -100 + }; + self.gap_gauge.set(gap_secs); + + if now.duration_since(self.last_sampled_time) <= self.sample_interval_duration { + return; + } + let should_skip = self.try_flip(value, now, increasing_rate); + if should_skip { + return; + } + self.current_window.record(value as u64, now); + self.last_sampled_time = now; + } + + #[inline] + pub fn valid(&self) -> bool { + self.current_window.is_overflow() || self.previous_window.is_overflow() + } + + #[inline] + pub fn margin_error(&self) -> f64 { + let margin_error = if self.flipping_start_time.is_none() { + if self.current_window.is_overflow() { + self.current_window.std_ev() + } else if 
self.previous_window.is_overflow() { + // We use the previous margin error in the duration: + // - After flipping ends + // - Yet before current window is overflow + self.previous_window.std_ev() + } else { + 0.0 + } + } else if self.previous_window.is_overflow() { + self.previous_window.std_ev() + } else { + 0.0 + }; + f64::max(margin_error, self.margin_error_base) + } + + #[inline] + // Return bool: shoud_skip_current_value + fn try_flip(&mut self, value: f64, now: Instant, increasing_rate: f64) -> bool { + if !self.current_window.is_overflow() { + return false; + } + let current_avg = self.current_window.avg(); + let margin_error = self.margin_error(); + + // The output margin_error multiple can up to `self.flip_margin_error_multiple + + // 1` without flipping (increasing_rate already minus a margin_error) + let flip_margin_error = margin_error * self.flip_margin_error_multiple; + let delta = f64::abs(value - current_avg); + + // Strict condition for exiting flipping (to do actual flipping) + if self.flipping_start_time.is_some() { + // Make sure not stuck at flipping phase by using `time_based_multiple`, + // increase by time + // - Expectation of time_based_multiple: starts at 0.0, to `margin_error * 5%` + // at 4min, to 10% at 12min, to 20% at 28min + // - f64::abs() is for preventing crash in case the server time is adjusted + let flipping_duration = now.duration_since(self.flipping_start_time.unwrap()); + let time_based_multiple = + (f64::abs(flipping_duration.as_secs() as f64) / 240.0 + 1.0).log2() / 20.0; + if f64::abs(increasing_rate) > margin_error * time_based_multiple { + // Keep flipping, skip the huge-changing phase, wait for stable + return true; + } else { + // The huge-changing phase ends, do flipping + self.flip(); + self.flipping_start_time = None; + self.last_flipped_time = now; + info!( + "history window flipping: end"; + "name" => self.name, + "delta" => delta, + "flip_margin_error" => margin_error, + "time_based_multiple" => 
time_based_multiple, + "increasing_rate" => increasing_rate, + "flipping_duration" => flipping_duration.as_secs(), + ); + return false; + } + } + + // Loose condition for entering flipping + if now.duration_since(self.last_flipped_time) > self.window_duration + && delta > flip_margin_error + { + // Enter flipping phase, may last for a while + self.flipping_start_time = Some(Instant::now()); + info!( + "history window flipping: enter"; + "name" => self.name, + "delta" => delta, + "flip_margin_error" => flip_margin_error, + "increasing_rate" => increasing_rate, + ); + } + false + } + + #[inline] + fn flip(&mut self) { + self.previous_window.move_from(&mut self.current_window); + } +} + +// TODO: Generalize this filter using SPOT(https://dl.acm.org/doi/10.1145/3097983.3098144) +// - `enter_threshold_multiple` is a fixed value, can't fit all cases +// - Using SPOT, `enter_threshold_multiple` will be insteaded of by `risk` +// - `risk` also a fixed value, but it's based on distribution, so it could +// fits all +pub struct SpikeFilter { + values: VecDeque, + duration: Duration, + filter_value_gauge: IntGauge, + filter_count_gauge: IntGauge, + exit_threshold_avg_multiple: f64, + exit_threshold_margin_error_multiple: f64, + enter_threshold_multiple: f64, +} + +impl SpikeFilter { + pub fn new( + duration: Duration, + filter_value_gauge: IntGauge, + filter_count_gauge: IntGauge, + exit_threshold_avg_multiple: f64, + exit_threshold_margin_error_multiple: f64, + enter_threshold_multiple: f64, + ) -> Self { + assert!(enter_threshold_multiple > 1.0); + Self { + values: VecDeque::new(), + duration, + filter_value_gauge, + filter_count_gauge, + exit_threshold_avg_multiple, + exit_threshold_margin_error_multiple, + enter_threshold_multiple, + } + } + + #[inline] + // TODO: better memory operating? 
+ pub fn record( + &mut self, + value: u64, + now: Instant, + history_avg: f64, + history_margin_error: f64, + ) -> Option> { + let exit_threshold = history_avg * self.exit_threshold_avg_multiple + + history_margin_error * self.exit_threshold_margin_error_multiple; + let enter_threshold = exit_threshold * self.enter_threshold_multiple; + let curr = SampleValue { value, time: now }; + + // Spike entering check + if (value as f64) > enter_threshold { + // Hold the very high values in the checking sequence + self.values.push_back(curr); + if now.duration_since(self.values.front().unwrap().time) > self.duration { + // The checking sequence is too long to be a spike, dump all and exit checking + let values: Vec = self.values.drain(..).collect(); + return Some(values); + } + // The curr value is on hold, return None + return None; + } + + // Not in a spike, nothing happen + if self.values.is_empty() { + return Some(vec![curr]); + } + + // In a spike + + // Spike ending check + if (value as f64) < exit_threshold { + if self.values.len() <= 2 { + // The checking sequence is too short to be a spike, dump all and exit checking + let mut values: Vec = self.values.drain(..).collect(); + values.push(curr); + return Some(values); + } + // The checking sequence is not long enough to be regular high, it's a spike, + // discard all but return curr + self.filter_value_gauge.set(self.avg() as i64); + self.filter_count_gauge.inc(); + self.values.drain(..); + return Some(vec![curr]); + } + + // Hold curr value to this spike + self.values.push_back(curr); + None + } + + #[inline] + fn avg(&self) -> f64 { + if self.values.is_empty() { + return 0.0; + } + let mut sum: f64 = 0.0; + for value in self.values.iter() { + sum += value.value as f64; + } + sum / (self.values.len() as f64) + } +} + +// Responsibilities of each window: +// +// L0: +// Eleminate very short time jitter, +// Consider its avg value as a point in data flow +// L1: +// `L0.avg/L1.avg` to trigger slow-event, not last long 
but high sensitive +// Sensitive could be tuned by `L0.duration` and `L1.duration` +// Include periodic fluctuations, so it's avg could be seen as baseline +// value Its duration is also the no-detectable duration after TiKV starting +// L2: +// `L1.avg/L2.avg` to trigger slow-event, last long but low sensitive +// Sensitive could be tuned by `L1.duration` and `L2.duration` +// +// L* History: +// Sample history values and calculate the margin error +// +// Spike Filter: +// Erase very high and short time spike-values +// +pub struct Trend { + sample_interval: usize, + sample_sequence_id: usize, + + spike_filter: SpikeFilter, + spike_filter_enabled: bool, + + data_flow: SampleWindows, + + l1_history: HistoryWindow, + l2_history: HistoryWindow, + + // When SPOT is being used, these should be `risk multiple` + l1_margin_error_multiple: f64, + l2_margin_error_multiple: f64, + + curves_composer: CurvesComposer, +} + +impl Trend { + pub fn new( + spike_filter_duration: Duration, + spike_filter_value_gauge: IntGauge, + spike_filter_count_gauge: IntGauge, + history_duration: Duration, + l0_duration: Duration, + l1_duration: Duration, + l2_duration: Duration, + sample_interval: usize, + tolerable_margin_error_value: u64, + l1_gap_gauge: IntGauge, + l2_gap_gauge: IntGauge, + unsensitive_multiple: f64, + ) -> Self { + let margin_error_base = tolerable_margin_error_value as f64; + Self { + sample_interval, + sample_sequence_id: 0, + data_flow: SampleWindows::new(vec![l0_duration, l1_duration, l2_duration]), + spike_filter_enabled: !spike_filter_duration.is_zero(), + spike_filter: SpikeFilter::new( + spike_filter_duration, + spike_filter_value_gauge, + spike_filter_count_gauge, + 1.0, + 5.0, + 2.0, + ), + l1_history: HistoryWindow::new( + "L1", + history_duration, + Duration::from_secs(1), + margin_error_base, + l1_gap_gauge, + 3.0, + ), + l2_history: HistoryWindow::new( + "L2", + history_duration, + Duration::from_secs(1), + margin_error_base, + l2_gap_gauge, + 2.0, + ), + 
l1_margin_error_multiple: 3.0 * unsensitive_multiple, + l2_margin_error_multiple: 2.0 * unsensitive_multiple, + curves_composer: CurvesComposer::new(l0_duration, l1_duration, l2_duration, 2.0), + } + } + + #[inline] + pub fn record(&mut self, value: u64, now: Instant) { + if !self.check_should_sample() { + return; + } + if !self.spike_filter_enabled || !self.data_flow.windows[1].is_overflow() { + self.record_unfiltered(value, now); + return; + } + if let Some(filtered) = + self.spike_filter + .record(value, now, self.l1_avg(), self.l1_margin_error()) + { + for sample in filtered.iter() { + self.record_unfiltered(sample.value, sample.time) + } + } + } + + #[inline] + pub fn increasing_rate(&self) -> f64 { + self.curves_composer + .compose(self.l0_l1_rate(), self.l1_l2_rate()) + } + + #[inline] + pub fn l0_avg(&self) -> f64 { + self.data_flow.windows[0].avg() + } + + #[inline] + pub fn l1_avg(&self) -> f64 { + self.data_flow.windows[1].avg() + } + + #[inline] + pub fn l2_avg(&self) -> f64 { + self.data_flow.windows[2].avg() + } + + #[inline] + pub fn l1_margin_error_base(&self) -> f64 { + self.l1_history.margin_error() + } + + #[inline] + pub fn l2_margin_error_base(&self) -> f64 { + self.l2_history.margin_error() + } + + #[inline] + pub fn l0_l1_rate(&self) -> f64 { + if !self.data_flow.windows[2].is_overflow() { + return 0.0; + } + if !self.l1_history.valid() { + return 0.0; + } + let l1_avg = self.l1_avg(); + Trend::la_lb_rate(self.l0_avg(), l1_avg, self.l1_margin_error()) + } + + #[inline] + pub fn l1_l2_rate(&self) -> f64 { + if !self.data_flow.windows[2].is_overflow() { + return 0.0; + } + if !self.l2_history.valid() { + return 0.0; + } + Trend::la_lb_rate(self.l1_avg(), self.l2_avg(), self.l2_margin_error()) + } + + #[inline] + fn check_should_sample(&mut self) -> bool { + if self.sample_interval <= 1 { + return true; + } + let should = self.sample_sequence_id % self.sample_interval == 0; + self.sample_sequence_id += 1; + should + } + + #[inline] + fn 
record_unfiltered(&mut self, value: u64, now: Instant) { + self.data_flow.record(value, now); + // TODO: Reduce the `increasing_rate()` calculating count? + let increasing_rate = self.increasing_rate(); + self.l1_history.record(self.l0_avg(), now, increasing_rate); + self.l2_history.record(self.l1_avg(), now, increasing_rate); + } + + #[inline] + fn l1_margin_error(&self) -> f64 { + self.l1_history.margin_error() * self.l1_margin_error_multiple + } + + #[inline] + fn l2_margin_error(&self) -> f64 { + self.l2_history.margin_error() * self.l2_margin_error_multiple + } + + #[inline] + fn la_lb_rate(la_avg: f64, lb_avg: f64, margin_error: f64) -> f64 { + if lb_avg < f64::EPSILON { + return 0.0; + } + let mut increased = la_avg - lb_avg; + if f64::abs(increased) < f64::EPSILON { + return 0.0; + } + increased = if la_avg < lb_avg { + if -increased > margin_error { + -increased - margin_error + } else { + 0.0 + } + } else if increased > margin_error { + increased - margin_error + } else { + 0.0 + }; + let mut inc_sq = increased * increased; + if la_avg < lb_avg { + inc_sq = -inc_sq; + }; + let res = la_avg * inc_sq / f64::sqrt(lb_avg); + if la_avg >= lb_avg { + f64::sqrt(res) + } else { + -f64::sqrt(-res) + } + } +} + +struct CurvesComposer { + l0_l1_vs_l1_l2: f64, +} + +impl CurvesComposer { + pub fn new( + l0_duration: Duration, + l1_duration: Duration, + l2_duration: Duration, + l1_l2_extra_weight: f64, + ) -> Self { + let l0_l1 = l0_duration.as_nanos() as f64 / l1_duration.as_nanos() as f64; + let l1_l2 = l1_duration.as_nanos() as f64 / l2_duration.as_nanos() as f64; + Self { + l0_l1_vs_l1_l2: l1_l2_extra_weight * l0_l1 / l1_l2, + } + } + + #[inline] + pub fn compose(&self, l0_l1_rate: f64, l1_l2_rate: f64) -> f64 { + l0_l1_rate + l1_l2_rate * self.l0_l1_vs_l1_l2 + } +} + +pub struct RequestPerSecRecorder { + previous_ts: Instant, + inited: bool, +} + +impl Default for RequestPerSecRecorder { + fn default() -> Self { + Self::new() + } +} + +impl RequestPerSecRecorder 
{ + pub fn new() -> Self { + Self { + previous_ts: Instant::now(), + inited: false, + } + } + + #[inline] + pub fn record_and_get_current_rps( + &mut self, + observed_request_count: u64, + now: Instant, + ) -> Option { + if !self.inited { + self.inited = true; + self.previous_ts = now; + None + } else { + self.inited = true; + let secs = now.saturating_duration_since(self.previous_ts).as_secs(); + self.previous_ts = now; + if secs == 0 { + None + } else { + Some(observed_request_count as f64 / secs as f64) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::{Duration, Instant}; + + use super::*; + + #[test] + fn test_sample_window() { + let now = Instant::now(); + let mut window = SampleWindow::new(Duration::from_secs(4)); + assert_eq!(window.valid(), false); + assert_eq!(window.avg(), 0.0); + assert_eq!(window.std_ev_ratio(), 0.0); + window.record(10, now); + assert_eq!(window.valid(), true); + assert_eq!(window.avg(), 10.0); + assert_eq!(window.overflow, false); + assert_eq!(window.std_ev_ratio(), 0.0); + window.record(20, now + Duration::from_secs(1)); + assert_eq!(window.avg(), (10.0 + 20.0) / 2.0); + assert_eq!(window.overflow, false); + assert_eq!(window.std_ev_ratio(), 5.0 / 15.0); + window.record(30, now + Duration::from_secs(2)); + assert_eq!(window.avg(), (10.0 + 20.0 + 30.0) / 3.0); + assert_eq!(window.overflow, false); + assert_eq!(window.std_ev_ratio(), f64::sqrt(200.0 / 3.0) / 20.0); + window.record(40, now + Duration::from_secs(5)); + assert_eq!(window.avg(), (20.0 + 30.0 + 40.0) / 3.0); + assert_eq!(window.overflow, true); + assert_eq!(window.std_ev_ratio(), f64::sqrt(200.0 / 3.0) / 30.0); + } +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 61ec0d1f3f4..351e9d74ca0 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -256,6 +256,8 @@ fn test_serde_custom_tikv_config() { unreachable_backoff: ReadableDuration::secs(111), check_peers_availability_interval: 
ReadableDuration::secs(30), check_request_snapshot_interval: ReadableDuration::minutes(1), + slow_trend_unsensitive_cause: 10.0, + slow_trend_unsensitive_result: 0.5, }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { From 6b6f9154b6507ad7036bc2df41d58864c9dece43 Mon Sep 17 00:00:00 2001 From: ekexium Date: Thu, 9 Feb 2023 12:12:00 +0800 Subject: [PATCH 512/676] metrics: add panels showing pessimistic lock queue lengths (#14158) ref tikv/tikv#14157 Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 184 ++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 357edac04a7..334c3c119f7 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -36456,6 +36456,190 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "dashLength": 10, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "editable": true, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 75 + }, + "id": 23763573091, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "pluginVersion": "7.5.11", + "pointradius": 5, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "targets": [ + { + "expr": "sum(tikv_lock_wait_queue_entries_gauge_vec{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", + "legendFormat": "{{type}}", + "interval": "", + "exemplar": true, + "format": "time_series", + 
"intervalFactor": 2, + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Pessimistic lock activities", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + }, + "bars": false, + "dashes": false, + "error": false, + "fillGradient": 0, + "hiddenSeries": false, + "percentage": false, + "points": false, + "stack": false, + "steppedLine": false, + "timeFrom": null, + "timeShift": null, + "description": "The number of active keys and waiters." + }, + { + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 83 + }, + "id": 23763573092, + "legend": { + "show": false + }, + "links": [], + "pluginVersion": "7.5.11", + "targets": [ + { + "expr": "sum(increase(tikv_lock_wait_queue_length_bucket{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "legendFormat": "{{le}}", + "interval": "", + "exemplar": true, + "format": "heatmap", + "intervalFactor": 2, + "refId": "A", + "step": 4 + } + ], + "title": "Lengths of lock wait queues when transaction enqueues", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "timeFrom": null, + "timeShift": null, + "description": "The length includes the entering transaction itself", + "heatmap": {}, + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "mode": "spectrum", + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "exponent": 0.5, + "colorScheme": "interpolateOranges" + }, + 
"dataFormat": "tsbuckets", + "yBucketBound": "auto", + "reverseYBuckets": false, + "xAxis": { + "show": true + }, + "yAxis": { + "show": true, + "format": "short", + "decimals": null, + "logBase": 1, + "splitFactor": null, + "min": null, + "max": null + }, + "xBucketSize": null, + "xBucketNumber": null, + "yBucketSize": null, + "yBucketNumber": null, + "highlightCards": true, + "hideZeroBuckets": true } ], "title": "Pessimistic Locking", From fcc6829e41ea675b63290475ac1760664c905f62 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 9 Feb 2023 13:37:59 +0800 Subject: [PATCH 513/676] config: disallow encryption for v2 (#14190) ref tikv/tikv#12842 Signed-off-by: tabokie --- Cargo.lock | 1 + Cargo.toml | 6 ++++-- cmd/tikv-ctl/Cargo.toml | 4 ++-- components/backup-stream/Cargo.toml | 4 ++-- components/backup/Cargo.toml | 4 ++-- components/batch-system/Cargo.toml | 6 +++--- components/causal_ts/Cargo.toml | 4 ++-- components/cdc/Cargo.toml | 4 ++-- components/cloud/aws/Cargo.toml | 4 ++-- components/cloud/azure/Cargo.toml | 4 ++-- components/cloud/gcp/Cargo.toml | 4 ++-- components/encryption/Cargo.toml | 4 ++-- components/encryption/export/Cargo.toml | 4 ++-- components/engine_rocks/Cargo.toml | 4 ++-- components/engine_rocks_helper/Cargo.toml | 4 ++-- components/engine_tirocks/Cargo.toml | 4 ++-- components/engine_traits/Cargo.toml | 4 ++-- components/external_storage/Cargo.toml | 4 ++-- components/file_system/Cargo.toml | 4 ++-- components/pd_client/Cargo.toml | 4 ++-- components/raft_log_engine/Cargo.toml | 4 ++-- components/raftstore-v2/Cargo.toml | 2 +- components/raftstore/Cargo.toml | 4 ++-- components/resolved_ts/Cargo.toml | 4 ++-- components/resource_control/Cargo.toml | 6 +++--- components/resource_metering/Cargo.toml | 4 ++-- components/security/Cargo.toml | 1 + components/security/src/lib.rs | 16 +++++++++++----- components/server/Cargo.toml | 4 ++-- components/snap_recovery/Cargo.toml | 6 +++--- components/sst_importer/Cargo.toml | 4 ++-- 
components/test_pd/Cargo.toml | 4 ++-- components/test_pd_client/Cargo.toml | 4 ++-- components/test_raftstore/Cargo.toml | 4 ++-- components/test_util/Cargo.toml | 4 ++-- components/tidb_query_datatype/Cargo.toml | 4 ++-- components/tidb_query_executors/Cargo.toml | 4 ++-- components/tikv_kv/Cargo.toml | 4 ++-- components/tikv_util/Cargo.toml | 4 ++-- components/txn_types/Cargo.toml | 2 +- src/config/mod.rs | 3 ++- tests/Cargo.toml | 4 ++-- 42 files changed, 94 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7750f729778..74701b0561f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5031,6 +5031,7 @@ dependencies = [ "collections", "encryption", "grpcio", + "kvproto", "serde", "serde_derive", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index f7d44c94866..29337b4a002 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -149,8 +149,8 @@ serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" serde_ignored = "0.1" serde_json = { version = "1.0", features = ["preserve_order"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } smallvec = "1.4" sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } @@ -380,6 +380,8 @@ tipb = { git = "https://github.com/pingcap/tipb.git" } kvproto = { git = "https://github.com/pingcap/kvproto.git" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } [profile.dev.package.grpcio-sys] debug = false diff --git 
a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 1e0699f64cf..718d760e3d4 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -78,8 +78,8 @@ regex = "1" security = { workspace = true } serde_json = "1.0" server = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } structopt = "0.3" tempfile = "3.0" tikv = { workspace = true } diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 43bda42a088..f3f1b482be0 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -56,8 +56,8 @@ raftstore = { workspace = true } regex = "1" resolved_ts = { workspace = true } security = { path = "../security" } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1" tidb_query_datatype = { workspace = true } tikv = { workspace = true } diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index 27f7d68e8e3..4f12dd04c36 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -63,9 +63,9 @@ raftstore = { workspace = true } security = { workspace = true } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } thiserror = "1.0" 
tidb_query_common = { workspace = true } tikv = { workspace = true } diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 75a0230c188..af57bbef930 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -13,15 +13,15 @@ crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" online_config = { workspace = true } prometheus = { version = "0.13", default-features = false, features = ["nightly"] } resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index a5dd62cd5d2..71af0419a68 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -26,8 +26,8 @@ prometheus-static-metric = "0.5" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } test_pd_client = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 94d80bf1d9f..3dfbb402d2e 100644 --- a/components/cdc/Cargo.toml 
+++ b/components/cdc/Cargo.toml @@ -55,8 +55,8 @@ raftstore = { workspace = true } resolved_ts = { workspace = true } security = { workspace = true } semver = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv = { workspace = true } tikv_kv = { workspace = true } diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 5d28e09e8f4..24518515ea0 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -31,8 +31,8 @@ rusoto_credential = "0.46.0" rusoto_kms = { version = "0.46.0", features = ["serialize_structs"] } rusoto_s3 = { version = "0.46.0", features = ["serialize_structs"] } rusoto_sts = "0.46.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv_util = { workspace = true } # better to not use slog-global, but pass in the logger diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 57ea6c14aef..0a45ccc2c63 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -18,8 +18,8 @@ kvproto = { workspace = true } lazy_static = "1.4.0" oauth2 = { version = "4.0.0", default-features = false } regex = "1" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_util = { workspace = true } 
tokio = { version = "1.5", features = ["time"] } url = "2.0" diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index 5074a3c9da4..4c3b8994ffc 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -12,9 +12,9 @@ http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" kvproto = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tame-gcs = { version = "0.10", features = ["async-multipart"] } tame-oauth = "0.4.7" tikv_util = { workspace = true } diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 18b6cb7305c..94ab0d39957 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -30,9 +30,9 @@ protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index fc4fe59d3fb..164ea312e5d 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -21,9 +21,9 @@ file_system = { workspace = true } kvproto = { workspace = true } openssl = "0.10" protobuf = { version = "2.8", features = ["bytes"] } -slog = { version = "2.3", features = 
["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tikv_util = { workspace = true } [dev-dependencies] diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index a0e3e878c54..6775705e3e1 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -44,8 +44,8 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code regex = "1" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } slog_derive = "0.2" tempfile = "3.0" tikv_alloc = { workspace = true } diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index ec66aa474a9..b8847fa6ba8 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -18,8 +18,8 @@ pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = "2.8" raftstore = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_util = { workspace = true } [dev-dependencies] diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml index 07c2a7ec42c..b3cac78b502 100644 --- a/components/engine_tirocks/Cargo.toml +++ 
b/components/engine_tirocks/Cargo.toml @@ -14,8 +14,8 @@ lazy_static = "1.4.0" log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } slog_derive = "0.2" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index 2370f1c9e7e..00b3bb97b66 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -19,8 +19,8 @@ log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 839e34e3f22..4ff13e564ff 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -37,9 +37,9 @@ prometheus = { version = "0.13", default-features = false, features = ["nightly" protobuf = { optional = true, version = "2" } rand = "0.8" rusoto_core = "0.46.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", 
rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "fs", "process"] } diff --git a/components/file_system/Cargo.toml b/components/file_system/Cargo.toml index 033d31681c1..2252ebc3f1b 100644 --- a/components/file_system/Cargo.toml +++ b/components/file_system/Cargo.toml @@ -21,8 +21,8 @@ prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" rand = "0.8" serde = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } strum = { version = "0.20", features = ["derive"] } tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index f46d6111c5d..976ad90432a 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -24,8 +24,8 @@ security = { workspace = true } semver = "0.10" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 8a336177706..cbccea9dbe0 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -18,8 +18,8 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft-engine = { git = 
"https://github.com/tikv/raft-engine.git", features = ["swap"] } serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 5b917b9ddf7..c7e403afebe 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -65,7 +65,7 @@ yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] engine_test = { workspace = true } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tempfile = "3.0" test_pd = { workspace = true } test_util = { workspace = true } diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 8df501f279d..cbf943800ee 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -77,8 +77,8 @@ resource_metering = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_with = "1.4" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } smallvec = "1.4" sst_importer = { workspace = true } tempfile = "3.0" diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index 10a555678c3..db3c0643cb7 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -41,8 +41,8 @@ protobuf = { version = "2.8", features = 
["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } security = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv = { workspace = true } tikv_util = { workspace = true } diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 2e1a0990d49..6cb7d547e6c 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -15,7 +15,7 @@ crossbeam-skiplist = "0.1" dashmap = "5.1" fail = "0.5" futures = { version = "0.3" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.0" online_config = { workspace = true } pd_client = { workspace = true } @@ -23,8 +23,8 @@ pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } serde = { version = "1.0", features = ["derive"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } test_pd = { workspace = true } test_pd_client = { workspace = true } tikv_util = { workspace = true } diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index 20ed4ea2eda..f8e26e01c50 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -18,8 +18,8 @@ pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } serde = "1.0" serde_derive = "1.0" -slog = 
{ version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_util = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index a9cdd620d12..fdf7ab8e29e 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -8,6 +8,7 @@ publish = false collections = { workspace = true } encryption = { workspace = true } grpcio = { workspace = true } +kvproto = { workspace = true } serde = "1.0" serde_derive = "1.0" serde_json = "1.0" diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index 68328c01ebe..bbd296ae1f7 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -87,7 +87,7 @@ pub struct ClientSuite { impl SecurityConfig { /// Validates ca, cert and private key. - pub fn validate(&self) -> Result<(), Box> { + pub fn validate(&self, raftstore_v2: bool) -> Result<(), Box> { check_key_file("ca key", &self.ca_path)?; check_key_file("cert key", &self.cert_path)?; check_key_file("private key", &self.key_path)?; @@ -97,6 +97,12 @@ impl SecurityConfig { { return Err("ca, cert and private key should be all configured.".into()); } + if raftstore_v2 + && self.encryption.data_encryption_method + != kvproto::encryptionpb::EncryptionMethod::Plaintext + { + return Err("encryption is not supported for partitioned-raft-kv".into()); + } Ok(()) } @@ -298,7 +304,7 @@ mod tests { fn test_security() { let cfg = SecurityConfig::default(); // default is disable secure connection. 
- cfg.validate().unwrap(); + cfg.validate(false).unwrap(); let mgr = SecurityManager::new(&cfg).unwrap(); assert!(mgr.cfg.ca_path.is_empty()); assert!(mgr.cfg.cert_path.is_empty()); @@ -307,7 +313,7 @@ mod tests { let assert_cfg = |c: fn(&mut SecurityConfig), valid: bool| { let mut invalid_cfg = cfg.clone(); c(&mut invalid_cfg); - assert_eq!(invalid_cfg.validate().is_ok(), valid); + assert_eq!(invalid_cfg.validate(false).is_ok(), valid); }; // invalid path should be rejected. @@ -335,11 +341,11 @@ mod tests { c.cert_path = format!("{}", example_cert.display()); c.key_path = format!("{}", example_key.display()); // incomplete configuration. - c.validate().unwrap_err(); + c.validate(false).unwrap_err(); // data should be loaded from file after validating. c.ca_path = format!("{}", example_ca.display()); - c.validate().unwrap(); + c.validate(false).unwrap(); let (ca, cert, key) = c.load_certs().unwrap_or_default(); assert_eq!(ca, vec![0]); diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index d5e2f177b5e..554dbaa63f9 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -73,8 +73,8 @@ resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } snap_recovery = { workspace = true } tempfile = "3.0" tikv = { workspace = true } diff --git a/components/snap_recovery/Cargo.toml b/components/snap_recovery/Cargo.toml index 1b69d8ba150..4768759b852 100644 --- a/components/snap_recovery/Cargo.toml +++ b/components/snap_recovery/Cargo.toml @@ -14,14 +14,14 @@ engine_traits = { workspace = true } futures = { version = "0.3", features = ["executor"] } grpcio = { workspace = 
true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto" } +kvproto = { workspace = true } log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } pd_client = { workspace = true } protobuf = { version = "2.8", features = ["bytes"] } raft_log_engine = { workspace = true } raftstore = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } structopt = "0.3" tempfile = "3.0" thiserror = "1.0" diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index d0e2ff7eca8..a21a58c0a6c 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -34,8 +34,8 @@ prometheus = { version = "0.13", default-features = false } rand = "0.8" serde = "1.0" serde_derive = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index 6277789b194..7747ac1bbc6 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -13,8 +13,8 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } security = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace 
= true } +slog-global = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.0", features = ["full"] } tokio-stream = "0.1" diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml index 9f67752b4c5..3b002970236 100644 --- a/components/test_pd_client/Cargo.toml +++ b/components/test_pd_client/Cargo.toml @@ -14,8 +14,8 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-timer = { workspace = true } diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index 25a1224e261..1b87aeac11b 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -53,9 +53,9 @@ resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } server = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } # better to not use slog-global, but pass in the logger -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } tempfile = "3.0" test_pd_client = { workspace = true } test_util = { workspace = true } diff --git a/components/test_util/Cargo.toml b/components/test_util/Cargo.toml index 740132353f3..64dbb2456ce 100644 --- a/components/test_util/Cargo.toml +++ b/components/test_util/Cargo.toml @@ -20,8 
+20,8 @@ kvproto = { workspace = true } rand = "0.8" rand_isaac = "0.3" security = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tempfile = "3.0" tikv_util = { workspace = true } time = "0.1" diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index e670674cdc6..c1be29a956d 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -32,8 +32,8 @@ protobuf = "2" regex = "1.1" serde = "1.0" serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } static_assertions = { version = "1.0", features = ["nightly"] } thiserror = "1.0" tidb_query_common = { workspace = true } diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index 331634dbd04..3fb3fdca2bb 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -17,8 +17,8 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } match-template = "0.0.1" protobuf = { version = "2.8", features = ["bytes"] } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } smallvec = "1.4" tidb_query_aggr = { workspace = true } tidb_query_common = { workspace = true } diff --git 
a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 2911c7738c6..8197637243e 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -43,8 +43,8 @@ prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } slog_derive = "0.2" tempfile = "3.0" thiserror = "1.0" diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 1193751b228..b501322e152 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -47,9 +47,9 @@ rand = "0.8" rusoto_core = "0.46.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog = { workspace = true } slog-async = "2.3" -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog-global = { workspace = true } slog-json = "2.3" slog-term = "2.4" sysinfo = "0.26" diff --git a/components/txn_types/Cargo.toml b/components/txn_types/Cargo.toml index 0c357ef1dd6..987b7216d22 100644 --- a/components/txn_types/Cargo.toml +++ b/components/txn_types/Cargo.toml @@ -13,7 +13,7 @@ error_code = { workspace = true } farmhash = "1.1.5" kvproto = { workspace = true } log_wrappers = { workspace = true } -slog = "2.3" +slog = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } diff --git a/src/config/mod.rs b/src/config/mod.rs index 7247d426b21..5e923023ca0 100644 --- a/src/config/mod.rs 
+++ b/src/config/mod.rs @@ -3244,7 +3244,8 @@ impl TikvConfig { self.coprocessor.enable_region_bucket, self.coprocessor.region_bucket_size, )?; - self.security.validate()?; + self.security + .validate(self.storage.engine == EngineType::RaftKv2)?; self.import.validate()?; self.backup.validate()?; self.backup_stream.validate()?; diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 96ee19e9bae..6fb05f19cd1 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -96,8 +96,8 @@ raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8.3" resource_control = { workspace = true } -slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } -slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +slog = { workspace = true } +slog-global = { workspace = true } tempfile = "3.0" tidb_query_aggr = { workspace = true } tidb_query_common = { workspace = true } From 10c5813851cdcd399bde953248616f8717b19e60 Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 9 Feb 2023 14:03:59 +0800 Subject: [PATCH 514/676] resource_control: pass missing resource group name to request header (#14192) close tikv/tikv#14191 pass missing resource group name to request header Signed-off-by: Connor1996 Co-authored-by: Xinye Tao --- src/server/raftkv/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 751c07c6b65..9c4c59a4ae5 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -160,6 +160,7 @@ pub fn new_request_header(ctx: &Context) -> RaftRequestHeader { } header.set_sync_log(ctx.get_sync_log()); header.set_replica_read(ctx.get_replica_read()); + header.set_resource_group_name(ctx.get_resource_group_name().to_owned()); header } From 984d09a559e14d6a9a26e8162a2345e667d8f49d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 
9 Feb 2023 14:20:00 +0800 Subject: [PATCH 515/676] log-backup: allow observer hibernate when there isn't any task (#14018) close tikv/tikv#14012 Added a "hibernate mode" for the log backup observer: while there isn't any task, it won't emit leader drop or region change events. So some verbose logs can be omitted. Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/backup-stream/src/observer.rs | 32 +++++++++++++++++++-- components/backup-stream/src/utils.rs | 4 +++ components/sst_importer/src/sst_importer.rs | 8 ++++-- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 36c310d3532..1a0a0f7cc9e 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -96,6 +96,13 @@ impl BackupStreamObserver { .rl() .is_overlapping((region.get_start_key(), end_key)) } + + /// Check whether there are any task range registered to the observer. + /// when there isn't any task, we can ignore the events, so we don't need to + /// handle useless events. (Also won't yield verbose logs.) 
+ fn is_hibernating(&self) -> bool { + self.ranges.rl().is_empty() + } } impl Coprocessor for BackupStreamObserver {} @@ -149,7 +156,7 @@ impl CmdObserver for BackupStreamObserver { impl RoleObserver for BackupStreamObserver { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, r: &RoleChange) { - if r.state != StateRole::Leader { + if r.state != StateRole::Leader && !self.is_hibernating() { try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Stop { @@ -167,7 +174,7 @@ impl RegionChangeObserver for BackupStreamObserver { event: RegionChangeEvent, role: StateRole, ) { - if role != StateRole::Leader { + if role != StateRole::Leader || self.is_hibernating() { return; } match event { @@ -207,7 +214,7 @@ mod tests { use raft::StateRole; use raftstore::coprocessor::{ Cmd, CmdBatch, CmdObserveInfo, CmdObserver, ObserveHandle, ObserveLevel, ObserverContext, - RegionChangeEvent, RegionChangeObserver, RoleChange, RoleObserver, + RegionChangeEvent, RegionChangeObserver, RegionChangeReason, RoleChange, RoleObserver, }; use tikv_util::{worker::dummy_scheduler, HandyRwLock}; @@ -321,4 +328,23 @@ mod tests { Ok(Some(Task::ModifyObserve(ObserveOp::Stop { region, .. }))) if region.id == 42 ); } + + #[test] + fn test_hibernate() { + let (sched, mut rx) = dummy_scheduler(); + + // Prepare: assuming a task wants the range of [0001, 0010]. 
+ let o = BackupStreamObserver::new(sched); + let r = fake_region(43, b"0010", b"0042"); + let mut ctx = ObserverContext::new(&r); + o.on_region_changed(&mut ctx, RegionChangeEvent::Create, StateRole::Leader); + o.on_region_changed( + &mut ctx, + RegionChangeEvent::Update(RegionChangeReason::Split), + StateRole::Leader, + ); + o.on_role_change(&mut ctx, &RoleChange::new(StateRole::Leader)); + let task = rx.recv_timeout(Duration::from_millis(20)); + assert!(task.is_err(), "it is {:?}", task); + } } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 1746882690f..a5d83e50328 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -315,6 +315,10 @@ impl SegmentMap { pub fn get_inner(&mut self) -> &mut BTreeMap> { &mut self.0 } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } /// transform a [`RaftCmdRequest`] to `(key, value, cf)` triple. diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index fabe9e2a13a..384a48e96a8 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -438,7 +438,9 @@ impl SstImporter { if self.import_support_download() { let shrink_file_count = shrink_files.len(); - info!("shrink space by tick"; "shrink files count" => shrink_file_count, "retain files count" => retain_file_count); + if shrink_file_count > 0 || retain_file_count > 0 { + info!("shrink space by tick"; "shrink files count" => shrink_file_count, "retain files count" => retain_file_count); + } for f in shrink_files { if let Err(e) = file_system::remove_file(&f) { @@ -447,7 +449,9 @@ impl SstImporter { } shrink_file_count } else { - info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => retain_buff_size); + if shrink_buff_size > 0 || retain_buff_size > 0 { + info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => 
retain_buff_size); + } self.dec_mem(shrink_buff_size as _); shrink_buff_size } From c5e8704c701840267e4fc128be8b99a10836c717 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 9 Feb 2023 15:40:00 +0800 Subject: [PATCH 516/676] raftkv: allow cancel error in snapshot (#14183) close tikv/tikv#13926 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/test_raftstore/src/cluster.rs | 5 +++ src/server/raftkv/mod.rs | 8 ++-- tests/integrations/coprocessor/test_select.rs | 44 ++++++++++++++++++- 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 81e7129407e..9d6444904f2 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -389,12 +389,17 @@ impl Cluster { pub fn stop_node(&mut self, node_id: u64) { debug!("stopping node {}", node_id); self.group_props[&node_id].mark_shutdown(); + // Simulate shutdown behavior of server shutdown. It's not enough to just set + // the map above as current thread may also query properties during shutdown. + let previous_prop = tikv_util::thread_group::current_properties(); + tikv_util::thread_group::set_properties(Some(self.group_props[&node_id].clone())); match self.sim.write() { Ok(mut sim) => sim.stop_node(node_id), Err(_) => safe_panic!("failed to acquire write lock."), } self.pd_client.shutdown_store(node_id); debug!("node {} stopped", node_id); + tikv_util::thread_group::set_properties(previous_prop); } pub fn get_engine(&self, node_id: u64) -> RocksEngine { diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 9c4c59a4ae5..0f0d8fa5689 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -579,10 +579,12 @@ where .map_err(kv::Error::from); } async move { - // It's impossible to return cancel because the callback will be invoked if it's - // destroyed. 
let res = match res { - Ok(()) => f.await.unwrap(), + Ok(()) => match f.await { + Ok(r) => r, + // Canceled may be returned during shutdown. + Err(e) => Err(kv::Error::from(kv::ErrorInner::Other(box_err!(e)))), + }, Err(e) => Err(e), }; match res { diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 056f24b5fee..fe545d07ec1 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -21,7 +21,11 @@ use tikv::{ server::Config, storage::TestEngineBuilder, }; -use tikv_util::{codec::number::*, config::ReadableSize}; +use tikv_util::{ + codec::number::*, + config::{ReadableDuration, ReadableSize}, + HandyRwLock, +}; use tipb::{ AnalyzeColumnsReq, AnalyzeReq, AnalyzeType, ChecksumRequest, Chunk, Expr, ExprType, ScalarFuncSig, SelectResponse, @@ -226,6 +230,44 @@ fn test_select_after_lease() { } } +/// If a failed read should not trigger panic. +#[test] +fn test_select_failed() { + let mut cluster = test_raftstore::new_server_cluster(0, 3); + cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); + cluster.run(); + // make sure leader has been elected. + assert_eq!(cluster.must_get(b""), None); + let region = cluster.get_region(b""); + let leader = cluster.leader_of_region(region.get_id()).unwrap(); + let engine = cluster.sim.rl().storages[&leader.get_id()].clone(); + let mut ctx = Context::default(); + ctx.set_region_id(region.get_id()); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(leader); + + let product = ProductTable::new(); + let (_, endpoint, _) = + init_data_with_engine_and_commit(ctx.clone(), engine, &product, &[], true); + + // Sleep until the leader lease is expired. 
+ thread::sleep( + cluster.cfg.raft_store.raft_heartbeat_interval() + * cluster.cfg.raft_store.raft_election_timeout_ticks as u32 + * 2, + ); + for id in 1..=3 { + if id != ctx.get_peer().get_store_id() { + cluster.stop_node(id); + } + } + let req = DagSelect::from(&product).build_with(ctx.clone(), &[0]); + let f = endpoint.parse_and_handle_unary_request(req, None); + cluster.stop_node(ctx.get_peer().get_store_id()); + drop(cluster); + let _ = futures::executor::block_on(f); +} + #[test] fn test_scan_detail() { let data = vec![ From d513b789f0709520cdccc20bc4e6560761ad01a5 Mon Sep 17 00:00:00 2001 From: Zhi Qi <30543181+LittleFall@users.noreply.github.com> Date: Thu, 9 Feb 2023 17:31:59 +0800 Subject: [PATCH 517/676] copr: (feat) Implement operator PartitionTopN (#14116) ref tikv/tikv#13936 Signed-off-by: Zhi Qi Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/tidb_query_executors/src/lib.rs | 2 + .../src/partition_top_n_executor.rs | 2204 +++++++++++++++++ components/tidb_query_executors/src/runner.rs | 40 +- .../src/top_n_executor.rs | 2 +- .../src/util/top_n_heap.rs | 26 +- 6 files changed, 2258 insertions(+), 18 deletions(-) create mode 100644 components/tidb_query_executors/src/partition_top_n_executor.rs diff --git a/Cargo.lock b/Cargo.lock index 74701b0561f..046220980bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6650,7 +6650,7 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#c6b7a5a1623bb2766a502301ecc3ac8f98cc7c79" +source = "git+https://github.com/pingcap/tipb.git#614f3ffd42ddc84b78ff59d65f105f2099a6f1b1" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/tidb_query_executors/src/lib.rs b/components/tidb_query_executors/src/lib.rs index ad86f94f9b8..2ce85f90111 100644 --- a/components/tidb_query_executors/src/lib.rs +++ b/components/tidb_query_executors/src/lib.rs @@ -29,6 +29,7 @@ mod fast_hash_aggr_executor; mod index_scan_executor; pub mod interface; mod 
limit_executor; +mod partition_top_n_executor; mod projection_executor; pub mod runner; mod selection_executor; @@ -42,6 +43,7 @@ mod util; pub use self::{ fast_hash_aggr_executor::BatchFastHashAggregationExecutor, index_scan_executor::BatchIndexScanExecutor, limit_executor::BatchLimitExecutor, + partition_top_n_executor::BatchPartitionTopNExecutor, projection_executor::BatchProjectionExecutor, selection_executor::BatchSelectionExecutor, simple_aggr_executor::BatchSimpleAggregationExecutor, slow_hash_aggr_executor::BatchSlowHashAggregationExecutor, diff --git a/components/tidb_query_executors/src/partition_top_n_executor.rs b/components/tidb_query_executors/src/partition_top_n_executor.rs new file mode 100644 index 00000000000..52cf2e85925 --- /dev/null +++ b/components/tidb_query_executors/src/partition_top_n_executor.rs @@ -0,0 +1,2204 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use async_trait::async_trait; +use tidb_query_common::{storage::IntervalRange, Result}; +use tidb_query_datatype::{ + codec::{batch::LazyBatchColumnVec, data_type::BATCH_MAX_SIZE}, + expr::{EvalConfig, EvalContext, EvalWarnings}, +}; +use tidb_query_expr::{RpnExpression, RpnExpressionBuilder, RpnStackNode}; +use tipb::{Expr, FieldType}; + +use crate::{ + interface::*, + util::{ + ensure_columns_decoded, eval_exprs_decoded_no_lifetime, + top_n_heap::{HeapItemSourceData, HeapItemUnsafe, TopNHeap}, + }, +}; + +pub struct BatchPartitionTopNExecutor { + heap: TopNHeap, + + /// See `BatchPartitionTopNExecutor::eval_columns_buffer_unsafe` for more + /// information. + #[allow(clippy::box_collection)] + eval_columns_buffer_unsafe: Box>>, + + /// The data should be sorted by the partition expression. + /// But if not, the result is still correct after the second-stage topn. + partition_exprs: Box<[RpnExpression]>, + partition_exprs_field_type: Box<[FieldType]>, + /// dummy value, just for convenience. 
+ partition_is_desc: Box<[bool]>, + + /// The partition key of the last row, i.e. all the heap items have the same + /// partition key. + /// The reason for make this a HeapItemUnsafe is to reuse of the existing + /// comparison logic, i.e. `cmp_sort_key`. + last_partition_key: Option, + + order_exprs: Box<[RpnExpression]>, + /// This field stores the field type of the results evaluated by the exprs + /// in `order_exprs`. + order_exprs_field_type: Box<[FieldType]>, + + /// Whether or not it is descending order for each order by column. + order_is_desc: Box<[bool]>, + + n: usize, + + context: EvalContext, + src: Src, +} + +impl BatchPartitionTopNExecutor { + #[cfg(test)] + pub fn new_for_test( + src: Src, + order_exprs: Vec, + order_is_desc: Vec, + partition_exprs: Vec, + n: usize, + ) -> Self { + assert_eq!(order_exprs.len(), order_is_desc.len()); + + let order_exprs_field_type: Vec = order_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + let partition_exprs_field_type: Vec = partition_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + Self { + heap: TopNHeap::new(n), + eval_columns_buffer_unsafe: Box::>::default(), + partition_is_desc: vec![false; partition_exprs.len()].into_boxed_slice(), + partition_exprs: partition_exprs.into_boxed_slice(), + partition_exprs_field_type: partition_exprs_field_type.into_boxed_slice(), + last_partition_key: None, + order_exprs: order_exprs.into_boxed_slice(), + order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), + order_is_desc: order_is_desc.into_boxed_slice(), + n, + + context: EvalContext::default(), + src, + } + } + + #[cfg(test)] + pub fn new_for_test_with_config( + config: Arc, + src: Src, + order_exprs: Vec, + order_is_desc: Vec, + partition_exprs: Vec, + n: usize, + ) -> Self { + assert_eq!(order_exprs.len(), order_is_desc.len()); + + let order_exprs_field_type: Vec = order_exprs + .iter() + .map(|expr| 
expr.ret_field_type(src.schema()).clone()) + .collect(); + + let partition_exprs_field_type: Vec = partition_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + Self { + heap: TopNHeap::new(n), + eval_columns_buffer_unsafe: Box::>::default(), + partition_is_desc: vec![false; partition_exprs.len()].into_boxed_slice(), + partition_exprs: partition_exprs.into_boxed_slice(), + partition_exprs_field_type: partition_exprs_field_type.into_boxed_slice(), + last_partition_key: None, + order_exprs: order_exprs.into_boxed_slice(), + order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), + order_is_desc: order_is_desc.into_boxed_slice(), + n, + + context: EvalContext::new(config), + src, + } + } + + pub fn new( + config: Arc, + src: Src, + partition_exprs_def: Vec, + order_exprs_def: Vec, + order_is_desc: Vec, + n: usize, + ) -> Result { + assert_eq!(order_exprs_def.len(), order_is_desc.len()); + + let mut ctx = EvalContext::new(config.clone()); + + let mut order_exprs: Vec = Vec::with_capacity(order_exprs_def.len()); + for def in order_exprs_def { + order_exprs.push(RpnExpressionBuilder::build_from_expr_tree( + def, + &mut ctx, + src.schema().len(), + )?); + } + let order_exprs_field_type: Vec = order_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + let mut partition_exprs: Vec = Vec::with_capacity(partition_exprs_def.len()); + for def in partition_exprs_def { + partition_exprs.push(RpnExpressionBuilder::build_from_expr_tree( + def, + &mut ctx, + src.schema().len(), + )?); + } + let partition_exprs_field_type: Vec = partition_exprs + .iter() + .map(|expr| expr.ret_field_type(src.schema()).clone()) + .collect(); + + Ok(Self { + // Simply large enough to avoid repeated allocations + heap: TopNHeap::new(n), + eval_columns_buffer_unsafe: Box::new(Vec::with_capacity(512)), + partition_is_desc: vec![false; partition_exprs.len()].into_boxed_slice(), + partition_exprs: 
partition_exprs.into_boxed_slice(), + partition_exprs_field_type: partition_exprs_field_type.into_boxed_slice(), + order_exprs: order_exprs.into_boxed_slice(), + order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), + order_is_desc: order_is_desc.into_boxed_slice(), + n, + context: EvalContext::new(config), + src, + last_partition_key: None, + }) + } + + // Check whether the partition key of the this row is equal to the saved + // partition key. If yes, return true. Else, update saved partition key, + // and return false. + fn check_partition_equal_or_update(&mut self, current: HeapItemUnsafe) -> Result { + if let Some(last_partition_key) = &self.last_partition_key { + if last_partition_key == ¤t { + return Ok(true); + } + } + self.last_partition_key = Some(current); + Ok(false) + } + + #[inline] + async fn handle_next_batch(&mut self) -> Result<(LazyBatchColumnVec, bool)> { + let mut result = LazyBatchColumnVec::empty(); + let src_result = self.src.next_batch(BATCH_MAX_SIZE).await; + self.context.warnings = src_result.warnings; + let src_is_drained = src_result.is_drained?; + + let (mut physical_columns, logical_rows) = + (src_result.physical_columns, src_result.logical_rows); + + if !logical_rows.is_empty() { + ensure_columns_decoded( + &mut self.context, + &self.order_exprs, + self.src.schema(), + &mut physical_columns, + &logical_rows, + )?; + ensure_columns_decoded( + &mut self.context, + &self.partition_exprs, + self.src.schema(), + &mut physical_columns, + &logical_rows, + )?; + + let pinned_source_data = Arc::new(HeapItemSourceData { + physical_columns, + logical_rows, + }); + + let order_eval_offset = self.eval_columns_buffer_unsafe.len(); + unsafe { + eval_exprs_decoded_no_lifetime( + &mut self.context, + &self.order_exprs, + self.src.schema(), + &pinned_source_data.physical_columns, + &pinned_source_data.logical_rows, + &mut self.eval_columns_buffer_unsafe, + )?; + } + // todo: optimize memory use of this. 
+ let partition_eval_offset = self.eval_columns_buffer_unsafe.len(); + unsafe { + eval_exprs_decoded_no_lifetime( + &mut self.context, + &self.partition_exprs, + self.src.schema(), + &pinned_source_data.physical_columns, + &pinned_source_data.logical_rows, + &mut self.eval_columns_buffer_unsafe, + )?; + } + // todo: optimize the memory usage of this, don't need so many same information + // in items. Maybe we can import a Heap with customized comparator. + for logical_row_index in 0..pinned_source_data.logical_rows.len() { + let partition_key = HeapItemUnsafe { + // order_is_desc_ptr here is just a dummy value, todo: refactor the compare + // logic and eliminate this. + order_is_desc_ptr: (*self.partition_is_desc).into(), + order_exprs_field_type_ptr: (*self.partition_exprs_field_type).into(), + source_data: pinned_source_data.clone(), + eval_columns_buffer_ptr: self.eval_columns_buffer_unsafe.as_ref().into(), + eval_columns_offset: partition_eval_offset, + logical_row_index, + }; + + if !self.check_partition_equal_or_update(partition_key)? { + self.heap.take_all_append_to(&mut result); + self.heap = TopNHeap::new(self.n); + } + + let row = HeapItemUnsafe { + order_is_desc_ptr: (*self.order_is_desc).into(), + order_exprs_field_type_ptr: (*self.order_exprs_field_type).into(), + source_data: pinned_source_data.clone(), + eval_columns_buffer_ptr: self.eval_columns_buffer_unsafe.as_ref().into(), + eval_columns_offset: order_eval_offset, + logical_row_index, + }; + self.heap.add_row(row)?; + } + } + if src_is_drained { + self.heap.take_all_append_to(&mut result); + } + + Ok((result, src_is_drained)) + } +} + +/// todo: review this. +/// All `NonNull` pointers in `BatchPartitionTopNExecutor` cannot be accessed +/// out of the struct and `BatchPartitionTopNExecutor` doesn't leak the pointers +/// to other threads. Therefore, with those `NonNull` pointers, +/// BatchPartitionTopNExecutor still remains `Send`. 
+unsafe impl Send for BatchPartitionTopNExecutor {} + +#[async_trait] +impl BatchExecutor for BatchPartitionTopNExecutor { + type StorageStats = Src::StorageStats; + + #[inline] + fn schema(&self) -> &[FieldType] { + self.src.schema() + } + + /// Implementation of BatchExecutor::next_batch + /// Memory Control Analysis: + /// 1. if n > paging_size(1024), this operator won't do anything and just + /// return data to upstream. So we can think n is less than or equal to + /// paging_size. + /// 2. The worst case is that there is already n rows in heap, and first + /// row of src_result has different partition with rows in heap. So heap + /// will be flushed. And the last row of src_result has another different + /// partition with the first two. So heap will be flushed again. + /// In this case, there can be 2*n-1 rows in the result, which may be larger + /// than paging_size. + /// todo: find a good solution to limit it up to paging_size. + /// baseline: limit n up to paging_size/2 + #[inline] + async fn next_batch(&mut self, scan_rows: usize) -> BatchExecuteResult { + if self.n == 0 { + return BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }; + } + + // limit intermediate memory by paging_size. 
+ if let Some(paging_size) = self.context.cfg.paging_size { + if self.n * 2 > paging_size as usize { + return self.src.next_batch(scan_rows).await; + } + } + + let result = self.handle_next_batch().await; + + match result { + Err(e) => BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: self.context.take_warnings(), + is_drained: Err(e), + }, + Ok((logical_columns, is_drained)) => { + let logical_rows = (0..logical_columns.rows_len()).collect(); + BatchExecuteResult { + physical_columns: logical_columns, + logical_rows, + warnings: self.context.take_warnings(), + is_drained: Ok(is_drained), + } + } + } + } + + #[inline] + fn collect_exec_stats(&mut self, dest: &mut ExecuteStats) { + self.src.collect_exec_stats(dest); + } + + #[inline] + fn collect_storage_stats(&mut self, dest: &mut Self::StorageStats) { + self.src.collect_storage_stats(dest); + } + + #[inline] + fn take_scanned_range(&mut self) -> IntervalRange { + self.src.take_scanned_range() + } + + #[inline] + fn can_be_cached(&self) -> bool { + self.src.can_be_cached() + } +} + +#[cfg(test)] +mod tests { + use futures::executor::block_on; + use tidb_query_datatype::{ + builder::FieldTypeBuilder, + codec::{batch::LazyBatchColumnVec, data_type::*}, + expr::EvalWarnings, + Collation, FieldTypeFlag, FieldTypeTp, + }; + use tidb_query_expr::{ + impl_arithmetic::{IntDivideInt, IntIntPlus}, + RpnExpressionBuilder, + }; + + use super::*; + use crate::{impl_arithmetic::arithmetic_fn_meta, util::mock_executor::MockExecutor}; + + #[test] + fn test_top_0() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::Double.into(), FieldTypeTp::Double.into()], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real(vec![None, Real::new(7.0).ok(), None, None].into()), + VectorValue::Real(vec![None, Real::new(7.0).ok(), None, None].into()), + ]), + logical_rows: (0..1).collect(), + warnings: EvalWarnings::default(), + 
is_drained: Ok(true), + }], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_constant_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_constant_for_test(1) + .build_for_test(), + ], + 0, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_constant_partition() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::Double.into(), FieldTypeTp::Double.into()], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Real( + vec![ + Real::new(1.0).ok(), + Real::new(2.0).ok(), + Real::new(3.0).ok(), + Real::new(4.0).ok(), + ] + .into(), + ), + VectorValue::Real( + vec![ + Real::new(5.0).ok(), + Real::new(6.0).ok(), + Real::new(7.0).ok(), + Real::new(8.0).ok(), + ] + .into(), + ), + ]), + logical_rows: (0..4).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_constant_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_constant_for_test(1) + .build_for_test(), + ], + 2, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1]); + assert_eq!(r.physical_columns.rows_len(), 2); + assert_eq!(r.physical_columns.columns_len(), 2); + assert_eq!( + r.physical_columns[0].decoded().to_real_vec(), + &[Real::new(2.0).ok(), Real::new(1.0).ok(),] + ); + assert_eq!( + r.physical_columns[1].decoded().to_real_vec(), + &[Real::new(6.0).ok(), Real::new(5.0).ok(),] + ); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_multiple_and_null_part_key() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::Long.into(), FieldTypeTp::Long.into()], + 
vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int( + vec![ + Some(1), + Some(1), + Some(1), + None, + None, + None, + Some(2), + Some(2), + Some(2), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(1), + Some(1), + None, + None, + None, + Some(2), + Some(1), + Some(1), + None, + ] + .into(), + ), + ]), + logical_rows: (0..9).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![], + vec![], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + 1, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5]); + assert_eq!(r.physical_columns.rows_len(), 6); + assert_eq!(r.physical_columns.columns_len(), 2); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(1), Some(1), None, None, Some(2), Some(2)] + ); + assert_eq!( + r.physical_columns[1].decoded().to_int_vec(), + &[Some(1), None, None, Some(2), Some(1), None] + ); + assert!(r.is_drained.unwrap()); + } + + fn make_expr_case() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + ], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int( + vec![ + Some(1), + Some(2), + Some(3), + Some(4), + None, + Some(6), + None, + Some(8), + Some(9), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(2), + Some(1), + Some(4), + Some(3), + Some(5), + None, + None, + Some(9), + Some(8), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(1), + Some(2), + Some(3), + Some(4), + Some(5), + Some(6), + Some(7), + Some(8), + Some(9), + ] + .into(), + ), + ]), + logical_rows: (0..9).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + 
}], + ) + } + + /// partition by col2/2, order by col2 + #[test] + fn test_expr_key1() { + let mut exec = BatchPartitionTopNExecutor::new_for_test( + make_expr_case(), + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .push_constant_for_test(2) + .push_fn_call_for_test( + arithmetic_fn_meta::(), + 2, + FieldTypeTp::Long, + ) + .build_for_test(), + ], + 1, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[2].decoded().to_int_vec(), + &[Some(1), Some(2), Some(4), Some(6), Some(8)] + ); + assert!(r.is_drained.unwrap()); + } + + /// partition by col0 + col1, order by col2 + #[test] + fn test_expr_key2() { + let mut exec = BatchPartitionTopNExecutor::new_for_test( + make_expr_case(), + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + ], + vec![true], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .push_column_ref_for_test(1) + .push_fn_call_for_test(arithmetic_fn_meta::(), 2, FieldTypeTp::Long) + .build_for_test(), + ], + 1, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3]); + assert_eq!(r.physical_columns.rows_len(), 4); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[2].decoded().to_int_vec(), + &[Some(2), Some(4), Some(7), Some(9)] + ); + assert!(r.is_drained.unwrap()); + } + + /// Currently, When the data is not ordered by partition key, e.g. 1 1 2 1, + /// it will treat discontinuous same key as different partition. 
+ #[test] + fn test_unordered_key() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::Long.into(), FieldTypeTp::Double.into()], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int(vec![Some(1), Some(1), Some(2), Some(1)].into()), + VectorValue::Real( + vec![ + Real::new(5.0).ok(), + None, + Real::new(7.0).ok(), + Real::new(4.0).ok(), + ] + .into(), + ), + ]), + logical_rows: (0..4).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + 1, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2]); + assert_eq!(r.physical_columns.rows_len(), 3); + assert_eq!(r.physical_columns.columns_len(), 2); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(1), Some(2), Some(1)] + ); + assert_eq!( + r.physical_columns[1].decoded().to_real_vec(), + &[None, Real::new(7.0).ok(), Real::new(4.0).ok()] + ); + assert!(r.is_drained.unwrap()); + } + + fn make_integrated_data() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarString) + .flag(FieldTypeFlag::BINARY) + .into(), // primary key + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarString) + .flag(FieldTypeFlag::BINARY) + .into(), // secondary key + FieldTypeTp::LongLong.into(), // timestamp + FieldTypeTp::MediumBlob.into(), // value + ], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Bytes( + vec![ + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"2".to_vec()), + 
Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + ] + .into(), + ), + VectorValue::Bytes( + vec![ + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"1".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + Some(b"2".to_vec()), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(1672736824585607000_i64), + Some(1672736824789029000_i64), + Some(1672736824850598000_i64), + Some(1672736824884993000_i64), + Some(1672736824918933000_i64), + Some(1672736824953241000_i64), + Some(1672736824987116000_i64), + Some(1672736825021485000_i64), + Some(1672736825208127000_i64), + Some(1672736825263135000_i64), + Some(1672736825296467000_i64), + Some(1672736825330420000_i64), + Some(1672736825363611000_i64), + Some(1672736825398155000_i64), + Some(1672736825432106000_i64), + Some(1672736825466432000_i64), + ] + .into(), + ), + VectorValue::Bytes( + vec![ + Some(b"01".to_vec()), + Some(b"02".to_vec()), + Some(b"03".to_vec()), + Some(b"04".to_vec()), + Some(b"05".to_vec()), + Some(b"06".to_vec()), + Some(b"07".to_vec()), + Some(b"08".to_vec()), + Some(b"09".to_vec()), + Some(b"10".to_vec()), + Some(b"11".to_vec()), + Some(b"12".to_vec()), + Some(b"13".to_vec()), + Some(b"14".to_vec()), + Some(b"15".to_vec()), + Some(b"16".to_vec()), + ] + .into(), + ), + ]), + logical_rows: (0..16).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }], + ) + } + + #[test] + fn test_integrated() { + let mut exec = BatchPartitionTopNExecutor::new_for_test( + make_integrated_data(), + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + ], + vec![true], + vec![ + 
RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + 2, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6, 7]); + assert_eq!(r.physical_columns.rows_len(), 8); + assert_eq!(r.physical_columns.columns_len(), 4); + assert!(r.is_drained.unwrap()); + + assert_eq!( + r.physical_columns[2].decoded().to_int_vec(), + &[ + Some(1672736824884993000), + Some(1672736824850598000), + Some(1672736825021485000), + Some(1672736824987116000), + Some(1672736825330420000), + Some(1672736825296467000), + Some(1672736825466432000), + Some(1672736825432106000) + ] + ); + assert_eq!( + r.physical_columns[3].decoded().to_bytes_vec(), + &[ + Some(b"04".to_vec()), + Some(b"03".to_vec()), + Some(b"08".to_vec()), + Some(b"07".to_vec()), + Some(b"12".to_vec()), + Some(b"11".to_vec()), + Some(b"16".to_vec()), + Some(b"15".to_vec()) + ] + ); + } + + /// Builds an executor that will return these data: + /// + /// ```text + /// == Schema == + /// Col0 (LongLong(Unsigned)) Col1(LongLong[UnSigned]) Col2(LongLong[Signed]) + /// == Call #1 == + /// 1 18,446,744,073,709,551,615 -3 + /// 1 NULL NULL + /// 1 18,446,744,073,709,551,613 -1 + /// 1 2023 2024 + /// 1 2000 2000 + /// == Call #2 == + /// == Call #3 == + /// 2 9,223,372,036,854,775,807 9,223,372,036,854,775,807 + /// 2 300 300 + /// 2 9,223,372,036,854,775,808 -9,223,372,036,854,775,808 + /// 2 NULL NULL + /// 3 NULL NULL + /// == Call #4 == + /// (drained) (drained) (drained) + fn make_full_batch() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .flag(FieldTypeFlag::UNSIGNED) + .into(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .flag(FieldTypeFlag::UNSIGNED) + .into(), + FieldTypeTp::LongLong.into(), + ], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + 
VectorValue::Int(vec![Some(1), Some(1), Some(1), Some(1), Some(1)].into()), + VectorValue::Int( + vec![ + Some(18_446_744_073_709_551_615_u64 as i64), + None, + Some(18_446_744_073_709_551_613_u64 as i64), + Some(2023), + Some(2000), + ] + .into(), + ), + VectorValue::Int( + vec![Some(-3), None, Some(-1), Some(2024), Some(2000)].into(), + ), + ]), + logical_rows: vec![0, 1, 2, 3, 4], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int(vec![Some(2), Some(2), Some(2), Some(2), Some(3)].into()), + VectorValue::Int( + vec![ + Some(9_223_372_036_854_775_807_u64 as i64), + Some(300), + Some(9_223_372_036_854_775_808_u64 as i64), + None, + None, + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(9_223_372_036_854_775_807_u64 as i64), + Some(300), + Some(-9_223_372_036_854_775_808), + None, + None, + ] + .into(), + ), + ]), + logical_rows: vec![0, 1, 2, 3, 4], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }, + ], + ) + } + + #[test] + fn test_small_n() { + let mut config = EvalConfig::default(); + config.paging_size = Some(10); + let config = Arc::new(config); + let src_exec = make_full_batch(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + 2, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + 
assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3]); + assert_eq!(r.physical_columns.rows_len(), 4); + assert_eq!(r.physical_columns.columns_len(), 3); + assert!(!r.is_drained.unwrap()); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(1), Some(1), Some(2), Some(2)] + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0]); + assert_eq!(r.physical_columns.rows_len(), 1); + assert!(r.is_drained.unwrap()); + assert_eq!(r.physical_columns[0].decoded().to_int_vec(), &[Some(3)]); + } + + #[test] + fn test_without_order_key() { + let mut config = EvalConfig::default(); + config.paging_size = Some(10); + let config = Arc::new(config); + let src_exec = make_full_batch(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![], + vec![], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + 2, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3]); + assert_eq!(r.physical_columns.rows_len(), 4); + assert_eq!(r.physical_columns.columns_len(), 3); + assert!(!r.is_drained.unwrap()); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(1), Some(1), Some(2), Some(2)] + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0]); + 
assert_eq!(r.physical_columns.rows_len(), 1); + assert!(r.is_drained.unwrap()); + assert_eq!(r.physical_columns[0].decoded().to_int_vec(), &[Some(3)]); + } + + #[test] + fn test_paging_limit_normal_n() { + let mut config = EvalConfig::default(); + config.paging_size = Some(10); + let config = Arc::new(config); + let src_exec = make_full_batch(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(r.physical_columns.rows_len(), 9); + assert_eq!(r.physical_columns.columns_len(), 3); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0]); + assert_eq!(r.physical_columns.rows_len(), 1); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_paging_limit_oversize_n() { + let mut config = EvalConfig::default(); + config.paging_size = Some(9); + let config = Arc::new(config); + let src_exec = make_full_batch(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![false], + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 
1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(r.is_drained.unwrap()); + } + + /// The following tests are copied from `batch_top_n_executor.rs`. + #[test] + fn test_no_partition_top_0() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::Double.into()], + vec![BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Real( + vec![None, Real::new(7.0).ok(), None, None].into(), + )]), + logical_rows: (0..1).collect(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_constant_for_test(1) + .build_for_test(), + ], + vec![false], + vec![], + 0, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_no_partition_no_row() { + let src_exec = MockExecutor::new( + vec![FieldTypeTp::LongLong.into()], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Int( + vec![Some(5)].into(), + )]), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(true), + 
}, + ], + ); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + vec![false], + vec![], + 10, + ); + + let r = block_on(exec.next_batch(1)); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(r.is_drained.unwrap()); + } + + /// Builds an executor that will return these data: + /// + /// == Schema == + /// Col0 (Int) Col1(Int) Col2(Real) + /// == Call #1 == + /// NULL -1 -1.0 + /// NULL NULL 2.0 + /// NULL 1 4.0 + /// == Call #2 == + /// == Call #3 == + /// -1 NULL NULL + /// -10 10 3.0 + /// -10 NULL -5.0 + /// -10 -10 0.0 + /// (drained) + fn make_src_executor() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeTp::LongLong.into(), + FieldTypeTp::LongLong.into(), + FieldTypeTp::Double.into(), + ], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int(vec![None, None, Some(5), None].into()), + VectorValue::Int(vec![None, Some(1), None, Some(-1)].into()), + VectorValue::Real( + vec![ + Real::new(2.0).ok(), + Real::new(4.0).ok(), + None, + Real::new(-1.0).ok(), + ] + .into(), + ), + ]), + logical_rows: vec![3, 0, 1], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int(vec![Some(0)].into()), + VectorValue::Int(vec![Some(10)].into()), + VectorValue::Real(vec![Real::new(10.0).ok()].into()), + ]), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int( + vec![Some(-10), Some(-1), Some(-10), None, Some(-10), None].into(), + ), + VectorValue::Int( + vec![None, None, Some(10), Some(-9), Some(-10), None].into(), + ), + 
VectorValue::Real( + vec![ + Real::new(-5.0).ok(), + None, + Real::new(3.0).ok(), + None, + Real::new(0.0).ok(), + Real::new(9.9).ok(), + ] + .into(), + ), + ]), + logical_rows: vec![1, 2, 0, 4], + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }, + ], + ) + } + + #[test] + fn test_no_partition_integration_1() { + // Order by single column, data len < n. + // + // mysql> select * from t order by col2 limit 100; + // +------+------+------+ + // | col0 | col1 | col2 | + // +------+------+------+ + // | -1 | NULL | NULL | + // | -10 | NULL | -5 | + // | NULL | -1 | -1 | + // | -10 | -10 | 0 | + // | NULL | NULL | 2 | + // | -10 | 10 | 3 | + // | NULL | 1 | 4 | + // +------+------+------+ + // + // Note: ORDER BY does not use stable sort, so let's order by col2 to avoid + // duplicate records. + + let src_exec = make_src_executor(); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + ], + vec![false], + vec![], + 100, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); + assert_eq!(r.physical_columns.rows_len(), 7); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(-1), Some(-10), None, Some(-10), None, Some(-10), None] + ); + assert_eq!( + r.physical_columns[1].decoded().to_int_vec(), + &[None, None, Some(-1), Some(-10), None, Some(10), Some(1)] + ); + assert_eq!( + r.physical_columns[2].decoded().to_real_vec(), + &[ + None, + Real::new(-5.0).ok(), + Real::new(-1.0).ok(), + Real::new(0.0).ok(), 
+ Real::new(2.0).ok(), + Real::new(3.0).ok(), + Real::new(4.0).ok() + ] + ); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_no_partition_integration_2() { + // Order by multiple columns, data len == n. + // + // mysql> select * from t order by col0 desc, col1 limit 7; + // +------+------+------+ + // | col0 | col1 | col2 | + // +------+------+------+ + // | -1 | NULL | NULL | + // | -10 | NULL | -5 | + // | -10 | -10 | 0 | + // | -10 | 10 | 3 | + // | NULL | NULL | 2 | + // | NULL | -1 | -1 | + // | NULL | 1 | 4 | + // +------+------+------+ + + let src_exec = make_src_executor(); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![true, false], + vec![], + 7, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); + assert_eq!(r.physical_columns.rows_len(), 7); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(-1), Some(-10), Some(-10), Some(-10), None, None, None] + ); + assert_eq!( + r.physical_columns[1].decoded().to_int_vec(), + &[None, None, Some(-10), Some(10), None, Some(-1), Some(1)] + ); + assert_eq!( + r.physical_columns[2].decoded().to_real_vec(), + &[ + None, + Real::new(-5.0).ok(), + Real::new(0.0).ok(), + Real::new(3.0).ok(), + Real::new(2.0).ok(), + Real::new(-1.0).ok(), + Real::new(4.0).ok() + ] + ); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn 
test_no_partition_integration_3() { + use tidb_query_expr::{ + impl_arithmetic::{arithmetic_fn_meta, IntIntPlus}, + impl_op::is_null_fn_meta, + }; + + // Order by multiple expressions, data len > n. + // + // mysql> select * from t order by isnull(col0), col0, col1 + 1 desc limit 5; + // +------+------+------+ + // | col0 | col1 | col2 | + // +------+------+------+ + // | -10 | 10 | 3 | + // | -10 | -10 | 0 | + // | -10 | NULL | -5 | + // | -1 | NULL | NULL | + // | NULL | 1 | 4 | + // +------+------+------+ + + let src_exec = make_src_executor(); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .push_fn_call_for_test(is_null_fn_meta::(), 1, FieldTypeTp::LongLong) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .push_constant_for_test(1) + .push_fn_call_for_test( + arithmetic_fn_meta::(), + 2, + FieldTypeTp::LongLong, + ) + .build_for_test(), + ], + vec![false, false, true], + vec![], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[0].decoded().to_int_vec(), + &[Some(-10), Some(-10), Some(-10), Some(-1), None] + ); + assert_eq!( + r.physical_columns[1].decoded().to_int_vec(), + &[Some(10), Some(-10), None, None, Some(1)] + ); + assert_eq!( + r.physical_columns[2].decoded().to_real_vec(), + &[ + Real::new(3.0).ok(), + 
Real::new(0.0).ok(), + Real::new(-5.0).ok(), + None, + Real::new(4.0).ok() + ] + ); + assert!(r.is_drained.unwrap()); + } + + /// Builds an executor that will return these data: + /// + /// ```text + /// == Schema == + /// Col0 (Bytes[Utf8Mb4GeneralCi]) Col1(Bytes[Utf8Mb4Bin]) Col2(Bytes[Binary]) + /// == Call #1 == + /// "aa" "aaa" "áaA" + /// NULL NULL "Aa" + /// "aa" "aa" NULL + /// == Call #2 == + /// == Call #3 == + /// "áaA" "áa" NULL + /// "áa" "áaA" "aa" + /// "Aa" NULL "aaa" + /// "aaa" "Aa" "áa" + /// (drained) + /// ``` + fn make_bytes_src_executor() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .collation(Collation::Utf8Mb4GeneralCi) + .into(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .collation(Collation::Utf8Mb4Bin) + .into(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::VarChar) + .collation(Collation::Binary) + .into(), + ], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Bytes( + vec![Some(b"aa".to_vec()), None, Some(b"aa".to_vec())].into(), + ), + VectorValue::Bytes( + vec![Some(b"aa".to_vec()), None, Some(b"aaa".to_vec())].into(), + ), + VectorValue::Bytes( + vec![None, Some(b"Aa".to_vec()), Some("áaA".as_bytes().to_vec())] + .into(), + ), + ]), + logical_rows: vec![2, 1, 0], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Bytes( + vec![ + Some("áaA".as_bytes().to_vec()), + Some("áa".as_bytes().to_vec()), + Some(b"Aa".to_vec()), + Some(b"aaa".to_vec()), + ] + .into(), + ), + VectorValue::Bytes( + vec![ + Some("áa".as_bytes().to_vec()), + Some("áaA".as_bytes().to_vec()), + None, + Some(b"Aa".to_vec()), + ] + .into(), + ), + VectorValue::Bytes( + vec![ + None, 
+ Some(b"aa".to_vec()), + Some(b"aaa".to_vec()), + Some("áa".as_bytes().to_vec()), + ] + .into(), + ), + ]), + logical_rows: vec![0, 1, 2, 3], + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }, + ], + ) + } + + #[test] + fn test_no_partition_bytes_1() { + // Order by multiple expressions with collation, data len > n. + // + // mysql> select * from t order by col1 desc, col3 desc, col2 limit 5; + // +------+--------+--------+ + // | col1 | col2 | col3 | + // +------+--------+--------+ + // | aaa | Aa | áa | + // | áaA | áa | | + // | aa | aaa | áaA | + // | Aa | | aaa | + // | áa | áaA | aa | + // +------+--------+--------+ + + let src_exec = make_bytes_src_executor(); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + ], + vec![true, true, false], + vec![], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[0].decoded().to_bytes_vec(), + &[ + Some(b"aaa".to_vec()), + Some("áaA".as_bytes().to_vec()), + Some(b"aa".to_vec()), + Some(b"Aa".to_vec()), + Some("áa".as_bytes().to_vec()), + ] + ); + assert_eq!( + r.physical_columns[1].decoded().to_bytes_vec(), + &[ + Some(b"Aa".to_vec()), + Some("áa".as_bytes().to_vec()), + Some(b"aaa".to_vec()), + None, + 
Some("áaA".as_bytes().to_vec()), + ] + ); + assert_eq!( + r.physical_columns[2].decoded().to_bytes_vec(), + &[ + Some("áa".as_bytes().to_vec()), + None, + Some("áaA".as_bytes().to_vec()), + Some(b"aaa".to_vec()), + Some(b"aa".to_vec()), + ] + ); + assert!(r.is_drained.unwrap()); + } + + #[test] + fn test_no_partition_bytes_2() { + // Order by multiple expressions with collation, data len > n. + // + // mysql> select * from test order by col1, col2, col3 limit 5; + // +--------+--------+--------+ + // | col1 | col2 | col3 | + // +--------+--------+--------+ + // | | | Aa | + // | Aa | | aaa | + // | aa | aa | | + // | aa | aaa | áaA | + // | áa | áaA | aa | + // +--------+--------+--------+ + + let src_exec = make_bytes_src_executor(); + + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(1) + .build_for_test(), + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(2) + .build_for_test(), + ], + vec![false, false, false], + vec![], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[0].decoded().to_bytes_vec(), + &[ + None, + Some(b"Aa".to_vec()), + Some(b"aa".to_vec()), + Some(b"aa".to_vec()), + Some("áa".as_bytes().to_vec()), + ] + ); + assert_eq!( + r.physical_columns[1].decoded().to_bytes_vec(), + &[ + None, + None, + Some(b"aa".to_vec()), + Some(b"aaa".to_vec()), + 
Some("áaA".as_bytes().to_vec()), + ] + ); + assert_eq!( + r.physical_columns[2].decoded().to_bytes_vec(), + &[ + Some(b"Aa".to_vec()), + Some(b"aaa".to_vec()), + None, + Some("áaA".as_bytes().to_vec()), + Some(b"aa".to_vec()), + ] + ); + assert!(r.is_drained.unwrap()); + } + + /// Builds an executor that will return these data: + /// + /// ```text + /// == Schema == + /// Col0 (LongLong(Unsigned)) Col1(LongLong[Signed]) Col2(Long[Unsigned]) + /// == Call #1 == + /// 18,446,744,073,709,551,615 -3 4,294,967,293 + /// NULL NULL NULL + /// 18,446,744,073,709,551,613 -1 4,294,967,295 + /// == Call #2 == + /// == Call #3 == + /// 2000 2000 2000 + /// 9,223,372,036,854,775,807 9,223,372,036,854,775,807 2,147,483,647 + /// 300 300 300 + /// 9,223,372,036,854,775,808 -9,223,372,036,854,775,808 2,147,483,648 + /// (drained) (drained) (drained) + /// ``` + fn make_src_executor_unsigned() -> MockExecutor { + MockExecutor::new( + vec![ + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .flag(FieldTypeFlag::UNSIGNED) + .into(), + FieldTypeTp::LongLong.into(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::Long) + .flag(FieldTypeFlag::UNSIGNED) + .into(), + ], + vec![ + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int( + vec![ + Some(18_446_744_073_709_551_613_u64 as i64), + None, + Some(18_446_744_073_709_551_615_u64 as i64), + ] + .into(), + ), + VectorValue::Int(vec![Some(-1), None, Some(-3)].into()), + VectorValue::Int( + vec![ + Some(4_294_967_295_u32 as i64), + None, + Some(4_294_967_295_u32 as i64), + ] + .into(), + ), + ]), + logical_rows: vec![2, 1, 0], + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::empty(), + logical_rows: Vec::new(), + warnings: EvalWarnings::default(), + is_drained: Ok(false), + }, + BatchExecuteResult { + physical_columns: LazyBatchColumnVec::from(vec![ + VectorValue::Int( + vec![ + Some(300_u64 as i64), + 
Some(9_223_372_036_854_775_807_u64 as i64), + Some(2000_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(300), + Some(9_223_372_036_854_775_807), + Some(2000), + Some(-9_223_372_036_854_775_808), + ] + .into(), + ), + VectorValue::Int( + vec![ + Some(300_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2000_u32 as i64), + Some(2_147_483_648_u32 as i64), + ] + .into(), + ), + ]), + logical_rows: vec![2, 1, 0, 3], + warnings: EvalWarnings::default(), + is_drained: Ok(true), + }, + ], + ) + } + + #[test] + fn test_no_partition_top_unsigned() { + let test_top5 = |col_index: usize, is_desc: bool, expected: &[Option]| { + let src_exec = make_src_executor_unsigned(); + let mut exec = BatchPartitionTopNExecutor::new_for_test( + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(col_index) + .build_for_test(), + ], + vec![is_desc], + vec![], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[col_index].decoded().to_int_vec(), + expected + ); + assert!(r.is_drained.unwrap()); + }; + + test_top5( + 0, + false, + &[ + None, + Some(300_u64 as i64), + Some(2000_u64 as i64), + Some(9_223_372_036_854_775_807_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + ], + ); + + test_top5( + 0, + true, + &[ + Some(18_446_744_073_709_551_615_u64 as i64), + Some(18_446_744_073_709_551_613_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + 
Some(9_223_372_036_854_775_807_u64 as i64), + Some(2000_u64 as i64), + ], + ); + + test_top5( + 1, + false, + &[ + None, + Some(-9_223_372_036_854_775_808), + Some(-3), + Some(-1), + Some(300), + ], + ); + + test_top5( + 1, + true, + &[ + Some(9_223_372_036_854_775_807), + Some(2000), + Some(300), + Some(-1), + Some(-3), + ], + ); + + test_top5( + 2, + false, + &[ + None, + Some(300_u32 as i64), + Some(2000_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2_147_483_648_u32 as i64), + ], + ); + + test_top5( + 2, + true, + &[ + Some(4_294_967_295_u32 as i64), + Some(4_294_967_295_u32 as i64), + Some(2_147_483_648_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2000_u32 as i64), + ], + ); + } + + #[test] + fn test_no_partition_top_paging() { + // Top N = 5 and PagingSize = 10, same with no-paging. + let test_top5_paging6 = |col_index: usize, is_desc: bool, expected: &[Option]| { + let mut config = EvalConfig::default(); + config.paging_size = Some(10); + let config = Arc::new(config); + let src_exec = make_src_executor_unsigned(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(col_index) + .build_for_test(), + ], + vec![is_desc], + vec![], + 5, + ); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert!(r.logical_rows.is_empty()); + assert_eq!(r.physical_columns.rows_len(), 0); + assert!(!r.is_drained.unwrap()); + + let r = block_on(exec.next_batch(1)); + assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); + assert_eq!(r.physical_columns.rows_len(), 5); + assert_eq!(r.physical_columns.columns_len(), 3); + assert_eq!( + r.physical_columns[col_index].decoded().to_int_vec(), + expected + ); + assert!(r.is_drained.unwrap()); + }; + + test_top5_paging6( + 0, + false, + &[ + None, + 
Some(300_u64 as i64), + Some(2000_u64 as i64), + Some(9_223_372_036_854_775_807_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + ], + ); + + test_top5_paging6( + 0, + true, + &[ + Some(18_446_744_073_709_551_615_u64 as i64), + Some(18_446_744_073_709_551_613_u64 as i64), + Some(9_223_372_036_854_775_808_u64 as i64), + Some(9_223_372_036_854_775_807_u64 as i64), + Some(2000_u64 as i64), + ], + ); + + test_top5_paging6( + 1, + false, + &[ + None, + Some(-9_223_372_036_854_775_808), + Some(-3), + Some(-1), + Some(300), + ], + ); + + test_top5_paging6( + 1, + true, + &[ + Some(9_223_372_036_854_775_807), + Some(2000), + Some(300), + Some(-1), + Some(-3), + ], + ); + + test_top5_paging6( + 2, + false, + &[ + None, + Some(300_u32 as i64), + Some(2000_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2_147_483_648_u32 as i64), + ], + ); + + test_top5_paging6( + 2, + true, + &[ + Some(4_294_967_295_u32 as i64), + Some(4_294_967_295_u32 as i64), + Some(2_147_483_648_u32 as i64), + Some(2_147_483_647_u32 as i64), + Some(2000_u32 as i64), + ], + ); + + // Top N = 5 and PagingSize = 8, return all data and do nothing. 
+ let test_top5_paging4 = |build_src_executor: fn() -> MockExecutor| { + let mut config = EvalConfig::default(); + config.paging_size = Some(8); + let config = Arc::new(config); + let src_exec = build_src_executor(); + let mut exec = BatchPartitionTopNExecutor::new_for_test_with_config( + config, + src_exec, + vec![ + RpnExpressionBuilder::new_for_test() + .push_column_ref_for_test(0) + .build_for_test(), + ], + vec![false], + vec![], + 5, + ); + let mut exec2 = build_src_executor(); + + loop { + let r1 = block_on(exec.next_batch(1)); + let r2 = block_on(exec2.next_batch(1)); + assert_eq!(r1.logical_rows, r2.logical_rows); + assert_eq!( + r1.physical_columns.rows_len(), + r2.physical_columns.rows_len() + ); + assert_eq!( + r1.physical_columns.columns_len(), + r2.physical_columns.columns_len() + ); + let r1_is_drained = r1.is_drained.unwrap(); + assert_eq!(r1_is_drained, r2.is_drained.unwrap()); + if r1_is_drained { + break; + } + } + }; + + test_top5_paging4(make_src_executor_unsigned); + test_top5_paging4(make_src_executor); + test_top5_paging4(make_bytes_src_executor); + } +} diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 392b41ff165..f4a3ea8a2ad 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -4,6 +4,7 @@ use std::{convert::TryFrom, sync::Arc}; use api_version::KvFormat; use fail::fail_point; +use itertools::Itertools; use kvproto::coprocessor::KeyRange; use protobuf::Message; use tidb_query_common::{ @@ -348,17 +349,36 @@ pub fn build_executors( order_exprs_def.push(item.take_expr()); order_is_desc.push(item.get_desc()); } + let partition_by = d + .take_partition_by() + .into_iter() + .map(|mut item| item.take_expr()) + .collect_vec(); - Box::new( - BatchTopNExecutor::new( - config.clone(), - executor, - order_exprs_def, - order_is_desc, - d.get_limit() as usize, - )? 
- .collect_summary(summary_slot_index), - ) + if partition_by.is_empty() { + Box::new( + BatchTopNExecutor::new( + config.clone(), + executor, + order_exprs_def, + order_is_desc, + d.get_limit() as usize, + )? + .collect_summary(summary_slot_index), + ) + } else { + Box::new( + BatchPartitionTopNExecutor::new( + config.clone(), + executor, + partition_by, + order_exprs_def, + order_is_desc, + d.get_limit() as usize, + )? + .collect_summary(summary_slot_index), + ) + } } _ => { return Err(other_err!( diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 5ebc65baa25..dd6b7be2dba 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -38,7 +38,7 @@ pub struct BatchTopNExecutor { /// 1. `BatchTopNExecutor` is valid (i.e. not dropped). /// /// 2. The referenced `LazyBatchColumnVec` of the element must be valid, - /// which only happens when at least one of the row is in the `heap`. + /// which only happens when at least one of the row is in the `heap`. /// Note that rows may be swapped out from `heap` at any time. 
/// /// This field is placed before `order_exprs` and `src` because it relies on diff --git a/components/tidb_query_executors/src/util/top_n_heap.rs b/components/tidb_query_executors/src/util/top_n_heap.rs index 0cbef103e4d..57bd4b63017 100644 --- a/components/tidb_query_executors/src/util/top_n_heap.rs +++ b/components/tidb_query_executors/src/util/top_n_heap.rs @@ -51,17 +51,25 @@ impl TopNHeap { } #[allow(clippy::clone_on_copy)] - pub fn take_all(&mut self) -> LazyBatchColumnVec { + pub fn take_all_append_to(&mut self, result: &mut LazyBatchColumnVec) { let heap = std::mem::take(&mut self.heap); let sorted_items = heap.into_sorted_vec(); if sorted_items.is_empty() { - return LazyBatchColumnVec::empty(); + return; } - let mut result = sorted_items[0] - .source_data - .physical_columns - .clone_empty(sorted_items.len()); + // If it is a pure empty LazyBatchColumnVec, we need create columns on it first. + if result.columns_len() == 0 { + *result = sorted_items[0] + .source_data + .physical_columns + .clone_empty(self.heap.len()); + } + // todo: check schema is equal + assert_eq!( + result.columns_len(), + sorted_items[0].source_data.physical_columns.columns_len(), + ); for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { match result_column { @@ -101,6 +109,12 @@ impl TopNHeap { } result.assert_columns_equal_length(); + } + + #[allow(clippy::clone_on_copy)] + pub fn take_all(&mut self) -> LazyBatchColumnVec { + let mut result = LazyBatchColumnVec::empty(); + self.take_all_append_to(&mut result); result } } From ef09d272b13136517044125e4da3d1577fadb327 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Thu, 9 Feb 2023 08:05:58 -0800 Subject: [PATCH 518/676] polish the config region_split_size (#14182) ref tikv/tikv#12842 materialize the region_split_size in optimize_for Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: Ti Chi Robot --- components/raftstore/src/coprocessor/config.rs | 8 ++++++-- src/config/mod.rs | 2 -- 2 files 
changed, 6 insertions(+), 4 deletions(-) diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 3014c5c2358..137de200b71 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -130,8 +130,12 @@ impl Config { pub fn optimize_for(&mut self, raftstore_v2: bool) { // overwrite the default region_split_size when it's multi-rocksdb - if raftstore_v2 && self.region_split_size.is_none() { - self.region_split_size = Some(ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB)); + if self.region_split_size.is_none() { + if raftstore_v2 { + self.region_split_size = Some(ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB)); + } else { + self.region_split_size = Some(self.region_split_size()); + } } } diff --git a/src/config/mod.rs b/src/config/mod.rs index 5e923023ca0..a8e15c38642 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -5544,8 +5544,6 @@ mod tests { default_cfg.security.redact_info_log = Some(false); default_cfg.coprocessor.region_max_size = Some(default_cfg.coprocessor.region_max_size()); default_cfg.coprocessor.region_max_keys = Some(default_cfg.coprocessor.region_max_keys()); - default_cfg.coprocessor.region_split_size = - Some(default_cfg.coprocessor.region_split_size()); default_cfg.coprocessor.region_split_keys = Some(default_cfg.coprocessor.region_split_keys()); default_cfg.raft_store.raft_log_gc_size_limit = From 91b7a49773978c3258e48f310afa48d1e1d5c1ea Mon Sep 17 00:00:00 2001 From: zzm Date: Sat, 11 Feb 2023 14:40:00 +0800 Subject: [PATCH 519/676] cdc: fix cdc integration test ` test_rawkv_sacn` (#14147) close tikv/tikv#14146 make cdc integration test `test_rawkv_sacn` stable Signed-off-by: zeminzhou --- components/cdc/tests/integrations/test_cdc.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 73f46fe6427..9de1a77a8ad 
100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -613,16 +613,15 @@ fn test_cdc_scan_impl() { fn test_cdc_rawkv_scan() { let mut suite = TestSuite::new(3, ApiVersion::V2); - suite.set_tso(10); - suite.flush_causal_timestamp_for_region(1); let (k1, v1) = (b"rkey1".to_vec(), b"value1".to_vec()); suite.must_kv_put(1, k1, v1); let (k2, v2) = (b"rkey2".to_vec(), b"value2".to_vec()); suite.must_kv_put(1, k2, v2); - suite.set_tso(1000); + let ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); suite.flush_causal_timestamp_for_region(1); + let (k3, v3) = (b"rkey3".to_vec(), b"value3".to_vec()); suite.must_kv_put(1, k3.clone(), v3.clone()); @@ -631,7 +630,7 @@ fn test_cdc_rawkv_scan() { let mut req = suite.new_changedata_request(1); req.set_kv_api(ChangeDataRequestKvApi::RawKv); - req.set_checkpoint_ts(999); + req.set_checkpoint_ts(ts.into_inner()); let (mut req_tx, event_feed_wrap, receive_event) = new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); From e05f8f3ac0083e3f663da3afb24bccb5a3c0007c Mon Sep 17 00:00:00 2001 From: Yilin Chen Date: Mon, 13 Feb 2023 10:46:01 +0800 Subject: [PATCH 520/676] read_pool: avoid get inconsistent sample sum and count from histogram (#14202) close tikv/tikv#14200 We may get inconsistent state when calling `get_sample_sum` and `get_sample_count` in a non-atomic fashion. This may cause unexpected calculation result. This PR calls `Histogram::metric` which uses a lock inside to guarantee consistency. And the PR also adds protective checks to avoid dividing by zero. 
Signed-off-by: Yilin Chen --- src/read_pool.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/read_pool.rs b/src/read_pool.rs index 8ef2c4a9b25..1488ffada15 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -15,7 +15,7 @@ use file_system::{set_io_type, IoType}; use futures::{channel::oneshot, future::TryFutureExt}; use kvproto::{errorpb, kvrpcpb::CommandPri}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; -use prometheus::{Histogram, IntCounter, IntGauge}; +use prometheus::{core::Metric, Histogram, IntCounter, IntGauge}; use resource_control::{ControlledFuture, ResourceController}; use thiserror::Error; use tikv_util::{ @@ -355,11 +355,15 @@ impl TimeSliceInspector { // Now, we simplify the problem by merging samples from all levels. If we want // more accurate answer in the future, calculate for each level separately. for hist in &inner.time_slice_hist { - new_sum += Duration::from_secs_f64(hist.get_sample_sum()); - new_count += hist.get_sample_count(); + // Call `metric` to get a consistent snapshot of sum and count. + let metric_proto = hist.metric(); + let hist_proto = metric_proto.get_histogram(); + new_sum += Duration::from_secs_f64(hist_proto.get_sample_sum()); + new_count += hist_proto.get_sample_count(); } - let time_diff = new_sum - inner.last_sum; - if time_diff < MIN_TIME_DIFF { + let time_diff = new_sum.saturating_sub(inner.last_sum); + let count_diff = new_count.saturating_sub(inner.last_count); + if time_diff < MIN_TIME_DIFF || count_diff == 0 { return; } let new_val = time_diff / ((new_count - inner.last_count) as u32); From 61c5f1caf74aaeec36e0fd46fc137af672b37451 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 13 Feb 2023 13:52:00 +0800 Subject: [PATCH 521/676] *: fix tablet leak in flow control (#14197) close tikv/tikv#14196 Also add a tool to trace tablet leak. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + Cargo.toml | 1 + cmd/tikv-server/Cargo.toml | 1 + components/engine_rocks/Cargo.toml | 1 + components/engine_rocks/src/engine.rs | 148 +++++++++++++++++- components/engine_traits/Cargo.toml | 1 + components/engine_traits/src/tablet.rs | 5 +- src/server/status_server/mod.rs | 16 ++ .../singleton_flow_controller.rs | 86 ++++++---- .../flow_controller/tablet_flow_controller.rs | 72 ++++++--- 10 files changed, 267 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 046220980bc..6974e776935 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1616,6 +1616,7 @@ dependencies = [ "file_system", "keys", "kvproto", + "lazy_static", "log_wrappers", "protobuf", "raft", diff --git a/Cargo.toml b/Cargo.toml index 29337b4a002..bab7869a9d0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ publish = false [features] default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +trace-tablet-lifetime = ["engine_rocks/trace-lifetime"] tcmalloc = ["tikv_alloc/tcmalloc"] jemalloc = ["tikv_alloc/jemalloc", "engine_rocks/jemalloc"] mimalloc = ["tikv_alloc/mimalloc"] diff --git a/cmd/tikv-server/Cargo.toml b/cmd/tikv-server/Cargo.toml index c5b5cb6403c..4bba926a68e 100644 --- a/cmd/tikv-server/Cargo.toml +++ b/cmd/tikv-server/Cargo.toml @@ -7,6 +7,7 @@ publish = false [features] default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +trace-tablet-lifetime = ["tikv/trace-tablet-lifetime"] tcmalloc = ["server/tcmalloc"] jemalloc = ["server/jemalloc"] mimalloc = ["server/mimalloc"] diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 6775705e3e1..4c2b7bf5a52 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -5,6 +5,7 @@ edition = "2018" publish = false [features] +trace-lifetime = [] jemalloc = ["rocksdb/jemalloc"] 
portable = ["rocksdb/portable"] sse = ["rocksdb/sse"] diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 0e73de357e5..de29e676277 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -10,21 +10,154 @@ use crate::{ RocksEngineIterator, RocksSnapshot, }; +#[cfg(feature = "trace-lifetime")] +mod trace { + //! Trace tools for tablets. + //! + //! It's hard to know who is holding the rocksdb reference when trying to + //! debug why the tablet is not deleted. The module will record the + //! backtrace and thread name when the tablet is created or clone. So + //! after print all the backtrace, we can easily figure out who is + //! leaking the tablet. + //! + //! To use the feature, you need to compile tikv-server with + //! trace-tabelt-lifetime feature. For example, `env + //! ENABLE_FEATURES=trace-tablet-lifetime make release`. And then query the trace information by `curl http://ip:status_port/region/id?trace-tablet=1`. + + use std::{ + backtrace::Backtrace, + collections::BTreeMap, + ops::Bound::Included, + sync::{ + atomic::{AtomicU64, Ordering}, + Mutex, + }, + }; + + use rocksdb::DB; + + static CNT: AtomicU64 = AtomicU64::new(0); + + fn inc_id() -> u64 { + CNT.fetch_add(1, Ordering::Relaxed) + } + + struct BacktraceInfo { + bt: Backtrace, + name: String, + } + + impl BacktraceInfo { + fn default() -> Self { + BacktraceInfo { + bt: Backtrace::force_capture(), + name: std::thread::current().name().unwrap_or("").to_string(), + } + } + } + + #[derive(PartialEq, PartialOrd, Eq, Ord, Clone, Copy, Default, Debug)] + struct TabletTraceKey { + region_id: u64, + suffix: u64, + addr: u64, + alloc_id: u64, + } + + lazy_static::lazy_static! 
{ + static ref TABLET_TRACE: Mutex> = Mutex::new(BTreeMap::default()); + } + + pub fn list(id: u64) -> Vec { + let min = TabletTraceKey { + region_id: id, + suffix: 0, + addr: 0, + alloc_id: 0, + }; + let max = TabletTraceKey { + region_id: id, + suffix: u64::MAX, + addr: u64::MAX, + alloc_id: u64::MAX, + }; + let traces = TABLET_TRACE.lock().unwrap(); + traces + .range((Included(min), Included(max))) + .map(|(k, v)| { + format!( + "{}_{} {} {} {}", + k.region_id, k.suffix, k.addr, v.name, v.bt + ) + }) + .collect() + } + + #[derive(Debug)] + pub struct TabletTraceId(TabletTraceKey); + + impl TabletTraceId { + pub fn new(path: &str, db: &DB) -> Self { + let mut name = path.split('/'); + let name = name.next_back().unwrap(); + let parts: Vec<_> = name.split('_').collect(); + if parts.len() == 2 { + let id: u64 = parts[0].parse().unwrap(); + let suffix: u64 = parts[1].parse().unwrap(); + let bt = BacktraceInfo::default(); + let key = TabletTraceKey { + region_id: id, + suffix, + addr: db as *const _ as u64, + alloc_id: inc_id(), + }; + TABLET_TRACE.lock().unwrap().insert(key, bt); + Self(key) + } else { + Self(Default::default()) + } + } + } + + impl Clone for TabletTraceId { + fn clone(&self) -> Self { + if self.0.region_id != 0 { + let bt = BacktraceInfo::default(); + let mut key = self.0; + key.alloc_id = inc_id(); + TABLET_TRACE.lock().unwrap().insert(key, bt); + Self(key) + } else { + Self(self.0) + } + } + } + + impl Drop for TabletTraceId { + fn drop(&mut self) { + if self.0.region_id != 0 { + TABLET_TRACE.lock().unwrap().remove(&self.0); + } + } + } +} + #[derive(Clone, Debug)] pub struct RocksEngine { db: Arc, support_multi_batch_write: bool, + #[cfg(feature = "trace-lifetime")] + _id: trace::TabletTraceId, } impl RocksEngine { pub(crate) fn new(db: DB) -> RocksEngine { - RocksEngine::from_db(Arc::new(db)) - } - - pub fn from_db(db: Arc) -> Self { + let db = Arc::new(db); RocksEngine { - db: db.clone(), support_multi_batch_write: 
db.get_db_options().is_enable_multi_batch_write(), + #[cfg(feature = "trace-lifetime")] + _id: trace::TabletTraceId::new(db.path(), &db), + db, } } @@ -39,6 +172,11 @@ impl RocksEngine { pub fn support_multi_batch_write(&self) -> bool { self.support_multi_batch_write } + + #[cfg(feature = "trace-lifetime")] + pub fn trace(region_id: u64) -> Vec { + trace::list(region_id) + } } impl KvEngine for RocksEngine { diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index 00b3bb97b66..f235a4d545e 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -15,6 +15,7 @@ fail = "0.5" file_system = { workspace = true } keys = { workspace = true } kvproto = { workspace = true } +lazy_static = "1.0" log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 6bdfa97a6e6..2c2eb290b0e 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -44,10 +44,11 @@ impl CachedTablet { CachedTablet { latest: Arc::new(LatestTablet { data: Mutex::new(data.clone()), - version: AtomicU64::new(0), + version: AtomicU64::new(1), }), cache: data, - version: 0, + // We use 0 in release, so it needs to be intialized to 1. 
+ version: 1, } } diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index ad7779b121c..2beed27de8b 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -476,6 +476,22 @@ where )); } }; + + #[cfg(feature = "trace-tablet-lifetime")] + let body = { + let query = req.uri().query().unwrap_or(""); + let query_pairs: HashMap<_, _> = + url::form_urlencoded::parse(query.as_bytes()).collect(); + + let mut body = body; + if query_pairs.contains_key("trace-tablet") { + for s in engine_rocks::RocksEngine::trace(id) { + body.push(b'\n'); + body.extend_from_slice(s.as_bytes()); + } + }; + body + }; match Response::builder() .header("content-type", "application/json") .body(hyper::Body::from(body)) diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 2b36d6d8821..801d3d27280 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -442,8 +442,43 @@ impl Default for CfFlowChecker { } } +pub trait FlowControlFactorStore { + fn num_files_at_level(&self, region_id: u64, cf: &str, level: usize) -> u64; + fn num_immutable_mem_table(&self, region_id: u64, cf: &str) -> u64; + fn pending_compaction_bytes(&self, region_id: u64, cf: &str) -> u64; + fn cf_names(&self, region_id: u64) -> Vec; +} + +impl FlowControlFactorStore for E { + fn cf_names(&self, _region_id: u64) -> Vec { + CfNamesExt::cf_names(self) + .iter() + .map(|v| v.to_string()) + .collect() + } + + fn num_files_at_level(&self, _region_id: u64, cf: &str, level: usize) -> u64 { + match self.get_cf_num_files_at_level(cf, level) { + Ok(Some(n)) => n, + _ => 0, + } + } + fn num_immutable_mem_table(&self, _region_id: u64, cf: &str) -> u64 { + match self.get_cf_num_immutable_mem_table(cf) { + Ok(Some(n)) => n, + _ => 0, + } + } + fn pending_compaction_bytes(&self, _region_id: u64, cf: &str) -> u64 { 
+ match self.get_cf_pending_compaction_bytes(cf) { + Ok(Some(n)) => n, + _ => 0, + } + } +} + #[derive(CopyGetters, Setters)] -pub(super) struct FlowChecker { +pub(super) struct FlowChecker { pub soft_pending_compaction_bytes_limit: u64, hard_pending_compaction_bytes_limit: u64, memtables_threshold: u64, @@ -469,34 +504,38 @@ pub(super) struct FlowChecker FlowChecker { +impl FlowChecker { pub fn new( config: &FlowControlConfig, engine: E, discard_ratio: Arc, limiter: Arc, ) -> Self { - Self::new_with_tablet_suffix(config, engine, discard_ratio, limiter, 0) + Self::new_with_region_id(0, 0, config, engine, discard_ratio, limiter) } - pub fn new_with_tablet_suffix( + pub fn new_with_region_id( + region_id: u64, + tablet_suffix: u64, config: &FlowControlConfig, engine: E, discard_ratio: Arc, limiter: Arc, - tablet_suffix: u64, ) -> Self { let cf_checkers = engine - .cf_names() + .cf_names(region_id) .into_iter() - .map(|cf| (cf.to_owned(), CfFlowChecker::default())) + .map(|cf_name| (cf_name, CfFlowChecker::default())) .collect(); Self { + region_id, + tablet_suffix, soft_pending_compaction_bytes_limit: config.soft_pending_compaction_bytes_limit.0, hard_pending_compaction_bytes_limit: config.hard_pending_compaction_bytes_limit.0, memtables_threshold: config.memtables_threshold, @@ -510,7 +549,6 @@ impl FlowChecker { last_record_time: Instant::now_coarse(), last_speed: 0.0, wait_for_destroy_range_finish: false, - tablet_suffix, } } @@ -568,11 +606,8 @@ impl FlowChecker { for (cf, cf_checker) in &mut self.cf_checkers { if let Some(before) = cf_checker.pending_bytes_before_unsafe_destroy_range { let soft = (self.soft_pending_compaction_bytes_limit as f64).log2(); - let after = (self - .engine - .get_cf_pending_compaction_bytes(cf) - .unwrap_or(None) - .unwrap_or(0) as f64) + let after = (self.engine.pending_compaction_bytes(self.region_id, cf) + as f64) .log2(); assert!(before < soft); @@ -691,12 +726,7 @@ impl FlowChecker { // Because pending compaction bytes changes 
dramatically, take the // logarithm of pending compaction bytes to make the values fall into // a relative small range - let num = (self - .engine - .get_cf_pending_compaction_bytes(&cf) - .unwrap_or(None) - .unwrap_or(0) as f64) - .log2(); + let num = (self.engine.pending_compaction_bytes(self.region_id, &cf) as f64).log2(); let checker = self.cf_checkers.get_mut(&cf).unwrap(); checker.long_term_pending_bytes.observe(num); SCHED_PENDING_COMPACTION_BYTES_GAUGE @@ -756,11 +786,7 @@ impl FlowChecker { } fn on_memtable_change(&mut self, cf: &str) { - let num_memtables = self - .engine - .get_cf_num_immutable_mem_table(cf) - .unwrap_or(None) - .unwrap_or(0); + let num_memtables = self.engine.num_immutable_mem_table(self.region_id, cf); let checker = self.cf_checkers.get_mut(cf).unwrap(); SCHED_MEMTABLE_GAUGE .with_label_values(&[cf]) @@ -839,11 +865,7 @@ impl FlowChecker { } fn collect_l0_consumption_stats(&mut self, cf: &str, l0_bytes: u64) { - let num_l0_files = self - .engine - .get_cf_num_files_at_level(cf, 0) - .unwrap_or(None) - .unwrap_or(0); + let num_l0_files = self.engine.num_files_at_level(self.region_id, cf, 0); let checker = self.cf_checkers.get_mut(cf).unwrap(); checker.last_l0_bytes += l0_bytes; checker.long_term_num_l0_files.observe(num_l0_files); @@ -856,11 +878,7 @@ impl FlowChecker { } fn collect_l0_production_stats(&mut self, cf: &str, flush_bytes: u64) { - let num_l0_files = self - .engine - .get_cf_num_files_at_level(cf, 0) - .unwrap_or(None) - .unwrap_or(0); + let num_l0_files = self.engine.num_files_at_level(self.region_id, cf, 0); let checker = self.cf_checkers.get_mut(cf).unwrap(); checker.last_flush_bytes += flush_bytes; diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 922e986874a..556b5f4a8fa 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -17,9 +17,48 @@ use 
engine_traits::{CfNamesExt, FlowControlFactorsExt, TabletRegistry}; use rand::Rng; use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; -use super::singleton_flow_controller::{FlowChecker, Msg, RATIO_SCALE_FACTOR, TICK_DURATION}; +use super::singleton_flow_controller::{ + FlowChecker, FlowControlFactorStore, Msg, RATIO_SCALE_FACTOR, TICK_DURATION, +}; use crate::storage::config::FlowControlConfig; +pub struct TabletFlowFactorStore { + registry: TabletRegistry, +} + +impl TabletFlowFactorStore { + pub fn new(registry: TabletRegistry) -> Self { + Self { registry } + } + + fn query(&self, region_id: u64, f: impl Fn(&EK) -> engine_traits::Result>) -> u64 { + self.registry + .get(region_id) + .and_then(|mut c| c.latest().and_then(|t| f(t).ok().flatten())) + .unwrap_or(0) + } +} + +impl FlowControlFactorStore + for TabletFlowFactorStore +{ + fn cf_names(&self, _region_id: u64) -> Vec { + engine_traits::DATA_CFS + .iter() + .map(|s| s.to_string()) + .collect() + } + fn num_files_at_level(&self, region_id: u64, cf: &str, level: usize) -> u64 { + self.query(region_id, |t| t.get_cf_num_files_at_level(cf, level)) + } + fn num_immutable_mem_table(&self, region_id: u64, cf: &str) -> u64 { + self.query(region_id, |t| t.get_cf_num_immutable_mem_table(cf)) + } + fn pending_compaction_bytes(&self, region_id: u64, cf: &str) -> u64 { + self.query(region_id, |t| t.get_cf_pending_compaction_bytes(cf)) + } +} + type Limiters = Arc, Arc)>>>; pub struct TabletFlowController { enabled: Arc, @@ -59,8 +98,7 @@ impl TabletFlowController { Msg::Disable }) .unwrap(); - let flow_checkers: Arc>>> = - Arc::new(RwLock::new(HashMap::default())); + let flow_checkers = Arc::new(RwLock::new(HashMap::default())); let limiters: Limiters = Arc::new(RwLock::new(HashMap::default())); Self { enabled: Arc::new(AtomicBool::new(config.enable)), @@ -90,7 +128,7 @@ impl FlowInfoDispatcher { rx: Receiver, flow_info_receiver: Receiver, registry: TabletRegistry, - flow_checkers: Arc>>>, + 
flow_checkers: Arc>>>>, limiters: Limiters, config: FlowControlConfig, ) -> JoinHandle<()> { @@ -139,14 +177,10 @@ impl FlowInfoDispatcher { } Ok(FlowInfo::Created(region_id, suffix)) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); - let checker = match checkers.entry(region_id) { + match checkers.entry(region_id) { HashMapEntry::Occupied(e) => e.into_mut(), HashMapEntry::Vacant(e) => { - let engine = if let Some(mut c) = registry.get(region_id) && let Some(t) = c.latest() { - t.clone() - } else { - continue; - }; + let engine = TabletFlowFactorStore::new(registry.clone()); let mut v = limiters.as_ref().write().unwrap(); let discard_ratio = Arc::new(AtomicU32::new(0)); let limiter = v.entry(region_id).or_insert(( @@ -157,26 +191,16 @@ impl FlowInfoDispatcher { ), discard_ratio, )); - e.insert(FlowChecker::new_with_tablet_suffix( + e.insert(FlowChecker::new_with_region_id( + region_id, + suffix, &config, engine, limiter.1.clone(), limiter.0.clone(), - suffix, )) - }, - }; - // check if the checker's engine is exactly (region_id, suffix) - // if checker.suffix < suffix, it means its tablet is old and needs the - // refresh - if checker.tablet_suffix() < suffix { - let cached = registry.get(region_id); - // None means the region is destroyed. 
- if let Some(mut c) = cached && let Some(engine) = c.latest() { - checker.set_engine(engine.clone()); - checker.set_tablet_suffix(suffix); } - } + }; } Ok(FlowInfo::Destroyed(region_id, suffix)) => { let mut remove_limiter = false; From c9cebe6cd44255018202d2e56023df096038bc09 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 13 Feb 2023 16:20:01 +0800 Subject: [PATCH 522/676] raftstore-v2: integration test (#13989) ref tikv/tikv#12842 Implement integration test framework v2 Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- Cargo.lock | 49 + Cargo.toml | 4 +- .../cdc/tests/failpoints/test_observe.rs | 2 +- .../tests/integrations/test_flow_control.rs | 2 +- components/cdc/tests/mod.rs | 2 +- components/engine_traits/Cargo.toml | 1 + components/engine_traits/src/tablet.rs | 7 + components/raftstore-v2/src/lib.rs | 2 +- components/raftstore-v2/src/router/imp.rs | 12 + components/raftstore/src/store/snap.rs | 5 + components/resolved_ts/tests/mod.rs | 2 +- components/test_backup/src/lib.rs | 2 +- components/test_raftstore-v2/Cargo.toml | 68 + components/test_raftstore-v2/src/cluster.rs | 1518 +++++++++++++++++ components/test_raftstore-v2/src/lib.rs | 9 + components/test_raftstore-v2/src/node.rs | 423 +++++ components/test_raftstore-v2/src/server.rs | 726 ++++++++ .../src/transport_simulate.rs | 128 ++ components/test_raftstore-v2/src/util.rs | 191 +++ components/test_raftstore/src/cluster.rs | 47 +- components/test_raftstore/src/lib.rs | 2 +- .../test_raftstore/src/transport_simulate.rs | 2 +- components/test_raftstore/src/util.rs | 96 +- components/test_raftstore_macro/src/lib.rs | 10 +- src/server/engine_factory.rs | 8 + src/server/service/kv.rs | 6 + src/server/tablet_snap.rs | 39 + tests/Cargo.toml | 1 + tests/failpoints/cases/test_async_io.rs | 4 +- .../cases/test_cmd_epoch_checker.rs | 4 +- tests/failpoints/cases/test_conf_change.rs | 2 +- tests/failpoints/cases/test_early_apply.rs | 2 +- 
tests/failpoints/cases/test_merge.rs | 38 +- tests/failpoints/cases/test_pending_peers.rs | 4 +- tests/failpoints/cases/test_rawkv.rs | 2 +- tests/failpoints/cases/test_replica_read.rs | 22 +- .../cases/test_replica_stale_read.rs | 4 +- tests/failpoints/cases/test_snap.rs | 18 +- tests/failpoints/cases/test_split_region.rs | 10 +- tests/failpoints/cases/test_stale_peer.rs | 2 +- tests/failpoints/cases/test_stale_read.rs | 12 +- .../raftstore/test_conf_change.rs | 311 ++-- .../integrations/raftstore/test_flashback.rs | 4 +- .../integrations/raftstore/test_lease_read.rs | 24 +- tests/integrations/raftstore/test_merge.rs | 95 +- tests/integrations/raftstore/test_prevote.rs | 2 +- .../raftstore/test_region_info_accessor.rs | 2 +- .../raftstore/test_replica_read.rs | 18 +- tests/integrations/raftstore/test_single.rs | 54 +- tests/integrations/raftstore/test_snap.rs | 6 +- .../raftstore/test_split_region.rs | 286 ++-- .../raftstore/test_transfer_leader.rs | 145 +- .../raftstore/test_unsafe_recovery.rs | 4 +- tests/integrations/storage/test_raftkv.rs | 2 +- 54 files changed, 3774 insertions(+), 667 deletions(-) create mode 100644 components/test_raftstore-v2/Cargo.toml create mode 100644 components/test_raftstore-v2/src/cluster.rs create mode 100644 components/test_raftstore-v2/src/lib.rs create mode 100644 components/test_raftstore-v2/src/node.rs create mode 100644 components/test_raftstore-v2/src/server.rs create mode 100644 components/test_raftstore-v2/src/transport_simulate.rs create mode 100644 components/test_raftstore-v2/src/util.rs diff --git a/Cargo.lock b/Cargo.lock index 6974e776935..6083f14bad7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5942,6 +5942,54 @@ dependencies = [ "txn_types", ] +[[package]] +name = "test_raftstore-v2" +version = "0.0.1" +dependencies = [ + "api_version", + "backtrace", + "causal_ts", + "collections", + "concurrency_manager", + "crossbeam", + "encryption_export", + "engine_rocks", + "engine_rocks_helper", + "engine_test", + 
"engine_traits", + "fail", + "file_system", + "futures 0.3.15", + "grpcio", + "grpcio-health", + "keys", + "kvproto", + "lazy_static", + "log_wrappers", + "pd_client", + "protobuf", + "raft", + "raftstore", + "raftstore-v2", + "rand 0.8.5", + "resolved_ts", + "resource_control", + "resource_metering", + "security", + "server", + "slog", + "slog-global", + "tempfile", + "test_pd_client", + "test_raftstore", + "test_util", + "tikv", + "tikv_util", + "tokio", + "tokio-timer", + "txn_types", +] + [[package]] name = "test_raftstore_macro" version = "0.0.1" @@ -6062,6 +6110,7 @@ dependencies = [ "test_pd", "test_pd_client", "test_raftstore", + "test_raftstore-v2", "test_raftstore_macro", "test_sst_importer", "test_storage", diff --git a/Cargo.toml b/Cargo.toml index bab7869a9d0..63be8944f5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ cloud-azure = [ "encryption_export/cloud-azure", "sst_importer/cloud-azure", ] -testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport"] +testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport"] test-engine-kv-rocksdb = [ "engine_test/test-engine-kv-rocksdb" ] @@ -281,6 +281,7 @@ members = [ "components/test_pd", "components/test_pd_client", "components/test_raftstore", + "components/test_raftstore-v2", "components/test_raftstore_macro", "components/test_sst_importer", "components/test_storage", @@ -357,6 +358,7 @@ example_coprocessor_plugin = { path = "components/test_coprocessor_plugin/exampl test_pd = { path = "components/test_pd" } test_pd_client = { path = "components/test_pd_client" } test_raftstore = { path = "components/test_raftstore", default-features = false } +test_raftstore-v2 = { path = "components/test_raftstore-v2", default-features = false } test_raftstore_macro = { path = "components/test_raftstore_macro" } test_sst_importer = { path = "components/test_sst_importer" } test_storage = { path = "components/test_storage", 
default-features = false } diff --git a/components/cdc/tests/failpoints/test_observe.rs b/components/cdc/tests/failpoints/test_observe.rs index 8c418558dcc..480fcc4582f 100644 --- a/components/cdc/tests/failpoints/test_observe.rs +++ b/components/cdc/tests/failpoints/test_observe.rs @@ -130,7 +130,7 @@ fn test_observe_duplicate_cmd_impl() { #[allow(dead_code)] fn test_delayed_change_cmd() { let mut cluster = new_server_cluster(1, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(100); cluster.pd_client.disable_default_operator(); let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); diff --git a/components/cdc/tests/integrations/test_flow_control.rs b/components/cdc/tests/integrations/test_flow_control.rs index 56cb43e06c4..fdfd136d9c7 100644 --- a/components/cdc/tests/integrations/test_flow_control.rs +++ b/components/cdc/tests/integrations/test_flow_control.rs @@ -15,7 +15,7 @@ use crate::{new_event_feed, TestSuiteBuilder}; fn test_cdc_congest() { let mut cluster = new_server_cluster(1, 1); // Increase the Raft tick interval to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); let memory_quota = 1024; // 1KB let mut suite = TestSuiteBuilder::new() .cluster(cluster) diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 77e50bb10b2..843b6b2f1d0 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -236,7 +236,7 @@ impl TestSuite { pub fn new(count: usize, api_version: ApiVersion) -> TestSuite { let mut cluster = new_server_cluster_with_api_ver(1, count, api_version); // Increase the Raft tick interval to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); // Disable background renew to make timestamp predictable. configure_for_causal_ts(&mut cluster, "0s", 1); diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index f235a4d545e..664bc72afc5 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -6,6 +6,7 @@ publish = false [features] failpoints = ["fail/failpoints"] +testexport = [] [dependencies] case_macros = { workspace = true } diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 2c2eb290b0e..79512a99f64 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -13,6 +13,8 @@ use collections::HashMap; use kvproto::metapb::Region; use tikv_util::box_err; +#[cfg(any(test, feature = "testexport"))] +use crate::StateStorage; use crate::{Error, FlushState, Result}; #[derive(Debug)] @@ -146,6 +148,11 @@ pub trait TabletFactory: Send + Sync { /// Check if the tablet with specified path exists fn exists(&self, path: &Path) -> bool; + + #[cfg(any(test, feature = "testexport"))] + fn set_state_storage(&self, _: Arc) { + unimplemented!() + } } pub struct SingletonFactory { diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index b82b6de3931..8af6b57e9bc 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -39,6 +39,6 @@ pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; -pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; +pub use operation::{write_initial_states, SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; pub use worker::pd::{PdReporter, Task as PdTask}; diff --git 
a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 315f8a0d8eb..a9a8b23b571 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -156,4 +156,16 @@ impl RaftRouter { { self.local_reader.snapshot(req) } + + #[cfg(any(test, feature = "testexport"))] + pub fn new_with_store_meta( + router: StoreRouter, + store_meta: Arc>>, + ) -> Self { + let logger = router.logger().clone(); + RaftRouter { + router: router.clone(), + local_reader: LocalReader::new(store_meta, router, logger), + } + } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 358ec716195..5f971818e9a 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2021,6 +2021,11 @@ impl TabletSnapManager { } Ok(total_size) } + + #[inline] + pub fn root_path(&self) -> &Path { + self.base.as_path() + } } #[cfg(test)] diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index e8d2a6429ba..314a11db1a2 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -40,7 +40,7 @@ impl TestSuite { pub fn new(count: usize) -> Self { let mut cluster = new_server_cluster(1, count); // Increase the Raft tick interval to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); Self::with_cluster(count, cluster) } diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index 3409a6ef366..cb669070b9e 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -73,7 +73,7 @@ impl TestSuite { pub fn new(count: usize, sst_max_size: u64, api_version: ApiVersion) -> TestSuite { let mut cluster = new_server_cluster_with_api_ver(1, count, api_version); // Increase the Raft tick interval to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(100), None); + configure_for_lease_read(&mut cluster.cfg, Some(100), None); cluster.run(); let mut endpoints = HashMap::default(); diff --git a/components/test_raftstore-v2/Cargo.toml b/components/test_raftstore-v2/Cargo.toml new file mode 100644 index 00000000000..9ccfdb93cfe --- /dev/null +++ b/components/test_raftstore-v2/Cargo.toml @@ -0,0 +1,68 @@ +[package] +name = "test_raftstore-v2" +version = "0.0.1" +edition = "2018" +publish = false + +[features] +default = ["test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] +cloud-aws = ["encryption_export/cloud-aws"] +cloud-gcp = ["encryption_export/cloud-gcp"] +cloud-azure = ["encryption_export/cloud-azure"] +test-engine-kv-rocksdb = [ + "raftstore/test-engine-kv-rocksdb" +] +test-engine-raft-raft-engine = [ + "raftstore/test-engine-raft-raft-engine" +] +test-engines-rocksdb = [ + "raftstore/test-engines-rocksdb", +] +test-engines-panic = [ + "raftstore/test-engines-panic", +] + +[dependencies] +api_version = { workspace = true } +backtrace = "0.3" +causal_ts = { workspace = true, features = ["testexport"] } +collections = { workspace = true } +concurrency_manager = { workspace = true } +crossbeam = "0.8" +encryption_export = { workspace = true } +engine_rocks = { workspace = true } +engine_rocks_helper = { workspace = true } +engine_test = { workspace = true } +engine_traits = { workspace = true } +fail = "0.5" +file_system = { workspace = true } +futures = "0.3" +grpcio = { workspace = true } +grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +keys = { workspace = true } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1.3" +log_wrappers = { workspace = true } +pd_client = { workspace = true } +protobuf = { version = "2.8", features = ["bytes"] } +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } +raftstore = { 
workspace = true, features = ["testexport"] } +raftstore-v2 = { workspace = true, features = ["testexport"] } +rand = "0.8" +resolved_ts = { workspace = true } +resource_control = { workspace = true } +resource_metering = { workspace = true } +security = { workspace = true } +server = { workspace = true } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +# better to not use slog-global, but pass in the logger +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tempfile = "3.0" +test_pd_client = { workspace = true } +test_raftstore = { workspace = true } +test_util = { workspace = true } +tikv = { workspace = true } +tikv_util = { workspace = true } +tokio = { version = "1.5", features = ["rt-multi-thread"] } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +txn_types = { workspace = true } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs new file mode 100644 index 00000000000..c935040055f --- /dev/null +++ b/components/test_raftstore-v2/src/cluster.rs @@ -0,0 +1,1518 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + collections::hash_map::Entry as MapEntry, + result, + sync::{Arc, Mutex, RwLock}, + thread, + time::Duration, +}; + +use collections::{HashMap, HashSet}; +use encryption_export::DataKeyManager; +use engine_rocks::{RocksDbVector, RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_test::raft::RaftTestEngine; +use engine_traits::{ + Iterable, KvEngine, MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, + ReadOptions, SyncMutable, TabletRegistry, CF_DEFAULT, +}; +use file_system::IoRateLimiter; +use futures::{compat::Future01CompatExt, executor::block_on, select, FutureExt}; +use keys::data_key; +use kvproto::{ + errorpb::Error as PbError, + kvrpcpb::ApiVersion, + metapb::{self, Buckets, PeerRole, RegionEpoch}, + raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, Request, + Response, StatusCmdType, + }, + raft_serverpb::{PeerState, RaftApplyState, RegionLocalState, StoreIdent}, +}; +use pd_client::PdClient; +use raftstore::{ + store::{ + cmd_resp, initial_region, util::check_key_in_region, Bucket, BucketRange, Callback, + RegionSnapshot, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, + }, + Error, Result, +}; +use raftstore_v2::{ + router::{PeerMsg, QueryResult}, + write_initial_states, SimpleWriteEncoder, StoreMeta, StoreRouter, +}; +use resource_control::ResourceGroupManager; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{ + is_error_response, new_admin_request, new_delete_cmd, new_delete_range_cmd, new_get_cf_cmd, + new_peer, new_put_cf_cmd, new_region_detail_cmd, new_region_leader_cmd, new_request, + new_snap_cmd, new_status_request, new_store, new_tikv_config_with_api_ver, + new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, + RawEngine, +}; +use tikv::server::Result as ServerResult; +use tikv_util::{ + box_err, box_try, debug, error, safe_panic, thread_group::GroupProperties, time::Instant, + 
timer::GLOBAL_TIMER_HANDLE, warn, worker::LazyWorker, HandyRwLock, +}; + +use crate::create_test_engine; + +// We simulate 3 or 5 nodes, each has a store. +// Sometimes, we use fixed id to test, which means the id +// isn't allocated by pd, and node id, store id are same. +// E,g, for node 1, the node id and store id are both 1. +pub trait Simulator { + // Pass 0 to let pd allocate a node id if db is empty. + // If node id > 0, the node must be created in db already, + // and the node id must be the same as given argument. + // Return the node id. + // TODO: we will rename node name here because now we use store only. + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult; + + fn stop_node(&mut self, node_id: u64); + fn get_node_ids(&self) -> HashSet; + fn add_send_filter(&mut self, node_id: u64, filter: Box); + fn clear_send_filters(&mut self, node_id: u64); + fn get_router(&self, node_id: u64) -> Option>; + fn get_snap_dir(&self, node_id: u64) -> String; + + fn read(&mut self, request: RaftCmdRequest, timeout: Duration) -> Result { + let mut req_clone = request.clone(); + req_clone.clear_requests(); + req_clone.mut_requests().push(new_snap_cmd()); + match self.snapshot(req_clone, timeout) { + Ok(snap) => { + let requests = request.get_requests(); + let mut response = RaftCmdResponse::default(); + let mut responses = Vec::with_capacity(requests.len()); + for req in requests { + let cmd_type = req.get_cmd_type(); + match cmd_type { + CmdType::Get => { + let mut resp = Response::default(); + let key = req.get_get().get_key(); + let cf = req.get_get().get_cf(); + let region = snap.get_region(); + + if let Err(e) = check_key_in_region(key, region) { + return Ok(cmd_resp::new_error(e)); + } + + let res = if cf.is_empty() { + snap.get_value(key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {} with cf {}: {:?}", + 
snap.get_region().get_id(), + log_wrappers::Value::key(key), + cf, + e + ) + }) + } else { + snap.get_value_cf(cf, key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {}: {:?}", + snap.get_region().get_id(), + log_wrappers::Value::key(key), + e + ) + }) + }; + if let Some(res) = res { + resp.mut_get().set_value(res.to_vec()); + } + resp.set_cmd_type(cmd_type); + responses.push(resp); + } + _ => unimplemented!(), + } + } + response.set_responses(responses.into()); + + Ok(response) + } + Err(e) => Ok(e), + } + } + + fn snapshot( + &mut self, + request: RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result::Snapshot>, RaftCmdResponse>; + + fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()>; + + fn call_query(&self, request: RaftCmdRequest, timeout: Duration) -> Result { + let node_id = request.get_header().get_peer().get_store_id(); + self.call_query_on_node(node_id, request, timeout) + } + + fn call_query_on_node( + &self, + node_id: u64, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let region_id = request.get_header().get_region_id(); + let (msg, sub) = PeerMsg::raft_query(request); + match self.async_peer_msg_on_node(node_id, region_id, msg) { + Ok(()) => {} + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e.into()); + return Ok(resp); + } + } + + let timeout_f = GLOBAL_TIMER_HANDLE.delay(std::time::Instant::now() + timeout); + // todo: unwrap? + match block_on(async move { + select! 
{ + res = sub.result().fuse() => Ok(res.unwrap()), + _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), + + } + }).unwrap() { + QueryResult::Read(_) => unreachable!(), + QueryResult::Response(resp) => Ok(resp), + } + } + + fn call_command(&self, request: RaftCmdRequest, timeout: Duration) -> Result { + let node_id = request.get_header().get_peer().get_store_id(); + self.call_command_on_node(node_id, request, timeout) + } + + fn call_command_on_node( + &self, + node_id: u64, + mut request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let region_id = request.get_header().get_region_id(); + + let (msg, sub) = if request.has_admin_request() { + PeerMsg::admin_command(request) + } else { + let requests = request.get_requests(); + let mut write_encoder = SimpleWriteEncoder::with_capacity(64); + for req in requests { + match req.get_cmd_type() { + CmdType::Put => { + let put = req.get_put(); + write_encoder.put(put.get_cf(), put.get_key(), put.get_value()); + } + CmdType::Delete => { + let delete = req.get_delete(); + write_encoder.delete(delete.get_cf(), delete.get_key()); + } + CmdType::DeleteRange => { + unimplemented!() + } + _ => unreachable!(), + } + } + PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + }; + + match self.async_peer_msg_on_node(node_id, region_id, msg) { + Ok(()) => {} + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e.into()); + return Ok(resp); + } + } + + let timeout_f = GLOBAL_TIMER_HANDLE.delay(std::time::Instant::now() + timeout); + block_on(async move { + select! { + // todo: unwrap? 
+ res = sub.result().fuse() => Ok(res.unwrap()), + _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), + + } + }) + } +} + +pub struct Cluster { + pub cfg: Config, + leaders: HashMap, + pub count: usize, + + pub paths: Vec, + pub engines: Vec<(TabletRegistry, RaftTestEngine)>, + pub tablet_registries: HashMap>, + pub raft_engines: HashMap, + pub store_metas: HashMap>>>, + key_managers: Vec>>, + pub io_rate_limiter: Option>, + key_managers_map: HashMap>>, + group_props: HashMap, + pub sst_workers: Vec>, + pub sst_workers_map: HashMap, + pub kv_statistics: Vec>, + pub raft_statistics: Vec>>, + pub sim: Arc>, + pub pd_client: Arc, + resource_manager: Option>, +} + +impl Cluster { + pub fn new( + id: u64, + count: usize, + sim: Arc>, + pd_client: Arc, + api_version: ApiVersion, + ) -> Cluster { + Cluster { + cfg: Config { + tikv: new_tikv_config_with_api_ver(id, api_version), + prefer_mem: true, + }, + count, + tablet_registries: HashMap::default(), + key_managers_map: HashMap::default(), + group_props: HashMap::default(), + raft_engines: HashMap::default(), + store_metas: HashMap::default(), + leaders: HashMap::default(), + kv_statistics: vec![], + raft_statistics: vec![], + sst_workers: vec![], + sst_workers_map: HashMap::default(), + paths: vec![], + engines: vec![], + key_managers: vec![], + io_rate_limiter: None, + resource_manager: Some(Arc::new(ResourceGroupManager::default())), + sim, + pd_client, + } + } + + pub fn id(&self) -> u64 { + self.cfg.server.cluster_id + } + + // Bootstrap the store with fixed ID (like 1, 2, .. 5) and + // initialize first region in all stores, then start the cluster. + pub fn run(&mut self) { + self.create_engines(); + self.bootstrap_region().unwrap(); + self.start().unwrap(); + } + + // Bootstrap the store with fixed ID (like 1, 2, .. 5) and + // initialize first region in store 1, then start the cluster. 
+ pub fn run_conf_change(&mut self) -> u64 { + self.create_engines(); + let region_id = self.bootstrap_conf_change(); + self.start().unwrap(); + region_id + } + + pub fn create_engines(&mut self) { + self.io_rate_limiter = Some(Arc::new( + self.cfg + .storage + .io_rate_limit + .build(true /* enable_statistics */), + )); + for id in 1..self.count + 1 { + self.create_engine(Some((self.id(), id as u64))); + } + } + + // id indicates cluster id store_id + fn create_engine(&mut self, id: Option<(u64, u64)>) { + let (reg, raft_engine, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = + create_test_engine(id, self.io_rate_limiter.clone(), &self.cfg); + self.engines.push((reg, raft_engine)); + self.key_managers.push(key_manager); + self.paths.push(dir); + self.sst_workers.push(sst_worker); + self.kv_statistics.push(kv_statistics); + self.raft_statistics.push(raft_statistics); + } + + pub fn start(&mut self) -> ServerResult<()> { + if self.cfg.raft_store.store_io_pool_size == 0 { + // v2 always use async write. + self.cfg.raft_store.store_io_pool_size = 1; + } + + let node_ids: Vec = self.tablet_registries.iter().map(|(&id, _)| id).collect(); + for node_id in node_ids { + self.run_node(node_id)?; + } + + // Try start new nodes. 
+ for id in self.raft_engines.len()..self.count { + let id = id as u64 + 1; + self.create_engine(Some((self.id(), id))); + let (tablet_registry, raft_engine) = self.engines.last().unwrap().clone(); + + let key_mgr = self.key_managers.last().unwrap().clone(); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + + let props = GroupProperties::default(); + tikv_util::thread_group::set_properties(Some(props.clone())); + + // todo: GroupProperties + let mut sim = self.sim.wl(); + let node_id = sim.run_node( + id, + self.cfg.clone(), + store_meta.clone(), + raft_engine.clone(), + tablet_registry.clone(), + &self.resource_manager, + )?; + assert_eq!(id, node_id); + self.group_props.insert(node_id, props); + self.raft_engines.insert(node_id, raft_engine.clone()); + self.tablet_registries + .insert(node_id, tablet_registry.clone()); + self.store_metas.insert(node_id, store_meta); + self.key_managers_map.insert(node_id, key_mgr); + } + + Ok(()) + } + + pub fn run_node(&mut self, node_id: u64) -> ServerResult<()> { + debug!("starting node {}", node_id); + let tablet_registry = self.tablet_registries[&node_id].clone(); + let raft_engine = self.raft_engines[&node_id].clone(); + let cfg = self.cfg.clone(); + + // if let Some(labels) = self.labels.get(&node_id) { + // cfg.server.labels = labels.to_owned(); + // } + let store_meta = match self.store_metas.entry(node_id) { + MapEntry::Occupied(o) => { + let mut meta = o.get().lock().unwrap(); + *meta = StoreMeta::new(node_id); + o.get().clone() + } + MapEntry::Vacant(v) => v + .insert(Arc::new(Mutex::new(StoreMeta::new(node_id)))) + .clone(), + }; + + let props = GroupProperties::default(); + self.group_props.insert(node_id, props.clone()); + tikv_util::thread_group::set_properties(Some(props)); + + debug!("calling run node"; "node_id" => node_id); + self.sim.wl().run_node( + node_id, + cfg, + store_meta, + raft_engine, + tablet_registry, + &self.resource_manager, + )?; + debug!("node {} started", node_id); + Ok(()) + } + 
+ pub fn stop_node(&mut self, node_id: u64) { + debug!("stopping node {}", node_id); + self.group_props[&node_id].mark_shutdown(); + + // Simulate shutdown behavior of server shutdown. It's not enough to just set + // the map above as current thread may also query properties during shutdown. + let previous_prop = tikv_util::thread_group::current_properties(); + tikv_util::thread_group::set_properties(Some(self.group_props[&node_id].clone())); + match self.sim.write() { + Ok(mut sim) => sim.stop_node(node_id), + Err(_) => safe_panic!("failed to acquire write lock."), + } + self.pd_client.shutdown_store(node_id); + + let mut regions = vec![]; + let reg = &self.tablet_registries[&node_id]; + reg.for_each_opened_tablet(|region_id, _| { + regions.push(region_id); + true + }); + for region_id in regions { + if let Some(mut tablet) = reg.get(region_id) { + if let Some(tablet) = tablet.latest() { + let mut tried = 0; + while tried < 10 { + if Arc::strong_count(tablet.as_inner()) <= 3 { + break; + } + thread::sleep(Duration::from_millis(10)); + tried += 1; + } + } + } + reg.remove(region_id); + } + + debug!("node {} stopped", node_id); + tikv_util::thread_group::set_properties(previous_prop); + } + + /// Multiple nodes with fixed node id, like node 1, 2, .. 5, + /// First region 1 is in all stores with peer 1, 2, .. 5. + /// Peer 1 is in node 1, store 1, etc. + /// + /// Must be called after `create_engines`. 
+ pub fn bootstrap_region(&mut self) -> Result<()> { + for (i, (tablet_registry, raft_engine)) in self.engines.iter().enumerate() { + let id = i as u64 + 1; + self.tablet_registries.insert(id, tablet_registry.clone()); + self.raft_engines.insert(id, raft_engine.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + self.store_metas.insert(id, store_meta); + self.key_managers_map + .insert(id, self.key_managers[i].clone()); + self.sst_workers_map.insert(id, i); + } + + let mut region = metapb::Region::default(); + region.set_id(1); + region.set_start_key(keys::EMPTY_KEY.to_vec()); + region.set_end_key(keys::EMPTY_KEY.to_vec()); + region.mut_region_epoch().set_version(INIT_EPOCH_VER); + region.mut_region_epoch().set_conf_ver(INIT_EPOCH_CONF_VER); + + for &id in self.raft_engines.keys() { + let peer = new_peer(id, id); + region.mut_peers().push(peer.clone()); + } + + for raft_engine in self.raft_engines.values() { + let mut wb = raft_engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion)?; + write_initial_states(&mut wb, region.clone())?; + box_try!(raft_engine.consume(&mut wb, true)); + } + + self.bootstrap_cluster(region); + + Ok(()) + } + + pub fn bootstrap_conf_change(&mut self) -> u64 { + for (i, (tablet_registry, raft_engine)) in self.engines.iter().enumerate() { + let id = i as u64 + 1; + self.tablet_registries.insert(id, tablet_registry.clone()); + self.raft_engines.insert(id, raft_engine.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(id))); + self.store_metas.insert(id, store_meta); + self.key_managers_map + .insert(id, self.key_managers[i].clone()); + self.sst_workers_map.insert(id, i); + } + + let node_id = 1; + let region_id = 1; + let peer_id = 1; + + let region = initial_region(node_id, region_id, peer_id); + let raft_engine = self.raft_engines[&node_id].clone(); + let mut wb = raft_engine.log_batch(10); + wb.put_prepare_bootstrap_region(®ion).unwrap(); + write_initial_states(&mut wb, region.clone()).unwrap(); + 
raft_engine.consume(&mut wb, true).unwrap(); + + self.bootstrap_cluster(region); + + region_id + } + + // This is only for fixed id test + fn bootstrap_cluster(&mut self, region: metapb::Region) { + self.pd_client + .bootstrap_cluster(new_store(1, "".to_owned()), region) + .unwrap(); + for id in self.raft_engines.keys() { + let store = new_store(*id, "".to_owned()); + // todo: labels + self.pd_client.put_store(store).unwrap(); + } + } + + pub fn get_engine(&self, node_id: u64) -> WrapFactory { + WrapFactory::new( + self.pd_client.clone(), + self.raft_engines[&node_id].clone(), + self.tablet_registries[&node_id].clone(), + ) + } + + // mixed read and write requests are not supportted + pub fn call_command( + &mut self, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let mut is_read = false; + let mut not_read = false; + for req in request.get_requests() { + match req.get_cmd_type() { + CmdType::Get | CmdType::Snap | CmdType::ReadIndex => { + is_read = true; + } + _ => { + not_read = true; + } + } + } + let ret = if is_read { + assert!(!not_read); + self.sim.wl().read(request.clone(), timeout) + } else if request.has_status_request() { + self.sim.wl().call_query(request.clone(), timeout) + } else { + self.sim.wl().call_command(request.clone(), timeout) + }; + match ret { + Err(e) => { + warn!("failed to call command {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + + pub fn call_command_on_leader( + &mut self, + mut request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + let timer = Instant::now(); + let region_id = request.get_header().get_region_id(); + loop { + let leader = match self.leader_of_region(region_id) { + None => return Err(Error::NotLeader(region_id, None)), + Some(l) => l, + }; + request.mut_header().set_peer(leader); + let resp = match self.call_command(request.clone(), timeout) { + e @ Err(_) => return e, + Ok(resp) => resp, + }; + if self.refresh_leader_if_needed(&resp, region_id) + && timer.saturating_elapsed() < 
timeout + { + warn!( + "{:?} is no longer leader, let's retry", + request.get_header().get_peer() + ); + continue; + } + return Ok(resp); + } + } + + pub fn call_command_on_node( + &self, + node_id: u64, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + match self + .sim + .rl() + .call_command_on_node(node_id, request.clone(), timeout) + { + Err(e) => { + warn!("failed to call command {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + + pub fn leader_of_region(&mut self, region_id: u64) -> Option { + let timer = Instant::now_coarse(); + let timeout = Duration::from_secs(5); + let mut store_ids = None; + while timer.saturating_elapsed() < timeout { + match self.voter_store_ids_of_region(region_id) { + None => thread::sleep(Duration::from_millis(10)), + Some(ids) => { + store_ids = Some(ids); + break; + } + } + } + let store_ids = store_ids?; + if let Some(l) = self.leaders.get(®ion_id) { + // leader may be stopped in some tests. + if self.valid_leader_id(region_id, l.get_store_id()) { + return Some(l.clone()); + } + } + self.reset_leader_of_region(region_id); + let mut leader = None; + let mut leaders = HashMap::default(); + + let node_ids = self.sim.rl().get_node_ids(); + // For some tests, we stop the node but pd still has this information, + // and we must skip this. + let alive_store_ids: Vec<_> = store_ids + .iter() + .filter(|id| node_ids.contains(id)) + .cloned() + .collect(); + while timer.saturating_elapsed() < timeout { + for store_id in &alive_store_ids { + let l = match self.query_leader(*store_id, region_id, Duration::from_secs(1)) { + None => continue, + Some(l) => l, + }; + leaders + .entry(l.get_id()) + .or_insert((l, vec![])) + .1 + .push(*store_id); + } + if let Some((_, (l, c))) = leaders.iter().max_by_key(|(_, (_, c))| c.len()) { + if c.contains(&l.get_store_id()) { + leader = Some(l.clone()); + // Technically, correct calculation should use two quorum when in joint + // state. Here just for simplicity. 
+ if c.len() > store_ids.len() / 2 { + break; + } + } + } + debug!("failed to detect leaders"; "leaders" => ?leaders, "store_ids" => ?store_ids); + sleep_ms(10); + leaders.clear(); + } + + if let Some(l) = leader { + self.leaders.insert(region_id, l); + } + + self.leaders.get(®ion_id).cloned() + } + + pub fn query_leader( + &mut self, + store_id: u64, + region_id: u64, + timeout: Duration, + ) -> Option { + // To get region leader, we don't care real peer id, so use 0 instead. + let peer = new_peer(store_id, 0); + let find_leader = new_status_request(region_id, peer, new_region_leader_cmd()); + let mut resp = match self.call_command(find_leader, timeout) { + Ok(resp) => resp, + Err(err) => { + error!( + "fail to get leader of region {} on store {}, error: {:?}", + region_id, store_id, err + ); + return None; + } + }; + let mut region_leader = resp.take_status_response().take_region_leader(); + // NOTE: node id can't be 0. + if self.valid_leader_id(region_id, region_leader.get_leader().get_store_id()) { + Some(region_leader.take_leader()) + } else { + None + } + } + + fn valid_leader_id(&self, region_id: u64, leader_store_id: u64) -> bool { + let store_ids = match self.voter_store_ids_of_region(region_id) { + None => return false, + Some(ids) => ids, + }; + let node_ids = self.sim.rl().get_node_ids(); + store_ids.contains(&leader_store_id) && node_ids.contains(&leader_store_id) + } + + fn voter_store_ids_of_region(&self, region_id: u64) -> Option> { + block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .map(|region| { + region + .get_peers() + .iter() + .flat_map(|p| { + if p.get_role() != PeerRole::Learner { + Some(p.get_store_id()) + } else { + None + } + }) + .collect() + }) + } + + pub fn reset_leader_of_region(&mut self, region_id: u64) { + self.leaders.remove(®ion_id); + } + + // If the resp is "not leader error", get the real leader. + // Otherwise reset or refresh leader if needed. + // Returns if the request should retry. 
+ fn refresh_leader_if_needed(&mut self, resp: &RaftCmdResponse, region_id: u64) -> bool { + if !is_error_response(resp) { + return false; + } + + let err = resp.get_header().get_error(); + if err + .get_message() + .contains("peer has not applied to current term") + { + // leader peer has not applied to current term + return true; + } + + // If command is stale, leadership may have changed. + // EpochNotMatch is not checked as leadership is checked first in raftstore. + if err.has_stale_command() { + self.reset_leader_of_region(region_id); + return true; + } + + if !err.has_not_leader() { + return false; + } + let err = err.get_not_leader(); + if !err.has_leader() { + self.reset_leader_of_region(region_id); + return true; + } + self.leaders.insert(region_id, err.get_leader().clone()); + true + } + + pub fn request( + &mut self, + key: &[u8], + reqs: Vec, + read_quorum: bool, + timeout: Duration, + ) -> RaftCmdResponse { + let timer = Instant::now(); + let mut tried_times = 0; + while tried_times < 2 || timer.saturating_elapsed() < timeout { + tried_times += 1; + let mut region = self.get_region(key); + let region_id = region.get_id(); + let req = new_request( + region_id, + region.take_region_epoch(), + reqs.clone(), + read_quorum, + ); + let result = self.call_command_on_leader(req, timeout); + + let resp = match result { + e @ Err(Error::Timeout(_)) + | e @ Err(Error::NotLeader(..)) + | e @ Err(Error::StaleCommand) => { + warn!("call command failed, retry it"; "err" => ?e); + sleep_ms(100); + continue; + } + Err(e) => panic!("call command failed {:?}", e), + Ok(resp) => resp, + }; + + if resp.get_header().get_error().has_epoch_not_match() { + warn!("seems split, let's retry"); + sleep_ms(100); + continue; + } + if resp + .get_header() + .get_error() + .get_message() + .contains("merging mode") + { + warn!("seems waiting for merge, let's retry"); + sleep_ms(100); + continue; + } + return resp; + } + panic!("request timeout"); + } + + pub fn get_region(&self, key: 
&[u8]) -> metapb::Region { + self.get_region_with(key, |_| true) + } + + pub fn get_region_id(&self, key: &[u8]) -> u64 { + self.get_region(key).get_id() + } + + // Get region ids of all opened tablets in a store + pub fn region_ids(&self, store_id: u64) -> Vec { + let mut ids = vec![]; + let registry = self.tablet_registries.get(&store_id).unwrap(); + registry.for_each_opened_tablet(|id, _| -> bool { + ids.push(id); + true + }); + ids + } + + pub fn scan( + &self, + store_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + mut f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + let region_ids = self.region_ids(store_id); + for id in region_ids { + self.scan_region(store_id, id, cf, start_key, end_key, fill_cache, &mut f)?; + } + Ok(()) + } + + // start_key and end_key should be `data key` + fn scan_region( + &self, + store_id: u64, + region_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + let tablet_registry = self.tablet_registries.get(&store_id).unwrap(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + .latest() + .unwrap() + .clone(); + + let region = block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .unwrap(); + let region_start_key: &[u8] = &data_key(region.get_start_key()); + let region_end_key: &[u8] = &data_key(region.get_end_key()); + + let amended_start_key = if start_key > region_start_key { + start_key + } else { + region_start_key + }; + let amended_end_key = if end_key < region_end_key || region_end_key.is_empty() { + end_key + } else { + region_end_key + }; + + tablet.scan(cf, amended_start_key, amended_end_key, fill_cache, f) + } + + pub fn get_raft_engine(&self, node_id: u64) -> RaftTestEngine { + self.raft_engines[&node_id].clone() + } + + pub fn get_region_epoch(&self, region_id: u64) -> RegionEpoch { 
+ block_on(self.pd_client.get_region_by_id(region_id)) + .unwrap() + .unwrap() + .take_region_epoch() + } + + pub fn region_detail(&mut self, region_id: u64, store_id: u64) -> RegionDetailResponse { + let status_cmd = new_region_detail_cmd(); + let peer = new_peer(store_id, 0); + let req = new_status_request(region_id, peer, status_cmd); + let resp = self.call_command(req, Duration::from_secs(5)); + assert!(resp.is_ok(), "{:?}", resp); + + let mut resp = resp.unwrap(); + assert!(resp.has_status_response()); + let mut status_resp = resp.take_status_response(); + assert_eq!(status_resp.get_cmd_type(), StatusCmdType::RegionDetail); + assert!(status_resp.has_region_detail()); + status_resp.take_region_detail() + } + + pub fn get(&mut self, key: &[u8]) -> Option> { + self.get_impl(CF_DEFAULT, key, false) + } + + pub fn get_cf(&mut self, cf: &str, key: &[u8]) -> Option> { + self.get_impl(cf, key, false) + } + + pub fn must_get(&mut self, key: &[u8]) -> Option> { + self.get_impl(CF_DEFAULT, key, true) + } + + fn get_impl(&mut self, cf: &str, key: &[u8], read_quorum: bool) -> Option> { + let mut resp = self.request( + key, + vec![new_get_cf_cmd(cf, key)], + read_quorum, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + assert_eq!(resp.get_responses().len(), 1); + assert_eq!(resp.get_responses()[0].get_cmd_type(), CmdType::Get); + if resp.get_responses()[0].has_get() { + Some(resp.mut_responses()[0].mut_get().take_value()) + } else { + None + } + } + + // Flush the cf of all opened tablets + pub fn must_flush_cf(&mut self, cf: &str, sync: bool) { + for registry in self.tablet_registries.values() { + registry.for_each_opened_tablet(|_id, cached_tablet| -> bool { + if let Some(db) = cached_tablet.latest() { + db.flush_cf(cf, sync).unwrap(); + } + true + }); + } + } + + // Get region when the `filter` returns true. 
+ pub fn get_region_with(&self, key: &[u8], filter: F) -> metapb::Region + where + F: Fn(&metapb::Region) -> bool, + { + for _ in 0..100 { + if let Ok(region) = self.pd_client.get_region(key) { + if filter(®ion) { + return region; + } + } + // We may meet range gap after split, so here we will + // retry to get the region again. + sleep_ms(20); + } + + panic!("find no region for {}", log_wrappers::hex_encode_upper(key)); + } + + pub fn must_put(&mut self, key: &[u8], value: &[u8]) { + self.must_put_cf(CF_DEFAULT, key, value); + } + + pub fn must_put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) { + if let Err(e) = self.batch_put(key, vec![new_put_cf_cmd(cf, key, value)]) { + panic!("has error: {:?}", e); + } + } + + pub fn put(&mut self, key: &[u8], value: &[u8]) -> result::Result<(), PbError> { + self.batch_put(key, vec![new_put_cf_cmd(CF_DEFAULT, key, value)]) + .map(|_| ()) + } + + pub fn batch_put( + &mut self, + region_key: &[u8], + reqs: Vec, + ) -> result::Result { + let resp = self.request(region_key, reqs, false, Duration::from_secs(5)); + if resp.get_header().has_error() { + Err(resp.get_header().get_error().clone()) + } else { + Ok(resp) + } + } + + pub fn must_delete(&mut self, key: &[u8]) { + self.must_delete_cf(CF_DEFAULT, key) + } + + pub fn must_delete_cf(&mut self, cf: &str, key: &[u8]) { + let resp = self.request( + key, + vec![new_delete_cmd(cf, key)], + false, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn must_delete_range_cf(&mut self, cf: &str, start: &[u8], end: &[u8]) { + let resp = self.request( + start, + vec![new_delete_range_cmd(cf, start, end)], + false, + Duration::from_secs(5), + ); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn must_notify_delete_range_cf(&mut self, cf: &str, start: &[u8], end: &[u8]) { + let mut req = new_delete_range_cmd(cf, start, end); + 
req.mut_delete_range().set_notify_only(true); + let resp = self.request(start, vec![req], false, Duration::from_secs(5)); + if resp.get_header().has_error() { + panic!("response {:?} has error", resp); + } + } + + pub fn apply_state(&self, region_id: u64, store_id: u64) -> RaftApplyState { + self.get_engine(store_id) + .get_apply_state(region_id) + .unwrap() + .unwrap() + } + + pub fn add_send_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_send_filter(node_id, filter); + } + + pub fn add_send_filter(&self, factory: F) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + for filter in factory.generate(node_id) { + sim.add_send_filter(node_id, filter); + } + } + } + + pub fn clear_send_filters(&self) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + sim.clear_send_filters(node_id); + } + } + + // it's so common that we provide an API for it + pub fn partition(&mut self, s1: Vec, s2: Vec) { + self.add_send_filter(PartitionFilterFactory::new(s1, s2)); + } + + pub fn transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + // todo(SpadeA): modify + let resp = self + .call_command_on_leader(transfer_leader, Duration::from_secs(500)) + .unwrap(); + assert_eq!( + resp.get_admin_response().get_cmd_type(), + AdminCmdType::TransferLeader, + "{:?}", + resp + ); + } + + pub fn must_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { + let timer = Instant::now(); + loop { + self.reset_leader_of_region(region_id); + let cur_leader = self.leader_of_region(region_id); + if let Some(ref cur_leader) = cur_leader { + if cur_leader.get_id() == leader.get_id() + && cur_leader.get_store_id() == leader.get_store_id() + { + return; + } + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + panic!( + "failed to transfer leader to [{}] {:?}, 
current leader: {:?}", + region_id, leader, cur_leader + ); + } + self.transfer_leader(region_id, leader.clone()); + } + } + + pub fn try_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) -> RaftCmdResponse { + let epoch = self.get_region_epoch(region_id); + let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); + self.call_command_on_leader(transfer_leader, Duration::from_secs(5)) + .unwrap() + } + + // It's similar to `ask_split`, the difference is the msg, it sends, is + // `Msg::SplitRegion`, and `region` will not be embedded to that msg. + // Caller must ensure that the `split_key` is in the `region`. + pub fn split_region( + &mut self, + region: &metapb::Region, + split_key: &[u8], + mut cb: Callback, + ) { + let leader = self.leader_of_region(region.get_id()).unwrap(); + let router = self.sim.rl().get_router(leader.get_store_id()).unwrap(); + let split_key = split_key.to_vec(); + let (split_region_req, mut sub) = PeerMsg::request_split( + region.get_region_epoch().clone(), + vec![split_key], + "test".into(), + ); + + router + .check_send(region.get_id(), split_region_req) + .unwrap(); + + block_on(async { + sub.wait_proposed().await; + cb.invoke_proposed(); + sub.wait_committed().await; + cb.invoke_committed(); + let res = sub.result().await.unwrap(); + cb.invoke_with_response(res) + }); + } + + pub fn must_split(&mut self, region: &metapb::Region, split_key: &[u8]) { + let mut try_cnt = 0; + let split_count = self.pd_client.get_split_count(); + loop { + debug!("asking split"; "region" => ?region, "key" => ?split_key); + // In case ask split message is ignored, we should retry. 
+ if try_cnt % 50 == 0 { + self.reset_leader_of_region(region.get_id()); + let key = split_key.to_vec(); + let check = Box::new(move |write_resp: WriteResponse| { + let mut resp = write_resp.response; + if resp.get_header().has_error() { + let error = resp.get_header().get_error(); + if error.has_epoch_not_match() + || error.has_not_leader() + || error.has_stale_command() + || error + .get_message() + .contains("peer has not applied to current term") + { + warn!("fail to split: {:?}, ignore.", error); + return; + } + panic!("failed to split: {:?}", resp); + } + let admin_resp = resp.mut_admin_response(); + let split_resp = admin_resp.mut_splits(); + let regions = split_resp.get_regions(); + assert_eq!(regions.len(), 2); + assert_eq!(regions[0].get_end_key(), key.as_slice()); + assert_eq!(regions[0].get_end_key(), regions[1].get_start_key()); + }); + if self.leader_of_region(region.get_id()).is_some() { + self.split_region(region, split_key, Callback::write(check)); + } + } + + if self.pd_client.check_split(region, split_key) + && self.pd_client.get_split_count() > split_count + { + return; + } + + if try_cnt > 250 { + panic!( + "region {:?} has not been split by {}", + region, + log_wrappers::hex_encode_upper(split_key) + ); + } + try_cnt += 1; + sleep_ms(20); + } + } + + pub fn wait_region_split(&mut self, region: &metapb::Region) { + self.wait_region_split_max_cnt(region, 20, 250, true); + } + + pub fn wait_region_split_max_cnt( + &mut self, + region: &metapb::Region, + itvl_ms: u64, + max_try_cnt: u64, + is_panic: bool, + ) { + let mut try_cnt = 0; + let split_count = self.pd_client.get_split_count(); + loop { + if self.pd_client.get_split_count() > split_count { + match self.pd_client.get_region(region.get_start_key()) { + Err(_) => {} + Ok(left) => { + if left.get_end_key() != region.get_end_key() { + return; + } + } + } + } + + if try_cnt > max_try_cnt { + if is_panic { + panic!( + "region {:?} has not been split after {}ms", + region, + max_try_cnt * itvl_ms 
+ ); + } else { + return; + } + } + try_cnt += 1; + sleep_ms(itvl_ms); + } + } + + pub fn get_snap_dir(&self, node_id: u64) -> String { + self.sim.rl().get_snap_dir(node_id) + } + + pub fn refresh_region_bucket_keys( + &mut self, + _region: &metapb::Region, + _buckets: Vec, + _bucket_ranges: Option>, + _expect_buckets: Option, + ) -> u64 { + unimplemented!() + } + + pub fn send_half_split_region_message( + &mut self, + _region: &metapb::Region, + _expected_bucket_ranges: Option>, + ) { + unimplemented!() + } + + pub fn shutdown(&mut self) { + debug!("about to shutdown cluster"); + let keys = match self.sim.read() { + Ok(s) => s.get_node_ids(), + Err(_) => { + safe_panic!("failed to acquire read lock"); + // Leave the resource to avoid double panic. + return; + } + }; + for id in keys { + self.stop_node(id); + } + self.leaders.clear(); + for store_meta in self.store_metas.values() { + while Arc::strong_count(store_meta) != 1 { + std::thread::sleep(Duration::from_millis(10)); + } + } + self.store_metas.clear(); + for sst_worker in self.sst_workers.drain(..) { + sst_worker.stop_worker(); + } + + debug!("all nodes are shut down."); + } +} + +pub fn bootstrap_store( + raft_engine: &ER, + cluster_id: u64, + store_id: u64, +) -> Result<()> { + let mut ident = StoreIdent::default(); + + if !raft_engine.is_empty()? 
{ + return Err(box_err!("store is not empty and has already had data")); + } + + ident.set_cluster_id(cluster_id); + ident.set_store_id(store_id); + + let mut lb = raft_engine.log_batch(1); + lb.put_store_ident(&ident)?; + raft_engine.consume(&mut lb, true)?; + + Ok(()) +} + +impl Drop for Cluster { + fn drop(&mut self) { + test_util::clear_failpoints(); + self.shutdown(); + } +} + +pub struct WrapFactory { + pd_client: Arc, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, +} + +impl WrapFactory { + pub fn new( + pd_client: Arc, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + ) -> Self { + Self { + raft_engine, + tablet_registry, + pd_client, + } + } + + fn region_id_of_key(&self, key: &[u8]) -> u64 { + self.pd_client.get_region(key).unwrap().get_id() + } + + fn get_tablet(&self, key: &[u8]) -> Option { + // todo: unwrap + let region_id = self.region_id_of_key(key); + self.tablet_registry.get(region_id)?.latest().cloned() + } + + pub fn get_region_state( + &self, + region_id: u64, + ) -> engine_traits::Result> { + self.raft_engine.get_region_state(region_id, u64::MAX) + } + + pub fn get_apply_state(&self, region_id: u64) -> engine_traits::Result> { + self.raft_engine.get_apply_state(region_id, u64::MAX) + } +} + +impl Peekable for WrapFactory { + type DbVector = RocksDbVector; + + fn get_value_opt( + &self, + opts: &ReadOptions, + key: &[u8], + ) -> engine_traits::Result> { + let region_id = self.region_id_of_key(key); + + if let Ok(Some(state)) = self.get_region_state(region_id) { + if state.state == PeerState::Tombstone { + return Ok(None); + } + } + + match self.get_tablet(key) { + Some(tablet) => tablet.get_value_opt(opts, key), + _ => Ok(None), + } + } + + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + cf: &str, + key: &[u8], + ) -> engine_traits::Result> { + let region_id = self.region_id_of_key(key); + + if let Ok(Some(state)) = self.get_region_state(region_id) { + if state.state == PeerState::Tombstone { + return 
Ok(None); + } + } + + match self.get_tablet(key) { + Some(tablet) => tablet.get_value_cf_opt(opts, cf, key), + _ => Ok(None), + } + } + + fn get_msg_cf( + &self, + _cf: &str, + _key: &[u8], + ) -> engine_traits::Result> { + unimplemented!() + } +} + +impl SyncMutable for WrapFactory { + fn put(&self, key: &[u8], value: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.put(key, value), + _ => unimplemented!(), + } + } + + fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.put_cf(cf, key, value), + _ => unimplemented!(), + } + } + + fn delete(&self, key: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.delete(key), + _ => unimplemented!(), + } + } + + fn delete_cf(&self, cf: &str, key: &[u8]) -> engine_traits::Result<()> { + match self.get_tablet(key) { + Some(tablet) => tablet.delete_cf(cf, key), + _ => unimplemented!(), + } + } + + fn delete_range(&self, _begin_key: &[u8], _end_key: &[u8]) -> engine_traits::Result<()> { + unimplemented!() + } + + fn delete_range_cf( + &self, + _cf: &str, + _begin_key: &[u8], + _end_key: &[u8], + ) -> engine_traits::Result<()> { + unimplemented!() + } +} + +impl RawEngine for WrapFactory { + fn region_local_state( + &self, + region_id: u64, + ) -> engine_traits::Result> { + self.get_region_state(region_id) + } +} diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs new file mode 100644 index 00000000000..101658ff57b --- /dev/null +++ b/components/test_raftstore-v2/src/lib.rs @@ -0,0 +1,9 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod cluster; +mod node; +mod server; +mod transport_simulate; +pub mod util; + +pub use crate::{cluster::*, node::*, server::*, transport_simulate::*, util::*}; diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs new file mode 100644 index 00000000000..96275cc8383 --- /dev/null +++ b/components/test_raftstore-v2/src/node.rs @@ -0,0 +1,423 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{Arc, Mutex, RwLock}, + time::Duration, +}; + +use collections::{HashMap, HashSet}; +use concurrency_manager::ConcurrencyManager; +use engine_rocks::RocksEngine; +use engine_test::raft::RaftTestEngine; +use engine_traits::{RaftEngineReadOnly, TabletRegistry}; +use kvproto::{ + kvrpcpb::ApiVersion, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raft::prelude::MessageType; +use raftstore::{ + coprocessor::CoprocessorHost, + errors::Error as RaftError, + store::{ + AutoSplitController, GlobalReplicationState, RegionSnapshot, SplitConfigManager, + TabletSnapKey, TabletSnapManager, Transport, + }, + Result, +}; +use raftstore_v2::{ + router::{PeerMsg, RaftRouter}, + StateStorage, StoreMeta, StoreRouter, +}; +use resource_control::ResourceGroupManager; +use resource_metering::CollectorRegHandle; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{Config, Filter}; +use tikv::{ + config::{ConfigController, Module}, + server::{ + raftkv::ReplicaReadLockChecker, tablet_snap::copy_tablet_snapshot, NodeV2, + Result as ServerResult, + }, +}; +use tikv_util::{ + box_err, + config::VersionTrack, + worker::{Builder as WorkerBuilder, LazyWorker}, +}; + +use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; + +#[derive(Clone)] +pub struct ChannelTransport { + core: Arc>, +} + +impl ChannelTransport { + pub fn new() -> ChannelTransport { + ChannelTransport { + core: Arc::new(Mutex::new(ChannelTransportCore { + 
snap_paths: HashMap::default(), + routers: HashMap::default(), + })), + } + } + + pub fn core(&self) -> &Arc> { + &self.core + } +} + +impl Transport for ChannelTransport { + fn send(&mut self, msg: RaftMessage) -> raftstore::Result<()> { + let from_store = msg.get_from_peer().get_store_id(); + let to_store = msg.get_to_peer().get_store_id(); + let is_snapshot = msg.get_message().get_msg_type() == MessageType::MsgSnapshot; + + if is_snapshot { + let snap = msg.get_message().get_snapshot(); + let key = TabletSnapKey::from_region_snap( + msg.get_region_id(), + msg.get_to_peer().get_id(), + snap, + ); + let sender_snap_mgr = match self.core.lock().unwrap().snap_paths.get(&from_store) { + Some(snap_mgr) => snap_mgr.0.clone(), + None => return Err(box_err!("missing snap manager for store {}", from_store)), + }; + let recver_snap_mgr = match self.core.lock().unwrap().snap_paths.get(&to_store) { + Some(snap_mgr) => snap_mgr.0.clone(), + None => return Err(box_err!("missing snap manager for store {}", to_store)), + }; + + if let Err(e) = + copy_tablet_snapshot(key, msg.clone(), &sender_snap_mgr, &recver_snap_mgr) + { + return Err(box_err!("copy tablet snapshot failed: {:?}", e)); + } + } + + let core = self.core.lock().unwrap(); + match core.routers.get(&to_store) { + Some(h) => { + h.send_raft_msg(msg)?; + // report snapshot status if needed + Ok(()) + } + _ => Err(box_err!("missing sender for store {}", to_store)), + } + } + + fn set_store_allowlist(&mut self, _allowlist: Vec) { + unimplemented!(); + } + + fn need_flush(&self) -> bool { + false + } + + fn flush(&mut self) {} +} + +pub struct ChannelTransportCore { + pub snap_paths: HashMap, + pub routers: HashMap>>, +} + +impl Default for ChannelTransport { + fn default() -> Self { + Self::new() + } +} + +type SimulateChannelTransport = SimulateTransport; + +pub struct NodeCluster { + trans: ChannelTransport, + pd_client: Arc, + nodes: HashMap>, + simulate_trans: HashMap, + concurrency_managers: HashMap, + // snap_mgrs: 
HashMap, +} + +impl NodeCluster { + pub fn new(pd_client: Arc) -> NodeCluster { + NodeCluster { + trans: ChannelTransport::new(), + pd_client, + nodes: HashMap::default(), + simulate_trans: HashMap::default(), + concurrency_managers: HashMap::default(), + // snap_mgrs: HashMap::default(), + } + } +} + +impl Simulator for NodeCluster { + fn get_node_ids(&self) -> HashSet { + self.nodes.keys().cloned().collect() + } + + fn add_send_filter(&mut self, node_id: u64, filter: Box) { + self.simulate_trans + .get_mut(&node_id) + .unwrap() + .add_filter(filter); + } + + fn clear_send_filters(&mut self, node_id: u64) { + self.simulate_trans + .get_mut(&node_id) + .unwrap() + .clear_filters(); + } + + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + _resource_manager: &Option>, + ) -> ServerResult { + assert!(!self.nodes.contains_key(&node_id)); + let pd_worker = LazyWorker::new("test-pd-worker"); + + let simulate_trans = SimulateTransport::new(self.trans.clone()); + let mut raft_store = cfg.raft_store.clone(); + raft_store + .validate( + cfg.coprocessor.region_split_size.unwrap_or_default(), + cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.region_bucket_size, + ) + .unwrap(); + + let mut node = NodeV2::new(&cfg.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&raft_store, &raft_engine).unwrap(); + assert_eq!(node.id(), node_id); + + tablet_registry + .tablet_factory() + .set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + node.router().clone(), + ))); + + // todo: node id 0 + let (snap_mgr, snap_mgs_path) = if node_id == 0 + || !self + .trans + .core + .lock() + .unwrap() + .snap_paths + .contains_key(&node_id) + { + let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let snap_path = tmp.path().to_str().unwrap().to_owned(); + (TabletSnapManager::new(snap_path)?, Some(tmp)) + } else { + let trans = 
self.trans.core.lock().unwrap(); + let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; + (snap_mgr.clone(), None) + }; + + let raft_router = RaftRouter::new_with_store_meta(node.router().clone(), store_meta); + // Create coprocessor. + let mut coprocessor_host = + CoprocessorHost::new(raft_router.store_router().clone(), cfg.coprocessor.clone()); + + // if let Some(f) = self.post_create_coprocessor_host.as_ref() { + // f(node_id, &mut coprocessor_host); + // } + + let cm = ConcurrencyManager::new(1.into()); + self.concurrency_managers.insert(node_id, cm.clone()); + + ReplicaReadLockChecker::new(cm.clone()).register(&mut coprocessor_host); + + let cfg_controller = ConfigController::new(cfg.tikv.clone()); + // cfg_controller.register( + // Module::Coprocessor, + // Box::new(SplitCheckConfigManager(split_scheduler.clone())), + // ); + + let split_config_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(cfg.tikv.split.clone()))); + cfg_controller.register(Module::Split, Box::new(split_config_manager.clone())); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + cfg.tikv.server.grpc_concurrency, + cfg.tikv.readpool.unified.max_thread_count, + // todo: Is None sufficient for test? 
+ None, + ); + + let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); + let state: Arc> = Arc::default(); + node.start( + raft_engine.clone(), + tablet_registry, + &raft_router, + simulate_trans.clone(), + snap_mgr.clone(), + cm, + None, + coprocessor_host, + auto_split_controller, + CollectorRegHandle::new_for_test(), + bg_worker, + pd_worker, + Arc::new(VersionTrack::new(raft_store)), + &state, + )?; + assert!( + raft_engine + .get_prepare_bootstrap_region() + .unwrap() + .is_none() + ); + assert!(node_id == 0 || node_id == node.id()); + let node_id = node.id(); + + let region_split_size = cfg.coprocessor.region_split_size; + let enable_region_bucket = cfg.coprocessor.enable_region_bucket; + let region_bucket_size = cfg.coprocessor.region_bucket_size; + let mut raftstore_cfg = cfg.tikv.raft_store; + raftstore_cfg + .validate( + region_split_size.unwrap_or_default(), + enable_region_bucket, + region_bucket_size, + ) + .unwrap(); + + // let raft_store = Arc::new(VersionTrack::new(raftstore_cfg)); + // cfg_controller.register( + // Module::Raftstore, + // Box::new(RaftstoreConfigManager::new( + // node.refresh_config_scheduler(), + // raft_store, + // )), + // ); + + if let Some(tmp) = snap_mgs_path { + self.trans + .core + .lock() + .unwrap() + .snap_paths + .insert(node_id, (snap_mgr, tmp)); + } + + self.trans + .core + .lock() + .unwrap() + .routers + .insert(node_id, SimulateTransport::new(raft_router)); + + self.nodes.insert(node_id, node); + self.simulate_trans.insert(node_id, simulate_trans); + Ok(node_id) + } + + fn snapshot( + &mut self, + request: RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result< + RegionSnapshot<::Snapshot>, + RaftCmdResponse, + > { + let node_id = request.get_header().get_peer().get_store_id(); + if !self + .trans + .core + .lock() + .unwrap() + .routers + .contains_key(&node_id) + { + let mut resp = RaftCmdResponse::default(); + let e: RaftError = box_err!("missing sender for store {}", node_id); + 
resp.mut_header().set_error(e.into()); + return Err(resp); + } + + let mut router = { + let mut guard = self.trans.core.lock().unwrap(); + guard.routers.get_mut(&node_id).unwrap().clone() + }; + + router.snapshot(request, timeout) + } + + fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()> { + if !self + .trans + .core + .lock() + .unwrap() + .routers + .contains_key(&node_id) + { + return Err(box_err!("missing sender for store {}", node_id)); + } + + let router = self + .trans + .core + .lock() + .unwrap() + .routers + .get(&node_id) + .cloned() + .unwrap(); + + router.send_peer_msg(region_id, msg) + } + + fn stop_node(&mut self, node_id: u64) { + if let Some(mut node) = self.nodes.remove(&node_id) { + node.stop(); + } + self.trans + .core + .lock() + .unwrap() + .routers + .remove(&node_id) + .unwrap(); + } + + fn get_router(&self, node_id: u64) -> Option> { + self.nodes.get(&node_id).map(|node| node.router().clone()) + } + + fn get_snap_dir(&self, node_id: u64) -> String { + self.trans.core.lock().unwrap().snap_paths[&node_id] + .0 + .root_path() + .to_str() + .unwrap() + .to_owned() + } +} + +pub fn new_node_cluster(id: u64, count: usize) -> Cluster { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} + +pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { + let pd_client = Arc::new(TestPdClient::new(id, true)); + let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs new file mode 100644 index 00000000000..e64844bb490 --- /dev/null +++ b/components/test_raftstore-v2/src/server.rs @@ -0,0 +1,726 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{Arc, Mutex, RwLock}, + thread, + time::Duration, +}; + +use api_version::{dispatch_api_version, KvFormat}; +use causal_ts::CausalTsProviderImpl; +use collections::{HashMap, HashSet}; +use concurrency_manager::ConcurrencyManager; +use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_test::raft::RaftTestEngine; +use engine_traits::{KvEngine, TabletRegistry}; +use futures::executor::block_on; +use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; +use grpcio_health::HealthService; +use kvproto::{ + deadlock_grpc::create_deadlock, + debugpb_grpc::DebugClient, + diagnosticspb_grpc::create_diagnostics, + kvrpcpb::{ApiVersion, Context}, + metapb, + raft_cmdpb::RaftCmdResponse, + tikvpb_grpc::TikvClient, +}; +use pd_client::PdClient; +use raftstore::{ + coprocessor::CoprocessorHost, + errors::Error as RaftError, + store::{ + AutoSplitController, CheckLeaderRunner, FlowStatsReporter, ReadStats, RegionSnapshot, + TabletSnapManager, WriteStats, + }, + RegionInfoAccessor, +}; +use raftstore_v2::{router::RaftRouter, StateStorage, StoreMeta, StoreRouter}; +use resource_control::ResourceGroupManager; +use resource_metering::{CollectorRegHandle, ResourceTagFactory}; +use security::SecurityManager; +use slog_global::debug; +use tempfile::TempDir; +use test_pd_client::TestPdClient; +use test_raftstore::{AddressMap, Config}; +use tikv::{ + coprocessor, coprocessor_v2, + read_pool::ReadPool, + server::{ + gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, resolve, service::DiagnosticsService, ConnectionBuilder, + Error, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, Result as ServerResult, Server, + ServerTransport, + }, + storage::{ + self, + kv::{FakeExtension, SnapContext}, + txn::flow_controller::{EngineFlowController, FlowController}, + Engine, Storage, + }, +}; +use tikv_util::{ + box_err, + config::VersionTrack, + quota_limiter::QuotaLimiter, + 
sys::thread::ThreadBuildWrapper, + thd_name, + worker::{Builder as WorkerBuilder, LazyWorker}, + Either, HandyRwLock, +}; +use tokio::runtime::Builder as TokioBuilder; +use txn_types::TxnExtraScheduler; + +use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; + +#[derive(Clone)] +struct DummyReporter; + +impl FlowStatsReporter for DummyReporter { + fn report_read_stats(&self, _read_stats: ReadStats) {} + fn report_write_stats(&self, _write_stats: WriteStats) {} +} + +type SimulateRaftExtension = ::RaftExtension; +type SimulateStoreTransport = SimulateTransport>; +type SimulateServerTransport = + SimulateTransport>; + +pub type SimulateEngine = RaftKv2; + +pub struct ServerMeta { + node: NodeV2, + server: Server, + sim_router: SimulateStoreTransport, + sim_trans: SimulateServerTransport, + raw_router: StoreRouter, + rsmeter_cleanup: Box, +} + +type PendingServices = Vec Service>>; + +pub struct ServerCluster { + metas: HashMap, + addrs: AddressMap, + pub storages: HashMap, + pub region_info_accessors: HashMap, + snap_paths: HashMap, + snap_mgrs: HashMap, + pd_client: Arc, + // raft_client: RaftClient, + concurrency_managers: HashMap, + env: Arc, + pub pending_services: HashMap, + pub health_services: HashMap, + pub security_mgr: Arc, + pub txn_extra_schedulers: HashMap>, + pub causal_ts_providers: HashMap>, +} + +impl ServerCluster { + pub fn new(pd_client: Arc) -> ServerCluster { + let env = Arc::new( + EnvBuilder::new() + .cq_count(2) + .name_prefix(thd_name!("server-cluster")) + .build(), + ); + let security_mgr = Arc::new(SecurityManager::new(&Default::default()).unwrap()); + let map = AddressMap::default(); + // We don't actually need to handle snapshot message, just create a dead worker + // to make it compile. 
+ let worker = LazyWorker::new("snap-worker"); + let conn_builder = ConnectionBuilder::new( + env.clone(), + Arc::default(), + security_mgr.clone(), + map.clone(), + FakeExtension {}, + worker.scheduler(), + Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), + ); + let _raft_client = RaftClient::new(conn_builder); + ServerCluster { + metas: HashMap::default(), + addrs: map, + pd_client, + security_mgr, + storages: HashMap::default(), + region_info_accessors: HashMap::default(), + snap_mgrs: HashMap::default(), + snap_paths: HashMap::default(), + pending_services: HashMap::default(), + health_services: HashMap::default(), + // raft_client, + concurrency_managers: HashMap::default(), + env, + txn_extra_schedulers: HashMap::default(), + causal_ts_providers: HashMap::default(), + } + } + + pub fn get_addr(&self, node_id: u64) -> String { + self.addrs.get(node_id).unwrap() + } + + pub fn run_node_impl( + &mut self, + node_id: u64, + mut cfg: Config, + store_meta: Arc>>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult { + let (snap_mgr, snap_mgs_path) = if !self.snap_mgrs.contains_key(&node_id) { + let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let snap_path = tmp.path().to_str().unwrap().to_owned(); + (TabletSnapManager::new(snap_path)?, Some(tmp)) + } else { + (self.snap_mgrs[&node_id].clone(), None) + }; + + let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); + + if cfg.server.addr == "127.0.0.1:0" { + // Now we cache the store address, so here we should re-use last + // listening address for the same store. + if let Some(addr) = self.addrs.get(node_id) { + cfg.server.addr = addr; + } else { + cfg.server.addr = format!("127.0.0.1:{}", test_util::alloc_port()); + } + } + + // Create node. 
+ let mut raft_store = cfg.raft_store.clone(); + raft_store + .validate( + cfg.coprocessor.region_split_size.unwrap_or_default(), + cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.region_bucket_size, + ) + .unwrap(); + + let mut node = NodeV2::new(&cfg.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&raft_store, &raft_engine).unwrap(); + assert_eq!(node.id(), node_id); + + tablet_registry + .tablet_factory() + .set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + node.router().clone(), + ))); + + let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + + let raft_router = + RaftRouter::new_with_store_meta(node.router().clone(), store_meta.clone()); + + // Create coprocessor. + let mut coprocessor_host = + CoprocessorHost::new(raft_router.store_router().clone(), cfg.coprocessor.clone()); + + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let sim_router = SimulateTransport::new(raft_router.clone()); + // todo(SpadeA): simulate transport + let mut raft_kv_v2 = + RaftKv2::new(raft_router.clone(), region_info_accessor.region_leaders()); + + // Create storage. 
+ let pd_worker = LazyWorker::new("test-pd-worker"); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); + let storage_read_pool = ReadPool::from(storage::build_read_pool( + &tikv::config::StorageReadPoolConfig::default_for_test(), + pd_sender, + raft_kv_v2.clone(), + )); + + if let Some(scheduler) = self.txn_extra_schedulers.remove(&node_id) { + raft_kv_v2.set_txn_extra_scheduler(scheduler); + } + + let latest_ts = + block_on(self.pd_client.get_tso()).expect("failed to get timestamp from PD"); + let concurrency_manager = ConcurrencyManager::new(latest_ts); + + let (tx, _rx) = std::sync::mpsc::channel(); + let mut gc_worker = GcWorker::new( + raft_kv_v2.clone(), + tx, + cfg.gc.clone(), + Default::default(), + Arc::new(region_info_accessor.clone()), + ); + gc_worker.start(node_id).unwrap(); + + // todo: resolved ts + + if ApiVersion::V2 == F::TAG { + let casual_ts_provider: Arc = Arc::new( + block_on(causal_ts::BatchTsoProvider::new_opt( + self.pd_client.clone(), + cfg.causal_ts.renew_interval.0, + cfg.causal_ts.alloc_ahead_buffer.0, + cfg.causal_ts.renew_batch_min_size, + cfg.causal_ts.renew_batch_max_size, + )) + .unwrap() + .into(), + ); + self.causal_ts_providers.insert(node_id, casual_ts_provider); + } + + // Start resource metering. 
+ let (res_tag_factory, collector_reg_handle, rsmeter_cleanup) = + self.init_resource_metering(&cfg.resource_metering); + + let check_leader_runner = CheckLeaderRunner::new(store_meta, coprocessor_host.clone()); + let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); + + let mut lock_mgr = LockManager::new(&cfg.pessimistic_txn); + let quota_limiter = Arc::new(QuotaLimiter::new( + cfg.quota.foreground_cpu_time, + cfg.quota.foreground_write_bandwidth, + cfg.quota.foreground_read_bandwidth, + cfg.quota.background_cpu_time, + cfg.quota.background_write_bandwidth, + cfg.quota.background_read_bandwidth, + cfg.quota.max_delay_duration, + cfg.quota.enable_auto_tune, + )); + + let casual_ts_provider = self.get_causal_ts_provider(node_id); + let store = Storage::<_, _, F>::from_engine( + raft_kv_v2.clone(), + &cfg.storage, + storage_read_pool.handle(), + lock_mgr.clone(), + concurrency_manager.clone(), + lock_mgr.get_storage_dynamic_configs(), + Arc::new(FlowController::Singleton(EngineFlowController::empty())), + DummyReporter, + res_tag_factory.clone(), + quota_limiter.clone(), + self.pd_client.feature_gate().clone(), + casual_ts_provider.clone(), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), + )?; + self.storages.insert(node_id, raft_kv_v2); + + ReplicaReadLockChecker::new(concurrency_manager.clone()).register(&mut coprocessor_host); + + // todo: Import Sst Service + + // Create deadlock service. + let deadlock_service = lock_mgr.deadlock_service(); + + // Create pd client, snapshot manager, server. 
+ let (resolver, state) = resolve::new_resolver( + Arc::clone(&self.pd_client), + &bg_worker, + store.get_engine().raft_extension(), + ); + let security_mgr = Arc::new(SecurityManager::new(&cfg.security).unwrap()); + let cop_read_pool = ReadPool::from(coprocessor::readpool_impl::build_read_pool_for_test( + &tikv::config::CoprReadPoolConfig::default_for_test(), + store.get_engine(), + )); + let copr = coprocessor::Endpoint::new( + &server_cfg.value().clone(), + cop_read_pool.handle(), + concurrency_manager.clone(), + res_tag_factory, + quota_limiter, + ); + let copr_v2 = coprocessor_v2::Endpoint::new(&cfg.coprocessor_v2); + let mut server = None; + + // Create Debug service. + let debug_thread_pool = Arc::new( + TokioBuilder::new_multi_thread() + .thread_name(thd_name!("debugger")) + .worker_threads(1) + .after_start_wrapper(|| {}) + .before_stop_wrapper(|| {}) + .build() + .unwrap(), + ); + let debug_thread_handle = debug_thread_pool.handle().clone(); + let diag_service = DiagnosticsService::new( + debug_thread_handle, + cfg.log.file.filename.clone(), + cfg.slow_log_file.clone(), + ); + + let health_service = HealthService::default(); + + for _ in 0..100 { + let mut svr = Server::new( + node_id, + &server_cfg, + &security_mgr, + store.clone(), + copr.clone(), + copr_v2.clone(), + resolver.clone(), + Either::Right(snap_mgr.clone()), + gc_worker.clone(), + check_leader_scheduler.clone(), + self.env.clone(), + None, + debug_thread_pool.clone(), + health_service.clone(), + ) + .unwrap(); + svr.register_service(create_diagnostics(diag_service.clone())); + svr.register_service(create_deadlock(deadlock_service.clone())); + if let Some(svcs) = self.pending_services.get(&node_id) { + for fact in svcs { + svr.register_service(fact()); + } + } + match svr.build_and_bind() { + Ok(_) => { + server = Some(svr); + break; + } + Err(Error::Grpc(GrpcError::BindFail(ref addr, ref port))) => { + // Servers may meet the error, when we restart them. 
+ debug!("fail to create a server: bind fail {:?}", (addr, port)); + thread::sleep(Duration::from_millis(100)); + continue; + } + Err(ref e) => panic!("fail to create a server: {:?}", e), + } + } + let mut server = server.unwrap(); + let addr = server.listening_addr(); + assert_eq!(addr.clone().to_string(), node.store().address); + cfg.server.addr = format!("{}", addr); + let trans = server.transport(); + let simulate_trans = SimulateTransport::new(trans); + let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + + // Register the role change observer of the lock manager. + lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); + + let pessimistic_txn_cfg = cfg.tikv.pessimistic_txn; + node.start( + raft_engine, + tablet_registry, + &raft_router, + simulate_trans.clone(), + snap_mgr.clone(), + concurrency_manager.clone(), + casual_ts_provider, + coprocessor_host, + AutoSplitController::default(), + collector_reg_handle, + bg_worker, + pd_worker, + Arc::new(VersionTrack::new(raft_store)), + &state, + )?; + assert!(node_id == 0 || node_id == node.id()); + let node_id = node.id(); + self.snap_mgrs.insert(node_id, snap_mgr); + if let Some(tmp) = snap_mgs_path { + self.snap_paths.insert(node_id, tmp); + } + self.region_info_accessors + .insert(node_id, region_info_accessor); + // todo: importer + self.health_services.insert(node_id, health_service); + + lock_mgr + .start( + node.id(), + Arc::clone(&self.pd_client), + resolver, + Arc::clone(&security_mgr), + &pessimistic_txn_cfg, + ) + .unwrap(); + + server.start(server_cfg, security_mgr).unwrap(); + + self.metas.insert( + node_id, + ServerMeta { + raw_router: raft_router.store_router().clone(), + node, + server, + sim_router, + sim_trans: simulate_trans, + rsmeter_cleanup, + }, + ); + self.addrs.insert(node_id, format!("{}", addr)); + self.concurrency_managers + .insert(node_id, concurrency_manager); + + Ok(node_id) + } + + pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { + 
self.causal_ts_providers.get(&node_id).cloned() + } + + fn init_resource_metering( + &self, + cfg: &resource_metering::Config, + ) -> (ResourceTagFactory, CollectorRegHandle, Box) { + let (_, collector_reg_handle, resource_tag_factory, recorder_worker) = + resource_metering::init_recorder(cfg.precision.as_millis()); + let (_, data_sink_reg_handle, reporter_worker) = + resource_metering::init_reporter(cfg.clone(), collector_reg_handle.clone()); + let (_, single_target_worker) = resource_metering::init_single_target( + cfg.receiver_address.clone(), + Arc::new(Environment::new(2)), + data_sink_reg_handle, + ); + + ( + resource_tag_factory, + collector_reg_handle, + Box::new(move || { + single_target_worker.stop_worker(); + reporter_worker.stop_worker(); + recorder_worker.stop_worker(); + }), + ) + } + + pub fn get_concurrency_manager(&self, node_id: u64) -> ConcurrencyManager { + self.concurrency_managers.get(&node_id).unwrap().clone() + } +} + +impl Simulator for ServerCluster { + fn get_node_ids(&self) -> HashSet { + self.metas.keys().cloned().collect() + } + + fn add_send_filter(&mut self, node_id: u64, filter: Box) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_trans + .add_filter(filter); + } + + fn clear_send_filters(&mut self, node_id: u64) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_trans + .clear_filters(); + } + + fn run_node( + &mut self, + node_id: u64, + cfg: Config, + store_meta: Arc>>, + raft_engine: RaftTestEngine, + tablet_registry: TabletRegistry, + resource_manager: &Option>, + ) -> ServerResult { + dispatch_api_version!( + cfg.storage.api_version(), + self.run_node_impl::( + node_id, + cfg, + store_meta, + raft_engine, + tablet_registry, + resource_manager + ) + ) + } + + fn stop_node(&mut self, node_id: u64) { + if let Some(mut meta) = self.metas.remove(&node_id) { + meta.server.stop().unwrap(); + meta.node.stop(); + // // resolved ts worker started, let's stop it + // if let Some(worker) = meta.rts_worker { + // 
worker.stop_worker(); + // } + (meta.rsmeter_cleanup)(); + } + self.storages.remove(&node_id); + } + + fn snapshot( + &mut self, + request: kvproto::raft_cmdpb::RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result::Snapshot>, RaftCmdResponse> + { + let node_id = request.get_header().get_peer().get_store_id(); + let mut router = match self.metas.get(&node_id) { + None => { + let mut resp = RaftCmdResponse::default(); + let e: RaftError = box_err!("missing sender for store {}", node_id); + resp.mut_header().set_error(e.into()); + return Err(resp); + } + Some(meta) => meta.sim_router.clone(), + }; + + router.snapshot(request, timeout) + } + + fn async_peer_msg_on_node( + &self, + node_id: u64, + region_id: u64, + msg: raftstore_v2::router::PeerMsg, + ) -> raftstore::Result<()> { + let router = match self.metas.get(&node_id) { + None => return Err(box_err!("missing sender for store {}", node_id)), + Some(meta) => meta.sim_router.clone(), + }; + + router.send_peer_msg(region_id, msg) + } + + fn get_router(&self, node_id: u64) -> Option> { + self.metas.get(&node_id).map(|m| m.raw_router.clone()) + } + + fn get_snap_dir(&self, node_id: u64) -> String { + self.snap_mgrs[&node_id] + .root_path() + .to_str() + .unwrap() + .to_owned() + } +} + +impl Cluster { + pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { + let mut try_snapshot = || -> Option> { + let leader = self.leader_of_region(region_id)?; + let store_id = leader.store_id; + let epoch = self.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + + let mut storage = self.sim.rl().storages.get(&store_id).unwrap().clone(); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + storage.snapshot(snap_ctx).ok() + }; + for _ in 0..10 { + if let Some(snapshot) = try_snapshot() { + return snapshot; + } + thread::sleep(Duration::from_millis(200)); + } + 
panic!("failed to get snapshot of region {}", region_id); + } +} + +pub fn new_server_cluster(id: u64, count: usize) -> Cluster { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} + +pub fn new_incompatible_server_cluster(id: u64, count: usize) -> Cluster { + let pd_client = Arc::new(TestPdClient::new(id, true)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, ApiVersion::V1) +} + +pub fn new_server_cluster_with_api_ver( + id: u64, + count: usize, + api_ver: ApiVersion, +) -> Cluster { + let pd_client = Arc::new(TestPdClient::new(id, false)); + let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); + Cluster::new(id, count, sim, pd_client, api_ver) +} + +pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, Context) { + must_new_cluster_and_kv_client_mul(1) +} + +pub fn must_new_cluster_and_kv_client_mul( + count: usize, +) -> (Cluster, TikvClient, Context) { + let (cluster, leader, ctx) = must_new_cluster_mul(count); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + (cluster, client, ctx) +} +pub fn must_new_cluster_mul(count: usize) -> (Cluster, metapb::Peer, Context) { + must_new_and_configure_cluster_mul(count, |_| ()) +} + +fn must_new_and_configure_cluster_mul( + count: usize, + mut configure: impl FnMut(&mut Cluster), +) -> (Cluster, metapb::Peer, Context) { + let mut cluster = new_server_cluster(0, count); + configure(&mut cluster); + cluster.run(); + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + 
ctx.set_peer(leader.clone()); + ctx.set_region_epoch(epoch); + + (cluster, leader, ctx) +} + +pub fn must_new_and_configure_cluster_and_kv_client( + configure: impl FnMut(&mut Cluster), +) -> (Cluster, TikvClient, Context) { + let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = TikvClient::new(channel); + + (cluster, client, ctx) +} + +pub fn must_new_and_configure_cluster( + configure: impl FnMut(&mut Cluster), +) -> (Cluster, metapb::Peer, Context) { + must_new_and_configure_cluster_mul(1, configure) +} + +pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClient, u64) { + let (cluster, leader, _) = must_new_cluster_mul(1); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); + let client = DebugClient::new(channel); + + (cluster, client, leader.get_store_id()) +} diff --git a/components/test_raftstore-v2/src/transport_simulate.rs b/components/test_raftstore-v2/src/transport_simulate.rs new file mode 100644 index 00000000000..f42a891e60f --- /dev/null +++ b/components/test_raftstore-v2/src/transport_simulate.rs @@ -0,0 +1,128 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{Arc, RwLock}, + time::{Duration, Instant}, +}; + +use engine_traits::{KvEngine, RaftEngine}; +use futures::{compat::Future01CompatExt, FutureExt}; +use kvproto::{ + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_serverpb::RaftMessage, +}; +use raftstore::{ + router::handle_send_error, + store::{cmd_resp, RegionSnapshot, Transport}, + Error, Result, Result as RaftStoreResult, +}; +use raftstore_v2::router::{PeerMsg, RaftRouter}; +use test_raftstore::{filter_send, Filter}; +use tikv_util::{timer::GLOBAL_TIMER_HANDLE, HandyRwLock}; + +#[derive(Clone)] +pub struct SimulateTransport { + filters: Arc>>>, + ch: C, +} + +impl SimulateTransport { + pub fn new(ch: C) -> SimulateTransport { + Self { + filters: Arc::new(RwLock::new(vec![])), + ch, + } + } + + pub fn clear_filters(&mut self) { + self.filters.wl().clear(); + } + + pub fn add_filter(&mut self, filter: Box) { + self.filters.wl().push(filter); + } +} + +impl Transport for SimulateTransport { + fn send(&mut self, m: RaftMessage) -> Result<()> { + let ch = &mut self.ch; + filter_send(&self.filters, m, |m| ch.send(m)) + } + + fn set_store_allowlist(&mut self, allowlist: Vec) { + self.ch.set_store_allowlist(allowlist); + } + + fn need_flush(&self) -> bool { + self.ch.need_flush() + } + + fn flush(&mut self) { + self.ch.flush(); + } +} + +pub trait SnapshotRouter { + fn snapshot( + &mut self, + req: RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result, RaftCmdResponse>; +} + +impl SnapshotRouter for RaftRouter { + fn snapshot( + &mut self, + req: RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result, RaftCmdResponse> { + let timeout_f = GLOBAL_TIMER_HANDLE.delay(Instant::now() + timeout).compat(); + futures::executor::block_on(async move { + futures::select! 
{ + res = self.snapshot(req).fuse() => res, + e = timeout_f.fuse() => { + Err(cmd_resp::new_error(Error::Timeout(format!("request timeout for {:?}: {:?}", timeout,e)))) + }, + } + }) + } +} + +impl> SnapshotRouter for SimulateTransport { + fn snapshot( + &mut self, + req: RaftCmdRequest, + timeout: Duration, + ) -> std::result::Result, RaftCmdResponse> { + self.ch.snapshot(req, timeout) + } +} + +pub trait RaftStoreRouter { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> Result<()>; + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()>; +} + +impl RaftStoreRouter for RaftRouter { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> RaftStoreResult<()> { + self.send(region_id, msg) + .map_err(|e| handle_send_error(region_id, e)) + } + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + let region_id = msg.get_region_id(); + self.send_raft_message(Box::new(msg)) + .map_err(|e| handle_send_error(region_id, e)) + } +} + +impl RaftStoreRouter for SimulateTransport { + fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> RaftStoreResult<()> { + self.ch.send_peer_msg(region_id, msg) + } + + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) + } +} diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs new file mode 100644 index 00000000000..2bd9444b002 --- /dev/null +++ b/components/test_raftstore-v2/src/util.rs @@ -0,0 +1,191 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{fmt::Write, sync::Arc, thread, time::Duration}; + +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::{RocksEngine, RocksStatistics}; +use engine_test::raft::RaftTestEngine; +use engine_traits::{TabletRegistry, CF_DEFAULT}; +use file_system::IoRateLimiter; +use kvproto::kvrpcpb::Context; +use rand::RngCore; +use server::server2::ConfiguredRaftEngine; +use tempfile::TempDir; +use test_raftstore::{new_put_cf_cmd, Config}; +use tikv::{ + server::KvEngineFactoryBuilder, + storage::{ + config::EngineType, + kv::{SnapContext, SnapshotExt}, + Engine, Snapshot, + }, +}; +use tikv_util::{config::ReadableDuration, worker::LazyWorker}; + +use crate::{bootstrap_store, cluster::Cluster, ServerCluster, Simulator}; + +pub fn create_test_engine( + // TODO: pass it in for all cases. + id: Option<(u64, u64)>, + limiter: Option>, + cfg: &Config, +) -> ( + TabletRegistry, + RaftTestEngine, + Option>, + TempDir, + LazyWorker, + Arc, + Option>, +) { + let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); + let mut cfg = cfg.clone(); + cfg.storage.data_dir = dir.path().to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let key_manager = + data_key_manager_from_config(&cfg.security.encryption, dir.path().to_str().unwrap()) + .unwrap() + .map(Arc::new); + let cache = cfg + .storage + .block_cache + .build_shared_cache(EngineType::RaftKv2); + let env = cfg + .build_shared_rocks_env(key_manager.clone(), limiter) + .unwrap(); + + let sst_worker = LazyWorker::new("sst-recovery"); + let scheduler = sst_worker.scheduler(); + + let (raft_engine, raft_statistics) = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); + + if let Some((cluster_id, store_id)) = id { + assert_ne!(store_id, 0); + bootstrap_store(&raft_engine, cluster_id, store_id).unwrap(); + } + + let builder = + 
KvEngineFactoryBuilder::new(env, &cfg.tikv, cache).sst_recovery_sender(Some(scheduler)); + + let factory = Box::new(builder.build()); + let rocks_statistics = factory.rocks_statistics(); + let reg = TabletRegistry::new(factory, dir.path().join("tablet")).unwrap(); + + ( + reg, + raft_engine, + key_manager, + dir, + sst_worker, + rocks_statistics, + raft_statistics, + ) +} + +/// Keep putting random kvs until specified size limit is reached. +pub fn put_till_size( + cluster: &mut Cluster, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + put_cf_till_size(cluster, CF_DEFAULT, limit, range) +} + +pub fn put_cf_till_size( + cluster: &mut Cluster, + cf: &'static str, + limit: u64, + range: &mut dyn Iterator, +) -> Vec { + assert!(limit > 0); + let mut len = 0; + let mut rng = rand::thread_rng(); + let mut key = String::new(); + let mut value = vec![0; 64]; + while len < limit { + let batch_size = std::cmp::min(1024, limit - len); + let mut reqs = vec![]; + for _ in 0..batch_size / 74 + 1 { + key.clear(); + let key_id = range.next().unwrap(); + write!(key, "{:09}", key_id).unwrap(); + rng.fill_bytes(&mut value); + // plus 1 for the extra encoding prefix + len += key.len() as u64 + 1; + len += value.len() as u64; + reqs.push(new_put_cf_cmd(cf, key.as_bytes(), &value)); + } + cluster.batch_put(key.as_bytes(), reqs).unwrap(); + // Approximate size of memtable is inaccurate for small data, + // we flush it to SST so we can use the size properties instead. + cluster.must_flush_cf(cf, true); + } + key.into_bytes() +} + +pub fn configure_for_snapshot(cluster: &mut Cluster) { + // Truncate the log quickly so that we can force sending snapshot. 
+ cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(2); + cluster.cfg.raft_store.merge_max_log_gap = 1; + cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); +} + +pub fn configure_for_lease_read_v2( + cluster: &mut Cluster, + base_tick_ms: Option, + election_ticks: Option, +) -> Duration { + if let Some(base_tick_ms) = base_tick_ms { + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + } + let base_tick_interval = cluster.cfg.raft_store.raft_base_tick_interval.0; + if let Some(election_ticks) = election_ticks { + cluster.cfg.raft_store.raft_election_timeout_ticks = election_ticks; + } + let election_ticks = cluster.cfg.raft_store.raft_election_timeout_ticks as u32; + let election_timeout = base_tick_interval * election_ticks; + // Adjust max leader lease. + cluster.cfg.raft_store.raft_store_max_leader_lease = + ReadableDuration(election_timeout - base_tick_interval); + // Use large peer check interval, abnormal and max leader missing duration to + // make a valid config, that is election timeout x 2 < peer stale state + // check < abnormal < max leader missing duration. 
+ cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); + cluster.cfg.raft_store.abnormal_leader_missing_duration = + ReadableDuration(election_timeout * 4); + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); + + election_timeout +} + +pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { + let mut storage = cluster + .sim + .read() + .unwrap() + .storages + .get(&node_id) + .unwrap() + .clone(); + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + let snapshot = storage.snapshot(snap_ctx).unwrap(); + let txn_ext = snapshot.txn_ext.clone().unwrap(); + for retry in 0..10 { + if txn_ext.is_max_ts_synced() { + break; + } + thread::sleep(Duration::from_millis(1 << retry)); + } + assert!(snapshot.ext().is_max_ts_synced()); +} diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 9d6444904f2..d5842bf6659 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -12,11 +12,11 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_rocks::{RocksDbVector, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, - WriteBatchExt, CF_DEFAULT, CF_RAFT, + CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, SyncMutable, + WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; use 
futures::{self, channel::oneshot, executor::block_on}; @@ -1335,6 +1335,10 @@ impl Cluster { kv_wb.write().unwrap(); } + pub fn add_send_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_send_filter(node_id, filter); + } + pub fn add_send_filter(&self, factory: F) { let mut sim = self.sim.wl(); for node_id in sim.get_node_ids() { @@ -1899,6 +1903,25 @@ impl Cluster { .unwrap(); rx.recv_timeout(Duration::from_secs(5)).unwrap(); } + + pub fn scan( + &self, + store_id: u64, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> engine_traits::Result<()> + where + F: FnMut(&[u8], &[u8]) -> engine_traits::Result, + { + self.engines[&store_id] + .kv + .scan(cf, start_key, end_key, fill_cache, f)?; + + Ok(()) + } } impl Drop for Cluster { @@ -1907,3 +1930,21 @@ impl Drop for Cluster { self.shutdown(); } } + +pub trait RawEngine: Peekable + SyncMutable { + fn region_local_state(&self, region_id: u64) + -> engine_traits::Result>; + + fn raft_apply_state(&self, _region_id: u64) -> engine_traits::Result> { + unimplemented!() + } +} + +impl RawEngine for RocksEngine { + fn region_local_state( + &self, + region_id: u64, + ) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) + } +} diff --git a/components/test_raftstore/src/lib.rs b/components/test_raftstore/src/lib.rs index 8893d8a7ca4..950581a6ce8 100644 --- a/components/test_raftstore/src/lib.rs +++ b/components/test_raftstore/src/lib.rs @@ -11,7 +11,7 @@ mod node; mod router; mod server; mod transport_simulate; -mod util; +pub mod util; pub use crate::{ cluster::*, config::Config, node::*, router::*, server::*, transport_simulate::*, util::*, diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 06ff550aa64..a49a41af4e3 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -162,7 +162,7 @@ 
impl SimulateTransport { } } -fn filter_send( +pub fn filter_send( filters: &Arc>>>, msg: RaftMessage, mut h: H, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index e765cfb883f..b7a9ea6f1af 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -46,14 +46,21 @@ use rand::RngCore; use server::server::ConfiguredRaftEngine; use tempfile::TempDir; use test_pd_client::TestPdClient; -use tikv::{config::*, server::KvEngineFactoryBuilder, storage::point_key_range}; +use tikv::{ + config::*, + server::KvEngineFactoryBuilder, + storage::{ + kv::{SnapContext, SnapshotExt}, + point_key_range, Engine, Snapshot, + }, +}; pub use tikv_util::store::{find_peer, new_learner_peer, new_peer}; use tikv_util::{config::*, escape, time::ThreadReadId, worker::LazyWorker, HandyRwLock}; use txn_types::Key; -use crate::{Cluster, Config, ServerCluster, Simulator}; +use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; -pub fn must_get(engine: &RocksEngine, cf: &str, key: &[u8], value: Option<&[u8]>) { +pub fn must_get(engine: &impl RawEngine, cf: &str, key: &[u8], value: Option<&[u8]>) { for _ in 1..300 { let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); if let (Some(value), Some(res)) = (value, res.as_ref()) { @@ -79,19 +86,19 @@ pub fn must_get(engine: &RocksEngine, cf: &str, key: &[u8], value: Option<&[u8]> ) } -pub fn must_get_equal(engine: &RocksEngine, key: &[u8], value: &[u8]) { +pub fn must_get_equal(engine: &impl RawEngine, key: &[u8], value: &[u8]) { must_get(engine, "default", key, Some(value)); } -pub fn must_get_none(engine: &RocksEngine, key: &[u8]) { +pub fn must_get_none(engine: &impl RawEngine, key: &[u8]) { must_get(engine, "default", key, None); } -pub fn must_get_cf_equal(engine: &RocksEngine, cf: &str, key: &[u8], value: &[u8]) { +pub fn must_get_cf_equal(engine: &impl RawEngine, cf: &str, key: &[u8], value: &[u8]) { must_get(engine, cf, key, Some(value)); } 
-pub fn must_get_cf_none(engine: &RocksEngine, cf: &str, key: &[u8]) { +pub fn must_get_cf_none(engine: &impl RawEngine, cf: &str, key: &[u8]) { must_get(engine, cf, key, None); } @@ -129,7 +136,7 @@ pub fn must_region_cleared(engine: &Engines, region } lazy_static! { - static ref TEST_CONFIG: TikvConfig = { + pub static ref TEST_CONFIG: TikvConfig = { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let common_test_cfg = manifest_dir.join("src/common-test.toml"); TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { @@ -635,24 +642,24 @@ pub fn configure_for_hibernate(cluster: &mut Cluster) { cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::secs(10); } -pub fn configure_for_snapshot(cluster: &mut Cluster) { +pub fn configure_for_snapshot(config: &mut Config) { // Truncate the log quickly so that we can force sending snapshot. - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(2); - cluster.cfg.raft_store.merge_max_log_gap = 1; - cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); + config.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + config.raft_store.raft_log_gc_count_limit = Some(2); + config.raft_store.merge_max_log_gap = 1; + config.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); } -pub fn configure_for_merge(cluster: &mut Cluster) { +pub fn configure_for_merge(config: &mut Config) { // Avoid log compaction which will prevent merge. - cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); + config.raft_store.raft_log_gc_threshold = 1000; + config.raft_store.raft_log_gc_count_limit = Some(1000); + config.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); // Make merge check resume quickly. 
- cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); + config.raft_store.merge_check_tick_interval = ReadableDuration::millis(100); // When isolated, follower relies on stale check tick to detect failure leader, // choose a smaller number to make it recover faster. - cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); + config.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); } pub fn ignore_merge_target_integrity(cluster: &mut Cluster) { @@ -660,30 +667,29 @@ pub fn ignore_merge_target_integrity(cluster: &mut Cluster) { cluster.pd_client.ignore_merge_target_integrity(); } -pub fn configure_for_lease_read( - cluster: &mut Cluster, +pub fn configure_for_lease_read( + cfg: &mut Config, base_tick_ms: Option, election_ticks: Option, ) -> Duration { if let Some(base_tick_ms) = base_tick_ms { - cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); } - let base_tick_interval = cluster.cfg.raft_store.raft_base_tick_interval.0; + let base_tick_interval = cfg.raft_store.raft_base_tick_interval.0; if let Some(election_ticks) = election_ticks { - cluster.cfg.raft_store.raft_election_timeout_ticks = election_ticks; + cfg.raft_store.raft_election_timeout_ticks = election_ticks; } - let election_ticks = cluster.cfg.raft_store.raft_election_timeout_ticks as u32; + let election_ticks = cfg.raft_store.raft_election_timeout_ticks as u32; let election_timeout = base_tick_interval * election_ticks; // Adjust max leader lease. - cluster.cfg.raft_store.raft_store_max_leader_lease = + cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(election_timeout - base_tick_interval); // Use large peer check interval, abnormal and max leader missing duration to // make a valid config, that is election timeout x 2 < peer stale state // check < abnormal < max leader missing duration. 
- cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); - cluster.cfg.raft_store.abnormal_leader_missing_duration = - ReadableDuration(election_timeout * 4); - cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); + cfg.raft_store.peer_stale_state_check_interval = ReadableDuration(election_timeout * 3); + cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration(election_timeout * 4); + cfg.raft_store.max_leader_missing_duration = ReadableDuration(election_timeout * 5); election_timeout } @@ -1384,3 +1390,33 @@ pub fn peer_on_store(region: &metapb::Region, store_id: u64) -> metapb::Peer { .unwrap() .clone() } + +pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { + let mut storage = cluster + .sim + .read() + .unwrap() + .storages + .get(&node_id) + .unwrap() + .clone(); + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + let snap_ctx = SnapContext { + pb_ctx: &ctx, + ..Default::default() + }; + let snapshot = storage.snapshot(snap_ctx).unwrap(); + let txn_ext = snapshot.txn_ext.clone().unwrap(); + for retry in 0..10 { + if txn_ext.is_max_ts_synced() { + break; + } + thread::sleep(Duration::from_millis(1 << retry)); + } + assert!(snapshot.ext().is_max_ts_synced()); +} diff --git a/components/test_raftstore_macro/src/lib.rs b/components/test_raftstore_macro/src/lib.rs index 59a2c6f1273..3c8239d9f3b 100644 --- a/components/test_raftstore_macro/src/lib.rs +++ b/components/test_raftstore_macro/src/lib.rs @@ -6,6 +6,8 @@ use quote::{quote, ToTokens}; use syn::{parse_macro_input, parse_quote, Ident, ItemFn, Path}; /// test_case generate test cases using cluster creation method provided. 
+/// It also import the package related util module, which means we should locate +/// methods using Cluster in the related util modules. /// /// ex: /// #[test_case(test_raftstore::new_node_cluster)] @@ -21,19 +23,19 @@ use syn::{parse_macro_input, parse_quote, Ident, ItemFn, Path}; /// mod test_something { /// #[test] /// fn test_raftstore_new_node_cluster() { -/// use test_raftstore::new_node_cluster as new_cluster; +/// use test_raftstore::(util::*, new_node_cluster as new_cluster); /// let mut cluster = new_cluster(0, 1); /// } /// /// #[test] /// fn test_raftstore_new_server_cluster() { -/// use test_raftstore::new_server_cluster as new_cluster; +/// use test_raftstore::(util::*, new_server_cluster as new_cluster); /// let mut cluster = new_cluster(0, 1); /// } /// /// #[test] /// fn test_raftstore_v2_new_server_cluster() { -/// use test_raftstore::test_raftstore_v2 as new_cluster; +/// use test_raftstore::(util::*, test_raftstore_v2 as new_cluster); /// let mut cluster = new_cluster(0, 1); /// } /// } @@ -73,7 +75,7 @@ fn render_test_cases(test_cases: Vec, fn_item: ItemFn) -> TokenStr 0, syn::parse( quote! 
{ - use #package::#method as new_cluster; + use #package::{util::*, #method as new_cluster}; } .into(), ) diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 413adf0d415..eb49775e5c1 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -236,6 +236,14 @@ impl TabletFactory for KvEngineFactory { fn exists(&self, path: &Path) -> bool { RocksEngine::exists(path.to_str().unwrap()) } + + #[cfg(any(test, feature = "testexport"))] + fn set_state_storage(&self, state_storage: Arc) { + let inner = Arc::as_ptr(&self.inner) as *mut FactoryInner; + unsafe { + (*inner).state_storage = Some(state_storage); + } + } } #[cfg(test)] diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index da292eca17d..ce6971eb8fb 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -88,6 +88,12 @@ pub struct Service { reject_messages_on_memory_ratio: f64, } +impl Drop for Service { + fn drop(&mut self) { + self.check_leader_scheduler.stop(); + } +} + impl Clone for Service { fn clone(&self) -> Self { Service { diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index b5d989d5370..0fc836f36c2 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -1,5 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#[cfg(any(test, feature = "testexport"))] +use std::io; use std::{ convert::{TryFrom, TryInto}, fs::{self, File}, @@ -461,6 +463,43 @@ impl Runnable for TabletRunner { } } +// A helper function to copy snapshot. +#[cfg(any(test, feature = "testexport"))] +pub fn copy_tablet_snapshot( + key: TabletSnapKey, + msg: RaftMessage, + sender_snap_mgr: &TabletSnapManager, + recver_snap_mgr: &TabletSnapManager, +) -> Result<()> { + let sender_path = sender_snap_mgr.tablet_gen_path(&key); + let files = fs::read_dir(sender_path)? 
+ .map(|f| Ok(f?.path())) + .filter(|f| f.is_ok() && f.as_ref().unwrap().is_file()) + .collect::>>()?; + + let mut head = SnapshotChunk::default(); + head.set_message(msg); + head.set_data(usize::to_ne_bytes(SNAP_CHUNK_LEN).to_vec()); + + let recv_context = RecvTabletSnapContext::new(head)?; + let recv_path = recver_snap_mgr.tmp_recv_path(&recv_context.key); + fs::create_dir_all(&recv_path)?; + + for path in files { + let sender_name = path.file_name().unwrap().to_str().unwrap(); + let mut sender_f = File::open(&path)?; + + let recv_p = recv_path.join(sender_name); + let mut recv_f = File::create(recv_p)?; + + while io::copy(&mut sender_f, &mut recv_f)? != 0 {} + } + + let final_path = recver_snap_mgr.final_recv_path(&recv_context.key); + fs::rename(&recv_path, final_path)?; + Ok(()) +} + #[cfg(test)] mod tests { use std::{ diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 6fb05f19cd1..331575339a5 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -143,6 +143,7 @@ test_coprocessor = { workspace = true } test_pd = { workspace = true } test_pd_client = { workspace = true } test_raftstore = { workspace = true } +test_raftstore-v2 = { workspace = true } test_raftstore_macro = { workspace = true } test_sst_importer = { workspace = true } test_storage = { workspace = true } diff --git a/tests/failpoints/cases/test_async_io.rs b/tests/failpoints/cases/test_async_io.rs index 43ed82d4cdd..3d53b9c5f14 100644 --- a/tests/failpoints/cases/test_async_io.rs +++ b/tests/failpoints/cases/test_async_io.rs @@ -97,7 +97,7 @@ fn test_async_io_delay_destroy_after_conf_change() { fn test_async_io_cannot_destroy_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.raft_store.store_io_pool_size = 2; - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -180,7 +180,7 @@ fn test_async_io_cannot_destroy_when_persist_snapshot() { fn 
test_async_io_cannot_handle_ready_when_persist_snapshot() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.raft_store.store_io_pool_size = 2; - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index d96c467d487..feaa1af76ef 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -159,7 +159,7 @@ fn test_reject_proposal_during_region_split() { #[test] fn test_reject_proposal_during_region_merge() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); cluster.run(); @@ -284,7 +284,7 @@ fn test_reject_proposal_during_region_merge() { #[test] fn test_reject_proposal_during_rollback_region_merge() { let mut cluster = new_node_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); cluster.run_conf_change(); diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index d4219808af0..7821c8be5df 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -19,7 +19,7 @@ fn test_destroy_local_reader() { let mut cluster = new_node_cluster(0, 3); // Set election timeout and max leader lease to 1s. - configure_for_lease_read(&mut cluster, Some(100), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); let pd_client = cluster.pd_client.clone(); // Disable default max peer count check. 
diff --git a/tests/failpoints/cases/test_early_apply.rs b/tests/failpoints/cases/test_early_apply.rs index acac65cd397..a194ef74d8f 100644 --- a/tests/failpoints/cases/test_early_apply.rs +++ b/tests/failpoints/cases/test_early_apply.rs @@ -97,7 +97,7 @@ fn test_early_apply_yield_followed_with_many_entries() { let mut cluster = new_node_cluster(0, 3); cluster.pd_client.disable_default_operator(); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); cluster.must_put(b"k1", b"v1"); diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index fa4f6e9cb42..c22136d04de 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -28,7 +28,7 @@ use txn_types::{Key, PessimisticLock}; #[test] fn test_node_merge_rollback() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -116,7 +116,7 @@ fn test_node_merge_rollback() { #[test] fn test_node_merge_restart() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -201,7 +201,7 @@ fn test_node_merge_restart() { #[test] fn test_node_merge_catch_up_logs_restart() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); cluster.must_put(b"k1", b"v1"); @@ -242,7 +242,7 @@ fn test_node_merge_catch_up_logs_restart() { #[test] fn test_node_merge_catch_up_logs_leader_election() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); cluster.cfg.raft_store.raft_election_timeout_ticks = 25; 
cluster.cfg.raft_store.raft_log_gc_threshold = 12; @@ -296,7 +296,7 @@ fn test_node_merge_catch_up_logs_leader_election() { #[test] fn test_node_merge_catch_up_logs_no_need() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(10); cluster.cfg.raft_store.raft_election_timeout_ticks = 25; cluster.cfg.raft_store.raft_log_gc_threshold = 12; @@ -366,7 +366,7 @@ fn test_node_merge_catch_up_logs_no_need() { #[test] fn test_node_merge_recover_snapshot() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); let pd_client = Arc::clone(&cluster.pd_client); @@ -424,7 +424,7 @@ fn test_node_merge_multiple_snapshots_not_together() { fn test_node_merge_multiple_snapshots(together: bool) { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -534,7 +534,7 @@ fn test_node_merge_multiple_snapshots(together: bool) { #[test] fn test_node_merge_restart_after_apply_premerge_before_apply_compact_log() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.merge_max_log_gap = 10; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(11); // Rely on this config to trigger a compact log @@ -617,7 +617,7 @@ fn test_node_merge_restart_after_apply_premerge_before_apply_compact_log() { #[test] fn test_node_failed_merge_before_succeed_merge() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); 
cluster.cfg.raft_store.merge_max_log_gap = 30; cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; @@ -706,7 +706,7 @@ fn test_node_failed_merge_before_succeed_merge() { #[test] fn test_node_merge_transfer_leader() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; let pd_client = Arc::clone(&cluster.pd_client); @@ -768,7 +768,7 @@ fn test_node_merge_transfer_leader() { #[test] fn test_node_merge_cascade_merge_with_apply_yield() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -807,7 +807,7 @@ fn test_node_merge_cascade_merge_with_apply_yield() { #[test] fn test_node_multiple_rollback_merge() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; cluster.cfg.raft_store.merge_check_tick_interval = ReadableDuration::millis(20); let pd_client = Arc::clone(&cluster.pd_client); @@ -1208,7 +1208,7 @@ fn test_node_merge_crash_when_snapshot() { #[test] fn test_prewrite_before_max_ts_is_synced() { let mut cluster = new_server_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); // Transfer leader to node 1 first to ensure all operations happen on node 1 @@ -1265,7 +1265,7 @@ fn test_prewrite_before_max_ts_is_synced() { #[test] fn test_source_peer_read_delegate_after_apply() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); 
@@ -1314,7 +1314,7 @@ fn test_source_peer_read_delegate_after_apply() { #[test] fn test_merge_with_concurrent_pessimistic_locking() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; cluster.run(); @@ -1402,7 +1402,7 @@ fn test_merge_with_concurrent_pessimistic_locking() { #[test] fn test_merge_pessimistic_locks_with_concurrent_prewrite() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -1487,7 +1487,7 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { #[test] fn test_retry_pending_prepare_merge_fail() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -1564,7 +1564,7 @@ fn test_retry_pending_prepare_merge_fail() { #[test] fn test_merge_pessimistic_locks_propose_fail() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -1633,7 +1633,7 @@ fn test_merge_pessimistic_locks_propose_fail() { #[test] fn test_destroy_source_peer_while_merging() { let mut cluster = new_node_cluster(0, 5); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/failpoints/cases/test_pending_peers.rs b/tests/failpoints/cases/test_pending_peers.rs index 
5618bc9ab8e..c41c97034b4 100644 --- a/tests/failpoints/cases/test_pending_peers.rs +++ b/tests/failpoints/cases/test_pending_peers.rs @@ -41,8 +41,8 @@ fn test_pending_peers() { #[test] fn test_pending_snapshot() { let mut cluster = new_node_cluster(0, 3); - configure_for_snapshot(&mut cluster); - let election_timeout = configure_for_lease_read(&mut cluster, None, Some(15)); + configure_for_snapshot(&mut cluster.cfg); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, None, Some(15)); let gc_limit = cluster.cfg.raft_store.raft_log_gc_count_limit(); cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); diff --git a/tests/failpoints/cases/test_rawkv.rs b/tests/failpoints/cases/test_rawkv.rs index e228e82830c..a795422c120 100644 --- a/tests/failpoints/cases/test_rawkv.rs +++ b/tests/failpoints/cases/test_rawkv.rs @@ -24,7 +24,7 @@ impl TestSuite { // Disable background renew by setting `renew_interval` to 0, to make timestamp // allocation predictable. configure_for_causal_ts(&mut cluster, "0s", 100); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); cluster.pd_client.disable_default_operator(); diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index 5fe71834e45..9f844f582e4 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ b/tests/failpoints/cases/test_replica_read.rs @@ -20,7 +20,7 @@ fn test_wait_for_apply_index() { let mut cluster = new_server_cluster(0, 3); // Increase the election tick to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -76,7 +76,7 @@ fn test_wait_for_apply_index() { fn test_duplicate_read_index_ctx() { // Initialize cluster let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); cluster.cfg.raft_store.raft_heartbeat_ticks = 1; let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -155,7 +155,7 @@ fn test_duplicate_read_index_ctx() { fn test_read_before_init() { // Initialize cluster let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -206,7 +206,7 @@ fn test_read_before_init() { fn test_read_applying_snapshot() { // Initialize cluster let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -270,8 +270,8 @@ fn test_read_applying_snapshot() { #[test] fn test_read_after_cleanup_range_for_snap() { let mut cluster = new_server_cluster(1, 3); - configure_for_snapshot(&mut cluster); - configure_for_lease_read(&mut cluster, Some(100), Some(10)); + configure_for_snapshot(&mut cluster.cfg); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -365,7 +365,7 @@ fn test_read_after_cleanup_range_for_snap() { #[test] fn test_new_split_learner_can_not_find_leader() { let mut cluster = 
new_node_cluster(0, 4); - configure_for_lease_read(&mut cluster, Some(5000), None); + configure_for_lease_read(&mut cluster.cfg, Some(5000), None); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -411,7 +411,7 @@ fn test_new_split_learner_can_not_find_leader() { fn test_replica_read_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(100)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -488,7 +488,7 @@ fn test_read_index_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); - configure_for_lease_read(&mut cluster, Some(50), Some(100)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); // Setup cluster and check all peers have data. let region_id = cluster.run_conf_change(); pd_client.must_add_peer(region_id, new_peer(2, 2)); @@ -579,7 +579,7 @@ fn test_read_index_after_transfer_leader() { #[test] fn test_batch_read_index_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(100)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -730,7 +730,7 @@ fn test_read_index_lock_checking_on_follower() { fn test_read_index_lock_checking_on_false_leader() { let mut cluster = new_node_cluster(0, 5); // Use long election timeout and short lease. 
- configure_for_lease_read(&mut cluster, Some(50), Some(200)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(200)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(Duration::from_millis(100)); diff --git a/tests/failpoints/cases/test_replica_stale_read.rs b/tests/failpoints/cases/test_replica_stale_read.rs index 3dc7223ae41..b7d436d92d7 100644 --- a/tests/failpoints/cases/test_replica_stale_read.rs +++ b/tests/failpoints/cases/test_replica_stale_read.rs @@ -14,14 +14,14 @@ fn prepare_for_stale_read(leader: Peer) -> (Cluster, Arc)>>, + before_run: Option>, ) -> (Cluster, Arc, PeerClient) { let mut cluster = new_server_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); if let Some(f) = before_run { - f(&mut cluster); + f(&mut cluster.cfg); }; cluster.cfg.resolved_ts.enable = true; cluster.run(); diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index a6a4a1824f3..ca329896df1 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -62,7 +62,7 @@ fn test_overlap_cleanup() { #[test] fn test_server_snapshot_on_resolve_failure() { let mut cluster = new_server_cluster(1, 2); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let on_send_store_fp = "transport_on_send_snapshot"; @@ -195,7 +195,7 @@ fn assert_snapshot(snap_dir: &str, region_id: u64, exist: bool) { #[test] fn test_destroy_peer_on_pending_snapshot() { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -261,7 +261,7 @@ fn test_destroy_peer_on_pending_snapshot() { #[test] fn test_destroy_peer_on_pending_snapshot_and_restart() { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let 
pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -387,7 +387,7 @@ fn test_shutdown_when_snap_gc() { #[test] fn test_receive_old_snapshot() { let mut cluster = new_node_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -482,7 +482,7 @@ fn test_receive_old_snapshot() { #[test] fn test_gen_snapshot_with_no_committed_entries_ready() { let mut cluster = new_node_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -571,7 +571,7 @@ fn test_cancel_snapshot_generating() { #[test] fn test_snapshot_gc_after_failed() { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.snap_gc_timeout = ReadableDuration::millis(300); let pd_client = Arc::clone(&cluster.pd_client); @@ -641,7 +641,7 @@ fn test_snapshot_gc_after_failed() { #[test] fn test_sending_fail_with_net_error() { let mut cluster = new_server_cluster(1, 2); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.snap_gc_timeout = ReadableDuration::millis(300); let pd_client = Arc::clone(&cluster.pd_client); @@ -742,7 +742,7 @@ fn test_snapshot_clean_up_logs_with_unfinished_log_gc() { #[test] fn test_snapshot_recover_from_raft_write_failure() { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); // Avoid triggering snapshot at final step. 
cluster.cfg.raft_store.raft_log_gc_count_limit = Some(10); let pd_client = Arc::clone(&cluster.pd_client); @@ -800,7 +800,7 @@ fn test_snapshot_recover_from_raft_write_failure() { #[test] fn test_snapshot_recover_from_raft_write_failure_with_uncommitted_log() { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); // Avoid triggering snapshot at final step. cluster.cfg.raft_store.raft_log_gc_count_limit = Some(10); let pd_client = Arc::clone(&cluster.pd_client); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 09e87bb8d4d..f3a052c8027 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -268,7 +268,7 @@ impl Filter for PrevoteRangeFilter { #[test] fn test_split_not_to_split_existing_region() { let mut cluster = new_node_cluster(0, 4); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; cluster.cfg.raft_store.apply_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.apply_batch_system.pool_size = 2; @@ -341,7 +341,7 @@ fn test_split_not_to_split_existing_region() { #[test] fn test_split_not_to_split_existing_tombstone_region() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; @@ -409,7 +409,7 @@ fn test_split_not_to_split_existing_tombstone_region() { #[test] fn test_split_continue_when_destroy_peer_after_mem_check() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; 
cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; @@ -496,7 +496,7 @@ fn test_split_continue_when_destroy_peer_after_mem_check() { #[test] fn test_split_should_split_existing_same_uninitialied_peer() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; @@ -549,7 +549,7 @@ fn test_split_should_split_existing_same_uninitialied_peer() { #[test] fn test_split_not_to_split_existing_different_uninitialied_peer() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; cluster.cfg.raft_store.store_batch_system.max_batch_size = Some(1); cluster.cfg.raft_store.store_batch_system.pool_size = 2; diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 1a4ef0b0afc..b171cebd173 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -138,7 +138,7 @@ fn test_stale_learner_restart() { #[test] fn test_stale_peer_destroy_when_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 18ddb865fd9..475ed71a1b0 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -17,7 +17,7 @@ fn stale_read_during_splitting(right_derive: bool) { let count = 3; let mut cluster = new_node_cluster(0, count); 
cluster.cfg.raft_store.right_derive_when_split = right_derive; - let election_timeout = configure_for_lease_read(&mut cluster, None, None); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, None, None); cluster.run(); // Write the initial values. @@ -215,8 +215,8 @@ fn test_node_stale_read_during_splitting_right_derive() { fn test_stale_read_during_merging() { let count = 3; let mut cluster = new_node_cluster(0, count); - configure_for_merge(&mut cluster); - let election_timeout = configure_for_lease_read(&mut cluster, None, None); + configure_for_merge(&mut cluster.cfg); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, None, None); cluster.cfg.raft_store.right_derive_when_split = false; cluster.cfg.raft_store.pd_heartbeat_tick_interval = cluster.cfg.raft_store.raft_base_tick_interval; @@ -323,7 +323,7 @@ fn test_read_index_when_transfer_leader_2() { let mut cluster = new_node_cluster(0, 3); // Increase the election tick to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); // Stop log compaction to transfer leader with filter easier. 
configure_for_request_snapshot(&mut cluster); let max_lease = Duration::from_secs(2); @@ -482,8 +482,8 @@ fn test_stale_read_during_merging_2() { let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); - configure_for_merge(&mut cluster); - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_merge(&mut cluster.cfg); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.run(); diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index 63b3feff0e0..500a27ae266 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -9,11 +9,10 @@ use std::{ time::Duration, }; -use engine_traits::{Peekable, CF_RAFT}; +use engine_traits::Peekable; use futures::executor::block_on; use kvproto::{ metapb::{self, PeerRole}, - raft_cmdpb::{RaftCmdResponse, RaftResponseHeader}, raft_serverpb::*, }; use pd_client::PdClient; @@ -21,9 +20,27 @@ use raft::eraftpb::{ConfChangeType, MessageType}; use raftstore::Result; use test_pd_client::TestPdClient; use test_raftstore::*; -use tikv_util::{config::ReadableDuration, store::is_learner, time::Instant, HandyRwLock}; +use test_raftstore_macro::test_case; +use tikv_util::{config::ReadableDuration, store::is_learner, time::Instant}; + +macro_rules! 
call_conf_change { + ($cluster:expr, $region_id:expr, $conf_change_type:expr, $peer:expr) => {{ + let conf_change = new_change_peer_request($conf_change_type, $peer); + let epoch = $cluster.pd_client.get_region_epoch($region_id); + let admin_req = new_admin_request($region_id, &epoch, conf_change); + $cluster.call_command_on_leader(admin_req, Duration::from_secs(3)) + }}; +} + +fn new_conf_change_peer(store: &metapb::Store, pd_client: &Arc) -> metapb::Peer { + let peer_id = pd_client.alloc_id().unwrap(); + new_peer(store.get_id(), peer_id) +} -fn test_simple_conf_change(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_server_cluster)] +fn test_server_simple_conf_change() { + let count = 5; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. pd_client.disable_default_operator(); @@ -99,7 +116,7 @@ fn test_simple_conf_change(cluster: &mut Cluster) { assert_eq!(cluster.get(b"k4"), Some(b"v4".to_vec())); must_get_equal(&engine_2, b"k4", b"v4"); - let resp = call_conf_change(cluster, r1, ConfChangeType::AddNode, new_peer(2, 2)).unwrap(); + let resp = call_conf_change!(cluster, r1, ConfChangeType::AddNode, new_peer(2, 2)).unwrap(); let exec_res = resp .get_header() .get_error() @@ -138,12 +155,13 @@ fn test_simple_conf_change(cluster: &mut Cluster) { // TODO: add more tests. } -fn new_conf_change_peer(store: &metapb::Store, pd_client: &Arc) -> metapb::Peer { - let peer_id = pd_client.alloc_id().unwrap(); - new_peer(store.get_id(), peer_id) -} - -fn test_pd_conf_change(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_pd_conf_change() { + let count = 5; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. 
pd_client.disable_default_operator(); @@ -233,27 +251,6 @@ fn test_pd_conf_change(cluster: &mut Cluster) { // TODO: add more tests. } -#[test] -fn test_server_simple_conf_change() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_simple_conf_change(&mut cluster); -} - -#[test] -fn test_node_pd_conf_change() { - let count = 5; - let mut cluster = new_node_cluster(0, count); - test_pd_conf_change(&mut cluster); -} - -#[test] -fn test_server_pd_conf_change() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_pd_conf_change(&mut cluster); -} - fn wait_till_reach_count(pd_client: Arc, region_id: u64, c: usize) { let mut replica_count = 0; for _ in 0..1000 { @@ -273,7 +270,13 @@ fn wait_till_reach_count(pd_client: Arc, region_id: u64, c: usize) ); } -fn test_auto_adjust_replica(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_auto_adjust_replica() { + let count = 7; + let mut cluster = new_cluster(0, count); cluster.start().unwrap(); let pd_client = Arc::clone(&cluster.pd_client); @@ -333,21 +336,19 @@ fn test_auto_adjust_replica(cluster: &mut Cluster) { wait_till_reach_count(Arc::clone(&pd_client), region_id, 5); } -#[test] -fn test_node_auto_adjust_replica() { - let count = 7; - let mut cluster = new_node_cluster(0, count); - test_auto_adjust_replica(&mut cluster); -} - -#[test] -fn test_server_auto_adjust_replica() { - let count = 7; - let mut cluster = new_server_cluster(0, count); - test_auto_adjust_replica(&mut cluster); +macro_rules! 
find_leader_response_header { + ($cluster:expr, $region_id:expr, $peer:expr) => {{ + let find_leader = new_status_request($region_id, $peer, new_region_leader_cmd()); + let resp = $cluster.call_command(find_leader, Duration::from_secs(5)); + resp.unwrap().take_header() + }}; } -fn test_after_remove_itself(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +fn test_after_remove_itself() { + let count = 3; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. pd_client.disable_default_operator(); @@ -400,39 +401,25 @@ fn test_after_remove_itself(cluster: &mut Cluster) { cluster.run_node(3).unwrap(); for _ in 0..250 { - let region: RegionLocalState = engine1 - .get_msg_cf(CF_RAFT, &keys::region_state_key(r1)) - .unwrap() - .unwrap(); + let region: RegionLocalState = engine1.region_local_state(r1).unwrap().unwrap(); if region.get_state() == PeerState::Tombstone { return; } sleep_ms(20); } - let region: RegionLocalState = engine1 - .get_msg_cf(CF_RAFT, &keys::region_state_key(r1)) - .unwrap() - .unwrap(); + let region: RegionLocalState = engine1.region_local_state(r1).unwrap().unwrap(); assert_eq!(region.get_state(), PeerState::Tombstone); // TODO: add split after removing itself test later. 
} -#[test] -fn test_node_after_remove_itself() { - let count = 3; - let mut cluster = new_node_cluster(0, count); - test_after_remove_itself(&mut cluster); -} - -#[test] -fn test_server_after_remove_itself() { - let count = 3; - let mut cluster = new_server_cluster(0, count); - test_after_remove_itself(&mut cluster); -} - -fn test_split_brain(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_split_brain() { + let count = 6; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer number check. pd_client.disable_default_operator(); @@ -488,7 +475,7 @@ fn test_split_brain(cluster: &mut Cluster) { // check whether a new cluster [1,2,3] is formed // if so, both [1,2,3] and [4,5,6] think they serve for region r1 // result in split brain - let header0 = find_leader_response_header(cluster, r1, new_peer(2, 2)); + let header0 = find_leader_response_header!(cluster, r1, new_peer(2, 2)); assert!(header0.get_error().has_region_not_found()); // at least wait for a round of election timeout and check again @@ -496,36 +483,17 @@ fn test_split_brain(cluster: &mut Cluster) { let election_timeout = base_tick * cluster.cfg.raft_store.raft_election_timeout_ticks as u32; thread::sleep(election_timeout * 2); - let header1 = find_leader_response_header(cluster, r1, new_peer(2, 2)); + let header1 = find_leader_response_header!(cluster, r1, new_peer(2, 2)); assert!(header1.get_error().has_region_not_found()); } -fn find_leader_response_header( - cluster: &mut Cluster, - region_id: u64, - peer: metapb::Peer, -) -> RaftResponseHeader { - let find_leader = new_status_request(region_id, peer, new_region_leader_cmd()); - let resp = cluster.call_command(find_leader, Duration::from_secs(5)); - resp.unwrap().take_header() -} - -#[test] -fn 
test_server_split_brain() { - let count = 6; - let mut cluster = new_server_cluster(0, count); - test_split_brain(&mut cluster); -} - -#[test] -fn test_node_split_brain() { - let count = 6; - let mut cluster = new_node_cluster(0, count); - test_split_brain(&mut cluster); -} - -/// A helper function for testing the conf change is safe. -fn test_conf_change_safe(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_conf_change_safe() { + let count = 5; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. pd_client.disable_default_operator(); @@ -589,7 +557,11 @@ fn test_conf_change_safe(cluster: &mut Cluster) { pd_client.must_remove_peer(region_id, new_peer(2, 2)); } -fn test_transfer_leader_safe(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_transfer_leader_safe() { + let count = 5; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. pd_client.disable_default_operator(); @@ -636,7 +608,40 @@ fn test_transfer_leader_safe(cluster: &mut Cluster) { } } -fn test_learner_conf_change(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_conf_change_remove_leader() { + let mut cluster = new_cluster(0, 3); + cluster.cfg.raft_store.allow_remove_leader = false; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + pd_client.must_add_peer(r1, new_peer(2, 2)); + pd_client.must_add_peer(r1, new_peer(3, 3)); + + // Transfer leader to the first peer. 
+ cluster.must_transfer_leader(r1, new_peer(1, 1)); + // Put a new kv to ensure leader has applied to newest log, so that to avoid + // false warning about pending conf change. + cluster.must_put(b"k1", b"v1"); + + // Try to remove leader, which should be ignored. + let res = call_conf_change!(cluster, r1, ConfChangeType::RemoveNode, new_peer(1, 1)).unwrap(); + assert!( + res.get_header() + .get_error() + .get_message() + .contains("ignore remove leader"), + "{:?}", + res + ); +} + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_learner_conf_change() { + let count = 5; + let mut cluster = new_cluster(0, count); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); @@ -654,11 +659,11 @@ fn test_learner_conf_change(cluster: &mut Cluster) { must_get_equal(&engine_4, b"k2", b"v2"); // Can't add duplicate learner. - let resp = call_conf_change( + let resp = call_conf_change!( cluster, r1, ConfChangeType::AddLearnerNode, - new_learner_peer(4, 11), + new_learner_peer(4, 11) ) .unwrap(); let err_msg = resp.get_header().get_error().get_message(); @@ -702,7 +707,7 @@ fn test_learner_conf_change(cluster: &mut Cluster) { } else { ConfChangeType::AddNode }; - call_conf_change(cluster, r1, conf_type, peer).unwrap() + call_conf_change!(cluster, r1, conf_type, peer).unwrap() }; // Add learner on store which already has peer. 
@@ -726,67 +731,10 @@ fn test_learner_conf_change(cluster: &mut Cluster) { pd_client.must_none_peer(r1, new_peer(4, 15)); } -#[test] -fn test_node_conf_change_safe() { - let count = 5; - let mut cluster = new_node_cluster(0, count); - test_conf_change_safe(&mut cluster); -} - -#[test] -fn test_server_safe_conf_change() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_conf_change_safe(&mut cluster); -} - -#[test] -fn test_server_transfer_leader_safe() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_transfer_leader_safe(&mut cluster); -} - -#[test] -fn test_conf_change_remove_leader() { - let mut cluster = new_node_cluster(0, 3); - cluster.cfg.raft_store.allow_remove_leader = false; - let pd_client = Arc::clone(&cluster.pd_client); - pd_client.disable_default_operator(); - let r1 = cluster.run_conf_change(); - pd_client.must_add_peer(r1, new_peer(2, 2)); - pd_client.must_add_peer(r1, new_peer(3, 3)); - - // Transfer leader to the first peer. - cluster.must_transfer_leader(r1, new_peer(1, 1)); - // Put a new kv to ensure leader has applied to newest log, so that to avoid - // false warning about pending conf change. - cluster.must_put(b"k1", b"v1"); - - // Try to remove leader, which should be ignored. 
- let res = - call_conf_change(&mut cluster, r1, ConfChangeType::RemoveNode, new_peer(1, 1)).unwrap(); - assert!( - res.get_header() - .get_error() - .get_message() - .contains("ignore remove leader"), - "{:?}", - res - ); -} - -#[test] -fn test_node_learner_conf_change() { - let count = 5; - let mut cluster = new_node_cluster(0, count); - test_learner_conf_change(&mut cluster); -} - -#[test] +#[test_case(test_raftstore::new_server_cluster)] fn test_learner_with_slow_snapshot() { - let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + let mut cluster = new_cluster(0, 3); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); @@ -829,7 +777,7 @@ fn test_learner_with_slow_snapshot() { }); // New added learner should keep pending until snapshot is applied. - cluster.sim.wl().add_send_filter(1, snap_filter); + cluster.add_send_filter_on_node(1, snap_filter); pd_client.must_add_peer(r1, new_learner_peer(2, 2)); for _ in 0..500 { sleep_ms(10); @@ -864,7 +812,12 @@ fn test_learner_with_slow_snapshot() { assert!(count.load(Ordering::SeqCst) > 0); } -fn test_stale_peer(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_stale_peer() { + let mut cluster = new_cluster(0, 4); + // To avoid stale peers know they are stale from PD. + cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -888,37 +841,15 @@ fn test_stale_peer(cluster: &mut Cluster) { must_get_none(&cluster.get_engine(3), b"k1"); } -#[test] -fn test_node_stale_peer() { - let mut cluster = new_node_cluster(0, 4); - // To avoid stale peers know they are stale from PD. 
- cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); - test_stale_peer(&mut cluster); -} - -fn call_conf_change( - cluster: &mut Cluster, - region_id: u64, - conf_change_type: ConfChangeType, - peer: metapb::Peer, -) -> Result -where - T: Simulator, -{ - let conf_change = new_change_peer_request(conf_change_type, peer); - let epoch = cluster.pd_client.get_region_epoch(region_id); - let admin_req = new_admin_request(region_id, &epoch, conf_change); - cluster.call_command_on_leader(admin_req, Duration::from_secs(3)) -} - /// Tests if conf change relies on heartbeat. -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_conf_change_fast() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // Sets heartbeat timeout to more than 5 seconds. It also changes the election // timeout, but it's OK as the cluster starts with only one peer, it will // campaigns immediately. - configure_for_lease_read(&mut cluster, Some(5000), None); + configure_for_lease_read(&mut cluster.cfg, Some(5000), None); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index e50ca59fdff..eec5ea9b94c 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -285,7 +285,7 @@ fn test_flashback_for_read() { #[test] fn test_flashback_for_local_read() { let mut cluster = new_node_cluster(0, 3); - let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, Some(50), None); // Avoid triggering the log compaction in this test case. 
cluster.cfg.raft_store.raft_log_gc_threshold = 100; cluster.run(); @@ -384,7 +384,7 @@ fn test_flashback_for_check_is_in_persist() { #[test] fn test_flashback_for_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(3, 3)); diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 855063bae98..6d8319ebae6 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -37,7 +37,7 @@ fn test_renew_lease(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_log_gc_threshold = 100; // Increase the Raft tick interval to make this test case running reliably. // Use large election timeout to make leadership stable. - configure_for_lease_read(cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); // Override max leader lease to 2 seconds. let max_lease = Duration::from_secs(2); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); @@ -132,7 +132,7 @@ fn test_lease_expired(cluster: &mut Cluster) { // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; // Increase the Raft tick interval to make this test case running reliably. - let election_timeout = configure_for_lease_read(cluster, Some(50), None); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, Some(50), None); let node_id = 3u64; let store_id = 3u64; @@ -174,7 +174,7 @@ fn test_lease_unsafe_during_leader_transfers(cluster: &mut Cluster // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; // Increase the Raft tick interval to make this test case running reliably. 
- let election_timeout = configure_for_lease_read(cluster, Some(500), Some(5)); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, Some(500), Some(5)); cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); cluster.cfg.raft_store.renew_leader_lease_advance_duration = ReadableDuration::secs(0); @@ -297,7 +297,7 @@ fn test_batch_id_in_lease(cluster: &mut Cluster) { cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); // Increase the Raft tick interval to make this test case running reliably. - let election_timeout = configure_for_lease_read(cluster, Some(100), None); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, Some(100), None); cluster.run(); let (split_key1, split_key2) = (b"k22", b"k44"); @@ -402,7 +402,7 @@ fn test_node_callback_when_destroyed() { let count = 3; let mut cluster = new_node_cluster(0, count); // Increase the election tick to make this test case running reliably. - configure_for_lease_read(&mut cluster, None, Some(50)); + configure_for_lease_read(&mut cluster.cfg, None, Some(50)); cluster.run(); cluster.must_put(b"k1", b"v1"); let leader = cluster.leader_of_region(1).unwrap(); @@ -457,7 +457,7 @@ fn test_lease_read_callback_destroy() { // Only server cluster can fake sending message successfully in raftstore layer. let mut cluster = new_server_cluster(0, 3); // Increase the Raft tick interval to make this test case running reliably. - let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); + let election_timeout = configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -480,7 +480,7 @@ fn test_read_index_stale_in_suspect_lease() { let mut cluster = new_node_cluster(0, 3); // Increase the election tick to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let max_lease = Duration::from_secs(2); // Stop log compaction to transfer leader with filter easier. configure_for_request_snapshot(&mut cluster); @@ -581,7 +581,7 @@ fn test_read_index_stale_in_suspect_lease() { #[test] fn test_local_read_cache() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), None); + configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.pd_client.disable_default_operator(); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -616,7 +616,7 @@ fn test_not_leader_read_lease() { // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; // Increase the Raft tick interval to make this test case running reliably. - configure_for_lease_read(&mut cluster, Some(50), None); + configure_for_lease_read(&mut cluster.cfg, Some(50), None); let heartbeat_interval = cluster.cfg.raft_store.raft_heartbeat_interval(); cluster.run(); @@ -671,7 +671,7 @@ fn test_not_leader_read_lease() { #[test] fn test_read_index_after_write() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10)); let heartbeat_interval = cluster.cfg.raft_store.raft_heartbeat_interval(); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -737,7 +737,7 @@ fn test_infinite_lease() { cluster.cfg.raft_store.raft_log_gc_threshold = 100; // Increase the Raft tick interval to make this test case running reliably. // Use large election timeout to make leadership stable. - configure_for_lease_read(&mut cluster, Some(50), Some(10_000)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); // Override max leader lease to 2 seconds. 
let max_lease = Duration::from_secs(2); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); @@ -792,7 +792,7 @@ fn test_node_local_read_renew_lease() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration::millis(500); let (base_tick_ms, election_ticks) = (50, 10); - configure_for_lease_read(&mut cluster, Some(50), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10)); cluster.pd_client.disable_default_operator(); let region_id = cluster.run_conf_change(); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index c72ba5ac595..de1187f35b1 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -5,7 +5,6 @@ use std::{iter::*, sync::*, thread, time::*}; use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::{Peekable, CF_LOCK, CF_RAFT, CF_WRITE}; use kvproto::{ - kvrpcpb::Context, raft_cmdpb::CmdType, raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, }; @@ -13,10 +12,7 @@ use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; use raftstore::store::{Callback, LocksStatus}; use test_raftstore::*; -use tikv::storage::{ - kv::{SnapContext, SnapshotExt}, - Engine, Snapshot, -}; +use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{config::*, HandyRwLock}; use txn_types::{Key, PessimisticLock}; @@ -25,7 +21,7 @@ use txn_types::{Key, PessimisticLock}; fn test_node_base_merge() { let mut cluster = new_node_cluster(0, 3); cluster.cfg.rocksdb.titan.enabled = true; - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); @@ -113,7 +109,7 @@ fn test_node_base_merge() { #[test] fn test_node_merge_with_slow_learner() { let mut cluster = new_node_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.raft_log_gc_threshold = 
40; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(40); cluster.cfg.raft_store.merge_max_log_gap = 15; @@ -188,7 +184,7 @@ fn test_node_merge_with_slow_learner() { #[test] fn test_node_merge_prerequisites_check() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); cluster.run(); @@ -269,7 +265,7 @@ fn test_node_merge_prerequisites_check() { #[test] fn test_node_check_merged_message() { let mut cluster = new_node_cluster(0, 4); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -340,7 +336,7 @@ fn test_node_merge_slow_split_left() { // merge. fn test_node_merge_slow_split(is_right_derive: bool) { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -405,7 +401,7 @@ fn test_node_merge_slow_split(is_right_derive: bool) { #[test] fn test_node_merge_dist_isolation() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -481,7 +477,7 @@ fn test_node_merge_dist_isolation() { #[test] fn test_node_merge_brain_split() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); @@ -660,9 +656,9 @@ fn test_merge_approximate_size_and_keys() { #[test] fn test_node_merge_update_region() { 
let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); // Election timeout and max leader lease is 1s. - configure_for_lease_read(&mut cluster, Some(100), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); cluster.run(); @@ -740,7 +736,7 @@ fn test_node_merge_update_region() { #[test] fn test_node_merge_catch_up_logs_empty_entries() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); cluster.must_put(b"k1", b"v1"); @@ -795,7 +791,7 @@ fn test_node_merge_catch_up_logs_empty_entries() { #[test] fn test_merge_with_slow_promote() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -837,7 +833,7 @@ fn test_merge_with_slow_promote() { #[test] fn test_merge_isolated_store_with_no_target_peer() { let mut cluster = new_node_cluster(0, 4); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); cluster.cfg.raft_store.right_derive_when_split = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -896,7 +892,7 @@ fn test_merge_isolated_store_with_no_target_peer() { #[test] fn test_merge_cascade_merge_isolated() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -947,7 +943,7 @@ fn test_merge_cascade_merge_isolated() { #[test] fn test_merge_isolated_not_in_merge_learner() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -993,7 +989,7 @@ fn 
test_merge_isolated_not_in_merge_learner() { #[test] fn test_merge_isolated_stale_learner() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; // Do not rely on pd to remove stale peer cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::hours(2); @@ -1044,7 +1040,7 @@ fn test_merge_isolated_stale_learner() { #[test] fn test_merge_isolated_not_in_merge_learner_2() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1089,7 +1085,7 @@ fn test_merge_isolated_not_in_merge_learner_2() { #[test] fn test_merge_remove_target_peer_isolated() { let mut cluster = new_node_cluster(0, 4); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1151,7 +1147,7 @@ fn test_sync_max_ts_after_region_merge() { fn test_sync_max_ts_after_region_merge_impl() { let mut cluster = new_server_cluster_with_api_ver(0, 3, F::TAG); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); // Transfer leader to node 1 first to ensure all operations happen on node 1 @@ -1166,45 +1162,14 @@ fn test_sync_max_ts_after_region_merge_impl() { let right = cluster.get_region(b"k3"); let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); - let mut storage = cluster - .sim - .read() - .unwrap() - .storages - .get(&1) - .unwrap() - .clone(); - let mut wait_for_synced = |cluster: &mut Cluster| { - let region_id = right.get_id(); - let leader = cluster.leader_of_region(region_id).unwrap(); - let epoch = cluster.get_region_epoch(region_id); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader); - 
ctx.set_region_epoch(epoch); - let snap_ctx = SnapContext { - pb_ctx: &ctx, - ..Default::default() - }; - let snapshot = storage.snapshot(snap_ctx).unwrap(); - let txn_ext = snapshot.txn_ext.clone().unwrap(); - for retry in 0..10 { - if txn_ext.is_max_ts_synced() { - break; - } - thread::sleep(Duration::from_millis(1 << retry)); - } - assert!(snapshot.ext().is_max_ts_synced()); - }; - - wait_for_synced(&mut cluster); + wait_for_synced(&mut cluster, 1, 1); let max_ts = cm.max_ts(); cluster.pd_client.trigger_tso_failure(); // Merge left to right cluster.pd_client.must_merge(left.get_id(), right.get_id()); - wait_for_synced(&mut cluster); + wait_for_synced(&mut cluster, 1, 1); let new_max_ts = cm.max_ts(); assert!(new_max_ts > max_ts); } @@ -1214,8 +1179,8 @@ fn test_sync_max_ts_after_region_merge_impl() { #[test] fn test_merge_snapshot_demote() { let mut cluster = new_node_cluster(0, 4); - configure_for_merge(&mut cluster); - configure_for_snapshot(&mut cluster); + configure_for_merge(&mut cluster.cfg); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1270,7 +1235,7 @@ fn test_merge_snapshot_demote() { #[test] fn test_propose_in_memory_pessimistic_locks() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1349,7 +1314,7 @@ fn test_propose_in_memory_pessimistic_locks() { #[test] fn test_merge_pessimistic_locks_when_gap_is_too_large() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; // Set raft_entry_max_size to 64 KiB. 
We will try to make the gap larger than @@ -1399,7 +1364,7 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { #[test] fn test_merge_pessimistic_locks_repeated_merge() { let mut cluster = new_server_cluster(0, 2); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; let pd_client = Arc::clone(&cluster.pd_client); @@ -1466,7 +1431,7 @@ fn test_merge_pessimistic_locks_repeated_merge() { #[test] fn test_node_merge_long_isolated() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); ignore_merge_target_integrity(&mut cluster); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1528,7 +1493,7 @@ fn test_node_merge_long_isolated() { #[test] fn test_stale_message_after_merge() { let mut cluster = new_server_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1576,7 +1541,7 @@ fn test_stale_message_after_merge() { #[test] fn test_prepare_merge_with_reset_matched() { let mut cluster = new_server_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); let r = cluster.run_conf_change(); @@ -1625,7 +1590,7 @@ fn test_prepare_merge_with_reset_matched() { #[test] fn test_prepare_merge_with_5_nodes_snapshot() { let mut cluster = new_server_cluster(0, 5); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); cluster.run(); diff --git a/tests/integrations/raftstore/test_prevote.rs b/tests/integrations/raftstore/test_prevote.rs index a4336e9f3ed..c81b34f0435 100644 --- 
a/tests/integrations/raftstore/test_prevote.rs +++ b/tests/integrations/raftstore/test_prevote.rs @@ -50,7 +50,7 @@ fn test_prevote( cluster.cfg.raft_store.hibernate_regions = false; // To stable the test, we use a large election timeout to make // leader's readiness get handle within an election timeout - configure_for_lease_read(cluster, Some(20), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(20), Some(10)); let leader_id = 1; let detect_during_failure = detect_during_failure.into(); diff --git a/tests/integrations/raftstore/test_region_info_accessor.rs b/tests/integrations/raftstore/test_region_info_accessor.rs index 9bff000194a..24d90b66327 100644 --- a/tests/integrations/raftstore/test_region_info_accessor.rs +++ b/tests/integrations/raftstore/test_region_info_accessor.rs @@ -172,7 +172,7 @@ fn test_region_info_accessor_impl(cluster: &mut Cluster, c: &Region #[test] fn test_node_cluster_region_info_accessor() { let mut cluster = new_node_cluster(1, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_replica_read.rs b/tests/integrations/raftstore/test_replica_read.rs index 6deccad3a5e..16fad00a59b 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -58,7 +58,7 @@ fn test_replica_read_not_applied() { let mut cluster = new_node_cluster(0, 3); // Increase the election tick to make this test case running reliably. 
- configure_for_lease_read(&mut cluster, Some(50), Some(30)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(30)); let max_lease = Duration::from_secs(1); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(max_lease); // After the leader has committed to its term, pending reads on followers can be @@ -129,7 +129,7 @@ fn test_replica_read_not_applied() { fn test_replica_read_on_hibernate() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); @@ -191,7 +191,7 @@ fn test_replica_read_on_hibernate() { fn test_read_hibernated_region() { let mut cluster = new_node_cluster(0, 3); // Initialize the cluster. - configure_for_lease_read(&mut cluster, Some(100), Some(8)); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(8)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(Duration::from_millis(1)); cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); cluster.pd_client.disable_default_operator(); @@ -254,7 +254,7 @@ fn test_read_hibernated_region() { fn test_replica_read_on_stale_peer() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), Some(30)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(30)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -288,7 +288,7 @@ fn test_read_index_out_of_order() { let mut cluster = new_node_cluster(0, 2); // Use long election timeout and short lease. 
- configure_for_lease_read(&mut cluster, Some(1000), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(1000), Some(10)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(Duration::from_millis(100)); @@ -328,7 +328,7 @@ fn test_read_index_retry_lock_checking() { let mut cluster = new_node_cluster(0, 2); // Use long election timeout and short lease. - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.cfg.raft_store.raft_store_max_leader_lease = ReadableDuration(Duration::from_millis(100)); @@ -402,7 +402,7 @@ fn test_split_isolation() { let mut cluster = new_node_cluster(0, 2); // Use long election timeout and short lease. configure_for_hibernate(&mut cluster); - configure_for_lease_read(&mut cluster, Some(50), Some(20)); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(11); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -461,7 +461,7 @@ fn test_split_isolation() { #[test] fn test_read_local_after_snapshpot_replace_peer() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), None); + configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); @@ -527,7 +527,7 @@ fn test_read_local_after_snapshpot_replace_peer() { #[test] fn test_malformed_read_index() { let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(50), None); + configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); cluster.cfg.raft_store.hibernate_regions = true; diff --git a/tests/integrations/raftstore/test_single.rs b/tests/integrations/raftstore/test_single.rs index 
b7fcb6a7b34..7fedc3c1cd4 100644 --- a/tests/integrations/raftstore/test_single.rs +++ b/tests/integrations/raftstore/test_single.rs @@ -3,15 +3,18 @@ use std::time::Duration; use engine_traits::{CfName, CF_DEFAULT, CF_WRITE}; -use raftstore::store::*; +use raftstore::store::RAFT_INIT_LOG_INDEX; use rand::prelude::*; -use test_raftstore::*; +use test_raftstore::{new_put_cf_cmd, new_put_cmd, new_request, sleep_ms}; use test_raftstore_macro::test_case; use tikv_util::{config::*, time::Instant}; // TODO add epoch not match test cases. -fn test_delete_range(cluster: &mut Cluster, cf: CfName) { +fn test_delete_range( + cluster: &mut test_raftstore::Cluster, + cf: CfName, +) { let data_set: Vec<_> = (1..500) .map(|i| { ( @@ -44,19 +47,10 @@ fn test_delete_range(cluster: &mut Cluster, cf: CfName) { } } -fn test_put_large_entry(cluster: &mut Cluster) { - let max_size: usize = 1024; - cluster.cfg.raft_store.raft_entry_max_size = ReadableSize(max_size as u64); - - cluster.run(); - - let large_value = vec![b'v'; max_size + 1]; - let res = cluster.put(b"key", large_value.as_slice()); - assert!(res.as_ref().err().unwrap().has_raft_entry_too_large()); -} - #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_put() { let mut cluster = new_cluster(0, 1); cluster.run(); @@ -103,6 +97,8 @@ fn test_put() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_delete() { let mut cluster = new_cluster(0, 1); cluster.run(); @@ -133,7 +129,7 @@ fn test_delete() { #[test] fn test_node_use_delete_range() { - let mut cluster = new_node_cluster(0, 1); + let mut cluster = test_raftstore::new_node_cluster(0, 1); cluster.cfg.raft_store.use_delete_range = true; cluster.run(); 
test_delete_range(&mut cluster, CF_DEFAULT); @@ -143,7 +139,7 @@ fn test_node_use_delete_range() { #[test] fn test_node_not_use_delete_range() { - let mut cluster = new_node_cluster(0, 1); + let mut cluster = test_raftstore::new_node_cluster(0, 1); cluster.cfg.raft_store.use_delete_range = false; cluster.run(); test_delete_range(&mut cluster, CF_DEFAULT); @@ -153,6 +149,8 @@ fn test_node_not_use_delete_range() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_wrong_store_id() { let mut cluster = new_cluster(0, 1); cluster.run(); @@ -178,21 +176,25 @@ fn test_wrong_store_id() { ); } -#[test] -fn test_node_put_large_entry() { - let mut cluster = new_node_cluster(0, 1); - test_put_large_entry(&mut cluster); -} +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_put_large_entry() { + let mut cluster = new_cluster(0, 1); + let max_size: usize = 1024; + cluster.cfg.raft_store.raft_entry_max_size = ReadableSize(max_size as u64); -#[test] -fn test_server_put_large_entry() { - let mut cluster = new_server_cluster(0, 1); - test_put_large_entry(&mut cluster); + cluster.run(); + + let large_value = vec![b'v'; max_size + 1]; + let res = cluster.put(b"key", large_value.as_slice()); + assert!(res.as_ref().err().unwrap().has_raft_entry_too_large()); } #[test] fn test_node_apply_no_op() { - let mut cluster = new_node_cluster(0, 1); + let mut cluster = test_raftstore::new_node_cluster(0, 1); cluster.pd_client.disable_default_operator(); cluster.run(); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 8d3212ad4a6..d18f42ec8ca 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ 
b/tests/integrations/raftstore/test_snap.rs @@ -100,7 +100,7 @@ fn test_server_huge_snapshot_multi_files() { fn test_server_snap_gc_internal(version: &str) { let mut cluster = new_server_cluster(0, 3); - configure_for_snapshot(&mut cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.pd_client.reset_version(version); cluster.cfg.raft_store.snap_gc_timeout = ReadableDuration::millis(300); cluster.cfg.raft_store.max_snapshot_file_raw_size = ReadableSize::mb(100); @@ -269,7 +269,7 @@ fn test_server_concurrent_snap() { } fn test_cf_snapshot(cluster: &mut Cluster) { - configure_for_snapshot(cluster); + configure_for_snapshot(&mut cluster.cfg); cluster.run(); let cf = "lock"; @@ -443,7 +443,7 @@ impl Filter for SnapshotAppendFilter { } fn test_snapshot_with_append(cluster: &mut Cluster) { - configure_for_snapshot(cluster); + configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 23c3b0b41c2..4b7914f7324 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -17,6 +17,7 @@ use raftstore::{ Result, }; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::config::*; use txn_types::{Key, PessimisticLock}; @@ -24,88 +25,83 @@ use txn_types::{Key, PessimisticLock}; pub const REGION_MAX_SIZE: u64 = 50000; pub const REGION_SPLIT_SIZE: u64 = 30000; -fn test_base_split_region(cluster: &mut Cluster, split: F, right_derive: bool) -where - T: Simulator, - F: Fn(&mut Cluster, &metapb::Region, &[u8]), -{ - cluster.cfg.raft_store.right_derive_when_split = right_derive; - cluster.run(); - - let pd_client = Arc::clone(&cluster.pd_client); - - let tbls = vec![ - (b"k22", b"k11", b"k33"), - (b"k11", b"k00", b"k11"), - (b"k33", b"k22", b"k33"), - ]; - - 
for (split_key, left_key, right_key) in tbls { - cluster.must_put(left_key, b"v1"); - cluster.must_put(right_key, b"v3"); - - // Left and right key must be in same region before split. - let region = pd_client.get_region(left_key).unwrap(); - let region2 = pd_client.get_region(right_key).unwrap(); - assert_eq!(region.get_id(), region2.get_id()); - - // Split with split_key, so left_key must in left, and right_key in right. - split(cluster, ®ion, split_key); - - let left = pd_client.get_region(left_key).unwrap(); - let right = pd_client.get_region(right_key).unwrap(); - - assert_eq!( - region.get_id(), - if right_derive { - right.get_id() - } else { - left.get_id() - } - ); - assert_eq!(region.get_start_key(), left.get_start_key()); - assert_eq!(left.get_end_key(), right.get_start_key()); - assert_eq!(region.get_end_key(), right.get_end_key()); - - cluster.must_put(left_key, b"vv1"); - assert_eq!(cluster.get(left_key).unwrap(), b"vv1".to_vec()); - - cluster.must_put(right_key, b"vv3"); - assert_eq!(cluster.get(right_key).unwrap(), b"vv3".to_vec()); - - let epoch = left.get_region_epoch().clone(); - let get = new_request(left.get_id(), epoch, vec![new_get_cmd(right_key)], false); - debug!("requesting {:?}", get); - let resp = cluster - .call_command_on_leader(get, Duration::from_secs(5)) - .unwrap(); - assert!(resp.get_header().has_error(), "{:?}", resp); - assert!( - resp.get_header().get_error().has_key_not_in_region(), - "{:?}", - resp - ); - } -} - -#[test] -fn test_server_base_split_region_left_derive() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_base_split_region(&mut cluster, Cluster::must_split, false); -} +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_server_base_split_region() { + let test_base_split_region = |right_derive| { + let count = 5; + let mut cluster = new_cluster(0, count); + + cluster.cfg.raft_store.right_derive_when_split = right_derive; + 
cluster.run(); + + let pd_client = Arc::clone(&cluster.pd_client); + + let tbls = vec![ + (b"k22", b"k11", b"k33"), + (b"k11", b"k00", b"k11"), + (b"k33", b"k22", b"k33"), + ]; + + for (split_key, left_key, right_key) in tbls { + cluster.must_put(left_key, b"v1"); + cluster.must_put(right_key, b"v3"); + + // Left and right key must be in same region before split. + let region = pd_client.get_region(left_key).unwrap(); + let region2 = pd_client.get_region(right_key).unwrap(); + assert_eq!(region.get_id(), region2.get_id()); + + // Split with split_key, so left_key must in left, and right_key in right. + cluster.must_split(®ion, split_key); + + let left = pd_client.get_region(left_key).unwrap(); + let right = pd_client.get_region(right_key).unwrap(); + + assert_eq!( + region.get_id(), + if right_derive { + right.get_id() + } else { + left.get_id() + } + ); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(left.get_end_key(), right.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + cluster.must_put(left_key, b"vv1"); + assert_eq!(cluster.get(left_key).unwrap(), b"vv1".to_vec()); + + cluster.must_put(right_key, b"vv3"); + assert_eq!(cluster.get(right_key).unwrap(), b"vv3".to_vec()); + + let epoch = left.get_region_epoch().clone(); + let get = new_request(left.get_id(), epoch, vec![new_get_cmd(right_key)], false); + debug!("requesting {:?}", get); + let resp = cluster + .call_command_on_leader(get, Duration::from_secs(5)) + .unwrap(); + assert!(resp.get_header().has_error(), "{:?}", resp); + assert!( + resp.get_header().get_error().has_key_not_in_region(), + "{:?}", + resp + ); + } + }; -#[test] -fn test_server_base_split_region_right_derive() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_base_split_region(&mut cluster, Cluster::must_split, true); + // left derive + test_base_split_region(false); + // right derive + test_base_split_region(true); } -#[test] 
+#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_server_split_region_twice() { let count = 5; - let mut cluster = new_server_cluster(0, count); + let mut cluster = new_cluster(0, count); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -308,13 +304,13 @@ fn check_cluster(cluster: &mut Cluster, k: &[u8], v: &[u8], all_ /// sure broadcast commit is disabled when split. #[test] fn test_delay_split_region() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = test_raftstore::new_server_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(500); cluster.cfg.raft_store.merge_max_log_gap = 100; cluster.cfg.raft_store.raft_log_gc_threshold = 500; // To stable the test, we use a large hearbeat timeout 200ms(100ms * 2). // And to elect leader quickly, set election timeout to 1s(100ms * 10). - configure_for_lease_read(&mut cluster, Some(100), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); // We use three nodes for this test. cluster.run(); @@ -359,7 +355,12 @@ fn test_delay_split_region() { check_cluster(&mut cluster, b"k6", b"v6", false); } -fn test_split_overlap_snapshot(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_node_split_overlap_snapshot() { + let mut cluster = new_cluster(0, 3); // We use three nodes([1, 2, 3]) for this test. 
cluster.run(); @@ -410,19 +411,9 @@ fn test_split_overlap_snapshot(cluster: &mut Cluster) { must_get_equal(&engine3, b"k3", b"v3"); } -#[test] -fn test_node_split_overlap_snapshot() { - let mut cluster = new_node_cluster(0, 3); - test_split_overlap_snapshot(&mut cluster); -} - -#[test] -fn test_server_split_overlap_snapshot() { - let mut cluster = new_server_cluster(0, 3); - test_split_overlap_snapshot(&mut cluster); -} - -fn test_apply_new_version_snapshot(cluster: &mut Cluster) { +fn test_apply_new_version_snapshot( + cluster: &mut test_raftstore::Cluster, +) { // truncate the log quickly so that we can force sending snapshot. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(5); @@ -477,19 +468,19 @@ fn test_apply_new_version_snapshot(cluster: &mut Cluster) { #[test] fn test_node_apply_new_version_snapshot() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = test_raftstore::new_node_cluster(0, 3); test_apply_new_version_snapshot(&mut cluster); } #[test] fn test_server_apply_new_version_snapshot() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = test_raftstore::new_server_cluster(0, 3); test_apply_new_version_snapshot(&mut cluster); } #[test] fn test_server_split_with_stale_peer() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = test_raftstore::new_server_cluster(0, 3); // disable raft log gc. 
cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); @@ -557,7 +548,11 @@ fn test_server_split_with_stale_peer() { must_get_equal(&engine3, b"k3", b"v3"); } -fn test_split_region_diff_check(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +fn test_split_region_diff_check() { + let count = 1; + let mut cluster = new_cluster(0, count); let region_max_size = 2000; let region_split_size = 1000; cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); @@ -575,7 +570,7 @@ fn test_split_region_diff_check(cluster: &mut Cluster) { // The default size index distance is too large for small data, we flush // multiple times to generate more size index handles. for _ in 0..10 { - put_till_size(cluster, region_max_size, &mut range); + put_till_size(&mut cluster, region_max_size, &mut range); } // Peer will split when size of region meet region_max_size, so assume the last @@ -602,20 +597,6 @@ fn test_split_region_diff_check(cluster: &mut Cluster) { } } -#[test] -fn test_server_split_region_diff_check() { - let count = 1; - let mut cluster = new_server_cluster(0, count); - test_split_region_diff_check(&mut cluster); -} - -#[test] -fn test_node_split_region_diff_check() { - let count = 1; - let mut cluster = new_node_cluster(0, count); - test_split_region_diff_check(&mut cluster); -} - // Test steps // set max region size/split size 2000 and put data till 1000 // set max region size/split size < 1000 and reboot @@ -623,7 +604,7 @@ fn test_node_split_region_diff_check() { #[test] fn test_node_split_region_after_reboot_with_config_change() { let count = 1; - let mut cluster = new_server_cluster(0, count); + let mut cluster = test_raftstore::new_server_cluster(0, count); let region_max_size = 2000; let region_split_size = 2000; 
cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); @@ -664,7 +645,10 @@ fn test_node_split_region_after_reboot_with_config_change() { } } -fn test_split_epoch_not_match(cluster: &mut Cluster, right_derive: bool) { +fn test_split_epoch_not_match( + cluster: &mut test_raftstore::Cluster, + right_derive: bool, +) { cluster.cfg.raft_store.right_derive_when_split = right_derive; cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -736,33 +720,39 @@ fn test_split_epoch_not_match(cluster: &mut Cluster, right_deri #[test] fn test_server_split_epoch_not_match_left_derive() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = test_raftstore::new_server_cluster(0, 3); test_split_epoch_not_match(&mut cluster, false); } #[test] fn test_server_split_epoch_not_match_right_derive() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = test_raftstore::new_server_cluster(0, 3); test_split_epoch_not_match(&mut cluster, true); } #[test] fn test_node_split_epoch_not_match_left_derive() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = test_raftstore::new_node_cluster(0, 3); test_split_epoch_not_match(&mut cluster, false); } #[test] fn test_node_split_epoch_not_match_right_derive() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = test_raftstore::new_node_cluster(0, 3); test_split_epoch_not_match(&mut cluster, true); } -// For the peer which is the leader of the region before split, it should -// campaigns immediately. and then this peer may take the leadership -// earlier. `test_quick_election_after_split` is a helper function for testing -// this feature. 
-fn test_quick_election_after_split(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_node_quick_election_after_split() { + let mut cluster = new_cluster(0, 3); + + // For the peer which is the leader of the region before split, it should + // campaigns immediately. and then this peer may take the leadership + // earlier. `test_quick_election_after_split` is a helper function for testing + // this feature. // Calculate the reserved time before a new campaign after split. let reserved_time = Duration::from_millis(cluster.cfg.raft_store.raft_base_tick_interval.as_millis() * 2); @@ -790,33 +780,10 @@ fn test_quick_election_after_split(cluster: &mut Cluster) { assert!(new_leader.is_some()); } -#[test] -fn test_node_quick_election_after_split() { - let mut cluster = new_node_cluster(0, 3); - test_quick_election_after_split(&mut cluster); -} - -#[test] -fn test_server_quick_election_after_split() { - let mut cluster = new_server_cluster(0, 3); - test_quick_election_after_split(&mut cluster); -} - #[test] fn test_node_split_region() { let count = 5; - let mut cluster = new_node_cluster(0, count); - test_split_region(&mut cluster); -} - -#[test] -fn test_server_split_region() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_split_region(&mut cluster); -} - -fn test_split_region(cluster: &mut Cluster) { + let mut cluster = test_raftstore::new_node_cluster(0, count); // length of each key+value let item_len = 74; // make bucket's size to item_len, which means one row one bucket @@ -825,8 +792,8 @@ fn test_split_region(cluster: &mut Cluster) { cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); let region = pd_client.get_region(b"").unwrap(); - let mid_key = put_till_size(cluster, 11 * item_len, &mut range); - let max_key = put_till_size(cluster, 9 * 
item_len, &mut range); + let mid_key = put_till_size(&mut cluster, 11 * item_len, &mut range); + let max_key = put_till_size(&mut cluster, 9 * item_len, &mut range); let target = pd_client.get_region(&max_key).unwrap(); assert_eq!(region, target); pd_client.must_split_region(target, pdpb::CheckPolicy::Scan, vec![]); @@ -852,11 +819,12 @@ fn test_split_region(cluster: &mut Cluster) { assert_eq!(y2.get_end_key(), b""); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_split_update_region_right_derive() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // Election timeout and max leader lease is 1s. - configure_for_lease_read(&mut cluster, Some(100), Some(10)); + configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); cluster.run(); @@ -906,9 +874,10 @@ fn test_node_split_update_region_right_derive() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_split_with_epoch_not_match() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -940,9 +909,10 @@ fn test_split_with_epoch_not_match() { assert!(resp.get_header().get_error().has_epoch_not_match()); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_split_with_in_memory_pessimistic_locks() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1018,7 +988,7 @@ fn test_split_with_in_memory_pessimistic_locks() { #[test] fn test_refresh_region_bucket_keys() { let count = 5; - let mut cluster = new_server_cluster(0, count); + let mut cluster = test_raftstore::new_server_cluster(0, count); cluster.run(); let pd_client = 
Arc::clone(&cluster.pd_client); @@ -1204,7 +1174,7 @@ fn test_refresh_region_bucket_keys() { #[test] fn test_gen_split_check_bucket_ranges() { let count = 5; - let mut cluster = new_server_cluster(0, count); + let mut cluster = test_raftstore::new_server_cluster(0, count); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); cluster.cfg.coprocessor.enable_region_bucket = true; // disable report buckets; as it will reset the user traffic stats to randomize diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index b4f8c33d54d..b97191d1a13 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -4,18 +4,18 @@ use std::{sync::Arc, thread, time::Duration}; use api_version::{test_kv_format_impl, KvFormat}; use engine_traits::CF_LOCK; -use kvproto::kvrpcpb::Context; use raft::eraftpb::MessageType; use raftstore::store::LocksStatus; use test_raftstore::*; -use tikv::storage::{ - kv::{SnapContext, SnapshotExt}, - Engine, Snapshot, -}; +use test_raftstore_macro::test_case; +use tikv::storage::Snapshot; use tikv_util::config::*; use txn_types::{Key, PessimisticLock}; -fn test_basic_transfer_leader(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_server_basic_transfer_leader() { + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.raft_heartbeat_ticks = 20; let reserved_time = Duration::from_millis( cluster.cfg.raft_store.raft_base_tick_interval.as_millis() @@ -64,13 +64,10 @@ fn test_basic_transfer_leader(cluster: &mut Cluster) { assert!(resp.get_header().get_error().has_not_leader()); } -#[test] -fn test_server_basic_transfer_leader() { - let mut cluster = new_server_cluster(0, 3); - test_basic_transfer_leader(&mut cluster); -} - -fn test_pd_transfer_leader(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_server_cluster)] 
+#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_server_pd_transfer_leader() { + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -118,7 +115,10 @@ fn test_pd_transfer_leader(cluster: &mut Cluster) { } } -fn test_pd_transfer_leader_multi_target(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_server_pd_transfer_leader_multi_target() { + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -155,6 +155,11 @@ fn test_pd_transfer_leader_multi_target(cluster: &mut Cluster) } } + // Give some time for leader to commit the first entry + // todo: It shouldn't need this, but for now and for v2, without it, the test is + // not stable. + thread::sleep(Duration::from_millis(100)); + // call command on this leader directly, must successfully. let mut req = new_request( region.get_id(), @@ -170,19 +175,9 @@ fn test_pd_transfer_leader_multi_target(cluster: &mut Cluster) assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); } -#[test] -fn test_server_pd_transfer_leader() { - let mut cluster = new_server_cluster(0, 3); - test_pd_transfer_leader(&mut cluster); -} - -#[test] -fn test_server_pd_transfer_leader_multi_target() { - let mut cluster = new_server_cluster(0, 3); - test_pd_transfer_leader_multi_target(&mut cluster); -} - -fn test_transfer_leader_during_snapshot(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_server_cluster)] +fn test_server_transfer_leader_during_snapshot() { + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); // Disable default max peer count check. 
pd_client.disable_default_operator(); @@ -225,71 +220,42 @@ fn test_transfer_leader_during_snapshot(cluster: &mut Cluster) must_get_equal(&cluster.get_engine(1), b"k1", b"v1"); } -#[test] -fn test_server_transfer_leader_during_snapshot() { - let mut cluster = new_server_cluster(0, 3); - test_transfer_leader_during_snapshot(&mut cluster); -} - -#[test] +#[test_case(test_raftstore::new_server_cluster_with_api_ver)] +#[test_case(test_raftstore_v2::new_server_cluster_with_api_ver)] fn test_sync_max_ts_after_leader_transfer() { - test_kv_format_impl!(test_sync_max_ts_after_leader_transfer_impl); -} - -fn test_sync_max_ts_after_leader_transfer_impl() { - let mut cluster = new_server_cluster_with_api_ver(0, 3, F::TAG); - cluster.cfg.raft_store.raft_heartbeat_ticks = 20; - cluster.run(); - - let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); - let mut storage = cluster - .sim - .read() - .unwrap() - .storages - .get(&1) - .unwrap() - .clone(); - let mut wait_for_synced = |cluster: &mut Cluster| { - let region_id = 1; - let leader = cluster.leader_of_region(region_id).unwrap(); - let epoch = cluster.get_region_epoch(region_id); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader); - ctx.set_region_epoch(epoch); - let snap_ctx = SnapContext { - pb_ctx: &ctx, - ..Default::default() - }; - let snapshot = storage.snapshot(snap_ctx).unwrap(); - let txn_ext = snapshot.txn_ext.clone().unwrap(); - for retry in 0..10 { - if txn_ext.is_max_ts_synced() { - break; - } - thread::sleep(Duration::from_millis(1 << retry)); - } - assert!(snapshot.ext().is_max_ts_synced()); - }; - - cluster.must_transfer_leader(1, new_peer(1, 1)); - wait_for_synced(&mut cluster); - let max_ts = cm.max_ts(); - - cluster.pd_client.trigger_tso_failure(); - // Transfer the leader out and back - cluster.must_transfer_leader(1, new_peer(2, 2)); - cluster.must_transfer_leader(1, new_peer(1, 1)); + // This method should be modified with + // 
`test_sync_max_ts_after_leader_transfer_impl_v2` simultaneously + fn test_sync_max_ts_after_leader_transfer_impl() { + let mut cluster = new_cluster(0, 3, F::TAG); + cluster.cfg.raft_store.raft_heartbeat_ticks = 20; + cluster.run(); + + let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); + cluster.must_transfer_leader(1, new_peer(1, 1)); + // Give some time for leader to commit the first entry + // todo: It shouldn't need this, but for now and for v2, without it, the test is + // not stable. + thread::sleep(Duration::from_millis(100)); + wait_for_synced(&mut cluster, 1, 1); + let max_ts = cm.max_ts(); + + cluster.pd_client.trigger_tso_failure(); + // Transfer the leader out and back + cluster.must_transfer_leader(1, new_peer(2, 2)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + wait_for_synced(&mut cluster, 1, 1); + let new_max_ts = cm.max_ts(); + assert!(new_max_ts > max_ts); + } - wait_for_synced(&mut cluster); - let new_max_ts = cm.max_ts(); - assert!(new_max_ts > max_ts); + test_kv_format_impl!(test_sync_max_ts_after_leader_transfer_impl); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_propose_in_memory_pessimistic_locks() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.raft_heartbeat_ticks = 20; cluster.run(); @@ -328,9 +294,10 @@ fn test_propose_in_memory_pessimistic_locks() { assert_eq!(value, lock.into_lock().to_bytes()); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_memory_pessimistic_locks_status_after_transfer_leader_failure() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.raft_heartbeat_ticks = 20; cluster.cfg.raft_store.reactive_memory_lock_tick_interval = ReadableDuration::millis(200); cluster.cfg.raft_store.reactive_memory_lock_timeout_tick = 3; diff --git 
a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index a2c2ea75c64..e3f22afe6d9 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -1160,7 +1160,7 @@ fn test_force_leader_multiple_election_rounds() { #[test] fn test_unsafe_recovery_has_commit_merge() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); @@ -1218,7 +1218,7 @@ fn test_unsafe_recovery_has_commit_merge() { #[test] fn test_unsafe_recovery_during_merge() { let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster); + configure_for_merge(&mut cluster.cfg); cluster.run(); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 01993fb89cd..3dcdab0cf6b 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -276,7 +276,7 @@ fn test_read_on_replica_check_memory_locks() { fn test_invalid_read_index_when_no_leader() { // Initialize cluster let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster, Some(10), Some(6)); + configure_for_lease_read(&mut cluster.cfg, Some(10), Some(6)); cluster.cfg.raft_store.raft_heartbeat_ticks = 1; cluster.cfg.raft_store.hibernate_regions = false; let pd_client = Arc::clone(&cluster.pd_client); From c4b38e8fa7db5392bb5b424856c9066f95a23886 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 14 Feb 2023 10:50:01 +0800 Subject: [PATCH 523/676] Raftstore-v2: support split operator (#14199) ref tikv/tikv#12842 Support split operator got from PD. 
Signed-off-by: SpadeA-Tang --- components/raftstore-v2/src/fsm/peer.rs | 4 ++ .../src/operation/command/admin/mod.rs | 4 +- .../src/operation/command/admin/split.rs | 60 ++++++++++++++++++- .../raftstore-v2/src/operation/command/mod.rs | 4 +- components/raftstore-v2/src/operation/mod.rs | 2 +- components/raftstore-v2/src/router/message.rs | 6 +- .../raftstore-v2/src/worker/pd/region.rs | 49 ++++++++++++++- .../raftstore/test_split_region.rs | 38 ++++++------ 8 files changed, 137 insertions(+), 30 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 47d23a67d1d..1b127e5851b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -305,6 +305,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::RequestHalfSplit { request, ch } => self + .fsm + .peer_mut() + .on_request_half_split(self.store_ctx, request, ch), PeerMsg::UpdateRegionSize { size } => { self.fsm.peer_mut().on_update_region_size(size) } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index a912cb7a3d5..ca91e597cb9 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -15,8 +15,8 @@ use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; pub use split::{ - report_split_init_finish, temp_split_path, RequestSplit, SplitFlowControl, SplitInit, - SPLIT_PREFIX, + report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, + SplitInit, SPLIT_PREFIX, }; use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs 
b/components/raftstore-v2/src/operation/command/admin/split.rs index 0fbe31277ed..86b0aab558e 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -53,7 +53,7 @@ use raftstore::{ }, Result, }; -use slog::{error, info}; +use slog::{error, info, warn}; use tikv_util::{log::SlogFormat, slog_panic}; use crate::{ @@ -149,6 +149,15 @@ pub struct RequestSplit { pub source: Cow<'static, str>, } +#[derive(Debug)] +pub struct RequestHalfSplit { + pub epoch: RegionEpoch, + pub start_key: Option>, + pub end_key: Option>, + pub policy: CheckPolicy, + pub source: Cow<'static, str>, +} + #[derive(Default, Debug)] pub struct SplitFlowControl { size_diff_hint: i64, @@ -280,6 +289,55 @@ impl Peer { self.ask_batch_split_pd(ctx, rs.split_keys, ch); } + pub fn on_request_half_split( + &mut self, + ctx: &mut StoreContext, + rhs: RequestHalfSplit, + _ch: CmdResChannel, + ) { + let is_key_range = rhs.start_key.is_some() && rhs.end_key.is_some(); + info!( + self.logger, + "on half split"; + "is_key_range" => is_key_range, + "policy" => ?rhs.policy, + "source" => ?rhs.source, + ); + if !self.is_leader() { + // region on this store is no longer leader, skipped. 
+ info!(self.logger, "not leader, skip."); + return; + } + + let region = self.region(); + if util::is_epoch_stale(&rhs.epoch, region.get_region_epoch()) { + warn!( + self.logger, + "receive a stale halfsplit message"; + "is_key_range" => is_key_range, + ); + return; + } + + let task = SplitCheckTask::split_check_key_range( + region.clone(), + rhs.start_key, + rhs.end_key, + false, + rhs.policy, + // todo: bucket range + None, + ); + if let Err(e) = ctx.schedulers.split_check.schedule(task) { + error!( + self.logger, + "failed to schedule split check"; + "is_key_range" => is_key_range, + "err" => %e, + ); + } + } + pub fn propose_split( &mut self, store_ctx: &mut StoreContext, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index d887af7d6d6..d06e43c0303 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -59,8 +59,8 @@ mod control; mod write; pub use admin::{ - report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, - SplitFlowControl, SplitInit, SPLIT_PREFIX, + report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, RequestHalfSplit, + RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 9cdd78dcb4c..f022ab91109 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -9,7 +9,7 @@ mod txn_ext; pub use command::{ AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, - RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, + RequestHalfSplit, RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; pub use 
life::{DestroyProgress, GcPeerContext}; diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 91efc54c867..04bc5dbab10 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -17,7 +17,7 @@ use super::{ }, ApplyRes, }; -use crate::operation::{RequestSplit, SimpleWriteBinary, SplitInit}; +use crate::operation::{RequestHalfSplit, RequestSplit, SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -185,6 +185,10 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + RequestHalfSplit { + request: RequestHalfSplit, + ch: CmdResChannel, + }, UpdateRegionSize { size: u64, }, diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index d282534329b..c862d1f208b 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -10,10 +10,14 @@ use pd_client::{ }; use raftstore::store::{ReadStats, WriteStats}; use resource_metering::RawRecords; -use slog::{debug, info}; +use slog::{debug, error, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; use super::{requests::*, Runner}; +use crate::{ + operation::{RequestHalfSplit, RequestSplit}, + router::{CmdResChannel, PeerMsg}, +}; pub struct RegionHeartbeatTask { pub term: u64, @@ -276,8 +280,47 @@ where ); send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_split_region() { - // TODO - info!(logger, "pd asks for split but ignored"); + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["split region"]) + .inc(); + + let mut split_region = resp.take_split_region(); + info!( + logger, + "try to split"; + "region_id" => region_id, + "region_epoch" => ?epoch, + ); + + let (ch, _) = CmdResChannel::pair(); + let msg = if split_region.get_policy() == pdpb::CheckPolicy::Usekey { + PeerMsg::RequestSplit { + request: 
RequestSplit { + epoch, + split_keys: split_region.take_keys().into(), + source: "pd".into(), + }, + ch, + } + } else { + PeerMsg::RequestHalfSplit { + request: RequestHalfSplit { + epoch, + start_key: None, + end_key: None, + policy: split_region.get_policy(), + source: "pd".into(), + }, + ch, + } + }; + if let Err(e) = router.send(region_id, msg) { + error!(logger, + "send split request failed"; + "region_id" => region_id, + "err" => ?e + ); + } } else if resp.has_merge() { // TODO info!(logger, "pd asks for merge but ignored"); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 4b7914f7324..55cc642aca1 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -304,7 +304,7 @@ fn check_cluster(cluster: &mut Cluster, k: &[u8], v: &[u8], all_ /// sure broadcast commit is disabled when split. #[test] fn test_delay_split_region() { - let mut cluster = test_raftstore::new_server_cluster(0, 3); + let mut cluster = new_server_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(500); cluster.cfg.raft_store.merge_max_log_gap = 100; cluster.cfg.raft_store.raft_log_gc_threshold = 500; @@ -411,9 +411,7 @@ fn test_node_split_overlap_snapshot() { must_get_equal(&engine3, b"k3", b"v3"); } -fn test_apply_new_version_snapshot( - cluster: &mut test_raftstore::Cluster, -) { +fn test_apply_new_version_snapshot(cluster: &mut Cluster) { // truncate the log quickly so that we can force sending snapshot. 
cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(5); @@ -468,19 +466,19 @@ fn test_apply_new_version_snapshot( #[test] fn test_node_apply_new_version_snapshot() { - let mut cluster = test_raftstore::new_node_cluster(0, 3); + let mut cluster = new_node_cluster(0, 3); test_apply_new_version_snapshot(&mut cluster); } #[test] fn test_server_apply_new_version_snapshot() { - let mut cluster = test_raftstore::new_server_cluster(0, 3); + let mut cluster = new_server_cluster(0, 3); test_apply_new_version_snapshot(&mut cluster); } #[test] fn test_server_split_with_stale_peer() { - let mut cluster = test_raftstore::new_server_cluster(0, 3); + let mut cluster = new_server_cluster(0, 3); // disable raft log gc. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); @@ -604,7 +602,7 @@ fn test_split_region_diff_check() { #[test] fn test_node_split_region_after_reboot_with_config_change() { let count = 1; - let mut cluster = test_raftstore::new_server_cluster(0, count); + let mut cluster = new_server_cluster(0, count); let region_max_size = 2000; let region_split_size = 2000; cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); @@ -645,10 +643,7 @@ fn test_node_split_region_after_reboot_with_config_change() { } } -fn test_split_epoch_not_match( - cluster: &mut test_raftstore::Cluster, - right_derive: bool, -) { +fn test_split_epoch_not_match(cluster: &mut Cluster, right_derive: bool) { cluster.cfg.raft_store.right_derive_when_split = right_derive; cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -720,25 +715,25 @@ fn test_split_epoch_not_match( #[test] fn test_server_split_epoch_not_match_left_derive() { - let mut cluster = test_raftstore::new_server_cluster(0, 3); + let mut cluster = new_server_cluster(0, 3); 
test_split_epoch_not_match(&mut cluster, false); } #[test] fn test_server_split_epoch_not_match_right_derive() { - let mut cluster = test_raftstore::new_server_cluster(0, 3); + let mut cluster = new_server_cluster(0, 3); test_split_epoch_not_match(&mut cluster, true); } #[test] fn test_node_split_epoch_not_match_left_derive() { - let mut cluster = test_raftstore::new_node_cluster(0, 3); + let mut cluster = new_node_cluster(0, 3); test_split_epoch_not_match(&mut cluster, false); } #[test] fn test_node_split_epoch_not_match_right_derive() { - let mut cluster = test_raftstore::new_node_cluster(0, 3); + let mut cluster = new_node_cluster(0, 3); test_split_epoch_not_match(&mut cluster, true); } @@ -780,10 +775,13 @@ fn test_node_quick_election_after_split() { assert!(new_leader.is_some()); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_node_split_region() { let count = 5; - let mut cluster = test_raftstore::new_node_cluster(0, count); + let mut cluster = new_cluster(0, count); // length of each key+value let item_len = 74; // make bucket's size to item_len, which means one row one bucket @@ -988,7 +986,7 @@ fn test_split_with_in_memory_pessimistic_locks() { #[test] fn test_refresh_region_bucket_keys() { let count = 5; - let mut cluster = test_raftstore::new_server_cluster(0, count); + let mut cluster = new_server_cluster(0, count); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -1174,7 +1172,7 @@ fn test_refresh_region_bucket_keys() { #[test] fn test_gen_split_check_bucket_ranges() { let count = 5; - let mut cluster = test_raftstore::new_server_cluster(0, count); + let mut cluster = new_server_cluster(0, count); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); cluster.cfg.coprocessor.enable_region_bucket = true; // disable report buckets; as it will reset the user 
traffic stats to randomize From 6d17e254c4a284ce26c9b0bf0e4883fde6e593a1 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 14 Feb 2023 11:24:01 +0800 Subject: [PATCH 524/676] Raftstore-v2: use appropriate default region split size when integration test suit start (#14210) ref tikv/tikv#12842 use appropriate default region split size when integration test suit start Signed-off-by: SpadeA-Tang Signed-off-by: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> --- components/test_raftstore-v2/src/node.rs | 10 +-- components/test_raftstore-v2/src/server.rs | 2 +- .../raftstore/test_split_region.rs | 73 ++++++++++--------- 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 96275cc8383..f8c8d84bc9b 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -190,7 +190,7 @@ impl Simulator for NodeCluster { let mut raft_store = cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size.unwrap_or_default(), + cfg.coprocessor.region_split_size(), cfg.coprocessor.enable_region_bucket, cfg.coprocessor.region_bucket_size, ) @@ -285,16 +285,12 @@ impl Simulator for NodeCluster { assert!(node_id == 0 || node_id == node.id()); let node_id = node.id(); - let region_split_size = cfg.coprocessor.region_split_size; + let region_split_size = cfg.coprocessor.region_split_size(); let enable_region_bucket = cfg.coprocessor.enable_region_bucket; let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; raftstore_cfg - .validate( - region_split_size.unwrap_or_default(), - enable_region_bucket, - region_bucket_size, - ) + .validate(region_split_size, enable_region_bucket, region_bucket_size) .unwrap(); // let raft_store = Arc::new(VersionTrack::new(raftstore_cfg)); diff --git a/components/test_raftstore-v2/src/server.rs 
b/components/test_raftstore-v2/src/server.rs index e64844bb490..64e05d6b766 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -196,7 +196,7 @@ impl ServerCluster { let mut raft_store = cfg.raft_store.clone(); raft_store .validate( - cfg.coprocessor.region_split_size.unwrap_or_default(), + cfg.coprocessor.region_split_size(), cfg.coprocessor.enable_region_bucket, cfg.coprocessor.region_bucket_size, ) diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 55cc642aca1..20a7c3f503a 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -266,45 +266,48 @@ impl Filter for EraseHeartbeatCommit { } } -fn check_cluster(cluster: &mut Cluster, k: &[u8], v: &[u8], all_committed: bool) { - let region = cluster.pd_client.get_region(k).unwrap(); - let mut tried_cnt = 0; - let leader = loop { - match cluster.leader_of_region(region.get_id()) { - None => { - tried_cnt += 1; - if tried_cnt >= 3 { - panic!("leader should be elected"); +macro_rules! check_cluster { + ($cluster:expr, $k:expr, $v:expr, $all_committed:expr) => { + let region = $cluster.pd_client.get_region($k).unwrap(); + let mut tried_cnt = 0; + let leader = loop { + match $cluster.leader_of_region(region.get_id()) { + None => { + tried_cnt += 1; + if tried_cnt >= 3 { + panic!("leader should be elected"); + } + continue; } - continue; + Some(l) => break l, } - Some(l) => break l, - } - }; - let mut missing_count = 0; - for i in 1..=region.get_peers().len() as u64 { - let engine = cluster.get_engine(i); - if all_committed || i == leader.get_store_id() { - must_get_equal(&engine, k, v); - } else { - // Note that a follower can still commit the log by an empty MsgAppend - // when bcast commit is disabled. A heartbeat response comes to leader - // before MsgAppendResponse will trigger MsgAppend. 
- match engine.get_value(&keys::data_key(k)).unwrap() { - Some(res) => assert_eq!(v, &res[..]), - None => missing_count += 1, + }; + let mut missing_count = 0; + for i in 1..=region.get_peers().len() as u64 { + let engine = $cluster.get_engine(i); + if $all_committed || i == leader.get_store_id() { + must_get_equal(&engine, $k, $v); + } else { + // Note that a follower can still commit the log by an empty MsgAppend + // when bcast commit is disabled. A heartbeat response comes to leader + // before MsgAppendResponse will trigger MsgAppend. + match engine.get_value(&keys::data_key($k)).unwrap() { + Some(res) => assert_eq!($v, &res[..]), + None => missing_count += 1, + } } } - } - assert!(all_committed || missing_count > 0); + assert!($all_committed || missing_count > 0); + }; } /// TiKV enables lazy broadcast commit optimization, which can delay split /// on follower node. So election of new region will delay. We need to make /// sure broadcast commit is disabled when split. -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_delay_split_region() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(500); cluster.cfg.raft_store.merge_max_log_gap = 100; cluster.cfg.raft_store.raft_log_gc_threshold = 500; @@ -323,8 +326,8 @@ fn test_delay_split_region() { cluster.must_put(b"k3", b"v3"); // Although skip bcast is enabled, but heartbeat will commit the log in period. 
- check_cluster(&mut cluster, b"k1", b"v1", true); - check_cluster(&mut cluster, b"k3", b"v3", true); + check_cluster!(cluster, b"k1", b"v1", true); + check_cluster!(cluster, b"k3", b"v3", true); cluster.must_transfer_leader(region.get_id(), new_peer(1, 1)); cluster.add_send_filter(CloneFilterFactory(EraseHeartbeatCommit)); @@ -333,14 +336,14 @@ fn test_delay_split_region() { sleep_ms(100); // skip bcast is enabled by default, so all followers should not commit // the log. - check_cluster(&mut cluster, b"k4", b"v4", false); + check_cluster!(cluster, b"k4", b"v4", false); cluster.must_transfer_leader(region.get_id(), new_peer(3, 3)); // New leader should flush old committed entries eagerly. - check_cluster(&mut cluster, b"k4", b"v4", true); + check_cluster!(cluster, b"k4", b"v4", true); cluster.must_put(b"k5", b"v5"); // New committed entries should be broadcast lazily. - check_cluster(&mut cluster, b"k5", b"v5", false); + check_cluster!(cluster, b"k5", b"v5", false); cluster.add_send_filter(CloneFilterFactory(EraseHeartbeatCommit)); let k2 = b"k2"; @@ -352,7 +355,7 @@ fn test_delay_split_region() { sleep_ms(100); // After split, skip bcast is enabled again, so all followers should not // commit the log. 
- check_cluster(&mut cluster, b"k6", b"v6", false); + check_cluster!(cluster, b"k6", b"v6", false); } #[test_case(test_raftstore::new_node_cluster)] From 8e5e5ea571411729bb2507c773ac1271d8d35fa1 Mon Sep 17 00:00:00 2001 From: lijie Date: Tue, 14 Feb 2023 14:01:32 +0800 Subject: [PATCH 525/676] chore: bump version to v6.7.0-alpha (#14216) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6083f14bad7..c8cf54fd4a9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6334,7 +6334,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.6.0-alpha" +version = "6.7.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 63be8944f5f..cf66773b576 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.6.0-alpha" +version = "6.7.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 54c5ec7f557ebe9947b391ddd8874b9debd7e89a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 15 Feb 2023 10:46:01 +0800 Subject: [PATCH 526/676] log-backup: make the download more finer-grained (#14203) close tikv/tikv#14206 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- components/sst_importer/src/metrics.rs | 10 + components/sst_importer/src/sst_importer.rs | 341 +++++++++++++++++--- 2 files changed, 306 insertions(+), 45 deletions(-) diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index e7eeefd3e82..6b4af299ba8 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -106,4 +106,14 @@ lazy_static! 
{ "The operations over storage cache", &["operation"] ).unwrap(); + + pub static ref CACHED_FILE_IN_MEM: IntGauge = register_int_gauge!( + "tikv_import_apply_cached_bytes", + "The files cached by the apply requests of importer." + ).unwrap(); + pub static ref CACHE_EVENT: IntCounterVec = register_int_counter_vec!( + "tikv_import_apply_cache_event", + "The events of caching. event = {add, remove, out-of-quota}", + &["type"] + ).unwrap(); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 384a48e96a8..42a96e21652 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -9,12 +9,12 @@ use std::{ path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, - Arc, + Arc, Condvar, Mutex, }, time::Duration, }; -use dashmap::DashMap; +use dashmap::{mapref::entry::Entry, DashMap}; use encryption::{to_engine_encryption_method, DataKeyManager}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ @@ -45,7 +45,7 @@ use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ - caching::cache_map::CacheMap, + caching::cache_map::{CacheMap, ShareOwned}, import_file::{ImportDir, ImportFile}, import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, @@ -53,6 +53,28 @@ use crate::{ util, Config, Error, Result, }; +pub struct LoadedFile { + permit: MemUsePermit, + content: Arc<[u8]>, +} + +impl std::fmt::Debug for LoadedFile { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("LoadedFileInner") + .field("permit", &self.permit) + .field("content.len()", &self.content.len()) + .finish() + } +} + +impl ShareOwned for LoadedFile { + type Shared = Arc<[u8]>; + + fn share_owned(&self) -> Self::Shared { + Arc::clone(&self.content) + } +} + #[derive(Default, Debug, Clone)] pub struct DownloadExt<'a> { cache_key: Option<&'a str>, @@ -71,17 +93,164 @@ impl<'a> DownloadExt<'a> { } } 
-#[derive(Clone, PartialEq, Debug)] +#[derive(Debug)] +struct MemUsePermit { + amount: u64, + statistic: Arc, +} + +impl Drop for MemUsePermit { + fn drop(&mut self) { + self.statistic.fetch_sub(self.amount, Ordering::SeqCst); + } +} + +#[derive(Clone, Debug)] pub enum CacheKvFile { - Mem(Arc>), + Mem(Remote), Fs(Arc), } +/// Remote presents a "remote" object which can be downloaded and then cached. +/// The remote object should generally implement the `ShareOwned` trait. +/// This structure doesn't manage how it is downloaded, it just manages the +/// state. You need to provide the manually downloaded data to the +/// [`DownloadPromise`]. +/// Below is the state transform of this: +/// ```text +/// DownloadPromise::fulfill +/// +-----------+ +-----------+ +/// |Downloading+-------->|Cached | +/// +--+--------+ +-----------+ +/// | ^ +/// | | +/// DownloadPromise | | Somebody takes +/// dropped | | over the duty. +/// v | +/// +--------+--+ +/// |Leaked | +/// +-----------+ +/// ``` +#[derive(Debug)] +pub struct Remote(Arc<(Mutex>, Condvar)>); + +impl Clone for Remote { + fn clone(&self) -> Self { + Self(Arc::clone(&self.0)) + } +} + +/// When holding this, the holder has promised to downloading the remote object +/// into local, then provide it to others waiting the object, by +/// [`Self::fulfill()`]. +pub struct DownloadPromise(Arc<(Mutex>, Condvar)>); + +impl DownloadPromise { + /// provide the downloaded data and make it cached. 
+ pub fn fulfill(self, item: T) -> Remote { + let mut l = self.0.as_ref().0.lock().unwrap(); + debug_assert!(matches!(*l, FileCacheInner::Downloading)); + *l = FileCacheInner::Cached(item); + self.0.as_ref().1.notify_all(); + drop(l); + Remote(Arc::clone(&self.0)) + } +} + +impl Drop for DownloadPromise { + fn drop(&mut self) { + let mut l = self.0.as_ref().0.lock().unwrap(); + if matches!(*l, FileCacheInner::Downloading) { + *l = FileCacheInner::Leaked; + self.0.as_ref().1.notify_one(); + } + } +} + +impl Remote { + /// create a downloading remote object. + /// it returns the handle to the remote object and a [`DownloadPromise`], + /// the latter can be used to fulfill the remote object. + /// + /// # Examples + /// ``` + /// # use sst_importer::sst_importer::Remote; + /// let (remote_obj, promise) = Remote::download(); + /// promise.fulfill(42); + /// assert_eq!(remote_obj.get(), Some(42)); + /// ``` + pub fn download() -> (Self, DownloadPromise) { + let inner = Arc::new((Mutex::new(FileCacheInner::Downloading), Condvar::new())); + (Self(Arc::clone(&inner)), DownloadPromise(inner)) + } + + /// Block and wait until the remote object is downloaded. + /// # Returns + /// If the remote object has been fulfilled, return `None`. + /// If the remote object hasn't been fulfilled, return a + /// [`DownloadPromise`]: it is time to take over the duty of downloading. 
+ /// + /// # Examples + /// ``` + /// # use sst_importer::sst_importer::Remote; + /// let (remote_obj, promise) = Remote::download(); + /// drop(promise); + /// let new_promise = remote_obj.wait_until_fill(); + /// new_promise + /// .expect("wait_until_fill should return new promise when old promise dropped") + /// .fulfill(42); + /// assert!(remote_obj.wait_until_fill().is_none()); + /// ``` + pub fn wait_until_fill(&self) -> Option> { + let mut l = self.0.as_ref().0.lock().unwrap(); + loop { + match *l { + FileCacheInner::Downloading => { + l = self.0.as_ref().1.wait(l).unwrap(); + } + FileCacheInner::Leaked => { + *l = FileCacheInner::Downloading; + return Some(DownloadPromise(Arc::clone(&self.0))); + } + FileCacheInner::Cached(_) => return None, + } + } + } +} + +impl Remote { + /// Fetch the internal object of the remote object. + pub fn get(&self) -> Option<::Shared> { + let l = self.0.as_ref().0.lock().unwrap(); + match *l { + FileCacheInner::Downloading | FileCacheInner::Leaked => None, + FileCacheInner::Cached(ref t) => Some(t.share_owned()), + } + } +} + +/// returns a error indices that we are going to panic in a invalid state. +/// (Rust panic information cannot be send to BR, hence client cannot know +/// what happens, so we pack it into a `Result`.) +fn bug(message: impl std::fmt::Display) -> Error { + Error::Io(std::io::Error::new( + std::io::ErrorKind::Other, + format!("BUG in TiKV: {}", message), + )) +} + +#[derive(Clone, Debug, PartialEq, Eq)] +enum FileCacheInner { + Downloading, + Leaked, + Cached(T), +} + impl CacheKvFile { // get the ref count of item. 
pub fn ref_count(&self) -> usize { match self { - CacheKvFile::Mem(buff) => Arc::strong_count(buff), + CacheKvFile::Mem(buff) => Arc::strong_count(&buff.0), CacheKvFile::Fs(path) => Arc::strong_count(path), } } @@ -109,7 +278,7 @@ pub struct SstImporter { cached_storage: CacheMap, download_rt: Runtime, file_locks: Arc>, - mem_use: AtomicU64, + mem_use: Arc, mem_limit: ReadableSize, } @@ -139,7 +308,7 @@ impl SstImporter { file_locks: Arc::new(DashMap::default()), cached_storage, download_rt, - mem_use: AtomicU64::new(0), + mem_use: Arc::new(AtomicU64::new(0)), mem_limit: ReadableSize(memory_limit as u64), }) } @@ -412,9 +581,10 @@ impl SstImporter { let mut need_retain = true; match c { CacheKvFile::Mem(buff) => { - let buflen = buff.len(); + let buflen = buff.get().map(|v| v.len()).unwrap_or_default(); // The term of recycle memeory is 60s. if c.ref_count() == 1 && c.is_expired(start) { + CACHE_EVENT.with_label_values(&["remove"]).inc(); need_retain = false; shrink_buff_size += buflen; } else { @@ -436,6 +606,8 @@ impl SstImporter { need_retain }); + CACHED_FILE_IN_MEM.set(self.mem_use.load(Ordering::SeqCst) as _); + if self.import_support_download() { let shrink_file_count = shrink_files.len(); if shrink_file_count > 0 || retain_file_count > 0 { @@ -452,7 +624,6 @@ impl SstImporter { if shrink_buff_size > 0 || retain_buff_size > 0 { info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => retain_buff_size); } - self.dec_mem(shrink_buff_size as _); shrink_buff_size } } @@ -463,23 +634,24 @@ impl SstImporter { self.mem_limit == ReadableSize(0) } - fn inc_mem_and_check(&self, meta: &KvMeta) -> bool { + fn request_memory(&self, meta: &KvMeta) -> Option { let size = meta.get_length(); let old = self.mem_use.fetch_add(size, Ordering::SeqCst); // If the memory is limited, roll backup the mem_use and return false. 
if old + size > self.mem_limit.0 { self.mem_use.fetch_sub(size, Ordering::SeqCst); - false + CACHE_EVENT.with_label_values(&["out-of-quota"]).inc(); + None } else { - true + CACHE_EVENT.with_label_values(&["add"]).inc(); + Some(MemUsePermit { + amount: size, + statistic: Arc::clone(&self.mem_use), + }) } } - fn dec_mem(&self, size: u64) { - self.mem_use.fetch_sub(size, Ordering::SeqCst); - } - pub fn do_read_kv_file( &self, meta: &KvMeta, @@ -490,21 +662,41 @@ impl SstImporter { let start = Instant::now(); let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); - let mut lock = self - .file_locks - .entry(dst_name) - .or_insert((CacheKvFile::Mem(Arc::default()), Instant::now())); - - if let CacheKvFile::Mem(buff) = &lock.0 { - if !buff.is_empty() { - lock.1 = Instant::now(); - return Ok(lock.0.clone()); + let promise = { + let lock = self.file_locks.entry(dst_name); + IMPORTER_APPLY_DURATION + .with_label_values(&["download-get-lock"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + match lock { + Entry::Occupied(mut ent) => match ent.get_mut() { + (CacheKvFile::Mem(buff), last_used) => { + *last_used = Instant::now(); + match buff.wait_until_fill() { + Some(handle) => handle, + None => return Ok(ent.get().0.clone()), + } + } + _ => { + return Err(bug(concat!( + "using both read-to-memory and download-to-file is unacceptable for now.", + "(If you think it is possible in the future you are reading this, ", + "please change this line to `return item.get.0.clone()`)", + "(Please also check the state transform is OK too.)", + ))); + } + }, + Entry::Vacant(ent) => { + let (cache, handle) = Remote::download(); + ent.insert((CacheKvFile::Mem(cache), Instant::now())); + handle + } } - } + }; - if !self.inc_mem_and_check(meta) { - return Err(Error::ResourceNotEnough(String::from("memory is limited"))); - } + let permit = self + .request_memory(meta) + .ok_or_else(|| Error::ResourceNotEnough(String::from("memory is limited")))?; let 
expected_sha256 = { let sha256 = meta.get_sha256().to_vec(); @@ -544,8 +736,10 @@ impl SstImporter { .observe(start.saturating_elapsed().as_secs_f64()); let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; - *lock = (CacheKvFile::Mem(Arc::new(rewrite_buff)), Instant::now()); - Ok(lock.0.clone()) + Ok(CacheKvFile::Mem(promise.fulfill(LoadedFile { + content: Arc::from(rewrite_buff.into_boxed_slice()), + permit, + }))) } pub fn wrap_kms( @@ -619,7 +813,7 @@ impl SstImporter { ext_storage: Arc, backend: &StorageBackend, speed_limiter: &Limiter, - ) -> Result>> { + ) -> Result> { let c = if self.import_support_download() { self.do_download_kv_file(meta, backend, speed_limiter)? } else { @@ -627,7 +821,7 @@ impl SstImporter { }; match c { // If cache memroy, it has been rewrite, return buffer directly. - CacheKvFile::Mem(buff) => Ok(buff), + CacheKvFile::Mem(buff) => buff.get().ok_or_else(|| bug("invalid cache state")), // If cache file name, it need to read and rewrite. CacheKvFile::Fs(path) => { let file = File::open(path.as_ref())?; @@ -636,7 +830,7 @@ impl SstImporter { reader.read_to_end(&mut buffer)?; let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; - Ok(Arc::new(rewrite_buff)) + Ok(Arc::from(rewrite_buff.into_boxed_slice())) } } } @@ -774,10 +968,10 @@ impl SstImporter { end_key: &[u8], start_ts: u64, restore_ts: u64, - file_buff: Arc>, + file_buff: Arc<[u8]>, mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { - let mut event_iter = EventIterator::new(file_buff.as_slice()); + let mut event_iter = EventIterator::new(file_buff.as_ref()); let mut smallest_key = None; let mut largest_key = None; let mut total_key = 0; @@ -1245,7 +1439,10 @@ mod tests { use tempfile::Builder; use test_sst_importer::*; use test_util::new_test_key_manager; - use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io}; + use tikv_util::{ + codec::stream_event::EventEncoder, stream::block_on_external_io, + sys::thread::StdThreadBuildWrapper, + 
}; use txn_types::{Value, WriteType}; use uuid::Uuid; @@ -1781,7 +1978,11 @@ mod tests { ) .unwrap(); - assert_eq!(CacheKvFile::Mem(Arc::new(buff.clone())), output); + assert!( + matches!(output.clone(), CacheKvFile::Mem(rc) if &*rc.get().unwrap() == buff.as_slice()), + "{:?}", + output + ); // Do not shrint nothing. let shrink_size = importer.shrink_by_tick(); @@ -2794,11 +2995,11 @@ mod tests { length: 100, ..Default::default() }; - let check = importer.inc_mem_and_check(&meta); - assert!(check); + let check = importer.request_memory(&meta); + assert!(check.is_some()); assert_eq!(importer.mem_use.load(Ordering::SeqCst), meta.get_length()); - importer.dec_mem(meta.get_length()); + drop(check); assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); // test inc_mem_and_check() failed. @@ -2806,8 +3007,8 @@ mod tests { length: u64::MAX, ..Default::default() }; - let check = importer.inc_mem_and_check(&meta); - assert!(!check); + let check = importer.request_memory(&meta); + assert!(check.is_none()); } #[test] @@ -2817,7 +3018,8 @@ mod tests { SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); let key = "file1"; - let value = (CacheKvFile::Mem(Arc::default()), Instant::now()); + let (r, _) = Remote::download(); + let value = (CacheKvFile::Mem(r), Instant::now()); let lock = importer.file_locks.entry(key.to_string()).or_insert(value); // test locked by try_entry() @@ -2834,4 +3036,53 @@ mod tests { let _buff = v.0.clone(); assert_eq!(v.0.ref_count(), 2); } + + #[test] + fn test_remote_waiting() { + let (r, dl) = Remote::download(); + let r2 = r.clone(); + let js = (0..2) + .map(|_| { + let r = r.clone(); + std::thread::spawn(move || { + assert!(r.wait_until_fill().is_none()); + r.get() + }) + }) + .collect::>(); + dl.fulfill(42); + for j in js { + assert!(matches!(j.join(), Ok(Some(42)))); + } + assert_eq!(r2.get(), Some(42)); + } + + #[test] + fn test_remote_drop_in_one_thread() { + let (r, dl) = Remote::download(); + drop(dl); + let p = 
r.wait_until_fill(); + assert!(p.is_some()); + p.unwrap().fulfill("Kitty"); + assert_eq!(r.get(), Some("Kitty")); + } + + #[test] + fn test_remote_take_duty() { + let (r, dl) = Remote::download(); + let js = (0..4).map(|i| { + let r = r.clone(); + std::thread::Builder::new() + .name(format!("rd-{}", i)) + .spawn_wrapper(move || match r.wait_until_fill() { + Some(x) => x.fulfill(42).get(), + None => r.get(), + }) + .unwrap() + }); + drop(dl); + for j in js { + assert!(matches!(j.join(), Ok(Some(42)))); + } + } } From ff0a79bfa4c7f4a806d7e52ca107a6244afd0102 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 15 Feb 2023 12:08:02 +0800 Subject: [PATCH 527/676] raftstore-v2: fix wrong peer cache (#14212) close tikv/tikv#14211 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../tests/integrations/test_conf_change.rs | 39 +++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index a88df2245cc..fea60049b93 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -250,7 +250,7 @@ impl Peer { if self.is_leader() && from_peer.get_id() != INVALID_ID { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } - self.insert_peer_cache(msg.take_from_peer()); + self.insert_peer_cache(from_peer); let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 7ea49c02a6b..c1c7861fd54 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -6,14 +6,14 
@@ use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; use futures::executor::block_on; use kvproto::{ raft_cmdpb::{AdminCmdType, RaftCmdRequest}, - raft_serverpb::PeerState, + raft_serverpb::{PeerState, RaftMessage}, }; -use raft::prelude::ConfChangeType; +use raft::prelude::{ConfChangeType, MessageType}; use raftstore_v2::{ router::{PeerMsg, PeerTick}, SimpleWriteEncoder, }; -use tikv_util::store::new_learner_peer; +use tikv_util::store::{new_learner_peer, new_peer}; use crate::cluster::{check_skip_wal, Cluster}; @@ -199,3 +199,36 @@ fn remove_peer(cluster: &Cluster, offset_id: usize, region_id: u64, peer_id: u64 assert_eq!(region_state.get_state(), PeerState::Tombstone); assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); } + +/// The peer should be able to respond an unknown sender, otherwise the +/// liveness of configuration change can't be guaranteed. +#[test] +fn test_unknown_peer() { + let cluster = Cluster::with_node_count(1, None); + + let router = &cluster.routers[0]; + let header = router.new_request_for(2).take_header(); + + // Create a fake message to see whether it's responded. + let from_peer = new_peer(10, 10); + let mut msg = Box::::default(); + msg.set_region_id(2); + msg.set_to_peer(header.get_peer().clone()); + msg.set_region_epoch(header.get_region_epoch().clone()); + msg.set_from_peer(from_peer.clone()); + let raft_message = msg.mut_message(); + raft_message.set_msg_type(raft::prelude::MessageType::MsgHeartbeat); + raft_message.set_from(10); + raft_message.set_term(10); + + router.send_raft_message(msg).unwrap(); + router.wait_flush(2, Duration::from_secs(3)); + // If peer cache is updated correctly, it should be able to respond. 
+ let msg = cluster.receiver(0).try_recv().unwrap(); + assert_eq!(*msg.get_to_peer(), from_peer); + assert_eq!(msg.get_from_peer(), header.get_peer()); + assert_eq!( + msg.get_message().get_msg_type(), + MessageType::MsgHeartbeatResponse + ); +} From c072027294b5f0ffa7e1b47c181d4d4bc40df958 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 15 Feb 2023 15:40:02 +0800 Subject: [PATCH 528/676] limit should await (#14222) close tikv/tikv#14221 Signed-off-by: bufferflies <1045931706@qq.com> --- src/server/tablet_snap.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 0fc836f36c2..a54c5461e0d 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -60,7 +60,7 @@ impl RecvTabletSnapContext { } let chunk_size = match head.take_data().try_into() { - Ok(buff) => usize::from_ne_bytes(buff), + Ok(buff) => usize::from_le_bytes(buff), Err(_) => return Err(box_err!("failed to get chunk size")), }; let meta = head.take_message(); @@ -119,7 +119,7 @@ async fn send_snap_files( let mut total_sent = msg.compute_size() as u64; let mut chunk = SnapshotChunk::default(); chunk.set_message(msg); - chunk.set_data(usize::to_ne_bytes(SNAP_CHUNK_LEN).to_vec()); + chunk.set_data(usize::to_le_bytes(SNAP_CHUNK_LEN).to_vec()); sender .feed((chunk, WriteFlags::default().buffer_hint(true))) .await?; @@ -145,7 +145,7 @@ async fn send_snap_files( } off += readed; } - limiter.consume(off); + limiter.consume(off).await; total_sent += off as u64; let mut chunk = SnapshotChunk::default(); chunk.set_data(buffer); @@ -260,7 +260,7 @@ async fn recv_snap_files( None => return Err(box_err!("missing chunk")), }; f.write_all(&chunk[..])?; - limit.consume(chunk.len()); + limit.consume(chunk.len()).await; size += chunk.len(); } debug!("received snap file"; "file" => %p.display(), "size" => size); @@ -479,7 +479,7 @@ pub fn copy_tablet_snapshot( let mut head = SnapshotChunk::default(); 
head.set_message(msg); - head.set_data(usize::to_ne_bytes(SNAP_CHUNK_LEN).to_vec()); + head.set_data(usize::to_le_bytes(SNAP_CHUNK_LEN).to_vec()); let recv_context = RecvTabletSnapContext::new(head)?; let recv_path = recver_snap_mgr.tmp_recv_path(&recv_context.key); From 13c2e545a475d776a1202d9baf4029710c9043ca Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 15 Feb 2023 21:56:01 +0800 Subject: [PATCH 529/676] causal_ts: simplify the logic of pop ts (#14227) ref tikv/tikv#12794 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- components/causal_ts/src/tso.rs | 39 ++++++++++++++------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index e63c3c2c3ba..2c99d8c068a 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -75,30 +75,25 @@ const MAX_TSO_BATCH_LIST_CAPACITY: u32 = 1024; /// TSO range: [(physical, logical_start), (physical, logical_end)) #[derive(Debug)] struct TsoBatch { - size: u32, physical: u64, + logical_start: u64, logical_end: u64, // exclusive - logical_start: AtomicU64, + // current valid logical_tso offset, alloc_offset >= logical_end means + // the batch is exhausted. + alloc_offset: AtomicU64, } impl TsoBatch { pub fn pop(&self) -> Option<(TimeStamp, bool /* is_used_up */)> { - let mut logical = self.logical_start.load(Ordering::Relaxed); - while logical < self.logical_end { - match self.logical_start.compare_exchange_weak( - logical, - logical + 1, - Ordering::Relaxed, - Ordering::Relaxed, - ) { - Ok(_) => { - return Some(( - TimeStamp::compose(self.physical, logical), - logical + 1 == self.logical_end, - )); - } - Err(x) => logical = x, - } + // alloc_offset might be far bigger than logical_end if the concurrency is + // *very* high, but it won't overflow in practice, so no need to do an + // extra load check here. 
+ let ts = self.alloc_offset.fetch_add(1, Ordering::Relaxed); + if ts < self.logical_end { + return Some(( + TimeStamp::compose(self.physical, ts), + ts + 1 == self.logical_end, + )); } None } @@ -109,22 +104,22 @@ impl TsoBatch { let logical_start = logical_end.checked_sub(batch_size as u64).unwrap(); Self { - size: batch_size, physical, + logical_start, logical_end, - logical_start: AtomicU64::new(logical_start), + alloc_offset: AtomicU64::new(logical_start), } } /// Number of remaining (available) TSO in the batch. pub fn remain(&self) -> u32 { self.logical_end - .saturating_sub(self.logical_start.load(Ordering::Relaxed)) as u32 + .saturating_sub(self.alloc_offset.load(Ordering::Relaxed)) as u32 } /// The original start timestamp in the batch. pub fn original_start(&self) -> TimeStamp { - TimeStamp::compose(self.physical, self.logical_end - self.size as u64) + TimeStamp::compose(self.physical, self.logical_start) } /// The excluded end timestamp after the last in batch. From 3df8a7a8aed9dc34498cc105af9b730bac336b18 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 16 Feb 2023 10:58:01 +0800 Subject: [PATCH 530/676] Raftstore-v2: snap status should be cleared when sending snapshot failed (#14230) ref tikv/tikv#12842 snap status should be cleared when sending snapshot failed Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- .../src/operation/command/admin/mod.rs | 8 ++++- .../raftstore-v2/src/operation/ready/mod.rs | 6 +++- components/test_raftstore-v2/src/util.rs | 10 +++--- tests/integrations/raftstore/test_snap.rs | 32 ++++++++----------- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index ca91e597cb9..0661d1c15dc 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -69,10 +69,16 @@ 
impl Peer { return; } + let pre_transfer_leader = cmd_type == AdminCmdType::TransferLeader + && !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL); + // The admin request is rejected because it may need to update epoch checker // which introduces an uncertainty and may breaks the correctness of epoch // checker. - if !self.applied_to_current_term() { + // As pre transfer leader is just a warmup phase, applying to the current term + // is not required. + if !self.applied_to_current_term() && !pre_transfer_leader { let e = box_err!( "{} peer has not applied to current term, applied_term {}, current_term {}", SlogFormat(&self.logger), diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index fea60049b93..943e3b6ba2f 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -30,7 +30,7 @@ use kvproto::{ raft_serverpb::{ExtraMessageType, RaftMessage}, }; use protobuf::Message as _; -use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; +use raft::{eraftpb, prelude::MessageType, Ready, SnapshotStatus, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ @@ -362,6 +362,10 @@ impl Peer { ); // unreachable store self.raft_group_mut().report_unreachable(to_peer_id); + if msg_type == eraftpb::MessageType::MsgSnapshot { + self.raft_group_mut() + .report_snapshot(to_peer_id, SnapshotStatus::Failure); + } ctx.raft_metrics.send_message.add(msg_type, false); } } diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 2bd9444b002..2f512982019 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -124,12 +124,12 @@ pub fn put_cf_till_size( key.into_bytes() } -pub fn configure_for_snapshot(cluster: &mut Cluster) { +pub fn 
configure_for_snapshot(config: &mut Config) { // Truncate the log quickly so that we can force sending snapshot. - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(2); - cluster.cfg.raft_store.merge_max_log_gap = 1; - cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); + config.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); + config.raft_store.raft_log_gc_count_limit = Some(2); + config.raft_store.merge_max_log_gap = 1; + config.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); } pub fn configure_for_lease_read_v2( diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index d18f42ec8ca..ddc4bb50406 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -21,6 +21,7 @@ use raftstore::{store::*, Result}; use rand::Rng; use security::SecurityManager; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::server::snap::send_snap; use tikv_util::{config::*, time::Instant, HandyRwLock}; @@ -268,7 +269,12 @@ fn test_server_concurrent_snap() { test_concurrent_snap(&mut cluster); } -fn test_cf_snapshot(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_cf_snapshot() { + let mut cluster = new_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); cluster.run(); @@ -306,18 +312,6 @@ fn test_cf_snapshot(cluster: &mut Cluster) { must_get_cf_equal(&engine1, cf, b"k3", b"v3"); } -#[test] -fn test_node_cf_snapshot() { - let mut cluster = new_node_cluster(0, 3); - test_cf_snapshot(&mut cluster); -} - -#[test] -fn test_server_snapshot() { - let mut cluster = new_server_cluster(0, 3); - test_cf_snapshot(&mut cluster); -} - // replace 
content of all the snapshots with the first snapshot it received. #[derive(Clone)] struct StaleSnap { @@ -661,11 +655,12 @@ fn random_long_vec(length: usize) -> Vec { /// Snapshot is generated using apply term from apply thread, which should be /// set correctly otherwise lead to inconsistency. -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_correct_snapshot_term() { // Use five replicas so leader can send a snapshot to a new peer without // committing extra logs. - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); let pd_client = cluster.pd_client.clone(); pd_client.disable_default_operator(); @@ -714,9 +709,10 @@ fn test_correct_snapshot_term() { } /// Test when applying a snapshot, old logs should be cleaned up. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_snapshot_clean_up_logs_with_log_gc() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(50); cluster.cfg.raft_store.raft_log_gc_threshold = 50; // Speed up log gc. @@ -739,7 +735,7 @@ fn test_snapshot_clean_up_logs_with_log_gc() { // Peer (4, 4) must become leader at the end and send snapshot to 2. must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); - let raft_engine = cluster.engines[&2].raft.clone(); + let raft_engine = cluster.get_raft_engine(2); let mut dest = vec![]; raft_engine.get_all_entries_to(1, &mut dest).unwrap(); // No new log is proposed, so there should be no log at all. 
From c6aa5e35cd00dcd62262f9dd4531ac44e5bf8bd1 Mon Sep 17 00:00:00 2001 From: woofyzhao <490467089@qq.com> Date: Thu, 16 Feb 2023 16:40:02 +0800 Subject: [PATCH 531/676] Revise CONTRIBUTING.md contents (#14032) close tikv/tikv#14031 Signed-off-by: woofyzhao <490467089@qq.com> Co-authored-by: Ti Chi Robot --- CONTRIBUTING.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 711b2bdb192..41b2ef7a528 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,6 +19,7 @@ To build TiKV you'll need to at least have the following installed: * `make` - Build tool (run common workflows) * `cmake` - Build tool (required for gRPC) * `awk` - Pattern scanning/processing language +* [`protoc`](https://github.com/protocolbuffers/protobuf/releases) - Google protocol buffer compiler * C++ compiler - gcc 5+ (required for gRPC) If you are targeting platforms other than x86_64/aarch64 Linux or macOS, you'll also need: @@ -92,7 +93,7 @@ make format make clippy ``` -See the [style doc](https://github.com/rust-lang/rfcs/blob/master/style-guide/README.md) and the [API guidelines](https://rust-lang-nursery.github.io/api-guidelines/) for details on the conventions. +See the [style doc](https://github.com/rust-lang/fmt-rfcs/blob/master/guide/guide.md) and the [API guidelines](https://rust-lang-nursery.github.io/api-guidelines/) for details on the conventions. Please follow this style to make TiKV easy to review, maintain, and develop. @@ -115,13 +116,13 @@ To run TiKV as an actual key-value store, you will need to run it as a cluster ( Use [PD](https://github.com/tikv/pd) to manage the cluster (even if just one node on a single machine). -Instructions are in our [docs](https://tikv.org/docs/dev/tasks/deploy/binary/) (if you build TiKV from source, you could skip `1. Download package` and `tikv-server` is in directory `/target`). 
+Instructions are in our [docs](https://tikv.org/docs/latest/deploy/install/test/#install-binary-manually) (if you build TiKV from source, you could skip `1. Download package` and `tikv-server` is in directory `/target`). Tips: It's recommended to increase the open file limit above 82920. WSL2 users may refer to [the comment](https://github.com/Microsoft/WSL/issues/1688#issuecomment-532767317) if having difficulty in changing the `ulimit`. ### Configuration -Read our configuration guide to learn about various [configuration options](https://tikv.org/docs/dev/tasks/configure/introduction/). There is also a [configuration template](./etc/config-template.toml). +Read our configuration guide to learn about various [configuration options](https://tikv.org/docs/latest/deploy/configure/introduction/). There is also a [configuration template](./etc/config-template.toml). ## Contribution flow @@ -133,7 +134,7 @@ This is a rough outline of what a contributor's workflow looks like: - Write code, add test cases, and commit your work (see below for message format). - Run tests and make sure all tests pass. - Push your changes to a branch in your fork of the repository and submit a pull request. - * Make sure mention the issue, which is created at step 1, in the commit meesage. + * Make sure to mention the issue, which is created at step 1, in the commit message. - Your PR will be reviewed and may be requested some changes. * Once you've made changes, your PR must be re-reviewed and approved. * If the PR becomes out of date, you can use GitHub's 'update branch' button. 
From bbe06e97e43272be41de3e61fe4118607ec8055a Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Thu, 16 Feb 2023 17:20:03 +0800 Subject: [PATCH 532/676] coprocessor: use mur3 to calculate fmsketch (#14204) ref tikv/tikv#14231 Signed-off-by: xuyifan <675434007@qq.com> --- Cargo.lock | 8 +- Cargo.toml | 2 +- src/coprocessor/statistics/analyze.rs | 129 ++++++++++++++++++++++--- src/coprocessor/statistics/cmsketch.rs | 7 +- src/coprocessor/statistics/fmsketch.rs | 11 +-- 5 files changed, 125 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c8cf54fd4a9..d87014110fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3209,10 +3209,10 @@ dependencies = [ ] [[package]] -name = "murmur3" -version = "0.5.1" +name = "mur3" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ead5388e485d38e622630c6b05afd3761a6701ff15c55b279ea5b31dcb62cff" +checksum = "97af489e1e21b68de4c390ecca6703318bc1aa16e9733bcb62c089b73c6fbb1b" [[package]] name = "native-tls" @@ -6392,7 +6392,7 @@ dependencies = [ "memory_trace_macros", "mime", "more-asserts", - "murmur3", + "mur3", "nom 5.1.0", "notify", "num-traits", diff --git a/Cargo.toml b/Cargo.toml index cf66773b576..509f9514b10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -118,7 +118,7 @@ match-template = "0.0.1" memory_trace_macros = { workspace = true } mime = "0.3.13" more-asserts = "0.2" -murmur3 = "0.5.1" +mur3 = "0.1" nom = { version = "5.1.0", default-features = false, features = ["std"] } notify = "4" num-traits = "0.2.14" diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 25ecf95653d..85e0281064e 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -1,10 +1,13 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{cmp::Reverse, collections::BinaryHeap, marker::PhantomData, mem, sync::Arc}; +use std::{ + cmp::Reverse, collections::BinaryHeap, hash::Hasher, marker::PhantomData, mem, sync::Arc, +}; use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; +use mur3::Hasher128; use protobuf::Message; use rand::{rngs::StdRng, Rng}; use tidb_query_common::storage::{ @@ -411,7 +414,7 @@ impl RowSampleBuilder { } else { // Only if the `decoded_val` is Datum::Null, `decoded_val` is a Ok(None). // So it is safe the unwrap the Ok value. - let decoded_sorted_val = TT::sort_key(&decoded_val.as_string()?.unwrap().into_owned())?; + let decoded_sorted_val = TT::sort_key(&decoded_val.as_string()?.unwrap())?; decoded_sorted_val } } @@ -488,7 +491,6 @@ struct BaseRowSampleCollector { fm_sketches: Vec, rng: StdRng, total_sizes: Vec, - row_buf: Vec, memory_usage: usize, reported_memory_usage: usize, } @@ -501,7 +503,6 @@ impl Default for BaseRowSampleCollector { fm_sketches: vec![], rng: StdRng::from_entropy(), total_sizes: vec![], - row_buf: Vec::new(), memory_usage: 0, reported_memory_usage: 0, } @@ -516,11 +517,11 @@ impl BaseRowSampleCollector { fm_sketches: vec![FmSketch::new(max_fm_sketch_size); col_and_group_len], rng: StdRng::from_entropy(), total_sizes: vec![0; col_and_group_len], - row_buf: Vec::new(), memory_usage: 0, reported_memory_usage: 0, } } + pub fn collect_column_group( &mut self, columns_val: &[Vec], @@ -530,7 +531,6 @@ impl BaseRowSampleCollector { ) { let col_len = columns_val.len(); for i in 0..column_groups.len() { - self.row_buf.clear(); let offsets = column_groups[i].get_column_offsets(); let mut has_null = true; for j in offsets { @@ -545,23 +545,31 @@ impl BaseRowSampleCollector { self.null_count[col_len + i] += 1; continue; } - // Use a in place murmur3 to replace this memory copy. 
- for j in offsets { - if columns_info[*j as usize].as_accessor().is_string_like() { - self.row_buf - .extend_from_slice(&collation_keys_val[*j as usize]); + if offsets.len() == 1 { + let offset = offsets[0] as usize; + if columns_info[offset].as_accessor().is_string_like() { + self.fm_sketches[col_len + i].insert(&collation_keys_val[offset]); } else { - self.row_buf.extend_from_slice(&columns_val[*j as usize]); + self.fm_sketches[col_len + i].insert(&columns_val[offset]); + } + } else { + let mut hasher = Hasher128::with_seed(0); + for j in offsets { + if columns_info[*j as usize].as_accessor().is_string_like() { + hasher.write(&collation_keys_val[*j as usize]); + } else { + hasher.write(&columns_val[*j as usize]); + } } + self.fm_sketches[col_len + i].insert_hash_value(hasher.finish()); } - self.fm_sketches[col_len + i].insert(&self.row_buf); } } pub fn collect_column( &mut self, columns_val: &[Vec], - collation_keys_val: Vec>, + collation_keys_val: &[Vec], columns_info: &[tipb::ColumnInfo], ) { for i in 0..columns_val.len() { @@ -659,7 +667,7 @@ impl RowSampleCollector for BernoulliRowSampleCollector { columns_info: &[tipb::ColumnInfo], ) { self.base - .collect_column(&columns_val, collation_keys_val, columns_info); + .collect_column(&columns_val, &collation_keys_val, columns_info); self.sampling(columns_val); } fn sampling(&mut self, data: Vec>) { @@ -736,7 +744,7 @@ impl RowSampleCollector for ReservoirRowSampleCollector { columns_info: &[tipb::ColumnInfo], ) { self.base - .collect_column(&columns_val, collation_keys_val, columns_info); + .collect_column(&columns_val, &collation_keys_val, columns_info); self.sampling(columns_val); } @@ -1357,3 +1365,92 @@ mod tests { } } } + +#[cfg(test)] +mod benches { + use tidb_query_datatype::{ + codec::{ + batch::LazyBatchColumn, + collation::{collator::CollatorUtf8Mb4Bin, Collator}, + }, + EvalType, FieldTypeTp, + }; + + use super::*; + + fn prepare_arguments() -> ( + Vec>, + Vec>, + Vec, + Vec, + ) { + let mut 
columns_info = Vec::new(); + for i in 1..4 { + let mut col_info = tipb::ColumnInfo::default(); + col_info.set_column_id(i as i64); + col_info.as_mut_accessor().set_tp(FieldTypeTp::VarChar); + col_info + .as_mut_accessor() + .set_collation(Collation::Utf8Mb4Bin); + columns_info.push(col_info); + } + let mut columns_slice = Vec::new(); + for _ in 0..3 { + let mut col = LazyBatchColumn::decoded_with_capacity_and_tp(1, EvalType::Bytes); + col.mut_decoded().push_bytes(Some(b"abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789abcdefghijklmnopqrstuvwxyz0123456789".to_vec())); + columns_slice.push(col) + } + let mut column_vals = Vec::new(); + let mut collation_key_vals = Vec::new(); + for i in 0..columns_info.len() { + let mut val = vec![]; + columns_slice[i] + .encode(0, &columns_info[i], &mut EvalContext::default(), &mut val) + .unwrap(); + if columns_info[i].as_accessor().is_string_like() { + let mut mut_val = &val[..]; + let decoded_val = table::decode_col_value( + &mut mut_val, + &mut EvalContext::default(), + &columns_info[i], + ) + .unwrap(); + let decoded_sorted_val = + CollatorUtf8Mb4Bin::sort_key(&decoded_val.as_string().unwrap().unwrap()) + .unwrap(); + collation_key_vals.push(decoded_sorted_val); + } else { + collation_key_vals.push(Vec::new()); + } + column_vals.push(val); + } + let mut column_group = tipb::AnalyzeColumnGroup::default(); + column_group.set_column_offsets(vec![0, 1, 2]); + column_group.set_prefix_lengths(vec![-1, -1, -1]); + let column_groups = vec![column_group]; + (column_vals, collation_key_vals, columns_info, column_groups) + } + + #[bench] + fn bench_collect_column(b: &mut test::Bencher) { + let mut collector = BaseRowSampleCollector::new(10000, 4); + let (column_vals, collation_key_vals, columns_info, _) = prepare_arguments(); + b.iter(|| { + collector.collect_column(&column_vals, &collation_key_vals, &columns_info); + }) + } + + #[bench] + fn 
bench_collect_column_group(b: &mut test::Bencher) { + let mut collector = BaseRowSampleCollector::new(10000, 4); + let (column_vals, collation_key_vals, columns_info, column_groups) = prepare_arguments(); + b.iter(|| { + collector.collect_column_group( + &column_vals, + &collation_key_vals, + &columns_info, + &column_groups, + ); + }) + } +} diff --git a/src/coprocessor/statistics/cmsketch.rs b/src/coprocessor/statistics/cmsketch.rs index 6a3042c8ee7..754a05b0bb2 100644 --- a/src/coprocessor/statistics/cmsketch.rs +++ b/src/coprocessor/statistics/cmsketch.rs @@ -1,6 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use murmur3::murmur3_x64_128; +use mur3::murmurhash3_x64_128; /// `CmSketch` is used to estimate point queries. /// Refer:[Count-Min Sketch](https://en.wikipedia.org/wiki/Count-min_sketch) @@ -30,9 +30,8 @@ impl CmSketch { } // `hash` hashes the data into two u64 using murmur hash. - fn hash(mut bytes: &[u8]) -> (u64, u64) { - let out = murmur3_x64_128(&mut bytes, 0).unwrap(); - (out as u64, (out >> 64) as u64) + fn hash(bytes: &[u8]) -> (u64, u64) { + murmurhash3_x64_128(bytes, 0) } // `insert` inserts the data into cm sketch. For each row i, the position at diff --git a/src/coprocessor/statistics/fmsketch.rs b/src/coprocessor/statistics/fmsketch.rs index b52559434c7..341223215f3 100644 --- a/src/coprocessor/statistics/fmsketch.rs +++ b/src/coprocessor/statistics/fmsketch.rs @@ -1,7 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use collections::HashSet; -use murmur3::murmur3_x64_128; +use mur3::murmurhash3_x64_128; /// `FmSketch` is used to count the approximate number of distinct /// elements in multiset. 
@@ -22,11 +22,8 @@ impl FmSketch { } } - pub fn insert(&mut self, mut bytes: &[u8]) { - let hash = { - let out = murmur3_x64_128(&mut bytes, 0).unwrap(); - out as u64 - }; + pub fn insert(&mut self, bytes: &[u8]) { + let hash = murmurhash3_x64_128(bytes, 0).0; self.insert_hash_value(hash); } @@ -38,7 +35,7 @@ impl FmSketch { proto } - fn insert_hash_value(&mut self, hash_val: u64) { + pub fn insert_hash_value(&mut self, hash_val: u64) { if (hash_val & self.mask) != 0 { return; } From 7a045008b28f7879b140a7960237d99d3a7d7381 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 16 Feb 2023 18:22:02 +0800 Subject: [PATCH 533/676] cdc: filter changes based on the range in request (#14213) close pingcap/tiflow#6346, close tikv/tikv#10073 cdc: filter changes based on the range in request Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/delegate.rs | 292 +++++++++++++++--- components/cdc/src/endpoint.rs | 23 +- components/cdc/src/initializer.rs | 24 +- components/cdc/src/service.rs | 26 +- components/cdc/tests/integrations/mod.rs | 2 + components/cdc/tests/integrations/test_cdc.rs | 133 ++++++++ 6 files changed, 444 insertions(+), 56 deletions(-) diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index 120806588dc..c4212c426be 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -130,6 +130,7 @@ pub struct Downstream { state: Arc>, kv_api: ChangeDataRequestKvApi, filter_loop: bool, + pub(crate) observed_range: ObservedRange, } impl Downstream { @@ -144,6 +145,7 @@ impl Downstream { conn_id: ConnId, kv_api: ChangeDataRequestKvApi, filter_loop: bool, + observed_range: ObservedRange, ) -> Downstream { Downstream { id: DownstreamId::new(), @@ -155,6 +157,7 @@ impl Downstream { state: Arc::new(AtomicCell::new(DownstreamState::default())), kv_api, filter_loop, + observed_range, } } @@ -395,6 +398,11 @@ impl Delegate { self.region_id, ); + // Check observed key range in region. 
+ for downstream in self.downstreams_mut() { + downstream.observed_range.update_region_key_range(®ion); + } + // Mark the delegate as initialized. let mut pending = self.pending.take().unwrap(); self.region = Some(region); @@ -479,6 +487,7 @@ impl Delegate { request_id: u64, entries: Vec>, filter_loop: bool, + observed_range: &ObservedRange, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; @@ -496,6 +505,9 @@ impl Delegate { lock, old_value, })) => { + if !observed_range.contains_encoded_key(&lock.0) { + continue; + } let l = Lock::parse(&lock.1).unwrap(); if decode_lock(lock.0, l, &mut row, &mut _has_value) { continue; @@ -509,6 +521,9 @@ impl Delegate { write, old_value, })) => { + if !observed_range.contains_encoded_key(&write.0) { + continue; + } if decode_write(write.0, &write.1, &mut row, &mut _has_value, false) { continue; } @@ -640,51 +655,25 @@ impl Delegate { self.region_id ); - let mut need_filter = false; - for ds in downstreams { - if ds.filter_loop { - need_filter = true; - break; - } - } - // collect the change event cause by user write, which is `txn_source` = 0. // for changefeed which only need the user write, send the `filtered`, or else, // send them all. 
- let filtered = if need_filter { - let filtered = entries - .iter() - .filter(|x| x.txn_source == 0) - .cloned() - .collect::>(); - if filtered.is_empty() { - None - } else { - Some(Event { - region_id: self.region_id, - index, - event: Some(Event_oneof_event::Entries(EventEntries { - entries: filtered.into(), - ..Default::default() - })), - ..Default::default() - }) + let mut filtered_entries = None; + for downstream in downstreams { + if downstream.filter_loop { + let filtered = entries + .iter() + .filter(|x| x.txn_source == 0) + .cloned() + .collect::>(); + if !filtered.is_empty() { + filtered_entries = Some(filtered); + } + break; } - } else { - None - }; - - let event_entries = EventEntries { - entries: entries.into(), - ..Default::default() - }; - let change_data_event = Event { - region_id: self.region_id, - index, - event: Some(Event_oneof_event::Entries(event_entries)), - ..Default::default() - }; + } + let region_id = self.region_id; let send = move |downstream: &Downstream| { // No ready downstream or a downstream that does not match the kv_api type, will // be ignored. There will be one region that contains both Txn & Raw entries. 
@@ -692,15 +681,30 @@ impl Delegate { if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { return Ok(()); } - if downstream.filter_loop && filtered.is_none() { + if downstream.filter_loop && filtered_entries.is_none() { return Ok(()); } - let event = if downstream.filter_loop { - filtered.clone().unwrap() + let entries_clone = if downstream.filter_loop { + downstream + .observed_range + .filter_entries(filtered_entries.clone().unwrap()) } else { - change_data_event.clone() + downstream.observed_range.filter_entries(entries.clone()) + }; + if entries_clone.is_empty() { + return Ok(()); + } + let event = Event { + region_id, + index, + event: Some(Event_oneof_event::Entries(EventEntries { + entries: entries_clone.into(), + ..Default::default() + })), + ..Default::default() }; + // Do not force send for real time change data events. let force_send = false; downstream.sink_event(event, force_send) @@ -1059,6 +1063,70 @@ fn decode_default(value: Vec, row: &mut EventRow, has_value: &mut bool) { *has_value = true; } +/// Observed key range. +#[derive(Clone, Default)] +pub struct ObservedRange { + start_key_encoded: Vec, + end_key_encoded: Vec, + start_key_raw: Vec, + end_key_raw: Vec, + pub(crate) all_key_covered: bool, +} + +impl ObservedRange { + pub fn new(start_key_encoded: Vec, end_key_encoded: Vec) -> Result { + let start_key_raw = Key::from_encoded(start_key_encoded.clone()) + .into_raw() + .map_err(|e| Error::Other(e.into()))?; + let end_key_raw = Key::from_encoded(end_key_encoded.clone()) + .into_raw() + .map_err(|e| Error::Other(e.into()))?; + Ok(ObservedRange { + start_key_encoded, + end_key_encoded, + start_key_raw, + end_key_raw, + all_key_covered: false, + }) + } + + #[allow(clippy::collapsible_if)] + pub fn update_region_key_range(&mut self, region: &Region) { + // Check observed key range in region. 
+ if self.start_key_encoded <= region.start_key { + if self.end_key_encoded.is_empty() + || (region.end_key <= self.end_key_encoded && !region.end_key.is_empty()) + { + // Observed range covers the region. + self.all_key_covered = true; + } + } + } + + fn is_key_in_range(&self, start_key: &[u8], end_key: &[u8], key: &[u8]) -> bool { + if self.all_key_covered { + return true; + } + if start_key <= key && (key < end_key || end_key.is_empty()) { + return true; + } + false + } + + pub fn contains_encoded_key(&self, key: &[u8]) -> bool { + self.is_key_in_range(&self.start_key_encoded, &self.end_key_encoded, key) + } + + pub fn filter_entries(&self, mut entries: Vec) -> Vec { + if self.all_key_covered { + return entries; + } + // Entry's key is in raw key format. + entries.retain(|e| self.is_key_in_range(&self.start_key_raw, &self.end_key_raw, &e.key)); + entries + } +} + #[cfg(test)] mod tests { use std::cell::Cell; @@ -1068,6 +1136,7 @@ mod tests { use kvproto::{errorpb::Error as ErrorHeader, metapb::Region}; use super::*; + use crate::channel::{channel, recv_timeout, MemoryQuota}; #[test] fn test_error() { @@ -1090,6 +1159,7 @@ mod tests { ConnId::new(), ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); downstream.set_sink(sink); let mut delegate = Delegate::new(region_id, Default::default()); @@ -1097,6 +1167,7 @@ mod tests { assert!(delegate.handle.is_observing()); let resolver = Resolver::new(region_id); assert!(delegate.on_region_ready(resolver, region).is_empty()); + assert!(delegate.downstreams()[0].observed_range.all_key_covered); let rx_wrap = Cell::new(Some(rx)); let receive_error = || { @@ -1214,6 +1285,7 @@ mod tests { ConnId::new(), ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ) }; @@ -1264,6 +1336,138 @@ mod tests { assert!(!delegate.handle.is_observing()); } + #[test] + fn test_observed_range() { + for case in vec![ + (b"".as_slice(), b"".as_slice(), false), + (b"a", b"", false), + (b"", b"b", false), + (b"a", b"b", 
true), + (b"a", b"bb", false), + (b"a", b"aa", true), + (b"aa", b"aaa", true), + ] { + let start_key = if !case.0.is_empty() { + Key::from_raw(case.0).into_encoded() + } else { + case.0.to_owned() + }; + let end_key = if !case.1.is_empty() { + Key::from_raw(case.1).into_encoded() + } else { + case.1.to_owned() + }; + let mut region = Region::default(); + region.start_key = start_key.to_owned(); + region.end_key = end_key.to_owned(); + + for k in 0..=0xff { + let mut observed_range = ObservedRange::default(); + observed_range.update_region_key_range(®ion); + assert!(observed_range.contains_encoded_key(&Key::from_raw(&[k]).into_encoded())); + } + let mut observed_range = ObservedRange::new( + Key::from_raw(b"a").into_encoded(), + Key::from_raw(b"b").into_encoded(), + ) + .unwrap(); + observed_range.update_region_key_range(®ion); + assert_eq!(observed_range.all_key_covered, case.2, "{:?}", case); + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"a").into_encoded()), + "{:?}", + case + ); + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"ab").into_encoded()), + "{:?}", + case + ); + if observed_range.all_key_covered { + assert!( + observed_range.contains_encoded_key(&Key::from_raw(b"b").into_encoded()), + "{:?}", + case + ); + } else { + assert!( + !observed_range.contains_encoded_key(&Key::from_raw(b"b").into_encoded()), + "{:?}", + case + ); + } + } + } + + #[test] + fn test_downstream_filter_entires() { + // Create a new delegate that observes [b, d). 
+ let observed_range = ObservedRange::new( + Key::from_raw(b"b").into_encoded(), + Key::from_raw(b"d").into_encoded(), + ) + .unwrap(); + let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); + let mut delegate = Delegate::new(1, txn_extra_op); + assert!(delegate.handle.is_observing()); + + let mut map = HashMap::default(); + for k in b'a'..=b'e' { + let mut put = PutRequest::default(); + put.key = Key::from_raw(&[k]).into_encoded(); + put.cf = "lock".to_owned(); + put.value = Lock::new( + LockType::Put, + put.key.clone(), + 1.into(), + 10, + None, + TimeStamp::zero(), + 0, + TimeStamp::zero(), + ) + .to_bytes(); + delegate + .sink_txn_put( + put, + false, + &mut map, + |_: &mut EventRow, _: TimeStamp| Ok(()), + ) + .unwrap(); + } + assert_eq!(map.len(), 5); + + let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let downstream = Downstream { + id: DownstreamId::new(), + req_id: 1, + conn_id: ConnId::new(), + peer: String::new(), + region_epoch: RegionEpoch::default(), + sink: Some(sink), + state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + kv_api: ChangeDataRequestKvApi::TiDb, + filter_loop: false, + observed_range, + }; + delegate.add_downstream(downstream); + let entries = map.values().map(|(r, _)| r).cloned().collect(); + delegate + .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .unwrap(); + + let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.spawn(async move { + drain.forward(&mut tx).await.unwrap(); + }); + let (e, _) = recv_timeout(&mut rx, std::time::Duration::from_secs(5)) + .unwrap() + .unwrap(); + assert_eq!(e.events[0].get_entries().get_entries().len(), 2, "{:?}", e); + } + #[test] fn test_decode_rawkv() { let cases = vec![ diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 2b4eb9ff226..b5e15ceee23 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -727,11 +727,12 @@ 
impl, E: KvEngine> Endpoint { }; let change_cmd = ChangeObserver::from_cdc(region_id, delegate.handle.clone()); - + let observed_range = downstream_.observed_range; let region_epoch = request.take_region_epoch(); let mut init = Initializer { engine: self.engine.clone(), sched, + observed_range, region_id, region_epoch, conn_id, @@ -1275,7 +1276,7 @@ mod tests { }; use super::*; - use crate::{channel, recv_timeout}; + use crate::{channel, delegate::ObservedRange, recv_timeout}; struct TestEndpointSuite { // The order must ensure `endpoint` be dropped before other fields. @@ -1426,6 +1427,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::RawKv, false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::RawKv); suite.run(Task::Register { @@ -1462,6 +1464,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TxnKv, false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1499,6 +1502,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TxnKv, false, + ObservedRange::default(), ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1678,6 +1682,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1725,6 +1730,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); // Enable batch resolved ts in the test. 
let version = FeatureGate::batch_resolved_ts(); @@ -1748,6 +1754,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), @@ -1785,6 +1792,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1830,6 +1838,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.add_local_reader(100); suite.run(Task::Register { @@ -1862,6 +1871,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -1938,6 +1948,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); // Enable batch resolved ts in the test. @@ -1975,6 +1986,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(2, 100); @@ -2021,6 +2033,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(3, 100); @@ -2092,6 +2105,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); let downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2135,6 +2149,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); let new_downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2187,6 +2202,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req, @@ -2242,6 +2258,7 @@ mod tests { conn_id, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); downstream.get_state().store(DownstreamState::Normal); suite.run(Task::Register { @@ -2360,6 +2377,7 @@ 
mod tests { conn_id_a, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), @@ -2384,6 +2402,7 @@ mod tests { conn_id_b, ChangeDataRequestKvApi::TiDb, false, + ObservedRange::default(), ); suite.run(Task::Register { request: req.clone(), diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 38c8603900e..68850ac55ac 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -47,7 +47,7 @@ use txn_types::{Key, KvPair, Lock, LockType, OldValue, TimeStamp}; use crate::{ channel::CdcEvent, - delegate::{post_init_downstream, Delegate, DownstreamId, DownstreamState}, + delegate::{post_init_downstream, Delegate, DownstreamId, DownstreamState, ObservedRange}, endpoint::Deregister, metrics::*, old_value::{near_seek_old_value, new_old_value_cursor, OldValueCursors}, @@ -79,6 +79,7 @@ pub(crate) struct Initializer { pub(crate) sched: Scheduler, pub(crate) sink: crate::channel::Sink, + pub(crate) observed_range: ObservedRange, pub(crate) region_id: u64, pub(crate) region_epoch: RegionEpoch, pub(crate) observe_id: ObserveId, @@ -206,10 +207,12 @@ impl Initializer { let region_id = region.get_id(); let observe_id = self.observe_id; let kv_api = self.kv_api; + self.observed_range.update_region_key_range(®ion); debug!("cdc async incremental scan"; "region_id" => region_id, "downstream_id" => ?downstream_id, "observe_id" => ?self.observe_id, + "all_key_covered" => ?self.observed_range.all_key_covered, "start_key" => log_wrappers::Value::key(snap.lower_bound().unwrap_or_default()), "end_key" => log_wrappers::Value::key(snap.upper_bound().unwrap_or_default())); @@ -432,6 +435,7 @@ impl Initializer { self.request_id, entries, self.filter_loop, + &self.observed_range, )?; if done { let (cb, fut) = tikv_util::future::paired_future_callback(); @@ -641,7 +645,7 @@ mod tests { }), sched: receiver_worker.scheduler(), sink, - + observed_range: 
ObservedRange::default(), region_id: 1, region_epoch: RegionEpoch::default(), observe_id: ObserveId::new(), @@ -668,6 +672,12 @@ mod tests { let mut expected_locks = BTreeMap::>>::new(); + // Only observe ["", "b\0x90"] + let observed_range = ObservedRange::new( + Key::from_raw(&[]).into_encoded(), + Key::from_raw(&[b'k', 90]).into_encoded(), + ) + .unwrap(); let mut total_bytes = 0; // Pessimistic locks should not be tracked for i in 0..10 { @@ -700,6 +710,7 @@ mod tests { ChangeDataRequestKvApi::TiDb, false, ); + initializer.observed_range = observed_range.clone(); let check_result = || loop { let task = rx.recv().unwrap(); match task { @@ -713,7 +724,14 @@ mod tests { // To not block test by barrier. pool.spawn(async move { let mut d = drain.drain(); - while d.next().await.is_some() {} + while let Some((e, _)) = d.next().await { + if let CdcEvent::Event(e) = e { + for e in e.get_entries().get_entries() { + let key = Key::from_raw(&e.key).into_encoded(); + assert!(observed_range.contains_encoded_key(&key), "{:?}", e); + } + } + } }); block_on(initializer.async_incremental_scan(snap.clone(), region.clone())).unwrap(); diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index f9665283c45..215f2cdebca 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -26,7 +26,7 @@ use tikv_util::{error, info, warn, worker::*}; use crate::{ channel::{channel, MemoryQuota, Sink, CDC_CHANNLE_CAPACITY}, - delegate::{Downstream, DownstreamId, DownstreamState}, + delegate::{Downstream, DownstreamId, DownstreamState, ObservedRange}, endpoint::{Deregister, Task}, }; @@ -207,7 +207,7 @@ impl ChangeData for Service { let (event_sink, mut event_drain) = channel(CDC_CHANNLE_CAPACITY, self.memory_quota.clone()); let peer = ctx.peer(); - let conn = Conn::new(event_sink, peer); + let conn = Conn::new(event_sink, peer.clone()); let conn_id = conn.get_id(); if let Err(status) = self @@ -217,11 +217,12 @@ impl ChangeData for Service { 
RpcStatus::with_message(RpcStatusCode::INVALID_ARGUMENT, format!("{:?}", e)) }) { - error!("cdc connection initiate failed"; "error" => ?status); - ctx.spawn( - sink.fail(status) - .unwrap_or_else(|e| error!("cdc failed to send error"; "error" => ?e)), - ); + error!("cdc connection initiate failed"; + "downstream" => ?peer, "error" => ?status); + ctx.spawn(sink.fail(status).unwrap_or_else(move |e| { + error!("cdc failed to send error"; + "downstream" => ?peer, "error" => ?e) + })); return; } @@ -236,10 +237,20 @@ impl ChangeData for Service { Err(e) => { warn!("empty or invalid TiCDC version, please upgrading TiCDC"; "version" => request.get_header().get_ticdc_version(), + "downstream" => ?peer, "error" => ?e); semver::Version::new(0, 0, 0) } }; + let observed_range = + match ObservedRange::new(request.start_key.clone(), request.end_key.clone()) { + Ok(observed_range) => observed_range, + Err(e) => { + warn!("cdc invalid observed start key or end key version"; + "downstream" => ?peer, "error" => ?e); + ObservedRange::default() + } + }; let downstream = Downstream::new( peer.clone(), region_epoch, @@ -247,6 +258,7 @@ impl ChangeData for Service { conn_id, req_kvapi, request.filter_loop, + observed_range, ); let ret = scheduler .schedule(Task::Register { diff --git a/components/cdc/tests/integrations/mod.rs b/components/cdc/tests/integrations/mod.rs index 821e4ad186e..c60a1fe8cb9 100644 --- a/components/cdc/tests/integrations/mod.rs +++ b/components/cdc/tests/integrations/mod.rs @@ -1,5 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+#![feature(assert_matches)] + mod test_cdc; mod test_flow_control; diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 9de1a77a8ad..3e5345e51f8 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -2597,3 +2597,136 @@ fn test_flashback() { } } } + +#[test] +fn test_cdc_filter_key_range() { + let mut suite = TestSuite::new(1, ApiVersion::V1); + + let req = suite.new_changedata_request(1); + + // Observe range [key1, key3). + let mut req_1_3 = req.clone(); + req_1_3.request_id = 13; + req_1_3.start_key = Key::from_raw(b"key1").into_encoded(); + req_1_3.end_key = Key::from_raw(b"key3").into_encoded(); + let (mut req_tx13, _event_feed_wrap13, receive_event13) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx13.send((req_1_3, WriteFlags::default()))).unwrap(); + let event = receive_event13(false); + event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + }); + + let (mut req_tx24, _event_feed_wrap24, receive_event24) = + new_event_feed(suite.get_region_cdc_client(1)); + let mut req_2_4 = req; + req_2_4.request_id = 24; + req_2_4.start_key = Key::from_raw(b"key2").into_encoded(); + req_2_4.end_key = Key::from_raw(b"key4").into_encoded(); + block_on(req_tx24.send((req_2_4, WriteFlags::default()))).unwrap(); + let event = receive_event24(false); + event + .events + .into_iter() + .for_each(|e| match e.event.unwrap() { + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + }); + + // Sleep a while to make sure the 
stream is registered. + sleep_ms(1000); + + let receive_and_check_events = |is13: bool, is24: bool| -> Vec { + if is13 && is24 { + let mut events = receive_event13(false).events.to_vec(); + let mut events24 = receive_event24(false).events.to_vec(); + events.append(&mut events24); + events + } else if is13 { + let events = receive_event13(false).events.to_vec(); + let event = receive_event24(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + events + } else if is24 { + let events = receive_event24(false).events.to_vec(); + let event = receive_event13(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + events + } else { + let event = receive_event13(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + let event = receive_event24(true); + assert!(event.resolved_ts.is_some(), "{:?}", event); + vec![] + } + }; + for case in &[ + ("key1", true, false, true /* commit */), + ("key1", true, false, false /* rollback */), + ("key2", true, true, true), + ("key3", false, true, true), + ("key4", false, false, true), + ] { + let (k, v) = (case.0.to_owned(), "value".to_owned()); + // Prewrite + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone().into_bytes(); + mutation.value = v.into_bytes(); + suite.must_kv_prewrite(1, vec![mutation], k.clone().into_bytes(), start_ts); + let mut events = receive_and_check_events(case.1, case.2); + while let Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Prewrite); + } + other => panic!("unknown event {:?}", other), + } + } + + if case.3 { + // Commit + let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(1, vec![k.into_bytes()], start_ts, commit_ts); + let mut events = receive_and_check_events(case.1, 
case.2); + while let Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Commit); + } + other => panic!("unknown event {:?}", other), + } + } + } else { + // Rollback + suite.must_kv_rollback(1, vec![k.into_bytes()], start_ts); + let mut events = receive_and_check_events(case.1, case.2); + while let Some(event) = events.pop() { + match event.event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + assert_eq!(entries.entries[0].get_type(), EventLogType::Rollback); + } + other => panic!("unknown event {:?}", other), + } + } + } + } + + suite.stop(); +} From c5ce165ad35121114892fb7b27aeeaae3c3e7afc Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 16 Feb 2023 18:48:02 +0800 Subject: [PATCH 534/676] raftstore-v2: fix destroy blocked by apply progress (#14223) close tikv/tikv#14215 If a peer is marked for destroy, it will skip all apply result, which will make it never apply to committed index. This PR relaxes the check to last_applying_index and always process apply result. It also fixes a bug that new peer created by large ID may not survive restart. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/command/mod.rs | 9 +- components/raftstore-v2/src/operation/life.rs | 5 +- .../raftstore-v2/src/operation/ready/mod.rs | 14 +++ .../raftstore-v2/tests/failpoints/mod.rs | 1 + .../tests/failpoints/test_life.rs | 67 +++++++++++++++ .../tests/integrations/cluster.rs | 75 +++++++++++++++- .../tests/integrations/test_life.rs | 85 +++---------------- .../raftstore/src/store/worker/region.rs | 17 ++-- 8 files changed, 183 insertions(+), 90 deletions(-) create mode 100644 components/raftstore-v2/tests/failpoints/test_life.rs diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index d06e43c0303..9f24241b039 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -320,11 +320,10 @@ impl Peer { } pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { - if !self.serving() { - return; + if !self.serving() || !apply_res.admin_result.is_empty() { + // TODO: remove following log once stable. + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); } - // TODO: remove following log once stable. - info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -391,7 +390,7 @@ impl Peer { } let last_applying_index = self.compact_log_context().last_applying_index(); let committed_index = self.entry_storage().commit_index(); - if last_applying_index < committed_index { + if last_applying_index < committed_index || !self.serving() { // We need to continue to apply after previous page is finished. 
self.set_has_ready(); } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 8b63f9aae89..fdba7efdf4d 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -570,12 +570,13 @@ impl Peer { /// tablet. #[inline] pub fn postponed_destroy(&self) -> bool { + let last_applying_index = self.compact_log_context().last_applying_index(); let entry_storage = self.storage().entry_storage(); // If it's marked as tombstone, then it must be changed by conf change. In // this case, all following entries are skipped so applied_index never equals - // to commit_index. + // to last_applying_index. (self.storage().region_state().get_state() != PeerState::Tombstone - && entry_storage.applied_index() != entry_storage.commit_index()) + && entry_storage.applied_index() != last_applying_index) // Wait for critical commands like split. || self.has_pending_tombstone_tablets() } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 943e3b6ba2f..adb0edf82e4 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -236,6 +236,7 @@ impl Peer { } cmp::Ordering::Greater => { // We need to create the target peer. + info!(self.logger, "mark for destroy for larger ID"; "larger_id" => to_peer.get_id()); self.mark_for_destroy(Some(msg)); return; } @@ -943,6 +944,19 @@ impl Storage { } // If snapshot initializes the peer, we don't need to write apply trace again. if !self.ever_persisted() { + let region_id = self.region().get_id(); + let entry_storage = self.entry_storage(); + let raft_engine = entry_storage.raft_engine(); + if write_task.raft_wb.is_none() { + write_task.raft_wb = Some(raft_engine.log_batch(64)); + } + let wb = write_task.raft_wb.as_mut().unwrap(); + // There may be tombstone key from last peer. 
+ raft_engine + .clean(region_id, 0, entry_storage.raft_state(), wb) + .unwrap_or_else(|e| { + slog_panic!(self.logger(), "failed to clean up region"; "error" => ?e); + }); self.init_apply_trace(write_task); self.set_ever_persisted(); } diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index d04ad2cafc2..e2f6884dd54 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -10,5 +10,6 @@ mod cluster; mod test_basic_write; mod test_bootstrap; +mod test_life; mod test_split; mod test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_life.rs b/components/raftstore-v2/tests/failpoints/test_life.rs new file mode 100644 index 00000000000..ed05c1c6fad --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_life.rs @@ -0,0 +1,67 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::CF_DEFAULT; +use futures::executor::block_on; +use kvproto::raft_serverpb::RaftMessage; +use raft::prelude::MessageType; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use tikv_util::store::new_peer; + +use crate::cluster::{life_helper::assert_peer_not_exist, Cluster}; + +/// Test if a peer can be destroyed when it's applying entries +#[test] +fn test_destroy_by_larger_id_while_applying() { + let fp = "APPLY_COMMITTED_ENTRIES"; + let mut cluster = Cluster::default(); + let router = &cluster.routers[0]; + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + fail::cfg(fp, "pause").unwrap(); + + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(2, msg).unwrap(); + assert!(block_on(sub.wait_committed())); + + let mut larger_id_msg = Box::::default(); + 
larger_id_msg.set_region_id(2); + let mut target_peer = header.get_peer().clone(); + target_peer.set_id(target_peer.get_id() + 1); + larger_id_msg.set_to_peer(target_peer.clone()); + larger_id_msg.set_region_epoch(header.get_region_epoch().clone()); + larger_id_msg + .mut_region_epoch() + .set_conf_ver(header.get_region_epoch().get_conf_ver() + 1); + larger_id_msg.set_from_peer(new_peer(2, 8)); + let raft_message = larger_id_msg.mut_message(); + raft_message.set_msg_type(MessageType::MsgHeartbeat); + raft_message.set_from(8); + raft_message.set_to(target_peer.get_id()); + raft_message.set_term(10); + + // Larger ID should trigger destroy. + router.send_raft_message(larger_id_msg).unwrap(); + fail::remove(fp); + assert_peer_not_exist(2, header.get_peer().get_id(), router); + let meta = router + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, target_peer.get_id()); + assert_eq!(meta.raft_status.hard_state.term, 10); + + std::thread::sleep(Duration::from_millis(10)); + + // New peer should survive restart. 
+ cluster.restart(0); + let router = &cluster.routers[0]; + let meta = router + .must_query_debug_info(2, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.id, target_peer.get_id()); + assert_eq!(meta.raft_status.hard_state.term, 10); +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index ac3f30c7107..4a14b85f616 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -98,7 +98,10 @@ impl TestRouter { thread::sleep(Duration::from_millis(10)); continue; } - return block_on(sub.result()); + let res = block_on(sub.result()); + if res.is_some() { + return res; + } } None } @@ -721,3 +724,73 @@ pub mod split_helper { (left, right) } } + +pub mod life_helper { + use std::assert_matches::assert_matches; + + use engine_traits::RaftEngine; + use kvproto::raft_serverpb::{ExtraMessageType, PeerState}; + + use super::*; + + pub fn assert_peer_not_exist(region_id: u64, peer_id: u64, router: &TestRouter) { + let timer = Instant::now(); + loop { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + match router.send(region_id, msg) { + Err(TrySendError::Disconnected(_)) => return, + Ok(()) => { + if let Some(m) = block_on(sub.result()) { + if m.raft_status.id != peer_id { + return; + } + } + } + Err(_) => (), + } + if timer.elapsed() < Duration::from_secs(3) { + thread::sleep(Duration::from_millis(10)); + } else { + panic!("peer of {} still exists", region_id); + } + } + } + + // TODO: make raft engine support more suitable way to verify range is empty. + /// Verify all states in raft engine are cleared. 
+ pub fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb::Peer) { + let mut buf = vec![]; + raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); + assert!(buf.is_empty(), "{:?}", buf); + assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); + assert_matches!(raft_engine.get_apply_state(region_id, u64::MAX), Ok(None)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_matches!(region_state.get_state(), PeerState::Tombstone); + assert!( + region_state.get_region().get_peers().contains(peer), + "{:?}", + region_state + ); + } + + #[track_caller] + pub fn assert_valid_report(report: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!( + report.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerResponse + ); + assert_eq!(report.get_region_id(), region_id); + assert_eq!(report.get_from_peer().get_id(), peer_id); + } + + #[track_caller] + pub fn assert_tombstone_msg(msg: &RaftMessage, region_id: u64, peer_id: u64) { + assert_eq!(msg.get_region_id(), region_id); + assert_eq!(msg.get_to_peer().get_id(), peer_id); + assert!(msg.get_is_tombstone()); + } +} diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index 2a5dfafc509..373763a53ef 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -1,88 +1,23 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - assert_matches::assert_matches, - thread, - time::{Duration, Instant}, -}; +use std::time::Duration; -use crossbeam::channel::TrySendError; -use engine_traits::{RaftEngine, RaftEngineReadOnly, CF_DEFAULT}; +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT}; use futures::executor::block_on; -use kvproto::{ - metapb, - raft_cmdpb::AdminCmdType, - raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, -}; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use raft::prelude::{ConfChangeType, MessageType}; use raftstore_v2::{ - router::{DebugInfoChannel, PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick}, SimpleWriteEncoder, }; use tikv_util::store::{new_learner_peer, new_peer}; -use crate::cluster::{Cluster, TestRouter}; - -fn assert_peer_not_exist(region_id: u64, peer_id: u64, router: &TestRouter) { - let timer = Instant::now(); - loop { - let (ch, sub) = DebugInfoChannel::pair(); - let msg = PeerMsg::QueryDebugInfo(ch); - match router.send(region_id, msg) { - Err(TrySendError::Disconnected(_)) => return, - Ok(()) => { - if let Some(m) = block_on(sub.result()) { - if m.raft_status.id != peer_id { - return; - } - } - } - Err(_) => (), - } - if timer.elapsed() < Duration::from_secs(3) { - thread::sleep(Duration::from_millis(10)); - } else { - panic!("peer of {} still exists", region_id); - } - } -} - -// TODO: make raft engine support more suitable way to verify range is empty. -/// Verify all states in raft engine are cleared. 
-fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb::Peer) { - let mut buf = vec![]; - raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); - assert!(buf.is_empty(), "{:?}", buf); - assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); - assert_matches!(raft_engine.get_apply_state(region_id, u64::MAX), Ok(None)); - let region_state = raft_engine - .get_region_state(region_id, u64::MAX) - .unwrap() - .unwrap(); - assert_matches!(region_state.get_state(), PeerState::Tombstone); - assert!( - region_state.get_region().get_peers().contains(peer), - "{:?}", - region_state - ); -} - -#[track_caller] -fn assert_valid_report(report: &RaftMessage, region_id: u64, peer_id: u64) { - assert_eq!( - report.get_extra_msg().get_type(), - ExtraMessageType::MsgGcPeerResponse - ); - assert_eq!(report.get_region_id(), region_id); - assert_eq!(report.get_from_peer().get_id(), peer_id); -} - -#[track_caller] -fn assert_tombstone_msg(msg: &RaftMessage, region_id: u64, peer_id: u64) { - assert_eq!(msg.get_region_id(), region_id); - assert_eq!(msg.get_to_peer().get_id(), peer_id); - assert!(msg.get_is_tombstone()); -} +use crate::cluster::{ + life_helper::{ + assert_peer_not_exist, assert_tombstone, assert_tombstone_msg, assert_valid_report, + }, + Cluster, +}; /// Test a peer can be created by general raft message and destroyed tombstone /// message. 
diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 694be1a2b8c..84bc3b27084 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -929,14 +929,13 @@ pub(crate) mod tests { }, }; - const PENDING_APPLY_CHECK_INTERVAL: u64 = 200; + const PENDING_APPLY_CHECK_INTERVAL: Duration = Duration::from_millis(200); const STALE_PEER_CHECK_TICK: usize = 1; pub fn make_raftstore_cfg(use_delete_range: bool) -> Arc> { let mut store_cfg = Config::default(); store_cfg.snap_apply_batch_size = ReadableSize(0); - store_cfg.region_worker_tick_interval = - ReadableDuration::millis(PENDING_APPLY_CHECK_INTERVAL); + store_cfg.region_worker_tick_interval = ReadableDuration(PENDING_APPLY_CHECK_INTERVAL); store_cfg.clean_stale_ranges_tick = STALE_PEER_CHECK_TICK; store_cfg.use_delete_range = use_delete_range; store_cfg.snap_generator_pool_size = 2; @@ -1349,7 +1348,7 @@ pub(crate) mod tests { ); gen_and_apply_snap(5); destroy_region(6); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); assert!(check_region_exist(6)); assert_eq!( engine @@ -1406,7 +1405,7 @@ pub(crate) mod tests { .unwrap(), 2 ); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); assert!(!check_region_exist(6)); #[cfg(feature = "failpoints")] @@ -1414,12 +1413,16 @@ pub(crate) mod tests { engine.kv.compact_files_in_range(None, None, None).unwrap(); fail::cfg("handle_new_pending_applies", "return").unwrap(); gen_and_apply_snap(7); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); must_not_finish(&[7]); fail::remove("handle_new_pending_applies"); - thread::sleep(Duration::from_millis(PENDING_APPLY_CHECK_INTERVAL * 2)); + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); 
wait_apply_finish(&[7]); } + bg_worker.stop(); + // Wait the timer fired. Otherwise deletion of directory may race with timer + // task. + thread::sleep(PENDING_APPLY_CHECK_INTERVAL * 2); } #[derive(Clone, Default)] From e784a50463b40250d273b0c21a6417bdb374379b Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 17 Feb 2023 15:50:02 +0800 Subject: [PATCH 535/676] raftstore-v2: impl report buckets (#14044) ref tikv/tikv#12842 1. implement bucket split and report to pd Signed-off-by: bufferflies <1045931706@qq.com> --- components/pd_client/src/client.rs | 137 ++++++----- components/pd_client/src/client_v2.rs | 90 ++++--- components/pd_client/src/lib.rs | 5 + components/raftstore-v2/src/fsm/peer.rs | 8 +- .../raftstore-v2/src/operation/bucket.rs | 223 ++++++++++++++++++ .../src/operation/command/admin/split.rs | 2 + components/raftstore-v2/src/operation/mod.rs | 1 + .../raftstore-v2/src/operation/ready/mod.rs | 1 + components/raftstore-v2/src/raft/peer.rs | 26 +- components/raftstore-v2/src/router/imp.rs | 24 +- components/raftstore-v2/src/router/message.rs | 6 + components/raftstore-v2/src/worker/pd/mod.rs | 3 + .../tests/integrations/cluster.rs | 37 ++- .../tests/integrations/test_pd_heartbeat.rs | 92 +++++++- components/test_pd/src/mocker/mod.rs | 4 + components/test_pd/src/mocker/service.rs | 17 +- components/test_pd/src/server.rs | 27 ++- 17 files changed, 560 insertions(+), 143 deletions(-) create mode 100644 components/raftstore-v2/src/operation/bucket.rs diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 402192596b5..917176b454e 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -269,6 +269,41 @@ impl RpcClient { } } +fn get_region_resp_by_id( + pd_client: Arc, + header: pdpb::RequestHeader, + region_id: u64, +) -> PdFuture { + let timer = Instant::now(); + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_header(header); + 
req.set_region_id(region_id); + + let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { + let handler = { + let inner = client.inner.rl(); + inner + .client_stub + .get_region_by_id_async_opt(&req, call_option_inner(&inner)) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + }; + Box::pin(async move { + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); + check_resp_header(resp.get_header())?; + Ok(resp) + }) as PdFuture<_> + }; + + pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() +} + impl fmt::Debug for RpcClient { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { fmt.debug_struct("RpcClient") @@ -532,82 +567,46 @@ impl PdClient for RpcClient { .boxed() } - fn get_region_by_id(&self, region_id: u64) -> PdFuture> { - let timer = Instant::now(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_header(self.header()); - req.set_region_id(region_id); - - let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = { - let inner = client.inner.rl(); - inner - .client_stub - .get_region_by_id_async_opt(&req, call_option_inner(&inner)) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e); - }) - }; - Box::pin(async move { - let mut resp = handler.await?; - PD_REQUEST_HISTOGRAM_VEC - .get_region_by_id - .observe(timer.saturating_elapsed_secs()); - check_resp_header(resp.get_header())?; - if resp.has_region() { - Ok(Some(resp.take_region())) - } else { - Ok(None) - } - }) as PdFuture<_> - }; + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture> { + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_buckets() { + Ok(Some(resp.take_buckets())) + } else { + Ok(None) + } + }) as PdFuture> + } - 
self.pd_client - .request(req, executor, LEADER_CHANGE_RETRY) - .execute() + fn get_region_by_id(&self, region_id: u64) -> PdFuture> { + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_region() { + Ok(Some(resp.take_region())) + } else { + Ok(None) + } + }) } fn get_region_leader_by_id( &self, region_id: u64, ) -> PdFuture> { - let timer = Instant::now(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_header(self.header()); - req.set_region_id(region_id); - - let executor = move |client: &Client, req: pdpb::GetRegionByIdRequest| { - let handler = { - let inner = client.inner.rl(); - inner - .client_stub - .get_region_by_id_async_opt(&req, call_option_inner(&inner)) - .unwrap_or_else(|e| { - panic!( - "fail to request PD {} err {:?}", - "get_region_leader_by_id", e - ) - }) - }; - Box::pin(async move { - let mut resp = handler.await?; - PD_REQUEST_HISTOGRAM_VEC - .get_region_leader_by_id - .observe(timer.saturating_elapsed_secs()); - check_resp_header(resp.get_header())?; - if resp.has_region() && resp.has_leader() { - Ok(Some((resp.take_region(), resp.take_leader()))) - } else { - Ok(None) - } - }) as PdFuture<_> - }; - - self.pd_client - .request(req, executor, LEADER_CHANGE_RETRY) - .execute() + let header = self.header(); + let pd_client = self.pd_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, header, region_id).await?; + if resp.has_region() && resp.has_leader() { + Ok(Some((resp.take_region(), resp.take_leader()))) + } else { + Ok(None) + } + }) } fn region_heartbeat( diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index cfa0d46303c..b583772bb72 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -515,6 +515,29 @@ impl RpcClient { } } +async fn get_region_resp_by_id( + mut 
raw_client: CachedRawClient, + region_id: u64, +) -> Result { + let timer = Instant::now_coarse(); + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) +} pub trait PdClient { type ResponseChannel: Stream>; @@ -584,6 +607,8 @@ pub trait PdClient { fn get_region_by_id(&mut self, region_id: u64) -> PdFuture>; + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture>; + fn get_region_leader_by_id( &mut self, region_id: u64, @@ -1045,31 +1070,22 @@ impl PdClient for RpcClient { }) } - fn get_region_by_id(&mut self, region_id: u64) -> PdFuture> { - let timer = Instant::now_coarse(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_region_id(region_id); + fn get_buckets_by_id(&self, region_id: u64) -> PdFuture> { + let pd_client = self.raw_client.clone(); + Box::pin(async move { + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; + if resp.has_buckets() { + Ok(Some(resp.take_buckets())) + } else { + Ok(None) + } + }) + } - let mut raw_client = self.raw_client.clone(); + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture> { + let pd_client = self.raw_client.clone(); Box::pin(async move { - raw_client.wait_for_ready().await?; - req.set_header(raw_client.header()); - let resp = raw_client - .stub() - .get_region_by_id_async_opt( - &req, - raw_client.call_option().timeout(request_timeout()), - ) - .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e); - }) - .await; - 
PD_REQUEST_HISTOGRAM_VEC - .get_region_by_id - .observe(timer.saturating_elapsed_secs()); - let mut resp = raw_client.check_resp(resp)?; - check_resp_header(resp.get_header())?; + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; if resp.has_region() { Ok(Some(resp.take_region())) } else { @@ -1082,33 +1098,9 @@ impl PdClient for RpcClient { &mut self, region_id: u64, ) -> PdFuture> { - let timer = Instant::now_coarse(); - - let mut req = pdpb::GetRegionByIdRequest::default(); - req.set_region_id(region_id); - - let mut raw_client = self.raw_client.clone(); + let pd_client = self.raw_client.clone(); Box::pin(async move { - raw_client.wait_for_ready().await?; - req.set_header(raw_client.header()); - let resp = raw_client - .stub() - .get_region_by_id_async_opt( - &req, - raw_client.call_option().timeout(request_timeout()), - ) - .unwrap_or_else(|e| { - panic!( - "fail to request PD {} err {:?}", - "get_region_leader_by_id", e - ); - }) - .await; - PD_REQUEST_HISTOGRAM_VEC - .get_region_leader_by_id - .observe(timer.saturating_elapsed_secs()); - let mut resp = raw_client.check_resp(resp)?; - check_resp_header(resp.get_header())?; + let mut resp = get_region_resp_by_id(pd_client, region_id).await?; if resp.has_region() && resp.has_leader() { Ok(Some((resp.take_region(), resp.take_leader()))) } else { diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index b877750770d..00b5efff23b 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -345,6 +345,11 @@ pub trait PdClient: Send + Sync { unimplemented!(); } + // Gets Buckets by Region id. + fn get_buckets_by_id(&self, _region_id: u64) -> PdFuture> { + unimplemented!(); + } + /// Gets Region and its leader by Region id. 
fn get_region_leader_by_id( &self, diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 1b127e5851b..814a0b1311a 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -195,6 +195,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } + self.fsm.peer.maybe_gen_approximate_buckets(self.store_ctx); // Speed up setup if there is only one peer. if self.fsm.peer.is_leader() { self.fsm.peer.set_has_ready(); @@ -223,7 +224,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::ReactivateMemoryLock => { self.fsm.peer.on_reactivate_memory_lock_tick(self.store_ctx) } - PeerTick::ReportBuckets => unimplemented!(), + PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), PeerTick::GcPeer => self.fsm.peer_mut().on_gc_peer_tick(self.store_ctx), } @@ -305,6 +306,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + } => self.on_refresh_region_buckets(region_epoch, buckets, bucket_ranges), PeerMsg::RequestHalfSplit { request, ch } => self .fsm .peer_mut() diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs new file mode 100644 index 00000000000..2bc2d232b12 --- /dev/null +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -0,0 +1,223 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the interactions with bucket. 
+ +use std::sync::Arc; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::metapb::RegionEpoch; +use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, +}; +use slog::{error, warn}; + +use crate::{batch::StoreContext, fsm::PeerFsmDelegate, raft::Peer, router::PeerTick, worker::pd}; + +impl Peer { + #[inline] + pub fn on_refresh_region_buckets( + &mut self, + store_ctx: &mut StoreContext, + region_epoch: RegionEpoch, + mut buckets: Vec, + bucket_ranges: Option>, + ) { + // bucket version layout + // term logical counter + // |-----------|-----------| + // high bits low bits + // term: given 10s election timeout, the 32 bit means 1362 year running time + let gen_bucket_version = |term, current_version| { + let current_version_term = current_version >> 32; + let bucket_version: u64 = if current_version_term == term { + current_version + 1 + } else { + if term > u32::MAX.into() { + error!( + self.logger, + "unexpected term {} more than u32::MAX. Bucket + version will be backward.", + term + ); + } + term << 32 + }; + bucket_version + }; + + let region = self.region(); + let current_version = self + .region_buckets() + .as_ref() + .or_else(|| self.last_region_buckets().as_ref()) + .map(|b| b.meta.version) + .unwrap_or_default(); + let mut region_buckets: BucketStat; + // The region buckets reset after this region happened split or merge. + // The message should be dropped if it's epoch is lower than the regions. + // The bucket ranges is none when the region buckets is also none. + // So this condition indicates that the region buckets needs to refresh not + // renew. 
+ if let (Some(bucket_ranges), Some(peer_region_buckets)) = + (bucket_ranges, self.region_buckets()) + { + assert_eq!(buckets.len(), bucket_ranges.len()); + let mut meta_idx = 0; + region_buckets = peer_region_buckets.clone(); + let mut meta = (*region_buckets.meta).clone(); + if !buckets.is_empty() { + meta.version = gen_bucket_version(self.term(), current_version); + } + meta.region_epoch = region_epoch; + for (bucket, bucket_range) in buckets.into_iter().zip(bucket_ranges) { + // the bucket ranges maybe need to split or merge not all the meta keys, so it + // needs to find the first keys. + while meta_idx < meta.keys.len() && meta.keys[meta_idx] != bucket_range.0 { + meta_idx += 1; + } + // meta_idx can't be not the last entry (which is end key) + if meta_idx >= meta.keys.len() - 1 { + warn!( + self.logger, + "can't find the bucket key"; + "bucket_range_key" => log_wrappers::Value::key(&bucket_range.0)); + break; + } + // the bucket size is small and does not have split keys, + // then it should be merged with its left neighbor + let region_bucket_merge_size = store_ctx + .coprocessor_host + .cfg + .region_bucket_merge_size_ratio + * (store_ctx.coprocessor_host.cfg.region_bucket_size.0 as f64); + if bucket.keys.is_empty() && bucket.size <= (region_bucket_merge_size as u64) { + meta.sizes[meta_idx] = bucket.size; + // the region has more than one bucket + // and the left neighbor + current bucket size is not very big + if meta.keys.len() > 2 + && meta_idx != 0 + && meta.sizes[meta_idx - 1] + bucket.size + < store_ctx.coprocessor_host.cfg.region_bucket_size.0 * 2 + { + // bucket is too small + region_buckets.left_merge(meta_idx); + meta.left_merge(meta_idx); + continue; + } + } else { + // update size + meta.sizes[meta_idx] = bucket.size / (bucket.keys.len() + 1) as u64; + // insert new bucket keys (split the original bucket) + for bucket_key in bucket.keys { + meta_idx += 1; + region_buckets.split(meta_idx); + meta.split(meta_idx, bucket_key); + } + } + meta_idx 
+= 1; + } + region_buckets.meta = Arc::new(meta); + } else { + // when the region buckets is none, the exclusive buckets includes all the + // bucket keys. + assert_eq!(buckets.len(), 1); + let bucket_keys = buckets.pop().unwrap().keys; + let bucket_count = bucket_keys.len() + 1; + let mut meta = BucketMeta { + region_id: self.region_id(), + region_epoch, + version: gen_bucket_version(self.term(), current_version), + keys: bucket_keys, + sizes: vec![store_ctx.coprocessor_host.cfg.region_bucket_size.0; bucket_count], + }; + // padding the boundary keys and initialize the flow. + meta.keys.insert(0, region.get_start_key().to_vec()); + meta.keys.push(region.get_end_key().to_vec()); + let stats = new_bucket_stats(&meta); + region_buckets = BucketStat::new(Arc::new(meta), stats); + } + + let buckets_count = region_buckets.meta.keys.len() - 1; + store_ctx.coprocessor_host.on_region_changed( + region, + RegionChangeEvent::UpdateBuckets(buckets_count), + self.state_role(), + ); + let meta = region_buckets.meta.clone(); + self.set_region_buckets(Some(region_buckets)); + let mut store_meta = store_ctx.store_meta.lock().unwrap(); + if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { + reader.0.update(ReadProgress::region_buckets(meta)); + } + } + + #[inline] + pub fn report_region_buckets_pd(&mut self, ctx: &StoreContext) { + let region_buckets = self.region_buckets().as_ref().unwrap(); + let task = pd::Task::ReportBuckets(region_buckets.clone()); + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!( + self.logger, + "failed to report buckets to pd"; + "err" => ?e, + ); + } + } + + pub fn maybe_gen_approximate_buckets(&self, ctx: &StoreContext) { + if ctx.coprocessor_host.cfg.enable_region_bucket && self.storage().is_initialized() { + if let Err(e) = ctx + .schedulers + .split_check + .schedule(SplitCheckTask::ApproximateBuckets(self.region().clone())) + { + error!( + self.logger, + "failed to schedule check approximate buckets"; + "err" => %e, + ); 
+ } + } + } +} + +impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> +where + EK: KvEngine, + ER: RaftEngine, +{ + #[inline] + pub fn on_report_region_buckets_tick(&mut self) { + if !self.fsm.peer().is_leader() || self.fsm.peer().region_buckets().is_none() { + return; + } + self.fsm.peer_mut().report_region_buckets_pd(self.store_ctx); + self.schedule_tick(PeerTick::ReportBuckets); + } + + pub fn on_refresh_region_buckets( + &mut self, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + if util::is_epoch_stale(®ion_epoch, self.fsm.peer().region().get_region_epoch()) { + error!( + self.fsm.peer().logger, + "receive a stale refresh region bucket message"; + "epoch" => ?region_epoch, + "current_epoch" => ?self.fsm.peer().region().get_region_epoch(), + ); + return; + } + self.fsm.peer_mut().on_refresh_region_buckets( + self.store_ctx, + region_epoch, + buckets, + bucket_ranges, + ); + self.schedule_tick(PeerTick::ReportBuckets); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 86b0aab558e..b4e2b4654e7 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -215,6 +215,8 @@ impl Peer { control.skip_split_count += 1; return false; } + // todo: the suspected buckets range should generated by the diff write bytes. + // it will be done in next pr. let task = SplitCheckTask::split_check(self.region().clone(), true, CheckPolicy::Scan, None); if let Err(e) = ctx.schedulers.split_check.schedule(task) { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index f022ab91109..ee0680f7fbb 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+mod bucket; mod command; mod life; mod pd; diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index adb0edf82e4..ebff7ad44ce 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -790,6 +790,7 @@ impl Peer { self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); self.add_pending_tick(PeerTick::CheckLongUncommitted); + self.add_pending_tick(PeerTick::ReportBuckets); self.maybe_schedule_gc_peer_tick(); } StateRole::Follower => { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 814dc72e622..142b4e91943 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -210,6 +210,24 @@ impl Peer { Ok(peer) } + #[inline] + pub fn region_buckets(&self) -> &Option { + &self.region_buckets + } + + #[inline] + pub fn set_region_buckets(&mut self, buckets: Option) { + if let Some(b) = self.region_buckets.take() { + self.last_region_buckets = Some(b); + } + self.region_buckets = buckets; + } + + #[inline] + pub fn last_region_buckets(&self) -> &Option { + &self.last_region_buckets + } + #[inline] pub fn region(&self) -> &metapb::Region { self.raft_group.store().region() @@ -646,13 +664,7 @@ impl Peer { #[inline] pub fn post_split(&mut self) { - self.reset_region_buckets(); - } - - pub fn reset_region_buckets(&mut self) { - if self.region_buckets.is_some() { - self.last_region_buckets = self.region_buckets.take(); - } + self.set_region_buckets(None); } pub fn maybe_campaign(&mut self) -> bool { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index a9a8b23b571..bcda7298bd4 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -72,12 +72,26 @@ impl raftstore::coprocessor::StoreHandle for Store fn 
refresh_region_buckets( &self, - _region_id: u64, - _region_epoch: kvproto::metapb::RegionEpoch, - _buckets: Vec, - _bucket_ranges: Option>, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, ) { - // TODO + let res = self.send( + region_id, + PeerMsg::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + }, + ); + if let Err(e) = res { + warn!( + self.logger(), + "failed to refresh region buckets"; + "err" => %e, + ); + } } fn update_compute_hash_result( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 04bc5dbab10..317ba74d4d6 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -4,6 +4,7 @@ use kvproto::{ metapb, + metapb::RegionEpoch, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, raft_serverpb::RaftMessage, }; @@ -185,6 +186,11 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + RefreshRegionBuckets { + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + }, RequestHalfSplit { request: RequestHalfSplit, ch: CmdResChannel, diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index b23d1500914..e529f7dddee 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -79,6 +79,7 @@ pub enum Task { initial_status: u64, txn_ext: Arc, }, + ReportBuckets(BucketStat), ReportMinResolvedTs { store_id: u64, min_resolved_ts: u64, @@ -138,6 +139,7 @@ impl Display for Task { "update the max timestamp for region {} in the concurrency manager", region_id ), + Task::ReportBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), Task::ReportMinResolvedTs { store_id, min_resolved_ts, @@ -282,6 +284,7 @@ where initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + Task::ReportBuckets(buckets) => 
self.handle_report_region_buckets(buckets), Task::ReportMinResolvedTs { store_id, min_resolved_ts, diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 4a14b85f616..065d032eaa2 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -30,7 +30,7 @@ use kvproto::{ use pd_client::RpcClient; use raft::eraftpb::MessageType; use raftstore::{ - coprocessor::CoprocessorHost, + coprocessor::{Config as CopConfig, CoprocessorHost}, store::{ region_meta::{RegionLocalState, RegionMeta}, AutoSplitController, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, @@ -237,6 +237,7 @@ pub struct RunningState { pub registry: TabletRegistry, pub system: StoreSystem, pub cfg: Arc>, + pub cop_cfg: Arc>, pub transport: TestTransport, snap_mgr: TabletSnapManager, background: Worker, @@ -247,6 +248,7 @@ impl RunningState { pd_client: &Arc, path: &Path, cfg: Arc>, + cop_cfg: Arc>, transport: TestTransport, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, @@ -293,11 +295,9 @@ impl RunningState { let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); + let coprocessor_host = + CoprocessorHost::new(router.store_router().clone(), cop_cfg.value().clone()); - let coprocessor_host = CoprocessorHost::new( - router.store_router().clone(), - raftstore::coprocessor::Config::default(), - ); let background = Worker::new("background"); let pd_worker = LazyWorker::new("pd-worker"); system @@ -330,6 +330,7 @@ impl RunningState { transport, snap_mgr, background, + cop_cfg, }; (TestRouter(router), state) } @@ -361,11 +362,17 @@ impl TestNode { } } - fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { + fn start( + &mut self, + cfg: Arc>, + cop_cfg: Arc>, + trans: TestTransport, + 
) -> TestRouter { let (router, state) = RunningState::new( &self.pd_client, self.path.path(), cfg, + cop_cfg, trans, ConcurrencyManager::new(1.into()), None, @@ -392,8 +399,9 @@ impl TestNode { let state = self.running_state().unwrap(); let prev_transport = state.transport.clone(); let cfg = state.cfg.clone(); + let cop_cfg = state.cop_cfg.clone(); self.stop(); - self.start(cfg, prev_transport) + self.start(cfg, cop_cfg, prev_transport) } pub fn running_state(&self) -> Option<&RunningState> { @@ -492,6 +500,14 @@ impl Cluster { } pub fn with_node_count(count: usize, config: Option) -> Self { + Cluster::with_configs(count, config, None) + } + + pub fn with_cop_cfg(coprocessor_cfg: CopConfig) -> Cluster { + Cluster::with_configs(1, None, Some(coprocessor_cfg)) + } + + pub fn with_configs(count: usize, config: Option, cop_cfg: Option) -> Self { let pd_server = test_pd::Server::new(1); let logger = slog_global::borrow_global().new(o!()); let mut cluster = Cluster { @@ -507,10 +523,15 @@ impl Cluster { v2_default_config() }; disable_all_auto_ticks(&mut cfg); + let cop_cfg = cop_cfg.unwrap_or_default(); for _ in 1..=count { let mut node = TestNode::with_pd(&cluster.pd_server, cluster.logger.clone()); let (tx, rx) = new_test_transport(); - let router = node.start(Arc::new(VersionTrack::new(cfg.clone())), tx); + let router = node.start( + Arc::new(VersionTrack::new(cfg.clone())), + Arc::new(VersionTrack::new(cop_cfg.clone())), + tx, + ); cluster.nodes.push(node); cluster.receivers.push(rx); cluster.routers.push(router); diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 09ead81c0c2..11ff6bd4d02 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -1,9 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::time::Duration; + +use engine_traits::{MiscExt, CF_DEFAULT}; use futures::executor::block_on; use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; use pd_client::PdClient; -use tikv_util::store::new_peer; +use raftstore::coprocessor::Config as CopConfig; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; +use tikv_util::{config::ReadableSize, store::new_peer}; use crate::cluster::Cluster; @@ -59,3 +67,85 @@ fn test_store_heartbeat() { } panic!("failed to get store stats"); } + +#[test] +fn test_report_buckets() { + let region_id = 2; + let mut cop_cfg = CopConfig::default(); + cop_cfg.enable_region_bucket = true; + cop_cfg.region_bucket_size = ReadableSize::kb(1); + let cluster = Cluster::with_cop_cfg(cop_cfg); + let store_id = cluster.node(0).id(); + let router = &cluster.routers[0]; + + // When there is only one peer, it should campaign immediately. + let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(store_id, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(store_id, 3) + ); + router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + + // load data to split bucket. + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut suffix = String::from(""); + for _ in 0..200 { + suffix.push_str("fake "); + } + for i in 0..10 { + let mut put = SimpleWriteEncoder::with_capacity(64); + let mut key = format!("key-{}", i); + key.push_str(&suffix); + put.put(CF_DEFAULT, key.as_bytes(), b"value"); + let (msg, sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(region_id, msg).unwrap(); + let _resp = block_on(sub.result()).unwrap(); + } + // To find the split keys, it should flush memtable manually. 
+ let mut cached = cluster.node(0).tablet_registry().get(region_id).unwrap(); + cached.latest().unwrap().flush_cf(CF_DEFAULT, true).unwrap(); + // send split region check to split bucket. + router + .send(region_id, PeerMsg::Tick(PeerTick::SplitRegionCheck)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + // report buckets to pd. + router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + let mut buckets_tmp = vec![]; + let mut bucket_ranges = vec![]; + if let Some(buckets) = resp { + assert!(buckets.get_keys().len() > 2); + assert_eq!(buckets.get_region_id(), region_id); + for i in 0..buckets.keys.len() - 1 { + buckets_tmp.push(raftstore::store::Bucket::default()); + let bucket_range = + raftstore::store::BucketRange(buckets.keys[i].clone(), buckets.keys[i + 1].clone()); + bucket_ranges.push(bucket_range); + } + } + + // send the same region buckets to refresh which needs to merge the last. 
+ let resp = block_on(cluster.node(0).pd_client().get_region_by_id(region_id)).unwrap(); + if let Some(region) = resp { + let region_epoch = region.get_region_epoch().clone(); + for _ in 0..2 { + let msg = PeerMsg::RefreshRegionBuckets { + region_epoch: region_epoch.clone(), + buckets: buckets_tmp.clone(), + bucket_ranges: Some(bucket_ranges.clone()), + }; + router.send(region_id, msg).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + } + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index fc257b12a9f..d8282ca3df0 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -133,6 +133,10 @@ pub trait PdMocker { None } + fn report_buckets(&self, _: &ReportBucketsRequest) -> Option> { + None + } + fn get_region(&self, _: &GetRegionRequest) -> Option> { None } diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 45dd6e5661d..330a5375fb2 100644 --- a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -8,7 +8,7 @@ use std::sync::{ use collections::HashMap; use fail::fail_point; use kvproto::{ - metapb::{Peer, Region, Store, StoreState}, + metapb::{Buckets, Peer, Region, Store, StoreState}, pdpb::*, }; @@ -21,6 +21,7 @@ pub struct Service { is_bootstrapped: AtomicBool, stores: Mutex>, regions: Mutex>, + buckets: Mutex>, leaders: Mutex>, feature_gate: Mutex, } @@ -35,6 +36,7 @@ impl Service { regions: Mutex::new(HashMap::default()), leaders: Mutex::new(HashMap::default()), feature_gate: Mutex::new(String::default()), + buckets: Mutex::new(HashMap::default()), } } @@ -210,6 +212,9 @@ impl PdMocker for Service { Some(region) => { resp.set_header(Service::header()); resp.set_region(region.clone()); + if let Some(bucket) = self.buckets.lock().unwrap().get(&req.get_region_id()) { + resp.set_buckets(bucket.clone()); + } if let Some(leader) = leaders.get(®ion.get_id()) { 
resp.set_leader(leader.clone()); } @@ -227,6 +232,16 @@ impl PdMocker for Service { } } + fn report_buckets(&self, req: &ReportBucketsRequest) -> Option> { + let buckets = req.get_buckets(); + let region_id = req.get_buckets().get_region_id(); + self.buckets + .lock() + .unwrap() + .insert(region_id, buckets.clone()); + None + } + fn region_heartbeat( &self, req: &RegionHeartbeatRequest, diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 28d4077b674..b1909485ac8 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -13,8 +13,8 @@ use std::{ use fail::fail_point; use futures::{future, SinkExt, TryFutureExt, TryStreamExt}; use grpcio::{ - DuplexSink, EnvBuilder, RequestStream, RpcContext, RpcStatus, RpcStatusCode, - Server as GrpcServer, ServerBuilder, ServerStreamingSink, UnarySink, WriteFlags, + ClientStreamingSink, DuplexSink, EnvBuilder, RequestStream, RpcContext, RpcStatus, + RpcStatusCode, Server as GrpcServer, ServerBuilder, ServerStreamingSink, UnarySink, WriteFlags, }; use kvproto::pdpb::*; use pd_client::Error as PdError; @@ -360,6 +360,29 @@ impl Pd for PdMock { hijack_unary(self, ctx, sink, |c| c.store_heartbeat(&req)) } + fn report_buckets( + &mut self, + ctx: grpcio::RpcContext<'_>, + stream: RequestStream, + sink: ClientStreamingSink, + ) { + let mock = self.clone(); + ctx.spawn(async move { + let mut stream = stream.map_err(PdError::from); + while let Ok(Some(req)) = stream.try_next().await { + let resp = mock + .case + .as_ref() + .and_then(|case| case.report_buckets(&req)) + .or_else(|| mock.default_handler.report_buckets(&req)); + if let Some(Ok(resp)) = resp { + sink.success(resp); + break; + } + } + }); + } + fn region_heartbeat( &mut self, ctx: RpcContext<'_>, From 07b2bde11fe072bfe36a41b24758f103a7dcbecd Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 17 Feb 2023 17:07:04 +0800 Subject: [PATCH 536/676] raftstore,resolved_ts: advance resolved ts as needed (#14123) close 
tikv/tikv#13110, close tikv/tikv#14122, close pingcap/tidb#40903 Fix an issue that stale read fails when a leader is slowly. Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot Co-authored-by: Jay --- components/raftstore/src/store/peer.rs | 9 +- components/raftstore/src/store/util.rs | 14 +++ components/raftstore/src/store/worker/read.rs | 106 +++++++++++++++++- components/resolved_ts/src/advance.rs | 25 +++-- components/resolved_ts/src/endpoint.rs | 23 +++- components/resolved_ts/src/resolver.rs | 2 +- 6 files changed, 162 insertions(+), 17 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a6010a6761f..c788256799b 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -65,7 +65,7 @@ use tikv_util::{ }; use time::{Duration as TimeDuration, Timespec}; use tracker::GLOBAL_TRACKERS; -use txn_types::WriteBatchFlags; +use txn_types::{TimeStamp, WriteBatchFlags}; use uuid::Uuid; use super::{ @@ -109,8 +109,8 @@ use crate::{ }; const SHRINK_CACHE_CAPACITY: usize = 64; -const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; // 1s +const MIN_BCAST_WAKE_UP_INTERVAL: u64 = 1_000; const REGION_READ_PROGRESS_CAP: usize = 128; #[doc(hidden)] pub const MAX_COMMITTED_SIZE_PER_READY: u64 = 16 * 1024 * 1024; @@ -4796,6 +4796,11 @@ where let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); let safe_ts = self.read_progress.safe_ts(); if safe_ts < read_ts { + // Advancing resolved ts may be expensive, only notify if read_ts - safe_ts > + // 200ms. 
+ if TimeStamp::from(read_ts).physical() > TimeStamp::from(safe_ts).physical() + 200 { + self.read_progress.notify_advance_resolved_ts(); + } warn!( "read rejected by safe timestamp"; "safe ts" => safe_ts, diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 0127cc5c7e6..d48c5e78e7c 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -39,6 +39,7 @@ use tikv_util::{ Either, }; use time::{Duration, Timespec}; +use tokio::sync::Notify; use txn_types::WriteBatchFlags; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; @@ -1234,6 +1235,16 @@ impl RegionReadProgress { } } + pub fn update_advance_resolved_ts_notify(&self, advance_notify: Arc) { + self.core.lock().unwrap().advance_notify = Some(advance_notify); + } + + pub fn notify_advance_resolved_ts(&self) { + if let Ok(core) = self.core.try_lock() && let Some(advance_notify) = &core.advance_notify { + advance_notify.notify_waiters(); + } + } + pub fn update_applied(&self, applied: u64, coprocessor: &CoprocessorHost) { let mut core = self.core.lock().unwrap(); if let Some(ts) = core.update_applied(applied) { @@ -1397,6 +1408,8 @@ pub struct RegionReadProgressCore { pause: bool, // Discard incoming `(idx, ts)` discard: bool, + // A notify to trigger advancing resolved ts immediately. 
+ advance_notify: Option>, } // A helpful wrapper of `(apply_index, safe_ts)` item @@ -1468,6 +1481,7 @@ impl RegionReadProgressCore { last_merge_index: 0, pause: is_witness, discard: is_witness, + advance_notify: None, } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 379af09eb2e..826537f4e44 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -29,6 +29,7 @@ use tikv_util::{ time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; +use txn_types::TimeStamp; use super::metrics::*; use crate::{ @@ -563,11 +564,15 @@ impl ReadDelegate { if safe_ts >= read_ts { return Ok(()); } + // Advancing resolved ts may be expensive, only notify if read_ts - safe_ts > + // 200ms. + if TimeStamp::from(read_ts).physical() > TimeStamp::from(safe_ts).physical() + 200 { + self.read_progress.notify_advance_resolved_ts(); + } debug!( "reject stale read by safe ts"; "safe_ts" => safe_ts, "read_ts" => read_ts, - "region_id" => self.region.get_id(), "peer_id" => self.peer_id, ); @@ -2014,4 +2019,103 @@ mod tests { .is_none() ); } + + #[test] + fn test_stale_read_notify() { + let store_id = 2; + let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); + let (_tmp, mut reader, rx) = new_reader("test-local-reader", store_id, store_meta.clone()); + reader.kv_engine.put(b"key", b"value").unwrap(); + + let epoch13 = { + let mut ep = metapb::RegionEpoch::default(); + ep.set_conf_ver(1); + ep.set_version(3); + ep + }; + let term6 = 6; + + // Register region1 + let pr_ids1 = vec![2, 3, 4]; + let prs1 = new_peers(store_id, pr_ids1.clone()); + prepare_read_delegate( + store_id, + 1, + term6, + pr_ids1, + epoch13.clone(), + store_meta.clone(), + ); + let leader1 = prs1[0].clone(); + + // Local read + let mut cmd = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_region_id(1); + header.set_peer(leader1); + 
header.set_region_epoch(epoch13); + header.set_term(term6); + header.set_flags(header.get_flags() | WriteBatchFlags::STALE_READ.bits()); + cmd.set_header(header.clone()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + cmd.set_requests(vec![req].into()); + + // A peer can serve read_ts < safe_ts. + let safe_ts = TimeStamp::compose(2, 0); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_safe_ts(1, safe_ts.into_inner()); + assert_eq!(delegate.read_progress.safe_ts(), safe_ts.into_inner()); + } + let read_ts_1 = TimeStamp::compose(1, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_1.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let (snap_tx, snap_rx) = channel(); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |resp: ReadResponse| { + snap_tx.send(resp).unwrap(); + })), + ); + must_not_redirect(&mut reader, &rx, task); + snap_rx.recv().unwrap().snapshot.unwrap(); + + // A peer has to notify advancing resolved ts if read_ts >= safe_ts. + let notify = Arc::new(tokio::sync::Notify::new()); + { + let mut meta = store_meta.lock().unwrap(); + let delegate = meta.readers.get_mut(&1).unwrap(); + delegate + .read_progress + .update_advance_resolved_ts_notify(notify.clone()); + } + // 201ms larger than safe_ts. 
+ let read_ts_2 = TimeStamp::compose(safe_ts.physical() + 201, 0); + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(read_ts_2.into_inner()).unwrap(); + header.set_flag_data(data.into()); + cmd.set_header(header.clone()); + let task = RaftCommand::::new( + cmd.clone(), + Callback::read(Box::new(move |_: ReadResponse| {})), + ); + let (notify_tx, notify_rx) = channel(); + let (wait_spawn_tx, wait_spawn_rx) = channel(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + let _ = runtime.spawn(async move { + wait_spawn_tx.send(()).unwrap(); + notify.notified().await; + notify_tx.send(()).unwrap(); + }); + wait_spawn_rx.recv().unwrap(); + thread::sleep(std::time::Duration::from_millis(500)); // Prevent lost notify. + must_not_redirect(&mut reader, &rx, task); + notify_rx.recv().unwrap(); + } } diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index fd58fac1601..611d8a84424 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -1,6 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + cmp, ffi::CString, sync::{ atomic::{AtomicI32, Ordering}, @@ -45,10 +46,11 @@ use txn_types::TimeStamp; use crate::{endpoint::Task, metrics::*}; -const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s +const DEFAULT_CHECK_LEADER_TIMEOUT_DURATION: Duration = Duration::from_secs(5); // 5s pub struct AdvanceTsWorker { pd_client: Arc, + advance_ts_interval: Duration, timer: SteadyTimer, worker: Runtime, scheduler: Scheduler, @@ -59,6 +61,7 @@ pub struct AdvanceTsWorker { impl AdvanceTsWorker { pub fn new( + advance_ts_interval: Duration, pd_client: Arc, scheduler: Scheduler, concurrency_manager: ConcurrencyManager, @@ -75,6 +78,7 @@ impl AdvanceTsWorker { scheduler, pd_client, worker, + advance_ts_interval, timer: SteadyTimer::default(), concurrency_manager, } @@ -88,15 +92,19 @@ impl AdvanceTsWorker { regions: Vec, mut leader_resolver: LeadershipResolver, advance_ts_interval: Duration, - cfg_update_notify: Arc, + advance_notify: Arc, ) { let cm = self.concurrency_manager.clone(); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let timeout = self.timer.delay(advance_ts_interval); + let min_timeout = self.timer.delay(cmp::min( + DEFAULT_CHECK_LEADER_TIMEOUT_DURATION, + self.advance_ts_interval, + )); let fut = async move { - // Ignore get tso errors since we will retry every `advance_ts_interval`. + // Ignore get tso errors since we will retry every `advdance_ts_interval`. let mut min_ts = pd_client.get_tso().await.unwrap_or_default(); // Sync with concurrency manager so that it can work correctly when @@ -122,9 +130,12 @@ impl AdvanceTsWorker { futures::select! { _ = timeout.compat().fuse() => (), - // Skip wait timeout if cfg is updated. - _ = cfg_update_notify.notified().fuse() => (), + // Skip wait timeout if a notify is arrived. + _ = advance_notify.notified().fuse() => (), }; + // Wait min timeout to prevent from overloading advancing resolved ts. 
+ let _ = min_timeout.compat().await; + // NB: We must schedule the leader resolver even if there is no region, // otherwise we can not advance resolved ts next time. if let Err(e) = scheduler.schedule(Task::AdvanceResolvedTs { leader_resolver }) { @@ -386,7 +397,7 @@ impl LeadershipResolver { PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let resp = tokio::time::timeout(timeout, rpc) .map_err(|e| (to_store, true, format!("[timeout] {}", e))) .await? @@ -509,7 +520,7 @@ async fn get_tikv_client( return Ok(client); } } - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let timeout = DEFAULT_CHECK_LEADER_TIMEOUT_DURATION; let store = tokio::time::timeout(timeout, pd_client.get_store_async(store_id)) .await .map_err(|e| pd_client::Error::Other(Box::new(e))) diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index def3d512d3a..8d2ee1631b4 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -90,6 +90,10 @@ impl ObserveRegion { } } + fn read_progress(&self) -> &RegionReadProgress { + self.resolver.read_progress.as_ref().unwrap() + } + fn track_change_log(&mut self, change_logs: &[ChangeLog]) -> std::result::Result<(), String> { match &mut self.resolver_status { ResolverStatus::Pending { @@ -265,7 +269,7 @@ impl ObserveRegion { pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, - cfg_update_notify: Arc, + advance_notify: Arc, store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, @@ -294,8 +298,12 @@ where let meta = store_meta.lock().unwrap(); (meta.region_read_progress.clone(), meta.store_id) }; - let advance_worker = - AdvanceTsWorker::new(pd_client.clone(), scheduler.clone(), concurrency_manager); + let advance_worker = 
AdvanceTsWorker::new( + cfg.advance_ts_interval.0, + pd_client.clone(), + scheduler.clone(), + concurrency_manager, + ); let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, raft_router); let store_resolver_gc_interval = Duration::from_secs(60); let leader_resolver = LeadershipResolver::new( @@ -309,7 +317,7 @@ where let ep = Self { store_id, cfg: cfg.clone(), - cfg_update_notify: Arc::new(Notify::new()), + advance_notify: Arc::new(Notify::new()), scheduler, store_meta, region_read_progress, @@ -345,6 +353,9 @@ where ResolverStatus::Pending { ref cancelled, .. } => cancelled.clone(), ResolverStatus::Ready => panic!("resolved ts illeagal created observe region"), }; + observe_region + .read_progress() + .update_advance_resolved_ts_notify(self.advance_notify.clone()); self.regions.insert(region_id, observe_region); let scan_task = self.build_scan_task(region, observe_handle, cancelled); @@ -560,7 +571,7 @@ where regions, leader_resolver, self.cfg.advance_ts_interval.0, - self.cfg_update_notify.clone(), + self.advance_notify.clone(), ); } @@ -569,7 +580,7 @@ where if let Err(e) = self.cfg.update(change) { warn!("resolved-ts config fails"; "error" => ?e); } else { - self.cfg_update_notify.notify_waiters(); + self.advance_notify.notify_waiters(); info!( "resolved-ts config changed"; "prev" => prev, diff --git a/components/resolved_ts/src/resolver.rs b/components/resolved_ts/src/resolver.rs index 514f812665a..b341c546940 100644 --- a/components/resolved_ts/src/resolver.rs +++ b/components/resolved_ts/src/resolver.rs @@ -21,7 +21,7 @@ pub struct Resolver { // The highest index `Resolver` had been tracked tracked_index: u64, // The region read progress used to utilize `resolved_ts` to serve stale read request - read_progress: Option>, + pub(crate) read_progress: Option>, // The timestamps that advance the resolved_ts when there is no more write. 
min_ts: TimeStamp, // Whether the `Resolver` is stopped From 728511e8be17a50c03b6a82bc073008192533c7a Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Fri, 17 Feb 2023 23:25:04 -0800 Subject: [PATCH 537/676] rocksdb: reduce rocksdb block size to 32KB (#14053) (#14244) close tikv/tikv#14052 Because of memory fragment issue 16KB causes, we change it to 32KB and the result shows there's no significant memory fragment. Signed-off-by: qi.xu Co-authored-by: qi.xu --- etc/config-template.toml | 4 ++-- src/config/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 59152570da1..38082367d40 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -679,7 +679,7 @@ ## The data block size. RocksDB compresses data based on the unit of block. ## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. -# block-size = "16KB" +# block-size = "32KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive @@ -915,7 +915,7 @@ [rocksdb.writecf] ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] -# block-size = "16KB" +# block-size = "32KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. 
# write-buffer-size = "128MB" diff --git a/src/config/mod.rs b/src/config/mod.rs index a8e15c38642..e633b76d2db 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -634,7 +634,7 @@ impl Default for DefaultCfConfig { let total_mem = SysQuota::memory_limit_in_bytes(); DefaultCfConfig { - block_size: ReadableSize::kb(16), + block_size: ReadableSize::kb(32), block_cache_size: memory_limit_for_cf(false, CF_DEFAULT, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, @@ -759,7 +759,7 @@ impl Default for WriteCfConfig { }; WriteCfConfig { - block_size: ReadableSize::kb(16), + block_size: ReadableSize::kb(32), block_cache_size: memory_limit_for_cf(false, CF_WRITE, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, From b82036eae1a4f01b607af79352e42480cb88f3e5 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sat, 18 Feb 2023 15:47:04 +0800 Subject: [PATCH 538/676] config: increase resolved-ts.advance-ts-interval to 20s (#14136) close tikv/tikv#14100 Save network traffic by increasing resolved-ts.advance-ts-interval Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- src/config/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index e633b76d2db..4be54665443 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2745,7 +2745,7 @@ impl Default for ResolvedTsConfig { fn default() -> Self { Self { enable: true, - advance_ts_interval: ReadableDuration::secs(1), + advance_ts_interval: ReadableDuration::secs(20), scan_lock_pool_size: 2, } } @@ -4731,7 +4731,7 @@ mod tests { // Default value assert_eq!( resolved_ts_cfg.advance_ts_interval, - ReadableDuration::secs(1) + ReadableDuration::secs(20) ); // Update `advance-ts-interval` to 100ms From 1216d5efa99f512a5505998524013233c198363b Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 21 Feb 2023 17:33:06 +0800 Subject: [PATCH 539/676] pd_client: add some function to buckets (#14239) 
close tikv/tikv#14240 1. add `from_meta` constructor 2. add `merge` to merge delta flow 3. add `add_flow` to add flow for given key range Signed-off-by: bufferflies <1045931706@qq.com> --- components/pd_client/src/lib.rs | 23 ++++++++++++ .../raftstore-v2/src/operation/bucket.rs | 5 ++- .../raftstore-v2/src/worker/pd/region.rs | 24 +++---------- components/raftstore/src/store/fsm/apply.rs | 14 ++++---- components/raftstore/src/store/fsm/peer.rs | 14 +++----- components/raftstore/src/store/worker/pd.rs | 22 +++--------- .../src/store/worker/split_controller.rs | 36 ++++++++----------- components/test_pd_client/src/pd.rs | 8 +---- 8 files changed, 59 insertions(+), 87 deletions(-) diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 00b5efff23b..05b5729e98c 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -152,6 +152,29 @@ impl BucketStat { } } + pub fn from_meta(meta: Arc) -> Self { + let stats = new_bucket_stats(&meta); + Self::new(meta, stats) + } + + pub fn set_meta(&mut self, meta: Arc) { + self.stats = new_bucket_stats(&meta); + self.meta = meta; + } + + pub fn merge(&mut self, delta: &BucketStat) { + merge_bucket_stats( + &self.meta.keys, + &mut self.stats, + &delta.meta.keys, + &delta.stats, + ); + } + + pub fn add_flows>(&mut self, incoming: &[I], delta_stats: &metapb::BucketStats) { + merge_bucket_stats(&self.meta.keys, &mut self.stats, incoming, delta_stats); + } + pub fn write_key(&mut self, key: &[u8], value_size: u64) { let idx = match util::find_bucket_index(key, &self.meta.keys) { Some(idx) => idx, diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 2bc2d232b12..efff68fc453 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; use kvproto::metapb::RegionEpoch; -use 
pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{BucketMeta, BucketStat}; use raftstore::{ coprocessor::RegionChangeEvent, store::{util, Bucket, BucketRange, ReadProgress, SplitCheckTask, Transport}, @@ -135,8 +135,7 @@ impl Peer { // padding the boundary keys and initialize the flow. meta.keys.insert(0, region.get_start_key().to_vec()); meta.keys.push(region.get_end_key().to_vec()); - let stats = new_bucket_stats(&meta); - region_buckets = BucketStat::new(Arc::new(meta), stats); + region_buckets = BucketStat::from_meta(Arc::new(meta)); } let buckets_count = region_buckets.meta.keys.len() - 1; diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index c862d1f208b..bd4925e8563 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -5,9 +5,7 @@ use std::{sync::Arc, time::Duration}; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, pdpb}; -use pd_client::{ - merge_bucket_stats, metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat, -}; +use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat}; use raftstore::store::{ReadStats, WriteStats}; use resource_metering::RawRecords; use slog::{debug, error, info}; @@ -72,17 +70,9 @@ impl ReportBucket { self.last_report_ts = report_ts; match self.last_report_stat.replace(self.current_stat.clone()) { Some(last) => { - let mut delta = BucketStat::new( - self.current_stat.meta.clone(), - pd_client::new_bucket_stats(&self.current_stat.meta), - ); + let mut delta = BucketStat::from_meta(self.current_stat.meta.clone()); // Buckets may be changed, recalculate last stats according to current meta. 
- merge_bucket_stats( - &delta.meta.keys, - &mut delta.stats, - &last.meta.keys, - &last.stats, - ); + delta.merge(&last); for i in 0..delta.meta.keys.len() - 1 { delta.stats.write_bytes[i] = self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; @@ -438,13 +428,7 @@ where if current.meta < buckets.meta { std::mem::swap(current, &mut buckets); } - - merge_bucket_stats( - ¤t.meta.keys, - &mut current.stats, - &buckets.meta.keys, - &buckets.stats, - ); + current.merge(&buckets); }) .or_insert_with(|| ReportBucket::new(buckets)); } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index fba17db7391..7afb188a4b0 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -44,7 +44,7 @@ use kvproto::{ }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; -use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; use protobuf::{wire_format::WireType, CodedInputStream, Message}; use raft::eraftpb::{ @@ -3941,12 +3941,12 @@ where self.delegate.term = apply.term; if let Some(meta) = apply.bucket_meta.clone() { - let buckets = self - .delegate - .buckets - .get_or_insert_with(BucketStat::default); - buckets.stats = new_bucket_stats(&meta); - buckets.meta = meta; + if let Some(old) = &mut self.delegate.buckets { + old.set_meta(meta); + } else { + let new = BucketStat::from_meta(meta); + self.delegate.buckets.replace(new); + } } let prev_state = ( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 05b443be4eb..30420668164 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -41,7 +41,7 @@ use kvproto::{ replication_modepb::{DrAutoSyncState, ReplicationMode}, }; use parking_lot::RwLockWriteGuard; -use 
pd_client::{merge_bucket_stats, new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use protobuf::Message; use raft::{ self, @@ -2273,12 +2273,7 @@ where let applied_index = res.apply_state.applied_index; let buckets = self.fsm.peer.region_buckets.as_mut(); if let (Some(delta), Some(buckets)) = (res.bucket_stat, buckets) { - merge_bucket_stats( - &buckets.meta.keys, - &mut buckets.stats, - &delta.meta.keys, - &delta.stats, - ); + buckets.merge(&delta); } self.fsm.has_ready |= self.fsm.peer.post_apply( self.ctx, @@ -5946,9 +5941,7 @@ where }; meta.keys.insert(0, region.get_start_key().to_vec()); meta.keys.push(region.get_end_key().to_vec()); - - let stats = new_bucket_stats(&meta); - region_buckets = BucketStat::new(Arc::new(meta), stats); + region_buckets = BucketStat::from_meta(Arc::new(meta)); } let buckets_count = region_buckets.meta.keys.len() - 1; @@ -6341,6 +6334,7 @@ where "err" => ?e, ); } + // todo: it will delete in next pr. region_buckets.stats = new_bucket_stats(®ion_buckets.meta); self.register_report_region_buckets_tick(); diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index f43e1ec33d5..74fa4d046f1 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -32,7 +32,7 @@ use kvproto::{ replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, }; use ordered_float::OrderedFloat; -use pd_client::{merge_bucket_stats, metrics::*, BucketStat, Error, PdClient, RegionStat}; +use pd_client::{metrics::*, BucketStat, Error, PdClient, RegionStat}; use prometheus::local::LocalHistogram; use raft::eraftpb::ConfChangeType; use resource_metering::{Collector, CollectorGuard, CollectorRegHandle, RawRecords}; @@ -287,17 +287,9 @@ impl ReportBucket { self.last_report_ts = report_ts; match self.last_report_stat.replace(self.current_stat.clone()) { Some(last) => { - let mut delta = BucketStat::new( - 
self.current_stat.meta.clone(), - pd_client::new_bucket_stats(&self.current_stat.meta), - ); + let mut delta = BucketStat::from_meta(self.current_stat.meta.clone()); // Buckets may be changed, recalculate last stats according to current meta. - merge_bucket_stats( - &delta.meta.keys, - &mut delta.stats, - &last.meta.keys, - &last.stats, - ); + delta.merge(&last); for i in 0..delta.meta.keys.len() - 1 { delta.stats.write_bytes[i] = self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; @@ -1891,13 +1883,7 @@ where if current.meta < buckets.meta { mem::swap(current, &mut buckets); } - - merge_bucket_stats( - ¤t.meta.keys, - &mut current.stats, - &buckets.meta.keys, - &buckets.stats, - ); + current.merge(&buckets); }) .or_insert_with(|| ReportBucket::new(buckets)); } diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index 7e00daa2764..6d556d1c283 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -13,7 +13,7 @@ use kvproto::{ metapb::{self, Peer}, pdpb::QueryKind, }; -use pd_client::{merge_bucket_stats, new_bucket_stats, BucketMeta, BucketStat}; +use pd_client::{BucketMeta, BucketStat}; use rand::Rng; use resource_metering::RawRecords; use tikv_util::{ @@ -451,30 +451,22 @@ impl ReadStats { region_info.flow.add(write); region_info.flow.add(data); if let Some(buckets) = buckets { - let bucket_stat = self.region_buckets.entry(region_id).or_insert_with(|| { - let stats = new_bucket_stats(buckets); - BucketStat::new(buckets.clone(), stats) - }); - if bucket_stat.meta < *buckets { - let stats = new_bucket_stats(buckets); - let mut new = BucketStat::new(buckets.clone(), stats); - merge_bucket_stats( - &new.meta.keys, - &mut new.stats, - &bucket_stat.meta.keys, - &bucket_stat.stats, - ); - *bucket_stat = new; - } + let bucket_stat = self + .region_buckets + .entry(region_id) + 
.and_modify(|current| { + if current.meta < *buckets { + let mut new = BucketStat::from_meta(buckets.clone()); + std::mem::swap(current, &mut new); + current.merge(&new); + } + }) + .or_insert_with(|| BucketStat::from_meta(buckets.clone())); let mut delta = metapb::BucketStats::default(); delta.set_read_bytes(vec![(write.read_bytes + data.read_bytes) as u64]); delta.set_read_keys(vec![(write.read_keys + data.read_keys) as u64]); - let start = start.unwrap_or_default(); - let end = end.unwrap_or_default(); - merge_bucket_stats( - &bucket_stat.meta.keys, - &mut bucket_stat.stats, - &[start, end], + bucket_stat.add_flows( + &[start.unwrap_or_default(), end.unwrap_or_default()], &delta, ); } diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index a76692c4a67..d3bbce685c0 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -1946,13 +1946,7 @@ impl PdClient for TestPdClient { if current.meta < buckets.meta { std::mem::swap(current, &mut buckets); } - - pd_client::merge_bucket_stats( - ¤t.meta.keys, - &mut current.stats, - &buckets.meta.keys, - &buckets.stats, - ); + current.merge(&buckets); }) .or_insert(buckets); ready(Ok(())).boxed() From 9a91e60b0677def44f35f5cc2b9ad8f5a4df8a0a Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 21 Feb 2023 20:53:05 +0800 Subject: [PATCH 540/676] Raftstore-v2: update peer state after persisting snapshot (#14248) ref tikv/tikv#12842 Update peer state after persisting snapshot Signed-off-by: SpadeA-Tang --- components/raftstore-v2/src/operation/ready/snapshot.rs | 4 ++++ tests/integrations/raftstore/test_conf_change.rs | 1 + 2 files changed, 5 insertions(+) diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 87a1496be15..29d94c955af 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ 
b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -259,6 +259,10 @@ impl Peer { meta.region_read_progress .insert(region_id, self.read_progress().clone()); } + + let region_state = self.raft_group().store().region_state().clone(); + self.storage_mut().set_region_state(region_state); + if let Some(tablet) = self.set_tablet(tablet) { self.record_tombstone_tablet(ctx, tablet, snapshot_index); } diff --git a/tests/integrations/raftstore/test_conf_change.rs b/tests/integrations/raftstore/test_conf_change.rs index 500a27ae266..79b3488d868 100644 --- a/tests/integrations/raftstore/test_conf_change.rs +++ b/tests/integrations/raftstore/test_conf_change.rs @@ -732,6 +732,7 @@ fn test_node_learner_conf_change() { } #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_learner_with_slow_snapshot() { let mut cluster = new_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); From 061d874c297307a0c6184d827577c58378ae4b72 Mon Sep 17 00:00:00 2001 From: zyguan Date: Tue, 21 Feb 2023 21:07:05 +0800 Subject: [PATCH 541/676] read_pool: avoid tail latency of spawning (#14207) ref tikv/tikv#14118, close tikv/tikv#14188 Each multilevel/priority pool maintains an internal map for tracking the elapsed time of running tasks. Previously we try to cleanup the map every 10s on spawning new tasks, which leads to the tail latency issue described in #14118. This PR tries to resolve the issue by spawning a background task for cleaning up the map. 
Signed-off-by: zyguan Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/server/src/server.rs | 2 + components/server/src/server2.rs | 2 + components/tikv_util/src/yatp_pool/mod.rs | 221 ++++++++++++++++++++-- src/read_pool.rs | 13 +- 5 files changed, 224 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d87014110fd..1fa0937ce40 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7491,7 +7491,7 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#bcf431a2619c06ab7fa0c72073a0c775646c484f" +source = "git+https://github.com/tikv/yatp.git?branch=master#7ed25299d60a5338bea4ac0ed7470887ab74a010" dependencies = [ "crossbeam-deque", "crossbeam-skiplist", diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 99d56ac10cd..9576cb91423 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -128,6 +128,7 @@ use tikv_util::{ thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + yatp_pool::CleanupMethod, Either, }; use tokio::runtime::Builder; @@ -768,6 +769,7 @@ where pd_sender.clone(), engines.engine.clone(), resource_ctl, + CleanupMethod::Remote(self.background_worker.remote()), )) } else { None diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 2a67318439b..9a2a1a1e8e0 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -108,6 +108,7 @@ use tikv_util::{ thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + yatp_pool::CleanupMethod, Either, }; use tokio::runtime::Builder; @@ -664,6 +665,7 @@ where pd_sender.clone(), engines.engine.clone(), resource_ctl, + CleanupMethod::Remote(self.background_worker.remote()), )) } else { None diff --git 
a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 305d2162482..05c245bd5a3 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -7,10 +7,11 @@ use std::sync::Arc; use fail::fail_point; pub use future_pool::{Full, FuturePool}; +use futures::{compat::Stream01CompatExt, StreamExt}; use prometheus::{local::LocalHistogram, Histogram}; use yatp::{ - pool::{CloneRunnerBuilder, Local, Runner}, - queue::{multilevel, priority, QueueType, TaskCell as _}, + pool::{CloneRunnerBuilder, Local, Remote, Runner}, + queue::{multilevel, priority, Extras, QueueType, TaskCell as _}, task::future::{Runner as FutureRunner, TaskCell}, ThreadPool, }; @@ -18,8 +19,77 @@ use yatp::{ use crate::{ thread_group::GroupProperties, time::{Duration, Instant}, + timer::GLOBAL_TIMER_HANDLE, }; +const DEFAULT_CLEANUP_INTERVAL: Duration = if cfg!(test) { + Duration::from_millis(100) +} else { + Duration::from_secs(10) +}; + +fn background_cleanup_task(cleanup: F) -> TaskCell +where + F: Fn() -> Option + Send + 'static, +{ + let mut interval = GLOBAL_TIMER_HANDLE + .interval( + std::time::Instant::now() + DEFAULT_CLEANUP_INTERVAL, + DEFAULT_CLEANUP_INTERVAL, + ) + .compat(); + TaskCell::new( + async move { + while let Some(Ok(_)) = interval.next().await { + cleanup(); + } + }, + Extras::multilevel_default(), + ) +} + +/// CleanupMethod describes how a pool cleanup its internal task-elapsed map. A +/// task-elapsed map is used for tracking how long each task has been running, +/// so that the pool can adjust the level of a task according to its running +/// time. To prevent a task-elapsed map from growing too large, the following +/// strategies are provided for cleaning up it periodically. +pub enum CleanupMethod { + /// Cleanup in place on spawning. + InPlace, + /// Cleanup in this pool (the one to be built) locally. + Local, + /// Cleanup in the given remote pool. 
+ Remote(Remote), +} + +impl CleanupMethod { + /// Returns the perferred cleanup interval used for creating a queue + /// builder. + fn preferred_interval(&self) -> Option { + match self { + Self::InPlace => Some(DEFAULT_CLEANUP_INTERVAL), + _ => None, + } + } + + /// Tries to create a task from the cleanup function and spawn it if + /// possible, returns Some(task) if there is a task shall be spawned but + /// hasn't been spawned (that is, need to be spawned locally later). + fn try_spawn(&self, cleanup: F) -> Option + where + F: Fn() -> Option + Send + 'static, + { + match self { + Self::InPlace => None, + Self::Local => Some(background_cleanup_task(cleanup)), + Self::Remote(remote) => { + remote.spawn(background_cleanup_task(cleanup)); + None + } + } + } +} + pub(crate) const TICK_INTERVAL: Duration = Duration::from_secs(1); fn tick_interval() -> Duration { @@ -180,6 +250,10 @@ pub struct YatpPoolBuilder { max_thread_count: usize, stack_size: usize, max_tasks: usize, + cleanup_method: CleanupMethod, + + #[cfg(test)] + background_cleanup_hook: Option>, } impl YatpPoolBuilder { @@ -195,6 +269,10 @@ impl YatpPoolBuilder { max_thread_count: 1, stack_size: 0, max_tasks: std::usize::MAX, + cleanup_method: CleanupMethod::InPlace, + + #[cfg(test)] + background_cleanup_hook: None, } } @@ -233,6 +311,11 @@ impl YatpPoolBuilder { self } + pub fn cleanup_method(mut self, method: CleanupMethod) -> Self { + self.cleanup_method = method; + self + } + pub fn before_stop(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, @@ -295,13 +378,21 @@ impl YatpPoolBuilder { .name_prefix .clone() .unwrap_or_else(|| "yatp_pool".to_string()); + let multilevel_builder = multilevel::Builder::new( + multilevel::Config::default() + .name(Some(name)) + .cleanup_interval(self.cleanup_method.preferred_interval()), + ); + let pending_task = self.try_spawn_cleanup(multilevel_builder.cleanup_fn()); let (builder, read_pool_runner) = self.create_builder(); - let multilevel_builder = - 
multilevel::Builder::new(multilevel::Config::default().name(Some(name))); let runner_builder = multilevel_builder.runner_builder(CloneRunnerBuilder(read_pool_runner)); - builder - .build_with_queue_and_runner(QueueType::Multilevel(multilevel_builder), runner_builder) + let pool = builder + .build_with_queue_and_runner(QueueType::Multilevel(multilevel_builder), runner_builder); + if let Some(task) = pending_task { + pool.spawn(task); + } + pool } pub fn build_priority_pool( @@ -312,13 +403,54 @@ impl YatpPoolBuilder { .name_prefix .clone() .unwrap_or_else(|| "yatp_pool".to_string()); - let (builder, read_pool_runner) = self.create_builder(); let priority_builder = priority::Builder::new( - priority::Config::default().name(Some(name)), + priority::Config::default() + .name(Some(name)) + .cleanup_interval(self.cleanup_method.preferred_interval()), priority_provider, ); + let pending_task = self.try_spawn_cleanup(priority_builder.cleanup_fn()); + let (builder, read_pool_runner) = self.create_builder(); let runner_builder = priority_builder.runner_builder(CloneRunnerBuilder(read_pool_runner)); - builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) + let pool = builder + .build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder); + if let Some(task) = pending_task { + pool.spawn(task); + } + pool + } + + #[cfg(test)] + fn background_cleanup_hook(mut self, f: F) -> Self + where + F: Fn() + Send + Sync + 'static, + { + self.background_cleanup_hook = Some(Arc::new(f)); + self + } + + #[cfg(test)] + fn try_spawn_cleanup(&self, cleanup: F) -> Option + where + F: Fn() -> Option + Send + 'static, + { + if let Some(hook) = &self.background_cleanup_hook { + let on_cleanup = hook.clone(); + self.cleanup_method.try_spawn(move || { + on_cleanup(); + cleanup() + }) + } else { + self.cleanup_method.try_spawn(cleanup) + } + } + + #[cfg(not(test))] + fn try_spawn_cleanup(&self, cleanup: F) -> Option + where + F: Fn() -> 
Option + Send + 'static, + { + self.cleanup_method.try_spawn(cleanup) } fn create_builder(mut self) -> (yatp::Builder, YatpPoolRunner) { @@ -349,12 +481,15 @@ impl YatpPoolBuilder { #[cfg(test)] mod tests { - use std::sync::mpsc; + use std::{ + sync::{atomic, mpsc}, + thread, + }; use futures::compat::Future01CompatExt; use super::*; - use crate::timer::GLOBAL_TIMER_HANDLE; + use crate::{timer::GLOBAL_TIMER_HANDLE, worker}; #[test] fn test_record_schedule_wait_duration() { @@ -382,4 +517,68 @@ mod tests { let histogram = metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); assert_eq!(histogram.get_sample_count() as u32, 6, "{:?}", histogram); } + + #[test] + fn test_cleanup_in_place_by_default() { + let name = "test_cleanup_default"; + let count = Arc::new(atomic::AtomicU32::new(0)); + let n = count.clone(); + let pool = YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix(name) + .background_cleanup_hook(move || { + n.fetch_add(1, atomic::Ordering::SeqCst); + }) + .build_multi_level_pool(); + + thread::sleep(3 * DEFAULT_CLEANUP_INTERVAL); + drop(pool); + assert_eq!(0, count.load(atomic::Ordering::SeqCst)); + } + + #[test] + fn test_cleanup_in_local_pool() { + let name = "test_cleanup_local"; + let count = Arc::new(atomic::AtomicU32::new(0)); + let n = count.clone(); + let pool = YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix(name) + .cleanup_method(CleanupMethod::Local) + .background_cleanup_hook(move || { + n.fetch_add(1, atomic::Ordering::SeqCst); + let t = thread::current(); + assert!(t.name().unwrap().starts_with(name)); + }) + .build_multi_level_pool(); + + thread::sleep(3 * DEFAULT_CLEANUP_INTERVAL + DEFAULT_CLEANUP_INTERVAL / 2); + drop(pool); + thread::sleep(2 * DEFAULT_CLEANUP_INTERVAL); + assert!(3 == count.load(atomic::Ordering::SeqCst)); + } + + #[test] + fn test_cleanup_in_remote_pool() { + let name = "test_cleanup_remote"; + let bg_name = "test_background"; + let bg_pool = 
worker::Builder::new(bg_name).create(); + let count = Arc::new(atomic::AtomicU32::new(0)); + let n = count.clone(); + let pool = YatpPoolBuilder::new(DefaultTicker::default()) + .name_prefix(name) + .cleanup_method(CleanupMethod::Remote(bg_pool.remote())) + .background_cleanup_hook(move || { + n.fetch_add(1, atomic::Ordering::SeqCst); + let t = thread::current(); + assert!(t.name().unwrap().starts_with(bg_name)); + }) + .build_multi_level_pool(); + + thread::sleep(3 * DEFAULT_CLEANUP_INTERVAL + DEFAULT_CLEANUP_INTERVAL / 2); + drop(pool); + thread::sleep(2 * DEFAULT_CLEANUP_INTERVAL); + assert!(5 == count.load(atomic::Ordering::SeqCst)); + drop(bg_pool); + thread::sleep(2 * DEFAULT_CLEANUP_INTERVAL); + assert!(5 == count.load(atomic::Ordering::SeqCst)); + } } diff --git a/src/read_pool.rs b/src/read_pool.rs index 1488ffada15..4852caa181b 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -22,7 +22,7 @@ use tikv_util::{ sys::{cpu_time::ProcessStat, SysQuota}, time::Instant, worker::{Runnable, RunnableWithTimer, Scheduler, Worker}, - yatp_pool::{self, FuturePool, PoolTicker, YatpPoolBuilder}, + yatp_pool::{self, CleanupMethod, FuturePool, PoolTicker, YatpPoolBuilder}, }; use tracker::TrackedFuture; use yatp::{ @@ -418,11 +418,13 @@ pub fn build_yatp_read_pool( reporter: R, engine: E, resource_ctl: Option>, + cleanup_method: CleanupMethod, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); let raftkv = Arc::new(Mutex::new(engine)); let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) + .cleanup_method(cleanup_method) .stack_size(config.stack_size.0 as usize) .thread_count( config.min_thread_count, @@ -765,7 +767,8 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); + let pool = + build_yatp_read_pool(&config, DummyReporter, engine, None, 
CleanupMethod::InPlace); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -806,7 +809,8 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); + let pool = + build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -855,7 +859,8 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); + let pool = + build_yatp_read_pool(&config, DummyReporter, engine, None, CleanupMethod::InPlace); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); From e247c7686dce08f7243d7ce286764250723d4a76 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 22 Feb 2023 14:25:06 +0800 Subject: [PATCH 542/676] integration test v2: strip off the data prefix when getting the region id when necessary (#14235) ref tikv/tikv#12842 strip off the data prefix when getting the region id when necessary Signed-off-by: SpadeA-Tang --- components/test_raftstore-v2/src/cluster.rs | 15 +++++- components/test_raftstore-v2/src/node.rs | 10 ++++ components/test_raftstore-v2/src/server.rs | 8 ++++ components/test_raftstore/src/cluster.rs | 4 ++ tests/integrations/raftstore/test_snap.rs | 46 +++++++++++++++++-- .../raftstore/test_split_region.rs | 24 ++++------ .../raftstore/test_transfer_leader.rs | 1 + 7 files changed, 88 insertions(+), 20 deletions(-) diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index c935040055f..b9d057d33c5 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -18,7 +18,7 @@ use engine_traits::{ }; use file_system::IoRateLimiter; use 
futures::{compat::Future01CompatExt, executor::block_on, select, FutureExt}; -use keys::data_key; +use keys::{data_key, validate_data_key, DATA_PREFIX_KEY}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::ApiVersion, @@ -81,8 +81,13 @@ pub trait Simulator { fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; + fn add_send_filter(&mut self, node_id: u64, filter: Box); fn clear_send_filters(&mut self, node_id: u64); + + fn add_recv_filter(&mut self, node_id: u64, filter: Box); + fn clear_recv_filters(&mut self, node_id: u64); + fn get_router(&self, node_id: u64) -> Option>; fn get_snap_dir(&self, node_id: u64) -> String; @@ -1102,6 +1107,10 @@ impl Cluster { self.sim.wl().add_send_filter(node_id, filter); } + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_recv_filter(node_id, filter); + } + pub fn add_send_filter(&self, factory: F) { let mut sim = self.sim.wl(); for node_id in sim.get_node_ids() { @@ -1392,7 +1401,9 @@ impl WrapFactory { } } - fn region_id_of_key(&self, key: &[u8]) -> u64 { + fn region_id_of_key(&self, mut key: &[u8]) -> u64 { + assert!(validate_data_key(key)); + key = &key[DATA_PREFIX_KEY.len()..]; self.pd_client.get_region(key).unwrap().get_id() } diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index f8c8d84bc9b..f6211c09748 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -404,6 +404,16 @@ impl Simulator for NodeCluster { .unwrap() .to_owned() } + + fn add_recv_filter(&mut self, node_id: u64, filter: Box) { + let mut trans = self.trans.core.lock().unwrap(); + trans.routers.get_mut(&node_id).unwrap().add_filter(filter); + } + + fn clear_recv_filters(&mut self, node_id: u64) { + let mut trans = self.trans.core.lock().unwrap(); + trans.routers.get_mut(&node_id).unwrap().clear_filters(); + } } pub fn new_node_cluster(id: u64, count: usize) -> Cluster { diff --git 
a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 64e05d6b766..8804f0c0f8c 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -525,6 +525,14 @@ impl Simulator for ServerCluster { .clear_filters(); } + fn add_recv_filter(&mut self, _node_id: u64, _filter: Box) { + unimplemented!() + } + + fn clear_recv_filters(&mut self, _node_id: u64) { + unimplemented!() + } + fn run_node( &mut self, node_id: u64, diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index d5842bf6659..d4668fe4928 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1339,6 +1339,10 @@ impl Cluster { self.sim.wl().add_send_filter(node_id, filter); } + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { + self.sim.wl().add_recv_filter(node_id, filter); + } + pub fn add_send_filter(&self, factory: F) { let mut sim = self.sim.wl(); for node_id in sim.get_node_ids() { diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index ddc4bb50406..e8a0730488a 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -257,10 +257,50 @@ fn test_concurrent_snap(cluster: &mut Cluster) { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_concurrent_snap() { - let mut cluster = new_node_cluster(0, 3); - test_concurrent_snap(&mut cluster); + let mut cluster = new_cluster(0, 3); + // Test that the handling of snapshot is correct when there are multiple + // snapshots which have overlapped region ranges arrive at the same + // raftstore. + cluster.cfg.rocksdb.titan.enabled = true; + // Disable raft log gc in this test case. 
+ cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + // Force peer 2 to be followers all the way. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 2) + .msg_type(MessageType::MsgRequestVote) + .direction(Direction::Send), + )); + cluster.must_transfer_leader(r1, new_peer(1, 1)); + cluster.must_put(b"k3", b"v3"); + // Pile up snapshots of overlapped region ranges and deliver them all at once. + let (tx, rx) = mpsc::channel(); + cluster.add_recv_filter_on_node(3, Box::new(CollectSnapshotFilter::new(tx))); + pd_client.must_add_peer(r1, new_peer(3, 3)); + let region = cluster.get_region(b"k1"); + // Ensure the snapshot of range ("", "") is sent and piled in filter. + if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { + panic!("the snapshot is not sent before split, e: {:?}", e); + } + // Split the region range and then there should be another snapshot for the + // split ranges. + cluster.must_split(®ion, b"k2"); + must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); + // Ensure the regions work after split. 
+ cluster.must_put(b"k11", b"v11"); + must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); + cluster.must_put(b"k4", b"v4"); + must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } #[test] diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 20a7c3f503a..963424d8986 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -414,7 +414,12 @@ fn test_node_split_overlap_snapshot() { must_get_equal(&engine3, b"k3", b"v3"); } -fn test_apply_new_version_snapshot(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_apply_new_version_snapshot() { + let mut cluster = new_cluster(0, 3); // truncate the log quickly so that we can force sending snapshot. cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(5); @@ -467,21 +472,10 @@ fn test_apply_new_version_snapshot(cluster: &mut Cluster) { must_get_equal(&engine3, b"k2", b"v2"); } -#[test] -fn test_node_apply_new_version_snapshot() { - let mut cluster = new_node_cluster(0, 3); - test_apply_new_version_snapshot(&mut cluster); -} - -#[test] -fn test_server_apply_new_version_snapshot() { - let mut cluster = new_server_cluster(0, 3); - test_apply_new_version_snapshot(&mut cluster); -} - -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_server_split_with_stale_peer() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // disable raft log gc. 
cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index b97191d1a13..6ed9b3c487b 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -176,6 +176,7 @@ fn test_server_pd_transfer_leader_multi_target() { } #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_server_transfer_leader_during_snapshot() { let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); From 13eb4f606bfe93ac1a33709729a11913689633a4 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 22 Feb 2023 15:53:06 +0800 Subject: [PATCH 543/676] raftstore-v2: prepare merge (#14226) ref tikv/tikv#12842, ref tikv/tikv#13818 Implement prepare merge for raftstore-v2 Signed-off-by: tabokie Signed-off-by: Xinye Tao --- components/raftstore-v2/src/lib.rs | 1 + .../operation/command/admin/compact_log.rs | 23 +- .../operation/command/admin/conf_change.rs | 2 +- .../src/operation/command/admin/merge/mod.rs | 112 ++++ .../operation/command/admin/merge/prepare.rs | 507 ++++++++++++++++++ .../src/operation/command/admin/mod.rs | 18 +- .../src/operation/command/admin/split.rs | 10 +- .../src/operation/command/control.rs | 35 +- .../raftstore-v2/src/operation/command/mod.rs | 26 +- .../src/operation/command/write/mod.rs | 13 +- components/raftstore-v2/src/operation/mod.rs | 6 +- .../raftstore-v2/src/operation/query/mod.rs | 2 +- .../raftstore-v2/src/operation/ready/mod.rs | 8 + .../raftstore-v2/src/operation/txn_ext.rs | 1 - components/raftstore-v2/src/raft/apply.rs | 10 + components/raftstore-v2/src/raft/peer.rs | 53 +- components/raftstore-v2/src/router/imp.rs | 22 +- components/raftstore-v2/src/worker/pd/mod.rs | 8 + 
.../raftstore-v2/src/worker/pd/region.rs | 8 +- components/raftstore/src/store/fsm/peer.rs | 2 +- 20 files changed, 799 insertions(+), 68 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/admin/merge/mod.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/merge/prepare.rs diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 8af6b57e9bc..bbb73676ffb 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -26,6 +26,7 @@ #![feature(div_duration)] #![feature(box_into_inner)] #![feature(assert_matches)] +#![feature(option_get_or_insert_default)] mod batch; mod bootstrap; diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 8e83387012e..af61434041a 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -351,7 +351,7 @@ impl Peer { pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, - res: CompactLogResult, + mut res: CompactLogResult, ) { let first_index = self.entry_storage().first_index(); if res.compact_index <= first_index { @@ -363,7 +363,17 @@ impl Peer { ); return; } - // TODO: check is_merging + if let Some(i) = self.merge_context().and_then(|c| c.max_compact_log_index()) + && res.compact_index > i + { + info!( + self.logger, + "in merging mode, adjust compact index"; + "old_index" => res.compact_index, + "new_index" => i, + ); + res.compact_index = i; + } // TODO: check entry_cache_warmup_state self.entry_storage_mut() .compact_entry_cache(res.compact_index); @@ -388,10 +398,10 @@ impl Peer { // All logs < perssited_apply will be deleted, so should check with +1. 
if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() - && let Some(index) = self.compact_log_index() { + && let Some(index) = self.compact_log_index() + { // Raft Engine doesn't care about first index. - if let Err(e) = - store_ctx + if let Err(e) = store_ctx .engine .gc(self.region_id(), 0, index, self.state_changes_mut()) { @@ -432,7 +442,8 @@ impl Peer { // If it's snapshot, logs are gc already. if !task.has_snapshot && old_persisted < self.entry_storage().truncated_index() + 1 - && let Some(index) = self.compact_log_index() { + && let Some(index) = self.compact_log_index() + { let batch = task.extra_write.ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)); // Raft Engine doesn't care about first index. if let Err(e) = diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 1b8d29a7a54..7bc20068736 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -255,7 +255,7 @@ impl Apply { cc: ConfChangeV2, legacy: bool, ) -> Result<(AdminResponse, AdminCmdResult)> { - let region = self.region_state().get_region(); + let region = self.region(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch(), "index" => index); let mut new_region = region.clone(); diff --git a/components/raftstore-v2/src/operation/command/admin/merge/mod.rs b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs new file mode 100644 index 00000000000..a3895a1b435 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs @@ -0,0 +1,112 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +pub mod prepare; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + raft_cmdpb::RaftCmdRequest, + raft_serverpb::{PeerState, RegionLocalState}, +}; +use prepare::PrepareStatus; +use raft::{ProgressState, INVALID_INDEX}; +use raftstore::Result; +use slog::{info, warn, Logger}; +use tikv_util::box_err; + +use crate::raft::Peer; + +#[derive(Default)] +pub struct MergeContext { + prepare_status: Option, +} + +impl MergeContext { + #[inline] + pub fn from_region_state(logger: &Logger, state: &RegionLocalState) -> Option { + if state.get_state() == PeerState::Merging { + info!(logger, "region is merging"; "region_state" => ?state); + let mut ctx = Self::default(); + ctx.prepare_status = Some(PrepareStatus::Applied(state.get_merge_state().clone())); + Some(ctx) + } else { + None + } + } + + #[inline] + pub fn maybe_take_pending_prepare(&mut self, applied: u64) -> Option { + if let Some(PrepareStatus::WaitForFence { + fence, + req, + .. + }) = self.prepare_status.as_mut() + && applied >= *fence + { + // The status will be updated during processing the proposal. + return req.take(); + } + None + } + + #[inline] + pub fn max_compact_log_index(&self) -> Option { + if let Some(PrepareStatus::WaitForFence { ctx, .. }) = self.prepare_status.as_ref() { + Some(ctx.min_matched) + } else { + None + } + } +} + +impl Peer { + #[inline] + pub fn update_merge_progress_on_became_follower(&mut self) { + if let Some(ctx) = self.merge_context() + && matches!(ctx.prepare_status, Some(PrepareStatus::WaitForFence { .. 
})) + { + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + } + } + + /// Returns (minimal matched, minimal committed) + pub fn calculate_min_progress(&self) -> Result<(u64, u64)> { + let (mut min_m, mut min_c) = (None, None); + if let Some(progress) = self.raft_group().status().progress { + for (id, pr) in progress.iter() { + // Reject merge if there is any pending request snapshot, + // because a target region may merge a source region which is in + // an invalid state. + if pr.state == ProgressState::Snapshot + || pr.pending_request_snapshot != INVALID_INDEX + { + return Err(box_err!( + "there is a pending snapshot peer {} [{:?}], skip merge", + id, + pr + )); + } + if min_m.unwrap_or(u64::MAX) > pr.matched { + min_m = Some(pr.matched); + } + if min_c.unwrap_or(u64::MAX) > pr.committed_index { + min_c = Some(pr.committed_index); + } + } + } + let (mut min_m, min_c) = (min_m.unwrap_or(0), min_c.unwrap_or(0)); + if min_m < min_c { + warn!( + self.logger, + "min_matched < min_committed, raft progress is inaccurate"; + "min_matched" => min_m, + "min_committed" => min_c, + ); + // Reset `min_matched` to `min_committed`, since the raft log at `min_committed` + // is known to be committed in all peers, all of the peers should also have + // replicated it + min_m = min_c; + } + Ok((min_m, min_c)) + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs new file mode 100644 index 00000000000..f9df2d9ea1a --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -0,0 +1,507 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! The handling of `PrepareMerge` command. +//! +//! ## Propose (`Peer::propose_prepare_merge`) +//! +//! Checks for these requirements: +//! +//! - Validate the request. (`Peer::validate_prepare_merge_command`) +//! 
- Log gap between source region leader and peers is not too large. This is +//! because these logs need to be embeded in the later `CommitMerge` command. +//! - Logs that aren't fully committed (to all peers) does not contains +//! `CompactLog` or certain admin commands. +//! +//! Then, transfer all in-memory pessimistic locks to the target region as a +//! Raft proposal. To guarantee the consistency of lock serialization, we might +//! need to wait for some in-flight logs to be applied. During the wait, all +//! incoming write proposals will be rejected. Read the comments of +//! `PrepareStatus::WaitForFence` for more details. +//! +//! ## Apply (`Apply::apply_prepare_merge`) +//! +//! Increase region epoch and write the merge state. +//! +//! ## On Apply Result (`Peer::on_apply_res_prepare_merge`) +//! +//! Start the tick (`Peer::on_check_merge`) to periodically check the +//! eligibility of merge. + +use std::mem; + +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, CF_LOCK}; +use kvproto::{ + raft_cmdpb::{ + AdminCmdType, AdminRequest, AdminResponse, CmdType, PrepareMergeRequest, PutRequest, + RaftCmdRequest, Request, + }, + raft_serverpb::{MergeState, PeerState, RegionLocalState}, +}; +use parking_lot::RwLockUpgradableReadGuard; +use protobuf::Message; +use raft::{eraftpb::EntryType, GetEntriesContext, NO_LIMIT}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{metrics::PEER_ADMIN_CMD_COUNTER, util, LocksStatus, ProposalContext, Transport}, + Error, Result, +}; +use slog::{debug, info}; +use tikv_util::{box_err, log::SlogFormat, store::region_on_same_stores}; + +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::AdminCmdResult, + raft::{Apply, Peer}, + router::CmdResChannel, +}; + +#[derive(Clone)] +pub struct PreProposeContext { + pub min_matched: u64, + lock_size_limit: usize, +} + +pub enum PrepareStatus { + /// When a fence is present, we (1) delay the PrepareMerge + /// command `cmd` until all writes before `idx` 
are applied (2) reject all + /// in-coming write proposals. + /// Before proposing `PrepareMerge`, we first serialize and propose the lock + /// table. Locks marked as deleted (but not removed yet) will be + /// serialized as normal locks. + /// Thanks to the fence, we can ensure at the time of lock transfer, locks + /// are either removed (when applying logs) or won't be removed before + /// merge (the proposals to remove them are rejected). + /// + /// The request can be `None` because we needs to take it out to redo the + /// propose. In the meantime the fence is needed to bypass the check. + WaitForFence { + fence: u64, + ctx: PreProposeContext, + req: Option, + }, + /// In this state, all write proposals except for `RollbackMerge` will be + /// rejected. + Applied(MergeState), +} + +#[derive(Debug)] +pub struct PrepareMergeResult { + region_state: RegionLocalState, + state: MergeState, +} + +impl Peer { + pub fn propose_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + mut req: RaftCmdRequest, + ) -> Result { + if self.storage().has_dirty_data() { + return Err(box_err!( + "{} source peer has dirty data, try again later", + SlogFormat(&self.logger) + )); + } + self.validate_prepare_merge_command( + store_ctx, + req.get_admin_request().get_prepare_merge(), + )?; + let pre_propose = if let Some(r) = self.already_checked_pessimistic_locks()? { + r + } else { + let r = self.check_logs_before_prepare_merge(store_ctx)?; + self.check_pessimistic_locks(r, &mut req)? 
+ }; + req.mut_admin_request() + .mut_prepare_merge() + .set_min_index(pre_propose.min_matched + 1); + let r = self + .propose_locks_before_prepare_merge(store_ctx, pre_propose.lock_size_limit) + .and_then(|_| { + let mut proposal_ctx = ProposalContext::empty(); + proposal_ctx.insert(ProposalContext::PREPARE_MERGE); + let data = req.write_to_bytes().unwrap(); + self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + }); + if r.is_ok() { + self.proposal_control_mut().set_pending_prepare_merge(false); + } else { + // Match v1::post_propose_fail. + // If we just failed to propose PrepareMerge, the pessimistic locks status + // may become MergingRegion incorrectly. So, we have to revert it here. + // Note: The `is_merging` check from v1 is removed because proposed + // `PrepareMerge` rejects all writes (in `ProposalControl::check_conflict`). + assert!( + !self.proposal_control().is_merging(), + "{}", + SlogFormat(&self.logger) + ); + self.take_merge_context(); + self.proposal_control_mut().set_pending_prepare_merge(false); + let mut pessimistic_locks = self.txn_context().ext().pessimistic_locks.write(); + if pessimistic_locks.status == LocksStatus::MergingRegion { + pessimistic_locks.status = LocksStatus::Normal; + } + } + r + } + + /// Match v1::check_merge_proposal. + /// - Target region epoch as requested is identical with the local version. + /// - Target region is a sibling to the source region. + /// - Peers of both source and target region are aligned, i.e. located on + /// the same set of stores. 
+ fn validate_prepare_merge_command( + &mut self, + store_ctx: &mut StoreContext, + req: &PrepareMergeRequest, + ) -> Result<()> { + // Just for simplicity, do not start region merge while in joint state + if self.in_joint_state() { + return Err(box_err!( + "{} region in joint state, can not propose merge command, command: {:?}", + SlogFormat(&self.logger), + req + )); + } + let region = self.region(); + let target_region = req.get_target(); + { + let store_meta = store_ctx.store_meta.lock().unwrap(); + match store_meta.regions.get(&target_region.get_id()) { + Some((region, _)) if *region != *target_region => { + return Err(box_err!( + "target region not matched, skip proposing: {:?} != {:?}", + region, + target_region + )); + } + None => { + return Err(box_err!( + "target region {} doesn't exist.", + target_region.get_id() + )); + } + _ => {} + } + } + + if !util::is_sibling_regions(target_region, region) { + return Err(box_err!( + "{:?} and {:?} are not sibling, skip proposing.", + target_region, + region + )); + } + if !region_on_same_stores(target_region, region) { + return Err(box_err!( + "peers doesn't match {:?} != {:?}, reject merge", + region.get_peers(), + target_region.get_peers() + )); + } + Ok(()) + } + + // Match v1::pre_propose_prepare_merge. 
+ fn check_logs_before_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + ) -> Result { + let last_index = self.raft_group().raft.raft_log.last_index(); + let (min_matched, min_committed) = self.calculate_min_progress()?; + if min_matched == 0 + || min_committed == 0 + || last_index - min_matched > store_ctx.cfg.merge_max_log_gap + || last_index - min_committed > store_ctx.cfg.merge_max_log_gap * 2 + || min_matched < self.last_sent_snapshot_index() + { + return Err(box_err!( + "log gap too large, skip merge: matched: {}, committed: {}, last index: {}", + min_matched, + min_committed, + last_index + )); + } + let mut entry_size = 0; + for entry in self.raft_group().raft.raft_log.entries( + min_committed + 1, + NO_LIMIT, + GetEntriesContext::empty(false), + )? { + // commit merge only contains entries start from min_matched + 1 + if entry.index > min_matched { + entry_size += entry.get_data().len(); + } + if entry.get_entry_type() == EntryType::EntryConfChange + || entry.get_entry_type() == EntryType::EntryConfChangeV2 + { + return Err(box_err!( + "{} log gap contains conf change, skip merging.", + "tag" + )); + } + if entry.get_data().is_empty() { + continue; + } + let cmd: RaftCmdRequest = + util::parse_data_at(entry.get_data(), entry.get_index(), "tag"); + if !cmd.has_admin_request() { + continue; + } + let cmd_type = cmd.get_admin_request().get_cmd_type(); + match cmd_type { + AdminCmdType::TransferLeader + | AdminCmdType::ComputeHash + | AdminCmdType::VerifyHash + | AdminCmdType::InvalidAdmin => continue, + _ => {} + } + // Any command that can change epoch or log gap should be rejected. + return Err(box_err!( + "log gap contains admin request {:?}, skip merging.", + cmd_type + )); + } + let entry_size_limit = store_ctx.cfg.raft_entry_max_size.0 as usize * 9 / 10; + if entry_size > entry_size_limit { + return Err(box_err!( + "log gap size exceed entry size limit, skip merging." 
+ )); + }; + Ok(PreProposeContext { + min_matched, + lock_size_limit: entry_size_limit - entry_size, + }) + } + + fn check_pessimistic_locks( + &mut self, + ctx: PreProposeContext, + req: &mut RaftCmdRequest, + ) -> Result { + let has_locks = { + let pessimistic_locks = self.txn_context().ext().pessimistic_locks.read(); + if pessimistic_locks.status != LocksStatus::Normal { + // If `status` is not `Normal`, it means the in-memory pessimistic locks are + // being transferred, probably triggered by transferring leader. In this case, + // we abort merging to simplify the situation. + return Err(box_err!( + "pessimistic locks status is {:?}, skip merging.", + pessimistic_locks.status + )); + } + !pessimistic_locks.is_empty() + }; + let last_index = self.raft_group().raft.raft_log.last_index(); + if has_locks && self.entry_storage().applied_index() < last_index { + self.merge_context_mut().prepare_status = Some(PrepareStatus::WaitForFence { + fence: last_index, + ctx, + req: Some(mem::take(req)), + }); + self.proposal_control_mut().set_pending_prepare_merge(true); + info!( + self.logger, + "start rejecting new proposals before prepare merge"; + "prepare_merge_fence" => last_index + ); + return Err(Error::PendingPrepareMerge); + } + Ok(ctx) + } + + fn already_checked_pessimistic_locks(&mut self) -> Result> { + let applied_index = self.entry_storage().applied_index(); + match self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + { + Some(PrepareStatus::WaitForFence { fence, ctx, .. 
}) => { + if applied_index < *fence { + info!( + self.logger, + "reject PrepareMerge because applied_index has not reached prepare_merge_fence"; + "applied_index" => applied_index, + "prepare_merge_fence" => fence, + ); + Err(Error::PendingPrepareMerge) + } else { + Ok(Some(ctx.clone())) + } + } + Some(PrepareStatus::Applied(state)) => Err(box_err!( + "another merge is in-progress, merge_state: {:?}.", + state + )), + None => Ok(None), + } + } + + /// Called after some new entries have been applied and the fence can + /// probably be lifted. + pub fn retry_pending_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + applied_index: u64, + ) { + if let Some(req) = self + .merge_context_mut() + .maybe_take_pending_prepare(applied_index) + { + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + } + } + + fn propose_locks_before_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + size_limit: usize, + ) -> Result<()> { + let pessimistic_locks = self.txn_context().ext().pessimistic_locks.upgradable_read(); + if pessimistic_locks.is_empty() { + let mut pessimistic_locks = RwLockUpgradableReadGuard::upgrade(pessimistic_locks); + pessimistic_locks.status = LocksStatus::MergingRegion; + return Ok(()); + } + + // The proposed pessimistic locks here will also be carried in CommitMerge. + // Check the size to avoid CommitMerge exceeding the size limit of a raft entry. + // This check is a inaccurate check. We will check the size again accurately + // later using the protobuf encoding. 
+ if pessimistic_locks.memory_size > size_limit { + return Err(box_err!( + "pessimistic locks size {} exceed size limit {}, skip merging.", + pessimistic_locks.memory_size, + size_limit + )); + } + + let mut cmd = RaftCmdRequest::default(); + for (key, (lock, _deleted)) in &*pessimistic_locks { + let mut put = PutRequest::default(); + put.set_cf(CF_LOCK.to_string()); + put.set_key(key.as_encoded().to_owned()); + put.set_value(lock.to_lock().to_bytes()); + let mut req = Request::default(); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + cmd.mut_requests().push(req); + } + cmd.mut_header().set_region_id(self.region_id()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + cmd.mut_header().set_peer(self.peer().clone()); + let proposal_size = cmd.compute_size(); + if proposal_size as usize > size_limit { + return Err(box_err!( + "pessimistic locks size {} exceed size limit {}, skip merging.", + proposal_size, + size_limit + )); + } + + { + let mut pessimistic_locks = RwLockUpgradableReadGuard::upgrade(pessimistic_locks); + pessimistic_locks.status = LocksStatus::MergingRegion; + } + debug!( + self.logger, + "propose {} pessimistic locks before prepare merge", + cmd.get_requests().len(); + ); + self.propose(store_ctx, cmd.write_to_bytes().unwrap())?; + Ok(()) + } +} + +impl Apply { + // Match v1::exec_prepare_merge. + pub fn apply_prepare_merge( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.prepare_merge.all.inc(); + + let prepare_merge = req.get_prepare_merge(); + let index = prepare_merge.get_min_index(); + // Note: the check against first_index is removed in v2. + let mut region = self.region().clone(); + let region_version = region.get_region_epoch().get_version() + 1; + region.mut_region_epoch().set_version(region_version); + // In theory conf version should not be increased when executing prepare_merge. 
+ // However, we don't want to do conf change after prepare_merge is committed. + // This can also be done by iterating all proposal to find if prepare_merge is + // proposed before proposing conf change, but it make things complicated. + // Another way is make conf change also check region version, but this is not + // backward compatible. + let conf_version = region.get_region_epoch().get_conf_ver() + 1; + region.mut_region_epoch().set_conf_ver(conf_version); + let mut merging_state = MergeState::default(); + merging_state.set_min_index(index); + merging_state.set_target(prepare_merge.get_target().to_owned()); + merging_state.set_commit(log_index); + + self.region_state_mut().set_region(region.clone()); + self.region_state_mut().set_state(PeerState::Merging); + assert!( + !self.region_state().has_merge_state(), + "{:?}", + self.region_state() + ); + self.region_state_mut() + .set_merge_state(merging_state.clone()); + + PEER_ADMIN_CMD_COUNTER.prepare_merge.success.inc(); + + Ok(( + AdminResponse::default(), + AdminCmdResult::PrepareMerge(PrepareMergeResult { + region_state: self.region_state().clone(), + state: merging_state, + }), + )) + } +} + +impl Peer { + // Match v1::on_ready_prepare_merge. 
+ pub fn on_apply_res_prepare_merge( + &mut self, + store_ctx: &mut StoreContext, + res: PrepareMergeResult, + ) { + let region = res.region_state.get_region().clone(); + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(®ion, true, &self.logger); + let (reader, _) = meta.readers.get_mut(®ion.get_id()).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + region, + RegionChangeReason::PrepareMerge, + res.state.get_commit(), + ); + } + + self.storage_mut() + .set_region_state(res.region_state.clone()); + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, res.state.get_commit(), &res.region_state) + .unwrap(); + self.set_has_extra_write(); + + self.proposal_control_mut() + .enter_prepare_merge(res.state.get_commit()); + self.merge_context_mut().prepare_status = Some(PrepareStatus::Applied(res.state)); + + // TODO: self. + // update_merge_progress_on_apply_res_prepare_merge(store_ctx); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 0661d1c15dc..fe84413ff28 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -2,6 +2,7 @@ mod compact_log; mod conf_change; +mod merge; mod split; mod transfer_leader; @@ -10,8 +11,13 @@ use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; +use merge::prepare::PrepareMergeResult; +pub use merge::MergeContext; use protobuf::Message; -use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; +use raftstore::{ + store::{cmd_resp, fsm::apply, msg::ErrorCallback}, + Error, +}; use slog::info; use split::SplitResult; pub use split::{ @@ -32,6 +38,7 @@ pub enum AdminCmdResult { TransferLeader(u64), CompactLog(CompactLogResult), 
UpdateGcPeers(UpdateGcPeersResult), + PrepareMerge(PrepareMergeResult), } impl Peer { @@ -93,6 +100,14 @@ impl Peer { conflict.delay_channel(ch); return; } + if self.proposal_control().has_pending_prepare_merge() + && cmd_type != AdminCmdType::PrepareMerge + || self.proposal_control().is_merging() && cmd_type != AdminCmdType::RollbackMerge + { + let resp = cmd_resp::new_error(Error::ProposalInMergingMode(self.region_id())); + ch.report_error(resp); + return; + } // To maintain propose order, we need to make pending proposal first. self.propose_pending_writes(ctx); let res = if apply::is_conf_change_cmd(&req) { @@ -124,6 +139,7 @@ impl Peer { let data = req.write_to_bytes().unwrap(); self.propose(ctx, data) } + AdminCmdType::PrepareMerge => self.propose_prepare_merge(ctx, req), _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index b4e2b4654e7..bbc6aac058e 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -187,7 +187,7 @@ pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> Pat impl PeerFsmDelegate<'_, EK, ER, T> { pub fn on_split_region_check(&mut self) { if !self.fsm.peer_mut().on_split_region_check(self.store_ctx) { - self.schedule_tick(PeerTick::SplitRegionCheck) + self.schedule_tick(PeerTick::SplitRegionCheck); } } } @@ -382,16 +382,16 @@ impl Apply { ) -> Result<(AdminResponse, AdminCmdResult)> { PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); - let region = self.region_state().get_region(); + let region = self.region(); let region_id = region.get_id(); - validate_batch_split(req, self.region_state().get_region())?; + validate_batch_split(req, self.region())?; let mut boundaries: Vec<&[u8]> = Vec::default(); - boundaries.push(self.region_state().get_region().get_start_key()); + boundaries.push(self.region().get_start_key()); for req in 
req.get_splits().get_requests() { boundaries.push(req.get_split_key()); } - boundaries.push(self.region_state().get_region().get_end_key()); + boundaries.push(self.region().get_end_key()); info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index fd53090fd65..586d9f5c019 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -77,10 +77,12 @@ impl ProposedAdminCmd { /// Compared to `CmdEpochChecker`, `ProposalControl` also traces the whole /// lifetime of prepare merge. pub struct ProposalControl { + // Admin commands that are proposed but not applied. // Use `LinkedList` to reduce memory footprint. In most cases, the list // should be empty or 1 element. And access speed is not a concern. proposed_admin_cmd: LinkedList, - pending_merge_index: u64, + has_pending_prepare_merge: bool, + applied_prepare_merge_index: u64, term: u64, } @@ -88,7 +90,8 @@ impl ProposalControl { pub fn new(term: u64) -> ProposalControl { ProposalControl { proposed_admin_cmd: LinkedList::new(), - pending_merge_index: 0, + has_pending_prepare_merge: false, + applied_prepare_merge_index: 0, term, } } @@ -135,6 +138,7 @@ impl ProposalControl { self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { (check_ver && cmd.epoch_state.change_ver) || (check_conf_ver && cmd.epoch_state.change_conf_ver) + || cmd.cmd_type == AdminCmdType::PrepareMerge }) } @@ -209,19 +213,34 @@ impl ProposalControl { } } + #[inline] + pub fn set_pending_prepare_merge(&mut self, v: bool) { + self.has_pending_prepare_merge = v; + } + + #[inline] + pub fn has_pending_prepare_merge(&self) -> bool { + self.has_pending_prepare_merge + } + #[inline] pub fn enter_prepare_merge(&mut self, prepare_merge_index: u64) { - self.pending_merge_index = prepare_merge_index; + self.applied_prepare_merge_index = prepare_merge_index; } #[inline] pub fn 
leave_prepare_merge(&mut self, prepare_merge_index: u64) { - if self.pending_merge_index != 0 { - assert_eq!(self.pending_merge_index, prepare_merge_index); - self.pending_merge_index = 0; + if self.applied_prepare_merge_index != 0 { + assert_eq!(self.applied_prepare_merge_index, prepare_merge_index); + self.applied_prepare_merge_index = 0; } } + #[inline] + pub fn has_applied_prepare_merge(&self) -> bool { + self.applied_prepare_merge_index != 0 + } + /// Check if there is an on-going split command on current term. /// /// The answer is reliable only when the peer is leader. @@ -242,8 +261,8 @@ impl ProposalControl { /// applied. #[inline] pub fn is_merging(&self) -> bool { - if self.proposed_admin_cmd.is_empty() { - return self.pending_merge_index != 0; + if self.applied_prepare_merge_index != 0 { + return true; } self.proposed_admin_cmd .iter() diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 9f24241b039..8bff64e66c9 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -35,7 +35,7 @@ use raftstore::{ local_metrics::RaftMetrics, metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, - util, Config, WriteCallback, + util, Config, Transport, WriteCallback, }, Error, Result, }; @@ -59,8 +59,8 @@ mod control; mod write; pub use admin::{ - report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, RequestHalfSplit, - RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, + report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, MergeContext, + RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ @@ -319,7 +319,11 @@ impl Peer { } } - pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { + pub fn on_apply_res( + &mut self, + ctx: &mut StoreContext, 
+ apply_res: ApplyRes, + ) { if !self.serving() || !apply_res.admin_result.is_empty() { // TODO: remove following log once stable. info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); @@ -346,6 +350,7 @@ impl Peer { AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), AdminCmdResult::UpdateGcPeers(state) => self.on_apply_res_update_gc_peers(state), + AdminCmdResult::PrepareMerge(res) => self.on_apply_res_prepare_merge(ctx, res), } } @@ -365,6 +370,9 @@ impl Peer { if !is_leader { entry_storage.compact_entry_cache(apply_res.applied_index + 1); } + if is_leader { + self.retry_pending_prepare_merge(ctx, apply_res.applied_index); + } self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, @@ -482,7 +490,7 @@ impl Apply { .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { - apply::notify_req_region_removed(self.region_state().get_region().get_id(), ch); + apply::notify_req_region_removed(self.region_id(), ch); continue; } if !e.get_data().is_empty() { @@ -528,7 +536,7 @@ impl Apply { Ok(decoder) => { util::compare_region_epoch( decoder.header().get_region_epoch(), - self.region_state().get_region(), + self.region(), false, true, true, @@ -575,14 +583,14 @@ impl Apply { } }; - util::check_req_region_epoch(&req, self.region_state().get_region(), true)?; + util::check_req_region_epoch(&req, self.region(), true)?; if req.has_admin_request() { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { - AdminCmdType::CompactLog => self.apply_compact_log(admin_req, entry.index)?, + AdminCmdType::CompactLog => self.apply_compact_log(admin_req, log_index)?, AdminCmdType::Split => self.apply_split(admin_req, log_index)?, AdminCmdType::BatchSplit => 
self.apply_batch_split(admin_req, log_index)?, - AdminCmdType::PrepareMerge => unimplemented!(), + AdminCmdType::PrepareMerge => self.apply_prepare_merge(admin_req, log_index)?, AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), AdminCmdType::TransferLeader => { diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index e958a3ec08f..988b7cf4b2d 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -9,7 +9,7 @@ use raftstore::{ msg::ErrorCallback, util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, }, - Result, + Error, Result, }; use tikv_util::slog_panic; @@ -58,6 +58,13 @@ impl Peer { conflict.delay_channel(ch); return; } + if self.proposal_control().has_pending_prepare_merge() + || self.proposal_control().is_merging() + { + let resp = cmd_resp::new_error(Error::ProposalInMergingMode(self.region_id())); + ch.report_error(resp); + return; + } // ProposalControl is reliable only when applied to current term. let call_proposed_on_success = self.applied_to_current_term(); let mut encoder = SimpleWriteReqEncoder::new( @@ -132,7 +139,7 @@ impl Apply { if self.should_skip(off, index) { return Ok(()); } - util::check_key_in_region(key, self.region_state().get_region())?; + util::check_key_in_region(key, self.region())?; // Technically it's OK to remove prefix for raftstore v2. But rocksdb doesn't // support specifying infinite upper bound in various APIs. 
keys::data_key_with_buffer(key, &mut self.key_buffer); @@ -175,7 +182,7 @@ impl Apply { if self.should_skip(off, index) { return Ok(()); } - util::check_key_in_region(key, self.region_state().get_region())?; + util::check_key_in_region(key, self.region())?; keys::data_key_with_buffer(key, &mut self.key_buffer); self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index ee0680f7fbb..0ba7de2c3e5 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -9,9 +9,9 @@ mod ready; mod txn_ext; pub use command::{ - AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, - RequestHalfSplit, RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, - SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, + AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, MergeContext, + ProposalControl, RequestHalfSplit, RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, + SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; pub use life::{DestroyProgress, GcPeerContext}; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 305cdb666cc..fc7cee35fa5 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -281,7 +281,7 @@ impl Peer { self.storage().apply_state().get_applied_index() >= read_index // If it is in pending merge state(i.e. applied PrepareMerge), the data may be stale. 
// TODO: Add a test to cover this case - && !self.has_pending_merge_state() + && self.proposal_control().has_applied_prepare_merge() } #[inline] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index ebff7ad44ce..bf7b8ec8858 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -340,6 +340,10 @@ impl Peer { let msg_type = msg.get_message().get_msg_type(); let to_peer_id = msg.get_to_peer().get_id(); let to_store_id = msg.get_to_peer().get_store_id(); + if msg_type == MessageType::MsgSnapshot { + let index = msg.get_message().get_snapshot().get_metadata().get_index(); + self.update_last_sent_snapshot_index(index); + } trace!( self.logger, @@ -775,6 +779,9 @@ impl Peer { // current term to apply the read. So broadcast eagerly to avoid unexpected // latency. self.raft_group_mut().skip_bcast_commit(false); + self.update_last_sent_snapshot_index( + self.raft_group().raft.raft_log.last_index(), + ); self.txn_context().on_became_leader( ctx, @@ -798,6 +805,7 @@ impl Peer { self.storage_mut().cancel_generating_snap(None); self.txn_context() .on_became_follower(self.term(), self.region()); + self.update_merge_progress_on_became_follower(); } _ => {} } diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index 911c1eaab78..e30bc25eec4 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -88,7 +88,6 @@ impl TxnContext { &self.extra_op } - // TODO: find a better place to put all txn related stuff. 
fn require_updating_max_ts( &self, ctx: &StoreContext, diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 7a1a22a5a95..a7af3c470ae 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -162,6 +162,16 @@ impl Apply { &mut self.region_state } + #[inline] + pub fn region(&self) -> &metapb::Region { + self.region_state.get_region() + } + + #[inline] + pub fn region_id(&self) -> u64 { + self.region().get_id() + } + /// The tablet can't be public yet, otherwise content of latest tablet /// doesn't matches its epoch in both readers and peer fsm. #[inline] diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 142b4e91943..bcf92471ebe 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -11,7 +11,8 @@ use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; use kvproto::{ - metapb, pdpb, + metapb::{self, PeerRole}, + pdpb, raft_serverpb::{RaftMessage, RegionLocalState}, }; use pd_client::BucketStat; @@ -31,8 +32,8 @@ use super::storage::Storage; use crate::{ fsm::ApplyScheduler, operation::{ - AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, ProposalControl, - SimpleWriteReqEncoder, SplitFlowControl, TxnContext, + AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, MergeContext, + ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -58,6 +59,9 @@ pub struct Peer { /// For raft log compaction. compact_log_context: CompactLogContext, + merge_context: Option>, + last_sent_snapshot_index: u64, + /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
raw_write_encoder: Option, @@ -132,6 +136,7 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); + let merge_context = MergeContext::from_region_state(&logger, storage.region_state()); let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); @@ -156,6 +161,8 @@ impl Peer { peer_cache: vec![], peer_heartbeats: HashMap::default(), compact_log_context: CompactLogContext::new(applied_index), + merge_context: merge_context.map(|c| Box::new(c)), + last_sent_snapshot_index: 0, raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -378,6 +385,21 @@ impl Peer { &self.compact_log_context } + #[inline] + pub fn merge_context(&self) -> Option<&MergeContext> { + self.merge_context.as_deref() + } + + #[inline] + pub fn merge_context_mut(&mut self) -> &mut MergeContext { + self.merge_context.get_or_insert_default() + } + + #[inline] + pub fn take_merge_context(&mut self) -> Option> { + self.merge_context.take() + } + #[inline] pub fn raft_group(&self) -> &RawNode> { &self.raft_group @@ -578,12 +600,6 @@ impl Peer { false } - #[inline] - // TODO - pub fn has_pending_merge_state(&self) -> bool { - false - } - pub fn serving(&self) -> bool { matches!(self.destroy_progress, DestroyProgress::None) } @@ -722,6 +738,13 @@ impl Peer { .advance_apply(apply_index, term, region); } + #[inline] + pub fn in_joint_state(&self) -> bool { + self.region().get_peers().iter().any(|p| { + p.get_role() == PeerRole::IncomingVoter || p.get_role() == PeerRole::DemotingVoter + }) + } + #[inline] pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { &mut self.split_trace @@ -804,4 +827,16 @@ impl Peer { pub fn gc_peer_context_mut(&mut self) -> &mut GcPeerContext { &mut self.gc_peer_context } + + #[inline] + pub fn update_last_sent_snapshot_index(&mut self, i: u64) { + if 
i > self.last_sent_snapshot_index { + self.last_sent_snapshot_index = i; + } + } + + #[inline] + pub fn last_sent_snapshot_index(&self) -> u64 { + self.last_sent_snapshot_index + } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index bcda7298bd4..67b0a7adeb7 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -15,12 +15,8 @@ use kvproto::{ use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; use slog::warn; -use super::{CmdResChannel, PeerMsg}; -use crate::{ - batch::StoreRouter, - operation::{LocalReader, RequestSplit}, - StoreMeta, -}; +use super::PeerMsg; +use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; impl AsyncReadNotifier for StoreRouter { fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { @@ -48,18 +44,8 @@ impl raftstore::coprocessor::StoreHandle for Store split_keys: Vec>, source: Cow<'static, str>, ) { - let (ch, _) = CmdResChannel::pair(); - let res = self.send( - region_id, - PeerMsg::RequestSplit { - request: RequestSplit { - epoch: region_epoch, - split_keys, - source, - }, - ch, - }, - ); + let (msg, _) = PeerMsg::request_split(region_epoch, split_keys, source.to_string()); + let res = self.send(region_id, msg); if let Err(e) = res { warn!( self.logger(), diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index e529f7dddee..e06d161fe08 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -455,4 +455,12 @@ mod requests { req.mut_transfer_leader().set_peers(peers.into()); req } + + pub fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::PrepareMerge); + req.mut_prepare_merge() + .set_target(merge.get_target().to_owned()); + req + } } diff --git 
a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index bd4925e8563..bca48412aa6 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -312,8 +312,12 @@ where ); } } else if resp.has_merge() { - // TODO - info!(logger, "pd asks for merge but ignored"); + PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["merge"]).inc(); + + let merge = resp.take_merge(); + info!(logger, "try to merge"; "region_id" => region_id, "merge" => ?merge); + let req = new_merge_request(merge); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else { PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 30420668164..6d0801696cb 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5627,7 +5627,7 @@ where } fn register_split_region_check_tick(&mut self) { - self.schedule_tick(PeerTick::SplitRegionCheck) + self.schedule_tick(PeerTick::SplitRegionCheck); } #[inline] From e99ebbc8148c1462453358dc43f2cd78265a8a5d Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 22 Feb 2023 16:21:06 +0800 Subject: [PATCH 544/676] *: enable bucket automatically if region size is large enough (#14255) ref tikv/tikv#12842 So that it's easier to use v2. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_rocks/src/engine.rs | 2 +- .../raftstore-v2/src/operation/bucket.rs | 2 +- .../tests/integrations/test_pd_heartbeat.rs | 2 +- .../raftstore/src/coprocessor/config.rs | 83 ++++++++++--------- .../src/coprocessor/split_check/half.rs | 4 +- .../src/coprocessor/split_check/keys.rs | 2 +- .../src/coprocessor/split_check/mod.rs | 2 +- .../src/coprocessor/split_check/size.rs | 6 +- components/raftstore/src/store/config.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/peer.rs | 3 +- .../raftstore/src/store/worker/split_check.rs | 2 +- components/server/src/server.rs | 2 +- components/server/src/server2.rs | 2 +- components/test_raftstore-v2/src/node.rs | 4 +- components/test_raftstore-v2/src/server.rs | 2 +- components/test_raftstore/src/node.rs | 4 +- components/test_raftstore/src/server.rs | 2 +- src/config/mod.rs | 24 ++---- tests/failpoints/cases/test_stats.rs | 2 +- tests/integrations/config/mod.rs | 2 +- .../raftstore/test_split_region.rs | 4 +- 22 files changed, 80 insertions(+), 80 deletions(-) diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index de29e676277..6499880490f 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -151,7 +151,7 @@ pub struct RocksEngine { } impl RocksEngine { - pub(crate) fn new(db: DB) -> RocksEngine { + pub fn new(db: DB) -> RocksEngine { let db = Arc::new(db); RocksEngine { support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index efff68fc453..05976d49d97 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -166,7 +166,7 @@ impl Peer { } pub fn maybe_gen_approximate_buckets(&self, ctx: &StoreContext) { - if 
ctx.coprocessor_host.cfg.enable_region_bucket && self.storage().is_initialized() { + if ctx.coprocessor_host.cfg.enable_region_bucket() && self.storage().is_initialized() { if let Err(e) = ctx .schedulers .split_check diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 11ff6bd4d02..56159538836 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -72,7 +72,7 @@ fn test_store_heartbeat() { fn test_report_buckets() { let region_id = 2; let mut cop_cfg = CopConfig::default(); - cop_cfg.enable_region_bucket = true; + cop_cfg.enable_region_bucket = Some(true); cop_cfg.region_bucket_size = ReadableSize::kb(1); let cluster = Cluster::with_cop_cfg(cop_cfg); let store_id = cluster.node(0).id(); diff --git a/components/raftstore/src/coprocessor/config.rs b/components/raftstore/src/coprocessor/config.rs index 137de200b71..c05a8e89a41 100644 --- a/components/raftstore/src/coprocessor/config.rs +++ b/components/raftstore/src/coprocessor/config.rs @@ -46,7 +46,7 @@ pub struct Config { pub perf_level: PerfLevel, // enable subsplit ranges (aka bucket) within the region - pub enable_region_bucket: bool, + pub enable_region_bucket: Option, pub region_bucket_size: ReadableSize, // region size threshold for using approximate size instead of scan pub region_size_threshold_for_approximate: ReadableSize, @@ -70,9 +70,8 @@ pub enum ConsistencyCheckMethod { } /// Default region split size. -pub const SPLIT_SIZE_MB: u64 = 96; -pub const LARGE_REGION_SPLIT_SIZE_MB: u64 = 1024; -pub const RAFTSTORE_V2_SPLIT_SIZE_MB: u64 = 10240; +pub const SPLIT_SIZE: ReadableSize = ReadableSize::mb(96); +pub const RAFTSTORE_V2_SPLIT_SIZE: ReadableSize = ReadableSize::gb(10); /// Default batch split limit. 
pub const BATCH_SPLIT_LIMIT: u64 = 10; @@ -92,7 +91,7 @@ impl Default for Config { region_max_keys: None, consistency_check_method: ConsistencyCheckMethod::Mvcc, perf_level: PerfLevel::Uninitialized, - enable_region_bucket: false, + enable_region_bucket: None, region_bucket_size: DEFAULT_BUCKET_SIZE, region_size_threshold_for_approximate: DEFAULT_BUCKET_SIZE * BATCH_SPLIT_LIMIT / 2 * 3, region_bucket_merge_size_ratio: DEFAULT_REGION_BUCKET_MERGE_SIZE_RATIO, @@ -103,12 +102,7 @@ impl Default for Config { impl Config { pub fn region_split_size(&self) -> ReadableSize { - self.region_split_size - .unwrap_or(/* v1 only */ if self.enable_region_bucket { - ReadableSize::mb(LARGE_REGION_SPLIT_SIZE_MB) - } else { - ReadableSize::mb(SPLIT_SIZE_MB) - }) + self.region_split_size.unwrap_or(SPLIT_SIZE) } pub fn region_max_keys(&self) -> u64 { @@ -128,17 +122,48 @@ impl Config { .unwrap_or((self.region_split_size().as_mb_f64() * 10000.0) as u64) } + pub fn enable_region_bucket(&self) -> bool { + self.enable_region_bucket.unwrap_or(false) + } + pub fn optimize_for(&mut self, raftstore_v2: bool) { // overwrite the default region_split_size when it's multi-rocksdb if self.region_split_size.is_none() { if raftstore_v2 { - self.region_split_size = Some(ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB)); + self.region_split_size = Some(RAFTSTORE_V2_SPLIT_SIZE); } else { self.region_split_size = Some(self.region_split_size()); } } } + fn validate_bucket_size(&self) -> Result<()> { + if self.region_split_size().0 < self.region_bucket_size.0 { + return Err(box_err!( + "region split size {} must >= region bucket size {}", + self.region_split_size().0, + self.region_bucket_size.0 + )); + } + if self.region_size_threshold_for_approximate.0 < self.region_bucket_size.0 { + return Err(box_err!( + "large region threshold size {} must >= region bucket size {}", + self.region_size_threshold_for_approximate.0, + self.region_bucket_size.0 + )); + } + if self.region_bucket_size.0 == 0 { + return 
Err(box_err!("region_bucket size cannot be 0.")); + } + if self.region_bucket_merge_size_ratio <= 0.0 || self.region_bucket_merge_size_ratio >= 0.5 + { + return Err(box_err!( + "region-bucket-merge-size-ratio should be 0 to 0.5 (not include both ends)." + )); + } + Ok(()) + } + pub fn validate(&mut self) -> Result<()> { if self.region_split_keys.is_none() { self.region_split_keys = Some((self.region_split_size().as_mb_f64() * 10000.0) as u64); @@ -169,31 +194,13 @@ impl Config { } None => self.region_max_keys = Some(self.region_split_keys() / 2 * 3), } - if self.enable_region_bucket { - if self.region_split_size().0 < self.region_bucket_size.0 { - return Err(box_err!( - "region split size {} must >= region bucket size {}", - self.region_split_size().0, - self.region_bucket_size.0 - )); - } - if self.region_size_threshold_for_approximate.0 < self.region_bucket_size.0 { - return Err(box_err!( - "large region threshold size {} must >= region bucket size {}", - self.region_size_threshold_for_approximate.0, - self.region_bucket_size.0 - )); - } - if self.region_bucket_size.0 == 0 { - return Err(box_err!("region_bucket size cannot be 0.")); - } - if self.region_bucket_merge_size_ratio <= 0.0 - || self.region_bucket_merge_size_ratio >= 0.5 - { - return Err(box_err!( - "region-bucket-merge-size-ratio should be 0 to 0.5 (not include both ends)." - )); - } + let res = self.validate_bucket_size(); + // If it's OK to enable bucket, we will prefer to enable it if useful. 
+ if let Ok(()) = res && self.enable_region_bucket.is_none() { + let useful = self.region_split_size() >= self.region_bucket_size * 2; + self.enable_region_bucket = Some(useful); + } else if let Err(e) = res && self.enable_region_bucket() { + return Err(e); } Ok(()) } @@ -251,7 +258,7 @@ mod tests { assert_eq!(cfg.region_max_keys, Some(30)); cfg = Config::default(); - cfg.enable_region_bucket = false; + cfg.enable_region_bucket = Some(false); cfg.region_split_size = Some(ReadableSize(20)); cfg.region_bucket_size = ReadableSize(30); cfg.validate().unwrap(); diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 259334d2f42..1f4527128d8 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -268,7 +268,7 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_split_size: Some(ReadableSize(130_u64)), - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() }; @@ -392,7 +392,7 @@ mod tests { let (tx, rx) = mpsc::sync_channel(100); let cfg = Config { region_split_size: Some(ReadableSize(130_u64)), - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(20_u64), // so that each key below will form a bucket ..Default::default() }; diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index 58c42d55513..2c0e71dd8cb 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -555,7 +555,7 @@ mod tests { region_max_keys: Some(159), region_split_keys: Some(80), batch_split_limit: 5, - enable_region_bucket: true, + enable_region_bucket: Some(true), // need check split region buckets, but 
region size does not exceed the split threshold region_bucket_size: ReadableSize(100), ..Default::default() diff --git a/components/raftstore/src/coprocessor/split_check/mod.rs b/components/raftstore/src/coprocessor/split_check/mod.rs index 3978789db91..e92000f2c95 100644 --- a/components/raftstore/src/coprocessor/split_check/mod.rs +++ b/components/raftstore/src/coprocessor/split_check/mod.rs @@ -120,7 +120,7 @@ impl<'a, E> Host<'a, E> { #[inline] pub fn enable_region_bucket(&self) -> bool { - self.cfg.enable_region_bucket + self.cfg.enable_region_bucket() } #[inline] diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index 8a1a5558c7d..4b320bef1b6 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -161,7 +161,7 @@ impl SplitCheckObserver for SizeCheckObserver self.router.update_approximate_size(region_id, region_size); let need_bucket_checker = - host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0; + host.cfg.enable_region_bucket() && region_size >= 2 * host.cfg.region_bucket_size.0; REGION_SIZE_HISTOGRAM.observe(region_size as f64); let need_split_region = region_size >= host.cfg.region_max_size().0; @@ -549,7 +549,7 @@ pub mod tests { region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(3000), region_size_threshold_for_approximate: ReadableSize(50000), ..Default::default() @@ -675,7 +675,7 @@ pub mod tests { region_max_keys: Some(1000000), region_split_keys: Some(1000000), batch_split_limit: 5, - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize(1), // minimal bucket size region_size_threshold_for_approximate: ReadableSize(500000000), // follow split region's check policy, not force to use 
approximate diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 342ace1139e..301f3cea0cc 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -1109,7 +1109,7 @@ mod tests { #[test] fn test_config_validate() { - let split_size = ReadableSize::mb(coprocessor::config::SPLIT_SIZE_MB); + let split_size = coprocessor::config::SPLIT_SIZE; let mut cfg = Config::new(); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6d0801696cb..6acddde2257 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5982,7 +5982,7 @@ where // generate bucket range list to run split-check (to further split buckets) fn gen_bucket_range_for_update(&self) -> Option> { - if !self.ctx.coprocessor_host.cfg.enable_region_bucket { + if !self.ctx.coprocessor_host.cfg.enable_region_bucket() { return None; } let region_buckets = self.fsm.peer.region_buckets.as_ref()?; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index c788256799b..a1817edd17b 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -5090,7 +5090,8 @@ where } pub fn maybe_gen_approximate_buckets(&self, ctx: &PollContext) { - if ctx.coprocessor_host.cfg.enable_region_bucket && !self.region().get_peers().is_empty() { + if ctx.coprocessor_host.cfg.enable_region_bucket() && !self.region().get_peers().is_empty() + { if let Err(e) = ctx .split_check_scheduler .schedule(SplitCheckTask::ApproximateBuckets(self.region().clone())) diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index b6bc5fca65f..1335ed5d5e8 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ 
b/components/raftstore/src/store/worker/split_check.rs @@ -667,7 +667,7 @@ where ), Task::ChangeConfig(c) => self.change_cfg(c), Task::ApproximateBuckets(region) => { - if self.coprocessor.cfg.enable_region_bucket { + if self.coprocessor.cfg.enable_region_bucket() { let mut cached; let tablet = match &self.engine { Either::Left(e) => e, diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 9576cb91423..5ba70b5db5a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -969,7 +969,7 @@ where .raft_store .validate( self.config.coprocessor.region_split_size(), - self.config.coprocessor.enable_region_bucket, + self.config.coprocessor.enable_region_bucket(), self.config.coprocessor.region_bucket_size, ) .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 9a2a1a1e8e0..7b391c20bb8 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -828,7 +828,7 @@ where .raft_store .validate( self.config.coprocessor.region_split_size(), - self.config.coprocessor.enable_region_bucket, + self.config.coprocessor.enable_region_bucket(), self.config.coprocessor.region_bucket_size, ) .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index f6211c09748..b9609ad2783 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -191,7 +191,7 @@ impl Simulator for NodeCluster { raft_store .validate( cfg.coprocessor.region_split_size(), - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); @@ -286,7 +286,7 @@ impl Simulator for NodeCluster { let node_id = node.id(); let region_split_size = cfg.coprocessor.region_split_size(); - let enable_region_bucket = 
cfg.coprocessor.enable_region_bucket; + let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; raftstore_cfg diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 8804f0c0f8c..d02dffa73fc 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -197,7 +197,7 @@ impl ServerCluster { raft_store .validate( cfg.coprocessor.region_split_size(), - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 78e1dbb36c3..618b760e29e 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -240,7 +240,7 @@ impl Simulator for NodeCluster { raft_store .validate( cfg.coprocessor.region_split_size(), - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); @@ -348,7 +348,7 @@ impl Simulator for NodeCluster { ); let region_split_size = cfg.coprocessor.region_split_size(); - let enable_region_bucket = cfg.coprocessor.enable_region_bucket; + let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; raftstore_cfg diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index a17c65b8aec..3f6b704687a 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -500,7 +500,7 @@ impl ServerCluster { raft_store .validate( cfg.coprocessor.region_split_size(), - cfg.coprocessor.enable_region_bucket, + cfg.coprocessor.enable_region_bucket(), cfg.coprocessor.region_bucket_size, ) .unwrap(); diff --git 
a/src/config/mod.rs b/src/config/mod.rs index 4be54665443..dff0fcb2436 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3241,7 +3241,7 @@ impl TikvConfig { self.coprocessor.validate()?; self.raft_store.validate( self.coprocessor.region_split_size(), - self.coprocessor.enable_region_bucket, + self.coprocessor.enable_region_bucket(), self.coprocessor.region_bucket_size, )?; self.security @@ -4177,7 +4177,7 @@ mod tests { use itertools::Itertools; use kvproto::kvrpcpb::CommandPri; use raftstore::coprocessor::{ - config::{LARGE_REGION_SPLIT_SIZE_MB, RAFTSTORE_V2_SPLIT_SIZE_MB, SPLIT_SIZE_MB}, + config::{RAFTSTORE_V2_SPLIT_SIZE, SPLIT_SIZE}, region_info_accessor::MockRegionInfoProvider, }; use slog::Level; @@ -5596,27 +5596,17 @@ mod tests { let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.optimize_for(false); default_cfg.coprocessor.validate().unwrap(); - assert_eq!( - default_cfg.coprocessor.region_split_size(), - ReadableSize::mb(SPLIT_SIZE_MB) - ); - - let mut default_cfg = TikvConfig::default(); - default_cfg.coprocessor.enable_region_bucket = true; - default_cfg.coprocessor.optimize_for(false); - default_cfg.coprocessor.validate().unwrap(); - assert_eq!( - default_cfg.coprocessor.region_split_size(), - ReadableSize::mb(LARGE_REGION_SPLIT_SIZE_MB) - ); + assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); + assert!(!default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.optimize_for(true); default_cfg.coprocessor.validate().unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), - ReadableSize::mb(RAFTSTORE_V2_SPLIT_SIZE_MB) + RAFTSTORE_V2_SPLIT_SIZE ); + assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); @@ -5626,6 +5616,7 @@ mod tests { default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + 
assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); default_cfg.coprocessor.region_split_size = Some(ReadableSize::mb(500)); @@ -5635,6 +5626,7 @@ mod tests { default_cfg.coprocessor.region_split_size(), ReadableSize::mb(500) ); + assert!(default_cfg.coprocessor.enable_region_bucket()); } #[test] diff --git a/tests/failpoints/cases/test_stats.rs b/tests/failpoints/cases/test_stats.rs index 37c87fa4547..7bc97edf759 100644 --- a/tests/failpoints/cases/test_stats.rs +++ b/tests/failpoints/cases/test_stats.rs @@ -7,7 +7,7 @@ use tikv_util::config::*; #[test] fn test_bucket_stats() { let (mut cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { - cluster.cfg.coprocessor.enable_region_bucket = true; + cluster.cfg.coprocessor.enable_region_bucket = Some(true); cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::days(1); cluster.cfg.raft_store.report_region_buckets_tick_interval = ReadableDuration::millis(100); }); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 351e9d74ca0..80cab3aca43 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -733,7 +733,7 @@ fn test_serde_custom_tikv_config() { region_split_keys: Some(100000), consistency_check_method: ConsistencyCheckMethod::Raw, perf_level: PerfLevel::Uninitialized, - enable_region_bucket: true, + enable_region_bucket: Some(true), region_bucket_size: ReadableSize::mb(1), region_size_threshold_for_approximate: ReadableSize::mb(3), prefer_approximate_bucket: false, diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 963424d8986..7f907970a72 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -604,7 +604,7 @@ fn test_node_split_region_after_reboot_with_config_change() { let region_split_size = 2000; 
cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(20); - cluster.cfg.coprocessor.enable_region_bucket = true; + cluster.cfg.coprocessor.enable_region_bucket = Some(true); cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(region_max_size)); cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(region_split_size)); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(region_split_size); @@ -1171,7 +1171,7 @@ fn test_gen_split_check_bucket_ranges() { let count = 5; let mut cluster = new_server_cluster(0, count); cluster.cfg.coprocessor.region_bucket_size = ReadableSize(5); - cluster.cfg.coprocessor.enable_region_bucket = true; + cluster.cfg.coprocessor.enable_region_bucket = Some(true); // disable report buckets; as it will reset the user traffic stats to randomize // the test result cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::secs(5); From 6ea9c3af2c3d94b6f6e22311a7709a4b78872f62 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 22 Feb 2023 17:35:06 +0800 Subject: [PATCH 545/676] raftstore-v2: consider `None` when getting mailbox (#14234) ref tikv/tikv#12842, close tikv/tikv#14233 consider None when getting mailbox Signed-off-by: SpadeA-Tang --- .../src/operation/command/admin/split.rs | 28 +++++++++++-------- .../raftstore/test_split_region.rs | 7 +++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index bbc6aac058e..4e14c7e016d 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -706,17 +706,23 @@ impl Peer { let region_id = self.region_id(); if self.storage().has_dirty_data() { let tablet_index = self.storage().tablet_index(); - 
let mailbox = store_ctx.router.mailbox(region_id).unwrap(); - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - move || { - let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); - }, - )); + if let Some(mailbox) = store_ctx.router.mailbox(region_id) { + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } else { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!(store_ctx.router.is_shutdown()); + return; + } } if split_init.derived_leader && self.leader_id() == INVALID_ID diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 7f907970a72..f8d6ff9b468 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -545,6 +545,8 @@ fn test_server_split_with_stale_peer() { #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_split_region_diff_check() { let count = 1; let mut cluster = new_cluster(0, count); @@ -596,10 +598,11 @@ fn test_split_region_diff_check() { // set max region size/split size 2000 and put data till 1000 // set max region size/split size < 1000 and reboot // verify the region is splitted. 
-#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_node_split_region_after_reboot_with_config_change() { let count = 1; - let mut cluster = new_server_cluster(0, count); + let mut cluster = new_cluster(0, count); let region_max_size = 2000; let region_split_size = 2000; cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(50); From a675ca8eacbdbe1a0e5e08d446af998f0691eb04 Mon Sep 17 00:00:00 2001 From: you06 Date: Wed, 22 Feb 2023 17:49:06 +0800 Subject: [PATCH 546/676] copr: early stop paging copr when resultset is drained. (#14209) close tikv/tikv#14254 When the result set is drained, it indicates that no more data is required in the range. This PR set the scanned range to None to avoid the following paging requests in the current range. Co-authored-by: Ti Chi Robot --- components/tidb_query_executors/src/runner.rs | 9 +- tests/failpoints/cases/test_coprocessor.rs | 172 ++++++++---------- 2 files changed, 77 insertions(+), 104 deletions(-) diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index f4a3ea8a2ad..3093b9bb24b 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -510,9 +510,12 @@ impl BatchExecutorsRunner { self.out_most_executor .collect_exec_stats(&mut self.exec_stats); - let range = self - .paging_size - .map(|_| self.out_most_executor.take_scanned_range()); + let range = if drained { + None + } else { + self.paging_size + .map(|_| self.out_most_executor.take_scanned_range()) + }; let mut sel_resp = SelectResponse::default(); sel_resp.set_chunks(chunks.into()); diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index c515b8d66cb..b3a6bf76c01 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -251,6 +251,16 @@ fn test_paging_scan() { 
assert_ge!(res_end_key, end_key.get_start()); assert_le!(res_end_key, end_key.get_end()); } + + // test limit with early return + let req = DagSelect::from(&product) + .paging_size(2) + .limit(1) + .desc(desc) + .build(); + let resp = handle_request(&endpoint, req); + assert!(resp.range.is_none()); + assert!(resp.range.is_none()); } } @@ -271,113 +281,73 @@ fn test_paging_scan_multi_ranges() { fail::cfg("copr_batch_grow_size", "return(1)").unwrap(); // test multi ranges with gap - for desc in [true] { - let paging_size = 3; - let mut exp = [data[0], data[1], data[3], data[4]]; - if desc { - exp.reverse(); - } - - let builder = DagSelect::from(&product) - .paging_size(paging_size) - .desc(desc); - let mut range1 = builder.key_ranges[0].clone(); - range1.set_end(product.get_record_range_one(data[1].0).get_end().into()); - let mut range2 = builder.key_ranges[0].clone(); - range2.set_start(product.get_record_range_one(data[3].0).get_start().into()); - let key_ranges = vec![range1.clone(), range2.clone()]; + for desc in [true, false] { + for paging_size in [3, 5] { + let mut exp = [data[0], data[1], data[3], data[4]]; + if desc { + exp.reverse(); + } - let req = builder.key_ranges(key_ranges).build(); - let resp = handle_request(&endpoint, req); - let mut select_resp = SelectResponse::default(); - select_resp.merge_from_bytes(resp.get_data()).unwrap(); - - let mut row_count = 0; - let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); - for (row, (id, name, cnt)) in spliter.zip(exp) { - let name_datum = name.unwrap().as_bytes().into(); - let expected_encoded = datum::encode_value( - &mut EvalContext::default(), - &[Datum::I64(id), name_datum, Datum::I64(cnt)], - ) - .unwrap(); - let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); - assert_eq!(result_encoded, &*expected_encoded); - row_count += 1; - } - assert_eq!(row_count, paging_size); + let builder = DagSelect::from(&product) + .paging_size(paging_size) + 
.desc(desc); + let mut range1 = builder.key_ranges[0].clone(); + range1.set_end(product.get_record_range_one(data[1].0).get_end().into()); + let mut range2 = builder.key_ranges[0].clone(); + range2.set_start(product.get_record_range_one(data[3].0).get_start().into()); + let key_ranges = vec![range1.clone(), range2.clone()]; - let res_range = resp.get_range(); - let (res_start_key, res_end_key) = match desc { - true => (res_range.get_end(), res_range.get_start()), - false => (res_range.get_start(), res_range.get_end()), - }; - let start_key = match desc { - true => range2.get_end(), - false => range1.get_start(), - }; - let end_id = match desc { - true => data[1].0, - false => data[3].0, - }; - let end_key = product.get_record_range_one(end_id); - assert_eq!(res_start_key, start_key); - assert_ge!(res_end_key, end_key.get_start()); - assert_le!(res_end_key, end_key.get_end()); - } + let req = builder.key_ranges(key_ranges).build(); + let resp = handle_request(&endpoint, req); + let mut select_resp = SelectResponse::default(); + select_resp.merge_from_bytes(resp.get_data()).unwrap(); - // test drained - for desc in [false, true] { - let paging_size = 5; - let mut exp = [data[0], data[1], data[3], data[4]]; - if desc { - exp.reverse(); - } + let mut row_count = 0; + let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); + for (row, (id, name, cnt)) in spliter.zip(exp) { + let name_datum = name.unwrap().as_bytes().into(); + let expected_encoded = datum::encode_value( + &mut EvalContext::default(), + &[Datum::I64(id), name_datum, Datum::I64(cnt)], + ) + .unwrap(); + let result_encoded = + datum::encode_value(&mut EvalContext::default(), &row).unwrap(); + assert_eq!(result_encoded, &*expected_encoded); + row_count += 1; + } + let exp_len = if paging_size <= 4 { + paging_size + } else { + exp.len() as u64 + }; + assert_eq!(row_count, exp_len); - let builder = DagSelect::from(&product) - .paging_size(paging_size) - .desc(desc); - let mut range1 = 
builder.key_ranges[0].clone(); - range1.set_end(product.get_record_range_one(data[1].0).get_end().into()); - let mut range2 = builder.key_ranges[0].clone(); - range2.set_start(product.get_record_range_one(data[3].0).get_start().into()); - let key_ranges = vec![range1.clone(), range2.clone()]; + let res_range = resp.get_range(); - let req = builder.key_ranges(key_ranges).build(); - let resp = handle_request(&endpoint, req); - let mut select_resp = SelectResponse::default(); - select_resp.merge_from_bytes(resp.get_data()).unwrap(); - - let mut row_count = 0; - let spliter = DagChunkSpliter::new(select_resp.take_chunks().into(), 3); - for (row, (id, name, cnt)) in spliter.zip(exp) { - let name_datum = name.unwrap().as_bytes().into(); - let expected_encoded = datum::encode_value( - &mut EvalContext::default(), - &[Datum::I64(id), name_datum, Datum::I64(cnt)], - ) - .unwrap(); - let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); - assert_eq!(result_encoded, &*expected_encoded); - row_count += 1; + let (res_start_key, res_end_key) = match desc { + true => (res_range.get_end(), res_range.get_start()), + false => (res_range.get_start(), res_range.get_end()), + }; + if paging_size != 5 { + let start_key = match desc { + true => range2.get_end(), + false => range1.get_start(), + }; + let end_id = match desc { + true => data[1].0, + false => data[3].0, + }; + let end_key = product.get_record_range_one(end_id); + assert_eq!(res_start_key, start_key); + assert_ge!(res_end_key, end_key.get_start()); + assert_le!(res_end_key, end_key.get_end()); + } else { + // drained. 
+ assert!(res_start_key.is_empty()); + assert!(res_end_key.is_empty()); + } } - assert_eq!(row_count, exp.len()); - - let res_range = resp.get_range(); - let (res_start_key, res_end_key) = match desc { - true => (res_range.get_end(), res_range.get_start()), - false => (res_range.get_start(), res_range.get_end()), - }; - let start_key = match desc { - true => range2.get_end(), - false => range1.get_start(), - }; - let end_key = match desc { - true => product.get_record_range_one(i64::MIN), - false => product.get_record_range_one(i64::MAX), - }; - assert_eq!(res_start_key, start_key); - assert_eq!(res_end_key, end_key.get_start(), "{}", desc); } } From 0368d0a6e5416aea5cc16546a58d091e3bbc504f Mon Sep 17 00:00:00 2001 From: Shaowen Yin Date: Thu, 23 Feb 2023 09:29:07 +0800 Subject: [PATCH 547/676] *: update openssl-src version to fix CVE-2023-0286 (#14258) close tikv/tikv#14257 Upgrade openssl-src version to fix CVE-2023-0286. Signed-off-by: cosven Co-authored-by: Ti Chi Robot --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1fa0937ce40..7add84159b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3579,9 +3579,9 @@ checksum = "77af24da69f9d9341038eba93a073b1fdaaa1b788221b00a69bce9e762cb32de" [[package]] name = "openssl-src" -version = "111.20.0+1.1.1o" +version = "111.25.0+1.1.1t" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92892c4f87d56e376e469ace79f1128fdaded07646ddf73aa0be4706ff712dec" +checksum = "3173cd3626c43e3854b1b727422a276e568d9ec5fe8cec197822cf52cfb743d6" dependencies = [ "cc", ] From f0af6ff1f5a01def628ddc6fe61bbb5d005cfc9d Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 23 Feb 2023 15:25:07 +0800 Subject: [PATCH 548/676] integration test v2: report snapshot status after sending (#14252) ref tikv/tikv#12842 report snapshot status after sending Signed-off-by: SpadeA-Tang Co-authored-by: Ti Chi Robot --- 
components/test_raftstore-v2/src/node.rs | 12 ++++++-- .../src/transport_simulate.rs | 28 +++++++++++++++++++ tests/integrations/raftstore/test_snap.rs | 28 +++++++++++++++++-- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index b9609ad2783..6c71e2d9cdc 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -15,7 +15,7 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raft::prelude::MessageType; +use raft::{prelude::MessageType, SnapshotStatus}; use raftstore::{ coprocessor::CoprocessorHost, errors::Error as RaftError, @@ -73,6 +73,8 @@ impl Transport for ChannelTransport { fn send(&mut self, msg: RaftMessage) -> raftstore::Result<()> { let from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); + let to_peer_id = msg.get_to_peer().get_id(); + let region_id = msg.get_region_id(); let is_snapshot = msg.get_message().get_msg_type() == MessageType::MsgSnapshot; if is_snapshot { @@ -102,7 +104,13 @@ impl Transport for ChannelTransport { match core.routers.get(&to_store) { Some(h) => { h.send_raft_msg(msg)?; - // report snapshot status if needed + if is_snapshot { + let _ = core.routers[&from_store].report_snapshot_status( + region_id, + to_peer_id, + SnapshotStatus::Finish, + ); + } Ok(()) } _ => Err(box_err!("missing sender for store {}", to_store)), diff --git a/components/test_raftstore-v2/src/transport_simulate.rs b/components/test_raftstore-v2/src/transport_simulate.rs index f42a891e60f..b55c29dbd3a 100644 --- a/components/test_raftstore-v2/src/transport_simulate.rs +++ b/components/test_raftstore-v2/src/transport_simulate.rs @@ -11,6 +11,7 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; +use raft::SnapshotStatus; use raftstore::{ router::handle_send_error, store::{cmd_resp, 
RegionSnapshot, Transport}, @@ -102,6 +103,14 @@ pub trait RaftStoreRouter { fn send_peer_msg(&self, region_id: u64, msg: PeerMsg) -> Result<()>; fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()>; + + /// Reports the sending snapshot status to the peer of the Region. + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()>; } impl RaftStoreRouter for RaftRouter { @@ -115,6 +124,15 @@ impl RaftStoreRouter for RaftRouter { self.send_raft_message(Box::new(msg)) .map_err(|e| handle_send_error(region_id, e)) } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()> { + self.send_peer_msg(region_id, PeerMsg::SnapshotSent { to_peer_id, status }) + } } impl RaftStoreRouter for SimulateTransport { @@ -125,4 +143,14 @@ impl RaftStoreRouter for SimulateTransport { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { filter_send(&self.filters, msg, |m| self.ch.send_raft_msg(m)) } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: SnapshotStatus, + ) -> RaftStoreResult<()> { + self.ch + .report_snapshot_status(region_id, to_peer_id, status) + } } diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index e8a0730488a..a69a2216cd4 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -476,6 +476,8 @@ impl Filter for SnapshotAppendFilter { } } +// todo(SpadeA): to be removed when receive filter is supported on ServerCluster +// V2 fn test_snapshot_with_append(cluster: &mut Cluster) { configure_for_snapshot(&mut cluster.cfg); @@ -502,10 +504,30 @@ fn test_snapshot_with_append(cluster: &mut Cluster) { must_get_equal(&engine4, b"k2", b"v2"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn 
test_node_snapshot_with_append() { - let mut cluster = new_node_cluster(0, 4); - test_snapshot_with_append(&mut cluster); + let mut cluster = new_cluster(0, 4); + configure_for_snapshot(&mut cluster.cfg); + + let pd_client = Arc::clone(&cluster.pd_client); + // Disable default max peer count check. + pd_client.disable_default_operator(); + cluster.run(); + + // In case of removing leader, let's transfer leader to some node first. + cluster.must_transfer_leader(1, new_peer(1, 1)); + pd_client.must_remove_peer(1, new_peer(4, 4)); + + let (tx, rx) = mpsc::channel(); + cluster.add_recv_filter_on_node(4, Box::new(SnapshotAppendFilter::new(tx))); + pd_client.add_peer(1, new_peer(4, 5)); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k2", b"v2"); + let engine4 = cluster.get_engine(4); + must_get_equal(&engine4, b"k1", b"v1"); + must_get_equal(&engine4, b"k2", b"v2"); } #[test] From 5f5bb766ea056f1eb1320e32364084818a3eca64 Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Mon, 27 Feb 2023 01:15:07 +0800 Subject: [PATCH 549/676] coprocessor: avoid unnecessary vec allocation in collect_column_stats (#14280) ref tikv/tikv#14231 When collect_column_stats handles each row, reuse column_vals and collation_key_vals to avoid allocating many small objects. 
Signed-off-by: xuyifan <675434007@qq.com> --- src/coprocessor/statistics/analyze.rs | 67 +++++++++++++-------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 85e0281064e..f292b5220e3 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -391,41 +391,36 @@ impl RowSampleBuilder { is_drained = result.is_drained?; let columns_slice = result.physical_columns.as_slice(); - + let mut column_vals: Vec> = vec![vec![]; self.columns_info.len()]; + let mut collation_key_vals: Vec> = vec![vec![]; self.columns_info.len()]; for logical_row in &result.logical_rows { - let mut column_vals: Vec> = Vec::new(); - let mut collation_key_vals: Vec> = Vec::new(); for i in 0..self.columns_info.len() { - let mut val = vec![]; + column_vals[i].clear(); + collation_key_vals[i].clear(); columns_slice[i].encode( *logical_row, &self.columns_info[i], &mut EvalContext::default(), - &mut val, + &mut column_vals[i], )?; if self.columns_info[i].as_accessor().is_string_like() { - let sorted_val = match_template_collator! { + match_template_collator! { TT, match self.columns_info[i].as_accessor().collation()? { Collation::TT => { - let mut mut_val = &val[..]; + let mut mut_val = &column_vals[i][..]; let decoded_val = table::decode_col_value(&mut mut_val, &mut EvalContext::default(), &self.columns_info[i])?; if decoded_val == Datum::Null { - val.clone() + collation_key_vals[i].clone_from(&column_vals[i]); } else { // Only if the `decoded_val` is Datum::Null, `decoded_val` is a Ok(None). // So it is safe the unwrap the Ok value. 
- let decoded_sorted_val = TT::sort_key(&decoded_val.as_string()?.unwrap())?; - decoded_sorted_val + TT::write_sort_key(&mut collation_key_vals[i], &decoded_val.as_string()?.unwrap())?; } } } }; - collation_key_vals.push(sorted_val); - } else { - collation_key_vals.push(Vec::new()); } - read_size += val.len(); - column_vals.push(val); + read_size += column_vals[i].len(); } collector.mut_base().count += 1; collector.collect_column_group( @@ -434,7 +429,7 @@ impl RowSampleBuilder { &self.columns_info, &self.column_groups, ); - collector.collect_column(column_vals, collation_key_vals, &self.columns_info); + collector.collect_column(&column_vals, &collation_key_vals, &self.columns_info); } } @@ -470,11 +465,11 @@ trait RowSampleCollector: Send { ); fn collect_column( &mut self, - columns_val: Vec>, - collation_keys_val: Vec>, + columns_val: &[Vec], + collation_keys_val: &[Vec], columns_info: &[tipb::ColumnInfo], ); - fn sampling(&mut self, data: Vec>); + fn sampling(&mut self, data: &[Vec]); fn to_proto(&mut self) -> tipb::RowSampleCollector; fn get_reported_memory_usage(&mut self) -> usize { self.mut_base().reported_memory_usage @@ -662,22 +657,23 @@ impl RowSampleCollector for BernoulliRowSampleCollector { } fn collect_column( &mut self, - columns_val: Vec>, - collation_keys_val: Vec>, + columns_val: &[Vec], + collation_keys_val: &[Vec], columns_info: &[tipb::ColumnInfo], ) { self.base - .collect_column(&columns_val, &collation_keys_val, columns_info); + .collect_column(columns_val, collation_keys_val, columns_info); self.sampling(columns_val); } - fn sampling(&mut self, data: Vec>) { + fn sampling(&mut self, data: &[Vec]) { let cur_rng = self.base.rng.gen_range(0.0, 1.0); if cur_rng >= self.sample_rate { return; } - self.base.memory_usage += data.iter().map(|x| x.capacity()).sum::(); + let sample = data.to_vec(); + self.base.memory_usage += sample.iter().map(|x| x.capacity()).sum::(); self.base.report_memory_usage(false); - self.samples.push(data); + 
self.samples.push(sample); } fn to_proto(&mut self) -> tipb::RowSampleCollector { self.base.memory_usage = 0; @@ -739,16 +735,16 @@ impl RowSampleCollector for ReservoirRowSampleCollector { fn collect_column( &mut self, - columns_val: Vec>, - collation_keys_val: Vec>, + columns_val: &[Vec], + collation_keys_val: &[Vec], columns_info: &[tipb::ColumnInfo], ) { self.base - .collect_column(&columns_val, &collation_keys_val, columns_info); + .collect_column(columns_val, collation_keys_val, columns_info); self.sampling(columns_val); } - fn sampling(&mut self, data: Vec>) { + fn sampling(&mut self, data: &[Vec]) { // We should tolerate the abnormal case => `self.max_sample_size == 0`. if self.max_sample_size == 0 { return; @@ -764,9 +760,10 @@ impl RowSampleCollector for ReservoirRowSampleCollector { } if need_push { - self.base.memory_usage += data.iter().map(|x| x.capacity()).sum::(); - self.samples.push(Reverse((cur_rng, data))); + let sample = data.to_vec(); + self.base.memory_usage += sample.iter().map(|x| x.capacity()).sum::(); self.base.report_memory_usage(false); + self.samples.push(Reverse((cur_rng, sample))); } } @@ -1255,7 +1252,7 @@ mod tests { for loop_i in 0..loop_cnt { let mut collector = ReservoirRowSampleCollector::new(sample_num, 1000, 1); for row in &nums { - collector.sampling([row.clone()].to_vec()); + collector.sampling(&[row.clone()]); } assert_eq!(collector.samples.len(), sample_num); for sample in &collector.samples { @@ -1304,7 +1301,7 @@ mod tests { let mut collector = BernoulliRowSampleCollector::new(sample_num as f64 / row_num as f64, 1000, 1); for row in &nums { - collector.sampling([row.clone()].to_vec()); + collector.sampling(&[row.clone()]); } for sample in &collector.samples { *item_cnt.entry(sample[0].clone()).or_insert(0) += 1; @@ -1350,7 +1347,7 @@ mod tests { // Test for ReservoirRowSampleCollector let mut collector = ReservoirRowSampleCollector::new(sample_num, 1000, 1); for row in &nums { - 
collector.sampling([row.clone()].to_vec()); + collector.sampling(&[row.clone()]); } assert_eq!(collector.samples.len(), 0); } @@ -1359,7 +1356,7 @@ mod tests { let mut collector = BernoulliRowSampleCollector::new(sample_num as f64 / row_num as f64, 1000, 1); for row in &nums { - collector.sampling([row.clone()].to_vec()); + collector.sampling(&[row.clone()]); } assert_eq!(collector.samples.len(), 0); } From b4da741a49517553765b2e44709ed2e02a2c2324 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 27 Feb 2023 13:09:08 +0800 Subject: [PATCH 550/676] raftstore-v2: update region state in apply_snapshot (#14279) ref tikv/tikv#12842 update region state in apply_snapshot Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- components/raftstore-v2/src/operation/ready/snapshot.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 29d94c955af..5eae3078a0a 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -259,10 +259,6 @@ impl Peer { meta.region_read_progress .insert(region_id, self.read_progress().clone()); } - - let region_state = self.raft_group().store().region_state().clone(); - self.storage_mut().set_region_state(region_state); - if let Some(tablet) = self.set_tablet(tablet) { self.record_tombstone_tablet(ctx, tablet, snapshot_index); } @@ -596,12 +592,15 @@ impl Storage { "{}", SlogFormat(self.logger()) ); - let region_state = self.region_state_mut(); + let mut region_state = self.region_state().clone(); region_state.set_state(PeerState::Normal); region_state.set_region(region); region_state.set_removed_records(removed_records); region_state.set_merged_records(merged_records); region_state.set_tablet_index(last_index); + // We need set_region_state here to update the peer. 
+ self.set_region_state(region_state); + let entry_storage = self.entry_storage_mut(); entry_storage.raft_state_mut().set_last_index(last_index); entry_storage.set_truncated_index(last_index); From 6a906dae15f62be7df7c707c43550e25c0599a9a Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Mon, 27 Feb 2023 13:27:08 +0800 Subject: [PATCH 551/676] impl buckets flow report (#14238) ref tikv/tikv#12842, ref tikv/tikv#14044 collect bucket flow: - write/read flow include keys and bytes not include qps Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- components/pd_client/src/lib.rs | 4 + components/raftstore-v2/src/fsm/apply.rs | 6 + .../raftstore-v2/src/operation/bucket.rs | 160 ++++++++++++++++-- .../src/operation/command/admin/split.rs | 20 ++- .../raftstore-v2/src/operation/command/mod.rs | 24 ++- .../src/operation/command/write/mod.rs | 6 + components/raftstore-v2/src/operation/mod.rs | 1 + components/raftstore-v2/src/raft/apply.rs | 4 + components/raftstore-v2/src/raft/peer.rs | 37 ++-- components/raftstore-v2/src/raft/storage.rs | 1 + .../src/router/internal_message.rs | 3 + .../raftstore-v2/src/worker/pd/region.rs | 6 +- .../tests/integrations/cluster.rs | 4 +- .../tests/integrations/test_pd_heartbeat.rs | 73 ++++++-- components/raftstore/src/store/worker/read.rs | 14 +- 15 files changed, 299 insertions(+), 64 deletions(-) diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 05b5729e98c..86e52eaf2a5 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -162,6 +162,10 @@ impl BucketStat { self.meta = meta; } + pub fn clear_stats(&mut self) { + self.stats = new_bucket_stats(&self.meta); + } + pub fn merge(&mut self, delta: &BucketStat) { merge_bucket_stats( &self.meta.keys, diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 1544a703c6d..67e8d557dd9 100644 --- 
a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -10,6 +10,7 @@ use crossbeam::channel::TryRecvError; use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; +use pd_client::BucketStat; use raftstore::store::{Config, ReadTask}; use slog::Logger; use tikv_util::{ @@ -68,6 +69,7 @@ impl ApplyFsm { log_recovery: Option>, applied_term: u64, logger: Logger, + buckets: Option, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); let apply = Apply::new( @@ -81,6 +83,7 @@ impl ApplyFsm { log_recovery, applied_term, logger, + buckets, ); ( ApplyScheduler { sender: tx }, @@ -120,6 +123,9 @@ impl ApplyFsm { ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), ApplyTask::ManualFlush => self.apply.on_manual_flush().await, + ApplyTask::RefreshBucketStat(bucket_meta) => { + self.apply.on_refresh_buckets(bucket_meta) + } } self.apply.maybe_flush().await; diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index 05976d49d97..be4ca092d98 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::metapb::RegionEpoch; +use kvproto::metapb::{self, RegionEpoch}; use pd_client::{BucketMeta, BucketStat}; use raftstore::{ coprocessor::RegionChangeEvent, @@ -13,7 +13,119 @@ use raftstore::{ }; use slog::{error, warn}; -use crate::{batch::StoreContext, fsm::PeerFsmDelegate, raft::Peer, router::PeerTick, worker::pd}; +use crate::{ + batch::StoreContext, + fsm::PeerFsmDelegate, + raft::Peer, + router::{ApplyTask, PeerTick}, + worker::pd, +}; + +#[derive(Debug, Clone, Default)] +pub 
struct BucketStatsInfo { + bucket_stat: Option, + // the last buckets records the stats that the recently refreshed. + last_bucket_stat: Option, + // the report bucket stat records the increment stats after last report pd. + // it will be reset after report pd. + report_bucket_stat: Option, +} + +impl BucketStatsInfo { + /// returns all bucket ranges those's write_bytes exceed the given + /// diff_size_threshold. + pub fn gen_bucket_range_for_update( + &self, + diff_size_threshold: u64, + ) -> Option> { + let region_buckets = self.bucket_stat.as_ref()?; + let stats = ®ion_buckets.stats; + let keys = ®ion_buckets.meta.keys; + + let empty_last_keys = vec![]; + let empty_last_stats = metapb::BucketStats::default(); + let (last_keys, last_stats, stats_reset) = self + .last_bucket_stat + .as_ref() + .map(|b| { + ( + &b.meta.keys, + &b.stats, + region_buckets.create_time != b.create_time, + ) + }) + .unwrap_or((&empty_last_keys, &empty_last_stats, false)); + + let mut bucket_ranges = vec![]; + let mut j = 0; + assert_eq!(keys.len(), stats.write_bytes.len() + 1); + for i in 0..stats.write_bytes.len() { + let mut diff_in_bytes = stats.write_bytes[i]; + while j < last_keys.len() && keys[i] > last_keys[j] { + j += 1; + } + if j < last_keys.len() && keys[i] == last_keys[j] { + if !stats_reset { + diff_in_bytes -= last_stats.write_bytes[j]; + } + j += 1; + } + if diff_in_bytes >= diff_size_threshold { + bucket_ranges.push(BucketRange(keys[i].clone(), keys[i + 1].clone())); + } + } + Some(bucket_ranges) + } + + #[inline] + pub fn version(&self) -> u64 { + self.bucket_stat + .as_ref() + .or(self.last_bucket_stat.as_ref()) + .map(|b| b.meta.version) + .unwrap_or_default() + } + #[inline] + pub fn add_bucket_flow(&mut self, delta: &Option) { + if let (Some(buckets), Some(report_buckets), Some(delta)) = ( + self.bucket_stat.as_mut(), + self.report_bucket_stat.as_mut(), + delta, + ) { + buckets.merge(delta); + report_buckets.merge(delta); + } + } + + #[inline] + pub fn 
set_bucket_stat(&mut self, buckets: Option) { + if let Some(b) = self.bucket_stat.take() { + self.last_bucket_stat = Some(b); + } + self.report_bucket_stat = buckets.clone(); + self.bucket_stat = buckets; + } + + #[inline] + pub fn clear_bucket_stat(&mut self) { + if let Some(bucket) = self.report_bucket_stat.as_mut() { + bucket.clear_stats(); + } + } + + #[inline] + pub fn report_bucket_stat(&mut self) -> BucketStat { + let current = self.report_bucket_stat.as_mut().unwrap(); + let delta = current.clone(); + current.clear_stats(); + delta + } + + #[inline] + pub fn bucket_stat(&self) -> &Option { + &self.bucket_stat + } +} impl Peer { #[inline] @@ -48,12 +160,7 @@ impl Peer { }; let region = self.region(); - let current_version = self - .region_buckets() - .as_ref() - .or_else(|| self.last_region_buckets().as_ref()) - .map(|b| b.meta.version) - .unwrap_or_default(); + let current_version = self.region_buckets_info().version(); let mut region_buckets: BucketStat; // The region buckets reset after this region happened split or merge. // The message should be dropped if it's epoch is lower than the regions. @@ -61,7 +168,7 @@ impl Peer { // So this condition indicates that the region buckets needs to refresh not // renew. 
if let (Some(bucket_ranges), Some(peer_region_buckets)) = - (bucket_ranges, self.region_buckets()) + (bucket_ranges, self.region_buckets_info().bucket_stat()) { assert_eq!(buckets.len(), bucket_ranges.len()); let mut meta_idx = 0; @@ -145,17 +252,22 @@ impl Peer { self.state_role(), ); let meta = region_buckets.meta.clone(); - self.set_region_buckets(Some(region_buckets)); + self.region_buckets_info_mut() + .set_bucket_stat(Some(region_buckets.clone())); + let mut store_meta = store_ctx.store_meta.lock().unwrap(); if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { reader.0.update(ReadProgress::region_buckets(meta)); } + self.apply_scheduler() + .unwrap() + .send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } #[inline] pub fn report_region_buckets_pd(&mut self, ctx: &StoreContext) { - let region_buckets = self.region_buckets().as_ref().unwrap(); - let task = pd::Task::ReportBuckets(region_buckets.clone()); + let delta = self.region_buckets_info_mut().report_bucket_stat(); + let task = pd::Task::ReportBuckets(delta); if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, @@ -180,6 +292,21 @@ impl Peer { } } } + + // generate bucket range list to run split-check (to further split buckets) + // It will return the suspected bucket ranges whose write bytes exceed the + // threshold. 
+ pub fn gen_bucket_range_for_update( + &self, + ctx: &StoreContext, + ) -> Option> { + if !ctx.coprocessor_host.cfg.enable_region_bucket() { + return None; + } + let bucket_update_diff_size_threshold = ctx.coprocessor_host.cfg.region_bucket_size.0 / 2; + self.region_buckets_info() + .gen_bucket_range_for_update(bucket_update_diff_size_threshold) + } } impl<'a, EK, ER, T: Transport> PeerFsmDelegate<'a, EK, ER, T> @@ -189,7 +316,14 @@ where { #[inline] pub fn on_report_region_buckets_tick(&mut self) { - if !self.fsm.peer().is_leader() || self.fsm.peer().region_buckets().is_none() { + if !self.fsm.peer().is_leader() + || self + .fsm + .peer() + .region_buckets_info() + .bucket_stat() + .is_none() + { return; } self.fsm.peer_mut().report_region_buckets_pd(self.store_ctx); diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 4e14c7e016d..260fb8700b8 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -217,8 +217,12 @@ impl Peer { } // todo: the suspected buckets range should generated by the diff write bytes. // it will be done in next pr. - let task = - SplitCheckTask::split_check(self.region().clone(), true, CheckPolicy::Scan, None); + let task = SplitCheckTask::split_check( + self.region().clone(), + true, + CheckPolicy::Scan, + self.gen_bucket_range_for_update(ctx), + ); if let Err(e) = ctx.schedulers.split_check.schedule(task) { info!(self.logger, "failed to schedule split check"; "err" => ?e); } @@ -321,14 +325,21 @@ impl Peer { return; } + // Do not check the bucket ranges if we want to split the region with a given + // key range, this is to avoid compatibility issues. 
+ let split_check_bucket_ranges = if !is_key_range { + self.gen_bucket_range_for_update(ctx) + } else { + None + }; + let task = SplitCheckTask::split_check_key_range( region.clone(), rhs.start_key, rhs.end_key, false, rhs.policy, - // todo: bucket range - None, + split_check_bucket_ranges, ); if let Err(e) = ctx.schedulers.split_check.schedule(task) { error!( @@ -966,6 +977,7 @@ mod test { None, 5, logger.clone(), + None, ); let mut splits = BatchSplitRequest::default(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 8bff64e66c9..76a7741134e 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,7 +16,11 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. -use std::{mem, sync::atomic::Ordering, time::Duration}; +use std::{ + mem, + sync::{atomic::Ordering, Arc}, + time::Duration, +}; use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ @@ -63,6 +67,7 @@ pub use admin::{ RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, }; pub use control::ProposalControl; +use pd_client::{BucketMeta, BucketStat}; pub use write::{ SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, }; @@ -122,6 +127,7 @@ impl Peer { }; let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); + let buckets = self.region_buckets_info().bucket_stat().clone(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( &store_ctx.cfg, self.peer().clone(), @@ -133,6 +139,7 @@ impl Peer { self.storage().apply_trace().log_recovery(), self.entry_storage().applied_term(), logger, + buckets, ); store_ctx @@ -353,7 +360,8 @@ impl Peer { AdminCmdResult::PrepareMerge(res) => self.on_apply_res_prepare_merge(ctx, res), } } - + self.region_buckets_info_mut() + 
.add_bucket_flow(&apply_res.bucket_stat); self.update_split_flow_control(&apply_res.metrics); self.update_stat(&apply_res.metrics); @@ -483,6 +491,14 @@ impl Apply { self.maybe_reschedule(written_bytes).await } + pub fn on_refresh_buckets(&mut self, meta: Arc) { + let mut new = BucketStat::from_meta(meta); + if let Some(origin) = self.buckets.as_ref() { + new.merge(origin); + } + self.buckets.replace(new); + } + #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); @@ -727,8 +743,12 @@ impl Apply { apply_res.admin_result = self.take_admin_result().into_boxed_slice(); apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); + apply_res.bucket_stat = self.buckets.clone(); let written_bytes = apply_res.metrics.written_bytes; self.res_reporter().report(apply_res); + if let Some(buckets) = &mut self.buckets { + buckets.clear_stats(); + } // Report result first and then invoking callbacks. This may delays callback a // little bit, but can make sure all following messages must see the side diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 988b7cf4b2d..ca4c7152364 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -140,6 +140,9 @@ impl Apply { return Ok(()); } util::check_key_in_region(key, self.region())?; + if let Some(s) = self.buckets.as_mut() { + s.write_key(key, value.len() as u64); + } // Technically it's OK to remove prefix for raftstore v2. But rocksdb doesn't // support specifying infinite upper bound in various APIs. 
keys::data_key_with_buffer(key, &mut self.key_buffer); @@ -183,6 +186,9 @@ impl Apply { return Ok(()); } util::check_key_in_region(key, self.region())?; + if let Some(s) = self.buckets.as_mut() { + s.write_key(key, 0); + } keys::data_key_with_buffer(key, &mut self.key_buffer); self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 0ba7de2c3e5..68acac6668b 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -19,6 +19,7 @@ pub use ready::{ }; pub(crate) use self::{ + bucket::BucketStatsInfo, command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, txn_ext::TxnContext, diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index a7af3c470ae..b4109fd9de0 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -6,6 +6,7 @@ use engine_traits::{ FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, }; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; +use pd_client::BucketStat; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, Config, ReadTask, @@ -58,6 +59,7 @@ pub struct Apply { read_scheduler: Scheduler>, pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, + pub(crate) buckets: Option, } impl Apply { @@ -73,6 +75,7 @@ impl Apply { log_recovery: Option>, applied_term: u64, logger: Logger, + buckets: Option, ) -> Self { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) @@ -103,6 +106,7 @@ impl Apply { log_recovery, metrics: ApplyMetrics::default(), logger, + buckets, } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index bcf92471ebe..e510c85cbf9 100644 --- 
a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -15,7 +15,6 @@ use kvproto::{ pdpb, raft_serverpb::{RaftMessage, RegionLocalState}, }; -use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, @@ -32,8 +31,8 @@ use super::storage::Storage; use crate::{ fsm::ApplyScheduler, operation::{ - AsyncWriter, CompactLogContext, DestroyProgress, GcPeerContext, MergeContext, - ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, TxnContext, + AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, GcPeerContext, + MergeContext, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -83,9 +82,7 @@ pub struct Peer { read_progress: Arc, leader_lease: Lease, - /// region buckets. - region_buckets: Option, - last_region_buckets: Option, + region_buckets_info: BucketStatsInfo, /// Transaction extensions related to this peer. 
txn_context: TxnContext, @@ -184,8 +181,7 @@ impl Peer { cfg.raft_store_max_leader_lease(), cfg.renew_leader_lease_advance_duration(), ), - region_buckets: None, - last_region_buckets: None, + region_buckets_info: BucketStatsInfo::default(), txn_context: TxnContext::default(), proposal_control: ProposalControl::new(0), pending_ticks: Vec::new(), @@ -217,22 +213,12 @@ impl Peer { Ok(peer) } - #[inline] - pub fn region_buckets(&self) -> &Option { - &self.region_buckets - } - - #[inline] - pub fn set_region_buckets(&mut self, buckets: Option) { - if let Some(b) = self.region_buckets.take() { - self.last_region_buckets = Some(b); - } - self.region_buckets = buckets; + pub fn region_buckets_info_mut(&mut self) -> &mut BucketStatsInfo { + &mut self.region_buckets_info } - #[inline] - pub fn last_region_buckets(&self) -> &Option { - &self.last_region_buckets + pub fn region_buckets_info(&self) -> &BucketStatsInfo { + &self.region_buckets_info } #[inline] @@ -680,7 +666,7 @@ impl Peer { #[inline] pub fn post_split(&mut self) { - self.set_region_buckets(None); + self.region_buckets_info_mut().set_bucket_stat(None); } pub fn maybe_campaign(&mut self) -> bool { @@ -716,7 +702,10 @@ impl Peer { self.txn_context.extra_op().clone(), self.txn_context.ext().clone(), self.read_progress().clone(), - self.region_buckets.as_ref().map(|b| b.meta.clone()), + self.region_buckets_info() + .bucket_stat() + .as_ref() + .map(|b| b.meta.clone()), ) } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index ce15ac20621..8f9fe2d8947 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -507,6 +507,7 @@ mod tests { None, 5, logger, + None, ); // Test get snapshot diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 092e7e21b5f..764e8df7dfd 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ 
b/components/raftstore-v2/src/router/internal_message.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use pd_client::{BucketMeta, BucketStat}; use raftstore::store::fsm::ApplyMetrics; use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; @@ -11,6 +12,7 @@ pub enum ApplyTask { /// Writes that doesn't care consistency. UnsafeWrite(Box<[u8]>), ManualFlush, + RefreshBucketStat(std::sync::Arc), } #[derive(Debug, Default)] @@ -20,4 +22,5 @@ pub struct ApplyRes { pub admin_result: Box<[AdminCmdResult]>, pub modifications: DataTrace, pub metrics: ApplyMetrics, + pub bucket_stat: Option, } diff --git a/components/raftstore-v2/src/worker/pd/region.rs b/components/raftstore-v2/src/worker/pd/region.rs index bca48412aa6..e825dd54c32 100644 --- a/components/raftstore-v2/src/worker/pd/region.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -66,8 +66,7 @@ impl ReportBucket { } } - fn report(&mut self, report_ts: UnixSecs) -> BucketStat { - self.last_report_ts = report_ts; + fn report(&mut self) -> BucketStat { match self.last_report_stat.replace(self.current_stat.clone()) { Some(last) => { let mut delta = BucketStat::from_meta(self.current_stat.meta.clone()); @@ -350,7 +349,8 @@ where }; let now = UnixSecs::now(); let interval_second = now.into_inner() - last_report_ts.into_inner(); - let delta = report_buckets.report(now); + report_buckets.last_report_ts = now; + let delta = report_buckets.report(); let resp = self .pd_client .report_region_buckets(&delta, Duration::from_secs(interval_second)); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 065d032eaa2..264d127cc8c 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -503,8 +503,8 @@ impl Cluster { Cluster::with_configs(count, config, None) } - pub fn with_cop_cfg(coprocessor_cfg: CopConfig) -> 
Cluster { - Cluster::with_configs(1, None, Some(coprocessor_cfg)) + pub fn with_cop_cfg(config: Option, coprocessor_cfg: CopConfig) -> Cluster { + Cluster::with_configs(1, config, Some(coprocessor_cfg)) } pub fn with_configs(count: usize, config: Option, cop_cfg: Option) -> Self { diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 56159538836..b9dea63bbfe 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -13,7 +13,7 @@ use raftstore_v2::{ }; use tikv_util::{config::ReadableSize, store::new_peer}; -use crate::cluster::Cluster; +use crate::cluster::{v2_default_config, Cluster}; #[test] fn test_region_heartbeat() { @@ -74,7 +74,9 @@ fn test_report_buckets() { let mut cop_cfg = CopConfig::default(); cop_cfg.enable_region_bucket = Some(true); cop_cfg.region_bucket_size = ReadableSize::kb(1); - let cluster = Cluster::with_cop_cfg(cop_cfg); + let mut config = v2_default_config(); + config.region_split_check_diff = Some(ReadableSize::kb(1)); + let cluster = Cluster::with_cop_cfg(Some(config), cop_cfg); let store_id = cluster.node(0).id(); let router = &cluster.routers[0]; @@ -92,20 +94,13 @@ fn test_report_buckets() { router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); // load data to split bucket. 
- let header = Box::new(router.new_request_for(region_id).take_header()); let mut suffix = String::from(""); for _ in 0..200 { suffix.push_str("fake "); } - for i in 0..10 { - let mut put = SimpleWriteEncoder::with_capacity(64); - let mut key = format!("key-{}", i); - key.push_str(&suffix); - put.put(CF_DEFAULT, key.as_bytes(), b"value"); - let (msg, sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); - router.send(region_id, msg).unwrap(); - let _resp = block_on(sub.result()).unwrap(); - } + + let repeat: u64 = 10; + let bytes = write_keys(&cluster, region_id, &suffix, repeat.try_into().unwrap()); // To find the split keys, it should flush memtable manually. let mut cached = cluster.node(0).tablet_registry().get(region_id).unwrap(); cached.latest().unwrap().flush_cf(CF_DEFAULT, true).unwrap(); @@ -126,6 +121,12 @@ fn test_report_buckets() { if let Some(buckets) = resp { assert!(buckets.get_keys().len() > 2); assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + for i in 0..buckets.keys.len() - 1 { + assert!(write_bytes[i] >= bytes); + assert!(write_keys[i] >= repeat); + } for i in 0..buckets.keys.len() - 1 { buckets_tmp.push(raftstore::store::Bucket::default()); let bucket_range = @@ -134,6 +135,23 @@ fn test_report_buckets() { } } + // report buckets to pd again, the write bytes and keys should be zero. 
+ router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + if let Some(buckets) = resp { + assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + for i in 0..buckets.keys.len() - 1 { + assert!(write_bytes[i] == 0); + assert!(write_keys[i] == 0); + } + } + // send the same region buckets to refresh which needs to merge the last. let resp = block_on(cluster.node(0).pd_client().get_region_by_id(region_id)).unwrap(); if let Some(region) = resp { @@ -148,4 +166,35 @@ fn test_report_buckets() { std::thread::sleep(std::time::Duration::from_millis(50)); } } + // report buckets to pd again, the write bytes and keys should be zero. + router + .send(region_id, PeerMsg::Tick(PeerTick::ReportBuckets)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let resp = block_on(cluster.node(0).pd_client().get_buckets_by_id(region_id)).unwrap(); + if let Some(buckets) = resp { + assert_eq!(buckets.get_region_id(), region_id); + let write_bytes = buckets.get_stats().get_write_bytes(); + let write_keys = buckets.get_stats().get_write_keys(); + assert_eq!(write_bytes.len(), 1); + assert_eq!(write_keys.len(), 1); + } + + fn write_keys(cluster: &Cluster, region_id: u64, suffix: &str, repeat: usize) -> u64 { + let router = &cluster.routers[0]; + let header = Box::new(router.new_request_for(region_id).take_header()); + for i in 0..repeat { + let mut put = SimpleWriteEncoder::with_capacity(64); + let mut key = format!("key-{}", i); + key.push_str(suffix); + put.put(CF_DEFAULT, key.as_bytes(), b"value"); + let (msg, sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); + router.send(region_id, msg).unwrap(); + let _resp = block_on(sub.result()).unwrap(); + } + 
((suffix.as_bytes().len() + 10) * repeat) + .try_into() + .unwrap() + } } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 826537f4e44..022bd457cd5 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -967,8 +967,11 @@ where } let region = Arc::clone(&delegate.region); - let response = delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - + let mut response = + delegate.execute(&req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } // Try renew lease in advance delegate.maybe_renew_lease_advance(&self.router, snapshot_ts); response @@ -992,8 +995,11 @@ where let region = Arc::clone(&delegate.region); // Getting the snapshot - let response = delegate.execute(&req, ®ion, None, Some(local_read_ctx)); - + let mut response = + delegate.execute(&req, ®ion, None, Some(local_read_ctx)); + if let Some(snap) = response.snapshot.as_mut() { + snap.bucket_meta = delegate.bucket_meta.clone(); + } // Double check in case `safe_ts` change after the first check and before // getting snapshot if let Err(resp) = delegate.check_stale_read_safe(read_ts) { From de78be91b59be3351e80c37380e16348108425de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 28 Feb 2023 10:27:08 +0800 Subject: [PATCH 552/676] log_backup: move all download request into the same runtime (#14286) ref hyperium/hyper#2112, close tikv/tikv#14285 Download tasks will executed in a tiny runtime for now. 
Signed-off-by: hillium --- components/file_system/src/io_stats/proc.rs | 5 +++ components/sst_importer/src/sst_importer.rs | 36 ++++++++++++++------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 51c74ae56a8..652fe05c658 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -141,6 +141,11 @@ pub fn init() -> Result<(), String> { Ok(()) } +/// Bind I/O type for the current thread. +/// Following calls to the [`file_system`](crate) APIs would be throttled and +/// recorded via this information. +/// Generally, when you are creating new threads playing with the local disks, +/// you should call this before doing so. pub fn set_io_type(new_io_type: IoType) { IO_TYPE.with(|io_type| { if io_type.get() != new_io_type { diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 42a96e21652..0da45c195be 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -25,7 +25,7 @@ use engine_traits::{ use external_storage_export::{ compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, }; -use file_system::{get_io_rate_limiter, OpenOptions}; +use file_system::{get_io_rate_limiter, IoType, OpenOptions}; use kvproto::{ brpb::{CipherInfo, StorageBackend}, import_sstpb::*, @@ -37,8 +37,7 @@ use tikv_util::{ stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, }, config::ReadableSize, - stream::block_on_external_io, - sys::SysQuota, + sys::{thread::ThreadBuildWrapper, SysQuota}, time::{Instant, Limiter}, }; use tokio::runtime::{Handle, Runtime}; @@ -291,7 +290,20 @@ impl SstImporter { ) -> Result { let switcher = ImportModeSwitcher::new(cfg); let cached_storage = CacheMap::default(); - let download_rt = tokio::runtime::Builder::new_current_thread() + // We are going to run 
some background tasks here, (hyper needs to maintain the + // connection, the cache map needs gc intervally.) so we must create a + // multi-thread runtime, given there isn't blocking, a single thread runtime is + // enough. + let download_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .thread_name("sst_import_misc") + .after_start_wrapper(|| { + tikv_alloc::add_thread_memory_accessor(); + file_system::set_io_type(IoType::Import); + }) + .before_stop_wrapper(|| { + tikv_alloc::remove_thread_memory_accessor(); + }) .enable_all() .build()?; download_rt.spawn(cached_storage.gc_loop()); @@ -788,13 +800,15 @@ impl SstImporter { encrypt_wrap_reader(file_crypter, inner)? }; - let r = block_on_external_io(external_storage_export::read_external_storage_info_buff( - &mut reader, - speed_limiter, - file_length, - expected_sha256, - external_storage_export::MIN_READ_SPEED, - )); + let r = + self.download_rt + .block_on(external_storage_export::read_external_storage_info_buff( + &mut reader, + speed_limiter, + file_length, + expected_sha256, + external_storage_export::MIN_READ_SPEED, + )); let url = ext_storage.url()?.to_string(); let buff = r.map_err(|e| Error::CannotReadExternalStorage { url: url.to_string(), From 192dff638d05724759e3dee642639a86b20e4565 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 28 Feb 2023 16:03:08 +0800 Subject: [PATCH 553/676] importer: use kv engine instead of raw API (#14294) ref tikv/tikv#12842 So that it can support both v1 and v2. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/raft_log_engine/src/engine.rs | 6 +- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/batch/store.rs | 8 + components/raftstore-v2/src/fsm/apply.rs | 7 +- .../src/operation/command/admin/split.rs | 9 +- .../raftstore-v2/src/operation/command/mod.rs | 14 +- .../src/operation/command/write/mod.rs | 42 +- .../operation/command/write/simple_write.rs | 101 +++- components/raftstore-v2/src/operation/mod.rs | 17 + components/raftstore-v2/src/raft/apply.rs | 13 +- components/raftstore-v2/src/raft/storage.rs | 9 +- .../tests/integrations/cluster.rs | 12 + components/server/src/server.rs | 2 +- components/server/src/server2.rs | 47 +- components/test_raftstore-v2/src/cluster.rs | 4 + components/test_raftstore-v2/src/node.rs | 13 +- components/test_raftstore-v2/src/server.rs | 25 +- components/test_raftstore/src/server.rs | 4 +- components/tikv_kv/src/btree_engine.rs | 1 + components/tikv_kv/src/lib.rs | 30 +- src/import/sst_service.rs | 439 +++++++----------- src/server/gc_worker/gc_worker.rs | 2 + src/server/raftkv/mod.rs | 6 + src/server/raftkv2/mod.rs | 16 +- src/server/raftkv2/node.rs | 5 + src/storage/mvcc/reader/reader.rs | 1 + tests/integrations/server/kv_service.rs | 2 +- 28 files changed, 536 insertions(+), 301 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7add84159b6..2bd382ee8f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4384,6 +4384,7 @@ dependencies = [ "slog", "slog-global", "smallvec", + "sst_importer", "tempfile", "test_pd", "test_util", diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 92d7a4f7353..a9e75ca9580 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -337,10 +337,6 @@ impl RaftLogEngine { ))) } - pub fn path(&self) -> &str { - self.0.path() - } - /// If path is not an empty directory, we say db exists. 
pub fn exists(path: &str) -> bool { let path = Path::new(path); @@ -780,7 +776,7 @@ impl RaftEngine for RaftLogEngine { } fn get_engine_path(&self) -> &str { - self.path() + self.0.path() } fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index c7e403afebe..3dfeb512980 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -56,6 +56,7 @@ resource_control = { workspace = true } resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" +sst_importer = { workspace = true } thiserror = "1.0" tikv_util = { workspace = true } time = "0.1" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1a507bb7f10..4693b0db369 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -32,6 +32,7 @@ use raftstore::{ }; use resource_metering::CollectorRegHandle; use slog::{warn, Logger}; +use sst_importer::SstImporter; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, @@ -84,6 +85,7 @@ pub struct StoreContext { pub self_disk_usage: DiskUsage, pub snap_mgr: TabletSnapManager, + pub sst_importer: Arc, } impl StoreContext { @@ -277,6 +279,7 @@ struct StorePollerBuilder { store_meta: Arc>>, shutdown: Arc, snap_mgr: TabletSnapManager, + sst_importer: Arc, } impl StorePollerBuilder { @@ -293,6 +296,7 @@ impl StorePollerBuilder { shutdown: Arc, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, + sst_importer: Arc, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -318,6 +322,7 @@ impl StorePollerBuilder { snap_mgr, shutdown, coprocessor_host, + sst_importer, } } @@ -435,6 +440,7 @@ where self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), coprocessor_host: self.coprocessor_host.clone(), + sst_importer: self.sst_importer.clone(), }; 
poll_ctx.update_ticks_timeout(); let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); @@ -527,6 +533,7 @@ impl StoreSystem { collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, + sst_importer: Arc, ) -> Result<()> where T: Transport + 'static, @@ -627,6 +634,7 @@ impl StoreSystem { self.shutdown.clone(), snap_mgr, coprocessor_host, + sst_importer, ); self.workers = Some(workers); self.schedulers = Some(schedulers); diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 67e8d557dd9..e1bf5169d55 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -13,6 +13,7 @@ use kvproto::{metapb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raftstore::store::{Config, ReadTask}; use slog::Logger; +use sst_importer::SstImporter; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, timer::GLOBAL_TIMER_HANDLE, @@ -68,8 +69,9 @@ impl ApplyFsm { flush_state: Arc, log_recovery: Option>, applied_term: u64, - logger: Logger, buckets: Option, + sst_importer: Arc, + logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); let apply = Apply::new( @@ -82,8 +84,9 @@ impl ApplyFsm { flush_state, log_recovery, applied_term, - logger, buckets, + sst_importer, + logger, ); ( ApplyScheduler { sender: tx }, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 260fb8700b8..b31fc7e7471 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -830,7 +830,10 @@ mod test { }; use super::*; - use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; + use crate::{ + fsm::ApplyResReporter, operation::test_util::create_tmp_importer, raft::Apply, + router::ApplyRes, + }; struct MockReporter { 
sender: Sender, @@ -961,6 +964,7 @@ mod test { let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); + let (_tmp_dir, importer) = create_tmp_importer(); let mut apply = Apply::new( &Config::default(), region @@ -976,8 +980,9 @@ mod test { Arc::new(FlushState::new(5)), None, 5, - logger.clone(), None, + importer, + logger.clone(), ); let mut splits = BatchSplitRequest::default(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 76a7741134e..ea8c8c227d0 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -43,7 +43,7 @@ use raftstore::{ }, Error, Result, }; -use slog::{info, warn}; +use slog::{error, info, warn}; use tikv_util::{ box_err, log::SlogFormat, @@ -138,8 +138,9 @@ impl Peer { self.flush_state().clone(), self.storage().apply_trace().log_recovery(), self.entry_storage().applied_term(), - logger, buckets, + store_ctx.sst_importer.clone(), + logger, ); store_ctx @@ -478,6 +479,12 @@ impl Apply { dr.notify_only, ); } + SimpleWrite::Ingest(_) => { + error!( + self.logger, + "IngestSST is not supposed to be called on local engine" + ); + } } } self.apply_flow_control_mut().need_flush = true; @@ -575,6 +582,9 @@ impl Apply { dr.notify_only, )?; } + SimpleWrite::Ingest(ssts) => { + self.apply_ingest(ssts)?; + } } } return res; diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index ca4c7152364..a461420f75b 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,20 +1,23 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{data_cf_offset, KvEngine, Mutable, RaftEngine, CF_DEFAULT}; -use kvproto::raft_cmdpb::RaftRequestHeader; +use kvproto::{import_sstpb::SstMeta, raft_cmdpb::RaftRequestHeader}; use raftstore::{ store::{ - cmd_resp, + check_sst_for_ingestion, cmd_resp, fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, + metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, }, Error, Result, }; +use slog::error; use tikv_util::slog_panic; use crate::{ batch::StoreContext, + fsm::ApplyResReporter, raft::{Apply, Peer}, router::{ApplyTask, CmdResChannel}, }; @@ -132,9 +135,10 @@ impl Peer { } } -impl Apply { +impl Apply { #[inline] pub fn apply_put(&mut self, cf: &str, index: u64, key: &[u8], value: &[u8]) -> Result<()> { + PEER_WRITE_CMD_COUNTER.put.inc(); let off = data_cf_offset(cf); if self.should_skip(off, index) { return Ok(()); @@ -181,6 +185,7 @@ impl Apply { #[inline] pub fn apply_delete(&mut self, cf: &str, index: u64, key: &[u8]) -> Result<()> { + PEER_WRITE_CMD_COUNTER.delete.inc(); let off = data_cf_offset(cf); if self.should_skip(off, index) { return Ok(()); @@ -228,4 +233,35 @@ impl Apply { // TODO: reuse the same delete as split/merge. Ok(()) } + + #[inline] + pub fn apply_ingest(&mut self, ssts: Vec) -> Result<()> { + PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); + let mut infos = Vec::with_capacity(ssts.len()); + for sst in &ssts { + if let Err(e) = check_sst_for_ingestion(sst, self.region()) { + error!( + self.logger, + "ingest fail"; + "sst" => ?sst, + "region" => ?self.region(), + "error" => ?e + ); + let _ = self.sst_importer().delete(sst); + return Err(e); + } + match self.sst_importer().validate(sst) { + Ok(meta_info) => infos.push(meta_info), + Err(e) => { + slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); + } + } + } + // Unlike v1, we can't batch ssts accross regions. 
+ self.flush(); + if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { + slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); + } + Ok(()) + } } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index e6f81b20af1..cf267f854b7 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -1,7 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::assert_matches::debug_assert_matches; + use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; +use kvproto::{ + import_sstpb::SstMeta, + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, +}; use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; @@ -15,9 +20,16 @@ use crate::{operation::command::parse_at, router::CmdResChannel}; const MAGIC_PREFIX: u8 = 0x00; #[derive(Clone, Debug)] -#[repr(transparent)] pub struct SimpleWriteBinary { buf: Box<[u8]>, + write_type: WriteType, +} + +impl SimpleWriteBinary { + /// Freeze the binary will forbid further batching. + pub fn freeze(&mut self) { + self.write_type = WriteType::Unspecified; + } } /// We usually use `RaftCmdRequest` for read write request. But the codec is @@ -29,6 +41,7 @@ pub struct SimpleWriteReqEncoder { buf: Vec, channels: Vec, size_limit: usize, + write_type: WriteType, notify_proposed: bool, } @@ -53,19 +66,24 @@ impl SimpleWriteReqEncoder { buf, channels: vec![], size_limit, + write_type: bin.write_type, notify_proposed, } } - /// Encode the simple write into the buffer dispite header check. + /// Encode the simple write into the buffer. /// - /// Return false if the buffer limit is reached or the write can be amended. 
+ /// Return false if the buffer limit is reached or the binary type not + /// match. #[inline] pub fn amend(&mut self, header: &RaftRequestHeader, bin: &SimpleWriteBinary) -> bool { if *self.header != *header { return false; } - if self.buf.len() + bin.buf.len() < self.size_limit { + if self.write_type == bin.write_type + && bin.write_type != WriteType::Unspecified + && self.buf.len() + bin.buf.len() < self.size_limit + { self.buf.extend_from_slice(&bin.buf); true } else { @@ -128,11 +146,21 @@ pub enum SimpleWrite<'a> { Put(Put<'a>), Delete(Delete<'a>), DeleteRange(DeleteRange<'a>), + Ingest(Vec), +} + +#[derive(Clone, Copy, Debug, PartialEq)] +enum WriteType { + Unspecified, + PutDelete, + DeleteRange, + Ingest, } #[derive(Clone)] pub struct SimpleWriteEncoder { buf: Vec, + write_type: WriteType, } impl SimpleWriteEncoder { @@ -140,21 +168,36 @@ impl SimpleWriteEncoder { pub fn with_capacity(cap: usize) -> SimpleWriteEncoder { SimpleWriteEncoder { buf: Vec::with_capacity(cap), + write_type: WriteType::Unspecified, } } #[inline] pub fn put(&mut self, cf: &str, key: &[u8], value: &[u8]) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::PutDelete + ); encode(SimpleWrite::Put(Put { cf, key, value }), &mut self.buf); + self.write_type = WriteType::PutDelete; } #[inline] pub fn delete(&mut self, cf: &str, key: &[u8]) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::PutDelete + ); encode(SimpleWrite::Delete(Delete { cf, key }), &mut self.buf); + self.write_type = WriteType::PutDelete; } #[inline] pub fn delete_range(&mut self, cf: &str, start_key: &[u8], end_key: &[u8], notify_only: bool) { + debug_assert_matches!( + self.write_type, + WriteType::Unspecified | WriteType::DeleteRange + ); encode( SimpleWrite::DeleteRange(DeleteRange { cf, @@ -164,12 +207,21 @@ impl SimpleWriteEncoder { }), &mut self.buf, ); + self.write_type = WriteType::DeleteRange; + } + + #[inline] + pub fn ingest(&mut self, sst: 
Vec) { + debug_assert_matches!(self.write_type, WriteType::Unspecified | WriteType::Ingest); + encode(SimpleWrite::Ingest(sst), &mut self.buf); + self.write_type = WriteType::Ingest; } #[inline] pub fn encode(self) -> SimpleWriteBinary { SimpleWriteBinary { buf: self.buf.into_boxed_slice(), + write_type: self.write_type, } } } @@ -228,6 +280,7 @@ impl<'a> Iterator for SimpleWriteReqDecoder<'a> { const PUT_TAG: u8 = 0; const DELETE_TAG: u8 = 1; const DELETE_RANGE_TAG: u8 = 2; +const INGEST_TAG: u8 = 3; const DEFAULT_CF_TAG: u8 = 0; const WRITE_CF_TAG: u8 = 1; @@ -353,6 +406,14 @@ fn encode(simple_write: SimpleWrite<'_>, buf: &mut Vec) { encode_bytes(dr.end_key, buf); buf.push(dr.notify_only as u8); } + SimpleWrite::Ingest(ssts) => { + buf.push(INGEST_TAG); + encode_len(ssts.len() as u32, buf); + // IngestSST is not a frequent operation, use protobuf to reduce complexity. + for sst in ssts { + sst.write_length_delimited_to_vec(buf).unwrap(); + } + } } } @@ -386,12 +447,28 @@ fn decode<'a>(buf: &mut &'a [u8]) -> Option> { notify_only: *notify_only != 0, })) } + INGEST_TAG => { + let (len, left) = decode_len(left); + let mut ssts = Vec::with_capacity(len as usize); + let mut is = CodedInputStream::from_bytes(left); + for _ in 0..len { + let sst = match is.read_message() { + Ok(sst) => sst, + Err(e) => panic!("data corrupted {:?}", e), + }; + ssts.push(sst); + } + *buf = left; + Some(SimpleWrite::Ingest(ssts)) + } tag => panic!("corrupted data: invalid tag {}", tag), } } #[cfg(test)] mod tests { + use std::assert_matches::assert_matches; + use kvproto::raft_cmdpb::{CmdType, Request}; use slog::o; @@ -412,7 +489,9 @@ mod tests { let mut encoder = SimpleWriteEncoder::with_capacity(512); encoder.delete_range(CF_LOCK, b"key", b"key", true); encoder.delete_range("cf", b"key", b"key", false); - req_encoder.amend(&header, &encoder.encode()); + let bin = encoder.encode(); + assert!(!req_encoder.amend(&header, &bin)); + let req_encoder2 = 
SimpleWriteReqEncoder::new(header.clone(), bin, 0, false); let (bytes, _) = req_encoder.encode(); let logger = slog_global::borrow_global().new(o!()); @@ -428,7 +507,10 @@ mod tests { let SimpleWrite::Delete(delete) = write else { panic!("should be delete") }; assert_eq!(delete.cf, CF_WRITE); assert_eq!(delete.key, &delete_key); + assert_matches!(decoder.next(), None); + let (bytes, _) = req_encoder2.encode(); + decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; assert_eq!(dr.cf, CF_LOCK); @@ -500,6 +582,13 @@ mod tests { // Only simple write command with same header can be batched. assert!(!req_encoder.amend(&header2, &bin)); + let mut bin2 = bin.clone(); + bin2.freeze(); + // Frozen bin can't be merged with other bin. + assert!(!req_encoder.amend(&header, &bin2)); + let mut req_encoder2 = SimpleWriteReqEncoder::new(header.clone(), bin2.clone(), 512, false); + assert!(!req_encoder2.amend(&header, &bin)); + // Batch should not excceed max size limit. 
let large_value = vec![0; 512]; let mut encoder = SimpleWriteEncoder::with_capacity(512); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 68acac6668b..5514d966cea 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -24,3 +24,20 @@ pub(crate) use self::{ query::{LocalReader, ReadDelegatePair, SharedReadTablet}, txn_ext::TxnContext, }; + +#[cfg(test)] +pub mod test_util { + use std::sync::Arc; + + use kvproto::kvrpcpb::ApiVersion; + use sst_importer::SstImporter; + use tempfile::TempDir; + + pub fn create_tmp_importer() -> (TempDir, Arc) { + let dir = TempDir::new().unwrap(); + let importer = Arc::new( + SstImporter::new(&Default::default(), dir.path(), None, ApiVersion::V1).unwrap(), + ); + (dir, importer) + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index b4109fd9de0..5e7c7e84f84 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -12,6 +12,7 @@ use raftstore::store::{ Config, ReadTask, }; use slog::Logger; +use sst_importer::SstImporter; use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ @@ -57,6 +58,7 @@ pub struct Apply { res_reporter: R, read_scheduler: Scheduler>, + sst_importer: Arc, pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, pub(crate) buckets: Option, @@ -74,8 +76,9 @@ impl Apply { flush_state: Arc, log_recovery: Option>, applied_term: u64, - logger: Logger, buckets: Option, + sst_importer: Arc, + logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) @@ -105,8 +108,9 @@ impl Apply { flush_state, log_recovery, metrics: ApplyMetrics::default(), - logger, buckets, + sst_importer, + logger, } } @@ -260,4 +264,9 @@ impl Apply { pub fn apply_flow_control(&self) -> &ApplyFlowControl { &self.flow_control } + + #[inline] + pub fn 
sst_importer(&self) -> &SstImporter { + &self.sst_importer + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 8f9fe2d8947..d386ed0acae 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -339,7 +339,10 @@ mod tests { use super::*; use crate::{ - fsm::ApplyResReporter, operation::write_initial_states, raft::Apply, router::ApplyRes, + fsm::ApplyResReporter, + operation::{test_util::create_tmp_importer, write_initial_states}, + raft::Apply, + router::ApplyRes, }; #[derive(Clone)] @@ -495,6 +498,7 @@ mod tests { worker.start(read_runner); let mut state = RegionLocalState::default(); state.set_region(region.clone()); + let (_tmp_dir, importer) = create_tmp_importer(); // setup peer applyer let mut apply = Apply::new( &Config::default(), @@ -506,8 +510,9 @@ mod tests { Arc::new(FlushState::new(5)), None, 5, - logger, None, + importer, + logger, ); // Test get snapshot diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 264d127cc8c..451f7131cc9 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -23,6 +23,7 @@ use engine_test::{ use engine_traits::{TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{ + kvrpcpb::ApiVersion, metapb::{self, RegionEpoch, Store}, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request}, raft_serverpb::RaftMessage, @@ -44,6 +45,7 @@ use raftstore_v2::{ }; use resource_metering::CollectorRegHandle; use slog::{debug, o, Logger}; +use sst_importer::SstImporter; use tempfile::TempDir; use test_pd::mocker::Service; use tikv_util::{ @@ -297,6 +299,15 @@ impl RunningState { let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); let coprocessor_host = 
CoprocessorHost::new(router.store_router().clone(), cop_cfg.value().clone()); + let importer = Arc::new( + SstImporter::new( + &Default::default(), + path.join("importer"), + None, + ApiVersion::V1, + ) + .unwrap(), + ); let background = Worker::new("background"); let pd_worker = LazyWorker::new("pd-worker"); @@ -318,6 +329,7 @@ impl RunningState { CollectorRegHandle::new_for_test(), background.clone(), pd_worker, + importer, ) .unwrap(); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 5ba70b5db5a..2cde9e9cb78 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1245,7 +1245,7 @@ where let import_service = ImportSstService::new( self.config.import.clone(), self.config.raft_store.raft_entry_max_size, - self.router.clone(), + engines.engine.clone(), engines.engines.kv.clone(), servers.importer.clone(), ); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 7b391c20bb8..01a76dfffbc 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -31,7 +31,7 @@ use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{ - flush_engine_statistics, + flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, FlowInfo, RocksEngine, RocksStatistics, }; @@ -73,6 +73,7 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, + import::SstImporter, read_pool::{ build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, }, @@ -243,6 +244,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, + _importer: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, } @@ -871,6 +873,30 @@ where )), ); + let import_path = self.store_path.join("import"); 
+ let mut importer = SstImporter::new( + &self.config.import, + import_path, + self.encryption_key_manager.clone(), + self.config.storage.api_version(), + ) + .unwrap(); + for (cf_name, compression_type) in &[ + ( + CF_DEFAULT, + self.config.rocksdb.defaultcf.bottommost_level_compression, + ), + ( + CF_WRITE, + self.config.rocksdb.writecf.bottommost_level_compression, + ), + ] { + importer.set_compression_type(cf_name, from_rocks_compression_type(*compression_type)); + } + let importer = Arc::new(importer); + + // V2 starts split-check worker within raftstore. + let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); cfg_controller.register( @@ -919,6 +945,7 @@ where pd_worker, raft_store, &state, + importer.clone(), ) .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); @@ -942,6 +969,7 @@ where self.servers = Some(Servers { lock_mgr, server, + _importer: importer, rsmeter_pubsub_service, }); @@ -950,6 +978,23 @@ where fn register_services(&mut self) { let servers = self.servers.as_mut().unwrap(); + let _engines = self.engines.as_ref().unwrap(); + + // Import SST service. 
+ // let import_service = ImportSstService::new( + // self.config.import.clone(), + // self.config.raft_store.raft_entry_max_size, + // engines.engine.clone(), + // self.tablet_registry.as_ref().unwrap().clone(), + // servers.importer.clone(), + // ); + // if servers + // .server + // .register_service(create_import_sst(import_service)) + // .is_some() + // { + // fatal!("failed to register import service"); + // } // Create Diagnostics service let diag_service = DiagnosticsService::new( diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index b9d057d33c5..30d3456d652 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -74,6 +74,7 @@ pub trait Simulator { node_id: u64, cfg: Config, store_meta: Arc>>, + key_mgr: Option>, raft_engine: RaftTestEngine, tablet_registry: TabletRegistry, resource_manager: &Option>, @@ -383,6 +384,7 @@ impl Cluster { id, self.cfg.clone(), store_meta.clone(), + key_mgr.clone(), raft_engine.clone(), tablet_registry.clone(), &self.resource_manager, @@ -424,10 +426,12 @@ impl Cluster { tikv_util::thread_group::set_properties(Some(props)); debug!("calling run node"; "node_id" => node_id); + let key_mgr = self.key_managers_map.get(&node_id).unwrap().clone(); self.sim.wl().run_node( node_id, cfg, store_meta, + key_mgr, raft_engine, tablet_registry, &self.resource_manager, diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 6c71e2d9cdc..0fde6ba42c5 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -1,15 +1,17 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + path::Path, sync::{Arc, Mutex, RwLock}, time::Duration, }; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; +use encryption_export::DataKeyManager; use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; -use engine_traits::{RaftEngineReadOnly, TabletRegistry}; +use engine_traits::{RaftEngine, RaftEngineReadOnly, TabletRegistry}; use kvproto::{ kvrpcpb::ApiVersion, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, @@ -36,6 +38,7 @@ use test_pd_client::TestPdClient; use test_raftstore::{Config, Filter}; use tikv::{ config::{ConfigController, Module}, + import::SstImporter, server::{ raftkv::ReplicaReadLockChecker, tablet_snap::copy_tablet_snapshot, NodeV2, Result as ServerResult, @@ -187,6 +190,7 @@ impl Simulator for NodeCluster { node_id: u64, cfg: Config, store_meta: Arc>>, + key_manager: Option>, raft_engine: RaftTestEngine, tablet_registry: TabletRegistry, _resource_manager: &Option>, @@ -265,6 +269,12 @@ impl Simulator for NodeCluster { // todo: Is None sufficient for test? None, ); + let importer = { + let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); + Arc::new( + SstImporter::new(&cfg.import, dir, key_manager, cfg.storage.api_version()).unwrap(), + ) + }; let bg_worker = WorkerBuilder::new("background").thread_count(2).create(); let state: Arc> = Arc::default(); @@ -283,6 +293,7 @@ impl Simulator for NodeCluster { pd_worker, Arc::new(VersionTrack::new(raft_store)), &state, + importer, )?; assert!( raft_engine diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index d02dffa73fc..1aa3bfc47f8 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + path::Path, sync::{Arc, Mutex, RwLock}, thread, time::Duration, @@ -10,9 +11,10 @@ use api_version::{dispatch_api_version, KvFormat}; use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; +use encryption_export::DataKeyManager; use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_test::raft::RaftTestEngine; -use engine_traits::{KvEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use futures::executor::block_on; use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; use grpcio_health::HealthService; @@ -45,6 +47,7 @@ use test_pd_client::TestPdClient; use test_raftstore::{AddressMap, Config}; use tikv::{ coprocessor, coprocessor_v2, + import::SstImporter, read_pool::ReadPool, server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -168,6 +171,7 @@ impl ServerCluster { node_id: u64, mut cfg: Config, store_meta: Arc>>, + key_manager: Option>, raft_engine: RaftTestEngine, tablet_registry: TabletRegistry, resource_manager: &Option>, @@ -317,7 +321,20 @@ impl ServerCluster { ReplicaReadLockChecker::new(concurrency_manager.clone()).register(&mut coprocessor_host); - // todo: Import Sst Service + // Create import service. + let importer = { + let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); + Arc::new( + SstImporter::new(&cfg.import, dir, key_manager, cfg.storage.api_version()).unwrap(), + ) + }; + // let import_service = ImportSstService::new( + // cfg.import.clone(), + // cfg.raft_store.raft_entry_max_size, + // raft_kv_2.clone(), + // tablet_registry.clone(), + // Arc::clone(&importer), + // ); // Create deadlock service. 
let deadlock_service = lock_mgr.deadlock_service(); @@ -382,6 +399,7 @@ impl ServerCluster { .unwrap(); svr.register_service(create_diagnostics(diag_service.clone())); svr.register_service(create_deadlock(deadlock_service.clone())); + // svr.register_service(create_import_sst(import_service.clone())); if let Some(svcs) = self.pending_services.get(&node_id) { for fact in svcs { svr.register_service(fact()); @@ -428,6 +446,7 @@ impl ServerCluster { pd_worker, Arc::new(VersionTrack::new(raft_store)), &state, + importer, )?; assert!(node_id == 0 || node_id == node.id()); let node_id = node.id(); @@ -538,6 +557,7 @@ impl Simulator for ServerCluster { node_id: u64, cfg: Config, store_meta: Arc>>, + key_manager: Option>, raft_engine: RaftTestEngine, tablet_registry: TabletRegistry, resource_manager: &Option>, @@ -548,6 +568,7 @@ impl Simulator for ServerCluster { node_id, cfg, store_meta, + key_manager, raft_engine, tablet_registry, resource_manager diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 3f6b704687a..8c2297fbc45 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -404,7 +404,7 @@ impl ServerCluster { )); let extension = engine.raft_extension(); let store = Storage::<_, _, F>::from_engine( - engine, + engine.clone(), &cfg.storage, storage_read_pool.handle(), lock_mgr.clone(), @@ -440,7 +440,7 @@ impl ServerCluster { let import_service = ImportSstService::new( cfg.import.clone(), cfg.raft_store.raft_entry_max_size, - sim_router.clone(), + engine, engines.kv.clone(), Arc::clone(&importer), ); diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 35f666896f3..336523dd60c 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ b/components/tikv_kv/src/btree_engine.rs @@ -290,6 +290,7 @@ fn write_modifies(engine: &BTreeEngine, modifies: Vec) -> EngineResult<( cf_tree.write().unwrap().insert(k, v); } Modify::DeleteRange(_cf, 
_start_key, _end_key, _notify_only) => unimplemented!(), + Modify::Ingest(_) => unimplemented!(), }; } Ok(()) diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 02bfc1c9c55..22b11e425c5 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -43,6 +43,7 @@ use futures::{compat::Future01CompatExt, future::BoxFuture, prelude::*}; use into_other::IntoOther; use kvproto::{ errorpb::Error as ErrorHeader, + import_sstpb::SstMeta, kvrpcpb::{Context, DiskFullOpt, ExtraOp as TxnExtraOp, KeyRange}, raft_cmdpb, }; @@ -80,6 +81,7 @@ pub enum Modify { PessimisticLock(Key, PessimisticLock), // cf_name, start_key, end_key, notify_only DeleteRange(CfName, Key, Key, bool), + Ingest(Box), } impl Modify { @@ -88,7 +90,7 @@ impl Modify { Modify::Delete(cf, _) => cf, Modify::Put(cf, ..) => cf, Modify::PessimisticLock(..) => &CF_LOCK, - Modify::DeleteRange(..) => unreachable!(), + Modify::DeleteRange(..) | Modify::Ingest(_) => unreachable!(), }; let cf_size = if cf == &CF_DEFAULT { 0 } else { cf.len() }; @@ -96,7 +98,7 @@ impl Modify { Modify::Delete(_, k) => cf_size + k.as_encoded().len(), Modify::Put(_, k, v) => cf_size + k.as_encoded().len() + v.len(), Modify::PessimisticLock(k, _) => cf_size + k.as_encoded().len(), // FIXME: inaccurate - Modify::DeleteRange(..) => unreachable!(), + Modify::DeleteRange(..) | Modify::Ingest(_) => unreachable!(), } } @@ -105,7 +107,7 @@ impl Modify { Modify::Delete(_, ref k) => k, Modify::Put(_, ref k, _) => k, Modify::PessimisticLock(ref k, _) => k, - Modify::DeleteRange(..) => unreachable!(), + Modify::DeleteRange(..) 
| Modify::Ingest(_) => unreachable!(), } } } @@ -151,6 +153,10 @@ impl From for raft_cmdpb::Request { req.set_cmd_type(raft_cmdpb::CmdType::DeleteRange); req.set_delete_range(delete_range); } + Modify::Ingest(sst) => { + req.set_cmd_type(raft_cmdpb::CmdType::IngestSst); + req.mut_ingest_sst().set_sst(*sst); + } }; req } @@ -191,6 +197,10 @@ impl From for Modify { delete_range.get_notify_only(), ) } + raft_cmdpb::CmdType::IngestSst => { + let sst = req.mut_ingest_sst().take_sst(); + Modify::Ingest(Box::new(sst)) + } _ => { unimplemented!() } @@ -220,6 +230,7 @@ pub struct WriteData { pub extra: TxnExtra, pub deadline: Option, pub disk_full_opt: DiskFullOpt, + pub avoid_batch: bool, } impl WriteData { @@ -229,6 +240,7 @@ impl WriteData { extra, deadline: None, disk_full_opt: DiskFullOpt::NotAllowedOnFull, + avoid_batch: false, } } @@ -251,9 +263,18 @@ impl WriteData { pub fn set_disk_full_opt(&mut self, level: DiskFullOpt) { self.disk_full_opt = level } + + /// Underlying engine may batch up several requests to increase throughput. + /// + /// If external correctness depends on isolation of requests, you may need + /// to set this flag to true. + pub fn set_avoid_batch(&mut self, avoid_batch: bool) { + self.avoid_batch = avoid_batch + } } /// Events that can subscribed from the `WriteSubscriber`. +#[derive(Debug)] pub enum WriteEvent { Proposed, Committed, @@ -746,6 +767,9 @@ pub fn write_modifies(kv_engine: &impl LocalEngine, modifies: Vec) -> Re Ok(()) } } + Modify::Ingest(_) => { + unimplemented!("IngestSST is not implemented for local engine yet.") + } }; // TODO: turn the error into an engine error. 
if let Err(msg) = res { diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 08eabe32f0c..b589da50b76 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -9,9 +9,9 @@ use std::{ }; use collections::HashSet; -use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; +use engine_traits::{CompactExt, MiscExt, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures::{sink::SinkExt, stream::TryStreamExt, Stream, StreamExt, TryFutureExt}; use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, @@ -24,20 +24,15 @@ use kvproto::{ SwitchMode, WriteRequest_oneof_chunk as Chunk, *, }, kvrpcpb::Context, - raft_cmdpb::{CmdType, DeleteRequest, PutRequest, RaftCmdRequest, RaftRequestHeader, Request}, -}; -use protobuf::Message; -use raftstore::{ - router::RaftStoreRouter, - store::{Callback, RaftCmdExtraOpts, RegionSnapshot}, }; use sst_importer::{ error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, Error, Result, SstImporter, }; +use tikv_kv::{Engine, Modify, SnapContext, Snapshot, SnapshotExt, WriteData, WriteEvent}; use tikv_util::{ config::ReadableSize, - future::{create_stream_with_buffer, paired_future_callback}, + future::create_stream_with_buffer, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; @@ -45,22 +40,41 @@ use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; -use crate::{import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE}; +use crate::{ + import::duplicate_detect::DuplicateDetector, + server::CONFIG_ROCKSDB_GAUGE, + storage::{self, errors::extract_region_error_from_error}, +}; const MAX_INFLIGHT_RAFT_MSGS: usize = 64; +fn transfer_error(err: storage::Error) -> ImportPbError { + let mut e = ImportPbError::default(); + if 
let Some(region_error) = extract_region_error_from_error(&err) { + e.set_store_error(region_error); + } + e.set_message(format!("failed to complete raft command: {:?}", err)); + e +} + +async fn wait_write(mut s: impl Stream + Send + Unpin) -> storage::Result<()> { + match s.next().await { + Some(WriteEvent::Finished(Ok(()))) => Ok(()), + Some(WriteEvent::Finished(Err(e))) => Err(e.into()), + Some(e) => Err(box_err!("unexpected event: {:?}", e)), + None => Err(box_err!("stream closed")), + } +} + /// ImportSstService provides tikv-server with the ability to ingest SST files. /// /// It saves the SST sent from client to a file and then sends a command to /// raftstore to trigger the ingest process. #[derive(Clone)] -pub struct ImportSstService -where - E: KvEngine, -{ +pub struct ImportSstService { cfg: Config, + tablet_registry: E::Local, engine: E, - router: Router, threads: Arc, // For now, PiTR cannot be executed in the tokio runtime because it is synchronous and may // blocks. (tokio is so strict... it panics if we do insane things like blocking in an async @@ -74,36 +88,29 @@ where raft_entry_max_size: ReadableSize, } -pub struct SnapshotResult { - snapshot: RegionSnapshot, - term: u64, -} - struct RequestCollector { - context: Context, max_raft_req_size: usize, /// Retain the last ts of each key in each request. /// This is used for write CF because resolved ts observer hates duplicated /// key in the same request. - write_reqs: HashMap, (Request, u64)>, + write_reqs: HashMap, (Modify, u64)>, /// Collector favor that simple collect all items, and it do not contains /// duplicated key-value. This is used for default CF. - default_reqs: HashMap, Request>, + default_reqs: HashMap, Modify>, /// Size of all `Request`s. 
unpacked_size: usize, - pending_raft_reqs: Vec, + pending_writes: Vec, } impl RequestCollector { - fn new(context: Context, max_raft_req_size: usize) -> Self { + fn new(max_raft_req_size: usize) -> Self { Self { - context, max_raft_req_size, write_reqs: HashMap::default(), default_reqs: HashMap::default(), unpacked_size: 0, - pending_raft_reqs: Vec::new(), + pending_writes: Vec::new(), } } @@ -113,41 +120,37 @@ impl RequestCollector { if k.is_empty() || (!is_delete && v.is_empty()) { return; } - let mut req = Request::default(); - if is_delete { - let mut del = DeleteRequest::default(); - del.set_key(k); - del.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Delete); - req.set_delete(del); + // Filter out not supported CF. + let cf = match cf { + CF_WRITE => CF_WRITE, + CF_DEFAULT => CF_DEFAULT, + _ => return, + }; + let m = if is_delete { + Modify::Delete(cf, Key::from_encoded(k)) } else { if cf == CF_WRITE && !write_needs_restore(&v) { return; } - let mut put = PutRequest::default(); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Put); - req.set_put(put); - } - self.accept(cf, req); + Modify::Put(cf, Key::from_encoded(k), v) + }; + self.accept(cf, m); } // we need to remove duplicate keys in here, since // in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 // will panic if found duplicated entry during Vec. 
- fn accept(&mut self, cf: &str, req: Request) { - let k = key_from_request(&req); + fn accept(&mut self, cf: &str, m: Modify) { + let k = m.key(); match cf { CF_WRITE => { - let (encoded_key, ts) = match Key::split_on_ts_for(k) { + let (encoded_key, ts) = match Key::split_on_ts_for(k.as_encoded()) { Ok(k) => k, Err(err) => { warn!( "key without ts, skipping"; - "key" => %log_wrappers::Value::key(k), + "key" => %k, "err" => %err ); return; @@ -159,19 +162,19 @@ impl RequestCollector { .map(|(_, old_ts)| *old_ts < ts.into_inner()) .unwrap_or(true) { - self.unpacked_size += req.compute_size() as usize; + self.unpacked_size += m.size(); if let Some((v, _)) = self .write_reqs - .insert(encoded_key.to_owned(), (req, ts.into_inner())) + .insert(encoded_key.to_owned(), (m, ts.into_inner())) { - self.unpacked_size -= v.get_cached_size() as usize; + self.unpacked_size -= v.size(); } } } CF_DEFAULT => { - self.unpacked_size += req.compute_size() as usize; - if let Some(v) = self.default_reqs.insert(k.to_owned(), req) { - self.unpacked_size -= v.get_cached_size() as usize; + self.unpacked_size += m.size(); + if let Some(v) = self.default_reqs.insert(k.as_encoded().clone(), m) { + self.unpacked_size -= v.size(); } } _ => unreachable!(), @@ -183,69 +186,61 @@ impl RequestCollector { } #[cfg(test)] - fn drain_unpacked_reqs(&mut self, cf: &str) -> Vec { - let res: Vec = if cf == CF_DEFAULT { - self.default_reqs.drain().map(|(_, req)| req).collect() + fn drain_unpacked_reqs(&mut self, cf: &str) -> Vec { + let res: Vec = if cf == CF_DEFAULT { + self.default_reqs.drain().map(|(_, m)| m).collect() } else { - self.write_reqs.drain().map(|(_, (req, _))| req).collect() + self.write_reqs.drain().map(|(_, (m, _))| m).collect() }; for r in &res { - self.unpacked_size -= r.get_cached_size() as usize; + self.unpacked_size -= r.size(); } res } #[inline] - fn drain_raft_reqs(&mut self, take_unpacked: bool) -> std::vec::Drain<'_, RaftCmdRequest> { + fn drain_pending_writes(&mut self, 
take_unpacked: bool) -> std::vec::Drain<'_, WriteData> { if take_unpacked { self.pack_all(); } - self.pending_raft_reqs.drain(..) + self.pending_writes.drain(..) } fn pack_all(&mut self) { if self.unpacked_size == 0 { return; } - let mut cmd = RaftCmdRequest::default(); - let mut header = make_request_header(self.context.clone()); // Set the UUID of header to prevent raftstore batching our requests. // The current `resolved_ts` observer assumes that each batch of request doesn't // has two writes to the same key. (Even with 2 different TS). That was true // for normal cases because the latches reject concurrency write to keys. // However we have bypassed the latch layer :( - header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); - cmd.set_header(header); let mut reqs: Vec<_> = self.write_reqs.drain().map(|(_, (req, _))| req).collect(); reqs.append(&mut self.default_reqs.drain().map(|(_, req)| req).collect()); if reqs.is_empty() { debug_assert!(false, "attempt to pack an empty request"); return; } - cmd.set_requests(reqs.into()); - - self.pending_raft_reqs.push(cmd); + let mut data = WriteData::from_modifies(reqs); + data.set_avoid_batch(true); + self.pending_writes.push(data); self.unpacked_size = 0; } #[inline] fn is_empty(&self) -> bool { - self.pending_raft_reqs.is_empty() && self.unpacked_size == 0 + self.pending_writes.is_empty() && self.unpacked_size == 0 } } -impl ImportSstService -where - E: KvEngine, - Router: 'static + RaftStoreRouter, -{ +impl ImportSstService { pub fn new( cfg: Config, raft_entry_max_size: ReadableSize, - router: Router, engine: E, + tablet_registry: E::Local, importer: Arc, - ) -> ImportSstService { + ) -> Self { let props = tikv_util::thread_group::current_properties(); let threads = tokio::runtime::Builder::new_multi_thread() .worker_threads(cfg.num_threads) @@ -271,15 +266,15 @@ where .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .create() .unwrap(); - 
importer.start_switch_mode_check(threads.handle(), engine.clone()); + importer.start_switch_mode_check(threads.handle(), tablet_registry.clone()); threads.spawn(Self::tick(importer.clone())); ImportSstService { cfg, - engine, + tablet_registry, threads: Arc::new(threads), block_threads: Arc::new(block_threads), - router, + engine, importer, limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), @@ -306,46 +301,36 @@ where Ok(slots.remove(&p)) } - async fn async_snapshot( - router: Router, - header: RaftRequestHeader, - ) -> std::result::Result, errorpb::Error> { - let mut req = Request::default(); - req.set_cmd_type(CmdType::Snap); - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.set_requests(vec![req].into()); - let (cb, future) = paired_future_callback(); - if let Err(e) = router.send_command(cmd, Callback::read(cb), RaftCmdExtraOpts::default()) { - return Err(e.into()); - } - let mut res = future.await.map_err(|_| { - let mut err = errorpb::Error::default(); - let err_str = "too many sst files are ingesting"; - let mut server_is_busy_err = errorpb::ServerIsBusy::default(); - server_is_busy_err.set_reason(err_str.to_string()); - err.set_message(err_str.to_string()); - err.set_server_is_busy(server_is_busy_err); - err - })?; - let mut header = res.response.take_header(); - if header.has_error() { - return Err(header.take_error()); + fn async_snapshot( + engine: &mut E, + context: &Context, + ) -> impl Future> { + let res = engine.async_snapshot(SnapContext { + pb_ctx: context, + ..Default::default() + }); + async move { + res.await.map_err(|e| { + let err: storage::Error = e.into(); + if let Some(e) = extract_region_error_from_error(&err) { + e + } else { + let mut e = errorpb::Error::default(); + e.set_message(format!("{}", err)); + e + } + }) } - Ok(SnapshotResult { - snapshot: res.snapshot.unwrap(), - term: header.get_current_term(), - }) } fn check_write_stall(&self) -> Option { if 
self.importer.get_mode() == SwitchMode::Normal && self - .engine + .tablet_registry .ingest_maybe_slowdown_writes(CF_WRITE) .expect("cf") { - match self.engine.get_sst_key_ranges(CF_WRITE, 0) { + match self.tablet_registry.get_sst_key_ranges(CF_WRITE, 0) { Ok(l0_sst_ranges) => { warn!( "sst ingest is too slow"; @@ -368,14 +353,13 @@ where } fn ingest_files( - &self, - context: Context, + &mut self, + mut context: Context, label: &'static str, ssts: Vec, ) -> impl Future> { - let header = make_request_header(context); - let snapshot_res = Self::async_snapshot(self.router.clone(), header.clone()); - let router = self.router.clone(); + let snapshot_res = Self::async_snapshot(&mut self.engine, &context); + let engine = self.engine.clone(); let importer = self.importer.clone(); async move { // check api version @@ -394,17 +378,6 @@ where }; fail_point!("import::sst_service::ingest"); - // Make ingest command. - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.mut_header().set_term(res.term); - for sst in ssts.iter() { - let mut ingest = Request::default(); - ingest.set_cmd_type(CmdType::IngestSst); - ingest.mut_ingest_sst().set_sst(sst.clone()); - cmd.mut_requests().push(ingest); - } - // Here we shall check whether the file has been ingested before. This operation // must execute after geting a snapshot from raftstore to make sure that the // current leader has applied to current term. 
@@ -423,20 +396,31 @@ where return Ok(resp); } } + let modifies = ssts + .iter() + .map(|s| Modify::Ingest(Box::new(s.clone()))) + .collect(); + context.set_term(res.ext().get_term().unwrap().into()); + let region_id = context.get_region_id(); + let res = engine.async_write( + &context, + WriteData::from_modifies(modifies), + WriteEvent::BASIC_EVENT, + None, + ); - let (cb, future) = paired_future_callback(); - if let Err(e) = - router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) - { - resp.set_error(e.into()); - return Ok(resp); - } - - let mut res = future.await.map_err(Error::from)?; - let mut header = res.response.take_header(); - if header.has_error() { - pb_error_inc(label, header.get_error()); - resp.set_error(header.take_error()); + let mut resp = IngestResponse::default(); + if let Err(e) = wait_write(res).await { + if let Some(e) = extract_region_error_from_error(&e) { + pb_error_inc(label, &e); + resp.set_error(e); + } else { + IMPORTER_ERROR_VEC + .with_label_values(&[label, "unknown"]) + .inc(); + resp.mut_error() + .set_message(format!("[region {}] ingest failed: {:?}", region_id, e)); + } } Ok(resp) } @@ -445,33 +429,14 @@ where async fn apply_imp( mut req: ApplyRequest, importer: Arc, - router: Router, + engine: E, limiter: Limiter, max_raft_size: usize, ) -> std::result::Result, ImportPbError> { - type RaftWriteFuture = futures::channel::oneshot::Receiver; - async fn handle_raft_write(fut: RaftWriteFuture) -> std::result::Result<(), ImportPbError> { - match fut.await { - Err(e) => { - let msg = format!("failed to complete raft command: {}", e); - let mut e = ImportPbError::default(); - e.set_message(msg); - return Err(e); - } - Ok(mut r) if r.response.get_header().has_error() => { - let mut e = ImportPbError::default(); - e.set_message("failed to complete raft command".to_string()); - e.set_store_error(r.response.take_header().take_error()); - return Err(e); - } - _ => {} - } - Ok(()) - } - let mut range: Option = None; - let 
mut collector = RequestCollector::new(req.take_context(), max_raft_size * 7 / 8); + let mut collector = RequestCollector::new(max_raft_size * 7 / 8); + let context = req.take_context(); let mut metas = req.take_metas(); let mut rules = req.take_rewrite_rules(); // For compatibility with old requests. @@ -485,7 +450,7 @@ where false, ); - let mut inflight_futures: VecDeque = VecDeque::new(); + let mut inflight_futures = VecDeque::new(); let mut tasks = metas.iter().zip(rules.iter()).peekable(); while let Some((meta, rule)) = tasks.next() { @@ -513,25 +478,19 @@ where } let is_last_task = tasks.peek().is_none(); - for req in collector.drain_raft_reqs(is_last_task) { - while inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { - handle_raft_write(inflight_futures.pop_front().unwrap()).await?; - } - let (cb, future) = paired_future_callback(); - match router.send_command(req, Callback::write(cb), RaftCmdExtraOpts::default()) { - Ok(_) => inflight_futures.push_back(future), - Err(e) => { - let msg = format!("failed to send raft command: {}", e); - let mut e = ImportPbError::default(); - e.set_message(msg); - return Err(e); - } + for req in collector.drain_pending_writes(is_last_task) { + let f = engine.async_write(&context, req, WriteEvent::BASIC_EVENT, None); + inflight_futures.push_back(f); + if inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { + wait_write(inflight_futures.pop_front().unwrap()) + .await + .map_err(transfer_error)?; } } } assert!(collector.is_empty()); - for fut in inflight_futures { - handle_raft_write(fut).await?; + for f in inflight_futures { + wait_write(f).await.map_err(transfer_error)?; } Ok(range) @@ -548,7 +507,7 @@ macro_rules! 
impl_write { sink: ClientStreamingSink<$resp_ty>, ) { let import = self.importer.clone(); - let engine = self.engine.clone(); + let tablet_registry = self.tablet_registry.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -566,7 +525,7 @@ macro_rules! impl_write { _ => return Err(Error::InvalidChunk), }; - let writer = match import.$writer_fn(&engine, meta) { + let writer = match import.$writer_fn(&tablet_registry, meta) { Ok(w) => w, Err(e) => { error!("build writer failed {:?}", e); @@ -600,11 +559,7 @@ macro_rules! impl_write { }; } -impl ImportSst for ImportSstService -where - E: KvEngine, - Router: 'static + RaftStoreRouter, -{ +impl ImportSst for ImportSstService { fn switch_mode( &mut self, ctx: RpcContext<'_>, @@ -620,8 +575,12 @@ where } match req.get_mode() { - SwitchMode::Normal => self.importer.enter_normal_mode(self.engine.clone(), mf), - SwitchMode::Import => self.importer.enter_import_mode(self.engine.clone(), mf), + SwitchMode::Normal => self + .importer + .enter_normal_mode(self.tablet_registry.clone(), mf), + SwitchMode::Import => self + .importer + .enter_import_mode(self.tablet_registry.clone(), mf), } }; match res { @@ -721,7 +680,7 @@ where let label = "apply"; let start = Instant::now(); let importer = self.importer.clone(); - let router = self.router.clone(); + let engine = self.engine.clone(); let limiter = self.limiter.clone(); let max_raft_size = self.raft_entry_max_size.0 as usize; @@ -733,7 +692,7 @@ where let mut resp = ApplyResponse::default(); - match Self::apply_imp(req, importer, router, limiter, max_raft_size).await { + match Self::apply_imp(req, importer, engine, limiter, max_raft_size).await { Ok(Some(r)) => resp.set_range(r), Err(e) => resp.set_error(e), _ => {} @@ -756,7 +715,7 @@ where let timer = Instant::now_coarse(); let importer = Arc::clone(&self.importer); let limiter = self.limiter.clone(); - let engine = self.engine.clone(); + let 
tablet_registry = self.tablet_registry.clone(); let start = Instant::now(); let handle_task = async move { @@ -775,14 +734,14 @@ where .into_option() .filter(|c| c.cipher_type != EncryptionMethod::Plaintext); - let res = importer.download_ext::( + let res = importer.download_ext::( req.get_sst(), req.get_storage_backend(), req.get_name(), req.get_rewrite_rule(), cipher, limiter, - engine, + tablet_registry, DownloadExt::default() .cache_key(req.get_storage_cache_id()) .req_type(req.get_request_type()), @@ -906,7 +865,7 @@ where ) { let label = "compact"; let timer = Instant::now_coarse(); - let engine = self.engine.clone(); + let tablet_registry = self.tablet_registry.clone(); let handle_task = async move { let (start, end) = if !req.has_range() { @@ -923,7 +882,7 @@ where Some(req.get_output_level()) }; - let res = engine.compact_files_in_range(start, end, output_level); + let res = tablet_registry.compact_files_in_range(start, end, output_level); match res { Ok(_) => info!( "compact files in range"; @@ -984,7 +943,6 @@ where let label = "duplicate_detect"; let timer = Instant::now_coarse(); let context = request.take_context(); - let router = self.router.clone(); let start_key = request.take_start_key(); let min_commit_ts = request.get_min_commit_ts(); let end_key = if request.get_end_key().is_empty() { @@ -993,11 +951,11 @@ where Some(request.take_end_key()) }; let key_only = request.get_key_only(); - let snap_res = Self::async_snapshot(router, make_request_header(context)); + let snap_res = Self::async_snapshot(&mut self.engine, &context); let handle_task = async move { let res = snap_res.await; let snapshot = match res { - Ok(snap) => snap.snapshot, + Ok(snap) => snap, Err(e) => { let mut resp = DuplicateDetectResponse::default(); pb_error_inc(label, &e); @@ -1078,25 +1036,6 @@ fn pb_error_inc(type_: &str, e: &errorpb::Error) { IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); } -fn key_from_request(req: &Request) -> &[u8] { - if req.has_put() { - 
return req.get_put().get_key(); - } - if req.has_delete() { - return req.get_delete().get_key(); - } - panic!("trying to extract key from request is neither put nor delete.") -} - -fn make_request_header(mut context: Context) -> RaftRequestHeader { - let region_id = context.get_region_id(); - let mut header = RaftRequestHeader::default(); - header.set_peer(context.take_peer()); - header.set_region_id(region_id); - header.set_region_epoch(context.take_region_epoch()); - header -} - fn write_needs_restore(write: &[u8]) -> bool { let w = WriteRef::parse(write); match w { @@ -1127,10 +1066,10 @@ mod test { use std::collections::HashMap; use engine_traits::{CF_DEFAULT, CF_WRITE}; - use kvproto::{kvrpcpb::Context, raft_cmdpb::*}; + use tikv_kv::Modify; use txn_types::{Key, TimeStamp, Write, WriteType}; - use crate::import::sst_service::{key_from_request, RequestCollector}; + use crate::import::sst_service::RequestCollector; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1143,45 +1082,18 @@ mod test { (k.into_encoded(), val.to_owned()) } - fn default_req(key: &[u8], val: &[u8], start_ts: u64) -> Request { + fn default_req(key: &[u8], val: &[u8], start_ts: u64) -> Modify { let (k, v) = default(key, val, start_ts); - req(k, v, CF_DEFAULT, CmdType::Put) + Modify::Put(CF_DEFAULT, Key::from_encoded(k), v) } - fn write_req(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> Request { + fn write_req(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> Modify { let (k, v) = write(key, ty, commit_ts, start_ts); - let cmd_type = if ty == WriteType::Delete { - CmdType::Delete + if ty == WriteType::Delete { + Modify::Delete(CF_WRITE, Key::from_encoded(k)) } else { - CmdType::Put - }; - - req(k, v, CF_WRITE, cmd_type) - } - - fn req(k: Vec, v: Vec, cf: &str, cmd_type: CmdType) -> Request { - let mut req = Request::default(); - req.set_cmd_type(cmd_type); - - match 
cmd_type { - CmdType::Put => { - let mut put = PutRequest::default(); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - - req.set_put(put) - } - CmdType::Delete => { - let mut del = DeleteRequest::default(); - del.set_cf(cf.to_string()); - del.set_key(k); - - req.set_delete(del); - } - _ => panic!("invalid input cmd_type"), + Modify::Put(CF_WRITE, Key::from_encoded(k), v) } - req } #[test] @@ -1191,27 +1103,30 @@ mod test { cf: &'static str, is_delete: bool, mutations: Vec<(Vec, Vec)>, - expected_reqs: Vec, + expected_reqs: Vec, } fn run_case(c: &Case) { - let mut collector = RequestCollector::new(Context::new(), 1024); + let mut collector = RequestCollector::new(1024); for (k, v) in c.mutations.clone() { collector.accept_kv(c.cf, c.is_delete, k, v); } - let reqs = collector.drain_raft_reqs(true); + let reqs = collector.drain_pending_writes(true); let mut req1: HashMap<_, _> = reqs .into_iter() - .flat_map(|mut x| x.take_requests().into_iter()) + .flat_map(|x| { + assert!(x.avoid_batch); + x.modifies.into_iter() + }) .map(|req| { - let key = key_from_request(&req).to_owned(); + let key = req.key().to_owned(); (key, req) }) .collect(); for req in c.expected_reqs.iter() { - let r = req1.remove(key_from_request(req)); + let r = req1.remove(req.key()); assert_eq!(r.as_ref(), Some(req), "{:?}", c); } assert!(req1.is_empty(), "{:?}\ncase = {:?}", req1, c); @@ -1284,7 +1199,7 @@ mod test { #[test] fn test_request_collector_with_write_cf() { - let mut request_collector = RequestCollector::new(Context::new(), 102400); + let mut request_collector = RequestCollector::new(102400); let reqs = vec![ write_req(b"foo", WriteType::Put, 40, 39), write_req(b"aar", WriteType::Put, 38, 37), @@ -1301,18 +1216,14 @@ mod test { request_collector.accept(CF_WRITE, req); } let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_WRITE); - reqs.sort_by(|r1, r2| { - let k1 = key_from_request(r1); - let k2 = key_from_request(r2); - k1.cmp(k2) - }); + reqs.sort_by(|r1, 
r2| r1.key().cmp(r2.key())); assert_eq!(reqs, reqs_result); assert!(request_collector.is_empty()); } #[test] fn test_request_collector_with_default_cf() { - let mut request_collector = RequestCollector::new(Context::new(), 102400); + let mut request_collector = RequestCollector::new(102400); let reqs = vec![ default_req(b"foo", b"", 39), default_req(b"zzz", b"", 40), @@ -1330,10 +1241,8 @@ mod test { } let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_DEFAULT); reqs.sort_by(|r1, r2| { - let k1 = key_from_request(r1); - let (k1, ts1) = Key::split_on_ts_for(k1).unwrap(); - let k2 = key_from_request(r2); - let (k2, ts2) = Key::split_on_ts_for(k2).unwrap(); + let (k1, ts1) = Key::split_on_ts_for(r1.key().as_encoded()).unwrap(); + let (k2, ts2) = Key::split_on_ts_for(r2.key().as_encoded()).unwrap(); k1.cmp(k2).then(ts1.cmp(&ts2)) }); diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 106b36f61ad..87ab5c10575 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -1361,6 +1361,7 @@ pub mod test_gc_worker { let bytes = keys::data_end_key(key2.as_encoded()); *key2 = Key::from_encoded(bytes); } + Modify::Ingest(_) => unimplemented!(), } } write_modifies(&self.kv_engine().unwrap(), modifies) @@ -1388,6 +1389,7 @@ pub mod test_gc_worker { *start_key = Key::from_encoded(keys::data_key(start_key.as_encoded())); *end_key = Key::from_encoded(keys::data_end_key(end_key.as_encoded())); } + Modify::Ingest(_) => unimplemented!(), }); self.0.async_write(ctx, batch, subscribed, on_applied) } diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 0f0d8fa5689..e175fa502f8 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -393,6 +393,9 @@ where let bytes = keys::data_end_key(key2.as_encoded()); *key2 = Key::from_encoded(bytes); } + Modify::Ingest(_) => { + return Err(box_err!("ingest sst is not supported in local engine")); + } } } } @@ -449,6 +452,9 @@ where let 
reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); let txn_extra = batch.extra; let mut header = new_request_header(ctx); + if batch.avoid_batch { + header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); + } let mut flags = 0; if txn_extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 28f2a1d5d25..9fb4ef70b03 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -69,6 +69,8 @@ impl Stream for Transform { fn modifies_to_simple_write(modifies: Vec) -> SimpleWriteBinary { let mut encoder = SimpleWriteEncoder::with_capacity(128); + let modifies_len = modifies.len(); + let mut ssts = vec![]; for m in modifies { match m { Modify::Put(cf, k, v) => encoder.put(cf, k.as_encoded(), &v), @@ -82,8 +84,17 @@ fn modifies_to_simple_write(modifies: Vec) -> SimpleWriteBinary { end_key.as_encoded(), notify_only, ), + Modify::Ingest(sst) => { + if ssts.capacity() == 0 { + ssts.reserve(modifies_len); + } + ssts.push(*sst); + } } } + if !ssts.is_empty() { + encoder.ingest(ssts); + } encoder.encode() } @@ -228,7 +239,10 @@ impl tikv_kv::Engine for RaftKv2 { header.set_flags(flags); self.schedule_txn_extra(batch.extra); - let data = modifies_to_simple_write(batch.modifies); + let mut data = modifies_to_simple_write(batch.modifies); + if batch.avoid_batch { + data.freeze(); + } let mut builder = CmdResChannelBuilder::default(); if WriteEvent::subscribed_proposed(subscribed) { builder.subscribe_proposed(); diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index 588e8ae9e9b..b9cc956d40e 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -17,6 +17,7 @@ use raftstore::{ use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; use resource_metering::CollectorRegHandle; use slog::{info, o, Logger}; +use sst_importer::SstImporter; use tikv_util::{ config::VersionTrack, worker::{LazyWorker, Worker}, @@ -102,6 
+103,7 @@ where pd_worker: LazyWorker, store_cfg: Arc>, state: &Mutex, + sst_importer: Arc, ) -> Result<()> where T: Transport + 'static, @@ -140,6 +142,7 @@ where background, pd_worker, store_cfg, + sst_importer, )?; Ok(()) @@ -201,6 +204,7 @@ where background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, + sst_importer: Arc, ) -> Result<()> where T: Transport + 'static, @@ -232,6 +236,7 @@ where collector_reg_handle, background, pd_worker, + sst_importer, )?; Ok(()) } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index e530cc56577..d8f31ba77a8 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1015,6 +1015,7 @@ pub mod tests { wb.delete_range_cf(cf, &k1, &k2).unwrap(); } } + Modify::Ingest(_) => unimplemented!(), } } wb.write().unwrap(); diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 30dd3b120ca..4a981bdfa53 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1207,7 +1207,7 @@ fn test_double_run_node() { let snap_mgr = SnapManager::new(tmp.path().to_str().unwrap()); let coprocessor_host = CoprocessorHost::new(router, raftstore::coprocessor::Config::default()); let importer = { - let dir = Path::new(engines.kv.path()).join("import-sst"); + let dir = Path::new(MiscExt::path(&engines.kv)).join("import-sst"); Arc::new(SstImporter::new(&ImportConfig::default(), dir, None, ApiVersion::V1).unwrap()) }; let (split_check_scheduler, _) = dummy_scheduler(); From 76bbf9504eabc80c142167b41a691560ae813938 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Mar 2023 14:47:08 +0800 Subject: [PATCH 554/676] raft_client: report SnapshotStatus for witness (#14267) close tikv/tikv#14228 raft_client: report SnapshotStatus for witness Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- src/server/raft_client.rs | 6 +- tests/integrations/raftstore/test_witness.rs | 60 +++++++++++++++++++- 2 files 
changed, 62 insertions(+), 4 deletions(-) diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index fa12600bb98..17de1d3365d 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -461,10 +461,14 @@ where snapshot .merge_from_bytes(msg.get_message().get_snapshot().get_data()) .unwrap(); - // Witness's snapshot must be empty, no need to send snapshot files + // Witness's snapshot must be empty, no need to send snapshot files, report + // immediately if !snapshot.get_meta().get_for_witness() { self.send_snapshot_sock(msg); continue; + } else { + let rep = self.new_snapshot_reporter(&msg); + rep.report(SnapshotStatus::Finish); } } self.buffer.push(msg); diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index d5a9992bc3a..d4332403cea 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -1,6 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{iter::FromIterator, sync::Arc, time::Duration}; +use std::{ + iter::FromIterator, + sync::{Arc, Mutex}, + time::Duration, +}; use collections::HashMap; use futures::executor::block_on; @@ -10,9 +14,13 @@ use kvproto::{ raft_serverpb::{PeerState, RaftApplyState}, }; use pd_client::PdClient; -use raft::eraftpb::ConfChangeType; +use raft::eraftpb::{ConfChangeType, MessageType}; use test_raftstore::*; -use tikv_util::{config::ReadableDuration, store::find_peer}; +use tikv_util::{ + config::ReadableDuration, + store::{find_peer, new_witness_peer}, + HandyRwLock, +}; // Test the case that region split or merge with witness peer #[test] @@ -598,3 +606,49 @@ fn test_witness_ignore_consistency_check() { std::thread::sleep(Duration::from_millis(10)); } } + +// Test the case that witness apply snapshot with network isolation +#[test] +fn test_witness_apply_snapshot_with_network_isolation() { + let mut cluster = new_server_cluster(0, 3); + configure_for_snapshot(&mut cluster.cfg); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + pd_client.must_add_peer(r1, new_peer(2, 2)); + pd_client.must_add_peer(r1, new_witness_peer(3, 3)); + // Ensure all peers are initialized. + std::thread::sleep(Duration::from_millis(100)); + + cluster.must_transfer_leader(1, new_peer(1, 1)); + + cluster.add_send_filter(IsolationFilterFactory::new(3)); + + for i in 0..20 { + cluster.must_put(format!("k{}", i).as_bytes(), b"v1"); + } + sleep_ms(500); + + // Ignore witness's MsgAppendResponse, after applying snaphost + let dropped_msgs = Arc::new(Mutex::new(Vec::new())); + let recv_filter = Box::new( + RegionPacketFilter::new(r1, 1) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppendResponse) + .reserve_dropped(Arc::clone(&dropped_msgs)), + ); + cluster.sim.wl().add_recv_filter(1, recv_filter); + + cluster.clear_send_filters(); + // Wait for leader send snapshot. 
+ sleep_ms(500); + + cluster.sim.wl().clear_recv_filters(1); + + // Witness's ProgressState must have been changed to Probe + cluster.must_transfer_leader(1, new_peer(2, 2)); + + for i in 20..25 { + cluster.must_put(format!("k{}", i).as_bytes(), b"v1"); + } +} From f3cb8ed3e81a1cbc112126dd2806a0213a434eaf Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 28 Feb 2023 23:33:09 -0800 Subject: [PATCH 555/676] tablet_flow_control polish (#14232) ref tikv/tikv#12842 1) Remove the suffix parameter in tablet flow control code as it's not needed anymore by using the FlowControlFactorStore. 2) Before this change, it the tablet flow control may not work properly because of unnecessarily checking the suffix value. 3) Fix the scheduler flow's metrics for multi-rocksdb Signed-off-by: qi.xu Co-authored-by: qi.xu --- components/engine_rocks/src/flow_listener.rs | 27 +- src/server/engine_factory.rs | 12 +- .../singleton_flow_controller.rs | 329 +++++------------- .../flow_controller/tablet_flow_controller.rs | 160 +++++---- 4 files changed, 203 insertions(+), 325 deletions(-) diff --git a/components/engine_rocks/src/flow_listener.rs b/components/engine_rocks/src/flow_listener.rs index f36b5393f7a..4a4f80cc46f 100644 --- a/components/engine_rocks/src/flow_listener.rs +++ b/components/engine_rocks/src/flow_listener.rs @@ -7,21 +7,20 @@ use rocksdb::{CompactionJobInfo, EventListener, FlushJobInfo, IngestionInfo}; #[derive(Clone)] pub enum FlowInfo { - L0(String, u64, u64, u64), - L0Intra(String, u64, u64, u64), - Flush(String, u64, u64, u64), - Compaction(String, u64, u64), + L0(String, u64, u64), + L0Intra(String, u64, u64), + Flush(String, u64, u64), + Compaction(String, u64), BeforeUnsafeDestroyRange(u64), AfterUnsafeDestroyRange(u64), - Created(u64, u64), - Destroyed(u64, u64), + Created(u64), + Destroyed(u64), } #[derive(Clone)] pub struct FlowListener { flow_info_sender: Arc>>, region_id: u64, - suffix_id: u64, } impl FlowListener { @@ -29,15 +28,13 @@ impl FlowListener { Self 
{ flow_info_sender: Arc::new(Mutex::new(flow_info_sender)), region_id: 0, - suffix_id: 0, } } - pub fn clone_with(&self, region_id: u64, suffix_id: u64) -> Self { + pub fn clone_with(&self, region_id: u64) -> Self { Self { flow_info_sender: self.flow_info_sender.clone(), region_id, - suffix_id, } } @@ -46,7 +43,7 @@ impl FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Created(self.region_id, self.suffix_id)); + .send(FlowInfo::Created(self.region_id)); } pub fn on_destroyed(&self) { @@ -54,7 +51,7 @@ impl FlowListener { .flow_info_sender .lock() .unwrap() - .send(FlowInfo::Destroyed(self.region_id, self.suffix_id)); + .send(FlowInfo::Destroyed(self.region_id)); } } @@ -67,7 +64,6 @@ impl EventListener for FlowListener { info.cf_name().to_owned(), total, self.region_id, - self.suffix_id, )); } @@ -81,7 +77,6 @@ impl EventListener for FlowListener { info.cf_name().to_owned(), total, self.region_id, - self.suffix_id, )); } else { // ingestion may change the pending bytes. 
@@ -92,7 +87,6 @@ impl EventListener for FlowListener { .send(FlowInfo::Compaction( info.cf_name().to_owned(), self.region_id, - self.suffix_id, )); } } @@ -138,7 +132,6 @@ impl EventListener for FlowListener { info.cf_name().to_owned(), diff, self.region_id, - self.suffix_id, )); } else { let l0_input_file_at_input_level = @@ -162,7 +155,6 @@ impl EventListener for FlowListener { info.cf_name().to_owned(), read_bytes, self.region_id, - self.suffix_id, )); } } @@ -174,7 +166,6 @@ impl EventListener for FlowListener { .send(FlowInfo::Compaction( info.cf_name().to_owned(), self.region_id, - self.suffix_id, )); } } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index eb49775e5c1..f50afe4bc44 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -192,8 +192,8 @@ impl TabletFactory for KvEngineFactory { let tablet_name = path.file_name().unwrap().to_str().unwrap().to_string(); db_opts.set_info_log(TabletLogger::new(tablet_name)); let cf_opts = self.cf_opts(EngineType::RaftKv2); - if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { - db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); + if let Some(listener) = &self.inner.flow_listener { + db_opts.add_event_listener(listener.clone_with(ctx.id)); } if let Some(storage) = &self.inner.state_storage && let Some(flush_state) = ctx.flush_state { @@ -209,8 +209,8 @@ impl TabletFactory for KvEngineFactory { engine_rocks::util::new_engine_opt(path.to_str().unwrap(), db_opts, cf_opts); if let Err(e) = &kv_engine { error!("failed to create tablet"; "id" => ctx.id, "suffix" => ?ctx.suffix, "path" => %path.display(), "err" => ?e); - } else if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { - listener.clone_with(ctx.id, suffix).on_created(); + } else if let Some(listener) = &self.inner.flow_listener { + listener.clone_with(ctx.id).on_created(); } kv_engine } @@ -227,8 +227,8 @@ impl TabletFactory for 
KvEngineFactory { // kv_cfs_opts, // )?; let _ = std::fs::remove_dir_all(path); - if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { - listener.clone_with(ctx.id, suffix).on_destroyed(); + if let Some(listener) = &self.inner.flow_listener { + listener.clone_with(ctx.id).on_destroyed(); } Ok(()) } diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index 801d3d27280..f51249facfc 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -505,8 +505,7 @@ pub(super) struct FlowChecker { wait_for_destroy_range_finish: bool, region_id: u64, - #[getset(get_copy = "pub", set = "pub")] - tablet_suffix: u64, + rc: AtomicU32, } impl FlowChecker { @@ -516,12 +515,11 @@ impl FlowChecker { discard_ratio: Arc, limiter: Arc, ) -> Self { - Self::new_with_region_id(0, 0, config, engine, discard_ratio, limiter) + Self::new_with_region_id(0, config, engine, discard_ratio, limiter) } pub fn new_with_region_id( region_id: u64, - tablet_suffix: u64, config: &FlowControlConfig, engine: E, discard_ratio: Arc, @@ -535,7 +533,6 @@ impl FlowChecker { Self { region_id, - tablet_suffix, soft_pending_compaction_bytes_limit: config.soft_pending_compaction_bytes_limit.0, hard_pending_compaction_bytes_limit: config.hard_pending_compaction_bytes_limit.0, memtables_threshold: config.memtables_threshold, @@ -549,6 +546,7 @@ impl FlowChecker { last_record_time: Instant::now_coarse(), last_speed: 0.0, wait_for_destroy_range_finish: false, + rc: AtomicU32::new(1), } } @@ -653,7 +651,11 @@ impl FlowChecker { let msg = flow_info_receiver.recv_deadline(deadline); if let Err(RecvTimeoutError::Timeout) = msg { - checker.update_statistics(); + let (rate, cf_throttle_flags) = checker.update_statistics(); + for (cf, val) in cf_throttle_flags { + SCHED_THROTTLE_CF_GAUGE.with_label_values(&[cf]).set(val); + } + 
SCHED_WRITE_FLOW_GAUGE.set(rate as i64); deadline = std::time::Instant::now() + TICK_DURATION; } else { checker.on_flow_info_msg(enabled, msg); @@ -684,26 +686,25 @@ impl FlowChecker { self.discard_ratio.store(0, Ordering::Relaxed); } - pub fn update_statistics(&mut self) { + pub fn update_statistics(&mut self) -> (f64, HashMap<&str, i64>) { + let mut cf_throttle_flags = HashMap::default(); if let Some(throttle_cf) = self.throttle_cf.as_ref() { - SCHED_THROTTLE_CF_GAUGE - .with_label_values(&[throttle_cf]) - .set(1); + cf_throttle_flags.insert(throttle_cf.as_str(), 1); for cf in self.cf_checkers.keys() { if cf != throttle_cf { - SCHED_THROTTLE_CF_GAUGE.with_label_values(&[cf]).set(0); + cf_throttle_flags.insert(cf.as_str(), 0); } } } else { for cf in self.cf_checkers.keys() { - SCHED_THROTTLE_CF_GAUGE.with_label_values(&[cf]).set(0); + cf_throttle_flags.insert(cf.as_str(), 0); } } // calculate foreground write flow let dur = self.last_record_time.saturating_elapsed_secs(); if dur < f64::EPSILON { - return; + return (0.0, cf_throttle_flags); } let rate = self.limiter.total_bytes_consumed() as f64 / dur; // don't record those write rate of 0. 
@@ -713,10 +714,11 @@ impl FlowChecker { if self.limiter.total_bytes_consumed() != 0 { self.write_flow_recorder.observe(rate as u64); } - SCHED_WRITE_FLOW_GAUGE.set(rate as i64); + self.last_record_time = Instant::now_coarse(); self.limiter.reset_statistics(); + (rate, cf_throttle_flags) } fn on_pending_compaction_bytes_change(&mut self, cf: String) { @@ -1005,6 +1007,14 @@ impl FlowChecker { }); self.limiter.set_speed_limit(throttle) } + + pub fn inc(&self) -> u32 { + self.rc.fetch_add(1, Ordering::SeqCst) + } + + pub fn dec(&self) -> u32 { + self.rc.fetch_sub(1, Ordering::SeqCst) + } } #[cfg(test)] @@ -1101,7 +1111,6 @@ pub(super) mod tests { stub: &EngineStub, tx: &mpsc::SyncSender, region_id: u64, - tablet_suffix: u64, ) { assert_eq!(flow_controller.consume(0, 2000), Duration::ZERO); loop { @@ -1121,95 +1130,45 @@ pub(super) mod tests { // exceeds the threshold on start stub.0.num_memtables.store(8, Ordering::Relaxed); - tx.send(FlowInfo::Flush( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); // on start check forbids flow control assert_eq!(flow_controller.is_unlimited(region_id), true); // once falls below the threshold, pass the on start check stub.0.num_memtables.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); // not throttle when the average of the sliding window 
doesn't exceeds the // threshold stub.0.num_memtables.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), true); // the average of sliding window exceeds the threshold stub.0.num_memtables.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), false); assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); // not throttle once the number of memtables falls below the threshold stub.0.num_memtables.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), true); } @@ -1220,7 +1179,7 @@ pub(super) mod tests { let flow_controller = EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); let 
flow_controller = FlowController::Singleton(flow_controller); - test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, 0, 0); + test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, 0); } pub fn test_flow_controller_l0_impl( @@ -1228,7 +1187,6 @@ pub(super) mod tests { stub: &EngineStub, tx: &mpsc::SyncSender, region_id: u64, - tablet_suffix: u64, ) { assert_eq!(flow_controller.consume(region_id, 2000), Duration::ZERO); loop { @@ -1240,56 +1198,26 @@ pub(super) mod tests { // exceeds the threshold stub.0.num_l0_files.store(30, Ordering::Relaxed); - tx.send(FlowInfo::L0( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); // on start check forbids flow control assert_eq!(flow_controller.is_unlimited(region_id), true); // once fall below the threshold, pass the on start check stub.0.num_l0_files.store(10, Ordering::Relaxed); - tx.send(FlowInfo::L0( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); // exceeds the threshold, throttle now stub.0.num_l0_files.store(30, Ordering::Relaxed); - tx.send(FlowInfo::L0( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, 
region_id)) + .unwrap(); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), false); assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); @@ -1302,7 +1230,7 @@ pub(super) mod tests { let flow_controller = EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); let flow_controller = FlowController::Singleton(flow_controller); - test_flow_controller_l0_impl(&flow_controller, &stub, &tx, 0, 0); + test_flow_controller_l0_impl(&flow_controller, &stub, &tx, 0); } pub fn test_flow_controller_pending_compaction_bytes_impl( @@ -1310,25 +1238,15 @@ pub(super) mod tests { stub: &EngineStub, tx: &mpsc::SyncSender, region_id: u64, - tablet_suffix: u64, ) { // exceeds the threshold stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); // on start check forbids flow control assert!( flow_controller.discard_ratio(region_id) < f64::EPSILON, @@ -1339,60 +1257,33 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - 
tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); // pending compaction bytes jump after unsafe destroy range tx.send(FlowInfo::BeforeUnsafeDestroyRange(region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id, 0)) + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); @@ -1400,39 +1291,21 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 .pending_compaction_bytes .store(10000000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); + 
tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); tx.send(FlowInfo::AfterUnsafeDestroyRange(region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!( flow_controller.discard_ratio(region_id) < f64::EPSILON, "discard_ratio {}", @@ -1443,37 +1316,19 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 .pending_compaction_bytes .store(1000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction( - "default".to_string(), - region_id, - tablet_suffix, - )) - .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); } @@ -1484,7 +1339,7 @@ pub(super) mod tests { let flow_controller = EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); let flow_controller = FlowController::Singleton(flow_controller); - test_flow_controller_pending_compaction_bytes_impl(&flow_controller, &stub, &tx, 0, 0); + test_flow_controller_pending_compaction_bytes_impl(&flow_controller, &stub, &tx, 0); } #[test] diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs 
b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 556b5f4a8fa..d4590b90acc 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -20,7 +20,7 @@ use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; use super::singleton_flow_controller::{ FlowChecker, FlowControlFactorStore, Msg, RATIO_SCALE_FACTOR, TICK_DURATION, }; -use crate::storage::config::FlowControlConfig; +use crate::storage::{config::FlowControlConfig, metrics::*}; pub struct TabletFlowFactorStore { registry: TabletRegistry, @@ -156,15 +156,12 @@ impl FlowInfoDispatcher { let msg = flow_info_receiver.recv_deadline(deadline); match msg.clone() { - Ok(FlowInfo::L0(_cf, _, region_id, suffix)) - | Ok(FlowInfo::L0Intra(_cf, _, region_id, suffix)) - | Ok(FlowInfo::Flush(_cf, _, region_id, suffix)) - | Ok(FlowInfo::Compaction(_cf, region_id, suffix)) => { + Ok(FlowInfo::L0(_cf, _, region_id)) + | Ok(FlowInfo::L0Intra(_cf, _, region_id)) + | Ok(FlowInfo::Flush(_cf, _, region_id)) + | Ok(FlowInfo::Compaction(_cf, region_id)) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); if let Some(checker) = checkers.get_mut(®ion_id) { - if checker.tablet_suffix() != suffix { - continue; - } checker.on_flow_info_msg(enabled, msg); } } @@ -175,10 +172,14 @@ impl FlowInfoDispatcher { checker.on_flow_info_msg(enabled, msg); } } - Ok(FlowInfo::Created(region_id, suffix)) => { + Ok(FlowInfo::Created(region_id)) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); match checkers.entry(region_id) { - HashMapEntry::Occupied(e) => e.into_mut(), + HashMapEntry::Occupied(e) => { + let val = e.into_mut(); + val.inc(); + val + } HashMapEntry::Vacant(e) => { let engine = TabletFlowFactorStore::new(registry.clone()); let mut v = limiters.as_ref().write().unwrap(); @@ -193,7 +194,6 @@ impl FlowInfoDispatcher { )); e.insert(FlowChecker::new_with_region_id( region_id, - suffix, &config, engine, 
limiter.1.clone(), @@ -202,12 +202,14 @@ impl FlowInfoDispatcher { } }; } - Ok(FlowInfo::Destroyed(region_id, suffix)) => { + Ok(FlowInfo::Destroyed(region_id)) => { let mut remove_limiter = false; { let mut checkers = flow_checkers.as_ref().write().unwrap(); - if let Some(checker) = checkers.get_mut(®ion_id) { - if checker.tablet_suffix() == suffix { + if let Some(checker) = checkers.get(®ion_id) { + // if the previous value is 1, then the updated reference count + // will be 0 + if checker.dec() == 1 { checkers.remove(®ion_id); remove_limiter = true; } @@ -219,8 +221,22 @@ impl FlowInfoDispatcher { } Err(RecvTimeoutError::Timeout) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); + let mut total_rate = 0.0; + let mut cf_throttle_flags = HashMap::default(); for checker in (*checkers).values_mut() { - checker.update_statistics(); + let (rate, tablet_cf_throttle_flags) = checker.update_statistics(); + total_rate += rate; + for (key, val) in tablet_cf_throttle_flags { + if let Some(value) = cf_throttle_flags.get_mut(key) { + *value += val; + } else { + cf_throttle_flags.insert(key, val); + } + } + } + SCHED_WRITE_FLOW_GAUGE.set(total_rate as i64); + for (cf, val) in cf_throttle_flags { + SCHED_THROTTLE_CF_GAUGE.with_label_values(&[cf]).set(val); } deadline = std::time::Instant::now() + TICK_DURATION; } @@ -353,25 +369,65 @@ mod tests { let tablet_suffix = 5_u64; let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); reg.load(tablet_context, false).unwrap(); - tx.send(FlowInfo::Created(region_id, tablet_suffix)) + tx.send(FlowInfo::Created(region_id)).unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); test_flow_controller_basic_impl(&flow_controller, region_id); - tx.send(FlowInfo::Destroyed(region_id, tablet_suffix)) + tx.send(FlowInfo::Destroyed(region_id)).unwrap(); + 
tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); + } + + #[test] + fn test_tablet_flow_controller_life_cycle() { + const WAIT_TICK: Duration = Duration::from_millis(100); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); + let region_id = 5_u64; + let tablet_suffix = 5_u64; + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + reg.load(tablet_context, false).unwrap(); + tx.send(FlowInfo::Created(region_id)).unwrap(); + for _ in 0..30 { + std::thread::sleep(WAIT_TICK); + flow_controller.set_speed_limit(region_id, 1000.0); + if !flow_controller.is_unlimited(region_id) { + break; + } + } + tx.send(FlowInfo::Destroyed(region_id)).unwrap(); + for _ in 0..30 { + std::thread::sleep(WAIT_TICK); + if flow_controller.is_unlimited(region_id) { + break; + } + } + // the region's limiter is removed so it's unlimited + assert!(flow_controller.is_unlimited(region_id)); + + tx.send(FlowInfo::Created(region_id)).unwrap(); + tx.send(FlowInfo::Created(region_id)).unwrap(); + for _ in 0..30 { + std::thread::sleep(WAIT_TICK); + flow_controller.set_speed_limit(region_id, 1000.0); + if !flow_controller.is_unlimited(region_id) { + break; + } + } + tx.send(FlowInfo::Destroyed(region_id)).unwrap(); + std::thread::sleep(TICK_DURATION); + // the region's limiter should not be removed as the reference count is still 1 + assert!(!flow_controller.is_unlimited(region_id)); + tx.send(FlowInfo::Destroyed(region_id)).unwrap(); + for _ in 0..30 { + std::thread::sleep(WAIT_TICK); + if flow_controller.is_unlimited(region_id) { + break; + } + } + // the region's limiter is removed so it's unlimited + assert!(flow_controller.is_unlimited(region_id)); + // no-op it should not crash + tx.send(FlowInfo::Destroyed(region_id)).unwrap(); } #[test] @@ -382,16 +438,10 @@ mod tests { let tablet_context = 
TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); - tx.send(FlowInfo::Created(region_id, tablet_suffix)) + tx.send(FlowInfo::Created(region_id)).unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, region_id, tablet_suffix); + test_flow_controller_memtable_impl(&flow_controller, &stub, &tx, region_id); } #[test] @@ -402,16 +452,10 @@ mod tests { let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); - tx.send(FlowInfo::Created(region_id, tablet_suffix)) + tx.send(FlowInfo::Created(region_id)).unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - test_flow_controller_l0_impl(&flow_controller, &stub, &tx, region_id, tablet_suffix); + test_flow_controller_l0_impl(&flow_controller, &stub, &tx, region_id); } #[test] @@ -422,22 +466,10 @@ mod tests { let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); let mut cached = reg.load(tablet_context, false).unwrap(); let stub = cached.latest().unwrap().clone(); - tx.send(FlowInfo::Created(region_id, tablet_suffix)) + tx.send(FlowInfo::Created(region_id)).unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) .unwrap(); - tx.send(FlowInfo::L0Intra( - "default".to_string(), - 0, - region_id, - tablet_suffix, - )) - .unwrap(); - test_flow_controller_pending_compaction_bytes_impl( - &flow_controller, - &stub, - &tx, - region_id, - tablet_suffix, - ); + 
test_flow_controller_pending_compaction_bytes_impl(&flow_controller, &stub, &tx, region_id); } } From b5508681f09828d3c3e7336f592ffcfca0b58091 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 1 Mar 2023 17:25:09 +0800 Subject: [PATCH 556/676] pd_client_v2: fix version race (#14310) close tikv/tikv#14309 Version should be updated before broadcast updates, otherwise the update will be just ignore. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/pd_client/src/client_v2.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index b583772bb72..11224ad894e 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -245,8 +245,9 @@ impl CachedRawClient { let latest_version = { let mut latest = self.core.latest.lock().unwrap(); *latest = self.cache.clone(); + let v = self.core.version.fetch_add(1, Ordering::Relaxed) + 1; let _ = self.core.on_reconnect_tx.send(()); - self.core.version.fetch_add(1, Ordering::Relaxed) + 1 + v }; debug_assert!(self.cache_version < latest_version); self.cache_version = latest_version; From ddb4e729d530b6dfed06a5bde9aa58e3fc8ff11d Mon Sep 17 00:00:00 2001 From: you06 Date: Wed, 1 Mar 2023 22:03:10 +0800 Subject: [PATCH 557/676] copr: fix paging stop early unexpectedly with agg executors (#14292) ref tikv/tikv#14209, close tikv/tikv#14291 #14209 stop paging when the result set is drained. But when supporting agg paging, the executors will also return drained when there is enough data returned. This PR label the drain reasons and stop following paging when the resultset is really drained. 
Signed-off-by: you06 Co-authored-by: Liqi Geng --- .../src/fast_hash_aggr_executor.rs | 34 ++-- .../src/index_scan_executor.rs | 28 ++-- .../tidb_query_executors/src/interface.rs | 44 ++++- .../src/limit_executor.rs | 38 ++--- .../src/partition_top_n_executor.rs | 150 +++++++++--------- .../src/projection_executor.rs | 46 +++--- components/tidb_query_executors/src/runner.rs | 12 +- .../src/selection_executor.rs | 64 ++++---- .../src/simple_aggr_executor.rs | 24 +-- .../src/slow_hash_aggr_executor.rs | 8 +- .../src/stream_aggr_executor.rs | 22 +-- .../src/table_scan_executor.rs | 8 +- .../src/top_n_executor.rs | 82 +++++----- .../src/util/aggr_executor.rs | 41 ++--- .../src/util/mock_executor.rs | 6 +- .../src/util/scan_executor.rs | 13 +- src/coprocessor/statistics/analyze.rs | 4 +- .../coprocessor_executors/util/bencher.rs | 2 +- .../coprocessor_executors/util/fixture.rs | 6 +- tests/failpoints/cases/test_coprocessor.rs | 10 ++ 20 files changed, 350 insertions(+), 292 deletions(-) diff --git a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs index 174912ca0b0..a878347fc68 100644 --- a/components/tidb_query_executors/src/fast_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/fast_hash_aggr_executor.rs @@ -361,10 +361,10 @@ impl AggregationExecutorImpl for FastHashAggregationImp fn iterate_available_groups( &mut self, entities: &mut Entities, - src_is_drained: bool, + src_is_drained: BatchExecIsDrain, mut iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result> { - assert!(src_is_drained); + assert!(src_is_drained.stop()); let aggr_fns_len = entities.each_aggr_fn.len(); let mut group_by_column = LazyBatchColumn::decoded_with_capacity_and_tp( @@ -545,12 +545,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + 
assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); // col_0 + col_1 can result in [NULL, 9.0, 6.0], thus there will be three @@ -681,12 +681,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); @@ -765,12 +765,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); // col_4 can result in [NULL, "aa", "aaa"], thus there will be three groups. 
@@ -935,13 +935,13 @@ mod tests { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -950,12 +950,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } } @@ -998,12 +998,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2]); @@ -1069,12 +1069,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); @@ -1135,7 +1135,7 @@ mod tests { )]), logical_rows: vec![6, 4, 5, 1, 3, 2, 0], warnings: 
EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); let mut exec = exec_builder(src_exec); diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index 9e415918541..de59b843eb5 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -991,7 +991,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 3); assert!(result.physical_columns[0].is_raw()); @@ -1048,7 +1048,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 3); assert!(result.physical_columns[0].is_raw()); @@ -1108,7 +1108,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 2); assert_eq!(result.physical_columns.rows_len(), 3); assert!(result.physical_columns[0].is_raw()); @@ -1153,7 +1153,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 3); assert!(result.physical_columns[0].is_raw()); @@ -1205,7 +1205,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + 
assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); assert!(result.physical_columns[0].is_raw()); @@ -1282,7 +1282,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 2); assert!(result.physical_columns[0].is_raw()); @@ -1339,7 +1339,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1449,7 +1449,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1492,7 +1492,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1588,7 +1588,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1688,7 +1688,7 @@ mod tests { 
.unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1782,7 +1782,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -1875,7 +1875,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 3); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); @@ -2001,7 +2001,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert!(result.is_drained.as_ref().unwrap()); + assert!(result.is_drained.as_ref().unwrap().stop()); assert_eq!(result.physical_columns.columns_len(), 4); assert_eq!(result.physical_columns.rows_len(), 1); assert!(result.physical_columns[0].is_raw()); diff --git a/components/tidb_query_executors/src/interface.rs b/components/tidb_query_executors/src/interface.rs index 611516ab6bc..352fbab4720 100644 --- a/components/tidb_query_executors/src/interface.rs +++ b/components/tidb_query_executors/src/interface.rs @@ -174,17 +174,45 @@ pub struct BatchExecuteResult { /// Whether or not there is no more data. /// /// This structure is a `Result`. When it is: - /// - `Ok(false)`: The normal case, means that there could be more data. The - /// caller should continue calling `next_batch()` although for each call - /// the returned data may be empty. 
- /// - `Ok(true)`: Means that the executor is drained and no more data will - /// be returned in future. However there could be some (last) data in the - /// `data` field this time. The caller should NOT call `next_batch()` any - /// more. + /// - `Ok(batch_exec_is_drain)`: See the comment of `BatchExecIsDrain`. /// - `Err(_)`: Means that there is an error when trying to retrieve more /// data. In this case, the error is returned and the executor is also /// drained. Similar to `Ok(true)`, there could be some remaining data in /// the `data` field which is valid data and should be processed. The /// caller should NOT call `next_batch()` any more. - pub is_drained: Result, + pub is_drained: Result, +} + +/// The result of batch execution. +/// - `Drain`: The executor is completely drained and no more data will be +/// returned in the given range.However there could be some (last) data in +/// `data` field this time. The caller should NOT call `next_batch()` any +/// more. +/// - `PagingDrain`: The executor output enough rows of the paging request, +/// there may be following data in the next paging request, the paging request +/// should be returned with scanned range in this case. Only used in paging +/// mode, Also check the last data in `data` field. +/// - `Remain`: The normal case, means that there could be more data. The caller +/// should continue calling `next_batch()` although for each call the returned +/// data may be empty. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum BatchExecIsDrain { + Remain, + Drain, + PagingDrain, +} + +impl BatchExecIsDrain { + #[inline] + pub fn is_remain(&self) -> bool { + *self == BatchExecIsDrain::Remain + } + + /// the batch execution need to stop when the result status is Drain or + /// PagingDrain, but only when we meet Drain, the resultset is really + /// drained. 
+ #[inline] + pub fn stop(&self) -> bool { + !self.is_remain() + } } diff --git a/components/tidb_query_executors/src/limit_executor.rs b/components/tidb_query_executors/src/limit_executor.rs index a9cd2cae482..bbbe5d576d0 100644 --- a/components/tidb_query_executors/src/limit_executor.rs +++ b/components/tidb_query_executors/src/limit_executor.rs @@ -46,7 +46,7 @@ impl BatchExecutor for BatchLimitExecutor { } else { // We don't need to touch the physical data. result.logical_rows.truncate(self.remaining_rows); - result.is_drained = Ok(true); + result.is_drained = Ok(BatchExecIsDrain::Drain); self.remaining_rows = 0; } @@ -96,7 +96,7 @@ mod tests { )]), logical_rows: vec![1, 2], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -105,7 +105,7 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -141,7 +141,7 @@ mod tests { )]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Int( @@ -149,7 +149,7 @@ mod tests { )]), logical_rows: vec![1, 2], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -159,12 +159,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -178,7 +178,7 @@ mod tests { )]), logical_rows: vec![1, 2], warnings: 
EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Int( @@ -196,12 +196,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 2]); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(r.is_drained.unwrap()); // No errors + assert!(r.is_drained.unwrap().stop()); // No errors } #[test] @@ -215,13 +215,13 @@ mod tests { )]), logical_rows: vec![1, 2], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Int( @@ -229,7 +229,7 @@ mod tests { )]), logical_rows: vec![0, 4, 1, 3], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -239,17 +239,17 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1, 2]); assert_eq!(r.physical_columns.rows_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 4]); assert_eq!(r.physical_columns.rows_len(), 5); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -262,7 +262,7 @@ mod tests { let r = 
block_on(exec.next_batch(100)); assert_eq!(r.logical_rows, &[0, 1, 2, 3, 4]); let r = block_on(exec.next_batch(2)); - assert_eq!(r.is_drained.unwrap(), true); + assert!(r.is_drained.unwrap().stop()); let schema = vec![FieldTypeTp::LongLong.into()]; let rows = (0..1024).collect(); @@ -270,9 +270,9 @@ mod tests { let mut exec = BatchLimitExecutor::new(src_exec, 1024, true).unwrap(); for _i in 0..1023 { let r = block_on(exec.next_batch(1)); - assert_eq!(r.is_drained.unwrap(), false); + assert!(r.is_drained.unwrap().is_remain()); } let r = block_on(exec.next_batch(1)); - assert_eq!(r.is_drained.unwrap(), true); + assert!(r.is_drained.unwrap().stop()); } } diff --git a/components/tidb_query_executors/src/partition_top_n_executor.rs b/components/tidb_query_executors/src/partition_top_n_executor.rs index 52cf2e85925..980adb3e459 100644 --- a/components/tidb_query_executors/src/partition_top_n_executor.rs +++ b/components/tidb_query_executors/src/partition_top_n_executor.rs @@ -199,7 +199,7 @@ impl BatchPartitionTopNExecutor { } #[inline] - async fn handle_next_batch(&mut self) -> Result<(LazyBatchColumnVec, bool)> { + async fn handle_next_batch(&mut self) -> Result<(LazyBatchColumnVec, BatchExecIsDrain)> { let mut result = LazyBatchColumnVec::empty(); let src_result = self.src.next_batch(BATCH_MAX_SIZE).await; self.context.warnings = src_result.warnings; @@ -282,7 +282,7 @@ impl BatchPartitionTopNExecutor { self.heap.add_row(row)?; } } - if src_is_drained { + if src_is_drained.stop() { self.heap.take_all_append_to(&mut result); } @@ -326,7 +326,7 @@ impl BatchExecutor for BatchPartitionTopNExecutor { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }; } @@ -407,7 +407,7 @@ mod tests { ]), logical_rows: (0..1).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -429,7 +429,7 @@ 
mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -459,7 +459,7 @@ mod tests { ]), logical_rows: (0..4).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -491,7 +491,7 @@ mod tests { r.physical_columns[1].decoded().to_real_vec(), &[Real::new(6.0).ok(), Real::new(5.0).ok(),] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -531,7 +531,7 @@ mod tests { ]), logical_rows: (0..9).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -562,7 +562,7 @@ mod tests { r.physical_columns[1].decoded().to_int_vec(), &[Some(1), None, None, Some(2), Some(1), None] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } fn make_expr_case() -> MockExecutor { @@ -619,7 +619,7 @@ mod tests { ]), logical_rows: (0..9).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ) } @@ -657,7 +657,7 @@ mod tests { r.physical_columns[2].decoded().to_int_vec(), &[Some(1), Some(2), Some(4), Some(6), Some(8)] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// partition by col0 + col1, order by col2 @@ -689,7 +689,7 @@ mod tests { r.physical_columns[2].decoded().to_int_vec(), &[Some(2), Some(4), Some(7), Some(9)] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Currently, When the data is not ordered by partition key, e.g. 
1 1 2 1, @@ -713,7 +713,7 @@ mod tests { ]), logical_rows: (0..4).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -745,7 +745,7 @@ mod tests { r.physical_columns[1].decoded().to_real_vec(), &[None, Real::new(7.0).ok(), Real::new(4.0).ok()] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } fn make_integrated_data() -> MockExecutor { @@ -851,7 +851,7 @@ mod tests { ]), logical_rows: (0..16).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ) } @@ -881,7 +881,7 @@ mod tests { assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6, 7]); assert_eq!(r.physical_columns.rows_len(), 8); assert_eq!(r.physical_columns.columns_len(), 4); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); assert_eq!( r.physical_columns[2].decoded().to_int_vec(), @@ -927,8 +927,8 @@ mod tests { /// 2 9,223,372,036,854,775,807 9,223,372,036,854,775,807 /// 2 300 300 /// 2 9,223,372,036,854,775,808 -9,223,372,036,854,775,808 - /// 2 NULL NULL - /// 3 NULL NULL + /// 2 NULL NULL + /// 3 NULL NULL /// == Call #4 == /// (drained) (drained) (drained) fn make_full_batch() -> MockExecutor { @@ -964,13 +964,13 @@ mod tests { ]), logical_rows: vec![0, 1, 2, 3, 4], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -998,13 +998,13 @@ mod tests { ]), logical_rows: vec![0, 1, 2, 3, 4], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: 
EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -1036,18 +1036,18 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3]); assert_eq!(r.physical_columns.rows_len(), 4); assert_eq!(r.physical_columns.columns_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); assert_eq!( r.physical_columns[0].decoded().to_int_vec(), &[Some(1), Some(1), Some(2), Some(2)] @@ -1056,7 +1056,7 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); assert_eq!(r.physical_columns[0].decoded().to_int_vec(), &[Some(3)]); } @@ -1082,18 +1082,18 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3]); assert_eq!(r.physical_columns.rows_len(), 4); assert_eq!(r.physical_columns.columns_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); assert_eq!( r.physical_columns[0].decoded().to_int_vec(), &[Some(1), Some(1), Some(2), Some(2)] @@ -1102,7 
+1102,7 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); assert_eq!(r.physical_columns[0].decoded().to_int_vec(), &[Some(3)]); } @@ -1132,23 +1132,23 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6, 7, 8]); assert_eq!(r.physical_columns.rows_len(), 9); assert_eq!(r.physical_columns.columns_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -1178,23 +1178,23 @@ mod tests { assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); assert_eq!(r.physical_columns.rows_len(), 5); assert_eq!(r.physical_columns.columns_len(), 3); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); 
assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// The following tests are copied from `batch_top_n_executor.rs`. @@ -1208,7 +1208,7 @@ mod tests { )]), logical_rows: (0..1).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -1226,7 +1226,7 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -1240,13 +1240,13 @@ mod tests { )]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -1265,11 +1265,11 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -1311,7 +1311,7 @@ mod tests { ]), logical_rows: vec![3, 0, 1], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -1321,7 +1321,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -1345,7 +1345,7 @@ mod tests { ]), logical_rows: vec![1, 2, 0, 4], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: 
Ok(BatchExecIsDrain::Drain), }, ], ) @@ -1388,12 +1388,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); @@ -1419,7 +1419,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -1459,12 +1459,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); @@ -1490,7 +1490,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -1543,12 +1543,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1572,7 +1572,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); 
+ assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -1624,13 +1624,13 @@ mod tests { ]), logical_rows: vec![2, 1, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -1664,7 +1664,7 @@ mod tests { ]), logical_rows: vec![0, 1, 2, 3], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -1708,12 +1708,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1749,7 +1749,7 @@ mod tests { Some(b"aa".to_vec()), ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -1790,12 +1790,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1831,7 +1831,7 @@ mod tests { Some(b"aa".to_vec()), ] ); - 
assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -1887,13 +1887,13 @@ mod tests { ]), logical_rows: vec![2, 1, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -1927,7 +1927,7 @@ mod tests { ]), logical_rows: vec![2, 1, 0, 3], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -1952,12 +1952,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1967,7 +1967,7 @@ mod tests { r.physical_columns[col_index].decoded().to_int_vec(), expected ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); }; test_top5( @@ -2067,12 +2067,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -2082,7 
+2082,7 @@ mod tests { r.physical_columns[col_index].decoded().to_int_vec(), expected ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); }; test_top5_paging6( @@ -2191,7 +2191,7 @@ mod tests { ); let r1_is_drained = r1.is_drained.unwrap(); assert_eq!(r1_is_drained, r2.is_drained.unwrap()); - if r1_is_drained { + if r1_is_drained.stop() { break; } } diff --git a/components/tidb_query_executors/src/projection_executor.rs b/components/tidb_query_executors/src/projection_executor.rs index 962cd8698e5..2e88767ecbe 100644 --- a/components/tidb_query_executors/src/projection_executor.rs +++ b/components/tidb_query_executors/src/projection_executor.rs @@ -183,7 +183,7 @@ mod tests { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -192,13 +192,13 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -222,15 +222,15 @@ mod tests { // | assert_eq!(r.logical_rows.as_slice(), &[]); // | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ cannot infer type assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these logical data: @@ -258,7 +258,7 @@ mod tests 
{ ]), logical_rows: vec![2, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -267,7 +267,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -276,7 +276,7 @@ mod tests { ]), logical_rows: vec![1], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -299,12 +299,12 @@ mod tests { r.physical_columns[0].decoded().to_int_vec(), vec![Some(1), Some(1), Some(1), Some(1), Some(1)] ); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.columns_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); @@ -313,7 +313,7 @@ mod tests { r.physical_columns[0].decoded().to_int_vec(), vec![Some(1), Some(1)] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -340,12 +340,12 @@ mod tests { r.physical_columns[1].decoded().to_real_vec(), vec![Real::new(7.0).ok(), Real::new(-5.0).ok(), None, None, None] ); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.columns_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); @@ -358,7 +358,7 @@ mod tests { r.physical_columns[1].decoded().to_real_vec(), vec![None, None] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// This function 
returns 1 when the value is even, 0 otherwise. @@ -406,13 +406,13 @@ mod tests { ]), logical_rows: vec![3, 4, 0, 2], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -422,7 +422,7 @@ mod tests { ]), logical_rows: vec![0], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -452,17 +452,17 @@ mod tests { r.physical_columns[1].decoded().to_int_vec(), vec![Some(0), Some(1), Some(0), Some(1)] ); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(r.logical_rows, &[0]); assert_eq!(r.physical_columns[0].decoded().to_int_vec(), vec![None]); assert_eq!(r.physical_columns[1].decoded().to_int_vec(), vec![Some(1)]); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -497,7 +497,7 @@ mod tests { ]), logical_rows: vec![1, 3, 4, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -506,7 +506,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 3093b9bb24b..60857dda80d 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -28,7 
+28,7 @@ use tipb::{ }; use super::{ - interface::{BatchExecutor, ExecuteStats}, + interface::{BatchExecIsDrain, BatchExecutor, ExecuteStats}, *, }; @@ -506,13 +506,13 @@ impl BatchExecutorsRunner { record_all += record_len; } - if drained || self.paging_size.map_or(false, |p| record_all >= p as usize) { + if drained.stop() || self.paging_size.map_or(false, |p| record_all >= p as usize) { self.out_most_executor .collect_exec_stats(&mut self.exec_stats); - - let range = if drained { + let range = if drained == BatchExecIsDrain::Drain { None } else { + // It's not allowed to stop paging when BatchExecIsDrain::PagingDrain. self.paging_size .map(|_| self.out_most_executor.take_scanned_range()) }; @@ -583,7 +583,7 @@ impl BatchExecutorsRunner { .mut_rows_data() .extend_from_slice(current_chunk.get_rows_data()); record_len += len; - is_drained = drained; + is_drained = drained.stop(); } if !is_drained || record_len > 0 { @@ -617,7 +617,7 @@ impl BatchExecutorsRunner { chunk: &mut Chunk, warnings: &mut EvalWarnings, ctx: &mut EvalContext, - ) -> Result<(bool, usize)> { + ) -> Result<(BatchExecIsDrain, usize)> { let mut record_len = 0; self.deadline.check()?; diff --git a/components/tidb_query_executors/src/selection_executor.rs b/components/tidb_query_executors/src/selection_executor.rs index 60459229f4f..bd65547109d 100644 --- a/components/tidb_query_executors/src/selection_executor.rs +++ b/components/tidb_query_executors/src/selection_executor.rs @@ -237,7 +237,7 @@ mod tests { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -246,13 +246,13 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), 
logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -276,15 +276,15 @@ mod tests { // | assert_eq!(r.logical_rows.as_slice(), &[]); // | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ cannot infer type assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these logical data: @@ -312,7 +312,7 @@ mod tests { ]), logical_rows: vec![2, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -321,7 +321,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -330,7 +330,7 @@ mod tests { ]), logical_rows: vec![1], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -364,15 +364,15 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[2, 0]); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[1]); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } } @@ -390,15 +390,15 @@ mod tests { let r = block_on(exec.next_batch(1)); 
assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// This function returns 1 when the value is even, 0 otherwise. @@ -446,13 +446,13 @@ mod tests { ]), logical_rows: vec![3, 4, 0, 2], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -462,7 +462,7 @@ mod tests { ]), logical_rows: vec![0], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -484,15 +484,15 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[3, 0]); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -509,15 +509,15 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 2]); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); 
assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Tests the scenario that there are multiple predicates. Only the row that @@ -547,15 +547,15 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } } @@ -582,15 +582,15 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } } @@ -626,7 +626,7 @@ mod tests { ]), logical_rows: vec![1, 3, 4, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -635,7 +635,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index b6717a40fb5..e2138394d99 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -214,10 +214,10 @@ impl AggregationExecutorImpl for SimpleAggregationImpl fn 
iterate_available_groups( &mut self, entities: &mut Entities, - src_is_drained: bool, + src_is_drained: BatchExecIsDrain, mut iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result> { - assert!(src_is_drained); + assert!(src_is_drained.stop()); if self.has_input_rows { iteratee(entities, &self.states)?; } @@ -465,11 +465,11 @@ mod tests { // The scan rows parameter has no effect for mock executor. We don't care. let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); @@ -502,7 +502,7 @@ mod tests { r.physical_columns[11].decoded().to_real_vec(), &[Real::new(12.0).ok()] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -553,11 +553,11 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); @@ -586,7 +586,7 @@ mod tests { r.physical_columns[9].decoded().to_real_vec(), &[Real::new(8.5).ok()] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -629,13 +629,13 @@ mod tests { )]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ 
-671,11 +671,11 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } } diff --git a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs index ee076b652a7..a086e574506 100644 --- a/components/tidb_query_executors/src/slow_hash_aggr_executor.rs +++ b/components/tidb_query_executors/src/slow_hash_aggr_executor.rs @@ -435,10 +435,10 @@ impl AggregationExecutorImpl for SlowHashAggregationImp fn iterate_available_groups( &mut self, entities: &mut Entities, - src_is_drained: bool, + src_is_drained: BatchExecIsDrain, mut iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result> { - assert!(src_is_drained); + assert!(src_is_drained.stop()); let number_of_groups = self.groups.len(); let mut group_by_columns: Vec<_> = self @@ -577,12 +577,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let mut r = block_on(exec.next_batch(1)); // col_4 (sort_key), col_0 + 1 can result in: diff --git a/components/tidb_query_executors/src/stream_aggr_executor.rs b/components/tidb_query_executors/src/stream_aggr_executor.rs index d8a0599bf87..7ec683affa0 100644 --- a/components/tidb_query_executors/src/stream_aggr_executor.rs +++ 
b/components/tidb_query_executors/src/stream_aggr_executor.rs @@ -349,10 +349,10 @@ impl AggregationExecutorImpl for BatchStreamAggregation fn iterate_available_groups( &mut self, entities: &mut Entities, - src_is_drained: bool, + src_is_drained: BatchExecIsDrain, mut iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result> { - let number_of_groups = if src_is_drained { + let number_of_groups = if src_is_drained.stop() { AggregationExecutorImpl::::groups_len(self) } else { // don't include the partial group @@ -518,7 +518,7 @@ mod tests { assert_eq!(&r.logical_rows, &[0, 1]); assert_eq!(r.physical_columns.rows_len(), 2); assert_eq!(r.physical_columns.columns_len(), 5); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); // COUNT assert_eq!( r.physical_columns[0].decoded().to_int_vec(), @@ -548,13 +548,13 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 5); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); // COUNT assert_eq!(r.physical_columns[0].decoded().to_int_vec(), &[Some(5)]); // AVG_COUNT @@ -602,7 +602,7 @@ mod tests { assert_eq!(&r.logical_rows, &[0, 1]); assert_eq!(r.physical_columns.rows_len(), 2); assert_eq!(r.physical_columns.columns_len(), 2); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); // col_0 assert_eq!( r.physical_columns[0].decoded().to_bytes_vec(), @@ -617,13 +617,13 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); 
assert_eq!(&r.logical_rows, &[0]); assert_eq!(r.physical_columns.rows_len(), 1); assert_eq!(r.physical_columns.columns_len(), 2); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); // col_0 assert_eq!( r.physical_columns[0].decoded().to_bytes_vec(), @@ -691,7 +691,7 @@ mod tests { ]), logical_rows: vec![3, 1, 4, 2, 6], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -702,7 +702,7 @@ mod tests { ]), logical_rows: vec![2], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -713,7 +713,7 @@ mod tests { ]), logical_rows: (0..2).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 4397869fcaa..fa05071e8bd 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -723,7 +723,7 @@ mod tests { let expect_rows = *expect_rows; let expect_drained = start_row + expect_rows > total_rows; let result = block_on(executor.next_batch(expect_rows)); - assert_eq!(*result.is_drained.as_ref().unwrap(), expect_drained); + assert_eq!(result.is_drained.as_ref().unwrap().stop(), expect_drained); if expect_drained { // all remaining rows are fetched helper.expect_table_values( @@ -1286,7 +1286,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert_eq!(result.is_drained.unwrap(), true); + assert!(result.is_drained.unwrap().stop()); assert_eq!(result.logical_rows.len(), 1); assert_eq!(result.physical_columns.columns_len(), columns_is_pk.len()); for i in 0..columns_is_pk.len() { @@ -1394,7 +1394,7 
@@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert_eq!(result.is_drained.unwrap(), true); + assert!(result.is_drained.unwrap().stop()); assert_eq!(result.logical_rows.len(), 1); // We expect we fill the primary column with the value embedded in the common @@ -1575,7 +1575,7 @@ mod tests { .unwrap(); let mut result = block_on(executor.next_batch(10)); - assert_eq!(result.is_drained.unwrap(), true); + assert!(result.is_drained.unwrap().stop()); if !columns_info.is_empty() { assert_eq!(result.logical_rows.len(), 1); } diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index dd6b7be2dba..670b0e0a879 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -194,7 +194,7 @@ impl BatchTopNExecutor { self.process_batch_input(src_result.physical_columns, src_result.logical_rows)?; } - if src_is_drained { + if src_is_drained.stop() { Ok(Some(self.heap.take_all())) } else { Ok(None) @@ -268,7 +268,7 @@ impl BatchExecutor for BatchTopNExecutor { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }; } @@ -298,14 +298,14 @@ impl BatchExecutor for BatchTopNExecutor { physical_columns: logical_columns, logical_rows, warnings: self.context.take_warnings(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), } } Ok(None) => BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: self.context.take_warnings(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, } } @@ -352,7 +352,7 @@ mod tests { )]), logical_rows: (0..1).collect(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }], ); @@ -369,7 +369,7 @@ mod tests { let r = 
block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -383,13 +383,13 @@ mod tests { )]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ); @@ -407,11 +407,11 @@ mod tests { let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -453,7 +453,7 @@ mod tests { ]), logical_rows: vec![3, 0, 1], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -463,7 +463,7 @@ mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -487,7 +487,7 @@ mod tests { ]), logical_rows: vec![1, 2, 0, 4], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -529,12 +529,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); 
+ assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); @@ -560,7 +560,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -599,12 +599,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4, 5, 6]); @@ -630,7 +630,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -682,12 +682,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -711,7 +711,7 @@ mod tests { Real::new(4.0).ok() ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -763,13 +763,13 @@ mod tests { ]), logical_rows: vec![2, 1, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: 
Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -803,7 +803,7 @@ mod tests { ]), logical_rows: vec![0, 1, 2, 3], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -846,12 +846,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -887,7 +887,7 @@ mod tests { Some(b"aa".to_vec()), ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } #[test] @@ -927,12 +927,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -968,7 +968,7 @@ mod tests { Some(b"aa".to_vec()), ] ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } /// Builds an executor that will return these data: @@ -1024,13 +1024,13 @@ mod tests { ]), logical_rows: vec![2, 1, 0], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::empty(), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: 
Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -1064,7 +1064,7 @@ mod tests { ]), logical_rows: vec![2, 1, 0, 3], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -1088,12 +1088,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1103,7 +1103,7 @@ mod tests { r.physical_columns[col_index].decoded().to_int_vec(), expected ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); }; test_top5( @@ -1202,12 +1202,12 @@ mod tests { let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert!(r.logical_rows.is_empty()); assert_eq!(r.physical_columns.rows_len(), 0); - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); let r = block_on(exec.next_batch(1)); assert_eq!(&r.logical_rows, &[0, 1, 2, 3, 4]); @@ -1217,7 +1217,7 @@ mod tests { r.physical_columns[col_index].decoded().to_int_vec(), expected ); - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); }; test_top5_paging6( @@ -1325,7 +1325,7 @@ mod tests { ); let r1_is_drained = r1.is_drained.unwrap(); assert_eq!(r1_is_drained, r2.is_drained.unwrap()); - if r1_is_drained { + if r1_is_drained.stop() { break; } } diff --git a/components/tidb_query_executors/src/util/aggr_executor.rs 
b/components/tidb_query_executors/src/util/aggr_executor.rs index ceb9949f83b..0535e8dbd83 100644 --- a/components/tidb_query_executors/src/util/aggr_executor.rs +++ b/components/tidb_query_executors/src/util/aggr_executor.rs @@ -86,7 +86,7 @@ pub trait AggregationExecutorImpl: Send { fn iterate_available_groups( &mut self, entities: &mut Entities, - src_is_drained: bool, + src_is_drained: BatchExecIsDrain, iteratee: impl FnMut(&mut Entities, &[Box]) -> Result<()>, ) -> Result>; @@ -203,7 +203,9 @@ impl> AggregationExecutor Result<(Option, bool)> { + async fn handle_next_batch( + &mut self, + ) -> Result<(Option, BatchExecIsDrain)> { // Use max batch size from the beginning because aggregation // always needs to calculate over all data. let src_result = self @@ -231,16 +233,16 @@ impl> AggregationExecutor= required_row as usize { - src_is_drained = true + src_is_drained = BatchExecIsDrain::PagingDrain; } // StreamAgg will return groups_len - 1 rows immediately - if !src_is_drained && self.imp.is_partial_results_ready() { + if src_is_drained.is_remain() && self.imp.is_partial_results_ready() { self.required_row = Some(required_row + 1 - self.imp.groups_len() as u64) } } // aggregate result is always available when source is drained - let result = if src_is_drained || self.imp.is_partial_results_ready() { + let result = if src_is_drained.stop() || self.imp.is_partial_results_ready() { Some(self.aggregate_partial_results(src_is_drained)?) 
} else { None @@ -249,7 +251,10 @@ impl> AggregationExecutor Result { + fn aggregate_partial_results( + &mut self, + src_is_drained: BatchExecIsDrain, + ) -> Result { let groups_len = self.imp.groups_len(); let mut all_result_columns: Vec<_> = self .entities @@ -324,7 +329,7 @@ impl> BatchExecutor } } Ok((data, src_is_drained)) => { - self.is_ended = src_is_drained; + self.is_ended = src_is_drained.stop(); let logical_columns = data.unwrap_or_else(LazyBatchColumnVec::empty); let logical_rows = (0..logical_columns.rows_len()).collect(); BatchExecuteResult { @@ -464,7 +469,7 @@ pub mod tests { ]), logical_rows: vec![2, 4, 0, 1], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -476,7 +481,7 @@ pub mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -494,7 +499,7 @@ pub mod tests { ]), logical_rows: vec![1], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -540,7 +545,7 @@ pub mod tests { ]), logical_rows: vec![2, 4, 0, 1], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -549,7 +554,7 @@ pub mod tests { ]), logical_rows: Vec::new(), warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -567,7 +572,7 @@ pub mod tests { ]), logical_rows: vec![1, 2], warnings: EvalWarnings::default(), - is_drained: Ok(false), + is_drained: Ok(BatchExecIsDrain::Remain), }, BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![ @@ -576,7 +581,7 @@ pub mod tests { 
]), logical_rows: vec![1, 0], warnings: EvalWarnings::default(), - is_drained: Ok(true), + is_drained: Ok(BatchExecIsDrain::Drain), }, ], ) @@ -651,9 +656,9 @@ pub mod tests { for nth_call in 0..call_num { let r = block_on(exec.next_batch(1)); if nth_call == call_num - 1 { - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } else { - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); } assert_eq!(r.physical_columns.rows_len(), row_num[nth_call]); } @@ -681,9 +686,9 @@ pub mod tests { for nth_call in 0..call_num { let r = block_on(exec.next_batch(1)); if nth_call == call_num - 1 { - assert!(r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().stop()); } else { - assert!(!r.is_drained.unwrap()); + assert!(r.is_drained.unwrap().is_remain()); } assert_eq!(r.physical_columns.rows_len(), row_num[nth_call]); } diff --git a/components/tidb_query_executors/src/util/mock_executor.rs b/components/tidb_query_executors/src/util/mock_executor.rs index a6f11904b33..aee7e526425 100644 --- a/components/tidb_query_executors/src/util/mock_executor.rs +++ b/components/tidb_query_executors/src/util/mock_executor.rs @@ -95,7 +95,11 @@ impl BatchExecutor for MockScanExecutor { self.pos += 1; cur_row_idx += 1; } - let is_drained = self.pos >= self.rows.len(); + let is_drained = if self.pos >= self.rows.len() { + BatchExecIsDrain::Drain + } else { + BatchExecIsDrain::Remain + }; BatchExecuteResult { physical_columns: LazyBatchColumnVec::from(vec![VectorValue::Int(res_col.into())]), logical_rows: res_logical_rows, diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 75c7cdc9fe3..be134725de6 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -188,10 +188,17 @@ impl BatchExecutor for ScanExecuto // *successfully* retrieving these rows. 
After that, if we only consumes // some of the rows (TopN / Limit), we should ignore this error. - match &is_drained { + let is_drained = match is_drained { // Note: `self.is_ended` is only used for assertion purpose. - Err(_) | Ok(true) => self.is_ended = true, - Ok(false) => {} + Err(e) => { + self.is_ended = true; + Err(e) + } + Ok(true) => { + self.is_ended = true; + Ok(BatchExecIsDrain::Drain) + } + Ok(false) => Ok(BatchExecIsDrain::Remain), }; BatchExecuteResult { diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index f292b5220e3..25fd67b9a99 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -388,7 +388,7 @@ impl RowSampleBuilder { res }; let _guard = sample.observe_cpu(); - is_drained = result.is_drained?; + is_drained = result.is_drained?.stop(); let columns_slice = result.physical_columns.as_slice(); let mut column_vals: Vec> = vec![vec![]; self.columns_info.len()]; @@ -887,7 +887,7 @@ impl SampleBuilder { let mut common_handle_fms = FmSketch::new(self.max_fm_sketch_size); while !is_drained { let result = self.data.next_batch(BATCH_MAX_SIZE).await; - is_drained = result.is_drained?; + is_drained = result.is_drained?.stop(); let mut columns_slice = result.physical_columns.as_slice(); let mut columns_info = &self.columns_info[..]; diff --git a/tests/benches/coprocessor_executors/util/bencher.rs b/tests/benches/coprocessor_executors/util/bencher.rs index 246510f991b..4b4734f3038 100644 --- a/tests/benches/coprocessor_executors/util/bencher.rs +++ b/tests/benches/coprocessor_executors/util/bencher.rs @@ -64,7 +64,7 @@ impl E> Bencher for BatchNextAllBencher { loop { let r = block_on(executor.next_batch(1024)); black_box(&r); - if r.is_drained.unwrap() { + if r.is_drained.unwrap().stop() { break; } } diff --git a/tests/benches/coprocessor_executors/util/fixture.rs b/tests/benches/coprocessor_executors/util/fixture.rs index 24062c7a2da..e3306d3e0ed 100644 --- 
a/tests/benches/coprocessor_executors/util/fixture.rs +++ b/tests/benches/coprocessor_executors/util/fixture.rs @@ -314,7 +314,11 @@ impl BatchExecutor for BatchFixtureExecutor { physical_columns, logical_rows, warnings: EvalWarnings::default(), - is_drained: Ok(self.columns[0].is_empty()), + is_drained: Ok(if self.columns[0].is_empty() { + BatchExecIsDrain::Drain + } else { + BatchExecIsDrain::Remain + }), } } diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index b3a6bf76c01..d7f6540a3c6 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -261,6 +261,16 @@ fn test_paging_scan() { let resp = handle_request(&endpoint, req); assert!(resp.range.is_none()); assert!(resp.range.is_none()); + + let agg_req = DagSelect::from(&product) + .count(&product["count"]) + .group_by(&[&product["name"]]) + .output_offsets(Some(vec![0, 1])) + .desc(desc) + .paging_size(2) + .build(); + let resp = handle_request(&endpoint, agg_req); + assert!(resp.range.is_some()); } } From 1208135d166ef91faa3997119604ca8290452380 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 2 Mar 2023 09:13:09 +0800 Subject: [PATCH 558/676] file_system: initialize io stats sentinel on thread start (#14319) ref tikv/tikv#10867 Fix thread I/O not monitored if I/O type isn't set. 
Signed-off-by: tabokie --- components/file_system/src/io_stats/proc.rs | 63 +++++++++++++++++++++ components/tikv_util/src/sys/thread.rs | 14 +++++ components/tikv_util/src/yatp_pool/mod.rs | 1 + 3 files changed, 78 insertions(+) diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 652fe05c658..fca0f6a64b1 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -138,6 +138,12 @@ pub fn init() -> Result<(), String> { ThreadId::current() .fetch_io_bytes() .map_err(|e| format!("failed to fetch I/O bytes from proc: {}", e))?; + // Manually initialize the sentinel so that `fetch_io_bytes` doesn't miss any + // thread. + LOCAL_IO_STATS.get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))); + tikv_util::sys::thread::hook_thread_start(Box::new(|| { + LOCAL_IO_STATS.get_or(|| CachePadded::new(Mutex::new(LocalIoStats::current()))); + })); Ok(()) } @@ -179,11 +185,13 @@ mod tests { use std::{ io::{Read, Write}, os::unix::fs::OpenOptionsExt, + sync::mpsc, }; use libc::O_DIRECT; use maligned::{AsBytes, AsBytesMut, A512}; use tempfile::{tempdir, tempdir_in}; + use tikv_util::sys::thread::StdThreadBuildWrapper; use super::*; use crate::{OpenOptions, WithIoType}; @@ -243,6 +251,61 @@ mod tests { } } + #[test] + fn test_fetch_all_io_bytes() { + let tmp = tempdir_in("/var/tmp").unwrap_or_else(|_| tempdir().unwrap()); + + init().unwrap(); + + let file_path = tmp.path().join("test_fetch_all_io_bytes_1.txt"); + let (tx1, rx1) = mpsc::sync_channel(0); + let t1 = std::thread::Builder::new() + .spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); + let mut f = OpenOptions::new() + .write(true) + .create(true) + .custom_flags(O_DIRECT) + .open(file_path) + .unwrap(); + let w = vec![A512::default(); 8]; + f.write_all(w.as_bytes()).unwrap(); + f.sync_all().unwrap(); + tx1.send(()).unwrap(); + tx1.send(()).unwrap(); + }) + .unwrap(); + + let file_path = 
tmp.path().join("test_fetch_all_io_bytes_2.txt"); + let (tx2, rx2) = mpsc::sync_channel(0); + let t2 = std::thread::Builder::new() + .spawn_wrapper(move || { + let mut f = OpenOptions::new() + .write(true) + .create(true) + .custom_flags(O_DIRECT) + .open(file_path) + .unwrap(); + let w = vec![A512::default(); 8]; + f.write_all(w.as_bytes()).unwrap(); + f.sync_all().unwrap(); + tx2.send(()).unwrap(); + tx2.send(()).unwrap(); + }) + .unwrap(); + + rx1.recv().unwrap(); + rx2.recv().unwrap(); + let bytes = fetch_io_bytes(); + assert_eq!(bytes[IoType::ForegroundWrite as usize].write, 4096); + assert_eq!(bytes[IoType::Other as usize].write, 4096); + + rx1.recv().unwrap(); + rx2.recv().unwrap(); + t1.join().unwrap(); + t2.join().unwrap(); + } + #[bench] fn bench_fetch_thread_io_bytes(b: &mut test::Bencher) { let mut id = ThreadId::current(); diff --git a/components/tikv_util/src/sys/thread.rs b/components/tikv_util/src/sys/thread.rs index 60c420661d0..1f138669b96 100644 --- a/components/tikv_util/src/sys/thread.rs +++ b/components/tikv_util/src/sys/thread.rs @@ -384,6 +384,17 @@ pub trait ThreadBuildWrapper { lazy_static::lazy_static! 
{ pub static ref THREAD_NAME_HASHMAP: Mutex> = Mutex::new(HashMap::default()); + pub static ref THREAD_START_HOOKS: Mutex>> = Mutex::new(Vec::new()); +} + +pub fn hook_thread_start(f: Box) { + THREAD_START_HOOKS.lock().unwrap().push(f); +} + +pub(crate) fn call_thread_start_hooks() { + for f in THREAD_START_HOOKS.lock().unwrap().iter() { + f(); + } } pub(crate) fn add_thread_name_to_map() { @@ -411,6 +422,7 @@ impl StdThreadBuildWrapper for std::thread::Builder { { #[allow(clippy::disallowed_methods)] self.spawn(|| { + call_thread_start_hooks(); add_thread_name_to_map(); let res = f(); remove_thread_name_from_map(); @@ -426,6 +438,7 @@ impl ThreadBuildWrapper for tokio::runtime::Builder { { #[allow(clippy::disallowed_methods)] self.on_thread_start(move || { + call_thread_start_hooks(); add_thread_name_to_map(); f(); }) @@ -450,6 +463,7 @@ impl ThreadBuildWrapper for futures::executor::ThreadPoolBuilder { { #[allow(clippy::disallowed_methods)] self.after_start(move |_| { + call_thread_start_hooks(); add_thread_name_to_map(); f(); }) diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 05c245bd5a3..930185a1440 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -172,6 +172,7 @@ impl Runner for YatpPoolRunner { type TaskCell = TaskCell; fn start(&mut self, local: &mut Local) { + crate::sys::thread::call_thread_start_hooks(); crate::sys::thread::add_thread_name_to_map(); if let Some(props) = self.props.take() { crate::thread_group::set_properties(Some(props)); From b050f07c403d6bf63ea54b56e8adc0c542acb1fd Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 2 Mar 2023 09:43:10 +0800 Subject: [PATCH 559/676] integration test v2: introduce TestRaftKv2 (#14300) ref tikv/tikv#12842 introduce TestRaftKv2 Signed-off-by: SpadeA-Tang Co-authored-by: Xinye Tao --- components/test_raftstore-v2/src/cluster.rs | 106 +++++++++- 
components/test_raftstore-v2/src/server.rs | 185 ++++++++++++++++-- .../src/transport_simulate.rs | 4 + components/test_raftstore-v2/src/util.rs | 24 ++- components/test_raftstore/src/cluster.rs | 12 ++ src/server/mod.rs | 2 +- src/server/raftkv2/mod.rs | 1 + tests/failpoints/cases/test_conf_change.rs | 32 +-- tests/failpoints/cases/test_snap.rs | 15 +- tests/integrations/raftstore/test_snap.rs | 97 +-------- 10 files changed, 348 insertions(+), 130 deletions(-) diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 30d3456d652..164794aca56 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -27,7 +27,7 @@ use kvproto::{ AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, Request, Response, StatusCmdType, }, - raft_serverpb::{PeerState, RaftApplyState, RegionLocalState, StoreIdent}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, }; use pd_client::PdClient; use raftstore::{ @@ -53,8 +53,13 @@ use test_raftstore::{ }; use tikv::server::Result as ServerResult; use tikv_util::{ - box_err, box_try, debug, error, safe_panic, thread_group::GroupProperties, time::Instant, - timer::GLOBAL_TIMER_HANDLE, warn, worker::LazyWorker, HandyRwLock, + box_err, box_try, debug, error, safe_panic, + thread_group::GroupProperties, + time::{Instant, ThreadReadId}, + timer::GLOBAL_TIMER_HANDLE, + warn, + worker::LazyWorker, + HandyRwLock, }; use crate::create_test_engine; @@ -314,6 +319,17 @@ impl Cluster { self.cfg.server.cluster_id } + pub fn flush_data(&self) { + for reg in self.tablet_registries.values() { + reg.for_each_opened_tablet(|_, cached| -> bool { + if let Some(tablet) = cached.latest() { + tablet.flush_cf(CF_DEFAULT, true /* sync */).unwrap(); + } + true + }); + } + } + // Bootstrap the store with fixed ID (like 1, 2, .. 5) and // initialize first region in all stores, then start the cluster. 
pub fn run(&mut self) { @@ -569,6 +585,22 @@ impl Cluster { ) } + pub fn read( + &self, + // v2 does not need this + _batch_id: Option, + request: RaftCmdRequest, + timeout: Duration, + ) -> Result { + match self.sim.wl().read(request.clone(), timeout) { + Err(e) => { + warn!("failed to read {:?}: {:?}", request, e); + Err(e) + } + a => a, + } + } + // mixed read and write requests are not supportted pub fn call_command( &mut self, @@ -1111,10 +1143,18 @@ impl Cluster { self.sim.wl().add_send_filter(node_id, filter); } + pub fn clear_send_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_send_filters(node_id); + } + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { self.sim.wl().add_recv_filter(node_id, filter); } + pub fn clear_recv_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_recv_filters(node_id); + } + pub fn add_send_filter(&self, factory: F) { let mut sim = self.sim.wl(); for node_id in sim.get_node_ids() { @@ -1312,6 +1352,10 @@ impl Cluster { self.sim.rl().get_snap_dir(node_id) } + pub fn get_router(&self, node_id: u64) -> Option> { + self.sim.rl().get_router(node_id) + } + pub fn refresh_region_bucket_keys( &mut self, _region: &metapb::Region, @@ -1330,6 +1374,58 @@ impl Cluster { unimplemented!() } + pub fn wait_tombstone(&self, region_id: u64, peer: metapb::Peer, check_exist: bool) { + let timer = Instant::now(); + let mut state; + loop { + state = self.region_local_state(region_id, peer.get_store_id()); + if state.get_state() == PeerState::Tombstone + && (!check_exist || state.get_region().get_peers().contains(&peer)) + { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + break; + } + thread::sleep(Duration::from_millis(10)); + } + panic!( + "{:?} is still not gc in region {} {:?}", + peer, region_id, state + ); + } + + pub fn wait_destroy_and_clean(&self, region_id: u64, peer: metapb::Peer) { + let timer = Instant::now(); + self.wait_tombstone(region_id, peer.clone(), false); 
+ let mut state; + loop { + state = self.get_raft_local_state(region_id, peer.get_store_id()); + if state.is_none() { + return; + } + if timer.saturating_elapsed() > Duration::from_secs(5) { + break; + } + thread::sleep(Duration::from_millis(10)); + } + panic!( + "{:?} is still not cleaned in region {} {:?}", + peer, region_id, state + ); + } + + pub fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { + self.get_engine(store_id) + .get_region_state(region_id) + .unwrap() + .unwrap() + } + + pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { + self.get_engine(store_id).get_raft_local_state(region_id) + } + pub fn shutdown(&mut self) { debug!("about to shutdown cluster"); let keys = match self.sim.read() { @@ -1427,6 +1523,10 @@ impl WrapFactory { pub fn get_apply_state(&self, region_id: u64) -> engine_traits::Result> { self.raft_engine.get_apply_state(region_id, u64::MAX) } + + pub fn get_raft_local_state(&self, region_id: u64) -> Option { + self.raft_engine.get_raft_state(region_id).unwrap() + } } impl Peekable for WrapFactory { diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 1aa3bfc47f8..dbcede48a6a 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -25,6 +25,7 @@ use kvproto::{ kvrpcpb::{ApiVersion, Context}, metapb, raft_cmdpb::RaftCmdResponse, + raft_serverpb::RaftMessage, tikvpb_grpc::TikvClient, }; use pd_client::PdClient; @@ -32,8 +33,8 @@ use raftstore::{ coprocessor::CoprocessorHost, errors::Error as RaftError, store::{ - AutoSplitController, CheckLeaderRunner, FlowStatsReporter, ReadStats, RegionSnapshot, - TabletSnapManager, WriteStats, + region_meta, AutoSplitController, CheckLeaderRunner, FlowStatsReporter, ReadStats, + RegionSnapshot, TabletSnapManager, WriteStats, }, RegionInfoAccessor, }; @@ -44,7 +45,7 @@ use security::SecurityManager; use slog_global::debug; use 
tempfile::TempDir; use test_pd_client::TestPdClient; -use test_raftstore::{AddressMap, Config}; +use test_raftstore::{filter_send, AddressMap, Config, Filter}; use tikv::{ coprocessor, coprocessor_v2, import::SstImporter, @@ -52,12 +53,12 @@ use tikv::{ server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, resolve, service::DiagnosticsService, ConnectionBuilder, - Error, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, Result as ServerResult, Server, - ServerTransport, + Error, Extension, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, Result as ServerResult, + Server, ServerTransport, }, storage::{ self, - kv::{FakeExtension, SnapContext}, + kv::{FakeExtension, RaftExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, Storage, }, @@ -84,16 +85,159 @@ impl FlowStatsReporter for DummyReporter { fn report_write_stats(&self, _write_stats: WriteStats) {} } -type SimulateRaftExtension = ::RaftExtension; +type SimulateRaftExtension = ::RaftExtension; type SimulateStoreTransport = SimulateTransport>; type SimulateServerTransport = SimulateTransport>; pub type SimulateEngine = RaftKv2; +// TestRaftKvv2 behaves the same way with RaftKv2, except that it has filters +// that can mock various network conditions. 
+#[derive(Clone)] +pub struct TestRaftKv2 { + raftkv: SimulateEngine, + filters: Arc>>>, +} + +impl TestRaftKv2 { + pub fn new(raftkv: SimulateEngine, filters: Arc>>>) -> TestRaftKv2 { + TestRaftKv2 { raftkv, filters } + } + + pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { + self.raftkv.set_txn_extra_scheduler(txn_extra_scheduler); + } +} + +impl Engine for TestRaftKv2 { + type Snap = RegionSnapshot<::Snapshot>; + type Local = RocksEngine; + + fn kv_engine(&self) -> Option { + self.raftkv.kv_engine() + } + + type RaftExtension = TestExtension; + fn raft_extension(&self) -> Self::RaftExtension { + TestExtension::new(self.raftkv.raft_extension(), self.filters.clone()) + } + + fn modify_on_kv_engine( + &self, + region_modifies: HashMap>, + ) -> storage::kv::Result<()> { + self.raftkv.modify_on_kv_engine(region_modifies) + } + + type SnapshotRes = ::SnapshotRes; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + self.raftkv.async_snapshot(ctx) + } + + type WriteRes = ::WriteRes; + fn async_write( + &self, + ctx: &Context, + batch: storage::kv::WriteData, + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.raftkv.async_write(ctx, batch, subscribed, on_applied) + } + + #[inline] + fn precheck_write_with_ctx(&self, ctx: &Context) -> storage::kv::Result<()> { + self.raftkv.precheck_write_with_ctx(ctx) + } + + #[inline] + fn schedule_txn_extra(&self, txn_extra: txn_types::TxnExtra) { + self.raftkv.schedule_txn_extra(txn_extra) + } +} + +#[derive(Clone)] +pub struct TestExtension { + extension: Extension, + filters: Arc>>>, +} + +impl TestExtension { + pub fn new( + extension: Extension, + filters: Arc>>>, + ) -> Self { + TestExtension { extension, filters } + } +} + +impl RaftExtension for TestExtension { + fn feed(&self, msg: RaftMessage, key_message: bool) { + let send = |msg| -> raftstore::Result<()> { + self.extension.feed(msg, key_message); + Ok(()) + }; + + let _ = filter_send(&self.filters, msg, 
send); + } + + #[inline] + fn report_reject_message(&self, region_id: u64, from_peer_id: u64) { + self.extension + .report_reject_message(region_id, from_peer_id) + } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + self.extension + .report_peer_unreachable(region_id, to_peer_id) + } + + #[inline] + fn report_store_unreachable(&self, store_id: u64) { + self.extension.report_store_unreachable(store_id) + } + + #[inline] + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: raft::SnapshotStatus, + ) { + self.extension + .report_snapshot_status(region_id, to_peer_id, status) + } + + #[inline] + fn report_resolved(&self, store_id: u64, group_id: u64) { + self.extension.report_resolved(store_id, group_id) + } + + #[inline] + fn split( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> futures::future::BoxFuture<'static, storage::kv::Result>> { + self.extension + .split(region_id, region_epoch, split_keys, source) + } + + fn query_region( + &self, + region_id: u64, + ) -> futures::future::BoxFuture<'static, storage::kv::Result> { + self.extension.query_region(region_id) + } +} + pub struct ServerMeta { node: NodeV2, - server: Server, + server: Server, sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: StoreRouter, @@ -105,7 +249,7 @@ type PendingServices = Vec Service>>; pub struct ServerCluster { metas: HashMap, addrs: AddressMap, - pub storages: HashMap, + pub storages: HashMap, pub region_info_accessors: HashMap, snap_paths: HashMap, snap_mgrs: HashMap, @@ -229,9 +373,10 @@ impl ServerCluster { let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); let sim_router = SimulateTransport::new(raft_router.clone()); - // todo(SpadeA): simulate transport - let mut raft_kv_v2 = - RaftKv2::new(raft_router.clone(), region_info_accessor.region_leaders()); + let mut raft_kv_v2 = TestRaftKv2::new( + 
RaftKv2::new(raft_router.clone(), region_info_accessor.region_leaders()), + sim_router.filters().clone(), + ); // Create storage. let pd_worker = LazyWorker::new("test-pd-worker"); @@ -544,12 +689,20 @@ impl Simulator for ServerCluster { .clear_filters(); } - fn add_recv_filter(&mut self, _node_id: u64, _filter: Box) { - unimplemented!() + fn add_recv_filter(&mut self, node_id: u64, filter: Box) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_router + .add_filter(filter); } - fn clear_recv_filters(&mut self, _node_id: u64) { - unimplemented!() + fn clear_recv_filters(&mut self, node_id: u64) { + self.metas + .get_mut(&node_id) + .unwrap() + .sim_router + .clear_filters(); } fn run_node( diff --git a/components/test_raftstore-v2/src/transport_simulate.rs b/components/test_raftstore-v2/src/transport_simulate.rs index b55c29dbd3a..9c11505d75f 100644 --- a/components/test_raftstore-v2/src/transport_simulate.rs +++ b/components/test_raftstore-v2/src/transport_simulate.rs @@ -42,6 +42,10 @@ impl SimulateTransport { pub fn add_filter(&mut self, filter: Box) { self.filters.wl().push(filter); } + + pub fn filters(&self) -> &Arc>>> { + &self.filters + } } impl Transport for SimulateTransport { diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 2f512982019..d9a0377210b 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -7,11 +7,12 @@ use engine_rocks::{RocksEngine, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; -use kvproto::kvrpcpb::Context; +use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; +use raftstore::Result; use rand::RngCore; use server::server2::ConfiguredRaftEngine; use tempfile::TempDir; -use test_raftstore::{new_put_cf_cmd, Config}; +use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, Config}; use tikv::{ 
server::KvEngineFactoryBuilder, storage::{ @@ -189,3 +190,22 @@ pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, regio } assert!(snapshot.ext().is_max_ts_synced()); } + +// Issue a read request on the specified peer. +pub fn read_on_peer( + cluster: &mut Cluster, + peer: metapb::Peer, + region: metapb::Region, + key: &[u8], + read_quorum: bool, + timeout: Duration, +) -> Result { + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(key)], + read_quorum, + ); + request.mut_header().set_peer(peer); + cluster.read(None, request, timeout) +} diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index d4668fe4928..2a73f5e239c 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1339,6 +1339,10 @@ impl Cluster { self.sim.wl().add_send_filter(node_id, filter); } + pub fn clear_send_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_send_filters(node_id); + } + pub fn add_recv_filter_on_node(&mut self, node_id: u64, filter: Box) { self.sim.wl().add_recv_filter(node_id, filter); } @@ -1352,6 +1356,10 @@ impl Cluster { } } + pub fn clear_recv_filter_on_node(&mut self, node_id: u64) { + self.sim.wl().clear_recv_filters(node_id); + } + pub fn transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { let epoch = self.get_region_epoch(region_id); let transfer_leader = new_admin_request(region_id, &epoch, new_transfer_leader_cmd(leader)); @@ -1830,6 +1838,10 @@ impl Cluster { ctx } + pub fn get_router(&self, node_id: u64) -> Option> { + self.sim.rl().get_router(node_id) + } + pub fn refresh_region_bucket_keys( &mut self, region: &metapb::Region, diff --git a/src/server/mod.rs b/src/server/mod.rs index 0bb6da62ac7..773e2040f17 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -36,7 +36,7 @@ pub use self::{ proxy::{build_forward_option, get_target_address, Proxy}, 
raft_client::{ConnectionBuilder, RaftClient}, raftkv::RaftKv, - raftkv2::{NodeV2, RaftKv2}, + raftkv2::{Extension, NodeV2, RaftKv2}, resolve::{PdStoreAddrResolver, StoreAddrResolver}, server::{Server, GRPC_THREAD_PREFIX}, transport::ServerTransport, diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 9fb4ef70b03..60e0a53a20a 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -15,6 +15,7 @@ use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; use futures::{Future, Stream, StreamExt}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; pub use node::NodeV2; +pub use raft_extension::Extension; use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; use raftstore_v2::{ router::{ diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index 7821c8be5df..0a1be37cab6 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -11,12 +11,14 @@ use kvproto::raft_serverpb::RaftMessage; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::ReadableDuration, HandyRwLock}; -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_destroy_local_reader() { // 3 nodes cluster. - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // Set election timeout and max leader lease to 1s. configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); @@ -141,10 +143,11 @@ fn test_write_after_destroy() { must_region_cleared(&engines_3, ®ion); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_tick_after_destroy() { // 3 nodes cluster. 
- let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); let pd_client = cluster.pd_client.clone(); @@ -186,10 +189,11 @@ fn test_tick_after_destroy() { must_get_equal(&cluster.get_engine(1), b"k2", b"v2"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_stale_peer_cache() { // 3 nodes cluster. - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.run(); // Now region 1 only has peer (1, 1); @@ -213,9 +217,10 @@ fn test_stale_peer_cache() { // 6. then peer 3 calling `Raft::apply_conf_change` to add peer 4; // 7. so the disk configuration `[1, 2, 3]` is different from memory // configuration `[1, 2, 3, 4]`. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_redundant_conf_change_by_snapshot() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(5); cluster.cfg.raft_store.merge_max_log_gap = 4; cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(20); @@ -239,7 +244,7 @@ fn test_redundant_conf_change_by_snapshot() { .direction(Direction::Recv) .msg_type(MessageType::MsgAppend), ); - cluster.sim.wl().add_recv_filter(3, filter); + cluster.add_recv_filter_on_node(3, filter); // propose to remove peer 4, and append more entries to compact raft logs. cluster.pd_client.must_remove_peer(1, new_peer(4, 4)); @@ -247,7 +252,7 @@ fn test_redundant_conf_change_by_snapshot() { sleep_ms(50); // Clear filters on peer 3, so it can receive and restore a snapshot. - cluster.sim.wl().clear_recv_filters(3); + cluster.clear_recv_filter_on_node(3); sleep_ms(100); // Use a filter to capture messages sent from 3 to 4. 
@@ -264,7 +269,7 @@ fn test_redundant_conf_change_by_snapshot() { .when(Arc::new(AtomicBool::new(false))) .set_msg_callback(cb), ); - cluster.sim.wl().add_send_filter(3, filter); + cluster.add_recv_filter_on_node(3, filter); // Unpause the fail point, so peer 3 can apply the redundant conf change result. fail::cfg("apply_on_conf_change_3_1", "off").unwrap(); @@ -275,9 +280,10 @@ fn test_redundant_conf_change_by_snapshot() { fail::remove("apply_on_conf_change_3_1"); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_handle_conf_change_when_apply_fsm_resume_pending_state() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index ca329896df1..64b03f6d0b3 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -15,6 +15,7 @@ use engine_traits::RaftEngineReadOnly; use kvproto::raft_serverpb::RaftMessage; use raft::eraftpb::MessageType; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv_util::{config::*, time::Instant, HandyRwLock}; #[test] @@ -384,9 +385,10 @@ fn test_shutdown_when_snap_gc() { } // Test if a peer handle the old snapshot properly. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_receive_old_snapshot() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; @@ -420,7 +422,7 @@ fn test_receive_old_snapshot() { .msg_type(MessageType::MsgSnapshot) .reserve_dropped(Arc::clone(&dropped_msgs)), ); - cluster.sim.wl().add_recv_filter(2, recv_filter); + cluster.add_recv_filter_on_node(2, recv_filter); cluster.clear_send_filters(); for _ in 0..20 { @@ -440,17 +442,18 @@ fn test_receive_old_snapshot() { std::mem::take(guard.as_mut()) }; - cluster.sim.wl().clear_recv_filters(2); + cluster.clear_recv_filter_on_node(2); for i in 20..40 { cluster.must_put(format!("k{}", i).as_bytes(), b"v1"); } must_get_equal(&cluster.get_engine(2), b"k39", b"v1"); - let router = cluster.sim.wl().get_router(2).unwrap(); + let router = cluster.get_router(2).unwrap(); // Send the old snapshot for raft_msg in msgs { - router.send_raft_message(raft_msg).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(raft_msg.into()).unwrap(); } cluster.must_put(b"k40", b"v1"); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index a69a2216cd4..4d9290b4eff 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -211,55 +211,11 @@ fn test_server_snap_gc() { test_server_snap_gc_internal("5.1.0"); } -/// A helper function for testing the handling of snapshot is correct -/// when there are multiple snapshots which have overlapped region ranges -/// arrive at the same raftstore. -fn test_concurrent_snap(cluster: &mut Cluster) { - cluster.cfg.rocksdb.titan.enabled = true; - // Disable raft log gc in this test case. 
- cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer count check. - pd_client.disable_default_operator(); - - let r1 = cluster.run_conf_change(); - cluster.must_put(b"k1", b"v1"); - pd_client.must_add_peer(r1, new_peer(2, 2)); - // Force peer 2 to be followers all the way. - cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(r1, 2) - .msg_type(MessageType::MsgRequestVote) - .direction(Direction::Send), - )); - cluster.must_transfer_leader(r1, new_peer(1, 1)); - cluster.must_put(b"k3", b"v3"); - // Pile up snapshots of overlapped region ranges and deliver them all at once. - let (tx, rx) = mpsc::channel(); - cluster - .sim - .wl() - .add_recv_filter(3, Box::new(CollectSnapshotFilter::new(tx))); - pd_client.must_add_peer(r1, new_peer(3, 3)); - let region = cluster.get_region(b"k1"); - // Ensure the snapshot of range ("", "") is sent and piled in filter. - if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { - panic!("the snapshot is not sent before split, e: {:?}", e); - } - // Split the region range and then there should be another snapshot for the - // split ranges. - cluster.must_split(®ion, b"k2"); - must_get_equal(&cluster.get_engine(3), b"k3", b"v3"); - // Ensure the regions work after split. 
- cluster.must_put(b"k11", b"v11"); - must_get_equal(&cluster.get_engine(3), b"k11", b"v11"); - cluster.must_put(b"k4", b"v4"); - must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); -} - #[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] -fn test_node_concurrent_snap() { +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_concurrent_snap() { let mut cluster = new_cluster(0, 3); // Test that the handling of snapshot is correct when there are multiple // snapshots which have overlapped region ranges arrive at the same @@ -303,12 +259,6 @@ fn test_node_concurrent_snap() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } -#[test] -fn test_server_concurrent_snap() { - let mut cluster = new_server_cluster(0, 3); - test_concurrent_snap(&mut cluster); -} - #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] @@ -389,9 +339,10 @@ impl Filter for StaleSnap { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_stale_snap() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // disable compact log to make snapshot only be sent when peer is first added. cluster.cfg.raft_store.raft_log_gc_threshold = 1000; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); @@ -476,36 +427,10 @@ impl Filter for SnapshotAppendFilter { } } -// todo(SpadeA): to be removed when receive filter is supported on ServerCluster -// V2 -fn test_snapshot_with_append(cluster: &mut Cluster) { - configure_for_snapshot(&mut cluster.cfg); - - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer count check. - pd_client.disable_default_operator(); - cluster.run(); - - // In case of removing leader, let's transfer leader to some node first. 
- cluster.must_transfer_leader(1, new_peer(1, 1)); - pd_client.must_remove_peer(1, new_peer(4, 4)); - - let (tx, rx) = mpsc::channel(); - cluster - .sim - .wl() - .add_recv_filter(4, Box::new(SnapshotAppendFilter::new(tx))); - pd_client.add_peer(1, new_peer(4, 5)); - rx.recv_timeout(Duration::from_secs(3)).unwrap(); - cluster.must_put(b"k1", b"v1"); - cluster.must_put(b"k2", b"v2"); - let engine4 = cluster.get_engine(4); - must_get_equal(&engine4, b"k1", b"v1"); - must_get_equal(&engine4, b"k2", b"v2"); -} - #[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_node_snapshot_with_append() { let mut cluster = new_cluster(0, 4); configure_for_snapshot(&mut cluster.cfg); @@ -530,12 +455,6 @@ fn test_node_snapshot_with_append() { must_get_equal(&engine4, b"k2", b"v2"); } -#[test] -fn test_server_snapshot_with_append() { - let mut cluster = new_server_cluster(0, 4); - test_snapshot_with_append(&mut cluster); -} - #[test] fn test_inspected_snapshot() { let mut cluster = new_server_cluster(1, 3); From 69dba51a41f1d24a7740e64ad1cb1725a93c29e1 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 2 Mar 2023 13:49:10 +0800 Subject: [PATCH 560/676] raftstore-v2: reduce file count (#14318) close tikv/tikv#14306, close tikv/tikv#14316, close tikv/tikv#14324 Compaction guard is disabled in v2, which will use 8MiB for file size. We need set multiplier to reduce sst file count. This PR also fixes a race between region creation and destroy. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 6 +-- components/raftstore-v2/src/fsm/peer.rs | 33 +++++++++++- .../raftstore-v2/src/operation/command/mod.rs | 12 ++--- components/raftstore-v2/src/operation/life.rs | 5 +- components/test_util/src/lib.rs | 36 +++++++++++++ etc/config-template.toml | 7 ++- src/config/mod.rs | 50 ++++++++++++++----- tests/integrations/config/mod.rs | 46 ++++------------- tests/integrations/config/test-custom.toml | 5 ++ 9 files changed, 138 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2bd382ee8f0..ea1ebcfbb3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2862,7 +2862,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" +source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2881,7 +2881,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" +source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" dependencies = [ "bzip2-sys", "cc", @@ -4799,7 +4799,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#b2cd42588ac62e40e297fea56a2286c0c389aade" +source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 814a0b1311a..2c47ab165f2 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,6 +7,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, 
RaftEngine, TabletRegistry}; +use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{Config, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ @@ -18,7 +19,7 @@ use tikv_util::{ use crate::{ batch::StoreContext, raft::{Peer, Storage}, - router::{PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick, QueryResult}, Result, }; @@ -335,3 +336,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.schedule_pending_ticks(); } } + +impl Drop for PeerFsm { + fn drop(&mut self) { + self.peer_mut().pending_reads_mut().clear_all(None); + + let region_id = self.peer().region_id(); + + let build_resp = || { + let mut err = errorpb::Error::default(); + err.set_message("region is not found".to_owned()); + err.mut_region_not_found().set_region_id(region_id); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp + }; + while let Ok(msg) = self.receiver.try_recv() { + match msg { + // Only these messages need to be responded explicitly as they rely on + // deterministic response. + PeerMsg::RaftQuery(query) => { + query.ch.set_result(QueryResult::Response(build_resp())); + } + PeerMsg::SimpleWrite(w) => { + w.ch.set_result(build_resp()); + } + _ => continue, + } + } + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index ea8c8c227d0..6cb4460428d 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -300,13 +300,13 @@ impl Peer { committed_time: Instant::now(), }; assert!( - self.apply_scheduler().is_some(), - "apply_scheduler should be something. 
region_id {}", - self.region_id() + self.apply_scheduler().is_some() || ctx.router.is_shutdown(), + "{} apply_scheduler should not be None", + SlogFormat(&self.logger) ); - self.apply_scheduler() - .unwrap() - .send(ApplyTask::CommittedEntries(apply)); + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::CommittedEntries(apply)); + } } #[inline] diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index fdba7efdf4d..9e9cc2f5fc0 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -618,18 +618,21 @@ impl Peer { pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); let region_id = self.region_id(); - ctx.router.close(region_id); { let mut meta = ctx.store_meta.lock().unwrap(); meta.remove_region(region_id); meta.readers.remove(®ion_id); ctx.tablet_registry.remove(region_id); } + // Remove tablet first, otherwise in extreme cases, a new peer can be created + // and race on tablet record removal and creation. + ctx.router.close(region_id); if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create a // new peer. Ignore error as it's just a best effort. let _ = ctx.router.send_raft_message(msg); } + self.pending_reads_mut().clear_all(Some(region_id)); self.clear_apply_scheduler(); } } diff --git a/components/test_util/src/lib.rs b/components/test_util/src/lib.rs index d2096e74c82..453ed7fb7f1 100644 --- a/components/test_util/src/lib.rs +++ b/components/test_util/src/lib.rs @@ -15,6 +15,7 @@ mod security; use std::{ env, + fmt::Debug, sync::atomic::{AtomicU16, Ordering}, thread, }; @@ -118,3 +119,38 @@ pub fn temp_dir(prefix: impl Into>, prefer_mem: bool) -> te _ => builder.tempdir().unwrap(), } } + +/// Compare two structs and provide more helpful debug difference. 
+#[track_caller] +pub fn assert_eq_debug(lhs: &C, rhs: &C) { + if lhs == rhs { + return; + } + let lhs_str = format!("{:?}", lhs); + let rhs_str = format!("{:?}", rhs); + + fn find_index(l: impl Iterator) -> usize { + let it = l + .enumerate() + .take_while(|(_, (l, r))| l == r) + .filter(|(_, (l, _))| *l == b' '); + let mut last = None; + let mut second = None; + for a in it { + second = last; + last = Some(a); + } + second.map_or(0, |(i, _)| i) + } + let cpl = find_index(lhs_str.bytes().zip(rhs_str.bytes())); + let csl = find_index(lhs_str.bytes().rev().zip(rhs_str.bytes().rev())); + if cpl + csl > lhs_str.len() || cpl + csl > rhs_str.len() { + assert_eq!(lhs, rhs); + } + let lhs_diff = String::from_utf8_lossy(&lhs_str.as_bytes()[cpl..lhs_str.len() - csl]); + let rhs_diff = String::from_utf8_lossy(&rhs_str.as_bytes()[cpl..rhs_str.len() - csl]); + panic!( + "config not matched:\nlhs: ...{}...,\nrhs: ...{}...", + lhs_diff, rhs_diff + ); +} diff --git a/etc/config-template.toml b/etc/config-template.toml index 38082367d40..9b9a81d4106 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -739,8 +739,10 @@ ## Target file size for compaction. ## The SST file size of level-0 is influenced by the compaction algorithm of `write-buffer-size` ## and level0. `target-file-size-base` is used to control the size of a single SST file of level1 to -## level6. +## level6. Each level will have `target-file-size-base * (target-file-size-multiplier ^ (level - 1))`. # target-file-size-base = "8MB" +## In partitioned-raft-kv, the default value of target-file-size-multiplier is 2 for write and default cf. +# target-file-size-multiplier = 1 ## Max bytes for `compaction.max_compaction_bytes`. ## If it's necessary to enlarge value of this entry, it's better to also enlarge `reserve-space` @@ -925,6 +927,7 @@ ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. 
# max-bytes-for-level-base = "512MB" # target-file-size-base = "8MB" +# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 @@ -953,6 +956,7 @@ # min-write-buffer-number-to-merge = 1 # max-bytes-for-level-base = "128MB" # target-file-size-base = "8MB" +# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 1 # level0-slowdown-writes-trigger = 20 # level0-stop-writes-trigger = 20 @@ -1014,6 +1018,7 @@ ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. # max-bytes-for-level-base = "512MB" # target-file-size-base = "8MB" +# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 diff --git a/src/config/mod.rs b/src/config/mod.rs index dff0fcb2436..0f97487edcf 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -321,6 +321,7 @@ macro_rules! cf_config { pub min_write_buffer_number_to_merge: i32, pub max_bytes_for_level_base: ReadableSize, pub target_file_size_base: ReadableSize, + pub target_file_size_multiplier: i32, pub level0_file_num_compaction_trigger: i32, pub level0_slowdown_writes_trigger: Option, pub level0_stop_writes_trigger: Option, @@ -572,6 +573,9 @@ macro_rules! 
build_cf_opt { cf_opts.set_min_write_buffer_number_to_merge($opt.min_write_buffer_number_to_merge); cf_opts.set_max_bytes_for_level_base($opt.max_bytes_for_level_base.0); cf_opts.set_target_file_size_base($opt.target_file_size_base.0); + if $opt.target_file_size_multiplier != 0 { + cf_opts.set_target_file_size_multiplier($opt.target_file_size_multiplier); + } cf_opts.set_level_zero_file_num_compaction_trigger($opt.level0_file_num_compaction_trigger); cf_opts.set_level_zero_slowdown_writes_trigger( $opt.level0_slowdown_writes_trigger.unwrap_or_default(), @@ -659,6 +663,7 @@ impl Default for DefaultCfConfig { min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), + target_file_size_multiplier: 0, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -784,6 +789,7 @@ impl Default for WriteCfConfig { min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), + target_file_size_multiplier: 0, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -890,6 +896,7 @@ impl Default for LockCfConfig { min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(128), target_file_size_base: ReadableSize::mb(8), + target_file_size_multiplier: 0, level0_file_num_compaction_trigger: 1, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -973,6 +980,7 @@ impl Default for RaftCfConfig { min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(128), target_file_size_base: ReadableSize::mb(8), + target_file_size_multiplier: 0, level0_file_num_compaction_trigger: 1, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -1233,6 +1241,16 @@ impl DbConfig { self.write_buffer_limit.get_or_insert(ReadableSize( (total_mem * 
WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, )); + if self.writecf.enable_compaction_guard != Some(true) + && self.writecf.target_file_size_multiplier == 0 + { + self.writecf.target_file_size_multiplier = 2; + } + if self.defaultcf.enable_compaction_guard != Some(true) + && self.defaultcf.target_file_size_multiplier == 0 + { + self.defaultcf.target_file_size_multiplier = 2; + } self.defaultcf.disable_write_stall = true; self.writecf.disable_write_stall = true; self.lockcf.disable_write_stall = true; @@ -1475,6 +1493,7 @@ impl Default for RaftDefaultCfConfig { min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), target_file_size_base: ReadableSize::mb(8), + target_file_size_multiplier: 0, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -4182,6 +4201,7 @@ mod tests { }; use slog::Level; use tempfile::Builder; + use test_util::assert_eq_debug; use tikv_kv::RocksEngine as RocksDBEngine; use tikv_util::{ config::VersionTrack, @@ -5001,25 +5021,25 @@ mod tests { Module::Quota, Box::new(QuotaLimitConfigManager::new(Arc::clone("a_limiter))), ); - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); // u64::MAX ns casts to 213503d. 
cfg_controller .update_config("quota.max-delay-duration", "213504d") .unwrap_err(); - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); cfg_controller .update_config("quota.foreground-cpu-time", "2000") .unwrap(); cfg.quota.foreground_cpu_time = 2000; - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); cfg_controller .update_config("quota.foreground-write-bandwidth", "256MB") .unwrap(); cfg.quota.foreground_write_bandwidth = ReadableSize::mb(256); - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); let mut sample = quota_limiter.new_sample(true); sample.add_read_bytes(ReadableSize::mb(32).0 as usize); @@ -5040,13 +5060,13 @@ mod tests { .update_config("quota.background-cpu-time", "2000") .unwrap(); cfg.quota.background_cpu_time = 2000; - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); cfg_controller .update_config("quota.background-write-bandwidth", "256MB") .unwrap(); cfg.quota.background_write_bandwidth = ReadableSize::mb(256); - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); let mut sample = quota_limiter.new_sample(false); sample.add_read_bytes(ReadableSize::mb(32).0 as usize); @@ -5057,7 +5077,7 @@ mod tests { .update_config("quota.background-read-bandwidth", "512MB") .unwrap(); cfg.quota.background_read_bandwidth = ReadableSize::mb(512); - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); let mut sample = quota_limiter.new_sample(false); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, false)); @@ -5067,7 +5087,7 @@ mod tests { .update_config("quota.max-delay-duration", "50ms") .unwrap(); cfg.quota.max_delay_duration = ReadableDuration::millis(50); - 
assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); let mut sample = quota_limiter.new_sample(true); sample.add_write_bytes(ReadableSize::mb(128).0 as usize); let should_delay = block_on(quota_limiter.consume_sample(sample, true)); @@ -5083,7 +5103,7 @@ mod tests { .update_config("quota.enable-auto-tune", "true") .unwrap(); cfg.quota.enable_auto_tune = true; - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); } #[test] @@ -5103,7 +5123,7 @@ mod tests { ); let check_cfg = |cfg: &TikvConfig| { - assert_eq!(&cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), cfg); assert_eq!(&*version_tracker.value(), &cfg.server); }; @@ -5117,7 +5137,7 @@ mod tests { .update_config("server.raft-msg-max-batch-size", "32") .unwrap(); cfg.server.raft_msg_max_batch_size = 32; - assert_eq!(cfg_controller.get_current(), cfg); + assert_eq_debug(&cfg_controller.get_current(), &cfg); check_cfg(&cfg); } @@ -5133,7 +5153,7 @@ mod tests { for _ in 0..10 { cfg.compatible_adjust(); cfg.validate().unwrap(); - assert_eq!(c, cfg); + assert_eq_debug(&c, &cfg); } } @@ -5552,6 +5572,10 @@ mod tests { Some(default_cfg.coprocessor.region_split_size() * 3 / 4 / ReadableSize::kb(1)); default_cfg.raft_store.region_split_check_diff = Some(default_cfg.coprocessor.region_split_size() / 16); + default_cfg.rocksdb.writecf.target_file_size_multiplier = 1; + default_cfg.rocksdb.defaultcf.target_file_size_multiplier = 1; + default_cfg.rocksdb.lockcf.target_file_size_multiplier = 1; + default_cfg.raftdb.defaultcf.target_file_size_multiplier = 1; // Other special cases. cfg.pd.retry_max_count = default_cfg.pd.retry_max_count; // Both -1 and isize::MAX are the same. 
@@ -5588,7 +5612,7 @@ mod tests { cfg.coprocessor .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); - assert_eq!(cfg, default_cfg); + assert_eq_debug(&cfg, &default_cfg); } #[test] diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 80cab3aca43..672fd79ee12 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -24,6 +24,7 @@ use raftstore::{ }; use security::SecurityConfig; use slog::Level; +use test_util::assert_eq_debug; use tikv::{ config::*, import::Config as ImportConfig, @@ -345,6 +346,7 @@ fn test_serde_custom_tikv_config() { min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), + target_file_size_multiplier: 3, level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -400,6 +402,7 @@ fn test_serde_custom_tikv_config() { min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), + target_file_size_multiplier: 3, level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -469,6 +472,7 @@ fn test_serde_custom_tikv_config() { min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), + target_file_size_multiplier: 3, level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -538,6 +542,7 @@ fn test_serde_custom_tikv_config() { min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), + target_file_size_multiplier: 3, level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -636,6 +641,7 @@ fn 
test_serde_custom_tikv_config() { min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), target_file_size_base: ReadableSize::kb(123), + target_file_size_multiplier: 3, level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -827,45 +833,11 @@ fn test_serde_custom_tikv_config() { let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); let load = toml::from_str(&custom).unwrap(); - if value != load { - diff_config(&value, &load); - } + assert_eq_debug(&value, &load); + let dump = toml::to_string_pretty(&load).unwrap(); let load_from_dump = toml::from_str(&dump).unwrap(); - if load != load_from_dump { - diff_config(&load, &load_from_dump); - } -} - -#[track_caller] -fn diff_config(lhs: &TikvConfig, rhs: &TikvConfig) { - let lhs_str = format!("{:?}", lhs); - let rhs_str = format!("{:?}", rhs); - - fn find_index(l: impl Iterator) -> usize { - let it = l - .enumerate() - .take_while(|(_, (l, r))| l == r) - .filter(|(_, (l, _))| *l == b' '); - let mut last = None; - let mut second = None; - for a in it { - second = last; - last = Some(a); - } - second.map_or(0, |(i, _)| i) - } - let cpl = find_index(lhs_str.bytes().zip(rhs_str.bytes())); - let csl = find_index(lhs_str.bytes().rev().zip(rhs_str.bytes().rev())); - if cpl + csl > lhs_str.len() || cpl + csl > rhs_str.len() { - assert_eq!(lhs, rhs); - } - let lhs_diff = String::from_utf8_lossy(&lhs_str.as_bytes()[cpl..lhs_str.len() - csl]); - let rhs_diff = String::from_utf8_lossy(&rhs_str.as_bytes()[cpl..rhs_str.len() - csl]); - panic!( - "config not matched:\nlhs: ...{}...,\nrhs: ...{}...", - lhs_diff, rhs_diff - ); + assert_eq_debug(&load, &load_from_dump); } #[test] diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index d79ec7899e2..f8931cbddac 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml 
@@ -309,6 +309,7 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" +target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -373,6 +374,7 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" +target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -423,6 +425,7 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" +target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -473,6 +476,7 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" +target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -555,6 +559,7 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" +target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 From 4f2430d7265d126ced0402342de5d6f16e0bc158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 2 Mar 2023 17:45:10 +0800 Subject: [PATCH 561/676] sst_importer: add wire extra bytes into the packed size (#14312) close tikv/tikv#14313 Signed-off-by: hillium Co-authored-by: Xinye Tao --- src/import/sst_service.rs | 59 +++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/import/sst_service.rs 
b/src/import/sst_service.rs index b589da50b76..02e7297bea8 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -47,6 +47,19 @@ use crate::{ }; const MAX_INFLIGHT_RAFT_MSGS: usize = 64; +/// The extra bytes required by the wire encoding. +/// Generally, a field (and a embedded message) would introduce 2 extra +/// bytes. In detail, they are: +/// - 2 bytes for the request type (Tag+Value). +/// - 2 bytes for every string or bytes field (Tag+Length), they are: +/// . + the key field +/// . + the value field +/// . + the CF field (None for CF_DEFAULT) +/// - 2 bytes for the embedded message field `PutRequest` (Tag+Length). +/// In fact, the length field is encoded by varint, which may grow when the +/// content length is greater than 128, however when the length is greater than +/// 128, the extra 1~4 bytes can be ignored. +const WIRE_EXTRA_BYTES: usize = 10; fn transfer_error(err: storage::Error) -> ImportPbError { let mut e = ImportPbError::default(); @@ -104,6 +117,20 @@ struct RequestCollector { } impl RequestCollector { + fn record_size_of_message(&mut self, size: usize) { + // We make a raft command entry when we unpacked size grows to 7/8 of the max + // raft entry size. + // + // Which means, if we don't add the extra bytes, when the amplification by the + // extra bytes is greater than 8/7 (i.e. the average size of entry is + // less than 70B), we may encounter the "raft entry is too large" error. 
+ self.unpacked_size += size + WIRE_EXTRA_BYTES; + } + + fn release_message_of_size(&mut self, size: usize) { + self.unpacked_size -= size + WIRE_EXTRA_BYTES; + } + fn new(max_raft_req_size: usize) -> Self { Self { max_raft_req_size, @@ -162,19 +189,19 @@ impl RequestCollector { .map(|(_, old_ts)| *old_ts < ts.into_inner()) .unwrap_or(true) { - self.unpacked_size += m.size(); + self.record_size_of_message(m.size()); if let Some((v, _)) = self .write_reqs .insert(encoded_key.to_owned(), (m, ts.into_inner())) { - self.unpacked_size -= v.size(); + self.release_message_of_size(v.size()) } } } CF_DEFAULT => { - self.unpacked_size += m.size(); + self.record_size_of_message(m.size()); if let Some(v) = self.default_reqs.insert(k.as_encoded().clone(), m) { - self.unpacked_size -= v.size(); + self.release_message_of_size(v.size()); } } _ => unreachable!(), @@ -193,7 +220,7 @@ impl RequestCollector { self.write_reqs.drain().map(|(_, (m, _))| m).collect() }; for r in &res { - self.unpacked_size -= r.size(); + self.release_message_of_size(r.size()); } res } @@ -1066,6 +1093,8 @@ mod test { use std::collections::HashMap; use engine_traits::{CF_DEFAULT, CF_WRITE}; + use kvproto::raft_cmdpb::Request; + use protobuf::Message; use tikv_kv::Modify; use txn_types::{Key, TimeStamp, Write, WriteType}; @@ -1249,4 +1278,24 @@ mod test { assert_eq!(reqs, reqs_result); assert!(request_collector.is_empty()); } + + #[test] + fn test_collector_size() { + let mut request_collector = RequestCollector::new(1024); + + for i in 0..100u64 { + request_collector.accept(CF_DEFAULT, default_req(&i.to_ne_bytes(), b"egg", i)); + } + + let pws = request_collector.pending_writes; + for w in pws { + let req_size = w + .modifies + .into_iter() + .map(Request::from) + .map(|x| x.compute_size()) + .sum::(); + assert!(req_size < 1024, "{}", req_size); + } + } } From d74fd1325280999b367424ee332c49e11bbf80b0 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Thu, 2 Mar 2023 20:05:10 +0800 Subject: 
[PATCH 562/676] raftstore-v2: store heartbeat supports write keys and bytes. (#14271) ref tikv/tikv#12842 1. store heartbeat supports write keys and bytes. Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 13 ++++++- .../raftstore-v2/src/operation/command/mod.rs | 2 + .../operation/command/write/simple_write.rs | 5 +++ components/raftstore-v2/src/operation/pd.rs | 16 +++++++- .../tests/integrations/test_pd_heartbeat.rs | 37 ++++++++++++++----- components/raftstore/src/store/fsm/mod.rs | 1 + components/raftstore/src/store/fsm/store.rs | 12 +++--- 7 files changed, 67 insertions(+), 19 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 4693b0db369..83fa6b7a018 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -24,7 +24,10 @@ use raft::{StateRole, INVALID_ID}; use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent}, store::{ - fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, + fsm::{ + store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, + GlobalStoreStat, LocalStoreStat, + }, local_metrics::RaftMetrics, AutoSplitController, Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, TabletSnapManager, Transport, WriteSenders, @@ -85,6 +88,8 @@ pub struct StoreContext { pub self_disk_usage: DiskUsage, pub snap_mgr: TabletSnapManager, + pub global_stat: GlobalStoreStat, + pub store_stat: LocalStoreStat, pub sst_importer: Arc, } @@ -162,6 +167,7 @@ impl StorePoller { fn flush_events(&mut self) { self.schedule_ticks(); self.poll_ctx.raft_metrics.maybe_flush(); + self.poll_ctx.store_stat.flush(); } fn schedule_ticks(&mut self) { @@ -279,6 +285,7 @@ struct StorePollerBuilder { store_meta: Arc>>, shutdown: Arc, snap_mgr: TabletSnapManager, + global_stat: GlobalStoreStat, sst_importer: Arc, } @@ -308,6 +315,7 @@ impl 
StorePollerBuilder { .after_start(move || set_io_type(IoType::ForegroundWrite)) .name_prefix("apply") .build_future_pool(); + let global_stat = GlobalStoreStat::default(); StorePollerBuilder { cfg, store_id, @@ -322,6 +330,7 @@ impl StorePollerBuilder { snap_mgr, shutdown, coprocessor_host, + global_stat, sst_importer, } } @@ -440,6 +449,8 @@ where self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), coprocessor_host: self.coprocessor_host.clone(), + global_stat: self.global_stat.clone(), + store_stat: self.global_stat.local(), sst_importer: self.sst_importer.clone(), }; poll_ctx.update_ticks_timeout(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 6cb4460428d..df289a26f4a 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -365,6 +365,8 @@ impl Peer { .add_bucket_flow(&apply_res.bucket_stat); self.update_split_flow_control(&apply_res.metrics); self.update_stat(&apply_res.metrics); + ctx.store_stat.engine_total_bytes_written += apply_res.metrics.written_bytes; + ctx.store_stat.engine_total_keys_written += apply_res.metrics.written_keys; self.raft_group_mut() .advance_apply_to(apply_res.applied_index); diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index cf267f854b7..a2c378cb04b 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -30,6 +30,11 @@ impl SimpleWriteBinary { pub fn freeze(&mut self) { self.write_type = WriteType::Unspecified; } + + #[inline] + pub fn data_size(&self) -> usize { + self.buf.len() + } } /// We usually use `RaftCmdRequest` for read write request. 
But the codec is diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 17abdd85cf0..4fd21a32488 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -2,6 +2,8 @@ //! This module implements the interactions with pd. +use std::sync::atomic::Ordering; + use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; use kvproto::{metapb, pdpb}; @@ -47,8 +49,18 @@ impl Store { stats.set_start_time(self.start_time().unwrap() as u32); - stats.set_bytes_written(0); - stats.set_keys_written(0); + stats.set_bytes_written( + ctx.global_stat + .stat + .engine_total_bytes_written + .swap(0, Ordering::Relaxed), + ); + stats.set_keys_written( + ctx.global_stat + .stat + .engine_total_keys_written + .swap(0, Ordering::Relaxed), + ); stats.set_is_busy(false); // TODO: add query stats let task = pd::Task::StoreHeartbeat { stats }; diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index b9dea63bbfe..679183735b6 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -8,7 +8,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; use pd_client::PdClient; use raftstore::coprocessor::Config as CopConfig; use raftstore_v2::{ - router::{PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick, StoreMsg, StoreTick}, SimpleWriteEncoder, }; use tikv_util::{config::ReadableSize, store::new_peer}; @@ -54,18 +54,35 @@ fn test_region_heartbeat() { #[test] fn test_store_heartbeat() { + let region_id = 2; let cluster = Cluster::with_node_count(1, None); let store_id = cluster.node(0).id(); - for _ in 0..5 { - let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); - if stats.get_start_time() > 0 { - assert_ne!(stats.get_capacity(), 0); - 
assert_ne!(stats.get_used_size(), 0); - return; - } - std::thread::sleep(std::time::Duration::from_millis(50)); + let router = &cluster.routers[0]; + // load data to split bucket. + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); + let data = put.encode(); + let write_bytes = data.data_size(); + let (msg, sub) = PeerMsg::simple_write(header, data); + router.send(region_id, msg).unwrap(); + let _resp = block_on(sub.result()).unwrap(); + + // report store heartbeat to pd. + std::thread::sleep(std::time::Duration::from_millis(50)); + router + .store_router() + .send_control(StoreMsg::Tick(StoreTick::PdStoreHeartbeat)) + .unwrap(); + std::thread::sleep(std::time::Duration::from_millis(50)); + + let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); + if stats.get_start_time() > 0 { + assert_ne!(stats.get_capacity(), 0); + assert_ne!(stats.get_used_size(), 0); + assert_eq!(stats.get_keys_written(), 1); + assert!(stats.get_bytes_written() > write_bytes.try_into().unwrap()); } - panic!("failed to get store stats"); } #[test] diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index b481caf4f74..ffba120056c 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -16,6 +16,7 @@ pub use self::{ ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, Registration, SwitchWitness, TaskRes as ApplyTaskRes, }, + metrics::{GlobalStoreStat, LocalStoreStat}, peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ create_raft_batch_system, RaftBatchSystem, RaftPollerBuilder, RaftRouter, StoreInfo, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 4b9e69f9763..4fafc049bee 100644 --- 
a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -2479,14 +2479,14 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .engine_total_bytes_written - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_keys_written( self.ctx .global_stat .stat .engine_total_keys_written - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_is_busy( @@ -2494,7 +2494,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .is_busy - .swap(false, Ordering::SeqCst), + .swap(false, Ordering::Relaxed), ); let mut query_stats = QueryStats::default(); @@ -2503,21 +2503,21 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .global_stat .stat .engine_total_query_put - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); query_stats.set_delete( self.ctx .global_stat .stat .engine_total_query_delete - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); query_stats.set_delete_range( self.ctx .global_stat .stat .engine_total_query_delete_range - .swap(0, Ordering::SeqCst), + .swap(0, Ordering::Relaxed), ); stats.set_query_stats(query_stats); From 8bfa12dea2d5bd0918b35e5ae041676aeeee77ab Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Sat, 4 Mar 2023 23:13:10 +0800 Subject: [PATCH 563/676] raftstore-v2: add some missing metrics (#14326) ref tikv/tikv#12842 None Signed-off-by: tabokie --- .../operation/command/admin/compact_log.rs | 28 +- .../raftstore/src/store/worker/metrics.rs | 6 - metrics/alertmanager/tikv.rules.yml | 4 +- metrics/grafana/performance_write.json | 2 +- metrics/grafana/tikv_details.json | 390 +----------------- metrics/grafana/tikv_summary.json | 2 +- 6 files changed, 36 insertions(+), 396 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs 
b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index af61434041a..1ce118a957f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -18,7 +18,8 @@ use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequ use protobuf::Message; use raftstore::{ store::{ - fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + fsm::new_admin_request, metrics::REGION_MAX_LOG_LAG, needs_evict_entry_cache, Transport, + WriteTask, RAFT_INIT_LOG_INDEX, }, Result, }; @@ -167,6 +168,7 @@ impl Peer { last_idx, replicated_idx ); + REGION_MAX_LOG_LAG.observe((last_idx - replicated_idx) as f64); } // leader may call `get_term()` on the latest replicated index, so compact @@ -182,13 +184,19 @@ impl Peer { >= store_ctx.cfg.raft_log_gc_size_limit().0 { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) - } else if replicated_idx < first_idx - || last_idx - first_idx < 3 - || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold - && self - .compact_log_context_mut() - .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + } else if replicated_idx < first_idx || last_idx - first_idx < 3 { + store_ctx.raft_metrics.raft_log_gc_skipped.reserve_log.inc(); + return; + } else if replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold + && self + .compact_log_context_mut() + .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) { + store_ctx + .raft_metrics + .raft_log_gc_skipped + .threshold_limit + .inc(); return; } else { replicated_idx @@ -197,6 +205,12 @@ impl Peer { // Have no idea why subtract 1 here, but original code did this by magic. compact_idx -= 1; if compact_idx < first_idx { + // In case compact_idx == first_idx before subtraction. 
+ store_ctx + .raft_metrics + .raft_log_gc_skipped + .compact_idx_too_small + .inc(); return; } diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index e6c3c505cdf..36a217be607 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -191,12 +191,6 @@ lazy_static! { "Total number of seek operations from raft log gc." ) .unwrap(); - pub static ref RAFT_LOG_GC_DELETED_KEYS_HISTOGRAM: Histogram = register_histogram!( - "tikv_raftstore_raft_log_gc_deleted_keys", - "Bucket of number of deleted keys from raft log gc.", - exponential_buckets(1.0, 2.0, 20).unwrap() - ) - .unwrap(); pub static ref RAFT_LOG_GC_FAILED: IntCounter = register_int_counter!( "tikv_raftstore_raft_log_gc_failed", "Total number of failed raft log gc." diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index 19f8085866e..e43ca401d42 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -110,12 +110,12 @@ groups: summary: TiKV coprocessor request wait seconds more than 10s - alert: TiKV_raftstore_thread_cpu_seconds_total - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance) > 1.6 + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"(raftstore|rs)_.*"}[1m])) by (instance) > 1.6 for: 1m labels: env: ENV_LABELS_ENV level: critical - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance) > 1.6 + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"(raftstore|rs)_.*"}[1m])) by (instance) > 1.6 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' diff --git a/metrics/grafana/performance_write.json b/metrics/grafana/performance_write.json index ddb9621b97a..695e96725c3 100644 --- a/metrics/grafana/performance_write.json +++ b/metrics/grafana/performance_write.json @@ 
-3104,7 +3104,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 334c3c119f7..ead1e842d0c 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -6000,7 +6000,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", @@ -16605,116 +16605,6 @@ "y": 24 }, "hiddenSeries": false, - "id": 12972, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_deleted_keys_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) 
by (instance)", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "tikv_raftstore_check_split_total", - "refId": "A", - "step": 4 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Raft log GC deleted keys", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", - "editable": true, - "error": false, - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "grid": {}, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 32 - }, - "hiddenSeries": false, "id": 12884, "legend": { "alignAsTable": true, @@ -16801,264 +16691,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 32 - }, - "hiddenSeries": false, - "id": 12973, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 
1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_log_gc_deleted_keys_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "99% - {{instance}}", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_raft_log_gc_deleted_keys_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "95% - {{instance}}", - "refId": "B", - "step": 10 - }, - { - "exemplar": true, - "expr": "sum(rate(tikv_raftstore_raft_log_gc_deleted_keys_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tikv_raftstore_raft_log_gc_deleted_keys_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "avg - {{instance}}", - "refId": "C", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Raft log GC write batch size", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": 
null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 40 - }, - "hiddenSeries": false, - "id": 12883, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "hideEmpty": false, - "hideZero": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "7.5.7", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "exemplar": true, - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_log_gc_purge_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "99% - {{instance}}", - "refId": "A", - "step": 10 - }, - { - "exemplar": true, - "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_raft_log_gc_purge_duration_secs_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, instance))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "95% - {{instance}}", - "refId": "B", - "step": 10 - }, - { - "exemplar": true, - "expr": 
"sum(rate(tikv_raftstore_raft_log_gc_purge_duration_secs_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tikv_raftstore_raft_log_gc_purge_duration_secs_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (instance)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "avg - {{instance}}", - "refId": "C", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Raft log GC purge duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 10, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -17079,8 +16711,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 40 + "x": 0, + "y": 32 }, "hiddenSeries": false, "id": 12887, @@ -17189,8 +16821,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 48 + "x": 12, + "y": 32 }, "hiddenSeries": false, "id": 12975, @@ -17299,8 +16931,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 48 + "x": 0, + "y": 40 }, "hiddenSeries": false, "id": 12974, @@ -17409,8 +17041,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 0, - "y": 56 + "x": 12, + "y": 40 }, "hiddenSeries": false, "id": 23763572229, @@ -17516,8 +17148,8 @@ "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 56 + "x": 0, + "y": 48 }, "hiddenSeries": false, "id": 23763572555, diff --git a/metrics/grafana/tikv_summary.json b/metrics/grafana/tikv_summary.json index 847ac5ef289..528fe04b7fb 100644 --- a/metrics/grafana/tikv_summary.json 
+++ b/metrics/grafana/tikv_summary.json @@ -3185,7 +3185,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", From 199e63f3eb036e9d20ec8003276d339c5eaa25b8 Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Mon, 6 Mar 2023 22:45:12 +0800 Subject: [PATCH 564/676] coprocessor: avoid fmsketch calculation for single-column index (#14345) ref tikv/tikv#14231 Signed-off-by: xuyifan <675434007@qq.com> Co-authored-by: Ti Chi Robot --- src/coprocessor/statistics/analyze.rs | 54 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 25fd67b9a99..6b486c3bb7e 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -450,6 +450,24 @@ impl RowSampleBuilder { .inc_by(quota_delay.as_micros() as u64); } } + for i in 0..self.column_groups.len() { + let offsets = self.column_groups[i].get_column_offsets(); + if offsets.len() != 1 { + continue; + } + // For the single-column group, its fm_sketch is the same as that of the + // corresponding column. Hence, we don't maintain its fm_sketch in + // collect_column_group. We just copy the corresponding column's fm_sketch after + // iterating all rows. Also, we can directly copy total_size and null_count. 
+ let col_pos = offsets[0] as usize; + let col_group_pos = self.columns_info.len() + i; + collector.mut_base().fm_sketches[col_group_pos] = + collector.mut_base().fm_sketches[col_pos].clone(); + collector.mut_base().null_count[col_group_pos] = + collector.mut_base().null_count[col_pos]; + collector.mut_base().total_sizes[col_group_pos] = + collector.mut_base().total_sizes[col_pos]; + } Ok(AnalyzeSamplingResult::new(collector)) } } @@ -527,37 +545,29 @@ impl BaseRowSampleCollector { let col_len = columns_val.len(); for i in 0..column_groups.len() { let offsets = column_groups[i].get_column_offsets(); - let mut has_null = true; + if offsets.len() == 1 { + // For the single-column group, its fm_sketch is the same as that of the + // corresponding column. Hence, we don't need to maintain its + // fm_sketch. We just copy the corresponding column's fm_sketch after iterating + // all rows. Also, we can directly copy total_size and null_count. + continue; + } + // We don't maintain the null count information for the multi-column group. for j in offsets { if columns_val[*j as usize][0] == NIL_FLAG { continue; } - has_null = false; self.total_sizes[col_len + i] += columns_val[*j as usize].len() as i64 - 1 } - // We only maintain the null count for single column case. 
- if has_null && offsets.len() == 1 { - self.null_count[col_len + i] += 1; - continue; - } - if offsets.len() == 1 { - let offset = offsets[0] as usize; - if columns_info[offset].as_accessor().is_string_like() { - self.fm_sketches[col_len + i].insert(&collation_keys_val[offset]); + let mut hasher = Hasher128::with_seed(0); + for j in offsets { + if columns_info[*j as usize].as_accessor().is_string_like() { + hasher.write(&collation_keys_val[*j as usize]); } else { - self.fm_sketches[col_len + i].insert(&columns_val[offset]); - } - } else { - let mut hasher = Hasher128::with_seed(0); - for j in offsets { - if columns_info[*j as usize].as_accessor().is_string_like() { - hasher.write(&collation_keys_val[*j as usize]); - } else { - hasher.write(&columns_val[*j as usize]); - } + hasher.write(&columns_val[*j as usize]); } - self.fm_sketches[col_len + i].insert_hash_value(hasher.finish()); } + self.fm_sketches[col_len + i].insert_hash_value(hasher.finish()); } } From 38568e46b27878222961f64e600d4be317bf288f Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 7 Mar 2023 12:15:12 +0800 Subject: [PATCH 565/676] metrics: add some metrics for snapshot-v2 (#14195) ref tikv/tikv#12842 1. add snapshot size metrics 2. 
add snapshot send/recv count metrics Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore-v2/src/operation/pd.rs | 11 +++++++++-- components/raftstore/src/store/async_io/read.rs | 15 ++++++++++++++- src/server/engine_factory.rs | 2 +- src/server/tablet_snap.rs | 3 +++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 4fd21a32488..3b5e7d32f89 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -7,7 +7,7 @@ use std::sync::atomic::Ordering; use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; use kvproto::{metapb, pdpb}; -use raftstore::store::Transport; +use raftstore::store::{metrics::STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC, Transport}; use slog::error; use tikv_util::slog_panic; @@ -43,10 +43,17 @@ impl Store { let meta = ctx.store_meta.lock().unwrap(); stats.set_region_count(meta.readers.len() as u32); } - + // todo: imple snapshot status report stats.set_sending_snap_count(0); stats.set_receiving_snap_count(0); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["sending"]) + .set(stats.get_sending_snap_count() as i64); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["receiving"]) + .set(stats.get_receiving_snap_count() as i64); + stats.set_start_time(self.start_time().unwrap() as u32); stats.set_bytes_written( diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 5b53ad499b5..b02992bbeb0 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -18,6 +18,7 @@ use raft::{eraftpb::Snapshot, GetEntriesContext}; use tikv_util::{error, info, time::Instant, worker::Runnable}; use crate::store::{ + metrics::{SNAPSHOT_KV_COUNT_HISTOGRAM, SNAPSHOT_SIZE_HISTOGRAM}, snap::TABLET_SNAPSHOT_VERSION, util, worker::metrics::{SNAP_COUNTER, 
SNAP_HISTOGRAM}, @@ -225,6 +226,8 @@ where // create checkpointer. let snap_key = TabletSnapKey::from_region_snap(region_id, to_peer, &snapshot); let mut res = None; + let total_size = tablet.get_engine_used_size().unwrap_or(0); + let total_keys = tablet.get_num_keys().unwrap_or(0); if let Err(e) = self.generate_snap(&snap_key, tablet) { error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); SNAP_COUNTER.generate.fail.inc(); @@ -232,7 +235,17 @@ where let elapsed = start.saturating_elapsed_secs(); SNAP_COUNTER.generate.success.inc(); SNAP_HISTOGRAM.generate.observe(elapsed); - info!("snapshot generated"; "region_id" => region_id, "elapsed" => elapsed, "key" => ?snap_key, "for_balance" => for_balance); + SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); + SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_keys as f64); + info!( + "snapshot generated"; + "region_id" => region_id, + "elapsed" => elapsed, + "key" => ?snap_key, + "for_balance" => for_balance, + "total_size" => total_size, + "total_keys" => total_keys, + ); res = Some(Box::new((snapshot, to_peer))) } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index f50afe4bc44..e9b59141da2 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -216,7 +216,7 @@ impl TabletFactory for KvEngineFactory { } fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { - info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => ?ctx.suffix); + info!("destroy tablet"; "path" => %path.display(), "region_id" => ctx.id, "suffix" => ?ctx.suffix); // Create kv engine. 
let _db_opts = self.db_opts(EngineType::RaftKv2); let _cf_opts = self.cf_opts(EngineType::RaftKv2); diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index a54c5461e0d..c0ecf4db611 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -162,6 +162,9 @@ async fn send_snap_files( } } info!("sent all snap file finish"; "snap_key" => %key); + SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC + .send + .inc_by(total_sent); sender.close().await?; Ok(total_sent) } From bb6f5e84762c63d2092bc4741b13db179423aced Mon Sep 17 00:00:00 2001 From: ShuNing Date: Tue, 7 Mar 2023 14:21:12 +0800 Subject: [PATCH 566/676] resource_control: enable by default (#14354) close tikv/tikv#14353 resource_control: enable resource control by default Signed-off-by: nolouch --- components/resource_control/src/lib.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 1c4c93c82d2..99645688cf7 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -17,10 +17,16 @@ pub use service::ResourceManagerService; pub mod channel; pub use channel::ResourceMetered; -#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { #[online_config(skip)] pub enabled: bool, } + +impl Default for Config { + fn default() -> Self { + Self { enabled: true } + } +} From 3b56cfb2a8620894911e898ecc844ea361cc9ddc Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 7 Mar 2023 15:09:12 +0800 Subject: [PATCH 567/676] Metrics: add snapshot transport to grafana (#14337) ref tikv/tikv#13409 Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 104 ++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) 
diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index ead1e842d0c..d4374fda369 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -23641,6 +23641,110 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The speed of sending or receiving snapshot", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 4201, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(tikv_snapshot_limit_transport_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}--{{type}}", + "metric": "tikv_snapshot_limit_transport_bytes", + "refId": "A", + "step": 40 + },{ + "exemplar": true, + "expr": "rate(tikv_snapshot_limit_generate_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", + "hide": true, + "interval": "", + "legendFormat": "{{instance}}--generate", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Snapshot transport speed", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, From 02490222cd4bb770668f010bcf427d0604e71f96 Mon Sep 17 00:00:00 2001 From: Zhi Qi <30543181+LittleFall@users.noreply.github.com> Date: Tue, 7 Mar 2023 18:13:12 +0800 Subject: [PATCH 568/676] copr: (enhance) support executor limit with partition_by fields (#14359) ref tikv/tikv#13936 Signed-off-by: Zhi Qi --- Cargo.lock | 2 +- components/tidb_query_executors/src/runner.rs | 41 +++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea1ebcfbb3b..d939343c06a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6701,7 +6701,7 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#614f3ffd42ddc84b78ff59d65f105f2099a6f1b1" +source = "git+https://github.com/pingcap/tipb.git#955fbdc879517f16b7a2f5967f143b92a6ab03dd" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 60857dda80d..60359f22c55 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -329,14 +329,39 @@ pub fn build_executors( ExecType::TypeLimit => { EXECUTOR_COUNT_METRICS.batch_limit.inc(); - Box::new( - BatchLimitExecutor::new( - executor, - ed.get_limit().get_limit() as usize, - is_src_scan_executor, - )? - .collect_summary(summary_slot_index), - ) + let mut d = ed.take_limit(); + + // If there is partition_by field in Limit, we treat it as a + // partitionTopN without order_by. + // todo: refine those logics. 
+ let partition_by = d + .take_partition_by() + .into_iter() + .map(|mut item| item.take_expr()) + .collect_vec(); + + if partition_by.is_empty() { + Box::new( + BatchLimitExecutor::new( + executor, + d.get_limit() as usize, + is_src_scan_executor, + )? + .collect_summary(summary_slot_index), + ) + } else { + Box::new( + BatchPartitionTopNExecutor::new( + config.clone(), + executor, + partition_by, + vec![], + vec![], + d.get_limit() as usize, + )? + .collect_summary(summary_slot_index), + ) + } } ExecType::TypeTopN => { EXECUTOR_COUNT_METRICS.batch_top_n.inc(); From 32925ca564ddb801950942315818929265f58222 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Tue, 7 Mar 2023 21:43:13 +0800 Subject: [PATCH 569/676] txn: Well-defined behavior of allow_lock_with_conflict with should_not_exist or lock_if_exists (#14330) close tikv/tikv#14293 Signed-off-by: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot Co-authored-by: cfzjywxk Co-authored-by: ekexium --- .../txn/actions/acquire_pessimistic_lock.rs | 568 +++++++++++++++++- src/storage/types.rs | 8 +- 2 files changed, 555 insertions(+), 21 deletions(-) diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 86b9ddeab41..987af9fbed7 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -7,7 +7,7 @@ use txn_types::{Key, LockType, OldValue, PessimisticLock, TimeStamp, Value, Writ use crate::storage::{ mvcc::{ metrics::{MVCC_CONFLICT_COUNTER, MVCC_DUPLICATE_CMD_COUNTER_VEC}, - ErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, + Error as MvccError, ErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, }, txn::{ actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, @@ -117,13 +117,14 @@ pub fn acquire_pessimistic_lock( .into()); } + let 
requested_for_update_ts = for_update_ts; let locked_with_conflict_ts = if allow_lock_with_conflict && for_update_ts < lock.for_update_ts { // If the key is already locked by the same transaction with larger // for_update_ts, and the current request has // `allow_lock_with_conflict` set, we must consider // these possibilities: - // * If a previous request successfully locked the key with conflict, but the + // * A previous request successfully locked the key with conflict, but the // response is lost due to some errors such as RPC failures. In this case, we // return like the current request's result is locked_with_conflict, for // idempotency concern. @@ -147,11 +148,33 @@ pub fn acquire_pessimistic_lock( if let Some((write, commit_ts)) = write { // Here `get_write_with_commit_ts` returns only the latest PUT if it exists and // is not deleted. It's still ok to pass it into `check_data_constraint`. - // In case we are going to lock it with write conflict, we do not check it since - // the statement will then retry. - if locked_with_conflict_ts.is_none() { - check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; - } + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key).or_else( + |e| { + if is_already_exist(&e) && commit_ts > requested_for_update_ts { + // If `allow_lock_with_conflict` is set and there is write conflict, + // and the constraint check doesn't pass on the latest version, + // return a WriteConflict error instead of AlreadyExist, to inform the + // client to retry. + // Note the conflict_info may be not consistent with the + // `locked_with_conflict_ts` we got before. + // This is possible if the key is locked by a newer request with + // larger for_update_ts, in which case the result of this request + // doesn't matter at all. So we don't need + // to care about it. 
+ let conflict_info = ConflictInfo { + conflict_start_ts: write.start_ts, + conflict_commit_ts: commit_ts, + }; + return Err(conflict_info.into_write_conflict_error( + reader.start_ts, + primary.to_vec(), + key.to_raw()?, + )); + } + Err(e) + }, + )?; + if need_load_value { val = Some(reader.load_data(&key, write)?); } else if need_check_existence { @@ -159,7 +182,7 @@ pub fn acquire_pessimistic_lock( } } } - // Pervious write is not loaded. + // Previous write is not loaded. let (prev_write_loaded, prev_write) = (false, None); let old_value = load_old_value( need_old_value, @@ -200,7 +223,7 @@ pub fn acquire_pessimistic_lock( )); } - let mut locked_with_conflict_ts = None; + let mut conflict_info = None; // Following seek_write read the previous write. let (prev_write_loaded, mut prev_write) = (true, None); @@ -221,7 +244,10 @@ pub fn acquire_pessimistic_lock( .inc(); if allow_lock_with_conflict { // TODO: New metrics. - locked_with_conflict_ts = Some(commit_ts); + conflict_info = Some(ConflictInfo { + conflict_start_ts: write.start_ts, + conflict_commit_ts: commit_ts, + }); for_update_ts = commit_ts; need_load_value = true; } else { @@ -269,19 +295,30 @@ pub fn acquire_pessimistic_lock( } } - // Check data constraint when acquiring pessimistic lock. But in case we are - // going to lock it with write conflict, we do not check it since the - // statement will then retry. - if locked_with_conflict_ts.is_none() { - check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; - } + // Check data constraint when acquiring pessimistic lock. + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key).or_else(|e| { + if is_already_exist(&e) { + // If `allow_lock_with_conflict` is set and there is write conflict, + // and the constraint check doesn't pass on the latest version, + // return a WriteConflict error instead of AlreadyExist, to inform the + // client to retry. 
+ if let Some(conflict_info) = conflict_info { + return Err(conflict_info.into_write_conflict_error( + reader.start_ts, + primary.to_vec(), + key.to_raw()?, + )); + } + } + Err(e) + })?; (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); // Load value if locked_with_conflict, so that when the client (TiDB) need to // read the value during statement retry, it will be possible to read the value // from cache instead of RPC. - if need_value || need_check_existence || locked_with_conflict_ts.is_some() { + if need_value || need_check_existence || conflict_info.is_some() { val = match write.write_type { // If it's a valid Write, no need to read again. WriteType::Put @@ -338,6 +375,12 @@ pub fn acquire_pessimistic_lock( // do it when val exists if !lock_only_if_exists || val.is_some() { txn.put_pessimistic_lock(key, lock, true); + } else if let Some(conflict_info) = conflict_info { + return Err(conflict_info.into_write_conflict_error( + reader.start_ts, + primary.to_vec(), + key.into_raw()?, + )); } // TODO don't we need to commit the modifies in txn? 
@@ -345,13 +388,46 @@ pub fn acquire_pessimistic_lock( PessimisticLockKeyResult::new_success( need_value, need_check_existence, - locked_with_conflict_ts, + conflict_info.map(ConflictInfo::into_locked_with_conflict_ts), val, ), old_value, )) } +#[derive(Clone, Copy)] +struct ConflictInfo { + conflict_start_ts: TimeStamp, + conflict_commit_ts: TimeStamp, +} + +impl ConflictInfo { + fn into_locked_with_conflict_ts(self) -> TimeStamp { + self.conflict_commit_ts + } + + fn into_write_conflict_error( + self, + start_ts: TimeStamp, + primary: Vec, + key: Vec, + ) -> MvccError { + ErrorInner::WriteConflict { + start_ts, + conflict_start_ts: self.conflict_start_ts, + conflict_commit_ts: self.conflict_commit_ts, + key, + primary, + reason: WriteConflictReason::PessimisticRetry, + } + .into() + } +} + +fn is_already_exist(res: &MvccError) -> bool { + matches!(res, MvccError(box ErrorInner::AlreadyExist { .. })) +} + pub mod tests { use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; @@ -385,6 +461,8 @@ pub mod tests { for_update_ts: impl Into, need_value: bool, need_check_existence: bool, + should_not_exist: bool, + lock_only_if_exists: bool, ) -> MvccResult { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); @@ -397,14 +475,14 @@ pub mod tests { &mut reader, Key::from_raw(key), pk, - false, + should_not_exist, 1, for_update_ts.into(), need_value, need_check_existence, 0.into(), false, - false, + lock_only_if_exists, true, ); if res.is_ok() { @@ -436,6 +514,8 @@ pub mod tests { for_update_ts, need_value, need_check_existence, + false, + false, ) .unwrap() } @@ -1826,6 +1906,8 @@ pub mod tests { 55, false, false, + false, + false, ) .unwrap_err(); assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); @@ -1837,6 +1919,8 @@ pub mod tests { 9, false, false, + false, + false, ) .unwrap_err(); assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); @@ -1974,4 +2058,448 @@ pub mod tests { 
must_commit(&mut engine, key, 60, 69); } } + + #[test] + fn test_lock_with_conflict_should_not_exist() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 20); + must_commit(&mut engine, b"k1", 20, 30); + + // Key already exists. + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::WriteConflict { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_unlocked(&mut engine, b"k1"); + + // Key already exists and already locked by the same txn. + must_succeed(&mut engine, b"k1", b"k1", 10, 30); + must_pessimistic_locked(&mut engine, b"k1", 10, 30); + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::WriteConflict { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, b"k1", 10, 30); + + // Key already exists and already locked by a larger for_update_ts (stale + // request). + must_succeed(&mut engine, b"k1", b"k1", 10, 40); + must_pessimistic_locked(&mut engine, b"k1", 10, 40); + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::WriteConflict { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, b"k1", 10, 40); + + // Key not exist. 
+ must_pessimistic_prewrite_delete(&mut engine, b"k1", b"k1", 10, 40, DoPessimisticCheck); + must_commit(&mut engine, b"k1", 10, 60); + must_unlocked(&mut engine, b"k1"); + + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 50, + 50, + false, + false, + true, + false, + ) + .unwrap() + .assert_locked_with_conflict(None, 60); + must_pessimistic_locked(&mut engine, b"k1", 50, 60); + // Key not exist and key is already locked (idempotency). + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 50, + 50, + false, + false, + true, + false, + ) + .unwrap() + .assert_locked_with_conflict(None, 60); + must_pessimistic_locked(&mut engine, b"k1", 50, 60); + + // Key not exist and key is locked with a larger for_update_ts (stale request). + must_succeed(&mut engine, b"k1", b"k1", 50, 70); + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 50, + 50, + false, + false, + true, + false, + ) + .unwrap() + .assert_locked_with_conflict(None, 70); + must_pessimistic_locked(&mut engine, b"k1", 50, 70); + + // The following test cases tests if `allow_lock_with_conflict` causes any + // problem when there's no write conflict. + + // Key not exist and no conflict. + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap() + .assert_empty(); + must_pessimistic_locked(&mut engine, b"k2", 10, 10); + + // Idempotency + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap() + .assert_empty(); + must_pessimistic_locked(&mut engine, b"k2", 10, 10); + + // Locked by a larger for_update_ts (stale request). + // Note that in this case, the client must have been requested a lock with + // larger for_update_ts, and the current request must be stale. + // Therefore it doesn't matter what result this request returns. 
It only + // need to guarantee the data won't be broken. + must_succeed(&mut engine, b"k2", b"k2", 10, 20); + must_pessimistic_locked(&mut engine, b"k2", 10, 20); + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 10, + 10, + false, + false, + true, + false, + ) + .unwrap() + .assert_locked_with_conflict(None, 20); + must_pessimistic_locked(&mut engine, b"k2", 10, 20); + + // Locked by a smaller for_update_ts. + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 10, + 25, + false, + false, + true, + false, + ) + .unwrap() + .assert_empty(); + must_pessimistic_locked(&mut engine, b"k2", 10, 25); + + // Key exists and no conflict. + must_pessimistic_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 10, 20, DoPessimisticCheck); + must_commit(&mut engine, b"k2", 10, 30); + must_unlocked(&mut engine, b"k2"); + + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::AlreadyExist { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_unlocked(&mut engine, b"k2"); + + // Key exists, no conflict, and key is already locked. + must_succeed(&mut engine, b"k2", b"k2", 40, 40); + must_pessimistic_locked(&mut engine, b"k2", 40, 40); + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::AlreadyExist { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, b"k2", 40, 40); + + // Key exists, no conflict, and key is locked with a larger for_update_ts (stale + // request). + // Note that in this case, the client must have been requested a lock with + // larger for_update_ts, and the current request must be stale. 
+ // Therefore it doesn't matter what result this request returns. It only + // need to guarantee the data won't be broken. + must_succeed(&mut engine, b"k2", b"k2", 40, 50); + must_pessimistic_locked(&mut engine, b"k2", 40, 50); + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::AlreadyExist { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, b"k2", 40, 50); + + // Key exists, no conflict, and key is locked with a smaller for_update_ts. + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 60, + false, + false, + true, + false, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::AlreadyExist { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, b"k2", 40, 50); + } + + #[test] + fn test_lock_with_conflict_lock_only_if_exists() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 20); + must_commit(&mut engine, b"k1", 20, 30); + + // Key exists. + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + true, + false, + false, + true, + ) + .unwrap() + .assert_locked_with_conflict(Some(b"v1"), 30); + must_pessimistic_locked(&mut engine, b"k1", 10, 30); + + // Key exists and already locked (idempotency). 
+ acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + true, + false, + false, + true, + ) + .unwrap() + .assert_locked_with_conflict(Some(b"v1"), 30); + must_pessimistic_locked(&mut engine, b"k1", 10, 30); + + // Key exists and is locked with a larger for_update_ts (stale request) + must_succeed(&mut engine, b"k1", b"k1", 10, 40); + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 10, + true, + false, + false, + true, + ) + .unwrap() + .assert_locked_with_conflict(Some(b"v1"), 40); + must_pessimistic_locked(&mut engine, b"k1", 10, 40); + + // Key not exist. + must_pessimistic_prewrite_delete(&mut engine, b"k1", b"k1", 10, 40, DoPessimisticCheck); + must_commit(&mut engine, b"k1", 10, 60); + must_unlocked(&mut engine, b"k1"); + + let e = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 50, + 50, + true, + false, + false, + true, + ) + .unwrap_err(); + match e { + MvccError(box ErrorInner::WriteConflict { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_unlocked(&mut engine, b"k1"); + + // lock_only_if_exists didn't handle the case that the key doesn't exist but + // already locked. So do not test it in this case. + + // The following test cases tests if `allow_lock_with_conflict` causes any + // problem when there's no write conflict. + + // Key not exist and no conflict. + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 10, + 10, + true, + false, + false, + true, + ) + .unwrap() + .assert_value(None); + must_unlocked(&mut engine, b"k2"); + + // Key exists and no conflict. 
+ must_prewrite_put(&mut engine, b"k2", b"v2", b"k2", 10); + must_commit(&mut engine, b"k2", 10, 30); + + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + true, + false, + false, + true, + ) + .unwrap() + .assert_value(Some(b"v2")); + must_pessimistic_locked(&mut engine, b"k2", 40, 40); + + // Key exists, no conflict and already locked (idempotency). + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + true, + false, + false, + true, + ) + .unwrap() + .assert_value(Some(b"v2")); + must_pessimistic_locked(&mut engine, b"k2", 40, 40); + + // Key exists, no conflict and locked with a larger for_update_ts (stale + // request). + // Note that in this case, the client must have been requested a lock with + // larger for_update_ts, and the current request must be stale. + // Therefore it doesn't matter what result this request returns. It only + // need to guarantee the data won't be broken. + must_succeed(&mut engine, b"k2", b"k2", 40, 50); + must_pessimistic_locked(&mut engine, b"k2", 40, 50); + acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k2", + b"k2", + 40, + 40, + true, + false, + false, + true, + ) + .unwrap() + .assert_locked_with_conflict(Some(b"v2"), 50); + must_pessimistic_locked(&mut engine, b"k2", 40, 50); + } } diff --git a/src/storage/types.rs b/src/storage/types.rs index b4e91811843..7774dcda9ec 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -223,7 +223,13 @@ impl PessimisticLockKeyResult { } pub fn assert_empty(&self) { - assert!(matches!(self, Self::Empty)); + match self { + Self::Empty => (), + x => panic!( + "pessimistic lock key result not match, expected Empty, got {:?}", + x + ), + } } #[cfg(test)] From 404c777e710e6fece2e917fdb99ce970d5a85dd1 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 8 Mar 2023 10:53:12 +0800 Subject: [PATCH 570/676] raftstore-v2: consider 
None when getting mailbox (#14348) close tikv/tikv#14347 consider None when getting mailbox Signed-off-by: SpadeA-Tang --- .../src/operation/command/admin/split.rs | 44 ++++++++++++++----- tests/failpoints/cases/test_split_region.rs | 34 +++++++++++++- 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index b31fc7e7471..e6cd7511801 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -603,7 +603,23 @@ impl Peer { self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); - let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + + fail_point!("before_cluster_shutdown1"); + let mailbox = { + match store_ctx.router.mailbox(self.region_id()) { + Some(mailbox) => mailbox, + None => { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!( + store_ctx.router.is_shutdown(), + "{} router should have been closed", + SlogFormat(&self.logger) + ); + return; + } + } + }; let tablet_index = res.tablet_index; let _ = store_ctx .schedulers @@ -641,16 +657,20 @@ impl Peer { match store_ctx.router.force_send(new_region_id, split_init) { Ok(_) => {} Err(SendError(PeerMsg::SplitInit(msg))) => { - store_ctx + fail_point!("before_cluster_shutdown2", |_| {}); + if let Err(e) = store_ctx .router .force_send_control(StoreMsg::SplitInit(msg)) - .unwrap_or_else(|e| { - slog_panic!( - self.logger, - "fails to send split peer intialization msg to store"; - "error" => ?e, - ) - }); + { + if store_ctx.router.is_shutdown() { + return; + } + slog_panic!( + self.logger, + "fails to send split peer intialization msg to store"; + "error" => ?e, + ); + } } _ => unreachable!(), } @@ -731,7 +751,11 @@ impl Peer { } else { // None means the node is shutdown 
concurrently and thus the // mailboxes in router have been cleared - assert!(store_ctx.router.is_shutdown()); + assert!( + store_ctx.router.is_shutdown(), + "{} router should have been closed", + SlogFormat(&self.logger) + ); return; } } diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index f3a052c8027..94dfd1b5648 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -23,7 +23,9 @@ use kvproto::{ use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::{ - store::{config::Config as RaftstoreConfig, util::is_vote_msg, Callback, PeerMsg}, + store::{ + config::Config as RaftstoreConfig, util::is_vote_msg, Callback, PeerMsg, WriteResponse, + }, Result, }; use test_raftstore::*; @@ -1104,3 +1106,33 @@ fn test_split_store_channel_full() { assert_ne!(region.id, 1); fail::remove(sender_fp); } + +#[test] +fn test_split_during_cluster_shutdown() { + // test case for raftstore-v2 + use test_raftstore_v2::*; + + let test_split = |split_fp| { + let count = 1; + let mut cluster = new_server_cluster(0, count); + cluster.run(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k2", b"v2"); + cluster.must_put(b"k3", b"v3"); + fail::cfg_callback(split_fp, move || { + // After one second, mailboxes will be cleared in shutdown + thread::sleep(Duration::from_secs(1)); + }) + .unwrap(); + + let pd_client = cluster.pd_client.clone(); + let region = pd_client.get_region(b"k2").unwrap(); + let c = Box::new(move |_write_resp: WriteResponse| {}); + cluster.split_region(®ion, b"k2", Callback::write(c)); + + cluster.shutdown(); + }; + + test_split("before_cluster_shutdown1"); + test_split("before_cluster_shutdown2"); +} From e186703363e101934868f71d74a7978f5ce44b1d Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 8 Mar 2023 13:11:11 +0800 Subject: [PATCH 571/676] raftstore-v2: use compaction filter to trim tablet (#14350) ref tikv/tikv#12842 None Signed-off-by: 
tabokie --- Cargo.lock | 6 +- Cargo.toml | 1 + components/engine_rocks/src/raw.rs | 18 +- components/engine_rocks/src/util.rs | 180 +++++++++++++++++- components/engine_test/src/lib.rs | 12 +- components/engine_traits/src/tablet.rs | 2 +- .../raftstore-v2/src/worker/tablet_gc.rs | 9 +- components/tikv_util/src/sys/cgroup.rs | 1 - src/config/mod.rs | 145 ++++++++++---- src/server/engine_factory.rs | 64 ++++++- src/server/gc_worker/compaction_filter.rs | 27 +-- .../gc_worker/rawkv_compaction_filter.rs | 21 +- src/server/ttl/ttl_compaction_filter.rs | 19 +- src/storage/kv/test_engine_builder.rs | 16 +- src/storage/mod.rs | 7 +- tests/integrations/storage/test_titan.rs | 1 + 16 files changed, 423 insertions(+), 106 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d939343c06a..77d24e482d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2862,7 +2862,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" +source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2881,7 +2881,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" +source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" dependencies = [ "bzip2-sys", "cc", @@ -4799,7 +4799,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#cd8b60758b46afbbde6fde52fa86a2776b401723" +source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/Cargo.toml b/Cargo.toml index 509f9514b10..a559fa22474 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -185,6 +185,7 @@ api_version = { workspace = true, 
features = ["testexport"] } example_coprocessor_plugin = { workspace = true } # should be a binary dependency hyper-openssl = "0.9" panic_hook = { workspace = true } +raftstore = { workspace = true, features = ["testexport"] } reqwest = { version = "0.11", features = ["blocking"] } test_sst_importer = { workspace = true } test_util = { workspace = true } diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index e940fdd2cd7..474137534f8 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -7,13 +7,13 @@ //! crate, but only until the engine interface is completely abstracted. pub use rocksdb::{ - new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, - ChecksumType, CompactOptions, CompactionFilter, CompactionFilterContext, - CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - CompactionJobInfo, CompactionOptions, CompactionPriority, ConcurrentTaskLimiter, - DBBottommostLevelCompaction, DBCompactionFilter, DBCompactionStyle, DBCompressionType, - DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, - Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, - PrepopulateBlockCache, Range, RateLimiter, SliceTransform, Statistics, - TablePropertiesCollector, TablePropertiesCollectorFactory, WriteBufferManager, + run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, ChecksumType, CompactOptions, + CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, + CompactionFilterValueType, CompactionJobInfo, CompactionOptions, CompactionPriority, + ConcurrentTaskLimiter, DBBottommostLevelCompaction, DBCompactionFilter, DBCompactionStyle, + DBCompressionType, DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, + DBTableFileCreationReason, DBTitanDBBlobRunMode, Env, EventListener, IngestExternalFileOptions, + 
LRUCacheOptions, MemoryAllocator, PerfContext, PrepopulateBlockCache, Range, RateLimiter, + SliceTransform, Statistics, TablePropertiesCollector, TablePropertiesCollectorFactory, + WriteBufferManager, }; diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 407cf8ee611..52b1364c3ce 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -1,11 +1,13 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fs, path::Path, str::FromStr, sync::Arc}; +use std::{ffi::CString, fs, path::Path, str::FromStr, sync::Arc}; use engine_traits::{Engines, Range, Result, CF_DEFAULT}; use rocksdb::{ - load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, Env, - Range as RocksRange, SliceTransform, DB, + load_latest_options, CColumnFamilyDescriptor, CFHandle, ColumnFamilyOptions, CompactionFilter, + CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, + CompactionFilterValueType, DBTableFileCreationReason, Env, Range as RocksRange, SliceTransform, + DB, }; use slog_global::warn; @@ -331,6 +333,178 @@ pub fn from_raw_perf_level(level: rocksdb::PerfLevel) -> engine_traits::PerfLeve } } +struct OwnedRange { + start_key: Box<[u8]>, + end_key: Box<[u8]>, +} + +type FilterByReason = [bool; 4]; + +fn reason_to_index(reason: DBTableFileCreationReason) -> usize { + match reason { + DBTableFileCreationReason::Flush => 0, + DBTableFileCreationReason::Compaction => 1, + DBTableFileCreationReason::Recovery => 2, + DBTableFileCreationReason::Misc => 3, + } +} + +fn filter_by_reason(factory: &impl CompactionFilterFactory) -> FilterByReason { + let mut r = FilterByReason::default(); + r[reason_to_index(DBTableFileCreationReason::Flush)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Flush); + r[reason_to_index(DBTableFileCreationReason::Compaction)] = + 
factory.should_filter_table_file_creation(DBTableFileCreationReason::Compaction); + r[reason_to_index(DBTableFileCreationReason::Recovery)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Recovery); + r[reason_to_index(DBTableFileCreationReason::Misc)] = + factory.should_filter_table_file_creation(DBTableFileCreationReason::Misc); + r +} + +pub struct StackingCompactionFilterFactory { + outer_should_filter: FilterByReason, + outer: A, + inner_should_filter: FilterByReason, + inner: B, +} + +impl StackingCompactionFilterFactory { + /// Creates a factory of stacked filter with `outer` on top of `inner`. + /// Table keys will be filtered through `outer` first before reaching + /// `inner`. + pub fn new(outer: A, inner: B) -> Self { + let outer_should_filter = filter_by_reason(&outer); + let inner_should_filter = filter_by_reason(&inner); + Self { + outer_should_filter, + outer, + inner_should_filter, + inner, + } + } +} + +impl CompactionFilterFactory + for StackingCompactionFilterFactory +{ + type Filter = StackingCompactionFilter; + + fn create_compaction_filter( + &self, + context: &CompactionFilterContext, + ) -> Option<(CString, Self::Filter)> { + let i = reason_to_index(context.reason()); + let mut outer_filter = None; + let mut inner_filter = None; + let mut full_name = String::new(); + if self.outer_should_filter[i] + && let Some((name, filter)) = self.outer.create_compaction_filter(context) + { + outer_filter = Some(filter); + full_name = name.into_string().unwrap(); + } + if self.inner_should_filter[i] + && let Some((name, filter)) = self.inner.create_compaction_filter(context) + { + inner_filter = Some(filter); + if !full_name.is_empty() { + full_name += "."; + } + full_name += name.to_str().unwrap(); + } + if outer_filter.is_none() && inner_filter.is_none() { + None + } else { + let filter = StackingCompactionFilter { + outer: outer_filter, + inner: inner_filter, + }; + Some((CString::new(full_name).unwrap(), filter)) + } + } + + 
fn should_filter_table_file_creation(&self, reason: DBTableFileCreationReason) -> bool { + let i = reason_to_index(reason); + self.outer_should_filter[i] || self.inner_should_filter[i] + } +} + +pub struct StackingCompactionFilter { + outer: Option, + inner: Option, +} + +impl CompactionFilter for StackingCompactionFilter { + fn featured_filter( + &mut self, + level: usize, + key: &[u8], + seqno: u64, + value: &[u8], + value_type: CompactionFilterValueType, + ) -> CompactionFilterDecision { + if let Some(outer) = self.outer.as_mut() + && let r = outer.featured_filter(level, key, seqno, value, value_type) + && !matches!(r, CompactionFilterDecision::Keep) + { + r + } else if let Some(inner) = self.inner.as_mut() { + inner.featured_filter(level, key, seqno, value, value_type) + } else { + CompactionFilterDecision::Keep + } + } +} + +#[derive(Clone)] +pub struct RangeCompactionFilterFactory(Arc); + +impl RangeCompactionFilterFactory { + pub fn new(start_key: Box<[u8]>, end_key: Box<[u8]>) -> Self { + let range = OwnedRange { start_key, end_key }; + Self(Arc::new(range)) + } +} + +impl CompactionFilterFactory for RangeCompactionFilterFactory { + type Filter = RangeCompactionFilter; + + fn create_compaction_filter( + &self, + _context: &CompactionFilterContext, + ) -> Option<(CString, Self::Filter)> { + Some(( + CString::new("range_filter").unwrap(), + RangeCompactionFilter(self.0.clone()), + )) + } + + fn should_filter_table_file_creation(&self, _reason: DBTableFileCreationReason) -> bool { + true + } +} + +/// Filters out all keys outside the key range. 
+pub struct RangeCompactionFilter(Arc); + +impl CompactionFilter for RangeCompactionFilter { + fn featured_filter( + &mut self, + _level: usize, + key: &[u8], + _seqno: u64, + _value: &[u8], + _value_type: CompactionFilterValueType, + ) -> CompactionFilterDecision { + if key < self.0.start_key.as_ref() || key >= self.0.end_key.as_ref() { + CompactionFilterDecision::Remove + } else { + CompactionFilterDecision::Keep + } + } +} + #[cfg(test)] mod tests { use engine_traits::{CfOptionsExt, Peekable, SyncMutable, CF_DEFAULT}; diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 1b0dbfbddb6..bc8b2f8baf2 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -365,7 +365,7 @@ pub mod ctor { use engine_rocks::{ get_env, properties::{MvccPropertiesCollectorFactory, RangePropertiesCollectorFactory}, - util::new_engine_opt as rocks_new_engine_opt, + util::{new_engine_opt as rocks_new_engine_opt, RangeCompactionFilterFactory}, RocksCfOptions, RocksDbOptions, RocksPersistenceListener, }; use engine_traits::{ @@ -425,9 +425,17 @@ pub mod ctor { ); rocks_db_opts.add_event_listener(RocksPersistenceListener::new(listener)); } + let factory = + RangeCompactionFilterFactory::new(ctx.start_key.clone(), ctx.end_key.clone()); let rocks_cfs_opts = cf_opts .iter() - .map(|(name, opt)| (*name, get_rocks_cf_opts(opt))) + .map(|(name, opt)| { + let mut opt = get_rocks_cf_opts(opt); + // We assume `get_rocks_cf_opts` didn't set a factory already. 
+ opt.set_compaction_filter_factory("range_filter_factory", factory.clone()) + .unwrap(); + (*name, opt) + }) .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) } diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index 79512a99f64..14f7d186f76 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -149,7 +149,7 @@ pub trait TabletFactory: Send + Sync { /// Check if the tablet with specified path exists fn exists(&self, path: &Path) -> bool; - #[cfg(any(test, feature = "testexport"))] + #[cfg(feature = "testexport")] fn set_state_storage(&self, _: Arc) { unimplemented!() } diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index d9bd03b326a..0be8fdaa901 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -162,13 +162,7 @@ impl Runner { let end_key = keys::data_end_key(&end); let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); - // TODO: Avoid `DeleteByRange` after compaction filter is ready. - if let Err(e) = tablet - .delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2]) - .and_then(|_| { - tablet.delete_ranges_cfs(DeleteStrategy::DeleteByRange, &[range1, range2]) - }) - { + if let Err(e) = tablet.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2]) { error!( self.logger, "failed to trim tablet"; @@ -184,6 +178,7 @@ impl Runner { let range1 = Range::new(&[], &start_key); let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); for r in [range1, range2] { + // When compaction filter is present, trivial move is disallowed. 
if let Err(e) = tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1) { diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index 2cd420e5d51..371d51e0b70 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -560,7 +560,6 @@ mod tests { ("-18446744073709551610", None), // Raise InvalidDigit instead of NegOverflow. ("0.1", None), ]; - println!("{:?}", "-18446744073709551610".parse::()); for (content, expect) in cases.into_iter() { let limit = parse_memory_max(content); assert_eq!(limit, expect); diff --git a/src/config/mod.rs b/src/config/mod.rs index 0f97487edcf..b51883826c8 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -33,7 +33,10 @@ use engine_rocks::{ DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, PrepopulateBlockCache, RateLimiter, WriteBufferManager, }, - util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, + util::{ + FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform, + RangeCompactionFilterFactory, StackingCompactionFilterFactory, + }, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, RocksCfOptions, RocksDbOptions, RocksEngine, RocksEventListener, RocksStatistics, RocksTitanDbOptions, RocksdbLogger, TtlPropertiesCollectorFactory, @@ -702,6 +705,7 @@ impl DefaultCfConfig { shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + filter_factory: Option<&RangeCompactionFilterFactory>, for_engine: EngineType, ) -> RocksCfOptions { let mut cf_opts = build_cf_opt!( @@ -721,29 +725,67 @@ impl DefaultCfConfig { RawMvccPropertiesCollectorFactory::default(), ); cf_opts.add_table_properties_collector_factory("tikv.range-properties-collector", f); - match api_version { - ApiVersion::V1 => { - // nothing to do - } - ApiVersion::V1ttl => { - cf_opts.add_table_properties_collector_factory( - 
"tikv.ttl-properties-collector", - TtlPropertiesCollectorFactory::::default(), - ); - cf_opts - .set_compaction_filter_factory( - "ttl_compaction_filter_factory", + if let Some(factory) = filter_factory { + match api_version { + ApiVersion::V1 => { + cf_opts + .set_compaction_filter_factory("range_filter_factory", factory.clone()) + .unwrap(); + } + ApiVersion::V1ttl => { + cf_opts.add_table_properties_collector_factory( + "tikv.ttl-properties-collector", + TtlPropertiesCollectorFactory::::default(), + ); + let factory = StackingCompactionFilterFactory::new( + factory.clone(), TtlCompactionFilterFactory::::default(), - ) - .unwrap(); - } - ApiVersion::V2 => { - cf_opts - .set_compaction_filter_factory( - "apiv2_gc_compaction_filter_factory", + ); + cf_opts + .set_compaction_filter_factory( + "range_filter_factory.ttl_compaction_filter_factory", + factory, + ) + .unwrap(); + } + ApiVersion::V2 => { + let factory = StackingCompactionFilterFactory::new( + factory.clone(), RawCompactionFilterFactory, - ) - .unwrap(); + ); + cf_opts + .set_compaction_filter_factory( + "range_filter_factory.apiv2_gc_compaction_filter_factory", + factory, + ) + .unwrap(); + } + } + } else { + match api_version { + ApiVersion::V1 => { + // nothing to do + } + ApiVersion::V1ttl => { + cf_opts.add_table_properties_collector_factory( + "tikv.ttl-properties-collector", + TtlPropertiesCollectorFactory::::default(), + ); + cf_opts + .set_compaction_filter_factory( + "ttl_compaction_filter_factory", + TtlCompactionFilterFactory::::default(), + ) + .unwrap(); + } + ApiVersion::V2 => { + cf_opts + .set_compaction_filter_factory( + "apiv2_gc_compaction_filter_factory", + RawCompactionFilterFactory, + ) + .unwrap(); + } } } cf_opts.set_titan_cf_options(&self.titan.build_opts()); @@ -827,6 +869,7 @@ impl WriteCfConfig { &self, shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, + filter_factory: Option<&RangeCompactionFilterFactory>, for_engine: EngineType, ) -> RocksCfOptions { 
let mut cf_opts = build_cf_opt!( @@ -855,12 +898,23 @@ impl WriteCfConfig { prop_keys_index_distance: self.prop_keys_index_distance, }; cf_opts.add_table_properties_collector_factory("tikv.range-properties-collector", f); - cf_opts - .set_compaction_filter_factory( - "write_compaction_filter_factory", - WriteCompactionFilterFactory, - ) - .unwrap(); + if let Some(factory) = filter_factory { + let factory = + StackingCompactionFilterFactory::new(factory.clone(), WriteCompactionFilterFactory); + cf_opts + .set_compaction_filter_factory( + "range_filter_factory.write_compaction_filter_factory", + factory, + ) + .unwrap(); + } else { + cf_opts + .set_compaction_filter_factory( + "write_compaction_filter_factory", + WriteCompactionFilterFactory, + ) + .unwrap(); + } cf_opts.set_titan_cf_options(&self.titan.build_opts()); cf_opts } @@ -930,7 +984,12 @@ impl Default for LockCfConfig { } impl LockCfConfig { - pub fn build_opt(&self, shared: &CfResources, for_engine: EngineType) -> RocksCfOptions { + pub fn build_opt( + &self, + shared: &CfResources, + filter_factory: Option<&RangeCompactionFilterFactory>, + for_engine: EngineType, + ) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; let mut cf_opts = build_cf_opt!( self, @@ -948,6 +1007,11 @@ impl LockCfConfig { }; cf_opts.add_table_properties_collector_factory("tikv.range-properties-collector", f); cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); + if let Some(factory) = filter_factory { + cf_opts + .set_compaction_filter_factory("range_filter_factory", factory.clone()) + .unwrap(); + } cf_opts.set_titan_cf_options(&self.titan.build_opts()); cf_opts } @@ -1386,19 +1450,28 @@ impl DbConfig { shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + filter_factory: Option<&RangeCompactionFilterFactory>, for_engine: EngineType, ) -> Vec<(&'static str, RocksCfOptions)> { let mut cf_opts = Vec::with_capacity(4); 
cf_opts.push(( CF_DEFAULT, - self.defaultcf - .build_opt(shared, region_info_accessor, api_version, for_engine), + self.defaultcf.build_opt( + shared, + region_info_accessor, + api_version, + filter_factory, + for_engine, + ), + )); + cf_opts.push(( + CF_LOCK, + self.lockcf.build_opt(shared, filter_factory, for_engine), )); - cf_opts.push((CF_LOCK, self.lockcf.build_opt(shared, for_engine))); cf_opts.push(( CF_WRITE, self.writecf - .build_opt(shared, region_info_accessor, for_engine), + .build_opt(shared, region_info_accessor, filter_factory, for_engine), )); if for_engine == EngineType::RaftKv { cf_opts.push((CF_RAFT, self.raftcf.build_opt(shared))); @@ -3159,7 +3232,10 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); if !self.raft_engine.enable { - panic!("partitioned-raft-kv only supports raft log engine."); + return Err("partitioned-raft-kv only supports raft log engine.".into()); + } + if self.rocksdb.titan.enabled { + return Err("partitioned-raft-kv doesn't support titan.".into()); } } @@ -4634,6 +4710,7 @@ mod tests { ), None, cfg.storage.api_version(), + None, cfg.storage.engine, ), None, diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index e9b59141da2..9d2c03998e6 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -4,6 +4,7 @@ use std::{path::Path, sync::Arc}; use engine_rocks::{ raw::{Cache, Env}, + util::RangeCompactionFilterFactory, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, TabletLogger, @@ -153,11 +154,16 @@ impl KvEngineFactory { db_opts } - fn cf_opts(&self, for_engine: EngineType) -> Vec<(&str, RocksCfOptions)> { + fn cf_opts( + &self, + filter_factory: Option<&RangeCompactionFilterFactory>, + for_engine: EngineType, + ) -> Vec<(&str, 
RocksCfOptions)> { self.inner.rocksdb_config.build_cf_opts( &self.inner.cf_resources, self.inner.region_info_accessor.as_ref(), self.inner.api_version, + filter_factory, for_engine, ) } @@ -172,7 +178,7 @@ impl KvEngineFactory { pub fn create_shared_db(&self, path: impl AsRef) -> Result { let path = path.as_ref(); let mut db_opts = self.db_opts(EngineType::RaftKv); - let cf_opts = self.cf_opts(EngineType::RaftKv); + let cf_opts = self.cf_opts(None, EngineType::RaftKv); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone()); } @@ -191,7 +197,8 @@ impl TabletFactory for KvEngineFactory { let mut db_opts = self.db_opts(EngineType::RaftKv2); let tablet_name = path.file_name().unwrap().to_str().unwrap().to_string(); db_opts.set_info_log(TabletLogger::new(tablet_name)); - let cf_opts = self.cf_opts(EngineType::RaftKv2); + let factory = RangeCompactionFilterFactory::new(ctx.start_key.clone(), ctx.end_key.clone()); + let cf_opts = self.cf_opts(Some(&factory), EngineType::RaftKv2); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone_with(ctx.id)); } @@ -219,7 +226,7 @@ impl TabletFactory for KvEngineFactory { info!("destroy tablet"; "path" => %path.display(), "region_id" => ctx.id, "suffix" => ?ctx.suffix); // Create kv engine. 
let _db_opts = self.db_opts(EngineType::RaftKv2); - let _cf_opts = self.cf_opts(EngineType::RaftKv2); + let _cf_opts = self.cf_opts(None, EngineType::RaftKv2); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( // path.to_str().unwrap(), @@ -237,7 +244,7 @@ impl TabletFactory for KvEngineFactory { RocksEngine::exists(path.to_str().unwrap()) } - #[cfg(any(test, feature = "testexport"))] + #[cfg(feature = "testexport")] fn set_state_storage(&self, state_storage: Arc) { let inner = Arc::as_ptr(&self.inner) as *mut FactoryInner; unsafe { @@ -250,13 +257,13 @@ impl TabletFactory for KvEngineFactory { mod tests { use std::path::Path; - use engine_traits::TabletRegistry; + use engine_traits::{Peekable, SyncMutable, TabletRegistry}; + use kvproto::metapb::Region; use super::*; use crate::config::TikvConfig; - #[test] - fn test_engine_factory() { + fn build_test(name: &'static str) -> (tempfile::TempDir, TabletRegistry) { let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); let common_test_cfg = manifest_dir.join("components/test_raftstore/src/common-test.toml"); let cfg = TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { @@ -270,11 +277,18 @@ mod tests { .storage .block_cache .build_shared_cache(cfg.storage.engine); - let dir = test_util::temp_dir("test-engine-factory", false); + let dir = test_util::temp_dir(name, false); let env = cfg.build_shared_rocks_env(None, None).unwrap(); let factory = KvEngineFactoryBuilder::new(env, &cfg, cache).build(); let reg = TabletRegistry::new(Box::new(factory), dir.path()).unwrap(); + (dir, reg) + } + + #[test] + fn test_engine_factory() { + let (_dir, reg) = build_test("test_engine_factory"); + let path = reg.tablet_path(1, 3); assert!(!reg.tablet_factory().exists(&path)); let mut tablet_ctx = TabletContext::with_infinite_region(1, Some(3)); @@ -294,4 +308,36 @@ mod tests { .unwrap(); assert!(!reg.tablet_factory().exists(&path)); } + + #[test] + fn 
test_engine_factory_compaction_filter() { + let (_dir, reg) = build_test("test_engine_factory_compaction_filter"); + + let region = Region { + id: 1, + start_key: b"k1".to_vec(), + end_key: b"k3".to_vec(), + ..Default::default() + }; + let tablet_ctx = TabletContext::new(®ion, Some(3)); + let path = reg.tablet_path(1, 3); + let engine = reg.tablet_factory().open_tablet(tablet_ctx, &path).unwrap(); + engine.put(&keys::data_key(b"k0"), b"v0").unwrap(); + engine.put(&keys::data_key(b"k1"), b"v1").unwrap(); + engine.put(&keys::data_key(b"k2"), b"v2").unwrap(); + engine.put(&keys::data_key(b"k3"), b"v3").unwrap(); + engine.put(&keys::data_key(b"k4"), b"v4").unwrap(); + engine.flush_cfs(&[], true).unwrap(); + assert!(engine.get_value(&keys::data_key(b"k0")).unwrap().is_none()); + assert_eq!( + engine.get_value(&keys::data_key(b"k1")).unwrap().unwrap(), + b"v1" + ); + assert_eq!( + engine.get_value(&keys::data_key(b"k2")).unwrap().unwrap(), + b"v2" + ); + assert!(engine.get_value(&keys::data_key(b"k3")).unwrap().is_none()); + assert!(engine.get_value(&keys::data_key(b"k4")).unwrap().is_none()); + } } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index 5d33346a844..e6a5b923628 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -14,9 +14,8 @@ use std::{ use engine_rocks::{ raw::{ - new_compaction_filter_raw, CompactionFilter, CompactionFilterContext, - CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - DBCompactionFilter, + CompactionFilter, CompactionFilterContext, CompactionFilterDecision, + CompactionFilterFactory, CompactionFilterValueType, }, RocksEngine, RocksMvccProperties, RocksWriteBatchVec, }; @@ -199,21 +198,23 @@ impl CompactionFilterInitializer for Option { pub struct WriteCompactionFilterFactory; impl CompactionFilterFactory for WriteCompactionFilterFactory { + type Filter = WriteCompactionFilter; + fn 
create_compaction_filter( &self, context: &CompactionFilterContext, - ) -> *mut DBCompactionFilter { + ) -> Option<(CString, Self::Filter)> { let gc_context_option = GC_CONTEXT.lock().unwrap(); let gc_context = match *gc_context_option { Some(ref ctx) => ctx, - None => return std::ptr::null_mut(), + None => return None, }; let safe_point = gc_context.safe_point.load(Ordering::Relaxed); if safe_point == 0 { // Safe point has not been initialized yet. debug!("skip gc in compaction filter because of no safe point"); - return std::ptr::null_mut(); + return None; } let (enable, skip_vcheck, ratio_threshold) = { @@ -241,12 +242,12 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { .map_or(false, RocksEngine::is_stalled_or_stopped) { debug!("skip gc in compaction filter because the DB is stalled"); - return std::ptr::null_mut(); + return None; } if !do_check_allowed(enable, skip_vcheck, &gc_context.feature_gate) { debug!("skip gc in compaction filter because it's not allowed"); - return std::ptr::null_mut(); + return None; } drop(gc_context_option); GC_COMPACTION_FILTER_PERFORM @@ -257,7 +258,7 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { GC_COMPACTION_FILTER_SKIP .with_label_values(&[STAT_TXN_KEYMODE]) .inc(); - return std::ptr::null_mut(); + return None; } debug!( @@ -275,7 +276,7 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { (store_id, region_info_provider), ); let name = CString::new("write_compaction_filter").unwrap(); - unsafe { new_compaction_filter_raw(name, filter) } + Some((name, filter)) } } @@ -326,7 +327,7 @@ impl DeleteBatch { } } -struct WriteCompactionFilter { +pub struct WriteCompactionFilter { safe_point: u64, engine: Option, is_bottommost_level: bool, @@ -1067,7 +1068,7 @@ pub mod tests { // Wait up to 1 second, and treat as no task if timeout. 
if let Ok(Some(task)) = gc_runner.gc_receiver.recv_timeout(Duration::new(1, 0)) { - assert!(expect_tasks, "a GC task is expected"); + assert!(expect_tasks, "unexpected GC task"); match task { GcTask::GcKeys { keys, .. } => { assert_eq!(keys.len(), 1); @@ -1079,7 +1080,7 @@ pub mod tests { } return; } - assert!(!expect_tasks, "no GC task is expected"); + assert!(!expect_tasks, "no GC task after 1 second"); }; // No key switch after the deletion mark. diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index 5e3913f4d40..b2af5b73118 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -9,9 +9,8 @@ use std::{ use api_version::{ApiV2, KeyMode, KvFormat}; use engine_rocks::{ raw::{ - new_compaction_filter_raw, CompactionFilter, CompactionFilterContext, - CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - DBCompactionFilter, + CompactionFilter, CompactionFilterContext, CompactionFilterDecision, + CompactionFilterFactory, CompactionFilterValueType, }, RocksEngine, }; @@ -36,15 +35,17 @@ use crate::{ pub struct RawCompactionFilterFactory; impl CompactionFilterFactory for RawCompactionFilterFactory { + type Filter = RawCompactionFilter; + fn create_compaction_filter( &self, context: &CompactionFilterContext, - ) -> *mut DBCompactionFilter { + ) -> Option<(CString, Self::Filter)> { //---------------- GC context -------------- let gc_context_option = GC_CONTEXT.lock().unwrap(); let gc_context = match *gc_context_option { Some(ref ctx) => ctx, - None => return std::ptr::null_mut(), + None => return None, }; //---------------- GC context END -------------- @@ -57,7 +58,7 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { if safe_point == 0 { // Safe point has not been initialized yet. 
debug!("skip gc in compaction filter because of no safe point"); - return std::ptr::null_mut(); + return None; } let ratio_threshold = { @@ -76,7 +77,7 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { .map_or(false, RocksEngine::is_stalled_or_stopped) { debug!("skip gc in compaction filter because the DB is stalled"); - return std::ptr::null_mut(); + return None; } drop(gc_context_option); @@ -90,7 +91,7 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { GC_COMPACTION_FILTER_SKIP .with_label_values(&[STAT_RAW_KEYMODE]) .inc(); - return std::ptr::null_mut(); + return None; } let filter = RawCompactionFilter::new( @@ -101,11 +102,11 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { (store_id, region_info_provider), ); let name = CString::new("raw_compaction_filter").unwrap(); - unsafe { new_compaction_filter_raw(name, filter) } + Some((name, filter)) } } -struct RawCompactionFilter { +pub struct RawCompactionFilter { safe_point: u64, is_bottommost_level: bool, gc_scheduler: Scheduler>, diff --git a/src/server/ttl/ttl_compaction_filter.rs b/src/server/ttl/ttl_compaction_filter.rs index a53a766f235..7fdb3c686b7 100644 --- a/src/server/ttl/ttl_compaction_filter.rs +++ b/src/server/ttl/ttl_compaction_filter.rs @@ -5,9 +5,8 @@ use std::{ffi::CString, marker::PhantomData}; use api_version::{KeyMode, KvFormat, RawValue}; use engine_rocks::{ raw::{ - new_compaction_filter_raw, CompactionFilter, CompactionFilterContext, - CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - DBCompactionFilter, + CompactionFilter, CompactionFilterContext, CompactionFilterDecision, + CompactionFilterFactory, CompactionFilterValueType, DBTableFileCreationReason, }, RocksTtlProperties, }; @@ -21,10 +20,12 @@ pub struct TtlCompactionFilterFactory { } impl CompactionFilterFactory for TtlCompactionFilterFactory { + type Filter = TtlCompactionFilter; + fn create_compaction_filter( &self, context: &CompactionFilterContext, - ) -> 
*mut DBCompactionFilter { + ) -> Option<(CString, Self::Filter)> { let current = ttl_current_ts(); let mut min_expire_ts = u64::MAX; @@ -38,7 +39,7 @@ impl CompactionFilterFactory for TtlCompactionFilterFactory { } } if min_expire_ts > current { - return std::ptr::null_mut(); + return None; } let name = CString::new("ttl_compaction_filter").unwrap(); @@ -46,11 +47,15 @@ impl CompactionFilterFactory for TtlCompactionFilterFactory { ts: current, _phantom: PhantomData, }; - unsafe { new_compaction_filter_raw(name, filter) } + Some((name, filter)) + } + + fn should_filter_table_file_creation(&self, _reason: DBTableFileCreationReason) -> bool { + true } } -struct TtlCompactionFilter { +pub struct TtlCompactionFilter { ts: u64, _phantom: PhantomData, } diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index aff54a41faa..c6a7cb7f20d 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -103,19 +103,25 @@ impl TestEngineBuilder { .map(|cf| match *cf { CF_DEFAULT => ( CF_DEFAULT, - cfg_rocksdb - .defaultcf - .build_opt(&shared, None, api_version, EngineType::RaftKv), + cfg_rocksdb.defaultcf.build_opt( + &shared, + None, + api_version, + None, + EngineType::RaftKv, + ), ), CF_LOCK => ( CF_LOCK, - cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), + cfg_rocksdb + .lockcf + .build_opt(&shared, None, EngineType::RaftKv), ), CF_WRITE => ( CF_WRITE, cfg_rocksdb .writecf - .build_opt(&shared, None, EngineType::RaftKv), + .build_opt(&shared, None, None, EngineType::RaftKv), ), CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), _ => (*cf, RocksCfOptions::default()), diff --git a/src/storage/mod.rs b/src/storage/mod.rs index ca35018e01e..8c58274bc33 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -4205,18 +4205,21 @@ mod tests { &shared, None, ApiVersion::V1, + None, EngineType::RaftKv, ), ), ( CF_LOCK, - cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), + 
cfg_rocksdb + .lockcf + .build_opt(&shared, None, EngineType::RaftKv), ), ( CF_WRITE, cfg_rocksdb .writecf - .build_opt(&shared, None, EngineType::RaftKv), + .build_opt(&shared, None, None, EngineType::RaftKv), ), (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), ]; diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 921dcf3615f..76eee9b1322 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -168,6 +168,7 @@ fn test_delete_files_in_range_for_titan() { &cfg.rocksdb.build_cf_resources(cache), None, cfg.storage.api_version(), + None, cfg.storage.engine, ); From c4bc6d9a4929d773660591fff5b6a21d5a07fc93 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 8 Mar 2023 16:33:12 +0800 Subject: [PATCH 572/676] raftstore-v2: use larger target file size (#14361) ref tikv/tikv#14352 Use the same target file size as if compaction guard is enabled. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- etc/config-template.toml | 11 +--- src/config/mod.rs | 68 ++++++++++------------ tests/integrations/config/mod.rs | 15 ++--- tests/integrations/config/test-custom.toml | 5 -- 4 files changed, 39 insertions(+), 60 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 9b9a81d4106..ea73efdf59e 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -515,8 +515,8 @@ ## Value -1 means files opened are always kept open and RocksDB will prefetch index and filter ## blocks into block cache at startup. So if your database has a large working set, it will take ## several minutes to open the DB. You may need to increase this if your database has a large -## working set. You can estimate the number of files based on `target-file-size-base` and -## `target_file_size_multiplier` for level-based compaction. +## working set. You can estimate the number of files based on `target-file-size-base` for +## level-based compaction. 
# max-open-files = 40960 ## Max size of RocksDB's MANIFEST file. @@ -739,10 +739,8 @@ ## Target file size for compaction. ## The SST file size of level-0 is influenced by the compaction algorithm of `write-buffer-size` ## and level0. `target-file-size-base` is used to control the size of a single SST file of level1 to -## level6. Each level will have `target-file-size-base * (target-file-size-multiplier ^ (level - 1))`. +## level6. # target-file-size-base = "8MB" -## In partitioned-raft-kv, the default value of target-file-size-multiplier is 2 for write and default cf. -# target-file-size-multiplier = 1 ## Max bytes for `compaction.max_compaction_bytes`. ## If it's necessary to enlarge value of this entry, it's better to also enlarge `reserve-space` @@ -927,7 +925,6 @@ ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. # max-bytes-for-level-base = "512MB" # target-file-size-base = "8MB" -# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 @@ -956,7 +953,6 @@ # min-write-buffer-number-to-merge = 1 # max-bytes-for-level-base = "128MB" # target-file-size-base = "8MB" -# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 1 # level0-slowdown-writes-trigger = 20 # level0-stop-writes-trigger = 20 @@ -1018,7 +1014,6 @@ ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. # max-bytes-for-level-base = "512MB" # target-file-size-base = "8MB" -# target-file-size-multiplier = 1 # level0-file-num-compaction-trigger = 4 # level0-slowdown-writes-trigger = 20 diff --git a/src/config/mod.rs b/src/config/mod.rs index b51883826c8..57c2e935d78 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -323,8 +323,7 @@ macro_rules! 
cf_config { #[online_config(skip)] pub min_write_buffer_number_to_merge: i32, pub max_bytes_for_level_base: ReadableSize, - pub target_file_size_base: ReadableSize, - pub target_file_size_multiplier: i32, + pub target_file_size_base: Option, pub level0_file_num_compaction_trigger: i32, pub level0_slowdown_writes_trigger: Option, pub level0_stop_writes_trigger: Option, @@ -380,6 +379,11 @@ macro_rules! cf_config { } impl $name { + #[inline] + fn target_file_size_base(&self) -> u64 { + self.target_file_size_base.unwrap_or(ReadableSize::mb(8)).0 + } + fn validate(&self) -> Result<(), Box> { if self.block_size.0 as usize > MAX_BLOCK_SIZE { return Err(format!( @@ -453,7 +457,7 @@ macro_rules! write_into_metrics { .set($cf.max_bytes_for_level_base.0 as f64); $metrics .with_label_values(&[$tag, "target_file_size_base"]) - .set($cf.target_file_size_base.0 as f64); + .set($cf.target_file_size_base() as f64); $metrics .with_label_values(&[$tag, "level0_file_num_compaction_trigger"]) .set($cf.level0_file_num_compaction_trigger.into()); @@ -575,10 +579,7 @@ macro_rules! 
build_cf_opt { cf_opts.set_max_write_buffer_number($opt.max_write_buffer_number); cf_opts.set_min_write_buffer_number_to_merge($opt.min_write_buffer_number_to_merge); cf_opts.set_max_bytes_for_level_base($opt.max_bytes_for_level_base.0); - cf_opts.set_target_file_size_base($opt.target_file_size_base.0); - if $opt.target_file_size_multiplier != 0 { - cf_opts.set_target_file_size_multiplier($opt.target_file_size_multiplier); - } + cf_opts.set_target_file_size_base($opt.target_file_size_base()); cf_opts.set_level_zero_file_num_compaction_trigger($opt.level0_file_num_compaction_trigger); cf_opts.set_level_zero_slowdown_writes_trigger( $opt.level0_slowdown_writes_trigger.unwrap_or_default(), @@ -665,8 +666,7 @@ impl Default for DefaultCfConfig { max_write_buffer_number: 5, min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), - target_file_size_base: ReadableSize::mb(8), - target_file_size_multiplier: 0, + target_file_size_base: None, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -830,8 +830,7 @@ impl Default for WriteCfConfig { max_write_buffer_number: 5, min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), - target_file_size_base: ReadableSize::mb(8), - target_file_size_multiplier: 0, + target_file_size_base: None, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -949,8 +948,7 @@ impl Default for LockCfConfig { max_write_buffer_number: 5, min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(128), - target_file_size_base: ReadableSize::mb(8), - target_file_size_multiplier: 0, + target_file_size_base: None, level0_file_num_compaction_trigger: 1, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -1043,8 +1041,7 @@ impl Default for RaftCfConfig { max_write_buffer_number: 5, min_write_buffer_number_to_merge: 1, 
max_bytes_for_level_base: ReadableSize::mb(128), - target_file_size_base: ReadableSize::mb(8), - target_file_size_multiplier: 0, + target_file_size_base: None, level0_file_num_compaction_trigger: 1, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -1305,16 +1302,14 @@ impl DbConfig { self.write_buffer_limit.get_or_insert(ReadableSize( (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, )); - if self.writecf.enable_compaction_guard != Some(true) - && self.writecf.target_file_size_multiplier == 0 - { - self.writecf.target_file_size_multiplier = 2; - } - if self.defaultcf.enable_compaction_guard != Some(true) - && self.defaultcf.target_file_size_multiplier == 0 - { - self.defaultcf.target_file_size_multiplier = 2; - } + // In RaftKv2, every region uses its own rocksdb instance, it's actually the + // even stricter compaction guard, so use the same output file size base. + self.writecf + .target_file_size_base + .get_or_insert(self.writecf.compaction_guard_max_output_file_size); + self.defaultcf + .target_file_size_base + .get_or_insert(self.defaultcf.compaction_guard_max_output_file_size); self.defaultcf.disable_write_stall = true; self.writecf.disable_write_stall = true; self.lockcf.disable_write_stall = true; @@ -1565,8 +1560,7 @@ impl Default for RaftDefaultCfConfig { max_write_buffer_number: 5, min_write_buffer_number_to_merge: 1, max_bytes_for_level_base: ReadableSize::mb(512), - target_file_size_base: ReadableSize::mb(8), - target_file_size_multiplier: 0, + target_file_size_base: None, level0_file_num_compaction_trigger: 4, level0_slowdown_writes_trigger: None, level0_stop_writes_trigger: None, @@ -4867,7 +4861,7 @@ mod tests { cfg.rocksdb.max_background_jobs = 4; cfg.rocksdb.max_background_flushes = 2; cfg.rocksdb.defaultcf.disable_auto_compactions = false; - cfg.rocksdb.defaultcf.target_file_size_base = ReadableSize::mb(64); + cfg.rocksdb.defaultcf.target_file_size_base = Some(ReadableSize::mb(64)); 
cfg.rocksdb.defaultcf.block_cache_size = ReadableSize::mb(8); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; @@ -5315,33 +5309,33 @@ mod tests { let no_limiter: Option = None; // Test comopaction guard disabled. let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), + target_file_size_base: Some(ReadableSize::mb(16)), enable_compaction_guard: Some(false), ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); assert_eq!( - config.target_file_size_base.0, + config.target_file_size_base(), cf_opts.get_target_file_size_base() ); // Test compaction guard enabled but region info provider is missing. let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), + target_file_size_base: Some(ReadableSize::mb(16)), enable_compaction_guard: Some(true), ..Default::default() }; let provider: Option = None; let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); assert_eq!( - config.target_file_size_base.0, + config.target_file_size_base(), cf_opts.get_target_file_size_base() ); // Test compaction guard enabled. 
let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), + target_file_size_base: Some(ReadableSize::mb(16)), enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(4), compaction_guard_max_output_file_size: ReadableSize::mb(64), @@ -5649,10 +5643,10 @@ mod tests { Some(default_cfg.coprocessor.region_split_size() * 3 / 4 / ReadableSize::kb(1)); default_cfg.raft_store.region_split_check_diff = Some(default_cfg.coprocessor.region_split_size() / 16); - default_cfg.rocksdb.writecf.target_file_size_multiplier = 1; - default_cfg.rocksdb.defaultcf.target_file_size_multiplier = 1; - default_cfg.rocksdb.lockcf.target_file_size_multiplier = 1; - default_cfg.raftdb.defaultcf.target_file_size_multiplier = 1; + default_cfg.rocksdb.writecf.target_file_size_base = Some(ReadableSize::mb(8)); + default_cfg.rocksdb.defaultcf.target_file_size_base = Some(ReadableSize::mb(8)); + default_cfg.rocksdb.lockcf.target_file_size_base = Some(ReadableSize::mb(8)); + default_cfg.raftdb.defaultcf.target_file_size_base = Some(ReadableSize::mb(8)); // Other special cases. cfg.pd.retry_max_count = default_cfg.pd.retry_max_count; // Both -1 and isize::MAX are the same. 
diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 672fd79ee12..a25a43ce6e1 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -345,8 +345,7 @@ fn test_serde_custom_tikv_config() { max_write_buffer_number: 12, min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), - target_file_size_base: ReadableSize::kb(123), - target_file_size_multiplier: 3, + target_file_size_base: Some(ReadableSize::kb(123)), level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -401,8 +400,7 @@ fn test_serde_custom_tikv_config() { max_write_buffer_number: 12, min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), - target_file_size_base: ReadableSize::kb(123), - target_file_size_multiplier: 3, + target_file_size_base: Some(ReadableSize::kb(123)), level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -471,8 +469,7 @@ fn test_serde_custom_tikv_config() { max_write_buffer_number: 12, min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), - target_file_size_base: ReadableSize::kb(123), - target_file_size_multiplier: 3, + target_file_size_base: Some(ReadableSize::kb(123)), level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -541,8 +538,7 @@ fn test_serde_custom_tikv_config() { max_write_buffer_number: 12, min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), - target_file_size_base: ReadableSize::kb(123), - target_file_size_multiplier: 3, + target_file_size_base: Some(ReadableSize::kb(123)), level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), @@ -640,8 +636,7 @@ fn test_serde_custom_tikv_config() { 
max_write_buffer_number: 12, min_write_buffer_number_to_merge: 12, max_bytes_for_level_base: ReadableSize::kb(12), - target_file_size_base: ReadableSize::kb(123), - target_file_size_multiplier: 3, + target_file_size_base: Some(ReadableSize::kb(123)), level0_file_num_compaction_trigger: 123, level0_slowdown_writes_trigger: Some(123), level0_stop_writes_trigger: Some(123), diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index f8931cbddac..d79ec7899e2 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -309,7 +309,6 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" -target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -374,7 +373,6 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" -target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -425,7 +423,6 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" -target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -476,7 +473,6 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" -target-file-size-multiplier = 3 level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 @@ -559,7 +555,6 @@ max-write-buffer-number = 12 min-write-buffer-number-to-merge = 12 max-bytes-for-level-base = "12KB" target-file-size-base = "123KB" -target-file-size-multiplier = 3 
level0-file-num-compaction-trigger = 123 level0-slowdown-writes-trigger = 123 level0-stop-writes-trigger = 123 From cc72dc9ba921ac3e1a6501e435aa754cd96dd543 Mon Sep 17 00:00:00 2001 From: Jay Date: Wed, 8 Mar 2023 17:31:12 +0800 Subject: [PATCH 573/676] importer: support raftstore v2 (#14305) ref tikv/tikv#12842 A few behavior changes: - In v2, normal mode is always used, trying to switch to import mode will get error response. - A context is added to compact range request. If not compact with region ID, the request will be rejected. - SSTs are cleaned up immediately if its corresponding regions doesn't exist on the store anymore. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore-v2/src/batch/store.rs | 6 +- components/raftstore-v2/src/fsm/peer.rs | 4 + components/raftstore-v2/src/fsm/store.rs | 5 + .../src/operation/command/write/ingest.rs | 114 ++++++++++++++++++ .../src/operation/command/write/mod.rs | 37 +----- components/raftstore-v2/src/router/message.rs | 2 + .../raftstore-v2/src/worker/tablet_gc.rs | 30 ++++- components/server/src/server.rs | 4 +- components/server/src/server2.rs | 39 +++--- components/test_raftstore-v2/src/server.rs | 21 ++-- components/test_raftstore/src/server.rs | 4 +- src/import/mod.rs | 26 +++- src/import/sst_service.rs | 98 +++++++++++---- 14 files changed, 293 insertions(+), 99 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/write/ingest.rs diff --git a/Cargo.lock b/Cargo.lock index 77d24e482d7..90e77ce6e56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2733,7 +2733,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#0561adc3754362675cc08b5203d8b6444e645395" +source = "git+https://github.com/pingcap/kvproto.git#02fc19e8abc41245e286d4a70f23e5139e3a33fe" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore-v2/src/batch/store.rs 
b/components/raftstore-v2/src/batch/store.rs index 83fa6b7a018..4833030fec3 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -621,7 +621,11 @@ impl StoreSystem { let tablet_gc_scheduler = workers.tablet_gc.start_with_timer( "tablet-gc-worker", - tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), + tablet_gc::Runner::new( + tablet_registry.clone(), + sst_importer.clone(), + self.logger.clone(), + ), ); let schedulers = Schedulers { diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 2c47ab165f2..388cdbbcce5 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -327,6 +327,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerMsg::TabletTrimmed { tablet_index } => { self.fsm.peer_mut().on_tablet_trimmed(tablet_index) } + PeerMsg::CleanupImportSst(ssts) => self + .fsm + .peer_mut() + .on_cleanup_import_sst(self.store_ctx, ssts), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index fef433f04f5..afb7aa5d0d8 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -230,6 +230,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { ); self.on_pd_store_heartbeat(); + self.schedule_tick( + StoreTick::CleanupImportSst, + self.store_ctx.cfg.cleanup_import_sst_interval.0, + ); } pub fn schedule_tick(&mut self, tick: StoreTick, timeout: Duration) { @@ -253,6 +257,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { fn on_tick(&mut self, tick: StoreTick) { match tick { StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat(), + StoreTick::CleanupImportSst => self.on_cleanup_import_sst(), _ => unimplemented!(), } } diff 
--git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs new file mode 100644 index 00000000000..c39fc25a28b --- /dev/null +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -0,0 +1,114 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use collections::HashMap; +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::import_sstpb::SstMeta; +use raftstore::{ + store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, + Result, +}; +use slog::error; +use tikv_util::{box_try, slog_panic}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, Store, StoreFsmDelegate}, + raft::{Apply, Peer}, + router::{PeerMsg, StoreTick}, + worker::tablet_gc, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_cleanup_import_sst(&mut self) { + if let Err(e) = self.fsm.store.on_cleanup_import_sst(self.store_ctx) { + error!(self.fsm.store.logger(), "cleanup import sst failed"; "error" => ?e); + } + self.schedule_tick( + StoreTick::CleanupImportSst, + self.store_ctx.cfg.cleanup_import_sst_interval.0, + ); + } +} + +impl Store { + #[inline] + fn on_cleanup_import_sst( + &mut self, + ctx: &mut StoreContext, + ) -> Result<()> { + let ssts = box_try!(ctx.sst_importer.list_ssts()); + if ssts.is_empty() { + return Ok(()); + } + let mut region_ssts: HashMap<_, Vec<_>> = HashMap::default(); + for sst in ssts { + region_ssts + .entry(sst.get_region_id()) + .or_default() + .push(sst); + } + for (region_id, ssts) in region_ssts { + if let Err(TrySendError::Disconnected(msg)) = ctx.router.send(region_id, PeerMsg::CleanupImportSst(ssts.into())) + && !ctx.router.is_shutdown() { + let PeerMsg::CleanupImportSst(ssts) = msg else { unreachable!() }; + let _ = ctx.schedulers.tablet_gc.schedule(tablet_gc::Task::CleanupImportSst(ssts)); + } + } + + Ok(()) + } +} + +impl 
Peer { + pub fn on_cleanup_import_sst( + &mut self, + ctx: &mut StoreContext, + ssts: Box<[SstMeta]>, + ) { + let epoch = self.region().get_region_epoch(); + let mut stale_ssts = Vec::from(ssts); + stale_ssts.retain(|sst| util::is_epoch_stale(sst.get_region_epoch(), epoch)); + if stale_ssts.is_empty() { + return; + } + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::CleanupImportSst(stale_ssts.into())); + } +} + +impl Apply { + #[inline] + pub fn apply_ingest(&mut self, ssts: Vec) -> Result<()> { + PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); + let mut infos = Vec::with_capacity(ssts.len()); + for sst in &ssts { + if let Err(e) = check_sst_for_ingestion(sst, self.region()) { + error!( + self.logger, + "ingest fail"; + "sst" => ?sst, + "region" => ?self.region(), + "error" => ?e + ); + let _ = self.sst_importer().delete(sst); + return Err(e); + } + match self.sst_importer().validate(sst) { + Ok(meta_info) => infos.push(meta_info), + Err(e) => { + slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); + } + } + } + // Unlike v1, we can't batch ssts accross regions. + self.flush(); + if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { + slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); + } + Ok(()) + } +} diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a461420f75b..b017a7b0ef7 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,10 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use engine_traits::{data_cf_offset, KvEngine, Mutable, RaftEngine, CF_DEFAULT}; -use kvproto::{import_sstpb::SstMeta, raft_cmdpb::RaftRequestHeader}; +use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ - check_sst_for_ingestion, cmd_resp, + cmd_resp, fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, metrics::PEER_WRITE_CMD_COUNTER, msg::ErrorCallback, @@ -12,7 +12,6 @@ use raftstore::{ }, Error, Result, }; -use slog::error; use tikv_util::slog_panic; use crate::{ @@ -22,6 +21,7 @@ use crate::{ router::{ApplyTask, CmdResChannel}, }; +mod ingest; mod simple_write; pub use simple_write::{ @@ -233,35 +233,4 @@ impl Apply { // TODO: reuse the same delete as split/merge. Ok(()) } - - #[inline] - pub fn apply_ingest(&mut self, ssts: Vec) -> Result<()> { - PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); - let mut infos = Vec::with_capacity(ssts.len()); - for sst in &ssts { - if let Err(e) = check_sst_for_ingestion(sst, self.region()) { - error!( - self.logger, - "ingest fail"; - "sst" => ?sst, - "region" => ?self.region(), - "error" => ?e - ); - let _ = self.sst_importer().delete(sst); - return Err(e); - } - match self.sst_importer().validate(sst) { - Ok(meta_info) => infos.push(meta_info), - Err(e) => { - slog_panic!(self.logger, "corrupted sst"; "sst" => ?sst, "error" => ?e); - } - } - } - // Unlike v1, we can't batch ssts accross regions. 
- self.flush(); - if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { - slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); - } - Ok(()) - } } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 317ba74d4d6..88ac0ba7948 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,6 +3,7 @@ // #[PerformanceCriticalPath] use kvproto::{ + import_sstpb::SstMeta, metapb, metapb::RegionEpoch, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, @@ -206,6 +207,7 @@ pub enum PeerMsg { TabletTrimmed { tablet_index: u64, }, + CleanupImportSst(Box<[SstMeta]>), /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index 0be8fdaa901..5799398c080 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -3,13 +3,15 @@ use std::{ fmt::{self, Display, Formatter}, path::{Path, PathBuf}, + sync::Arc, time::Duration, }; use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; -use kvproto::metapb::Region; +use kvproto::{import_sstpb::SstMeta, metapb::Region}; use slog::{debug, error, info, warn, Logger}; +use sst_importer::SstImporter; use tikv_util::{ worker::{Runnable, RunnableWithTimer}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, @@ -37,6 +39,8 @@ pub enum Task { }, /// Sometimes we know for sure a tablet can be destroyed directly. DirectDestroy { tablet: Either }, + /// Cleanup ssts. + CleanupImportSst(Box<[SstMeta]>), } impl Display for Task { @@ -70,6 +74,9 @@ impl Display for Task { Task::DirectDestroy { .. 
} => { write!(f, "direct destroy tablet") } + Task::CleanupImportSst(ssts) => { + write!(f, "cleanup import ssts {:?}", ssts) + } } } } @@ -128,6 +135,7 @@ impl Task { pub struct Runner { tablet_registry: TabletRegistry, + sst_importer: Arc, logger: Logger, // region_id -> [(tablet_path, wait_for_persisted)]. @@ -140,9 +148,14 @@ pub struct Runner { } impl Runner { - pub fn new(tablet_registry: TabletRegistry, logger: Logger) -> Self { + pub fn new( + tablet_registry: TabletRegistry, + sst_importer: Arc, + logger: Logger, + ) -> Self { Self { tablet_registry, + sst_importer, logger, waiting_destroy_tasks: HashMap::default(), pending_destroy_tasks: Vec::new(), @@ -290,6 +303,14 @@ impl Runner { } false } + + fn cleanup_ssts(&self, ssts: Box<[SstMeta]>) { + for sst in Vec::from(ssts) { + if let Err(e) = self.sst_importer.delete(&sst) { + warn!(self.logger, "failed to cleanup sst"; "err" => ?e, "sst" => ?sst); + } + } + } } impl Runnable for Runner @@ -316,6 +337,7 @@ where persisted_index, } => self.destroy(region_id, persisted_index), Task::DirectDestroy { tablet, .. 
} => self.direct_destroy(tablet), + Task::CleanupImportSst(ssts) => self.cleanup_ssts(ssts), } } } @@ -344,6 +366,7 @@ mod tests { use tempfile::Builder; use super::*; + use crate::operation::test_util::create_tmp_importer; #[test] fn test_race_between_destroy_and_trim() { @@ -357,7 +380,8 @@ mod tests { )); let registry = TabletRegistry::new(factory, dir.path()).unwrap(); let logger = slog_global::borrow_global().new(slog::o!()); - let mut runner = Runner::new(registry.clone(), logger); + let (_dir, importer) = create_tmp_importer(); + let mut runner = Runner::new(registry.clone(), importer, logger); let mut region = Region::default(); let rid = 1; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 2cde9e9cb78..ae6b86bbbd3 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -91,7 +91,7 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - import::{ImportSstService, SstImporter}, + import::{ImportSstService, LocalTablets, SstImporter}, read_pool::{ build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, }, @@ -1246,7 +1246,7 @@ where self.config.import.clone(), self.config.raft_store.raft_entry_max_size, engines.engine.clone(), - engines.engines.kv.clone(), + LocalTablets::Singleton(engines.engines.kv.clone()), servers.importer.clone(), ); if servers diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 01a76dfffbc..50b75f27c23 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -48,7 +48,8 @@ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; use kvproto::{ - deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion, + deadlock::create_deadlock, diagnosticspb::create_diagnostics, + 
import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, resource_usage_agent::create_resource_metering_pub_sub, }; use pd_client::{PdClient, RpcClient}; @@ -73,7 +74,7 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - import::SstImporter, + import::{ImportSstService, LocalTablets, SstImporter}, read_pool::{ build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, }, @@ -244,7 +245,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, - _importer: Arc, + importer: Arc, rsmeter_pubsub_service: resource_metering::PubSubService, } @@ -969,7 +970,7 @@ where self.servers = Some(Servers { lock_mgr, server, - _importer: importer, + importer, rsmeter_pubsub_service, }); @@ -978,23 +979,23 @@ where fn register_services(&mut self) { let servers = self.servers.as_mut().unwrap(); - let _engines = self.engines.as_ref().unwrap(); + let engines = self.engines.as_ref().unwrap(); // Import SST service. 
- // let import_service = ImportSstService::new( - // self.config.import.clone(), - // self.config.raft_store.raft_entry_max_size, - // engines.engine.clone(), - // self.tablet_registry.as_ref().unwrap().clone(), - // servers.importer.clone(), - // ); - // if servers - // .server - // .register_service(create_import_sst(import_service)) - // .is_some() - // { - // fatal!("failed to register import service"); - // } + let import_service = ImportSstService::new( + self.config.import.clone(), + self.config.raft_store.raft_entry_max_size, + engines.engine.clone(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + servers.importer.clone(), + ); + if servers + .server + .register_service(create_import_sst(import_service)) + .is_some() + { + fatal!("failed to register import service"); + } // Create Diagnostics service let diag_service = DiagnosticsService::new( diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index dbcede48a6a..347b6010669 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -22,6 +22,7 @@ use kvproto::{ deadlock_grpc::create_deadlock, debugpb_grpc::DebugClient, diagnosticspb_grpc::create_diagnostics, + import_sstpb_grpc::create_import_sst, kvrpcpb::{ApiVersion, Context}, metapb, raft_cmdpb::RaftCmdResponse, @@ -48,7 +49,7 @@ use test_pd_client::TestPdClient; use test_raftstore::{filter_send, AddressMap, Config, Filter}; use tikv::{ coprocessor, coprocessor_v2, - import::SstImporter, + import::{ImportSstService, LocalTablets, SstImporter}, read_pool::ReadPool, server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -462,7 +463,7 @@ impl ServerCluster { .as_ref() .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; - self.storages.insert(node_id, raft_kv_v2); + self.storages.insert(node_id, raft_kv_v2.clone()); 
ReplicaReadLockChecker::new(concurrency_manager.clone()).register(&mut coprocessor_host); @@ -473,13 +474,13 @@ impl ServerCluster { SstImporter::new(&cfg.import, dir, key_manager, cfg.storage.api_version()).unwrap(), ) }; - // let import_service = ImportSstService::new( - // cfg.import.clone(), - // cfg.raft_store.raft_entry_max_size, - // raft_kv_2.clone(), - // tablet_registry.clone(), - // Arc::clone(&importer), - // ); + let import_service = ImportSstService::new( + cfg.import.clone(), + cfg.raft_store.raft_entry_max_size, + raft_kv_v2, + LocalTablets::Registry(tablet_registry.clone()), + Arc::clone(&importer), + ); // Create deadlock service. let deadlock_service = lock_mgr.deadlock_service(); @@ -544,7 +545,7 @@ impl ServerCluster { .unwrap(); svr.register_service(create_diagnostics(diag_service.clone())); svr.register_service(create_deadlock(deadlock_service.clone())); - // svr.register_service(create_import_sst(import_service.clone())); + svr.register_service(create_import_sst(import_service.clone())); if let Some(svcs) = self.pending_services.get(&node_id) { for fact in svcs { svr.register_service(fact()); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 8c2297fbc45..9fd1229e6e5 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -50,7 +50,7 @@ use test_pd_client::TestPdClient; use tikv::{ config::ConfigController, coprocessor, coprocessor_v2, - import::{ImportSstService, SstImporter}, + import::{ImportSstService, LocalTablets, SstImporter}, read_pool::ReadPool, server::{ gc_worker::GcWorker, @@ -441,7 +441,7 @@ impl ServerCluster { cfg.import.clone(), cfg.raft_store.raft_entry_max_size, engine, - engines.kv.clone(), + LocalTablets::Singleton(engines.kv.clone()), Arc::clone(&importer), ); diff --git a/src/import/mod.rs b/src/import/mod.rs index e2fa3729e52..7ee5647f723 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -15,8 +15,9 @@ mod 
duplicate_detect; mod sst_service; -use std::fmt::Debug; +use std::{borrow::Cow, fmt::Debug}; +use engine_traits::TabletRegistry; use grpcio::{RpcStatus, RpcStatusCode}; pub use sst_importer::{Config, Error, Result, SstImporter, TxnSstWriter}; @@ -48,3 +49,26 @@ macro_rules! send_rpc_response { let _ = res.map_err(|e| warn!("send rpc response"; "err" => %e)).await; }}; } + +#[derive(Clone)] +pub enum LocalTablets { + Singleton(EK), + Registry(TabletRegistry), +} + +impl LocalTablets { + /// Get the tablet of the given region. + /// + /// If `None` is returned, the region may not exist or may not initialized. + /// If there are multiple versions of tablet, the latest one is returned + /// with best effort. + fn get(&self, region_id: u64) -> Option> { + match self { + LocalTablets::Singleton(tablet) => Some(Cow::Borrowed(tablet)), + LocalTablets::Registry(registry) => { + let mut cached = registry.get(region_id)?; + cached.latest().cloned().map(Cow::Owned) + } + } + } +} diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 02e7297bea8..291841facde 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -39,7 +39,7 @@ use tikv_util::{ use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; -use super::make_rpc_error; +use super::{make_rpc_error, LocalTablets}; use crate::{ import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE, @@ -86,7 +86,7 @@ async fn wait_write(mut s: impl Stream + Send + Unpin) -> sto #[derive(Clone)] pub struct ImportSstService { cfg: Config, - tablet_registry: E::Local, + tablets: LocalTablets, engine: E, threads: Arc, // For now, PiTR cannot be executed in the tokio runtime because it is synchronous and may @@ -265,7 +265,7 @@ impl ImportSstService { cfg: Config, raft_entry_max_size: ReadableSize, engine: E, - tablet_registry: E::Local, + tablets: LocalTablets, importer: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); @@ -293,12 
+293,14 @@ impl ImportSstService { .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .create() .unwrap(); - importer.start_switch_mode_check(threads.handle(), tablet_registry.clone()); + if let LocalTablets::Singleton(tablet) = &tablets { + importer.start_switch_mode_check(threads.handle(), tablet.clone()); + } threads.spawn(Self::tick(importer.clone())); ImportSstService { cfg, - tablet_registry, + tablets, threads: Arc::new(threads), block_threads: Arc::new(block_threads), engine, @@ -350,14 +352,20 @@ impl ImportSstService { } } - fn check_write_stall(&self) -> Option { + fn check_write_stall(&self, region_id: u64) -> Option { + let tablet = match self.tablets.get(region_id) { + Some(tablet) => tablet, + None => { + let mut errorpb = errorpb::Error::default(); + errorpb.set_message(format!("region {} not found", region_id)); + errorpb.mut_region_not_found().set_region_id(region_id); + return Some(errorpb); + } + }; if self.importer.get_mode() == SwitchMode::Normal - && self - .tablet_registry - .ingest_maybe_slowdown_writes(CF_WRITE) - .expect("cf") + && tablet.ingest_maybe_slowdown_writes(CF_WRITE).expect("cf") { - match self.tablet_registry.get_sst_key_ranges(CF_WRITE, 0) { + match tablet.get_sst_key_ranges(CF_WRITE, 0) { Ok(l0_sst_ranges) => { warn!( "sst ingest is too slow"; @@ -534,7 +542,7 @@ macro_rules! impl_write { sink: ClientStreamingSink<$resp_ty>, ) { let import = self.importer.clone(); - let tablet_registry = self.tablet_registry.clone(); + let tablets = self.tablets.clone(); let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.stream_channel_window); let mut rx = rx.map_err(Error::from); @@ -551,8 +559,17 @@ macro_rules! 
impl_write { }, _ => return Err(Error::InvalidChunk), }; + let region_id = meta.get_region_id(); + let tablet = match tablets.get(region_id) { + Some(t) => t, + None => { + return Err(Error::Engine( + format!("region {} not found", region_id).into(), + )); + } + }; - let writer = match import.$writer_fn(&tablet_registry, meta) { + let writer = match import.$writer_fn(&*tablet, meta) { Ok(w) => w, Err(e) => { error!("build writer failed {:?}", e); @@ -601,13 +618,17 @@ impl ImportSst for ImportSstService { CONFIG_ROCKSDB_GAUGE.with_label_values(&[cf, name]).set(v); } - match req.get_mode() { - SwitchMode::Normal => self - .importer - .enter_normal_mode(self.tablet_registry.clone(), mf), - SwitchMode::Import => self - .importer - .enter_import_mode(self.tablet_registry.clone(), mf), + if let LocalTablets::Singleton(tablet) = &self.tablets { + match req.get_mode() { + SwitchMode::Normal => self.importer.enter_normal_mode(tablet.clone(), mf), + SwitchMode::Import => self.importer.enter_import_mode(tablet.clone(), mf), + } + } else if req.get_mode() != SwitchMode::Normal { + Err(sst_importer::Error::Engine( + "partitioned-raft-kv doesn't support import mode".into(), + )) + } else { + Ok(false) } }; match res { @@ -742,7 +763,8 @@ impl ImportSst for ImportSstService { let timer = Instant::now_coarse(); let importer = Arc::clone(&self.importer); let limiter = self.limiter.clone(); - let tablet_registry = self.tablet_registry.clone(); + let region_id = req.get_sst().get_region_id(); + let tablets = self.tablets.clone(); let start = Instant::now(); let handle_task = async move { @@ -761,6 +783,19 @@ impl ImportSst for ImportSstService { .into_option() .filter(|c| c.cipher_type != EncryptionMethod::Plaintext); + let tablet = match tablets.get(region_id) { + Some(tablet) => tablet, + None => { + let error = sst_importer::Error::Engine(box_err!( + "region {} not found, maybe it's not a replica of this store", + region_id + )); + let mut resp = DownloadResponse::default(); + 
resp.set_error(error.into()); + return crate::send_rpc_response!(Ok(resp), sink, label, timer); + } + }; + let res = importer.download_ext::( req.get_sst(), req.get_storage_backend(), @@ -768,7 +803,7 @@ impl ImportSst for ImportSstService { req.get_rewrite_rule(), cipher, limiter, - tablet_registry, + tablet.into_owned(), DownloadExt::default() .cache_key(req.get_storage_cache_id()) .req_type(req.get_request_type()), @@ -802,7 +837,8 @@ impl ImportSst for ImportSstService { let timer = Instant::now_coarse(); let mut resp = IngestResponse::default(); - if let Some(errorpb) = self.check_write_stall() { + let region_id = req.get_context().get_region_id(); + if let Some(errorpb) = self.check_write_stall(region_id) { resp.set_error(errorpb); ctx.spawn( sink.success(resp) @@ -844,7 +880,7 @@ impl ImportSst for ImportSstService { let timer = Instant::now_coarse(); let mut resp = IngestResponse::default(); - if let Some(errorpb) = self.check_write_stall() { + if let Some(errorpb) = self.check_write_stall(req.get_context().get_region_id()) { resp.set_error(errorpb); ctx.spawn( sink.success(resp) @@ -892,7 +928,7 @@ impl ImportSst for ImportSstService { ) { let label = "compact"; let timer = Instant::now_coarse(); - let tablet_registry = self.tablet_registry.clone(); + let tablets = self.tablets.clone(); let handle_task = async move { let (start, end) = if !req.has_range() { @@ -909,7 +945,17 @@ impl ImportSst for ImportSstService { Some(req.get_output_level()) }; - let res = tablet_registry.compact_files_in_range(start, end, output_level); + let region_id = req.get_context().get_region_id(); + let tablet = match tablets.get(region_id) { + Some(tablet) => tablet, + None => { + let e = Error::Engine(format!("region {} not found", region_id).into()); + crate::send_rpc_response!(Err(e), sink, label, timer); + return; + } + }; + + let res = tablet.compact_files_in_range(start, end, output_level); match res { Ok(_) => info!( "compact files in range"; From 
0aa9e14962db2a3c1bfdc591b5e97b014eb8b16a Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 9 Mar 2023 10:41:12 +0800 Subject: [PATCH 574/676] raftstore: fix incorrect cond judgment leading to rejection of pre_proposal (#14283) close tikv/tikv#14219 raftstore: fix incorrect cond judgment leading to rejection of pre_proposal Signed-off-by: Wenbo Zhang Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/apply.rs | 5 + components/raftstore/src/store/fsm/peer.rs | 31 ++- components/raftstore/src/store/peer.rs | 15 ++ .../raftstore/src/store/peer_storage.rs | 7 +- tests/failpoints/cases/test_witness.rs | 205 +++++++++++++++++- 5 files changed, 258 insertions(+), 5 deletions(-) diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 7afb188a4b0..b9f737158fc 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3242,6 +3242,11 @@ where ctx: &mut ApplyContext, request: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { + fail_point!( + "before_exec_batch_switch_witness", + self.id() == 2, + |_| unimplemented!() + ); assert!(request.has_switch_witnesses()); let switches = request .get_switch_witnesses() diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 6acddde2257..3eca179d770 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2652,6 +2652,7 @@ where return; } if !msg.wait_data { + let original_remains_nr = self.fsm.peer.wait_data_peers.len(); self.fsm .peer .wait_data_peers @@ -2660,6 +2661,15 @@ where "receive peer ready info"; "peer_id" => self.fsm.peer.peer.get_id(), ); + if original_remains_nr != self.fsm.peer.wait_data_peers.len() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => from.get_id(), + "region" => ?self.fsm.peer.region(), + ); + 
self.fsm.peer.heartbeat_pd(self.ctx); + } return; } self.register_check_peers_availability_tick(); @@ -5149,6 +5159,8 @@ where return Err(Error::IsWitness(self.region_id())); } + fail_point!("ignore_forbid_leader_to_be_witness", |_| Ok(None)); + // Forbid requests to switch it into a witness when it's a leader if self.fsm.peer.is_leader() && msg.has_admin_request() @@ -5567,7 +5579,14 @@ where fail_point!("ignore request snapshot", |_| { self.schedule_tick(PeerTick::RequestSnapshot); }); - if !self.fsm.peer.wait_data || self.fsm.peer.is_leader() { + if !self.fsm.peer.wait_data { + return; + } + if self.fsm.peer.is_leader() + || self.fsm.peer.is_handling_snapshot() + || self.fsm.peer.has_pending_snapshot() + { + self.schedule_tick(PeerTick::RequestSnapshot); return; } self.fsm.peer.request_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); @@ -6455,9 +6474,15 @@ where for s in sw.switches { let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); if self.fsm.peer_id() == peer_id { - if is_witness && !self.fsm.peer.is_leader() { - let _ = self.fsm.peer.get_store().clear_data(); + if is_witness { self.fsm.peer.raft_group.set_priority(-1); + if !self.fsm.peer.is_leader() { + let _ = self.fsm.peer.get_store().clear_data(); + } else { + // Avoid calling `clear_data` as the region worker may be scanning snapshot, + // to avoid problems (although no problems were found by testing). + self.fsm.peer.delay_clean_data = true; + } } else { self.fsm .peer diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a1817edd17b..8dc69a0def4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -899,6 +899,13 @@ where /// the request index for retrying. 
pub request_index: u64, + /// It's used to identify the situation where the region worker is + /// generating and sending snapshots when the newly elected leader by Raft + /// applies the switch witness cmd which commited before the election. This + /// flag will prevent immediate data clearing and will be cleared after + /// the successful transfer of leadership. + pub delay_clean_data: bool, + /// When the witness becomes non-witness, it need to actively request a /// snapshot from the leader, In order to avoid log lag, we need to reject /// the leader's `MsgAppend` request unless the `term` of the `last index` @@ -1133,6 +1140,7 @@ where pending_remove: false, wait_data, request_index: last_index, + delay_clean_data: false, should_reject_msgappend: false, should_wake_up: false, force_leader: None, @@ -2323,6 +2331,10 @@ where self.mut_store().cancel_generating_snap(None); self.clear_disk_full_peers(ctx); self.clear_in_memory_pessimistic_locks(); + if self.peer.is_witness && self.delay_clean_data { + let _ = self.get_store().clear_data(); + self.delay_clean_data = false; + } } _ => {} } @@ -2614,6 +2626,7 @@ where ctx.apply_router .schedule_task(self.region_id, ApplyTask::Recover(self.region_id)); self.wait_data = false; + self.should_reject_msgappend = false; return false; } } @@ -5730,6 +5743,7 @@ fn is_request_urgent(req: &RaftCmdRequest) -> bool { | AdminCmdType::PrepareMerge | AdminCmdType::CommitMerge | AdminCmdType::RollbackMerge + | AdminCmdType::BatchSwitchWitness ) } @@ -5828,6 +5842,7 @@ mod tests { AdminCmdType::PrepareMerge, AdminCmdType::CommitMerge, AdminCmdType::RollbackMerge, + AdminCmdType::BatchSwitchWitness, ]; for tp in AdminCmdType::values() { let mut req = RaftCmdRequest::default(); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 8dc8a18906c..470cdfee998 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -524,7 
+524,12 @@ where panic!("{} unexpected state: {:?}", self.tag, *snap_state); } - if *tried_cnt >= MAX_SNAP_TRY_CNT { + let max_snap_try_cnt = (|| { + fail_point!("ignore_snap_try_cnt", |_| usize::MAX); + MAX_SNAP_TRY_CNT + })(); + + if *tried_cnt >= max_snap_try_cnt { let cnt = *tried_cnt; *tried_cnt = 0; return Err(raft::Error::Store(box_err!( diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index ef178ee8aa0..02411ba1b76 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -4,7 +4,7 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; use collections::HashMap; use futures::executor::block_on; -use kvproto::raft_serverpb::RaftApplyState; +use kvproto::{metapb, raft_serverpb::RaftApplyState}; use pd_client::PdClient; use test_raftstore::*; use tikv_util::{config::ReadableDuration, store::find_peer}; @@ -473,3 +473,206 @@ fn test_non_witness_replica_read() { .unwrap(); assert_eq!(resp.get_header().has_error(), false); } + +fn must_get_error_is_witness( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + true, + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { + region_id: region.get_id(), + ..Default::default() + }, + "{:?}", + resp + ); +} + +// Test the case that once a Raft election elects a voter as the leader, and +// then this voter applies the switch witness cmd, it becomes a witness and can +// correctly transfer the leader identity. 
+#[test] +fn test_witness_leader_transfer_out() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + + // prevent this peer from applying the switch witness command until it's elected + // as the Raft leader + fail::cfg("before_exec_batch_switch_witness", "pause").unwrap(); + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + // nonwitness -> witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store2.get_id()], vec![true]); + // make sure the left peers have applied switch witness cmd + std::thread::sleep(Duration::from_millis(500)); + + // the other follower is isolated + cluster.add_send_filter(IsolationFilterFactory::new(3)); + for i in 1..10 { + cluster.must_put(format!("k{}", i).as_bytes(), format!("v{}", i).as_bytes()); + } + // the leader is down + cluster.stop_node(1); + + // new leader would help to replicate the logs + cluster.clear_send_filters(); + std::thread::sleep(Duration::from_millis(1000)); + // make sure the new leader has became to the witness + fail::remove("before_exec_batch_switch_witness"); + std::thread::sleep(Duration::from_millis(500)); + + // forbid writes + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_is_witness(&mut cluster, ®ion, put); + // forbid reads + let get = new_get_cmd(b"k1"); + must_get_error_is_witness(&mut cluster, ®ion, get); + // forbid read index + let read_index = new_read_index_cmd(); + must_get_error_is_witness(&mut cluster, ®ion, read_index); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + + cluster.must_transfer_leader(region.get_id(), 
peer_on_store3); + cluster.must_put(b"k1", b"v1"); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + assert_eq!(cluster.must_get(b"k9"), Some(b"v9".to_vec())); +} + +// Test the case that once a Raft election elects a voter as the leader, +// and is currently generating a snapshot for another peer, then applies the +// switch witness cmd to be a witness, the generated snapshot will be checked as +// invalidated and will not be regenerated +#[test] +fn test_witness_leader_ignore_gen_snapshot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + configure_for_snapshot(&mut cluster.cfg); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // the other follower is isolated + cluster.add_send_filter(IsolationFilterFactory::new(3)); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + std::thread::sleep(Duration::from_millis(200)); + + // the truncated index is advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = 
get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + let diff = state.get_truncated_state().get_index() - before_states[&id].get_index(); + error!("EEEEE"; + "id" => &id, + "diff" => diff, + "state.get_truncated_state().get_index()" => state.get_truncated_state().get_index(), + "before_states[&id].get_index()" => before_states[&id].get_index() + ); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + + // ingore raft log gc to avoid canceling snapshots + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + // wait for leader applied switch to witness + fail::cfg("before_region_gen_snap", "pause").unwrap(); + fail::cfg("ignore_snap_try_cnt", "return").unwrap(); + // After the snapshot is generated, it will be checked as invalidated and will + // not be regenerated (handle_snapshot will not generate a snapshot for + // witness) + cluster.clear_send_filters(); + std::thread::sleep(Duration::from_millis(500)); + + // non-witness -> witness + fail::cfg("ignore_forbid_leader_to_be_witness", "return").unwrap(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store1.get_id()], + vec![true], + ); + fail::remove("before_region_gen_snap"); + + std::thread::sleep(Duration::from_millis(500)); + + // forbid writes + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_is_witness(&mut cluster, ®ion, put); + // forbid reads + let get = new_get_cmd(b"k1"); + must_get_error_is_witness(&mut cluster, ®ion, get); + // forbid read index + let read_index = new_read_index_cmd(); + must_get_error_is_witness(&mut cluster, ®ion, read_index); + + // reject to transfer, as can't send snapshot to peer_on_store3, there's a log + // gap + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + let _ = cluster.try_transfer_leader(region.get_id(), peer_on_store3); + std::thread::sleep(Duration::from_secs(5)); + assert_eq!(cluster.leader_of_region(1).unwrap(), peer_on_store1); + + // should be 
enable to transfer leader to peer_on_store2 + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + cluster.must_transfer_leader(1, peer_on_store2); + cluster.must_put(b"k1", b"v1"); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[1], + ); + assert_eq!(cluster.must_get(b"k9"), Some(b"v9".to_vec())); + + fail::remove("on_raft_gc_log_tick"); + fail::remove("ignore_snap_try_cnt"); + fail::remove("ignore_forbid_leader_to_be_witness"); +} From 82ac84bb485d4cd2f7a2e6714e7170485efb6296 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:55:12 +0800 Subject: [PATCH 575/676] integration test v2: mvcc resolve lock gc test (#14360) ref tikv/tikv#12842 mvcc resolve lock gc test for v2 Signed-off-by: Spade A Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/ready/mod.rs | 19 ++++++++++++------- components/test_raftstore-v2/src/server.rs | 6 ++++++ tests/integrations/server/kv_service.rs | 6 ++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index bf7b8ec8858..4c0bf9cbe88 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -673,14 +673,14 @@ impl Peer { fn report_persist_log_duration( &self, ctx: &mut StoreContext, - from: u64, - to: u64, + old_index: u64, + new_index: u64, ) { - if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || old_index >= new_index { return; } let now = Instant::now(); - for i in from + 1..to { + for i in old_index + 1..=new_index { if let Some((term, trackers)) = self.proposals().find_trackers(i) { if self.entry_storage().term(i).map_or(false, |t| t == term) { for tracker in trackers { @@ -694,12 +694,17 @@ impl Peer { } 
#[inline] - fn report_commit_log_duration(&self, ctx: &mut StoreContext, from: u64, to: u64) { - if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + fn report_commit_log_duration( + &self, + ctx: &mut StoreContext, + old_index: u64, + new_index: u64, + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || old_index >= new_index { return; } let now = Instant::now(); - for i in from + 1..to { + for i in old_index + 1..=new_index { if let Some((term, trackers)) = self.proposals().find_trackers(i) { if self.entry_storage().term(i).map_or(false, |t| t == term) { let commit_persisted = i <= self.persisted_index(); diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 347b6010669..b105f52be39 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -242,6 +242,7 @@ pub struct ServerMeta { sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: StoreRouter, + gc_worker: GcWorker, rsmeter_cleanup: Box, } @@ -624,6 +625,7 @@ impl ServerCluster { node, server, sim_router, + gc_worker, sim_trans: simulate_trans, rsmeter_cleanup, }, @@ -635,6 +637,10 @@ impl ServerCluster { Ok(node_id) } + pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker { + &self.metas.get(&node_id).unwrap().gc_worker + } + pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { self.causal_ts_providers.get(&node_id).cloned() } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 4a981bdfa53..44d16961f7d 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -34,6 +34,7 @@ use raftstore::{ use resource_metering::CollectorRegHandle; use tempfile::Builder; use test_raftstore::*; +use test_raftstore_macro::test_case; use tikv::{ config::QuotaConfig, coprocessor::REQ_TYPE_DAG, @@ -415,11 +416,12 @@ fn test_mvcc_rollback_and_cleanup() { 
assert_eq!(scan_lock_resp.locks.len(), 0); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_resolve_lock_gc_and_delete() { use kvproto::kvrpcpb::*; - let (cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; From 6342e0efbd6ea08ca95118a4bac53b1dea90ec52 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 9 Mar 2023 16:37:12 +0800 Subject: [PATCH 576/676] server: support sending incremental snapshot (#14275) ref tikv/tikv#14256 This PR tries to reduce the bandwidth consumption by utilizing local tablet. If a file already exists in the local tablet, it will be skipped. This PR also fixes a race of receiving snapshot. Signed-off-by: Jay Lee --- Cargo.lock | 2 +- components/batch-system/src/router.rs | 10 +- components/file_system/src/lib.rs | 66 ++ components/raftstore-v2/src/fsm/peer.rs | 13 +- .../raftstore/src/store/async_io/read.rs | 2 +- components/raftstore/src/store/snap.rs | 37 +- components/server/src/server.rs | 7 +- components/server/src/server2.rs | 6 +- components/test_raftstore-v2/src/server.rs | 6 +- components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 7 +- etc/config-template.toml | 8 +- src/lib.rs | 1 + src/server/config.rs | 5 +- src/server/engine_factory.rs | 2 +- src/server/server.rs | 7 +- src/server/service/kv.rs | 18 + src/server/snap.rs | 27 +- src/server/tablet_snap.rs | 825 ++++++++++++------ tests/integrations/config/dynamic/snap.rs | 4 +- tests/integrations/config/mod.rs | 2 +- tests/integrations/config/test-custom.toml | 2 +- tests/integrations/raftstore/test_snap.rs | 2 +- 23 files changed, 764 insertions(+), 297 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 90e77ce6e56..5fb51b4fcdf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2733,7 +2733,7 @@ dependencies = [ [[package]] name = 
"kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#02fc19e8abc41245e286d4a70f23e5139e3a33fe" +source = "git+https://github.com/pingcap/kvproto.git#60b33e619c70d8abe151f086a19a82895965f28f" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 4238929d1d4..119b7875506 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -183,10 +183,18 @@ where mailbox: BasicMailbox, msg: N::Message, ) -> Result<(), (BasicMailbox, N::Message)> { + let mut normals = self.normals.lock().unwrap(); + // Send has to be done within lock, otherwise the message may be handled + // before the mailbox is register. if let Err(SendError(m)) = mailbox.force_send(msg, &self.normal_scheduler) { return Err((mailbox, m)); } - self.register(addr, mailbox); + if let Some(mailbox) = normals.map.insert(addr, mailbox) { + mailbox.close(); + } + normals + .alive_cnt + .store(normals.map.len(), Ordering::Relaxed); Ok(()) } diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 058b2a3a5f9..0b6213094af 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -442,6 +442,42 @@ pub fn reserve_space_for_recover>(data_dir: P, file_size: u64) -> } } +const TRASH_PREFIX: &str = "TRASH-"; + +/// Remove a directory. +/// +/// Rename it before actually removal. 
+#[inline] +pub fn trash_dir_all(path: impl AsRef) -> io::Result<()> { + let path = path.as_ref(); + let name = match path.file_name() { + Some(n) => n, + None => return Err(io::Error::new(ErrorKind::InvalidInput, "path is invalid")), + }; + let trash_path = path.with_file_name(format!("{}{}", TRASH_PREFIX, name.to_string_lossy())); + if let Err(e) = rename(path, &trash_path) { + if e.kind() == ErrorKind::NotFound { + return Ok(()); + } + return Err(e); + } + remove_dir_all(trash_path) +} + +/// When using `trash_dir_all`, it's possible the directory is marked as trash +/// but not being actually deleted after a restart. This function can be used +/// to resume all those removal in the given directory. +#[inline] +pub fn clean_up_trash(path: impl AsRef) -> io::Result<()> { + for e in read_dir(path)? { + let e = e?; + if e.file_name().to_string_lossy().starts_with(TRASH_PREFIX) { + remove_dir_all(e.path())?; + } + } + Ok(()) +} + #[cfg(test)] mod tests { use std::{io::Write, iter}; @@ -608,4 +644,34 @@ mod tests { reserve_space_for_recover(data_path, 0).unwrap(); assert!(!file.exists()); } + + #[test] + fn test_trash_dir_all() { + let tmp_dir = Builder::new() + .prefix("test_reserve_space_for_recover") + .tempdir() + .unwrap(); + let data_path = tmp_dir.path(); + let sub_dir0 = data_path.join("sub_dir0"); + let trash_sub_dir0 = data_path.join(format!("{}sub_dir0", TRASH_PREFIX)); + create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + + trash_dir_all(&sub_dir0).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + create_dir_all(&sub_dir0).unwrap(); + create_dir_all(&trash_sub_dir0).unwrap(); + trash_dir_all(&sub_dir0).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + clean_up_trash(data_path).unwrap(); + + create_dir_all(&trash_sub_dir0).unwrap(); + assert!(trash_sub_dir0.exists()); + clean_up_trash(data_path).unwrap(); + assert!(!trash_sub_dir0.exists()); + } } diff --git 
a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 388cdbbcce5..77860b0ff49 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -9,10 +9,11 @@ use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{Config, TabletSnapManager, Transport}; -use slog::{debug, error, info, trace, Logger}; +use slog::{debug, info, trace, Logger}; use tikv_util::{ is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, time::{duration_to_sec, Instant}, }; @@ -159,12 +160,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, let mb = match self.store_ctx.router.mailbox(region_id) { Some(mb) => mb, None => { - error!( - self.fsm.logger(), - "failed to get mailbox"; - "tick" => ?tick, - ); - return; + if !self.fsm.peer.serving() || self.store_ctx.router.is_shutdown() { + return; + } + slog_panic!(self.fsm.logger(), "failed to get mailbox"; "tick" => ?tick); } }; self.fsm.tick_registry[idx] = true; diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index b02992bbeb0..985134048dd 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -124,7 +124,7 @@ impl ReadRunner { let checkpointer_path = self.snap_mgr().tablet_gen_path(snap_key); if checkpointer_path.as_path().exists() { // Remove the old checkpoint directly. - std::fs::remove_dir_all(checkpointer_path.as_path())?; + file_system::trash_dir_all(&checkpointer_path)?; } // Here not checkpoint to a temporary directory first, the temporary directory // logic already implemented in rocksdb. 
diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 5f971818e9a..37189d2e52b 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -4,13 +4,12 @@ use std::{ cmp::{self, Ordering as CmpOrdering, Reverse}, error::Error as StdError, fmt::{self, Display, Formatter}, - fs, io::{self, ErrorKind, Read, Write}, path::{Path, PathBuf}, result, str, sync::{ atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, - Arc, RwLock, + Arc, Mutex, RwLock, }, thread, time, u64, }; @@ -1932,6 +1931,19 @@ impl Display for TabletSnapKey { } } +pub struct ReceivingGuard<'a> { + receiving: &'a Mutex>, + key: TabletSnapKey, +} + +impl Drop for ReceivingGuard<'_> { + fn drop(&mut self) { + let mut receiving = self.receiving.lock().unwrap(); + let pos = receiving.iter().position(|k| k == &self.key).unwrap(); + receiving.swap_remove(pos); + } +} + /// `TabletSnapManager` manager tablet snapshot and shared between raftstore v2. /// It's similar `SnapManager`, but simpler in tablet version. /// @@ -1941,6 +1953,7 @@ impl Display for TabletSnapKey { pub struct TabletSnapManager { // directory to store snapfile. 
base: PathBuf, + receiving: Arc>>, } impl TabletSnapManager { @@ -1956,7 +1969,11 @@ impl TabletSnapManager { format!("{} should be a directory", path.display()), )); } - Ok(Self { base: path }) + file_system::clean_up_trash(&path)?; + Ok(Self { + base: path, + receiving: Arc::default(), + }) } pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { @@ -1976,7 +1993,7 @@ impl TabletSnapManager { pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { let path = self.tablet_gen_path(key); - if path.exists() && let Err(e) = fs::remove_dir_all(path.as_path()) { + if path.exists() && let Err(e) = file_system::trash_dir_all(&path) { error!( "delete snapshot failed"; "path" => %path.display(), @@ -2026,6 +2043,18 @@ impl TabletSnapManager { pub fn root_path(&self) -> &Path { self.base.as_path() } + + pub fn start_receive(&self, key: TabletSnapKey) -> Option> { + let mut receiving = self.receiving.lock().unwrap(); + if receiving.iter().any(|k| k == &key) { + return None; + } + receiving.push(key.clone()); + Some(ReceivingGuard { + receiving: &self.receiving, + key, + }) + } } #[cfg(test)] diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ae6b86bbbd3..e77197a7737 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -103,6 +103,7 @@ use tikv::{ resolve, service::{DebugService, DiagnosticsService}, status_server::StatusServer, + tablet_snap::NoSnapshotCache, ttl::TtlChecker, KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, GRPC_THREAD_PREFIX, @@ -885,8 +886,8 @@ where .unwrap() .to_owned(); - let bps = i64::try_from(self.config.server.snap_max_write_bytes_per_sec.0) - .unwrap_or_else(|_| fatal!("snap_max_write_bytes_per_sec > i64::max_value")); + let bps = i64::try_from(self.config.server.snap_io_max_bytes_per_sec.0) + .unwrap_or_else(|_| fatal!("snap_io_max_bytes_per_sec > i64::max_value")); let snap_mgr = SnapManagerBuilder::default() 
.max_write_bytes_per_sec(bps) @@ -1685,7 +1686,7 @@ where .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); server .server - .start(server_config, self.security_mgr.clone()) + .start(server_config, self.security_mgr.clone(), NoSnapshotCache) .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 50b75f27c23..9ae032dca7a 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1345,7 +1345,11 @@ where .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); server .server - .start(server_config, self.security_mgr.clone()) + .start( + server_config, + self.security_mgr.clone(), + self.tablet_registry.clone().unwrap(), + ) .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index b105f52be39..1c6d956d1a8 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -580,7 +580,7 @@ impl ServerCluster { let pessimistic_txn_cfg = cfg.tikv.pessimistic_txn; node.start( raft_engine, - tablet_registry, + tablet_registry.clone(), &raft_router, simulate_trans.clone(), snap_mgr.clone(), @@ -616,7 +616,9 @@ impl ServerCluster { ) .unwrap(); - server.start(server_cfg, security_mgr).unwrap(); + server + .start(server_cfg, security_mgr, tablet_registry) + .unwrap(); self.metas.insert( node_id, diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 618b760e29e..c75adf33645 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -268,7 +268,7 @@ impl Simulator for NodeCluster { { let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); let snap_mgr = SnapManagerBuilder::default() - .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) + 
.max_write_bytes_per_sec(cfg.server.snap_io_max_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 9fd1229e6e5..54da33fa3dd 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -59,6 +59,7 @@ use tikv::{ raftkv::ReplicaReadLockChecker, resolve::{self, StoreAddrResolver}, service::DebugService, + tablet_snap::NoSnapshotCache, ConnectionBuilder, Error, Node, PdStoreAddrResolver, RaftClient, RaftKv, Result as ServerResult, Server, ServerTransport, }, @@ -452,7 +453,7 @@ impl ServerCluster { let (resolver, state) = resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, extension.clone()); let snap_mgr = SnapManagerBuilder::default() - .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) + .max_write_bytes_per_sec(cfg.server.snap_io_max_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) @@ -620,7 +621,9 @@ impl ServerCluster { ) .unwrap(); - server.start(server_cfg, security_mgr).unwrap(); + server + .start(server_cfg, security_mgr, NoSnapshotCache) + .unwrap(); self.metas.insert( node_id, diff --git a/etc/config-template.toml b/etc/config-template.toml index ea73efdf59e..3930a247374 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -205,9 +205,11 @@ ## Max time to handle Coprocessor requests before timeout. # end-point-request-max-handle-duration = "60s" -## Max bytes that snapshot can be written to disk in one second. -## It should be set based on your disk performance. -# snap-max-write-bytes-per-sec = "100MB" +## Max bytes that snapshot can interact with disk in one second. 
It should be +## set based on your disk performance. Only write flow is considered, if +## partiioned-raft-kv is used, read flow is also considered and it will be estimated +## as read_size * 0.5 to get around errors from page cache. +# snap-io-max-bytes-per-sec = "100MB" ## Whether to enable request batch. # enable-request-batch = true diff --git a/src/lib.rs b/src/lib.rs index 43d5db81458..4da16ee0e74 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ #![feature(drain_filter)] #![feature(deadline_api)] #![feature(let_chains)] +#![feature(read_buf)] #![feature(type_alias_impl_trait)] #[macro_use(fail_point)] diff --git a/src/server/config.rs b/src/server/config.rs index ae5c70abe1d..5f15e72ae2f 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -146,7 +146,8 @@ pub struct Config { #[serde(with = "perf_level_serde")] #[online_config(skip)] pub end_point_perf_level: PerfLevel, - pub snap_max_write_bytes_per_sec: ReadableSize, + #[serde(alias = "snap_max_write_bytes_per_sec")] + pub snap_io_max_bytes_per_sec: ReadableSize, pub snap_max_total_size: ReadableSize, #[online_config(skip)] pub stats_concurrency: usize, @@ -251,7 +252,7 @@ impl Default for Config { ), end_point_max_concurrency: cmp::max(cpu_num as usize, MIN_ENDPOINT_MAX_CONCURRENCY), end_point_perf_level: PerfLevel::Uninitialized, - snap_max_write_bytes_per_sec: ReadableSize(DEFAULT_SNAP_MAX_BYTES_PER_SEC), + snap_io_max_bytes_per_sec: ReadableSize(DEFAULT_SNAP_MAX_BYTES_PER_SEC), snap_max_total_size: ReadableSize(0), stats_concurrency: 1, // 75 means a gRPC thread is under heavy load if its total CPU usage diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 9d2c03998e6..c3976b8eeac 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -233,7 +233,7 @@ impl TabletFactory for KvEngineFactory { // kv_db_opts, // kv_cfs_opts, // )?; - let _ = std::fs::remove_dir_all(path); + let _ = file_system::trash_dir_all(path); if let 
Some(listener) = &self.inner.flow_listener { listener.clone_with(ctx.id).on_destroyed(); } diff --git a/src/server/server.rs b/src/server/server.rs index 4c1f5e7ef69..15de7f0d4e7 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -32,6 +32,7 @@ use super::{ resolve::StoreAddrResolver, service::*, snap::{Runner as SnapHandler, Task as SnapTask}, + tablet_snap::SnapCacheBuilder, transport::ServerTransport, Config, Error, Result, }; @@ -251,6 +252,7 @@ where &mut self, cfg: Arc>, security_mgr: Arc, + snap_cache_builder: impl SnapCacheBuilder + Clone + 'static, ) -> Result<()> { match self.snap_mgr.clone() { Either::Left(mgr) => { @@ -267,6 +269,7 @@ where let snap_runner = TabletRunner::new( self.env.clone(), mgr, + snap_cache_builder, self.raft_router.clone(), security_mgr, cfg, @@ -458,7 +461,7 @@ mod tests { use crate::{ config::CoprReadPoolConfig, coprocessor::{self, readpool_impl}, - server::{raftkv::RaftRouterWrap, TestRaftStoreRouter}, + server::{raftkv::RaftRouterWrap, tablet_snap::NoSnapshotCache, TestRaftStoreRouter}, storage::{lock_manager::MockLockManager, TestEngineBuilder, TestStorageBuilderApiV1}, }; @@ -589,7 +592,7 @@ mod tests { .unwrap(); server.build_and_bind().unwrap(); - server.start(cfg, security_mgr).unwrap(); + server.start(cfg, security_mgr, NoSnapshotCache).unwrap(); let mut trans = server.transport(); router.report_unreachable(0, 0).unwrap(); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index ce6971eb8fb..f0d0009b8e6 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -702,6 +702,24 @@ impl Tikv for Service { } } + fn tablet_snapshot( + &mut self, + ctx: RpcContext<'_>, + stream: RequestStream, + sink: DuplexSink, + ) { + let task = SnapTask::RecvTablet { stream, sink }; + if let Err(e) = self.snap_scheduler.schedule(task) { + let err_msg = format!("{}", e); + let sink = match e.into_inner() { + SnapTask::Recv { sink, .. 
} => sink, + _ => unreachable!(), + }; + let status = RpcStatus::with_message(RpcStatusCode::RESOURCE_EXHAUSTED, err_msg); + ctx.spawn(sink.fail(status).map(|_| ())); + } + } + #[allow(clippy::collapsible_else_if)] fn split_region( &mut self, diff --git a/src/server/snap.rs b/src/server/snap.rs index 8fe737c2e60..bae0587c505 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -13,17 +13,20 @@ use std::{ use file_system::{IoType, WithIoType}; use futures::{ - future::{Future, TryFutureExt}, + future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, stream::{Stream, StreamExt, TryStreamExt}, task::{Context, Poll}, }; use grpcio::{ - ChannelBuilder, ClientStreamingSink, Environment, RequestStream, RpcStatus, RpcStatusCode, - WriteFlags, + ChannelBuilder, ClientStreamingSink, DuplexSink, Environment, RequestStream, RpcStatus, + RpcStatusCode, WriteFlags, }; use kvproto::{ - raft_serverpb::{Done, RaftMessage, RaftSnapshotData, SnapshotChunk}, + raft_serverpb::{ + Done, RaftMessage, RaftSnapshotData, SnapshotChunk, TabletSnapshotRequest, + TabletSnapshotResponse, + }, tikvpb::TikvClient, }; use protobuf::Message; @@ -51,6 +54,10 @@ pub enum Task { stream: RequestStream, sink: ClientStreamingSink, }, + RecvTablet { + stream: RequestStream, + sink: DuplexSink, + }, Send { addr: String, msg: RaftMessage, @@ -64,6 +71,7 @@ impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { Task::Recv { .. } => write!(f, "Recv"), + Task::RecvTablet { .. } => write!(f, "RecvTablet"), Task::Send { ref addr, ref msg, .. 
} => write!(f, "Send Snap[to: {}, snap: {:?}]", addr, msg), @@ -368,8 +376,8 @@ impl Runner { fn refresh_cfg(&mut self) { if let Some(incoming) = self.cfg_tracker.any_new() { - let limit = if incoming.snap_max_write_bytes_per_sec.0 > 0 { - incoming.snap_max_write_bytes_per_sec.0 as f64 + let limit = if incoming.snap_io_max_bytes_per_sec.0 > 0 { + incoming.snap_io_max_bytes_per_sec.0 as f64 } else { f64::INFINITY }; @@ -422,6 +430,13 @@ impl Runnable for Runner { }; self.pool.spawn(task); } + Task::RecvTablet { sink, .. } => { + let status = RpcStatus::with_message( + RpcStatusCode::UNIMPLEMENTED, + "tablet snap is not supported".to_string(), + ); + self.pool.spawn(sink.fail(status).map(|_| ())); + } Task::Send { addr, msg, cb } => { fail_point!("send_snapshot"); let region_id = msg.get_region_id(); diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index c0ecf4db611..a5a8b24d10b 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -1,11 +1,30 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +//! This file contains the implementation of sending and receiving tablet +//! snapshot. +//! +//! Different from v1, tablet snapshot always tries to use cache to speed up +//! transfering speed. The protocol is quite simple: +//! +//! sender receiver +//! send snapshot meta ----> receive snapshot meta +//! extra snapshot preview collect cache meta +//! send all preview ----> receive preview and clean up miss cache +//! files receive files list <----- send missing file list +//! send missing files ----> receive missing files +//! close sender ----> persist snapshot and report to raftstore +//! wait for receiver <----- close sender +//! 
finish + #[cfg(any(test, feature = "testexport"))] use std::io; use std::{ - convert::{TryFrom, TryInto}, + cmp, + convert::TryFrom, + fmt::Debug, fs::{self, File}, - io::{Read, Write}, + io::{BorrowedBuf, Read, Seek, SeekFrom, Write}, + path::Path, sync::{ atomic::{AtomicUsize, Ordering}, Arc, @@ -13,26 +32,32 @@ use std::{ time::Duration, }; -use file_system::{IoType, WithIoType}; +use collections::HashMap; +use crc64fast::Digest; +use engine_traits::{Checkpointer, KvEngine, TabletRegistry}; +use file_system::{IoType, OpenOptions, WithIoType}; use futures::{ - future::{Future, TryFutureExt}, + future::{Future, FutureExt}, sink::{Sink, SinkExt}, stream::{Stream, StreamExt, TryStreamExt}, }; use grpcio::{ - self, ChannelBuilder, ClientStreamingSink, Environment, RequestStream, RpcStatus, - RpcStatusCode, WriteFlags, + self, ChannelBuilder, DuplexSink, Environment, RequestStream, RpcStatus, RpcStatusCode, + WriteFlags, }; use kvproto::{ - raft_serverpb::{Done, RaftMessage, RaftSnapshotData, SnapshotChunk}, + raft_serverpb::{ + RaftMessage, RaftSnapshotData, TabletSnapshotFileChunk, TabletSnapshotFileMeta, + TabletSnapshotPreview, TabletSnapshotRequest, TabletSnapshotResponse, + }, tikvpb::TikvClient, }; use protobuf::Message; -use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; +use raftstore::store::snap::{ReceivingGuard, TabletSnapKey, TabletSnapManager}; use security::SecurityManager; use tikv_kv::RaftExtension; use tikv_util::{ - config::{Tracker, VersionTrack}, + config::{ReadableSize, Tracker, VersionTrack}, time::Instant, worker::Runnable, }; @@ -40,29 +65,75 @@ use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; use super::{ metrics::*, - snap::{Task, DEFAULT_POOL_SIZE, SNAP_CHUNK_LEN}, + snap::{Task, DEFAULT_POOL_SIZE}, Config, Error, Result, }; use crate::tikv_util::{sys::thread::ThreadBuildWrapper, time::Limiter}; -struct RecvTabletSnapContext { +const PREVIEW_CHUNK_LEN: usize = ReadableSize::kb(1).0 as usize; +const PREVIEW_BATCH_SIZE: 
usize = 256; +const FILE_CHUNK_LEN: usize = ReadableSize::mb(1).0 as usize; +const USE_CACHE_THRESHOLD: u64 = ReadableSize::mb(4).0; + +fn is_sst(file_name: &str) -> bool { + file_name.ends_with(".sst") +} + +async fn read_to(f: &mut File, to: &mut Vec, size: usize, limiter: &Limiter) -> Result<()> { + // It's likely in page cache already. + limiter.consume(size / 2).await; + to.clear(); + to.reserve_exact(size); + let mut buf: BorrowedBuf<'_> = to.spare_capacity_mut().into(); + f.read_buf_exact(buf.unfilled())?; + unsafe { + to.set_len(size); + } + Ok(()) +} + +pub trait SnapCacheBuilder: Send + Sync { + fn build(&self, region_id: u64, path: &Path) -> Result<()>; +} + +impl SnapCacheBuilder for TabletRegistry { + fn build(&self, region_id: u64, path: &Path) -> Result<()> { + if let Some(mut c) = self.get(region_id) && let Some(db) = c.latest() { + let mut checkpointer = db.new_checkpointer()?; + // Avoid flush. + checkpointer.create_at(path, None, u64::MAX)?; + Ok(()) + } else { + Err(Error::Other(format!("region {} not found", region_id).into())) + } + } +} + +#[derive(Clone)] +pub struct NoSnapshotCache; + +impl SnapCacheBuilder for NoSnapshotCache { + fn build(&self, _: u64, _: &Path) -> Result<()> { + Err(Error::Other("cache is disabled".into())) + } +} + +struct RecvTabletSnapContext<'a> { key: TabletSnapKey, raft_msg: RaftMessage, + use_cache: bool, io_type: IoType, + // Lock to avoid receive the same snapshot concurrently. 
+ _receiving_guard: ReceivingGuard<'a>, start: Instant, - chunk_size: usize, } -impl RecvTabletSnapContext { - fn new(mut head: SnapshotChunk) -> Result { - if !head.has_message() { +impl<'a> RecvTabletSnapContext<'a> { + fn new(mut head: TabletSnapshotRequest, mgr: &'a TabletSnapManager) -> Result { + if !head.has_head() { return Err(box_err!("no raft message in the first chunk")); } - - let chunk_size = match head.take_data().try_into() { - Ok(buff) => usize::from_le_bytes(buff), - Err(_) => return Err(box_err!("failed to get chunk size")), - }; + let mut head = head.take_head(); let meta = head.take_message(); let key = TabletSnapKey::from_region_snap( meta.get_region_id(), @@ -70,13 +141,18 @@ impl RecvTabletSnapContext { meta.get_message().get_snapshot(), ); let io_type = io_type_from_raft_message(&meta)?; + let receiving_guard = match mgr.start_receive(key.clone()) { + Some(g) => g, + None => return Err(box_err!("failed to start receive snapshot")), + }; Ok(RecvTabletSnapContext { key, raft_msg: meta, + use_cache: head.use_cache, io_type, + _receiving_guard: receiving_guard, start: Instant::now(), - chunk_size, }) } @@ -101,70 +177,437 @@ fn io_type_from_raft_message(msg: &RaftMessage) -> Result { } } -async fn send_snap_files( - mgr: &TabletSnapManager, - mut sender: impl Sink<(SnapshotChunk, WriteFlags), Error = Error> + Unpin, - msg: RaftMessage, - key: TabletSnapKey, - limiter: Limiter, -) -> Result { - let path = mgr.tablet_gen_path(&key); - info!("begin to send snapshot file";"snap_key" => %key); - let files = fs::read_dir(&path)? 
- .map(|f| Ok(f?.path())) - .filter(|f| f.is_ok() && f.as_ref().unwrap().is_file()) - .collect::>>()?; - let io_type = io_type_from_raft_message(&msg)?; - let _with_io_type = WithIoType::new(io_type); - let mut total_sent = msg.compute_size() as u64; - let mut chunk = SnapshotChunk::default(); - chunk.set_message(msg); - chunk.set_data(usize::to_le_bytes(SNAP_CHUNK_LEN).to_vec()); - sender - .feed((chunk, WriteFlags::default().buffer_hint(true))) - .await?; - for path in files { - let name = path.file_name().unwrap().to_str().unwrap(); - let mut buffer = Vec::with_capacity(SNAP_CHUNK_LEN); - buffer.push(name.len() as u8); - buffer.extend_from_slice(name.as_bytes()); - let mut f = File::open(&path)?; - let mut off = buffer.len(); - loop { - unsafe { - buffer.set_len(SNAP_CHUNK_LEN); +fn protocol_error(exp: &str, act: impl Debug) -> Error { + Error::Other(format!("protocol error: expect {exp}, but got {act:?}").into()) +} + +/// Check if a local SST file matches the preview meta. +/// +/// It's considered matched when: +/// 1. Have the same file size; +/// 2. The first `PREVIEW_CHUNK_LEN` bytes are the same, this contains the +/// actual data of an SST; +/// 3. The last `PREVIEW_CHUNK_LEN` bytes are the same, this contains checksum, +/// properties and other medata of an SST. 
+async fn is_sst_match_preview( + preview_meta: &TabletSnapshotFileMeta, + target: &Path, + buffer: &mut Vec, + limiter: &Limiter, +) -> Result { + let mut f = File::open(target)?; + let exist_len = f.metadata()?.len(); + if exist_len != preview_meta.file_size { + return Ok(false); + } + + let head_len = preview_meta.head_chunk.len(); + let trailing_len = preview_meta.trailing_chunk.len(); + if head_len as u64 > preview_meta.file_size || trailing_len as u64 > preview_meta.file_size { + return Err(Error::Other( + format!( + "invalid chunk length {} {} {}", + preview_meta.file_size, head_len, trailing_len + ) + .into(), + )); + } + read_to(&mut f, buffer, head_len, limiter).await?; + if *buffer != preview_meta.head_chunk { + return Ok(false); + } + + if preview_meta.trailing_chunk.is_empty() { + // A safet check to detect wrong protocol implementation. Only head chunk + // contains all the data can trailing chunk be empty. + return Ok(head_len as u64 == preview_meta.file_size); + } + + f.seek(SeekFrom::End(-(trailing_len as i64)))?; + read_to(&mut f, buffer, trailing_len, limiter).await?; + Ok(*buffer == preview_meta.trailing_chunk) +} + +async fn cleanup_cache( + path: &Path, + stream: &mut (impl Stream> + Unpin), + sink: &mut (impl Sink<(TabletSnapshotResponse, WriteFlags), Error = grpcio::Error> + Unpin), + limiter: &Limiter, +) -> Result<(u64, Vec)> { + let mut reused = 0; + let mut exists = HashMap::default(); + for entry in fs::read_dir(path)? { + let entry = entry?; + let ft = entry.file_type()?; + if ft.is_dir() { + fs::remove_dir_all(entry.path())?; + continue; + } + if ft.is_file() { + let os_name = entry.file_name(); + let name = os_name.to_str().unwrap(); + if is_sst(name) { + // Collect length requires another IO, delay till we are sure + // it's probably be reused. + exists.insert(name.to_string(), entry.path()); + continue; } - // it should break if readed len is zero or the buffer is full. 
- while off < SNAP_CHUNK_LEN { - let readed = f.read(&mut buffer[off..])?; - if readed == 0 { - unsafe { - buffer.set_len(off); - } - break; + } + fs::remove_file(entry.path())?; + } + let mut missing = vec![]; + loop { + let mut preview = match stream.next().await { + Some(Ok(mut req)) if req.has_preview() => req.take_preview(), + res => return Err(protocol_error("preview", res)), + }; + let mut buffer = Vec::with_capacity(PREVIEW_CHUNK_LEN); + for meta in preview.take_metas().into_vec() { + if is_sst(&meta.file_name) && let Some(p) = exists.remove(&meta.file_name) { + if is_sst_match_preview(&meta, &p, &mut buffer, limiter).await? { + reused += meta.file_size; + continue; + } + // We should not write to the file directly as it's hard linked. + fs::remove_file(p)?; + } + missing.push(meta.file_name); + } + if preview.end { + break; + } + } + for (_, p) in exists { + fs::remove_file(p)?; + } + let mut resp = TabletSnapshotResponse::default(); + resp.mut_files().set_file_name(missing.clone().into()); + sink.send((resp, WriteFlags::default())).await?; + Ok((reused, missing)) +} + +async fn accept_one_file( + path: &Path, + mut chunk: TabletSnapshotFileChunk, + stream: &mut (impl Stream> + Unpin), + limiter: &Limiter, + digest: &mut Digest, +) -> Result { + let name = chunk.file_name; + digest.write(name.as_bytes()); + let mut f = OpenOptions::new() + .write(true) + .create_new(true) + .open(path.join(&name))?; + let exp_size = chunk.file_size; + let mut file_size = 0; + loop { + let chunk_len = chunk.data.len(); + file_size += chunk_len as u64; + if file_size > exp_size { + return Err(Error::Other( + format!("file {} too long {} {}", name, file_size, exp_size).into(), + )); + } + limiter.consume(chunk_len).await; + digest.write(&chunk.data); + f.write_all(&chunk.data)?; + if exp_size == file_size { + f.sync_data()?; + return Ok(exp_size); + } + chunk = match stream.next().await { + Some(Ok(mut req)) if req.has_chunk() => req.take_chunk(), + res => return 
Err(protocol_error("chunk", res)), + }; + if !chunk.file_name.is_empty() { + return Err(protocol_error(&name, &chunk.file_name)); + } + } +} + +async fn accept_missing( + path: &Path, + missing_ssts: Vec, + stream: &mut (impl Stream> + Unpin), + limiter: &Limiter, +) -> Result { + let mut digest = Digest::default(); + let mut received_bytes: u64 = 0; + for name in missing_ssts { + let chunk = match stream.next().await { + Some(Ok(mut req)) if req.has_chunk() => req.take_chunk(), + res => return Err(protocol_error("chunk", res)), + }; + if chunk.file_name != name { + return Err(protocol_error(&name, &chunk.file_name)); + } + received_bytes += accept_one_file(path, chunk, stream, limiter, &mut digest).await?; + } + // Now receive other files. + loop { + let chunk = match stream.next().await { + Some(Ok(mut req)) if req.has_chunk() => req.take_chunk(), + Some(Ok(req)) if req.has_end() => { + let checksum = req.get_end().get_checksum(); + if checksum != digest.sum64() { + return Err(Error::Other( + format!("checksum mismatch {} {}", checksum, digest.sum64()).into(), + )); } - off += readed; + File::open(path)?.sync_data()?; + let res = stream.next().await; + return if res.is_none() { + Ok(received_bytes) + } else { + Err(protocol_error("None", res)) + }; } - limiter.consume(off).await; - total_sent += off as u64; - let mut chunk = SnapshotChunk::default(); - chunk.set_data(buffer); + res => return Err(protocol_error("chunk", res)), + }; + if chunk.file_name.is_empty() { + return Err(protocol_error("file_name", &chunk.file_name)); + } + received_bytes += accept_one_file(path, chunk, stream, limiter, &mut digest).await?; + } +} + +async fn recv_snap_files<'a>( + snap_mgr: &'a TabletSnapManager, + cache_builder: impl SnapCacheBuilder, + mut stream: impl Stream> + Unpin, + sink: &mut (impl Sink<(TabletSnapshotResponse, WriteFlags), Error = grpcio::Error> + Unpin), + limiter: Limiter, +) -> Result> { + let head = stream + .next() + .await + .transpose()? 
+ .ok_or_else(|| Error::Other("empty gRPC stream".into()))?; + let context = RecvTabletSnapContext::new(head, snap_mgr)?; + let _with_io_type = WithIoType::new(context.io_type); + let region_id = context.key.region_id; + let final_path = snap_mgr.final_recv_path(&context.key); + if final_path.exists() { + // The snapshot is received already, should wait for peer to apply. If the + // snapshot is corrupted, the peer should destroy it first then request again. + return Err(Error::Other( + format!("snapshot {} already exists", final_path.display()).into(), + )); + } + let path = snap_mgr.tmp_recv_path(&context.key); + info!("begin to receive tablet snapshot files"; "file" => %path.display(), "region_id" => region_id); + if path.exists() { + fs::remove_dir_all(&path)?; + } + let (reused, missing_ssts) = if context.use_cache { + if let Err(e) = cache_builder.build(region_id, &path) { + info!("not using cache"; "region_id" => region_id, "err" => ?e); + fs::create_dir_all(&path)?; + } + cleanup_cache(&path, &mut stream, sink, &limiter).await? 
+ } else { + info!("not using cache"; "region_id" => region_id); + fs::create_dir_all(&path)?; + (0, vec![]) + }; + let received = accept_missing(&path, missing_ssts, &mut stream, &limiter).await?; + info!("received all tablet snapshot file"; "snap_key" => %context.key, "region_id" => region_id, "received" => received, "reused" => reused); + let final_path = snap_mgr.final_recv_path(&context.key); + fs::rename(&path, final_path)?; + Ok(context) +} + +async fn recv_snap( + stream: RequestStream, + sink: DuplexSink, + snap_mgr: TabletSnapManager, + raft_router: R, + cache_builder: impl SnapCacheBuilder, + limiter: Limiter, +) -> Result<()> { + let stream = stream.map_err(Error::from); + let mut sink = sink; + let res = recv_snap_files(&snap_mgr, cache_builder, stream, &mut sink, limiter) + .await + .and_then(|context| context.finish(raft_router)); + match res { + Ok(()) => sink.close().await.map_err(Error::from), + Err(e) => { + let status = RpcStatus::with_message(RpcStatusCode::UNKNOWN, format!("{:?}", e)); + sink.fail(status).await.map_err(Error::from) + } + } +} + +async fn build_one_preview( + path: &Path, + iter: &mut impl Iterator, + limiter: &Limiter, +) -> Result { + let mut preview = TabletSnapshotPreview::default(); + for _ in 0..PREVIEW_BATCH_SIZE { + let (name, size) = match iter.next() { + Some((name, size)) => (name, *size), + None => break, + }; + let mut meta = TabletSnapshotFileMeta::default(); + meta.file_name = name.clone(); + meta.file_size = size; + let mut f = File::open(path.join(name))?; + let to_read = cmp::min(size as usize, PREVIEW_CHUNK_LEN); + read_to(&mut f, &mut meta.head_chunk, to_read, limiter).await?; + if size > PREVIEW_CHUNK_LEN as u64 { + f.seek(SeekFrom::End(-(to_read as i64)))?; + read_to(&mut f, &mut meta.trailing_chunk, to_read, limiter).await?; + } + preview.mut_metas().push(meta); + } + let mut req = TabletSnapshotRequest::default(); + req.set_preview(preview); + Ok(req) +} + +async fn find_missing( + path: &Path, + mut 
head: TabletSnapshotRequest, + sender: &mut (impl Sink<(TabletSnapshotRequest, WriteFlags), Error = Error> + Unpin), + receiver: &mut (impl Stream> + Unpin), + limiter: &Limiter, +) -> Result> { + let mut sst_sizes = 0; + let mut ssts = HashMap::default(); + let mut other_files = vec![]; + for f in fs::read_dir(path)? { + let entry = f?; + let ft = entry.file_type()?; + // What if it's titan? + if !ft.is_file() { + continue; + } + let os_name = entry.file_name(); + let name = os_name.to_str().unwrap().to_string(); + let file_size = entry.metadata()?.len(); + if is_sst(&name) { + sst_sizes += file_size; + ssts.insert(name, file_size); + } else { + other_files.push((name, file_size)); + } + } + if sst_sizes < USE_CACHE_THRESHOLD { + sender + .send((head, WriteFlags::default().buffer_hint(true))) + .await?; + other_files.extend(ssts); + return Ok(other_files); + } + + head.mut_head().set_use_cache(true); + // Send immediately to make receiver collect cache earlier. + sender.send((head, WriteFlags::default())).await?; + let sst_count = ssts.len(); + // PREVIEW_BATCH_SIZE -> 1, PREVIEW_BATCH_SIZE + 1 = 2. sst_count can't be 0. 
+ let batch_count = (sst_count - 1) / PREVIEW_BATCH_SIZE + 1; + let mut ssts_iter = ssts.iter(); + for _ in 0..batch_count { + let req = build_one_preview(path, &mut ssts_iter, limiter).await?; + sender + .send((req, WriteFlags::default().buffer_hint(true))) + .await?; + } + let mut req = build_one_preview(path, &mut ssts_iter, limiter).await?; + req.mut_preview().end = true; + sender.send((req, WriteFlags::default())).await?; + + let accepted = match receiver.next().await { + Some(Ok(mut req)) if req.has_files() => req.take_files().take_file_name(), + res => return Err(protocol_error("missing files", res)), + }; + let mut missing = Vec::with_capacity(accepted.len()); + for name in &accepted { + let s = match ssts.remove_entry(name) { + Some(s) => s, + None => return Err(Error::Other(format!("missing file {}", name).into())), + }; + missing.push(s); + } + missing.extend(other_files); + Ok(missing) +} + +async fn send_missing( + path: &Path, + missing: Vec<(String, u64)>, + sender: &mut (impl Sink<(TabletSnapshotRequest, WriteFlags), Error = Error> + Unpin), + limiter: &Limiter, +) -> Result<(u64, u64)> { + let mut total_sent = 0; + let mut digest = Digest::default(); + for (name, mut file_size) in missing { + let mut chunk = TabletSnapshotFileChunk::default(); + chunk.file_name = name; + digest.write(chunk.file_name.as_bytes()); + chunk.file_size = file_size; + total_sent += file_size; + if file_size == 0 { + let mut req = TabletSnapshotRequest::default(); + req.set_chunk(chunk); sender - .feed((chunk, WriteFlags::default().buffer_hint(true))) + .send((req, WriteFlags::default().buffer_hint(true))) .await?; - // It should switch the next file if the read buffer len is less than the - // SNAP_CHUNK_LEN. 
- if off < SNAP_CHUNK_LEN { + continue; + } + + let mut f = File::open(path.join(&chunk.file_name))?; + loop { + let to_read = cmp::min(FILE_CHUNK_LEN as u64, file_size) as usize; + read_to(&mut f, &mut chunk.data, to_read, limiter).await?; + digest.write(&chunk.data); + let mut req = TabletSnapshotRequest::default(); + req.set_chunk(chunk); + sender + .send((req, WriteFlags::default().buffer_hint(true))) + .await?; + if file_size == to_read as u64 { break; } - buffer = Vec::with_capacity(SNAP_CHUNK_LEN); - off = 0 + chunk = TabletSnapshotFileChunk::default(); + file_size -= to_read as u64; } } - info!("sent all snap file finish"; "snap_key" => %key); + Ok((total_sent, digest.sum64())) +} + +async fn send_snap_files( + mgr: &TabletSnapManager, + mut sender: impl Sink<(TabletSnapshotRequest, WriteFlags), Error = Error> + Unpin, + receiver: &mut (impl Stream> + Unpin), + msg: RaftMessage, + key: TabletSnapKey, + limiter: Limiter, +) -> Result { + let region_id = key.region_id; + let to_peer = key.to_peer; + let path = mgr.tablet_gen_path(&key); + info!("begin to send snapshot file"; "snap_key" => %key, "region_id" => region_id, "to_peer" => to_peer); + let io_type = io_type_from_raft_message(&msg)?; + let _with_io_type = WithIoType::new(io_type); + let mut head = TabletSnapshotRequest::default(); + head.mut_head().set_message(msg); + let missing = find_missing(&path, head, &mut sender, receiver, &limiter).await?; + let (total_sent, checksum) = send_missing(&path, missing, &mut sender, &limiter).await?; + // In gRPC, stream in serverside can finish without error (when the connection + // is closed). So we need to use an explicit `Done` to indicate all messages + // are sent. In V1, we have checksum and meta list, so this is not a + // problem. 
+ let mut req = TabletSnapshotRequest::default(); + req.mut_end().set_checksum(checksum); + sender.send((req, WriteFlags::default())).await?; SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC .send .inc_by(total_sent); + info!("sent all snap file finish"; "snap_key" => %key, "region_id" => region_id, "to_peer" => to_peer); sender.close().await?; Ok(total_sent) } @@ -201,107 +644,31 @@ pub fn send_snap( let channel = security_mgr.connect(cb, addr); let client = TikvClient::new(channel); - let (sink, receiver) = client.snapshot()?; + let (sink, mut receiver) = client.tablet_snapshot()?; let send_task = async move { let sink = sink.sink_map_err(Error::from); - let total_size = send_snap_files(&mgr, sink, msg, key.clone(), limiter).await?; - let recv_result = receiver.map_err(Error::from).await; + let total_size = + send_snap_files(&mgr, sink, &mut receiver, msg, key.clone(), limiter).await?; + let recv_result = receiver.next().await; send_timer.observe_duration(); drop(client); + mgr.delete_snapshot(&key); match recv_result { - Ok(_) => { - mgr.delete_snapshot(&key); - Ok(SendStat { - key, - total_size, - elapsed: timer.saturating_elapsed(), - }) - } - Err(e) => Err(e), + None => Ok(SendStat { + key, + total_size, + elapsed: timer.saturating_elapsed(), + }), + Some(Err(e)) => Err(e.into()), + Some(Ok(resp)) => Err(Error::Other( + format!("receive unexpected response {:?}", resp).into(), + )), } }; Ok(send_task) } -async fn recv_snap_files( - snap_mgr: TabletSnapManager, - mut stream: impl Stream> + Unpin, - limit: Limiter, -) -> Result { - let head = stream - .next() - .await - .transpose()? 
- .ok_or_else(|| Error::Other("empty gRPC stream".into()))?; - let context = RecvTabletSnapContext::new(head)?; - let chunk_size = context.chunk_size; - let path = snap_mgr.tmp_recv_path(&context.key); - info!("begin to receive tablet snapshot files"; "file" => %path.display()); - fs::create_dir_all(&path)?; - let _with_io_type = WithIoType::new(context.io_type); - loop { - let mut chunk = match stream.next().await { - Some(Ok(mut c)) if !c.has_message() => c.take_data(), - Some(_) => { - return Err(box_err!("duplicated metadata")); - } - None => break, - }; - // the format of chunk: - // |--name_len--|--name--|--content--| - let len = chunk[0] as usize; - let file_name = box_try!(std::str::from_utf8(&chunk[1..len + 1])); - let p = path.join(file_name); - let mut f = File::create(&p)?; - let mut size = chunk.len() - len - 1; - f.write_all(&chunk[len + 1..])?; - // It should switch next file if the chunk size is less than the SNAP_CHUNK_LEN. - while chunk.len() >= chunk_size { - chunk = match stream.next().await { - Some(Ok(mut c)) if !c.has_message() => c.take_data(), - Some(_) => return Err(box_err!("duplicated metadata")), - None => return Err(box_err!("missing chunk")), - }; - f.write_all(&chunk[..])?; - limit.consume(chunk.len()).await; - size += chunk.len(); - } - debug!("received snap file"; "file" => %p.display(), "size" => size); - SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC - .recv - .inc_by(size as u64); - f.sync_data()?; - } - info!("received all tablet snapshot file"; "snap_key" => %context.key); - let final_path = snap_mgr.final_recv_path(&context.key); - fs::rename(&path, final_path)?; - Ok(context) -} - -fn recv_snap( - stream: RequestStream, - sink: ClientStreamingSink, - snap_mgr: TabletSnapManager, - raft_router: R, - limit: Limiter, -) -> impl Future> { - let recv_task = async move { - let stream = stream.map_err(Error::from); - let context = recv_snap_files(snap_mgr, stream, limit).await?; - context.finish(raft_router) - }; - async move { - match 
recv_task.await { - Ok(()) => sink.success(Done::default()).await.map_err(Error::from), - Err(e) => { - let status = RpcStatus::with_message(RpcStatusCode::UNKNOWN, format!("{:?}", e)); - sink.fail(status).await.map_err(Error::from) - } - } - } -} - -pub struct TabletRunner { +pub struct TabletRunner { env: Arc, snap_mgr: TabletSnapManager, security_mgr: Arc, @@ -311,21 +678,23 @@ pub struct TabletRunner { cfg: Config, sending_count: Arc, recving_count: Arc, + cache_builder: B, limiter: Limiter, } -impl TabletRunner { +impl TabletRunner { pub fn new( env: Arc, snap_mgr: TabletSnapManager, + cache_builder: B, r: R, security_mgr: Arc, cfg: Arc>, ) -> Self { let config = cfg.value().clone(); let cfg_tracker = cfg.tracker("tablet-sender".to_owned()); - let limit = i64::try_from(config.snap_max_write_bytes_per_sec.0) - .unwrap_or_else(|_| panic!("snap_max_write_bytes_per_sec > i64::max_value")); + let limit = i64::try_from(config.snap_io_max_bytes_per_sec.0) + .unwrap_or_else(|_| panic!("snap_io_max_bytes_per_sec > i64::max_value")); let limiter = Limiter::new(if limit > 0 { limit as f64 } else { @@ -348,6 +717,7 @@ impl TabletRunner { cfg: config, sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), + cache_builder, limiter, }; snap_worker @@ -355,8 +725,8 @@ impl TabletRunner { fn refresh_cfg(&mut self) { if let Some(incoming) = self.cfg_tracker.any_new() { - let limit = if incoming.snap_max_write_bytes_per_sec.0 > 0 { - incoming.snap_max_write_bytes_per_sec.0 as f64 + let limit = if incoming.snap_io_max_bytes_per_sec.0 > 0 { + incoming.snap_io_max_bytes_per_sec.0 as f64 } else { f64::INFINITY }; @@ -374,12 +744,23 @@ pub struct SendStat { elapsed: Duration, } -impl Runnable for TabletRunner { +impl Runnable for TabletRunner +where + B: SnapCacheBuilder + Clone + 'static, + R: RaftExtension, +{ type Task = Task; fn run(&mut self, task: Task) { match task { - Task::Recv { stream, sink } => { + Task::Recv { sink, .. 
} => { + let status = RpcStatus::with_message( + RpcStatusCode::UNIMPLEMENTED, + "tablet snap is not supported".to_string(), + ); + self.pool.spawn(sink.fail(status).map(|_| ())); + } + Task::RecvTablet { stream, sink } => { let task_num = self.recving_count.load(Ordering::SeqCst); if task_num >= self.cfg.concurrent_recv_snap_limit { warn!("too many recving snapshot tasks, ignore"); @@ -399,9 +780,12 @@ impl Runnable for TabletRunner { let raft_router = self.raft_router.clone(); let recving_count = self.recving_count.clone(); recving_count.fetch_add(1, Ordering::SeqCst); - let limit = self.limiter.clone(); + let limiter = self.limiter.clone(); + let cache_builder = self.cache_builder.clone(); let task = async move { - let result = recv_snap(stream, sink, snap_mgr, raft_router, limit).await; + let result = + recv_snap(stream, sink, snap_mgr, raft_router, cache_builder, limiter) + .await; recving_count.fetch_sub(1, Ordering::SeqCst); if let Err(e) = result { error!("failed to recv snapshot"; "err" => %e); @@ -427,9 +811,16 @@ impl Runnable for TabletRunner { let security_mgr = Arc::clone(&self.security_mgr); let sending_count = Arc::clone(&self.sending_count); sending_count.fetch_add(1, Ordering::SeqCst); - let limit = self.limiter.clone(); - let send_task = - send_snap(env, mgr, security_mgr, &self.cfg.clone(), &addr, msg, limit); + let limiter = self.limiter.clone(); + let send_task = send_snap( + env, + mgr, + security_mgr, + &self.cfg.clone(), + &addr, + msg, + limiter, + ); let task = async move { let res = match send_task { Err(e) => Err(e), @@ -480,11 +871,10 @@ pub fn copy_tablet_snapshot( .filter(|f| f.is_ok() && f.as_ref().unwrap().is_file()) .collect::>>()?; - let mut head = SnapshotChunk::default(); - head.set_message(msg); - head.set_data(usize::to_le_bytes(SNAP_CHUNK_LEN).to_vec()); + let mut head = TabletSnapshotRequest::default(); + head.mut_head().set_message(msg); - let recv_context = RecvTabletSnapContext::new(head)?; + let recv_context = 
RecvTabletSnapContext::new(head, recver_snap_mgr)?; let recv_path = recver_snap_mgr.tmp_recv_path(&recv_context.key); fs::create_dir_all(&recv_path)?; @@ -502,78 +892,3 @@ pub fn copy_tablet_snapshot( fs::rename(&recv_path, final_path)?; Ok(()) } - -#[cfg(test)] -mod tests { - use std::{ - fs::{create_dir_all, File}, - io::Write, - }; - - use futures::{ - channel::mpsc::{self}, - executor::block_on, - sink::SinkExt, - }; - use futures_util::StreamExt; - use grpcio::WriteFlags; - use kvproto::raft_serverpb::{RaftMessage, SnapshotChunk}; - use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; - use tempfile::TempDir; - use tikv_util::{store::new_peer, time::Limiter}; - - use super::{super::Error, recv_snap_files, send_snap_files, SNAP_CHUNK_LEN}; - - #[test] - fn test_send_tablet() { - let limiter = Limiter::new(f64::INFINITY); - let snap_key = TabletSnapKey::new(1, 1, 1, 1); - let mut msg = RaftMessage::default(); - msg.set_region_id(1); - msg.set_to_peer(new_peer(1, 1)); - msg.mut_message().mut_snapshot().mut_metadata().set_index(1); - msg.mut_message().mut_snapshot().mut_metadata().set_term(1); - let send_path = TempDir::new().unwrap(); - let send_snap_mgr = - TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()).unwrap(); - let snap_path = send_snap_mgr.tablet_gen_path(&snap_key); - create_dir_all(snap_path.as_path()).unwrap(); - // send file should skip directory - create_dir_all(snap_path.join("dir")).unwrap(); - for i in 0..2 { - let mut f = File::create(snap_path.join(i.to_string())).unwrap(); - let count = SNAP_CHUNK_LEN - 2; - let mut data = std::iter::repeat("a".as_bytes()) - .take(count) - .collect::>(); - for buffer in data.iter_mut() { - f.write_all(buffer).unwrap(); - } - f.sync_data().unwrap(); - } - - let recv_path = TempDir::new().unwrap(); - let recv_snap_manager = - TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()).unwrap(); - let (tx, rx) = mpsc::unbounded(); - let sink = 
tx.sink_map_err(Error::from); - block_on(send_snap_files( - &send_snap_mgr, - sink, - msg, - snap_key.clone(), - limiter.clone(), - )) - .unwrap(); - - let stream = rx.map(|x: (SnapshotChunk, WriteFlags)| Ok(x.0)); - let final_path = recv_snap_manager.final_recv_path(&snap_key); - let r = block_on(recv_snap_files(recv_snap_manager, stream, limiter)).unwrap(); - assert_eq!(r.key, snap_key); - std::thread::sleep(std::time::Duration::from_secs(1)); - let dir = std::fs::read_dir(final_path).unwrap(); - assert_eq!(2, dir.count()); - send_snap_mgr.delete_snapshot(&snap_key); - assert!(!snap_path.exists()); - } -} diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index af03246acf4..bb91d0d62eb 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -95,7 +95,7 @@ fn test_update_server_config() { let change = { let mut m = std::collections::HashMap::new(); m.insert( - "server.snap-max-write-bytes-per-sec".to_owned(), + "server.snap-io-max-bytes-per-sec".to_owned(), "512MB".to_owned(), ); m.insert( @@ -106,7 +106,7 @@ fn test_update_server_config() { }; cfg_controller.update(change).unwrap(); - svr_cfg.snap_max_write_bytes_per_sec = ReadableSize::mb(512); + svr_cfg.snap_io_max_bytes_per_sec = ReadableSize::mb(512); svr_cfg.concurrent_send_snap_limit = 100; // config should be updated assert_eq!(snap_mgr.get_speed_limit() as u64, 536870912); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a25a43ce6e1..102d695b2de 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -113,7 +113,7 @@ fn test_serde_custom_tikv_config() { end_point_request_max_handle_duration: ReadableDuration::secs(12), end_point_max_concurrency: 10, end_point_perf_level: PerfLevel::EnableTime, - snap_max_write_bytes_per_sec: ReadableSize::mb(10), + snap_io_max_bytes_per_sec: ReadableSize::mb(10), snap_max_total_size: ReadableSize::gb(10), 
stats_concurrency: 10, heavy_load_threshold: 25, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index d79ec7899e2..722bdf0c56b 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -74,7 +74,7 @@ end-point-enable-batch-if-possible = true end-point-request-max-handle-duration = "12s" end-point-max-concurrency = 10 end-point-perf-level = 5 -snap-max-write-bytes-per-sec = "10MB" +snap-io-max-bytes-per-sec = "10MB" snap-max-total-size = "10GB" stats-concurrency = 10 heavy-load-threshold = 25 diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 4d9290b4eff..0ca576e5e9a 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -508,7 +508,7 @@ fn test_inspected_snapshot() { #[test] fn test_gen_during_heavy_recv() { let mut cluster = new_server_cluster(0, 3); - cluster.cfg.server.snap_max_write_bytes_per_sec = ReadableSize(5 * 1024 * 1024); + cluster.cfg.server.snap_io_max_bytes_per_sec = ReadableSize(5 * 1024 * 1024); cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration(Duration::from_secs(100)); let pd_client = Arc::clone(&cluster.pd_client); From 64a41f042283255e43a285357b999b26853a3e8f Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Fri, 10 Mar 2023 17:59:13 +0800 Subject: [PATCH 577/676] coprocessor: reuse EvalContext in collect_column_stats (#14376) ref tikv/tikv#14231 Signed-off-by: xuyifan <675434007@qq.com> --- src/coprocessor/statistics/analyze.rs | 60 ++++++++++----------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 6b486c3bb7e..a49ac72398e 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -376,6 +376,7 @@ impl RowSampleBuilder { let mut 
is_drained = false; let mut collector = self.new_collector(); + let mut ctx = EvalContext::default(); while !is_drained { let mut sample = self.quota_limiter.new_sample(!self.is_auto_analyze); let mut read_size: usize = 0; @@ -400,7 +401,7 @@ impl RowSampleBuilder { columns_slice[i].encode( *logical_row, &self.columns_info[i], - &mut EvalContext::default(), + &mut ctx, &mut column_vals[i], )?; if self.columns_info[i].as_accessor().is_string_like() { @@ -408,7 +409,7 @@ impl RowSampleBuilder { TT, match self.columns_info[i].as_accessor().collation()? { Collation::TT => { let mut mut_val = &column_vals[i][..]; - let decoded_val = table::decode_col_value(&mut mut_val, &mut EvalContext::default(), &self.columns_info[i])?; + let decoded_val = table::decode_col_value(&mut mut_val, &mut ctx, &self.columns_info[i])?; if decoded_val == Datum::Null { collation_key_vals[i].clone_from(&column_vals[i]); } else { @@ -895,6 +896,7 @@ impl SampleBuilder { let mut common_handle_hist = Histogram::new(self.max_bucket_size); let mut common_handle_cms = CmSketch::new(self.cm_sketch_depth, self.cm_sketch_width); let mut common_handle_fms = FmSketch::new(self.max_fm_sketch_size); + let mut ctx = EvalContext::default(); while !is_drained { let result = self.data.next_batch(BATCH_MAX_SIZE).await; is_drained = result.is_drained?.stop(); @@ -904,12 +906,7 @@ impl SampleBuilder { if columns_without_handle_len + 1 == columns_slice.len() { for logical_row in &result.logical_rows { let mut data = vec![]; - columns_slice[0].encode( - *logical_row, - &columns_info[0], - &mut EvalContext::default(), - &mut data, - )?; + columns_slice[0].encode(*logical_row, &columns_info[0], &mut ctx, &mut data)?; pk_builder.append(&data, false); } columns_slice = &columns_slice[1..]; @@ -929,7 +926,7 @@ impl SampleBuilder { columns_slice[i].encode( *logical_row, &columns_info[i], - &mut EvalContext::default(), + &mut ctx, &mut handle_col_val, )?; data.extend_from_slice(&handle_col_val); @@ -974,12 +971,7 @@ impl 
SampleBuilder { for (i, collector) in collectors.iter_mut().enumerate() { for logical_row in &result.logical_rows { let mut val = vec![]; - columns_slice[i].encode( - *logical_row, - &columns_info[i], - &mut EvalContext::default(), - &mut val, - )?; + columns_slice[i].encode(*logical_row, &columns_info[i], &mut ctx, &mut val)?; // This is a workaround for different encoding methods used by TiDB and TiKV for // CM Sketch. We need this because we must ensure we are using the same encoding @@ -1000,9 +992,8 @@ impl SampleBuilder { INT_FLAG | UINT_FLAG | DURATION_FLAG => { let mut mut_val = &val[..]; let decoded_val = mut_val.read_datum()?; - let flattened = - table::flatten(&mut EvalContext::default(), decoded_val)?; - encode_value(&mut EvalContext::default(), &[flattened])? + let flattened = table::flatten(&mut ctx, decoded_val)?; + encode_value(&mut ctx, &[flattened])? } _ => val, }; @@ -1012,14 +1003,14 @@ impl SampleBuilder { TT, match columns_info[i].as_accessor().collation()? { Collation::TT => { let mut mut_val = &val[..]; - let decoded_val = table::decode_col_value(&mut mut_val, &mut EvalContext::default(), &columns_info[i])?; + let decoded_val = table::decode_col_value(&mut mut_val, &mut ctx, &columns_info[i])?; if decoded_val == Datum::Null { val } else { // Only if the `decoded_val` is Datum::Null, `decoded_val` is a Ok(None). // So it is safe the unwrap the Ok value. let decoded_sorted_val = TT::sort_key(&decoded_val.as_string()?.unwrap().into_owned())?; - encode_value(&mut EvalContext::default(), &[Datum::Bytes(decoded_sorted_val)])? + encode_value(&mut ctx, &[Datum::Bytes(decoded_sorted_val)])? 
} } } @@ -1237,8 +1228,9 @@ mod tests { ); let cases = vec![Datum::I64(1), Datum::Null, Datum::I64(2), Datum::I64(5)]; + let mut ctx = EvalContext::default(); for data in cases { - sample.collect(datum::encode_value(&mut EvalContext::default(), &[data]).unwrap()); + sample.collect(datum::encode_value(&mut ctx, &[data]).unwrap()); } assert_eq!(sample.samples.len(), max_sample_size); assert_eq!(sample.null_count, 1); @@ -1254,10 +1246,9 @@ mod tests { let loop_cnt = 1000; let mut item_cnt: HashMap, usize> = HashMap::new(); let mut nums: Vec> = Vec::with_capacity(row_num); + let mut ctx = EvalContext::default(); for i in 0..row_num { - nums.push( - datum::encode_value(&mut EvalContext::default(), &[Datum::I64(i as i64)]).unwrap(), - ); + nums.push(datum::encode_value(&mut ctx, &[Datum::I64(i as i64)]).unwrap()); } for loop_i in 0..loop_cnt { let mut collector = ReservoirRowSampleCollector::new(sample_num, 1000, 1); @@ -1302,10 +1293,9 @@ mod tests { let loop_cnt = 1000; let mut item_cnt: HashMap, usize> = HashMap::new(); let mut nums: Vec> = Vec::with_capacity(row_num); + let mut ctx = EvalContext::default(); for i in 0..row_num { - nums.push( - datum::encode_value(&mut EvalContext::default(), &[Datum::I64(i as i64)]).unwrap(), - ); + nums.push(datum::encode_value(&mut ctx, &[Datum::I64(i as i64)]).unwrap()); } for loop_i in 0..loop_cnt { let mut collector = @@ -1348,10 +1338,9 @@ mod tests { let sample_num = 0; // abnormal. 
let row_num = 100; let mut nums: Vec> = Vec::with_capacity(row_num); + let mut ctx = EvalContext::default(); for i in 0..row_num { - nums.push( - datum::encode_value(&mut EvalContext::default(), &[Datum::I64(i as i64)]).unwrap(), - ); + nums.push(datum::encode_value(&mut ctx, &[Datum::I64(i as i64)]).unwrap()); } { // Test for ReservoirRowSampleCollector @@ -1409,19 +1398,16 @@ mod benches { } let mut column_vals = Vec::new(); let mut collation_key_vals = Vec::new(); + let mut ctx = EvalContext::default(); for i in 0..columns_info.len() { let mut val = vec![]; columns_slice[i] - .encode(0, &columns_info[i], &mut EvalContext::default(), &mut val) + .encode(0, &columns_info[i], &mut ctx, &mut val) .unwrap(); if columns_info[i].as_accessor().is_string_like() { let mut mut_val = &val[..]; - let decoded_val = table::decode_col_value( - &mut mut_val, - &mut EvalContext::default(), - &columns_info[i], - ) - .unwrap(); + let decoded_val = + table::decode_col_value(&mut mut_val, &mut ctx, &columns_info[i]).unwrap(); let decoded_sorted_val = CollatorUtf8Mb4Bin::sort_key(&decoded_val.as_string().unwrap().unwrap()) .unwrap(); From fc9cf096b3eddc48e2f1df09dd6d716198067410 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 10 Mar 2023 18:15:13 +0800 Subject: [PATCH 578/676] *: more accurate time detail in kv response (#14358) close tikv/tikv#14262 Co-authored-by: Ti Chi Robot --- components/tikv_kv/src/stats.rs | 8 +- src/coprocessor/endpoint.rs | 348 +++++++++++------- src/coprocessor/tracker.rs | 28 +- src/server/service/kv.rs | 54 ++- src/storage/mod.rs | 16 +- tests/integrations/config/mod.rs | 2 + tests/integrations/config/test-custom.toml | 3 + tests/integrations/coprocessor/test_select.rs | 3 + tests/integrations/server/kv_service.rs | 17 +- 9 files changed, 303 insertions(+), 176 deletions(-) diff --git a/components/tikv_kv/src/stats.rs b/components/tikv_kv/src/stats.rs index 4362f5d57ca..d38c97397ee 100644 --- a/components/tikv_kv/src/stats.rs +++ 
b/components/tikv_kv/src/stats.rs @@ -283,8 +283,8 @@ impl StatisticsSummary { /// ``` #[derive(Debug, Default, Copy, Clone)] pub struct StageLatencyStats { - pub schedule_wait_time_ms: u64, - pub snapshot_wait_time_ms: u64, - pub wait_wall_time_ms: u64, - pub process_wall_time_ms: u64, + pub schedule_wait_time_ns: u64, + pub snapshot_wait_time_ns: u64, + pub wait_wall_time_ns: u64, + pub process_wall_time_ns: u64, } diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 3ba320149ac..43bf20f582b 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -878,7 +878,7 @@ mod tests { /// A unary `RequestHandler` that always produces a fixture. struct UnaryFixture { - handle_duration_millis: u64, + handle_duration: Duration, yieldable: bool, result: Option>, } @@ -886,7 +886,7 @@ mod tests { impl UnaryFixture { pub fn new(result: Result) -> UnaryFixture { UnaryFixture { - handle_duration_millis: 0, + handle_duration: Default::default(), yieldable: false, result: Some(result), } @@ -894,10 +894,10 @@ mod tests { pub fn new_with_duration( result: Result, - handle_duration_millis: u64, + handle_duration: Duration, ) -> UnaryFixture { UnaryFixture { - handle_duration_millis, + handle_duration, yieldable: false, result: Some(result), } @@ -905,10 +905,10 @@ mod tests { pub fn new_with_duration_yieldable( result: Result, - handle_duration_millis: u64, + handle_duration: Duration, ) -> UnaryFixture { UnaryFixture { - handle_duration_millis, + handle_duration, yieldable: true, result: Some(result), } @@ -920,13 +920,15 @@ mod tests { async fn handle_request(&mut self) -> Result> { if self.yieldable { // We split the task into small executions of 100 milliseconds. 
- for _ in 0..self.handle_duration_millis / 100 { + for _ in 0..self.handle_duration.as_millis() as u64 / 100 { thread::sleep(Duration::from_millis(100)); yatp::task::future::reschedule().await; } - thread::sleep(Duration::from_millis(self.handle_duration_millis % 100)); + thread::sleep(Duration::from_millis( + self.handle_duration.as_millis() as u64 % 100, + )); } else { - thread::sleep(Duration::from_millis(self.handle_duration_millis)); + thread::sleep(self.handle_duration); } self.result.take().unwrap().map(|x| x.into()) @@ -937,7 +939,7 @@ mod tests { struct StreamFixture { result_len: usize, result_iter: vec::IntoIter>, - handle_durations_millis: vec::IntoIter, + handle_durations: vec::IntoIter, nth: usize, } @@ -947,20 +949,20 @@ mod tests { StreamFixture { result_len: len, result_iter: result.into_iter(), - handle_durations_millis: vec![0; len].into_iter(), + handle_durations: vec![Duration::default(); len].into_iter(), nth: 0, } } pub fn new_with_duration( result: Vec>, - handle_durations_millis: Vec, + handle_durations: Vec, ) -> StreamFixture { - assert_eq!(result.len(), handle_durations_millis.len()); + assert_eq!(result.len(), handle_durations.len()); StreamFixture { result_len: result.len(), result_iter: result.into_iter(), - handle_durations_millis: handle_durations_millis.into_iter(), + handle_durations: handle_durations.into_iter(), nth: 0, } } @@ -980,8 +982,8 @@ mod tests { Ok((None, is_finished)) } Some(val) => { - let handle_duration_ms = self.handle_durations_millis.next().unwrap(); - thread::sleep(Duration::from_millis(handle_duration_ms)); + let handle_duration = self.handle_durations.next().unwrap(); + thread::sleep(handle_duration); match val { Ok(resp) => Ok((Some(resp), is_finished)), Err(e) => Err(e), @@ -1198,7 +1200,10 @@ mod tests { context.set_priority(kvrpcpb::CommandPri::Normal); let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration(Ok(response), 1000).into_boxed()) + Ok( + 
UnaryFixture::new_with_duration(Ok(response), Duration::from_millis(1000)) + .into_boxed(), + ) }); let future = copr.handle_unary_request(ReqContext::default_for_test(), handler_builder); let tx = tx.clone(); @@ -1466,20 +1471,20 @@ mod tests { use tikv_util::config::ReadableDuration; /// Asserted that the snapshot can be retrieved in 500ms. - const SNAPSHOT_DURATION_MS: u64 = 500; + const SNAPSHOT_DURATION: Duration = Duration::from_millis(500); /// Asserted that the delay caused by OS scheduling other tasks is /// smaller than 200ms. This is mostly for CI. - const HANDLE_ERROR_MS: u64 = 200; + const HANDLE_ERROR: Duration = Duration::from_millis(200); /// The acceptable error range for a coarse timer. Note that we use /// CLOCK_MONOTONIC_COARSE which can be slewed by time /// adjustment code (e.g., NTP, PTP). - const COARSE_ERROR_MS: u64 = 50; + const COARSE_ERROR: Duration = Duration::from_millis(50); /// The duration that payload executes. - const PAYLOAD_SMALL: u64 = 3000; - const PAYLOAD_LARGE: u64 = 6000; + const PAYLOAD_SMALL: Duration = Duration::from_millis(3000); + const PAYLOAD_LARGE: Duration = Duration::from_millis(6000); let engine = TestEngineBuilder::new().build().unwrap(); @@ -1494,7 +1499,7 @@ mod tests { )); let config = Config { - end_point_request_max_handle_duration: ReadableDuration::millis( + end_point_request_max_handle_duration: ReadableDuration( (PAYLOAD_SMALL + PAYLOAD_LARGE) * 2, ), ..Default::default() @@ -1516,7 +1521,7 @@ mod tests { req_with_exec_detail.context.set_record_time_stat(true); { - let mut wait_time: u64 = 0; + let mut wait_time: Duration = Duration::default(); // Request 1: Unary, success response. let handler_builder = Box::new(|_, _: &_| { @@ -1530,7 +1535,7 @@ mod tests { let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. 
- thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); + thread::sleep(SNAPSHOT_DURATION); // Request 2: Unary, error response. let handler_builder = Box::new(|_, _: &_| { @@ -1543,63 +1548,95 @@ mod tests { copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); + thread::sleep(SNAPSHOT_DURATION); // Response 1 let resp = &rx.recv().unwrap()[0]; assert!(resp.get_other_error().is_empty()); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL + HANDLE_ERROR + COARSE_ERROR ); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time.saturating_sub(HANDLE_ERROR + COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time + HANDLE_ERROR + COARSE_ERROR ); - wait_time += PAYLOAD_SMALL - SNAPSHOT_DURATION_MS; + wait_time += PAYLOAD_SMALL - SNAPSHOT_DURATION; // Response 2 let resp = &rx.recv().unwrap()[0]; assert!(!resp.get_other_error().is_empty()); 
assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE + HANDLE_ERROR + COARSE_ERROR ); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time.saturating_sub(HANDLE_ERROR + COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time + HANDLE_ERROR + COARSE_ERROR + ); + + // check TimeDetail and TimeDetailV2 has the same value. + let time_detail = resp.get_exec_details_v2().get_time_detail(); + let time_detail_v2 = resp.get_exec_details_v2().get_time_detail_v2(); + assert_eq!( + time_detail.get_process_wall_time_ms(), + time_detail_v2.get_process_wall_time_ns() / 1_000_000, + ); + assert_eq!( + time_detail.get_wait_wall_time_ms(), + time_detail_v2.get_wait_wall_time_ns() / 1_000_000, + ); + assert_eq!( + time_detail.get_kv_read_wall_time_ms(), + time_detail_v2.get_kv_read_wall_time_ns() / 1_000_000, ); } @@ -1618,7 +1655,7 @@ mod tests { let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. 
- thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); + thread::sleep(SNAPSHOT_DURATION); // Request 2: Unary, error response. let handler_builder = Box::new(|_, _: &_| { @@ -1631,7 +1668,7 @@ mod tests { copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); + thread::sleep(SNAPSHOT_DURATION); // Response 1 // @@ -1644,16 +1681,20 @@ mod tests { let resp = &rx.recv().unwrap()[0]; assert!(resp.get_other_error().is_empty()); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL + PAYLOAD_LARGE + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL + PAYLOAD_LARGE + HANDLE_ERROR + COARSE_ERROR ); // Response 2 @@ -1667,21 +1708,25 @@ mod tests { let resp = &rx.recv().unwrap()[0]; assert!(!resp.get_other_error().is_empty()); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL + PAYLOAD_LARGE + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL + PAYLOAD_LARGE + 
HANDLE_ERROR + COARSE_ERROR ); } { - let mut wait_time: u64 = 0; + let mut wait_time = Duration::default(); // Request 1: Unary, success response. let handler_builder = Box::new(|_, _: &_| { @@ -1695,7 +1740,7 @@ mod tests { let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); + thread::sleep(SNAPSHOT_DURATION); // Request 2: Stream. let handler_builder = Box::new(|_, _: &_| { @@ -1726,92 +1771,116 @@ mod tests { let resp = &rx.recv().unwrap()[0]; assert!(resp.get_other_error().is_empty()); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE + HANDLE_ERROR + COARSE_ERROR ); assert_ge!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time.saturating_sub(HANDLE_ERROR + COARSE_ERROR) ); assert_lt!( - resp.get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp.get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time + HANDLE_ERROR + COARSE_ERROR ); - wait_time += PAYLOAD_LARGE - SNAPSHOT_DURATION_MS; + wait_time += PAYLOAD_LARGE - SNAPSHOT_DURATION; 
// Response 2 let resp = &rx.recv().unwrap(); assert_eq!(resp.len(), 2); assert!(resp[0].get_other_error().is_empty()); assert_ge!( - resp[0] - .get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp[0] + .get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp[0] - .get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_SMALL + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp[0] + .get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_SMALL + HANDLE_ERROR + COARSE_ERROR ); assert_ge!( - resp[0] - .get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) + Duration::from_nanos( + resp[0] + .get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time.saturating_sub(HANDLE_ERROR + COARSE_ERROR) ); assert_lt!( - resp[0] - .get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp[0] + .get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time + HANDLE_ERROR + COARSE_ERROR ); assert!(!resp[1].get_other_error().is_empty()); assert_ge!( - resp[1] - .get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE.saturating_sub(COARSE_ERROR_MS) + Duration::from_nanos( + resp[1] + .get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), + PAYLOAD_LARGE.saturating_sub(COARSE_ERROR) ); assert_lt!( - resp[1] - .get_exec_details() - .get_time_detail() - .get_process_wall_time_ms(), - PAYLOAD_LARGE + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp[1] + .get_exec_details_v2() + .get_time_detail_v2() + .get_process_wall_time_ns() + ), 
+ PAYLOAD_LARGE + HANDLE_ERROR + COARSE_ERROR ); assert_ge!( - resp[1] - .get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time.saturating_sub(HANDLE_ERROR_MS + COARSE_ERROR_MS) + Duration::from_nanos( + resp[1] + .get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time.saturating_sub(HANDLE_ERROR + COARSE_ERROR) ); assert_lt!( - resp[1] - .get_exec_details() - .get_time_detail() - .get_wait_wall_time_ms(), - wait_time + HANDLE_ERROR_MS + COARSE_ERROR_MS + Duration::from_nanos( + resp[1] + .get_exec_details_v2() + .get_time_detail_v2() + .get_wait_wall_time_ns() + ), + wait_time + HANDLE_ERROR + COARSE_ERROR ); } } @@ -1848,10 +1917,11 @@ mod tests { { let handler_builder = Box::new(|_, _: &_| { - Ok( - UnaryFixture::new_with_duration_yieldable(Ok(coppb::Response::default()), 1500) - .into_boxed(), + Ok(UnaryFixture::new_with_duration_yieldable( + Ok(coppb::Response::default()), + Duration::from_millis(1500), ) + .into_boxed()) }); let mut config = ReqContext::default_for_test(); diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 9c0b79ff8b8..ca726be9a43 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -67,7 +67,7 @@ pub struct Tracker { total_process_time: Duration, total_storage_stats: Statistics, slow_log_threshold: Duration, - scan_process_time_ms: u64, + scan_process_time_ns: u64, pub buckets: Option>, @@ -96,7 +96,7 @@ impl Tracker { total_suspend_time: Duration::default(), total_process_time: Duration::default(), total_storage_stats: Statistics::default(), - scan_process_time_ms: 0, + scan_process_time_ns: 0, slow_log_threshold, req_ctx, buckets: None, @@ -175,7 +175,7 @@ impl Tracker { } pub fn collect_scan_process_time(&mut self, exec_summary: ExecSummary) { - self.scan_process_time_ms = (exec_summary.time_processed_ns / 1000000) as u64; + self.scan_process_time_ns = exec_summary.time_processed_ns as u64; } /// Get current item's ExecDetail 
according to previous collected metrics. @@ -183,7 +183,7 @@ impl Tracker { /// WARN: TRY BEST NOT TO USE THIS FUNCTION. pub fn get_item_exec_details(&self) -> (kvrpcpb::ExecDetails, kvrpcpb::ExecDetailsV2) { if let TrackerState::ItemFinished(_) = self.current_stage { - self.exec_details(self.item_process_time) + self.exec_details(self.item_process_time, self.item_suspend_time) } else { unreachable!() } @@ -194,27 +194,39 @@ impl Tracker { pub fn get_exec_details(&self) -> (kvrpcpb::ExecDetails, kvrpcpb::ExecDetailsV2) { if let TrackerState::ItemFinished(_) = self.current_stage { // TODO: Separate process time and suspend time - self.exec_details(self.total_process_time + self.total_suspend_time) + self.exec_details(self.total_process_time, self.total_suspend_time) } else { unreachable!() } } - fn exec_details(&self, measure: Duration) -> (kvrpcpb::ExecDetails, kvrpcpb::ExecDetailsV2) { + fn exec_details( + &self, + process_time: Duration, + suspend_time: Duration, + ) -> (kvrpcpb::ExecDetails, kvrpcpb::ExecDetailsV2) { // For compatibility, ExecDetails field is still filled. let mut exec_details = kvrpcpb::ExecDetails::default(); + // TimeDetail is deprecated, we only keep it for backward compatibility. 
let mut td = kvrpcpb::TimeDetail::default(); - td.set_process_wall_time_ms(time::duration_to_ms(measure)); + td.set_process_wall_time_ms(time::duration_to_ms(process_time)); td.set_wait_wall_time_ms(time::duration_to_ms(self.wait_time)); - td.set_kv_read_wall_time_ms(self.scan_process_time_ms); + td.set_kv_read_wall_time_ms(self.scan_process_time_ns / 1_000_000); exec_details.set_time_detail(td.clone()); let detail = self.total_storage_stats.scan_detail(); exec_details.set_scan_detail(detail); + let mut td_v2 = kvrpcpb::TimeDetailV2::default(); + td_v2.set_process_wall_time_ns(process_time.as_nanos() as u64); + td_v2.set_process_suspend_wall_time_ns(suspend_time.as_nanos() as u64); + td_v2.set_wait_wall_time_ns(self.wait_time.as_nanos() as u64); + td_v2.set_kv_read_wall_time_ns(self.scan_process_time_ns); + let mut exec_details_v2 = kvrpcpb::ExecDetailsV2::default(); exec_details_v2.set_time_detail(td); + exec_details_v2.set_time_detail_v2(td_v2); let mut detail_v2 = ScanDetailV2::default(); detail_v2.set_processed_versions(self.total_storage_stats.write.processed_keys as u64); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index f0d0009b8e6..6fc3a3ebd76 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1,7 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath]: TiKV gRPC APIs implementation -use std::{mem, sync::Arc}; +use std::{mem, sync::Arc, time::Duration}; use api_version::KvFormat; use fail::fail_point; @@ -27,12 +27,12 @@ use raftstore::{ Error as RaftStoreError, Result as RaftStoreResult, }; use tikv_alloc::trace::MemoryTraceGuard; -use tikv_kv::RaftExtension; +use tikv_kv::{RaftExtension, StageLatencyStats}; use tikv_util::{ future::{paired_future_callback, poll_future_notify}, mpsc::future::{unbounded, BatchReceiver, Sender, WakePolicy}, sys::memory_usage_reaches_high_water, - time::{duration_to_ms, duration_to_sec, Instant}, + time::Instant, worker::Scheduler, }; use tracker::{set_tls_tracker_token, RequestInfo, RequestType, Tracker, GLOBAL_TRACKERS}; @@ -214,6 +214,10 @@ macro_rules! set_total_time { .mut_exec_details_v2() .mut_time_detail() .set_total_rpc_wall_time_ns($duration.as_nanos() as u64); + $resp + .mut_exec_details_v2() + .mut_time_detail_v2() + .set_total_rpc_wall_time_ns($duration.as_nanos() as u64); }; } @@ -578,7 +582,7 @@ impl Tikv for Service { Ok(_) => { GRPC_MSG_HISTOGRAM_STATIC .coprocessor_stream - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(begin_instant.saturating_elapsed().as_secs_f64()); let _ = sink.close().await; } Err(e) => { @@ -795,7 +799,7 @@ impl Tikv for Service { sink.success(resp).await?; GRPC_MSG_HISTOGRAM_STATIC .split_region - .observe(duration_to_sec(begin_instant.saturating_elapsed())); + .observe(begin_instant.saturating_elapsed().as_secs_f64()); ServerResult::Ok(()) } .map_err(|e| { @@ -1220,6 +1224,9 @@ fn handle_measures_for_batch_commands(measures: &mut MeasuredBatchResponse) { exec_details .mut_time_detail() .set_total_rpc_wall_time_ns(elapsed.as_nanos() as u64); + exec_details + .mut_time_detail_v2() + .set_total_rpc_wall_time_ns(elapsed.as_nanos() as u64); } } } @@ -1262,7 +1269,7 @@ fn future_get( async move { let v = v.await; - let duration_ms = duration_to_ms(start.saturating_elapsed()); + let 
duration = start.saturating_elapsed(); let mut resp = GetResponse::default(); if let Some(err) = extract_region_error(&v) { resp.set_region_error(err); @@ -1275,10 +1282,7 @@ fn future_get( GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { tracker.write_scan_detail(scan_detail_v2); }); - let time_detail = exec_detail_v2.mut_time_detail(); - time_detail.set_kv_read_wall_time_ms(duration_ms); - time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms); - time_detail.set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms); + set_time_detail(exec_detail_v2, duration, &stats.latency_stats); match val { Some(val) => resp.set_value(val), None => resp.set_not_found(true), @@ -1292,6 +1296,29 @@ fn future_get( } } +fn set_time_detail( + exec_detail_v2: &mut ExecDetailsV2, + total_dur: Duration, + stats: &StageLatencyStats, +) { + let duration_ns = total_dur.as_nanos() as u64; + // deprecated. we will remove the `time_detail` field in future version. + { + let time_detail = exec_detail_v2.mut_time_detail(); + time_detail.set_kv_read_wall_time_ms(duration_ns / 1_000_000); + time_detail.set_wait_wall_time_ms(stats.wait_wall_time_ns / 1_000_000); + time_detail.set_process_wall_time_ms(stats.process_wall_time_ns / 1_000_000); + } + + let time_detail_v2 = exec_detail_v2.mut_time_detail_v2(); + time_detail_v2.set_kv_read_wall_time_ns(duration_ns); + time_detail_v2.set_wait_wall_time_ns(stats.wait_wall_time_ns); + time_detail_v2.set_process_wall_time_ns(stats.process_wall_time_ns); + // currently, the schedule suspend_wall_time is always 0 for get and + // batch_get. TODO: once we support aync-io, we may also count the + // schedule suspend duration here. 
+} + fn future_scan( storage: &Storage, mut req: ScanRequest, @@ -1356,7 +1383,7 @@ fn future_batch_get( async move { let v = v.await; - let duration_ms = duration_to_ms(start.saturating_elapsed()); + let duration = start.saturating_elapsed(); let mut resp = BatchGetResponse::default(); if let Some(err) = extract_region_error(&v) { resp.set_region_error(err); @@ -1370,10 +1397,7 @@ fn future_batch_get( GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { tracker.write_scan_detail(scan_detail_v2); }); - let time_detail = exec_detail_v2.mut_time_detail(); - time_detail.set_kv_read_wall_time_ms(duration_ms); - time_detail.set_wait_wall_time_ms(stats.latency_stats.wait_wall_time_ms); - time_detail.set_process_wall_time_ms(stats.latency_stats.process_wall_time_ms); + set_time_detail(exec_detail_v2, duration, &stats.latency_stats); resp.set_pairs(pairs.into()); } Err(e) => { diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 8c58274bc33..8f955f3850d 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -713,10 +713,10 @@ impl Storage { let process_wall_time = stage_finished_ts.saturating_duration_since(stage_snap_recv_ts); let latency_stats = StageLatencyStats { - schedule_wait_time_ms: duration_to_ms(schedule_wait_time), - snapshot_wait_time_ms: duration_to_ms(snapshot_wait_time), - wait_wall_time_ms: duration_to_ms(wait_wall_time), - process_wall_time_ms: duration_to_ms(process_wall_time), + schedule_wait_time_ns: schedule_wait_time.as_nanos() as u64, + snapshot_wait_time_ns: snapshot_wait_time.as_nanos() as u64, + wait_wall_time_ns: wait_wall_time.as_nanos() as u64, + process_wall_time_ns: process_wall_time.as_nanos() as u64, }; with_tls_tracker(|tracker| { tracker.metrics.read_pool_schedule_wait_nanos = @@ -1077,10 +1077,10 @@ impl Storage { schedule_wait_time.as_nanos() as u64; }); let latency_stats = StageLatencyStats { - schedule_wait_time_ms: duration_to_ms(schedule_wait_time), - snapshot_wait_time_ms: duration_to_ms(snapshot_wait_time), - 
wait_wall_time_ms: duration_to_ms(wait_wall_time), - process_wall_time_ms: duration_to_ms(process_wall_time), + schedule_wait_time_ns: duration_to_ms(schedule_wait_time), + snapshot_wait_time_ns: duration_to_ms(snapshot_wait_time), + wait_wall_time_ns: duration_to_ms(wait_wall_time), + process_wall_time_ns: duration_to_ms(process_wall_time), }; Ok(( result?, diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 102d695b2de..8ff9e2f93af 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -22,6 +22,7 @@ use raftstore::{ coprocessor::{Config as CopConfig, ConsistencyCheckMethod}, store::Config as RaftstoreConfig, }; +use resource_control::Config as ResourceControlConfig; use security::SecurityConfig; use slog::Level; use test_util::assert_eq_debug; @@ -825,6 +826,7 @@ fn test_serde_custom_tikv_config() { renew_batch_max_size: 8192, alloc_ahead_buffer: ReadableDuration::millis(3000), }; + value.resource_control = ResourceControlConfig { enabled: false }; let custom = read_file_in_project_dir("integrations/config/test-custom.toml"); let load = toml::from_str(&custom).unwrap(); diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 722bdf0c56b..ecab04350b6 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -688,3 +688,6 @@ sample-threshold = 100 byte-threshold = 31457280 split.split-balance-score = 0.25 split.split-contained-score = 0.5 + +[resource-control] +enabled = false diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index fe545d07ec1..d5f8d55e320 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -303,6 +303,7 @@ fn test_scan_detail() { assert_eq!(scan_detail.get_lock().get_total(), 1); assert!(resp.get_exec_details_v2().has_time_detail()); + 
assert!(resp.get_exec_details_v2().has_time_detail_v2()); let scan_detail_v2 = resp.get_exec_details_v2().get_scan_detail_v2(); assert_eq!(scan_detail_v2.get_total_versions(), 5); assert_eq!(scan_detail_v2.get_processed_versions(), 4); @@ -1017,6 +1018,7 @@ fn test_del_select() { assert_eq!(row_count, 5); assert!(resp.get_exec_details_v2().has_time_detail()); + assert!(resp.get_exec_details_v2().has_time_detail_v2()); let scan_detail_v2 = resp.get_exec_details_v2().get_scan_detail_v2(); assert_eq!(scan_detail_v2.get_total_versions(), 8); assert_eq!(scan_detail_v2.get_processed_versions(), 5); @@ -1722,6 +1724,7 @@ fn test_exec_details() { assert!(resp.has_exec_details_v2()); let exec_details = resp.get_exec_details_v2(); assert!(exec_details.has_time_detail()); + assert!(exec_details.has_time_detail_v2()); assert!(exec_details.has_scan_detail_v2()); } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 44d16961f7d..284a3f1cb89 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -290,6 +290,7 @@ fn test_mvcc_basic() { assert!(!get_resp.has_region_error()); assert!(!get_resp.has_error()); assert!(get_resp.get_exec_details_v2().has_time_detail()); + assert!(get_resp.get_exec_details_v2().has_time_detail_v2()); let scan_detail_v2 = get_resp.get_exec_details_v2().get_scan_detail_v2(); assert_eq!(scan_detail_v2.get_total_versions(), 1); assert_eq!(scan_detail_v2.get_processed_versions(), 1); @@ -322,6 +323,7 @@ fn test_mvcc_basic() { batch_get_req.version = batch_get_version; let batch_get_resp = client.kv_batch_get(&batch_get_req).unwrap(); assert!(batch_get_resp.get_exec_details_v2().has_time_detail()); + assert!(batch_get_resp.get_exec_details_v2().has_time_detail_v2()); let scan_detail_v2 = batch_get_resp.get_exec_details_v2().get_scan_detail_v2(); assert_eq!(scan_detail_v2.get_total_versions(), 1); assert_eq!(scan_detail_v2.get_processed_versions(), 1); @@ -2312,6 
+2314,7 @@ fn test_txn_api_version() { assert!(!get_resp.has_region_error()); assert!(!get_resp.has_error()); assert!(get_resp.get_exec_details_v2().has_time_detail()); + assert!(get_resp.get_exec_details_v2().has_time_detail_v2()); } { // Pessimistic Lock @@ -2491,10 +2494,20 @@ fn test_rpc_wall_time() { assert!( get_resp .get_exec_details_v2() - .get_time_detail() + .get_time_detail_v2() .get_total_rpc_wall_time_ns() > 0 ); + assert_eq!( + get_resp + .get_exec_details_v2() + .get_time_detail_v2() + .get_total_rpc_wall_time_ns(), + get_resp + .get_exec_details_v2() + .get_time_detail() + .get_total_rpc_wall_time_ns() + ); let (mut sender, receiver) = client.batch_commands().unwrap(); let mut batch_req = BatchCommandsRequest::default(); @@ -2525,7 +2538,7 @@ fn test_rpc_wall_time() { assert!( resp.get_get() .get_exec_details_v2() - .get_time_detail() + .get_time_detail_v2() .get_total_rpc_wall_time_ns() > 0 ); From ee57a81270f2def812336a6c5f3395c0cafa7ded Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 13 Mar 2023 12:08:37 +0800 Subject: [PATCH 579/676] raftstore-v2: fix ingest codec (#14373) ref tikv/tikv#12842 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/command/mod.rs | 2 +- .../src/operation/command/write/ingest.rs | 19 +++++++++++------ .../operation/command/write/simple_write.rs | 21 ++++++++++++++++++- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index df289a26f4a..f14c2c905a3 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -585,7 +585,7 @@ impl Apply { )?; } SimpleWrite::Ingest(ssts) => { - self.apply_ingest(ssts)?; + self.apply_ingest(log_index, ssts)?; } } } diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 
c39fc25a28b..73459740393 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -2,7 +2,7 @@ use collections::HashMap; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{data_cf_offset, KvEngine, RaftEngine}; use kvproto::import_sstpb::SstMeta; use raftstore::{ store::{check_sst_for_ingestion, metrics::PEER_WRITE_CMD_COUNTER, util}, @@ -82,10 +82,15 @@ impl Peer { impl Apply { #[inline] - pub fn apply_ingest(&mut self, ssts: Vec) -> Result<()> { + pub fn apply_ingest(&mut self, index: u64, ssts: Vec) -> Result<()> { PEER_WRITE_CMD_COUNTER.ingest_sst.inc(); let mut infos = Vec::with_capacity(ssts.len()); for sst in &ssts { + // This may not be enough as ingest sst may not trigger flush at all. + let off = data_cf_offset(sst.get_cf_name()); + if self.should_skip(off, index) { + continue; + } if let Err(e) = check_sst_for_ingestion(sst, self.region()) { error!( self.logger, @@ -104,10 +109,12 @@ impl Apply { } } } - // Unlike v1, we can't batch ssts accross regions. - self.flush(); - if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { - slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); + if !infos.is_empty() { + // Unlike v1, we can't batch ssts accross regions. 
+ self.flush(); + if let Err(e) = self.sst_importer().ingest(&infos, self.tablet()) { + slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); + } } Ok(()) } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index a2c378cb04b..5f72fa62738 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -463,7 +463,8 @@ fn decode<'a>(buf: &mut &'a [u8]) -> Option> { }; ssts.push(sst); } - *buf = left; + let read = is.pos(); + *buf = &left[read as usize..]; Some(SimpleWrite::Ingest(ssts)) } tag => panic!("corrupted data: invalid tag {}", tag), @@ -532,6 +533,24 @@ mod tests { let res = decoder.next(); assert!(res.is_none(), "{:?}", res); + + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let exp: Vec<_> = (0..10) + .map(|id| { + let mut meta = SstMeta::default(); + meta.set_region_id(id); + meta + }) + .collect(); + encoder.ingest(exp.clone()); + let bin = encoder.encode(); + let req_encoder = SimpleWriteReqEncoder::new(header, bin, 0, false); + let (bytes, _) = req_encoder.encode(); + let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + let write = decoder.next().unwrap(); + let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") }; + assert_eq!(exp, ssts); + assert_matches!(decoder.next(), None); } #[test] From 571e513d6c2089ba4ceaf50051dbf81fd221db8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 13 Mar 2023 16:22:40 +0800 Subject: [PATCH 580/676] log-backup: added intervally resolve regions (#14180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ref tikv/tikv#13638 This PR added a “two phase” flush to log backup for reducing checkpoint lag. 
Generally, we added a `MinTs` task, where resolve the regions and advance the `resolved_ts` in the checkpoint manager. then, once we are doing flush, we would make current `resolved_ts` become `checkpoint_ts`. This allows us to advance checkpoint_ts even the leader has gone. When the leader changes frequently, this can greatly reduce checkpoint lag. Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 235 +++++++++++------- components/backup-stream/src/endpoint.rs | 93 ++++++- components/backup-stream/src/metrics.rs | 6 + .../backup-stream/src/subscription_manager.rs | 22 +- components/backup-stream/tests/mod.rs | 61 ++++- src/config/mod.rs | 17 ++ tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + 8 files changed, 326 insertions(+), 110 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 47ec34d2113..50a6ac27864 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -1,10 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{cell::RefCell, collections::HashMap, sync::Arc, time::Duration}; use futures::{ channel::mpsc::{self as async_mpsc, Receiver, Sender}, - SinkExt, StreamExt, + future::BoxFuture, + FutureExt, SinkExt, StreamExt, }; use grpcio::{RpcStatus, RpcStatusCode, ServerStreamingSink, WriteFlags}; use kvproto::{ @@ -13,7 +14,7 @@ use kvproto::{ metapb::Region, }; use pd_client::PdClient; -use tikv_util::{box_err, defer, info, warn, worker::Scheduler}; +use tikv_util::{box_err, defer, info, time::Instant, warn, worker::Scheduler}; use txn_types::TimeStamp; use uuid::Uuid; @@ -22,7 +23,9 @@ use crate::{ errors::{Error, ReportableResult, Result}, future, metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, - metrics, try_send, RegionCheckpointOperation, Task, + metrics, + subscription_track::ResolveResult, + try_send, RegionCheckpointOperation, Task, }; /// A manager for maintaining the last flush ts. @@ -31,14 +34,16 @@ use crate::{ /// checkpoint then advancing the global checkpoint. 
#[derive(Default)] pub struct CheckpointManager { - items: HashMap, + checkpoint_ts: HashMap, + resolved_ts: HashMap, manager_handle: Option>, } impl std::fmt::Debug for CheckpointManager { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("CheckpointManager") - .field("items", &self.items) + .field("checkpoints", &self.checkpoint_ts) + .field("resolved-ts", &self.resolved_ts) .finish() } } @@ -60,49 +65,59 @@ impl SubscriptionManager { while let Some(msg) = self.input.next().await { match msg { SubscriptionOp::Add(sub) => { - self.subscribers.insert(Uuid::new_v4(), sub); + let uid = Uuid::new_v4(); + info!("log backup adding new subscriber"; "id" => %uid); + self.subscribers.insert(uid, sub); } SubscriptionOp::Emit(events) => { - let mut canceled = vec![]; - for (id, sub) in &mut self.subscribers { - let send_all = async { - for es in events.chunks(1024) { - let mut resp = SubscribeFlushEventResponse::new(); - resp.set_events(es.to_vec().into()); - sub.feed((resp, WriteFlags::default())).await?; - } - sub.flush().await - }; - - match send_all.await { - Err(grpcio::Error::RemoteStopped) => { - canceled.push(*id); - } - Err(err) => { - Error::from(err).report("sending subscription"); - } - _ => {} - } - } - - for c in canceled { - match self.subscribers.remove(&c) { - Some(mut sub) => { - info!("client is gone, removing subscription"; "id" => %c); - sub.close().await.report_if_err(format_args!( - "during removing subscription {}", - c - )) - } - None => { - warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %c); - } - } - } + self.emit_events(events).await; } } } } + + async fn emit_events(&mut self, events: Box<[FlushEvent]>) { + let mut canceled = vec![]; + info!("log backup sending events"; "event_len" => %events.len(), "downstream" => %self.subscribers.len()); + for (id, sub) in &mut self.subscribers { + let send_all = async { + for es in events.chunks(1024) { + let mut resp = 
SubscribeFlushEventResponse::new(); + resp.set_events(es.to_vec().into()); + sub.feed((resp, WriteFlags::default())).await?; + } + sub.flush().await + }; + + match send_all.await { + Err(grpcio::Error::RemoteStopped) => { + canceled.push(*id); + } + Err(err) => { + Error::from(err).report("sending subscription"); + } + _ => {} + } + } + + for c in canceled { + self.remove_subscription(&c).await; + } + } + + async fn remove_subscription(&mut self, id: &Uuid) { + match self.subscribers.remove(id) { + Some(mut sub) => { + info!("client is gone, removing subscription"; "id" => %id); + sub.close() + .await + .report_if_err(format_args!("during removing subscription {}", id)) + } + None => { + warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %id); + } + } + } } // Note: can we make it more generic...? @@ -154,11 +169,6 @@ impl GetCheckpointResult { } impl CheckpointManager { - /// clear the manager. - pub fn clear(&mut self) { - self.items.clear(); - } - pub fn spawn_subscription_mgr(&mut self) -> future![()] { let (tx, rx) = async_mpsc::channel(1024); let sub = SubscriptionManager { @@ -169,25 +179,67 @@ impl CheckpointManager { sub.main_loop() } - pub fn update_region_checkpoints(&mut self, region_and_checkpoint: Vec<(Region, TimeStamp)>) { - for (region, checkpoint) in ®ion_and_checkpoint { - self.do_update(region, *checkpoint); + pub fn resolve_regions(&mut self, region_and_checkpoint: Vec) { + for res in region_and_checkpoint { + self.do_update(res.region, res.checkpoint); } + } - self.notify(region_and_checkpoint.into_iter()); + pub fn flush(&mut self) { + info!("log backup checkpoint manager flushing."; "resolved_ts_len" => %self.resolved_ts.len(), "resolved_ts" => ?self.get_resolved_ts()); + self.checkpoint_ts = std::mem::take(&mut self.resolved_ts); + // Clippy doesn't know this iterator borrows `self.checkpoint_ts` :( + #[allow(clippy::needless_collect)] + let items = self + .checkpoint_ts + .values() + .cloned() + 
.map(|x| (x.region, x.checkpoint)) + .collect::>(); + self.notify(items.into_iter()); } /// update a region checkpoint in need. #[cfg(test)] pub fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { - self.do_update(region, checkpoint); - self.notify(std::iter::once((region.clone(), checkpoint))); + Self::update_ts(&mut self.checkpoint_ts, region.clone(), checkpoint) + } + + fn update_ts( + container: &mut HashMap, + region: Region, + checkpoint: TimeStamp, + ) { + let e = container.entry(region.get_id()); + let ver = region.get_region_epoch().get_version(); + // A hacky way to allow the two closures move out the region. + // It is safe given the two closures would only be called once. + let r = RefCell::new(Some(region)); + e.and_modify(|old_cp| { + let old_ver = old_cp.region.get_region_epoch().get_version(); + let checkpoint_is_newer = old_cp.checkpoint < checkpoint; + if old_ver < ver || (old_ver == ver && checkpoint_is_newer) { + *old_cp = LastFlushTsOfRegion { + checkpoint, + region: r.borrow_mut().take().expect( + "unreachable: `and_modify` and `or_insert_with` called at the same time.", + ), + }; + } + }) + .or_insert_with(|| LastFlushTsOfRegion { + checkpoint, + region: r + .borrow_mut() + .take() + .expect("unreachable: `and_modify` and `or_insert_with` called at the same time."), + }); } - pub fn add_subscriber(&mut self, sub: Subscription) -> future![Result<()>] { + pub fn add_subscriber(&mut self, sub: Subscription) -> BoxFuture<'static, Result<()>> { let mgr = self.manager_handle.as_ref().cloned(); let initial_data = self - .items + .checkpoint_ts .values() .map(|v| FlushEvent { start_key: v.region.start_key.clone(), @@ -225,6 +277,7 @@ impl CheckpointManager { })?; Ok(()) } + .boxed() } fn notify(&mut self, items: impl Iterator) { @@ -248,28 +301,13 @@ impl CheckpointManager { } } - fn do_update(&mut self, region: &Region, checkpoint: TimeStamp) { - let e = self.items.entry(region.get_id()); - e.and_modify(|old_cp| { - if 
old_cp.checkpoint < checkpoint - && old_cp.region.get_region_epoch().get_version() - <= region.get_region_epoch().get_version() - { - *old_cp = LastFlushTsOfRegion { - checkpoint, - region: region.clone(), - }; - } - }) - .or_insert_with(|| LastFlushTsOfRegion { - checkpoint, - region: region.clone(), - }); + fn do_update(&mut self, region: Region, checkpoint: TimeStamp) { + Self::update_ts(&mut self.resolved_ts, region, checkpoint) } /// get checkpoint from a region. pub fn get_from_region(&self, region: RegionIdWithVersion) -> GetCheckpointResult { - let checkpoint = self.items.get(®ion.region_id); + let checkpoint = self.checkpoint_ts.get(®ion.region_id); if checkpoint.is_none() { return GetCheckpointResult::not_found(region); } @@ -282,7 +320,11 @@ impl CheckpointManager { /// get all checkpoints stored. pub fn get_all(&self) -> Vec { - self.items.values().cloned().collect() + self.checkpoint_ts.values().cloned().collect() + } + + pub fn get_resolved_ts(&self) -> Option { + self.resolved_ts.values().map(|x| x.checkpoint).min() } } @@ -333,7 +375,7 @@ pub struct LastFlushTsOfRegion { #[async_trait::async_trait] pub trait FlushObserver: Send + 'static { /// The callback when the flush has advanced the resolver. - async fn before(&mut self, checkpoints: Vec<(Region, TimeStamp)>); + async fn before(&mut self, checkpoints: Vec); /// The callback when the flush is done. (Files are fully written to /// external storage.) 
async fn after(&mut self, task: &str, rts: u64) -> Result<()>; @@ -363,7 +405,7 @@ impl BasicFlushObserver { #[async_trait::async_trait] impl FlushObserver for BasicFlushObserver { - async fn before(&mut self, _checkpoints: Vec<(Region, TimeStamp)>) {} + async fn before(&mut self, _checkpoints: Vec) {} async fn after(&mut self, task: &str, rts: u64) -> Result<()> { if let Err(err) = self @@ -401,8 +443,9 @@ pub struct CheckpointV3FlushObserver { sched: Scheduler, meta_cli: MetadataClient, - checkpoints: Vec<(Region, TimeStamp)>, + checkpoints: Vec, global_checkpoint_cache: HashMap, + start_time: Instant, } impl CheckpointV3FlushObserver { @@ -414,6 +457,7 @@ impl CheckpointV3FlushObserver { // We almost always have only one entry. global_checkpoint_cache: HashMap::with_capacity(1), baseline, + start_time: Instant::now(), } } } @@ -443,15 +487,19 @@ where S: MetaStore + 'static, O: FlushObserver + Send, { - async fn before(&mut self, checkpoints: Vec<(Region, TimeStamp)>) { + async fn before(&mut self, checkpoints: Vec) { self.checkpoints = checkpoints; } async fn after(&mut self, task: &str, _rts: u64) -> Result<()> { - let t = Task::RegionCheckpointsOp(RegionCheckpointOperation::Update(std::mem::take( - &mut self.checkpoints, - ))); - try_send!(self.sched, t); + let resolve_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { + checkpoints: std::mem::take(&mut self.checkpoints), + start_time: self.start_time, + }); + let flush_task = Task::RegionCheckpointsOp(RegionCheckpointOperation::Flush); + try_send!(self.sched, resolve_task); + try_send!(self.sched, flush_task); + let global_checkpoint = self.get_checkpoint(task).await?; info!("getting global checkpoint from cache for updating."; "checkpoint" => ?global_checkpoint); self.baseline @@ -499,6 +547,26 @@ pub mod tests { r } + #[test] + fn test_flush() { + let mut mgr = super::CheckpointManager::default(); + mgr.do_update(region(1, 32, 8), TimeStamp::new(8)); + mgr.do_update(region(2, 34, 8), 
TimeStamp::new(15)); + mgr.do_update(region(2, 35, 8), TimeStamp::new(16)); + mgr.do_update(region(2, 35, 8), TimeStamp::new(14)); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + + mgr.flush(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 8); + let r = mgr.get_from_region(RegionIdWithVersion::new(2, 35)); + assert_matches::assert_matches!(r, GetCheckpointResult::Ok { checkpoint , .. } if checkpoint.into_inner() == 16); + mgr.flush(); + let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); + assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. }); + } + #[test] fn test_mgr() { let mut mgr = super::CheckpointManager::default(); @@ -510,6 +578,7 @@ pub mod tests { assert_matches::assert_matches!(r, GetCheckpointResult::EpochNotMatch { .. }); let r = mgr.get_from_region(RegionIdWithVersion::new(3, 44)); assert_matches::assert_matches!(r, GetCheckpointResult::NotFound { .. 
}); + mgr.update_region_checkpoint(®ion(1, 30, 8), TimeStamp::new(16)); let r = mgr.get_from_region(RegionIdWithVersion::new(1, 32)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 8); diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index a13c52c9212..49ca811285b 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -56,7 +56,7 @@ use crate::{ observer::BackupStreamObserver, router::{ApplyEvents, Router, TaskSelector}, subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, - subscription_track::SubscriptionTracer, + subscription_track::{ResolveResult, SubscriptionTracer}, try_send, utils::{self, CallbackWaitGroup, StopWatch, Work}, }; @@ -93,7 +93,6 @@ pub struct Endpoint { failover_time: Option, // We holds the config before, even it is useless for now, // however probably it would be useful in the future. - #[allow(dead_code)] config: BackupStreamConfig, checkpoint_mgr: CheckpointManager, } @@ -184,7 +183,7 @@ where pool.spawn(op_loop); let mut checkpoint_mgr = CheckpointManager::default(); pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); - Endpoint { + let ep = Endpoint { meta_client, range_router, scheduler, @@ -203,7 +202,9 @@ where failover_time: None, config, checkpoint_mgr, - } + }; + ep.pool.spawn(ep.min_ts_worker()); + ep } } @@ -763,7 +764,7 @@ where let mut resolved = get_rts.await?; let mut new_rts = resolved.global_checkpoint(); fail::fail_point!("delay_on_flush"); - flush_ob.before(resolved.take_region_checkpoints()).await; + flush_ob.before(resolved.take_resolve_result()).await; if let Some(rewritten_rts) = flush_ob.rewrite_resolved_ts(&task).await { info!("rewriting resolved ts"; "old" => %new_rts, "new" => %rewritten_rts); new_rts = rewritten_rts.min(new_rts); @@ -919,13 +920,31 @@ where } } + fn min_ts_worker(&self) -> future![()] { + let sched = self.scheduler.clone(); + let 
interval = self.config.min_ts_interval.0; + async move { + loop { + tokio::time::sleep(interval).await; + try_send!( + sched, + Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve) + ); + } + } + } + pub fn handle_region_checkpoints_op(&mut self, op: RegionCheckpointOperation) { match op { - RegionCheckpointOperation::Update(u) => { - // Let's clear all stale checkpoints first. - // Or they may slow down the global checkpoint. - self.checkpoint_mgr.clear(); - self.checkpoint_mgr.update_region_checkpoints(u); + RegionCheckpointOperation::Resolved { + checkpoints, + start_time, + } => { + self.checkpoint_mgr.resolve_regions(checkpoints); + metrics::MIN_TS_RESOLVE_DURATION.observe(start_time.saturating_elapsed_secs()); + } + RegionCheckpointOperation::Flush => { + self.checkpoint_mgr.flush(); } RegionCheckpointOperation::Get(g, cb) => { let _guard = self.pool.handle().enter(); @@ -953,6 +972,37 @@ where } }); } + RegionCheckpointOperation::PrepareMinTsForResolve => { + let min_ts = self.pool.block_on(self.prepare_min_ts()); + let start_time = Instant::now(); + // We need to reschedule the `Resolve` task to queue, because the subscription + // is asynchronous -- there may be transactions committed before + // the min_ts we prepared but haven't been observed yet. 
+ try_send!( + self.scheduler, + Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolve { + min_ts, + start_time + }) + ); + } + RegionCheckpointOperation::Resolve { min_ts, start_time } => { + let sched = self.scheduler.clone(); + try_send!( + self.scheduler, + Task::ModifyObserve(ObserveOp::ResolveRegions { + callback: Box::new(move |mut resolved| { + let t = + Task::RegionCheckpointsOp(RegionCheckpointOperation::Resolved { + checkpoints: resolved.take_resolve_result(), + start_time, + }); + try_send!(sched, t); + }), + min_ts + }) + ); + } } } @@ -997,7 +1047,16 @@ pub enum RegionSet { } pub enum RegionCheckpointOperation { - Update(Vec<(Region, TimeStamp)>), + Flush, + PrepareMinTsForResolve, + Resolve { + min_ts: TimeStamp, + start_time: Instant, + }, + Resolved { + checkpoints: Vec, + start_time: Instant, + }, Get(RegionSet, Box) + Send>), Subscribe(Subscription), } @@ -1005,9 +1064,17 @@ pub enum RegionCheckpointOperation { impl fmt::Debug for RegionCheckpointOperation { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Update(arg0) => f.debug_tuple("Update").field(arg0).finish(), + Self::Flush => f.debug_tuple("Flush").finish(), Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), + Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), + Self::Resolved { checkpoints, .. } => { + f.debug_tuple("Resolved").field(checkpoints).finish() + } + Self::PrepareMinTsForResolve => f.debug_tuple("PrepareMinTsForResolve").finish(), + Self::Resolve { min_ts, .. } => { + f.debug_struct("Resolve").field("min_ts", min_ts).finish() + } } } } @@ -1185,7 +1252,7 @@ impl Task { ObserveOp::NotifyFailToStartObserve { .. } => "modify_observe.retry", ObserveOp::ResolveRegions { .. } => "modify_observe.resolve", }, - Task::ForceFlush(_) => "force_flush", + Task::ForceFlush(..) => "force_flush", Task::FatalError(..) => "fatal_error", Task::Sync(..) 
=> "sync", Task::MarkFailover(_) => "mark_failover", diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index 0805dae5f77..d7f836833b0 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -160,4 +160,10 @@ lazy_static! { "The regions that lost leadership during resolving" ) .unwrap(); + pub static ref MIN_TS_RESOLVE_DURATION: Histogram = register_histogram!( + "tikv_log_backup_resolve_duration_sec", + "The duration of resolving.", + exponential_buckets(0.001, 2.0, 16).unwrap() + ) + .unwrap(); } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index a31a43980b5..4f75423a241 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -38,7 +38,7 @@ use crate::{ metrics, observer::BackupStreamObserver, router::{Router, TaskSelector}, - subscription_track::SubscriptionTracer, + subscription_track::{ResolveResult, SubscriptionTracer}, try_send, utils::{self, CallbackWaitGroup, Work}, Task, @@ -58,7 +58,7 @@ struct ScanCmd { /// The response of requesting resolve the new checkpoint of regions. pub struct ResolvedRegions { - items: Vec<(Region, TimeStamp)>, + items: Vec, checkpoint: TimeStamp, } @@ -67,7 +67,7 @@ impl ResolvedRegions { /// Note: Maybe we can compute the global checkpoint internal and getting /// the interface clear. However we must take the `min_ts` or we cannot /// provide valid global checkpoint if there isn't any region checkpoint. - pub fn new(checkpoint: TimeStamp, checkpoints: Vec<(Region, TimeStamp)>) -> Self { + pub fn new(checkpoint: TimeStamp, checkpoints: Vec) -> Self { Self { items: checkpoints, checkpoint, @@ -75,7 +75,16 @@ impl ResolvedRegions { } /// take the region checkpoints from the structure. 
+ #[deprecated = "please use `take_resolve_result` instead."] pub fn take_region_checkpoints(&mut self) -> Vec<(Region, TimeStamp)> { + std::mem::take(&mut self.items) + .into_iter() + .map(|x| (x.region, x.checkpoint)) + .collect() + } + + /// take the resolve result from this struct. + pub fn take_resolve_result(&mut self) -> Vec { std::mem::take(&mut self.items) } @@ -455,7 +464,7 @@ where } ObserveOp::ResolveRegions { callback, min_ts } => { let now = Instant::now(); - let timedout = self.wait(Duration::from_secs(30)).await; + let timedout = self.wait(Duration::from_secs(5)).await; if timedout { warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); @@ -470,10 +479,7 @@ where let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); info!("getting checkpoint"; "defined_by_region" => ?min_region); self.subs.warn_if_gap_too_huge(rts); - callback(ResolvedRegions::new( - rts, - cps.into_iter().map(|r| (r.region, r.checkpoint)).collect(), - )); + callback(ResolvedRegions::new(rts, cps)); } } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index b7afcd1441f..d6dfb2b2839 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -21,7 +21,7 @@ use backup_stream::{ router::Router, Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, }; -use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt, TryStreamExt}; +use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt}; use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ brpb::{CompressionType, Local, Metadata, StorageBackend}, @@ -275,7 +275,10 @@ impl Suite { /// create a subscription stream. this has simply asserted no error, because /// in theory observing flushing should not emit error. change that if /// needed. 
- fn flush_stream(&self) -> impl Stream { + fn flush_stream( + &self, + panic_while_fail: bool, + ) -> impl Stream { let streams = self .log_backup_cli .iter() @@ -288,8 +291,18 @@ impl Suite { }) .unwrap_or_else(|err| panic!("failed to subscribe on {} because {}", id, err)); let id = *id; - stream.map_ok(move |x| (id, x)).map(move |x| { - x.unwrap_or_else(move |err| panic!("failed to rec from {} because {}", id, err)) + stream.filter_map(move |x| { + futures::future::ready(match x { + Ok(x) => Some((id, x)), + Err(err) => { + if panic_while_fail { + panic!("failed to rec from {} because {}", id, err) + } else { + println!("[WARN] failed to rec from {} because {}", id, err); + None + } + } + }) }) }) .collect::>(); @@ -463,6 +476,7 @@ impl Suite { } fn force_flush_files(&self, task: &str) { + // TODO: use the callback to make the test more stable. self.run(|| Task::ForceFlush(task.to_owned())); self.sync(); } @@ -1264,7 +1278,7 @@ mod test { #[test] fn subscribe_flushing() { let mut suite = super::SuiteBuilder::new_named("sub_flush").build(); - let stream = suite.flush_stream(); + let stream = suite.flush_stream(true); for i in 1..10 { let split_key = make_split_key_at_record(1, i * 20); suite.must_split(&split_key); @@ -1306,12 +1320,47 @@ mod test { )); } + #[test] + fn resolved_follower() { + let mut suite = super::SuiteBuilder::new_named("r").build(); + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "r"); + suite.run(|| Task::RegionCheckpointsOp(RegionCheckpointOperation::PrepareMinTsForResolve)); + suite.sync(); + std::thread::sleep(Duration::from_secs(1)); + + let leader = suite.cluster.leader_of_region(1).unwrap(); + suite.must_shuffle_leader(1); + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite + .endpoints + .get(&leader.store_id) + .unwrap() + .scheduler() + .schedule(Task::ForceFlush("r".to_owned())) + .unwrap(); + suite.sync(); + std::thread::sleep(Duration::from_secs(1)); + 
run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.iter().map(|x| x.as_slice()), + )); + assert!(suite.global_checkpoint() > 256); + suite.force_flush_files("r"); + suite.wait_for_flush(); + assert!(suite.global_checkpoint() > 512); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(|x| x.as_slice()), + )); + } + #[test] fn network_partition() { let mut suite = super::SuiteBuilder::new_named("network_partition") .nodes(3) .build(); - let stream = suite.flush_stream(); + let stream = suite.flush_stream(true); suite.must_register_task(1, "network_partition"); let leader = suite.cluster.leader_of_region(1).unwrap(); let round1 = run_async_test(suite.write_records(0, 64, 1)); diff --git a/src/config/mod.rs b/src/config/mod.rs index 57c2e935d78..689e0330a2b 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2654,6 +2654,8 @@ impl Default for BackupConfig { #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct BackupStreamConfig { + #[online_config(skip)] + pub min_ts_interval: ReadableDuration, #[online_config(skip)] pub max_flush_interval: ReadableDuration, #[online_config(skip)] @@ -2681,6 +2683,20 @@ impl BackupStreamConfig { ); self.num_threads = default_cfg.num_threads; } + if self.max_flush_interval < ReadableDuration::secs(10) { + return Err(format!( + "the max_flush_interval is too small, it is {}, and should be greater than 10s.", + self.max_flush_interval + ) + .into()); + } + if self.min_ts_interval < ReadableDuration::secs(1) { + return Err(format!( + "the min_ts_interval is too small, it is {}, and should be greater than 1s.", + self.min_ts_interval + ) + .into()); + } Ok(()) } } @@ -2691,6 +2707,7 @@ impl Default for BackupStreamConfig { let total_mem = SysQuota::memory_limit_in_bytes(); let quota_size = (total_mem as f64 * 0.1).min(ReadableSize::mb(512).0 as _); Self { + min_ts_interval: ReadableDuration::secs(10), max_flush_interval: 
ReadableDuration::minutes(3), // use at most 50% of vCPU by default num_threads: (cpu_num * 0.5).clamp(2.0, 12.0) as usize, diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 8ff9e2f93af..ff6807fa6a1 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -782,6 +782,7 @@ fn test_serde_custom_tikv_config() { file_size_limit: ReadableSize::gb(5), initial_scan_pending_memory_quota: ReadableSize::kb(2), initial_scan_rate_limit: ReadableSize::mb(3), + min_ts_interval: ReadableDuration::secs(2), }; value.import = ImportConfig { num_threads: 123, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index ecab04350b6..7f5dbfa1db7 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -632,6 +632,7 @@ s3-multi-part-size = "15MB" sst-max-size = "789MB" [log-backup] +min-ts-interval = "2s" max-flush-interval = "11s" num-threads = 7 enable = true From dcd15aee8fd5f3a11b9438eac2ed3cd7935a4e04 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Tue, 14 Mar 2023 12:26:40 +0800 Subject: [PATCH 581/676] server: Record the duration of executing the snapshot and feedback to PD Server (#13410) close tikv/tikv#13409 1. collect the duration of generating and sending snapshot 2. records the the total duration between receiving the snapshot task and finish to sending all snapshot . 3. report the metrics to the pd server. 
Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Ping Yu --- components/raftstore/src/store/fsm/store.rs | 2 + .../raftstore/src/store/peer_storage.rs | 7 +- components/raftstore/src/store/snap.rs | 71 +++++++++++++++---- .../raftstore/src/store/worker/region.rs | 5 +- src/server/snap.rs | 28 ++++++-- tests/integrations/raftstore/test_snap.rs | 19 +++-- 6 files changed, 108 insertions(+), 24 deletions(-) diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 4fafc049bee..09d6db62764 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -2464,6 +2464,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER let snap_stats = self.ctx.snap_mgr.stats(); stats.set_sending_snap_count(snap_stats.sending_count as u32); stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + stats.set_snapshot_stats(snap_stats.stats.into()); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC .with_label_values(&["sending"]) .set(snap_stats.sending_count as i64); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 470cdfee998..6ac38b60dfe 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -30,7 +30,10 @@ use raft::{ Error as RaftError, GetEntriesContext, RaftState, Ready, Storage, StorageError, }; use tikv_util::{ - box_err, box_try, debug, defer, error, info, store::find_peer_by_id, time::Instant, warn, + box_err, box_try, debug, defer, error, info, + store::find_peer_by_id, + time::{Instant, UnixSecs}, + warn, worker::Scheduler, }; @@ -1060,6 +1063,7 @@ pub fn do_snapshot( last_applied_state: RaftApplyState, for_balance: bool, allow_multi_files_snapshot: bool, + start: UnixSecs, ) -> raft::Result where E: KvEngine, @@ -1117,6 +1121,7 @@ where region_state.get_region(), allow_multi_files_snapshot, for_balance, + start, 
)?; snapshot.set_data(snap_data.write_to_bytes()?.into()); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 37189d2e52b..57cdbd2a75c 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -27,6 +27,7 @@ use keys::{enc_end_key, enc_start_key}; use kvproto::{ encryptionpb::EncryptionMethod, metapb::Region, + pdpb::SnapshotStat, raft_serverpb::{RaftSnapshotData, SnapshotCfFile, SnapshotMeta}, }; use openssl::symm::{Cipher, Crypter, Mode}; @@ -35,7 +36,7 @@ use raft::eraftpb::Snapshot as RaftSnapshot; use thiserror::Error; use tikv_util::{ box_err, box_try, debug, error, info, - time::{duration_to_sec, Instant, Limiter}, + time::{duration_to_sec, Instant, Limiter, UnixSecs}, warn, HandyRwLock, }; @@ -145,7 +146,6 @@ impl SnapKey { if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { return Err(io::Error::new(ErrorKind::Other, e)); } - Ok(SnapKey::from_region_snap( snap_data.get_region().get_id(), snap, @@ -1032,6 +1032,7 @@ impl Snapshot { region: &Region, allow_multi_files_snapshot: bool, for_balance: bool, + start: UnixSecs, ) -> RaftStoreResult { let mut snap_data = RaftSnapshotData::default(); snap_data.set_region(region.clone()); @@ -1050,7 +1051,10 @@ impl Snapshot { // set snapshot meta data snap_data.set_file_size(total_size); snap_data.set_version(SNAPSHOT_VERSION); - snap_data.set_meta(self.meta_file.meta.as_ref().unwrap().clone()); + let meta = self.meta_file.meta.as_mut().unwrap(); + meta.set_start(start.into_inner()); + meta.set_generate_duration_sec(t.saturating_elapsed().as_secs()); + snap_data.set_meta(meta.clone()); SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed())); SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_count as f64); @@ -1362,6 +1366,7 @@ pub enum SnapEntry { pub struct SnapStats { pub sending_count: usize, pub receiving_count: usize, + pub stats: Vec, } #[derive(Clone)] @@ -1375,6 +1380,7 @@ struct SnapManagerCore { 
encryption_key_manager: Option>, max_per_file_size: Arc, enable_multi_snapshot_files: Arc, + stats: Arc>>, } /// `SnapManagerCore` trace all current processing snapshots. @@ -1656,6 +1662,18 @@ impl SnapManager { self.core.limiter.speed_limit() } + pub fn collect_stat(&self, snap: SnapshotStat) { + debug!( + "collect snapshot stat"; + "region_id" => snap.region_id, + "total_size" => snap.get_transport_size(), + "total_duration_sec" => snap.get_total_duration_sec(), + "generate_duration_sec" => snap.get_generate_duration_sec(), + "send_duration_sec" => snap.get_generate_duration_sec(), + ); + self.core.stats.lock().unwrap().push(snap); + } + pub fn register(&self, key: SnapKey, entry: SnapEntry) { debug!( "register snapshot"; @@ -1726,9 +1744,11 @@ impl SnapManager { } } + let stats = std::mem::take(self.core.stats.lock().unwrap().as_mut()); SnapStats { sending_count: sending_cnt, receiving_count: receiving_cnt, + stats, } } @@ -1887,6 +1907,7 @@ impl SnapManagerBuilder { enable_multi_snapshot_files: Arc::new(AtomicBool::new( self.enable_multi_snapshot_files, )), + stats: Default::default(), }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), }; @@ -2269,6 +2290,7 @@ pub mod tests { encryption_key_manager: None, max_per_file_size: Arc::new(AtomicU64::new(max_per_file_size)), enable_multi_snapshot_files: Arc::new(AtomicBool::new(true)), + stats: Default::default(), } } @@ -2405,7 +2427,9 @@ pub mod tests { assert!(!s1.exists()); assert_eq!(mgr_core.get_total_snap_size().unwrap(), 0); - let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); // Ensure that this snapshot file does exist after being built. 
assert!(s1.exists()); @@ -2505,13 +2529,17 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(s2.exists()); - let _ = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); + let _ = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s2.exists()); } @@ -2654,7 +2682,9 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); corrupt_snapshot_size_in(dir.path()); @@ -2663,7 +2693,9 @@ pub mod tests { let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); - let snap_data = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); + let snap_data = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2724,7 +2756,9 @@ pub mod tests { let mut s1 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s1.exists()); - let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(s1.exists()); assert_eq!(1, corrupt_snapshot_meta_file(dir.path())); @@ -2733,7 +2767,9 @@ pub mod tests { let mut s2 = Snapshot::new_for_building(dir.path(), &key, &mgr_core).unwrap(); assert!(!s2.exists()); - let mut snap_data = s2.build(&db, &snapshot, ®ion, true, false).unwrap(); + let mut snap_data = s2 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + 
.unwrap(); assert!(s2.exists()); let dst_dir = Builder::new() @@ -2795,7 +2831,9 @@ pub mod tests { let mgr_core = create_manager_core(&path, u64::MAX); let mut s1 = Snapshot::new_for_building(&path, &key1, &mgr_core).unwrap(); let mut region = gen_test_region(1, 1, 1); - let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); let mut s = Snapshot::new_for_sending(&path, &key1, &mgr_core).unwrap(); let expected_size = s.total_size(); let mut s2 = @@ -2867,7 +2905,9 @@ pub mod tests { // Ensure the snapshot being built will not be deleted on GC. src_mgr.register(key.clone(), SnapEntry::Generating); let mut s1 = src_mgr.get_snapshot_for_building(&key).unwrap(); - let mut snap_data = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let mut snap_data = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); check_registry_around_deregister(&src_mgr, &key, &SnapEntry::Generating); @@ -2950,6 +2990,7 @@ pub mod tests { &gen_test_region(100, 1, 1), true, false, + UnixSecs::now(), ) .unwrap() }; @@ -2973,7 +3014,7 @@ pub mod tests { let region = gen_test_region(region_id, 1, 1); let mut s = snap_mgr.get_snapshot_for_building(&key).unwrap(); let _ = s - .build(&engine.kv, &snapshot, ®ion, true, false) + .build(&engine.kv, &snapshot, ®ion, true, false, UnixSecs::now()) .unwrap(); // The first snap_size is for region 100. @@ -3043,7 +3084,9 @@ pub mod tests { // correctly. 
for _ in 0..2 { let mut s1 = snap_mgr.get_snapshot_for_building(&key).unwrap(); - let _ = s1.build(&db, &snapshot, ®ion, true, false).unwrap(); + let _ = s1 + .build(&db, &snapshot, ®ion, true, false, UnixSecs::now()) + .unwrap(); assert!(snap_mgr.delete_snapshot(&key, &s1, false)); } } diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 84bc3b27084..7dc894204ec 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -26,7 +26,7 @@ use tikv_util::{ box_err, box_try, config::VersionTrack, defer, error, info, thd_name, - time::Instant, + time::{Instant, UnixSecs}, warn, worker::{Runnable, RunnableWithTimer}, }; @@ -241,6 +241,7 @@ struct SnapGenContext { engine: EK, mgr: SnapManager, router: R, + start: UnixSecs, } impl SnapGenContext @@ -269,6 +270,7 @@ where last_applied_state, for_balance, allow_multi_files_snapshot, + self.start )); // Only enable the fail point when the region id is equal to 1, which is // the id of bootstrapped region in tests. 
@@ -821,6 +823,7 @@ where engine: self.engine.clone(), mgr: self.mgr.clone(), router: self.router.clone(), + start: UnixSecs::now(), }; self.pool.spawn(async move { tikv_alloc::add_thread_memory_accessor(); diff --git a/src/server/snap.rs b/src/server/snap.rs index bae0587c505..afce0e8a2fd 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -2,7 +2,7 @@ use std::{ fmt::{self, Display, Formatter}, - io::{Read, Write}, + io::{Error as IoError, ErrorKind, Read, Write}, pin::Pin, sync::{ atomic::{AtomicUsize, Ordering}, @@ -23,6 +23,7 @@ use grpcio::{ RpcStatusCode, WriteFlags, }; use kvproto::{ + pdpb::SnapshotStat, raft_serverpb::{ Done, RaftMessage, RaftSnapshotData, SnapshotChunk, TabletSnapshotRequest, TabletSnapshotResponse, @@ -35,7 +36,7 @@ use security::SecurityManager; use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, - time::Instant, + time::{Instant, UnixSecs}, worker::Runnable, DeferContext, }; @@ -139,9 +140,16 @@ pub fn send_snap( let send_timer = SEND_SNAP_HISTOGRAM.start_coarse_timer(); - let key = { + let (key, snap_start, generate_duration_sec) = { let snap = msg.get_message().get_snapshot(); - SnapKey::from_snap(snap)? + let mut snap_data = RaftSnapshotData::default(); + if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { + return Err(Error::Io(IoError::new(ErrorKind::Other, e))); + } + let key = SnapKey::from_region_snap(snap_data.get_region().get_id(), snap); + let snap_start = snap_data.get_meta().get_start(); + let generate_duration_sec = snap_data.get_meta().get_generate_duration_sec(); + (key, snap_start, generate_duration_sec) }; mgr.register(key.clone(), SnapEntry::Sending); @@ -193,6 +201,18 @@ pub fn send_snap( Ok(_) => { fail_point!("snapshot_delete_after_send"); mgr.delete_snapshot(&key, &chunks.snap, true); + let cost = UnixSecs::now().into_inner().saturating_sub(snap_start); + // it should ignore if the duration of snapshot is less than 1s to decrease the + // grpc data size. 
+ if cost >= 1 { + let mut stat = SnapshotStat::default(); + stat.set_region_id(key.region_id); + stat.set_transport_size(total_size); + stat.set_generate_duration_sec(generate_duration_sec); + stat.set_send_duration_sec(timer.saturating_elapsed().as_secs()); + stat.set_total_duration_sec(cost); + mgr.collect_stat(stat); + } // TODO: improve it after rustc resolves the bug. // Call `info` in the closure directly will cause rustc // panic with `Cannot create local mono-item for DefId`. diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 0ca576e5e9a..fc0364c13b0 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -23,7 +23,11 @@ use security::SecurityManager; use test_raftstore::*; use test_raftstore_macro::test_case; use tikv::server::snap::send_snap; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{ + config::*, + time::{Instant, UnixSecs}, + HandyRwLock, +}; fn test_huge_snapshot(cluster: &mut Cluster, max_snapshot_file_size: u64) { cluster.cfg.rocksdb.titan.enabled = true; @@ -508,7 +512,7 @@ fn test_inspected_snapshot() { #[test] fn test_gen_during_heavy_recv() { let mut cluster = new_server_cluster(0, 3); - cluster.cfg.server.snap_io_max_bytes_per_sec = ReadableSize(5 * 1024 * 1024); + cluster.cfg.server.snap_io_max_bytes_per_sec = ReadableSize(1024 * 1024); cluster.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration(Duration::from_secs(100)); let pd_client = Arc::clone(&cluster.pd_client); @@ -554,6 +558,7 @@ fn test_gen_during_heavy_recv() { snap_apply_state, true, true, + UnixSecs::now(), ) .unwrap(); @@ -593,8 +598,14 @@ fn test_gen_during_heavy_recv() { pd_client.must_add_peer(r1, new_learner_peer(3, 3)); sleep_ms(500); must_get_equal(&cluster.get_engine(3), b"zzz-0000", b"value"); - assert_eq!(cluster.get_snap_mgr(1).stats().sending_count, 0); - assert_eq!(cluster.get_snap_mgr(2).stats().receiving_count, 0); + + // 
store 1 and store 2 must send snapshot, so stats should record the snapshot. + let send_stats = cluster.get_snap_mgr(1).stats(); + let recv_stats = cluster.get_snap_mgr(2).stats(); + assert_eq!(send_stats.sending_count, 0); + assert_eq!(recv_stats.receiving_count, 0); + assert_ne!(send_stats.stats.len(), 0); + assert_ne!(recv_stats.stats.len(), 0); drop(cluster); let _ = th.join(); } From 852af464cd48a97ec2b88c6a183a4f1ec4a84938 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 14 Mar 2023 14:10:39 +0800 Subject: [PATCH 582/676] raftstore: optimize write priority scheduling path (#14335) ref tikv/tikv#14353 Optimize write priority scheduling path including: - replace `DashMap` with `RwLock` in the resource controller - use visit pattern for consuming msg resource to avoid constructing hashmap - introduce `ParsedEntry` to avoid parsing raft command from entry data repeatedly in different places Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/batch-system/src/test_runner.rs | 10 +- .../operation/command/admin/merge/prepare.rs | 3 +- .../raftstore/src/store/async_io/write.rs | 19 +- .../raftstore/src/store/entry_storage.rs | 32 ++- components/raftstore/src/store/fsm/apply.rs | 159 +++--------- components/raftstore/src/store/peer.rs | 3 +- components/raftstore/src/store/util.rs | 235 +++++++++++++++++- components/resource_control/Cargo.toml | 1 + components/resource_control/src/channel.rs | 76 ++++-- components/resource_control/src/lib.rs | 4 + .../resource_control/src/resource_group.rs | 110 ++++---- 12 files changed, 443 insertions(+), 210 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5fb51b4fcdf..94c562c5c6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4702,6 +4702,7 @@ dependencies = [ "kvproto", "lazy_static", "online_config", + "parking_lot 0.12.1", "pd_client", "pin-project", "prometheus", diff --git a/components/batch-system/src/test_runner.rs b/components/batch-system/src/test_runner.rs index 
9a84a5fe545..ad9c3f54d04 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -11,9 +11,8 @@ use std::{ }, }; -use collections::HashMap; use derive_more::{Add, AddAssign}; -use resource_control::ResourceMetered; +use resource_control::{ResourceConsumeType, ResourceController, ResourceMetered}; use tikv_util::mpsc; use crate::*; @@ -29,12 +28,11 @@ pub enum Message { } impl ResourceMetered for Message { - fn get_resource_consumptions(&self) -> Option> { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { match self { Message::Resource(group_name, bytes) => { - let mut map = HashMap::default(); - map.insert(group_name.to_owned(), *bytes); - Some(map) + resource_ctl.consume(group_name.as_bytes(), ResourceConsumeType::IoBytes(*bytes)); + Some(group_name.to_owned()) } _ => None, } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index f9df2d9ea1a..378e3d2e7c8 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -245,8 +245,7 @@ impl Peer { if entry.get_data().is_empty() { continue; } - let cmd: RaftCmdRequest = - util::parse_data_at(entry.get_data(), entry.get_index(), "tag"); + let cmd: RaftCmdRequest = util::parse_data_at(entry.get_data(), entry.get_index()); if !cmd.has_admin_request() { continue; } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 9b25d7de806..d20b9d0bec0 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -26,7 +26,7 @@ use protobuf::Message; use raft::eraftpb::Entry; use resource_control::{ channel::{bounded, Receiver}, - ResourceController, ResourceMetered, + ResourceConsumeType, ResourceController, ResourceMetered, }; use tikv_util::{ 
box_err, @@ -283,16 +283,25 @@ where EK: KvEngine, ER: RaftEngine, { - fn get_resource_consumptions(&self) -> Option> { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { match self { WriteMsg::WriteTask(t) => { - let mut map = HashMap::default(); + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; for entry in &t.entries { let header = util::get_entry_header(entry); let group_name = header.get_resource_group_name().to_owned(); - *map.entry(group_name).or_default() += entry.compute_size() as u64; + let write_bytes = entry.compute_size() as u64; + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } } - Some(map) + Some(dominant_group) } _ => None, } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index afa13730ccf..1e2e40b2da6 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -30,7 +30,11 @@ use super::{ metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use crate::{bytes_capacity, store::ReadTask, Result}; +use crate::{ + bytes_capacity, + store::{util::ParsedEntry, ReadTask}, + Result, +}; const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; @@ -54,7 +58,7 @@ pub fn last_index(state: &RaftLocalState) -> u64 { pub struct CachedEntries { pub range: Range, // Entries and dangle size for them. `dangle` means not in entry cache. 
- entries: Arc, usize)>>, + entries: Arc, usize)>>, } impl CachedEntries { @@ -64,21 +68,24 @@ impl CachedEntries { let end = entries.last().map(|x| x.index).unwrap() + 1; let range = Range { start, end }; CachedEntries { - entries: Arc::new(Mutex::new((entries, 0))), + entries: Arc::new(Mutex::new(( + entries.into_iter().map(|e| ParsedEntry::new(e)).collect(), + 0, + ))), range, } } - pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { - let entries = self.entries.lock().unwrap(); - for entry in &entries.0 { + pub fn iter_entries_mut(&self, mut f: impl FnMut(&mut ParsedEntry)) { + let mut entries = self.entries.lock().unwrap(); + for entry in &mut entries.0 { f(entry); } } /// Take cached entries and dangle size for them. `dangle` means not in /// entry cache. - pub fn take_entries(&self) -> (Vec, usize) { + pub fn take_entries(&self) -> (Vec, usize) { mem::take(&mut *self.entries.lock().unwrap()) } } @@ -325,8 +332,8 @@ impl EntryCache { let dangle_size = { let mut guard = entries.entries.lock().unwrap(); - let last_idx = guard.0.last().map(|e| e.index).unwrap(); - let cache_front = match self.cache.front().map(|e| e.index) { + let last_idx = guard.0.last().map(|e| e.get_index()).unwrap(); + let cache_front = match self.cache.front().map(|e| e.get_index()) { Some(i) => i, None => u64::MAX, }; @@ -334,7 +341,10 @@ impl EntryCache { let dangle_range = if last_idx < cache_front { // All entries are not in entry cache. 0..guard.0.len() - } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { + } else if let Ok(i) = guard + .0 + .binary_search_by(|e| e.get_index().cmp(&cache_front)) + { // Some entries are in entry cache. 
0..i } else { @@ -344,7 +354,7 @@ impl EntryCache { let mut size = 0; for e in &guard.0[dangle_range] { - size += bytes_capacity(&e.data) + bytes_capacity(&e.context); + size += e.bytes_capacity(); } guard.1 = size; size diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index b9f737158fc..181ff207c0b 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -9,7 +9,6 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering}, collections::VecDeque, fmt::{self, Debug, Formatter}, - io::BufRead, mem, ops::{Deref, DerefMut, Range as StdRange}, sync::{ @@ -46,12 +45,8 @@ use kvproto::{ }; use pd_client::{BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; -use protobuf::{wire_format::WireType, CodedInputStream, Message}; -use raft::eraftpb::{ - ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, -}; -use raft_proto::ConfChangeI; -use resource_control::{ResourceController, ResourceMetered}; +use raft::eraftpb::{ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot}; +use resource_control::{ResourceConsumeType, ResourceController, ResourceMetered}; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -93,6 +88,7 @@ use crate::{ util::{ self, admin_cmd_epoch_lookup, check_flashback_state, check_req_region_epoch, compare_region_epoch, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, + ParsedEntry, }, Config, RegionSnapshot, RegionTask, WriteCallback, }, @@ -852,43 +848,6 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { false } -fn can_witness_skip(entry: &Entry) -> bool { - // need to handle ConfChange entry type - if entry.get_entry_type() != EntryType::EntryNormal { - return false; - } - - // HACK: check admin request field in serialized data from `RaftCmdRequest` - // without deserializing all. 
It's done by checking the existence of the - // field number of `admin_request`. - // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in - // `raft_cmdpb.rs` for reference. - let mut is = CodedInputStream::from_bytes(entry.get_data()); - if is.eof().unwrap() { - return true; - } - let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); - // Header field is of number 1 - if field_number == 1 { - if wire_type != WireType::WireTypeLengthDelimited { - panic!("unexpected wire type"); - } - let len = is.read_raw_varint32().unwrap(); - // skip parsing the content of `Header` - is.consume(len as usize); - // read next field number - (field_number, _) = is.read_tag_unpack().unwrap(); - } - - // `Requests` field is of number 2 and `AdminRequest` field is of number 3. - // - If the next field is 2, there must be no admin request as in one - // `RaftCmdRequest`, either requests or admin_request is filled. - // - If the next field is 3, it's exactly an admin request. - // - If the next field is others, neither requests nor admin_request is filled, - // so there is no admin request. - field_number != 3 -} - /// A struct that stores the state related to Merge. /// /// When executing a `CommitMerge`, the source peer may have not applied @@ -911,7 +870,7 @@ where { /// All of the entries that need to continue to be applied after /// the source peer has applied its logs. - pending_entries: Vec, + pending_entries: Vec, /// All of messages that need to continue to be handled after /// the source peer has applied its logs and pending entries /// are all handled. @@ -1091,7 +1050,7 @@ where fn handle_raft_committed_entries( &mut self, apply_ctx: &mut ApplyContext, - mut committed_entries_drainer: Drain<'_, Entry>, + mut committed_entries_drainer: Drain<'_, ParsedEntry>, ) { if committed_entries_drainer.len() == 0 { return; @@ -1102,7 +1061,7 @@ where // must re-propose these commands again. 
apply_ctx.committed_count += committed_entries_drainer.len(); let mut results = VecDeque::new(); - while let Some(entry) = committed_entries_drainer.next() { + while let Some(mut entry) = committed_entries_drainer.next() { if self.pending_remove { // This peer is about to be destroyed, skip everything. break; @@ -1124,9 +1083,9 @@ where // running on data written by new version tikv), but PD will reject old version // tikv join the cluster, so this should not happen. let res = match entry.get_entry_type() { - EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &entry), + EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &mut entry), EntryType::EntryConfChange | EntryType::EntryConfChangeV2 => { - self.handle_raft_entry_conf_change(apply_ctx, &entry) + self.handle_raft_entry_conf_change(apply_ctx, &mut entry) } }; @@ -1196,7 +1155,7 @@ where fn handle_raft_entry_normal( &mut self, apply_ctx: &mut ApplyContext, - entry: &Entry, + entry: &mut ParsedEntry, ) -> ApplyResult { fail_point!( "yield_apply_first_region", @@ -1206,11 +1165,10 @@ where let index = entry.get_index(); let term = entry.get_term(); - let data = entry.get_data(); - if !data.is_empty() { - if !self.peer.is_witness || !can_witness_skip(entry) { - let cmd = util::parse_data_at(data, index, &self.tag); + if !entry.is_empty() { + if !self.peer.is_witness || !entry.can_witness_skip() { + let cmd = entry.take_cmd(); if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { self.priority = Priority::Low; } @@ -1269,7 +1227,7 @@ where fn handle_raft_entry_conf_change( &mut self, apply_ctx: &mut ApplyContext, - entry: &Entry, + entry: &mut ParsedEntry, ) -> ApplyResult { // Although conf change can't yield in normal case, it is convenient to // simulate yield before applying a conf change log. 
@@ -1277,16 +1235,7 @@ where ApplyResult::Yield }); let (index, term) = (entry.get_index(), entry.get_term()); - let conf_change: ConfChangeV2 = match entry.get_entry_type() { - EntryType::EntryConfChange => { - let conf_change: ConfChange = - util::parse_data_at(entry.get_data(), index, &self.tag); - conf_change.into_v2() - } - EntryType::EntryConfChangeV2 => util::parse_data_at(entry.get_data(), index, &self.tag), - _ => unreachable!(), - }; - let cmd = util::parse_data_at(conf_change.get_context(), index, &self.tag); + let (conf_change, cmd) = entry.take_conf_change(); match self.process_raft_cmd(apply_ctx, index, term, cmd) { ApplyResult::None => { // If failed, tell Raft that the `ConfChange` was aborted. @@ -3726,19 +3675,29 @@ where } impl ResourceMetered for Msg { - fn get_resource_consumptions(&self) -> Option> { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { match self { Msg::Apply { apply, .. } => { - let mut map = HashMap::default(); + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; for cached_entries in &apply.entries { - cached_entries.iter_entries(|entry| { - // TODO: maybe use a more efficient way to get the resource group name. 
- let header = util::get_entry_header(entry); - let group_name = header.get_resource_group_name().to_owned(); - *map.entry(group_name).or_default() += entry.compute_size() as u64; + cached_entries.iter_entries_mut(|entry| { + if entry.is_empty() { + return; + } + let write_bytes = entry.compute_size() as u64; + let group_name = entry.get_cmd().get_header().get_resource_group_name(); + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name.to_owned(); + max_write_bytes = write_bytes; + } }); } - Some(map) + Some(dominant_group) } _ => None, } @@ -3924,19 +3883,21 @@ where let mut dangle_size = 0; for cached_entries in apply.entries { - let (e, sz) = cached_entries.take_entries(); + let (ents, sz) = cached_entries.take_entries(); dangle_size += sz; - if e.is_empty() { + if ents.is_empty() { let rid = self.delegate.region_id(); let StdRange { start, end } = cached_entries.range; + let mut tmp_ents = Vec::new(); self.delegate .raft_engine - .fetch_entries_to(rid, start, end, None, &mut entries) + .fetch_entries_to(rid, start, end, None, &mut tmp_ents) .unwrap(); + entries.extend(tmp_ents.into_iter().map(|e| ParsedEntry::new(e))); } else if entries.is_empty() { - entries = e; + entries = ents; } else { - entries.extend(e); + entries.extend(ents); } } if dangle_size > 0 { @@ -4908,9 +4869,9 @@ mod memtrace { EK: KvEngine, { fn heap_size(&self) -> usize { - let mut size = self.pending_entries.capacity() * mem::size_of::(); + let mut size = self.pending_entries.capacity() * mem::size_of::(); for e in &self.pending_entries { - size += bytes_capacity(&e.data) + bytes_capacity(&e.context); + size += e.bytes_capacity(); } size += self.pending_msgs.capacity() * mem::size_of::>(); @@ -4967,7 +4928,6 @@ mod tests { time::*, }; - use bytes::Bytes; use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; use engine_traits::{Peekable as 
PeekableTrait, SyncMutable, WriteBatchExt}; @@ -4977,7 +4937,6 @@ mod tests { raft_cmdpb::*, }; use protobuf::Message; - use raft::eraftpb::{ConfChange, ConfChangeV2}; use sst_importer::Config as ImportConfig; use tempfile::{Builder, TempDir}; use test_sst_importer::*; @@ -5084,42 +5043,6 @@ mod tests { } } - #[test] - fn test_can_witness_skip() { - let mut entry = Entry::new(); - let mut req = RaftCmdRequest::default(); - entry.set_entry_type(EntryType::EntryNormal); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(can_witness_skip(&entry)); - - req.mut_admin_request() - .set_cmd_type(AdminCmdType::CompactLog); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(!can_witness_skip(&entry)); - - let mut req = RaftCmdRequest::default(); - let mut request = Request::default(); - request.set_cmd_type(CmdType::Put); - req.set_requests(vec![request].into()); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(can_witness_skip(&entry)); - - entry.set_entry_type(EntryType::EntryConfChange); - let conf_change = ConfChange::new(); - let data = conf_change.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(!can_witness_skip(&entry)); - - entry.set_entry_type(EntryType::EntryConfChangeV2); - let conf_change_v2 = ConfChangeV2::new(); - let data = conf_change_v2.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(!can_witness_skip(&entry)); - } - #[test] fn test_should_sync_log() { // Admin command diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8dc69a0def4..a0b28e44f07 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4211,8 +4211,7 @@ where if entry.get_data().is_empty() { continue; } - let cmd: RaftCmdRequest = - util::parse_data_at(entry.get_data(), 
entry.get_index(), &self.tag); + let cmd: RaftCmdRequest = util::parse_data_at(entry.get_data(), entry.get_index()); if !cmd.has_admin_request() { continue; } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index d48c5e78e7c..7408b540285 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -6,6 +6,7 @@ use std::{ collections::{HashMap, VecDeque}, fmt, fmt::Display, + io::BufRead, option::Option, sync::{ atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}, @@ -24,12 +25,15 @@ use kvproto::{ }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, CodedInputStream, Message}; +use protobuf::{self, wire_format::WireType, CodedInputStream, Message}; use raft::{ eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; -use raft_proto::ConfChangeI; +use raft_proto::{ + eraftpb::{ConfChange, ConfChangeV2}, + ConfChangeI, +}; use tikv_util::{ box_err, codec::number::{decode_u64, NumberEncoder}, @@ -43,7 +47,9 @@ use tokio::sync::Notify; use txn_types::WriteBatchFlags; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; -use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; +use crate::{ + bytes_capacity, coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result, +}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -745,6 +751,139 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } +// ParsedEntry wraps raft-proto `Entry` and used to avoid parsing raft command +// from entry's data repeatedly. The parsed command may be used in multiple +// places, so cache it at the first place. 
+pub struct ParsedEntry { + entry: Entry, + cmd: Option, + conf_change: Option, + parsed: bool, +} + +impl ParsedEntry { + pub fn new(entry: Entry) -> ParsedEntry { + ParsedEntry { + entry, + cmd: None, + conf_change: None, + parsed: false, + } + } + + pub fn get_entry_type(&self) -> EntryType { + self.entry.get_entry_type() + } + + pub fn get_index(&self) -> u64 { + self.entry.get_index() + } + + pub fn get_term(&self) -> u64 { + self.entry.get_term() + } + + pub fn compute_size(&self) -> u32 { + self.entry.compute_size() + } + + pub fn is_empty(&self) -> bool { + self.entry.get_data().is_empty() + } + + pub fn bytes_capacity(&self) -> usize { + bytes_capacity(&self.entry.data) + bytes_capacity(&self.entry.context) + } + + fn parse(&mut self) { + assert!(!self.is_empty()); + + let data = self.entry.get_data(); + let index = self.entry.get_index(); + // lazy parse the cmd from entry context + let conf_change = match self.entry.get_entry_type() { + EntryType::EntryConfChange => { + let conf_change: ConfChange = parse_data_at(data, index); + Some(conf_change.into_v2()) + } + EntryType::EntryConfChangeV2 => Some(parse_data_at(data, index)), + EntryType::EntryNormal => { + self.cmd = Some(parse_data_at(data, index)); + None + } + }; + if let Some(conf_change) = conf_change { + self.cmd = Some(parse_data_at(conf_change.get_context(), index)); + self.conf_change = Some(conf_change); + } + self.parsed = true; + } + + pub fn get_cmd(&mut self) -> &RaftCmdRequest { + if !self.parsed { + self.parse(); + } + self.cmd.as_ref().unwrap() + } + + pub fn take_cmd(&mut self) -> RaftCmdRequest { + if !self.parsed { + self.parse(); + } + self.parsed = false; + self.cmd.take().unwrap() + } + + pub fn take_conf_change(&mut self) -> (ConfChangeV2, RaftCmdRequest) { + if !self.parsed { + self.parse(); + } + self.parsed = false; + (self.conf_change.take().unwrap(), self.cmd.take().unwrap()) + } + + pub fn can_witness_skip(&self) -> bool { + !has_admin_request(&self.entry) + } +} + +fn 
has_admin_request(entry: &Entry) -> bool { + // need to handle ConfChange entry type + if entry.get_entry_type() != EntryType::EntryNormal { + return true; + } + + // HACK: check admin request field in serialized data from `RaftCmdRequest` + // without deserializing all. It's done by checking the existence of the + // field number of `admin_request`. + // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in + // `raft_cmdpb.rs` for reference. + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return false; + } + let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); + // Header field is of number 1 + if field_number == 1 { + if wire_type != WireType::WireTypeLengthDelimited { + panic!("unexpected wire type"); + } + let len = is.read_raw_varint32().unwrap(); + // skip parsing the content of `Header` + is.consume(len as usize); + // read next field number + (field_number, _) = is.read_tag_unpack().unwrap(); + } + + // `Requests` field is of number 2 and `AdminRequest` field is of number 3. + // - If the next field is 2, there must be no admin request as in one + // `RaftCmdRequest`, either requests or admin_request is filled. + // - If the next field is 3, it's exactly an admin request. + // - If the next field is others, neither requests nor admin_request is filled, + // so there is no admin request. + field_number == 3 +} + pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { if entry.get_entry_type() != EntryType::EntryNormal { return RaftRequestHeader::default(); @@ -770,10 +909,10 @@ pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { /// If `data` is corrupted, this function will panic. 
// TODO: make sure received entries are not corrupted #[inline] -pub fn parse_data_at(data: &[u8], index: u64, tag: &str) -> T { +pub fn parse_data_at(data: &[u8], index: u64) -> T { let mut result = T::default(); result.merge_from_bytes(data).unwrap_or_else(|e| { - panic!("{} data is corrupted at {}: {:?}", tag, index, e); + panic!("{} data is corrupted : {:?}", index, e); }); result } @@ -1717,10 +1856,11 @@ pub fn validate_split_region( mod tests { use std::thread; + use bytes::Bytes; use engine_test::kv::KvTestEngine; use kvproto::{ metapb::{self, RegionEpoch}, - raft_cmdpb::AdminRequest, + raft_cmdpb::{AdminRequest, CmdType, Request}, }; use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; @@ -1801,6 +1941,53 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } + #[test] + fn test_parsed_entry() { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("test".to_owned()); + req.set_header(header); + + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_entry_type(raft::eraftpb::EntryType::EntryNormal); + entry.set_data(req.write_to_bytes().unwrap().into()); + + let mut parsed = ParsedEntry::new(entry); + assert_eq!(parsed.get_term(), 1); + assert_eq!(parsed.get_index(), 2); + assert_eq!( + parsed.get_cmd().get_header().get_resource_group_name(), + "test" + ); + + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_entry_type(raft::eraftpb::EntryType::EntryConfChangeV2); + let mut cc = ConfChangeV2::new(); + let mut ccs = eraftpb::ConfChangeSingle::default(); + ccs.set_change_type(ConfChangeType::AddNode); + ccs.set_node_id(3); + cc.set_changes(vec![ccs].into()); + cc.set_context(req.write_to_bytes().unwrap().into()); + entry.set_data(cc.write_to_bytes().unwrap().into()); + + let mut parsed = ParsedEntry::new(entry); + let (conf_change, cmd) = 
parsed.take_conf_change(); + assert_eq!( + conf_change.get_changes()[0].get_change_type(), + ConfChangeType::AddNode + ); + assert_eq!(conf_change.get_changes()[0].get_node_id(), 3); + assert_eq!(cmd.get_header().get_resource_group_name(), "test"); + assert_eq!( + parsed.get_cmd().get_header().get_resource_group_name(), + "test" + ); + } + #[test] fn test_get_entry_header() { let mut req = RaftCmdRequest::default(); @@ -2151,6 +2338,42 @@ mod tests { check_term(&header, 10).unwrap_err(); } + #[test] + fn test_has_admin_request() { + let mut entry = Entry::new(); + let mut req = RaftCmdRequest::default(); + entry.set_entry_type(EntryType::EntryNormal); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!has_admin_request(&entry)); + + req.mut_admin_request() + .set_cmd_type(AdminCmdType::CompactLog); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(has_admin_request(&entry)); + + let mut req = RaftCmdRequest::default(); + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + req.set_requests(vec![request].into()); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!has_admin_request(&entry)); + + entry.set_entry_type(EntryType::EntryConfChange); + let conf_change = ConfChange::new(); + let data = conf_change.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(has_admin_request(&entry)); + + entry.set_entry_type(EntryType::EntryConfChangeV2); + let conf_change_v2 = ConfChangeV2::new(); + let data = conf_change_v2.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(has_admin_request(&entry)); + } + #[test] fn test_check_req_region_epoch() { let mut epoch = RegionEpoch::default(); diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 6cb7d547e6c..9a488b06d77 100644 --- 
a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -18,6 +18,7 @@ futures = { version = "0.3" } kvproto = { workspace = true } lazy_static = "1.0" online_config = { workspace = true } +parking_lot = "0.12" pd_client = { workspace = true } pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } diff --git a/components/resource_control/src/channel.rs b/components/resource_control/src/channel.rs index 55bc2ed33b9..a62b9636f83 100644 --- a/components/resource_control/src/channel.rs +++ b/components/resource_control/src/channel.rs @@ -1,17 +1,15 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - use std::{cell::RefCell, sync::Arc}; -use collections::HashMap; use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; use kvproto::kvrpcpb::CommandPri; use tikv_util::mpsc::priority_queue; -use crate::{ResourceConsumeType, ResourceController}; +use crate::ResourceController; pub trait ResourceMetered { // returns the msg consumption of each hash map - fn get_resource_consumptions(&self) -> Option> { + fn consume_resource(&self, _: &Arc) -> Option { None } } @@ -132,19 +130,7 @@ impl Sender { last_msg_group, .. 
} => { - if let Some(mut groups) = msg.get_resource_consumptions() { - let mut dominant_group = "".to_owned(); - let mut max_write_bytes = 0; - for (group_name, write_bytes) in groups.drain() { - resource_ctl.consume( - group_name.as_bytes(), - ResourceConsumeType::IoBytes(write_bytes), - ); - if write_bytes > max_write_bytes { - dominant_group = group_name; - max_write_bytes = write_bytes; - } - } + if let Some(dominant_group) = msg.consume_resource(resource_ctl) { *last_msg_group.borrow_mut() = dominant_group; } } @@ -181,3 +167,59 @@ impl Receiver { } } } + +#[cfg(test)] +mod tests { + use std::{thread, usize}; + + use test::Bencher; + + use super::*; + use crate::ResourceConsumeType; + + struct Msg(usize); + + impl ResourceMetered for Msg { + fn consume_resource(&self, resource_ctl: &Arc) -> Option { + // None + let write_bytes = self.0 as u64; + let group_name = "test".to_owned(); + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + Some(group_name) + } + } + + #[bench] + fn bench_channel(b: &mut Bencher) { + let (tx, rx) = unbounded(Some(Arc::new(ResourceController::new( + "test".to_owned(), + false, + )))); + + let t = thread::spawn(move || { + let mut n2: usize = 0; + loop { + if let Ok(Msg(n)) = rx.recv() { + n2 += n; + } else { + return n2; + } + } + }); + + let mut n1 = 0; + b.iter(|| { + n1 += 1; + let msg = Msg(1); + tx.consume_msg_resource(&msg); + tx.send(msg, 0).unwrap(); + }); + + drop(tx); + let n2 = t.join().unwrap(); + assert_eq!(n1, n2); + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 99645688cf7..b186cb8a0c7 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -1,4 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+#![feature(test)] use online_config::OnlineConfig; use serde::{Deserialize, Serialize}; @@ -11,6 +12,9 @@ pub use resource_group::{ mod future; pub use future::ControlledFuture; +#[cfg(test)] +extern crate test; + mod service; pub use service::ResourceManagerService; diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 390214bc687..cea045dbf1a 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -8,11 +8,13 @@ use std::{ time::Duration, }; +use collections::HashMap; use dashmap::{mapref::one::Ref, DashMap}; use kvproto::{ kvrpcpb::CommandPri, resource_manager::{GroupMode, ResourceGroup}, }; +use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; use tikv_util::info; use yatp::queue::priority::TaskPriorityProvider; @@ -144,7 +146,7 @@ pub struct ResourceController { // groups' factors, it can't be changed concurrently. max_ru_quota: Mutex, // record consumption of each resource group, name --> resource_group - resource_consumptions: DashMap, GroupPriorityTracker>, + resource_consumptions: RwLock, GroupPriorityTracker>>, last_min_vt: AtomicU64, } @@ -155,7 +157,7 @@ impl ResourceController { name, is_read, max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), - resource_consumptions: DashMap::new(), + resource_consumptions: RwLock::new(HashMap::default()), last_min_vt: AtomicU64::new(0), }; // add the "default" resource group @@ -196,7 +198,7 @@ impl ResourceController { }; // maybe update existed group - self.resource_consumptions.insert(name, group); + self.resource_consumptions.write().insert(name, group); } // we calculate the weight of each resource group based on the currently maximum @@ -205,9 +207,12 @@ impl ResourceController { // often, and iterate 10k entry cost less than 5ms, so the performance is // acceptable. 
fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { - self.resource_consumptions.iter_mut().for_each(|mut g| { - g.value_mut().weight = Self::calculate_factor(max_ru_quota, g.ru_quota); - }); + self.resource_consumptions + .write() + .iter_mut() + .for_each(|(_, tracker)| { + tracker.weight = Self::calculate_factor(max_ru_quota, tracker.ru_quota); + }); } fn remove_resource_group(&self, name: &[u8]) { @@ -216,18 +221,19 @@ impl ResourceController { self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); return; } - self.resource_consumptions.remove(name); + self.resource_consumptions.write().remove(name); } #[inline] - fn resource_group(&self, name: &[u8]) -> Ref<'_, Vec, GroupPriorityTracker> { - if let Some(g) = self.resource_consumptions.get(name) { - g - } else { - self.resource_consumptions - .get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()) - .unwrap() - } + fn resource_group(&self, name: &[u8]) -> MappedRwLockReadGuard<'_, GroupPriorityTracker> { + let guard = self.resource_consumptions.read(); + RwLockReadGuard::map(guard, |m| { + if let Some(g) = m.get(name) { + g + } else { + m.get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()).unwrap() + } + }) } pub fn consume(&self, name: &[u8], delta: ResourceConsumeType) { @@ -237,15 +243,18 @@ impl ResourceController { pub fn update_min_virtual_time(&self) { let mut min_vt = u64::MAX; let mut max_vt = 0; - self.resource_consumptions.iter().for_each(|g| { - let vt = g.current_vt(); - if min_vt > vt { - min_vt = vt; - } - if max_vt < vt { - max_vt = vt; - } - }); + self.resource_consumptions + .read() + .iter() + .for_each(|(_, tracker)| { + let vt = tracker.current_vt(); + if min_vt > vt { + min_vt = vt; + } + if max_vt < vt { + max_vt = vt; + } + }); // TODO: use different threshold for different resource type // needn't do update if the virtual different is less than 100ms/100KB. 
@@ -253,13 +262,16 @@ impl ResourceController { return; } - self.resource_consumptions.iter().for_each(|g| { - let vt = g.current_vt(); - if vt < max_vt { - // TODO: is increase by half is a good choice. - g.increase_vt((max_vt - vt) / 2); - } - }); + self.resource_consumptions + .read() + .iter() + .for_each(|(_, tracker)| { + let vt = tracker.current_vt(); + if vt < max_vt { + // TODO: is increase by half is a good choice. + tracker.increase_vt((max_vt - vt) / 2); + } + }); // max_vt is actually a little bigger than the current min vt, but we don't // need totally accurate here. self.last_min_vt.store(max_vt, Ordering::Relaxed); @@ -414,7 +426,7 @@ pub(crate) mod tests { assert_eq!(resource_manager.resource_groups.len(), 2); let resource_ctl = resource_manager.derive_controller("test_read".into(), true); - assert_eq!(resource_ctl.resource_consumptions.len(), 3); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 3); let group1 = resource_ctl.resource_group("test".as_bytes()); assert_eq!(group1.weight, 500); @@ -473,7 +485,7 @@ pub(crate) mod tests { let new_group = new_resource_group_ru("new_group".into(), 500); resource_manager.add_resource_group(new_group); - assert_eq!(resource_ctl.resource_consumptions.len(), 4); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 4); let group3 = resource_ctl.resource_group("new_group".as_bytes()); assert_eq!(group3.weight, 200); assert!(group3.current_vt() >= group1_vt / 2); @@ -524,22 +536,34 @@ pub(crate) mod tests { let group1 = new_resource_group_ru(format!("group{}", i), 100); resource_manager.add_resource_group(group1); } + // consume for default group + resource_ctl.consume( + b"default", + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + resource_ctl_write.consume(b"default", ResourceConsumeType::IoBytes(10000)); + assert_eq!(resource_manager.get_all_resource_groups().len(), 10); - assert_eq!(resource_ctl.resource_consumptions.len(), 11); // 10 + 1(default) - 
assert_eq!(resource_ctl_write.resource_consumptions.len(), 11); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 11); // 10 + 1(default) + assert_eq!(resource_ctl_write.resource_consumptions.read().len(), 11); resource_manager.retain(|k, _v| k.starts_with("test")); assert_eq!(resource_manager.get_all_resource_groups().len(), 5); - assert_eq!(resource_ctl.resource_consumptions.len(), 6); - assert_eq!(resource_ctl_write.resource_consumptions.len(), 6); + assert_eq!(resource_ctl.resource_consumptions.read().len(), 6); + assert_eq!(resource_ctl_write.resource_consumptions.read().len(), 6); assert!(resource_manager.get_resource_group("group1").is_none()); - assert_eq!( - resource_ctl.resource_group("group2".as_bytes()).key(), - "default".as_bytes() + // should use the virtual time of default group for non-exist group + assert_ne!( + resource_ctl + .resource_group("group2".as_bytes()) + .current_vt(), + 0 ); - assert_eq!( - resource_ctl_write.resource_group("group2".as_bytes()).key(), - "default".as_bytes() + assert_ne!( + resource_ctl_write + .resource_group("group2".as_bytes()) + .current_vt(), + 0 ); } } From 6f85355d0b115d0cf59c7a37f587c96ad0cfa232 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 14 Mar 2023 17:34:39 +0800 Subject: [PATCH 583/676] txn: add more logs for panic (#14394) ref tikv/tikv#14390 Signed-off-by: Jay Lee --- components/tikv_kv/src/lib.rs | 4 ++++ components/tikv_kv/src/raftstore_impls.rs | 6 ++++++ src/storage/mvcc/reader/mod.rs | 8 ++++++-- src/storage/mvcc/reader/reader.rs | 8 ++++++-- src/storage/txn/actions/check_txn_status.rs | 14 ++++++++++++-- src/storage/txn/commands/check_secondary_locks.rs | 4 +++- 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 22b11e425c5..c5313620995 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -514,6 +514,10 @@ pub trait SnapshotExt { None } + fn get_region_id(&self) -> Option { + None 
+ } + fn get_txn_extra_op(&self) -> TxnExtraOp { TxnExtraOp::Noop } diff --git a/components/tikv_kv/src/raftstore_impls.rs b/components/tikv_kv/src/raftstore_impls.rs index c1384bdcd45..e89087e565f 100644 --- a/components/tikv_kv/src/raftstore_impls.rs +++ b/components/tikv_kv/src/raftstore_impls.rs @@ -40,10 +40,16 @@ impl<'a, S: Snapshot> SnapshotExt for RegionSnapshotExt<'a, S> { .unwrap_or(false) } + #[inline] fn get_term(&self) -> Option { self.snapshot.term } + #[inline] + fn get_region_id(&self) -> Option { + Some(self.snapshot.get_region().id) + } + fn get_txn_extra_op(&self) -> TxnExtraOp { self.snapshot.txn_extra_op } diff --git a/src/storage/mvcc/reader/mod.rs b/src/storage/mvcc/reader/mod.rs index 2e7d20ccf2b..949d8094e72 100644 --- a/src/storage/mvcc/reader/mod.rs +++ b/src/storage/mvcc/reader/mod.rs @@ -83,10 +83,14 @@ impl TxnCommitRecord { } } - pub fn unwrap_none(self) -> Option { + #[inline] + pub fn unwrap_none(self, region_id: u64) -> Option { match self { Self::None { overlapped_write } => overlapped_write, - _ => panic!("txn record found but not expected: {:?}", self), + _ => panic!( + "txn record found but not expected: {:?} [region_id={}]", + self, region_id + ), } } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index d8f31ba77a8..36e8816ad25 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -755,6 +755,10 @@ impl MvccReader { pub fn set_hint_min_ts(&mut self, ts_bound: Option>) { self.hint_min_ts = ts_bound; } + + pub fn snapshot_ext(&self) -> S::Ext<'_> { + self.snapshot.ext() + } } #[cfg(test)] @@ -1222,7 +1226,7 @@ pub mod tests { let overlapped_write = reader .get_txn_commit_record(&key, 55.into()) .unwrap() - .unwrap_none(); + .unwrap_none(0); assert!(overlapped_write.is_none()); // When no such record is found but a record of another txn has a write record @@ -1230,7 +1234,7 @@ pub mod tests { let overlapped_write = reader .get_txn_commit_record(&key, 
50.into()) .unwrap() - .unwrap_none() + .unwrap_none(0) .unwrap(); assert_eq!(overlapped_write.write.start_ts, 45.into()); assert_eq!(overlapped_write.write.write_type, WriteType::Put); diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 88982d6da72..a3cd3253201 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -1,5 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +use tikv_kv::SnapshotExt; // #[PerformanceCriticalPath] use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; @@ -154,8 +155,17 @@ pub fn rollback_lock( ) -> Result> { let overlapped_write = match reader.get_txn_commit_record(&key)? { TxnCommitRecord::None { overlapped_write } => overlapped_write, - TxnCommitRecord::SingleRecord { write, .. } if write.write_type != WriteType::Rollback => { - panic!("txn record found but not expected: {:?}", txn) + TxnCommitRecord::SingleRecord { write, commit_ts } + if write.write_type != WriteType::Rollback => + { + panic!( + "txn record found but not expected: {:?} {} {:?} {:?} [region_id={}]", + write, + commit_ts, + txn, + lock, + reader.reader.snapshot_ext().get_region_id().unwrap_or(0) + ) } _ => return Ok(txn.unlock_key(key, is_pessimistic_txn, TimeStamp::zero())), }; diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index a19a5d82bb6..d21d47871d4 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -58,6 +58,7 @@ impl WriteCommand for CheckSecondaryLocks { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { // It is not allowed for commit to overwrite a protected rollback. So we update // max_ts to prevent this case from happening. 
+ let region_id = self.ctx.get_region_id(); context.concurrency_manager.update_max_ts(self.start_ts); let mut txn = MvccTxn::new(self.start_ts, context.concurrency_manager); @@ -77,7 +78,8 @@ impl WriteCommand for CheckSecondaryLocks { Some(lock) if lock.ts == self.start_ts => { if lock.lock_type == LockType::Pessimistic { released_lock = txn.unlock_key(key.clone(), true, TimeStamp::zero()); - let overlapped_write = reader.get_txn_commit_record(&key)?.unwrap_none(); + let overlapped_write = + reader.get_txn_commit_record(&key)?.unwrap_none(region_id); (SecondaryLockStatus::RolledBack, true, overlapped_write) } else { (SecondaryLockStatus::Locked(lock), false, None) From c3e1cfb04046fe8ee1bc4a7ce453f273490e697a Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 14 Mar 2023 18:18:39 +0800 Subject: [PATCH 584/676] storage: Fix flow controller pending compaction bytes always be zero (#14393) close tikv/tikv#14392 Fix the issue that flow control may not work when pending compaction bytes is high. If the pending compaction bytes is 0, then 0.log2() is -INF which would cause the later average always be zero even if the pending compaction bytes is already high. 
Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- .../singleton_flow_controller.rs | 121 ++++++++---------- 1 file changed, 55 insertions(+), 66 deletions(-) diff --git a/src/storage/txn/flow_controller/singleton_flow_controller.rs b/src/storage/txn/flow_controller/singleton_flow_controller.rs index f51249facfc..abf0689f1fc 100644 --- a/src/storage/txn/flow_controller/singleton_flow_controller.rs +++ b/src/storage/txn/flow_controller/singleton_flow_controller.rs @@ -728,7 +728,11 @@ impl FlowChecker { // Because pending compaction bytes changes dramatically, take the // logarithm of pending compaction bytes to make the values fall into // a relative small range - let num = (self.engine.pending_compaction_bytes(self.region_id, &cf) as f64).log2(); + let mut num = (self.engine.pending_compaction_bytes(self.region_id, &cf) as f64).log2(); + if !num.is_finite() { + // 0.log2() == -inf, which is not expected and may lead to sum always be NaN + num = 0.0; + } let checker = self.cf_checkers.get_mut(&cf).unwrap(); checker.long_term_pending_bytes.observe(num); SCHED_PENDING_COMPACTION_BYTES_GAUGE @@ -1078,6 +1082,15 @@ pub(super) mod tests { } } + fn send_flow_info(tx: &mpsc::SyncSender, region_id: u64) { + tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) + .unwrap(); + tx.send(FlowInfo::Compaction("default".to_string(), region_id)) + .unwrap(); + tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) + .unwrap(); + } + pub fn test_flow_controller_basic_impl(flow_controller: &FlowController, region_id: u64) { // enable flow controller assert_eq!(flow_controller.enabled(), true); @@ -1130,48 +1143,34 @@ pub(super) mod tests { // exceeds the threshold on start stub.0.num_memtables.store(8, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), 
false); // on start check forbids flow control assert_eq!(flow_controller.is_unlimited(region_id), true); // once falls below the threshold, pass the on start check stub.0.num_memtables.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); // not throttle when the average of the sliding window doesn't exceeds the // threshold stub.0.num_memtables.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), true); // the average of sliding window exceeds the threshold stub.0.num_memtables.store(6, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), false); assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); // not throttle once the number of memtables falls below the threshold stub.0.num_memtables.store(1, Ordering::Relaxed); - tx.send(FlowInfo::Flush("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), true); } + #[test] fn test_flow_controller_memtable() { let stub = EngineStub::new(); @@ -1198,26 +1197,17 @@ pub(super) mod tests { // exceeds the threshold stub.0.num_l0_files.store(30, Ordering::Relaxed); - 
tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), false); // on start check forbids flow control assert_eq!(flow_controller.is_unlimited(region_id), true); // once fall below the threshold, pass the on start check stub.0.num_l0_files.store(10, Ordering::Relaxed); - tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); // exceeds the threshold, throttle now stub.0.num_l0_files.store(30, Ordering::Relaxed); - tx.send(FlowInfo::L0("default".to_string(), 0, region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert_eq!(flow_controller.should_drop(region_id), false); assert_eq!(flow_controller.is_unlimited(region_id), false); assert_ne!(flow_controller.consume(region_id, 2000), Duration::ZERO); @@ -1243,41 +1233,25 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); // on start check forbids flow control - assert!( - flow_controller.discard_ratio(region_id) < f64::EPSILON, - "discard_ratio {}", - flow_controller.discard_ratio(region_id) - ); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); // once fall below the threshold, pass the on start check stub.0 .pending_compaction_bytes .store(100 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, 
region_id); stub.0 .pending_compaction_bytes .store(1000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); // pending compaction bytes jump after unsafe destroy range @@ -1291,10 +1265,7 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 @@ -1316,19 +1287,13 @@ pub(super) mod tests { stub.0 .pending_compaction_bytes .store(1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); stub.0 .pending_compaction_bytes .store(1000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); - tx.send(FlowInfo::Compaction("default".to_string(), region_id)) - .unwrap(); - tx.send(FlowInfo::L0Intra("default".to_string(), 0, region_id)) - .unwrap(); + send_flow_info(tx, region_id); assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); } @@ -1342,6 +1307,30 @@ pub(super) mod tests { test_flow_controller_pending_compaction_bytes_impl(&flow_controller, &stub, &tx, 
0); } + #[test] + fn test_flow_controller_pending_compaction_bytes_of_zero() { + let region_id = 0; + let stub = EngineStub::new(); + let (tx, rx) = mpsc::sync_channel(0); + let flow_controller = + EngineFlowController::new(&FlowControlConfig::default(), stub.clone(), rx); + let flow_controller = FlowController::Singleton(flow_controller); + + // should handle zero pending compaction bytes properly + stub.0.pending_compaction_bytes.store(0, Ordering::Relaxed); + send_flow_info(&tx, region_id); + assert!(flow_controller.discard_ratio(region_id) < f64::EPSILON); + stub.0 + .pending_compaction_bytes + .store(10000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(&tx, region_id); + stub.0 + .pending_compaction_bytes + .store(10000000000 * 1024 * 1024 * 1024, Ordering::Relaxed); + send_flow_info(&tx, region_id); + assert!(flow_controller.discard_ratio(region_id) > f64::EPSILON); + } + #[test] fn test_smoother() { let mut smoother = Smoother::::default(); From bff6695aef5200da447eab96838a199e63bed50d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 16 Mar 2023 14:46:39 +0800 Subject: [PATCH 585/676] log-backup: Fix sub tracking (#14185) close tikv/tikv#14184 Signed-off-by: hillium --- components/backup-stream/src/endpoint.rs | 4 +- components/backup-stream/src/event_loader.rs | 2 +- components/backup-stream/src/lib.rs | 4 +- components/backup-stream/src/router.rs | 2 +- .../backup-stream/src/subscription_manager.rs | 8 +- .../backup-stream/src/subscription_track.rs | 263 +++++++++++++----- components/backup-stream/src/utils.rs | 2 +- components/backup-stream/tests/mod.rs | 34 ++- 8 files changed, 246 insertions(+), 73 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 49ca811285b..d8c0e09744f 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -56,7 +56,7 @@ use crate::{ 
observer::BackupStreamObserver, router::{ApplyEvents, Router, TaskSelector}, subscription_manager::{RegionSubscriptionManager, ResolvedRegions}, - subscription_track::{ResolveResult, SubscriptionTracer}, + subscription_track::{Ref, RefMut, ResolveResult, SubscriptionTracer}, try_send, utils::{self, CallbackWaitGroup, StopWatch, Work}, }; @@ -477,7 +477,7 @@ where } fn backup_batch(&self, batch: CmdBatch, work: Work) { - let mut sw = StopWatch::new(); + let mut sw = StopWatch::by_now(); let router = self.range_router.clone(); let sched = self.scheduler.clone(); diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 13c958a499a..8b808a16cca 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -37,7 +37,7 @@ use crate::{ errors::{ContextualResultExt, Error, Result}, metrics, router::{ApplyEvent, ApplyEvents, Router}, - subscription_track::{SubscriptionTracer, TwoPhaseResolver}, + subscription_track::{Ref, RefMut, SubscriptionTracer, TwoPhaseResolver}, try_send, utils::{self, RegionPager}, Task, diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index 34dbfa33e4c..a36b42c227d 100644 --- a/components/backup-stream/src/lib.rs +++ b/components/backup-stream/src/lib.rs @@ -16,7 +16,9 @@ pub mod router; mod service; mod subscription_manager; mod subscription_track; -mod utils; +// Publish it for integration test. +// Perhaps we'd better move some of then into `tikv_util`. 
+pub mod utils; pub use checkpoint_manager::GetCheckpointResult; pub use endpoint::{Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task}; diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index ead124c103a..5b862f732a2 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -1174,7 +1174,7 @@ impl StreamTaskInfo { return Ok(None); } let begin = Instant::now_coarse(); - let mut sw = StopWatch::new(); + let mut sw = StopWatch::by_now(); // generate meta data and prepare to flush to storage let mut metadata_info = self diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 4f75423a241..6e72d66a98b 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -478,7 +478,6 @@ where // safely. let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); info!("getting checkpoint"; "defined_by_region" => ?min_region); - self.subs.warn_if_gap_too_huge(rts); callback(ResolvedRegions::new(rts, cps)); } } @@ -497,6 +496,7 @@ where .with_label_values(&["region-changed"]) .inc(); let r = async { + self.subs.add_pending_region(region); self.observe_over_with_initial_data_from_checkpoint( region, self.get_last_checkpoint_of(&for_task, region).await?, @@ -518,7 +518,7 @@ where } else { warn!( "BUG: the region {:?} is register to no task but being observed", - ®ion + utils::debug_region(region) ); } } @@ -538,6 +538,9 @@ where } Some(for_task) => { + // the extra failpoint is used to pause the thread. + // once it triggered "pause" it cannot trigger early return then. 
+ fail::fail_point!("try_start_observe0"); fail::fail_point!("try_start_observe", |_| { Err(Error::Other(box_err!("Nature is boring"))) }); @@ -550,6 +553,7 @@ where async fn start_observe(&self, region: Region) { let handle = ObserveHandle::new(); + self.subs.add_pending_region(®ion); if let Err(err) = self.try_start_observe(®ion, handle.clone()).await { warn!("failed to start observe, retrying"; "err" => %err); try_send!( diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 1f823130d3b..c13339d1c29 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -1,9 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{collections::HashSet, sync::Arc, time::Duration}; +use std::{collections::HashSet, sync::Arc}; use dashmap::{ - mapref::{entry::Entry, one::RefMut}, + mapref::{entry::Entry, one::RefMut as DashRefMut}, DashMap, }; use kvproto::metapb::Region; @@ -16,15 +16,50 @@ use crate::{debug, metrics::TRACK_REGION, utils}; /// A utility to tracing the regions being subscripted. #[derive(Clone, Default, Debug)] -pub struct SubscriptionTracer(Arc>); +pub struct SubscriptionTracer(Arc>); + +/// The state of the subscription state machine: +/// Initial state is `ABSENT`, the subscription isn't in the tracer. +/// Once it becomes the leader, it would be in `PENDING` state, where we would +/// prepare the information needed for doing initial scanning. +/// When we are able to start execute initial scanning, it would be in `RUNNING` +/// state, where it starts to handle events. +/// You may notice there are also some state transforms in the +/// [`TwoPhaseResolver`] struct, states there are sub-states of the `RUNNING` +/// stage here. +enum SubscribeState { + // NOTE: shall we add `SubscriptionHandle` here? + // (So we can check this when calling `remove_if`.) 
+ Pending(Region), + Running(ActiveSubscription), +} + +impl SubscribeState { + /// check whether the current state is pending. + fn is_pending(&self) -> bool { + matches!(self, SubscribeState::Pending(_)) + } +} -pub struct RegionSubscription { +impl std::fmt::Debug for SubscribeState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pending(arg0) => f + .debug_tuple("Pending") + .field(&utils::debug_region(arg0)) + .finish(), + Self::Running(arg0) => f.debug_tuple("Running").field(arg0).finish(), + } + } +} + +pub struct ActiveSubscription { pub meta: Region, pub(crate) handle: ObserveHandle, pub(crate) resolver: TwoPhaseResolver, } -impl std::fmt::Debug for RegionSubscription { +impl std::fmt::Debug for ActiveSubscription { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_tuple("RegionSubscription") .field(&self.meta.get_id()) @@ -33,7 +68,7 @@ impl std::fmt::Debug for RegionSubscription { } } -impl RegionSubscription { +impl ActiveSubscription { pub fn new(region: Region, handle: ObserveHandle, start_ts: Option) -> Self { let resolver = TwoPhaseResolver::new(region.get_id(), start_ts); Self { @@ -100,7 +135,7 @@ impl std::fmt::Debug for ResolveResult { } impl ResolveResult { - fn resolve(sub: &mut RegionSubscription, min_ts: TimeStamp) -> Self { + fn resolve(sub: &mut ActiveSubscription, min_ts: TimeStamp) -> Self { let ts = sub.resolver.resolve(min_ts); let ty = if ts == min_ts { CheckpointType::MinTs @@ -121,12 +156,46 @@ impl SubscriptionTracer { /// clear the current `SubscriptionTracer`. pub fn clear(&self) { self.0.retain(|_, v| { - v.stop(); - TRACK_REGION.dec(); + if let SubscribeState::Running(s) = v { + s.stop(); + TRACK_REGION.dec(); + } false }); } + /// Add a pending region into the tracker. + /// A `PENDING` region is a region we are going to start subscribe however + /// there are still tiny impure things need to do. (e.g. getting the + /// checkpoint of this region.) 
+ /// + /// This state is a placeholder for those regions: once they failed in the + /// impure operations, this would be the evidence proofing they were here. + /// + /// So we can do better when we are doing refreshing, say: + /// ```no_run + /// match task { + /// Task::RefreshObserve(r) if is_pending(r) => { /* Execute the refresh. */ } + /// Task::RefreshObserve(r) if is_absent(r) => { /* Do nothing. Maybe stale. */ } + /// } + /// ``` + /// + /// We should execute the refresh when it is pending, because the start may + /// fail and then a refresh fires. + /// We should skip when we are going to refresh absent regions because there + /// may be some stale commands. + pub fn add_pending_region(&self, region: &Region) { + let r = self + .0 + .insert(region.get_id(), SubscribeState::Pending(region.clone())); + if let Some(s) = r { + warn!( + "excepted state transform: running | pending -> pending"; + "old" => ?s, utils::slog_region(region), + ) + } + } + // Register a region as tracing. // The `start_ts` is used to tracking the progress of initial scanning. // Note: the `None` case of `start_ts` is for testing / refresh region status @@ -138,14 +207,25 @@ impl SubscriptionTracer { handle: ObserveHandle, start_ts: Option, ) { - info!("start listen stream from store"; "observer" => ?handle, "region_id" => %region.get_id()); + info!("start listen stream from store"; "observer" => ?handle); TRACK_REGION.inc(); - if let Some(mut o) = self.0.insert( - region.get_id(), - RegionSubscription::new(region.clone(), handle, start_ts), - ) { - TRACK_REGION.dec(); - o.stop(); + let e = self.0.entry(region.id); + match e { + Entry::Occupied(o) => { + let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + let (_, s) = o.replace_entry(SubscribeState::Running(sub)); + if !s.is_pending() { + // If there is another subscription already (perhaps repeated Start), + // don't add the counter. 
+ warn!("excepted state transform: running -> running"; "old" => ?s, utils::slog_region(region)); + TRACK_REGION.dec(); + } + } + Entry::Vacant(e) => { + warn!("excepted state transform: absent -> running"; utils::slog_region(region)); + let sub = ActiveSubscription::new(region.clone(), handle, start_ts); + e.insert(SubscribeState::Running(sub)); + } } } @@ -163,59 +243,54 @@ impl SubscriptionTracer { let rs = regions.into_iter().collect::>(); self.0 .iter_mut() - .filter(|s| { - let contains = rs.contains(s.key()); - if !contains { - crate::metrics::LOST_LEADER_REGION.inc(); + // Don't advance the checkpoint ts of pending region. + .filter_map(|mut s| { + let region_id = *s.key(); + match s.value_mut() { + SubscribeState::Running(sub) => { + let contains = rs.contains(®ion_id); + if !contains { + crate::metrics::LOST_LEADER_REGION.inc(); + } + contains.then(|| ResolveResult::resolve(sub, min_ts)) } - contains + SubscribeState::Pending(r) => {warn!("pending region, skip resolving"; utils::slog_region(r)); None}, + } }) - .map(|mut s| ResolveResult::resolve(s.value_mut(), min_ts)) .collect() } - #[inline(always)] - pub fn warn_if_gap_too_huge(&self, ts: TimeStamp) { - let gap = TimeStamp::physical_now() - ts.physical(); - if gap >= 10 * 60 * 1000 - // 10 mins - { - let far_resolver = self - .0 - .iter() - .min_by_key(|r| r.value().resolver.resolved_ts()); - warn!("log backup resolver ts advancing too slow"; - "far_resolver" => %{match far_resolver { - Some(r) => format!("{:?}", r.value().resolver), - None => "BUG[NoResolverButResolvedTSDoesNotAdvance]".to_owned() - }}, - "gap" => ?Duration::from_millis(gap), - ); - } - } - /// try to mark a region no longer be tracked by this observer. /// returns whether success (it failed if the region hasn't been observed /// when calling this.) 
pub fn deregister_region_if( &self, region: &Region, - if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool, + if_cond: impl FnOnce(&ActiveSubscription, &Region) -> bool, ) -> bool { let region_id = region.get_id(); let remove_result = self.0.entry(region_id); match remove_result { - Entry::Occupied(mut x) => { - if if_cond(x.get(), region) { - TRACK_REGION.dec(); - x.get_mut().stop(); - let v = x.remove(); - info!("stop listen stream from store"; "observer" => ?v, "region_id"=> %region_id); - return true; - } - false - } Entry::Vacant(_) => false, + Entry::Occupied(mut o) => match o.get_mut() { + SubscribeState::Pending(r) => { + info!("remove pending subscription"; "region_id"=> %region_id, utils::slog_region(r)); + + o.remove(); + true + } + SubscribeState::Running(s) => { + if if_cond(s, region) { + TRACK_REGION.dec(); + s.stop(); + info!("stop listen stream from store"; "observer" => ?s, "region_id"=> %region_id); + + o.remove(); + return true; + } + false + } + }, } } @@ -229,8 +304,8 @@ impl SubscriptionTracer { let mut sub = match self.get_subscription_of(new_region.get_id()) { Some(sub) => sub, None => { - warn!("backup stream observer refreshing void subscription."; utils::slog_region(new_region)); - return true; + warn!("backup stream observer refreshing pending / absent subscription."; utils::slog_region(new_region)); + return false; } }; @@ -250,11 +325,10 @@ impl SubscriptionTracer { pub fn is_observing(&self, region_id: u64) -> bool { let sub = self.0.get_mut(®ion_id); match sub { - Some(mut sub) if !sub.is_observing() => { - sub.value_mut().stop(); - false - } - Some(_) => true, + Some(mut s) => match s.value_mut() { + SubscribeState::Pending(_) => false, + SubscribeState::Running(s) => s.is_observing(), + }, None => false, } } @@ -262,8 +336,68 @@ impl SubscriptionTracer { pub fn get_subscription_of( &self, region_id: u64, - ) -> Option> { - self.0.get_mut(®ion_id) + ) -> Option + '_> { + self.0 + .get_mut(®ion_id) + .and_then(|x| 
SubscriptionRef::try_from_dash(x)) + } +} + +pub trait Ref { + type Key; + type Value; + + fn key(&self) -> &Self::Key; + fn value(&self) -> &Self::Value; +} + +pub trait RefMut: Ref { + fn value_mut(&mut self) -> &mut ::Value; +} + +impl<'a> Ref for SubscriptionRef<'a> { + type Key = u64; + type Value = ActiveSubscription; + + fn key(&self) -> &Self::Key { + DashRefMut::key(&self.0) + } + + fn value(&self) -> &Self::Value { + self.sub() + } +} + +impl<'a> RefMut for SubscriptionRef<'a> { + fn value_mut(&mut self) -> &mut ::Value { + self.sub_mut() + } +} + +struct SubscriptionRef<'a>(DashRefMut<'a, u64, SubscribeState>); + +impl<'a> SubscriptionRef<'a> { + fn try_from_dash(mut d: DashRefMut<'a, u64, SubscribeState>) -> Option { + match d.value_mut() { + SubscribeState::Pending(_) => None, + SubscribeState::Running(_) => Some(Self(d)), + } + } + + fn sub(&self) -> &ActiveSubscription { + match self.0.value() { + // Panic Safety: the constructor would prevent us from creating pending subscription + // ref. 
+ SubscribeState::Pending(_) => unreachable!(), + SubscribeState::Running(s) => s, + } + } + + fn sub_mut(&mut self) -> &mut ActiveSubscription { + match self.0.value_mut() { + SubscribeState::Pending(_) => unreachable!(), + SubscribeState::Running(s) => s, + } } } @@ -434,6 +568,7 @@ mod test { use txn_types::TimeStamp; use super::{SubscriptionTracer, TwoPhaseResolver}; + use crate::subscription_track::RefMut; #[test] fn test_two_phase_resolver() { @@ -498,6 +633,7 @@ mod test { ); subs.get_subscription_of(3) .unwrap() + .value_mut() .resolver .phase_one_done(); subs.register_region( @@ -506,8 +642,9 @@ mod test { Some(TimeStamp::new(92)), ); let mut region4_sub = subs.get_subscription_of(4).unwrap(); - region4_sub.resolver.phase_one_done(); + region4_sub.value_mut().resolver.phase_one_done(); region4_sub + .value_mut() .resolver .track_lock(TimeStamp::new(128), b"Alpi".to_vec()); subs.register_region(®ion(5, 8, 1), ObserveHandle::new(), None); diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index a5d83e50328..77c689da70d 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -144,7 +144,7 @@ pub struct StopWatch(Instant); impl StopWatch { /// Create a new stopwatch via current time. 
- pub fn new() -> Self { + pub fn by_now() -> Self { Self(Instant::now_coarse()) } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index d6dfb2b2839..db4f84924b0 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -19,7 +19,7 @@ use backup_stream::{ }, observer::BackupStreamObserver, router::Router, - Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, + utils, Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, }; use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt}; use grpcio::{ChannelBuilder, Server, ServerBuilder}; @@ -403,7 +403,7 @@ impl Suite { rx.into_iter() .map(|r| match r { GetCheckpointResult::Ok { checkpoint, region } => { - info!("getting checkpoint"; "checkpoint" => %checkpoint, "region" => ?region); + info!("getting checkpoint"; "checkpoint" => %checkpoint, utils::slog_region(®ion)); checkpoint.into_inner() } GetCheckpointResult::NotFound { .. } @@ -1320,6 +1320,36 @@ mod test { )); } + #[test] + fn failure_and_split() { + let mut suite = super::SuiteBuilder::new_named("failure_and_split") + .nodes(1) + .build(); + fail::cfg("try_start_observe0", "pause").unwrap(); + + // write data before the task starting, for testing incremental scanning. 
+ let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "failure_and_split"); + suite.sync(); + + suite.must_split(&make_split_key_at_record(1, 42)); + suite.sync(); + std::thread::sleep(Duration::from_millis(200)); + fail::cfg("try_start_observe", "2*return").unwrap(); + fail::cfg("try_start_observe0", "off").unwrap(); + + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite.force_flush_files("failure_and_split"); + suite.wait_for_flush(); + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(Vec::as_slice), + )); + let cp = suite.global_checkpoint(); + assert!(cp > 512, "it is {}", cp); + suite.cluster.shutdown(); + } + #[test] fn resolved_follower() { let mut suite = super::SuiteBuilder::new_named("r").build(); From 09cd29f3f2eafbd835896db9e1f1e4a2e03123aa Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 16 Mar 2023 15:04:40 +0800 Subject: [PATCH 586/676] raftstore-v2: add some logs and fix possible race between clean and tick (#14399) ref tikv/tikv#14386 None Signed-off-by: tabokie Co-authored-by: Ti Chi Robot --- .../operation/command/admin/compact_log.rs | 17 +++++++++++++--- .../raftstore-v2/src/operation/ready/mod.rs | 20 +++++++++++-------- .../raftstore/src/store/async_io/write.rs | 3 ++- .../raftstore/src/store/entry_storage.rs | 7 ++++++- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 1ce118a957f..ed4d22a59b4 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -289,7 +289,8 @@ impl Peer { old_tablet: EK, new_tablet_index: u64, ) { - info!(self.logger, + info!( + self.logger, "record tombstone tablet"; "prev_tablet_path" => old_tablet.path(), "new_tablet_index" => new_tablet_index @@ 
-490,11 +491,21 @@ impl Peer { // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. return None; } + assert!( + compact_index <= self.raft_group().raft.raft_log.committed, + "{}: compact_index={}, committed={}", + SlogFormat(&self.logger), + compact_index, + self.raft_group().raft.raft_log.committed, + ); // TODO: make this debug when stable. - info!(self.logger, "compact log"; + info!( + self.logger, + "compact log"; "index" => compact_index, "apply_trace" => ?self.storage().apply_trace(), - "truncated" => ?self.entry_storage().apply_state()); + "truncated" => ?self.entry_storage().apply_state() + ); Some(compact_index) } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 4c0bf9cbe88..3755d92b587 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -154,7 +154,7 @@ impl Peer { // When it's handling snapshot, it's pointless to tick as all the side // affects have to wait till snapshot is applied. On the other hand, ticking // will bring other corner cases like elections. 
- !self.is_handling_snapshot() && self.raft_group_mut().tick() + !self.is_handling_snapshot() && self.serving() && self.raft_group_mut().tick() } pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { @@ -932,7 +932,7 @@ impl Storage { write_task: &mut WriteTask, ) { let prev_raft_state = self.entry_storage().raft_state().clone(); - let ever_persisted = self.ever_persisted(); + let prev_ever_persisted = self.ever_persisted(); if !ready.snapshot().is_empty() { if let Err(e) = self.apply_snapshot( @@ -946,20 +946,24 @@ impl Storage { } } - let entry_storage = self.entry_storage_mut(); if !ready.entries().is_empty() { - entry_storage.append(ready.take_entries(), write_task); + assert!(self.ever_persisted(), "{}", SlogFormat(self.logger())); + self.entry_storage_mut() + .append(ready.take_entries(), write_task); } if let Some(hs) = ready.hs() { - entry_storage.raft_state_mut().set_hard_state(hs.clone()); + self.entry_storage_mut() + .raft_state_mut() + .set_hard_state(hs.clone()); } - if !ever_persisted || prev_raft_state != *entry_storage.raft_state() { + let entry_storage = self.entry_storage(); + if !prev_ever_persisted || prev_raft_state != *entry_storage.raft_state() { write_task.raft_state = Some(entry_storage.raft_state().clone()); } - // If snapshot initializes the peer, we don't need to write apply trace again. + // If snapshot initializes the peer (in `apply_snapshot`), we don't need to + // write apply trace again. 
if !self.ever_persisted() { let region_id = self.region().get_id(); - let entry_storage = self.entry_storage(); let raft_engine = entry_storage.raft_engine(); if write_task.raft_wb.is_none() { write_task.raft_wb = Some(raft_engine.log_batch(64)); diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index d20b9d0bec0..b58d2601d95 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -445,7 +445,8 @@ where .unwrap(); if let Some(raft_state) = task.raft_state.take() - && self.raft_states.insert(task.region_id, raft_state).is_none() { + && self.raft_states.insert(task.region_id, raft_state).is_none() + { self.state_size += std::mem::size_of::(); } self.extra_batch_write.merge(&mut task.extra_write); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 1e2e40b2da6..4d3f487a499 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -970,7 +970,12 @@ impl EntryStorage { .raft_engine .get_entry(self.region_id, idx) .unwrap() - .unwrap() + .unwrap_or_else(|| { + panic!( + "region_id={}, peer_id={}, idx={idx}", + self.region_id, self.peer_id + ) + }) .get_term()) } } From 21ef364077bdefbd306a26019879c75a3687a27b Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Thu, 16 Mar 2023 18:36:40 +0800 Subject: [PATCH 587/676] raftstore-v2: filter read index msg when sending from self (#14396) close tikv/tikv#14388 filter read index msg when sending from self Signed-off-by: Spade A --- .../raftstore-v2/src/operation/ready/mod.rs | 21 ++- components/test_raftstore-v2/src/cluster.rs | 136 +++++++++++------- components/test_raftstore-v2/src/lib.rs | 3 + components/test_raftstore-v2/src/node.rs | 20 +-- components/test_raftstore-v2/src/server.rs | 19 ++- .../src/transport_simulate.rs | 36 ++--- 
components/test_raftstore-v2/src/util.rs | 22 ++- .../raftstore/test_replica_read.rs | 80 ++++++++++- 8 files changed, 236 insertions(+), 101 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3755d92b587..3591a17d989 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -255,11 +255,24 @@ impl Peer { let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) - } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { - error!(self.logger, "raft step error"; "err" => ?e); } else { - let committed_index = self.raft_group().raft.raft_log.committed; - self.report_commit_log_duration(ctx, pre_committed_index, committed_index); + // This can be a message that sent when it's still a follower. Nevertheleast, + // it's meaningless to continue to handle the request as callbacks are cleared. 
+ if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + && self.is_leader() + && (msg.get_message().get_from() == raft::INVALID_ID + || msg.get_message().get_from() == self.peer_id()) + { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } + + if let Err(e) = self.raft_group_mut().step(msg.take_message()) { + error!(self.logger, "raft step error"; "err" => ?e); + } else { + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); + } } self.set_has_ready(); diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 164794aca56..44ce6a69358 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -13,11 +13,11 @@ use encryption_export::DataKeyManager; use engine_rocks::{RocksDbVector, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Iterable, KvEngine, MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, - ReadOptions, SyncMutable, TabletRegistry, CF_DEFAULT, + Iterable, MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, ReadOptions, + SyncMutable, TabletRegistry, CF_DEFAULT, }; use file_system::IoRateLimiter; -use futures::{compat::Future01CompatExt, executor::block_on, select, FutureExt}; +use futures::{compat::Future01CompatExt, executor::block_on, select, Future, FutureExt}; use keys::{data_key, validate_data_key, DATA_PREFIX_KEY}; use kvproto::{ errorpb::Error as PbError, @@ -27,7 +27,9 @@ use kvproto::{ AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, Request, Response, StatusCmdType, }, - raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, + raft_serverpb::{ + PeerState, RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState, StoreIdent, + }, }; use pd_client::PdClient; use raftstore::{ @@ 
-96,71 +98,93 @@ pub trait Simulator { fn get_router(&self, node_id: u64) -> Option>; fn get_snap_dir(&self, node_id: u64) -> String; + fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; fn read(&mut self, request: RaftCmdRequest, timeout: Duration) -> Result { + let timeout_f = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + timeout) + .compat(); + futures::executor::block_on(async move { + futures::select! { + res = self.async_read(request).fuse() => res, + e = timeout_f.fuse() => { + Err(Error::Timeout(format!("request timeout for {:?}: {:?}", timeout,e))) + }, + } + }) + } + + fn async_read( + &mut self, + request: RaftCmdRequest, + ) -> impl Future> + Send { let mut req_clone = request.clone(); req_clone.clear_requests(); req_clone.mut_requests().push(new_snap_cmd()); - match self.snapshot(req_clone, timeout) { - Ok(snap) => { - let requests = request.get_requests(); - let mut response = RaftCmdResponse::default(); - let mut responses = Vec::with_capacity(requests.len()); - for req in requests { - let cmd_type = req.get_cmd_type(); - match cmd_type { - CmdType::Get => { - let mut resp = Response::default(); - let key = req.get_get().get_key(); - let cf = req.get_get().get_cf(); - let region = snap.get_region(); - - if let Err(e) = check_key_in_region(key, region) { - return Ok(cmd_resp::new_error(e)); + let snap = self.async_snapshot(req_clone); + async move { + match snap.await { + Ok(snap) => { + let requests = request.get_requests(); + let mut response = RaftCmdResponse::default(); + let mut responses = Vec::with_capacity(requests.len()); + for req in requests { + let cmd_type = req.get_cmd_type(); + match cmd_type { + CmdType::Get => { + let mut resp = Response::default(); + let key = req.get_get().get_key(); + let cf = req.get_get().get_cf(); + let region = snap.get_region(); + + if let Err(e) = check_key_in_region(key, region) { + return Ok(cmd_resp::new_error(e)); + } + + let res = if cf.is_empty() { + 
snap.get_value(key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {} with cf {}: {:?}", + snap.get_region().get_id(), + log_wrappers::Value::key(key), + cf, + e + ) + }) + } else { + snap.get_value_cf(cf, key).unwrap_or_else(|e| { + panic!( + "[region {}] failed to get {}: {:?}", + snap.get_region().get_id(), + log_wrappers::Value::key(key), + e + ) + }) + }; + if let Some(res) = res { + resp.mut_get().set_value(res.to_vec()); + } + resp.set_cmd_type(cmd_type); + responses.push(resp); } - - let res = if cf.is_empty() { - snap.get_value(key).unwrap_or_else(|e| { - panic!( - "[region {}] failed to get {} with cf {}: {:?}", - snap.get_region().get_id(), - log_wrappers::Value::key(key), - cf, - e - ) - }) - } else { - snap.get_value_cf(cf, key).unwrap_or_else(|e| { - panic!( - "[region {}] failed to get {}: {:?}", - snap.get_region().get_id(), - log_wrappers::Value::key(key), - e - ) - }) - }; - if let Some(res) = res { - resp.mut_get().set_value(res.to_vec()); - } - resp.set_cmd_type(cmd_type); - responses.push(resp); + _ => unimplemented!(), } - _ => unimplemented!(), } - } - response.set_responses(responses.into()); + response.set_responses(responses.into()); - Ok(response) + Ok(response) + } + Err(e) => Ok(e), } - Err(e) => Ok(e), } } - fn snapshot( + fn async_snapshot( &mut self, request: RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result::Snapshot>, RaftCmdResponse>; + ) -> impl Future< + Output = std::result::Result, RaftCmdResponse>, + > + Send; fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()>; @@ -666,6 +690,10 @@ impl Cluster { } } + pub fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()> { + self.sim.wl().send_raft_msg(msg) + } + pub fn call_command_on_node( &self, node_id: u64, diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs index 101658ff57b..ea7e9f6f6e9 100644 --- a/components/test_raftstore-v2/src/lib.rs +++ 
b/components/test_raftstore-v2/src/lib.rs @@ -1,4 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +#![allow(incomplete_features)] +#![feature(type_alias_impl_trait)] +#![feature(return_position_impl_trait_in_trait)] mod cluster; mod node; diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 0fde6ba42c5..5617787bb70 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -3,7 +3,6 @@ use std::{ path::Path, sync::{Arc, Mutex, RwLock}, - time::Duration, }; use collections::{HashMap, HashSet}; @@ -12,6 +11,7 @@ use encryption_export::DataKeyManager; use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; use engine_traits::{RaftEngine, RaftEngineReadOnly, TabletRegistry}; +use futures::Future; use kvproto::{ kvrpcpb::ApiVersion, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, @@ -342,14 +342,12 @@ impl Simulator for NodeCluster { Ok(node_id) } - fn snapshot( + fn async_snapshot( &mut self, request: RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result< - RegionSnapshot<::Snapshot>, - RaftCmdResponse, - > { + ) -> impl Future< + Output = std::result::Result, RaftCmdResponse>, + > + Send { let node_id = request.get_header().get_peer().get_store_id(); if !self .trans @@ -362,7 +360,7 @@ impl Simulator for NodeCluster { let mut resp = RaftCmdResponse::default(); let e: RaftError = box_err!("missing sender for store {}", node_id); resp.mut_header().set_error(e.into()); - return Err(resp); + // return async move {Err(resp)}; } let mut router = { @@ -370,7 +368,7 @@ impl Simulator for NodeCluster { guard.routers.get_mut(&node_id).unwrap().clone() }; - router.snapshot(request, timeout) + router.snapshot(request) } fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()> { @@ -433,6 +431,10 @@ impl Simulator for NodeCluster { let mut trans = self.trans.core.lock().unwrap(); 
trans.routers.get_mut(&node_id).unwrap().clear_filters(); } + + fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()> { + self.trans.send(msg) + } } pub fn new_node_cluster(id: u64, count: usize) -> Cluster { diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 1c6d956d1a8..ec8e3fe2635 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -15,7 +15,7 @@ use encryption_export::DataKeyManager; use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_test::raft::RaftTestEngine; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use futures::executor::block_on; +use futures::{executor::block_on, Future}; use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; use grpcio_health::HealthService; use kvproto::{ @@ -751,24 +751,25 @@ impl Simulator for ServerCluster { self.storages.remove(&node_id); } - fn snapshot( + fn async_snapshot( &mut self, request: kvproto::raft_cmdpb::RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result::Snapshot>, RaftCmdResponse> - { + ) -> impl Future< + Output = std::result::Result, RaftCmdResponse>, + > + Send { let node_id = request.get_header().get_peer().get_store_id(); let mut router = match self.metas.get(&node_id) { None => { let mut resp = RaftCmdResponse::default(); let e: RaftError = box_err!("missing sender for store {}", node_id); resp.mut_header().set_error(e.into()); - return Err(resp); + // return async move {Err(resp)}; + unreachable!() } Some(meta) => meta.sim_router.clone(), }; - router.snapshot(request, timeout) + router.snapshot(request) } fn async_peer_msg_on_node( @@ -796,6 +797,10 @@ impl Simulator for ServerCluster { .unwrap() .to_owned() } + + fn send_raft_msg(&mut self, _msg: RaftMessage) -> raftstore::Result<()> { + unimplemented!() + } } impl Cluster { diff --git a/components/test_raftstore-v2/src/transport_simulate.rs 
b/components/test_raftstore-v2/src/transport_simulate.rs index 9c11505d75f..7b9333aae83 100644 --- a/components/test_raftstore-v2/src/transport_simulate.rs +++ b/components/test_raftstore-v2/src/transport_simulate.rs @@ -1,12 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{Arc, RwLock}, - time::{Duration, Instant}, -}; +use std::sync::{Arc, RwLock}; use engine_traits::{KvEngine, RaftEngine}; -use futures::{compat::Future01CompatExt, FutureExt}; +use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, @@ -14,12 +11,12 @@ use kvproto::{ use raft::SnapshotStatus; use raftstore::{ router::handle_send_error, - store::{cmd_resp, RegionSnapshot, Transport}, - Error, Result, Result as RaftStoreResult, + store::{RegionSnapshot, Transport}, + Result, Result as RaftStoreResult, }; use raftstore_v2::router::{PeerMsg, RaftRouter}; use test_raftstore::{filter_send, Filter}; -use tikv_util::{timer::GLOBAL_TIMER_HANDLE, HandyRwLock}; +use tikv_util::HandyRwLock; #[derive(Clone)] pub struct SimulateTransport { @@ -71,25 +68,16 @@ pub trait SnapshotRouter { fn snapshot( &mut self, req: RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result, RaftCmdResponse>; + ) -> impl Future, RaftCmdResponse>> + Send; } impl SnapshotRouter for RaftRouter { fn snapshot( &mut self, req: RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result, RaftCmdResponse> { - let timeout_f = GLOBAL_TIMER_HANDLE.delay(Instant::now() + timeout).compat(); - futures::executor::block_on(async move { - futures::select! 
{ - res = self.snapshot(req).fuse() => res, - e = timeout_f.fuse() => { - Err(cmd_resp::new_error(Error::Timeout(format!("request timeout for {:?}: {:?}", timeout,e)))) - }, - } - }) + ) -> impl Future, RaftCmdResponse>> + Send + { + self.snapshot(req) } } @@ -97,9 +85,9 @@ impl> SnapshotRouter for SimulateTransport< fn snapshot( &mut self, req: RaftCmdRequest, - timeout: Duration, - ) -> std::result::Result, RaftCmdResponse> { - self.ch.snapshot(req, timeout) + ) -> impl Future, RaftCmdResponse>> + Send + { + self.ch.snapshot(req) } } diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index d9a0377210b..e2cc88c569c 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -7,6 +7,7 @@ use engine_rocks::{RocksEngine, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; +use futures::Future; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; use raftstore::Result; use rand::RngCore; @@ -21,7 +22,7 @@ use tikv::{ Engine, Snapshot, }, }; -use tikv_util::{config::ReadableDuration, worker::LazyWorker}; +use tikv_util::{config::ReadableDuration, worker::LazyWorker, HandyRwLock}; use crate::{bootstrap_store, cluster::Cluster, ServerCluster, Simulator}; @@ -209,3 +210,22 @@ pub fn read_on_peer( request.mut_header().set_peer(peer); cluster.read(None, request, timeout) } + +pub fn async_read_on_peer( + cluster: &mut Cluster, + peer: metapb::Peer, + region: metapb::Region, + key: &[u8], + read_quorum: bool, + replica_read: bool, +) -> impl Future> { + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(key)], + read_quorum, + ); + request.mut_header().set_peer(peer); + request.mut_header().set_replica_read(replica_read); + cluster.sim.wl().async_read(request) +} diff --git a/tests/integrations/raftstore/test_replica_read.rs 
b/tests/integrations/raftstore/test_replica_read.rs index 16fad00a59b..0359bacf436 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -12,13 +12,13 @@ use std::{ time::Duration, }; -use futures::executor::block_on; +use futures::{compat::Future01CompatExt, executor::block_on, FutureExt}; use kvproto::raft_serverpb::RaftMessage; use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::{store::ReadIndexContext, Result}; use test_raftstore::*; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{config::*, time::Instant, timer::GLOBAL_TIMER_HANDLE, HandyRwLock}; use txn_types::{Key, Lock, LockType}; use uuid::Uuid; @@ -583,3 +583,79 @@ fn test_malformed_read_index() { let resp = resp.recv_timeout(Duration::from_secs(10)).unwrap(); assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); } + +/// The case checks if a malformed request should not corrupt the leader's read +/// queue. +#[test] +fn test_malformed_read_index_v2() { + use test_raftstore_v2::*; + + let mut cluster = new_node_cluster(0, 3); + configure_for_lease_read(&mut cluster.cfg, Some(50), None); + cluster.cfg.raft_store.raft_log_gc_threshold = 12; + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); + cluster.cfg.raft_store.hibernate_regions = true; + cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region_id = cluster.run_conf_change(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + pd_client.must_add_peer(region_id, new_peer(3, 3)); + cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + for i in 1..=3 { + must_get_equal(&cluster.get_engine(i), b"k1", b"v1"); + } + + // Wait till lease expires. 
+ std::thread::sleep( + cluster + .cfg + .raft_store + .raft_store_max_leader_lease() + .to_std() + .unwrap(), + ); + let region = cluster.get_region(b"k1"); + // Send a malformed request to leader + let mut raft_msg = raft::eraftpb::Message::default(); + raft_msg.set_msg_type(MessageType::MsgReadIndex); + let rctx = ReadIndexContext { + id: Uuid::new_v4(), + request: None, + locked: None, + }; + let mut e = raft::eraftpb::Entry::default(); + e.set_data(rctx.to_bytes().into()); + raft_msg.mut_entries().push(e); + raft_msg.from = 1; + raft_msg.to = 1; + let mut message = RaftMessage::default(); + message.set_region_id(region_id); + message.set_from_peer(new_peer(1, 1)); + message.set_to_peer(new_peer(1, 1)); + message.set_region_epoch(region.get_region_epoch().clone()); + message.set_message(raft_msg); + // So the read won't be handled soon. + cluster.add_send_filter(IsolationFilterFactory::new(1)); + cluster.send_raft_msg(message).unwrap(); + // Also send a correct request. If the malformed request doesn't corrupt + // the read queue, the correct request should be responded. + let resp = async_read_on_peer(&mut cluster, new_peer(1, 1), region, b"k1", true, false); + cluster.clear_send_filters(); + + let timeout = Duration::from_secs(10); + let timeout_f = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + timeout) + .compat(); + let resp = futures::executor::block_on(async move { + futures::select! { + res = resp.fuse() => res.unwrap(), + e = timeout_f.fuse() => { + panic!("request timeout for {:?}: {:?}", timeout,e); + }, + } + }); + assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); +} From eb4ad726a34ab522f0dec6e94b20e67725a2fdc3 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 17 Mar 2023 10:24:40 +0800 Subject: [PATCH 588/676] raft-log-engine: supply `prefill-for-recycle` configuration to enable starting engine in cold state. 
(#14372) close tikv/tikv#14371 Adds a new configuration `raft-engine.prefill-for-recycle` for supporting to enable log recycling when starting TiKV in cold state. --- Cargo.lock | 4 ++-- etc/config-template.toml | 8 ++++++++ metrics/grafana/tikv_details.json | 10 +++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94c562c5c6d..4123bc6377c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4204,7 +4204,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" +source = "git+https://github.com/tikv/raft-engine.git#404e3fefaeeb4da6b7650268d500cfd3fbd29cae" dependencies = [ "byteorder", "crc32fast", @@ -4238,7 +4238,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" +source = "git+https://github.com/tikv/raft-engine.git#404e3fefaeeb4da6b7650268d500cfd3fbd29cae" dependencies = [ "clap 3.1.6", "env_logger", diff --git a/etc/config-template.toml b/etc/config-template.toml index 3930a247374..80d9bc8a4d6 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1104,6 +1104,14 @@ ## Default: false. # enable-log-recycle = false +## Whether to prepare log files for recycling when start. +## If `true`, batch empty log files will be prepared for recycling when +## starting engine. +## Only available for `enable-log-reycle` is true. +## +## Default: false +# prefill-for-recycle = false + [security] ## The path for TLS certificates. Empty string means disabling secure connections. 
# ca-path = "" diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index d4374fda369..f404ebc5376 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -23695,7 +23695,8 @@ "metric": "tikv_snapshot_limit_transport_bytes", "refId": "A", "step": 40 - },{ + }, + { "exemplar": true, "expr": "rate(tikv_snapshot_limit_generate_bytes{tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])", "hide": true, @@ -31976,6 +31977,13 @@ "intervalFactor": 1, "legendFormat": "swap", "refId": "B" + }, + { + "exemplar": true, + "expr": "avg(raft_engine_recycled_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"})", + "intervalFactor": 1, + "legendFormat": "recycle", + "refId": "C" } ], "thresholds": [], From 315d402780bf1ba8155aafaf57e1608809cedf1b Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 17 Mar 2023 13:30:39 +0800 Subject: [PATCH 589/676] Revert changes that may cause performance regression introduced by write prioirty scheduling (#14412) close tikv/tikv#14375 Revert the parsed entry that may lead to performance regression and disable priority pool for sched worker Signed-off-by: Connor1996 --- .../operation/command/admin/merge/prepare.rs | 3 +- .../raftstore/src/store/entry_storage.rs | 32 +-- components/raftstore/src/store/fsm/apply.rs | 141 ++++++++--- components/raftstore/src/store/peer.rs | 3 +- components/raftstore/src/store/util.rs | 235 +----------------- src/storage/txn/sched_pool.rs | 27 +- 6 files changed, 148 insertions(+), 293 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index 378e3d2e7c8..f9df2d9ea1a 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -245,7 +245,8 @@ impl Peer { if entry.get_data().is_empty() { 
continue; } - let cmd: RaftCmdRequest = util::parse_data_at(entry.get_data(), entry.get_index()); + let cmd: RaftCmdRequest = + util::parse_data_at(entry.get_data(), entry.get_index(), "tag"); if !cmd.has_admin_request() { continue; } diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 4d3f487a499..f5226961a6c 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -30,11 +30,7 @@ use super::{ metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use crate::{ - bytes_capacity, - store::{util::ParsedEntry, ReadTask}, - Result, -}; +use crate::{bytes_capacity, store::ReadTask, Result}; const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; @@ -58,7 +54,7 @@ pub fn last_index(state: &RaftLocalState) -> u64 { pub struct CachedEntries { pub range: Range, // Entries and dangle size for them. `dangle` means not in entry cache. - entries: Arc, usize)>>, + entries: Arc, usize)>>, } impl CachedEntries { @@ -68,24 +64,21 @@ impl CachedEntries { let end = entries.last().map(|x| x.index).unwrap() + 1; let range = Range { start, end }; CachedEntries { - entries: Arc::new(Mutex::new(( - entries.into_iter().map(|e| ParsedEntry::new(e)).collect(), - 0, - ))), + entries: Arc::new(Mutex::new((entries, 0))), range, } } - pub fn iter_entries_mut(&self, mut f: impl FnMut(&mut ParsedEntry)) { - let mut entries = self.entries.lock().unwrap(); - for entry in &mut entries.0 { + pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { + let entries = self.entries.lock().unwrap(); + for entry in &entries.0 { f(entry); } } /// Take cached entries and dangle size for them. `dangle` means not in /// entry cache. 
- pub fn take_entries(&self) -> (Vec, usize) { + pub fn take_entries(&self) -> (Vec, usize) { mem::take(&mut *self.entries.lock().unwrap()) } } @@ -332,8 +325,8 @@ impl EntryCache { let dangle_size = { let mut guard = entries.entries.lock().unwrap(); - let last_idx = guard.0.last().map(|e| e.get_index()).unwrap(); - let cache_front = match self.cache.front().map(|e| e.get_index()) { + let last_idx = guard.0.last().map(|e| e.index).unwrap(); + let cache_front = match self.cache.front().map(|e| e.index) { Some(i) => i, None => u64::MAX, }; @@ -341,10 +334,7 @@ impl EntryCache { let dangle_range = if last_idx < cache_front { // All entries are not in entry cache. 0..guard.0.len() - } else if let Ok(i) = guard - .0 - .binary_search_by(|e| e.get_index().cmp(&cache_front)) - { + } else if let Ok(i) = guard.0.binary_search_by(|e| e.index.cmp(&cache_front)) { // Some entries are in entry cache. 0..i } else { @@ -354,7 +344,7 @@ impl EntryCache { let mut size = 0; for e in &guard.0[dangle_range] { - size += e.bytes_capacity(); + size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } guard.1 = size; size diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 181ff207c0b..16a8bacbced 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -9,6 +9,7 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering}, collections::VecDeque, fmt::{self, Debug, Formatter}, + io::BufRead, mem, ops::{Deref, DerefMut, Range as StdRange}, sync::{ @@ -45,7 +46,11 @@ use kvproto::{ }; use pd_client::{BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; -use raft::eraftpb::{ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot}; +use protobuf::{wire_format::WireType, CodedInputStream, Message}; +use raft::eraftpb::{ + ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, +}; +use raft_proto::ConfChangeI; use 
resource_control::{ResourceConsumeType, ResourceController, ResourceMetered}; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; @@ -88,7 +93,6 @@ use crate::{ util::{ self, admin_cmd_epoch_lookup, check_flashback_state, check_req_region_epoch, compare_region_epoch, ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, - ParsedEntry, }, Config, RegionSnapshot, RegionTask, WriteCallback, }, @@ -848,6 +852,43 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { false } +fn can_witness_skip(entry: &Entry) -> bool { + // need to handle ConfChange entry type + if entry.get_entry_type() != EntryType::EntryNormal { + return false; + } + + // HACK: check admin request field in serialized data from `RaftCmdRequest` + // without deserializing all. It's done by checking the existence of the + // field number of `admin_request`. + // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in + // `raft_cmdpb.rs` for reference. + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return true; + } + let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); + // Header field is of number 1 + if field_number == 1 { + if wire_type != WireType::WireTypeLengthDelimited { + panic!("unexpected wire type"); + } + let len = is.read_raw_varint32().unwrap(); + // skip parsing the content of `Header` + is.consume(len as usize); + // read next field number + (field_number, _) = is.read_tag_unpack().unwrap(); + } + + // `Requests` field is of number 2 and `AdminRequest` field is of number 3. + // - If the next field is 2, there must be no admin request as in one + // `RaftCmdRequest`, either requests or admin_request is filled. + // - If the next field is 3, it's exactly an admin request. + // - If the next field is others, neither requests nor admin_request is filled, + // so there is no admin request. + field_number != 3 +} + /// A struct that stores the state related to Merge. 
/// /// When executing a `CommitMerge`, the source peer may have not applied @@ -870,7 +911,7 @@ where { /// All of the entries that need to continue to be applied after /// the source peer has applied its logs. - pending_entries: Vec, + pending_entries: Vec, /// All of messages that need to continue to be handled after /// the source peer has applied its logs and pending entries /// are all handled. @@ -1050,7 +1091,7 @@ where fn handle_raft_committed_entries( &mut self, apply_ctx: &mut ApplyContext, - mut committed_entries_drainer: Drain<'_, ParsedEntry>, + mut committed_entries_drainer: Drain<'_, Entry>, ) { if committed_entries_drainer.len() == 0 { return; @@ -1061,7 +1102,7 @@ where // must re-propose these commands again. apply_ctx.committed_count += committed_entries_drainer.len(); let mut results = VecDeque::new(); - while let Some(mut entry) = committed_entries_drainer.next() { + while let Some(entry) = committed_entries_drainer.next() { if self.pending_remove { // This peer is about to be destroyed, skip everything. break; @@ -1083,9 +1124,9 @@ where // running on data written by new version tikv), but PD will reject old version // tikv join the cluster, so this should not happen. 
let res = match entry.get_entry_type() { - EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &mut entry), + EntryType::EntryNormal => self.handle_raft_entry_normal(apply_ctx, &entry), EntryType::EntryConfChange | EntryType::EntryConfChangeV2 => { - self.handle_raft_entry_conf_change(apply_ctx, &mut entry) + self.handle_raft_entry_conf_change(apply_ctx, &entry) } }; @@ -1155,7 +1196,7 @@ where fn handle_raft_entry_normal( &mut self, apply_ctx: &mut ApplyContext, - entry: &mut ParsedEntry, + entry: &Entry, ) -> ApplyResult { fail_point!( "yield_apply_first_region", @@ -1165,10 +1206,11 @@ where let index = entry.get_index(); let term = entry.get_term(); + let data = entry.get_data(); - if !entry.is_empty() { - if !self.peer.is_witness || !entry.can_witness_skip() { - let cmd = entry.take_cmd(); + if !data.is_empty() { + if !self.peer.is_witness || !can_witness_skip(entry) { + let cmd = util::parse_data_at(data, index, &self.tag); if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { self.priority = Priority::Low; } @@ -1227,7 +1269,7 @@ where fn handle_raft_entry_conf_change( &mut self, apply_ctx: &mut ApplyContext, - entry: &mut ParsedEntry, + entry: &Entry, ) -> ApplyResult { // Although conf change can't yield in normal case, it is convenient to // simulate yield before applying a conf change log. 
@@ -1235,7 +1277,16 @@ where ApplyResult::Yield }); let (index, term) = (entry.get_index(), entry.get_term()); - let (conf_change, cmd) = entry.take_conf_change(); + let conf_change: ConfChangeV2 = match entry.get_entry_type() { + EntryType::EntryConfChange => { + let conf_change: ConfChange = + util::parse_data_at(entry.get_data(), index, &self.tag); + conf_change.into_v2() + } + EntryType::EntryConfChangeV2 => util::parse_data_at(entry.get_data(), index, &self.tag), + _ => unreachable!(), + }; + let cmd = util::parse_data_at(conf_change.get_context(), index, &self.tag); match self.process_raft_cmd(apply_ctx, index, term, cmd) { ApplyResult::None => { // If failed, tell Raft that the `ConfChange` was aborted. @@ -3681,18 +3732,16 @@ impl ResourceMetered for Msg { let mut dominant_group = "".to_owned(); let mut max_write_bytes = 0; for cached_entries in &apply.entries { - cached_entries.iter_entries_mut(|entry| { - if entry.is_empty() { - return; - } + cached_entries.iter_entries(|entry| { + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); let write_bytes = entry.compute_size() as u64; - let group_name = entry.get_cmd().get_header().get_resource_group_name(); resource_ctl.consume( group_name.as_bytes(), ResourceConsumeType::IoBytes(write_bytes), ); if write_bytes > max_write_bytes { - dominant_group = group_name.to_owned(); + dominant_group = group_name; max_write_bytes = write_bytes; } }); @@ -3883,21 +3932,19 @@ where let mut dangle_size = 0; for cached_entries in apply.entries { - let (ents, sz) = cached_entries.take_entries(); + let (e, sz) = cached_entries.take_entries(); dangle_size += sz; - if ents.is_empty() { + if e.is_empty() { let rid = self.delegate.region_id(); let StdRange { start, end } = cached_entries.range; - let mut tmp_ents = Vec::new(); self.delegate .raft_engine - .fetch_entries_to(rid, start, end, None, &mut tmp_ents) + .fetch_entries_to(rid, start, end, None, &mut entries) .unwrap(); 
- entries.extend(tmp_ents.into_iter().map(|e| ParsedEntry::new(e))); } else if entries.is_empty() { - entries = ents; + entries = e; } else { - entries.extend(ents); + entries.extend(e); } } if dangle_size > 0 { @@ -4869,9 +4916,9 @@ mod memtrace { EK: KvEngine, { fn heap_size(&self) -> usize { - let mut size = self.pending_entries.capacity() * mem::size_of::(); + let mut size = self.pending_entries.capacity() * mem::size_of::(); for e in &self.pending_entries { - size += e.bytes_capacity(); + size += bytes_capacity(&e.data) + bytes_capacity(&e.context); } size += self.pending_msgs.capacity() * mem::size_of::>(); @@ -4928,6 +4975,7 @@ mod tests { time::*, }; + use bytes::Bytes; use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; use engine_traits::{Peekable as PeekableTrait, SyncMutable, WriteBatchExt}; @@ -4937,6 +4985,7 @@ mod tests { raft_cmdpb::*, }; use protobuf::Message; + use raft::eraftpb::{ConfChange, ConfChangeV2}; use sst_importer::Config as ImportConfig; use tempfile::{Builder, TempDir}; use test_sst_importer::*; @@ -5043,6 +5092,42 @@ mod tests { } } + #[test] + fn test_can_witness_skip() { + let mut entry = Entry::new(); + let mut req = RaftCmdRequest::default(); + entry.set_entry_type(EntryType::EntryNormal); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + req.mut_admin_request() + .set_cmd_type(AdminCmdType::CompactLog); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + let mut req = RaftCmdRequest::default(); + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + req.set_requests(vec![request].into()); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChange); + let conf_change = 
ConfChange::new(); + let data = conf_change.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChangeV2); + let conf_change_v2 = ConfChangeV2::new(); + let data = conf_change_v2.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + } + #[test] fn test_should_sync_log() { // Admin command diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a0b28e44f07..8dc69a0def4 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4211,7 +4211,8 @@ where if entry.get_data().is_empty() { continue; } - let cmd: RaftCmdRequest = util::parse_data_at(entry.get_data(), entry.get_index()); + let cmd: RaftCmdRequest = + util::parse_data_at(entry.get_data(), entry.get_index(), &self.tag); if !cmd.has_admin_request() { continue; } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 7408b540285..d48c5e78e7c 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -6,7 +6,6 @@ use std::{ collections::{HashMap, VecDeque}, fmt, fmt::Display, - io::BufRead, option::Option, sync::{ atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering}, @@ -25,15 +24,12 @@ use kvproto::{ }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, wire_format::WireType, CodedInputStream, Message}; +use protobuf::{self, CodedInputStream, Message}; use raft::{ eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; -use raft_proto::{ - eraftpb::{ConfChange, ConfChangeV2}, - ConfChangeI, -}; +use raft_proto::ConfChangeI; use tikv_util::{ box_err, codec::number::{decode_u64, NumberEncoder}, @@ -47,9 +43,7 @@ use tokio::sync::Notify; use txn_types::WriteBatchFlags; use 
super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; -use crate::{ - bytes_capacity, coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result, -}; +use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -751,139 +745,6 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } -// ParsedEntry wraps raft-proto `Entry` and used to avoid parsing raft command -// from entry's data repeatedly. The parsed command may be used in multiple -// places, so cache it at the first place. -pub struct ParsedEntry { - entry: Entry, - cmd: Option, - conf_change: Option, - parsed: bool, -} - -impl ParsedEntry { - pub fn new(entry: Entry) -> ParsedEntry { - ParsedEntry { - entry, - cmd: None, - conf_change: None, - parsed: false, - } - } - - pub fn get_entry_type(&self) -> EntryType { - self.entry.get_entry_type() - } - - pub fn get_index(&self) -> u64 { - self.entry.get_index() - } - - pub fn get_term(&self) -> u64 { - self.entry.get_term() - } - - pub fn compute_size(&self) -> u32 { - self.entry.compute_size() - } - - pub fn is_empty(&self) -> bool { - self.entry.get_data().is_empty() - } - - pub fn bytes_capacity(&self) -> usize { - bytes_capacity(&self.entry.data) + bytes_capacity(&self.entry.context) - } - - fn parse(&mut self) { - assert!(!self.is_empty()); - - let data = self.entry.get_data(); - let index = self.entry.get_index(); - // lazy parse the cmd from entry context - let conf_change = match self.entry.get_entry_type() { - EntryType::EntryConfChange => { - let conf_change: ConfChange = parse_data_at(data, index); - Some(conf_change.into_v2()) - } - EntryType::EntryConfChangeV2 => Some(parse_data_at(data, index)), - EntryType::EntryNormal => { - self.cmd = Some(parse_data_at(data, index)); - None - } - }; - if let Some(conf_change) = conf_change { - self.cmd = Some(parse_data_at(conf_change.get_context(), index)); - 
self.conf_change = Some(conf_change); - } - self.parsed = true; - } - - pub fn get_cmd(&mut self) -> &RaftCmdRequest { - if !self.parsed { - self.parse(); - } - self.cmd.as_ref().unwrap() - } - - pub fn take_cmd(&mut self) -> RaftCmdRequest { - if !self.parsed { - self.parse(); - } - self.parsed = false; - self.cmd.take().unwrap() - } - - pub fn take_conf_change(&mut self) -> (ConfChangeV2, RaftCmdRequest) { - if !self.parsed { - self.parse(); - } - self.parsed = false; - (self.conf_change.take().unwrap(), self.cmd.take().unwrap()) - } - - pub fn can_witness_skip(&self) -> bool { - !has_admin_request(&self.entry) - } -} - -fn has_admin_request(entry: &Entry) -> bool { - // need to handle ConfChange entry type - if entry.get_entry_type() != EntryType::EntryNormal { - return true; - } - - // HACK: check admin request field in serialized data from `RaftCmdRequest` - // without deserializing all. It's done by checking the existence of the - // field number of `admin_request`. - // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in - // `raft_cmdpb.rs` for reference. - let mut is = CodedInputStream::from_bytes(entry.get_data()); - if is.eof().unwrap() { - return false; - } - let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); - // Header field is of number 1 - if field_number == 1 { - if wire_type != WireType::WireTypeLengthDelimited { - panic!("unexpected wire type"); - } - let len = is.read_raw_varint32().unwrap(); - // skip parsing the content of `Header` - is.consume(len as usize); - // read next field number - (field_number, _) = is.read_tag_unpack().unwrap(); - } - - // `Requests` field is of number 2 and `AdminRequest` field is of number 3. - // - If the next field is 2, there must be no admin request as in one - // `RaftCmdRequest`, either requests or admin_request is filled. - // - If the next field is 3, it's exactly an admin request. 
- // - If the next field is others, neither requests nor admin_request is filled, - // so there is no admin request. - field_number == 3 -} - pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { if entry.get_entry_type() != EntryType::EntryNormal { return RaftRequestHeader::default(); @@ -909,10 +770,10 @@ pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { /// If `data` is corrupted, this function will panic. // TODO: make sure received entries are not corrupted #[inline] -pub fn parse_data_at(data: &[u8], index: u64) -> T { +pub fn parse_data_at(data: &[u8], index: u64, tag: &str) -> T { let mut result = T::default(); result.merge_from_bytes(data).unwrap_or_else(|e| { - panic!("{} data is corrupted : {:?}", index, e); + panic!("{} data is corrupted at {}: {:?}", tag, index, e); }); result } @@ -1856,11 +1717,10 @@ pub fn validate_split_region( mod tests { use std::thread; - use bytes::Bytes; use engine_test::kv::KvTestEngine; use kvproto::{ metapb::{self, RegionEpoch}, - raft_cmdpb::{AdminRequest, CmdType, Request}, + raft_cmdpb::AdminRequest, }; use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; @@ -1941,53 +1801,6 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } - #[test] - fn test_parsed_entry() { - let mut req = RaftCmdRequest::default(); - let mut header = RaftRequestHeader::default(); - header.set_resource_group_name("test".to_owned()); - req.set_header(header); - - let mut entry = Entry::new(); - entry.set_term(1); - entry.set_index(2); - entry.set_entry_type(raft::eraftpb::EntryType::EntryNormal); - entry.set_data(req.write_to_bytes().unwrap().into()); - - let mut parsed = ParsedEntry::new(entry); - assert_eq!(parsed.get_term(), 1); - assert_eq!(parsed.get_index(), 2); - assert_eq!( - parsed.get_cmd().get_header().get_resource_group_name(), - "test" - ); - - let mut entry = Entry::new(); - entry.set_term(1); - entry.set_index(2); - 
entry.set_entry_type(raft::eraftpb::EntryType::EntryConfChangeV2); - let mut cc = ConfChangeV2::new(); - let mut ccs = eraftpb::ConfChangeSingle::default(); - ccs.set_change_type(ConfChangeType::AddNode); - ccs.set_node_id(3); - cc.set_changes(vec![ccs].into()); - cc.set_context(req.write_to_bytes().unwrap().into()); - entry.set_data(cc.write_to_bytes().unwrap().into()); - - let mut parsed = ParsedEntry::new(entry); - let (conf_change, cmd) = parsed.take_conf_change(); - assert_eq!( - conf_change.get_changes()[0].get_change_type(), - ConfChangeType::AddNode - ); - assert_eq!(conf_change.get_changes()[0].get_node_id(), 3); - assert_eq!(cmd.get_header().get_resource_group_name(), "test"); - assert_eq!( - parsed.get_cmd().get_header().get_resource_group_name(), - "test" - ); - } - #[test] fn test_get_entry_header() { let mut req = RaftCmdRequest::default(); @@ -2338,42 +2151,6 @@ mod tests { check_term(&header, 10).unwrap_err(); } - #[test] - fn test_has_admin_request() { - let mut entry = Entry::new(); - let mut req = RaftCmdRequest::default(); - entry.set_entry_type(EntryType::EntryNormal); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(!has_admin_request(&entry)); - - req.mut_admin_request() - .set_cmd_type(AdminCmdType::CompactLog); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(has_admin_request(&entry)); - - let mut req = RaftCmdRequest::default(); - let mut request = Request::default(); - request.set_cmd_type(CmdType::Put); - req.set_requests(vec![request].into()); - let data = req.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(!has_admin_request(&entry)); - - entry.set_entry_type(EntryType::EntryConfChange); - let conf_change = ConfChange::new(); - let data = conf_change.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(has_admin_request(&entry)); - - 
entry.set_entry_type(EntryType::EntryConfChangeV2); - let conf_change_v2 = ConfChangeV2::new(); - let data = conf_change_v2.write_to_bytes().unwrap(); - entry.set_data(Bytes::copy_from_slice(&data)); - assert!(has_admin_request(&entry)); - } - #[test] fn test_check_req_region_epoch() { let mut epoch = RegionEpoch::default(); diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 0cff9d51d41..49539d51d8c 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -74,7 +74,7 @@ impl SchedPool { pool_size: usize, reporter: R, feature_gate: FeatureGate, - resource_ctl: Option>, + _resource_ctl: Option>, ) -> Self { let builder = |pool_size: usize, name_prefix: &str| { let engine = Arc::new(Mutex::new(engine.clone())); @@ -102,19 +102,20 @@ impl SchedPool { tls_flush(&reporter); }) }; - if let Some(ref r) = resource_ctl { - SchedPool::Priority { - worker_pool: builder(pool_size, "sched-worker-pool") - .build_priority_future_pool(r.clone()), - resource_ctl: r.clone(), - } - } else { - SchedPool::Vanilla { - worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), - high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") - .build_future_pool(), - } + // FIXME: for performance issue, disable priority pool temporarily + // if let Some(ref r) = resource_ctl { + // SchedPool::Priority { + // worker_pool: builder(pool_size, "sched-worker-pool") + // .build_priority_future_pool(r.clone()), + // resource_ctl: r.clone(), + // } + // } else { + SchedPool::Vanilla { + worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), + high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") + .build_future_pool(), } + // } } pub fn spawn( From b9bc478913da5bda779d9e59e10e649fe86f89d4 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 17 Mar 2023 13:46:39 +0800 Subject: [PATCH 590/676] engine: enable log recycling by default (#14380) close tikv/tikv#14379 Enable 
log recycling in `RaftLogEngine` by default Signed-off-by: Lucasliang Co-authored-by: Ti Chi Robot --- etc/config-template.toml | 4 ++-- src/config/mod.rs | 8 +++++++- tests/integrations/config/test-custom.toml | 1 + tests/integrations/config/test-default.toml | 1 + 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 80d9bc8a4d6..aec5e108949 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -1101,8 +1101,8 @@ ## Only available for `format-version` >= 2. This option is only ## available when TiKV >= 6.3.x. ## -## Default: false. -# enable-log-recycle = false +## Default: true. +# enable-log-recycle = true ## Whether to prepare log files for recycling when start. ## If `true`, batch empty log files will be prepared for recycling when diff --git a/src/config/mod.rs b/src/config/mod.rs index 689e0330a2b..c1c38e39d77 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1800,7 +1800,13 @@ impl Default for RaftEngineConfig { fn default() -> Self { Self { enable: true, - config: RawRaftEngineConfig::default(), + config: RawRaftEngineConfig { + // TODO: after update the dependency to `raft-engine` lib, revokes the + // following unelegant settings. + // Enable log recycling by default. 
+ enable_log_recycle: true, + ..RawRaftEngineConfig::default() + }, } } } diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 7f5dbfa1db7..416505a7318 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -602,6 +602,7 @@ recovery-mode = "tolerate-tail-corruption" recovery-read-block-size = "1KB" recovery-threads = 2 memory-limit = "1GB" +enable-log-recycle = true # enable by default [security] ca-path = "invalid path" diff --git a/tests/integrations/config/test-default.toml b/tests/integrations/config/test-default.toml index 23e53b9daf3..ef3c83c00df 100644 --- a/tests/integrations/config/test-default.toml +++ b/tests/integrations/config/test-default.toml @@ -39,6 +39,7 @@ [raftdb.defaultcf] [raft-engine] +enable-log-recycle = true # enable by default [security] From fd2db9a796b16d7665927340d9ebdc83022ea0e2 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 17 Mar 2023 14:10:39 +0800 Subject: [PATCH 591/676] raftstore-v2: split init may be out of dated when conf change ocurred (#14407) close tikv/tikv#14389 split init may be out of dated Signed-off-by: Spade A Co-authored-by: Xinye Tao --- .../src/operation/command/admin/split.rs | 20 +++++++- tests/failpoints/cases/test_split_region.rs | 51 +++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index e6cd7511801..82bae03f062 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -391,6 +391,11 @@ impl Apply { req: &AdminRequest, log_index: u64, ) -> Result<(AdminResponse, AdminCmdResult)> { + fail_point!( + "on_apply_batch_split", + self.peer().get_store_id() == 3, + |_| { unreachable!() } + ); 
PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); let region = self.region(); @@ -692,7 +697,20 @@ impl Peer { mut split_init: Box, ) { let region_id = split_init.region.id; - if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX { + let peer_id = split_init + .region + .get_peers() + .iter() + .find(|p| p.get_store_id() == self.peer().get_store_id()) + .unwrap() + .get_id(); + + // If peer_id in `split_init` is less than the current peer_id, the conf change + // for the peer should have occurred and we should just report finish to + // the source region of this out of dated peer initialization. + if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX + || peer_id < self.peer().get_id() + { // Race with split operation. The tablet created by split will eventually be // deleted. We don't trim it. report_split_init_finish(store_ctx, split_init.derived_region_id, region_id, true); diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 94dfd1b5648..792a21217ad 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -1136,3 +1136,54 @@ fn test_split_during_cluster_shutdown() { test_split("before_cluster_shutdown1"); test_split("before_cluster_shutdown2"); } + +// Test that split is handled pretty slow in one node, say node 2. Before node 2 +// handles the split, the peer of the new split region on node 2 has been +// removed and added back sooner. So, when the new split region on node 2 +// receives a heartbeat from it's leader, it creates a peer with higher peer id +// than the peer created due to the split on this node. 
+#[test] +fn test_split_race_with_conf_change() { + // test case for raftstore-v2 + use test_raftstore_v2::*; + + let mut cluster = new_node_cluster(0, 3); + configure_for_snapshot(&mut cluster.cfg); + cluster.cfg.raft_store.right_derive_when_split = false; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.run(); + + let split_key1 = b"k05"; + let region = cluster.get_region(split_key1); + cluster.must_transfer_leader(region.get_id(), new_peer(1, 1)); + + fail::cfg("on_apply_batch_split", "pause").unwrap(); + cluster.must_split(®ion, split_key1); + + let region = pd_client.get_region(b"k10").unwrap(); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(region.get_id(), 3) + .msg_type(MessageType::MsgSnapshot) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Recv), + )); + + let mut peer3 = region + .get_peers() + .iter() + .find(|p| p.get_store_id() == 3) + .unwrap() + .clone(); + pd_client.must_remove_peer(region.get_id(), peer3.clone()); + peer3.set_id(2000); + pd_client.must_add_peer(region.get_id(), peer3.clone()); + + fail::remove("on_apply_batch_split"); + std::thread::sleep(Duration::from_millis(200)); + cluster.clear_send_filters(); + + cluster.stop_node(2); + cluster.must_put(b"k06", b"val"); + assert_eq!(cluster.must_get(b"k06").unwrap(), b"val".to_vec()); +} From 138e1cd3c819ef8e9388ab7dc06e7d43fd9a5896 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 17 Mar 2023 17:04:40 +0800 Subject: [PATCH 592/676] raftstore-v2: remove flashback context (#14404) ref tikv/tikv#12842, ref tikv/tikv#14405 Flashback is not fully implemented for raftkv2, setting fields may lead to request failure in normal cases due to https://github.com/tikv/tikv/issues/14405. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- src/server/raftkv2/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 60e0a53a20a..5434da9ce91 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -169,9 +169,10 @@ impl tikv_kv::Engine for RaftKv2 { if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); } - if ctx.allowed_in_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } + // TODO: flashback is not supported yet. + // if ctx.allowed_in_flashback { + // flags |= WriteBatchFlags::FLASHBACK.bits(); + // } header.set_flags(flags); // Encode `start_ts` in `flag_data` for the check of stale read and flashback. if need_encoded_start_ts { @@ -234,9 +235,10 @@ impl tikv_kv::Engine for RaftKv2 { if batch.extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); } - if batch.extra.allowed_in_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } + // TODO: flashback is not supported yet. 
+ // if batch.extra.allowed_in_flashback { + // flags |= WriteBatchFlags::FLASHBACK.bits(); + // } header.set_flags(flags); self.schedule_txn_extra(batch.extra); From 4baf9e72b97b20d8199b25f97376f5204560ce46 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 17 Mar 2023 19:12:40 +0800 Subject: [PATCH 593/676] tikv-ctl,raftstore: add a log to output corrupted raft msg (#13669) ref tikv/tikv#13668 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- cmd/tikv-ctl/src/cmd.rs | 2 ++ cmd/tikv-ctl/src/executor.rs | 7 ++++++- cmd/tikv-ctl/src/main.rs | 9 +++++++-- components/raftstore/src/store/util.rs | 8 +++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 657d296109c..42678386f5a 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -586,6 +586,8 @@ pub enum RaftCmd { help = RAW_KEY_HINT, )] key: Option, + #[structopt(short = "b")] + binary: bool, }, /// print region info Region { diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index 7dd00a1d29c..df095e44425 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -233,7 +233,7 @@ pub trait DebugExecutor { ); } - fn dump_raft_log(&self, region: u64, index: u64) { + fn dump_raft_log(&self, region: u64, index: u64, binary: bool) { let idx_key = keys::raft_log_key(region, index); println!("idx_key: {}", escape(&idx_key)); println!("region: {}", region); @@ -248,6 +248,11 @@ pub trait DebugExecutor { return; } + if binary { + println!("data: \n{}", hex::encode_upper(&data)); + return; + } + match entry.get_entry_type() { EntryType::EntryNormal => { let mut msg = RaftCmdRequest::default(); diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index e4c7be98dba..f547a2cee3a 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -258,7 +258,12 @@ fn main() { debug_executor.dump_value(&cf, key); } Cmd::Raft { cmd: subcmd } => match subcmd { - RaftCmd::Log { region, 
index, key } => { + RaftCmd::Log { + region, + index, + key, + binary, + } => { let (id, index) = if let Some(key) = key.as_deref() { keys::decode_raft_log_key(&unescape(key)).unwrap() } else { @@ -266,7 +271,7 @@ fn main() { let index = index.unwrap(); (id, index) }; - debug_executor.dump_raft_log(id, index); + debug_executor.dump_raft_log(id, index, binary); } RaftCmd::Region { regions, diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index d48c5e78e7c..82a04ec6f4b 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -773,7 +773,13 @@ pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { pub fn parse_data_at(data: &[u8], index: u64, tag: &str) -> T { let mut result = T::default(); result.merge_from_bytes(data).unwrap_or_else(|e| { - panic!("{} data is corrupted at {}: {:?}", tag, index, e); + panic!( + "{} data is corrupted at {}: {:?}. hex value: {}", + tag, + index, + e, + log_wrappers::Value::value(data) + ); }); result } From bec40346e641f480b35c6a83bfe6e4fd169ddc01 Mon Sep 17 00:00:00 2001 From: you06 Date: Sun, 19 Mar 2023 16:46:39 +0800 Subject: [PATCH 594/676] readpool: fix missing metric `tikv_yatp_task_poll_duration` (#14423) close tikv/tikv#14424 Fix the missing metric `tikv_yatp_task_poll_duration` by upgrading yatp. 
Signed-off-by: you06 --- Cargo.lock | 37 +++++---------------- Cargo.toml | 4 --- src/read_pool.rs | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4123bc6377c..f3ee64d058b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1184,7 +1184,7 @@ dependencies = [ "cfg-if 1.0.0", "crossbeam-channel", "crossbeam-deque", - "crossbeam-epoch 0.9.8", + "crossbeam-epoch", "crossbeam-queue", "crossbeam-utils 0.8.8", ] @@ -1201,12 +1201,13 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.2" -source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.10", - "crossbeam-utils 0.8.11", + "crossbeam-epoch", + "crossbeam-utils 0.8.8", ] [[package]] @@ -1223,19 +1224,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "crossbeam-epoch" -version = "0.9.10" -source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "crossbeam-utils 0.8.11", - "memoffset 0.6.4", - "once_cell", - "scopeguard", -] - [[package]] name = "crossbeam-queue" version = "0.3.5" @@ -1253,7 +1241,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.8", + "crossbeam-epoch", "crossbeam-utils 0.8.8", "scopeguard", ] @@ -1279,15 +1267,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.11" -source = 
"git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" -dependencies = [ - "cfg-if 1.0.0", - "once_cell", -] - [[package]] name = "crypto-mac" version = "0.10.0" @@ -7493,7 +7472,7 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#7ed25299d60a5338bea4ac0ed7470887ab74a010" +source = "git+https://github.com/tikv/yatp.git?branch=master#5523a9a6a4d0d6242bdb02b0a344f7ee1477b39b" dependencies = [ "crossbeam-deque", "crossbeam-skiplist", diff --git a/Cargo.toml b/Cargo.toml index a559fa22474..57a2ab4eced 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -215,10 +215,6 @@ fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } # Remove this when a new version is release. We need to solve rust-lang/cmake-rs#143. cmake = { git = "https://github.com/rust-lang/cmake-rs" } -# TODO: remove this after crossbeam-deque is updated to the next release version. -# This is a workaround for cargo can't resolving the this patch in yatp. 
-crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "41ed3d948720f26149b2ebeaf58fe8a193134056" } - [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to diff --git a/src/read_pool.rs b/src/read_pool.rs index 4852caa181b..16d1a7091b7 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -413,6 +413,7 @@ fn get_unified_read_pool_name() -> String { "unified-read-pool".to_string() } +#[inline] pub fn build_yatp_read_pool( config: &UnifiedReadPoolConfig, reporter: R, @@ -421,6 +422,24 @@ pub fn build_yatp_read_pool( cleanup_method: CleanupMethod, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); + build_yatp_read_pool_with_name( + config, + reporter, + engine, + resource_ctl, + cleanup_method, + unified_read_pool_name, + ) +} + +pub fn build_yatp_read_pool_with_name( + config: &UnifiedReadPoolConfig, + reporter: R, + engine: E, + resource_ctl: Option>, + cleanup_method: CleanupMethod, + unified_read_pool_name: String, +) -> ReadPool { let raftkv = Arc::new(Mutex::new(engine)); let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) @@ -744,6 +763,7 @@ mod tests { use futures::channel::oneshot; use raftstore::store::{ReadStats, WriteStats}; + use resource_control::ResourceGroupManager; use super::*; use crate::storage::TestEngineBuilder; @@ -942,4 +962,69 @@ mod tests { let ewma = inspector.get_ewma_time_slice().as_secs_f64(); assert!((ewma - 0.01307).abs() < MARGIN); } + + #[test] + fn test_yatp_task_poll_duration_metric() { + let count_metric = |name: &str| -> u64 { + let mut sum = 0; + for i in 0..=2 { + let hist = + yatp::metrics::TASK_POLL_DURATION.with_label_values(&[name, &format!("{}", i)]); + sum += hist.get_sample_count(); + } + sum + }; + + for control 
in [false, true] { + let name = format!("test_yatp_task_poll_duration_metric_{}", control); + let resource_manager = if control { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller(name.clone(), true); + Some(resource_ctl) + } else { + None + }; + let config = UnifiedReadPoolConfig { + min_thread_count: 1, + max_thread_count: 2, + max_tasks_per_worker: 1, + ..Default::default() + }; + + let engine = TestEngineBuilder::new().build().unwrap(); + + let pool = build_yatp_read_pool_with_name( + &config, + DummyReporter, + engine, + resource_manager, + CleanupMethod::InPlace, + name.clone(), + ); + + let gen_task = || { + let (tx, rx) = oneshot::channel::<()>(); + let task = async move { + // sleep the thread 100ms to trigger flushing the metrics. + std::thread::sleep(std::time::Duration::from_millis(100)); + let _ = rx.await; + }; + (task, tx) + }; + + let handle = pool.handle(); + let (task1, tx1) = gen_task(); + let (task2, tx2) = gen_task(); + + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); + + tx1.send(()).unwrap(); + tx2.send(()).unwrap(); + + thread::sleep(Duration::from_millis(300)); + assert_eq!(count_metric(&name), 2); + drop(pool); + } + } } From 63d82f3404e3219f355f37d8360d33a0e303ab3c Mon Sep 17 00:00:00 2001 From: Connor Date: Mon, 20 Mar 2023 11:52:41 +0800 Subject: [PATCH 595/676] resource_control: Introduce resource group priority (#14414) ref tikv/tikv#13730 Introduce resource group priority, tasks are scheduled based on the order of (priority, virtual_time) Signed-off-by: Connor1996 --- Cargo.lock | 2 +- components/batch-system/src/scheduler.rs | 10 +- components/raftstore/src/coprocessor/mod.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 2 +- .../src/store/async_io/write_router.rs | 9 +- .../src/store/async_io/write_tests.rs | 14 +-- components/raftstore/src/store/fsm/store.rs | 2 +- 
.../src/store/worker/refresh_config.rs | 2 +- components/resource_control/src/channel.rs | 35 ++++--- .../resource_control/src/resource_group.rs | 98 +++++++++++++++---- components/resource_control/src/service.rs | 12 +-- src/server/status_server/mod.rs | 8 +- 12 files changed, 130 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f3ee64d058b..f313d747187 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2712,7 +2712,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#60b33e619c70d8abe151f086a19a82895965f28f" +source = "git+https://github.com/pingcap/kvproto.git#b47a4830141f7c8d2719db0f0184652e692eb672" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/batch-system/src/scheduler.rs b/components/batch-system/src/scheduler.rs index 9eadb125f78..723863249fb 100644 --- a/components/batch-system/src/scheduler.rs +++ b/components/batch-system/src/scheduler.rs @@ -44,7 +44,7 @@ where Priority::Low => &self.low_sender, }; - match sender.send(FsmTypes::Normal(fsm), 0) { + match sender.send(FsmTypes::Normal(fsm), None) { Ok(_) => {} Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), _ => unreachable!(), @@ -55,8 +55,8 @@ where // TODO: close it explicitly once it's supported. // Magic number, actually any number greater than poll pool size works. for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty, 0); - let _ = self.low_sender.send(FsmTypes::Empty, 0); + let _ = self.sender.send(FsmTypes::Empty, None); + let _ = self.low_sender.send(FsmTypes::Empty, None); } } } @@ -88,7 +88,7 @@ where #[inline] fn schedule(&self, fsm: Box) { - match self.sender.send(FsmTypes::Control(fsm), 0) { + match self.sender.send(FsmTypes::Control(fsm), None) { Ok(_) => {} Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), _ => unreachable!(), @@ -99,7 +99,7 @@ where // TODO: close it explicitly once it's supported. 
// Magic number, actually any number greater than poll pool size works. for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty, 0); + let _ = self.sender.send(FsmTypes::Empty, None); } } } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 98b045dbed8..82b6dce17ee 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -275,7 +275,7 @@ pub struct RoleChange { } impl RoleChange { - #[cfg(feature = "testexport")] + #[cfg(any(test, feature = "testexport"))] pub fn new(state: StateRole) -> Self { RoleChange { state, diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b58d2601d95..e94f7360c23 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -965,7 +965,7 @@ where assert_eq!(writers.len(), handlers.len()); for (i, handler) in handlers.drain(..).enumerate() { info!("stopping store writer {}", i); - writers[i].send(WriteMsg::Shutdown, 0).unwrap(); + writers[i].send(WriteMsg::Shutdown, None).unwrap(); handler.join().unwrap(); } } diff --git a/components/raftstore/src/store/async_io/write_router.rs b/components/raftstore/src/store/async_io/write_router.rs index d00007a9485..3669fddd613 100644 --- a/components/raftstore/src/store/async_io/write_router.rs +++ b/components/raftstore/src/store/async_io/write_router.rs @@ -75,7 +75,7 @@ where pending_write_msgs: Vec>, /// The scheduling priority of the last msg, only valid when priority /// scheduling is enabled - last_msg_priority: u64, + last_msg_priority: Option, } impl WriteRouter @@ -91,7 +91,7 @@ where next_writer_id: None, last_unpersisted: None, pending_write_msgs: vec![], - last_msg_priority: 0, + last_msg_priority: None, } } @@ -103,6 +103,10 @@ where last_unpersisted: Option, msg: WriteMsg, ) { + if last_unpersisted.is_none() { + // reset when there is no pending 
write + self.last_msg_priority = None; + } if self.should_send(ctx, last_unpersisted) { self.send(ctx, msg); } else { @@ -238,6 +242,7 @@ where // pass the priority of last msg as low bound to make sure all messages of one // peer are handled sequentially. match sender.try_send(msg, self.last_msg_priority) { + // TODO: handle last msg priority properly Ok(priority) => self.last_msg_priority = priority, Err(TrySendError::Full(msg)) => { let now = Instant::now(); diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index cae5842c8b8..24abf24c4fd 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -499,7 +499,7 @@ fn test_basic_flow() { .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); t.write_sender(0) - .send(WriteMsg::WriteTask(task_1), 0) + .send(WriteMsg::WriteTask(task_1), None) .unwrap(); let mut task_2 = WriteTask::::new(2, 2, 20); @@ -515,7 +515,7 @@ fn test_basic_flow() { .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); t.write_sender(1) - .send(WriteMsg::WriteTask(task_2), 0) + .send(WriteMsg::WriteTask(task_2), None) .unwrap(); let mut task_3 = WriteTask::::new(region_1, 1, 15); @@ -531,7 +531,7 @@ fn test_basic_flow() { .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); t.write_sender(0) - .send(WriteMsg::WriteTask(task_3), 0) + .send(WriteMsg::WriteTask(task_3), None) .unwrap(); must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); @@ -601,7 +601,7 @@ fn test_basic_flow_with_states() { .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); t.write_sender(0) - .send(WriteMsg::WriteTask(task_1), 0) + .send(WriteMsg::WriteTask(task_1), None) .unwrap(); let mut task_2 = WriteTask::::new(2, 2, 20); @@ -620,7 +620,7 @@ fn test_basic_flow_with_states() { .append(&mut vec![RaftMessage::default(), 
RaftMessage::default()]); t.write_sender(1) - .send(WriteMsg::WriteTask(task_2), 0) + .send(WriteMsg::WriteTask(task_2), None) .unwrap(); let mut task_3 = WriteTask::::new(region_1, 1, 15); @@ -638,7 +638,7 @@ fn test_basic_flow_with_states() { .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); t.write_sender(0) - .send(WriteMsg::WriteTask(task_3), 0) + .send(WriteMsg::WriteTask(task_3), None) .unwrap(); must_wait_same_notifies(vec![(region_1, (1, 15)), (region_2, (2, 20))], &t.notify_rx); @@ -714,7 +714,7 @@ fn test_resource_group() { let mut t = TestWriters::new(cfg, &engines, Some(resource_manager)); let (tx, rx) = mpsc::sync_channel(0); - t.write_sender(0).send(WriteMsg::Pause(rx), 0).unwrap(); + t.write_sender(0).send(WriteMsg::Pause(rx), None).unwrap(); let mut r = WriteRouter::new("1".to_string()); let mut task_1 = WriteTask::::new(region_1, 1, 10); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 09d6db62764..a546b286a68 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1066,7 +1066,7 @@ impl PollHandler, St send_time: write_begin, inspector: latency_inspect, }, - 0, + None, ) { warn!("send latency inspecting to write workers failed"; "err" => ?err); } diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 7ba0476d381..6fcbd6a93e7 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -46,7 +46,7 @@ where { pub fn decrease_by(&mut self, size: usize) { for _ in 0..size { - if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty, 0) { + if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty, None) { error!( "failed to decrease thread pool"; "decrease to" => size, diff --git a/components/resource_control/src/channel.rs b/components/resource_control/src/channel.rs 
index a62b9636f83..ccad4aba4bb 100644 --- a/components/resource_control/src/channel.rs +++ b/components/resource_control/src/channel.rs @@ -85,39 +85,38 @@ impl Sender { // It's used to make sure messages from one peer are sent in order. // The returned value is the priority that the message sent with. It is // calculated by resource controller and compared with `low_bound`. - pub fn send(&self, m: T, low_bound: u64) -> Result> { + pub fn send(&self, m: T, low_bound: Option) -> Result, SendError> { match self { - Sender::Vanilla(sender) => sender.send(m).map(|_| 0), + Sender::Vanilla(sender) => sender.send(m).map(|_| None), Sender::Priority { resource_ctl, sender, last_msg_group, } => { - // TODO: pass different command priority - let priority = std::cmp::max( - resource_ctl - .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal), - low_bound, - ); - sender.send(m, priority).map(|_| priority) + let p = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + let priority = if let Some(low_bound) = low_bound { + std::cmp::max(p, low_bound) + } else { + p + }; + sender.send(m, priority).map(|_| Some(priority)) } } } - pub fn try_send(&self, m: T, low_bound: u64) -> Result> { + pub fn try_send(&self, m: T, low_bound: Option) -> Result, TrySendError> { match self { - Sender::Vanilla(sender) => sender.try_send(m).map(|_| 0), + Sender::Vanilla(sender) => sender.try_send(m).map(|_| None), Sender::Priority { resource_ctl, sender, last_msg_group, } => { - let priority = std::cmp::max( - resource_ctl - .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal), - low_bound, - ); - sender.try_send(m, priority).map(|_| priority) + let p = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + let priority = std::cmp::max(p, low_bound.unwrap_or(0)); + sender.try_send(m, priority).map(|_| Some(priority)) } } } @@ -215,7 +214,7 @@ mod tests { n1 += 1; let msg = Msg(1); 
tx.consume_msg_resource(&msg); - tx.send(msg, 0).unwrap(); + tx.send(msg, None).unwrap(); }); drop(tx); diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index cea045dbf1a..690a3e3812f 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -29,6 +29,12 @@ const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; /// default value of max RU quota. const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; +#[cfg(test)] +const LOW_PRIORITY: u32 = 1; +const MEDIUM_PRIORITY: u32 = 8; +#[cfg(test)] +const HIGH_PRIORITY: u32 = 16; + pub enum ResourceConsumeType { CpuTime(Duration), IoBytes(u64), @@ -71,7 +77,7 @@ impl ResourceGroupManager { let group_name = rg.get_name().to_ascii_lowercase(); self.registry.lock().unwrap().iter().for_each(|controller| { let ru_quota = Self::get_ru_setting(&rg, controller.is_read); - controller.add_resource_group(group_name.clone().into_bytes(), ru_quota); + controller.add_resource_group(group_name.clone().into_bytes(), ru_quota, rg.priority); }); info!("add resource group"; "name"=> &rg.name, "ru" => rg.get_r_u_settings().get_r_u().get_settings().get_fill_rate()); self.resource_groups.insert(group_name, rg); @@ -117,7 +123,7 @@ impl ResourceGroupManager { self.registry.lock().unwrap().push(controller.clone()); for g in &self.resource_groups { let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); - controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); + controller.add_resource_group(g.key().clone().into_bytes(), ru_quota, g.priority); } controller } @@ -161,7 +167,11 @@ impl ResourceController { last_min_vt: AtomicU64::new(0), }; // add the "default" resource group - controller.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + controller.add_resource_group( + DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), + 0, + MEDIUM_PRIORITY, + ); controller } @@ -176,7 +186,11 @@ impl 
ResourceController { } } - fn add_resource_group(&self, name: Vec, ru_quota: u64) { + fn add_resource_group(&self, name: Vec, ru_quota: u64, mut group_priority: u32) { + if group_priority == 0 { + // map 0 to medium priority(default priority) + group_priority = MEDIUM_PRIORITY; + } let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); if ru_quota > *max_ru_quota { *max_ru_quota = ru_quota; @@ -192,6 +206,7 @@ impl ResourceController { }; let group = GroupPriorityTracker { ru_quota, + group_priority, weight, virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), vt_delta_for_get, @@ -218,7 +233,11 @@ impl ResourceController { fn remove_resource_group(&self, name: &[u8]) { // do not remove the default resource group, reset to default setting instead. if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { - self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + self.add_resource_group( + DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), + 0, + MEDIUM_PRIORITY, + ); return; } self.resource_consumptions.write().remove(name); @@ -294,9 +313,19 @@ impl TaskPriorityProvider for ResourceController { } } +fn concat_priority_vt(group_priority: u32, vt: u64) -> u64 { + assert!((1..=16).contains(&group_priority)); + + // map group_priority from [1, 16] to [0, 15] to limit it 4 bits and get bitwise + // negation to replace leading 4 bits of vt. So that the priority is ordered in + // the descending order by group_priority first, then by vt in ascending order. + vt | (!((group_priority - 1) as u64) << 60) +} + struct GroupPriorityTracker { // the ru setting of this group. 
ru_quota: u64, + group_priority: u32, weight: u64, virtual_time: AtomicU64, // the constant delta value for each `get_priority` call, @@ -306,13 +335,14 @@ struct GroupPriorityTracker { impl GroupPriorityTracker { fn get_priority(&self, level: usize) -> u64 { let task_extra_priority = TASK_EXTRA_FACTOR_BY_LEVEL[level] * 1000 * self.weight; - (if self.vt_delta_for_get > 0 { + let vt = (if self.vt_delta_for_get > 0 { self.virtual_time .fetch_add(self.vt_delta_for_get, Ordering::Relaxed) + self.vt_delta_for_get } else { self.virtual_time.load(Ordering::Relaxed) - }) + task_extra_priority + }) + task_extra_priority; + concat_priority_vt(self.group_priority, vt) } #[inline] @@ -342,8 +372,8 @@ pub(crate) mod tests { use super::*; - pub fn new_resource_group_ru(name: String, ru: u64) -> ResourceGroup { - new_resource_group(name, true, ru, ru) + pub fn new_resource_group_ru(name: String, ru: u64, group_priority: u32) -> ResourceGroup { + new_resource_group(name, true, ru, ru, group_priority) } pub fn new_resource_group( @@ -351,6 +381,7 @@ pub(crate) mod tests { is_ru_mode: bool, read_tokens: u64, write_tokens: u64, + group_priority: u32, ) -> ResourceGroup { use kvproto::resource_manager::{GroupRawResourceSettings, GroupRequestUnitSettings}; @@ -362,6 +393,7 @@ pub(crate) mod tests { GroupMode::RawMode }; group.set_mode(mode); + group.set_priority(group_priority); if is_ru_mode { assert!(read_tokens == write_tokens); let mut ru_setting = GroupRequestUnitSettings::new(); @@ -389,7 +421,7 @@ pub(crate) mod tests { fn test_resource_group() { let resource_manager = ResourceGroupManager::default(); - let group1 = new_resource_group_ru("TEST".into(), 100); + let group1 = new_resource_group_ru("TEST".into(), 100, 0); resource_manager.add_resource_group(group1); assert!(resource_manager.get_resource_group("test1").is_none()); @@ -406,7 +438,7 @@ pub(crate) mod tests { drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group1 = 
new_resource_group_ru("Test".into(), 200); + let group1 = new_resource_group_ru("Test".into(), 200, LOW_PRIORITY); resource_manager.add_resource_group(group1); let group = resource_manager.get_resource_group("test").unwrap(); assert_eq!( @@ -418,10 +450,11 @@ pub(crate) mod tests { .get_fill_rate(), 200 ); + assert_eq!(group.value().get_priority(), 1); drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group2 = new_resource_group_ru("test2".into(), 400); + let group2 = new_resource_group_ru("test2".into(), 400, 0); resource_manager.add_resource_group(group2); assert_eq!(resource_manager.resource_groups.len(), 2); @@ -436,17 +469,26 @@ pub(crate) mod tests { let mut extras1 = Extras::single_level(); extras1.set_metadata("test".as_bytes().to_owned()); - assert_eq!(resource_ctl.priority_of(&extras1), 25_000); + assert_eq!( + resource_ctl.priority_of(&extras1), + concat_priority_vt(LOW_PRIORITY, 25_000) + ); assert_eq!(group1.current_vt(), 25_000); let mut extras2 = Extras::single_level(); extras2.set_metadata("test2".as_bytes().to_owned()); - assert_eq!(resource_ctl.priority_of(&extras2), 12_500); + assert_eq!( + resource_ctl.priority_of(&extras2), + concat_priority_vt(MEDIUM_PRIORITY, 12_500) + ); assert_eq!(group2.current_vt(), 12_500); let mut extras3 = Extras::single_level(); extras3.set_metadata("unknown_group".as_bytes().to_owned()); - assert_eq!(resource_ctl.priority_of(&extras3), 50); + assert_eq!( + resource_ctl.priority_of(&extras3), + concat_priority_vt(MEDIUM_PRIORITY, 50) + ); assert_eq!( resource_ctl .resource_group("default".as_bytes()) @@ -482,7 +524,7 @@ pub(crate) mod tests { drop(group2); // test add 1 new resource group - let new_group = new_resource_group_ru("new_group".into(), 500); + let new_group = new_resource_group_ru("new_group".into(), 500, HIGH_PRIORITY); resource_manager.add_resource_group(new_group); assert_eq!(resource_ctl.resource_consumptions.read().len(), 4); @@ -497,7 +539,7 @@ pub(crate) mod tests { let 
resource_ctl = resource_manager.derive_controller("test_read".into(), true); let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); - let group1 = new_resource_group_ru("test1".into(), 5000); + let group1 = new_resource_group_ru("test1".into(), 5000, 0); resource_manager.add_resource_group(group1); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); assert_eq!( @@ -506,7 +548,7 @@ pub(crate) mod tests { ); // add a resource group with big ru - let group1 = new_resource_group_ru("test2".into(), 50000); + let group1 = new_resource_group_ru("test2".into(), 50000, 0); resource_manager.add_resource_group(group1); assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); @@ -530,10 +572,10 @@ pub(crate) mod tests { let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); for i in 0..5 { - let group1 = new_resource_group_ru(format!("test{}", i), 100); + let group1 = new_resource_group_ru(format!("test{}", i), 100, 0); resource_manager.add_resource_group(group1); // add a resource group with big ru - let group1 = new_resource_group_ru(format!("group{}", i), 100); + let group1 = new_resource_group_ru(format!("group{}", i), 100, 0); resource_manager.add_resource_group(group1); } // consume for default group @@ -566,4 +608,20 @@ pub(crate) mod tests { 0 ); } + + #[test] + fn test_concat_priority_vt() { + let v1 = concat_priority_vt(MEDIUM_PRIORITY, 1000); + let v2 = concat_priority_vt(MEDIUM_PRIORITY, 1111); + assert!(v1 < v2); + + let v3 = concat_priority_vt(LOW_PRIORITY, 1000); + assert!(v1 < v3); + + let v4 = concat_priority_vt(MEDIUM_PRIORITY, 1111); + assert_eq!(v2, v4); + + let v5 = concat_priority_vt(HIGH_PRIORITY, 10); + assert!(v5 < v1); + } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index a2d64f57c3b..82c01eae398 100644 --- 
a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -199,7 +199,7 @@ pub mod tests { let resource_manager = ResourceGroupManager::default(); let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); - let group = new_resource_group("TEST".into(), true, 100, 100); + let group = new_resource_group("TEST".into(), true, 100, 100, 0); add_resource_group(s.pd_client.clone(), group); block_on(s.reload_all_resource_groups()); assert_eq!(s.manager.get_all_resource_groups().len(), 1); @@ -244,12 +244,12 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group_ru("TEST1".into(), 100); + let group1 = new_resource_group_ru("TEST1".into(), 100, 0); add_resource_group(s.pd_client.clone(), group1); - let group2 = new_resource_group_ru("TEST2".into(), 100); + let group2 = new_resource_group_ru("TEST2".into(), 100, 0); add_resource_group(s.pd_client.clone(), group2); // Mock modify - let group2 = new_resource_group_ru("TEST2".into(), 50); + let group2 = new_resource_group_ru("TEST2".into(), 50, 0); add_resource_group(s.pd_client.clone(), group2); wait_watch_ready(&s, 2); @@ -286,7 +286,7 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group_ru("TEST1".into(), 100); + let group1 = new_resource_group_ru("TEST1".into(), 100, 0); add_resource_group(s.pd_client.clone(), group1); // Mock reboot watch server let watch_global_config_fp = "watch_global_config_return"; @@ -294,7 +294,7 @@ pub mod tests { std::thread::sleep(Duration::from_millis(100)); fail::remove(watch_global_config_fp); // Mock add after rebooting will success - let group1 = new_resource_group_ru("TEST2".into(), 100); + let group1 = new_resource_group_ru("TEST2".into(), 100, 0); add_resource_group(s.pd_client.clone(), group1); // Wait watcher update std::thread::sleep(Duration::from_secs(1)); diff --git a/src/server/status_server/mod.rs 
b/src/server/status_server/mod.rs index 2beed27de8b..2ce7a8714c0 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -707,14 +707,15 @@ where } #[derive(Serialize)] -struct ResouceGroupSetting { +struct ResourceGroupSetting { name: String, ru: u64, + priority: u32, burst_limit: i64, } -fn into_debug_request_group(rg: ResourceGroup) -> ResouceGroupSetting { - ResouceGroupSetting { +fn into_debug_request_group(rg: ResourceGroup) -> ResourceGroupSetting { + ResourceGroupSetting { name: rg.name, ru: rg .r_u_settings @@ -722,6 +723,7 @@ fn into_debug_request_group(rg: ResourceGroup) -> ResouceGroupSetting { .get_r_u() .get_settings() .get_fill_rate(), + priority: rg.priority, burst_limit: rg .r_u_settings .get_ref() From 7b1fe9df07e6e58c759231b668f2765bdeddb583 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Mon, 20 Mar 2023 12:20:40 +0800 Subject: [PATCH 596/676] PITR: support modifying the config tikv.import.memory-use-ratio online when restore point. 
(#14408) close tikv/tikv#14409 Signed-off-by: joccau Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/server/src/server.rs | 7 ++ components/server/src/server2.rs | 7 ++ components/sst_importer/Cargo.toml | 1 + components/sst_importer/src/config.rs | 53 +++++++++++-- components/sst_importer/src/lib.rs | 2 +- components/sst_importer/src/sst_importer.rs | 85 +++++++++++++++++++-- src/config/mod.rs | 2 +- src/import/sst_service.rs | 26 +++++-- 9 files changed, 160 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f313d747187..4265565e353 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5499,6 +5499,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", + "online_config", "openssl", "prometheus", "rand 0.8.5", diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e77197a7737..b9563f295b5 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1250,6 +1250,8 @@ where LocalTablets::Singleton(engines.engines.kv.clone()), servers.importer.clone(), ); + let import_cfg_mgr = import_service.get_config_manager(); + if servers .server .register_service(create_import_sst(import_service)) @@ -1258,6 +1260,11 @@ where fatal!("failed to register import service"); } + self.cfg_controller + .as_mut() + .unwrap() + .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + // Debug service. 
let debug_service = DebugService::new( engines.engines.clone(), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 9ae032dca7a..ef38c3e2286 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -989,6 +989,8 @@ where LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), servers.importer.clone(), ); + let import_cfg_mgr = import_service.get_config_manager(); + if servers .server .register_service(create_import_sst(import_service)) @@ -997,6 +999,11 @@ where fatal!("failed to register import service"); } + self.cfg_controller + .as_mut() + .unwrap() + .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + // Create Diagnostics service let diag_service = DiagnosticsService::new( servers.server.get_debug_thread_pool().clone(), diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index a21a58c0a6c..8e2799b7437 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -29,6 +29,7 @@ keys = { workspace = true } kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } +online_config = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } rand = "0.8" diff --git a/components/sst_importer/src/config.rs b/components/sst_importer/src/config.rs index ac789e2f4ae..7434c5cf0cd 100644 --- a/components/sst_importer/src/config.rs +++ b/components/sst_importer/src/config.rs @@ -1,10 +1,15 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{error::Error, result::Result}; +use std::{ + error::Error, + result::Result, + sync::{Arc, RwLock}, +}; -use tikv_util::config::ReadableDuration; +use online_config::{self, OnlineConfig}; +use tikv_util::{config::ReadableDuration, HandyRwLock}; -#[derive(Clone, Serialize, Deserialize, PartialEq, Debug)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { @@ -47,12 +52,48 @@ impl Config { self.stream_channel_window = default_cfg.stream_channel_window; } if self.memory_use_ratio > 0.5 || self.memory_use_ratio < 0.0 { + return Err("import.mem_ratio should belong to [0.0, 0.5].".into()); + } + Ok(()) + } +} + +#[derive(Clone)] +pub struct ConfigManager(pub Arc>); + +impl ConfigManager { + pub fn new(cfg: Config) -> Self { + ConfigManager(Arc::new(RwLock::new(cfg))) + } +} + +impl online_config::ConfigManager for ConfigManager { + fn dispatch(&mut self, change: online_config::ConfigChange) -> online_config::Result<()> { + info!( + "import config changed"; + "change" => ?change, + ); + + let mut cfg = self.rl().clone(); + cfg.update(change)?; + + if let Err(e) = cfg.validate() { warn!( - "import.mem_ratio should belong to [0.0, 0.5], change it to {}", - default_cfg.memory_use_ratio, + "import config changed"; + "change" => ?cfg, ); - self.memory_use_ratio = default_cfg.memory_use_ratio; + return Err(e); } + + *self.wl() = cfg; Ok(()) } } + +impl std::ops::Deref for ConfigManager { + type Target = RwLock; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index 4d25201253a..e073ff941ae 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -24,7 +24,7 @@ pub mod metrics; pub mod sst_importer; pub use self::{ - config::Config, + config::{Config, ConfigManager}, errors::{error_inc, Error, Result}, import_file::sst_meta_to_path, 
sst_importer::SstImporter, diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 0da45c195be..5b55974dff3 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -36,9 +36,9 @@ use tikv_util::{ bytes::{decode_bytes_in_place, encode_bytes}, stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, }, - config::ReadableSize, sys::{thread::ThreadBuildWrapper, SysQuota}, time::{Instant, Limiter}, + HandyRwLock, }; use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, TimeStamp, WriteRef}; @@ -49,7 +49,7 @@ use crate::{ import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, sst_writer::{RawSstWriter, TxnSstWriter}, - util, Config, Error, Result, + util, Config, ConfigManager as ImportConfigManager, Error, Result, }; pub struct LoadedFile { @@ -278,7 +278,7 @@ pub struct SstImporter { download_rt: Runtime, file_locks: Arc>, mem_use: Arc, - mem_limit: ReadableSize, + mem_limit: Arc, } impl SstImporter { @@ -308,8 +308,12 @@ impl SstImporter { .build()?; download_rt.spawn(cached_storage.gc_loop()); - let memory_limit = (SysQuota::memory_limit_in_bytes() as f64) * cfg.memory_use_ratio; - info!("sst importer memory limit when apply"; "size" => ?memory_limit); + let memory_limit = Self::calcualte_usage_mem(cfg.memory_use_ratio); + info!( + "sst importer memory limit when apply"; + "ratio" => cfg.memory_use_ratio, + "size" => ?memory_limit, + ); Ok(SstImporter { dir: ImportDir::new(root)?, @@ -321,10 +325,14 @@ impl SstImporter { cached_storage, download_rt, mem_use: Arc::new(AtomicU64::new(0)), - mem_limit: ReadableSize(memory_limit as u64), + mem_limit: Arc::new(AtomicU64::new(memory_limit)), }) } + fn calcualte_usage_mem(mem_ratio: f64) -> u64 { + ((SysQuota::memory_limit_in_bytes() as f64) * mem_ratio) as u64 + } + pub fn set_compression_type( &mut self, cf_name: CfName, @@ -583,6 +591,19 @@ impl SstImporter { Ok(()) } + pub fn 
update_config_memory_use_ratio(&self, cfg_mgr: &ImportConfigManager) { + let mem_ratio = cfg_mgr.rl().memory_use_ratio; + let memory_limit = Self::calcualte_usage_mem(mem_ratio); + + if self.mem_limit.load(Ordering::SeqCst) != memory_limit { + self.mem_limit.store(memory_limit, Ordering::SeqCst); + info!("update importer config"; + "memory-use-ratio" => mem_ratio, + "size" => memory_limit, + ) + } + } + pub fn shrink_by_tick(&self) -> usize { let mut shrink_buff_size: usize = 0; let mut retain_buff_size: usize = 0; @@ -643,7 +664,7 @@ impl SstImporter { // If mem_limit is 0, which represent download kv-file when import. // Or read kv-file into buffer directly. pub fn import_support_download(&self) -> bool { - self.mem_limit == ReadableSize(0) + self.mem_limit.load(Ordering::SeqCst) == 0 } fn request_memory(&self, meta: &KvMeta) -> Option { @@ -651,7 +672,7 @@ impl SstImporter { let old = self.mem_use.fetch_add(size, Ordering::SeqCst); // If the memory is limited, roll backup the mem_use and return false. - if old + size > self.mem_limit.0 { + if old + size > self.mem_limit.load(Ordering::SeqCst) { self.mem_use.fetch_sub(size, Ordering::SeqCst); CACHE_EVENT.with_label_values(&["out-of-quota"]).inc(); None @@ -1449,6 +1470,7 @@ mod tests { }; use external_storage_export::read_external_storage_info_buff; use file_system::File; + use online_config::{ConfigManager, OnlineConfig}; use openssl::hash::{Hasher, MessageDigest}; use tempfile::Builder; use test_sst_importer::*; @@ -1958,6 +1980,53 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::TimedOut); } + #[test] + fn test_update_config_memory_use_ratio() { + // create SstImpoter with default. + let cfg = Config { + memory_use_ratio: 0.3, + ..Default::default() + }; + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new(&cfg, import_dir, None, ApiVersion::V1).unwrap(); + let mem_limit_old = importer.mem_limit.load(Ordering::SeqCst); + + // create new config and get the diff config. 
+ let cfg_new = Config { + memory_use_ratio: 0.1, + ..Default::default() + }; + let change = cfg.diff(&cfg_new); + + // create config manager and update config. + let mut cfg_mgr = ImportConfigManager::new(cfg); + cfg_mgr.dispatch(change).unwrap(); + importer.update_config_memory_use_ratio(&cfg_mgr); + + let mem_limit_new = importer.mem_limit.load(Ordering::SeqCst); + assert!(mem_limit_old > mem_limit_new); + assert_eq!( + mem_limit_old / 3, + mem_limit_new, + "mem_limit_old / 3 = {} mem_limit_new = {}", + mem_limit_old / 3, + mem_limit_new + ); + } + + #[test] + fn test_update_config_with_invalid_conifg() { + let cfg = Config::default(); + let cfg_new = Config { + memory_use_ratio: -0.1, + ..Default::default() + }; + let change = cfg.diff(&cfg_new); + let mut cfg_mgr = ImportConfigManager::new(cfg); + let r = cfg_mgr.dispatch(change); + assert!(r.is_err()); + } + #[test] fn test_do_read_kv_file() { // create a sample kv file. diff --git a/src/config/mod.rs b/src/config/mod.rs index c1c38e39d77..3eb15ba8ace 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3127,7 +3127,7 @@ pub struct TikvConfig { #[online_config(skip)] pub security: SecurityConfig, - #[online_config(skip)] + #[online_config(submodule)] pub import: ImportConfig, #[online_config(submodule)] diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 291841facde..b23046bfe4b 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -26,8 +26,8 @@ use kvproto::{ kvrpcpb::Context, }; use sst_importer::{ - error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, Error, Result, - SstImporter, + error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, ConfigManager, + Error, Result, SstImporter, }; use tikv_kv::{Engine, Modify, SnapContext, Snapshot, SnapshotExt, WriteData, WriteEvent}; use tikv_util::{ @@ -35,6 +35,7 @@ use tikv_util::{ future::create_stream_with_buffer, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, 
+ HandyRwLock, }; use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; @@ -85,7 +86,7 @@ async fn wait_write(mut s: impl Stream + Send + Unpin) -> sto /// raftstore to trigger the ingest process. #[derive(Clone)] pub struct ImportSstService { - cfg: Config, + cfg: ConfigManager, tablets: LocalTablets, engine: E, threads: Arc, @@ -296,10 +297,12 @@ impl ImportSstService { if let LocalTablets::Singleton(tablet) = &tablets { importer.start_switch_mode_check(threads.handle(), tablet.clone()); } - threads.spawn(Self::tick(importer.clone())); + + let cfg_mgr = ConfigManager::new(cfg); + threads.spawn(Self::tick(importer.clone(), cfg_mgr.clone())); ImportSstService { - cfg, + cfg: cfg_mgr, tablets, threads: Arc::new(threads), block_threads: Arc::new(block_threads), @@ -311,9 +314,15 @@ impl ImportSstService { } } - async fn tick(importer: Arc) { + pub fn get_config_manager(&self) -> ConfigManager { + self.cfg.clone() + } + + async fn tick(importer: Arc, cfg: ConfigManager) { loop { sleep(Duration::from_secs(10)).await; + + importer.update_config_memory_use_ratio(&cfg); importer.shrink_by_tick(); } } @@ -544,7 +553,7 @@ macro_rules! 
impl_write { let import = self.importer.clone(); let tablets = self.tablets.clone(); let (rx, buf_driver) = - create_stream_with_buffer(stream, self.cfg.stream_channel_window); + create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut rx = rx.map_err(Error::from); let timer = Instant::now_coarse(); @@ -652,7 +661,8 @@ impl ImportSst for ImportSstService { let label = "upload"; let timer = Instant::now_coarse(); let import = self.importer.clone(); - let (rx, buf_driver) = create_stream_with_buffer(stream, self.cfg.stream_channel_window); + let (rx, buf_driver) = + create_stream_with_buffer(stream, self.cfg.rl().stream_channel_window); let mut map_rx = rx.map_err(Error::from); let handle_task = async move { From 4dc1a5a94b0f88257a11c0733937a4892c70518d Mon Sep 17 00:00:00 2001 From: Connor Date: Mon, 20 Mar 2023 13:40:40 +0800 Subject: [PATCH 597/676] grafana: fix grafana display anomaly (#14428) close tikv/tikv#14427 Fix grafana display anomaly. The `pessimistic lock activities` panel's id is the same as that of `gRPC resource group QPS` panel which makes grafana display anomaly. So change the duplicated id. 
Signed-off-by: Connor1996 --- metrics/grafana/tikv_details.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index f404ebc5376..9600222547e 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -5797,7 +5797,7 @@ "y": 37 }, "hiddenSeries": false, - "id": 23763573091, + "id": 23763573090, "legend": { "alignAsTable": true, "avg": false, From 5a8f970d313a6dc640ee02a9fc71020b1215a31a Mon Sep 17 00:00:00 2001 From: lijie Date: Mon, 20 Mar 2023 14:48:07 +0800 Subject: [PATCH 598/676] feat: bump version to 7.1.0-alpha (#14431) Signed-off-by: lijie --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4265565e353..14e351effba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6316,7 +6316,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.7.0-alpha" +version = "7.1.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 57a2ab4eced..f8e67d70c04 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.7.0-alpha" +version = "7.1.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 27f4d8c9fa86ee7c1e7631c42c869632db418d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 22 Mar 2023 16:36:42 +0800 Subject: [PATCH 599/676] backup-stream: don't close the server stream when encountered errors (#14432) close tikv/tikv#14426 Signed-off-by: hillium Co-authored-by: Ti Chi Robot --- .../backup-stream/src/checkpoint_manager.rs | 206 ++++++++++++++++-- components/backup-stream/src/service.rs | 7 +- 2 files changed, 191 insertions(+), 22 deletions(-) diff --git a/components/backup-stream/src/checkpoint_manager.rs 
b/components/backup-stream/src/checkpoint_manager.rs index 50a6ac27864..d32c2ea7c00 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -7,7 +7,7 @@ use futures::{ future::BoxFuture, FutureExt, SinkExt, StreamExt, }; -use grpcio::{RpcStatus, RpcStatusCode, ServerStreamingSink, WriteFlags}; +use grpcio::{RpcStatus, RpcStatusCode, WriteFlags}; use kvproto::{ errorpb::{Error as PbError, *}, logbackuppb::{FlushEvent, SubscribeFlushEventResponse}, @@ -20,7 +20,7 @@ use uuid::Uuid; use crate::{ annotate, - errors::{Error, ReportableResult, Result}, + errors::{Error, Result}, future, metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, metrics, @@ -51,9 +51,11 @@ impl std::fmt::Debug for CheckpointManager { enum SubscriptionOp { Add(Subscription), Emit(Box<[FlushEvent]>), + #[cfg(test)] + Inspect(Box), } -struct SubscriptionManager { +pub struct SubscriptionManager { subscribers: HashMap, input: Receiver, } @@ -72,8 +74,13 @@ impl SubscriptionManager { SubscriptionOp::Emit(events) => { self.emit_events(events).await; } + #[cfg(test)] + SubscriptionOp::Inspect(f) => { + f(&self); + } } } + // NOTE: Maybe close all subscription streams here. 
} async fn emit_events(&mut self, events: Box<[FlushEvent]>) { @@ -89,14 +96,9 @@ impl SubscriptionManager { sub.flush().await }; - match send_all.await { - Err(grpcio::Error::RemoteStopped) => { - canceled.push(*id); - } - Err(err) => { - Error::from(err).report("sending subscription"); - } - _ => {} + if let Err(err) = send_all.await { + canceled.push(*id); + Error::from(err).report("sending subscription"); } } @@ -107,11 +109,10 @@ impl SubscriptionManager { async fn remove_subscription(&mut self, id: &Uuid) { match self.subscribers.remove(id) { - Some(mut sub) => { + Some(sub) => { info!("client is gone, removing subscription"; "id" => %id); - sub.close() - .await - .report_if_err(format_args!("during removing subscription {}", id)) + // The stream is an endless stream -- we don't need to close it. + drop(sub); } None => { warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %id); @@ -121,7 +122,12 @@ impl SubscriptionManager { } // Note: can we make it more generic...? -pub type Subscription = ServerStreamingSink; +#[cfg(not(test))] +pub type Subscription = + grpcio::ServerStreamingSink; + +#[cfg(test)] +pub type Subscription = tests::MockSink; /// The result of getting a checkpoint. /// The possibility of failed to getting checkpoint is pretty high: @@ -201,7 +207,7 @@ impl CheckpointManager { /// update a region checkpoint in need. 
#[cfg(test)] - pub fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { + fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { Self::update_ts(&mut self.checkpoint_ts, region.clone(), checkpoint) } @@ -326,6 +332,29 @@ impl CheckpointManager { pub fn get_resolved_ts(&self) -> Option { self.resolved_ts.values().map(|x| x.checkpoint).min() } + + #[cfg(test)] + fn sync_with_subs_mgr( + &mut self, + f: impl FnOnce(&SubscriptionManager) -> T + Send + 'static, + ) -> T { + use std::sync::Mutex; + + let (tx, rx) = std::sync::mpsc::sync_channel(1); + let t = Arc::new(Mutex::new(None)); + let tr = Arc::clone(&t); + self.manager_handle + .as_mut() + .unwrap() + .try_send(SubscriptionOp::Inspect(Box::new(move |x| { + *tr.lock().unwrap() = Some(f(x)); + tx.send(()).unwrap(); + }))) + .unwrap(); + rx.recv().unwrap(); + let mut t = t.lock().unwrap(); + t.take().unwrap() + } } fn not_leader(r: u64) -> PbError { @@ -525,17 +554,21 @@ pub mod tests { use std::{ assert_matches, collections::HashMap, - sync::{Arc, RwLock}, + sync::{Arc, Mutex, RwLock}, time::Duration, }; - use futures::future::ok; - use kvproto::metapb::*; + use futures::{future::ok, Sink}; + use grpcio::{RpcStatus, RpcStatusCode}; + use kvproto::{logbackuppb::SubscribeFlushEventResponse, metapb::*}; use pd_client::{PdClient, PdFuture}; use txn_types::TimeStamp; use super::{BasicFlushObserver, FlushObserver, RegionIdWithVersion}; - use crate::GetCheckpointResult; + use crate::{ + subscription_track::{CheckpointType, ResolveResult}, + GetCheckpointResult, + }; fn region(id: u64, version: u64, conf_version: u64) -> Region { let mut r = Region::new(); @@ -547,6 +580,137 @@ pub mod tests { r } + #[derive(Clone)] + pub struct MockSink(Arc>); + + impl MockSink { + fn with_fail_once(code: RpcStatusCode) -> Self { + let mut failed = false; + let inner = MockSinkInner { + items: Vec::default(), + closed: false, + on_error: Box::new(move || { + if failed { + 
RpcStatusCode::OK + } else { + failed = true; + code + } + }), + }; + Self(Arc::new(Mutex::new(inner))) + } + + fn trivial() -> Self { + let inner = MockSinkInner { + items: Vec::default(), + closed: false, + on_error: Box::new(|| RpcStatusCode::OK), + }; + Self(Arc::new(Mutex::new(inner))) + } + + pub async fn fail(&self, status: RpcStatus) -> crate::errors::Result<()> { + panic!("failed in a case should never fail: {}", status); + } + } + + struct MockSinkInner { + items: Vec, + closed: bool, + on_error: Box grpcio::RpcStatusCode + Send>, + } + + impl Sink<(SubscribeFlushEventResponse, grpcio::WriteFlags)> for MockSink { + type Error = grpcio::Error; + + fn poll_ready( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Ok(()).into() + } + + fn start_send( + self: std::pin::Pin<&mut Self>, + item: (SubscribeFlushEventResponse, grpcio::WriteFlags), + ) -> Result<(), Self::Error> { + let mut guard = self.0.lock().unwrap(); + let code = (guard.on_error)(); + if code != RpcStatusCode::OK { + return Err(grpcio::Error::RpcFailure(RpcStatus::new(code))); + } + guard.items.push(item.0); + Ok(()) + } + + fn poll_flush( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + Ok(()).into() + } + + fn poll_close( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let mut guard = self.0.lock().unwrap(); + guard.closed = true; + Ok(()).into() + } + } + + fn simple_resolve_result() -> ResolveResult { + let mut region = Region::new(); + region.set_id(42); + ResolveResult { + region, + checkpoint: 42.into(), + checkpoint_type: CheckpointType::MinTs, + } + } + + #[test] + fn test_rpc_sub() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .build() + .unwrap(); + let mut mgr = super::CheckpointManager::default(); + rt.spawn(mgr.spawn_subscription_mgr()); + + let trivial_sink = MockSink::trivial(); + 
rt.block_on(mgr.add_subscriber(trivial_sink.clone())) + .unwrap(); + + mgr.resolve_regions(vec![simple_resolve_result()]); + mgr.flush(); + mgr.sync_with_subs_mgr(|_| {}); + assert_eq!(trivial_sink.0.lock().unwrap().items.len(), 1); + } + + #[test] + fn test_rpc_failure() { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .build() + .unwrap(); + let mut mgr = super::CheckpointManager::default(); + rt.spawn(mgr.spawn_subscription_mgr()); + + let error_sink = MockSink::with_fail_once(RpcStatusCode::INTERNAL); + rt.block_on(mgr.add_subscriber(error_sink.clone())).unwrap(); + + mgr.resolve_regions(vec![simple_resolve_result()]); + mgr.flush(); + assert_eq!(mgr.sync_with_subs_mgr(|item| { item.subscribers.len() }), 0); + let sink = error_sink.0.lock().unwrap(); + assert_eq!(sink.items.len(), 0); + // The stream shouldn't be closed when exit by a failure. + assert_eq!(sink.closed, false); + } + #[test] fn test_flush() { let mut mgr = super::CheckpointManager::default(); diff --git a/components/backup-stream/src/service.rs b/components/backup-stream/src/service.rs index 9d312a984d1..43d4ede2f27 100644 --- a/components/backup-stream/src/service.rs +++ b/components/backup-stream/src/service.rs @@ -94,8 +94,13 @@ impl LogBackup for Service { &mut self, _ctx: grpcio::RpcContext<'_>, _req: kvproto::logbackuppb::SubscribeFlushEventRequest, - sink: grpcio::ServerStreamingSink, + #[allow(unused_variables)] sink: grpcio::ServerStreamingSink< + kvproto::logbackuppb::SubscribeFlushEventResponse, + >, ) { + #[cfg(test)] + panic!("Service should not be used in an unit test"); + #[cfg(not(test))] try_send!( self.endpoint, Task::RegionCheckpointsOp(RegionCheckpointOperation::Subscribe(sink)) From 4b2dda4823b5b329798a2aff1109167534560313 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Thu, 23 Mar 2023 18:28:43 +0800 Subject: [PATCH 600/676] storage: fix the apply write wal tracking time (#14444) ref tikv/tikv#12362 Fix the returned apply write wal tracking 
time. Signed-off-by: cfzjywxk --- components/tracker/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 56ce2aa3280..35ae0fc15f2 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -68,7 +68,7 @@ impl Tracker { detail.set_apply_log_nanos(self.metrics.apply_time_nanos - self.metrics.apply_wait_nanos); detail.set_apply_mutex_lock_nanos(self.metrics.apply_mutex_lock_nanos); detail.set_apply_write_leader_wait_nanos(self.metrics.apply_thread_wait_nanos); - detail.set_apply_write_wal_nanos(self.metrics.apply_wait_nanos); + detail.set_apply_write_wal_nanos(self.metrics.apply_write_wal_nanos); detail.set_apply_write_memtable_nanos(self.metrics.apply_write_memtable_nanos); } } From ffaf4862c2fedd1eaf154f1cdf057b00210670b1 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 24 Mar 2023 09:18:43 +0800 Subject: [PATCH 601/676] raftstore-v2: commit merge (#14253) ref tikv/tikv#12842, ref tikv/tikv#13818 Implement commit merge for raftstore-v2 Signed-off-by: tabokie Co-authored-by: tonyxuqqi --- Cargo.lock | 14 +- components/engine_panic/src/checkpoint.rs | 4 + components/engine_rocks/src/checkpoint.rs | 8 + components/engine_rocks/src/event_listener.rs | 4 +- components/engine_traits/src/checkpoint.rs | 2 + components/engine_traits/src/flush.rs | 15 +- components/raftstore-v2/src/batch/store.rs | 17 +- components/raftstore-v2/src/fsm/apply.rs | 9 +- components/raftstore-v2/src/fsm/peer.rs | 16 +- components/raftstore-v2/src/fsm/store.rs | 3 + .../operation/command/admin/compact_log.rs | 54 +- .../operation/command/admin/conf_change.rs | 4 + .../operation/command/admin/merge/commit.rs | 792 ++++++++++++++++++ .../src/operation/command/admin/merge/mod.rs | 37 +- .../operation/command/admin/merge/prepare.rs | 45 +- .../operation/command/admin/merge/rollback.rs | 12 + .../src/operation/command/admin/mod.rs | 9 +- .../src/operation/command/admin/split.rs | 12 
+- .../raftstore-v2/src/operation/command/mod.rs | 8 +- components/raftstore-v2/src/operation/life.rs | 44 +- components/raftstore-v2/src/operation/mod.rs | 7 +- .../raftstore-v2/src/operation/ready/mod.rs | 7 + .../src/operation/ready/snapshot.rs | 4 + .../raftstore-v2/src/operation/txn_ext.rs | 14 + components/raftstore-v2/src/raft/peer.rs | 9 + components/raftstore-v2/src/raft/storage.rs | 3 +- components/raftstore-v2/src/router/message.rs | 15 +- components/raftstore-v2/src/router/mod.rs | 5 +- .../raftstore-v2/tests/failpoints/mod.rs | 1 + .../tests/failpoints/test_merge.rs | 109 +++ .../tests/integrations/cluster.rs | 63 +- .../raftstore-v2/tests/integrations/mod.rs | 1 + .../tests/integrations/test_merge.rs | 113 +++ .../tests/integrations/test_split.rs | 59 +- components/test_raftstore-v2/Cargo.toml | 2 +- components/test_raftstore-v2/src/cluster.rs | 147 +++- components/test_raftstore-v2/src/lib.rs | 1 + components/test_raftstore-v2/src/server.rs | 16 +- components/test_raftstore/src/util.rs | 6 +- tests/failpoints/cases/test_merge.rs | 2 +- tests/integrations/raftstore/test_merge.rs | 431 ++++++---- 41 files changed, 1852 insertions(+), 272 deletions(-) create mode 100644 components/raftstore-v2/src/operation/command/admin/merge/commit.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/merge/rollback.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_merge.rs create mode 100644 components/raftstore-v2/tests/integrations/test_merge.rs diff --git a/Cargo.lock b/Cargo.lock index 14e351effba..e12ee05562d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1792,12 +1792,12 @@ dependencies = [ [[package]] name = "fail" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" +checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" dependencies = [ - "lazy_static", "log", + 
"once_cell", "rand 0.8.5", ] @@ -2712,7 +2712,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#b47a4830141f7c8d2719db0f0184652e692eb672" +source = "git+https://github.com/pingcap/kvproto.git#df1ae63d0cfe2f5e01d2016a1839a7e88ef2da38" dependencies = [ "futures 0.3.15", "grpcio", @@ -2841,7 +2841,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" +source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2860,7 +2860,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" +source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" dependencies = [ "bzip2-sys", "cc", @@ -4779,7 +4779,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#9e4678857e5b4c738e95c7ee1a35ee962264f4e9" +source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/components/engine_panic/src/checkpoint.rs b/components/engine_panic/src/checkpoint.rs index 6743810eb90..bed49c8e55b 100644 --- a/components/engine_panic/src/checkpoint.rs +++ b/components/engine_panic/src/checkpoint.rs @@ -15,6 +15,10 @@ impl Checkpointable for PanicEngine { fn new_checkpointer(&self) -> Result { panic!() } + + fn merge(&self, dbs: &[&Self]) -> Result<()> { + panic!() + } } impl Checkpointer for PanicCheckpointer { diff --git a/components/engine_rocks/src/checkpoint.rs b/components/engine_rocks/src/checkpoint.rs index 8b82043a392..0f86aa29945 100644 --- a/components/engine_rocks/src/checkpoint.rs +++ 
b/components/engine_rocks/src/checkpoint.rs @@ -15,6 +15,14 @@ impl Checkpointable for RocksEngine { Err(e) => Err(r2e(e)), } } + + fn merge(&self, dbs: &[&Self]) -> Result<()> { + let mut mopts = rocksdb::MergeInstanceOptions::default(); + mopts.merge_memtable = false; + mopts.allow_source_write = true; + let inner: Vec<_> = dbs.iter().map(|e| e.as_inner().as_ref()).collect(); + self.as_inner().merge_instances(&mopts, &inner).map_err(r2e) + } } pub struct RocksEngineCheckpointer(rocksdb::Checkpointer); diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index 23ff7cf5f50..1cbef379e3c 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -189,8 +189,10 @@ impl RocksPersistenceListener { impl rocksdb::EventListener for RocksPersistenceListener { fn on_memtable_sealed(&self, info: &MemTableInfo) { + // Note: first_seqno is effectively the smallest seqno of memtable. + // earliest_seqno has ambiguous semantics. 
self.0 - .on_memtable_sealed(info.cf_name().to_string(), info.earliest_seqno()); + .on_memtable_sealed(info.cf_name().to_string(), info.first_seqno()); } fn on_flush_completed(&self, job: &FlushJobInfo) { diff --git a/components/engine_traits/src/checkpoint.rs b/components/engine_traits/src/checkpoint.rs index 6ea3556938f..6b966d806fe 100644 --- a/components/engine_traits/src/checkpoint.rs +++ b/components/engine_traits/src/checkpoint.rs @@ -8,6 +8,8 @@ pub trait Checkpointable { type Checkpointer: Checkpointer; fn new_checkpointer(&self) -> Result; + + fn merge(&self, dbs: &[&Self]) -> Result<()>; } pub trait Checkpointer { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index d35233bc310..8b0566f2cfb 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -29,7 +29,7 @@ use crate::{data_cf_offset, RaftEngine, RaftLogBatch, DATA_CFS_LEN}; pub struct ApplyProgress { cf: String, apply_index: u64, - earliest_seqno: u64, + smallest_seqno: u64, } impl ApplyProgress { @@ -123,8 +123,8 @@ impl PersistenceListener { /// Called when memtable is frozen. /// - /// `earliest_seqno` should be the smallest seqno of the memtable. - pub fn on_memtable_sealed(&self, cf: String, earliest_seqno: u64) { + /// `smallest_seqno` should be the smallest seqno of the memtable. + pub fn on_memtable_sealed(&self, cf: String, smallest_seqno: u64) { // The correctness relies on the assumption that there will be only one // thread writting to the DB and increasing apply index. 
// Apply index will be set within DB lock, so it's correct even with manual @@ -133,16 +133,16 @@ impl PersistenceListener { let apply_index = self.state.applied_index.load(Ordering::SeqCst); let mut prs = self.progress.lock().unwrap(); let flushed = prs.last_flushed[offset]; - if flushed > earliest_seqno { + if flushed > smallest_seqno { panic!( "sealed seqno has been flushed {} {} {} <= {}", - cf, apply_index, earliest_seqno, flushed + cf, apply_index, smallest_seqno, flushed ); } prs.prs.push_back(ApplyProgress { cf, apply_index, - earliest_seqno, + smallest_seqno, }); } @@ -170,8 +170,7 @@ impl PersistenceListener { cursor.move_next(); continue; } - // Note flushed largest_seqno equals to earliest_seqno of next memtable. - if pr.earliest_seqno < largest_seqno { + if pr.smallest_seqno <= largest_seqno { match &mut flushed_pr { None => flushed_pr = cursor.remove_current(), Some(flushed_pr) => { diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 4833030fec3..3f7bf408aa8 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -51,7 +51,7 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, - operation::{SharedReadTablet, SPLIT_PREFIX}, + operation::{SharedReadTablet, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::{pd, tablet_gc}, @@ -62,6 +62,7 @@ use crate::{ pub struct StoreContext { /// A logger without any KV. It's clean for creating new PeerFSM. pub logger: Logger, + pub store_id: u64, pub coprocessor_host: CoprocessorHost, /// The transport for sending messages to peers on other stores. 
pub trans: T, @@ -392,6 +393,10 @@ impl StorePollerBuilder { continue; } let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; + // Keep the checkpoint even if source is destroyed. + if prefix == MERGE_SOURCE_PREFIX { + continue; + } let fsm = match peers.get(®ion_id) { Some((_, fsm)) => fsm, None => { @@ -405,14 +410,17 @@ impl StorePollerBuilder { if prefix == SPLIT_PREFIX { file_system::remove_dir_all(&path)?; continue; - } - if prefix.is_empty() { + } else if prefix == MERGE_IN_PROGRESS_PREFIX { + continue; + } else if prefix.is_empty() { // Stale split data can be deleted. if fsm.peer().storage().tablet_index() > tablet_index { file_system::remove_dir_all(&path)?; } + } else { + debug_assert!(false, "unexpected tablet prefix: {}", path.display()); + warn!(self.logger, "unexpected tablet prefix"; "path" => %path.display()); } - // TODO: handle other prefix } // TODO: list all available tablets and destroy those which are not in the // peers. @@ -432,6 +440,7 @@ where let cfg = self.cfg.value().clone(); let mut poll_ctx = StoreContext { logger: self.logger.clone(), + store_id: self.store_id, trans: self.trans.clone(), current_time: None, has_ready: false, diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index e1bf5169d55..2afd8fbf773 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -21,7 +21,7 @@ use tikv_util::{ }; use crate::{ - operation::DataTrace, + operation::{CatchUpLogs, DataTrace}, raft::Apply, router::{ApplyRes, ApplyTask, PeerMsg}, }; @@ -31,6 +31,8 @@ use crate::{ /// Using a trait to make signiture simpler. pub trait ApplyResReporter { fn report(&self, apply_res: ApplyRes); + + fn redirect_catch_up_logs(&self, c: CatchUpLogs); } impl, S: FsmScheduler> ApplyResReporter for Mailbox { @@ -38,6 +40,11 @@ impl, S: FsmScheduler> ApplyResReporter for M // TODO: check shutdown. 
let _ = self.force_send(PeerMsg::ApplyRes(apply_res)); } + + fn redirect_catch_up_logs(&self, c: CatchUpLogs) { + let msg = PeerMsg::RedirectCatchUpLogs(c); + let _ = self.force_send(msg); + } } /// Schedule task to `ApplyFsm`. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 77860b0ff49..5e10aa0ef72 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -217,7 +217,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::PdHeartbeat => self.on_pd_heartbeat(), PeerTick::CompactLog => self.on_compact_log_tick(false), PeerTick::SplitRegionCheck => self.on_split_region_check(), - PeerTick::CheckMerge => unimplemented!(), + PeerTick::CheckMerge => self.fsm.peer_mut().on_check_merge(self.store_ctx), PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), @@ -330,6 +330,20 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .fsm .peer_mut() .on_cleanup_import_sst(self.store_ctx, ssts), + PeerMsg::AskCommitMerge(req) => { + self.fsm.peer_mut().on_ask_commit_merge(self.store_ctx, req) + } + PeerMsg::AckCommitMerge { index, target_id } => { + self.fsm.peer_mut().on_ack_commit_merge(index, target_id) + } + PeerMsg::RejectCommitMerge { index } => { + self.fsm.peer_mut().on_reject_commit_merge(index) + } + PeerMsg::RedirectCatchUpLogs(c) => self + .fsm + .peer_mut() + .on_redirect_catch_up_logs(self.store_ctx, c), + PeerMsg::CatchUpLogs(c) => self.fsm.peer_mut().on_catch_up_logs(self.store_ctx, c), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index afb7aa5d0d8..4b4255b3d3e 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ 
b/components/raftstore-v2/src/fsm/store.rs @@ -276,6 +276,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .fsm .store .on_store_unreachable(self.store_ctx, to_store_id), + StoreMsg::AskCommitMerge(req) => { + self.fsm.store.on_ask_commit_merge(self.store_ctx, req) + } #[cfg(feature = "testexport")] StoreMsg::WaitFlush { region_id, ch } => { self.fsm.store.on_wait_flush(self.store_ctx, region_id, ch) diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index ed4d22a59b4..8ae195539b2 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -13,6 +13,8 @@ //! Updates truncated index, and compacts logs if the corresponding changes have //! been persisted in kvdb. +use std::path::PathBuf; + use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; use protobuf::Message; @@ -309,6 +311,33 @@ impl Peer { )); } + #[inline] + pub fn record_tombstone_tablet_path( + &mut self, + ctx: &StoreContext, + old_tablet: PathBuf, + new_tablet_index: u64, + ) { + info!( + self.logger, + "record tombstone tablet"; + "prev_tablet_path" => old_tablet.display(), + "new_tablet_index" => new_tablet_index + ); + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy_path( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + /// Returns if there's any tombstone being removed. 
#[inline] pub fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { @@ -369,15 +398,6 @@ impl Peer { mut res: CompactLogResult, ) { let first_index = self.entry_storage().first_index(); - if res.compact_index <= first_index { - debug!( - self.logger, - "compact index <= first index, no need to compact"; - "compact_index" => res.compact_index, - "first_index" => first_index, - ); - return; - } if let Some(i) = self.merge_context().and_then(|c| c.max_compact_log_index()) && res.compact_index > i { @@ -389,6 +409,22 @@ impl Peer { ); res.compact_index = i; } + if res.compact_index <= first_index { + debug!( + self.logger, + "compact index <= first index, no need to compact"; + "compact_index" => res.compact_index, + "first_index" => first_index, + ); + return; + } + assert!( + res.compact_index < self.compact_log_context().last_applying_index, + "{}: {}, {}", + SlogFormat(&self.logger), + res.compact_index, + self.compact_log_context().last_applying_index + ); // TODO: check entry_cache_warmup_state self.entry_storage_mut() .compact_entry_cache(res.compact_index); diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 7bc20068736..b2bea379299 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -192,6 +192,10 @@ impl Peer { self.maybe_schedule_gc_peer_tick(); } } + ctx.store_meta + .lock() + .unwrap() + .set_region(self.region(), true, &self.logger); ctx.coprocessor_host.on_region_changed( self.region(), RegionChangeEvent::Update(RegionChangeReason::ChangePeer), diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs new file mode 100644 index 00000000000..876ba5b1a95 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -0,0 
+1,792 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains merge related processing logic. +//! +//! ## Propose +//! +//! The proposal is initiated by the source region. After `PrepareMerge` is +//! applied, the source peer will send an `AskCommitMerge` message to the target +//! peer. (For simplicity, we send this message regardless of whether the target +//! peer is leader.) The message will also carry some source region logs that +//! may not be committed by some source peers. +//! +//! The source region cannot serve any writes until the merge is committed or +//! rollback-ed. This is guaranteed by `MergeContext::prepare_status`. +//! +//! ## Apply (`Apply::apply_commit_merge`) +//! +//! At first, target region will not apply the `CommitMerge` command. Instead +//! the apply progress will be paused and it redirects the log entries from +//! source region, as a `CatchUpLogs` message, to the local source region peer. +//! When the source region peer has applied all logs up to the prior +//! `PrepareMerge` command, it will signal the target peer. Here we use a +//! temporary channel instead of directly sending message between apply FSMs +//! like in v1. +//! +//! Here is a complete view of the process: +//! +//! ```text +//! | Store 1 | Store 2 | +//! | Source Peer | Target Leader | Source Peer | Target Peer | +//! | +//! apply PrepareMerge +//! \ +//! +--------------+ +//! `AskCommitMerge`\ +//! \ +//! propose CommitMerge ---------------> append CommitMerge +//! apply CommitMerge apply CommitMerge +//! on apply res /| +//! /| +------------+ | +//! +---------------+ | / `CatchUpLogs` | +//! / `AckCommitMerge` | / | +//! / (complete) append logs (pause) +//! destroy self | . +//! apply PrepareMerge . +//! | . +//! +-----------> (continue) +//! | | +//! destroy self (complete) +//! 
``` + +use std::{ + any::Any, + cmp, fs, io, + path::{Path, PathBuf}, +}; + +use crossbeam::channel::SendError; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry}; +use futures::channel::oneshot; +use kvproto::{ + metapb::Region, + raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CommitMergeRequest, RaftCmdRequest}, + raft_serverpb::{MergedRecord, PeerState, RegionLocalState}, +}; +use protobuf::Message; +use raft::{GetEntriesContext, Storage, INVALID_ID, NO_LIMIT}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{ + fsm::new_admin_request, metrics::PEER_ADMIN_CMD_COUNTER, util, ProposalContext, Transport, + }, + Result, +}; +use slog::{debug, error, info, Logger}; +use tikv_util::{ + config::ReadableDuration, + log::SlogFormat, + slog_panic, + store::{find_peer, region_on_same_stores}, + time::Instant, +}; + +use super::merge_source_path; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::{AdminCmdResult, SharedReadTablet}, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, +}; + +#[derive(Debug)] +pub struct CommitMergeResult { + pub index: u64, + // Only used to respond `CatchUpLogs` to source peer. + prepare_merge_index: u64, + source_path: PathBuf, + region_state: RegionLocalState, + source: Region, + source_safe_ts: u64, + tablet: Box, +} + +#[derive(Debug)] +pub struct CatchUpLogs { + target_region_id: u64, + merge: CommitMergeRequest, + // safe_ts. 
+ tx: oneshot::Sender, +} + +pub const MERGE_IN_PROGRESS_PREFIX: &str = "merge-in-progress"; + +struct MergeInProgressGuard(PathBuf); + +impl MergeInProgressGuard { + // `index` is the commit index of `CommitMergeRequest` + fn new( + logger: &Logger, + registry: &TabletRegistry, + target_region_id: u64, + index: u64, + tablet_path: &Path, + ) -> io::Result> { + let name = registry.tablet_name(MERGE_IN_PROGRESS_PREFIX, target_region_id, index); + let marker_path = registry.tablet_root().join(name); + if !marker_path.exists() { + if tablet_path.exists() { + return Ok(None); + } else { + fs::create_dir(&marker_path)?; + file_system::sync_dir(marker_path.parent().unwrap())?; + } + } else if tablet_path.exists() { + info!(logger, "remove incomplete merged tablet"; "path" => %tablet_path.display()); + fs::remove_dir_all(tablet_path)?; + } + Ok(Some(Self(marker_path))) + } + + fn defuse(self) -> io::Result<()> { + fs::remove_dir(&self.0)?; + file_system::sync_dir(self.0.parent().unwrap()) + } +} + +fn commit_of_merge(r: &CommitMergeRequest) -> u64 { + r.get_source_state().get_merge_state().get_commit() +} + +// Source peer initiates commit merge on target peer. +impl Peer { + // Called after applying `PrepareMerge`. + pub fn start_commit_merge(&mut self, store_ctx: &mut StoreContext) { + assert!(self.applied_merge_state().is_some()); + // Target already committed `CommitMerge`. + if let Some(c) = &self.merge_context().unwrap().catch_up_logs { + if self.catch_up_logs_ready(c) { + let c = self.merge_context_mut().catch_up_logs.take().unwrap(); + self.finish_catch_up_logs(store_ctx, c); + } + } else { + self.on_check_merge(store_ctx); + } + } + + // Match v1::on_check_merge. + pub fn on_check_merge(&mut self, store_ctx: &mut StoreContext) { + if !self.serving() || self.applied_merge_state().is_none() { + return; + } + self.add_pending_tick(PeerTick::CheckMerge); + self.ask_target_peer_to_commit_merge(store_ctx); + } + + // Match v1::schedule_merge. 
+ fn ask_target_peer_to_commit_merge(&mut self, store_ctx: &mut StoreContext) { + let state = self.applied_merge_state().unwrap(); + let target = state.get_target(); + let target_id = target.get_id(); + + let (min_index, _) = self.calculate_min_progress().unwrap(); + let low = cmp::max(min_index + 1, state.get_min_index()); + // TODO: move this into raft module. + // > over >= to include the PrepareMerge proposal. + let entries = if low > state.get_commit() { + Vec::new() + } else { + // TODO: fetch entries in async way + match self.storage().entries( + low, + state.get_commit() + 1, + NO_LIMIT, + GetEntriesContext::empty(false), + ) { + Ok(ents) => ents, + Err(e) => slog_panic!( + self.logger, + "failed to get merge entires"; + "err" => ?e, + "low" => low, + "commit" => state.get_commit() + ), + } + }; + + let target_peer = find_peer(target, store_ctx.store_id).unwrap(); + let mut request = new_admin_request(target.get_id(), target_peer.clone()); + request + .mut_header() + .set_region_epoch(target.get_region_epoch().clone()); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(AdminCmdType::CommitMerge); + admin.mut_commit_merge().set_entries(entries.into()); + admin + .mut_commit_merge() + .set_source_state(self.storage().region_state().clone()); + request.set_admin_request(admin); + // Please note that, here assumes that the unit of network isolation is store + // rather than peer. So a quorum stores of source region should also be the + // quorum stores of target region. Otherwise we need to enable proposal + // forwarding. + let msg = PeerMsg::AskCommitMerge(request); + // If target peer is destroyed, life.rs is responsible for telling us to + // rollback. 
+ match store_ctx.router.force_send(target_id, msg) { + Ok(_) => (), + Err(SendError(PeerMsg::AskCommitMerge(msg))) => { + if let Err(e) = store_ctx + .router + .force_send_control(StoreMsg::AskCommitMerge(msg)) + { + if store_ctx.router.is_shutdown() { + return; + } + slog_panic!( + self.logger, + "fails to send `AskCommitMerge` msg to store"; + "error" => ?e, + ); + } + } + _ => unreachable!(), + } + } +} + +// Target peer handles the commit merge request. +impl Peer { + pub fn on_ask_commit_merge( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) { + match self.validate_commit_merge(&req) { + Some(true) if self.is_leader() => { + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + } + Some(false) => { + let commit_merge = req.get_admin_request().get_commit_merge(); + let source_id = commit_merge.get_source_state().get_region().get_id(); + let _ = store_ctx.router.force_send( + source_id, + PeerMsg::RejectCommitMerge { + index: commit_of_merge(commit_merge), + }, + ); + } + _ => (), + } + } + + fn validate_commit_merge(&self, req: &RaftCmdRequest) -> Option { + let expected_epoch = req.get_header().get_region_epoch(); + let merge = req.get_admin_request().get_commit_merge(); + assert!(merge.has_source_state() && merge.get_source_state().has_merge_state()); + let source_region = merge.get_source_state().get_region(); + let region = self.region(); + if self + .storage() + .region_state() + .get_merged_records() + .iter() + .any(|p| p.get_source_region_id() == source_region.get_id()) + { + info!( + self.logger, + "ignore commit merge because peer is already in merged_records"; + "source" => ?source_region, + ); + None + } else if util::is_epoch_stale(expected_epoch, region.get_region_epoch()) { + info!( + self.logger, + "reject commit merge because of stale"; + "current_epoch" => ?region.get_region_epoch(), + "expected_epoch" => ?expected_epoch, + ); + Some(false) + } else if expected_epoch == 
region.get_region_epoch() { + assert!( + util::is_sibling_regions(source_region, region), + "{}: {:?}, {:?}", + SlogFormat(&self.logger), + source_region, + region + ); + assert!( + region_on_same_stores(source_region, region), + "{:?}, {:?}", + source_region, + region + ); + // Best effort. Remove when trim check is implemented. + if self.storage().has_dirty_data() { + info!(self.logger, "ignore commit merge because of dirty data"); + None + } else { + Some(true) + } + } else { + info!( + self.logger, + "ignore commit merge because self epoch is stale"; + "source" => ?source_region, + ); + None + } + } + + pub fn propose_commit_merge( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + let mut proposal_ctx = ProposalContext::empty(); + proposal_ctx.insert(ProposalContext::COMMIT_MERGE); + let data = req.write_to_bytes().unwrap(); + self.propose_with_ctx(store_ctx, data, proposal_ctx.to_vec()) + } +} + +impl Apply { + // Match v1::exec_commit_merge. + pub async fn apply_commit_merge( + &mut self, + req: &AdminRequest, + index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.commit_merge.all.inc(); + + self.flush(); + + // Note: compared to v1, doesn't validate region state from kvdb any more. 
+ let reg = self.tablet_registry(); + let merge = req.get_commit_merge(); + let merge_commit = commit_of_merge(merge); + let source_state = merge.get_source_state(); + let source_region = source_state.get_region(); + let source_path = merge_source_path(reg, source_region.get_id(), merge_commit); + let mut source_safe_ts = 0; + + let mut start_time = Instant::now_coarse(); + let mut wait_duration = None; + let force_send = (|| { + fail::fail_point!("force_send_catch_up_logs", |_| true); + false + })(); + if !source_path.exists() || force_send { + let (tx, rx) = oneshot::channel(); + self.res_reporter().redirect_catch_up_logs(CatchUpLogs { + target_region_id: self.region_id(), + merge: merge.clone(), + tx, + }); + match rx.await { + Ok(ts) => { + source_safe_ts = ts; + } + Err(_) => { + if tikv_util::thread_group::is_shutdown(!cfg!(test)) { + return futures::future::pending().await; + } else { + slog_panic!( + self.logger, + "source peer is missing when getting checkpoint for merge" + ); + } + } + } + let now = Instant::now_coarse(); + wait_duration = Some(now.saturating_duration_since(start_time)); + start_time = now; + }; + fail::fail_point!("after_acquire_source_checkpoint", |_| Err( + tikv_util::box_err!("fp") + )); + + info!( + self.logger, + "execute CommitMerge"; + "commit" => merge_commit, + "entries" => merge.get_entries().len(), + "index" => index, + "source_region" => ?source_region, + ); + + let ctx = TabletContext::new(source_region, None); + let source_tablet = reg + .tablet_factory() + .open_tablet(ctx, &source_path) + .unwrap_or_else(|e| { + slog_panic!(self.logger, "failed to open source checkpoint"; "err" => ?e); + }); + let open_time = Instant::now_coarse(); + + let mut region = self.region().clone(); + // Use a max value so that pd can ensure overlapped region has a priority. 
+ let version = cmp::max( + source_region.get_region_epoch().get_version(), + region.get_region_epoch().get_version(), + ) + 1; + region.mut_region_epoch().set_version(version); + if keys::enc_end_key(®ion) == keys::enc_start_key(source_region) { + region.set_end_key(source_region.get_end_key().to_vec()); + } else { + region.set_start_key(source_region.get_start_key().to_vec()); + } + + let path = reg.tablet_path(self.region_id(), index); + + // Avoid seqno jump back between self.tablet and the newly created tablet. + // If we are recovering, this flush would just be a noop. + self.tablet().flush_cfs(&[], true).unwrap(); + let flush_time = Instant::now_coarse(); + + let mut ctx = TabletContext::new(®ion, Some(index)); + ctx.flush_state = Some(self.flush_state().clone()); + let guard = MergeInProgressGuard::new(&self.logger, reg, self.region_id(), index, &path) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create MergeInProgressGuard"; + "path" => %path.display(), + "error" => ?e + ) + }); + let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); + if let Some(guard) = guard { + tablet + .merge(&[&source_tablet, self.tablet()]) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to merge tablet"; + "path" => %path.display(), + "error" => ?e + ) + }); + guard.defuse().unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to defuse MergeInProgressGuard"; + "path" => %path.display(), + "error" => ?e + ) + }); + } else { + info!(self.logger, "reuse merged tablet"); + } + let merge_time = Instant::now_coarse(); + fail::fail_point!("after_merge_source_checkpoint", |_| Err( + tikv_util::box_err!("fp") + )); + + info!( + self.logger, + "applied CommitMerge"; + "source_region" => ?source_region, + "wait" => ?wait_duration.map(|d| format!("{}", ReadableDuration(d))), + "open" => %ReadableDuration(open_time.saturating_duration_since(start_time)), + "merge" => %ReadableDuration(flush_time.saturating_duration_since(open_time)), + 
"flush" => %ReadableDuration(merge_time.saturating_duration_since(flush_time)), + ); + + self.set_tablet(tablet.clone()); + + let state = self.region_state_mut(); + state.set_region(region.clone()); + state.set_state(PeerState::Normal); + assert!(!state.has_merge_state()); + state.set_tablet_index(index); + let mut removed_records: Vec<_> = state.take_removed_records().into(); + removed_records.append(&mut source_state.get_removed_records().into()); + state.set_removed_records(removed_records.into()); + let mut merged_records: Vec<_> = state.take_merged_records().into(); + merged_records.append(&mut source_state.get_merged_records().into()); + state.set_merged_records(merged_records.into()); + let mut merged_record = MergedRecord::default(); + merged_record.set_source_region_id(source_region.get_id()); + merged_record.set_source_epoch(source_region.get_region_epoch().clone()); + merged_record.set_source_peers(source_region.get_peers().into()); + merged_record.set_target_region_id(region.get_id()); + merged_record.set_target_epoch(region.get_region_epoch().clone()); + merged_record.set_target_peers(region.get_peers().into()); + merged_record.set_index(index); + state.mut_merged_records().push(merged_record); + + PEER_ADMIN_CMD_COUNTER.commit_merge.success.inc(); + + Ok(( + AdminResponse::default(), + AdminCmdResult::CommitMerge(CommitMergeResult { + index, + prepare_merge_index: merge_commit, + source_path, + region_state: self.region_state().clone(), + source: source_region.to_owned(), + source_safe_ts, + tablet: Box::new(tablet), + }), + )) + } +} + +// Source peer catches up logs (optionally), and destroy itself. +impl Peer { + // Target peer. 
+ #[inline] + pub fn on_redirect_catch_up_logs( + &mut self, + store_ctx: &mut StoreContext, + catch_up_logs: CatchUpLogs, + ) { + let source_id = catch_up_logs.merge.get_source_state().get_region().get_id(); + assert_eq!(catch_up_logs.target_region_id, self.region_id()); + let _ = store_ctx + .router + .force_send(source_id, PeerMsg::CatchUpLogs(catch_up_logs)); + } + + // Match v1::on_catch_up_logs_for_merge. + pub fn on_catch_up_logs( + &mut self, + store_ctx: &mut StoreContext, + mut catch_up_logs: CatchUpLogs, + ) { + let source_id = catch_up_logs.merge.get_source_state().get_region().get_id(); + if source_id != self.region_id() { + slog_panic!( + self.logger, + "get unexpected catch_up_logs"; + "merge" => ?catch_up_logs.merge, + ); + } + + // Context would be empty if this peer hasn't applied PrepareMerge. + if let Some(cul) = self.merge_context().and_then(|c| c.catch_up_logs.as_ref()) { + slog_panic!( + self.logger, + "get conflicting catch_up_logs"; + "new" => ?catch_up_logs.merge, + "current" => ?cul.merge, + ); + } + if !self.catch_up_logs_ready(&catch_up_logs) { + // Directly append these logs to raft log and then commit them. 
+ match self.maybe_append_merge_entries(&catch_up_logs.merge) { + Some(last_index) => { + info!( + self.logger, + "append and commit entries to source region"; + "last_index" => last_index, + ); + self.set_has_ready(); + } + None => { + info!(self.logger, "no need to catch up logs"); + } + } + catch_up_logs.merge.clear_entries(); + self.merge_context_mut().catch_up_logs = Some(catch_up_logs); + } else { + self.finish_catch_up_logs(store_ctx, catch_up_logs); + } + } + + #[inline] + fn catch_up_logs_ready(&self, catch_up_logs: &CatchUpLogs) -> bool { + if let Some(state) = self.applied_merge_state() + && state.get_commit() == commit_of_merge(&catch_up_logs.merge) + { + assert_eq!( + state.get_target().get_id(), + catch_up_logs.target_region_id + ); + true + } else { + false + } + } + + fn maybe_append_merge_entries(&mut self, merge: &CommitMergeRequest) -> Option { + let mut entries = merge.get_entries(); + let merge_commit = commit_of_merge(merge); + if entries.is_empty() { + // Though the entries is empty, it is possible that one source peer has caught + // up the logs but commit index is not updated. If other source peers are + // already destroyed, so the raft group will not make any progress, namely the + // source peer can not get the latest commit index anymore. + // Here update the commit index to let source apply rest uncommitted entries. 
+            return if merge_commit > self.raft_group().raft.raft_log.committed {
+                self.raft_group_mut().raft.raft_log.commit_to(merge_commit);
+                Some(merge_commit)
+            } else {
+                None
+            };
+        }
+        let first = entries.first().unwrap();
+        // Make sure the appended entries start at an index no smaller than committed.
+        let mut log_idx = first.get_index() - 1;
+        debug!(
+            self.logger,
+            "append merge entries";
+            "log_index" => log_idx,
+            "merge_commit" => merge_commit,
+            "commit_index" => self.raft_group().raft.raft_log.committed,
+        );
+        if log_idx < self.raft_group().raft.raft_log.committed {
+            // There may be some logs not included in CommitMergeRequest's entries, like
+            // CompactLog, so the commit index may exceed the last index of the entries from
+            // CommitMergeRequest. In that case, there is no need to append.
+            if self.raft_group().raft.raft_log.committed - log_idx >= entries.len() as u64 {
+                return None;
+            }
+            entries = &entries[(self.raft_group().raft.raft_log.committed - log_idx) as usize..];
+            log_idx = self.raft_group().raft.raft_log.committed;
+        }
+        let log_term = self.index_term(log_idx);
+
+        let last_log = entries.last().unwrap();
+        if last_log.term > self.term() {
+            // Hack: In normal flow, when leader sends the entries, it will use a term
+            // that's not less than the last log term. And follower will update its states
+            // correctly. For merge, we append the log without raft, so we have to take care
+            // of term explicitly to get correct metadata.
+ info!( + self.logger, + "become follower for new logs"; + "new_log_term" => last_log.term, + "new_log_index" => last_log.index, + "term" => self.term(), + ); + self.raft_group_mut() + .raft + .become_follower(last_log.term, INVALID_ID); + } + + self.raft_group_mut() + .raft + .raft_log + .maybe_append(log_idx, log_term, merge_commit, entries) + .map(|(_, last_index)| last_index) + } + + #[inline] + fn finish_catch_up_logs(&mut self, store_ctx: &mut StoreContext, c: CatchUpLogs) { + let safe_ts = store_ctx + .store_meta + .lock() + .unwrap() + .region_read_progress + .get(&self.region_id()) + .unwrap() + .safe_ts(); + if c.tx.send(safe_ts).is_err() { + error!( + self.logger, + "failed to respond to merge target, are we shutting down?" + ); + } + self.take_merge_context(); + self.mark_for_destroy(None); + } +} + +impl Peer { + // Match v1::on_ready_commit_merge. + pub fn on_apply_res_commit_merge( + &mut self, + store_ctx: &mut StoreContext, + mut res: CommitMergeResult, + ) { + let region = res.region_state.get_region(); + assert!( + res.source.get_end_key() == region.get_end_key() + || res.source.get_start_key() == region.get_start_key() + ); + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; + let acquired_source_safe_ts_before = res.source_safe_ts > 0; + + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + if let Some(p) = meta.region_read_progress.get(&res.source.get_id()) { + res.source_safe_ts = p.safe_ts(); + } + meta.set_region(region, true, &self.logger); + let (reader, read_tablet) = meta.readers.get_mut(®ion.get_id()).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + region.clone(), + RegionChangeReason::CommitMerge, + res.index, + ); + + // Tablet should be updated in lock to match the epoch. 
+ *read_tablet = SharedReadTablet::new(tablet.clone()); + + // After the region commit merged, the region's key range is extended and the + // region's `safe_ts` should reset to `min(source_safe_ts, target_safe_ts)` + self.read_progress_mut().merge_safe_ts( + res.source_safe_ts, + res.index, + &store_ctx.coprocessor_host, + ); + self.txn_context() + .after_commit_merge(store_ctx, self.term(), region, &self.logger); + } + + // We could only have gotten safe ts by sending `CatchUpLogs` earlier. If we + // haven't, need to acknowledge that we have committed the merge, so that the + // source peer can destroy itself. Note that the timing is deliberately + // delayed after reading `store_ctx.meta` to get the source safe ts + // before its meta gets cleaned up. + if !acquired_source_safe_ts_before { + let _ = store_ctx.router.force_send( + res.source.get_id(), + PeerMsg::AckCommitMerge { + index: res.prepare_merge_index, + target_id: self.region_id(), + }, + ); + } + + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.index); + } + self.record_tombstone_tablet_path(store_ctx, res.source_path, res.index); + + // make approximate size and keys updated in time. + // the reason why follower need to update is that there is a issue that after + // merge and then transfer leader, the new leader may have stale size and keys. 
+ self.force_split_check(store_ctx); + self.region_buckets_info_mut().set_bucket_stat(None); + + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, res.index, &res.region_state) + .unwrap(); + self.storage_mut().set_region_state(res.region_state); + self.storage_mut() + .apply_trace_mut() + .on_admin_flush(res.index); + self.set_has_extra_write(); + + if self.is_leader() { + self.region_heartbeat_pd(store_ctx); + info!( + self.logger, + "notify pd with merge"; + "source_region" => ?res.source, + "target_region" => ?self.region(), + ); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + } + + // Called on source peer. + pub fn on_ack_commit_merge(&mut self, index: u64, target_id: u64) { + // We don't check it against merge state because source peer might just restart + // and haven't replayed `PrepareMerge` yet. + info!(self.logger, "destroy self on AckCommitMerge"; "index" => index, "target_id" => target_id); + self.take_merge_context(); + self.mark_for_destroy(None); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/merge/mod.rs b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs index a3895a1b435..0b198eec2a6 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/mod.rs @@ -1,11 +1,16 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+pub mod commit; pub mod prepare; +pub mod rollback; -use engine_traits::{KvEngine, RaftEngine}; +use std::path::PathBuf; + +use commit::CatchUpLogs; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{ raft_cmdpb::RaftCmdRequest, - raft_serverpb::{PeerState, RegionLocalState}, + raft_serverpb::{MergeState, PeerState, RegionLocalState}, }; use prepare::PrepareStatus; use raft::{ProgressState, INVALID_INDEX}; @@ -15,9 +20,24 @@ use tikv_util::box_err; use crate::raft::Peer; +pub const MERGE_SOURCE_PREFIX: &str = "merge-source"; + +// `index` is the commit index of `PrepareMergeRequest`, `commit` field of +// `CommitMergeRequest`. +fn merge_source_path( + registry: &TabletRegistry, + source_region_id: u64, + index: u64, +) -> PathBuf { + let tablet_name = registry.tablet_name(MERGE_SOURCE_PREFIX, source_region_id, index); + registry.tablet_root().join(tablet_name) +} + +/// This context is only used at source region. #[derive(Default)] pub struct MergeContext { prepare_status: Option, + catch_up_logs: Option, } impl MergeContext { @@ -70,7 +90,7 @@ impl Peer { } /// Returns (minimal matched, minimal committed) - pub fn calculate_min_progress(&self) -> Result<(u64, u64)> { + fn calculate_min_progress(&self) -> Result<(u64, u64)> { let (mut min_m, mut min_c) = (None, None); if let Some(progress) = self.raft_group().status().progress { for (id, pr) in progress.iter() { @@ -109,4 +129,15 @@ impl Peer { } Ok((min_m, min_c)) } + + #[inline] + fn applied_merge_state(&self) -> Option<&MergeState> { + self.merge_context().and_then(|ctx| { + if let Some(PrepareStatus::Applied(state)) = ctx.prepare_status.as_ref() { + Some(state) + } else { + None + } + }) + } } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index f9df2d9ea1a..f031ac5d20e 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -29,7 +29,7 @@ use std::mem; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, CF_LOCK}; +use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, CF_LOCK}; use kvproto::{ raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, CmdType, PrepareMergeRequest, PutRequest, @@ -46,12 +46,13 @@ use raftstore::{ Error, Result, }; use slog::{debug, info}; -use tikv_util::{box_err, log::SlogFormat, store::region_on_same_stores}; +use tikv_util::{box_err, log::SlogFormat, slog_panic, store::region_on_same_stores}; +use super::merge_source_path; use crate::{ batch::StoreContext, fsm::ApplyResReporter, - operation::AdminCmdResult, + operation::{AdminCmdResult, SimpleWriteReqDecoder}, raft::{Apply, Peer}, router::CmdResChannel, }; @@ -97,6 +98,7 @@ impl Peer { store_ctx: &mut StoreContext, mut req: RaftCmdRequest, ) -> Result { + // Best effort. Remove when trim check is implemented. if self.storage().has_dirty_data() { return Err(box_err!( "{} source peer has dirty data, try again later", @@ -245,11 +247,12 @@ impl Peer { if entry.get_data().is_empty() { continue; } - let cmd: RaftCmdRequest = - util::parse_data_at(entry.get_data(), entry.get_index(), "tag"); - if !cmd.has_admin_request() { - continue; - } + let Err(cmd) = SimpleWriteReqDecoder::new( + &self.logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ) else { continue }; let cmd_type = cmd.get_admin_request().get_cmd_type(); match cmd_type { AdminCmdType::TransferLeader @@ -458,6 +461,29 @@ impl Apply { PEER_ADMIN_CMD_COUNTER.prepare_merge.success.inc(); + let _ = self.flush(); + let tablet = self.tablet().clone(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e + ) + }); + let reg = self.tablet_registry(); + let path = merge_source_path(reg, self.region_id(), log_index); + // We might be replaying 
this command. + if !path.exists() { + checkpointer.create_at(&path, None, 0).unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %path.display(), + "error" => ?e + ) + }); + } + Ok(( AdminResponse::default(), AdminCmdResult::PrepareMerge(PrepareMergeResult { @@ -501,7 +527,6 @@ impl Peer { .enter_prepare_merge(res.state.get_commit()); self.merge_context_mut().prepare_status = Some(PrepareStatus::Applied(res.state)); - // TODO: self. - // update_merge_progress_on_apply_res_prepare_merge(store_ctx); + self.start_commit_merge(store_ctx); } } diff --git a/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs new file mode 100644 index 00000000000..ab571298bb0 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/merge/rollback.rs @@ -0,0 +1,12 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use slog::warn; + +use crate::raft::Peer; + +impl Peer { + pub fn on_reject_commit_merge(&mut self, index: u64) { + warn!(self.logger, "target peer rejected commit merge"; "index" => index); + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index fe84413ff28..f59a5e6e0f2 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -11,8 +11,11 @@ use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; -use merge::prepare::PrepareMergeResult; -pub use merge::MergeContext; +use merge::{commit::CommitMergeResult, prepare::PrepareMergeResult}; +pub use merge::{ + commit::{CatchUpLogs, MERGE_IN_PROGRESS_PREFIX}, + MergeContext, MERGE_SOURCE_PREFIX, +}; use 
protobuf::Message; use raftstore::{ store::{cmd_resp, fsm::apply, msg::ErrorCallback}, @@ -39,6 +42,7 @@ pub enum AdminCmdResult { CompactLog(CompactLogResult), UpdateGcPeers(UpdateGcPeersResult), PrepareMerge(PrepareMergeResult), + CommitMerge(CommitMergeResult), } impl Peer { @@ -140,6 +144,7 @@ impl Peer { self.propose(ctx, data) } AdminCmdType::PrepareMerge => self.propose_prepare_merge(ctx, req), + AdminCmdType::CommitMerge => self.propose_commit_merge(ctx, req), _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 82bae03f062..4560fa93689 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -260,6 +260,12 @@ impl Peer { } } + pub fn force_split_check(&mut self, ctx: &mut StoreContext) { + let control = self.split_flow_control_mut(); + control.size_diff_hint = ctx.cfg.region_split_check_diff().0 as i64; + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + pub fn on_request_split( &mut self, ctx: &mut StoreContext, @@ -873,7 +879,9 @@ mod test { use super::*; use crate::{ - fsm::ApplyResReporter, operation::test_util::create_tmp_importer, raft::Apply, + fsm::ApplyResReporter, + operation::{test_util::create_tmp_importer, CatchUpLogs}, + raft::Apply, router::ApplyRes, }; @@ -892,6 +900,8 @@ mod test { fn report(&self, apply_res: ApplyRes) { let _ = self.sender.send(apply_res); } + + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} } fn new_split_req(key: &[u8], id: u64, children: Vec) -> SplitRequest { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index f14c2c905a3..0337c0cf32a 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -63,8 +63,9 @@ mod control; mod write; pub use admin::{ - 
report_split_init_finish, temp_split_path, AdminCmdResult, CompactLogContext, MergeContext, - RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, + report_split_init_finish, temp_split_path, AdminCmdResult, CatchUpLogs, CompactLogContext, + MergeContext, RequestHalfSplit, RequestSplit, SplitFlowControl, SplitInit, + MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, }; pub use control::ProposalControl; use pd_client::{BucketMeta, BucketStat}; @@ -359,6 +360,7 @@ impl Peer { AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), AdminCmdResult::UpdateGcPeers(state) => self.on_apply_res_update_gc_peers(state), AdminCmdResult::PrepareMerge(res) => self.on_apply_res_prepare_merge(ctx, res), + AdminCmdResult::CommitMerge(res) => self.on_apply_res_commit_merge(ctx, res), } } self.region_buckets_info_mut() @@ -619,7 +621,7 @@ impl Apply { AdminCmdType::Split => self.apply_split(admin_req, log_index)?, AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, AdminCmdType::PrepareMerge => self.apply_prepare_merge(admin_req, log_index)?, - AdminCmdType::CommitMerge => unimplemented!(), + AdminCmdType::CommitMerge => self.apply_commit_merge(admin_req, log_index).await?, AdminCmdType::RollbackMerge => unimplemented!(), AdminCmdType::TransferLeader => { self.apply_transfer_leader(admin_req, entry.term)? 
diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 9e9cc2f5fc0..525be1991bd 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -194,6 +194,40 @@ impl Store { } } + #[inline] + pub fn on_ask_commit_merge( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ) where + EK: KvEngine, + ER: RaftEngine, + T: Transport, + { + let region_id = req.get_header().get_region_id(); + let mut raft_msg = Box::::default(); + raft_msg.set_region_id(region_id); + raft_msg.set_region_epoch(req.get_header().get_region_epoch().clone()); + raft_msg.set_to_peer(req.get_header().get_peer().clone()); + + // It will create the peer if it does not exist + self.on_raft_message(ctx, raft_msg); + + if let Err(SendError(PeerMsg::AskCommitMerge(req))) = ctx + .router + .force_send(region_id, PeerMsg::AskCommitMerge(req)) + { + let commit_merge = req.get_admin_request().get_commit_merge(); + let source_id = commit_merge.get_source().get_id(); + let _ = ctx.router.force_send( + source_id, + PeerMsg::RejectCommitMerge { + index: commit_merge.get_commit(), + }, + ); + } + } + /// When a message's recipient doesn't exist, it will be redirected to /// store. Store is responsible for checking if it's neccessary to create /// a peer to handle the message. @@ -256,10 +290,12 @@ impl Store { } if msg.has_extra_msg() { let extra_msg = msg.get_extra_msg(); + // Only the direct request has `is_tombstone` set to false. We are certain this + // message needs to be forwarded. 
if extra_msg.get_type() == ExtraMessageType::MsgGcPeerRequest && extra_msg.has_check_gc_peer() { - forward_destroy_source_peer(ctx, &msg); + forward_destroy_to_source_peer(ctx, &msg); return; } } @@ -356,7 +392,7 @@ fn build_peer_destroyed_report(tombstone_msg: &mut RaftMessage) -> Option(ctx: &mut StoreContext, msg: &RaftMessage) +fn forward_destroy_to_source_peer(ctx: &mut StoreContext, msg: &RaftMessage) where EK: KvEngine, ER: RaftEngine, @@ -373,6 +409,8 @@ where tombstone_msg.set_region_epoch(check_gc_peer.get_check_region_epoch().clone()); tombstone_msg.set_is_tombstone(true); // No need to set epoch as we don't know what it is. + // This message will not be handled by `on_gc_peer_request` due to + // `is_tombstone` being true. tombstone_msg .mut_extra_msg() .set_type(ExtraMessageType::MsgGcPeerRequest); @@ -455,7 +493,7 @@ impl Peer { return; } - forward_destroy_source_peer(ctx, msg); + forward_destroy_to_source_peer(ctx, msg); } /// A peer confirms it's destroyed. diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 5514d966cea..3511a432c15 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -9,9 +9,10 @@ mod ready; mod txn_ext; pub use command::{ - AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, MergeContext, - ProposalControl, RequestHalfSplit, RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, - SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, + AdminCmdResult, ApplyFlowControl, CatchUpLogs, CommittedEntries, CompactLogContext, + MergeContext, ProposalControl, RequestHalfSplit, RequestSplit, SimpleWriteBinary, + SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, + MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, }; pub use life::{DestroyProgress, GcPeerContext}; pub use ready::{ diff --git 
a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3591a17d989..d93502a734d 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -213,6 +213,13 @@ impl Peer { self.on_gc_peer_request(ctx, &msg); return; } + ExtraMessageType::MsgWantRollbackMerge => { + if self.is_leader() { + // TODO: + // self.merge_context_mut().maybe_add_rollback_peer(); + return; + } + } _ => (), } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 5eae3078a0a..3db8590d7ed 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -250,6 +250,10 @@ impl Peer { self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(snapshot_index); + if self.proposal_control().is_merging() { + // After applying a snapshot, merge is rollbacked implicitly. + // TODO: self.rollback_merge(ctx); + } let read_tablet = SharedReadTablet::new(tablet.clone()); { let mut meta = ctx.store_meta.lock().unwrap(); diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs index e30bc25eec4..272b2526b39 100644 --- a/components/raftstore-v2/src/operation/txn_ext.rs +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -69,6 +69,20 @@ impl TxnContext { pessimistic_locks.version = region.get_region_epoch().get_version(); } + #[inline] + pub fn after_commit_merge( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // If a follower merges into a leader, a more recent read may happen + // on the leader of the follower. So max ts should be updated after + // a region merge. 
+ self.require_updating_max_ts(ctx, term, region, logger); + } + #[inline] pub fn on_became_follower(&self, term: u64, region: &Region) { let mut pessimistic_locks = self.ext.pessimistic_locks.write(); diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index e510c85cbf9..8ee311401a9 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -26,6 +26,7 @@ use raftstore::{ }, }; use slog::Logger; +use tikv_util::slog_panic; use super::storage::Storage; use crate::{ @@ -828,4 +829,12 @@ impl Peer { pub fn last_sent_snapshot_index(&self) -> u64 { self.last_sent_snapshot_index } + + #[inline] + pub fn index_term(&self, idx: u64) -> u64 { + match self.raft_group.raft.raft_log.term(idx) { + Ok(t) => t, + Err(e) => slog_panic!(self.logger, "failed to load term"; "index" => idx, "err" => ?e), + } + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index d386ed0acae..cff915fd248 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -340,7 +340,7 @@ mod tests { use super::*; use crate::{ fsm::ApplyResReporter, - operation::{test_util::create_tmp_importer, write_initial_states}, + operation::{test_util::create_tmp_importer, write_initial_states, CatchUpLogs}, raft::Apply, router::ApplyRes, }; @@ -369,6 +369,7 @@ mod tests { impl ApplyResReporter for TestRouter { fn report(&self, _res: ApplyRes) {} + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} } fn new_region() -> Region { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 88ac0ba7948..26fbde3644a 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -19,7 +19,7 @@ use super::{ }, ApplyRes, }; -use crate::operation::{RequestHalfSplit, RequestSplit, SimpleWriteBinary, SplitInit}; +use 
crate::operation::{CatchUpLogs, RequestHalfSplit, RequestSplit, SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -208,6 +208,18 @@ pub enum PeerMsg { tablet_index: u64, }, CleanupImportSst(Box<[SstMeta]>), + AskCommitMerge(RaftCmdRequest), + AckCommitMerge { + index: u64, + target_id: u64, + }, + RejectCommitMerge { + index: u64, + }, + // From target [`Apply`] to target [`Peer`]. + RedirectCatchUpLogs(CatchUpLogs), + // From target [`Peer`] to source [`Peer`]. + CatchUpLogs(CatchUpLogs), /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), @@ -278,6 +290,7 @@ pub enum StoreMsg { StoreUnreachable { to_store_id: u64, }, + AskCommitMerge(RaftCmdRequest), /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush { diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index d6846f61e4b..703f38c3516 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -15,7 +15,8 @@ pub use self::{ internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ - CmdResChannel, CmdResChannelBuilder, CmdResEvent, CmdResStream, CmdResSubscriber, - DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, + BaseSubscriber, CmdResChannel, CmdResChannelBuilder, CmdResEvent, CmdResStream, + CmdResSubscriber, DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, + ReadResponse, }, }; diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index e2f6884dd54..f73b9398df6 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -11,5 +11,6 @@ mod cluster; mod test_basic_write; mod test_bootstrap; mod test_life; +mod test_merge; mod test_split; mod 
test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs new file mode 100644 index 00000000000..3979d61743a --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -0,0 +1,109 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::Peekable; +use tikv_util::store::new_peer; + +use crate::cluster::{ + life_helper::assert_peer_not_exist, merge_helper::merge_region, split_helper::split_region, + Cluster, +}; + +#[test] +fn test_source_and_target_both_replay() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let router = &mut cluster.routers[0]; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let peer_2 = new_peer(store_id, peer_1.get_id() + 1); + let region_1_id = region_1.get_id(); + let region_2_id = region_1_id + 1; + let (region_1, region_2) = split_region( + router, + region_1, + peer_1.clone(), + region_2_id, + peer_2, + Some(format!("k{}k", region_1_id).as_bytes()), + Some(format!("k{}k", region_2_id).as_bytes()), + format!("k{}", region_2_id).as_bytes(), + format!("k{}", region_2_id).as_bytes(), + false, + ); + + { + let _fp = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); + merge_region(router, region_1, peer_1, region_2, false); + } + + cluster.restart(0); + let router = &mut cluster.routers[0]; + // Wait for replay. + let mut retry = 0; + while retry < 50 { + // Read region 1 data from region 2. 
+ let snapshot = router.stale_snapshot(region_2_id); + let key = format!("k{region_1_id}k"); + if let Ok(Some(_)) = snapshot.get_value(key.as_bytes()) { + return; + } + retry += 1; + std::thread::sleep(Duration::from_millis(100)); + } + panic!("merge not replayed after 5s"); +} + +#[test] +fn test_source_destroy_before_target_apply() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let router = &mut cluster.routers[0]; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let peer_2 = new_peer(store_id, peer_1.get_id() + 1); + let region_1_id = region_1.get_id(); + let region_2_id = region_1_id + 1; + let (region_1, region_2) = split_region( + router, + region_1, + peer_1.clone(), + region_2_id, + peer_2, + Some(format!("k{}k", region_1_id).as_bytes()), + Some(format!("k{}k", region_2_id).as_bytes()), + format!("k{}", region_2_id).as_bytes(), + format!("k{}", region_2_id).as_bytes(), + false, + ); + + { + // Sending CatchUpLogs will make source destroy early (without waiting for + // AckCommitMerge). + let _fp1 = fail::FailGuard::new("force_send_catch_up_logs", "1*return->off"); + let _fp2 = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); + merge_region(router, region_1, peer_1.clone(), region_2, false); + } + assert_peer_not_exist(region_1_id, peer_1.get_id(), router); + + cluster.restart(0); + let router = &mut cluster.routers[0]; + // Wait for replay. + let mut retry = 0; + while retry < 50 { + // Read region 1 data from region 2. 
+ let snapshot = router.stale_snapshot(region_2_id); + let key = format!("k{region_1_id}k"); + if let Ok(Some(_)) = snapshot.get_value(key.as_bytes()) { + return; + } + retry += 1; + std::thread::sleep(Duration::from_millis(100)); + } + panic!("merge not replayed after 5s"); +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 451f7131cc9..1685b5154e7 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -49,7 +49,7 @@ use sst_importer::SstImporter; use tempfile::TempDir; use test_pd::mocker::Service; use tikv_util::{ - config::{ReadableDuration, VersionTrack}, + config::{ReadableDuration, ReadableSize, VersionTrack}, store::new_peer, worker::{LazyWorker, Worker}, }; @@ -67,6 +67,7 @@ pub fn check_skip_wal(path: &str) { assert!(found, "no WAL found in {}", path); } +#[derive(Clone)] pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -464,6 +465,9 @@ impl Transport for TestTransport { pub fn v2_default_config() -> Config { let mut config = Config::default(); config.store_io_pool_size = 1; + if config.region_split_check_diff.is_none() { + config.region_split_check_diff = Some(ReadableSize::mb(96 / 16)); + } config } @@ -758,6 +762,63 @@ pub mod split_helper { } } +pub mod merge_helper { + use std::{thread, time::Duration}; + + use futures::executor::block_on; + use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}, + }; + use raftstore_v2::router::PeerMsg; + + use super::TestRouter; + + pub fn merge_region( + router: &mut TestRouter, + source: metapb::Region, + source_peer: metapb::Peer, + target: metapb::Region, + check: bool, + ) -> metapb::Region { + let region_id = source.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(source.get_region_epoch().clone()); + 
req.mut_header().set_peer(source_peer); + + let mut admin_req = AdminRequest::default(); + admin_req.set_cmd_type(AdminCmdType::PrepareMerge); + admin_req.mut_prepare_merge().set_target(target.clone()); + req.set_admin_request(admin_req); + + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + let resp = block_on(sub.result()).unwrap(); + if check { + assert!(!resp.get_header().has_error(), "{:?}", resp); + } + + // TODO: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. + thread::sleep(Duration::from_secs(1)); + + let new_target = router.region_detail(target.id); + if check { + if new_target.get_start_key() == source.get_start_key() { + // [source, target] => new_target + assert_eq!(new_target.get_end_key(), target.get_end_key()); + } else { + // [target, source] => new_target + assert_eq!(new_target.get_start_key(), target.get_start_key()); + assert_eq!(new_target.get_end_key(), source.get_end_key()); + } + } + new_target + } +} + pub mod life_helper { use std::assert_matches::assert_matches; diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index fbf54eaa243..12fe47ec48a 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -11,6 +11,7 @@ mod cluster; mod test_basic_write; mod test_conf_change; mod test_life; +mod test_merge; mod test_pd_heartbeat; mod test_read; mod test_split; diff --git a/components/raftstore-v2/tests/integrations/test_merge.rs b/components/raftstore-v2/tests/integrations/test_merge.rs new file mode 100644 index 00000000000..c08c2bde484 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_merge.rs @@ -0,0 +1,113 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::{Peekable, RaftEngineReadOnly}; +use kvproto::metapb::{Peer, Region}; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use tikv_util::store::new_peer; + +use crate::cluster::{merge_helper::merge_region, split_helper::split_region, Cluster, TestRouter}; + +#[test] +fn test_merge() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let do_split = + |r: &mut TestRouter, region: Region, peer: &Peer, v: u64| -> (Region, Region, Peer) { + let rid = region.get_id(); + let old_region_state = raft_engine + .get_region_state(rid, u64::MAX) + .unwrap() + .unwrap(); + let new_peer = new_peer(store_id, peer.get_id() + 1); + let (lhs, rhs) = split_region( + r, + region, + peer.clone(), + rid + 1, + new_peer.clone(), + Some(format!("k{}{}", rid, v).as_bytes()), + Some(format!("k{}{}", rid + 1, v).as_bytes()), + format!("k{}", rid + 1).as_bytes(), + format!("k{}", rid + 1).as_bytes(), + false, + ); + let region_state = raft_engine + .get_region_state(rid, u64::MAX) + .unwrap() + .unwrap(); + assert!(region_state.get_tablet_index() > old_region_state.get_tablet_index()); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + old_region_state + .get_region() + .get_region_epoch() + .get_version() + + 1, + ); + let region_state = raft_engine + .get_region_state(rid + 1, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + (lhs, rhs, new_peer) + }; + + let region_1 = router.region_detail(2); + let peer_1 = region_1.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Split into 6. 
+ let (region_1, region_2, peer_2) = do_split(router, region_1, &peer_1, 1); + let (region_2, region_3, peer_3) = do_split(router, region_2, &peer_2, 2); + let (region_3, region_4, peer_4) = do_split(router, region_3, &peer_3, 3); + let (region_4, region_5, peer_5) = do_split(router, region_4, &peer_4, 4); + let (region_5, region_6, peer_6) = do_split(router, region_5, &peer_5, 5); + drop(raft_engine); + // The last region version is smaller. + for (i, v) in [1, 2, 3, 4, 5, 5].iter().enumerate() { + let rid = region_1.get_id() + i as u64; + let snapshot = router.stale_snapshot(rid); + let key = format!("k{rid}{v}"); + assert!( + snapshot.get_value(key.as_bytes()).unwrap().is_some(), + "{} {:?}", + rid, + key + ); + } + + let region_2 = merge_region(router, region_1.clone(), peer_1, region_2, true); + { + let snapshot = router.stale_snapshot(region_2.get_id()); + let key = format!("k{}1", region_1.get_id()); + assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); + } + let region_5 = merge_region(router, region_6.clone(), peer_6, region_5, true); + { + let snapshot = router.stale_snapshot(region_5.get_id()); + let key = format!("k{}5", region_6.get_id()); + assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); + } + let region_3 = merge_region(router, region_2, peer_2, region_3, true); + let region_4 = merge_region(router, region_3, peer_3, region_4, true); + let region_5 = merge_region(router, region_4, peer_4, region_5, true); + + cluster.restart(0); + let router = &mut cluster.routers[0]; + let snapshot = router.stale_snapshot(region_5.get_id()); + for (i, v) in [1, 2, 3, 4, 5, 5].iter().enumerate() { + let rid = region_1.get_id() + i as u64; + let key = format!("k{rid}{v}"); + assert!( + snapshot.get_value(key.as_bytes()).unwrap().is_some(), + "{} {:?}", + rid, + key + ); + } +} diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 7cea980beac..9dab98be598 100644 
--- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -15,17 +15,19 @@ fn test_split() { let store_id = cluster.node(0).id(); let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); let router = &mut cluster.routers[0]; - // let factory = cluster.node(0).tablet_factory(); - let region_id = 2; - let peer = new_peer(store_id, 3); - let region = router.region_detail(region_id); - router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let region_2 = 2; + let region = router.region_detail(region_2); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(region_2, Duration::from_secs(3)); - // Region 2 ["", ""] peer(1, 3) - // -> Region 2 ["", "k22"] peer(1, 3) + // Region 2 ["", ""] + // -> Region 2 ["", "k22"] // Region 1000 ["k22", ""] peer(1, 10) - let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); let (left, mut right) = split_region( router, @@ -39,26 +41,32 @@ fn test_split() { b"k22", false, ); - let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); assert_eq!( region_state.get_region().get_region_epoch().get_version(), INIT_EPOCH_VER + 1 ); let region_state0 = raft_engine - .get_region_state(2, region_state.get_tablet_index()) + .get_region_state(region_2, region_state.get_tablet_index()) .unwrap() .unwrap(); assert_eq!(region_state, region_state0); - let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + let flushed_index = raft_engine + .get_flushed_index(region_2, CF_RAFT) + .unwrap() + .unwrap(); assert!( flushed_index 
>= region_state.get_tablet_index(), "{flushed_index} >= {}", region_state.get_tablet_index() ); - // Region 2 ["", "k22"] peer(1, 3) - // -> Region 2 ["", "k11"] peer(1, 3) + // Region 2 ["", "k22"] + // -> Region 2 ["", "k11"] // Region 1001 ["k11", "k22"] peer(1, 11) let _ = split_region( router, @@ -72,7 +80,10 @@ fn test_split() { b"k11", false, ); - let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); assert_ne!( region_state.get_tablet_index(), region_state0.get_tablet_index() @@ -82,11 +93,14 @@ fn test_split() { INIT_EPOCH_VER + 2 ); let region_state1 = raft_engine - .get_region_state(2, region_state.get_tablet_index()) + .get_region_state(region_2, region_state.get_tablet_index()) .unwrap() .unwrap(); assert_eq!(region_state, region_state1); - let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + let flushed_index = raft_engine + .get_flushed_index(region_2, CF_RAFT) + .unwrap() + .unwrap(); assert!( flushed_index >= region_state.get_tablet_index(), "{flushed_index} >= {}", @@ -96,8 +110,9 @@ fn test_split() { // Region 1000 ["k22", ""] peer(1, 10) // -> Region 1000 ["k22", "k33"] peer(1, 10) // Region 1002 ["k33", ""] peer(1, 12) + let region_1000 = 1000; let region_state = raft_engine - .get_region_state(1000, u64::MAX) + .get_region_state(region_1000, u64::MAX) .unwrap() .unwrap(); assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); @@ -115,7 +130,7 @@ fn test_split() { ) .1; let region_state = raft_engine - .get_region_state(1000, u64::MAX) + .get_region_state(region_1000, u64::MAX) .unwrap() .unwrap(); assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); @@ -124,17 +139,21 @@ fn test_split() { INIT_EPOCH_VER + 2 ); let region_state2 = raft_engine - .get_region_state(1000, region_state.get_tablet_index()) + .get_region_state(region_1000, region_state.get_tablet_index()) 
.unwrap() .unwrap(); assert_eq!(region_state, region_state2); - let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + let flushed_index = raft_engine + .get_flushed_index(region_1000, CF_RAFT) + .unwrap() + .unwrap(); assert!( flushed_index >= region_state.get_tablet_index(), "{flushed_index} >= {}", region_state.get_tablet_index() ); + // 1002 -> 1002, 1003 let split_key = Key::from_raw(b"k44").append_ts(TimeStamp::zero()); let actual_split_key = split_key.clone().truncate_ts().unwrap(); split_region( diff --git a/components/test_raftstore-v2/Cargo.toml b/components/test_raftstore-v2/Cargo.toml index 9ccfdb93cfe..5c6297c124d 100644 --- a/components/test_raftstore-v2/Cargo.toml +++ b/components/test_raftstore-v2/Cargo.toml @@ -40,7 +40,7 @@ futures = "0.3" grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } pd_client = { workspace = true } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 44ce6a69358..6a953ed9ca2 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -28,7 +28,8 @@ use kvproto::{ Response, StatusCmdType, }, raft_serverpb::{ - PeerState, RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState, StoreIdent, + PeerState, RaftApplyState, RaftLocalState, RaftMessage, RaftTruncatedState, + RegionLocalState, StoreIdent, }, }; use pd_client::PdClient; @@ -48,8 +49,8 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{ is_error_response, new_admin_request, new_delete_cmd, new_delete_range_cmd, new_get_cf_cmd, - new_peer, new_put_cf_cmd, new_region_detail_cmd, new_region_leader_cmd, new_request, - new_snap_cmd, new_status_request, 
new_store, new_tikv_config_with_api_ver, + new_peer, new_prepare_merge, new_put_cf_cmd, new_region_detail_cmd, new_region_leader_cmd, + new_request, new_snap_cmd, new_status_request, new_store, new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, RawEngine, }; @@ -276,10 +277,40 @@ pub trait Simulator { // todo: unwrap? res = sub.result().fuse() => Ok(res.unwrap()), _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), - } }) } + + fn async_command_on_node(&self, node_id: u64, mut request: RaftCmdRequest) { + let region_id = request.get_header().get_region_id(); + + let (msg, _sub) = if request.has_admin_request() { + PeerMsg::admin_command(request) + } else { + let requests = request.get_requests(); + let mut write_encoder = SimpleWriteEncoder::with_capacity(64); + for req in requests { + match req.get_cmd_type() { + CmdType::Put => { + let put = req.get_put(); + write_encoder.put(put.get_cf(), put.get_key(), put.get_value()); + } + CmdType::Delete => { + let delete = req.get_delete(); + write_encoder.delete(delete.get_cf(), delete.get_key()); + } + CmdType::DeleteRange => { + unimplemented!() + } + _ => unreachable!(), + } + } + PeerMsg::simple_write(Box::new(request.take_header()), write_encoder.encode()) + }; + + self.async_peer_msg_on_node(node_id, region_id, msg) + .unwrap(); + } } pub struct Cluster { @@ -1033,6 +1064,27 @@ impl Cluster { status_resp.take_region_detail() } + pub fn truncated_state(&self, region_id: u64, store_id: u64) -> RaftTruncatedState { + self.apply_state(region_id, store_id).take_truncated_state() + } + + pub fn wait_log_truncated(&self, region_id: u64, store_id: u64, index: u64) { + let timer = Instant::now(); + loop { + let truncated_state = self.truncated_state(region_id, store_id); + if truncated_state.get_index() >= index { + return; + } + if timer.saturating_elapsed() >= Duration::from_secs(5) { + panic!( + "[region 
{}] log is still not truncated to {}: {:?} on store {}", + region_id, index, truncated_state, store_id, + ); + } + thread::sleep(Duration::from_millis(10)); + } + } + pub fn get(&mut self, key: &[u8]) -> Option> { self.get_impl(CF_DEFAULT, key, false) } @@ -1376,6 +1428,73 @@ impl Cluster { } } + fn new_prepare_merge(&self, source: u64, target: u64) -> RaftCmdRequest { + let region = block_on(self.pd_client.get_region_by_id(target)) + .unwrap() + .unwrap(); + let prepare_merge = new_prepare_merge(region); + let source_region = block_on(self.pd_client.get_region_by_id(source)) + .unwrap() + .unwrap(); + new_admin_request( + source_region.get_id(), + source_region.get_region_epoch(), + prepare_merge, + ) + } + + pub fn merge_region(&mut self, source: u64, target: u64, _cb: Callback) { + // FIXME: callback is ignored. + let mut req = self.new_prepare_merge(source, target); + let leader = self.leader_of_region(source).unwrap(); + req.mut_header().set_peer(leader.clone()); + self.sim + .rl() + .async_command_on_node(leader.get_store_id(), req); + } + + pub fn try_merge(&mut self, source: u64, target: u64) -> RaftCmdResponse { + self.call_command_on_leader( + self.new_prepare_merge(source, target), + Duration::from_secs(5), + ) + .unwrap() + } + + pub fn must_try_merge(&mut self, source: u64, target: u64) { + let resp = self.try_merge(source, target); + if is_error_response(&resp) { + panic!( + "{} failed to try merge to {}, resp {:?}", + source, target, resp + ); + } + } + + /// Make sure region not exists on that store. 
+ pub fn must_region_not_exist(&mut self, region_id: u64, store_id: u64) { + let mut try_cnt = 0; + loop { + let status_cmd = new_region_detail_cmd(); + let peer = new_peer(store_id, 0); + let req = new_status_request(region_id, peer, status_cmd); + let resp = self.call_command(req, Duration::from_secs(5)).unwrap(); + if resp.get_header().has_error() && resp.get_header().get_error().has_region_not_found() + { + return; + } + + if try_cnt > 250 { + panic!( + "region {} still exists on store {} after {} tries: {:?}", + region_id, store_id, try_cnt, resp + ); + } + try_cnt += 1; + sleep_ms(20); + } + } + pub fn get_snap_dir(&self, node_id: u64) -> String { self.sim.rl().get_snap_dir(node_id) } @@ -1454,6 +1573,10 @@ impl Cluster { self.get_engine(store_id).get_raft_local_state(region_id) } + pub fn raft_local_state(&self, region_id: u64, store_id: u64) -> RaftLocalState { + self.get_raft_local_state(region_id, store_id).unwrap() + } + pub fn shutdown(&mut self) { debug!("about to shutdown cluster"); let keys = match self.sim.read() { @@ -1567,10 +1690,10 @@ impl Peekable for WrapFactory { ) -> engine_traits::Result> { let region_id = self.region_id_of_key(key); - if let Ok(Some(state)) = self.get_region_state(region_id) { - if state.state == PeerState::Tombstone { - return Ok(None); - } + if let Ok(Some(state)) = self.get_region_state(region_id) + && state.state == PeerState::Tombstone + { + return Ok(None); } match self.get_tablet(key) { @@ -1587,10 +1710,10 @@ impl Peekable for WrapFactory { ) -> engine_traits::Result> { let region_id = self.region_id_of_key(key); - if let Ok(Some(state)) = self.get_region_state(region_id) { - if state.state == PeerState::Tombstone { - return Ok(None); - } + if let Ok(Some(state)) = self.get_region_state(region_id) + && state.state == PeerState::Tombstone + { + return Ok(None); } match self.get_tablet(key) { diff --git a/components/test_raftstore-v2/src/lib.rs b/components/test_raftstore-v2/src/lib.rs index ea7e9f6f6e9..685affe45d0 
100644 --- a/components/test_raftstore-v2/src/lib.rs +++ b/components/test_raftstore-v2/src/lib.rs @@ -2,6 +2,7 @@ #![allow(incomplete_features)] #![feature(type_alias_impl_trait)] #![feature(return_position_impl_trait_in_trait)] +#![feature(let_chains)] mod cluster; mod node; diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index ec8e3fe2635..cc09dd09c4c 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -256,7 +256,7 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - // raft_client: RaftClient, + raft_client: RaftClient, concurrency_managers: HashMap, env: Arc, pub pending_services: HashMap, @@ -288,7 +288,7 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let _raft_client = RaftClient::new(conn_builder); + let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -300,7 +300,7 @@ impl ServerCluster { snap_paths: HashMap::default(), pending_services: HashMap::default(), health_services: HashMap::default(), - // raft_client, + raft_client, concurrency_managers: HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -786,6 +786,12 @@ impl Simulator for ServerCluster { router.send_peer_msg(region_id, msg) } + fn send_raft_msg(&mut self, msg: RaftMessage) -> raftstore::Result<()> { + self.raft_client.send(msg).unwrap(); + self.raft_client.flush(); + Ok(()) + } + fn get_router(&self, node_id: u64) -> Option> { self.metas.get(&node_id).map(|m| m.raw_router.clone()) } @@ -797,10 +803,6 @@ impl Simulator for ServerCluster { .unwrap() .to_owned() } - - fn send_raft_msg(&mut self, _msg: RaftMessage) -> raftstore::Result<()> { - unimplemented!() - } } impl Cluster { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index b7a9ea6f1af..5c9d9ac5d54 100644 --- 
a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -662,9 +662,9 @@ pub fn configure_for_merge(config: &mut Config) { config.raft_store.peer_stale_state_check_interval = ReadableDuration::millis(500); } -pub fn ignore_merge_target_integrity(cluster: &mut Cluster) { - cluster.cfg.raft_store.dev_assert = false; - cluster.pd_client.ignore_merge_target_integrity(); +pub fn ignore_merge_target_integrity(config: &mut Config, pd_client: &TestPdClient) { + config.raft_store.dev_assert = false; + pd_client.ignore_merge_target_integrity(); } pub fn configure_for_lease_read( diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index c22136d04de..1a733be5d8c 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -425,7 +425,7 @@ fn test_node_merge_multiple_snapshots_not_together() { fn test_node_merge_multiple_snapshots(together: bool) { let mut cluster = new_node_cluster(0, 3); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); // make it gc quickly to trigger snapshot easily diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index de1187f35b1..151e278d0d1 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -3,7 +3,7 @@ use std::{iter::*, sync::*, thread, time::*}; use api_version::{test_kv_format_impl, KvFormat}; -use engine_traits::{Peekable, CF_LOCK, CF_RAFT, CF_WRITE}; +use engine_traits::{CF_LOCK, CF_WRITE}; use kvproto::{ raft_cmdpb::CmdType, raft_serverpb::{PeerState, RaftMessage, RegionLocalState}, @@ -12,14 +12,16 @@ use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; use raftstore::store::{Callback, LocksStatus}; use test_raftstore::*; +use 
test_raftstore_macro::test_case; +use test_raftstore_v2::Simulator as _; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{config::*, HandyRwLock}; use txn_types::{Key, PessimisticLock}; /// Test if merge is working as expected in a general condition. -#[test] +#[test_case(test_raftstore::new_node_cluster)] fn test_node_base_merge() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.rocksdb.titan.enabled = true; configure_for_merge(&mut cluster.cfg); @@ -84,14 +86,9 @@ fn test_node_base_merge() { let version = left.get_region_epoch().get_version(); let conf_ver = left.get_region_epoch().get_conf_ver(); 'outer: for i in 1..4 { - let state_key = keys::region_state_key(left.get_id()); let mut state = RegionLocalState::default(); for _ in 0..3 { - state = cluster - .get_engine(i) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + state = cluster.region_local_state(left.get_id(), i); if state.get_state() == PeerState::Tombstone { let epoch = state.get_region().get_region_epoch(); assert_eq!(epoch.get_version(), version + 1); @@ -106,9 +103,95 @@ fn test_node_base_merge() { cluster.must_put(b"k4", b"v4"); } -#[test] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_base_merge_v2() { + let mut cluster = new_cluster(0, 3); + // TODO: v2 doesn't support titan yet. 
+ // cluster.cfg.rocksdb.titan.enabled = true; + configure_for_merge(&mut cluster.cfg); + + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + for i in 0..3 { + must_get_equal(&cluster.get_engine(i + 1), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(i + 1), b"k3", b"v3"); + } + + let pd_client = Arc::clone(&cluster.pd_client); + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k2").unwrap(); + assert_eq!(region.get_id(), right.get_id()); + assert_eq!(left.get_end_key(), right.get_start_key()); + assert_eq!(right.get_start_key(), b"k2"); + let get = new_request( + right.get_id(), + right.get_region_epoch().clone(), + vec![new_get_cmd(b"k1")], + false, + ); + debug!("requesting {:?}", get); + let resp = cluster + .call_command_on_leader(get, Duration::from_secs(5)) + .unwrap(); + assert!(resp.get_header().has_error(), "{:?}", resp); + assert!( + resp.get_header().get_error().has_key_not_in_region(), + "{:?}", + resp + ); + + pd_client.must_merge(left.get_id(), right.get_id()); + + let region = pd_client.get_region(b"k1").unwrap(); + assert_eq!(region.get_id(), right.get_id()); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + let origin_epoch = left.get_region_epoch(); + let new_epoch = region.get_region_epoch(); + // PrepareMerge + CommitMerge, so it should be 2. 
+ assert_eq!(new_epoch.get_version(), origin_epoch.get_version() + 2); + assert_eq!(new_epoch.get_conf_ver(), origin_epoch.get_conf_ver()); + let get = new_request( + region.get_id(), + new_epoch.to_owned(), + vec![new_get_cmd(b"k1")], + false, + ); + debug!("requesting {:?}", get); + let resp = cluster + .call_command_on_leader(get, Duration::from_secs(5)) + .unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); + + let version = left.get_region_epoch().get_version(); + let conf_ver = left.get_region_epoch().get_conf_ver(); + 'outer: for i in 1..4 { + let mut state = RegionLocalState::default(); + for _ in 0..3 { + state = cluster.region_local_state(left.get_id(), i); + if state.get_state() == PeerState::Tombstone { + let epoch = state.get_region().get_region_epoch(); + assert_eq!(epoch.get_version(), version + 1); + assert_eq!(epoch.get_conf_ver(), conf_ver + 1); + continue 'outer; + } + thread::sleep(Duration::from_millis(500)); + } + panic!("store {} is still not merged: {:?}", i, state); + } + + cluster.must_put(b"k4", b"v4"); +} + +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_with_slow_learner() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.raft_log_gc_threshold = 40; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(40); @@ -181,9 +264,10 @@ fn test_node_merge_with_slow_learner() { } /// Test whether merge will be aborted if prerequisites is not met. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_prerequisites_check() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); @@ -229,13 +313,14 @@ fn test_node_merge_prerequisites_check() { 3, ))); // It doesn't matter if the index and term is correct. - let compact_log = new_compact_log_request(100, 10); + let compact_log = new_compact_log_request(0, 10); let req = new_admin_request(right.get_id(), right.get_region_epoch(), compact_log); debug!("requesting {:?}", req); - let res = cluster + let _res = cluster .call_command_on_leader(req, Duration::from_secs(3)) .unwrap(); - assert!(res.get_header().has_error(), "{:?}", res); + // v2 doesn't respond error. + // assert!(res.get_header().has_error(), "{:?}", res); let res = cluster.try_merge(right.get_id(), left.get_id()); // log gap (min_matched, last_index] contains admin entries. assert!(res.get_header().has_error(), "{:?}", res); @@ -262,11 +347,12 @@ fn test_node_merge_prerequisites_check() { } /// Test if stale peer will be handled properly after merge. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_check_merged_message() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -322,87 +408,84 @@ fn test_node_check_merged_message() { must_get_none(&engine3, b"v5"); } -#[test] -fn test_node_merge_slow_split_right() { - test_node_merge_slow_split(true); -} - -#[test] -fn test_node_merge_slow_split_left() { - test_node_merge_slow_split(false); -} - // Test if a merge handled properly when there is a unfinished slow split before // merge. -fn test_node_merge_slow_split(is_right_derive: bool) { - let mut cluster = new_node_cluster(0, 3); - configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); - let pd_client = Arc::clone(&cluster.pd_client); - pd_client.disable_default_operator(); - cluster.cfg.raft_store.right_derive_when_split = is_right_derive; - - cluster.run(); - - cluster.must_put(b"k1", b"v1"); - cluster.must_put(b"k3", b"v3"); - - let region = pd_client.get_region(b"k1").unwrap(); - cluster.must_split(®ion, b"k2"); - let left = pd_client.get_region(b"k1").unwrap(); - let right = pd_client.get_region(b"k3").unwrap(); - - let target_leader = right - .get_peers() - .iter() - .find(|p| p.get_store_id() == 1) - .unwrap() - .clone(); - cluster.must_transfer_leader(right.get_id(), target_leader); - let target_leader = left - .get_peers() - .iter() - .find(|p| p.get_store_id() == 2) - .unwrap() - .clone(); - cluster.must_transfer_leader(left.get_id(), target_leader); - must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); - - // So cluster becomes: - // left region: 1 2(leader) I 3 - // right region: 1(leader) 2 I 3 - // I means 
isolation.(here just means 3 can not receive append log) - cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(left.get_id(), 3) - .direction(Direction::Recv) - .msg_type(MessageType::MsgAppend), - )); - cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(right.get_id(), 3) - .direction(Direction::Recv) - .msg_type(MessageType::MsgAppend), - )); - cluster.must_split(&right, b"k3"); - - // left region and right region on store 3 fall behind - // so after split, the new generated region is not on store 3 now - let right1 = pd_client.get_region(b"k2").unwrap(); - let right2 = pd_client.get_region(b"k3").unwrap(); - assert_ne!(right1.get_id(), right2.get_id()); - pd_client.must_merge(left.get_id(), right1.get_id()); - // after merge, the left region still exists on store 3 - - cluster.must_put(b"k0", b"v0"); - cluster.clear_send_filters(); - must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_node_merge_slow_split() { + fn imp(is_right_derive: bool) { + let mut cluster = new_cluster(0, 3); + configure_for_merge(&mut cluster.cfg); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.cfg.raft_store.right_derive_when_split = is_right_derive; + + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region = pd_client.get_region(b"k1").unwrap(); + cluster.must_split(®ion, b"k2"); + let left = pd_client.get_region(b"k1").unwrap(); + let right = pd_client.get_region(b"k3").unwrap(); + + let target_leader = right + .get_peers() + .iter() + .find(|p| p.get_store_id() == 1) + .unwrap() + .clone(); + cluster.must_transfer_leader(right.get_id(), target_leader); + let target_leader = left + .get_peers() + .iter() + .find(|p| p.get_store_id() == 2) + .unwrap() + .clone(); + 
cluster.must_transfer_leader(left.get_id(), target_leader); + must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); + + // So cluster becomes: + // left region: 1 2(leader) I 3 + // right region: 1(leader) 2 I 3 + // I means isolation.(here just means 3 can not receive append log) + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(left.get_id(), 3) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppend), + )); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(right.get_id(), 3) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppend), + )); + cluster.must_split(&right, b"k3"); + + // left region and right region on store 3 fall behind + // so after split, the new generated region is not on store 3 now + let right1 = pd_client.get_region(b"k2").unwrap(); + let right2 = pd_client.get_region(b"k3").unwrap(); + assert_ne!(right1.get_id(), right2.get_id()); + pd_client.must_merge(left.get_id(), right1.get_id()); + // after merge, the left region still exists on store 3 + + cluster.must_put(b"k0", b"v0"); + cluster.clear_send_filters(); + must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); + } + imp(true); + imp(false); } /// Test various cases that a store is isolated during merge. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_dist_isolation() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -474,11 +557,12 @@ fn test_node_merge_dist_isolation() { /// Similar to `test_node_merge_dist_isolation`, but make the isolated store /// way behind others so others have to send it a snapshot. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_brain_split() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); @@ -528,12 +612,7 @@ fn test_node_merge_brain_split() { cluster.must_put(b"k40", b"v5"); // Make sure the two regions are already merged on store 3. - let state_key = keys::region_state_key(left.get_id()); - let state: RegionLocalState = cluster - .get_engine(3) - .get_msg_cf(CF_RAFT, &state_key) - .unwrap() - .unwrap(); + let state = cluster.region_local_state(left.get_id(), 3); assert_eq!(state.get_state(), PeerState::Tombstone); must_get_equal(&cluster.get_engine(3), b"k40", b"v5"); for i in 1..100 { @@ -577,9 +656,10 @@ fn test_node_merge_brain_split() { } /// Test whether approximate size and keys are updated after merge -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_approximate_size_and_keys() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(20); cluster.run(); @@ -653,9 +733,10 @@ fn test_merge_approximate_size_and_keys() { ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_update_region() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); // Election timeout and max leader lease is 1s. 
configure_for_lease_read(&mut cluster.cfg, Some(100), Some(10)); @@ -733,9 +814,10 @@ fn test_node_merge_update_region() { /// Test if merge is working properly when merge entries is empty but commit /// index is not updated. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_catch_up_logs_empty_entries() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); cluster.run(); @@ -788,9 +870,10 @@ fn test_node_merge_catch_up_logs_empty_entries() { cluster.must_region_not_exist(left.get_id(), 3); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_with_slow_promote() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -830,11 +913,12 @@ fn test_merge_with_slow_promote() { /// logically) /// - A split => C (-∞, k3), A [k3, +∞) /// - Then network recovery -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_isolated_store_with_no_target_peer() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); cluster.cfg.raft_store.right_derive_when_split = true; let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -889,9 +973,10 @@ fn test_merge_isolated_store_with_no_target_peer() { /// Test whether a isolated peer can recover when two other regions merge to its /// region -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_cascade_merge_isolated() { - 
let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -940,9 +1025,10 @@ fn test_merge_cascade_merge_isolated() { // Test if a learner can be destroyed properly when it's isolated and removed by // conf change before its region merge to another region -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_isolated_not_in_merge_learner() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -986,9 +1072,10 @@ fn test_merge_isolated_not_in_merge_learner() { // Test if a learner can be destroyed properly when it's isolated and removed by // conf change before another region merge to its region -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_isolated_stale_learner() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); cluster.cfg.raft_store.right_derive_when_split = true; // Do not rely on pd to remove stale peer @@ -1037,9 +1124,10 @@ fn test_merge_isolated_stale_learner() { /// 2. Be the last removed peer in its peer list /// 3. Then its region merges to another region. /// 4. 
Isolation disappears -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_isolated_not_in_merge_learner_2() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1082,9 +1170,10 @@ fn test_merge_isolated_not_in_merge_learner_2() { /// Test if a peer can be removed if its target peer has been removed and /// doesn't apply the CommitMerge log. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_remove_target_peer_isolated() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1140,45 +1229,46 @@ fn test_merge_remove_target_peer_isolated() { } } -#[test] +#[test_case(test_raftstore::new_server_cluster_with_api_ver)] +#[test_case(test_raftstore_v2::new_server_cluster_with_api_ver)] fn test_sync_max_ts_after_region_merge() { - test_kv_format_impl!(test_sync_max_ts_after_region_merge_impl); -} + fn imp() { + let mut cluster = new_cluster(0, 3, F::TAG); + configure_for_merge(&mut cluster.cfg); + cluster.run(); -fn test_sync_max_ts_after_region_merge_impl() { - let mut cluster = new_server_cluster_with_api_ver(0, 3, F::TAG); - configure_for_merge(&mut cluster.cfg); - cluster.run(); + // Transfer leader to node 1 first to ensure all operations happen on node 1 + cluster.must_transfer_leader(1, new_peer(1, 1)); - // Transfer leader to node 1 first to ensure all operations happen on node 1 - cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); - cluster.must_put(b"k1", b"v1"); - cluster.must_put(b"k3", b"v3"); - - let region = cluster.get_region(b"k1"); - 
cluster.must_split(®ion, b"k2"); - let left = cluster.get_region(b"k1"); - let right = cluster.get_region(b"k3"); + let region = cluster.get_region(b"k1"); + cluster.must_split(®ion, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k3"); - let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); - wait_for_synced(&mut cluster, 1, 1); - let max_ts = cm.max_ts(); + let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); + wait_for_synced(&mut cluster, 1, 1); + let max_ts = cm.max_ts(); - cluster.pd_client.trigger_tso_failure(); - // Merge left to right - cluster.pd_client.must_merge(left.get_id(), right.get_id()); + cluster.pd_client.trigger_tso_failure(); + // Merge left to right + cluster.pd_client.must_merge(left.get_id(), right.get_id()); - wait_for_synced(&mut cluster, 1, 1); - let new_max_ts = cm.max_ts(); - assert!(new_max_ts > max_ts); + wait_for_synced(&mut cluster, 1, 1); + let new_max_ts = cm.max_ts(); + assert!(new_max_ts > max_ts); + } + test_kv_format_impl!(imp); } /// If a follower is demoted by a snapshot, its meta will be changed. The case /// is to ensure asserts in code can tolerate the change. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_snapshot_demote() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); configure_for_merge(&mut cluster.cfg); configure_for_snapshot(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); @@ -1232,9 +1322,10 @@ fn test_merge_snapshot_demote() { must_get_equal(&cluster.get_engine(3), b"k4", b"v4"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_propose_in_memory_pessimistic_locks() { - let mut cluster = new_server_cluster(0, 2); + let mut cluster = new_cluster(0, 2); configure_for_merge(&mut cluster.cfg); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -1311,9 +1402,10 @@ fn test_propose_in_memory_pessimistic_locks() { ); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +// #[test_case(test_raftstore_v2::new_server_cluster)] fn test_merge_pessimistic_locks_when_gap_is_too_large() { - let mut cluster = new_server_cluster(0, 2); + let mut cluster = new_cluster(0, 2); configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; @@ -1361,9 +1453,10 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_merge_pessimistic_locks_repeated_merge() { - let mut cluster = new_server_cluster(0, 2); + let mut cluster = new_cluster(0, 2); configure_for_merge(&mut cluster.cfg); cluster.cfg.pessimistic_txn.pipelined = true; cluster.cfg.pessimistic_txn.in_memory = true; @@ -1428,11 +1521,12 @@ fn test_merge_pessimistic_locks_repeated_merge() { /// Check if merge is cleaned up if the merge target is destroyed several times /// before it's ever scheduled. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_long_isolated() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); - ignore_merge_target_integrity(&mut cluster); + ignore_merge_target_integrity(&mut cluster.cfg, &cluster.pd_client); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1447,8 +1541,8 @@ fn test_node_merge_long_isolated() { let right = pd_client.get_region(b"k3").unwrap(); cluster.must_transfer_leader(right.get_id(), new_peer(3, 3)); - let target_leader = peer_on_store(&left, 3); - cluster.must_transfer_leader(left.get_id(), target_leader); + let left_leader = peer_on_store(&left, 3); + cluster.must_transfer_leader(left.get_id(), left_leader); must_get_equal(&cluster.get_engine(1), b"k3", b"v3"); // So cluster becomes: @@ -1490,9 +1584,10 @@ fn test_node_merge_long_isolated() { must_get_none(&cluster.get_engine(1), b"k1"); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_stale_message_after_merge() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); cluster.run(); let pd_client = Arc::clone(&cluster.pd_client); @@ -1538,9 +1633,10 @@ fn test_stale_message_after_merge() { /// Check whether merge should be prevented if follower may not have enough /// logs. 
-#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_prepare_merge_with_reset_matched() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -1587,9 +1683,10 @@ fn test_prepare_merge_with_reset_matched() { /// Check if prepare merge min index is chosen correctly even if all match /// indexes are correct. -#[test] +#[test_case(test_raftstore::new_server_cluster)] +// #[test_case(test_raftstore_v2::new_server_cluster)] fn test_prepare_merge_with_5_nodes_snapshot() { - let mut cluster = new_server_cluster(0, 5); + let mut cluster = new_cluster(0, 5); configure_for_merge(&mut cluster.cfg); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); From 7ab1702fd84e500a428aa3d02baae24aa3ba46c4 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:18:42 +0800 Subject: [PATCH 602/676] raftstore-v2: flush memtable before proposing split (#14437) close tikv/tikv#14447 flush memtable before proposing split Signed-off-by: SpadeA-Tang --- Cargo.lock | 2 +- components/raftstore-v2/src/batch/store.rs | 17 ++- components/raftstore-v2/src/lib.rs | 5 +- .../operation/command/admin/compact_log.rs | 2 +- .../operation/command/admin/merge/commit.rs | 2 +- .../src/operation/command/admin/mod.rs | 96 ++++++++++++++- .../src/operation/command/admin/split.rs | 13 +- .../command/admin/transfer_leader.rs | 4 +- .../raftstore-v2/src/operation/ready/mod.rs | 18 ++- components/raftstore-v2/src/raft/peer.rs | 12 ++ components/raftstore-v2/src/worker/mod.rs | 1 + .../raftstore-v2/src/worker/tablet_flush.rs | 115 ++++++++++++++++++ components/raftstore/src/store/fsm/peer.rs | 1 + components/test_raftstore-v2/src/cluster.rs | 4 + components/test_raftstore-v2/src/node.rs | 4 + 
components/test_raftstore/src/node.rs | 4 + components/txn_types/src/types.rs | 2 + .../raftstore/test_split_region.rs | 47 ++----- 18 files changed, 299 insertions(+), 50 deletions(-) create mode 100644 components/raftstore-v2/src/worker/tablet_flush.rs diff --git a/Cargo.lock b/Cargo.lock index e12ee05562d..1cb40d842cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2712,7 +2712,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#df1ae63d0cfe2f5e01d2016a1839a7e88ef2da38" +source = "git+https://github.com/pingcap/kvproto.git#af969693ce8a7884e5bdc5d81c728f657d33065a" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 3f7bf408aa8..fe152bb3990 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -43,7 +43,7 @@ use tikv_util::{ sys::SysQuota, time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, - worker::{LazyWorker, Scheduler, Worker}, + worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, }; @@ -54,7 +54,7 @@ use crate::{ operation::{SharedReadTablet, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::{pd, tablet_gc}, + worker::{pd, tablet_flush, tablet_gc}, Error, Result, }; @@ -474,6 +474,7 @@ pub struct Schedulers { pub pd: Scheduler, pub tablet_gc: Scheduler>, pub write: WriteSenders, + pub tablet_flush: Scheduler, // Following is not maintained by raftstore itself. pub split_check: Scheduler, @@ -497,6 +498,7 @@ struct Workers { tablet_gc: Worker, async_write: StoreWriters, purge: Option, + tablet_flush: Worker, // Following is not maintained by raftstore itself. 
background: Worker, @@ -504,12 +506,16 @@ struct Workers { impl Workers { fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { + let tablet_flush = WorkerBuilder::new("tablet_flush-worker") + .thread_count(2) + .create(); Self { async_read: Worker::new("async-read-worker"), pd, tablet_gc: Worker::new("tablet-gc-worker"), async_write: StoreWriters::new(None), purge, + tablet_flush, background, } } @@ -519,6 +525,7 @@ impl Workers { self.async_read.stop(); self.pd.stop(); self.tablet_gc.stop(); + self.tablet_flush.stop(); if let Some(w) = self.purge { w.stop(); } @@ -637,12 +644,18 @@ impl StoreSystem { ), ); + let tablet_flush_scheduler = workers.tablet_flush.start( + "tablet-flush-worker", + tablet_flush::Runner::new(router.clone(), tablet_registry.clone(), self.logger.clone()), + ); + let schedulers = Schedulers { read: read_scheduler, pd: workers.pd.scheduler(), tablet_gc: tablet_gc_scheduler, write: workers.async_write.senders(), split_check: split_check_scheduler, + tablet_flush: tablet_flush_scheduler, }; let builder = StorePollerBuilder::new( diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index bbb73676ffb..04745d01fbe 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -42,4 +42,7 @@ pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{write_initial_states, SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; -pub use worker::pd::{PdReporter, Task as PdTask}; +pub use worker::{ + pd::{PdReporter, Task as PdTask}, + tablet_flush::Task as TabletFlushTask, +}; diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 8ae195539b2..1cc9ccbb1c3 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -115,7 
+115,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { // Mirrors v1::on_raft_gc_log_tick. - fn maybe_propose_compact_log( + fn maybe_propose_compact_log( &mut self, store_ctx: &mut StoreContext, force: bool, diff --git a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs index 876ba5b1a95..2756d0174dd 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/commit.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/commit.rs @@ -248,7 +248,7 @@ impl Peer { // Target peer handles the commit merge request. impl Peer { - pub fn on_ask_commit_merge( + pub fn on_ask_commit_merge( &mut self, store_ctx: &mut StoreContext, req: RaftCmdRequest, diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index f59a5e6e0f2..28fceb2d95b 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -10,7 +10,11 @@ pub use compact_log::CompactLogContext; use compact_log::CompactLogResult; use conf_change::{ConfChangeResult, UpdateGcPeersResult}; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; +use kvproto::{ + metapb::PeerRole, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::{ExtraMessageType, FlushMemtable, RaftMessage}, +}; use merge::{commit::CommitMergeResult, prepare::PrepareMergeResult}; pub use merge::{ commit::{CatchUpLogs, MERGE_IN_PROGRESS_PREFIX}, @@ -18,10 +22,15 @@ pub use merge::{ }; use protobuf::Message; use raftstore::{ - store::{cmd_resp, fsm::apply, msg::ErrorCallback}, + store::{ + cmd_resp, + fsm::{apply, apply::validate_batch_split}, + msg::ErrorCallback, + Transport, + }, Error, }; -use slog::info; +use slog::{error, info}; use split::SplitResult; pub use split::{ 
report_split_init_finish, temp_split_path, RequestHalfSplit, RequestSplit, SplitFlowControl, @@ -47,7 +56,7 @@ pub enum AdminCmdResult { impl Peer { #[inline] - pub fn on_admin_command( + pub fn on_admin_command( &mut self, ctx: &mut StoreContext, mut req: RaftCmdRequest, @@ -122,7 +131,84 @@ impl Peer { AdminCmdType::Split => Err(box_err!( "Split is deprecated. Please use BatchSplit instead." )), - AdminCmdType::BatchSplit => self.propose_split(ctx, req), + AdminCmdType::BatchSplit => { + #[allow(clippy::question_mark)] + if let Err(err) = validate_batch_split(req.get_admin_request(), self.region()) { + Err(err) + } else { + // To reduce the impact of the expensive operation of `checkpoint` (it will + // flush memtables of the rocksdb) in applying batch split, we split the + // BatchSplit cmd into two phases: + // + // 1. Schedule flush memtable task so that the memtables of the rocksdb can + // be flushed in advance in a way that will not block the normal raft + // operations (`checkpoint` will still cause flush but it will be + // significantly lightweight). At the same time, send flush memtable msgs to + // the follower so that they can flush memtalbes in advance too. + // + // 2. When the task finishes, it will propose a batch split with + // `SPLIT_SECOND_PHASE` flag. 
+ if !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::SPLIT_SECOND_PHASE) + { + if self.tablet_being_flushed() { + return; + } + + let region_id = self.region().get_id(); + self.set_tablet_being_flushed(true); + info!( + self.logger, + "Schedule flush tablet"; + ); + if let Err(e) = ctx.schedulers.tablet_flush.schedule( + crate::TabletFlushTask::TabletFlush { + region_id, + req: Some(req), + is_leader: true, + ch: Some(ch), + }, + ) { + error!( + self.logger, + "Fail to schedule flush task"; + "err" => ?e, + ) + } + + let peers = self.region().get_peers().to_vec(); + for p in peers { + if p == *self.peer() + || p.get_role() != PeerRole::Voter + || p.is_witness + { + continue; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(region_id); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(p.clone()); + msg.set_region_epoch(self.region().get_region_epoch().clone()); + let extra_msg = msg.mut_extra_msg(); + extra_msg.set_type(ExtraMessageType::MsgFlushMemtable); + let mut flush_memtable = FlushMemtable::new(); + flush_memtable.set_region_id(region_id); + extra_msg.set_flush_memtable(flush_memtable); + + self.send_raft_message(ctx, msg); + } + + return; + } + + info!( + self.logger, + "Propose split"; + ); + self.set_tablet_being_flushed(false); + self.propose_split(ctx, req) + } + } AdminCmdType::TransferLeader => { // Containing TRANSFER_LEADER_PROPOSAL flag means the this transfer leader // request should be proposed to the raft group diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 4560fa93689..e1577830d25 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -54,7 +54,7 @@ use raftstore::{ Result, }; use slog::{error, info, warn}; -use tikv_util::{log::SlogFormat, slog_panic}; +use tikv_util::{log::SlogFormat, 
slog_panic, time::Instant}; use crate::{ batch::StoreContext, @@ -362,7 +362,6 @@ impl Peer { store_ctx: &mut StoreContext, req: RaftCmdRequest, ) -> Result { - validate_batch_split(req.get_admin_request(), self.region())?; // We rely on ConflictChecker to detect conflicts, so no need to set proposal // context. let data = req.write_to_bytes().unwrap(); @@ -484,6 +483,7 @@ impl Apply { ) }); + let now = Instant::now(); let reg = self.tablet_registry(); for new_region in ®ions { let new_region_id = new_region.id; @@ -521,6 +521,15 @@ impl Apply { ) }); } + let elapsed = now.saturating_elapsed(); + // to be removed after when it's stable + info!( + self.logger, + "create checkpoint time consumes"; + "region" => ?self.region(), + "duration" => ?elapsed + ); + let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 54aa9845e17..e7bd84c973c 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -15,7 +15,7 @@ use raft::{eraftpb, ProgressState, Storage}; use raftstore::{ store::{ fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, - TRANSFER_LEADER_COMMAND_REPLY_CTX, + Transport, TRANSFER_LEADER_COMMAND_REPLY_CTX, }, Result, }; @@ -146,7 +146,7 @@ impl Peer { true } - pub fn on_transfer_leader_msg( + pub fn on_transfer_leader_msg( &mut self, ctx: &mut StoreContext, msg: &eraftpb::Message, diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index d93502a734d..68da61cf45e 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -213,6 +213,22 
@@ impl Peer { self.on_gc_peer_request(ctx, &msg); return; } + ExtraMessageType::MsgFlushMemtable => { + let region_epoch = msg.as_ref().get_region_epoch(); + if util::is_epoch_stale(region_epoch, self.region().get_region_epoch()) { + return; + } + let _ = + ctx.schedulers + .tablet_flush + .schedule(crate::TabletFlushTask::TabletFlush { + region_id: self.region().get_id(), + req: None, + is_leader: false, + ch: None, + }); + return; + } ExtraMessageType::MsgWantRollbackMerge => { if self.is_leader() { // TODO: @@ -352,7 +368,7 @@ impl Peer { /// /// The message is pushed into the send buffer, it may not be sent out until /// transport is flushed explicitly. - fn send_raft_message( + pub(crate) fn send_raft_message( &mut self, ctx: &mut StoreContext, msg: RaftMessage, diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 8ee311401a9..b93fc0f5047 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -45,6 +45,7 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, + tablet_being_flushed: bool, /// Statistics for self. 
self_stat: PeerStat, @@ -155,6 +156,7 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, + tablet_being_flushed: false, self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), @@ -302,6 +304,16 @@ impl Peer { self.peer().get_id() } + #[inline] + pub fn tablet_being_flushed(&self) -> bool { + self.tablet_being_flushed + } + + #[inline] + pub fn set_tablet_being_flushed(&mut self, v: bool) { + self.tablet_being_flushed = v; + } + #[inline] pub fn storage(&self) -> &Storage { self.raft_group.store() diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index 6fafd01df85..121c41906d7 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,4 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. pub mod pd; +pub mod tablet_flush; pub mod tablet_gc; diff --git a/components/raftstore-v2/src/worker/tablet_flush.rs b/components/raftstore-v2/src/worker/tablet_flush.rs new file mode 100644 index 00000000000..c53296a5cb6 --- /dev/null +++ b/components/raftstore-v2/src/worker/tablet_flush.rs @@ -0,0 +1,115 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::fmt::{Display, Formatter}; + +use engine_traits::{KvEngine, RaftEngine, TabletRegistry, DATA_CFS}; +use kvproto::raft_cmdpb::RaftCmdRequest; +use slog::{error, info, Logger}; +use tikv_util::{time::Instant, worker::Runnable}; +use txn_types::WriteBatchFlags; + +use crate::{ + router::{CmdResChannel, PeerMsg, RaftRequest}, + StoreRouter, +}; + +pub enum Task { + TabletFlush { + region_id: u64, + req: Option, + is_leader: bool, + ch: Option, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Task::TabletFlush { region_id, .. 
} => { + write!(f, "Flush tablet before split for region {}", region_id) + } + } + } +} + +pub struct Runner { + router: StoreRouter, + tablet_registry: TabletRegistry, + logger: Logger, +} + +impl Runner { + pub fn new( + router: StoreRouter, + tablet_registry: TabletRegistry, + logger: Logger, + ) -> Self { + Self { + router, + tablet_registry, + logger, + } + } + + fn flush_tablet( + &mut self, + region_id: u64, + req: Option, + is_leader: bool, + ch: Option, + ) { + let Some(Some(tablet)) = self + .tablet_registry + .get(region_id) + .map(|mut cache| cache.latest().cloned()) else {return}; + let now = Instant::now(); + tablet.flush_cfs(DATA_CFS, true).unwrap(); + let elapsed = now.saturating_elapsed(); + // to be removed after when it's stable + info!( + self.logger, + "flush memtable time consumes"; + "region_id" => region_id, + "duration" => ?elapsed, + "is_leader" => is_leader, + ); + + if !is_leader { + return; + } + + let mut req = req.unwrap(); + req.mut_header() + .set_flags(WriteBatchFlags::SPLIT_SECOND_PHASE.bits()); + if let Err(e) = self.router.send( + region_id, + PeerMsg::AdminCommand(RaftRequest::new(req, ch.unwrap())), + ) { + error!( + self.logger, + "send split request fail in the second phase"; + "region_id" => region_id, + "err" => ?e, + ); + } + } +} + +impl Runnable for Runner +where + EK: KvEngine, + ER: RaftEngine, +{ + type Task = Task; + + fn run(&mut self, task: Self::Task) { + match task { + Task::TabletFlush { + region_id, + req, + is_leader, + ch, + } => self.flush_tablet(region_id, req, is_leader, ch), + } + } +} diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 3eca179d770..67054d5bd11 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2797,6 +2797,7 @@ where } // It's v2 only message and ignore does no harm. 
ExtraMessageType::MsgGcPeerRequest | ExtraMessageType::MsgGcPeerResponse => (), + ExtraMessageType::MsgFlushMemtable => (), } } diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 6a953ed9ca2..0f352ebc5bf 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -1035,6 +1035,10 @@ impl Cluster { region_end_key }; + if amended_start_key > amended_end_key { + return Ok(()); + } + tablet.scan(cf, amended_start_key, amended_end_key, fill_cache, f) } diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 5617787bb70..058a9caf186 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -437,12 +437,16 @@ impl Simulator for NodeCluster { } } +// Compare to server cluster, node cluster does not have server layer and +// storage layer. pub fn new_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) } +// This cluster does not support batch split, we expect it to transfer the +// `BatchSplit` request to `split` request pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index c75adf33645..c4c516fb7f9 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -505,12 +505,16 @@ impl Simulator for NodeCluster { } } +// Compare to server cluster, node cluster does not have server layer and +// storage layer. 
pub fn new_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new(id, count, sim, pd_client, ApiVersion::V1) } +// This cluster does not support batch split, we expect it to transfer the +// `BatchSplit` request to `split` request pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 15779df426a..23df1a89940 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -570,6 +570,8 @@ bitflags! { const TRANSFER_LEADER_PROPOSAL = 0b00000100; /// Indicates this request is a flashback transaction. const FLASHBACK = 0b00001000; + /// Indicates the relevant tablet has been flushed, and we can propose split now. 
+ const SPLIT_SECOND_PHASE = 0b00010000; } } diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index f8d6ff9b468..2673a34b0d2 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -7,7 +7,7 @@ use std::{ time::Duration, }; -use engine_traits::{Iterable, Peekable, CF_DEFAULT, CF_WRITE}; +use engine_traits::{Peekable, CF_DEFAULT, CF_WRITE}; use keys::data_key; use kvproto::{metapb, pdpb, raft_cmdpb::*, raft_serverpb::RaftMessage}; use pd_client::PdClient; @@ -145,7 +145,14 @@ fn test_server_split_region_twice() { rx1.recv_timeout(Duration::from_secs(5)).unwrap(); } -fn test_auto_split_region(cluster: &mut Cluster) { +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore::new_incompatible_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_auto_split_region() { + let count = 5; + let mut cluster = new_cluster(0, count); cluster.cfg.raft_store.split_region_check_tick_interval = ReadableDuration::millis(100); cluster.cfg.coprocessor.region_max_size = Some(ReadableSize(REGION_MAX_SIZE)); cluster.cfg.coprocessor.region_split_size = Some(ReadableSize(REGION_SPLIT_SIZE)); @@ -159,7 +166,7 @@ fn test_auto_split_region(cluster: &mut Cluster) { let region = pd_client.get_region(b"").unwrap(); - let last_key = put_till_size(cluster, REGION_SPLIT_SIZE, &mut range); + let last_key = put_till_size(&mut cluster, REGION_SPLIT_SIZE, &mut range); // it should be finished in millis if split. 
thread::sleep(Duration::from_millis(300)); @@ -169,7 +176,7 @@ fn test_auto_split_region(cluster: &mut Cluster) { assert_eq!(region, target); let max_key = put_cf_till_size( - cluster, + &mut cluster, CF_WRITE, REGION_MAX_SIZE - REGION_SPLIT_SIZE + check_size_diff, &mut range, @@ -195,9 +202,9 @@ fn test_auto_split_region(cluster: &mut Cluster) { let leader = cluster.leader_of_region(left.get_id()).unwrap(); let store_id = leader.get_store_id(); let mut size = 0; - cluster.engines[&store_id] - .kv + cluster .scan( + store_id, CF_DEFAULT, &data_key(b""), &data_key(middle_key), @@ -223,34 +230,6 @@ fn test_auto_split_region(cluster: &mut Cluster) { assert!(resp.get_header().get_error().has_key_not_in_region()); } -#[test] -fn test_node_auto_split_region() { - let count = 5; - let mut cluster = new_node_cluster(0, count); - test_auto_split_region(&mut cluster); -} - -#[test] -fn test_incompatible_node_auto_split_region() { - let count = 5; - let mut cluster = new_incompatible_node_cluster(0, count); - test_auto_split_region(&mut cluster); -} - -#[test] -fn test_server_auto_split_region() { - let count = 5; - let mut cluster = new_server_cluster(0, count); - test_auto_split_region(&mut cluster); -} - -#[test] -fn test_incompatible_server_auto_split_region() { - let count = 5; - let mut cluster = new_incompatible_server_cluster(0, count); - test_auto_split_region(&mut cluster); -} - // A filter that disable commitment by heartbeat. 
#[derive(Clone)] struct EraseHeartbeatCommit; From 6d4f40c0a5d2d9f42edb738b0ba22d2ce1a9491e Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Mon, 27 Mar 2023 14:26:44 +0800 Subject: [PATCH 603/676] snapshot: feedback multi rocksdb (#14400) close tikv/tikv#14436 Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore-v2/src/operation/pd.rs | 3 + .../raftstore/src/store/async_io/read.rs | 13 ++-- components/raftstore/src/store/snap.rs | 61 +++++++++++++++++++ src/server/snap.rs | 5 +- src/server/tablet_snap.rs | 18 ++++-- 5 files changed, 89 insertions(+), 11 deletions(-) diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 3b5e7d32f89..4bb6a06c162 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -43,9 +43,12 @@ impl Store { let meta = ctx.store_meta.lock().unwrap(); stats.set_region_count(meta.readers.len() as u32); } + + let snap_stats = ctx.snap_mgr.stats(); // todo: imple snapshot status report stats.set_sending_snap_count(0); stats.set_receiving_snap_count(0); + stats.set_snapshot_stats(snap_stats.stats.into()); STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC .with_label_values(&["sending"]) diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 985134048dd..cee6373c5bd 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -214,6 +214,7 @@ where snapshot.mut_metadata().set_index(last_applied_index); let conf_state = util::conf_state_from_region(region_state.get_region()); snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. 
let mut snap_data = RaftSnapshotData::default(); snap_data.set_region(region_state.get_region().clone()); @@ -222,7 +223,6 @@ where snap_data.set_removed_records(region_state.get_removed_records().into()); snap_data.set_merged_records(region_state.get_merged_records().into()); snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); - // create checkpointer. let snap_key = TabletSnapKey::from_region_snap(region_id, to_peer, &snapshot); let mut res = None; @@ -232,11 +232,8 @@ where error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); SNAP_COUNTER.generate.fail.inc(); } else { + let generate_duration_secs = start.saturating_elapsed().as_secs(); let elapsed = start.saturating_elapsed_secs(); - SNAP_COUNTER.generate.success.inc(); - SNAP_HISTOGRAM.generate.observe(elapsed); - SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); - SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_keys as f64); info!( "snapshot generated"; "region_id" => region_id, @@ -246,6 +243,12 @@ where "total_size" => total_size, "total_keys" => total_keys, ); + self.snap_mgr() + .begin_snapshot(snap_key, start, generate_duration_secs); + SNAP_COUNTER.generate.success.inc(); + SNAP_HISTOGRAM.generate.observe(elapsed); + SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); + SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_keys as f64); res = Some(Box::new((snapshot, to_peer))) } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 57cdbd2a75c..d0c55c144ed 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1975,6 +1975,7 @@ pub struct TabletSnapManager { // directory to store snapfile. 
base: PathBuf, receiving: Arc>>, + stats: Arc>>, } impl TabletSnapManager { @@ -1994,9 +1995,45 @@ impl TabletSnapManager { Ok(Self { base: path, receiving: Arc::default(), + stats: Arc::default(), }) } + pub fn begin_snapshot(&self, key: TabletSnapKey, start: Instant, generate_duration_sec: u64) { + let mut stat = SnapshotStat::default(); + stat.set_generate_duration_sec(generate_duration_sec); + self.stats.lock().unwrap().insert(key, (start, stat)); + } + + pub fn finish_snapshot(&self, key: TabletSnapKey, send: Instant) { + let region_id = key.region_id; + self.stats + .lock() + .unwrap() + .entry(key) + .and_modify(|(start, stat)| { + stat.set_send_duration_sec(send.saturating_elapsed().as_secs()); + stat.set_total_duration_sec(start.saturating_elapsed().as_secs()); + stat.set_region_id(region_id); + }); + } + + pub fn stats(&self) -> SnapStats { + let stats: Vec = self + .stats + .lock() + .unwrap() + .drain_filter(|_, (_, stat)| stat.get_region_id() > 0) + .map(|(_, (_, stat))| stat) + .filter(|stat| stat.get_total_duration_sec() > 1) + .collect(); + SnapStats { + sending_count: 0, + receiving_count: 0, + stats, + } + } + pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); PathBuf::from(&self.base).join(prefix) @@ -3056,6 +3093,30 @@ pub mod tests { assert!(!file_system::file_exists(&sst_path)); } + #[test] + fn test_snapshot_stats() { + let snap_dir = Builder::new() + .prefix("test_snapshot_stats") + .tempdir() + .unwrap(); + let start = Instant::now(); + let mgr = TabletSnapManager::new(snap_dir.path()).unwrap(); + let key = TabletSnapKey::new(1, 1, 1, 1); + mgr.begin_snapshot(key.clone(), start - time::Duration::from_secs(2), 1); + // filter out the snapshot that is not finished + assert!(mgr.stats().stats.is_empty()); + mgr.finish_snapshot(key.clone(), start - time::Duration::from_secs(1)); + let stats = mgr.stats().stats; + assert_eq!(stats.len(), 1); + 
assert_eq!(stats[0].get_total_duration_sec(), 2); + assert!(mgr.stats().stats.is_empty()); + + // filter out the total duration seconds less than one sencond. + mgr.begin_snapshot(key.clone(), start, 1); + mgr.finish_snapshot(key, start); + assert_eq!(mgr.stats().stats.len(), 0); + } + #[test] fn test_build_with_encryption() { let (_enc_dir, key_manager) = diff --git a/src/server/snap.rs b/src/server/snap.rs index afce0e8a2fd..d06e49ab7a8 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -146,7 +146,7 @@ pub fn send_snap( if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { return Err(Error::Io(IoError::new(ErrorKind::Other, e))); } - let key = SnapKey::from_region_snap(snap_data.get_region().get_id(), snap); + let key = SnapKey::from_region_snap(msg.get_region_id(), snap); let snap_start = snap_data.get_meta().get_start(); let generate_duration_sec = snap_data.get_meta().get_generate_duration_sec(); (key, snap_start, generate_duration_sec) @@ -202,6 +202,7 @@ pub fn send_snap( fail_point!("snapshot_delete_after_send"); mgr.delete_snapshot(&key, &chunks.snap, true); let cost = UnixSecs::now().into_inner().saturating_sub(snap_start); + let send_duration_sec = timer.saturating_elapsed().as_secs(); // it should ignore if the duration of snapshot is less than 1s to decrease the // grpc data size. 
if cost >= 1 { @@ -209,7 +210,7 @@ pub fn send_snap( stat.set_region_id(key.region_id); stat.set_transport_size(total_size); stat.set_generate_duration_sec(generate_duration_sec); - stat.set_send_duration_sec(timer.saturating_elapsed().as_secs()); + stat.set_send_duration_sec(send_duration_sec); stat.set_total_duration_sec(cost); mgr.collect_stat(stat); } diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index a5a8b24d10b..4524b8645ff 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -60,6 +60,7 @@ use tikv_util::{ config::{ReadableSize, Tracker, VersionTrack}, time::Instant, worker::Runnable, + DeferContext, }; use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; @@ -81,7 +82,11 @@ fn is_sst(file_name: &str) -> bool { async fn read_to(f: &mut File, to: &mut Vec, size: usize, limiter: &Limiter) -> Result<()> { // It's likely in page cache already. - limiter.consume(size / 2).await; + let cost = size / 2; + limiter.consume(cost).await; + SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC + .send + .inc_by(cost as u64); to.clear(); to.reserve_exact(size); let mut buf: BorrowedBuf<'_> = to.spare_capacity_mut().into(); @@ -310,6 +315,9 @@ async fn accept_one_file( )); } limiter.consume(chunk_len).await; + SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC + .recv + .inc_by(chunk_len as u64); digest.write(&chunk.data); f.write_all(&chunk.data)?; if exp_size == file_size { @@ -604,9 +612,6 @@ async fn send_snap_files( let mut req = TabletSnapshotRequest::default(); req.mut_end().set_checksum(checksum); sender.send((req, WriteFlags::default())).await?; - SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC - .send - .inc_by(total_sent); info!("sent all snap file finish"; "snap_key" => %key, "region_id" => region_id, "to_peer" => to_peer); sender.close().await?; Ok(total_sent) @@ -633,6 +638,10 @@ pub fn send_snap( msg.get_to_peer().get_id(), msg.get_message().get_snapshot(), ); + let deregister = { + let (mgr, key) = (mgr.clone(), key.clone()); + 
DeferContext::new(move || mgr.finish_snapshot(key.clone(), timer)) + }; let cb = ChannelBuilder::new(env) .stream_initial_window_size(cfg.grpc_stream_initial_window_size.0 as i32) @@ -652,6 +661,7 @@ pub fn send_snap( let recv_result = receiver.next().await; send_timer.observe_duration(); drop(client); + drop(deregister); mgr.delete_snapshot(&key); match recv_result { None => Ok(SendStat { From 88eb52d38431bdc4b2ff4fc30d30c085f7c72596 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 27 Mar 2023 21:18:06 +0800 Subject: [PATCH 604/676] resource_control: reset resource group virtual time when it is about to overflow (#14464) ref tikv/tikv#14353, ref pingcap/tidb#42595, ref pingcap/tidb#42596 Signed-off-by: glorv --- .../resource_control/src/resource_group.rs | 155 +++++++++--------- 1 file changed, 75 insertions(+), 80 deletions(-) diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 690a3e3812f..0b0f24e8f62 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cmp::{max, min}, sync::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, @@ -26,8 +27,8 @@ const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); /// default resource group name const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; -/// default value of max RU quota. -const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; +/// The maximum RU quota that can be configured. 
+const MAX_RU_QUOTA: u64 = i32::MAX as u64; #[cfg(test)] const LOW_PRIORITY: u32 = 1; @@ -35,6 +36,11 @@ const MEDIUM_PRIORITY: u32 = 8; #[cfg(test)] const HIGH_PRIORITY: u32 = 16; +// the global maxinum of virtual time is u64::MAX / 16, so when the virtual +// time of all groups are bigger than half of this value, we rest them to avoid +// virtual time overflow. +const RESET_VT_THRESHOLD: u64 = (u64::MAX >> 4) / 2; + pub enum ResourceConsumeType { CpuTime(Duration), IoBytes(u64), @@ -146,11 +152,6 @@ pub struct ResourceController { // increase the real cost after task is executed; but don't increase it at write because // the cost is known so we just pre-consume it. is_read: bool, - // Track the maximum ru quota used to calculate the factor of each resource group. - // factor = max_ru_quota / group_ru_quota * 10.0 - // We use mutex here to ensure when we need to change this value and do adjust all resource - // groups' factors, it can't be changed concurrently. - max_ru_quota: Mutex, // record consumption of each resource group, name --> resource_group resource_consumptions: RwLock, GroupPriorityTracker>>, @@ -162,7 +163,6 @@ impl ResourceController { let controller = Self { name, is_read, - max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), resource_consumptions: RwLock::new(HashMap::default()), last_min_vt: AtomicU64::new(0), }; @@ -175,12 +175,12 @@ impl ResourceController { controller } - fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + fn calculate_factor(mut quota: u64) -> u64 { + quota = min(quota, MAX_RU_QUOTA); if quota > 0 { - // we use max_quota / quota as the resource group factor, but because we need to - // cast the value to integer, so we times it by 10 to ensure the accuracy is - // enough. - (max_quota as f64 / quota as f64 * 10.0).round() as u64 + // the maxinum ru quota is very big, so the precision lost due to + // integer division is very small. 
+ MAX_RU_QUOTA / quota } else { 1 } @@ -191,13 +191,8 @@ impl ResourceController { // map 0 to medium priority(default priority) group_priority = MEDIUM_PRIORITY; } - let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); - if ru_quota > *max_ru_quota { - *max_ru_quota = ru_quota; - // adjust all group weight because the current value is too small. - self.adjust_all_resource_group_factors(ru_quota); - } - let weight = Self::calculate_factor(*max_ru_quota, ru_quota); + + let weight = Self::calculate_factor(ru_quota); let vt_delta_for_get = if self.is_read { DEFAULT_PRIORITY_PER_READ_TASK * weight @@ -205,7 +200,6 @@ impl ResourceController { 0 }; let group = GroupPriorityTracker { - ru_quota, group_priority, weight, virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), @@ -216,20 +210,6 @@ impl ResourceController { self.resource_consumptions.write().insert(name, group); } - // we calculate the weight of each resource group based on the currently maximum - // ru quota, if a incoming resource group has a bigger quota, we need to - // adjust all the existing groups. As we expect this won't happen very - // often, and iterate 10k entry cost less than 5ms, so the performance is - // acceptable. - fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { - self.resource_consumptions - .write() - .iter_mut() - .for_each(|(_, tracker)| { - tracker.weight = Self::calculate_factor(max_ru_quota, tracker.ru_quota); - }); - } - fn remove_resource_group(&self, name: &[u8]) { // do not remove the default resource group, reset to default setting instead. 
if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { @@ -267,30 +247,36 @@ impl ResourceController { .iter() .for_each(|(_, tracker)| { let vt = tracker.current_vt(); - if min_vt > vt { - min_vt = vt; - } - if max_vt < vt { - max_vt = vt; - } + min_vt = min(min_vt, vt); + max_vt = max(max_vt, vt); }); // TODO: use different threshold for different resource type // needn't do update if the virtual different is less than 100ms/100KB. - if min_vt + 100_000 >= max_vt { + if min_vt + 100_000 >= max_vt && max_vt < RESET_VT_THRESHOLD { return; } + let near_overflow = min_vt > RESET_VT_THRESHOLD; self.resource_consumptions .read() .iter() .for_each(|(_, tracker)| { let vt = tracker.current_vt(); - if vt < max_vt { + // NOTE: this decrease vt is not atomic across all resource groups, + // but it should be ok as this operation should be extremely rare + // and the impact is not big. + if near_overflow { + tracker.decrease_vt(RESET_VT_THRESHOLD - (max_vt - vt) / 2); + } else if vt < max_vt { // TODO: is increase by half is a good choice. tracker.increase_vt((max_vt - vt) / 2); } }); + if near_overflow { + info!("all reset groups' virtual time are near overflow, do reset"); + max_vt -= RESET_VT_THRESHOLD; + } // max_vt is actually a little bigger than the current min vt, but we don't // need totally accurate here. self.last_min_vt.store(max_vt, Ordering::Relaxed); @@ -323,8 +309,6 @@ fn concat_priority_vt(group_priority: u32, vt: u64) -> u64 { } struct GroupPriorityTracker { - // the ru setting of this group. - ru_quota: u64, group_priority: u32, weight: u64, virtual_time: AtomicU64, @@ -355,6 +339,11 @@ impl GroupPriorityTracker { self.virtual_time.fetch_add(vt_delta, Ordering::Relaxed); } + #[inline] + fn decrease_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_sub(vt_delta, Ordering::Relaxed); + } + // TODO: make it delta type as generic to avoid mixed consume different types. 
#[inline] fn consume(&self, delta: ResourceConsumeType) { @@ -462,26 +451,25 @@ pub(crate) mod tests { assert_eq!(resource_ctl.resource_consumptions.read().len(), 3); let group1 = resource_ctl.resource_group("test".as_bytes()); - assert_eq!(group1.weight, 500); let group2 = resource_ctl.resource_group("test2".as_bytes()); - assert_eq!(group2.weight, 250); + assert_eq!(group1.weight, group2.weight * 2); assert_eq!(group1.current_vt(), 0); let mut extras1 = Extras::single_level(); extras1.set_metadata("test".as_bytes().to_owned()); assert_eq!( resource_ctl.priority_of(&extras1), - concat_priority_vt(LOW_PRIORITY, 25_000) + concat_priority_vt(LOW_PRIORITY, group1.weight * 50) ); - assert_eq!(group1.current_vt(), 25_000); + assert_eq!(group1.current_vt(), group1.weight * 50); let mut extras2 = Extras::single_level(); extras2.set_metadata("test2".as_bytes().to_owned()); assert_eq!( resource_ctl.priority_of(&extras2), - concat_priority_vt(MEDIUM_PRIORITY, 12_500) + concat_priority_vt(MEDIUM_PRIORITY, group2.weight * 50) ); - assert_eq!(group2.current_vt(), 12_500); + assert_eq!(group2.current_vt(), group2.weight * 50); let mut extras3 = Extras::single_level(); extras3.set_metadata("unknown_group".as_bytes().to_owned()); @@ -505,13 +493,14 @@ pub(crate) mod tests { ResourceConsumeType::CpuTime(Duration::from_micros(10000)), ); - assert_eq!(group1.current_vt(), 5_025_000); + assert_eq!(group1.current_vt(), group1.weight * 10050); assert_eq!(group1.current_vt(), group2.current_vt() * 2); // test update all group vts resource_manager.advance_min_virtual_time(); let group1_vt = group1.current_vt(); - assert_eq!(group1_vt, 5_025_000); + let group1_weight = group1.weight; + assert_eq!(group1_vt, group1.weight * 10050); assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); assert!( resource_ctl @@ -524,45 +513,51 @@ pub(crate) mod tests { drop(group2); // test add 1 new resource group - let new_group = new_resource_group_ru("new_group".into(), 500, HIGH_PRIORITY); + let 
new_group = new_resource_group_ru("new_group".into(), 600, HIGH_PRIORITY); resource_manager.add_resource_group(new_group); assert_eq!(resource_ctl.resource_consumptions.read().len(), 4); let group3 = resource_ctl.resource_group("new_group".as_bytes()); - assert_eq!(group3.weight, 200); + assert!(group1_weight - 10 <= group3.weight * 3 && group3.weight * 3 <= group1_weight + 10); assert!(group3.current_vt() >= group1_vt / 2); } #[test] - fn test_adjust_resource_group_weight() { + fn test_reset_resource_group_vt() { let resource_manager = ResourceGroupManager::default(); - let resource_ctl = resource_manager.derive_controller("test_read".into(), true); - let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + let resource_ctl = resource_manager.derive_controller("test_write".into(), false); - let group1 = new_resource_group_ru("test1".into(), 5000, 0); + let group1 = new_resource_group_ru("g1".into(), i32::MAX as u64, 1); resource_manager.add_resource_group(group1); - assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); - assert_eq!( - resource_ctl_write.resource_group("test1".as_bytes()).weight, - 20 - ); + let group2 = new_resource_group_ru("g2".into(), 1, 16); + resource_manager.add_resource_group(group2); - // add a resource group with big ru - let group1 = new_resource_group_ru("test2".into(), 50000, 0); - resource_manager.add_resource_group(group1); - assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); - assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); - assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); - // resource_ctl_write should be unchanged. 
- assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); - assert_eq!( - resource_ctl_write.resource_group("test1".as_bytes()).weight, - 100 - ); - assert_eq!( - resource_ctl_write.resource_group("test2".as_bytes()).weight, - 10 - ); + let g1 = resource_ctl.resource_group("g1".as_bytes()); + let g2 = resource_ctl.resource_group("g2".as_bytes()); + let threshold = 1 << 59; + let mut last_g2_vt = 0; + for i in 0..8 { + resource_ctl.consume("g2".as_bytes(), ResourceConsumeType::IoBytes(1 << 25)); + resource_manager.advance_min_virtual_time(); + if i < 7 { + assert!(g2.current_vt() < threshold); + } + // after 8 round, g1's vt still under the threshold and is still increasing. + assert!(g1.current_vt() < threshold && g1.current_vt() > last_g2_vt); + last_g2_vt = g2.current_vt(); + } + + resource_ctl.consume("g2".as_bytes(), ResourceConsumeType::IoBytes(1 << 25)); + resource_manager.advance_min_virtual_time(); + assert!(g1.current_vt() > threshold); + + // adjust again, the virtual time of each group should decrease + resource_manager.advance_min_virtual_time(); + let g1_vt = g1.current_vt(); + let g2_vt = g2.current_vt(); + assert!(g2_vt < threshold / 2); + assert!(g1_vt < threshold / 2 && g1_vt < g2_vt); + assert_eq!(resource_ctl.last_min_vt.load(Ordering::Relaxed), g2_vt); } #[test] From c930237abab7f125c9d516e3e40d829d423e7a49 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Tue, 28 Mar 2023 13:26:53 +0800 Subject: [PATCH 605/676] raftstore-v2: implement snapshot backup for raftstore v2 (#14438) ref tikv/tikv#12842 Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot --- components/backup/src/endpoint.rs | 67 ++++++++-------- components/backup/src/service.rs | 90 ++++++++++++---------- components/backup/src/writer.rs | 57 +++++++------- components/engine_traits/src/sst.rs | 2 +- components/server/src/server.rs | 11 ++- components/server/src/server2.rs | 33 +++++++- components/test_backup/src/lib.rs | 4 +- components/test_raftstore-v2/src/server.rs | 4 +- 
components/test_raftstore/src/server.rs | 4 +- components/tikv_kv/src/lib.rs | 28 ++++++- src/import/mod.rs | 26 +------ src/import/sst_service.rs | 6 +- 12 files changed, 189 insertions(+), 143 deletions(-) diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 896020cf51a..4fb1705ebab 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -11,8 +11,7 @@ use std::{ use async_channel::SendError; use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use concurrency_manager::ConcurrencyManager; -use engine_rocks::RocksEngine; -use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, SstCompressionType}; +use engine_traits::{name_to_cf, raw_ttl::ttl_current_ts, CfName, KvEngine, SstCompressionType}; use external_storage::{BackendConfig, HdfsConfig}; use external_storage_export::{create_storage, ExternalStorage}; use futures::{channel::mpsc::*, executor::block_on}; @@ -28,7 +27,7 @@ use raftstore::coprocessor::RegionInfoProvider; use tikv::{ config::BackupConfig, storage::{ - kv::{CursorBuilder, Engine, ScanMode, SnapContext}, + kv::{CursorBuilder, Engine, LocalTablets, ScanMode, SnapContext}, mvcc::Error as MvccError, raw::raw_mvcc::RawMvccSnapshot, txn::{EntryBatch, Error as TxnError, SnapshotStore, TxnEntryScanner, TxnEntryStore}, @@ -163,12 +162,12 @@ pub struct BackupRange { /// The generic saveable writer. for generic `InMemBackupFiles`. /// Maybe what we really need is make Writer a trait... 
-enum KvWriter { - Txn(BackupWriter), - Raw(BackupRawKvWriter), +enum KvWriter { + Txn(BackupWriter), + Raw(BackupRawKvWriter), } -impl std::fmt::Debug for KvWriter { +impl std::fmt::Debug for KvWriter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Txn(_) => f.debug_tuple("Txn").finish(), @@ -177,7 +176,7 @@ impl std::fmt::Debug for KvWriter { } } -impl KvWriter { +impl KvWriter { async fn save(self, storage: &dyn ExternalStorage) -> Result> { match self { Self::Txn(writer) => writer.save(storage).await, @@ -194,8 +193,8 @@ impl KvWriter { } #[derive(Debug)] -struct InMemBackupFiles { - files: KvWriter, +struct InMemBackupFiles { + files: KvWriter, start_key: Vec, end_key: Vec, start_version: TimeStamp, @@ -203,8 +202,8 @@ struct InMemBackupFiles { region: Region, } -async fn save_backup_file_worker( - rx: async_channel::Receiver, +async fn save_backup_file_worker( + rx: async_channel::Receiver>, tx: UnboundedSender, storage: Arc, codec: KeyValueCodec, @@ -276,10 +275,10 @@ async fn save_backup_file_worker( /// Send the save task to the save worker. /// Record the wait time at the same time. 
-async fn send_to_worker_with_metrics( - tx: &async_channel::Sender, - files: InMemBackupFiles, -) -> std::result::Result<(), SendError> { +async fn send_to_worker_with_metrics( + tx: &async_channel::Sender>, + files: InMemBackupFiles, +) -> std::result::Result<(), SendError>> { let files = match tx.try_send(files) { Ok(_) => return Ok(()), Err(e) => e.into_inner(), @@ -294,12 +293,12 @@ impl BackupRange { /// Get entries from the scanner and save them to storage async fn backup( &self, - writer_builder: BackupWriterBuilder, + writer_builder: BackupWriterBuilder, mut engine: E, concurrency_manager: ConcurrencyManager, backup_ts: TimeStamp, begin_ts: TimeStamp, - saver: async_channel::Sender, + saver: async_channel::Sender>, storage_name: &str, ) -> Result { assert!(!self.codec.is_raw_kv); @@ -460,9 +459,9 @@ impl BackupRange { Ok(stat) } - fn backup_raw( + fn backup_raw( &self, - writer: &mut BackupRawKvWriter, + writer: &mut BackupRawKvWriter, snapshot: &S, ) -> Result { assert!(self.codec.is_raw_kv); @@ -524,14 +523,14 @@ impl BackupRange { async fn backup_raw_kv_to_file( &self, mut engine: E, - db: RocksEngine, + db: E::Local, limiter: &Limiter, file_name: String, cf: CfNameWrap, compression_type: Option, compression_level: i32, cipher: CipherInfo, - saver_tx: async_channel::Sender, + saver_tx: async_channel::Sender>, ) -> Result { let mut writer = match BackupRawKvWriter::new( db, @@ -679,7 +678,7 @@ pub struct Endpoint { store_id: u64, pool: RefCell, io_pool: Runtime, - db: RocksEngine, + tablets: LocalTablets, config_manager: ConfigManager, concurrency_manager: ConcurrencyManager, softlimit: SoftLimitKeeper, @@ -834,7 +833,7 @@ impl Endpoint { store_id: u64, engine: E, region_info: R, - db: RocksEngine, + tablets: LocalTablets, config: BackupConfig, concurrency_manager: ConcurrencyManager, api_version: ApiVersion, @@ -850,7 +849,7 @@ impl Endpoint { engine, region_info, pool: RefCell::new(pool), - db, + tablets, io_pool: rt, softlimit, config_manager, @@ 
-885,14 +884,14 @@ impl Endpoint { &self, prs: Arc>>, request: Request, - saver_tx: async_channel::Sender, + saver_tx: async_channel::Sender>, resp_tx: UnboundedSender, _backend: Arc, ) { let start_ts = request.start_ts; let backup_ts = request.end_ts; let engine = self.engine.clone(); - let db = self.db.clone(); + let tablets = self.tablets.clone(); let store_id = self.store_id; let concurrency_manager = self.concurrency_manager.clone(); let batch_size = self.config_manager.0.read().unwrap().batch_size; @@ -947,12 +946,19 @@ impl Endpoint { }); let name = backup_file_name(store_id, &brange.region, key, _backend.name()); let ct = to_sst_compression_type(request.compression_type); + let db = match tablets.get(brange.region.id) { + Some(t) => t, + None => { + warn!("backup region not found"; "region" => ?brange.region.id); + return; + } + }; let stat = if is_raw_kv { brange .backup_raw_kv_to_file( engine, - db.clone(), + db.into_owned(), &request.limiter, name, cf.into(), @@ -967,7 +973,7 @@ impl Endpoint { store_id, request.limiter.clone(), brange.region.clone(), - db.clone(), + db.into_owned(), ct, request.compression_level, sst_max_size, @@ -1270,6 +1276,7 @@ pub mod tests { use tikv::{ coprocessor::checksum_crc64_xor, storage::{ + kv::LocalTablets, txn::tests::{must_commit, must_prewrite_put}, RocksEngine, TestEngineBuilder, }, @@ -1402,7 +1409,7 @@ pub mod tests { 1, rocks, MockRegionInfoProvider::new(need_encode_key), - db, + LocalTablets::Singleton(db), BackupConfig { num_threads: 4, batch_size: 8, diff --git a/components/backup/src/service.rs b/components/backup/src/service.rs index dd3355b1e92..237234c061e 100644 --- a/components/backup/src/service.rs +++ b/components/backup/src/service.rs @@ -1,47 +1,53 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{marker::PhantomData, sync::atomic::*}; +use std::sync::atomic::*; -use engine_traits::KvEngine; +use engine_traits::{KvEngine, RaftEngine}; use futures::{channel::mpsc, FutureExt, SinkExt, StreamExt, TryFutureExt}; use grpcio::{self, *}; use kvproto::brpb::*; -use raftstore::{ - router::RaftStoreRouter, - store::msg::{PeerMsg, SignificantMsg}, +use raftstore::store::{ + fsm::store::RaftRouter, + msg::{PeerMsg, SignificantMsg}, }; use tikv_util::{error, info, worker::*}; use super::Task; /// Service handles the RPC messages for the `Backup` service. - #[derive(Clone)] -pub struct Service { +pub struct Service { scheduler: Scheduler, - router: RR, - _phantom: PhantomData, + router: Option>, } -impl Service +impl Service where - E: KvEngine, - RR: RaftStoreRouter, + EK: KvEngine, + ER: RaftEngine, { - /// Create a new backup service. - pub fn new(scheduler: Scheduler, router: RR) -> Self { + // Create a new backup service without router, this used for raftstore v2. + // because we don't have RaftStoreRouter any more. + pub fn new(scheduler: Scheduler) -> Self { + Service { + scheduler, + router: None, + } + } + + // Create a new backup service with router, this used for raftstore v1. 
+ pub fn with_router(scheduler: Scheduler, router: RaftRouter) -> Self { Service { scheduler, - router, - _phantom: PhantomData, + router: Some(router), } } } -impl Backup for Service +impl Backup for Service where - E: KvEngine, - RR: RaftStoreRouter, + EK: KvEngine, + ER: RaftEngine, { fn check_pending_admin_op( &mut self, @@ -50,25 +56,33 @@ where mut sink: ServerStreamingSink, ) { let (tx, rx) = mpsc::unbounded(); - self.router.broadcast_normal(|| { - PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) - }); - - let send_task = async move { - let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); - sink.send_all(&mut s).await?; - sink.close().await?; - Ok(()) - } - .map(|res: Result<()>| match res { - Ok(_) => { - info!("check admin closed"); + match &self.router { + Some(router) => { + router.broadcast_normal(|| { + PeerMsg::SignificantMsg(SignificantMsg::CheckPendingAdmin(tx.clone())) + }); + let send_task = async move { + let mut s = rx.map(|resp| Ok((resp, WriteFlags::default()))); + sink.send_all(&mut s).await?; + sink.close().await?; + Ok(()) + } + .map(|res: Result<()>| match res { + Ok(_) => { + info!("check admin closed"); + } + Err(e) => { + error!("check admin canceled"; "error" => ?e); + } + }); + ctx.spawn(send_task); } - Err(e) => { - error!("check admin canceled"; "error" => ?e); + None => { + // check pending admin reqeust is used for EBS Backup. + // for raftstore v2. we don't need it for now. 
so just return unimplemented + unimplemented_call!(ctx, sink) } - }); - ctx.spawn(send_task); + } } fn backup( @@ -131,7 +145,6 @@ mod tests { use engine_rocks::RocksEngine; use external_storage_export::make_local_backend; - use raftstore::router::RaftStoreBlackHole; use tikv::storage::txn::tests::{must_commit, must_prewrite_put}; use tikv_util::worker::{dummy_scheduler, ReceiverWrapper}; use txn_types::TimeStamp; @@ -142,8 +155,7 @@ mod tests { fn new_rpc_suite() -> (Server, BackupClient, ReceiverWrapper) { let env = Arc::new(EnvBuilder::new().build()); let (scheduler, rx) = dummy_scheduler(); - let backup_service = - super::Service::::new(scheduler, RaftStoreBlackHole); + let backup_service = super::Service::::new(scheduler); let builder = ServerBuilder::new(env.clone()).register_service(create_backup(backup_service)); let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); diff --git a/components/backup/src/writer.rs b/components/backup/src/writer.rs index 7a853fe485f..715c4f68291 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -3,10 +3,9 @@ use std::{fmt::Display, io::Read}; use encryption::{EncrypterReader, Iv}; -use engine_rocks::{RocksEngine, RocksSstWriter, RocksSstWriterBuilder}; use engine_traits::{ - CfName, ExternalSstFileInfo, SstCompressionType, SstWriter, SstWriterBuilder, CF_DEFAULT, - CF_WRITE, + CfName, ExternalSstFileInfo, KvEngine, SstCompressionType, SstExt, SstWriter, SstWriterBuilder, + CF_DEFAULT, CF_WRITE, }; use external_storage_export::{ExternalStorage, UnpinReader}; use file_system::Sha256Reader; @@ -49,16 +48,16 @@ impl From for CfName { } } -struct Writer { - writer: RocksSstWriter, +struct Writer { + writer: W, total_kvs: u64, total_bytes: u64, checksum: u64, digest: crc64fast::Digest, } -impl Writer { - fn new(writer: RocksSstWriter) -> Self { +impl Writer { + fn new(writer: W) -> Self { Writer { writer, total_kvs: 0, @@ -98,9 +97,7 @@ impl Writer { Ok(()) } - // FIXME: we cannot get 
sst_info in [save_and_build_file], which may cause the - // !Send type [RocksEnternalSstFileInfo] sent between threads. - fn finish_read(writer: RocksSstWriter) -> Result<(u64, impl Read)> { + fn finish_read(writer: W) -> Result<(u64, impl Read)> { let (sst_info, sst_reader) = writer.finish_read()?; Ok((sst_info.file_size(), sst_reader)) } @@ -163,28 +160,28 @@ impl Writer { } } -pub struct BackupWriterBuilder { +pub struct BackupWriterBuilder { store_id: u64, limiter: Limiter, region: Region, - db: RocksEngine, + db: EK, compression_type: Option, compression_level: i32, sst_max_size: u64, cipher: CipherInfo, } -impl BackupWriterBuilder { +impl BackupWriterBuilder { pub fn new( store_id: u64, limiter: Limiter, region: Region, - db: RocksEngine, + db: EK, compression_type: Option, compression_level: i32, sst_max_size: u64, cipher: CipherInfo, - ) -> BackupWriterBuilder { + ) -> BackupWriterBuilder { Self { store_id, limiter, @@ -197,7 +194,7 @@ impl BackupWriterBuilder { } } - pub fn build(&self, start_key: Vec, storage_name: &str) -> Result { + pub fn build(&self, start_key: Vec, storage_name: &str) -> Result> { let key = file_system::sha256(&start_key).ok().map(hex::encode); let store_id = self.store_id; let name = backup_file_name(store_id, &self.region, key, storage_name); @@ -214,34 +211,34 @@ impl BackupWriterBuilder { } /// A writer writes txn entries into SST files. -pub struct BackupWriter { +pub struct BackupWriter { name: String, - default: Writer, - write: Writer, + default: Writer<::SstWriter>, + write: Writer<::SstWriter>, limiter: Limiter, sst_max_size: u64, cipher: CipherInfo, } -impl BackupWriter { +impl BackupWriter { /// Create a new BackupWriter. 
pub fn new( - db: RocksEngine, + db: EK, name: &str, compression_type: Option, compression_level: i32, limiter: Limiter, sst_max_size: u64, cipher: CipherInfo, - ) -> Result { - let default = RocksSstWriterBuilder::new() + ) -> Result> { + let default = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_DEFAULT) .set_db(&db) .set_compression_type(compression_type) .set_compression_level(compression_level) .build(name)?; - let write = RocksSstWriterBuilder::new() + let write = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(CF_WRITE) .set_db(&db) @@ -338,19 +335,19 @@ impl BackupWriter { } /// A writer writes Raw kv into SST files. -pub struct BackupRawKvWriter { +pub struct BackupRawKvWriter { name: String, cf: CfName, - writer: Writer, + writer: Writer<::SstWriter>, limiter: Limiter, cipher: CipherInfo, codec: KeyValueCodec, } -impl BackupRawKvWriter { +impl BackupRawKvWriter { /// Create a new BackupRawKvWriter. pub fn new( - db: RocksEngine, + db: EK, name: &str, cf: CfNameWrap, limiter: Limiter, @@ -358,8 +355,8 @@ impl BackupRawKvWriter { compression_level: i32, cipher: CipherInfo, codec: KeyValueCodec, - ) -> Result { - let writer = RocksSstWriterBuilder::new() + ) -> Result> { + let writer = ::SstWriterBuilder::new() .set_in_memory(true) .set_cf(cf.into()) .set_db(&db) diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index a97fe7a8b87..ea08df3bb50 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -28,7 +28,7 @@ pub trait SstReader: RefIterable + Sized { /// SstWriter is used to create sst files that can be added to database later. pub trait SstWriter: Send { type ExternalSstFileInfo: ExternalSstFileInfo; - type ExternalSstFileReader: std::io::Read; + type ExternalSstFileReader: std::io::Read + Send; /// Add key, value to currently opened file /// REQUIRES: key is after any previously added key according to comparator. 
diff --git a/components/server/src/server.rs b/components/server/src/server.rs index b9563f295b5..e37c6f9fe3b 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -91,7 +91,7 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - import::{ImportSstService, LocalTablets, SstImporter}, + import::{ImportSstService, SstImporter}, read_pool::{ build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, }, @@ -111,6 +111,7 @@ use tikv::{ storage::{ self, config_manager::StorageConfigManger, + kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, txn::flow_controller::{EngineFlowController, FlowController}, Engine, Storage, @@ -1319,10 +1320,8 @@ where // Backup service. let mut backup_worker = Box::new(self.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); - let backup_service = backup::Service::>::new( - backup_scheduler, - self.router.clone(), - ); + let backup_service = + backup::Service::::with_router(backup_scheduler, self.router.clone()); if servers .server .register_service(create_backup(backup_service)) @@ -1335,7 +1334,7 @@ where servers.node.id(), engines.engine.clone(), self.region_info_accessor.clone(), - engines.engines.kv.clone(), + LocalTablets::Singleton(engines.engines.kv.clone()), self.config.backup.clone(), self.concurrency_manager.clone(), self.config.storage.api_version(), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index ef38c3e2286..6c96ce62ffb 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -48,7 +48,7 @@ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; use kvproto::{ - deadlock::create_deadlock, diagnosticspb::create_diagnostics, + brpb::create_backup, deadlock::create_deadlock, 
diagnosticspb::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, resource_usage_agent::create_resource_metering_pub_sub, }; @@ -74,7 +74,7 @@ use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - import::{ImportSstService, LocalTablets, SstImporter}, + import::{ImportSstService, SstImporter}, read_pool::{ build_yatp_read_pool, ReadPool, ReadPoolConfigManager, UPDATE_EWMA_TIME_SLICE_INTERVAL, }, @@ -92,6 +92,7 @@ use tikv::{ storage::{ self, config_manager::StorageConfigManger, + kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, txn::flow_controller::{FlowController, TabletFlowController}, Engine, Storage, @@ -981,6 +982,34 @@ where let servers = self.servers.as_mut().unwrap(); let engines = self.engines.as_ref().unwrap(); + // Backup service. + let mut backup_worker = Box::new(self.background_worker.lazy_build("backup-endpoint")); + let backup_scheduler = backup_worker.scheduler(); + let backup_service = backup::Service::::new(backup_scheduler); + if servers + .server + .register_service(create_backup(backup_service)) + .is_some() + { + fatal!("failed to register backup service"); + } + + let backup_endpoint = backup::Endpoint::new( + self.node.as_ref().unwrap().id(), + engines.engine.clone(), + self.region_info_accessor.clone().unwrap(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + self.config.backup.clone(), + self.concurrency_manager.clone(), + self.config.storage.api_version(), + self.causal_ts_provider.clone(), + ); + self.cfg_controller.as_mut().unwrap().register( + tikv::config::Module::Backup, + Box::new(backup_endpoint.get_config_manager()), + ); + backup_worker.start(backup_endpoint); + // Import SST service. 
let import_service = ImportSstService::new( self.config.import.clone(), diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index cb669070b9e..34eb6e8aa9e 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -26,7 +26,7 @@ use tikv::{ config::BackupConfig, coprocessor::{checksum_crc64_xor, dag::TikvStorage}, storage::{ - kv::{Engine, SnapContext}, + kv::{Engine, LocalTablets, SnapContext}, SnapshotStore, }, }; @@ -85,7 +85,7 @@ impl TestSuite { *id, sim.storages[id].clone(), sim.region_info_accessors[id].clone(), - engines.kv.clone(), + LocalTablets::Singleton(engines.kv.clone()), BackupConfig { num_threads: 4, batch_size: 8, diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index cc09dd09c4c..ed2a44d80fa 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -49,7 +49,7 @@ use test_pd_client::TestPdClient; use test_raftstore::{filter_send, AddressMap, Config, Filter}; use tikv::{ coprocessor, coprocessor_v2, - import::{ImportSstService, LocalTablets, SstImporter}, + import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -59,7 +59,7 @@ use tikv::{ }, storage::{ self, - kv::{FakeExtension, RaftExtension, SnapContext}, + kv::{FakeExtension, LocalTablets, RaftExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, Storage, }, diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 54da33fa3dd..e7b43850e27 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -50,7 +50,7 @@ use test_pd_client::TestPdClient; use tikv::{ config::ConfigController, coprocessor, coprocessor_v2, - import::{ImportSstService, LocalTablets, SstImporter}, + import::{ImportSstService, SstImporter}, 
read_pool::ReadPool, server::{ gc_worker::GcWorker, @@ -65,7 +65,7 @@ use tikv::{ }, storage::{ self, - kv::{FakeExtension, SnapContext}, + kv::{FakeExtension, LocalTablets, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, Storage, }, diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index c5313620995..05d039d2690 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -25,6 +25,7 @@ mod rocksdb_engine; mod stats; use std::{ + borrow::Cow, cell::UnsafeCell, error, num::NonZeroU64, @@ -35,8 +36,8 @@ use std::{ use collections::HashMap; use engine_traits::{ - CfName, IterOptions, KvEngine as LocalEngine, Mutable, MvccProperties, ReadOptions, WriteBatch, - CF_DEFAULT, CF_LOCK, + CfName, IterOptions, KvEngine as LocalEngine, Mutable, MvccProperties, ReadOptions, + TabletRegistry, WriteBatch, CF_DEFAULT, CF_LOCK, }; use error_code::{self, ErrorCode, ErrorCodeExt}; use futures::{compat::Future01CompatExt, future::BoxFuture, prelude::*}; @@ -784,6 +785,29 @@ pub fn write_modifies(kv_engine: &impl LocalEngine, modifies: Vec) -> Re Ok(()) } +#[derive(Clone)] +pub enum LocalTablets { + Singleton(EK), + Registry(TabletRegistry), +} + +impl LocalTablets { + /// Get the tablet of the given region. + /// + /// If `None` is returned, the region may not exist or may not initialized. + /// If there are multiple versions of tablet, the latest one is returned + /// with best effort. 
+ pub fn get(&self, region_id: u64) -> Option> { + match self { + LocalTablets::Singleton(tablet) => Some(Cow::Borrowed(tablet)), + LocalTablets::Registry(registry) => { + let mut cached = registry.get(region_id)?; + cached.latest().cloned().map(Cow::Owned) + } + } + } +} + pub const TEST_ENGINE_CFS: &[CfName] = &[CF_DEFAULT, "cf"]; pub mod tests { diff --git a/src/import/mod.rs b/src/import/mod.rs index 7ee5647f723..e2fa3729e52 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -15,9 +15,8 @@ mod duplicate_detect; mod sst_service; -use std::{borrow::Cow, fmt::Debug}; +use std::fmt::Debug; -use engine_traits::TabletRegistry; use grpcio::{RpcStatus, RpcStatusCode}; pub use sst_importer::{Config, Error, Result, SstImporter, TxnSstWriter}; @@ -49,26 +48,3 @@ macro_rules! send_rpc_response { let _ = res.map_err(|e| warn!("send rpc response"; "err" => %e)).await; }}; } - -#[derive(Clone)] -pub enum LocalTablets { - Singleton(EK), - Registry(TabletRegistry), -} - -impl LocalTablets { - /// Get the tablet of the given region. - /// - /// If `None` is returned, the region may not exist or may not initialized. - /// If there are multiple versions of tablet, the latest one is returned - /// with best effort. 
- fn get(&self, region_id: u64) -> Option> { - match self { - LocalTablets::Singleton(tablet) => Some(Cow::Borrowed(tablet)), - LocalTablets::Registry(registry) => { - let mut cached = registry.get(region_id)?; - cached.latest().cloned().map(Cow::Owned) - } - } - } -} diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index b23046bfe4b..12cb0ca892b 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -29,7 +29,9 @@ use sst_importer::{ error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, ConfigManager, Error, Result, SstImporter, }; -use tikv_kv::{Engine, Modify, SnapContext, Snapshot, SnapshotExt, WriteData, WriteEvent}; +use tikv_kv::{ + Engine, LocalTablets, Modify, SnapContext, Snapshot, SnapshotExt, WriteData, WriteEvent, +}; use tikv_util::{ config::ReadableSize, future::create_stream_with_buffer, @@ -40,7 +42,7 @@ use tikv_util::{ use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; -use super::{make_rpc_error, LocalTablets}; +use super::make_rpc_error; use crate::{ import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE, From 5a2ff323d6fae82c624ad802e8100d3154d01ba1 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 28 Mar 2023 16:26:54 +0800 Subject: [PATCH 606/676] config: fix alias name snap-max-write-bytes-per-sec (#14463) close tikv/tikv#14455 Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- src/server/config.rs | 2 +- tests/integrations/config/mod.rs | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/server/config.rs b/src/server/config.rs index 5f15e72ae2f..d954ebac36f 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -146,7 +146,7 @@ pub struct Config { #[serde(with = "perf_level_serde")] #[online_config(skip)] pub end_point_perf_level: PerfLevel, - #[serde(alias = "snap_max_write_bytes_per_sec")] + #[serde(alias = "snap-max-write-bytes-per-sec")] pub snap_io_max_bytes_per_sec: 
ReadableSize, pub snap_max_total_size: ReadableSize, #[online_config(skip)] diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index ff6807fa6a1..02b5c711e96 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -903,3 +903,24 @@ fn test_log_backward_compatible() { assert_eq!(cfg.log.format, LogFormat::Json); assert_eq!(cfg.log.file.max_size, 1024); } + +#[test] +fn test_rename_compatibility() { + let old_content = r#" +[server] +snap-max-write-bytes-per-sec = "10MiB" + +[storage] +engine = "raft-kv2" + "#; + let new_content = r#" +[server] +snap-io-max-bytes-per-sec = "10MiB" + +[storage] +engine = "partitioned-raft-kv" + "#; + let old_cfg: TikvConfig = toml::from_str(old_content).unwrap(); + let new_cfg: TikvConfig = toml::from_str(new_content).unwrap(); + assert_eq_debug(&old_cfg, &new_cfg); +} From 9eeda1416ff050cc6468c9ded18b9719545a6691 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 28 Mar 2023 17:12:54 -0700 Subject: [PATCH 607/676] fix io breakdown for foreground write (#14456) ref tikv/tikv#12842 async io thread's write should be foreground write Signed-off-by: qi.xu Co-authored-by: qi.xu --- components/raftstore/src/store/async_io/write.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index e94f7360c23..0da8d1546b5 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -20,6 +20,7 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; +use file_system::{set_io_type, IoType}; use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; use parking_lot::Mutex; use protobuf::Message; @@ -1026,6 +1027,7 @@ where thread::Builder::new() .name(thd_name!(tag)) .spawn_wrapper(move || { + set_io_type(IoType::ForegroundWrite); worker.run(); })?; cached_senders.push(tx); From 
f8bf08c567ada625db421c949b0de3757e16589b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Wed, 29 Mar 2023 11:52:55 +0800 Subject: [PATCH 608/676] log-backup: Using PD as metastore (#14278) close tikv/tikv#13867 This also makes `etcd-client` and `tonic` optional requirements, you can enable them by `metastore-etcd`. Signed-off-by: hillium Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- Cargo.lock | 3 + components/backup-stream/Cargo.toml | 11 +- components/backup-stream/src/endpoint.rs | 1 - components/backup-stream/src/errors.rs | 5 + .../backup-stream/src/metadata/client.rs | 165 +++------ components/backup-stream/src/metadata/keys.rs | 30 +- components/backup-stream/src/metadata/mod.rs | 1 + .../backup-stream/src/metadata/store/mod.rs | 35 +- .../backup-stream/src/metadata/store/pd.rs | 324 ++++++++++++++++++ components/backup-stream/src/metadata/test.rs | 16 +- components/pd_client/src/client.rs | 98 +++++- components/pd_client/src/lib.rs | 1 + components/pd_client/src/meta_storage.rs | 302 ++++++++++++++++ components/pd_client/src/metrics.rs | 4 + components/pd_client/src/util.rs | 6 + components/server/src/server.rs | 22 +- components/test_pd/src/mocker/meta_storage.rs | 113 ++++++ components/test_pd/src/mocker/mod.rs | 21 +- components/test_pd/src/server.rs | 46 ++- components/tikv_util/src/codec/mod.rs | 28 ++ 20 files changed, 1058 insertions(+), 174 deletions(-) create mode 100644 components/backup-stream/src/metadata/store/pd.rs create mode 100644 components/pd_client/src/meta_storage.rs create mode 100644 components/test_pd/src/mocker/meta_storage.rs diff --git a/Cargo.lock b/Cargo.lock index 1cb40d842cd..62746ba6bcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -511,6 +511,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", + "cfg-if 1.0.0", "chrono", "concurrency_manager", "crossbeam", @@ -536,6 +537,7 @@ dependencies = [ "online_config", "openssl", "pd_client", + 
"pin-project", "prometheus", "protobuf", "raft", @@ -548,6 +550,7 @@ dependencies = [ "slog-global", "tempdir", "tempfile", + "test_pd", "test_raftstore", "test_util", "thiserror", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index f3f1b482be0..d6d6f7a6fc4 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -11,6 +11,8 @@ test-engines-rocksdb = ["tikv/test-engines-rocksdb"] failpoints = ["tikv/failpoints", "fail/failpoints"] backup-stream-debug = [] +metastore-etcd = ["tonic", "etcd-client"] + [[test]] name = "integration" path = "tests/mod.rs" @@ -22,6 +24,7 @@ harness = true async-compression = { version = "0.3.14", features = ["tokio", "zstd"] } async-trait = { version = "0.1" } bytes = "1" +cfg-if = "1" chrono = "0.4" concurrency_manager = { workspace = true } crossbeam = "0.8" @@ -32,7 +35,7 @@ engine_traits = { workspace = true } error_code = { workspace = true } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. 
-etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "41d393c32a7a7c728550cee1d9a138dafe6f3e27", features = ["pub-response-field", "tls-openssl-vendored"] } +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "41d393c32a7a7c728550cee1d9a138dafe6f3e27", features = ["pub-response-field", "tls-openssl-vendored"], optional = true } external_storage = { workspace = true } external_storage_export = { workspace = true } fail = "0.5" @@ -49,10 +52,12 @@ log_wrappers = { workspace = true } online_config = { workspace = true } openssl = "0.10" pd_client = { workspace = true } +pin-project = "1.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } +rand = "0.8.0" regex = "1" resolved_ts = { workspace = true } security = { path = "../security" } @@ -67,7 +72,7 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["compat"] } -tonic = "0.8" +tonic = { version = "0.8", optional = true } txn_types = { workspace = true } uuid = "0.8" yatp = { workspace = true } @@ -78,9 +83,9 @@ engine_panic = { workspace = true } grpcio = { workspace = true } hex = "0.4" protobuf = { version = "2.8", features = ["bytes"] } -rand = "0.8.0" tempdir = "0.3" tempfile = "3.0" +test_pd = { workspace = true } test_raftstore = { workspace = true } test_util = { workspace = true } url = "2" diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index d8c0e09744f..68f040217ea 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -620,7 +620,6 @@ where let task_clone = task.clone(); let run = async move { let task_name = task.info.get_name(); - 
cli.init_task(&task.info).await?; let ranges = cli.ranges_of_task(task_name).await?; info!( "register backup stream ranges"; diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index 2fecf0ac514..c3cc91da9ff 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -5,6 +5,7 @@ use std::{ }; use error_code::ErrorCodeExt; +#[cfg(feature = "metastore-etcd")] use etcd_client::Error as EtcdError; use grpcio::Error as GrpcError; use kvproto::{errorpb::Error as StoreError, metapb::*}; @@ -21,6 +22,7 @@ use crate::{endpoint::Task, metrics}; pub enum Error { #[error("gRPC meet error {0}")] Grpc(#[from] GrpcError), + #[cfg(feature = "metastore-etcd")] #[error("Etcd meet error {0}")] Etcd(#[from] EtcdErrorExt), #[error("Protobuf meet error {0}")] @@ -52,12 +54,14 @@ pub enum Error { Other(#[from] Box), } +#[cfg(feature = "metastore-etcd")] impl From for Error { fn from(value: EtcdError) -> Self { Self::Etcd(value.into()) } } +#[cfg(feature = "metastore-etcd")] #[derive(ThisError, Debug)] pub enum EtcdErrorExt { #[error("{0}")] @@ -72,6 +76,7 @@ impl ErrorCodeExt for Error { fn error_code(&self) -> error_code::ErrorCode { use error_code::backup_stream::*; match self { + #[cfg(feature = "metastore-etcd")] Error::Etcd(_) => ETCD, Error::Protobuf(_) => PROTO, Error::NoSuchTask { .. 
} => NO_SUCH_TASK, diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 97e8d2140b5..fca8a07b654 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -15,8 +15,8 @@ use super::{ checkpoint_cache::CheckpointCache, keys::{self, KeyValue, MetaKey}, store::{ - CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, - Subscription, Transaction, WithRevision, + CondTransaction, Condition, Keys, KvEvent, KvEventType, MetaStore, Snapshot, Subscription, + Transaction, WithRevision, }, }; use crate::{ @@ -48,6 +48,7 @@ impl Debug for StreamTask { .field("table_filter", &self.info.table_filter) .field("start_ts", &self.info.start_ts) .field("end_ts", &self.info.end_ts) + .field("is_paused", &self.is_paused) .finish() } } @@ -292,8 +293,7 @@ impl MetadataClient { ) -> Result> { let key = MetaKey::last_error_of(name, store_id); - let s = self.meta_store.snapshot().await?; - let r = s.get(Keys::Key(key)).await?; + let r = self.meta_store.get_latest(Keys::Key(key)).await?.inner; if r.is_empty() { return Ok(None); } @@ -304,8 +304,11 @@ impl MetadataClient { /// check whether the task is paused. pub async fn check_task_paused(&self, name: &str) -> Result { - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Key(MetaKey::pause_of(name))).await?; + let kvs = self + .meta_store + .get_latest(Keys::Key(MetaKey::pause_of(name))) + .await? + .inner; Ok(!kvs.is_empty()) } @@ -317,8 +320,11 @@ impl MetadataClient { } pub async fn get_tasks_pause_status(&self) -> Result, bool>> { - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Prefix(MetaKey::pause_prefix())).await?; + let kvs = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::pause_prefix())) + .await? 
+ .inner; let mut pause_hash = HashMap::new(); let prefix_len = MetaKey::pause_prefix_len(); @@ -338,10 +344,9 @@ impl MetadataClient { } let items = self .meta_store - .snapshot() + .get_latest(Keys::Key(MetaKey::task_of(name))) .await? - .get(Keys::Key(MetaKey::task_of(name))) - .await?; + .inner; if items.is_empty() { return Ok(None); } @@ -362,11 +367,13 @@ impl MetadataClient { "faild to connect etcd client".to_string(), )) }); - let snap = self.meta_store.snapshot().await?; - let kvs = snap.get(Keys::Prefix(MetaKey::tasks())).await?; + let kvs = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::tasks())) + .await?; - let mut tasks = Vec::with_capacity(kvs.len()); - for kv in kvs { + let mut tasks = Vec::with_capacity(kvs.inner.len()); + for kv in kvs.inner { let t = protobuf::parse_from_bytes::(kv.value())?; let paused = self.check_task_paused(t.get_name()).await?; tasks.push(StreamTask { @@ -376,7 +383,7 @@ impl MetadataClient { } Ok(WithRevision { inner: tasks, - revision: snap.revision(), + revision: kvs.revision, }) } @@ -455,13 +462,14 @@ impl MetadataClient { defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - let ts = snap - .get(Keys::Key(MetaKey::storage_checkpoint_of( + let ts = self + .meta_store + .get_latest(Keys::Key(MetaKey::storage_checkpoint_of( task_name, self.store_id, ))) - .await?; + .await? + .inner; match ts.as_slice() { [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), @@ -488,13 +496,14 @@ impl MetadataClient { defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_step"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - let ts = snap - .get(Keys::Key(MetaKey::next_backup_ts_of( + let ts = self + .meta_store + .get_latest(Keys::Key(MetaKey::next_backup_ts_of( task_name, self.store_id, ))) - .await?; + .await? 
+ .inner; match ts.as_slice() { [ts, ..] => Ok(TimeStamp::new(parse_ts_from_bytes(ts.value())?)), @@ -507,96 +516,16 @@ impl MetadataClient { &self, task_name: &str, ) -> Result, Vec)>>> { - let snap = self.meta_store.snapshot().await?; - let ranges = snap - .get(Keys::Prefix(MetaKey::ranges_of(task_name))) + let ranges = self + .meta_store + .get_latest(Keys::Prefix(MetaKey::ranges_of(task_name))) .await?; - Ok(WithRevision { - revision: snap.revision(), - inner: ranges - .into_iter() + Ok(ranges.map(|rs| { + rs.into_iter() .map(|mut kv: KeyValue| kv.take_range(task_name)) - .collect(), - }) - } - - /// Perform a two-phase bisection search algorithm for the intersection of - /// all ranges and the specificated range (usually region range.) - /// TODO: explain the algorithm? - pub async fn range_overlap_of_task( - &self, - task_name: &str, - (start_key, end_key): (Vec, Vec), - ) -> Result, Vec)>>> { - let now = Instant::now(); - defer! { - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_range_search"]).observe(now.saturating_elapsed().as_secs_f64()) - } - let snap = self.meta_store.snapshot().await?; - - let mut prev = snap - .get_extra( - Keys::Range( - MetaKey::ranges_of(task_name), - MetaKey::range_of(task_name, &start_key), - ), - GetExtra { - desc_order: true, - limit: 1, - ..Default::default() - }, - ) - .await?; - let all = snap - .get(Keys::Range( - MetaKey::range_of(task_name, &start_key), - MetaKey::range_of(task_name, &end_key), - )) - .await?; - - let mut result = Vec::with_capacity(all.len() + 1); - if !prev.kvs.is_empty() { - let kv = &mut prev.kvs[0]; - if kv.value() > start_key.as_slice() { - result.push(kv.take_range(task_name)); - } - } - for mut kv in all { - result.push(kv.take_range(task_name)); - } - Ok(WithRevision { - revision: snap.revision(), - inner: result, - }) - } - - /// access the next backup ts of some task and some region. 
- pub async fn progress_of_task(&self, task_name: &str) -> Result { - let now = Instant::now(); - defer! { - super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_progress_get"]).observe(now.saturating_elapsed().as_secs_f64()) - } - let task = self.get_task(task_name).await?; - if task.is_none() { - return Err(Error::NoSuchTask { - task_name: task_name.to_owned(), - }); - } - - let timestamp = self.meta_store.snapshot().await?; - let items = timestamp - .get(Keys::Key(MetaKey::next_backup_ts_of( - task_name, - self.store_id, - ))) - .await?; - if items.is_empty() { - Ok(task.unwrap().info.start_ts) - } else { - assert_eq!(items.len(), 1); - parse_ts_from_bytes(items[0].1.as_slice()) - } + .collect() + })) } pub async fn checkpoints_of(&self, task_name: &str) -> Result> { @@ -604,10 +533,10 @@ impl MetadataClient { defer! { super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["checkpoints_of"]).observe(now.saturating_elapsed().as_secs_f64()) } - let snap = self.meta_store.snapshot().await?; - let checkpoints = snap - .get(Keys::Prefix(MetaKey::next_backup_ts(task_name))) + let checkpoints = self.meta_store + .get_latest(Keys::Prefix(MetaKey::next_backup_ts(task_name))) .await? + .inner .iter() .filter_map(|kv| { Checkpoint::from_kv(kv) @@ -674,6 +603,7 @@ impl MetadataClient { /// remove some task, without the ranges. /// only for testing. + #[cfg(test)] pub async fn remove_task(&self, name: &str) -> Result<()> { self.meta_store .delete(Keys::Key(MetaKey::task_of(name))) @@ -722,8 +652,11 @@ impl MetadataClient { return Ok(c); } let key = MetaKey::next_bakcup_ts_of_region(task, region); - let s = self.meta_store.snapshot().await?; - let r = s.get(Keys::Key(key.clone())).await?; + let r = self + .meta_store + .get_latest(Keys::Key(key.clone())) + .await? 
+ .inner; let cp = match r.len() { 0 => { let global_cp = self.global_checkpoint_of(task).await?; diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index f7a2c960ec4..26b04abe16f 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -2,7 +2,7 @@ use kvproto::metapb::Region; -const PREFIX: &str = "/tidb/br-stream"; +pub(super) const PREFIX: &str = "/tidb/br-stream"; const PATH_INFO: &str = "/info"; const PATH_NEXT_BACKUP_TS: &str = "/checkpoint"; const PATH_STORAGE_CHECKPOINT: &str = "/storage-checkpoint"; @@ -28,17 +28,26 @@ const TASKS_PREFIX: &str = "/tidb/br-stream/info/"; /// For the storage checkpoint ts of tasks: /// /storage-checkpoint// -> /// ``` -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct MetaKey(pub Vec); /// A simple key value pair of metadata. -#[derive(Clone, Debug)] +#[derive(Clone, Eq, PartialEq)] pub struct KeyValue(pub MetaKey, pub Vec); +impl std::fmt::Debug for KeyValue { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("KV") + .field(&self.0) + .field(&format_args!("{}", self.1.escape_ascii())) + .finish() + } +} + impl std::fmt::Debug for MetaKey { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_tuple("MetaKey") - .field(&self.0.escape_ascii()) + f.debug_tuple("K") + .field(&format_args!("{}", self.0.escape_ascii())) .finish() } } @@ -177,16 +186,7 @@ impl MetaKey { /// return the key that keeps the range [self, self.next_prefix()) contains /// all keys with the prefix `self`. 
pub fn next_prefix(&self) -> Self { - let mut next_prefix = self.clone(); - for i in (0..next_prefix.0.len()).rev() { - if next_prefix.0[i] == u8::MAX { - next_prefix.0.pop(); - } else { - next_prefix.0[i] += 1; - break; - } - } - next_prefix + Self(tikv_util::codec::next_prefix_of(self.0.clone())) } } diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index 20887a24b02..a96e2f9bcb6 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -8,4 +8,5 @@ pub mod store; pub mod test; pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; +#[cfg(feature = "metastore-etcd")] pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/mod.rs b/components/backup-stream/src/metadata/store/mod.rs index e5d1f03e715..7cecda9720e 100644 --- a/components/backup-stream/src/metadata/store/mod.rs +++ b/components/backup-stream/src/metadata/store/mod.rs @@ -1,6 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -pub mod lazy_etcd; +cfg_if::cfg_if! { + if #[cfg(feature = "metastore-etcd")] { + pub mod etcd; + pub mod lazy_etcd; + pub use etcd::EtcdStore; + } +} // Note: these mods also used for integration tests, // so we cannot compile them only when `#[cfg(test)]`. 
@@ -9,11 +15,11 @@ pub mod lazy_etcd; pub mod slash_etc; pub use slash_etc::SlashEtcStore; -pub mod etcd; +pub mod pd; + use std::{cmp::Ordering, future::Future, pin::Pin, time::Duration}; use async_trait::async_trait; -pub use etcd::EtcdStore; use tokio_stream::Stream; // ==== Generic interface definition ==== @@ -22,6 +28,7 @@ use crate::errors::Result; pub type BoxStream = Pin + Send>>; pub type BoxFuture = Pin + Send>>; +pub use pd::PdStore; #[derive(Debug, Default)] pub struct Transaction { @@ -108,10 +115,19 @@ pub struct WithRevision { pub inner: T, } +impl WithRevision { + pub fn map(self, f: impl FnOnce(T) -> R) -> WithRevision { + WithRevision { + revision: self.revision, + inner: f(self.inner), + } + } +} + /// The key set for getting. /// I guess there should be a `&[u8]` in meta key, /// but the etcd client requires Into> :( -#[derive(Debug)] +#[derive(Debug, Clone)] pub enum Keys { Prefix(MetaKey), Range(MetaKey, MetaKey), @@ -160,7 +176,7 @@ pub trait Snapshot: Send + Sync + 'static { } } -#[derive(Debug)] +#[derive(Debug, Eq, PartialEq, Clone, Copy)] pub enum KvEventType { Put, Delete, @@ -207,4 +223,13 @@ pub trait MetaStore: Clone + Send + Sync { async fn delete(&self, keys: Keys) -> Result<()> { self.txn(Transaction::default().delete(keys)).await } + /// Get the latest version of some keys. + async fn get_latest(&self, keys: Keys) -> Result>> { + let s = self.snapshot().await?; + let keys = s.get(keys).await?; + Ok(WithRevision { + revision: s.revision(), + inner: keys, + }) + } } diff --git a/components/backup-stream/src/metadata/store/pd.rs b/components/backup-stream/src/metadata/store/pd.rs new file mode 100644 index 00000000000..5b2e2b466e5 --- /dev/null +++ b/components/backup-stream/src/metadata/store/pd.rs @@ -0,0 +1,324 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{collections::VecDeque, fmt::Display, pin::Pin, task::ready}; + +use async_trait::async_trait; +use futures::{stream, Stream}; +use kvproto::meta_storagepb::{self as mpb, WatchResponse}; +use pd_client::meta_storage::{Get, MetaStorageClient, Put, Watch}; +use pin_project::pin_project; +use tikv_util::{box_err, info}; + +use super::{ + GetResponse, Keys, KvChangeSubscription, KvEvent, KvEventType, MetaStore, Snapshot, + WithRevision, +}; +use crate::{ + debug, + errors::{Error, Result}, + metadata::keys::{KeyValue, MetaKey, PREFIX}, +}; + +fn convert_kv(mut kv: mpb::KeyValue) -> KeyValue { + let k = kv.take_key(); + let v = kv.take_value(); + KeyValue(MetaKey(k), v) +} + +#[derive(Clone)] +pub struct PdStore { + client: M, +} + +impl PdStore { + pub fn new(s: M) -> Self { + Self { client: s } + } +} + +fn unimplemented(name: impl Display) -> Error { + Error::Io(std::io::Error::new( + std::io::ErrorKind::Unsupported, + format!("the behavior {} hasn't been implemented yet.", name), + )) +} + +#[pin_project] +struct PdWatchStream { + #[pin] + inner: S, + buf: VecDeque, +} + +impl PdWatchStream { + /// Create a new Watch Stream from PD, with a function to cancel the stream. 
+ fn new(inner: S) -> Self { + Self { + inner, + buf: Default::default(), + } + } +} + +impl>> Stream for PdWatchStream { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + loop { + let this = self.as_mut().project(); + let buf = this.buf; + if let Some(x) = buf.pop_front() { + return Some(Ok(x)).into(); + } + let resp = ready!(this.inner.poll_next(cx)); + match resp { + None => return None.into(), + Some(Err(err)) => return Some(Err(Error::Pd(err))).into(), + Some(Ok(mut x)) => { + if x.get_header().has_error() { + return Some(Err(Error::Other(box_err!( + "watch stream returns error: {:?}", + x.get_header().get_error() + )))) + .into(); + } + assert!(buf.is_empty()); + for mut e in x.take_events().into_iter() { + let ty = match e.get_type() { + kvproto::meta_storagepb::EventEventType::Put => KvEventType::Put, + kvproto::meta_storagepb::EventEventType::Delete => KvEventType::Delete, + }; + let kv = KvEvent { + kind: ty, + pair: convert_kv(e.take_kv()), + }; + buf.push_back(kv); + } + } + } + } + } +} + +#[async_trait] +impl Snapshot for RevOnly { + async fn get_extra(&self, _keys: Keys, _extra: super::GetExtra) -> Result { + Err(unimplemented("PdStore::snapshot::get")) + } + + fn revision(&self) -> i64 { + self.0 + } +} + +pub struct RevOnly(i64); + +#[async_trait] +impl< + St: Stream> + Send + 'static, + PD: MetaStorageClient + Clone, +> MetaStore for PdStore +{ + type Snap = RevOnly; + + async fn snapshot(&self) -> Result { + // hacking here: when we are doing point querying, the server won't return + // revision. So we are going to query a non-exist prefix here. + let rev = self + .client + .get(Get::of(PREFIX.as_bytes().to_vec()).prefixed().limit(0)) + .await? 
+ .get_header() + .get_revision(); + info!("pd meta client getting snapshot."; "rev" => %rev); + Ok(RevOnly(rev)) + } + + async fn watch( + &self, + keys: super::Keys, + start_rev: i64, + ) -> Result { + info!("pd meta client creating watch stream."; "keys" => ?keys, "rev" => %start_rev); + match keys { + Keys::Prefix(k) => { + use futures::stream::StreamExt; + let stream = self + .client + .watch(Watch::of(k).prefixed().from_rev(start_rev)); + let (stream, cancel) = stream::abortable(PdWatchStream::new(stream)); + Ok(KvChangeSubscription { + stream: stream.boxed(), + cancel: Box::pin(async move { cancel.abort() }), + }) + } + _ => Err(unimplemented("watch distinct keys or range of keys")), + } + } + + async fn txn(&self, _txn: super::Transaction) -> Result<()> { + Err(unimplemented("PdStore::txn")) + } + + async fn txn_cond(&self, _txn: super::CondTransaction) -> Result<()> { + Err(unimplemented("PdStore::txn_cond")) + } + + async fn set(&self, mut kv: KeyValue) -> Result<()> { + debug!("pd meta client setting."; "pair" => ?kv); + self.client + .put(Put::of(kv.take_key(), kv.take_value())) + .await?; + Ok(()) + } + + async fn get_latest(&self, keys: Keys) -> Result>> { + let spec = match keys.clone() { + Keys::Prefix(p) => Get::of(p).prefixed(), + Keys::Key(k) => Get::of(k), + Keys::Range(s, e) => Get::of(s).range_to(e), + }; + // Note: we skipped check `more` here, because we haven't make pager. 
+ let mut resp = self.client.get(spec).await?; + let inner = resp + .take_kvs() + .into_iter() + .map(convert_kv) + .collect::>(); + let revision = resp.get_header().get_revision(); + debug!("pd meta client getting."; "range" => ?keys, "rev" => %revision, "result" => ?inner); + Ok(WithRevision { inner, revision }) + } +} + +#[cfg(test)] +mod tests { + use std::{sync::Arc, time::Duration}; + + use futures::{Future, StreamExt}; + use pd_client::{ + meta_storage::{Checked, Source, Sourced}, + RpcClient, + }; + use test_pd::{mocker::MetaStorage, util::*, Server as PdServer}; + use tikv_util::config::ReadableDuration; + + use super::PdStore; + use crate::metadata::{ + keys::{KeyValue, MetaKey}, + store::{Keys, MetaStore}, + }; + + fn new_test_server_and_client( + factory: impl FnOnce(RpcClient) -> C, + ) -> (PdServer, PdStore) { + let server = PdServer::with_case(1, Arc::::default()); + let eps = server.bind_addrs(); + let client = + new_client_with_update_interval(eps, None, ReadableDuration(Duration::from_secs(99))); + (server, PdStore::new(factory(client))) + } + + fn w(f: impl Future) -> T { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(f) + } + + #[test] + fn test_query() { + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + insert("a", "the signpost of flowers"); + insert("b", "the milky hills"); + insert("c", "the rusty sky"); + + let k = w(c.get_latest(Keys::Key(MetaKey::task_of("a")))).unwrap(); + assert_eq!( + k.inner.as_slice(), + [kv("a", "the signpost of flowers")].as_slice() + ); + let k = w(c.get_latest(Keys::Key(MetaKey::task_of("d")))).unwrap(); + assert_eq!(k.inner.as_slice(), [].as_slice()); + + let k = w(c.get_latest(Keys::Prefix(MetaKey::tasks()))).unwrap(); + assert_eq!( + k.inner.as_slice(), + [ + kv("a", "the signpost of 
flowers"), + kv("b", "the milky hills"), + kv("c", "the rusty sky"), + ] + .as_slice() + ) + } + + #[test] + fn test_watch() { + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + + insert("a", "the guest in vermilion"); + let res = w(c.get_latest(Keys::Prefix(MetaKey::tasks()))).unwrap(); + assert_eq!(res.inner.as_slice(), &[kv("a", "the guest in vermilion")]); + let mut ws = w(c.watch(Keys::Prefix(MetaKey::tasks()), res.revision + 1)).unwrap(); + let mut items = vec![]; + insert("a", "looking up at the ocean"); + items.push(w(ws.stream.next()).unwrap().unwrap()); + insert("b", "a folktale in the polar day"); + items.push(w(ws.stream.next()).unwrap().unwrap()); + w(ws.cancel); + assert!(w(ws.stream.next()).is_none()); + + assert_eq!(items[0].pair, kv("a", "looking up at the ocean")); + assert_eq!(items[1].pair, kv("b", "a folktale in the polar day")); + } + + #[test] + fn test_check_error() { + // Without AutoHeader, it will fail due to the source is empty. + let (_s, c) = new_test_server_and_client(|c| Checked::new(Arc::new(c))); + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))); + + insert("c", "the rainbow-like summer").unwrap_err(); + w(c.get_latest(Keys::Key(MetaKey(vec![42u8])))).unwrap_err(); + assert!(w(c.watch(Keys::Key(MetaKey(vec![42u8])), 42)).is_err()); + } + + #[test] + fn test_retry() { + use tikv_util::defer; + + defer! 
{{ + fail::remove("meta_storage_get"); + }}; + let (_s, c) = new_test_server_and_client(|c| Sourced::new(Arc::new(c), Source::LogBackup)); + + let kv = |k, v: &str| KeyValue(MetaKey::task_of(k), v.as_bytes().to_vec()); + let insert = |k, v| w(c.set(kv(k, v))).unwrap(); + insert("rejectme", "this key would be rejected by the failpoint."); + + fail::cfg("meta_storage_get", "4*return").unwrap(); + let res = w(c.get_latest(Keys::Key(MetaKey::task_of("rejectme")))) + .expect("should success when temporary failing"); + assert_eq!(res.inner.len(), 1); + assert_eq!( + res.inner[0], + kv("rejectme", "this key would be rejected by the failpoint.") + ); + + // FIXME: this would take about 10s to run and influences unit tests run... + fail::cfg("meta_storage_get", "return").unwrap(); + w(c.get_latest(Keys::Key(MetaKey::task_of("rejectme")))) + .expect_err("should fail when ever failing"); + } +} diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index a57722089bf..bb2b7fe1577 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -54,21 +54,7 @@ async fn test_basic() -> Result<()> { cli.insert_task_with_range(&task, ranges).await?; let remote_ranges = cli.ranges_of_task(name).await?.inner; assert_range_matches(remote_ranges, ranges); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"7".to_vec(), b"9".to_vec())) - .await? - .inner; - assert_range_matches(overlap_ranges, &[(b"6", b"8"), (b"8", b"9")]); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"1".to_vec(), b"5".to_vec())) - .await? - .inner; - assert_range_matches(overlap_ranges, &[(b"1", b"2"), (b"4", b"5")]); - let overlap_ranges = cli - .range_overlap_of_task(name, (b"1".to_vec(), b"4".to_vec())) - .await? 
- .inner; - assert_range_matches(overlap_ranges, &[(b"1", b"2")]); + Ok(()) } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 917176b454e..36f7aaa983b 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -14,12 +14,16 @@ use futures::{ channel::mpsc, compat::{Compat, Future01CompatExt}, executor::block_on, - future::{self, BoxFuture, FutureExt, TryFutureExt}, + future::{self, BoxFuture, FutureExt, TryFlattenStream, TryFutureExt}, sink::SinkExt, - stream::StreamExt, + stream::{ErrInto, StreamExt}, + TryStreamExt, }; use grpcio::{EnvBuilder, Environment, WriteFlags}; use kvproto::{ + meta_storagepb::{ + self as mpb, GetRequest, GetResponse, PutRequest, WatchRequest, WatchResponse, + }, metapb, pdpb::{self, Member}, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, @@ -33,6 +37,7 @@ use txn_types::TimeStamp; use yatp::{task::future::TaskCell, ThreadPool}; use super::{ + meta_storage::{Get, MetaStorageClient, Put, Watch}, metrics::*, util::{call_option_inner, check_resp_header, sync_request, Client, PdConnector}, BucketStat, Config, Error, FeatureGate, PdClient, PdFuture, RegionInfo, RegionStat, Result, @@ -42,6 +47,7 @@ use super::{ pub const CQ_COUNT: usize = 1; pub const CLIENT_PREFIX: &str = "pd"; +#[derive(Clone)] pub struct RpcClient { cluster_id: u64, pd_client: Arc, @@ -1117,3 +1123,91 @@ impl PdClient for RpcClient { .execute() } } + +impl RpcClient { + fn fill_cluster_id_for(&self, header: &mut mpb::RequestHeader) { + header.cluster_id = self.cluster_id; + } +} + +impl MetaStorageClient for RpcClient { + fn get(&self, mut req: Get) -> PdFuture { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: GetRequest| { + let handler = { + let inner = client.inner.rl(); + let r = inner + .meta_storage + .get_async_opt(&req, call_option_inner(&inner)); + 
futures::future::ready(r).err_into().try_flatten() + }; + Box::pin(async move { + fail::fail_point!("meta_storage_get", req.key.ends_with(b"rejectme"), |_| { + Err(super::Error::Grpc(grpcio::Error::RemoteStopped)) + }); + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_get + .observe(timer.saturating_elapsed_secs()); + Ok(resp) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn put(&self, mut req: Put) -> PdFuture { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: PutRequest| { + let handler = { + let inner = client.inner.rl(); + let r = inner + .meta_storage + .put_async_opt(&req, call_option_inner(&inner)); + futures::future::ready(r).err_into().try_flatten() + }; + Box::pin(async move { + let resp = handler.await?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_put + .observe(timer.saturating_elapsed_secs()); + Ok(resp) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn watch(&self, mut req: Watch) -> Self::WatchStream { + let timer = Instant::now(); + self.fill_cluster_id_for(req.inner.mut_header()); + let executor = move |client: &Client, req: WatchRequest| { + let handler = { + let inner = client.inner.rl(); + inner.meta_storage.watch(&req) + }; + Box::pin(async move { + let resp = handler?; + PD_REQUEST_HISTOGRAM_VEC + .meta_storage_watch + .observe(timer.saturating_elapsed_secs()); + Ok(resp.err_into()) + }) as _ + }; + + self.pd_client + .request(req.into(), executor, LEADER_CHANGE_RETRY) + .execute() + .try_flatten_stream() + } + + type WatchStream = TryFlattenStream< + PdFuture, crate::Error>>, + >; +} diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 86e52eaf2a5..ba287621272 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -14,6 +14,7 @@ mod util; mod config; pub mod 
errors; +pub mod meta_storage; use std::{cmp::Ordering, ops::Deref, sync::Arc, time::Duration}; use futures::future::BoxFuture; diff --git a/components/pd_client/src/meta_storage.rs b/components/pd_client/src/meta_storage.rs new file mode 100644 index 00000000000..109986665bd --- /dev/null +++ b/components/pd_client/src/meta_storage.rs @@ -0,0 +1,302 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! `meta_storage` is the API set for storing generic KV pairs. +//! It is a trimmed version of the KV service of etcd, along with some metrics. + +use std::{pin::Pin, sync::Arc, task::ready}; + +use futures::{FutureExt, Stream}; +use kvproto::meta_storagepb as pb; +use tikv_util::{box_err, codec}; + +use crate::{Error, PdFuture, Result}; + +/// The etcd INF end key. +/// Unlike TiKV, they have chosen the slice `[0u8]` as the infinity. +const INF: [u8; 1] = [0u8]; + +/// A Get request to the meta storage. +#[derive(Clone, Debug)] +pub struct Get { + pub(crate) inner: pb::GetRequest, +} + +impl From for pb::GetRequest { + fn from(value: Get) -> Self { + value.inner + } +} + +impl Get { + /// Create a new get request, querying for exactly one key. + pub fn of(key: impl Into>) -> Self { + let mut inner = pb::GetRequest::default(); + inner.set_key(key.into()); + Self { inner } + } + + /// Enhance the query, make it be able to query the prefix of keys. + /// The prefix is the key passed to the method [`of`](Get::of). + pub fn prefixed(mut self) -> Self { + let mut next = codec::next_prefix_of(self.inner.key.clone()); + if next.is_empty() { + next = INF.to_vec(); + } + self.inner.set_range_end(next); + self + } + + /// Enhance the query, make it be able to query a range of keys. + /// The prefix is the key passed to the method [`of`](Get::of). + pub fn range_to(mut self, to: impl Into>) -> Self { + self.inner.set_range_end(to.into()); + self + } + + /// Specify the revision of the query. 
+ pub fn rev(mut self, rev: i64) -> Self { + self.inner.set_revision(rev); + self + } + + pub fn limit(mut self, limit: i64) -> Self { + self.inner.set_limit(limit); + self + } +} + +/// A Put request to the meta store. +#[derive(Clone, Debug)] +pub struct Put { + pub(crate) inner: pb::PutRequest, +} + +impl Put { + /// Create a put request of the key value. + pub fn of(key: impl Into>, value: impl Into>) -> Self { + let mut inner = pb::PutRequest::default(); + inner.set_key(key.into()); + inner.set_value(value.into()); + Self { inner } + } + + /// Enhance the put request, allow it to return the previous kv pair. + pub fn fetch_prev_kv(mut self) -> Self { + self.inner.prev_kv = true; + self + } +} + +impl From for pb::PutRequest { + fn from(value: Put) -> Self { + value.inner + } +} + +#[derive(Clone, Debug)] +pub struct Watch { + pub(crate) inner: pb::WatchRequest, +} + +impl Watch { + /// Create a watch request for a key. + pub fn of(key: impl Into>) -> Self { + let mut inner = pb::WatchRequest::default(); + inner.set_key(key.into()); + + Self { inner } + } + + /// Enhance the request to allow it watch keys with the same prefix. + pub fn prefixed(mut self) -> Self { + let mut next = codec::next_prefix_of(self.inner.key.clone()); + if next.is_empty() { + next = INF.to_vec(); + } + self.inner.set_range_end(next); + self + } + + /// Enhance the request to allow it watch keys until the range end. + pub fn range_to(mut self, to: impl Into>) -> Self { + self.inner.set_range_end(to.into()); + self + } + + /// Enhance the request to make it watch from a specified revision. + pub fn from_rev(mut self, rev: i64) -> Self { + self.inner.set_start_revision(rev); + self + } +} + +impl From for pb::WatchRequest { + fn from(value: Watch) -> Self { + value.inner + } +} + +/// The descriptor of source (caller) of the requests. 
+#[derive(Clone, Copy)] +pub enum Source { + LogBackup = 0, +} + +impl std::fmt::Display for Source { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Source::LogBackup => f.write_str("log_backup"), + } + } +} + +/// A wrapper over client which would fill the source field in the header for +/// all requests. +#[derive(Clone)] +pub struct Sourced { + inner: S, + source: Source, +} + +impl Sourced { + pub fn new(inner: S, source: Source) -> Self { + Self { inner, source } + } + + fn prepare_header(&self, h: &mut pb::RequestHeader) { + h.set_source(self.source.to_string()); + } +} + +impl MetaStorageClient for Sourced { + type WatchStream = S::WatchStream; + + fn get(&self, mut req: Get) -> PdFuture { + self.prepare_header(req.inner.mut_header()); + self.inner.get(req) + } + + fn put(&self, mut req: Put) -> PdFuture { + self.prepare_header(req.inner.mut_header()); + self.inner.put(req) + } + + fn watch(&self, mut req: Watch) -> Self::WatchStream { + self.prepare_header(req.inner.mut_header()); + self.inner.watch(req) + } +} + +/// A wrapper that makes every response and stream event get checked. +/// When there is an error in the header, this client would return a [`Err`] +/// variant directly. +#[derive(Clone)] +pub struct Checked(S); + +impl Checked { + pub fn new(client: S) -> Self { + Self(client) + } +} + +/// A wrapper that checks every event in the stream and returns an error +/// variant when there is error in the header. 
+pub struct CheckedStream(S); + +fn check_resp_header(header: &pb::ResponseHeader) -> Result<()> { + if header.has_error() { + match header.get_error().get_type() { + pb::ErrorType::Ok => Ok(()), + pb::ErrorType::Unknown => Err(Error::Other(box_err!( + "{}", + header.get_error().get_message() + ))), + pb::ErrorType::DataCompacted => Err(Error::DataCompacted( + header.get_error().get_message().to_owned(), + )), + }?; + } + Ok(()) +} + +impl>> Stream for CheckedStream { + type Item = Result; + + fn poll_next( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + // SAFETY: trivial projection. + let inner = unsafe { Pin::new_unchecked(&mut self.get_unchecked_mut().0) }; + let item = ready!(inner.poll_next(cx)); + item.map(|r| { + r.and_then(|resp| { + check_resp_header(resp.get_header())?; + Ok(resp) + }) + }) + .into() + } +} + +impl MetaStorageClient for Checked { + type WatchStream = CheckedStream; + + fn get(&self, req: Get) -> PdFuture { + self.0 + .get(req) + .map(|resp| { + resp.and_then(|r| { + check_resp_header(r.get_header())?; + Ok(r) + }) + }) + .boxed() + } + + fn put(&self, req: Put) -> PdFuture { + self.0 + .put(req) + .map(|resp| { + resp.and_then(|r| { + check_resp_header(r.get_header())?; + Ok(r) + }) + }) + .boxed() + } + + fn watch(&self, req: Watch) -> Self::WatchStream { + CheckedStream(self.0.watch(req)) + } +} + +impl MetaStorageClient for Arc { + type WatchStream = S::WatchStream; + + fn get(&self, req: Get) -> PdFuture { + Arc::as_ref(self).get(req) + } + + fn put(&self, req: Put) -> PdFuture { + Arc::as_ref(self).put(req) + } + + fn watch(&self, req: Watch) -> Self::WatchStream { + Arc::as_ref(self).watch(req) + } +} + +/// A client which is able to play with the `meta_storage` service. +pub trait MetaStorageClient: Send + Sync + 'static { + // Note: Perhaps we'd better make it generic over response here, however that + // would make `CheckedStream` impossible(How can we check ALL types? 
Or we may + // make traits like `MetaStorageResponse` and constraint over the T), thankfully + // there is only one streaming RPC in this service. + /// The stream that yielded by the watch RPC. + type WatchStream: Stream>; + + fn get(&self, req: Get) -> PdFuture; + fn put(&self, req: Put) -> PdFuture; + fn watch(&self, req: Watch) -> Self::WatchStream; +} diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index a4ef9c5ce4e..e1f1100444a 100644 --- a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -32,6 +32,10 @@ make_static_metric! { is_recovering_marked, store_heartbeat, tso, + + meta_storage_put, + meta_storage_get, + meta_storage_watch, } pub struct PDRequestEventHistogramVec: Histogram { diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index fd58cd921d8..f3a8451f321 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -22,6 +22,7 @@ use grpcio::{ Environment, Error::RpcFailure, MetadataBuilder, Result as GrpcResult, RpcStatusCode, }; use kvproto::{ + meta_storagepb::MetaStorageClient as MetaStorageStub, metapb::BucketStats, pdpb::{ ErrorType, GetMembersRequest, GetMembersResponse, Member, PdClient as PdClientStub, @@ -104,6 +105,7 @@ pub struct Inner { pub pending_heartbeat: Arc, pub pending_buckets: Arc, pub tso: TimestampOracle, + pub meta_storage: MetaStorageStub, last_try_reconnect: Instant, } @@ -181,6 +183,8 @@ impl Client { let (buckets_tx, buckets_resp) = client_stub .report_buckets_opt(target.call_option()) .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "report_buckets", e)); + let meta_storage = + kvproto::meta_storagepb::MetaStorageClient::new(client_stub.client.channel().clone()); Client { timer: GLOBAL_TIMER_HANDLE.clone(), inner: RwLock::new(Inner { @@ -198,6 +202,7 @@ impl Client { pending_buckets: Arc::default(), last_try_reconnect: Instant::now(), tso, + meta_storage, }), feature_gate: 
FeatureGate::default(), enable_forwarding, @@ -238,6 +243,7 @@ impl Client { inner.buckets_sender = Either::Left(Some(buckets_tx)); inner.buckets_resp = Some(buckets_resp); + inner.meta_storage = MetaStorageStub::new(client_stub.client.channel().clone()); inner.client_stub = client_stub; inner.members = members; inner.tso = tso; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index e37c6f9fe3b..f721097a514 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -29,9 +29,7 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use backup_stream::{ - config::BackupStreamConfigManager, - metadata::{ConnectionConfig, LazyEtcdClient}, - observer::BackupStreamObserver, + config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, }; use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; @@ -62,7 +60,10 @@ use kvproto::{ kvrpcpb::ApiVersion, logbackuppb::create_log_backup, recoverdatapb::create_recover_data, resource_usage_agent::create_resource_metering_pub_sub, }; -use pd_client::{PdClient, RpcClient}; +use pd_client::{ + meta_storage::{Checked, Sourced}, + PdClient, RpcClient, +}; use raft_log_engine::RaftLogEngine; use raftstore::{ coprocessor::{ @@ -1040,17 +1041,12 @@ where Box::new(BackupStreamConfigManager(backup_stream_worker.scheduler())), ); - let etcd_cli = LazyEtcdClient::new( - self.config.pd.endpoints.as_slice(), - ConnectionConfig { - keep_alive_interval: self.config.server.grpc_keepalive_time.0, - keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: Arc::clone(&self.security_mgr), - }, - ); let backup_stream_endpoint = backup_stream::Endpoint::new( node.id(), - etcd_cli, + PdStore::new(Checked::new(Sourced::new( + Arc::clone(&self.pd_client), + pd_client::meta_storage::Source::LogBackup, + ))), self.config.backup_stream.clone(), backup_stream_scheduler.clone(), backup_stream_ob, diff --git 
a/components/test_pd/src/mocker/meta_storage.rs b/components/test_pd/src/mocker/meta_storage.rs new file mode 100644 index 00000000000..311c3884722 --- /dev/null +++ b/components/test_pd/src/mocker/meta_storage.rs @@ -0,0 +1,113 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{Arc, Mutex}; + +use futures::{executor::block_on, SinkExt, StreamExt}; +use grpcio::{RpcStatus, RpcStatusCode}; +use kvproto::meta_storagepb as mpb; + +use super::etcd::{Etcd, KeyValue, Keys, KvEventType, MetaKey}; +use crate::PdMocker; + +#[derive(Default)] +pub struct MetaStorage { + store: Arc>, +} + +fn convert_kv(from: KeyValue) -> mpb::KeyValue { + let mut kv = mpb::KeyValue::default(); + kv.set_key(from.0.0); + kv.set_value(from.1); + kv +} + +fn check_header(h: &mpb::RequestHeader) -> super::Result<()> { + if h.get_source().is_empty() { + return Err(format!("Please provide header.source; req = {:?}", h)); + } + Ok(()) +} + +fn header_of_revision(r: i64) -> mpb::ResponseHeader { + let mut h = mpb::ResponseHeader::default(); + h.set_revision(r); + h +} + +impl PdMocker for MetaStorage { + fn meta_store_get(&self, req: mpb::GetRequest) -> Option> { + if let Err(err) = check_header(req.get_header()) { + return Some(Err(err)); + } + + let store = self.store.lock().unwrap(); + let key = if req.get_range_end().is_empty() { + Keys::Key(MetaKey(req.get_key().to_vec())) + } else { + Keys::Range( + MetaKey(req.get_key().to_vec()), + MetaKey(req.get_range_end().to_vec()), + ) + }; + let (items, rev) = store.get_key(key); + let mut resp = mpb::GetResponse::new(); + resp.set_kvs(items.into_iter().map(convert_kv).collect()); + resp.set_header(header_of_revision(rev)); + Some(Ok(resp)) + } + + fn meta_store_put(&self, mut req: mpb::PutRequest) -> Option> { + if let Err(err) = check_header(req.get_header()) { + return Some(Err(err)); + } + + let mut store = self.store.lock().unwrap(); + block_on(store.set(KeyValue(MetaKey(req.take_key()), 
req.take_value()))).unwrap(); + Some(Ok(Default::default())) + } + + fn meta_store_watch( + &self, + req: mpb::WatchRequest, + mut sink: grpcio::ServerStreamingSink, + ctx: &grpcio::RpcContext<'_>, + ) -> bool { + if let Err(err) = check_header(req.get_header()) { + ctx.spawn(async move { + sink.fail(RpcStatus::with_message( + RpcStatusCode::INVALID_ARGUMENT, + err, + )) + .await + .unwrap() + }); + return true; + } + + let mut store = self.store.lock().unwrap(); + let key = if req.get_range_end().is_empty() { + Keys::Key(MetaKey(req.get_key().to_vec())) + } else { + Keys::Range( + MetaKey(req.get_key().to_vec()), + MetaKey(req.get_range_end().to_vec()), + ) + }; + let mut watcher = + block_on(store.watch(key, req.get_start_revision())).expect("should be infallible"); + ctx.spawn(async move { + while let Some(x) = watcher.next().await { + let mut event = mpb::Event::new(); + event.set_kv(convert_kv(x.pair)); + event.set_type(match x.kind { + KvEventType::Put => mpb::EventEventType::Put, + KvEventType::Delete => mpb::EventEventType::Delete, + }); + let mut resp = mpb::WatchResponse::default(); + resp.set_events(vec![event].into()); + sink.send((resp, Default::default())).await.unwrap(); + } + }); + true + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index d8282ca3df0..f4b6dafb6b6 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -3,12 +3,13 @@ use std::result; use futures::executor::block_on; -use kvproto::pdpb::*; +use kvproto::{meta_storagepb as mpb, pdpb::*}; mod bootstrap; pub mod etcd; mod incompatible; mod leader_change; +mod meta_storage; mod retry; mod service; mod split; @@ -18,6 +19,7 @@ pub use self::{ bootstrap::AlreadyBootstrapped, incompatible::Incompatible, leader_change::LeaderChange, + meta_storage::MetaStorage, retry::{NotRetry, Retry}, service::Service, split::Split, @@ -28,6 +30,23 @@ pub const DEFAULT_CLUSTER_ID: u64 = 42; pub type Result = 
result::Result; pub trait PdMocker { + fn meta_store_get(&self, _req: mpb::GetRequest) -> Option> { + None + } + + fn meta_store_put(&self, _req: mpb::PutRequest) -> Option> { + None + } + + fn meta_store_watch( + &self, + _req: mpb::WatchRequest, + _sink: grpcio::ServerStreamingSink, + _ctx: &grpcio::RpcContext<'_>, + ) -> bool { + false + } + fn load_global_config( &self, _req: &LoadGlobalConfigRequest, diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index b1909485ac8..1662e27f00f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -16,7 +16,10 @@ use grpcio::{ ClientStreamingSink, DuplexSink, EnvBuilder, RequestStream, RpcContext, RpcStatus, RpcStatusCode, Server as GrpcServer, ServerBuilder, ServerStreamingSink, UnarySink, WriteFlags, }; -use kvproto::pdpb::*; +use kvproto::{ + meta_storagepb_grpc::{create_meta_storage, MetaStorage}, + pdpb::*, +}; use pd_client::Error as PdError; use security::*; @@ -70,14 +73,17 @@ impl Server { } pub fn start(&mut self, mgr: &SecurityManager, eps: Vec<(String, u16)>) { - let service = create_pd(self.mocker.clone()); + let pd = create_pd(self.mocker.clone()); + let meta_store = create_meta_storage(self.mocker.clone()); let env = Arc::new( EnvBuilder::new() .cq_count(1) .name_prefix(thd_name!("mock-server")) .build(), ); - let mut sb = ServerBuilder::new(env).register_service(service); + let mut sb = ServerBuilder::new(env) + .register_service(pd) + .register_service(meta_store); for (host, port) in eps { sb = mgr.bind(sb, &host, port); } @@ -187,6 +193,40 @@ impl Clone for PdMock { } } +impl MetaStorage for PdMock { + fn watch( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: kvproto::meta_storagepb::WatchRequest, + sink: grpcio::ServerStreamingSink, + ) { + match &self.case { + Some(x) => { + x.meta_store_watch(req, sink, &ctx); + } + None => grpcio::unimplemented_call!(ctx, sink), + } + } + + fn get( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: 
kvproto::meta_storagepb::GetRequest, + sink: grpcio::UnarySink, + ) { + hijack_unary(self, ctx, sink, |m| m.meta_store_get(req.clone())) + } + + fn put( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: kvproto::meta_storagepb::PutRequest, + sink: grpcio::UnarySink, + ) { + hijack_unary(self, ctx, sink, |m| m.meta_store_put(req.clone())) + } +} + impl Pd for PdMock { fn load_global_config( &mut self, diff --git a/components/tikv_util/src/codec/mod.rs b/components/tikv_util/src/codec/mod.rs index fa0ec4d7d16..0e1e7aa6fdb 100644 --- a/components/tikv_util/src/codec/mod.rs +++ b/components/tikv_util/src/codec/mod.rs @@ -22,6 +22,34 @@ pub fn read_slice<'a>(data: &mut BytesSlice<'a>, size: usize) -> Result) -> Vec { + let mut next_prefix = key; + for i in (0..next_prefix.len()).rev() { + if next_prefix[i] == u8::MAX { + next_prefix.pop(); + } else { + next_prefix[i] += 1; + break; + } + } + // By definition, the empty key means infinity. + // When we have meet keys like [0xff], return empty slice here is expected. + next_prefix +} + #[derive(Debug, Error)] pub enum Error { #[error("{0}")] From 6f0f814b3981185105014405a2102bd849b0af06 Mon Sep 17 00:00:00 2001 From: Zak Zhao <57036248+joccau@users.noreply.github.com> Date: Wed, 29 Mar 2023 15:38:54 +0800 Subject: [PATCH 609/676] pitr: support modifying the config tikv.log-backup.max-flush-interval online. 
(#14425) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#14433 Signed-off-by: joccau Signed-off-by: Zak Zhao <57036248+joccau@users.noreply.github.com> Co-authored-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> --- components/backup-stream/src/config.rs | 38 +++++++--- components/backup-stream/src/endpoint.rs | 16 +++- components/backup-stream/src/router.rs | 94 +++++++++++++++++++----- components/server/src/server.rs | 9 ++- src/config/mod.rs | 21 +++--- tests/integrations/config/mod.rs | 2 +- 6 files changed, 131 insertions(+), 49 deletions(-) diff --git a/components/backup-stream/src/config.rs b/components/backup-stream/src/config.rs index dfee838c333..03afa47dd97 100644 --- a/components/backup-stream/src/config.rs +++ b/components/backup-stream/src/config.rs @@ -1,26 +1,40 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use online_config::{ConfigChange, ConfigManager}; -use tikv_util::worker::Scheduler; +use std::sync::{Arc, RwLock}; + +use online_config::{ConfigChange, ConfigManager, OnlineConfig}; +use tikv::config::BackupStreamConfig; +use tikv_util::{info, worker::Scheduler}; use crate::endpoint::Task; -pub struct BackupStreamConfigManager(pub Scheduler); +#[derive(Clone)] +pub struct BackupStreamConfigManager { + pub scheduler: Scheduler, + pub config: Arc>, +} + +impl BackupStreamConfigManager { + pub fn new(scheduler: Scheduler, cfg: BackupStreamConfig) -> Self { + let config = Arc::new(RwLock::new(cfg)); + Self { scheduler, config } + } +} impl ConfigManager for BackupStreamConfigManager { fn dispatch( &mut self, change: ConfigChange, ) -> std::result::Result<(), Box> { - self.0.schedule(Task::ChangeConfig(change))?; - Ok(()) - } -} + info!( + "log backup config changed"; + "change" => ?change, + ); + let mut cfg = self.config.as_ref().write().unwrap(); + cfg.update(change)?; + cfg.validate()?; -impl std::ops::Deref for BackupStreamConfigManager { - type Target = 
Scheduler; - - fn deref(&self) -> &Self::Target { - &self.0 + self.scheduler.schedule(Task::ChangeConfig(cfg.clone()))?; + Ok(()) } } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 68f040217ea..c8302f6dd9e 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -13,7 +13,6 @@ use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, }; -use online_config::ConfigChange; use pd_client::PdClient; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, @@ -877,6 +876,15 @@ where } } + fn on_update_change_config(&mut self, cfg: BackupStreamConfig) { + info!( + "update log backup config"; + "config" => ?cfg, + ); + self.range_router.udpate_config(&cfg); + self.config = cfg; + } + /// Modify observe over some region. /// This would register the region to the RaftStore. pub fn on_modify_observe(&self, op: ObserveOp) { @@ -898,8 +906,8 @@ where Task::ModifyObserve(op) => self.on_modify_observe(op), Task::ForceFlush(task) => self.on_force_flush(task), Task::FatalError(task, err) => self.on_fatal_error(task, err), - Task::ChangeConfig(_) => { - warn!("change config online isn't supported for now.") + Task::ChangeConfig(cfg) => { + self.on_update_change_config(cfg); } Task::Sync(cb, mut cond) => { if cond(&self.range_router) { @@ -1081,7 +1089,7 @@ impl fmt::Debug for RegionCheckpointOperation { pub enum Task { WatchTask(TaskOp), BatchEvent(Vec), - ChangeConfig(ConfigChange), + ChangeConfig(BackupStreamConfig), /// Change the observe status of some region. ModifyObserve(ObserveOp), /// Convert status of some task into `flushing` and do flush then. 
diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 5b862f732a2..4b1022e7b39 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -31,6 +31,7 @@ use protobuf::Message; use raftstore::coprocessor::CmdBatch; use slog_global::debug; use tidb_query_datatype::codec::table::decode_table_id; +use tikv::config::BackupStreamConfig; use tikv_util::{ box_err, codec::stream_event::EventEncoder, @@ -341,9 +342,9 @@ pub struct RouterInner { /// too many temporary files. scheduler: Scheduler, /// The size limit of temporary file per task. - temp_file_size_limit: u64, + temp_file_size_limit: AtomicU64, /// The max duration the local data can be pending. - max_flush_interval: Duration, + max_flush_interval: SyncRwLock, } impl std::fmt::Debug for RouterInner { @@ -368,11 +369,17 @@ impl RouterInner { tasks: Mutex::new(HashMap::default()), prefix, scheduler, - temp_file_size_limit, - max_flush_interval, + temp_file_size_limit: AtomicU64::new(temp_file_size_limit), + max_flush_interval: SyncRwLock::new(max_flush_interval), } } + pub fn udpate_config(&self, config: &BackupStreamConfig) { + *self.max_flush_interval.write().unwrap() = config.max_flush_interval.0; + self.temp_file_size_limit + .store(config.file_size_limit.0, Ordering::SeqCst); + } + /// Find the task for a region. If `end_key` is empty, search from start_key /// to +inf. It simply search for a random possible overlapping range and /// get its task. 
@@ -430,7 +437,6 @@ impl RouterInner { let stream_task = StreamTaskInfo::new( prefix_path, task, - self.max_flush_interval, ranges.clone(), merged_file_size_limit, compression_type, @@ -507,6 +513,7 @@ impl RouterInner { async fn on_event(&self, task: String, events: ApplyEvents) -> Result<()> { let task_info = self.get_task_info(&task).await?; task_info.on_events(events).await?; + let file_size_limit = self.temp_file_size_limit.load(Ordering::SeqCst); // When this event make the size of temporary files exceeds the size limit, make // a flush. Note that we only flush if the size is less than the limit before @@ -515,10 +522,10 @@ impl RouterInner { "backup stream statics size"; "task" => ?task, "next_size" => task_info.total_size(), - "size_limit" => self.temp_file_size_limit, + "size_limit" => file_size_limit, ); let cur_size = task_info.total_size(); - if cur_size > self.temp_file_size_limit && !task_info.is_flushing() { + if cur_size > file_size_limit && !task_info.is_flushing() { info!("try flushing task"; "task" => %task, "size" => %cur_size); if task_info.set_flushing_status_cas(false, true).is_ok() { if let Err(e) = self.scheduler.schedule(Task::Flush(task)) { @@ -592,6 +599,8 @@ impl RouterInner { /// tick aims to flush log/meta to extern storage periodically. pub async fn tick(&self) { + let max_flush_interval = self.max_flush_interval.rl().to_owned(); + for (name, task_info) in self.tasks.lock().await.iter() { if let Err(e) = self .scheduler @@ -602,7 +611,9 @@ impl RouterInner { // if stream task need flush this time, schedule Task::Flush, or update time // justly. 
- if task_info.should_flush() && task_info.set_flushing_status_cas(false, true).is_ok() { + if task_info.should_flush(&max_flush_interval) + && task_info.set_flushing_status_cas(false, true).is_ok() + { info!( "backup stream trigger flush task by tick"; "task" => ?task_info, @@ -763,8 +774,6 @@ pub struct StreamTaskInfo { flushing_meta_files: RwLock>, /// last_flush_ts represents last time this task flushed to storage. last_flush_time: AtomicPtr, - /// flush_interval represents the tick interval of flush, setting by users. - flush_interval: Duration, /// The min resolved TS of all regions involved. min_resolved_ts: TimeStamp, /// Total size of all temporary files in byte. @@ -825,7 +834,6 @@ impl StreamTaskInfo { pub async fn new( temp_dir: PathBuf, task: StreamTask, - flush_interval: Duration, ranges: Vec<(Vec, Vec)>, merged_file_size_limit: u64, compression_type: CompressionType, @@ -846,7 +854,6 @@ impl StreamTaskInfo { flushing_files: RwLock::default(), flushing_meta_files: RwLock::default(), last_flush_time: AtomicPtr::new(Box::into_raw(Box::new(Instant::now()))), - flush_interval, total_size: AtomicUsize::new(0), flushing: AtomicBool::new(false), flush_fail_count: AtomicUsize::new(0), @@ -946,12 +953,11 @@ impl StreamTaskInfo { unsafe { Box::from_raw(ptr) }; } - pub fn should_flush(&self) -> bool { + pub fn should_flush(&self, flush_interval: &Duration) -> bool { // When it doesn't flush since 0.8x of auto-flush interval, we get ready to // start flushing. So that we will get a buffer for the cost of actual // flushing. 
- self.get_last_flush_time().saturating_elapsed_secs() - >= self.flush_interval.as_secs_f64() * 0.8 + self.get_last_flush_time().saturating_elapsed_secs() >= flush_interval.as_secs_f64() * 0.8 } pub fn is_flushing(&self) -> bool { @@ -1511,15 +1517,17 @@ mod tests { use external_storage::{ExternalData, NoopStorage}; use futures::AsyncReadExt; use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; + use online_config::{ConfigManager, OnlineConfig}; use tikv_util::{ codec::number::NumberEncoder, + config::ReadableDuration, worker::{dummy_scheduler, ReceiverWrapper}, }; use tokio::fs::File; use txn_types::{Write, WriteType}; use super::*; - use crate::utils; + use crate::{config::BackupStreamConfigManager, utils}; #[derive(Debug)] struct KvEventsBuilder { @@ -1835,7 +1843,6 @@ mod tests { let task = StreamTaskInfo::new( tmp_dir.path().to_path_buf(), stream_task, - Duration::from_secs(300), vec![(vec![], vec![])], merged_file_size_limit, CompressionType::Zstd, @@ -2194,7 +2201,6 @@ mod tests { let task = StreamTaskInfo::new( tmp_dir.path().to_path_buf(), stream_task, - Duration::from_secs(300), vec![(vec![], vec![])], 0x100000, CompressionType::Zstd, @@ -2308,4 +2314,56 @@ mod tests { assert_eq!(result.is_ok(), true); Ok(()) } + + #[test] + fn test_update_config() { + let (sched, rx) = dummy_scheduler(); + let cfg = BackupStreamConfig::default(); + let router = Arc::new(RouterInner::new( + PathBuf::new(), + sched.clone(), + 1, + cfg.max_flush_interval.0, + )); + + let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); + + let _new_cfg = BackupStreamConfig { + max_flush_interval: ReadableDuration::minutes(2), + ..Default::default() + }; + + let changed = cfg.diff(&_new_cfg); + cfg_manager.dispatch(changed).unwrap(); + + let cmds = collect_recv(rx); + assert_eq!(cmds.len(), 1); + match &cmds[0] { + Task::ChangeConfig(cfg) => { + assert!(matches!(cfg, _new_cfg)); + router.udpate_config(cfg); + assert_eq!( + 
router.max_flush_interval.rl().to_owned(), + _new_cfg.max_flush_interval.0 + ); + } + _ => panic!("unexpected cmd!"), + } + } + + #[test] + fn test_udpate_invalid_config() { + let cfg = BackupStreamConfig::default(); + let (sched, _) = dummy_scheduler(); + let mut cfg_manager = BackupStreamConfigManager::new(sched, cfg.clone()); + + let new_cfg = BackupStreamConfig { + max_flush_interval: ReadableDuration::secs(0), + ..Default::default() + }; + + let changed = cfg.diff(&new_cfg); + let r = cfg_manager.dispatch(changed); + assert!(r.is_err()); + } } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index f721097a514..06df19da1d6 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1027,7 +1027,7 @@ where ); // Start backup stream - let backup_stream_scheduler = if self.config.backup_stream.enable { + let backup_stream_scheduler = if self.config.log_backup.enable { // Create backup stream. let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); let backup_stream_scheduler = backup_stream_worker.scheduler(); @@ -1038,7 +1038,10 @@ where // Register config manager. 
cfg_controller.register( tikv::config::Module::BackupStream, - Box::new(BackupStreamConfigManager(backup_stream_worker.scheduler())), + Box::new(BackupStreamConfigManager::new( + backup_stream_worker.scheduler(), + self.config.log_backup.clone(), + )), ); let backup_stream_endpoint = backup_stream::Endpoint::new( @@ -1047,7 +1050,7 @@ where Arc::clone(&self.pd_client), pd_client::meta_storage::Source::LogBackup, ))), - self.config.backup_stream.clone(), + self.config.log_backup.clone(), backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), diff --git a/src/config/mod.rs b/src/config/mod.rs index 3eb15ba8ace..f8bbd1be9f5 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2662,7 +2662,7 @@ impl Default for BackupConfig { pub struct BackupStreamConfig { #[online_config(skip)] pub min_ts_interval: ReadableDuration, - #[online_config(skip)] + pub max_flush_interval: ReadableDuration, #[online_config(skip)] pub num_threads: usize, @@ -2670,7 +2670,7 @@ pub struct BackupStreamConfig { pub enable: bool, #[online_config(skip)] pub temp_path: String, - #[online_config(skip)] + pub file_size_limit: ReadableSize, #[online_config(skip)] pub initial_scan_pending_memory_quota: ReadableSize, @@ -3136,8 +3136,7 @@ pub struct TikvConfig { #[online_config(submodule)] // The term "log backup" and "backup stream" are identity. // The "log backup" should be the only product name exposed to the user. 
- #[serde(rename = "log-backup")] - pub backup_stream: BackupStreamConfig, + pub log_backup: BackupStreamConfig, #[online_config(submodule)] pub pessimistic_txn: PessimisticTxnConfig, @@ -3202,7 +3201,7 @@ impl Default for TikvConfig { cdc: CdcConfig::default(), resolved_ts: ResolvedTsConfig::default(), resource_metering: ResourceMeteringConfig::default(), - backup_stream: BackupStreamConfig::default(), + log_backup: BackupStreamConfig::default(), causal_ts: CausalTsConfig::default(), resource_control: ResourceControlConfig::default(), } @@ -3333,8 +3332,8 @@ impl TikvConfig { ); } - if self.backup_stream.temp_path.is_empty() { - self.backup_stream.temp_path = + if self.log_backup.temp_path.is_empty() { + self.log_backup.temp_path = config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-temp")?; } @@ -3360,7 +3359,7 @@ impl TikvConfig { .validate(self.storage.engine == EngineType::RaftKv2)?; self.import.validate()?; self.backup.validate()?; - self.backup_stream.validate()?; + self.log_backup.validate()?; self.cdc.validate()?; self.pessimistic_txn.validate()?; self.gc.validate()?; @@ -4149,7 +4148,7 @@ impl From<&str> for Module { "security" => Module::Security, "import" => Module::Import, "backup" => Module::Backup, - "backup_stream" => Module::BackupStream, + "log_backup" => Module::BackupStream, "pessimistic_txn" => Module::PessimisticTxn, "gc" => Module::Gc, "cdc" => Module::Cdc, @@ -5645,7 +5644,7 @@ mod tests { cfg.raftdb.max_sub_compactions = default_cfg.raftdb.max_sub_compactions; cfg.raftdb.titan.max_background_gc = default_cfg.raftdb.titan.max_background_gc; cfg.backup.num_threads = default_cfg.backup.num_threads; - cfg.backup_stream.num_threads = default_cfg.backup_stream.num_threads; + cfg.log_backup.num_threads = default_cfg.log_backup.num_threads; // There is another set of config values that we can't directly compare: // When the default values are `None`, but are then resolved to `Some(_)` later @@ -5835,7 +5834,7 @@ mod tests { 
("security", Module::Security), ("import", Module::Import), ("backup", Module::Backup), - ("backup_stream", Module::BackupStream), + ("log_backup", Module::BackupStream), ("pessimistic_txn", Module::PessimisticTxn), ("gc", Module::Gc), ("cdc", Module::Cdc), diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 02b5c711e96..7d40cde87d5 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -774,7 +774,7 @@ fn test_serde_custom_tikv_config() { }, ..Default::default() }; - value.backup_stream = BackupStreamConfig { + value.log_backup = BackupStreamConfig { max_flush_interval: ReadableDuration::secs(11), num_threads: 7, enable: true, From d269912fa7ede76f66ca80830701d9fc260bb5c4 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Wed, 29 Mar 2023 17:38:54 +0800 Subject: [PATCH 610/676] txn: record the latch wait, flow control throttle, quota delay and scheduler process duration (#14476) ref tikv/tikv#12362 Signed-off-by: cfzjywxk Co-authored-by: Ti Chi Robot --- components/tracker/src/lib.rs | 6 ++ src/storage/txn/scheduler.rs | 95 ++++++++++++++++++++----- tests/integrations/server/kv_service.rs | 1 + 3 files changed, 86 insertions(+), 16 deletions(-) diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index 35ae0fc15f2..fafd8415039 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -43,6 +43,9 @@ impl Tracker { } pub fn write_write_detail(&self, detail: &mut pb::WriteDetail) { + detail.set_latch_wait_nanos(self.metrics.latch_wait_nanos); + detail.set_process_nanos(self.metrics.scheduler_process_nanos); + detail.set_throttle_nanos(self.metrics.scheduler_throttle_nanos); detail.set_pessimistic_lock_wait_nanos(self.metrics.pessimistic_lock_wait_nanos); detail.set_store_batch_wait_nanos(self.metrics.wf_batch_wait_nanos); detail.set_propose_send_wait_nanos( @@ -132,6 +135,9 @@ pub struct RequestMetrics { pub internal_key_skipped_count: u64, pub 
deleted_key_skipped_count: u64, pub pessimistic_lock_wait_nanos: u64, + pub latch_wait_nanos: u64, + pub scheduler_process_nanos: u64, + pub scheduler_throttle_nanos: u64, // temp instant used in raftstore metrics, first be the instant when creating the write // callback, then reset when it is ready to apply pub write_instant: Option, diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 17110a07e7b..85c41124b89 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -52,7 +52,7 @@ use resource_metering::{FutureExt, ResourceTagFactory}; use smallvec::{smallvec, SmallVec}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; -use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; +use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS}; use txn_types::TimeStamp; use crate::{ @@ -188,9 +188,15 @@ impl TaskContext { } fn on_schedule(&mut self) { + let elapsed = self.latch_timer.saturating_elapsed(); + if let Some(task) = &self.task.as_ref() { + GLOBAL_TRACKERS.with_tracker(task.tracker, |tracker| { + tracker.metrics.latch_wait_nanos = elapsed.as_nanos() as u64; + }); + } SCHED_LATCH_HISTOGRAM_VEC .get(self.tag) - .observe(self.latch_timer.saturating_elapsed_secs()); + .observe(elapsed.as_secs_f64()); } // Try to own this TaskContext by setting `owned` from false to true. @@ -779,6 +785,7 @@ impl TxnScheduler { new_acquired_locks: Vec, tag: CommandKind, group_name: &str, + sched_details: &SchedulerDetails, ) { // TODO: Does async apply prewrite worth a special metric here? 
if pipelined { @@ -820,6 +827,15 @@ impl TxnScheduler { SCHED_STAGE_COUNTER_VEC.get(tag).next_cmd.inc(); self.schedule_command(None, cmd, cb, None); } else { + GLOBAL_TRACKERS.with_tracker(sched_details.tracker, |tracker| { + tracker.metrics.scheduler_process_nanos = sched_details + .start_process_instant + .saturating_elapsed() + .as_nanos() + as u64; + tracker.metrics.scheduler_throttle_nanos = + sched_details.flow_control_nanos + sched_details.quota_limit_delay_nanos; + }); cb.execute(pr); } } else { @@ -1073,7 +1089,7 @@ impl TxnScheduler { let region_id = task.cmd.ctx().get_region_id(); let ts = task.cmd.ts(); - let mut statistics = Statistics::default(); + let mut sched_details = SchedulerDetails::new(task.tracker, timer); match &task.cmd { Command::Prewrite(_) | Command::PrewritePessimistic(_) => { tls_collect_query(region_id, QueryKind::Prewrite); @@ -1092,18 +1108,19 @@ impl TxnScheduler { fail_point!("scheduler_process"); if task.cmd.readonly() { - self.process_read(snapshot, task, &mut statistics); + self.process_read(snapshot, task, &mut sched_details); } else { - self.process_write(snapshot, task, &mut statistics).await; + self.process_write(snapshot, task, &mut sched_details).await; }; - tls_collect_scan_details(tag.get_str(), &statistics); + tls_collect_scan_details(tag.get_str(), &sched_details.stat); let elapsed = timer.saturating_elapsed(); slow_log!( elapsed, - "[region {}] scheduler handle command: {}, ts: {}", + "[region {}] scheduler handle command: {}, ts: {}, details: {:?}", region_id, tag, - ts + ts, + sched_details, ); } .in_resource_metering_tag(resource_tag) @@ -1112,7 +1129,7 @@ impl TxnScheduler { /// Processes a read command within a worker thread, then posts /// `ReadFinished` message back to the `TxnScheduler`. 
- fn process_read(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { + fn process_read(self, snapshot: E::Snap, task: Task, sched_details: &mut SchedulerDetails) { fail_point!("txn_before_process_read"); debug!("process read cmd in worker pool"; "cid" => task.cid); @@ -1122,7 +1139,7 @@ impl TxnScheduler { let cmd = task.cmd; let pr = unsafe { with_perf_context::(tag, || { - cmd.process_read(snapshot, statistics) + cmd.process_read(snapshot, &mut sched_details.stat) .unwrap_or_else(|e| ProcessResult::Failed { err: e.into() }) }) }; @@ -1135,7 +1152,12 @@ impl TxnScheduler { /// Processes a write command within a worker thread, then posts either a /// `WriteFinished` message if successful or a `FinishedWithErr` message /// back to the `TxnScheduler`. - async fn process_write(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { + async fn process_write( + self, + snapshot: E::Snap, + task: Task, + sched_details: &mut SchedulerDetails, + ) { fail_point!("txn_before_process_write"); let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); @@ -1174,7 +1196,7 @@ impl TxnScheduler { lock_mgr: &self.inner.lock_mgr, concurrency_manager, extra_op: task.extra_op, - statistics, + statistics: &mut sched_details.stat, async_apply_prewrite: self.inner.enable_async_apply_prewrite, raw_ext, }; @@ -1192,17 +1214,32 @@ impl TxnScheduler { res }; + let process_end = Instant::now(); if write_result.is_ok() { // TODO: write bytes can be a bit inaccurate due to error requests or in-memory // pessimistic locks. 
sample.add_write_bytes(write_bytes); } - let read_bytes = statistics.cf_statistics(CF_DEFAULT).flow_stats.read_bytes - + statistics.cf_statistics(CF_LOCK).flow_stats.read_bytes - + statistics.cf_statistics(CF_WRITE).flow_stats.read_bytes; + let read_bytes = sched_details + .stat + .cf_statistics(CF_DEFAULT) + .flow_stats + .read_bytes + + sched_details + .stat + .cf_statistics(CF_LOCK) + .flow_stats + .read_bytes + + sched_details + .stat + .cf_statistics(CF_WRITE) + .flow_stats + .read_bytes; sample.add_read_bytes(read_bytes); let quota_delay = quota_limiter.consume_sample(sample, true).await; if !quota_delay.is_zero() { + let actual_quota_delay = process_end.saturating_elapsed(); + sched_details.quota_limit_delay_nanos = actual_quota_delay.as_nanos() as u64; TXN_COMMAND_THROTTLE_TIME_COUNTER_VEC_STATIC .get(tag) .inc_by(quota_delay.as_micros() as u64); @@ -1298,6 +1335,7 @@ impl TxnScheduler { new_acquired_locks, tag, &group_name, + sched_details, ); return; } @@ -1329,6 +1367,7 @@ impl TxnScheduler { new_acquired_locks, tag, &group_name, + sched_details, ); return; } @@ -1383,7 +1422,9 @@ impl TxnScheduler { .await .unwrap(); } - SCHED_THROTTLE_TIME.observe(start.saturating_elapsed_secs()); + let elapsed = start.saturating_elapsed(); + SCHED_THROTTLE_TIME.observe(elapsed.as_secs_f64()); + sched_details.flow_control_nanos = elapsed.as_nanos() as u64; } } @@ -1516,6 +1557,7 @@ impl TxnScheduler { new_acquired_locks, tag, &group_name, + sched_details, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC .get(tag) @@ -1788,6 +1830,27 @@ enum PessimisticLockMode { InMemory, } +#[derive(Debug)] +struct SchedulerDetails { + tracker: TrackerToken, + stat: Statistics, + start_process_instant: Instant, + quota_limit_delay_nanos: u64, + flow_control_nanos: u64, +} + +impl SchedulerDetails { + fn new(tracker: TrackerToken, start_process_instant: Instant) -> Self { + SchedulerDetails { + tracker, + stat: Default::default(), + start_process_instant, + quota_limit_delay_nanos: 0, + 
flow_control_nanos: 0, + } + } +} + #[cfg(test)] mod tests { use std::thread; diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 284a3f1cb89..5e47ad4745b 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -2430,6 +2430,7 @@ fn test_commands_write_detail() { // assert!(wd.get_apply_mutex_lock_nanos() > 0); assert!(wd.get_apply_write_wal_nanos() > 0); assert!(wd.get_apply_write_memtable_nanos() > 0); + assert!(wd.get_process_nanos() > 0); }; let mut mutation = Mutation::default(); From 7e6dac46bfdca76b4b40c55845c4250e163b6405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 30 Mar 2023 14:36:55 +0800 Subject: [PATCH 611/676] sst_import: make apply asynchronous (#14363) ref tikv/tikv#13848 Signed-off-by: hillium Signed-off-by: Yu Juncen --- components/backup-stream/src/utils.rs | 18 +- components/external_storage/src/lib.rs | 2 +- components/sst_importer/src/metrics.rs | 17 +- components/sst_importer/src/sst_importer.rs | 430 ++++++------------- components/tikv_kv/src/lib.rs | 2 +- src/import/mod.rs | 1 + src/import/raft_writer.rs | 451 ++++++++++++++++++++ src/import/sst_service.rs | 128 +++--- 8 files changed, 695 insertions(+), 354 deletions(-) create mode 100644 src/import/raft_writer.rs diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 77c689da70d..d94ba59b2d5 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -445,15 +445,6 @@ pub struct CallbackWaitGroup { on_finish_all: std::sync::Mutex>>, } -/// A shortcut for making an opaque future type for return type or argument -/// type, which is sendable and not borrowing any variables. -/// -/// `fut![T]` == `impl Future + Send + 'static` -#[macro_export(crate)] -macro_rules! 
future { - ($t:ty) => { impl core::future::Future + Send + 'static }; -} - impl CallbackWaitGroup { pub fn new() -> Arc { Arc::new(Self { @@ -831,6 +822,15 @@ impl<'a> slog::KV for SlogRegion<'a> { } } +/// A shortcut for making an opaque future type for return type or argument +/// type, which is sendable and not borrowing any variables. +/// +/// `future![T]` == `impl Future + Send + 'static` +#[macro_export] +macro_rules! future { + ($t:ty) => { impl core::future::Future + Send + 'static }; +} + pub fn debug_iter(t: impl Iterator) -> impl std::fmt::Debug { DebugIter(RefCell::new(t)) } diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index c344f09968b..211a1b52ad6 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -345,7 +345,7 @@ where pub const MIN_READ_SPEED: usize = 8192; pub async fn read_external_storage_info_buff( - reader: &mut (dyn AsyncRead + Unpin), + reader: &mut (dyn AsyncRead + Unpin + Send), speed_limiter: &Limiter, expected_length: u64, expected_sha256: Option>, diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index 6b4af299ba8..2737d592fc0 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -55,12 +55,12 @@ lazy_static! { pub static ref IMPORTER_DOWNLOAD_BYTES: Histogram = register_histogram!( "tikv_import_download_bytes", "Bucketed histogram of importer download bytes", - exponential_buckets(1024.0, 2.0, 20).unwrap() + exponential_buckets(16.0, 2.0, 20).unwrap() ).unwrap(); pub static ref IMPORTER_APPLY_BYTES: Histogram = register_histogram!( "tikv_import_apply_bytes", "Bucketed histogram of importer apply bytes", - exponential_buckets(1024.0, 2.0, 20).unwrap() + exponential_buckets(16.0, 2.0, 20).unwrap() ) .unwrap(); pub static ref IMPORTER_INGEST_DURATION: HistogramVec = register_histogram_vec!( @@ -113,7 +113,18 @@ lazy_static! 
{ ).unwrap(); pub static ref CACHE_EVENT: IntCounterVec = register_int_counter_vec!( "tikv_import_apply_cache_event", - "The events of caching. event = {add, remove, out-of-quota}", + "The events of caching. event = {add, remove, out-of-quota, hit}", + &["type"] + ).unwrap(); + pub static ref APPLIER_EVENT: IntCounterVec = register_int_counter_vec!( + "tikv_import_applier_event", + "The events of applier event.", &["type"] ).unwrap(); + pub static ref APPLIER_ENGINE_REQUEST_DURATION: HistogramVec = register_histogram_vec!( + "tikv_import_engine_request", + "The request lifetime track of requesting the RaftKv.", + &["type"], + exponential_buckets(0.01, 4.0, 8).unwrap() + ).unwrap(); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 5b55974dff3..907874c6928 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -9,7 +9,7 @@ use std::{ path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Condvar, Mutex, + Arc, }, time::Duration, }; @@ -40,7 +40,10 @@ use tikv_util::{ time::{Instant, Limiter}, HandyRwLock, }; -use tokio::runtime::{Handle, Runtime}; +use tokio::{ + runtime::{Handle, Runtime}, + sync::OnceCell, +}; use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ @@ -106,128 +109,10 @@ impl Drop for MemUsePermit { #[derive(Clone, Debug)] pub enum CacheKvFile { - Mem(Remote), + Mem(Arc>), Fs(Arc), } -/// Remote presents a "remote" object which can be downloaded and then cached. -/// The remote object should generally implement the `ShareOwned` trait. -/// This structure doesn't manage how it is downloaded, it just manages the -/// state. You need to provide the manually downloaded data to the -/// [`DownloadPromise`]. 
-/// Below is the state transform of this: -/// ```text -/// DownloadPromise::fulfill -/// +-----------+ +-----------+ -/// |Downloading+-------->|Cached | -/// +--+--------+ +-----------+ -/// | ^ -/// | | -/// DownloadPromise | | Somebody takes -/// dropped | | over the duty. -/// v | -/// +--------+--+ -/// |Leaked | -/// +-----------+ -/// ``` -#[derive(Debug)] -pub struct Remote(Arc<(Mutex>, Condvar)>); - -impl Clone for Remote { - fn clone(&self) -> Self { - Self(Arc::clone(&self.0)) - } -} - -/// When holding this, the holder has promised to downloading the remote object -/// into local, then provide it to others waiting the object, by -/// [`Self::fulfill()`]. -pub struct DownloadPromise(Arc<(Mutex>, Condvar)>); - -impl DownloadPromise { - /// provide the downloaded data and make it cached. - pub fn fulfill(self, item: T) -> Remote { - let mut l = self.0.as_ref().0.lock().unwrap(); - debug_assert!(matches!(*l, FileCacheInner::Downloading)); - *l = FileCacheInner::Cached(item); - self.0.as_ref().1.notify_all(); - drop(l); - Remote(Arc::clone(&self.0)) - } -} - -impl Drop for DownloadPromise { - fn drop(&mut self) { - let mut l = self.0.as_ref().0.lock().unwrap(); - if matches!(*l, FileCacheInner::Downloading) { - *l = FileCacheInner::Leaked; - self.0.as_ref().1.notify_one(); - } - } -} - -impl Remote { - /// create a downloading remote object. - /// it returns the handle to the remote object and a [`DownloadPromise`], - /// the latter can be used to fulfill the remote object. - /// - /// # Examples - /// ``` - /// # use sst_importer::sst_importer::Remote; - /// let (remote_obj, promise) = Remote::download(); - /// promise.fulfill(42); - /// assert_eq!(remote_obj.get(), Some(42)); - /// ``` - pub fn download() -> (Self, DownloadPromise) { - let inner = Arc::new((Mutex::new(FileCacheInner::Downloading), Condvar::new())); - (Self(Arc::clone(&inner)), DownloadPromise(inner)) - } - - /// Block and wait until the remote object is downloaded. 
- /// # Returns - /// If the remote object has been fulfilled, return `None`. - /// If the remote object hasn't been fulfilled, return a - /// [`DownloadPromise`]: it is time to take over the duty of downloading. - /// - /// # Examples - /// ``` - /// # use sst_importer::sst_importer::Remote; - /// let (remote_obj, promise) = Remote::download(); - /// drop(promise); - /// let new_promise = remote_obj.wait_until_fill(); - /// new_promise - /// .expect("wait_until_fill should return new promise when old promise dropped") - /// .fulfill(42); - /// assert!(remote_obj.wait_until_fill().is_none()); - /// ``` - pub fn wait_until_fill(&self) -> Option> { - let mut l = self.0.as_ref().0.lock().unwrap(); - loop { - match *l { - FileCacheInner::Downloading => { - l = self.0.as_ref().1.wait(l).unwrap(); - } - FileCacheInner::Leaked => { - *l = FileCacheInner::Downloading; - return Some(DownloadPromise(Arc::clone(&self.0))); - } - FileCacheInner::Cached(_) => return None, - } - } - } -} - -impl Remote { - /// Fetch the internal object of the remote object. - pub fn get(&self) -> Option<::Shared> { - let l = self.0.as_ref().0.lock().unwrap(); - match *l { - FileCacheInner::Downloading | FileCacheInner::Leaked => None, - FileCacheInner::Cached(ref t) => Some(t.share_owned()), - } - } -} - /// returns a error indices that we are going to panic in a invalid state. /// (Rust panic information cannot be send to BR, hence client cannot know /// what happens, so we pack it into a `Result`.) @@ -238,18 +123,16 @@ fn bug(message: impl std::fmt::Display) -> Error { )) } -#[derive(Clone, Debug, PartialEq, Eq)] -enum FileCacheInner { - Downloading, - Leaked, - Cached(T), -} - impl CacheKvFile { // get the ref count of item. 
pub fn ref_count(&self) -> usize { match self { - CacheKvFile::Mem(buff) => Arc::strong_count(&buff.0), + CacheKvFile::Mem(buff) => { + if let Some(a) = buff.get() { + return Arc::strong_count(&a.content); + } + Arc::strong_count(buff) + } CacheKvFile::Fs(path) => Arc::strong_count(path), } } @@ -257,7 +140,7 @@ impl CacheKvFile { // check the item is expired. pub fn is_expired(&self, start: &Instant) -> bool { match self { - // The expired duration for memeory is 60s. + // The expired duration for memory is 60s. CacheKvFile::Mem(_) => start.saturating_elapsed() >= Duration::from_secs(60), // The expired duration for local file is 10min. CacheKvFile::Fs(_) => start.saturating_elapsed() >= Duration::from_secs(600), @@ -275,7 +158,8 @@ pub struct SstImporter { compression_types: HashMap, cached_storage: CacheMap, - download_rt: Runtime, + // We need to keep reference to the runtime so background tasks won't be dropped. + _download_rt: Runtime, file_locks: Arc>, mem_use: Arc, mem_limit: Arc, @@ -323,7 +207,7 @@ impl SstImporter { compression_types: HashMap::with_capacity(2), file_locks: Arc::new(DashMap::default()), cached_storage, - download_rt, + _download_rt: download_rt, mem_use: Arc::new(AtomicU64::new(0)), mem_limit: Arc::new(AtomicU64::new(memory_limit)), }) @@ -491,6 +375,7 @@ impl SstImporter { self.switcher.get_mode() } + #[cfg(test)] fn download_file_from_external_storage( &self, file_length: u64, @@ -501,7 +386,7 @@ impl SstImporter { speed_limiter: &Limiter, restore_config: external_storage_export::RestoreConfig, ) -> Result<()> { - self.download_rt + self._download_rt .block_on(self.async_download_file_from_external_storage( file_length, src_file_name, @@ -614,7 +499,7 @@ impl SstImporter { let mut need_retain = true; match c { CacheKvFile::Mem(buff) => { - let buflen = buff.get().map(|v| v.len()).unwrap_or_default(); + let buflen = buff.get().map(|v| v.content.len()).unwrap_or_default(); // The term of recycle memeory is 60s. 
if c.ref_count() == 1 && c.is_expired(start) { CACHE_EVENT.with_label_values(&["remove"]).inc(); @@ -685,48 +570,14 @@ impl SstImporter { } } - pub fn do_read_kv_file( + async fn exec_download( &self, meta: &KvMeta, rewrite_rule: &RewriteRule, ext_storage: Arc, speed_limiter: &Limiter, - ) -> Result { + ) -> Result { let start = Instant::now(); - let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); - - let promise = { - let lock = self.file_locks.entry(dst_name); - IMPORTER_APPLY_DURATION - .with_label_values(&["download-get-lock"]) - .observe(start.saturating_elapsed().as_secs_f64()); - - match lock { - Entry::Occupied(mut ent) => match ent.get_mut() { - (CacheKvFile::Mem(buff), last_used) => { - *last_used = Instant::now(); - match buff.wait_until_fill() { - Some(handle) => handle, - None => return Ok(ent.get().0.clone()), - } - } - _ => { - return Err(bug(concat!( - "using both read-to-memory and download-to-file is unacceptable for now.", - "(If you think it is possible in the future you are reading this, ", - "please change this line to `return item.get.0.clone()`)", - "(Please also check the state transform is OK too.)", - ))); - } - }, - Entry::Vacant(ent) => { - let (cache, handle) = Remote::download(); - ent.insert((CacheKvFile::Mem(cache), Instant::now())); - handle - } - } - }; - let permit = self .request_memory(meta) .ok_or_else(|| Error::ResourceNotEnough(String::from("memory is limited")))?; @@ -755,24 +606,75 @@ impl SstImporter { file_crypter: None, }; - let buff = self.read_kv_files_from_external_storage( - file_length, - meta.get_name(), - ext_storage, - speed_limiter, - restore_config, - )?; + let buff = self + .read_kv_files_from_external_storage( + file_length, + meta.get_name(), + ext_storage, + speed_limiter, + restore_config, + ) + .await?; IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); IMPORTER_APPLY_DURATION - .with_label_values(&["download"]) + .with_label_values(&["exec_download"]) 
.observe(start.saturating_elapsed().as_secs_f64()); let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; - Ok(CacheKvFile::Mem(promise.fulfill(LoadedFile { + Ok(LoadedFile { content: Arc::from(rewrite_buff.into_boxed_slice()), permit, - }))) + }) + } + + pub async fn do_read_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + speed_limiter: &Limiter, + ) -> Result { + let start = Instant::now(); + let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); + + let cache = { + let lock = self.file_locks.entry(dst_name); + IMPORTER_APPLY_DURATION + .with_label_values(&["download-get-lock"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + match lock { + Entry::Occupied(mut ent) => match ent.get_mut() { + (CacheKvFile::Mem(buff), last_used) => { + *last_used = Instant::now(); + Arc::clone(buff) + } + _ => { + return Err(bug(concat!( + "using both read-to-memory and download-to-file is unacceptable for now.", + "(If you think it is possible in the future you are reading this, ", + "please change this line to `return item.get.0.clone()`)", + "(Please also check the state transform is OK too.)", + ))); + } + }, + Entry::Vacant(ent) => { + let cache = Arc::new(OnceCell::new()); + ent.insert((CacheKvFile::Mem(Arc::clone(&cache)), Instant::now())); + cache + } + } + }; + + if cache.initialized() { + CACHE_EVENT.with_label_values(&["hit"]).inc(); + } + + cache + .get_or_try_init(|| self.exec_download(meta, rewrite_rule, ext_storage, speed_limiter)) + .await?; + Ok(CacheKvFile::Mem(cache)) } pub fn wrap_kms( @@ -795,7 +697,7 @@ impl SstImporter { } } - fn read_kv_files_from_external_storage( + async fn read_kv_files_from_external_storage( &self, file_length: u64, file_name: &str, @@ -821,15 +723,14 @@ impl SstImporter { encrypt_wrap_reader(file_crypter, inner)? 
}; - let r = - self.download_rt - .block_on(external_storage_export::read_external_storage_info_buff( - &mut reader, - speed_limiter, - file_length, - expected_sha256, - external_storage_export::MIN_READ_SPEED, - )); + let r = external_storage_export::read_external_storage_info_buff( + &mut reader, + speed_limiter, + file_length, + expected_sha256, + external_storage_export::MIN_READ_SPEED, + ) + .await; let url = ext_storage.url()?.to_string(); let buff = r.map_err(|e| Error::CannotReadExternalStorage { url: url.to_string(), @@ -841,7 +742,7 @@ impl SstImporter { Ok(buff) } - pub fn read_from_kv_file( + pub async fn read_from_kv_file( &self, meta: &KvMeta, rewrite_rule: &RewriteRule, @@ -850,13 +751,20 @@ impl SstImporter { speed_limiter: &Limiter, ) -> Result> { let c = if self.import_support_download() { - self.do_download_kv_file(meta, backend, speed_limiter)? + self.do_download_kv_file(meta, backend, speed_limiter) + .await? } else { - self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter)? + self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter) + .await? }; match c { // If cache memroy, it has been rewrite, return buffer directly. - CacheKvFile::Mem(buff) => buff.get().ok_or_else(|| bug("invalid cache state")), + CacheKvFile::Mem(buff) => Ok(Arc::clone( + &buff + .get() + .ok_or_else(|| bug("invalid cache state"))? + .content, + )), // If cache file name, it need to read and rewrite. CacheKvFile::Fs(path) => { let file = File::open(path.as_ref())?; @@ -870,7 +778,7 @@ impl SstImporter { } } - pub fn do_download_kv_file( + pub async fn do_download_kv_file( &self, meta: &KvMeta, backend: &StorageBackend, @@ -910,7 +818,7 @@ impl SstImporter { expected_sha256, file_crypter: None, }; - self.download_file_from_external_storage( + self.async_download_file_from_external_storage( meta.get_length(), src_name, path.temp.clone(), @@ -918,8 +826,10 @@ impl SstImporter { false, // don't support encrypt for now. 
speed_limiter, + "", restore_config, - )?; + ) + .await?; info!( "download file finished {}, offset {}, length {}", src_name, @@ -1082,7 +992,7 @@ impl SstImporter { speed_limiter: Limiter, engine: E, ) -> Result> { - self.download_rt.block_on(self.download_ext( + self._download_rt.block_on(self.download_ext( meta, backend, name, @@ -1475,10 +1385,7 @@ mod tests { use tempfile::Builder; use test_sst_importer::*; use test_util::new_test_key_manager; - use tikv_util::{ - codec::stream_event::EventEncoder, stream::block_on_external_io, - sys::thread::StdThreadBuildWrapper, - }; + use tikv_util::{codec::stream_event::EventEncoder, stream::block_on_external_io}; use txn_types::{Value, WriteType}; use uuid::Uuid; @@ -2052,17 +1959,16 @@ mod tests { // test do_read_kv_file() let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); - let output = importer - .do_read_kv_file( - &kv_meta, - rewrite_rule, - ext_storage, - &Limiter::new(f64::INFINITY), - ) - .unwrap(); + let output = block_on_external_io(importer.do_read_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &Limiter::new(f64::INFINITY), + )) + .unwrap(); assert!( - matches!(output.clone(), CacheKvFile::Mem(rc) if &*rc.get().unwrap() == buff.as_slice()), + matches!(output.clone(), CacheKvFile::Mem(rc) if &*rc.get().unwrap().content == buff.as_slice()), "{:?}", output ); @@ -2116,15 +2022,14 @@ mod tests { ..Default::default() }; - let output = importer - .read_kv_files_from_external_storage( - kv_meta.get_length(), - kv_meta.get_name(), - ext_storage.clone(), - &Limiter::new(f64::INFINITY), - restore_config, - ) - .unwrap(); + let output = block_on_external_io(importer.read_kv_files_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + ext_storage.clone(), + &Limiter::new(f64::INFINITY), + restore_config, + )) + .unwrap(); assert_eq!( buff, output, @@ -2140,15 +2045,14 @@ mod tests { ..Default::default() }; - let output = importer - .read_kv_files_from_external_storage( - len, - 
kv_meta.get_name(), - ext_storage, - &Limiter::new(f64::INFINITY), - restore_config, - ) - .unwrap(); + let output = block_on_external_io(importer.read_kv_files_from_external_storage( + len, + kv_meta.get_name(), + ext_storage, + &Limiter::new(f64::INFINITY), + restore_config, + )) + .unwrap(); assert_eq!(&buff[offset as _..(offset + len) as _], &output[..]); } @@ -2182,15 +2086,14 @@ mod tests { // test do_download_kv_file(). assert!(importer.import_support_download()); - let output = importer - .read_from_kv_file( - &kv_meta, - rewrite_rule, - ext_storage, - &backend, - &Limiter::new(f64::INFINITY), - ) - .unwrap(); + let output = block_on_external_io(importer.read_from_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &backend, + &Limiter::new(f64::INFINITY), + )) + .unwrap(); assert_eq!(*output, buff); check_file_exists(&path.save, None); @@ -3101,7 +3004,7 @@ mod tests { SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); let key = "file1"; - let (r, _) = Remote::download(); + let r = Arc::new(OnceCell::new()); let value = (CacheKvFile::Mem(r), Instant::now()); let lock = importer.file_locks.entry(key.to_string()).or_insert(value); @@ -3119,53 +3022,4 @@ mod tests { let _buff = v.0.clone(); assert_eq!(v.0.ref_count(), 2); } - - #[test] - fn test_remote_waiting() { - let (r, dl) = Remote::download(); - let r2 = r.clone(); - let js = (0..2) - .map(|_| { - let r = r.clone(); - std::thread::spawn(move || { - assert!(r.wait_until_fill().is_none()); - r.get() - }) - }) - .collect::>(); - dl.fulfill(42); - for j in js { - assert!(matches!(j.join(), Ok(Some(42)))); - } - assert_eq!(r2.get(), Some(42)); - } - - #[test] - fn test_remote_drop_in_one_thread() { - let (r, dl) = Remote::download(); - drop(dl); - let p = r.wait_until_fill(); - assert!(p.is_some()); - p.unwrap().fulfill("Kitty"); - assert_eq!(r.get(), Some("Kitty")); - } - - #[test] - fn test_remote_take_duty() { - let (r, dl) = Remote::download(); - let js = (0..4).map(|i| { 
- let r = r.clone(); - std::thread::Builder::new() - .name(format!("rd-{}", i)) - .spawn_wrapper(move || match r.wait_until_fill() { - Some(x) => x.fulfill(42).get(), - None => r.get(), - }) - .unwrap() - }); - drop(dl); - for j in js { - assert!(matches!(j.join(), Ok(Some(42)))); - } - } } diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 05d039d2690..9e6c1b9ca3a 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -225,7 +225,7 @@ impl PessimisticLockPair for Modify { } } -#[derive(Default)] +#[derive(Default, Debug)] pub struct WriteData { pub modifies: Vec, pub extra: TxnExtra, diff --git a/src/import/mod.rs b/src/import/mod.rs index e2fa3729e52..6fe43b9aa32 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -13,6 +13,7 @@ //! inside TiKV because it needs to interact with raftstore. mod duplicate_detect; +mod raft_writer; mod sst_service; use std::fmt::Debug; diff --git a/src/import/raft_writer.rs b/src/import/raft_writer.rs new file mode 100644 index 00000000000..a40297b932e --- /dev/null +++ b/src/import/raft_writer.rs @@ -0,0 +1,451 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +//! This module contains types for asynchronously applying the write batches +//! into the storage. 
+ +use std::{ + collections::HashMap, + sync::{Arc, Mutex}, +}; + +use futures::{Future, Stream, StreamExt}; +use kvproto::kvrpcpb::Context; +use sst_importer::metrics::{APPLIER_ENGINE_REQUEST_DURATION, APPLIER_EVENT, IMPORTER_APPLY_BYTES}; +use tikv_kv::{with_tls_engine, Engine, WriteData, WriteEvent}; +use tikv_util::time::Instant; +use tokio::sync::{Semaphore, SemaphorePermit}; + +use crate::storage; + +pub async fn wait_write( + mut s: impl Stream + Send + Unpin, +) -> storage::Result<()> { + match s.next().await { + Some(WriteEvent::Finished(Ok(()))) => Ok(()), + Some(WriteEvent::Finished(Err(e))) => Err(e.into()), + Some(e) => Err(box_err!("unexpected event: {:?}", e)), + None => Err(box_err!("stream closed")), + } +} + +const MAX_CONCURRENCY_PER_REGION: usize = 16; + +async fn acquire_semaphore(smp: &Arc) -> Option> { + if let Ok(pmt) = smp.try_acquire() { + return Some(pmt); + } + APPLIER_EVENT.with_label_values(&["raft-throttled"]).inc(); + smp.acquire().await.ok() +} + +#[derive(Clone, Default)] +/// A structure for throttling write throughput by region. +/// It uses the [`Engine`] stored in the thread local storage to write data. +/// Check the method [`tikv_kv::set_tls_engine`] for more details about the +/// thread local engine. +pub(crate) struct ThrottledTlsEngineWriter(Arc>); + +impl ThrottledTlsEngineWriter { + /// Write into the thread local storage engine. + /// + /// # Safety + /// + /// Before polling the future this returns, make sure the carrier thread's + /// `TLS_ENGINE_ANY` is an engine typed `E`, or at least has the same + /// memory layout of `E`. 
+ pub unsafe fn write( + &self, + wd: WriteData, + ctx: Context, + ) -> impl Future> + Send + 'static { + let mut this = self.0.lock().unwrap(); + let max_permit = this.max_permit; + let start = Instant::now_coarse(); + let sem = this + .sems + .entry(ctx.get_region_id()) + .or_insert_with(|| { + APPLIER_EVENT.with_label_values(&["new-writer"]).inc(); + Arc::new(Semaphore::new(max_permit)) + }) + .clone(); + async move { + APPLIER_ENGINE_REQUEST_DURATION + .with_label_values(&["queuing"]) + .observe(start.saturating_elapsed_secs()); + let start = Instant::now_coarse(); + let _prm = match acquire_semaphore(&sem).await { + Some(prm) => prm, + // When the permit has been closed. (Maybe tikv is shutting down?) + None => { + return Err(box_err!( + "the semaphore bind to region {} has been closed", + ctx.get_region_id() + )); + } + }; + + APPLIER_ENGINE_REQUEST_DURATION + .with_label_values(&["get_permit"]) + .observe(start.saturating_elapsed_secs()); + let start = Instant::now_coarse(); + let size = wd.size(); + let fut = with_tls_engine::(move |engine| { + engine.async_write(&ctx, wd, WriteEvent::BASIC_EVENT, None) + }); + let res = wait_write(fut).await; + + APPLIER_ENGINE_REQUEST_DURATION + .with_label_values(&["apply"]) + .observe(start.saturating_elapsed_secs()); + IMPORTER_APPLY_BYTES.observe(size as _); + res + } + } + + /// try to trigger a run of GC. + /// + /// # Returns + /// + /// If we still need to do keep doing GC (there are other references to the + /// handle), return `true`, otherwise `false`. 
+ pub fn try_gc(&self) -> bool { + if Arc::strong_count(&self.0) == 1 { + return false; + } + + let mut this = self.0.lock().unwrap(); + + let before_count = this.sems.len(); + this.sems.retain(|_, v| Arc::strong_count(v) > 1); + let after_count = this.sems.len(); + + APPLIER_EVENT + .with_label_values(&["gc-writer"]) + .inc_by((before_count.saturating_sub(after_count)) as _); + true + } + + #[cfg(test)] + pub fn with_max_concurrency_per_region(conc: usize) -> Self { + let mut inner = Inner::default(); + inner.max_permit = conc; + Self(Arc::new(Mutex::new(inner))) + } + + #[cfg(test)] + pub fn inspect_inflight(&self) -> HashMap { + let this = self.0.lock().unwrap(); + let max_permit = this.max_permit; + this.sems + .iter() + .map(|(rid, sem)| (*rid, max_permit - sem.available_permits())) + .collect() + } + + #[cfg(test)] + pub fn inspect_worker(&self) -> usize { + let this = self.0.lock().unwrap(); + this.sems.len() + } +} + +struct Inner { + sems: HashMap>, + max_permit: usize, +} + +impl Default for Inner { + fn default() -> Self { + Self { + sems: Default::default(), + max_permit: MAX_CONCURRENCY_PER_REGION, + } + } +} + +#[cfg(test)] +mod test { + use std::{convert::identity, iter::IntoIterator, sync::Mutex, time::Duration}; + + use engine_rocks::RocksEngineIterator; + use engine_traits::{Iterator, ALL_CFS, CF_DEFAULT, CF_WRITE}; + use futures::{future::join_all, Future}; + use kvproto::kvrpcpb::Context; + use tempfile::TempDir; + use tikv_kv::{Engine, Modify, RocksEngine, SnapContext, Snapshot, WriteData, WriteEvent}; + use tikv_util::sys::thread::ThreadBuildWrapper; + use tokio::runtime::{Builder, Runtime}; + use txn_types::{Key, TimeStamp, Write, WriteType}; + + use super::ThrottledTlsEngineWriter; + use crate::storage::TestEngineBuilder; + + struct Suite { + handle: ThrottledTlsEngineWriter, + rt: Runtime, + eng: RocksEngine, + + tso: u64, + mirror: RocksEngine, + + _temp_dirs: [TempDir; 2], + } + + impl Suite { + fn wait(&self, fut: impl Future) -> T { + 
self.rt.block_on(fut) + } + + fn batch<'a, 'b, 'this: 'a + 'b>( + &mut self, + region_id: u64, + f: impl FnOnce(&mut dyn FnMut(&'a str, &'b str)) + 'this, + ) -> (WriteData, Context) { + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + let mut b = vec![]; + let mut t = |key, value| txn(key, value, &mut self.tso, &mut b); + f(&mut t); + let batch = WriteData::new(b.clone(), Default::default()); + let batch2 = WriteData::new(b, Default::default()); + self.wait(write_to_engine(&ctx, &self.mirror, batch)); + (batch2, ctx) + } + + fn send_to_applier( + &self, + args: impl std::iter::Iterator, + ) -> impl Future { + let fut = args + .map(|arg| { + self.rt.spawn( + // SAFETY: we have already register the engine. + unsafe { self.handle.write::(arg.0, arg.1) }, + ) + }) + .collect::>(); + async { + join_all( + fut.into_iter() + .map(|fut| async move { fut.await.unwrap().unwrap() }), + ) + .await; + } + } + + fn check(&mut self, name: &str) { + for cf in ALL_CFS { + let the_mirror = iterate_over(&mut self.mirror, cf); + let real_world = iterate_over(&mut self.eng, cf); + compare_iter(the_mirror, real_world) + .map_err(|err| format!("case {name}: {err}")) + .unwrap(); + } + } + } + + fn create_applier(max_pending_raft_cmd: usize) -> Suite { + let temp_dirs = [TempDir::new().unwrap(), TempDir::new().unwrap()]; + let engine = TestEngineBuilder::new() + .path(temp_dirs[0].path()) + .build() + .unwrap(); + let eng = engine.clone(); + let engine = Mutex::new(engine); + let mirror = TestEngineBuilder::new() + .path(temp_dirs[1].path()) + .build() + .unwrap(); + let rt = Builder::new_multi_thread() + .enable_all() + .worker_threads(1) + .after_start_wrapper(move || tikv_kv::set_tls_engine(engine.lock().unwrap().clone())) + // SAFETY: see the line above. 
+ .before_stop_wrapper(|| unsafe { tikv_kv::destroy_tls_engine::() }) + .build() + .unwrap(); + let handle = + ThrottledTlsEngineWriter::with_max_concurrency_per_region(max_pending_raft_cmd); + Suite { + handle, + rt, + eng, + tso: 1u64, + mirror, + _temp_dirs: temp_dirs, + } + } + + async fn write_to_engine(ctx: &Context, e: &RocksEngine, batch: WriteData) { + use futures_util::StreamExt; + e.async_write(ctx, batch, WriteEvent::BASIC_EVENT, None) + .next() + .await + .unwrap(); + } + + fn iterate_over(e: &mut RocksEngine, cf: &'static str) -> RocksEngineIterator { + let snap = e.snapshot(SnapContext::default()).unwrap(); + let mut iter = snap.iter(cf, Default::default()).unwrap(); + iter.seek_to_first().unwrap(); + iter + } + + fn check_eq( + a: T, + b: T, + tag: &str, + show: impl Fn(T) -> D, + ) -> Result<(), String> { + if a != b { + return Err(format!("{} not match: {} vs {}", tag, show(a), show(b))); + } + Ok(()) + } + + fn compare_iter(mut i1: impl Iterator, mut i2: impl Iterator) -> Result<(), String> { + while i1.valid().unwrap() && i2.valid().unwrap() { + check_eq(i1.key(), i2.key(), "key", <[u8]>::escape_ascii)?; + check_eq(i1.value(), i2.value(), "value", <[u8]>::escape_ascii)?; + i1.next().unwrap(); + i2.next().unwrap(); + } + check_eq(i1.valid().unwrap(), i2.valid().unwrap(), "length", identity)?; + Ok(()) + } + + fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { + let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); + let v = Write::new(ty, TimeStamp::new(start_ts), None); + (k.into_encoded(), v.as_ref().to_bytes()) + } + + fn default(key: &[u8], val: &[u8], start_ts: u64) -> (Vec, Vec) { + let k = Key::from_raw(key).append_ts(TimeStamp::new(start_ts)); + (k.into_encoded(), val.to_owned()) + } + + fn default_req(key: &[u8], val: &[u8], start_ts: u64) -> Modify { + let (k, v) = default(key, val, start_ts); + Modify::Put(CF_DEFAULT, Key::from_encoded(k), v) + } + + fn write_req(key: &[u8], ty: WriteType, 
commit_ts: u64, start_ts: u64) -> Modify { + let (k, v) = write(key, ty, commit_ts, start_ts); + if ty == WriteType::Delete { + Modify::Delete(CF_WRITE, Key::from_encoded(k)) + } else { + Modify::Put(CF_WRITE, Key::from_encoded(k), v) + } + } + + fn txn(key: &str, value: &str, tso: &mut u64, append_to: &mut Vec) { + let start = *tso; + let commit = *tso + 1; + *tso += 2; + append_to.extend([ + default_req(key.as_bytes(), value.as_bytes(), start), + write_req(key.as_bytes(), WriteType::Put, start, commit), + ]) + } + + #[test] + fn test_basic() { + let mut suite = create_applier(16); + let b1 = suite.batch(1, |t| { + t("1", "amazing world in my dream"); + t("2", "gazing at the abyss"); + }); + let b2 = suite.batch(2, |t| { + t("3", "the forest leaves drop"); + t("4", "the meaningless words in a test case"); + }); + let fut = suite.send_to_applier(vec![b1, b2].into_iter()); + suite.wait(fut); + + suite.check("basic"); + } + + #[test] + // Clippy doesn't know about the romantic relationship between lazy evaluation and + // side-effective ;) + #[allow(clippy::needless_collect)] + fn test_inflight_max() { + let mut suite = create_applier(3); + + let b1 = (1..6) + .map(|_| { + suite.batch(1, move |t| { + t("al-kīmiyā", "following the light of the moon and stars, the guide of the sun and winds."); + }) + }) + .collect::>(); + let b2 = (1..3) + .map(|_| { + suite.batch(2, move |t| { + t( + "sole key to this mystery", + "fib this n = if n < 2 then n else this (n-1) + this (n-2)", + ); + }) + }) + .collect::>(); + fail::cfg("rockskv_write_modifies", "sleep(5000)").unwrap(); + let fut = suite.send_to_applier(b1.into_iter().chain(b2)); + std::thread::sleep(Duration::from_secs(1)); + let pending_requests = suite.handle.inspect_inflight(); + assert_eq!(*pending_requests.get(&1).unwrap(), 3usize); + assert_eq!(*pending_requests.get(&2).unwrap(), 2usize); + fail::cfg("rockskv_write_modifies", "off").unwrap(); + suite.wait(fut); + + suite.check("inflight_max"); + } + + #[test] + fn 
test_gc() { + let mut suite = create_applier(16); + let b1 = suite.batch(1, |t| { + t("where is the sun", "it is in the clear sky"); + t("where are the words", "they are in some language model"); + t( + "where is the language model", + "I dunno, these sentences are generated by a human.", + ); + }); + let b2 = suite.batch(2, |t| { + t("...and this case needs two batches", "why?"); + t( + "It is by... tradition.", + "If a case is TOO short, who will believe it is effective?", + ); + t( + "Perhaps we should make the `RocksEngine` be able to distinguish requests.", + "So...", + ); + t( + "We can block `b2` but not for `b1`", + "then we can check there should be only one running worker.", + ); + }); + assert_eq!(suite.handle.inspect_worker(), 0); + fail::cfg("rockskv_async_write", "sleep(5000)").unwrap(); + let fut = suite.send_to_applier(std::iter::once(b1)); + assert_eq!(suite.handle.inspect_worker(), 1); + let fut2 = suite.send_to_applier(std::iter::once(b2)); + assert_eq!(suite.handle.inspect_worker(), 2); + + fail::cfg("rockskv_async_write", "off").unwrap(); + suite.wait(async move { + fut.await; + fut2.await; + }); + + let hnd = suite.handle.clone(); + assert!(hnd.try_gc()); + assert_eq!(suite.handle.inspect_worker(), 0); + + drop(suite); + assert!(!hnd.try_gc()); + } +} diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 12cb0ca892b..4707b348bc5 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -2,6 +2,7 @@ use std::{ collections::{HashMap, VecDeque}, + convert::identity, future::Future, path::PathBuf, sync::{Arc, Mutex}, @@ -11,8 +12,7 @@ use std::{ use collections::HashSet; use engine_traits::{CompactExt, MiscExt, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{sink::SinkExt, stream::TryStreamExt, Stream, StreamExt, TryFutureExt}; -use futures_executor::{ThreadPool, ThreadPoolBuilder}; +use futures::{sink::SinkExt, stream::TryStreamExt, FutureExt, TryFutureExt}; use grpcio::{ 
ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -42,14 +42,24 @@ use tikv_util::{ use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; -use super::make_rpc_error; +use super::{ + make_rpc_error, + raft_writer::{self, wait_write}, +}; use crate::{ import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE, storage::{self, errors::extract_region_error_from_error}, }; -const MAX_INFLIGHT_RAFT_MSGS: usize = 64; +/// The concurrency of sending raft request for every `apply` requests. +/// This value `16` would mainly influence the speed of applying a huge file: +/// when we downloading the files into disk, loading all of them into memory may +/// lead to OOM. This would be able to back-pressure them. +/// (only log files greater than 16 * 7M = 112M would be throttled by this.) +/// NOTE: Perhaps add a memory quota for download to disk mode and get rid of +/// this value? +const REQUEST_WRITE_CONCURRENCY: usize = 16; /// The extra bytes required by the wire encoding. /// Generally, a field (and a embedded message) would introduce 2 extra /// bytes. In detail, they are: @@ -63,6 +73,10 @@ const MAX_INFLIGHT_RAFT_MSGS: usize = 64; /// content length is greater than 128, however when the length is greater than /// 128, the extra 1~4 bytes can be ignored. const WIRE_EXTRA_BYTES: usize = 10; +/// The interval of running the GC for +/// [`raft_writer::ThrottledTlsEngineWriter`]. There aren't too many items held +/// in the writer. So we can run the GC less frequently. 
+const WRITER_GC_INTERVAL: Duration = Duration::from_secs(300); fn transfer_error(err: storage::Error) -> ImportPbError { let mut e = ImportPbError::default(); @@ -73,13 +87,15 @@ fn transfer_error(err: storage::Error) -> ImportPbError { e } -async fn wait_write(mut s: impl Stream + Send + Unpin) -> storage::Result<()> { - match s.next().await { - Some(WriteEvent::Finished(Ok(()))) => Ok(()), - Some(WriteEvent::Finished(Err(e))) => Err(e.into()), - Some(e) => Err(box_err!("unexpected event: {:?}", e)), - None => Err(box_err!("stream closed")), +fn convert_join_error(err: tokio::task::JoinError) -> ImportPbError { + let mut e = ImportPbError::default(); + if err.is_cancelled() { + e.set_message("task canceled, probably runtime is shutting down.".to_owned()); } + if err.is_panic() { + e.set_message(format!("panicked! {}", err)) + } + e } /// ImportSstService provides tikv-server with the ability to ingest SST files. @@ -92,16 +108,12 @@ pub struct ImportSstService { tablets: LocalTablets, engine: E, threads: Arc, - // For now, PiTR cannot be executed in the tokio runtime because it is synchronous and may - // blocks. (tokio is so strict... it panics if we do insane things like blocking in an async - // context.) - // We need to execute these code in a context which allows blocking. - // FIXME: Make PiTR restore asynchronous. Get rid of this pool. 
- block_threads: Arc, importer: Arc, limiter: Limiter, task_slots: Arc>>, raft_entry_max_size: ReadableSize, + + writer: raft_writer::ThrottledTlsEngineWriter, } struct RequestCollector { @@ -272,6 +284,7 @@ impl ImportSstService { importer: Arc, ) -> Self { let props = tikv_util::thread_group::current_properties(); + let eng = Mutex::new(engine.clone()); let threads = tokio::runtime::Builder::new_multi_thread() .worker_threads(cfg.num_threads) .enable_all() @@ -280,26 +293,27 @@ impl ImportSstService { tikv_util::thread_group::set_properties(props.clone()); tikv_alloc::add_thread_memory_accessor(); set_io_type(IoType::Import); + tikv_kv::set_tls_engine(eng.lock().unwrap().clone()); }) - .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) - .build() - .unwrap(); - let props = tikv_util::thread_group::current_properties(); - let block_threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .after_start_wrapper(move || { - tikv_util::thread_group::set_properties(props.clone()); - tikv_alloc::add_thread_memory_accessor(); - set_io_type(IoType::Import); + .before_stop_wrapper(move || { + tikv_alloc::remove_thread_memory_accessor(); + // SAFETY: we have set the engine at some lines above with type `E`. 
+ unsafe { tikv_kv::destroy_tls_engine::() }; }) - .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) - .create() + .build() .unwrap(); if let LocalTablets::Singleton(tablet) = &tablets { importer.start_switch_mode_check(threads.handle(), tablet.clone()); } + let writer = raft_writer::ThrottledTlsEngineWriter::default(); + let gc_handle = writer.clone(); + threads.spawn(async move { + while gc_handle.try_gc() { + tokio::time::sleep(WRITER_GC_INTERVAL).await; + } + }); + let cfg_mgr = ConfigManager::new(cfg); threads.spawn(Self::tick(importer.clone(), cfg_mgr.clone())); @@ -307,12 +321,12 @@ impl ImportSstService { cfg: cfg_mgr, tablets, threads: Arc::new(threads), - block_threads: Arc::new(block_threads), engine, importer, limiter: Limiter::new(f64::INFINITY), task_slots: Arc::new(Mutex::new(HashSet::default())), raft_entry_max_size, + writer, } } @@ -475,7 +489,7 @@ impl ImportSstService { async fn apply_imp( mut req: ApplyRequest, importer: Arc, - engine: E, + writer: raft_writer::ThrottledTlsEngineWriter, limiter: Limiter, max_raft_size: usize, ) -> std::result::Result, ImportPbError> { @@ -500,13 +514,15 @@ impl ImportSstService { let mut tasks = metas.iter().zip(rules.iter()).peekable(); while let Some((meta, rule)) = tasks.next() { - let buff = importer.read_from_kv_file( - meta, - rule, - ext_storage.clone(), - req.get_storage_backend(), - &limiter, - )?; + let buff = importer + .read_from_kv_file( + meta, + rule, + ext_storage.clone(), + req.get_storage_backend(), + &limiter, + ) + .await?; if let Some(mut r) = importer.do_apply_kv_file( meta.get_start_key(), meta.get_end_key(), @@ -524,20 +540,29 @@ impl ImportSstService { } let is_last_task = tasks.peek().is_none(); - for req in collector.drain_pending_writes(is_last_task) { - let f = engine.async_write(&context, req, WriteEvent::BASIC_EVENT, None); - inflight_futures.push_back(f); - if inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { - 
wait_write(inflight_futures.pop_front().unwrap()) - .await - .map_err(transfer_error)?; + for w in collector.drain_pending_writes(is_last_task) { + // Record the start of a task would greatly help us to inspect pending + // tasks. + APPLIER_EVENT.with_label_values(&["begin_req"]).inc(); + // SAFETY: we have registered the thread local storage engine into the thread + // when creating them. + let task = unsafe { + writer + .write::(w, context.clone()) + .map_err(transfer_error) + }; + inflight_futures.push_back( + tokio::spawn(task) + .map_err(convert_join_error) + .map(|x| x.and_then(identity)), + ); + if inflight_futures.len() >= REQUEST_WRITE_CONCURRENCY { + inflight_futures.pop_front().unwrap().await?; } } } assert!(collector.is_empty()); - for f in inflight_futures { - wait_write(f).await.map_err(transfer_error)?; - } + futures::future::try_join_all(inflight_futures).await?; Ok(range) } @@ -728,7 +753,6 @@ impl ImportSst for ImportSstService { sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&[label]) .observe(start.saturating_elapsed().as_secs_f64()); - crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); @@ -740,9 +764,9 @@ impl ImportSst for ImportSstService { let label = "apply"; let start = Instant::now(); let importer = self.importer.clone(); - let engine = self.engine.clone(); let limiter = self.limiter.clone(); let max_raft_size = self.raft_entry_max_size.0 as usize; + let applier = self.writer.clone(); let handle_task = async move { // Records how long the apply task waits to be scheduled. 
@@ -752,7 +776,7 @@ impl ImportSst for ImportSstService { let mut resp = ApplyResponse::default(); - match Self::apply_imp(req, importer, engine, limiter, max_raft_size).await { + match Self::apply_imp(req, importer, applier, limiter, max_raft_size).await { Ok(Some(r)) => resp.set_range(r), Err(e) => resp.set_error(e), _ => {} @@ -761,7 +785,7 @@ impl ImportSst for ImportSstService { debug!("finished apply kv file with {:?}", resp); crate::send_rpc_response!(Ok(resp), sink, label, start); }; - self.block_threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Downloads the file and performs key-rewrite for later ingesting. From e0d25f90fff0bda07976cdee2c81ec1cba0029b1 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 30 Mar 2023 16:08:55 +0800 Subject: [PATCH 612/676] server: Introduce a common layer between server and server2 (#14395) ref tikv/tikv#14401 server: Introduce a common layer between server and server2 Signed-off-by: CalvinNeo Co-authored-by: Ti Chi Robot --- components/server/src/common.rs | 238 +++++++++++ components/server/src/lib.rs | 1 + components/server/src/server.rs | 441 ++++++-------------- components/server/src/server2.rs | 403 +++++------------- components/test_raftstore-v2/src/cluster.rs | 49 +-- components/test_raftstore/src/cluster.rs | 14 +- 6 files changed, 499 insertions(+), 647 deletions(-) create mode 100644 components/server/src/common.rs diff --git a/components/server/src/common.rs b/components/server/src/common.rs new file mode 100644 index 00000000000..5c6dfa16120 --- /dev/null +++ b/components/server/src/common.rs @@ -0,0 +1,238 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::{ + cmp, env, + net::SocketAddr, + path::{Path, PathBuf}, + sync::{mpsc, Arc}, + u64, +}; + +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::FlowInfo; +use error_code::ErrorCodeExt; +use file_system::{set_io_rate_limiter, BytesFetcher, File}; +use tikv::config::TikvConfig; +use tikv_util::sys::{disk, path_in_diff_mount_point}; + +/// This is the common layer of TiKV-like servers. By holding it in its own +/// TikvServer implementation, one can easily access the common ability of a +/// TiKV server. +pub struct TikvServerCore { + pub config: TikvConfig, + pub store_path: PathBuf, + pub lock_files: Vec, + pub encryption_key_manager: Option>, + pub flow_info_sender: Option>, + pub flow_info_receiver: Option>, +} + +impl TikvServerCore { + pub fn check_conflict_addr(&mut self) { + let cur_addr: SocketAddr = self + .config + .server + .addr + .parse() + .expect("failed to parse into a socket address"); + let cur_ip = cur_addr.ip(); + let cur_port = cur_addr.port(); + let lock_dir = get_lock_dir(); + + let search_base = env::temp_dir().join(lock_dir); + file_system::create_dir_all(&search_base) + .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); + + for entry in file_system::read_dir(&search_base).unwrap().flatten() { + if !entry.file_type().unwrap().is_file() { + continue; + } + let file_path = entry.path(); + let file_name = file_path.file_name().unwrap().to_str().unwrap(); + if let Ok(addr) = file_name.replace('_', ":").parse::() { + let ip = addr.ip(); + let port = addr.port(); + if cur_port == port + && (cur_ip == ip || cur_ip.is_unspecified() || ip.is_unspecified()) + { + let _ = try_lock_conflict_addr(file_path); + } + } + } + + let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); + let cur_file = try_lock_conflict_addr(cur_path); + self.lock_files.push(cur_file); + } + + pub fn init_fs(&mut self) { + let lock_path = self.store_path.join(Path::new("LOCK")); + + let f = 
File::create(lock_path.as_path()) + .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); + if f.try_lock_exclusive().is_err() { + fatal!( + "lock {} failed, maybe another instance is using this directory.", + self.store_path.display() + ); + } + self.lock_files.push(f); + + if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { + fatal!( + "panic_mark_file {} exists, there must be something wrong with the db. \ + Do not remove the panic_mark_file and force the TiKV node to restart. \ + Please contact TiKV maintainers to investigate the issue. \ + If needed, use scale in and scale out to replace the TiKV node. \ + https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", + tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() + ); + } + + // Allocate a big file to make sure that TiKV have enough space to + // recover from disk full errors. This file is created in data_dir rather than + // db_path, because we must not increase store size of db_path. + fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. 
+ if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_stats.total_space(); + if self.config.raft_store.capacity.0 > 0 { + capacity = cmp::min(capacity, self.config.raft_store.capacity.0); + } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + disk_stats.available_space(), + kv_reserved_size, + ); + + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir + } else { + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); + } + } + + pub fn init_yatp(&self) { + yatp::metrics::set_namespace(Some("tikv")); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + 
prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); + } + + pub fn init_encryption(&mut self) { + self.encryption_key_manager = data_key_manager_from_config( + &self.config.security.encryption, + &self.config.storage.data_dir, + ) + .map_err(|e| { + panic!( + "Encryption failed to initialize: {}. code: {}", + e, + e.error_code() + ) + }) + .unwrap() + .map(Arc::new); + } + + pub fn init_io_utility(&mut self) -> BytesFetcher { + let stats_collector_enabled = file_system::init_io_stats_collector() + .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) + .is_ok(); + + let limiter = Arc::new( + self.config + .storage + .io_rate_limit + .build(!stats_collector_enabled /* enable_statistics */), + ); + let fetcher = if stats_collector_enabled { + BytesFetcher::FromIoStatsCollector() + } else { + BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) + }; + // Set up IO limiter even when rate limit is disabled, so that rate limits can + // be dynamically applied later on. 
+ set_io_rate_limiter(Some(limiter)); + fetcher + } + + pub fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { + let (tx, rx) = mpsc::channel(); + self.flow_info_sender = Some(tx.clone()); + self.flow_info_receiver = Some(rx); + engine_rocks::FlowListener::new(tx) + } +} + +#[cfg(unix)] +fn get_lock_dir() -> String { + format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) +} + +#[cfg(not(unix))] +fn get_lock_dir() -> String { + "TIKV_LOCK_FILES".to_owned() +} + +fn try_lock_conflict_addr>(path: P) -> File { + let f = File::create(path.as_ref()).unwrap_or_else(|e| { + fatal!( + "failed to create lock at {}: {}", + path.as_ref().display(), + e + ) + }); + + if f.try_lock_exclusive().is_err() { + fatal!( + "{} already in use, maybe another instance is binding with this address.", + path.as_ref().file_name().unwrap().to_str().unwrap() + ); + } + f +} diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index d5c8e352a88..144cc1885d5 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -9,6 +9,7 @@ extern crate tikv_util; #[macro_use] pub mod setup; +pub mod common; pub mod memory; pub mod raft_engine_switch; pub mod server; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 06df19da1d6..35fc96a3460 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -15,8 +15,7 @@ use std::{ cmp, collections::HashMap, convert::TryFrom, - env, fmt, - net::SocketAddr, + fmt, path::{Path, PathBuf}, str::FromStr, sync::{ @@ -34,11 +33,11 @@ use backup_stream::{ use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; -use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use encryption_export::DataKeyManager; use engine_rocks::{ flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, - FlowInfo, RocksEngine, RocksStatistics, + RocksEngine, RocksStatistics, }; 
use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ @@ -46,10 +45,8 @@ use engine_traits::{ RaftEngine, SingletonFactory, StatisticsReporter, TabletContext, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, }; -use error_code::ErrorCodeExt; use file_system::{ - get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor, - MetricsManager as IoMetricsManager, + get_io_rate_limiter, BytesFetcher, IoBudgetAdjustor, MetricsManager as IoMetricsManager, }; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; @@ -137,7 +134,7 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - memory::*, raft_engine_switch::*, setup::*, signal_handler, + common::TikvServerCore, memory::*, raft_engine_switch::*, setup::*, signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, }; @@ -159,16 +156,16 @@ fn run_impl(config: TikvConfig) { let mut tikv = TikvServer::::init::(config); // Must be called after `TikvServer::init`. 
- let memory_limit = tikv.config.memory_usage_limit.unwrap().0; - let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; + let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; register_memory_usage_high_water(high_water); - tikv.check_conflict_addr(); - tikv.init_fs(); - tikv.init_yatp(); - tikv.init_encryption(); - let fetcher = tikv.init_io_utility(); - let listener = tikv.init_flow_receiver(); + tikv.core.check_conflict_addr(); + tikv.core.init_fs(); + tikv.core.init_yatp(); + tikv.core.init_encryption(); + let fetcher = tikv.core.init_io_utility(); + let listener = tikv.core.init_flow_receiver(); let (engines, engines_info) = tikv.init_raw_engines(listener); tikv.init_engines(engines.clone()); let server_config = tikv.init_servers::(); @@ -227,18 +224,14 @@ const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. struct TikvServer { - config: TikvConfig, + core: TikvServerCore, cfg_controller: Option, security_mgr: Arc, pd_client: Arc, router: RaftRouter, - flow_info_sender: Option>, - flow_info_receiver: Option>, system: Option>, resolver: Option, - store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. 
- encryption_key_manager: Option>, engines: Option>, kv_statistics: Option>, raft_statistics: Option>, @@ -246,7 +239,6 @@ struct TikvServer { region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, to_stop: Vec>, - lock_files: Vec, concurrency_manager: ConcurrencyManager, env: Arc, background_worker: Worker, @@ -401,16 +393,21 @@ where let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); TikvServer { - config, + core: TikvServerCore { + config, + store_path, + lock_files: vec![], + encryption_key_manager: None, + flow_info_sender: None, + flow_info_receiver: None, + }, cfg_controller: Some(cfg_controller), security_mgr, pd_client, router, system: Some(system), resolver: None, - store_path, snap_mgr: None, - encryption_key_manager: None, engines: None, kv_statistics: None, raft_statistics: None, @@ -418,13 +415,10 @@ where region_info_accessor, coprocessor_host, to_stop: vec![], - lock_files: vec![], concurrency_manager, env, background_worker, check_leader_worker, - flow_info_sender: None, - flow_info_receiver: None, sst_worker: None, quota_limiter, resource_manager, @@ -505,166 +499,6 @@ where pd_client } - fn check_conflict_addr(&mut self) { - let cur_addr: SocketAddr = self - .config - .server - .addr - .parse() - .expect("failed to parse into a socket address"); - let cur_ip = cur_addr.ip(); - let cur_port = cur_addr.port(); - let lock_dir = get_lock_dir(); - - let search_base = env::temp_dir().join(lock_dir); - file_system::create_dir_all(&search_base) - .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); - - for entry in file_system::read_dir(&search_base).unwrap().flatten() { - if !entry.file_type().unwrap().is_file() { - continue; - } - let file_path = entry.path(); - let file_name = file_path.file_name().unwrap().to_str().unwrap(); - if let Ok(addr) = file_name.replace('_', ":").parse::() { - let ip = addr.ip(); - let port = addr.port(); - if cur_port == port - && (cur_ip == ip || 
cur_ip.is_unspecified() || ip.is_unspecified()) - { - let _ = try_lock_conflict_addr(file_path); - } - } - } - - let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); - let cur_file = try_lock_conflict_addr(cur_path); - self.lock_files.push(cur_file); - } - - fn init_fs(&mut self) { - let lock_path = self.store_path.join(Path::new("LOCK")); - - let f = File::create(lock_path.as_path()) - .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); - if f.try_lock_exclusive().is_err() { - fatal!( - "lock {} failed, maybe another instance is using this directory.", - self.store_path.display() - ); - } - self.lock_files.push(f); - - if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { - fatal!( - "panic_mark_file {} exists, there must be something wrong with the db. \ - Do not remove the panic_mark_file and force the TiKV node to restart. \ - Please contact TiKV maintainers to investigate the issue. \ - If needed, use scale in and scale out to replace the TiKV node. \ - https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", - tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() - ); - } - - // We truncate a big file to make sure that both raftdb and kvdb of TiKV have - // enough space to do compaction and region migration when TiKV recover. - // This file is created in data_dir rather than db_path, because we must not - // increase store size of db_path. 
- fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { - let mut reserved_size = reserved_size_from_config; - if reserved_size_from_config != 0 { - reserved_size = - cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); - } - reserved_size - } - fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { - let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(path) { - warn!("failed to remove space holder on starting: {}", e); - } - - // place holder file size is 20% of total reserved space. - if available > reserved_size { - file_system::reserve_space_for_recover(data_dir, reserved_size / 5) - .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) - .unwrap(); - } else { - warn!("no enough disk space left to create the place holder file"); - } - } - - let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); - if self.config.raft_store.capacity.0 > 0 { - capacity = cmp::min(capacity, self.config.raft_store.capacity.0); - } - // reserve space for kv engine - let kv_reserved_size = - calculate_reserved_space(capacity, self.config.storage.reserve_space.0); - disk::set_disk_reserved_space(kv_reserved_size); - reserve_physical_space( - &self.config.storage.data_dir, - disk_stats.available_space(), - kv_reserved_size, - ); - - let raft_data_dir = if self.config.raft_engine.enable { - self.config.raft_engine.config().dir - } else { - self.config.raft_store.raftdb_path.clone() - }; - - let separated_raft_mount_path = - path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); - if separated_raft_mount_path { - let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); - // reserve space for raft engine if raft engine is deployed separately - let raft_reserved_size = calculate_reserved_space( - raft_disk_stats.total_space(), - 
self.config.storage.reserve_raft_space.0, - ); - disk::set_raft_disk_reserved_space(raft_reserved_size); - reserve_physical_space( - &raft_data_dir, - raft_disk_stats.available_space(), - raft_reserved_size, - ); - } - } - - fn init_yatp(&self) { - yatp::metrics::set_namespace(Some("tikv")); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); - } - - fn init_encryption(&mut self) { - self.encryption_key_manager = data_key_manager_from_config( - &self.config.security.encryption, - &self.config.storage.data_dir, - ) - .map_err(|e| { - panic!( - "Encryption failed to initialize: {}. code: {}", - e, - e.error_code() - ) - }) - .unwrap() - .map(Arc::new); - } - - fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { - let (tx, rx) = mpsc::channel(); - self.flow_info_sender = Some(tx.clone()); - self.flow_info_receiver = Some(rx); - engine_rocks::FlowListener::new(tx) - } - fn init_engines(&mut self, engines: Engines) { let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); let engine = RaftKv::new( @@ -693,8 +527,8 @@ where let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), - self.flow_info_sender.take().unwrap(), - self.config.gc.clone(), + self.core.flow_info_sender.take().unwrap(), + self.core.config.gc.clone(), self.pd_client.feature_gate().clone(), Arc::new(self.region_info_accessor.clone()), ); @@ -710,9 +544,9 @@ where fn init_servers(&mut self) -> Arc> { let flow_controller = Arc::new(FlowController::Singleton(EngineFlowController::new( - &self.config.storage.flow_control, + 
&self.core.config.storage.flow_control, self.engines.as_ref().unwrap().engine.kv_engine().unwrap(), - self.flow_info_receiver.take().unwrap(), + self.core.flow_info_receiver.take().unwrap(), ))); let mut gc_worker = self.init_gc_worker(); let mut ttl_checker = Box::new(LazyWorker::new("ttl-checker")); @@ -740,7 +574,7 @@ where .engine .set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); - let lock_mgr = LockManager::new(&self.config.pessimistic_txn); + let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( tikv::config::Module::PessimisticTxn, Box::new(lock_mgr.config_manager()), @@ -756,19 +590,23 @@ where let sst_runner = RecoveryRunner::new( engines.engines.kv.clone(), engines.store_meta.clone(), - self.config.storage.background_error_recovery_window.into(), + self.core + .config + .storage + .background_error_recovery_window + .into(), DEFAULT_CHECK_INTERVAL, ); sst_worker.start_with_timer(sst_runner); } - let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let unified_read_pool = if self.core.config.readpool.is_unified_pool_enabled() { let resource_ctl = self .resource_manager .as_ref() .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( - &self.config.readpool.unified, + &self.core.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), resource_ctl, @@ -804,16 +642,18 @@ where // Start resource metering. 
let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = - resource_metering::init_recorder(self.config.resource_metering.precision.as_millis()); + resource_metering::init_recorder( + self.core.config.resource_metering.precision.as_millis(), + ); self.to_stop.push(recorder_worker); let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), collector_reg_handle.clone(), ); self.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( - self.config.resource_metering.receiver_address.clone(), + self.core.config.resource_metering.receiver_address.clone(), self.env.clone(), data_sink_reg_handle.clone(), ); @@ -821,7 +661,7 @@ where let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), recorder_notifier, reporter_notifier, address_change_notifier, @@ -831,11 +671,11 @@ where Box::new(cfg_manager), ); - let storage_read_pool_handle = if self.config.readpool.storage.use_unified_pool() { + let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let storage_read_pools = ReadPool::from(storage::build_read_pool( - &self.config.readpool.storage, + &self.core.config.readpool.storage, pd_sender.clone(), engines.engine.clone(), )); @@ -844,7 +684,7 @@ where let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), - &self.config.storage, + &self.core.config.storage, storage_read_pool_handle, lock_mgr.clone(), self.concurrency_manager.clone(), @@ -882,20 +722,21 @@ where // Create snapshot manager, server. 
let snap_path = self + .core .store_path .join(Path::new("snap")) .to_str() .unwrap() .to_owned(); - let bps = i64::try_from(self.config.server.snap_io_max_bytes_per_sec.0) + let bps = i64::try_from(self.core.config.server.snap_io_max_bytes_per_sec.0) .unwrap_or_else(|_| fatal!("snap_io_max_bytes_per_sec > i64::max_value")); let snap_mgr = SnapManagerBuilder::default() .max_write_bytes_per_sec(bps) - .max_total_size(self.config.server.snap_max_total_size.0) - .encryption_key_manager(self.encryption_key_manager.clone()) - .max_per_file_size(self.config.raft_store.max_snapshot_file_raw_size.0) + .max_total_size(self.core.config.server.snap_max_total_size.0) + .encryption_key_manager(self.core.encryption_key_manager.clone()) + .max_per_file_size(self.core.config.raft_store.max_snapshot_file_raw_size.0) .enable_multi_snapshot_files( self.pd_client .feature_gate() @@ -904,11 +745,11 @@ where .build(snap_path); // Create coprocessor endpoint. - let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { + let cop_read_pool_handle = if self.core.config.readpool.coprocessor.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( - &self.config.readpool.coprocessor, + &self.core.config.readpool.coprocessor, pd_sender, engines.engine.clone(), )); @@ -916,7 +757,7 @@ where }; let mut unified_read_pool_scale_receiver = None; - if self.config.readpool.is_unified_pool_enabled() { + if self.core.config.readpool.is_unified_pool_enabled() { let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); cfg_controller.register( tikv::config::Module::Readpool, @@ -924,8 +765,8 @@ where unified_read_pool.as_ref().unwrap().handle(), unified_read_pool_scale_notifier, &self.background_worker, - self.config.readpool.unified.max_thread_count, - self.config.readpool.unified.auto_adjust_pool_size, + self.core.config.readpool.unified.max_thread_count, + 
self.core.config.readpool.unified.auto_adjust_pool_size, )), ); unified_read_pool_scale_receiver = Some(rx); @@ -941,7 +782,7 @@ where ); // Create resolved ts worker - let rts_worker = if self.config.resolved_ts.enable { + let rts_worker = if self.core.config.resolved_ts.enable { let worker = Box::new(LazyWorker::new("resolved-ts")); // Register the resolved ts observer let resolved_ts_ob = resolved_ts::Observer::new(worker.scheduler()); @@ -966,23 +807,24 @@ where .check_leader_worker .start("check-leader", check_leader_runner); - let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); + let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); - self.config + self.core + .config .raft_store .validate( - self.config.coprocessor.region_split_size(), - self.config.coprocessor.enable_region_bucket(), - self.config.coprocessor.region_bucket_size, + self.core.config.coprocessor.region_split_size(), + self.core.config.coprocessor.enable_region_bucket(), + self.core.config.coprocessor.region_bucket_size, ) .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); - let raft_store = Arc::new(VersionTrack::new(self.config.raft_store.clone())); + let raft_store = Arc::new(VersionTrack::new(self.core.config.raft_store.clone())); let health_service = HealthService::default(); let mut node = Node::new( self.system.take().unwrap(), &server_config.value().clone(), raft_store.clone(), - self.config.storage.api_version(), + self.core.config.storage.api_version(), self.pd_client.clone(), state, self.background_worker.clone(), @@ -1006,7 +848,7 @@ where resource_tag_factory, Arc::clone(&self.quota_limiter), ), - coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), + coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Left(snap_mgr.clone()), gc_worker.clone(), @@ -1027,7 +869,7 @@ where ); // Start backup stream - let backup_stream_scheduler = if 
self.config.log_backup.enable { + let backup_stream_scheduler = if self.core.config.log_backup.enable { // Create backup stream. let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); let backup_stream_scheduler = backup_stream_worker.scheduler(); @@ -1040,7 +882,7 @@ where tikv::config::Module::BackupStream, Box::new(BackupStreamConfigManager::new( backup_stream_worker.scheduler(), - self.config.log_backup.clone(), + self.core.config.log_backup.clone(), )), ); @@ -1050,7 +892,7 @@ where Arc::clone(&self.pd_client), pd_client::meta_storage::Source::LogBackup, ))), - self.config.log_backup.clone(), + self.core.config.log_backup.clone(), backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), @@ -1073,22 +915,30 @@ where None }; - let import_path = self.store_path.join("import"); + let import_path = self.core.store_path.join("import"); let mut importer = SstImporter::new( - &self.config.import, + &self.core.config.import, import_path, - self.encryption_key_manager.clone(), - self.config.storage.api_version(), + self.core.encryption_key_manager.clone(), + self.core.config.storage.api_version(), ) .unwrap(); for (cf_name, compression_type) in &[ ( CF_DEFAULT, - self.config.rocksdb.defaultcf.bottommost_level_compression, + self.core + .config + .rocksdb + .defaultcf + .bottommost_level_compression, ), ( CF_WRITE, - self.config.rocksdb.writecf.bottommost_level_compression, + self.core + .config + .rocksdb + .writecf + .bottommost_level_compression, ), ] { importer.set_compression_type(cf_name, from_rocks_compression_type(*compression_type)); @@ -1109,7 +959,7 @@ where ); let split_config_manager = - SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); + SplitConfigManager::new(Arc::new(VersionTrack::new(self.core.config.split.clone()))); cfg_controller.register( tikv::config::Module::Split, Box::new(split_config_manager.clone()), @@ -1117,14 +967,14 @@ where let auto_split_controller = 
AutoSplitController::new( split_config_manager, - self.config.server.grpc_concurrency, - self.config.readpool.unified.max_thread_count, + self.core.config.server.grpc_concurrency, + self.core.config.readpool.unified.max_thread_count, unified_read_pool_scale_receiver, ); // `ConsistencyCheckObserver` must be registered before `Node::start`. let safe_point = Arc::new(AtomicU64::new(0)); - let observer = match self.config.coprocessor.consistency_check_method { + let observer = match self.core.config.coprocessor.consistency_check_method { ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( MvccConsistencyCheckObserver::new(safe_point.clone()), ), @@ -1169,22 +1019,22 @@ where fatal!("failed to start auto_gc on storage, error: {}", e); } - initial_metric(&self.config.metric); - if self.config.storage.enable_ttl { + initial_metric(&self.core.config.metric); + if self.core.config.storage.enable_ttl { ttl_checker.start_with_timer(TtlChecker::new( self.engines.as_ref().unwrap().engine.kv_engine().unwrap(), self.region_info_accessor.clone(), - self.config.storage.ttl_check_poll_interval.into(), + self.core.config.storage.ttl_check_poll_interval.into(), )); self.to_stop.push(ttl_checker); } // Start CDC. 
- let cdc_memory_quota = MemoryQuota::new(self.config.cdc.sink_memory_quota.0 as _); + let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); let cdc_endpoint = cdc::Endpoint::new( - self.config.server.cluster_id, - &self.config.cdc, - self.config.storage.api_version(), + self.core.config.server.cluster_id, + &self.core.config.cdc, + self.core.config.storage.api_version(), self.pd_client.clone(), cdc_scheduler.clone(), self.router.clone(), @@ -1203,7 +1053,7 @@ where // Start resolved ts if let Some(mut rts_worker) = rts_worker { let rts_endpoint = resolved_ts::Endpoint::new( - &self.config.resolved_ts, + &self.core.config.resolved_ts, rts_worker.scheduler(), self.router.clone(), engines.store_meta.clone(), @@ -1244,8 +1094,8 @@ where // Import SST service. let import_service = ImportSstService::new( - self.config.import.clone(), - self.config.raft_store.raft_entry_max_size, + self.core.config.import.clone(), + self.core.config.raft_store.raft_entry_max_size, engines.engine.clone(), LocalTablets::Singleton(engines.engines.kv.clone()), servers.importer.clone(), @@ -1285,8 +1135,8 @@ where // Create Diagnostics service let diag_service = DiagnosticsService::new( servers.server.get_debug_thread_pool().clone(), - self.config.log.file.filename.clone(), - self.config.slow_log_file.clone(), + self.core.config.log.file.filename.clone(), + self.core.config.slow_log_file.clone(), ); if servers .server @@ -1312,7 +1162,7 @@ where self.pd_client.clone(), self.resolver.clone().unwrap(), self.security_mgr.clone(), - &self.config.pessimistic_txn, + &self.core.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); @@ -1334,9 +1184,9 @@ where engines.engine.clone(), self.region_info_accessor.clone(), LocalTablets::Singleton(engines.engines.kv.clone()), - self.config.backup.clone(), + self.core.config.backup.clone(), self.concurrency_manager.clone(), - self.config.storage.api_version(), + 
self.core.config.storage.api_version(), self.causal_ts_provider.clone(), ); self.cfg_controller.as_mut().unwrap().register( @@ -1392,28 +1242,6 @@ where } } - fn init_io_utility(&mut self) -> BytesFetcher { - let stats_collector_enabled = file_system::init_io_stats_collector() - .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) - .is_ok(); - - let limiter = Arc::new( - self.config - .storage - .io_rate_limit - .build(!stats_collector_enabled /* enable_statistics */), - ); - let fetcher = if stats_collector_enabled { - BytesFetcher::FromIoStatsCollector() - } else { - BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) - }; - // Set up IO limiter even when rate limit is disabled, so that rate limits can - // be dynamically applied later on. - set_io_rate_limiter(Some(limiter)); - fetcher - } - fn init_metrics_flusher( &mut self, fetcher: BytesFetcher, @@ -1422,7 +1250,7 @@ where let mut engine_metrics = EngineMetricsManager::::new( self.tablet_registry.clone().unwrap(), self.kv_statistics.clone(), - self.config.rocksdb.titan.enabled, + self.core.config.rocksdb.titan.enabled, self.engines.as_ref().unwrap().engines.raft.clone(), self.raft_statistics.clone(), ); @@ -1538,9 +1366,9 @@ where } fn init_storage_stats_task(&self, engines: Engines) { - let config_disk_capacity: u64 = self.config.raft_store.capacity.0; - let data_dir = self.config.storage.data_dir.clone(); - let store_path = self.store_path.clone(); + let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; + let data_dir = self.core.config.storage.data_dir.clone(); + let store_path = self.core.store_path.clone(); let snap_mgr = self.snap_mgr.clone().unwrap(); let reserve_space = disk::get_disk_reserved_space(); let reserve_raft_space = disk::get_raft_disk_reserved_space(); @@ -1669,6 +1497,7 @@ where fn init_sst_recovery_sender(&mut self) -> Option> { if !self + .core .config .storage .background_error_recovery_window @@ -1697,14 +1526,14 @@ where fn 
run_status_server(&mut self) { // Create a status server. - let status_enabled = !self.config.server.status_addr.is_empty(); + let status_enabled = !self.core.config.server.status_addr.is_empty(); if status_enabled { let mut status_server = match StatusServer::new( - self.config.server.status_thread_pool_size, + self.core.config.server.status_thread_pool_size, self.cfg_controller.take().unwrap(), - Arc::new(self.config.security.clone()), + Arc::new(self.core.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), - self.store_path.clone(), + self.core.store_path.clone(), self.resource_manager.clone(), ) { Ok(status_server) => Box::new(status_server), @@ -1714,7 +1543,7 @@ where } }; // Start the status server. - if let Err(e) = status_server.start(self.config.server.status_addr.clone()) { + if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { error_unknown!(%e; "failed to bind addr for status service"); } else { self.to_stop.push(status_server); @@ -1859,26 +1688,31 @@ impl TikvServer { flow_listener: engine_rocks::FlowListener, ) -> (Engines, Arc) { let block_cache = self + .core .config .storage .block_cache - .build_shared_cache(self.config.storage.engine); + .build_shared_cache(self.core.config.storage.engine); let env = self + .core .config - .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) + .build_shared_rocks_env( + self.core.encryption_key_manager.clone(), + get_io_rate_limiter(), + ) .unwrap(); // Create raft engine let (raft_engine, raft_statistics) = CER::build( - &self.config, + &self.core.config, &env, - &self.encryption_key_manager, + &self.core.encryption_key_manager, &block_cache, ); self.raft_statistics = raft_statistics; // Create kv engine. 
- let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) + let builder = KvEngineFactoryBuilder::new(env, &self.core.config, block_cache) .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(self.router.clone()), })) @@ -1887,7 +1721,7 @@ impl TikvServer { .flow_listener(flow_listener); let factory = Box::new(builder.build()); let kv_engine = factory - .create_shared_db(&self.store_path) + .create_shared_db(&self.core.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); self.kv_statistics = Some(factory.rocks_statistics()); let engines = Engines::new(kv_engine.clone(), raft_engine); @@ -1897,8 +1731,11 @@ impl TikvServer { tikv::config::Module::Rocksdb, Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), ); - let reg = TabletRegistry::new(Box::new(SingletonFactory::new(kv_engine)), &self.store_path) - .unwrap(); + let reg = TabletRegistry::new( + Box::new(SingletonFactory::new(kv_engine)), + &self.core.store_path, + ) + .unwrap(); // It always use the singleton kv_engine, use arbitrary id and suffix. let ctx = TabletContext::with_infinite_region(0, Some(0)); reg.load(ctx, false).unwrap(); @@ -1977,34 +1814,6 @@ fn check_system_config(config: &TikvConfig) { } } -fn try_lock_conflict_addr>(path: P) -> File { - let f = File::create(path.as_ref()).unwrap_or_else(|e| { - fatal!( - "failed to create lock at {}: {}", - path.as_ref().display(), - e - ) - }); - - if f.try_lock_exclusive().is_err() { - fatal!( - "{} already in use, maybe another instance is binding with this address.", - path.as_ref().file_name().unwrap().to_str().unwrap() - ); - } - f -} - -#[cfg(unix)] -fn get_lock_dir() -> String { - format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) -} - -#[cfg(not(unix))] -fn get_lock_dir() -> String { - "TIKV_LOCK_FILES".to_owned() -} - /// A small trait for components which can be trivially stopped. 
Lets us keep /// a list of these in `TiKV`, rather than storing each component individually. pub(crate) trait Stop { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 6c96ce62ffb..8bc898d50b4 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -14,8 +14,6 @@ use std::{ cmp, collections::HashMap, - env, - net::SocketAddr, path::{Path, PathBuf}, str::FromStr, sync::{ @@ -29,20 +27,18 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; -use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use encryption_export::DataKeyManager; use engine_rocks::{ flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, - FlowInfo, RocksEngine, RocksStatistics, + RocksEngine, RocksStatistics, }; use engine_traits::{ CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, }; -use error_code::ErrorCodeExt; use file_system::{ - get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor, - MetricsManager as IoMetricsManager, + get_io_rate_limiter, BytesFetcher, IoBudgetAdjustor, MetricsManager as IoMetricsManager, }; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; @@ -117,8 +113,8 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - memory::*, raft_engine_switch::*, server::Stop, setup::*, signal_handler, - tikv_util::sys::thread::ThreadBuildWrapper, + common::TikvServerCore, memory::*, raft_engine_switch::*, server::Stop, setup::*, + signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, }; // minimum number of core kept for background requests @@ -139,16 +135,16 @@ fn run_impl(config: TikvConfig) { let mut tikv = TikvServer::::init::(config); // Must be called after `TikvServer::init`. 
- let memory_limit = tikv.config.memory_usage_limit.unwrap().0; - let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; + let memory_limit = tikv.core.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.core.config.memory_usage_high_water * memory_limit as f64) as u64; register_memory_usage_high_water(high_water); - tikv.check_conflict_addr(); - tikv.init_fs(); - tikv.init_yatp(); - tikv.init_encryption(); - let fetcher = tikv.init_io_utility(); - let listener = tikv.init_flow_receiver(); + tikv.core.check_conflict_addr(); + tikv.core.init_fs(); + tikv.core.init_yatp(); + tikv.core.init_encryption(); + let fetcher = tikv.core.init_io_utility(); + let listener = tikv.core.init_flow_receiver(); let engines_info = tikv.init_engines(listener); let server_config = tikv.init_servers::(); tikv.register_services(); @@ -207,18 +203,14 @@ const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. struct TikvServer { - config: TikvConfig, + core: TikvServerCore, cfg_controller: Option, security_mgr: Arc, pd_client: Arc, - flow_info_sender: Option>, - flow_info_receiver: Option>, router: Option>, node: Option>, resolver: Option, - store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. 
- encryption_key_manager: Option>, engines: Option>, kv_statistics: Option>, raft_statistics: Option>, @@ -226,7 +218,6 @@ struct TikvServer { region_info_accessor: Option, coprocessor_host: Option>, to_stop: Vec>, - lock_files: Vec, concurrency_manager: ConcurrencyManager, env: Arc, background_worker: Worker, @@ -341,16 +332,21 @@ where let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); TikvServer { - config, + core: TikvServerCore { + config, + store_path, + lock_files: vec![], + encryption_key_manager: None, + flow_info_sender: None, + flow_info_receiver: None, + }, cfg_controller: Some(cfg_controller), security_mgr, pd_client, router: None, node: None, resolver: None, - store_path, snap_mgr: None, - encryption_key_manager: None, engines: None, kv_statistics: None, raft_statistics: None, @@ -358,13 +354,10 @@ where region_info_accessor: None, coprocessor_host: None, to_stop: vec![], - lock_files: vec![], concurrency_manager, env, background_worker, check_leader_worker, - flow_info_sender: None, - flow_info_receiver: None, sst_worker: None, quota_limiter, resource_manager, @@ -444,172 +437,12 @@ where pd_client } - fn check_conflict_addr(&mut self) { - let cur_addr: SocketAddr = self - .config - .server - .addr - .parse() - .expect("failed to parse into a socket address"); - let cur_ip = cur_addr.ip(); - let cur_port = cur_addr.port(); - let lock_dir = get_lock_dir(); - - let search_base = env::temp_dir().join(lock_dir); - file_system::create_dir_all(&search_base) - .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); - - for entry in file_system::read_dir(&search_base).unwrap().flatten() { - if !entry.file_type().unwrap().is_file() { - continue; - } - let file_path = entry.path(); - let file_name = file_path.file_name().unwrap().to_str().unwrap(); - if let Ok(addr) = file_name.replace('_', ":").parse::() { - let ip = addr.ip(); - let port = addr.port(); - if cur_port == port - && (cur_ip == ip || 
cur_ip.is_unspecified() || ip.is_unspecified()) - { - let _ = try_lock_conflict_addr(file_path); - } - } - } - - let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); - let cur_file = try_lock_conflict_addr(cur_path); - self.lock_files.push(cur_file); - } - - fn init_fs(&mut self) { - let lock_path = self.store_path.join(Path::new("LOCK")); - - let f = File::create(lock_path.as_path()) - .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); - if f.try_lock_exclusive().is_err() { - fatal!( - "lock {} failed, maybe another instance is using this directory.", - self.store_path.display() - ); - } - self.lock_files.push(f); - - if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { - fatal!( - "panic_mark_file {} exists, there must be something wrong with the db. \ - Do not remove the panic_mark_file and force the TiKV node to restart. \ - Please contact TiKV maintainers to investigate the issue. \ - If needed, use scale in and scale out to replace the TiKV node. \ - https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", - tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() - ); - } - - // We truncate a big file to make sure that both raftdb and kvdb of TiKV have - // enough space to do compaction and region migration when TiKV recover. - // This file is created in data_dir rather than db_path, because we must not - // increase store size of db_path. 
- fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { - let mut reserved_size = reserved_size_from_config; - if reserved_size_from_config != 0 { - reserved_size = - cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); - } - reserved_size - } - fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { - let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(path) { - warn!("failed to remove space holder on starting: {}", e); - } - - // place holder file size is 20% of total reserved space. - if available > reserved_size { - file_system::reserve_space_for_recover(data_dir, reserved_size / 5) - .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) - .unwrap(); - } else { - warn!("no enough disk space left to create the place holder file"); - } - } - - let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); - let mut capacity = disk_stats.total_space(); - if self.config.raft_store.capacity.0 > 0 { - capacity = cmp::min(capacity, self.config.raft_store.capacity.0); - } - // reserve space for kv engine - let kv_reserved_size = - calculate_reserved_space(capacity, self.config.storage.reserve_space.0); - disk::set_disk_reserved_space(kv_reserved_size); - reserve_physical_space( - &self.config.storage.data_dir, - disk_stats.available_space(), - kv_reserved_size, - ); - - let raft_data_dir = if self.config.raft_engine.enable { - self.config.raft_engine.config().dir - } else { - self.config.raft_store.raftdb_path.clone() - }; - - let separated_raft_mount_path = - path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); - if separated_raft_mount_path { - let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); - // reserve space for raft engine if raft engine is deployed separately - let raft_reserved_size = calculate_reserved_space( - raft_disk_stats.total_space(), - 
self.config.storage.reserve_raft_space.0, - ); - disk::set_raft_disk_reserved_space(raft_reserved_size); - reserve_physical_space( - &raft_data_dir, - raft_disk_stats.available_space(), - raft_reserved_size, - ); - } - } - - fn init_yatp(&self) { - yatp::metrics::set_namespace(Some("tikv")); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); - prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); - } - - fn init_encryption(&mut self) { - self.encryption_key_manager = data_key_manager_from_config( - &self.config.security.encryption, - &self.config.storage.data_dir, - ) - .map_err(|e| { - panic!( - "Encryption failed to initialize: {}. code: {}", - e, - e.error_code() - ) - }) - .unwrap() - .map(Arc::new); - } - - fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { - let (tx, rx) = mpsc::channel(); - self.flow_info_sender = Some(tx.clone()); - self.flow_info_receiver = Some(rx); - engine_rocks::FlowListener::new(tx) - } - fn init_gc_worker(&mut self) -> GcWorker> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), - self.flow_info_sender.take().unwrap(), - self.config.gc.clone(), + self.core.flow_info_sender.take().unwrap(), + self.core.config.gc.clone(), self.pd_client.feature_gate().clone(), Arc::new(self.region_info_accessor.clone().unwrap()), ); @@ -625,9 +458,9 @@ where fn init_servers(&mut self) -> Arc> { let flow_controller = Arc::new(FlowController::Tablet(TabletFlowController::new( - &self.config.storage.flow_control, + &self.core.config.storage.flow_control, self.tablet_registry.clone().unwrap(), - self.flow_info_receiver.take().unwrap(), + 
self.core.flow_info_receiver.take().unwrap(), ))); let mut gc_worker = self.init_gc_worker(); let ttl_checker = Box::new(LazyWorker::new("ttl-checker")); @@ -644,7 +477,7 @@ where cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); - let lock_mgr = LockManager::new(&self.config.pessimistic_txn); + let lock_mgr = LockManager::new(&self.core.config.pessimistic_txn); cfg_controller.register( tikv::config::Module::PessimisticTxn, Box::new(lock_mgr.config_manager()), @@ -659,13 +492,13 @@ where slog_global::borrow_global().new(slog::o!()), ); - let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let unified_read_pool = if self.core.config.readpool.is_unified_pool_enabled() { let resource_ctl = self .resource_manager .as_ref() .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( - &self.config.readpool.unified, + &self.core.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), resource_ctl, @@ -701,16 +534,18 @@ where // Start resource metering. 
let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = - resource_metering::init_recorder(self.config.resource_metering.precision.as_millis()); + resource_metering::init_recorder( + self.core.config.resource_metering.precision.as_millis(), + ); self.to_stop.push(recorder_worker); let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), collector_reg_handle.clone(), ); self.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( - self.config.resource_metering.receiver_address.clone(), + self.core.config.resource_metering.receiver_address.clone(), self.env.clone(), data_sink_reg_handle.clone(), ); @@ -718,7 +553,7 @@ where let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( - self.config.resource_metering.clone(), + self.core.config.resource_metering.clone(), recorder_notifier, reporter_notifier, address_change_notifier, @@ -728,11 +563,11 @@ where Box::new(cfg_manager), ); - let storage_read_pool_handle = if self.config.readpool.storage.use_unified_pool() { + let storage_read_pool_handle = if self.core.config.readpool.storage.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let storage_read_pools = ReadPool::from(storage::build_read_pool( - &self.config.readpool.storage, + &self.core.config.readpool.storage, pd_sender.clone(), engines.engine.clone(), )); @@ -741,7 +576,7 @@ where let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), - &self.config.storage, + &self.core.config.storage, storage_read_pool_handle, lock_mgr.clone(), self.concurrency_manager.clone(), @@ -779,6 +614,7 @@ where // Create snapshot manager, server. 
let snap_path = self + .core .store_path .join(Path::new("tablet_snap")) .to_str() @@ -791,11 +627,11 @@ where }; // Create coprocessor endpoint. - let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { + let cop_read_pool_handle = if self.core.config.readpool.coprocessor.use_unified_pool() { unified_read_pool.as_ref().unwrap().handle() } else { let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( - &self.config.readpool.coprocessor, + &self.core.config.readpool.coprocessor, pd_sender, engines.engine.clone(), )); @@ -803,7 +639,7 @@ where }; let mut unified_read_pool_scale_receiver = None; - if self.config.readpool.is_unified_pool_enabled() { + if self.core.config.readpool.is_unified_pool_enabled() { let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); cfg_controller.register( tikv::config::Module::Readpool, @@ -811,8 +647,8 @@ where unified_read_pool.as_ref().unwrap().handle(), unified_read_pool_scale_notifier, &self.background_worker, - self.config.readpool.unified.max_thread_count, - self.config.readpool.unified.auto_adjust_pool_size, + self.core.config.readpool.unified.max_thread_count, + self.core.config.readpool.unified.auto_adjust_pool_size, )), ); unified_read_pool_scale_receiver = Some(rx); @@ -826,17 +662,18 @@ where .check_leader_worker .start("check-leader", check_leader_runner); - let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); + let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); - self.config + self.core + .config .raft_store .validate( - self.config.coprocessor.region_split_size(), - self.config.coprocessor.enable_region_bucket(), - self.config.coprocessor.region_bucket_size, + self.core.config.coprocessor.region_split_size(), + self.core.config.coprocessor.enable_region_bucket(), + self.core.config.coprocessor.region_bucket_size, ) .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); - let 
raft_store = Arc::new(VersionTrack::new(self.config.raft_store.clone())); + let raft_store = Arc::new(VersionTrack::new(self.core.config.raft_store.clone())); let health_service = HealthService::default(); let node = self.node.as_ref().unwrap(); @@ -855,7 +692,7 @@ where resource_tag_factory, Arc::clone(&self.quota_limiter), ), - coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), + coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), Either::Right(snap_mgr.clone()), gc_worker.clone(), @@ -875,22 +712,30 @@ where )), ); - let import_path = self.store_path.join("import"); + let import_path = self.core.store_path.join("import"); let mut importer = SstImporter::new( - &self.config.import, + &self.core.config.import, import_path, - self.encryption_key_manager.clone(), - self.config.storage.api_version(), + self.core.encryption_key_manager.clone(), + self.core.config.storage.api_version(), ) .unwrap(); for (cf_name, compression_type) in &[ ( CF_DEFAULT, - self.config.rocksdb.defaultcf.bottommost_level_compression, + self.core + .config + .rocksdb + .defaultcf + .bottommost_level_compression, ), ( CF_WRITE, - self.config.rocksdb.writecf.bottommost_level_compression, + self.core + .config + .rocksdb + .writecf + .bottommost_level_compression, ), ] { importer.set_compression_type(cf_name, from_rocks_compression_type(*compression_type)); @@ -900,7 +745,7 @@ where // V2 starts split-check worker within raftstore. 
let split_config_manager = - SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); + SplitConfigManager::new(Arc::new(VersionTrack::new(self.core.config.split.clone()))); cfg_controller.register( tikv::config::Module::Split, Box::new(split_config_manager.clone()), @@ -908,14 +753,14 @@ where let auto_split_controller = AutoSplitController::new( split_config_manager, - self.config.server.grpc_concurrency, - self.config.readpool.unified.max_thread_count, + self.core.config.server.grpc_concurrency, + self.core.config.readpool.unified.max_thread_count, unified_read_pool_scale_receiver, ); // `ConsistencyCheckObserver` must be registered before `Node::start`. let safe_point = Arc::new(AtomicU64::new(0)); - let observer = match self.config.coprocessor.consistency_check_method { + let observer = match self.core.config.coprocessor.consistency_check_method { ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( MvccConsistencyCheckObserver::new(safe_point.clone()), ), @@ -966,7 +811,7 @@ where fatal!("failed to start auto_gc on storage, error: {}", e); } - initial_metric(&self.config.metric); + initial_metric(&self.core.config.metric); self.servers = Some(Servers { lock_mgr, @@ -999,9 +844,9 @@ where engines.engine.clone(), self.region_info_accessor.clone().unwrap(), LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), - self.config.backup.clone(), + self.core.config.backup.clone(), self.concurrency_manager.clone(), - self.config.storage.api_version(), + self.core.config.storage.api_version(), self.causal_ts_provider.clone(), ); self.cfg_controller.as_mut().unwrap().register( @@ -1012,8 +857,8 @@ where // Import SST service. 
let import_service = ImportSstService::new( - self.config.import.clone(), - self.config.raft_store.raft_entry_max_size, + self.core.config.import.clone(), + self.core.config.raft_store.raft_entry_max_size, engines.engine.clone(), LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), servers.importer.clone(), @@ -1036,8 +881,8 @@ where // Create Diagnostics service let diag_service = DiagnosticsService::new( servers.server.get_debug_thread_pool().clone(), - self.config.log.file.filename.clone(), - self.config.slow_log_file.clone(), + self.core.config.log.file.filename.clone(), + self.core.config.slow_log_file.clone(), ); if servers .server @@ -1063,7 +908,7 @@ where self.pd_client.clone(), self.resolver.clone().unwrap(), self.security_mgr.clone(), - &self.config.pessimistic_txn, + &self.core.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); @@ -1078,28 +923,6 @@ where } } - fn init_io_utility(&mut self) -> BytesFetcher { - let stats_collector_enabled = file_system::init_io_stats_collector() - .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) - .is_ok(); - - let limiter = Arc::new( - self.config - .storage - .io_rate_limit - .build(!stats_collector_enabled /* enable_statistics */), - ); - let fetcher = if stats_collector_enabled { - BytesFetcher::FromIoStatsCollector() - } else { - BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) - }; - // Set up IO limiter even when rate limit is disabled, so that rate limits can - // be dynamically applied later on. 
- set_io_rate_limiter(Some(limiter)); - fetcher - } - fn init_metrics_flusher( &mut self, fetcher: BytesFetcher, @@ -1108,7 +931,7 @@ where let mut engine_metrics = EngineMetricsManager::::new( self.tablet_registry.clone().unwrap(), self.kv_statistics.clone(), - self.config.rocksdb.titan.enabled, + self.core.config.rocksdb.titan.enabled, self.engines.as_ref().unwrap().raft_engine.clone(), self.raft_statistics.clone(), ); @@ -1224,9 +1047,9 @@ where } fn init_storage_stats_task(&self) { - let config_disk_capacity: u64 = self.config.raft_store.capacity.0; - let data_dir = self.config.storage.data_dir.clone(); - let store_path = self.store_path.clone(); + let config_disk_capacity: u64 = self.core.config.raft_store.capacity.0; + let data_dir = self.core.config.storage.data_dir.clone(); + let store_path = self.core.store_path.clone(); let snap_mgr = self.snap_mgr.clone().unwrap(); let reserve_space = disk::get_disk_reserved_space(); let reserve_raft_space = disk::get_raft_disk_reserved_space(); @@ -1359,6 +1182,7 @@ where fn init_sst_recovery_sender(&mut self) -> Option> { if !self + .core .config .storage .background_error_recovery_window @@ -1391,14 +1215,14 @@ where fn run_status_server(&mut self) { // Create a status server. - let status_enabled = !self.config.server.status_addr.is_empty(); + let status_enabled = !self.core.config.server.status_addr.is_empty(); if status_enabled { let mut status_server = match StatusServer::new( - self.config.server.status_thread_pool_size, + self.core.config.server.status_thread_pool_size, self.cfg_controller.take().unwrap(), - Arc::new(self.config.security.clone()), + Arc::new(self.core.config.security.clone()), self.engines.as_ref().unwrap().engine.raft_extension(), - self.store_path.clone(), + self.core.store_path.clone(), self.resource_manager.clone(), ) { Ok(status_server) => Box::new(status_server), @@ -1408,7 +1232,7 @@ where } }; // Start the status server. 
- if let Err(e) = status_server.start(self.config.server.status_addr.clone()) { + if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { error_unknown!(%e; "failed to bind addr for status service"); } else { self.to_stop.push(status_server); @@ -1553,31 +1377,36 @@ impl TikvServer { flow_listener: engine_rocks::FlowListener, ) -> Arc { let block_cache = self + .core .config .storage .block_cache - .build_shared_cache(self.config.storage.engine); + .build_shared_cache(self.core.config.storage.engine); let env = self + .core .config - .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) + .build_shared_rocks_env( + self.core.encryption_key_manager.clone(), + get_io_rate_limiter(), + ) .unwrap(); // Create raft engine let (raft_engine, raft_statistics) = CER::build( - &self.config, + &self.core.config, &env, - &self.encryption_key_manager, + &self.core.encryption_key_manager, &block_cache, ); self.raft_statistics = raft_statistics; // Create kv engine. 
- let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) + let builder = KvEngineFactoryBuilder::new(env, &self.core.config, block_cache) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - let mut node = NodeV2::new(&self.config.server, self.pd_client.clone(), None); - node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + let mut node = NodeV2::new(&self.core.config.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&self.core.config.raft_store, &raft_engine) .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); assert_ne!(node.id(), 0); @@ -1590,7 +1419,7 @@ impl TikvServer { ))); let factory = Box::new(builder.build()); self.kv_statistics = Some(factory.rocks_statistics()); - let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) + let registry = TabletRegistry::new(factory, self.core.store_path.join("tablets")) .unwrap_or_else(|e| fatal!("failed to create tablet registry {:?}", e)); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( @@ -1609,7 +1438,7 @@ impl TikvServer { let router = RaftRouter::new(node.id(), router); let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( router.store_router().clone(), - self.config.coprocessor.clone(), + self.core.config.coprocessor.clone(), ); let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); @@ -1690,34 +1519,6 @@ fn check_system_config(config: &TikvConfig) { } } -fn try_lock_conflict_addr>(path: P) -> File { - let f = File::create(path.as_ref()).unwrap_or_else(|e| { - fatal!( - "failed to create lock at {}: {}", - path.as_ref().display(), - e - ) - }); - - if f.try_lock_exclusive().is_err() { - fatal!( - "{} already in use, maybe another instance is binding with this address.", - path.as_ref().file_name().unwrap().to_str().unwrap() - ); - } - f -} - -#[cfg(unix)] -fn get_lock_dir() -> String { - format!("{}_TIKV_LOCK_FILES", unsafe { 
libc::getuid() }) -} - -#[cfg(not(unix))] -fn get_lock_dir() -> String { - "TIKV_LOCK_FILES".to_owned() -} - pub struct EngineMetricsManager { tablet_registry: TabletRegistry, kv_statistics: Option>, diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 0f352ebc5bf..307b399b29e 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -1218,7 +1218,7 @@ impl Cluster { pub fn apply_state(&self, region_id: u64, store_id: u64) -> RaftApplyState { self.get_engine(store_id) - .get_apply_state(region_id) + .raft_apply_state(region_id) .unwrap() .unwrap() } @@ -1568,13 +1568,15 @@ impl Cluster { pub fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { self.get_engine(store_id) - .get_region_state(region_id) + .region_local_state(region_id) .unwrap() .unwrap() } pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { - self.get_engine(store_id).get_raft_local_state(region_id) + self.get_engine(store_id) + .raft_local_state(region_id) + .unwrap() } pub fn raft_local_state(&self, region_id: u64, store_id: u64) -> RaftLocalState { @@ -1667,21 +1669,6 @@ impl WrapFactory { let region_id = self.region_id_of_key(key); self.tablet_registry.get(region_id)?.latest().cloned() } - - pub fn get_region_state( - &self, - region_id: u64, - ) -> engine_traits::Result> { - self.raft_engine.get_region_state(region_id, u64::MAX) - } - - pub fn get_apply_state(&self, region_id: u64) -> engine_traits::Result> { - self.raft_engine.get_apply_state(region_id, u64::MAX) - } - - pub fn get_raft_local_state(&self, region_id: u64) -> Option { - self.raft_engine.get_raft_state(region_id).unwrap() - } } impl Peekable for WrapFactory { @@ -1694,10 +1681,10 @@ impl Peekable for WrapFactory { ) -> engine_traits::Result> { let region_id = self.region_id_of_key(key); - if let Ok(Some(state)) = self.get_region_state(region_id) - && state.state == 
PeerState::Tombstone - { - return Ok(None); + if let Ok(Some(state)) = self.region_local_state(region_id) { + if state.state == PeerState::Tombstone { + return Ok(None); + } } match self.get_tablet(key) { @@ -1714,10 +1701,10 @@ impl Peekable for WrapFactory { ) -> engine_traits::Result> { let region_id = self.region_id_of_key(key); - if let Ok(Some(state)) = self.get_region_state(region_id) - && state.state == PeerState::Tombstone - { - return Ok(None); + if let Ok(Some(state)) = self.region_local_state(region_id) { + if state.state == PeerState::Tombstone { + return Ok(None); + } } match self.get_tablet(key) { @@ -1783,6 +1770,14 @@ impl RawEngine for WrapFactory { &self, region_id: u64, ) -> engine_traits::Result> { - self.get_region_state(region_id) + self.raft_engine.get_region_state(region_id, u64::MAX) + } + + fn raft_apply_state(&self, region_id: u64) -> engine_traits::Result> { + self.raft_engine.get_apply_state(region_id, u64::MAX) + } + + fn raft_local_state(&self, region_id: u64) -> engine_traits::Result> { + self.raft_engine.get_raft_state(region_id) } } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 2a73f5e239c..14bf1d280d5 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1951,9 +1951,9 @@ pub trait RawEngine: Peekable + SyncMutable { fn region_local_state(&self, region_id: u64) -> engine_traits::Result>; - fn raft_apply_state(&self, _region_id: u64) -> engine_traits::Result> { - unimplemented!() - } + fn raft_apply_state(&self, _region_id: u64) -> engine_traits::Result>; + + fn raft_local_state(&self, _region_id: u64) -> engine_traits::Result>; } impl RawEngine for RocksEngine { @@ -1963,4 +1963,12 @@ impl RawEngine for RocksEngine { ) -> engine_traits::Result> { self.get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) } + + fn raft_apply_state(&self, region_id: u64) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, 
&keys::apply_state_key(region_id)) + } + + fn raft_local_state(&self, region_id: u64) -> engine_traits::Result> { + self.get_msg_cf(CF_RAFT, &keys::raft_state_key(region_id)) + } } From ee13695952073c51e6a036c03aef427c3c46538f Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 31 Mar 2023 15:28:55 +0800 Subject: [PATCH 613/676] raftstore-v2: change log level (#14500) ref tikv/tikv#12842 Signed-off-by: Spade A --- components/raftstore-v2/src/operation/command/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0337c0cf32a..0ae2f1741c3 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -43,7 +43,7 @@ use raftstore::{ }, Error, Result, }; -use slog::{error, info, warn}; +use slog::{debug, error, info, warn}; use tikv_util::{ box_err, log::SlogFormat, @@ -335,7 +335,7 @@ impl Peer { ) { if !self.serving() || !apply_res.admin_result.is_empty() { // TODO: remove following log once stable. - info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); + debug!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); } // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { From d42aa0b47b8a4a39fa91529b350111cea9441b35 Mon Sep 17 00:00:00 2001 From: Lucas Date: Fri, 31 Mar 2023 17:24:55 +0800 Subject: [PATCH 614/676] engine: update raft-engine (#14495) close tikv/tikv#14468 Update the dependency to `raft-engine` lib, to fix the bug that the size of `prefill-for-recycle` is not adaptive to dynamic regions. 
Signed-off-by: Lucasliang --- Cargo.lock | 118 ++++++++++++++++++++--- components/raft_log_engine/src/engine.rs | 6 +- metrics/grafana/tikv_details.json | 4 +- 3 files changed, 112 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62746ba6bcb..e8162267354 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -52,6 +52,17 @@ dependencies = [ "version_check 0.9.4", ] +[[package]] +name = "ahash" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +dependencies = [ + "cfg-if 1.0.0", + "once_cell", + "version_check 0.9.4", +] + [[package]] name = "aho-corasick" version = "0.7.18" @@ -648,7 +659,7 @@ dependencies = [ "cexpr 0.6.0", "clang-sys", "clap 2.33.0", - "env_logger", + "env_logger 0.9.0", "lazy_static", "lazycell", "log", @@ -1650,6 +1661,40 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc 0.2.139", + "winapi 0.3.9", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc 0.2.139", +] + [[package]] name = "error-chain" version = "0.12.1" @@ -2349,11 +2394,11 @@ checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "hashbrown" -version = "0.12.0" +version = "0.13.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c21d40587b92fa6a6c6e3c1bdbf87d75511db5672f9c93175574b3a00df1758" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ - "ahash", + "ahash 0.8.3", ] [[package]] @@ -2380,6 +2425,15 @@ dependencies = [ "libc 0.2.139", ] +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "hex" version = "0.3.2" @@ -2565,7 +2619,7 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "16d4bde3a7105e59c66a4104cfe9606453af1c7a0eac78cb7d5bc263eb762a70" dependencies = [ - "ahash", + "ahash 0.7.4", "atty", "indexmap", "itoa 1.0.1", @@ -2615,6 +2669,16 @@ dependencies = [ "raft", ] +[[package]] +name = "io-lifetimes" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" +dependencies = [ + "libc 0.2.139", + "windows-sys 0.42.0", +] + [[package]] name = "iovec" version = "0.1.4" @@ -2639,6 +2703,18 @@ dependencies = [ "serde", ] +[[package]] +name = "is-terminal" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys 0.42.0", +] + [[package]] name = "itertools" version = "0.10.0" @@ -2902,6 +2978,12 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + [[package]] name = "lock_api" version = "0.4.6" @@ -3457,7 +3539,7 @@ version = "1.13.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.3", "libc 0.2.139", ] @@ -4186,14 +4268,14 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#404e3fefaeeb4da6b7650268d500cfd3fbd29cae" +source = "git+https://github.com/tikv/raft-engine.git#39f4db451295dbd8b30db4f94f220182c2c65be9" dependencies = [ "byteorder", "crc32fast", "crossbeam", "fail", "fs2", - "hashbrown 0.12.0", + "hashbrown 0.13.2", "hex 0.4.2", "if_chain", "lazy_static", @@ -4220,10 +4302,10 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#404e3fefaeeb4da6b7650268d500cfd3fbd29cae" +source = "git+https://github.com/tikv/raft-engine.git#39f4db451295dbd8b30db4f94f220182c2c65be9" dependencies = [ "clap 3.1.6", - "env_logger", + "env_logger 0.10.0", "raft-engine", ] @@ -4744,7 +4826,7 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" dependencies = [ - "ahash", + "ahash 0.7.4", "bitflags", "instant", "num-traits", @@ -4960,6 +5042,20 @@ dependencies = [ "semver 1.0.4", ] +[[package]] +name = "rustix" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc 0.2.139", + "linux-raw-sys", + "windows-sys 0.42.0", +] + [[package]] name = "rustversion" version = "1.0.4" diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index a9e75ca9580..7b107bc0cc9 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -23,7 +23,7 @@ use kvproto::{ }; use 
raft::eraftpb::Entry; use raft_engine::{ - env::{DefaultFileSystem, FileSystem, Handle, WriteExt}, + env::{DefaultFileSystem, FileSystem, Handle, Permission, WriteExt}, Command, Engine as RawRaftEngine, Error as RaftEngineError, LogBatch, MessageExt, }; pub use raft_engine::{Config as RaftEngineConfig, ReadableSize, RecoveryMode}; @@ -180,10 +180,10 @@ impl FileSystem for ManagedFileSystem { }) } - fn open>(&self, path: P) -> IoResult { + fn open>(&self, path: P, perm: Permission) -> IoResult { Ok(ManagedHandle { path: path.as_ref().to_path_buf(), - base: Arc::new(self.base_file_system.open(path.as_ref())?), + base: Arc::new(self.base_file_system.open(path.as_ref(), perm)?), }) } diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 9600222547e..184ad7a756b 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -31980,9 +31980,9 @@ }, { "exemplar": true, - "expr": "avg(raft_engine_recycled_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"})", + "expr": "avg(raft_engine_recycled_file_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}) by (type)", "intervalFactor": 1, - "legendFormat": "recycle", + "legendFormat": "{{type}} - recycle", "refId": "C" } ], From 503174b571788f01aef154157cafd53bacd6b860 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Tue, 4 Apr 2023 11:56:56 +0800 Subject: [PATCH 615/676] Parameterize Simulator and Cluster in test_raftstore-v2 (#14493) ref tikv/tikv#14401 Simulator and Cluster in test_raftstore-v2 Signed-off-by: CalvinNeo Co-authored-by: Xinye Tao --- Cargo.toml | 2 +- components/engine_panic/Cargo.toml | 3 + components/engine_panic/src/engine.rs | 4 + components/engine_rocks/Cargo.toml | 1 + components/engine_rocks/src/engine.rs | 5 ++ components/engine_traits/src/engine.rs | 5 ++ components/server/src/common.rs | 34 ++++++++ components/server/src/server.rs | 41 ++-------- 
components/server/src/server2.rs | 45 ++-------- components/test_raftstore-v2/src/cluster.rs | 91 +++++++++++++-------- components/test_raftstore-v2/src/node.rs | 24 ++++-- components/test_raftstore-v2/src/server.rs | 64 +++++++++++---- components/test_raftstore-v2/src/util.rs | 28 ++++--- components/test_raftstore/src/cluster.rs | 8 +- components/test_raftstore/src/util.rs | 24 ++++-- 15 files changed, 227 insertions(+), 152 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f8e67d70c04..1bd9377d5f0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,7 +39,7 @@ cloud-azure = [ "encryption_export/cloud-azure", "sst_importer/cloud-azure", ] -testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport"] +testexport = ["raftstore/testexport", "api_version/testexport", "causal_ts/testexport", "engine_traits/testexport", "engine_rocks/testexport", "engine_panic/testexport"] test-engine-kv-rocksdb = [ "engine_test/test-engine-kv-rocksdb" ] diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index 55e42f2595f..ec77e2b715f 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -5,6 +5,9 @@ description = "An example TiKV storage engine that does nothing but panic" edition = "2018" publish = false +[features] +testexport = [] + [dependencies] engine_traits = { workspace = true } kvproto = { workspace = true } diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index 6bca7d46485..d8faf8fee01 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -22,6 +22,10 @@ impl KvEngine for PanicEngine { fn bad_downcast(&self) -> &T { panic!() } + #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize { + panic!() + } } impl Peekable for PanicEngine { diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 
4c2b7bf5a52..d31ed947520 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -10,6 +10,7 @@ jemalloc = ["rocksdb/jemalloc"] portable = ["rocksdb/portable"] sse = ["rocksdb/sse"] failpoints = ["fail/failpoints"] +testexport = [] # Disables runtime checks of invariants required by RocksDB that are redundant # with assertions inside RocksDB itself. This makes it possible to test those diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 6499880490f..6c6231ca42f 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -194,6 +194,11 @@ impl KvEngine for RocksEngine { let e: &dyn Any = &self.db; e.downcast_ref().expect("bad engine downcast") } + + #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize { + Arc::strong_count(&self.db) + } } impl Iterable for RocksEngine { diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index e76765e2ed6..aa90c23b429 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -66,4 +66,9 @@ pub trait KvEngine: fn can_apply_snapshot(&self, _is_timeout: bool, _new_batch: bool, _region_id: u64) -> bool { true } + + /// A method for test to expose inner db refcount in order to make sure a + /// full release of engine. 
+ #[cfg(any(test, feature = "testexport"))] + fn inner_refcount(&self) -> usize; } diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 5c6dfa16120..67044dafc00 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -236,3 +236,37 @@ fn try_lock_conflict_addr>(path: P) -> File { } f } + +const RESERVED_OPEN_FDS: u64 = 1000; +pub fn check_system_config(config: &TikvConfig) { + info!("beginning system configuration check"); + let mut rocksdb_max_open_files = config.rocksdb.max_open_files; + if config.rocksdb.titan.enabled { + // Titan engine maintains yet another pool of blob files and uses the same max + // number of open files setup as rocksdb does. So we double the max required + // open files here + rocksdb_max_open_files *= 2; + } + if let Err(e) = tikv_util::config::check_max_open_fds( + RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, + ) { + fatal!("{}", e); + } + + // Check RocksDB data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { + warn!( + "check: rocksdb-data-dir"; + "path" => &config.storage.data_dir, + "err" => %e + ); + } + // Check raft data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { + warn!( + "check: raftdb-path"; + "path" => &config.raft_store.raftdb_path, + "err" => %e + ); + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 35fc96a3460..718ebbc0b3b 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -134,7 +134,11 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::TikvServerCore, memory::*, raft_engine_switch::*, setup::*, signal_handler, + common::{check_system_config, TikvServerCore}, + memory::*, + raft_engine_switch::*, + setup::*, + signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, }; @@ -214,8 +218,6 @@ pub fn run_tikv(config: TikvConfig) { }) } -const 
RESERVED_OPEN_FDS: u64 = 1000; - const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); @@ -1781,39 +1783,6 @@ fn pre_start() { } } -fn check_system_config(config: &TikvConfig) { - info!("beginning system configuration check"); - let mut rocksdb_max_open_files = config.rocksdb.max_open_files; - if config.rocksdb.titan.enabled { - // Titan engine maintains yet another pool of blob files and uses the same max - // number of open files setup as rocksdb does. So we double the max required - // open files here - rocksdb_max_open_files *= 2; - } - if let Err(e) = tikv_util::config::check_max_open_fds( - RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, - ) { - fatal!("{}", e); - } - - // Check RocksDB data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { - warn!( - "check: rocksdb-data-dir"; - "path" => &config.storage.data_dir, - "err" => %e - ); - } - // Check raft data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { - warn!( - "check: raftdb-path"; - "path" => &config.raft_store.raftdb_path, - "err" => %e - ); - } -} - /// A small trait for components which can be trivially stopped. Lets us keep /// a list of these in `TiKV`, rather than storing each component individually. 
pub(crate) trait Stop { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 8bc898d50b4..31ced8547ea 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -113,8 +113,13 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::TikvServerCore, memory::*, raft_engine_switch::*, server::Stop, setup::*, - signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, + common::{check_system_config, TikvServerCore}, + memory::*, + raft_engine_switch::*, + server::Stop, + setup::*, + signal_handler, + tikv_util::sys::thread::ThreadBuildWrapper, }; // minimum number of core kept for background requests @@ -193,8 +198,6 @@ pub fn run_tikv(config: TikvConfig) { }) } -const RESERVED_OPEN_FDS: u64 = 1000; - const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); @@ -1485,40 +1488,6 @@ fn pre_start() { ); } } - -fn check_system_config(config: &TikvConfig) { - info!("beginning system configuration check"); - let mut rocksdb_max_open_files = config.rocksdb.max_open_files; - if config.rocksdb.titan.enabled { - // Titan engine maintains yet another pool of blob files and uses the same max - // number of open files setup as rocksdb does. 
So we double the max required - // open files here - rocksdb_max_open_files *= 2; - } - if let Err(e) = tikv_util::config::check_max_open_fds( - RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, - ) { - fatal!("{}", e); - } - - // Check RocksDB data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { - warn!( - "check: rocksdb-data-dir"; - "path" => &config.storage.data_dir, - "err" => %e - ); - } - // Check raft data dir - if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { - warn!( - "check: raftdb-path"; - "path" => &config.raft_store.raftdb_path, - "err" => %e - ); - } -} - pub struct EngineMetricsManager { tablet_registry: TabletRegistry, kv_statistics: Option>, diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 307b399b29e..015062534e4 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -10,11 +10,11 @@ use std::{ use collections::{HashMap, HashSet}; use encryption_export::DataKeyManager; -use engine_rocks::{RocksDbVector, RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_rocks::{RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Iterable, MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, ReadOptions, - SyncMutable, TabletRegistry, CF_DEFAULT, + KvEngine, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, ReadOptions, SyncMutable, + TabletRegistry, CF_DEFAULT, }; use file_system::IoRateLimiter; use futures::{compat::Future01CompatExt, executor::block_on, select, Future, FutureExt}; @@ -65,13 +65,11 @@ use tikv_util::{ HandyRwLock, }; -use crate::create_test_engine; - // We simulate 3 or 5 nodes, each has a store. // Sometimes, we use fixed id to test, which means the id // isn't allocated by pd, and node id, store id are same. 
// E,g, for node 1, the node id and store id are both 1. -pub trait Simulator { +pub trait Simulator { // Pass 0 to let pd allocate a node id if db is empty. // If node id > 0, the node must be created in db already, // and the node id must be the same as given argument. @@ -81,10 +79,10 @@ pub trait Simulator { &mut self, node_id: u64, cfg: Config, - store_meta: Arc>>, + store_meta: Arc>>, key_mgr: Option>, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, resource_manager: &Option>, ) -> ServerResult; @@ -97,7 +95,7 @@ pub trait Simulator { fn add_recv_filter(&mut self, node_id: u64, filter: Box); fn clear_recv_filters(&mut self, node_id: u64); - fn get_router(&self, node_id: u64) -> Option>; + fn get_router(&self, node_id: u64) -> Option>; fn get_snap_dir(&self, node_id: u64) -> String; fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; @@ -183,9 +181,7 @@ pub trait Simulator { fn async_snapshot( &mut self, request: RaftCmdRequest, - ) -> impl Future< - Output = std::result::Result, RaftCmdResponse>, - > + Send; + ) -> impl Future, RaftCmdResponse>> + Send; fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()>; @@ -313,16 +309,16 @@ pub trait Simulator { } } -pub struct Cluster { +pub struct Cluster, EK: KvEngine> { pub cfg: Config, leaders: HashMap, pub count: usize, pub paths: Vec, - pub engines: Vec<(TabletRegistry, RaftTestEngine)>, - pub tablet_registries: HashMap>, + pub engines: Vec<(TabletRegistry, RaftTestEngine)>, + pub tablet_registries: HashMap>, pub raft_engines: HashMap, - pub store_metas: HashMap>>>, + pub store_metas: HashMap>>>, key_managers: Vec>>, pub io_rate_limiter: Option>, key_managers_map: HashMap>>, @@ -334,16 +330,46 @@ pub struct Cluster { pub sim: Arc>, pub pd_client: Arc, resource_manager: Option>, + pub engine_creator: Box< + dyn Fn( + Option<(u64, u64)>, + Option>, + &Config, + ) -> ( + TabletRegistry, + RaftTestEngine, + Option>, + 
TempDir, + LazyWorker, + Arc, + Option>, + ), + >, } -impl Cluster { +impl, EK: KvEngine> Cluster { pub fn new( id: u64, count: usize, sim: Arc>, pd_client: Arc, api_version: ApiVersion, - ) -> Cluster { + engine_creator: Box< + dyn Fn( + Option<(u64, u64)>, + Option>, + &Config, + ) -> ( + TabletRegistry, + RaftTestEngine, + Option>, + TempDir, + LazyWorker, + Arc, + Option>, + ), + >, + ) -> Cluster { Cluster { cfg: Config { tikv: new_tikv_config_with_api_ver(id, api_version), @@ -367,6 +393,7 @@ impl Cluster { resource_manager: Some(Arc::new(ResourceGroupManager::default())), sim, pd_client, + engine_creator, } } @@ -417,7 +444,7 @@ impl Cluster { // id indicates cluster id store_id fn create_engine(&mut self, id: Option<(u64, u64)>) { let (reg, raft_engine, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = - create_test_engine(id, self.io_rate_limiter.clone(), &self.cfg); + (self.engine_creator)(id, self.io_rate_limiter.clone(), &self.cfg); self.engines.push((reg, raft_engine)); self.key_managers.push(key_manager); self.paths.push(dir); @@ -536,7 +563,7 @@ impl Cluster { if let Some(tablet) = tablet.latest() { let mut tried = 0; while tried < 10 { - if Arc::strong_count(tablet.as_inner()) <= 3 { + if tablet.inner_refcount() <= 3 { break; } thread::sleep(Duration::from_millis(10)); @@ -632,7 +659,7 @@ impl Cluster { } } - pub fn get_engine(&self, node_id: u64) -> WrapFactory { + pub fn get_engine(&self, node_id: u64) -> WrapFactory { WrapFactory::new( self.pd_client.clone(), self.raft_engines[&node_id].clone(), @@ -1503,7 +1530,7 @@ impl Cluster { self.sim.rl().get_snap_dir(node_id) } - pub fn get_router(&self, node_id: u64) -> Option> { + pub fn get_router(&self, node_id: u64) -> Option> { self.sim.rl().get_router(node_id) } @@ -1632,24 +1659,24 @@ pub fn bootstrap_store( Ok(()) } -impl Drop for Cluster { +impl, EK: KvEngine> Drop for Cluster { fn drop(&mut self) { test_util::clear_failpoints(); self.shutdown(); } } -pub struct WrapFactory { +pub 
struct WrapFactory { pd_client: Arc, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, } -impl WrapFactory { +impl WrapFactory { pub fn new( pd_client: Arc, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, ) -> Self { Self { raft_engine, @@ -1664,15 +1691,15 @@ impl WrapFactory { self.pd_client.get_region(key).unwrap().get_id() } - fn get_tablet(&self, key: &[u8]) -> Option { + fn get_tablet(&self, key: &[u8]) -> Option { // todo: unwrap let region_id = self.region_id_of_key(key); self.tablet_registry.get(region_id)?.latest().cloned() } } -impl Peekable for WrapFactory { - type DbVector = RocksDbVector; +impl Peekable for WrapFactory { + type DbVector = EK::DbVector; fn get_value_opt( &self, @@ -1722,7 +1749,7 @@ impl Peekable for WrapFactory { } } -impl SyncMutable for WrapFactory { +impl SyncMutable for WrapFactory { fn put(&self, key: &[u8], value: &[u8]) -> engine_traits::Result<()> { match self.get_tablet(key) { Some(tablet) => tablet.put(key, value), @@ -1765,7 +1792,7 @@ impl SyncMutable for WrapFactory { } } -impl RawEngine for WrapFactory { +impl RawEngine for WrapFactory { fn region_local_state( &self, region_id: u64, diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 058a9caf186..a02af6ad177 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -166,7 +166,7 @@ impl NodeCluster { } } -impl Simulator for NodeCluster { +impl Simulator for NodeCluster { fn get_node_ids(&self) -> HashSet { self.nodes.keys().cloned().collect() } @@ -439,16 +439,30 @@ impl Simulator for NodeCluster { // Compare to server cluster, node cluster does not have server layer and // storage layer. 
-pub fn new_node_cluster(id: u64, count: usize) -> Cluster { +pub fn new_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); - Cluster::new(id, count, sim, pd_client, ApiVersion::V1) + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(&crate::create_test_engine), + ) } // This cluster does not support batch split, we expect it to transfer the // `BatchSplit` request to `split` request -pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { +pub fn new_incompatible_node_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); - Cluster::new(id, count, sim, pd_client, ApiVersion::V1) + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(&crate::create_test_engine), + ) } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index ed2a44d80fa..921d3b991ab 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -677,7 +677,7 @@ impl ServerCluster { } } -impl Simulator for ServerCluster { +impl Simulator for ServerCluster { fn get_node_ids(&self) -> HashSet { self.metas.keys().cloned().collect() } @@ -805,7 +805,7 @@ impl Simulator for ServerCluster { } } -impl Cluster { +impl Cluster { pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { let mut try_snapshot = || -> Option> { let leader = self.leader_of_region(region_id)?; @@ -833,35 +833,60 @@ impl Cluster { } } -pub fn new_server_cluster(id: u64, count: usize) -> Cluster { +pub fn new_server_cluster(id: u64, count: usize) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); - 
Cluster::new(id, count, sim, pd_client, ApiVersion::V1) + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(crate::create_test_engine), + ) } -pub fn new_incompatible_server_cluster(id: u64, count: usize) -> Cluster { +pub fn new_incompatible_server_cluster( + id: u64, + count: usize, +) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); - Cluster::new(id, count, sim, pd_client, ApiVersion::V1) + Cluster::new( + id, + count, + sim, + pd_client, + ApiVersion::V1, + Box::new(crate::create_test_engine), + ) } pub fn new_server_cluster_with_api_ver( id: u64, count: usize, api_ver: ApiVersion, -) -> Cluster { +) -> Cluster { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); - Cluster::new(id, count, sim, pd_client, api_ver) + Cluster::new( + id, + count, + sim, + pd_client, + api_ver, + Box::new(crate::create_test_engine), + ) } -pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, Context) { +pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, Context) +{ must_new_cluster_and_kv_client_mul(1) } pub fn must_new_cluster_and_kv_client_mul( count: usize, -) -> (Cluster, TikvClient, Context) { +) -> (Cluster, TikvClient, Context) { let (cluster, leader, ctx) = must_new_cluster_mul(count); let env = Arc::new(Environment::new(1)); @@ -871,14 +896,16 @@ pub fn must_new_cluster_and_kv_client_mul( (cluster, client, ctx) } -pub fn must_new_cluster_mul(count: usize) -> (Cluster, metapb::Peer, Context) { +pub fn must_new_cluster_mul( + count: usize, +) -> (Cluster, metapb::Peer, Context) { must_new_and_configure_cluster_mul(count, |_| ()) } fn must_new_and_configure_cluster_mul( count: usize, - mut configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + mut configure: impl FnMut(&mut Cluster), +) -> (Cluster, metapb::Peer, Context) 
{ let mut cluster = new_server_cluster(0, count); configure(&mut cluster); cluster.run(); @@ -894,8 +921,8 @@ fn must_new_and_configure_cluster_mul( } pub fn must_new_and_configure_cluster_and_kv_client( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, TikvClient, Context) { + configure: impl FnMut(&mut Cluster), +) -> (Cluster, TikvClient, Context) { let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); let env = Arc::new(Environment::new(1)); @@ -907,12 +934,13 @@ pub fn must_new_and_configure_cluster_and_kv_client( } pub fn must_new_and_configure_cluster( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + configure: impl FnMut(&mut Cluster), +) -> (Cluster, metapb::Peer, Context) { must_new_and_configure_cluster_mul(1, configure) } -pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClient, u64) { +pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClient, u64) +{ let (cluster, leader, _) = must_new_cluster_mul(1); let env = Arc::new(Environment::new(1)); diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index e2cc88c569c..9f68beaad35 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -5,7 +5,7 @@ use std::{fmt::Write, sync::Arc, thread, time::Duration}; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{RocksEngine, RocksStatistics}; use engine_test::raft::RaftTestEngine; -use engine_traits::{TabletRegistry, CF_DEFAULT}; +use engine_traits::{KvEngine, TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; use futures::Future; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; @@ -86,16 +86,16 @@ pub fn create_test_engine( } /// Keep putting random kvs until specified size limit is reached. 
-pub fn put_till_size( - cluster: &mut Cluster, +pub fn put_till_size, EK: KvEngine>( + cluster: &mut Cluster, limit: u64, range: &mut dyn Iterator, ) -> Vec { put_cf_till_size(cluster, CF_DEFAULT, limit, range) } -pub fn put_cf_till_size( - cluster: &mut Cluster, +pub fn put_cf_till_size, EK: KvEngine>( + cluster: &mut Cluster, cf: &'static str, limit: u64, range: &mut dyn Iterator, @@ -134,8 +134,8 @@ pub fn configure_for_snapshot(config: &mut Config) { config.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(50); } -pub fn configure_for_lease_read_v2( - cluster: &mut Cluster, +pub fn configure_for_lease_read_v2, EK: KvEngine>( + cluster: &mut Cluster, base_tick_ms: Option, election_ticks: Option, ) -> Duration { @@ -162,7 +162,11 @@ pub fn configure_for_lease_read_v2( election_timeout } -pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, region_id: u64) { +pub fn wait_for_synced( + cluster: &mut Cluster, + node_id: u64, + region_id: u64, +) { let mut storage = cluster .sim .read() @@ -193,8 +197,8 @@ pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, regio } // Issue a read request on the specified peer. 
-pub fn read_on_peer( - cluster: &mut Cluster, +pub fn read_on_peer, EK: KvEngine>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], @@ -211,8 +215,8 @@ pub fn read_on_peer( cluster.read(None, request, timeout) } -pub fn async_read_on_peer( - cluster: &mut Cluster, +pub fn async_read_on_peer, EK: KvEngine>( + cluster: &mut Cluster, peer: metapb::Peer, region: metapb::Region, key: &[u8], diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 14bf1d280d5..988625d3750 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -12,7 +12,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{RocksDbVector, RocksEngine, RocksSnapshot, RocksStatistics}; +use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, SyncMutable, @@ -1947,7 +1947,9 @@ impl Drop for Cluster { } } -pub trait RawEngine: Peekable + SyncMutable { +pub trait RawEngine: + Peekable + SyncMutable +{ fn region_local_state(&self, region_id: u64) -> engine_traits::Result>; @@ -1956,7 +1958,7 @@ pub trait RawEngine: Peekable + SyncMutable { fn raft_local_state(&self, _region_id: u64) -> engine_traits::Result>; } -impl RawEngine for RocksEngine { +impl RawEngine for RocksEngine { fn region_local_state( &self, region_id: u64, diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 5c9d9ac5d54..81753d49600 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,8 +16,8 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - CfNamesExt, Engines, 
Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, - CF_RAFT, + CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, + CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -60,7 +60,12 @@ use txn_types::Key; use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; -pub fn must_get(engine: &impl RawEngine, cf: &str, key: &[u8], value: Option<&[u8]>) { +pub fn must_get( + engine: &impl RawEngine, + cf: &str, + key: &[u8], + value: Option<&[u8]>, +) { for _ in 1..300 { let res = engine.get_value_cf(cf, &keys::data_key(key)).unwrap(); if let (Some(value), Some(res)) = (value, res.as_ref()) { @@ -86,19 +91,24 @@ pub fn must_get(engine: &impl RawEngine, cf: &str, key: &[u8], value: Option<&[u ) } -pub fn must_get_equal(engine: &impl RawEngine, key: &[u8], value: &[u8]) { +pub fn must_get_equal(engine: &impl RawEngine, key: &[u8], value: &[u8]) { must_get(engine, "default", key, Some(value)); } -pub fn must_get_none(engine: &impl RawEngine, key: &[u8]) { +pub fn must_get_none(engine: &impl RawEngine, key: &[u8]) { must_get(engine, "default", key, None); } -pub fn must_get_cf_equal(engine: &impl RawEngine, cf: &str, key: &[u8], value: &[u8]) { +pub fn must_get_cf_equal( + engine: &impl RawEngine, + cf: &str, + key: &[u8], + value: &[u8], +) { must_get(engine, cf, key, Some(value)); } -pub fn must_get_cf_none(engine: &impl RawEngine, cf: &str, key: &[u8]) { +pub fn must_get_cf_none(engine: &impl RawEngine, cf: &str, key: &[u8]) { must_get(engine, cf, key, None); } From b778db2e4766a71edd24023d409bd778961120f0 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 7 Apr 2023 05:24:57 +0800 Subject: [PATCH 616/676] raftstore-v2: thread name fix (#14461) ref tikv/tikv#12842 Signed-off-by: Spade A --- components/causal_ts/src/tso.rs | 2 +- components/raftstore-v2/src/batch/store.rs | 2 +- 
components/raftstore-v2/src/worker/tablet_flush.rs | 3 ++- components/server/src/server.rs | 2 +- components/server/src/server2.rs | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 2c99d8c068a..51f1824f7a6 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -355,7 +355,7 @@ impl BatchTsoProvider { let s = Self { pd_client: pd_client.clone(), batch_list: Arc::new(TsoBatchList::new(cache_multiplier)), - causal_ts_worker: WorkerBuilder::new("causal_ts_batch_tso_worker").create(), + causal_ts_worker: WorkerBuilder::new("causal-ts-batch-tso-worker").create(), renew_interval, renew_parameter, renew_request_tx, diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index fe152bb3990..66b0414b7c3 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -506,7 +506,7 @@ struct Workers { impl Workers { fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { - let tablet_flush = WorkerBuilder::new("tablet_flush-worker") + let tablet_flush = WorkerBuilder::new("tablet-flush-worker") .thread_count(2) .create(); Self { diff --git a/components/raftstore-v2/src/worker/tablet_flush.rs b/components/raftstore-v2/src/worker/tablet_flush.rs index c53296a5cb6..e7d2c534f80 100644 --- a/components/raftstore-v2/src/worker/tablet_flush.rs +++ b/components/raftstore-v2/src/worker/tablet_flush.rs @@ -3,7 +3,7 @@ use std::fmt::{Display, Formatter}; use engine_traits::{KvEngine, RaftEngine, TabletRegistry, DATA_CFS}; -use kvproto::raft_cmdpb::RaftCmdRequest; +use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use slog::{error, info, Logger}; use tikv_util::{time::Instant, worker::Runnable}; use txn_types::WriteBatchFlags; @@ -79,6 +79,7 @@ impl Runner { } let mut req = req.unwrap(); + assert!(req.get_admin_request().get_cmd_type() == 
AdminCmdType::BatchSplit); req.mut_header() .set_flags(WriteBatchFlags::SPLIT_SECOND_PHASE.bits()); if let Err(e) = self.router.send( diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 718ebbc0b3b..be5edf0cf41 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -392,7 +392,7 @@ where // Run check leader in a dedicate thread, because it is time sensitive // and crucial to TiCDC replication lag. - let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + let check_leader_worker = WorkerBuilder::new("check-leader").thread_count(1).create(); TikvServer { core: TikvServerCore { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 31ced8547ea..81ec94207a9 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -332,7 +332,7 @@ where // Run check leader in a dedicate thread, because it is time sensitive // and crucial to TiCDC replication lag. 
- let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + let check_leader_worker = WorkerBuilder::new("check-leader").thread_count(1).create(); TikvServer { core: TikvServerCore { From f1d2de3580ec1d74a8d9b107b8729dffb953afee Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 7 Apr 2023 13:58:58 +0800 Subject: [PATCH 617/676] server, test: Refactor NodeCluster and ServerCluster (#14512) ref tikv/tikv#14401 Parameterize NodeCluster and ServerCluster Signed-off-by: CalvinNeo Co-authored-by: Xinye Tao --- components/server/src/common.rs | 525 ++++++++++++++++++- components/server/src/server.rs | 571 ++------------------- components/server/src/server2.rs | 534 ++----------------- components/test_raftstore-v2/src/node.rs | 54 +- components/test_raftstore-v2/src/server.rs | 153 +++--- components/test_raftstore-v2/src/util.rs | 4 +- components/test_raftstore/src/util.rs | 2 +- 7 files changed, 723 insertions(+), 1120 deletions(-) diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 67044dafc00..2d2ae7bd398 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -1,22 +1,73 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. +//! This mod is exported to make convenience for creating TiKV-like servers. 
+ use std::{ - cmp, env, + cmp, + collections::HashMap, + env, fmt, net::SocketAddr, path::{Path, PathBuf}, - sync::{mpsc, Arc}, + sync::{ + atomic::{AtomicU32, Ordering}, + mpsc, Arc, + }, + time::Duration, u64, }; use encryption_export::{data_key_manager_from_config, DataKeyManager}; -use engine_rocks::FlowInfo; +use engine_rocks::{ + flush_engine_statistics, + raw::{Cache, Env}, + FlowInfo, RocksEngine, RocksStatistics, +}; +use engine_traits::{ + CachedTablet, CfOptionsExt, FlowControlFactorsExt, KvEngine, RaftEngine, StatisticsReporter, + TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, +}; use error_code::ErrorCodeExt; -use file_system::{set_io_rate_limiter, BytesFetcher, File}; -use tikv::config::TikvConfig; -use tikv_util::sys::{disk, path_in_diff_mount_point}; +use file_system::{get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor}; +use grpcio::Environment; +use pd_client::{PdClient, RpcClient}; +use raft_log_engine::RaftLogEngine; +use security::SecurityManager; +use tikv::{ + config::{ConfigController, DbConfigManger, DbType, TikvConfig}, + server::{status_server::StatusServer, DEFAULT_CLUSTER_ID}, +}; +use tikv_util::{ + config::{ensure_dir_exist, RaftDataStateMachine}, + math::MovingAvgU32, + metrics::INSTANCE_BACKEND_CPU_QUOTA, + quota_limiter::QuotaLimiter, + sys::{cpu_time::ProcessStat, disk, path_in_diff_mount_point, SysQuota}, + time::Instant, + worker::{LazyWorker, Worker}, +}; + +use crate::{raft_engine_switch::*, setup::validate_and_persist_config}; -/// This is the common layer of TiKV-like servers. By holding it in its own -/// TikvServer implementation, one can easily access the common ability of a -/// TiKV server. 
+// minimum number of core kept for background requests +const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; +// max ratio of core quota for background requests +const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; +// default ratio of core quota for background requests = core_number * 0.5 +const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; +// indication of TiKV instance is short of cpu +const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; +// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) +const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; +// pace of cpu quota adjustment +const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu +const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); + +/// This is the common part of TiKV-like servers. It is a collection of all +/// capabilities a TikvServer should have or may take advantage of. By holding +/// it in its own TikvServer implementation, one can easily access the common +/// ability of a TiKV server. +// Fields in this struct are all public since they are open for other TikvServer +// to use, e.g. a custom TikvServer may alter some fields in `config` or push +// some services into `to_stop`. pub struct TikvServerCore { pub config: TikvConfig, pub store_path: PathBuf, @@ -24,9 +75,57 @@ pub struct TikvServerCore { pub encryption_key_manager: Option>, pub flow_info_sender: Option>, pub flow_info_receiver: Option>, + pub to_stop: Vec>, + pub background_worker: Worker, } impl TikvServerCore { + /// Initialize and check the config + /// + /// Warnings are logged and fatal errors exist. + /// + /// # Fatal errors + /// + /// - If `dynamic config` feature is enabled and failed to register config + /// to PD + /// - If some critical configs (like data dir) are differrent from last run + /// - If the config can't pass `validate()` + /// - If the max open file descriptor limit is not high enough to support + /// the main database and the raft database. 
+ pub fn init_config(mut config: TikvConfig) -> ConfigController { + validate_and_persist_config(&mut config, true); + + ensure_dir_exist(&config.storage.data_dir).unwrap(); + if !config.rocksdb.wal_dir.is_empty() { + ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); + } + if config.raft_engine.enable { + ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); + } else { + ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); + if !config.raftdb.wal_dir.is_empty() { + ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); + } + } + + check_system_config(&config); + + tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); + + info!( + "using config"; + "config" => serde_json::to_string(&config).unwrap(), + ); + if config.panic_when_unexpected_key_or_data { + info!("panic-when-unexpected-key-or-data is on"); + tikv_util::set_panic_when_unexpected_key_or_data(true); + } + + config.write_into_metrics(); + + ConfigController::new(config) + } + pub fn check_conflict_addr(&mut self) { let cur_addr: SocketAddr = self .config @@ -207,6 +306,112 @@ impl TikvServerCore { self.flow_info_receiver = Some(rx); engine_rocks::FlowListener::new(tx) } + + pub fn connect_to_pd_cluster( + config: &mut TikvConfig, + env: Arc, + security_mgr: Arc, + ) -> Arc { + let pd_client = Arc::new( + RpcClient::new(&config.pd, Some(env), security_mgr) + .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), + ); + + let cluster_id = pd_client + .get_cluster_id() + .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); + if cluster_id == DEFAULT_CLUSTER_ID { + fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); + } + config.server.cluster_id = cluster_id; + info!( + "connect to PD cluster"; + "cluster_id" => cluster_id + ); + + pd_client + } + + // Only background cpu quota tuning is implemented at present. 
iops and frontend + // quota tuning is on the way + pub fn init_quota_tuning_task(&self, quota_limiter: Arc) { + // No need to do auto tune when capacity is really low + if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO + < BACKGROUND_REQUEST_CORE_LOWER_BOUND + { + return; + }; + + // Determine the base cpu quota + let base_cpu_quota = + // if cpu quota is not specified, start from optimistic case + if quota_limiter.cputime_limiter(false).is_infinite() { + 1000_f64 + * f64::max( + BACKGROUND_REQUEST_CORE_LOWER_BOUND, + SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, + ) + } else { + quota_limiter.cputime_limiter(false) / 1000_f64 + }; + + // Calculate the celling and floor quota + let celling_quota = f64::min( + base_cpu_quota * 2.0, + 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, + ); + let floor_quota = f64::max( + base_cpu_quota * 0.5, + 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, + ); + + let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); + self.background_worker.spawn_interval_task( + DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, + move || { + if quota_limiter.auto_tune_enabled() { + let cputime_limit = quota_limiter.cputime_limiter(false); + let old_quota = if cputime_limit.is_infinite() { + base_cpu_quota + } else { + cputime_limit / 1000_f64 + }; + let cpu_usage = match proc_stats.cpu_usage() { + Ok(r) => r, + Err(_e) => 0.0, + }; + // Try tuning quota when cpu_usage is correctly collected. + // rule based tuning: + // - if instance is busy, shrink cpu quota for analyze by one quota pace until + // lower bound is hit; + // - if instance cpu usage is healthy, no op; + // - if instance is idle, increase cpu quota by one quota pace until upper + // bound is hit. 
+ if cpu_usage > 0.0f64 { + let mut target_quota = old_quota; + + let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); + if cpu_util >= SYSTEM_BUSY_THRESHOLD { + target_quota = + f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); + } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { + target_quota = + f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); + } + + if old_quota != target_quota { + quota_limiter.set_cpu_time_limit(target_quota as usize, false); + debug!( + "cpu_time_limiter tuned for backend request"; + "cpu_util" => ?cpu_util, + "new quota" => ?target_quota); + INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); + } + } + } + }, + ); + } } #[cfg(unix)] @@ -270,3 +475,305 @@ pub fn check_system_config(config: &TikvConfig) { ); } } + +pub struct EnginesResourceInfo { + tablet_registry: TabletRegistry, + raft_engine: Option, + latest_normalized_pending_bytes: AtomicU32, + normalized_pending_bytes_collector: MovingAvgU32, +} + +impl EnginesResourceInfo { + const SCALE_FACTOR: u64 = 100; + + pub fn new( + tablet_registry: TabletRegistry, + raft_engine: Option, + max_samples_to_preserve: usize, + ) -> Self { + EnginesResourceInfo { + tablet_registry, + raft_engine, + latest_normalized_pending_bytes: AtomicU32::new(0), + normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), + } + } + + pub fn update( + &self, + _now: Instant, + cached_latest_tablets: &mut HashMap>, + ) { + let mut normalized_pending_bytes = 0; + + fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { + if let Ok(cf_opts) = engine.get_options_cf(cf) { + if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { + if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { + *normalized_pending_bytes = std::cmp::max( + *normalized_pending_bytes, + (b * EnginesResourceInfo::SCALE_FACTOR + / cf_opts.get_soft_pending_compaction_bytes_limit()) + as u32, + ); + } + } + } + } + + if let Some(raft_engine) = 
&self.raft_engine { + fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); + } + + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); + + // todo(SpadeA): Now, there's a potential race condition problem where the + // tablet could be destroyed after the clone and before the fetching + // which could result in programme panic. It's okay now as the single global + // kv_engine will not be destroyed in normal operation and v2 is not + // ready for operation. Furthermore, this race condition is general to v2 as + // tablet clone is not a case exclusively happened here. We should + // propose another PR to tackle it such as destory tablet lazily in a GC + // thread. + + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; + for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { + fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); + } + } + + // Clear ensures that these tablets are not hold forever. 
+ cached_latest_tablets.clear(); + + let (_, avg) = self + .normalized_pending_bytes_collector + .add(normalized_pending_bytes); + self.latest_normalized_pending_bytes.store( + std::cmp::max(normalized_pending_bytes, avg), + Ordering::Relaxed, + ); + } + + #[cfg(any(test, feature = "testexport"))] + pub fn latest_normalized_pending_bytes(&self) -> u32 { + self.latest_normalized_pending_bytes.load(Ordering::Relaxed) + } +} + +impl IoBudgetAdjustor for EnginesResourceInfo { + fn adjust(&self, total_budgets: usize) -> usize { + let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 + / Self::SCALE_FACTOR as f32; + // Two reasons for adding `sqrt` on top: + // 1) In theory the convergence point is independent of the value of pending + // bytes (as long as backlog generating rate equals consuming rate, which is + // determined by compaction budgets), a convex helps reach that point while + // maintaining low level of pending bytes. + // 2) Variance of compaction pending bytes grows with its magnitude, a filter + // with decreasing derivative can help balance such trend. + let score = score.sqrt(); + // The target global write flow slides between Bandwidth / 2 and Bandwidth. + let score = 0.5 + score / 2.0; + (total_budgets as f32 * score) as usize + } +} + +/// A small trait for components which can be trivially stopped. Lets us keep +/// a list of these in `TiKV`, rather than storing each component individually. 
+pub trait Stop { + fn stop(self: Box); +} + +impl Stop for StatusServer +where + R: 'static + Send, +{ + fn stop(self: Box) { + (*self).stop() + } +} + +impl Stop for Worker { + fn stop(self: Box) { + Worker::stop(&self); + } +} + +impl Stop for LazyWorker { + fn stop(self: Box) { + self.stop_worker(); + } +} + +pub trait ConfiguredRaftEngine: RaftEngine { + fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>); + fn as_rocks_engine(&self) -> Option<&RocksEngine>; + fn register_config(&self, _cfg_controller: &mut ConfigController); +} + +impl ConfiguredRaftEngine for T { + default fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>) { + unimplemented!() + } + default fn as_rocks_engine(&self) -> Option<&RocksEngine> { + None + } + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} +} + +impl ConfiguredRaftEngine for RocksEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_engine.config().dir, + &config.raft_store.raftdb_path, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_db_path = &config.raft_store.raftdb_path; + let config_raftdb = &config.raftdb; + let statistics = Arc::new(RocksStatistics::new_titan()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); + + if should_dump { + let raft_engine = + RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) + .expect("failed to open raft engine for migration"); + dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); + raft_engine.stop(); + 
drop(raft_engine); + raft_data_state_machine.after_dump_data(); + } + (raftdb, Some(statistics)) + } + + fn as_rocks_engine(&self) -> Option<&RocksEngine> { + Some(self) + } + + fn register_config(&self, cfg_controller: &mut ConfigController) { + cfg_controller.register( + tikv::config::Module::Raftdb, + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + ); + } +} + +impl ConfiguredRaftEngine for RaftLogEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_store.raftdb_path, + &config.raft_engine.config().dir, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_config = config.raft_engine.config(); + let raft_engine = + RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) + .expect("failed to open raft engine"); + + if should_dump { + let config_raftdb = &config.raftdb; + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt( + &config.raft_store.raftdb_path, + raft_db_opts, + raft_cf_opts, + ) + .expect("failed to open raftdb for migration"); + dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); + raftdb.stop(); + drop(raftdb); + raft_data_state_machine.after_dump_data(); + } + (raft_engine, None) + } +} + +const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + last_reset: Instant, +} + +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { + EngineMetricsManager 
{ + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, + last_reset: Instant::now(), + } + } + + pub fn flush(&mut self, now: Instant) { + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } + if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } + self.last_reset = now; + } + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index be5edf0cf41..cc07ff85471 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -15,13 +15,9 @@ use std::{ cmp, collections::HashMap, convert::TryFrom, - fmt, path::{Path, PathBuf}, str::FromStr, - sync::{ - atomic::{AtomicU32, AtomicU64, Ordering}, - mpsc, Arc, Mutex, - }, + sync::{atomic::AtomicU64, mpsc, Arc, Mutex}, time::Duration, u64, }; @@ -33,21 +29,13 @@ use backup_stream::{ use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; -use encryption_export::DataKeyManager; -use engine_rocks::{ - flush_engine_statistics, from_rocks_compression_type, - raw::{Cache, Env}, - RocksEngine, RocksStatistics, -}; +use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - 
RaftEngine, SingletonFactory, StatisticsReporter, TabletContext, TabletRegistry, CF_DEFAULT, - CF_LOCK, CF_WRITE, -}; -use file_system::{ - get_io_rate_limiter, BytesFetcher, IoBudgetAdjustor, MetricsManager as IoMetricsManager, + Engines, KvEngine, MiscExt, RaftEngine, SingletonFactory, TabletContext, TabletRegistry, + CF_DEFAULT, CF_WRITE, }; +use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; @@ -103,8 +91,7 @@ use tikv::{ status_server::StatusServer, tablet_snap::NoSnapshotCache, ttl::TtlChecker, - KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, - GRPC_THREAD_PREFIX, + KvEngineFactoryBuilder, Node, RaftKv, Server, CPU_CORES_QUOTA_GAUGE, GRPC_THREAD_PREFIX, }, storage::{ self, @@ -117,14 +104,9 @@ use tikv::{ }; use tikv_util::{ check_environment_variables, - config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, - math::MovingAvgU32, - metrics::INSTANCE_BACKEND_CPU_QUOTA, + config::VersionTrack, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{ - cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, - SysQuota, - }, + sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, @@ -134,27 +116,13 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{check_system_config, TikvServerCore}, + common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, memory::*, - raft_engine_switch::*, setup::*, signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, }; -// minimum number of core kept for background requests -const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; -// max ratio of core quota for background requests -const 
BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; -// default ratio of core quota for background requests = core_number * 0.5 -const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; -// indication of TiKV instance is short of cpu -const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; -// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) -const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; -// pace of cpu quota adjustment -const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu - #[inline] fn run_impl(config: TikvConfig) { let mut tikv = TikvServer::::init::(config); @@ -178,7 +146,7 @@ fn run_impl(config: TikvConfig) { tikv.init_storage_stats_task(engines); tikv.run_server(server_config); tikv.run_status_server(); - tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); + tikv.core.init_quota_tuning_task(tikv.quota_limiter.clone()); signal_handler::wait_for_signal( Some(tikv.engines.take().unwrap().engines), @@ -220,9 +188,7 @@ pub fn run_tikv(config: TikvConfig) { const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); -const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); -const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. 
struct TikvServer { @@ -240,10 +206,8 @@ struct TikvServer { servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, - to_stop: Vec>, concurrency_manager: ConcurrencyManager, env: Arc, - background_worker: Worker, check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, @@ -292,8 +256,11 @@ where .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) .build(), ); - let pd_client = - Self::connect_to_pd_cluster(&mut config, env.clone(), Arc::clone(&security_mgr)); + let pd_client = TikvServerCore::connect_to_pd_cluster( + &mut config, + env.clone(), + Arc::clone(&security_mgr), + ); // check if TiKV need to run in snapshot recovery mode let is_recovering_marked = match pd_client.is_recovering_marked() { Err(e) => { @@ -320,7 +287,7 @@ where } // Initialize and check config - let cfg_controller = Self::init_config(config); + let cfg_controller = TikvServerCore::init_config(config); let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); @@ -402,6 +369,8 @@ where encryption_key_manager: None, flow_info_sender: None, flow_info_receiver: None, + to_stop: vec![], + background_worker, }, cfg_controller: Some(cfg_controller), security_mgr, @@ -416,10 +385,8 @@ where servers: None, region_info_accessor, coprocessor_host, - to_stop: vec![], concurrency_manager, env, - background_worker, check_leader_worker, sst_worker: None, quota_limiter, @@ -430,77 +397,6 @@ where } } - /// Initialize and check the config - /// - /// Warnings are logged and fatal errors exist. - /// - /// # Fatal errors - /// - /// - If `dynamic config` feature is enabled and failed to register config - /// to PD - /// - If some critical configs (like data dir) are differrent from last run - /// - If the config can't pass `validate()` - /// - If the max open file descriptor limit is not high enough to support - /// the main database and the raft database. 
- fn init_config(mut config: TikvConfig) -> ConfigController { - validate_and_persist_config(&mut config, true); - - ensure_dir_exist(&config.storage.data_dir).unwrap(); - if !config.rocksdb.wal_dir.is_empty() { - ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); - } - if config.raft_engine.enable { - ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); - } else { - ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); - if !config.raftdb.wal_dir.is_empty() { - ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); - } - } - - check_system_config(&config); - - tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); - - info!( - "using config"; - "config" => serde_json::to_string(&config).unwrap(), - ); - if config.panic_when_unexpected_key_or_data { - info!("panic-when-unexpected-key-or-data is on"); - tikv_util::set_panic_when_unexpected_key_or_data(true); - } - - config.write_into_metrics(); - - ConfigController::new(config) - } - - fn connect_to_pd_cluster( - config: &mut TikvConfig, - env: Arc, - security_mgr: Arc, - ) -> Arc { - let pd_client = Arc::new( - RpcClient::new(&config.pd, Some(env), security_mgr) - .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), - ); - - let cluster_id = pd_client - .get_cluster_id() - .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); - if cluster_id == DEFAULT_CLUSTER_ID { - fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); - } - config.server.cluster_id = cluster_id; - info!( - "connect to PD cluster"; - "cluster_id" => cluster_id - ); - - pd_client - } - fn init_engines(&mut self, engines: Engines) { let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); let engine = RaftKv::new( @@ -612,14 +508,14 @@ where pd_sender.clone(), engines.engine.clone(), resource_ctl, - CleanupMethod::Remote(self.background_worker.remote()), + CleanupMethod::Remote(self.core.background_worker.remote()), )) } else { None }; if let Some(unified_read_pool) = 
&unified_read_pool { let handle = unified_read_pool.handle(); - self.background_worker.spawn_interval_task( + self.core.background_worker.spawn_interval_task( UPDATE_EWMA_TIME_SLICE_INTERVAL, move || { handle.update_ewma_time_slice(); @@ -647,19 +543,19 @@ where resource_metering::init_recorder( self.core.config.resource_metering.precision.as_millis(), ); - self.to_stop.push(recorder_worker); + self.core.to_stop.push(recorder_worker); let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( self.core.config.resource_metering.clone(), collector_reg_handle.clone(), ); - self.to_stop.push(reporter_worker); + self.core.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( self.core.config.resource_metering.receiver_address.clone(), self.env.clone(), data_sink_reg_handle.clone(), ); - self.to_stop.push(single_target_worker); + self.core.to_stop.push(single_target_worker); let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( @@ -714,7 +610,7 @@ where let (resolver, state) = resolve::new_resolver( self.pd_client.clone(), - &self.background_worker, + &self.core.background_worker, storage.get_engine().raft_extension(), ); self.resolver = Some(resolver); @@ -766,7 +662,7 @@ where Box::new(ReadPoolConfigManager::new( unified_read_pool.as_ref().unwrap().handle(), unified_read_pool_scale_notifier, - &self.background_worker, + &self.core.background_worker, self.core.config.readpool.unified.max_thread_count, self.core.config.readpool.unified.auto_adjust_pool_size, )), @@ -829,7 +725,7 @@ where self.core.config.storage.api_version(), self.pd_client.clone(), state, - self.background_worker.clone(), + self.core.background_worker.clone(), Some(health_service.clone()), None, ); @@ -911,7 +807,7 @@ where Arc::clone(&self.security_mgr), ); 
backup_stream_worker.start(backup_stream_endpoint); - self.to_stop.push(backup_stream_worker); + self.core.to_stop.push(backup_stream_worker); Some(backup_stream_scheduler) } else { None @@ -953,6 +849,7 @@ where self.coprocessor_host.clone().unwrap(), ); let split_check_scheduler = self + .core .background_worker .start("split-check", split_check_runner); cfg_controller.register( @@ -1028,7 +925,7 @@ where self.region_info_accessor.clone(), self.core.config.storage.ttl_check_poll_interval.into(), )); - self.to_stop.push(ttl_checker); + self.core.to_stop.push(ttl_checker); } // Start CDC. @@ -1050,7 +947,7 @@ where self.causal_ts_provider.clone(), ); cdc_worker.start_with_timer(cdc_endpoint); - self.to_stop.push(cdc_worker); + self.core.to_stop.push(cdc_worker); // Start resolved ts if let Some(mut rts_worker) = rts_worker { @@ -1065,7 +962,7 @@ where self.security_mgr.clone(), ); rts_worker.start_with_timer(rts_endpoint); - self.to_stop.push(rts_worker); + self.core.to_stop.push(rts_worker); } cfg_controller.register( @@ -1169,7 +1066,7 @@ where .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); // Backup service. - let mut backup_worker = Box::new(self.background_worker.lazy_build("backup-endpoint")); + let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); let backup_service = backup::Service::::with_router(backup_scheduler, self.router.clone()); @@ -1265,13 +1162,15 @@ where // `cached_latest_tablets` is passed to `update` to avoid memory // allocation each time when calling `update`. 
let mut cached_latest_tablets = HashMap::default(); - self.background_worker - .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { + self.core.background_worker.spawn_interval_task( + DEFAULT_METRICS_FLUSH_INTERVAL, + move || { let now = Instant::now(); engine_metrics.flush(now); io_metrics.flush(now); engines_info_clone.update(now, &mut cached_latest_tablets); - }); + }, + ); if let Some(limiter) = get_io_rate_limiter() { limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); } @@ -1279,90 +1178,11 @@ where let mut mem_trace_metrics = MemoryTraceManager::default(); mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); - self.background_worker - .spawn_interval_task(DEFAULT_MEMTRACE_FLUSH_INTERVAL, move || { + self.core.background_worker.spawn_interval_task( + DEFAULT_MEMTRACE_FLUSH_INTERVAL, + move || { let now = Instant::now(); mem_trace_metrics.flush(now); - }); - } - - // Only background cpu quota tuning is implemented at present. 
iops and frontend - // quota tuning is on the way - fn init_quota_tuning_task(&self, quota_limiter: Arc) { - // No need to do auto tune when capacity is really low - if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO - < BACKGROUND_REQUEST_CORE_LOWER_BOUND - { - return; - }; - - // Determine the base cpu quota - let base_cpu_quota = - // if cpu quota is not specified, start from optimistic case - if quota_limiter.cputime_limiter(false).is_infinite() { - 1000_f64 - * f64::max( - BACKGROUND_REQUEST_CORE_LOWER_BOUND, - SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, - ) - } else { - quota_limiter.cputime_limiter(false) / 1000_f64 - }; - - // Calculate the celling and floor quota - let celling_quota = f64::min( - base_cpu_quota * 2.0, - 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, - ); - let floor_quota = f64::max( - base_cpu_quota * 0.5, - 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, - ); - - let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); - self.background_worker.spawn_interval_task( - DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, - move || { - if quota_limiter.auto_tune_enabled() { - let cputime_limit = quota_limiter.cputime_limiter(false); - let old_quota = if cputime_limit.is_infinite() { - base_cpu_quota - } else { - cputime_limit / 1000_f64 - }; - let cpu_usage = match proc_stats.cpu_usage() { - Ok(r) => r, - Err(_e) => 0.0, - }; - // Try tuning quota when cpu_usage is correctly collected. - // rule based tuning: - // - if instance is busy, shrink cpu quota for analyze by one quota pace until - // lower bound is hit; - // - if instance cpu usage is healthy, no op; - // - if instance is idle, increase cpu quota by one quota pace until upper - // bound is hit. 
- if cpu_usage > 0.0f64 { - let mut target_quota = old_quota; - - let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); - if cpu_util >= SYSTEM_BUSY_THRESHOLD { - target_quota = - f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); - } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { - target_quota = - f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); - } - - if old_quota != target_quota { - quota_limiter.set_cpu_time_limit(target_quota as usize, false); - debug!( - "cpu_time_limiter tuned for backend request"; - "cpu_util" => ?cpu_util, - "new quota" => ?target_quota); - INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); - } - } - } }, ); } @@ -1395,7 +1215,7 @@ where (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, } } - self.background_worker + self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { let disk_stats = match fs2::statvfs(&store_path) { Err(e) => { @@ -1548,7 +1368,7 @@ where if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { error_unknown!(%e; "failed to bind addr for status service"); } else { - self.to_stop.push(status_server); + self.core.to_stop.push(status_server); } } } @@ -1570,117 +1390,7 @@ where sst_worker.stop_worker(); } - self.to_stop.into_iter().for_each(|s| s.stop()); - } -} - -pub trait ConfiguredRaftEngine: RaftEngine { - fn build( - _: &TikvConfig, - _: &Arc, - _: &Option>, - _: &Cache, - ) -> (Self, Option>); - fn as_rocks_engine(&self) -> Option<&RocksEngine>; - fn register_config(&self, _cfg_controller: &mut ConfigController); -} - -impl ConfiguredRaftEngine for T { - default fn build( - _: &TikvConfig, - _: &Arc, - _: &Option>, - _: &Cache, - ) -> (Self, Option>) { - unimplemented!() - } - default fn as_rocks_engine(&self) -> Option<&RocksEngine> { - None - } - default fn register_config(&self, _cfg_controller: &mut ConfigController) {} -} - -impl ConfiguredRaftEngine for RocksEngine { - fn build( - 
config: &TikvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Cache, - ) -> (Self, Option>) { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_engine.config().dir, - &config.raft_store.raftdb_path, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_db_path = &config.raft_store.raftdb_path; - let config_raftdb = &config.raftdb; - let statistics = Arc::new(RocksStatistics::new_titan()); - let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - - if should_dump { - let raft_engine = - RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) - .expect("failed to open raft engine for migration"); - dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); - raft_engine.stop(); - drop(raft_engine); - raft_data_state_machine.after_dump_data(); - } - (raftdb, Some(statistics)) - } - - fn as_rocks_engine(&self) -> Option<&RocksEngine> { - Some(self) - } - - fn register_config(&self, cfg_controller: &mut ConfigController) { - cfg_controller.register( - tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), - ); - } -} - -impl ConfiguredRaftEngine for RaftLogEngine { - fn build( - config: &TikvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Cache, - ) -> (Self, Option>) { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_store.raftdb_path, - &config.raft_engine.config().dir, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_config = config.raft_engine.config(); - let raft_engine = - RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) - .expect("failed to open raft engine"); - - 
if should_dump { - let config_raftdb = &config.raftdb; - let raft_db_opts = config_raftdb.build_opt(env.clone(), None); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::util::new_engine_opt( - &config.raft_store.raftdb_path, - raft_db_opts, - raft_cf_opts, - ) - .expect("failed to open raftdb for migration"); - dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); - raftdb.stop(); - drop(raftdb); - raft_data_state_machine.after_dump_data(); - } - (raft_engine, None) + self.core.to_stop.into_iter().for_each(|s| s.stop()); } } @@ -1783,198 +1493,9 @@ fn pre_start() { } } -/// A small trait for components which can be trivially stopped. Lets us keep -/// a list of these in `TiKV`, rather than storing each component individually. -pub(crate) trait Stop { - fn stop(self: Box); -} - -impl Stop for StatusServer -where - R: 'static + Send, -{ - fn stop(self: Box) { - (*self).stop() - } -} - -impl Stop for Worker { - fn stop(self: Box) { - Worker::stop(&self); - } -} - -impl Stop for LazyWorker { - fn stop(self: Box) { - self.stop_worker(); - } -} - -pub struct EngineMetricsManager { - tablet_registry: TabletRegistry, - kv_statistics: Option>, - kv_is_titan: bool, - raft_engine: ER, - raft_statistics: Option>, - last_reset: Instant, -} - -impl EngineMetricsManager { - pub fn new( - tablet_registry: TabletRegistry, - kv_statistics: Option>, - kv_is_titan: bool, - raft_engine: ER, - raft_statistics: Option>, - ) -> Self { - EngineMetricsManager { - tablet_registry, - kv_statistics, - kv_is_titan, - raft_engine, - raft_statistics, - last_reset: Instant::now(), - } - } - - pub fn flush(&mut self, now: Instant) { - let mut reporter = EK::StatisticsReporter::new("kv"); - self.tablet_registry - .for_each_opened_tablet(|_, db: &mut CachedTablet| { - if let Some(db) = db.latest() { - reporter.collect(db); - } - true - }); - reporter.flush(); - self.raft_engine.flush_metrics("raft"); - - if let Some(s) = 
self.kv_statistics.as_ref() { - flush_engine_statistics(s, "kv", self.kv_is_titan); - } - if let Some(s) = self.raft_statistics.as_ref() { - flush_engine_statistics(s, "raft", false); - } - if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - if let Some(s) = self.kv_statistics.as_ref() { - s.reset(); - } - if let Some(s) = self.raft_statistics.as_ref() { - s.reset(); - } - self.last_reset = now; - } - } -} - -pub struct EnginesResourceInfo { - tablet_registry: TabletRegistry, - raft_engine: Option, - latest_normalized_pending_bytes: AtomicU32, - normalized_pending_bytes_collector: MovingAvgU32, -} - -impl EnginesResourceInfo { - const SCALE_FACTOR: u64 = 100; - - fn new( - tablet_registry: TabletRegistry, - raft_engine: Option, - max_samples_to_preserve: usize, - ) -> Self { - EnginesResourceInfo { - tablet_registry, - raft_engine, - latest_normalized_pending_bytes: AtomicU32::new(0), - normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), - } - } - - pub fn update( - &self, - _now: Instant, - cached_latest_tablets: &mut HashMap>, - ) { - let mut normalized_pending_bytes = 0; - - fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { - if let Ok(cf_opts) = engine.get_options_cf(cf) { - if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { - if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { - *normalized_pending_bytes = std::cmp::max( - *normalized_pending_bytes, - (b * EnginesResourceInfo::SCALE_FACTOR - / cf_opts.get_soft_pending_compaction_bytes_limit()) - as u32, - ); - } - } - } - } - - if let Some(raft_engine) = &self.raft_engine { - fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); - } - - self.tablet_registry - .for_each_opened_tablet(|id, db: &mut CachedTablet| { - cached_latest_tablets.insert(id, db.clone()); - true - }); - - // todo(SpadeA): Now, there's a potential race condition problem where the - // tablet 
could be destroyed after the clone and before the fetching - // which could result in programme panic. It's okay now as the single global - // kv_engine will not be destroyed in normal operation and v2 is not - // ready for operation. Furthermore, this race condition is general to v2 as - // tablet clone is not a case exclusively happened here. We should - // propose another PR to tackle it such as destory tablet lazily in a GC - // thread. - - for (_, cache) in cached_latest_tablets.iter_mut() { - let Some(tablet) = cache.latest() else { continue }; - for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { - fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); - } - } - - // Clear ensures that these tablets are not hold forever. - cached_latest_tablets.clear(); - - let (_, avg) = self - .normalized_pending_bytes_collector - .add(normalized_pending_bytes); - self.latest_normalized_pending_bytes.store( - std::cmp::max(normalized_pending_bytes, avg), - Ordering::Relaxed, - ); - } -} - -impl IoBudgetAdjustor for EnginesResourceInfo { - fn adjust(&self, total_budgets: usize) -> usize { - let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 - / Self::SCALE_FACTOR as f32; - // Two reasons for adding `sqrt` on top: - // 1) In theory the convergence point is independent of the value of pending - // bytes (as long as backlog generating rate equals consuming rate, which is - // determined by compaction budgets), a convex helps reach that point while - // maintaining low level of pending bytes. - // 2) Variance of compaction pending bytes grows with its magnitude, a filter - // with decreasing derivative can help balance such trend. - let score = score.sqrt(); - // The target global write flow slides between Bandwidth / 2 and Bandwidth. 
- let score = 0.5 + score / 2.0; - (total_budgets as f32 * score) as usize - } -} - #[cfg(test)] mod test { - use std::{ - collections::HashMap, - sync::{atomic::Ordering, Arc}, - }; + use std::{collections::HashMap, sync::Arc}; use engine_rocks::raw::Env; use engine_traits::{ @@ -2054,9 +1575,7 @@ mod test { // bytes of tablet_1_20 assert_eq!( (new_pending_compaction_bytes * 100) as u32, - engines_info - .latest_normalized_pending_bytes - .load(Ordering::Relaxed) + engines_info.latest_normalized_pending_bytes() ); } } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 81ec94207a9..a29c344884f 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -16,10 +16,7 @@ use std::{ collections::HashMap, path::{Path, PathBuf}, str::FromStr, - sync::{ - atomic::{AtomicU32, AtomicU64, Ordering}, - mpsc, Arc, - }, + sync::{atomic::AtomicU64, mpsc, Arc}, time::Duration, u64, }; @@ -27,19 +24,9 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; -use encryption_export::DataKeyManager; -use engine_rocks::{ - flush_engine_statistics, from_rocks_compression_type, - raw::{Cache, Env}, - RocksEngine, RocksStatistics, -}; -use engine_traits::{ - CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, - RaftEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, -}; -use file_system::{ - get_io_rate_limiter, BytesFetcher, IoBudgetAdjustor, MetricsManager as IoMetricsManager, -}; +use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; +use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; +use file_system::{get_io_rate_limiter, BytesFetcher, MetricsManager as IoMetricsManager}; use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; @@ -82,8 +69,7 @@ use 
tikv::{ resolve, service::DiagnosticsService, status_server::StatusServer, - KvEngineFactoryBuilder, NodeV2, RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, - GRPC_THREAD_PREFIX, + KvEngineFactoryBuilder, NodeV2, RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, GRPC_THREAD_PREFIX, }, storage::{ self, @@ -96,14 +82,9 @@ use tikv::{ }; use tikv_util::{ check_environment_variables, - config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, - math::MovingAvgU32, - metrics::INSTANCE_BACKEND_CPU_QUOTA, + config::VersionTrack, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{ - cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, - SysQuota, - }, + sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, @@ -113,28 +94,13 @@ use tikv_util::{ use tokio::runtime::Builder; use crate::{ - common::{check_system_config, TikvServerCore}, + common::{ConfiguredRaftEngine, EngineMetricsManager, EnginesResourceInfo, TikvServerCore}, memory::*, - raft_engine_switch::*, - server::Stop, setup::*, signal_handler, tikv_util::sys::thread::ThreadBuildWrapper, }; -// minimum number of core kept for background requests -const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; -// max ratio of core quota for background requests -const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; -// default ratio of core quota for background requests = core_number * 0.5 -const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; -// indication of TiKV instance is short of cpu -const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; -// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) -const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; -// pace of cpu quota adjustment -const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu - #[inline] fn run_impl(config: TikvConfig) { let mut tikv = 
TikvServer::::init::(config); @@ -157,7 +123,7 @@ fn run_impl(config: TikvConfig) { tikv.init_storage_stats_task(); tikv.run_server(server_config); tikv.run_status_server(); - tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); + tikv.core.init_quota_tuning_task(tikv.quota_limiter.clone()); // TODO: support signal dump stats signal_handler::wait_for_signal( @@ -200,9 +166,7 @@ pub fn run_tikv(config: TikvConfig) { const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); -const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); -const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); /// A complete TiKV server. struct TikvServer { @@ -220,10 +184,8 @@ struct TikvServer { servers: Option>, region_info_accessor: Option, coprocessor_host: Option>, - to_stop: Vec>, concurrency_manager: ConcurrencyManager, env: Arc, - background_worker: Worker, check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, @@ -265,11 +227,14 @@ where .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) .build(), ); - let pd_client = - Self::connect_to_pd_cluster(&mut config, env.clone(), Arc::clone(&security_mgr)); + let pd_client = TikvServerCore::connect_to_pd_cluster( + &mut config, + env.clone(), + Arc::clone(&security_mgr), + ); // Initialize and check config - let cfg_controller = Self::init_config(config); + let cfg_controller = TikvServerCore::init_config(config); let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); @@ -342,6 +307,8 @@ where encryption_key_manager: None, flow_info_sender: None, flow_info_receiver: None, + to_stop: vec![], + background_worker, }, cfg_controller: Some(cfg_controller), security_mgr, @@ -356,10 +323,8 @@ where servers: None, region_info_accessor: None, 
coprocessor_host: None, - to_stop: vec![], concurrency_manager, env, - background_worker, check_leader_worker, sst_worker: None, quota_limiter, @@ -369,77 +334,6 @@ where } } - /// Initialize and check the config - /// - /// Warnings are logged and fatal errors exist. - /// - /// # Fatal errors - /// - /// - If `dynamic config` feature is enabled and failed to register config - /// to PD - /// - If some critical configs (like data dir) are differrent from last run - /// - If the config can't pass `validate()` - /// - If the max open file descriptor limit is not high enough to support - /// the main database and the raft database. - fn init_config(mut config: TikvConfig) -> ConfigController { - validate_and_persist_config(&mut config, true); - - ensure_dir_exist(&config.storage.data_dir).unwrap(); - if !config.rocksdb.wal_dir.is_empty() { - ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); - } - if config.raft_engine.enable { - ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); - } else { - ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); - if !config.raftdb.wal_dir.is_empty() { - ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); - } - } - - check_system_config(&config); - - tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); - - info!( - "using config"; - "config" => serde_json::to_string(&config).unwrap(), - ); - if config.panic_when_unexpected_key_or_data { - info!("panic-when-unexpected-key-or-data is on"); - tikv_util::set_panic_when_unexpected_key_or_data(true); - } - - config.write_into_metrics(); - - ConfigController::new(config) - } - - fn connect_to_pd_cluster( - config: &mut TikvConfig, - env: Arc, - security_mgr: Arc, - ) -> Arc { - let pd_client = Arc::new( - RpcClient::new(&config.pd, Some(env), security_mgr) - .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), - ); - - let cluster_id = pd_client - .get_cluster_id() - .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); - if 
cluster_id == DEFAULT_CLUSTER_ID { - fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); - } - config.server.cluster_id = cluster_id; - info!( - "connect to PD cluster"; - "cluster_id" => cluster_id - ); - - pd_client - } - fn init_gc_worker(&mut self) -> GcWorker> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( @@ -505,14 +399,14 @@ where pd_sender.clone(), engines.engine.clone(), resource_ctl, - CleanupMethod::Remote(self.background_worker.remote()), + CleanupMethod::Remote(self.core.background_worker.remote()), )) } else { None }; if let Some(unified_read_pool) = &unified_read_pool { let handle = unified_read_pool.handle(); - self.background_worker.spawn_interval_task( + self.core.background_worker.spawn_interval_task( UPDATE_EWMA_TIME_SLICE_INTERVAL, move || { handle.update_ewma_time_slice(); @@ -540,19 +434,19 @@ where resource_metering::init_recorder( self.core.config.resource_metering.precision.as_millis(), ); - self.to_stop.push(recorder_worker); + self.core.to_stop.push(recorder_worker); let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( self.core.config.resource_metering.clone(), collector_reg_handle.clone(), ); - self.to_stop.push(reporter_worker); + self.core.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( self.core.config.resource_metering.receiver_address.clone(), self.env.clone(), data_sink_reg_handle.clone(), ); - self.to_stop.push(single_target_worker); + self.core.to_stop.push(single_target_worker); let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( @@ -607,7 +501,7 @@ where let (resolver, state) = resolve::new_resolver( self.pd_client.clone(), - &self.background_worker, + &self.core.background_worker, storage.get_engine().raft_extension(), ); self.resolver = Some(resolver); @@ -649,7 +543,7 @@ 
where Box::new(ReadPoolConfigManager::new( unified_read_pool.as_ref().unwrap().handle(), unified_read_pool_scale_notifier, - &self.background_worker, + &self.core.background_worker, self.core.config.readpool.unified.max_thread_count, self.core.config.readpool.unified.auto_adjust_pool_size, )), @@ -791,7 +685,7 @@ where self.coprocessor_host.clone().unwrap(), auto_split_controller, collector_reg_handle, - self.background_worker.clone(), + self.core.background_worker.clone(), pd_worker, raft_store, &state, @@ -831,7 +725,7 @@ where let engines = self.engines.as_ref().unwrap(); // Backup service. - let mut backup_worker = Box::new(self.background_worker.lazy_build("backup-endpoint")); + let mut backup_worker = Box::new(self.core.background_worker.lazy_build("backup-endpoint")); let backup_scheduler = backup_worker.scheduler(); let backup_service = backup::Service::::new(backup_scheduler); if servers @@ -947,13 +841,15 @@ where // `cached_latest_tablets` is passed to `update` to avoid memory // allocation each time when calling `update`. 
let mut cached_latest_tablets = HashMap::default(); - self.background_worker - .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { + self.core.background_worker.spawn_interval_task( + DEFAULT_METRICS_FLUSH_INTERVAL, + move || { let now = Instant::now(); engine_metrics.flush(now); io_metrics.flush(now); engines_info_clone.update(now, &mut cached_latest_tablets); - }); + }, + ); if let Some(limiter) = get_io_rate_limiter() { limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); } @@ -961,90 +857,11 @@ where let mut mem_trace_metrics = MemoryTraceManager::default(); mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); - self.background_worker - .spawn_interval_task(DEFAULT_MEMTRACE_FLUSH_INTERVAL, move || { + self.core.background_worker.spawn_interval_task( + DEFAULT_MEMTRACE_FLUSH_INTERVAL, + move || { let now = Instant::now(); mem_trace_metrics.flush(now); - }); - } - - // Only background cpu quota tuning is implemented at present. 
iops and frontend - // quota tuning is on the way - fn init_quota_tuning_task(&self, quota_limiter: Arc) { - // No need to do auto tune when capacity is really low - if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO - < BACKGROUND_REQUEST_CORE_LOWER_BOUND - { - return; - }; - - // Determine the base cpu quota - let base_cpu_quota = - // if cpu quota is not specified, start from optimistic case - if quota_limiter.cputime_limiter(false).is_infinite() { - 1000_f64 - * f64::max( - BACKGROUND_REQUEST_CORE_LOWER_BOUND, - SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, - ) - } else { - quota_limiter.cputime_limiter(false) / 1000_f64 - }; - - // Calculate the celling and floor quota - let celling_quota = f64::min( - base_cpu_quota * 2.0, - 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, - ); - let floor_quota = f64::max( - base_cpu_quota * 0.5, - 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, - ); - - let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); - self.background_worker.spawn_interval_task( - DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, - move || { - if quota_limiter.auto_tune_enabled() { - let cputime_limit = quota_limiter.cputime_limiter(false); - let old_quota = if cputime_limit.is_infinite() { - base_cpu_quota - } else { - cputime_limit / 1000_f64 - }; - let cpu_usage = match proc_stats.cpu_usage() { - Ok(r) => r, - Err(_e) => 0.0, - }; - // Try tuning quota when cpu_usage is correctly collected. - // rule based tuning: - // - if instance is busy, shrink cpu quota for analyze by one quota pace until - // lower bound is hit; - // - if instance cpu usage is healthy, no op; - // - if instance is idle, increase cpu quota by one quota pace until upper - // bound is hit. 
- if cpu_usage > 0.0f64 { - let mut target_quota = old_quota; - - let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); - if cpu_util >= SYSTEM_BUSY_THRESHOLD { - target_quota = - f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); - } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { - target_quota = - f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); - } - - if old_quota != target_quota { - quota_limiter.set_cpu_time_limit(target_quota as usize, false); - debug!( - "cpu_time_limiter tuned for backend request"; - "cpu_util" => ?cpu_util, - "new quota" => ?target_quota); - INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); - } - } - } }, ); } @@ -1079,7 +896,7 @@ where (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, } } - self.background_worker + self.core.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { let disk_stats = match fs2::statvfs(&store_path) { Err(e) => { @@ -1238,7 +1055,7 @@ where if let Err(e) = status_server.start(self.core.config.server.status_addr.clone()) { error_unknown!(%e; "failed to bind addr for status service"); } else { - self.to_stop.push(status_server); + self.core.to_stop.push(status_server); } } } @@ -1260,117 +1077,7 @@ where sst_worker.stop_worker(); } - self.to_stop.into_iter().for_each(|s| s.stop()); - } -} - -pub trait ConfiguredRaftEngine: RaftEngine { - fn build( - _: &TikvConfig, - _: &Arc, - _: &Option>, - _: &Cache, - ) -> (Self, Option>); - fn as_rocks_engine(&self) -> Option<&RocksEngine>; - fn register_config(&self, _cfg_controller: &mut ConfigController); -} - -impl ConfiguredRaftEngine for T { - default fn build( - _: &TikvConfig, - _: &Arc, - _: &Option>, - _: &Cache, - ) -> (Self, Option>) { - unimplemented!() - } - default fn as_rocks_engine(&self) -> Option<&RocksEngine> { - None - } - default fn register_config(&self, _cfg_controller: &mut ConfigController) {} -} - -impl ConfiguredRaftEngine for RocksEngine { - fn build( - 
config: &TikvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Cache, - ) -> (Self, Option>) { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_engine.config().dir, - &config.raft_store.raftdb_path, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_db_path = &config.raft_store.raftdb_path; - let config_raftdb = &config.raftdb; - let statistics = Arc::new(RocksStatistics::new_titan()); - let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - - if should_dump { - let raft_engine = - RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) - .expect("failed to open raft engine for migration"); - dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); - raft_engine.stop(); - drop(raft_engine); - raft_data_state_machine.after_dump_data(); - } - (raftdb, Some(statistics)) - } - - fn as_rocks_engine(&self) -> Option<&RocksEngine> { - Some(self) - } - - fn register_config(&self, cfg_controller: &mut ConfigController) { - cfg_controller.register( - tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), - ); - } -} - -impl ConfiguredRaftEngine for RaftLogEngine { - fn build( - config: &TikvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Cache, - ) -> (Self, Option>) { - let mut raft_data_state_machine = RaftDataStateMachine::new( - &config.storage.data_dir, - &config.raft_store.raftdb_path, - &config.raft_engine.config().dir, - ); - let should_dump = raft_data_state_machine.before_open_target(); - - let raft_config = config.raft_engine.config(); - let raft_engine = - RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) - .expect("failed to open raft engine"); - - 
if should_dump { - let config_raftdb = &config.raftdb; - let raft_db_opts = config_raftdb.build_opt(env.clone(), None); - let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let raftdb = engine_rocks::util::new_engine_opt( - &config.raft_store.raftdb_path, - raft_db_opts, - raft_cf_opts, - ) - .expect("failed to open raftdb for migration"); - dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); - raftdb.stop(); - drop(raftdb); - raft_data_state_machine.after_dump_data(); - } - (raft_engine, None) + self.core.to_stop.into_iter().for_each(|s| s.stop()); } } @@ -1488,171 +1195,10 @@ fn pre_start() { ); } } -pub struct EngineMetricsManager { - tablet_registry: TabletRegistry, - kv_statistics: Option>, - kv_is_titan: bool, - raft_engine: ER, - raft_statistics: Option>, - last_reset: Instant, -} - -impl EngineMetricsManager { - pub fn new( - tablet_registry: TabletRegistry, - kv_statistics: Option>, - kv_is_titan: bool, - raft_engine: ER, - raft_statistics: Option>, - ) -> Self { - EngineMetricsManager { - tablet_registry, - kv_statistics, - kv_is_titan, - raft_engine, - raft_statistics, - last_reset: Instant::now(), - } - } - - pub fn flush(&mut self, now: Instant) { - let mut reporter = EK::StatisticsReporter::new("kv"); - self.tablet_registry - .for_each_opened_tablet(|_, db: &mut CachedTablet| { - if let Some(db) = db.latest() { - reporter.collect(db); - } - true - }); - reporter.flush(); - self.raft_engine.flush_metrics("raft"); - - if let Some(s) = self.kv_statistics.as_ref() { - flush_engine_statistics(s, "kv", self.kv_is_titan); - } - if let Some(s) = self.raft_statistics.as_ref() { - flush_engine_statistics(s, "raft", false); - } - if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - if let Some(s) = self.kv_statistics.as_ref() { - s.reset(); - } - if let Some(s) = self.raft_statistics.as_ref() { - s.reset(); - } - self.last_reset = now; - } - } -} - -pub struct EnginesResourceInfo { - 
tablet_registry: TabletRegistry, - raft_engine: Option, - latest_normalized_pending_bytes: AtomicU32, - normalized_pending_bytes_collector: MovingAvgU32, -} - -impl EnginesResourceInfo { - const SCALE_FACTOR: u64 = 100; - - fn new( - tablet_registry: TabletRegistry, - raft_engine: Option, - max_samples_to_preserve: usize, - ) -> Self { - EnginesResourceInfo { - tablet_registry, - raft_engine, - latest_normalized_pending_bytes: AtomicU32::new(0), - normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), - } - } - - pub fn update( - &self, - _now: Instant, - cached_latest_tablets: &mut HashMap>, - ) { - let mut normalized_pending_bytes = 0; - - fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { - if let Ok(cf_opts) = engine.get_options_cf(cf) { - if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { - if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { - *normalized_pending_bytes = std::cmp::max( - *normalized_pending_bytes, - (b * EnginesResourceInfo::SCALE_FACTOR - / cf_opts.get_soft_pending_compaction_bytes_limit()) - as u32, - ); - } - } - } - } - - if let Some(raft_engine) = &self.raft_engine { - fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); - } - - self.tablet_registry - .for_each_opened_tablet(|id, db: &mut CachedTablet| { - cached_latest_tablets.insert(id, db.clone()); - true - }); - - // todo(SpadeA): Now, there's a potential race condition problem where the - // tablet could be destroyed after the clone and before the fetching - // which could result in programme panic. It's okay now as the single global - // kv_engine will not be destroyed in normal operation and v2 is not - // ready for operation. Furthermore, this race condition is general to v2 as - // tablet clone is not a case exclusively happened here. We should - // propose another PR to tackle it such as destory tablet lazily in a GC - // thread. 
- - for (_, cache) in cached_latest_tablets.iter_mut() { - let Some(tablet) = cache.latest() else { continue }; - for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { - fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); - } - } - - // Clear ensures that these tablets are not hold forever. - cached_latest_tablets.clear(); - - let (_, avg) = self - .normalized_pending_bytes_collector - .add(normalized_pending_bytes); - self.latest_normalized_pending_bytes.store( - std::cmp::max(normalized_pending_bytes, avg), - Ordering::Relaxed, - ); - } -} - -impl IoBudgetAdjustor for EnginesResourceInfo { - fn adjust(&self, total_budgets: usize) -> usize { - let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 - / Self::SCALE_FACTOR as f32; - // Two reasons for adding `sqrt` on top: - // 1) In theory the convergence point is independent of the value of pending - // bytes (as long as backlog generating rate equals consuming rate, which is - // determined by compaction budgets), a convex helps reach that point while - // maintaining low level of pending bytes. - // 2) Variance of compaction pending bytes grows with its magnitude, a filter - // with decreasing derivative can help balance such trend. - let score = score.sqrt(); - // The target global write flow slides between Bandwidth / 2 and Bandwidth. 
- let score = 0.5 + score / 2.0; - (total_budgets as f32 * score) as usize - } -} #[cfg(test)] mod test { - use std::{ - collections::HashMap, - sync::{atomic::Ordering, Arc}, - }; + use std::{collections::HashMap, sync::Arc}; use engine_rocks::raw::Env; use engine_traits::{ @@ -1732,9 +1278,7 @@ mod test { // bytes of tablet_1_20 assert_eq!( (new_pending_compaction_bytes * 100) as u32, - engines_info - .latest_normalized_pending_bytes - .load(Ordering::Relaxed) + engines_info.latest_normalized_pending_bytes() ); } } diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index a02af6ad177..0e96d976449 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -10,7 +10,7 @@ use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; -use engine_traits::{RaftEngine, RaftEngineReadOnly, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine, RaftEngineReadOnly, TabletRegistry}; use futures::Future; use kvproto::{ kvrpcpb::ApiVersion, @@ -53,12 +53,12 @@ use tikv_util::{ use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; #[derive(Clone)] -pub struct ChannelTransport { - core: Arc>, +pub struct ChannelTransport { + core: Arc>>, } -impl ChannelTransport { - pub fn new() -> ChannelTransport { +impl ChannelTransport { + pub fn new() -> Self { ChannelTransport { core: Arc::new(Mutex::new(ChannelTransportCore { snap_paths: HashMap::default(), @@ -67,12 +67,12 @@ impl ChannelTransport { } } - pub fn core(&self) -> &Arc> { + pub fn core(&self) -> &Arc>> { &self.core } } -impl Transport for ChannelTransport { +impl Transport for ChannelTransport { fn send(&mut self, msg: RaftMessage) -> raftstore::Result<()> { let from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); @@ -131,30 +131,30 @@ impl Transport for 
ChannelTransport { fn flush(&mut self) {} } -pub struct ChannelTransportCore { +pub struct ChannelTransportCore { pub snap_paths: HashMap, - pub routers: HashMap>>, + pub routers: HashMap>>, } -impl Default for ChannelTransport { +impl Default for ChannelTransport { fn default() -> Self { Self::new() } } -type SimulateChannelTransport = SimulateTransport; +type SimulateChannelTransport = SimulateTransport>; -pub struct NodeCluster { - trans: ChannelTransport, +pub struct NodeCluster { + trans: ChannelTransport, pd_client: Arc, - nodes: HashMap>, - simulate_trans: HashMap, + nodes: HashMap>, + simulate_trans: HashMap>, concurrency_managers: HashMap, // snap_mgrs: HashMap, } -impl NodeCluster { - pub fn new(pd_client: Arc) -> NodeCluster { +impl NodeCluster { + pub fn new(pd_client: Arc) -> Self { NodeCluster { trans: ChannelTransport::new(), pd_client, @@ -166,7 +166,7 @@ impl NodeCluster { } } -impl Simulator for NodeCluster { +impl Simulator for NodeCluster { fn get_node_ids(&self) -> HashSet { self.nodes.keys().cloned().collect() } @@ -189,10 +189,10 @@ impl Simulator for NodeCluster { &mut self, node_id: u64, cfg: Config, - store_meta: Arc>>, + store_meta: Arc>>, key_manager: Option>, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, _resource_manager: &Option>, ) -> ServerResult { assert!(!self.nodes.contains_key(&node_id)); @@ -345,9 +345,8 @@ impl Simulator for NodeCluster { fn async_snapshot( &mut self, request: RaftCmdRequest, - ) -> impl Future< - Output = std::result::Result, RaftCmdResponse>, - > + Send { + ) -> impl Future, RaftCmdResponse>> + Send + { let node_id = request.get_header().get_peer().get_store_id(); if !self .trans @@ -409,7 +408,7 @@ impl Simulator for NodeCluster { .unwrap(); } - fn get_router(&self, node_id: u64) -> Option> { + fn get_router(&self, node_id: u64) -> Option> { self.nodes.get(&node_id).map(|node| node.router().clone()) } @@ -439,7 +438,7 @@ impl Simulator for NodeCluster { 
// Compare to server cluster, node cluster does not have server layer and // storage layer. -pub fn new_node_cluster(id: u64, count: usize) -> Cluster { +pub fn new_node_cluster(id: u64, count: usize) -> Cluster, RocksEngine> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new( @@ -454,7 +453,10 @@ pub fn new_node_cluster(id: u64, count: usize) -> Cluster Cluster { +pub fn new_incompatible_node_cluster( + id: u64, + count: usize, +) -> Cluster, RocksEngine> { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(NodeCluster::new(Arc::clone(&pd_client)))); Cluster::new( diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 921d3b991ab..804a5e4a22f 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -12,7 +12,7 @@ use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use futures::{executor::block_on, Future}; @@ -86,23 +86,26 @@ impl FlowStatsReporter for DummyReporter { fn report_write_stats(&self, _write_stats: WriteStats) {} } -type SimulateRaftExtension = ::RaftExtension; -type SimulateStoreTransport = SimulateTransport>; -type SimulateServerTransport = - SimulateTransport>; +type SimulateRaftExtension = as Engine>::RaftExtension; +type SimulateStoreTransport = SimulateTransport>; +type SimulateServerTransport = + SimulateTransport, PdStoreAddrResolver>>; -pub type SimulateEngine = RaftKv2; +pub type SimulateEngine = RaftKv2; // TestRaftKvv2 behaves the same way with RaftKv2, except that it has filters // that can mock various network conditions. 
#[derive(Clone)] -pub struct TestRaftKv2 { - raftkv: SimulateEngine, +pub struct TestRaftKv2 { + raftkv: SimulateEngine, filters: Arc>>>, } -impl TestRaftKv2 { - pub fn new(raftkv: SimulateEngine, filters: Arc>>>) -> TestRaftKv2 { +impl TestRaftKv2 { + pub fn new( + raftkv: SimulateEngine, + filters: Arc>>>, + ) -> TestRaftKv2 { TestRaftKv2 { raftkv, filters } } @@ -111,15 +114,15 @@ impl TestRaftKv2 { } } -impl Engine for TestRaftKv2 { - type Snap = RegionSnapshot<::Snapshot>; - type Local = RocksEngine; +impl Engine for TestRaftKv2 { + type Snap = RegionSnapshot; + type Local = EK; fn kv_engine(&self) -> Option { self.raftkv.kv_engine() } - type RaftExtension = TestExtension; + type RaftExtension = TestExtension; fn raft_extension(&self) -> Self::RaftExtension { TestExtension::new(self.raftkv.raft_extension(), self.filters.clone()) } @@ -131,12 +134,12 @@ impl Engine for TestRaftKv2 { self.raftkv.modify_on_kv_engine(region_modifies) } - type SnapshotRes = ::SnapshotRes; + type SnapshotRes = as Engine>::SnapshotRes; fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { self.raftkv.async_snapshot(ctx) } - type WriteRes = ::WriteRes; + type WriteRes = as Engine>::WriteRes; fn async_write( &self, ctx: &Context, @@ -159,21 +162,21 @@ impl Engine for TestRaftKv2 { } #[derive(Clone)] -pub struct TestExtension { - extension: Extension, +pub struct TestExtension { + extension: Extension, filters: Arc>>>, } -impl TestExtension { +impl TestExtension { pub fn new( - extension: Extension, + extension: Extension, filters: Arc>>>, ) -> Self { TestExtension { extension, filters } } } -impl RaftExtension for TestExtension { +impl RaftExtension for TestExtension { fn feed(&self, msg: RaftMessage, key_message: bool) { let send = |msg| -> raftstore::Result<()> { self.extension.feed(msg, key_message); @@ -236,22 +239,22 @@ impl RaftExtension for TestExtension { } } -pub struct ServerMeta { - node: NodeV2, - server: Server, - sim_router: SimulateStoreTransport, - 
sim_trans: SimulateServerTransport, - raw_router: StoreRouter, - gc_worker: GcWorker, +pub struct ServerMeta { + node: NodeV2, + server: Server>, + sim_router: SimulateStoreTransport, + sim_trans: SimulateServerTransport, + raw_router: StoreRouter, + gc_worker: GcWorker>, rsmeter_cleanup: Box, } type PendingServices = Vec Service>>; -pub struct ServerCluster { - metas: HashMap, +pub struct ServerCluster { + metas: HashMap>, addrs: AddressMap, - pub storages: HashMap, + pub storages: HashMap>, pub region_info_accessors: HashMap, snap_paths: HashMap, snap_mgrs: HashMap, @@ -266,8 +269,8 @@ pub struct ServerCluster { pub causal_ts_providers: HashMap>, } -impl ServerCluster { - pub fn new(pd_client: Arc) -> ServerCluster { +impl ServerCluster { + pub fn new(pd_client: Arc) -> Self { let env = Arc::new( EnvBuilder::new() .cq_count(2) @@ -316,10 +319,10 @@ impl ServerCluster { &mut self, node_id: u64, mut cfg: Config, - store_meta: Arc>>, + store_meta: Arc>>, key_manager: Option>, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, resource_manager: &Option>, ) -> ServerResult { let (snap_mgr, snap_mgs_path) = if !self.snap_mgrs.contains_key(&node_id) { @@ -639,7 +642,7 @@ impl ServerCluster { Ok(node_id) } - pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker { + pub fn get_gc_worker(&self, node_id: u64) -> &GcWorker> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -677,7 +680,7 @@ impl ServerCluster { } } -impl Simulator for ServerCluster { +impl Simulator for ServerCluster { fn get_node_ids(&self) -> HashSet { self.metas.keys().cloned().collect() } @@ -718,10 +721,10 @@ impl Simulator for ServerCluster { &mut self, node_id: u64, cfg: Config, - store_meta: Arc>>, + store_meta: Arc>>, key_manager: Option>, raft_engine: RaftTestEngine, - tablet_registry: TabletRegistry, + tablet_registry: TabletRegistry, resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( @@ -754,9 +757,8 @@ impl Simulator for 
ServerCluster { fn async_snapshot( &mut self, request: kvproto::raft_cmdpb::RaftCmdRequest, - ) -> impl Future< - Output = std::result::Result, RaftCmdResponse>, - > + Send { + ) -> impl Future, RaftCmdResponse>> + Send + { let node_id = request.get_header().get_peer().get_store_id(); let mut router = match self.metas.get(&node_id) { None => { @@ -792,7 +794,7 @@ impl Simulator for ServerCluster { Ok(()) } - fn get_router(&self, node_id: u64) -> Option> { + fn get_router(&self, node_id: u64) -> Option> { self.metas.get(&node_id).map(|m| m.raw_router.clone()) } @@ -805,9 +807,9 @@ impl Simulator for ServerCluster { } } -impl Cluster { - pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { - let mut try_snapshot = || -> Option> { +impl Cluster, EK> { + pub fn must_get_snapshot_of_region(&mut self, region_id: u64) -> RegionSnapshot { + let mut try_snapshot = || -> Option> { let leader = self.leader_of_region(region_id)?; let store_id = leader.store_id; let epoch = self.get_region_epoch(region_id); @@ -833,7 +835,10 @@ impl Cluster { } } -pub fn new_server_cluster(id: u64, count: usize) -> Cluster { +pub fn new_server_cluster( + id: u64, + count: usize, +) -> Cluster, RocksEngine> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new( @@ -849,7 +854,7 @@ pub fn new_server_cluster(id: u64, count: usize) -> Cluster Cluster { +) -> Cluster, RocksEngine> { let pd_client = Arc::new(TestPdClient::new(id, true)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new( @@ -866,7 +871,7 @@ pub fn new_server_cluster_with_api_ver( id: u64, count: usize, api_ver: ApiVersion, -) -> Cluster { +) -> Cluster, RocksEngine> { let pd_client = Arc::new(TestPdClient::new(id, false)); let sim = Arc::new(RwLock::new(ServerCluster::new(Arc::clone(&pd_client)))); Cluster::new( @@ -879,14 +884,21 @@ pub fn new_server_cluster_with_api_ver( 
) } -pub fn must_new_cluster_and_kv_client() -> (Cluster, TikvClient, Context) -{ +pub fn must_new_cluster_and_kv_client() -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { must_new_cluster_and_kv_client_mul(1) } pub fn must_new_cluster_and_kv_client_mul( count: usize, -) -> (Cluster, TikvClient, Context) { +) -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { let (cluster, leader, ctx) = must_new_cluster_mul(count); let env = Arc::new(Environment::new(1)); @@ -898,14 +910,22 @@ pub fn must_new_cluster_and_kv_client_mul( } pub fn must_new_cluster_mul( count: usize, -) -> (Cluster, metapb::Peer, Context) { +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { must_new_and_configure_cluster_mul(count, |_| ()) } fn must_new_and_configure_cluster_mul( count: usize, - mut configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + mut configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { let mut cluster = new_server_cluster(0, count); configure(&mut cluster); cluster.run(); @@ -921,8 +941,12 @@ fn must_new_and_configure_cluster_mul( } pub fn must_new_and_configure_cluster_and_kv_client( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, TikvClient, Context) { + configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + TikvClient, + Context, +) { let (cluster, leader, ctx) = must_new_and_configure_cluster(configure); let env = Arc::new(Environment::new(1)); @@ -934,13 +958,20 @@ pub fn must_new_and_configure_cluster_and_kv_client( } pub fn must_new_and_configure_cluster( - configure: impl FnMut(&mut Cluster), -) -> (Cluster, metapb::Peer, Context) { + configure: impl FnMut(&mut Cluster, RocksEngine>), +) -> ( + Cluster, RocksEngine>, + metapb::Peer, + Context, +) { must_new_and_configure_cluster_mul(1, configure) } -pub fn must_new_cluster_and_debug_client() -> (Cluster, DebugClient, u64) -{ +pub fn 
must_new_cluster_and_debug_client() -> ( + Cluster, RocksEngine>, + DebugClient, + u64, +) { let (cluster, leader, _) = must_new_cluster_mul(1); let env = Arc::new(Environment::new(1)); diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index 9f68beaad35..b9e6464c5d8 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -11,7 +11,7 @@ use futures::Future; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; use raftstore::Result; use rand::RngCore; -use server::server2::ConfiguredRaftEngine; +use server::common::ConfiguredRaftEngine; use tempfile::TempDir; use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, Config}; use tikv::{ @@ -163,7 +163,7 @@ pub fn configure_for_lease_read_v2, EK: KvEngine>( } pub fn wait_for_synced( - cluster: &mut Cluster, + cluster: &mut Cluster, RocksEngine>, node_id: u64, region_id: u64, ) { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 81753d49600..cdfe5c8f475 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -43,7 +43,7 @@ use raftstore::{ RaftRouterCompactedEventSender, Result, }; use rand::RngCore; -use server::server::ConfiguredRaftEngine; +use server::common::ConfiguredRaftEngine; use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::{ From 83ce09188780c40be9b780d4995f1ae26f32995d Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 7 Apr 2023 16:20:58 +0800 Subject: [PATCH 618/676] cdc: batch send resolved ts exponentially to speed up TiCDC resolve lock (#14465) close pingcap/tiflow#8561, ref tikv/tikv#11993 cdc: batch send resolved ts exponentially to speed up TiCDC resolve lock Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 144 +++++++++++++++++++++++++-------- 1 file changed, 110 insertions(+), 34 deletions(-) diff --git a/components/cdc/src/endpoint.rs 
b/components/cdc/src/endpoint.rs index b5e15ceee23..efc82e27d6c 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cell::RefCell, cmp::{Ord, Ordering as CmpOrdering, PartialOrd, Reverse}, collections::BinaryHeap, fmt, @@ -297,16 +298,8 @@ impl ResolvedRegionHeap { (min_resolved_ts, outliers) } - fn to_hash_set(&self) -> (TimeStamp, HashSet) { - let mut min_resolved_ts = TimeStamp::max(); - let mut regions = HashSet::with_capacity_and_hasher(self.heap.len(), Default::default()); - for resolved_region in &self.heap { - regions.insert(resolved_region.0.region_id); - if min_resolved_ts > resolved_region.0.resolved_ts { - min_resolved_ts = resolved_region.0.resolved_ts; - } - } - (min_resolved_ts, regions) + fn is_empty(&self) -> bool { + self.heap.is_empty() } fn clear(&mut self) { @@ -349,7 +342,7 @@ pub struct Endpoint { sink_memory_quota: MemoryQuota, old_value_cache: OldValueCache, - resolved_region_heap: ResolvedRegionHeap, + resolved_region_heap: RefCell, causal_ts_provider: Option>, @@ -444,9 +437,9 @@ impl, E: KvEngine> Endpoint { concurrency_manager, min_resolved_ts: TimeStamp::max(), min_ts_region_id: 0, - resolved_region_heap: ResolvedRegionHeap { + resolved_region_heap: RefCell::new(ResolvedRegionHeap { heap: BinaryHeap::new(), - }, + }), old_value_cache, resolved_region_count: 0, unresolved_region_count: 0, @@ -837,7 +830,7 @@ impl, E: KvEngine> Endpoint { fn on_min_ts(&mut self, regions: Vec, min_ts: TimeStamp, current_ts: TimeStamp) { // Reset resolved_regions to empty. 
- let resolved_regions = &mut self.resolved_region_heap; + let mut resolved_regions = self.resolved_region_heap.borrow_mut(); resolved_regions.clear(); let total_region_count = regions.len(); @@ -883,6 +876,7 @@ impl, E: KvEngine> Endpoint { "min_resolved_ts" => self.min_resolved_ts, "min_ts_region_id" => self.min_ts_region_id, "min_ts" => min_ts, + "lag" => ?Duration::from_millis(lag_millis), "ok" => advance_ok, "none" => advance_failed_none, "stale" => advance_failed_stale, @@ -896,13 +890,14 @@ impl, E: KvEngine> Endpoint { // so 1) downstreams know where they should send resolve lock requests, // and 2) resolved ts of normal regions does not fallback. // - // Max number of outliers, in most cases, only a few regions are outliers. - // TODO: figure out how to avoid create hashset every time, saving some CPU. - let max_outlier_count = 32; - let (outlier_min_resolved_ts, outlier_regions) = resolved_regions.pop(max_outlier_count); - let (normal_min_resolved_ts, normal_regions) = resolved_regions.to_hash_set(); - self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); - self.broadcast_resolved_ts(normal_min_resolved_ts, normal_regions); + // Regions are separated exponentially to reduce resolved ts events and + // save CPU for both TiKV and TiCDC. + let mut batch_count = 8; + while !resolved_regions.is_empty() { + let (outlier_min_resolved_ts, outlier_regions) = resolved_regions.pop(batch_count); + self.broadcast_resolved_ts(outlier_min_resolved_ts, outlier_regions); + batch_count *= 4; + } } fn broadcast_resolved_ts(&self, min_resolved_ts: TimeStamp, regions: HashSet) { @@ -1194,6 +1189,7 @@ impl, E: KvEngine> RunnableWithTimer for Endpoin // Reclaim resolved_region_heap memory. 
self.resolved_region_heap + .borrow_mut() .reset_and_shrink_to(self.capture_regions.len()); CDC_CAPTURED_REGION_COUNT.set(self.capture_regions.len() as i64); @@ -1276,7 +1272,11 @@ mod tests { }; use super::*; - use crate::{channel, delegate::ObservedRange, recv_timeout}; + use crate::{ + channel, + delegate::{post_init_downstream, ObservedRange}, + recv_timeout, + }; struct TestEndpointSuite { // The order must ensure `endpoint` be dropped before other fields. @@ -2477,11 +2477,6 @@ mod tests { assert!(regions.contains(&5)); assert!(regions.contains(&6)); - // Empty regions - let (ts, regions) = heap.to_hash_set(); - assert_eq!(ts, TimeStamp::max()); - assert!(regions.is_empty()); - let mut heap1 = ResolvedRegionHeap { heap: BinaryHeap::new(), }; @@ -2495,13 +2490,6 @@ mod tests { assert_eq!(regions.len(), 1); assert!(regions.contains(&3)); - let (ts, regions) = heap1.to_hash_set(); - assert_eq!(ts, 4.into()); - assert_eq!(regions.len(), 3); - assert!(regions.contains(&4)); - assert!(regions.contains(&5)); - assert!(regions.contains(&6)); - heap1.reset_and_shrink_to(3); assert_eq!(3, heap1.heap.capacity()); assert!(heap1.heap.is_empty()); @@ -2510,4 +2498,92 @@ mod tests { heap1.clear(); assert!(heap1.heap.is_empty()); } + + #[test] + fn test_on_min_ts() { + let cfg = CdcConfig { + // Disable automatic advance resolved ts during test. 
+ min_ts_interval: ReadableDuration(Duration::from_secs(1000)), + ..Default::default() + }; + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); + let quota = crate::channel::MemoryQuota::new(usize::MAX); + let (tx, mut rx) = channel::channel(1, quota); + let mut rx = rx.drain(); + + let conn = Conn::new(tx, String::new()); + let conn_id = conn.get_id(); + suite.run(Task::OpenConn { conn }); + let mut req_header = Header::default(); + req_header.set_cluster_id(0); + + let mut regions = vec![]; + for id in 1..4097 { + regions.push(id); + suite.add_region(id, 100); + + let mut req = ChangeDataRequest::default(); + req.set_region_id(id); + let region_epoch = req.get_region_epoch().clone(); + let downstream = Downstream::new( + "".to_string(), + region_epoch.clone(), + id, + conn_id, + ChangeDataRequestKvApi::TiDb, + false, + ObservedRange::default(), + ); + on_init_downstream(&downstream.get_state()); + post_init_downstream(&downstream.get_state()); + // Enable batch resolved ts in the test. + let version = FeatureGate::batch_resolved_ts(); + suite.run(Task::Register { + request: req.clone(), + downstream, + conn_id, + version: version.clone(), + }); + + let mut resolver = Resolver::new(id); + resolver.track_lock(TimeStamp::compose(0, id), vec![], None); + let mut region = Region::default(); + region.id = id; + region.set_region_epoch(region_epoch); + let failed = suite + .capture_regions + .get_mut(&id) + .unwrap() + .on_region_ready(resolver, region); + assert!(failed.is_empty()); + } + suite + .task_rx + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + suite.run(Task::MinTs { + regions, + min_ts: TimeStamp::compose(0, 4096), + current_ts: TimeStamp::compose(0, 4096), + }); + + // There should be at least 3 resolved ts events. 
+ let mut last_resolved_ts = 0; + let mut last_batch_count = 0; + for _ in 0..3 { + let event = recv_timeout(&mut rx, Duration::from_millis(100)) + .unwrap() + .unwrap() + .0; + assert!(last_resolved_ts < event.resolved_ts().ts, "{:?}", event); + assert!( + last_batch_count < event.resolved_ts().regions.len(), + "{:?}", + event + ); + last_resolved_ts = event.resolved_ts().ts; + last_batch_count = event.resolved_ts().regions.len(); + } + } } From 4199ed9ddd307d74656ac25d7c2c1692fdac9f8b Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 7 Apr 2023 17:46:58 +0800 Subject: [PATCH 619/676] tikv_util: cgroup path parsing fix (#14537) close tikv/tikv#14538 Signed-off-by: Spade A --- components/tikv_util/src/sys/cgroup.rs | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index 371d51e0b70..052a607a8c9 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -183,15 +183,19 @@ fn is_cgroup2_unified_mode() -> Result { // // The format is "::". For example, // "10:cpuset:/test-cpuset". +// +// Note: path may contains ":" in some envrionment. 
fn parse_proc_cgroup_v1(lines: &str) -> HashMap { let mut subsystems = HashMap::new(); for line in lines.lines().map(|s| s.trim()).filter(|s| !s.is_empty()) { let mut iter = line.split(':'); if let Some(_id) = iter.next() { if let Some(systems) = iter.next() { - if let Some(path) = iter.next() { + // If the path itself contains ":", we need to concat them + let path = iter.collect::>().join(":"); + if !path.is_empty() { for system in systems.split(',') { - subsystems.insert(system.to_owned(), path.to_owned()); + subsystems.insert(system.to_owned(), path.clone()); } continue; } @@ -697,4 +701,19 @@ mod tests { .unwrap(); assert!(child.wait().unwrap().success()); } + + #[test] + fn test_cgroup_path_with_semicolon() { + let id = "1"; + let devices = "test_device"; + let path = "/dir1:dir2:dir3"; + let mut lines = String::new(); + lines.push_str(id); + lines.push(':'); + lines.push_str(devices); + lines.push(':'); + lines.push_str(path); + let ret = parse_proc_cgroup_v1(&lines); + assert_eq!(ret.get(devices).unwrap(), path); + } } From abb672b8218307e3811281c22643e0eb2e13cc2c Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Fri, 7 Apr 2023 17:02:58 -0700 Subject: [PATCH 620/676] [raftstore-v2]: check apply_scheduler before using in on_refresh_region_buckets (#14526) close tikv/tikv#14506 check apply_scheduler before using it in on_refresh_region_buckets. This is to solve the race condition when the peer is just created by split meanwhile a refresh bucket is called immediately. 
Signed-off-by: tonyxuqqi Co-authored-by: buffer --- .../raftstore-v2/src/operation/bucket.rs | 16 ++--- .../raftstore-v2/src/operation/command/mod.rs | 1 + .../raftstore-v2/src/operation/query/mod.rs | 3 + .../raftstore-v2/tests/failpoints/mod.rs | 1 + .../tests/failpoints/test_bucket.rs | 58 ++++++++++++++++++ .../tests/integrations/cluster.rs | 60 ++++++++++++++++++- .../raftstore-v2/tests/integrations/mod.rs | 1 + components/raftstore/src/store/region_meta.rs | 2 + 8 files changed, 132 insertions(+), 10 deletions(-) create mode 100644 components/raftstore-v2/tests/failpoints/test_bucket.rs diff --git a/components/raftstore-v2/src/operation/bucket.rs b/components/raftstore-v2/src/operation/bucket.rs index be4ca092d98..317ed89ef8d 100644 --- a/components/raftstore-v2/src/operation/bucket.rs +++ b/components/raftstore-v2/src/operation/bucket.rs @@ -254,14 +254,16 @@ impl Peer { let meta = region_buckets.meta.clone(); self.region_buckets_info_mut() .set_bucket_stat(Some(region_buckets.clone())); - - let mut store_meta = store_ctx.store_meta.lock().unwrap(); - if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { - reader.0.update(ReadProgress::region_buckets(meta)); + { + let mut store_meta = store_ctx.store_meta.lock().unwrap(); + if let Some(reader) = store_meta.readers.get_mut(&self.region_id()) { + reader.0.update(ReadProgress::region_buckets(meta)); + } + } + // it's possible that apply_scheduler is not initialized yet + if let Some(apply_scheduler) = self.apply_scheduler() { + apply_scheduler.send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } - self.apply_scheduler() - .unwrap() - .send(ApplyTask::RefreshBucketStat(region_buckets.meta.clone())); } #[inline] diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0ae2f1741c3..9ef5592c64e 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs 
@@ -148,6 +148,7 @@ impl Peer { .apply_pool .spawn(async move { apply_fsm.handle_all_tasks().await }) .unwrap(); + fail::fail_point!("delay_set_apply_scheduler", |_| {}); self.set_apply_scheduler(apply_scheduler); } diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index fc7cee35fa5..55bc100dec2 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -401,6 +401,9 @@ impl Peer { .raft_log .term(meta.raft_apply.commit_index) .unwrap(); + if let Some(bucket_stats) = self.region_buckets_info().bucket_stat() { + meta.bucket_keys = bucket_stats.meta.keys.clone(); + } debug!(self.logger, "on query debug info"; "tick" => self.raft_group().raft.election_elapsed, "election_timeout" => self.raft_group().raft.randomized_election_timeout(), diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index f73b9398df6..6148cb4eae1 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -10,6 +10,7 @@ mod cluster; mod test_basic_write; mod test_bootstrap; +mod test_bucket; mod test_life; mod test_merge; mod test_split; diff --git a/components/raftstore-v2/tests/failpoints/test_bucket.rs b/components/raftstore-v2/tests/failpoints/test_bucket.rs new file mode 100644 index 00000000000..f136cf6dc53 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_bucket.rs @@ -0,0 +1,58 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use engine_traits::RaftEngineReadOnly; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use tikv_util::store::new_peer; + +use crate::cluster::{split_helper::split_region_and_refresh_bucket, Cluster}; + +/// Test refresh bucket. 
+#[test] +fn test_refresh_bucket() { + let mut cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_2 = 2; + let region = router.region_detail(region_2); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(region_2, Duration::from_secs(3)); + + // Region 2 ["", ""] + // -> Region 2 ["", "k22"] + // Region 1000 ["k22", ""] peer(1, 10) + let region_state = raft_engine + .get_region_state(region_2, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + + // to simulate the delay of set_apply_scheduler + fail::cfg("delay_set_apply_scheduler", "sleep(1000)").unwrap(); + split_region_and_refresh_bucket( + router, + region, + peer, + 1000, + new_peer(store_id, 10), + b"k22", + false, + ); + + for _i in 1..100 { + std::thread::sleep(Duration::from_millis(50)); + let meta = router + .must_query_debug_info(1000, Duration::from_secs(1)) + .unwrap(); + if !meta.bucket_keys.is_empty() { + assert_eq!(meta.bucket_keys.len(), 4); // include region start/end keys + assert_eq!(meta.bucket_keys[1], b"1".to_vec()); + assert_eq!(meta.bucket_keys[2], b"2".to_vec()); + return; + } + } + panic!("timeout for updating buckets"); // timeout +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 1685b5154e7..9c81f9545a3 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -31,11 +31,11 @@ use kvproto::{ use pd_client::RpcClient; use raft::eraftpb::MessageType; use raftstore::{ - coprocessor::{Config as CopConfig, CoprocessorHost}, + coprocessor::{Config as CopConfig, CoprocessorHost, StoreHandle}, store::{ region_meta::{RegionLocalState, RegionMeta}, - AutoSplitController, Config, RegionSnapshot, TabletSnapKey, 
TabletSnapManager, Transport, - RAFT_INIT_LOG_INDEX, + AutoSplitController, Bucket, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, + Transport, RAFT_INIT_LOG_INDEX, }, }; use raftstore_v2::{ @@ -232,6 +232,11 @@ impl TestRouter { } region } + + pub fn refresh_bucket(&self, region_id: u64, region_epoch: RegionEpoch, buckets: Vec) { + self.store_router() + .refresh_region_buckets(region_id, region_epoch, buckets, None); + } } pub struct RunningState { @@ -653,6 +658,7 @@ pub mod split_helper { metapb, pdpb, raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, }; + use raftstore::store::Bucket; use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use super::TestRouter; @@ -760,6 +766,54 @@ pub mod split_helper { (left, right) } + + // Split the region and refresh bucket immediately + // This is to simulate the case when the splitted peer's storage is not + // initialized yet when refresh bucket happens + pub fn split_region_and_refresh_bucket( + router: &mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + propose_key: &[u8], + right_derive: bool, + ) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = new_batch_split_region_request( + vec![propose_key.to_vec()], + vec![split_id], + right_derive, + ); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + let meta = router + .must_query_debug_info(split_region_id, Duration::from_secs(1)) + .unwrap(); + let epoch = &meta.region_state.epoch; + 
let buckets = vec![Bucket { + keys: vec![b"1".to_vec(), b"2".to_vec()], + size: 100, + }]; + let mut region_epoch = kvproto::metapb::RegionEpoch::default(); + region_epoch.set_conf_ver(epoch.conf_ver); + region_epoch.set_version(epoch.version); + router.refresh_bucket(split_region_id, region_epoch, buckets); + } } pub mod merge_helper { diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 12fe47ec48a..a4cdfda9179 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -7,6 +7,7 @@ // TODO: test conflict control in integration tests after split is supported. +#[allow(dead_code)] mod cluster; mod test_basic_write; mod test_conf_change; diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 4d44673e057..30239be528c 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -246,6 +246,7 @@ pub struct RegionMeta { pub raft_status: RaftStatus, pub raft_apply: RaftApplyState, pub region_state: RegionLocalState, + pub bucket_keys: Vec>, } impl RegionMeta { @@ -308,6 +309,7 @@ impl RegionMeta { }), tablet_index: local_state.get_tablet_index(), }, + bucket_keys: vec![], } } } From 68298d834be1844eaf254d5237eed4856605833c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 10 Apr 2023 15:36:59 +0800 Subject: [PATCH 621/676] log-backup: use conservativer batch strategy (#14490) close tikv/tikv#14313 Signed-off-by: hillium Co-authored-by: Xinye Tao --- src/import/sst_service.rs | 142 ++++++++++++++++++++++++++++++++------ 1 file changed, 122 insertions(+), 20 deletions(-) diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 4707b348bc5..c235c60a4e6 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -61,7 +61,7 @@ use crate::{ 
/// this value? const REQUEST_WRITE_CONCURRENCY: usize = 16; /// The extra bytes required by the wire encoding. -/// Generally, a field (and a embedded message) would introduce 2 extra +/// Generally, a field (and a embedded message) would introduce some extra /// bytes. In detail, they are: /// - 2 bytes for the request type (Tag+Value). /// - 2 bytes for every string or bytes field (Tag+Length), they are: @@ -69,10 +69,12 @@ const REQUEST_WRITE_CONCURRENCY: usize = 16; /// . + the value field /// . + the CF field (None for CF_DEFAULT) /// - 2 bytes for the embedded message field `PutRequest` (Tag+Length). +/// - 2 bytes for the request itself (which would be embedded into a +/// [`RaftCmdRequest`].) /// In fact, the length field is encoded by varint, which may grow when the /// content length is greater than 128, however when the length is greater than /// 128, the extra 1~4 bytes can be ignored. -const WIRE_EXTRA_BYTES: usize = 10; +const WIRE_EXTRA_BYTES: usize = 12; /// The interval of running the GC for /// [`raft_writer::ThrottledTlsEngineWriter`]. There aren't too many items held /// in the writer. So we can run the GC less frequently. @@ -118,6 +120,7 @@ pub struct ImportSstService { struct RequestCollector { max_raft_req_size: usize, + /// Retain the last ts of each key in each request. /// This is used for write CF because resolved ts observer hates duplicated /// key in the same request. @@ -180,10 +183,25 @@ impl RequestCollector { self.accept(cf, m); } + /// check whether the unpacked size would exceed the max_raft_req_size after + /// accepting the modify. + fn should_send_batch_before_adding(&self, m: &Modify) -> bool { + let message_size = m.size() + WIRE_EXTRA_BYTES; + // If there isn't any records in the collector, and there is a huge modify, we + // should give it a change to enter the collector. Or we may generate empty + // batch. 
+ self.unpacked_size != 0 /* batched */ + && message_size + self.unpacked_size > self.max_raft_req_size /* exceed the max_raft_req_size */ + } + // we need to remove duplicate keys in here, since // in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 // will panic if found duplicated entry during Vec. fn accept(&mut self, cf: &str, m: Modify) { + if self.should_send_batch_before_adding(&m) { + self.pack_all(); + } + let k = m.key(); match cf { CF_WRITE => { @@ -221,10 +239,6 @@ impl RequestCollector { } _ => unreachable!(), } - - if self.unpacked_size >= self.max_raft_req_size { - self.pack_all(); - } } #[cfg(test)] @@ -495,7 +509,7 @@ impl ImportSstService { ) -> std::result::Result, ImportPbError> { let mut range: Option = None; - let mut collector = RequestCollector::new(max_raft_size * 7 / 8); + let mut collector = RequestCollector::new(max_raft_size / 2); let context = req.take_context(); let mut metas = req.take_metas(); let mut rules = req.take_rewrite_rules(); @@ -1175,12 +1189,16 @@ mod test { use std::collections::HashMap; use engine_traits::{CF_DEFAULT, CF_WRITE}; - use kvproto::raft_cmdpb::Request; + use kvproto::{ + kvrpcpb::Context, + metapb::RegionEpoch, + raft_cmdpb::{RaftCmdRequest, Request}, + }; use protobuf::Message; - use tikv_kv::Modify; - use txn_types::{Key, TimeStamp, Write, WriteType}; + use tikv_kv::{Modify, WriteData}; + use txn_types::{Key, TimeStamp, Write, WriteBatchFlags, WriteType}; - use crate::import::sst_service::RequestCollector; + use crate::{import::sst_service::RequestCollector, server::raftkv}; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1361,23 +1379,107 @@ mod test { assert!(request_collector.is_empty()); } + fn convert_write_batch_to_request_raftkv1(ctx: &Context, batch: WriteData) -> RaftCmdRequest { + let reqs: Vec = 
batch.modifies.into_iter().map(Into::into).collect(); + let txn_extra = batch.extra; + let mut header = raftkv::new_request_header(ctx); + if batch.avoid_batch { + header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); + } + let mut flags = 0; + if txn_extra.one_pc { + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if txn_extra.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(reqs.into()); + cmd + } + + fn fake_ctx() -> Context { + let mut fake_ctx = Context::new(); + fake_ctx.set_region_id(42); + fake_ctx.set_region_epoch({ + let mut e = RegionEpoch::new(); + e.set_version(1024); + e.set_conf_ver(56); + e + }); + fake_ctx + } + #[test] fn test_collector_size() { let mut request_collector = RequestCollector::new(1024); - for i in 0..100u64 { - request_collector.accept(CF_DEFAULT, default_req(&i.to_ne_bytes(), b"egg", i)); + for i in 0..100u8 { + request_collector.accept(CF_DEFAULT, default_req(&i.to_ne_bytes(), b"egg", i as _)); } - let pws = request_collector.pending_writes; + let pws = request_collector.drain_pending_writes(true); for w in pws { - let req_size = w - .modifies - .into_iter() - .map(Request::from) - .map(|x| x.compute_size()) - .sum::(); + let req_size = convert_write_batch_to_request_raftkv1(&fake_ctx(), w).compute_size(); + assert!(req_size < 1024, "{}", req_size); + } + } + + #[test] + fn test_collector_huge_write_liveness() { + let mut request_collector = RequestCollector::new(1024); + for i in 0..100u8 { + if i % 10 == 2 { + // Inject some huge requests. 
+ request_collector.accept( + CF_DEFAULT, + default_req(&i.to_ne_bytes(), &[42u8; 1025], i as _), + ); + } else { + request_collector.accept(CF_DEFAULT, default_req(&i.to_ne_bytes(), b"egg", i as _)); + } + } + let pws = request_collector.drain_pending_writes(true); + let mut total = 0; + for w in pws { + let req = convert_write_batch_to_request_raftkv1(&fake_ctx(), w); + let req_size = req.compute_size(); + total += req.get_requests().len(); + assert!(req_size < 2048, "{}", req_size); + } + assert_eq!(total, 100); + } + + #[test] + fn test_collector_mid_size_write_no_exceed_max() { + let mut request_collector = RequestCollector::new(1024); + for i in 0..100u8 { + if i % 10 == 2 { + let huge_req = default_req(&i.to_ne_bytes(), &[42u8; 960], i as _); + // Inject some huge requests. + request_collector.accept(CF_DEFAULT, huge_req); + } else { + request_collector.accept( + CF_DEFAULT, + default_req( + &i.to_ne_bytes(), + b"noodles with beef, egg, bacon and spinach; in chicken soup", + i as _, + ), + ); + } + } + let pws = request_collector.drain_pending_writes(true); + let mut total = 0; + for w in pws { + let req = convert_write_batch_to_request_raftkv1(&fake_ctx(), w); + let req_size = req.compute_size(); + total += req.get_requests().len(); assert!(req_size < 1024, "{}", req_size); } + assert_eq!(total, 100); } } From 6433784b557c99518dea89612cf699b490c8f35f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:57:00 +0800 Subject: [PATCH 622/676] log-backup: eliminate some verbose logs (#14454) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#14453, ref tikv/tikv#14453 Signed-off-by: hillium Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: qupeng Co-authored-by: Jianjun Liao <36503113+Leavrth@users.noreply.github.com> --- Cargo.lock | 1 + components/backup-stream/Cargo.toml | 1 + 
components/backup-stream/src/endpoint.rs | 4 ++++ components/backup-stream/src/metrics.rs | 23 ++++++++++++++++--- components/backup-stream/src/observer.rs | 2 +- .../backup-stream/src/subscription_manager.rs | 14 ++++++++--- .../backup-stream/src/subscription_track.rs | 2 +- 7 files changed, 39 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8162267354..a508216a0e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -550,6 +550,7 @@ dependencies = [ "pd_client", "pin-project", "prometheus", + "prometheus-static-metric", "protobuf", "raft", "raftstore", diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index d6d6f7a6fc4..005849391e9 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -54,6 +54,7 @@ openssl = "0.10" pd_client = { workspace = true } pin-project = "1.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } +prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index c8302f6dd9e..c5ab6352b31 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -980,6 +980,10 @@ where }); } RegionCheckpointOperation::PrepareMinTsForResolve => { + if self.observer.is_hibernating() { + metrics::MISC_EVENTS.skip_resolve_no_subscription.inc(); + return; + } let min_ts = self.pool.block_on(self.prepare_min_ts()); let start_time = Instant::now(); // We need to reschedule the `Resolve` task to queue, because the subscription diff --git a/components/backup-stream/src/metrics.rs b/components/backup-stream/src/metrics.rs index d7f836833b0..225d583ca5c 100644 --- a/components/backup-stream/src/metrics.rs +++ b/components/backup-stream/src/metrics.rs @@ -2,6 +2,7 
@@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::*; /// The status of a task. /// The ordering of this imples the priority for presenting to the user. @@ -155,9 +156,11 @@ lazy_static! { &["stage"] ) .unwrap(); - pub static ref LOST_LEADER_REGION: IntCounter = register_int_counter!( - "tikv_log_backup_lost_leader_region", - "The regions that lost leadership during resolving" + pub static ref MISC_EVENTS: MiscEvents = register_static_int_counter_vec!( + MiscEvents, + "tikv_log_backup_misc_events", + "Events counter, including 'plain' events(i.e. events without extra information).", + &["name"] ) .unwrap(); pub static ref MIN_TS_RESOLVE_DURATION: Histogram = register_histogram!( @@ -167,3 +170,17 @@ lazy_static! { ) .unwrap(); } + +make_static_metric! { + pub label_enum MiscEventsName { + skip_resolve_non_leader, + skip_resolve_no_subscription, + } + + pub struct MiscEvents: IntCounter { + "name" => { + skip_resolve_non_leader, + skip_resolve_no_subscription, + } + } +} diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 1a0a0f7cc9e..92ab6bc757e 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -100,7 +100,7 @@ impl BackupStreamObserver { /// Check whether there are any task range registered to the observer. /// when there isn't any task, we can ignore the events, so we don't need to /// handle useless events. (Also won't yield verbose logs.) 
- fn is_hibernating(&self) -> bool { + pub fn is_hibernating(&self) -> bool { self.ranges.rl().is_empty() } } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 6e72d66a98b..e4ce02c9e27 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -38,7 +38,7 @@ use crate::{ metrics, observer::BackupStreamObserver, router::{Router, TaskSelector}, - subscription_track::{ResolveResult, SubscriptionTracer}, + subscription_track::{CheckpointType, ResolveResult, SubscriptionTracer}, try_send, utils::{self, CallbackWaitGroup, Work}, Task, @@ -407,7 +407,10 @@ where mut leader_checker: LeadershipResolver, ) { while let Some(op) = message_box.recv().await { - info!("backup stream: on_modify_observe"; "op" => ?op); + // Skip some trivial resolve commands. + if !matches!(op, ObserveOp::ResolveRegions { .. }) { + info!("backup stream: on_modify_observe"; "op" => ?op); + } match op { ObserveOp::Start { region } => { fail::fail_point!("delay_on_start_observe"); @@ -477,7 +480,12 @@ where // If there isn't any region observed, the `min_ts` can be used as resolved ts // safely. 
let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); - info!("getting checkpoint"; "defined_by_region" => ?min_region); + if min_region + .map(|mr| mr.checkpoint_type != CheckpointType::MinTs) + .unwrap_or(false) + { + info!("getting non-trivial checkpoint"; "defined_by_region" => ?min_region); + } callback(ResolvedRegions::new(rts, cps)); } } diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index c13339d1c29..7fee1b1b438 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -250,7 +250,7 @@ impl SubscriptionTracer { SubscribeState::Running(sub) => { let contains = rs.contains(®ion_id); if !contains { - crate::metrics::LOST_LEADER_REGION.inc(); + crate::metrics::MISC_EVENTS.skip_resolve_non_leader.inc(); } contains.then(|| ResolveResult::resolve(sub, min_ts)) } From e61b51df06539ee50eeaea18dff81c6d72f1fdd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:51:00 +0800 Subject: [PATCH 623/676] log-backup: make initial scanning more robust (#14403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close tikv/tikv#14451 Signed-off-by: hillium Signed-off-by: 山岚 <36239017+YuJuncen@users.noreply.github.com> Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 3 + components/backup-stream/src/event_loader.rs | 11 +- .../backup-stream/src/metadata/client.rs | 5 +- components/backup-stream/src/metadata/keys.rs | 4 + .../src/metadata/store/slash_etc.rs | 6 +- components/backup-stream/src/observer.rs | 24 +--- .../backup-stream/src/subscription_manager.rs | 110 +++++++++++++++--- components/backup-stream/tests/mod.rs | 63 +++++++++- 8 files changed, 171 insertions(+), 55 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 
c5ab6352b31..45d132b001b 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -1159,6 +1159,7 @@ pub enum ObserveOp { region: Region, handle: ObserveHandle, err: Box, + has_failed_for: u8, }, ResolveRegions { callback: ResolveRegionsCallback, @@ -1189,11 +1190,13 @@ impl std::fmt::Debug for ObserveOp { region, handle, err, + has_failed_for, } => f .debug_struct("NotifyFailToStartObserve") .field("region", &utils::debug_region(region)) .field("handle", handle) .field("err", err) + .field("has_failed_for", has_failed_for) .finish(), Self::ResolveRegions { min_ts, .. } => f .debug_struct("ResolveRegions") diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 8b808a16cca..6c825bf30c5 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -1,10 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - marker::PhantomData, - sync::{atomic::Ordering, Arc}, - time::Duration, -}; +use std::{marker::PhantomData, sync::Arc, time::Duration}; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use futures::executor::block_on; @@ -488,13 +484,10 @@ where // is still little chance to lost data: For example, if a region cannot elect // the leader for long time. (say, net work partition) At that time, we have // nowhere to record the lock status of this region. 
- let success = try_send!( + try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Start { region: r.region }) ); - if success { - crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_add(1, Ordering::SeqCst); - } } } Ok(()) diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index fca8a07b654..1fdc1b3b1e8 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -426,7 +426,10 @@ impl MetadataClient { let stream = watcher .stream .filter_map(|item| match item { - Ok(kv_event) => MetadataEvent::from_watch_pause_event(&kv_event), + Ok(kv_event) => { + debug!("watch pause event"; "raw" => ?kv_event); + MetadataEvent::from_watch_pause_event(&kv_event) + } Err(err) => Some(MetadataEvent::Error { err }), }) .map(|event| { diff --git a/components/backup-stream/src/metadata/keys.rs b/components/backup-stream/src/metadata/keys.rs index 26b04abe16f..87c0e036172 100644 --- a/components/backup-stream/src/metadata/keys.rs +++ b/components/backup-stream/src/metadata/keys.rs @@ -167,6 +167,10 @@ impl MetaKey { Self(format!("{}{}/{}", PREFIX, PATH_PAUSE, name).into_bytes()) } + pub fn last_errors_of(name: &str) -> Self { + Self(format!("{}{}/{}", PREFIX, PATH_LAST_ERROR, name).into_bytes()) + } + pub fn last_error_of(name: &str, store: u64) -> Self { Self(format!("{}{}/{}/{}", PREFIX, PATH_LAST_ERROR, name, store).into_bytes()) } diff --git a/components/backup-stream/src/metadata/store/slash_etc.rs b/components/backup-stream/src/metadata/store/slash_etc.rs index 0d6484b0c1e..a564d069d14 100644 --- a/components/backup-stream/src/metadata/store/slash_etc.rs +++ b/components/backup-stream/src/metadata/store/slash_etc.rs @@ -39,11 +39,7 @@ struct Key(Vec, i64); impl std::fmt::Debug for Key { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_tuple("Key") - .field(&format_args!( - "{}@{}", - log_wrappers::Value::key(&self.0), - self.1 - 
)) + .field(&format_args!("{}@{}", self.0.escape_ascii(), self.1)) .finish() } } diff --git a/components/backup-stream/src/observer.rs b/components/backup-stream/src/observer.rs index 92ab6bc757e..169c3b72268 100644 --- a/components/backup-stream/src/observer.rs +++ b/components/backup-stream/src/observer.rs @@ -1,9 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, RwLock, -}; +use std::sync::{Arc, RwLock}; use engine_traits::KvEngine; use kvproto::metapb::Region; @@ -18,20 +15,6 @@ use crate::{ utils::SegmentSet, }; -/// The inflight `StartObserve` message count. -/// Currently, we handle the `StartObserve` message in the main loop(endpoint -/// thread), which may take longer time than expected. So when we are starting -/// to observe many region (e.g. failover), there may be many pending messages, -/// those messages won't block the advancing of checkpoint ts. So the checkpoint -/// ts may be too late and losing some data. -/// -/// This is a temporary solution for this problem: If this greater than (1), -/// then it implies that there are some inflight wait-for-initialized regions, -/// we should block the resolved ts from advancing in that condition. -/// -/// FIXME: Move handler of `ModifyObserve` to another thread, and remove this :( -pub static IN_FLIGHT_START_OBSERVE_MESSAGE: AtomicUsize = AtomicUsize::new(0); - /// An Observer for Backup Stream. 
/// /// It observes raftstore internal events, such as: @@ -141,15 +124,12 @@ impl CmdObserver for BackupStreamObserver { fn on_applied_current_term(&self, role: StateRole, region: &Region) { if role == StateRole::Leader && self.should_register_region(region) { - let success = try_send!( + try_send!( self.scheduler, Task::ModifyObserve(ObserveOp::Start { region: region.clone(), }) ); - if success { - IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_add(1, Ordering::SeqCst); - } } } } diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index e4ce02c9e27..316f0d9fb53 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -48,6 +48,21 @@ type ScanPool = yatp::ThreadPool; const INITIAL_SCAN_FAILURE_MAX_RETRY_TIME: usize = 10; +// The retry parameters for failed to get last checkpoint ts. +// When PD is temporarily disconnected, we may need this retry. +// The total duration of retrying is about 345s ( 20 * 16 + 15 ), +// which is longer than the RPO promise. +const TRY_START_OBSERVE_MAX_RETRY_TIME: u8 = 24; +const RETRY_AWAIT_BASIC_DURATION: Duration = Duration::from_secs(1); +const RETRY_AWAIT_MAX_DURATION: Duration = Duration::from_secs(16); + +fn backoff_for_start_observe(failed_for: u8) -> Duration { + Ord::min( + RETRY_AWAIT_BASIC_DURATION * (1 << failed_for), + RETRY_AWAIT_MAX_DURATION, + ) +} + /// a request for doing initial scanning. struct ScanCmd { region: Region, @@ -418,7 +433,6 @@ where metrics::INITIAL_SCAN_REASON .with_label_values(&["leader-changed"]) .inc(); - crate::observer::IN_FLIGHT_START_OBSERVE_MESSAGE.fetch_sub(1, Ordering::SeqCst); } ObserveOp::Stop { ref region } => { self.subs.deregister_region_if(region, |_, _| true); @@ -441,6 +455,7 @@ where region, handle, err, + has_failed_for, } => { info!("retry observe region"; "region" => %region.get_id(), "err" => %err); // No need for retrying observe canceled. 
@@ -451,7 +466,7 @@ where region.get_start_key().to_owned(), region.get_end_key().to_owned(), ); - match self.retry_observe(region, handle).await { + match self.retry_observe(region, handle, has_failed_for).await { Ok(()) => {} Err(e) => { let msg = Task::FatalError( @@ -519,7 +534,8 @@ where Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { region: region.clone(), handle, - err: Box::new(e) + err: Box::new(e), + has_failed_for: 0, }) ); } @@ -560,22 +576,59 @@ where } async fn start_observe(&self, region: Region) { + self.start_observe_with_failure_count(region, 0).await + } + + async fn start_observe_with_failure_count(&self, region: Region, has_failed_for: u8) { let handle = ObserveHandle::new(); + let schd = self.scheduler.clone(); self.subs.add_pending_region(®ion); if let Err(err) = self.try_start_observe(®ion, handle.clone()).await { - warn!("failed to start observe, retrying"; "err" => %err); - try_send!( - self.scheduler, - Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { - region, - handle, - err: Box::new(err) - }) - ); + warn!("failed to start observe, would retry"; "err" => %err, utils::slog_region(®ion)); + tokio::spawn(async move { + #[cfg(not(feature = "failpoints"))] + let delay = backoff_for_start_observe(has_failed_for); + #[cfg(feature = "failpoints")] + let delay = (|| { + fail::fail_point!("subscribe_mgr_retry_start_observe_delay", |v| { + let dur = v + .expect("should provide delay time (in ms)") + .parse::() + .expect("should be number (in ms)"); + Duration::from_millis(dur) + }); + backoff_for_start_observe(has_failed_for) + })(); + tokio::time::sleep(delay).await; + try_send!( + schd, + Task::ModifyObserve(ObserveOp::NotifyFailToStartObserve { + region, + handle, + err: Box::new(err), + has_failed_for: has_failed_for + 1 + }) + ) + }); } } - async fn retry_observe(&self, region: Region, handle: ObserveHandle) -> Result<()> { + async fn retry_observe( + &self, + region: Region, + handle: ObserveHandle, + failure_count: u8, 
+ ) -> Result<()> { + if failure_count > TRY_START_OBSERVE_MAX_RETRY_TIME { + return Err(Error::Other( + format!( + "retry time exceeds for region {:?}", + utils::debug_region(®ion) + ) + .into(), + )); + } + let (tx, rx) = crossbeam::channel::bounded(1); self.regions .find_region_by_id( @@ -626,7 +679,8 @@ where metrics::INITIAL_SCAN_REASON .with_label_values(&["retry"]) .inc(); - self.start_observe(region).await; + self.start_observe_with_failure_count(region, failure_count) + .await; Ok(()) } @@ -750,4 +804,32 @@ mod test { should_finish_in(move || drop(pool), Duration::from_secs(5)); } + + #[test] + fn test_backoff_for_start_observe() { + assert_eq!( + super::backoff_for_start_observe(0), + super::RETRY_AWAIT_BASIC_DURATION + ); + assert_eq!( + super::backoff_for_start_observe(1), + super::RETRY_AWAIT_BASIC_DURATION * 2 + ); + assert_eq!( + super::backoff_for_start_observe(2), + super::RETRY_AWAIT_BASIC_DURATION * 4 + ); + assert_eq!( + super::backoff_for_start_observe(3), + super::RETRY_AWAIT_BASIC_DURATION * 8 + ); + assert_eq!( + super::backoff_for_start_observe(4), + super::RETRY_AWAIT_MAX_DURATION + ); + assert_eq!( + super::backoff_for_start_observe(5), + super::RETRY_AWAIT_MAX_DURATION + ); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index db4f84924b0..7b2fe88b8a1 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -819,10 +819,15 @@ mod test { use std::time::{Duration, Instant}; use backup_stream::{ - errors::Error, router::TaskSelector, GetCheckpointResult, RegionCheckpointOperation, - RegionSet, Task, + errors::Error, + metadata::{ + keys::MetaKey, + store::{Keys, MetaStore}, + }, + router::TaskSelector, + GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; - use futures::{Stream, StreamExt}; + use futures::{executor::block_on, Stream, StreamExt}; use pd_client::PdClient; use test_raftstore::IsolationFilterFactory; use 
tikv_util::{box_err, defer, info, HandyRwLock}; @@ -1370,7 +1375,7 @@ mod test { .schedule(Task::ForceFlush("r".to_owned())) .unwrap(); suite.sync(); - std::thread::sleep(Duration::from_secs(1)); + std::thread::sleep(Duration::from_secs(2)); run_async_test(suite.check_for_write_records( suite.flushed_files.path(), round1.iter().map(|x| x.as_slice()), @@ -1429,4 +1434,54 @@ mod test { round1.iter().map(|k| k.as_slice()), )) } + + #[test] + fn test_retry_abort() { + let mut suite = super::SuiteBuilder::new_named("retry_abort") + .nodes(1) + .build(); + defer! { + fail::list().into_iter().for_each(|(name, _)| fail::remove(name)) + }; + + suite.must_register_task(1, "retry_abort"); + fail::cfg("subscribe_mgr_retry_start_observe_delay", "return(10)").unwrap(); + fail::cfg("try_start_observe", "return()").unwrap(); + + suite.must_split(&make_split_key_at_record(1, 42)); + std::thread::sleep(Duration::from_secs(2)); + + let error = run_async_test(suite.get_meta_cli().get_last_error("retry_abort", 1)).unwrap(); + let error = error.expect("no error uploaded"); + error + .get_error_message() + .find("retry") + .expect("error doesn't contain retry"); + fail::cfg("try_start_observe", "10*return()").unwrap(); + // Resume the task manually... 
+ run_async_test(async { + suite + .meta_store + .delete(Keys::Key(MetaKey::pause_of("retry_abort"))) + .await?; + suite + .meta_store + .delete(Keys::Prefix(MetaKey::last_errors_of("retry_abort"))) + .await?; + backup_stream::errors::Result::Ok(()) + }) + .unwrap(); + + suite.sync(); + suite.wait_with(move |r| block_on(r.get_task_info("retry_abort")).is_ok()); + let items = run_async_test(suite.write_records(0, 128, 1)); + suite.force_flush_files("retry_abort"); + suite.wait_for_flush(); + run_async_test( + suite.check_for_write_records( + suite.flushed_files.path(), + items.iter().map(Vec::as_slice), + ), + ); + } } From 9a073f274735052c846b2f66c20fa4b155d8ac6f Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 11 Apr 2023 15:59:01 +0800 Subject: [PATCH 624/676] *: register cdc/resolved_ts endpoint in server2 (#14543) ref tikv/tikv#14542 Register cdc/resolved_ts endpoint in server2 Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 78 +++++++------- components/cdc/src/initializer.rs | 57 +++++------ components/cdc/tests/mod.rs | 8 +- components/raftstore-v2/src/fsm/store.rs | 7 +- components/raftstore-v2/src/router/imp.rs | 28 +++++- components/raftstore/src/router.rs | 67 ++++++++++++- components/raftstore/src/store/fsm/store.rs | 6 ++ components/resolved_ts/src/advance.rs | 19 ++-- components/resolved_ts/src/endpoint.rs | 43 ++++---- components/resolved_ts/src/scanner.rs | 32 +++--- components/resolved_ts/tests/mod.rs | 4 +- components/server/src/server.rs | 8 +- components/server/src/server2.rs | 106 +++++++++++++++++--- components/test_raftstore/src/server.rs | 4 +- 14 files changed, 326 insertions(+), 141 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index efc82e27d6c..68650130211 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -30,12 +30,15 @@ use online_config::{ConfigChange, OnlineConfig}; use pd_client::{Feature, PdClient}; use 
raftstore::{ coprocessor::{CmdBatch, ObserveId}, - router::RaftStoreRouter, - store::fsm::{ChangeObserver, StoreMeta}, + router::CdcHandle, + store::fsm::{store::StoreRegionMeta, ChangeObserver}, }; use resolved_ts::{LeadershipResolver, Resolver}; use security::SecurityManager; -use tikv::{config::CdcConfig, storage::Statistics}; +use tikv::{ + config::CdcConfig, + storage::{kv::LocalTablets, Statistics}, +}; use tikv_util::{ debug, defer, error, impl_display_as_debug, info, mpsc::bounded, @@ -312,20 +315,20 @@ impl ResolvedRegionHeap { } } -pub struct Endpoint { +pub struct Endpoint { cluster_id: u64, capture_regions: HashMap, connections: HashMap, scheduler: Scheduler, - raft_router: T, - engine: E, + cdc_handle: T, + tablets: LocalTablets, observer: CdcObserver, pd_client: Arc, timer: SteadyTimer, tso_worker: Runtime, - store_meta: Arc>, + store_meta: Arc>, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, @@ -355,23 +358,23 @@ pub struct Endpoint { warn_resolved_ts_repeat_count: usize, } -impl, E: KvEngine> Endpoint { +impl, E: KvEngine, S: StoreRegionMeta> Endpoint { pub fn new( cluster_id: u64, config: &CdcConfig, api_version: ApiVersion, pd_client: Arc, scheduler: Scheduler, - raft_router: T, - engine: E, + cdc_handle: T, + tablets: LocalTablets, observer: CdcObserver, - store_meta: Arc>, + store_meta: Arc>, concurrency_manager: ConcurrencyManager, env: Arc, security_mgr: Arc, sink_memory_quota: MemoryQuota, causal_ts_provider: Option>, - ) -> Endpoint { + ) -> Endpoint { let workers = Builder::new_multi_thread() .thread_name("cdcwkr") .worker_threads(config.incremental_scan_threads) @@ -405,10 +408,10 @@ impl, E: KvEngine> Endpoint { // Assume 1KB per entry. 
let max_scan_batch_size = 1024; - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); + let region_read_progress = store_meta.lock().unwrap().region_read_progress().clone(); let store_resolver_gc_interval = Duration::from_secs(60); let leader_resolver = LeadershipResolver::new( - store_meta.lock().unwrap().store_id.unwrap(), + store_meta.lock().unwrap().store_id(), pd_client.clone(), env, security_mgr, @@ -430,8 +433,8 @@ impl, E: KvEngine> Endpoint { api_version, workers, scan_concurrency_semaphore, - raft_router, - engine, + cdc_handle, + tablets, observer, store_meta, concurrency_manager, @@ -643,7 +646,7 @@ impl, E: KvEngine> Endpoint { return; } - let txn_extra_op = match self.store_meta.lock().unwrap().readers.get(®ion_id) { + let txn_extra_op = match self.store_meta.lock().unwrap().reader(region_id) { Some(reader) => reader.txn_extra_op.clone(), None => { error!("cdc register for a not found region"; "region_id" => region_id); @@ -723,7 +726,7 @@ impl, E: KvEngine> Endpoint { let observed_range = downstream_.observed_range; let region_epoch = request.take_region_epoch(); let mut init = Initializer { - engine: self.engine.clone(), + tablet: self.tablets.get(region_id).map(|t| t.into_owned()), sched, observed_range, region_id, @@ -744,12 +747,12 @@ impl, E: KvEngine> Endpoint { filter_loop, }; - let raft_router = self.raft_router.clone(); + let cdc_handle = self.cdc_handle.clone(); let concurrency_semaphore = self.scan_concurrency_semaphore.clone(); self.workers.spawn(async move { CDC_SCAN_TASKS.with_label_values(&["total"]).inc(); match init - .initialize(change_cmd, raft_router, concurrency_semaphore) + .initialize(change_cmd, cdc_handle, concurrency_semaphore) .await { Ok(()) => { @@ -1009,7 +1012,7 @@ impl, E: KvEngine> Endpoint { let timeout = self.timer.delay(interval.unwrap_or_default()); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); - let raft_router = self.raft_router.clone(); + let 
cdc_handle = self.cdc_handle.clone(); let regions: Vec = self.capture_regions.keys().copied().collect(); let cm: ConcurrencyManager = self.concurrency_manager.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; @@ -1074,7 +1077,7 @@ impl, E: KvEngine> Endpoint { } else { CDC_RESOLVED_TS_ADVANCE_METHOD.set(0); leader_resolver - .resolve_by_raft(regions, min_ts, raft_router) + .resolve_by_raft(regions, min_ts, cdc_handle) .await }; leader_resolver_tx.send(leader_resolver).unwrap(); @@ -1107,7 +1110,9 @@ impl, E: KvEngine> Endpoint { } } -impl, E: KvEngine> Runnable for Endpoint { +impl, E: KvEngine, S: StoreRegionMeta + Send> Runnable + for Endpoint +{ type Task = Task; fn run(&mut self, task: Task) { @@ -1183,7 +1188,9 @@ impl, E: KvEngine> Runnable for Endpoint { } } -impl, E: KvEngine> RunnableWithTimer for Endpoint { +impl, E: KvEngine, S: StoreRegionMeta + Send> RunnableWithTimer + for Endpoint +{ fn on_timeout(&mut self) { CDC_ENDPOINT_PENDING_TASKS.set(self.scheduler.pending_tasks() as _); @@ -1258,7 +1265,8 @@ mod tests { }; use raftstore::{ errors::{DiscardReason, Error as RaftStoreError}, - store::{msg::CasualMessage, PeerMsg, ReadDelegate}, + router::{CdcRaftRouter, RaftStoreRouter}, + store::{fsm::StoreMeta, msg::CasualMessage, PeerMsg, ReadDelegate}, }; use test_pd_client::TestPdClient; use test_raftstore::MockRaftStoreRouter; @@ -1280,8 +1288,8 @@ mod tests { struct TestEndpointSuite { // The order must ensure `endpoint` be dropped before other fields. - endpoint: Endpoint, - raft_router: MockRaftStoreRouter, + endpoint: Endpoint, RocksEngine, StoreMeta>, + cdc_handle: CdcRaftRouter, task_rx: ReceiverWrapper, raft_rxs: HashMap>>, leader_resolver: Option, @@ -1291,7 +1299,7 @@ mod tests { // It's important to matain raft receivers in `raft_rxs`, otherwise all cases // need to drop `endpoint` and `rx` in order manually. 
fn add_region(&mut self, region_id: u64, cap: usize) { - let rx = self.raft_router.add_region(region_id, cap); + let rx = self.cdc_handle.add_region(region_id, cap); self.raft_rxs.insert(region_id, rx); self.add_local_reader(region_id); } @@ -1305,7 +1313,7 @@ mod tests { } fn fill_raft_rx(&self, region_id: u64) { - let router = &self.raft_router; + let router = &self.cdc_handle; loop { match router.send_casual_msg(region_id, CasualMessage::ClearRegionSize) { Ok(_) => continue, @@ -1321,7 +1329,7 @@ mod tests { } impl Deref for TestEndpointSuite { - type Target = Endpoint; + type Target = Endpoint, RocksEngine, StoreMeta>; fn deref(&self) -> &Self::Target { &self.endpoint } @@ -1348,7 +1356,7 @@ mod tests { causal_ts_provider: Option>, ) -> TestEndpointSuite { let (task_sched, task_rx) = dummy_scheduler(); - let raft_router = MockRaftStoreRouter::new(); + let cdc_handle = CdcRaftRouter(MockRaftStoreRouter::new()); let mut store_meta = StoreMeta::new(0); store_meta.store_id = Some(1); let region_read_progress = store_meta.region_read_progress.clone(); @@ -1370,14 +1378,14 @@ mod tests { api_version, pd_client, task_sched.clone(), - raft_router.clone(), - engine.unwrap_or_else(|| { + cdc_handle.clone(), + LocalTablets::Singleton(engine.unwrap_or_else(|| { TestEngineBuilder::new() .build_without_cache() .unwrap() .kv_engine() .unwrap() - }), + })), CdcObserver::new(task_sched), Arc::new(StdMutex::new(store_meta)), ConcurrencyManager::new(1.into()), @@ -1389,7 +1397,7 @@ mod tests { TestEndpointSuite { endpoint: ep, - raft_router, + cdc_handle, task_rx, raft_rxs: HashMap::default(), leader_resolver: Some(leader_resolver), diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 68850ac55ac..8f6f8ed38a7 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -17,10 +17,10 @@ use kvproto::{ }; use raftstore::{ coprocessor::ObserveId, - router::RaftStoreRouter, + router::CdcHandle, store::{ 
fsm::ChangeObserver, - msg::{Callback, ReadResponse, SignificantMsg}, + msg::{Callback, ReadResponse}, }, }; use resolved_ts::Resolver; @@ -75,7 +75,7 @@ pub(crate) enum Scanner { } pub(crate) struct Initializer { - pub(crate) engine: E, + pub(crate) tablet: Option, pub(crate) sched: Scheduler, pub(crate) sink: crate::channel::Sink, @@ -102,10 +102,10 @@ pub(crate) struct Initializer { } impl Initializer { - pub(crate) async fn initialize>( + pub(crate) async fn initialize>( &mut self, - change_cmd: ChangeObserver, - raft_router: T, + change_observer: ChangeObserver, + cdc_handle: T, concurrency_semaphore: Arc, ) -> Result<()> { fail_point!("cdc_before_initialize"); @@ -142,24 +142,22 @@ impl Initializer { let (incremental_scan_barrier_cb, incremental_scan_barrier_fut) = tikv_util::future::paired_future_callback(); let barrier = CdcEvent::Barrier(Some(incremental_scan_barrier_cb)); - if let Err(e) = raft_router.significant_send( + if let Err(e) = cdc_handle.capture_change( self.region_id, - SignificantMsg::CaptureChange { - cmd: change_cmd, - region_epoch, - callback: Callback::read(Box::new(move |resp| { - if let Err(e) = sched.schedule(Task::InitDownstream { - region_id, - downstream_id, - downstream_state, - sink, - incremental_scan_barrier: barrier, - cb: Box::new(move || cb(resp)), - }) { - error!("cdc schedule cdc task failed"; "error" => ?e); - } - })), - }, + region_epoch, + change_observer, + Callback::read(Box::new(move |resp| { + if let Err(e) = sched.schedule(Task::InitDownstream { + region_id, + downstream_id, + downstream_state, + sink, + incremental_scan_barrier: barrier, + cb: Box::new(move || cb(resp)), + }) { + error!("cdc schedule cdc task failed"; "error" => ?e); + } + })), ) { warn!("cdc send capture change cmd failed"; "region_id" => self.region_id, "error" => ?e); @@ -515,7 +513,11 @@ impl Initializer { let start_key = data_key(snap.lower_bound().unwrap_or_default()); let end_key = data_end_key(snap.upper_bound().unwrap_or_default()); let 
range = Range::new(&start_key, &end_key); - let collection = match self.engine.table_properties_collection(CF_WRITE, &[range]) { + let tablet = match self.tablet.as_ref() { + Some(t) => t, + None => return false, + }; + let collection = match tablet.table_properties_collection(CF_WRITE, &[range]) { Ok(collection) => collection, Err(_) => return false, }; @@ -572,7 +574,7 @@ mod tests { cdcpb::{EventLogType, Event_oneof_event}, errorpb::Error as ErrorHeader, }; - use raftstore::{coprocessor::ObserveHandle, store::RegionSnapshot}; + use raftstore::{coprocessor::ObserveHandle, router::CdcRaftRouter, store::RegionSnapshot}; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, @@ -636,12 +638,11 @@ mod tests { .unwrap(); let downstream_state = Arc::new(AtomicCell::new(DownstreamState::Initializing)); let initializer = Initializer { - engine: engine.unwrap_or_else(|| { + tablet: engine.or_else(|| { TestEngineBuilder::new() .build_without_cache() .unwrap() .kv_engine() - .unwrap() }), sched: receiver_worker.scheduler(), sink, @@ -978,7 +979,7 @@ mod tests { mock_initializer(total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); - let raft_router = MockRaftStoreRouter::new(); + let raft_router = CdcRaftRouter(MockRaftStoreRouter::new()); let concurrency_semaphore = Arc::new(Semaphore::new(1)); initializer.downstream_state.store(DownstreamState::Stopped); diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 843b6b2f1d0..89ed4e6dbb1 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -20,9 +20,9 @@ use kvproto::{ tikvpb::TikvClient, }; use online_config::OnlineConfig; -use raftstore::coprocessor::CoprocessorHost; +use raftstore::{coprocessor::CoprocessorHost, router::CdcRaftRouter}; use test_raftstore::*; -use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID}; +use tikv::{config::CdcConfig, server::DEFAULT_CLUSTER_ID, storage::kv::LocalTablets}; 
use tikv_util::{ config::ReadableDuration, worker::{LazyWorker, Runnable}, @@ -185,8 +185,8 @@ impl TestSuiteBuilder { cluster.cfg.storage.api_version(), pd_cli.clone(), worker.scheduler(), - raft_router, - cluster.engines[id].kv.clone(), + CdcRaftRouter(raft_router), + LocalTablets::Singleton(cluster.engines[id].kv.clone()), cdc_ob, cluster.store_metas[id].clone(), cm.clone(), diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 4b4255b3d3e..e9b224b7375 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -13,7 +13,7 @@ use futures::{compat::Future01CompatExt, FutureExt}; use keys::{data_end_key, data_key}; use kvproto::metapb::Region; use raftstore::store::{ - fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry, Transport, + fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, Transport, }; use slog::{info, o, Logger}; use tikv_util::{ @@ -133,6 +133,11 @@ impl StoreRegionMeta for StoreMeta { } } } + + #[inline] + fn reader(&self, region_id: u64) -> Option<&ReadDelegate> { + self.readers.get(®ion_id).map(|e| &e.0) + } } pub struct Store { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 67b0a7adeb7..9bffe2b7983 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -9,10 +9,16 @@ use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ + metapb::RegionEpoch, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; +use raftstore::{ + router::CdcHandle, + store::{ + fsm::ChangeObserver, AsyncReadNotifier, Callback, FetchedLogs, GenSnapRes, RegionSnapshot, + }, +}; use slog::warn; use super::PeerMsg; @@ -169,3 +175,23 @@ impl RaftRouter { } } } + +impl 
CdcHandle for RaftRouter { + fn capture_change( + &self, + _region_id: u64, + _region_epoch: RegionEpoch, + _change_observer: ChangeObserver, + _callback: Callback, + ) -> crate::Result<()> { + unimplemented!() + } + + fn check_leadership( + &self, + _region_id: u64, + _callback: Callback, + ) -> crate::Result<()> { + unimplemented!() + } +} diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 0f22eb483a0..3a76a5ad26f 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -13,7 +13,7 @@ use tikv_util::time::ThreadReadId; use crate::{ store::{ - fsm::RaftRouter, + fsm::{ChangeObserver, RaftRouter}, transport::{CasualRouter, ProposalRouter, SignificantRouter}, Callback, CasualMessage, LocalReader, PeerMsg, RaftCmdExtraOpts, RaftCommand, SignificantMsg, StoreMsg, StoreRouter, @@ -384,3 +384,68 @@ impl crate::coprocessor::StoreHandle for RaftRoute ); } } + +/// A handle for cdc and pitr to schedule some command back to raftstore. +pub trait CdcHandle: Clone + Send +where + EK: KvEngine, +{ + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback, + ) -> RaftStoreResult<()>; + + fn check_leadership( + &self, + region_id: u64, + callback: Callback, + ) -> RaftStoreResult<()>; +} + +/// A wrapper of SignificantRouter that is specialized for implementing +/// CdcHandle. 
+#[derive(Clone)] +pub struct CdcRaftRouter(pub T); + +impl std::ops::Deref for CdcRaftRouter { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl CdcHandle for CdcRaftRouter +where + EK: KvEngine, + T: SignificantRouter + Send + Clone, +{ + fn capture_change( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + change_observer: ChangeObserver, + callback: Callback, + ) -> RaftStoreResult<()> { + self.0.significant_send( + region_id, + SignificantMsg::CaptureChange { + cmd: change_observer, + region_epoch, + callback, + }, + ) + } + + fn check_leadership( + &self, + region_id: u64, + callback: Callback, + ) -> RaftStoreResult<()> { + self.0 + .significant_send(region_id, SignificantMsg::LeaderCallback(callback)) + } +} diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index a546b286a68..7c71dc3825e 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -120,6 +120,7 @@ pub struct StoreInfo { /// of raftstore. 
pub trait StoreRegionMeta: Send { fn store_id(&self) -> u64; + fn reader(&self, region_id: u64) -> Option<&ReadDelegate>; fn region_read_progress(&self) -> &RegionReadProgressRegistry; fn search_region(&self, start_key: &[u8], end_key: &[u8], visitor: impl FnMut(&Region)); } @@ -189,6 +190,11 @@ impl StoreRegionMeta for StoreMeta { fn region_read_progress(&self) -> &RegionReadProgressRegistry { &self.region_read_progress } + + #[inline] + fn reader(&self, region_id: u64) -> Option<&ReadDelegate> { + self.readers.get(®ion_id) + } } impl StoreMeta { diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 611d8a84424..4739b679393 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -24,11 +24,8 @@ use kvproto::{ use pd_client::PdClient; use protobuf::Message; use raftstore::{ - router::RaftStoreRouter, - store::{ - msg::{Callback, SignificantMsg}, - util::RegionReadProgressRegistry, - }, + router::CdcHandle, + store::{msg::Callback, util::RegionReadProgressRegistry}, }; use security::SecurityManager; use tikv_util::{ @@ -225,18 +222,18 @@ impl LeadershipResolver { &self, regions: Vec, min_ts: TimeStamp, - raft_router: T, + cdc_handle: T, ) -> Vec where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, { let mut reqs = Vec::with_capacity(regions.len()); for region_id in regions { - let raft_router_clone = raft_router.clone(); + let cdc_handle_clone = cdc_handle.clone(); let req = async move { let (tx, rx) = tokio::sync::oneshot::channel(); - let msg = SignificantMsg::LeaderCallback(Callback::read(Box::new(move |resp| { + let callback = Callback::read(Box::new(move |resp| { let resp = if resp.response.get_header().has_error() { None } else { @@ -245,8 +242,8 @@ impl LeadershipResolver { if tx.send(resp).is_err() { error!("cdc send tso response failed"; "region_id" => region_id); } - }))); - if let Err(e) = raft_router_clone.significant_send(region_id, msg) { + })); 
+ if let Err(e) = cdc_handle_clone.check_leadership(region_id, callback) { warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); return None; } diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 8d2ee1631b4..23be4a62fc5 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -19,9 +19,9 @@ use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; use pd_client::PdClient; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, ObserveId}, - router::RaftStoreRouter, + router::CdcHandle, store::{ - fsm::StoreMeta, + fsm::store::StoreRegionMeta, util::{self, RegionReadProgress, RegionReadProgressRegistry}, }, }; @@ -266,11 +266,11 @@ impl ObserveRegion { } } -pub struct Endpoint { +pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, advance_notify: Arc, - store_meta: Arc>, + store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, @@ -279,16 +279,17 @@ pub struct Endpoint { _phantom: PhantomData<(T, E)>, } -impl Endpoint +impl Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, + S: StoreRegionMeta, { pub fn new( cfg: &ResolvedTsConfig, scheduler: Scheduler, - raft_router: T, - store_meta: Arc>, + cdc_handle: T, + store_meta: Arc>, pd_client: Arc, concurrency_manager: ConcurrencyManager, env: Arc, @@ -296,7 +297,7 @@ where ) -> Self { let (region_read_progress, store_id) = { let meta = store_meta.lock().unwrap(); - (meta.region_read_progress.clone(), meta.store_id) + (meta.region_read_progress().clone(), meta.store_id()) }; let advance_worker = AdvanceTsWorker::new( cfg.advance_ts_interval.0, @@ -304,10 +305,10 @@ where scheduler.clone(), concurrency_manager, ); - let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, raft_router); + let scanner_pool = ScannerPool::new(cfg.scan_lock_pool_size, cdc_handle); let store_resolver_gc_interval = 
Duration::from_secs(60); let leader_resolver = LeadershipResolver::new( - store_id.unwrap(), + store_id, pd_client.clone(), env, security_mgr, @@ -315,7 +316,7 @@ where store_resolver_gc_interval, ); let ep = Self { - store_id, + store_id: Some(store_id), cfg: cfg.clone(), advance_notify: Arc::new(Notify::new()), scheduler, @@ -492,8 +493,8 @@ where let region; { let meta = self.store_meta.lock().unwrap(); - match meta.regions.get(®ion_id) { - Some(r) => region = r.clone(), + match meta.reader(region_id) { + Some(r) => region = r.region.as_ref().clone(), None => return, } } @@ -592,8 +593,8 @@ where fn get_or_init_store_id(&mut self) -> Option { self.store_id.or_else(|| { let meta = self.store_meta.lock().unwrap(); - self.store_id = meta.store_id; - meta.store_id + self.store_id = Some(meta.store_id()); + self.store_id }) } } @@ -698,10 +699,11 @@ impl fmt::Display for Task { } } -impl Runnable for Endpoint +impl Runnable for Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, + S: StoreRegionMeta, { type Task = Task; @@ -754,10 +756,11 @@ impl ConfigManager for ResolvedTsConfigManager { const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -impl RunnableWithTimer for Endpoint +impl RunnableWithTimer for Endpoint where - T: 'static + RaftStoreRouter, + T: 'static + CdcHandle, E: KvEngine, + S: StoreRegionMeta, { fn on_timeout(&mut self) { let store_id = self.get_or_init_store_id(); diff --git a/components/resolved_ts/src/scanner.rs b/components/resolved_ts/src/scanner.rs index 7877de718ba..a8c4e5bb44f 100644 --- a/components/resolved_ts/src/scanner.rs +++ b/components/resolved_ts/src/scanner.rs @@ -7,12 +7,8 @@ use futures::compat::Future01CompatExt; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb::Region}; use raftstore::{ coprocessor::{ObserveHandle, ObserveId}, - router::RaftStoreRouter, - store::{ - fsm::ChangeObserver, - msg::{Callback, SignificantMsg}, - RegionSnapshot, - }, + router::CdcHandle, + 
store::{fsm::ChangeObserver, msg::Callback, RegionSnapshot}, }; use tikv::storage::{ kv::{ScanMode as MvccScanMode, Snapshot}, @@ -64,12 +60,12 @@ pub enum ScanEntry { #[derive(Clone)] pub struct ScannerPool { workers: Arc, - raft_router: T, + cdc_handle: T, _phantom: PhantomData, } -impl, E: KvEngine> ScannerPool { - pub fn new(count: usize, raft_router: T) -> Self { +impl, E: KvEngine> ScannerPool { + pub fn new(count: usize, cdc_handle: T) -> Self { let workers = Arc::new( Builder::new_multi_thread() .thread_name("inc-scan") @@ -81,15 +77,15 @@ impl, E: KvEngine> ScannerPool { ); Self { workers, - raft_router, + cdc_handle, _phantom: PhantomData::default(), } } pub fn spawn_task(&self, mut task: ScanTask) { - let raft_router = self.raft_router.clone(); + let cdc_handle = self.cdc_handle.clone(); let fut = async move { - let snap = match Self::get_snapshot(&mut task, raft_router).await { + let snap = match Self::get_snapshot(&mut task, cdc_handle).await { Ok(snap) => snap, Err(e) => { warn!("resolved_ts scan get snapshot failed"; "err" => ?e); @@ -181,7 +177,7 @@ impl, E: KvEngine> ScannerPool { async fn get_snapshot( task: &mut ScanTask, - raft_router: T, + cdc_handle: T, ) -> Result> { let mut last_err = None; for retry_times in 0..=GET_SNAPSHOT_RETRY_TIME { @@ -201,13 +197,11 @@ impl, E: KvEngine> ScannerPool { } let (cb, fut) = tikv_util::future::paired_future_callback(); let change_cmd = ChangeObserver::from_rts(task.region.id, task.handle.clone()); - raft_router.significant_send( + cdc_handle.capture_change( task.region.id, - SignificantMsg::CaptureChange { - cmd: change_cmd, - region_epoch: task.region.get_region_epoch().clone(), - callback: Callback::read(Box::new(cb)), - }, + task.region.get_region_epoch().clone(), + change_cmd, + Callback::read(Box::new(cb)), )?; let mut resp = box_try!(fut.await); if resp.response.get_header().has_error() { diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 
314a11db1a2..36705f9c015 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -13,7 +13,7 @@ use kvproto::{ tikvpb::TikvClient, }; use online_config::ConfigValue; -use raftstore::coprocessor::CoprocessorHost; +use raftstore::{coprocessor::CoprocessorHost, router::CdcRaftRouter}; use resolved_ts::{Observer, Task}; use test_raftstore::*; use tikv::config::ResolvedTsConfig; @@ -81,7 +81,7 @@ impl TestSuite { let rts_endpoint = resolved_ts::Endpoint::new( &cfg, worker.scheduler(), - raft_router, + CdcRaftRouter(raft_router), cluster.store_metas[id].clone(), pd_cli.clone(), cm.clone(), diff --git a/components/server/src/server.rs b/components/server/src/server.rs index cc07ff85471..3243b207aca 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -55,7 +55,7 @@ use raftstore::{ config::SplitCheckConfigManager, BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, RawConsistencyCheckObserver, RegionInfoAccessor, }, - router::ServerRaftStoreRouter, + router::{CdcRaftRouter, ServerRaftStoreRouter}, store::{ config::RaftstoreConfigManager, fsm, @@ -936,8 +936,8 @@ where self.core.config.storage.api_version(), self.pd_client.clone(), cdc_scheduler.clone(), - self.router.clone(), - self.engines.as_ref().unwrap().engines.kv.clone(), + CdcRaftRouter(self.router.clone()), + LocalTablets::Singleton(self.engines.as_ref().unwrap().engines.kv.clone()), cdc_ob, engines.store_meta.clone(), self.concurrency_manager.clone(), @@ -954,7 +954,7 @@ where let rts_endpoint = resolved_ts::Endpoint::new( &self.core.config.resolved_ts, rts_worker.scheduler(), - self.router.clone(), + CdcRaftRouter(self.router.clone()), engines.store_meta.clone(), self.pd_client.clone(), self.concurrency_manager.clone(), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index a29c344884f..83b83ad190e 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -23,6 
+23,7 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use causal_ts::CausalTsProviderImpl; +use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; use engine_rocks::{from_rocks_compression_type, RocksEngine, RocksStatistics}; use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine, TabletRegistry, CF_DEFAULT, CF_WRITE}; @@ -31,8 +32,8 @@ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; use kvproto::{ - brpb::create_backup, deadlock::create_deadlock, diagnosticspb::create_diagnostics, - import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, + brpb::create_backup, cdcpb_grpc::create_change_data, deadlock::create_deadlock, + diagnosticspb::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, resource_usage_agent::create_resource_metering_pub_sub, }; use pd_client::{PdClient, RpcClient}; @@ -87,7 +88,7 @@ use tikv_util::{ sys::{disk, path_in_diff_mount_point, register_memory_usage_high_water, SysQuota}, thread_group::GroupProperties, time::{Instant, Monitor}, - worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + worker::{Builder as WorkerBuilder, LazyWorker, Scheduler}, yatp_pool::CleanupMethod, Either, }; @@ -186,7 +187,8 @@ struct TikvServer { coprocessor_host: Option>, concurrency_manager: ConcurrencyManager, env: Arc, - check_leader_worker: Worker, + cdc_scheduler: Option>, + cdc_memory_quota: Option, sst_worker: Option>>, quota_limiter: Arc, resource_manager: Option>, @@ -295,10 +297,6 @@ where info!("Causal timestamp provider startup."); } - // Run check leader in a dedicate thread, because it is time sensitive - // and crucial to TiCDC replication lag. 
- let check_leader_worker = WorkerBuilder::new("check-leader").thread_count(1).create(); - TikvServer { core: TikvServerCore { config, @@ -325,7 +323,8 @@ where coprocessor_host: None, concurrency_manager, env, - check_leader_worker, + cdc_scheduler: None, + cdc_memory_quota: None, sst_worker: None, quota_limiter, resource_manager, @@ -381,7 +380,7 @@ where ); lock_mgr.register_detector_role_change_observer(self.coprocessor_host.as_mut().unwrap()); - let engines = self.engines.as_ref().unwrap(); + let engines = self.engines.as_mut().unwrap(); let pd_worker = LazyWorker::new("pd-worker"); let pd_sender = raftstore_v2::PdReporter::new( @@ -551,13 +550,82 @@ where unified_read_pool_scale_receiver = Some(rx); } + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. + let check_leader_worker = + Box::new(WorkerBuilder::new("check-leader").thread_count(1).create()); + // Create check leader runer. let check_leader_runner = CheckLeaderRunner::new( self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), ); - let check_leader_scheduler = self - .check_leader_worker - .start("check-leader", check_leader_runner); + let check_leader_scheduler = check_leader_worker.start("check-leader", check_leader_runner); + self.core.to_stop.push(check_leader_worker); + + // Create cdc worker. + let mut cdc_worker = Box::new(LazyWorker::new("cdc")); + let cdc_scheduler = cdc_worker.scheduler(); + let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); + engines + .engine + .set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); + // Register cdc observer. + let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); + cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register cdc config manager. + cfg_controller.register( + tikv::config::Module::Cdc, + Box::new(CdcConfigManager(cdc_worker.scheduler())), + ); + // Start cdc endpoint. 
+ let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); + let cdc_endpoint = cdc::Endpoint::new( + self.core.config.server.cluster_id, + &self.core.config.cdc, + self.core.config.storage.api_version(), + self.pd_client.clone(), + cdc_scheduler.clone(), + self.router.clone().unwrap(), + LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), + cdc_ob, + self.router.as_ref().unwrap().store_meta().clone(), + self.concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + cdc_memory_quota.clone(), + self.causal_ts_provider.clone(), + ); + cdc_worker.start_with_timer(cdc_endpoint); + self.core.to_stop.push(cdc_worker); + self.cdc_scheduler = Some(cdc_scheduler); + self.cdc_memory_quota = Some(cdc_memory_quota); + + // Create resolved ts. + if self.core.config.resolved_ts.enable { + let mut rts_worker = Box::new(LazyWorker::new("resolved-ts")); + // Register the resolved ts observer + let resolved_ts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); + resolved_ts_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register config manager for resolved ts worker + cfg_controller.register( + tikv::config::Module::ResolvedTs, + Box::new(resolved_ts::ResolvedTsConfigManager::new( + rts_worker.scheduler(), + )), + ); + let rts_endpoint = resolved_ts::Endpoint::new( + &self.core.config.resolved_ts, + rts_worker.scheduler(), + self.router.clone().unwrap(), + self.router.as_ref().unwrap().store_meta().clone(), + self.pd_client.clone(), + self.concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + ); + rts_worker.start_with_timer(rts_endpoint); + self.core.to_stop.push(rts_worker); + } let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); @@ -775,6 +843,18 @@ where .unwrap() .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + let cdc_service = cdc::Service::new( + self.cdc_scheduler.as_ref().unwrap().clone(), + 
self.cdc_memory_quota.as_ref().unwrap().clone(), + ); + if servers + .server + .register_service(create_change_data(cdc_service)) + .is_some() + { + fatal!("failed to register cdc service"); + } + // Create Diagnostics service let diag_service = DiagnosticsService::new( servers.server.get_debug_thread_pool().clone(), diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index e7b43850e27..967ae4b980c 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -33,7 +33,7 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CoprocessorHost, RegionInfoAccessor}, errors::Error as RaftError, - router::{LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, + router::{CdcRaftRouter, LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, @@ -355,7 +355,7 @@ impl ServerCluster { let rts_endpoint = resolved_ts::Endpoint::new( &cfg.resolved_ts, rts_worker.scheduler(), - raft_router, + CdcRaftRouter(raft_router), store_meta.clone(), self.pd_client.clone(), concurrency_manager.clone(), From ba805983dbadacee514bc01918d6c544f3426f4f Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Wed, 12 Apr 2023 15:09:01 +0800 Subject: [PATCH 625/676] storage: implement the row value checksum encode logic in tikv side (#14529) ref tikv/tikv#14528 Signed-off-by: cfzjywxk Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/tidb_query_datatype/Cargo.toml | 1 + .../src/codec/row/v2/encoder_for_test.rs | 186 ++++++++++++++++++ 3 files changed, 188 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index a508216a0e9..ff47b828c17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6321,6 +6321,7 @@ dependencies = [ "chrono-tz", "codec", "collections", + "crc32fast", "encoding_rs 0.8.29 (git+https://github.com/xiongjiwei/encoding_rs.git?rev=68e0bc5a72a37a78228d80cd98047326559cf43c)", "error_code", "hex 0.4.2", 
diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index c1be29a956d..97fb2d101b6 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -16,6 +16,7 @@ chrono = "0.4" chrono-tz = "0.5.1" codec = { workspace = true } collections = { workspace = true } +crc32fast = "1.2" encoding_rs = { git = "https://github.com/xiongjiwei/encoding_rs.git", rev = "68e0bc5a72a37a78228d80cd98047326559cf43c" } error_code = { workspace = true } hex = "0.4" diff --git a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs index bedbc7324ce..343f2520230 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs @@ -47,6 +47,7 @@ const MAX_U8: u64 = u8::MAX as u64; const MAX_U16: u64 = u16::MAX as u64; const MAX_U32: u64 = u32::MAX as u64; +#[derive(Clone)] pub struct Column { id: i64, value: ScalarValue, @@ -89,8 +90,107 @@ impl Column { } } +/// Checksum +/// - HEADER(1 byte) +/// - VER: version(3 bit) +/// - E: has extra checksum +/// - CHECKSUM(4 bytes) +/// - little-endian CRC32(IEEE) when hdr.ver = 0 (default) +pub trait ChecksumHandler { + // update_col updates the checksum with the encoded value of the column. + fn checksum(&mut self, buf: &[u8]) -> Result<()>; + + // header_value returns the checksum header value. + fn header_value(&self) -> u8; + + // value returns the checksum value. 
+ fn value(&self) -> u32; +} + +pub struct Crc32RowChecksumHandler { + header: ChecksumHeader, + hasher: crc32fast::Hasher, +} + +impl ChecksumHandler for Crc32RowChecksumHandler { + fn checksum(&mut self, buf: &[u8]) -> Result<()> { + self.hasher.update(buf); + Ok(()) + } + + fn header_value(&self) -> u8 { + self.header.value() + } + + fn value(&self) -> u32 { + self.hasher.clone().finalize() + } +} + +pub struct ChecksumHeader(u8); + +impl ChecksumHeader { + fn new() -> Self { + ChecksumHeader(0) + } + + #[cfg(test)] + fn set_version(&mut self, ver: u8) { + self.0 &= !0b111; + self.0 |= ver & 0b111; + } + + fn set_extra_checksum(&mut self) { + self.0 |= 0b1000; + } + + fn value(&self) -> u8 { + self.0 + } +} + +impl Crc32RowChecksumHandler { + pub fn new(has_extra_checksum: bool) -> Self { + let mut res = Crc32RowChecksumHandler { + header: ChecksumHeader::new(), + hasher: crc32fast::Hasher::new(), + }; + if has_extra_checksum { + res.header.set_extra_checksum(); + } + + res + } +} + +impl Default for Crc32RowChecksumHandler { + fn default() -> Self { + Self::new(false) + } +} + pub trait RowEncoder: NumberEncoder { fn write_row(&mut self, ctx: &mut EvalContext, columns: Vec) -> Result<()> { + self.write_row_impl(ctx, columns, None, None) + } + + fn write_row_with_checksum( + &mut self, + ctx: &mut EvalContext, + columns: Vec, + extra_checksum: Option, + ) -> Result<()> { + let mut handler = Crc32RowChecksumHandler::new(extra_checksum.is_some()); + self.write_row_impl(ctx, columns, Some(&mut handler), extra_checksum) + } + + fn write_row_impl( + &mut self, + ctx: &mut EvalContext, + columns: Vec, + mut checksum_handler: Option<&mut dyn ChecksumHandler>, + extra_checksum: Option, + ) -> Result<()> { let mut is_big = false; let mut null_ids = Vec::with_capacity(columns.len()); let mut non_null_ids = Vec::with_capacity(columns.len()); @@ -140,6 +240,18 @@ pub trait RowEncoder: NumberEncoder { } self.write_bytes(&offset_wtr)?; self.write_bytes(&value_wtr)?; + + if 
let Some(checksum_handler) = checksum_handler.as_mut() { + let header_val = checksum_handler.header_value(); + checksum_handler.checksum(value_wtr.as_slice())?; + let val = checksum_handler.value(); + self.write_u8(header_val)?; + self.write_u32_le(val)?; + if let Some(extra) = extra_checksum { + self.write_u32_le(extra)?; + } + } + Ok(()) } @@ -226,11 +338,16 @@ impl ScalarValueEncoder for T {} mod tests { use std::str::FromStr; + use codec::number::NumberDecoder; + use super::{Column, RowEncoder}; use crate::{ codec::{ data_type::ScalarValue, mysql::{duration::NANOS_PER_SEC, Decimal, Duration, Json, Time}, + row::v2::encoder_for_test::{ + ChecksumHandler, Crc32RowChecksumHandler, ScalarValueEncoder, + }, }, expr::EvalContext, }; @@ -303,4 +420,73 @@ mod tests { assert_eq!(exp, buf); } + + #[test] + fn test_encode_checksum() { + let encode_col_values = |ctx: &mut EvalContext, non_null_cols: Vec| -> Vec { + let mut res = vec![]; + for col in non_null_cols { + res.write_value(ctx, &col).unwrap(); + } + res + }; + let get_non_null_columns = |cols: &Vec| -> Vec { + let mut res = vec![]; + for col in cols { + if col.value.is_some() { + res.push(col.clone()); + } + } + res.sort_by_key(|c| c.id); + res + }; + let cols = vec![ + Column::new(1, 1000), + Column::new(12, 2), + Column::new(335, ScalarValue::Int(None)), + Column::new(3, 3), + Column::new(8, 32767), + ]; + + let mut buf = vec![]; + let mut handler = Crc32RowChecksumHandler::new(false); + handler.header.set_version(0); + buf.write_row_impl( + &mut EvalContext::default(), + cols.clone(), + Some(&mut handler), + None, + ) + .unwrap(); + + let exp = { + let mut hasher = crc32fast::Hasher::new(); + hasher.update( + encode_col_values(&mut EvalContext::default(), get_non_null_columns(&cols)) + .as_slice(), + ); + hasher.finalize() + }; + let mut val_slice = &buf[buf.len() - 4..]; + assert_eq!(exp, handler.value()); + assert_eq!(exp, val_slice.read_u32_le().unwrap()); + assert_eq!(0, handler.header_value()); + + 
buf.clear(); + let mut handler = Crc32RowChecksumHandler::new(true); + handler.header.set_version(1); + buf.write_row_impl( + &mut EvalContext::default(), + cols, + Some(&mut handler), + Some(exp), + ) + .unwrap(); + let mut val_slice = &buf[buf.len() - 4..]; + let mut extra_val_slice = &buf[buf.len() - 8..buf.len() - 4]; + assert_eq!(exp, handler.value()); + assert_eq!(exp, val_slice.read_u32_le().unwrap()); + assert_eq!(exp, extra_val_slice.read_u32_le().unwrap()); + assert_eq!(9, handler.header_value()); + } } From a4f995fb138c4aaa51475a625fa06795e92525ce Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 12 Apr 2023 16:31:01 +0800 Subject: [PATCH 626/676] server2: disable cdc and resolved_ts as they are not fully implemented (#14560) ref tikv/tikv#14542 Disable cdc and resolved_ts as they are not fully implemented Signed-off-by: Neil Shen --- components/server/src/server2.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 83b83ad190e..86d3a9a696f 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -563,7 +563,7 @@ where self.core.to_stop.push(check_leader_worker); // Create cdc worker. - let mut cdc_worker = Box::new(LazyWorker::new("cdc")); + let cdc_worker = Box::new(LazyWorker::new("cdc")); let cdc_scheduler = cdc_worker.scheduler(); let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); engines @@ -579,7 +579,7 @@ where ); // Start cdc endpoint. 
let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); - let cdc_endpoint = cdc::Endpoint::new( + let _cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, self.core.config.storage.api_version(), @@ -595,14 +595,15 @@ where cdc_memory_quota.clone(), self.causal_ts_provider.clone(), ); - cdc_worker.start_with_timer(cdc_endpoint); - self.core.to_stop.push(cdc_worker); + // TODO: enable cdc. + // cdc_worker.start_with_timer(cdc_endpoint); + // self.core.to_stop.push(cdc_worker); self.cdc_scheduler = Some(cdc_scheduler); self.cdc_memory_quota = Some(cdc_memory_quota); // Create resolved ts. if self.core.config.resolved_ts.enable { - let mut rts_worker = Box::new(LazyWorker::new("resolved-ts")); + let rts_worker = Box::new(LazyWorker::new("resolved-ts")); // Register the resolved ts observer let resolved_ts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); resolved_ts_ob.register_to(self.coprocessor_host.as_mut().unwrap()); @@ -613,7 +614,7 @@ where rts_worker.scheduler(), )), ); - let rts_endpoint = resolved_ts::Endpoint::new( + let _rts_endpoint = resolved_ts::Endpoint::new( &self.core.config.resolved_ts, rts_worker.scheduler(), self.router.clone().unwrap(), @@ -623,8 +624,9 @@ where self.env.clone(), self.security_mgr.clone(), ); - rts_worker.start_with_timer(rts_endpoint); - self.core.to_stop.push(rts_worker); + // TODO: enable resolved_ts. 
+ // rts_worker.start_with_timer(rts_endpoint); + // self.core.to_stop.push(rts_worker); } let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); From 3630ba96c37092f7309eb7e858500dbd76d8614d Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Wed, 12 Apr 2023 19:11:02 +0800 Subject: [PATCH 627/676] txn: Support check for_update_ts when prewriting (#14492) ref tikv/tikv#14311 Supports checking for_update_ts for specific keys during prewrite to avoid potential lost update that might be caused by allowing locking with conflict. Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/resolved_ts/src/cmd.rs | 1 + src/storage/mod.rs | 6 + src/storage/mvcc/mod.rs | 34 ++- src/storage/mvcc/reader/reader.rs | 2 + src/storage/mvcc/txn.rs | 9 +- .../txn/actions/acquire_pessimistic_lock.rs | 2 - src/storage/txn/actions/prewrite.rs | 196 ++++++++++++- src/storage/txn/actions/tests.rs | 90 +++++- src/storage/txn/commands/mod.rs | 23 ++ src/storage/txn/commands/prewrite.rs | 263 ++++++++++++++++-- src/storage/txn/mod.rs | 1 + src/storage/txn/store.rs | 1 + tests/benches/hierarchy/mvcc/mod.rs | 2 + tests/benches/hierarchy/txn/mod.rs | 2 + tests/failpoints/cases/test_storage.rs | 4 + 16 files changed, 594 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff47b828c17..b654e34fb77 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2792,7 +2792,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#af969693ce8a7884e5bdc5d81c728f657d33065a" +source = "git+https://github.com/pingcap/kvproto.git#ce835ae20dfcb5f69f0aea04236070932c815b6a" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index d3bda563a4f..47d14304112 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -424,6 +424,7 @@ 
mod tests { Mutation::make_put(k1.clone(), b"v4".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); one_pc_commit(true, &mut txn, 10.into()); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 8f955f3850d..11740bcc2bf 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -8254,6 +8254,7 @@ mod tests { None, false, AssertionLevel::Off, + vec![], Context::default(), ), expect_ok_callback(tx.clone(), 0), @@ -9620,6 +9621,7 @@ mod tests { Some(vec![b"e".to_vec()]), false, AssertionLevel::Off, + vec![], Context::default(), ), Box::new(move |res| { @@ -9718,6 +9720,7 @@ mod tests { None, false, AssertionLevel::Off, + vec![], Default::default(), ), expect_ok_callback(tx.clone(), 0), @@ -9768,6 +9771,7 @@ mod tests { Some(vec![k2.to_vec()]), false, AssertionLevel::Off, + vec![], Default::default(), ), expect_ok_callback(tx.clone(), 0), @@ -10604,6 +10608,7 @@ mod tests { None, false, AssertionLevel::Off, + vec![], Context::default(), ), Box::new(move |res| { @@ -10662,6 +10667,7 @@ mod tests { None, false, AssertionLevel::Off, + vec![], Context::default(), ), Box::new(move |res| { diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 3dca7a219f9..0f133b99941 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -132,10 +132,14 @@ pub enum ErrorInner { KeyVersion, #[error( - "pessimistic lock not found, start_ts:{}, key:{}", - .start_ts, log_wrappers::Value::key(.key) + "pessimistic lock not found, start_ts:{}, key:{}, reason: {:?}", + .start_ts, log_wrappers::Value::key(.key), .reason )] - PessimisticLockNotFound { start_ts: TimeStamp, key: Vec }, + PessimisticLockNotFound { + start_ts: TimeStamp, + key: Vec, + reason: PessimisticLockNotFoundReason, + }, #[error( "min_commit_ts {} is larger than max_commit_ts {}, start_ts: {}", @@ -257,12 +261,15 @@ impl ErrorInner { key: key.to_owned(), }) } - ErrorInner::PessimisticLockNotFound { start_ts, key } => { - Some(ErrorInner::PessimisticLockNotFound { - start_ts: *start_ts, 
- key: key.to_owned(), - }) - } + ErrorInner::PessimisticLockNotFound { + start_ts, + key, + reason, + } => Some(ErrorInner::PessimisticLockNotFound { + start_ts: *start_ts, + key: key.to_owned(), + reason: *reason, + }), ErrorInner::CommitTsTooLarge { start_ts, min_commit_ts, @@ -421,6 +428,15 @@ pub fn default_not_found_error(key: Vec, hint: &str) -> Error { } } +#[derive(Debug, Clone, Copy)] +pub enum PessimisticLockNotFoundReason { + LockTsMismatch, + LockMissingAmendFail, + LockForUpdateTsMismatch, + NonLockKeyConflict, + FailpointInjected, +} + pub mod tests { use std::borrow::Cow; diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 36e8816ad25..7c15c6d7735 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -886,6 +886,7 @@ pub mod tests { m, &None, SkipPessimisticCheck, + None, ) .unwrap(); self.write(txn.into_modifies()); @@ -910,6 +911,7 @@ pub mod tests { m, &None, DoPessimisticCheck, + None, ) .unwrap(); self.write(txn.into_modifies()); diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 9e87bf748b7..f395b07e7f8 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -9,7 +9,7 @@ use kvproto::kvrpcpb::LockInfo; use txn_types::{Key, Lock, PessimisticLock, TimeStamp, Value}; use super::metrics::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}; -use crate::storage::kv::Modify; +use crate::storage::{kv::Modify, mvcc::PessimisticLockNotFoundReason}; pub const MAX_TXN_WRITE_SIZE: usize = 32 * 1024; @@ -306,6 +306,7 @@ pub(crate) fn make_txn_error( "pessimisticlocknotfound" => ErrorInner::PessimisticLockNotFound { start_ts, key: key.to_raw().unwrap(), + reason: PessimisticLockNotFoundReason::FailpointInjected, }, _ => ErrorInner::Other(box_err!("unexpected error string")), } @@ -815,6 +816,7 @@ pub(crate) mod tests { Mutation::make_put(key.clone(), v.to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); assert!(txn.write_size() > 0); @@ 
-859,6 +861,7 @@ pub(crate) mod tests { Mutation::make_put(Key::from_raw(key), value.to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap_err(); @@ -872,6 +875,7 @@ pub(crate) mod tests { Mutation::make_put(Key::from_raw(key), value.to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); } @@ -1312,6 +1316,7 @@ pub(crate) mod tests { mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); let modifies = txn.into_modifies(); @@ -1370,6 +1375,7 @@ pub(crate) mod tests { mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), DoPessimisticCheck, + None, ) .unwrap(); let modifies = txn.into_modifies(); @@ -1439,6 +1445,7 @@ pub(crate) mod tests { mutation, &Some(vec![b"key1".to_vec(), b"key2".to_vec(), b"key3".to_vec()]), DoPessimisticCheck, + None, ) .unwrap(); assert_eq!(min_commit_ts.into_inner(), 100); diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 987af9fbed7..afdbace9e7a 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -452,7 +452,6 @@ pub mod tests { TestEngineBuilder, }; - #[cfg(test)] pub fn acquire_pessimistic_lock_allow_lock_with_conflict( engine: &mut E, key: &[u8], @@ -496,7 +495,6 @@ pub mod tests { res.map(|r| r.0) } - #[cfg(test)] pub fn must_succeed_allow_lock_with_conflict( engine: &mut E, key: &[u8], diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 1e655846d08..69cf8b32578 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -19,7 +19,8 @@ use crate::storage::{ MVCC_CONFLICT_COUNTER, MVCC_DUPLICATE_CMD_COUNTER_VEC, MVCC_PREWRITE_ASSERTION_PERF_COUNTER_VEC, }, - Error, ErrorInner, Lock, LockType, MvccTxn, Result, SnapshotReader, + Error, ErrorInner, Lock, LockType, MvccTxn, PessimisticLockNotFoundReason, Result, + 
SnapshotReader, }, txn::{ actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, @@ -36,6 +37,7 @@ pub fn prewrite( mutation: Mutation, secondary_keys: &Option>>, pessimistic_action: PrewriteRequestPessimisticAction, + expected_for_update_ts: Option, ) -> Result<(TimeStamp, OldValue)> { let mut mutation = PrewriteMutation::from_mutation(mutation, secondary_keys, pessimistic_action, txn_props)?; @@ -63,7 +65,7 @@ pub fn prewrite( let mut lock_amended = false; let lock_status = match reader.load_lock(&mutation.key)? { - Some(lock) => mutation.check_lock(lock, pessimistic_action)?, + Some(lock) => mutation.check_lock(lock, pessimistic_action, expected_for_update_ts)?, None if matches!(pessimistic_action, DoPessimisticCheck) => { amend_pessimistic_lock(&mut mutation, reader)?; lock_amended = true; @@ -218,16 +220,18 @@ pub enum TransactionKind { Pessimistic(TimeStamp), } +#[derive(Clone, Copy)] enum LockStatus { // Lock has already been locked; min_commit_ts of lock. Locked(TimeStamp), - Pessimistic, + // Key is pessimistic-locked; for_update_ts of lock. + Pessimistic(TimeStamp), None, } impl LockStatus { fn has_pessimistic_lock(&self) -> bool { - matches!(self, LockStatus::Pessimistic) + matches!(self, LockStatus::Pessimistic(_)) } } @@ -309,6 +313,7 @@ impl<'a> PrewriteMutation<'a> { &mut self, lock: Lock, pessimistic_action: PrewriteRequestPessimisticAction, + expected_for_update_ts: Option, ) -> Result { if lock.ts != self.txn_props.start_ts { // Abort on lock belonging to other transaction if @@ -323,6 +328,7 @@ impl<'a> PrewriteMutation<'a> { return Err(ErrorInner::PessimisticLockNotFound { start_ts: self.txn_props.start_ts, key: self.key.to_raw()?, + reason: PessimisticLockNotFoundReason::LockTsMismatch, } .into()); } @@ -344,12 +350,59 @@ impl<'a> PrewriteMutation<'a> { .into()); } + if let Some(ts) = expected_for_update_ts && lock.for_update_ts != ts { + // The constraint on for_update_ts of the pessimistic lock is violated. 
+ // Consider the following case: + // + // 1. A pessimistic lock of transaction `T1` succeeded with`WakeUpModeForceLock` + // enabled, then it returns to the client and the client continues its + // execution. + // 2. The lock is lost for some reason such as pipelined locking or in-memory + // pessimistic lock. + // 3. Another transaction `T2` writes the key and committed. + // 4. The key then receives a stale pessimistic lock request of `T1` that has + // been received in step 1 (maybe because of retrying due to network issue + // in step 1). Since it allows locking with conflict, though there's a newer + // version that's later than the request's `for_update_ts`, the request can + // still acquire the lock. However no one will check the response, which + // tells the latest commit_ts it met. + // 5. The transaction `T1` commits. When it prewrites it checks if each key is + // pessimistic-locked. + // + // Transaction `T1` won't notice anything wrong without this check since it + // does have a pessimistic lock of the same transaction. However, actually + // one of the key is locked in a larger version than that the client would + // expect. As a result, the conflict between transaction `T1` and `T2` is + // missed. + // To avoid this problem, we check the for_update_ts written on the + // pessimistic locks that's acquired in force-locking mode. If it doesn't match + // the one known by the client, the lock that we expected to have will be + // regarded as missing. + // + // It's actually theoretically safe to allow `lock.for_update_ts` < + // `expected_for_update_ts`, but the possibility to encounter this case is very + // low. For simplicity, we don't consider that case and only allow + // `lock.for_update_ts` to exactly match that we expect. + warn!("pessimistic lock have different for_update_ts than expected. 
the expected lock must have been lost"; + "key" => %self.key, + "start_ts" => self.txn_props.start_ts, + "expected_for_update_ts" => ts, + "lock" => ?lock); + + return Err(ErrorInner::PessimisticLockNotFound { + start_ts: self.txn_props.start_ts, + key: self.key.to_raw()?, + reason: PessimisticLockNotFoundReason::LockForUpdateTsMismatch, + } + .into()); + } + // The lock is pessimistic and owned by this txn, go through to overwrite it. // The ttl and min_commit_ts of the lock may have been pushed forward. self.lock_ttl = std::cmp::max(self.lock_ttl, lock.ttl); self.min_commit_ts = std::cmp::max(self.min_commit_ts, lock.min_commit_ts); - return Ok(LockStatus::Pessimistic); + return Ok(LockStatus::Pessimistic(lock.for_update_ts)); } // Duplicated command. No need to overwrite the lock and data. @@ -430,6 +483,7 @@ impl<'a> PrewriteMutation<'a> { return Err(ErrorInner::PessimisticLockNotFound { start_ts: self.txn_props.start_ts, key: self.key.clone().into_raw()?, + reason: PessimisticLockNotFoundReason::NonLockKeyConflict, } .into()); } @@ -458,13 +512,20 @@ impl<'a> PrewriteMutation<'a> { ) -> Result { let mut try_one_pc = self.try_one_pc(); + let for_update_ts_to_write = match (self.txn_props.for_update_ts(), lock_status) { + (from_prewrite_req, LockStatus::Pessimistic(from_pessimistic_lock)) => { + std::cmp::max(from_prewrite_req, from_pessimistic_lock) + } + (for_update_ts_from_req, _) => for_update_ts_from_req, + }; + let mut lock = Lock::new( self.lock_type.unwrap(), self.txn_props.primary.to_vec(), self.txn_props.start_ts, self.lock_ttl, None, - self.txn_props.for_update_ts(), + for_update_ts_to_write, self.txn_props.txn_size, self.min_commit_ts, ) @@ -758,6 +819,7 @@ fn amend_pessimistic_lock( return Err(ErrorInner::PessimisticLockNotFound { start_ts: reader.start_ts, key: mutation.key.clone().into_raw()?, + reason: PessimisticLockNotFoundReason::LockMissingAmendFail, } .into()); } @@ -798,7 +860,10 @@ pub mod tests { #[cfg(test)] use crate::storage::{ 
kv::RocksSnapshot, - txn::{commands::prewrite::fallback_1pc_locks, tests::*}, + txn::{ + commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, + commands::prewrite::fallback_1pc_locks, tests::*, + }, }; use crate::storage::{mvcc::tests::*, Engine}; @@ -869,6 +934,7 @@ pub mod tests { Mutation::make_insert(Key::from_raw(key), value.to_vec()), &None, SkipPessimisticCheck, + None, )?; // Insert must be None if the key is not lock, or be Unspecified if the // key is already locked. @@ -900,6 +966,7 @@ pub mod tests { Mutation::make_check_not_exists(Key::from_raw(key)), &None, DoPessimisticCheck, + None, )?; assert_eq!(old_value, OldValue::Unspecified); Ok(()) @@ -922,6 +989,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -935,6 +1003,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &Some(vec![]), SkipPessimisticCheck, + None, ) .unwrap_err(); assert!(matches!( @@ -970,6 +1039,7 @@ pub mod tests { Mutation::make_check_not_exists(Key::from_raw(b"k0")), &Some(vec![]), SkipPessimisticCheck, + None, ) .unwrap(); assert!(min_ts > props.start_ts); @@ -990,6 +1060,7 @@ pub mod tests { Mutation::make_check_not_exists(Key::from_raw(b"k0")), &Some(vec![]), SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!(cm.max_ts(), props.start_ts); @@ -1005,6 +1076,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); assert!(min_ts > 42.into()); @@ -1028,6 +1100,7 @@ pub mod tests { mutation.clone(), &Some(vec![b"k4".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); assert!(min_ts > 44.into()); @@ -1050,6 +1123,7 @@ pub mod tests { mutation.clone(), &Some(vec![b"k6".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); assert!(min_ts > 45.into()); @@ -1069,6 +1143,7 @@ pub mod tests { 
mutation.clone(), &Some(vec![b"k8".to_vec()]), SkipPessimisticCheck, + None, ) .unwrap(); assert!(min_ts >= 46.into()); @@ -1099,6 +1174,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1112,6 +1188,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap_err(); assert!(matches!( @@ -1159,6 +1236,7 @@ pub mod tests { Mutation::make_check_not_exists(Key::from_raw(key)), &None, SkipPessimisticCheck, + None, )?; assert_eq!(old_value, OldValue::Unspecified); Ok(()) @@ -1197,6 +1275,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &Some(vec![b"k2".to_vec()]), DoPessimisticCheck, + None, ) .unwrap(); // Pessimistic txn skips constraint check, does not read previous write. @@ -1211,6 +1290,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &Some(vec![]), DoPessimisticCheck, + None, ) .unwrap_err(); } @@ -1248,6 +1328,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"v1".to_vec()), &None, DoPessimisticCheck, + None, ) .unwrap(); // Pessimistic txn skips constraint check, does not read previous write. 
@@ -1262,6 +1343,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k2"), b"v2".to_vec()), &None, DoPessimisticCheck, + None, ) .unwrap_err(); } @@ -1369,6 +1451,7 @@ pub mod tests { Mutation::make_check_not_exists(Key::from_raw(key)), &None, SkipPessimisticCheck, + None, ); if success { let res = res.unwrap(); @@ -1384,6 +1467,7 @@ pub mod tests { Mutation::make_insert(Key::from_raw(key), b"value".to_vec()), &None, SkipPessimisticCheck, + None, ); if success { let res = res.unwrap(); @@ -1440,6 +1524,7 @@ pub mod tests { Mutation::make_put(key.clone(), b"value".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!(&old_value, expected_value, "key: {}", key); @@ -1694,6 +1779,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(b"k1"), b"value".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!( @@ -1749,6 +1835,7 @@ pub mod tests { Mutation::make_insert(Key::from_raw(b"k1"), b"v2".to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1887,6 +1974,7 @@ pub mod tests { Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), &None, SkipPessimisticCheck, + None, )?; Ok(old_value) })], @@ -1924,6 +2012,7 @@ pub mod tests { Mutation::make_insert(Key::from_raw(key), b"v2".to_vec()), &None, SkipPessimisticCheck, + None, )?; Ok(old_value) })], @@ -2507,4 +2596,97 @@ pub mod tests { assert_eq!(lock.versions_to_last_change, 0); must_rollback(&mut engine, key, 40, false); } + + #[test] + fn test_pessimistic_prewrite_check_for_update_ts() { + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + let value = b"v"; + + let prewrite = &must_pessimistic_prewrite_put_check_for_update_ts; + let prewrite_err = &must_pessimistic_prewrite_put_check_for_update_ts_err; + + let mut test_normal = |start_ts: u64, + lock_for_update_ts: u64, + prewrite_req_for_update_ts: u64, + expected_for_update_ts: u64, + success: bool, + commit_ts: u64| { + // In actual 
cases these kinds of pessimistic locks should be locked in + // `allow_locking_with_conflict` mode. For simplicity, we pass a large + // for_update_ts to the pessimistic lock to simulate that case. + must_acquire_pessimistic_lock(&mut engine, key, key, start_ts, lock_for_update_ts); + must_pessimistic_locked(&mut engine, key, start_ts, lock_for_update_ts); + if success { + prewrite( + &mut engine, + key, + value, + key, + start_ts, + prewrite_req_for_update_ts, + Some(expected_for_update_ts), + ); + must_locked(&mut engine, key, start_ts); + // Test idempotency. + prewrite( + &mut engine, + key, + value, + key, + start_ts, + prewrite_req_for_update_ts, + Some(expected_for_update_ts), + ); + let prewrite_lock = must_locked(&mut engine, key, start_ts); + assert_le!( + TimeStamp::from(lock_for_update_ts), + prewrite_lock.for_update_ts + ); + must_commit(&mut engine, key, start_ts, commit_ts); + must_unlocked(&mut engine, key); + } else { + let e = prewrite_err( + &mut engine, + key, + value, + key, + start_ts, + prewrite_req_for_update_ts, + Some(expected_for_update_ts), + ); + match e { + Error(box ErrorInner::PessimisticLockNotFound { .. }) => (), + e => panic!("unexpected error: {:?}", e), + } + must_pessimistic_locked(&mut engine, key, start_ts, lock_for_update_ts); + must_pessimistic_rollback(&mut engine, key, start_ts, lock_for_update_ts); + must_unlocked(&mut engine, key); + } + }; + + test_normal(10, 10, 10, 10, true, 19); + // Note that the `for_update_ts` field in prewrite request is not guaranteed to + // be greater or equal to the max for_update_ts that has been written to + // a pessimistic lock during the transaction. + test_normal(20, 20, 20, 24, false, 0); + test_normal(30, 35, 30, 35, true, 39); + test_normal(40, 45, 40, 40, false, 0); + test_normal(50, 55, 56, 51, false, 0); + + // Amend pessimistic lock cases. Once amend-lock is passed, it can be guaranteed + // there are no conflict, so the check won't fail. + // Amending succeeds. 
+ must_unlocked(&mut engine, key); + prewrite(&mut engine, key, value, key, 100, 105, Some(102)); + must_locked(&mut engine, key, 100); + must_commit(&mut engine, key, 100, 125); + + // Amending fails. + must_unlocked(&mut engine, key); + prewrite_err(&mut engine, key, value, key, 120, 120, Some(120)); + must_unlocked(&mut engine, key); + prewrite_err(&mut engine, key, value, key, 120, 130, Some(130)); + must_unlocked(&mut engine, key); + } } diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index e6872ef493f..0fc73804aff 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -33,7 +33,7 @@ pub fn must_prewrite_put_impl( is_retry_request: bool, assertion: Assertion, assertion_level: AssertionLevel, -) { +) -> TimeStamp { must_prewrite_put_impl_with_should_not_exist( engine, key, @@ -42,6 +42,7 @@ pub fn must_prewrite_put_impl( secondary_keys, ts, pessimistic_action, + None, lock_ttl, for_update_ts, txn_size, @@ -53,7 +54,7 @@ pub fn must_prewrite_put_impl( false, None, 0, - ); + ) } pub fn must_prewrite_insert_impl( @@ -81,6 +82,7 @@ pub fn must_prewrite_insert_impl( secondary_keys, ts, pessimistic_action, + None, lock_ttl, for_update_ts, txn_size, @@ -103,6 +105,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( secondary_keys: &Option>>, ts: TimeStamp, pessimistic_action: PrewriteRequestPessimisticAction, + expected_for_update_ts: Option, lock_ttl: u64, for_update_ts: TimeStamp, txn_size: u64, @@ -114,7 +117,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( should_not_exist: bool, region_id: Option, txn_source: u64, -) { +) -> TimeStamp { let mut ctx = Context::default(); ctx.set_txn_source(txn_source); if let Some(region_id) = region_id { @@ -144,7 +147,7 @@ pub fn must_prewrite_put_impl_with_should_not_exist( } else { CommitKind::TwoPc }; - prewrite( + let (min_commit_ts, _) = prewrite( &mut txn, &mut reader, &TransactionProperties { @@ -163,9 +166,11 @@ pub fn 
must_prewrite_put_impl_with_should_not_exist( mutation, secondary_keys, pessimistic_action, + expected_for_update_ts, ) .unwrap(); write(engine, &ctx, txn.into_modifies()); + min_commit_ts } pub fn must_prewrite_put( @@ -210,6 +215,7 @@ pub fn must_prewrite_put_on_region( &None, ts.into(), SkipPessimisticCheck, + None, 0, TimeStamp::default(), 0, @@ -240,6 +246,7 @@ pub fn must_prewrite_put_with_txn_soucre( &None, ts.into(), SkipPessimisticCheck, + None, 0, TimeStamp::default(), 0, @@ -415,7 +422,7 @@ pub fn must_pessimistic_prewrite_put_async_commit( for_update_ts: impl Into, pessimistic_action: PrewriteRequestPessimisticAction, min_commit_ts: impl Into, -) { +) -> TimeStamp { assert!(secondary_keys.is_some()); must_prewrite_put_impl( engine, @@ -433,6 +440,38 @@ pub fn must_pessimistic_prewrite_put_async_commit( false, Assertion::None, AssertionLevel::Off, + ) +} + +pub fn must_pessimistic_prewrite_put_check_for_update_ts( + engine: &mut E, + key: &[u8], + value: &[u8], + pk: &[u8], + ts: impl Into, + for_update_ts: impl Into, + expected_for_update_ts: Option, +) { + must_prewrite_put_impl_with_should_not_exist( + engine, + key, + value, + pk, + &None, + ts.into(), + DoPessimisticCheck, + expected_for_update_ts.map(Into::into), + 0, + for_update_ts.into(), + 0, + TimeStamp::default(), + TimeStamp::default(), + false, + Assertion::None, + AssertionLevel::Off, + false, + None, + 0, ); } @@ -485,6 +524,8 @@ pub fn must_prewrite_put_err_impl( ts.into(), for_update_ts.into(), pessimistic_action, + None, + 0, max_commit_ts.into(), is_retry_request, assertion, @@ -516,6 +557,8 @@ pub fn must_prewrite_insert_err_impl( ts.into(), for_update_ts.into(), pessimistic_action, + None, + 0, max_commit_ts.into(), is_retry_request, assertion, @@ -533,6 +576,8 @@ pub fn must_prewrite_put_err_impl_with_should_not_exist( ts: impl Into, for_update_ts: impl Into, pessimistic_action: PrewriteRequestPessimisticAction, + expected_for_update_ts: Option, + min_commit_ts: impl Into, 
max_commit_ts: impl Into, is_retry_request: bool, assertion: Assertion, @@ -559,14 +604,16 @@ pub fn must_prewrite_put_err_impl_with_should_not_exist( props.is_retry_request = is_retry_request; props.commit_kind = commit_kind; props.assertion_level = assertion_level; + props.min_commit_ts = min_commit_ts.into(); prewrite( &mut txn, &mut reader, &props, mutation, - &None, + secondary_keys, pessimistic_action, + expected_for_update_ts, ) .unwrap_err() } @@ -644,6 +691,34 @@ pub fn must_pessimistic_prewrite_insert_err( ) } +pub fn must_pessimistic_prewrite_put_check_for_update_ts_err( + engine: &mut E, + key: &[u8], + value: &[u8], + pk: &[u8], + ts: impl Into, + for_update_ts: impl Into, + expected_for_update_ts: Option, +) -> Error { + must_prewrite_put_err_impl_with_should_not_exist( + engine, + key, + value, + pk, + &None, + ts, + for_update_ts, + DoPessimisticCheck, + expected_for_update_ts.map(Into::into), + 0, + 0, + false, + Assertion::None, + AssertionLevel::Off, + false, + ) +} + pub fn must_retry_pessimistic_prewrite_put_err( engine: &mut E, key: &[u8], @@ -703,6 +778,7 @@ fn must_prewrite_delete_impl( mutation, &None, pessimistic_action, + None, ) .unwrap(); @@ -781,6 +857,7 @@ fn must_prewrite_lock_impl( mutation, &None, pessimistic_action, + None, ) .unwrap(); @@ -817,6 +894,7 @@ pub fn must_prewrite_lock_err( Mutation::make_lock(Key::from_raw(key)), &None, SkipPessimisticCheck, + None, ) .unwrap_err(); } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 5b94ea5bd85..54f5029bd6c 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -197,6 +197,7 @@ impl From for TypedCommand { secondary_keys, req.get_try_one_pc(), req.get_assertion_level(), + req.take_for_update_ts_constraints().into(), req.take_context(), ) } @@ -932,6 +933,28 @@ pub mod test_util { prewrite_command(engine, cm, statistics, cmd) } + pub fn pessimistic_prewrite_check_for_update_ts( + engine: &mut E, + statistics: &mut 
Statistics, + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, + primary: Vec, + start_ts: u64, + for_update_ts: u64, + for_update_ts_constraints: impl IntoIterator, + ) -> Result { + let cmd = PrewritePessimistic::with_for_update_ts_constraints( + mutations, + primary, + start_ts.into(), + for_update_ts.into(), + for_update_ts_constraints + .into_iter() + .map(|(size, ts)| (size, TimeStamp::from(ts))), + ); + let cm = ConcurrencyManager::new(start_ts.into()); + prewrite_command(engine, cm, statistics, cmd) + } + pub fn commit( engine: &mut E, statistics: &mut Statistics, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index fbd4bf5984a..feaa641300f 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -10,7 +10,7 @@ use std::mem; use engine_traits::CF_WRITE; use kvproto::kvrpcpb::{ - AssertionLevel, ExtraOp, + AssertionLevel, ExtraOp, PrewriteRequestForUpdateTsConstraint, PrewriteRequestPessimisticAction::{self, *}, }; use tikv_kv::SnapshotExt; @@ -283,6 +283,8 @@ command! { /// Assertions is a mechanism to check the constraint on the previous version of data /// that must be satisfied as long as data is consistent. assertion_level: AssertionLevel, + /// Constraints on the pessimistic locks that have to be checked when prewriting. 
+ for_update_ts_constraints: Vec, } } @@ -290,7 +292,7 @@ impl std::fmt::Display for PrewritePessimistic { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "kv::command::pessimistic_prewrite mutations({:?}) primary({:?}) secondary_len({:?})@ {} {} {} {} {} {} {:?}| {:?}", + "kv::command::pessimistic_prewrite mutations({:?}) primary({:?}) secondary_len({:?})@ {} {} {} {} {} {} {:?} (for_update_ts constraints: {:?}) | {:?}", self.mutations, log_wrappers::Value::key(self.primary.as_slice()), self.secondary_keys.as_ref().map(|sk| sk.len()), @@ -301,6 +303,7 @@ impl std::fmt::Display for PrewritePessimistic { self.max_commit_ts, self.try_one_pc, self.assertion_level, + self.for_update_ts_constraints, self.ctx, ) } @@ -331,6 +334,7 @@ impl PrewritePessimistic { None, false, AssertionLevel::Off, + vec![], Context::default(), ) } @@ -355,19 +359,62 @@ impl PrewritePessimistic { None, true, AssertionLevel::Off, + vec![], Context::default(), ) } - fn into_prewriter(self) -> Prewriter { - Prewriter { + #[cfg(test)] + pub fn with_for_update_ts_constraints( + mutations: Vec<(Mutation, PrewriteRequestPessimisticAction)>, + primary: Vec, + start_ts: TimeStamp, + for_update_ts: TimeStamp, + for_update_ts_constraints: impl IntoIterator, + ) -> TypedCommand { + PrewritePessimistic::new( + mutations, + primary, + start_ts, + 0, + for_update_ts, + 0, + TimeStamp::default(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + for_update_ts_constraints + .into_iter() + .map(|(index, expected_for_update_ts)| { + let mut constraint = PrewriteRequestForUpdateTsConstraint::default(); + constraint.set_index(index as u32); + constraint.set_expected_for_update_ts(expected_for_update_ts.into_inner()); + constraint + }) + .collect(), + Context::default(), + ) + } + + fn into_prewriter(self) -> Result> { + let mut mutations: Vec = + self.mutations.into_iter().map(Into::into).collect(); + for item in self.for_update_ts_constraints { + let index = 
item.index as usize; + if index >= mutations.len() { + return Err(ErrorInner::Other(box_err!("prewrite request invalid: for_update_ts constraint set for index {} while {} mutations were given", index, mutations.len())).into()); + } + mutations[index].expected_for_update_ts = Some(item.expected_for_update_ts.into()); + } + Ok(Prewriter { kind: Pessimistic { for_update_ts: self.for_update_ts, }, start_ts: self.start_ts, txn_size: self.txn_size, primary: self.primary, - mutations: self.mutations, + mutations, try_one_pc: self.try_one_pc, secondary_keys: self.secondary_keys, @@ -379,7 +426,7 @@ impl PrewritePessimistic { ctx: self.ctx, old_values: OldValues::default(), - } + }) } } @@ -392,7 +439,7 @@ impl CommandExt for PrewritePessimistic { fn write_bytes(&self) -> usize { let mut bytes = 0; for (m, _) in &self.mutations { - match *m { + match m { Mutation::Put((ref key, ref value), _) | Mutation::Insert((ref key, ref value), _) => { bytes += key.as_encoded().len(); @@ -412,7 +459,7 @@ impl CommandExt for PrewritePessimistic { impl WriteCommand for PrewritePessimistic { fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { - self.into_prewriter().process_write(snapshot, context) + self.into_prewriter()?.process_write(snapshot, context) } } @@ -556,6 +603,7 @@ impl Prewriter { for m in mem::take(&mut self.mutations) { let pessimistic_action = m.pessimistic_action(); + let expected_for_update_ts = m.pessimistic_expected_for_update_ts(); let m = m.into_mutation(); let key = m.key().clone(); let mutation_type = m.mutation_type(); @@ -566,7 +614,15 @@ impl Prewriter { } let need_min_commit_ts = secondaries.is_some() || self.try_one_pc; - let prewrite_result = prewrite(txn, reader, &props, m, secondaries, pessimistic_action); + let prewrite_result = prewrite( + txn, + reader, + &props, + m, + secondaries, + pessimistic_action, + expected_for_update_ts, + ); match prewrite_result { Ok((ts, old_value)) if !(need_min_commit_ts && ts.is_zero()) => { if 
need_min_commit_ts && final_min_commit_ts < ts { @@ -791,7 +847,7 @@ struct Pessimistic { } impl PrewriteKind for Pessimistic { - type Mutation = (Mutation, PrewriteRequestPessimisticAction); + type Mutation = PessimisticMutation; fn txn_kind(&self) -> TransactionKind { TransactionKind::Pessimistic(self.for_update_ts) @@ -801,11 +857,11 @@ impl PrewriteKind for Pessimistic { /// The type of mutation and, optionally, its extra information, differing for /// the optimistic and pessimistic transaction. /// For optimistic txns, this is `Mutation`. -/// For pessimistic txns, this is `(Mutation, PessimisticAction)`, where the -/// action indicates what kind of operations(checks) need to be performed. -/// The action also implies the type of the lock status. +/// For pessimistic txns, this is `PessimisticMutation` which contains a +/// `Mutation` and some other extra information necessary for pessimistic txns. trait MutationLock { fn pessimistic_action(&self) -> PrewriteRequestPessimisticAction; + fn pessimistic_expected_for_update_ts(&self) -> Option; fn into_mutation(self) -> Mutation; } @@ -814,18 +870,55 @@ impl MutationLock for Mutation { SkipPessimisticCheck } + fn pessimistic_expected_for_update_ts(&self) -> Option { + None + } + fn into_mutation(self) -> Mutation { self } } -impl MutationLock for (Mutation, PrewriteRequestPessimisticAction) { +#[derive(Debug)] +pub struct PessimisticMutation { + pub mutation: Mutation, + /// Indicates what kind of operations(checks) need to be performed, and also + /// implies the type of the lock status. + pub pessimistic_action: PrewriteRequestPessimisticAction, + /// Specifies whether it needs to check the `for_update_ts` field in the + /// pessimistic lock during prewrite. If any, the check only passes if the + /// `for_update_ts` field in pessimistic lock is not greater than the + /// expected value. 
+ pub expected_for_update_ts: Option, +} + +impl MutationLock for PessimisticMutation { fn pessimistic_action(&self) -> PrewriteRequestPessimisticAction { - self.1 + self.pessimistic_action + } + + fn pessimistic_expected_for_update_ts(&self) -> Option { + self.expected_for_update_ts } fn into_mutation(self) -> Mutation { - self.0 + self.mutation + } +} + +impl PessimisticMutation { + pub fn new(mutation: Mutation, pessimistic_action: PrewriteRequestPessimisticAction) -> Self { + Self { + mutation, + pessimistic_action, + expected_for_update_ts: None, + } + } +} + +impl From<(Mutation, PrewriteRequestPessimisticAction)> for PessimisticMutation { + fn from(value: (Mutation, PrewriteRequestPessimisticAction)) -> Self { + PessimisticMutation::new(value.0, value.1) } } @@ -901,8 +994,8 @@ mod tests { commands::{ check_txn_status::tests::must_success as must_check_txn_status, test_util::{ - commit, pessimistic_prewrite_with_cm, prewrite, prewrite_command, - prewrite_with_cm, rollback, + commit, pessimistic_prewrite_check_for_update_ts, pessimistic_prewrite_with_cm, + prewrite, prewrite_command, prewrite_with_cm, rollback, }, }, tests::{ @@ -1451,6 +1544,7 @@ mod tests { Some(vec![]), false, AssertionLevel::Off, + vec![], Context::default(), ); @@ -1491,6 +1585,7 @@ mod tests { Some(vec![k2.to_vec()]), false, AssertionLevel::Off, + vec![], Context::default(), ); @@ -1697,6 +1792,7 @@ mod tests { secondary_keys, case.one_pc, AssertionLevel::Off, + vec![], Context::default(), ) } else { @@ -1937,6 +2033,7 @@ mod tests { Some(vec![]), false, AssertionLevel::Off, + vec![], Context::default(), ); let context = WriteContext { @@ -2076,6 +2173,7 @@ mod tests { secondary_keys, false, AssertionLevel::Off, + vec![], ctx, ); prewrite_command(engine, cm.clone(), statistics, cmd) @@ -2546,6 +2644,7 @@ mod tests { Some(vec![]), false, AssertionLevel::Off, + vec![], Context::default(), ); let res = prewrite_command(&mut engine, cm, &mut statistics, cmd).unwrap(); @@ -2736,4 +2835,132 
@@ mod tests { assert_eq!(write.last_change_ts, TimeStamp::zero()); assert_eq!(write.versions_to_last_change, 0); } + + #[test] + fn test_pessimistic_prewrite_check_for_update_ts() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut statistics = Statistics::default(); + + let k1 = b"k1"; + let k2 = b"k2"; + let k3 = b"k3"; + + // In actual cases these kinds of pessimistic locks should be locked in + // `allow_locking_with_conflict` mode. For simplicity, we pass a large + // for_update_ts to the pessimistic lock to simulate that case. + must_acquire_pessimistic_lock(&mut engine, k1, k1, 10, 10); + must_acquire_pessimistic_lock(&mut engine, k2, k1, 10, 20); + must_acquire_pessimistic_lock(&mut engine, k3, k1, 10, 20); + + let check_lock_unchanged = |engine: &mut _| { + must_pessimistic_locked(engine, k1, 10, 10); + must_pessimistic_locked(engine, k2, 10, 20); + must_pessimistic_locked(engine, k3, 10, 20); + }; + + let must_be_pessimistic_lock_not_found = |e| match e { + Error(box ErrorInner::Mvcc(MvccError( + box MvccErrorInner::PessimisticLockNotFound { .. 
}, + ))) => (), + e => panic!( + "error type not match: expected PessimisticLockNotFound, got {:?}", + e + ), + }; + + let mutations = vec![ + ( + Mutation::make_put(Key::from_raw(k1), b"v1".to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(k2), b"v2".to_vec()), + DoPessimisticCheck, + ), + ( + Mutation::make_put(Key::from_raw(k3), b"v3".to_vec()), + DoPessimisticCheck, + ), + ]; + + let e = pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations.clone(), + k1.to_vec(), + 10, + 15, + vec![(1, 15)], + ) + .unwrap_err(); + must_be_pessimistic_lock_not_found(e); + check_lock_unchanged(&mut engine); + + let e = pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations.clone(), + k1.to_vec(), + 10, + 15, + vec![(0, 15), (1, 15), (2, 15)], + ) + .unwrap_err(); + must_be_pessimistic_lock_not_found(e); + check_lock_unchanged(&mut engine); + + let e = pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations.clone(), + k1.to_vec(), + 10, + 15, + vec![(2, 15), (0, 20)], + ) + .unwrap_err(); + must_be_pessimistic_lock_not_found(e); + check_lock_unchanged(&mut engine); + + // lock.for_update_ts < expected is disallowed too. + let e = pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations.clone(), + k1.to_vec(), + 10, + 15, + vec![(0, 15), (2, 20)], + ) + .unwrap_err(); + must_be_pessimistic_lock_not_found(e); + check_lock_unchanged(&mut engine); + + // Index out of bound (invalid request). 
+ pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations.clone(), + k1.to_vec(), + 10, + 15, + vec![(3, 30)], + ) + .unwrap_err(); + check_lock_unchanged(&mut engine); + + pessimistic_prewrite_check_for_update_ts( + &mut engine, + &mut statistics, + mutations, + k1.to_vec(), + 10, + 15, + vec![(0, 10), (2, 20)], + ) + .unwrap(); + must_locked(&mut engine, k1, 10); + must_locked(&mut engine, k2, 10); + must_locked(&mut engine, k3, 10); + } } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index f43e309f503..640c534fc86 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -241,6 +241,7 @@ pub mod tests { must_err as must_acquire_pessimistic_lock_err, must_err_return_value as must_acquire_pessimistic_lock_return_value_err, must_pessimistic_locked, must_succeed as must_acquire_pessimistic_lock, + must_succeed_allow_lock_with_conflict as must_acquire_pessimistic_lock_allow_lock_with_conflict, must_succeed_for_large_txn as must_acquire_pessimistic_lock_for_large_txn, must_succeed_impl as must_acquire_pessimistic_lock_impl, must_succeed_return_value as must_acquire_pessimistic_lock_return_value, diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index 46879d38e9f..1b4a7d5624c 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -731,6 +731,7 @@ mod tests { Mutation::make_put(Key::from_raw(key), key.to_vec()), &None, SkipPessimisticCheck, + None, ) .unwrap(); } diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index 7a79b984aaf..92dacfe6dc9 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -56,6 +56,7 @@ where Mutation::make_put(Key::from_raw(k), v.clone()), &None, SkipPessimisticCheck, + None, ) .unwrap(); } @@ -107,6 +108,7 @@ fn mvcc_prewrite>(b: &mut Bencher<'_>, config: &B mutation, &None, SkipPessimisticCheck, + None, ) .unwrap(); } diff --git a/tests/benches/hierarchy/txn/mod.rs 
b/tests/benches/hierarchy/txn/mod.rs index 404266e2c6f..1a4d047562d 100644 --- a/tests/benches/hierarchy/txn/mod.rs +++ b/tests/benches/hierarchy/txn/mod.rs @@ -52,6 +52,7 @@ where Mutation::make_put(Key::from_raw(k), v.clone()), &None, SkipPessimisticCheck, + None, ) .unwrap(); } @@ -100,6 +101,7 @@ fn txn_prewrite>(b: &mut Bencher<'_>, config: &Be mutation, &None, SkipPessimisticCheck, + None, ) .unwrap(); let write_data = WriteData::from_modifies(txn.into_modifies()); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 42cda54281e..ba6339b666d 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -408,6 +408,7 @@ fn test_pipelined_pessimistic_lock() { None, false, AssertionLevel::Off, + vec![], Context::default(), ), expect_ok_callback(tx.clone(), 0), @@ -759,6 +760,7 @@ fn test_async_commit_prewrite_with_stale_max_ts_impl() { Some(vec![b"xk2".to_vec()]), false, AssertionLevel::Off, + vec![], ctx.clone(), ), Box::new(move |res: storage::Result<_>| { @@ -898,6 +900,7 @@ fn test_async_apply_prewrite_impl( secondaries, false, AssertionLevel::Off, + vec![], ctx.clone(), ), Box::new(move |r| tx.send(r).unwrap()), @@ -1232,6 +1235,7 @@ fn test_async_apply_prewrite_1pc_impl( None, true, AssertionLevel::Off, + vec![], ctx.clone(), ), Box::new(move |r| tx.send(r).unwrap()), From 61380e35cad161576342ea895b63a99dfabb97da Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 12 Apr 2023 20:31:01 +0800 Subject: [PATCH 628/676] resource_control: fix virtual time overflow (#14509) close tikv/tikv#14507 Signed-off-by: glorv Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/resource_control/Cargo.toml | 3 + .../resource_control/src/resource_group.rs | 203 ++++++++++++++++-- 3 files changed, 194 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b654e34fb77..10d3a7f37eb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4772,6 +4772,7 @@ dependencies = [ 
"pin-project", "prometheus", "protobuf", + "rand 0.8.5", "serde", "slog", "slog-global", diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 9a488b06d77..ec13d9cdbdb 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -30,3 +30,6 @@ test_pd = { workspace = true } test_pd_client = { workspace = true } tikv_util = { workspace = true } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } + +[dev-dependencies] +rand = "0.8" diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 0b0f24e8f62..7435fc17d01 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cell::Cell, cmp::{max, min}, sync::{ atomic::{AtomicU64, Ordering}, @@ -11,12 +12,13 @@ use std::{ use collections::HashMap; use dashmap::{mapref::one::Ref, DashMap}; +use fail::fail_point; use kvproto::{ kvrpcpb::CommandPri, resource_manager::{GroupMode, ResourceGroup}, }; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; -use tikv_util::info; +use tikv_util::{info, time::Instant}; use yatp::queue::priority::TaskPriorityProvider; // a read task cost at least 50us. @@ -27,6 +29,8 @@ const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); /// default resource group name const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; +/// default value of max RU quota. +const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; /// The maximum RU quota that can be configured. const MAX_RU_QUOTA: u64 = i32::MAX as u64; @@ -152,12 +156,26 @@ pub struct ResourceController { // increase the real cost after task is executed; but don't increase it at write because // the cost is known so we just pre-consume it. 
is_read: bool, + // Track the maximum ru quota used to calculate the factor of each resource group. + // factor = max_ru_quota / group_ru_quota * 10.0 + // We use mutex here to ensure when we need to change this value and do adjust all resource + // groups' factors, it can't be changed concurrently. + // NOTE: becuase the ru config for "default" group is very large and it can cause very big + // group weight, we will not count this value by default. + max_ru_quota: Mutex, // record consumption of each resource group, name --> resource_group resource_consumptions: RwLock, GroupPriorityTracker>>, - + // the latest min vt, this value is used to init new added group vt last_min_vt: AtomicU64, + // the last time min vt is overflow + last_rest_vt_time: Cell, } +// we are ensure to visit the `last_rest_vt_time` by only 1 thread so it's +// thread safe. +unsafe impl Send for ResourceController {} +unsafe impl Sync for ResourceController {} + impl ResourceController { pub fn new(name: String, is_read: bool) -> Self { let controller = Self { @@ -165,6 +183,8 @@ impl ResourceController { is_read, resource_consumptions: RwLock::new(HashMap::default()), last_min_vt: AtomicU64::new(0), + max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), + last_rest_vt_time: Cell::new(Instant::now_coarse()), }; // add the "default" resource group controller.add_resource_group( @@ -175,24 +195,39 @@ impl ResourceController { controller } - fn calculate_factor(mut quota: u64) -> u64 { - quota = min(quota, MAX_RU_QUOTA); - if quota > 0 { - // the maxinum ru quota is very big, so the precision lost due to - // integer division is very small. 
- MAX_RU_QUOTA / quota - } else { + fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + // we don't adjust the max_quota if it's the "default" group's default + // value(u32::MAX), so here it is possible that the quota is bigger than + // the max quota + if quota == 0 || quota > max_quota { 1 + } else { + // we use max_quota / quota as the resource group factor, but because we need to + // cast the value to integer, so we times it by 10 to ensure the accuracy is + // enough. + let max_quota = min(max_quota * 10, MAX_RU_QUOTA); + (max_quota as f64 / quota as f64).round() as u64 } } - fn add_resource_group(&self, name: Vec, ru_quota: u64, mut group_priority: u32) { + fn add_resource_group(&self, name: Vec, mut ru_quota: u64, mut group_priority: u32) { if group_priority == 0 { // map 0 to medium priority(default priority) group_priority = MEDIUM_PRIORITY; } + if ru_quota > MAX_RU_QUOTA { + ru_quota = MAX_RU_QUOTA; + } - let weight = Self::calculate_factor(ru_quota); + let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); + // skip to adjust max ru if it is the "default" group and the ru config eq + // MAX_RU_QUOTA + if ru_quota > *max_ru_quota && (name != "default".as_bytes() || ru_quota < MAX_RU_QUOTA) { + *max_ru_quota = ru_quota; + // adjust all group weight because the current value is too small. 
+ self.adjust_all_resource_group_factors(ru_quota); + } + let weight = Self::calculate_factor(*max_ru_quota, ru_quota); let vt_delta_for_get = if self.is_read { DEFAULT_PRIORITY_PER_READ_TASK * weight @@ -200,6 +235,7 @@ impl ResourceController { 0 }; let group = GroupPriorityTracker { + ru_quota, group_priority, weight, virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), @@ -210,6 +246,20 @@ impl ResourceController { self.resource_consumptions.write().insert(name, group); } + // we calculate the weight of each resource group based on the currently maximum + // ru quota, if a incoming resource group has a bigger quota, we need to + // adjust all the existing groups. As we expect this won't happen very + // often, and iterate 10k entry cost less than 5ms, so the performance is + // acceptable. + fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { + self.resource_consumptions + .write() + .iter_mut() + .for_each(|(_, tracker)| { + tracker.weight = Self::calculate_factor(max_ru_quota, tracker.ru_quota); + }); + } + fn remove_resource_group(&self, name: &[u8]) { // do not remove the default resource group, reset to default setting instead. if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { @@ -240,6 +290,7 @@ impl ResourceController { } pub fn update_min_virtual_time(&self) { + let start = Instant::now_coarse(); let mut min_vt = u64::MAX; let mut max_vt = 0; self.resource_consumptions @@ -257,6 +308,8 @@ impl ResourceController { return; } + fail_point!("increase_vt_duration_update_min_vt"); + let near_overflow = min_vt > RESET_VT_THRESHOLD; self.resource_consumptions .read() @@ -267,15 +320,19 @@ impl ResourceController { // but it should be ok as this operation should be extremely rare // and the impact is not big. if near_overflow { - tracker.decrease_vt(RESET_VT_THRESHOLD - (max_vt - vt) / 2); + tracker.decrease_vt(RESET_VT_THRESHOLD); } else if vt < max_vt { // TODO: is increase by half is a good choice. 
tracker.increase_vt((max_vt - vt) / 2); } }); if near_overflow { - info!("all reset groups' virtual time are near overflow, do reset"); + let end = Instant::now_coarse(); + info!("all resource groups' virtual time are near overflow, do reset"; + "min" => min_vt, "max" => max_vt, "dur" => ?end.duration_since(start), + "reset_dur" => ?end.duration_since(self.last_rest_vt_time.get())); max_vt -= RESET_VT_THRESHOLD; + self.last_rest_vt_time.set(end); } // max_vt is actually a little bigger than the current min vt, but we don't // need totally accurate here. @@ -309,6 +366,8 @@ fn concat_priority_vt(group_priority: u32, vt: u64) -> u64 { } struct GroupPriorityTracker { + // the ru setting of this group. + ru_quota: u64, group_priority: u32, weight: u64, virtual_time: AtomicU64, @@ -357,6 +416,7 @@ impl GroupPriorityTracker { #[cfg(test)] pub(crate) mod tests { + use rand::{thread_rng, RngCore}; use yatp::queue::Extras; use super::*; @@ -560,6 +620,123 @@ pub(crate) mod tests { assert_eq!(resource_ctl.last_min_vt.load(Ordering::Relaxed), g2_vt); } + #[test] + fn test_adjust_resource_group_weight() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + let group1 = new_resource_group_ru("test1".into(), 5000, 0); + resource_manager.add_resource_group(group1); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 20 + ); + + // add a resource group with big ru + let group1 = new_resource_group_ru("test2".into(), 50000, 0); + resource_manager.add_resource_group(group1); + assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); + assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); + // 
resource_ctl_write should be unchanged. + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write.resource_group("test2".as_bytes()).weight, + 10 + ); + + // add the default "default" group, the ru weight should not change. + // add a resource group with big ru + let group = new_resource_group_ru("default".into(), u32::MAX as u64, 0); + resource_manager.add_resource_group(group); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write + .resource_group("default".as_bytes()) + .weight, + 1 + ); + + // change the default group to another value, it can impact the ru then. + let group = new_resource_group_ru("default".into(), 100000, 0); + resource_manager.add_resource_group(group); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 200 + ); + assert_eq!( + resource_ctl_write + .resource_group("default".as_bytes()) + .weight, + 10 + ); + } + + #[test] + fn test_reset_resource_group_vt_overflow() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_write".into(), false); + let mut rng = thread_rng(); + + let mut min_delta = u64::MAX; + let mut max_delta = 0; + for i in 0..10 { + let name = format!("g{}", i); + let g = new_resource_group_ru(name.clone(), 100, 1); + resource_manager.add_resource_group(g); + let delta = rng.next_u64() % 10000 + 1; + min_delta = delta.min(min_delta); + max_delta = delta.max(max_delta); + resource_ctl + .resource_group(name.as_bytes()) + .increase_vt(RESET_VT_THRESHOLD + delta); + } + resource_ctl + .resource_group("default".as_bytes()) + .increase_vt(RESET_VT_THRESHOLD + 1); + + let old_max_vt = resource_ctl + .resource_consumptions + .read() + .iter() + .fold(0, |v, (_, g)| v.max(g.current_vt())); + let resource_ctl_cloned = 
resource_ctl.clone(); + fail::cfg_callback("increase_vt_duration_update_min_vt", move || { + resource_ctl_cloned + .resource_consumptions + .read() + .iter() + .enumerate() + .for_each(|(i, (_, tracker))| { + if i % 2 == 0 { + tracker.increase_vt(max_delta - min_delta); + } + }); + }) + .unwrap(); + resource_ctl.update_min_virtual_time(); + fail::remove("increase_vt_duration_update_min_vt"); + + let new_max_vt = resource_ctl + .resource_consumptions + .read() + .iter() + .fold(0, |v, (_, g)| v.max(g.current_vt())); + // check all vt has decreased by RESET_VT_THRESHOLD. + assert!(new_max_vt < max_delta * 2); + // check fail-point takes effect, the `new_max_vt` has increased. + assert!(old_max_vt - RESET_VT_THRESHOLD < new_max_vt); + } + #[test] fn test_retain_resource_groups() { let resource_manager = ResourceGroupManager::default(); From 515bebb405788442a56296429508e01e8d4bcb11 Mon Sep 17 00:00:00 2001 From: ekexium Date: Wed, 12 Apr 2023 21:35:01 +0800 Subject: [PATCH 629/676] txn: return duration_to_last_update when lock wait timeout (#14499) close tikv/tikv#14497 When a timeout occurs during waiting for a lock, provide the duration of time that has passed since the last update of the lock wait. Let the client decide whether it is necessary to resolve locks based on this info. 
Signed-off-by: ekexium Co-authored-by: Ti Chi Robot --- src/server/lock_manager/waiter_manager.rs | 95 +++++++++++++++++++++-- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index d8271998653..5f433571431 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -203,6 +203,7 @@ pub(crate) struct Waiter { pub diag_ctx: DiagnosticContext, delay: Delay, start_waiting_time: Instant, + last_updated_time: Option, } impl Waiter { @@ -224,6 +225,7 @@ impl Waiter { delay: Delay::new(deadline), diag_ctx, start_waiting_time, + last_updated_time: None, } } @@ -264,8 +266,13 @@ impl Waiter { self.cancel(None) } - fn cancel_for_timeout(self, _skip_resolving_lock: bool) -> KeyLockWaitInfo { - let lock_info = self.wait_info.lock_info.clone(); + fn cancel_for_timeout(self) -> KeyLockWaitInfo { + let mut lock_info = self.wait_info.lock_info.clone(); + lock_info.set_duration_to_last_update_ms( + self.last_updated_time + .map(|t| t.elapsed().as_millis() as u64) + .unwrap_or_default(), + ); // lock_info.set_skip_resolving_lock(skip_resolving_lock); let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); self.cancel(Some(StorageError::from(TxnError::from(error)))) @@ -343,8 +350,10 @@ impl WaitTable { fn update_waiter( &mut self, update_event: &UpdateWaitForEvent, + now: Instant, ) -> Option<(KeyLockWaitInfo, DiagnosticContext)> { let waiter = self.waiter_pool.get_mut(&update_event.token)?; + waiter.last_updated_time = Some(now); assert_eq!(waiter.wait_info.key, update_event.wait_info.key); @@ -511,7 +520,7 @@ impl WaiterManager { let mut wait_table = wait_table.borrow_mut(); if let Some(waiter) = wait_table.take_waiter(token) { let start_ts = waiter.start_ts; - let wait_info = waiter.cancel_for_timeout(false); + let wait_info = waiter.cancel_for_timeout(); detector_scheduler.clean_up_wait_for(start_ts, wait_info); } }); @@ 
-537,8 +546,9 @@ impl WaiterManager { fn handle_update_wait_for(&mut self, events: Vec) { let mut wait_table = self.wait_table.borrow_mut(); + let now = Instant::now(); for event in events { - let previous_wait_info = wait_table.update_waiter(&event); + let previous_wait_info = wait_table.update_waiter(&event, now); if event.is_first_lock { continue; @@ -647,7 +657,7 @@ impl FutureRunnable for WaiterManager { #[cfg(test)] pub mod tests { - use std::{sync::mpsc, time::Duration}; + use std::{sync::mpsc, thread::sleep, time::Duration}; use futures::{executor::block_on, future::FutureExt}; use kvproto::kvrpcpb::LockInfo; @@ -673,6 +683,7 @@ pub mod tests { diag_ctx: DiagnosticContext::default(), delay: Delay::new(Instant::now()), start_waiting_time: Instant::now(), + last_updated_time: None, } } @@ -869,7 +880,7 @@ pub mod tests { #[test] fn test_waiter_notify() { let (waiter, lock_info, f) = new_test_waiter(10.into(), 20.into(), 20); - waiter.cancel_for_timeout(false); + waiter.cancel_for_timeout(); expect_key_is_locked(block_on(f).unwrap(), lock_info); // Deadlock @@ -902,7 +913,7 @@ pub mod tests { waiter.reset_timeout(Instant::now() + Duration::from_millis(100)); let (tx, rx) = mpsc::sync_channel(1); let f = waiter.on_timeout(move || tx.send(1).unwrap()); - waiter.cancel_for_timeout(false); + waiter.cancel_for_timeout(); assert_elapsed(|| block_on(f), 0, 200); rx.try_recv().unwrap_err(); } @@ -1140,4 +1151,74 @@ pub mod tests { ); worker.stop().unwrap(); } + + #[test] + fn test_duration_to_last_update() { + let (mut worker, scheduler) = start_waiter_manager(1000, 100); + let key = Key::from_raw(b"foo"); + let (waiter_ts, lock) = ( + 10.into(), + LockDigest { + ts: 20.into(), + hash: key.gen_hash(), + }, + ); + // waiter1 is updated when waiting, while waiter2(f2) is not. + let (waiter1, ..) 
= new_test_waiter_with_key(waiter_ts, lock.ts, &key.to_raw().unwrap()); + let (waiter2, _, f2) = new_test_waiter_with_key(100.into(), 100.into(), "foo".as_bytes()); + scheduler.wait_for( + LockWaitToken(Some(1)), + 1, + RegionEpoch::default(), + 1, + waiter1.start_ts, + waiter1.wait_info, + WaitTimeout::Millis(1000), + waiter1.cancel_callback, + DiagnosticContext::default(), + ); + scheduler.wait_for( + LockWaitToken(Some(2)), + 1, + RegionEpoch::default(), + 1, + waiter2.start_ts, + waiter2.wait_info, + WaitTimeout::Millis(1000), + waiter2.cancel_callback, + DiagnosticContext::default(), + ); + + // then update waiter + sleep(Duration::from_millis(500)); + let event = UpdateWaitForEvent { + token: LockWaitToken(Some(1)), + start_ts: waiter1.start_ts, + is_first_lock: false, + wait_info: KeyLockWaitInfo { + key: key.clone(), + lock_digest: Default::default(), + lock_info: LockInfo { + key: key.to_raw().unwrap(), + ..Default::default() + }, + }, + }; + scheduler.update_wait_for(vec![event]); + + assert_elapsed( + || match block_on(f2).unwrap() { + StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + MvccError(box MvccErrorInner::KeyIsLocked(res)), + )))) => { + assert_eq!(res.duration_to_last_update_ms, 0); + } + e => panic!("unexpected error: {:?}", e), + }, + 400, + 600, + ); + + worker.stop().unwrap(); + } } From b9ca84e61ae8ad1794fec440bb20cc00c6ae7912 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 14 Apr 2023 15:45:01 +0800 Subject: [PATCH 630/676] raftstore-v2: implement CaptureChange and LeaderCallback (#14558) ref tikv/tikv#14542 raftstore-v2: implement CaptureChange and LeaderCallback Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/apply.rs | 4 + components/raftstore-v2/src/fsm/peer.rs | 2 + .../src/operation/query/capture.rs | 124 ++++++++++++++++++ .../raftstore-v2/src/operation/query/mod.rs | 1 + components/raftstore-v2/src/raft/apply.rs | 22 +++- 
components/raftstore-v2/src/router/imp.rs | 74 +++++++++-- .../src/router/internal_message.rs | 2 + components/raftstore-v2/src/router/message.rs | 18 ++- components/raftstore-v2/src/router/mod.rs | 6 +- .../src/router/response_channel.rs | 41 ++++++ components/raftstore/src/store/fsm/apply.rs | 8 +- components/raftstore/src/store/fsm/mod.rs | 4 +- components/raftstore/src/store/msg.rs | 2 +- 13 files changed, 284 insertions(+), 24 deletions(-) create mode 100644 components/raftstore-v2/src/operation/query/capture.rs diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 2afd8fbf773..6c0989e72ae 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -48,6 +48,7 @@ impl, S: FsmScheduler> ApplyResReporter for M } /// Schedule task to `ApplyFsm`. +#[derive(Clone)] pub struct ApplyScheduler { sender: Sender, } @@ -136,6 +137,9 @@ impl ApplyFsm { ApplyTask::RefreshBucketStat(bucket_meta) => { self.apply.on_refresh_buckets(bucket_meta) } + ApplyTask::CaptureApply(capture_change) => { + self.apply.on_capture_apply(capture_change) + } } self.apply.maybe_flush().await; diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 5e10aa0ef72..d2506d0dd21 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -344,6 +344,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_redirect_catch_up_logs(self.store_ctx, c), PeerMsg::CatchUpLogs(c) => self.fsm.peer_mut().on_catch_up_logs(self.store_ctx, c), + PeerMsg::CaptureChange(capture_change) => self.on_capture_change(capture_change), + PeerMsg::LeaderCallback(ch) => self.on_leader_callback(ch), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/operation/query/capture.rs 
b/components/raftstore-v2/src/operation/query/capture.rs new file mode 100644 index 00000000000..03014644261 --- /dev/null +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -0,0 +1,124 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::raft_cmdpb::RaftCmdResponse; +use raftstore::{ + coprocessor::ObserveHandle, + store::{ + cmd_resp, + fsm::{ + apply::{notify_stale_req_with_msg, ObserverType}, + new_read_index_request, ChangeObserver, + }, + msg::ErrorCallback, + util::compare_region_epoch, + RegionSnapshot, + }, +}; + +use crate::{ + fsm::{ApplyResReporter, PeerFsmDelegate}, + raft::Apply, + router::{message::CaptureChange, ApplyTask, QueryResChannel, QueryResult}, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> + PeerFsmDelegate<'a, EK, ER, T> +{ + pub fn on_leader_callback(&mut self, ch: QueryResChannel) { + let peer = self.fsm.peer(); + let msg = new_read_index_request( + peer.region_id(), + peer.region().get_region_epoch().clone(), + peer.peer().clone(), + ); + self.on_query(msg, ch); + } + + pub fn on_capture_change(&mut self, capture_change: CaptureChange) { + fail_point!("raft_on_capture_change"); + + // TODO: Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. 
+ + let apply_router = self.fsm.peer().apply_scheduler().unwrap().clone(); + let (ch, _) = QueryResChannel::with_callback(Box::new(move |res| { + if let QueryResult::Response(resp) = res && resp.get_header().has_error() { + // Return error + capture_change.snap_cb.report_error(resp.clone()); + return; + } + apply_router.send(ApplyTask::CaptureApply(capture_change)) + })); + self.on_leader_callback(ch); + } +} + +impl Apply { + pub fn on_capture_apply(&mut self, capture_change: CaptureChange) { + let CaptureChange { + observer, + region_epoch, + snap_cb, + } = capture_change; + let ChangeObserver { region_id, ty } = observer; + + let is_stale_cmd = match ty { + ObserverType::Cdc(ObserveHandle { id, .. }) => self.observe_info_mut().cdc_id.id > id, + ObserverType::Rts(ObserveHandle { id, .. }) => self.observe_info_mut().rts_id.id > id, + ObserverType::Pitr(ObserveHandle { id, .. }) => self.observe_info_mut().pitr_id.id > id, + }; + if is_stale_cmd { + notify_stale_req_with_msg( + self.term(), + format!( + "stale observe id {:?}, current id: {:?}", + ty.handle().id, + self.observe_info_mut().pitr_id.id + ), + snap_cb, + ); + return; + } + + assert_eq!(self.region_id(), region_id); + let snapshot = match compare_region_epoch( + ®ion_epoch, + self.region(), + false, // check_conf_ver + true, // check_ver + true, // include_region + ) { + Ok(()) => { + // Commit the writebatch for ensuring the following snapshot can get all + // previous writes. 
+ self.flush(); + RegionSnapshot::from_snapshot( + Arc::new(self.tablet().snapshot()), + Arc::new(self.region().clone()), + ) + } + Err(e) => { + // Return error if epoch not match + snap_cb.report_error(cmd_resp::new_error(e)); + return; + } + }; + + match ty { + ObserverType::Cdc(id) => { + self.observe_info_mut().cdc_id = id; + } + ObserverType::Rts(id) => { + self.observe_info_mut().rts_id = id; + } + ObserverType::Pitr(id) => { + self.observe_info_mut().pitr_id = id; + } + } + snap_cb.set_result((RaftCmdResponse::default(), Some(Box::new(snapshot)))); + } +} diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 55bc100dec2..81fb4e5e9de 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -42,6 +42,7 @@ use crate::{ }, }; +mod capture; mod lease; mod local; mod replica; diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 5e7c7e84f84..d5ecb8c3026 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -7,9 +7,12 @@ use engine_traits::{ }; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; -use raftstore::store::{ - fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, - Config, ReadTask, +use raftstore::{ + coprocessor::CmdObserveInfo, + store::{ + fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, + Config, ReadTask, + }, }; use slog::Logger; use sst_importer::SstImporter; @@ -59,6 +62,8 @@ pub struct Apply { res_reporter: R, read_scheduler: Scheduler>, sst_importer: Arc, + observe_info: CmdObserveInfo, + pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, pub(crate) buckets: Option, @@ -110,6 +115,7 @@ impl Apply { metrics: ApplyMetrics::default(), buckets, sst_importer, + observe_info: CmdObserveInfo::default(), logger, } } @@ -269,4 +275,14 @@ impl Apply 
{ pub fn sst_importer(&self) -> &SstImporter { &self.sst_importer } + + #[inline] + pub fn observe_info_mut(&mut self) -> &mut CmdObserveInfo { + &mut self.observe_info + } + + #[inline] + pub fn term(&self) -> u64 { + self.applied_term + } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 9bffe2b7983..b28dc95aa35 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -5,10 +5,11 @@ use std::{ sync::{Arc, Mutex}, }; -use crossbeam::channel::TrySendError; +use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ + kvrpcpb::ExtraOp, metapb::RegionEpoch, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, @@ -21,7 +22,7 @@ use raftstore::{ }; use slog::warn; -use super::PeerMsg; +use super::{build_any_channel, message::CaptureChange, PeerMsg, QueryResChannel, QueryResult}; use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; impl AsyncReadNotifier for StoreRouter { @@ -179,19 +180,72 @@ impl RaftRouter { impl CdcHandle for RaftRouter { fn capture_change( &self, - _region_id: u64, - _region_epoch: RegionEpoch, - _change_observer: ChangeObserver, - _callback: Callback, + region_id: u64, + region_epoch: RegionEpoch, + observer: ChangeObserver, + callback: Callback, ) -> crate::Result<()> { - unimplemented!() + let (snap_cb, _) = build_any_channel(Box::new(move |args| { + let (resp, snap) = (&args.0, args.1.take()); + if let Some(snap) = snap { + let snapshot: RegionSnapshot = match snap.downcast() { + Ok(s) => *s, + Err(t) => unreachable!("snapshot type should be the same: {:?}", t), + }; + callback.invoke_read(raftstore::store::ReadResponse { + response: Default::default(), + snapshot: Some(snapshot), + txn_extra_op: ExtraOp::Noop, + }) + } else { + callback.invoke_read(raftstore::store::ReadResponse { + response: resp.clone(), + snapshot: None, + 
txn_extra_op: ExtraOp::Noop, + }); + } + })); + if let Err(SendError(msg)) = self.router.force_send( + region_id, + PeerMsg::CaptureChange(CaptureChange { + observer, + region_epoch, + snap_cb, + }), + ) { + warn!(self.router.logger(), "failed to send capture change msg"; "msg" => ?msg); + return Err(crate::Error::RegionNotFound(region_id)); + } + Ok(()) } fn check_leadership( &self, - _region_id: u64, - _callback: Callback, + region_id: u64, + callback: Callback, ) -> crate::Result<()> { - unimplemented!() + let (ch, _) = QueryResChannel::with_callback(Box::new(|res| { + let resp = match res { + QueryResult::Read(_) => raftstore::store::ReadResponse { + response: Default::default(), + snapshot: None, + txn_extra_op: ExtraOp::Noop, + }, + QueryResult::Response(resp) => raftstore::store::ReadResponse { + response: resp.clone(), + snapshot: None, + txn_extra_op: ExtraOp::Noop, + }, + }; + callback.invoke_read(resp); + })); + if let Err(SendError(msg)) = self + .router + .force_send(region_id, PeerMsg::LeaderCallback(ch)) + { + warn!(self.router.logger(), "failed to send capture change msg"; "msg" => ?msg); + return Err(crate::Error::RegionNotFound(region_id)); + } + Ok(()) } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 764e8df7dfd..6c8d1136b3a 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -3,6 +3,7 @@ use pd_client::{BucketMeta, BucketStat}; use raftstore::store::fsm::ApplyMetrics; +use super::message::CaptureChange; use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; #[derive(Debug)] @@ -13,6 +14,7 @@ pub enum ApplyTask { UnsafeWrite(Box<[u8]>), ManualFlush, RefreshBucketStat(std::sync::Arc), + CaptureApply(CaptureChange), } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 
26fbde3644a..43dfab3ba98 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -9,13 +9,16 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, raft_serverpb::RaftMessage, }; -use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; +use raftstore::store::{ + fsm::ChangeObserver, metrics::RaftEventDurationType, FetchedLogs, GenSnapRes, +}; use resource_control::ResourceMetered; use tikv_util::time::Instant; use super::{ response_channel::{ - CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, QueryResSubscriber, + AnyResChannel, CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, + QueryResSubscriber, }, ApplyRes, }; @@ -131,6 +134,14 @@ pub struct UnsafeWrite { pub data: SimpleWriteBinary, } +#[derive(Debug)] +pub struct CaptureChange { + pub observer: ChangeObserver, + pub region_epoch: RegionEpoch, + // A callback accpets a snapshot. + pub snap_cb: AnyResChannel, +} + /// Message that can be sent to a peer. #[derive(Debug)] pub enum PeerMsg { @@ -220,6 +231,9 @@ pub enum PeerMsg { RedirectCatchUpLogs(CatchUpLogs), // From target [`Peer`] to source [`Peer`]. CatchUpLogs(CatchUpLogs), + /// Capture changes of a region. + CaptureChange(CaptureChange), + LeaderCallback(QueryResChannel), /// A message that used to check if a flush is happened. 
#[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index 703f38c3516..2d0011c1ef0 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -15,8 +15,8 @@ pub use self::{ internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ - BaseSubscriber, CmdResChannel, CmdResChannelBuilder, CmdResEvent, CmdResStream, - CmdResSubscriber, DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, - ReadResponse, + build_any_channel, AnyResChannel, AnyResSubscriber, BaseSubscriber, CmdResChannel, + CmdResChannelBuilder, CmdResEvent, CmdResStream, CmdResSubscriber, DebugInfoChannel, + DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, }, }; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index f70b6635982..97321aae9d1 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -13,6 +13,7 @@ //! 4. there will be no callback leak. 
use std::{ + any::Any, cell::UnsafeCell, fmt::{self, Debug, Formatter}, future::Future, @@ -471,6 +472,36 @@ impl CmdResChannelBuilder { } } +pub type AnyResChannel = BaseChannel<(RaftCmdResponse, Option>)>; + +impl Debug for AnyResChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "AnyResChannel") + } +} + +impl ErrorCallback for AnyResChannel { + fn report_error(self, err: RaftCmdResponse) { + self.set_result((err, None)); + } + + fn is_none(&self) -> bool { + false + } +} + +pub type AnyResSubscriber = BaseSubscriber<(RaftCmdResponse, Option>)>; + +pub fn build_any_channel( + f: Box>)) + Send>, +) -> (AnyResChannel, AnyResSubscriber) { + let (c, s) = pair(); + unsafe { + *c.core.before_set.get() = Some(f); + } + (c, s) +} + impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; @@ -585,6 +616,16 @@ impl QueryResChannel { pub fn pair() -> (Self, QueryResSubscriber) { pair() } + + pub fn with_callback( + f: Box, + ) -> (Self, QueryResSubscriber) { + let (c, s) = pair(); + unsafe { + *c.core.before_set.get() = Some(f); + } + (c, s) + } } impl ErrorCallback for QueryResChannel { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 16a8bacbced..54ca2274162 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -3651,14 +3651,14 @@ impl Debug for GenSnapTask { } #[derive(Debug)] -enum ObserverType { +pub enum ObserverType { Cdc(ObserveHandle), Rts(ObserveHandle), Pitr(ObserveHandle), } impl ObserverType { - fn handle(&self) -> &ObserveHandle { + pub fn handle(&self) -> &ObserveHandle { match self { ObserverType::Cdc(h) => h, ObserverType::Rts(h) => h, @@ -3669,8 +3669,8 @@ impl ObserverType { #[derive(Debug)] pub struct ChangeObserver { - ty: ObserverType, - region_id: u64, + pub ty: ObserverType, + pub region_id: u64, } impl ChangeObserver { diff --git a/components/raftstore/src/store/fsm/mod.rs 
b/components/raftstore/src/store/fsm/mod.rs index ffba120056c..6f51c97c0d5 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -17,7 +17,9 @@ pub use self::{ Registration, SwitchWitness, TaskRes as ApplyTaskRes, }, metrics::{GlobalStoreStat, LocalStoreStat}, - peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, + peer::{ + new_admin_request, new_read_index_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO, + }, store::{ create_raft_batch_system, RaftBatchSystem, RaftPollerBuilder, RaftRouter, StoreInfo, StoreMeta, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 935210951f0..c36e9880694 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -507,7 +507,7 @@ where store_id: u64, group_id: u64, }, - /// Capture the changes of the region. + /// Capture changes of a region. CaptureChange { cmd: ChangeObserver, region_epoch: RegionEpoch, From 51b56135ed3be0f827e54564a9e621b34fb45938 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 14 Apr 2023 16:45:01 +0800 Subject: [PATCH 631/676] Move simple write into raftstore (#14544) ref tikv/tikv#14575 Move simple write into raftstore Signed-off-by: CalvinNeo --- .../operation/command/admin/merge/prepare.rs | 3 +- .../raftstore-v2/src/operation/command/mod.rs | 17 +++-- .../src/operation/command/write/mod.rs | 8 +-- components/raftstore-v2/src/router/message.rs | 17 ++--- components/raftstore/src/store/mod.rs | 1 + .../src/store}/simple_write.rs | 71 +++++++++++++------ 6 files changed, 77 insertions(+), 40 deletions(-) rename components/{raftstore-v2/src/operation/command/write => raftstore/src/store}/simple_write.rs (89%) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index f031ac5d20e..16a8382cfad 100644 --- 
a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -52,7 +52,7 @@ use super::merge_source_path; use crate::{ batch::StoreContext, fsm::ApplyResReporter, - operation::{AdminCmdResult, SimpleWriteReqDecoder}, + operation::{command::parse_at, AdminCmdResult, SimpleWriteReqDecoder}, raft::{Apply, Peer}, router::CmdResChannel, }; @@ -248,6 +248,7 @@ impl Peer { continue; } let Err(cmd) = SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), &self.logger, entry.get_data(), entry.get_index(), diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 9ef5592c64e..b45ad23a1b1 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -26,7 +26,6 @@ use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions} use kvproto::raft_cmdpb::{ AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, }; -use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; use raft_proto::ConfChangeI; use raftstore::{ @@ -69,9 +68,10 @@ pub use admin::{ }; pub use control::ProposalControl; use pd_client::{BucketMeta, BucketStat}; -pub use write::{ - SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, -}; +use protobuf::Message; +pub use write::{SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder}; +pub type SimpleWriteReqEncoder = + raftstore::store::simple_write::SimpleWriteReqEncoder; use self::write::SimpleWrite; @@ -463,7 +463,13 @@ impl Apply { impl Apply { pub fn apply_unsafe_write(&mut self, data: Box<[u8]>) { - let decoder = match SimpleWriteReqDecoder::new(&self.logger, &data, u64::MAX, u64::MAX) { + let decoder = match SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), + 
&self.logger, + &data, + u64::MAX, + u64::MAX, + ) { Ok(decoder) => decoder, Err(req) => unreachable!("unexpected request: {:?}", req), }; @@ -556,6 +562,7 @@ impl Apply { let log_index = entry.get_index(); let req = match entry.get_entry_type() { EntryType::EntryNormal => match SimpleWriteReqDecoder::new( + |buf, index, term| parse_at(&self.logger, buf, index, term), &self.logger, entry.get_data(), log_index, diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index b017a7b0ef7..9f4afec9ad6 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -17,19 +17,17 @@ use tikv_util::slog_panic; use crate::{ batch::StoreContext, fsm::ApplyResReporter, + operation::SimpleWriteReqEncoder, raft::{Apply, Peer}, router::{ApplyTask, CmdResChannel}, }; mod ingest; -mod simple_write; -pub use simple_write::{ - SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, +pub use raftstore::store::simple_write::{ + SimpleWrite, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, }; -pub use self::simple_write::SimpleWrite; - impl Peer { #[inline] pub fn on_simple_write( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 43dfab3ba98..3f761c74f94 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -10,19 +10,20 @@ use kvproto::{ raft_serverpb::RaftMessage, }; use raftstore::store::{ - fsm::ChangeObserver, metrics::RaftEventDurationType, FetchedLogs, GenSnapRes, + fsm::ChangeObserver, metrics::RaftEventDurationType, simple_write::SimpleWriteBinary, + FetchedLogs, GenSnapRes, }; use resource_control::ResourceMetered; use tikv_util::time::Instant; -use super::{ - response_channel::{ - AnyResChannel, CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, - 
QueryResSubscriber, - }, - ApplyRes, +use super::response_channel::{ + AnyResChannel, CmdResChannel, CmdResSubscriber, DebugInfoChannel, QueryResChannel, + QueryResSubscriber, +}; +use crate::{ + operation::{CatchUpLogs, RequestHalfSplit, RequestSplit, SplitInit}, + router::ApplyRes, }; -use crate::operation::{CatchUpLogs, RequestHalfSplit, RequestSplit, SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index fe3c12427bd..c007b622ee1 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -22,6 +22,7 @@ mod hibernate_state; mod peer_storage; mod region_snapshot; mod replication_mode; +pub mod simple_write; pub mod snap; mod txn_ext; mod worker; diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore/src/store/simple_write.rs similarity index 89% rename from components/raftstore-v2/src/operation/command/write/simple_write.rs rename to components/raftstore/src/store/simple_write.rs index 5f72fa62738..cdae8f18c97 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -8,11 +8,10 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, }; use protobuf::{CodedInputStream, Message}; -use raftstore::store::WriteCallback; use slog::Logger; use tikv_util::slog_panic; -use crate::{operation::command::parse_at, router::CmdResChannel}; +use crate::store::{msg::ErrorCallback, WriteCallback}; // MAGIC number to hint simple write codec is used. If it's a protobuf message, // the first one or several bytes are for field tag, which can't be zero. @@ -38,19 +37,25 @@ impl SimpleWriteBinary { } /// We usually use `RaftCmdRequest` for read write request. But the codec is -/// not efficient enough for simple request. 
`SimpleWrite` is introduce to make -/// codec alloc less and fast. +/// not efficient enough for simple request. `SimpleWrite` is introduce to +/// make codec alloc less and fast. #[derive(Debug)] -pub struct SimpleWriteReqEncoder { +pub struct SimpleWriteReqEncoder +where + C: ErrorCallback + WriteCallback, +{ header: Box, buf: Vec, - channels: Vec, + channels: Vec, size_limit: usize, write_type: WriteType, notify_proposed: bool, } -impl SimpleWriteReqEncoder { +impl SimpleWriteReqEncoder +where + C: ErrorCallback + WriteCallback, +{ /// Create a request encoder. /// /// If `notify_proposed` is true, channels will be called `notify_proposed` @@ -60,7 +65,7 @@ impl SimpleWriteReqEncoder { bin: SimpleWriteBinary, size_limit: usize, notify_proposed: bool, - ) -> SimpleWriteReqEncoder { + ) -> SimpleWriteReqEncoder { let mut buf = Vec::with_capacity(256); buf.push(MAGIC_PREFIX); header.write_length_delimited_to_vec(&mut buf).unwrap(); @@ -102,12 +107,12 @@ impl SimpleWriteReqEncoder { } #[inline] - pub fn encode(self) -> (Vec, Vec) { + pub fn encode(self) -> (Vec, Vec) { (self.buf, self.channels) } #[inline] - pub fn add_response_channel(&mut self, mut ch: CmdResChannel) { + pub fn add_response_channel(&mut self, mut ch: C) { if self.notify_proposed { ch.notify_proposed(); } @@ -239,6 +244,7 @@ pub struct SimpleWriteReqDecoder<'a> { impl<'a> SimpleWriteReqDecoder<'a> { pub fn new( + fallback: impl FnOnce(&'a [u8], u64, u64) -> RaftCmdRequest, logger: &Logger, buf: &'a [u8], index: u64, @@ -263,7 +269,7 @@ impl<'a> SimpleWriteReqDecoder<'a> { buf: &buf[1 + read as usize..], }) } - _ => Err(parse_at(logger, buf, index, term)), + _ => Err(fallback(buf, index, term)), } } @@ -479,6 +485,11 @@ mod tests { use slog::o; use super::*; + use crate::store::Callback; + + fn decoder_fallback(data: &[u8], index: u64, _: u64) -> RaftCmdRequest { + crate::store::util::parse_data_at(data, index, "") + } #[test] fn test_codec() { @@ -490,18 +501,29 @@ mod tests { let mut header = 
Box::::default(); header.set_term(2); - let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin, usize::MAX, false); + let mut req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + bin, + usize::MAX, + false, + ); let mut encoder = SimpleWriteEncoder::with_capacity(512); encoder.delete_range(CF_LOCK, b"key", b"key", true); encoder.delete_range("cf", b"key", b"key", false); let bin = encoder.encode(); assert!(!req_encoder.amend(&header, &bin)); - let req_encoder2 = SimpleWriteReqEncoder::new(header.clone(), bin, 0, false); + let req_encoder2 = SimpleWriteReqEncoder::>::new( + header.clone(), + bin, + 0, + false, + ); let (bytes, _) = req_encoder.encode(); let logger = slog_global::borrow_global().new(o!()); - let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); let SimpleWrite::Put(put) = write else { panic!("should be put") }; @@ -516,7 +538,7 @@ mod tests { assert_matches!(decoder.next(), None); let (bytes, _) = req_encoder2.encode(); - decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + decoder = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); let SimpleWrite::DeleteRange(dr) = write else { panic!("should be delete range") }; assert_eq!(dr.cf, CF_LOCK); @@ -544,9 +566,12 @@ mod tests { .collect(); encoder.ingest(exp.clone()); let bin = encoder.encode(); - let req_encoder = SimpleWriteReqEncoder::new(header, bin, 0, false); + let req_encoder = SimpleWriteReqEncoder::>::new( + header, bin, 0, false, + ); let (bytes, _) = req_encoder.encode(); - let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); let write = decoder.next().unwrap(); 
let SimpleWrite::Ingest(ssts) = write else { panic!("should be ingest") }; assert_eq!(exp, ssts); @@ -589,7 +614,8 @@ mod tests { raft_cmd.mut_requests().push(req); let bytes = raft_cmd.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); - let decoded = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); + let decoded = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap_err(); // SimpleWriteReqDecoder should be able to decode naive RaftCmdRequest. assert_eq!(decoded, raft_cmd); @@ -599,7 +625,8 @@ mod tests { let mut header = Box::::default(); header.set_term(2); - let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin.clone(), 512, false); + let mut req_encoder: SimpleWriteReqEncoder> = + SimpleWriteReqEncoder::new(header.clone(), bin.clone(), 512, false); let mut header2 = Box::::default(); header2.set_term(4); @@ -610,7 +637,8 @@ mod tests { bin2.freeze(); // Frozen bin can't be merged with other bin. assert!(!req_encoder.amend(&header, &bin2)); - let mut req_encoder2 = SimpleWriteReqEncoder::new(header.clone(), bin2.clone(), 512, false); + let mut req_encoder2: SimpleWriteReqEncoder> = + SimpleWriteReqEncoder::new(header.clone(), bin2.clone(), 512, false); assert!(!req_encoder2.amend(&header, &bin)); // Batch should not excceed max size limit. 
@@ -620,7 +648,8 @@ mod tests { assert!(!req_encoder.amend(&header, &encoder.encode())); let (bytes, _) = req_encoder.encode(); - let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + let mut decoder = + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bytes, 0, 0).unwrap(); assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); let SimpleWrite::Put(put) = req else { panic!("should be put") }; From a693d6305a87d22c67c661607d9004cceba14e13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 17 Apr 2023 10:03:17 +0800 Subject: [PATCH 632/676] importer: added grafana for point in time restore. (#14564) close tikv/tikv#14573 Signed-off-by: Yu Juncen Co-authored-by: Ti Chi Robot --- metrics/grafana/tikv_details.json | 1449 ++++++++++++++++++++++++++++- 1 file changed, 1448 insertions(+), 1 deletion(-) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 184ad7a756b..adb4aa34dcd 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -40951,6 +40951,1453 @@ "title": "Backup & Import", "type": "row" }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 23763573235, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 45 + }, + "hiddenSeries": false, + "id": 23763573350, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + 
"pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sst_.*\"}[1m])) by (instance)", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + }, + { + "hide": false, + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "(AP)apply-99": "#88509f", + "(AP)get_permit-99": "#922870", + "(AP)queuing-99": "#9d0041", + "(DL)exec_download-99": "#73a0fe", + "(DL)queue-99": "#7d78ce", + "exec_download-99": "light-orange", + "get_permit-99": "red", + "queuing-99": "blue", + "total-99": "rgb(252, 252, 252)" + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 9, + "x": 6, + "y": 45 + }, + "hiddenSeries": false, + "id": 23763573351, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + 
"alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2473", + "alias": "total-99", + "bars": false, + "fill": 2, + "lines": true, + "linewidth": 0, + "stack": false, + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.99, avg(rate(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[1m])) by (le, request))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "total-99", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, avg(rate(tikv_import_apply_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=~\"queue|exec_download\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "legendFormat": "(DL){{type}}-99", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, avg(rate(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "hide": false, + "interval": "", + "legendFormat": "(AP){{type}}-99", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "P99 RPC Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2453", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2454", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "decimals": 1, + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 9, + "x": 15, + "y": 45 + }, + "hiddenSeries": false, + "id": 23763573352, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[$__rate_interval])) by (instance, request)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}} :: {{request}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(rate(tikv_import_rpc_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request!=\"switch_mode\"}[30s])) by (request)", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "legendFormat": "total - {{request}}", + "metric": "tikv_grpc_msg_duration_seconds_bucket", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Import RPC Ops", + "tooltip": { + "shared": 
true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "unit": "cps" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 52 + }, + "hiddenSeries": false, + "id": 23763573032, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tikv_import_apply_cache_event{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (instance, type)", + "interval": "", + "legendFormat": "{{instance}} :: {{type}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cache Events", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "cps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573348, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_rpc_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", request=\"apply\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Overall RPC Duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": 
true, + "highlightCards": true, + "id": 23763573558, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_apply_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"exec_download\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Read File into Memory Duration", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#37872D", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 52 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573229, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"queuing\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Queuing Time", + "tooltip": { + "show": true, + "showHistogram": true + }, + 
"type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 23763573349, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(tikv_import_apply_bytes_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Apply Request Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1486", + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1487", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 60 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573344, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_download_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Downloaded File Size", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "decbytes", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolatePurples", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 60 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573233, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + 
"reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_apply_bytes_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Apply Batch Size", + "tooltip": { + "show": true, + "showHistogram": true + }, + "tooltipDecimals": null, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "decbytes", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 60 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573230, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"get_permit\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Blocked by Concurrency Time", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + 
"yAxis": { + "decimals": null, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 66 + }, + "hiddenSeries": false, + "id": 23763573118, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "rate(tikv_import_applier_event{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"begin_req\"}[$__rate_interval])", + "hide": false, + "interval": "", + "legendFormat": "{{instance}} :: {{type}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Apply Request Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2886", + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2887", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": { + "unit": "decbytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 68 + }, + "hiddenSeries": false, + "id": 23763573346, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "tikv_import_apply_cached_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}", + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cached File in Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 12, + "y": 68 + }, + "hiddenSeries": false, + 
"id": 23763573119, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "increase(tikv_import_applier_event{instance=~\"$instance\", type!=\"begin_req\"}[$__rate_interval])", + "interval": "", + "intervalFactor": 3, + "legendFormat": "{{instance}} :: {{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Engine Requests Unfinished", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:304", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:305", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + "cardPadding": null, + "cardRound": 2 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 68 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763573231, + "legend": { + "show": false + }, + "pluginVersion": "7.5.11", + "reverseYBuckets": false, + "targets": 
[ + { + "exemplar": true, + "expr": "sum(increase(tikv_import_engine_request_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"apply\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "title": "Apply Time", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 71 + }, + "hiddenSeries": false, + "id": 23763573449, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.11", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(tikv_server_mem_trace_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore-.*\"}) by (instance)", + "hide": false, + "interval": "", + "legendFormat": "{{instance}}", + "queryType": "randomWalk", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": 
"Raft Store Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2886", + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2887", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Point In Time Restore", + "type": "row" + }, { "collapsed": true, "datasource": null, @@ -44799,7 +46246,7 @@ "h": 1, "w": 24, "x": 0, - "y": 48 + "y": 49 }, "id": 4466, "panels": [ From 58986c7725efda7276bd796e9c04a5c79ead84c9 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Mon, 17 Apr 2023 10:19:17 +0800 Subject: [PATCH 633/676] txn: Let commit rollback pessimistic lock instead of committing as WriteType::Lock (#14557) close tikv/tikv#14551 Changes the behavior of `commit` meeting pessimistic lock, from committing as WriteType::Lock to rolling-back. It's correct considering that the key is nolonger part of that transaction. This change fixes the problem that stale pessimistic lock requests with force-locking enabled (which is used by TiDB in fair-locking mode) may overwrite the commit record of another transaction and cause data loss. 
Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- src/storage/mvcc/txn.rs | 2 +- src/storage/txn/actions/commit.rs | 70 ++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index f395b07e7f8..d5e55e251ae 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -1238,7 +1238,7 @@ pub(crate) mod tests { must_acquire_pessimistic_lock(&mut engine, k, k, 10, 10); must_commit_err(&mut engine, k, 20, 30); must_commit(&mut engine, k, 10, 20); - must_seek_write(&mut engine, k, 30, 10, 20, WriteType::Lock); + must_seek_write_none(&mut engine, k, 30); } #[test] diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index bfb1d39f768..8259991dde6 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -21,8 +21,8 @@ pub fn commit( crate::storage::mvcc::txn::make_txn_error(err, &key, reader.start_ts,).into() )); - let mut lock = match reader.load_lock(&key)? { - Some(mut lock) if lock.ts == reader.start_ts => { + let (mut lock, commit) = match reader.load_lock(&key)? { + Some(lock) if lock.ts == reader.start_ts => { // A lock with larger min_commit_ts than current commit_ts can't be committed if commit_ts < lock.min_commit_ts { info!( @@ -43,20 +43,21 @@ pub fn commit( // It's an abnormal routine since pessimistic locks shouldn't be committed in // our transaction model. But a pessimistic lock will be left if the pessimistic - // rollback request fails to send and the transaction need not to acquire this - // lock again(due to WriteConflict). If the transaction is committed, we should - // commit this pessimistic lock too. + // rollback request fails to send or TiKV receives duplicated stale pessimistic + // lock request, and the transaction need not to acquire this lock again(due to + // WriteConflict). 
If the transaction is committed, we should remove the + // pessimistic lock (like pessimistic_rollback) instead of committing. if lock.lock_type == LockType::Pessimistic { warn!( - "commit a pessimistic lock with Lock type"; + "rollback a pessimistic lock when trying to commit"; "key" => %key, "start_ts" => reader.start_ts, "commit_ts" => commit_ts, ); - // Commit with WriteType::Lock. - lock.lock_type = LockType::Lock; + (lock, false) + } else { + (lock, true) } - lock } _ => { return match reader.get_txn_commit_record(&key)?.info() { @@ -87,6 +88,14 @@ pub fn commit( }; } }; + + if !commit { + // Rollback a stale pessimistic lock. This function must be called by + // resolve-lock in this case. + assert_eq!(lock.lock_type, LockType::Pessimistic); + return Ok(txn.unlock_key(key, lock.is_pessimistic_txn(), TimeStamp::zero())); + } + let mut write = Write::new( WriteType::from_lock_type(lock.lock_type).unwrap(), reader.start_ts, @@ -123,7 +132,10 @@ pub mod tests { }; #[cfg(test)] use crate::storage::{ - mvcc::SHORT_VALUE_MAX_LEN, txn::commands::check_txn_status, TestEngineBuilder, TxnStatus, + mvcc::SHORT_VALUE_MAX_LEN, + txn::commands::check_txn_status, + txn::tests::{must_acquire_pessimistic_lock, must_pessimistic_prewrite_put}, + TestEngineBuilder, TxnStatus, }; use crate::storage::{ mvcc::{tests::*, MvccTxn}, @@ -135,8 +147,8 @@ pub mod tests { key: &[u8], start_ts: impl Into, commit_ts: impl Into, - ) { - must_succeed_impl(engine, key, start_ts, commit_ts, None); + ) -> Option { + must_succeed_impl(engine, key, start_ts, commit_ts, None) } pub fn must_succeed_on_region( @@ -145,8 +157,8 @@ pub mod tests { key: &[u8], start_ts: impl Into, commit_ts: impl Into, - ) { - must_succeed_impl(engine, key, start_ts, commit_ts, Some(region_id)); + ) -> Option { + must_succeed_impl(engine, key, start_ts, commit_ts, Some(region_id)) } fn must_succeed_impl( @@ -155,7 +167,7 @@ pub mod tests { start_ts: impl Into, commit_ts: impl Into, region_id: Option, - ) { + ) -> Option 
{ let mut ctx = Context::default(); if let Some(region_id) = region_id { ctx.region_id = region_id; @@ -169,8 +181,9 @@ pub mod tests { let cm = ConcurrencyManager::new(start_ts); let mut txn = MvccTxn::new(start_ts, cm); let mut reader = SnapshotReader::new(start_ts, snapshot, true); - commit(&mut txn, &mut reader, Key::from_raw(key), commit_ts.into()).unwrap(); + let res = commit(&mut txn, &mut reader, Key::from_raw(key), commit_ts.into()).unwrap(); write(engine, &ctx, txn.into_modifies()); + res } pub fn must_err( @@ -368,4 +381,29 @@ pub mod tests { assert_eq!(write.txn_source, source); } } + + #[test] + fn test_commit_rollback_pessimistic_lock() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k1 = b"k1"; + let k2 = b"k2"; + + must_acquire_pessimistic_lock(&mut engine, k1, k1, 10, 10); + must_acquire_pessimistic_lock(&mut engine, k2, k1, 10, 10); + must_pessimistic_prewrite_put(&mut engine, k1, b"v1", k1, 10, 10, DoPessimisticCheck); + let res = must_succeed(&mut engine, k1, 10, 20).unwrap(); + assert_eq!(res.key, Key::from_raw(k1)); + assert_eq!(res.start_ts, 10.into()); + assert_eq!(res.commit_ts, 20.into()); + + let res = must_succeed(&mut engine, k2, 10, 20).unwrap(); + assert_eq!(res.key, Key::from_raw(k2)); + assert_eq!(res.start_ts, 10.into()); + assert_eq!(res.commit_ts, 0.into()); + + must_written(&mut engine, k1, 10, 20, WriteType::Put); + must_not_have_write(&mut engine, k2, 20); + must_not_have_write(&mut engine, k2, 10); + } } From 3bf312166e3e47d5c0755a0ba141aa252df5b7ff Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 17 Apr 2023 13:31:18 +0800 Subject: [PATCH 634/676] raftstore: enable v1 to receive snapshot from v2 (#14559) ref tikv/tikv#14579 enable v1 to receive snapshot from v2 Signed-off-by: Spade A Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/mod.rs | 4 +- .../raftstore/src/store/peer_storage.rs | 4 + components/raftstore/src/store/snap.rs | 50 
+++++++- components/raftstore/src/store/worker/mod.rs | 2 + .../raftstore/src/store/worker/region.rs | 5 +- components/test_raftstore-v2/src/cluster.rs | 11 +- components/test_raftstore-v2/src/node.rs | 9 +- components/test_raftstore-v2/src/server.rs | 12 ++ components/test_raftstore/src/server.rs | 4 + src/server/server.rs | 11 +- src/server/snap.rs | 87 ++++++++++--- src/server/tablet_snap.rs | 20 +-- tests/integrations/config/dynamic/snap.rs | 1 + tests/integrations/raftstore/test_snap.rs | 120 +++++++++++++++++- 14 files changed, 296 insertions(+), 44 deletions(-) diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index c007b622ee1..ed97c58ab86 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -83,7 +83,7 @@ pub use self::{ LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, - StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, - NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, ENGINE, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, TIFLASH, }, }; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 6ac38b60dfe..d89eafc3a46 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -1629,6 +1629,7 @@ pub mod tests { let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let mut worker = Worker::new("region-worker").lazy_build("region-worker"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); @@ -1765,6 
+1766,7 @@ pub mod tests { let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mut mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); mgr.set_enable_multi_snapshot_files(true); mgr.set_max_per_file_size(500); let mut worker = Worker::new("region-worker").lazy_build("region-worker"); @@ -1836,6 +1838,7 @@ pub mod tests { let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let mut worker = Worker::new("region-worker").lazy_build("region-worker"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); @@ -1915,6 +1918,7 @@ pub mod tests { let td1 = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); let snap_dir = Builder::new().prefix("snap").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let mut worker = LazyWorker::new("snap-manager"); let sched = worker.scheduler(); let (dummy_scheduler, _) = dummy_scheduler(); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index d0c55c144ed..bdf96126dd2 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1387,6 +1387,9 @@ struct SnapManagerCore { pub struct SnapManager { core: SnapManagerCore, max_total_size: Arc, + + // only used to receive snapshot from v2 + tablet_snap_manager: TabletSnapManager, } impl Clone for SnapManager { @@ -1394,6 +1397,7 @@ impl Clone for SnapManager { SnapManager { core: self.core.clone(), max_total_size: self.max_total_size.clone(), + tablet_snap_manager: self.tablet_snap_manager.clone(), } } } @@ -1433,6 +1437,8 @@ impl SnapManager { } } } + + self.tablet_snap_manager.init()?; Ok(()) } @@ -1620,7 +1626,9 @@ impl SnapManager { 
/// /// NOTE: don't call it in raftstore thread. pub fn get_total_snap_size(&self) -> Result { - self.core.get_total_snap_size() + let size_v1 = self.core.get_total_snap_size()?; + let size_v2 = self.tablet_snap_manager.total_snap_size()?; + Ok(size_v1 + size_v2) } pub fn max_total_snap_size(&self) -> u64 { @@ -1755,6 +1763,14 @@ impl SnapManager { pub fn delete_snapshot(&self, key: &SnapKey, snap: &Snapshot, check_entry: bool) -> bool { self.core.delete_snapshot(key, snap, check_entry) } + + pub fn tablet_snap_manager(&self) -> &TabletSnapManager { + &self.tablet_snap_manager + } + + pub fn limiter(&self) -> &Limiter { + &self.core.limiter + } } impl SnapManagerCore { @@ -1896,9 +1912,14 @@ impl SnapManagerBuilder { } else { u64::MAX }; + let path = path.into(); + let mut path_v2 = path.clone(); + // the path for tablet snap manager, it will be empty if the cluster is not + // to receive snapshot from cluster of raftstore-v2 + path_v2.push_str("_v2"); let mut snapshot = SnapManager { core: SnapManagerCore { - base: path.into(), + base: path, registry: Default::default(), limiter, temp_sst_id: Arc::new(AtomicU64::new(0)), @@ -1910,6 +1931,7 @@ impl SnapManagerBuilder { stats: Default::default(), }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), + tablet_snap_manager: TabletSnapManager::new_without_init(&path_v2), }; snapshot.set_max_per_file_size(self.max_per_file_size); // set actual max_per_file_size snapshot @@ -1999,6 +2021,29 @@ impl TabletSnapManager { }) } + pub fn new_without_init>(path: T) -> Self { + let path = path.into(); + Self { + base: path, + receiving: Arc::default(), + stats: Arc::default(), + } + } + + pub fn init(&self) -> io::Result<()> { + if !self.base.exists() { + file_system::create_dir_all(&self.base)?; + } + if !self.base.is_dir() { + return Err(io::Error::new( + ErrorKind::Other, + format!("{} should be a directory", self.base.display()), + )); + } + file_system::clean_up_trash(&self.base)?; + Ok(()) + } + pub fn 
begin_snapshot(&self, key: TabletSnapKey, start: Instant, generate_duration_sec: u64) { let mut stat = SnapshotStat::default(); stat.set_generate_duration_sec(generate_duration_sec); @@ -3015,6 +3060,7 @@ pub mod tests { let snap_mgr = SnapManagerBuilder::default() .max_total_size(max_total_size) .build::<_>(snapfiles_path.path().to_str().unwrap()); + snap_mgr.init().unwrap(); let snapshot = engine.kv.snapshot(); // Add an oldest snapshot for receiving. diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index ac23f4e58d5..eddcfe1757a 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -16,6 +16,8 @@ mod split_check; mod split_config; mod split_controller; +pub use region::{ENGINE, TIFLASH}; + #[cfg(test)] pub use self::region::tests::make_raftstore_cfg as make_region_worker_raftstore_cfg; pub use self::{ diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 7dc894204ec..d6d9d0272d3 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -52,8 +52,8 @@ use crate::{ const CLEANUP_MAX_REGION_COUNT: usize = 64; -const TIFLASH: &str = "tiflash"; -const ENGINE: &str = "engine"; +pub const TIFLASH: &str = "tiflash"; +pub const ENGINE: &str = "engine"; /// Region related task #[derive(Debug)] @@ -1143,6 +1143,7 @@ pub(crate) mod tests { let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + mgr.init().unwrap(); let bg_worker = Worker::new("snap-manager"); let mut worker = bg_worker.lazy_build("snap-manager"); let sched = worker.scheduler(); diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index 015062534e4..eafa7a45403 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ 
b/components/test_raftstore-v2/src/cluster.rs @@ -36,7 +36,7 @@ use pd_client::PdClient; use raftstore::{ store::{ cmd_resp, initial_region, util::check_key_in_region, Bucket, BucketRange, Callback, - RegionSnapshot, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, + RegionSnapshot, TabletSnapManager, WriteResponse, INIT_EPOCH_CONF_VER, INIT_EPOCH_VER, }, Error, Result, }; @@ -97,6 +97,7 @@ pub trait Simulator { fn get_router(&self, node_id: u64) -> Option>; fn get_snap_dir(&self, node_id: u64) -> String; + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager; fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; fn read(&mut self, request: RaftCmdRequest, timeout: Duration) -> Result { @@ -1530,6 +1531,10 @@ impl, EK: KvEngine> Cluster { self.sim.rl().get_snap_dir(node_id) } + pub fn get_snap_mgr(&self, node_id: u64) -> TabletSnapManager { + self.sim.rl().get_snap_mgr(node_id).clone() + } + pub fn get_router(&self, node_id: u64) -> Option> { self.sim.rl().get_router(node_id) } @@ -1696,6 +1701,10 @@ impl WrapFactory { let region_id = self.region_id_of_key(key); self.tablet_registry.get(region_id)?.latest().cloned() } + + pub fn get_tablet_by_id(&self, id: u64) -> Option { + self.tablet_registry.get(id)?.latest().cloned() + } } impl Peekable for WrapFactory { diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index 0e96d976449..c770a6144bd 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -150,7 +150,7 @@ pub struct NodeCluster { nodes: HashMap>, simulate_trans: HashMap>, concurrency_managers: HashMap, - // snap_mgrs: HashMap, + snap_mgrs: HashMap, } impl NodeCluster { @@ -161,7 +161,7 @@ impl NodeCluster { nodes: HashMap::default(), simulate_trans: HashMap::default(), concurrency_managers: HashMap::default(), - // snap_mgrs: HashMap::default(), + snap_mgrs: HashMap::default(), } } } @@ -237,6 +237,7 @@ impl Simulator for NodeCluster { let &(ref 
snap_mgr, _) = &trans.snap_paths[&node_id]; (snap_mgr.clone(), None) }; + self.snap_mgrs.insert(node_id, snap_mgr.clone()); let raft_router = RaftRouter::new_with_store_meta(node.router().clone(), store_meta); // Create coprocessor. @@ -421,6 +422,10 @@ impl Simulator for NodeCluster { .to_owned() } + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager { + self.snap_mgrs.get(&node_id).unwrap() + } + fn add_recv_filter(&mut self, node_id: u64, filter: Box) { let mut trans = self.trans.core.lock().unwrap(); trans.routers.get_mut(&node_id).unwrap().add_filter(filter); diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 804a5e4a22f..3de9e5aa956 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -805,6 +805,10 @@ impl Simulator for ServerCluster { .unwrap() .to_owned() } + + fn get_snap_mgr(&self, node_id: u64) -> &TabletSnapManager { + self.snap_mgrs.get(&node_id).unwrap() + } } impl Cluster, EK> { @@ -833,6 +837,14 @@ impl Cluster, EK> { } panic!("failed to get snapshot of region {}", region_id); } + + pub fn get_addr(&self, node_id: u64) -> String { + self.sim.rl().get_addr(node_id) + } + + pub fn get_security_mgr(&self) -> Arc { + self.sim.rl().security_mgr.clone() + } } pub fn new_server_cluster( diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 967ae4b980c..a77fc5d3dd2 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -809,6 +809,10 @@ impl Cluster { pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { self.sim.rl().storages[&node_id].raft_extension() } + + pub fn get_addr(&self, node_id: u64) -> String { + self.sim.rl().get_addr(node_id) + } } pub fn new_server_cluster(id: u64, count: usize) -> Cluster { diff --git a/src/server/server.rs b/src/server/server.rs index 15de7f0d4e7..8e1a33880d6 100644 --- a/src/server/server.rs +++ 
b/src/server/server.rs @@ -13,7 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager}; +use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager, ENGINE, TIFLASH}; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -70,6 +70,7 @@ pub struct Server { // For sending/receiving snapshots. snap_mgr: Either, snap_worker: LazyWorker, + tiflash_engine: bool, // Currently load statistics is done in the thread. stats_pool: Option, @@ -178,6 +179,12 @@ where let trans = ServerTransport::new(raft_client); health_service.set_serving_status("", ServingStatus::NotServing); + let tiflash_engine = cfg + .value() + .labels + .iter() + .any(|entry| entry.0 == ENGINE && entry.1 == TIFLASH); + let svr = Server { env: Arc::clone(&env), builder_or_server: Some(builder), @@ -193,6 +200,7 @@ where debug_thread_pool, health_service, timer: GLOBAL_TIMER_HANDLE.clone(), + tiflash_engine, }; Ok(svr) @@ -262,6 +270,7 @@ where self.raft_router.clone(), security_mgr, cfg, + self.tiflash_engine, ); self.snap_worker.start(snap_runner); } diff --git a/src/server/snap.rs b/src/server/snap.rs index d06e49ab7a8..0512a75214a 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -13,11 +13,12 @@ use std::{ use file_system::{IoType, WithIoType}; use futures::{ - future::{Future, FutureExt, TryFutureExt}, + future::{Future, TryFutureExt}, sink::SinkExt, stream::{Stream, StreamExt, TryStreamExt}, task::{Context, Poll}, }; +use futures_util::FutureExt; use grpcio::{ ChannelBuilder, ClientStreamingSink, DuplexSink, Environment, RequestStream, RpcStatus, RpcStatusCode, WriteFlags, @@ -43,7 +44,7 @@ use tikv_util::{ use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; use super::{metrics::*, 
Config, Error, Result}; -use crate::tikv_util::sys::thread::ThreadBuildWrapper; +use crate::{server::tablet_snap::NoSnapshotCache, tikv_util::sys::thread::ThreadBuildWrapper}; pub type Callback = Box) + Send>; @@ -342,7 +343,6 @@ fn recv_snap( } context.finish(raft_router) }; - async move { match recv_task.await { Ok(()) => sink.success(Done::default()).await.map_err(Error::from), @@ -364,17 +364,23 @@ pub struct Runner { cfg: Config, sending_count: Arc, recving_count: Arc, + can_receive_tablet_snapshot: bool, } impl Runner { + // `can_receive_tablet_snapshot` being true means we are using tiflash engine + // within a raft group with raftstore-v2. It is set be true to enable runner + // to receive tablet snapshot from v2. pub fn new( env: Arc, snap_mgr: SnapManager, r: R, security_mgr: Arc, cfg: Arc>, + can_receive_tablet_snapshot: bool, ) -> Self { let cfg_tracker = cfg.clone().tracker("snap-sender".to_owned()); + let config = cfg.value().clone(); let snap_worker = Runner { env, snap_mgr, @@ -388,9 +394,10 @@ impl Runner { raft_router: r, security_mgr, cfg_tracker, - cfg: cfg.value().clone(), + cfg: config, sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), + can_receive_tablet_snapshot, }; snap_worker } @@ -415,6 +422,22 @@ impl Runner { self.cfg = incoming.clone(); } } + + fn receiving_busy(&self) -> Option { + let task_num = self.recving_count.load(Ordering::SeqCst); + if task_num >= self.cfg.concurrent_recv_snap_limit { + warn!("too many recving snapshot tasks, ignore"); + return Some(RpcStatus::with_message( + RpcStatusCode::RESOURCE_EXHAUSTED, + format!( + "the number of received snapshot tasks {} exceeded the limitation {}", + task_num, self.cfg.concurrent_recv_snap_limit + ), + )); + } + + None + } } impl Runnable for Runner { @@ -423,19 +446,11 @@ impl Runnable for Runner { fn run(&mut self, task: Task) { match task { Task::Recv { stream, sink } => { - let task_num = self.recving_count.load(Ordering::SeqCst); - if 
task_num >= self.cfg.concurrent_recv_snap_limit { - warn!("too many recving snapshot tasks, ignore"); - let status = RpcStatus::with_message( - RpcStatusCode::RESOURCE_EXHAUSTED, - format!( - "the number of received snapshot tasks {} exceeded the limitation {}", - task_num, self.cfg.concurrent_recv_snap_limit - ), - ); + if let Some(status) = self.receiving_busy() { self.pool.spawn(sink.fail(status)); return; } + SNAP_TASK_COUNTER_STATIC.recv.inc(); let snap_mgr = self.snap_mgr.clone(); @@ -451,12 +466,44 @@ impl Runnable for Runner { }; self.pool.spawn(task); } - Task::RecvTablet { sink, .. } => { - let status = RpcStatus::with_message( - RpcStatusCode::UNIMPLEMENTED, - "tablet snap is not supported".to_string(), - ); - self.pool.spawn(sink.fail(status).map(|_| ())); + Task::RecvTablet { stream, sink } => { + if !self.can_receive_tablet_snapshot { + let status = RpcStatus::with_message( + RpcStatusCode::UNIMPLEMENTED, + "tablet snap is not supported".to_string(), + ); + self.pool.spawn(sink.fail(status).map(|_| ())); + return; + } + + if let Some(status) = self.receiving_busy() { + self.pool.spawn(sink.fail(status)); + return; + } + + SNAP_TASK_COUNTER_STATIC.recv.inc(); + + let snap_mgr = self.snap_mgr.tablet_snap_manager().clone(); + let raft_router = self.raft_router.clone(); + let recving_count = self.recving_count.clone(); + recving_count.fetch_add(1, Ordering::SeqCst); + let limiter = self.snap_mgr.limiter().clone(); + let task = async move { + let result = crate::server::tablet_snap::recv_snap( + stream, + sink, + snap_mgr, + raft_router, + NoSnapshotCache, // do not use cache in v1 + limiter, + ) + .await; + recving_count.fetch_sub(1, Ordering::SeqCst); + if let Err(e) = result { + error!("failed to recv snapshot"; "err" => %e); + } + }; + self.pool.spawn(task); } Task::Send { addr, msg, cb } => { fail_point!("send_snapshot"); diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 4524b8645ff..cbcd1a228f8 100644 --- 
a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -123,7 +123,7 @@ impl SnapCacheBuilder for NoSnapshotCache { } } -struct RecvTabletSnapContext<'a> { +pub(crate) struct RecvTabletSnapContext<'a> { key: TabletSnapKey, raft_msg: RaftMessage, use_cache: bool, @@ -134,7 +134,7 @@ struct RecvTabletSnapContext<'a> { } impl<'a> RecvTabletSnapContext<'a> { - fn new(mut head: TabletSnapshotRequest, mgr: &'a TabletSnapManager) -> Result { + pub(crate) fn new(mut head: TabletSnapshotRequest, mgr: &'a TabletSnapManager) -> Result { if !head.has_head() { return Err(box_err!("no raft message in the first chunk")); } @@ -161,7 +161,7 @@ impl<'a> RecvTabletSnapContext<'a> { }) } - fn finish(self, raft_router: R) -> Result<()> { + pub fn finish(self, raft_router: R) -> Result<()> { let key = self.key; raft_router.feed(self.raft_msg, true); info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); @@ -169,7 +169,7 @@ impl<'a> RecvTabletSnapContext<'a> { } } -fn io_type_from_raft_message(msg: &RaftMessage) -> Result { +pub(crate) fn io_type_from_raft_message(msg: &RaftMessage) -> Result { let snapshot = msg.get_message().get_snapshot(); let data = snapshot.get_data(); let mut snapshot_data = RaftSnapshotData::default(); @@ -194,7 +194,7 @@ fn protocol_error(exp: &str, act: impl Debug) -> Error { /// actual data of an SST; /// 3. The last `PREVIEW_CHUNK_LEN` bytes are the same, this contains checksum, /// properties and other medata of an SST. 
-async fn is_sst_match_preview( +pub(crate) async fn is_sst_match_preview( preview_meta: &TabletSnapshotFileMeta, target: &Path, buffer: &mut Vec, @@ -233,7 +233,7 @@ async fn is_sst_match_preview( Ok(*buffer == preview_meta.trailing_chunk) } -async fn cleanup_cache( +pub(crate) async fn cleanup_cache( path: &Path, stream: &mut (impl Stream> + Unpin), sink: &mut (impl Sink<(TabletSnapshotResponse, WriteFlags), Error = grpcio::Error> + Unpin), @@ -291,7 +291,7 @@ async fn cleanup_cache( Ok((reused, missing)) } -async fn accept_one_file( +pub(crate) async fn accept_one_file( path: &Path, mut chunk: TabletSnapshotFileChunk, stream: &mut (impl Stream> + Unpin), @@ -334,7 +334,7 @@ async fn accept_one_file( } } -async fn accept_missing( +pub(crate) async fn accept_missing( path: &Path, missing_ssts: Vec, stream: &mut (impl Stream> + Unpin), @@ -380,7 +380,7 @@ async fn accept_missing( } } -async fn recv_snap_files<'a>( +pub(crate) async fn recv_snap_files<'a>( snap_mgr: &'a TabletSnapManager, cache_builder: impl SnapCacheBuilder, mut stream: impl Stream> + Unpin, @@ -426,7 +426,7 @@ async fn recv_snap_files<'a>( Ok(context) } -async fn recv_snap( +pub(crate) async fn recv_snap( stream: RequestStream, sink: DuplexSink, snap_mgr: TabletSnapManager, diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index bb91d0d62eb..fa1d6a6fe52 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -65,6 +65,7 @@ fn start_server( RaftRouterWrap::new(raft_router), security_mgr, Arc::clone(&server_config), + false, ); snap_worker.start(snap_runner); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index fc0364c13b0..f3bd7583ab3 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -11,21 +11,26 @@ use std::{ time::Duration, }; -use engine_traits::{KvEngine, RaftEngineReadOnly}; +use 
engine_rocks::{RocksCfOptions, RocksDbOptions}; +use engine_traits::{Checkpointer, KvEngine, Peekable, RaftEngineReadOnly, SyncMutable, LARGE_CFS}; use file_system::{IoOp, IoType}; use futures::executor::block_on; use grpcio::Environment; use kvproto::raft_serverpb::*; use raft::eraftpb::{Message, MessageType, Snapshot}; -use raftstore::{store::*, Result}; +use raftstore::{ + store::{snap::TABLET_SNAPSHOT_VERSION, *}, + Result, +}; use rand::Rng; use security::SecurityManager; use test_raftstore::*; use test_raftstore_macro::test_case; -use tikv::server::snap::send_snap; +use test_raftstore_v2::WrapFactory; +use tikv::server::{snap::send_snap, tablet_snap::send_snap as send_snap_v2}; use tikv_util::{ config::*, - time::{Instant, UnixSecs}, + time::{Instant, Limiter, UnixSecs}, HandyRwLock, }; @@ -733,3 +738,110 @@ fn test_snapshot_clean_up_logs_with_log_gc() { // No new log is proposed, so there should be no log at all. assert!(dest.is_empty(), "{:?}", dest); } + +fn generate_snap( + engine: &WrapFactory, + region_id: u64, + snap_mgr: &TabletSnapManager, +) -> (RaftMessage, TabletSnapKey) { + let tablet = engine.get_tablet_by_id(region_id).unwrap(); + let region_state = engine.region_local_state(region_id).unwrap().unwrap(); + let apply_state = engine.raft_apply_state(region_id).unwrap().unwrap(); + + // Construct snapshot by hand + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().set_term(apply_state.commit_term); + snapshot.mut_metadata().set_index(apply_state.applied_index); + let conf_state = raftstore::store::util::conf_state_from_region(region_state.get_region()); + snapshot.mut_metadata().set_conf_state(conf_state); + + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region_state.get_region().clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + use protobuf::Message; + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + let snap_key = TabletSnapKey::from_region_snap(region_id, 1, &snapshot); + 
let checkpointer_path = snap_mgr.tablet_gen_path(&snap_key); + let mut checkpointer = tablet.new_checkpointer().unwrap(); + checkpointer + .create_at(checkpointer_path.as_path(), None, 0) + .unwrap(); + + let mut msg = RaftMessage::default(); + msg.region_id = region_id; + msg.set_to_peer(new_peer(1, 1)); + msg.mut_message().set_snapshot(snapshot); + msg.mut_message().set_msg_type(MessageType::MsgSnapshot); + msg.set_region_epoch(region_state.get_region().get_region_epoch().clone()); + + (msg, snap_key) +} + +#[test] +fn test_v1_receive_snap_from_v2() { + let test_receive_snap = |key_num| { + let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); + let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); + + cluster_v1 + .cfg + .server + .labels + .insert(String::from("engine"), String::from("tiflash")); + + cluster_v1.run(); + cluster_v2.run(); + + let s1_addr = cluster_v1.get_addr(1); + let region = cluster_v2.get_region(b""); + let region_id = region.get_id(); + let engine = cluster_v2.get_engine(1); + let tablet = engine.get_tablet_by_id(region_id).unwrap(); + + for i in 0..key_num { + let k = format!("zk{:04}", i); + tablet.put(k.as_bytes(), &random_long_vec(1024)).unwrap(); + } + + let snap_mgr = cluster_v2.get_snap_mgr(1); + let security_mgr = cluster_v2.get_security_mgr(); + let (msg, snap_key) = generate_snap(&engine, region_id, &snap_mgr); + let cfg = tikv::server::Config::default(); + let limit = Limiter::new(f64::INFINITY); + let env = Arc::new(Environment::new(1)); + let _ = block_on(async { + send_snap_v2(env, snap_mgr, security_mgr, &cfg, &s1_addr, msg, limit) + .unwrap() + .await + }); + + // The snapshot has been received by cluster v1, so check it's completeness + let snap_mgr = cluster_v1.get_snap_mgr(1); + let path = snap_mgr.tablet_snap_manager().final_recv_path(&snap_key); + let rocksdb = engine_rocks::util::new_engine_opt( + path.as_path().to_str().unwrap(), + RocksDbOptions::default(), + LARGE_CFS + .iter() + .map(|&cf| (cf, 
RocksCfOptions::default())) + .collect(), + ) + .unwrap(); + + for i in 0..key_num { + let k = format!("zk{:04}", i); + assert!( + rocksdb + .get_value_cf("default", k.as_bytes()) + .unwrap() + .is_some() + ); + } + }; + + // test small snapshot + test_receive_snap(20); + + // test large snapshot + test_receive_snap(5000); +} From 90477057e5b396de1593e8fb6d738469d9f987b4 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 18 Apr 2023 10:13:18 +0800 Subject: [PATCH 635/676] raftstore: fix snap manager init (#14591) ref tikv/tikv#14579 fix snap manager init Signed-off-by: Spade A --- components/raftstore/src/store/snap.rs | 51 +++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index bdf96126dd2..091609cf63e 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1408,6 +1408,12 @@ impl SnapManager { } pub fn init(&self) -> io::Result<()> { + self.init_core()?; + self.tablet_snap_manager.init()?; + Ok(()) + } + + fn init_core(&self) -> io::Result<()> { let enc_enabled = self.core.encryption_key_manager.is_some(); info!( "Initializing SnapManager, encryption is enabled: {}", @@ -1438,7 +1444,6 @@ impl SnapManager { } } - self.tablet_snap_manager.init()?; Ok(()) } @@ -2163,7 +2168,7 @@ impl TabletSnapManager { #[cfg(test)] pub mod tests { use std::{ - cmp, + cmp, fs, io::{self, Read, Seek, SeekFrom, Write}, path::{Path, PathBuf}, sync::{ @@ -3197,4 +3202,46 @@ pub mod tests { assert!(snap_mgr.delete_snapshot(&key, &s1, false)); } } + + #[test] + fn test_init() { + let builder = SnapManagerBuilder::default(); + let snap_dir = Builder::new() + .prefix("test_snap_path_does_not_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path().join("snap"); + let snap_mgr = builder.build(path.as_path().to_str().unwrap()); + snap_mgr.init().unwrap(); + + 
assert!(path.exists()); + let mut path = path.as_path().to_str().unwrap().to_string(); + path.push_str("_v2"); + assert!(Path::new(&path).exists()); + + let builder = SnapManagerBuilder::default(); + let snap_dir = Builder::new() + .prefix("test_snap_path_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path(); + let snap_mgr = builder.build(path.to_str().unwrap()); + snap_mgr.init().unwrap(); + + let mut path = path.to_str().unwrap().to_string(); + path.push_str("_v2"); + assert!(Path::new(&path).exists()); + + let builder = SnapManagerBuilder::default(); + let snap_dir = Builder::new() + .prefix("test_tablet_snap_path_exist") + .tempdir() + .unwrap(); + let path = snap_dir.path().join("snap/v2"); + fs::create_dir_all(path).unwrap(); + let path = snap_dir.path().join("snap"); + let snap_mgr = builder.build(path.to_str().unwrap()); + snap_mgr.init().unwrap(); + assert!(path.exists()); + } } From dee46499a6f288dc8222817fc1755aa4d667cefa Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 18 Apr 2023 13:51:19 +0800 Subject: [PATCH 636/676] *: support observe apply in raftstore v2 (#14562) ref tikv/tikv#14542 *: support observe apply in raftstore v2 Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 6 +- components/cdc/tests/mod.rs | 1 + components/raftstore-v2/src/fsm/apply.rs | 7 ++- .../src/operation/command/admin/split.rs | 7 ++- .../raftstore-v2/src/operation/command/mod.rs | 35 ++++++++--- .../src/operation/query/capture.rs | 61 +++++++++++++++---- components/raftstore-v2/src/raft/apply.rs | 33 ++++++++-- components/raftstore-v2/src/raft/storage.rs | 15 +++-- components/raftstore/src/coprocessor/mod.rs | 10 ++- .../raftstore/src/store/region_snapshot.rs | 5 ++ .../raftstore/src/store/simple_write.rs | 55 ++++++++++++++++- components/server/src/server.rs | 2 + components/server/src/server2.rs | 42 +++++++------ src/config/mod.rs | 12 +++- 14 files changed, 234 insertions(+), 57 deletions(-) diff --git 
a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 68650130211..dfeb4f78045 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -333,6 +333,7 @@ pub struct Endpoint { /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, + raftstore_v2: bool, config: CdcConfig, api_version: ApiVersion, @@ -362,6 +363,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, scheduler: Scheduler, @@ -430,6 +432,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint, E: KvEngine, S: StoreRegionMeta> Endpoint ?e); return; } - if let Err(e) = validate_cfg.validate() { + if let Err(e) = validate_cfg.validate(self.raftstore_v2) { warn!("cdc config update failed"; "error" => ?e); return; } @@ -1375,6 +1378,7 @@ mod tests { let ep = Endpoint::new( DEFAULT_CLUSTER_ID, cfg, + false, api_version, pd_client, task_sched.clone(), diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index 89ed4e6dbb1..f2663c79287 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -182,6 +182,7 @@ impl TestSuiteBuilder { let mut cdc_endpoint = cdc::Endpoint::new( DEFAULT_CLUSTER_ID, &cfg, + false, cluster.cfg.storage.api_version(), pd_cli.clone(), worker.scheduler(), diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 6c0989e72ae..08d7f7946ec 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -11,7 +11,10 @@ use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; -use raftstore::store::{Config, ReadTask}; +use raftstore::{ + coprocessor::CoprocessorHost, + store::{Config, ReadTask}, +}; use slog::Logger; use sst_importer::SstImporter; use tikv_util::{ @@ -79,6 +82,7 @@ impl ApplyFsm { applied_term: u64, buckets: Option, sst_importer: Arc, + 
coprocessor_host: CoprocessorHost, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); @@ -94,6 +98,7 @@ impl ApplyFsm { applied_term, buckets, sst_importer, + coprocessor_host, logger, ); ( diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index e1577830d25..0b53476273f 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -878,7 +878,10 @@ mod test { raft_cmdpb::{BatchSplitRequest, SplitRequest}, raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::{cmd_resp::new_error, Config}; + use raftstore::{ + coprocessor::CoprocessorHost, + store::{cmd_resp::new_error, Config}, + }; use slog::o; use tempfile::TempDir; use tikv_util::{ @@ -1026,6 +1029,7 @@ mod test { let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let (_tmp_dir, importer) = create_tmp_importer(); + let host = CoprocessorHost::::default(); let mut apply = Apply::new( &Config::default(), region @@ -1043,6 +1047,7 @@ mod test { 5, None, importer, + host, logger.clone(), ); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index b45ad23a1b1..af31dc5a397 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -29,6 +29,7 @@ use kvproto::raft_cmdpb::{ use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; use raft_proto::ConfChangeI; use raftstore::{ + coprocessor::ObserveLevel, store::{ cmd_resp, fsm::{ @@ -75,7 +76,12 @@ pub type SimpleWriteReqEncoder = use self::write::SimpleWrite; -fn parse_at(logger: &slog::Logger, buf: &[u8], index: u64, term: u64) -> M { +pub(crate) fn parse_at( + logger: &slog::Logger, + buf: &[u8], + index: u64, + term: u64, +) -> M { let mut m = 
M::default(); match m.merge_from_bytes(buf) { Ok(()) => m, @@ -141,6 +147,7 @@ impl Peer { self.entry_storage().applied_term(), buckets, store_ctx.sst_importer.clone(), + store_ctx.coprocessor_host.clone(), logger, ); @@ -533,8 +540,8 @@ impl Apply { wb.set_save_point(); set_save_point = true; } - let resp = match self.apply_entry(&e).await { - Ok(resp) => resp, + let (req, resp) = match self.apply_entry(&e).await { + Ok(req_resp) => req_resp, Err(e) => { if let Some(wb) = &mut self.write_batch { if set_save_point { @@ -543,9 +550,10 @@ impl Apply { wb.clear(); } } - cmd_resp::new_error(e) + (RaftCmdRequest::default(), cmd_resp::new_error(e)) } }; + self.observe_apply(e.get_index(), e.get_term(), req, &resp); self.callbacks_mut().push((ch, resp)); } else { assert!(ch.is_empty()); @@ -557,7 +565,7 @@ impl Apply { } #[inline] - async fn apply_entry(&mut self, entry: &Entry) -> Result { + async fn apply_entry(&mut self, entry: &Entry) -> Result<(RaftCmdRequest, RaftCmdResponse)> { let mut conf_change = None; let log_index = entry.get_index(); let req = match entry.get_entry_type() { @@ -576,7 +584,11 @@ impl Apply { true, true, )?; - let res = Ok(new_response(decoder.header())); + let mut req = RaftCmdRequest::default(); + if self.observe().level != ObserveLevel::None { + req = decoder.to_raft_cmd_request(); + } + let resp = new_response(decoder.header()); for req in decoder { match req { SimpleWrite::Put(put) => { @@ -599,7 +611,7 @@ impl Apply { } } } - return res; + return Ok((req, resp)); } Err(req) => req, }, @@ -657,7 +669,7 @@ impl Apply { } let mut resp = new_response(req.get_header()); resp.set_admin_response(admin_resp); - Ok(resp) + Ok((req, resp)) } else { for r in req.get_requests() { match r.get_cmd_type() { @@ -684,7 +696,8 @@ impl Apply { _ => unimplemented!(), } } - Ok(new_response(req.get_header())) + let resp = new_response(req.get_header()); + Ok((req, resp)) } } @@ -772,6 +785,10 @@ impl Apply { buckets.clear_stats(); } + // Call it before 
invoking callback for preventing Commit is executed before + // Prewrite is observed. + self.flush_observed_apply(); + // Report result first and then invoking callbacks. This may delays callback a // little bit, but can make sure all following messages must see the side // effect of admin commands. diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 03014644261..94b58f41809 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -1,12 +1,12 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::Arc; +use std::{mem, sync::Arc}; use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; -use kvproto::raft_cmdpb::RaftCmdResponse; +use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use raftstore::{ - coprocessor::ObserveHandle, + coprocessor::{Cmd, CmdBatch, ObserveHandle, ObserveLevel}, store::{ cmd_resp, fsm::{ @@ -18,6 +18,7 @@ use raftstore::{ RegionSnapshot, }, }; +use slog::info; use crate::{ fsm::{ApplyResReporter, PeerFsmDelegate}, @@ -67,9 +68,9 @@ impl Apply { let ChangeObserver { region_id, ty } = observer; let is_stale_cmd = match ty { - ObserverType::Cdc(ObserveHandle { id, .. }) => self.observe_info_mut().cdc_id.id > id, - ObserverType::Rts(ObserveHandle { id, .. }) => self.observe_info_mut().rts_id.id > id, - ObserverType::Pitr(ObserveHandle { id, .. }) => self.observe_info_mut().pitr_id.id > id, + ObserverType::Cdc(ObserveHandle { id, .. }) => self.observe().info.cdc_id.id > id, + ObserverType::Rts(ObserveHandle { id, .. }) => self.observe().info.rts_id.id > id, + ObserverType::Pitr(ObserveHandle { id, .. 
}) => self.observe().info.pitr_id.id > id, }; if is_stale_cmd { notify_stale_req_with_msg( @@ -77,7 +78,7 @@ impl Apply { format!( "stale observe id {:?}, current id: {:?}", ty.handle().id, - self.observe_info_mut().pitr_id.id + self.observe().info, ), snap_cb, ); @@ -96,10 +97,13 @@ impl Apply { // Commit the writebatch for ensuring the following snapshot can get all // previous writes. self.flush(); - RegionSnapshot::from_snapshot( + let (applied_index, _) = self.apply_progress(); + let snap = RegionSnapshot::from_snapshot( Arc::new(self.tablet().snapshot()), Arc::new(self.region().clone()), - ) + ); + snap.set_apply_index(applied_index); + snap } Err(e) => { // Return error if epoch not match @@ -108,17 +112,50 @@ impl Apply { } }; + let observe = self.observe_mut(); match ty { ObserverType::Cdc(id) => { - self.observe_info_mut().cdc_id = id; + observe.info.cdc_id = id; } ObserverType::Rts(id) => { - self.observe_info_mut().rts_id = id; + observe.info.rts_id = id; } ObserverType::Pitr(id) => { - self.observe_info_mut().pitr_id = id; + observe.info.pitr_id = id; } } + let level = observe.info.observe_level(); + observe.level = level; + info!(self.logger, "capture update observe level"; "level" => ?level); snap_cb.set_result((RaftCmdResponse::default(), Some(Box::new(snapshot)))); } + + pub fn observe_apply( + &mut self, + index: u64, + term: u64, + req: RaftCmdRequest, + resp: &RaftCmdResponse, + ) { + if self.observe().level == ObserveLevel::None { + return; + } + + let cmd = Cmd::new(index, term, req, resp.clone()); + self.observe_mut().cmds.push(cmd); + } + + pub fn flush_observed_apply(&mut self) { + let level = self.observe().level; + if level == ObserveLevel::None { + return; + } + + let region_id = self.region_id(); + let mut cmd_batch = CmdBatch::new(&self.observe().info, region_id); + let cmds = mem::take(&mut self.observe_mut().cmds); + cmd_batch.extend(&self.observe().info, region_id, cmds); + self.coprocessor_host() + 
.on_flush_applied_cmd_batch(level, vec![cmd_batch], self.tablet()); + } } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index d5ecb8c3026..d32b8bdbb80 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -8,7 +8,7 @@ use engine_traits::{ use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raftstore::{ - coprocessor::CmdObserveInfo, + coprocessor::{Cmd, CmdObserveInfo, CoprocessorHost, ObserveLevel}, store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, Config, ReadTask, @@ -23,6 +23,12 @@ use crate::{ router::CmdResChannel, }; +pub(crate) struct Observe { + pub info: CmdObserveInfo, + pub level: ObserveLevel, + pub cmds: Vec, +} + /// Apply applies all the committed commands to kv db. pub struct Apply { peer: metapb::Peer, @@ -62,7 +68,8 @@ pub struct Apply { res_reporter: R, read_scheduler: Scheduler>, sst_importer: Arc, - observe_info: CmdObserveInfo, + observe: Observe, + coprocessor_host: CoprocessorHost, pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, @@ -83,6 +90,7 @@ impl Apply { applied_term: u64, buckets: Option, sst_importer: Arc, + coprocessor_host: CoprocessorHost, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry @@ -115,7 +123,12 @@ impl Apply { metrics: ApplyMetrics::default(), buckets, sst_importer, - observe_info: CmdObserveInfo::default(), + observe: Observe { + info: CmdObserveInfo::default(), + level: ObserveLevel::None, + cmds: vec![], + }, + coprocessor_host, logger, } } @@ -277,12 +290,22 @@ impl Apply { } #[inline] - pub fn observe_info_mut(&mut self) -> &mut CmdObserveInfo { - &mut self.observe_info + pub(crate) fn observe(&mut self) -> &Observe { + &self.observe + } + + #[inline] + pub(crate) fn observe_mut(&mut self) -> &mut Observe { + &mut self.observe } #[inline] pub fn term(&self) -> u64 { self.applied_term } + + #[inline] + pub fn 
coprocessor_host(&self) -> &CoprocessorHost { + &self.coprocessor_host + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index cff915fd248..ee9be348c89 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -318,7 +318,7 @@ mod tests { use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactory, + kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, @@ -328,10 +328,13 @@ mod tests { raft_serverpb::PeerState, }; use raft::{Error as RaftError, StorageError}; - use raftstore::store::{ - util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, - GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, - RAFT_INIT_LOG_TERM, + use raftstore::{ + coprocessor::CoprocessorHost, + store::{ + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, + GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + }, }; use slog::o; use tempfile::TempDir; @@ -500,6 +503,7 @@ mod tests { let mut state = RegionLocalState::default(); state.set_region(region.clone()); let (_tmp_dir, importer) = create_tmp_importer(); + let host = CoprocessorHost::::default(); // setup peer applyer let mut apply = Apply::new( &Config::default(), @@ -513,6 +517,7 @@ mod tests { 5, None, importer, + host, logger, ); diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 82b6dce17ee..7dc5142e734 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -438,7 +438,7 @@ impl CmdObserveInfo { /// PiTR: Observer supports the `backup-log` function. /// RTS: Observer supports the `resolved-ts` advancing (and follower read, /// etc.). 
- fn observe_level(&self) -> ObserveLevel { + pub fn observe_level(&self) -> ObserveLevel { let cdc = if self.cdc_id.is_observing() { // `cdc` observe all data ObserveLevel::All @@ -512,6 +512,14 @@ impl CmdBatch { self.cmds.push(cmd) } + pub fn extend(&mut self, observe_info: &CmdObserveInfo, region_id: u64, cmds: Vec) { + assert_eq!(region_id, self.region_id); + assert_eq!(observe_info.cdc_id.id, self.cdc_id); + assert_eq!(observe_info.rts_id.id, self.rts_id); + assert_eq!(observe_info.pitr_id.id, self.pitr_id); + self.cmds.extend(cmds) + } + pub fn into_iter(self, region_id: u64) -> IntoIter { assert_eq!(region_id, self.region_id); self.cmds.into_iter() diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index ccf5f94e39e..4073b71c60d 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -85,6 +85,11 @@ where self.snap.as_ref() } + #[inline] + pub fn set_apply_index(&self, apply_index: u64) { + self.apply_index.store(apply_index, Ordering::SeqCst); + } + #[inline] pub fn get_apply_index(&self) -> Result { let apply_index = self.apply_index.load(Ordering::SeqCst); diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index cdae8f18c97..57056f984bd 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -5,7 +5,7 @@ use std::assert_matches::debug_assert_matches; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::{ import_sstpb::SstMeta, - raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}, }; use protobuf::{CodedInputStream, Message}; use slog::Logger; @@ -277,6 +277,59 @@ impl<'a> SimpleWriteReqDecoder<'a> { pub fn header(&self) -> &RaftRequestHeader { &self.header } + + pub fn to_raft_cmd_request(&self) -> RaftCmdRequest { + let mut req = 
RaftCmdRequest::default(); + req.set_header(self.header().clone()); + let decoder = Self { + header: Default::default(), + buf: self.buf, + }; + for s in decoder { + match s { + SimpleWrite::Put(Put { cf, key, value }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + request.mut_put().set_cf(cf.to_owned()); + request.mut_put().set_key(key.to_owned()); + request.mut_put().set_value(value.to_owned()); + req.mut_requests().push(request); + } + SimpleWrite::Delete(Delete { cf, key }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::Delete); + request.mut_delete().set_cf(cf.to_owned()); + request.mut_delete().set_key(key.to_owned()); + req.mut_requests().push(request); + } + SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only, + }) => { + let mut request = Request::default(); + request.set_cmd_type(CmdType::DeleteRange); + request.mut_delete_range().set_cf(cf.to_owned()); + request + .mut_delete_range() + .set_start_key(start_key.to_owned()); + request.mut_delete_range().set_end_key(end_key.to_owned()); + request.mut_delete_range().set_notify_only(notify_only); + req.mut_requests().push(request); + } + SimpleWrite::Ingest(ssts) => { + for sst in ssts { + let mut request = Request::default(); + request.set_cmd_type(CmdType::IngestSst); + request.mut_ingest_sst().set_sst(sst); + req.mut_requests().push(request); + } + } + } + } + req + } } impl<'a> Iterator for SimpleWriteReqDecoder<'a> { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3243b207aca..d1c8e09ef96 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -95,6 +95,7 @@ use tikv::{ }, storage::{ self, + config::EngineType, config_manager::StorageConfigManger, kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, @@ -933,6 +934,7 @@ where let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, + 
self.core.config.storage.engine == EngineType::RaftKv2, self.core.config.storage.api_version(), self.pd_client.clone(), cdc_scheduler.clone(), diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 86d3a9a696f..83bcc2a55fe 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -74,6 +74,7 @@ use tikv::{ }, storage::{ self, + config::EngineType, config_manager::StorageConfigManger, kv::LocalTablets, mvcc::MvccConsistencyCheckObserver, @@ -187,6 +188,7 @@ struct TikvServer { coprocessor_host: Option>, concurrency_manager: ConcurrencyManager, env: Arc, + cdc_worker: Option>>, cdc_scheduler: Option>, cdc_memory_quota: Option, sst_worker: Option>>, @@ -323,6 +325,7 @@ where coprocessor_host: None, concurrency_manager, env, + cdc_worker: None, cdc_scheduler: None, cdc_memory_quota: None, sst_worker: None, @@ -563,28 +566,25 @@ where self.core.to_stop.push(check_leader_worker); // Create cdc worker. - let cdc_worker = Box::new(LazyWorker::new("cdc")); - let cdc_scheduler = cdc_worker.scheduler(); - let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); - engines - .engine - .set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); + let mut cdc_worker = self.cdc_worker.take().unwrap(); + let cdc_scheduler = self.cdc_scheduler.clone().unwrap(); // Register cdc observer. let cdc_ob = cdc::CdcObserver::new(cdc_scheduler.clone()); cdc_ob.register_to(self.coprocessor_host.as_mut().unwrap()); // Register cdc config manager. cfg_controller.register( tikv::config::Module::Cdc, - Box::new(CdcConfigManager(cdc_worker.scheduler())), + Box::new(CdcConfigManager(cdc_scheduler.clone())), ); // Start cdc endpoint. 
let cdc_memory_quota = MemoryQuota::new(self.core.config.cdc.sink_memory_quota.0 as _); - let _cdc_endpoint = cdc::Endpoint::new( + let cdc_endpoint = cdc::Endpoint::new( self.core.config.server.cluster_id, &self.core.config.cdc, + self.core.config.storage.engine == EngineType::RaftKv2, self.core.config.storage.api_version(), self.pd_client.clone(), - cdc_scheduler.clone(), + cdc_scheduler, self.router.clone().unwrap(), LocalTablets::Registry(self.tablet_registry.as_ref().unwrap().clone()), cdc_ob, @@ -595,15 +595,13 @@ where cdc_memory_quota.clone(), self.causal_ts_provider.clone(), ); - // TODO: enable cdc. - // cdc_worker.start_with_timer(cdc_endpoint); - // self.core.to_stop.push(cdc_worker); - self.cdc_scheduler = Some(cdc_scheduler); + cdc_worker.start_with_timer(cdc_endpoint); + self.core.to_stop.push(cdc_worker); self.cdc_memory_quota = Some(cdc_memory_quota); // Create resolved ts. if self.core.config.resolved_ts.enable { - let rts_worker = Box::new(LazyWorker::new("resolved-ts")); + let mut rts_worker = Box::new(LazyWorker::new("resolved-ts")); // Register the resolved ts observer let resolved_ts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); resolved_ts_ob.register_to(self.coprocessor_host.as_mut().unwrap()); @@ -614,7 +612,7 @@ where rts_worker.scheduler(), )), ); - let _rts_endpoint = resolved_ts::Endpoint::new( + let rts_endpoint = resolved_ts::Endpoint::new( &self.core.config.resolved_ts, rts_worker.scheduler(), self.router.clone().unwrap(), @@ -624,9 +622,8 @@ where self.env.clone(), self.security_mgr.clone(), ); - // TODO: enable resolved_ts. 
- // rts_worker.start_with_timer(rts_endpoint); - // self.core.to_stop.push(rts_worker); + rts_worker.start_with_timer(rts_endpoint); + self.core.to_stop.push(rts_worker); } let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); @@ -1234,7 +1231,12 @@ impl TikvServer { ); let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); - let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + let cdc_worker = Box::new(LazyWorker::new("cdc")); + let cdc_scheduler = cdc_worker.scheduler(); + let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); + + let mut engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + engine.set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); self.engines = Some(TikvEngines { raft_engine, @@ -1244,6 +1246,8 @@ impl TikvServer { self.node = Some(node); self.coprocessor_host = Some(coprocessor_host); self.region_info_accessor = Some(region_info_accessor); + self.cdc_worker = Some(cdc_worker); + self.cdc_scheduler = Some(cdc_scheduler); engines_info } diff --git a/src/config/mod.rs b/src/config/mod.rs index f8bbd1be9f5..2e81de6d829 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2789,7 +2789,7 @@ impl Default for CdcConfig { } impl CdcConfig { - pub fn validate(&mut self) -> Result<(), Box> { + pub fn validate(&mut self, raftstore_v2: bool) -> Result<(), Box> { let default_cfg = CdcConfig::default(); if self.min_ts_interval.is_zero() { warn!( @@ -2823,6 +2823,13 @@ impl CdcConfig { ); self.incremental_scan_ts_filter_ratio = default_cfg.incremental_scan_ts_filter_ratio; } + if raftstore_v2 && self.hibernate_regions_compatible { + warn!( + "cdc.hibernate_regions_compatible is overwritten to false for partitioned-raft-kv" + ); + self.hibernate_regions_compatible = false; + } + Ok(()) } } @@ -3360,7 +3367,8 @@ impl TikvConfig { self.import.validate()?; self.backup.validate()?; self.log_backup.validate()?; - 
self.cdc.validate()?; + self.cdc + .validate(self.storage.engine == EngineType::RaftKv2)?; self.pessimistic_txn.validate()?; self.gc.validate()?; self.resolved_ts.validate()?; From e024556874a44a281cd9a50d454e905e03bd0eef Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Tue, 18 Apr 2023 14:51:18 +0800 Subject: [PATCH 637/676] raftstore-v2: schedule raft tick after apply snapshot (#14550) ref tikv/tikv#14532, close tikv/tikv#14548 schedule raft tick after apply snapshot Signed-off-by: Spade A --- .../raftstore-v2/src/operation/command/mod.rs | 16 ++------- .../src/operation/ready/apply_trace.rs | 8 +++++ .../raftstore-v2/src/operation/ready/mod.rs | 1 + components/raftstore-v2/src/raft/peer.rs | 27 +++++++++++++-- tests/failpoints/cases/test_snap.rs | 33 +++++++++++++++++++ 5 files changed, 70 insertions(+), 15 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index af31dc5a397..ce4a415cf00 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -43,7 +43,7 @@ use raftstore::{ }, Error, Result, }; -use slog::{debug, error, info, warn}; +use slog::{debug, error, warn}; use tikv_util::{ box_err, log::SlogFormat, @@ -55,7 +55,7 @@ use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel, PeerTick}, + router::{ApplyRes, ApplyTask, CmdResChannel}, }; mod admin; @@ -401,17 +401,7 @@ impl Peer { apply_res.applied_index, progress_to_be_updated, ); - if self.pause_for_recovery() - && self.storage().entry_storage().commit_index() <= apply_res.applied_index - { - info!(self.logger, "recovery completed"; "apply_index" => apply_res.applied_index); - self.set_pause_for_recovery(false); - // Flush to avoid recover again and again. 
- if let Some(scheduler) = self.apply_scheduler() { - scheduler.send(ApplyTask::ManualFlush); - } - self.add_pending_tick(PeerTick::Raft); - } + self.try_compelete_recovery(); if !self.pause_for_recovery() && self.storage_mut().apply_trace_mut().should_flush() { if let Some(scheduler) = self.apply_scheduler() { scheduler.send(ApplyTask::ManualFlush); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 71e282728f7..90b7930c368 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -33,6 +33,7 @@ use engine_traits::{ data_cf_offset, ApplyProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, }; +use fail::fail_point; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, @@ -404,6 +405,13 @@ impl Storage { } }; apply_state.set_applied_index(applied_index); + let mut reset_apply_index = || { + // Make node reply from start. 
+ fail_point!("RESET_APPLY_INDEX_WHEN_RESTART", |_| { + apply_state.set_applied_index(5); + }); + }; + reset_apply_index(); Self::create( store_id, diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 68da61cf45e..9419549e580 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -595,6 +595,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); + self.try_compelete_recovery(); self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index b93fc0f5047..6b5898b6297 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -25,7 +25,7 @@ use raftstore::{ TabletSnapManager, WriteTask, }, }; -use slog::Logger; +use slog::{info, Logger}; use tikv_util::slog_panic; use super::storage::Storage; @@ -35,7 +35,7 @@ use crate::{ AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, GcPeerContext, MergeContext, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, TxnContext, }, - router::{CmdResChannel, PeerTick, QueryResChannel}, + router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, Result, }; @@ -464,6 +464,29 @@ impl Peer { self.pause_for_recovery } + #[inline] + // we may have skipped scheduling raft tick when start due to noticable gap + // between commit index and apply index. We should scheduling it when raft log + // apply catches up. 
+ pub fn try_compelete_recovery(&mut self) { + if self.pause_for_recovery() + && self.storage().entry_storage().commit_index() + <= self.storage().entry_storage().applied_index() + { + info!( + self.logger, + "recovery completed"; + "apply_index" => self.storage().entry_storage().applied_index() + ); + self.set_pause_for_recovery(false); + // Flush to avoid recover again and again. + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + self.add_pending_tick(PeerTick::Raft); + } + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 64b03f6d0b3..4ca18dcd716 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -900,3 +900,36 @@ fn test_snapshot_recover_from_raft_write_failure_with_uncommitted_log() { cluster.must_put(format!("k1{}", i).as_bytes(), b"v1"); } } + +#[test] +fn test_snapshot_complete_recover_raft_tick() { + // https://github.com/tikv/tikv/issues/14548 gives the description of what the following tests. 
+ let mut cluster = test_raftstore_v2::new_node_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(50); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10); + + cluster.run(); + + let region = cluster.get_region(b"k"); + cluster.must_transfer_leader(region.get_id(), new_peer(1, 1)); + for i in 0..200 { + let k = format!("k{:04}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + cluster.stop_node(2); + for i in 200..300 { + let k = format!("k{:04}", i); + cluster.must_put(k.as_bytes(), b"val"); + } + + fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); + fail::cfg("RESET_APPLY_INDEX_WHEN_RESTART", "return").unwrap(); + cluster.run_node(2).unwrap(); + std::thread::sleep(Duration::from_millis(100)); + fail::remove("APPLY_COMMITTED_ENTRIES"); + cluster.stop_node(1); + + cluster.must_put(b"k0500", b"val"); + assert_eq!(cluster.must_get(b"k0500").unwrap(), b"val".to_vec()); +} From 8dd75f8693a99e33681a13939ac7e11d5ccff29f Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 18 Apr 2023 00:35:19 -0700 Subject: [PATCH 638/676] update storage.engine config to actual engine type used in existing cluster (#14541) ref tikv/tikv#12842 update storage.engine config to actual engine type used in existing cluster Signed-off-by: tonyxuqqi --- src/config/mod.rs | 5 ++- src/storage/config.rs | 74 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index 2e81de6d829..dcbfdc4e441 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -89,6 +89,7 @@ use crate::{ }; pub const DEFAULT_ROCKSDB_SUB_DIR: &str = "db"; +pub const DEFAULT_TABLET_SUB_DIR: &str = "tablets"; /// By default, block cache size will be set to 45% of system memory. 
pub const BLOCK_CACHE_RATE: f64 = 0.45; @@ -3291,7 +3292,9 @@ impl TikvConfig { let kv_data_exists = if self.storage.engine == EngineType::RaftKv { RocksEngine::exists(&kv_db_path) } else { - Path::new(&self.storage.data_dir).join("tablets").exists() + Path::new(&self.storage.data_dir) + .join(DEFAULT_TABLET_SUB_DIR) + .exists() }; RaftDataStateMachine::new( diff --git a/src/storage/config.rs b/src/storage/config.rs index f65ed15cece..d301849528d 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -2,7 +2,7 @@ //! Storage configuration. -use std::{cmp::max, error::Error}; +use std::{cmp::max, error::Error, path::Path}; use engine_rocks::raw::{Cache, LRUCacheOptions, MemoryAllocator}; use file_system::{IoPriority, IoRateLimitMode, IoRateLimiter, IoType}; @@ -14,7 +14,10 @@ use tikv_util::{ sys::SysQuota, }; -use crate::config::{BLOCK_CACHE_RATE, MIN_BLOCK_CACHE_SHARD_SIZE, RAFTSTORE_V2_BLOCK_CACHE_RATE}; +use crate::config::{ + BLOCK_CACHE_RATE, DEFAULT_ROCKSDB_SUB_DIR, DEFAULT_TABLET_SUB_DIR, MIN_BLOCK_CACHE_SHARD_SIZE, + RAFTSTORE_V2_BLOCK_CACHE_RATE, +}; pub const DEFAULT_DATA_DIR: &str = "./"; const DEFAULT_GC_RATIO_THRESHOLD: f64 = 1.1; @@ -110,10 +113,43 @@ impl Default for Config { } impl Config { + fn validate_engine_type(&mut self) -> Result<(), Box> { + let v1_kv_db_path = + config::canonicalize_sub_path(&self.data_dir, DEFAULT_ROCKSDB_SUB_DIR).unwrap(); + let v2_tablet_path = + config::canonicalize_sub_path(&self.data_dir, DEFAULT_TABLET_SUB_DIR).unwrap(); + + let kv_data_exists = Path::new(&v1_kv_db_path).exists(); + let v2_tablet_exists = Path::new(&v2_tablet_path).exists(); + if kv_data_exists && v2_tablet_exists { + return Err("Both raft-kv and partitioned-raft-kv's data folders exist".into()); + } + + // v1's data exists, but the engine type is v2 + if kv_data_exists && self.engine == EngineType::RaftKv2 { + info!( + "TiKV has data for raft-kv engine but the engine type in config is partitioned-raft-kv. 
Ignore the config and keep raft-kv instead" + ); + self.engine = EngineType::RaftKv; + } + + // if v2's data exists, but the engine type is v1 + if v2_tablet_exists && self.engine == EngineType::RaftKv { + info!( + "TiKV has data for partitioned-raft-kv engine but the engine type in config is raft-kv. Ignore the config and keep partitioned-raft-kv instead" + ); + self.engine = EngineType::RaftKv2; + } + Ok(()) + } + pub fn validate(&mut self) -> Result<(), Box> { if self.data_dir != DEFAULT_DATA_DIR { self.data_dir = config::canonicalize_path(&self.data_dir)? } + + self.validate_engine_type()?; + if self.scheduler_concurrency > MAX_SCHED_CONCURRENCY { warn!( "TiKV has optimized latch since v4.0, so it is not necessary to set large schedule \ @@ -393,6 +429,8 @@ impl IoRateLimitConfig { #[cfg(test)] mod tests { + use std::fs; + use super::*; #[test] @@ -411,6 +449,38 @@ mod tests { cfg.validate().unwrap_err(); } + #[test] + fn test_validate_engine_type_config() { + let mut cfg = Config::default(); + cfg.engine = EngineType::RaftKv; + cfg.validate().unwrap(); + assert_eq!(cfg.engine, EngineType::RaftKv); + + cfg.engine = EngineType::RaftKv2; + cfg.validate().unwrap(); + assert_eq!(cfg.engine, EngineType::RaftKv2); + + let v1_kv_db_path = + config::canonicalize_sub_path(&cfg.data_dir, DEFAULT_ROCKSDB_SUB_DIR).unwrap(); + fs::create_dir_all(&v1_kv_db_path).unwrap(); + cfg.validate().unwrap(); + assert_eq!(cfg.engine, EngineType::RaftKv); + fs::remove_dir_all(&v1_kv_db_path).unwrap(); + + let v2_tablet_path = + config::canonicalize_sub_path(&cfg.data_dir, DEFAULT_TABLET_SUB_DIR).unwrap(); + fs::create_dir_all(&v2_tablet_path).unwrap(); + cfg.engine = EngineType::RaftKv; + cfg.validate().unwrap(); + assert_eq!(cfg.engine, EngineType::RaftKv2); + + // both v1 and v2 data exists, throw error + fs::create_dir_all(&v1_kv_db_path).unwrap(); + cfg.validate().unwrap_err(); + fs::remove_dir_all(&v1_kv_db_path).unwrap(); + fs::remove_dir_all(&v2_tablet_path).unwrap(); + } + 
#[test] fn test_adjust_shard_bits() { let config = BlockCacheConfig::default(); From 0c113a6370b205f13241890903dd166a839b7347 Mon Sep 17 00:00:00 2001 From: ekexium Date: Tue, 18 Apr 2023 15:59:20 +0800 Subject: [PATCH 639/676] txn: round up last_update_duration_ms (#14571) ref tikv/tikv#14497 Round up last_update_duration_ms, so that duration in (0, 1ms] won't be treated as 0. Co-authored-by: Ti Chi Robot --- src/server/lock_manager/waiter_manager.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 5f433571431..c0e97e25e3a 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -270,10 +270,10 @@ impl Waiter { let mut lock_info = self.wait_info.lock_info.clone(); lock_info.set_duration_to_last_update_ms( self.last_updated_time - .map(|t| t.elapsed().as_millis() as u64) + // round up, so that duration in (0, 1ms] won't be treated as 0. 
+ .map(|t| (t.elapsed().as_millis() as u64).max(1)) .unwrap_or_default(), ); - // lock_info.set_skip_resolving_lock(skip_resolving_lock); let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); self.cancel(Some(StorageError::from(TxnError::from(error)))) } From 9fc86635ba57a79ea1ae25c393d847027e15824b Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 18 Apr 2023 16:35:20 +0800 Subject: [PATCH 640/676] raftstore-v2: check peer trim status before prepare merge (#14374) ref tikv/tikv#12842 None Signed-off-by: tabokie --- .../operation/command/admin/merge/prepare.rs | 169 ++++++++++++++++-- components/raftstore-v2/src/operation/life.rs | 33 +++- .../raftstore-v2/src/operation/ready/mod.rs | 19 ++ components/raftstore-v2/src/raft/peer.rs | 1 + .../tests/failpoints/test_merge.rs | 6 +- .../tests/integrations/cluster.rs | 41 +++-- .../tests/integrations/test_merge.rs | 17 +- components/raftstore/src/store/fsm/peer.rs | 5 +- tests/integrations/raftstore/test_merge.rs | 8 +- 9 files changed, 249 insertions(+), 50 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs index 16a8382cfad..601b4568866 100644 --- a/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs +++ b/components/raftstore-v2/src/operation/command/admin/merge/prepare.rs @@ -27,15 +27,19 @@ //! Start the tick (`Peer::on_check_merge`) to periodically check the //! eligibility of merge. 
-use std::mem; +use std::{mem, time::Duration}; +use collections::HashMap; use engine_traits::{Checkpointer, KvEngine, RaftEngine, RaftLogBatch, CF_LOCK}; use kvproto::{ + metapb::RegionEpoch, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, CmdType, PrepareMergeRequest, PutRequest, RaftCmdRequest, Request, }, - raft_serverpb::{MergeState, PeerState, RegionLocalState}, + raft_serverpb::{ + ExtraMessage, ExtraMessageType, MergeState, PeerState, RaftMessage, RegionLocalState, + }, }; use parking_lot::RwLockUpgradableReadGuard; use protobuf::Message; @@ -46,7 +50,9 @@ use raftstore::{ Error, Result, }; use slog::{debug, info}; -use tikv_util::{box_err, log::SlogFormat, slog_panic, store::region_on_same_stores}; +use tikv_util::{ + box_err, log::SlogFormat, slog_panic, store::region_on_same_stores, time::Instant, +}; use super::merge_source_path; use crate::{ @@ -57,6 +63,8 @@ use crate::{ router::CmdResChannel, }; +const TRIM_CHECK_TIMEOUT: Duration = Duration::from_secs(10); + #[derive(Clone)] pub struct PreProposeContext { pub min_matched: u64, @@ -64,6 +72,12 @@ pub struct PreProposeContext { } pub enum PrepareStatus { + WaitForTrimStatus { + start_time: Instant, + // Peers that we are not sure if trimmed. + pending_peers: HashMap, + req: Option, + }, /// When a fence is present, we (1) delay the PrepareMerge /// command `cmd` until all writes before `idx` are applied (2) reject all /// in-coming write proposals. @@ -93,27 +107,28 @@ pub struct PrepareMergeResult { } impl Peer { - pub fn propose_prepare_merge( + pub fn propose_prepare_merge( &mut self, store_ctx: &mut StoreContext, mut req: RaftCmdRequest, ) -> Result { - // Best effort. Remove when trim check is implemented. 
- if self.storage().has_dirty_data() { - return Err(box_err!( - "{} source peer has dirty data, try again later", - SlogFormat(&self.logger) - )); - } self.validate_prepare_merge_command( store_ctx, req.get_admin_request().get_prepare_merge(), )?; + // We need to check three things in order: + // (1) `start_check_trim_status` + // (2) `check_logs_before_prepare_merge` + // (3) `check_pessimistic_locks` + // Check 1 and 3 are async, they yield by returning + // `Error::PendingPrepareMerge`. let pre_propose = if let Some(r) = self.already_checked_pessimistic_locks()? { r - } else { + } else if self.already_checked_trim_status()? { let r = self.check_logs_before_prepare_merge(store_ctx)?; self.check_pessimistic_locks(r, &mut req)? + } else { + return self.start_check_trim_status(store_ctx, &mut req); }; req.mut_admin_request() .mut_prepare_merge() @@ -280,6 +295,110 @@ impl Peer { }) } + fn start_check_trim_status( + &mut self, + store_ctx: &mut StoreContext, + req: &mut RaftCmdRequest, + ) -> Result { + if self.storage().has_dirty_data() { + return Err(box_err!( + "source peer {} not trimmed, skip merging.", + self.peer_id() + )); + } + let target = req.get_admin_request().get_prepare_merge().get_target(); + let mut pending_peers = HashMap::default(); + for region in [self.region(), target] { + for p in region.get_peers() { + if p.get_id() == self.peer_id() { + continue; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(region.get_id()); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(p.clone()); + msg.set_region_epoch(region.get_region_epoch().clone()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgAvailabilityRequest); + msg.mut_extra_msg() + .mut_availability_context() + .set_from_region_id(self.region_id()); + store_ctx.trans.send(msg)?; + pending_peers.insert(p.get_id(), region.get_region_epoch().clone()); + } + } + + let status = &mut self.merge_context_mut().prepare_status; + // Shouldn't enter this call if trim check is 
already underway. + assert!(status.is_none()); + *status = Some(PrepareStatus::WaitForTrimStatus { + start_time: Instant::now_coarse(), + pending_peers, + req: Some(mem::take(req)), + }); + Err(Error::PendingPrepareMerge) + } + + pub fn merge_on_availability_response( + &mut self, + store_ctx: &mut StoreContext, + from_peer: u64, + resp: &ExtraMessage, + ) { + if self.merge_context().is_some() + && let Some(PrepareStatus::WaitForTrimStatus { pending_peers, req, .. }) = self + .merge_context_mut() + .prepare_status + .as_mut() + && req.is_some() + { + assert!(resp.has_availability_context()); + let from_region = resp.get_availability_context().get_from_region_id(); + let from_epoch = resp.get_availability_context().get_from_region_epoch(); + let trimmed = resp.get_availability_context().get_trimmed(); + if let Some(epoch) = pending_peers.get(&from_peer) + && util::is_region_epoch_equal(from_epoch, epoch) + { + if !trimmed { + info!( + self.logger, + "cancel merge because source peer is not trimmed"; + "region_id" => from_region, + "peer_id" => from_peer, + ); + self.take_merge_context(); + return; + } else { + pending_peers.remove(&from_peer); + } + } + if pending_peers.is_empty() { + let (ch, _) = CmdResChannel::pair(); + let req = req.take().unwrap(); + self.on_admin_command(store_ctx, req, ch); + } + } + } + + fn already_checked_trim_status(&mut self) -> Result { + match self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + { + Some(PrepareStatus::WaitForTrimStatus { pending_peers, .. }) => { + if pending_peers.is_empty() { + Ok(true) + } else { + Err(Error::PendingPrepareMerge) + } + } + None => Ok(false), + // Shouldn't reach here after calling `already_checked_pessimistic_locks` first. 
+ _ => unreachable!(), + } + } + fn check_pessimistic_locks( &mut self, ctx: PreProposeContext, @@ -327,7 +446,7 @@ impl Peer { if applied_index < *fence { info!( self.logger, - "reject PrepareMerge because applied_index has not reached prepare_merge_fence"; + "suspend PrepareMerge because applied_index has not reached prepare_merge_fence"; "applied_index" => applied_index, "prepare_merge_fence" => fence, ); @@ -340,7 +459,25 @@ impl Peer { "another merge is in-progress, merge_state: {:?}.", state )), - None => Ok(None), + _ => Ok(None), + } + } + + #[inline] + pub fn maybe_clean_up_stale_merge_context(&mut self) { + // Check if there's a stale trim check. Ideally this should be implemented as a + // tick. But this is simpler. + if let Some(PrepareStatus::WaitForTrimStatus { + start_time, req, .. + }) = self + .merge_context() + .as_ref() + .and_then(|c| c.prepare_status.as_ref()) + && req.is_some() + && start_time.saturating_elapsed() > TRIM_CHECK_TIMEOUT + { + info!(self.logger, "cancel merge because trim check timed out"); + self.take_merge_context(); } } @@ -351,6 +488,10 @@ impl Peer { store_ctx: &mut StoreContext, applied_index: u64, ) { + if self.merge_context().is_none() { + return; + } + // Check the fence. 
if let Some(req) = self .merge_context_mut() .maybe_take_pending_prepare(applied_index) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 525be1991bd..7c7d1f37275 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -34,7 +34,7 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, Region}, raft_cmdpb::{AdminCmdType, RaftCmdRequest}, - raft_serverpb::{ExtraMessageType, PeerState, RaftMessage}, + raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; use raftstore::store::{util, Transport, WriteTask}; use slog::{debug, error, info, warn}; @@ -422,6 +422,35 @@ where } impl Peer { + pub fn on_availability_request( + &mut self, + ctx: &mut StoreContext, + from_region_id: u64, + from_peer: &metapb::Peer, + ) { + let mut msg = RaftMessage::default(); + msg.set_region_id(from_region_id); + msg.set_from_peer(self.peer().clone()); + msg.set_to_peer(from_peer.clone()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgAvailabilityResponse); + let report = msg.mut_extra_msg().mut_availability_context(); + report.set_from_region_id(self.region_id()); + report.set_from_region_epoch(self.region().get_region_epoch().clone()); + report.set_trimmed(!self.storage().has_dirty_data()); + let _ = ctx.trans.send(msg); + } + + #[inline] + pub fn on_availability_response( + &mut self, + ctx: &mut StoreContext, + from_peer: u64, + resp: &ExtraMessage, + ) { + self.merge_on_availability_response(ctx, from_peer, resp); + } + pub fn maybe_schedule_gc_peer_tick(&mut self) { let region_state = self.storage().region_state(); if !region_state.get_removed_records().is_empty() @@ -441,7 +470,6 @@ impl Peer { { let tombstone_msg = self.tombstone_message_for_same_region(peer.clone()); self.add_message(tombstone_msg); - self.set_has_ready(); true } else { false @@ -464,7 +492,6 @@ impl Peer { cmp::Ordering::Less => { if 
let Some(msg) = build_peer_destroyed_report(msg) { self.add_message(msg); - self.set_has_ready(); } } // No matter it's greater or equal, the current peer must be destroyed. diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 9419549e580..a9e72f02f8e 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -95,6 +95,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, if self.fsm.peer_mut().tick() { self.fsm.peer_mut().set_has_ready(); } + self.fsm.peer_mut().maybe_clean_up_stale_merge_context(); self.schedule_tick(PeerTick::Raft); } @@ -236,6 +237,24 @@ impl Peer { return; } } + ExtraMessageType::MsgAvailabilityRequest => { + self.on_availability_request( + ctx, + msg.get_extra_msg() + .get_availability_context() + .get_from_region_id(), + msg.get_from_peer(), + ); + return; + } + ExtraMessageType::MsgAvailabilityResponse => { + self.on_availability_response( + ctx, + msg.get_from_peer().get_id(), + msg.get_extra_msg(), + ); + return; + } _ => (), } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 6b5898b6297..494ae183da6 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -831,6 +831,7 @@ impl Peer { #[inline] pub fn add_message(&mut self, msg: RaftMessage) { self.pending_messages.push(msg); + self.set_has_ready(); } #[inline] diff --git a/components/raftstore-v2/tests/failpoints/test_merge.rs b/components/raftstore-v2/tests/failpoints/test_merge.rs index 3979d61743a..d660221d5ee 100644 --- a/components/raftstore-v2/tests/failpoints/test_merge.rs +++ b/components/raftstore-v2/tests/failpoints/test_merge.rs @@ -37,7 +37,7 @@ fn test_source_and_target_both_replay() { { let _fp = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); - merge_region(router, region_1, peer_1, 
region_2, false); + merge_region(&cluster, 0, region_1, peer_1, region_2, false); } cluster.restart(0); @@ -87,9 +87,9 @@ fn test_source_destroy_before_target_apply() { // AckCommitMerge). let _fp1 = fail::FailGuard::new("force_send_catch_up_logs", "1*return->off"); let _fp2 = fail::FailGuard::new("after_acquire_source_checkpoint", "1*return->off"); - merge_region(router, region_1, peer_1.clone(), region_2, false); + merge_region(&cluster, 0, region_1, peer_1.clone(), region_2, false); } - assert_peer_not_exist(region_1_id, peer_1.get_id(), router); + assert_peer_not_exist(region_1_id, peer_1.get_id(), &cluster.routers[0]); cluster.restart(0); let router = &mut cluster.routers[0]; diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 9c81f9545a3..4bd0cef8846 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -826,10 +826,11 @@ pub mod merge_helper { }; use raftstore_v2::router::PeerMsg; - use super::TestRouter; + use super::Cluster; pub fn merge_region( - router: &mut TestRouter, + cluster: &Cluster, + store_offset: usize, source: metapb::Region, source_peer: metapb::Peer, target: metapb::Region, @@ -848,25 +849,35 @@ pub mod merge_helper { req.set_admin_request(admin_req); let (msg, sub) = PeerMsg::admin_command(req); - router.send(region_id, msg).unwrap(); - let resp = block_on(sub.result()).unwrap(); - if check { - assert!(!resp.get_header().has_error(), "{:?}", resp); - } + cluster.routers[store_offset].send(region_id, msg).unwrap(); + // They may communicate about trimmed status. + cluster.dispatch(region_id, vec![]); + let _ = block_on(sub.result()).unwrap(); + // We don't check the response because it needs to do a lot of checks async + // before actually proposing the command. // TODO: when persistent implementation is ready, we can use tablet index of // the parent to check whether the split is done. 
Now, just sleep a second. thread::sleep(Duration::from_secs(1)); - let new_target = router.region_detail(target.id); + let mut new_target = cluster.routers[store_offset].region_detail(target.id); if check { - if new_target.get_start_key() == source.get_start_key() { - // [source, target] => new_target - assert_eq!(new_target.get_end_key(), target.get_end_key()); - } else { - // [target, source] => new_target - assert_eq!(new_target.get_start_key(), target.get_start_key()); - assert_eq!(new_target.get_end_key(), source.get_end_key()); + for i in 1..=100 { + let r1 = new_target.get_start_key() == source.get_start_key() + && new_target.get_end_key() == target.get_end_key(); + let r2 = new_target.get_start_key() == target.get_start_key() + && new_target.get_end_key() == source.get_end_key(); + if r1 || r2 { + break; + } else if i == 100 { + panic!( + "still not merged after 5s: {:?} + {:?} != {:?}", + source, target, new_target + ); + } else { + thread::sleep(Duration::from_millis(50)); + new_target = cluster.routers[store_offset].region_detail(target.id); + } } } new_target diff --git a/components/raftstore-v2/tests/integrations/test_merge.rs b/components/raftstore-v2/tests/integrations/test_merge.rs index c08c2bde484..7d9dbef720e 100644 --- a/components/raftstore-v2/tests/integrations/test_merge.rs +++ b/components/raftstore-v2/tests/integrations/test_merge.rs @@ -81,25 +81,24 @@ fn test_merge() { ); } - let region_2 = merge_region(router, region_1.clone(), peer_1, region_2, true); + let region_2 = merge_region(&cluster, 0, region_1.clone(), peer_1, region_2, true); { - let snapshot = router.stale_snapshot(region_2.get_id()); + let snapshot = cluster.routers[0].stale_snapshot(region_2.get_id()); let key = format!("k{}1", region_1.get_id()); assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); } - let region_5 = merge_region(router, region_6.clone(), peer_6, region_5, true); + let region_5 = merge_region(&cluster, 0, region_6.clone(), peer_6, region_5, 
true); { - let snapshot = router.stale_snapshot(region_5.get_id()); + let snapshot = cluster.routers[0].stale_snapshot(region_5.get_id()); let key = format!("k{}5", region_6.get_id()); assert!(snapshot.get_value(key.as_bytes()).unwrap().is_some()); } - let region_3 = merge_region(router, region_2, peer_2, region_3, true); - let region_4 = merge_region(router, region_3, peer_3, region_4, true); - let region_5 = merge_region(router, region_4, peer_4, region_5, true); + let region_3 = merge_region(&cluster, 0, region_2, peer_2, region_3, true); + let region_4 = merge_region(&cluster, 0, region_3, peer_3, region_4, true); + let region_5 = merge_region(&cluster, 0, region_4, peer_4, region_5, true); cluster.restart(0); - let router = &mut cluster.routers[0]; - let snapshot = router.stale_snapshot(region_5.get_id()); + let snapshot = cluster.routers[0].stale_snapshot(region_5.get_id()); for (i, v) in [1, 2, 3, 4, 5, 5].iter().enumerate() { let rid = region_1.get_id() + i as u64; let key = format!("k{rid}{v}"); diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 67054d5bd11..f2d1c7ffc0e 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2796,8 +2796,9 @@ where self.on_voter_replicated_index_response(msg.get_extra_msg()); } // It's v2 only message and ignore does no harm. 
- ExtraMessageType::MsgGcPeerRequest | ExtraMessageType::MsgGcPeerResponse => (), - ExtraMessageType::MsgFlushMemtable => (), + ExtraMessageType::MsgGcPeerRequest + | ExtraMessageType::MsgGcPeerResponse + | ExtraMessageType::MsgFlushMemtable => (), } } diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 151e278d0d1..404cb418d33 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -189,7 +189,7 @@ fn test_node_base_merge_v2() { } #[test_case(test_raftstore::new_node_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] +// No v2, it requires all peers to be available to check trim status. fn test_node_merge_with_slow_learner() { let mut cluster = new_cluster(0, 2); configure_for_merge(&mut cluster.cfg); @@ -410,8 +410,8 @@ fn test_node_check_merged_message() { // Test if a merge handled properly when there is a unfinished slow split before // merge. +// No v2, it requires all peers to be available to check trim status. #[test_case(test_raftstore::new_node_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_merge_slow_split() { fn imp(is_right_derive: bool) { let mut cluster = new_cluster(0, 3); @@ -913,8 +913,8 @@ fn test_merge_with_slow_promote() { /// logically) /// - A split => C (-∞, k3), A [k3, +∞) /// - Then network recovery +// No v2, it requires all peers to be available to check trim status. #[test_case(test_raftstore::new_node_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_isolated_store_with_no_target_peer() { let mut cluster = new_cluster(0, 4); configure_for_merge(&mut cluster.cfg); @@ -973,8 +973,8 @@ fn test_merge_isolated_store_with_no_target_peer() { /// Test whether a isolated peer can recover when two other regions merge to its /// region +// No v2, it requires all peers to be available to check trim status. 
#[test_case(test_raftstore::new_node_cluster)] -#[test_case(test_raftstore_v2::new_node_cluster)] fn test_merge_cascade_merge_isolated() { let mut cluster = new_cluster(0, 3); configure_for_merge(&mut cluster.cfg); From 58ed39a1e3009f0be39cb5607d8b0dae2632d2a5 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 18 Apr 2023 16:59:20 +0800 Subject: [PATCH 641/676] *: correct io type for raft engine write and purge (#14578) ref tikv/tikv#14462 so background purge can be rate limited. Co-authored-by: Ti Chi Robot --- components/file_system/src/lib.rs | 11 ++++- components/file_system/src/rate_limiter.rs | 10 ++--- components/raft_log_engine/src/engine.rs | 12 ++++-- components/raftstore-v2/src/batch/store.rs | 3 +- .../raftstore/src/store/async_io/read.rs | 1 + components/raftstore/src/store/fsm/store.rs | 2 + components/server/src/common.rs | 42 ++++++++++++------- src/storage/config.rs | 1 + 8 files changed, 54 insertions(+), 28 deletions(-) diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 0b6213094af..413a4ef827e 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -72,6 +72,7 @@ pub enum IoType { Gc = 8, Import = 9, Export = 10, + RewriteLog = 11, } impl IoType { @@ -88,6 +89,7 @@ impl IoType { IoType::Gc => "gc", IoType::Import => "import", IoType::Export => "export", + IoType::RewriteLog => "log_rewrite", } } } @@ -145,8 +147,13 @@ impl IoPriority { } } - fn unsafe_from_u32(i: u32) -> Self { - unsafe { std::mem::transmute(i) } + fn from_u32(i: u32) -> Self { + match i { + 0 => IoPriority::Low, + 1 => IoPriority::Medium, + 2 => IoPriority::High, + _ => panic!("unknown io priority {}", i), + } } } diff --git a/components/file_system/src/rate_limiter.rs b/components/file_system/src/rate_limiter.rs index feffb6dcf14..79c7094b186 100644 --- a/components/file_system/src/rate_limiter.rs +++ b/components/file_system/src/rate_limiter.rs @@ -497,9 +497,7 @@ impl IoRateLimiter { pub fn request(&self, 
io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request( - IoPriority::unsafe_from_u32( - self.priority_map[io_type as usize].load(Ordering::Relaxed), - ), + IoPriority::from_u32(self.priority_map[io_type as usize].load(Ordering::Relaxed)), bytes, ); } @@ -518,7 +516,7 @@ impl IoRateLimiter { bytes = self .throughput_limiter .async_request( - IoPriority::unsafe_from_u32( + IoPriority::from_u32( self.priority_map[io_type as usize].load(Ordering::Relaxed), ), bytes, @@ -535,9 +533,7 @@ impl IoRateLimiter { fn request_with_skewed_clock(&self, io_type: IoType, io_op: IoOp, mut bytes: usize) -> usize { if self.mode.contains(io_op) { bytes = self.throughput_limiter.request_with_skewed_clock( - IoPriority::unsafe_from_u32( - self.priority_map[io_type as usize].load(Ordering::Relaxed), - ), + IoPriority::from_u32(self.priority_map[io_type as usize].load(Ordering::Relaxed)), bytes, ); } diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 7b107bc0cc9..621d708b057 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -14,7 +14,7 @@ use engine_traits::{ RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; -use file_system::{IoOp, IoRateLimiter, IoType}; +use file_system::{IoOp, IoRateLimiter, IoType, WithIoType}; use kvproto::{ metapb::Region, raft_serverpb::{ @@ -66,7 +66,8 @@ impl Read for ManagedReader { fn read(&mut self, buf: &mut [u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IoType::ForegroundRead, IoOp::Read, size); + let io_type = file_system::get_io_type(); + size = limiter.request(io_type, IoOp::Read, size); } match self.inner.as_mut() { Either::Left(reader) => reader.read(&mut buf[..size]), @@ -96,7 +97,8 @@ impl Write for ManagedWriter { 
fn write(&mut self, buf: &[u8]) -> IoResult { let mut size = buf.len(); if let Some(ref mut limiter) = self.rate_limiter { - size = limiter.request(IoType::ForegroundWrite, IoOp::Write, size); + let io_type = file_system::get_io_type(); + size = limiter.request(io_type, IoOp::Write, size); } match self.inner.as_mut() { Either::Left(writer) => writer.write(&buf[..size]), @@ -653,6 +655,8 @@ impl RaftEngine for RaftLogEngine { } fn consume(&self, batch: &mut Self::LogBatch, sync: bool) -> Result { + // Always use ForegroundWrite as all `consume` calls share the same write queue. + let _guard = WithIoType::new(IoType::ForegroundWrite); self.0.write(&mut batch.0, sync).map_err(transfer_error) } @@ -663,6 +667,8 @@ impl RaftEngine for RaftLogEngine { _: usize, _: usize, ) -> Result { + // Always use ForegroundWrite as all `consume` calls share the same write queue. + let _guard = WithIoType::new(IoType::ForegroundWrite); self.0.write(&mut batch.0, sync).map_err(transfer_error) } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 66b0414b7c3..14282cc09f9 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -17,7 +17,7 @@ use collections::HashMap; use concurrency_manager::ConcurrencyManager; use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use file_system::{set_io_type, IoType}; +use file_system::{set_io_type, IoType, WithIoType}; use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; use pd_client::PdClient; use raft::{StateRole, INVALID_ID}; @@ -582,6 +582,7 @@ impl StoreSystem { let logger = self.logger.clone(); let router = router.clone(); worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + let _guard = WithIoType::new(IoType::RewriteLog); match raft_clone.manual_purge() { Ok(regions) => { for r in regions { diff --git a/components/raftstore/src/store/async_io/read.rs 
b/components/raftstore/src/store/async_io/read.rs index cee6373c5bd..ced7b0f4418 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -153,6 +153,7 @@ where tried_cnt, term, } => { + let _guard = WithIoType::new(IoType::Replication); let mut ents = Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); let res = self.raft_engine.fetch_entries_to( diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 7c71dc3825e..f28c4170459 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -28,6 +28,7 @@ use engine_traits::{ RaftLogBatch, Range, WriteBatch, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use fail::fail_point; +use file_system::{IoType, WithIoType}; use futures::{compat::Future01CompatExt, FutureExt}; use grpcio_health::HealthService; use keys::{self, data_end_key, data_key, enc_end_key, enc_start_key}; @@ -1546,6 +1547,7 @@ impl RaftBatchSystem { let raft_clone = engines.raft.clone(); let router_clone = self.router(); worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + let _guard = WithIoType::new(IoType::RewriteLog); match raft_clone.manual_purge() { Ok(regions) => { for region_id in regions { diff --git a/components/server/src/common.rs b/components/server/src/common.rs index 2d2ae7bd398..10da6ec9c74 100644 --- a/components/server/src/common.rs +++ b/components/server/src/common.rs @@ -22,8 +22,8 @@ use engine_rocks::{ FlowInfo, RocksEngine, RocksStatistics, }; use engine_traits::{ - CachedTablet, CfOptionsExt, FlowControlFactorsExt, KvEngine, RaftEngine, StatisticsReporter, - TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, + data_cf_offset, CachedTablet, CfOptionsExt, FlowControlFactorsExt, KvEngine, RaftEngine, + StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, }; use error_code::ErrorCodeExt; 
use file_system::{get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor}; @@ -504,25 +504,24 @@ impl EnginesResourceInfo { _now: Instant, cached_latest_tablets: &mut HashMap>, ) { - let mut normalized_pending_bytes = 0; + let mut compaction_pending_bytes = [0; DATA_CFS.len()]; + let mut soft_pending_compaction_bytes_limit = [0; DATA_CFS.len()]; - fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { + let mut fetch_engine_cf = |engine: &RocksEngine, cf: &str| { if let Ok(cf_opts) = engine.get_options_cf(cf) { if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { - if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { - *normalized_pending_bytes = std::cmp::max( - *normalized_pending_bytes, - (b * EnginesResourceInfo::SCALE_FACTOR - / cf_opts.get_soft_pending_compaction_bytes_limit()) - as u32, - ); - } + let offset = data_cf_offset(cf); + compaction_pending_bytes[offset] += b; + soft_pending_compaction_bytes_limit[offset] = cmp::max( + cf_opts.get_soft_pending_compaction_bytes_limit(), + soft_pending_compaction_bytes_limit[offset], + ); } } - } + }; if let Some(raft_engine) = &self.raft_engine { - fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); + fetch_engine_cf(raft_engine, CF_DEFAULT); } self.tablet_registry @@ -543,13 +542,26 @@ impl EnginesResourceInfo { for (_, cache) in cached_latest_tablets.iter_mut() { let Some(tablet) = cache.latest() else { continue }; for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { - fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); + fetch_engine_cf(tablet, cf); } } // Clear ensures that these tablets are not hold forever. 
cached_latest_tablets.clear(); + let mut normalized_pending_bytes = 0; + for (pending, limit) in compaction_pending_bytes + .iter() + .zip(soft_pending_compaction_bytes_limit) + { + if limit > 0 { + normalized_pending_bytes = cmp::max( + normalized_pending_bytes, + (*pending * EnginesResourceInfo::SCALE_FACTOR / limit) as u32, + ) + } + } + let (_, avg) = self .normalized_pending_bytes_collector .add(normalized_pending_bytes); diff --git a/src/storage/config.rs b/src/storage/config.rs index d301849528d..63250176694 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -394,6 +394,7 @@ impl IoRateLimitConfig { limiter.set_io_priority(IoType::Gc, self.gc_priority); limiter.set_io_priority(IoType::Import, self.import_priority); limiter.set_io_priority(IoType::Export, self.export_priority); + limiter.set_io_priority(IoType::RewriteLog, self.compaction_priority); limiter.set_io_priority(IoType::Other, self.other_priority); limiter } From a4a287980c2b9b3cbcd6ef05a7ec5ac0e655146d Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Tue, 18 Apr 2023 17:33:20 +0800 Subject: [PATCH 642/676] encryption: fix key collision handling (#14586) close tikv/tikv#14585 Fix a bug that a newly generated encryption key might erase an old key and make data unreadable Signed-off-by: tabokie --- components/encryption/src/manager/mod.rs | 44 +++++++++++++++++------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0f3233d7819..a367ad44df2 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + collections::hash_map::Entry, io::{Error as IoError, ErrorKind, Result as IoResult}, path::{Path, PathBuf}, sync::{ @@ -287,11 +288,15 @@ impl Dicts { Ok(Some(())) } - fn rotate_key(&self, key_id: u64, key: DataKey, master_key: &dyn Backend) -> Result<()> { + fn rotate_key(&self, key_id: u64, key: DataKey, master_key: &dyn Backend) -> Result { info!("encryption: rotate data key."; "key_id" => key_id); { let mut key_dict = self.key_dict.lock().unwrap(); - key_dict.keys.insert(key_id, key); + match key_dict.keys.entry(key_id) { + // key id collides + Entry::Occupied(_) => return Ok(false), + Entry::Vacant(e) => e.insert(key), + }; key_dict.current_key_id = key_id; }; @@ -299,7 +304,7 @@ impl Dicts { self.save_key_dict(master_key)?; // Update current data key id. self.current_key_id.store(key_id, Ordering::SeqCst); - Ok(()) + Ok(true) } fn maybe_rotate_data_key( @@ -337,15 +342,30 @@ impl Dicts { let duration = now.duration_since(UNIX_EPOCH).unwrap(); let creation_time = duration.as_secs(); - let (key_id, key) = generate_data_key(method); - let data_key = DataKey { - key, - method, - creation_time, - was_exposed: false, - ..Default::default() - }; - self.rotate_key(key_id, data_key, master_key) + // Generate new data key. 
+ let generate_limit = 10; + for _ in 0..generate_limit { + let (key_id, key) = generate_data_key(method); + if key_id == 0 { + // 0 is invalid + continue; + } + let data_key = DataKey { + key, + method, + creation_time, + was_exposed: false, + ..Default::default() + }; + + let ok = self.rotate_key(key_id, data_key, master_key)?; + if !ok { + // key id collides, retry + continue; + } + return Ok(()); + } + Err(box_err!("key id collides {} times!", generate_limit)) } } From a473cb3e5ae3cd5a606db82e53b67b636eecfbf5 Mon Sep 17 00:00:00 2001 From: ShuNing Date: Wed, 19 Apr 2023 11:15:20 +0800 Subject: [PATCH 643/676] sched_pool: auto switch between the single queue and priority queue (#14582) ref tikv/tikv#14353, ref tikv/tikv#14375 Addresses performance regression in the priority queue caused by design. The scheduler worker will now automatically switch between the `single-queue pool` and `priority-queue pool` based on the resource group settings. - Once the group is reserved, use the `single-queue pool` - Once the group is customized, use the `priority-queue pool` Signed-off-by: nolouch --- .../resource_control/src/resource_group.rs | 26 ++- metrics/grafana/tikv_fast_tune.json | 2 +- src/storage/mod.rs | 34 +++ src/storage/mvcc/txn.rs | 4 +- src/storage/txn/sched_pool.rs | 215 ++++++++++++------ tests/failpoints/cases/test_storage.rs | 110 ++++++++- 6 files changed, 311 insertions(+), 80 deletions(-) diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 7435fc17d01..9a7a2e7b3cc 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -4,7 +4,7 @@ use std::{ cell::Cell, cmp::{max, min}, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicBool, AtomicU64, Ordering}, Arc, Mutex, }, time::Duration, @@ -169,6 +169,8 @@ pub struct ResourceController { last_min_vt: AtomicU64, // the last time min vt is overflow last_rest_vt_time: Cell, + 
// whether the settings is customized by user + customized: AtomicBool, } // we are ensure to visit the `last_rest_vt_time` by only 1 thread so it's @@ -185,6 +187,7 @@ impl ResourceController { last_min_vt: AtomicU64::new(0), max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), last_rest_vt_time: Cell::new(Instant::now_coarse()), + customized: AtomicBool::new(false), }; // add the "default" resource group controller.add_resource_group( @@ -244,6 +247,16 @@ impl ResourceController { // maybe update existed group self.resource_consumptions.write().insert(name, group); + self.check_customized(); + } + + fn check_customized(&self) { + let groups = self.resource_consumptions.read(); + if groups.len() == 1 && groups.get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()).is_some() { + self.customized.store(false, Ordering::Release); + return; + } + self.customized.store(true, Ordering::Release); } // we calculate the weight of each resource group based on the currently maximum @@ -268,9 +281,15 @@ impl ResourceController { 0, MEDIUM_PRIORITY, ); + self.check_customized(); return; } self.resource_consumptions.write().remove(name); + self.check_customized(); + } + + pub fn is_customized(&self) -> bool { + self.customized.load(Ordering::Acquire) } #[inline] @@ -625,7 +644,8 @@ pub(crate) mod tests { let resource_manager = ResourceGroupManager::default(); let resource_ctl = resource_manager.derive_controller("test_read".into(), true); let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); - + assert_eq!(resource_ctl.is_customized(), false); + assert_eq!(resource_ctl_write.is_customized(), false); let group1 = new_resource_group_ru("test1".into(), 5000, 0); resource_manager.add_resource_group(group1); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); @@ -633,6 +653,8 @@ pub(crate) mod tests { resource_ctl_write.resource_group("test1".as_bytes()).weight, 20 ); + assert_eq!(resource_ctl.is_customized(), true); + 
assert_eq!(resource_ctl_write.is_customized(), true); // add a resource group with big ru let group1 = new_resource_group_ru("test2".into(), 50000, 0); diff --git a/metrics/grafana/tikv_fast_tune.json b/metrics/grafana/tikv_fast_tune.json index b096bb418fe..85e9d5c7f02 100644 --- a/metrics/grafana/tikv_fast_tune.json +++ b/metrics/grafana/tikv_fast_tune.json @@ -2712,7 +2712,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker-pool\"}[1m]))", + "expr": "sum(rate(tikv_futurepool_pending_task_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"sched-worker-.*\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 1, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 11740bcc2bf..faacc4cf4cb 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -3242,6 +3242,40 @@ impl TestStorageBuilder { Some(Arc::new(ResourceController::new("test".to_owned(), false))), ) } + + pub fn build_for_resource_controller( + self, + resource_controller: Arc, + ) -> Result, L, F>> { + let engine = TxnTestEngine { + engine: self.engine, + txn_ext: Arc::new(TxnExt::default()), + }; + let read_pool = build_read_pool_for_test( + &crate::config::StorageReadPoolConfig::default_for_test(), + engine.clone(), + ); + + Storage::from_engine( + engine, + &self.config, + ReadPool::from(read_pool).handle(), + self.lock_mgr, + ConcurrencyManager::new(1.into()), + DynamicConfigs { + pipelined_pessimistic_lock: self.pipelined_pessimistic_lock, + in_memory_pessimistic_lock: self.in_memory_pessimistic_lock, + wake_up_delay_duration_ms: self.wake_up_delay_duration_ms, + }, + Arc::new(FlowController::Singleton(EngineFlowController::empty())), + DummyReporter, + ResourceTagFactory::new_for_test(), + Arc::new(QuotaLimiter::default()), + latest_feature_gate(), + None, + 
Some(resource_controller), + ) + } } pub trait ResponseBatchConsumer: Send { diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index d5e55e251ae..0eaca54f226 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -9,7 +9,7 @@ use kvproto::kvrpcpb::LockInfo; use txn_types::{Key, Lock, PessimisticLock, TimeStamp, Value}; use super::metrics::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}; -use crate::storage::{kv::Modify, mvcc::PessimisticLockNotFoundReason}; +use crate::storage::kv::Modify; pub const MAX_TXN_WRITE_SIZE: usize = 32 * 1024; @@ -246,7 +246,7 @@ pub(crate) fn make_txn_error( ) -> crate::storage::mvcc::ErrorInner { use kvproto::kvrpcpb::WriteConflictReason; - use crate::storage::mvcc::ErrorInner; + use crate::storage::mvcc::{ErrorInner, PessimisticLockNotFoundReason}; if let Some(s) = s { match s.to_ascii_lowercase().as_str() { "keyislocked" => { diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 49539d51d8c..4036de7a8b2 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -55,17 +55,96 @@ impl PoolTicker for SchedTicker { } #[derive(Clone)] -pub enum SchedPool { +pub enum QueueType { // separated thread pools for different priority commands - Vanilla { - high_worker_pool: FuturePool, - worker_pool: FuturePool, - }, - // one priority based thread pool to handle all commands - Priority { - worker_pool: FuturePool, - resource_ctl: Arc, - }, + Vanilla, + // automatically switch between the `single-queue pool` and `priority-queue pool` based on the + // resource group settings, only used when the resource control feature is enabled. 
+ Dynamic, +} + +#[derive(Clone)] +struct VanillaQueue { + high_worker_pool: FuturePool, + worker_pool: FuturePool, +} + +impl VanillaQueue { + fn spawn( + &self, + priority_level: CommandPri, + f: impl futures::Future + Send + 'static, + ) -> Result<(), Full> { + if priority_level == CommandPri::High { + self.high_worker_pool.spawn(f) + } else { + self.worker_pool.spawn(f) + } + } + + fn scale_pool_size(&self, pool_size: usize) { + self.high_worker_pool + .scale_pool_size(std::cmp::max(1, pool_size / 2)); + self.worker_pool.scale_pool_size(pool_size); + } + + fn get_pool_size(&self, priority_level: CommandPri) -> usize { + if priority_level == CommandPri::High { + self.high_worker_pool.get_pool_size() + } else { + self.worker_pool.get_pool_size() + } + } +} + +#[derive(Clone)] +struct PriorityQueue { + worker_pool: FuturePool, + resource_ctl: Arc, +} + +impl PriorityQueue { + fn spawn( + &self, + group_name: &str, + priority_level: CommandPri, + f: impl futures::Future + Send + 'static, + ) -> Result<(), Full> { + let fixed_level = match priority_level { + CommandPri::High => Some(0), + CommandPri::Normal => None, + CommandPri::Low => Some(2), + }; + // TODO: maybe use a better way to generate task_id + let task_id = rand::random::(); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_name.as_bytes().to_owned()); + self.worker_pool.spawn_with_extras( + ControlledFuture::new( + async move { + f.await; + }, + self.resource_ctl.clone(), + group_name.as_bytes().to_owned(), + ), + extras, + ) + } + + fn scale_pool_size(&self, pool_size: usize) { + self.worker_pool.scale_pool_size(pool_size); + } + + fn get_pool_size(&self) -> usize { + self.worker_pool.get_pool_size() + } +} + +#[derive(Clone)] +pub struct SchedPool { + vanilla: VanillaQueue, + priority: Option, + queue_type: QueueType, } impl SchedPool { @@ -74,7 +153,7 @@ impl SchedPool { pool_size: usize, reporter: R, feature_gate: FeatureGate, - _resource_ctl: Option>, + 
resource_ctl: Option>, ) -> Self { let builder = |pool_size: usize, name_prefix: &str| { let engine = Arc::new(Mutex::new(engine.clone())); @@ -102,94 +181,82 @@ impl SchedPool { tls_flush(&reporter); }) }; - // FIXME: for performance issue, disable priority pool temporarily - // if let Some(ref r) = resource_ctl { - // SchedPool::Priority { - // worker_pool: builder(pool_size, "sched-worker-pool") - // .build_priority_future_pool(r.clone()), - // resource_ctl: r.clone(), - // } - // } else { - SchedPool::Vanilla { + let vanilla = VanillaQueue { worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), - high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") + high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-worker-high") .build_future_pool(), + }; + let priority = resource_ctl.as_ref().map(|r| PriorityQueue { + worker_pool: builder(pool_size, "sched-worker-priority") + .build_priority_future_pool(r.clone()), + resource_ctl: r.clone(), + }); + let queue_type = if resource_ctl.is_some() { + QueueType::Dynamic + } else { + QueueType::Vanilla + }; + + SchedPool { + vanilla, + priority, + queue_type, } - // } } pub fn spawn( &self, group_name: &str, - priority: CommandPri, + priority_level: CommandPri, f: impl futures::Future + Send + 'static, ) -> Result<(), Full> { - match self { - SchedPool::Vanilla { - high_worker_pool, - worker_pool, - } => { - if priority == CommandPri::High { - high_worker_pool.spawn(f) + match self.queue_type { + QueueType::Vanilla => self.vanilla.spawn(priority_level, f), + QueueType::Dynamic => { + if self.can_use_priority() { + fail_point!("priority_pool_task"); + self.priority + .as_ref() + .unwrap() + .spawn(group_name, priority_level, f) } else { - worker_pool.spawn(f) + fail_point!("single_queue_pool_task"); + self.vanilla.spawn(priority_level, f) } } - SchedPool::Priority { - worker_pool, - resource_ctl, - } => { - let fixed_level = match priority { - CommandPri::High => 
Some(0), - CommandPri::Normal => None, - CommandPri::Low => Some(2), - }; - // TODO: maybe use a better way to generate task_id - let task_id = rand::random::(); - let mut extras = Extras::new_multilevel(task_id, fixed_level); - extras.set_metadata(group_name.as_bytes().to_owned()); - worker_pool.spawn_with_extras( - ControlledFuture::new( - async move { - f.await; - }, - resource_ctl.clone(), - group_name.as_bytes().to_owned(), - ), - extras, - ) - } } } pub fn scale_pool_size(&self, pool_size: usize) { - match self { - SchedPool::Vanilla { - high_worker_pool, - worker_pool, - } => { - high_worker_pool.scale_pool_size(std::cmp::max(1, pool_size / 2)); - worker_pool.scale_pool_size(pool_size); + match self.queue_type { + QueueType::Vanilla => { + self.vanilla.scale_pool_size(pool_size); } - SchedPool::Priority { worker_pool, .. } => { - worker_pool.scale_pool_size(pool_size); + QueueType::Dynamic => { + let priority = self.priority.as_ref().unwrap(); + priority.scale_pool_size(pool_size); + self.vanilla.scale_pool_size(pool_size); } } } - pub fn get_pool_size(&self, priority: CommandPri) -> usize { - match self { - SchedPool::Vanilla { - high_worker_pool, - worker_pool, - } => { - if priority == CommandPri::High { - high_worker_pool.get_pool_size() + fn can_use_priority(&self) -> bool { + match self.queue_type { + QueueType::Vanilla => false, + QueueType::Dynamic => self.priority.as_ref().unwrap().resource_ctl.is_customized(), + } + } + + pub fn get_pool_size(&self, priority_level: CommandPri) -> usize { + match self.queue_type { + QueueType::Vanilla => self.vanilla.get_pool_size(priority_level), + QueueType::Dynamic => { + if self.can_use_priority() { + self.priority.as_ref().unwrap().get_pool_size() } else { - worker_pool.get_pool_size() + self.vanilla.get_pool_size(priority_level) } } - SchedPool::Priority { worker_pool, .. 
} => worker_pool.get_pool_size(), } } } diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index ba6339b666d..dd57f28ab94 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -22,6 +22,7 @@ use kvproto::{ }, tikvpb::TikvClient, }; +use resource_control::ResourceGroupManager; use test_raftstore::*; use tikv::{ config::{ConfigController, Module}, @@ -279,7 +280,6 @@ fn test_scale_scheduler_pool() { ctx.set_region_id(region.id); ctx.set_region_epoch(region.get_region_epoch().clone()); ctx.set_peer(cluster.leader_of_region(region.id).unwrap()); - let do_prewrite = |key: &[u8], val: &[u8]| { // prewrite let (prewrite_tx, prewrite_rx) = channel(); @@ -332,6 +332,114 @@ fn test_scale_scheduler_pool() { fail::remove(snapshot_fp); } +#[test] +fn test_scheduler_pool_auto_switch_for_resource_ctl() { + let mut cluster = new_server_cluster(0, 1); + cluster.run(); + + let engine = cluster + .sim + .read() + .unwrap() + .storages + .get(&1) + .unwrap() + .clone(); + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test".to_string(), true); + + let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .config(cluster.cfg.tikv.storage.clone()) + .build_for_resource_controller(resource_ctl) + .unwrap(); + + let region = cluster.get_region(b"k1"); + let mut ctx = Context::default(); + ctx.set_region_id(region.id); + ctx.set_region_epoch(region.get_region_epoch().clone()); + ctx.set_peer(cluster.leader_of_region(region.id).unwrap()); + + let do_prewrite = |key: &[u8], val: &[u8]| { + // prewrite + let (prewrite_tx, prewrite_rx) = channel(); + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(key), val.to_vec())], + key.to_vec(), + 10.into(), + 100, + false, + 2, + TimeStamp::default(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, 
+ ctx.clone(), + ), + Box::new(move |res: storage::Result<_>| { + let _ = prewrite_tx.send(res); + }), + ) + .unwrap(); + prewrite_rx.recv_timeout(Duration::from_secs(2)) + }; + + let (sender, receiver) = channel(); + let priority_queue_sender = Mutex::new(sender.clone()); + let single_queue_sender = Mutex::new(sender); + fail::cfg_callback("priority_pool_task", move || { + let sender = priority_queue_sender.lock().unwrap(); + sender.send("priority_queue").unwrap(); + }) + .unwrap(); + fail::cfg_callback("single_queue_pool_task", move || { + let sender = single_queue_sender.lock().unwrap(); + sender.send("single_queue").unwrap(); + }) + .unwrap(); + + // Default is use single queue + assert_eq!(do_prewrite(b"k1", b"v1").is_ok(), true); + assert_eq!( + receiver.recv_timeout(Duration::from_millis(500)).unwrap(), + "single_queue" + ); + + // Add group use priority queue + use kvproto::resource_manager::{GroupMode, GroupRequestUnitSettings, ResourceGroup}; + let mut group = ResourceGroup::new(); + group.set_name("rg1".to_string()); + group.set_mode(GroupMode::RuMode); + let mut ru_setting = GroupRequestUnitSettings::new(); + ru_setting.mut_r_u().mut_settings().set_fill_rate(100000); + group.set_r_u_settings(ru_setting); + resource_manager.add_resource_group(group); + thread::sleep(Duration::from_millis(200)); + assert_eq!(do_prewrite(b"k2", b"v2").is_ok(), true); + assert_eq!( + receiver.recv_timeout(Duration::from_millis(500)).unwrap(), + "priority_queue" + ); + + // Delete group use single queue + resource_manager.remove_resource_group("rg1"); + thread::sleep(Duration::from_millis(200)); + assert_eq!(do_prewrite(b"k3", b"v3").is_ok(), true); + assert_eq!( + receiver.recv_timeout(Duration::from_millis(500)).unwrap(), + "single_queue" + ); + + // Scale pool size + let scheduler = storage.get_scheduler(); + let pool = scheduler.get_sched_pool(); + assert_eq!(pool.get_pool_size(CommandPri::Normal), 1); + pool.scale_pool_size(2); + 
assert_eq!(pool.get_pool_size(CommandPri::Normal), 2); +} + #[test] fn test_pipelined_pessimistic_lock() { let rockskv_async_write_fp = "rockskv_async_write"; From 7d7786356a2e3cae03793542a6a3772fa8a8a348 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 18 Apr 2023 23:15:19 -0700 Subject: [PATCH 644/676] fix engine type config on existing cluster (#14603) ref tikv/tikv#12842 Adjust engine type config before running tikv server. Otherwise it will be too late. Fix some other compile warnings. Signed-off-by: qi.xu Signed-off-by: tonyxuqqi Co-authored-by: qi.xu --- cmd/tikv-server/src/main.rs | 7 +++++++ src/storage/config.rs | 10 +++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index 1d846d72bdb..e64afdf1868 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -210,6 +210,13 @@ fn main() { process::exit(0); } + // engine config needs to be validated + // so that it can adjust the engine type before too late + if let Err(e) = config.storage.validate_engine_type() { + println!("invalid storage.engine configuration: {}", e); + process::exit(1) + } + match config.storage.engine { EngineType::RaftKv => server::server::run_tikv(config), EngineType::RaftKv2 => server::server2::run_tikv(config), diff --git a/src/storage/config.rs b/src/storage/config.rs index 63250176694..9fc052e0ee0 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -113,7 +113,11 @@ impl Default for Config { } impl Config { - fn validate_engine_type(&mut self) -> Result<(), Box> { + pub fn validate_engine_type(&mut self) -> Result<(), Box> { + if self.data_dir != DEFAULT_DATA_DIR { + self.data_dir = config::canonicalize_path(&self.data_dir)? 
+ } + let v1_kv_db_path = config::canonicalize_sub_path(&self.data_dir, DEFAULT_ROCKSDB_SUB_DIR).unwrap(); let v2_tablet_path = @@ -144,10 +148,6 @@ impl Config { } pub fn validate(&mut self) -> Result<(), Box> { - if self.data_dir != DEFAULT_DATA_DIR { - self.data_dir = config::canonicalize_path(&self.data_dir)? - } - self.validate_engine_type()?; if self.scheduler_concurrency > MAX_SCHED_CONCURRENCY { From dd8322ce7d1a6119780ca2f67e84ede0db9040fd Mon Sep 17 00:00:00 2001 From: 3pointer Date: Wed, 19 Apr 2023 15:21:19 +0800 Subject: [PATCH 645/676] raftstore-v2: adapt catch up new peer for pending peers after split (#14549) close tikv/tikv#14572 Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/operation/life.rs | 65 +++++++++++++++++- components/raftstore-v2/src/operation/mod.rs | 2 +- components/raftstore-v2/src/operation/pd.rs | 27 ++++++-- .../raftstore-v2/src/operation/ready/mod.rs | 9 +++ components/raftstore-v2/src/raft/peer.rs | 68 +++++++++++++++++-- components/test_pd_client/src/pd.rs | 1 + .../raftstore/test_split_region.rs | 34 ++++++++++ 7 files changed, 194 insertions(+), 12 deletions(-) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 7c7d1f37275..65e7ab7906a 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -36,9 +36,12 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{util, Transport, WriteTask}; +use raftstore::store::{metrics::RAFT_PEER_PENDING_DURATION, util, Transport, WriteTask}; use slog::{debug, error, info, warn}; -use tikv_util::store::find_peer; +use tikv_util::{ + store::find_peer, + time::{duration_to_sec, Instant}, +}; use super::command::SplitInit; use crate::{ @@ -107,6 +110,64 @@ impl DestroyProgress { } } +#[derive(Default)] +pub struct AbnormalPeerContext { + 
/// Record the instants of peers being added into the configuration. + /// Remove them after they are not pending any more. + /// (u64, Instant) represents (peer id, time when peer starts pending) + pending_peers: Vec<(u64, Instant)>, + /// A inaccurate cache about which peer is marked as down. + down_peers: Vec, +} + +impl AbnormalPeerContext { + #[inline] + pub fn is_empty(&self) -> bool { + self.pending_peers.is_empty() && self.down_peers.is_empty() + } + + #[inline] + pub fn reset(&mut self) { + self.pending_peers.clear(); + self.down_peers.clear(); + } + + #[inline] + pub fn down_peers(&self) -> &[u64] { + &self.down_peers + } + + #[inline] + pub fn down_peers_mut(&mut self) -> &mut Vec { + &mut self.down_peers + } + + #[inline] + pub fn pending_peers(&self) -> &[(u64, Instant)] { + &self.pending_peers + } + + #[inline] + pub fn pending_peers_mut(&mut self) -> &mut Vec<(u64, Instant)> { + &mut self.pending_peers + } + + #[inline] + pub fn retain_pending_peers(&mut self, f: impl FnMut(&mut (u64, Instant)) -> bool) -> bool { + let len = self.pending_peers.len(); + self.pending_peers.retain_mut(f); + len != self.pending_peers.len() + } + + #[inline] + pub fn flush_metrics(&self) { + let _ = self.pending_peers.iter().map(|(_, pending_after)| { + let elapsed = duration_to_sec(pending_after.saturating_elapsed()); + RAFT_PEER_PENDING_DURATION.observe(elapsed); + }); + } +} + #[derive(Default)] pub struct GcPeerContext { confirmed_ids: Vec, diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 3511a432c15..f5eb4ebdb6f 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -14,7 +14,7 @@ pub use command::{ SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX, }; -pub use life::{DestroyProgress, GcPeerContext}; +pub use life::{AbnormalPeerContext, DestroyProgress, 
GcPeerContext}; pub use ready::{ write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, StateStorage, }; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 4bb6a06c162..f45cae390da 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -8,8 +8,8 @@ use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::{metrics::STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC, Transport}; -use slog::error; -use tikv_util::slog_panic; +use slog::{debug, error}; +use tikv_util::{slog_panic, time::Instant}; use crate::{ batch::StoreContext, @@ -122,7 +122,7 @@ impl Peer { } /// Collects all pending peers and update `peers_start_pending_time`. - fn collect_pending_peers(&self, ctx: &StoreContext) -> Vec { + fn collect_pending_peers(&mut self, ctx: &StoreContext) -> Vec { let mut pending_peers = Vec::with_capacity(self.region().get_peers().len()); let status = self.raft_group().status(); let truncated_idx = self @@ -135,9 +135,10 @@ impl Peer { return pending_peers; } - // TODO: update `peers_start_pending_time`. 
+ self.abnormal_peer_context().flush_metrics(); let progresses = status.progress.unwrap().iter(); + let mut peers_start_pending_time = Vec::with_capacity(self.region().get_peers().len()); for (&id, progress) in progresses { if id == self.peer_id() { continue; @@ -156,6 +157,21 @@ impl Peer { if progress.matched < truncated_idx { if let Some(p) = self.peer_from_cache(id) { pending_peers.push(p); + if !self + .abnormal_peer_context() + .pending_peers() + .iter() + .any(|p| p.0 == id) + { + let now = Instant::now(); + peers_start_pending_time.push((id, now)); + debug!( + self.logger, + "peer start pending"; + "get_peer_id" => id, + "time" => ?now, + ); + } } else { if ctx.cfg.dev_assert { slog_panic!( @@ -172,6 +188,9 @@ impl Peer { } } } + self.abnormal_peer_context_mut() + .pending_peers_mut() + .append(&mut peers_start_pending_time); pending_peers } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index a9e72f02f8e..1222310d9a6 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -290,6 +290,7 @@ impl Peer { // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. let from_peer = msg.take_from_peer(); + let from_peer_id = from_peer.get_id(); if self.is_leader() && from_peer.get_id() != INVALID_ID { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } @@ -317,6 +318,14 @@ impl Peer { } } + // There are two different cases to check peers can be bring back. + // 1. If the peer is pending, then only AppendResponse can bring it back to up. + // 2. If the peer is down, then HeartbeatResponse and AppendResponse can bring + // it back to up. 
+ if self.any_new_peer_catch_up(from_peer_id) { + self.region_heartbeat_pd(ctx) + } + self.set_has_ready(); } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 494ae183da6..d35dfe22184 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -20,20 +20,22 @@ use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ fsm::ApplyMetrics, + metrics::RAFT_PEER_PENDING_DURATION, util::{Lease, RegionReadProgress}, Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, TabletSnapManager, WriteTask, }, }; -use slog::{info, Logger}; -use tikv_util::slog_panic; +use slog::{debug, info, Logger}; +use tikv_util::{slog_panic, time::duration_to_sec}; use super::storage::Storage; use crate::{ fsm::ApplyScheduler, operation::{ - AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, GcPeerContext, - MergeContext, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, TxnContext, + AbnormalPeerContext, AsyncWriter, BucketStatsInfo, CompactLogContext, DestroyProgress, + GcPeerContext, MergeContext, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + TxnContext, }, router::{ApplyTask, CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -115,6 +117,8 @@ pub struct Peer { pending_messages: Vec, gc_peer_context: GcPeerContext, + + abnormal_peer_context: AbnormalPeerContext, } impl Peer { @@ -199,6 +203,7 @@ impl Peer { ), pending_messages: vec![], gc_peer_context: GcPeerContext::default(), + abnormal_peer_context: AbnormalPeerContext::default(), }; // If this region has only one peer and I am the one, campaign directly. 
@@ -561,8 +566,9 @@ impl Peer { ) } - pub fn collect_down_peers(&self, max_duration: Duration) -> Vec { + pub fn collect_down_peers(&mut self, max_duration: Duration) -> Vec { let mut down_peers = Vec::new(); + let mut down_peer_ids = Vec::new(); let now = Instant::now(); for p in self.region().get_peers() { if p.get_id() == self.peer_id() { @@ -575,9 +581,11 @@ impl Peer { stats.set_peer(p.clone()); stats.set_down_seconds(elapsed.as_secs()); down_peers.push(stats); + down_peer_ids.push(p.get_id()); } } } + *self.abnormal_peer_context_mut().down_peers_mut() = down_peer_ids; // TODO: `refill_disk_full_peers` down_peers } @@ -873,4 +881,54 @@ impl Peer { Err(e) => slog_panic!(self.logger, "failed to load term"; "index" => idx, "err" => ?e), } } + + #[inline] + pub fn abnormal_peer_context_mut(&mut self) -> &mut AbnormalPeerContext { + &mut self.abnormal_peer_context + } + + #[inline] + pub fn abnormal_peer_context(&self) -> &AbnormalPeerContext { + &self.abnormal_peer_context + } + + pub fn any_new_peer_catch_up(&mut self, from_peer_id: u64) -> bool { + // no pending or down peers + if self.abnormal_peer_context.is_empty() { + return false; + } + if !self.is_leader() { + self.abnormal_peer_context.reset(); + return false; + } + + if self + .abnormal_peer_context + .down_peers() + .contains(&from_peer_id) + { + return true; + } + + let logger = self.logger.clone(); + self.abnormal_peer_context + .retain_pending_peers(|(peer_id, pending_after)| { + // TODO check wait data peers here + let truncated_idx = self.raft_group.store().entry_storage().truncated_index(); + if let Some(progress) = self.raft_group.raft.prs().get(*peer_id) { + if progress.matched >= truncated_idx { + let elapsed = duration_to_sec(pending_after.saturating_elapsed()); + RAFT_PEER_PENDING_DURATION.observe(elapsed); + debug!( + logger, + "peer has caught up logs"; + "from_peer_id" => %from_peer_id, + "takes" => %elapsed, + ); + return false; + } + } + true + }) + } } diff --git 
a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index d3bbce685c0..1c2cc573eb9 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -439,6 +439,7 @@ struct PdCluster { // region id -> leader leaders: HashMap, down_peers: HashMap, + // peer id -> peer pending_peers: HashMap, is_bootstraped: bool, diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 2673a34b0d2..48b226ba40e 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -1225,3 +1225,37 @@ fn test_gen_split_check_bucket_ranges() { // the bucket_ranges should be None to refresh the bucket cluster.send_half_split_region_message(®ion, None); } + +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_catch_up_peers_after_split() { + let mut cluster = new_cluster(0, 3); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.run(); + + let left_key = b"k1"; + let right_key = b"k3"; + let split_key = b"k2"; + cluster.must_put(left_key, b"v1"); + cluster.must_put(right_key, b"v3"); + + // Left and right key must be in same region before split. + let region = pd_client.get_region(left_key).unwrap(); + let region2 = pd_client.get_region(right_key).unwrap(); + assert_eq!(region.get_id(), region2.get_id()); + + // Split with split_key, so left_key must in left, and right_key in right. + cluster.must_split(®ion, split_key); + + // Get new split region by right_key because default right_derive is false. + let right_region = pd_client.get_region(right_key).unwrap(); + + let pending_peers = pd_client.get_pending_peers(); + + // Ensure new split region has no pending peers. 
+ for p in right_region.get_peers() { + assert!(!pending_peers.contains_key(&p.id)) + } +} From a4000f655fcfca9d2fbdf6932b46e3995c420bb9 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Wed, 19 Apr 2023 17:15:19 +0800 Subject: [PATCH 646/676] snapshot-v2: delete idle snapshot file (#14590) close tikv/tikv#14581 1. change `delete_snapshot` to delete some snapshot directory those has same region_id and to_peer_id 2. the delete_snapshot will be happened in peer destory or send_snapshot finished. Signed-off-by: bufferflies <1045931706@qq.com> --- components/file_system/src/lib.rs | 12 +++++- components/raftstore/src/store/snap.rs | 47 ++++++----------------- src/server/server.rs | 3 +- src/server/tablet_snap.rs | 12 +++++- tests/integrations/raftstore/test_snap.rs | 24 +++++++++++- 5 files changed, 57 insertions(+), 41 deletions(-) diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 413a4ef827e..a3701c6ecac 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -476,9 +476,14 @@ pub fn trash_dir_all(path: impl AsRef) -> io::Result<()> { /// to resume all those removal in the given directory. #[inline] pub fn clean_up_trash(path: impl AsRef) -> io::Result<()> { + clean_up_dir(path, TRASH_PREFIX) +} + +/// clean up all files starts with the given prefix in the given directory. +pub fn clean_up_dir(path: impl AsRef, prefix: &str) -> io::Result<()> { for e in read_dir(path)? 
{ let e = e?; - if e.file_name().to_string_lossy().starts_with(TRASH_PREFIX) { + if e.file_name().to_string_lossy().starts_with(prefix) { remove_dir_all(e.path())?; } } @@ -680,5 +685,10 @@ mod tests { assert!(trash_sub_dir0.exists()); clean_up_trash(data_path).unwrap(); assert!(!trash_sub_dir0.exists()); + + create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + clean_up_dir(data_path, "sub").unwrap(); + assert!(!sub_dir0.exists()); } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 091609cf63e..68d3c7fba51 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1408,12 +1408,6 @@ impl SnapManager { } pub fn init(&self) -> io::Result<()> { - self.init_core()?; - self.tablet_snap_manager.init()?; - Ok(()) - } - - fn init_core(&self) -> io::Result<()> { let enc_enabled = self.core.encryption_key_manager.is_some(); info!( "Initializing SnapManager, encryption is enabled: {}", @@ -1632,7 +1626,7 @@ impl SnapManager { /// NOTE: don't call it in raftstore thread. 
pub fn get_total_snap_size(&self) -> Result { let size_v1 = self.core.get_total_snap_size()?; - let size_v2 = self.tablet_snap_manager.total_snap_size()?; + let size_v2 = self.tablet_snap_manager.total_snap_size().unwrap_or(0); Ok(size_v1 + size_v2) } @@ -1918,10 +1912,11 @@ impl SnapManagerBuilder { u64::MAX }; let path = path.into(); + assert!(!path.is_empty()); let mut path_v2 = path.clone(); - // the path for tablet snap manager, it will be empty if the cluster is not - // to receive snapshot from cluster of raftstore-v2 path_v2.push_str("_v2"); + let tablet_snap_mgr = TabletSnapManager::new(&path_v2).unwrap(); + let mut snapshot = SnapManager { core: SnapManagerCore { base: path, @@ -1936,7 +1931,7 @@ impl SnapManagerBuilder { stats: Default::default(), }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), - tablet_snap_manager: TabletSnapManager::new_without_init(&path_v2), + tablet_snap_manager: tablet_snap_mgr, }; snapshot.set_max_per_file_size(self.max_per_file_size); // set actual max_per_file_size snapshot @@ -2007,7 +2002,6 @@ pub struct TabletSnapManager { impl TabletSnapManager { pub fn new>(path: T) -> io::Result { - // Initialize the directory if it doesn't exist. 
let path = path.into(); if !path.exists() { file_system::create_dir_all(&path)?; @@ -2018,6 +2012,7 @@ impl TabletSnapManager { format!("{} should be a directory", path.display()), )); } + file_system::clean_up_dir(&path, SNAP_GEN_PREFIX)?; file_system::clean_up_trash(&path)?; Ok(Self { base: path, @@ -2026,29 +2021,6 @@ impl TabletSnapManager { }) } - pub fn new_without_init>(path: T) -> Self { - let path = path.into(); - Self { - base: path, - receiving: Arc::default(), - stats: Arc::default(), - } - } - - pub fn init(&self) -> io::Result<()> { - if !self.base.exists() { - file_system::create_dir_all(&self.base)?; - } - if !self.base.is_dir() { - return Err(io::Error::new( - ErrorKind::Other, - format!("{} should be a directory", self.base.display()), - )); - } - file_system::clean_up_trash(&self.base)?; - Ok(()) - } - pub fn begin_snapshot(&self, key: TabletSnapKey, start: Instant, generate_duration_sec: u64) { let mut stat = SnapshotStat::default(); stat.set_generate_duration_sec(generate_duration_sec); @@ -3163,9 +3135,12 @@ pub mod tests { assert!(mgr.stats().stats.is_empty()); // filter out the total duration seconds less than one sencond. 
- mgr.begin_snapshot(key.clone(), start, 1); - mgr.finish_snapshot(key, start); + let path = mgr.tablet_gen_path(&key); + std::fs::create_dir_all(&path).unwrap(); + assert!(path.exists()); + mgr.delete_snapshot(&key); assert_eq!(mgr.stats().stats.len(), 0); + assert!(!path.exists()); } #[test] diff --git a/src/server/server.rs b/src/server/server.rs index 8e1a33880d6..6e294eda45e 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -579,6 +579,7 @@ mod tests { ); let addr = Arc::new(Mutex::new(None)); let (check_leader_scheduler, _) = tikv_util::worker::dummy_scheduler(); + let path = tempfile::TempDir::new().unwrap(); let mut server = Server::new( mock_store_id, &cfg, @@ -590,7 +591,7 @@ mod tests { quick_fail: Arc::clone(&quick_fail), addr: Arc::clone(&addr), }, - Either::Left(SnapManager::new("")), + Either::Left(SnapManager::new(path.path().to_str().unwrap())), gc_worker, check_leader_scheduler, env, diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index cbcd1a228f8..07a85109006 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -640,7 +640,10 @@ pub fn send_snap( ); let deregister = { let (mgr, key) = (mgr.clone(), key.clone()); - DeferContext::new(move || mgr.finish_snapshot(key.clone(), timer)) + DeferContext::new(move || { + mgr.finish_snapshot(key.clone(), timer); + mgr.delete_snapshot(&key); + }) }; let cb = ChannelBuilder::new(env) @@ -662,7 +665,6 @@ pub fn send_snap( send_timer.observe_duration(); drop(client); drop(deregister); - mgr.delete_snapshot(&key); match recv_result { None => Ok(SendStat { key, @@ -807,6 +809,12 @@ where let region_id = msg.get_region_id(); if self.sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit { + let key = TabletSnapKey::from_region_snap( + msg.get_region_id(), + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot(), + ); + self.snap_mgr.delete_snapshot(&key); warn!( "too many sending snapshot tasks, drop Send Snap[to: {}, snap: {:?}]", 
addr, msg diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index f3bd7583ab3..3171aaa1a9e 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -782,6 +782,7 @@ fn test_v1_receive_snap_from_v2() { let test_receive_snap = |key_num| { let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); + let mut cluster_v1_tikv = test_raftstore::new_server_cluster(1, 1); cluster_v1 .cfg @@ -791,8 +792,10 @@ fn test_v1_receive_snap_from_v2() { cluster_v1.run(); cluster_v2.run(); + cluster_v1_tikv.run(); let s1_addr = cluster_v1.get_addr(1); + let s2_addr = cluster_v1_tikv.get_addr(1); let region = cluster_v2.get_region(b""); let region_id = region.get_id(); let engine = cluster_v2.get_engine(1); @@ -810,10 +813,29 @@ fn test_v1_receive_snap_from_v2() { let limit = Limiter::new(f64::INFINITY); let env = Arc::new(Environment::new(1)); let _ = block_on(async { - send_snap_v2(env, snap_mgr, security_mgr, &cfg, &s1_addr, msg, limit) + send_snap_v2( + env.clone(), + snap_mgr.clone(), + security_mgr.clone(), + &cfg, + &s1_addr, + msg.clone(), + limit.clone(), + ) + .unwrap() + .await + }); + let send_result = block_on(async { + send_snap_v2(env, snap_mgr, security_mgr, &cfg, &s2_addr, msg, limit) .unwrap() .await }); + // snapshot should be rejected by cluster v1 tikv, and the snapshot should be + // deleted. 
+ assert!(send_result.is_err()); + let dir = cluster_v2.get_snap_dir(1); + let read_dir = std::fs::read_dir(dir).unwrap(); + assert_eq!(0, read_dir.count()); // The snapshot has been received by cluster v1, so check it's completeness let snap_mgr = cluster_v1.get_snap_mgr(1); From fdc01d17d43e6ce6848f661cb55180c022ddfde6 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 19 Apr 2023 17:35:19 +0800 Subject: [PATCH 647/676] raftstore-v2: memtable pre-flush should be async (#14567) ref tikv/tikv#14566 memtable pre-flush should be async Signed-off-by: Spade A Co-authored-by: Xinye Tao Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 35 ++---- components/raftstore-v2/src/lib.rs | 2 +- .../operation/command/admin/compact_log.rs | 24 ++-- .../src/operation/command/admin/mod.rs | 53 ++++++-- .../src/operation/command/admin/split.rs | 40 +++--- .../src/operation/command/write/ingest.rs | 8 +- .../raftstore-v2/src/operation/ready/mod.rs | 35 +++--- .../src/operation/ready/snapshot.rs | 6 +- components/raftstore-v2/src/worker/mod.rs | 3 +- .../src/worker/{tablet_gc.rs => tablet.rs} | 65 +++++++++- .../raftstore-v2/src/worker/tablet_flush.rs | 116 ------------------ components/txn_types/src/types.rs | 2 +- 12 files changed, 171 insertions(+), 218 deletions(-) rename components/raftstore-v2/src/worker/{tablet_gc.rs => tablet.rs} (85%) delete mode 100644 components/raftstore-v2/src/worker/tablet_flush.rs diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 14282cc09f9..1e72341d651 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -43,7 +43,7 @@ use tikv_util::{ sys::SysQuota, time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, - worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, 
YatpPoolBuilder}, Either, }; @@ -54,7 +54,7 @@ use crate::{ operation::{SharedReadTablet, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::{pd, tablet_flush, tablet_gc}, + worker::{pd, tablet}, Error, Result, }; @@ -472,9 +472,8 @@ where pub struct Schedulers { pub read: Scheduler>, pub pd: Scheduler, - pub tablet_gc: Scheduler>, + pub tablet: Scheduler>, pub write: WriteSenders, - pub tablet_flush: Scheduler, // Following is not maintained by raftstore itself. pub split_check: Scheduler, @@ -484,7 +483,7 @@ impl Schedulers { fn stop(&self) { self.read.stop(); self.pd.stop(); - self.tablet_gc.stop(); + self.tablet.stop(); self.split_check.stop(); } } @@ -495,10 +494,9 @@ struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, pd: LazyWorker, - tablet_gc: Worker, + tablet: Worker, async_write: StoreWriters, purge: Option, - tablet_flush: Worker, // Following is not maintained by raftstore itself. 
background: Worker, @@ -506,16 +504,12 @@ struct Workers { impl Workers { fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { - let tablet_flush = WorkerBuilder::new("tablet-flush-worker") - .thread_count(2) - .create(); Self { async_read: Worker::new("async-read-worker"), pd, - tablet_gc: Worker::new("tablet-gc-worker"), + tablet: Worker::new("tablet-worker"), async_write: StoreWriters::new(None), purge, - tablet_flush, background, } } @@ -524,8 +518,7 @@ impl Workers { self.async_write.shutdown(); self.async_read.stop(); self.pd.stop(); - self.tablet_gc.stop(); - self.tablet_flush.stop(); + self.tablet.stop(); if let Some(w) = self.purge { w.stop(); } @@ -636,27 +629,21 @@ impl StoreSystem { ), ); - let tablet_gc_scheduler = workers.tablet_gc.start_with_timer( - "tablet-gc-worker", - tablet_gc::Runner::new( + let tablet_gc_scheduler = workers.tablet.start_with_timer( + "tablet-worker", + tablet::Runner::new( tablet_registry.clone(), sst_importer.clone(), self.logger.clone(), ), ); - let tablet_flush_scheduler = workers.tablet_flush.start( - "tablet-flush-worker", - tablet_flush::Runner::new(router.clone(), tablet_registry.clone(), self.logger.clone()), - ); - let schedulers = Schedulers { read: read_scheduler, pd: workers.pd.scheduler(), - tablet_gc: tablet_gc_scheduler, + tablet: tablet_gc_scheduler, write: workers.async_write.senders(), split_check: split_check_scheduler, - tablet_flush: tablet_flush_scheduler, }; let builder = StorePollerBuilder::new( diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 04745d01fbe..bcfaf383024 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -44,5 +44,5 @@ pub use operation::{write_initial_states, SimpleWriteBinary, SimpleWriteEncoder, pub use raftstore::{store::Config, Error, Result}; pub use worker::{ pd::{PdReporter, Task as PdTask}, - tablet_flush::Task as TabletFlushTask, + tablet::Task as TabletTask, }; diff --git 
a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 1cc9ccbb1c3..383b54aa3b4 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -34,7 +34,7 @@ use crate::{ operation::AdminCmdResult, raft::{Apply, Peer}, router::{CmdResChannel, PeerTick}, - worker::tablet_gc, + worker::tablet, }; #[derive(Debug)] @@ -44,7 +44,7 @@ pub struct CompactLogContext { last_applying_index: u64, /// Tombstone tablets can only be destroyed when the tablet that replaces it /// is persisted. This is a list of tablet index that awaits to be - /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// persisted. When persisted_apply is advanced, we need to notify tablet /// worker to destroy them. tombstone_tablets_wait_index: Vec, } @@ -303,8 +303,8 @@ impl Peer { .push(new_tablet_index); let _ = ctx .schedulers - .tablet_gc - .schedule(tablet_gc::Task::prepare_destroy( + .tablet + .schedule(tablet::Task::prepare_destroy( old_tablet, self.region_id(), new_tablet_index, @@ -330,8 +330,8 @@ impl Peer { .push(new_tablet_index); let _ = ctx .schedulers - .tablet_gc - .schedule(tablet_gc::Task::prepare_destroy_path( + .tablet + .schedule(tablet::Task::prepare_destroy_path( old_tablet, self.region_id(), new_tablet_index, @@ -381,14 +381,14 @@ impl Peer { }; let region_id = self.region_id(); let applied_index = self.entry_storage().applied_index(); - let sched = ctx.schedulers.tablet_gc.clone(); - let _ = sched.schedule(tablet_gc::Task::prepare_destroy( + let sched = ctx.schedulers.tablet.clone(); + let _ = sched.schedule(tablet::Task::prepare_destroy( tablet, self.region_id(), applied_index, )); task.persisted_cbs.push(Box::new(move || { - let _ = sched.schedule(tablet_gc::Task::destroy(region_id, applied_index)); + let _ = sched.schedule(tablet::Task::destroy(region_id, 
applied_index)); })); } @@ -506,14 +506,14 @@ impl Peer { } } if self.remove_tombstone_tablets(new_persisted) { - let sched = store_ctx.schedulers.tablet_gc.clone(); + let sched = store_ctx.schedulers.tablet.clone(); if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { - let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); })); } else { // In snapshot, the index is persisted, tablet can be destroyed directly. - let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + let _ = sched.schedule(tablet::Task::destroy(region_id, new_persisted)); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 28fceb2d95b..69c9b39aaa2 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -39,7 +39,11 @@ pub use split::{ use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; -use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; +use crate::{ + batch::StoreContext, + raft::Peer, + router::{CmdResChannel, PeerMsg, RaftRequest}, +}; #[derive(Debug)] pub enum AdminCmdResult { @@ -147,9 +151,9 @@ impl Peer { // the follower so that they can flush memtalbes in advance too. // // 2. When the task finishes, it will propose a batch split with - // `SPLIT_SECOND_PHASE` flag. + // `PRE_FLUSH_FINISHED` flag. 
if !WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) - .contains(WriteBatchFlags::SPLIT_SECOND_PHASE) + .contains(WriteBatchFlags::PRE_FLUSH_FINISHED) { if self.tablet_being_flushed() { return; @@ -161,14 +165,42 @@ impl Peer { self.logger, "Schedule flush tablet"; ); - if let Err(e) = ctx.schedulers.tablet_flush.schedule( - crate::TabletFlushTask::TabletFlush { + + let mailbox = match ctx.router.mailbox(region_id) { + Some(mailbox) => mailbox, + None => { + // None means the node is shutdown concurrently and thus the + // mailboxes in router have been cleared + assert!( + ctx.router.is_shutdown(), + "{} router should have been closed", + SlogFormat(&self.logger) + ); + return; + } + }; + + let logger = self.logger.clone(); + let on_flush_finish = move || { + req.mut_header() + .set_flags(WriteBatchFlags::PRE_FLUSH_FINISHED.bits()); + if let Err(e) = mailbox + .try_send(PeerMsg::AdminCommand(RaftRequest::new(req, ch))) + { + error!( + logger, + "send split request fail after pre-flush finished"; + "err" => ?e, + ); + } + }; + + if let Err(e) = + ctx.schedulers.tablet.schedule(crate::TabletTask::Flush { region_id, - req: Some(req), - is_leader: true, - ch: Some(ch), - }, - ) { + cb: Some(Box::new(on_flush_finish)), + }) + { error!( self.logger, "Fail to schedule flush task"; @@ -176,6 +208,7 @@ impl Peer { ) } + // Notify followers to flush their relevant memtables let peers = self.region().get_peers().to_vec(); for p in peers { if p == *self.peer() diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 0b53476273f..4c6fdad3aa2 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -62,7 +62,7 @@ use crate::{ operation::{AdminCmdResult, SharedReadTablet}, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, - worker::tablet_gc, + worker::tablet, Error, 
}; @@ -132,8 +132,8 @@ pub fn report_split_init_finish( if let Err(e) = ctx .schedulers - .tablet_gc - .schedule(tablet_gc::Task::direct_destroy_path(temp_split_path( + .tablet + .schedule(tablet::Task::direct_destroy_path(temp_split_path( &ctx.tablet_registry, finish_region_id, ))) @@ -641,16 +641,13 @@ impl Peer { } }; let tablet_index = res.tablet_index; - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - derived, - move || { - let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); - }, - )); + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + derived, + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); let last_region_id = res.regions.last().unwrap().get_id(); let mut new_ids = HashSet::default(); @@ -771,16 +768,13 @@ impl Peer { if self.storage().has_dirty_data() { let tablet_index = self.storage().tablet_index(); if let Some(mailbox) = store_ctx.router.mailbox(region_id) { - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - move || { - let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); - }, - )); + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); } else { // None means the node is shutdown concurrently and thus the // mailboxes in router have been cleared diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index 73459740393..bc15765437f 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -16,7 +16,7 @@ use crate::{ fsm::{ApplyResReporter, Store, 
StoreFsmDelegate}, raft::{Apply, Peer}, router::{PeerMsg, StoreTick}, - worker::tablet_gc, + worker::tablet, }; impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { @@ -53,7 +53,7 @@ impl Store { if let Err(TrySendError::Disconnected(msg)) = ctx.router.send(region_id, PeerMsg::CleanupImportSst(ssts.into())) && !ctx.router.is_shutdown() { let PeerMsg::CleanupImportSst(ssts) = msg else { unreachable!() }; - let _ = ctx.schedulers.tablet_gc.schedule(tablet_gc::Task::CleanupImportSst(ssts)); + let _ = ctx.schedulers.tablet.schedule(tablet::Task::CleanupImportSst(ssts)); } } @@ -75,8 +75,8 @@ impl Peer { } let _ = ctx .schedulers - .tablet_gc - .schedule(tablet_gc::Task::CleanupImportSst(stale_ssts.into())); + .tablet + .schedule(tablet::Task::CleanupImportSst(stale_ssts.into())); } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 1222310d9a6..9a29c705aff 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -56,7 +56,7 @@ use crate::{ fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, router::{PeerMsg, PeerTick}, - worker::tablet_gc, + worker::tablet, }; const PAUSE_FOR_RECOVERY_GAP: u64 = 128; @@ -119,16 +119,13 @@ impl Peer { let region_id = self.region_id(); let mailbox = store_ctx.router.mailbox(region_id).unwrap(); let tablet_index = self.storage().tablet_index(); - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - move || { - let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); - }, - )); + let _ = store_ctx.schedulers.tablet.schedule(tablet::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); } let entry_storage = self.storage().entry_storage(); let committed_index = 
entry_storage.commit_index(); @@ -219,15 +216,13 @@ impl Peer { if util::is_epoch_stale(region_epoch, self.region().get_region_epoch()) { return; } - let _ = - ctx.schedulers - .tablet_flush - .schedule(crate::TabletFlushTask::TabletFlush { - region_id: self.region().get_id(), - req: None, - is_leader: false, - ch: None, - }); + let _ = ctx + .schedulers + .tablet + .schedule(crate::worker::tablet::Task::Flush { + region_id: self.region().get_id(), + cb: None, + }); return; } ExtraMessageType::MsgWantRollbackMerge => { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 3db8590d7ed..12b4a97e710 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -49,7 +49,7 @@ use crate::{ operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, router::ApplyTask, - worker::tablet_gc, + worker::tablet, Result, StoreContext, }; @@ -282,8 +282,8 @@ impl Peer { if self.remove_tombstone_tablets(snapshot_index) { let _ = ctx .schedulers - .tablet_gc - .schedule(tablet_gc::Task::destroy(region_id, snapshot_index)); + .tablet + .schedule(tablet::Task::destroy(region_id, snapshot_index)); } } } diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index 121c41906d7..2fa7255afd3 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,5 +1,4 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
pub mod pd; -pub mod tablet_flush; -pub mod tablet_gc; +pub mod tablet; diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet.rs similarity index 85% rename from components/raftstore-v2/src/worker/tablet_gc.rs rename to components/raftstore-v2/src/worker/tablet.rs index 5799398c080..db09c4ba3be 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet.rs @@ -8,11 +8,12 @@ use std::{ }; use collections::HashMap; -use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; +use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry, DATA_CFS}; use kvproto::{import_sstpb::SstMeta, metapb::Region}; use slog::{debug, error, info, warn, Logger}; use sst_importer::SstImporter; use tikv_util::{ + time::Instant, worker::{Runnable, RunnableWithTimer}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, @@ -41,6 +42,14 @@ pub enum Task { DirectDestroy { tablet: Either }, /// Cleanup ssts. 
CleanupImportSst(Box<[SstMeta]>), + /// Flush memtable before split + /// + /// cb is some iff the task is sent from leader, it is used to really propose + /// split when flush finishes + Flush { + region_id: u64, + cb: Option>, + }, } impl Display for Task { @@ -77,6 +86,17 @@ impl Display for Task { Task::CleanupImportSst(ssts) => { write!(f, "cleanup import ssts {:?}", ssts) } + Task::Flush { + region_id, + cb: on_flush_finish, + } => { + write!( + f, + "flush tablet for region_id {}, is leader {}", + region_id, + on_flush_finish.is_some() + ) + } } } } @@ -160,7 +180,7 @@ impl Runner { waiting_destroy_tasks: HashMap::default(), pending_destroy_tasks: Vec::new(), background_pool: YatpPoolBuilder::new(DefaultTicker::default()) - .name_prefix("tablet-gc-bg") + .name_prefix("tablet-bg") .thread_count( 0, DEFAULT_BACKGROUND_POOL_SIZE, @@ -311,6 +331,46 @@ impl Runner { } } } + + fn flush_tablet(&self, region_id: u64, cb: Option>) { + let Some(Some(tablet)) = self + .tablet_registry + .get(region_id) + .map(|mut cache| cache.latest().cloned()) else {return}; + + // The callback `cb` being some means it's the task sent from + // leader, we should sync flush memtables and call it after the flush completes + // where the split will be proposed again with extra flag. + if let Some(cb) = cb { + let logger = self.logger.clone(); + let now = Instant::now(); + self.background_pool + .spawn(async move { + // sync flush for leader to let the flush happen before later checkpoint. 
+ tablet.flush_cfs(DATA_CFS, true).unwrap(); + let elapsed = now.saturating_elapsed(); + // to be removed after when it's stable + info!( + logger, + "flush memtable for leader"; + "region_id" => region_id, + "duration" => ?elapsed, + ); + + drop(tablet); + cb(); + }) + .unwrap(); + } else { + info!( + self.logger, + "flush memtable for follower"; + "region_id" => region_id, + ); + + tablet.flush_cfs(DATA_CFS, false).unwrap(); + } + } } impl Runnable for Runner @@ -338,6 +398,7 @@ where } => self.destroy(region_id, persisted_index), Task::DirectDestroy { tablet, .. } => self.direct_destroy(tablet), Task::CleanupImportSst(ssts) => self.cleanup_ssts(ssts), + Task::Flush { region_id, cb } => self.flush_tablet(region_id, cb), } } } diff --git a/components/raftstore-v2/src/worker/tablet_flush.rs b/components/raftstore-v2/src/worker/tablet_flush.rs deleted file mode 100644 index e7d2c534f80..00000000000 --- a/components/raftstore-v2/src/worker/tablet_flush.rs +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. - -use std::fmt::{Display, Formatter}; - -use engine_traits::{KvEngine, RaftEngine, TabletRegistry, DATA_CFS}; -use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; -use slog::{error, info, Logger}; -use tikv_util::{time::Instant, worker::Runnable}; -use txn_types::WriteBatchFlags; - -use crate::{ - router::{CmdResChannel, PeerMsg, RaftRequest}, - StoreRouter, -}; - -pub enum Task { - TabletFlush { - region_id: u64, - req: Option, - is_leader: bool, - ch: Option, - }, -} - -impl Display for Task { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Task::TabletFlush { region_id, .. 
} => { - write!(f, "Flush tablet before split for region {}", region_id) - } - } - } -} - -pub struct Runner { - router: StoreRouter, - tablet_registry: TabletRegistry, - logger: Logger, -} - -impl Runner { - pub fn new( - router: StoreRouter, - tablet_registry: TabletRegistry, - logger: Logger, - ) -> Self { - Self { - router, - tablet_registry, - logger, - } - } - - fn flush_tablet( - &mut self, - region_id: u64, - req: Option, - is_leader: bool, - ch: Option, - ) { - let Some(Some(tablet)) = self - .tablet_registry - .get(region_id) - .map(|mut cache| cache.latest().cloned()) else {return}; - let now = Instant::now(); - tablet.flush_cfs(DATA_CFS, true).unwrap(); - let elapsed = now.saturating_elapsed(); - // to be removed after when it's stable - info!( - self.logger, - "flush memtable time consumes"; - "region_id" => region_id, - "duration" => ?elapsed, - "is_leader" => is_leader, - ); - - if !is_leader { - return; - } - - let mut req = req.unwrap(); - assert!(req.get_admin_request().get_cmd_type() == AdminCmdType::BatchSplit); - req.mut_header() - .set_flags(WriteBatchFlags::SPLIT_SECOND_PHASE.bits()); - if let Err(e) = self.router.send( - region_id, - PeerMsg::AdminCommand(RaftRequest::new(req, ch.unwrap())), - ) { - error!( - self.logger, - "send split request fail in the second phase"; - "region_id" => region_id, - "err" => ?e, - ); - } - } -} - -impl Runnable for Runner -where - EK: KvEngine, - ER: RaftEngine, -{ - type Task = Task; - - fn run(&mut self, task: Self::Task) { - match task { - Task::TabletFlush { - region_id, - req, - is_leader, - ch, - } => self.flush_tablet(region_id, req, is_leader, ch), - } - } -} diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 23df1a89940..a83a68c7ba6 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -571,7 +571,7 @@ bitflags! { /// Indicates this request is a flashback transaction. 
const FLASHBACK = 0b00001000; /// Indicates the relevant tablet has been flushed, and we can propose split now. - const SPLIT_SECOND_PHASE = 0b00010000; + const PRE_FLUSH_FINISHED = 0b00010000; } } From 56f5d93e5b2654a55d437bef26aa0b7c367baf68 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 19 Apr 2023 22:55:20 +0800 Subject: [PATCH 648/676] raftstore-v2: add capture tests (#14587) ref tikv/tikv#14542 raftstore-v2: add capture tests Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/raftstore-v2/Cargo.toml | 1 + .../raftstore-v2/src/operation/command/mod.rs | 4 +- .../src/operation/query/capture.rs | 264 +++++++++++++++++- components/raftstore/src/coprocessor/mod.rs | 7 +- .../raftstore/src/store/simple_write.rs | 117 +++++++- components/server/src/server2.rs | 2 + src/config/mod.rs | 10 + 8 files changed, 396 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 10d3a7f37eb..1f0011894b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4426,6 +4426,7 @@ dependencies = [ "collections", "concurrency_manager", "crossbeam", + "engine_rocks", "engine_test", "engine_traits", "error_code", diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 3dfeb512980..ad13ea5ab74 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -65,6 +65,7 @@ txn_types = { workspace = true } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] +engine_rocks = { workspace = true } engine_test = { workspace = true } slog-global = { workspace = true } tempfile = "3.0" diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index ce4a415cf00..2f2df5a0333 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -99,8 +99,8 @@ pub(crate) fn parse_at( pub struct CommittedEntries { /// Entries need to be 
applied. Note some entries may not be included for /// flow control. - entry_and_proposals: Vec<(Entry, Vec)>, - committed_time: Instant, + pub entry_and_proposals: Vec<(Entry, Vec)>, + pub committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 94b58f41809..5fdbde187e4 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -1,6 +1,6 @@ // Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. -use std::{mem, sync::Arc}; +use std::sync::Arc; use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; @@ -10,7 +10,7 @@ use raftstore::{ store::{ cmd_resp, fsm::{ - apply::{notify_stale_req_with_msg, ObserverType}, + apply::{notify_stale_req_with_msg, ObserverType, SHRINK_PENDING_CMD_QUEUE_CAP}, new_read_index_request, ChangeObserver, }, msg::ErrorCallback, @@ -152,10 +152,264 @@ impl Apply { } let region_id = self.region_id(); - let mut cmd_batch = CmdBatch::new(&self.observe().info, region_id); - let cmds = mem::take(&mut self.observe_mut().cmds); - cmd_batch.extend(&self.observe().info, region_id, cmds); + let observe = self.observe_mut(); + let mut cmd_batch = CmdBatch::new(&observe.info, region_id); + cmd_batch.extend(&observe.info, region_id, observe.cmds.drain(..)); + if observe.cmds.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { + observe.cmds.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); + } self.coprocessor_host() .on_flush_applied_cmd_batch(level, vec![cmd_batch], self.tablet()); } } + +#[cfg(test)] +mod test { + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, Mutex, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{ + FlushState, Peekable, TabletContext, TabletRegistry, CF_DEFAULT, DATA_CFS, + }; + use 
futures::executor::block_on; + use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_cmdpb::RaftRequestHeader, + raft_serverpb::{PeerState, RegionLocalState}, + }; + use raft::{ + prelude::{Entry, EntryType}, + StateRole, + }; + use raftstore::{ + coprocessor::{BoxCmdObserver, CmdObserver, CoprocessorHost}, + store::Config, + }; + use slog::o; + use tempfile::TempDir; + use tikv_util::{store::new_peer, time::Instant, worker::dummy_scheduler}; + + use super::*; + use crate::{ + fsm::ApplyResReporter, + operation::{ + test_util::create_tmp_importer, CatchUpLogs, CommittedEntries, SimpleWriteReqEncoder, + }, + raft::Apply, + router::{build_any_channel, ApplyRes}, + SimpleWriteEncoder, + }; + + struct MockReporter { + sender: Sender, + } + + impl MockReporter { + fn new() -> (Self, Receiver) { + let (tx, rx) = channel(); + (MockReporter { sender: tx }, rx) + } + } + + impl ApplyResReporter for MockReporter { + fn report(&self, apply_res: ApplyRes) { + let _ = self.sender.send(apply_res); + } + + fn redirect_catch_up_logs(&self, _c: CatchUpLogs) {} + } + + #[derive(Clone)] + struct TestObserver { + sender: Sender>, + } + + impl TestObserver { + fn new() -> (Self, Receiver>) { + let (tx, rx) = channel(); + (TestObserver { sender: tx }, rx) + } + } + + impl raftstore::coprocessor::Coprocessor for TestObserver {} + impl CmdObserver for TestObserver { + fn on_flush_applied_cmd_batch( + &self, + _max_level: ObserveLevel, + cmd_batches: &mut Vec, + _engine: &E, + ) { + self.sender.send(cmd_batches.clone()).unwrap(); + } + + fn on_applied_current_term(&self, _: StateRole, _: &Region) {} + } + + fn new_put_entry( + region_id: u64, + region_epoch: RegionEpoch, + k: &[u8], + v: &[u8], + term: u64, + index: u64, + ) -> Entry { + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, k, v); + let mut header = Box::::default(); + header.set_region_id(region_id); + header.set_region_epoch(region_epoch); + let req_encoder = 
SimpleWriteReqEncoder::new(header, encoder.encode(), 512, false); + let (bin, _) = req_encoder.encode(); + let mut e = Entry::default(); + e.set_entry_type(EntryType::EntryNormal); + e.set_term(term); + e.set_index(index); + e.set_data(bin.into()); + e + } + + #[test] + fn test_capture_apply() { + let store_id = 2; + + let mut region = Region::default(); + region.set_id(1); + region.set_end_key(b"k20".to_vec()); + region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3)]; + region.set_peers(peers.into()); + + let logger = slog_global::borrow_global().new(o!()); + let path = TempDir::new().unwrap(); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + let ctx = TabletContext::new(®ion, Some(5)); + reg.load(ctx, true).unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region.clone()); + region_state.set_tablet_index(5); + + let (read_scheduler, _rx) = dummy_scheduler(); + let (reporter, _) = MockReporter::new(); + let (_tmp_dir, importer) = create_tmp_importer(); + let (ob, cmds_rx) = TestObserver::new(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_cmd_observer(0, BoxCmdObserver::new(ob)); + let mut apply = Apply::new( + &Config::default(), + region + .get_peers() + .iter() + .find(|p| p.store_id == store_id) + .unwrap() + .clone(), + region_state, + reporter, + reg, + read_scheduler, + Arc::new(FlushState::new(5)), + None, + 5, + None, + importer, + host, + logger.clone(), + ); + + let snap = Arc::new(Mutex::new(None)); + let snap_ = snap.clone(); + let (snap_cb, _) = build_any_channel(Box::new(move |args| { + let snap = args.1.take().unwrap(); + let snapshot: RegionSnapshot = match snap.downcast() { + Ok(s) => *s, + Err(t) => 
unreachable!("snapshot type should be the same: {:?}", t), + }; + *snap_.lock().unwrap() = Some(snapshot); + })); + + // put (k1, v1); + // capture_apply; + // put (k2, v2); + let apply_tasks = vec![ + ApplyTask::CommittedEntries(CommittedEntries { + entry_and_proposals: vec![( + new_put_entry( + region.id, + region.get_region_epoch().clone(), + b"k1", + b"v1", + 5, + 6, + ), + vec![], + )], + committed_time: Instant::now(), + }), + ApplyTask::CaptureApply(CaptureChange { + observer: ChangeObserver::from_cdc(region.id, ObserveHandle::new()), + region_epoch: region.get_region_epoch().clone(), + snap_cb, + }), + ApplyTask::CommittedEntries(CommittedEntries { + entry_and_proposals: vec![( + new_put_entry( + region.id, + region.get_region_epoch().clone(), + b"k2", + b"v2", + 5, + 7, + ), + vec![], + )], + committed_time: Instant::now(), + }), + ]; + + for task in apply_tasks { + match task { + ApplyTask::CommittedEntries(ce) => { + block_on(async { apply.apply_committed_entries(ce).await }); + } + ApplyTask::CaptureApply(capture_change) => { + apply.on_capture_apply(capture_change); + } + _ => unreachable!(), + } + } + apply.flush(); + + // must read (k1, v1) from snapshot and capture (k2, v2) + let snap = snap.lock().unwrap().take().unwrap(); + let v1 = snap.get_value_cf(CF_DEFAULT, b"k1").unwrap().unwrap(); + assert_eq!(v1, b"v1"); + let v2 = snap.get_value_cf(CF_DEFAULT, b"k2").unwrap(); + assert!(v2.is_none()); + + let cmds = cmds_rx.try_recv().unwrap(); + assert_eq!(cmds[0].len(), 1); + let put2 = &cmds[0].cmds[0]; + assert_eq!(put2.term, 5); + assert_eq!(put2.index, 7); + let request = &put2.request.requests[0]; + assert_eq!(request.get_put().get_cf(), CF_DEFAULT); + assert_eq!(request.get_put().get_key(), b"k2"); + assert_eq!(request.get_put().get_value(), b"v2"); + let response = &put2.response; + assert!(!response.get_header().has_error()); + } +} diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 
7dc5142e734..f5bdd8664e6 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -512,7 +512,12 @@ impl CmdBatch { self.cmds.push(cmd) } - pub fn extend(&mut self, observe_info: &CmdObserveInfo, region_id: u64, cmds: Vec) { + pub fn extend>( + &mut self, + observe_info: &CmdObserveInfo, + region_id: u64, + cmds: I, + ) { assert_eq!(region_id, self.region_id); assert_eq!(observe_info.cdc_id.id, self.cdc_id); assert_eq!(observe_info.rts_id.id, self.rts_id); diff --git a/components/raftstore/src/store/simple_write.rs b/components/raftstore/src/store/simple_write.rs index 57056f984bd..a303a586935 100644 --- a/components/raftstore/src/store/simple_write.rs +++ b/components/raftstore/src/store/simple_write.rs @@ -679,7 +679,12 @@ mod tests { let mut header = Box::::default(); header.set_term(2); let mut req_encoder: SimpleWriteReqEncoder> = - SimpleWriteReqEncoder::new(header.clone(), bin.clone(), 512, false); + SimpleWriteReqEncoder::>::new( + header.clone(), + bin.clone(), + 512, + false, + ); let mut header2 = Box::::default(); header2.set_term(4); @@ -691,7 +696,12 @@ mod tests { // Frozen bin can't be merged with other bin. assert!(!req_encoder.amend(&header, &bin2)); let mut req_encoder2: SimpleWriteReqEncoder> = - SimpleWriteReqEncoder::new(header.clone(), bin2.clone(), 512, false); + SimpleWriteReqEncoder::>::new( + header.clone(), + bin2.clone(), + 512, + false, + ); assert!(!req_encoder2.amend(&header, &bin)); // Batch should not excceed max size limit. @@ -713,4 +723,107 @@ mod tests { let res = decoder.next(); assert!(res.is_none(), "{:?}", res); } + + #[test] + fn test_to_raft_cmd_request() { + let logger = slog_global::borrow_global().new(o!()); + + // Test header. 
+ let mut header = Box::::default(); + header.set_term(2); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + SimpleWriteEncoder::with_capacity(512).encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + assert_eq!( + header.as_ref(), + SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request() + .get_header(), + ); + + // Test put. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_WRITE, b"write", b"value"); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_put().get_cf(), CF_WRITE); + assert_eq!(req.get_requests()[0].get_put().get_key(), b"write"); + assert_eq!(req.get_requests()[0].get_put().get_value(), b"value"); + + // Test delete. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete(CF_DEFAULT, b"write"); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_delete().get_cf(), CF_DEFAULT); + assert_eq!(req.get_requests()[0].get_delete().get_key(), b"write"); + + // Test delete range. 
+ let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete_range(CF_LOCK, b"start", b"end", true); + let req_encoder = SimpleWriteReqEncoder::>::new( + header.clone(), + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 1); + assert_eq!(req.get_requests()[0].get_delete_range().get_cf(), CF_LOCK); + assert_eq!( + req.get_requests()[0].get_delete_range().get_start_key(), + b"start" + ); + assert_eq!( + req.get_requests()[0].get_delete_range().get_end_key(), + b"end" + ); + assert_eq!( + req.get_requests()[0].get_delete_range().get_notify_only(), + true + ); + + // Test ingest. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.ingest(vec![SstMeta::default(); 5]); + let req_encoder = SimpleWriteReqEncoder::>::new( + header, + encoder.encode(), + 512, + false, + ); + let (bin, _) = req_encoder.encode(); + let req = SimpleWriteReqDecoder::new(decoder_fallback, &logger, &bin, 0, 0) + .unwrap() + .to_raft_cmd_request(); + assert_eq!(req.get_requests().len(), 5); + assert!(req.get_requests()[0].has_ingest_sst()); + assert!(req.get_requests()[4].has_ingest_sst()); + } } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 83bcc2a55fe..0e11049c395 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1236,6 +1236,8 @@ impl TikvServer { let txn_extra_scheduler = cdc::CdcTxnExtraScheduler::new(cdc_scheduler.clone()); let mut engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + // Set txn extra scheduler immediately to make sure every clone has the + // scheduler. 
engine.set_txn_extra_scheduler(Arc::new(txn_extra_scheduler)); self.engines = Some(TikvEngines { diff --git a/src/config/mod.rs b/src/config/mod.rs index dcbfdc4e441..2115236ed71 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -5827,6 +5827,16 @@ mod tests { "#; let mut cfg: TikvConfig = toml::from_str(content).unwrap(); cfg.validate().unwrap(); + + let content = r#" + [storage] + engine = "partitioned-raft-kv" + [cdc] + hibernate-regions-compatible = true + "#; + let mut cfg: TikvConfig = toml::from_str(content).unwrap(); + cfg.validate().unwrap(); + assert!(!cfg.cdc.hibernate_regions_compatible); } #[test] From 5ef9d8abedf331def22f1ab9ad635ba12e4fbe4f Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Wed, 19 Apr 2023 23:53:19 +0800 Subject: [PATCH 649/676] raftstore-v2: fill start and end keys for initial messages (#14607) close tikv/tikv#14606 raftstore-v2: fill start and end keys for initial messages Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- .../raftstore-v2/src/operation/ready/mod.rs | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 9a29c705aff..f37791638d5 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -26,6 +26,7 @@ use std::{cmp, time::Instant}; use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; use kvproto::{ + metapb, raft_cmdpb::AdminCmdType, raft_serverpb::{ExtraMessageType, RaftMessage}, }; @@ -34,8 +35,10 @@ use raft::{eraftpb, prelude::MessageType, Ready, SnapshotStatus, StateRole, INVA use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ - needs_evict_entry_cache, util, worker_metrics::SNAP_COUNTER, FetchedLogs, ReadProgress, - Transport, WriteCallback, WriteTask, + needs_evict_entry_cache, + util::{self, is_initial_msg}, + worker_metrics::SNAP_COUNTER, + FetchedLogs, 
ReadProgress, Transport, WriteCallback, WriteTask, }, }; use slog::{debug, error, info, trace, warn}; @@ -370,6 +373,7 @@ impl Peer { return None; } }; + let to_peer_is_learner = to_peer.get_role() == metapb::PeerRole::Learner; let mut raft_msg = self.prepare_raft_message(); @@ -383,6 +387,25 @@ impl Peer { "to" => msg.get_to(), ); } + + // Filling start and end key is only needed for being compatible with + // raftstore v1 tiflash engine. + // + // There could be two cases: + // - Target peer already exists but has not established communication with + // leader yet + // - Target peer is added newly due to member change or region split, but it's + // not created yet + // For both cases the region start key and end key are attached in RequestVote + // and Heartbeat message for the store of that peer to check whether to create a + // new peer when receiving these messages, or just to wait for a pending region + // split to perform later. + if self.storage().is_initialized() && is_initial_msg(&msg) && to_peer_is_learner { + let region = self.region(); + raft_msg.set_start_key(region.get_start_key().to_vec()); + raft_msg.set_end_key(region.get_end_key().to_vec()); + } + raft_msg.set_message(msg); Some(raft_msg) } From 32b2a88e4b8388b1d687e4837837863437a3122d Mon Sep 17 00:00:00 2001 From: Connor Date: Thu, 20 Apr 2023 09:15:19 +0800 Subject: [PATCH 650/676] resource_control: take global resource consumption into consideration (#14605) close tikv/tikv#14604 resource control takes global resource consumption into consideration Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- .../resource_control/src/resource_group.rs | 41 +++-- components/server/src/server.rs | 3 +- components/server/src/server2.rs | 3 +- components/test_raftstore-v2/src/server.rs | 1 + components/test_raftstore/src/server.rs | 1 + src/coprocessor/endpoint.rs | 8 +- src/server/raftkv/mod.rs | 6 +- src/server/server.rs | 4 + src/server/service/kv.rs | 77 ++++++++-- 
src/storage/mod.rs | 142 +++++++++++++----- src/storage/txn/commands/mod.rs | 1 + 12 files changed, 218 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1f0011894b6..32dfbbfc072 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2792,7 +2792,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#ce835ae20dfcb5f69f0aea04236070932c815b6a" +source = "git+https://github.com/pingcap/kvproto.git#dc3cd8784a19bc7f058dbeb19cd8cc4672ee9aad" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 9a7a2e7b3cc..0a808811217 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -14,7 +14,7 @@ use collections::HashMap; use dashmap::{mapref::one::Ref, DashMap}; use fail::fail_point; use kvproto::{ - kvrpcpb::CommandPri, + kvrpcpb::{CommandPri, ResourceControlContext}, resource_manager::{GroupMode, ResourceGroup}, }; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; @@ -54,7 +54,7 @@ pub enum ResourceConsumeType { #[derive(Default)] pub struct ResourceGroupManager { resource_groups: DashMap, - registry: Mutex>>, + registry: RwLock>>, } impl ResourceGroupManager { @@ -85,7 +85,7 @@ impl ResourceGroupManager { pub fn add_resource_group(&self, rg: ResourceGroup) { let group_name = rg.get_name().to_ascii_lowercase(); - self.registry.lock().unwrap().iter().for_each(|controller| { + self.registry.read().iter().for_each(|controller| { let ru_quota = Self::get_ru_setting(&rg, controller.is_read); controller.add_resource_group(group_name.clone().into_bytes(), ru_quota, rg.priority); }); @@ -95,7 +95,7 @@ impl ResourceGroupManager { pub fn remove_resource_group(&self, name: &str) { let group_name = name.to_ascii_lowercase(); - self.registry.lock().unwrap().iter().for_each(|controller| { + 
self.registry.read().iter().for_each(|controller| { controller.remove_resource_group(group_name.as_bytes()); }); info!("remove resource group"; "name"=> name); @@ -112,7 +112,7 @@ impl ResourceGroupManager { ret }); if !removed_names.is_empty() { - self.registry.lock().unwrap().iter().for_each(|controller| { + self.registry.read().iter().for_each(|controller| { for name in &removed_names { controller.remove_resource_group(name.as_bytes()); } @@ -130,7 +130,7 @@ impl ResourceGroupManager { pub fn derive_controller(&self, name: String, is_read: bool) -> Arc { let controller = Arc::new(ResourceController::new(name, is_read)); - self.registry.lock().unwrap().push(controller.clone()); + self.registry.write().push(controller.clone()); for g in &self.resource_groups { let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); controller.add_resource_group(g.key().clone().into_bytes(), ru_quota, g.priority); @@ -139,10 +139,29 @@ impl ResourceGroupManager { } pub fn advance_min_virtual_time(&self) { - for controller in self.registry.lock().unwrap().iter() { + for controller in self.registry.read().iter() { controller.update_min_virtual_time(); } } + + pub fn consume_penalty(&self, ctx: &ResourceControlContext) { + for controller in self.registry.read().iter() { + // FIXME: Should consume CPU time for read controller and write bytes for write + // controller, once CPU process time of scheduler worker is tracked. Currently, + // we consume write bytes for read controller as the + // order of magnitude of CPU time and write bytes is similar. 
+ controller.consume( + ctx.resource_group_name.as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_nanos( + (ctx.get_penalty().total_cpu_time_ms * 1_000_000.0) as u64, + )), + ); + controller.consume( + ctx.resource_group_name.as_bytes(), + ResourceConsumeType::IoBytes(ctx.get_penalty().write_bytes as u64), + ); + } + } } pub struct ResourceController { @@ -304,8 +323,8 @@ impl ResourceController { }) } - pub fn consume(&self, name: &[u8], delta: ResourceConsumeType) { - self.resource_group(name).consume(delta) + pub fn consume(&self, name: &[u8], resource: ResourceConsumeType) { + self.resource_group(name).consume(resource) } pub fn update_min_virtual_time(&self) { @@ -424,8 +443,8 @@ impl GroupPriorityTracker { // TODO: make it delta type as generic to avoid mixed consume different types. #[inline] - fn consume(&self, delta: ResourceConsumeType) { - let vt_delta = match delta { + fn consume(&self, resource: ResourceConsumeType) { + let vt_delta = match resource { ResourceConsumeType::CpuTime(dur) => dur.as_micros() as u64, ResourceConsumeType::IoBytes(bytes) => bytes, } * self.weight; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index d1c8e09ef96..890089a6950 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -745,7 +745,7 @@ where cop_read_pool_handle, self.concurrency_manager.clone(), resource_tag_factory, - Arc::clone(&self.quota_limiter), + self.quota_limiter.clone(), ), coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), @@ -756,6 +756,7 @@ where unified_read_pool, debug_thread_pool, health_service, + self.resource_manager.clone(), ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); cfg_controller.register( diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 0e11049c395..81575b8cbf6 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -654,7 +654,7 @@ where 
cop_read_pool_handle, self.concurrency_manager.clone(), resource_tag_factory, - Arc::clone(&self.quota_limiter), + self.quota_limiter.clone(), ), coprocessor_v2::Endpoint::new(&self.core.config.coprocessor_v2), self.resolver.clone().unwrap(), @@ -665,6 +665,7 @@ where unified_read_pool, debug_thread_pool, health_service, + self.resource_manager.clone(), ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); cfg_controller.register( diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 3de9e5aa956..f110578784f 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -545,6 +545,7 @@ impl ServerCluster { None, debug_thread_pool.clone(), health_service.clone(), + resource_manager.clone(), ) .unwrap(); svr.register_service(create_diagnostics(diag_service.clone())); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index a77fc5d3dd2..4c060cef2ce 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -536,6 +536,7 @@ impl ServerCluster { None, debug_thread_pool.clone(), health_service.clone(), + resource_manager.clone(), ) .unwrap(); svr.register_service(create_import_sst(import_service.clone())); diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 43bf20f582b..71c3d5548a9 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -68,9 +68,9 @@ pub struct Endpoint { slow_log_threshold: Duration, - _phantom: PhantomData, - quota_limiter: Arc, + + _phantom: PhantomData, } impl tikv_util::AssertSend for Endpoint {} @@ -104,8 +104,8 @@ impl Endpoint { stream_channel_size: cfg.end_point_stream_channel_size, max_handle_duration: cfg.end_point_request_max_handle_duration.0, slow_log_threshold: cfg.end_point_slow_log_threshold.0, - _phantom: Default::default(), quota_limiter, + _phantom: Default::default(), } } @@ -488,6 +488,7 @@ 
impl Endpoint { .new_tag_with_key_ranges(&req_ctx.context, key_ranges); let group_name = req_ctx .context + .get_resource_control_context() .get_resource_group_name() .as_bytes() .to_owned(); @@ -727,6 +728,7 @@ impl Endpoint { let priority = req_ctx.context.get_priority(); let group_name = req_ctx .context + .get_resource_control_context() .get_resource_group_name() .as_bytes() .to_owned(); diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index e175fa502f8..697a4b39d63 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -160,7 +160,11 @@ pub fn new_request_header(ctx: &Context) -> RaftRequestHeader { } header.set_sync_log(ctx.get_sync_log()); header.set_replica_read(ctx.get_replica_read()); - header.set_resource_group_name(ctx.get_resource_group_name().to_owned()); + header.set_resource_group_name( + ctx.get_resource_control_context() + .get_resource_group_name() + .to_owned(), + ); header } diff --git a/src/server/server.rs b/src/server/server.rs index 6e294eda45e..b3db4b4b57f 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -14,6 +14,7 @@ use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, S use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager, ENGINE, TIFLASH}; +use resource_control::ResourceGroupManager; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -103,6 +104,7 @@ where yatp_read_pool: Option, debug_thread_pool: Arc, health_service: HealthService, + resource_manager: Option>, ) -> Result { // A helper thread (or pool) for transport layer. 
let stats_pool = if cfg.value().stats_concurrency > 0 { @@ -139,6 +141,7 @@ where cfg.value().enable_request_batch, proxy, cfg.value().reject_messages_on_memory_ratio, + resource_manager, ); let addr = SocketAddr::from_str(&cfg.value().addr)?; @@ -598,6 +601,7 @@ mod tests { None, debug_thread_pool, HealthService::default(), + None, ) .unwrap(); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6fc3a3ebd76..2c77ee4e0bd 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -26,6 +26,7 @@ use raftstore::{ }, Error as RaftStoreError, Result as RaftStoreResult, }; +use resource_control::ResourceGroupManager; use tikv_alloc::trace::MemoryTraceGuard; use tikv_kv::{RaftExtension, StageLatencyStats}; use tikv_util::{ @@ -86,6 +87,8 @@ pub struct Service { // Go `server::Config` to get more details. reject_messages_on_memory_ratio: f64, + + resource_manager: Option>, } impl Drop for Service { @@ -108,6 +111,7 @@ impl Clone for Service Service { enable_req_batch: bool, proxy: Proxy, reject_messages_on_memory_ratio: f64, + resource_manager: Option>, ) -> Self { Service { store_id, @@ -139,6 +144,7 @@ impl Service { grpc_thread_load, proxy, reject_messages_on_memory_ratio, + resource_manager, } } @@ -177,9 +183,12 @@ macro_rules! 
handle_request { let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); - let resource_group_name = req.get_context().get_resource_group_name(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = &self.resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_group_name]) + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) .inc(); let resp = $future_name(&self.storage, req); let task = async move { @@ -456,6 +465,14 @@ impl Tikv for Service { fn coprocessor(&mut self, ctx: RpcContext<'_>, mut req: Request, sink: UnarySink) { forward_unary!(self.proxy, coprocessor, ctx, req, sink); let source = req.mut_context().take_request_source(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = &self.resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); + let begin_instant = Instant::now(); let future = future_copr(&self.copr, Some(ctx.peer()), req); let task = async move { @@ -486,6 +503,14 @@ impl Tikv for Service { sink: UnarySink, ) { let source = req.mut_context().take_request_source(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = &self.resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); + let begin_instant = Instant::now(); let future = future_raw_coprocessor(&self.copr_v2, &self.storage, req); let task = async move { @@ -567,6 +592,13 @@ impl Tikv for Service { mut sink: ServerStreamingSink, ) { let begin_instant = Instant::now(); + let resource_control_ctx = 
req.get_context().get_resource_control_context(); + if let Some(resource_manager) = &self.resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); let mut stream = self .copr @@ -820,8 +852,8 @@ impl Tikv for Service { mut sink: DuplexSink, ) { forward_duplex!(self.proxy, batch_commands, ctx, stream, sink); - let (tx, rx) = unbounded(WakePolicy::TillReach(GRPC_MSG_NOTIFY_SIZE)); + let (tx, rx) = unbounded(WakePolicy::TillReach(GRPC_MSG_NOTIFY_SIZE)); let ctx = Arc::new(ctx); let peer = ctx.peer(); let storage = self.storage.clone(); @@ -829,6 +861,7 @@ impl Tikv for Service { let copr_v2 = self.copr_v2.clone(); let pool_size = storage.get_normal_pool_size(); let batch_builder = BatcherBuilder::new(self.enable_req_batch, pool_size); + let resource_manager = self.resource_manager.clone(); let request_handler = stream.try_for_each(move |mut req| { let request_ids = req.take_request_ids(); let requests: Vec<_> = req.take_requests().into(); @@ -845,6 +878,7 @@ impl Tikv for Service { id, req, &tx, + &resource_manager, ); if let Some(batch) = batcher.as_mut() { batch.maybe_commit(&storage, &tx); @@ -1054,6 +1088,7 @@ fn handle_batch_commands_request( id: u64, req: batch_commands_request::Request, tx: &Sender, + resource_manager: &Option>, ) { // To simplify code and make the logic more clear. macro_rules! 
oneof { @@ -1075,10 +1110,13 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, Some(batch_commands_request::request::Cmd::Get(mut req)) => { - let resource_group_name = req.get_context().get_resource_group_name(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_group_name]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) }) { @@ -1093,10 +1131,13 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { - let resource_group_name = req.get_context().get_resource_group_name(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_group_name]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1111,10 +1152,13 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { - let resource_group_name = req.get_context().get_resource_group_name(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_group_name]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); let begin_instant 
= Instant::now(); let source = req.mut_context().take_request_source(); let resp = future_copr(copr, Some(peer.to_string()), req) @@ -1142,10 +1186,13 @@ fn handle_batch_commands_request( ); } $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { - let resource_group_name = req.get_context().get_resource_group_name(); + let resource_control_ctx = req.get_context().get_resource_control_context(); + if let Some(resource_manager) = resource_manager { + resource_manager.consume_penalty(resource_control_ctx); + } GRPC_RESOURCE_GROUP_COUNTER_VEC - .with_label_values(&[resource_group_name]) - .inc(); + .with_label_values(&[resource_control_ctx.get_resource_group_name()]) + .inc(); let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = $future_fn($($arg,)* req) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index faacc4cf4cb..37263ce9a12 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -141,6 +141,25 @@ use crate::{ pub type Result = std::result::Result; pub type Callback = Box) + Send>; +macro_rules! check_key_size { + ($key_iter:expr, $max_key_size:expr, $callback:ident) => { + for k in $key_iter { + let key_size = k.len(); + if key_size > $max_key_size { + $callback(Err(Error::from(ErrorInner::KeyTooLarge { + size: key_size, + limit: $max_key_size, + }))); + return Ok(()); + } + } + }; +} + +/// Storage for Api V1 +/// To be convenience for test cases unrelated to RawKV. +pub type StorageApiV1 = Storage; + /// [`Storage`](Storage) implements transactional KV APIs and raw KV APIs on a /// given [`Engine`]. An [`Engine`] provides low level KV functionality. /// [`Engine`] has multiple implementations. When a TiKV server is running, a @@ -196,10 +215,6 @@ pub struct Storage { _phantom: PhantomData, } -/// Storage for Api V1 -/// To be convenience for test cases unrelated to RawKV. 
-pub type StorageApiV1 = Storage; - impl Clone for Storage { #[inline] fn clone(&self) -> Self { @@ -219,7 +234,7 @@ impl Clone for Storage { api_version: self.api_version, causal_ts_provider: self.causal_ts_provider.clone(), resource_tag_factory: self.resource_tag_factory.clone(), - quota_limiter: Arc::clone(&self.quota_limiter), + quota_limiter: self.quota_limiter.clone(), _phantom: PhantomData, } } @@ -242,21 +257,6 @@ impl Drop for Storage { } } -macro_rules! check_key_size { - ($key_iter:expr, $max_key_size:expr, $callback:ident) => { - for k in $key_iter { - let key_size = k.len(); - if key_size > $max_key_size { - $callback(Err(Error::from(ErrorInner::KeyTooLarge { - size: key_size, - limit: $max_key_size, - }))); - return Ok(()); - } - } - }; -} - impl Storage { /// Create a `Storage` from given engine. pub fn from_engine( @@ -598,7 +598,11 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::get; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -756,6 +760,7 @@ impl Storage { let priority = requests[0].get_context().get_priority(); let group_name = requests[0] .get_context() + .get_resource_control_context() .get_resource_group_name() .as_bytes() .to_owned(); @@ -938,7 +943,11 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::batch_get; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys .iter() @@ -1117,7 +1126,11 @@ impl Storage { ) -> impl 
Future>>> { const CMD: CommandKind = CommandKind::scan; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1283,7 +1296,11 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::scan_lock; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1591,7 +1608,11 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_get; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -1672,6 +1693,7 @@ impl Storage { let priority = gets[0].get_context().get_priority(); let group_name = gets[0] .get_context() + .get_resource_control_context() .get_resource_group_name() .as_bytes() .to_owned(); @@ -1803,7 +1825,11 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_get; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys.iter().map(|k| (k.clone(), k.clone())).collect(); let resource_tag = self @@ -1949,7 +1975,10 @@ impl 
Storage { let concurrency_manager = self.concurrency_manager.clone(); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); @@ -2061,7 +2090,10 @@ impl Storage { let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); @@ -2126,7 +2158,10 @@ impl Storage { let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); @@ -2187,7 +2222,10 @@ impl Storage { let engine = self.engine.clone(); let deadline = Self::get_deadline(&ctx); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); @@ -2235,7 +2273,10 @@ impl Storage { let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); let priority = ctx.get_priority(); - let group_name = 
ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); @@ -2299,7 +2340,11 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_scan; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag(&ctx); let api_version = self.api_version; @@ -2426,7 +2471,11 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_scan; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2578,7 +2627,11 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::raw_get_key_ttl; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -2667,7 +2720,10 @@ impl Storage { } let sched = self.get_scheduler(); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { let key = F::encode_raw_key_owned(key, None); let cmd = 
RawCompareAndSwap::new(cf, key, previous_value, value, ttl, api_version, ctx); @@ -2700,7 +2756,10 @@ impl Storage { let sched = self.get_scheduler(); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, None); let cmd = RawAtomicStore::new(cf, modifies, ctx); @@ -2725,7 +2784,10 @@ impl Storage { let cf = Self::rawkv_cf(&cf, self.api_version)?; let sched = self.get_scheduler(); let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .to_owned(); self.sched_raw_command(&group_name, priority, CMD, async move { // Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = keys @@ -2749,7 +2811,11 @@ impl Storage { ) -> impl Future> { const CMD: CommandKind = CommandKind::raw_checksum; let priority = ctx.get_priority(); - let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); + let group_name = ctx + .get_resource_control_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 54f5029bd6c..4c01629ef48 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -719,6 +719,7 @@ impl Command { pub fn group_name(&self) -> String { self.command_ext() .get_ctx() + .get_resource_control_context() .get_resource_group_name() .to_owned() } From 21a98d84312219997a1d34bc5c52e696ea40427f Mon Sep 17 00:00:00 2001 From: lijie Date: Thu, 20 Apr 2023 12:06:46 +0800 Subject: [PATCH 651/676] chore: bump version to v7.2.0-alpha (#14615) Signed-off-by: lijie 
--- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 32dfbbfc072..7541dd2666c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6419,7 +6419,7 @@ dependencies = [ [[package]] name = "tikv" -version = "7.1.0-alpha" +version = "7.2.0-alpha" dependencies = [ "anyhow", "api_version", diff --git a/Cargo.toml b/Cargo.toml index 1bd9377d5f0..5363de8bd59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "7.1.0-alpha" +version = "7.2.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" From 4b5846f85b7f6a58c3d44f764d4fe722fccaa64b Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 20 Apr 2023 16:23:19 +0800 Subject: [PATCH 652/676] engine: add configurations for filter enhancements (#14527) ref tikv/tikv#12842 Add some configurations for RocksDB filter enhancements Signed-off-by: tabokie --- Cargo.lock | 6 +- etc/config-template.toml | 10 +++ src/config/mod.rs | 82 ++++++++++++++++------ tests/integrations/config/mod.rs | 10 +++ tests/integrations/config/test-custom.toml | 10 +++ 5 files changed, 95 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7541dd2666c..da49ade1d6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2921,7 +2921,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" +source = "git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2940,7 +2940,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" +source = "git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" dependencies = 
[ "bzip2-sys", "cc", @@ -4867,7 +4867,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#a9fbe325939c166ffc5f80e63066f5d8594a1fff" +source = "git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/etc/config-template.toml b/etc/config-template.toml index aec5e108949..89f39be79ca 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -693,6 +693,12 @@ ## filter. # block-based-bloom-filter = false +## Use Ribbon filter for levels higher or equal to this value. Use non-block-based bloom filter for +## lower levels. When this is set, `block-based-bloom-filter` will be ignored. +## Only effective for `format-version` >= 5. +## Disabled by default. +# ribbon-filter-above-level = 0 + # level0-file-num-compaction-trigger = 4 ## Soft limit on number of level-0 files. @@ -789,6 +795,10 @@ ## while using `Raw` mode. # optimize-filters-for-hits = true +## Option to generate Bloom/Ribbon filters that minimize memory internal fragmentation. +## Only effective for `format-version` >= 5. +# optimize-filters-for-memory = false + ## Enable compaction guard, which is an optimization to split SST files at TiKV region boundaries. ## The optimization can help reduce compaction IO, and allow us to use larger SST file size ## (thus less SST files overall) while making sure we can still efficiently cleanup stale data on diff --git a/src/config/mod.rs b/src/config/mod.rs index 2115236ed71..2efe9ea4c9b 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -309,12 +309,16 @@ macro_rules! 
cf_config { #[online_config(skip)] pub optimize_filters_for_hits: bool, #[online_config(skip)] + pub optimize_filters_for_memory: bool, + #[online_config(skip)] pub whole_key_filtering: bool, #[online_config(skip)] pub bloom_filter_bits_per_key: i32, #[online_config(skip)] pub block_based_bloom_filter: bool, #[online_config(skip)] + pub ribbon_filter_above_level: Option, + #[online_config(skip)] pub read_amp_bytes_per_bit: u32, #[serde(with = "rocks_config::compression_type_level_serde")] #[online_config(skip)] @@ -431,6 +435,9 @@ macro_rules! write_into_metrics { $metrics .with_label_values(&[$tag, "optimize_filters_for_hits"]) .set(($cf.optimize_filters_for_hits as i32).into()); + $metrics + .with_label_values(&[$tag, "optimize_filters_for_memory"]) + .set(($cf.optimize_filters_for_memory as i32).into()); $metrics .with_label_values(&[$tag, "whole_key_filtering"]) .set(($cf.whole_key_filtering as i32).into()); @@ -440,6 +447,11 @@ macro_rules! write_into_metrics { $metrics .with_label_values(&[$tag, "block_based_bloom_filter"]) .set(($cf.block_based_bloom_filter as i32).into()); + if let Some(level) = $cf.ribbon_filter_above_level { + $metrics + .with_label_values(&[$tag, "ribbon_filter_above_level"]) + .set((level as i32).into()); + } $metrics .with_label_values(&[$tag, "read_amp_bytes_per_bit"]) @@ -548,16 +560,24 @@ macro_rules! 
build_cf_opt { block_base_opts .set_pin_l0_filter_and_index_blocks_in_cache($opt.pin_l0_filter_and_index_blocks); if $opt.use_bloom_filter { - block_base_opts.set_bloom_filter( - $opt.bloom_filter_bits_per_key as f64, - $opt.block_based_bloom_filter, - ); + if let Some(level) = $opt.ribbon_filter_above_level { + block_base_opts.set_ribbon_filter( + $opt.bloom_filter_bits_per_key as f64, + level as i32 - 1, // bloom_before_level + ); + } else { + block_base_opts.set_bloom_filter( + $opt.bloom_filter_bits_per_key as f64, + $opt.block_based_bloom_filter, + ); + } block_base_opts.set_whole_key_filtering($opt.whole_key_filtering); } block_base_opts.set_read_amp_bytes_per_bit($opt.read_amp_bytes_per_bit); block_base_opts.set_prepopulate_block_cache($opt.prepopulate_block_cache); block_base_opts.set_format_version($opt.format_version); block_base_opts.set_checksum($opt.checksum); + block_base_opts.set_optimize_filters_for_memory($opt.optimize_filters_for_memory); let mut cf_opts = RocksCfOptions::default(); cf_opts.set_block_based_table_factory(&block_base_opts); cf_opts.set_num_levels($opt.num_levels); @@ -650,9 +670,11 @@ impl Default for DefaultCfConfig { pin_l0_filter_and_index_blocks: true, use_bloom_filter: true, optimize_filters_for_hits: true, + optimize_filters_for_memory: false, whole_key_filtering: true, bloom_filter_bits_per_key: 10, block_based_bloom_filter: false, + ribbon_filter_above_level: None, read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -814,9 +836,11 @@ impl Default for WriteCfConfig { pin_l0_filter_and_index_blocks: true, use_bloom_filter: true, optimize_filters_for_hits: false, + optimize_filters_for_memory: false, whole_key_filtering: false, bloom_filter_bits_per_key: 10, block_based_bloom_filter: false, + ribbon_filter_above_level: None, read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -940,9 +964,11 @@ impl Default for LockCfConfig { pin_l0_filter_and_index_blocks: true, 
use_bloom_filter: true, optimize_filters_for_hits: false, + optimize_filters_for_memory: false, whole_key_filtering: true, bloom_filter_bits_per_key: 10, block_based_bloom_filter: false, + ribbon_filter_above_level: None, read_amp_bytes_per_bit: 0, compression_per_level: [DBCompressionType::No; 7], write_buffer_size: ReadableSize::mb(32), @@ -1033,9 +1059,11 @@ impl Default for RaftCfConfig { pin_l0_filter_and_index_blocks: true, use_bloom_filter: true, optimize_filters_for_hits: true, + optimize_filters_for_memory: false, whole_key_filtering: true, bloom_filter_bits_per_key: 10, block_based_bloom_filter: false, + ribbon_filter_above_level: None, read_amp_bytes_per_bit: 0, compression_per_level: [DBCompressionType::No; 7], write_buffer_size: ReadableSize::mb(128), @@ -1544,9 +1572,11 @@ impl Default for RaftDefaultCfConfig { pin_l0_filter_and_index_blocks: true, use_bloom_filter: false, optimize_filters_for_hits: true, + optimize_filters_for_memory: false, whole_key_filtering: true, bloom_filter_bits_per_key: 10, block_based_bloom_filter: false, + ribbon_filter_above_level: None, read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -5688,31 +5718,43 @@ mod tests { cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. 
cfg.rocksdb.write_buffer_limit = None; + // cfg.rocksdb.defaultcf.enable_compaction_guard = None; - cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; - cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; - cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; - cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; cfg.rocksdb.writecf.enable_compaction_guard = None; - cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; - cfg.rocksdb.writecf.level0_stop_writes_trigger = None; - cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; - cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; cfg.rocksdb.lockcf.enable_compaction_guard = None; - cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; - cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; - cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; - cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; cfg.rocksdb.raftcf.enable_compaction_guard = None; - cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; - cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; - cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; - cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; cfg.raftdb.defaultcf.enable_compaction_guard = None; + // + cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; + cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; cfg.raftdb.defaultcf.level0_slowdown_writes_trigger = None; + // + cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; + cfg.rocksdb.writecf.level0_stop_writes_trigger = None; + cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; + cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; + // + cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; + 
cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; + // + cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; cfg.raftdb.defaultcf.hard_pending_compaction_bytes_limit = None; + // + cfg.rocksdb.defaultcf.ribbon_filter_above_level = None; + cfg.rocksdb.writecf.ribbon_filter_above_level = None; + cfg.rocksdb.lockcf.ribbon_filter_above_level = None; + cfg.rocksdb.raftcf.ribbon_filter_above_level = None; + cfg.raftdb.defaultcf.ribbon_filter_above_level = None; + cfg.coprocessor .optimize_for(default_cfg.storage.engine == EngineType::RaftKv2); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 7d40cde87d5..34b558f39c0 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -329,9 +329,11 @@ fn test_serde_custom_tikv_config() { pin_l0_filter_and_index_blocks: false, use_bloom_filter: false, optimize_filters_for_hits: false, + optimize_filters_for_memory: true, whole_key_filtering: true, bloom_filter_bits_per_key: 123, block_based_bloom_filter: true, + ribbon_filter_above_level: Some(1), read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -384,9 +386,11 @@ fn test_serde_custom_tikv_config() { pin_l0_filter_and_index_blocks: false, use_bloom_filter: false, optimize_filters_for_hits: true, + optimize_filters_for_memory: true, whole_key_filtering: true, bloom_filter_bits_per_key: 123, block_based_bloom_filter: true, + ribbon_filter_above_level: Some(1), read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -453,9 +457,11 @@ fn 
test_serde_custom_tikv_config() { pin_l0_filter_and_index_blocks: false, use_bloom_filter: false, optimize_filters_for_hits: true, + optimize_filters_for_memory: true, whole_key_filtering: true, bloom_filter_bits_per_key: 123, block_based_bloom_filter: true, + ribbon_filter_above_level: Some(1), read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -522,9 +528,11 @@ fn test_serde_custom_tikv_config() { pin_l0_filter_and_index_blocks: false, use_bloom_filter: false, optimize_filters_for_hits: false, + optimize_filters_for_memory: true, whole_key_filtering: true, bloom_filter_bits_per_key: 123, block_based_bloom_filter: true, + ribbon_filter_above_level: Some(1), read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, @@ -620,9 +628,11 @@ fn test_serde_custom_tikv_config() { pin_l0_filter_and_index_blocks: false, use_bloom_filter: false, optimize_filters_for_hits: false, + optimize_filters_for_memory: true, whole_key_filtering: true, bloom_filter_bits_per_key: 123, block_based_bloom_filter: true, + ribbon_filter_above_level: Some(1), read_amp_bytes_per_bit: 0, compression_per_level: [ DBCompressionType::No, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 416505a7318..28a30fcec04 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -288,9 +288,11 @@ cache-index-and-filter-blocks = false pin-l0-filter-and-index-blocks = false use-bloom-filter = false optimize-filters-for-hits = false +optimize-filters-for-memory = true whole-key-filtering = true bloom-filter-bits-per-key = 123 block-based-bloom-filter = true +ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 compression-per-level = [ "no", @@ -355,9 +357,11 @@ cache-index-and-filter-blocks = false pin-l0-filter-and-index-blocks = false use-bloom-filter = false optimize-filters-for-hits = true +optimize-filters-for-memory = true whole-key-filtering = true 
bloom-filter-bits-per-key = 123 block-based-bloom-filter = true +ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 compression-per-level = [ "no", @@ -405,9 +409,11 @@ cache-index-and-filter-blocks = false pin-l0-filter-and-index-blocks = false use-bloom-filter = false optimize-filters-for-hits = true +optimize-filters-for-memory = true whole-key-filtering = true bloom-filter-bits-per-key = 123 block-based-bloom-filter = true +ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 compression-per-level = [ "no", @@ -455,9 +461,11 @@ cache-index-and-filter-blocks = false pin-l0-filter-and-index-blocks = false use-bloom-filter = false optimize-filters-for-hits = false +optimize-filters-for-memory = true whole-key-filtering = true bloom-filter-bits-per-key = 123 block-based-bloom-filter = true +ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 compression-per-level = [ "no", @@ -537,9 +545,11 @@ cache-index-and-filter-blocks = false pin-l0-filter-and-index-blocks = false use-bloom-filter = false optimize-filters-for-hits = false +optimize-filters-for-memory = true whole-key-filtering = true bloom-filter-bits-per-key = 123 block-based-bloom-filter = true +ribbon-filter-above-level = 1 read-amp-bytes-per-bit = 0 compression-per-level = [ "no", From 3867b954fff137f24c26af9350252a6b5cdca6e1 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 20 Apr 2023 17:35:20 +0800 Subject: [PATCH 653/676] raftstore-v2: support renaming encrypted dir (inefficiently) and batch importing data keys (#14583) ref tikv/tikv#12842, ref tikv/tikv#14095, ref tikv/tikv#14097 support renaming encrypted dir (inefficiently) and batch importing data keys Signed-off-by: tabokie --- Cargo.lock | 12 +- components/encryption/Cargo.toml | 1 + components/encryption/export/src/lib.rs | 5 +- components/encryption/src/file_dict_file.rs | 56 ++- components/encryption/src/lib.rs | 119 ++++++ components/encryption/src/manager/mod.rs | 386 ++++++++++++++++-- components/engine_rocks/src/sst.rs | 4 
+- components/engine_test/src/lib.rs | 5 +- components/engine_traits_tests/Cargo.toml | 4 + .../engine_traits_tests/src/checkpoint.rs | 49 +++ components/engine_traits_tests/src/ctor.rs | 40 +- components/engine_traits_tests/src/lib.rs | 1 + components/file_system/src/lib.rs | 76 ---- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/batch/store.rs | 35 +- components/raftstore-v2/src/fsm/peer.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 11 +- .../src/operation/ready/apply_trace.rs | 12 +- .../raftstore-v2/src/operation/ready/mod.rs | 5 +- .../src/operation/ready/snapshot.rs | 31 +- components/raftstore-v2/src/raft/peer.rs | 6 +- components/raftstore-v2/src/raft/storage.rs | 8 +- .../tests/integrations/cluster.rs | 23 +- .../raftstore/src/store/async_io/read.rs | 9 +- components/raftstore/src/store/snap.rs | 39 +- components/server/src/server2.rs | 10 +- components/test_raftstore-v2/src/node.rs | 14 +- components/test_raftstore-v2/src/server.rs | 14 +- components/test_util/src/encryption.rs | 12 +- src/server/engine_factory.rs | 12 +- src/server/raftkv2/node.rs | 5 + src/server/tablet_snap.rs | 3 + tests/failpoints/cases/test_encryption.rs | 8 +- tests/integrations/import/util.rs | 2 +- 34 files changed, 828 insertions(+), 194 deletions(-) create mode 100644 components/engine_traits_tests/src/checkpoint.rs diff --git a/Cargo.lock b/Cargo.lock index da49ade1d6b..7f2a1e91650 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1486,6 +1486,7 @@ dependencies = [ "tikv_util", "tokio", "toml", + "walkdir", ] [[package]] @@ -1630,10 +1631,14 @@ dependencies = [ name = "engine_traits_tests" version = "0.0.1" dependencies = [ + "encryption", + "encryption_export", "engine_test", "engine_traits", + "kvproto", "panic_hook", "tempfile", + "test_util", "tikv_alloc", ] @@ -2921,7 +2926,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = 
"git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" +source = "git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -2940,7 +2945,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" +source = "git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ "bzip2-sys", "cc", @@ -4426,6 +4431,7 @@ dependencies = [ "collections", "concurrency_manager", "crossbeam", + "encryption_export", "engine_rocks", "engine_test", "engine_traits", @@ -4867,7 +4873,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#ce788e498f1d70ab7cbf44dcaca5049bbc05a943" +source = "git+https://github.com/tikv/rust-rocksdb.git#062638a741adcd9074659eb28cbe7f6a676938d5" dependencies = [ "libc 0.2.139", "librocksdb_sys", diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index 94ab0d39957..deac60223a7 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -37,6 +37,7 @@ thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["time", "rt"] } +walkdir = "2" [dev-dependencies] matches = "0.1.8" diff --git a/components/encryption/export/src/lib.rs b/components/encryption/export/src/lib.rs index e29a41cd07e..be86db83082 100644 --- a/components/encryption/export/src/lib.rs +++ b/components/encryption/export/src/lib.rs @@ -14,8 +14,9 @@ use derive_more::Deref; #[cfg(feature = "cloud-aws")] pub use encryption::KmsBackend; pub use encryption::{ - from_engine_encryption_method, Backend, DataKeyManager, DataKeyManagerArgs, DecrypterReader, - EncryptionConfig, Error, FileConfig, Iv, KmsConfig, MasterKeyConfig, Result, + clean_up_dir, 
clean_up_trash, from_engine_encryption_method, trash_dir_all, Backend, + DataKeyManager, DataKeyManagerArgs, DecrypterReader, EncryptionConfig, Error, FileConfig, Iv, + KmsConfig, MasterKeyConfig, Result, }; use encryption::{ DataKeyPair, EncryptedKey, FileBackend, KmsProvider, PlainKey, PlaintextBackend, diff --git a/components/encryption/src/file_dict_file.rs b/components/encryption/src/file_dict_file.rs index 4a2609cacb5..cfa945a5cd7 100644 --- a/components/encryption/src/file_dict_file.rs +++ b/components/encryption/src/file_dict_file.rs @@ -217,10 +217,11 @@ impl FileDictionaryFile { Ok(file_dict) } - /// Append an insert operation to the log file. + /// Append an insert operation to the log file. The record is guaranteed to + /// be persisted if `sync` is set. /// /// Warning: `self.write(file_dict)` must be called before. - pub fn insert(&mut self, name: &str, info: &FileInfo) -> Result<()> { + pub fn insert(&mut self, name: &str, info: &FileInfo, sync: bool) -> Result<()> { self.file_dict.files.insert(name.to_owned(), info.clone()); if self.enable_log { let file = self.append_file.as_mut().unwrap(); @@ -231,12 +232,16 @@ impl FileDictionaryFile { let truncate_num: usize = truncate_num.map_or(0, |c| c.parse().unwrap()); bytes.truncate(truncate_num); file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } Ok(()) }); file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } self.file_size += bytes.len(); self.check_compact()?; @@ -250,13 +255,15 @@ impl FileDictionaryFile { /// Append a remove operation to the log file. /// /// Warning: `self.write(file_dict)` must be called before. 
- pub fn remove(&mut self, name: &str) -> Result<()> { + pub fn remove(&mut self, name: &str, sync: bool) -> Result<()> { self.file_dict.files.remove(name); if self.enable_log { let file = self.append_file.as_mut().unwrap(); let bytes = Self::convert_record_to_bytes(name, LogRecord::Remove)?; file.write_all(&bytes)?; - file.sync_all()?; + if sync { + file.sync_all()?; + } self.removed += 1; self.file_size += bytes.len(); @@ -268,6 +275,13 @@ impl FileDictionaryFile { Ok(()) } + pub fn sync(&mut self) -> Result<()> { + if self.enable_log { + self.append_file.as_mut().unwrap().sync_all()?; + } + Ok(()) + } + /// This function needs to be called after each append operation to check /// if compact is needed. fn check_compact(&mut self) -> Result<()> { @@ -407,9 +421,9 @@ mod tests { let info4 = create_file_info(4, EncryptionMethod::Aes128Ctr); let info5 = create_file_info(3, EncryptionMethod::Aes128Ctr); - file_dict_file.insert("info1", &info1).unwrap(); - file_dict_file.insert("info2", &info2).unwrap(); - file_dict_file.insert("info3", &info3).unwrap(); + file_dict_file.insert("info1", &info1, true).unwrap(); + file_dict_file.insert("info2", &info2, true).unwrap(); + file_dict_file.insert("info3", &info3, true).unwrap(); let file_dict = file_dict_file.recovery().unwrap(); @@ -418,9 +432,9 @@ mod tests { assert_eq!(*file_dict.files.get("info3").unwrap(), info3); assert_eq!(file_dict.files.len(), 3); - file_dict_file.remove("info2").unwrap(); - file_dict_file.remove("info1").unwrap(); - file_dict_file.insert("info2", &info4).unwrap(); + file_dict_file.remove("info2", true).unwrap(); + file_dict_file.remove("info1", true).unwrap(); + file_dict_file.insert("info2", &info4, true).unwrap(); let file_dict = file_dict_file.recovery().unwrap(); assert_eq!(file_dict.files.get("info1"), None); @@ -428,8 +442,8 @@ mod tests { assert_eq!(*file_dict.files.get("info3").unwrap(), info3); assert_eq!(file_dict.files.len(), 2); - file_dict_file.insert("info5", &info5).unwrap(); - 
file_dict_file.remove("info3").unwrap(); + file_dict_file.insert("info5", &info5, true).unwrap(); + file_dict_file.remove("info3", true).unwrap(); let file_dict = file_dict_file.recovery().unwrap(); assert_eq!(file_dict.files.get("info1"), None); @@ -460,7 +474,7 @@ mod tests { .unwrap(); let info = create_file_info(1, EncryptionMethod::Aes256Ctr); - file_dict_file.insert("info", &info).unwrap(); + file_dict_file.insert("info", &info, true).unwrap(); let (_, file_dict) = FileDictionaryFile::open( tempdir.path(), @@ -550,14 +564,14 @@ mod tests { ) .unwrap(); - file_dict.insert("f1", &info1).unwrap(); - file_dict.insert("f2", &info2).unwrap(); - file_dict.insert("f3", &info3).unwrap(); + file_dict.insert("f1", &info1, true).unwrap(); + file_dict.insert("f2", &info2, true).unwrap(); + file_dict.insert("f3", &info3, true).unwrap(); - file_dict.insert("f4", &info4).unwrap(); - file_dict.remove("f3").unwrap(); + file_dict.insert("f4", &info4, true).unwrap(); + file_dict.remove("f3", true).unwrap(); - file_dict.remove("f2").unwrap(); + file_dict.remove("f2", true).unwrap(); } // Try open as v1 file. Should fail. { diff --git a/components/encryption/src/lib.rs b/components/encryption/src/lib.rs index 7f9079ed030..c16142eb30b 100644 --- a/components/encryption/src/lib.rs +++ b/components/encryption/src/lib.rs @@ -10,6 +10,8 @@ mod manager; mod master_key; mod metrics; +use std::{io::ErrorKind, path::Path}; + pub use self::{ config::*, crypter::{ @@ -27,3 +29,120 @@ pub use self::{ Backend, DataKeyPair, EncryptedKey, FileBackend, KmsBackend, KmsProvider, PlaintextBackend, }, }; + +const TRASH_PREFIX: &str = "TRASH-"; + +/// Remove a directory. +/// +/// Rename it before actually removal. 
+#[inline] +pub fn trash_dir_all( + path: impl AsRef, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + let path = path.as_ref(); + let name = match path.file_name() { + Some(n) => n, + None => { + return Err(std::io::Error::new( + ErrorKind::InvalidInput, + "path is invalid", + )); + } + }; + let trash_path = path.with_file_name(format!("{}{}", TRASH_PREFIX, name.to_string_lossy())); + if let Err(e) = file_system::rename(path, &trash_path) { + if e.kind() == ErrorKind::NotFound { + return Ok(()); + } + return Err(e); + } else if let Some(m) = key_manager { + m.remove_dir(path, Some(&trash_path))?; + } + file_system::remove_dir_all(trash_path) +} + +/// When using `trash_dir_all`, it's possible the directory is marked as trash +/// but not being actually deleted after a restart. This function can be used +/// to resume all those removal in the given directory. +#[inline] +pub fn clean_up_trash( + path: impl AsRef, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + for e in file_system::read_dir(path)? { + let e = e?; + let os_fname = e.file_name(); + let fname = os_fname.to_str().unwrap(); + if let Some(original) = fname.strip_prefix(TRASH_PREFIX) { + let original = e.path().with_file_name(original); + if let Some(m) = &key_manager { + m.remove_dir(&original, Some(&e.path()))?; + } + file_system::remove_dir_all(e.path())?; + } + } + Ok(()) +} + +/// Removes all directories with the given prefix. +#[inline] +pub fn clean_up_dir( + path: impl AsRef, + prefix: &str, + key_manager: Option<&DataKeyManager>, +) -> std::io::Result<()> { + for e in file_system::read_dir(path)? 
{ + let e = e?; + let fname = e.file_name().to_str().unwrap().to_owned(); + if fname.starts_with(prefix) { + if let Some(m) = &key_manager { + m.remove_dir(&e.path(), None)?; + } + file_system::remove_dir_all(e.path())?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use tempfile::Builder; + + use super::*; + + #[test] + fn test_trash_dir_all() { + let tmp_dir = Builder::new() + .prefix("test_reserve_space_for_recover") + .tempdir() + .unwrap(); + let data_path = tmp_dir.path(); + let sub_dir0 = data_path.join("sub_dir0"); + let trash_sub_dir0 = data_path.join(format!("{}sub_dir0", TRASH_PREFIX)); + file_system::create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + + trash_dir_all(&sub_dir0, None).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + file_system::create_dir_all(&sub_dir0).unwrap(); + file_system::create_dir_all(&trash_sub_dir0).unwrap(); + trash_dir_all(&sub_dir0, None).unwrap(); + assert!(!sub_dir0.exists()); + assert!(!trash_sub_dir0.exists()); + + clean_up_trash(data_path, None).unwrap(); + + file_system::create_dir_all(&trash_sub_dir0).unwrap(); + assert!(trash_sub_dir0.exists()); + clean_up_trash(data_path, None).unwrap(); + assert!(!trash_sub_dir0.exists()); + + file_system::create_dir_all(&sub_dir0).unwrap(); + assert!(sub_dir0.exists()); + clean_up_dir(data_path, "sub", None).unwrap(); + assert!(!sub_dir0.exists()); + } +} diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index a367ad44df2..be7008a33ae 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -2,7 +2,7 @@ use std::{ collections::hash_map::Entry, - io::{Error as IoError, ErrorKind, Result as IoResult}, + io::{self, Error as IoError, ErrorKind, Result as IoResult}, path::{Path, PathBuf}, sync::{ atomic::{AtomicU64, Ordering}, @@ -21,6 +21,7 @@ use file_system::File; use kvproto::encryptionpb::{DataKey, EncryptionMethod, FileDictionary, 
FileInfo, KeyDictionary}; use protobuf::Message; use tikv_util::{box_err, debug, error, info, sys::thread::StdThreadBuildWrapper, thd_name, warn}; +use tokio::sync::oneshot; use crate::{ config::EncryptionConfig, @@ -36,6 +37,7 @@ use crate::{ const KEY_DICT_NAME: &str = "key.dict"; const FILE_DICT_NAME: &str = "file.dict"; const ROTATE_CHECK_PERIOD: u64 = 600; // 10min +const GENERATE_DATA_KEY_LIMIT: usize = 10; struct Dicts { // Maps data file paths to key id and metadata. This file is stored as plaintext. @@ -193,7 +195,7 @@ impl Dicts { dict.files.get(fname).cloned() } - fn new_file(&self, fname: &str, method: EncryptionMethod) -> Result { + fn new_file(&self, fname: &str, method: EncryptionMethod, sync: bool) -> Result { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let iv = if method != EncryptionMethod::Plaintext { Iv::new_ctr() @@ -212,7 +214,7 @@ impl Dicts { file_dict.files.len() as _ }; - file_dict_file.insert(fname, &file)?; + file_dict_file.insert(fname, &file, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); if method != EncryptionMethod::Plaintext { @@ -228,7 +230,7 @@ impl Dicts { // If the file does not exist, return Ok(()) // In either case the intent that the file not exist is achieved. 
- fn delete_file(&self, fname: &str) -> Result<()> { + fn delete_file(&self, fname: &str, sync: bool) -> Result<()> { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let (file, file_num) = { let mut file_dict = self.file_dict.lock().unwrap(); @@ -246,7 +248,7 @@ impl Dicts { } }; - file_dict_file.remove(fname)?; + file_dict_file.remove(fname, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); if file.method != EncryptionMethod::Plaintext { debug!("delete encrypted file"; "fname" => fname); @@ -256,7 +258,7 @@ impl Dicts { Ok(()) } - fn link_file(&self, src_fname: &str, dst_fname: &str) -> Result> { + fn link_file(&self, src_fname: &str, dst_fname: &str, sync: bool) -> Result> { let mut file_dict_file = self.file_dict_file.lock().unwrap(); let (method, file, file_num) = { let mut file_dict = self.file_dict.lock().unwrap(); @@ -277,7 +279,7 @@ impl Dicts { let file_num = file_dict.files.len() as _; (method, file, file_num) }; - file_dict_file.insert(dst_fname, &file)?; + file_dict_file.insert(dst_fname, &file, sync)?; ENCRYPTION_FILE_NUM_GAUGE.set(file_num); if method != EncryptionMethod::Plaintext { @@ -343,8 +345,7 @@ impl Dicts { let creation_time = duration.as_secs(); // Generate new data key. 
- let generate_limit = 10; - for _ in 0..generate_limit { + for _ in 0..GENERATE_DATA_KEY_LIMIT { let (key_id, key) = generate_data_key(method); if key_id == 0 { // 0 is invalid @@ -365,7 +366,10 @@ impl Dicts { } return Ok(()); } - Err(box_err!("key id collides {} times!", generate_limit)) + Err(box_err!( + "key id collides {} times!", + GENERATE_DATA_KEY_LIMIT + )) } } @@ -385,17 +389,22 @@ fn check_stale_file_exist( "Clean stale file information in file dictionary: {:?}", fname ); - file_dict_file.remove(fname)?; + file_dict_file.remove(fname, true)?; let _ = file_dict.files.remove(fname); } Ok(()) } +enum RotateTask { + Terminate, + Save(oneshot::Sender<()>), +} + fn run_background_rotate_work( dict: Arc, method: EncryptionMethod, master_key: &dyn Backend, - terminal_recv: channel::Receiver<()>, + rx: channel::Receiver, ) { let check_period = std::cmp::min( Duration::from_secs(ROTATE_CHECK_PERIOD), @@ -409,9 +418,17 @@ fn run_background_rotate_work( dict.maybe_rotate_data_key(method, master_key) .expect("Rotating key operation encountered error in the background worker"); }, - recv(terminal_recv) -> _ => { - info!("Key rotate worker has been cancelled."); - break + recv(rx) -> r => { + match r { + Err(_) | Ok(RotateTask::Terminate) => { + info!("Key rotate worker has been cancelled."); + return; + } + Ok(RotateTask::Save(tx)) => { + dict.save_key_dict(master_key).expect("Saving key dict encountered error in the background worker"); + tx.send(()).unwrap(); + } + } }, } } @@ -430,7 +447,7 @@ fn generate_data_key(method: EncryptionMethod) -> (u64, Vec) { pub struct DataKeyManager { dicts: Arc, method: EncryptionMethod, - rotate_terminal: channel::Sender<()>, + rotate_tx: channel::Sender, background_worker: Option>, } @@ -499,7 +516,7 @@ impl DataKeyManager { if info.method != EncryptionMethod::Plaintext { let retain = f(fname); if !retain { - file_dict_file.remove(fname).unwrap(); + file_dict_file.remove(fname, true).unwrap(); } retain } else { @@ -598,7 +615,7 @@ 
impl DataKeyManager { dicts.maybe_rotate_data_key(method, &*master_key)?; let dicts = Arc::new(dicts); let dict_clone = dicts.clone(); - let (rotate_terminal, rx) = channel::bounded(1); + let (rotate_tx, rx) = channel::bounded(1); let background_worker = std::thread::Builder::new() .name(thd_name!("enc:key")) .spawn_wrapper(move || { @@ -610,7 +627,7 @@ impl DataKeyManager { Ok(DataKeyManager { dicts, method, - rotate_terminal, + rotate_tx, background_worker: Some(background_worker), }) } @@ -753,6 +770,50 @@ impl DataKeyManager { Ok(Some(encrypted_file)) } + /// Removes data keys under the directory `logical`. If `physical` is + /// present, if means the `logical` directory is already physically renamed + /// to `physical`. + /// There're two uses of this function: + /// + /// (1) without `physical`: `remove_dir` is called before + /// `fs::remove_dir_all`. User must guarantee that this directory won't be + /// read again even if the removal fails or panics. + /// + /// (2) with `physical`: Use `fs::rename` to rename the directory to trash. + /// Then `remove_dir` with `physical` set to the trash directory name. + /// Finally remove the trash directory. This is the safest way to delete a + /// directory. 
+ pub fn remove_dir(&self, logical: &Path, physical: Option<&Path>) -> IoResult<()> { + let scan = physical.unwrap_or(logical); + debug_assert!(scan.is_dir()); + if !scan.exists() { + return Ok(()); + } + let mut iter = walkdir::WalkDir::new(scan).into_iter().peekable(); + while let Some(e) = iter.next() { + let e = e?; + if e.path_is_symlink() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected symbolic link: {}", e.path().display()), + )); + } + let fname = e.path().to_str().unwrap(); + let sync = iter.peek().is_none(); + if let Some(p) = physical { + let sub = fname + .strip_prefix(p.to_str().unwrap()) + .unwrap() + .trim_start_matches('/'); + self.dicts + .delete_file(logical.join(sub).to_str().unwrap(), sync)?; + } else { + self.dicts.delete_file(fname, sync)?; + } + } + Ok(()) + } + /// Return which method this manager is using. pub fn encryption_method(&self) -> engine_traits::EncryptionMethod { crypter::to_engine_encryption_method(self.method) @@ -761,7 +822,7 @@ impl DataKeyManager { impl Drop for DataKeyManager { fn drop(&mut self) { - if let Err(e) = self.rotate_terminal.send(()) { + if let Err(e) = self.rotate_tx.send(RotateTask::Terminate) { info!("failed to terminate background rotation, are we shutting down?"; "err" => %e); } if let Some(Err(e)) = self.background_worker.take().map(|w| w.join()) { @@ -793,7 +854,7 @@ impl EncryptionKeyManager for DataKeyManager { fn new_file(&self, fname: &str) -> IoResult { let (_, data_key) = self.dicts.current_data_key(); let key = data_key.get_key().to_owned(); - let file = self.dicts.new_file(fname, self.method)?; + let file = self.dicts.new_file(fname, self.method, true)?; let encrypted_file = FileEncryptionInfo { key, method: crypter::to_engine_encryption_method(file.method), @@ -806,16 +867,166 @@ impl EncryptionKeyManager for DataKeyManager { fail_point!("key_manager_fails_before_delete_file", |_| IoResult::Err( std::io::ErrorKind::Other.into() )); - self.dicts.delete_file(fname)?; + 
// `RemoveDir` is not managed, but RocksDB may use `RenameFile` on a directory, + // which internally calls `LinkFile` and `DeleteFile`. + let path = Path::new(fname); + if path.is_dir() { + let mut iter = walkdir::WalkDir::new(path).into_iter().peekable(); + while let Some(e) = iter.next() { + self.dicts + .delete_file(e?.path().to_str().unwrap(), iter.peek().is_none())?; + } + } else { + self.dicts.delete_file(fname, true)?; + } Ok(()) } fn link_file(&self, src_fname: &str, dst_fname: &str) -> IoResult<()> { - self.dicts.link_file(src_fname, dst_fname)?; + let src_path = Path::new(src_fname); + let dst_path = Path::new(dst_fname); + if src_path.is_dir() { + let mut iter = walkdir::WalkDir::new(src_path) + .into_iter() + .filter(|e| e.as_ref().map_or(true, |e| !e.path().is_dir())) + .peekable(); + while let Some(e) = iter.next() { + let e = e?; + if e.path_is_symlink() { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("unexpected symbolic link: {}", e.path().display()), + )); + } + let sub_path = e.path().strip_prefix(src_path).unwrap(); + let src = e.path().to_str().unwrap(); + let dst_path = dst_path.join(sub_path); + let dst = dst_path.to_str().unwrap(); + self.dicts.link_file(src, dst, iter.peek().is_none())?; + } + } else { + self.dicts.link_file(src_fname, dst_fname, true)?; + } Ok(()) } } +/// An RAII-style importer of data keys. It automatically creates data key that +/// doesn't exist locally. It synchronizes log file in batch. It automatically +/// reverts changes if caller aborts. +pub struct DataKeyImporter<'a> { + manager: &'a DataKeyManager, + // Added file names. + file_additions: Vec, + // Added key ids. 
+ key_additions: Vec, + committed: bool, +} + +#[allow(dead_code)] +impl<'a> DataKeyImporter<'a> { + pub fn new(manager: &'a DataKeyManager) -> Self { + Self { + manager, + file_additions: Vec::new(), + key_additions: Vec::new(), + committed: false, + } + } + + pub fn add(&mut self, fname: &str, iv: Vec, new_key: DataKey) -> Result<()> { + let method = new_key.method; + let mut key_id = None; + { + let mut key_dict = self.manager.dicts.key_dict.lock().unwrap(); + for (id, data_key) in &key_dict.keys { + if data_key.key == new_key.key { + key_id = Some(*id); + } + } + if key_id.is_none() { + for _ in 0..GENERATE_DATA_KEY_LIMIT { + // Match `generate_data_key`. + use rand::{rngs::OsRng, RngCore}; + let id = OsRng.next_u64(); + if let Entry::Vacant(e) = key_dict.keys.entry(id) { + key_id = Some(id); + e.insert(new_key); + self.key_additions.push(id); + break; + } + } + if key_id.is_none() { + return Err(box_err!( + "key id collides {} times!", + GENERATE_DATA_KEY_LIMIT + )); + } + } + } + + let file = FileInfo { + iv, + key_id: key_id.unwrap(), + method, + ..Default::default() + }; + let mut file_dict_file = self.manager.dicts.file_dict_file.lock().unwrap(); + let file_num = { + let mut file_dict = self.manager.dicts.file_dict.lock().unwrap(); + if let Entry::Vacant(e) = file_dict.files.entry(fname.to_owned()) { + e.insert(file.clone()); + } else { + return Err(box_err!("file name collides with existing file: {}", fname)); + } + file_dict.files.len() as _ + }; + file_dict_file.insert(fname, &file, false)?; + self.file_additions.push(fname.to_owned()); + ENCRYPTION_FILE_NUM_GAUGE.set(file_num); + Ok(()) + } + + pub fn commit(mut self) -> Result<()> { + let (tx, rx) = oneshot::channel(); + if !self.key_additions.is_empty() { + self.manager.rotate_tx.send(RotateTask::Save(tx)).unwrap(); + rx.blocking_recv().unwrap(); + } + if !self.file_additions.is_empty() { + self.manager.dicts.file_dict_file.lock().unwrap().sync()?; + } + self.committed = true; + Ok(()) + } + + pub fn 
rollback(&mut self) -> Result<()> { + assert!(!self.committed); + let mut iter = self.file_additions.drain(..).peekable(); + while let Some(f) = iter.next() { + self.manager.dicts.delete_file(&f, iter.peek().is_none())?; + } + for key_id in self.key_additions.drain(..) { + let mut key_dict = self.manager.dicts.key_dict.lock().unwrap(); + key_dict.keys.remove(&key_id); + } + let (tx, rx) = oneshot::channel(); + self.manager.rotate_tx.send(RotateTask::Save(tx)).unwrap(); + rx.blocking_recv().unwrap(); + Ok(()) + } +} + +impl<'a> Drop for DataKeyImporter<'a> { + fn drop(&mut self) { + if !self.committed { + if let Err(e) = self.rollback() { + warn!("failed to rollback imported data keys"; "err" => ?e); + } + } + } +} + #[cfg(test)] mod tests { use engine_traits::EncryptionMethod as EtEncryptionMethod; @@ -864,7 +1075,7 @@ mod tests { rotation_period: Duration::from_secs(60), enable_file_dictionary_log: true, file_dictionary_rewrite_threshold: 2, - dict_path: tmp_dir.path().as_os_str().to_str().unwrap().to_string(), + dict_path: tmp_dir.path().to_str().unwrap().to_string(), } } @@ -1473,4 +1684,131 @@ mod tests { } } } + + #[test] + fn test_rename_dir() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let manager = new_key_manager_def(&tmp_dir, Some(EncryptionMethod::Aes192Ctr)).unwrap(); + let subdir = tmp_dir.path().join("foo"); + std::fs::create_dir(&subdir).unwrap(); + let file_a = manager + .new_file(subdir.join("a").to_str().unwrap()) + .unwrap(); + File::create(subdir.join("a")).unwrap(); + let file_b = manager + .new_file(subdir.join("b").to_str().unwrap()) + .unwrap(); + File::create(subdir.join("b")).unwrap(); + + let dstdir = tmp_dir.path().join("bar"); + manager + .link_file(subdir.to_str().unwrap(), dstdir.to_str().unwrap()) + .unwrap(); + manager.delete_file(subdir.to_str().unwrap()).unwrap(); + + assert_eq!( + manager + .get_file(dstdir.join("a").to_str().unwrap()) + .unwrap(), + file_a + ); + 
assert_eq!( + manager + .get_file_exists(subdir.join("a").to_str().unwrap()) + .unwrap(), + None + ); + + assert_eq!( + manager + .get_file(dstdir.join("b").to_str().unwrap()) + .unwrap(), + file_b + ); + assert_eq!( + manager + .get_file_exists(subdir.join("b").to_str().unwrap()) + .unwrap(), + None + ); + } + + #[test] + fn test_import_keys() { + let _guard = LOCK_FOR_GAUGE.lock().unwrap(); + let tmp_dir = tempfile::TempDir::new().unwrap(); + let manager = new_key_manager_def(&tmp_dir, Some(EncryptionMethod::Aes192Ctr)).unwrap(); + + let mut importer = DataKeyImporter::new(&manager); + let file0 = manager.new_file("0").unwrap(); + + // conflict + importer + .add("0", file0.iv.clone(), DataKey::default()) + .unwrap_err(); + // same key + importer + .add( + "1", + file0.iv.clone(), + DataKey { + key: file0.key.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // different key + let (_, key2) = generate_data_key(EncryptionMethod::Aes192Ctr); + importer + .add( + "2", + Iv::new_ctr().as_slice().to_owned(), + DataKey { + key: key2.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + + assert_eq!(manager.get_file("0").unwrap(), file0); + assert_eq!(manager.get_file("1").unwrap(), file0); + assert_eq!(manager.get_file("2").unwrap().key, key2); + + drop(importer); + assert_eq!(manager.get_file_exists("1").unwrap(), None); + assert_eq!(manager.get_file_exists("2").unwrap(), None); + + let mut importer = DataKeyImporter::new(&manager); + // same key + importer + .add( + "1", + file0.iv.clone(), + DataKey { + key: file0.key.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // different key + importer + .add( + "2", + Iv::new_ctr().as_slice().to_owned(), + DataKey { + key: key2.clone(), + method: EncryptionMethod::Aes192Ctr, + ..Default::default() + }, + ) + .unwrap(); + // importer is dropped here. 
+ importer.commit().unwrap(); + assert_eq!(manager.get_file("1").unwrap(), file0); + assert_eq!(manager.get_file("2").unwrap().key, key2); + } } diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 0518dd7feb5..85c30d74a87 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -376,7 +376,7 @@ mod tests { let mut writer = RocksSstWriterBuilder::new() .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let sst_file = writer.finish().unwrap(); @@ -391,7 +391,7 @@ mod tests { .set_in_memory(true) .set_cf(CF_DEFAULT) .set_db(&engine) - .build(p.as_os_str().to_str().unwrap()) + .build(p.to_str().unwrap()) .unwrap(); writer.put(k, v).unwrap(); let mut buf = vec![]; diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index bc8b2f8baf2..932a1bcb51a 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -132,6 +132,9 @@ pub mod kv { let tombstone_path = path.with_extension(TOMBSTONE_SUFFIX); let _ = std::fs::remove_dir_all(&tombstone_path); std::fs::rename(path, &tombstone_path)?; + if let Some(m) = &self.db_opt.key_manager { + m.remove_dir(path, Some(&tombstone_path))?; + } std::fs::remove_dir_all(tombstone_path)?; Ok(()) } @@ -207,7 +210,7 @@ pub mod ctor { #[derive(Clone, Default)] pub struct DbOptions { - key_manager: Option>, + pub(crate) key_manager: Option>, rate_limiter: Option>, state_storage: Option>, enable_multi_batch_write: bool, diff --git a/components/engine_traits_tests/Cargo.toml b/components/engine_traits_tests/Cargo.toml index 301a7ee5d76..516135a86d2 100644 --- a/components/engine_traits_tests/Cargo.toml +++ b/components/engine_traits_tests/Cargo.toml @@ -25,8 +25,12 @@ test-engines-panic = [ ] [dependencies] +encryption = { workspace = true } +encryption_export = { workspace = true } engine_test = { workspace = true } 
engine_traits = { workspace = true } +kvproto = { workspace = true } panic_hook = { workspace = true } tempfile = "3.0" tikv_alloc = { workspace = true } +test_util = { workspace = true } diff --git a/components/engine_traits_tests/src/checkpoint.rs b/components/engine_traits_tests/src/checkpoint.rs new file mode 100644 index 00000000000..ad85b8f85ed --- /dev/null +++ b/components/engine_traits_tests/src/checkpoint.rs @@ -0,0 +1,49 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! Checkpoint tests + +use std::sync::Arc; + +use encryption_export::data_key_manager_from_config; +use engine_test::{ + ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, + kv::KvTestEngine, +}; +use engine_traits::{ + Checkpointable, Checkpointer, KvEngine, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT, +}; + +use super::tempdir; + +#[test] +fn test_encrypted_checkpoint() { + let dir = tempdir(); + let root_path = dir.path(); + + let encryption_cfg = test_util::new_file_security_config(root_path); + let key_manager = Arc::new( + data_key_manager_from_config(&encryption_cfg, root_path.to_str().unwrap()) + .unwrap() + .unwrap(), + ); + + let mut db_opts = DbOptions::default(); + db_opts.set_key_manager(Some(key_manager)); + let cf_opts: Vec<_> = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + + let path1 = root_path.join("1").to_str().unwrap().to_owned(); + let db1 = KvTestEngine::new_kv_engine_opt(&path1, db_opts.clone(), cf_opts.clone()).unwrap(); + db1.put(b"foo", b"bar").unwrap(); + db1.sync().unwrap(); + + let path2 = root_path.join("2"); + let mut checkpointer = db1.new_checkpointer().unwrap(); + checkpointer.create_at(&path2, None, 0).unwrap(); + let db2 = + KvTestEngine::new_kv_engine_opt(path2.to_str().unwrap(), db_opts.clone(), cf_opts.clone()) + .unwrap(); + assert_eq!( + db2.get_value_cf(CF_DEFAULT, b"foo").unwrap().unwrap(), + b"bar" + ); +} diff --git a/components/engine_traits_tests/src/ctor.rs 
b/components/engine_traits_tests/src/ctor.rs index ab1eea4d958..dce6a64dff2 100644 --- a/components/engine_traits_tests/src/ctor.rs +++ b/components/engine_traits_tests/src/ctor.rs @@ -4,11 +4,12 @@ use std::fs; +use encryption_export::data_key_manager_from_config; use engine_test::{ ctor::{CfOptions, DbOptions, KvEngineConstructorExt}, kv::KvTestEngine, }; -use engine_traits::{KvEngine, SyncMutable, ALL_CFS}; +use engine_traits::{EncryptionKeyManager, KvEngine, Peekable, SyncMutable, ALL_CFS, CF_DEFAULT}; use super::tempdir; @@ -90,3 +91,40 @@ fn new_engine_opt_readonly_dir() { err.unwrap_err(); } + +#[test] +fn new_engine_opt_renamed_dir() { + use std::sync::Arc; + let dir = tempdir(); + let root_path = dir.path(); + + let encryption_cfg = test_util::new_file_security_config(root_path); + let key_manager = Arc::new( + data_key_manager_from_config(&encryption_cfg, root_path.to_str().unwrap()) + .unwrap() + .unwrap(), + ); + + let mut db_opts = DbOptions::default(); + db_opts.set_key_manager(Some(key_manager.clone())); + let cf_opts: Vec<_> = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + + let path = root_path.join("missing").to_str().unwrap().to_owned(); + { + let db = KvTestEngine::new_kv_engine_opt(&path, db_opts.clone(), cf_opts.clone()).unwrap(); + db.put(b"foo", b"bar").unwrap(); + db.sync().unwrap(); + } + let new_path = root_path.join("new").to_str().unwrap().to_owned(); + key_manager.link_file(&path, &new_path).unwrap(); + fs::rename(&path, &new_path).unwrap(); + key_manager.delete_file(&path).unwrap(); + { + let db = + KvTestEngine::new_kv_engine_opt(&new_path, db_opts.clone(), cf_opts.clone()).unwrap(); + assert_eq!( + db.get_value_cf(CF_DEFAULT, b"foo").unwrap().unwrap(), + b"bar" + ); + } +} diff --git a/components/engine_traits_tests/src/lib.rs b/components/engine_traits_tests/src/lib.rs index d9b6af12f09..1d9b6b4fa53 100644 --- a/components/engine_traits_tests/src/lib.rs +++ b/components/engine_traits_tests/src/lib.rs @@ -40,6 +40,7 
@@ mod basic_read_write; mod cf_names; +mod checkpoint; mod ctor; mod delete_range; mod iterator; diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index a3701c6ecac..91e0a35da80 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -449,47 +449,6 @@ pub fn reserve_space_for_recover>(data_dir: P, file_size: u64) -> } } -const TRASH_PREFIX: &str = "TRASH-"; - -/// Remove a directory. -/// -/// Rename it before actually removal. -#[inline] -pub fn trash_dir_all(path: impl AsRef) -> io::Result<()> { - let path = path.as_ref(); - let name = match path.file_name() { - Some(n) => n, - None => return Err(io::Error::new(ErrorKind::InvalidInput, "path is invalid")), - }; - let trash_path = path.with_file_name(format!("{}{}", TRASH_PREFIX, name.to_string_lossy())); - if let Err(e) = rename(path, &trash_path) { - if e.kind() == ErrorKind::NotFound { - return Ok(()); - } - return Err(e); - } - remove_dir_all(trash_path) -} - -/// When using `trash_dir_all`, it's possible the directory is marked as trash -/// but not being actually deleted after a restart. This function can be used -/// to resume all those removal in the given directory. -#[inline] -pub fn clean_up_trash(path: impl AsRef) -> io::Result<()> { - clean_up_dir(path, TRASH_PREFIX) -} - -/// clean up all files starts with the given prefix in the given directory. -pub fn clean_up_dir(path: impl AsRef, prefix: &str) -> io::Result<()> { - for e in read_dir(path)? 
{ - let e = e?; - if e.file_name().to_string_lossy().starts_with(prefix) { - remove_dir_all(e.path())?; - } - } - Ok(()) -} - #[cfg(test)] mod tests { use std::{io::Write, iter}; @@ -656,39 +615,4 @@ mod tests { reserve_space_for_recover(data_path, 0).unwrap(); assert!(!file.exists()); } - - #[test] - fn test_trash_dir_all() { - let tmp_dir = Builder::new() - .prefix("test_reserve_space_for_recover") - .tempdir() - .unwrap(); - let data_path = tmp_dir.path(); - let sub_dir0 = data_path.join("sub_dir0"); - let trash_sub_dir0 = data_path.join(format!("{}sub_dir0", TRASH_PREFIX)); - create_dir_all(&sub_dir0).unwrap(); - assert!(sub_dir0.exists()); - - trash_dir_all(&sub_dir0).unwrap(); - assert!(!sub_dir0.exists()); - assert!(!trash_sub_dir0.exists()); - - create_dir_all(&sub_dir0).unwrap(); - create_dir_all(&trash_sub_dir0).unwrap(); - trash_dir_all(&sub_dir0).unwrap(); - assert!(!sub_dir0.exists()); - assert!(!trash_sub_dir0.exists()); - - clean_up_trash(data_path).unwrap(); - - create_dir_all(&trash_sub_dir0).unwrap(); - assert!(trash_sub_dir0.exists()); - clean_up_trash(data_path).unwrap(); - assert!(!trash_sub_dir0.exists()); - - create_dir_all(&sub_dir0).unwrap(); - assert!(sub_dir0.exists()); - clean_up_dir(data_path, "sub").unwrap(); - assert!(!sub_dir0.exists()); - } } diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index ad13ea5ab74..84daa4c40b5 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -35,6 +35,7 @@ causal_ts = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } crossbeam = "0.8" +encryption_export = { workspace = true } engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1e72341d651..a9e3c223943 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ 
b/components/raftstore-v2/src/batch/store.rs @@ -2,6 +2,7 @@ use std::{ ops::{Deref, DerefMut}, + path::Path, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, @@ -16,6 +17,7 @@ use causal_ts::CausalTsProviderImpl; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use crossbeam::channel::TrySendError; +use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType, WithIoType}; use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; @@ -92,6 +94,7 @@ pub struct StoreContext { pub global_stat: GlobalStoreStat, pub store_stat: LocalStoreStat, pub sst_importer: Arc, + pub key_manager: Option>, } impl StoreContext { @@ -288,6 +291,7 @@ struct StorePollerBuilder { snap_mgr: TabletSnapManager, global_stat: GlobalStoreStat, sst_importer: Arc, + key_manager: Option>, } impl StorePollerBuilder { @@ -305,6 +309,7 @@ impl StorePollerBuilder { snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, sst_importer: Arc, + key_manager: Option>, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -333,6 +338,7 @@ impl StorePollerBuilder { coprocessor_host, global_stat, sst_importer, + key_manager, } } @@ -364,8 +370,13 @@ impl StorePollerBuilder { } meta.set_region(storage.region(), storage.is_initialized(), &self.logger); - let (sender, peer_fsm) = - PeerFsm::new(&cfg, &self.tablet_registry, &self.snap_mgr, storage)?; + let (sender, peer_fsm) = PeerFsm::new( + &cfg, + &self.tablet_registry, + self.key_manager.as_deref(), + &self.snap_mgr, + storage, + )?; meta.region_read_progress .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); @@ -383,13 +394,22 @@ impl StorePollerBuilder { Ok(regions) } + #[inline] + fn remove_dir(&self, p: &Path) -> Result<()> { + if let Some(m) = &self.key_manager { + m.remove_dir(p, None)?; + } + file_system::remove_dir_all(p)?; + Ok(()) + } + fn clean_up_tablets(&self, 
peers: &HashMap>) -> Result<()> { for entry in file_system::read_dir(self.tablet_registry.tablet_root())? { let entry = entry?; let path = entry.path(); if path.extension().map_or(false, |s| s == "tmp") { // The directory may be generated by an aborted checkpoint. - file_system::remove_dir_all(&path)?; + self.remove_dir(&path)?; continue; } let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; @@ -402,20 +422,20 @@ impl StorePollerBuilder { None => { // The peer is either destroyed or not created yet. It will be // recovered by leader heartbeats. - file_system::remove_dir_all(&path)?; + self.remove_dir(&path)?; continue; } }; // Valid split tablet should be installed during recovery. if prefix == SPLIT_PREFIX { - file_system::remove_dir_all(&path)?; + self.remove_dir(&path)?; continue; } else if prefix == MERGE_IN_PROGRESS_PREFIX { continue; } else if prefix.is_empty() { // Stale split data can be deleted. if fsm.peer().storage().tablet_index() > tablet_index { - file_system::remove_dir_all(&path)?; + self.remove_dir(&path)?; } } else { debug_assert!(false, "unexpected tablet prefix: {}", path.display()); @@ -461,6 +481,7 @@ where global_stat: self.global_stat.clone(), store_stat: self.global_stat.local(), sst_importer: self.sst_importer.clone(), + key_manager: self.key_manager.clone(), }; poll_ctx.update_ticks_timeout(); let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); @@ -554,6 +575,7 @@ impl StoreSystem { background: Worker, pd_worker: LazyWorker, sst_importer: Arc, + key_manager: Option>, ) -> Result<()> where T: Transport + 'static, @@ -660,6 +682,7 @@ impl StoreSystem { snap_mgr, coprocessor_host, sst_importer, + key_manager, ); self.workers = Some(workers); self.schedulers = Some(schedulers); diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index d2506d0dd21..3af66c4f81c 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ 
b/components/raftstore-v2/src/fsm/peer.rs @@ -6,6 +6,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; +use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{errorpb, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{Config, TabletSnapManager, Transport}; @@ -40,10 +41,11 @@ impl PeerFsm { pub fn new( cfg: &Config, tablet_registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result> { - let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; + let peer = Peer::new(cfg, tablet_registry, key_manager, snap_mgr, storage)?; info!(peer.logger, "create peer"; "raft_state" => ?peer.storage().raft_state(), "apply_state" => ?peer.storage().apply_state(), diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 65e7ab7906a..9d3a32f8f72 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -393,8 +393,15 @@ impl Store { ctx.schedulers.read.clone(), &ctx.logger, ) - .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, &ctx.snap_mgr, s)) - { + .and_then(|s| { + PeerFsm::new( + &ctx.cfg, + &ctx.tablet_registry, + ctx.key_manager.as_deref(), + &ctx.snap_mgr, + s, + ) + }) { Ok(p) => p, res => { error!(self.logger(), "failed to create peer"; "region_id" => region_id, "peer_id" => to_peer.id, "err" => ?res.err()); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 90b7930c368..f1a65fc1768 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -29,6 +29,7 @@ use std::{cmp, sync::Mutex}; +use encryption_export::DataKeyManager; use engine_traits::{ data_cf_offset, ApplyProgress, KvEngine, RaftEngine, 
RaftLogBatch, TabletRegistry, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, @@ -430,7 +431,12 @@ impl Storage { /// Region state is written before actually moving data. It's possible that /// the tablet is missing after restart. We need to move the data again /// after being restarted. - pub fn recover_tablet(&self, registry: &TabletRegistry, snap_mgr: &TabletSnapManager) { + pub fn recover_tablet( + &self, + registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, + snap_mgr: &TabletSnapManager, + ) { let tablet_index = self.region_state().get_tablet_index(); if tablet_index == 0 { // It's an uninitialized peer, nothing to recover. @@ -445,7 +451,7 @@ impl Storage { if tablet_index == RAFT_INIT_LOG_INDEX { // Its data may come from split or snapshot. Try split first. let split_path = temp_split_path(registry, region_id); - if install_tablet(registry, &split_path, region_id, tablet_index) { + if install_tablet(registry, key_manager, &split_path, region_id, tablet_index) { return; } } @@ -460,7 +466,7 @@ impl Storage { self.entry_storage().truncated_term(), tablet_index, ); - if install_tablet(registry, &snap_path, region_id, tablet_index) { + if install_tablet(registry, key_manager, &snap_path, region_id, tablet_index) { return; } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index f37791638d5..f63d9c97b86 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -1021,8 +1021,9 @@ impl Storage { if let Err(e) = self.apply_snapshot( ready.snapshot(), write_task, - ctx.snap_mgr.clone(), - ctx.tablet_registry.clone(), + &ctx.snap_mgr, + &ctx.tablet_registry, + ctx.key_manager.as_ref(), ) { SNAP_COUNTER.apply.fail.inc(); error!(self.logger(),"failed to apply snapshot";"error" => ?e) diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs 
b/components/raftstore-v2/src/operation/ready/snapshot.rs index 12b4a97e710..5547df7d580 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -29,7 +29,11 @@ use std::{ }, }; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, ALL_CFS}; +use encryption_export::DataKeyManager; +use engine_traits::{ + EncryptionKeyManager, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, + ALL_CFS, +}; use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; use raft::{eraftpb::Snapshot, StateRole}; @@ -136,6 +140,7 @@ pub fn recv_snap_path( /// Returns false if `source` doesn't exist. pub fn install_tablet( registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, source: &Path, region_id: u64, tablet_index: u64, @@ -151,7 +156,14 @@ pub fn install_tablet( source.display(), target_path.display() ); + if let Some(m) = &key_manager { + m.link_file(source.to_str().unwrap(), target_path.to_str().unwrap()) + .unwrap(); + } if let Err(e) = fs::rename(source, &target_path) { + if let Some(m) = &key_manager { + m.delete_file(target_path.to_str().unwrap()).unwrap(); + } panic!( "failed to rename tablet {} => {}: {:?}", source.display(), @@ -159,6 +171,9 @@ pub fn install_tablet( e ); } + if let Some(m) = &key_manager { + m.delete_file(source.to_str().unwrap()).unwrap(); + } true } @@ -544,8 +559,9 @@ impl Storage { &mut self, snap: &Snapshot, task: &mut WriteTask, - snap_mgr: TabletSnapManager, - reg: TabletRegistry, + snap_mgr: &TabletSnapManager, + reg: &TabletRegistry, + key_manager: Option<&Arc>, ) -> Result<()> { let region_id = self.region().get_id(); let peer_id = self.peer().get_id(); @@ -632,10 +648,10 @@ impl Storage { Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { lb.put_dirty_mark(region_id, last_index, true).unwrap(); self.set_has_dirty_data(true); - (temp_split_path(®, region_id), false) 
+ (temp_split_path(reg, region_id), false) } si => ( - recv_snap_path(&snap_mgr, region_id, peer_id, last_term, last_index), + recv_snap_path(snap_mgr, region_id, peer_id, last_term, last_index), si.is_some(), ), }; @@ -643,8 +659,10 @@ impl Storage { let logger = self.logger().clone(); // The snapshot require no additional processing such as ingest them to DB, but // it should load it into the factory after it persisted. + let reg = reg.clone(); + let key_manager = key_manager.cloned(); let hook = move || { - if !install_tablet(®, &path, region_id, last_index) { + if !install_tablet(®, key_manager.as_deref(), &path, region_id, last_index) { slog_panic!( logger, "failed to install tablet"; @@ -654,6 +672,7 @@ impl Storage { } if clean_split { let path = temp_split_path(®, region_id); + // TODO(tabokie) let _ = fs::remove_dir_all(path); } }; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index d35dfe22184..e11c96922cd 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -7,6 +7,7 @@ use std::{ }; use collections::{HashMap, HashSet}; +use encryption_export::DataKeyManager; use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; @@ -128,6 +129,7 @@ impl Peer { pub fn new( cfg: &Config, tablet_registry: &TabletRegistry, + key_manager: Option<&DataKeyManager>, snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result { @@ -149,7 +151,9 @@ impl Peer { // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. if tablet_index != 0 { - raft_group.store().recover_tablet(tablet_registry, snap_mgr); + raft_group + .store() + .recover_tablet(tablet_registry, key_manager, snap_mgr); let mut ctx = TabletContext::new(®ion, Some(tablet_index)); ctx.flush_state = Some(flush_state.clone()); // TODO: Perhaps we should stop create the tablet automatically. 
diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index ee9be348c89..7edf8c02f09 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -398,7 +398,8 @@ mod tests { fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); + let mgr = + TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap(), None).unwrap(); let engines = engine_test::new_temp_engine(&path); let raft_engine = engines.raft.clone(); let mut wb = raft_engine.log_batch(10); @@ -437,7 +438,7 @@ mod tests { .unwrap(); let snapshot = new_empty_snapshot(region.clone(), snap_index, snap_term, false); let mut task = WriteTask::new(region.get_id(), 5, 1); - s.apply_snapshot(&snapshot, &mut task, mgr, reg.clone()) + s.apply_snapshot(&snapshot, &mut task, &mgr, ®, None) .unwrap(); // Add more entries to check if old entries are cleared. If not, it should panic // with memtable hole when using raft engine. 
@@ -481,7 +482,8 @@ mod tests { write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); + let mgr = + TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap(), None).unwrap(); // building a tablet factory let ops = DbOptions::default(); let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 4bd0cef8846..83cf3646b9b 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -262,8 +262,19 @@ impl RunningState { causal_ts_provider: Option>, logger: &Logger, ) -> (TestRouter, Self) { + // TODO(tabokie): Enable encryption by default. (after snapshot encryption) + // let encryption_cfg = test_util::new_file_security_config(path); + // let key_manager = Some(Arc::new( + // data_key_manager_from_config(&encryption_cfg, path.to_str().unwrap()) + // .unwrap() + // .unwrap(), + // )); + let key_manager = None; + + let mut opts = engine_test::ctor::RaftDbOptions::default(); + opts.set_key_manager(key_manager.clone()); let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) + engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), Some(opts)) .unwrap(); let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); @@ -286,6 +297,7 @@ impl RunningState { raft_engine.clone(), router.clone(), ))); + db_opt.set_key_manager(key_manager.clone()); let factory = Box::new(TestTabletFactory::new(db_opt, cf_opts)); let registry = TabletRegistry::new(factory, path.join("tablets")).unwrap(); if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { @@ -302,14 +314,18 @@ impl 
RunningState { let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); - let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); + let snap_mgr = TabletSnapManager::new( + path.join("tablets_snap").to_str().unwrap(), + key_manager.clone(), + ) + .unwrap(); let coprocessor_host = CoprocessorHost::new(router.store_router().clone(), cop_cfg.value().clone()); let importer = Arc::new( SstImporter::new( &Default::default(), path.join("importer"), - None, + key_manager.clone(), ApiVersion::V1, ) .unwrap(), @@ -336,6 +352,7 @@ impl RunningState { background.clone(), pd_worker, importer, + key_manager, ) .unwrap(); diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index ced7b0f4418..006fe0eb24c 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -122,14 +122,17 @@ impl ReadRunner { fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { let checkpointer_path = self.snap_mgr().tablet_gen_path(snap_key); - if checkpointer_path.as_path().exists() { + if checkpointer_path.exists() { + // TODO: make `delete_snapshot` return error so we can use it here. // Remove the old checkpoint directly. - file_system::trash_dir_all(&checkpointer_path)?; + encryption::trash_dir_all( + &checkpointer_path, + self.snap_mgr().key_manager().as_deref(), + )?; } // Here not checkpoint to a temporary directory first, the temporary directory // logic already implemented in rocksdb. 
let mut checkpointer = tablet.new_checkpointer()?; - checkpointer.create_at(checkpointer_path.as_path(), None, 0)?; Ok(()) } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 68d3c7fba51..69d948e3ae4 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1915,7 +1915,7 @@ impl SnapManagerBuilder { assert!(!path.is_empty()); let mut path_v2 = path.clone(); path_v2.push_str("_v2"); - let tablet_snap_mgr = TabletSnapManager::new(&path_v2).unwrap(); + let tablet_snap_mgr = TabletSnapManager::new(&path_v2, self.key_manager.clone()).unwrap(); let mut snapshot = SnapManager { core: SnapManagerCore { @@ -1996,12 +1996,16 @@ impl Drop for ReceivingGuard<'_> { pub struct TabletSnapManager { // directory to store snapfile. base: PathBuf, + key_manager: Option>, receiving: Arc>>, stats: Arc>>, } impl TabletSnapManager { - pub fn new>(path: T) -> io::Result { + pub fn new>( + path: T, + key_manager: Option>, + ) -> io::Result { let path = path.into(); if !path.exists() { file_system::create_dir_all(&path)?; @@ -2012,10 +2016,11 @@ impl TabletSnapManager { format!("{} should be a directory", path.display()), )); } - file_system::clean_up_dir(&path, SNAP_GEN_PREFIX)?; - file_system::clean_up_trash(&path)?; + encryption::clean_up_dir(&path, SNAP_GEN_PREFIX, key_manager.as_deref())?; + encryption::clean_up_trash(&path, key_manager.as_deref())?; Ok(Self { base: path, + key_manager, receiving: Arc::default(), stats: Arc::default(), }) @@ -2073,16 +2078,17 @@ impl TabletSnapManager { pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { let path = self.tablet_gen_path(key); - if path.exists() && let Err(e) = file_system::trash_dir_all(&path) { - error!( - "delete snapshot failed"; - "path" => %path.display(), - "err" => ?e, - ); - false - } else { - true + if path.exists() { + if let Err(e) = encryption::trash_dir_all(&path, self.key_manager.as_deref()) { + error!( + "delete snapshot 
failed"; + "path" => %path.display(), + "err" => ?e, + ); + return false; + } } + true } pub fn total_snap_size(&self) -> Result { @@ -2135,6 +2141,11 @@ impl TabletSnapManager { key, }) } + + #[inline] + pub fn key_manager(&self) -> &Option> { + &self.key_manager + } } #[cfg(test)] @@ -3123,7 +3134,7 @@ pub mod tests { .tempdir() .unwrap(); let start = Instant::now(); - let mgr = TabletSnapManager::new(snap_dir.path()).unwrap(); + let mgr = TabletSnapManager::new(snap_dir.path(), None).unwrap(); let key = TabletSnapKey::new(1, 1, 1, 1); mgr.begin_snapshot(key.clone(), start - time::Duration::from_secs(2), 1); // filter out the snapshot that is not finished diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 81575b8cbf6..202307f7767 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -520,10 +520,11 @@ where .unwrap() .to_owned(); - let snap_mgr = match TabletSnapManager::new(&snap_path) { - Ok(mgr) => mgr, - Err(e) => fatal!("failed to create snapshot manager at {}: {}", snap_path, e), - }; + let snap_mgr = + match TabletSnapManager::new(&snap_path, self.core.encryption_key_manager.clone()) { + Ok(mgr) => mgr, + Err(e) => fatal!("failed to create snapshot manager at {}: {}", snap_path, e), + }; // Create coprocessor endpoint. 
let cop_read_pool_handle = if self.core.config.readpool.coprocessor.use_unified_pool() { @@ -758,6 +759,7 @@ where raft_store, &state, importer.clone(), + self.core.encryption_key_manager.clone(), ) .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index c770a6144bd..ffa38b51796 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -231,7 +231,10 @@ impl Simulator for NodeCluster { { let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); let snap_path = tmp.path().to_str().unwrap().to_owned(); - (TabletSnapManager::new(snap_path)?, Some(tmp)) + ( + TabletSnapManager::new(snap_path, key_manager.clone())?, + Some(tmp), + ) } else { let trans = self.trans.core.lock().unwrap(); let &(ref snap_mgr, _) = &trans.snap_paths[&node_id]; @@ -273,7 +276,13 @@ impl Simulator for NodeCluster { let importer = { let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); Arc::new( - SstImporter::new(&cfg.import, dir, key_manager, cfg.storage.api_version()).unwrap(), + SstImporter::new( + &cfg.import, + dir, + key_manager.clone(), + cfg.storage.api_version(), + ) + .unwrap(), ) }; @@ -295,6 +304,7 @@ impl Simulator for NodeCluster { Arc::new(VersionTrack::new(raft_store)), &state, importer, + key_manager, )?; assert!( raft_engine diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index f110578784f..9bdd8568418 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -328,7 +328,10 @@ impl ServerCluster { let (snap_mgr, snap_mgs_path) = if !self.snap_mgrs.contains_key(&node_id) { let tmp = test_util::temp_dir("test_cluster", cfg.prefer_mem); let snap_path = tmp.path().to_str().unwrap().to_owned(); - (TabletSnapManager::new(snap_path)?, Some(tmp)) + ( + TabletSnapManager::new(snap_path, key_manager.clone())?, 
+ Some(tmp), + ) } else { (self.snap_mgrs[&node_id].clone(), None) }; @@ -475,7 +478,13 @@ impl ServerCluster { let importer = { let dir = Path::new(raft_engine.get_engine_path()).join("../import-sst"); Arc::new( - SstImporter::new(&cfg.import, dir, key_manager, cfg.storage.api_version()).unwrap(), + SstImporter::new( + &cfg.import, + dir, + key_manager.clone(), + cfg.storage.api_version(), + ) + .unwrap(), ) }; let import_service = ImportSstService::new( @@ -598,6 +607,7 @@ impl ServerCluster { Arc::new(VersionTrack::new(raft_store)), &state, importer, + key_manager, )?; assert!(node_id == 0 || node_id == node.id()); let node_id = node.id(); diff --git a/components/test_util/src/encryption.rs b/components/test_util/src/encryption.rs index ba6ab56cc52..e09c0ce7cbb 100644 --- a/components/test_util/src/encryption.rs +++ b/components/test_util/src/encryption.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fs::File, io::Write, time::Duration}; +use std::{fs::File, io::Write, path::Path, time::Duration}; use encryption_export::{ create_backend, DataKeyManager, DataKeyManagerArgs, EncryptionConfig, FileConfig, @@ -15,15 +15,15 @@ pub fn create_test_key_file(path: &str) { .unwrap(); } -fn new_test_file_master_key(tmp: &tempfile::TempDir) -> MasterKeyConfig { - let key_path = tmp.path().join("test_key").to_str().unwrap().to_owned(); +fn new_test_file_master_key(tmp: &Path) -> MasterKeyConfig { + let key_path = tmp.join("test_key").to_str().unwrap().to_owned(); create_test_key_file(&key_path); MasterKeyConfig::File { config: FileConfig { path: key_path }, } } -pub fn new_file_security_config(dir: &tempfile::TempDir) -> EncryptionConfig { +pub fn new_file_security_config(dir: &Path) -> EncryptionConfig { let master_key_cfg = new_test_file_master_key(dir); EncryptionConfig { data_encryption_method: EncryptionMethod::Aes256Ctr, @@ -41,7 +41,7 @@ pub fn new_test_key_manager( master_key: Option, previous_master_key: Option, ) -> 
Result> { - let default_config = new_test_file_master_key(tmp_dir); + let default_config = new_test_file_master_key(tmp_dir.path()); let master_key = master_key.unwrap_or_else(|| default_config.clone()); let previous_master_key = previous_master_key.unwrap_or(default_config); DataKeyManager::new( @@ -52,7 +52,7 @@ pub fn new_test_key_manager( rotation_period: Duration::from_secs(60), enable_file_dictionary_log: true, file_dictionary_rewrite_threshold: 2, - dict_path: tmp_dir.path().as_os_str().to_str().unwrap().to_string(), + dict_path: tmp_dir.path().to_str().unwrap().to_string(), }, ) } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index c3976b8eeac..bf70a63acdb 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -2,6 +2,7 @@ use std::{path::Path, sync::Arc}; +use encryption_export::DataKeyManager; use engine_rocks::{ raw::{Cache, Env}, util::RangeCompactionFilterFactory, @@ -28,6 +29,7 @@ struct FactoryInner { api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, + encryption_key_manager: Option>, db_resources: DbResources, cf_resources: CfResources, state_storage: Option>, @@ -48,6 +50,7 @@ impl KvEngineFactoryBuilder { api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, + encryption_key_manager: None, db_resources: config.rocksdb.build_resources(env), cf_resources: config.rocksdb.build_cf_resources(cache), state_storage: None, @@ -80,6 +83,11 @@ impl KvEngineFactoryBuilder { self } + pub fn encryption_key_manager(mut self, m: Option>) -> Self { + self.inner.encryption_key_manager = m; + self + } + /// Set whether enable lite mode. /// /// In lite mode, most listener/filters will not be installed. @@ -233,7 +241,9 @@ impl TabletFactory for KvEngineFactory { // kv_db_opts, // kv_cfs_opts, // )?; - let _ = file_system::trash_dir_all(path); + // TODO: use RocksDB::DestroyDB. 
+ let _ = + encryption_export::trash_dir_all(path, self.inner.encryption_key_manager.as_deref()); if let Some(listener) = &self.inner.flow_listener { listener.clone_with(ctx.id).on_destroyed(); } diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index b9cc956d40e..f95e4a89848 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -4,6 +4,7 @@ use std::sync::{Arc, Mutex}; use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; +use encryption_export::DataKeyManager; use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; @@ -104,6 +105,7 @@ where store_cfg: Arc>, state: &Mutex, sst_importer: Arc, + key_manager: Option>, ) -> Result<()> where T: Transport + 'static, @@ -143,6 +145,7 @@ where pd_worker, store_cfg, sst_importer, + key_manager, )?; Ok(()) @@ -205,6 +208,7 @@ where pd_worker: LazyWorker, store_cfg: Arc>, sst_importer: Arc, + key_manager: Option>, ) -> Result<()> where T: Transport + 'static, @@ -237,6 +241,7 @@ where background, pd_worker, sst_importer, + key_manager, )?; Ok(()) } diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 07a85109006..f1044031d9f 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -245,6 +245,7 @@ pub(crate) async fn cleanup_cache( let entry = entry?; let ft = entry.file_type()?; if ft.is_dir() { + // TODO(tabokie) fs::remove_dir_all(entry.path())?; continue; } @@ -406,6 +407,7 @@ pub(crate) async fn recv_snap_files<'a>( let path = snap_mgr.tmp_recv_path(&context.key); info!("begin to receive tablet snapshot files"; "file" => %path.display(), "region_id" => region_id); if path.exists() { + // TODO(tabokie) fs::remove_dir_all(&path)?; } let (reused, missing_ssts) = if context.use_cache { @@ -422,6 +424,7 @@ pub(crate) async fn recv_snap_files<'a>( let received = accept_missing(&path, missing_ssts, &mut stream, 
&limiter).await?; info!("received all tablet snapshot file"; "snap_key" => %context.key, "region_id" => region_id, "received" => received, "reused" => reused); let final_path = snap_mgr.final_recv_path(&context.key); + // TODO(tabokie) fs::rename(&path, final_path)?; Ok(context) } diff --git a/tests/failpoints/cases/test_encryption.rs b/tests/failpoints/cases/test_encryption.rs index 8b73188e569..eba0a515893 100644 --- a/tests/failpoints/cases/test_encryption.rs +++ b/tests/failpoints/cases/test_encryption.rs @@ -19,9 +19,9 @@ fn test_file_dict_file_record_corrupted() { // Crc32 (4 bytes) + File name length (2 bytes) + FileInfo length (2 bytes) + // Log type (1 bytes) fail::cfg("file_dict_log_append_incomplete", "return(9)").unwrap(); - file_dict_file.insert("info1", &info1).unwrap(); + file_dict_file.insert("info1", &info1, true).unwrap(); fail::remove("file_dict_log_append_incomplete"); - file_dict_file.insert("info2", &info2).unwrap(); + file_dict_file.insert("info2", &info2, true).unwrap(); // Intermediate record damage is not allowed. file_dict_file.recovery().unwrap_err(); @@ -34,9 +34,9 @@ fn test_file_dict_file_record_corrupted() { .unwrap(); let info1 = create_file_info(1, EncryptionMethod::Aes256Ctr); let info2 = create_file_info(2, EncryptionMethod::Unknown); - file_dict_file.insert("info1", &info1).unwrap(); + file_dict_file.insert("info1", &info1, true).unwrap(); fail::cfg("file_dict_log_append_incomplete", "return(9)").unwrap(); - file_dict_file.insert("info2", &info2).unwrap(); + file_dict_file.insert("info2", &info2, true).unwrap(); fail::remove("file_dict_log_append_incomplete"); // The ending record can be discarded. 
let file_dict = file_dict_file.recovery().unwrap(); diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index e757e7685ba..e6e2121a479 100644 --- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -81,7 +81,7 @@ pub fn new_cluster_and_tikv_import_client_tde() -> ( ImportSstClient, ) { let tmp_dir = tempfile::TempDir::new().unwrap(); - let encryption_cfg = test_util::new_file_security_config(&tmp_dir); + let encryption_cfg = test_util::new_file_security_config(tmp_dir.path()); let mut security = test_util::new_security_cfg(None); security.encryption = encryption_cfg; let mut config = TikvConfig::default(); From ac7f14819744c714de1b21952b3efddab883006c Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 20 Apr 2023 20:01:20 +0800 Subject: [PATCH 654/676] raftstore-v2: adapt backup stream for raftstore-v2 (#14589) ref tikv/tikv#14614 Signed-off-by: 3pointer Co-authored-by: Ti Chi Robot --- components/backup-stream/src/endpoint.rs | 66 +++++++++++------ components/backup-stream/src/event_loader.rs | 48 ++++++------- components/backup-stream/src/lib.rs | 4 +- .../backup-stream/src/subscription_manager.rs | 26 +++---- components/backup-stream/tests/mod.rs | 26 ++++--- components/cdc/src/endpoint.rs | 6 +- components/resolved_ts/src/advance.rs | 71 +++++++++---------- components/server/src/server.rs | 28 +++++--- components/server/src/server2.rs | 65 ++++++++++++++++- 9 files changed, 218 insertions(+), 122 deletions(-) diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 45d132b001b..c88b36da8db 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -8,7 +8,6 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use error_code::ErrorCodeExt; use futures::FutureExt; -use grpcio::Environment; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, @@ -16,11 +15,9 @@ use 
kvproto::{ use pd_client::PdClient; use raftstore::{ coprocessor::{CmdBatch, ObserveHandle, RegionInfoProvider}, - router::RaftStoreRouter, - store::RegionReadProgressRegistry, + router::CdcHandle, }; -use resolved_ts::LeadershipResolver; -use security::SecurityManager; +use resolved_ts::{resolve_by_raft, LeadershipResolver}; use tikv::config::BackupStreamConfig; use tikv_util::{ box_err, @@ -100,7 +97,7 @@ impl Endpoint where R: RegionInfoProvider + 'static + Clone, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, S: MetaStore + 'static, { @@ -114,10 +111,7 @@ where router: RT, pd_client: Arc, concurrency_manager: ConcurrencyManager, - // Required by Leadership Resolver. - env: Arc, - region_read_progress: RegionReadProgressRegistry, - security_mgr: Arc, + resolver: BackupStreamResolver, ) -> Self { crate::metrics::STREAM_ENABLED.inc(); let pool = create_tokio_runtime((config.num_threads / 2).max(1), "backup-stream") @@ -154,14 +148,7 @@ where let initial_scan_throughput_quota = Limiter::new(limit); info!("the endpoint of stream backup started"; "path" => %config.temp_path); let subs = SubscriptionTracer::default(); - let leadership_resolver = LeadershipResolver::new( - store_id, - Arc::clone(&pd_client) as _, - env, - security_mgr, - region_read_progress, - Duration::from_secs(60), - ); + let (region_operator, op_loop) = RegionSubscriptionManager::start( InitialDataLoader::new( router.clone(), @@ -177,7 +164,7 @@ where meta_client.clone(), pd_client.clone(), ((config.num_threads + 1) / 2).max(1), - leadership_resolver, + resolver, ); pool.spawn(op_loop); let mut checkpoint_mgr = CheckpointManager::default(); @@ -212,7 +199,7 @@ where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, { fn get_meta_client(&self) -> MetadataClient { @@ -1049,6 +1036,29 @@ fn create_tokio_runtime(thread_count: usize, 
thread_name: &str) -> TokioResult { + // for raftstore-v1, we use LeadershipResolver to check leadership of a region. + V1(LeadershipResolver), + // for raftstore-v2, it has less regions. we use CDCHandler to check leadership of a region. + V2(RT, PhantomData), +} + +impl BackupStreamResolver +where + RT: CdcHandle + 'static, + EK: KvEngine, +{ + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + match self { + BackupStreamResolver::V1(x) => x.resolve(regions, min_ts).await, + BackupStreamResolver::V2(x, _) => { + let x = x.clone(); + resolve_by_raft(regions, min_ts, x).await + } + } + } +} + #[derive(Debug)] pub enum RegionSet { /// The universal set. @@ -1282,7 +1292,7 @@ where S: MetaStore + 'static, R: RegionInfoProvider + Clone + 'static, E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, PDC: PdClient + 'static, { type Task = Task; @@ -1295,7 +1305,9 @@ where #[cfg(test)] mod test { use engine_rocks::RocksEngine; - use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; + use raftstore::{ + coprocessor::region_info_accessor::MockRegionInfoProvider, router::CdcRaftRouter, + }; use test_raftstore::MockRaftStoreRouter; use tikv_util::worker::dummy_scheduler; @@ -1311,7 +1323,15 @@ mod test { cli.insert_task_with_range(&task, &[]).await.unwrap(); fail::cfg("failed_to_get_tasks", "1*return").unwrap(); - Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockRaftStoreRouter, MockPdClient>::start_and_watch_tasks(cli, sched).await.unwrap(); + Endpoint::< + _, + MockRegionInfoProvider, + RocksEngine, + CdcRaftRouter, + MockPdClient, + >::start_and_watch_tasks(cli, sched) + .await + .unwrap(); fail::remove("failed_to_get_tasks"); let _t1 = rx.recv().unwrap(); diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 6c825bf30c5..1b663c0e982 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ 
-7,8 +7,8 @@ use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::CmdType}; use raftstore::{ coprocessor::{ObserveHandle, RegionInfoProvider}, - router::RaftStoreRouter, - store::{fsm::ChangeObserver, Callback, SignificantMsg}, + router::CdcHandle, + store::{fsm::ChangeObserver, Callback}, }; use tikv::storage::{ kv::StatisticsSummary, @@ -200,7 +200,7 @@ impl InitialDataLoader where E: KvEngine, R: RegionInfoProvider + Clone + 'static, - RT: RaftStoreRouter, + RT: CdcHandle, { pub fn new( router: RT, @@ -288,33 +288,33 @@ where let (callback, fut) = tikv_util::future::paired_future_callback::>(); + self.router - .significant_send( - region.id, - SignificantMsg::CaptureChange { - cmd, - region_epoch: region.get_region_epoch().clone(), - callback: Callback::read(Box::new(|snapshot| { - if snapshot.response.get_header().has_error() { - callback(Err(Error::RaftRequest( - snapshot.response.get_header().get_error().clone(), - ))); - return; - } - if let Some(snap) = snapshot.snapshot { - callback(Ok(snap)); - return; - } - callback(Err(Error::Other(box_err!( - "PROBABLY BUG: the response contains neither error nor snapshot" - )))) - })), - }, + .capture_change( + region.get_id(), + region.get_region_epoch().clone(), + cmd, + Callback::read(Box::new(|snapshot| { + if snapshot.response.get_header().has_error() { + callback(Err(Error::RaftRequest( + snapshot.response.get_header().get_error().clone(), + ))); + return; + } + if let Some(snap) = snapshot.snapshot { + callback(Ok(snap)); + return; + } + callback(Err(Error::Other(box_err!( + "PROBABLY BUG: the response contains neither error nor snapshot" + )))) + })), ) .context(format_args!( "failed to register the observer to region {}", region.get_id() ))?; + let snap = block_on(fut) .map_err(|err| { annotate!( diff --git a/components/backup-stream/src/lib.rs b/components/backup-stream/src/lib.rs index a36b42c227d..ac7ab1f718f 100644 --- a/components/backup-stream/src/lib.rs +++ 
b/components/backup-stream/src/lib.rs @@ -21,5 +21,7 @@ mod subscription_track; pub mod utils; pub use checkpoint_manager::GetCheckpointResult; -pub use endpoint::{Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task}; +pub use endpoint::{ + BackupStreamResolver, Endpoint, ObserveOp, RegionCheckpointOperation, RegionSet, Task, +}; pub use service::Service; diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 316f0d9fb53..bf1a5552f71 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -18,10 +18,9 @@ use pd_client::PdClient; use raft::StateRole; use raftstore::{ coprocessor::{ObserveHandle, RegionInfoProvider}, - router::RaftStoreRouter, + router::CdcHandle, store::fsm::ChangeObserver, }; -use resolved_ts::LeadershipResolver; use tikv::storage::Statistics; use tikv_util::{box_err, debug, info, time::Instant, warn, worker::Scheduler}; use tokio::sync::mpsc::{channel, Receiver, Sender}; @@ -30,7 +29,7 @@ use yatp::task::callback::Handle as YatpHandle; use crate::{ annotate, - endpoint::ObserveOp, + endpoint::{BackupStreamResolver, ObserveOp}, errors::{Error, Result}, event_loader::InitialDataLoader, future, @@ -144,7 +143,7 @@ impl InitialScan for InitialDataLoader where E: KvEngine, R: RegionInfoProvider + Clone + 'static, - RT: RaftStoreRouter, + RT: CdcHandle, { fn do_initial_scan( &self, @@ -376,11 +375,11 @@ where meta_cli: MetadataClient, pd_client: Arc, scan_pool_size: usize, - leader_checker: LeadershipResolver, + resolver: BackupStreamResolver, ) -> (Self, future![()]) where E: KvEngine, - RT: RaftStoreRouter + 'static, + RT: CdcHandle + 'static, { let (tx, rx) = channel(MESSAGE_BUFFER_SIZE); let scan_pool_handle = spawn_executors(initial_loader.clone(), scan_pool_size); @@ -396,7 +395,7 @@ where scan_pool_handle: Arc::new(scan_pool_handle), scans: CallbackWaitGroup::new(), }; - let fut = 
op.clone().region_operator_loop(rx, leader_checker); + let fut = op.clone().region_operator_loop(rx, resolver); (op, fut) } @@ -416,11 +415,14 @@ where } /// the handler loop. - async fn region_operator_loop( + async fn region_operator_loop( self, mut message_box: Receiver, - mut leader_checker: LeadershipResolver, - ) { + mut resolver: BackupStreamResolver, + ) where + E: KvEngine, + RT: CdcHandle + 'static, + { while let Some(op) = message_box.recv().await { // Skip some trivial resolve commands. if !matches!(op, ObserveOp::ResolveRegions { .. }) { @@ -487,9 +489,7 @@ where warn!("waiting for initial scanning done timed out, forcing progress!"; "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } - let regions = leader_checker - .resolve(self.subs.current_regions(), min_ts) - .await; + let regions = resolver.resolve(self.subs.current_regions(), min_ts).await; let cps = self.subs.resolve_with(min_ts, regions); let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 7b2fe88b8a1..9dc38e36320 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -19,7 +19,8 @@ use backup_stream::{ }, observer::BackupStreamObserver, router::Router, - utils, Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, + utils, BackupStreamResolver, Endpoint, GetCheckpointResult, RegionCheckpointOperation, + RegionSet, Service, Task, }; use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt}; use grpcio::{ChannelBuilder, Server, ServerBuilder}; @@ -32,6 +33,8 @@ use kvproto::{ }; use pd_client::PdClient; use protobuf::parse_from_bytes; +use raftstore::router::CdcRaftRouter; +use resolved_ts::LeadershipResolver; use tempdir::TempDir; use test_raftstore::{new_server_cluster, Cluster, ServerCluster}; use 
test_util::retry; @@ -335,11 +338,24 @@ impl Suite { let worker = self.endpoints.get_mut(&id).unwrap(); let sim = cluster.sim.wl(); let raft_router = sim.get_server_router(id); + let raft_router = CdcRaftRouter(raft_router); let cm = sim.get_concurrency_manager(id); let regions = sim.region_info_accessors.get(&id).unwrap().clone(); let ob = self.obs.get(&id).unwrap().clone(); cfg.enable = true; cfg.temp_path = format!("/{}/{}", self.temp_files.path().display(), id); + let resolver = LeadershipResolver::new( + id, + cluster.pd_client.clone(), + Arc::clone(&self.env), + Arc::clone(&sim.security_mgr), + cluster.store_metas[&id] + .lock() + .unwrap() + .region_read_progress + .clone(), + Duration::from_secs(60), + ); let endpoint = Endpoint::new( id, self.meta_store.clone(), @@ -350,13 +366,7 @@ impl Suite { raft_router, cluster.pd_client.clone(), cm, - Arc::clone(&self.env), - cluster.store_metas[&id] - .lock() - .unwrap() - .region_read_progress - .clone(), - Arc::clone(&sim.security_mgr), + BackupStreamResolver::V1(resolver), ); worker.start(endpoint); } diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index dfeb4f78045..fd4580d4aea 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -33,7 +33,7 @@ use raftstore::{ router::CdcHandle, store::fsm::{store::StoreRegionMeta, ChangeObserver}, }; -use resolved_ts::{LeadershipResolver, Resolver}; +use resolved_ts::{resolve_by_raft, LeadershipResolver, Resolver}; use security::SecurityManager; use tikv::{ config::CdcConfig, @@ -1079,9 +1079,7 @@ impl, E: KvEngine, S: StoreRegionMeta> Endpoint( - &self, - regions: Vec, - min_ts: TimeStamp, - cdc_handle: T, - ) -> Vec - where - T: 'static + CdcHandle, - E: KvEngine, - { - let mut reqs = Vec::with_capacity(regions.len()); - for region_id in regions { - let cdc_handle_clone = cdc_handle.clone(); - let req = async move { - let (tx, rx) = tokio::sync::oneshot::channel(); - let callback = Callback::read(Box::new(move 
|resp| { - let resp = if resp.response.get_header().has_error() { - None - } else { - Some(region_id) - }; - if tx.send(resp).is_err() { - error!("cdc send tso response failed"; "region_id" => region_id); - } - })); - if let Err(e) = cdc_handle_clone.check_leadership(region_id, callback) { - warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); - return None; - } - rx.await.unwrap_or(None) - }; - reqs.push(req); - } - - let resps = futures::future::join_all(reqs).await; - resps.into_iter().flatten().collect::>() - } - // Confirms leadership of region peer before trying to advance resolved ts. // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its @@ -454,6 +416,39 @@ impl LeadershipResolver { } } +pub async fn resolve_by_raft(regions: Vec, min_ts: TimeStamp, cdc_handle: T) -> Vec +where + T: 'static + CdcHandle, + E: KvEngine, +{ + let mut reqs = Vec::with_capacity(regions.len()); + for region_id in regions { + let cdc_handle_clone = cdc_handle.clone(); + let req = async move { + let (tx, rx) = tokio::sync::oneshot::channel(); + let callback = Callback::read(Box::new(move |resp| { + let resp = if resp.response.get_header().has_error() { + None + } else { + Some(region_id) + }; + if tx.send(resp).is_err() { + error!("cdc send tso response failed"; "region_id" => region_id); + } + })); + if let Err(e) = cdc_handle_clone.check_leadership(region_id, callback) { + warn!("cdc send LeaderCallback failed"; "err" => ?e, "min_ts" => min_ts); + return None; + } + rx.await.unwrap_or(None) + }; + reqs.push(req); + } + + let resps = futures::future::join_all(reqs).await; + resps.into_iter().flatten().collect::>() +} + fn region_has_quorum(peers: &[Peer], stores: &[u64]) -> bool { let mut voters = 0; let mut incoming_voters = 0; diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 890089a6950..ec3468c6c68 100644 --- 
a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -25,6 +25,7 @@ use std::{ use api_version::{dispatch_api_version, KvFormat}; use backup_stream::{ config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, + BackupStreamResolver, }; use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; @@ -68,6 +69,7 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; +use resolved_ts::LeadershipResolver; use resource_control::{ ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, }; @@ -786,6 +788,21 @@ where )), ); + let region_read_progress = engines + .store_meta + .lock() + .unwrap() + .region_read_progress + .clone(); + let leadership_resolver = LeadershipResolver::new( + node.id(), + self.pd_client.clone(), + self.env.clone(), + self.security_mgr.clone(), + region_read_progress, + Duration::from_secs(60), + ); + let backup_stream_endpoint = backup_stream::Endpoint::new( node.id(), PdStore::new(Checked::new(Sourced::new( @@ -796,17 +813,10 @@ where backup_stream_scheduler.clone(), backup_stream_ob, self.region_info_accessor.clone(), - self.router.clone(), + CdcRaftRouter(self.router.clone()), self.pd_client.clone(), self.concurrency_manager.clone(), - Arc::clone(&self.env), - engines - .store_meta - .lock() - .unwrap() - .region_read_progress - .clone(), - Arc::clone(&self.security_mgr), + BackupStreamResolver::V1(leadership_resolver), ); backup_stream_worker.start(backup_stream_endpoint); self.core.to_stop.push(backup_stream_worker); diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 202307f7767..4d1a9f2daf6 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -14,6 +14,7 @@ use std::{ cmp, collections::HashMap, + marker::PhantomData, path::{Path, PathBuf}, str::FromStr, sync::{atomic::AtomicU64, mpsc, Arc}, @@ -22,6 +23,10 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; 
+use backup_stream::{ + config::BackupStreamConfigManager, metadata::store::PdStore, observer::BackupStreamObserver, + BackupStreamResolver, +}; use causal_ts::CausalTsProviderImpl; use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; @@ -34,9 +39,12 @@ use grpcio_health::HealthService; use kvproto::{ brpb::create_backup, cdcpb_grpc::create_change_data, deadlock::create_deadlock, diagnosticspb::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, - resource_usage_agent::create_resource_metering_pub_sub, + logbackuppb::create_log_backup, resource_usage_agent::create_resource_metering_pub_sub, +}; +use pd_client::{ + meta_storage::{Checked, Sourced}, + PdClient, RpcClient, }; -use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; use raftstore::{ coprocessor::{ @@ -191,6 +199,7 @@ struct TikvServer { cdc_worker: Option>>, cdc_scheduler: Option>, cdc_memory_quota: Option, + backup_stream_scheduler: Option>, sst_worker: Option>>, quota_limiter: Arc, resource_manager: Option>, @@ -328,6 +337,7 @@ where cdc_worker: None, cdc_scheduler: None, cdc_memory_quota: None, + backup_stream_scheduler: None, sst_worker: None, quota_limiter, resource_manager, @@ -627,6 +637,46 @@ where self.core.to_stop.push(rts_worker); } + // Start backup stream + self.backup_stream_scheduler = if self.core.config.log_backup.enable { + // Create backup stream. + let mut backup_stream_worker = Box::new(LazyWorker::new("backup-stream")); + let backup_stream_scheduler = backup_stream_worker.scheduler(); + + // Register backup-stream observer. + let backup_stream_ob = BackupStreamObserver::new(backup_stream_scheduler.clone()); + backup_stream_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + // Register config manager. 
+ cfg_controller.register( + tikv::config::Module::BackupStream, + Box::new(BackupStreamConfigManager::new( + backup_stream_worker.scheduler(), + self.core.config.log_backup.clone(), + )), + ); + + let backup_stream_endpoint = backup_stream::Endpoint::new( + self.node.as_ref().unwrap().id(), + PdStore::new(Checked::new(Sourced::new( + Arc::clone(&self.pd_client), + pd_client::meta_storage::Source::LogBackup, + ))), + self.core.config.log_backup.clone(), + backup_stream_scheduler.clone(), + backup_stream_ob, + self.region_info_accessor.as_ref().unwrap().clone(), + self.router.clone().unwrap(), + self.pd_client.clone(), + self.concurrency_manager.clone(), + BackupStreamResolver::V2(self.router.clone().unwrap(), PhantomData), + ); + backup_stream_worker.start(backup_stream_endpoint); + self.core.to_stop.push(backup_stream_worker); + Some(backup_stream_scheduler) + } else { + None + }; + let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); self.core @@ -840,6 +890,17 @@ where fatal!("failed to register import service"); } + if let Some(sched) = self.backup_stream_scheduler.take() { + let pitr_service = backup_stream::Service::new(sched); + if servers + .server + .register_service(create_log_backup(pitr_service)) + .is_some() + { + fatal!("failed to register log backup service"); + } + } + self.cfg_controller .as_mut() .unwrap() From dcf7f055f4ae869849161bd55f76dace8997be70 Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 21 Apr 2023 10:05:20 +0800 Subject: [PATCH 655/676] raftstore-v2: report sending/recving count (#14617) close tikv/tikv#14581 store heartbeat will report sending/recving count to the pd . 
Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore-v2/src/operation/pd.rs | 4 +-- components/raftstore/src/store/snap.rs | 16 +++++++-- src/server/tablet_snap.rs | 21 +++++------- tests/failpoints/cases/test_snap.rs | 38 ++++++++++----------- 4 files changed, 42 insertions(+), 37 deletions(-) diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index f45cae390da..7ad82959fa8 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -46,8 +46,8 @@ impl Store { let snap_stats = ctx.snap_mgr.stats(); // todo: imple snapshot status report - stats.set_sending_snap_count(0); - stats.set_receiving_snap_count(0); + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); stats.set_snapshot_stats(snap_stats.stats.into()); STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 69d948e3ae4..eb407b8d2bf 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1999,6 +1999,8 @@ pub struct TabletSnapManager { key_manager: Option>, receiving: Arc>>, stats: Arc>>, + sending_count: Arc, + recving_count: Arc, } impl TabletSnapManager { @@ -2023,6 +2025,8 @@ impl TabletSnapManager { key_manager, receiving: Arc::default(), stats: Arc::default(), + sending_count: Arc::default(), + recving_count: Arc::default(), }) } @@ -2055,8 +2059,8 @@ impl TabletSnapManager { .filter(|stat| stat.get_total_duration_sec() > 1) .collect(); SnapStats { - sending_count: 0, - receiving_count: 0, + sending_count: self.sending_count.load(Ordering::SeqCst), + receiving_count: self.recving_count.load(Ordering::SeqCst), stats, } } @@ -2142,6 +2146,14 @@ impl TabletSnapManager { }) } + pub fn sending_count(&self) -> &Arc { + &self.sending_count + } + + pub fn recving_count(&self) -> &Arc { + 
&self.recving_count + } + #[inline] pub fn key_manager(&self) -> &Option> { &self.key_manager diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index f1044031d9f..8e5a3293909 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -25,10 +25,7 @@ use std::{ fs::{self, File}, io::{BorrowedBuf, Read, Seek, SeekFrom, Write}, path::Path, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, + sync::{atomic::Ordering, Arc}, time::Duration, }; @@ -355,6 +352,9 @@ pub(crate) async fn accept_missing( } // Now receive other files. loop { + fail_point!("receiving_snapshot_net_error", |_| { + Err(box_err!("failed to receive snapshot")) + }); let chunk = match stream.next().await { Some(Ok(mut req)) if req.has_chunk() => req.take_chunk(), Some(Ok(req)) if req.has_end() => { @@ -691,8 +691,6 @@ pub struct TabletRunner { raft_router: R, cfg_tracker: Tracker, cfg: Config, - sending_count: Arc, - recving_count: Arc, cache_builder: B, limiter: Limiter, } @@ -730,8 +728,6 @@ impl TabletRunner { security_mgr, cfg_tracker, cfg: config, - sending_count: Arc::new(AtomicUsize::new(0)), - recving_count: Arc::new(AtomicUsize::new(0)), cache_builder, limiter, }; @@ -776,7 +772,8 @@ where self.pool.spawn(sink.fail(status).map(|_| ())); } Task::RecvTablet { stream, sink } => { - let task_num = self.recving_count.load(Ordering::SeqCst); + let recving_count = self.snap_mgr.recving_count().clone(); + let task_num = recving_count.load(Ordering::SeqCst); if task_num >= self.cfg.concurrent_recv_snap_limit { warn!("too many recving snapshot tasks, ignore"); let status = RpcStatus::with_message( @@ -793,7 +790,6 @@ where let snap_mgr = self.snap_mgr.clone(); let raft_router = self.raft_router.clone(); - let recving_count = self.recving_count.clone(); recving_count.fetch_add(1, Ordering::SeqCst); let limiter = self.limiter.clone(); let cache_builder = self.cache_builder.clone(); @@ -810,8 +806,8 @@ where } Task::Send { addr, msg, cb } => { let region_id = 
msg.get_region_id(); - if self.sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit - { + let sending_count = self.snap_mgr.sending_count().clone(); + if sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit { let key = TabletSnapKey::from_region_snap( msg.get_region_id(), msg.get_to_peer().get_id(), @@ -830,7 +826,6 @@ where let env = Arc::clone(&self.env); let mgr = self.snap_mgr.clone(); let security_mgr = Arc::clone(&self.security_mgr); - let sending_count = Arc::clone(&self.sending_count); sending_count.fetch_add(1, Ordering::SeqCst); let limiter = self.limiter.clone(); let send_task = send_snap( diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 4ca18dcd716..a090ba8530c 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -641,33 +641,31 @@ fn test_snapshot_gc_after_failed() { cluster.sim.wl().clear_recv_filters(3); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_sending_fail_with_net_error() { - let mut cluster = new_server_cluster(1, 2); + let mut cluster = new_cluster(1, 2); configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.snap_gc_timeout = ReadableDuration::millis(300); - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer count check. + let pd_client = cluster.pd_client.clone(); + // Disable default max peer number check. 
pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); cluster.must_put(b"k1", b"v1"); let (send_tx, send_rx) = mpsc::sync_channel(1); // only send one MessageType::MsgSnapshot message - cluster.sim.wl().add_send_filter( - 1, - Box::new( - RegionPacketFilter::new(r1, 1) - .allow(1) - .direction(Direction::Send) - .msg_type(MessageType::MsgSnapshot) - .set_msg_callback(Arc::new(move |m: &RaftMessage| { - if m.get_message().get_msg_type() == MessageType::MsgSnapshot { - let _ = send_tx.send(()); - } - })), - ), - ); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 1) + .allow(1) + .direction(Direction::Send) + .msg_type(MessageType::MsgSnapshot) + .set_msg_callback(Arc::new(move |m: &RaftMessage| { + if m.get_message().get_msg_type() == MessageType::MsgSnapshot { + let _ = send_tx.send(()); + } + })), + )); // peer2 will interrupt in receiving snapshot fail::cfg("receiving_snapshot_net_error", "return()").unwrap(); @@ -678,8 +676,8 @@ fn test_sending_fail_with_net_error() { // need to wait receiver handle the snapshot request sleep_ms(100); - // peer2 will not become learner so ti will has k1 key and receiving count will - // zero + // peer2 can't receive any snapshot, so it doesn't have any key valuse. + // but the receiving_count should be zero if receiving snapshot is failed. 
let engine2 = cluster.get_engine(2); must_get_none(&engine2, b"k1"); assert_eq!(cluster.get_snap_mgr(2).stats().receiving_count, 0); From 2984dd18a36833084cbc65509717c64d81944873 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Fri, 21 Apr 2023 13:05:20 +0800 Subject: [PATCH 656/676] raftstore,server: add enable_v2_compatible_learner config (#14616) ref tikv/tikv#14579 raftstore,server: add enable_v2_compatible_learner config The new config is added to clean up hard code tiflash check Signed-off-by: Neil Shen --- components/raftstore/src/store/config.rs | 9 +++++ components/raftstore/src/store/mod.rs | 4 +-- components/raftstore/src/store/snap.rs | 33 +++++++++++++------ components/raftstore/src/store/worker/mod.rs | 2 -- .../raftstore/src/store/worker/region.rs | 4 +-- components/server/src/server.rs | 3 ++ components/test_raftstore/src/node.rs | 1 + components/test_raftstore/src/server.rs | 1 + src/config/mod.rs | 16 +++++++++ src/server/server.rs | 11 +------ src/server/snap.rs | 25 +++++++------- tests/integrations/config/dynamic/snap.rs | 1 - tests/integrations/config/mod.rs | 1 + tests/integrations/config/test-custom.toml | 1 + tests/integrations/raftstore/test_snap.rs | 11 +++---- 15 files changed, 77 insertions(+), 46 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 301f3cea0cc..aabf173e674 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -334,6 +334,14 @@ pub struct Config { #[online_config(hidden)] // Interval to check if need to request snapshot. pub check_request_snapshot_interval: ReadableDuration, + + /// Make raftstore v1 learners compatible with raftstore v2 by: + /// * Recving tablet snapshot from v2. + /// * Responsing GcPeerRequest from v2. 
+ #[doc(hidden)] + #[online_config(hidden)] + #[serde(alias = "enable-partitioned-raft-kv-compatible-learner")] + pub enable_v2_compatible_learner: bool, } impl Default for Config { @@ -449,6 +457,7 @@ impl Default for Config { check_peers_availability_interval: ReadableDuration::secs(30), // TODO: make its value reasonable check_request_snapshot_interval: ReadableDuration::minutes(1), + enable_v2_compatible_learner: false, } } } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index ed97c58ab86..c007b622ee1 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -83,7 +83,7 @@ pub use self::{ LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, - StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, ENGINE, - NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, TIFLASH, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, }; diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index eb407b8d2bf..ee488bbc5aa 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1389,7 +1389,7 @@ pub struct SnapManager { max_total_size: Arc, // only used to receive snapshot from v2 - tablet_snap_manager: TabletSnapManager, + tablet_snap_manager: Option, } impl Clone for SnapManager { @@ -1626,7 +1626,11 @@ impl SnapManager { /// NOTE: don't call it in raftstore thread. 
pub fn get_total_snap_size(&self) -> Result { let size_v1 = self.core.get_total_snap_size()?; - let size_v2 = self.tablet_snap_manager.total_snap_size().unwrap_or(0); + let size_v2 = self + .tablet_snap_manager + .as_ref() + .map(|s| s.total_snap_size().unwrap_or(0)) + .unwrap_or(0); Ok(size_v1 + size_v2) } @@ -1763,8 +1767,8 @@ impl SnapManager { self.core.delete_snapshot(key, snap, check_entry) } - pub fn tablet_snap_manager(&self) -> &TabletSnapManager { - &self.tablet_snap_manager + pub fn tablet_snap_manager(&self) -> Option<&TabletSnapManager> { + self.tablet_snap_manager.as_ref() } pub fn limiter(&self) -> &Limiter { @@ -1873,6 +1877,7 @@ pub struct SnapManagerBuilder { max_total_size: u64, max_per_file_size: u64, enable_multi_snapshot_files: bool, + enable_receive_tablet_snapshot: bool, key_manager: Option>, } @@ -1895,6 +1900,10 @@ impl SnapManagerBuilder { self.enable_multi_snapshot_files = enabled; self } + pub fn enable_receive_tablet_snapshot(mut self, enabled: bool) -> SnapManagerBuilder { + self.enable_receive_tablet_snapshot = enabled; + self + } #[must_use] pub fn encryption_key_manager(mut self, m: Option>) -> SnapManagerBuilder { self.key_manager = m; @@ -1915,7 +1924,11 @@ impl SnapManagerBuilder { assert!(!path.is_empty()); let mut path_v2 = path.clone(); path_v2.push_str("_v2"); - let tablet_snap_mgr = TabletSnapManager::new(&path_v2, self.key_manager.clone()).unwrap(); + let tablet_snap_manager = if self.enable_receive_tablet_snapshot { + Some(TabletSnapManager::new(&path_v2, self.key_manager.clone()).unwrap()) + } else { + None + }; let mut snapshot = SnapManager { core: SnapManagerCore { @@ -1931,7 +1944,7 @@ impl SnapManagerBuilder { stats: Default::default(), }, max_total_size: Arc::new(AtomicU64::new(max_total_size)), - tablet_snap_manager: tablet_snap_mgr, + tablet_snap_manager, }; snapshot.set_max_per_file_size(self.max_per_file_size); // set actual max_per_file_size snapshot @@ -3202,8 +3215,8 @@ pub mod tests { } #[test] - fn 
test_init() { - let builder = SnapManagerBuilder::default(); + fn test_init_enable_receive_tablet_snapshot() { + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); let snap_dir = Builder::new() .prefix("test_snap_path_does_not_exist") .tempdir() @@ -3217,7 +3230,7 @@ pub mod tests { path.push_str("_v2"); assert!(Path::new(&path).exists()); - let builder = SnapManagerBuilder::default(); + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); let snap_dir = Builder::new() .prefix("test_snap_path_exist") .tempdir() @@ -3230,7 +3243,7 @@ pub mod tests { path.push_str("_v2"); assert!(Path::new(&path).exists()); - let builder = SnapManagerBuilder::default(); + let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); let snap_dir = Builder::new() .prefix("test_tablet_snap_path_exist") .tempdir() diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index eddcfe1757a..ac23f4e58d5 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -16,8 +16,6 @@ mod split_check; mod split_config; mod split_controller; -pub use region::{ENGINE, TIFLASH}; - #[cfg(test)] pub use self::region::tests::make_raftstore_cfg as make_region_worker_raftstore_cfg; pub use self::{ diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index d6d9d0272d3..d889047a0f9 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -52,8 +52,8 @@ use crate::{ const CLEANUP_MAX_REGION_COUNT: usize = 64; -pub const TIFLASH: &str = "tiflash"; -pub const ENGINE: &str = "engine"; +const TIFLASH: &str = "tiflash"; +const ENGINE: &str = "engine"; /// Region related task #[derive(Debug)] diff --git a/components/server/src/server.rs b/components/server/src/server.rs index ec3468c6c68..625e8d8a31b 100644 --- 
a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -643,6 +643,9 @@ where .feature_gate() .can_enable(MULTI_FILES_SNAPSHOT_FEATURE), ) + .enable_receive_tablet_snapshot( + self.core.config.raft_store.enable_v2_compatible_learner, + ) .build(snap_path); // Create coprocessor endpoint. diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index c4c516fb7f9..75ab0064a17 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -273,6 +273,7 @@ impl Simulator for NodeCluster { .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) .enable_multi_snapshot_files(true) + .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner) .build(tmp.path().to_str().unwrap()); (snap_mgr, Some(tmp)) } else { diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 4c060cef2ce..da97b31ab3a 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -458,6 +458,7 @@ impl ServerCluster { .encryption_key_manager(key_manager) .max_per_file_size(cfg.raft_store.max_snapshot_file_raw_size.0) .enable_multi_snapshot_files(true) + .enable_receive_tablet_snapshot(cfg.raft_store.enable_v2_compatible_learner) .build(tmp_str); self.snap_mgrs.insert(node_id, snap_mgr.clone()); let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); diff --git a/src/config/mod.rs b/src/config/mod.rs index 2efe9ea4c9b..5d20b027c4e 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3291,6 +3291,15 @@ impl TikvConfig { if self.rocksdb.titan.enabled { return Err("partitioned-raft-kv doesn't support titan.".into()); } + + if self.raft_store.enable_v2_compatible_learner { + self.raft_store.enable_v2_compatible_learner = false; + warn!( + "raftstore.enable-partitioned-raft-kv-compatible-learner was true but \ + storage.engine was partitioned-raft-kv, no need to 
enable \ + enable-partitioned-raft-kv-compatible-learner, overwrite to false" + ); + } } self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; @@ -5442,6 +5451,13 @@ mod tests { cfg.storage.block_cache.capacity = Some(ReadableSize(system * 3 / 4)); cfg.validate().unwrap(); assert_eq!(cfg.memory_usage_limit.unwrap(), ReadableSize(system)); + + // Test raftstore.enable-partitioned-raft-kv-compatible-learner. + let mut cfg = TikvConfig::default(); + cfg.raft_store.enable_v2_compatible_learner = true; + cfg.storage.engine = EngineType::RaftKv2; + cfg.validate().unwrap(); + assert!(!cfg.raft_store.enable_v2_compatible_learner); } #[test] diff --git a/src/server/server.rs b/src/server/server.rs index b3db4b4b57f..45778835d29 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -13,7 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager, ENGINE, TIFLASH}; +use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager}; use resource_control::ResourceGroupManager; use security::SecurityManager; use tikv_util::{ @@ -71,7 +71,6 @@ pub struct Server { // For sending/receiving snapshots. snap_mgr: Either, snap_worker: LazyWorker, - tiflash_engine: bool, // Currently load statistics is done in the thread. 
stats_pool: Option, @@ -182,12 +181,6 @@ where let trans = ServerTransport::new(raft_client); health_service.set_serving_status("", ServingStatus::NotServing); - let tiflash_engine = cfg - .value() - .labels - .iter() - .any(|entry| entry.0 == ENGINE && entry.1 == TIFLASH); - let svr = Server { env: Arc::clone(&env), builder_or_server: Some(builder), @@ -203,7 +196,6 @@ where debug_thread_pool, health_service, timer: GLOBAL_TIMER_HANDLE.clone(), - tiflash_engine, }; Ok(svr) @@ -273,7 +265,6 @@ where self.raft_router.clone(), security_mgr, cfg, - self.tiflash_engine, ); self.snap_worker.start(snap_runner); } diff --git a/src/server/snap.rs b/src/server/snap.rs index 0512a75214a..00883094471 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -364,7 +364,6 @@ pub struct Runner { cfg: Config, sending_count: Arc, recving_count: Arc, - can_receive_tablet_snapshot: bool, } impl Runner { @@ -377,7 +376,6 @@ impl Runner { r: R, security_mgr: Arc, cfg: Arc>, - can_receive_tablet_snapshot: bool, ) -> Self { let cfg_tracker = cfg.clone().tracker("snap-sender".to_owned()); let config = cfg.value().clone(); @@ -397,7 +395,6 @@ impl Runner { cfg: config, sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), - can_receive_tablet_snapshot, }; snap_worker } @@ -467,14 +464,17 @@ impl Runnable for Runner { self.pool.spawn(task); } Task::RecvTablet { stream, sink } => { - if !self.can_receive_tablet_snapshot { - let status = RpcStatus::with_message( - RpcStatusCode::UNIMPLEMENTED, - "tablet snap is not supported".to_string(), - ); - self.pool.spawn(sink.fail(status).map(|_| ())); - return; - } + let tablet_snap_mgr = match self.snap_mgr.tablet_snap_manager() { + Some(s) => s.clone(), + None => { + let status = RpcStatus::with_message( + RpcStatusCode::UNIMPLEMENTED, + "tablet snap is not supported".to_string(), + ); + self.pool.spawn(sink.fail(status).map(|_| ())); + return; + } + }; if let Some(status) = self.receiving_busy() { 
self.pool.spawn(sink.fail(status)); @@ -483,7 +483,6 @@ impl Runnable for Runner { SNAP_TASK_COUNTER_STATIC.recv.inc(); - let snap_mgr = self.snap_mgr.tablet_snap_manager().clone(); let raft_router = self.raft_router.clone(); let recving_count = self.recving_count.clone(); recving_count.fetch_add(1, Ordering::SeqCst); @@ -492,7 +491,7 @@ impl Runnable for Runner { let result = crate::server::tablet_snap::recv_snap( stream, sink, - snap_mgr, + tablet_snap_mgr, raft_router, NoSnapshotCache, // do not use cache in v1 limiter, diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index fa1d6a6fe52..bb91d0d62eb 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -65,7 +65,6 @@ fn start_server( RaftRouterWrap::new(raft_router), security_mgr, Arc::clone(&server_config), - false, ); snap_worker.start(snap_runner); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 34b558f39c0..cd7680e8147 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -260,6 +260,7 @@ fn test_serde_custom_tikv_config() { check_request_snapshot_interval: ReadableDuration::minutes(1), slow_trend_unsensitive_cause: 10.0, slow_trend_unsensitive_result: 0.5, + enable_v2_compatible_learner: false, }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 28a30fcec04..e3940cc7067 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -226,6 +226,7 @@ report-region-buckets-tick-interval = "1234s" max-snapshot-file-raw-size = "10GB" unreachable-backoff = "111s" max-entry-cache-warmup-duration = "2s" +enable-partitioned-raft-kv-compatible-learner = false [coprocessor] split-region-on-table = false diff --git 
a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index 3171aaa1a9e..f9a124a4395 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -784,11 +784,7 @@ fn test_v1_receive_snap_from_v2() { let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); let mut cluster_v1_tikv = test_raftstore::new_server_cluster(1, 1); - cluster_v1 - .cfg - .server - .labels - .insert(String::from("engine"), String::from("tiflash")); + cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; cluster_v1.run(); cluster_v2.run(); @@ -839,7 +835,10 @@ fn test_v1_receive_snap_from_v2() { // The snapshot has been received by cluster v1, so check it's completeness let snap_mgr = cluster_v1.get_snap_mgr(1); - let path = snap_mgr.tablet_snap_manager().final_recv_path(&snap_key); + let path = snap_mgr + .tablet_snap_manager() + .unwrap() + .final_recv_path(&snap_key); let rocksdb = engine_rocks::util::new_engine_opt( path.as_path().to_str().unwrap(), RocksDbOptions::default(), From b11c299ff1f4f6051b6e534398e1f2aa1cc2fca1 Mon Sep 17 00:00:00 2001 From: Hangjie Mo Date: Fri, 21 Apr 2023 16:05:20 +0800 Subject: [PATCH 657/676] copr: fix extral physical table id when idx key < `MAX_OLD_ENCODED_VALUE_LEN` (#14618) close tikv/tikv#14619 fix a bug with `process_old_collation_kv` function. 
related with https://github.com/tikv/tikv/pull/11931, forget process `physical_table_id_column_cnt` in process_old_collation_kv function Signed-off-by: Jason Mo Co-authored-by: Ti Chi Robot --- .../src/index_scan_executor.rs | 86 +++++++++++++++++-- 1 file changed, 81 insertions(+), 5 deletions(-) diff --git a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index de59b843eb5..3a5c53a4d09 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -444,10 +444,12 @@ impl IndexScanExecutorImpl { Ok(()) } - // Process index values that are in old collation. - // NOTE: We should extract the index columns from the key first, and extract the - // handles from value if there is no handle in the key. Otherwise, extract the - // handles from the key. + // Process index values that are in old collation, when + // `new_collations_enabled_on_first_bootstrap` = true also will access this + // function. + // NOTE: We should extract the index columns from the key first, + // and extract the handles from value if there is no handle in the key. + // Otherwise, extract the handles from the key. fn process_old_collation_kv( &mut self, mut key_payload: &[u8], @@ -479,9 +481,11 @@ impl IndexScanExecutorImpl { } DecodeCommonHandle => { // Otherwise, if the handle is common handle, we extract it from the key. 
+ let end_index = + columns.columns_len() - self.pid_column_cnt - self.physical_table_id_column_cnt; Self::extract_columns_from_datum_format( &mut key_payload, - &mut columns[self.columns_id_without_handle.len()..], + &mut columns[self.columns_id_without_handle.len()..end_index], )?; } } @@ -3295,6 +3299,78 @@ mod tests { ); } + #[test] + fn test_common_handle_with_physical_table_id() { + // CREATE TABLE `tcommonhash` ( + // `a` int(11) NOT NULL, + // `b` int(11) DEFAULT NULL, + // `c` int(11) NOT NULL, + // `d` int(11) NOT NUL, + // PRIMARY KEY (`a`,`c`,`d`) /*T![clustered_index] CLUSTERED */, + // KEY `idx_bc` (`b`,`c`) + // ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin + // insert into tcommonhash values (1, 2, 3, 1); + + // idx_bc + let mut idx_exe = IndexScanExecutorImpl { + context: Default::default(), + schema: vec![ + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + FieldTypeTp::Long.into(), + // EXTRA_PHYSICAL_TABLE_ID_COL + FieldTypeTp::Long.into(), + ], + columns_id_without_handle: vec![2, 3], + columns_id_for_common_handle: vec![1, 3, 4], + decode_handle_strategy: DecodeHandleStrategy::DecodeCommonHandle, + pid_column_cnt: 0, + physical_table_id_column_cnt: 1, + index_version: -1, + }; + let mut columns = idx_exe.build_column_vec(10); + idx_exe + .process_kv_pair( + &[ + 0x74, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5c, 0x5f, 0x69, 0x80, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x2, 0x3, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x3, + 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x3, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x1, 0x3, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x3, 0x80, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x1, + ], + &[0x0, 0x7d, 0x1], + &mut columns, + ) + .unwrap(); + assert_eq!( + columns[0].raw().last().unwrap().read_datum().unwrap(), + Datum::I64(2) + ); + assert_eq!( + columns[1].raw().last().unwrap().read_datum().unwrap(), + Datum::I64(3) + ); + assert_eq!( + 
columns[2].raw().last().unwrap().read_datum().unwrap(), + Datum::I64(1) + ); + assert_eq!( + columns[3].raw().last().unwrap().read_datum().unwrap(), + Datum::I64(3) + ); + assert_eq!( + columns[4].raw().last().unwrap().read_datum().unwrap(), + Datum::I64(1) + ); + assert_eq!( + // physical table id + columns[5].mut_decoded().to_int_vec()[0].unwrap(), + 92 + ); + } + #[test] fn test_common_handle_index_latin1_bin() { use tidb_query_datatype::builder::FieldTypeBuilder; From 666edeedaef2b326a7b5f2f96fac2473fcdd08fd Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Fri, 21 Apr 2023 16:47:20 +0800 Subject: [PATCH 658/676] raftstore: enable raftstore-v1 apply tablet snapshot sent from raftstore-v2 (#14584) ref tikv/tikv#14579 enable raftstore-v1 apply tablet snapshot sent from raftstore-v2 Signed-off-by: Spade A Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/snap.rs | 76 ++++++++++ components/test_raftstore/src/server.rs | 13 ++ src/server/metrics.rs | 1 + src/server/snap.rs | 4 +- src/server/tablet_snap.rs | 33 ++++- tests/integrations/raftstore/test_snap.rs | 168 +++++++++++++++++++++- 7 files changed, 289 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7f2a1e91650..bda2a12187d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2797,7 +2797,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#dc3cd8784a19bc7f058dbeb19cd8cc4672ee9aad" +source = "git+https://github.com/pingcap/kvproto.git#10e7620a630db63d769503ba99c7389f19fb6516" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index ee488bbc5aa..12440abb5d0 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1013,6 +1013,28 @@ impl Snapshot { delete_file_if_exist(&self.meta_file.tmp_path).unwrap(); } } + + // This is 
only used for v2 compatibility. + fn new_for_tablet_snapshot>( + dir: T, + key: &SnapKey, + mgr: &SnapManagerCore, + tablet_snapshot_path: &str, + for_balance: bool, + ) -> RaftStoreResult { + let mut s = Self::new(dir, key, false, CheckPolicy::ErrNotAllowed, mgr)?; + s.init_for_building()?; + let mut meta = gen_snapshot_meta(&s.cf_files[..], for_balance)?; + meta.tablet_snap_path = tablet_snapshot_path.to_string(); + s.meta_file.meta = Some(meta); + s.save_meta_file()?; + Ok(s) + } + + #[cfg(any(test, feature = "testexport"))] + pub fn tablet_snap_path(&self) -> Option { + Some(self.meta_file.meta.as_ref()?.tablet_snap_path.clone()) + } } impl fmt::Debug for Snapshot { @@ -1606,6 +1628,38 @@ impl SnapManager { Ok(Box::new(f)) } + // Tablet snapshot is the snapshot sent from raftstore-v2. + // We enable v1 to receive it to enable tiflash node to receive and apply + // snapshot from raftstore-v2. + // To make it easy, we maintain an empty `store::snapshot` with tablet snapshot + // path storing in it. So tiflash node can detect it and apply properly. 
+ pub fn gen_empty_snapshot_for_tablet_snapshot( + &self, + tablet_snap_key: &TabletSnapKey, + for_balance: bool, + ) -> RaftStoreResult<()> { + let _lock = self.core.registry.rl(); + let base = &self.core.base; + let tablet_snap_path = self + .tablet_snap_manager + .as_ref() + .unwrap() + .final_recv_path(tablet_snap_key); + let snap_key = SnapKey::new( + tablet_snap_key.region_id, + tablet_snap_key.term, + tablet_snap_key.idx, + ); + let _ = Snapshot::new_for_tablet_snapshot( + base, + &snap_key, + &self.core, + tablet_snap_path.to_str().unwrap(), + for_balance, + )?; + Ok(()) + } + pub fn get_snapshot_for_applying(&self, key: &SnapKey) -> RaftStoreResult> { let _lock = self.core.registry.rl(); let base = &self.core.base; @@ -3214,6 +3268,28 @@ pub mod tests { } } + #[test] + fn test_generate_snap_for_tablet_snapshot() { + let snap_dir = Builder::new().prefix("test_snapshot").tempdir().unwrap(); + let snap_mgr = SnapManagerBuilder::default() + .enable_receive_tablet_snapshot(true) + .build(snap_dir.path().to_str().unwrap()); + snap_mgr.init().unwrap(); + let tablet_snap_key = TabletSnapKey::new(1, 2, 3, 4); + snap_mgr + .gen_empty_snapshot_for_tablet_snapshot(&tablet_snap_key, false) + .unwrap(); + + let snap_key = SnapKey::new(1, 3, 4); + let s = snap_mgr.get_snapshot_for_applying(&snap_key).unwrap(); + let expect_path = snap_mgr + .tablet_snap_manager() + .as_ref() + .unwrap() + .final_recv_path(&tablet_snap_key); + assert_eq!(expect_path.to_str().unwrap(), s.tablet_snap_path().unwrap()); + } + #[test] fn test_init_enable_receive_tablet_snapshot() { let builder = SnapManagerBuilder::default().enable_receive_tablet_snapshot(true); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index da97b31ab3a..a59dafd4504 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -815,6 +815,19 @@ impl Cluster { pub fn get_addr(&self, node_id: u64) -> String { 
self.sim.rl().get_addr(node_id) } + + pub fn register_hook( + &self, + node_id: u64, + register: Box)>, + ) { + self.sim + .wl() + .coprocessor_hooks + .entry(node_id) + .or_default() + .push(register); + } } pub fn new_server_cluster(id: u64, count: usize) -> Cluster { diff --git a/src/server/metrics.rs b/src/server/metrics.rs index d35c58cbf34..37c3ce1048f 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -77,6 +77,7 @@ make_auto_flush_static_metric! { pub label_enum SnapTask { send, recv, + recv_v2, } pub label_enum ResolveStore { diff --git a/src/server/snap.rs b/src/server/snap.rs index 00883094471..4324f17459e 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -481,12 +481,13 @@ impl Runnable for Runner { return; } - SNAP_TASK_COUNTER_STATIC.recv.inc(); + SNAP_TASK_COUNTER_STATIC.recv_v2.inc(); let raft_router = self.raft_router.clone(); let recving_count = self.recving_count.clone(); recving_count.fetch_add(1, Ordering::SeqCst); let limiter = self.snap_mgr.limiter().clone(); + let snap_mgr_v1 = self.snap_mgr.clone(); let task = async move { let result = crate::server::tablet_snap::recv_snap( stream, @@ -495,6 +496,7 @@ impl Runnable for Runner { raft_router, NoSnapshotCache, // do not use cache in v1 limiter, + Some(snap_mgr_v1), ) .await; recving_count.fetch_sub(1, Ordering::SeqCst); diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 8e5a3293909..cb7ec7c988a 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -50,7 +50,10 @@ use kvproto::{ tikvpb::TikvClient, }; use protobuf::Message; -use raftstore::store::snap::{ReceivingGuard, TabletSnapKey, TabletSnapManager}; +use raftstore::store::{ + snap::{ReceivingGuard, TabletSnapKey, TabletSnapManager}, + SnapManager, +}; use security::SecurityManager; use tikv_kv::RaftExtension; use tikv_util::{ @@ -426,6 +429,7 @@ pub(crate) async fn recv_snap_files<'a>( let final_path = snap_mgr.final_recv_path(&context.key); // TODO(tabokie) 
fs::rename(&path, final_path)?; + Ok(context) } @@ -436,12 +440,24 @@ pub(crate) async fn recv_snap( raft_router: R, cache_builder: impl SnapCacheBuilder, limiter: Limiter, + snap_mgr_v1: Option, ) -> Result<()> { let stream = stream.map_err(Error::from); let mut sink = sink; let res = recv_snap_files(&snap_mgr, cache_builder, stream, &mut sink, limiter) .await - .and_then(|context| context.finish(raft_router)); + .and_then(|context| { + // some means we are in raftstore-v1 config and received a tablet snapshot from + // raftstore-v2. Now, it can only happen in tiflash node within a raftstore-v2 + // cluster. + if let Some(snap_mgr_v1) = snap_mgr_v1 { + snap_mgr_v1.gen_empty_snapshot_for_tablet_snapshot( + &context.key, + context.io_type == IoType::LoadBalance, + )?; + } + context.finish(raft_router) + }); match res { Ok(()) => sink.close().await.map_err(Error::from), Err(e) => { @@ -794,9 +810,16 @@ where let limiter = self.limiter.clone(); let cache_builder = self.cache_builder.clone(); let task = async move { - let result = - recv_snap(stream, sink, snap_mgr, raft_router, cache_builder, limiter) - .await; + let result = recv_snap( + stream, + sink, + snap_mgr, + raft_router, + cache_builder, + limiter, + None, + ) + .await; recving_count.fetch_sub(1, Ordering::SeqCst); if let Err(e) = result { error!("failed to recv snapshot"; "err" => %e); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index f9a124a4395..f474b5cdb8e 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -11,6 +11,7 @@ use std::{ time::Duration, }; +use collections::HashMap; use engine_rocks::{RocksCfOptions, RocksDbOptions}; use engine_traits::{Checkpointer, KvEngine, Peekable, RaftEngineReadOnly, SyncMutable, LARGE_CFS}; use file_system::{IoOp, IoType}; @@ -19,6 +20,7 @@ use grpcio::Environment; use kvproto::raft_serverpb::*; use raft::eraftpb::{Message, MessageType, Snapshot}; use 
raftstore::{ + coprocessor::{ApplySnapshotObserver, BoxApplySnapshotObserver, Coprocessor, CoprocessorHost}, store::{snap::TABLET_SNAPSHOT_VERSION, *}, Result, }; @@ -747,10 +749,14 @@ fn generate_snap( let tablet = engine.get_tablet_by_id(region_id).unwrap(); let region_state = engine.region_local_state(region_id).unwrap().unwrap(); let apply_state = engine.raft_apply_state(region_id).unwrap().unwrap(); + let raft_state = engine.raft_local_state(region_id).unwrap().unwrap(); // Construct snapshot by hand let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().set_term(apply_state.commit_term); + // use commit term for simplicity + snapshot + .mut_metadata() + .set_term(raft_state.get_hard_state().term + 1); snapshot.mut_metadata().set_index(apply_state.applied_index); let conf_state = raftstore::store::util::conf_state_from_region(region_state.get_region()); snapshot.mut_metadata().set_conf_state(conf_state); @@ -771,6 +777,8 @@ fn generate_snap( msg.region_id = region_id; msg.set_to_peer(new_peer(1, 1)); msg.mut_message().set_snapshot(snapshot); + msg.mut_message() + .set_term(raft_state.get_hard_state().commit + 1); msg.mut_message().set_msg_type(MessageType::MsgSnapshot); msg.set_region_epoch(region_state.get_region().get_region_epoch().clone()); @@ -866,3 +874,161 @@ fn test_v1_receive_snap_from_v2() { // test large snapshot test_receive_snap(5000); } + +#[derive(Clone)] +struct MockApplySnapshotObserver { + tablet_snap_paths: Arc>>, +} + +impl Coprocessor for MockApplySnapshotObserver {} + +impl ApplySnapshotObserver for MockApplySnapshotObserver { + fn should_pre_apply_snapshot(&self) -> bool { + true + } + + fn pre_apply_snapshot( + &self, + _: &mut raftstore::coprocessor::ObserverContext<'_>, + peer_id: u64, + _: &raftstore::store::SnapKey, + snap: Option<&raftstore::store::Snapshot>, + ) { + let tablet_path = snap.unwrap().tablet_snap_path().as_ref().unwrap().clone(); + self.tablet_snap_paths + .lock() + .unwrap() + .insert(peer_id, (false, 
tablet_path)); + } + + fn post_apply_snapshot( + &self, + _: &mut raftstore::coprocessor::ObserverContext<'_>, + peer_id: u64, + _: &raftstore::store::SnapKey, + snap: Option<&raftstore::store::Snapshot>, + ) { + let tablet_path = snap.unwrap().tablet_snap_path().as_ref().unwrap().clone(); + match self.tablet_snap_paths.lock().unwrap().entry(peer_id) { + collections::HashMapEntry::Occupied(mut entry) => { + if entry.get_mut().1 == tablet_path { + entry.get_mut().0 = true; + } + } + collections::HashMapEntry::Vacant(_) => {} + } + } +} + +#[test] +fn test_v1_apply_snap_from_v2() { + let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); + let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); + cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + + let observer = MockApplySnapshotObserver { + tablet_snap_paths: Arc::default(), + }; + let observer_clone = observer.clone(); + cluster_v1.register_hook( + 1, + Box::new(move |host: &mut CoprocessorHost<_>| { + host.registry.register_apply_snapshot_observer( + 1, + BoxApplySnapshotObserver::new(observer_clone.clone()), + ); + }), + ); + + cluster_v1.run(); + cluster_v2.run(); + + let region = cluster_v2.get_region(b""); + cluster_v2.must_split(®ion, b"k0010"); + + let s1_addr = cluster_v1.get_addr(1); + let region_id = region.get_id(); + let engine = cluster_v2.get_engine(1); + + for i in 0..50 { + let k = format!("k{:04}", i); + cluster_v2.must_put(k.as_bytes(), b"val"); + } + cluster_v2.flush_data(); + + let tablet_snap_mgr = cluster_v2.get_snap_mgr(1); + let security_mgr = cluster_v2.get_security_mgr(); + let (msg, snap_key) = generate_snap(&engine, region_id, &tablet_snap_mgr); + let cfg = tikv::server::Config::default(); + let limit = Limiter::new(f64::INFINITY); + let env = Arc::new(Environment::new(1)); + let _ = block_on(async { + send_snap_v2( + env.clone(), + tablet_snap_mgr.clone(), + security_mgr.clone(), + &cfg, + &s1_addr, + msg, + limit.clone(), + ) + .unwrap() + .await + }); 
+ + let snap_mgr = cluster_v1.get_snap_mgr(region_id); + let path = snap_mgr + .tablet_snap_manager() + .as_ref() + .unwrap() + .final_recv_path(&snap_key); + let path_str = path.as_path().to_str().unwrap(); + + check_observer(&observer, region_id, path_str); + + let region = cluster_v2.get_region(b"k0011"); + let region_id = region.get_id(); + let (msg, snap_key) = generate_snap(&engine, region_id, &tablet_snap_mgr); + let _ = block_on(async { + send_snap_v2( + env, + tablet_snap_mgr, + security_mgr, + &cfg, + &s1_addr, + msg, + limit, + ) + .unwrap() + .await + }); + + let snap_mgr = cluster_v1.get_snap_mgr(region_id); + let path = snap_mgr + .tablet_snap_manager() + .as_ref() + .unwrap() + .final_recv_path(&snap_key); + let path_str = path.as_path().to_str().unwrap(); + + check_observer(&observer, region_id, path_str); +} + +fn check_observer(observer: &MockApplySnapshotObserver, region_id: u64, snap_path: &str) { + for _ in 0..10 { + if let Some(pair) = observer + .tablet_snap_paths + .as_ref() + .lock() + .unwrap() + .get(®ion_id) + { + if pair.0 && pair.1 == snap_path { + return; + } + } + std::thread::sleep(Duration::from_millis(200)); + } + + panic!("cannot find {:?} in observer", snap_path); +} From 20b75dc4436dc19f1d41acadde9704041dbb7c0c Mon Sep 17 00:00:00 2001 From: qupeng Date: Fri, 21 Apr 2023 19:09:20 +0800 Subject: [PATCH 659/676] raft: peers shouldn't hibernate incorrectly when one node fails (#14574) ref tikv/tikv#14547 raft: peers shouldn't hibernate incorrectly when one node fails Signed-off-by: qupeng Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/fsm/store.rs | 44 +++++++++----- components/raftstore/src/store/metrics.rs | 7 +++ components/test_raftstore-v2/src/server.rs | 18 ++++-- components/test_raftstore/src/server.rs | 18 ++++-- src/server/mod.rs | 2 +- src/server/raft_client.rs | 56 +++++++++++++----- src/server/server.rs | 2 +- src/server/service/kv.rs | 41 ++++++++++++- tests/failpoints/cases/test_hibernate.rs | 51 
++++++++++++++++ .../integrations/raftstore/test_tombstone.rs | 2 +- tests/integrations/server/raft_client.rs | 58 +------------------ 11 files changed, 198 insertions(+), 101 deletions(-) diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index f28c4170459..c64b2a53c37 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -678,7 +678,12 @@ struct Store { stopped: bool, start_time: Option, consistency_check_time: HashMap, - last_unreachable_report: HashMap, + store_reachability: HashMap, +} + +struct StoreReachability { + last_broadcast: Instant, + received_message_count: u64, } pub struct StoreFsm @@ -702,7 +707,7 @@ where stopped: false, start_time: None, consistency_check_time: HashMap::default(), - last_unreachable_report: HashMap::default(), + store_reachability: HashMap::default(), }, receiver: rx, }); @@ -2876,22 +2881,35 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_store_unreachable(&mut self, store_id: u64) { let now = Instant::now(); let unreachable_backoff = self.ctx.cfg.unreachable_backoff.0; - if self - .fsm - .store - .last_unreachable_report - .get(&store_id) - .map_or(unreachable_backoff, |t| now.saturating_duration_since(*t)) - < unreachable_backoff - { - return; - } + let new_messages = MESSAGE_RECV_BY_STORE + .with_label_values(&[&format!("{}", store_id)]) + .get(); + match self.fsm.store.store_reachability.entry(store_id) { + HashMapEntry::Vacant(x) => { + x.insert(StoreReachability { + last_broadcast: now, + received_message_count: new_messages, + }); + } + HashMapEntry::Occupied(x) => { + let ob = x.into_mut(); + if now.saturating_duration_since(ob.last_broadcast) < unreachable_backoff + // If there are no new messages come from `store_id`, it's not + // necessary to do redundant broadcasts. 
+ || (new_messages <= ob.received_message_count && new_messages > 0) + { + return; + } + ob.last_broadcast = now; + ob.received_message_count = new_messages; + } + }; + info!( "broadcasting unreachable"; "store_id" => self.fsm.store.id, "unreachable_store_id" => store_id, ); - self.fsm.store.last_unreachable_report.insert(store_id, now); // It's possible to acquire the lock and only send notification to // involved regions. However loop over all the regions can take a // lot of time, which may block other operations. diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 7df8819c998..c69875ae998 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -868,4 +868,11 @@ lazy_static! { "Total snapshot generate limit used", ) .unwrap(); + + pub static ref MESSAGE_RECV_BY_STORE: IntCounterVec = register_int_counter_vec!( + "tikv_raftstore_message_recv_by_store", + "Messages received by store", + &["store"] + ) + .unwrap(); } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 9bdd8568418..85941088e2e 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -259,7 +259,8 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, concurrency_managers: HashMap, env: Arc, pub pending_services: HashMap, @@ -291,7 +292,6 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -303,7 +303,8 @@ impl ServerCluster { snap_paths: HashMap::default(), pending_services: HashMap::default(), health_services: HashMap::default(), - raft_client, + raft_clients: HashMap::default(), + conn_builder, concurrency_managers: 
HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -650,6 +651,8 @@ impl ServerCluster { self.concurrency_managers .insert(node_id, concurrency_manager); + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); Ok(node_id) } @@ -763,6 +766,7 @@ impl Simulator for ServerCluster { (meta.rsmeter_cleanup)(); } self.storages.remove(&node_id); + let _ = self.raft_clients.remove(&node_id); } fn async_snapshot( @@ -800,8 +804,12 @@ impl Simulator for ServerCluster { } fn send_raft_msg(&mut self, msg: RaftMessage) -> raftstore::Result<()> { - self.raft_client.send(msg).unwrap(); - self.raft_client.flush(); + let from_store = msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(msg).unwrap(); + client.flush(); + } Ok(()) } diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index a59dafd4504..ec6cb0a235c 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -154,7 +154,8 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, concurrency_managers: HashMap, env: Arc, pub causal_ts_providers: HashMap>, @@ -182,7 +183,6 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -196,7 +196,8 @@ impl ServerCluster { pending_services: HashMap::default(), coprocessor_hooks: HashMap::default(), health_services: HashMap::default(), - raft_client, + raft_clients: HashMap::default(), + conn_builder, concurrency_managers: HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -645,6 +646,8 @@ impl ServerCluster { self.concurrency_managers .insert(node_id, 
concurrency_manager); + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); Ok(node_id) } } @@ -698,6 +701,7 @@ impl Simulator for ServerCluster { } (meta.rsmeter_cleanup)(); } + let _ = self.raft_clients.remove(&node_id); } fn get_node_ids(&self) -> HashSet { @@ -739,8 +743,12 @@ impl Simulator for ServerCluster { } fn send_raft_msg(&mut self, raft_msg: raft_serverpb::RaftMessage) -> Result<()> { - self.raft_client.send(raft_msg).unwrap(); - self.raft_client.flush(); + let from_store = raft_msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(raft_msg).unwrap(); + client.flush(); + } Ok(()) } diff --git a/src/server/mod.rs b/src/server/mod.rs index 773e2040f17..e432b3aa51b 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -34,7 +34,7 @@ pub use self::{ metrics::{CONFIG_ROCKSDB_GAUGE, CPU_CORES_QUOTA_GAUGE, MEM_TRACE_SUM_GAUGE}, node::Node, proxy::{build_forward_option, get_target_address, Proxy}, - raft_client::{ConnectionBuilder, RaftClient}, + raft_client::{ConnectionBuilder, MetadataSourceStoreId, RaftClient}, raftkv::RaftKv, raftkv2::{Extension, NodeV2, RaftKv2}, resolve::{PdStoreAddrResolver, StoreAddrResolver}, diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 17de1d3365d..f30e5b36045 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -25,8 +25,8 @@ use futures::{ }; use futures_timer::Delay; use grpcio::{ - Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, - RpcStatusCode, WriteFlags, + CallOption, Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, + MetadataBuilder, RpcStatusCode, WriteFlags, }; use kvproto::{ raft_serverpb::{Done, RaftMessage, RaftSnapshotData}, @@ -50,6 +50,21 @@ use crate::server::{ StoreAddrResolver, }; +pub struct MetadataSourceStoreId {} + +impl MetadataSourceStoreId { + pub 
const KEY: &str = "source_store_id"; + + pub fn parse(value: &[u8]) -> u64 { + let value = std::str::from_utf8(value).unwrap(); + value.parse::().unwrap() + } + + pub fn format(id: u64) -> String { + format!("{}", id) + } +} + static CONN_ID: AtomicI32 = AtomicI32::new(0); const _ON_RESOLVE_FP: &str = "transport_snapshot_on_resolve"; @@ -616,6 +631,7 @@ impl ConnectionBuilder { /// StreamBackEnd watches lifetime of a connection and handles reconnecting, /// spawn new RPC. struct StreamBackEnd { + self_store_id: u64, store_id: u64, queue: Arc, builder: ConnectionBuilder, @@ -697,7 +713,8 @@ where } fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { - let (batch_sink, batch_stream) = client.batch_raft().unwrap(); + let (batch_sink, batch_stream) = client.batch_raft_opt(self.get_call_option()).unwrap(); + let (tx, rx) = oneshot::channel(); let mut call = RaftCall { sender: AsyncRaftSender { @@ -721,7 +738,8 @@ where } fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { - let (sink, stream) = client.raft().unwrap(); + let (sink, stream) = client.raft_opt(self.get_call_option()).unwrap(); + let (tx, rx) = oneshot::channel(); let mut call = RaftCall { sender: AsyncRaftSender { @@ -742,6 +760,15 @@ where }); rx } + + fn get_call_option(&self) -> CallOption { + let mut metadata = MetadataBuilder::with_capacity(1); + let value = MetadataSourceStoreId::format(self.self_store_id); + metadata + .add_str(MetadataSourceStoreId::KEY, &value) + .unwrap(); + CallOption::default().headers(metadata.build()) + } } async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Option) { @@ -782,7 +809,6 @@ async fn start( R: RaftExtension + Unpin + Send + 'static, { let mut last_wake_time = None; - let mut first_time = true; let backoff_duration = back_end.builder.cfg.value().raft_client_max_backoff.0; let mut addr_channel = None; loop { @@ -828,15 +854,10 @@ async fn start( // shutdown. 
back_end.clear_pending_message("unreachable"); - // broadcast is time consuming operation which would blocks raftstore, so report - // unreachable only once until being connected again. - if first_time { - first_time = false; - back_end - .builder - .router - .report_store_unreachable(back_end.store_id); - } + back_end + .builder + .router + .report_store_unreachable(back_end.store_id); continue; } else { debug!("connection established"; "store_id" => back_end.store_id, "addr" => %addr); @@ -868,7 +889,6 @@ async fn start( .router .report_store_unreachable(back_end.store_id); addr_channel = None; - first_time = false; } } } @@ -926,6 +946,7 @@ struct CachedQueue { /// raft_client.flush(); /// ``` pub struct RaftClient { + self_store_id: u64, pool: Arc>, cache: LruCache<(u64, usize), CachedQueue>, need_flush: Vec<(u64, usize)>, @@ -940,13 +961,14 @@ where S: StoreAddrResolver + Send + 'static, R: RaftExtension + Unpin + Send + 'static, { - pub fn new(builder: ConnectionBuilder) -> Self { + pub fn new(self_store_id: u64, builder: ConnectionBuilder) -> Self { let future_pool = Arc::new( yatp::Builder::new(thd_name!("raft-stream")) .max_thread_count(1) .build_future_pool(), ); RaftClient { + self_store_id, pool: Arc::default(), cache: LruCache::with_capacity_and_sample(0, 7), need_flush: vec![], @@ -982,6 +1004,7 @@ where queue.set_conn_state(ConnState::Paused); } let back_end = StreamBackEnd { + self_store_id: self.self_store_id, store_id, queue: queue.clone(), builder: self.builder.clone(), @@ -1143,6 +1166,7 @@ where { fn clone(&self) -> Self { RaftClient { + self_store_id: self.self_store_id, pool: self.pool.clone(), cache: LruCache::with_capacity_and_sample(0, 7), need_flush: vec![], diff --git a/src/server/server.rs b/src/server/server.rs index 45778835d29..8a50f44f363 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -176,7 +176,7 @@ where lazy_worker.scheduler(), grpc_thread_load.clone(), ); - let raft_client = RaftClient::new(conn_builder); + let 
raft_client = RaftClient::new(store_id, conn_builder); let trans = ServerTransport::new(raft_client); health_service.set_serving_status("", ServingStatus::NotServing); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 2c77ee4e0bd..9895067fcb3 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -21,7 +21,7 @@ use raft::eraftpb::MessageType; use raftstore::{ store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, - metrics::RAFT_ENTRIES_CACHES_GAUGE, + metrics::{MESSAGE_RECV_BY_STORE, RAFT_ENTRIES_CACHES_GAUGE}, CheckLeaderTask, }, Error as RaftStoreError, Result as RaftStoreResult, @@ -45,7 +45,7 @@ use crate::{ coprocessor_v2, forward_duplex, forward_unary, log_net_error, server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, metrics::*, snap::Task as SnapTask, - Error, Proxy, Result as ServerResult, + Error, MetadataSourceStoreId, Proxy, Result as ServerResult, }, storage::{ self, @@ -168,9 +168,23 @@ impl Service { ch.report_reject_message(id, peer_id); return Ok(()); } + + fail_point!("receive_raft_message_from_outside"); ch.feed(msg, false); Ok(()) } + + fn get_store_id_from_metadata(ctx: &RpcContext<'_>) -> Option { + let metadata = ctx.request_headers(); + for i in 0..metadata.len() { + let (key, value) = metadata.get(i).unwrap(); + if key == MetadataSourceStoreId::KEY { + let store_id = MetadataSourceStoreId::parse(value); + return Some(store_id); + } + } + None + } } macro_rules! 
handle_request { @@ -636,6 +650,14 @@ impl Tikv for Service { stream: RequestStream, sink: ClientStreamingSink, ) { + let source_store_id = Self::get_store_id_from_metadata(&ctx); + let message_received = + source_store_id.map(|x| MESSAGE_RECV_BY_STORE.with_label_values(&[&format!("{}", x)])); + info!( + "raft RPC is called, new gRPC stream established"; + "source_store_id" => ?source_store_id, + ); + let store_id = self.store_id; let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; @@ -652,6 +674,9 @@ impl Tikv for Service { // `StoreNotMatch` to let tikv to resolve a correct address from PD return Err(Error::from(err)); } + if let Some(ref counter) = message_received { + counter.inc(); + } } Ok::<(), Error>(()) }; @@ -678,7 +703,14 @@ impl Tikv for Service { stream: RequestStream, sink: ClientStreamingSink, ) { - info!("batch_raft RPC is called, new gRPC stream established"); + let source_store_id = Self::get_store_id_from_metadata(&ctx); + let message_received = + source_store_id.map(|x| MESSAGE_RECV_BY_STORE.with_label_values(&[&format!("{}", x)])); + info!( + "batch_raft RPC is called, new gRPC stream established"; + "source_store_id" => ?source_store_id, + ); + let store_id = self.store_id; let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; @@ -699,6 +731,9 @@ impl Tikv for Service { return Err(Error::from(err)); } } + if let Some(ref counter) = message_received { + counter.inc_by(len as u64); + } } Ok::<(), Error>(()) }; diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 6bbed4ac641..4dc404e58b8 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -81,3 +81,54 @@ fn test_break_leadership_on_restart() { // incorrectly. 
rx.recv_timeout(Duration::from_secs(2)).unwrap_err(); } + +// This case creates a cluster with 3 TiKV instances, and then wait all peers +// hibernate. +// +// After that, propose a command and stop the leader node immediately. +// With failpoint `receive_raft_message_from_outside`, we can make the proposal +// reach 2 followers *after* `StoreUnreachable` is broadcasted. +// +// 2 followers may become GroupState::Chaos after `StoreUnreachable` is +// received, and become `GroupState::Ordered` after the proposal is received. +// But they should keep wakeful for a while. +#[test] +fn test_store_disconnect_with_hibernate() { + let mut cluster = new_server_cluster(0, 3); + let base_tick_ms = 50; + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + cluster.cfg.raft_store.raft_heartbeat_ticks = 2; + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + cluster.cfg.raft_store.unreachable_backoff = ReadableDuration::millis(500); + cluster.cfg.server.raft_client_max_backoff = ReadableDuration::millis(200); + // So the random election timeout will always be 10, which makes the case more + // stable. + cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; + configure_for_hibernate(&mut cluster); + cluster.pd_client.disable_default_operator(); + let r = cluster.run_conf_change(); + cluster.pd_client.must_add_peer(r, new_peer(2, 2)); + cluster.pd_client.must_add_peer(r, new_peer(3, 3)); + + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + // Wait until all peers of region 1 hibernate. + thread::sleep(Duration::from_millis(base_tick_ms * 30)); + + // Stop the region leader. 
+ fail::cfg("receive_raft_message_from_outside", "pause").unwrap(); + let _ = cluster.async_put(b"k2", b"v2").unwrap(); + cluster.stop_node(1); + + // Wait for a while so that the failpoint can be triggered on followers. + thread::sleep(Duration::from_millis(100)); + fail::remove("receive_raft_message_from_outside"); + + // Wait for a while. Peers of region 1 shouldn't hibernate. + thread::sleep(Duration::from_millis(base_tick_ms * 30)); + must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); + must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); +} diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 3d7fc235cad..972a75212b4 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -80,7 +80,7 @@ fn test_tombstone(cluster: &mut Cluster) { raft_msg.set_region_id(r1); // Use an invalid from peer to ignore gc peer message. - raft_msg.set_from_peer(new_peer(0, 0)); + raft_msg.set_from_peer(new_peer(100, 100)); raft_msg.set_to_peer(new_peer(2, 2)); raft_msg.mut_region_epoch().set_conf_ver(0); raft_msg.mut_region_epoch().set_version(0); diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index fa7a86f12c4..aad9ab7ceb1 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -19,7 +19,7 @@ use kvproto::{ tikvpb::BatchRaftMessage, }; use raft::eraftpb::Entry; -use raftstore::{errors::DiscardReason, store::StoreMsg}; +use raftstore::errors::DiscardReason; use tikv::server::{ self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, @@ -28,7 +28,6 @@ use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ config::{ReadableDuration, VersionTrack}, worker::{Builder as WorkerBuilder, LazyWorker}, - Either, }; use super::*; @@ -73,7 +72,7 @@ where 
worker.scheduler(), loads, ); - RaftClient::new(builder) + RaftClient::new(0, builder) } fn get_raft_client_by_port(port: u16) -> RaftClient { @@ -206,59 +205,6 @@ fn test_raft_client_reconnect() { drop(mock_server); } -#[test] -// Test raft_client reports store unreachable only once until being connected -// again -fn test_raft_client_report_unreachable() { - let msg_count = Arc::new(AtomicUsize::new(0)); - let batch_msg_count = Arc::new(AtomicUsize::new(0)); - let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); - let (mut mock_server, port) = create_mock_server(service, 60100, 60200).unwrap(); - - let (tx, rx) = mpsc::channel(); - let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); - let router = TestRaftStoreRouter::new(tx, significant_msg_sender); - let wrap = RaftRouterWrap::new(router); - let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); - - // server is disconnected - mock_server.shutdown(); - drop(mock_server); - - raft_client.send(RaftMessage::default()).unwrap(); - let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); - if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { - assert_eq!(store_id, 0); - } else { - panic!("expect StoreUnreachable"); - } - // no more unreachable message is sent until it's connected again. - rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); - - // restart the mock server. 
- let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); - let mut mock_server = create_mock_server_on(service, port); - - // make sure the connection is connected, otherwise the following sent messages - // may be dropped - std::thread::sleep(Duration::from_millis(200)); - (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); - raft_client.flush(); - check_msg_count(500, &msg_count, 50); - - // server is disconnected - mock_server.take().unwrap().shutdown(); - - let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); - if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { - assert_eq!(store_id, 0); - } else { - panic!("expect StoreUnreachable"); - } - // no more unreachable message is sent until it's connected again. - rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); -} - #[test] fn test_batch_size_limit() { let msg_count = Arc::new(AtomicUsize::new(0)); From 0de1123800389db278b666a4180b1984d3407338 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Mon, 24 Apr 2023 16:46:45 +0800 Subject: [PATCH 660/676] raftstore-v2: prevent resolving store 0 (#14645) * raftstore-v2: prevent resolving store 0 Do not cache invaild peer otherwise it may send raft message to store 0 during region split. 
Signed-off-by: Neil Shen * address comments Signed-off-by: Neil Shen --------- Signed-off-by: Neil Shen --- components/raftstore-v2/src/operation/life.rs | 56 +++++++++--- .../src/operation/ready/apply_trace.rs | 5 +- .../raftstore-v2/src/operation/ready/mod.rs | 16 +++- .../test_raftstore/src/transport_simulate.rs | 8 +- tests/failpoints/cases/test_split_region.rs | 90 ++++++++++++++++++- .../integrations/raftstore/test_hibernate.rs | 6 +- 6 files changed, 155 insertions(+), 26 deletions(-) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 9d3a32f8f72..c9145e909d1 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -213,6 +213,26 @@ fn check_if_to_peer_destroyed( Ok(false) } +// An empty raft message for creating peer fsm. +fn empty_split_message(store_id: u64, region: &Region) -> Box { + let mut raft_msg = Box::::default(); + raft_msg.set_region_id(region.get_id()); + raft_msg.set_region_epoch(region.get_region_epoch().clone()); + raft_msg.set_to_peer( + region + .get_peers() + .iter() + .find(|p| p.get_store_id() == store_id) + .unwrap() + .clone(), + ); + raft_msg +} + +pub fn is_empty_split_message(msg: &RaftMessage) -> bool { + !msg.has_from_peer() && msg.has_to_peer() && msg.has_region_epoch() && !msg.has_message() +} + impl Store { /// The method is called during split. 
/// The creation process is: @@ -230,17 +250,31 @@ impl Store { { let derived_region_id = msg.derived_region_id; let region_id = msg.region.id; - let mut raft_msg = Box::::default(); - raft_msg.set_region_id(region_id); - raft_msg.set_region_epoch(msg.region.get_region_epoch().clone()); - raft_msg.set_to_peer( - msg.region - .get_peers() - .iter() - .find(|p| p.get_store_id() == self.store_id()) - .unwrap() - .clone(), - ); + let raft_msg = empty_split_message(self.store_id(), &msg.region); + + (|| { + fail::fail_point!( + "on_store_2_split_init_race_with_initial_message", + self.store_id() == 2, + |_| { + let mut initial_msg = raft_msg.clone(); + initial_msg.set_from_peer( + msg.region + .get_peers() + .iter() + .find(|p| p.get_store_id() != self.store_id()) + .unwrap() + .clone(), + ); + let m = initial_msg.mut_message(); + m.set_msg_type(raft::prelude::MessageType::MsgRequestPreVote); + m.set_term(raftstore::store::RAFT_INIT_LOG_TERM); + m.set_index(raftstore::store::RAFT_INIT_LOG_INDEX); + assert!(util::is_initial_msg(initial_msg.get_message())); + self.on_raft_message(ctx, initial_msg); + } + ) + })(); // It will create the peer if it does not exist self.on_raft_message(ctx, raft_msg); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index f1a65fc1768..6c9c73479ba 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -406,13 +406,12 @@ impl Storage { } }; apply_state.set_applied_index(applied_index); - let mut reset_apply_index = || { + (|| { // Make node reply from start. 
fail_point!("RESET_APPLY_INDEX_WHEN_RESTART", |_| { apply_state.set_applied_index(5); }); - }; - reset_apply_index(); + })(); Self::create( store_id, diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index f63d9c97b86..009b31921b3 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -57,6 +57,7 @@ pub use self::{ use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store}, + operation::life::is_empty_split_message, raft::{Peer, Storage}, router::{PeerMsg, PeerTick}, worker::tablet, @@ -289,10 +290,14 @@ impl Peer { // ranges with other peers. let from_peer = msg.take_from_peer(); let from_peer_id = from_peer.get_id(); - if self.is_leader() && from_peer.get_id() != INVALID_ID { - self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); + if from_peer_id != INVALID_ID { + if self.is_leader() { + self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); + } + // We only cache peer with an vaild ID. + // It prevents cache peer(0,0) which is sent by region split. + self.insert_peer_cache(from_peer); } - self.insert_peer_cache(from_peer); let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) @@ -307,6 +312,11 @@ impl Peer { ctx.raft_metrics.message_dropped.stale_msg.inc(); return; } + // As this peer is already created, the empty split message is meaningless. 
+ if is_empty_split_message(&msg) { + ctx.raft_metrics.message_dropped.stale_msg.inc(); + return; + } if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index a49a41af4e3..ef569e3987a 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -831,18 +831,18 @@ impl Filter for LeaseReadFilter { #[derive(Clone)] pub struct DropMessageFilter { - ty: MessageType, + retain: Arc bool + Sync + Send>, } impl DropMessageFilter { - pub fn new(ty: MessageType) -> DropMessageFilter { - DropMessageFilter { ty } + pub fn new(retain: Arc bool + Sync + Send>) -> DropMessageFilter { + DropMessageFilter { retain } } } impl Filter for DropMessageFilter { fn before(&self, msgs: &mut Vec) -> Result<()> { - msgs.retain(|m| m.get_message().get_msg_type() != self.ty); + msgs.retain(|m| (self.retain)(m)); Ok(()) } } diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 792a21217ad..096bbc12ed8 100644 --- a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -17,14 +17,16 @@ use kvproto::{ Mutation, Op, PessimisticLockRequest, PrewriteRequest, PrewriteRequestPessimisticAction::*, }, metapb::Region, - raft_serverpb::RaftMessage, + raft_serverpb::{PeerState, RaftMessage}, tikvpb::TikvClient, }; use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::{ store::{ - config::Config as RaftstoreConfig, util::is_vote_msg, Callback, PeerMsg, WriteResponse, + config::Config as RaftstoreConfig, + util::{is_initial_msg, is_vote_msg}, + Callback, PeerMsg, WriteResponse, }, Result, }; @@ -32,6 +34,8 @@ use test_raftstore::*; use tikv::storage::{kv::SnapshotExt, Snapshot}; use tikv_util::{ config::{ReadableDuration, ReadableSize}, + 
mpsc::{unbounded, Sender}, + time::Instant, HandyRwLock, }; use txn_types::{Key, PessimisticLock}; @@ -1187,3 +1191,85 @@ fn test_split_race_with_conf_change() { cluster.must_put(b"k06", b"val"); assert_eq!(cluster.must_get(b"k06").unwrap(), b"val".to_vec()); } + +// split init races with request prevote should not send messages to store 0. +// +// 1. split region. +// 2. send split init to store because peer is no exist. +// 3. store receives request prevote from normal peer. +// 4. store receives split init. +// 5. store creates peer via request prevote. +// 6. store sends empty raft message to peer. +// 7. store sends split init to peer. +// 7. peer inserts peer(0,0) to cache and step the empty meassge. +// 8. peer handles split snapshot from split init and response to peer(0,0). +// 9. transport tries to resolve store 0. +// +// We must prevent peer incorrectly inserting peer(0,0) to cache and send +// messages to store 0. +#[test] +fn test_split_init_race_with_initial_msg_v2() { + // test case for raftstore-v2 + use test_raftstore_v2::*; + + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + let split_key1 = b"k01"; + let region = cluster.get_region(split_key1); + cluster.must_transfer_leader( + region.get_id(), + region + .get_peers() + .iter() + .find(|p| p.get_store_id() == 1) + .unwrap() + .to_owned(), + ); + + // Drop initial messages to store 2. + cluster.add_recv_filter_on_node( + 2, + Box::new(DropMessageFilter::new(Arc::new(|m| { + !is_initial_msg(m.get_message()) + }))), + ); + let (tx, rx) = unbounded(); + cluster.add_send_filter_on_node(2, Box::new(TeeFilter { pipe: tx })); + + fail::cfg("on_store_2_split_init_race_with_initial_message", "return").unwrap(); + cluster.must_split(®ion, split_key1); + + // Wait for store 2 split. 
+ let new_region = cluster.get_region(b"k00"); + let start = Instant::now(); + loop { + sleep_ms(500); + let region_state = cluster.region_local_state(new_region.get_id(), 2); + if region_state.get_state() == PeerState::Normal { + break; + } + if start.saturating_elapsed() > Duration::from_secs(5) { + panic!("timeout"); + } + } + cluster.clear_send_filter_on_node(2); + while let Ok(msg) = rx.recv_timeout(Duration::from_millis(500)) { + if msg.get_to_peer().get_store_id() == 0 { + panic!("must not send messages to store 0"); + } + } +} + +struct TeeFilter { + pipe: Sender, +} + +impl Filter for TeeFilter { + fn before(&self, msgs: &mut Vec) -> Result<()> { + for msg in msgs { + let _ = self.pipe.send(msg.clone()); + } + Ok(()) + } +} diff --git a/tests/integrations/raftstore/test_hibernate.rs b/tests/integrations/raftstore/test_hibernate.rs index 23c859a21bd..73156becb0d 100644 --- a/tests/integrations/raftstore/test_hibernate.rs +++ b/tests/integrations/raftstore/test_hibernate.rs @@ -199,9 +199,9 @@ fn test_transfer_leader_delay() { ); cluster.clear_send_filters(); - cluster.add_send_filter(CloneFilterFactory(DropMessageFilter::new( - MessageType::MsgTimeoutNow, - ))); + cluster.add_send_filter(CloneFilterFactory(DropMessageFilter::new(Arc::new(|m| { + m.get_message().get_msg_type() != MessageType::MsgTimeoutNow + })))); let router = cluster.sim.wl().get_router(1).unwrap(); router .send_raft_message(messages.lock().unwrap().pop().unwrap()) From 1674d3c487063425cfe865e047036daa22ec07de Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Mon, 24 Apr 2023 18:18:47 +0800 Subject: [PATCH 661/676] raftstore: delete tablet snap if exists (#14647) * done Signed-off-by: Spade A * add panic Signed-off-by: Spade A --------- Signed-off-by: Spade A Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore/src/store/snap.rs | 9 +++++++-- tests/integrations/raftstore/test_snap.rs | 11 
+++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 12440abb5d0..4f347002f67 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -20,8 +20,8 @@ use engine_traits::{CfName, EncryptionKeyManager, KvEngine, CF_DEFAULT, CF_LOCK, use error_code::{self, ErrorCode, ErrorCodeExt}; use fail::fail_point; use file_system::{ - calc_crc32, calc_crc32_and_size, delete_file_if_exist, file_exists, get_file_size, sync_dir, - File, Metadata, OpenOptions, + calc_crc32, calc_crc32_and_size, delete_dir_if_exist, delete_file_if_exist, file_exists, + get_file_size, sync_dir, File, Metadata, OpenOptions, }; use keys::{enc_end_key, enc_start_key}; use kvproto::{ @@ -1008,6 +1008,11 @@ impl Snapshot { } } } + if let Some(ref meta) = self.meta_file.meta { + if !meta.tablet_snap_path.is_empty() { + delete_dir_if_exist(&meta.tablet_snap_path).unwrap(); + } + } delete_file_if_exist(&self.meta_file.path).unwrap(); if self.hold_tmp_files { delete_file_if_exist(&self.meta_file.tmp_path).unwrap(); diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index f474b5cdb8e..a620bb3a990 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -925,6 +925,7 @@ fn test_v1_apply_snap_from_v2() { let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + cluster_v1.cfg.raft_store.snap_mgr_gc_tick_interval = ReadableDuration::millis(200); let observer = MockApplySnapshotObserver { tablet_snap_paths: Arc::default(), @@ -1012,6 +1013,16 @@ fn test_v1_apply_snap_from_v2() { let path_str = path.as_path().to_str().unwrap(); check_observer(&observer, region_id, path_str); + + // Verify that the tablet snap will be gced + for _ in 0..10 
{ + if !path.exists() { + return; + } + std::thread::sleep(Duration::from_millis(200)); + } + + panic!("tablet snap {:?} still exists", path_str); } fn check_observer(observer: &MockApplySnapshotObserver, region_id: u64, snap_path: &str) { From 84955b61cc7eddd447c7f07cdce05b9f703e0969 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 24 Apr 2023 17:23:02 -0700 Subject: [PATCH 662/676] status_api: add get_engine_type api Signed-off-by: tonyxuqqi --- src/config/mod.rs | 7 +++++ src/server/status_server/mod.rs | 53 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/src/config/mod.rs b/src/config/mod.rs index 5d20b027c4e..a44cf20c066 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -4323,6 +4323,13 @@ impl ConfigController { pub fn get_current(&self) -> TikvConfig { self.inner.read().unwrap().current.clone() } + + pub fn get_engine_type(&self) -> &'static str { + if self.get_current().storage.engine == EngineType::RaftKv2 { + return "partitioned-raft-kv"; + } + return "raft-kv"; + } } #[cfg(test)] diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 2ce7a8714c0..1b689138f11 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -411,6 +411,16 @@ where } } + async fn get_engine_type(cfg_controller: &ConfigController) -> hyper::Result> { + let engine_type = cfg_controller.get_engine_type(); + let response = Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("Content-Length", engine_type.len()) + .body(engine_type.into()) + .unwrap(); + Ok(response) + } + pub fn stop(self) { let _ = self.tx.send(()); self.thread_pool.shutdown_timeout(Duration::from_secs(3)); @@ -609,6 +619,9 @@ where (Method::POST, "/config") => { Self::update_config(cfg_controller.clone(), req).await } + (Method::GET, "/engine_type") => { + Self::get_engine_type(&cfg_controller).await + } // This interface is used for configuration file hosting scenarios, // TiKV will 
not update configuration files, and this interface will // silently ignore configration items that cannot be updated online, @@ -1024,6 +1037,7 @@ mod tests { use crate::{ config::{ConfigController, TikvConfig}, server::status_server::{profile::TEST_PROFILE_MUTEX, LogLevelRequest, StatusServer}, + storage::config::EngineType, }; #[derive(Clone)] @@ -1573,4 +1587,43 @@ mod tests { block_on(handle).unwrap(); status_server.stop(); } + + #[test] + fn test_get_engine_type() { + let mut multi_rocks_cfg = TikvConfig::default(); + multi_rocks_cfg.storage.engine = EngineType::RaftKv2; + let cfgs = [TikvConfig::default(), multi_rocks_cfg]; + let resp_strs = ["raft-kv", "partitioned-raft-kv"]; + for (cfg, resp_str) in IntoIterator::into_iter(cfgs).zip(resp_strs) { + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut status_server = StatusServer::new( + 1, + ConfigController::new(cfg), + Arc::new(SecurityConfig::default()), + MockRouter, + temp_dir.path().to_path_buf(), + None, + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + let uri = Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/engine_type") + .build() + .unwrap(); + + let handle = status_server.thread_pool.spawn(async move { + let res = client.get(uri).await.unwrap(); + assert_eq!(res.status(), StatusCode::OK); + let body_bytes = hyper::body::to_bytes(res.into_body()).await.unwrap(); + let engine_type = String::from_utf8(body_bytes.as_ref().to_owned()).unwrap(); + assert_eq!(engine_type, resp_str); + }); + block_on(handle).unwrap(); + status_server.stop(); + } + } } From c44266730fb856145c46bc59ff5e0912d18744bb Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 24 Apr 2023 21:19:51 -0700 Subject: [PATCH 663/676] [raftstore-v2]: add the missed apply log duration and fix apply wait time (#14530) ref tikv/tikv#14321 Add the apply log duration metrics. 
Signed-off-by: tonyxuqqi --- .../raftstore-v2/src/operation/command/mod.rs | 22 ++++++++++++++----- .../src/operation/query/capture.rs | 4 +--- .../raftstore/src/store/local_metrics.rs | 5 +++-- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 2f2df5a0333..b9256f031fe 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -37,7 +37,9 @@ use raftstore::{ Proposal, }, local_metrics::RaftMetrics, - metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, + metrics::{ + APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM, STORE_APPLY_LOG_HISTOGRAM, + }, msg::ErrorCallback, util, Config, Transport, WriteCallback, }, @@ -100,7 +102,6 @@ pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for /// flow control. pub entry_and_proposals: Vec<(Entry, Vec)>, - pub committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { @@ -306,7 +307,6 @@ impl Peer { // memtables in kv engine is flushed. 
let apply = CommittedEntries { entry_and_proposals, - committed_time: Instant::now(), }; assert!( self.apply_scheduler().is_some() || ctx.router.is_shutdown(), @@ -517,14 +517,17 @@ impl Apply { #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); - APPLY_TASK_WAIT_TIME_HISTOGRAM - .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); + let now = std::time::Instant::now(); + let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { apply::notify_req_region_removed(self.region_id(), ch); continue; } if !e.get_data().is_empty() { + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_wait_time, |t| &mut t.metrics.apply_wait_nanos); + } let mut set_save_point = false; if let Some(wb) = &mut self.write_batch { wb.set_save_point(); @@ -787,7 +790,14 @@ impl Apply { let apply_time = APPLY_TIME_HISTOGRAM.local(); for (ch, resp) in callbacks.drain(..) 
{ for tracker in ch.write_trackers() { - tracker.observe(now, &apply_time, |t| &mut t.metrics.apply_time_nanos); + let mut apply_wait_nanos = 0_u64; + let apply_time_nanos = tracker.observe(now, &apply_time, |t| { + apply_wait_nanos = t.metrics.apply_wait_nanos; + &mut t.metrics.apply_time_nanos + }); + STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(Duration::from_nanos( + apply_time_nanos - apply_wait_nanos, + ))); } ch.set_result(resp); } diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 5fdbde187e4..5393dfacc98 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -193,7 +193,7 @@ mod test { }; use slog::o; use tempfile::TempDir; - use tikv_util::{store::new_peer, time::Instant, worker::dummy_scheduler}; + use tikv_util::{store::new_peer, worker::dummy_scheduler}; use super::*; use crate::{ @@ -357,7 +357,6 @@ mod test { ), vec![], )], - committed_time: Instant::now(), }), ApplyTask::CaptureApply(CaptureChange { observer: ChangeObserver::from_cdc(region.id, ObserveHandle::new()), @@ -376,7 +375,6 @@ mod test { ), vec![], )], - committed_time: Instant::now(), }), ]; diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 0e6a09cbf0b..baf63814416 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -246,11 +246,11 @@ impl TimeTracker { now: std::time::Instant, local_metric: &LocalHistogram, tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, - ) { + ) -> u64 { let dur = now.saturating_duration_since(self.start); local_metric.observe(dur.as_secs_f64()); if self.token == INVALID_TRACKER_TOKEN { - return; + return 0; } GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { let metric = tracker_metric(tracker); @@ -258,6 +258,7 @@ impl TimeTracker { *metric = dur.as_nanos() as u64; } 
}); + dur.as_nanos() as u64 } #[inline] From c7cf0c667bd5cbe453c7571760be199196f7ef13 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 25 Apr 2023 15:07:51 +0800 Subject: [PATCH 664/676] raftstore: make v1 learner compatible with gc peer (#14601) close tikv/tikv#14595 Make tiflash engine compatible with gc peer Signed-off-by: Neil Shen Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/raftstore-v2/src/batch/store.rs | 4 +- components/raftstore-v2/src/operation/life.rs | 70 ++-------- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- .../src/router/response_channel.rs | 18 ++- components/raftstore/src/store/fsm/life.rs | 92 +++++++++++++ components/raftstore/src/store/fsm/mod.rs | 1 + components/raftstore/src/store/fsm/peer.rs | 31 ++++- components/raftstore/src/store/fsm/store.rs | 18 +++ .../raftstore/src/store/worker/region.rs | 15 +-- tests/integrations/raftstore/mod.rs | 1 + tests/integrations/raftstore/test_life.rs | 126 ++++++++++++++++++ 11 files changed, 295 insertions(+), 83 deletions(-) create mode 100644 components/raftstore/src/store/fsm/life.rs create mode 100644 tests/integrations/raftstore/test_life.rs diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index a9e3c223943..1f6245cc010 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -1,6 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + cmp, ops::{Deref, DerefMut}, path::Path, sync::{ @@ -120,7 +121,8 @@ impl StoreContext { self.cfg.report_region_buckets_tick_interval.0; self.tick_batch[PeerTick::CheckLongUncommitted as usize].wait_duration = self.cfg.check_long_uncommitted_interval.0; - self.tick_batch[PeerTick::GcPeer as usize].wait_duration = Duration::from_secs(60); + self.tick_batch[PeerTick::GcPeer as usize].wait_duration = + 60 * cmp::min(Duration::from_secs(1), self.cfg.raft_base_tick_interval.0); } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index c9145e909d1..8b431ad3a98 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -36,7 +36,11 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, RaftCmdRequest}, raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage}, }; -use raftstore::store::{metrics::RAFT_PEER_PENDING_DURATION, util, Transport, WriteTask}; +use raftstore::store::{ + fsm::life::{build_peer_destroyed_report, forward_destroy_to_source_peer}, + metrics::RAFT_PEER_PENDING_DURATION, + util, Transport, WriteTask, +}; use slog::{debug, error, info, warn}; use tikv_util::{ store::find_peer, @@ -390,7 +394,9 @@ impl Store { if extra_msg.get_type() == ExtraMessageType::MsgGcPeerRequest && extra_msg.has_check_gc_peer() { - forward_destroy_to_source_peer(ctx, &msg); + forward_destroy_to_source_peer(&msg, |m| { + let _ = ctx.router.send_raft_message(m.into()); + }); return; } } @@ -467,62 +473,6 @@ impl Store { } } -/// Tell leader that `to_peer` from `tombstone_msg` is destroyed. 
-fn build_peer_destroyed_report(tombstone_msg: &mut RaftMessage) -> Option { - let to_region_id = if tombstone_msg.has_extra_msg() { - assert_eq!( - tombstone_msg.get_extra_msg().get_type(), - ExtraMessageType::MsgGcPeerRequest - ); - tombstone_msg - .get_extra_msg() - .get_check_gc_peer() - .get_from_region_id() - } else { - tombstone_msg.get_region_id() - }; - if to_region_id == 0 || tombstone_msg.get_from_peer().get_id() == 0 { - return None; - } - let mut msg = RaftMessage::default(); - msg.set_region_id(to_region_id); - msg.set_from_peer(tombstone_msg.take_to_peer()); - msg.set_to_peer(tombstone_msg.take_from_peer()); - msg.mut_extra_msg() - .set_type(ExtraMessageType::MsgGcPeerResponse); - Some(msg) -} - -/// Forward the destroy request from target peer to merged source peer. -fn forward_destroy_to_source_peer(ctx: &mut StoreContext, msg: &RaftMessage) -where - EK: KvEngine, - ER: RaftEngine, - T: Transport, -{ - let extra_msg = msg.get_extra_msg(); - // Instead of respond leader directly, send a message to target region to - // double check it's really destroyed. - let check_gc_peer = extra_msg.get_check_gc_peer(); - let mut tombstone_msg = Box::::default(); - tombstone_msg.set_region_id(check_gc_peer.get_check_region_id()); - tombstone_msg.set_from_peer(msg.get_from_peer().clone()); - tombstone_msg.set_to_peer(check_gc_peer.get_check_peer().clone()); - tombstone_msg.set_region_epoch(check_gc_peer.get_check_region_epoch().clone()); - tombstone_msg.set_is_tombstone(true); - // No need to set epoch as we don't know what it is. - // This message will not be handled by `on_gc_peer_request` due to - // `is_tombstone` being true. 
- tombstone_msg - .mut_extra_msg() - .set_type(ExtraMessageType::MsgGcPeerRequest); - tombstone_msg - .mut_extra_msg() - .mut_check_gc_peer() - .set_from_region_id(check_gc_peer.get_from_region_id()); - let _ = ctx.router.send_raft_message(tombstone_msg); -} - impl Peer { pub fn on_availability_request( &mut self, @@ -622,7 +572,9 @@ impl Peer { return; } - forward_destroy_to_source_peer(ctx, msg); + forward_destroy_to_source_peer(msg, |m| { + let _ = ctx.router.send_raft_message(m.into()); + }); } /// A peer confirms it's destroyed. diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 009b31921b3..5f294d7e5b6 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -399,7 +399,7 @@ impl Peer { } // Filling start and end key is only needed for being compatible with - // raftstore v1 tiflash engine. + // raftstore v1 learners (e.g. tiflash engine). // // There could be two cases: // - Target peer already exists but has not established communication with diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 97321aae9d1..c300b6d8726 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -273,6 +273,14 @@ impl BaseChannel { pub fn set_result(self, res: Res) { self.core.set_result(res); } + + pub fn with_callback(f: Box) -> (Self, BaseSubscriber) { + let (c, s) = pair(); + unsafe { + *c.core.before_set.get() = Some(f); + } + (c, s) + } } impl Drop for BaseChannel { @@ -616,16 +624,6 @@ impl QueryResChannel { pub fn pair() -> (Self, QueryResSubscriber) { pair() } - - pub fn with_callback( - f: Box, - ) -> (Self, QueryResSubscriber) { - let (c, s) = pair(); - unsafe { - *c.core.before_set.get() = Some(f); - } - (c, s) - } } impl ErrorCallback for QueryResChannel { diff --git 
a/components/raftstore/src/store/fsm/life.rs b/components/raftstore/src/store/fsm/life.rs new file mode 100644 index 00000000000..59aa8b316f0 --- /dev/null +++ b/components/raftstore/src/store/fsm/life.rs @@ -0,0 +1,92 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains functions that relates to peer liftime management and +//! are shared with raftstore and raftstore v2. + +use engine_traits::{KvEngine, CF_RAFT}; +use kvproto::raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}; + +use crate::store::util::is_epoch_stale; + +/// Tell leader that `to_peer` from `tombstone_msg` is destroyed. +pub fn build_peer_destroyed_report(tombstone_msg: &mut RaftMessage) -> Option { + let to_region_id = if tombstone_msg.has_extra_msg() { + assert_eq!( + tombstone_msg.get_extra_msg().get_type(), + ExtraMessageType::MsgGcPeerRequest + ); + tombstone_msg + .get_extra_msg() + .get_check_gc_peer() + .get_from_region_id() + } else { + tombstone_msg.get_region_id() + }; + if to_region_id == 0 || tombstone_msg.get_from_peer().get_id() == 0 { + return None; + } + let mut msg = RaftMessage::default(); + msg.set_region_id(to_region_id); + msg.set_from_peer(tombstone_msg.take_to_peer()); + msg.set_to_peer(tombstone_msg.take_from_peer()); + msg.mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerResponse); + Some(msg) +} + +/// Forward the destroy request from target peer to merged source peer. +pub fn forward_destroy_to_source_peer(msg: &RaftMessage, forward: T) { + let extra_msg = msg.get_extra_msg(); + // Instead of respond leader directly, send a message to target region to + // double check it's really destroyed. 
+ let check_gc_peer = extra_msg.get_check_gc_peer(); + let mut tombstone_msg = RaftMessage::default(); + tombstone_msg.set_region_id(check_gc_peer.get_check_region_id()); + tombstone_msg.set_from_peer(msg.get_from_peer().clone()); + tombstone_msg.set_to_peer(check_gc_peer.get_check_peer().clone()); + tombstone_msg.set_region_epoch(check_gc_peer.get_check_region_epoch().clone()); + tombstone_msg.set_is_tombstone(true); + // No need to set epoch as we don't know what it is. + // This message will not be handled by `on_gc_peer_request` due to + // `is_tombstone` being true. + tombstone_msg + .mut_extra_msg() + .set_type(ExtraMessageType::MsgGcPeerRequest); + tombstone_msg + .mut_extra_msg() + .mut_check_gc_peer() + .set_from_region_id(check_gc_peer.get_from_region_id()); + forward(tombstone_msg); +} + +pub fn handle_tombstone_message_on_learner( + engine: &EK, + store_id: u64, + mut msg: RaftMessage, +) -> Option { + let region_id = msg.get_region_id(); + let region_state_key = keys::region_state_key(region_id); + let local_state: RegionLocalState = match engine.get_msg_cf(CF_RAFT, ®ion_state_key) { + Ok(Some(s)) => s, + e => panic!( + "[store {}] failed to get regions state of {:?}: {:?}", + store_id, + msg.get_region_id(), + e + ), + }; + + if local_state.get_state() != PeerState::Tombstone { + return None; + } + + // In v2, we rely on leader to confirm destroy actively. + let local_epoch = local_state.get_region().get_region_epoch(); + // The region in this peer is already destroyed + if msg.get_region_epoch() == local_epoch || is_epoch_stale(msg.get_region_epoch(), local_epoch) + { + return build_peer_destroyed_report(&mut msg); + } + + None +} diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 6f51c97c0d5..f342c1ec733 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -5,6 +5,7 @@ //! stores. They are mixed for now, will be separated in the future. 
pub mod apply; +pub mod life; mod metrics; mod peer; pub mod store; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index f2d1c7ffc0e..72eb3c59753 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -64,6 +64,7 @@ use tracker::GLOBAL_TRACKERS; use txn_types::WriteBatchFlags; use self::memtrace::*; +use super::life::forward_destroy_to_source_peer; #[cfg(any(test, feature = "testexport"))] use crate::store::PeerInternalStat; use crate::{ @@ -2740,6 +2741,25 @@ where } } + // In v1, gc_peer_request is handled to be compatible with v2. + // Note: it needs to be consistent with Peer::on_gc_peer_request in v2. + fn on_gc_peer_request(&mut self, msg: RaftMessage) { + let extra_msg = msg.get_extra_msg(); + + if !extra_msg.has_check_gc_peer() || extra_msg.get_index() == 0 { + // Corrupted message. + return; + } + if self.fsm.peer.get_store().applied_index() < extra_msg.get_index() { + // Merge not finish. + return; + } + + forward_destroy_to_source_peer(&msg, |m| { + let _ = self.ctx.router.send_raft_message(m); + }); + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { @@ -2795,10 +2815,15 @@ where ExtraMessageType::MsgVoterReplicatedIndexResponse => { self.on_voter_replicated_index_response(msg.get_extra_msg()); } + ExtraMessageType::MsgGcPeerRequest => { + // To make learner (e.g. tiflash engine) compatiable with raftstore v2, + // it needs to response GcPeerResponse. + if self.ctx.cfg.enable_v2_compatible_learner { + self.on_gc_peer_request(msg); + } + } // It's v2 only message and ignore does no harm. 
- ExtraMessageType::MsgGcPeerRequest - | ExtraMessageType::MsgGcPeerResponse - | ExtraMessageType::MsgFlushMemtable => (), + ExtraMessageType::MsgGcPeerResponse | ExtraMessageType::MsgFlushMemtable => (), } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c64b2a53c37..03c0688e8f2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -78,6 +78,7 @@ use crate::{ config::Config, fsm::{ create_apply_batch_system, + life::handle_tombstone_message_on_learner, metrics::*, peer::{ maybe_destroy_source, new_admin_request, PeerFsm, PeerFsmDelegate, SenderFsmPair, @@ -2072,6 +2073,23 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER .inc(); return Ok(()); } + + // To make learner (e.g. tiflash engine) compatiable with raftstore v2, + // it needs to response GcPeerResponse. + if msg.get_is_tombstone() && self.ctx.cfg.enable_v2_compatible_learner { + if let Some(msg) = + handle_tombstone_message_on_learner(&self.ctx.engines.kv, self.fsm.store.id, msg) + { + let _ = self.ctx.trans.send(msg); + } + // else { + // TODO: we should create the peer and destroy immediately to leave + // a tombstone record, otherwise it leaks removed_record + // and merged_record. + // } + return Ok(()); + } + if msg.get_is_tombstone() || msg.has_merge_target() { // Target tombstone peer doesn't exist, so ignore it. 
return Ok(()); diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index d889047a0f9..0696e70b766 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -4,7 +4,7 @@ use std::{ collections::{ BTreeMap, Bound::{Excluded, Included, Unbounded}, - HashMap, VecDeque, + VecDeque, }, fmt::{self, Display, Formatter}, sync::{ @@ -16,6 +16,7 @@ use std::{ u64, }; +use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Mutable, Range, WriteBatch, CF_LOCK, CF_RAFT}; use fail::fail_point; use file_system::{IoType, WithIoType}; @@ -803,14 +804,10 @@ where } else { let is_tiflash = self.pd_client.as_ref().map_or(false, |pd_client| { if let Ok(s) = pd_client.get_store(to_store_id) { - if let Some(_l) = s.get_labels().iter().find(|l| { - l.key.to_lowercase() == ENGINE - && l.value.to_lowercase() == TIFLASH - }) { - return true; - } else { - return false; - } + return s.get_labels().iter().any(|label| { + label.get_key().to_lowercase() == ENGINE + && label.get_value().to_lowercase() == TIFLASH + }); } true }); diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index 08657f7e75a..5f6703afe05 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -11,6 +11,7 @@ mod test_flashback; mod test_hibernate; mod test_joint_consensus; mod test_lease_read; +mod test_life; mod test_merge; mod test_multi; mod test_prevote; diff --git a/tests/integrations/raftstore/test_life.rs b/tests/integrations/raftstore/test_life.rs new file mode 100644 index 00000000000..de394325f08 --- /dev/null +++ b/tests/integrations/raftstore/test_life.rs @@ -0,0 +1,126 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{Arc, Mutex}, + time::Duration, +}; + +use kvproto::raft_serverpb::{PeerState, RaftMessage}; +use raftstore::errors::Result; +use test_raftstore::{new_learner_peer, sleep_ms, Filter, FilterFactory, Simulator as S1}; +use test_raftstore_v2::Simulator as S2; +use tikv_util::time::Instant; + +struct ForwardFactory { + node_id: u64, + chain_send: Arc, +} + +impl FilterFactory for ForwardFactory { + fn generate(&self, _: u64) -> Vec> { + vec![Box::new(ForwardFilter { + node_id: self.node_id, + chain_send: self.chain_send.clone(), + })] + } +} + +struct ForwardFilter { + node_id: u64, + chain_send: Arc, +} + +impl Filter for ForwardFilter { + fn before(&self, msgs: &mut Vec) -> Result<()> { + for m in msgs.drain(..) { + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m); + } + } + Ok(()) + } +} + +// Create two clusters in v1 and v2, mock tiflash engine by adding tiflash +// labels to v1 cluster. Forwards v2 leader messages to v1 learner, and v1 +// learner messages to v2 leaders. +// Make sure when removing learner, v2 leader can clean up removed_record and +// merged_record eventually. +#[test] +fn test_gc_peer_tiflash_engine() { + let mut cluster_v1 = test_raftstore::new_node_cluster(1, 2); + let mut cluster_v2 = test_raftstore_v2::new_node_cluster(1, 2); + cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + cluster_v1.pd_client.disable_default_operator(); + cluster_v2.pd_client.disable_default_operator(); + let r11 = cluster_v1.run_conf_change(); + let r21 = cluster_v2.run_conf_change(); + + // Add learner (2, 10). + cluster_v1 + .pd_client + .must_add_peer(r11, new_learner_peer(2, 10)); + cluster_v2 + .pd_client + .must_add_peer(r21, new_learner_peer(2, 10)); + // Make sure learner states are match. 
+ let start = Instant::now(); + loop { + if cluster_v1.get_raft_local_state(r11, 2).is_some() + && cluster_v1.get_raft_local_state(r11, 2) == cluster_v2.get_raft_local_state(r21, 2) + && cluster_v1.region_local_state(r11, 2).state == PeerState::Normal + && cluster_v2.region_local_state(r21, 2).state == PeerState::Normal + && cluster_v1.apply_state(r11, 2).truncated_state + == cluster_v2.apply_state(r21, 2).truncated_state + { + break; + } + if start.saturating_elapsed() > Duration::from_secs(5) { + panic!("timeout"); + } + } + + let trans1 = Mutex::new(cluster_v1.sim.read().unwrap().get_router(2).unwrap()); + let trans2 = Mutex::new(cluster_v2.sim.read().unwrap().get_router(1).unwrap()); + + // For cluster 1, it intercepts msgs sent to leader node, and then + // forwards to cluster 2 leader node. + let factory1 = ForwardFactory { + node_id: 1, + chain_send: Arc::new(move |m| { + info!("send to trans2"; "msg" => ?m); + let _ = trans2.lock().unwrap().send_raft_message(Box::new(m)); + }), + }; + cluster_v1.add_send_filter(factory1); + // For cluster 2, it intercepts msgs sent to learner node, and then + // forwards to cluster 1 learner node. + let factory2 = ForwardFactory { + node_id: 2, + chain_send: Arc::new(move |m| { + info!("send to trans1"; "msg" => ?m); + let _ = trans1.lock().unwrap().send_raft_message(m); + }), + }; + cluster_v2.add_send_filter(factory2); + + cluster_v2 + .pd_client + .must_remove_peer(r21, new_learner_peer(2, 10)); + + // Make sure leader cleans up removed_records. 
+ let start = Instant::now(); + loop { + sleep_ms(500); + if cluster_v2 + .region_local_state(r21, 1) + .get_removed_records() + .is_empty() + { + break; + } + if start.saturating_elapsed() > Duration::from_secs(5) { + panic!("timeout"); + } + } +} From 5dc8360d9f778ba52e409fcee966a838257d38c2 Mon Sep 17 00:00:00 2001 From: Rustin Date: Tue, 25 Apr 2023 15:23:51 +0800 Subject: [PATCH 665/676] cdc: support filter lossy DDL changes (#14629) close tikv/tikv#14630 cdc: support filter lossy DDL changes. We don't need to send those changes downstream. Signed-off-by: hi-rustin Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/cdc/src/delegate.rs | 126 ++++++++++++++++++++++++++++-- components/cdc/src/initializer.rs | 37 +++++++-- components/cdc/src/lib.rs | 1 + components/cdc/src/txn_source.rs | 116 +++++++++++++++++++++++++++ 4 files changed, 269 insertions(+), 11 deletions(-) create mode 100644 components/cdc/src/txn_source.rs diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index c4212c426be..adca54dace0 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -39,6 +39,7 @@ use crate::{ metrics::*, old_value::{OldValueCache, OldValueCallback}, service::ConnId, + txn_source::TxnSource, Error, Result, }; @@ -550,8 +551,10 @@ impl Delegate { row_size = 0; } } - // if the `txn_source` is not 0 and we should filter it out, skip this event. - if row.txn_source != 0 && filter_loop { + let lossy_ddl_filter = TxnSource::is_lossy_ddl_reorg_source_set(row.txn_source); + let cdc_write_filter = + TxnSource::is_cdc_write_source_set(row.txn_source) && filter_loop; + if lossy_ddl_filter || cdc_write_filter { continue; } if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { @@ -648,6 +651,14 @@ impl Delegate { return Ok(()); } + // Filter the entries which are lossy DDL events. + // We don't need to send them to downstream. 
+ let entries = entries + .iter() + .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.txn_source)) + .cloned() + .collect::>(); + let downstreams = self.downstreams(); assert!( !downstreams.is_empty(), @@ -655,15 +666,15 @@ impl Delegate { self.region_id ); - // collect the change event cause by user write, which is `txn_source` = 0. - // for changefeed which only need the user write, send the `filtered`, or else, - // send them all. + // Collect the change event cause by user write, which cdc write source is not + // set. For changefeed which only need the user write, + // send the `filtered_entries`, or else, send them all. let mut filtered_entries = None; for downstream in downstreams { if downstream.filter_loop { let filtered = entries .iter() - .filter(|x| x.txn_source == 0) + .filter(|x| !TxnSource::is_cdc_write_source_set(x.txn_source)) .cloned() .collect::>(); if !filtered.is_empty() { @@ -692,9 +703,11 @@ impl Delegate { } else { downstream.observed_range.filter_entries(entries.clone()) }; + if entries_clone.is_empty() { return Ok(()); } + let event = Event { region_id, index, @@ -1468,6 +1481,107 @@ mod tests { assert_eq!(e.events[0].get_entries().get_entries().len(), 2, "{:?}", e); } + fn test_downstream_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { + // Create a new delegate that observes [a, f). 
+ let observed_range = ObservedRange::new( + Key::from_raw(b"a").into_encoded(), + Key::from_raw(b"f").into_encoded(), + ) + .unwrap(); + let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); + let mut delegate = Delegate::new(1, txn_extra_op); + assert!(delegate.handle.is_observing()); + + let mut map = HashMap::default(); + for k in b'a'..=b'e' { + let mut put = PutRequest::default(); + put.key = Key::from_raw(&[k]).into_encoded(); + put.cf = "lock".to_owned(); + let mut lock = Lock::new( + LockType::Put, + put.key.clone(), + 1.into(), + 10, + None, + TimeStamp::zero(), + 0, + TimeStamp::zero(), + ); + // Only the key `a` is a normal write. + if k != b'a' { + lock = lock.set_txn_source(txn_source.into()); + } + put.value = lock.to_bytes(); + delegate + .sink_txn_put( + put, + false, + &mut map, + |_: &mut EventRow, _: TimeStamp| Ok(()), + ) + .unwrap(); + } + assert_eq!(map.len(), 5); + + let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let downstream = Downstream { + id: DownstreamId::new(), + req_id: 1, + conn_id: ConnId::new(), + peer: String::new(), + region_epoch: RegionEpoch::default(), + sink: Some(sink), + state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + kv_api: ChangeDataRequestKvApi::TiDb, + filter_loop, + observed_range, + }; + delegate.add_downstream(downstream); + let entries = map.values().map(|(r, _)| r).cloned().collect(); + delegate + .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .unwrap(); + + let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.spawn(async move { + drain.forward(&mut tx).await.unwrap(); + }); + let (e, _) = recv_timeout(&mut rx, std::time::Duration::from_secs(5)) + .unwrap() + .unwrap(); + assert_eq!(e.events[0].get_entries().get_entries().len(), 1, "{:?}", e); + } + + #[test] + fn test_downstream_filter_cdc_write_entires() { + let mut txn_source = TxnSource::default(); + 
txn_source.set_cdc_write_source(1); + + test_downstream_txn_source_filter(txn_source, true); + } + + #[test] + fn test_downstream_filter_lossy_ddl_entires() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore some + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, true); + } + #[test] fn test_decode_rawkv() { let cases = vec![ diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 8f6f8ed38a7..c06b13424ba 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -591,6 +591,7 @@ mod tests { use tokio::runtime::{Builder, Runtime}; use super::*; + use crate::txn_source::TxnSource; struct ReceiverRunnable { tx: Sender, @@ -786,18 +787,16 @@ mod tests { worker.stop(); } - #[test] - fn test_initializer_filter_loop() { + fn test_initializer_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let mut total_bytes = 0; - for i in 10..100 { let (k, v) = (&[b'k', i], &[b'v', i]); total_bytes += k.len(); total_bytes += v.len(); let ts = TimeStamp::new(i as _); - must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, 1); + must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, txn_source.into()); } let snap = engine.snapshot(Default::default()).unwrap(); @@ -808,7 +807,7 @@ mod tests { buffer, engine.kv_engine(), 
ChangeDataRequestKvApi::TiDb, - true, + filter_loop, ); let th = pool.spawn(async move { initializer @@ -833,6 +832,34 @@ mod tests { worker.stop(); } + #[test] + fn test_initializer_cdc_write_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + + #[test] + fn test_initializer_lossy_ddl_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore all + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + // Test `hint_min_ts` works fine with `ExtraOp::ReadOldValue`. // Whether `DeltaScanner` emits correct old values or not is already tested by // another case `test_old_value_with_hint_min_ts`, so here we only care about diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs index 7d63bf5c115..c913cefb92e 100644 --- a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -13,6 +13,7 @@ pub mod metrics; mod observer; mod old_value; mod service; +mod txn_source; pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; pub use config::CdcConfigManager; diff --git a/components/cdc/src/txn_source.rs b/components/cdc/src/txn_source.rs new file mode 100644 index 00000000000..81dc9f95096 --- /dev/null +++ b/components/cdc/src/txn_source.rs @@ -0,0 +1,116 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// The bitmap: +// |RESERVED|LOSSY_DDL_REORG_SOURCE_BITS|CDC_WRITE_SOURCE_BITS| +// | 48 | 8 | 4(RESERVED) | 4 | +// +// TiCDC uses 1 - 255 to indicate the source of TiDB. +// For now, 1 - 15 are reserved for TiCDC to implement BDR synchronization. +// 16 - 255 are reserved for extendability. +const CDC_WRITE_SOURCE_BITS: u64 = 8; +const CDC_WRITE_SOURCE_MAX: u64 = (1 << CDC_WRITE_SOURCE_BITS) - 1; + +// TiCDC uses 1-255 to indicate the change from a lossy DDL reorg Backfill job. +// For now, we only use 1 for column reorg backfill job. +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_BITS: u64 = 8; +#[cfg(test)] +const LOSSY_DDL_COLUMN_REORG_SOURCE: u64 = 1; +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_MAX: u64 = (1 << LOSSY_DDL_REORG_SOURCE_BITS) - 1; +const LOSSY_DDL_REORG_SOURCE_SHIFT: u64 = CDC_WRITE_SOURCE_BITS; + +/// For kv.TxnSource +/// We use an uint64 to represent the source of a transaction. +/// The first 8 bits are reserved for TiCDC, and the next 8 bits are reserved +/// for Lossy DDL reorg Backfill job. The remaining 48 bits are reserved for +/// extendability. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +pub(crate) struct TxnSource(u64); + +impl TxnSource { + #[cfg(test)] + pub(crate) fn set_cdc_write_source(&mut self, value: u64) { + if value > CDC_WRITE_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value; + } + + #[cfg(test)] + pub(crate) fn get_cdc_write_source(&self) -> u64 { + self.0 & CDC_WRITE_SOURCE_MAX + } + + pub(crate) fn is_cdc_write_source_set(txn_source: u64) -> bool { + (txn_source & CDC_WRITE_SOURCE_MAX) != 0 + } + + #[cfg(test)] + pub(crate) fn set_lossy_ddl_reorg_source(&mut self, value: u64) { + if value > LOSSY_DDL_REORG_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value << LOSSY_DDL_REORG_SOURCE_SHIFT; + } + + #[cfg(test)] + pub(crate) fn get_lossy_ddl_reorg_source(&self) -> u64 { + (self.0 >> LOSSY_DDL_REORG_SOURCE_SHIFT) & LOSSY_DDL_REORG_SOURCE_MAX + } + + pub(crate) fn is_lossy_ddl_reorg_source_set(txn_source: u64) -> bool { + (txn_source >> LOSSY_DDL_REORG_SOURCE_SHIFT) != 0 + } +} + +impl From for u64 { + fn from(val: TxnSource) -> Self { + val.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_cdc_write_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(txn_source.get_cdc_write_source(), 1); + } + + #[test] + fn test_is_cdc_write_source_set() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), false); + } + + #[test] + fn test_get_lossy_ddl_reorg_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!( + txn_source.get_lossy_ddl_reorg_source(), + LOSSY_DDL_COLUMN_REORG_SOURCE + ); + } + + #[test] + fn test_is_lossy_ddl_reorg_source_set() { + let mut txn_source = 
TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!(TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!( + TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), + false + ); + } +} From eb2ad9865a6f321a3612b07ca723436b99ea0255 Mon Sep 17 00:00:00 2001 From: Lucas Date: Wed, 26 Apr 2023 13:07:51 +0800 Subject: [PATCH 666/676] cloud: update dependencies on Azure. (#14610) close tikv/tikv#14609 Update the Azure SDK to latest version to support later developments. Signed-off-by: LykxSassinator --- Cargo.lock | 427 +++++++++++++++++++-------- components/cloud/azure/Cargo.toml | 13 +- components/cloud/azure/src/azblob.rs | 142 +++++---- 3 files changed, 413 insertions(+), 169 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bda2a12187d..48360c51100 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -175,6 +175,15 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-lock" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa24f727524730b077666307f2734b4a1a1c57acb79193127dcc8914d5242dd7" +dependencies = [ + "event-listener", +] + [[package]] name = "async-speed-limit" version = "0.4.0" @@ -229,18 +238,6 @@ dependencies = [ "syn", ] -[[package]] -name = "async-timer" -version = "1.0.0-beta.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" -dependencies = [ - "error-code", - "libc 0.2.139", - "wasm-bindgen", - "winapi 0.3.9", -] - [[package]] name = "async-trait" version = "0.1.58" @@ -282,7 +279,7 @@ name = "aws" version = "0.0.1" dependencies = [ "async-trait", - "base64", + "base64 0.13.0", "bytes", "cloud", "fail", @@ -364,91 +361,113 @@ dependencies = [ "azure_core", "azure_identity", "azure_storage", - "base64", - "chrono", + "azure_storage_blobs", + "base64 0.13.0", "cloud", "futures 0.3.15", 
"futures-util", "kvproto", "lazy_static", "oauth2", + "openssl", "regex", + "serde", + "serde_json", "slog", "slog-global", "tikv_util", + "time 0.3.20", "tokio", "url", + "uuid 1.2.1", ] [[package]] name = "azure_core" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ "async-trait", - "base64", + "base64 0.21.0", "bytes", - "chrono", "dyn-clone", "futures 0.3.15", "getrandom 0.2.3", - "http", + "http-types", "log", - "oauth2", + "paste", + "pin-project", + "quick-xml 0.28.2", "rand 0.8.5", "reqwest", "rustc_version 0.4.0", "serde", - "serde_derive", "serde_json", - "thiserror", + "time 0.3.20", "url", - "uuid 0.8.2", + "uuid 1.2.1", ] [[package]] name = "azure_identity" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ - "async-timer", + "async-lock", "async-trait", "azure_core", - "chrono", + "fix-hidden-lifetime-bug", "futures 0.3.15", "log", "oauth2", - "reqwest", + "pin-project", "serde", "serde_json", - "thiserror", + "time 0.3.20", "url", + "uuid 1.2.1", ] [[package]] name = "azure_storage" -version = "0.1.0" -source = "git+https://github.com/Azure/azure-sdk-for-rust#b3c53f4cec4a6b541e49388b51e696dc892f18a3" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" dependencies = [ "RustyXML", "async-trait", "azure_core", - "base64", "bytes", - "chrono", "futures 0.3.15", - "http", + "hmac 0.12.1", "log", - "md5", "once_cell", - "ring", "serde", - "serde-xml-rs", "serde_derive", "serde_json", - "thiserror", + "sha2 0.10.6", + "time 0.3.20", "url", - "uuid 0.8.2", + "uuid 1.2.1", +] + 
+[[package]] +name = "azure_storage_blobs" +version = "0.11.0" +source = "git+https://github.com/Azure/azure-sdk-for-rust#e21e2ec6bae784a717ac7b3cf1123d3a9596f074" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "bytes", + "futures 0.3.15", + "log", + "md5", + "serde", + "serde_derive", + "serde_json", + "time 0.3.20", + "url", + "uuid 1.2.1", ] [[package]] @@ -588,6 +607,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "base64" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" + [[package]] name = "batch-system" version = "0.1.0" @@ -700,6 +725,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "boolinator" version = "2.4.0" @@ -931,7 +965,7 @@ dependencies = [ "num-integer", "num-traits", "serde", - "time", + "time 0.1.42", ] [[package]] @@ -1113,6 +1147,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "cpufeatures" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "cpuid-bool" version = "0.1.2" @@ -1282,6 +1325,16 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "crypto-mac" version = "0.10.0" @@ 
-1400,6 +1453,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "digest" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common", + "subtle", +] + [[package]] name = "dirs-next" version = "2.0.0" @@ -1556,7 +1620,7 @@ dependencies = [ "tempfile", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "toml", "tracker", "txn_types", @@ -1711,16 +1775,6 @@ dependencies = [ "version_check 0.1.5", ] -[[package]] -name = "error-code" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" -dependencies = [ - "libc 0.2.139", - "str-buf", -] - [[package]] name = "error_code" version = "0.0.1" @@ -1861,6 +1915,15 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f35ce9c8fb9891c75ceadbc330752951a4e369b50af10775955aeb9af3eee34b" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "ffi-support" version = "0.4.2" @@ -1935,6 +1998,26 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "fix-hidden-lifetime-bug" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4ae9c2016a663983d4e40a9ff967d6dcac59819672f0b47f2b17574e99c33c8" +dependencies = [ + "fix-hidden-lifetime-bug-proc_macros", +] + +[[package]] +name = "fix-hidden-lifetime-bug-proc_macros" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4c81935e123ab0741c4c4f0d9b8377e5fb21d3de7e062fa4b1263b1fbcba1ea" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "fixedbitset" 
version = "0.4.2" @@ -2106,6 +2189,21 @@ version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + [[package]] name = "futures-macro" version = "0.3.15" @@ -2459,7 +2557,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1441c6b1e930e2817404b5046f1f989899143a12bf92de603b69f4e0aee1e15" dependencies = [ "crypto-mac", - "digest", + "digest 0.9.0", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest 0.10.6", ] [[package]] @@ -2501,6 +2608,26 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel", + "base64 0.13.0", + "futures-lite", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.8.0" @@ -2619,6 +2746,12 @@ dependencies = [ "hashbrown 0.9.1", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inferno" 
version = "0.11.3" @@ -2632,7 +2765,7 @@ dependencies = [ "lazy_static", "log", "num-format", - "quick-xml", + "quick-xml 0.22.0", "rgb", "str_stack", ] @@ -3064,8 +3197,8 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b5a279bb9607f9f53c22d496eade00d138d1bdcccd07d74650387cf94942a15" dependencies = [ - "block-buffer", - "digest", + "block-buffer 0.9.0", + "digest 0.9.0", "opaque-debug", ] @@ -3549,22 +3682,30 @@ dependencies = [ "libc 0.2.139", ] +[[package]] +name = "num_threads" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "oauth2" version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e47cfc4c0a1a519d9a025ebfbac3a2439d1b5cdf397d72dcb79b11d9920dab" dependencies = [ - "base64", + "base64 0.13.0", "chrono", "getrandom 0.2.3", "http", "rand 0.8.5", - "reqwest", "serde", "serde_json", "serde_path_to_error", - "sha2", + "sha2 0.9.1", "thiserror", "url", ] @@ -3617,9 +3758,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "openssl" -version = "0.10.41" +version = "0.10.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "618febf65336490dfcf20b73f885f5651a0c89c64c2d4a8c3662585a70bf5bd0" +checksum = "7e30d8bc91859781f0a943411186324d580f2bbeb71b452fe91ae344806af3f1" dependencies = [ "bitflags", "cfg-if 1.0.0", @@ -3658,11 +3799,10 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.75" +version = "0.9.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" +checksum = "0d3d193fb1488ad46ffe3aaabc912cc931d02ee8518fe2959aea8ef52718b0c0" dependencies = [ - "autocfg", "cc", "libc 0.2.139", "openssl-src", @@ -3702,6 +3842,12 
@@ dependencies = [ name = "panic_hook" version = "0.0.1" +[[package]] +name = "parking" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f2252c834a40ed9bb5422029649578e63aa341ac401f74e719dd1afda8394e" + [[package]] name = "parking_lot" version = "0.11.1" @@ -4247,6 +4393,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quote" version = "1.0.18" @@ -4346,7 +4502,7 @@ dependencies = [ "slog-global", "tempfile", "tikv_util", - "time", + "time 0.1.42", "tracker", ] @@ -4413,7 +4569,7 @@ dependencies = [ "tidb_query_datatype", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "tokio", "tracker", "txn_types", @@ -4462,7 +4618,7 @@ dependencies = [ "test_util", "thiserror", "tikv_util", - "time", + "time 0.1.42", "tracker", "txn_types", "yatp", @@ -4693,7 +4849,7 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0460542b551950620a3648c6aa23318ac6b3cd779114bd873209e6e8b5eb1c34" dependencies = [ - "base64", + "base64 0.13.0", "bytes", "encoding_rs 0.8.29 (registry+https://github.com/rust-lang/crates.io-index)", "futures-core", @@ -4711,7 +4867,6 @@ dependencies = [ "percent-encoding", "pin-project-lite", "serde", - "serde_json", "serde_urlencoded", "tokio", "tokio-native-tls", @@ -4885,7 +5040,7 @@ version = "0.46.0" source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", - "base64", + "base64 0.13.0", "bytes", "crc32fast", "futures 0.3.15", @@ -4966,13 +5121,13 @@ name = "rusoto_signature" version = "0.46.0" source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies 
= [ - "base64", + "base64 0.13.0", "bytes", "chrono", - "digest", + "digest 0.9.0", "futures 0.3.15", "hex 0.4.2", - "hmac", + "hmac 0.10.1", "http", "hyper", "log", @@ -4982,7 +5137,7 @@ dependencies = [ "rusoto_credential", "rustc_version 0.3.3", "serde", - "sha2", + "sha2 0.9.1", "tokio", ] @@ -5203,25 +5358,13 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.106" +version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36df6ac6412072f67cf767ebbde4133a5b2e88e76dc6187fa7104cd16f783399" +checksum = "d193d69bae983fc11a79df82342761dfbf28a99fc8d203dca4c3c1b590948965" dependencies = [ "serde_derive", ] -[[package]] -name = "serde-xml-rs" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0bf1ba0696ccf0872866277143ff1fd14d22eec235d2b23702f95e6660f7dfa" -dependencies = [ - "log", - "serde", - "thiserror", - "xml-rs", -] - [[package]] name = "serde_cbor" version = "0.11.1" @@ -5234,9 +5377,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.106" +version = "1.0.147" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e549e3abf4fb8621bd1609f11dfc9f5e50320802273b12f3811a67e6716ea6c" +checksum = "4f1d362ca8fc9c3e3a7484440752472d68a6caa98f1ab81d99b5dfe517cec852" dependencies = [ "proc-macro2", "quote", @@ -5273,6 +5416,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror", +] + [[package]] name = "serde_repr" version = "0.1.9" @@ -5381,13 +5535,24 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1" dependencies = [ - "block-buffer", + "block-buffer 0.9.0", "cfg-if 0.1.10", 
"cpuid-bool", - "digest", + "digest 0.9.0", "opaque-debug", ] +[[package]] +name = "sha2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +dependencies = [ + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.10.6", +] + [[package]] name = "shlex" version = "0.1.1" @@ -5638,12 +5803,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" -[[package]] -name = "str-buf" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44a3643b4ff9caf57abcee9c2c621d6c03d9135e0d8b589bd9afb5992cb176a" - [[package]] name = "str_stack" version = "0.1.0" @@ -5737,9 +5896,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.3.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343f3f510c2915908f155e94f17220b19ccfacf2a64a2a5d8004f2c3e311e7fd" +checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" [[package]] name = "symbolic-common" @@ -5807,7 +5966,7 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d20ec2d6525a66afebdff9e1d8ef143c9deae9a3b040c61d3cfa9ae6fda80060" dependencies = [ - "base64", + "base64 0.13.0", "bytes", "chrono", "futures-util", @@ -5827,7 +5986,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9435c9348e480fad0f2215d5602e2dfad03df8a6398c4e7ceaeaa42758f26a8a" dependencies = [ - "base64", + "base64 0.13.0", "chrono", "http", "lock_api", @@ -6135,7 +6294,7 @@ dependencies = [ "slog-global", "tempfile", "tikv_util", - "time", + "time 0.1.42", ] [[package]] @@ -6213,7 +6372,7 @@ dependencies = [ "tikv", "tikv_kv", "tikv_util", - "time", + "time 0.1.42", "tipb", "tipb_helper", "tokio", @@ -6311,7 +6470,7 @@ dependencies = [ 
"serde_json", "thiserror", "tikv_util", - "time", + "time 0.1.42", "yatp", ] @@ -6320,7 +6479,7 @@ name = "tidb_query_datatype" version = "0.0.1" dependencies = [ "api_version", - "base64", + "base64 0.13.0", "bitfield", "bitflags", "boolinator", @@ -6390,7 +6549,7 @@ dependencies = [ name = "tidb_query_expr" version = "0.0.1" dependencies = [ - "base64", + "base64 0.13.0", "bstr", "byteorder", "chrono", @@ -6416,7 +6575,7 @@ dependencies = [ "tidb_query_common", "tidb_query_datatype", "tikv_util", - "time", + "time 0.1.42", "tipb", "tipb_helper", "twoway", @@ -6537,7 +6696,7 @@ dependencies = [ "tikv_alloc", "tikv_kv", "tikv_util", - "time", + "time 0.1.42", "tipb", "tokio", "tokio-openssl", @@ -6598,7 +6757,7 @@ dependencies = [ "tikv", "tikv_alloc", "tikv_util", - "time", + "time 0.1.42", "tokio", "toml", "txn_types", @@ -6645,7 +6804,7 @@ dependencies = [ "serde_json", "server", "tikv", - "time", + "time 0.1.42", "toml", ] @@ -6756,7 +6915,7 @@ dependencies = [ "tempfile", "thiserror", "tikv_alloc", - "time", + "time 0.1.42", "tokio", "tokio-executor", "tokio-timer", @@ -6778,6 +6937,35 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "time" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +dependencies = [ + "itoa 1.0.1", + "libc 0.2.139", + "num_threads", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" + +[[package]] +name = "time-macros" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +dependencies = [ + "time-core", +] + [[package]] name = "tinytemplate" version = "1.2.0" @@ -6936,7 +7124,7 @@ dependencies = [ "async-stream 
0.3.3", "async-trait", "axum", - "base64", + "base64 0.13.0", "bytes", "futures-core", "futures-util", @@ -7124,9 +7312,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.12.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "ucd-trie" @@ -7221,6 +7409,9 @@ name = "uuid" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "feb41e78f93363bb2df8b0e86a2ca30eed7806ea16ea0c790d757cf93f79be83" +dependencies = [ + "getrandom 0.2.3", +] [[package]] name = "valgrind_request" @@ -7262,6 +7453,12 @@ dependencies = [ "syn", ] +[[package]] +name = "waker-fn" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca" + [[package]] name = "walkdir" version = "2.3.1" diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 0a45ccc2c63..b9ba7732e9e 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -6,20 +6,25 @@ publish = false [dependencies] async-trait = "0.1" -azure_core = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust" } -azure_identity = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust" } -azure_storage = { version = "0.1.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false, features = ["account", "blob"] } +azure_core = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust" } +azure_identity = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust" } +azure_storage = { version = "0.11.0", git = "https://github.com/Azure/azure-sdk-for-rust", default-features = false } +azure_storage_blobs = { version = "0.11.0", git = 
"https://github.com/Azure/azure-sdk-for-rust" } base64 = "0.13" -chrono = "0.4" cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } kvproto = { workspace = true } lazy_static = "1.4.0" oauth2 = { version = "4.0.0", default-features = false } +openssl = { version = "0.10.50" } regex = "1" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" slog = { workspace = true } slog-global = { workspace = true } tikv_util = { workspace = true } +time = { version = "0.3", features = ["local-offset"] } tokio = { version = "1.5", features = ["time"] } url = "2.0" +uuid = { version = "1.0", features = ["v4"] } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 47d2d731da8..7f7483a3e8a 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -8,17 +8,15 @@ use std::{ use async_trait::async_trait; use azure_core::{ auth::{TokenCredential, TokenResponse}, - prelude::*, + new_http_client, }; -use azure_identity::token_credentials::{ClientSecretCredential, TokenCredentialOptions}; -use azure_storage::{ - blob::prelude::*, - core::{prelude::*, ConnectionStringBuilder}, -}; -use chrono::{Duration as ChronoDuration, Utc}; +use azure_identity::{ClientSecretCredential, TokenCredentialOptions}; +use azure_storage::{prelude::*, ConnectionString, ConnectionStringBuilder}; +use azure_storage_blobs::prelude::*; use cloud::blob::{ none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, }; +use futures::TryFutureExt; use futures_util::{ io::{AsyncRead, AsyncReadExt}, stream, @@ -33,6 +31,7 @@ use tikv_util::{ debug, stream::{retry, RetryError}, }; +use time::OffsetDateTime; use tokio::{ sync::Mutex, time::{timeout, Duration}, @@ -310,10 +309,9 @@ impl AzureUploader { .get_client() .await .map_err(|e| e.to_string())? 
- .as_blob_client(&self.name) + .blob_client(&self.name) .put_block_blob(data.to_vec()) .access_tier(self.storage_class) - .execute() .await?; Ok(()) }) @@ -414,13 +412,13 @@ impl ContainerBuilder for TokenCredContainerBuilder { { let token_response = self.token_cache.read().unwrap(); if let Some(ref t) = *token_response { - let interval = t.0.expires_on - Utc::now(); + let interval = (t.0.expires_on - OffsetDateTime::now_utc()).whole_minutes(); // keep token updated 5 minutes before it expires - if interval > ChronoDuration::minutes(TOKEN_UPDATE_LEFT_TIME_MINS) { + if interval > TOKEN_UPDATE_LEFT_TIME_MINS { return Ok(t.1.clone()); } - if interval > ChronoDuration::minutes(TOKEN_EXPIRE_LEFT_TIME_MINS) { + if interval > TOKEN_EXPIRE_LEFT_TIME_MINS { // there still have time to use the token, // and only need one thread to update token. if let Ok(l) = self.modify_place.try_lock() { @@ -443,9 +441,9 @@ impl ContainerBuilder for TokenCredContainerBuilder { { let token_response = self.token_cache.read().unwrap(); if let Some(ref t) = *token_response { - let interval = t.0.expires_on - Utc::now(); + let interval = (t.0.expires_on - OffsetDateTime::now_utc()).whole_minutes(); // token is already updated - if interval > ChronoDuration::minutes(TOKEN_UPDATE_LEFT_TIME_MINS) { + if interval > TOKEN_UPDATE_LEFT_TIME_MINS { return Ok(t.1.clone()); } } @@ -457,14 +455,12 @@ impl ContainerBuilder for TokenCredContainerBuilder { .get_token(&self.token_resource) .await .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))?; - let http_client = new_http_client(); - let storage_client = StorageAccountClient::new_bearer_token( - http_client, + let blob_service = BlobServiceClient::new( self.account_name.clone(), - token.token.secret(), - ) - .as_storage_client() - .as_container_client(self.container_name.clone()); + StorageCredentials::BearerToken(token.token.secret().into()), + ); + let storage_client = + 
Arc::new(blob_service.container_client(self.container_name.clone())); { let mut token_response = self.token_cache.write().unwrap(); @@ -493,22 +489,54 @@ impl AzureStorage { Self::new(Config::from_input(input)?) } + /// Mock a dummpy AzureStorage with a shared key Config for + /// testing by Azurite tool. + /// + /// This function should only be used for testing Blob with a + /// local Azurite server. + #[cfg(test)] + #[allow(dead_code)] + fn from_dummy_input(input: InputConfig) -> io::Result { + let config = Config::from_input(input)?; + let bucket = (*config.bucket.bucket).to_owned(); + Ok(AzureStorage { + config, + client_builder: Arc::new(SharedKeyContainerBuilder { + container_client: Arc::new( + ClientBuilder::emulator() + .blob_service_client() + .container_client(bucket), + ), + }), + }) + } + pub fn from_cloud_dynamic(cloud_dynamic: &CloudDynamic) -> io::Result { Self::new(Config::from_cloud_dynamic(cloud_dynamic)?) } pub fn new(config: Config) -> io::Result { + let bucket = (*config.bucket.bucket).to_owned(); // priority: explicit shared key > env Azure AD > env shared key if let Some(connection_string) = config.parse_plaintext_account_url() { - let bucket = (*config.bucket.bucket).to_owned(); - let http_client = new_http_client(); - let container_client = StorageAccountClient::new_connection_string( - http_client.clone(), - connection_string.as_str(), - ) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))? - .as_storage_client() - .as_container_client(bucket); + let account_name = config.get_account_name()?; + let storage_credentials = ConnectionString::new(&connection_string) + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid configurations for SharedKey, err: {}", e), + ) + })? 
+ .storage_credentials() + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid credentials for blob, err: {}", e), + ) + })?; + let container_client = Arc::new( + BlobServiceClient::new(account_name, storage_credentials).container_client(bucket), + ); let client_builder = Arc::new(SharedKeyContainerBuilder { container_client }); Ok(AzureStorage { @@ -516,10 +544,10 @@ impl AzureStorage { client_builder, }) } else if let Some(credential_info) = config.credential_info.as_ref() { - let bucket = (*config.bucket.bucket).to_owned(); let account_name = config.get_account_name()?; let token_resource = format!("https://{}.blob.core.windows.net", &account_name); let cred = ClientSecretCredential::new( + new_http_client(), credential_info.tenant_id.clone(), credential_info.client_id.to_string(), credential_info.client_secret.secret().clone(), @@ -538,15 +566,24 @@ impl AzureStorage { client_builder, }) } else if let Some(connection_string) = config.parse_env_plaintext_account_url() { - let bucket = (*config.bucket.bucket).to_owned(); - let http_client = new_http_client(); - let container_client = StorageAccountClient::new_connection_string( - http_client.clone(), - connection_string.as_str(), - ) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", &e)))? - .as_storage_client() - .as_container_client(bucket); + let account_name = config.get_account_name()?; + let storage_credentials = ConnectionString::new(&connection_string) + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invald configurations for SharedKey from ENV, err: {}", e), + ) + })? 
+ .storage_credentials() + .map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid credentials for blob, err: {}", e), + ) + })?; + let container_client = Arc::new( + BlobServiceClient::new(account_name, storage_credentials).container_client(bucket), + ); let client_builder = Arc::new(SharedKeyContainerBuilder { container_client }); Ok(AzureStorage { @@ -576,7 +613,7 @@ impl AzureStorage { let name = self.maybe_prefix_key(name); debug!("read file from Azure storage"; "key" => %name); let t = async move { - let blob_client = self.client_builder.get_client().await?.as_blob_client(name); + let blob_client = self.client_builder.get_client().await?.blob_client(name); let builder = if let Some(r) = range { blob_client.get().range(r) @@ -584,15 +621,20 @@ impl AzureStorage { blob_client.get() }; - builder - .execute() - .await - .map(|res| res.data) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))) + let mut chunk: Vec = vec![]; + let mut stream = builder.into_stream(); + while let Some(value) = stream.next().await { + let value = value?.data.collect().await?; + chunk.extend(&value); + } + azure_core::Result::Ok(chunk) }; - let k = stream::once(t); - let t = k.boxed().into_async_read(); - Box::new(t) + let stream = stream::once( + t.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, format!("{}", e))), + ) + .boxed() + .into_async_read(); + Box::new(stream) } } @@ -716,7 +758,7 @@ mod tests { input.set_endpoint("http://127.0.0.1:10000/devstoreaccount1".to_owned()); input.set_prefix("backup 01/prefix/".to_owned()); - let storage = AzureStorage::from_input(input).unwrap(); + let storage = AzureStorage::from_dummy_input(input).unwrap(); assert_eq!(storage.maybe_prefix_key("t"), "backup 01/prefix/t"); let mut magic_contents = String::new(); for _ in 0..4096 { From b96fe4de8c028e9731eb7ee9c1a158cccd3ee8ea Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 26 Apr 2023 13:37:51 +0800 Subject: [PATCH 667/676] update 
pprof (#14635) close tikv/tikv#14224 Fix fd leak caused by continuous profiling Signed-off-by: tabokie --- Cargo.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 48360c51100..269b749145f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4129,9 +4129,9 @@ dependencies = [ [[package]] name = "pprof" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e20150f965e0e4c925982b9356da71c84bcd56cb66ef4e894825837cbcf6613e" +checksum = "196ded5d4be535690899a4631cc9f18cdc41b7ebf24a79400f46f48e49a11059" dependencies = [ "backtrace", "cfg-if 1.0.0", @@ -4139,7 +4139,7 @@ dependencies = [ "inferno", "libc 0.2.139", "log", - "nix 0.24.1", + "nix 0.26.2", "once_cell", "parking_lot 0.12.1", "protobuf", From 8656623b8b9c9a590b9f61aedcce23ee38ed5023 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Wed, 26 Apr 2023 14:21:52 +0800 Subject: [PATCH 668/676] txn: Check whether the primary matches when handling check_txn_status requests (#14637) close tikv/tikv#14636, ref pingcap/tidb#42937 Makes TiKV support checking whether the lock is primary when handling check_txn_status. 
Signed-off-by: MyonKeminta Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- Cargo.lock | 2 +- components/error_code/src/storage.rs | 2 + etc/error_code.toml | 5 ++ src/storage/errors.rs | 7 ++ src/storage/mod.rs | 10 ++- src/storage/mvcc/mod.rs | 5 ++ src/storage/txn/actions/check_txn_status.rs | 8 ++ src/storage/txn/commands/check_txn_status.rs | 88 ++++++++++++++++---- src/storage/txn/commands/mod.rs | 1 + 9 files changed, 110 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 269b749145f..7e5ea1bc862 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2930,7 +2930,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#10e7620a630db63d769503ba99c7389f19fb6516" +source = "git+https://github.com/pingcap/kvproto.git#14ac513b9eff75028da1a56f54d36bfb082ac54f" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index e2cf34094c3..8b41e7a797e 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -43,5 +43,7 @@ define_error_codes!( ASSERTION_FAILED => ("AssertionFailed", "", ""), LOCK_IF_EXISTS_FAILED => ("LockIfExistsFailed", "", ""), + PRIMARY_MISMATCH => ("PrimaryMismatch", "", ""), + UNKNOWN => ("Unknown", "", "") ); diff --git a/etc/error_code.toml b/etc/error_code.toml index 4fae4d9ea57..839c4f33f32 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -753,6 +753,11 @@ error = ''' KV:Storage:LockIfExistsFailed ''' +["KV:Storage:PrimaryMismatch"] +error = ''' +KV:Storage:PrimaryMismatch +''' + ["KV:Storage:Unknown"] error = ''' KV:Storage:Unknown diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 92568d22e45..07ea4b5589e 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -424,6 +424,13 @@ pub fn extract_key_error(err: &Error) -> kvrpcpb::KeyError { 
assertion_failed.set_existing_commit_ts(existing_commit_ts.into_inner()); key_error.set_assertion_failed(assertion_failed); } + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + box MvccErrorInner::PrimaryMismatch(lock_info), + ))))) => { + let mut primary_mismatch = kvrpcpb::PrimaryMismatch::default(); + primary_mismatch.set_lock_info(lock_info.clone()); + key_error.set_primary_mismatch(primary_mismatch); + } _ => { error!(?*err; "txn aborts"); key_error.set_abort(format!("{:?}", err)); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 37263ce9a12..897968ef671 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -7910,6 +7910,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_fail_callback(tx.clone(), 0, |e| match e { @@ -7936,6 +7937,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, LockNotExist), @@ -7993,6 +7995,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback( @@ -8038,6 +8041,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, committed(ts(20, 0))), @@ -8049,7 +8053,7 @@ mod tests { .sched_txn_command( commands::Prewrite::with_lock_ttl( vec![Mutation::make_put(k.clone(), v)], - k.as_encoded().to_vec(), + k.to_raw().unwrap(), ts(25, 0), 100, ), @@ -8069,6 +8073,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, TtlExpire), @@ -9411,6 +9416,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_value_callback( @@ -9447,6 +9453,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, TxnStatus::TtlExpire), @@ -9840,6 +9847,7 @@ mod tests { true, false, false, + true, Default::default(), ), expect_ok_callback(tx.clone(), 0), diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 0f133b99941..2f9a75b2a03 100644 --- 
a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -169,6 +169,9 @@ pub enum ErrorInner { )] LockIfExistsFailed { start_ts: TimeStamp, key: Vec }, + #[error("check_txn_status sent to secondary lock, current lock: {0:?}")] + PrimaryMismatch(kvproto::kvrpcpb::LockInfo), + #[error("{0:?}")] Other(#[from] Box), } @@ -298,6 +301,7 @@ impl ErrorInner { key: key.clone(), }) } + ErrorInner::PrimaryMismatch(l) => Some(ErrorInner::PrimaryMismatch(l.clone())), ErrorInner::Io(_) | ErrorInner::Other(_) => None, } } @@ -400,6 +404,7 @@ impl ErrorCodeExt for Error { ErrorInner::CommitTsTooLarge { .. } => error_code::storage::COMMIT_TS_TOO_LARGE, ErrorInner::AssertionFailed { .. } => error_code::storage::ASSERTION_FAILED, ErrorInner::LockIfExistsFailed { .. } => error_code::storage::LOCK_IF_EXISTS_FAILED, + ErrorInner::PrimaryMismatch(_) => error_code::storage::PRIMARY_MISMATCH, ErrorInner::Other(_) => error_code::storage::UNKNOWN, } } diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index a3cd3253201..b0e1ff66232 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -24,7 +24,15 @@ pub fn check_txn_status_lock_exists( caller_start_ts: TimeStamp, force_sync_commit: bool, resolving_pessimistic_lock: bool, + verify_is_primary: bool, ) -> Result<(TxnStatus, Option)> { + if verify_is_primary && !primary_key.is_encoded_from(&lock.primary) { + // Return the current lock info to tell the client what the actual primary is. + return Err( + ErrorInner::PrimaryMismatch(lock.into_lock_info(primary_key.into_raw()?)).into(), + ); + } + // Never rollback or push forward min_commit_ts in check_txn_status if it's // using async commit. Rollback of async-commit locks are done during // ResolveLock. 
diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 895c753b160..e915c0357d4 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -51,6 +51,11 @@ command! { // lock, the transaction status could not be decided if the primary lock is pessimistic too and // it's still uncertain. resolving_pessimistic_lock: bool, + // Whether it's needed to check wheter the lock on the key (if any) is the primary lock. + // This is for handling some corner cases when pessimistic transactions changes its primary + // (see https://github.com/pingcap/tidb/issues/42937 for details). + // Must be set to true, unless the client is old version that doesn't support this behavior. + verify_is_primary: bool, } } @@ -107,6 +112,7 @@ impl WriteCommand for CheckTxnStatus { self.caller_start_ts, self.force_sync_commit, self.resolving_pessimistic_lock, + self.verify_is_primary, )?, l => ( check_txn_status_missing_lock( @@ -145,7 +151,7 @@ impl WriteCommand for CheckTxnStatus { #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; + use kvproto::kvrpcpb::{self, Context, LockInfo, PrewriteRequestPessimisticAction::*}; use tikv_util::deadline::Deadline; use txn_types::{Key, WriteType}; @@ -153,8 +159,10 @@ pub mod tests { use crate::storage::{ kv::Engine, lock_manager::MockLockManager, + mvcc, mvcc::tests::*, txn::{ + self, commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, @@ -188,6 +196,7 @@ pub mod tests { rollback_if_not_exist, force_sync_commit, resolving_pessimistic_lock, + verify_is_primary: true, deadline: Deadline::from_now(DEFAULT_EXECUTION_DURATION_LIMIT), }; let result = command @@ -220,7 +229,7 @@ pub mod tests { rollback_if_not_exist: bool, force_sync_commit: bool, resolving_pessimistic_lock: bool, - ) { + ) -> 
txn::Error { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let current_ts = current_ts.into(); @@ -235,23 +244,28 @@ pub mod tests { rollback_if_not_exist, force_sync_commit, resolving_pessimistic_lock, + verify_is_primary: true, deadline: Deadline::from_now(DEFAULT_EXECUTION_DURATION_LIMIT), }; - assert!( - command - .process_write( - snapshot, - WriteContext { - lock_mgr: &MockLockManager::new(), - concurrency_manager: cm, - extra_op: Default::default(), - statistics: &mut Default::default(), - async_apply_prewrite: false, - raw_ext: None, - }, + command + .process_write( + snapshot, + WriteContext { + lock_mgr: &MockLockManager::new(), + concurrency_manager: cm, + extra_op: Default::default(), + statistics: &mut Default::default(), + async_apply_prewrite: false, + raw_ext: None, + }, + ) + .map(|r| { + panic!( + "expected check_txn_status fail but succeeded with result: {:?}", + r.pr ) - .is_err() - ); + }) + .unwrap_err() } fn committed(commit_ts: impl Into) -> impl FnOnce(TxnStatus) -> bool { @@ -1188,4 +1202,46 @@ pub mod tests { assert!(rollback.last_change_ts.is_zero()); assert_eq!(rollback.versions_to_last_change, 0); } + + #[test] + fn test_verify_is_primary() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let check_lock = |l: LockInfo, key: &'_ [u8], primary: &'_ [u8], lock_type| { + assert_eq!(&l.key, key); + assert_eq!(l.lock_type, lock_type); + assert_eq!(&l.primary_lock, primary); + }; + + let check_error = |e, key: &'_ [u8], primary: &'_ [u8], lock_type| match e { + txn::Error(box txn::ErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::PrimaryMismatch(lock_info), + ))) => { + check_lock(lock_info, key, primary, lock_type); + } + e => panic!("unexpected error: {:?}", e), + }; + + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k2", 1, 1); + let e = must_err(&mut engine, b"k1", 1, 1, 0, true, false, true); + check_error(e, b"k1", b"k2", kvrpcpb::Op::PessimisticLock); + let lock = 
must_pessimistic_locked(&mut engine, b"k1", 1, 1); + check_lock( + lock.into_lock_info(b"k1".to_vec()), + b"k1", + b"k2", + kvrpcpb::Op::PessimisticLock, + ); + + must_pessimistic_prewrite_put(&mut engine, b"k1", b"v1", b"k2", 1, 1, DoPessimisticCheck); + let e = must_err(&mut engine, b"k1", 1, 1, 0, true, false, true); + check_error(e, b"k1", b"k2", kvrpcpb::Op::Put); + let lock = must_locked(&mut engine, b"k1", 1); + check_lock( + lock.into_lock_info(b"k1".to_vec()), + b"k1", + b"k2", + kvrpcpb::Op::Put, + ); + } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 4c01629ef48..5e484d385f2 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -306,6 +306,7 @@ impl From for TypedCommand { req.get_rollback_if_not_exist(), req.get_force_sync_commit(), req.get_resolving_pessimistic_lock(), + req.get_verify_is_primary(), req.take_context(), ) } From 909ffe5513e8bb15ea3fe6fe3687f295368858d0 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Wed, 26 Apr 2023 16:22:41 -0700 Subject: [PATCH 669/676] address lint Signed-off-by: tonyxuqqi --- src/config/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/mod.rs b/src/config/mod.rs index a44cf20c066..61ca4a8b0f7 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -4328,7 +4328,7 @@ impl ConfigController { if self.get_current().storage.engine == EngineType::RaftKv2 { return "partitioned-raft-kv"; } - return "raft-kv"; + "raft-kv" } } From 0f3013ed10d72ecc059cfe099316dfa17a49c47c Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 27 Apr 2023 11:01:52 +0800 Subject: [PATCH 670/676] raftstore-v2: fix stale read by correct updating peers (#14665) close tikv/tikv#14664 Fix stale read by correct updating peers Signed-off-by: Neil Shen Co-authored-by: tonyxuqqi --- .../raftstore-v2/src/operation/ready/mod.rs | 2 + components/raftstore/src/store/util.rs | 22 ++- components/test_raftstore-v2/src/server.rs | 35 ++++- 
tests/integrations/raftstore/mod.rs | 1 + .../integrations/raftstore/test_lease_read.rs | 46 +----- .../integrations/raftstore/test_stale_read.rs | 133 ++++++++++++++++++ 6 files changed, 188 insertions(+), 51 deletions(-) create mode 100644 tests/integrations/raftstore/test_stale_read.rs diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 5f294d7e5b6..58c7e904037 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -907,6 +907,8 @@ impl Peer { } _ => {} } + self.read_progress() + .update_leader_info(ss.leader_id, term, self.region()); let target = self.refresh_leader_transferee(); ctx.coprocessor_host.on_role_change( self.region(), diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 82a04ec6f4b..c3a553c89c1 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -1347,6 +1347,10 @@ impl RegionReadProgress { core.leader_info.leader_term = term; if !is_region_epoch_equal(region.get_region_epoch(), &core.leader_info.epoch) { core.leader_info.epoch = region.get_region_epoch().clone(); + } + if core.leader_info.peers != region.get_peers() { + // In v2, we check peers and region epoch independently, because + // peers are incomplete but epoch is set correctly during split. 
core.leader_info.peers = region.get_peers().to_vec(); } core.leader_info.leader_store_id = @@ -2275,7 +2279,8 @@ mod tests { } let cap = 10; - let rrp = RegionReadProgress::new(&Default::default(), 10, cap, 1); + let mut region = Region::default(); + let rrp = RegionReadProgress::new(®ion, 10, cap, 1); for i in 1..=20 { rrp.update_safe_ts(i, i); } @@ -2322,5 +2327,20 @@ mod tests { rrp.update_safe_ts(400, 0); rrp.update_safe_ts(0, 700); assert_eq!(pending_items_num(&rrp), 0); + + // update leader info, epoch + region.mut_region_epoch().version += 1; + rrp.update_leader_info(1, 5, ®ion); + assert_eq!( + rrp.core.lock().unwrap().get_local_leader_info().epoch, + *region.get_region_epoch(), + ); + // update leader info, peers + region.mut_peers().push(new_peer(1, 2)); + rrp.update_leader_info(1, 5, ®ion); + assert_eq!( + rrp.core.lock().unwrap().get_local_leader_info().peers, + *region.get_peers(), + ); } } diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 85941088e2e..35671c227f4 100644 --- a/components/test_raftstore-v2/src/server.rs +++ b/components/test_raftstore-v2/src/server.rs @@ -246,6 +246,7 @@ pub struct ServerMeta { sim_trans: SimulateServerTransport, raw_router: StoreRouter, gc_worker: GcWorker>, + rts_worker: Option>, rsmeter_cleanup: Box, } @@ -417,7 +418,30 @@ impl ServerCluster { ); gc_worker.start(node_id).unwrap(); - // todo: resolved ts + let rts_worker = if cfg.resolved_ts.enable { + // Resolved ts worker + let mut rts_worker = LazyWorker::new("resolved-ts"); + let rts_ob = resolved_ts::Observer::new(rts_worker.scheduler()); + rts_ob.register_to(&mut coprocessor_host); + // resolved ts endpoint needs store id. 
+ store_meta.lock().unwrap().store_id = node_id; + // Resolved ts endpoint + let rts_endpoint = resolved_ts::Endpoint::new( + &cfg.resolved_ts, + rts_worker.scheduler(), + raft_router.clone(), + store_meta.clone(), + self.pd_client.clone(), + concurrency_manager.clone(), + self.env.clone(), + self.security_mgr.clone(), + ); + // Start the worker + rts_worker.start(rts_endpoint); + Some(rts_worker) + } else { + None + }; if ApiVersion::V2 == F::TAG { let casual_ts_provider: Arc = Arc::new( @@ -644,6 +668,7 @@ impl ServerCluster { sim_router, gc_worker, sim_trans: simulate_trans, + rts_worker, rsmeter_cleanup, }, ); @@ -759,10 +784,10 @@ impl Simulator for ServerCluster { if let Some(mut meta) = self.metas.remove(&node_id) { meta.server.stop().unwrap(); meta.node.stop(); - // // resolved ts worker started, let's stop it - // if let Some(worker) = meta.rts_worker { - // worker.stop_worker(); - // } + // resolved ts worker started, let's stop it + if let Some(worker) = meta.rts_worker { + worker.stop_worker(); + } (meta.rsmeter_cleanup)(); } self.storages.remove(&node_id); diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index 5f6703afe05..ce19c56e067 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -26,6 +26,7 @@ mod test_snap; mod test_snap_recovery; mod test_split_region; mod test_stale_peer; +mod test_stale_read; mod test_stats; mod test_status_command; mod test_tombstone; diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 6d8319ebae6..8ac364faae9 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -10,7 +10,7 @@ use std::{ }; use engine_rocks::RocksSnapshot; -use kvproto::{kvrpcpb::Op, metapb}; +use kvproto::metapb; use more_asserts::assert_le; use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; @@ -828,47 +828,3 @@ fn 
test_node_local_read_renew_lease() { thread::sleep(request_wait); } } - -#[test] -fn test_stale_read_with_ts0() { - let mut cluster = new_server_cluster(0, 3); - let pd_client = Arc::clone(&cluster.pd_client); - pd_client.disable_default_operator(); - cluster.cfg.resolved_ts.enable = true; - cluster.run(); - - let leader = new_peer(1, 1); - cluster.must_transfer_leader(1, leader.clone()); - let mut leader_client = PeerClient::new(&cluster, 1, leader); - - let mut follower_client2 = PeerClient::new(&cluster, 1, new_peer(2, 2)); - - // Set the `stale_read` flag - leader_client.ctx.set_stale_read(true); - follower_client2.ctx.set_stale_read(true); - - let commit_ts1 = leader_client.must_kv_write( - &pd_client, - vec![new_mutation(Op::Put, &b"key1"[..], &b"value1"[..])], - b"key1".to_vec(), - ); - - let commit_ts2 = leader_client.must_kv_write( - &pd_client, - vec![new_mutation(Op::Put, &b"key1"[..], &b"value2"[..])], - b"key1".to_vec(), - ); - - follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); - follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), commit_ts2); - assert!( - follower_client2 - .kv_read(b"key1".to_vec(), 0) - .region_error - .into_option() - .unwrap() - .not_leader - .is_some() - ); - assert!(leader_client.kv_read(b"key1".to_vec(), 0).not_found); -} diff --git a/tests/integrations/raftstore/test_stale_read.rs b/tests/integrations/raftstore/test_stale_read.rs new file mode 100644 index 00000000000..9cbbc6ca8ba --- /dev/null +++ b/tests/integrations/raftstore/test_stale_read.rs @@ -0,0 +1,133 @@ +// Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{cell::RefCell, sync::Arc, time::Duration}; + +use grpcio::{ChannelBuilder, Environment}; +use kvproto::{ + kvrpcpb::{Context, Op}, + metapb::{Peer, Region}, + tikvpb_grpc::TikvClient, +}; +use test_raftstore::{new_mutation, new_peer, new_server_cluster, PeerClient}; +use test_raftstore_macro::test_case; +use tikv_util::{config::ReadableDuration, time::Instant}; + +use crate::tikv_util::HandyRwLock; + +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_stale_read_with_ts0() { + let mut cluster = new_cluster(0, 3); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + cluster.cfg.resolved_ts.enable = true; + cluster.cfg.resolved_ts.advance_ts_interval = ReadableDuration::millis(200); + cluster.run(); + + let region_id = 1; + let env = Arc::new(Environment::new(1)); + let new_client = |peer: Peer| { + let cli = TikvClient::new( + ChannelBuilder::new(env.clone()) + .connect(&cluster.sim.rl().get_addr(peer.get_store_id())), + ); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(peer); + ctx.set_region_epoch(epoch); + PeerClient { cli, ctx } + }; + let leader = new_peer(1, 1); + let mut leader_client = new_client(leader.clone()); + let follower = new_peer(2, 2); + let mut follower_client2 = new_client(follower); + + cluster.must_transfer_leader(1, leader); + + // Set the `stale_read` flag + leader_client.ctx.set_stale_read(true); + follower_client2.ctx.set_stale_read(true); + + let commit_ts1 = leader_client.must_kv_write( + &pd_client, + vec![new_mutation(Op::Put, &b"key1"[..], &b"value1"[..])], + b"key1".to_vec(), + ); + + let commit_ts2 = leader_client.must_kv_write( + &pd_client, + vec![new_mutation(Op::Put, &b"key1"[..], &b"value2"[..])], + b"key1".to_vec(), + ); + + follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value1".to_vec(), commit_ts1); + 
follower_client2.must_kv_read_equal(b"key1".to_vec(), b"value2".to_vec(), commit_ts2); + assert!( + follower_client2 + .kv_read(b"key1".to_vec(), 0) + .region_error + .into_option() + .unwrap() + .not_leader + .is_some() + ); + assert!(leader_client.kv_read(b"key1".to_vec(), 0).not_found); +} + +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] +fn test_stale_read_resolved_ts_advance() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.resolved_ts.enable = true; + cluster.cfg.resolved_ts.advance_ts_interval = ReadableDuration::millis(200); + + cluster.run(); + let cluster = RefCell::new(cluster); + + let must_resolved_ts_advance = |region: &Region| { + let cluster = cluster.borrow_mut(); + let ts = cluster.store_metas[®ion.get_peers()[0].get_store_id()] + .lock() + .unwrap() + .region_read_progress + .get_resolved_ts(®ion.get_id()) + .unwrap(); + let now = Instant::now(); + for peer in region.get_peers() { + loop { + let new_ts = cluster.store_metas[&peer.get_store_id()] + .lock() + .unwrap() + .region_read_progress + .get_resolved_ts(®ion.get_id()) + .unwrap(); + if new_ts <= ts { + if now.saturating_elapsed() > Duration::from_secs(5) { + panic!("timeout"); + } + continue; + } + break; + } + } + }; + + // Make sure resolved ts advances. + let region = cluster.borrow().get_region(&[]); + must_resolved_ts_advance(®ion); + + // Test transfer leader. + cluster + .borrow_mut() + .must_transfer_leader(region.get_id(), region.get_peers()[1].clone()); + must_resolved_ts_advance(®ion); + + // Test split. 
+ let split_key = b"k1"; + cluster.borrow_mut().must_split(®ion, split_key); + let left = cluster.borrow().get_region(&[]); + let right = cluster.borrow().get_region(split_key); + must_resolved_ts_advance(&left); + must_resolved_ts_advance(&right); +} From 6b18e8f72ef36246b9e8aba8c4cd2983eff0e460 Mon Sep 17 00:00:00 2001 From: you06 Date: Thu, 27 Apr 2023 13:31:52 +0800 Subject: [PATCH 671/676] metrics: add missing `check_leader` gRPC metrics (#14662) close tikv/tikv#14658 Record the missing check_leader gRPC metrics. Signed-off-by: you06 --- src/server/service/kv.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 9895067fcb3..a1feb0f7b60 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1004,6 +1004,7 @@ impl Tikv for Service { mut request: CheckLeaderRequest, sink: UnarySink, ) { + let begin_instant = Instant::now(); let addr = ctx.peer(); let ts = request.get_ts(); let leaders = request.take_regions().into(); @@ -1025,6 +1026,10 @@ impl Tikv for Service { } return Err(Error::from(e)); } + let elapsed = begin_instant.saturating_elapsed(); + GRPC_MSG_HISTOGRAM_STATIC + .check_leader + .observe(elapsed.as_secs_f64()); ServerResult::Ok(()) } .map_err(move |e| { From 38462f242e9026710514139fa5ccf42429bec929 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Wed, 26 Apr 2023 23:39:51 -0700 Subject: [PATCH 672/676] [raftstore-v2]: optimize the load based split config based on region size (#14625) ref tikv/tikv#12842 1) optimize the load based split config based on region size 2) polish a log message when it cannot find a target peer of the message. 
Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- .../raftstore-v2/src/operation/ready/mod.rs | 2 +- components/raftstore/src/store/mod.rs | 4 +- components/raftstore/src/store/worker/mod.rs | 6 ++- .../src/store/worker/split_config.rs | 23 ++++++++-- src/config/mod.rs | 42 +++++++++++++++---- 5 files changed, 63 insertions(+), 14 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 58c7e904037..62e8fda7ba0 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -379,7 +379,7 @@ impl Peer { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { - warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to); + warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to, "message_type" => ?msg.msg_type); return None; } }; diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index c007b622ee1..7a2c04e2450 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -84,6 +84,8 @@ pub use self::{ ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, - NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index ac23f4e58d5..62d27b2e88b 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -44,6 +44,10 
@@ pub use self::{ split_check::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, - split_config::{SplitConfig, SplitConfigManager}, + split_config::{ + SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, + DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }, split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 7857ae10d8e..8fec853bb00 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -6,13 +6,18 @@ use lazy_static::lazy_static; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; use parking_lot::Mutex; use serde::{Deserialize, Serialize}; -use tikv_util::{config::VersionTrack, info}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + info, +}; const DEFAULT_DETECT_TIMES: u64 = 10; const DEFAULT_SAMPLE_THRESHOLD: u64 = 100; pub(crate) const DEFAULT_SAMPLE_NUM: usize = 20; -const DEFAULT_QPS_THRESHOLD: usize = 3000; -const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; +pub const DEFAULT_QPS_THRESHOLD: usize = 3000; +pub const DEFAULT_BIG_REGION_QPS_THRESHOLD: usize = 7000; +pub const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; +pub const DEFAULT_BIG_REGION_BYTE_THRESHOLD: usize = 100 * 1024 * 1024; // We get balance score by // abs(sample.left-sample.right)/(sample.right+sample.left). It will be used to @@ -43,7 +48,8 @@ const DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.8; // `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read // Poll, it will be added into the hot region list and may be split later as the // top hot CPU region. 
-pub(crate) const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; +pub const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; +pub const BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.75; lazy_static! { static ref SPLIT_CONFIG: Mutex>>> = Mutex::new(None); @@ -134,6 +140,15 @@ impl SplitConfig { } Ok(()) } + + pub fn optimize_for(&mut self, region_size: ReadableSize) { + const LARGE_REGION_SIZE_IN_MB: u64 = 4096; + if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { + self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; + self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; + self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + } + } } #[derive(Clone)] diff --git a/src/config/mod.rs b/src/config/mod.rs index 5d20b027c4e..62a7de89130 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3399,6 +3399,8 @@ impl TikvConfig { self.coprocessor .optimize_for(self.storage.engine == EngineType::RaftKv2); self.coprocessor.validate()?; + self.split + .optimize_for(self.coprocessor.region_split_size()); self.raft_store.validate( self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket(), @@ -4337,9 +4339,16 @@ mod tests { use grpcio::ResourceQuota; use itertools::Itertools; use kvproto::kvrpcpb::CommandPri; - use raftstore::coprocessor::{ - config::{RAFTSTORE_V2_SPLIT_SIZE, SPLIT_SIZE}, - region_info_accessor::MockRegionInfoProvider, + use raftstore::{ + coprocessor::{ + config::{RAFTSTORE_V2_SPLIT_SIZE, SPLIT_SIZE}, + region_info_accessor::MockRegionInfoProvider, + }, + store::{ + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }, }; use slog::Level; use tempfile::Builder; @@ -5780,18 +5789,37 @@ mod tests { #[test] fn test_region_size_config() { let mut default_cfg = TikvConfig::default(); - default_cfg.coprocessor.optimize_for(false); - 
default_cfg.coprocessor.validate().unwrap(); + default_cfg.storage.engine = EngineType::RaftKv; + default_cfg.validate().unwrap(); assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); assert!(!default_cfg.coprocessor.enable_region_bucket()); + assert_eq!(default_cfg.split.qps_threshold, DEFAULT_QPS_THRESHOLD); + assert_eq!( + default_cfg.split.region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!(default_cfg.split.byte_threshold, DEFAULT_BYTE_THRESHOLD); + let mut default_cfg = TikvConfig::default(); - default_cfg.coprocessor.optimize_for(true); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.storage.engine = EngineType::RaftKv2; + default_cfg.validate().unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), RAFTSTORE_V2_SPLIT_SIZE ); + assert_eq!( + default_cfg.split.qps_threshold, + DEFAULT_BIG_REGION_QPS_THRESHOLD + ); + assert_eq!( + default_cfg.split.region_cpu_overload_threshold_ratio, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!( + default_cfg.split.byte_threshold, + DEFAULT_BIG_REGION_BYTE_THRESHOLD + ); assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); From 9e73fed6351ad8f4a2ab8aa7ad4ac76a344cb604 Mon Sep 17 00:00:00 2001 From: lidezhu <47731263+lidezhu@users.noreply.github.com> Date: Thu, 27 Apr 2023 15:35:51 +0800 Subject: [PATCH 673/676] raftstore: support decode simple write request in v1 (#14638) ref tikv/tikv#14575 Support decode simple write request in v1 Signed-off-by: lidezhu Co-authored-by: Xinye Tao --- components/raftstore/src/store/fsm/apply.rs | 455 +++++++++++++++++- components/raftstore/src/store/util.rs | 59 ++- tests/integrations/raftstore/mod.rs | 1 + tests/integrations/raftstore/test_snap.rs | 93 +--- .../raftstore/test_v1_v2_mixed.rs | 268 +++++++++++ 5 files changed, 770 insertions(+), 106 deletions(-) create mode 100644 tests/integrations/raftstore/test_v1_v2_mixed.rs diff --git 
a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 54ca2274162..d1ba6d4e774 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -463,6 +463,8 @@ where /// `ApplyRes` uncommitted. Data will finally be written to kvdb in /// `flush`. uncommitted_res_count: usize, + + enable_v2_compatible_learner: bool, } impl ApplyContext @@ -519,6 +521,7 @@ where key_buffer: Vec::with_capacity(1024), disable_wal: false, uncommitted_res_count: 0, + enable_v2_compatible_learner: cfg.enable_v2_compatible_learner, } } @@ -1210,7 +1213,18 @@ where if !data.is_empty() { if !self.peer.is_witness || !can_witness_skip(entry) { - let cmd = util::parse_data_at(data, index, &self.tag); + let cmd = match util::parse_raft_cmd_request(data, index, term, &self.tag) { + util::RaftCmd::V1(cmd) => cmd, + util::RaftCmd::V2(simple_write_decoder) => { + if !apply_ctx.enable_v2_compatible_learner { + panic!( + "{} can not handle v2 command when enable_v2_compatible_learner is false", + self.tag + ); + } + simple_write_decoder.to_raft_cmd_request() + } + }; if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { self.priority = Priority::Low; } @@ -5000,7 +5014,12 @@ mod tests { use super::*; use crate::{ coprocessor::*, - store::{msg::WriteResponse, peer_storage::RAFT_INIT_LOG_INDEX, Config, RegionTask}, + store::{ + msg::WriteResponse, + peer_storage::RAFT_INIT_LOG_INDEX, + simple_write::{SimpleWriteEncoder, SimpleWriteReqEncoder}, + Config, RegionTask, + }, }; impl GenSnapTask { @@ -5611,6 +5630,93 @@ mod tests { } } + struct EntryBuilderUsingSimpleWrite { + entry: Entry, + header: Box, + encoder: SimpleWriteEncoder, + } + + impl EntryBuilderUsingSimpleWrite { + fn new(index: u64, term: u64) -> EntryBuilderUsingSimpleWrite { + let encoder = SimpleWriteEncoder::with_capacity(64); + let header = Box::::default(); + let mut entry = Entry::default(); + entry.set_index(index); + 
entry.set_term(term); + EntryBuilderUsingSimpleWrite { + entry, + header, + encoder, + } + } + + fn epoch(mut self, conf_ver: u64, version: u64) -> EntryBuilderUsingSimpleWrite { + let mut epoch = RegionEpoch::default(); + epoch.set_version(version); + epoch.set_conf_ver(conf_ver); + self.header.set_region_epoch(epoch); + self + } + + fn put(mut self, key: &[u8], value: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.put(CF_DEFAULT, key, value); + self + } + + fn put_cf(mut self, cf: &str, key: &[u8], value: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.put(cf, key, value); + self + } + + fn delete(mut self, key: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete(CF_DEFAULT, key); + self + } + + fn delete_cf(mut self, cf: &str, key: &[u8]) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete(cf, key); + self + } + + fn delete_range( + mut self, + start_key: &[u8], + end_key: &[u8], + ) -> EntryBuilderUsingSimpleWrite { + self.encoder + .delete_range(CF_DEFAULT, start_key, end_key, false); + self + } + + fn delete_range_cf( + mut self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + ) -> EntryBuilderUsingSimpleWrite { + self.encoder.delete_range(cf, start_key, end_key, false); + self + } + + fn ingest_sst(mut self, meta: &SstMeta) -> EntryBuilderUsingSimpleWrite { + self.encoder.ingest(vec![meta.clone()]); + self + } + + fn build(mut self) -> Entry { + let bin = self.encoder.encode(); + let req_encoder = SimpleWriteReqEncoder::>::new( + self.header.clone(), + bin, + 1000, + false, + ); + let (bytes, _) = req_encoder.encode(); + self.entry.set_data(bytes.into()); + self.entry + } + } + #[derive(Clone, Default)] struct ApplyObserver { pre_query_count: Arc, @@ -6093,6 +6199,351 @@ mod tests { system.shutdown(); } + #[test] + fn test_handle_raft_committed_entries_from_v2() { + let (_path, engine) = create_tmp_engine("test-delegate"); + let (import_dir, importer) = create_tmp_importer("test-delegate"); + let obs = 
ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs.clone())); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let mut config = Config::default(); + config.enable_v2_compatible_learner = true; + let cfg = Arc::new(VersionTrack::new(config)); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); + let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg, + sender, + region_scheduler, + coprocessor_host: host, + importer: importer.clone(), + engine: engine.clone(), + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-handle-raft".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let (capture_tx, capture_rx) = mpsc::channel(); + let put_entry = EntryBuilderUsingSimpleWrite::new(1, 1) + .put(b"k1", b"v1") + .put(b"k2", b"v1") + .put(b"k3", b"v1") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 1, + vec![put_entry], + vec![cb(1, 1, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let dk_k1 = keys::data_key(b"k1"); + let dk_k2 = keys::data_key(b"k2"); + let dk_k3 = keys::data_key(b"k3"); + assert_eq!(engine.get_value(&dk_k1).unwrap().unwrap(), b"v1"); + assert_eq!(engine.get_value(&dk_k2).unwrap().unwrap(), b"v1"); + assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), 
b"v1"); + validate(&router, 1, |delegate| { + assert_eq!(delegate.applied_term, 1); + assert_eq!(delegate.apply_state.get_applied_index(), 1); + }); + fetch_apply_res(&rx); + + let put_entry = EntryBuilderUsingSimpleWrite::new(2, 2) + .put_cf(CF_LOCK, b"k1", b"v1") + .epoch(1, 3) + .build(); + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 2, vec![put_entry], vec![]))); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.region_id, 1); + assert_eq!(apply_res.apply_state.get_applied_index(), 2); + assert_eq!(apply_res.applied_term, 2); + assert!(apply_res.exec_res.is_empty()); + assert!(apply_res.metrics.written_bytes >= 5); + assert_eq!(apply_res.metrics.written_keys, 2); + assert_eq!(apply_res.metrics.size_diff_hint, 5); + assert_eq!(apply_res.metrics.lock_cf_written_bytes, 5); + assert_eq!( + engine.get_value_cf(CF_LOCK, &dk_k1).unwrap().unwrap(), + b"v1" + ); + + let put_entry = EntryBuilderUsingSimpleWrite::new(3, 2) + .put(b"k2", b"v2") + .epoch(1, 1) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![put_entry], + vec![cb(3, 2, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_epoch_not_match()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 2); + assert_eq!(apply_res.apply_state.get_applied_index(), 3); + + let put_entry = EntryBuilderUsingSimpleWrite::new(4, 2) + .put(b"k3", b"v3") + .put(b"k5", b"v5") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 2, + vec![put_entry], + vec![cb(4, 2, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 2); + assert_eq!(apply_res.apply_state.get_applied_index(), 4); + // a writebatch should be atomic. 
+ assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); + + let put_entry = EntryBuilderUsingSimpleWrite::new(5, 3) + .delete(b"k1") + .delete_cf(CF_LOCK, b"k1") + .delete_cf(CF_WRITE, b"k1") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![put_entry], + vec![cb(5, 2, capture_tx.clone()), cb(5, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + // stale command should be cleared. + assert!(resp.get_header().get_error().has_stale_command()); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert!(engine.get_value(&dk_k1).unwrap().is_none()); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.metrics.lock_cf_written_bytes, 3); + assert_eq!(apply_res.metrics.delete_keys_hint, 2); + assert_eq!(apply_res.metrics.size_diff_hint, -9); + + let delete_entry = EntryBuilderUsingSimpleWrite::new(6, 3) + .delete(b"k5") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![delete_entry], + vec![cb(6, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + fetch_apply_res(&rx); + + let delete_range_entry = EntryBuilderUsingSimpleWrite::new(7, 3) + .delete_range(b"", b"") + .epoch(1, 3) + .build(); + router.schedule_task( + 1, + Msg::apply(apply( + peer_id, + 1, + 3, + vec![delete_range_entry], + vec![cb(7, 3, capture_tx.clone())], + )), + ); + let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + assert!(resp.get_header().get_error().has_key_not_in_region()); + assert_eq!(engine.get_value(&dk_k3).unwrap().unwrap(), b"v1"); + fetch_apply_res(&rx); + + let delete_range_entry = EntryBuilderUsingSimpleWrite::new(8, 3) + .delete_range_cf(CF_DEFAULT, b"", b"k5") + .delete_range_cf(CF_LOCK, 
b"", b"k5")
+ .delete_range_cf(CF_WRITE, b"", b"k5")
+ .epoch(1, 3)
+ .build();
+ router.schedule_task(
+ 1,
+ Msg::apply(apply(
+ peer_id,
+ 1,
+ 3,
+ vec![delete_range_entry],
+ vec![cb(8, 3, capture_tx.clone())],
+ )),
+ );
+ let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap();
+ assert!(!resp.get_header().has_error(), "{:?}", resp);
+ assert!(engine.get_value(&dk_k1).unwrap().is_none());
+ assert!(engine.get_value(&dk_k2).unwrap().is_none());
+ assert!(engine.get_value(&dk_k3).unwrap().is_none());
+
+ // The region was rescheduled from normal-priority handler to
+ // low-priority handler, so the first apply_res.exec_res should be empty.
+ let apply_res = fetch_apply_res(&rx);
+ assert!(apply_res.exec_res.is_empty());
+ // The entry should be applied now.
+ let apply_res = fetch_apply_res(&rx);
+ assert_eq!(apply_res.applied_term, 3);
+ assert_eq!(apply_res.apply_state.get_applied_index(), 8);
+
+ // UploadSST
+ let sst_path = import_dir.path().join("test.sst");
+ let mut sst_epoch = RegionEpoch::default();
+ sst_epoch.set_conf_ver(1);
+ sst_epoch.set_version(3);
+ let sst_range = (0, 100);
+ let (mut meta1, data1) = gen_sst_file(&sst_path, sst_range);
+ meta1.set_region_id(1);
+ meta1.set_region_epoch(sst_epoch);
+ let mut file1 = importer.create(&meta1).unwrap();
+ file1.append(&data1).unwrap();
+ file1.finish().unwrap();
+ let (mut meta2, data2) = gen_sst_file(&sst_path, sst_range);
+ meta2.set_region_id(1);
+ meta2.mut_region_epoch().set_conf_ver(1);
+ meta2.mut_region_epoch().set_version(1234);
+ let mut file2 = importer.create(&meta2).unwrap();
+ file2.append(&data2).unwrap();
+ file2.finish().unwrap();
+
+ // IngestSst
+ let put_ok = EntryBuilderUsingSimpleWrite::new(9, 3)
+ .put(&[sst_range.0], &[sst_range.1])
+ .epoch(0, 3)
+ .build();
+ // Add a put above to test flush before ingestion. 
+ let capture_tx_clone = capture_tx.clone();
+ let ingest_ok = EntryBuilderUsingSimpleWrite::new(10, 3)
+ .ingest_sst(&meta1)
+ .epoch(0, 3)
+ .build();
+ let ingest_epoch_not_match = EntryBuilderUsingSimpleWrite::new(11, 3)
+ .ingest_sst(&meta2)
+ .epoch(0, 3)
+ .build();
+ let entries = vec![put_ok, ingest_ok, ingest_epoch_not_match];
+ router.schedule_task(
+ 1,
+ Msg::apply(apply(
+ peer_id,
+ 1,
+ 3,
+ entries,
+ vec![
+ cb(9, 3, capture_tx.clone()),
+ proposal(
+ false,
+ 10,
+ 3,
+ Callback::write(Box::new(move |resp: WriteResponse| {
+ // Sleep until yield timeout.
+ thread::sleep(Duration::from_millis(500));
+ capture_tx_clone.send(resp.response).unwrap();
+ })),
+ ),
+ cb(11, 3, capture_tx.clone()),
+ ],
+ )),
+ );
+
+ let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap();
+ assert!(!resp.get_header().has_error(), "{:?}", resp);
+ let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap();
+ assert!(!resp.get_header().has_error(), "{:?}", resp);
+ check_db_range(&engine, sst_range);
+ let resp = capture_rx.recv_timeout(Duration::from_secs(3)).unwrap();
+ assert!(resp.get_header().has_error());
+
+ // The region was rescheduled to normal-priority handler because of
+ // normal put command, so the first apply_res.exec_res should be empty.
+ let apply_res = fetch_apply_res(&rx);
+ assert!(apply_res.exec_res.is_empty());
+ // The region was rescheduled to low-priority because of ingest command,
+ // only the put entry has been applied.
+ let apply_res = fetch_apply_res(&rx);
+ assert_eq!(apply_res.applied_term, 3);
+ assert_eq!(apply_res.apply_state.get_applied_index(), 9);
+ // The region will yield after timeout.
+ let apply_res = fetch_apply_res(&rx);
+ assert_eq!(apply_res.applied_term, 3);
+ assert_eq!(apply_res.apply_state.get_applied_index(), 10);
+ // The third entry should be applied now. 
+ let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.applied_term, 3); + assert_eq!(apply_res.apply_state.get_applied_index(), 11); + + let write_batch_max_keys = ::WRITE_BATCH_MAX_KEYS; + + let mut props = vec![]; + let mut entries = vec![]; + for i in 0..write_batch_max_keys { + let put_entry = EntryBuilder::new(i as u64 + 12, 3) + .put(b"k", b"v") + .epoch(1, 3) + .build(); + entries.push(put_entry); + props.push(cb(i as u64 + 12, 3, capture_tx.clone())); + } + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 3, entries, props))); + for _ in 0..write_batch_max_keys { + capture_rx.recv_timeout(Duration::from_secs(3)).unwrap(); + } + let index = write_batch_max_keys + 11; + // The region was rescheduled to normal-priority handler. Discard the first + // apply_res. + fetch_apply_res(&rx); + let apply_res = fetch_apply_res(&rx); + assert_eq!(apply_res.apply_state.get_applied_index(), index as u64); + assert_eq!(obs.pre_query_count.load(Ordering::SeqCst), index); + assert_eq!(obs.post_query_count.load(Ordering::SeqCst), index); + + system.shutdown(); + } + #[test] fn test_apply_yield_with_msg_size() { let (_path, engine) = create_tmp_engine("test-apply-yield"); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index c3a553c89c1..f5a23538ad5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -43,7 +43,11 @@ use tokio::sync::Notify; use txn_types::WriteBatchFlags; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; -use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; +use crate::{ + coprocessor::CoprocessorHost, + store::{simple_write::SimpleWriteReqDecoder, snap::SNAPSHOT_VERSION}, + Error, Result, +}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -749,18 +753,30 @@ pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { if entry.get_entry_type() != EntryType::EntryNormal { return 
RaftRequestHeader::default(); } - // request header is encoded into data - let mut is = CodedInputStream::from_bytes(entry.get_data()); - if is.eof().unwrap() { - return RaftRequestHeader::default(); - } - let (field_number, _) = is.read_tag_unpack().unwrap(); - let t = is.read_message().unwrap(); - // Header field is of number 1 - if field_number != 1 { - panic!("unexpected field number: {} {:?}", field_number, t); + let logger = slog_global::get_global().new(slog::o!()); + match SimpleWriteReqDecoder::new( + |_, _, _| RaftCmdRequest::default(), + &logger, + entry.get_data(), + entry.get_index(), + entry.get_term(), + ) { + Ok(decoder) => decoder.header().clone(), + Err(_) => { + // request header is encoded into data + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return RaftRequestHeader::default(); + } + let (field_number, _) = is.read_tag_unpack().unwrap(); + let t = is.read_message().unwrap(); + // Header field is of number 1 + if field_number != 1 { + panic!("unexpected field number: {} {:?}", field_number, t); + } + t + } } - t } /// Parse data of entry `index`. @@ -784,6 +800,25 @@ pub fn parse_data_at(data: &[u8], index: u64, tag: &str) - result } +pub enum RaftCmd<'a> { + V1(RaftCmdRequest), + V2(SimpleWriteReqDecoder<'a>), +} + +pub fn parse_raft_cmd_request<'a>(data: &'a [u8], index: u64, term: u64, tag: &str) -> RaftCmd<'a> { + let logger = slog_global::get_global().new(slog::o!()); + match SimpleWriteReqDecoder::new( + |_, _, _| parse_data_at(data, index, tag), + &logger, + data, + index, + term, + ) { + Ok(simple_write_decoder) => RaftCmd::V2(simple_write_decoder), + Err(cmd) => RaftCmd::V1(cmd), + } +} + /// Check if two regions are sibling. /// /// They are sibling only when they share borders and don't overlap. 
diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index ce19c56e067..3bb93f6809b 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -34,4 +34,5 @@ mod test_transfer_leader; mod test_transport; mod test_unsafe_recovery; mod test_update_region_size; +mod test_v1_v2_mixed; mod test_witness; diff --git a/tests/integrations/raftstore/test_snap.rs b/tests/integrations/raftstore/test_snap.rs index a620bb3a990..c790d10be45 100644 --- a/tests/integrations/raftstore/test_snap.rs +++ b/tests/integrations/raftstore/test_snap.rs @@ -12,8 +12,7 @@ use std::{ }; use collections::HashMap; -use engine_rocks::{RocksCfOptions, RocksDbOptions}; -use engine_traits::{Checkpointer, KvEngine, Peekable, RaftEngineReadOnly, SyncMutable, LARGE_CFS}; +use engine_traits::{Checkpointer, KvEngine, RaftEngineReadOnly}; use file_system::{IoOp, IoType}; use futures::executor::block_on; use grpcio::Environment; @@ -785,96 +784,6 @@ fn generate_snap( (msg, snap_key) } -#[test] -fn test_v1_receive_snap_from_v2() { - let test_receive_snap = |key_num| { - let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); - let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); - let mut cluster_v1_tikv = test_raftstore::new_server_cluster(1, 1); - - cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; - - cluster_v1.run(); - cluster_v2.run(); - cluster_v1_tikv.run(); - - let s1_addr = cluster_v1.get_addr(1); - let s2_addr = cluster_v1_tikv.get_addr(1); - let region = cluster_v2.get_region(b""); - let region_id = region.get_id(); - let engine = cluster_v2.get_engine(1); - let tablet = engine.get_tablet_by_id(region_id).unwrap(); - - for i in 0..key_num { - let k = format!("zk{:04}", i); - tablet.put(k.as_bytes(), &random_long_vec(1024)).unwrap(); - } - - let snap_mgr = cluster_v2.get_snap_mgr(1); - let security_mgr = cluster_v2.get_security_mgr(); - let (msg, snap_key) = generate_snap(&engine, region_id, 
&snap_mgr); - let cfg = tikv::server::Config::default(); - let limit = Limiter::new(f64::INFINITY); - let env = Arc::new(Environment::new(1)); - let _ = block_on(async { - send_snap_v2( - env.clone(), - snap_mgr.clone(), - security_mgr.clone(), - &cfg, - &s1_addr, - msg.clone(), - limit.clone(), - ) - .unwrap() - .await - }); - let send_result = block_on(async { - send_snap_v2(env, snap_mgr, security_mgr, &cfg, &s2_addr, msg, limit) - .unwrap() - .await - }); - // snapshot should be rejected by cluster v1 tikv, and the snapshot should be - // deleted. - assert!(send_result.is_err()); - let dir = cluster_v2.get_snap_dir(1); - let read_dir = std::fs::read_dir(dir).unwrap(); - assert_eq!(0, read_dir.count()); - - // The snapshot has been received by cluster v1, so check it's completeness - let snap_mgr = cluster_v1.get_snap_mgr(1); - let path = snap_mgr - .tablet_snap_manager() - .unwrap() - .final_recv_path(&snap_key); - let rocksdb = engine_rocks::util::new_engine_opt( - path.as_path().to_str().unwrap(), - RocksDbOptions::default(), - LARGE_CFS - .iter() - .map(|&cf| (cf, RocksCfOptions::default())) - .collect(), - ) - .unwrap(); - - for i in 0..key_num { - let k = format!("zk{:04}", i); - assert!( - rocksdb - .get_value_cf("default", k.as_bytes()) - .unwrap() - .is_some() - ); - } - }; - - // test small snapshot - test_receive_snap(20); - - // test large snapshot - test_receive_snap(5000); -} - #[derive(Clone)] struct MockApplySnapshotObserver { tablet_snap_paths: Arc>>, diff --git a/tests/integrations/raftstore/test_v1_v2_mixed.rs b/tests/integrations/raftstore/test_v1_v2_mixed.rs new file mode 100644 index 00000000000..1514529b209 --- /dev/null +++ b/tests/integrations/raftstore/test_v1_v2_mixed.rs @@ -0,0 +1,268 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{Arc, Mutex}, + time::Duration, +}; + +use engine_rocks::{RocksCfOptions, RocksDbOptions}; +use engine_traits::{Checkpointer, KvEngine, Peekable, SyncMutable, LARGE_CFS}; +use futures::executor::block_on; +use grpcio::Environment; +use kvproto::raft_serverpb::{RaftMessage, *}; +use raft::eraftpb::{MessageType, Snapshot}; +use raftstore::{ + errors::Result, + store::{snap::TABLET_SNAPSHOT_VERSION, TabletSnapKey, TabletSnapManager}, +}; +use rand::Rng; +use test_raftstore::{ + new_learner_peer, Direction, Filter, FilterFactory, RegionPacketFilter, Simulator as S1, *, +}; +use test_raftstore_v2::{Simulator as S2, WrapFactory}; +use tikv::server::tablet_snap::send_snap as send_snap_v2; +use tikv_util::time::Limiter; + +struct ForwardFactory { + node_id: u64, + chain_send: Arc, +} + +impl FilterFactory for ForwardFactory { + fn generate(&self, _: u64) -> Vec> { + vec![Box::new(ForwardFilter { + node_id: self.node_id, + chain_send: self.chain_send.clone(), + })] + } +} + +struct ForwardFilter { + node_id: u64, + chain_send: Arc, +} + +impl Filter for ForwardFilter { + fn before(&self, msgs: &mut Vec) -> Result<()> { + for m in msgs.drain(..) 
{ + if self.node_id == m.get_to_peer().get_store_id() { + (self.chain_send)(m); + } + } + Ok(()) + } +} + +fn generate_snap( + engine: &WrapFactory, + region_id: u64, + snap_mgr: &TabletSnapManager, +) -> (RaftMessage, TabletSnapKey) { + let tablet = engine.get_tablet_by_id(region_id).unwrap(); + let region_state = engine.region_local_state(region_id).unwrap().unwrap(); + let apply_state = engine.raft_apply_state(region_id).unwrap().unwrap(); + let raft_state = engine.raft_local_state(region_id).unwrap().unwrap(); + + // Construct snapshot by hand + let mut snapshot = Snapshot::default(); + // use commit term for simplicity + snapshot + .mut_metadata() + .set_term(raft_state.get_hard_state().term + 1); + snapshot.mut_metadata().set_index(apply_state.applied_index); + let conf_state = raftstore::store::util::conf_state_from_region(region_state.get_region()); + snapshot.mut_metadata().set_conf_state(conf_state); + + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region_state.get_region().clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + use protobuf::Message; + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + let snap_key = TabletSnapKey::from_region_snap(region_id, 1, &snapshot); + let checkpointer_path = snap_mgr.tablet_gen_path(&snap_key); + let mut checkpointer = tablet.new_checkpointer().unwrap(); + checkpointer + .create_at(checkpointer_path.as_path(), None, 0) + .unwrap(); + + let mut msg = RaftMessage::default(); + msg.region_id = region_id; + msg.set_to_peer(new_peer(1, 1)); + msg.mut_message().set_snapshot(snapshot); + msg.mut_message() + .set_term(raft_state.get_hard_state().commit + 1); + msg.mut_message().set_msg_type(MessageType::MsgSnapshot); + msg.set_region_epoch(region_state.get_region().get_region_epoch().clone()); + + (msg, snap_key) +} + +fn random_long_vec(length: usize) -> Vec { + let mut rng = rand::thread_rng(); + let mut value = Vec::with_capacity(1024); + (0..length).for_each(|_| 
value.push(rng.gen::())); + value +} + +#[test] +fn test_v1_receive_snap_from_v2() { + let test_receive_snap = |key_num| { + let mut cluster_v1 = test_raftstore::new_server_cluster(1, 1); + let mut cluster_v2 = test_raftstore_v2::new_server_cluster(1, 1); + let mut cluster_v1_tikv = test_raftstore::new_server_cluster(1, 1); + + cluster_v1.cfg.raft_store.enable_v2_compatible_learner = true; + + cluster_v1.run(); + cluster_v2.run(); + cluster_v1_tikv.run(); + + let s1_addr = cluster_v1.get_addr(1); + let s2_addr = cluster_v1_tikv.get_addr(1); + let region = cluster_v2.get_region(b""); + let region_id = region.get_id(); + let engine = cluster_v2.get_engine(1); + let tablet = engine.get_tablet_by_id(region_id).unwrap(); + + for i in 0..key_num { + let k = format!("zk{:04}", i); + tablet.put(k.as_bytes(), &random_long_vec(1024)).unwrap(); + } + + let snap_mgr = cluster_v2.get_snap_mgr(1); + let security_mgr = cluster_v2.get_security_mgr(); + let (msg, snap_key) = generate_snap(&engine, region_id, &snap_mgr); + let cfg = tikv::server::Config::default(); + let limit = Limiter::new(f64::INFINITY); + let env = Arc::new(Environment::new(1)); + let _ = block_on(async { + send_snap_v2( + env.clone(), + snap_mgr.clone(), + security_mgr.clone(), + &cfg, + &s1_addr, + msg.clone(), + limit.clone(), + ) + .unwrap() + .await + }); + let send_result = block_on(async { + send_snap_v2(env, snap_mgr, security_mgr, &cfg, &s2_addr, msg, limit) + .unwrap() + .await + }); + // snapshot should be rejected by cluster v1 tikv, and the snapshot should be + // deleted. 
+ assert!(send_result.is_err()); + let dir = cluster_v2.get_snap_dir(1); + let read_dir = std::fs::read_dir(dir).unwrap(); + assert_eq!(0, read_dir.count()); + + // The snapshot has been received by cluster v1, so check its completeness + let snap_mgr = cluster_v1.get_snap_mgr(1); + let path = snap_mgr + .tablet_snap_manager() + .unwrap() + .final_recv_path(&snap_key); + let rocksdb = engine_rocks::util::new_engine_opt( + path.as_path().to_str().unwrap(), + RocksDbOptions::default(), + LARGE_CFS + .iter() + .map(|&cf| (cf, RocksCfOptions::default())) + .collect(), + ) + .unwrap(); + + for i in 0..key_num { + let k = format!("zk{:04}", i); + assert!( + rocksdb + .get_value_cf("default", k.as_bytes()) + .unwrap() + .is_some() + ); + } + }; + + // test small snapshot + test_receive_snap(20); + + // test large snapshot + test_receive_snap(5000); +} + +#[test] +fn test_v1_simple_write() { + let mut cluster_v2 = test_raftstore_v2::new_node_cluster(1, 2); + let mut cluster_v1 = test_raftstore::new_node_cluster(1, 2); + cluster_v1.cfg.tikv.raft_store.enable_v2_compatible_learner = true; + cluster_v1.pd_client.disable_default_operator(); + cluster_v2.pd_client.disable_default_operator(); + let r11 = cluster_v1.run_conf_change(); + let r21 = cluster_v2.run_conf_change(); + + cluster_v1.must_put(b"k0", b"v0"); + cluster_v2.must_put(b"k0", b"v0"); + cluster_v1 + .pd_client + .must_add_peer(r11, new_learner_peer(2, 10)); + cluster_v2 + .pd_client + .must_add_peer(r21, new_learner_peer(2, 10)); + check_key_in_engine(&cluster_v1.get_engine(2), b"zk0", b"v0"); + check_key_in_engine(&cluster_v2.get_engine(2), b"zk0", b"v0"); + let trans1 = Mutex::new(cluster_v1.sim.read().unwrap().get_router(2).unwrap()); + let trans2 = Mutex::new(cluster_v2.sim.read().unwrap().get_router(1).unwrap()); + + let factory1 = ForwardFactory { + node_id: 1, + chain_send: Arc::new(move |m| { + info!("send to trans2"; "msg" => ?m); + let _ = trans2.lock().unwrap().send_raft_message(Box::new(m)); + }), + 
}; + cluster_v1.add_send_filter(factory1); + let factory2 = ForwardFactory { + node_id: 2, + chain_send: Arc::new(move |m| { + info!("send to trans1"; "msg" => ?m); + let _ = trans1.lock().unwrap().send_raft_message(m); + }), + }; + cluster_v2.add_send_filter(factory2); + let filter11 = Box::new( + RegionPacketFilter::new(r11, 2) + .direction(Direction::Recv) + .msg_type(MessageType::MsgAppend) + .msg_type(MessageType::MsgAppendResponse) + .msg_type(MessageType::MsgSnapshot) + .msg_type(MessageType::MsgHeartbeat) + .msg_type(MessageType::MsgHeartbeatResponse), + ); + cluster_v1.add_recv_filter_on_node(2, filter11); + + cluster_v2.must_put(b"k1", b"v1"); + assert_eq!( + cluster_v2.must_get(b"k1").unwrap(), + "v1".as_bytes().to_vec() + ); + check_key_in_engine(&cluster_v1.get_engine(2), b"zk1", b"v1"); + + cluster_v1.shutdown(); + cluster_v2.shutdown(); +} + +fn check_key_in_engine(engine: &T, key: &[u8], value: &[u8]) { + for _ in 0..10 { + if let Ok(Some(vec)) = engine.get_value(key) { + assert_eq!(vec.to_vec(), value.to_vec()); + return; + } + std::thread::sleep(Duration::from_millis(200)); + } + + panic!("cannot find key {:?} in engine", key); +} From 53a5f095e179f7fbaaaa0775871e1ce457dad251 Mon Sep 17 00:00:00 2001 From: cfzjywxk Date: Fri, 28 Apr 2023 15:11:53 +0800 Subject: [PATCH 674/676] storage: add checksum logic in row slice, add cop and get test cases (#14611) ref tikv/tikv#14528 Signed-off-by: cfzjywxk --- components/test_coprocessor/src/fixture.rs | 93 ++++++- components/test_coprocessor/src/store.rs | 68 ++++- .../src/codec/data_type/scalar.rs | 36 +++ .../src/codec/row/v2/encoder_for_test.rs | 260 ++++++++++++++---- .../src/codec/row/v2/mod.rs | 1 + .../src/codec/row/v2/row_slice.rs | 160 ++++++++++- .../tidb_query_datatype/src/def/field_type.rs | 2 +- src/storage/mvcc/reader/point_getter.rs | 27 ++ tests/integrations/coprocessor/test_select.rs | 33 +++ 9 files changed, 618 insertions(+), 62 deletions(-) diff --git 
a/components/test_coprocessor/src/fixture.rs b/components/test_coprocessor/src/fixture.rs index a53ba4500bc..5e94d3e47fe 100644 --- a/components/test_coprocessor/src/fixture.rs +++ b/components/test_coprocessor/src/fixture.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use concurrency_manager::ConcurrencyManager; use kvproto::kvrpcpb::Context; use resource_metering::ResourceTagFactory; -use tidb_query_datatype::codec::Datum; +use tidb_query_datatype::codec::{row::v2::CODEC_VERSION, Datum}; use tikv::{ config::CoprReadPoolConfig, coprocessor::{readpool_impl, Endpoint}, @@ -71,6 +71,27 @@ pub fn init_data_with_engine_and_commit( init_data_with_details(ctx, engine, tbl, vals, commit, &Config::default()) } +pub fn init_data_with_engine_and_commit_v2_checksum( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint, Arc) { + init_data_with_details_v2_checksum( + ctx, + engine, + tbl, + vals, + commit, + &Config::default(), + with_checksum, + extra_checksum, + ) +} + pub fn init_data_with_details( ctx: Context, engine: E, @@ -78,6 +99,43 @@ pub fn init_data_with_details( vals: &[(i64, Option<&str>, i64)], commit: bool, cfg: &Config, +) -> (Store, Endpoint, Arc) { + init_data_with_details_impl(ctx, engine, tbl, vals, commit, cfg, 0, false, None) +} + +pub fn init_data_with_details_v2_checksum( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + cfg: &Config, + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint, Arc) { + init_data_with_details_impl( + ctx, + engine, + tbl, + vals, + commit, + cfg, + CODEC_VERSION, + with_checksum, + extra_checksum, + ) +} + +fn init_data_with_details_impl( + ctx: Context, + engine: E, + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + commit: bool, + cfg: &Config, + codec_ver: u8, + with_checksum: bool, + extra_checksum: Option, ) -> (Store, 
Endpoint, Arc) { let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) .build() @@ -86,12 +144,20 @@ pub fn init_data_with_details( store.begin(); for &(id, name, count) in vals { - store + let mut inserts = store .insert_into(tbl) .set(&tbl["id"], Datum::I64(id)) .set(&tbl["name"], name.map(str::as_bytes).into()) - .set(&tbl["count"], Datum::I64(count)) - .execute_with_ctx(ctx.clone()); + .set(&tbl["count"], Datum::I64(count)); + if codec_ver == CODEC_VERSION { + inserts = inserts + .set_v2(&tbl["id"], id.into()) + .set_v2(&tbl["name"], name.unwrap().into()) + .set_v2(&tbl["count"], count.into()); + inserts.execute_with_v2_checksum(ctx.clone(), with_checksum, extra_checksum); + } else { + inserts.execute_with_ctx(ctx.clone()); + } } if commit { store.commit_with_ctx(ctx); @@ -140,3 +206,22 @@ pub fn init_with_data_ext( ) -> (Store, Endpoint, Arc) { init_data_with_commit(tbl, vals, true) } + +pub fn init_data_with_commit_v2_checksum( + tbl: &ProductTable, + vals: &[(i64, Option<&str>, i64)], + with_checksum: bool, + extra_checksum: Option, +) -> (Store, Endpoint) { + let engine = TestEngineBuilder::new().build().unwrap(); + let (store, endpoint, _) = init_data_with_engine_and_commit_v2_checksum( + Context::default(), + engine, + tbl, + vals, + true, + with_checksum, + extra_checksum, + ); + (store, endpoint) +} diff --git a/components/test_coprocessor/src/store.rs b/components/test_coprocessor/src/store.rs index 278e210bc98..96f405d8f39 100644 --- a/components/test_coprocessor/src/store.rs +++ b/components/test_coprocessor/src/store.rs @@ -6,7 +6,12 @@ use collections::HashMap; use kvproto::kvrpcpb::{Context, IsolationLevel}; use test_storage::SyncTestStorageApiV1; use tidb_query_datatype::{ - codec::{datum, table, Datum}, + codec::{ + data_type::ScalarValue, + datum, + row::v2::encoder_for_test::{Column as ColumnV2, RowEncoder}, + table, Datum, + }, expr::EvalContext, }; use tikv::{ @@ -26,6 +31,7 @@ pub struct Insert<'a, 
E: Engine> { store: &'a mut Store, table: &'a Table, values: BTreeMap, + values_v2: BTreeMap, } impl<'a, E: Engine> Insert<'a, E> { @@ -34,6 +40,7 @@ impl<'a, E: Engine> Insert<'a, E> { store, table, values: BTreeMap::new(), + values_v2: BTreeMap::new(), } } @@ -44,10 +51,26 @@ impl<'a, E: Engine> Insert<'a, E> { self } + pub fn set_v2(mut self, col: &Column, value: ScalarValue) -> Self { + assert!(self.table.column_by_id(col.id).is_some()); + self.values_v2.insert(col.id, value); + self + } + pub fn execute(self) -> i64 { self.execute_with_ctx(Context::default()) } + fn prepare_index_kv(&self, handle: &Datum, buf: &mut Vec<(Vec, Vec)>) { + for (&id, idxs) in &self.table.idxs { + let mut v: Vec<_> = idxs.iter().map(|id| self.values[id].clone()).collect(); + v.push(handle.clone()); + let encoded = datum::encode_key(&mut EvalContext::default(), &v).unwrap(); + let idx_key = table::encode_index_seek_key(self.table.id, id, &encoded); + buf.push((idx_key, vec![0])); + } + } + pub fn execute_with_ctx(self, ctx: Context) -> i64 { let handle = self .values @@ -59,13 +82,44 @@ impl<'a, E: Engine> Insert<'a, E> { let values: Vec<_> = self.values.values().cloned().collect(); let value = table::encode_row(&mut EvalContext::default(), values, &ids).unwrap(); let mut kvs = vec![(key, value)]; - for (&id, idxs) in &self.table.idxs { - let mut v: Vec<_> = idxs.iter().map(|id| self.values[id].clone()).collect(); - v.push(handle.clone()); - let encoded = datum::encode_key(&mut EvalContext::default(), &v).unwrap(); - let idx_key = table::encode_index_seek_key(self.table.id, id, &encoded); - kvs.push((idx_key, vec![0])); + self.prepare_index_kv(&handle, &mut kvs); + self.store.put(ctx, kvs); + handle.i64() + } + + pub fn execute_with_v2_checksum( + self, + ctx: Context, + with_checksum: bool, + extra_checksum: Option, + ) -> i64 { + let handle = self + .values + .get(&self.table.handle_id) + .cloned() + .unwrap_or_else(|| Datum::I64(next_id())); + let key = 
table::encode_row_key(self.table.id, handle.i64()); + let mut columns: Vec = Vec::new(); + for (id, value) in self.values_v2.iter() { + let col_info = self.table.column_by_id(*id).unwrap(); + columns.push(ColumnV2::new_with_ft( + *id, + col_info.as_field_type(), + value.to_owned(), + )); + } + let mut val_buf = Vec::new(); + if with_checksum { + val_buf + .write_row_with_checksum(&mut EvalContext::default(), columns, extra_checksum) + .unwrap(); + } else { + val_buf + .write_row(&mut EvalContext::default(), columns) + .unwrap(); } + let mut kvs = vec![(key, val_buf)]; + self.prepare_index_kv(&handle, &mut kvs); self.store.put(ctx, kvs); handle.i64() } diff --git a/components/tidb_query_datatype/src/codec/data_type/scalar.rs b/components/tidb_query_datatype/src/codec/data_type/scalar.rs index d476fd2d370..c74423107e4 100644 --- a/components/tidb_query_datatype/src/codec/data_type/scalar.rs +++ b/components/tidb_query_datatype/src/codec/data_type/scalar.rs @@ -162,6 +162,14 @@ impl From for ScalarValue { } } +impl From<&str> for ScalarValue { + #[inline] + fn from(s: &str) -> ScalarValue { + let bytes = Bytes::from(s); + ScalarValue::Bytes(Some(bytes)) + } +} + impl From for Option { #[inline] fn from(s: ScalarValue) -> Option { @@ -401,6 +409,34 @@ impl_as_ref! { Decimal, as_decimal } impl_as_ref! { DateTime, as_date_time } impl_as_ref! 
{ Duration, as_duration } +impl ScalarValue { + #[inline] + pub fn as_enum(&self) -> Option> { + match self { + ScalarValue::Enum(x) => x.as_ref().map(|x| x.as_ref()), + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(Int), + ), + } + } +} + +impl ScalarValue { + #[inline] + pub fn as_set(&self) -> Option> { + match self { + ScalarValue::Set(x) => x.as_ref().map(|x| x.as_ref()), + other => panic!( + "Cannot cast {} scalar value into {}", + other.eval_type(), + stringify!(Int), + ), + } + } +} + impl ScalarValue { #[inline] pub fn as_json(&self) -> Option> { diff --git a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs index 343f2520230..5ac1cad3b32 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/encoder_for_test.rs @@ -24,12 +24,13 @@ use std::{i16, i32, i8, u16, u32, u8}; use codec::prelude::*; +use num_traits::Zero; use tipb::FieldType; use crate::{ codec::{ data_type::ScalarValue, - mysql::{decimal::DecimalEncoder, json::JsonEncoder}, + mysql::{decimal::DecimalEncoder, json::JsonEncoder, Duration}, Error, Result, }, expr::EvalContext, @@ -63,6 +64,14 @@ impl Column { } } + pub fn new_with_ft(id: i64, ft: FieldType, value: impl Into) -> Self { + Column { + id, + ft, + value: value.into(), + } + } + pub fn ft(&self) -> &FieldType { &self.ft } @@ -88,6 +97,142 @@ impl Column { self.ft.as_mut_accessor().set_decimal(decimal); self } + + // The encode rule follows https://github.com/pingcap/tidb/pull/43141. + // It's different from the other encoding rules and used for verification + // test cases in tikv, the actual checksum encoding would be done on the + // tidb side with row value generation. 
+ pub fn encode_for_checksum(&self, buf: &mut Vec) -> Result<()> { + match self.ft.as_accessor().tp() { + FieldTypeTp::Tiny + | FieldTypeTp::Short + | FieldTypeTp::Long + | FieldTypeTp::LongLong + | FieldTypeTp::Int24 + | FieldTypeTp::Year => { + let res = self.value.as_int().ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))?; + buf.write_u64_le(*res as u64)?; + } + FieldTypeTp::VarChar + | FieldTypeTp::VarString + | FieldTypeTp::String + | FieldTypeTp::TinyBlob + | FieldTypeTp::MediumBlob + | FieldTypeTp::LongBlob + | FieldTypeTp::Blob => { + let res = self.value.as_bytes().ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))?; + buf.write_u32_le(res.len() as u32)?; + buf.write_bytes(res)?; + } + FieldTypeTp::Timestamp + | FieldTypeTp::DateTime + | FieldTypeTp::Date + | FieldTypeTp::NewDate => { + let time = self + .value + .as_date_time() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))? + .to_numeric_string(); + buf.write_u32_le(time.len() as u32)?; + buf.write_bytes(time.as_bytes())?; + } + FieldTypeTp::Duration => { + let dur = self + .value + .as_duration() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))? + .to_numeric_string(); + buf.write_u32_le(dur.len() as u32)?; + buf.write_bytes(dur.as_bytes())?; + } + FieldTypeTp::Float | FieldTypeTp::Double => { + let mut val = self + .value + .as_real() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))? + .to_owned(); + if val.is_infinite() || val.is_nan() { + // Because ticdc has such a transform. + val.set_zero(); + } + buf.write_u64_le(val.to_bits())?; + } + FieldTypeTp::NewDecimal => { + let dec = self + .value + .as_decimal() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))? 
+ .to_string(); + buf.write_u32_le(dec.len() as u32)?; + buf.write_bytes(dec.as_bytes())?; + } + FieldTypeTp::Enum => { + let res = self + .value + .as_enum() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft + )))? + .value(); + buf.write_u64_le(res)?; + } + FieldTypeTp::Set => { + let res = self + .value + .as_set() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft + )))? + .value(); + buf.write_u64_le(res)?; + } + FieldTypeTp::Bit => { + // TODO: it's not supported yet. In current test only `INT` and `Varchar` + // types would be used. + buf.write_u64_le(u64::MAX)?; + } + FieldTypeTp::Json => { + let res = self + .value + .as_json() + .ok_or(Error::InvalidDataType(format!( + "invalid type: {:?}", + self.ft, + )))? + .to_string(); + buf.write_u32_le(res.len() as u32)?; + buf.write_bytes(res.as_bytes())?; + } + FieldTypeTp::Null | FieldTypeTp::Geometry => {} + _ => { + return Err(Error::Other(box_err!( + "unsupported type {:?}", + self.ft.as_accessor().tp() + ))); + } + }; + Ok(()) + } } /// Checksum @@ -97,8 +242,8 @@ impl Column { /// - CHECKSUM(4 bytes) /// - little-endian CRC32(IEEE) when hdr.ver = 0 (default) pub trait ChecksumHandler { - // update_col updates the checksum with the encoded value of the column. - fn checksum(&mut self, buf: &[u8]) -> Result<()>; + // checksum calculates the checksum value according to the input column values. + fn checksum(&mut self, cols: &[Column]) -> Result<()>; // header_value returns the checksum header value. 
fn header_value(&self) -> u8; @@ -110,11 +255,31 @@ pub trait ChecksumHandler { pub struct Crc32RowChecksumHandler { header: ChecksumHeader, hasher: crc32fast::Hasher, + buf: Vec, +} + +fn get_non_null_columns(cols: &[Column]) -> Vec { + let mut res = vec![]; + for col in cols { + if col.value.is_some() { + res.push(col.clone()); + } + } + res.sort_by_key(|c| c.id); + res } impl ChecksumHandler for Crc32RowChecksumHandler { - fn checksum(&mut self, buf: &[u8]) -> Result<()> { - self.hasher.update(buf); + fn checksum(&mut self, cols: &[Column]) -> Result<()> { + // For testing purposes, the DDL compatibility was not fully considered for + // checksum calculation, using all non-null columns regardless of the column's + // DDL status, such as write-reorg. + // Reference: https://github.com/pingcap/tidb/pull/43141. + for col in get_non_null_columns(cols) { + self.buf.clear(); + col.encode_for_checksum(&mut self.buf)?; + self.hasher.update(self.buf.as_slice()); + } Ok(()) } @@ -154,6 +319,7 @@ impl Crc32RowChecksumHandler { let mut res = Crc32RowChecksumHandler { header: ChecksumHeader::new(), hasher: crc32fast::Hasher::new(), + buf: Vec::new(), }; if has_extra_checksum { res.header.set_extra_checksum(); @@ -181,14 +347,15 @@ pub trait RowEncoder: NumberEncoder { extra_checksum: Option, ) -> Result<()> { let mut handler = Crc32RowChecksumHandler::new(extra_checksum.is_some()); - self.write_row_impl(ctx, columns, Some(&mut handler), extra_checksum) + handler.checksum(&columns)?; + self.write_row_impl(ctx, columns, Some(&handler), extra_checksum) } fn write_row_impl( &mut self, ctx: &mut EvalContext, columns: Vec, - mut checksum_handler: Option<&mut dyn ChecksumHandler>, + checksum_handler: Option<&dyn ChecksumHandler>, extra_checksum: Option, ) -> Result<()> { let mut is_big = false; @@ -225,7 +392,7 @@ pub trait RowEncoder: NumberEncoder { // encode begins self.write_u8(super::CODEC_VERSION)?; - self.write_flag(is_big)?; + self.write_flag(is_big, 
checksum_handler.is_some())?; self.write_u16_le(non_null_ids.len() as u16)?; self.write_u16_le(null_ids.len() as u16)?; @@ -241,9 +408,8 @@ pub trait RowEncoder: NumberEncoder { self.write_bytes(&offset_wtr)?; self.write_bytes(&value_wtr)?; - if let Some(checksum_handler) = checksum_handler.as_mut() { + if let Some(checksum_handler) = checksum_handler { let header_val = checksum_handler.header_value(); - checksum_handler.checksum(value_wtr.as_slice())?; let val = checksum_handler.value(); self.write_u8(header_val)?; self.write_u32_le(val)?; @@ -256,11 +422,12 @@ pub trait RowEncoder: NumberEncoder { } #[inline] - fn write_flag(&mut self, is_big: bool) -> codec::Result<()> { - let flag = if is_big { - super::Flags::BIG - } else { - super::Flags::default() + fn write_flag(&mut self, is_big: bool, has_checksum: bool) -> codec::Result<()> { + let flag = match (is_big, has_checksum) { + (true, true) => super::Flags::BIG | super::Flags::WITH_CHECKSUM, + (true, false) => super::Flags::BIG, + (false, true) => super::Flags::WITH_CHECKSUM, + (false, false) => super::Flags::default(), }; self.write_u8(flag.bits) } @@ -334,6 +501,26 @@ pub trait ScalarValueEncoder: NumberEncoder + DecimalEncoder + JsonEncoder { } impl ScalarValueEncoder for T {} +// This is a helper function for test. 
+pub fn prepare_cols_for_test() -> Vec { + vec![ + Column::new_with_ft(1, FieldType::from(FieldTypeTp::Short), 1000), + Column::new_with_ft(12, FieldType::from(FieldTypeTp::Long), 2), + Column::new_with_ft( + 335, + FieldType::from(FieldTypeTp::Short), + ScalarValue::Int(None), + ), + Column::new_with_ft(3, FieldType::from(FieldTypeTp::Float), 3.55), + Column::new_with_ft(8, FieldType::from(FieldTypeTp::VarChar), b"abc".to_vec()), + Column::new_with_ft( + 17, + FieldType::from(FieldTypeTp::Duration), + Duration::from_millis(34, 2).unwrap(), + ), + ] +} + #[cfg(test)] mod tests { use std::str::FromStr; @@ -346,7 +533,8 @@ mod tests { data_type::ScalarValue, mysql::{duration::NANOS_PER_SEC, Decimal, Duration, Json, Time}, row::v2::encoder_for_test::{ - ChecksumHandler, Crc32RowChecksumHandler, ScalarValueEncoder, + get_non_null_columns, prepare_cols_for_test, ChecksumHandler, + Crc32RowChecksumHandler, }, }, expr::EvalContext, @@ -423,48 +611,30 @@ mod tests { #[test] fn test_encode_checksum() { - let encode_col_values = |ctx: &mut EvalContext, non_null_cols: Vec| -> Vec { + let encode_col_values = |non_null_cols: Vec| -> Vec { let mut res = vec![]; for col in non_null_cols { - res.write_value(ctx, &col).unwrap(); - } - res - }; - let get_non_null_columns = |cols: &Vec| -> Vec { - let mut res = vec![]; - for col in cols { - if col.value.is_some() { - res.push(col.clone()); - } + col.encode_for_checksum(&mut res).unwrap(); } - res.sort_by_key(|c| c.id); res }; - let cols = vec![ - Column::new(1, 1000), - Column::new(12, 2), - Column::new(335, ScalarValue::Int(None)), - Column::new(3, 3), - Column::new(8, 32767), - ]; + let cols = prepare_cols_for_test(); let mut buf = vec![]; let mut handler = Crc32RowChecksumHandler::new(false); handler.header.set_version(0); + handler.checksum(&cols).unwrap(); buf.write_row_impl( &mut EvalContext::default(), cols.clone(), - Some(&mut handler), + Some(&handler), None, ) .unwrap(); let exp = { let mut hasher = 
crc32fast::Hasher::new(); - hasher.update( - encode_col_values(&mut EvalContext::default(), get_non_null_columns(&cols)) - .as_slice(), - ); + hasher.update(encode_col_values(get_non_null_columns(&cols)).as_slice()); hasher.finalize() }; let mut val_slice = &buf[buf.len() - 4..]; @@ -475,13 +645,9 @@ mod tests { buf.clear(); let mut handler = Crc32RowChecksumHandler::new(true); handler.header.set_version(1); - buf.write_row_impl( - &mut EvalContext::default(), - cols, - Some(&mut handler), - Some(exp), - ) - .unwrap(); + handler.checksum(&cols).unwrap(); + buf.write_row_impl(&mut EvalContext::default(), cols, Some(&handler), Some(exp)) + .unwrap(); let mut val_slice = &buf[buf.len() - 4..]; let mut extra_val_slice = &buf[buf.len() - 8..buf.len() - 4]; assert_eq!(exp, handler.value()); diff --git a/components/tidb_query_datatype/src/codec/row/v2/mod.rs b/components/tidb_query_datatype/src/codec/row/v2/mod.rs index b0cec291410..d7a6578f74d 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/mod.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/mod.rs @@ -11,6 +11,7 @@ bitflags! 
{ #[derive(Default)] struct Flags: u8 { const BIG = 1; + const WITH_CHECKSUM = 2; } } diff --git a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs index 5d0c7329d54..da117c96e2c 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/row_slice.rs @@ -10,6 +10,7 @@ use num_traits::PrimInt; use crate::codec::{Error, Result}; +#[derive(Debug)] pub enum RowSlice<'a> { Small { origin: &'a [u8], @@ -17,6 +18,7 @@ pub enum RowSlice<'a> { null_ids: LeBytes<'a, u8>, offsets: LeBytes<'a, u16>, values: LeBytes<'a, u8>, + checksum: Option, }, Big { origin: &'a [u8], @@ -24,9 +26,49 @@ pub enum RowSlice<'a> { null_ids: LeBytes<'a, u32>, offsets: LeBytes<'a, u32>, values: LeBytes<'a, u8>, + checksum: Option, }, } +/// Checksum +/// - HEADER(1 byte) +/// - VER: version(3 bit) +/// - E: has extra checksum +/// - CHECKSUM(4 bytes) +/// - little-endian CRC32(IEEE) when hdr.ver = 0 (default) +#[derive(Copy, Clone, Debug)] +pub struct Checksum { + header: u8, + val: u32, + extra_val: u32, +} + +impl Checksum { + fn new(header: u8, val: u32) -> Self { + Self { + header, + val, + extra_val: 0, + } + } + + pub fn get_checksum_val(&self) -> u32 { + self.val + } + + pub fn has_extra_checksum(&self) -> bool { + (self.header & 0b1000) > 0 + } + + fn set_extra_checksum(&mut self, extra_val: u32) { + self.extra_val = extra_val; + } + + pub fn get_extra_checksum_val(&self) -> u32 { + self.extra_val + } +} + impl RowSlice<'_> { /// # Panics /// @@ -34,18 +76,21 @@ impl RowSlice<'_> { pub fn from_bytes(mut data: &[u8]) -> Result> { let origin = data; assert_eq!(data.read_u8()?, super::CODEC_VERSION); - let is_big = super::Flags::from_bits_truncate(data.read_u8()?) 
== super::Flags::BIG; + let flags = super::Flags::from_bits_truncate(data.read_u8()?); + let is_big = flags.contains(super::Flags::BIG); + let with_checksum = flags.contains(super::Flags::WITH_CHECKSUM); // read ids count let non_null_cnt = data.read_u16_le()? as usize; let null_cnt = data.read_u16_le()? as usize; - let row = if is_big { + let mut row = if is_big { RowSlice::Big { origin, non_null_ids: read_le_bytes(&mut data, non_null_cnt)?, null_ids: read_le_bytes(&mut data, null_cnt)?, offsets: read_le_bytes(&mut data, non_null_cnt)?, values: LeBytes::new(data), + checksum: None, } } else { RowSlice::Small { @@ -54,7 +99,20 @@ impl RowSlice<'_> { null_ids: read_le_bytes(&mut data, null_cnt)?, offsets: read_le_bytes(&mut data, non_null_cnt)?, values: LeBytes::new(data), + checksum: None, + } + }; + if with_checksum { + let mut checksum_bytes = row.cut_checksum_bytes(non_null_cnt); + assert!(checksum_bytes.len() == 5 || checksum_bytes.len() == 9); + let header = checksum_bytes.read_u8()?; + let val = checksum_bytes.read_u32_le()?; + let mut checksum = Checksum::new(header, val); + if checksum.has_extra_checksum() { + let extra_val = checksum_bytes.read_u32_le()?; + checksum.set_extra_checksum(extra_val); } + row.set_checksum(Some(checksum)); }; Ok(row) } @@ -166,6 +224,46 @@ impl RowSlice<'_> { Ok(None) } } + + #[inline] + // Return the checksum byte slice, remove it from the `values` field of + // `RowSlice`. + pub fn cut_checksum_bytes(&mut self, non_null_col_num: usize) -> &[u8] { + match self { + RowSlice::Big { + offsets, values, .. + } => { + let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let slice = values.slice; + *values = LeBytes::new(&slice[..last_slice_idx]); + &slice[last_slice_idx..] + } + RowSlice::Small { + offsets, values, .. + } => { + let last_slice_idx = offsets.get(non_null_col_num - 1).unwrap() as usize; + let slice = values.slice; + *values = LeBytes::new(&slice[..last_slice_idx]); + &slice[last_slice_idx..] 
+ } + } + } + + #[inline] + pub fn get_checksum(&self) -> Option { + match self { + RowSlice::Big { checksum, .. } => *checksum, + RowSlice::Small { checksum, .. } => *checksum, + } + } + + #[inline] + fn set_checksum(&mut self, checksum_input: Option) { + match self { + RowSlice::Big { checksum, .. } => *checksum = checksum_input, + RowSlice::Small { checksum, .. } => *checksum = checksum_input, + } + } } /// Decodes `len` number of ints from `buf` in little endian @@ -189,6 +287,7 @@ where } #[cfg(target_endian = "little")] +#[derive(Debug)] pub struct LeBytes<'a, T: PrimInt> { slice: &'a [u8], _marker: PhantomData, @@ -255,12 +354,17 @@ mod tests { use std::u16; use codec::prelude::NumberEncoder; + use tipb::FieldType; use super::{ super::encoder_for_test::{Column, RowEncoder}, read_le_bytes, RowSlice, }; - use crate::{codec::data_type::ScalarValue, expr::EvalContext}; + use crate::{ + codec::data_type::{Duration, ScalarValue}, + expr::EvalContext, + FieldTypeTp, + }; #[test] fn test_read_le_bytes() { @@ -354,6 +458,56 @@ mod tests { assert!(!row.search_in_null_ids(0xFF0021)); assert!(!row.search_in_null_ids(0xFF00000021)); } + + fn encoded_data_with_checksum(extra_checksum: Option, null_row_id: i64) -> Vec { + let cols = vec![ + Column::new_with_ft(1, FieldType::from(FieldTypeTp::Short), 1000), + Column::new_with_ft(12, FieldType::from(FieldTypeTp::Long), 2), + Column::new_with_ft( + null_row_id, + FieldType::from(FieldTypeTp::Short), + ScalarValue::Int(None), + ), + Column::new_with_ft(3, FieldType::from(FieldTypeTp::Float), 3.55), + Column::new_with_ft(8, FieldType::from(FieldTypeTp::VarChar), b"abc".to_vec()), + Column::new_with_ft( + 17, + FieldType::from(FieldTypeTp::Duration), + Duration::from_millis(34, 2).unwrap(), + ), + ]; + let mut buf = vec![]; + buf.write_row_with_checksum(&mut EvalContext::default(), cols, extra_checksum) + .unwrap(); + buf + } + + #[test] + fn test_decode_with_checksum() { + for null_row_id in [235, 355] { + for extra_checksum in 
[None, Some(37217)] { + let data = encoded_data_with_checksum(extra_checksum, null_row_id); + let row = RowSlice::from_bytes(&data).unwrap(); + assert_eq!(null_row_id > 255, row.is_big()); + assert_eq!(Some((0, 2)), row.search_in_non_null_ids(1).unwrap()); + assert_eq!(Some((2, 10)), row.search_in_non_null_ids(3).unwrap()); + assert_eq!(Some((10, 13)), row.search_in_non_null_ids(8).unwrap()); + assert_eq!(Some((13, 14)), row.search_in_non_null_ids(12).unwrap()); + assert_eq!(Some((14, 18)), row.search_in_non_null_ids(17).unwrap()); + assert_eq!(None, row.search_in_non_null_ids(235).unwrap()); + assert!(row.search_in_null_ids(null_row_id)); + assert!(!row.search_in_null_ids(8)); + + let checksum = row.get_checksum().unwrap(); + assert!(checksum.get_checksum_val() > 0); + assert_eq!(extra_checksum.is_some(), checksum.has_extra_checksum()); + assert_eq!( + extra_checksum.unwrap_or(0), + checksum.get_extra_checksum_val() + ); + } + } + } } #[cfg(test)] diff --git a/components/tidb_query_datatype/src/def/field_type.rs b/components/tidb_query_datatype/src/def/field_type.rs index 903ec738e89..e8debe626f7 100644 --- a/components/tidb_query_datatype/src/def/field_type.rs +++ b/components/tidb_query_datatype/src/def/field_type.rs @@ -50,7 +50,7 @@ pub enum FieldTypeTp { } impl FieldTypeTp { - fn from_i32(i: i32) -> Option { + pub fn from_i32(i: i32) -> Option { if (i >= FieldTypeTp::Unspecified as i32 && i <= FieldTypeTp::Bit as i32) || (i >= FieldTypeTp::Json as i32 && i <= FieldTypeTp::Geometry as i32) { diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 2f215986ca9..8b9399b7d05 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -415,6 +415,13 @@ impl PointGetter { mod tests { use engine_rocks::ReadPerfInstant; use kvproto::kvrpcpb::{Assertion, AssertionLevel, PrewriteRequestPessimisticAction::*}; + use tidb_query_datatype::{ + codec::row::v2::{ + 
encoder_for_test::{prepare_cols_for_test, RowEncoder}, + RowSlice, + }, + expr::EvalContext, + }; use txn_types::SHORT_VALUE_MAX_LEN; use super::*; @@ -1289,4 +1296,24 @@ mod tests { assert_eq!(s.write.next, 0); assert_eq!(s.write.get, 0); } + + #[test] + fn test_point_get_with_checksum() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k"; + let mut val_buf = Vec::new(); + let columns = prepare_cols_for_test(); + val_buf + .write_row_with_checksum(&mut EvalContext::default(), columns, Some(123)) + .unwrap(); + + must_prewrite_put(&mut engine, k, val_buf.as_slice(), k, 1); + must_commit(&mut engine, k, 1, 2); + + let mut getter = new_point_getter(&mut engine, 40.into()); + let val = getter.get(&Key::from_raw(k)).unwrap().unwrap(); + assert_eq!(val, val_buf.as_slice()); + let row_slice = RowSlice::from_bytes(val.as_slice()).unwrap(); + assert!(row_slice.get_checksum().unwrap().get_checksum_val() > 0); + } } diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index d5f8d55e320..8c29ea8490d 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2054,6 +2054,39 @@ fn test_buckets() { wait_refresh_buckets(0); } +#[test] +fn test_select_v2_format_with_checksum() { + let data = vec![ + (1, Some("name:0"), 2), + (2, Some("name:4"), 3), + (4, Some("name:3"), 1), + (5, Some("name:1"), 4), + (9, Some("name:8"), 7), + (10, Some("name:6"), 8), + ]; + + let product = ProductTable::new(); + for extra_checksum in [None, Some(132423)] { + // The row value encoded with checksum bytes should have no impact on cop task + // processing and related result chunk filling. 
+ let (_, endpoint) = + init_data_with_commit_v2_checksum(&product, &data, true, extra_checksum); + let req = DagSelect::from(&product).build(); + let mut resp = handle_select(&endpoint, req); + let spliter = DagChunkSpliter::new(resp.take_chunks().into(), 3); + for (row, (id, name, cnt)) in spliter.zip(data.clone()) { + let name_datum = name.map(|s| s.as_bytes()).into(); + let expected_encoded = datum::encode_value( + &mut EvalContext::default(), + &[Datum::I64(id), name_datum, cnt.into()], + ) + .unwrap(); + let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); + assert_eq!(result_encoded, &*expected_encoded); + } + } +} + #[test] fn test_batch_request() { let data = vec![ From 63d79d3cec8175dad0164ad6f7b390b573a54652 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 28 Apr 2023 18:13:52 +0800 Subject: [PATCH 675/676] raftstore-v2: support dynamic config write buffer settings (#14565) ref tikv/tikv#12842 support dynamically adjusting write buffer settings Signed-off-by: tabokie Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- components/engine_panic/src/db_options.rs | 8 ++++ components/engine_rocks/src/db_options.rs | 18 +++++++++ components/engine_traits/src/db_options.rs | 2 + components/tikv_util/src/config.rs | 2 +- src/config/configurable.rs | 46 ++++++++++++++++++++-- src/config/mod.rs | 16 +++++++- 6 files changed, 85 insertions(+), 7 deletions(-) diff --git a/components/engine_panic/src/db_options.rs b/components/engine_panic/src/db_options.rs index 47ce356deac..c081a5c1d12 100644 --- a/components/engine_panic/src/db_options.rs +++ b/components/engine_panic/src/db_options.rs @@ -44,6 +44,14 @@ impl DbOptions for PanicDbOptions { panic!() } + fn set_flush_size(&mut self, f: usize) -> Result<()> { + panic!() + } + + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { + panic!() + } + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { panic!() } diff --git 
a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index f437cc7b433..c9ef2cfda98 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -91,6 +91,24 @@ impl DbOptions for RocksDbOptions { Ok(()) } + fn set_flush_size(&mut self, f: usize) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_size(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()> { + if let Some(m) = self.0.get_write_buffer_manager() { + m.set_flush_oldest_first(f); + } else { + return Err(box_err!("write buffer manager not found")); + } + Ok(()) + } + fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { self.0.set_titandb_options(opts.as_raw()) } diff --git a/components/engine_traits/src/db_options.rs b/components/engine_traits/src/db_options.rs index fcfc17ea78f..2c6e9c3d4e8 100644 --- a/components/engine_traits/src/db_options.rs +++ b/components/engine_traits/src/db_options.rs @@ -20,6 +20,8 @@ pub trait DbOptions { fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()>; fn get_rate_limiter_auto_tuned(&self) -> Option; fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()>; + fn set_flush_size(&mut self, f: usize) -> Result<()>; + fn set_flush_oldest_first(&mut self, f: bool) -> Result<()>; fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions); } diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index c55cebea0ff..c3ace2a5dfe 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -255,7 +255,7 @@ impl<'de> Deserialize<'de> for ReadableSize { } } -#[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Ord, PartialOrd, Default)] pub struct ReadableDuration(pub Duration); impl Add for 
ReadableDuration { diff --git a/src/config/configurable.rs b/src/config/configurable.rs index 7cbcc731eb6..142d14a0304 100644 --- a/src/config/configurable.rs +++ b/src/config/configurable.rs @@ -14,6 +14,8 @@ pub trait ConfigurableDb { fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes; fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; + fn set_flush_size(&self, f: usize) -> ConfigRes; + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes; fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; } @@ -49,6 +51,16 @@ impl ConfigurableDb for RocksEngine { } } + fn set_flush_size(&self, f: usize) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_flush_size(f).map_err(Box::from) + } + + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_flush_oldest_first(f).map_err(Box::from) + } + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { let opt = self.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity as u64) @@ -113,18 +125,44 @@ impl ConfigurableDb for TabletRegistry { fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { loop_registry(self, |cache| { if let Some(latest) = cache.latest() { - latest.set_rate_bytes_per_sec(rate_bytes_per_sec)? + latest.set_rate_bytes_per_sec(rate_bytes_per_sec)?; + Ok(false) + } else { + Ok(true) } - Ok(true) }) } fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { loop_registry(self, |cache| { if let Some(latest) = cache.latest() { - latest.set_rate_limiter_auto_tuned(auto_tuned)? 
+ latest.set_rate_limiter_auto_tuned(auto_tuned)?; + Ok(false) + } else { + Ok(true) + } + }) + } + + fn set_flush_size(&self, f: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_flush_size(f)?; + Ok(false) + } else { + Ok(true) + } + }) + } + + fn set_flush_oldest_first(&self, f: bool) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_flush_oldest_first(f)?; + Ok(false) + } else { + Ok(true) } - Ok(true) }) } diff --git a/src/config/mod.rs b/src/config/mod.rs index 62a7de89130..7284fef25db 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1228,13 +1228,11 @@ pub struct DbConfig { pub enable_unordered_write: bool, #[online_config(skip)] pub allow_concurrent_memtable_write: Option, - #[online_config(skip)] pub write_buffer_limit: Option, #[online_config(skip)] #[doc(hidden)] #[serde(skip_serializing)] pub write_buffer_stall_ratio: f32, - #[online_config(skip)] #[doc(hidden)] #[serde(skip_serializing)] pub write_buffer_flush_oldest_first: bool, @@ -1958,6 +1956,20 @@ impl ConfigManager for DbConfigManger { .set_rate_limiter_auto_tuned(rate_limiter_auto_tuned)?; } + if let Some(size) = change + .drain_filter(|(name, _)| name == "write_buffer_limit") + .next() + { + self.db.set_flush_size(size.1.into())?; + } + + if let Some(f) = change + .drain_filter(|(name, _)| name == "write_buffer_flush_oldest_first") + .next() + { + self.db.set_flush_oldest_first(f.1.into())?; + } + if let Some(background_jobs_config) = change .drain_filter(|(name, _)| name == "max_background_jobs") .next() From d5c01113daa5a25d969020d852c9d098eb9ce749 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 5 May 2023 10:35:55 +0800 Subject: [PATCH 676/676] raftstore: pub snapshot_meta (#14674) ref tikv/tikv#14575 Make snapshot_meta accessible Signed-off-by: CalvinNeo --- components/raftstore/src/store/snap.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 4f347002f67..62744501195 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1040,6 +1040,10 @@ impl Snapshot { pub fn tablet_snap_path(&self) -> Option { Some(self.meta_file.meta.as_ref()?.tablet_snap_path.clone()) } + + pub fn snapshot_meta(&self) -> &Option { + &self.meta_file.meta + } } impl fmt::Debug for Snapshot {